author     wolfbeast <mcwerewolf@gmail.com>  2018-11-03 09:06:25 +0100
committer  wolfbeast <mcwerewolf@gmail.com>  2018-11-03 09:06:25 +0100
commit     1626b5d7041ea9c3db92200f91542da46e49dde6 (patch)
tree       e99c393052ef818645027da57774672990e29514
parent     314fb761d144b160d3aeb72840c89e31c4f21a4a (diff)
parent     1d55939c7ca0e80555a24b240ff68d5bdbb48b4a (diff)
download   UXP-1626b5d7041ea9c3db92200f91542da46e49dde6.tar
           UXP-1626b5d7041ea9c3db92200f91542da46e49dde6.tar.gz
           UXP-1626b5d7041ea9c3db92200f91542da46e49dde6.tar.lz
           UXP-1626b5d7041ea9c3db92200f91542da46e49dde6.tar.xz
           UXP-1626b5d7041ea9c3db92200f91542da46e49dde6.zip
Merge branch 'master' into Basilisk-release (tag: v2018.11.04)
-rw-r--r--CLOBBER2
-rw-r--r--application/basilisk/app/Makefile.in2
-rw-r--r--application/basilisk/app/profile/basilisk.js7
-rw-r--r--application/basilisk/base/content/browser.css5
-rw-r--r--application/basilisk/base/content/browser.js12
-rw-r--r--application/basilisk/base/content/browser.xul4
-rw-r--r--application/basilisk/base/content/tab-content.js7
-rw-r--r--application/basilisk/base/content/web-panels.xul15
-rw-r--r--application/basilisk/branding/official/branding.nsi38
-rw-r--r--application/basilisk/branding/unofficial/branding.nsi35
-rw-r--r--application/basilisk/components/preferences/in-content/content.js31
-rw-r--r--application/basilisk/components/preferences/in-content/content.xul24
-rw-r--r--application/basilisk/components/preferences/jar.mn2
-rw-r--r--application/basilisk/components/preferences/translation.js255
-rw-r--r--application/basilisk/components/preferences/translation.xul88
-rw-r--r--application/basilisk/components/sessionstore/SessionStorage.jsm9
-rw-r--r--application/basilisk/components/translation/BingTranslator.jsm449
-rw-r--r--application/basilisk/components/translation/Translation.jsm446
-rw-r--r--application/basilisk/components/translation/TranslationContentHandler.jsm181
-rw-r--r--application/basilisk/components/translation/TranslationDocument.jsm683
-rw-r--r--application/basilisk/components/translation/YandexTranslator.jsm343
-rw-r--r--application/basilisk/components/translation/jar.mn6
-rw-r--r--application/basilisk/components/translation/microsoft-translator-attribution.pngbin3422 -> 0 bytes
-rw-r--r--application/basilisk/components/translation/moz.build7
-rw-r--r--application/basilisk/components/translation/translation-infobar.xml441
-rw-r--r--application/basilisk/installer/package-manifest.in4
-rw-r--r--application/basilisk/installer/windows/Makefile.in4
-rw-r--r--application/basilisk/installer/windows/nsis/defines.nsi.in27
-rw-r--r--application/basilisk/locales/en-US/chrome/browser/browser.dtd2
-rw-r--r--application/basilisk/locales/en-US/chrome/browser/preferences/content.dtd15
-rw-r--r--application/basilisk/locales/en-US/chrome/browser/preferences/preferences.properties6
-rw-r--r--application/basilisk/locales/en-US/chrome/browser/preferences/privacy.dtd6
-rw-r--r--application/basilisk/locales/en-US/chrome/browser/preferences/translation.dtd24
-rw-r--r--application/basilisk/locales/en-US/chrome/browser/translation.dtd75
-rw-r--r--application/basilisk/locales/en-US/chrome/browser/translation.properties12
-rw-r--r--application/basilisk/locales/en-US/installer/nsisstrings.properties67
-rw-r--r--application/basilisk/locales/jar.mn3
-rw-r--r--application/basilisk/themes/linux/browser.css74
-rw-r--r--application/basilisk/themes/linux/preferences/preferences.css6
-rw-r--r--application/basilisk/themes/osx/browser.css133
-rw-r--r--application/basilisk/themes/osx/preferences/preferences.css5
-rw-r--r--application/basilisk/themes/shared/jar.inc.mn4
-rw-r--r--application/basilisk/themes/shared/notification-icons.inc.css24
-rw-r--r--application/basilisk/themes/shared/translation/infobar.inc.css95
-rw-r--r--application/basilisk/themes/shared/translation/translating-16.pngbin21270 -> 0 bytes
-rw-r--r--application/basilisk/themes/shared/translation/translating-16@2x.pngbin29889 -> 0 bytes
-rw-r--r--application/basilisk/themes/shared/translation/translation-16.pngbin889 -> 0 bytes
-rw-r--r--application/basilisk/themes/shared/translation/translation-16@2x.pngbin2076 -> 0 bytes
-rw-r--r--application/basilisk/themes/windows/browser.css81
-rw-r--r--application/basilisk/themes/windows/preferences/preferences.css7
-rw-r--r--application/palemoon/base/content/newtab/newTab.css1
-rw-r--r--application/palemoon/base/content/tabbrowser.xml137
-rw-r--r--application/palemoon/branding/official/branding.nsi39
-rw-r--r--application/palemoon/branding/unofficial/branding.nsi28
-rw-r--r--application/palemoon/branding/unstable/branding.nsi36
-rw-r--r--application/palemoon/components/nsBrowserGlue.js15
-rw-r--r--application/palemoon/components/permissions/aboutPermissions.js8
-rw-r--r--application/palemoon/components/statusbar/Status.jsm20
-rw-r--r--application/palemoon/components/statusbar/Status4Evar.jsm45
-rw-r--r--application/palemoon/components/statusbar/content-thunk.js23
-rw-r--r--application/palemoon/components/statusbar/moz.build1
-rw-r--r--application/palemoon/installer/windows/Makefile.in4
-rw-r--r--application/palemoon/installer/windows/nsis/defines.nsi.in27
-rw-r--r--application/palemoon/locales/en-US/chrome/browser/permissions/aboutPermissions.dtd2
-rw-r--r--application/palemoon/locales/en-US/chrome/browser/preferences/preferences.properties4
-rw-r--r--application/palemoon/locales/en-US/chrome/browser/preferences/privacy.dtd6
-rw-r--r--application/palemoon/locales/en-US/installer/nsisstrings.properties67
-rw-r--r--build/moz.configure/old.configure1
-rw-r--r--caps/nsPrincipal.cpp13
-rw-r--r--config/external/moz.build3
-rw-r--r--config/external/nss/nss.symbols1
-rw-r--r--config/milestone.txt2
-rw-r--r--devtools/shared/heapsnapshot/HeapSnapshot.cpp2
-rw-r--r--docshell/base/nsDefaultURIFixup.cpp29
-rw-r--r--docshell/test/unit/test_nsDefaultURIFixup_info.js8
-rw-r--r--dom/base/nsGlobalWindow.cpp13
-rw-r--r--dom/base/nsGlobalWindow.h2
-rw-r--r--dom/geolocation/nsGeolocation.cpp12
-rw-r--r--dom/indexedDB/ActorsParent.cpp35
-rw-r--r--dom/indexedDB/IDBFactory.cpp18
-rw-r--r--dom/indexedDB/KeyPath.cpp77
-rw-r--r--dom/media/AudioStream.cpp3
-rw-r--r--dom/media/CubebUtils.cpp25
-rw-r--r--dom/media/CubebUtils.h2
-rw-r--r--dom/media/GraphDriver.cpp10
-rw-r--r--dom/media/MediaData.h3
-rw-r--r--dom/media/MediaPrefs.h3
-rw-r--r--dom/media/VideoUtils.cpp37
-rw-r--r--dom/media/VideoUtils.h5
-rw-r--r--dom/media/platforms/agnostic/AOMDecoder.cpp332
-rw-r--r--dom/media/platforms/agnostic/AOMDecoder.h62
-rw-r--r--dom/media/platforms/agnostic/AgnosticDecoderModule.cpp19
-rw-r--r--dom/media/platforms/agnostic/VPXDecoder.cpp53
-rw-r--r--dom/media/platforms/agnostic/VPXDecoder.h12
-rw-r--r--dom/media/platforms/ffmpeg/FFmpegDataDecoder.cpp18
-rw-r--r--dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp1
-rw-r--r--dom/media/platforms/ffmpeg/FFmpegLibWrapper.h3
-rw-r--r--dom/media/platforms/ffmpeg/ffvpx/moz.build2
-rw-r--r--dom/media/platforms/moz.build8
-rw-r--r--dom/media/test/bug1377278.webm (renamed from dom/media/test/bug580982.webm)bin215594 -> 215594 bytes
-rw-r--r--dom/media/test/bug1377278.webm^headers^ (renamed from dom/media/test/bug580982.webm^headers^)0
-rw-r--r--dom/media/test/manifest.js4
-rw-r--r--dom/media/test/mochitest.ini4
-rw-r--r--dom/media/webm/WebMDecoder.cpp9
-rw-r--r--dom/media/webm/WebMDemuxer.cpp166
-rw-r--r--dom/media/webm/WebMDemuxer.h18
-rw-r--r--dom/webidl/CommandEvent.webidl6
-rw-r--r--dom/webidl/CompositionEvent.webidl10
-rw-r--r--dom/webidl/CustomEvent.webidl6
-rw-r--r--dom/webidl/DeviceMotionEvent.webidl12
-rw-r--r--dom/webidl/DeviceOrientationEvent.webidl12
-rw-r--r--dom/webidl/DragEvent.webidl30
-rw-r--r--dom/webidl/Event.webidl4
-rw-r--r--dom/webidl/HashChangeEvent.webidl8
-rw-r--r--dom/webidl/KeyEvent.webidl18
-rw-r--r--dom/webidl/MessageEvent.webidl12
-rw-r--r--dom/webidl/MouseEvent.webidl64
-rw-r--r--dom/webidl/MouseScrollEvent.webidl30
-rw-r--r--dom/webidl/MutationEvent.webidl14
-rw-r--r--dom/webidl/ScrollAreaEvent.webidl16
-rw-r--r--dom/webidl/SimpleGestureEvent.webidl36
-rw-r--r--dom/webidl/StorageEvent.webidl14
-rw-r--r--dom/webidl/TimeEvent.webidl4
-rw-r--r--dom/webidl/TouchEvent.webidl22
-rw-r--r--dom/webidl/UIEvent.webidl8
-rw-r--r--dom/webidl/XULCommandEvent.webidl18
-rw-r--r--dom/xbl/nsXBLBinding.cpp3
-rw-r--r--dom/xbl/nsXBLProtoImpl.cpp12
-rw-r--r--extensions/cookie/nsPermissionManager.cpp6
-rw-r--r--extensions/spellcheck/src/mozEnglishWordUtils.cpp2
-rw-r--r--gfx/layers/ipc/CompositorBridgeParent.cpp2
-rw-r--r--gfx/skia/skia/src/core/SkGeometry.cpp32
-rw-r--r--gfx/skia/skia/src/core/SkPath.cpp33
-rw-r--r--gfx/skia/skia/src/core/SkRRect.cpp18
-rw-r--r--js/src/jit-test/tests/ion/bug1493900-1.js17
-rw-r--r--js/src/jit-test/tests/ion/bug1493900-2.js7
-rw-r--r--js/src/jit/BacktrackingAllocator.cpp33
-rw-r--r--js/src/jit/BacktrackingAllocator.h30
-rw-r--r--js/src/jit/MIR.h5
-rw-r--r--js/src/jit/x86/Assembler-x86.h13
-rw-r--r--js/src/jit/x86/BaseAssembler-x86.h8
-rw-r--r--js/src/jit/x86/MacroAssembler-x86.cpp31
-rw-r--r--js/src/jsapi.cpp18
-rw-r--r--js/src/jsapi.h6
-rw-r--r--layout/base/nsCaret.cpp8
-rw-r--r--layout/base/nsLayoutUtils.cpp1
-rw-r--r--layout/base/nsPresShell.cpp5
-rw-r--r--layout/generic/nsGfxScrollFrame.cpp63
-rw-r--r--layout/generic/nsGfxScrollFrame.h3
-rw-r--r--layout/generic/nsGridContainerFrame.cpp8
-rw-r--r--layout/reftests/css-grid/bug1349571-ref.html90
-rw-r--r--layout/reftests/css-grid/bug1349571.html94
-rw-r--r--layout/reftests/css-grid/bug1356820-ref.html81
-rw-r--r--layout/reftests/css-grid/bug1356820.html81
-rw-r--r--layout/reftests/css-grid/reftest.list2
-rw-r--r--layout/tables/nsTableFrame.cpp79
-rw-r--r--media/ffvpx/FILES346
-rw-r--r--media/ffvpx/README_MCP14
-rw-r--r--media/ffvpx/compat/w32pthreads.h273
-rw-r--r--media/ffvpx/config.h4
-rw-r--r--media/ffvpx/config_darwin32.h654
-rw-r--r--media/ffvpx/config_darwin64.asm158
-rw-r--r--media/ffvpx/config_darwin64.h163
-rw-r--r--media/ffvpx/config_unix32.h165
-rw-r--r--media/ffvpx/config_unix64.asm166
-rw-r--r--media/ffvpx/config_unix64.h169
-rw-r--r--media/ffvpx/config_win32.asm177
-rw-r--r--media/ffvpx/config_win32.h180
-rw-r--r--media/ffvpx/config_win64.asm177
-rw-r--r--media/ffvpx/config_win64.h181
-rw-r--r--media/ffvpx/defaults_disabled.asm2826
-rw-r--r--media/ffvpx/defaults_disabled.h2826
-rw-r--r--media/ffvpx/ffvpxcommon.mozbuild2
-rw-r--r--media/ffvpx/libavcodec/allcodecs.c1561
-rw-r--r--media/ffvpx/libavcodec/audioconvert.c120
-rw-r--r--media/ffvpx/libavcodec/audioconvert.h86
-rw-r--r--media/ffvpx/libavcodec/avcodec.h1081
-rw-r--r--media/ffvpx/libavcodec/avcodec.symbols28
-rw-r--r--media/ffvpx/libavcodec/avpacket.c69
-rw-r--r--media/ffvpx/libavcodec/bit_depth_template.c17
-rw-r--r--media/ffvpx/libavcodec/bitstream_filter.c8
-rw-r--r--media/ffvpx/libavcodec/bitstream_filters.c37
-rw-r--r--media/ffvpx/libavcodec/blockdsp.h4
-rw-r--r--media/ffvpx/libavcodec/bsf.c5
-rw-r--r--media/ffvpx/libavcodec/bsf_list.c3
-rw-r--r--media/ffvpx/libavcodec/codec_desc.c1102
-rw-r--r--media/ffvpx/libavcodec/codec_list.c11
-rw-r--r--media/ffvpx/libavcodec/decode.c518
-rw-r--r--media/ffvpx/libavcodec/decode.h40
-rw-r--r--media/ffvpx/libavcodec/dummy_funcs.c43
-rw-r--r--media/ffvpx/libavcodec/error_resilience.h3
-rw-r--r--media/ffvpx/libavcodec/flac_parser.c9
-rw-r--r--media/ffvpx/libavcodec/flacdec.c30
-rw-r--r--media/ffvpx/libavcodec/get_bits.h10
-rw-r--r--media/ffvpx/libavcodec/hwaccel.h60
-rw-r--r--media/ffvpx/libavcodec/hwaccels.h78
-rw-r--r--media/ffvpx/libavcodec/idctdsp.h2
-rw-r--r--media/ffvpx/libavcodec/imgconvert.c9
-rw-r--r--media/ffvpx/libavcodec/internal.h37
-rw-r--r--media/ffvpx/libavcodec/me_cmp.h4
-rw-r--r--media/ffvpx/libavcodec/moz.build4
-rw-r--r--media/ffvpx/libavcodec/mpegutils.h10
-rw-r--r--media/ffvpx/libavcodec/mpegvideo.h40
-rw-r--r--media/ffvpx/libavcodec/mpegvideodata.h35
-rw-r--r--media/ffvpx/libavcodec/null_bsf.c12
-rw-r--r--media/ffvpx/libavcodec/options.c1
-rw-r--r--media/ffvpx/libavcodec/options_table.h107
-rw-r--r--media/ffvpx/libavcodec/parser.c86
-rw-r--r--media/ffvpx/libavcodec/parser_list.c8
-rw-r--r--media/ffvpx/libavcodec/profiles.c12
-rw-r--r--media/ffvpx/libavcodec/profiles.h2
-rw-r--r--media/ffvpx/libavcodec/pthread_frame.c13
-rw-r--r--media/ffvpx/libavcodec/pthread_slice.c4
-rw-r--r--media/ffvpx/libavcodec/raw.h5
-rw-r--r--media/ffvpx/libavcodec/resample.c439
-rw-r--r--media/ffvpx/libavcodec/resample2.c319
-rw-r--r--media/ffvpx/libavcodec/thread.h1
-rw-r--r--media/ffvpx/libavcodec/utils.c358
-rw-r--r--media/ffvpx/libavcodec/version.h136
-rw-r--r--media/ffvpx/libavcodec/vp8.c232
-rw-r--r--media/ffvpx/libavcodec/vp8.h33
-rw-r--r--media/ffvpx/libavcodec/vp8_parser.c3
-rw-r--r--media/ffvpx/libavcodec/vp9.c33
-rw-r--r--media/ffvpx/libavcodec/vp9_parser.c127
-rw-r--r--media/ffvpx/libavcodec/vp9_superframe_split_bsf.c147
-rw-r--r--media/ffvpx/libavcodec/x86/constants.c23
-rw-r--r--media/ffvpx/libavcodec/x86/constants.h2
-rw-r--r--media/ffvpx/libavcodec/x86/vp8dsp.asm143
-rw-r--r--media/ffvpx/libavutil/atomic.c109
-rw-r--r--media/ffvpx/libavutil/atomic.h79
-rw-r--r--media/ffvpx/libavutil/atomic_gcc.h61
-rw-r--r--media/ffvpx/libavutil/atomic_win32.h54
-rw-r--r--media/ffvpx/libavutil/attributes.h14
-rw-r--r--media/ffvpx/libavutil/avutil.symbols29
-rw-r--r--media/ffvpx/libavutil/common.h38
-rw-r--r--media/ffvpx/libavutil/cpu.c6
-rw-r--r--media/ffvpx/libavutil/cpu.h1
-rw-r--r--media/ffvpx/libavutil/cpu_internal.h2
-rw-r--r--media/ffvpx/libavutil/crc.c75
-rw-r--r--media/ffvpx/libavutil/crc.h5
-rw-r--r--media/ffvpx/libavutil/dummy_funcs.c13
-rw-r--r--media/ffvpx/libavutil/eval.c9
-rw-r--r--media/ffvpx/libavutil/ffversion.h2
-rw-r--r--media/ffvpx/libavutil/frame.c112
-rw-r--r--media/ffvpx/libavutil/frame.h74
-rw-r--r--media/ffvpx/libavutil/hwcontext.c873
-rw-r--r--media/ffvpx/libavutil/hwcontext.h4
-rw-r--r--media/ffvpx/libavutil/hwcontext_internal.h171
-rw-r--r--media/ffvpx/libavutil/imgutils.c20
-rw-r--r--media/ffvpx/libavutil/integer.c38
-rw-r--r--media/ffvpx/libavutil/internal.h40
-rw-r--r--media/ffvpx/libavutil/intreadwrite.h9
-rw-r--r--media/ffvpx/libavutil/log.c26
-rw-r--r--media/ffvpx/libavutil/log.h14
-rw-r--r--media/ffvpx/libavutil/mem.c25
-rw-r--r--media/ffvpx/libavutil/mem.h39
-rw-r--r--media/ffvpx/libavutil/moz.build2
-rw-r--r--media/ffvpx/libavutil/opt.c1
-rw-r--r--media/ffvpx/libavutil/opt.h22
-rw-r--r--media/ffvpx/libavutil/parseutils.c14
-rw-r--r--media/ffvpx/libavutil/pixdesc.c74
-rw-r--r--media/ffvpx/libavutil/pixdesc.h13
-rw-r--r--media/ffvpx/libavutil/pixfmt.h31
-rw-r--r--media/ffvpx/libavutil/slicethread.c4
-rw-r--r--media/ffvpx/libavutil/thread.h10
-rw-r--r--media/ffvpx/libavutil/timecode.c4
-rw-r--r--media/ffvpx/libavutil/timer.h2
-rw-r--r--media/ffvpx/libavutil/utils.c3
-rw-r--r--media/ffvpx/libavutil/version.h38
-rw-r--r--media/ffvpx/libavutil/x86/cpu.c14
-rw-r--r--media/ffvpx/libavutil/x86/cpu.h3
-rw-r--r--media/ffvpx/libavutil/x86/intmath.h3
-rw-r--r--media/ffvpx/libavutil/x86/x86inc.asm266
-rw-r--r--media/ffvpx/libavutil/x86/x86util.asm50
-rw-r--r--media/libaom/README_MCP13
-rw-r--r--media/libaom/aom_version.h7
-rw-r--r--media/libaom/cmakeparser.py293
-rw-r--r--media/libaom/config/aom_version.h7
-rw-r--r--media/libaom/config/generic/config/aom_config.asm76
-rw-r--r--media/libaom/config/generic/config/aom_config.h82
-rw-r--r--media/libaom/config/generic/config/aom_dsp_rtcd.h1380
-rw-r--r--media/libaom/config/generic/config/aom_scale_rtcd.h85
-rw-r--r--media/libaom/config/generic/config/av1_rtcd.h331
-rw-r--r--media/libaom/config/linux/arm/config/aom_config.asm77
-rw-r--r--media/libaom/config/linux/arm/config/aom_config.h82
-rw-r--r--media/libaom/config/linux/arm/config/aom_dsp_rtcd.h1508
-rw-r--r--media/libaom/config/linux/arm/config/aom_scale_rtcd.h90
-rw-r--r--media/libaom/config/linux/arm/config/av1_rtcd.h421
-rw-r--r--media/libaom/config/linux/ia32/config/aom_config.asm76
-rw-r--r--media/libaom/config/linux/ia32/config/aom_config.h82
-rw-r--r--media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h2379
-rw-r--r--media/libaom/config/linux/ia32/config/aom_scale_rtcd.h88
-rw-r--r--media/libaom/config/linux/ia32/config/av1_rtcd.h605
-rw-r--r--media/libaom/config/linux/x64/config/aom_config.asm76
-rw-r--r--media/libaom/config/linux/x64/config/aom_config.h82
-rw-r--r--media/libaom/config/linux/x64/config/aom_dsp_rtcd.h2001
-rw-r--r--media/libaom/config/linux/x64/config/aom_scale_rtcd.h88
-rw-r--r--media/libaom/config/linux/x64/config/av1_rtcd.h594
-rw-r--r--media/libaom/config/mac/x64/config/aom_config.asm76
-rw-r--r--media/libaom/config/mac/x64/config/aom_config.h82
-rw-r--r--media/libaom/config/mac/x64/config/aom_dsp_rtcd.h2001
-rw-r--r--media/libaom/config/mac/x64/config/aom_scale_rtcd.h88
-rw-r--r--media/libaom/config/mac/x64/config/av1_rtcd.h594
-rw-r--r--media/libaom/config/win/ia32/config/aom_config.asm76
-rw-r--r--media/libaom/config/win/ia32/config/aom_config.h82
-rw-r--r--media/libaom/config/win/ia32/config/aom_dsp_rtcd.h2379
-rw-r--r--media/libaom/config/win/ia32/config/aom_scale_rtcd.h88
-rw-r--r--media/libaom/config/win/ia32/config/av1_rtcd.h605
-rw-r--r--media/libaom/config/win/mingw32/config/aom_config.asm76
-rw-r--r--media/libaom/config/win/mingw32/config/aom_config.h82
-rw-r--r--media/libaom/config/win/mingw32/config/aom_dsp_rtcd.h2379
-rw-r--r--media/libaom/config/win/mingw32/config/aom_scale_rtcd.h88
-rw-r--r--media/libaom/config/win/mingw32/config/av1_rtcd.h605
-rw-r--r--media/libaom/config/win/mingw64/config/aom_config.asm76
-rw-r--r--media/libaom/config/win/mingw64/config/aom_config.h82
-rw-r--r--media/libaom/config/win/mingw64/config/aom_dsp_rtcd.h2001
-rw-r--r--media/libaom/config/win/mingw64/config/aom_scale_rtcd.h88
-rw-r--r--media/libaom/config/win/mingw64/config/av1_rtcd.h594
-rw-r--r--media/libaom/config/win/x64/config/aom_config.asm76
-rw-r--r--media/libaom/config/win/x64/config/aom_config.h82
-rw-r--r--media/libaom/config/win/x64/config/aom_dsp_rtcd.h2001
-rw-r--r--media/libaom/config/win/x64/config/aom_scale_rtcd.h88
-rw-r--r--media/libaom/config/win/x64/config/av1_rtcd.h594
-rw-r--r--media/libaom/generate_sources_mozbuild.py163
-rwxr-xr-xmedia/libaom/generate_sources_mozbuild.sh80
-rw-r--r--media/libaom/moz.build143
-rw-r--r--media/libaom/sources.mozbuild533
-rw-r--r--media/libaom/test_cmakeparser.py190
-rw-r--r--media/libnestegg/README_MCP (renamed from media/libnestegg/README_MOZILLA)4
-rw-r--r--media/libnestegg/include/nestegg.h29
-rw-r--r--media/libnestegg/src/nestegg.c337
-rwxr-xr-xmedia/libnestegg/update.sh4
-rw-r--r--mfbt/Casting.h13
-rw-r--r--mfbt/Range.h32
-rw-r--r--mfbt/Span.h1041
-rw-r--r--mfbt/moz.build1
-rw-r--r--mfbt/tests/gtest/TestSpan.cpp2079
-rw-r--r--mfbt/tests/gtest/moz.build15
-rw-r--r--mfbt/tests/moz.build5
-rw-r--r--modules/libjar/nsJARChannel.cpp19
-rw-r--r--modules/libpref/init/all.js19
-rw-r--r--netwerk/base/nsIOService.cpp4
-rw-r--r--netwerk/base/nsNetUtilInlines.h4
-rw-r--r--netwerk/base/nsSocketTransportService2.cpp36
-rw-r--r--netwerk/base/nsUDPSocket.cpp6
-rw-r--r--netwerk/cache/nsCacheEntryDescriptor.cpp83
-rw-r--r--netwerk/cache/nsCacheService.cpp66
-rw-r--r--netwerk/cache/nsCacheService.h6
-rw-r--r--netwerk/cache2/CacheEntry.cpp15
-rw-r--r--netwerk/cache2/CacheFile.cpp27
-rw-r--r--netwerk/cache2/CacheFileIOManager.cpp48
-rw-r--r--netwerk/cache2/CacheFileUtils.cpp95
-rw-r--r--netwerk/cache2/CacheFileUtils.h59
-rw-r--r--netwerk/cache2/CacheIOThread.cpp54
-rw-r--r--netwerk/cache2/CacheIndex.cpp98
-rw-r--r--netwerk/cache2/CacheIndex.h5
-rw-r--r--netwerk/cache2/CacheObserver.cpp58
-rw-r--r--netwerk/cache2/CacheObserver.h10
-rw-r--r--netwerk/cookie/nsCookieService.cpp7
-rw-r--r--netwerk/protocol/http/Http2Compression.cpp26
-rw-r--r--netwerk/sctp/datachannel/DataChannel.cpp1
-rw-r--r--netwerk/streamconv/converters/nsIndexedToHTML.cpp20
-rw-r--r--old-configure.in15
-rw-r--r--security/manager/ssl/TransportSecurityInfo.cpp7
-rw-r--r--security/manager/ssl/nsNSSCallbacks.cpp12
-rw-r--r--security/manager/ssl/nsNSSCertificate.cpp13
-rw-r--r--security/manager/ssl/nsNSSIOLayer.cpp58
-rw-r--r--security/manager/ssl/nsSTSPreloadList.errors4425
-rw-r--r--security/manager/ssl/nsSTSPreloadList.inc6345
-rw-r--r--security/manager/ssl/tests/gtest/DeserializeCertTest.cpp86
-rw-r--r--security/manager/ssl/tests/unit/test_cert_chains.js26
-rw-r--r--testing/profiles/prefs_general.js6
-rw-r--r--third_party/aom/.clang-format109
-rw-r--r--third_party/aom/.cmake-format.py48
-rw-r--r--third_party/aom/.mailmap34
-rw-r--r--third_party/aom/AUTHORS144
-rw-r--r--third_party/aom/CHANGELOG5
-rw-r--r--third_party/aom/CMakeLists.txt758
-rw-r--r--third_party/aom/LICENSE27
-rw-r--r--third_party/aom/PATENTS108
-rw-r--r--third_party/aom/README.md625
-rw-r--r--third_party/aom/aom/aom.h147
-rw-r--r--third_party/aom/aom/aom_codec.h523
-rw-r--r--third_party/aom/aom/aom_decoder.h364
-rw-r--r--third_party/aom/aom/aom_encoder.h981
-rw-r--r--third_party/aom/aom/aom_frame_buffer.h84
-rw-r--r--third_party/aom/aom/aom_image.h331
-rw-r--r--third_party/aom/aom/aom_integer.h106
-rw-r--r--third_party/aom/aom/aomcx.h1198
-rw-r--r--third_party/aom/aom/aomdx.h302
-rw-r--r--third_party/aom/aom/exports_com32
-rw-r--r--third_party/aom/aom/exports_dec10
-rw-r--r--third_party/aom/aom/exports_enc18
-rw-r--r--third_party/aom/aom/exports_test2
-rw-r--r--third_party/aom/aom/internal/aom_codec_internal.h441
-rw-r--r--third_party/aom/aom/src/aom_codec.c157
-rw-r--r--third_party/aom/aom/src/aom_decoder.c180
-rw-r--r--third_party/aom/aom/src/aom_encoder.c402
-rw-r--r--third_party/aom/aom/src/aom_image.c265
-rw-r--r--third_party/aom/aom/src/aom_integer.c105
-rw-r--r--third_party/aom/aom_dsp/add_noise.c73
-rw-r--r--third_party/aom/aom_dsp/aom_convolve.c238
-rw-r--r--third_party/aom/aom_dsp/aom_dsp.cmake356
-rw-r--r--third_party/aom/aom_dsp/aom_dsp_common.h98
-rw-r--r--third_party/aom/aom_dsp/aom_dsp_rtcd.c18
-rwxr-xr-xthird_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl1575
-rw-r--r--third_party/aom/aom_dsp/aom_filter.h56
-rw-r--r--third_party/aom/aom_dsp/aom_simd.h38
-rw-r--r--third_party/aom/aom_dsp/aom_simd_inline.h21
-rw-r--r--third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c451
-rw-r--r--third_party/aom/aom_dsp/arm/fwd_txfm_neon.c222
-rw-r--r--third_party/aom/aom_dsp/arm/intrapred_neon.c590
-rw-r--r--third_party/aom/aom_dsp/arm/loopfilter_neon.c928
-rw-r--r--third_party/aom/aom_dsp/arm/sad4d_neon.c226
-rw-r--r--third_party/aom/aom_dsp/arm/sad_neon.c224
-rw-r--r--third_party/aom/aom_dsp/arm/subpel_variance_neon.c131
-rw-r--r--third_party/aom/aom_dsp/arm/subtract_neon.c81
-rw-r--r--third_party/aom/aom_dsp/arm/variance_neon.c400
-rw-r--r--third_party/aom/aom_dsp/binary_codes_reader.c123
-rw-r--r--third_party/aom/aom_dsp/binary_codes_reader.h47
-rw-r--r--third_party/aom/aom_dsp/binary_codes_writer.c210
-rw-r--r--third_party/aom/aom_dsp/binary_codes_writer.h68
-rw-r--r--third_party/aom/aom_dsp/bitreader.h160
-rw-r--r--third_party/aom/aom_dsp/bitreader_buffer.c67
-rw-r--r--third_party/aom/aom_dsp/bitreader_buffer.h50
-rw-r--r--third_party/aom/aom_dsp/bitwriter.h89
-rw-r--r--third_party/aom/aom_dsp/bitwriter_buffer.c87
-rw-r--r--third_party/aom/aom_dsp/bitwriter_buffer.h51
-rw-r--r--third_party/aom/aom_dsp/blend.h45
-rw-r--r--third_party/aom/aom_dsp/blend_a64_hmask.c69
-rw-r--r--third_party/aom/aom_dsp/blend_a64_mask.c345
-rw-r--r--third_party/aom/aom_dsp/blend_a64_vmask.c71
-rw-r--r--third_party/aom/aom_dsp/buf_ans.c70
-rw-r--r--third_party/aom/aom_dsp/buf_ans.h136
-rw-r--r--third_party/aom/aom_dsp/daalaboolreader.c47
-rw-r--r--third_party/aom/aom_dsp/daalaboolreader.h160
-rw-r--r--third_party/aom/aom_dsp/daalaboolwriter.c31
-rw-r--r--third_party/aom/aom_dsp/daalaboolwriter.h78
-rw-r--r--third_party/aom/aom_dsp/entcode.c49
-rw-r--r--third_party/aom/aom_dsp/entcode.h40
-rw-r--r--third_party/aom/aom_dsp/entdec.c229
-rw-r--r--third_party/aom/aom_dsp/entdec.h83
-rw-r--r--third_party/aom/aom_dsp/entenc.c423
-rw-r--r--third_party/aom/aom_dsp/entenc.h85
-rw-r--r--third_party/aom/aom_dsp/fastssim.c487
-rw-r--r--third_party/aom/aom_dsp/fft.c219
-rw-r--r--third_party/aom/aom_dsp/fft_common.h1050
-rw-r--r--third_party/aom/aom_dsp/fwd_txfm.c103
-rw-r--r--third_party/aom/aom_dsp/grain_synthesis.c1409
-rw-r--r--third_party/aom/aom_dsp/grain_synthesis.h122
-rw-r--r--third_party/aom/aom_dsp/grain_table.c333
-rw-r--r--third_party/aom/aom_dsp/grain_table.h102
-rw-r--r--third_party/aom/aom_dsp/intrapred.c792
-rw-r--r--third_party/aom/aom_dsp/intrapred_common.h47
-rw-r--r--third_party/aom/aom_dsp/loopfilter.c925
-rw-r--r--third_party/aom/aom_dsp/mips/add_noise_msa.c61
-rw-r--r--third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c694
-rw-r--r--third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c701
-rw-r--r--third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c248
-rw-r--r--third_party/aom/aom_dsp/mips/aom_convolve_msa.h79
-rw-r--r--third_party/aom/aom_dsp/mips/common_dspr2.c31
-rw-r--r--third_party/aom/aom_dsp/mips/common_dspr2.h51
-rw-r--r--third_party/aom/aom_dsp/mips/convolve2_dspr2.c1031
-rw-r--r--third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c681
-rw-r--r--third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c237
-rw-r--r--third_party/aom/aom_dsp/mips/convolve8_dspr2.c222
-rw-r--r--third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c879
-rw-r--r--third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c361
-rw-r--r--third_party/aom/aom_dsp/mips/convolve_common_dspr2.h48
-rw-r--r--third_party/aom/aom_dsp/mips/intrapred16_dspr2.c327
-rw-r--r--third_party/aom/aom_dsp/mips/intrapred4_dspr2.c82
-rw-r--r--third_party/aom/aom_dsp/mips/intrapred8_dspr2.c150
-rw-r--r--third_party/aom/aom_dsp/mips/intrapred_msa.c550
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_16_msa.c1488
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_4_msa.c147
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_8_msa.c333
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c328
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h736
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h437
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h357
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c590
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c734
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c758
-rw-r--r--third_party/aom/aom_dsp/mips/loopfilter_msa.h251
-rw-r--r--third_party/aom/aom_dsp/mips/macros_msa.h2058
-rw-r--r--third_party/aom/aom_dsp/mips/sad_msa.c800
-rw-r--r--third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c1792
-rw-r--r--third_party/aom/aom_dsp/mips/subtract_msa.c266
-rw-r--r--third_party/aom/aom_dsp/mips/variance_msa.c633
-rw-r--r--third_party/aom/aom_dsp/noise_model.c1648
-rw-r--r--third_party/aom/aom_dsp/noise_model.h323
-rw-r--r--third_party/aom/aom_dsp/noise_util.c221
-rw-r--r--third_party/aom/aom_dsp/noise_util.h68
-rw-r--r--third_party/aom/aom_dsp/postproc.h26
-rw-r--r--third_party/aom/aom_dsp/prob.h671
-rw-r--r--third_party/aom/aom_dsp/psnr.c381
-rw-r--r--third_party/aom/aom_dsp/psnr.h79
-rw-r--r--third_party/aom/aom_dsp/psnrhvs.c272
-rw-r--r--third_party/aom/aom_dsp/quantize.c206
-rw-r--r--third_party/aom/aom_dsp/quantize.h59
-rw-r--r--third_party/aom/aom_dsp/sad.c304
-rw-r--r--third_party/aom/aom_dsp/sad_av1.c248
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics.h344
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h958
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics_c.h888
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h656
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics.h376
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h17
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_c.h953
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h873
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h750
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics.h232
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h680
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics_c.h968
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h491
-rw-r--r--third_party/aom/aom_dsp/sse.c52
-rw-r--r--third_party/aom/aom_dsp/ssim.c439
-rw-r--r--third_party/aom/aom_dsp/ssim.h87
-rw-r--r--third_party/aom/aom_dsp/subtract.c53
-rw-r--r--third_party/aom/aom_dsp/sum_squares.c40
-rw-r--r--third_party/aom/aom_dsp/txfm_common.h91
-rw-r--r--third_party/aom/aom_dsp/variance.c1579
-rw-r--r--third_party/aom/aom_dsp/variance.h130
-rw-r--r--third_party/aom/aom_dsp/x86/aom_asm_stubs.c89
-rw-r--r--third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm297
-rw-r--r--third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm613
-rw-r--r--third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm338
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c1441
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c315
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm615
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm870
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm295
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm267
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c34
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c900
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c1109
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c283
-rw-r--r--third_party/aom/aom_dsp/x86/blend_mask_sse4.h237
-rw-r--r--third_party/aom/aom_dsp/x86/blend_sse4.h191
-rw-r--r--third_party/aom/aom_dsp/x86/common_avx2.h147
-rw-r--r--third_party/aom/aom_dsp/x86/convolve.h178
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_avx2.h199
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_common_intrin.h31
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_sse2.h121
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_sse4_1.h53
-rw-r--r--third_party/aom/aom_dsp/x86/fft_avx2.c73
-rw-r--r--third_party/aom/aom_dsp/x86/fft_sse2.c166
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h344
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c69
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h155
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm379
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c998
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c251
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c984
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm259
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c66
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c1697
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c160
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c148
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm296
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm374
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm1036
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c267
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_avx2.c140
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm318
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse2.c868
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse4.c216
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_avx2.c811
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_sse2.c1430
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm625
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_ssse3.c1692
-rw-r--r--third_party/aom/aom_dsp/x86/inv_wht_sse2.asm107
-rw-r--r--third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c238
-rw-r--r--third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c192
-rw-r--r--third_party/aom/aom_dsp/x86/loopfilter_sse2.c2385
-rw-r--r--third_party/aom/aom_dsp/x86/lpf_common_sse2.h215
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c389
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c402
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h33
-rw-r--r--third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c1064
-rw-r--r--third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h92
-rw-r--r--third_party/aom/aom_dsp/x86/mem_sse2.h42
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h58
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h54
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_sad_avx2.c270
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_sad_sse4.c268
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_variance_avx2.c190
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_variance_sse4.c380
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm435
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_sse2.c147
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm272
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_x86.h77
-rw-r--r--third_party/aom/aom_dsp/x86/sad4d_avx2.c218
-rw-r--r--third_party/aom/aom_dsp/x86/sad4d_sse2.asm257
-rw-r--r--third_party/aom/aom_dsp/x86/sad_avx2.c189
-rw-r--r--third_party/aom/aom_dsp/x86/sad_highbd_avx2.c1038
-rw-r--r--third_party/aom/aom_dsp/x86/sad_impl_avx2.c234
-rw-r--r--third_party/aom/aom_dsp/x86/sad_sse2.asm353
-rw-r--r--third_party/aom/aom_dsp/x86/sse_avx2.c250
-rw-r--r--third_party/aom/aom_dsp/x86/sse_sse4.c241
-rw-r--r--third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm222
-rw-r--r--third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm1481
-rw-r--r--third_party/aom/aom_dsp/x86/subtract_avx2.c108
-rw-r--r--third_party/aom/aom_dsp/x86/subtract_sse2.asm146
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_avx2.c79
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_sse2.c203
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_sse2.h22
-rw-r--r--third_party/aom/aom_dsp/x86/synonyms.h114
-rw-r--r--third_party/aom/aom_dsp/x86/synonyms_avx2.h74
-rw-r--r--third_party/aom/aom_dsp/x86/transpose_sse2.h420
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_avx2.h199
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_sse2.h29
-rw-r--r--third_party/aom/aom_dsp/x86/variance_avx2.c517
-rw-r--r--third_party/aom/aom_dsp/x86/variance_impl_avx2.c517
-rw-r--r--third_party/aom/aom_dsp/x86/variance_impl_ssse3.c129
-rw-r--r--third_party/aom/aom_dsp/x86/variance_sse2.c806
-rw-r--r--third_party/aom/aom_mem/aom_mem.c84
-rw-r--r--third_party/aom/aom_mem/aom_mem.cmake26
-rw-r--r--third_party/aom/aom_mem/aom_mem.h70
-rw-r--r--third_party/aom/aom_mem/include/aom_mem_intrnl.h33
-rw-r--r--third_party/aom/aom_ports/aom_once.h138
-rw-r--r--third_party/aom/aom_ports/aom_ports.cmake81
-rw-r--r--third_party/aom/aom_ports/aom_timer.h111
-rw-r--r--third_party/aom/aom_ports/arm.h41
-rw-r--r--third_party/aom/aom_ports/arm_cpudetect.c150
-rw-r--r--third_party/aom/aom_ports/bitops.h78
-rw-r--r--third_party/aom/aom_ports/emmintrin_compat.h56
-rw-r--r--third_party/aom/aom_ports/emms.asm41
-rw-r--r--third_party/aom/aom_ports/mem.h69
-rw-r--r--third_party/aom/aom_ports/mem_ops.h228
-rw-r--r--third_party/aom/aom_ports/mem_ops_aligned.h173
-rw-r--r--third_party/aom/aom_ports/msvc.h75
-rw-r--r--third_party/aom/aom_ports/ppc.h30
-rw-r--r--third_party/aom/aom_ports/ppc_cpudetect.c82
-rw-r--r--third_party/aom/aom_ports/sanitizer.h38
-rw-r--r--third_party/aom/aom_ports/system_state.h23
-rw-r--r--third_party/aom/aom_ports/x86.h325
-rw-r--r--third_party/aom/aom_ports/x86_abi_support.asm395
-rw-r--r--third_party/aom/aom_scale/aom_scale.cmake38
-rw-r--r--third_party/aom/aom_scale/aom_scale.h23
-rw-r--r--third_party/aom/aom_scale/aom_scale_rtcd.c18
-rw-r--r--third_party/aom/aom_scale/aom_scale_rtcd.pl52
-rw-r--r--third_party/aom/aom_scale/generic/aom_scale.c506
-rw-r--r--third_party/aom/aom_scale/generic/gen_scalers.c201
-rw-r--r--third_party/aom/aom_scale/generic/yv12config.c203
-rw-r--r--third_party/aom/aom_scale/generic/yv12extend.c411
-rw-r--r--third_party/aom/aom_scale/mips/dspr2/yv12extend_dspr2.c142
-rw-r--r--third_party/aom/aom_scale/yv12config.h143
-rw-r--r--third_party/aom/aom_util/aom_thread.c184
-rw-r--r--third_party/aom/aom_util/aom_thread.h430
-rw-r--r--third_party/aom/aom_util/aom_util.cmake28
-rw-r--r--third_party/aom/aom_util/debug_util.c275
-rw-r--r--third_party/aom/aom_util/debug_util.h69
-rw-r--r--third_party/aom/aom_util/endian_inl.h122
-rw-r--r--third_party/aom/apps/aomdec.c1046
-rw-r--r--third_party/aom/apps/aomenc.c2391
-rw-r--r--third_party/aom/apps/aomenc.h62
-rw-r--r--third_party/aom/av1/av1.cmake469
-rw-r--r--third_party/aom/av1/av1_cx_iface.c1908
-rw-r--r--third_party/aom/av1/av1_dx_iface.c1328
-rw-r--r--third_party/aom/av1/av1_iface_common.h136
-rw-r--r--third_party/aom/av1/common/alloccommon.c300
-rw-r--r--third_party/aom/av1/common/alloccommon.h48
-rw-r--r--third_party/aom/av1/common/arm/av1_inv_txfm_neon.c3231
-rw-r--r--third_party/aom/av1/common/arm/av1_inv_txfm_neon.h154
-rw-r--r--third_party/aom/av1/common/arm/av1_txfm_neon.c28
-rw-r--r--third_party/aom/av1/common/arm/blend_a64_hmask_neon.c134
-rw-r--r--third_party/aom/av1/common/arm/blend_a64_vmask_neon.c141
-rw-r--r--third_party/aom/av1/common/arm/cfl_neon.c584
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon.c1455
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon.h228
-rw-r--r--third_party/aom/av1/common/arm/jnt_convolve_neon.c1740
-rw-r--r--third_party/aom/av1/common/arm/mem_neon.h494
-rw-r--r--third_party/aom/av1/common/arm/reconinter_neon.c86
-rw-r--r--third_party/aom/av1/common/arm/selfguided_neon.c1508
-rw-r--r--third_party/aom/av1/common/arm/transpose_neon.h537
-rw-r--r--third_party/aom/av1/common/arm/warp_plane_neon.c714
-rw-r--r--third_party/aom/av1/common/arm/wiener_convolve_neon.c530
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d.c1846
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d.h61
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d_cfg.h47
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm2d.c505
-rw-r--r--third_party/aom/av1/common/av1_loopfilter.c2377
-rw-r--r--third_party/aom/av1/common/av1_loopfilter.h227
-rw-r--r--third_party/aom/av1/common/av1_rtcd.c22
-rwxr-xr-xthird_party/aom/av1/common/av1_rtcd_defs.pl398
-rw-r--r--third_party/aom/av1/common/av1_txfm.c160
-rw-r--r--third_party/aom/av1/common/av1_txfm.h232
-rw-r--r--third_party/aom/av1/common/blockd.c140
-rw-r--r--third_party/aom/av1/common/blockd.h1176
-rw-r--r--third_party/aom/av1/common/cdef.c403
-rw-r--r--third_party/aom/av1/common/cdef.h51
-rw-r--r--third_party/aom/av1/common/cdef_block.c257
-rw-r--r--third_party/aom/av1/common/cdef_block.h59
-rw-r--r--third_party/aom/av1/common/cdef_block_avx2.c14
-rw-r--r--third_party/aom/av1/common/cdef_block_neon.c14
-rw-r--r--third_party/aom/av1/common/cdef_block_simd.h920
-rw-r--r--third_party/aom/av1/common/cdef_block_sse2.c14
-rw-r--r--third_party/aom/av1/common/cdef_block_sse4.c14
-rw-r--r--third_party/aom/av1/common/cdef_block_ssse3.c14
-rw-r--r--third_party/aom/av1/common/cfl.c448
-rw-r--r--third_party/aom/av1/common/cfl.h302
-rw-r--r--third_party/aom/av1/common/common.h63
-rw-r--r--third_party/aom/av1/common/common_data.h446
-rw-r--r--third_party/aom/av1/common/convolve.c1295
-rw-r--r--third_party/aom/av1/common/convolve.h125
-rw-r--r--third_party/aom/av1/common/debugmodes.c107
-rw-r--r--third_party/aom/av1/common/entropy.c178
-rw-r--r--third_party/aom/av1/common/entropy.h181
-rw-r--r--third_party/aom/av1/common/entropymode.c1103
-rw-r--r--third_party/aom/av1/common/entropymode.h212
-rw-r--r--third_party/aom/av1/common/entropymv.c67
-rw-r--r--third_party/aom/av1/common/entropymv.h104
-rw-r--r--third_party/aom/av1/common/enums.h619
-rw-r--r--third_party/aom/av1/common/filter.h214
-rw-r--r--third_party/aom/av1/common/frame_buffers.c91
-rw-r--r--third_party/aom/av1/common/frame_buffers.h60
-rw-r--r--third_party/aom/av1/common/idct.c322
-rw-r--r--third_party/aom/av1/common/idct.h67
-rw-r--r--third_party/aom/av1/common/mv.h301
-rw-r--r--third_party/aom/av1/common/mvref_common.c1523
-rw-r--r--third_party/aom/av1/common/mvref_common.h361
-rw-r--r--third_party/aom/av1/common/obmc.h91
-rw-r--r--third_party/aom/av1/common/obu_util.c147
-rw-r--r--third_party/aom/av1/common/obu_util.h47
-rw-r--r--third_party/aom/av1/common/odintrin.c541
-rw-r--r--third_party/aom/av1/common/odintrin.h96
-rw-r--r--third_party/aom/av1/common/onyxc_int.h1342
-rw-r--r--third_party/aom/av1/common/ppc/cfl_ppc.c152
-rw-r--r--third_party/aom/av1/common/pred_common.c501
-rw-r--r--third_party/aom/av1/common/pred_common.h360
-rw-r--r--third_party/aom/av1/common/quant_common.c13676
-rw-r--r--third_party/aom/av1/common/quant_common.h63
-rw-r--r--third_party/aom/av1/common/reconinter.c1162
-rw-r--r--third_party/aom/av1/common/reconinter.h365
-rw-r--r--third_party/aom/av1/common/reconintra.c1640
-rw-r--r--third_party/aom/av1/common/reconintra.h119
-rw-r--r--third_party/aom/av1/common/resize.c1280
-rw-r--r--third_party/aom/av1/common/resize.h112
-rw-r--r--third_party/aom/av1/common/restoration.c1556
-rw-r--r--third_party/aom/av1/common/restoration.h377
-rw-r--r--third_party/aom/av1/common/scale.c126
-rw-r--r--third_party/aom/av1/common/scale.h67
-rw-r--r--third_party/aom/av1/common/scan.c3735
-rw-r--r--third_party/aom/av1/common/scan.h55
-rw-r--r--third_party/aom/av1/common/seg_common.c84
-rw-r--r--third_party/aom/av1/common/seg_common.h104
-rw-r--r--third_party/aom/av1/common/thread_common.c786
-rw-r--r--third_party/aom/av1/common/thread_common.h119
-rw-r--r--third_party/aom/av1/common/tile_common.c207
-rw-r--r--third_party/aom/av1/common/tile_common.h72
-rw-r--r--third_party/aom/av1/common/timing.c79
-rw-r--r--third_party/aom/av1/common/timing.h59
-rw-r--r--third_party/aom/av1/common/token_cdfs.h3555
-rw-r--r--third_party/aom/av1/common/txb_common.c475
-rw-r--r--third_party/aom/av1/common/txb_common.h424
-rw-r--r--third_party/aom/av1/common/warped_motion.c1148
-rw-r--r--third_party/aom/av1/common/warped_motion.h95
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c228
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c499
-rw-r--r--third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c205
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c1945
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h71
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c2923
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h232
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse2.h317
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.c21
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.h72
-rw-r--r--third_party/aom/av1/common/x86/cfl_avx2.c491
-rw-r--r--third_party/aom/av1/common/x86/cfl_simd.h243
-rw-r--r--third_party/aom/av1/common/x86/cfl_sse2.c89
-rw-r--r--third_party/aom/av1/common/x86/cfl_ssse3.c393
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_avx2.c283
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_sse2.c472
-rw-r--r--third_party/aom/av1/common/x86/convolve_avx2.c277
-rw-r--r--third_party/aom/av1/common/x86/convolve_sse2.c338
-rw-r--r--third_party/aom/av1/common/x86/filterintra_sse4.c75
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c326
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c191
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c420
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c217
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c1349
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c5348
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c846
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c383
-rw-r--r--third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h125
-rw-r--r--third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c624
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c245
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c202
-rw-r--r--third_party/aom/av1/common/x86/intra_edge_sse4.c318
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_avx2.c633
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_sse2.c385
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_ssse3.c232
-rw-r--r--third_party/aom/av1/common/x86/reconinter_avx2.c620
-rw-r--r--third_party/aom/av1/common/x86/reconinter_sse4.c153
-rw-r--r--third_party/aom/av1/common/x86/reconinter_ssse3.c116
-rw-r--r--third_party/aom/av1/common/x86/selfguided_avx2.c724
-rw-r--r--third_party/aom/av1/common/x86/selfguided_sse4.c660
-rw-r--r--third_party/aom/av1/common/x86/warp_plane_sse4.c942
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_avx2.c261
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_sse2.c199
-rw-r--r--third_party/aom/av1/decoder/accounting.c138
-rw-r--r--third_party/aom/av1/decoder/accounting.h82
-rw-r--r--third_party/aom/av1/decoder/decodeframe.c5567
-rw-r--r--third_party/aom/av1/decoder/decodeframe.h85
-rw-r--r--third_party/aom/av1/decoder/decodemv.c1560
-rw-r--r--third_party/aom/av1/decoder/decodemv.h35
-rw-r--r--third_party/aom/av1/decoder/decoder.c575
-rw-r--r--third_party/aom/av1/decoder/decoder.h317
-rw-r--r--third_party/aom/av1/decoder/decodetxb.c362
-rw-r--r--third_party/aom/av1/decoder/decodetxb.h32
-rw-r--r--third_party/aom/av1/decoder/detokenize.c78
-rw-r--r--third_party/aom/av1/decoder/detokenize.h29
-rw-r--r--third_party/aom/av1/decoder/dthread.c192
-rw-r--r--third_party/aom/av1/decoder/dthread.h82
-rw-r--r--third_party/aom/av1/decoder/inspection.c117
-rw-r--r--third_party/aom/av1/decoder/inspection.h84
-rw-r--r--third_party/aom/av1/decoder/obu.c839
-rw-r--r--third_party/aom/av1/decoder/obu.h31
-rw-r--r--third_party/aom/av1/encoder/aq_complexity.c172
-rw-r--r--third_party/aom/av1/encoder/aq_complexity.h37
-rw-r--r--third_party/aom/av1/encoder/aq_cyclicrefresh.c580
-rw-r--r--third_party/aom/av1/encoder/aq_cyclicrefresh.h98
-rw-r--r--third_party/aom/av1/encoder/aq_variance.c202
-rw-r--r--third_party/aom/av1/encoder/aq_variance.h33
-rw-r--r--third_party/aom/av1/encoder/arm/neon/quantize_neon.c118
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d.c1885
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d.h49
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h19
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm2d.c431
-rw-r--r--third_party/aom/av1/encoder/av1_quantize.c738
-rw-r--r--third_party/aom/av1/encoder/av1_quantize.h148
-rw-r--r--third_party/aom/av1/encoder/bitstream.c3999
-rw-r--r--third_party/aom/av1/encoder/bitstream.h51
-rw-r--r--third_party/aom/av1/encoder/block.h452
-rw-r--r--third_party/aom/av1/encoder/blockiness.c142
-rw-r--r--third_party/aom/av1/encoder/context_tree.c215
-rw-r--r--third_party/aom/av1/encoder/context_tree.h114
-rw-r--r--third_party/aom/av1/encoder/corner_detect.c37
-rw-r--r--third_party/aom/av1/encoder/corner_detect.h22
-rw-r--r--third_party/aom/av1/encoder/corner_match.c191
-rw-r--r--third_party/aom/av1/encoder/corner_match.h33
-rw-r--r--third_party/aom/av1/encoder/cost.c46
-rw-r--r--third_party/aom/av1/encoder/cost.h47
-rw-r--r--third_party/aom/av1/encoder/dwt.c155
-rw-r--r--third_party/aom/av1/encoder/dwt.h25
-rw-r--r--third_party/aom/av1/encoder/encodeframe.c5739
-rw-r--r--third_party/aom/av1/encoder/encodeframe.h47
-rw-r--r--third_party/aom/av1/encoder/encodemb.c649
-rw-r--r--third_party/aom/av1/encoder/encodemb.h96
-rw-r--r--third_party/aom/av1/encoder/encodemv.c239
-rw-r--r--third_party/aom/av1/encoder/encodemv.h55
-rw-r--r--third_party/aom/av1/encoder/encoder.c6437
-rw-r--r--third_party/aom/av1/encoder/encoder.h985
-rw-r--r--third_party/aom/av1/encoder/encodetxb.c2062
-rw-r--r--third_party/aom/av1/encoder/encodetxb.h87
-rw-r--r--third_party/aom/av1/encoder/ethread.c261
-rw-r--r--third_party/aom/av1/encoder/ethread.h37
-rw-r--r--third_party/aom/av1/encoder/extend.c188
-rw-r--r--third_party/aom/av1/encoder/extend.h32
-rw-r--r--third_party/aom/av1/encoder/firstpass.c3480
-rw-r--r--third_party/aom/av1/encoder/firstpass.h208
-rw-r--r--third_party/aom/av1/encoder/global_motion.c298
-rw-r--r--third_party/aom/av1/encoder/global_motion.h64
-rw-r--r--third_party/aom/av1/encoder/grain_test_vectors.h781
-rw-r--r--third_party/aom/av1/encoder/hash.c125
-rw-r--r--third_party/aom/av1/encoder/hash.h52
-rw-r--r--third_party/aom/av1/encoder/hash_motion.c482
-rw-r--r--third_party/aom/av1/encoder/hash_motion.h78
-rw-r--r--third_party/aom/av1/encoder/hybrid_fwd_txfm.c390
-rw-r--r--third_party/aom/av1/encoder/hybrid_fwd_txfm.h31
-rw-r--r--third_party/aom/av1/encoder/k_means_template.h123
-rw-r--r--third_party/aom/av1/encoder/lookahead.c210
-rw-r--r--third_party/aom/av1/encoder/lookahead.h106
-rw-r--r--third_party/aom/av1/encoder/mathutils.h359
-rw-r--r--third_party/aom/av1/encoder/mbgraph.c401
-rw-r--r--third_party/aom/av1/encoder/mbgraph.h41
-rw-r--r--third_party/aom/av1/encoder/mcomp.c2885
-rw-r--r--third_party/aom/av1/encoder/mcomp.h161
-rw-r--r--third_party/aom/av1/encoder/mips/msa/error_msa.c109
-rw-r--r--third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c46
-rw-r--r--third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c285
-rw-r--r--third_party/aom/av1/encoder/ml.c73
-rw-r--r--third_party/aom/av1/encoder/ml.h49
-rw-r--r--third_party/aom/av1/encoder/palette.c154
-rw-r--r--third_party/aom/av1/encoder/palette.h96
-rw-r--r--third_party/aom/av1/encoder/partition_model_weights.h2448
-rw-r--r--third_party/aom/av1/encoder/pickcdef.c526
-rw-r--r--third_party/aom/av1/encoder/picklpf.c263
-rw-r--r--third_party/aom/av1/encoder/picklpf.h30
-rw-r--r--third_party/aom/av1/encoder/pickrst.c1362
-rw-r--r--third_party/aom/av1/encoder/pickrst.h46
-rw-r--r--third_party/aom/av1/encoder/pustats.h198
-rw-r--r--third_party/aom/av1/encoder/random.h29
-rw-r--r--third_party/aom/av1/encoder/ransac.c603
-rw-r--r--third_party/aom/av1/encoder/ransac.h35
-rw-r--r--third_party/aom/av1/encoder/rate_distortion_model_params.h591
-rw-r--r--third_party/aom/av1/encoder/ratectrl.c1776
-rw-r--r--third_party/aom/av1/encoder/ratectrl.h295
-rw-r--r--third_party/aom/av1/encoder/rd.c1512
-rw-r--r--third_party/aom/av1/encoder/rd.h464
-rw-r--r--third_party/aom/av1/encoder/rdopt.c12199
-rw-r--r--third_party/aom/av1/encoder/rdopt.h138
-rw-r--r--third_party/aom/av1/encoder/reconinter_enc.c627
-rw-r--r--third_party/aom/av1/encoder/reconinter_enc.h127
-rw-r--r--third_party/aom/av1/encoder/segmentation.c244
-rw-r--r--third_party/aom/av1/encoder/segmentation.h38
-rw-r--r--third_party/aom/av1/encoder/speed_features.c564
-rw-r--r--third_party/aom/av1/encoder/speed_features.h568
-rw-r--r--third_party/aom/av1/encoder/temporal_filter.c602
-rw-r--r--third_party/aom/av1/encoder/temporal_filter.h25
-rw-r--r--third_party/aom/av1/encoder/tokenize.c248
-rw-r--r--third_party/aom/av1/encoder/tokenize.h73
-rw-r--r--third_party/aom/av1/encoder/tx_prune_model_weights.h1944
-rw-r--r--third_party/aom/av1/encoder/wedge_utils.c125
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c1217
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c2068
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c365
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h103
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c2889
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h117
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c137
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c195
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_avx2.c330
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_sse2.c189
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm204
-rw-r--r--third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm222
-rw-r--r--third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h142
-rw-r--r--third_party/aom/av1/encoder/x86/corner_match_sse4.c103
-rw-r--r--third_party/aom/av1/encoder/x86/dct_sse2.asm82
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_avx2.c130
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_sse2.c505
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_sse4.c92
-rw-r--r--third_party/aom/av1/encoder/x86/error_intrin_avx2.c88
-rw-r--r--third_party/aom/av1/encoder/x86/error_sse2.asm79
-rw-r--r--third_party/aom/av1/encoder/x86/hash_sse42.c51
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c72
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c1783
-rw-r--r--third_party/aom/av1/encoder/x86/pickrst_avx2.c403
-rw-r--r--third_party/aom/av1/encoder/x86/pickrst_sse4.c389
-rw-r--r--third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm217
-rw-r--r--third_party/aom/av1/encoder/x86/wedge_utils_avx2.c215
-rw-r--r--third_party/aom/av1/encoder/x86/wedge_utils_sse2.c254
-rw-r--r--third_party/aom/av1/exports_com2
-rw-r--r--third_party/aom/av1/exports_dec3
-rw-r--r--third_party/aom/av1/exports_enc2
-rw-r--r--third_party/aom/av1/exports_test2
-rw-r--r--third_party/aom/build/cmake/aom_config.c.template13
-rw-r--r--third_party/aom/build/cmake/aom_config_defaults.cmake196
-rw-r--r--third_party/aom/build/cmake/aom_configure.cmake377
-rw-r--r--third_party/aom/build/cmake/aom_experiment_deps.cmake32
-rw-r--r--third_party/aom/build/cmake/aom_optimization.cmake212
-rw-r--r--third_party/aom/build/cmake/compiler_flags.cmake373
-rw-r--r--third_party/aom/build/cmake/compiler_tests.cmake175
-rw-r--r--third_party/aom/build/cmake/cpu.cmake93
-rw-r--r--third_party/aom/build/cmake/dist.cmake64
-rw-r--r--third_party/aom/build/cmake/exports.cmake65
-rw-r--r--third_party/aom/build/cmake/exports_sources.cmake32
-rw-r--r--third_party/aom/build/cmake/generate_aom_config_templates.cmake101
-rw-r--r--third_party/aom/build/cmake/generate_exports.cmake66
-rw-r--r--third_party/aom/build/cmake/ios-Info.plist37
-rwxr-xr-xthird_party/aom/build/cmake/iosbuild.sh384
-rw-r--r--third_party/aom/build/cmake/msvc_runtime.cmake37
-rw-r--r--third_party/aom/build/cmake/pkg_config.cmake58
-rwxr-xr-xthird_party/aom/build/cmake/rtcd.pl467
-rw-r--r--third_party/aom/build/cmake/sanitizers.cmake38
-rw-r--r--third_party/aom/build/cmake/toolchains/arm-ios-common.cmake26
-rw-r--r--third_party/aom/build/cmake/toolchains/arm64-ios.cmake23
-rw-r--r--third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake36
-rw-r--r--third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake32
-rw-r--r--third_party/aom/build/cmake/toolchains/armv7-ios.cmake31
-rw-r--r--third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake43
-rw-r--r--third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake32
-rw-r--r--third_party/aom/build/cmake/toolchains/armv7s-ios.cmake31
-rw-r--r--third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake23
-rw-r--r--third_party/aom/build/cmake/toolchains/mips32-linux-gcc.cmake77
-rw-r--r--third_party/aom/build/cmake/toolchains/mips64-linux-gcc.cmake54
-rw-r--r--third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake29
-rw-r--r--third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake28
-rw-r--r--third_party/aom/build/cmake/toolchains/x86-linux.cmake19
-rw-r--r--third_party/aom/build/cmake/toolchains/x86-macos.cmake18
-rw-r--r--third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake31
-rw-r--r--third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake25
-rw-r--r--third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake29
-rw-r--r--third_party/aom/build/cmake/util.cmake171
-rw-r--r--third_party/aom/build/cmake/version.cmake57
-rwxr-xr-xthird_party/aom/build/cmake/version.pl112
-rw-r--r--third_party/aom/codereview.settings4
-rw-r--r--third_party/aom/common/args.c297
-rw-r--r--third_party/aom/common/args.h68
-rw-r--r--third_party/aom/common/av1_config.c511
-rw-r--r--third_party/aom/common/av1_config.h86
-rw-r--r--third_party/aom/common/ivfdec.c110
-rw-r--r--third_party/aom/common/ivfdec.h30
-rw-r--r--third_party/aom/common/ivfenc.c52
-rw-r--r--third_party/aom/common/ivfenc.h34
-rw-r--r--third_party/aom/common/md5_utils.c249
-rw-r--r--third_party/aom/common/md5_utils.h49
-rw-r--r--third_party/aom/common/obudec.c448
-rw-r--r--third_party/aom/common/obudec.h48
-rw-r--r--third_party/aom/common/rawenc.c44
-rw-r--r--third_party/aom/common/rawenc.h32
-rw-r--r--third_party/aom/common/tools_common.c425
-rw-r--r--third_party/aom/common/tools_common.h164
-rw-r--r--third_party/aom/common/video_common.h25
-rw-r--r--third_party/aom/common/video_reader.c123
-rw-r--r--third_party/aom/common/video_reader.h57
-rw-r--r--third_party/aom/common/video_writer.c77
-rw-r--r--third_party/aom/common/video_writer.h45
-rw-r--r--third_party/aom/common/warnings.c97
-rw-r--r--third_party/aom/common/warnings.h34
-rw-r--r--third_party/aom/common/webmdec.cc229
-rw-r--r--third_party/aom/common/webmdec.h71
-rw-r--r--third_party/aom/common/webmenc.cc96
-rw-r--r--third_party/aom/common/webmenc.h56
-rw-r--r--third_party/aom/common/y4menc.c103
-rw-r--r--third_party/aom/common/y4menc.h39
-rw-r--r--third_party/aom/common/y4minput.c1142
-rw-r--r--third_party/aom/common/y4minput.h69
-rw-r--r--third_party/aom/docs.cmake251
-rw-r--r--third_party/aom/examples/analyzer.cc723
-rw-r--r--third_party/aom/examples/aom_cx_set_ref.c385
-rw-r--r--third_party/aom/examples/decode_to_md5.c131
-rw-r--r--third_party/aom/examples/decode_with_drops.c146
-rw-r--r--third_party/aom/examples/encoder_util.c136
-rw-r--r--third_party/aom/examples/encoder_util.h33
-rw-r--r--third_party/aom/examples/inspect.c763
-rw-r--r--third_party/aom/examples/lightfield_bitstream_parsing.c348
-rw-r--r--third_party/aom/examples/lightfield_decoder.c208
-rw-r--r--third_party/aom/examples/lightfield_encoder.c499
-rw-r--r--third_party/aom/examples/lightfield_tile_list_decoder.c161
-rw-r--r--third_party/aom/examples/lossless_encoder.c138
-rw-r--r--third_party/aom/examples/noise_model.c431
-rw-r--r--third_party/aom/examples/resize_util.c124
-rw-r--r--third_party/aom/examples/scalable_decoder.c185
-rw-r--r--third_party/aom/examples/scalable_encoder.c289
-rw-r--r--third_party/aom/examples/set_maps.c208
-rw-r--r--third_party/aom/examples/simple_decoder.c146
-rw-r--r--third_party/aom/examples/simple_encoder.c249
-rw-r--r--third_party/aom/examples/twopass_encoder.c250
-rw-r--r--third_party/aom/keywords.dox51
-rw-r--r--third_party/aom/libs.doxy_template1260
-rw-r--r--third_party/aom/mainpage.dox52
-rw-r--r--third_party/aom/stats/aomstats.c106
-rw-r--r--third_party/aom/stats/aomstats.h44
-rw-r--r--third_party/aom/stats/rate_hist.c271
-rw-r--r--third_party/aom/stats/rate_hist.h41
-rw-r--r--third_party/aom/test/accounting_test.cc75
-rw-r--r--third_party/aom/test/acm_random.h84
-rw-r--r--third_party/aom/test/active_map_test.cc103
-rw-r--r--third_party/aom/test/altref_test.cc97
-rw-r--r--third_party/aom/test/aom_integer_test.cc177
-rwxr-xr-xthird_party/aom/test/aomcx_set_ref.sh58
-rwxr-xr-xthird_party/aom/test/aomdec.sh147
-rwxr-xr-xthird_party/aom/test/aomenc.sh269
-rw-r--r--third_party/aom/test/aq_segment_test.cc108
-rw-r--r--third_party/aom/test/arf_freq_test.cc223
-rw-r--r--third_party/aom/test/av1_config_test.cc164
-rw-r--r--third_party/aom/test/av1_convolve_2d_test.cc249
-rw-r--r--third_party/aom/test/av1_convolve_2d_test_util.cc705
-rw-r--r--third_party/aom/test/av1_convolve_2d_test_util.h117
-rw-r--r--third_party/aom/test/av1_convolve_scale_test.cc529
-rw-r--r--third_party/aom/test/av1_encoder_parms_get_to_decoder.cc158
-rw-r--r--third_party/aom/test/av1_ext_tile_test.cc215
-rw-r--r--third_party/aom/test/av1_fwd_txfm1d_test.cc105
-rw-r--r--third_party/aom/test/av1_fwd_txfm2d_test.cc511
-rw-r--r--third_party/aom/test/av1_highbd_iht_test.cc315
-rw-r--r--third_party/aom/test/av1_horz_only_frame_superres_test.cc362
-rw-r--r--third_party/aom/test/av1_inv_txfm1d_test.cc157
-rw-r--r--third_party/aom/test/av1_inv_txfm2d_test.cc378
-rw-r--r--third_party/aom/test/av1_quantize_test.cc239
-rw-r--r--third_party/aom/test/av1_round_shift_array_test.cc129
-rw-r--r--third_party/aom/test/av1_txfm_test.cc371
-rw-r--r--third_party/aom/test/av1_txfm_test.h135
-rw-r--r--third_party/aom/test/av1_wedge_utils_test.cc390
-rwxr-xr-xthird_party/aom/test/best_encode.sh103
-rw-r--r--third_party/aom/test/binary_codes_test.cc83
-rw-r--r--third_party/aom/test/blend_a64_mask_1d_test.cc339
-rw-r--r--third_party/aom/test/blend_a64_mask_test.cc583
-rw-r--r--third_party/aom/test/blockd_test.cc122
-rw-r--r--third_party/aom/test/boolcoder_test.cc173
-rw-r--r--third_party/aom/test/borders_test.cc85
-rw-r--r--third_party/aom/test/cdef_test.cc425
-rw-r--r--third_party/aom/test/cfl_test.cc567
-rw-r--r--third_party/aom/test/clear_system_state.h31
-rw-r--r--third_party/aom/test/codec_factory.h170
-rw-r--r--third_party/aom/test/coding_path_sync.cc205
-rw-r--r--third_party/aom/test/comp_avg_pred_test.cc72
-rw-r--r--third_party/aom/test/comp_avg_pred_test.h555
-rw-r--r--third_party/aom/test/comp_mask_variance_test.cc574
-rw-r--r--third_party/aom/test/convolve_round_test.cc183
-rw-r--r--third_party/aom/test/convolve_test.cc856
-rw-r--r--third_party/aom/test/corner_match_test.cc100
-rw-r--r--third_party/aom/test/cpu_speed_test.cc180
-rw-r--r--third_party/aom/test/datarate_test.cc255
-rw-r--r--third_party/aom/test/decode_api_test.cc55
-rw-r--r--third_party/aom/test/decode_multithreaded_test.cc185
-rw-r--r--third_party/aom/test/decode_perf_test.cc246
-rw-r--r--third_party/aom/test/decode_test_driver.cc114
-rw-r--r--third_party/aom/test/decode_test_driver.h165
-rwxr-xr-xthird_party/aom/test/decode_to_md5.sh77
-rwxr-xr-xthird_party/aom/test/decode_with_drops.sh68
-rw-r--r--third_party/aom/test/divu_small_test.cc41
-rw-r--r--third_party/aom/test/dr_prediction_test.cc369
-rwxr-xr-xthird_party/aom/test/dump_obu.sh70
-rw-r--r--third_party/aom/test/ec_test.cc159
-rw-r--r--third_party/aom/test/encode_api_test.cc73
-rw-r--r--third_party/aom/test/encode_perf_test.cc188
-rw-r--r--third_party/aom/test/encode_test_driver.cc288
-rw-r--r--third_party/aom/test/encode_test_driver.h249
-rw-r--r--third_party/aom/test/encodetxb_test.cc262
-rw-r--r--third_party/aom/test/end_to_end_test.cc199
-rw-r--r--third_party/aom/test/error_block_test.cc171
-rw-r--r--third_party/aom/test/error_resilience_test.cc438
-rw-r--r--third_party/aom/test/ethread_test.cc273
-rwxr-xr-xthird_party/aom/test/examples.sh29
-rw-r--r--third_party/aom/test/external_frame_buffer_test.cc512
-rw-r--r--third_party/aom/test/fft_test.cc256
-rw-r--r--third_party/aom/test/film_grain_table_test.cc250
-rw-r--r--third_party/aom/test/filterintra_test.cc134
-rw-r--r--third_party/aom/test/frame_size_tests.cc78
-rw-r--r--third_party/aom/test/function_equivalence_test.h69
-rw-r--r--third_party/aom/test/fwht4x4_test.cc98
-rwxr-xr-xthird_party/aom/test/gviz_api.py1087
-rw-r--r--third_party/aom/test/hash_test.cc133
-rw-r--r--third_party/aom/test/hbd_metrics_test.cc239
-rw-r--r--third_party/aom/test/hiprec_convolve_test.cc62
-rw-r--r--third_party/aom/test/hiprec_convolve_test_util.cc331
-rw-r--r--third_party/aom/test/hiprec_convolve_test_util.h93
-rw-r--r--third_party/aom/test/horz_superres_test.cc322
-rw-r--r--third_party/aom/test/i420_video_source.h34
-rw-r--r--third_party/aom/test/intra_edge_test.cc337
-rw-r--r--third_party/aom/test/intrabc_test.cc168
-rw-r--r--third_party/aom/test/intrapred_test.cc266
-rw-r--r--third_party/aom/test/invalid_file_test.cc122
-rw-r--r--third_party/aom/test/ivf_video_source.h114
-rwxr-xr-xthird_party/aom/test/lightfield_test.sh98
-rw-r--r--third_party/aom/test/log2_test.cc50
-rw-r--r--third_party/aom/test/lossless_test.cc126
-rw-r--r--third_party/aom/test/lpf_test.cc627
-rw-r--r--third_party/aom/test/masked_sad_test.cc342
-rw-r--r--third_party/aom/test/masked_variance_test.cc459
-rw-r--r--third_party/aom/test/md5_helper.h76
-rw-r--r--third_party/aom/test/metrics_template.html422
-rw-r--r--third_party/aom/test/monochrome_test.cc130
-rw-r--r--third_party/aom/test/motion_vector_test.cc105
-rw-r--r--third_party/aom/test/noise_model_test.cc1343
-rw-r--r--third_party/aom/test/obmc_sad_test.cc237
-rw-r--r--third_party/aom/test/obmc_variance_test.cc345
-rw-r--r--third_party/aom/test/onyxc_int_test.cc22
-rw-r--r--third_party/aom/test/pickrst_test.cc187
-rw-r--r--third_party/aom/test/qm_test.cc81
-rw-r--r--third_party/aom/test/quantize_func_test.cc425
-rw-r--r--third_party/aom/test/reconinter_test.cc258
-rw-r--r--third_party/aom/test/register_state_check.h148
-rw-r--r--third_party/aom/test/resize_test.cc642
-rwxr-xr-xthird_party/aom/test/run_encodes.sh39
-rw-r--r--third_party/aom/test/sad_test.cc1528
-rw-r--r--third_party/aom/test/scalability_test.cc81
-rw-r--r--third_party/aom/test/scan_test.cc133
-rw-r--r--third_party/aom/test/segment_binarization_sync.cc61
-rw-r--r--third_party/aom/test/selfguided_filter_test.cc410
-rwxr-xr-xthird_party/aom/test/set_maps.sh52
-rw-r--r--third_party/aom/test/simd_avx2_test.cc15
-rw-r--r--third_party/aom/test/simd_cmp_avx2.cc15
-rw-r--r--third_party/aom/test/simd_cmp_impl.h2171
-rw-r--r--third_party/aom/test/simd_cmp_neon.cc17
-rw-r--r--third_party/aom/test/simd_cmp_sse2.cc18
-rw-r--r--third_party/aom/test/simd_cmp_sse4.cc18
-rw-r--r--third_party/aom/test/simd_cmp_ssse3.cc18
-rw-r--r--third_party/aom/test/simd_impl.h1141
-rw-r--r--third_party/aom/test/simd_neon_test.cc17
-rw-r--r--third_party/aom/test/simd_sse2_test.cc18
-rw-r--r--third_party/aom/test/simd_sse4_test.cc18
-rw-r--r--third_party/aom/test/simd_ssse3_test.cc18
-rwxr-xr-xthird_party/aom/test/simple_decoder.sh58
-rwxr-xr-xthird_party/aom/test/simple_encoder.sh53
-rw-r--r--third_party/aom/test/subtract_test.cc249
-rw-r--r--third_party/aom/test/sum_squares_test.cc228
-rw-r--r--third_party/aom/test/superframe_test.cc109
-rw-r--r--third_party/aom/test/test-data.sha1507
-rw-r--r--third_party/aom/test/test.cmake438
-rw-r--r--third_party/aom/test/test_data_download_worker.cmake46
-rw-r--r--third_party/aom/test/test_data_util.cmake598
-rw-r--r--third_party/aom/test/test_intra_pred_speed.cc1464
-rw-r--r--third_party/aom/test/test_libaom.cc74
-rw-r--r--third_party/aom/test/test_runner.cmake28
-rw-r--r--third_party/aom/test/test_vector_test.cc172
-rw-r--r--third_party/aom/test/test_vectors.cc140
-rw-r--r--third_party/aom/test/test_vectors.h26
-rw-r--r--third_party/aom/test/tile_independence_test.cc173
-rwxr-xr-xthird_party/aom/test/tools_common.sh477
-rw-r--r--third_party/aom/test/transform_test_base.h342
-rwxr-xr-xthird_party/aom/test/twopass_encoder.sh54
-rw-r--r--third_party/aom/test/util.h53
-rw-r--r--third_party/aom/test/variance_test.cc2064
-rw-r--r--third_party/aom/test/video_source.h259
-rwxr-xr-xthird_party/aom/test/visual_metrics.py466
-rw-r--r--third_party/aom/test/warp_filter_test.cc56
-rw-r--r--third_party/aom/test/warp_filter_test_util.cc480
-rw-r--r--third_party/aom/test/warp_filter_test_util.h103
-rw-r--r--third_party/aom/test/webm_video_source.h96
-rw-r--r--third_party/aom/test/wiener_test.cc280
-rw-r--r--third_party/aom/test/y4m_test.cc180
-rw-r--r--third_party/aom/test/y4m_video_source.h123
-rw-r--r--third_party/aom/test/yuv_video_source.h123
-rw-r--r--third_party/aom/third_party/fastfeat/LICENSE30
-rw-r--r--third_party/aom/third_party/fastfeat/README.libvpx39
-rw-r--r--third_party/aom/third_party/fastfeat/fast.c22
-rw-r--r--third_party/aom/third_party/fastfeat/fast.h20
-rw-r--r--third_party/aom/third_party/fastfeat/fast_9.c5911
-rw-r--r--third_party/aom/third_party/fastfeat/nonmax.c121
-rw-r--r--third_party/aom/third_party/googletest/README.libaom26
-rw-r--r--third_party/aom/third_party/googletest/gtest.mk1
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/CHANGES157
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt286
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/CONTRIBUTORS37
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/LICENSE28
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/README.md280
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake254
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h294
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h250
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h1444
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h.pump510
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h993
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h232
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h179
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h263
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h2236
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h358
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h58
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h69
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h42
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h41
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h319
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h206
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h1238
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-linked_ptr.h243
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h5146
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h.pump286
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h731
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h93
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h2567
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h167
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h1020
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h.pump347
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h3331
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h.pump297
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc48
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc1342
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc387
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h1183
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc1259
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc373
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc110
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc118
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest.cc5389
-rw-r--r--third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc38
-rw-r--r--third_party/aom/third_party/libwebm/AUTHORS.TXT4
-rw-r--r--third_party/aom/third_party/libwebm/Android.mk17
-rw-r--r--third_party/aom/third_party/libwebm/LICENSE.TXT30
-rw-r--r--third_party/aom/third_party/libwebm/PATENTS.TXT23
-rw-r--r--third_party/aom/third_party/libwebm/README.libaom22
-rw-r--r--third_party/aom/third_party/libwebm/common/file_util.cc93
-rw-r--r--third_party/aom/third_party/libwebm/common/file_util.h44
-rw-r--r--third_party/aom/third_party/libwebm/common/hdr_util.cc220
-rw-r--r--third_party/aom/third_party/libwebm/common/hdr_util.h71
-rw-r--r--third_party/aom/third_party/libwebm/common/webmids.h192
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc4194
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h1922
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h28
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc744
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h112
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc90
-rw-r--r--third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h51
-rw-r--r--third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc8049
-rw-r--r--third_party/aom/third_party/libwebm/mkvparser/mkvparser.h1145
-rw-r--r--third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc133
-rw-r--r--third_party/aom/third_party/libwebm/mkvparser/mkvreader.h45
-rw-r--r--third_party/aom/third_party/libyuv/README.libaom15
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/basic_types.h119
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/compare.h79
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/convert.h246
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h232
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/convert_from.h182
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h191
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h82
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h193
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h454
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/rotate.h118
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h34
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h139
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/row.h1857
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/scale.h104
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h58
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/scale_row.h479
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/version.h17
-rw-r--r--third_party/aom/third_party/libyuv/include/libyuv/video_common.h183
-rw-r--r--third_party/aom/third_party/libyuv/source/compare.cc373
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_common.cc42
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_gcc.cc152
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_neon.cc65
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_neon64.cc63
-rw-r--r--third_party/aom/third_party/libyuv/source/compare_win.cc229
-rw-r--r--third_party/aom/third_party/libyuv/source/convert.cc1389
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_argb.cc1155
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_from.cc1348
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_from_argb.cc1301
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_jpeg.cc392
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_to_argb.cc306
-rw-r--r--third_party/aom/third_party/libyuv/source/convert_to_i420.cc339
-rw-r--r--third_party/aom/third_party/libyuv/source/cpu_id.cc307
-rw-r--r--third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc572
-rw-r--r--third_party/aom/third_party/libyuv/source/mjpeg_validate.cc101
-rw-r--r--third_party/aom/third_party/libyuv/source/planar_functions.cc2555
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate.cc496
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_any.cc55
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_argb.cc205
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_common.cc92
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_gcc.cc493
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_mips.cc484
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_neon.cc535
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_neon64.cc543
-rw-r--r--third_party/aom/third_party/libyuv/source/rotate_win.cc248
-rw-r--r--third_party/aom/third_party/libyuv/source/row_any.cc680
-rw-r--r--third_party/aom/third_party/libyuv/source/row_common.cc2576
-rw-r--r--third_party/aom/third_party/libyuv/source/row_gcc.cc5475
-rw-r--r--third_party/aom/third_party/libyuv/source/row_mips.cc911
-rw-r--r--third_party/aom/third_party/libyuv/source/row_neon.cc3084
-rw-r--r--third_party/aom/third_party/libyuv/source/row_neon64.cc3087
-rw-r--r--third_party/aom/third_party/libyuv/source/row_win.cc6331
-rw-r--r--third_party/aom/third_party/libyuv/source/row_x86.asm146
-rw-r--r--third_party/aom/third_party/libyuv/source/scale.cc1689
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_any.cc200
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_argb.cc853
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_common.cc1137
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_gcc.cc1089
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_mips.cc654
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_neon.cc1037
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_neon64.cc1042
-rw-r--r--third_party/aom/third_party/libyuv/source/scale_win.cc1354
-rw-r--r--third_party/aom/third_party/libyuv/source/video_common.cc64
-rw-r--r--third_party/aom/third_party/libyuv/source/x86inc.asm1136
-rw-r--r--third_party/aom/third_party/vector/LICENSE19
-rw-r--r--third_party/aom/third_party/vector/README.libaom14
-rw-r--r--third_party/aom/third_party/vector/vector.c543
-rw-r--r--third_party/aom/third_party/vector/vector.h159
-rw-r--r--third_party/aom/third_party/x86inc/LICENSE18
-rw-r--r--third_party/aom/third_party/x86inc/README.libaom20
-rw-r--r--third_party/aom/third_party/x86inc/x86inc.asm1649
-rw-r--r--third_party/aom/tools/aggregate_entropy_stats.py39
-rw-r--r--third_party/aom/tools/aom_entropy_optimizer.c758
-rwxr-xr-xthird_party/aom/tools/cpplint.py4756
-rw-r--r--third_party/aom/tools/diff.py132
-rw-r--r--third_party/aom/tools/dump_obu.cc164
-rwxr-xr-xthird_party/aom/tools/gen_authors.sh10
-rwxr-xr-xthird_party/aom/tools/gen_constrained_tokenset.py120
-rw-r--r--third_party/aom/tools/inspect-cli.js39
-rw-r--r--third_party/aom/tools/inspect-post.js1
-rwxr-xr-xthird_party/aom/tools/intersect-diffs.py78
-rwxr-xr-xthird_party/aom/tools/lint-hunks.py146
-rw-r--r--third_party/aom/tools/obu_parser.cc190
-rw-r--r--third_party/aom/tools/obu_parser.h27
-rw-r--r--third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc580
-rw-r--r--third_party/aom/tools/txfm_analyzer/txfm_graph.cc943
-rw-r--r--third_party/aom/tools/txfm_analyzer/txfm_graph.h161
-rwxr-xr-xthird_party/aom/tools/wrap-commit-msg.py72
-rw-r--r--third_party/aom/usage.dox111
-rw-r--r--third_party/aom/usage_cx.dox9
-rw-r--r--third_party/aom/usage_dx.dox57
-rw-r--r--toolkit/components/build/nsToolkitCompsModule.cpp10
-rw-r--r--toolkit/components/reader/AboutReader.jsm1
-rw-r--r--toolkit/components/reader/JSDOMParser.js32
-rw-r--r--toolkit/components/reader/Readability.js214
-rw-r--r--toolkit/components/reader/ReaderMode.jsm2
-rw-r--r--toolkit/components/reader/ReaderWorker.js2
-rw-r--r--toolkit/components/search/moz.build6
-rw-r--r--toolkit/components/startup/moz.build21
-rw-r--r--toolkit/components/startup/public/moz.build8
-rw-r--r--toolkit/components/telemetry/Histograms.json103
-rw-r--r--toolkit/components/telemetry/histogram-whitelists.json12
-rw-r--r--toolkit/components/thumbnails/BackgroundPageThumbs.jsm6
-rw-r--r--toolkit/components/thumbnails/test/test_thumbnails_interfaces.js6
-rw-r--r--toolkit/content/license.html33
-rw-r--r--toolkit/content/widgets/browser.xml9
-rw-r--r--toolkit/moz.configure14
-rw-r--r--toolkit/mozapps/extensions/AddonManager.jsm223
-rw-r--r--toolkit/mozapps/extensions/LightweightThemeManager.jsm30
-rw-r--r--toolkit/mozapps/extensions/addonManager.js30
-rw-r--r--toolkit/mozapps/extensions/amInstallTrigger.js2
-rw-r--r--toolkit/mozapps/extensions/amWebInstallListener.js28
-rw-r--r--toolkit/themes/osx/global/notification.css4
-rw-r--r--tools/lint/eslint/modules.json1
-rw-r--r--tools/rewriting/ThirdPartyPaths.txt1
-rw-r--r--widget/cocoa/nsChildView.mm52
-rw-r--r--widget/windows/InProcessWinCompositorWidget.cpp25
-rw-r--r--widget/windows/InProcessWinCompositorWidget.h4
-rw-r--r--widget/windows/TSFTextStore.cpp10
-rw-r--r--xpcom/base/CycleCollectedJSContext.cpp1
-rw-r--r--xpcom/base/nsCycleCollector.cpp2
-rw-r--r--xpcom/glue/nsTArray-inl.h32
-rw-r--r--xpcom/glue/nsTArray.h126
-rw-r--r--xpcom/string/nsTSubstring.h83
1403 files changed, 508937 insertions, 19129 deletions
diff --git a/CLOBBER b/CLOBBER
index 76ebbb54b..12431db71 100644
--- a/CLOBBER
+++ b/CLOBBER
@@ -22,4 +22,4 @@
# changes to stick? As of bug 928195, this shouldn't be necessary! Please
# don't change CLOBBER for WebIDL changes any more.
-Clobber required for updating NSS to 3.38 (poly1305 symbol changes)
+Clobber for updating ffvpx to 4.0.2
diff --git a/application/basilisk/app/Makefile.in b/application/basilisk/app/Makefile.in
index 4a3b9758e..83d6cb36a 100644
--- a/application/basilisk/app/Makefile.in
+++ b/application/basilisk/app/Makefile.in
@@ -87,8 +87,10 @@ tools repackage:: $(DIST)/bin/$(MOZ_APP_NAME)
rsync -aL $(DIST)/bin/$(MOZ_APP_NAME) $(dist_dest)/Contents/MacOS
cp -RL $(DIST)/branding/firefox.icns $(dist_dest)/Contents/Resources/firefox.icns
cp -RL $(DIST)/branding/document.icns $(dist_dest)/Contents/Resources/document.icns
+ifdef MOZ_UPDATER
$(MKDIR) -p $(dist_dest)/Contents/Library/LaunchServices
mv -f $(dist_dest)/Contents/MacOS/updater.app/Contents/MacOS/org.mozilla.updater $(dist_dest)/Contents/Library/LaunchServices
ln -s ../../../../Library/LaunchServices/org.mozilla.updater $(dist_dest)/Contents/MacOS/updater.app/Contents/MacOS/org.mozilla.updater
+endif
printf APPLMOZB > $(dist_dest)/Contents/PkgInfo
endif
diff --git a/application/basilisk/app/profile/basilisk.js b/application/basilisk/app/profile/basilisk.js
index fd81e8204..eeec29eb9 100644
--- a/application/basilisk/app/profile/basilisk.js
+++ b/application/basilisk/app/profile/basilisk.js
@@ -1266,13 +1266,6 @@ pref("media.gmp-widevinecdm.enabled", true);
// -1 means no experiment is run and we use the preferred value for frecency (6h)
pref("browser.cache.frecency_experiment", 0);
-pref("browser.translation.detectLanguage", false);
-pref("browser.translation.neverForLanguages", "");
-// Show the translation UI bits, like the info bar, notification icon and preferences.
-pref("browser.translation.ui.show", false);
-// Allows defining the translation engine. Bing is the default; Yandex may optionally be switched on.
-pref("browser.translation.engine", "bing");
-
// Telemetry settings.
// Determines if Telemetry pings can be archived locally.
pref("toolkit.telemetry.archive.enabled", true);
diff --git a/application/basilisk/base/content/browser.css b/application/basilisk/base/content/browser.css
index e951985dc..517c1c5eb 100644
--- a/application/basilisk/base/content/browser.css
+++ b/application/basilisk/base/content/browser.css
@@ -933,11 +933,6 @@ html|*#gcli-output-frame,
transition: none;
}
-/* Translation */
-notification[value="translation"] {
- -moz-binding: url("chrome://browser/content/translation-infobar.xml#translationbar");
-}
-
/** See bug 872317 for why the following rule is necessary. */
#downloads-button {
diff --git a/application/basilisk/base/content/browser.js b/application/basilisk/base/content/browser.js
index 4f4ebb08f..d45956191 100644
--- a/application/basilisk/base/content/browser.js
+++ b/application/basilisk/base/content/browser.js
@@ -45,7 +45,6 @@ Cu.import("resource://gre/modules/NotificationDB.jsm");
["SitePermissions", "resource:///modules/SitePermissions.jsm"],
["TabCrashHandler", "resource:///modules/ContentCrashHandlers.jsm"],
["Task", "resource://gre/modules/Task.jsm"],
- ["Translation", "resource:///modules/translation/Translation.jsm"],
["UpdateUtils", "resource://gre/modules/UpdateUtils.jsm"],
["Weave", "resource://services-sync/main.js"],
["fxAccounts", "resource://gre/modules/FxAccounts.jsm"],
@@ -969,7 +968,6 @@ var gBrowserInit = {
// the listener is registered.
DOMLinkHandler.init();
gPageStyleMenu.init();
- LanguageDetectionListener.init();
BrowserOnClick.init();
FeedHandler.init();
DevEdition.init();
@@ -5680,16 +5678,6 @@ function setStyleDisabled(disabled) {
gPageStyleMenu.disableStyle();
}
-
-var LanguageDetectionListener = {
- init: function() {
- window.messageManager.addMessageListener("Translation:DocumentState", msg => {
- Translation.documentStateReceived(msg.target, msg.data);
- });
- }
-};
-
-
var BrowserOffline = {
_inited: false,
diff --git a/application/basilisk/base/content/browser.xul b/application/basilisk/base/content/browser.xul
index 74a90f5e0..3208538c1 100644
--- a/application/basilisk/base/content/browser.xul
+++ b/application/basilisk/base/content/browser.xul
@@ -675,10 +675,6 @@
tooltiptext="&urlbar.webRTCShareScreenNotificationAnchor.tooltip;"/>
<image id="servicesInstall-notification-icon" class="notification-anchor-icon service-icon" role="button"
tooltiptext="&urlbar.servicesNotificationAnchor.tooltip;"/>
- <image id="translate-notification-icon" class="notification-anchor-icon translation-icon" role="button"
- tooltiptext="&urlbar.translateNotificationAnchor.tooltip;"/>
- <image id="translated-notification-icon" class="notification-anchor-icon translation-icon in-use" role="button"
- tooltiptext="&urlbar.translatedNotificationAnchor.tooltip;"/>
<image id="eme-notification-icon" class="notification-anchor-icon drm-icon" role="button"
tooltiptext="&urlbar.emeNotificationAnchor.tooltip;"/>
</box>
diff --git a/application/basilisk/base/content/tab-content.js b/application/basilisk/base/content/tab-content.js
index 11a9fabce..6d053dd2b 100644
--- a/application/basilisk/base/content/tab-content.js
+++ b/application/basilisk/base/content/tab-content.js
@@ -558,13 +558,6 @@ var PageStyleHandler = {
};
PageStyleHandler.init();
-// Keep a reference to the translation content handler to avoid it being GC'ed.
-var trHandler = null;
-if (Services.prefs.getBoolPref("browser.translation.detectLanguage")) {
- Cu.import("resource:///modules/translation/TranslationContentHandler.jsm");
- trHandler = new TranslationContentHandler(global, docShell);
-}
-
function gKeywordURIFixup(fixupInfo) {
fixupInfo.QueryInterface(Ci.nsIURIFixupInfo);
if (!fixupInfo.consumer) {
diff --git a/application/basilisk/base/content/web-panels.xul b/application/basilisk/base/content/web-panels.xul
index 223b20ed7..ed868c24a 100644
--- a/application/basilisk/base/content/web-panels.xul
+++ b/application/basilisk/base/content/web-panels.xul
@@ -44,6 +44,21 @@
disabled="true"/>
<command id="Browser:Stop" oncommand="PanelBrowserStop();"/>
<command id="Browser:Reload" oncommand="PanelBrowserReload();"/>
+ <command id="Browser:BackOrBackDuplicate"
+ oncommand="getPanelBrowser().webNavigation.goBack(event);"
+ disabled="true">
+ <observes element="Browser:Back" attribute="disabled"/>
+ </command>
+ <command id="Browser:ForwardOrForwardDuplicate"
+ oncommand="getPanelBrowser().webNavigation.goForward(event);"
+ disabled="true">
+ <observes element="Browser:Forward" attribute="disabled"/>
+ </command>
+ <command id="Browser:ReloadOrDuplicate"
+ oncommand="PanelBrowserReload(event)"
+ disabled="true">
+ <observes element="Browser:Reload" attribute="disabled"/>
+ </command>
</commandset>
<popupset id="mainPopupSet">
diff --git a/application/basilisk/branding/official/branding.nsi b/application/basilisk/branding/official/branding.nsi
index 58d7554df..250abd84d 100644
--- a/application/basilisk/branding/official/branding.nsi
+++ b/application/basilisk/branding/official/branding.nsi
@@ -12,39 +12,5 @@
!define CompanyName "Moonchild Productions"
!define URLInfoAbout "https://www.basilisk-browser.org"
!define URLUpdateInfo "https://www.basilisk-browser.org/releasenotes.shtml"
-!define HelpLink "https://www.basilisk-browser.org"
-
-; The OFFICIAL define is a workaround to support different urls for Release and
-; Beta since they share the same branding when building with other branches that
-; set the update channel to beta.
-!define OFFICIAL
-!define URLStubDownload ""
-!define URLManualDownload ""
-!define URLSystemRequirements "http://www.basilisk-browser.org/requirements.shtml"
-!define Channel "release"
-
-# The installer's certificate name and issuer expected by the stub installer
-!define CertNameDownload ""
-!define CertIssuerDownload ""
-
-# Dialog units are used so the UI displays correctly with the system's DPI
-# settings.
-# The dialog units for the bitmap's dimensions should match exactly with the
-# bitmap's width and height in pixels.
-!define APPNAME_BMP_WIDTH_DU "134u"
-!define APPNAME_BMP_HEIGHT_DU "36u"
-!define INTRO_BLURB_WIDTH_DU "258u"
-!define INTRO_BLURB_EDGE_DU "170u"
-!define INTRO_BLURB_LTR_TOP_DU "20u"
-!define INTRO_BLURB_RTL_TOP_DU "12u"
-
-# UI Colors that can be customized for each channel
-!define FOOTER_CONTROL_TEXT_COLOR_NORMAL 0x000000
-!define FOOTER_CONTROL_TEXT_COLOR_FADED 0x666666
-!define FOOTER_BKGRD_COLOR 0xFFFFFF
-!define INTRO_BLURB_TEXT_COLOR 0x666666
-!define INSTALL_BLURB_TEXT_COLOR 0x666666
-!define INSTALL_PROGRESS_TEXT_COLOR_NORMAL 0x666666
-!define COMMON_TEXT_COLOR_NORMAL 0x000000
-!define COMMON_TEXT_COLOR_FADED 0x666666
-!define COMMON_BKGRD_COLOR 0xF0F0F0
+!define HelpLink "https://www.basilisk-browser.org/contact.shtml"
+!define URLSystemRequirements "https://www.basilisk-browser.org/requirements.shtml"
diff --git a/application/basilisk/branding/unofficial/branding.nsi b/application/basilisk/branding/unofficial/branding.nsi
index 77f08a4cb..586dd0074 100644
--- a/application/basilisk/branding/unofficial/branding.nsi
+++ b/application/basilisk/branding/unofficial/branding.nsi
@@ -10,36 +10,7 @@
# instead of BrandFullName and typically should not be modified.
!define BrandFullNameInternal "Serpent"
!define CompanyName "Moonchild Productions"
-!define URLInfoAbout "http://www.basilisk-browser.org"
+!define URLInfoAbout "https://www.basilisk-browser.org"
+!define URLUpdateInfo "https://www.basilisk-browser.org"
!define HelpLink "https://forum.palemoon.org"
-
-!define URLStubDownload ""
-!define URLManualDownload ""
-!define URLSystemRequirements ""
-!define Channel "unofficial"
-
-# The installer's certificate name and issuer expected by the stub installer
-!define CertNameDownload ""
-!define CertIssuerDownload ""
-
-# Dialog units are used so the UI displays correctly with the system's DPI
-# settings.
-# The dialog units for the bitmap's dimensions should match exactly with the
-# bitmap's width and height in pixels.
-!define APPNAME_BMP_WIDTH_DU 159u
-!define APPNAME_BMP_HEIGHT_DU 50u
-!define INTRO_BLURB_WIDTH_DU "230u"
-!define INTRO_BLURB_EDGE_DU "198u"
-!define INTRO_BLURB_LTR_TOP_DU "16u"
-!define INTRO_BLURB_RTL_TOP_DU "11u"
-
-# UI Colors that can be customized for each channel
-!define FOOTER_CONTROL_TEXT_COLOR_NORMAL 0x000000
-!define FOOTER_CONTROL_TEXT_COLOR_FADED 0x999999
-!define FOOTER_BKGRD_COLOR 0xFFFFFF
-!define INTRO_BLURB_TEXT_COLOR 0xFFFFFF
-!define INSTALL_BLURB_TEXT_COLOR 0xFFFFFF
-!define INSTALL_PROGRESS_TEXT_COLOR_NORMAL 0xFFFFFF
-!define COMMON_TEXT_COLOR_NORMAL 0xFFFFFF
-!define COMMON_TEXT_COLOR_FADED 0xA1AAB3
-!define COMMON_BKGRD_COLOR 0x0F1B26
+!define URLSystemRequirements "https://www.basilisk-browser.org"
diff --git a/application/basilisk/components/preferences/in-content/content.js b/application/basilisk/components/preferences/in-content/content.js
index a957b1dd5..2eac10ca4 100644
--- a/application/basilisk/components/preferences/in-content/content.js
+++ b/application/basilisk/components/preferences/in-content/content.js
@@ -31,18 +31,6 @@ var gContentPane = {
menulist.value = FontBuilder.readFontSelection(menulist);
}
- // Show translation preferences if we may:
- const prefName = "browser.translation.ui.show";
- if (Services.prefs.getBoolPref(prefName)) {
- let row = document.getElementById("translationBox");
- row.removeAttribute("hidden");
- // Showing attribution only for Bing Translator.
- Components.utils.import("resource:///modules/translation/Translation.jsm");
- if (Translation.translationEngine == "bing") {
- document.getElementById("bingAttribution").removeAttribute("hidden");
- }
- }
-
if (AlertsServiceDND) {
let notificationsDoNotDisturbRow =
document.getElementById("notificationsDoNotDisturbRow");
@@ -66,10 +54,6 @@ var gContentPane = {
gContentPane.configureColors);
setEventListener("chooseLanguage", "command",
gContentPane.showLanguages);
- setEventListener("translationAttributionImage", "click",
- gContentPane.openTranslationProviderAttribution);
- setEventListener("translateButton", "command",
- gContentPane.showTranslationExceptions);
setEventListener("notificationsDoNotDisturb", "command",
gContentPane.toggleDoNotDisturbNotifications);
@@ -274,21 +258,6 @@ var gContentPane = {
gSubDialog.open("chrome://browser/content/preferences/languages.xul");
},
- /**
- * Displays the translation exceptions dialog where specific site and language
- * translation preferences can be set.
- */
- showTranslationExceptions: function ()
- {
- gSubDialog.open("chrome://browser/content/preferences/translation.xul");
- },
-
- openTranslationProviderAttribution: function ()
- {
- Components.utils.import("resource:///modules/translation/Translation.jsm");
- Translation.openProviderAttribution();
- },
-
toggleDoNotDisturbNotifications: function (event)
{
AlertsServiceDND.manualDoNotDisturb = event.target.checked;
diff --git a/application/basilisk/components/preferences/in-content/content.xul b/application/basilisk/components/preferences/in-content/content.xul
index 9434cba62..fac864411 100644
--- a/application/basilisk/components/preferences/in-content/content.xul
+++ b/application/basilisk/components/preferences/in-content/content.xul
@@ -22,11 +22,6 @@
<preference id="font.language.group"
name="font.language.group"
type="wstring"/>
-
- <!-- Languages -->
- <preference id="browser.translation.detectLanguage"
- name="browser.translation.detectLanguage"
- type="bool"/>
</preferences>
<script type="application/javascript"
@@ -191,23 +186,4 @@
label="&chooseButton.label;"
accesskey="&chooseButton.accesskey;"/>
</hbox>
-
- <hbox id="translationBox" hidden="true">
- <hbox align="center" flex="1">
- <checkbox id="translate" preference="browser.translation.detectLanguage"
- label="&translateWebPages.label;." accesskey="&translateWebPages.accesskey;"
- onsyncfrompreference="return gContentPane.updateButtons('translateButton',
- 'browser.translation.detectLanguage');"/>
- <hbox id="bingAttribution" hidden="true">
- <label>&translation.options.attribution.beforeLogo;</label>
- <separator orient="vertical" class="thin"/>
- <image id="translationAttributionImage" aria-label="Microsoft Translator"
- src="chrome://browser/content/microsoft-translator-attribution.png"/>
- <separator orient="vertical" class="thin"/>
- <label>&translation.options.attribution.afterLogo;</label>
- </hbox>
- </hbox>
- <button id="translateButton" label="&translateExceptions.label;"
- accesskey="&translateExceptions.accesskey;"/>
- </hbox>
</groupbox>
diff --git a/application/basilisk/components/preferences/jar.mn b/application/basilisk/components/preferences/jar.mn
index d233c7865..5b24e89df 100644
--- a/application/basilisk/components/preferences/jar.mn
+++ b/application/basilisk/components/preferences/jar.mn
@@ -31,5 +31,3 @@ browser.jar:
content/browser/preferences/sanitize.js
content/browser/preferences/selectBookmark.xul
content/browser/preferences/selectBookmark.js
- content/browser/preferences/translation.xul
- content/browser/preferences/translation.js
diff --git a/application/basilisk/components/preferences/translation.js b/application/basilisk/components/preferences/translation.js
deleted file mode 100644
index cd570db0e..000000000
--- a/application/basilisk/components/preferences/translation.js
+++ /dev/null
@@ -1,255 +0,0 @@
-/* -*- indent-tabs-mode: nil; js-indent-level: 4 -*- */
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-"use strict";
-
-var {classes: Cc, interfaces: Ci, utils: Cu} = Components;
-
-Cu.import("resource://gre/modules/XPCOMUtils.jsm");
-Cu.import("resource://gre/modules/Services.jsm");
-
-XPCOMUtils.defineLazyGetter(this, "gLangBundle", () =>
- Services.strings.createBundle("chrome://global/locale/languageNames.properties"));
-
-const kPermissionType = "translate";
-const kLanguagesPref = "browser.translation.neverForLanguages";
-
-function Tree(aId, aData)
-{
- this._data = aData;
- this._tree = document.getElementById(aId);
- this._tree.view = this;
-}
-
-Tree.prototype = {
- get boxObject() {
- return this._tree.treeBoxObject;
- },
- get isEmpty() {
- return !this._data.length;
- },
- get hasSelection() {
- return this.selection.count > 0;
- },
- getSelectedItems: function() {
- let result = [];
-
- let rc = this.selection.getRangeCount();
- for (let i = 0; i < rc; ++i) {
- let min = {}, max = {};
- this.selection.getRangeAt(i, min, max);
- for (let j = min.value; j <= max.value; ++j)
- result.push(this._data[j]);
- }
-
- return result;
- },
-
- // nsITreeView implementation
- get rowCount() {
- return this._data.length;
- },
- getCellText: function (aRow, aColumn) {
- return this._data[aRow];
- },
- isSeparator: function(aIndex) {
- return false;
- },
- isSorted: function() {
- return false;
- },
- isContainer: function(aIndex) {
- return false;
- },
- setTree: function(aTree) {},
- getImageSrc: function(aRow, aColumn) {},
- getProgressMode: function(aRow, aColumn) {},
- getCellValue: function(aRow, aColumn) {},
- cycleHeader: function(column) {},
- getRowProperties: function(row) {
- return "";
- },
- getColumnProperties: function(column) {
- return "";
- },
- getCellProperties: function(row, column) {
- return "";
- },
- QueryInterface: XPCOMUtils.generateQI([Ci.nsITreeView])
-};
-
-function Lang(aCode)
-{
- this.langCode = aCode;
- this._label = gLangBundle.GetStringFromName(aCode);
-}
-
-Lang.prototype = {
- toString: function() {
- return this._label;
- }
-}
-
-var gTranslationExceptions = {
- onLoad: function() {
- if (this._siteTree) {
- // Re-using an open dialog, clear the old observers.
- this.uninit();
- }
-
- // Load site permissions into an array.
- this._sites = [];
- let enumerator = Services.perms.enumerator;
- while (enumerator.hasMoreElements()) {
- let perm = enumerator.getNext().QueryInterface(Ci.nsIPermission);
-
- if (perm.type == kPermissionType &&
- perm.capability == Services.perms.DENY_ACTION) {
- this._sites.push(perm.principal.origin);
- }
- }
- Services.obs.addObserver(this, "perm-changed", false);
- this._sites.sort();
-
- this._siteTree = new Tree("sitesTree", this._sites);
- this.onSiteSelected();
-
- this._langs = this.getLanguageExceptions();
- Services.prefs.addObserver(kLanguagesPref, this, false);
- this._langTree = new Tree("languagesTree", this._langs);
- this.onLanguageSelected();
- },
-
- // Get the list of languages we don't translate as an array.
- getLanguageExceptions: function() {
- let langs = Services.prefs.getCharPref(kLanguagesPref);
- if (!langs)
- return [];
-
- let result = langs.split(",").map(code => new Lang(code));
- result.sort();
-
- return result;
- },
-
- observe: function(aSubject, aTopic, aData) {
- if (aTopic == "perm-changed") {
- if (aData == "cleared") {
- if (!this._sites.length)
- return;
- let removed = this._sites.splice(0, this._sites.length);
- this._siteTree.boxObject.rowCountChanged(0, - removed.length);
- }
- else {
- let perm = aSubject.QueryInterface(Ci.nsIPermission);
- if (perm.type != kPermissionType)
- return;
-
- if (aData == "added") {
- if (perm.capability != Services.perms.DENY_ACTION)
- return;
- this._sites.push(perm.principal.origin);
- this._sites.sort();
- let boxObject = this._siteTree.boxObject;
- boxObject.rowCountChanged(0, 1);
- boxObject.invalidate();
- }
- else if (aData == "deleted") {
- let index = this._sites.indexOf(perm.principal.origin);
- if (index == -1)
- return;
- this._sites.splice(index, 1);
- this._siteTree.boxObject.rowCountChanged(index, -1);
- this.onSiteSelected();
- return;
- }
- }
- this.onSiteSelected();
- }
- else if (aTopic == "nsPref:changed") {
- this._langs = this.getLanguageExceptions();
- let change = this._langs.length - this._langTree.rowCount;
- this._langTree._data = this._langs;
- let boxObject = this._langTree.boxObject;
- if (change)
- boxObject.rowCountChanged(0, change);
- boxObject.invalidate();
- this.onLanguageSelected();
- }
- },
-
- _handleButtonDisabling: function(aTree, aIdPart) {
- let empty = aTree.isEmpty;
- document.getElementById("removeAll" + aIdPart + "s").disabled = empty;
- document.getElementById("remove" + aIdPart).disabled =
- empty || !aTree.hasSelection;
- },
-
- onLanguageSelected: function() {
- this._handleButtonDisabling(this._langTree, "Language");
- },
-
- onSiteSelected: function() {
- this._handleButtonDisabling(this._siteTree, "Site");
- },
-
- onLanguageDeleted: function() {
- let langs = Services.prefs.getCharPref(kLanguagesPref);
- if (!langs)
- return;
-
- let removed = this._langTree.getSelectedItems().map(l => l.langCode);
-
- langs = langs.split(",").filter(l => removed.indexOf(l) == -1);
- Services.prefs.setCharPref(kLanguagesPref, langs.join(","));
- },
-
- onAllLanguagesDeleted: function() {
- Services.prefs.setCharPref(kLanguagesPref, "");
- },
-
- onSiteDeleted: function() {
- let removedSites = this._siteTree.getSelectedItems();
- for (let origin of removedSites) {
- let principal = Services.scriptSecurityManager.createCodebasePrincipalFromOrigin(origin);
- Services.perms.removeFromPrincipal(principal, kPermissionType);
- }
- },
-
- onAllSitesDeleted: function() {
- if (this._siteTree.isEmpty)
- return;
-
- let removedSites = this._sites.splice(0, this._sites.length);
- this._siteTree.boxObject.rowCountChanged(0, -removedSites.length);
-
- for (let origin of removedSites) {
- let principal = Services.scriptSecurityManager.createCodebasePrincipalFromOrigin(origin);
- Services.perms.removeFromPrincipal(principal, kPermissionType);
- }
-
- this.onSiteSelected();
- },
-
- onSiteKeyPress: function(aEvent) {
- if (aEvent.keyCode == KeyEvent.DOM_VK_DELETE)
- this.onSiteDeleted();
- },
-
- onLanguageKeyPress: function(aEvent) {
- if (aEvent.keyCode == KeyEvent.DOM_VK_DELETE)
- this.onLanguageDeleted();
- },
-
- onWindowKeyPress: function(aEvent) {
- if (aEvent.keyCode == KeyEvent.DOM_VK_ESCAPE)
- window.close();
- },
-
- uninit: function() {
- Services.obs.removeObserver(this, "perm-changed");
- Services.prefs.removeObserver(kLanguagesPref, this);
- }
-};
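The dialog logic above only removes entries; sites end up in the "translate" DENY list through the translation UI itself, which is not shown in this hunk. A minimal sketch of that adding side, under the assumption that it used the standard permission-manager API; the helper name is hypothetical:

    // Sketch only; not part of the deleted file. Adds a site to the "translate"
    // exception list that gTranslationExceptions enumerates on load.
    Components.utils.import("resource://gre/modules/Services.jsm");

    function neverTranslateSite(origin) {
      let principal =
        Services.scriptSecurityManager.createCodebasePrincipalFromOrigin(origin);
      Services.perms.addFromPrincipal(principal, "translate",
                                      Services.perms.DENY_ACTION);
    }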
diff --git a/application/basilisk/components/preferences/translation.xul b/application/basilisk/components/preferences/translation.xul
deleted file mode 100644
index b5dfd1b9b..000000000
--- a/application/basilisk/components/preferences/translation.xul
+++ /dev/null
@@ -1,88 +0,0 @@
-<?xml version="1.0"?>
-
-<!-- This Source Code Form is subject to the terms of the Mozilla Public
- - License, v. 2.0. If a copy of the MPL was not distributed with this
- - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
-
-<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
-<?xml-stylesheet href="chrome://browser/skin/preferences/preferences.css" type="text/css"?>
-
-<!DOCTYPE dialog SYSTEM "chrome://browser/locale/preferences/translation.dtd">
-
-<window id="TranslationDialog" class="windowDialog"
- windowtype="Browser:TranslationExceptions"
- title="&window.title;"
- xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
- style="width: &window.width;;"
- onload="gTranslationExceptions.onLoad();"
- onunload="gTranslationExceptions.uninit();"
- persist="screenX screenY width height"
- onkeypress="gTranslationExceptions.onWindowKeyPress(event);">
-
- <script src="chrome://browser/content/preferences/translation.js"/>
-
- <stringbundle id="bundlePreferences"
- src="chrome://browser/locale/preferences/preferences.properties"/>
-
- <keyset>
- <key key="&windowClose.key;" modifiers="accel" oncommand="window.close();"/>
- </keyset>
-
- <vbox class="largeDialogContainer">
- <vbox class="contentPane" flex="1">
- <label id="languagesLabel" control="permissionsTree">&noTranslationForLanguages.label;</label>
- <separator class="thin"/>
- <tree id="languagesTree" flex="1" style="height: 12em;"
- hidecolumnpicker="true"
- onkeypress="gTranslationExceptions.onLanguageKeyPress(event)"
- onselect="gTranslationExceptions.onLanguageSelected();">
- <treecols>
- <treecol id="languageCol" label="&treehead.languageName.label;" flex="1"/>
- </treecols>
- <treechildren/>
- </tree>
- </vbox>
- <hbox align="end">
- <hbox class="actionButtons" flex="1">
- <button id="removeLanguage" disabled="true"
- accesskey="&removeLanguage.accesskey;"
- icon="remove" label="&removeLanguage.label;"
- oncommand="gTranslationExceptions.onLanguageDeleted();"/>
- <button id="removeAllLanguages"
- icon="clear" label="&removeAllLanguages.label;"
- accesskey="&removeAllLanguages.accesskey;"
- oncommand="gTranslationExceptions.onAllLanguagesDeleted();"/>
- <spacer flex="1"/>
- </hbox>
- </hbox>
- <separator/>
- <vbox class="contentPane" flex="1">
- <label id="languagesLabel" control="permissionsTree">&noTranslationForSites.label;</label>
- <separator class="thin"/>
- <tree id="sitesTree" flex="1" style="height: 12em;"
- hidecolumnpicker="true"
- onkeypress="gTranslationExceptions.onSiteKeyPress(event)"
- onselect="gTranslationExceptions.onSiteSelected();">
- <treecols>
- <treecol id="siteCol" label="&treehead.siteName.label;" flex="1"/>
- </treecols>
- <treechildren/>
- </tree>
- </vbox>
- </vbox>
- <hbox align="end">
- <hbox class="actionButtons" flex="1">
- <button id="removeSite" disabled="true"
- accesskey="&removeSite.accesskey;"
- icon="remove" label="&removeSite.label;"
- oncommand="gTranslationExceptions.onSiteDeleted();"/>
- <button id="removeAllSites"
- icon="clear" label="&removeAllSites.label;"
- accesskey="&removeAllSites.accesskey;"
- oncommand="gTranslationExceptions.onAllSitesDeleted();"/>
- <spacer flex="1"/>
- <button oncommand="close();" icon="close"
- label="&button.close.label;" accesskey="&button.close.accesskey;"/>
- </hbox>
- </hbox>
-</window>
diff --git a/application/basilisk/components/sessionstore/SessionStorage.jsm b/application/basilisk/components/sessionstore/SessionStorage.jsm
index 705139ebf..7499f95e9 100644
--- a/application/basilisk/components/sessionstore/SessionStorage.jsm
+++ b/application/basilisk/components/sessionstore/SessionStorage.jsm
@@ -74,7 +74,14 @@ var SessionStorageInternal = {
// Get the origin of the current history entry
// and use that as a key for the per-principal storage data.
- let origin = principal.origin;
+ let origin;
+ try {
+ // The origin getter may throw for about:blank iframes as of bug 1340710,
+ // but we should ignore them anyway. The same goes for custom protocols.
+ origin = principal.origin;
+ } catch (e) {
+ return;
+ }
if (visitedOrigins.has(origin)) {
// Don't read a host twice.
return;
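The hunk above guards the origin getter because it can throw for about:blank iframes and custom protocols, and such principals are simply skipped. The same guard expressed as a small standalone helper, with a hypothetical name, purely to illustrate the pattern:

    // Sketch only; tryGetOrigin is not part of the patch.
    function tryGetOrigin(principal) {
      try {
        return principal.origin;
      } catch (e) {
        // Matches the patch: principals whose origin can't be computed are ignored.
        return null;
      }
    }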
diff --git a/application/basilisk/components/translation/BingTranslator.jsm b/application/basilisk/components/translation/BingTranslator.jsm
deleted file mode 100644
index fc1cc942a..000000000
--- a/application/basilisk/components/translation/BingTranslator.jsm
+++ /dev/null
@@ -1,449 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-"use strict";
-
-const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
-
-this.EXPORTED_SYMBOLS = [ "BingTranslator" ];
-
-Cu.import("resource://gre/modules/Services.jsm");
-Cu.import("resource://gre/modules/Log.jsm");
-Cu.import("resource://gre/modules/Promise.jsm");
-Cu.import("resource://gre/modules/Task.jsm");
-Cu.import("resource://services-common/utils.js");
-Cu.import("resource://gre/modules/Http.jsm");
-
-// The maximum amount of net data allowed per request on Bing's API.
-const MAX_REQUEST_DATA = 5000; // Documentation says 10000 but anywhere
- // close to that is refused by the service.
-
-// The maximum number of chunks allowed to be translated in a single
-// request.
-const MAX_REQUEST_CHUNKS = 1000; // Documentation says 2000.
-
-// Self-imposed limit of 15 requests. This means that a page that would need
-// to be broken into more than 15 requests won't be fully translated.
-// The maximum amount of data that we will translate for a single page
-// is MAX_REQUESTS * MAX_REQUEST_DATA.
-const MAX_REQUESTS = 15;
-
-/**
- * Translates a webpage using Bing's Translation API.
- *
- * @param translationDocument The TranslationDocument object that represents
- * the webpage to be translated
- * @param sourceLanguage The source language of the document
- * @param targetLanguage The target language for the translation
- *
- * @returns {Promise} A promise that will resolve when the translation
- * task is finished.
- */
-this.BingTranslator = function(translationDocument, sourceLanguage, targetLanguage) {
- this.translationDocument = translationDocument;
- this.sourceLanguage = sourceLanguage;
- this.targetLanguage = targetLanguage;
- this._pendingRequests = 0;
- this._partialSuccess = false;
- this._serviceUnavailable = false;
- this._translatedCharacterCount = 0;
-};
-
-this.BingTranslator.prototype = {
- /**
- * Performs the translation, splitting the document into several chunks
- * respecting the data limits of the API.
- *
- * @returns {Promise} A promise that will resolve when the translation
- * task is finished.
- */
- translate: function() {
- return Task.spawn(function *() {
- let currentIndex = 0;
- this._onFinishedDeferred = Promise.defer();
-
- // Let's split the document into various requests to be sent to
- // Bing's Translation API.
- for (let requestCount = 0; requestCount < MAX_REQUESTS; requestCount++) {
- // Generating the text for each request can be expensive, so
- // let's take the opportunity of the chunkification process to
- // allow for the event loop to attend other pending events
- // before we continue.
- yield CommonUtils.laterTickResolvingPromise();
-
- // Determine the data for the next request.
- let request = this._generateNextTranslationRequest(currentIndex);
-
- // Create a real request to the server, and put it on the
- // pending requests list.
- let bingRequest = new BingRequest(request.data,
- this.sourceLanguage,
- this.targetLanguage);
- this._pendingRequests++;
- bingRequest.fireRequest().then(this._chunkCompleted.bind(this),
- this._chunkFailed.bind(this));
-
- currentIndex = request.lastIndex;
- if (request.finished) {
- break;
- }
- }
-
- return this._onFinishedDeferred.promise;
- }.bind(this));
- },
-
- /**
- * Resets the expiration time of the current token, in order to
- * force the token manager to ask for a new token during the next request.
- */
- _resetToken : function() {
-    // Force the token manager to fetch an updated token
- BingTokenManager._currentExpiryTime = 0;
- },
-
- /**
- * Function called when a request sent to the server completed successfully.
- * This function handles calling the function to parse the result and the
- * function to resolve the promise returned by the public `translate()`
- * method when there's no pending request left.
- *
-   * @param bingRequest The BingRequest sent to the server.
- */
- _chunkCompleted: function(bingRequest) {
- if (this._parseChunkResult(bingRequest)) {
- this._partialSuccess = true;
- // Count the number of characters successfully translated.
- this._translatedCharacterCount += bingRequest.characterCount;
- }
-
- this._checkIfFinished();
- },
-
- /**
- * Function called when a request sent to the server has failed.
- * This function handles deciding if the error is transient or means the
- * service is unavailable (zero balance on the key or request credentials are
- * not in an active state) and calling the function to resolve the promise
- * returned by the public `translate()` method when there's no pending.
-   * returned by the public `translate()` method when there's no pending
- *
- * @param aError [optional] The XHR object of the request that failed.
- */
- _chunkFailed: function(aError) {
- if (aError instanceof Ci.nsIXMLHttpRequest &&
- [400, 401].indexOf(aError.status) != -1) {
- let body = aError.responseText;
- if (body && body.includes("TranslateApiException") &&
- (body.includes("balance") || body.includes("active state")))
- this._serviceUnavailable = true;
- }
-
- this._checkIfFinished();
- },
-
- /**
- * Function called when a request sent to the server has completed.
- * This function handles resolving the promise
- * returned by the public `translate()` method when all chunks are completed.
- */
- _checkIfFinished: function() {
- // Check if all pending requests have been
- // completed and then resolves the promise.
- // If at least one chunk was successful, the
- // promise will be resolved positively which will
- // display the "Success" state for the infobar. Otherwise,
- // the "Error" state will appear.
- if (--this._pendingRequests == 0) {
- if (this._partialSuccess) {
- this._onFinishedDeferred.resolve({
- characterCount: this._translatedCharacterCount
- });
- } else {
- let error = this._serviceUnavailable ? "unavailable" : "failure";
- this._onFinishedDeferred.reject(error);
- }
- }
- },
-
- /**
- * This function parses the result returned by Bing's Http.svc API,
- * which is a XML file that contains a number of elements. To our
- * particular interest, the only part of the response that matters
- * are the <TranslatedText> nodes, which contains the resulting
- * items that were sent to be translated.
- *
-   * @param bingRequest The request sent to the server.
- * @returns boolean True if parsing of this chunk was successful.
- */
- _parseChunkResult: function(bingRequest) {
- let results;
- try {
- let doc = bingRequest.networkRequest.responseXML;
- results = doc.querySelectorAll("TranslatedText");
- } catch (e) {
- return false;
- }
-
- let len = results.length;
- if (len != bingRequest.translationData.length) {
- // This should never happen, but if the service returns a different number
- // of items (from the number of items submitted), we can't use this chunk
- // because all items would be paired incorrectly.
- return false;
- }
-
- let error = false;
- for (let i = 0; i < len; i++) {
- try {
- let result = results[i].firstChild.nodeValue;
- let root = bingRequest.translationData[i][0];
-
- if (root.isSimpleRoot) {
- // Workaround for Bing's service problem in which "&" chars in
- // plain-text TranslationItems are double-escaped.
- result = result.replace(/&amp;/g, "&");
- }
-
- root.parseResult(result);
- } catch (e) { error = true; }
- }
-
- return !error;
- },
-
- /**
-   * Determines the data to be used for the Nth request we are
-   * generating, based on the input params.
-   *
-   * @param startIndex The index in the roots list at which this
-   *                   chunk should start.
- */
- _generateNextTranslationRequest: function(startIndex) {
- let currentDataSize = 0;
- let currentChunks = 0;
- let output = [];
- let rootsList = this.translationDocument.roots;
-
- for (let i = startIndex; i < rootsList.length; i++) {
- let root = rootsList[i];
- let text = this.translationDocument.generateTextForItem(root);
- if (!text) {
- continue;
- }
-
- text = escapeXML(text);
- let newCurSize = currentDataSize + text.length;
- let newChunks = currentChunks + 1;
-
- if (newCurSize > MAX_REQUEST_DATA ||
- newChunks > MAX_REQUEST_CHUNKS) {
-
- // If we've reached the API limits, let's stop accumulating data
- // for this request and return. We return information useful for
- // the caller to pass back on the next call, so that the function
- // can keep working from where it stopped.
- return {
- data: output,
- finished: false,
- lastIndex: i
- };
- }
-
- currentDataSize = newCurSize;
- currentChunks = newChunks;
- output.push([root, text]);
- }
-
- return {
- data: output,
- finished: true,
- lastIndex: 0
- };
- }
-};
-
-/**
- * Represents a request (for 1 chunk) sent off to Bing's service.
- *
- * @param  translationData The data to be used for this translation,
- * generated by the generateNextTranslationRequest...
- * function.
- * @param sourceLanguage The source language of the document.
- * @param targetLanguage The target language for the translation.
- *
- */
-function BingRequest(translationData, sourceLanguage, targetLanguage) {
- this.translationData = translationData;
- this.sourceLanguage = sourceLanguage;
- this.targetLanguage = targetLanguage;
- this.characterCount = 0;
-}
-
-BingRequest.prototype = {
- /**
- * Initiates the request
- */
- fireRequest: function() {
- return Task.spawn(function *() {
- // Prepare authentication.
- let token = yield BingTokenManager.getToken();
- let auth = "Bearer " + token;
-
- // Prepare URL.
- let url = getUrlParam("https://api.microsofttranslator.com/v2/Http.svc/TranslateArray",
- "browser.translation.bing.translateArrayURL");
-
- // Prepare request headers.
- let headers = [["Content-type", "text/xml"], ["Authorization", auth]];
-
- // Prepare the request body.
- let requestString =
- '<TranslateArrayRequest>' +
- '<AppId/>' +
- '<From>' + this.sourceLanguage + '</From>' +
- '<Options>' +
- '<ContentType xmlns="http://schemas.datacontract.org/2004/07/Microsoft.MT.Web.Service.V2">text/html</ContentType>' +
- '<ReservedFlags xmlns="http://schemas.datacontract.org/2004/07/Microsoft.MT.Web.Service.V2" />' +
- '</Options>' +
- '<Texts xmlns:s="http://schemas.microsoft.com/2003/10/Serialization/Arrays">';
-
- for (let [, text] of this.translationData) {
- requestString += '<s:string>' + text + '</s:string>';
- this.characterCount += text.length;
- }
-
- requestString += '</Texts>' +
- '<To>' + this.targetLanguage + '</To>' +
- '</TranslateArrayRequest>';
-
- // Set up request options.
- let deferred = Promise.defer();
- let options = {
- onLoad: (function(responseText, xhr) {
- deferred.resolve(this);
- }).bind(this),
- onError: function(e, responseText, xhr) {
- deferred.reject(xhr);
- },
- postData: requestString,
- headers: headers
- };
-
- // Fire the request.
- let request = httpRequest(url, options);
-
- // Override the response MIME type.
- request.overrideMimeType("text/xml");
- this.networkRequest = request;
- return deferred.promise;
- }.bind(this));
- }
-};
-
-/**
- * Authentication Token manager for the API
- */
-var BingTokenManager = {
- _currentToken: null,
- _currentExpiryTime: 0,
- _pendingRequest: null,
-
- /**
- * Get a valid, non-expired token to be used for the API calls.
- *
- * @returns {Promise} A promise that resolves with the token
- * string once it is obtained. The token returned
- * can be the same one used in the past if it is still
- * valid.
- */
- getToken: function() {
- if (this._pendingRequest) {
- return this._pendingRequest;
- }
-
- let remainingMs = this._currentExpiryTime - new Date();
- // Our existing token is still good for more than a minute, let's use it.
- if (remainingMs > 60 * 1000) {
- return Promise.resolve(this._currentToken);
- }
-
- return this._getNewToken();
- },
-
- /**
- * Generates a new token from the server.
- *
- * @returns {Promise} A promise that resolves with the token
- * string once it is obtained.
- */
- _getNewToken: function() {
- let url = getUrlParam("https://datamarket.accesscontrol.windows.net/v2/OAuth2-13",
- "browser.translation.bing.authURL");
- let params = [
- ["grant_type", "client_credentials"],
- ["scope", "http://api.microsofttranslator.com"],
- ["client_id",
- getUrlParam("%BING_API_CLIENTID%", "browser.translation.bing.clientIdOverride")],
- ["client_secret",
- getUrlParam("%BING_API_KEY%", "browser.translation.bing.apiKeyOverride")]
- ];
-
- let deferred = Promise.defer();
- let options = {
- onLoad: function(responseText, xhr) {
- BingTokenManager._pendingRequest = null;
- try {
- let json = JSON.parse(responseText);
-
- if (json.error) {
- deferred.reject(json.error);
- return;
- }
-
- let token = json.access_token;
- let expires_in = json.expires_in;
- BingTokenManager._currentToken = token;
- BingTokenManager._currentExpiryTime = new Date(Date.now() + expires_in * 1000);
- deferred.resolve(token);
- } catch (e) {
- deferred.reject(e);
- }
- },
- onError: function(e, responseText, xhr) {
- BingTokenManager._pendingRequest = null;
- deferred.reject(e);
- },
- postData: params
- };
-
- this._pendingRequest = deferred.promise;
- httpRequest(url, options);
-
- return deferred.promise;
- }
-};
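The reuse-or-refresh decision in getToken() above boils down to a timestamp comparison. A minimal sketch of that check, using a made-up expiry time in place of _currentExpiryTime (illustration only, not part of the deleted module):

let currentExpiryTime = new Date(Date.now() + 10 * 60 * 1000); // pretend: 10 minutes left
let remainingMs = currentExpiryTime - new Date();
if (remainingMs > 60 * 1000) {
  // Still valid for more than a minute: reuse the cached token.
} else {
  // Missing, expired, or about to expire: request a fresh token.
}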
-
-/**
- * Escape a string to be valid XML content.
- */
-function escapeXML(aStr) {
- return aStr.toString()
- .replace(/&/g, "&amp;")
- .replace(/\"/g, "&quot;")
- .replace(/\'/g, "&apos;")
- .replace(/</g, "&lt;")
- .replace(/>/g, "&gt;");
-}
-
-/**
- * Fetch a parameter value (a URL, the client ID or the client secret), which
- * may be overridden by a pref if it's set.
- */
-function getUrlParam(paramValue, prefName) {
- if (Services.prefs.getPrefType(prefName))
- paramValue = Services.prefs.getCharPref(prefName);
- paramValue = Services.urlFormatter.formatURL(paramValue);
- return paramValue;
-}
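For illustration, this is what the escapeXML() helper defined above produces for a made-up chunk string before it is embedded in the <s:string> elements of the TranslateArray body (escapeXML is assumed to be in scope from the module above; the sample text is purely hypothetical):

let text = "<b id=n2>Fish & Chips</b>";
let escaped = escapeXML(text);
// escaped == "&lt;b id=n2&gt;Fish &amp; Chips&lt;/b&gt;"
// (For simple plain-text roots, _parseChunkResult() above additionally undoes a
// double-escaped "&amp;" returned by the service, per the workaround comment there.)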
diff --git a/application/basilisk/components/translation/Translation.jsm b/application/basilisk/components/translation/Translation.jsm
deleted file mode 100644
index 15a847c13..000000000
--- a/application/basilisk/components/translation/Translation.jsm
+++ /dev/null
@@ -1,446 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-"use strict";
-
-this.EXPORTED_SYMBOLS = [
- "Translation",
- "TranslationTelemetry",
-];
-
-const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
-
-const TRANSLATION_PREF_SHOWUI = "browser.translation.ui.show";
-const TRANSLATION_PREF_DETECT_LANG = "browser.translation.detectLanguage";
-
-Cu.import("resource://gre/modules/Services.jsm");
-Cu.import("resource://gre/modules/Promise.jsm");
-Cu.import("resource://gre/modules/Task.jsm", this);
-
-this.Translation = {
- STATE_OFFER: 0,
- STATE_TRANSLATING: 1,
- STATE_TRANSLATED: 2,
- STATE_ERROR: 3,
- STATE_UNAVAILABLE: 4,
-
- serviceUnavailable: false,
-
- supportedSourceLanguages: ["bg", "cs", "de", "en", "es", "fr", "ja", "ko", "nl", "no", "pl", "pt", "ru", "tr", "vi", "zh"],
- supportedTargetLanguages: ["bg", "cs", "de", "en", "es", "fr", "ja", "ko", "nl", "no", "pl", "pt", "ru", "tr", "vi", "zh"],
-
- _defaultTargetLanguage: "",
- get defaultTargetLanguage() {
- if (!this._defaultTargetLanguage) {
- this._defaultTargetLanguage = Cc["@mozilla.org/chrome/chrome-registry;1"]
- .getService(Ci.nsIXULChromeRegistry)
- .getSelectedLocale("global")
- .split("-")[0];
- }
- return this._defaultTargetLanguage;
- },
-
- documentStateReceived: function(aBrowser, aData) {
- if (aData.state == this.STATE_OFFER) {
- if (aData.detectedLanguage == this.defaultTargetLanguage) {
- // Detected language is the same as the user's locale.
- return;
- }
-
- if (this.supportedSourceLanguages.indexOf(aData.detectedLanguage) == -1) {
- // Detected language is not part of the supported languages.
- TranslationTelemetry.recordMissedTranslationOpportunity(aData.detectedLanguage);
- return;
- }
-
- TranslationTelemetry.recordTranslationOpportunity(aData.detectedLanguage);
- }
-
- if (!Services.prefs.getBoolPref(TRANSLATION_PREF_SHOWUI))
- return;
-
- if (!aBrowser.translationUI)
- aBrowser.translationUI = new TranslationUI(aBrowser);
- let trUI = aBrowser.translationUI;
-
- // Set all values before showing a new translation infobar.
- trUI._state = Translation.serviceUnavailable ? Translation.STATE_UNAVAILABLE
- : aData.state;
- trUI.detectedLanguage = aData.detectedLanguage;
- trUI.translatedFrom = aData.translatedFrom;
- trUI.translatedTo = aData.translatedTo;
- trUI.originalShown = aData.originalShown;
-
- trUI.showURLBarIcon();
-
- if (trUI.shouldShowInfoBar(aBrowser.currentURI))
- trUI.showTranslationInfoBar();
- },
-
- openProviderAttribution: function() {
- let attribution = this.supportedEngines[this.translationEngine];
- Cu.import("resource:///modules/RecentWindow.jsm");
- RecentWindow.getMostRecentBrowserWindow().openUILinkIn(attribution, "tab");
- },
-
- /**
- * The list of translation engines and their attributions.
- */
- supportedEngines: {
- "bing" : "http://aka.ms/MicrosoftTranslatorAttribution",
- "yandex" : "http://translate.yandex.com/"
- },
-
- /**
- * Fallback engine (currently Bing Translator) if the preferences seem
- * confusing.
- */
- get defaultEngine() {
-    return Object.keys(this.supportedEngines)[0];
- },
-
- /**
- * Returns the name of the preferred translation engine.
- */
- get translationEngine() {
- let engine = Services.prefs.getCharPref("browser.translation.engine");
- return Object.keys(this.supportedEngines).indexOf(engine) == -1 ? this.defaultEngine : engine;
- },
-};
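The engine-selection logic of the translationEngine getter above can be shown as a small standalone sketch; here the pref value is passed in directly instead of being read from Services.prefs, and pickEngine is just a local name for this sketch. The supportedEngines table mirrors the one above:

const supportedEngines = {
  "bing":   "http://aka.ms/MicrosoftTranslatorAttribution",
  "yandex": "http://translate.yandex.com/"
};

function pickEngine(prefValue) {
  let fallback = Object.keys(supportedEngines)[0]; // "bing"
  return Object.keys(supportedEngines).indexOf(prefValue) == -1 ? fallback
                                                                : prefValue;
}

// pickEngine("yandex") -> "yandex"
// pickEngine("typo")   -> "bing"   (unknown pref values fall back to the default)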
-
-/* TranslationUI objects keep the information related to translation for
- * a specific browser. This object is passed to the translation
- * infobar so that it can initialize itself. The properties exposed to
- * the infobar are:
- * - detectedLanguage, code of the language detected on the web page.
- * - state, the state in which the infobar should be displayed
- * - translatedFrom, if already translated, source language code.
- * - translatedTo, if already translated, target language code.
- * - translate, method starting the translation of the current page.
- * - showOriginalContent, method showing the original page content.
- * - showTranslatedContent, method showing the translation for an
- * already translated page whose original content is shown.
- * - originalShown, boolean indicating if the original or translated
- * version of the page is shown.
- */
-function TranslationUI(aBrowser) {
- this.browser = aBrowser;
-}
-
-TranslationUI.prototype = {
- get browser() {
- return this._browser;
- },
- set browser(aBrowser) {
- if (this._browser)
- this._browser.messageManager.removeMessageListener("Translation:Finished", this);
- aBrowser.messageManager.addMessageListener("Translation:Finished", this);
- this._browser = aBrowser;
- },
- translate: function(aFrom, aTo) {
- if (aFrom == aTo ||
- (this.state == Translation.STATE_TRANSLATED &&
- this.translatedFrom == aFrom && this.translatedTo == aTo)) {
- // Nothing to do.
- return;
- }
-
- if (this.state == Translation.STATE_OFFER) {
- if (this.detectedLanguage != aFrom)
- TranslationTelemetry.recordDetectedLanguageChange(true);
- } else {
- if (this.translatedFrom != aFrom)
- TranslationTelemetry.recordDetectedLanguageChange(false);
- if (this.translatedTo != aTo)
- TranslationTelemetry.recordTargetLanguageChange();
- }
-
- this.state = Translation.STATE_TRANSLATING;
- this.translatedFrom = aFrom;
- this.translatedTo = aTo;
-
- this.browser.messageManager.sendAsyncMessage(
- "Translation:TranslateDocument",
- { from: aFrom, to: aTo }
- );
- },
-
- showURLBarIcon: function() {
- let chromeWin = this.browser.ownerGlobal;
- let PopupNotifications = chromeWin.PopupNotifications;
- let removeId = this.originalShown ? "translated" : "translate";
- let notification =
- PopupNotifications.getNotification(removeId, this.browser);
- if (notification)
- PopupNotifications.remove(notification);
-
- let callback = (aTopic, aNewBrowser) => {
- if (aTopic == "swapping") {
- let infoBarVisible =
- this.notificationBox.getNotificationWithValue("translation");
- aNewBrowser.translationUI = this;
- this.browser = aNewBrowser;
- if (infoBarVisible)
- this.showTranslationInfoBar();
- return true;
- }
-
- if (aTopic != "showing")
- return false;
- let notification = this.notificationBox.getNotificationWithValue("translation");
- if (notification)
- notification.close();
- else
- this.showTranslationInfoBar();
- return true;
- };
-
- let addId = this.originalShown ? "translate" : "translated";
- PopupNotifications.show(this.browser, addId, null,
- addId + "-notification-icon", null, null,
- {dismissed: true, eventCallback: callback});
- },
-
- _state: 0,
- get state() {
- return this._state;
- },
- set state(val) {
- let notif = this.notificationBox.getNotificationWithValue("translation");
- if (notif)
- notif.state = val;
- this._state = val;
- },
-
- originalShown: true,
- showOriginalContent: function() {
- this.originalShown = true;
- this.showURLBarIcon();
- this.browser.messageManager.sendAsyncMessage("Translation:ShowOriginal");
- TranslationTelemetry.recordShowOriginalContent();
- },
-
- showTranslatedContent: function() {
- this.originalShown = false;
- this.showURLBarIcon();
- this.browser.messageManager.sendAsyncMessage("Translation:ShowTranslation");
- },
-
- get notificationBox() {
- return this.browser.ownerGlobal.gBrowser.getNotificationBox(this.browser);
- },
-
- showTranslationInfoBar: function() {
- let notificationBox = this.notificationBox;
- let notif = notificationBox.appendNotification("", "translation", null,
- notificationBox.PRIORITY_INFO_HIGH);
- notif.init(this);
- return notif;
- },
-
- shouldShowInfoBar: function(aURI) {
- // Never show the infobar automatically while the translation
- // service is temporarily unavailable.
- if (Translation.serviceUnavailable)
- return false;
-
- // Check if we should never show the infobar for this language.
- let neverForLangs =
- Services.prefs.getCharPref("browser.translation.neverForLanguages");
- if (neverForLangs.split(",").indexOf(this.detectedLanguage) != -1) {
- TranslationTelemetry.recordAutoRejectedTranslationOffer();
- return false;
- }
-
- // or if we should never show the infobar for this domain.
- let perms = Services.perms;
- if (perms.testExactPermission(aURI, "translate") == perms.DENY_ACTION) {
- TranslationTelemetry.recordAutoRejectedTranslationOffer();
- return false;
- }
-
- return true;
- },
-
- receiveMessage: function(msg) {
- switch (msg.name) {
- case "Translation:Finished":
- if (msg.data.success) {
- this.originalShown = false;
- this.state = Translation.STATE_TRANSLATED;
- this.showURLBarIcon();
-
- // Record the number of characters translated.
- TranslationTelemetry.recordTranslation(msg.data.from, msg.data.to,
- msg.data.characterCount);
- } else if (msg.data.unavailable) {
- Translation.serviceUnavailable = true;
- this.state = Translation.STATE_UNAVAILABLE;
- } else {
- this.state = Translation.STATE_ERROR;
- }
- break;
- }
- },
-
- infobarClosed: function() {
- if (this.state == Translation.STATE_OFFER)
- TranslationTelemetry.recordDeniedTranslationOffer();
- }
-};
-
-/**
- * Uses telemetry histograms for collecting statistics on the usage of the
- * translation component.
- *
- * NOTE: Metrics are only recorded if the user enabled the telemetry option.
- */
-this.TranslationTelemetry = {
-
- init: function () {
- // Constructing histograms.
- const plain = (id) => Services.telemetry.getHistogramById(id);
- const keyed = (id) => Services.telemetry.getKeyedHistogramById(id);
- this.HISTOGRAMS = {
- OPPORTUNITIES : () => plain("TRANSLATION_OPPORTUNITIES"),
- OPPORTUNITIES_BY_LANG : () => keyed("TRANSLATION_OPPORTUNITIES_BY_LANGUAGE"),
- PAGES : () => plain("TRANSLATED_PAGES"),
- PAGES_BY_LANG : () => keyed("TRANSLATED_PAGES_BY_LANGUAGE"),
- CHARACTERS : () => plain("TRANSLATED_CHARACTERS"),
- DENIED : () => plain("DENIED_TRANSLATION_OFFERS"),
- AUTO_REJECTED : () => plain("AUTO_REJECTED_TRANSLATION_OFFERS"),
- SHOW_ORIGINAL : () => plain("REQUESTS_OF_ORIGINAL_CONTENT"),
- TARGET_CHANGES : () => plain("CHANGES_OF_TARGET_LANGUAGE"),
- DETECTION_CHANGES : () => plain("CHANGES_OF_DETECTED_LANGUAGE"),
- SHOW_UI : () => plain("SHOULD_TRANSLATION_UI_APPEAR"),
- DETECT_LANG : () => plain("SHOULD_AUTO_DETECT_LANGUAGE"),
- };
-
-    // Capturing the values of flags at startup.
- this.recordPreferences();
- },
-
- /**
- * Record a translation opportunity in the health report.
- * @param language
- * The language of the page.
- */
- recordTranslationOpportunity: function (language) {
- return this._recordOpportunity(language, true);
- },
-
- /**
- * Record a missed translation opportunity in the health report.
- * A missed opportunity is when the language detected is not part
- * of the supported languages.
- * @param language
- * The language of the page.
- */
- recordMissedTranslationOpportunity: function (language) {
- return this._recordOpportunity(language, false);
- },
-
- /**
- * Record an automatically rejected translation offer in the health
- * report. A translation offer is automatically rejected when a user
- * has previously clicked "Never translate this language" or "Never
- * translate this site", which results in the infobar not being shown for
- * the translation opportunity.
- *
- * These translation opportunities should still be recorded in addition to
- * recording the automatic rejection of the offer.
- */
- recordAutoRejectedTranslationOffer: function () {
- if (!this._canRecord) return;
- this.HISTOGRAMS.AUTO_REJECTED().add();
- },
-
- /**
- * Record a translation in the health report.
- * @param langFrom
- * The language of the page.
- * @param langTo
- * The language translated to
- * @param numCharacters
- * The number of characters that were translated
- */
- recordTranslation: function (langFrom, langTo, numCharacters) {
- if (!this._canRecord) return;
- this.HISTOGRAMS.PAGES().add();
- this.HISTOGRAMS.PAGES_BY_LANG().add(langFrom + " -> " + langTo);
- this.HISTOGRAMS.CHARACTERS().add(numCharacters);
- },
-
- /**
- * Record a change of the detected language in the health report. This should
- * only be called when actually executing a translation, not every time the
-   * user changes the language in the UI.
- *
- * @param beforeFirstTranslation
- * A boolean indicating if we are recording a change of detected
- * language before translating the page for the first time. If we
- * have already translated the page from the detected language and
-   *        the user has manually adjusted the detected language, false should
- * be passed.
- */
- recordDetectedLanguageChange: function (beforeFirstTranslation) {
- if (!this._canRecord) return;
- this.HISTOGRAMS.DETECTION_CHANGES().add(beforeFirstTranslation);
- },
-
- /**
- * Record a change of the target language in the health report. This should
- * only be called when actually executing a translation, not every time the
-   * user changes the language in the UI.
- */
- recordTargetLanguageChange: function () {
- if (!this._canRecord) return;
- this.HISTOGRAMS.TARGET_CHANGES().add();
- },
-
- /**
- * Record a denied translation offer.
- */
- recordDeniedTranslationOffer: function () {
- if (!this._canRecord) return;
- this.HISTOGRAMS.DENIED().add();
- },
-
- /**
- * Record a "Show Original" command use.
- */
- recordShowOriginalContent: function () {
- if (!this._canRecord) return;
- this.HISTOGRAMS.SHOW_ORIGINAL().add();
- },
-
- /**
- * Record the state of translation preferences.
- */
- recordPreferences: function () {
- if (!this._canRecord) return;
- if (Services.prefs.getBoolPref(TRANSLATION_PREF_SHOWUI)) {
- this.HISTOGRAMS.SHOW_UI().add(1);
- }
- if (Services.prefs.getBoolPref(TRANSLATION_PREF_DETECT_LANG)) {
- this.HISTOGRAMS.DETECT_LANG().add(1);
- }
- },
-
- _recordOpportunity: function(language, success) {
- if (!this._canRecord) return;
- this.HISTOGRAMS.OPPORTUNITIES().add(success);
- this.HISTOGRAMS.OPPORTUNITIES_BY_LANG().add(language, success);
- },
-
- /**
- * A shortcut for reading the telemetry preference.
- *
- */
-  get _canRecord() {
- return Services.prefs.getBoolPref("toolkit.telemetry.enabled");
- }
-};
-
-this.TranslationTelemetry.init();
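As a usage illustration only (assuming the module is loaded and the toolkit.telemetry.enabled pref is true; the language pair and character count are made up), a successful French-to-English translation of 1234 characters reported through recordTranslation() touches three of the histograms wired up in init():

TranslationTelemetry.recordTranslation("fr", "en", 1234);
// -> TRANSLATED_PAGES             .add()           one more translated page
// -> TRANSLATED_PAGES_BY_LANGUAGE .add("fr -> en") keyed by the language pair
// -> TRANSLATED_CHARACTERS        .add(1234)       characters translated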
diff --git a/application/basilisk/components/translation/TranslationContentHandler.jsm b/application/basilisk/components/translation/TranslationContentHandler.jsm
deleted file mode 100644
index 3b0d59ddd..000000000
--- a/application/basilisk/components/translation/TranslationContentHandler.jsm
+++ /dev/null
@@ -1,181 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this file,
- * You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-"use strict";
-
-this.EXPORTED_SYMBOLS = [ "TranslationContentHandler" ];
-
-const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
-
-Cu.import("resource://gre/modules/Services.jsm");
-Cu.import("resource://gre/modules/XPCOMUtils.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "LanguageDetector",
- "resource:///modules/translation/LanguageDetector.jsm");
-
-const STATE_OFFER = 0;
-const STATE_TRANSLATED = 2;
-const STATE_ERROR = 3;
-
-this.TranslationContentHandler = function(global, docShell) {
- let webProgress = docShell.QueryInterface(Ci.nsIInterfaceRequestor)
- .getInterface(Ci.nsIWebProgress);
- webProgress.addProgressListener(this, Ci.nsIWebProgress.NOTIFY_STATE_DOCUMENT);
-
- global.addEventListener("pageshow", this);
-
- global.addMessageListener("Translation:TranslateDocument", this);
- global.addMessageListener("Translation:ShowTranslation", this);
- global.addMessageListener("Translation:ShowOriginal", this);
- this.global = global;
-}
-
-TranslationContentHandler.prototype = {
- handleEvent: function(aEvent) {
- // We are only listening to pageshow events.
- let target = aEvent.target;
-
- // Only handle top-level frames.
- let win = target.defaultView;
- if (win.parent !== win)
- return;
-
- let content = this.global.content;
- if (!content.detectedLanguage)
- return;
-
- let data = {};
- let trDoc = content.translationDocument;
- if (trDoc) {
- data.state = trDoc.translationError ? STATE_ERROR : STATE_TRANSLATED;
- data.translatedFrom = trDoc.translatedFrom;
- data.translatedTo = trDoc.translatedTo;
- data.originalShown = trDoc.originalShown;
- } else {
- data.state = STATE_OFFER;
- data.originalShown = true;
- }
- data.detectedLanguage = content.detectedLanguage;
-
- this.global.sendAsyncMessage("Translation:DocumentState", data);
- },
-
- /* nsIWebProgressListener implementation */
- onStateChange: function(aWebProgress, aRequest, aStateFlags, aStatus) {
- if (!aWebProgress.isTopLevel ||
- !(aStateFlags & Ci.nsIWebProgressListener.STATE_STOP) ||
- !this.global.content)
- return;
-
- let url = aRequest.name;
- if (!url.startsWith("http://") && !url.startsWith("https://"))
- return;
-
- let content = this.global.content;
- if (content.detectedLanguage)
- return;
-
- // Grab a 60k sample of text from the page.
- let encoder = Cc["@mozilla.org/layout/documentEncoder;1?type=text/plain"]
- .createInstance(Ci.nsIDocumentEncoder);
- encoder.init(content.document, "text/plain", encoder.SkipInvisibleContent);
- let string = encoder.encodeToStringWithMaxLength(60 * 1024);
-
- // Language detection isn't reliable on very short strings.
- if (string.length < 100)
- return;
-
- LanguageDetector.detectLanguage(string).then(result => {
- // Bail if we're not confident.
- if (!result.confident) {
- return;
- }
-
- // The window might be gone by now.
- if (Cu.isDeadWrapper(content)) {
- return;
- }
-
- content.detectedLanguage = result.language;
-
- let data = {
- state: STATE_OFFER,
- originalShown: true,
- detectedLanguage: result.language
- };
- this.global.sendAsyncMessage("Translation:DocumentState", data);
- });
- },
-
- // Unused methods.
- onProgressChange: function() {},
- onLocationChange: function() {},
- onStatusChange: function() {},
- onSecurityChange: function() {},
-
- QueryInterface: XPCOMUtils.generateQI([Ci.nsIWebProgressListener,
- Ci.nsISupportsWeakReference]),
-
- receiveMessage: function(msg) {
- switch (msg.name) {
- case "Translation:TranslateDocument":
- {
- Cu.import("resource:///modules/translation/TranslationDocument.jsm");
-
- // If a TranslationDocument already exists for this document, it should
- // be used instead of creating a new one so that we can use the original
- // content of the page for the new translation instead of the newly
- // translated text.
- let translationDocument = this.global.content.translationDocument ||
- new TranslationDocument(this.global.content.document);
-
- let preferredEngine = Services.prefs.getCharPref("browser.translation.engine");
- let translator = null;
- if (preferredEngine == "yandex") {
- Cu.import("resource:///modules/translation/YandexTranslator.jsm");
- translator = new YandexTranslator(translationDocument,
- msg.data.from,
- msg.data.to);
- } else {
- Cu.import("resource:///modules/translation/BingTranslator.jsm");
- translator = new BingTranslator(translationDocument,
- msg.data.from,
- msg.data.to);
- }
-
- this.global.content.translationDocument = translationDocument;
- translationDocument.translatedFrom = msg.data.from;
- translationDocument.translatedTo = msg.data.to;
- translationDocument.translationError = false;
-
- translator.translate().then(
- result => {
- this.global.sendAsyncMessage("Translation:Finished", {
- characterCount: result.characterCount,
- from: msg.data.from,
- to: msg.data.to,
- success: true
- });
- translationDocument.showTranslation();
- },
- error => {
- translationDocument.translationError = true;
- let data = {success: false};
- if (error == "unavailable")
- data.unavailable = true;
- this.global.sendAsyncMessage("Translation:Finished", data);
- }
- );
- break;
- }
-
- case "Translation:ShowOriginal":
- this.global.content.translationDocument.showOriginal();
- break;
-
- case "Translation:ShowTranslation":
- this.global.content.translationDocument.showTranslation();
- break;
- }
- }
-};
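For reference, the "Translation:DocumentState" payload assembled in handleEvent() and onStateChange() above has the following shape; the concrete values are examples only, and this is the aData object that Translation.documentStateReceived() in Translation.jsm above expects:

let data = {
  state: 0,               // STATE_OFFER here; 2 = STATE_TRANSLATED, 3 = STATE_ERROR
  originalShown: true,    // false once the translated text is being displayed
  detectedLanguage: "fr", // as reported by LanguageDetector
  // For a page that has already been translated, handleEvent() also fills in:
  //   translatedFrom: "fr",
  //   translatedTo: "en",
};
// this.global.sendAsyncMessage("Translation:DocumentState", data);
// (sent exactly like this at the end of handleEvent() and of onStateChange())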
diff --git a/application/basilisk/components/translation/TranslationDocument.jsm b/application/basilisk/components/translation/TranslationDocument.jsm
deleted file mode 100644
index 058d07a49..000000000
--- a/application/basilisk/components/translation/TranslationDocument.jsm
+++ /dev/null
@@ -1,683 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-"use strict";
-
-const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
-
-this.EXPORTED_SYMBOLS = [ "TranslationDocument" ];
-
-const SHOW_ELEMENT = Ci.nsIDOMNodeFilter.SHOW_ELEMENT;
-const SHOW_TEXT = Ci.nsIDOMNodeFilter.SHOW_TEXT;
-const TEXT_NODE = Ci.nsIDOMNode.TEXT_NODE;
-
-Cu.import("resource://services-common/utils.js");
-Cu.import("resource://gre/modules/Task.jsm");
-
-/**
- * This class represents a document that is being translated,
- * and it is responsible for parsing the document,
 - * generating the translation data structures (the list of
- * translation items and roots), and managing the original
- * and translated texts on the translation items.
- *
- * @param document The document to be translated
- */
-this.TranslationDocument = function(document) {
- this.itemsMap = new Map();
- this.roots = [];
- this._init(document);
-};
-
-this.TranslationDocument.prototype = {
- translatedFrom: "",
- translatedTo: "",
- translationError: false,
- originalShown: true,
-
- /**
- * Initializes the object and populates
- * the roots lists.
- *
- * @param document The document to be translated
- */
- _init: function(document) {
- let window = document.defaultView;
- let winUtils = window.QueryInterface(Ci.nsIInterfaceRequestor)
- .getInterface(Ci.nsIDOMWindowUtils);
-
- // Get all the translation nodes in the document's body:
- // a translation node is a node from the document which
- // contains useful content for translation, and therefore
- // must be included in the translation process.
- let nodeList = winUtils.getTranslationNodes(document.body);
-
- let length = nodeList.length;
-
- for (let i = 0; i < length; i++) {
- let node = nodeList.item(i);
- let isRoot = nodeList.isTranslationRootAtIndex(i);
-
- // Create a TranslationItem object for this node.
- // This function will also add it to the this.roots array.
- this._createItemForNode(node, i, isRoot);
- }
-
- // At first all roots are stored in the roots list, and only after
- // the process has finished we're able to determine which roots are
- // simple, and which ones are not.
-
-    // A simple root is defined by a root with no child items, which
- // basically represents an element from a page with only text content
- // inside.
-
- // This distinction is useful for optimization purposes: we treat a
- // simple root as plain-text in the translation process and with that
- // we are able to reduce their data payload sent to the translation service.
-
- for (let root of this.roots) {
- if (root.children.length == 0 &&
- root.nodeRef.childElementCount == 0) {
- root.isSimpleRoot = true;
- }
- }
- },
-
- /**
- * Creates a TranslationItem object, which should be called
- * for each node returned by getTranslationNodes.
- *
- * @param node The DOM node for this item.
- * @param id A unique, numeric id for this item.
-   * @param isRoot A boolean saying whether this item is a root.
- *
- * @returns A TranslationItem object.
- */
- _createItemForNode: function(node, id, isRoot) {
- if (this.itemsMap.has(node)) {
- return this.itemsMap.get(node);
- }
-
- let item = new TranslationItem(node, id, isRoot);
-
- if (isRoot) {
- // Root items do not have a parent item.
- this.roots.push(item);
- } else {
- let parentItem = this.itemsMap.get(node.parentNode);
- if (parentItem) {
- parentItem.children.push(item);
- }
- }
-
- this.itemsMap.set(node, item);
- return item;
- },
-
- /**
- * Generate the text string that represents a TranslationItem object.
- * Besides generating the string, it's also stored in the "original"
-   * field of the TranslationItem object, which is kept for later use
-   * by the "Show Original" functionality.
- * If this function had already been called for the given item (determined
- * by the presence of the "original" array in the item), the text will
- * be regenerated from the "original" data instead of from the related
- * DOM nodes (because the nodes might contain translated data).
- *
- * @param item A TranslationItem object
- *
- * @returns A string representation of the TranslationItem.
- */
- generateTextForItem: function(item) {
- if (item.original) {
- return regenerateTextFromOriginalHelper(item);
- }
-
- if (item.isSimpleRoot) {
- let text = item.nodeRef.firstChild.nodeValue.trim();
- item.original = [text];
- return text;
- }
-
- let str = "";
- item.original = [];
- let wasLastItemPlaceholder = false;
-
- for (let child of item.nodeRef.childNodes) {
- if (child.nodeType == TEXT_NODE) {
- let x = child.nodeValue.trim();
- if (x != "") {
- item.original.push(x);
- str += x;
- wasLastItemPlaceholder = false;
- }
- continue;
- }
-
- let objInMap = this.itemsMap.get(child);
- if (objInMap && !objInMap.isRoot) {
- // If this childNode is present in the itemsMap, it means
- // it's a translation node: it has useful content for translation.
- // In this case, we need to stringify this node.
- // However, if this item is a root, we should skip it here in this
- // object's child list (and just add a placeholder for it), because
-        // it will be stringified separately for being a root.
- item.original.push(objInMap);
- str += this.generateTextForItem(objInMap);
- wasLastItemPlaceholder = false;
- } else if (!wasLastItemPlaceholder) {
- // Otherwise, if this node doesn't contain any useful content,
- // or if it is a root itself, we can replace it with a placeholder node.
- // We can't simply eliminate this node from our string representation
- // because that could change the HTML structure (e.g., it would
- // probably merge two separate text nodes).
- // It's not necessary to add more than one placeholder in sequence;
- // we can optimize them away.
- item.original.push(TranslationItem_NodePlaceholder);
- str += '<br>';
- wasLastItemPlaceholder = true;
- }
- }
-
- return generateTranslationHtmlForItem(item, str);
- },
-
- /**
- * Changes the document to display its translated
- * content.
- */
- showTranslation: function() {
- this.originalShown = false;
- this._swapDocumentContent("translation");
- },
-
- /**
- * Changes the document to display its original
- * content.
- */
- showOriginal: function() {
- this.originalShown = true;
- this._swapDocumentContent("original");
- },
-
- /**
- * Swap the document with the resulting translation,
- * or back with the original content.
- *
- * @param target A string that is either "translation"
- * or "original".
- */
- _swapDocumentContent: function(target) {
- Task.spawn(function *() {
-      // Let the event loop breathe after every 100 nodes
- // that are replaced.
- const YIELD_INTERVAL = 100;
- let count = YIELD_INTERVAL;
-
- for (let root of this.roots) {
- root.swapText(target);
- if (count-- == 0) {
- count = YIELD_INTERVAL;
- yield CommonUtils.laterTickResolvingPromise();
- }
- }
- }.bind(this));
- }
-};
-
-/**
- * This class represents an item for translation. It's basically our
- * wrapper class around a node returned by getTranslationNode, with
- * more data and structural information on it.
- *
- * At the end of the translation process, besides the properties below,
- * a TranslationItem will contain two other properties: one called "original"
- * and one called "translation". They are twin objects, one which reflect
- * the structure of that node in its original state, and the other in its
- * translated state.
- *
- * The "original" array is generated in the generateTextForItem function,
- * and the "translation" array is generated when the translation results
- * are parsed.
- *
- * They are both arrays, which contain a mix of strings and references to
- * child TranslationItems. The references in both arrays point to the * same *
- * TranslationItem object, but they might appear in different orders between the
- * "original" and "translation" arrays.
- *
- * An example:
- *
- * English: <div id="n1">Welcome to <b id="n2">Mozilla's</b> website</div>
- * Portuguese: <div id="n1">Bem vindo a pagina <b id="n2">da Mozilla</b></div>
- *
- * TranslationItem n1 = {
- * id: 1,
- * original: ["Welcome to", ptr to n2, "website"]
- * translation: ["Bem vindo a pagina", ptr to n2]
- * }
- *
- * TranslationItem n2 = {
- * id: 2,
- * original: ["Mozilla's"],
- * translation: ["da Mozilla"]
- * }
- */
-function TranslationItem(node, id, isRoot) {
- this.nodeRef = node;
- this.id = id;
- this.isRoot = isRoot;
- this.children = [];
-}
-
-TranslationItem.prototype = {
- isRoot: false,
- isSimpleRoot: false,
-
- toString: function() {
- let rootType = "";
- if (this.isRoot) {
- if (this.isSimpleRoot) {
- rootType = " (simple root)";
- }
- else {
- rootType = " (non simple root)";
- }
- }
- return "[object TranslationItem: <" + this.nodeRef.localName + ">"
- + rootType + "]";
- },
-
- /**
- * This function will parse the result of the translation of one translation
- * item. If this item was a simple root, all we sent was a plain-text version
- * of it, so the result is also straightforward text.
- *
- * For non-simple roots, we sent a simplified HTML representation of that
- * node, and we'll first parse that into an HTML doc and then call the
- * parseResultNode helper function to parse it.
- *
- * While parsing, the result is stored in the "translation" field of the
- * TranslationItem, which will be used to display the final translation when
- * all items are finished. It remains stored too to allow back-and-forth
- * switching between the "Show Original" and "Show Translation" functions.
- *
- * @param result A string with the textual result received from the server,
- * which can be plain-text or a serialized HTML doc.
- */
- parseResult: function(result) {
- if (this.isSimpleRoot) {
- this.translation = [result];
- return;
- }
-
- let domParser = Cc["@mozilla.org/xmlextras/domparser;1"]
- .createInstance(Ci.nsIDOMParser);
-
- let doc = domParser.parseFromString(result, "text/html");
- parseResultNode(this, doc.body.firstChild);
- },
-
- /**
- * This function finds a child TranslationItem
- * with the given id.
- * @param id The id to look for, in the format "n#"
- * @returns A TranslationItem with the given id, or null if
- * it was not found.
- */
- getChildById: function(id) {
- for (let child of this.children) {
- if (("n" + child.id) == id) {
- return child;
- }
- }
- return null;
- },
-
- /**
- * Swap the text of this TranslationItem between
- * its original and translated states.
- *
- * @param target A string that is either "translation"
- * or "original".
- */
- swapText: function(target) {
- swapTextForItem(this, target);
- }
-};
-
-/**
- * This object represents a placeholder item for translation. It's similar to
- * the TranslationItem class, but it represents nodes that have no meaningful
- * content for translation. These nodes will be replaced by "<br>" in a
- * translation request. It's necessary to keep them as markers
- * for correct positioning and splitting of text nodes.
- */
-const TranslationItem_NodePlaceholder = {
- toString: function() {
- return "[object TranslationItem_NodePlaceholder]";
- }
-};
-
-/**
- * Generate the outer HTML representation for a given item.
- *
- * @param item A TranslationItem object.
- * @param content The inner content for this item.
- * @returns string The outer HTML needed for translation
- * of this item.
- */
-function generateTranslationHtmlForItem(item, content) {
- let localName = item.isRoot ? "div" : "b";
- return '<' + localName + ' id=n' + item.id + '>' +
- content +
- "</" + localName + ">";
-}
-
- /**
- * Regenerate the text string that represents a TranslationItem object,
- * with data from its "original" array. The array must have already
- * been created by TranslationDocument.generateTextForItem().
- *
- * @param item A TranslationItem object
- *
- * @returns A string representation of the TranslationItem.
- */
-function regenerateTextFromOriginalHelper(item) {
- if (item.isSimpleRoot) {
- return item.original[0];
- }
-
- let str = "";
- for (let child of item.original) {
- if (child instanceof TranslationItem) {
- str += regenerateTextFromOriginalHelper(child);
- } else if (child === TranslationItem_NodePlaceholder) {
- str += "<br>";
- } else {
- str += child;
- }
- }
-
- return generateTranslationHtmlForItem(item, str);
-}
-
-/**
- * Helper function to parse a HTML doc result.
- * How it works:
- *
- * An example result string is:
- *
- * <div id="n1">Hello <b id="n2">World</b> of Mozilla.</div>
- *
- * For an element node, we look at its id and find the corresponding
- * TranslationItem that was associated with this node, and then we
- * walk down it repeating the process.
- *
- * For text nodes we simply add it as a string.
- */
-function parseResultNode(item, node) {
- item.translation = [];
- for (let child of node.childNodes) {
- if (child.nodeType == TEXT_NODE) {
- item.translation.push(child.nodeValue);
- } else if (child.localName == "br") {
- item.translation.push(TranslationItem_NodePlaceholder);
- } else {
- let translationItemChild = item.getChildById(child.id);
-
- if (translationItemChild) {
- item.translation.push(translationItemChild);
- parseResultNode(translationItemChild, child);
- }
- }
- }
-}
-
-/**
- * Helper function to swap the text of a TranslationItem
- * between its original and translated states.
- * How it works:
- *
- * The function iterates through the target array (either the `original` or
- * `translation` array from the TranslationItem), while also keeping a pointer
- * to a current position in the child nodes from the actual DOM node that we
- * are modifying. This pointer is moved forward after each item of the array
- * is translated. If, at any given time, the pointer doesn't match the expected
- * node that was supposed to be seen, it means that the original and translated
- * contents have a different ordering, and thus we need to adjust that.
- *
- * A full example of the reordering process, swapping from Original to
- * Translation:
- *
- * Original (en): <div>I <em>miss</em> <b>you</b></div>
- *
- * Translation (fr): <div><b>Tu</b> me <em>manques</em></div>
- *
- * Step 1:
- * pointer points to firstChild of the DOM node, textnode "I "
- * first item in item.translation is [object TranslationItem <b>]
- *
- * pointer does not match the expected element, <b>. So let's move <b> to the
- * pointer position.
- *
- * Current state of the DOM:
- * <div><b>you</b>I <em>miss</em> </div>
- *
- * Step 2:
- * pointer moves forward to nextSibling, textnode "I " again.
- * second item in item.translation is the string " me "
- *
- * pointer points to a text node, and we were expecting a text node. Match!
- * just replace the text content.
- *
- * Current state of the DOM:
- * <div><b>you</b> me <em>miss</em> </div>
- *
- * Step 3:
- * pointer moves forward to nextSibling, <em>miss</em>
- * third item in item.translation is [object TranslationItem <em>]
- *
- * pointer points to the expected node. Match! Nothing to do.
- *
- * Step 4:
- * all items in this item.translation were transformed. The remaining
- * text nodes are cleared to "", and domNode.normalize() removes them.
- *
- * Current state of the DOM:
- * <div><b>you</b> me <em>miss</em></div>
- *
- * Further steps:
- * After that, the function will visit the child items (from the visitStack),
- * and the text inside the <b> and <em> nodes will be swapped as well,
- * yielding the final result:
- *
- * <div><b>Tu</b> me <em>manques</em></div>
- *
- *
- * @param item A TranslationItem object
- * @param target A string that is either "translation"
- * or "original".
- */
-function swapTextForItem(item, target) {
- // visitStack is the stack of items that we still need to visit.
- // Let's start the process by adding the root item.
- let visitStack = [ item ];
-
- while (visitStack.length > 0) {
- let curItem = visitStack.shift();
-
- let domNode = curItem.nodeRef;
- if (!domNode) {
- // Skipping this item due to a missing node.
- continue;
- }
-
- if (!curItem[target]) {
- // Translation not found for this item. This could be due to
- // an error in the server response. For example, if a translation
- // was broken in various chunks, and one of the chunks failed,
- // the items from that chunk will be missing its "translation"
- // field.
- continue;
- }
-
- domNode.normalize();
-
- // curNode points to the child nodes of the DOM node that we are
- // modifying. During most of the process, while the target array is
- // being iterated (in the for loop below), it should walk together with
-    // the array and be pointing to the correct node that needs to be modified.
- // If it's not pointing to it, that means some sort of node reordering
- // will be necessary to produce the correct translation.
- // Note that text nodes don't need to be reordered, as we can just replace
- // the content of one text node with another.
- //
- // curNode starts in the firstChild...
- let curNode = domNode.firstChild;
-
- // ... actually, let's make curNode start at the first useful node (either
- // a non-blank text node or something else). This is not strictly necessary,
- // as the reordering algorithm would correctly handle this case. However,
- // this better aligns the resulting translation with the DOM content of the
-    // page, avoiding cases that would need to be unnecessarily reordered.
- //
- // An example of how this helps:
- //
- // ---- Original: <div> <b>Hello </b> world.</div>
- // ^textnode 1 ^item 1 ^textnode 2
- //
- // - Translation: <div><b>Hallo </b> Welt.</div>
- //
- // Transformation process without this optimization:
- // 1 - start pointer at textnode 1
- // 2 - move item 1 to first position inside the <div>
- //
- // Node now looks like: <div><b>Hello </b>[ ][ world.]</div>
- // textnode 1^ ^textnode 2
- //
- // 3 - replace textnode 1 with " Welt."
- // 4 - clear remaining text nodes (in this case, textnode 2)
- //
- // Transformation process with this optimization:
- // 1 - start pointer at item 1
- // 2 - item 1 is already in position
- // 3 - replace textnode 2 with " Welt."
- //
- // which completely avoids any node reordering, and requires only one
- // text change instead of two (while also leaving the page closer to
- // its original state).
- while (curNode &&
- curNode.nodeType == TEXT_NODE &&
- curNode.nodeValue.trim() == "") {
- curNode = curNode.nextSibling;
- }
-
- // Now let's walk through all items in the `target` array of the
- // TranslationItem. This means either the TranslationItem.original or
- // TranslationItem.translation array.
- for (let targetItem of curItem[target]) {
-
- if (targetItem instanceof TranslationItem) {
- // If the array element is another TranslationItem object, let's
- // add it to the stack to be visited.
- visitStack.push(targetItem);
-
- let targetNode = targetItem.nodeRef;
-
- // If the node is not in the expected position, let's reorder
- // it into position...
- if (curNode != targetNode &&
- // ...unless the page has reparented this node under a totally
- // different node (or removed it). In this case, all bets are off
- // on being able to do anything correctly, so it's better not to
- // bring back the node to this parent.
- targetNode.parentNode == domNode) {
-
- // We don't need to null-check curNode because insertBefore(..., null)
- // does what we need in that case: reorder this node to the end
- // of child nodes.
- domNode.insertBefore(targetNode, curNode);
- curNode = targetNode;
- }
-
- // Move pointer forward. Since we do not add empty text nodes to the
- // list of translation items, we must skip them here too while
- // traversing the DOM in order to get better alignment between the
- // text nodes and the translation items.
- if (curNode) {
- curNode = getNextSiblingSkippingEmptyTextNodes(curNode);
- }
-
- } else if (targetItem === TranslationItem_NodePlaceholder) {
- // If the current item is a placeholder node, we need to move
- // our pointer "past" it, jumping from one side of a block of
- // elements + empty text nodes to the other side. Even if
-        // non-placeholder elements exist inside the jumped block,
-        // they will be pulled correctly later in the process when the
-        // targetItems for those nodes are handled.
-
- while (curNode &&
- (curNode.nodeType != TEXT_NODE ||
- curNode.nodeValue.trim() == "")) {
- curNode = curNode.nextSibling;
- }
-
- } else {
- // Finally, if it's a text item, we just need to find the next
- // text node to use. Text nodes don't need to be reordered, so
- // the first one found can be used.
- while (curNode && curNode.nodeType != TEXT_NODE) {
- curNode = curNode.nextSibling;
- }
-
- // If none was found and we reached the end of the child nodes,
- // let's create a new one.
- if (!curNode) {
- // We don't know if the original content had a space or not,
- // so the best bet is to create the text node with " " which
- // will add one space at the beginning and one at the end.
- curNode = domNode.appendChild(domNode.ownerDocument.createTextNode(" "));
- }
-
- // A trailing and a leading space must be preserved because
- // they are meaningful in HTML.
- let preSpace = /^\s/.test(curNode.nodeValue) ? " " : "";
- let endSpace = /\s$/.test(curNode.nodeValue) ? " " : "";
-
- curNode.nodeValue = preSpace + targetItem + endSpace;
- curNode = getNextSiblingSkippingEmptyTextNodes(curNode);
- }
- }
-
- // The translated version of a node might have less text nodes than its
- // original version. If that's the case, let's clear the remaining nodes.
- if (curNode) {
- clearRemainingNonEmptyTextNodesFromElement(curNode);
- }
-
- // And remove any garbage "" nodes left after clearing.
- domNode.normalize();
- }
-}
-
-function getNextSiblingSkippingEmptyTextNodes(startSibling) {
- let item = startSibling.nextSibling;
- while (item &&
- item.nodeType == TEXT_NODE &&
- item.nodeValue.trim() == "") {
- item = item.nextSibling;
- }
- return item;
-}
-
-function clearRemainingNonEmptyTextNodesFromElement(startSibling) {
- let item = startSibling;
- while (item) {
- if (item.nodeType == TEXT_NODE &&
- item.nodeValue != "") {
- item.nodeValue = "";
- }
- item = item.nextSibling;
- }
-}
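To make the parsing direction concrete: below is a hypothetical translated chunk coming back from the service for the n1/n2 example documented above, and what parseResult()/parseResultNode() turn it into.

let result = '<div id=n1>Bem vindo a pagina <b id=n2>da Mozilla</b></div>';
// For a non-simple root, parseResult() parses this string into an HTML doc and
// parseResultNode() walks the children of the resulting <div>:
//   - text node "Bem vindo a pagina "   -> pushed as a plain string
//   - <b id=n2>                         -> looked up via getChildById("n2"),
//                                          pushed, and recursed into
// leaving   n1.translation == ["Bem vindo a pagina ", TranslationItem n2]
// and       n2.translation == ["da Mozilla"]
// swapTextForItem(n1, "translation") then reorders/replaces the live DOM nodes.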
diff --git a/application/basilisk/components/translation/YandexTranslator.jsm b/application/basilisk/components/translation/YandexTranslator.jsm
deleted file mode 100644
index ab92e0962..000000000
--- a/application/basilisk/components/translation/YandexTranslator.jsm
+++ /dev/null
@@ -1,343 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-"use strict";
-
-const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
-
-this.EXPORTED_SYMBOLS = [ "YandexTranslator" ];
-
-Cu.import("resource://gre/modules/Services.jsm");
-Cu.import("resource://gre/modules/Log.jsm");
-Cu.import("resource://gre/modules/Promise.jsm");
-Cu.import("resource://gre/modules/Task.jsm");
-Cu.import("resource://services-common/utils.js");
-Cu.import("resource://gre/modules/Http.jsm");
-
-// The maximum amount of net data allowed per request on Yandex's API.
-const MAX_REQUEST_DATA = 5000; // Documentation says 10000 but anywhere
- // close to that is refused by the service.
-
-// The maximum number of chunks allowed to be translated in a single
-// request.
-const MAX_REQUEST_CHUNKS = 1000; // Documentation says 2000.
-
-// Self-imposed limit of 15 requests. This means that a page that would need
-// to be broken into more than 15 requests won't be fully translated.
-// The maximum amount of data that we will translate for a single page
-// is MAX_REQUESTS * MAX_REQUEST_DATA.
-const MAX_REQUESTS = 15;
-
-const YANDEX_RETURN_CODE_OK = 200;
-
-const YANDEX_ERR_KEY_INVALID = 401; // Invalid API key
-const YANDEX_ERR_KEY_BLOCKED = 402; // This API key has been blocked
-const YANDEX_ERR_DAILY_REQ_LIMIT_EXCEEDED = 403; // Daily limit for requests reached
-const YANDEX_ERR_DAILY_CHAR_LIMIT_EXCEEDED = 404; // Daily limit of chars reached
-const YANDEX_ERR_TEXT_TOO_LONG = 413; // The text size exceeds the maximum
-const YANDEX_ERR_UNPROCESSABLE_TEXT = 422; // The text could not be translated
-const YANDEX_ERR_LANG_NOT_SUPPORTED = 501; // The specified translation direction is not supported
-
-// Errors that should activate the service unavailable handling
-const YANDEX_PERMANENT_ERRORS = [
- YANDEX_ERR_KEY_INVALID,
- YANDEX_ERR_KEY_BLOCKED,
- YANDEX_ERR_DAILY_REQ_LIMIT_EXCEEDED,
- YANDEX_ERR_DAILY_CHAR_LIMIT_EXCEEDED,
-];
-
-/**
- * Translates a webpage using Yandex's Translation API.
- *
- * @param translationDocument The TranslationDocument object that represents
- * the webpage to be translated
- * @param sourceLanguage The source language of the document
- * @param targetLanguage The target language for the translation
- *
- * @returns {Promise} A promise that will resolve when the translation
- * task is finished.
- */
-this.YandexTranslator = function(translationDocument, sourceLanguage, targetLanguage) {
- this.translationDocument = translationDocument;
- this.sourceLanguage = sourceLanguage;
- this.targetLanguage = targetLanguage;
- this._pendingRequests = 0;
- this._partialSuccess = false;
- this._serviceUnavailable = false;
- this._translatedCharacterCount = 0;
-};
-
-this.YandexTranslator.prototype = {
- /**
- * Performs the translation, splitting the document into several chunks
- * respecting the data limits of the API.
- *
- * @returns {Promise} A promise that will resolve when the translation
- * task is finished.
- */
- translate: function() {
- return Task.spawn(function *() {
- let currentIndex = 0;
- this._onFinishedDeferred = Promise.defer();
-
- // Let's split the document into various requests to be sent to
- // Yandex's Translation API.
- for (let requestCount = 0; requestCount < MAX_REQUESTS; requestCount++) {
- // Generating the text for each request can be expensive, so
- // let's take the opportunity of the chunkification process to
- // allow for the event loop to attend other pending events
- // before we continue.
- yield CommonUtils.laterTickResolvingPromise();
-
- // Determine the data for the next request.
- let request = this._generateNextTranslationRequest(currentIndex);
-
- // Create a real request to the server, and put it on the
- // pending requests list.
- let yandexRequest = new YandexRequest(request.data,
- this.sourceLanguage,
- this.targetLanguage);
- this._pendingRequests++;
- yandexRequest.fireRequest().then(this._chunkCompleted.bind(this),
- this._chunkFailed.bind(this));
-
- currentIndex = request.lastIndex;
- if (request.finished) {
- break;
- }
- }
-
- return this._onFinishedDeferred.promise;
- }.bind(this));
- },
-
- /**
- * Function called when a request sent to the server completed successfully.
- * This function handles calling the function to parse the result and the
- * function to resolve the promise returned by the public `translate()`
- * method when there are no pending requests left.
- *
- * @param request The YandexRequest sent to the server
- */
- _chunkCompleted: function(yandexRequest) {
- if (this._parseChunkResult(yandexRequest)) {
- this._partialSuccess = true;
- // Count the number of characters successfully translated.
- this._translatedCharacterCount += yandexRequest.characterCount;
- }
-
- this._checkIfFinished();
- },
-
- /**
- * Function called when a request sent to the server has failed.
- * This function handles deciding if the error is transient or means the
- * service is unavailable (zero balance on the key or request credentials are
- * not in an active state) and calling the function to resolve the promise
- * returned by the public `translate()` method when there are no pending
- * requests left.
- *
- * @param aError [optional] The XHR object of the request that failed.
- */
- _chunkFailed: function(aError) {
- if (aError instanceof Ci.nsIXMLHttpRequest) {
- let body = aError.responseText;
- let json = { code: 0 };
- try {
- json = JSON.parse(body);
- } catch (e) {}
-
- if (json.code && YANDEX_PERMANENT_ERRORS.indexOf(json.code) != -1)
- this._serviceUnavailable = true;
- }
-
- this._checkIfFinished();
- },
-
- /**
- * Function called when a request sent to the server has completed.
- * This function handles resolving the promise
- * returned by the public `translate()` method when all chunks are completed.
- */
- _checkIfFinished: function() {
- // Check if all pending requests have been
-    // completed and, if so, resolve the promise.
- // If at least one chunk was successful, the
- // promise will be resolved positively which will
- // display the "Success" state for the infobar. Otherwise,
- // the "Error" state will appear.
- if (--this._pendingRequests == 0) {
- if (this._partialSuccess) {
- this._onFinishedDeferred.resolve({
- characterCount: this._translatedCharacterCount
- });
- } else {
- let error = this._serviceUnavailable ? "unavailable" : "failure";
- this._onFinishedDeferred.reject(error);
- }
- }
- },
-
- /**
-   * Parses the result returned by Yandex's Translation API, a JSON object
-   * that contains a number of elements. The API is documented here:
-   * http://api.yandex.com/translate/doc/dg/reference/translate.xml
-   *
-   * @param yandexRequest The YandexRequest that was sent to the server.
- * @returns boolean True if parsing of this chunk was successful.
- */
- _parseChunkResult: function(yandexRequest) {
- let results;
- try {
- let result = JSON.parse(yandexRequest.networkRequest.responseText);
- if (result.code != 200) {
- Services.console.logStringMessage("YandexTranslator: Result is " + result.code);
- return false;
- }
-      results = result.text;
- } catch (e) {
- return false;
- }
-
- let len = results.length;
- if (len != yandexRequest.translationData.length) {
- // This should never happen, but if the service returns a different number
- // of items (from the number of items submitted), we can't use this chunk
- // because all items would be paired incorrectly.
- return false;
- }
-
- let error = false;
- for (let i = 0; i < len; i++) {
- try {
- let result = results[i];
- let root = yandexRequest.translationData[i][0];
- root.parseResult(result);
- } catch (e) { error = true; }
- }
-
- return !error;
- },
-
- /**
-   * Determines the data to be used for the Nth request being generated,
-   * based on the input parameters.
-   *
-   * @param startIndex The index, in the roots list, at which this
-   *                   chunk should start.
- */
- _generateNextTranslationRequest: function(startIndex) {
- let currentDataSize = 0;
- let currentChunks = 0;
- let output = [];
- let rootsList = this.translationDocument.roots;
-
- for (let i = startIndex; i < rootsList.length; i++) {
- let root = rootsList[i];
- let text = this.translationDocument.generateTextForItem(root);
- if (!text) {
- continue;
- }
-
- let newCurSize = currentDataSize + text.length;
- let newChunks = currentChunks + 1;
-
- if (newCurSize > MAX_REQUEST_DATA ||
- newChunks > MAX_REQUEST_CHUNKS) {
-
- // If we've reached the API limits, let's stop accumulating data
- // for this request and return. We return information useful for
- // the caller to pass back on the next call, so that the function
- // can keep working from where it stopped.
- return {
- data: output,
- finished: false,
- lastIndex: i
- };
- }
-
- currentDataSize = newCurSize;
- currentChunks = newChunks;
- output.push([root, text]);
- }
-
- return {
- data: output,
- finished: true,
- lastIndex: 0
- };
- }
-};
-
-/**
- * Represents a request (for 1 chunk) sent off to Yandex's service.
- *
- * @param translationData The data to be used for this translation,
- * generated by the generateNextTranslationRequest...
- * function.
- * @param sourceLanguage The source language of the document.
- * @param targetLanguage The target language for the translation.
- *
- */
-function YandexRequest(translationData, sourceLanguage, targetLanguage) {
- this.translationData = translationData;
- this.sourceLanguage = sourceLanguage;
- this.targetLanguage = targetLanguage;
- this.characterCount = 0;
-}
-
-YandexRequest.prototype = {
- /**
- * Initiates the request
- */
- fireRequest: function() {
- return Task.spawn(function *() {
- // Prepare URL.
- let url = getUrlParam("https://translate.yandex.net/api/v1.5/tr.json/translate",
- "browser.translation.yandex.translateURLOverride");
-
- // Prepare the request body.
- let apiKey = getUrlParam("%YANDEX_API_KEY%", "browser.translation.yandex.apiKeyOverride");
- let params = [
- ["key", apiKey],
- ["format", "html"],
- ["lang", this.sourceLanguage + "-" + this.targetLanguage],
- ];
-
- for (let [, text] of this.translationData) {
- params.push(["text", text]);
- this.characterCount += text.length;
- }
-
- // Set up request options.
- let deferred = Promise.defer();
- let options = {
- onLoad: (function(responseText, xhr) {
- deferred.resolve(this);
- }).bind(this),
- onError: function(e, responseText, xhr) {
- deferred.reject(xhr);
- },
- postData: params
- };
-
- // Fire the request.
- this.networkRequest = httpRequest(url, options);
-
- return deferred.promise;
- }.bind(this));
- }
-};
-
-/**
- * Fetch a parameter value (such as the service URL or the API key), which
- * may be overridden by a pref if it's set.
- */
-function getUrlParam(paramValue, prefName) {
- if (Services.prefs.getPrefType(prefName))
- paramValue = Services.prefs.getCharPref(prefName);
- paramValue = Services.urlFormatter.formatURL(paramValue);
- return paramValue;
-}
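
The removed module above splits a document into batched requests, as described in the `translate()` and `_generateNextTranslationRequest()` comments. The standalone sketch below illustrates only that partitioning idea in plain JavaScript; the limit values, the `chunkItems` name and the plain-string input are illustrative assumptions, not the constants or data structures used by the removed code (which batches [root, text] pairs under MAX_REQUEST_DATA and MAX_REQUEST_CHUNKS).

// Illustrative sketch only: batch text items under size/count limits,
// mirroring the idea behind _generateNextTranslationRequest().
// The limit values below are assumptions, not the removed module's constants.
const EXAMPLE_MAX_DATA = 5000;   // max total characters per request (assumed)
const EXAMPLE_MAX_CHUNKS = 100;  // max text items per request (assumed)

function chunkItems(items) {
  let batches = [];
  let current = [];
  let currentSize = 0;

  for (let text of items) {
    if (!text) {
      continue; // skip empty items, as the original skips roots with no text
    }
    // Close the current batch before it would exceed either limit.
    if (current.length &&
        (currentSize + text.length > EXAMPLE_MAX_DATA ||
         current.length + 1 > EXAMPLE_MAX_CHUNKS)) {
      batches.push(current);
      current = [];
      currentSize = 0;
    }
    current.push(text);
    currentSize += text.length;
  }
  if (current.length) {
    batches.push(current);
  }
  return batches;
}

// Example: a few short strings fit comfortably into a single batch.
// chunkItems(["Hello", "world", "!"]) -> [["Hello", "world", "!"]]
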
diff --git a/application/basilisk/components/translation/jar.mn b/application/basilisk/components/translation/jar.mn
deleted file mode 100644
index be744cb9e..000000000
--- a/application/basilisk/components/translation/jar.mn
+++ /dev/null
@@ -1,6 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-browser.jar:
- content/browser/translation-infobar.xml
- content/browser/microsoft-translator-attribution.png
diff --git a/application/basilisk/components/translation/microsoft-translator-attribution.png b/application/basilisk/components/translation/microsoft-translator-attribution.png
deleted file mode 100644
index d9d277461..000000000
--- a/application/basilisk/components/translation/microsoft-translator-attribution.png
+++ /dev/null
Binary files differ
diff --git a/application/basilisk/components/translation/moz.build b/application/basilisk/components/translation/moz.build
index ac0165230..32421e430 100644
--- a/application/basilisk/components/translation/moz.build
+++ b/application/basilisk/components/translation/moz.build
@@ -3,14 +3,7 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
EXTRA_JS_MODULES.translation = [
- 'BingTranslator.jsm',
'cld2/cld-worker.js',
'cld2/cld-worker.js.mem',
'LanguageDetector.jsm',
- 'Translation.jsm',
- 'TranslationContentHandler.jsm',
- 'TranslationDocument.jsm',
- 'YandexTranslator.jsm'
]
-
-JAR_MANIFESTS += ['jar.mn']
diff --git a/application/basilisk/components/translation/translation-infobar.xml b/application/basilisk/components/translation/translation-infobar.xml
deleted file mode 100644
index db0695c03..000000000
--- a/application/basilisk/components/translation/translation-infobar.xml
+++ /dev/null
@@ -1,441 +0,0 @@
-<?xml version="1.0"?>
-<!-- This Source Code Form is subject to the terms of the Mozilla Public
- - License, v. 2.0. If a copy of the MPL was not distributed with this
- - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
-
-<!DOCTYPE bindings [
-<!ENTITY % notificationDTD SYSTEM "chrome://global/locale/notification.dtd">
-%notificationDTD;
-<!ENTITY % translationDTD SYSTEM "chrome://browser/locale/translation.dtd" >
-%translationDTD;
-]>
-
-<bindings id="translationBindings"
- xmlns="http://www.mozilla.org/xbl"
- xmlns:xul="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
- xmlns:xbl="http://www.mozilla.org/xbl">
- <binding id="translationbar" extends="chrome://global/content/bindings/notification.xml#notification" role="xul:alert">
- <resources>
- <stylesheet src="chrome://global/skin/notification.css"/>
- </resources>
- <content>
- <xul:hbox class="notification-inner" flex="1" xbl:inherits="type">
- <xul:hbox anonid="details" align="center" flex="1">
- <xul:image class="translate-infobar-element messageImage"
- anonid="messageImage"/>
- <xul:panel anonid="welcomePanel" class="translation-welcome-panel"
- type="arrow" align="start">
- <xul:image class="translation-welcome-logo"/>
- <xul:vbox flex="1" class="translation-welcome-content">
- <xul:description class="translation-welcome-headline"
- anonid="welcomeHeadline"/>
- <xul:description class="translation-welcome-body" anonid="welcomeBody"/>
- <xul:hbox align="center">
- <xul:label anonid="learnMore" class="plain text-link"
- onclick="openUILinkIn('https://support.mozilla.org/kb/automatic-translation', 'tab'); this.parentNode.parentNode.parentNode.hidePopup();"/>
- <xul:spacer flex="1"/>
- <xul:button class="translate-infobar-element" anonid="thanksButton"
- onclick="this.parentNode.parentNode.parentNode.hidePopup();"/>
- </xul:hbox>
- </xul:vbox>
- </xul:panel>
- <xul:deck anonid="translationStates" selectedIndex="0">
-
- <!-- offer to translate -->
- <xul:hbox class="translate-offer-box" align="center">
- <xul:label class="translate-infobar-element" value="&translation.thisPageIsIn.label;"/>
- <xul:menulist class="translate-infobar-element" anonid="detectedLanguage">
- <xul:menupopup/>
- </xul:menulist>
- <xul:label class="translate-infobar-element" value="&translation.translateThisPage.label;"/>
- <xul:button class="translate-infobar-element"
- label="&translation.translate.button;"
- anonid="translate"
- oncommand="document.getBindingParent(this).translate();"/>
- <xul:button class="translate-infobar-element"
- label="&translation.notNow.button;" anonid="notNow"
- oncommand="document.getBindingParent(this).closeCommand();"/>
- </xul:hbox>
-
- <!-- translating -->
- <xul:vbox class="translating-box" pack="center">
- <xul:label class="translate-infobar-element"
- value="&translation.translatingContent.label;"/>
- </xul:vbox>
-
- <!-- translated -->
- <xul:hbox class="translated-box" align="center">
- <xul:label class="translate-infobar-element"
- value="&translation.translatedFrom.label;"/>
- <xul:menulist class="translate-infobar-element"
- anonid="fromLanguage"
- oncommand="document.getBindingParent(this).translate()">
- <xul:menupopup/>
- </xul:menulist>
- <xul:label class="translate-infobar-element"
- value="&translation.translatedTo.label;"/>
- <xul:menulist class="translate-infobar-element"
- anonid="toLanguage"
- oncommand="document.getBindingParent(this).translate()">
- <xul:menupopup/>
- </xul:menulist>
- <xul:label class="translate-infobar-element"
- value="&translation.translatedToSuffix.label;"/>
- <xul:button anonid="showOriginal"
- class="translate-infobar-element"
- label="&translation.showOriginal.button;"
- oncommand="document.getBindingParent(this).showOriginal();"/>
- <xul:button anonid="showTranslation"
- class="translate-infobar-element"
- label="&translation.showTranslation.button;"
- oncommand="document.getBindingParent(this).showTranslation();"/>
- </xul:hbox>
-
- <!-- error -->
- <xul:hbox class="translation-error" align="center">
- <xul:label class="translate-infobar-element"
- value="&translation.errorTranslating.label;"/>
- <xul:button class="translate-infobar-element"
- label="&translation.tryAgain.button;"
- anonid="tryAgain"
- oncommand="document.getBindingParent(this).translate();"/>
- </xul:hbox>
-
- <!-- unavailable -->
- <xul:vbox class="translation-unavailable" pack="center">
- <xul:label class="translate-infobar-element"
- value="&translation.serviceUnavailable.label;"/>
- </xul:vbox>
-
- </xul:deck>
- <xul:spacer flex="1"/>
-
- <xul:button type="menu"
- class="translate-infobar-element options-menu-button"
- anonid="options"
- label="&translation.options.menu;">
- <xul:menupopup class="translation-menupopup cui-widget-panel cui-widget-panelview
- cui-widget-panelWithFooter PanelUI-subView"
- onpopupshowing="document.getBindingParent(this).optionsShowing();">
- <xul:menuitem anonid="neverForLanguage"
- oncommand="document.getBindingParent(this).neverForLanguage();"/>
- <xul:menuitem anonid="neverForSite"
- oncommand="document.getBindingParent(this).neverForSite();"
- label="&translation.options.neverForSite.label;"
- accesskey="&translation.options.neverForSite.accesskey;"/>
- <xul:menuseparator/>
- <xul:menuitem oncommand="openPreferences('paneContent');"
- label="&translation.options.preferences.label;"
- accesskey="&translation.options.preferences.accesskey;"/>
- <xul:menuitem class="subviewbutton panel-subview-footer"
- oncommand="document.getBindingParent(this).openProviderAttribution();">
- <xul:deck anonid="translationEngine" selectedIndex="0">
- <xul:hbox class="translation-attribution">
- <xul:label>&translation.options.attribution.beforeLogo;</xul:label>
- <xul:image src="chrome://browser/content/microsoft-translator-attribution.png"
- aria-label="Microsoft Translator"/>
- <xul:label>&translation.options.attribution.afterLogo;</xul:label>
- </xul:hbox>
- <xul:label class="translation-attribution">&translation.options.attribution.yandexTranslate;</xul:label>
- </xul:deck>
- </xul:menuitem>
- </xul:menupopup>
- </xul:button>
-
- </xul:hbox>
- <xul:toolbarbutton ondblclick="event.stopPropagation();"
- anonid="closeButton"
- class="messageCloseButton close-icon tabbable"
- xbl:inherits="hidden=hideclose"
- tooltiptext="&closeNotification.tooltip;"
- oncommand="document.getBindingParent(this).closeCommand();"/>
- </xul:hbox>
- </content>
- <implementation>
- <property name="state"
- onget="return this._getAnonElt('translationStates').selectedIndex;">
- <setter>
- <![CDATA[
- let deck = this._getAnonElt('translationStates');
-
- let activeElt = document.activeElement;
- if (activeElt && deck.contains(activeElt))
- activeElt.blur();
-
- let stateName;
- for (let name of ["OFFER", "TRANSLATING", "TRANSLATED", "ERROR"]) {
- if (Translation["STATE_" + name] == val) {
- stateName = name.toLowerCase();
- break;
- }
- }
- this.setAttribute("state", stateName);
-
- if (val == Translation.STATE_TRANSLATED)
- this._handleButtonHiding();
-
- deck.selectedIndex = val;
- ]]>
- </setter>
- </property>
-
- <method name="init">
- <parameter name="aTranslation"/>
- <body>
- <![CDATA[
- this.translation = aTranslation;
- let bundle = Cc["@mozilla.org/intl/stringbundle;1"]
- .getService(Ci.nsIStringBundleService)
- .createBundle("chrome://global/locale/languageNames.properties");
- let sortByLocalizedName = function(aList) {
- return aList.map(code => [code, bundle.GetStringFromName(code)])
- .sort((a, b) => a[1].localeCompare(b[1]));
- };
-
- // Fill the lists of supported source languages.
- let detectedLanguage = this._getAnonElt("detectedLanguage");
- let fromLanguage = this._getAnonElt("fromLanguage");
- let sourceLanguages =
- sortByLocalizedName(Translation.supportedSourceLanguages);
- for (let [code, name] of sourceLanguages) {
- detectedLanguage.appendItem(name, code);
- fromLanguage.appendItem(name, code);
- }
- detectedLanguage.value = this.translation.detectedLanguage;
-
- // translatedFrom is only set if we have already translated this page.
- if (aTranslation.translatedFrom)
- fromLanguage.value = aTranslation.translatedFrom;
-
- // Fill the list of supported target languages.
- let toLanguage = this._getAnonElt("toLanguage");
- let targetLanguages =
- sortByLocalizedName(Translation.supportedTargetLanguages);
- for (let [code, name] of targetLanguages)
- toLanguage.appendItem(name, code);
-
- if (aTranslation.translatedTo)
- toLanguage.value = aTranslation.translatedTo;
-
- if (aTranslation.state)
- this.state = aTranslation.state;
-
- // Show attribution for the preferred translator.
- let engineIndex = Object.keys(Translation.supportedEngines)
- .indexOf(Translation.translationEngine);
- if (engineIndex != -1) {
- this._getAnonElt('translationEngine').selectedIndex = engineIndex;
- }
-
- const kWelcomePref = "browser.translation.ui.welcomeMessageShown";
- if (Services.prefs.prefHasUserValue(kWelcomePref) ||
- this.translation.browser != gBrowser.selectedBrowser)
- return;
-
- this.addEventListener("transitionend", function onShown() {
- this.removeEventListener("transitionend", onShown);
-
- // These strings are hardcoded because they need to reach beta
- // without riding the trains.
- let localizedStrings = {
- en: ["Hey look! It's something new!",
- "Now the Web is even more accessible with our new in-page translation feature. Click the translate button to try it!",
- "Learn more.",
- "Thanks"],
- "es-AR": ["\xA1Mir\xE1! \xA1Hay algo nuevo!",
- "Ahora la web es a\xFAn m\xE1s accesible con nuestra nueva funcionalidad de traducci\xF3n integrada. \xA1Hac\xE9 clic en el bot\xF3n traducir para probarla!",
- "Conoc\xE9 m\xE1s.",
- "Gracias"],
- "es-ES": ["\xA1Mira! \xA1Hay algo nuevo!",
- "Con la nueva funcionalidad de traducci\xF3n integrada, ahora la Web es a\xFAn m\xE1s accesible. \xA1Pulsa el bot\xF3n Traducir y pru\xE9bala!",
- "M\xE1s informaci\xF3n.",
- "Gracias"],
- pl: ["Sp\xF3jrz tutaj! To co\u015B nowego!",
- "Sie\u0107 sta\u0142a si\u0119 w\u0142a\u015Bnie jeszcze bardziej dost\u0119pna dzi\u0119ki opcji bezpo\u015Bredniego t\u0142umaczenia stron. Kliknij przycisk t\u0142umaczenia, aby spr\xF3bowa\u0107!",
- "Dowiedz si\u0119 wi\u0119cej",
- "Dzi\u0119kuj\u0119"],
- tr: ["Bak\u0131n, burada yeni bir \u015Fey var!",
- "Yeni sayfa i\xE7i \xE7eviri \xF6zelli\u011Fimiz sayesinde Web art\u0131k \xE7ok daha anla\u015F\u0131l\u0131r olacak. Denemek i\xE7in \xC7evir d\xFC\u011Fmesine t\u0131klay\u0131n!",
- "Daha fazla bilgi al\u0131n.",
- "Te\u015Fekk\xFCrler"],
- vi: ["Nh\xECn n\xE0y! \u0110\u1ED3 m\u1EDBi!",
- "Gi\u1EDD \u0111\xE2y ch\xFAng ta c\xF3 th\u1EC3 ti\u1EBFp c\u1EADn web d\u1EC5 d\xE0ng h\u01A1n n\u1EEFa v\u1EDBi t\xEDnh n\u0103ng d\u1ECBch ngay trong trang. Hay nh\u1EA5n n\xFAt d\u1ECBch \u0111\u1EC3 th\u1EED!",
- "T\xECm hi\u1EC3u th\xEAm.",
- "C\u1EA3m \u01A1n"]
- };
-
- let locale = Cc["@mozilla.org/chrome/chrome-registry;1"]
- .getService(Ci.nsIXULChromeRegistry)
- .getSelectedLocale("browser");
- if (!(locale in localizedStrings))
- locale = "en";
- let strings = localizedStrings[locale];
-
- this._getAnonElt("welcomeHeadline").setAttribute("value", strings[0]);
- this._getAnonElt("welcomeBody").textContent = strings[1];
- this._getAnonElt("learnMore").setAttribute("value", strings[2]);
- this._getAnonElt("thanksButton").setAttribute("label", strings[3]);
-
- let panel = this._getAnonElt("welcomePanel");
- panel.openPopup(this._getAnonElt("messageImage"),
- "bottomcenter topleft");
-
- Services.prefs.setBoolPref(kWelcomePref, true);
- });
- ]]>
- </body>
- </method>
-
- <method name="_getAnonElt">
- <parameter name="aAnonId"/>
- <body>
- return document.getAnonymousElementByAttribute(this, "anonid", aAnonId);
- </body>
- </method>
-
- <method name="translate">
- <body>
- <![CDATA[
- if (this.state == Translation.STATE_OFFER) {
- this._getAnonElt("fromLanguage").value =
- this._getAnonElt("detectedLanguage").value;
- this._getAnonElt("toLanguage").value =
- Translation.defaultTargetLanguage;
- }
-
- this.translation.translate(this._getAnonElt("fromLanguage").value,
- this._getAnonElt("toLanguage").value);
- ]]>
- </body>
- </method>
-
-    <!-- To be called when the infobar should be closed per the user's wish
-         (e.g. by clicking the notification's close button). -->
- <method name="closeCommand">
- <body>
- <![CDATA[
- this.close();
- this.translation.infobarClosed();
- ]]>
- </body>
- </method>
- <method name="_handleButtonHiding">
- <body>
- <![CDATA[
- let originalShown = this.translation.originalShown;
- this._getAnonElt("showOriginal").hidden = originalShown;
- this._getAnonElt("showTranslation").hidden = !originalShown;
- ]]>
- </body>
- </method>
-
- <method name="showOriginal">
- <body>
- <![CDATA[
- this.translation.showOriginalContent();
- this._handleButtonHiding();
- ]]>
- </body>
- </method>
-
- <method name="showTranslation">
- <body>
- <![CDATA[
- this.translation.showTranslatedContent();
- this._handleButtonHiding();
- ]]>
- </body>
- </method>
-
- <method name="optionsShowing">
- <body>
- <![CDATA[
- // Get the source language name.
- let lang;
- if (this.state == Translation.STATE_OFFER)
- lang = this._getAnonElt("detectedLanguage").value;
- else {
- lang = this._getAnonElt("fromLanguage").value;
-
- // If we have never attempted to translate the page before the
- // service became unavailable, "fromLanguage" isn't set.
- if (!lang && this.state == Translation.STATE_UNAVAILABLE)
- lang = this.translation.detectedLanguage;
- }
-
- let langBundle =
- Cc["@mozilla.org/intl/stringbundle;1"]
- .getService(Ci.nsIStringBundleService)
- .createBundle("chrome://global/locale/languageNames.properties");
- let langName = langBundle.GetStringFromName(lang);
-
- // Set the label and accesskey on the menuitem.
- let bundle =
- Cc["@mozilla.org/intl/stringbundle;1"]
- .getService(Ci.nsIStringBundleService)
- .createBundle("chrome://browser/locale/translation.properties");
- let item = this._getAnonElt("neverForLanguage");
- const kStrId = "translation.options.neverForLanguage";
- item.setAttribute("label",
- bundle.formatStringFromName(kStrId + ".label",
- [langName], 1));
- item.setAttribute("accesskey",
- bundle.GetStringFromName(kStrId + ".accesskey"));
- item.langCode = lang;
-
- // We may need to disable the menuitems if they have already been used.
- // Check if translation is already disabled for this language:
- let neverForLangs =
- Services.prefs.getCharPref("browser.translation.neverForLanguages");
- item.disabled = neverForLangs.split(",").indexOf(lang) != -1;
-
- // Check if translation is disabled for the domain:
- let uri = this.translation.browser.currentURI;
- let perms = Services.perms;
- item = this._getAnonElt("neverForSite");
- item.disabled =
- perms.testExactPermission(uri, "translate") == perms.DENY_ACTION;
- ]]>
- </body>
- </method>
-
- <method name="neverForLanguage">
- <body>
- <![CDATA[
- const kPrefName = "browser.translation.neverForLanguages";
-
- let val = Services.prefs.getCharPref(kPrefName);
- if (val)
- val += ",";
- val += this._getAnonElt("neverForLanguage").langCode;
-
- Services.prefs.setCharPref(kPrefName, val);
-
- this.closeCommand();
- ]]>
- </body>
- </method>
-
- <method name="neverForSite">
- <body>
- <![CDATA[
- let uri = this.translation.browser.currentURI;
- let perms = Services.perms;
- perms.add(uri, "translate", perms.DENY_ACTION);
-
- this.closeCommand();
- ]]>
- </body>
- </method>
-
- <method name="openProviderAttribution">
- <body>
- <![CDATA[
- Translation.openProviderAttribution();
- ]]>
- </body>
- </method>
-
- </implementation>
- </binding>
-</bindings>
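
The binding above drives its UI through the `state` property and the Translation.STATE_* constants, while the translator's `translate()` promise (earlier in this diff) resolves with a character count or rejects with "unavailable" or "failure". The sketch below is hypothetical glue showing how a caller might connect the two; the real wiring lived in the Translation.jsm module that this commit also removes, and the `runTranslation` name and its arguments are assumptions for illustration.

// Hypothetical glue (for illustration only; the real logic lived in the
// removed Translation.jsm): run a translator and reflect the outcome in
// the infobar's state.
function runTranslation(translator, infobar) {
  infobar.state = Translation.STATE_TRANSLATING;
  return translator.translate().then(
    result => {
      // At least one chunk succeeded: show the "translated" UI.
      infobar.state = Translation.STATE_TRANSLATED;
      return result.characterCount;
    },
    reason => {
      // "unavailable" means the service itself is unusable (permanent errors);
      // anything else is treated as a transient failure.
      infobar.state = reason == "unavailable" ? Translation.STATE_UNAVAILABLE
                                              : Translation.STATE_ERROR;
    }
  );
}
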
diff --git a/application/basilisk/installer/package-manifest.in b/application/basilisk/installer/package-manifest.in
index c446d3687..35060ea5e 100644
--- a/application/basilisk/installer/package-manifest.in
+++ b/application/basilisk/installer/package-manifest.in
@@ -35,7 +35,9 @@
#ifdef XP_MACOSX
; Mac bundle stuff
@APPNAME@/Contents/Info.plist
+#ifdef MOZ_UPDATER
@APPNAME@/Contents/Library/LaunchServices
+#endif
@APPNAME@/Contents/PkgInfo
@RESPATH@/firefox.icns
@RESPATH@/document.icns
@@ -114,7 +116,7 @@
@BINPATH@/browser/VisualElements/VisualElements_150.png
@BINPATH@/browser/VisualElements/VisualElements_70.png
#else
-@BINPATH@/@MOZ_APP_NAME@-bin
+@RESPATH@/@MOZ_APP_NAME@-bin
@BINPATH@/@MOZ_APP_NAME@
#endif
@RESPATH@/application.ini
diff --git a/application/basilisk/installer/windows/Makefile.in b/application/basilisk/installer/windows/Makefile.in
index d5580b53d..bab6ded1a 100644
--- a/application/basilisk/installer/windows/Makefile.in
+++ b/application/basilisk/installer/windows/Makefile.in
@@ -79,10 +79,6 @@ $(CONFIG_DIR)/setup.exe::
$(PYTHON) $(topsrcdir)/toolkit/mozapps/installer/windows/nsis/preprocess-locale.py \
--preprocess-locale $(topsrcdir) \
$(PPL_LOCALE_ARGS) $(AB_CD) $(CONFIG_DIR)
- $(PYTHON) $(topsrcdir)/toolkit/mozapps/installer/windows/nsis/preprocess-locale.py \
- --preprocess-single-file $(topsrcdir) \
- $(PPL_LOCALE_ARGS) $(CONFIG_DIR) \
- nsisstrings.properties nsisstrings.nlf
GARBARGE_DIRS += instgen
diff --git a/application/basilisk/installer/windows/nsis/defines.nsi.in b/application/basilisk/installer/windows/nsis/defines.nsi.in
index 5ad9b7966..dbb58ca33 100644
--- a/application/basilisk/installer/windows/nsis/defines.nsi.in
+++ b/application/basilisk/installer/windows/nsis/defines.nsi.in
@@ -3,23 +3,6 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-# Defining FunnelcakeVersion will append the value of StubURLVersionAppend to
-# StubURLVersion, append the value of URLManualDownloadAppend to
-# URLManualDownload, and append the value of URLStubDownloadAppend to
-# URLStubDownload. The value of FunnelcakeVersion should not be defined when it
-# is not used and when it is defined its value should never be empty.
-# !define FunnelcakeVersion "999"
-
-!ifdef FunnelcakeVersion
-!define URLManualDownloadAppend "&f=${FunnelcakeVersion}"
-!define URLStubDownloadAppend "-f${FunnelcakeVersion}"
-!define StubURLVersionAppend "-${FunnelcakeVersion}"
-!else
-!define URLManualDownloadAppend ""
-!define URLStubDownloadAppend ""
-!define StubURLVersionAppend ""
-!endif
-
# These defines should match application.ini settings
!define AppName "Basilisk"
!define AppVersion "@APP_VERSION@"
@@ -84,13 +67,3 @@ VIAddVersionKey "FileVersion" "${AppVersion}"
VIAddVersionKey "ProductVersion" "${AppVersion}"
# Comments is not used but left below commented out for future reference
# VIAddVersionKey "Comments" "Comments"
-
-# Control positions in Dialog Units so they are placed correctly with
-# non-default DPI settings
-!define OPTIONS_ITEM_EDGE_DU 90u
-!define OPTIONS_ITEM_WIDTH_DU 356u
-!define OPTIONS_SUBITEM_EDGE_DU 119u
-!define OPTIONS_SUBITEM_WIDTH_DU 327u
-!define INSTALL_BLURB_TOP_DU 78u
-!define APPNAME_BMP_EDGE_DU 19u
-!define APPNAME_BMP_TOP_DU 12u
diff --git a/application/basilisk/locales/en-US/chrome/browser/browser.dtd b/application/basilisk/locales/en-US/chrome/browser/browser.dtd
index f75aa46a7..09da91dee 100644
--- a/application/basilisk/locales/en-US/chrome/browser/browser.dtd
+++ b/application/basilisk/locales/en-US/chrome/browser/browser.dtd
@@ -207,8 +207,6 @@ These should match what Safari and other Apple applications use on OS X Lion. --
<!ENTITY urlbar.webRTCShareScreenNotificationAnchor.tooltip "Manage sharing your windows or screen with the site">
<!ENTITY urlbar.servicesNotificationAnchor.tooltip "Open install message panel">
-<!ENTITY urlbar.translateNotificationAnchor.tooltip "Translate this page">
-<!ENTITY urlbar.translatedNotificationAnchor.tooltip "Manage page translation">
<!ENTITY urlbar.emeNotificationAnchor.tooltip "Manage use of DRM software">
<!ENTITY urlbar.cameraBlocked.tooltip "You have blocked your camera for this website.">
diff --git a/application/basilisk/locales/en-US/chrome/browser/preferences/content.dtd b/application/basilisk/locales/en-US/chrome/browser/preferences/content.dtd
index 5d58ffa37..d1d83b3bc 100644
--- a/application/basilisk/locales/en-US/chrome/browser/preferences/content.dtd
+++ b/application/basilisk/locales/en-US/chrome/browser/preferences/content.dtd
@@ -38,21 +38,6 @@
<!ENTITY chooseButton.label "Choose…">
<!ENTITY chooseButton.accesskey "o">
-<!ENTITY translateWebPages.label "Translate web content">
-<!ENTITY translateWebPages.accesskey "T">
-<!ENTITY translateExceptions.label "Exceptions…">
-<!ENTITY translateExceptions.accesskey "x">
-
-<!-- LOCALIZATION NOTE (translation.options.attribution.beforeLogo,
- - translation.options.attribution.afterLogo):
- - These 2 strings are displayed before and after a 'Microsoft Translator'
- - logo.
- - The translations for these strings should match the translations in
- - browser/translation.dtd
- -->
-<!ENTITY translation.options.attribution.beforeLogo "Translations by">
-<!ENTITY translation.options.attribution.afterLogo "">
-
<!ENTITY drmContent.label "DRM content">
<!ENTITY playDRMContent.label "Play DRM content">
diff --git a/application/basilisk/locales/en-US/chrome/browser/preferences/preferences.properties b/application/basilisk/locales/en-US/chrome/browser/preferences/preferences.properties
index da40f40a9..84fac427d 100644
--- a/application/basilisk/locales/en-US/chrome/browser/preferences/preferences.properties
+++ b/application/basilisk/locales/en-US/chrome/browser/preferences/preferences.properties
@@ -21,8 +21,8 @@ acceptVeryLargeMinimumFont=Keep my changes anyway
trackingprotectionpermissionstext=You have disabled Tracking Protection on these sites.
trackingprotectionpermissionstitle=Exceptions - Tracking Protection
-cookiepermissionstext=You can specify which websites are always or never allowed to use cookies. Type the exact address of the site you want to manage and then click Block, Allow for Session, or Allow.
-cookiepermissionstitle=Exceptions - Cookies
+cookiepermissionstext=You can specify which websites are always or never allowed to store cookies and site data. Type the exact address of the site you want to manage and then click Block, Allow for Session, or Allow.
+cookiepermissionstitle=Exceptions - Cookies and Site Data
addonspermissionstext=You can specify which websites are allowed to install add-ons. Type the exact address of the site you want to allow and then click Allow.
addons_permissions_title=Allowed Sites - Add-ons Installation
popuppermissionstext=You can specify which websites are allowed to open pop-up windows. Type the exact address of the site you want to allow and then click Allow.
@@ -36,7 +36,7 @@ savedLoginsExceptions_desc=Logins for the following sites will not be saved:
#### Block List Manager
-blockliststext=You can choose which list Firefox will use to block Web elements that may track your browsing activity.
+blockliststext=You can choose which list the browser will use to block Web elements that may track your browsing activity.
blockliststitle=Block Lists
# LOCALIZATION NOTE (mozNameTemplate): This template constructs the name of the
# block list in the block lists dialog. It combines the list name and
diff --git a/application/basilisk/locales/en-US/chrome/browser/preferences/privacy.dtd b/application/basilisk/locales/en-US/chrome/browser/preferences/privacy.dtd
index e6a7955fb..f743c4040 100644
--- a/application/basilisk/locales/en-US/chrome/browser/preferences/privacy.dtd
+++ b/application/basilisk/locales/en-US/chrome/browser/preferences/privacy.dtd
@@ -49,10 +49,10 @@
<!ENTITY suggestionSettings.label "Change preferences for search engine suggestions…">
<!ENTITY suggestionSettings.accesskey "g">
-<!ENTITY acceptCookies.label "Accept cookies from sites">
+<!ENTITY acceptCookies.label "Allow sites to store cookies and site data">
<!ENTITY acceptCookies.accesskey "A">
-<!ENTITY acceptThirdParty.pre.label "Accept third-party cookies:">
+<!ENTITY acceptThirdParty.pre.label "Accept third-party cookies and site data:">
<!ENTITY acceptThirdParty.pre.accesskey "y">
<!ENTITY acceptThirdParty.always.label "Always">
<!ENTITY acceptThirdParty.never.label "Never">
@@ -77,7 +77,7 @@
<!ENTITY historyHeader.custom.label "Use custom settings for history">
<!ENTITY historyHeader.post.label "">
-<!ENTITY rememberDescription.label "&brandShortName; will remember your browsing, download, form and search history, and keep cookies from websites you visit.">
+<!ENTITY rememberDescription.label "&brandShortName; will remember your browsing, download, form and search history, and keep cookies and site data from websites you visit.">
<!-- LOCALIZATION NOTE (rememberActions.pre.label): include a trailing space as needed -->
<!-- LOCALIZATION NOTE (rememberActions.middle.label): include a starting and trailing space as needed -->
diff --git a/application/basilisk/locales/en-US/chrome/browser/preferences/translation.dtd b/application/basilisk/locales/en-US/chrome/browser/preferences/translation.dtd
deleted file mode 100644
index fc5fb2b7b..000000000
--- a/application/basilisk/locales/en-US/chrome/browser/preferences/translation.dtd
+++ /dev/null
@@ -1,24 +0,0 @@
-<!-- This Source Code Form is subject to the terms of the Mozilla Public
- - License, v. 2.0. If a copy of the MPL was not distributed with this
- - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
-
-<!ENTITY window.title "Exceptions - Translation">
-<!ENTITY window.width "36em">
-<!ENTITY windowClose.key "w">
-
-<!ENTITY noTranslationForLanguages.label "Translation will not be offered for the following languages:">
-<!ENTITY treehead.languageName.label "Languages">
-<!ENTITY removeLanguage.label "Remove Language">
-<!ENTITY removeLanguage.accesskey "R">
-<!ENTITY removeAllLanguages.label "Remove All Languages">
-<!ENTITY removeAllLanguages.accesskey "e">
-
-<!ENTITY noTranslationForSites.label "Translation will not be offered for the following sites:">
-<!ENTITY treehead.siteName.label "Sites">
-<!ENTITY removeSite.label "Remove Site">
-<!ENTITY removeSite.accesskey "S">
-<!ENTITY removeAllSites.label "Remove All Sites">
-<!ENTITY removeAllSites.accesskey "i">
-
-<!ENTITY button.close.label "Close">
-<!ENTITY button.close.accesskey "C">
diff --git a/application/basilisk/locales/en-US/chrome/browser/translation.dtd b/application/basilisk/locales/en-US/chrome/browser/translation.dtd
deleted file mode 100644
index ca8bb9d51..000000000
--- a/application/basilisk/locales/en-US/chrome/browser/translation.dtd
+++ /dev/null
@@ -1,75 +0,0 @@
-<!-- This Source Code Form is subject to the terms of the Mozilla Public
- - License, v. 2.0. If a copy of the MPL was not distributed with this
- - file, You can obtain one at http://mozilla.org/MPL/2.0/. -->
-
-<!-- LOCALIZATION NOTE (translation.thisPageIsIn.label,
- - translation.translateThisPage.label):
- - These 2 strings are used to construct a sentence that contains a dropdown
- - showing the detected language of the current web page.
- - In en-US it looks like this:
- - This page is in [detected language] Translate this page?
- - "detected language" here is a language name coming from the
- - global/languageNames.properties file; for some locales it may not be in
- - the correct grammar case to keep the same structure of the original
- - sentence. -->
-<!ENTITY translation.thisPageIsIn.label "This page is in">
-<!ENTITY translation.translateThisPage.label "Translate this page?">
-<!ENTITY translation.translate.button "Translate">
-<!ENTITY translation.notNow.button "Not Now">
-
-<!ENTITY translation.translatingContent.label "Translating page content…">
-
-<!-- LOCALIZATION NOTE (translation.translatedFrom.label,
- - translation.translatedTo.label,
- - translation.translatedToSuffix.label):
- - These 3 strings are used to construct a sentence that contains 2 dropdowns
- - showing the source and target language of a translated web page.
- - In en-US it looks like this:
- - This page has been translated from [from language] to [to language]
- - "from language" and "to language" here are language names coming from the
- - global/languageNames.properties file; for some locales they may not be in
- - the correct grammar case to keep the same structure of the original
- - sentence.
- -
- - translation.translatedToSuffix.label (empty in en-US) is for locales that
- - need to display some text after the second drop down for the sentence to
- - be grammatically correct. -->
-<!ENTITY translation.translatedFrom.label "This page has been translated from">
-<!ENTITY translation.translatedTo.label "to">
-<!ENTITY translation.translatedToSuffix.label "">
-
-<!ENTITY translation.showOriginal.button "Show Original">
-<!ENTITY translation.showTranslation.button "Show Translation">
-
-<!ENTITY translation.errorTranslating.label "There has been an error translating this page.">
-<!ENTITY translation.tryAgain.button "Try Again">
-
-<!ENTITY translation.serviceUnavailable.label "Translation is not available at the moment. Please try again later.">
-
-<!ENTITY translation.options.menu "Options">
-<!-- LOCALIZATION NOTE (translation.options.neverForSite.accesskey,
- - translation.options.preferences.accesskey):
- - The accesskey values used here should not clash with the value used for
- - translation.options.neverForLanguage.accesskey in translation.properties
- -->
-<!ENTITY translation.options.neverForSite.label "Never translate this site">
-<!ENTITY translation.options.neverForSite.accesskey "e">
-<!ENTITY translation.options.preferences.label "Translation preferences">
-<!ENTITY translation.options.preferences.accesskey "T">
-
-<!-- LOCALIZATION NOTE (translation.options.attribution.beforeLogo,
- - translation.options.attribution.afterLogo):
- - These 2 strings are displayed before and after a 'Microsoft Translator'
- - logo.
- -->
-<!ENTITY translation.options.attribution.beforeLogo "Translations by">
-<!ENTITY translation.options.attribution.afterLogo "">
-
-<!-- LOCALIZATION NOTE (translation.options.attribution.poweredByYandex,
- translation.options.attribution.beforeLogo,
- - translation.options.attribution.afterLogo):
- - translation.options.attribution.poweredByYandex is displayed instead of
- - the other two strings when yandex translation engine is preferred by the
- - user.
- -->
-<!ENTITY translation.options.attribution.yandexTranslate "Powered by Yandex.Translate">
diff --git a/application/basilisk/locales/en-US/chrome/browser/translation.properties b/application/basilisk/locales/en-US/chrome/browser/translation.properties
deleted file mode 100644
index e62edbd0a..000000000
--- a/application/basilisk/locales/en-US/chrome/browser/translation.properties
+++ /dev/null
@@ -1,12 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-# LOCALIZATION NOTE (translation.options.neverForLanguage.label):
-# %S is a language name coming from the global/languageNames.properties file.
-translation.options.neverForLanguage.label=Never translate %S
-
-# LOCALIZATION NOTE (translation.options.neverForLanguage.accesskey):
-# The accesskey value used here should not clash with the values used for
-# translation.options.*.accesskey in translation.dtd
-translation.options.neverForLanguage.accesskey=N
diff --git a/application/basilisk/locales/en-US/installer/nsisstrings.properties b/application/basilisk/locales/en-US/installer/nsisstrings.properties
deleted file mode 100644
index 389405696..000000000
--- a/application/basilisk/locales/en-US/installer/nsisstrings.properties
+++ /dev/null
@@ -1,67 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-# LOCALIZATION NOTE:
-
-# This file must be saved as UTF8
-
-# Accesskeys are defined by prefixing the letter that is to be used for the
-# accesskey with an ampersand (e.g. &).
-
-# Do not replace $BrandShortName, $BrandFullName, or $BrandFullNameDA with a
-# custom string and always use the same one as used by the en-US files.
-# $BrandFullNameDA allows the string to contain an ampersand (e.g. DA stands
-# for double ampersand) and prevents the letter following the ampersand from
-# being used as an accesskey.
-
-# You can use \n to create a newline in the string but only when the string
-# from en-US contains a \n.
-
-WIN_CAPTION=$BrandShortName Setup
-
-INTRO_BLURB1=Thanks for choosing $BrandFullName, the browser that chooses you above everything else.
-INSTALL_BLURB1=You're about to enjoy the very latest in speed, flexibility and security so you're always in control.
-INSTALL_BLURB2=That's because $BrandShortName is made by a non-profit to make browsing and the Web better for you.
-INSTALL_BLURB3=You're also joining a global community of users, contributors and developers working to make the best browser in the world.
-
-WARN_MIN_SUPPORTED_OSVER_MSG=Sorry, $BrandShortName can't be installed. This version of $BrandShortName requires ${MinSupportedVer} or newer. Please click the OK button for additional information.
-WARN_MIN_SUPPORTED_CPU_MSG=Sorry, $BrandShortName can't be installed. This version of $BrandShortName requires a processor with ${MinSupportedCPU} support. Please click the OK button for additional information.
-WARN_MIN_SUPPORTED_OSVER_CPU_MSG=Sorry, $BrandShortName can't be installed. This version of $BrandShortName requires ${MinSupportedVer} or newer and a processor with ${MinSupportedCPU} support. Please click the OK button for additional information.
-WARN_WRITE_ACCESS=You don't have access to write to the installation directory.\n\nClick OK to select a different directory.
-WARN_DISK_SPACE=You don't have sufficient disk space to install to this location.\n\nClick OK to select a different location.
-WARN_ROOT_INSTALL=Unable to install to the root of your disk.\n\nClick OK to select a different location.
-WARN_MANUALLY_CLOSE_APP_LAUNCH=$BrandShortName is already running.\n\nPlease close $BrandShortName prior to launching the version you have just installed.
-
-ERROR_DOWNLOAD=Your download was interrupted.\n\nPlease click the OK button to continue.
-
-INSTALL_BUTTON=&Install
-UPGRADE_BUTTON=&Upgrade
-CANCEL_BUTTON=Cancel
-OPTIONS_BUTTON=&Options
-
-MAKE_DEFAULT=&Make $BrandShortName my default browser
-CREATE_SHORTCUTS=Create Shortcuts for $BrandShortName:
-ADD_SC_TASKBAR=On my &Task bar
-ADD_SC_QUICKLAUNCHBAR=On my &Quick Launch bar
-ADD_CheckboxShortcutInStartMenu=In my &Start Menu Programs Folder
-ADD_CheckboxShortcutOnDesktop=On my &Desktop
-SPACE_REQUIRED=Space Required:
-SPACE_AVAILABLE=Space Available:
-ONE_MOMENT_INSTALL=One moment, $BrandShortName will launch as soon as the install is complete…
-ONE_MOMENT_UPGRADE=One moment, $BrandShortName will launch as soon as the upgrade is complete…
-INSTALL_MAINT_SERVICE=&Install the $BrandShortName background update service
-SEND_PING=S&end information about this installation to Mozilla
-BROWSE_BUTTON=B&rowse…
-DEST_FOLDER=Destination Folder
-
-DOWNLOADING_LABEL=Downloading $BrandShortName…
-INSTALLING_LABEL=Installing $BrandShortName…
-UPGRADING_LABEL=Upgrading $BrandShortName…
-
-SELECT_FOLDER_TEXT=Select the folder to install $BrandShortName in.
-
-BYTE=B
-KILO=K
-MEGA=M
-GIGA=G
diff --git a/application/basilisk/locales/jar.mn b/application/basilisk/locales/jar.mn
index 9a847c7ed..5c5a72a1f 100644
--- a/application/basilisk/locales/jar.mn
+++ b/application/basilisk/locales/jar.mn
@@ -42,8 +42,6 @@
locale/browser/shellservice.properties (%chrome/browser/shellservice.properties)
locale/browser/tabbrowser.properties (%chrome/browser/tabbrowser.properties)
locale/browser/taskbar.properties (%chrome/browser/taskbar.properties)
- locale/browser/translation.dtd (%chrome/browser/translation.dtd)
- locale/browser/translation.properties (%chrome/browser/translation.properties)
locale/browser/webrtcIndicator.properties (%chrome/browser/webrtcIndicator.properties)
locale/browser/downloads/downloads.dtd (%chrome/browser/downloads/downloads.dtd)
locale/browser/downloads/downloads.properties (%chrome/browser/downloads/downloads.properties)
@@ -88,7 +86,6 @@
locale/browser/preferences/sync.dtd (%chrome/browser/preferences/sync.dtd)
locale/browser/preferences/tabs.dtd (%chrome/browser/preferences/tabs.dtd)
locale/browser/preferences/search.dtd (%chrome/browser/preferences/search.dtd)
- locale/browser/preferences/translation.dtd (%chrome/browser/preferences/translation.dtd)
locale/browser/syncBrand.dtd (%chrome/browser/syncBrand.dtd)
locale/browser/syncSetup.dtd (%chrome/browser/syncSetup.dtd)
locale/browser/syncSetup.properties (%chrome/browser/syncSetup.properties)
diff --git a/application/basilisk/themes/linux/browser.css b/application/basilisk/themes/linux/browser.css
index f9bd0bbd0..fbc5b651b 100644
--- a/application/basilisk/themes/linux/browser.css
+++ b/application/basilisk/themes/linux/browser.css
@@ -840,80 +840,6 @@ menuitem.bookmark-item {
outline: 1px dotted -moz-DialogText;
}
-/* Translation infobar */
-
-%include ../shared/translation/infobar.inc.css
-
-notification[value="translation"] {
- min-height: 40px;
-}
-
-notification[value="translation"],
-notification[value="translation"] button,
-notification[value="translation"] menulist {
- min-height: 30px;
- color: #5A5959;
-}
-
-notification[value="translation"] {
- background-color: #F2F1F0;
-}
-
-notification[value="translation"] button,
-notification[value="translation"] menulist {
- padding-inline-end: 1ch;
-}
-
-notification[value="translation"] menulist {
- border: 1px solid #C1C1C1;
- background-color: #FFF;
-}
-
-notification[value="translation"] button {
- border: 1px solid #C1C1C1;
- background-color: #F2F1F0;
-}
-
-notification[value="translation"] button,
-notification[value="translation"] menulist,
-notification[value="translation"] menulist > .menulist-label-box {
- margin-inline-start: 1ch;
- margin-inline-end: 1ch;
-}
-
-notification[value="translation"] button:hover,
-notification[value="translation"] button:active,
-notification[value="translation"] menulist:hover,
-notification[value="translation"] menulist:active {
- background-color: #E2E1E0;
-}
-
-notification[value="translation"] button[anonid="translate"] {
- color: #FFF;
- background-image: linear-gradient(#9FB938, #8DA726);
- box-shadow: none;
- border: 1px solid #829C1C;
-}
-
-notification[value="translation"] button[anonid="translate"]:hover,
-notification[value="translation"] button[anonid="translate"]:active {
- background-image: linear-gradient(#8DA726, #8DA726);
-}
-
-notification[value="translation"] button > .button-box,
-notification[value="translation"] button[type="menu"] > .button-box > .button-menu-dropmarker {
- padding: 0;
- margin-inline-start: 3ch;
-}
-
-notification[value="translation"] button:not([type="menu"]) > .button-box {
- margin-inline-end: 3ch;
-}
-
-notification[value="translation"] menulist > .menulist-dropmarker {
- display: block;
-}
-
/* AutoComplete */
%include ../shared/autocomplete.inc.css
diff --git a/application/basilisk/themes/linux/preferences/preferences.css b/application/basilisk/themes/linux/preferences/preferences.css
index 45e2dc23d..5c1b102fa 100644
--- a/application/basilisk/themes/linux/preferences/preferences.css
+++ b/application/basilisk/themes/linux/preferences/preferences.css
@@ -20,12 +20,6 @@
font-weight: bold;
}
-/* Content Pane */
-#translationAttributionImage {
- width: 70px;
- cursor: pointer;
-}
-
/* Modeless Window Dialogs */
.windowDialog,
.windowDialog prefpane {
diff --git a/application/basilisk/themes/osx/browser.css b/application/basilisk/themes/osx/browser.css
index 808bb20b1..70f1f6162 100644
--- a/application/basilisk/themes/osx/browser.css
+++ b/application/basilisk/themes/osx/browser.css
@@ -2920,139 +2920,6 @@ toolbarbutton.chevron > .toolbarbutton-menu-dropmarker {
0 0 3px 2px -moz-mac-focusring;
}
-/* Translation */
-
-%include ../shared/translation/infobar.inc.css
-
-notification[value="translation"] {
- color: #484848;
- background-color: #EFEFEF;
- background-image: none;
- border-top: none;
- border-bottom: 1px solid #c4c4c4;
- padding-top: 1px;
- padding-bottom: 1px;
- min-height: 35px;
-}
-
-.translate-infobar-element {
- margin-top: 0 !important;
- margin-bottom: 0 !important;
-}
-
-button.translate-infobar-element {
- background: linear-gradient(rgba(255, 255, 255, 0.8), rgba(255, 255, 255, 0.1)) repeat scroll 0% 0% padding-box transparent;
- color: #333333;
- border: 1px solid;
- border-color: rgba(23, 51, 78, 0.15) rgba(23, 51, 78, 0.17) rgba(23, 51, 78, 0.2);
- box-shadow: 0px 0px 2px rgba(255, 255, 255, 0.5) inset, 0px 1px 0px rgba(255, 255, 255, 0.2);
- transition-property: background-color, border-color, box-shadow;
- transition-duration: 150ms;
- min-height: 22px;
- min-width: 0;
- padding: 0 0.8em !important;
- margin-left: 0.25em;
- margin-right: 0.25em;
-}
-
-button.translate-infobar-element .button-text {
- margin-left: 0 !important;
- margin-right: 0 !important;
-}
-
-label.translate-infobar-element {
- padding-top: 2px;
-}
-
-button.translate-infobar-element:hover {
- background: #f0f0f0;
- box-shadow: 0 1px 0 hsla(0,0%,100%,.1) inset, 0 0 0 1px hsla(0,0%,100%,.05) inset, 0 1px 0 hsla(210,54%,20%,.01), 0 0 4px hsla(206,100%,20%,.1);
-}
-
-button.translate-infobar-element:active {
- box-shadow: 0 1px 1px hsla(211,79%,6%,.1) inset, 0 0 1px hsla(211,79%,6%,.2) inset;
- transition-duration: 0ms;
-}
-
-button.translate-infobar-element[anonid="translate"] {
- color: #ffffff;
- background: linear-gradient(#4cb1ff, #1793e5);
- box-shadow: 0 1px 0 hsla(0,0%,100%,.2) inset, 0 0 0 1px hsla(0,0%,100%,.1) inset, 0 1px 0 hsla(210,54%,20%,.03);
- border-color: hsla(210,54%,20%,.15) hsla(210,54%,20%,.17) hsla(210,54%,20%,.2);
-  padding: 0 1.1em !important;
-}
-
-button.translate-infobar-element[anonid="translate"]:hover {
- background-image: linear-gradient(#66bdff, #0d9eff);
- box-shadow: 0 1px 0 hsla(0,0%,100%,.2) inset, 0 0 0 1px hsla(0,0%,100%,.1) inset, 0 1px 0 hsla(210,54%,20%,.03), 0 0 4px hsla(206,100%,20%,.2);
-}
-
-button.translate-infobar-element.options-menu-button {
- padding-inline-start: 0.5em !important;
- padding-inline-end: 0em !important;
-}
-
-button.translate-infobar-element.options-menu-button > .button-box > .button-menu-dropmarker {
- display: -moz-box;
- list-style-image: url("chrome://global/skin/icons/glyph-dropdown.png");
- padding: 0 !important;
- margin: 0 !important;
-}
-
-@media (min-resolution: 2dppx) {
- button.translate-infobar-element.options-menu-button > .button-box > .button-menu-dropmarker {
- list-style-image: url("chrome://global/skin/icons/glyph-dropdown@2x.png");
- }
-
- button.translate-infobar-element.options-menu-button > .button-box > .button-menu-dropmarker > .dropmarker-icon {
- width: 8px;
- }
-}
-
-menulist.translate-infobar-element {
- text-shadow: 0 1px 1px #FEFFFE;
- border: 1px solid;
- border-color: rgba(23, 51, 78, 0.15) rgba(23, 51, 78, 0.17) rgba(23, 51, 78, 0.2);
- box-shadow: 0 1px 1px 0 #FFFFFF, inset 0 2px 2px 0 #FFFFFF;
- background-color: #F1F1F1;
- background-image: linear-gradient(#FFFFFF, rgba(255,255,255,0.1));
- color: #333333;
- padding: 0;
- min-height: 22px !important;
-}
-
-menulist.translate-infobar-element > .menulist-label-box {
- padding-top: 1px;
- padding-inline-start: 0.3em;
- margin-top: 0;
- margin-bottom: 0;
-}
-
-menulist.translate-infobar-element:hover {
- background: #f0f0f0;
- box-shadow: 0 1px 0 hsla(0,0%,100%,.1) inset, 0 0 0 1px hsla(0,0%,100%,.05) inset, 0 1px 0 hsla(210,54%,20%,.01), 0 0 4px hsla(206,100%,20%,.1);
-}
-
-menulist.translate-infobar-element[open="true"] {
- background-image: linear-gradient(rgba(255,255,255,0.1),
- rgba(255,255,255,0.6));
-}
-
-menulist.translate-infobar-element > .menulist-dropmarker {
- display: -moz-box;
- list-style-image: url("chrome://global/skin/icons/glyph-dropdown.png");
-}
-
-@media (min-resolution: 2dppx) {
- menulist.translate-infobar-element > .menulist-dropmarker {
- list-style-image: url("chrome://global/skin/icons/glyph-dropdown@2x.png");
- }
-
- menulist.translate-infobar-element > .menulist-dropmarker > .dropmarker-icon {
- width: 8px;
- }
-}
-
.popup-notification-body[popupid="addon-progress"],
.popup-notification-body[popupid="addon-install-confirmation"] {
width: 28em;
diff --git a/application/basilisk/themes/osx/preferences/preferences.css b/application/basilisk/themes/osx/preferences/preferences.css
index 4f17ec58d..a8dcadc55 100644
--- a/application/basilisk/themes/osx/preferences/preferences.css
+++ b/application/basilisk/themes/osx/preferences/preferences.css
@@ -49,11 +49,6 @@ caption {
border-bottom: 1px solid #ccc;
}
-#translationAttributionImage {
- width: 70px;
- cursor: pointer;
-}
-
#browserUseCurrent,
#browserUseBookmark,
#browserUseBlank {
diff --git a/application/basilisk/themes/shared/jar.inc.mn b/application/basilisk/themes/shared/jar.inc.mn
index 588cf5364..361edd311 100644
--- a/application/basilisk/themes/shared/jar.inc.mn
+++ b/application/basilisk/themes/shared/jar.inc.mn
@@ -108,10 +108,6 @@
skin/classic/browser/theme-switcher-icon.png (../shared/theme-switcher-icon.png)
skin/classic/browser/theme-switcher-icon@2x.png (../shared/theme-switcher-icon@2x.png)
skin/classic/browser/toolbarbutton-dropdown-arrow.png (../shared/toolbarbutton-dropdown-arrow.png)
- skin/classic/browser/translating-16.png (../shared/translation/translating-16.png)
- skin/classic/browser/translating-16@2x.png (../shared/translation/translating-16@2x.png)
- skin/classic/browser/translation-16.png (../shared/translation/translation-16.png)
- skin/classic/browser/translation-16@2x.png (../shared/translation/translation-16@2x.png)
skin/classic/browser/undoCloseTab.png (../shared/undoCloseTab.png)
skin/classic/browser/undoCloseTab@2x.png (../shared/undoCloseTab@2x.png)
skin/classic/browser/update-badge.svg (../shared/update-badge.svg)
diff --git a/application/basilisk/themes/shared/notification-icons.inc.css b/application/basilisk/themes/shared/notification-icons.inc.css
index f641e72c1..99451ba98 100644
--- a/application/basilisk/themes/shared/notification-icons.inc.css
+++ b/application/basilisk/themes/shared/notification-icons.inc.css
@@ -266,27 +266,3 @@ html|*#webRTC-previewVideo {
opacity: 1;
}
}
-
-/* TRANSLATION */
-
-.translation-icon {
- list-style-image: url(chrome://browser/skin/translation-16.png);
- -moz-image-region: rect(0px, 16px, 16px, 0px);
-}
-
-.translation-icon.in-use {
- -moz-image-region: rect(0px, 32px, 16px, 16px);
-}
-
-%ifdef XP_MACOSX
-@media (min-resolution: 1.1dppx) {
- .translation-icon {
- list-style-image: url(chrome://browser/skin/translation-16@2x.png);
- -moz-image-region: rect(0px, 32px, 32px, 0px);
- }
-
- .translation-icon.in-use {
- -moz-image-region: rect(0px, 64px, 32px, 32px);
- }
-}
-%endif
diff --git a/application/basilisk/themes/shared/translation/infobar.inc.css b/application/basilisk/themes/shared/translation/infobar.inc.css
deleted file mode 100644
index 50d1acc01..000000000
--- a/application/basilisk/themes/shared/translation/infobar.inc.css
+++ /dev/null
@@ -1,95 +0,0 @@
-%if 0
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-%endif
-notification[value="translation"] .messageImage {
- list-style-image: url(chrome://browser/skin/translation-16.png);
- -moz-image-region: rect(0, 32px, 16px, 16px);
-}
-
-@media (min-resolution: 1.25dppx) {
- notification[value="translation"] .messageImage {
- list-style-image: url(chrome://browser/skin/translation-16@2x.png);
- -moz-image-region: rect(0, 64px, 32px, 32px);
- }
-}
-
-notification[value="translation"][state="translating"] .messageImage {
- list-style-image: url(chrome://browser/skin/translating-16.png);
- -moz-image-region: auto;
-}
-
-@media (min-resolution: 1.25dppx) {
- notification[value="translation"][state="translating"] .messageImage {
- list-style-image: url(chrome://browser/skin/translating-16@2x.png);
- }
-}
-
-notification[value="translation"] hbox[anonid="details"] {
- overflow: hidden;
-}
-
-notification[value="translation"] button,
-notification[value="translation"] menulist {
- -moz-appearance: none;
- border-width: 1px;
- -moz-border-top-colors: none;
- -moz-border-right-colors: none;
- -moz-border-bottom-colors: none;
- -moz-border-left-colors: none;
- border-radius: 2px;
- min-width: 0;
- box-shadow: 0 1px rgba(255, 255, 255, 0.5), 0 1px rgba(255, 255, 255, 0.5) inset;
-}
-
-notification[value="translation"] menulist > .menulist-dropmarker {
- -moz-appearance: toolbarbutton-dropdown;
- border: none;
- background-color: transparent;
- margin: auto;
- padding: 5px 0;
-}
-
-.translation-menupopup arrowscrollbox {
- padding-bottom: 0;
-}
-
-.translation-attribution {
- cursor: pointer;
- -moz-box-align: end;
- font-size: small;
-}
-
-.translation-attribution > label {
- margin-bottom: 0;
-}
-
-.translation-attribution > image {
- width: 70px;
-}
-
-.translation-welcome-panel {
- width: 305px;
-}
-
-.translation-welcome-logo {
- height: 32px;
- width: 32px;
- list-style-image: url(chrome://browser/skin/translation-16@2x.png);
- -moz-image-region: rect(0, 64px, 32px, 32px);
-}
-
-.translation-welcome-content {
- margin-inline-start: 16px;
-}
-
-.translation-welcome-headline {
- font-size: larger;
- font-weight: bold;
-}
-
-.translation-welcome-body {
- padding: 1em 0;
- margin: 0 0;
-}
diff --git a/application/basilisk/themes/shared/translation/translating-16.png b/application/basilisk/themes/shared/translation/translating-16.png
deleted file mode 100644
index 71ca37c22..000000000
--- a/application/basilisk/themes/shared/translation/translating-16.png
+++ /dev/null
Binary files differ
diff --git a/application/basilisk/themes/shared/translation/translating-16@2x.png b/application/basilisk/themes/shared/translation/translating-16@2x.png
deleted file mode 100644
index ab6184047..000000000
--- a/application/basilisk/themes/shared/translation/translating-16@2x.png
+++ /dev/null
Binary files differ
diff --git a/application/basilisk/themes/shared/translation/translation-16.png b/application/basilisk/themes/shared/translation/translation-16.png
deleted file mode 100644
index 4b42dedcf..000000000
--- a/application/basilisk/themes/shared/translation/translation-16.png
+++ /dev/null
Binary files differ
diff --git a/application/basilisk/themes/shared/translation/translation-16@2x.png b/application/basilisk/themes/shared/translation/translation-16@2x.png
deleted file mode 100644
index 2105a3e4a..000000000
--- a/application/basilisk/themes/shared/translation/translation-16@2x.png
+++ /dev/null
Binary files differ
diff --git a/application/basilisk/themes/windows/browser.css b/application/basilisk/themes/windows/browser.css
index 2ed3e7e6a..334265e60 100644
--- a/application/basilisk/themes/windows/browser.css
+++ b/application/basilisk/themes/windows/browser.css
@@ -2113,87 +2113,6 @@ toolbarbutton.bookmark-item[dragover="true"][open="true"] {
outline: 1px dotted -moz-DialogText;
}
-/* Translation infobar */
-
-%include ../shared/translation/infobar.inc.css
-
-notification[value="translation"] {
- min-height: 40px;
-}
-
-@media (-moz-windows-default-theme) {
- notification[value="translation"],
- notification[value="translation"] button,
- notification[value="translation"] menulist {
- min-height: 30px;
- color: #545454;
- }
-
- notification[value="translation"] {
- background-color: #EEE;
- }
-
- notification[value="translation"] button,
- notification[value="translation"] menulist {
- padding-inline-end: 1ch;
- }
-
- notification[value="translation"] menulist {
- border: 1px solid #C1C1C1;
- background-color: #FFF;
- }
-
- notification[value="translation"] button {
- border: 1px solid #C1C1C1;
- background-color: #FBFBFB;
- }
-
- notification[value="translation"] button,
- notification[value="translation"] menulist,
- notification[value="translation"] menulist > .menulist-label-box {
- margin-inline-start: 1ch;
- margin-inline-end: 1ch;
- }
-
- notification[value="translation"] button:hover,
- notification[value="translation"] button:active,
- notification[value="translation"] menulist:hover,
- notification[value="translation"] menulist:active {
- background-color: #EBEBEB;
- }
-
- notification[value="translation"] button[anonid="translate"] {
- color: #FFF;
- background-color: #0095DD;
- box-shadow: none;
- border: 1px solid #006B9D;
- }
-
- notification[value="translation"] button[anonid="translate"]:hover,
- notification[value="translation"] button[anonid="translate"]:active {
- background-color: #008ACB;
- }
-
- notification[value="translation"] button[type="menu"] > .button-box > .button-menu-dropmarker,
- notification[value="translation"] menulist > .menulist-dropmarker {
- list-style-image: url("chrome://browser/skin/toolbarbutton-dropdown-arrow.png");
- }
-
- notification[value="translation"] button > .button-box,
- notification[value="translation"] button[type="menu"] > .button-box > .button-menu-dropmarker {
- padding: 0;
- margin-inline-start: 3ch;
- }
-
- notification[value="translation"] button:not([type="menu"]) > .button-box {
- margin-inline-end: 3ch;
- }
-}
-
-.translation-menupopup {
- -moz-appearance: none;
-}
-
/* Bookmarks roots menu-items */
#subscribeToPageMenuitem:not([disabled]),
#subscribeToPageMenupopup {
diff --git a/application/basilisk/themes/windows/preferences/preferences.css b/application/basilisk/themes/windows/preferences/preferences.css
index bd1ec3083..c6c063b53 100644
--- a/application/basilisk/themes/windows/preferences/preferences.css
+++ b/application/basilisk/themes/windows/preferences/preferences.css
@@ -15,13 +15,6 @@
#isNotDefaultLabel {
font-weight: bold;
}
-
-/* Content Pane */
-#translationAttributionImage {
- width: 70px;
- cursor: pointer;
-}
-
/* Modeless Window Dialogs */
.windowDialog,
.windowDialog prefpane {
diff --git a/application/palemoon/base/content/newtab/newTab.css b/application/palemoon/base/content/newtab/newTab.css
index fe745d2fd..3c7cfa102 100644
--- a/application/palemoon/base/content/newtab/newTab.css
+++ b/application/palemoon/base/content/newtab/newTab.css
@@ -301,7 +301,6 @@ input[type=button] {
color: black;
border-color: hsla(220,54%,20%,.15) hsla(220,54%,20%,.17) hsla(220,54%,20%,.2);
border-radius: 0 2px 2px 0;
- border-inline-start: 1px solid transparent;
box-shadow: 0 0 2px hsla(0,0%,100%,.5) inset,
0 1px 0 hsla(0,0%,100%,.2);
cursor: pointer;
diff --git a/application/palemoon/base/content/tabbrowser.xml b/application/palemoon/base/content/tabbrowser.xml
index 93818e290..dc6cb0a9d 100644
--- a/application/palemoon/base/content/tabbrowser.xml
+++ b/application/palemoon/base/content/tabbrowser.xml
@@ -2950,19 +2950,6 @@
onget="return this.mCurrentBrowser.docShell"
readonly="true"/>
- <property name="messageManager"
- readonly="true">
- <getter>
- <![CDATA[
- let frameLoader = this.mCurrentBrowser.frameLoader;
- if (!frameLoader) {
- return null;
- }
- return frameLoader.messageManager;
- ]]>
- </getter>
- </property>
-
<property name="webNavigation"
onget="return this.mCurrentBrowser.webNavigation"
readonly="true"/>
@@ -4296,17 +4283,76 @@
event.originalTarget.localName != "box")
return;
- // See hack note in the tabbrowser-close-tab-button binding
+ // See comments in the "mousedown" and "click" event handlers of the
+ // tabbrowser-tabs binding.
if (!this._blockDblClick)
BrowserOpenTab();
event.preventDefault();
]]></handler>
- <handler event="click"><![CDATA[
- if (event.button != 1)
- return;
+ <!-- Consider that the in-tab close button is only shown on the active
+ tab. When clicking on an inactive tab at the position where the
+ close button will appear during the click, no "click" event will be
+ dispatched, because the mousedown and mouseup events don't have the
+ same event target. For that reason use "mousedown" instead of "click"
+ to implement in-tab close button behavior. (Pale Moon UXP issue #775)
+ -->
+ <handler event="mousedown" button="0" phase="capturing"><![CDATA[
+ /* The only sequence in which a second click event (i.e. dblclick)
+ * can be dispatched on an in-tab close button is when it is shown
+ * after the first click (i.e. the first click event was dispatched
+ * on the tab). This happens when we show the close button only on
+ * the active tab. (bug 352021)
+ * The only sequence in which a third click event can be dispatched
+ * on an in-tab close button is when the tab was opened with a
+ * double click on the tabbar. (bug 378344)
+ * In both cases, it is most likely that the close button area has
+ * been accidentally clicked, therefore we do not close the tab.
+ *
+ * We don't want to ignore processing of more than one click event,
+ * though, since the user might actually be repeatedly clicking to
+ * close many tabs at once.
+ *
+ * Also prevent errant doubleclick on the close button from opening
+ * a new tab (bug 343628):
+ * Since we're removing the event target, if the user double-clicks
+ * the button, the dblclick event will be dispatched with the tabbar
+ * as its event target (and explicit/originalTarget), which treats
+ * that as a mouse gesture for opening a new tab.
+ * In this context, we're manually blocking the dblclick event.
+ */
+
+ // Reset flags at the beginning of a series of clicks:
+ if (event.detail == 1) {
+ this.flagClickOnCloseButton = false;
+ this.flagActivateTabOrClickOnTabbar = false;
+ }
+
+ this.blockCloseButtonOnDblclick = this.flagActivateTabOrClickOnTabbar;
+ this._blockDblClick = this.flagClickOnCloseButton;
+ // Set flags:
+ let eventTargetIsCloseButton =
+ event.originalTarget.classList.contains("tab-close-button");
+ this.flagClickOnCloseButton = eventTargetIsCloseButton;
+ this.flagActivateTabOrClickOnTabbar =
+ ((!eventTargetIsCloseButton && event.detail == 1) ||
+ event.originalTarget.localName == "box");
+ ]]></handler>
+
+ <handler event="click" button="0"><![CDATA[
+ // See comment in the "mousedown" event handler of the
+ // tabbrowser-tabs binding.
+ if (event.originalTarget.classList.contains("tab-close-button") &&
+ !this.blockCloseButtonOnDblclick) {
+ gBrowser.removeTab(document.getBindingParent(event.originalTarget),
+ {animate: true, byMouse: true,});
+ this._blockDblClick = true;
+ }
+ ]]></handler>
+
+ <handler event="click" button="1"><![CDATA[
if (event.target.localName == "tab") {
if (this.childNodes.length > 1 || !this._closeWindowWithLastTab)
this.tabbrowser.removeTab(event.target, {animate: true, byMouse: true});
@@ -4679,63 +4725,6 @@
<binding id="tabbrowser-close-tab-button"
extends="chrome://global/content/bindings/toolbarbutton.xml#toolbarbutton-image">
<handlers>
- <handler event="click" button="0"><![CDATA[
- var bindingParent = document.getBindingParent(this);
- var tabContainer = bindingParent.parentNode;
- /* The only sequence in which a second click event (i.e. dblclik)
- * can be dispatched on an in-tab close button is when it is shown
- * after the first click (i.e. the first click event was dispatched
- * on the tab). This happens when we show the close button only on
- * the active tab. (bug 352021)
- * The only sequence in which a third click event can be dispatched
- * on an in-tab close button is when the tab was opened with a
- * double click on the tabbar. (bug 378344)
- * In both cases, it is most likely that the close button area has
- * been accidentally clicked, therefore we do not close the tab.
- *
- * We don't want to ignore processing of more than one click event,
- * though, since the user might actually be repeatedly clicking to
- * close many tabs at once.
- */
- if (event.detail > 1 && !this._ignoredClick) {
- this._ignoredClick = true;
- return;
- }
-
- // Reset the "ignored click" flag
- this._ignoredClick = false;
-
- tabContainer.tabbrowser.removeTab(bindingParent, {animate: true, byMouse: true});
- tabContainer._blockDblClick = true;
-
- /* XXXmano hack (see bug 343628):
- * Since we're removing the event target, if the user
- * double-clicks this button, the dblclick event will be dispatched
- * with the tabbar as its event target (and explicit/originalTarget),
- * which treats that as a mouse gesture for opening a new tab.
- * In this context, we're manually blocking the dblclick event
- * (see dblclick handler).
- */
- var clickedOnce = false;
- function enableDblClick(event) {
- var target = event.originalTarget;
- if (target.className == 'tab-close-button')
- target._ignoredClick = true;
- if (!clickedOnce) {
- clickedOnce = true;
- return;
- }
- tabContainer._blockDblClick = false;
- tabContainer.removeEventListener("click", enableDblClick, true);
- }
- tabContainer.addEventListener("click", enableDblClick, true);
- ]]></handler>
-
- <handler event="dblclick" button="0" phase="capturing">
- // for the one-close-button case
- event.stopPropagation();
- </handler>
-
<handler event="dragstart">
event.stopPropagation();
</handler>
diff --git a/application/palemoon/branding/official/branding.nsi b/application/palemoon/branding/official/branding.nsi
index 2b6dbe7bd..3bceda900 100644
--- a/application/palemoon/branding/official/branding.nsi
+++ b/application/palemoon/branding/official/branding.nsi
@@ -11,39 +11,6 @@
!define BrandFullNameInternal "Pale Moon"
!define CompanyName "Moonchild Productions"
!define URLInfoAbout "http://www.palemoon.org/"
-!define URLUpdateInfo "http://www.palemoon.org/releasenotes-ng.shtml"
-
-; The OFFICIAL define is a workaround to support different urls for Release and
-; Beta since they share the same branding when building with other branches that
-; set the update channel to beta.
-!define OFFICIAL
-!define URLStubDownload ""
-!define URLManualDownload "http://www.palemoon.org/download-ng.shtml"
-!define Channel "release"
-
-# The installer's certificate name and issuer expected by the stub installer
-# !define CertNameDownload "Mozilla Corporation"
-# !define CertIssuerDownload "Thawte Code Signing CA - G2"
-
-# Dialog units are used so the UI displays correctly with the system's DPI
-# settings.
-# The dialog units for the bitmap's dimensions should match exactly with the
-# bitmap's width and height in pixels.
-!define APPNAME_BMP_WIDTH_DU "134u"
-!define APPNAME_BMP_HEIGHT_DU "36u"
-!define INTRO_BLURB_WIDTH_DU "258u"
-!define INTRO_BLURB_EDGE_DU "170u"
-!define INTRO_BLURB_LTR_TOP_DU "20u"
-!define INTRO_BLURB_RTL_TOP_DU "12u"
-
-# UI Colors that can be customized for each channel
-!define FOOTER_CONTROL_TEXT_COLOR_NORMAL 0x000033
-!define FOOTER_CONTROL_TEXT_COLOR_FADED 0x666699
-!define FOOTER_BKGRD_COLOR 0xFFFFFF
-!define INTRO_BLURB_TEXT_COLOR 0x666699
-!define OPTIONS_TEXT_COLOR_NORMAL 0x000000
-!define OPTIONS_TEXT_COLOR_FADED 0x666699
-!define OPTIONS_BKGRD_COLOR 0xF0F0F0
-!define INSTALL_BLURB_TEXT_COLOR 0x666699
-!define INSTALL_PROGRESS_TEXT_COLOR_NORMAL 0x666699
-!define INSTALL_PROGRESS_TEXT_COLOR_FADED 0x9999C0
+!define URLUpdateInfo "http://www.palemoon.org/releasenotes.shtml"
+!define HelpLink "http://www.palemoon.org/troubleshooting.shtml"
+!define URLSystemRequirements "http://www.palemoon.org/download.shtml"
diff --git a/application/palemoon/branding/unofficial/branding.nsi b/application/palemoon/branding/unofficial/branding.nsi
index cf1642eaf..62ed242a2 100644
--- a/application/palemoon/branding/unofficial/branding.nsi
+++ b/application/palemoon/branding/unofficial/branding.nsi
@@ -12,29 +12,5 @@
!define CompanyName "Moonchild Productions"
!define URLInfoAbout "http://www.palemoon.org"
!define URLUpdateInfo "http://www.palemoon.org"
-
-!define URLManualDownload "http://www.palemoon.org/download-ng.shtml"
-!define Channel "unofficial"
-
-# Dialog units are used so the UI displays correctly with the system's DPI
-# settings.
-# The dialog units for the bitmap's dimensions should match exactly with the
-# bitmap's width and height in pixels.
-!define APPNAME_BMP_WIDTH_DU 159u
-!define APPNAME_BMP_HEIGHT_DU 26u
-!define INTRO_BLURB_WIDTH_DU "230u"
-!define INTRO_BLURB_EDGE_DU "198u"
-!define INTRO_BLURB_LTR_TOP_DU "16u"
-!define INTRO_BLURB_RTL_TOP_DU "11u"
-
-# UI Colors that can be customized for each channel
-!define FOOTER_CONTROL_TEXT_COLOR_NORMAL 0x000000
-!define FOOTER_CONTROL_TEXT_COLOR_FADED 0x999999
-!define FOOTER_BKGRD_COLOR 0xFFFFFF
-!define INTRO_BLURB_TEXT_COLOR 0xFFFFFF
-!define OPTIONS_TEXT_COLOR_NORMAL 0xFFFFFF
-!define OPTIONS_TEXT_COLOR_FADED 0xA1AAB3
-!define OPTIONS_BKGRD_COLOR 0x0F1B26
-!define INSTALL_BLURB_TEXT_COLOR 0xFFFFFF
-!define INSTALL_PROGRESS_TEXT_COLOR_NORMAL 0xFFFFFF
-!define INSTALL_PROGRESS_TEXT_COLOR_FADED 0xA1AAB3
+!define HelpLink "http://www.palemoon.org"
+!define URLSystemRequirements "http://www.palemoon.org/download.shtml"
diff --git a/application/palemoon/branding/unstable/branding.nsi b/application/palemoon/branding/unstable/branding.nsi
index 4a9199b22..535cfde33 100644
--- a/application/palemoon/branding/unstable/branding.nsi
+++ b/application/palemoon/branding/unstable/branding.nsi
@@ -11,36 +11,6 @@
!define BrandFullNameInternal "Pale Moon"
!define CompanyName "Moonchild Productions"
!define URLInfoAbout "http://www.palemoon.org/"
-!define URLUpdateInfo "http://www.palemoon.org/unstable.shtml"
-
-; The OFFICIAL define is a workaround to support different urls for Release and
-; Beta since they share the same branding when building with other branches that
-; set the update channel to beta.
-!define OFFICIAL
-!define URLStubDownload ""
-!define URLManualDownload "http://www.palemoon.org/unstable.shtml"
-!define Channel "unstable"
-
-
-# Dialog units are used so the UI displays correctly with the system's DPI
-# settings.
-# The dialog units for the bitmap's dimensions should match exactly with the
-# bitmap's width and height in pixels.
-!define APPNAME_BMP_WIDTH_DU "134u"
-!define APPNAME_BMP_HEIGHT_DU "36u"
-!define INTRO_BLURB_WIDTH_DU "258u"
-!define INTRO_BLURB_EDGE_DU "170u"
-!define INTRO_BLURB_LTR_TOP_DU "20u"
-!define INTRO_BLURB_RTL_TOP_DU "12u"
-
-# UI Colors that can be customized for each channel
-!define FOOTER_CONTROL_TEXT_COLOR_NORMAL 0x000033
-!define FOOTER_CONTROL_TEXT_COLOR_FADED 0x666699
-!define FOOTER_BKGRD_COLOR 0xFFFFFF
-!define INTRO_BLURB_TEXT_COLOR 0x666699
-!define OPTIONS_TEXT_COLOR_NORMAL 0x000000
-!define OPTIONS_TEXT_COLOR_FADED 0x666699
-!define OPTIONS_BKGRD_COLOR 0xF0F0F0
-!define INSTALL_BLURB_TEXT_COLOR 0x666699
-!define INSTALL_PROGRESS_TEXT_COLOR_NORMAL 0x666699
-!define INSTALL_PROGRESS_TEXT_COLOR_FADED 0x9999C0
+!define URLUpdateInfo "http://www.palemoon.org/unstable/"
+!define HelpLink "http://www.palemoon.org/unstable/"
+!define URLSystemRequirements "http://www.palemoon.org/download.shtml"
diff --git a/application/palemoon/components/nsBrowserGlue.js b/application/palemoon/components/nsBrowserGlue.js
index 720d1165c..f0a7aa22a 100644
--- a/application/palemoon/components/nsBrowserGlue.js
+++ b/application/palemoon/components/nsBrowserGlue.js
@@ -152,6 +152,9 @@ BrowserGlue.prototype = {
// nsIObserver implementation
observe: function BG_observe(subject, topic, data) {
switch (topic) {
+ case "notifications-open-settings":
+ this._openPermissions(subject);
+ break;
case "prefservice:after-app-defaults":
this._onAppDefaults();
break;
@@ -322,6 +325,7 @@ BrowserGlue.prototype = {
// initialization (called on application startup)
_init: function BG__init() {
let os = Services.obs;
+ os.addObserver(this, "notifications-open-settings", false);
os.addObserver(this, "prefservice:after-app-defaults", false);
os.addObserver(this, "final-ui-startup", false);
os.addObserver(this, "browser-delayed-startup-finished", false);
@@ -354,6 +358,7 @@ BrowserGlue.prototype = {
// cleanup (called on application shutdown)
_dispose: function BG__dispose() {
let os = Services.obs;
+ os.removeObserver(this, "notifications-open-settings");
os.removeObserver(this, "prefservice:after-app-defaults");
os.removeObserver(this, "final-ui-startup");
os.removeObserver(this, "sessionstore-windows-restored");
@@ -1484,6 +1489,16 @@ BrowserGlue.prototype = {
}
},
+ _openPermissions: function(aPrincipal) {
+ var win = this.getMostRecentBrowserWindow();
+ var url = "about:permissions";
+ try {
+ url = url + "?filter=" + aPrincipal.URI.host;
+ }
+ catch (e) {}
+ win.openUILinkIn(url, "tab");
+ },
+
_hasSystemAlertsService: function() {
try {
return !!Cc["@mozilla.org/system-alerts-service;1"].getService(
diff --git a/application/palemoon/components/permissions/aboutPermissions.js b/application/palemoon/components/permissions/aboutPermissions.js
index 6a02daa29..421b65a0e 100644
--- a/application/palemoon/components/permissions/aboutPermissions.js
+++ b/application/palemoon/components/permissions/aboutPermissions.js
@@ -516,6 +516,14 @@ var AboutPermissions = {
Services.obs.notifyObservers(null, "browser-permissions-preinit", null);
this._initPart2();
+
+ // Process about:permissions?filter=<string>
+ // About URIs don't support query params, so do this manually
+ var loc = document.location.href;
+ var matches = /[?&]filter\=([^&]+)/i.exec(loc);
+ if (matches) {
+ this.sitesFilter.value = decodeURIComponent(matches[1]);
+ }
},
sitesReload: function() {
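
The two hunks above cooperate: nsBrowserGlue.js appends a filter to the about:permissions URL, and the page extracts it by hand because about: URIs have no real query-string support. A small sketch of that round trip (helper names are illustrative):

// Build the URL the "notifications-open-settings" observer opens; principals
// without a host (e.g. file: or about:) throw, so fall back to the bare page.
function permissionsUrlFor(principal) {
  let url = "about:permissions";
  try {
    url += "?filter=" + principal.URI.host;
  } catch (e) {
    // keep the unfiltered URL
  }
  return url;
}

// Recover the filter on the about:permissions side by scanning location.href,
// mirroring the manual regex parse added above.
function filterFromLocation(href) {
  const matches = /[?&]filter=([^&]+)/i.exec(href);
  return matches ? decodeURIComponent(matches[1]) : "";
}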
diff --git a/application/palemoon/components/statusbar/Status.jsm b/application/palemoon/components/statusbar/Status.jsm
index d888c7d94..19e12ddfd 100644
--- a/application/palemoon/components/statusbar/Status.jsm
+++ b/application/palemoon/components/statusbar/Status.jsm
@@ -35,7 +35,7 @@ S4EStatusService.prototype =
_defaultStatus: { val: "", type: "" },
_isFullScreen: false,
- _isFullScreenVideo: false,
+ _isVideo: false,
_statusText: { val: "", type: "" },
_noUpdate: false,
@@ -222,18 +222,10 @@ S4EStatusService.prototype =
}
},
- updateFullScreen: function()
+ setFullScreenState: function(isFullScreen, isVideo)
{
- this._isFullScreen = this._window.fullScreen;
- this._isFullScreenVideo = false;
- if(this._isFullScreen)
- {
- let fsEl = this._window.content.document.mozFullScreenElement;
- if(fsEl && (fsEl.nodeName == "VIDEO" || fsEl.getElementsByTagName("VIDEO").length > 0))
- {
- this._isFullScreenVideo = true;
- }
- }
+ this._isFullScreen = isFullScreen;
+ this._isVideo = isFullScreen && isVideo;
this.clearStatusField();
this.updateStatusField(true);
@@ -305,7 +297,7 @@ S4EStatusService.prototype =
let label = null;
- if(this._isFullScreen && this._service.advancedStatusDetectFullScreen)
+ if(this._isFullScreen)
{
switch(location)
{
@@ -330,7 +322,7 @@ S4EStatusService.prototype =
break;
case 3: // Popup
default:
- if(this._isFullScreenVideo && this._service.advancedStatusDetectVideo)
+ if(this._isVideo)
{
return;
}
diff --git a/application/palemoon/components/statusbar/Status4Evar.jsm b/application/palemoon/components/statusbar/Status4Evar.jsm
index 055306a88..6400f2e2a 100644
--- a/application/palemoon/components/statusbar/Status4Evar.jsm
+++ b/application/palemoon/components/statusbar/Status4Evar.jsm
@@ -31,7 +31,7 @@ function Status4Evar(window, gBrowser, toolbox)
this.statusService = new S4EStatusService(this._window, s4e_service, this.getters);
this.progressMeter = new S4EProgressService(gBrowser, s4e_service, this.getters, this.statusService);
this.downloadStatus = new S4EDownloadService(this._window, gBrowser, s4e_service, this.getters);
- this.sizeModeService = new SizeModeService(this._window, this);
+ this.sizeModeService = new SizeModeService(this._window, gBrowser, this);
this._window.addEventListener("unload", this, false);
}
@@ -232,20 +232,31 @@ S4EWindowGetters.prototype =
}
};
-function SizeModeService(window, s4e)
+function SizeModeService(window, gBrowser, s4e)
{
this._window = window;
+ this._gBrowser = gBrowser;
this._s4e = s4e;
+ this._mm = this._window.messageManager;
this.lastFullScreen = this._window.fullScreen;
this.lastwindowState = this._window.windowState;
+
+ if(s4e_service.advancedStatusDetectFullScreen)
+ {
+ this._mm.addMessageListener("status4evar@caligonstudios.com:video-detect-answer", this)
+ this._mm.loadFrameScript("resource:///modules/statusbar/content-thunk.js", true);
+ }
+
this._window.addEventListener("sizemodechange", this, false);
}
SizeModeService.prototype =
{
_window: null,
+ _gBrowser: null,
_s4e: null,
+ _mm: null,
lastFullScreen: null,
lastwindowState: null,
@@ -254,7 +265,13 @@ SizeModeService.prototype =
{
this._window.removeEventListener("sizemodechange", this, false);
- ["_window", "_s4e"].forEach(function(prop)
+ if(s4e_service.advancedStatusDetectFullScreen)
+ {
+ this._mm.removeDelayedFrameScript("resource:///modules/statusbar/content-thunk.js");
+ this._mm.removeMessageListener("status4evar@caligonstudios.com:video-detect-answer", this);
+ }
+
+ ["_window", "_gBrowser", "_s4e", "_mm"].forEach(function(prop)
{
delete this[prop];
}, this);
@@ -262,10 +279,18 @@ SizeModeService.prototype =
handleEvent: function(e)
{
- if(this._window.fullScreen != this.lastFullScreen)
+ if(this._window.fullScreen != this.lastFullScreen && s4e_service.advancedStatusDetectFullScreen)
{
this.lastFullScreen = this._window.fullScreen;
- this._s4e.statusService.updateFullScreen();
+
+ if(this.lastFullScreen && s4e_service.advancedStatusDetectVideo)
+ {
+ this._gBrowser.selectedBrowser.messageManager.sendAsyncMessage("status4evar@caligonstudios.com:video-detect");
+ }
+ else
+ {
+ this._s4e.statusService.setFullScreenState(this.lastFullScreen, false);
+ }
}
if(this._window.windowState != this.lastwindowState)
@@ -275,5 +300,13 @@ SizeModeService.prototype =
}
},
- QueryInterface: XPCOMUtils.generateQI([ CI.nsIDOMEventListener ])
+ receiveMessage: function(message)
+ {
+ if(message.name == "status4evar@caligonstudios.com:video-detect-answer")
+ {
+ this._s4e.statusService.setFullScreenState(this.lastFullScreen, message.data.isVideo);
+ }
+ },
+
+ QueryInterface: XPCOMUtils.generateQI([ CI.nsIDOMEventListener, CI.nsIMessageListener ])
};
diff --git a/application/palemoon/components/statusbar/content-thunk.js b/application/palemoon/components/statusbar/content-thunk.js
new file mode 100644
index 000000000..fe1fbabad
--- /dev/null
+++ b/application/palemoon/components/statusbar/content-thunk.js
@@ -0,0 +1,23 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+function handleVideoDetect(message)
+{
+ let isVideo = false;
+
+ let fsEl = content.document.mozFullScreenElement;
+ if(fsEl)
+ {
+ isVideo = (
+ fsEl.nodeName == "VIDEO"
+ || (fsEl.nodeName == "IFRAME" && fsEl.contentDocument && fsEl.contentDocument.getElementsByTagName("VIDEO").length > 0)
+ || fsEl.getElementsByTagName("VIDEO").length > 0
+ );
+ }
+
+ sendAsyncMessage("status4evar@caligonstudios.com:video-detect-answer", {isVideo: isVideo});
+}
+
+addMessageListener("status4evar@caligonstudios.com:video-detect", handleVideoDetect);
+
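
The Status4Evar change replaces direct inspection of content with a frame-script round trip, so the fullscreen-video check keeps working when content runs out of process. A condensed sketch of the same pattern (the message names match the patch; the surrounding wiring is illustrative):

// Parent side: ask the selected browser whether fullscreen is showing video.
const VIDEO_QUERY  = "status4evar@caligonstudios.com:video-detect";
const VIDEO_ANSWER = "status4evar@caligonstudios.com:video-detect-answer";

function watchFullScreenVideo(window, gBrowser, onAnswer) {
  const mm = window.messageManager;
  mm.addMessageListener(VIDEO_ANSWER, msg => onAnswer(msg.data.isVideo));
  mm.loadFrameScript("resource:///modules/statusbar/content-thunk.js", true);

  window.addEventListener("sizemodechange", () => {
    if (window.fullScreen) {
      // content-thunk.js answers asynchronously via VIDEO_ANSWER.
      gBrowser.selectedBrowser.messageManager.sendAsyncMessage(VIDEO_QUERY);
    } else {
      onAnswer(false);
    }
  });
}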
diff --git a/application/palemoon/components/statusbar/moz.build b/application/palemoon/components/statusbar/moz.build
index ba8cfef86..0f7f597a5 100644
--- a/application/palemoon/components/statusbar/moz.build
+++ b/application/palemoon/components/statusbar/moz.build
@@ -16,6 +16,7 @@ EXTRA_COMPONENTS += [
]
EXTRA_JS_MODULES.statusbar = [
+ 'content-thunk.js',
'Downloads.jsm',
'Progress.jsm',
'Status.jsm',
diff --git a/application/palemoon/installer/windows/Makefile.in b/application/palemoon/installer/windows/Makefile.in
index 7ba70d912..9b0f697c8 100644
--- a/application/palemoon/installer/windows/Makefile.in
+++ b/application/palemoon/installer/windows/Makefile.in
@@ -79,10 +79,6 @@ $(CONFIG_DIR)/setup.exe::
$(PYTHON) $(topsrcdir)/toolkit/mozapps/installer/windows/nsis/preprocess-locale.py \
--preprocess-locale $(topsrcdir) \
$(PPL_LOCALE_ARGS) $(AB_CD) $(CONFIG_DIR)
- $(PYTHON) $(topsrcdir)/toolkit/mozapps/installer/windows/nsis/preprocess-locale.py \
- --preprocess-single-file $(topsrcdir) \
- $(PPL_LOCALE_ARGS) $(CONFIG_DIR) \
- nsisstrings.properties nsisstrings.nlf
GARBARGE_DIRS += instgen
diff --git a/application/palemoon/installer/windows/nsis/defines.nsi.in b/application/palemoon/installer/windows/nsis/defines.nsi.in
index 97422c4f6..edef802a9 100644
--- a/application/palemoon/installer/windows/nsis/defines.nsi.in
+++ b/application/palemoon/installer/windows/nsis/defines.nsi.in
@@ -3,23 +3,6 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-# Defining FunnelcakeVersion will append the value of StubURLVersionAppend to
-# StubURLVersion, append the value of URLManualDownloadAppend to
-# URLManualDownload, and append the value of URLStubDownloadAppend to
-# URLStubDownload. The value of FunnelcakeVersion should not be defined when it
-# is not used and when it is defined its value should never be empty.
-# !define FunnelcakeVersion "999"
-
-!ifdef FunnelcakeVersion
-!define URLManualDownloadAppend "&f=${FunnelcakeVersion}"
-!define URLStubDownloadAppend "-f${FunnelcakeVersion}"
-!define StubURLVersionAppend "-${FunnelcakeVersion}"
-!else
-!define URLManualDownloadAppend ""
-!define URLStubDownloadAppend ""
-!define StubURLVersionAppend ""
-!endif
-
# These defines should match application.ini settings
!define AppName "Pale Moon"
!define AppVersion "@APP_VERSION@"
@@ -84,13 +67,3 @@ VIAddVersionKey "FileVersion" "${AppVersion}"
VIAddVersionKey "ProductVersion" "${AppVersion}"
# Comments is not used but left below commented out for future reference
# VIAddVersionKey "Comments" "Comments"
-
-# Control positions in Dialog Units so they are placed correctly with
-# non-default DPI settings
-!define OPTIONS_ITEM_EDGE_DU 90u
-!define OPTIONS_ITEM_WIDTH_DU 356u
-!define OPTIONS_SUBITEM_EDGE_DU 119u
-!define OPTIONS_SUBITEM_WIDTH_DU 327u
-!define INSTALL_BLURB_TOP_DU 78u
-!define APPNAME_BMP_EDGE_DU 19u
-!define APPNAME_BMP_TOP_DU 12u
diff --git a/application/palemoon/locales/en-US/chrome/browser/permissions/aboutPermissions.dtd b/application/palemoon/locales/en-US/chrome/browser/permissions/aboutPermissions.dtd
index 2030d4f59..5b220a7f0 100644
--- a/application/palemoon/locales/en-US/chrome/browser/permissions/aboutPermissions.dtd
+++ b/application/palemoon/locales/en-US/chrome/browser/permissions/aboutPermissions.dtd
@@ -32,7 +32,7 @@
<!ENTITY image.label "Load Images">
-<!ENTITY cookie.label "Set Cookies">
+<!ENTITY cookie.label "Store Cookies and Site Data">
<!ENTITY cookie.remove "Remove Cookies">
<!ENTITY cookie.manage "Manage Cookies…">
<!ENTITY cookie.removeAll "Remove All Cookies">
diff --git a/application/palemoon/locales/en-US/chrome/browser/preferences/preferences.properties b/application/palemoon/locales/en-US/chrome/browser/preferences/preferences.properties
index b262eebf5..34f167586 100644
--- a/application/palemoon/locales/en-US/chrome/browser/preferences/preferences.properties
+++ b/application/palemoon/locales/en-US/chrome/browser/preferences/preferences.properties
@@ -8,8 +8,8 @@ labelDefaultFont=Default (%S)
#### Permissions Manager
-cookiepermissionstext=You can specify which websites are always or never allowed to use cookies. Type the exact address of the site you want to manage and then click Block, Allow for Session, or Allow.
-cookiepermissionstitle=Exceptions - Cookies
+cookiepermissionstext=You can specify which websites are always or never allowed to use cookies or store site data. Type the exact address of the site you want to manage and then click Block, Allow for Session, or Allow.
+cookiepermissionstitle=Exceptions - Cookies and Site Data
addonspermissionstext=You can specify which websites are allowed to install add-ons. Type the exact address of the site you want to allow and then click Allow.
addons_permissions_title=Allowed Sites - Add-ons Installation
popuppermissionstext=You can specify which websites are allowed to open pop-up windows. Type the exact address of the site you want to allow and then click Allow.
diff --git a/application/palemoon/locales/en-US/chrome/browser/preferences/privacy.dtd b/application/palemoon/locales/en-US/chrome/browser/preferences/privacy.dtd
index ef3303b94..37f8b78c0 100644
--- a/application/palemoon/locales/en-US/chrome/browser/preferences/privacy.dtd
+++ b/application/palemoon/locales/en-US/chrome/browser/preferences/privacy.dtd
@@ -20,10 +20,10 @@
<!ENTITY locbar.openpage.label "Open tabs">
<!ENTITY locbar.openpage.accesskey "O">
-<!ENTITY acceptCookies.label "Accept cookies from sites">
+<!ENTITY acceptCookies.label "Allow sites to store cookies and data">
<!ENTITY acceptCookies.accesskey "A">
-<!ENTITY acceptThirdParty.pre.label "Accept third-party cookies:">
+<!ENTITY acceptThirdParty.pre.label "Accept third-party cookies and site data:">
<!ENTITY acceptThirdParty.pre.accesskey "c">
<!ENTITY acceptThirdParty.always.label "Always">
<!ENTITY acceptThirdParty.never.label "Never">
@@ -48,7 +48,7 @@
<!ENTITY historyHeader.custom.label "Use custom settings for history">
<!ENTITY historyHeader.post.label "">
-<!ENTITY rememberDescription.label "&brandShortName; will remember your browsing, download, form and search history, and keep cookies from websites you visit.">
+<!ENTITY rememberDescription.label "&brandShortName; will remember your browsing, download, form and search history, and keep cookies and site data from websites you visit.">
<!-- LOCALIZATION NOTE (rememberActions.pre.label): include a trailing space as needed -->
<!-- LOCALIZATION NOTE (rememberActions.middle.label): include a starting and trailing space as needed -->
diff --git a/application/palemoon/locales/en-US/installer/nsisstrings.properties b/application/palemoon/locales/en-US/installer/nsisstrings.properties
deleted file mode 100644
index fc2898860..000000000
--- a/application/palemoon/locales/en-US/installer/nsisstrings.properties
+++ /dev/null
@@ -1,67 +0,0 @@
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-# LOCALIZATION NOTE:
-
-# This file must be saved as UTF8
-
-# Accesskeys are defined by prefixing the letter that is to be used for the
-# accesskey with an ampersand (e.g. &).
-
-# Do not replace $BrandShortName, $BrandFullName, or $BrandFullNameDA with a
-# custom string and always use the same one as used by the en-US files.
-# $BrandFullNameDA allows the string to contain an ampersand (e.g. DA stands
-# for double ampersand) and prevents the letter following the ampersand from
-# being used as an accesskey.
-
-# You can use \n to create a newline in the string but only when the string
-# from en-US contains a \n.
-
-WIN_CAPTION=$BrandShortName Setup
-
-INTRO_BLURB1=Thanks for choosing $BrandFullName, the browser that chooses you above everything else.
-INSTALL_BLURB1=You're about to enjoy the very latest in speed, flexibility and security so you're always in control.
-INSTALL_BLURB2=That's because $BrandShortName is made by a non-profit to make browsing and the Web better for you.
-INSTALL_BLURB3=You're also joining a global community of users, contributors and developers working to make the best browser in the world.
-
-WARN_MIN_SUPPORTED_OSVER_MSG=Sorry, $BrandShortName can't be installed. This version of $BrandShortName requires ${MinSupportedVer} or newer.
-WARN_MIN_SUPPORTED_CPU_MSG=Sorry, $BrandShortName can't be installed. This version of $BrandShortName requires a processor with ${MinSupportedCPU} support.
-WARN_MIN_SUPPORTED_OSVER_CPU_MSG=Sorry, $BrandShortName can't be installed. This version of $BrandShortName requires ${MinSupportedVer} or newer and a processor with ${MinSupportedCPU} support.
-WARN_WRITE_ACCESS=You don't have access to write to the installation directory.\n\nClick OK to select a different directory.
-WARN_DISK_SPACE=You don't have sufficient disk space to install to this location.\n\nClick OK to select a different location.
-WARN_ROOT_INSTALL=Unable to install to the root of your disk.\n\nClick OK to select a different location.
-WARN_MANUALLY_CLOSE_APP_LAUNCH=$BrandShortName is already running.\n\nPlease close $BrandShortName prior to launching the version you have just installed.
-
-ERROR_DOWNLOAD=Your download was interrupted.\n\nPlease click the OK button to continue.
-
-INSTALL_BUTTON=&Install
-UPGRADE_BUTTON=&Upgrade
-CANCEL_BUTTON=Cancel
-OPTIONS_BUTTON=&Options
-
-MAKE_DEFAULT=&Make $BrandShortName my default browser
-CREATE_SHORTCUTS=Create Shortcuts for $BrandShortName:
-ADD_SC_TASKBAR=On my &Task bar
-ADD_SC_QUICKLAUNCHBAR=On my &Quick Launch bar
-ADD_CheckboxShortcutInStartMenu=In my &Start Menu Programs Folder
-ADD_CheckboxShortcutOnDesktop=On my &Desktop
-SPACE_REQUIRED=Space Required:
-SPACE_AVAILABLE=Space Available:
-ONE_MOMENT_INSTALL=One moment, $BrandShortName will launch as soon as the install is complete…
-ONE_MOMENT_UPGRADE=One moment, $BrandShortName will launch as soon as the upgrade is complete…
-INSTALL_MAINT_SERVICE=&Install the $BrandShortName background update service
-SEND_PING=S&end information about this installation to Mozilla
-BROWSE_BUTTON=B&rowse…
-DEST_FOLDER=Destination Folder
-
-DOWNLOADING_LABEL=Downloading $BrandShortName…
-INSTALLING_LABEL=Installing $BrandShortName…
-UPGRADING_LABEL=Upgrading $BrandShortName…
-
-SELECT_FOLDER_TEXT=Select the folder to install $BrandShortName in.
-
-BYTE=B
-KILO=K
-MEGA=M
-GIGA=G
diff --git a/build/moz.configure/old.configure b/build/moz.configure/old.configure
index 20bde1eee..e6eaa8228 100644
--- a/build/moz.configure/old.configure
+++ b/build/moz.configure/old.configure
@@ -243,6 +243,7 @@ def old_configure_options(*options):
'--enable-universalchardet',
'--enable-updater',
'--enable-url-classifier',
+ '--enable-userinfo',
'--enable-valgrind',
'--enable-verify-mar',
'--enable-webrtc',
diff --git a/caps/nsPrincipal.cpp b/caps/nsPrincipal.cpp
index f89061043..129cdf9a0 100644
--- a/caps/nsPrincipal.cpp
+++ b/caps/nsPrincipal.cpp
@@ -161,6 +161,19 @@ nsPrincipal::GetOriginForURI(nsIURI* aURI, nsACString& aOrigin)
(NS_SUCCEEDED(origin->SchemeIs("indexeddb", &isBehaved)) && isBehaved)) {
rv = origin->GetAsciiSpec(aOrigin);
NS_ENSURE_SUCCESS(rv, rv);
+
+ // Remove query or ref part from about: origin
+ int32_t pos = aOrigin.FindChar('?');
+ int32_t hashPos = aOrigin.FindChar('#');
+
+ if (hashPos != kNotFound && (pos == kNotFound || hashPos < pos)) {
+ pos = hashPos;
+ }
+
+ if (pos != kNotFound) {
+ aOrigin.Truncate(pos);
+ }
+
// These URIs could technically contain a '^', but they never should.
if (NS_WARN_IF(aOrigin.FindChar('^', 0) != -1)) {
aOrigin.Truncate();
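
The nsPrincipal change normalizes about:, moz-safe-about: and indexeddb: origins by dropping any query or fragment. The same truncation, sketched in script form:

// Drop everything from the first '?' or '#', whichever comes first,
// mirroring the Truncate() logic added above.
function stripQueryAndRef(origin) {
  const q = origin.indexOf("?");
  const h = origin.indexOf("#");
  let pos = q;
  if (h !== -1 && (q === -1 || h < q)) {
    pos = h;
  }
  return pos === -1 ? origin : origin.slice(0, pos);
}

// e.g. stripQueryAndRef("about:permissions?filter=example.org") -> "about:permissions"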
diff --git a/config/external/moz.build b/config/external/moz.build
index c52933db8..fbf6da089 100644
--- a/config/external/moz.build
+++ b/config/external/moz.build
@@ -33,6 +33,9 @@ if CONFIG['MOZ_WEBM_ENCODER']:
if not CONFIG['MOZ_SYSTEM_LIBVPX']:
external_dirs += ['media/libvpx']
+if CONFIG['MOZ_AV1']:
+ external_dirs += ['media/libaom']
+
if not CONFIG['MOZ_SYSTEM_PNG']:
external_dirs += ['media/libpng']
diff --git a/config/external/nss/nss.symbols b/config/external/nss/nss.symbols
index 3239d3119..7a968b6c8 100644
--- a/config/external/nss/nss.symbols
+++ b/config/external/nss/nss.symbols
@@ -165,7 +165,6 @@ DER_GeneralizedTimeToTime
DER_GeneralizedTimeToTime_Util
DER_GetInteger
DER_GetInteger_Util
-DER_Lengths
DER_SetUInteger
DER_UTCTimeToTime_Util
DSAU_DecodeDerSigToLen
diff --git a/config/milestone.txt b/config/milestone.txt
index 1d06be496..752d23fca 100644
--- a/config/milestone.txt
+++ b/config/milestone.txt
@@ -10,4 +10,4 @@
# hardcoded milestones in the tree from these two files.
#--------------------------------------------------------
-4.1.5
+4.1.7
diff --git a/devtools/shared/heapsnapshot/HeapSnapshot.cpp b/devtools/shared/heapsnapshot/HeapSnapshot.cpp
index ef0bde948..b7b20dc17 100644
--- a/devtools/shared/heapsnapshot/HeapSnapshot.cpp
+++ b/devtools/shared/heapsnapshot/HeapSnapshot.cpp
@@ -1616,8 +1616,6 @@ ThreadSafeChromeUtils::ReadHeapSnapshot(GlobalObject& global,
const nsAString& filePath,
ErrorResult& rv)
{
- auto start = TimeStamp::Now();
-
UniquePtr<char[]> path(ToNewCString(filePath));
if (!path) {
rv.Throw(NS_ERROR_OUT_OF_MEMORY);
diff --git a/docshell/base/nsDefaultURIFixup.cpp b/docshell/base/nsDefaultURIFixup.cpp
index e519720ab..d2876181a 100644
--- a/docshell/base/nsDefaultURIFixup.cpp
+++ b/docshell/base/nsDefaultURIFixup.cpp
@@ -154,6 +154,15 @@ HasUserPassword(const nsACString& aStringURI)
return false;
}
+// Assume that 1 tab is accidental, but more than 1 implies this is
+// supposed to be tab-separated content.
+static bool
+MaybeTabSeparatedContent(const nsCString& aStringURI)
+{
+ auto firstTab = aStringURI.FindChar('\t');
+ return firstTab != kNotFound && aStringURI.RFindChar('\t') != firstTab;
+}
+
NS_IMETHODIMP
nsDefaultURIFixup::GetFixupURIInfo(const nsACString& aStringURI,
uint32_t aFixupFlags,
@@ -168,8 +177,8 @@ nsDefaultURIFixup::GetFixupURIInfo(const nsACString& aStringURI,
// Eliminate embedded newlines, which single-line text fields now allow:
uriString.StripChars("\r\n");
- // Cleanup the empty spaces that might be on each end:
- uriString.Trim(" ");
+ // Cleanup the empty spaces and tabs that might be on each end:
+ uriString.Trim(" \t");
NS_ENSURE_TRUE(!uriString.IsEmpty(), NS_ERROR_FAILURE);
@@ -367,12 +376,16 @@ nsDefaultURIFixup::GetFixupURIInfo(const nsACString& aStringURI,
inputHadDuffProtocol = true;
}
- // NB: this rv gets returned at the end of this method if we never
- // do a keyword fixup after this (because the pref or the flags passed
- // might not let us).
- rv = FixupURIProtocol(uriString, info, getter_AddRefs(uriWithProtocol));
- if (uriWithProtocol) {
- info->mFixedURI = uriWithProtocol;
+ // Note: this rv gets returned at the end of this method if we don't fix up
+ // the protocol and don't do a keyword fixup after this (because the pref
+ // or the flags passed might not let us).
+ rv = NS_OK;
+ // Avoid fixing up content that looks like tab-separated values
+ if (!MaybeTabSeparatedContent(uriString)) {
+ rv = FixupURIProtocol(uriString, info, getter_AddRefs(uriWithProtocol));
+ if (uriWithProtocol) {
+ info->mFixedURI = uriWithProtocol;
+ }
}
// See if it is a keyword
diff --git a/docshell/test/unit/test_nsDefaultURIFixup_info.js b/docshell/test/unit/test_nsDefaultURIFixup_info.js
index c606ac32e..748aaab93 100644
--- a/docshell/test/unit/test_nsDefaultURIFixup_info.js
+++ b/docshell/test/unit/test_nsDefaultURIFixup_info.js
@@ -469,6 +469,14 @@ var testcases = [ {
keywordLookup: true,
protocolChange: true,
affectedByDNSForSingleHosts: true,
+ }, {
+ input: " \t mozilla.org/\t \t ",
+ fixedURI: "http://mozilla.org/",
+ alternateURI: "http://www.mozilla.org/",
+ protocolChange: true,
+ }, {
+ input: " moz\ti\tlla.org ",
+ keywordLookup: true,
}];
if (Services.appinfo.OS.toLowerCase().startsWith("win")) {
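
The URI-fixup change treats a single embedded tab as accidental but two or more as pasted tab-separated content, and only then skips protocol fixup so the input falls through to keyword search. A sketch of the heuristic the new test cases exercise:

// More than one interior tab implies tab-separated content (see the
// "moz\ti\tlla.org" case above); a lone tab is treated as accidental.
function maybeTabSeparatedContent(input) {
  const trimmed = input.trim();           // the patch also trims " \t" at both ends
  const first = trimmed.indexOf("\t");
  return first !== -1 && trimmed.lastIndexOf("\t") !== first;
}

// maybeTabSeparatedContent(" \t mozilla.org/\t \t ")  -> false (fixed up to a URI)
// maybeTabSeparatedContent(" moz\ti\tlla.org ")       -> true  (keyword lookup)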
diff --git a/dom/base/nsGlobalWindow.cpp b/dom/base/nsGlobalWindow.cpp
index 88cebe42b..86160c77c 100644
--- a/dom/base/nsGlobalWindow.cpp
+++ b/dom/base/nsGlobalWindow.cpp
@@ -2023,7 +2023,7 @@ nsGlobalWindow::ClearControllers()
}
void
-nsGlobalWindow::FreeInnerObjects()
+nsGlobalWindow::FreeInnerObjects(bool aForDocumentOpen)
{
NS_ASSERTION(IsInnerWindow(), "Don't free inner objects on an outer window");
@@ -2082,8 +2082,10 @@ nsGlobalWindow::FreeInnerObjects()
mDocumentURI = mDoc->GetDocumentURI();
mDocBaseURI = mDoc->GetDocBaseURI();
- while (mDoc->EventHandlingSuppressed()) {
- mDoc->UnsuppressEventHandlingAndFireEvents(nsIDocument::eEvents, false);
+ if (!aForDocumentOpen) {
+ while (mDoc->EventHandlingSuppressed()) {
+ mDoc->UnsuppressEventHandlingAndFireEvents(nsIDocument::eEvents, false);
+ }
}
// Note: we don't have to worry about eAnimationsOnly suppressions because
@@ -3000,6 +3002,8 @@ nsGlobalWindow::SetNewDocument(nsIDocument* aDocument,
nsCOMPtr<WindowStateHolder> wsh = do_QueryInterface(aState);
NS_ASSERTION(!aState || wsh, "What kind of weird state are you giving me here?");
+ bool handleDocumentOpen = false;
+
JS::Rooted<JSObject*> newInnerGlobal(cx);
if (reUseInnerWindow) {
// We're reusing the current inner window.
@@ -3091,6 +3095,7 @@ nsGlobalWindow::SetNewDocument(nsIDocument* aDocument,
if (currentInner && currentInner->GetWrapperPreserveColor()) {
if (oldDoc == aDocument) {
+ handleDocumentOpen = true;
// Move the navigator from the old inner window to the new one since
// this is a document.write. This is safe from a same-origin point of
// view because document.write can only be used by the same origin.
@@ -3115,7 +3120,7 @@ nsGlobalWindow::SetNewDocument(nsIDocument* aDocument,
// Don't free objects on our current inner window if it's going to be
// held in the bfcache.
if (!currentInner->IsFrozen()) {
- currentInner->FreeInnerObjects();
+ currentInner->FreeInnerObjects(handleDocumentOpen);
}
}
diff --git a/dom/base/nsGlobalWindow.h b/dom/base/nsGlobalWindow.h
index 467bc6796..80bf33b80 100644
--- a/dom/base/nsGlobalWindow.h
+++ b/dom/base/nsGlobalWindow.h
@@ -1380,7 +1380,7 @@ protected:
}
}
- void FreeInnerObjects();
+ void FreeInnerObjects(bool aForDocumentOpen = false);
nsGlobalWindow *CallerInnerWindow();
// Only to be called on an inner window.
diff --git a/dom/geolocation/nsGeolocation.cpp b/dom/geolocation/nsGeolocation.cpp
index 201d4d89a..244018ee8 100644
--- a/dom/geolocation/nsGeolocation.cpp
+++ b/dom/geolocation/nsGeolocation.cpp
@@ -395,18 +395,6 @@ nsGeolocationRequest::Allow(JS::HandleValue aChoices)
{
MOZ_ASSERT(aChoices.isUndefined());
- if (mRequester) {
- // Record whether a location callback is fulfilled while the owner window
- // is not visible.
- bool isVisible = false;
- nsCOMPtr<nsPIDOMWindowInner> window = mLocator->GetParentObject();
-
- if (window) {
- nsCOMPtr<nsIDocument> doc = window->GetDoc();
- isVisible = doc && !doc->Hidden();
- }
- }
-
if (mLocator->ClearPendingRequest(this)) {
return NS_OK;
}
diff --git a/dom/indexedDB/ActorsParent.cpp b/dom/indexedDB/ActorsParent.cpp
index e6fe9e2a8..58c113058 100644
--- a/dom/indexedDB/ActorsParent.cpp
+++ b/dom/indexedDB/ActorsParent.cpp
@@ -23,6 +23,7 @@
#include "mozilla/AppProcessChecker.h"
#include "mozilla/AutoRestore.h"
#include "mozilla/Casting.h"
+#include "mozilla/CheckedInt.h"
#include "mozilla/EndianUtils.h"
#include "mozilla/ErrorNames.h"
#include "mozilla/LazyIdleThread.h"
@@ -249,8 +250,6 @@ const char kSQLiteJournalSuffix[] = ".sqlite-journal";
const char kSQLiteSHMSuffix[] = ".sqlite-shm";
const char kSQLiteWALSuffix[] = ".sqlite-wal";
-const char kPrefIndexedDBEnabled[] = "dom.indexedDB.enabled";
-
const char kPrefFileHandleEnabled[] = "dom.fileHandle.enabled";
#define IDB_PREFIX "indexedDB"
@@ -784,29 +783,25 @@ MakeCompressedIndexDataValues(
MOZ_ASSERT(!keyBuffer.IsEmpty());
- // Don't let |infoLength| overflow.
- if (NS_WARN_IF(UINT32_MAX - keyBuffer.Length() <
- CompressedByteCountForIndexId(info.mIndexId) +
- CompressedByteCountForNumber(keyBufferLength) +
- CompressedByteCountForNumber(sortKeyBufferLength))) {
- IDB_REPORT_INTERNAL_ERR();
- return NS_ERROR_DOM_INDEXEDDB_UNKNOWN_ERR;
- }
-
- const uint32_t infoLength =
- CompressedByteCountForIndexId(info.mIndexId) +
+ const CheckedUint32 infoLength =
+ CheckedUint32(CompressedByteCountForIndexId(info.mIndexId)) +
CompressedByteCountForNumber(keyBufferLength) +
CompressedByteCountForNumber(sortKeyBufferLength) +
keyBufferLength +
sortKeyBufferLength;
+ // Don't let |infoLength| overflow.
+ if (NS_WARN_IF(!infoLength.isValid())) {
+ IDB_REPORT_INTERNAL_ERR();
+ return NS_ERROR_DOM_INDEXEDDB_UNKNOWN_ERR;
+ }
// Don't let |blobDataLength| overflow.
- if (NS_WARN_IF(UINT32_MAX - infoLength < blobDataLength)) {
+ if (NS_WARN_IF(UINT32_MAX - infoLength.value() < blobDataLength)) {
IDB_REPORT_INTERNAL_ERR();
return NS_ERROR_DOM_INDEXEDDB_UNKNOWN_ERR;
}
- blobDataLength += infoLength;
+ blobDataLength += infoLength.value();
}
UniqueFreePtr<uint8_t> blobData(
@@ -21172,16 +21167,6 @@ FactoryOp::CheckPermission(ContentParent* aContentParent,
return NS_ERROR_DOM_INDEXEDDB_NOT_ALLOWED_ERR;
}
- if (NS_WARN_IF(!Preferences::GetBool(kPrefIndexedDBEnabled, false))) {
- if (aContentParent) {
- // The DOM in the other process should have kept us from receiving any
- // indexedDB messages so assume that the child is misbehaving.
- aContentParent->KillHard("IndexedDB CheckPermission 1");
- }
-
- return NS_ERROR_DOM_INDEXEDDB_NOT_ALLOWED_ERR;
- }
-
const ContentPrincipalInfo& contentPrincipalInfo =
principalInfo.get_ContentPrincipalInfo();
if (contentPrincipalInfo.attrs().mPrivateBrowsingId != 0) {
diff --git a/dom/indexedDB/IDBFactory.cpp b/dom/indexedDB/IDBFactory.cpp
index 663828978..825d2ac36 100644
--- a/dom/indexedDB/IDBFactory.cpp
+++ b/dom/indexedDB/IDBFactory.cpp
@@ -47,12 +47,6 @@ namespace dom {
using namespace mozilla::dom::quota;
using namespace mozilla::ipc;
-namespace {
-
-const char kPrefIndexedDBEnabled[] = "dom.indexedDB.enabled";
-
-} // namespace
-
class IDBFactory::BackgroundCreateCallback final
: public nsIIPCBackgroundChildCreateCallback
{
@@ -130,12 +124,6 @@ IDBFactory::CreateForWindow(nsPIDOMWindowInner* aWindow,
nsCOMPtr<nsIPrincipal> principal;
nsresult rv = AllowedForWindowInternal(aWindow, getter_AddRefs(principal));
- if (!(NS_SUCCEEDED(rv) && nsContentUtils::IsSystemPrincipal(principal)) &&
- NS_WARN_IF(!Preferences::GetBool(kPrefIndexedDBEnabled, false))) {
- *aFactory = nullptr;
- return NS_ERROR_DOM_INDEXEDDB_NOT_ALLOWED_ERR;
- }
-
if (rv == NS_ERROR_DOM_NOT_SUPPORTED_ERR) {
NS_WARNING("IndexedDB is not permitted in a third-party window.");
*aFactory = nullptr;
@@ -246,12 +234,6 @@ IDBFactory::CreateForMainThreadJSInternal(
MOZ_ASSERT(NS_IsMainThread());
MOZ_ASSERT(aPrincipalInfo);
- if (aPrincipalInfo->type() != PrincipalInfo::TSystemPrincipalInfo &&
- NS_WARN_IF(!Preferences::GetBool(kPrefIndexedDBEnabled, false))) {
- *aFactory = nullptr;
- return NS_ERROR_DOM_INDEXEDDB_NOT_ALLOWED_ERR;
- }
-
IndexedDatabaseManager* mgr = IndexedDatabaseManager::GetOrCreate();
if (NS_WARN_IF(!mgr)) {
IDB_REPORT_INTERNAL_ERR();
diff --git a/dom/indexedDB/KeyPath.cpp b/dom/indexedDB/KeyPath.cpp
index dc8d10668..30edd8cd7 100644
--- a/dom/indexedDB/KeyPath.cpp
+++ b/dom/indexedDB/KeyPath.cpp
@@ -14,6 +14,7 @@
#include "xpcpublic.h"
#include "mozilla/dom/BindingDeclarations.h"
+#include "mozilla/dom/BlobBinding.h"
#include "mozilla/dom/IDBObjectStoreBinding.h"
namespace mozilla {
@@ -100,7 +101,6 @@ GetJSValFromKeyPathString(JSContext* aCx,
const char16_t* keyPathChars = token.BeginReading();
const size_t keyPathLen = token.Length();
- bool hasProp;
if (!targetObject) {
// We're still walking the chain of existing objects
// http://w3c.github.io/IndexedDB/#dfn-evaluate-a-key-path-on-a-value
@@ -116,16 +116,77 @@ GetJSValFromKeyPathString(JSContext* aCx,
}
obj = &currentVal.toObject();
- bool ok = JS_HasUCProperty(aCx, obj, keyPathChars, keyPathLen,
- &hasProp);
+ // We call JS_GetOwnUCPropertyDescriptor on purpose (as opposed to
+ // JS_GetUCPropertyDescriptor) to avoid searching the prototype chain.
+ JS::Rooted<JS::PropertyDescriptor> desc(aCx);
+ bool ok = JS_GetOwnUCPropertyDescriptor(aCx, obj, keyPathChars,
+ keyPathLen, &desc);
IDB_ENSURE_TRUE(ok, NS_ERROR_DOM_INDEXEDDB_UNKNOWN_ERR);
- if (hasProp) {
- // Get if the property exists...
- JS::Rooted<JS::Value> intermediate(aCx);
- bool ok = JS_GetUCProperty(aCx, obj, keyPathChars, keyPathLen, &intermediate);
- IDB_ENSURE_TRUE(ok, NS_ERROR_DOM_INDEXEDDB_UNKNOWN_ERR);
+ JS::Rooted<JS::Value> intermediate(aCx);
+ bool hasProp = false;
+
+ if (desc.object()) {
+ intermediate = desc.value();
+ hasProp = true;
+ } else {
+ // If we get here it means the object doesn't have the property or the
+ // property is available throuch a getter. We don't want to call any
+ // getters to avoid potential re-entrancy.
+ // The blob object is special since its properties are available
+ // only through getters but we still want to support them for key
+ // extraction. So they need to be handled manually.
+ Blob* blob;
+ if (NS_SUCCEEDED(UNWRAP_OBJECT(Blob, &obj, blob))) {
+ if (token.EqualsLiteral("size")) {
+ ErrorResult rv;
+ uint64_t size = blob->GetSize(rv);
+ MOZ_ALWAYS_TRUE(!rv.Failed());
+
+ intermediate = JS_NumberValue(size);
+ hasProp = true;
+ } else if (token.EqualsLiteral("type")) {
+ nsString type;
+ blob->GetType(type);
+
+ JSString* string =
+ JS_NewUCStringCopyN(aCx, type.get(), type.Length());
+
+ intermediate = JS::StringValue(string);
+ hasProp = true;
+ } else {
+ RefPtr<File> file = blob->ToFile();
+ if (file) {
+ if (token.EqualsLiteral("name")) {
+ nsString name;
+ file->GetName(name);
+
+ JSString* string =
+ JS_NewUCStringCopyN(aCx, name.get(), name.Length());
+
+ intermediate = JS::StringValue(string);
+ hasProp = true;
+ } else if (token.EqualsLiteral("lastModified")) {
+ ErrorResult rv;
+ int64_t lastModifiedDate = file->GetLastModified(rv);
+ MOZ_ALWAYS_TRUE(!rv.Failed());
+
+ intermediate = JS_NumberValue(lastModifiedDate);
+ hasProp = true;
+ } else if (token.EqualsLiteral("lastModifiedDate")) {
+ ErrorResult rv;
+ Date lastModifiedDate = file->GetLastModifiedDate(rv);
+ MOZ_ALWAYS_TRUE(!rv.Failed());
+
+ lastModifiedDate.ToDateObject(aCx, &intermediate);
+ hasProp = true;
+ }
+ }
+ }
+ }
+ }
+ if (hasProp) {
// Treat explicitly undefined as an error.
if (intermediate.isUndefined()) {
return NS_ERROR_DOM_INDEXEDDB_DATA_ERR;
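
The KeyPath change avoids invoking arbitrary getters during key extraction by reading only own data properties, then special-cases Blob and File fields whose values are reachable only through getters. A simplified script-level sketch of that lookup order (the helper name is illustrative; the real code goes through the JSAPI and also handles lastModifiedDate):

// Return {found, value} for one key-path segment without triggering getters,
// except for the whitelisted Blob/File properties handled explicitly.
function extractKeyPathSegment(obj, token) {
  const desc = Object.getOwnPropertyDescriptor(obj, token);
  if (desc && "value" in desc) {          // plain own data property
    return { found: true, value: desc.value };
  }
  if (obj instanceof Blob) {              // getter-backed but explicitly allowed
    if (token === "size") return { found: true, value: obj.size };
    if (token === "type") return { found: true, value: obj.type };
    if (obj instanceof File) {
      if (token === "name")         return { found: true, value: obj.name };
      if (token === "lastModified") return { found: true, value: obj.lastModified };
    }
  }
  return { found: false, value: undefined };
}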
diff --git a/dom/media/AudioStream.cpp b/dom/media/AudioStream.cpp
index 896dee407..4b1d82c37 100644
--- a/dom/media/AudioStream.cpp
+++ b/dom/media/AudioStream.cpp
@@ -345,7 +345,6 @@ AudioStream::Init(uint32_t aNumChannels, uint32_t aRate,
cubeb* cubebContext = CubebUtils::GetCubebContext();
if (!cubebContext) {
NS_WARNING("Can't get cubeb context!");
- CubebUtils::ReportCubebStreamInitFailure(true);
return NS_ERROR_DOM_MEDIA_CUBEB_INITIALIZATION_ERR;
}
@@ -367,10 +366,8 @@ AudioStream::OpenCubeb(cubeb* aContext, cubeb_stream_params& aParams,
latency_frames,
DataCallback_S, StateCallback_S, this) == CUBEB_OK) {
mCubebStream.reset(stream);
- CubebUtils::ReportCubebBackendUsed();
} else {
NS_WARNING(nsPrintfCString("AudioStream::OpenCubeb() %p failed to init cubeb", this).get());
- CubebUtils::ReportCubebStreamInitFailure(aIsFirst);
return NS_ERROR_FAILURE;
}
diff --git a/dom/media/CubebUtils.cpp b/dom/media/CubebUtils.cpp
index 93792e63b..0f0167d9c 100644
--- a/dom/media/CubebUtils.cpp
+++ b/dom/media/CubebUtils.cpp
@@ -239,31 +239,6 @@ cubeb* GetCubebContextUnlocked()
return sCubebContext;
}
-void ReportCubebBackendUsed()
-{
- StaticMutexAutoLock lock(sMutex);
-
- sAudioStreamInitEverSucceeded = true;
-
- bool foundBackend = false;
- for (uint32_t i = 0; i < ArrayLength(AUDIOSTREAM_BACKEND_ID_STR); i++) {
- if (!strcmp(cubeb_get_backend_id(sCubebContext), AUDIOSTREAM_BACKEND_ID_STR[i])) {
- foundBackend = true;
- }
- }
-}
-
-void ReportCubebStreamInitFailure(bool aIsFirst)
-{
- StaticMutexAutoLock lock(sMutex);
- if (!aIsFirst && !sAudioStreamInitEverSucceeded) {
- // This machine has no audio hardware, or it's in really bad shape, don't
- // send this info, since we want CUBEB_BACKEND_INIT_FAILURE_OTHER to detect
- // failures to open multiple streams in a process over time.
- return;
- }
-}
-
uint32_t GetCubebPlaybackLatencyInMilliseconds()
{
StaticMutexAutoLock lock(sMutex);
diff --git a/dom/media/CubebUtils.h b/dom/media/CubebUtils.h
index fa5fc2294..f43492374 100644
--- a/dom/media/CubebUtils.h
+++ b/dom/media/CubebUtils.h
@@ -35,8 +35,6 @@ double GetVolumeScale();
bool GetFirstStream();
cubeb* GetCubebContext();
cubeb* GetCubebContextUnlocked();
-void ReportCubebStreamInitFailure(bool aIsFirstStream);
-void ReportCubebBackendUsed();
uint32_t GetCubebPlaybackLatencyInMilliseconds();
Maybe<uint32_t> GetCubebMSGLatencyInFrames();
bool CubebLatencyPrefSet();
diff --git a/dom/media/GraphDriver.cpp b/dom/media/GraphDriver.cpp
index 47762c56e..b60dfee9d 100644
--- a/dom/media/GraphDriver.cpp
+++ b/dom/media/GraphDriver.cpp
@@ -623,9 +623,6 @@ AudioCallbackDriver::Init()
cubeb* cubebContext = CubebUtils::GetCubebContext();
if (!cubebContext) {
NS_WARNING("Could not get cubeb context.");
- if (!mFromFallback) {
- CubebUtils::ReportCubebStreamInitFailure(true);
- }
return;
}
@@ -710,18 +707,11 @@ AudioCallbackDriver::Init()
NS_WARNING_ASSERTION(
rv == CUBEB_OK,
"Could not set the audio stream volume in GraphDriver.cpp");
- CubebUtils::ReportCubebBackendUsed();
} else {
#ifdef MOZ_WEBRTC
StaticMutexAutoUnlock unlock(AudioInputCubeb::Mutex());
#endif
NS_WARNING("Could not create a cubeb stream for MediaStreamGraph, falling back to a SystemClockDriver");
- // Only report failures when we're not coming from a driver that was
- // created itself as a fallback driver because of a previous audio driver
- // failure.
- if (!mFromFallback) {
- CubebUtils::ReportCubebStreamInitFailure(firstStream);
- }
// Fall back to a driver using a normal thread. If needed,
// the graph will try to re-open an audio stream later.
MonitorAutoLock lock(GraphImpl()->GetMonitor());
diff --git a/dom/media/MediaData.h b/dom/media/MediaData.h
index a79aac6ed..905b4c1d9 100644
--- a/dom/media/MediaData.h
+++ b/dom/media/MediaData.h
@@ -14,6 +14,7 @@
#include "nsIMemoryReporter.h"
#include "SharedBuffer.h"
#include "mozilla/RefPtr.h"
+#include "mozilla/Span.h"
#include "mozilla/UniquePtr.h"
#include "mozilla/UniquePtrExtensions.h"
#include "nsTArray.h"
@@ -631,6 +632,8 @@ public:
{
return sizeof(*this) + mBuffer.ComputedSizeOfExcludingThis();
}
+ // Access the buffer as a Span.
+ operator Span<const uint8_t>() { return MakeSpan(Data(), Size()); }
const CryptoSample& mCrypto;
RefPtr<MediaByteBuffer> mExtraData;
diff --git a/dom/media/MediaPrefs.h b/dom/media/MediaPrefs.h
index b237ecd3d..e67796edd 100644
--- a/dom/media/MediaPrefs.h
+++ b/dom/media/MediaPrefs.h
@@ -120,6 +120,9 @@ private:
#ifdef MOZ_FFVPX
DECL_MEDIA_PREF("media.ffvpx.enabled", PDMFFVPXEnabled, bool, true);
#endif
+#ifdef MOZ_AV1
+ DECL_MEDIA_PREF("media.av1.enabled", AV1Enabled, bool, false);
+#endif
#ifdef XP_WIN
DECL_MEDIA_PREF("media.wmf.enabled", PDMWMFEnabled, bool, true);
DECL_MEDIA_PREF("media.wmf.skip-blacklist", PDMWMFSkipBlacklist, bool, false);
diff --git a/dom/media/VideoUtils.cpp b/dom/media/VideoUtils.cpp
index b1a202c03..c06ba9070 100644
--- a/dom/media/VideoUtils.cpp
+++ b/dom/media/VideoUtils.cpp
@@ -207,8 +207,20 @@ already_AddRefed<SharedThreadPool> GetMediaThreadPool(MediaThreadType aType)
name = "MediaPlayback";
break;
}
- return SharedThreadPool::
+ RefPtr<SharedThreadPool> pool = SharedThreadPool::
Get(nsDependentCString(name), MediaPrefs::MediaThreadPoolDefaultCount());
+
+ // Ensure a larger stack for platform decoder threads
+ if (aType == MediaThreadType::PLATFORM_DECODER) {
+ const uint32_t minStackSize = 512*1024;
+ uint32_t stackSize;
+ MOZ_ALWAYS_SUCCEEDS(pool->GetThreadStackSize(&stackSize));
+ if (stackSize < minStackSize) {
+ MOZ_ALWAYS_SUCCEEDS(pool->SetThreadStackSize(minStackSize));
+ }
+ }
+
+ return pool.forget();
}
bool
@@ -426,6 +438,16 @@ ParseMIMETypeString(const nsAString& aMIMEType,
return ParseCodecsString(codecsStr, aOutCodecs);
}
+template <int N>
+static bool
+StartsWith(const nsACString& string, const char (&prefix)[N])
+{
+ if (N - 1 > string.Length()) {
+ return false;
+ }
+ return memcmp(string.Data(), prefix, N - 1) == 0;
+}
+
bool
IsH264CodecString(const nsAString& aCodec)
{
@@ -458,15 +480,14 @@ IsVP9CodecString(const nsAString& aCodec)
aCodec.EqualsLiteral("vp9.0");
}
-template <int N>
-static bool
-StartsWith(const nsACString& string, const char (&prefix)[N])
+#ifdef MOZ_AV1
+bool
+IsAV1CodecString(const nsAString& aCodec)
{
- if (N - 1 > string.Length()) {
- return false;
- }
- return memcmp(string.Data(), prefix, N - 1) == 0;
+ return aCodec.EqualsLiteral("av1") ||
+ StartsWith(NS_ConvertUTF16toUTF8(aCodec), "av01");
}
+#endif
UniquePtr<TrackInfo>
CreateTrackInfoWithMIMEType(const nsACString& aCodecMIMEType)
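
The StartsWith helper relocated above deduces the prefix length from the string literal's array size (N includes the trailing NUL), so prefix checks such as the "av01" codec match need no strlen call. A hedged, standalone sketch of the idiom, with std::string standing in for nsACString:

// Standalone sketch of the length-deducing StartsWith idiom used above.
#include <cassert>
#include <cstring>
#include <string>

template <int N>
static bool StartsWith(const std::string& aString, const char (&aPrefix)[N])
{
  // N includes the terminating NUL, so the prefix proper is N - 1 bytes long.
  if (static_cast<size_t>(N - 1) > aString.length()) {
    return false;
  }
  return memcmp(aString.data(), aPrefix, N - 1) == 0;
}

int main()
{
  assert(StartsWith(std::string("av01.0.04M.08"), "av01"));
  assert(!StartsWith(std::string("vp09"), "av01"));
  return 0;
}
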
diff --git a/dom/media/VideoUtils.h b/dom/media/VideoUtils.h
index 441b63792..aaf0e9903 100644
--- a/dom/media/VideoUtils.h
+++ b/dom/media/VideoUtils.h
@@ -345,6 +345,11 @@ IsVP8CodecString(const nsAString& aCodec);
bool
IsVP9CodecString(const nsAString& aCodec);
+#ifdef MOZ_AV1
+bool
+IsAV1CodecString(const nsAString& aCodec);
+#endif
+
// Try and create a TrackInfo with a given codec MIME type.
UniquePtr<TrackInfo>
CreateTrackInfoWithMIMEType(const nsACString& aCodecMIMEType);
diff --git a/dom/media/platforms/agnostic/AOMDecoder.cpp b/dom/media/platforms/agnostic/AOMDecoder.cpp
new file mode 100644
index 000000000..b5d21375e
--- /dev/null
+++ b/dom/media/platforms/agnostic/AOMDecoder.cpp
@@ -0,0 +1,332 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "AOMDecoder.h"
+#include "MediaResult.h"
+#include "TimeUnits.h"
+#include "aom/aomdx.h"
+#include "aom/aom_image.h"
+#include "gfx2DGlue.h"
+#include "mozilla/PodOperations.h"
+#include "mozilla/SyncRunnable.h"
+#include "nsError.h"
+#include "prsystem.h"
+
+#include <algorithm>
+
+#undef LOG
+#define LOG(arg, ...) MOZ_LOG(sPDMLog, mozilla::LogLevel::Debug, ("AOMDecoder(%p)::%s: " arg, this, __func__, ##__VA_ARGS__))
+
+namespace mozilla {
+
+using namespace gfx;
+using namespace layers;
+
+AOMDecoder::AOMDecoder(const CreateDecoderParams& aParams)
+ : mImageContainer(aParams.mImageContainer)
+ , mTaskQueue(aParams.mTaskQueue)
+ , mCallback(aParams.mCallback)
+ , mIsFlushing(false)
+ , mInfo(aParams.VideoConfig())
+{
+ PodZero(&mCodec);
+}
+
+AOMDecoder::~AOMDecoder()
+{
+}
+
+void
+AOMDecoder::Shutdown()
+{
+ aom_codec_destroy(&mCodec);
+}
+
+RefPtr<MediaDataDecoder::InitPromise>
+AOMDecoder::Init()
+{
+ int decode_threads = 2;
+
+ aom_codec_iface_t* dx = aom_codec_av1_dx();
+ if (mInfo.mDisplay.width >= 2048) {
+ decode_threads = 8;
+ }
+ else if (mInfo.mDisplay.width >= 1024) {
+ decode_threads = 4;
+ }
+ decode_threads = std::min(decode_threads, PR_GetNumberOfProcessors());
+
+ aom_codec_dec_cfg_t config;
+ PodZero(&config);
+ config.threads = decode_threads;
+ config.w = config.h = 0; // set after decode
+ config.allow_lowbitdepth = true;
+
+ aom_codec_flags_t flags = 0;
+
+ if (!dx || aom_codec_dec_init(&mCodec, dx, &config, flags)) {
+ return InitPromise::CreateAndReject(NS_ERROR_DOM_MEDIA_FATAL_ERR, __func__);
+ }
+ return InitPromise::CreateAndResolve(TrackInfo::kVideoTrack, __func__);
+}
+
+void
+AOMDecoder::Flush()
+{
+ MOZ_ASSERT(mCallback->OnReaderTaskQueue());
+ mIsFlushing = true;
+ nsCOMPtr<nsIRunnable> r = NS_NewRunnableFunction([this] () {
+ // nothing to do for now.
+ });
+ SyncRunnable::DispatchToThread(mTaskQueue, r);
+ mIsFlushing = false;
+}
+
+// Ported from third_party/aom/tools_common.c.
+static aom_codec_err_t
+highbd_img_downshift(aom_image_t *dst, aom_image_t *src, int down_shift) {
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h)
+ return AOM_CODEC_INVALID_PARAM;
+ if (dst->x_chroma_shift != src->x_chroma_shift)
+ return AOM_CODEC_INVALID_PARAM;
+ if (dst->y_chroma_shift != src->y_chroma_shift)
+ return AOM_CODEC_INVALID_PARAM;
+ if (dst->fmt != (src->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH))
+ return AOM_CODEC_INVALID_PARAM;
+ if (down_shift < 0)
+ return AOM_CODEC_INVALID_PARAM;
+ switch (dst->fmt) {
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_I422:
+ case AOM_IMG_FMT_I444:
+ break;
+ default:
+ return AOM_CODEC_INVALID_PARAM;
+ }
+ switch (src->fmt) {
+ case AOM_IMG_FMT_I42016:
+ case AOM_IMG_FMT_I42216:
+ case AOM_IMG_FMT_I44416:
+ break;
+ default:
+ // We don't support anything that's not 16 bit
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ uint16_t *p_src =
+ (uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+ uint8_t *p_dst =
+ dst->planes[plane] + y * dst->stride[plane];
+ for (x = 0; x < w; x++) *p_dst++ = (*p_src++ >> down_shift) & 0xFF;
+ }
+ }
+ return AOM_CODEC_OK;
+}
+
+// UniquePtr dtor wrapper for aom_image_t.
+struct AomImageFree {
+ void operator()(aom_image_t* img) { aom_img_free(img); }
+};
+
+MediaResult
+AOMDecoder::DoDecode(MediaRawData* aSample)
+{
+ MOZ_ASSERT(mTaskQueue->IsCurrentThreadIn());
+
+#if defined(DEBUG)
+ NS_ASSERTION(IsKeyframe(*aSample) == aSample->mKeyframe,
+               "AOM Decode Keyframe error sample->mKeyframe and sample data out of sync");
+#endif
+
+ if (aom_codec_err_t r = aom_codec_decode(&mCodec, aSample->Data(), aSample->Size(), nullptr)) {
+ LOG("AOM Decode error: %s", aom_codec_err_to_string(r));
+ return MediaResult(
+ NS_ERROR_DOM_MEDIA_DECODE_ERR,
+ RESULT_DETAIL("AOM error decoding AV1 sample: %s", aom_codec_err_to_string(r)));
+ }
+
+ aom_codec_iter_t iter = nullptr;
+ aom_image_t *img;
+ UniquePtr<aom_image_t, AomImageFree> img8;
+
+ while ((img = aom_codec_get_frame(&mCodec, &iter))) {
+ // Track whether the underlying buffer is 8 or 16 bits per channel.
+ bool highbd = bool(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
+ if (highbd) {
+      // Shift images with more than 8 bits per channel down to 8 bits.
+ aom_img_fmt_t fmt8 = static_cast<aom_img_fmt_t>(img->fmt ^ AOM_IMG_FMT_HIGHBITDEPTH);
+ img8.reset(aom_img_alloc(NULL, fmt8, img->d_w, img->d_h, 16));
+ if (img8 == nullptr) {
+ LOG("Couldn't allocate bitdepth reduction target!");
+ return MediaResult(
+ NS_ERROR_OUT_OF_MEMORY,
+ RESULT_DETAIL("Couldn't allocate conversion buffer for AV1 frame"));
+ }
+ if (aom_codec_err_t r = highbd_img_downshift(img8.get(), img, img->bit_depth - 8)) {
+ return MediaResult(
+ NS_ERROR_DOM_MEDIA_DECODE_ERR,
+ RESULT_DETAIL("Error converting AV1 frame to 8 bits: %s",
+ aom_codec_err_to_string(r)));
+ }
+ // img normally points to storage owned by mCodec, so it is not freed.
+ // To copy out the contents of img8 we can overwrite img with an alias.
+ // Since img is assigned at the start of the while loop and img8 is held
+ // outside that loop, the alias won't outlive the storage it points to.
+ img = img8.get();
+ highbd = false;
+ }
+
+ NS_ASSERTION(img->fmt == AOM_IMG_FMT_I420 ||
+ img->fmt == AOM_IMG_FMT_I42016 ||
+ img->fmt == AOM_IMG_FMT_I444 ||
+ img->fmt == AOM_IMG_FMT_I44416,
+ "AV1 image format not I420 or I444");
+
+ // Chroma shifts are rounded down as per the decoding examples in the SDK
+ VideoData::YCbCrBuffer b;
+ b.mPlanes[0].mData = img->planes[0];
+ b.mPlanes[0].mStride = img->stride[0];
+ b.mPlanes[0].mHeight = img->d_h;
+ b.mPlanes[0].mWidth = img->d_w;
+ b.mPlanes[0].mOffset = 0;
+ b.mPlanes[0].mSkip = highbd ? 1 : 0;
+
+ b.mPlanes[1].mData = img->planes[1];
+ b.mPlanes[1].mStride = img->stride[1];
+ b.mPlanes[1].mOffset = 0;
+ b.mPlanes[1].mSkip = highbd ? 1 : 0;
+
+ b.mPlanes[2].mData = img->planes[2];
+ b.mPlanes[2].mStride = img->stride[2];
+ b.mPlanes[2].mOffset = 0;
+ b.mPlanes[2].mSkip = highbd ? 1 : 0;
+
+ if (img->fmt == AOM_IMG_FMT_I420 ||
+ img->fmt == AOM_IMG_FMT_I42016) {
+ b.mPlanes[1].mHeight = (img->d_h + 1) >> img->y_chroma_shift;
+ b.mPlanes[1].mWidth = (img->d_w + 1) >> img->x_chroma_shift;
+
+ b.mPlanes[2].mHeight = (img->d_h + 1) >> img->y_chroma_shift;
+ b.mPlanes[2].mWidth = (img->d_w + 1) >> img->x_chroma_shift;
+ } else if (img->fmt == AOM_IMG_FMT_I444) {
+ b.mPlanes[1].mHeight = img->d_h;
+ b.mPlanes[1].mWidth = img->d_w;
+
+ b.mPlanes[2].mHeight = img->d_h;
+ b.mPlanes[2].mWidth = img->d_w;
+ } else {
+ LOG("AOM Unknown image format");
+ return MediaResult(NS_ERROR_DOM_MEDIA_DECODE_ERR,
+ RESULT_DETAIL("AOM Unknown image format"));
+ }
+
+ RefPtr<VideoData> v =
+ VideoData::CreateAndCopyData(mInfo,
+ mImageContainer,
+ aSample->mOffset,
+ aSample->mTime,
+ aSample->mDuration,
+ b,
+ aSample->mKeyframe,
+ aSample->mTimecode,
+ mInfo.ScaledImageRect(img->d_w,
+ img->d_h));
+
+ if (!v) {
+ LOG("Image allocation error source %ux%u display %ux%u picture %ux%u",
+ img->d_w, img->d_h, mInfo.mDisplay.width, mInfo.mDisplay.height,
+ mInfo.mImage.width, mInfo.mImage.height);
+ return MediaResult(NS_ERROR_OUT_OF_MEMORY, __func__);
+ }
+ mCallback->Output(v);
+ }
+ return NS_OK;
+}
+
+void
+AOMDecoder::ProcessDecode(MediaRawData* aSample)
+{
+ MOZ_ASSERT(mTaskQueue->IsCurrentThreadIn());
+ if (mIsFlushing) {
+ return;
+ }
+ MediaResult rv = DoDecode(aSample);
+ if (NS_FAILED(rv)) {
+ mCallback->Error(rv);
+ } else {
+ mCallback->InputExhausted();
+ }
+}
+
+void
+AOMDecoder::Input(MediaRawData* aSample)
+{
+ MOZ_ASSERT(mCallback->OnReaderTaskQueue());
+ mTaskQueue->Dispatch(NewRunnableMethod<RefPtr<MediaRawData>>(
+ this, &AOMDecoder::ProcessDecode, aSample));
+}
+
+void
+AOMDecoder::ProcessDrain()
+{
+ MOZ_ASSERT(mTaskQueue->IsCurrentThreadIn());
+ mCallback->DrainComplete();
+}
+
+void
+AOMDecoder::Drain()
+{
+ MOZ_ASSERT(mCallback->OnReaderTaskQueue());
+ mTaskQueue->Dispatch(NewRunnableMethod(this, &AOMDecoder::ProcessDrain));
+}
+
+/* static */
+bool
+AOMDecoder::IsAV1(const nsACString& aMimeType)
+{
+ return aMimeType.EqualsLiteral("video/webm; codecs=av1") ||
+ aMimeType.EqualsLiteral("video/av1");
+}
+
+/* static */
+bool
+AOMDecoder::IsKeyframe(Span<const uint8_t> aBuffer) {
+ aom_codec_stream_info_t info;
+ PodZero(&info);
+
+ aom_codec_peek_stream_info(aom_codec_av1_dx(),
+ aBuffer.Elements(),
+ aBuffer.Length(),
+ &info);
+
+ return bool(info.is_kf);
+}
+
+/* static */
+nsIntSize
+AOMDecoder::GetFrameSize(Span<const uint8_t> aBuffer) {
+ aom_codec_stream_info_t info;
+ PodZero(&info);
+
+ aom_codec_peek_stream_info(aom_codec_av1_dx(),
+ aBuffer.Elements(),
+ aBuffer.Length(),
+ &info);
+
+ return nsIntSize(info.w, info.h);
+}
+
+} // namespace mozilla
+#undef LOG
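
highbd_img_downshift in the new AOMDecoder.cpp reduces 16-bit planar samples to 8 bits by shifting each value right by (bit_depth - 8) and masking, plane by plane. The sketch below isolates the per-row arithmetic on plain arrays; the buffer names are illustrative and no libaom types are involved.

// Sketch of the 16-bit -> 8-bit downshift applied per row in
// highbd_img_downshift above. Each plane in libaom has its own stride;
// here a single row is converted in isolation.
#include <cstdint>
#include <cstdio>

static void DownshiftRow(const uint16_t* aSrc, uint8_t* aDst,
                         int aWidth, int aDownShift)
{
  for (int x = 0; x < aWidth; x++) {
    // e.g. for 10-bit content aDownShift == 2, mapping 0..1023 to 0..255.
    aDst[x] = static_cast<uint8_t>((aSrc[x] >> aDownShift) & 0xFF);
  }
}

int main()
{
  const uint16_t src[4] = { 0, 256, 512, 1023 };  // 10-bit samples
  uint8_t dst[4];
  DownshiftRow(src, dst, 4, 10 - 8);
  for (uint8_t v : dst) {
    printf("%d ", int(v));  // prints "0 64 128 255"
  }
  printf("\n");
  return 0;
}
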
diff --git a/dom/media/platforms/agnostic/AOMDecoder.h b/dom/media/platforms/agnostic/AOMDecoder.h
new file mode 100644
index 000000000..ec6b1f95a
--- /dev/null
+++ b/dom/media/platforms/agnostic/AOMDecoder.h
@@ -0,0 +1,62 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#if !defined(AOMDecoder_h_)
+#define AOMDecoder_h_
+
+#include "PlatformDecoderModule.h"
+#include "mozilla/Span.h"
+
+#include <stdint.h>
+#include "aom/aom_decoder.h"
+
+namespace mozilla {
+
+class AOMDecoder : public MediaDataDecoder
+{
+public:
+ explicit AOMDecoder(const CreateDecoderParams& aParams);
+
+ RefPtr<InitPromise> Init() override;
+ void Input(MediaRawData* aSample) override;
+ void Flush() override;
+ void Drain() override;
+ void Shutdown() override;
+ const char* GetDescriptionName() const override
+ {
+ return "libaom (AV1) video decoder";
+ }
+
+  // Return true if aMimeType is one of the strings used
+  // by our demuxers to identify AV1 streams.
+ static bool IsAV1(const nsACString& aMimeType);
+
+ // Return true if a sample is a keyframe.
+ static bool IsKeyframe(Span<const uint8_t> aBuffer);
+
+ // Return the frame dimensions for a sample.
+ static nsIntSize GetFrameSize(Span<const uint8_t> aBuffer);
+
+private:
+ ~AOMDecoder();
+ void ProcessDecode(MediaRawData* aSample);
+ MediaResult DoDecode(MediaRawData* aSample);
+ void ProcessDrain();
+
+ const RefPtr<layers::ImageContainer> mImageContainer;
+ const RefPtr<TaskQueue> mTaskQueue;
+ MediaDataDecoderCallback* mCallback;
+ Atomic<bool> mIsFlushing;
+
+ // AOM decoder state
+ aom_codec_ctx_t mCodec;
+
+ const VideoInfo& mInfo;
+};
+
+} // namespace mozilla
+
+#endif // AOMDecoder_h_
diff --git a/dom/media/platforms/agnostic/AgnosticDecoderModule.cpp b/dom/media/platforms/agnostic/AgnosticDecoderModule.cpp
index 7bd75b7fe..cf5ed3d22 100644
--- a/dom/media/platforms/agnostic/AgnosticDecoderModule.cpp
+++ b/dom/media/platforms/agnostic/AgnosticDecoderModule.cpp
@@ -5,6 +5,7 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "AgnosticDecoderModule.h"
+#include "MediaPrefs.h"
#include "mozilla/Logging.h"
#include "OpusDecoder.h"
#include "VorbisDecoder.h"
@@ -12,6 +13,10 @@
#include "WAVDecoder.h"
#include "TheoraDecoder.h"
+#ifdef MOZ_AV1
+#include "AOMDecoder.h"
+#endif
+
namespace mozilla {
bool
@@ -24,6 +29,11 @@ AgnosticDecoderModule::SupportsMimeType(const nsACString& aMimeType,
VorbisDataDecoder::IsVorbis(aMimeType) ||
WaveDataDecoder::IsWave(aMimeType) ||
TheoraDecoder::IsTheora(aMimeType);
+#ifdef MOZ_AV1
+ if (MediaPrefs::AV1Enabled()) {
+ supports |= AOMDecoder::IsAV1(aMimeType);
+ }
+#endif
MOZ_LOG(sPDMLog, LogLevel::Debug, ("Agnostic decoder %s requested type",
supports ? "supports" : "rejects"));
return supports;
@@ -36,7 +46,14 @@ AgnosticDecoderModule::CreateVideoDecoder(const CreateDecoderParams& aParams)
if (VPXDecoder::IsVPX(aParams.mConfig.mMimeType)) {
m = new VPXDecoder(aParams);
- } else if (TheoraDecoder::IsTheora(aParams.mConfig.mMimeType)) {
+ }
+#ifdef MOZ_AV1
+ else if (AOMDecoder::IsAV1(aParams.mConfig.mMimeType) &&
+ MediaPrefs::AV1Enabled()) {
+ m = new AOMDecoder(aParams);
+ }
+#endif
+ else if (TheoraDecoder::IsTheora(aParams.mConfig.mMimeType)) {
m = new TheoraDecoder(aParams);
}
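
AV1 support added above is gated twice: the build must define MOZ_AV1 and the media.av1.enabled pref (off by default) must be set before AgnosticDecoderModule will either claim the MIME type or construct an AOMDecoder. A hedged sketch of that gating pattern with placeholder names (IsAV1Enabled stands in for MediaPrefs::AV1Enabled):

// Sketch of the "compile-time #ifdef plus runtime pref" gating used in
// AgnosticDecoderModule. MOZ_AV1 and IsAV1Enabled() are stand-ins here;
// the real pref is read through MediaPrefs::AV1Enabled().
#include <memory>
#include <string>

#define MOZ_AV1 1  // pretend the tree was configured with AV1 support

struct Decoder { virtual ~Decoder() = default; };
struct Av1Decoder : Decoder {};
struct Vp9Decoder : Decoder {};

static bool IsAV1Enabled() { return false; }  // default mirrors media.av1.enabled

static bool SupportsMimeType(const std::string& aMimeType)
{
  bool supports = aMimeType == "video/webm; codecs=vp9";
#ifdef MOZ_AV1
  if (IsAV1Enabled()) {
    supports = supports || aMimeType == "video/webm; codecs=av1";
  }
#endif
  return supports;
}

static std::unique_ptr<Decoder> CreateVideoDecoder(const std::string& aMimeType)
{
  if (aMimeType == "video/webm; codecs=vp9") {
    return std::make_unique<Vp9Decoder>();
  }
#ifdef MOZ_AV1
  if (IsAV1Enabled() && aMimeType == "video/webm; codecs=av1") {
    return std::make_unique<Av1Decoder>();
  }
#endif
  return nullptr;  // unsupported or disabled
}

int main()
{
  return SupportsMimeType("video/webm; codecs=av1") ? 1 : 0;  // 0: pref off
}
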
diff --git a/dom/media/platforms/agnostic/VPXDecoder.cpp b/dom/media/platforms/agnostic/VPXDecoder.cpp
index 77c81b51b..f2f84487f 100644
--- a/dom/media/platforms/agnostic/VPXDecoder.cpp
+++ b/dom/media/platforms/agnostic/VPXDecoder.cpp
@@ -22,7 +22,7 @@ namespace mozilla {
using namespace gfx;
using namespace layers;
-static int MimeTypeToCodec(const nsACString& aMimeType)
+static VPXDecoder::Codec MimeTypeToCodec(const nsACString& aMimeType)
{
if (aMimeType.EqualsLiteral("video/webm; codecs=vp8")) {
return VPXDecoder::Codec::VP8;
@@ -31,7 +31,7 @@ static int MimeTypeToCodec(const nsACString& aMimeType)
} else if (aMimeType.EqualsLiteral("video/vp9")) {
return VPXDecoder::Codec::VP9;
}
- return -1;
+ return VPXDecoder::Codec::Unknown;
}
VPXDecoder::VPXDecoder(const CreateDecoderParams& aParams)
@@ -101,17 +101,10 @@ MediaResult
VPXDecoder::DoDecode(MediaRawData* aSample)
{
MOZ_ASSERT(mTaskQueue->IsCurrentThreadIn());
+
#if defined(DEBUG)
- vpx_codec_stream_info_t si;
- PodZero(&si);
- si.sz = sizeof(si);
- if (mCodec == Codec::VP8) {
- vpx_codec_peek_stream_info(vpx_codec_vp8_dx(), aSample->Data(), aSample->Size(), &si);
- } else if (mCodec == Codec::VP9) {
- vpx_codec_peek_stream_info(vpx_codec_vp9_dx(), aSample->Data(), aSample->Size(), &si);
- }
- NS_ASSERTION(bool(si.is_kf) == aSample->mKeyframe,
- "VPX Decode Keyframe error sample->mKeyframe and si.si_kf out of sync");
+ NS_ASSERTION(IsKeyframe(*aSample, mCodec) == aSample->mKeyframe,
+ "VPX Decode Keyframe error sample->mKeyframe and sample data out of sync");
#endif
if (vpx_codec_err_t r = vpx_codec_decode(&mVPX, aSample->Data(), aSample->Size(), nullptr, 0)) {
@@ -249,5 +242,41 @@ VPXDecoder::IsVP9(const nsACString& aMimeType)
return IsVPX(aMimeType, VPXDecoder::VP9);
}
+/* static */
+bool
+VPXDecoder::IsKeyframe(Span<const uint8_t> aBuffer, Codec aCodec)
+{
+ vpx_codec_stream_info_t si;
+ PodZero(&si);
+ si.sz = sizeof(si);
+
+ if (aCodec == Codec::VP8) {
+ vpx_codec_peek_stream_info(vpx_codec_vp8_dx(), aBuffer.Elements(), aBuffer.Length(), &si);
+ return bool(si.is_kf);
+ } else if (aCodec == Codec::VP9) {
+ vpx_codec_peek_stream_info(vpx_codec_vp9_dx(), aBuffer.Elements(), aBuffer.Length(), &si);
+ return bool(si.is_kf);
+ }
+
+ return false;
+}
+
+/* static */
+nsIntSize
+VPXDecoder::GetFrameSize(Span<const uint8_t> aBuffer, Codec aCodec)
+{
+ vpx_codec_stream_info_t si;
+ PodZero(&si);
+ si.sz = sizeof(si);
+
+ if (aCodec == Codec::VP8) {
+ vpx_codec_peek_stream_info(vpx_codec_vp8_dx(), aBuffer.Elements(), aBuffer.Length(), &si);
+ } else if (aCodec == Codec::VP9) {
+ vpx_codec_peek_stream_info(vpx_codec_vp9_dx(), aBuffer.Elements(), aBuffer.Length(), &si);
+ }
+
+ return nsIntSize(si.w, si.h);
+}
+
} // namespace mozilla
#undef LOG
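
The new VPXDecoder::IsKeyframe and GetFrameSize statics wrap vpx_codec_peek_stream_info so callers such as the WebM demuxer can classify a packet without holding a decoder instance. A hedged usage sketch against the libvpx API, assuming packet/packetSize hold one compressed VP9 frame obtained from a demuxer:

// Hedged usage sketch: peeking stream info from a raw VP9 packet the same way
// VPXDecoder::IsKeyframe/GetFrameSize do. Assumes libvpx headers are available.
#include <cstdint>
#include <cstdio>
#include <cstring>
#define VPX_DONT_DEFINE_STDINT_TYPES
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

static void InspectVp9Packet(const uint8_t* packet, unsigned int packetSize)
{
  vpx_codec_stream_info_t si;
  memset(&si, 0, sizeof(si));
  si.sz = sizeof(si);
  vpx_codec_peek_stream_info(vpx_codec_vp9_dx(), packet, packetSize, &si);
  printf("keyframe=%d size=%ux%u\n", si.is_kf ? 1 : 0, si.w, si.h);
}

int main()
{
  // A real caller would pass demuxed frame bytes here; omitted for brevity.
  return 0;
}
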
diff --git a/dom/media/platforms/agnostic/VPXDecoder.h b/dom/media/platforms/agnostic/VPXDecoder.h
index d420ec069..4e8d83617 100644
--- a/dom/media/platforms/agnostic/VPXDecoder.h
+++ b/dom/media/platforms/agnostic/VPXDecoder.h
@@ -7,6 +7,7 @@
#define VPXDecoder_h_
#include "PlatformDecoderModule.h"
+#include "mozilla/Span.h"
#include <stdint.h>
#define VPX_DONT_DEFINE_STDINT_TYPES
@@ -36,7 +37,8 @@ public:
enum Codec: uint8_t {
VP8 = 1 << 0,
- VP9 = 1 << 1
+ VP9 = 1 << 1,
+ Unknown = 1 << 7,
};
   // Return true if aMimeType is one of the strings used by our demuxers to
@@ -46,6 +48,12 @@ public:
static bool IsVP8(const nsACString& aMimeType);
static bool IsVP9(const nsACString& aMimeType);
+ // Return true if a sample is a keyframe for the specified codec.
+ static bool IsKeyframe(Span<const uint8_t> aBuffer, Codec aCodec);
+
+ // Return the frame dimensions for a sample for the specified codec.
+ static nsIntSize GetFrameSize(Span<const uint8_t> aBuffer, Codec aCodec);
+
private:
void ProcessDecode(MediaRawData* aSample);
MediaResult DoDecode(MediaRawData* aSample);
@@ -61,7 +69,7 @@ private:
const VideoInfo& mInfo;
- const int mCodec;
+ const Codec mCodec;
};
} // namespace mozilla
diff --git a/dom/media/platforms/ffmpeg/FFmpegDataDecoder.cpp b/dom/media/platforms/ffmpeg/FFmpegDataDecoder.cpp
index 8cb5c8578..8655ce25f 100644
--- a/dom/media/platforms/ffmpeg/FFmpegDataDecoder.cpp
+++ b/dom/media/platforms/ffmpeg/FFmpegDataDecoder.cpp
@@ -69,12 +69,21 @@ FFmpegDataDecoder<LIBAV_VER>::InitDecoder()
mCodecContext->extradata_size = mExtraData->Length();
     // FFmpeg may use SIMD instructions that read the data in 32-byte blocks,
     // so we must ensure there is enough padding after the data to read.
+ uint32_t padding_size =
#if LIBAVCODEC_VERSION_MAJOR >= 58
- mExtraData->AppendElements(AV_INPUT_BUFFER_PADDING_SIZE);
+ AV_INPUT_BUFFER_PADDING_SIZE;
#else
- mExtraData->AppendElements(FF_INPUT_BUFFER_PADDING_SIZE);
+ FF_INPUT_BUFFER_PADDING_SIZE;
#endif
- mCodecContext->extradata = mExtraData->Elements();
+ mCodecContext->extradata = static_cast<uint8_t*>(
+ mLib->av_malloc(mExtraData->Length() + padding_size));
+ if (!mCodecContext->extradata) {
+ return MediaResult(NS_ERROR_OUT_OF_MEMORY,
+ RESULT_DETAIL("Couldn't init ffmpeg extradata"));
+ }
+ memcpy(mCodecContext->extradata,
+ mExtraData->Elements(),
+ mExtraData->Length());
} else {
mCodecContext->extradata_size = 0;
}
@@ -165,6 +174,9 @@ FFmpegDataDecoder<LIBAV_VER>::ProcessShutdown()
StaticMutexAutoLock mon(sMonitor);
if (mCodecContext) {
+ if (mCodecContext->extradata) {
+ mLib->av_freep(&mCodecContext->extradata);
+ }
mLib->avcodec_close(mCodecContext);
mLib->av_freep(&mCodecContext);
#if LIBAVCODEC_VERSION_MAJOR >= 55
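
The FFmpegDataDecoder change above stops pointing the codec context at a Mozilla-owned buffer; instead it av_mallocs a padded copy and releases it with av_freep during shutdown. A hedged sketch of that ownership pattern, assuming a recent FFmpeg that defines AV_INPUT_BUFFER_PADDING_SIZE; SetExtradata/ClearExtradata are illustrative names:

// Hedged sketch of the extradata handling change above: give the codec
// context its own padded copy rather than borrowing the caller's buffer.
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavutil/mem.h>
}
#include <cstring>

static bool SetExtradata(AVCodecContext* ctx, const uint8_t* src, int srcLen)
{
  ctx->extradata_size = srcLen;
  // Decoders may read in 32-byte SIMD blocks, so over-allocate by the
  // padding FFmpeg itself requires.
  ctx->extradata = static_cast<uint8_t*>(
      av_malloc(srcLen + AV_INPUT_BUFFER_PADDING_SIZE));
  if (!ctx->extradata) {
    return false;
  }
  memcpy(ctx->extradata, src, srcLen);
  return true;
}

static void ClearExtradata(AVCodecContext* ctx)
{
  if (ctx->extradata) {
    av_freep(&ctx->extradata);  // frees and nulls the pointer
  }
  ctx->extradata_size = 0;
}

int main()
{
  AVCodecContext* ctx = avcodec_alloc_context3(nullptr);
  const uint8_t blob[4] = { 1, 2, 3, 4 };
  bool ok = ctx && SetExtradata(ctx, blob, sizeof(blob));
  if (ctx) {
    ClearExtradata(ctx);
    avcodec_free_context(&ctx);
  }
  return ok ? 0 : 1;
}
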
diff --git a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp
index 6302882a6..e1c326818 100644
--- a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp
+++ b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.cpp
@@ -7,6 +7,7 @@
#include "MediaPrefs.h"
#include "mozilla/PodOperations.h"
#include "mozilla/Types.h"
+#include "PlatformDecoderModule.h"
#include "prlink.h"
#define AV_LOG_DEBUG 48
diff --git a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h
index d6944a1d8..c6c43a4ae 100644
--- a/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h
+++ b/dom/media/platforms/ffmpeg/FFmpegLibWrapper.h
@@ -5,6 +5,7 @@
#ifndef __FFmpegLibWrapper_h__
#define __FFmpegLibWrapper_h__
+#include "mozilla/Attributes.h"
#include "mozilla/Types.h"
struct AVCodec;
@@ -91,4 +92,4 @@ private:
} // namespace mozilla
-#endif // FFmpegLibWrapper \ No newline at end of file
+#endif // FFmpegLibWrapper
diff --git a/dom/media/platforms/ffmpeg/ffvpx/moz.build b/dom/media/platforms/ffmpeg/ffvpx/moz.build
index aee58b5b0..c0041a4d4 100644
--- a/dom/media/platforms/ffmpeg/ffvpx/moz.build
+++ b/dom/media/platforms/ffmpeg/ffvpx/moz.build
@@ -20,7 +20,7 @@ SOURCES += [
]
LOCAL_INCLUDES += [
'..',
- '../ffmpeg57/include',
+ '../ffmpeg58/include',
]
if CONFIG['OS_ARCH'] == 'WINNT':
diff --git a/dom/media/platforms/moz.build b/dom/media/platforms/moz.build
index 3fb0cc842..be13d31c4 100644
--- a/dom/media/platforms/moz.build
+++ b/dom/media/platforms/moz.build
@@ -55,6 +55,14 @@ if CONFIG['MOZ_FFMPEG']:
'ffmpeg',
]
+if CONFIG['MOZ_AV1']:
+ EXPORTS += [
+ 'agnostic/AOMDecoder.h',
+ ]
+ UNIFIED_SOURCES += [
+ 'agnostic/AOMDecoder.cpp',
+ ]
+
if CONFIG['MOZ_APPLEMEDIA']:
EXPORTS += [
'apple/AppleDecoderModule.h',
diff --git a/dom/media/test/bug580982.webm b/dom/media/test/bug1377278.webm
index 802019f39..802019f39 100644
--- a/dom/media/test/bug580982.webm
+++ b/dom/media/test/bug1377278.webm
Binary files differ
diff --git a/dom/media/test/bug580982.webm^headers^ b/dom/media/test/bug1377278.webm^headers^
index 4030ea1d3..4030ea1d3 100644
--- a/dom/media/test/bug580982.webm^headers^
+++ b/dom/media/test/bug1377278.webm^headers^
diff --git a/dom/media/test/manifest.js b/dom/media/test/manifest.js
index c6d533c1b..7e30cc97d 100644
--- a/dom/media/test/manifest.js
+++ b/dom/media/test/manifest.js
@@ -224,6 +224,9 @@ var gPlayTests = [
// Test playback of a webm file
{ name:"seek-short.webm", type:"video/webm", duration:0.23 },
+ // Test playback of a webm file with 'matroska' doctype
+ { name:"bug1377278.webm", type:"video/webm", duration:4.0 },
+
// Test playback of a WebM file with non-zero start time.
{ name:"split.webm", type:"video/webm", duration:1.967 },
@@ -532,7 +535,6 @@ var gErrorTests = [
{ name:"448636.ogv", type:"video/ogg" },
{ name:"bug504843.ogv", type:"video/ogg" },
{ name:"bug501279.ogg", type:"audio/ogg" },
- { name:"bug580982.webm", type:"video/webm" },
{ name:"bug603918.webm", type:"video/webm" },
{ name:"bug604067.webm", type:"video/webm" },
{ name:"bogus.duh", type:"bogus/duh" }
diff --git a/dom/media/test/mochitest.ini b/dom/media/test/mochitest.ini
index ddabf78b6..742ac1b1c 100644
--- a/dom/media/test/mochitest.ini
+++ b/dom/media/test/mochitest.ini
@@ -382,8 +382,6 @@ support-files =
bug556821.ogv^headers^
bug557094.ogv
bug557094.ogv^headers^
- bug580982.webm
- bug580982.webm^headers^
bug603918.webm
bug603918.webm^headers^
bug604067.webm
@@ -395,6 +393,8 @@ support-files =
bug1301226.wav^headers^
bug1301226-odd.wav
bug1301226-odd.wav^headers^
+ bug1377278.webm
+ bug1377278.webm^headers^
can_play_type_dash.js
can_play_type_ogg.js
can_play_type_wave.js
diff --git a/dom/media/webm/WebMDecoder.cpp b/dom/media/webm/WebMDecoder.cpp
index b41de6d40..9575d6e42 100644
--- a/dom/media/webm/WebMDecoder.cpp
+++ b/dom/media/webm/WebMDecoder.cpp
@@ -5,6 +5,10 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "mozilla/Preferences.h"
+#ifdef MOZ_AV1
+#include "AOMDecoder.h"
+#endif
+#include "MediaPrefs.h"
#include "MediaDecoderStateMachine.h"
#include "WebMDemuxer.h"
#include "WebMDecoder.h"
@@ -65,6 +69,11 @@ WebMDecoder::CanHandleMediaType(const nsACString& aMIMETypeExcludingCodecs,
continue;
}
+#ifdef MOZ_AV1
+ if (MediaPrefs::AV1Enabled() && IsAV1CodecString(codec)) {
+ continue;
+ }
+#endif
// Some unsupported codec.
return false;
}
diff --git a/dom/media/webm/WebMDemuxer.cpp b/dom/media/webm/WebMDemuxer.cpp
index 20ed71581..013da9b2c 100644
--- a/dom/media/webm/WebMDemuxer.cpp
+++ b/dom/media/webm/WebMDemuxer.cpp
@@ -8,7 +8,11 @@
#include "MediaDecoderStateMachine.h"
#include "AbstractMediaDecoder.h"
#include "MediaResource.h"
+#ifdef MOZ_AV1
+#include "AOMDecoder.h"
+#endif
#include "OpusDecoder.h"
+#include "VPXDecoder.h"
#include "WebMDemuxer.h"
#include "WebMBufferedParser.h"
#include "gfx2DGlue.h"
@@ -24,12 +28,9 @@
#include "mozilla/Sprintf.h"
#include <algorithm>
+#include <numeric>
#include <stdint.h>
-#define VPX_DONT_DEFINE_STDINT_TYPES
-#include "vpx/vp8dx.h"
-#include "vpx/vpx_decoder.h"
-
#define WEBM_DEBUG(arg, ...) MOZ_LOG(gMediaDemuxerLog, mozilla::LogLevel::Debug, ("WebMDemuxer(%p)::%s: " arg, this, __func__, ##__VA_ARGS__))
extern mozilla::LazyLogModule gMediaDemuxerLog;
@@ -322,6 +323,9 @@ WebMDemuxer::ReadMetadata()
case NESTEGG_CODEC_VP9:
mInfo.mVideo.mMimeType = "video/webm; codecs=vp9";
break;
+ case NESTEGG_CODEC_AV1:
+ mInfo.mVideo.mMimeType = "video/webm; codecs=av1";
+ break;
default:
NS_WARNING("Unknown WebM video codec");
return NS_ERROR_FAILURE;
@@ -549,7 +553,7 @@ WebMDemuxer::GetTrackCrypto(TrackInfo::TrackType aType, size_t aTrackNumber) {
return crypto;
}
-bool
+nsresult
WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSamples)
{
if (mIsMediaSource) {
@@ -557,17 +561,18 @@ WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSampl
EnsureUpToDateIndex();
}
- RefPtr<NesteggPacketHolder> holder(NextPacket(aType));
+ RefPtr<NesteggPacketHolder> holder;
+ nsresult rv = NextPacket(aType, holder);
- if (!holder) {
- return false;
+ if (NS_FAILED(rv)) {
+ return rv;
}
int r = 0;
unsigned int count = 0;
r = nestegg_packet_count(holder->Packet(), &count);
if (r == -1) {
- return false;
+ return NS_ERROR_DOM_MEDIA_DEMUXER_ERR;
}
int64_t tstamp = holder->Timestamp();
int64_t duration = holder->Duration();
@@ -578,7 +583,11 @@ WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSampl
// video frame.
int64_t next_tstamp = INT64_MIN;
if (aType == TrackInfo::kAudioTrack) {
- RefPtr<NesteggPacketHolder> next_holder(NextPacket(aType));
+ RefPtr<NesteggPacketHolder> next_holder;
+ rv = NextPacket(aType, next_holder);
+ if (NS_FAILED(rv) && rv != NS_ERROR_DOM_MEDIA_END_OF_STREAM) {
+ return rv;
+ }
if (next_holder) {
next_tstamp = next_holder->Timestamp();
PushAudioPacket(next_holder);
@@ -593,7 +602,11 @@ WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSampl
}
mLastAudioFrameTime = Some(tstamp);
} else if (aType == TrackInfo::kVideoTrack) {
- RefPtr<NesteggPacketHolder> next_holder(NextPacket(aType));
+ RefPtr<NesteggPacketHolder> next_holder;
+ rv = NextPacket(aType, next_holder);
+ if (NS_FAILED(rv) && rv != NS_ERROR_DOM_MEDIA_END_OF_STREAM) {
+ return rv;
+ }
if (next_holder) {
next_tstamp = next_holder->Timestamp();
PushVideoPacket(next_holder);
@@ -610,7 +623,7 @@ WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSampl
}
if (mIsMediaSource && next_tstamp == INT64_MIN) {
- return false;
+ return NS_ERROR_DOM_MEDIA_END_OF_STREAM;
}
int64_t discardPadding = 0;
@@ -626,7 +639,7 @@ WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSampl
r = nestegg_packet_data(holder->Packet(), i, &data, &length);
if (r == -1) {
WEBM_DEBUG("nestegg_packet_data failed r=%d", r);
- return false;
+ return NS_ERROR_DOM_MEDIA_DEMUXER_ERR;
}
bool isKeyframe = false;
if (aType == TrackInfo::kAudioTrack) {
@@ -636,29 +649,46 @@ WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSampl
// Packet is encrypted, can't peek, use packet info
isKeyframe = nestegg_packet_has_keyframe(holder->Packet()) == NESTEGG_PACKET_HAS_KEYFRAME_TRUE;
} else {
- vpx_codec_stream_info_t si;
- PodZero(&si);
- si.sz = sizeof(si);
+ auto sample = MakeSpan(data, length);
switch (mVideoCodec) {
case NESTEGG_CODEC_VP8:
- vpx_codec_peek_stream_info(vpx_codec_vp8_dx(), data, length, &si);
+ isKeyframe = VPXDecoder::IsKeyframe(sample, VPXDecoder::Codec::VP8);
break;
case NESTEGG_CODEC_VP9:
- vpx_codec_peek_stream_info(vpx_codec_vp9_dx(), data, length, &si);
+ isKeyframe = VPXDecoder::IsKeyframe(sample, VPXDecoder::Codec::VP9);
+ break;
+#ifdef MOZ_AV1
+ case NESTEGG_CODEC_AV1:
+ isKeyframe = AOMDecoder::IsKeyframe(sample);
break;
+#endif
+ default:
+ NS_WARNING("Cannot detect keyframes in unknown WebM video codec");
+ return NS_ERROR_FAILURE;
}
- isKeyframe = si.is_kf;
if (isKeyframe) {
- // We only look for resolution changes on keyframes for both VP8 and
- // VP9. Other resolution changes are invalid.
- if (mLastSeenFrameWidth.isSome() && mLastSeenFrameHeight.isSome() &&
- (si.w != mLastSeenFrameWidth.value() ||
- si.h != mLastSeenFrameHeight.value())) {
- mInfo.mVideo.mDisplay = nsIntSize(si.w, si.h);
+          // We only look for resolution changes on keyframes; other
+          // resolution changes are invalid.
+ auto dimensions = nsIntSize(0, 0);
+ switch (mVideoCodec) {
+ case NESTEGG_CODEC_VP8:
+ dimensions = VPXDecoder::GetFrameSize(sample, VPXDecoder::Codec::VP8);
+ break;
+ case NESTEGG_CODEC_VP9:
+ dimensions = VPXDecoder::GetFrameSize(sample, VPXDecoder::Codec::VP9);
+ break;
+#ifdef MOZ_AV1
+ case NESTEGG_CODEC_AV1:
+ dimensions = AOMDecoder::GetFrameSize(sample);
+ break;
+#endif
+ }
+ if (mLastSeenFrameSize.isSome()
+ && (dimensions != mLastSeenFrameSize.value())) {
+ mInfo.mVideo.mDisplay = dimensions;
mSharedVideoTrackInfo = new SharedTrackInfo(mInfo.mVideo, ++sStreamSourceID);
}
- mLastSeenFrameWidth = Some(si.w);
- mLastSeenFrameHeight = Some(si.h);
+ mLastSeenFrameSize = Some(dimensions);
}
}
}
@@ -668,7 +698,7 @@ WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSampl
RefPtr<MediaRawData> sample = new MediaRawData(data, length);
if (length && !sample->Data()) {
// OOM.
- return false;
+ return NS_ERROR_OUT_OF_MEMORY;
}
sample->mTimecode = tstamp;
sample->mTime = tstamp;
@@ -721,11 +751,12 @@ WebMDemuxer::GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSampl
}
aSamples->Push(sample);
}
- return true;
+ return NS_OK;
}
-RefPtr<NesteggPacketHolder>
-WebMDemuxer::NextPacket(TrackInfo::TrackType aType)
+nsresult
+WebMDemuxer::NextPacket(TrackInfo::TrackType aType,
+ RefPtr<NesteggPacketHolder>& aPacket)
{
bool isVideo = aType == TrackInfo::kVideoTrack;
@@ -734,56 +765,64 @@ WebMDemuxer::NextPacket(TrackInfo::TrackType aType)
bool hasType = isVideo ? mHasVideo : mHasAudio;
if (!hasType) {
- return nullptr;
+ return NS_ERROR_DOM_MEDIA_DEMUXER_ERR;
}
// The packet queue for the type that we are interested in.
WebMPacketQueue &packets = isVideo ? mVideoPackets : mAudioPackets;
if (packets.GetSize() > 0) {
- return packets.PopFront();
+ aPacket = packets.PopFront();
+ return NS_OK;
}
// Track we are interested in
uint32_t ourTrack = isVideo ? mVideoTrack : mAudioTrack;
do {
- RefPtr<NesteggPacketHolder> holder = DemuxPacket(aType);
+ RefPtr<NesteggPacketHolder> holder;
+ nsresult rv = DemuxPacket(aType, holder);
+ if (NS_FAILED(rv)) {
+ return rv;
+ }
if (!holder) {
- return nullptr;
+ return NS_ERROR_DOM_MEDIA_DEMUXER_ERR;
}
if (ourTrack == holder->Track()) {
- return holder;
+ aPacket = holder;
+ return NS_OK;
}
} while (true);
}
-RefPtr<NesteggPacketHolder>
-WebMDemuxer::DemuxPacket(TrackInfo::TrackType aType)
+nsresult
+WebMDemuxer::DemuxPacket(TrackInfo::TrackType aType,
+ RefPtr<NesteggPacketHolder>& aPacket)
{
nestegg_packet* packet;
int r = nestegg_read_packet(Context(aType), &packet);
if (r == 0) {
nestegg_read_reset(Context(aType));
- return nullptr;
+ return NS_ERROR_DOM_MEDIA_END_OF_STREAM;
} else if (r < 0) {
- return nullptr;
+ return NS_ERROR_DOM_MEDIA_DEMUXER_ERR;
}
unsigned int track = 0;
r = nestegg_packet_track(packet, &track);
if (r == -1) {
- return nullptr;
+ return NS_ERROR_DOM_MEDIA_DEMUXER_ERR;
}
int64_t offset = Resource(aType).Tell();
RefPtr<NesteggPacketHolder> holder = new NesteggPacketHolder();
if (!holder->Init(packet, offset, track, false)) {
- return nullptr;
+ return NS_ERROR_DOM_MEDIA_DEMUXER_ERR;
}
- return holder;
+ aPacket = holder;
+ return NS_OK;
}
void
@@ -943,7 +982,14 @@ WebMTrackDemuxer::Seek(media::TimeUnit aTime)
media::TimeUnit seekTime = aTime;
mSamples.Reset();
mParent->SeekInternal(mType, aTime);
- mParent->GetNextPacket(mType, &mSamples);
+ nsresult rv = mParent->GetNextPacket(mType, &mSamples);
+ if (NS_FAILED(rv)) {
+ if (rv == NS_ERROR_DOM_MEDIA_END_OF_STREAM) {
+ // Ignore the error for now, the next GetSample will be rejected with EOS.
+ return SeekPromise::CreateAndResolve(media::TimeUnit(), __func__);
+ }
+ return SeekPromise::CreateAndReject(rv, __func__);
+ }
mNeedKeyframe = true;
// Check what time we actually seeked to.
@@ -956,15 +1002,18 @@ WebMTrackDemuxer::Seek(media::TimeUnit aTime)
return SeekPromise::CreateAndResolve(seekTime, __func__);
}
-RefPtr<MediaRawData>
-WebMTrackDemuxer::NextSample()
+nsresult
+WebMTrackDemuxer::NextSample(RefPtr<MediaRawData>& aData)
{
- while (mSamples.GetSize() < 1 && mParent->GetNextPacket(mType, &mSamples)) {
+ nsresult rv;
+ while (mSamples.GetSize() < 1 &&
+ NS_SUCCEEDED((rv = mParent->GetNextPacket(mType, &mSamples)))) {
}
if (mSamples.GetSize()) {
- return mSamples.PopFront();
+ aData = mSamples.PopFront();
+ return NS_OK;
}
- return nullptr;
+ return rv;
}
RefPtr<WebMTrackDemuxer::SamplesPromise>
@@ -973,9 +1022,12 @@ WebMTrackDemuxer::GetSamples(int32_t aNumSamples)
RefPtr<SamplesHolder> samples = new SamplesHolder;
MOZ_ASSERT(aNumSamples);
+ nsresult rv = NS_ERROR_DOM_MEDIA_END_OF_STREAM;
+
while (aNumSamples) {
- RefPtr<MediaRawData> sample(NextSample());
- if (!sample) {
+ RefPtr<MediaRawData> sample;
+ rv = NextSample(sample);
+ if (NS_FAILED(rv)) {
break;
}
if (mNeedKeyframe && !sample->mKeyframe) {
@@ -987,7 +1039,7 @@ WebMTrackDemuxer::GetSamples(int32_t aNumSamples)
}
if (samples->mSamples.IsEmpty()) {
- return SamplesPromise::CreateAndReject(NS_ERROR_DOM_MEDIA_END_OF_STREAM, __func__);
+ return SamplesPromise::CreateAndReject(rv, __func__);
} else {
UpdateSamples(samples->mSamples);
return SamplesPromise::CreateAndResolve(samples, __func__);
@@ -1022,7 +1074,8 @@ WebMTrackDemuxer::SetNextKeyFrameTime()
}
// Demux and buffer frames until we find a keyframe.
RefPtr<MediaRawData> sample;
- while (!foundKeyframe && (sample = NextSample())) {
+ nsresult rv = NS_OK;
+ while (!foundKeyframe && NS_SUCCEEDED((rv = NextSample(sample)))) {
if (sample->mKeyframe) {
frameTime = sample->mTime;
foundKeyframe = true;
@@ -1104,10 +1157,11 @@ WebMTrackDemuxer::SkipToNextRandomAccessPoint(media::TimeUnit aTimeThreshold)
uint32_t parsed = 0;
bool found = false;
RefPtr<MediaRawData> sample;
+ nsresult rv = NS_OK;
int64_t sampleTime;
WEBM_DEBUG("TimeThreshold: %f", aTimeThreshold.ToSeconds());
- while (!found && (sample = NextSample())) {
+ while (!found && NS_SUCCEEDED((rv = NextSample(sample)))) {
parsed++;
sampleTime = sample->mTime;
if (sample->mKeyframe && sampleTime >= aTimeThreshold.ToMicroseconds()) {
@@ -1116,7 +1170,9 @@ WebMTrackDemuxer::SkipToNextRandomAccessPoint(media::TimeUnit aTimeThreshold)
mSamples.PushFront(sample.forget());
}
}
- SetNextKeyFrameTime();
+ if (NS_SUCCEEDED(rv)) {
+ SetNextKeyFrameTime();
+ }
if (found) {
WEBM_DEBUG("next sample: %f (parsed: %d)",
media::TimeUnit::FromMicroseconds(sampleTime).ToSeconds(),
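
The WebMDemuxer refactor above replaces bool and nullable-pointer returns with nsresult plus an out-parameter, so end-of-stream (NS_ERROR_DOM_MEDIA_END_OF_STREAM) can be told apart from genuine demuxer errors all the way up to the seek and GetSamples promises. A standalone sketch of the same shape with illustrative names (Status, Packet and PullPacket are not tree APIs):

// Sketch of the error-propagation refactor above: instead of returning a
// nullable pointer (which conflates "end of stream" with "demuxer error"),
// return a status code and hand the packet back through an out-parameter.
#include <cstdio>
#include <memory>
#include <optional>

enum class Status { Ok, EndOfStream, DemuxerError };

struct Packet { int track = 0; };

// Stand-in for nestegg_read_packet(): >0 packet, 0 end of stream, <0 error.
static int ReadLowLevelPacket(std::optional<Packet>& out)
{
  static int calls = 0;
  if (++calls > 3) { return 0; }
  out = Packet{calls % 2};
  return 1;
}

static Status PullPacket(int wantedTrack, std::unique_ptr<Packet>& aPacket)
{
  while (true) {
    std::optional<Packet> p;
    int r = ReadLowLevelPacket(p);
    if (r == 0) { return Status::EndOfStream; }
    if (r < 0 || !p) { return Status::DemuxerError; }
    if (p->track == wantedTrack) {
      aPacket = std::make_unique<Packet>(*p);
      return Status::Ok;  // caller can tell success, EOS and failure apart
    }
  }
}

int main()
{
  std::unique_ptr<Packet> pkt;
  while (PullPacket(/*wantedTrack=*/1, pkt) == Status::Ok) {
    printf("got packet on track %d\n", pkt->track);
  }
  return 0;
}
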
diff --git a/dom/media/webm/WebMDemuxer.h b/dom/media/webm/WebMDemuxer.h
index 6fff38e7d..36d381e57 100644
--- a/dom/media/webm/WebMDemuxer.h
+++ b/dom/media/webm/WebMDemuxer.h
@@ -8,9 +8,13 @@
#include "nsTArray.h"
#include "MediaDataDemuxer.h"
+#include "MediaResource.h"
#include "NesteggPacketHolder.h"
#include "mozilla/Move.h"
+#include <deque>
+#include <stdint.h>
+
typedef struct nestegg nestegg;
namespace mozilla {
@@ -111,7 +115,8 @@ public:
bool GetOffsetForTime(uint64_t aTime, int64_t* aOffset);
// Demux next WebM packet and append samples to MediaRawDataQueue
- bool GetNextPacket(TrackInfo::TrackType aType, MediaRawDataQueue *aSamples);
+ nsresult GetNextPacket(TrackInfo::TrackType aType,
+ MediaRawDataQueue *aSamples);
nsresult Reset(TrackInfo::TrackType aType);
@@ -175,11 +180,13 @@ private:
   // Read a packet from the nestegg file. Fails with an error (e.g.
   // NS_ERROR_DOM_MEDIA_END_OF_STREAM) once all packets for the particular
   // track have been read. Pass TrackInfo::kVideoTrack or
   // TrackInfo::kAudioTrack to indicate the type of the packet we want to read.
- RefPtr<NesteggPacketHolder> NextPacket(TrackInfo::TrackType aType);
+ nsresult NextPacket(TrackInfo::TrackType aType,
+ RefPtr<NesteggPacketHolder>& aPacket);
// Internal method that demuxes the next packet from the stream. The caller
// is responsible for making sure it doesn't get lost.
- RefPtr<NesteggPacketHolder> DemuxPacket(TrackInfo::TrackType aType);
+ nsresult DemuxPacket(TrackInfo::TrackType aType,
+ RefPtr<NesteggPacketHolder>& aPacket);
// libnestegg audio and video context for webm container.
// Access on reader's thread only.
@@ -237,8 +244,7 @@ private:
int64_t mLastWebMBlockOffset;
const bool mIsMediaSource;
- Maybe<uint32_t> mLastSeenFrameWidth;
- Maybe<uint32_t> mLastSeenFrameHeight;
+ Maybe<nsIntSize> mLastSeenFrameSize;
// This will be populated only if a resolution change occurs, otherwise it
// will be left as null so the original metadata is used
RefPtr<SharedTrackInfo> mSharedVideoTrackInfo;
@@ -276,7 +282,7 @@ private:
~WebMTrackDemuxer();
void UpdateSamples(nsTArray<RefPtr<MediaRawData>>& aSamples);
void SetNextKeyFrameTime();
- RefPtr<MediaRawData> NextSample ();
+ nsresult NextSample(RefPtr<MediaRawData>& aData);
RefPtr<WebMDemuxer> mParent;
TrackInfo::TrackType mType;
UniquePtr<TrackInfo> mInfo;
diff --git a/dom/webidl/CommandEvent.webidl b/dom/webidl/CommandEvent.webidl
index 8c16e856c..9856c77c3 100644
--- a/dom/webidl/CommandEvent.webidl
+++ b/dom/webidl/CommandEvent.webidl
@@ -8,7 +8,7 @@ interface CommandEvent : Event {
readonly attribute DOMString? command;
void initCommandEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- DOMString? command);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional DOMString? command = null);
};
diff --git a/dom/webidl/CompositionEvent.webidl b/dom/webidl/CompositionEvent.webidl
index c293683ce..e4a54d678 100644
--- a/dom/webidl/CompositionEvent.webidl
+++ b/dom/webidl/CompositionEvent.webidl
@@ -25,9 +25,9 @@ interface CompositionEvent : UIEvent
partial interface CompositionEvent
{
void initCompositionEvent(DOMString typeArg,
- boolean canBubbleArg,
- boolean cancelableArg,
- Window? viewArg,
- DOMString? dataArg,
- DOMString localeArg);
+ optional boolean canBubbleArg = false,
+ optional boolean cancelableArg = false,
+ optional Window? viewArg = null,
+ optional DOMString? dataArg = null,
+ optional DOMString localeArg = "");
};
diff --git a/dom/webidl/CustomEvent.webidl b/dom/webidl/CustomEvent.webidl
index 299a41ec0..1ea5572b7 100644
--- a/dom/webidl/CustomEvent.webidl
+++ b/dom/webidl/CustomEvent.webidl
@@ -19,9 +19,9 @@ interface CustomEvent : Event
// initCustomEvent is a Gecko specific deprecated method.
[Throws]
void initCustomEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- any detail);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional any detail = null);
};
dictionary CustomEventInit : EventInit
diff --git a/dom/webidl/DeviceMotionEvent.webidl b/dom/webidl/DeviceMotionEvent.webidl
index fa4ecf3ca..c26ab080c 100644
--- a/dom/webidl/DeviceMotionEvent.webidl
+++ b/dom/webidl/DeviceMotionEvent.webidl
@@ -48,10 +48,10 @@ dictionary DeviceMotionEventInit : EventInit {
// Mozilla extensions.
partial interface DeviceMotionEvent {
void initDeviceMotionEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- DeviceAccelerationInit acceleration,
- DeviceAccelerationInit accelerationIncludingGravity,
- DeviceRotationRateInit rotationRate,
- double? interval);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional DeviceAccelerationInit acceleration,
+ optional DeviceAccelerationInit accelerationIncludingGravity,
+ optional DeviceRotationRateInit rotationRate,
+ optional double? interval = null);
};
diff --git a/dom/webidl/DeviceOrientationEvent.webidl b/dom/webidl/DeviceOrientationEvent.webidl
index 46194453e..9802b3681 100644
--- a/dom/webidl/DeviceOrientationEvent.webidl
+++ b/dom/webidl/DeviceOrientationEvent.webidl
@@ -14,12 +14,12 @@ interface DeviceOrientationEvent : Event
// initDeviceOrientationEvent is a Gecko specific deprecated method.
void initDeviceOrientationEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- double? alpha,
- double? beta,
- double? gamma,
- boolean absolute);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional double? alpha = null,
+ optional double? beta = null,
+ optional double? gamma = null,
+ optional boolean absolute = false);
};
dictionary DeviceOrientationEventInit : EventInit
diff --git a/dom/webidl/DragEvent.webidl b/dom/webidl/DragEvent.webidl
index 2cc173d5c..806177790 100644
--- a/dom/webidl/DragEvent.webidl
+++ b/dom/webidl/DragEvent.webidl
@@ -10,21 +10,21 @@ interface DragEvent : MouseEvent
readonly attribute DataTransfer? dataTransfer;
void initDragEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- Window? aView,
- long aDetail,
- long aScreenX,
- long aScreenY,
- long aClientX,
- long aClientY,
- boolean aCtrlKey,
- boolean aAltKey,
- boolean aShiftKey,
- boolean aMetaKey,
- unsigned short aButton,
- EventTarget? aRelatedTarget,
- DataTransfer? aDataTransfer);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional Window? aView = null,
+ optional long aDetail = 0,
+ optional long aScreenX = 0,
+ optional long aScreenY = 0,
+ optional long aClientX = 0,
+ optional long aClientY = 0,
+ optional boolean aCtrlKey = false,
+ optional boolean aAltKey = false,
+ optional boolean aShiftKey = false,
+ optional boolean aMetaKey = false,
+ optional unsigned short aButton = 0,
+ optional EventTarget? aRelatedTarget = null,
+ optional DataTransfer? aDataTransfer = null);
};
dictionary DragEventInit : MouseEventInit
diff --git a/dom/webidl/Event.webidl b/dom/webidl/Event.webidl
index 70a0ef513..a5d7da7d4 100644
--- a/dom/webidl/Event.webidl
+++ b/dom/webidl/Event.webidl
@@ -51,7 +51,9 @@ interface Event {
[Pure]
readonly attribute DOMHighResTimeStamp timeStamp;
- void initEvent(DOMString type, boolean bubbles, boolean cancelable);
+ void initEvent(DOMString type,
+ optional boolean bubbles = false,
+ optional boolean cancelable = false);
attribute boolean cancelBubble;
};
diff --git a/dom/webidl/HashChangeEvent.webidl b/dom/webidl/HashChangeEvent.webidl
index 735e8eb28..6e8be455c 100644
--- a/dom/webidl/HashChangeEvent.webidl
+++ b/dom/webidl/HashChangeEvent.webidl
@@ -11,10 +11,10 @@ interface HashChangeEvent : Event
readonly attribute DOMString newURL;
void initHashChangeEvent(DOMString typeArg,
- boolean canBubbleArg,
- boolean cancelableArg,
- DOMString oldURLArg,
- DOMString newURLArg);
+ optional boolean canBubbleArg = false,
+ optional boolean cancelableArg = false,
+ optional DOMString oldURLArg = "",
+ optional DOMString newURLArg = "");
};
dictionary HashChangeEventInit : EventInit
diff --git a/dom/webidl/KeyEvent.webidl b/dom/webidl/KeyEvent.webidl
index 516632854..abb4b6a34 100644
--- a/dom/webidl/KeyEvent.webidl
+++ b/dom/webidl/KeyEvent.webidl
@@ -225,13 +225,13 @@ interface KeyEvent
const unsigned long DOM_VK_WIN_OEM_CLEAR = 0xFE;
void initKeyEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- Window? view,
- boolean ctrlKey,
- boolean altKey,
- boolean shiftKey,
- boolean metaKey,
- unsigned long keyCode,
- unsigned long charCode);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional Window? view = null,
+ optional boolean ctrlKey = false,
+ optional boolean altKey = false,
+ optional boolean shiftKey = false,
+ optional boolean metaKey = false,
+ optional unsigned long keyCode = 0,
+ optional unsigned long charCode = 0);
};
diff --git a/dom/webidl/MessageEvent.webidl b/dom/webidl/MessageEvent.webidl
index 548f14520..be5022d67 100644
--- a/dom/webidl/MessageEvent.webidl
+++ b/dom/webidl/MessageEvent.webidl
@@ -43,10 +43,14 @@ interface MessageEvent : Event {
[Pure, Cached, Frozen]
readonly attribute sequence<MessagePort> ports;
- void initMessageEvent(DOMString type, boolean bubbles, boolean cancelable,
- any data, DOMString origin, DOMString lastEventId,
- (WindowProxy or MessagePort)? source,
- sequence<MessagePort> ports);
+ void initMessageEvent(DOMString type,
+ optional boolean bubbles = false,
+ optional boolean cancelable = false,
+ optional any data = null,
+ optional DOMString origin = "",
+ optional DOMString lastEventId = "",
+ optional (WindowProxy or MessagePort)? source = null,
+ optional sequence<MessagePort> ports = []);
};
dictionary MessageEventInit : EventInit {
diff --git a/dom/webidl/MouseEvent.webidl b/dom/webidl/MouseEvent.webidl
index d21354801..192519d57 100644
--- a/dom/webidl/MouseEvent.webidl
+++ b/dom/webidl/MouseEvent.webidl
@@ -32,21 +32,21 @@ interface MouseEvent : UIEvent {
readonly attribute long movementY;
// Deprecated in DOM Level 3:
- void initMouseEvent(DOMString typeArg,
- boolean canBubbleArg,
- boolean cancelableArg,
- Window? viewArg,
- long detailArg,
- long screenXArg,
- long screenYArg,
- long clientXArg,
- long clientYArg,
- boolean ctrlKeyArg,
- boolean altKeyArg,
- boolean shiftKeyArg,
- boolean metaKeyArg,
- short buttonArg,
- EventTarget? relatedTargetArg);
+  void initMouseEvent(DOMString typeArg,
+ optional boolean canBubbleArg = false,
+ optional boolean cancelableArg = false,
+ optional Window? viewArg = null,
+ optional long detailArg = 0,
+ optional long screenXArg = 0,
+ optional long screenYArg = 0,
+ optional long clientXArg = 0,
+ optional long clientYArg = 0,
+ optional boolean ctrlKeyArg = false,
+ optional boolean altKeyArg = false,
+ optional boolean shiftKeyArg = false,
+ optional boolean metaKeyArg = false,
+ optional short buttonArg = 0,
+ optional EventTarget? relatedTargetArg = null);
// Introduced in DOM Level 3:
boolean getModifierState(DOMString keyArg);
};
@@ -90,23 +90,23 @@ partial interface MouseEvent
readonly attribute unsigned short mozInputSource;
- void initNSMouseEvent(DOMString typeArg,
- boolean canBubbleArg,
- boolean cancelableArg,
- Window? viewArg,
- long detailArg,
- long screenXArg,
- long screenYArg,
- long clientXArg,
- long clientYArg,
- boolean ctrlKeyArg,
- boolean altKeyArg,
- boolean shiftKeyArg,
- boolean metaKeyArg,
- short buttonArg,
- EventTarget? relatedTargetArg,
- float pressure,
- unsigned short inputSourceArg);
+ void initNSMouseEvent(DOMString typeArg,
+ optional boolean canBubbleArg = false,
+ optional boolean cancelableArg = false,
+ optional Window? viewArg = null,
+ optional long detailArg = 0,
+ optional long screenXArg = 0,
+ optional long screenYArg = 0,
+ optional long clientXArg = 0,
+ optional long clientYArg = 0,
+ optional boolean ctrlKeyArg = false,
+ optional boolean altKeyArg = false,
+ optional boolean shiftKeyArg = false,
+ optional boolean metaKeyArg = false,
+ optional short buttonArg = 0,
+ optional EventTarget? relatedTargetArg = null,
+ optional float pressure = 0,
+ optional unsigned short inputSourceArg = 0);
[ChromeOnly]
readonly attribute boolean hitCluster; // True when touch occurs in a cluster of links
diff --git a/dom/webidl/MouseScrollEvent.webidl b/dom/webidl/MouseScrollEvent.webidl
index aa9e30fd2..c1e52bd8c 100644
--- a/dom/webidl/MouseScrollEvent.webidl
+++ b/dom/webidl/MouseScrollEvent.webidl
@@ -12,19 +12,19 @@ interface MouseScrollEvent : MouseEvent
readonly attribute long axis;
void initMouseScrollEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- Window? view,
- long detail,
- long screenX,
- long screenY,
- long clientX,
- long clientY,
- boolean ctrlKey,
- boolean altKey,
- boolean shiftKey,
- boolean metaKey,
- unsigned short button,
- EventTarget? relatedTarget,
- long axis);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional Window? view = null,
+ optional long detail = 0,
+ optional long screenX = 0,
+ optional long screenY = 0,
+ optional long clientX = 0,
+ optional long clientY = 0,
+ optional boolean ctrlKey = false,
+ optional boolean altKey = false,
+ optional boolean shiftKey = false,
+ optional boolean metaKey = false,
+ optional short button = 0,
+ optional EventTarget? relatedTarget = null,
+ optional long axis = 0);
};
diff --git a/dom/webidl/MutationEvent.webidl b/dom/webidl/MutationEvent.webidl
index 43c7b1cd0..53625b4f9 100644
--- a/dom/webidl/MutationEvent.webidl
+++ b/dom/webidl/MutationEvent.webidl
@@ -23,11 +23,11 @@ interface MutationEvent : Event
[Throws]
void initMutationEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- Node? relatedNode,
- DOMString prevValue,
- DOMString newValue,
- DOMString attrName,
- unsigned short attrChange);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional Node? relatedNode = null,
+ optional DOMString prevValue = "",
+ optional DOMString newValue = "",
+ optional DOMString attrName = "",
+ optional unsigned short attrChange = 0);
};
diff --git a/dom/webidl/ScrollAreaEvent.webidl b/dom/webidl/ScrollAreaEvent.webidl
index 0f48b4bc8..f24b7c0ad 100644
--- a/dom/webidl/ScrollAreaEvent.webidl
+++ b/dom/webidl/ScrollAreaEvent.webidl
@@ -12,12 +12,12 @@ interface ScrollAreaEvent : UIEvent
readonly attribute float height;
void initScrollAreaEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- Window? view,
- long detail,
- float x,
- float y,
- float width,
- float height);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional Window? view = null,
+ optional long detail = 0,
+ optional float x = 0,
+ optional float y = 0,
+ optional float width = 0,
+ optional float height = 0);
};
diff --git a/dom/webidl/SimpleGestureEvent.webidl b/dom/webidl/SimpleGestureEvent.webidl
index 0829076dd..76d0d20f6 100644
--- a/dom/webidl/SimpleGestureEvent.webidl
+++ b/dom/webidl/SimpleGestureEvent.webidl
@@ -25,22 +25,22 @@ interface SimpleGestureEvent : MouseEvent
readonly attribute unsigned long clickCount;
void initSimpleGestureEvent(DOMString typeArg,
- boolean canBubbleArg,
- boolean cancelableArg,
- Window? viewArg,
- long detailArg,
- long screenXArg,
- long screenYArg,
- long clientXArg,
- long clientYArg,
- boolean ctrlKeyArg,
- boolean altKeyArg,
- boolean shiftKeyArg,
- boolean metaKeyArg,
- unsigned short buttonArg,
- EventTarget? relatedTargetArg,
- unsigned long allowedDirectionsArg,
- unsigned long directionArg,
- double deltaArg,
- unsigned long clickCount);
+ optional boolean canBubbleArg = false,
+ optional boolean cancelableArg = false,
+ optional Window? viewArg = null,
+ optional long detailArg = 0,
+ optional long screenXArg = 0,
+ optional long screenYArg = 0,
+ optional long clientXArg = 0,
+ optional long clientYArg = 0,
+ optional boolean ctrlKeyArg = false,
+ optional boolean altKeyArg = false,
+ optional boolean shiftKeyArg = false,
+ optional boolean metaKeyArg = false,
+ optional short buttonArg = 0,
+ optional EventTarget? relatedTargetArg = null,
+ optional unsigned long allowedDirectionsArg = 0,
+ optional unsigned long directionArg = 0,
+ optional double deltaArg = 0,
+ optional unsigned long clickCount = 0);
};
diff --git a/dom/webidl/StorageEvent.webidl b/dom/webidl/StorageEvent.webidl
index c3e9605eb..e03f8232c 100644
--- a/dom/webidl/StorageEvent.webidl
+++ b/dom/webidl/StorageEvent.webidl
@@ -21,13 +21,13 @@ interface StorageEvent : Event
// Bug 1016053 - This is not spec compliant.
void initStorageEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- DOMString? key,
- DOMString? oldValue,
- DOMString? newValue,
- DOMString? url,
- Storage? storageArea);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional DOMString? key = null,
+ optional DOMString? oldValue = null,
+ optional DOMString? newValue = null,
+ optional DOMString? url = null,
+ optional Storage? storageArea = null);
};
dictionary StorageEventInit : EventInit
diff --git a/dom/webidl/TimeEvent.webidl b/dom/webidl/TimeEvent.webidl
index 40e7a0beb..8bbe4db53 100644
--- a/dom/webidl/TimeEvent.webidl
+++ b/dom/webidl/TimeEvent.webidl
@@ -15,6 +15,6 @@ interface TimeEvent : Event
readonly attribute long detail;
readonly attribute WindowProxy? view;
void initTimeEvent(DOMString aType,
- Window? aView,
- long aDetail);
+ optional Window? aView = null,
+ optional long aDetail = 0);
};
diff --git a/dom/webidl/TouchEvent.webidl b/dom/webidl/TouchEvent.webidl
index d206fe0fb..fd677787a 100644
--- a/dom/webidl/TouchEvent.webidl
+++ b/dom/webidl/TouchEvent.webidl
@@ -23,15 +23,15 @@ interface TouchEvent : UIEvent {
readonly attribute boolean shiftKey;
void initTouchEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- Window? view,
- long detail,
- boolean ctrlKey,
- boolean altKey,
- boolean shiftKey,
- boolean metaKey,
- TouchList? touches,
- TouchList? targetTouches,
- TouchList? changedTouches);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional Window? view = null,
+ optional long detail = 0,
+ optional boolean ctrlKey = false,
+ optional boolean altKey = false,
+ optional boolean shiftKey = false,
+ optional boolean metaKey = false,
+ optional TouchList? touches = null,
+ optional TouchList? targetTouches = null,
+ optional TouchList? changedTouches = null);
};
diff --git a/dom/webidl/UIEvent.webidl b/dom/webidl/UIEvent.webidl
index 9cc1d0cdf..5be6a443a 100644
--- a/dom/webidl/UIEvent.webidl
+++ b/dom/webidl/UIEvent.webidl
@@ -16,10 +16,10 @@ interface UIEvent : Event
readonly attribute WindowProxy? view;
readonly attribute long detail;
void initUIEvent(DOMString aType,
- boolean aCanBubble,
- boolean aCancelable,
- Window? aView,
- long aDetail);
+ optional boolean aCanBubble = false,
+ optional boolean aCancelable = false,
+ optional Window? aView = null,
+ optional long aDetail = 0);
};
// Additional DOM0 properties.
diff --git a/dom/webidl/XULCommandEvent.webidl b/dom/webidl/XULCommandEvent.webidl
index 9c024edc1..72dc3802e 100644
--- a/dom/webidl/XULCommandEvent.webidl
+++ b/dom/webidl/XULCommandEvent.webidl
@@ -15,13 +15,13 @@ interface XULCommandEvent : UIEvent
readonly attribute Event? sourceEvent;
void initCommandEvent(DOMString type,
- boolean canBubble,
- boolean cancelable,
- Window? view,
- long detail,
- boolean ctrlKey,
- boolean altKey,
- boolean shiftKey,
- boolean metaKey,
- Event? sourceEvent);
+ optional boolean canBubble = false,
+ optional boolean cancelable = false,
+ optional Window? view = null,
+ optional long detail = 0,
+ optional boolean ctrlKey = false,
+ optional boolean altKey = false,
+ optional boolean shiftKey = false,
+ optional boolean metaKey = false,
+ optional Event? sourceEvent = null);
};
diff --git a/dom/xbl/nsXBLBinding.cpp b/dom/xbl/nsXBLBinding.cpp
index d9a2aacc5..b8174f6c2 100644
--- a/dom/xbl/nsXBLBinding.cpp
+++ b/dom/xbl/nsXBLBinding.cpp
@@ -1049,7 +1049,8 @@ nsXBLBinding::DoInitJSClass(JSContext *cx,
// to create and define it.
JS::Rooted<JSObject*> proto(cx);
JS::Rooted<JS::PropertyDescriptor> desc(cx);
- if (!JS_GetOwnUCPropertyDescriptor(cx, holder, aClassName.get(), &desc)) {
+ if (!JS_GetOwnUCPropertyDescriptor(cx, holder, aClassName.get(),
+ aClassName.Length(), &desc)) {
return NS_ERROR_OUT_OF_MEMORY;
}
*aNew = !desc.object();
diff --git a/dom/xbl/nsXBLProtoImpl.cpp b/dom/xbl/nsXBLProtoImpl.cpp
index 4db9cabf0..5efcb71e0 100644
--- a/dom/xbl/nsXBLProtoImpl.cpp
+++ b/dom/xbl/nsXBLProtoImpl.cpp
@@ -100,11 +100,15 @@ nsXBLProtoImpl::InstallImplementation(nsXBLPrototypeBinding* aPrototypeBinding,
// end up with a different content prototype, but we'll already have a property
// holder called |foo| in the XBL scope. Check for that to avoid wasteful and
// weird property holder duplication.
- const char16_t* className = aPrototypeBinding->ClassName().get();
+ const nsString& className = aPrototypeBinding->ClassName();
+ const char16_t* classNameChars = className.get();
+ const size_t classNameLen = className.Length();
+
JS::Rooted<JSObject*> propertyHolder(cx);
JS::Rooted<JS::PropertyDescriptor> existingHolder(cx);
if (scopeObject != globalObject &&
- !JS_GetOwnUCPropertyDescriptor(cx, scopeObject, className, &existingHolder)) {
+ !JS_GetOwnUCPropertyDescriptor(cx, scopeObject, classNameChars,
+ classNameLen, &existingHolder)) {
return NS_ERROR_FAILURE;
}
bool propertyHolderIsNew = !existingHolder.object() || !existingHolder.value().isObject();
@@ -119,8 +123,8 @@ nsXBLProtoImpl::InstallImplementation(nsXBLPrototypeBinding* aPrototypeBinding,
// Define it as a property on the scopeObject, using the same name used on
// the content side.
- bool ok = JS_DefineUCProperty(cx, scopeObject, className, -1, propertyHolder,
- JSPROP_PERMANENT | JSPROP_READONLY,
+ bool ok = JS_DefineUCProperty(cx, scopeObject, classNameChars, classNameLen,
+ propertyHolder, JSPROP_PERMANENT | JSPROP_READONLY,
JS_STUBGETTER, JS_STUBSETTER);
NS_ENSURE_TRUE(ok, NS_ERROR_UNEXPECTED);
} else {
diff --git a/extensions/cookie/nsPermissionManager.cpp b/extensions/cookie/nsPermissionManager.cpp
index 13ae7fd35..2c2e2d79c 100644
--- a/extensions/cookie/nsPermissionManager.cpp
+++ b/extensions/cookie/nsPermissionManager.cpp
@@ -1076,8 +1076,6 @@ nsPermissionManager::InitDB(bool aRemoveFile)
bool hostsTableExists = false;
mDBConn->TableExists(NS_LITERAL_CSTRING("moz_hosts"), &hostsTableExists);
if (hostsTableExists) {
- bool migrationError = false;
-
// Both versions 4 and 6 have a version 4 formatted hosts table named
// moz_hosts. We can migrate this table to our version 7 table moz_perms.
// If moz_perms is present, then we can use it as a basis for comparison.
@@ -1125,12 +1123,10 @@ nsPermissionManager::InitDB(bool aRemoveFile)
// Read in the old row
rv = stmt->GetUTF8String(0, host);
if (NS_WARN_IF(NS_FAILED(rv))) {
- migrationError = true;
continue;
}
rv = stmt->GetUTF8String(1, type);
if (NS_WARN_IF(NS_FAILED(rv))) {
- migrationError = true;
continue;
}
permission = stmt->AsInt32(2);
@@ -1138,7 +1134,6 @@ nsPermissionManager::InitDB(bool aRemoveFile)
expireTime = stmt->AsInt64(4);
modificationTime = stmt->AsInt64(5);
if (NS_WARN_IF(stmt->AsInt64(6) < 0)) {
- migrationError = true;
continue;
}
appId = static_cast<uint32_t>(stmt->AsInt64(6));
@@ -1155,7 +1150,6 @@ nsPermissionManager::InitDB(bool aRemoveFile)
if (NS_FAILED(rv)) {
NS_WARNING("Unexpected failure when upgrading migrating permission "
"from host to origin");
- migrationError = true;
}
}
diff --git a/extensions/spellcheck/src/mozEnglishWordUtils.cpp b/extensions/spellcheck/src/mozEnglishWordUtils.cpp
index 5033b247b..671b22c16 100644
--- a/extensions/spellcheck/src/mozEnglishWordUtils.cpp
+++ b/extensions/spellcheck/src/mozEnglishWordUtils.cpp
@@ -165,7 +165,7 @@ NS_IMETHODIMP mozEnglishWordUtils::FindNextWord(const char16_t *word, uint32_t l
// before we spend more time looking to see if the word is a url, look for a url identifier
// and make sure that identifier isn't the last character in the word fragment.
- if ( (*p == ':' || *p == '@' || *p == '.') && p < endbuf - 1) {
+ if ( (p < endbuf - 1) && (*p == ':' || *p == '@' || *p == '.') ) {
// ok, we have a possible url...do more research to find out if we really have one
// and determine the length of the url so we can skip over it.
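
The reordering above matters because && short-circuits: with the bounds test first, *p is never read once p has run to the end of the fragment. A minimal standalone sketch of the same guard pattern (hypothetical helper name, not the Gecko function):

    // The bounds test runs first, so the dereference on the right is only
    // evaluated while p still points inside the word fragment.
    static bool IsUrlMarker(const char16_t* p, const char16_t* endbuf) {
        return (p < endbuf - 1) && (*p == ':' || *p == '@' || *p == '.');
    }
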
diff --git a/gfx/layers/ipc/CompositorBridgeParent.cpp b/gfx/layers/ipc/CompositorBridgeParent.cpp
index e31c6ceb6..00602fab5 100644
--- a/gfx/layers/ipc/CompositorBridgeParent.cpp
+++ b/gfx/layers/ipc/CompositorBridgeParent.cpp
@@ -482,8 +482,6 @@ CompositorVsyncScheduler::Composite(TimeStamp aVsyncTimestamp)
mLastCompose = aVsyncTimestamp;
ComposeToTarget(nullptr);
mVsyncNotificationsSkipped = 0;
-
- TimeDuration compositeFrameTotal = TimeStamp::Now() - aVsyncTimestamp;
} else if (mVsyncNotificationsSkipped++ > gfxPrefs::CompositorUnobserveCount()) {
UnobserveVsync();
}
diff --git a/gfx/skia/skia/src/core/SkGeometry.cpp b/gfx/skia/skia/src/core/SkGeometry.cpp
index 58b45140e..9af6eb774 100644
--- a/gfx/skia/skia/src/core/SkGeometry.cpp
+++ b/gfx/skia/skia/src/core/SkGeometry.cpp
@@ -62,6 +62,15 @@ static int valid_unit_divide(SkScalar numer, SkScalar denom, SkScalar* ratio) {
return 1;
}
+// Just returns its argument, but makes it easy to set a break-point to know when
+// SkFindUnitQuadRoots is going to return 0 (an error).
+static int return_check_zero(int value) {
+ if (value == 0) {
+ return 0;
+ }
+ return value;
+}
+
/** From Numerical Recipes in C.
Q = -1/2 (B + sign(B) sqrt[B*B - 4*A*C])
@@ -72,22 +81,21 @@ int SkFindUnitQuadRoots(SkScalar A, SkScalar B, SkScalar C, SkScalar roots[2]) {
SkASSERT(roots);
if (A == 0) {
- return valid_unit_divide(-C, B, roots);
+ return return_check_zero(valid_unit_divide(-C, B, roots));
}
SkScalar* r = roots;
- SkScalar R = B*B - 4*A*C;
- if (R < 0 || !SkScalarIsFinite(R)) { // complex roots
- // if R is infinite, it's possible that it may still produce
- // useful results if the operation was repeated in doubles
- // the flipside is determining if the more precise answer
- // isn't useful because surrounding machinery (e.g., subtracting
- // the axis offset from C) already discards the extra precision
- // more investigation and unit tests required...
- return 0;
+ // use doubles so we don't overflow temporarily trying to compute R
+ double dr = (double)B * B - 4 * (double)A * C;
+ if (dr < 0) {
+ return return_check_zero(0);
+ }
+ dr = sqrt(dr);
+ SkScalar R = SkDoubleToScalar(dr);
+ if (!SkScalarIsFinite(R)) {
+ return return_check_zero(0);
}
- R = SkScalarSqrt(R);
SkScalar Q = (B < 0) ? -(B-R)/2 : -(B+R)/2;
r += valid_unit_divide(Q, A, r);
@@ -98,7 +106,7 @@ int SkFindUnitQuadRoots(SkScalar A, SkScalar B, SkScalar C, SkScalar roots[2]) {
else if (roots[0] == roots[1]) // nearly-equal?
r -= 1; // skip the double root
}
- return (int)(r - roots);
+ return return_check_zero((int)(r - roots));
}
///////////////////////////////////////////////////////////////////////////////
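
The key change here is computing the discriminant B*B - 4*A*C in double before converting back to SkScalar, so a temporarily huge intermediate no longer overflows float. A small self-contained illustration of the failure mode (not Skia code):

    #include <cstdio>

    int main() {
        float A = 1e20f, B = 3e20f, C = 1e20f;
        float  rf = B * B - 4 * A * C;                    // both terms overflow: inf - inf is NaN
        double rd = (double)B * B - 4 * (double)A * C;    // stays finite: 5e40
        std::printf("float: %g  double: %g\n", rf, rd);   // typically prints nan and 5e+40
        return 0;
    }
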
diff --git a/gfx/skia/skia/src/core/SkPath.cpp b/gfx/skia/skia/src/core/SkPath.cpp
index a2ef54620..8f93c9c18 100644
--- a/gfx/skia/skia/src/core/SkPath.cpp
+++ b/gfx/skia/skia/src/core/SkPath.cpp
@@ -14,6 +14,7 @@
#include "SkPathPriv.h"
#include "SkPathRef.h"
#include "SkRRect.h"
+#include "SkTLazy.h"
////////////////////////////////////////////////////////////////////////////
@@ -1491,10 +1492,17 @@ void SkPath::addPath(const SkPath& path, SkScalar dx, SkScalar dy, AddPathMode m
this->addPath(path, matrix, mode);
}
-void SkPath::addPath(const SkPath& path, const SkMatrix& matrix, AddPathMode mode) {
- SkPathRef::Editor(&fPathRef, path.countVerbs(), path.countPoints());
+void SkPath::addPath(const SkPath& srcPath, const SkMatrix& matrix, AddPathMode mode) {
+ // Detect if we're trying to add ourself
+ const SkPath* src = &srcPath;
+ SkTLazy<SkPath> tmp;
+ if (this == src) {
+ src = tmp.set(srcPath);
+ }
+
+ SkPathRef::Editor(&fPathRef, src->countVerbs(), src->countPoints());
- RawIter iter(path);
+ RawIter iter(*src);
SkPoint pts[4];
Verb verb;
@@ -1602,14 +1610,21 @@ void SkPath::reversePathTo(const SkPath& path) {
}
}
-void SkPath::reverseAddPath(const SkPath& src) {
- SkPathRef::Editor ed(&fPathRef, src.fPathRef->countPoints(), src.fPathRef->countVerbs());
+void SkPath::reverseAddPath(const SkPath& srcPath) {
+ // Detect if we're trying to add ourself
+ const SkPath* src = &srcPath;
+ SkTLazy<SkPath> tmp;
+ if (this == src) {
+ src = tmp.set(srcPath);
+ }
+
+ SkPathRef::Editor ed(&fPathRef, src->fPathRef->countPoints(), src->fPathRef->countVerbs());
- const SkPoint* pts = src.fPathRef->pointsEnd();
+ const SkPoint* pts = src->fPathRef->pointsEnd();
// we will iterate through src's verbs backwards
- const uint8_t* verbs = src.fPathRef->verbsMemBegin(); // points at the last verb
- const uint8_t* verbsEnd = src.fPathRef->verbs(); // points just past the first verb
- const SkScalar* conicWeights = src.fPathRef->conicWeightsEnd();
+ const uint8_t* verbs = src->fPathRef->verbsMemBegin(); // points at the last verb
+ const uint8_t* verbsEnd = src->fPathRef->verbs(); // points just past the first verb
+ const SkScalar* conicWeights = src->fPathRef->conicWeightsEnd();
bool needMove = true;
bool needClose = false;
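
Both addPath and reverseAddPath now copy the source into an SkTLazy<SkPath> when a path is appended to itself; otherwise the SkPathRef::Editor could reallocate the very storage being iterated. The same aliasing guard in generic form (illustrative only, hypothetical helper):

    #include <vector>

    // Appending a container to itself while reading it can invalidate the
    // source iterators, so detach a copy first when source and destination alias.
    template <typename T>
    void AppendAll(std::vector<T>& dst, const std::vector<T>& src) {
        if (&dst == &src) {
            std::vector<T> copy(src);
            dst.insert(dst.end(), copy.begin(), copy.end());
            return;
        }
        dst.insert(dst.end(), src.begin(), src.end());
    }
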
diff --git a/gfx/skia/skia/src/core/SkRRect.cpp b/gfx/skia/skia/src/core/SkRRect.cpp
index 1188989cd..f4308debe 100644
--- a/gfx/skia/skia/src/core/SkRRect.cpp
+++ b/gfx/skia/skia/src/core/SkRRect.cpp
@@ -161,6 +161,19 @@ void SkRRect::setRectRadii(const SkRect& rect, const SkVector radii[4]) {
this->scaleRadii();
}
+// If we can't distinguish one of the radii relative to the other, force it to zero so it
+// doesn't confuse us later. See crbug.com/850350
+//
+static void flush_to_zero(SkScalar& a, SkScalar& b) {
+ SkASSERT(a >= 0);
+ SkASSERT(b >= 0);
+ if (a + b == a) {
+ b = 0;
+ } else if (a + b == b) {
+ a = 0;
+ }
+}
+
void SkRRect::scaleRadii() {
// Proportionally scale down all radii to fit. Find the minimum ratio
@@ -183,6 +196,11 @@ void SkRRect::scaleRadii() {
scale = compute_min_scale(fRadii[2].fX, fRadii[3].fX, width, scale);
scale = compute_min_scale(fRadii[3].fY, fRadii[0].fY, height, scale);
+ flush_to_zero(fRadii[0].fX, fRadii[1].fX);
+ flush_to_zero(fRadii[1].fY, fRadii[2].fY);
+ flush_to_zero(fRadii[2].fX, fRadii[3].fX);
+ flush_to_zero(fRadii[3].fY, fRadii[0].fY);
+
if (scale < 1.0) {
SkScaleToSides::AdjustRadii(width, scale, &fRadii[0].fX, &fRadii[1].fX);
SkScaleToSides::AdjustRadii(height, scale, &fRadii[1].fY, &fRadii[2].fY);
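
flush_to_zero relies on floating-point rounding: if a + b rounds back to a, then b is below a's precision and can safely be treated as zero before the radii are scaled. A tiny standalone check of that property (not Skia code):

    #include <cstdio>

    static void flush_to_zero(float& a, float& b) {
        if (a + b == a) {
            b = 0;              // b is lost in a's rounding anyway
        } else if (a + b == b) {
            a = 0;
        }
    }

    int main() {
        float big = 1.0e8f, tiny = 1.0e-8f;
        flush_to_zero(big, tiny);
        std::printf("tiny -> %g\n", tiny);   // prints 0: 1e8f + 1e-8f rounds back to 1e8f
        return 0;
    }
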
diff --git a/js/src/jit-test/tests/ion/bug1493900-1.js b/js/src/jit-test/tests/ion/bug1493900-1.js
new file mode 100644
index 000000000..643c1943d
--- /dev/null
+++ b/js/src/jit-test/tests/ion/bug1493900-1.js
@@ -0,0 +1,17 @@
+function f() {
+ var objs = [];
+ for (var i = 0; i < 100; i++) {
+ objs[i] = {};
+ }
+ var o = objs[0];
+ var a = new Float64Array(1024);
+ function g(a, b) {
+ let p = b;
+ for (; p.x < 0; p = p.x) {
+ while (p === p) {}
+ }
+ for (var i = 0; i < 10000; ++i) {}
+ }
+ g(a, o);
+}
+f();
diff --git a/js/src/jit-test/tests/ion/bug1493900-2.js b/js/src/jit-test/tests/ion/bug1493900-2.js
new file mode 100644
index 000000000..7e7f5fdec
--- /dev/null
+++ b/js/src/jit-test/tests/ion/bug1493900-2.js
@@ -0,0 +1,7 @@
+function f(a, b) {
+ for (; b.x < 0; b = b.x) {
+ while (b === b) {};
+ }
+ for (var i = 0; i < 99999; ++i) {}
+}
+f(0, 0);
diff --git a/js/src/jit/BacktrackingAllocator.cpp b/js/src/jit/BacktrackingAllocator.cpp
index 73aceeccb..645aefc4f 100644
--- a/js/src/jit/BacktrackingAllocator.cpp
+++ b/js/src/jit/BacktrackingAllocator.cpp
@@ -1736,6 +1736,18 @@ BacktrackingAllocator::deadRange(LiveRange* range)
}
bool
+BacktrackingAllocator::moveAtEdge(LBlock* predecessor, LBlock* successor, LiveRange* from,
+ LiveRange* to, LDefinition::Type type)
+{
+ if (successor->mir()->numPredecessors() > 1) {
+ MOZ_ASSERT(predecessor->mir()->numSuccessors() == 1);
+ return moveAtExit(predecessor, from, to, type);
+ }
+
+ return moveAtEntry(successor, from, to, type);
+}
+
+bool
BacktrackingAllocator::resolveControlFlow()
{
// Add moves to handle changing assignments for vregs over their lifetime.
@@ -1843,10 +1855,15 @@ BacktrackingAllocator::resolveControlFlow()
LiveRange* from = vreg(input).rangeFor(exitOf(predecessor), /* preferRegister = */ true);
MOZ_ASSERT(from);
- if (!alloc().ensureBallast())
+ if (!alloc().ensureBallast()) {
return false;
- if (!moveAtExit(predecessor, from, to, def->type()))
+ }
+
+ // Note: we have to use moveAtEdge both here and below (for edge
+ // resolution) to avoid conflicting moves. See bug 1493900.
+ if (!moveAtEdge(predecessor, successor, from, to, def->type())) {
return false;
+ }
}
}
}
@@ -1875,16 +1892,12 @@ BacktrackingAllocator::resolveControlFlow()
if (targetRange->covers(exitOf(predecessor)))
continue;
- if (!alloc().ensureBallast())
+ if (!alloc().ensureBallast()) {
return false;
+ }
LiveRange* from = reg.rangeFor(exitOf(predecessor), true);
- if (successor->mir()->numPredecessors() > 1) {
- MOZ_ASSERT(predecessor->mir()->numSuccessors() == 1);
- if (!moveAtExit(predecessor, from, targetRange, reg.type()))
- return false;
- } else {
- if (!moveAtEntry(successor, from, targetRange, reg.type()))
- return false;
+ if (!moveAtEdge(predecessor, successor, from, targetRange, reg.type())) {
+ return false;
}
}
}
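
moveAtEdge centralizes the placement rule for resolution moves on a CFG edge: when the successor is a join point (more than one predecessor), the move must be emitted at the predecessor's exit; otherwise it goes at the successor's entry. A minimal sketch of that rule with simplified stand-in types (not the SpiderMonkey classes):

    #include <cassert>
    #include <string>
    #include <vector>

    struct Block {
        std::vector<Block*> preds, succs;
        std::vector<std::string> entryMoves, exitMoves;
    };

    // Place a fixup move for the edge pred -> succ on the side of the edge
    // that is not shared with any other edge.
    void MoveAtEdge(Block& pred, Block& succ, const std::string& move) {
        if (succ.preds.size() > 1) {
            // Critical edges were split earlier, so pred has a single successor.
            assert(pred.succs.size() == 1);
            pred.exitMoves.push_back(move);
        } else {
            succ.entryMoves.push_back(move);
        }
    }
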
diff --git a/js/src/jit/BacktrackingAllocator.h b/js/src/jit/BacktrackingAllocator.h
index 9910498fb..c6cf26695 100644
--- a/js/src/jit/BacktrackingAllocator.h
+++ b/js/src/jit/BacktrackingAllocator.h
@@ -108,8 +108,9 @@ class Requirement
}
MOZ_ASSERT(newRequirement.kind() == Requirement::REGISTER);
- if (kind() == Requirement::FIXED)
+ if (kind() == Requirement::FIXED) {
return allocation().isRegister();
+ }
*this = newRequirement;
return true;
@@ -353,10 +354,12 @@ class LiveRange : public TempObject
// Comparator for use in range splay trees.
static int compare(LiveRange* v0, LiveRange* v1) {
// LiveRange includes 'from' but excludes 'to'.
- if (v0->to() <= v1->from())
+ if (v0->to() <= v1->from()) {
return -1;
- if (v0->from() >= v1->to())
+ }
+ if (v0->from() >= v1->to()) {
return 1;
+ }
return 0;
}
};
@@ -642,10 +645,12 @@ class BacktrackingAllocator : protected RegisterAllocator
// Comparator for use in splay tree.
static int compare(CallRange* v0, CallRange* v1) {
- if (v0->range.to <= v1->range.from)
+ if (v0->range.to <= v1->range.from) {
return -1;
- if (v0->range.from >= v1->range.to)
+ }
+ if (v0->range.from >= v1->range.to) {
return 1;
+ }
return 0;
}
};
@@ -744,36 +749,43 @@ class BacktrackingAllocator : protected RegisterAllocator
MOZ_MUST_USE bool moveInput(LInstruction* ins, LiveRange* from, LiveRange* to,
LDefinition::Type type) {
- if (from->bundle()->allocation() == to->bundle()->allocation())
+ if (from->bundle()->allocation() == to->bundle()->allocation()) {
return true;
+ }
LMoveGroup* moves = getInputMoveGroup(ins);
return addMove(moves, from, to, type);
}
MOZ_MUST_USE bool moveAfter(LInstruction* ins, LiveRange* from, LiveRange* to,
LDefinition::Type type) {
- if (from->bundle()->allocation() == to->bundle()->allocation())
+ if (from->bundle()->allocation() == to->bundle()->allocation()) {
return true;
+ }
LMoveGroup* moves = getMoveGroupAfter(ins);
return addMove(moves, from, to, type);
}
MOZ_MUST_USE bool moveAtExit(LBlock* block, LiveRange* from, LiveRange* to,
LDefinition::Type type) {
- if (from->bundle()->allocation() == to->bundle()->allocation())
+ if (from->bundle()->allocation() == to->bundle()->allocation()) {
return true;
+ }
LMoveGroup* moves = block->getExitMoveGroup(alloc());
return addMove(moves, from, to, type);
}
MOZ_MUST_USE bool moveAtEntry(LBlock* block, LiveRange* from, LiveRange* to,
LDefinition::Type type) {
- if (from->bundle()->allocation() == to->bundle()->allocation())
+ if (from->bundle()->allocation() == to->bundle()->allocation()) {
return true;
+ }
LMoveGroup* moves = block->getEntryMoveGroup(alloc());
return addMove(moves, from, to, type);
}
+ MOZ_MUST_USE bool moveAtEdge(LBlock* predecessor, LBlock* successor, LiveRange* from,
+ LiveRange* to, LDefinition::Type type);
+
// Debugging methods.
void dumpAllocations();
diff --git a/js/src/jit/MIR.h b/js/src/jit/MIR.h
index 6ec05af76..b2e84322f 100644
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -8272,7 +8272,10 @@ class MGetFirstDollarIndex
: MUnaryInstruction(str)
{
setResultType(MIRType::Int32);
- setMovable();
+
+ // Codegen assumes string length > 0 but that's not guaranteed in RegExp.
+ // Don't allow LICM to move this.
+ MOZ_ASSERT(!isMovable());
}
public:
diff --git a/js/src/jit/x86/Assembler-x86.h b/js/src/jit/x86/Assembler-x86.h
index 3fb5efaff..5939583d9 100644
--- a/js/src/jit/x86/Assembler-x86.h
+++ b/js/src/jit/x86/Assembler-x86.h
@@ -421,20 +421,11 @@ class Assembler : public AssemblerX86Shared
MOZ_ASSERT(dest.size() == 16);
masm.vhaddpd_rr(src.encoding(), dest.encoding());
}
- void vsubpd(const Operand& src1, FloatRegister src0, FloatRegister dest) {
+ void vsubpd(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
MOZ_ASSERT(src0.size() == 16);
MOZ_ASSERT(dest.size() == 16);
- switch (src1.kind()) {
- case Operand::MEM_REG_DISP:
- masm.vsubpd_mr(src1.disp(), src1.base(), src0.encoding(), dest.encoding());
- break;
- case Operand::MEM_ADDRESS32:
- masm.vsubpd_mr(src1.address(), src0.encoding(), dest.encoding());
- break;
- default:
- MOZ_CRASH("unexpected operand kind");
- }
+ masm.vsubpd_rr(src1.encoding(), src0.encoding(), dest.encoding());
}
void vpunpckldq(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
diff --git a/js/src/jit/x86/BaseAssembler-x86.h b/js/src/jit/x86/BaseAssembler-x86.h
index 5b16311d0..caaef3f82 100644
--- a/js/src/jit/x86/BaseAssembler-x86.h
+++ b/js/src/jit/x86/BaseAssembler-x86.h
@@ -152,14 +152,6 @@ class BaseAssemblerX86 : public BaseAssembler
{
twoByteOpSimd("vsubpd", VEX_PD, OP2_SUBPS_VpsWps, src1, src0, dst);
}
- void vsubpd_mr(int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
- {
- twoByteOpSimd("vsubpd", VEX_PD, OP2_SUBPS_VpsWps, offset, base, src0, dst);
- }
- void vsubpd_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
- {
- twoByteOpSimd("vsubpd", VEX_PD, OP2_SUBPS_VpsWps, address, src0, dst);
- }
void vpunpckldq_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) {
twoByteOpSimd("vpunpckldq", VEX_PD, OP2_PUNPCKLDQ, src1, src0, dst);
diff --git a/js/src/jit/x86/MacroAssembler-x86.cpp b/js/src/jit/x86/MacroAssembler-x86.cpp
index dc97b5b5b..429a71fa9 100644
--- a/js/src/jit/x86/MacroAssembler-x86.cpp
+++ b/js/src/jit/x86/MacroAssembler-x86.cpp
@@ -21,15 +21,6 @@
using namespace js;
using namespace js::jit;
-// vpunpckldq requires 16-byte boundary for memory operand.
-// See convertUInt64ToDouble for the details.
-MOZ_ALIGNED_DECL(static const uint64_t, 16) TO_DOUBLE[4] = {
- 0x4530000043300000LL,
- 0x0LL,
- 0x4330000000000000LL,
- 0x4530000000000000LL
-};
-
static const double TO_DOUBLE_HIGH_SCALE = 0x100000000;
bool
@@ -90,8 +81,16 @@ MacroAssemblerX86::convertUInt64ToDouble(Register64 src, FloatRegister dest, Reg
// here, each 64-bit part of dest represents following double:
// HI(dest) = 0x 1.00000HHHHHHHH * 2**84 == 2**84 + 0x HHHHHHHH 00000000
// LO(dest) = 0x 1.00000LLLLLLLL * 2**52 == 2**52 + 0x 00000000 LLLLLLLL
- movePtr(ImmWord((uintptr_t)TO_DOUBLE), temp);
- vpunpckldq(Operand(temp, 0), dest128, dest128);
+ // See convertUInt64ToDouble for the details.
+ static const int32_t CST1[4] = {
+ 0x43300000,
+ 0x45300000,
+ 0x0,
+ 0x0,
+ };
+
+ loadConstantSimd128Int(SimdConstant::CreateX4(CST1), ScratchSimd128Reg);
+ vpunpckldq(ScratchSimd128Reg, dest128, dest128);
// Subtract a constant C2 from dest, for each 64-bit part:
// C2 = 0x 45300000 00000000 43300000 00000000
@@ -101,7 +100,15 @@ MacroAssemblerX86::convertUInt64ToDouble(Register64 src, FloatRegister dest, Reg
// after the operation each 64-bit part of dest represents following:
// HI(dest) = double(0x HHHHHHHH 00000000)
// LO(dest) = double(0x 00000000 LLLLLLLL)
- vsubpd(Operand(temp, sizeof(uint64_t) * 2), dest128, dest128);
+ static const int32_t CST2[4] = {
+ 0x0,
+ 0x43300000,
+ 0x0,
+ 0x45300000,
+ };
+
+ loadConstantSimd128Int(SimdConstant::CreateX4(CST2), ScratchSimd128Reg);
+ vsubpd(ScratchSimd128Reg, dest128, dest128);
// Add HI(dest) and LO(dest) in double and store it into LO(dest),
// LO(dest) = double(0x HHHHHHHH 00000000) + double(0x 00000000 LLLLLLLL)
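
The rewrite replaces the 16-byte-aligned static TO_DOUBLE table with inline SIMD constants, but the underlying bit trick is unchanged: OR-ing the low and high 32-bit halves into the mantissas of 2^52 and 2^84 and then subtracting those constants yields lo and hi*2^32 exactly. A scalar C++ sketch of the same conversion (illustrative, not the jitted code):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    static double Uint64ToDouble(uint64_t v) {
        uint32_t lo = static_cast<uint32_t>(v);
        uint32_t hi = static_cast<uint32_t>(v >> 32);

        // Bit patterns of 2^52 + lo and 2^84 + hi * 2^32.
        uint64_t loBits = 0x4330000000000000ULL | lo;
        uint64_t hiBits = 0x4530000000000000ULL | static_cast<uint64_t>(hi);

        double dlo, dhi;
        std::memcpy(&dlo, &loBits, sizeof dlo);
        std::memcpy(&dhi, &hiBits, sizeof dhi);

        const double two52 = 4503599627370496.0;             // 2^52
        const double two84 = 19342813113834066795298816.0;   // 2^84
        return (dhi - two84) + (dlo - two52);                 // hi*2^32 + lo
    }

    int main() {
        std::printf("%.0f\n", Uint64ToDouble(0xFFFFFFFFFFFFFFFFULL));
        return 0;
    }
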
diff --git a/js/src/jsapi.cpp b/js/src/jsapi.cpp
index 85a38bba4..37d023bd4 100644
--- a/js/src/jsapi.cpp
+++ b/js/src/jsapi.cpp
@@ -2003,10 +2003,10 @@ JS_GetOwnPropertyDescriptor(JSContext* cx, HandleObject obj, const char* name,
}
JS_PUBLIC_API(bool)
-JS_GetOwnUCPropertyDescriptor(JSContext* cx, HandleObject obj, const char16_t* name,
+JS_GetOwnUCPropertyDescriptor(JSContext* cx, HandleObject obj, const char16_t* name, size_t namelen,
MutableHandle<PropertyDescriptor> desc)
{
- JSAtom* atom = AtomizeChars(cx, name, js_strlen(name));
+ JSAtom* atom = AtomizeChars(cx, name, namelen);
if (!atom)
return false;
RootedId id(cx, AtomToId(atom));
@@ -2028,7 +2028,19 @@ JS_GetPropertyDescriptor(JSContext* cx, HandleObject obj, const char* name,
if (!atom)
return false;
RootedId id(cx, AtomToId(atom));
- return atom && JS_GetPropertyDescriptorById(cx, obj, id, desc);
+ return JS_GetPropertyDescriptorById(cx, obj, id, desc);
+}
+
+JS_PUBLIC_API(bool)
+JS_GetUCPropertyDescriptor(JSContext* cx, HandleObject obj, const char16_t* name, size_t namelen,
+ MutableHandle<PropertyDescriptor> desc)
+{
+ JSAtom* atom = AtomizeChars(cx, name, namelen);
+ if (!atom) {
+ return false;
+ }
+ RootedId id(cx, AtomToId(atom));
+ return JS_GetPropertyDescriptorById(cx, obj, id, desc);
}
static bool
diff --git a/js/src/jsapi.h b/js/src/jsapi.h
index c1195cc00..30c4a835a 100644
--- a/js/src/jsapi.h
+++ b/js/src/jsapi.h
@@ -2917,7 +2917,7 @@ JS_GetOwnPropertyDescriptor(JSContext* cx, JS::HandleObject obj, const char* nam
JS::MutableHandle<JS::PropertyDescriptor> desc);
extern JS_PUBLIC_API(bool)
-JS_GetOwnUCPropertyDescriptor(JSContext* cx, JS::HandleObject obj, const char16_t* name,
+JS_GetOwnUCPropertyDescriptor(JSContext* cx, JS::HandleObject obj, const char16_t* name, size_t namelen,
JS::MutableHandle<JS::PropertyDescriptor> desc);
/**
@@ -2934,6 +2934,10 @@ extern JS_PUBLIC_API(bool)
JS_GetPropertyDescriptor(JSContext* cx, JS::HandleObject obj, const char* name,
JS::MutableHandle<JS::PropertyDescriptor> desc);
+extern JS_PUBLIC_API(bool)
+JS_GetUCPropertyDescriptor(JSContext* cx, JS::HandleObject obj, const char16_t* name, size_t namelen,
+ JS::MutableHandle<JS::PropertyDescriptor> desc);
+
/**
* Define a property on obj.
*
diff --git a/layout/base/nsCaret.cpp b/layout/base/nsCaret.cpp
index 8ad435950..eca22f3ba 100644
--- a/layout/base/nsCaret.cpp
+++ b/layout/base/nsCaret.cpp
@@ -117,6 +117,12 @@ IsBidiUI()
return Preferences::GetBool("bidi.browser.ui");
}
+static bool
+CjkThickCaret()
+{
+ return Preferences::GetBool("layout.cjkthickcaret");
+}
+
nsCaret::nsCaret()
: mOverrideOffset(0)
, mBlinkCount(-1)
@@ -190,7 +196,7 @@ nsCaret::ComputeMetrics(nsIFrame* aFrame, int32_t aOffset, nscoord aCaretHeight)
nsPresContext::CSSPixelsToAppUnits(
LookAndFeel::GetInt(LookAndFeel::eIntID_CaretWidth, 1));
- if (DrawCJKCaret(aFrame, aOffset)) {
+ if (DrawCJKCaret(aFrame, aOffset) && CjkThickCaret()) {
caretWidth += nsPresContext::CSSPixelsToAppUnits(1);
}
nscoord bidiIndicatorSize = nsPresContext::CSSPixelsToAppUnits(kMinBidiIndicatorPixels);
diff --git a/layout/base/nsLayoutUtils.cpp b/layout/base/nsLayoutUtils.cpp
index d0f790a0b..c1f4ad372 100644
--- a/layout/base/nsLayoutUtils.cpp
+++ b/layout/base/nsLayoutUtils.cpp
@@ -3423,7 +3423,6 @@ nsLayoutUtils::PaintFrame(nsRenderingContext* aRenderingContext, nsIFrame* aFram
return NS_OK;
}
- TimeStamp startBuildDisplayList = TimeStamp::Now();
nsDisplayListBuilder builder(aFrame, aBuilderMode,
!(aFlags & PaintFrameFlags::PAINT_HIDE_CARET));
if (aFlags & PaintFrameFlags::PAINT_IN_TRANSFORM) {
diff --git a/layout/base/nsPresShell.cpp b/layout/base/nsPresShell.cpp
index 88539dc4a..340042b46 100644
--- a/layout/base/nsPresShell.cpp
+++ b/layout/base/nsPresShell.cpp
@@ -1111,10 +1111,6 @@ PresShell::Destroy()
LogTextPerfStats(tp, this, tp->cumulative, 0.0, eLog_totals, nullptr);
}
}
- if (mPresContext) {
- const bool mayFlushUserFontSet = false;
- gfxUserFontSet* fs = mPresContext->GetUserFontSet(mayFlushUserFontSet);
- }
#ifdef MOZ_REFLOW_PERF
DumpReflows();
@@ -9409,7 +9405,6 @@ PresShell::ProcessReflowCommands(bool aInterruptible)
return true;
}
- mozilla::TimeStamp timerStart = mozilla::TimeStamp::Now();
bool interrupted = false;
if (!mDirtyRoots.IsEmpty()) {
diff --git a/layout/generic/nsGfxScrollFrame.cpp b/layout/generic/nsGfxScrollFrame.cpp
index ccdc3a0ce..3ed3b0bb3 100644
--- a/layout/generic/nsGfxScrollFrame.cpp
+++ b/layout/generic/nsGfxScrollFrame.cpp
@@ -1774,6 +1774,18 @@ public:
return true;
}
+ /**
+ * The mCallee holds a strong ref to us since the refresh driver doesn't.
+ * Our dtor and mCallee's Destroy() method both call RemoveObserver() -
+ * whichever comes first removes us from the refresh driver.
+ */
+ void RemoveObserver() {
+ if (mCallee) {
+ RefreshDriver(mCallee)->RemoveRefreshObserver(this, Flush_Style);
+ mCallee = nullptr;
+ }
+ }
+
private:
// Private destructor, to discourage deletion outside of Release():
~AsyncSmoothMSDScroll() {
@@ -1786,17 +1798,6 @@ private:
return aCallee->mOuter->PresContext()->RefreshDriver();
}
- /*
- * The refresh driver doesn't hold a reference to its observers,
- * so releasing this object can (and is) used to remove the observer on DTOR.
- * Currently, this object is released once the scrolling ends.
- */
- void RemoveObserver() {
- if (mCallee) {
- RefreshDriver(mCallee)->RemoveRefreshObserver(this, Flush_Style);
- }
- }
-
mozilla::layers::AxisPhysicsMSDModel mXAxisModel, mYAxisModel;
nsRect mRange;
mozilla::TimeStamp mLastRefreshTime;
@@ -1875,24 +1876,25 @@ public:
ScrollFrameHelper::AsyncScrollCallback(mCallee, aTime);
}
-private:
- ScrollFrameHelper *mCallee;
-
- nsRefreshDriver* RefreshDriver(ScrollFrameHelper* aCallee) {
- return aCallee->mOuter->PresContext()->RefreshDriver();
- }
-
- /*
- * The refresh driver doesn't hold a reference to its observers,
- * so releasing this object can (and is) used to remove the observer on DTOR.
- * Currently, this object is released once the scrolling ends.
+ /**
+ * The mCallee holds a strong ref to us since the refresh driver doesn't.
+ * Our dtor and mCallee's Destroy() method both call RemoveObserver() -
+ * whichever comes first removes us from the refresh driver.
*/
void RemoveObserver() {
if (mCallee) {
RefreshDriver(mCallee)->RemoveRefreshObserver(this, Flush_Style);
APZCCallbackHelper::SuppressDisplayport(false, mCallee->mOuter->PresContext()->PresShell());
+ mCallee = nullptr;
}
}
+
+private:
+ ScrollFrameHelper *mCallee;
+
+ nsRefreshDriver* RefreshDriver(ScrollFrameHelper* aCallee) {
+ return aCallee->mOuter->PresContext()->RefreshDriver();
+ }
};
/*
@@ -2150,8 +2152,7 @@ void
ScrollFrameHelper::CompleteAsyncScroll(const nsRect &aRange, nsIAtom* aOrigin)
{
// Apply desired destination range since this is the last step of scrolling.
- mAsyncSmoothMSDScroll = nullptr;
- mAsyncScroll = nullptr;
+ RemoveObservers();
nsWeakFrame weakFrame(mOuter);
ScrollToImpl(mDestination, aRange, aOrigin);
if (!weakFrame.IsAlive()) {
@@ -4586,6 +4587,20 @@ ScrollFrameHelper::Destroy()
mScrollActivityTimer->Cancel();
mScrollActivityTimer = nullptr;
}
+ RemoveObservers();
+}
+
+void
+ScrollFrameHelper::RemoveObservers()
+{
+ if (mAsyncScroll) {
+ mAsyncScroll->RemoveObserver();
+ mAsyncScroll = nullptr;
+ }
+ if (mAsyncSmoothMSDScroll) {
+ mAsyncSmoothMSDScroll->RemoveObserver();
+ mAsyncSmoothMSDScroll = nullptr;
+ }
}
/**
diff --git a/layout/generic/nsGfxScrollFrame.h b/layout/generic/nsGfxScrollFrame.h
index f1ef44ae8..81bbb358f 100644
--- a/layout/generic/nsGfxScrollFrame.h
+++ b/layout/generic/nsGfxScrollFrame.h
@@ -638,6 +638,9 @@ protected:
bool HasBgAttachmentLocal() const;
uint8_t GetScrolledFrameDir() const;
+ // Removes any RefreshDriver observers we might have registered.
+ void RemoveObservers();
+
static void EnsureFrameVisPrefsCached();
static bool sFrameVisPrefsCached;
// The number of scrollports wide/high to expand when tracking frame visibility.
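
The scroll-frame change makes RemoveObserver() idempotent (it nulls mCallee) and has both the destructor and the new ScrollFrameHelper::RemoveObservers() call it, so whichever runs first deregisters from the refresh driver and the later call is a no-op. A small sketch of that unregister-once pattern, with hypothetical names:

    struct Driver {
        void Remove(void* observer) { /* deregister */ (void)observer; }
    };

    class Observer {
    public:
        explicit Observer(Driver* aDriver) : mDriver(aDriver) {}
        ~Observer() { RemoveObserver(); }

        // Safe to call from both the destructor and the owner's teardown path:
        // the first call deregisters, later calls see null and do nothing.
        void RemoveObserver() {
            if (mDriver) {
                mDriver->Remove(this);
                mDriver = nullptr;
            }
        }

    private:
        Driver* mDriver;
    };
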
diff --git a/layout/generic/nsGridContainerFrame.cpp b/layout/generic/nsGridContainerFrame.cpp
index 8f117b5ab..fbd61f783 100644
--- a/layout/generic/nsGridContainerFrame.cpp
+++ b/layout/generic/nsGridContainerFrame.cpp
@@ -3714,8 +3714,11 @@ MeasuringReflow(nsIFrame* aChild,
parent->Properties().Set(
nsContainerFrame::DebugReflowingWithInfiniteISize(), true);
#endif
- uint32_t riFlags = ReflowInput::COMPUTE_SIZE_SHRINK_WRAP |
- ReflowInput::COMPUTE_SIZE_USE_AUTO_BSIZE;
+ auto wm = aChild->GetWritingMode();
+ uint32_t riFlags = ReflowInput::COMPUTE_SIZE_USE_AUTO_BSIZE;
+ if (aAvailableSize.ISize(wm) == INFINITE_ISIZE_COORD) {
+ riFlags |= ReflowInput::COMPUTE_SIZE_SHRINK_WRAP;
+ }
if (aIMinSizeClamp != NS_MAXSIZE) {
riFlags |= ReflowInput::I_CLAMP_MARGIN_BOX_MIN_SIZE;
}
@@ -3730,7 +3733,6 @@ MeasuringReflow(nsIFrame* aChild,
ReflowOutput childSize(childRI);
nsReflowStatus childStatus;
const uint32_t flags = NS_FRAME_NO_MOVE_FRAME | NS_FRAME_NO_SIZE_VIEW;
- WritingMode wm = childRI.GetWritingMode();
parent->ReflowChild(aChild, pc, childSize, childRI, wm,
LogicalPoint(wm), nsSize(), flags, childStatus);
parent->FinishReflowChild(aChild, pc, childSize, &childRI, wm,
diff --git a/layout/reftests/css-grid/bug1349571-ref.html b/layout/reftests/css-grid/bug1349571-ref.html
new file mode 100644
index 000000000..42efd731a
--- /dev/null
+++ b/layout/reftests/css-grid/bug1349571-ref.html
@@ -0,0 +1,90 @@
+<!DOCTYPE html>
+<!--
+ Any copyright is dedicated to the Public Domain.
+ http://creativecommons.org/publicdomain/zero/1.0/
+-->
+<html><head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+ <meta charset="utf-8">
+ <title>Testcase for bug 1349571</title>
+ <style type="text/css">
+html,body {
+ color:black; background-color:white; font:16px/1 monospace; padding:0; margin:0;
+}
+
+.container {
+ display: grid;
+ grid: 250px / 500px;
+ border: 3px solid;
+ width: 500px;
+}
+
+.responsive-container {
+ background: lightgrey;
+}
+
+ </style>
+<script>try {
+(function() {
+ var target = 'blur';
+ if ( target === '' || target === '{{1}}' ) { return; }
+ var needle = 'mz_str', reText = '.?';
+ if ( needle !== '' && needle !== '{{2}}' ) {
+ reText = /^\/.+\/$/.test(needle)
+ ? needle.slice(1,-1)
+ : needle.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+ }
+ var re = new RegExp(reText);
+ var chain = target.split('.');
+ var owner = window, prop;
+ for (;;) {
+ prop = chain.shift();
+ if ( chain.length === 0 ) { break; }
+ owner = owner[prop];
+ if ( owner instanceof Object === false ) { return; }
+ }
+ var desc = Object.getOwnPropertyDescriptor(owner, prop);
+ if ( desc && desc.get !== undefined ) { return; }
+ var magic = String.fromCharCode(Date.now() % 26 + 97) +
+ Math.floor(Math.random() * 982451653 + 982451653).toString(36);
+ var value = owner[prop];
+ var validate = function() {
+ var e = document.currentScript;
+ if ( e instanceof HTMLScriptElement && e.src === '' && re.test(e.textContent) ) {
+ throw new ReferenceError(magic);
+ }
+ };
+ Object.defineProperty(owner, prop, {
+ get: function() {
+ validate();
+ return value;
+ },
+ set: function(a) {
+ validate();
+ value = a;
+ }
+ });
+ var oe = window.onerror;
+ window.onerror = function(msg) {
+ if ( typeof msg === 'string' && msg.indexOf(magic) !== -1 ) {
+ return true;
+ }
+ if ( oe instanceof Function ) {
+ return oe.apply(this, arguments);
+ }
+ }.bind();
+})();
+} catch ( e ) { }
+(function() {
+ var c = document.currentScript, p = c && c.parentNode;
+ if ( p ) { p.removeChild(c); }
+})();</script></head>
+<body>
+
+<div class="container">
+ <div class="responsive-container"></div>
+</div>
+
+
+
+</body></html> \ No newline at end of file
diff --git a/layout/reftests/css-grid/bug1349571.html b/layout/reftests/css-grid/bug1349571.html
new file mode 100644
index 000000000..f836fe36e
--- /dev/null
+++ b/layout/reftests/css-grid/bug1349571.html
@@ -0,0 +1,94 @@
+<!DOCTYPE html>
+<!--
+ Any copyright is dedicated to the Public Domain.
+ http://creativecommons.org/publicdomain/zero/1.0/
+-->
+<html><head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8">
+ <meta charset="utf-8">
+ <title>Testcase for bug 1349571</title>
+ <style type="text/css">
+html,body {
+ color:black; background-color:white; font:16px/1 monospace; padding:0; margin:0;
+}
+
+.container {
+ display: grid;
+ grid-template-columns: 1fr;
+ border: 3px solid;
+ width: 500px;
+}
+
+.responsive-container {
+ padding-bottom: 50%;
+ height: 0;
+ background: lightgrey;
+}
+
+ </style>
+<script>try {
+(function() {
+ var target = 'blur';
+ if ( target === '' || target === '{{1}}' ) { return; }
+ var needle = 'mz_str', reText = '.?';
+ if ( needle !== '' && needle !== '{{2}}' ) {
+ reText = /^\/.+\/$/.test(needle)
+ ? needle.slice(1,-1)
+ : needle.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+ }
+ var re = new RegExp(reText);
+ var chain = target.split('.');
+ var owner = window, prop;
+ for (;;) {
+ prop = chain.shift();
+ if ( chain.length === 0 ) { break; }
+ owner = owner[prop];
+ if ( owner instanceof Object === false ) { return; }
+ }
+ var desc = Object.getOwnPropertyDescriptor(owner, prop);
+ if ( desc && desc.get !== undefined ) { return; }
+ var magic = String.fromCharCode(Date.now() % 26 + 97) +
+ Math.floor(Math.random() * 982451653 + 982451653).toString(36);
+ var value = owner[prop];
+ var validate = function() {
+ var e = document.currentScript;
+ if ( e instanceof HTMLScriptElement && e.src === '' && re.test(e.textContent) ) {
+ throw new ReferenceError(magic);
+ }
+ };
+ Object.defineProperty(owner, prop, {
+ get: function() {
+ validate();
+ return value;
+ },
+ set: function(a) {
+ validate();
+ value = a;
+ }
+ });
+ var oe = window.onerror;
+ window.onerror = function(msg) {
+ if ( typeof msg === 'string' && msg.indexOf(magic) !== -1 ) {
+ return true;
+ }
+ if ( oe instanceof Function ) {
+ return oe.apply(this, arguments);
+ }
+ }.bind();
+})();
+} catch ( e ) { }
+(function() {
+ var c = document.currentScript, p = c && c.parentNode;
+ if ( p ) { p.removeChild(c); }
+})();</script></head>
+<body>
+
+<div class="container">
+ <div>
+ <div class="responsive-container"></div>
+ </div>
+</div>
+
+
+
+</body></html> \ No newline at end of file
diff --git a/layout/reftests/css-grid/bug1356820-ref.html b/layout/reftests/css-grid/bug1356820-ref.html
new file mode 100644
index 000000000..b203ba203
--- /dev/null
+++ b/layout/reftests/css-grid/bug1356820-ref.html
@@ -0,0 +1,81 @@
+<!DOCTYPE html>
+<html><head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8"><script>try {
+(function() {
+ var target = 'blur';
+ if ( target === '' || target === '{{1}}' ) { return; }
+ var needle = 'mz_str', reText = '.?';
+ if ( needle !== '' && needle !== '{{2}}' ) {
+ reText = /^\/.+\/$/.test(needle)
+ ? needle.slice(1,-1)
+ : needle.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+ }
+ var re = new RegExp(reText);
+ var chain = target.split('.');
+ var owner = window, prop;
+ for (;;) {
+ prop = chain.shift();
+ if ( chain.length === 0 ) { break; }
+ owner = owner[prop];
+ if ( owner instanceof Object === false ) { return; }
+ }
+ var desc = Object.getOwnPropertyDescriptor(owner, prop);
+ if ( desc && desc.get !== undefined ) { return; }
+ var magic = String.fromCharCode(Date.now() % 26 + 97) +
+ Math.floor(Math.random() * 982451653 + 982451653).toString(36);
+ var value = owner[prop];
+ var validate = function() {
+ var e = document.currentScript;
+ if ( e instanceof HTMLScriptElement && e.src === '' && re.test(e.textContent) ) {
+ throw new ReferenceError(magic);
+ }
+ };
+ Object.defineProperty(owner, prop, {
+ get: function() {
+ validate();
+ return value;
+ },
+ set: function(a) {
+ validate();
+ value = a;
+ }
+ });
+ var oe = window.onerror;
+ window.onerror = function(msg) {
+ if ( typeof msg === 'string' && msg.indexOf(magic) !== -1 ) {
+ return true;
+ }
+ if ( oe instanceof Function ) {
+ return oe.apply(this, arguments);
+ }
+ }.bind();
+})();
+} catch ( e ) { }
+(function() {
+ var c = document.currentScript, p = c && c.parentNode;
+ if ( p ) { p.removeChild(c); }
+})();</script></head><body><div style="display: grid; width: 5em;">
+ <div style="word-wrap: break-word; width: 5em; justify-self:start">
+ first item with a longlonglongword
+ </div>
+ <div>
+ second item
+ </div>
+</div>
+<div style="display: grid; width: 5em;">
+ <div style="width: 5em; justify-self:start">
+ first item with a longlonglongword
+ </div>
+ <div>
+ second item
+ </div>
+</div>
+<div style="display: grid; width: 5em;">
+ <div style="word-wrap: break-word; writing-mode:vertical-lr; justify-self:start">
+ first item with a longlonglongword
+ </div>
+ <div>
+ second item
+ </div>
+</div>
+</body></html> \ No newline at end of file
diff --git a/layout/reftests/css-grid/bug1356820.html b/layout/reftests/css-grid/bug1356820.html
new file mode 100644
index 000000000..2f2f36014
--- /dev/null
+++ b/layout/reftests/css-grid/bug1356820.html
@@ -0,0 +1,81 @@
+<!DOCTYPE html>
+<html><head>
+<meta http-equiv="content-type" content="text/html; charset=UTF-8"><script>try {
+(function() {
+ var target = 'blur';
+ if ( target === '' || target === '{{1}}' ) { return; }
+ var needle = 'mz_str', reText = '.?';
+ if ( needle !== '' && needle !== '{{2}}' ) {
+ reText = /^\/.+\/$/.test(needle)
+ ? needle.slice(1,-1)
+ : needle.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+ }
+ var re = new RegExp(reText);
+ var chain = target.split('.');
+ var owner = window, prop;
+ for (;;) {
+ prop = chain.shift();
+ if ( chain.length === 0 ) { break; }
+ owner = owner[prop];
+ if ( owner instanceof Object === false ) { return; }
+ }
+ var desc = Object.getOwnPropertyDescriptor(owner, prop);
+ if ( desc && desc.get !== undefined ) { return; }
+ var magic = String.fromCharCode(Date.now() % 26 + 97) +
+ Math.floor(Math.random() * 982451653 + 982451653).toString(36);
+ var value = owner[prop];
+ var validate = function() {
+ var e = document.currentScript;
+ if ( e instanceof HTMLScriptElement && e.src === '' && re.test(e.textContent) ) {
+ throw new ReferenceError(magic);
+ }
+ };
+ Object.defineProperty(owner, prop, {
+ get: function() {
+ validate();
+ return value;
+ },
+ set: function(a) {
+ validate();
+ value = a;
+ }
+ });
+ var oe = window.onerror;
+ window.onerror = function(msg) {
+ if ( typeof msg === 'string' && msg.indexOf(magic) !== -1 ) {
+ return true;
+ }
+ if ( oe instanceof Function ) {
+ return oe.apply(this, arguments);
+ }
+ }.bind();
+})();
+} catch ( e ) { }
+(function() {
+ var c = document.currentScript, p = c && c.parentNode;
+ if ( p ) { p.removeChild(c); }
+})();</script></head><body><div style="display: grid; width: 5em;">
+ <div style="word-wrap: break-word; min-width: 0;">
+ first item with a longlonglongword
+ </div>
+ <div>
+ second item
+ </div>
+</div>
+<div style="display: grid; width: 5em;">
+ <div style="min-width: 0;">
+ first item with a longlonglongword
+ </div>
+ <div>
+ second item
+ </div>
+</div>
+<div style="display: grid; width: 5em;">
+ <div style="word-wrap: break-word; min-height: 0; writing-mode:vertical-lr">
+ first item with a longlonglongword
+ </div>
+ <div>
+ second item
+ </div>
+</div>
+</body></html> \ No newline at end of file
diff --git a/layout/reftests/css-grid/reftest.list b/layout/reftests/css-grid/reftest.list
index c2ee57c1a..3087ca49b 100644
--- a/layout/reftests/css-grid/reftest.list
+++ b/layout/reftests/css-grid/reftest.list
@@ -280,3 +280,5 @@ asserts(1-10) == grid-fragmentation-dyn4-021.html grid-fragmentation-021-ref.htm
== grid-fragmentation-dyn2-031.html grid-fragmentation-031-ref.html
== bug1306106.html bug1306106-ref.html
== grid-percent-intrinsic-sizing-001.html grid-percent-intrinsic-sizing-001-ref.html
+== bug1349571.html bug1349571-ref.html
+== bug1356820.html bug1356820-ref.html
diff --git a/layout/tables/nsTableFrame.cpp b/layout/tables/nsTableFrame.cpp
index 5030804ed..4c11d2704 100644
--- a/layout/tables/nsTableFrame.cpp
+++ b/layout/tables/nsTableFrame.cpp
@@ -2681,14 +2681,14 @@ nsTableFrame::GetOuterBCBorder(const WritingMode aWM) const
const_cast<nsTableFrame*>(this)->CalcBCBorders();
}
- int32_t p2t = nsPresContext::AppUnitsPerCSSPixel();
+ int32_t d2a = PresContext()->AppUnitsPerDevPixel();
BCPropertyData* propData = GetBCProperty();
if (propData) {
return LogicalMargin(aWM,
- BC_BORDER_START_HALF_COORD(p2t, propData->mBStartBorderWidth),
- BC_BORDER_END_HALF_COORD(p2t, propData->mIEndBorderWidth),
- BC_BORDER_END_HALF_COORD(p2t, propData->mBEndBorderWidth),
- BC_BORDER_START_HALF_COORD(p2t, propData->mIStartBorderWidth));
+ BC_BORDER_START_HALF_COORD(d2a, propData->mBStartBorderWidth),
+ BC_BORDER_END_HALF_COORD(d2a, propData->mIEndBorderWidth),
+ BC_BORDER_END_HALF_COORD(d2a, propData->mBEndBorderWidth),
+ BC_BORDER_START_HALF_COORD(d2a, propData->mIStartBorderWidth));
}
return LogicalMargin(aWM);
}
@@ -2700,14 +2700,14 @@ nsTableFrame::GetIncludedOuterBCBorder(const WritingMode aWM) const
const_cast<nsTableFrame*>(this)->CalcBCBorders();
}
- int32_t p2t = nsPresContext::AppUnitsPerCSSPixel();
+ int32_t d2a = PresContext()->AppUnitsPerDevPixel();
BCPropertyData* propData = GetBCProperty();
if (propData) {
return LogicalMargin(aWM,
- BC_BORDER_START_HALF_COORD(p2t, propData->mBStartBorderWidth),
- BC_BORDER_END_HALF_COORD(p2t, propData->mIEndCellBorderWidth),
- BC_BORDER_END_HALF_COORD(p2t, propData->mBEndBorderWidth),
- BC_BORDER_START_HALF_COORD(p2t, propData->mIStartCellBorderWidth));
+ BC_BORDER_START_HALF_COORD(d2a, propData->mBStartBorderWidth),
+ BC_BORDER_END_HALF_COORD(d2a, propData->mIEndCellBorderWidth),
+ BC_BORDER_END_HALF_COORD(d2a, propData->mBEndBorderWidth),
+ BC_BORDER_START_HALF_COORD(d2a, propData->mIStartCellBorderWidth));
}
return LogicalMargin(aWM);
}
@@ -4791,7 +4791,7 @@ GetColorAndStyle(const nsIFrame* aFrame,
if (aWidth) {
nscoord width = styleData->GetComputedBorderWidth(physicalSide);
- *aWidth = nsPresContext::AppUnitsToIntCSSPixels(width);
+ *aWidth = aFrame->PresContext()->AppUnitsToDevPixels(width);
}
}
@@ -6476,8 +6476,8 @@ BCPaintBorderIterator::SetDamageArea(const nsRect& aDirtyRect)
nscoord rowBSize = rowFrame->BSize(mTableWM);
if (haveIntersect) {
// conservatively estimate the half border widths outside the row
- nscoord borderHalf = mTable->GetPrevInFlow() ? 0 : nsPresContext::
- CSSPixelsToAppUnits(rowFrame->GetBStartBCBorderWidth() + 1);
+ nscoord borderHalf = mTable->GetPrevInFlow() ? 0 :
+ mTable->PresContext()->DevPixelsToAppUnits(rowFrame->GetBStartBCBorderWidth() + 1);
if (dirtyRect.BEnd(mTableWM) >= rowB - borderHalf) {
nsTableRowFrame* fifRow =
static_cast<nsTableRowFrame*>(rowFrame->FirstInFlow());
@@ -6487,8 +6487,8 @@ BCPaintBorderIterator::SetDamageArea(const nsRect& aDirtyRect)
}
else {
// conservatively estimate the half border widths outside the row
- nscoord borderHalf = mTable->GetNextInFlow() ? 0 : nsPresContext::
- CSSPixelsToAppUnits(rowFrame->GetBEndBCBorderWidth() + 1);
+ nscoord borderHalf = mTable->GetNextInFlow() ? 0 :
+ mTable->PresContext()->DevPixelsToAppUnits(rowFrame->GetBEndBCBorderWidth() + 1);
if (rowB + rowBSize + borderHalf >= dirtyRect.BStart(mTableWM)) {
mStartRg = rgFrame;
mStartRow = rowFrame;
@@ -6532,8 +6532,8 @@ BCPaintBorderIterator::SetDamageArea(const nsRect& aDirtyRect)
nscoord colISize = colFrame->ISize(mTableWM);
if (haveIntersect) {
// conservatively estimate the iStart half border width outside the col
- nscoord iStartBorderHalf = nsPresContext::
- CSSPixelsToAppUnits(colFrame->GetIStartBorderWidth() + 1);
+ nscoord iStartBorderHalf =
+ mTable->PresContext()->DevPixelsToAppUnits(colFrame->GetIStartBorderWidth() + 1);
if (dirtyRect.IEnd(mTableWM) >= x - iStartBorderHalf) {
endColIndex = colIdx;
}
@@ -6541,8 +6541,8 @@ BCPaintBorderIterator::SetDamageArea(const nsRect& aDirtyRect)
}
else {
// conservatively estimate the iEnd half border width outside the col
- nscoord iEndBorderHalf = nsPresContext::
- CSSPixelsToAppUnits(colFrame->GetIEndBorderWidth() + 1);
+ nscoord iEndBorderHalf =
+ mTable->PresContext()->DevPixelsToAppUnits(colFrame->GetIEndBorderWidth() + 1);
if (x + colISize + iEndBorderHalf >= dirtyRect.IStart(mTableWM)) {
startColIndex = endColIndex = colIdx;
haveIntersect = true;
@@ -6785,7 +6785,8 @@ CalcVerCornerOffset(LogicalSide aCornerOwnerSide,
BCPixelSize aCornerSubWidth,
BCPixelSize aHorWidth,
bool aIsStartOfSeg,
- bool aIsBevel)
+ bool aIsBevel,
+ nsPresContext* aPresContext)
{
nscoord offset = 0;
// XXX These should be replaced with appropriate side-specific macros (which?)
@@ -6808,7 +6809,7 @@ CalcVerCornerOffset(LogicalSide aCornerOwnerSide,
offset = (aIsStartOfSeg) ? smallHalf : -largeHalf;
}
}
- return nsPresContext::CSSPixelsToAppUnits(offset);
+ return aPresContext->DevPixelsToAppUnits(offset);
}
/** Compute the horizontal offset of a horizontal border segment
@@ -6824,7 +6825,8 @@ CalcHorCornerOffset(LogicalSide aCornerOwnerSide,
BCPixelSize aCornerSubWidth,
BCPixelSize aVerWidth,
bool aIsStartOfSeg,
- bool aIsBevel)
+ bool aIsBevel,
+ nsPresContext* aPresContext)
{
nscoord offset = 0;
// XXX These should be replaced with appropriate side-specific macros (which?)
@@ -6847,7 +6849,7 @@ CalcHorCornerOffset(LogicalSide aCornerOwnerSide,
offset = (aIsStartOfSeg) ? smallHalf : -largeHalf;
}
}
- return nsPresContext::CSSPixelsToAppUnits(offset);
+ return aPresContext->DevPixelsToAppUnits(offset);
}
BCBlockDirSeg::BCBlockDirSeg()
@@ -6883,10 +6885,10 @@ BCBlockDirSeg::Start(BCPaintBorderIterator& aIter,
BCPixelSize maxInlineSegBSize = std::max(aIter.mPrevInlineSegBSize, aInlineSegBSize);
nscoord offset = CalcVerCornerOffset(ownerSide, cornerSubWidth,
maxInlineSegBSize, true,
- bStartBevel);
+ bStartBevel, aIter.mTable->PresContext());
mBStartBevelOffset = bStartBevel ?
- nsPresContext::CSSPixelsToAppUnits(maxInlineSegBSize): 0;
+ aIter.mTable->PresContext()->DevPixelsToAppUnits(maxInlineSegBSize): 0;
// XXX this assumes that only corners where 2 segments join can be beveled
mBStartBevelSide = (aInlineSegBSize > 0) ? eLogicalSideIEnd : eLogicalSideIStart;
mOffsetB += offset;
@@ -6944,8 +6946,8 @@ BCBlockDirSeg::GetBEndCorner(BCPaintBorderIterator& aIter,
mIsBEndBevel = (mWidth > 0) ? bevel : false;
mBEndInlineSegBSize = std::max(aIter.mPrevInlineSegBSize, aInlineSegBSize);
mBEndOffset = CalcVerCornerOffset(ownerSide, cornerSubWidth,
- mBEndInlineSegBSize,
- false, mIsBEndBevel);
+ mBEndInlineSegBSize, false,
+ mIsBEndBevel, aIter.mTable->PresContext());
mLength += mBEndOffset;
}
@@ -7029,11 +7031,11 @@ BCBlockDirSeg::Paint(BCPaintBorderIterator& aIter,
BCPixelSize smallHalf, largeHalf;
DivideBCBorderSize(mWidth, smallHalf, largeHalf);
LogicalRect segRect(aIter.mTableWM,
- mOffsetI - nsPresContext::CSSPixelsToAppUnits(largeHalf),
+ mOffsetI - aIter.mTable->PresContext()->DevPixelsToAppUnits(largeHalf),
mOffsetB,
- nsPresContext::CSSPixelsToAppUnits(mWidth), mLength);
+ aIter.mTable->PresContext()->DevPixelsToAppUnits(mWidth), mLength);
nscoord bEndBevelOffset = (mIsBEndBevel) ?
- nsPresContext::CSSPixelsToAppUnits(mBEndInlineSegBSize) : 0;
+ aIter.mTable->PresContext()->DevPixelsToAppUnits(mBEndInlineSegBSize) : 0;
LogicalSide bEndBevelSide =
(aInlineSegBSize > 0) ? eLogicalSideIEnd : eLogicalSideIStart;
@@ -7067,7 +7069,7 @@ BCBlockDirSeg::Paint(BCPaintBorderIterator& aIter,
nsCSSRendering::DrawTableBorderSegment(aDrawTarget, style, color,
aIter.mTableBgColor, physicalRect,
appUnitsPerDevPixel,
- nsPresContext::AppUnitsPerCSSPixel(),
+ aIter.mTable->PresContext()->AppUnitsPerDevPixel(),
startBevelSide, startBevelOffset,
endBevelSide, endBevelOffset);
}
@@ -7123,7 +7125,8 @@ BCInlineDirSeg::Start(BCPaintBorderIterator& aIter,
nscoord maxBlockSegISize = std::max(aIter.mBlockDirInfo[relColIndex].mWidth,
aBEndBlockSegISize);
nscoord offset = CalcHorCornerOffset(cornerOwnerSide, cornerSubWidth,
- maxBlockSegISize, true, iStartBevel);
+ maxBlockSegISize, true, iStartBevel,
+ aIter.mTable->PresContext());
mIStartBevelOffset = (iStartBevel && (aInlineSegBSize > 0)) ? maxBlockSegISize : 0;
// XXX this assumes that only corners where 2 segments join can be beveled
mIStartBevelSide = (aBEndBlockSegISize > 0) ? eLogicalSideBEnd : eLogicalSideBStart;
@@ -7157,10 +7160,10 @@ BCInlineDirSeg::GetIEndCorner(BCPaintBorderIterator& aIter,
nscoord verWidth = std::max(aIter.mBlockDirInfo[relColIndex].mWidth,
aIStartSegISize);
mEndOffset = CalcHorCornerOffset(ownerSide, cornerSubWidth, verWidth,
- false, mIsIEndBevel);
+ false, mIsIEndBevel, aIter.mTable->PresContext());
mLength += mEndOffset;
mIEndBevelOffset = (mIsIEndBevel) ?
- nsPresContext::CSSPixelsToAppUnits(verWidth) : 0;
+ aIter.mTable->PresContext()->DevPixelsToAppUnits(verWidth) : 0;
mIEndBevelSide = (aIStartSegISize > 0) ? eLogicalSideBEnd : eLogicalSideBStart;
}
@@ -7240,9 +7243,9 @@ BCInlineDirSeg::Paint(BCPaintBorderIterator& aIter, DrawTarget& aDrawTarget)
BCPixelSize smallHalf, largeHalf;
DivideBCBorderSize(mWidth, smallHalf, largeHalf);
LogicalRect segRect(aIter.mTableWM, mOffsetI,
- mOffsetB - nsPresContext::CSSPixelsToAppUnits(largeHalf),
+ mOffsetB - aIter.mTable->PresContext()->DevPixelsToAppUnits(largeHalf),
mLength,
- nsPresContext::CSSPixelsToAppUnits(mWidth));
+ aIter.mTable->PresContext()->DevPixelsToAppUnits(mWidth));
// Convert logical to physical sides/coordinates for DrawTableBorderSegment.
nsRect physicalRect = segRect.GetPhysicalRect(aIter.mTableWM,
@@ -7250,7 +7253,7 @@ BCInlineDirSeg::Paint(BCPaintBorderIterator& aIter, DrawTarget& aDrawTarget)
uint8_t startBevelSide = aIter.mTableWM.PhysicalSide(mIStartBevelSide);
uint8_t endBevelSide = aIter.mTableWM.PhysicalSide(mIEndBevelSide);
nscoord startBevelOffset =
- nsPresContext::CSSPixelsToAppUnits(mIStartBevelOffset);
+ aIter.mTable->PresContext()->DevPixelsToAppUnits(mIStartBevelOffset);
nscoord endBevelOffset = mIEndBevelOffset;
// With inline-RTL directionality, the 'start' and 'end' of the inline-dir
// border segment need to be swapped because DrawTableBorderSegment will
@@ -7271,7 +7274,7 @@ BCInlineDirSeg::Paint(BCPaintBorderIterator& aIter, DrawTarget& aDrawTarget)
nsCSSRendering::DrawTableBorderSegment(aDrawTarget, style, color,
aIter.mTableBgColor, physicalRect,
appUnitsPerDevPixel,
- nsPresContext::AppUnitsPerCSSPixel(),
+ aIter.mTable->PresContext()->AppUnitsPerDevPixel(),
startBevelSide, startBevelOffset,
endBevelSide, endBevelOffset);
}
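
The table-border changes switch the collapsed-border conversions from CSS pixels to device pixels, so painted widths follow the device pixel grid under HiDPI scaling and zoom. For reference, Gecko app units are a fixed 1/60 of a CSS pixel, while app units per device pixel shrink as the device scale grows; a quick arithmetic sketch with illustrative values:

    #include <cstdio>

    int main() {
        const int appUnitsPerCSSPixel = 60;          // fixed in Gecko
        const double devPixelsPerCSSPixel = 2.0;     // e.g. a 2x HiDPI display
        const int appUnitsPerDevPixel =
            static_cast<int>(appUnitsPerCSSPixel / devPixelsPerCSSPixel); // 30

        const int borderWidth = 3;                   // border width in pixel units
        std::printf("as dev px: %d app units, as CSS px: %d app units\n",
                    borderWidth * appUnitsPerDevPixel,   // 90
                    borderWidth * appUnitsPerCSSPixel);  // 180
        return 0;
    }
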
diff --git a/media/ffvpx/FILES b/media/ffvpx/FILES
index ff8a2da43..a9eb97710 100644
--- a/media/ffvpx/FILES
+++ b/media/ffvpx/FILES
@@ -1,177 +1,218 @@
./COPYING.LGPLv2.1
./COPYING.LGPLv3
+./compat/atomics/win32/stdatomic.h
./compat/va_copy.h
./compat/w32pthreads.h
-./compat/atomics/win32/stdatomic.h
-./libavcodec/audioconvert.c
-./libavcodec/audioconvert.h
+./libavcodec/allcodecs.c
+./libavcodec/arm/flacdsp_arm.S
+./libavcodec/arm/flacdsp_init_arm.c
+./libavcodec/arm/mathops.h
+./libavcodec/avcodec.h
+./libavcodec/avfft.c
+./libavcodec/avfft.h
+./libavcodec/avpacket.c
./libavcodec/avpicture.c
./libavcodec/bit_depth_template.c
+./libavcodec/bitstream.c
+./libavcodec/bitstream_filter.c
+./libavcodec/bitstream_filters.c
+./libavcodec/blockdsp.h
+./libavcodec/bsf.c
+./libavcodec/bsf.h
+./libavcodec/bsf_list.c
+./libavcodec/bytestream.h
+./libavcodec/codec_desc.c
+./libavcodec/dct.h
+./libavcodec/decode.c
+./libavcodec/decode.h
+./libavcodec/error_resilience.h
./libavcodec/fdctdsp.h
+./libavcodec/fft-internal.h
+./libavcodec/fft.h
+./libavcodec/fft_float.c
+./libavcodec/fft_template.c
+./libavcodec/flac.c
+./libavcodec/flac.h
./libavcodec/flacdata.c
./libavcodec/flacdata.h
+./libavcodec/flacdec.c
+./libavcodec/flacdsp.c
+./libavcodec/flacdsp.h
./libavcodec/flacdsp_lpc_template.c
+./libavcodec/flacdsp_template.c
./libavcodec/frame_thread_encoder.h
+./libavcodec/get_bits.h
./libavcodec/golomb.c
+./libavcodec/golomb.h
./libavcodec/h263dsp.h
+./libavcodec/h264chroma.h
+./libavcodec/h264dsp.h
./libavcodec/h264pred.c
./libavcodec/h264pred.h
./libavcodec/h264pred_template.c
+./libavcodec/hpeldsp.h
+./libavcodec/hwaccel.h
+./libavcodec/idctdsp.h
+./libavcodec/imgconvert.c
+./libavcodec/internal.h
./libavcodec/log2_tab.c
+./libavcodec/mathops.h
./libavcodec/mathtables.c
+./libavcodec/me_cmp.h
./libavcodec/motion_est.h
./libavcodec/mpeg12data.h
./libavcodec/mpegpicture.h
./libavcodec/mpegutils.h
+./libavcodec/mpegvideo.h
+./libavcodec/mpegvideodata.h
./libavcodec/mpegvideodsp.h
./libavcodec/mpegvideoencdsp.h
+./libavcodec/null_bsf.c
+./libavcodec/options.c
+./libavcodec/options_table.h
+./libavcodec/parser.c
./libavcodec/parser.h
+./libavcodec/pixblockdsp.h
./libavcodec/profiles.c
./libavcodec/profiles.h
./libavcodec/pthread.c
+./libavcodec/pthread_frame.c
./libavcodec/pthread_internal.h
+./libavcodec/pthread_slice.c
+./libavcodec/put_bits.h
./libavcodec/qpeldsp.h
./libavcodec/qsv_api.c
+./libavcodec/ratecontrol.h
+./libavcodec/raw.c
./libavcodec/raw.h
+./libavcodec/rdft.c
+./libavcodec/rdft.h
./libavcodec/rectangle.h
-./libavcodec/resample.c
-./libavcodec/resample2.c
./libavcodec/reverse.c
./libavcodec/rl.h
./libavcodec/rnd_avg.h
+./libavcodec/thread.h
./libavcodec/unary.h
+./libavcodec/utils.c
+./libavcodec/version.h
+./libavcodec/videodsp.c
+./libavcodec/videodsp.h
./libavcodec/videodsp_template.c
+./libavcodec/vlc.h
./libavcodec/vorbis_parser.c
+./libavcodec/vorbis_parser.h
./libavcodec/vorbis_parser_internal.h
+./libavcodec/vp3dsp.h
+./libavcodec/vp56.h
+./libavcodec/vp56dsp.h
+./libavcodec/vp56rac.c
+./libavcodec/vp8.c
+./libavcodec/vp8.h
+./libavcodec/vp8_parser.c
./libavcodec/vp8data.h
+./libavcodec/vp8dsp.c
+./libavcodec/vp8dsp.h
+./libavcodec/vp9.c
+./libavcodec/vp9.h
+./libavcodec/vp9_mc_template.c
./libavcodec/vp9_parser.c
+./libavcodec/vp9_superframe_split_bsf.c
+./libavcodec/vp9block.c
+./libavcodec/vp9data.c
+./libavcodec/vp9data.h
+./libavcodec/vp9dec.h
+./libavcodec/vp9dsp.c
+./libavcodec/vp9dsp.h
./libavcodec/vp9dsp_10bpp.c
./libavcodec/vp9dsp_12bpp.c
./libavcodec/vp9dsp_8bpp.c
+./libavcodec/vp9dsp_template.c
+./libavcodec/vp9lpf.c
+./libavcodec/vp9mvs.c
+./libavcodec/vp9prob.c
+./libavcodec/vp9recon.c
+./libavcodec/vp9shared.h
./libavcodec/x86/constants.c
-./libavcodec/x86/flacdsp.asm
-./libavcodec/x86/mathops.h
-./libavcodec/x86/vp56_arith.h
-./libavcodec/x86/vp9dsp_init.h
-./libavcodec/x86/vp9dsp_init_10bpp.c
-./libavcodec/x86/vp9dsp_init_12bpp.c
-./libavcodec/x86/vp9intrapred.asm
-./libavcodec/x86/vp9itxfm_16bpp.asm
-./libavcodec/x86/vp9itxfm_template.asm
-./libavcodec/x86/vp9mc_16bpp.asm
-./libavcodec/x86/vp9lpf.asm
-./libavcodec/x86/vp9lpf_16bpp.asm
./libavcodec/x86/constants.h
+./libavcodec/x86/fft.asm
+./libavcodec/x86/fft.h
+./libavcodec/x86/fft_init.c
+./libavcodec/x86/flacdsp.asm
./libavcodec/x86/flacdsp_init.c
./libavcodec/x86/h264_intrapred.asm
./libavcodec/x86/h264_intrapred_10bit.asm
./libavcodec/x86/h264_intrapred_init.c
+./libavcodec/x86/mathops.h
./libavcodec/x86/videodsp.asm
./libavcodec/x86/videodsp_init.c
+./libavcodec/x86/vp56_arith.h
./libavcodec/x86/vp8dsp.asm
./libavcodec/x86/vp8dsp_init.c
./libavcodec/x86/vp8dsp_loopfilter.asm
./libavcodec/x86/vp9dsp_init.c
+./libavcodec/x86/vp9dsp_init.h
+./libavcodec/x86/vp9dsp_init_10bpp.c
+./libavcodec/x86/vp9dsp_init_12bpp.c
./libavcodec/x86/vp9dsp_init_16bpp.c
./libavcodec/x86/vp9dsp_init_16bpp_template.c
+./libavcodec/x86/vp9intrapred.asm
./libavcodec/x86/vp9intrapred_16bpp.asm
./libavcodec/x86/vp9itxfm.asm
+./libavcodec/x86/vp9itxfm_16bpp.asm
+./libavcodec/x86/vp9itxfm_template.asm
+./libavcodec/x86/vp9lpf.asm
+./libavcodec/x86/vp9lpf_16bpp.asm
./libavcodec/x86/vp9mc.asm
+./libavcodec/x86/vp9mc_16bpp.asm
./libavcodec/xiph.c
./libavcodec/xiph.h
-./libavcodec/bsf.h
-./libavcodec/h264dsp.h
-./libavcodec/imgconvert.c
-./libavcodec/bsf.c
-./libavcodec/decode.c
-./libavcodec/decode.h
-./libavcodec/hwaccel.h
-./libavcodec/vp9block.c
-./libavcodec/vp9data.c
-./libavcodec/vp9dec.h
-./libavcodec/vp9lpf.c
-./libavcodec/vp9mvs.c
-./libavcodec/vp9prob.c
-./libavcodec/vp9recon.c
-./libavcodec/vp9shared.h
-./libavcodec/allcodecs.c
-./libavcodec/avcodec.h
-./libavcodec/avpacket.c
-./libavcodec/bitstream.c
-./libavcodec/blockdsp.h
-./libavcodec/bytestream.h
-./libavcodec/codec_desc.c
-./libavcodec/error_resilience.h
-./libavcodec/flac.c
-./libavcodec/flac.h
-./libavcodec/flac_parser.c
-./libavcodec/flacdec.c
-./libavcodec/flacdsp.c
-./libavcodec/flacdsp.h
-./libavcodec/flacdsp_template.c
-./libavcodec/get_bits.h
-./libavcodec/golomb.h
-./libavcodec/h264chroma.h
-./libavcodec/hpeldsp.h
-./libavcodec/idctdsp.h
-./libavcodec/internal.h
-./libavcodec/mathops.h
-./libavcodec/me_cmp.h
-./libavcodec/mpegvideo.h
-./libavcodec/options.c
-./libavcodec/options_table.h
-./libavcodec/parser.c
-./libavcodec/pixblockdsp.h
-./libavcodec/pthread_frame.c
-./libavcodec/pthread_slice.c
-./libavcodec/put_bits.h
-./libavcodec/ratecontrol.h
-./libavcodec/raw.c
-./libavcodec/thread.h
-./libavcodec/utils.c
-./libavcodec/version.h
-./libavcodec/videodsp.c
-./libavcodec/videodsp.h
-./libavcodec/vlc.h
-./libavcodec/vorbis_parser.h
-./libavcodec/vp3dsp.h
-./libavcodec/vp56.h
-./libavcodec/vp56dsp.h
-./libavcodec/vp56rac.c
-./libavcodec/vp8.c
-./libavcodec/vp8.h
-./libavcodec/vp8_parser.c
-./libavcodec/vp8dsp.c
-./libavcodec/vp8dsp.h
-./libavcodec/vp9.c
-./libavcodec/vp9.h
-./libavcodec/vp9_mc_template.c
-./libavcodec/vp9data.h
-./libavcodec/vp9dsp.c
-./libavcodec/vp9dsp.h
-./libavcodec/vp9dsp_template.c
-./libavcodec/bitstream_filters.c
-./libavcodec/bitstream_filter.c
-./libavcodec/bsf_list.c
-./libavcodec/null_bsf.c
./libavutil/adler32.c
-./libavutil/atomic.c
-./libavutil/atomic.h
-./libavutil/atomic_win32.h
+./libavutil/adler32.h
+./libavutil/arm/asm.S
+./libavutil/arm/bswap.h
+./libavutil/arm/cpu.c
+./libavutil/arm/cpu.h
+./libavutil/arm/float_dsp_arm.h
+./libavutil/arm/float_dsp_init_arm.c
+./libavutil/arm/float_dsp_init_neon.c
+./libavutil/arm/float_dsp_init_vfp.c
+./libavutil/arm/float_dsp_neon.S
+./libavutil/arm/float_dsp_vfp.S
+./libavutil/arm/intmath.h
+./libavutil/arm/intreadwrite.h
+./libavutil/arm/timer.h
+./libavutil/attributes.h
+./libavutil/avassert.h
+./libavutil/avconfig.h
+./libavutil/avstring.c
+./libavutil/avstring.h
+./libavutil/avutil.h
./libavutil/avutilres.rc
./libavutil/base64.c
./libavutil/base64.h
./libavutil/bprint.c
./libavutil/bprint.h
./libavutil/bswap.h
+./libavutil/buffer.c
+./libavutil/buffer.h
+./libavutil/buffer_internal.h
+./libavutil/channel_layout.c
+./libavutil/channel_layout.h
./libavutil/color_utils.c
./libavutil/color_utils.h
./libavutil/colorspace.h
./libavutil/common.h
+./libavutil/cpu.c
+./libavutil/cpu.h
+./libavutil/cpu_internal.h
./libavutil/crc.c
+./libavutil/crc.h
+./libavutil/dict.c
./libavutil/dict.h
+./libavutil/dynarray.h
./libavutil/error.c
./libavutil/error.h
+./libavutil/eval.c
./libavutil/eval.h
./libavutil/ffmath.h
./libavutil/ffversion.h
@@ -179,30 +220,78 @@
./libavutil/fifo.h
./libavutil/fixed_dsp.c
./libavutil/fixed_dsp.h
+./libavutil/float_dsp.c
+./libavutil/float_dsp.h
+./libavutil/frame.c
+./libavutil/frame.h
+./libavutil/hwcontext.c
+./libavutil/hwcontext.h
+./libavutil/hwcontext_internal.h
+./libavutil/imgutils.c
+./libavutil/imgutils.h
+./libavutil/imgutils_internal.h
./libavutil/integer.c
./libavutil/integer.h
+./libavutil/internal.h
./libavutil/intfloat.h
./libavutil/intmath.c
./libavutil/intmath.h
+./libavutil/intreadwrite.h
./libavutil/libm.h
./libavutil/lls.c
./libavutil/lls.h
+./libavutil/log.c
+./libavutil/log.h
./libavutil/log2_tab.c
./libavutil/macros.h
+./libavutil/mathematics.c
+./libavutil/mathematics.h
+./libavutil/mem.c
+./libavutil/mem.h
./libavutil/mem_internal.h
+./libavutil/opt.c
+./libavutil/opt.h
+./libavutil/parseutils.c
./libavutil/parseutils.h
+./libavutil/pixdesc.c
+./libavutil/pixdesc.h
./libavutil/pixelutils.c
./libavutil/pixelutils.h
+./libavutil/pixfmt.h
./libavutil/qsort.h
./libavutil/rational.c
+./libavutil/rational.h
./libavutil/replaygain.h
./libavutil/reverse.c
+./libavutil/reverse.h
+./libavutil/samplefmt.c
+./libavutil/samplefmt.h
+./libavutil/slicethread.c
+./libavutil/slicethread.h
+./libavutil/thread.h
+./libavutil/threadmessage.c
./libavutil/threadmessage.h
+./libavutil/time.c
./libavutil/time_internal.h
+./libavutil/timecode.c
+./libavutil/timecode.h
+./libavutil/timer.h
+./libavutil/timestamp.h
+./libavutil/utils.c
+./libavutil/version.h
+./libavutil/x86/asm.h
./libavutil/x86/bswap.h
+./libavutil/x86/cpu.c
+./libavutil/x86/cpu.h
./libavutil/x86/cpuid.asm
+./libavutil/x86/emms.asm
+./libavutil/x86/emms.h
./libavutil/x86/fixed_dsp.asm
./libavutil/x86/fixed_dsp_init.c
+./libavutil/x86/float_dsp.asm
+./libavutil/x86/float_dsp_init.c
+./libavutil/x86/imgutils.asm
+./libavutil/x86/imgutils_init.c
./libavutil/x86/intmath.h
./libavutil/x86/intreadwrite.h
./libavutil/x86/lls.asm
@@ -211,72 +300,5 @@
./libavutil/x86/pixelutils.h
./libavutil/x86/pixelutils_init.c
./libavutil/x86/timer.h
-./libavutil/x86/asm.h
-./libavutil/x86/imgutils.asm
-./libavutil/x86/imgutils_init.c
-./libavutil/x86/cpu.c
-./libavutil/x86/cpu.h
-./libavutil/x86/emms.asm
-./libavutil/x86/emms.h
-./libavutil/x86/float_dsp.asm
-./libavutil/x86/float_dsp_init.c
./libavutil/x86/x86inc.asm
./libavutil/x86/x86util.asm
-./libavutil/adler32.h
-./libavutil/avassert.h
-./libavutil/avconfig.h
-./libavutil/crc.h
-./libavutil/dict.c
-./libavutil/dynarray.h
-./libavutil/log.h
-./libavutil/mathematics.h
-./libavutil/rational.h
-./libavutil/samplefmt.c
-./libavutil/samplefmt.h
-./libavutil/timestamp.h
-./libavutil/opt.c
-./libavutil/imgutils_internal.h
-./libavutil/reverse.h
-./libavutil/slicethread.c
-./libavutil/slicethread.h
-./libavutil/atomic_gcc.h
-./libavutil/attributes.h
-./libavutil/avstring.c
-./libavutil/avstring.h
-./libavutil/avutil.h
-./libavutil/buffer.c
-./libavutil/buffer.h
-./libavutil/buffer_internal.h
-./libavutil/channel_layout.c
-./libavutil/channel_layout.h
-./libavutil/cpu.c
-./libavutil/cpu.h
-./libavutil/cpu_internal.h
-./libavutil/eval.c
-./libavutil/float_dsp.c
-./libavutil/float_dsp.h
-./libavutil/frame.c
-./libavutil/frame.h
-./libavutil/hwcontext.h
-./libavutil/imgutils.c
-./libavutil/imgutils.h
-./libavutil/internal.h
-./libavutil/intreadwrite.h
-./libavutil/log.c
-./libavutil/mathematics.c
-./libavutil/mem.c
-./libavutil/mem.h
-./libavutil/opt.h
-./libavutil/parseutils.c
-./libavutil/pixdesc.c
-./libavutil/pixdesc.h
-./libavutil/pixfmt.h
-./libavutil/thread.h
-./libavutil/threadmessage.c
-./libavutil/time.c
-./libavutil/timecode.c
-./libavutil/timecode.h
-./libavutil/timer.h
-./libavutil/utils.c
-./libavutil/version.h
-
diff --git a/media/ffvpx/README_MCP b/media/ffvpx/README_MCP
index 5ed4d8d94..26834d3e3 100644
--- a/media/ffvpx/README_MCP
+++ b/media/ffvpx/README_MCP
@@ -1,6 +1,6 @@
This directory contains files used in goanna builds from FFmpeg
(http://ffmpeg.org). The current files are from FFmpeg as of
-Release 3.4.2
+Release 4.0.2
All source files match their path from the library's source archive.
Currently, we only use the vp8 and vp9 portion of the library, and only on x86
@@ -8,14 +8,11 @@ based platforms. If this changes, configuration files will most likely
need to be updated.
configuration files were generated as follows using the configure script:
-./configure --disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vda --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm
+./configure --disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm --disable-cuvid --disable-cuda
config*:
replace: /HAVE_(MALLOC_H|ARC4RANDOM|LOCALTIME_R|MEMALIGN|POSIX_MEMALIGN)/d
-config_darwin32.h:
-add to configure command: --disable-asm --disable-x86asm --cc='clang -m32'
-
config_unix32.h:
add to configure command: --disable-asm --disable-x86asm --cc='clang -m32'
replace: s/HAVE_SYSCTL 1/HAVE_SYSCTL 0/ and s/HAVE_MEMALIGN 1/HAVE_MEMALIGN 0/ and s/HAVE_POSIX_MEMALIGN 1/HAVE_POSIX_MEMALIGN 0/
@@ -27,8 +24,8 @@ config_win32/64.h/asm:
add to configure command: --toolchain=msvc
Regenerate defaults_disabled.{h,asm} with:
-$ grep -E ".*_(INDEV|OUTDEV|DECODER|ENCODER|DEMUXER|MUXER|PARSER|FILTER|HWACCEL|PROTOCOL|ENCODERS|DECODERS|HWACCELS|INDEVS|OUTDEVS|FILTERS|DEMUXERS|MUXERS|PROTOCOLS) 0" config.h > ~/Work/Mozilla/mozilla-central/media/ffvpx/defaults_disabled.h
-$ grep -E ".*_(INDEV|OUTDEV|DECODER|ENCODER|DEMUXER|MUXER|PARSER|FILTER|HWACCEL|PROTOCOL|ENCODERS|DECODERS|HWACCELS|INDEVS|OUTDEVS|FILTERS|DEMUXERS|MUXERS|PROTOCOLS) 0" config.asm > ~/Work/Mozilla/mozilla-central/media/ffvpx/defaults_disabled.asm
+$ grep -E ".*_(INDEV|OUTDEV|DECODER|ENCODER|DEMUXER|MUXER|PARSER|FILTER|HWACCEL|PROTOCOL|ENCODERS|DECODERS|HWACCELS|INDEVS|OUTDEVS|FILTERS|DEMUXERS|MUXERS|PROTOCOLS|BSF) 0" config.h | sort -u > ~/REPO/UXP/media/ffvpx/defaults_disabled.h
+$ grep -E ".*_(INDEV|OUTDEV|DECODER|ENCODER|DEMUXER|MUXER|PARSER|FILTER|HWACCEL|PROTOCOL|ENCODERS|DECODERS|HWACCELS|INDEVS|OUTDEVS|FILTERS|DEMUXERS|MUXERS|PROTOCOLS|BSF) 0" config.asm | sort -u > ~/REPO/UXP/media/ffvpx/defaults_disabled.asm
All new decoders/muxers/encoders/... should be added in the list of dummy functions found in libavcodec/dummy_funcs.c
otherwise linkage will fail on Windows. On other platforms they are optimised out and aren't necessary.
@@ -38,6 +35,7 @@ To update the source tree, perform a diff on the files listed in FILES.
The diffs should typically apply to the ffvpx tree.
e.g. something like this would do:
Run in the ffmpeg original tree:
-$ for i in `cat $PATH_CENTRAL/media/ffvpx/FILES`; do diff $REV_LASTSYNC HEAD >> patch.diff; done
+$ for i in `cat $PATH_CENTRAL/media/ffvpx/FILES`; do git diff $REV_LASTSYNC HEAD >> patch.diff; done
Then apply patch.diff on the ffvpx tree.
+
Compilation will reveal if any files are missing.
diff --git a/media/ffvpx/compat/w32pthreads.h b/media/ffvpx/compat/w32pthreads.h
index eeead6051..21acfd2ba 100644
--- a/media/ffvpx/compat/w32pthreads.h
+++ b/media/ffvpx/compat/w32pthreads.h
@@ -39,11 +39,6 @@
#include <windows.h>
#include <process.h>
-#if _WIN32_WINNT < 0x0600 && defined(__MINGW32__)
-#undef MemoryBarrier
-#define MemoryBarrier __sync_synchronize
-#endif
-
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/internal.h"
@@ -56,24 +51,15 @@ typedef struct pthread_t {
void *ret;
} pthread_t;
-/* the conditional variable api for windows 6.0+ uses critical sections and
- * not mutexes */
-typedef CRITICAL_SECTION pthread_mutex_t;
-
-/* This is the CONDITION_VARIABLE typedef for using Windows' native
- * conditional variables on kernels 6.0+. */
-#if HAVE_CONDITION_VARIABLE_PTR
+/* use the lightweight mutex/condition variable API for Windows Vista and later */
+typedef SRWLOCK pthread_mutex_t;
typedef CONDITION_VARIABLE pthread_cond_t;
-#else
-typedef struct pthread_cond_t {
- void *Ptr;
-} pthread_cond_t;
-#endif
-#if _WIN32_WINNT >= 0x0600
+#define PTHREAD_MUTEX_INITIALIZER SRWLOCK_INIT
+#define PTHREAD_COND_INITIALIZER CONDITION_VARIABLE_INIT
+
#define InitializeCriticalSection(x) InitializeCriticalSectionEx(x, 0, 0)
#define WaitForSingleObject(a, b) WaitForSingleObjectEx(a, b, FALSE)
-#endif
static av_unused unsigned __stdcall attribute_align_arg win32thread_worker(void *arg)
{
@@ -114,26 +100,25 @@ static av_unused int pthread_join(pthread_t thread, void **value_ptr)
static inline int pthread_mutex_init(pthread_mutex_t *m, void* attr)
{
- InitializeCriticalSection(m);
+ InitializeSRWLock(m);
return 0;
}
static inline int pthread_mutex_destroy(pthread_mutex_t *m)
{
- DeleteCriticalSection(m);
+ /* Unlocked SRW locks use no resources */
return 0;
}
static inline int pthread_mutex_lock(pthread_mutex_t *m)
{
- EnterCriticalSection(m);
+ AcquireSRWLockExclusive(m);
return 0;
}
static inline int pthread_mutex_unlock(pthread_mutex_t *m)
{
- LeaveCriticalSection(m);
+ ReleaseSRWLockExclusive(m);
return 0;
}
-#if _WIN32_WINNT >= 0x0600
typedef INIT_ONCE pthread_once_t;
#define PTHREAD_ONCE_INIT INIT_ONCE_STATIC_INIT
@@ -167,7 +152,7 @@ static inline int pthread_cond_broadcast(pthread_cond_t *cond)
static inline int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
{
- SleepConditionVariableCS(cond, mutex, INFINITE);
+ SleepConditionVariableSRW(cond, mutex, INFINITE, 0);
return 0;
}
@@ -177,242 +162,4 @@ static inline int pthread_cond_signal(pthread_cond_t *cond)
return 0;
}
-#else // _WIN32_WINNT < 0x0600
-
-/* atomic init state of dynamically loaded functions */
-static LONG w32thread_init_state = 0;
-static av_unused void w32thread_init(void);
-
-/* for pre-Windows 6.0 platforms, define INIT_ONCE struct,
- * compatible to the one used in the native API */
-
-typedef union pthread_once_t {
- void * Ptr; ///< For the Windows 6.0+ native functions
- LONG state; ///< For the pre-Windows 6.0 compat code
-} pthread_once_t;
-
-#define PTHREAD_ONCE_INIT {0}
-
-/* function pointers to init once API on windows 6.0+ kernels */
-static BOOL (WINAPI *initonce_begin)(pthread_once_t *lpInitOnce, DWORD dwFlags, BOOL *fPending, void **lpContext);
-static BOOL (WINAPI *initonce_complete)(pthread_once_t *lpInitOnce, DWORD dwFlags, void *lpContext);
-
-/* pre-Windows 6.0 compat using a spin-lock */
-static inline void w32thread_once_fallback(LONG volatile *state, void (*init_routine)(void))
-{
- switch (InterlockedCompareExchange(state, 1, 0)) {
- /* Initial run */
- case 0:
- init_routine();
- InterlockedExchange(state, 2);
- break;
- /* Another thread is running init */
- case 1:
- while (1) {
- MemoryBarrier();
- if (*state == 2)
- break;
- Sleep(0);
- }
- break;
- /* Initialization complete */
- case 2:
- break;
- }
-}
-
-static av_unused int pthread_once(pthread_once_t *once_control, void (*init_routine)(void))
-{
- w32thread_once_fallback(&w32thread_init_state, w32thread_init);
-
- /* Use native functions on Windows 6.0+ */
- if (initonce_begin && initonce_complete) {
- BOOL pending = FALSE;
- initonce_begin(once_control, 0, &pending, NULL);
- if (pending)
- init_routine();
- initonce_complete(once_control, 0, NULL);
- return 0;
- }
-
- w32thread_once_fallback(&once_control->state, init_routine);
- return 0;
-}
-
-/* for pre-Windows 6.0 platforms we need to define and use our own condition
- * variable and api */
-
-typedef struct win32_cond_t {
- pthread_mutex_t mtx_broadcast;
- pthread_mutex_t mtx_waiter_count;
- volatile int waiter_count;
- HANDLE semaphore;
- HANDLE waiters_done;
- volatile int is_broadcast;
-} win32_cond_t;
-
-/* function pointers to conditional variable API on windows 6.0+ kernels */
-static void (WINAPI *cond_broadcast)(pthread_cond_t *cond);
-static void (WINAPI *cond_init)(pthread_cond_t *cond);
-static void (WINAPI *cond_signal)(pthread_cond_t *cond);
-static BOOL (WINAPI *cond_wait)(pthread_cond_t *cond, pthread_mutex_t *mutex,
- DWORD milliseconds);
-
-static av_unused int pthread_cond_init(pthread_cond_t *cond, const void *unused_attr)
-{
- win32_cond_t *win32_cond = NULL;
-
- w32thread_once_fallback(&w32thread_init_state, w32thread_init);
-
- if (cond_init) {
- cond_init(cond);
- return 0;
- }
-
- /* non native condition variables */
- win32_cond = (win32_cond_t*)av_mallocz(sizeof(win32_cond_t));
- if (!win32_cond)
- return ENOMEM;
- cond->Ptr = win32_cond;
- win32_cond->semaphore = CreateSemaphore(NULL, 0, 0x7fffffff, NULL);
- if (!win32_cond->semaphore)
- return ENOMEM;
- win32_cond->waiters_done = CreateEvent(NULL, TRUE, FALSE, NULL);
- if (!win32_cond->waiters_done)
- return ENOMEM;
-
- pthread_mutex_init(&win32_cond->mtx_waiter_count, NULL);
- pthread_mutex_init(&win32_cond->mtx_broadcast, NULL);
- return 0;
-}
-
-static av_unused int pthread_cond_destroy(pthread_cond_t *cond)
-{
- win32_cond_t *win32_cond = (win32_cond_t*)cond->Ptr;
- /* native condition variables do not destroy */
- if (cond_init)
- return 0;
-
- /* non native condition variables */
- CloseHandle(win32_cond->semaphore);
- CloseHandle(win32_cond->waiters_done);
- pthread_mutex_destroy(&win32_cond->mtx_waiter_count);
- pthread_mutex_destroy(&win32_cond->mtx_broadcast);
- av_freep(&win32_cond);
- cond->Ptr = NULL;
- return 0;
-}
-
-static av_unused int pthread_cond_broadcast(pthread_cond_t *cond)
-{
- win32_cond_t *win32_cond = (win32_cond_t*)cond->Ptr;
- int have_waiter;
-
- if (cond_broadcast) {
- cond_broadcast(cond);
- return 0;
- }
-
- /* non native condition variables */
- pthread_mutex_lock(&win32_cond->mtx_broadcast);
- pthread_mutex_lock(&win32_cond->mtx_waiter_count);
- have_waiter = 0;
-
- if (win32_cond->waiter_count) {
- win32_cond->is_broadcast = 1;
- have_waiter = 1;
- }
-
- if (have_waiter) {
- ReleaseSemaphore(win32_cond->semaphore, win32_cond->waiter_count, NULL);
- pthread_mutex_unlock(&win32_cond->mtx_waiter_count);
- WaitForSingleObject(win32_cond->waiters_done, INFINITE);
- ResetEvent(win32_cond->waiters_done);
- win32_cond->is_broadcast = 0;
- } else
- pthread_mutex_unlock(&win32_cond->mtx_waiter_count);
- pthread_mutex_unlock(&win32_cond->mtx_broadcast);
- return 0;
-}
-
-static av_unused int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
-{
- win32_cond_t *win32_cond = (win32_cond_t*)cond->Ptr;
- int last_waiter;
- if (cond_wait) {
- cond_wait(cond, mutex, INFINITE);
- return 0;
- }
-
- /* non native condition variables */
- pthread_mutex_lock(&win32_cond->mtx_broadcast);
- pthread_mutex_lock(&win32_cond->mtx_waiter_count);
- win32_cond->waiter_count++;
- pthread_mutex_unlock(&win32_cond->mtx_waiter_count);
- pthread_mutex_unlock(&win32_cond->mtx_broadcast);
-
- // unlock the external mutex
- pthread_mutex_unlock(mutex);
- WaitForSingleObject(win32_cond->semaphore, INFINITE);
-
- pthread_mutex_lock(&win32_cond->mtx_waiter_count);
- win32_cond->waiter_count--;
- last_waiter = !win32_cond->waiter_count || !win32_cond->is_broadcast;
- pthread_mutex_unlock(&win32_cond->mtx_waiter_count);
-
- if (last_waiter)
- SetEvent(win32_cond->waiters_done);
-
- // lock the external mutex
- return pthread_mutex_lock(mutex);
-}
-
-static av_unused int pthread_cond_signal(pthread_cond_t *cond)
-{
- win32_cond_t *win32_cond = (win32_cond_t*)cond->Ptr;
- int have_waiter;
- if (cond_signal) {
- cond_signal(cond);
- return 0;
- }
-
- pthread_mutex_lock(&win32_cond->mtx_broadcast);
-
- /* non-native condition variables */
- pthread_mutex_lock(&win32_cond->mtx_waiter_count);
- have_waiter = win32_cond->waiter_count;
- pthread_mutex_unlock(&win32_cond->mtx_waiter_count);
-
- if (have_waiter) {
- ReleaseSemaphore(win32_cond->semaphore, 1, NULL);
- WaitForSingleObject(win32_cond->waiters_done, INFINITE);
- ResetEvent(win32_cond->waiters_done);
- }
-
- pthread_mutex_unlock(&win32_cond->mtx_broadcast);
- return 0;
-}
-#endif
-
-static av_unused void w32thread_init(void)
-{
-#if _WIN32_WINNT < 0x0600
- HMODULE kernel_dll = GetModuleHandle(TEXT("kernel32.dll"));
- /* if one is available, then they should all be available */
- cond_init = (void (WINAPI*)(pthread_cond_t *))
- GetProcAddress(kernel_dll, "InitializeConditionVariable");
- cond_broadcast = (void (WINAPI*)(pthread_cond_t *))
- GetProcAddress(kernel_dll, "WakeAllConditionVariable");
- cond_signal = (void (WINAPI*)(pthread_cond_t *))
- GetProcAddress(kernel_dll, "WakeConditionVariable");
- cond_wait = (BOOL (WINAPI*)(pthread_cond_t *, pthread_mutex_t *, DWORD))
- GetProcAddress(kernel_dll, "SleepConditionVariableCS");
- initonce_begin = (BOOL (WINAPI*)(pthread_once_t *, DWORD, BOOL *, void **))
- GetProcAddress(kernel_dll, "InitOnceBeginInitialize");
- initonce_complete = (BOOL (WINAPI*)(pthread_once_t *, DWORD, void *))
- GetProcAddress(kernel_dll, "InitOnceComplete");
-#endif
-
-}
-
#endif /* COMPAT_W32PTHREADS_H */
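
[Editor's note] The w32pthreads.h hunk above drops the pre-Vista fallback path entirely: pthread_mutex_t now maps to an SRWLOCK and pthread_cond_t to a native CONDITION_VARIABLE, so the runtime GetProcAddress probing and the hand-rolled semaphore-based condition variable are no longer needed. The stand-alone C sketch below is not part of the patch; it only illustrates the same Win32 primitives the new shim forwards to. Identifiers such as g_ready and worker are invented for the example, and it assumes a toolchain targeting Windows Vista or later (_WIN32_WINNT >= 0x0600).

/* Minimal producer/consumer hand-off using the Vista+ SRW lock and
 * condition variable APIs that the patched pthread shim wraps. */
#define _WIN32_WINNT 0x0600   /* SRWLOCK / CONDITION_VARIABLE need Vista+ headers */
#include <windows.h>
#include <process.h>
#include <stdio.h>

static SRWLOCK g_lock = SRWLOCK_INIT;                       /* pthread_mutex_t in the shim */
static CONDITION_VARIABLE g_cond = CONDITION_VARIABLE_INIT; /* pthread_cond_t in the shim  */
static int g_ready = 0;

static unsigned __stdcall worker(void *arg)
{
    (void)arg;
    AcquireSRWLockExclusive(&g_lock);       /* pthread_mutex_lock   */
    g_ready = 1;
    ReleaseSRWLockExclusive(&g_lock);       /* pthread_mutex_unlock */
    WakeConditionVariable(&g_cond);         /* pthread_cond_signal  */
    return 0;
}

int main(void)
{
    HANDLE th = (HANDLE)_beginthreadex(NULL, 0, worker, NULL, 0, NULL);

    AcquireSRWLockExclusive(&g_lock);
    while (!g_ready)                        /* guard against spurious wakeups */
        SleepConditionVariableSRW(&g_cond, &g_lock, INFINITE, 0); /* pthread_cond_wait */
    ReleaseSRWLockExclusive(&g_lock);

    WaitForSingleObject(th, INFINITE);      /* pthread_join */
    CloseHandle(th);
    puts("worker signalled readiness via SRW lock + condition variable");
    return 0;
}

Because SRW locks and condition variables are plain structs with static initializers and hold no kernel handles, there is nothing to free, which is why the patched pthread_mutex_destroy can simply return 0.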
diff --git a/media/ffvpx/config.h b/media/ffvpx/config.h
index dab01e05c..db2f7b42e 100644
--- a/media/ffvpx/config.h
+++ b/media/ffvpx/config.h
@@ -27,11 +27,7 @@
#define HAVE_LIBC_MSVCRT 0
#endif
#elif defined(XP_DARWIN)
-#if defined(HAVE_64BIT_BUILD)
#include "config_darwin64.h"
-#else
-#include "config_darwin32.h"
-#endif
#elif defined(XP_UNIX)
#if defined(HAVE_64BIT_BUILD)
#include "config_unix64.h"
diff --git a/media/ffvpx/config_darwin32.h b/media/ffvpx/config_darwin32.h
deleted file mode 100644
index f92be8737..000000000
--- a/media/ffvpx/config_darwin32.h
+++ /dev/null
@@ -1,654 +0,0 @@
-/* Automatically generated by configure - do not modify! */
-#ifndef FFMPEG_CONFIG_H
-#define FFMPEG_CONFIG_H
-#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vda --disable-vdpau --disable-videotoolbox --enable-asm --enable-yasm --disable-asm --disable-yasm --cc='clang -m32'"
-#define FFMPEG_LICENSE "LGPL version 2.1 or later"
-#define CONFIG_THIS_YEAR 2016
-#define FFMPEG_DATADIR "/usr/local/share/ffmpeg"
-#define AVCONV_DATADIR "/usr/local/share/ffmpeg"
-#define CC_IDENT "Apple LLVM version 7.0.2 (clang-700.1.81)"
-#define av_restrict restrict
-#define EXTERN_PREFIX "_"
-#define EXTERN_ASM _
-#define BUILDSUF ""
-#define SLIBSUF ".dylib"
-#define HAVE_MMX2 HAVE_MMXEXT
-#define SWS_MAX_FILTER_SIZE 256
-#define ARCH_AARCH64 0
-#define ARCH_ALPHA 0
-#define ARCH_ARM 0
-#define ARCH_AVR32 0
-#define ARCH_AVR32_AP 0
-#define ARCH_AVR32_UC 0
-#define ARCH_BFIN 0
-#define ARCH_IA64 0
-#define ARCH_M68K 0
-#define ARCH_MIPS 0
-#define ARCH_MIPS64 0
-#define ARCH_PARISC 0
-#define ARCH_PPC 0
-#define ARCH_PPC64 0
-#define ARCH_S390 0
-#define ARCH_SH4 0
-#define ARCH_SPARC 0
-#define ARCH_SPARC64 0
-#define ARCH_TILEGX 0
-#define ARCH_TILEPRO 0
-#define ARCH_TOMI 0
-#define ARCH_X86 0
-#define ARCH_X86_32 0
-#define ARCH_X86_64 0
-#define HAVE_ARMV5TE 0
-#define HAVE_ARMV6 0
-#define HAVE_ARMV6T2 0
-#define HAVE_ARMV8 0
-#define HAVE_NEON 0
-#define HAVE_VFP 0
-#define HAVE_VFPV3 0
-#define HAVE_SETEND 0
-#define HAVE_ALTIVEC 0
-#define HAVE_DCBZL 0
-#define HAVE_LDBRX 0
-#define HAVE_POWER8 0
-#define HAVE_PPC4XX 0
-#define HAVE_VSX 0
-#define HAVE_AESNI 0
-#define HAVE_AMD3DNOW 0
-#define HAVE_AMD3DNOWEXT 0
-#define HAVE_AVX 0
-#define HAVE_AVX2 0
-#define HAVE_FMA3 0
-#define HAVE_FMA4 0
-#define HAVE_MMX 0
-#define HAVE_MMXEXT 0
-#define HAVE_SSE 0
-#define HAVE_SSE2 0
-#define HAVE_SSE3 0
-#define HAVE_SSE4 0
-#define HAVE_SSE42 0
-#define HAVE_SSSE3 0
-#define HAVE_XOP 0
-#define HAVE_CPUNOP 0
-#define HAVE_I686 0
-#define HAVE_MIPSFPU 0
-#define HAVE_MIPS32R2 0
-#define HAVE_MIPS32R5 0
-#define HAVE_MIPS64R2 0
-#define HAVE_MIPS32R6 0
-#define HAVE_MIPS64R6 0
-#define HAVE_MIPSDSP 0
-#define HAVE_MIPSDSPR2 0
-#define HAVE_MSA 0
-#define HAVE_LOONGSON2 0
-#define HAVE_LOONGSON3 0
-#define HAVE_MMI 0
-#define HAVE_ARMV5TE_EXTERNAL 0
-#define HAVE_ARMV6_EXTERNAL 0
-#define HAVE_ARMV6T2_EXTERNAL 0
-#define HAVE_ARMV8_EXTERNAL 0
-#define HAVE_NEON_EXTERNAL 0
-#define HAVE_VFP_EXTERNAL 0
-#define HAVE_VFPV3_EXTERNAL 0
-#define HAVE_SETEND_EXTERNAL 0
-#define HAVE_ALTIVEC_EXTERNAL 0
-#define HAVE_DCBZL_EXTERNAL 0
-#define HAVE_LDBRX_EXTERNAL 0
-#define HAVE_POWER8_EXTERNAL 0
-#define HAVE_PPC4XX_EXTERNAL 0
-#define HAVE_VSX_EXTERNAL 0
-#define HAVE_AESNI_EXTERNAL 0
-#define HAVE_AMD3DNOW_EXTERNAL 0
-#define HAVE_AMD3DNOWEXT_EXTERNAL 0
-#define HAVE_AVX_EXTERNAL 0
-#define HAVE_AVX2_EXTERNAL 0
-#define HAVE_FMA3_EXTERNAL 0
-#define HAVE_FMA4_EXTERNAL 0
-#define HAVE_MMX_EXTERNAL 0
-#define HAVE_MMXEXT_EXTERNAL 0
-#define HAVE_SSE_EXTERNAL 0
-#define HAVE_SSE2_EXTERNAL 0
-#define HAVE_SSE3_EXTERNAL 0
-#define HAVE_SSE4_EXTERNAL 0
-#define HAVE_SSE42_EXTERNAL 0
-#define HAVE_SSSE3_EXTERNAL 0
-#define HAVE_XOP_EXTERNAL 0
-#define HAVE_CPUNOP_EXTERNAL 0
-#define HAVE_I686_EXTERNAL 0
-#define HAVE_MIPSFPU_EXTERNAL 0
-#define HAVE_MIPS32R2_EXTERNAL 0
-#define HAVE_MIPS32R5_EXTERNAL 0
-#define HAVE_MIPS64R2_EXTERNAL 0
-#define HAVE_MIPS32R6_EXTERNAL 0
-#define HAVE_MIPS64R6_EXTERNAL 0
-#define HAVE_MIPSDSP_EXTERNAL 0
-#define HAVE_MIPSDSPR2_EXTERNAL 0
-#define HAVE_MSA_EXTERNAL 0
-#define HAVE_LOONGSON2_EXTERNAL 0
-#define HAVE_LOONGSON3_EXTERNAL 0
-#define HAVE_MMI_EXTERNAL 0
-#define HAVE_ARMV5TE_INLINE 0
-#define HAVE_ARMV6_INLINE 0
-#define HAVE_ARMV6T2_INLINE 0
-#define HAVE_ARMV8_INLINE 0
-#define HAVE_NEON_INLINE 0
-#define HAVE_VFP_INLINE 0
-#define HAVE_VFPV3_INLINE 0
-#define HAVE_SETEND_INLINE 0
-#define HAVE_ALTIVEC_INLINE 0
-#define HAVE_DCBZL_INLINE 0
-#define HAVE_LDBRX_INLINE 0
-#define HAVE_POWER8_INLINE 0
-#define HAVE_PPC4XX_INLINE 0
-#define HAVE_VSX_INLINE 0
-#define HAVE_AESNI_INLINE 0
-#define HAVE_AMD3DNOW_INLINE 0
-#define HAVE_AMD3DNOWEXT_INLINE 0
-#define HAVE_AVX_INLINE 0
-#define HAVE_AVX2_INLINE 0
-#define HAVE_FMA3_INLINE 0
-#define HAVE_FMA4_INLINE 0
-#define HAVE_MMX_INLINE 0
-#define HAVE_MMXEXT_INLINE 0
-#define HAVE_SSE_INLINE 0
-#define HAVE_SSE2_INLINE 0
-#define HAVE_SSE3_INLINE 0
-#define HAVE_SSE4_INLINE 0
-#define HAVE_SSE42_INLINE 0
-#define HAVE_SSSE3_INLINE 0
-#define HAVE_XOP_INLINE 0
-#define HAVE_CPUNOP_INLINE 0
-#define HAVE_I686_INLINE 0
-#define HAVE_MIPSFPU_INLINE 0
-#define HAVE_MIPS32R2_INLINE 0
-#define HAVE_MIPS32R5_INLINE 0
-#define HAVE_MIPS64R2_INLINE 0
-#define HAVE_MIPS32R6_INLINE 0
-#define HAVE_MIPS64R6_INLINE 0
-#define HAVE_MIPSDSP_INLINE 0
-#define HAVE_MIPSDSPR2_INLINE 0
-#define HAVE_MSA_INLINE 0
-#define HAVE_LOONGSON2_INLINE 0
-#define HAVE_LOONGSON3_INLINE 0
-#define HAVE_MMI_INLINE 0
-#define HAVE_ALIGNED_STACK 0
-#define HAVE_FAST_64BIT 0
-#define HAVE_FAST_CLZ 0
-#define HAVE_FAST_CMOV 0
-#define HAVE_LOCAL_ALIGNED_8 1
-#define HAVE_LOCAL_ALIGNED_16 1
-#define HAVE_LOCAL_ALIGNED_32 1
-#define HAVE_SIMD_ALIGN_16 0
-#define HAVE_ATOMICS_GCC 1
-#define HAVE_ATOMICS_SUNCC 0
-#define HAVE_ATOMICS_WIN32 0
-#define HAVE_ATOMIC_CAS_PTR 0
-#define HAVE_ATOMIC_COMPARE_EXCHANGE 1
-#define HAVE_MACHINE_RW_BARRIER 0
-#define HAVE_MEMORYBARRIER 0
-#define HAVE_MM_EMPTY 1
-#define HAVE_RDTSC 0
-#define HAVE_SARESTART 1
-#define HAVE_SEM_TIMEDWAIT 1
-#define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1
-#define HAVE_CABS 1
-#define HAVE_CEXP 1
-#define HAVE_INLINE_ASM 1
-#define HAVE_SYMVER 1
-#define HAVE_YASM 0
-#define HAVE_BIGENDIAN 0
-#define HAVE_FAST_UNALIGNED 0
-#define HAVE_ALSA_ASOUNDLIB_H 0
-#define HAVE_ALTIVEC_H 0
-#define HAVE_ARPA_INET_H 1
-#define HAVE_ASM_TYPES_H 0
-#define HAVE_CDIO_PARANOIA_H 0
-#define HAVE_CDIO_PARANOIA_PARANOIA_H 0
-#define HAVE_DISPATCH_DISPATCH_H 0
-#define HAVE_DEV_BKTR_IOCTL_BT848_H 0
-#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
-#define HAVE_DEV_IC_BT8XX_H 0
-#define HAVE_DEV_VIDEO_BKTR_IOCTL_BT848_H 0
-#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
-#define HAVE_DIRECT_H 0
-#define HAVE_DIRENT_H 1
-#define HAVE_DLFCN_H 1
-#define HAVE_D3D11_H 0
-#define HAVE_DXVA_H 0
-#define HAVE_ES2_GL_H 0
-#define HAVE_GSM_H 0
-#define HAVE_IO_H 0
-#define HAVE_MACH_MACH_TIME_H 1
-#define HAVE_MACHINE_IOCTL_BT848_H 0
-#define HAVE_MACHINE_IOCTL_METEOR_H 0
-#define HAVE_OPENCV2_CORE_CORE_C_H 0
-#define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-#define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
-#define HAVE_OPENGL_GL3_H 0
-#define HAVE_POLL_H 1
-#define HAVE_SNDIO_H 0
-#define HAVE_SOUNDCARD_H 0
-#define HAVE_SYS_MMAN_H 1
-#define HAVE_SYS_PARAM_H 1
-#define HAVE_SYS_RESOURCE_H 1
-#define HAVE_SYS_SELECT_H 1
-#define HAVE_SYS_SOUNDCARD_H 0
-#define HAVE_SYS_TIME_H 1
-#define HAVE_SYS_UN_H 1
-#define HAVE_SYS_VIDEOIO_H 0
-#define HAVE_TERMIOS_H 1
-#define HAVE_UDPLITE_H 0
-#define HAVE_UNISTD_H 1
-#define HAVE_VALGRIND_VALGRIND_H 0
-#define HAVE_WINDOWS_H 0
-#define HAVE_WINSOCK2_H 0
-#define HAVE_INTRINSICS_NEON 0
-#define HAVE_ATANF 1
-#define HAVE_ATAN2F 1
-#define HAVE_CBRT 1
-#define HAVE_CBRTF 1
-#define HAVE_COPYSIGN 1
-#define HAVE_COSF 1
-#define HAVE_ERF 1
-#define HAVE_EXP2 1
-#define HAVE_EXP2F 1
-#define HAVE_EXPF 1
-#define HAVE_HYPOT 1
-#define HAVE_ISFINITE 1
-#define HAVE_ISINF 1
-#define HAVE_ISNAN 1
-#define HAVE_LDEXPF 1
-#define HAVE_LLRINT 1
-#define HAVE_LLRINTF 1
-#define HAVE_LOG2 1
-#define HAVE_LOG2F 1
-#define HAVE_LOG10F 1
-#define HAVE_LRINT 1
-#define HAVE_LRINTF 1
-#define HAVE_POWF 1
-#define HAVE_RINT 1
-#define HAVE_ROUND 1
-#define HAVE_ROUNDF 1
-#define HAVE_SINF 1
-#define HAVE_TRUNC 1
-#define HAVE_TRUNCF 1
-#define HAVE_ACCESS 1
-#define HAVE_ALIGNED_MALLOC 0
-#define HAVE_CLOCK_GETTIME 0
-#define HAVE_CLOSESOCKET 0
-#define HAVE_COMMANDLINETOARGVW 0
-#define HAVE_COTASKMEMFREE 0
-#define HAVE_CRYPTGENRANDOM 0
-#define HAVE_DLOPEN 1
-#define HAVE_FCNTL 1
-#define HAVE_FLT_LIM 1
-#define HAVE_FORK 1
-#define HAVE_GETADDRINFO 1
-#define HAVE_GETHRTIME 0
-#define HAVE_GETOPT 1
-#define HAVE_GETPROCESSAFFINITYMASK 0
-#define HAVE_GETPROCESSMEMORYINFO 0
-#define HAVE_GETPROCESSTIMES 0
-#define HAVE_GETRUSAGE 1
-#define HAVE_GETSYSTEMTIMEASFILETIME 0
-#define HAVE_GETTIMEOFDAY 1
-#define HAVE_GLOB 1
-#define HAVE_GLXGETPROCADDRESS 0
-#define HAVE_GMTIME_R 1
-#define HAVE_INET_ATON 1
-#define HAVE_ISATTY 1
-#define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
-#define HAVE_KBHIT 0
-#define HAVE_LOADLIBRARY 0
-#define HAVE_LSTAT 1
-#define HAVE_LZO1X_999_COMPRESS 0
-#define HAVE_MACH_ABSOLUTE_TIME 1
-#define HAVE_MAPVIEWOFFILE 0
-#define HAVE_MKSTEMP 1
-#define HAVE_MMAP 1
-#define HAVE_MPROTECT 1
-#define HAVE_NANOSLEEP 1
-#define HAVE_PEEKNAMEDPIPE 0
-#define HAVE_PTHREAD_CANCEL 1
-#define HAVE_SCHED_GETAFFINITY 0
-#define HAVE_SETCONSOLETEXTATTRIBUTE 0
-#define HAVE_SETCONSOLECTRLHANDLER 0
-#define HAVE_SETMODE 0
-#define HAVE_SETRLIMIT 1
-#define HAVE_SLEEP 0
-#define HAVE_STRERROR_R 1
-#define HAVE_SYSCONF 1
-#define HAVE_USLEEP 1
-#define HAVE_UTGETOSTYPEFROMSTRING 1
-#define HAVE_VIRTUALALLOC 0
-#define HAVE_WGLGETPROCADDRESS 0
-#define HAVE_PTHREADS 1
-#define HAVE_OS2THREADS 0
-#define HAVE_W32THREADS 0
-#define HAVE_AS_DN_DIRECTIVE 0
-#define HAVE_AS_FUNC 0
-#define HAVE_AS_OBJECT_ARCH 0
-#define HAVE_ASM_MOD_Q 0
-#define HAVE_ATTRIBUTE_MAY_ALIAS 1
-#define HAVE_ATTRIBUTE_PACKED 1
-#define HAVE_EBP_AVAILABLE 1
-#define HAVE_EBX_AVAILABLE 1
-#define HAVE_GNU_AS 0
-#define HAVE_GNU_WINDRES 0
-#define HAVE_IBM_ASM 0
-#define HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS 1
-#define HAVE_INLINE_ASM_LABELS 1
-#define HAVE_INLINE_ASM_NONLOCAL_LABELS 1
-#define HAVE_PRAGMA_DEPRECATED 1
-#define HAVE_RSYNC_CONTIMEOUT 0
-#define HAVE_SYMVER_ASM_LABEL 1
-#define HAVE_SYMVER_GNU_ASM 0
-#define HAVE_VFP_ARGS 0
-#define HAVE_XFORM_ASM 0
-#define HAVE_XMM_CLOBBERS 1
-#define HAVE_CONDITION_VARIABLE_PTR 0
-#define HAVE_SOCKLEN_T 1
-#define HAVE_STRUCT_ADDRINFO 1
-#define HAVE_STRUCT_GROUP_SOURCE_REQ 1
-#define HAVE_STRUCT_IP_MREQ_SOURCE 1
-#define HAVE_STRUCT_IPV6_MREQ 1
-#define HAVE_STRUCT_MSGHDR_MSG_FLAGS 1
-#define HAVE_STRUCT_POLLFD 1
-#define HAVE_STRUCT_RUSAGE_RU_MAXRSS 1
-#define HAVE_STRUCT_SCTP_EVENT_SUBSCRIBE 0
-#define HAVE_STRUCT_SOCKADDR_IN6 1
-#define HAVE_STRUCT_SOCKADDR_SA_LEN 1
-#define HAVE_STRUCT_SOCKADDR_STORAGE 1
-#define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 0
-#define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 0
-#define HAVE_ATOMICS_NATIVE 1
-#define HAVE_DOS_PATHS 0
-#define HAVE_DXVA2_LIB 0
-#define HAVE_DXVA2API_COBJ 0
-#define HAVE_LIBC_MSVCRT 0
-#define HAVE_LIBDC1394_1 0
-#define HAVE_LIBDC1394_2 0
-#define HAVE_MAKEINFO 1
-#define HAVE_MAKEINFO_HTML 1
-#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
-#define HAVE_PERL 1
-#define HAVE_POD2MAN 1
-#define HAVE_SDL2 0
-#define HAVE_SECTION_DATA_REL_RO 0
-#define HAVE_TEXI2HTML 0
-#define HAVE_THREADS 1
-#define HAVE_VAAPI_DRM 0
-#define HAVE_VAAPI_X11 0
-#define HAVE_VDPAU_X11 0
-#define HAVE_WINRT 0
-#define HAVE_XLIB 0
-#define CONFIG_BSFS 0
-#define CONFIG_DECODERS 1
-#define CONFIG_PARSERS 1
-#define CONFIG_DOC 0
-#define CONFIG_HTMLPAGES 1
-#define CONFIG_MANPAGES 1
-#define CONFIG_PODPAGES 1
-#define CONFIG_TXTPAGES 1
-#define CONFIG_AVIO_DIR_CMD_EXAMPLE 1
-#define CONFIG_AVIO_READING_EXAMPLE 1
-#define CONFIG_DECODING_ENCODING_EXAMPLE 0
-#define CONFIG_DEMUXING_DECODING_EXAMPLE 0
-#define CONFIG_EXTRACT_MVS_EXAMPLE 0
-#define CONFIG_FILTER_AUDIO_EXAMPLE 0
-#define CONFIG_FILTERING_AUDIO_EXAMPLE 0
-#define CONFIG_FILTERING_VIDEO_EXAMPLE 0
-#define CONFIG_HTTP_MULTICLIENT_EXAMPLE 0
-#define CONFIG_METADATA_EXAMPLE 0
-#define CONFIG_MUXING_EXAMPLE 0
-#define CONFIG_QSVDEC_EXAMPLE 0
-#define CONFIG_REMUXING_EXAMPLE 0
-#define CONFIG_RESAMPLING_AUDIO_EXAMPLE 0
-#define CONFIG_SCALING_VIDEO_EXAMPLE 0
-#define CONFIG_TRANSCODE_AAC_EXAMPLE 0
-#define CONFIG_TRANSCODING_EXAMPLE 0
-#define CONFIG_AVISYNTH 0
-#define CONFIG_BZLIB 0
-#define CONFIG_CHROMAPRINT 0
-#define CONFIG_CRYSTALHD 0
-#define CONFIG_DECKLINK 0
-#define CONFIG_FREI0R 0
-#define CONFIG_GCRYPT 0
-#define CONFIG_GMP 0
-#define CONFIG_GNUTLS 0
-#define CONFIG_ICONV 0
-#define CONFIG_JNI 0
-#define CONFIG_LADSPA 0
-#define CONFIG_LIBASS 0
-#define CONFIG_LIBBLURAY 0
-#define CONFIG_LIBBS2B 0
-#define CONFIG_LIBCACA 0
-#define CONFIG_LIBCDIO 0
-#define CONFIG_LIBCELT 0
-#define CONFIG_LIBDC1394 0
-#define CONFIG_LIBEBUR128 0
-#define CONFIG_LIBFDK_AAC 0
-#define CONFIG_LIBFLITE 0
-#define CONFIG_LIBFONTCONFIG 0
-#define CONFIG_LIBFREETYPE 0
-#define CONFIG_LIBFRIBIDI 0
-#define CONFIG_LIBGME 0
-#define CONFIG_LIBGSM 0
-#define CONFIG_LIBIEC61883 0
-#define CONFIG_LIBILBC 0
-#define CONFIG_LIBKVAZAAR 0
-#define CONFIG_LIBMODPLUG 0
-#define CONFIG_LIBMP3LAME 0
-#define CONFIG_LIBNUT 0
-#define CONFIG_LIBOPENCORE_AMRNB 0
-#define CONFIG_LIBOPENCORE_AMRWB 0
-#define CONFIG_LIBOPENCV 0
-#define CONFIG_LIBOPENH264 0
-#define CONFIG_LIBOPENJPEG 0
-#define CONFIG_LIBOPENMPT 0
-#define CONFIG_LIBOPUS 0
-#define CONFIG_LIBPULSE 0
-#define CONFIG_LIBRTMP 0
-#define CONFIG_LIBRUBBERBAND 0
-#define CONFIG_LIBSCHROEDINGER 0
-#define CONFIG_LIBSHINE 0
-#define CONFIG_LIBSMBCLIENT 0
-#define CONFIG_LIBSNAPPY 0
-#define CONFIG_LIBSOXR 0
-#define CONFIG_LIBSPEEX 0
-#define CONFIG_LIBSSH 0
-#define CONFIG_LIBTESSERACT 0
-#define CONFIG_LIBTHEORA 0
-#define CONFIG_LIBTWOLAME 0
-#define CONFIG_LIBV4L2 0
-#define CONFIG_LIBVIDSTAB 0
-#define CONFIG_LIBVO_AMRWBENC 0
-#define CONFIG_LIBVORBIS 0
-#define CONFIG_LIBVPX 0
-#define CONFIG_LIBWAVPACK 0
-#define CONFIG_LIBWEBP 0
-#define CONFIG_LIBX264 0
-#define CONFIG_LIBX265 0
-#define CONFIG_LIBXAVS 0
-#define CONFIG_LIBXCB 0
-#define CONFIG_LIBXCB_SHM 0
-#define CONFIG_LIBXCB_SHAPE 0
-#define CONFIG_LIBXCB_XFIXES 0
-#define CONFIG_LIBXVID 0
-#define CONFIG_LIBZIMG 0
-#define CONFIG_LIBZMQ 0
-#define CONFIG_LIBZVBI 0
-#define CONFIG_LZMA 0
-#define CONFIG_MEDIACODEC 0
-#define CONFIG_NETCDF 0
-#define CONFIG_OPENAL 0
-#define CONFIG_OPENCL 0
-#define CONFIG_OPENGL 0
-#define CONFIG_OPENSSL 0
-#define CONFIG_SCHANNEL 0
-#define CONFIG_SDL 0
-#define CONFIG_SDL2 0
-#define CONFIG_SECURETRANSPORT 0
-#define CONFIG_VIDEOTOOLBOX 0
-#define CONFIG_X11GRAB 0
-#define CONFIG_XLIB 0
-#define CONFIG_ZLIB 0
-#define CONFIG_AUDIOTOOLBOX 0
-#define CONFIG_CUDA 0
-#define CONFIG_CUVID 0
-#define CONFIG_D3D11VA 0
-#define CONFIG_DXVA2 0
-#define CONFIG_LIBMFX 0
-#define CONFIG_LIBNPP 0
-#define CONFIG_MMAL 0
-#define CONFIG_NVENC 0
-#define CONFIG_OMX 0
-#define CONFIG_VAAPI 0
-#define CONFIG_VDA 0
-#define CONFIG_VDPAU 0
-#define CONFIG_XVMC 0
-#define CONFIG_FTRAPV 0
-#define CONFIG_GRAY 0
-#define CONFIG_HARDCODED_TABLES 0
-#define CONFIG_OMX_RPI 0
-#define CONFIG_RUNTIME_CPUDETECT 1
-#define CONFIG_SAFE_BITSTREAM_READER 1
-#define CONFIG_SHARED 1
-#define CONFIG_SMALL 0
-#define CONFIG_STATIC 0
-#define CONFIG_SWSCALE_ALPHA 1
-#define CONFIG_GPL 0
-#define CONFIG_NONFREE 0
-#define CONFIG_VERSION3 0
-#define CONFIG_AVCODEC 1
-#define CONFIG_AVDEVICE 0
-#define CONFIG_AVFILTER 0
-#define CONFIG_AVFORMAT 0
-#define CONFIG_AVRESAMPLE 0
-#define CONFIG_AVUTIL 1
-#define CONFIG_POSTPROC 0
-#define CONFIG_SWRESAMPLE 0
-#define CONFIG_SWSCALE 0
-#define CONFIG_FFPLAY 0
-#define CONFIG_FFPROBE 0
-#define CONFIG_FFSERVER 0
-#define CONFIG_FFMPEG 0
-#define CONFIG_DCT 0
-#define CONFIG_DWT 0
-#define CONFIG_ERROR_RESILIENCE 0
-#define CONFIG_FAAN 1
-#define CONFIG_FAST_UNALIGNED 0
-#define CONFIG_FFT 0
-#define CONFIG_LSP 0
-#define CONFIG_LZO 0
-#define CONFIG_MDCT 0
-#define CONFIG_PIXELUTILS 0
-#define CONFIG_NETWORK 0
-#define CONFIG_RDFT 0
-#define CONFIG_FONTCONFIG 0
-#define CONFIG_MEMALIGN_HACK 0
-#define CONFIG_MEMORY_POISONING 0
-#define CONFIG_NEON_CLOBBER_TEST 0
-#define CONFIG_PIC 1
-#define CONFIG_POD2MAN 1
-#define CONFIG_RAISE_MAJOR 0
-#define CONFIG_THUMB 0
-#define CONFIG_VALGRIND_BACKTRACE 0
-#define CONFIG_XMM_CLOBBER_TEST 0
-#define CONFIG_AANDCTTABLES 0
-#define CONFIG_AC3DSP 0
-#define CONFIG_AUDIO_FRAME_QUEUE 0
-#define CONFIG_AUDIODSP 0
-#define CONFIG_BLOCKDSP 0
-#define CONFIG_BSWAPDSP 0
-#define CONFIG_CABAC 0
-#define CONFIG_DIRAC_PARSE 0
-#define CONFIG_DVPROFILE 0
-#define CONFIG_EXIF 0
-#define CONFIG_FAANDCT 0
-#define CONFIG_FAANIDCT 0
-#define CONFIG_FDCTDSP 0
-#define CONFIG_FLACDSP 1
-#define CONFIG_FMTCONVERT 0
-#define CONFIG_G722DSP 0
-#define CONFIG_GOLOMB 1
-#define CONFIG_GPLV3 0
-#define CONFIG_H263DSP 0
-#define CONFIG_H264CHROMA 0
-#define CONFIG_H264DSP 0
-#define CONFIG_H264PRED 1
-#define CONFIG_H264QPEL 0
-#define CONFIG_HPELDSP 0
-#define CONFIG_HUFFMAN 0
-#define CONFIG_HUFFYUVDSP 0
-#define CONFIG_HUFFYUVENCDSP 0
-#define CONFIG_IDCTDSP 0
-#define CONFIG_IIRFILTER 0
-#define CONFIG_IMDCT15 0
-#define CONFIG_INTRAX8 0
-#define CONFIG_ISO_MEDIA 0
-#define CONFIG_IVIDSP 0
-#define CONFIG_JPEGTABLES 0
-#define CONFIG_LGPLV3 0
-#define CONFIG_LIBX262 0
-#define CONFIG_LLAUDDSP 0
-#define CONFIG_LLVIDDSP 0
-#define CONFIG_LPC 0
-#define CONFIG_LZF 0
-#define CONFIG_ME_CMP 0
-#define CONFIG_MPEG_ER 0
-#define CONFIG_MPEGAUDIO 0
-#define CONFIG_MPEGAUDIODSP 0
-#define CONFIG_MPEGVIDEO 0
-#define CONFIG_MPEGVIDEOENC 0
-#define CONFIG_MSS34DSP 0
-#define CONFIG_PIXBLOCKDSP 0
-#define CONFIG_QPELDSP 0
-#define CONFIG_QSV 0
-#define CONFIG_QSVDEC 0
-#define CONFIG_QSVENC 0
-#define CONFIG_RANGECODER 0
-#define CONFIG_RIFFDEC 0
-#define CONFIG_RIFFENC 0
-#define CONFIG_RTPDEC 0
-#define CONFIG_RTPENC_CHAIN 0
-#define CONFIG_RV34DSP 0
-#define CONFIG_SINEWIN 0
-#define CONFIG_SNAPPY 0
-#define CONFIG_SRTP 0
-#define CONFIG_STARTCODE 0
-#define CONFIG_TEXTUREDSP 0
-#define CONFIG_TEXTUREDSPENC 0
-#define CONFIG_TPELDSP 0
-#define CONFIG_VAAPI_ENCODE 0
-#define CONFIG_VC1DSP 0
-#define CONFIG_VIDEODSP 1
-#define CONFIG_VP3DSP 0
-#define CONFIG_VP56DSP 0
-#define CONFIG_VP8DSP 1
-#define CONFIG_VT_BT2020 0
-#define CONFIG_WMA_FREQS 0
-#define CONFIG_WMV2DSP 0
-#define CONFIG_AAC_ADTSTOASC_BSF 0
-#define CONFIG_CHOMP_BSF 0
-#define CONFIG_DUMP_EXTRADATA_BSF 0
-#define CONFIG_DCA_CORE_BSF 0
-#define CONFIG_H264_MP4TOANNEXB_BSF 0
-#define CONFIG_HEVC_MP4TOANNEXB_BSF 0
-#define CONFIG_IMX_DUMP_HEADER_BSF 0
-#define CONFIG_MJPEG2JPEG_BSF 0
-#define CONFIG_MJPEGA_DUMP_HEADER_BSF 0
-#define CONFIG_MP3_HEADER_DECOMPRESS_BSF 0
-#define CONFIG_MPEG4_UNPACK_BFRAMES_BSF 0
-#define CONFIG_MOV2TEXTSUB_BSF 0
-#define CONFIG_NOISE_BSF 0
-#define CONFIG_REMOVE_EXTRADATA_BSF 0
-#define CONFIG_TEXT2MOVSUB_BSF 0
-#define CONFIG_VP9_SUPERFRAME_BSF 0
-#define CONFIG_VP8_DECODER 1
-#define CONFIG_VP9_DECODER 1
-#define CONFIG_FLAC_DECODER 1
-#define CONFIG_FLAC_PARSER 1
-#define CONFIG_VP8_PARSER 1
-#define CONFIG_VP9_PARSER 1
-#endif /* FFMPEG_CONFIG_H */
diff --git a/media/ffvpx/config_darwin64.asm b/media/ffvpx/config_darwin64.asm
index 7eccf378e..bfaa6f05f 100644
--- a/media/ffvpx/config_darwin64.asm
+++ b/media/ffvpx/config_darwin64.asm
@@ -1,3 +1,4 @@
+; Automatically generated by configure - do not modify!
%define ARCH_AARCH64 0
%define ARCH_ALPHA 0
%define ARCH_ARM 0
@@ -41,6 +42,7 @@
%define HAVE_AMD3DNOWEXT 1
%define HAVE_AVX 1
%define HAVE_AVX2 1
+%define HAVE_AVX512 1
%define HAVE_FMA3 1
%define HAVE_FMA4 1
%define HAVE_MMX 1
@@ -52,7 +54,7 @@
%define HAVE_SSE42 1
%define HAVE_SSSE3 1
%define HAVE_XOP 1
-%define HAVE_CPUNOP 1
+%define HAVE_CPUNOP 0
%define HAVE_I686 1
%define HAVE_MIPSFPU 0
%define HAVE_MIPS32R2 0
@@ -85,6 +87,7 @@
%define HAVE_AMD3DNOWEXT_EXTERNAL 1
%define HAVE_AVX_EXTERNAL 1
%define HAVE_AVX2_EXTERNAL 1
+%define HAVE_AVX512_EXTERNAL 1
%define HAVE_FMA3_EXTERNAL 1
%define HAVE_FMA4_EXTERNAL 1
%define HAVE_MMX_EXTERNAL 1
@@ -129,6 +132,7 @@
%define HAVE_AMD3DNOWEXT_INLINE 1
%define HAVE_AVX_INLINE 1
%define HAVE_AVX2_INLINE 1
+%define HAVE_AVX512_INLINE 1
%define HAVE_FMA3_INLINE 1
%define HAVE_FMA4_INLINE 1
%define HAVE_MMX_INLINE 1
@@ -158,20 +162,15 @@
%define HAVE_FAST_64BIT 1
%define HAVE_FAST_CLZ 1
%define HAVE_FAST_CMOV 1
-%define HAVE_LOCAL_ALIGNED_8 1
-%define HAVE_LOCAL_ALIGNED_16 1
-%define HAVE_LOCAL_ALIGNED_32 1
+%define HAVE_LOCAL_ALIGNED 1
%define HAVE_SIMD_ALIGN_16 1
%define HAVE_SIMD_ALIGN_32 1
-%define HAVE_ATOMICS_GCC 1
-%define HAVE_ATOMICS_SUNCC 0
-%define HAVE_ATOMICS_WIN32 0
+%define HAVE_SIMD_ALIGN_64 1
%define HAVE_ATOMIC_CAS_PTR 0
%define HAVE_MACHINE_RW_BARRIER 0
%define HAVE_MEMORYBARRIER 0
%define HAVE_MM_EMPTY 1
%define HAVE_RDTSC 0
-%define HAVE_SARESTART 1
%define HAVE_SEM_TIMEDWAIT 0
%define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1
%define HAVE_CABS 1
@@ -181,13 +180,11 @@
%define HAVE_X86ASM 1
%define HAVE_BIGENDIAN 0
%define HAVE_FAST_UNALIGNED 1
-%define HAVE_ALTIVEC_H 0
%define HAVE_ARPA_INET_H 1
%define HAVE_ASM_TYPES_H 0
%define HAVE_CDIO_PARANOIA_H 0
%define HAVE_CDIO_PARANOIA_PARANOIA_H 0
%define HAVE_CUDA_H 0
-%define HAVE_D3D11_H 0
%define HAVE_DISPATCH_DISPATCH_H 1
%define HAVE_DEV_BKTR_IOCTL_BT848_H 0
%define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -196,26 +193,17 @@
%define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
%define HAVE_DIRECT_H 0
%define HAVE_DIRENT_H 1
-%define HAVE_DLFCN_H 1
%define HAVE_DXGIDEBUG_H 0
%define HAVE_DXVA_H 0
%define HAVE_ES2_GL_H 0
%define HAVE_GSM_H 0
%define HAVE_IO_H 0
-%define HAVE_MACH_MACH_TIME_H 1
+%define HAVE_LINUX_PERF_EVENT_H 0
%define HAVE_MACHINE_IOCTL_BT848_H 0
%define HAVE_MACHINE_IOCTL_METEOR_H 0
%define HAVE_OPENCV2_CORE_CORE_C_H 0
-%define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-%define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
%define HAVE_OPENGL_GL3_H 0
%define HAVE_POLL_H 1
-%define HAVE_SOUNDCARD_H 0
-%define HAVE_STDATOMIC_H 1
-%define HAVE_SYS_MMAN_H 1
%define HAVE_SYS_PARAM_H 1
%define HAVE_SYS_RESOURCE_H 1
%define HAVE_SYS_SELECT_H 1
@@ -259,16 +247,19 @@
%define HAVE_SINF 1
%define HAVE_TRUNC 1
%define HAVE_TRUNCF 1
+%define HAVE_DOS_PATHS 0
+%define HAVE_LIBC_MSVCRT 0
+%define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+%define HAVE_SECTION_DATA_REL_RO 0
+%define HAVE_THREADS 1
+%define HAVE_UWP 0
+%define HAVE_WINRT 0
%define HAVE_ACCESS 1
%define HAVE_ALIGNED_MALLOC 0
%define HAVE_CLOCK_GETTIME 1
%define HAVE_CLOSESOCKET 0
%define HAVE_COMMANDLINETOARGVW 0
-%define HAVE_COTASKMEMFREE 0
-%define HAVE_CRYPTGENRANDOM 0
%define HAVE_FCNTL 1
-%define HAVE_FLT_LIM 1
-%define HAVE_FORK 1
%define HAVE_GETADDRINFO 1
%define HAVE_GETHRTIME 0
%define HAVE_GETOPT 1
@@ -283,9 +274,7 @@
%define HAVE_GMTIME_R 1
%define HAVE_INET_ATON 1
%define HAVE_ISATTY 1
-%define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
%define HAVE_KBHIT 0
-%define HAVE_LOADLIBRARY 0
%define HAVE_LSTAT 1
%define HAVE_LZO1X_999_COMPRESS 0
%define HAVE_MACH_ABSOLUTE_TIME 1
@@ -297,6 +286,7 @@
%define HAVE_PEEKNAMEDPIPE 0
%define HAVE_PTHREAD_CANCEL 1
%define HAVE_SCHED_GETAFFINITY 0
+%define HAVE_SECITEMIMPORT 0
%define HAVE_SETCONSOLETEXTATTRIBUTE 0
%define HAVE_SETCONSOLECTRLHANDLER 0
%define HAVE_SETMODE 0
@@ -309,16 +299,19 @@
%define HAVE_UTGETOSTYPEFROMSTRING 0
%define HAVE_VIRTUALALLOC 0
%define HAVE_WGLGETPROCADDRESS 0
+%define HAVE_BCRYPT 0
+%define HAVE_VAAPI_DRM 0
+%define HAVE_VAAPI_X11 0
+%define HAVE_VDPAU_X11 0
%define HAVE_PTHREADS 1
%define HAVE_OS2THREADS 0
%define HAVE_W32THREADS 0
+%define HAVE_AS_ARCH_DIRECTIVE 0
%define HAVE_AS_DN_DIRECTIVE 0
%define HAVE_AS_FPU_DIRECTIVE 0
%define HAVE_AS_FUNC 0
%define HAVE_AS_OBJECT_ARCH 0
%define HAVE_ASM_MOD_Q 0
-%define HAVE_ATTRIBUTE_MAY_ALIAS 1
-%define HAVE_ATTRIBUTE_PACKED 1
%define HAVE_BLOCKS_EXTENSION 1
%define HAVE_EBP_AVAILABLE 1
%define HAVE_EBX_AVAILABLE 1
@@ -335,7 +328,6 @@
%define HAVE_VFP_ARGS 0
%define HAVE_XFORM_ASM 0
%define HAVE_XMM_CLOBBERS 1
-%define HAVE_CONDITION_VARIABLE_PTR 0
%define HAVE_KCMVIDEOCODECTYPE_HEVC 0
%define HAVE_SOCKLEN_T 1
%define HAVE_STRUCT_ADDRINFO 1
@@ -351,24 +343,19 @@
%define HAVE_STRUCT_SOCKADDR_STORAGE 1
%define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 0
%define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 0
-%define HAVE_ATOMICS_NATIVE 1
-%define HAVE_DOS_PATHS 0
-%define HAVE_LIBC_MSVCRT 0
%define HAVE_MAKEINFO 1
%define HAVE_MAKEINFO_HTML 0
-%define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+%define HAVE_OPENCL_D3D11 0
+%define HAVE_OPENCL_DRM_ARM 0
+%define HAVE_OPENCL_DRM_BEIGNET 0
+%define HAVE_OPENCL_DXVA2 0
+%define HAVE_OPENCL_VAAPI_BEIGNET 0
+%define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
%define HAVE_PERL 1
%define HAVE_POD2MAN 1
-%define HAVE_SECTION_DATA_REL_RO 0
-%define HAVE_TEXI2HTML 0
-%define HAVE_THREADS 1
-%define HAVE_UWP 0
-%define HAVE_VAAPI_DRM 0
-%define HAVE_VAAPI_X11 0
-%define HAVE_VDPAU_X11 0
-%define HAVE_WINRT 0
+%define HAVE_TEXI2HTML 1
%define CONFIG_DOC 0
-%define CONFIG_HTMLPAGES 0
+%define CONFIG_HTMLPAGES 1
%define CONFIG_MANPAGES 1
%define CONFIG_PODPAGES 1
%define CONFIG_TXTPAGES 1
@@ -393,24 +380,8 @@
%define CONFIG_SCALING_VIDEO_EXAMPLE 0
%define CONFIG_TRANSCODE_AAC_EXAMPLE 0
%define CONFIG_TRANSCODING_EXAMPLE 0
-%define CONFIG_ALSA 0
-%define CONFIG_APPKIT 1
-%define CONFIG_AVFOUNDATION 1
-%define CONFIG_BZLIB 1
-%define CONFIG_COREIMAGE 1
-%define CONFIG_ICONV 0
-%define CONFIG_JACK 0
-%define CONFIG_LIBXCB 0
-%define CONFIG_LIBXCB_SHM 0
-%define CONFIG_LIBXCB_SHAPE 0
-%define CONFIG_LIBXCB_XFIXES 0
-%define CONFIG_LZMA 0
-%define CONFIG_SCHANNEL 0
-%define CONFIG_SDL2 0
-%define CONFIG_SECURETRANSPORT 0
-%define CONFIG_SNDIO 0
-%define CONFIG_XLIB 1
-%define CONFIG_ZLIB 1
+%define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+%define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
%define CONFIG_AVISYNTH 0
%define CONFIG_FREI0R 0
%define CONFIG_LIBCDIO 0
@@ -424,9 +395,11 @@
%define CONFIG_LIBNDI_NEWTEK 0
%define CONFIG_LIBFDK_AAC 0
%define CONFIG_OPENSSL 0
+%define CONFIG_LIBTLS 0
%define CONFIG_GMP 0
%define CONFIG_LIBOPENCORE_AMRNB 0
%define CONFIG_LIBOPENCORE_AMRWB 0
+%define CONFIG_LIBVMAF 0
%define CONFIG_LIBVO_AMRWBENC 0
%define CONFIG_RKMPP 0
%define CONFIG_LIBSMBCLIENT 0
@@ -435,11 +408,13 @@
%define CONFIG_GNUTLS 0
%define CONFIG_JNI 0
%define CONFIG_LADSPA 0
+%define CONFIG_LIBAOM 0
%define CONFIG_LIBASS 0
%define CONFIG_LIBBLURAY 0
%define CONFIG_LIBBS2B 0
%define CONFIG_LIBCACA 0
%define CONFIG_LIBCELT 0
+%define CONFIG_LIBCODEC2 0
%define CONFIG_LIBDC1394 0
%define CONFIG_LIBDRM 0
%define CONFIG_LIBFLITE 0
@@ -450,6 +425,7 @@
%define CONFIG_LIBGSM 0
%define CONFIG_LIBIEC61883 0
%define CONFIG_LIBILBC 0
+%define CONFIG_LIBJACK 0
%define CONFIG_LIBKVAZAAR 0
%define CONFIG_LIBMODPLUG 0
%define CONFIG_LIBMP3LAME 0
@@ -467,12 +443,12 @@
%define CONFIG_LIBSNAPPY 0
%define CONFIG_LIBSOXR 0
%define CONFIG_LIBSPEEX 0
+%define CONFIG_LIBSRT 0
%define CONFIG_LIBSSH 0
%define CONFIG_LIBTESSERACT 0
%define CONFIG_LIBTHEORA 0
%define CONFIG_LIBTWOLAME 0
%define CONFIG_LIBV4L2 0
-%define CONFIG_LIBVMAF 0
%define CONFIG_LIBVORBIS 0
%define CONFIG_LIBVPX 0
%define CONFIG_LIBWAVPACK 0
@@ -481,28 +457,48 @@
%define CONFIG_LIBZIMG 0
%define CONFIG_LIBZMQ 0
%define CONFIG_LIBZVBI 0
+%define CONFIG_LV2 0
%define CONFIG_MEDIACODEC 0
%define CONFIG_OPENAL 0
-%define CONFIG_OPENCL 0
%define CONFIG_OPENGL 0
+%define CONFIG_ALSA 0
+%define CONFIG_APPKIT 1
+%define CONFIG_AVFOUNDATION 1
+%define CONFIG_BZLIB 1
+%define CONFIG_COREIMAGE 1
+%define CONFIG_ICONV 0
+%define CONFIG_LIBXCB 0
+%define CONFIG_LIBXCB_SHM 0
+%define CONFIG_LIBXCB_SHAPE 0
+%define CONFIG_LIBXCB_XFIXES 0
+%define CONFIG_LZMA 1
+%define CONFIG_SCHANNEL 0
+%define CONFIG_SDL2 0
+%define CONFIG_SECURETRANSPORT 0
+%define CONFIG_SNDIO 0
+%define CONFIG_XLIB 1
+%define CONFIG_ZLIB 1
+%define CONFIG_CUDA_SDK 0
+%define CONFIG_LIBNPP 0
+%define CONFIG_LIBMFX 0
+%define CONFIG_MMAL 0
+%define CONFIG_OMX 0
+%define CONFIG_OPENCL 0
+%define CONFIG_AMF 0
%define CONFIG_AUDIOTOOLBOX 1
%define CONFIG_CRYSTALHD 0
%define CONFIG_CUDA 0
%define CONFIG_CUVID 0
%define CONFIG_D3D11VA 0
%define CONFIG_DXVA2 0
+%define CONFIG_FFNVCODEC 0
+%define CONFIG_NVDEC 0
%define CONFIG_NVENC 0
%define CONFIG_VAAPI 0
-%define CONFIG_VDA 0
%define CONFIG_VDPAU 0
%define CONFIG_VIDEOTOOLBOX 0
%define CONFIG_V4L2_M2M 0
%define CONFIG_XVMC 0
-%define CONFIG_CUDA_SDK 0
-%define CONFIG_LIBNPP 0
-%define CONFIG_LIBMFX 0
-%define CONFIG_MMAL 0
-%define CONFIG_OMX 0
%define CONFIG_FTRAPV 0
%define CONFIG_GRAY 0
%define CONFIG_HARDCODED_TABLES 0
@@ -527,7 +523,6 @@
%define CONFIG_SWSCALE 0
%define CONFIG_FFPLAY 0
%define CONFIG_FFPROBE 0
-%define CONFIG_FFSERVER 0
%define CONFIG_FFMPEG 0
%define CONFIG_DCT 0
%define CONFIG_DWT 0
@@ -553,31 +548,27 @@
%define CONFIG_XMM_CLOBBER_TEST 0
%define CONFIG_BSFS 1
%define CONFIG_DECODERS 1
-%define CONFIG_ENCODERS 0
-%define CONFIG_HWACCELS 0
%define CONFIG_PARSERS 1
-%define CONFIG_INDEVS 0
-%define CONFIG_OUTDEVS 0
-%define CONFIG_FILTERS 0
-%define CONFIG_DEMUXERS 0
-%define CONFIG_MUXERS 0
-%define CONFIG_PROTOCOLS 0
%define CONFIG_AANDCTTABLES 0
%define CONFIG_AC3DSP 0
+%define CONFIG_ADTS_HEADER 0
%define CONFIG_AUDIO_FRAME_QUEUE 0
%define CONFIG_AUDIODSP 0
%define CONFIG_BLOCKDSP 0
%define CONFIG_BSWAPDSP 0
%define CONFIG_CABAC 0
+%define CONFIG_CBS 0
+%define CONFIG_CBS_H264 0
+%define CONFIG_CBS_H265 0
+%define CONFIG_CBS_MPEG2 0
%define CONFIG_DIRAC_PARSE 0
%define CONFIG_DVPROFILE 0
%define CONFIG_EXIF 0
-%define CONFIG_FAANDCT 0
-%define CONFIG_FAANIDCT 0
-%define CONFIG_FDCTDSP 0
+%define CONFIG_FAANDCT 1
+%define CONFIG_FAANIDCT 1
+%define CONFIG_FDCTDSP 1
%define CONFIG_FLACDSP 1
%define CONFIG_FMTCONVERT 0
-%define CONFIG_FRAME_THREAD_ENCODER 0
%define CONFIG_G722DSP 0
%define CONFIG_GOLOMB 0
%define CONFIG_GPLV3 0
@@ -592,7 +583,7 @@
%define CONFIG_HUFFMAN 0
%define CONFIG_HUFFYUVDSP 0
%define CONFIG_HUFFYUVENCDSP 0
-%define CONFIG_IDCTDSP 0
+%define CONFIG_IDCTDSP 1
%define CONFIG_IIRFILTER 0
%define CONFIG_MDCT15 0
%define CONFIG_INTRAX8 0
@@ -619,6 +610,7 @@
%define CONFIG_QSV 0
%define CONFIG_QSVDEC 0
%define CONFIG_QSVENC 0
+%define CONFIG_QSVVPP 0
%define CONFIG_RANGECODER 0
%define CONFIG_RIFFDEC 0
%define CONFIG_RIFFENC 0
@@ -642,9 +634,9 @@
%define CONFIG_WMA_FREQS 0
%define CONFIG_WMV2DSP 0
%define CONFIG_NULL_BSF 1
+%define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
%define CONFIG_VP8_DECODER 1
%define CONFIG_VP9_DECODER 1
%define CONFIG_FLAC_DECODER 1
-%define CONFIG_FLAC_PARSER 0
%define CONFIG_VP8_PARSER 1
%define CONFIG_VP9_PARSER 1
diff --git a/media/ffvpx/config_darwin64.h b/media/ffvpx/config_darwin64.h
index 72f1d6dad..03f19b3c2 100644
--- a/media/ffvpx/config_darwin64.h
+++ b/media/ffvpx/config_darwin64.h
@@ -1,12 +1,12 @@
/* Automatically generated by configure - do not modify! */
#ifndef FFMPEG_CONFIG_H
#define FFMPEG_CONFIG_H
-#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm"
+#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm --disable-cuda --disable-cuvid"
#define FFMPEG_LICENSE "LGPL version 2.1 or later"
-#define CONFIG_THIS_YEAR 2017
+#define CONFIG_THIS_YEAR 2018
#define FFMPEG_DATADIR "/usr/local/share/ffmpeg"
#define AVCONV_DATADIR "/usr/local/share/ffmpeg"
-#define CC_IDENT "Apple LLVM version 9.0.0 (clang-900.0.38)"
+#define CC_IDENT "Apple LLVM version 9.1.0 (clang-902.0.39.2)"
#define av_restrict restrict
#define EXTERN_PREFIX "_"
#define EXTERN_ASM _
@@ -57,6 +57,7 @@
#define HAVE_AMD3DNOWEXT 1
#define HAVE_AVX 1
#define HAVE_AVX2 1
+#define HAVE_AVX512 1
#define HAVE_FMA3 1
#define HAVE_FMA4 1
#define HAVE_MMX 1
@@ -68,7 +69,7 @@
#define HAVE_SSE42 1
#define HAVE_SSSE3 1
#define HAVE_XOP 1
-#define HAVE_CPUNOP 1
+#define HAVE_CPUNOP 0
#define HAVE_I686 1
#define HAVE_MIPSFPU 0
#define HAVE_MIPS32R2 0
@@ -101,6 +102,7 @@
#define HAVE_AMD3DNOWEXT_EXTERNAL 1
#define HAVE_AVX_EXTERNAL 1
#define HAVE_AVX2_EXTERNAL 1
+#define HAVE_AVX512_EXTERNAL 1
#define HAVE_FMA3_EXTERNAL 1
#define HAVE_FMA4_EXTERNAL 1
#define HAVE_MMX_EXTERNAL 1
@@ -145,6 +147,7 @@
#define HAVE_AMD3DNOWEXT_INLINE 1
#define HAVE_AVX_INLINE 1
#define HAVE_AVX2_INLINE 1
+#define HAVE_AVX512_INLINE 1
#define HAVE_FMA3_INLINE 1
#define HAVE_FMA4_INLINE 1
#define HAVE_MMX_INLINE 1
@@ -174,20 +177,15 @@
#define HAVE_FAST_64BIT 1
#define HAVE_FAST_CLZ 1
#define HAVE_FAST_CMOV 1
-#define HAVE_LOCAL_ALIGNED_8 1
-#define HAVE_LOCAL_ALIGNED_16 1
-#define HAVE_LOCAL_ALIGNED_32 1
+#define HAVE_LOCAL_ALIGNED 1
#define HAVE_SIMD_ALIGN_16 1
#define HAVE_SIMD_ALIGN_32 1
-#define HAVE_ATOMICS_GCC 1
-#define HAVE_ATOMICS_SUNCC 0
-#define HAVE_ATOMICS_WIN32 0
+#define HAVE_SIMD_ALIGN_64 1
#define HAVE_ATOMIC_CAS_PTR 0
#define HAVE_MACHINE_RW_BARRIER 0
#define HAVE_MEMORYBARRIER 0
#define HAVE_MM_EMPTY 1
#define HAVE_RDTSC 0
-#define HAVE_SARESTART 1
#define HAVE_SEM_TIMEDWAIT 0
#define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1
#define HAVE_CABS 1
@@ -197,13 +195,11 @@
#define HAVE_X86ASM 1
#define HAVE_BIGENDIAN 0
#define HAVE_FAST_UNALIGNED 1
-#define HAVE_ALTIVEC_H 0
#define HAVE_ARPA_INET_H 1
#define HAVE_ASM_TYPES_H 0
#define HAVE_CDIO_PARANOIA_H 0
#define HAVE_CDIO_PARANOIA_PARANOIA_H 0
#define HAVE_CUDA_H 0
-#define HAVE_D3D11_H 0
#define HAVE_DISPATCH_DISPATCH_H 1
#define HAVE_DEV_BKTR_IOCTL_BT848_H 0
#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -212,26 +208,17 @@
#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
#define HAVE_DIRECT_H 0
#define HAVE_DIRENT_H 1
-#define HAVE_DLFCN_H 1
#define HAVE_DXGIDEBUG_H 0
#define HAVE_DXVA_H 0
#define HAVE_ES2_GL_H 0
#define HAVE_GSM_H 0
#define HAVE_IO_H 0
-#define HAVE_MACH_MACH_TIME_H 1
+#define HAVE_LINUX_PERF_EVENT_H 0
#define HAVE_MACHINE_IOCTL_BT848_H 0
#define HAVE_MACHINE_IOCTL_METEOR_H 0
#define HAVE_OPENCV2_CORE_CORE_C_H 0
-#define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-#define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
#define HAVE_OPENGL_GL3_H 0
#define HAVE_POLL_H 1
-#define HAVE_SOUNDCARD_H 0
-#define HAVE_STDATOMIC_H 1
-#define HAVE_SYS_MMAN_H 1
#define HAVE_SYS_PARAM_H 1
#define HAVE_SYS_RESOURCE_H 1
#define HAVE_SYS_SELECT_H 1
@@ -275,16 +262,19 @@
#define HAVE_SINF 1
#define HAVE_TRUNC 1
#define HAVE_TRUNCF 1
+#define HAVE_DOS_PATHS 0
+#define HAVE_LIBC_MSVCRT 0
+#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_SECTION_DATA_REL_RO 0
+#define HAVE_THREADS 1
+#define HAVE_UWP 0
+#define HAVE_WINRT 0
#define HAVE_ACCESS 1
#define HAVE_ALIGNED_MALLOC 0
#define HAVE_CLOCK_GETTIME 1
#define HAVE_CLOSESOCKET 0
#define HAVE_COMMANDLINETOARGVW 0
-#define HAVE_COTASKMEMFREE 0
-#define HAVE_CRYPTGENRANDOM 0
#define HAVE_FCNTL 1
-#define HAVE_FLT_LIM 1
-#define HAVE_FORK 1
#define HAVE_GETADDRINFO 1
#define HAVE_GETHRTIME 0
#define HAVE_GETOPT 1
@@ -299,9 +289,7 @@
#define HAVE_GMTIME_R 1
#define HAVE_INET_ATON 1
#define HAVE_ISATTY 1
-#define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
#define HAVE_KBHIT 0
-#define HAVE_LOADLIBRARY 0
#define HAVE_LSTAT 1
#define HAVE_LZO1X_999_COMPRESS 0
#define HAVE_MACH_ABSOLUTE_TIME 1
@@ -313,6 +301,7 @@
#define HAVE_PEEKNAMEDPIPE 0
#define HAVE_PTHREAD_CANCEL 1
#define HAVE_SCHED_GETAFFINITY 0
+#define HAVE_SECITEMIMPORT 0
#define HAVE_SETCONSOLETEXTATTRIBUTE 0
#define HAVE_SETCONSOLECTRLHANDLER 0
#define HAVE_SETMODE 0
@@ -325,16 +314,19 @@
#define HAVE_UTGETOSTYPEFROMSTRING 0
#define HAVE_VIRTUALALLOC 0
#define HAVE_WGLGETPROCADDRESS 0
+#define HAVE_BCRYPT 0
+#define HAVE_VAAPI_DRM 0
+#define HAVE_VAAPI_X11 0
+#define HAVE_VDPAU_X11 0
#define HAVE_PTHREADS 1
#define HAVE_OS2THREADS 0
#define HAVE_W32THREADS 0
+#define HAVE_AS_ARCH_DIRECTIVE 0
#define HAVE_AS_DN_DIRECTIVE 0
#define HAVE_AS_FPU_DIRECTIVE 0
#define HAVE_AS_FUNC 0
#define HAVE_AS_OBJECT_ARCH 0
#define HAVE_ASM_MOD_Q 0
-#define HAVE_ATTRIBUTE_MAY_ALIAS 1
-#define HAVE_ATTRIBUTE_PACKED 1
#define HAVE_BLOCKS_EXTENSION 1
#define HAVE_EBP_AVAILABLE 1
#define HAVE_EBX_AVAILABLE 1
@@ -351,7 +343,6 @@
#define HAVE_VFP_ARGS 0
#define HAVE_XFORM_ASM 0
#define HAVE_XMM_CLOBBERS 1
-#define HAVE_CONDITION_VARIABLE_PTR 0
#define HAVE_KCMVIDEOCODECTYPE_HEVC 0
#define HAVE_SOCKLEN_T 1
#define HAVE_STRUCT_ADDRINFO 1
@@ -367,24 +358,19 @@
#define HAVE_STRUCT_SOCKADDR_STORAGE 1
#define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 0
#define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 0
-#define HAVE_ATOMICS_NATIVE 1
-#define HAVE_DOS_PATHS 0
-#define HAVE_LIBC_MSVCRT 0
#define HAVE_MAKEINFO 1
#define HAVE_MAKEINFO_HTML 0
-#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_OPENCL_D3D11 0
+#define HAVE_OPENCL_DRM_ARM 0
+#define HAVE_OPENCL_DRM_BEIGNET 0
+#define HAVE_OPENCL_DXVA2 0
+#define HAVE_OPENCL_VAAPI_BEIGNET 0
+#define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
#define HAVE_PERL 1
#define HAVE_POD2MAN 1
-#define HAVE_SECTION_DATA_REL_RO 0
-#define HAVE_TEXI2HTML 0
-#define HAVE_THREADS 1
-#define HAVE_UWP 0
-#define HAVE_VAAPI_DRM 0
-#define HAVE_VAAPI_X11 0
-#define HAVE_VDPAU_X11 0
-#define HAVE_WINRT 0
+#define HAVE_TEXI2HTML 1
#define CONFIG_DOC 0
-#define CONFIG_HTMLPAGES 0
+#define CONFIG_HTMLPAGES 1
#define CONFIG_MANPAGES 1
#define CONFIG_PODPAGES 1
#define CONFIG_TXTPAGES 1
@@ -409,24 +395,8 @@
#define CONFIG_SCALING_VIDEO_EXAMPLE 0
#define CONFIG_TRANSCODE_AAC_EXAMPLE 0
#define CONFIG_TRANSCODING_EXAMPLE 0
-#define CONFIG_ALSA 0
-#define CONFIG_APPKIT 1
-#define CONFIG_AVFOUNDATION 1
-#define CONFIG_BZLIB 1
-#define CONFIG_COREIMAGE 1
-#define CONFIG_ICONV 0
-#define CONFIG_JACK 0
-#define CONFIG_LIBXCB 0
-#define CONFIG_LIBXCB_SHM 0
-#define CONFIG_LIBXCB_SHAPE 0
-#define CONFIG_LIBXCB_XFIXES 0
-#define CONFIG_LZMA 0
-#define CONFIG_SCHANNEL 0
-#define CONFIG_SDL2 0
-#define CONFIG_SECURETRANSPORT 0
-#define CONFIG_SNDIO 0
-#define CONFIG_XLIB 1
-#define CONFIG_ZLIB 1
+#define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+#define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
#define CONFIG_AVISYNTH 0
#define CONFIG_FREI0R 0
#define CONFIG_LIBCDIO 0
@@ -440,9 +410,11 @@
#define CONFIG_LIBNDI_NEWTEK 0
#define CONFIG_LIBFDK_AAC 0
#define CONFIG_OPENSSL 0
+#define CONFIG_LIBTLS 0
#define CONFIG_GMP 0
#define CONFIG_LIBOPENCORE_AMRNB 0
#define CONFIG_LIBOPENCORE_AMRWB 0
+#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVO_AMRWBENC 0
#define CONFIG_RKMPP 0
#define CONFIG_LIBSMBCLIENT 0
@@ -451,11 +423,13 @@
#define CONFIG_GNUTLS 0
#define CONFIG_JNI 0
#define CONFIG_LADSPA 0
+#define CONFIG_LIBAOM 0
#define CONFIG_LIBASS 0
#define CONFIG_LIBBLURAY 0
#define CONFIG_LIBBS2B 0
#define CONFIG_LIBCACA 0
#define CONFIG_LIBCELT 0
+#define CONFIG_LIBCODEC2 0
#define CONFIG_LIBDC1394 0
#define CONFIG_LIBDRM 0
#define CONFIG_LIBFLITE 0
@@ -466,6 +440,7 @@
#define CONFIG_LIBGSM 0
#define CONFIG_LIBIEC61883 0
#define CONFIG_LIBILBC 0
+#define CONFIG_LIBJACK 0
#define CONFIG_LIBKVAZAAR 0
#define CONFIG_LIBMODPLUG 0
#define CONFIG_LIBMP3LAME 0
@@ -483,12 +458,12 @@
#define CONFIG_LIBSNAPPY 0
#define CONFIG_LIBSOXR 0
#define CONFIG_LIBSPEEX 0
+#define CONFIG_LIBSRT 0
#define CONFIG_LIBSSH 0
#define CONFIG_LIBTESSERACT 0
#define CONFIG_LIBTHEORA 0
#define CONFIG_LIBTWOLAME 0
#define CONFIG_LIBV4L2 0
-#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVORBIS 0
#define CONFIG_LIBVPX 0
#define CONFIG_LIBWAVPACK 0
@@ -497,28 +472,48 @@
#define CONFIG_LIBZIMG 0
#define CONFIG_LIBZMQ 0
#define CONFIG_LIBZVBI 0
+#define CONFIG_LV2 0
#define CONFIG_MEDIACODEC 0
#define CONFIG_OPENAL 0
-#define CONFIG_OPENCL 0
#define CONFIG_OPENGL 0
+#define CONFIG_ALSA 0
+#define CONFIG_APPKIT 1
+#define CONFIG_AVFOUNDATION 1
+#define CONFIG_BZLIB 1
+#define CONFIG_COREIMAGE 1
+#define CONFIG_ICONV 0
+#define CONFIG_LIBXCB 0
+#define CONFIG_LIBXCB_SHM 0
+#define CONFIG_LIBXCB_SHAPE 0
+#define CONFIG_LIBXCB_XFIXES 0
+#define CONFIG_LZMA 1
+#define CONFIG_SCHANNEL 0
+#define CONFIG_SDL2 0
+#define CONFIG_SECURETRANSPORT 0
+#define CONFIG_SNDIO 0
+#define CONFIG_XLIB 1
+#define CONFIG_ZLIB 1
+#define CONFIG_CUDA_SDK 0
+#define CONFIG_LIBNPP 0
+#define CONFIG_LIBMFX 0
+#define CONFIG_MMAL 0
+#define CONFIG_OMX 0
+#define CONFIG_OPENCL 0
+#define CONFIG_AMF 0
#define CONFIG_AUDIOTOOLBOX 1
#define CONFIG_CRYSTALHD 0
#define CONFIG_CUDA 0
#define CONFIG_CUVID 0
#define CONFIG_D3D11VA 0
#define CONFIG_DXVA2 0
+#define CONFIG_FFNVCODEC 0
+#define CONFIG_NVDEC 0
#define CONFIG_NVENC 0
#define CONFIG_VAAPI 0
-#define CONFIG_VDA 0
#define CONFIG_VDPAU 0
#define CONFIG_VIDEOTOOLBOX 0
#define CONFIG_V4L2_M2M 0
#define CONFIG_XVMC 0
-#define CONFIG_CUDA_SDK 0
-#define CONFIG_LIBNPP 0
-#define CONFIG_LIBMFX 0
-#define CONFIG_MMAL 0
-#define CONFIG_OMX 0
#define CONFIG_FTRAPV 0
#define CONFIG_GRAY 0
#define CONFIG_HARDCODED_TABLES 0
@@ -543,7 +538,6 @@
#define CONFIG_SWSCALE 0
#define CONFIG_FFPLAY 0
#define CONFIG_FFPROBE 0
-#define CONFIG_FFSERVER 0
#define CONFIG_FFMPEG 0
#define CONFIG_DCT 0
#define CONFIG_DWT 0
@@ -569,31 +563,27 @@
#define CONFIG_XMM_CLOBBER_TEST 0
#define CONFIG_BSFS 1
#define CONFIG_DECODERS 1
-#define CONFIG_ENCODERS 0
-#define CONFIG_HWACCELS 0
#define CONFIG_PARSERS 1
-#define CONFIG_INDEVS 0
-#define CONFIG_OUTDEVS 0
-#define CONFIG_FILTERS 0
-#define CONFIG_DEMUXERS 0
-#define CONFIG_MUXERS 0
-#define CONFIG_PROTOCOLS 0
#define CONFIG_AANDCTTABLES 0
#define CONFIG_AC3DSP 0
+#define CONFIG_ADTS_HEADER 0
#define CONFIG_AUDIO_FRAME_QUEUE 0
#define CONFIG_AUDIODSP 0
#define CONFIG_BLOCKDSP 0
#define CONFIG_BSWAPDSP 0
#define CONFIG_CABAC 0
+#define CONFIG_CBS 0
+#define CONFIG_CBS_H264 0
+#define CONFIG_CBS_H265 0
+#define CONFIG_CBS_MPEG2 0
#define CONFIG_DIRAC_PARSE 0
#define CONFIG_DVPROFILE 0
#define CONFIG_EXIF 0
-#define CONFIG_FAANDCT 0
-#define CONFIG_FAANIDCT 0
-#define CONFIG_FDCTDSP 0
+#define CONFIG_FAANDCT 1
+#define CONFIG_FAANIDCT 1
+#define CONFIG_FDCTDSP 1
#define CONFIG_FLACDSP 1
#define CONFIG_FMTCONVERT 0
-#define CONFIG_FRAME_THREAD_ENCODER 0
#define CONFIG_G722DSP 0
#define CONFIG_GOLOMB 0
#define CONFIG_GPLV3 0
@@ -608,7 +598,7 @@
#define CONFIG_HUFFMAN 0
#define CONFIG_HUFFYUVDSP 0
#define CONFIG_HUFFYUVENCDSP 0
-#define CONFIG_IDCTDSP 0
+#define CONFIG_IDCTDSP 1
#define CONFIG_IIRFILTER 0
#define CONFIG_MDCT15 0
#define CONFIG_INTRAX8 0
@@ -635,6 +625,7 @@
#define CONFIG_QSV 0
#define CONFIG_QSVDEC 0
#define CONFIG_QSVENC 0
+#define CONFIG_QSVVPP 0
#define CONFIG_RANGECODER 0
#define CONFIG_RIFFDEC 0
#define CONFIG_RIFFENC 0
@@ -658,10 +649,10 @@
#define CONFIG_WMA_FREQS 0
#define CONFIG_WMV2DSP 0
#define CONFIG_NULL_BSF 1
+#define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
#define CONFIG_VP8_DECODER 1
#define CONFIG_VP9_DECODER 1
#define CONFIG_FLAC_DECODER 1
-#define CONFIG_FLAC_PARSER 0
#define CONFIG_VP8_PARSER 1
#define CONFIG_VP9_PARSER 1
#endif /* FFMPEG_CONFIG_H */
diff --git a/media/ffvpx/config_unix32.h b/media/ffvpx/config_unix32.h
index c2316caab..ae49d8075 100644
--- a/media/ffvpx/config_unix32.h
+++ b/media/ffvpx/config_unix32.h
@@ -1,9 +1,9 @@
/* Automatically generated by configure - do not modify! */
#ifndef FFMPEG_CONFIG_H
#define FFMPEG_CONFIG_H
-#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vda --disable-vdpau --disable-videotoolbox --enable-decoder=flac --disable-asm --disable-x86asm"
+#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --disable-asm --disable-x86asm --disable-cuda --disable-cuvid"
#define FFMPEG_LICENSE "LGPL version 2.1 or later"
-#define CONFIG_THIS_YEAR 2017
+#define CONFIG_THIS_YEAR 2018
#define FFMPEG_DATADIR "/usr/local/share/ffmpeg"
#define AVCONV_DATADIR "/usr/local/share/ffmpeg"
#define CC_IDENT "gcc 6.3.0 (Ubuntu 6.3.0-12ubuntu2) 20170406"
@@ -57,6 +57,7 @@
#define HAVE_AMD3DNOWEXT 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_FMA3 0
#define HAVE_FMA4 0
#define HAVE_MMX 0
@@ -101,6 +102,7 @@
#define HAVE_AMD3DNOWEXT_EXTERNAL 0
#define HAVE_AVX_EXTERNAL 0
#define HAVE_AVX2_EXTERNAL 0
+#define HAVE_AVX512_EXTERNAL 0
#define HAVE_FMA3_EXTERNAL 0
#define HAVE_FMA4_EXTERNAL 0
#define HAVE_MMX_EXTERNAL 0
@@ -145,6 +147,7 @@
#define HAVE_AMD3DNOWEXT_INLINE 0
#define HAVE_AVX_INLINE 0
#define HAVE_AVX2_INLINE 0
+#define HAVE_AVX512_INLINE 0
#define HAVE_FMA3_INLINE 0
#define HAVE_FMA4_INLINE 0
#define HAVE_MMX_INLINE 0
@@ -174,20 +177,15 @@
#define HAVE_FAST_64BIT 0
#define HAVE_FAST_CLZ 0
#define HAVE_FAST_CMOV 0
-#define HAVE_LOCAL_ALIGNED_8 1
-#define HAVE_LOCAL_ALIGNED_16 1
-#define HAVE_LOCAL_ALIGNED_32 1
+#define HAVE_LOCAL_ALIGNED 1
#define HAVE_SIMD_ALIGN_16 0
#define HAVE_SIMD_ALIGN_32 0
-#define HAVE_ATOMICS_GCC 1
-#define HAVE_ATOMICS_SUNCC 0
-#define HAVE_ATOMICS_WIN32 0
+#define HAVE_SIMD_ALIGN_64 0
#define HAVE_ATOMIC_CAS_PTR 0
#define HAVE_MACHINE_RW_BARRIER 0
#define HAVE_MEMORYBARRIER 0
#define HAVE_MM_EMPTY 0
#define HAVE_RDTSC 0
-#define HAVE_SARESTART 1
#define HAVE_SEM_TIMEDWAIT 1
#define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1
#define HAVE_CABS 1
@@ -197,13 +195,11 @@
#define HAVE_X86ASM 0
#define HAVE_BIGENDIAN 0
#define HAVE_FAST_UNALIGNED 0
-#define HAVE_ALTIVEC_H 0
#define HAVE_ARPA_INET_H 1
#define HAVE_ASM_TYPES_H 1
#define HAVE_CDIO_PARANOIA_H 0
#define HAVE_CDIO_PARANOIA_PARANOIA_H 0
#define HAVE_CUDA_H 0
-#define HAVE_D3D11_H 0
#define HAVE_DISPATCH_DISPATCH_H 0
#define HAVE_DEV_BKTR_IOCTL_BT848_H 0
#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -212,26 +208,17 @@
#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
#define HAVE_DIRECT_H 0
#define HAVE_DIRENT_H 1
-#define HAVE_DLFCN_H 1
#define HAVE_DXGIDEBUG_H 0
#define HAVE_DXVA_H 0
#define HAVE_ES2_GL_H 0
#define HAVE_GSM_H 0
#define HAVE_IO_H 0
-#define HAVE_MACH_MACH_TIME_H 0
+#define HAVE_LINUX_PERF_EVENT_H 0
#define HAVE_MACHINE_IOCTL_BT848_H 0
#define HAVE_MACHINE_IOCTL_METEOR_H 0
#define HAVE_OPENCV2_CORE_CORE_C_H 0
-#define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-#define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
#define HAVE_OPENGL_GL3_H 0
#define HAVE_POLL_H 1
-#define HAVE_SOUNDCARD_H 0
-#define HAVE_STDATOMIC_H 1
-#define HAVE_SYS_MMAN_H 1
#define HAVE_SYS_PARAM_H 1
#define HAVE_SYS_RESOURCE_H 1
#define HAVE_SYS_SELECT_H 1
@@ -275,16 +262,19 @@
#define HAVE_SINF 1
#define HAVE_TRUNC 1
#define HAVE_TRUNCF 1
+#define HAVE_DOS_PATHS 0
+#define HAVE_LIBC_MSVCRT 0
+#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_SECTION_DATA_REL_RO 0
+#define HAVE_THREADS 1
+#define HAVE_UWP 0
+#define HAVE_WINRT 0
#define HAVE_ACCESS 1
#define HAVE_ALIGNED_MALLOC 0
#define HAVE_CLOCK_GETTIME 1
#define HAVE_CLOSESOCKET 0
#define HAVE_COMMANDLINETOARGVW 0
-#define HAVE_COTASKMEMFREE 0
-#define HAVE_CRYPTGENRANDOM 0
#define HAVE_FCNTL 1
-#define HAVE_FLT_LIM 1
-#define HAVE_FORK 1
#define HAVE_GETADDRINFO 1
#define HAVE_GETHRTIME 0
#define HAVE_GETOPT 1
@@ -299,9 +289,7 @@
#define HAVE_GMTIME_R 1
#define HAVE_INET_ATON 1
#define HAVE_ISATTY 1
-#define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
#define HAVE_KBHIT 0
-#define HAVE_LOADLIBRARY 0
#define HAVE_LSTAT 1
#define HAVE_LZO1X_999_COMPRESS 0
#define HAVE_MACH_ABSOLUTE_TIME 0
@@ -312,7 +300,8 @@
#define HAVE_NANOSLEEP 1
#define HAVE_PEEKNAMEDPIPE 0
#define HAVE_PTHREAD_CANCEL 1
-#define HAVE_SCHED_GETAFFINITY 1
+#define HAVE_SCHED_GETAFFINITY 0
+#define HAVE_SECITEMIMPORT 0
#define HAVE_SETCONSOLETEXTATTRIBUTE 0
#define HAVE_SETCONSOLECTRLHANDLER 0
#define HAVE_SETMODE 0
@@ -325,16 +314,19 @@
#define HAVE_UTGETOSTYPEFROMSTRING 0
#define HAVE_VIRTUALALLOC 0
#define HAVE_WGLGETPROCADDRESS 0
+#define HAVE_BCRYPT 0
+#define HAVE_VAAPI_DRM 0
+#define HAVE_VAAPI_X11 0
+#define HAVE_VDPAU_X11 0
#define HAVE_PTHREADS 1
#define HAVE_OS2THREADS 0
#define HAVE_W32THREADS 0
+#define HAVE_AS_ARCH_DIRECTIVE 0
#define HAVE_AS_DN_DIRECTIVE 0
#define HAVE_AS_FPU_DIRECTIVE 0
#define HAVE_AS_FUNC 0
#define HAVE_AS_OBJECT_ARCH 0
#define HAVE_ASM_MOD_Q 0
-#define HAVE_ATTRIBUTE_MAY_ALIAS 1
-#define HAVE_ATTRIBUTE_PACKED 1
#define HAVE_BLOCKS_EXTENSION 0
#define HAVE_EBP_AVAILABLE 1
#define HAVE_EBX_AVAILABLE 1
@@ -351,7 +343,6 @@
#define HAVE_VFP_ARGS 0
#define HAVE_XFORM_ASM 0
#define HAVE_XMM_CLOBBERS 0
-#define HAVE_CONDITION_VARIABLE_PTR 0
#define HAVE_KCMVIDEOCODECTYPE_HEVC 0
#define HAVE_SOCKLEN_T 1
#define HAVE_STRUCT_ADDRINFO 1
@@ -367,27 +358,22 @@
#define HAVE_STRUCT_SOCKADDR_STORAGE 1
#define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 1
#define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 1
-#define HAVE_ATOMICS_NATIVE 1
-#define HAVE_DOS_PATHS 0
-#define HAVE_LIBC_MSVCRT 0
#define HAVE_MAKEINFO 0
#define HAVE_MAKEINFO_HTML 0
-#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_OPENCL_D3D11 0
+#define HAVE_OPENCL_DRM_ARM 0
+#define HAVE_OPENCL_DRM_BEIGNET 0
+#define HAVE_OPENCL_DXVA2 0
+#define HAVE_OPENCL_VAAPI_BEIGNET 0
+#define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
#define HAVE_PERL 1
#define HAVE_POD2MAN 1
-#define HAVE_SECTION_DATA_REL_RO 1
-#define HAVE_TEXI2HTML 0
-#define HAVE_THREADS 1
-#define HAVE_UWP 0
-#define HAVE_VAAPI_DRM 0
-#define HAVE_VAAPI_X11 0
-#define HAVE_VDPAU_X11 0
-#define HAVE_WINRT 0
+#define HAVE_TEXI2HTML 1
#define CONFIG_DOC 0
-#define CONFIG_HTMLPAGES 0
+#define CONFIG_HTMLPAGES 1
#define CONFIG_MANPAGES 1
#define CONFIG_PODPAGES 1
-#define CONFIG_TXTPAGES 0
+#define CONFIG_TXTPAGES 1
#define CONFIG_AVIO_DIR_CMD_EXAMPLE 1
#define CONFIG_AVIO_READING_EXAMPLE 1
#define CONFIG_DECODE_AUDIO_EXAMPLE 1
@@ -409,24 +395,8 @@
#define CONFIG_SCALING_VIDEO_EXAMPLE 0
#define CONFIG_TRANSCODE_AAC_EXAMPLE 0
#define CONFIG_TRANSCODING_EXAMPLE 0
-#define CONFIG_ALSA 0
-#define CONFIG_APPKIT 0
-#define CONFIG_AVFOUNDATION 0
-#define CONFIG_BZLIB 0
-#define CONFIG_COREIMAGE 0
-#define CONFIG_ICONV 0
-#define CONFIG_JACK 0
-#define CONFIG_LIBXCB 0
-#define CONFIG_LIBXCB_SHM 0
-#define CONFIG_LIBXCB_SHAPE 0
-#define CONFIG_LIBXCB_XFIXES 0
-#define CONFIG_LZMA 0
-#define CONFIG_SCHANNEL 0
-#define CONFIG_SDL2 0
-#define CONFIG_SECURETRANSPORT 0
-#define CONFIG_SNDIO 0
-#define CONFIG_XLIB 1
-#define CONFIG_ZLIB 0
+#define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+#define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
#define CONFIG_AVISYNTH 0
#define CONFIG_FREI0R 0
#define CONFIG_LIBCDIO 0
@@ -440,9 +410,11 @@
#define CONFIG_LIBNDI_NEWTEK 0
#define CONFIG_LIBFDK_AAC 0
#define CONFIG_OPENSSL 0
+#define CONFIG_LIBTLS 0
#define CONFIG_GMP 0
#define CONFIG_LIBOPENCORE_AMRNB 0
#define CONFIG_LIBOPENCORE_AMRWB 0
+#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVO_AMRWBENC 0
#define CONFIG_RKMPP 0
#define CONFIG_LIBSMBCLIENT 0
@@ -451,11 +423,13 @@
#define CONFIG_GNUTLS 0
#define CONFIG_JNI 0
#define CONFIG_LADSPA 0
+#define CONFIG_LIBAOM 0
#define CONFIG_LIBASS 0
#define CONFIG_LIBBLURAY 0
#define CONFIG_LIBBS2B 0
#define CONFIG_LIBCACA 0
#define CONFIG_LIBCELT 0
+#define CONFIG_LIBCODEC2 0
#define CONFIG_LIBDC1394 0
#define CONFIG_LIBDRM 0
#define CONFIG_LIBFLITE 0
@@ -466,6 +440,7 @@
#define CONFIG_LIBGSM 0
#define CONFIG_LIBIEC61883 0
#define CONFIG_LIBILBC 0
+#define CONFIG_LIBJACK 0
#define CONFIG_LIBKVAZAAR 0
#define CONFIG_LIBMODPLUG 0
#define CONFIG_LIBMP3LAME 0
@@ -483,12 +458,12 @@
#define CONFIG_LIBSNAPPY 0
#define CONFIG_LIBSOXR 0
#define CONFIG_LIBSPEEX 0
+#define CONFIG_LIBSRT 0
#define CONFIG_LIBSSH 0
#define CONFIG_LIBTESSERACT 0
#define CONFIG_LIBTHEORA 0
#define CONFIG_LIBTWOLAME 0
#define CONFIG_LIBV4L2 0
-#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVORBIS 0
#define CONFIG_LIBVPX 0
#define CONFIG_LIBWAVPACK 0
@@ -497,28 +472,48 @@
#define CONFIG_LIBZIMG 0
#define CONFIG_LIBZMQ 0
#define CONFIG_LIBZVBI 0
+#define CONFIG_LV2 0
#define CONFIG_MEDIACODEC 0
#define CONFIG_OPENAL 0
-#define CONFIG_OPENCL 0
#define CONFIG_OPENGL 0
-#define CONFIG_AUDIOTOOLBOX 0
+#define CONFIG_ALSA 0
+#define CONFIG_APPKIT 1
+#define CONFIG_AVFOUNDATION 1
+#define CONFIG_BZLIB 1
+#define CONFIG_COREIMAGE 0
+#define CONFIG_ICONV 0
+#define CONFIG_LIBXCB 0
+#define CONFIG_LIBXCB_SHM 0
+#define CONFIG_LIBXCB_SHAPE 0
+#define CONFIG_LIBXCB_XFIXES 0
+#define CONFIG_LZMA 1
+#define CONFIG_SCHANNEL 0
+#define CONFIG_SDL2 0
+#define CONFIG_SECURETRANSPORT 0
+#define CONFIG_SNDIO 0
+#define CONFIG_XLIB 1
+#define CONFIG_ZLIB 1
+#define CONFIG_CUDA_SDK 0
+#define CONFIG_LIBNPP 0
+#define CONFIG_LIBMFX 0
+#define CONFIG_MMAL 0
+#define CONFIG_OMX 0
+#define CONFIG_OPENCL 0
+#define CONFIG_AMF 0
+#define CONFIG_AUDIOTOOLBOX 1
#define CONFIG_CRYSTALHD 0
-#define CONFIG_CUDA 1
-#define CONFIG_CUVID 1
+#define CONFIG_CUDA 0
+#define CONFIG_CUVID 0
#define CONFIG_D3D11VA 0
#define CONFIG_DXVA2 0
-#define CONFIG_NVENC 1
+#define CONFIG_FFNVCODEC 0
+#define CONFIG_NVDEC 0
+#define CONFIG_NVENC 0
#define CONFIG_VAAPI 0
-#define CONFIG_VDA 0
#define CONFIG_VDPAU 0
#define CONFIG_VIDEOTOOLBOX 0
-#define CONFIG_V4L2_M2M 1
+#define CONFIG_V4L2_M2M 0
#define CONFIG_XVMC 0
-#define CONFIG_CUDA_SDK 0
-#define CONFIG_LIBNPP 0
-#define CONFIG_LIBMFX 0
-#define CONFIG_MMAL 0
-#define CONFIG_OMX 0
#define CONFIG_FTRAPV 0
#define CONFIG_GRAY 0
#define CONFIG_HARDCODED_TABLES 0
@@ -543,7 +538,6 @@
#define CONFIG_SWSCALE 0
#define CONFIG_FFPLAY 0
#define CONFIG_FFPROBE 0
-#define CONFIG_FFSERVER 0
#define CONFIG_FFMPEG 0
#define CONFIG_DCT 0
#define CONFIG_DWT 0
@@ -569,22 +563,19 @@
#define CONFIG_XMM_CLOBBER_TEST 0
#define CONFIG_BSFS 1
#define CONFIG_DECODERS 1
-#define CONFIG_ENCODERS 0
-#define CONFIG_HWACCELS 0
#define CONFIG_PARSERS 1
-#define CONFIG_INDEVS 0
-#define CONFIG_OUTDEVS 0
-#define CONFIG_FILTERS 0
-#define CONFIG_DEMUXERS 0
-#define CONFIG_MUXERS 0
-#define CONFIG_PROTOCOLS 0
#define CONFIG_AANDCTTABLES 0
#define CONFIG_AC3DSP 0
+#define CONFIG_ADTS_HEADER 0
#define CONFIG_AUDIO_FRAME_QUEUE 0
#define CONFIG_AUDIODSP 0
#define CONFIG_BLOCKDSP 0
#define CONFIG_BSWAPDSP 0
#define CONFIG_CABAC 0
+#define CONFIG_CBS 0
+#define CONFIG_CBS_H264 0
+#define CONFIG_CBS_H265 0
+#define CONFIG_CBS_MPEG2 0
#define CONFIG_DIRAC_PARSE 0
#define CONFIG_DVPROFILE 0
#define CONFIG_EXIF 0
@@ -593,7 +584,6 @@
#define CONFIG_FDCTDSP 0
#define CONFIG_FLACDSP 1
#define CONFIG_FMTCONVERT 0
-#define CONFIG_FRAME_THREAD_ENCODER 0
#define CONFIG_G722DSP 0
#define CONFIG_GOLOMB 0
#define CONFIG_GPLV3 0
@@ -635,6 +625,7 @@
#define CONFIG_QSV 0
#define CONFIG_QSVDEC 0
#define CONFIG_QSVENC 0
+#define CONFIG_QSVVPP 0
#define CONFIG_RANGECODER 0
#define CONFIG_RIFFDEC 0
#define CONFIG_RIFFENC 0
@@ -658,10 +649,10 @@
#define CONFIG_WMA_FREQS 0
#define CONFIG_WMV2DSP 0
#define CONFIG_NULL_BSF 1
+#define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
#define CONFIG_VP8_DECODER 1
#define CONFIG_VP9_DECODER 1
#define CONFIG_FLAC_DECODER 1
-#define CONFIG_FLAC_PARSER 0
#define CONFIG_VP8_PARSER 1
#define CONFIG_VP9_PARSER 1
#endif /* FFMPEG_CONFIG_H */
diff --git a/media/ffvpx/config_unix64.asm b/media/ffvpx/config_unix64.asm
index beb7f3b02..574207b1d 100644
--- a/media/ffvpx/config_unix64.asm
+++ b/media/ffvpx/config_unix64.asm
@@ -1,3 +1,4 @@
+; Automatically generated by configure - do not modify!
%define ARCH_AARCH64 0
%define ARCH_ALPHA 0
%define ARCH_ARM 0
@@ -41,6 +42,7 @@
%define HAVE_AMD3DNOWEXT 1
%define HAVE_AVX 1
%define HAVE_AVX2 1
+%define HAVE_AVX512 1
%define HAVE_FMA3 1
%define HAVE_FMA4 1
%define HAVE_MMX 1
@@ -85,6 +87,7 @@
%define HAVE_AMD3DNOWEXT_EXTERNAL 1
%define HAVE_AVX_EXTERNAL 1
%define HAVE_AVX2_EXTERNAL 1
+%define HAVE_AVX512_EXTERNAL 1
%define HAVE_FMA3_EXTERNAL 1
%define HAVE_FMA4_EXTERNAL 1
%define HAVE_MMX_EXTERNAL 1
@@ -129,6 +132,7 @@
%define HAVE_AMD3DNOWEXT_INLINE 1
%define HAVE_AVX_INLINE 1
%define HAVE_AVX2_INLINE 1
+%define HAVE_AVX512_INLINE 1
%define HAVE_FMA3_INLINE 1
%define HAVE_FMA4_INLINE 1
%define HAVE_MMX_INLINE 1
@@ -158,20 +162,15 @@
%define HAVE_FAST_64BIT 1
%define HAVE_FAST_CLZ 1
%define HAVE_FAST_CMOV 1
-%define HAVE_LOCAL_ALIGNED_8 1
-%define HAVE_LOCAL_ALIGNED_16 1
-%define HAVE_LOCAL_ALIGNED_32 1
+%define HAVE_LOCAL_ALIGNED 1
%define HAVE_SIMD_ALIGN_16 1
%define HAVE_SIMD_ALIGN_32 1
-%define HAVE_ATOMICS_GCC 1
-%define HAVE_ATOMICS_SUNCC 0
-%define HAVE_ATOMICS_WIN32 0
+%define HAVE_SIMD_ALIGN_64 1
%define HAVE_ATOMIC_CAS_PTR 0
%define HAVE_MACHINE_RW_BARRIER 0
%define HAVE_MEMORYBARRIER 0
%define HAVE_MM_EMPTY 1
%define HAVE_RDTSC 0
-%define HAVE_SARESTART 1
%define HAVE_SEM_TIMEDWAIT 1
%define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1
%define HAVE_CABS 1
@@ -181,13 +180,11 @@
%define HAVE_X86ASM 1
%define HAVE_BIGENDIAN 0
%define HAVE_FAST_UNALIGNED 1
-%define HAVE_ALTIVEC_H 0
%define HAVE_ARPA_INET_H 1
%define HAVE_ASM_TYPES_H 1
%define HAVE_CDIO_PARANOIA_H 0
%define HAVE_CDIO_PARANOIA_PARANOIA_H 0
%define HAVE_CUDA_H 0
-%define HAVE_D3D11_H 0
%define HAVE_DISPATCH_DISPATCH_H 0
%define HAVE_DEV_BKTR_IOCTL_BT848_H 0
%define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -196,26 +193,17 @@
%define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
%define HAVE_DIRECT_H 0
%define HAVE_DIRENT_H 1
-%define HAVE_DLFCN_H 1
%define HAVE_DXGIDEBUG_H 0
%define HAVE_DXVA_H 0
%define HAVE_ES2_GL_H 0
%define HAVE_GSM_H 0
%define HAVE_IO_H 0
-%define HAVE_MACH_MACH_TIME_H 0
+%define HAVE_LINUX_PERF_EVENT_H 0
%define HAVE_MACHINE_IOCTL_BT848_H 0
%define HAVE_MACHINE_IOCTL_METEOR_H 0
%define HAVE_OPENCV2_CORE_CORE_C_H 0
-%define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-%define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
%define HAVE_OPENGL_GL3_H 0
%define HAVE_POLL_H 1
-%define HAVE_SOUNDCARD_H 0
-%define HAVE_STDATOMIC_H 1
-%define HAVE_SYS_MMAN_H 1
%define HAVE_SYS_PARAM_H 1
%define HAVE_SYS_RESOURCE_H 1
%define HAVE_SYS_SELECT_H 1
@@ -259,16 +247,19 @@
%define HAVE_SINF 1
%define HAVE_TRUNC 1
%define HAVE_TRUNCF 1
+%define HAVE_DOS_PATHS 0
+%define HAVE_LIBC_MSVCRT 0
+%define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+%define HAVE_SECTION_DATA_REL_RO 0
+%define HAVE_THREADS 1
+%define HAVE_UWP 0
+%define HAVE_WINRT 0
%define HAVE_ACCESS 1
%define HAVE_ALIGNED_MALLOC 0
%define HAVE_CLOCK_GETTIME 1
%define HAVE_CLOSESOCKET 0
%define HAVE_COMMANDLINETOARGVW 0
-%define HAVE_COTASKMEMFREE 0
-%define HAVE_CRYPTGENRANDOM 0
%define HAVE_FCNTL 1
-%define HAVE_FLT_LIM 1
-%define HAVE_FORK 1
%define HAVE_GETADDRINFO 1
%define HAVE_GETHRTIME 0
%define HAVE_GETOPT 1
@@ -283,9 +274,7 @@
%define HAVE_GMTIME_R 1
%define HAVE_INET_ATON 1
%define HAVE_ISATTY 1
-%define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
%define HAVE_KBHIT 0
-%define HAVE_LOADLIBRARY 0
%define HAVE_LSTAT 1
%define HAVE_LZO1X_999_COMPRESS 0
%define HAVE_MACH_ABSOLUTE_TIME 0
@@ -297,6 +286,7 @@
%define HAVE_PEEKNAMEDPIPE 0
%define HAVE_PTHREAD_CANCEL 1
%define HAVE_SCHED_GETAFFINITY 1
+%define HAVE_SECITEMIMPORT 0
%define HAVE_SETCONSOLETEXTATTRIBUTE 0
%define HAVE_SETCONSOLECTRLHANDLER 0
%define HAVE_SETMODE 0
@@ -309,16 +299,19 @@
%define HAVE_UTGETOSTYPEFROMSTRING 0
%define HAVE_VIRTUALALLOC 0
%define HAVE_WGLGETPROCADDRESS 0
+%define HAVE_BCRYPT 0
+%define HAVE_VAAPI_DRM 0
+%define HAVE_VAAPI_X11 0
+%define HAVE_VDPAU_X11 0
%define HAVE_PTHREADS 1
%define HAVE_OS2THREADS 0
%define HAVE_W32THREADS 0
+%define HAVE_AS_ARCH_DIRECTIVE 0
%define HAVE_AS_DN_DIRECTIVE 0
%define HAVE_AS_FPU_DIRECTIVE 0
%define HAVE_AS_FUNC 0
%define HAVE_AS_OBJECT_ARCH 0
%define HAVE_ASM_MOD_Q 0
-%define HAVE_ATTRIBUTE_MAY_ALIAS 1
-%define HAVE_ATTRIBUTE_PACKED 1
%define HAVE_BLOCKS_EXTENSION 0
%define HAVE_EBP_AVAILABLE 1
%define HAVE_EBX_AVAILABLE 1
@@ -335,7 +328,6 @@
%define HAVE_VFP_ARGS 0
%define HAVE_XFORM_ASM 0
%define HAVE_XMM_CLOBBERS 1
-%define HAVE_CONDITION_VARIABLE_PTR 0
%define HAVE_KCMVIDEOCODECTYPE_HEVC 0
%define HAVE_SOCKLEN_T 1
%define HAVE_STRUCT_ADDRINFO 1
@@ -351,27 +343,22 @@
%define HAVE_STRUCT_SOCKADDR_STORAGE 1
%define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 1
%define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 1
-%define HAVE_ATOMICS_NATIVE 1
-%define HAVE_DOS_PATHS 0
-%define HAVE_LIBC_MSVCRT 0
%define HAVE_MAKEINFO 0
%define HAVE_MAKEINFO_HTML 0
-%define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+%define HAVE_OPENCL_D3D11 0
+%define HAVE_OPENCL_DRM_ARM 0
+%define HAVE_OPENCL_DRM_BEIGNET 0
+%define HAVE_OPENCL_DXVA2 0
+%define HAVE_OPENCL_VAAPI_BEIGNET 0
+%define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
%define HAVE_PERL 1
%define HAVE_POD2MAN 1
-%define HAVE_SECTION_DATA_REL_RO 1
-%define HAVE_TEXI2HTML 0
-%define HAVE_THREADS 1
-%define HAVE_UWP 0
-%define HAVE_VAAPI_DRM 0
-%define HAVE_VAAPI_X11 0
-%define HAVE_VDPAU_X11 0
-%define HAVE_WINRT 0
+%define HAVE_TEXI2HTML 1
%define CONFIG_DOC 0
-%define CONFIG_HTMLPAGES 0
+%define CONFIG_HTMLPAGES 1
%define CONFIG_MANPAGES 1
%define CONFIG_PODPAGES 1
-%define CONFIG_TXTPAGES 0
+%define CONFIG_TXTPAGES 1
%define CONFIG_AVIO_DIR_CMD_EXAMPLE 1
%define CONFIG_AVIO_READING_EXAMPLE 1
%define CONFIG_DECODE_AUDIO_EXAMPLE 1
@@ -393,24 +380,8 @@
%define CONFIG_SCALING_VIDEO_EXAMPLE 0
%define CONFIG_TRANSCODE_AAC_EXAMPLE 0
%define CONFIG_TRANSCODING_EXAMPLE 0
-%define CONFIG_ALSA 1
-%define CONFIG_APPKIT 0
-%define CONFIG_AVFOUNDATION 0
-%define CONFIG_BZLIB 0
-%define CONFIG_COREIMAGE 0
-%define CONFIG_ICONV 0
-%define CONFIG_JACK 0
-%define CONFIG_LIBXCB 0
-%define CONFIG_LIBXCB_SHM 0
-%define CONFIG_LIBXCB_SHAPE 0
-%define CONFIG_LIBXCB_XFIXES 0
-%define CONFIG_LZMA 1
-%define CONFIG_SCHANNEL 0
-%define CONFIG_SDL2 0
-%define CONFIG_SECURETRANSPORT 0
-%define CONFIG_SNDIO 0
-%define CONFIG_XLIB 1
-%define CONFIG_ZLIB 1
+%define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+%define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
%define CONFIG_AVISYNTH 0
%define CONFIG_FREI0R 0
%define CONFIG_LIBCDIO 0
@@ -424,9 +395,11 @@
%define CONFIG_LIBNDI_NEWTEK 0
%define CONFIG_LIBFDK_AAC 0
%define CONFIG_OPENSSL 0
+%define CONFIG_LIBTLS 0
%define CONFIG_GMP 0
%define CONFIG_LIBOPENCORE_AMRNB 0
%define CONFIG_LIBOPENCORE_AMRWB 0
+%define CONFIG_LIBVMAF 0
%define CONFIG_LIBVO_AMRWBENC 0
%define CONFIG_RKMPP 0
%define CONFIG_LIBSMBCLIENT 0
@@ -435,11 +408,13 @@
%define CONFIG_GNUTLS 0
%define CONFIG_JNI 0
%define CONFIG_LADSPA 0
+%define CONFIG_LIBAOM 0
%define CONFIG_LIBASS 0
%define CONFIG_LIBBLURAY 0
%define CONFIG_LIBBS2B 0
%define CONFIG_LIBCACA 0
%define CONFIG_LIBCELT 0
+%define CONFIG_LIBCODEC2 0
%define CONFIG_LIBDC1394 0
%define CONFIG_LIBDRM 0
%define CONFIG_LIBFLITE 0
@@ -450,6 +425,7 @@
%define CONFIG_LIBGSM 0
%define CONFIG_LIBIEC61883 0
%define CONFIG_LIBILBC 0
+%define CONFIG_LIBJACK 0
%define CONFIG_LIBKVAZAAR 0
%define CONFIG_LIBMODPLUG 0
%define CONFIG_LIBMP3LAME 0
@@ -467,12 +443,12 @@
%define CONFIG_LIBSNAPPY 0
%define CONFIG_LIBSOXR 0
%define CONFIG_LIBSPEEX 0
+%define CONFIG_LIBSRT 0
%define CONFIG_LIBSSH 0
%define CONFIG_LIBTESSERACT 0
%define CONFIG_LIBTHEORA 0
%define CONFIG_LIBTWOLAME 0
%define CONFIG_LIBV4L2 0
-%define CONFIG_LIBVMAF 0
%define CONFIG_LIBVORBIS 0
%define CONFIG_LIBVPX 0
%define CONFIG_LIBWAVPACK 0
@@ -481,28 +457,48 @@
%define CONFIG_LIBZIMG 0
%define CONFIG_LIBZMQ 0
%define CONFIG_LIBZVBI 0
+%define CONFIG_LV2 0
%define CONFIG_MEDIACODEC 0
%define CONFIG_OPENAL 0
-%define CONFIG_OPENCL 0
%define CONFIG_OPENGL 0
-%define CONFIG_AUDIOTOOLBOX 0
+%define CONFIG_ALSA 0
+%define CONFIG_APPKIT 1
+%define CONFIG_AVFOUNDATION 1
+%define CONFIG_BZLIB 1
+%define CONFIG_COREIMAGE 0
+%define CONFIG_ICONV 0
+%define CONFIG_LIBXCB 0
+%define CONFIG_LIBXCB_SHM 0
+%define CONFIG_LIBXCB_SHAPE 0
+%define CONFIG_LIBXCB_XFIXES 0
+%define CONFIG_LZMA 1
+%define CONFIG_SCHANNEL 0
+%define CONFIG_SDL2 0
+%define CONFIG_SECURETRANSPORT 0
+%define CONFIG_SNDIO 0
+%define CONFIG_XLIB 1
+%define CONFIG_ZLIB 1
+%define CONFIG_CUDA_SDK 0
+%define CONFIG_LIBNPP 0
+%define CONFIG_LIBMFX 0
+%define CONFIG_MMAL 0
+%define CONFIG_OMX 0
+%define CONFIG_OPENCL 0
+%define CONFIG_AMF 0
+%define CONFIG_AUDIOTOOLBOX 1
%define CONFIG_CRYSTALHD 0
-%define CONFIG_CUDA 1
-%define CONFIG_CUVID 1
+%define CONFIG_CUDA 0
+%define CONFIG_CUVID 0
%define CONFIG_D3D11VA 0
%define CONFIG_DXVA2 0
-%define CONFIG_NVENC 1
+%define CONFIG_FFNVCODEC 0
+%define CONFIG_NVDEC 0
+%define CONFIG_NVENC 0
%define CONFIG_VAAPI 0
-%define CONFIG_VDA 0
%define CONFIG_VDPAU 0
%define CONFIG_VIDEOTOOLBOX 0
%define CONFIG_V4L2_M2M 1
%define CONFIG_XVMC 0
-%define CONFIG_CUDA_SDK 0
-%define CONFIG_LIBNPP 0
-%define CONFIG_LIBMFX 0
-%define CONFIG_MMAL 0
-%define CONFIG_OMX 0
%define CONFIG_FTRAPV 0
%define CONFIG_GRAY 0
%define CONFIG_HARDCODED_TABLES 0
@@ -527,7 +523,6 @@
%define CONFIG_SWSCALE 0
%define CONFIG_FFPLAY 0
%define CONFIG_FFPROBE 0
-%define CONFIG_FFSERVER 0
%define CONFIG_FFMPEG 0
%define CONFIG_DCT 0
%define CONFIG_DWT 0
@@ -553,31 +548,27 @@
%define CONFIG_XMM_CLOBBER_TEST 0
%define CONFIG_BSFS 1
%define CONFIG_DECODERS 1
-%define CONFIG_ENCODERS 0
-%define CONFIG_HWACCELS 0
%define CONFIG_PARSERS 1
-%define CONFIG_INDEVS 0
-%define CONFIG_OUTDEVS 0
-%define CONFIG_FILTERS 0
-%define CONFIG_DEMUXERS 0
-%define CONFIG_MUXERS 0
-%define CONFIG_PROTOCOLS 0
%define CONFIG_AANDCTTABLES 0
%define CONFIG_AC3DSP 0
+%define CONFIG_ADTS_HEADER 0
%define CONFIG_AUDIO_FRAME_QUEUE 0
%define CONFIG_AUDIODSP 0
%define CONFIG_BLOCKDSP 0
%define CONFIG_BSWAPDSP 0
%define CONFIG_CABAC 0
+%define CONFIG_CBS 0
+%define CONFIG_CBS_H264 0
+%define CONFIG_CBS_H265 0
+%define CONFIG_CBS_MPEG2 0
%define CONFIG_DIRAC_PARSE 0
%define CONFIG_DVPROFILE 0
%define CONFIG_EXIF 0
-%define CONFIG_FAANDCT 0
-%define CONFIG_FAANIDCT 0
-%define CONFIG_FDCTDSP 0
+%define CONFIG_FAANDCT 1
+%define CONFIG_FAANIDCT 1
+%define CONFIG_FDCTDSP 1
%define CONFIG_FLACDSP 1
%define CONFIG_FMTCONVERT 0
-%define CONFIG_FRAME_THREAD_ENCODER 0
%define CONFIG_G722DSP 0
%define CONFIG_GOLOMB 0
%define CONFIG_GPLV3 0
@@ -592,7 +583,7 @@
%define CONFIG_HUFFMAN 0
%define CONFIG_HUFFYUVDSP 0
%define CONFIG_HUFFYUVENCDSP 0
-%define CONFIG_IDCTDSP 0
+%define CONFIG_IDCTDSP 1
%define CONFIG_IIRFILTER 0
%define CONFIG_MDCT15 0
%define CONFIG_INTRAX8 0
@@ -619,6 +610,7 @@
%define CONFIG_QSV 0
%define CONFIG_QSVDEC 0
%define CONFIG_QSVENC 0
+%define CONFIG_QSVVPP 0
%define CONFIG_RANGECODER 0
%define CONFIG_RIFFDEC 0
%define CONFIG_RIFFENC 0
@@ -642,9 +634,9 @@
%define CONFIG_WMA_FREQS 0
%define CONFIG_WMV2DSP 0
%define CONFIG_NULL_BSF 1
+%define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
%define CONFIG_VP8_DECODER 1
%define CONFIG_VP9_DECODER 1
%define CONFIG_FLAC_DECODER 1
-%define CONFIG_FLAC_PARSER 0
%define CONFIG_VP8_PARSER 1
%define CONFIG_VP9_PARSER 1
diff --git a/media/ffvpx/config_unix64.h b/media/ffvpx/config_unix64.h
index 97bc765f8..1c5c4cb4c 100644
--- a/media/ffvpx/config_unix64.h
+++ b/media/ffvpx/config_unix64.h
@@ -1,9 +1,9 @@
/* Automatically generated by configure - do not modify! */
#ifndef FFMPEG_CONFIG_H
#define FFMPEG_CONFIG_H
-#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vda --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm"
+#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm --disable-cuda --disable-cuvid"
#define FFMPEG_LICENSE "LGPL version 2.1 or later"
-#define CONFIG_THIS_YEAR 2017
+#define CONFIG_THIS_YEAR 2018
#define FFMPEG_DATADIR "/usr/local/share/ffmpeg"
#define AVCONV_DATADIR "/usr/local/share/ffmpeg"
#define CC_IDENT "gcc 6.3.0 (Ubuntu 6.3.0-12ubuntu2) 20170406"
@@ -57,6 +57,7 @@
#define HAVE_AMD3DNOWEXT 1
#define HAVE_AVX 1
#define HAVE_AVX2 1
+#define HAVE_AVX512 1
#define HAVE_FMA3 1
#define HAVE_FMA4 1
#define HAVE_MMX 1
@@ -101,6 +102,7 @@
#define HAVE_AMD3DNOWEXT_EXTERNAL 1
#define HAVE_AVX_EXTERNAL 1
#define HAVE_AVX2_EXTERNAL 1
+#define HAVE_AVX512_EXTERNAL 1
#define HAVE_FMA3_EXTERNAL 1
#define HAVE_FMA4_EXTERNAL 1
#define HAVE_MMX_EXTERNAL 1
@@ -145,6 +147,7 @@
#define HAVE_AMD3DNOWEXT_INLINE 1
#define HAVE_AVX_INLINE 1
#define HAVE_AVX2_INLINE 1
+#define HAVE_AVX512_INLINE 1
#define HAVE_FMA3_INLINE 1
#define HAVE_FMA4_INLINE 1
#define HAVE_MMX_INLINE 1
@@ -174,20 +177,15 @@
#define HAVE_FAST_64BIT 1
#define HAVE_FAST_CLZ 1
#define HAVE_FAST_CMOV 1
-#define HAVE_LOCAL_ALIGNED_8 1
-#define HAVE_LOCAL_ALIGNED_16 1
-#define HAVE_LOCAL_ALIGNED_32 1
+#define HAVE_LOCAL_ALIGNED 1
#define HAVE_SIMD_ALIGN_16 1
#define HAVE_SIMD_ALIGN_32 1
-#define HAVE_ATOMICS_GCC 1
-#define HAVE_ATOMICS_SUNCC 0
-#define HAVE_ATOMICS_WIN32 0
+#define HAVE_SIMD_ALIGN_64 1
#define HAVE_ATOMIC_CAS_PTR 0
#define HAVE_MACHINE_RW_BARRIER 0
#define HAVE_MEMORYBARRIER 0
#define HAVE_MM_EMPTY 1
#define HAVE_RDTSC 0
-#define HAVE_SARESTART 1
#define HAVE_SEM_TIMEDWAIT 1
#define HAVE_SYNC_VAL_COMPARE_AND_SWAP 1
#define HAVE_CABS 1
@@ -197,13 +195,11 @@
#define HAVE_X86ASM 1
#define HAVE_BIGENDIAN 0
#define HAVE_FAST_UNALIGNED 1
-#define HAVE_ALTIVEC_H 0
#define HAVE_ARPA_INET_H 1
#define HAVE_ASM_TYPES_H 1
#define HAVE_CDIO_PARANOIA_H 0
#define HAVE_CDIO_PARANOIA_PARANOIA_H 0
#define HAVE_CUDA_H 0
-#define HAVE_D3D11_H 0
#define HAVE_DISPATCH_DISPATCH_H 0
#define HAVE_DEV_BKTR_IOCTL_BT848_H 0
#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -212,26 +208,17 @@
#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
#define HAVE_DIRECT_H 0
#define HAVE_DIRENT_H 1
-#define HAVE_DLFCN_H 1
#define HAVE_DXGIDEBUG_H 0
#define HAVE_DXVA_H 0
#define HAVE_ES2_GL_H 0
#define HAVE_GSM_H 0
#define HAVE_IO_H 0
-#define HAVE_MACH_MACH_TIME_H 0
+#define HAVE_LINUX_PERF_EVENT_H 0
#define HAVE_MACHINE_IOCTL_BT848_H 0
#define HAVE_MACHINE_IOCTL_METEOR_H 0
#define HAVE_OPENCV2_CORE_CORE_C_H 0
-#define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-#define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
#define HAVE_OPENGL_GL3_H 0
#define HAVE_POLL_H 1
-#define HAVE_SOUNDCARD_H 0
-#define HAVE_STDATOMIC_H 1
-#define HAVE_SYS_MMAN_H 1
#define HAVE_SYS_PARAM_H 1
#define HAVE_SYS_RESOURCE_H 1
#define HAVE_SYS_SELECT_H 1
@@ -275,16 +262,19 @@
#define HAVE_SINF 1
#define HAVE_TRUNC 1
#define HAVE_TRUNCF 1
+#define HAVE_DOS_PATHS 0
+#define HAVE_LIBC_MSVCRT 0
+#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_SECTION_DATA_REL_RO 0
+#define HAVE_THREADS 1
+#define HAVE_UWP 0
+#define HAVE_WINRT 0
#define HAVE_ACCESS 1
#define HAVE_ALIGNED_MALLOC 0
#define HAVE_CLOCK_GETTIME 1
#define HAVE_CLOSESOCKET 0
#define HAVE_COMMANDLINETOARGVW 0
-#define HAVE_COTASKMEMFREE 0
-#define HAVE_CRYPTGENRANDOM 0
#define HAVE_FCNTL 1
-#define HAVE_FLT_LIM 1
-#define HAVE_FORK 1
#define HAVE_GETADDRINFO 1
#define HAVE_GETHRTIME 0
#define HAVE_GETOPT 1
@@ -299,9 +289,7 @@
#define HAVE_GMTIME_R 1
#define HAVE_INET_ATON 1
#define HAVE_ISATTY 1
-#define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
#define HAVE_KBHIT 0
-#define HAVE_LOADLIBRARY 0
#define HAVE_LSTAT 1
#define HAVE_LZO1X_999_COMPRESS 0
#define HAVE_MACH_ABSOLUTE_TIME 0
@@ -313,6 +301,7 @@
#define HAVE_PEEKNAMEDPIPE 0
#define HAVE_PTHREAD_CANCEL 1
#define HAVE_SCHED_GETAFFINITY 1
+#define HAVE_SECITEMIMPORT 0
#define HAVE_SETCONSOLETEXTATTRIBUTE 0
#define HAVE_SETCONSOLECTRLHANDLER 0
#define HAVE_SETMODE 0
@@ -325,16 +314,19 @@
#define HAVE_UTGETOSTYPEFROMSTRING 0
#define HAVE_VIRTUALALLOC 0
#define HAVE_WGLGETPROCADDRESS 0
+#define HAVE_BCRYPT 0
+#define HAVE_VAAPI_DRM 0
+#define HAVE_VAAPI_X11 0
+#define HAVE_VDPAU_X11 0
#define HAVE_PTHREADS 1
#define HAVE_OS2THREADS 0
#define HAVE_W32THREADS 0
+#define HAVE_AS_ARCH_DIRECTIVE 0
#define HAVE_AS_DN_DIRECTIVE 0
#define HAVE_AS_FPU_DIRECTIVE 0
#define HAVE_AS_FUNC 0
#define HAVE_AS_OBJECT_ARCH 0
#define HAVE_ASM_MOD_Q 0
-#define HAVE_ATTRIBUTE_MAY_ALIAS 1
-#define HAVE_ATTRIBUTE_PACKED 1
#define HAVE_BLOCKS_EXTENSION 0
#define HAVE_EBP_AVAILABLE 1
#define HAVE_EBX_AVAILABLE 1
@@ -351,7 +343,6 @@
#define HAVE_VFP_ARGS 0
#define HAVE_XFORM_ASM 0
#define HAVE_XMM_CLOBBERS 1
-#define HAVE_CONDITION_VARIABLE_PTR 0
#define HAVE_KCMVIDEOCODECTYPE_HEVC 0
#define HAVE_SOCKLEN_T 1
#define HAVE_STRUCT_ADDRINFO 1
@@ -367,27 +358,22 @@
#define HAVE_STRUCT_SOCKADDR_STORAGE 1
#define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 1
#define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 1
-#define HAVE_ATOMICS_NATIVE 1
-#define HAVE_DOS_PATHS 0
-#define HAVE_LIBC_MSVCRT 0
#define HAVE_MAKEINFO 0
#define HAVE_MAKEINFO_HTML 0
-#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_OPENCL_D3D11 0
+#define HAVE_OPENCL_DRM_ARM 0
+#define HAVE_OPENCL_DRM_BEIGNET 0
+#define HAVE_OPENCL_DXVA2 0
+#define HAVE_OPENCL_VAAPI_BEIGNET 0
+#define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
#define HAVE_PERL 1
#define HAVE_POD2MAN 1
-#define HAVE_SECTION_DATA_REL_RO 1
-#define HAVE_TEXI2HTML 0
-#define HAVE_THREADS 1
-#define HAVE_UWP 0
-#define HAVE_VAAPI_DRM 0
-#define HAVE_VAAPI_X11 0
-#define HAVE_VDPAU_X11 0
-#define HAVE_WINRT 0
+#define HAVE_TEXI2HTML 1
#define CONFIG_DOC 0
-#define CONFIG_HTMLPAGES 0
+#define CONFIG_HTMLPAGES 1
#define CONFIG_MANPAGES 1
#define CONFIG_PODPAGES 1
-#define CONFIG_TXTPAGES 0
+#define CONFIG_TXTPAGES 1
#define CONFIG_AVIO_DIR_CMD_EXAMPLE 1
#define CONFIG_AVIO_READING_EXAMPLE 1
#define CONFIG_DECODE_AUDIO_EXAMPLE 1
@@ -409,24 +395,8 @@
#define CONFIG_SCALING_VIDEO_EXAMPLE 0
#define CONFIG_TRANSCODE_AAC_EXAMPLE 0
#define CONFIG_TRANSCODING_EXAMPLE 0
-#define CONFIG_ALSA 1
-#define CONFIG_APPKIT 0
-#define CONFIG_AVFOUNDATION 0
-#define CONFIG_BZLIB 0
-#define CONFIG_COREIMAGE 0
-#define CONFIG_ICONV 0
-#define CONFIG_JACK 0
-#define CONFIG_LIBXCB 0
-#define CONFIG_LIBXCB_SHM 0
-#define CONFIG_LIBXCB_SHAPE 0
-#define CONFIG_LIBXCB_XFIXES 0
-#define CONFIG_LZMA 1
-#define CONFIG_SCHANNEL 0
-#define CONFIG_SDL2 0
-#define CONFIG_SECURETRANSPORT 0
-#define CONFIG_SNDIO 0
-#define CONFIG_XLIB 1
-#define CONFIG_ZLIB 1
+#define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+#define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
#define CONFIG_AVISYNTH 0
#define CONFIG_FREI0R 0
#define CONFIG_LIBCDIO 0
@@ -440,9 +410,11 @@
#define CONFIG_LIBNDI_NEWTEK 0
#define CONFIG_LIBFDK_AAC 0
#define CONFIG_OPENSSL 0
+#define CONFIG_LIBTLS 0
#define CONFIG_GMP 0
#define CONFIG_LIBOPENCORE_AMRNB 0
#define CONFIG_LIBOPENCORE_AMRWB 0
+#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVO_AMRWBENC 0
#define CONFIG_RKMPP 0
#define CONFIG_LIBSMBCLIENT 0
@@ -451,11 +423,13 @@
#define CONFIG_GNUTLS 0
#define CONFIG_JNI 0
#define CONFIG_LADSPA 0
+#define CONFIG_LIBAOM 0
#define CONFIG_LIBASS 0
#define CONFIG_LIBBLURAY 0
#define CONFIG_LIBBS2B 0
#define CONFIG_LIBCACA 0
#define CONFIG_LIBCELT 0
+#define CONFIG_LIBCODEC2 0
#define CONFIG_LIBDC1394 0
#define CONFIG_LIBDRM 0
#define CONFIG_LIBFLITE 0
@@ -466,6 +440,7 @@
#define CONFIG_LIBGSM 0
#define CONFIG_LIBIEC61883 0
#define CONFIG_LIBILBC 0
+#define CONFIG_LIBJACK 0
#define CONFIG_LIBKVAZAAR 0
#define CONFIG_LIBMODPLUG 0
#define CONFIG_LIBMP3LAME 0
@@ -483,12 +458,12 @@
#define CONFIG_LIBSNAPPY 0
#define CONFIG_LIBSOXR 0
#define CONFIG_LIBSPEEX 0
+#define CONFIG_LIBSRT 0
#define CONFIG_LIBSSH 0
#define CONFIG_LIBTESSERACT 0
#define CONFIG_LIBTHEORA 0
#define CONFIG_LIBTWOLAME 0
#define CONFIG_LIBV4L2 0
-#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVORBIS 0
#define CONFIG_LIBVPX 0
#define CONFIG_LIBWAVPACK 0
@@ -497,28 +472,48 @@
#define CONFIG_LIBZIMG 0
#define CONFIG_LIBZMQ 0
#define CONFIG_LIBZVBI 0
+#define CONFIG_LV2 0
#define CONFIG_MEDIACODEC 0
#define CONFIG_OPENAL 0
-#define CONFIG_OPENCL 0
#define CONFIG_OPENGL 0
-#define CONFIG_AUDIOTOOLBOX 0
+#define CONFIG_ALSA 0
+#define CONFIG_APPKIT 1
+#define CONFIG_AVFOUNDATION 1
+#define CONFIG_BZLIB 1
+#define CONFIG_COREIMAGE 0
+#define CONFIG_ICONV 0
+#define CONFIG_LIBXCB 0
+#define CONFIG_LIBXCB_SHM 0
+#define CONFIG_LIBXCB_SHAPE 0
+#define CONFIG_LIBXCB_XFIXES 0
+#define CONFIG_LZMA 1
+#define CONFIG_SCHANNEL 0
+#define CONFIG_SDL2 0
+#define CONFIG_SECURETRANSPORT 0
+#define CONFIG_SNDIO 0
+#define CONFIG_XLIB 1
+#define CONFIG_ZLIB 1
+#define CONFIG_CUDA_SDK 0
+#define CONFIG_LIBNPP 0
+#define CONFIG_LIBMFX 0
+#define CONFIG_MMAL 0
+#define CONFIG_OMX 0
+#define CONFIG_OPENCL 0
+#define CONFIG_AMF 0
+#define CONFIG_AUDIOTOOLBOX 1
#define CONFIG_CRYSTALHD 0
-#define CONFIG_CUDA 1
-#define CONFIG_CUVID 1
+#define CONFIG_CUDA 0
+#define CONFIG_CUVID 0
#define CONFIG_D3D11VA 0
#define CONFIG_DXVA2 0
-#define CONFIG_NVENC 1
+#define CONFIG_FFNVCODEC 0
+#define CONFIG_NVDEC 0
+#define CONFIG_NVENC 0
#define CONFIG_VAAPI 0
-#define CONFIG_VDA 0
#define CONFIG_VDPAU 0
#define CONFIG_VIDEOTOOLBOX 0
#define CONFIG_V4L2_M2M 1
#define CONFIG_XVMC 0
-#define CONFIG_CUDA_SDK 0
-#define CONFIG_LIBNPP 0
-#define CONFIG_LIBMFX 0
-#define CONFIG_MMAL 0
-#define CONFIG_OMX 0
#define CONFIG_FTRAPV 0
#define CONFIG_GRAY 0
#define CONFIG_HARDCODED_TABLES 0
@@ -543,7 +538,6 @@
#define CONFIG_SWSCALE 0
#define CONFIG_FFPLAY 0
#define CONFIG_FFPROBE 0
-#define CONFIG_FFSERVER 0
#define CONFIG_FFMPEG 0
#define CONFIG_DCT 0
#define CONFIG_DWT 0
@@ -569,31 +563,27 @@
#define CONFIG_XMM_CLOBBER_TEST 0
#define CONFIG_BSFS 1
#define CONFIG_DECODERS 1
-#define CONFIG_ENCODERS 0
-#define CONFIG_HWACCELS 0
#define CONFIG_PARSERS 1
-#define CONFIG_INDEVS 0
-#define CONFIG_OUTDEVS 0
-#define CONFIG_FILTERS 0
-#define CONFIG_DEMUXERS 0
-#define CONFIG_MUXERS 0
-#define CONFIG_PROTOCOLS 0
#define CONFIG_AANDCTTABLES 0
#define CONFIG_AC3DSP 0
+#define CONFIG_ADTS_HEADER 0
#define CONFIG_AUDIO_FRAME_QUEUE 0
#define CONFIG_AUDIODSP 0
#define CONFIG_BLOCKDSP 0
#define CONFIG_BSWAPDSP 0
#define CONFIG_CABAC 0
+#define CONFIG_CBS 0
+#define CONFIG_CBS_H264 0
+#define CONFIG_CBS_H265 0
+#define CONFIG_CBS_MPEG2 0
#define CONFIG_DIRAC_PARSE 0
#define CONFIG_DVPROFILE 0
#define CONFIG_EXIF 0
-#define CONFIG_FAANDCT 0
-#define CONFIG_FAANIDCT 0
-#define CONFIG_FDCTDSP 0
+#define CONFIG_FAANDCT 1
+#define CONFIG_FAANIDCT 1
+#define CONFIG_FDCTDSP 1
#define CONFIG_FLACDSP 1
#define CONFIG_FMTCONVERT 0
-#define CONFIG_FRAME_THREAD_ENCODER 0
#define CONFIG_G722DSP 0
#define CONFIG_GOLOMB 0
#define CONFIG_GPLV3 0
@@ -608,7 +598,7 @@
#define CONFIG_HUFFMAN 0
#define CONFIG_HUFFYUVDSP 0
#define CONFIG_HUFFYUVENCDSP 0
-#define CONFIG_IDCTDSP 0
+#define CONFIG_IDCTDSP 1
#define CONFIG_IIRFILTER 0
#define CONFIG_MDCT15 0
#define CONFIG_INTRAX8 0
@@ -635,6 +625,7 @@
#define CONFIG_QSV 0
#define CONFIG_QSVDEC 0
#define CONFIG_QSVENC 0
+#define CONFIG_QSVVPP 0
#define CONFIG_RANGECODER 0
#define CONFIG_RIFFDEC 0
#define CONFIG_RIFFENC 0
@@ -658,10 +649,10 @@
#define CONFIG_WMA_FREQS 0
#define CONFIG_WMV2DSP 0
#define CONFIG_NULL_BSF 1
+#define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
#define CONFIG_VP8_DECODER 1
#define CONFIG_VP9_DECODER 1
#define CONFIG_FLAC_DECODER 1
-#define CONFIG_FLAC_PARSER 0
#define CONFIG_VP8_PARSER 1
#define CONFIG_VP9_PARSER 1
#endif /* FFMPEG_CONFIG_H */
diff --git a/media/ffvpx/config_win32.asm b/media/ffvpx/config_win32.asm
index c804c0155..3a3566d6f 100644
--- a/media/ffvpx/config_win32.asm
+++ b/media/ffvpx/config_win32.asm
@@ -1,3 +1,4 @@
+; Automatically generated by configure - do not modify!
%define ARCH_AARCH64 0
%define ARCH_ALPHA 0
%define ARCH_ARM 0
@@ -41,6 +42,7 @@
%define HAVE_AMD3DNOWEXT 1
%define HAVE_AVX 1
%define HAVE_AVX2 1
+%define HAVE_AVX512 1
%define HAVE_FMA3 1
%define HAVE_FMA4 1
%define HAVE_MMX 1
@@ -85,6 +87,7 @@
%define HAVE_AMD3DNOWEXT_EXTERNAL 1
%define HAVE_AVX_EXTERNAL 1
%define HAVE_AVX2_EXTERNAL 1
+%define HAVE_AVX512_EXTERNAL 0
%define HAVE_FMA3_EXTERNAL 1
%define HAVE_FMA4_EXTERNAL 1
%define HAVE_MMX_EXTERNAL 1
@@ -129,6 +132,7 @@
%define HAVE_AMD3DNOWEXT_INLINE 0
%define HAVE_AVX_INLINE 0
%define HAVE_AVX2_INLINE 0
+%define HAVE_AVX512_INLINE 0
%define HAVE_FMA3_INLINE 0
%define HAVE_FMA4_INLINE 0
%define HAVE_MMX_INLINE 0
@@ -156,22 +160,17 @@
%define HAVE_MMI_INLINE 0
%define HAVE_ALIGNED_STACK 0
%define HAVE_FAST_64BIT 0
-%define HAVE_FAST_CLZ 1
+%define HAVE_FAST_CLZ 0
%define HAVE_FAST_CMOV 0
-%define HAVE_LOCAL_ALIGNED_8 1
-%define HAVE_LOCAL_ALIGNED_16 1
-%define HAVE_LOCAL_ALIGNED_32 1
+%define HAVE_LOCAL_ALIGNED 1
%define HAVE_SIMD_ALIGN_16 1
%define HAVE_SIMD_ALIGN_32 1
-%define HAVE_ATOMICS_GCC 0
-%define HAVE_ATOMICS_SUNCC 0
-%define HAVE_ATOMICS_WIN32 1
+%define HAVE_SIMD_ALIGN_64 1
%define HAVE_ATOMIC_CAS_PTR 0
%define HAVE_MACHINE_RW_BARRIER 0
%define HAVE_MEMORYBARRIER 1
%define HAVE_MM_EMPTY 1
%define HAVE_RDTSC 1
-%define HAVE_SARESTART 0
%define HAVE_SEM_TIMEDWAIT 0
%define HAVE_SYNC_VAL_COMPARE_AND_SWAP 0
%define HAVE_CABS 0
@@ -181,13 +180,11 @@
%define HAVE_X86ASM 1
%define HAVE_BIGENDIAN 0
%define HAVE_FAST_UNALIGNED 1
-%define HAVE_ALTIVEC_H 0
%define HAVE_ARPA_INET_H 0
%define HAVE_ASM_TYPES_H 0
%define HAVE_CDIO_PARANOIA_H 0
%define HAVE_CDIO_PARANOIA_PARANOIA_H 0
%define HAVE_CUDA_H 0
-%define HAVE_D3D11_H 1
%define HAVE_DISPATCH_DISPATCH_H 0
%define HAVE_DEV_BKTR_IOCTL_BT848_H 0
%define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -196,26 +193,17 @@
%define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
%define HAVE_DIRECT_H 1
%define HAVE_DIRENT_H 0
-%define HAVE_DLFCN_H 0
%define HAVE_DXGIDEBUG_H 1
%define HAVE_DXVA_H 1
%define HAVE_ES2_GL_H 0
%define HAVE_GSM_H 0
%define HAVE_IO_H 1
-%define HAVE_MACH_MACH_TIME_H 0
+%define HAVE_LINUX_PERF_EVENT_H 0
%define HAVE_MACHINE_IOCTL_BT848_H 0
%define HAVE_MACHINE_IOCTL_METEOR_H 0
%define HAVE_OPENCV2_CORE_CORE_C_H 0
-%define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-%define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
%define HAVE_OPENGL_GL3_H 0
%define HAVE_POLL_H 0
-%define HAVE_SOUNDCARD_H 0
-%define HAVE_STDATOMIC_H 0
-%define HAVE_SYS_MMAN_H 0
%define HAVE_SYS_PARAM_H 0
%define HAVE_SYS_RESOURCE_H 0
%define HAVE_SYS_SELECT_H 0
@@ -259,16 +247,19 @@
%define HAVE_SINF 1
%define HAVE_TRUNC 1
%define HAVE_TRUNCF 1
+%define HAVE_DOS_PATHS 1
+%define HAVE_LIBC_MSVCRT 1
+%define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+%define HAVE_SECTION_DATA_REL_RO 0
+%define HAVE_THREADS 1
+%define HAVE_UWP 0
+%define HAVE_WINRT 0
%define HAVE_ACCESS 1
%define HAVE_ALIGNED_MALLOC 1
%define HAVE_CLOCK_GETTIME 0
%define HAVE_CLOSESOCKET 1
%define HAVE_COMMANDLINETOARGVW 1
-%define HAVE_COTASKMEMFREE 1
-%define HAVE_CRYPTGENRANDOM 1
%define HAVE_FCNTL 0
-%define HAVE_FLT_LIM 1
-%define HAVE_FORK 0
%define HAVE_GETADDRINFO 1
%define HAVE_GETHRTIME 0
%define HAVE_GETOPT 0
@@ -283,9 +274,7 @@
%define HAVE_GMTIME_R 0
%define HAVE_INET_ATON 0
%define HAVE_ISATTY 1
-%define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
%define HAVE_KBHIT 1
-%define HAVE_LOADLIBRARY 1
%define HAVE_LSTAT 0
%define HAVE_LZO1X_999_COMPRESS 0
%define HAVE_MACH_ABSOLUTE_TIME 0
@@ -297,6 +286,7 @@
%define HAVE_PEEKNAMEDPIPE 1
%define HAVE_PTHREAD_CANCEL 0
%define HAVE_SCHED_GETAFFINITY 0
+%define HAVE_SECITEMIMPORT 0
%define HAVE_SETCONSOLETEXTATTRIBUTE 1
%define HAVE_SETCONSOLECTRLHANDLER 1
%define HAVE_SETMODE 1
@@ -309,16 +299,19 @@
%define HAVE_UTGETOSTYPEFROMSTRING 0
%define HAVE_VIRTUALALLOC 1
%define HAVE_WGLGETPROCADDRESS 0
+%define HAVE_BCRYPT 1
+%define HAVE_VAAPI_DRM 0
+%define HAVE_VAAPI_X11 0
+%define HAVE_VDPAU_X11 0
%define HAVE_PTHREADS 0
%define HAVE_OS2THREADS 0
%define HAVE_W32THREADS 1
+%define HAVE_AS_ARCH_DIRECTIVE 0
%define HAVE_AS_DN_DIRECTIVE 0
%define HAVE_AS_FPU_DIRECTIVE 0
%define HAVE_AS_FUNC 0
%define HAVE_AS_OBJECT_ARCH 0
%define HAVE_ASM_MOD_Q 0
-%define HAVE_ATTRIBUTE_MAY_ALIAS 0
-%define HAVE_ATTRIBUTE_PACKED 0
%define HAVE_BLOCKS_EXTENSION 0
%define HAVE_EBP_AVAILABLE 0
%define HAVE_EBX_AVAILABLE 0
@@ -335,7 +328,6 @@
%define HAVE_VFP_ARGS 0
%define HAVE_XFORM_ASM 0
%define HAVE_XMM_CLOBBERS 0
-%define HAVE_CONDITION_VARIABLE_PTR 1
%define HAVE_KCMVIDEOCODECTYPE_HEVC 0
%define HAVE_SOCKLEN_T 1
%define HAVE_STRUCT_ADDRINFO 1
@@ -343,7 +335,7 @@
%define HAVE_STRUCT_IP_MREQ_SOURCE 1
%define HAVE_STRUCT_IPV6_MREQ 1
%define HAVE_STRUCT_MSGHDR_MSG_FLAGS 0
-%define HAVE_STRUCT_POLLFD 0
+%define HAVE_STRUCT_POLLFD 1
%define HAVE_STRUCT_RUSAGE_RU_MAXRSS 0
%define HAVE_STRUCT_SCTP_EVENT_SUBSCRIBE 0
%define HAVE_STRUCT_SOCKADDR_IN6 1
@@ -351,22 +343,17 @@
%define HAVE_STRUCT_SOCKADDR_STORAGE 1
%define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 0
%define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 0
-%define HAVE_ATOMICS_NATIVE 1
-%define HAVE_DOS_PATHS 1
-%define HAVE_LIBC_MSVCRT 1
%define HAVE_MAKEINFO 1
%define HAVE_MAKEINFO_HTML 0
-%define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+%define HAVE_OPENCL_D3D11 0
+%define HAVE_OPENCL_DRM_ARM 0
+%define HAVE_OPENCL_DRM_BEIGNET 0
+%define HAVE_OPENCL_DXVA2 0
+%define HAVE_OPENCL_VAAPI_BEIGNET 0
+%define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
%define HAVE_PERL 1
%define HAVE_POD2MAN 1
-%define HAVE_SECTION_DATA_REL_RO 0
%define HAVE_TEXI2HTML 0
-%define HAVE_THREADS 1
-%define HAVE_UWP 0
-%define HAVE_VAAPI_DRM 0
-%define HAVE_VAAPI_X11 0
-%define HAVE_VDPAU_X11 0
-%define HAVE_WINRT 0
%define CONFIG_DOC 0
%define CONFIG_HTMLPAGES 0
%define CONFIG_MANPAGES 1
@@ -393,27 +380,12 @@
%define CONFIG_SCALING_VIDEO_EXAMPLE 0
%define CONFIG_TRANSCODE_AAC_EXAMPLE 0
%define CONFIG_TRANSCODING_EXAMPLE 0
-%define CONFIG_ALSA 0
-%define CONFIG_APPKIT 0
-%define CONFIG_AVFOUNDATION 0
-%define CONFIG_BZLIB 0
-%define CONFIG_COREIMAGE 0
-%define CONFIG_ICONV 0
-%define CONFIG_JACK 0
-%define CONFIG_LIBXCB 0
-%define CONFIG_LIBXCB_SHM 0
-%define CONFIG_LIBXCB_SHAPE 0
-%define CONFIG_LIBXCB_XFIXES 0
-%define CONFIG_LZMA 0
-%define CONFIG_SCHANNEL 1
-%define CONFIG_SDL2 0
-%define CONFIG_SECURETRANSPORT 0
-%define CONFIG_SNDIO 0
-%define CONFIG_XLIB 1
-%define CONFIG_ZLIB 0
+%define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+%define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
%define CONFIG_AVISYNTH 0
%define CONFIG_FREI0R 0
%define CONFIG_LIBCDIO 0
+%define CONFIG_LIBDAVS2 0
%define CONFIG_LIBRUBBERBAND 0
%define CONFIG_LIBVIDSTAB 0
%define CONFIG_LIBX264 0
@@ -424,10 +396,14 @@
%define CONFIG_LIBNDI_NEWTEK 0
%define CONFIG_LIBFDK_AAC 0
%define CONFIG_OPENSSL 0
+%define CONFIG_LIBTLS 0
%define CONFIG_GMP 0
+%define CONFIG_LIBLENSFUN 0
%define CONFIG_LIBOPENCORE_AMRNB 0
%define CONFIG_LIBOPENCORE_AMRWB 0
+%define CONFIG_LIBVMAF 0
%define CONFIG_LIBVO_AMRWBENC 0
+%define CONFIG_MBEDTLS 0
%define CONFIG_RKMPP 0
%define CONFIG_LIBSMBCLIENT 0
%define CONFIG_CHROMAPRINT 0
@@ -435,11 +411,13 @@
%define CONFIG_GNUTLS 0
%define CONFIG_JNI 0
%define CONFIG_LADSPA 0
+%define CONFIG_LIBAOM 0
%define CONFIG_LIBASS 0
%define CONFIG_LIBBLURAY 0
%define CONFIG_LIBBS2B 0
%define CONFIG_LIBCACA 0
%define CONFIG_LIBCELT 0
+%define CONFIG_LIBCODEC2 0
%define CONFIG_LIBDC1394 0
%define CONFIG_LIBDRM 0
%define CONFIG_LIBFLITE 0
@@ -450,6 +428,7 @@
%define CONFIG_LIBGSM 0
%define CONFIG_LIBIEC61883 0
%define CONFIG_LIBILBC 0
+%define CONFIG_LIBJACK 0
%define CONFIG_LIBKVAZAAR 0
%define CONFIG_LIBMODPLUG 0
%define CONFIG_LIBMP3LAME 0
@@ -467,12 +446,13 @@
%define CONFIG_LIBSNAPPY 0
%define CONFIG_LIBSOXR 0
%define CONFIG_LIBSPEEX 0
+%define CONFIG_LIBSRT 0
%define CONFIG_LIBSSH 0
+%define CONFIG_LIBTENSORFLOW 0
%define CONFIG_LIBTESSERACT 0
%define CONFIG_LIBTHEORA 0
%define CONFIG_LIBTWOLAME 0
%define CONFIG_LIBV4L2 0
-%define CONFIG_LIBVMAF 0
%define CONFIG_LIBVORBIS 0
%define CONFIG_LIBVPX 0
%define CONFIG_LIBWAVPACK 0
@@ -481,28 +461,49 @@
%define CONFIG_LIBZIMG 0
%define CONFIG_LIBZMQ 0
%define CONFIG_LIBZVBI 0
+%define CONFIG_LV2 0
%define CONFIG_MEDIACODEC 0
%define CONFIG_OPENAL 0
-%define CONFIG_OPENCL 0
%define CONFIG_OPENGL 0
+%define CONFIG_VAPOURSYNTH 0
+%define CONFIG_ALSA 0
+%define CONFIG_APPKIT 0
+%define CONFIG_AVFOUNDATION 0
+%define CONFIG_BZLIB 0
+%define CONFIG_COREIMAGE 0
+%define CONFIG_ICONV 0
+%define CONFIG_LIBXCB 0
+%define CONFIG_LIBXCB_SHM 0
+%define CONFIG_LIBXCB_SHAPE 0
+%define CONFIG_LIBXCB_XFIXES 0
+%define CONFIG_LZMA 0
+%define CONFIG_SCHANNEL 1
+%define CONFIG_SDL2 0
+%define CONFIG_SECURETRANSPORT 0
+%define CONFIG_SNDIO 0
+%define CONFIG_XLIB 0
+%define CONFIG_ZLIB 0
+%define CONFIG_CUDA_SDK 0
+%define CONFIG_LIBNPP 0
+%define CONFIG_LIBMFX 0
+%define CONFIG_MMAL 0
+%define CONFIG_OMX 0
+%define CONFIG_OPENCL 0
+%define CONFIG_AMF 0
%define CONFIG_AUDIOTOOLBOX 0
%define CONFIG_CRYSTALHD 0
-%define CONFIG_CUDA 1
-%define CONFIG_CUVID 1
+%define CONFIG_CUDA 0
+%define CONFIG_CUVID 0
%define CONFIG_D3D11VA 0
%define CONFIG_DXVA2 0
-%define CONFIG_NVENC 1
+%define CONFIG_FFNVCODEC 0
+%define CONFIG_NVDEC 0
+%define CONFIG_NVENC 0
%define CONFIG_VAAPI 0
-%define CONFIG_VDA 0
%define CONFIG_VDPAU 0
%define CONFIG_VIDEOTOOLBOX 0
%define CONFIG_V4L2_M2M 0
%define CONFIG_XVMC 0
-%define CONFIG_CUDA_SDK 0
-%define CONFIG_LIBNPP 0
-%define CONFIG_LIBMFX 0
-%define CONFIG_MMAL 0
-%define CONFIG_OMX 0
%define CONFIG_FTRAPV 0
%define CONFIG_GRAY 0
%define CONFIG_HARDCODED_TABLES 0
@@ -516,18 +517,17 @@
%define CONFIG_GPL 0
%define CONFIG_NONFREE 0
%define CONFIG_VERSION3 0
-%define CONFIG_AVCODEC 1
%define CONFIG_AVDEVICE 0
%define CONFIG_AVFILTER 0
+%define CONFIG_SWSCALE 0
+%define CONFIG_POSTPROC 0
%define CONFIG_AVFORMAT 0
+%define CONFIG_AVCODEC 1
+%define CONFIG_SWRESAMPLE 0
%define CONFIG_AVRESAMPLE 0
%define CONFIG_AVUTIL 1
-%define CONFIG_POSTPROC 0
-%define CONFIG_SWRESAMPLE 0
-%define CONFIG_SWSCALE 0
%define CONFIG_FFPLAY 0
%define CONFIG_FFPROBE 0
-%define CONFIG_FFSERVER 0
%define CONFIG_FFMPEG 0
%define CONFIG_DCT 0
%define CONFIG_DWT 0
@@ -553,31 +553,29 @@
%define CONFIG_XMM_CLOBBER_TEST 0
%define CONFIG_BSFS 1
%define CONFIG_DECODERS 1
-%define CONFIG_ENCODERS 0
-%define CONFIG_HWACCELS 0
%define CONFIG_PARSERS 1
-%define CONFIG_INDEVS 0
-%define CONFIG_OUTDEVS 0
-%define CONFIG_FILTERS 0
-%define CONFIG_DEMUXERS 0
-%define CONFIG_MUXERS 0
-%define CONFIG_PROTOCOLS 0
%define CONFIG_AANDCTTABLES 0
%define CONFIG_AC3DSP 0
+%define CONFIG_ADTS_HEADER 0
%define CONFIG_AUDIO_FRAME_QUEUE 0
%define CONFIG_AUDIODSP 0
%define CONFIG_BLOCKDSP 0
%define CONFIG_BSWAPDSP 0
%define CONFIG_CABAC 0
+%define CONFIG_CBS 0
+%define CONFIG_CBS_H264 0
+%define CONFIG_CBS_H265 0
+%define CONFIG_CBS_MPEG2 0
+%define CONFIG_CBS_VP9 0
%define CONFIG_DIRAC_PARSE 0
+%define CONFIG_DNN 0
%define CONFIG_DVPROFILE 0
%define CONFIG_EXIF 0
-%define CONFIG_FAANDCT 0
-%define CONFIG_FAANIDCT 0
-%define CONFIG_FDCTDSP 0
+%define CONFIG_FAANDCT 1
+%define CONFIG_FAANIDCT 1
+%define CONFIG_FDCTDSP 1
%define CONFIG_FLACDSP 1
%define CONFIG_FMTCONVERT 0
-%define CONFIG_FRAME_THREAD_ENCODER 0
%define CONFIG_G722DSP 0
%define CONFIG_GOLOMB 0
%define CONFIG_GPLV3 0
@@ -592,7 +590,7 @@
%define CONFIG_HUFFMAN 0
%define CONFIG_HUFFYUVDSP 0
%define CONFIG_HUFFYUVENCDSP 0
-%define CONFIG_IDCTDSP 0
+%define CONFIG_IDCTDSP 1
%define CONFIG_IIRFILTER 0
%define CONFIG_MDCT15 0
%define CONFIG_INTRAX8 0
@@ -619,6 +617,7 @@
%define CONFIG_QSV 0
%define CONFIG_QSVDEC 0
%define CONFIG_QSVENC 0
+%define CONFIG_QSVVPP 0
%define CONFIG_RANGECODER 0
%define CONFIG_RIFFDEC 0
%define CONFIG_RIFFENC 0
@@ -642,9 +641,9 @@
%define CONFIG_WMA_FREQS 0
%define CONFIG_WMV2DSP 0
%define CONFIG_NULL_BSF 1
+%define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
%define CONFIG_VP8_DECODER 1
%define CONFIG_VP9_DECODER 1
%define CONFIG_FLAC_DECODER 1
-%define CONFIG_FLAC_PARSER 0
%define CONFIG_VP8_PARSER 1
%define CONFIG_VP9_PARSER 1
diff --git a/media/ffvpx/config_win32.h b/media/ffvpx/config_win32.h
index 36bb2d3c3..35480e418 100644
--- a/media/ffvpx/config_win32.h
+++ b/media/ffvpx/config_win32.h
@@ -1,12 +1,12 @@
/* Automatically generated by configure - do not modify! */
#ifndef FFMPEG_CONFIG_H
#define FFMPEG_CONFIG_H
-#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm --toolchain=msvc"
+#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm --disable-cuda --disable-cuvid --toolchain=msvc"
#define FFMPEG_LICENSE "LGPL version 2.1 or later"
-#define CONFIG_THIS_YEAR 2017
+#define CONFIG_THIS_YEAR 2018
#define FFMPEG_DATADIR "/usr/local/share/ffmpeg"
#define AVCONV_DATADIR "/usr/local/share/ffmpeg"
-#define CC_IDENT "Microsoft (R) C/C++ Optimizing Compiler Version 19.11.25508.2 for x86"
+#define CC_IDENT "Microsoft (R) C/C++ Optimizing Compiler Version 19.15.26726 for x86"
#define av_restrict __restrict
#define EXTERN_PREFIX "_"
#define EXTERN_ASM _
@@ -57,6 +57,7 @@
#define HAVE_AMD3DNOWEXT 1
#define HAVE_AVX 1
#define HAVE_AVX2 1
+#define HAVE_AVX512 1
#define HAVE_FMA3 1
#define HAVE_FMA4 1
#define HAVE_MMX 1
@@ -101,6 +102,7 @@
#define HAVE_AMD3DNOWEXT_EXTERNAL 1
#define HAVE_AVX_EXTERNAL 1
#define HAVE_AVX2_EXTERNAL 1
+#define HAVE_AVX512_EXTERNAL 0
#define HAVE_FMA3_EXTERNAL 1
#define HAVE_FMA4_EXTERNAL 1
#define HAVE_MMX_EXTERNAL 1
@@ -145,6 +147,7 @@
#define HAVE_AMD3DNOWEXT_INLINE 0
#define HAVE_AVX_INLINE 0
#define HAVE_AVX2_INLINE 0
+#define HAVE_AVX512_INLINE 0
#define HAVE_FMA3_INLINE 0
#define HAVE_FMA4_INLINE 0
#define HAVE_MMX_INLINE 0
@@ -174,20 +177,15 @@
#define HAVE_FAST_64BIT 0
#define HAVE_FAST_CLZ 0
#define HAVE_FAST_CMOV 0
-#define HAVE_LOCAL_ALIGNED_8 1
-#define HAVE_LOCAL_ALIGNED_16 1
-#define HAVE_LOCAL_ALIGNED_32 1
+#define HAVE_LOCAL_ALIGNED 1
#define HAVE_SIMD_ALIGN_16 1
#define HAVE_SIMD_ALIGN_32 1
-#define HAVE_ATOMICS_GCC 0
-#define HAVE_ATOMICS_SUNCC 0
-#define HAVE_ATOMICS_WIN32 1
+#define HAVE_SIMD_ALIGN_64 1
#define HAVE_ATOMIC_CAS_PTR 0
#define HAVE_MACHINE_RW_BARRIER 0
#define HAVE_MEMORYBARRIER 1
#define HAVE_MM_EMPTY 1
#define HAVE_RDTSC 1
-#define HAVE_SARESTART 0
#define HAVE_SEM_TIMEDWAIT 0
#define HAVE_SYNC_VAL_COMPARE_AND_SWAP 0
#define HAVE_CABS 0
@@ -197,13 +195,11 @@
#define HAVE_X86ASM 1
#define HAVE_BIGENDIAN 0
#define HAVE_FAST_UNALIGNED 1
-#define HAVE_ALTIVEC_H 0
#define HAVE_ARPA_INET_H 0
#define HAVE_ASM_TYPES_H 0
#define HAVE_CDIO_PARANOIA_H 0
#define HAVE_CDIO_PARANOIA_PARANOIA_H 0
#define HAVE_CUDA_H 0
-#define HAVE_D3D11_H 1
#define HAVE_DISPATCH_DISPATCH_H 0
#define HAVE_DEV_BKTR_IOCTL_BT848_H 0
#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -212,26 +208,17 @@
#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
#define HAVE_DIRECT_H 1
#define HAVE_DIRENT_H 0
-#define HAVE_DLFCN_H 0
#define HAVE_DXGIDEBUG_H 1
#define HAVE_DXVA_H 1
#define HAVE_ES2_GL_H 0
#define HAVE_GSM_H 0
#define HAVE_IO_H 1
-#define HAVE_MACH_MACH_TIME_H 0
+#define HAVE_LINUX_PERF_EVENT_H 0
#define HAVE_MACHINE_IOCTL_BT848_H 0
#define HAVE_MACHINE_IOCTL_METEOR_H 0
#define HAVE_OPENCV2_CORE_CORE_C_H 0
-#define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-#define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
#define HAVE_OPENGL_GL3_H 0
#define HAVE_POLL_H 0
-#define HAVE_SOUNDCARD_H 0
-#define HAVE_STDATOMIC_H 0
-#define HAVE_SYS_MMAN_H 0
#define HAVE_SYS_PARAM_H 0
#define HAVE_SYS_RESOURCE_H 0
#define HAVE_SYS_SELECT_H 0
@@ -275,16 +262,19 @@
#define HAVE_SINF 1
#define HAVE_TRUNC 1
#define HAVE_TRUNCF 1
+#define HAVE_DOS_PATHS 1
+#define HAVE_LIBC_MSVCRT 1
+#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_SECTION_DATA_REL_RO 0
+#define HAVE_THREADS 1
+#define HAVE_UWP 0
+#define HAVE_WINRT 0
#define HAVE_ACCESS 1
#define HAVE_ALIGNED_MALLOC 1
#define HAVE_CLOCK_GETTIME 0
#define HAVE_CLOSESOCKET 1
#define HAVE_COMMANDLINETOARGVW 1
-#define HAVE_COTASKMEMFREE 1
-#define HAVE_CRYPTGENRANDOM 1
#define HAVE_FCNTL 0
-#define HAVE_FLT_LIM 1
-#define HAVE_FORK 0
#define HAVE_GETADDRINFO 1
#define HAVE_GETHRTIME 0
#define HAVE_GETOPT 0
@@ -299,9 +289,7 @@
#define HAVE_GMTIME_R 0
#define HAVE_INET_ATON 0
#define HAVE_ISATTY 1
-#define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
#define HAVE_KBHIT 1
-#define HAVE_LOADLIBRARY 1
#define HAVE_LSTAT 0
#define HAVE_LZO1X_999_COMPRESS 0
#define HAVE_MACH_ABSOLUTE_TIME 0
@@ -313,6 +301,7 @@
#define HAVE_PEEKNAMEDPIPE 1
#define HAVE_PTHREAD_CANCEL 0
#define HAVE_SCHED_GETAFFINITY 0
+#define HAVE_SECITEMIMPORT 0
#define HAVE_SETCONSOLETEXTATTRIBUTE 1
#define HAVE_SETCONSOLECTRLHANDLER 1
#define HAVE_SETMODE 1
@@ -325,16 +314,19 @@
#define HAVE_UTGETOSTYPEFROMSTRING 0
#define HAVE_VIRTUALALLOC 1
#define HAVE_WGLGETPROCADDRESS 0
+#define HAVE_BCRYPT 1
+#define HAVE_VAAPI_DRM 0
+#define HAVE_VAAPI_X11 0
+#define HAVE_VDPAU_X11 0
#define HAVE_PTHREADS 0
#define HAVE_OS2THREADS 0
#define HAVE_W32THREADS 1
+#define HAVE_AS_ARCH_DIRECTIVE 0
#define HAVE_AS_DN_DIRECTIVE 0
#define HAVE_AS_FPU_DIRECTIVE 0
#define HAVE_AS_FUNC 0
#define HAVE_AS_OBJECT_ARCH 0
#define HAVE_ASM_MOD_Q 0
-#define HAVE_ATTRIBUTE_MAY_ALIAS 0
-#define HAVE_ATTRIBUTE_PACKED 0
#define HAVE_BLOCKS_EXTENSION 0
#define HAVE_EBP_AVAILABLE 0
#define HAVE_EBX_AVAILABLE 0
@@ -351,7 +343,6 @@
#define HAVE_VFP_ARGS 0
#define HAVE_XFORM_ASM 0
#define HAVE_XMM_CLOBBERS 0
-#define HAVE_CONDITION_VARIABLE_PTR 1
#define HAVE_KCMVIDEOCODECTYPE_HEVC 0
#define HAVE_SOCKLEN_T 1
#define HAVE_STRUCT_ADDRINFO 1
@@ -359,7 +350,7 @@
#define HAVE_STRUCT_IP_MREQ_SOURCE 1
#define HAVE_STRUCT_IPV6_MREQ 1
#define HAVE_STRUCT_MSGHDR_MSG_FLAGS 0
-#define HAVE_STRUCT_POLLFD 0
+#define HAVE_STRUCT_POLLFD 1
#define HAVE_STRUCT_RUSAGE_RU_MAXRSS 0
#define HAVE_STRUCT_SCTP_EVENT_SUBSCRIBE 0
#define HAVE_STRUCT_SOCKADDR_IN6 1
@@ -367,22 +358,17 @@
#define HAVE_STRUCT_SOCKADDR_STORAGE 1
#define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 0
#define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 0
-#define HAVE_ATOMICS_NATIVE 1
-#define HAVE_DOS_PATHS 1
-#define HAVE_LIBC_MSVCRT 1
#define HAVE_MAKEINFO 1
#define HAVE_MAKEINFO_HTML 0
-#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_OPENCL_D3D11 0
+#define HAVE_OPENCL_DRM_ARM 0
+#define HAVE_OPENCL_DRM_BEIGNET 0
+#define HAVE_OPENCL_DXVA2 0
+#define HAVE_OPENCL_VAAPI_BEIGNET 0
+#define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
#define HAVE_PERL 1
#define HAVE_POD2MAN 1
-#define HAVE_SECTION_DATA_REL_RO 0
#define HAVE_TEXI2HTML 0
-#define HAVE_THREADS 1
-#define HAVE_UWP 0
-#define HAVE_VAAPI_DRM 0
-#define HAVE_VAAPI_X11 0
-#define HAVE_VDPAU_X11 0
-#define HAVE_WINRT 0
#define CONFIG_DOC 0
#define CONFIG_HTMLPAGES 0
#define CONFIG_MANPAGES 1
@@ -409,27 +395,12 @@
#define CONFIG_SCALING_VIDEO_EXAMPLE 0
#define CONFIG_TRANSCODE_AAC_EXAMPLE 0
#define CONFIG_TRANSCODING_EXAMPLE 0
-#define CONFIG_ALSA 0
-#define CONFIG_APPKIT 0
-#define CONFIG_AVFOUNDATION 0
-#define CONFIG_BZLIB 0
-#define CONFIG_COREIMAGE 0
-#define CONFIG_ICONV 0
-#define CONFIG_JACK 0
-#define CONFIG_LIBXCB 0
-#define CONFIG_LIBXCB_SHM 0
-#define CONFIG_LIBXCB_SHAPE 0
-#define CONFIG_LIBXCB_XFIXES 0
-#define CONFIG_LZMA 0
-#define CONFIG_SCHANNEL 1
-#define CONFIG_SDL2 0
-#define CONFIG_SECURETRANSPORT 0
-#define CONFIG_SNDIO 0
-#define CONFIG_XLIB 1
-#define CONFIG_ZLIB 0
+#define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+#define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
#define CONFIG_AVISYNTH 0
#define CONFIG_FREI0R 0
#define CONFIG_LIBCDIO 0
+#define CONFIG_LIBDAVS2 0
#define CONFIG_LIBRUBBERBAND 0
#define CONFIG_LIBVIDSTAB 0
#define CONFIG_LIBX264 0
@@ -440,10 +411,14 @@
#define CONFIG_LIBNDI_NEWTEK 0
#define CONFIG_LIBFDK_AAC 0
#define CONFIG_OPENSSL 0
+#define CONFIG_LIBTLS 0
#define CONFIG_GMP 0
+#define CONFIG_LIBLENSFUN 0
#define CONFIG_LIBOPENCORE_AMRNB 0
#define CONFIG_LIBOPENCORE_AMRWB 0
+#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVO_AMRWBENC 0
+#define CONFIG_MBEDTLS 0
#define CONFIG_RKMPP 0
#define CONFIG_LIBSMBCLIENT 0
#define CONFIG_CHROMAPRINT 0
@@ -451,11 +426,13 @@
#define CONFIG_GNUTLS 0
#define CONFIG_JNI 0
#define CONFIG_LADSPA 0
+#define CONFIG_LIBAOM 0
#define CONFIG_LIBASS 0
#define CONFIG_LIBBLURAY 0
#define CONFIG_LIBBS2B 0
#define CONFIG_LIBCACA 0
#define CONFIG_LIBCELT 0
+#define CONFIG_LIBCODEC2 0
#define CONFIG_LIBDC1394 0
#define CONFIG_LIBDRM 0
#define CONFIG_LIBFLITE 0
@@ -466,6 +443,7 @@
#define CONFIG_LIBGSM 0
#define CONFIG_LIBIEC61883 0
#define CONFIG_LIBILBC 0
+#define CONFIG_LIBJACK 0
#define CONFIG_LIBKVAZAAR 0
#define CONFIG_LIBMODPLUG 0
#define CONFIG_LIBMP3LAME 0
@@ -483,12 +461,13 @@
#define CONFIG_LIBSNAPPY 0
#define CONFIG_LIBSOXR 0
#define CONFIG_LIBSPEEX 0
+#define CONFIG_LIBSRT 0
#define CONFIG_LIBSSH 0
+#define CONFIG_LIBTENSORFLOW 0
#define CONFIG_LIBTESSERACT 0
#define CONFIG_LIBTHEORA 0
#define CONFIG_LIBTWOLAME 0
#define CONFIG_LIBV4L2 0
-#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVORBIS 0
#define CONFIG_LIBVPX 0
#define CONFIG_LIBWAVPACK 0
@@ -497,28 +476,49 @@
#define CONFIG_LIBZIMG 0
#define CONFIG_LIBZMQ 0
#define CONFIG_LIBZVBI 0
+#define CONFIG_LV2 0
#define CONFIG_MEDIACODEC 0
#define CONFIG_OPENAL 0
-#define CONFIG_OPENCL 0
#define CONFIG_OPENGL 0
+#define CONFIG_VAPOURSYNTH 0
+#define CONFIG_ALSA 0
+#define CONFIG_APPKIT 0
+#define CONFIG_AVFOUNDATION 0
+#define CONFIG_BZLIB 0
+#define CONFIG_COREIMAGE 0
+#define CONFIG_ICONV 0
+#define CONFIG_LIBXCB 0
+#define CONFIG_LIBXCB_SHM 0
+#define CONFIG_LIBXCB_SHAPE 0
+#define CONFIG_LIBXCB_XFIXES 0
+#define CONFIG_LZMA 0
+#define CONFIG_SCHANNEL 1
+#define CONFIG_SDL2 0
+#define CONFIG_SECURETRANSPORT 0
+#define CONFIG_SNDIO 0
+#define CONFIG_XLIB 0
+#define CONFIG_ZLIB 0
+#define CONFIG_CUDA_SDK 0
+#define CONFIG_LIBNPP 0
+#define CONFIG_LIBMFX 0
+#define CONFIG_MMAL 0
+#define CONFIG_OMX 0
+#define CONFIG_OPENCL 0
+#define CONFIG_AMF 0
#define CONFIG_AUDIOTOOLBOX 0
#define CONFIG_CRYSTALHD 0
-#define CONFIG_CUDA 1
-#define CONFIG_CUVID 1
+#define CONFIG_CUDA 0
+#define CONFIG_CUVID 0
#define CONFIG_D3D11VA 0
#define CONFIG_DXVA2 0
-#define CONFIG_NVENC 1
+#define CONFIG_FFNVCODEC 0
+#define CONFIG_NVDEC 0
+#define CONFIG_NVENC 0
#define CONFIG_VAAPI 0
-#define CONFIG_VDA 0
#define CONFIG_VDPAU 0
#define CONFIG_VIDEOTOOLBOX 0
#define CONFIG_V4L2_M2M 0
#define CONFIG_XVMC 0
-#define CONFIG_CUDA_SDK 0
-#define CONFIG_LIBNPP 0
-#define CONFIG_LIBMFX 0
-#define CONFIG_MMAL 0
-#define CONFIG_OMX 0
#define CONFIG_FTRAPV 0
#define CONFIG_GRAY 0
#define CONFIG_HARDCODED_TABLES 0
@@ -532,18 +532,17 @@
#define CONFIG_GPL 0
#define CONFIG_NONFREE 0
#define CONFIG_VERSION3 0
-#define CONFIG_AVCODEC 1
#define CONFIG_AVDEVICE 0
#define CONFIG_AVFILTER 0
+#define CONFIG_SWSCALE 0
+#define CONFIG_POSTPROC 0
#define CONFIG_AVFORMAT 0
+#define CONFIG_AVCODEC 1
+#define CONFIG_SWRESAMPLE 0
#define CONFIG_AVRESAMPLE 0
#define CONFIG_AVUTIL 1
-#define CONFIG_POSTPROC 0
-#define CONFIG_SWRESAMPLE 0
-#define CONFIG_SWSCALE 0
#define CONFIG_FFPLAY 0
#define CONFIG_FFPROBE 0
-#define CONFIG_FFSERVER 0
#define CONFIG_FFMPEG 0
#define CONFIG_DCT 0
#define CONFIG_DWT 0
@@ -569,31 +568,29 @@
#define CONFIG_XMM_CLOBBER_TEST 0
#define CONFIG_BSFS 1
#define CONFIG_DECODERS 1
-#define CONFIG_ENCODERS 0
-#define CONFIG_HWACCELS 0
#define CONFIG_PARSERS 1
-#define CONFIG_INDEVS 0
-#define CONFIG_OUTDEVS 0
-#define CONFIG_FILTERS 0
-#define CONFIG_DEMUXERS 0
-#define CONFIG_MUXERS 0
-#define CONFIG_PROTOCOLS 0
#define CONFIG_AANDCTTABLES 0
#define CONFIG_AC3DSP 0
+#define CONFIG_ADTS_HEADER 0
#define CONFIG_AUDIO_FRAME_QUEUE 0
#define CONFIG_AUDIODSP 0
#define CONFIG_BLOCKDSP 0
#define CONFIG_BSWAPDSP 0
#define CONFIG_CABAC 0
+#define CONFIG_CBS 0
+#define CONFIG_CBS_H264 0
+#define CONFIG_CBS_H265 0
+#define CONFIG_CBS_MPEG2 0
+#define CONFIG_CBS_VP9 0
#define CONFIG_DIRAC_PARSE 0
+#define CONFIG_DNN 0
#define CONFIG_DVPROFILE 0
#define CONFIG_EXIF 0
-#define CONFIG_FAANDCT 0
-#define CONFIG_FAANIDCT 0
-#define CONFIG_FDCTDSP 0
+#define CONFIG_FAANDCT 1
+#define CONFIG_FAANIDCT 1
+#define CONFIG_FDCTDSP 1
#define CONFIG_FLACDSP 1
#define CONFIG_FMTCONVERT 0
-#define CONFIG_FRAME_THREAD_ENCODER 0
#define CONFIG_G722DSP 0
#define CONFIG_GOLOMB 0
#define CONFIG_GPLV3 0
@@ -608,7 +605,7 @@
#define CONFIG_HUFFMAN 0
#define CONFIG_HUFFYUVDSP 0
#define CONFIG_HUFFYUVENCDSP 0
-#define CONFIG_IDCTDSP 0
+#define CONFIG_IDCTDSP 1
#define CONFIG_IIRFILTER 0
#define CONFIG_MDCT15 0
#define CONFIG_INTRAX8 0
@@ -635,6 +632,7 @@
#define CONFIG_QSV 0
#define CONFIG_QSVDEC 0
#define CONFIG_QSVENC 0
+#define CONFIG_QSVVPP 0
#define CONFIG_RANGECODER 0
#define CONFIG_RIFFDEC 0
#define CONFIG_RIFFENC 0
@@ -658,10 +656,10 @@
#define CONFIG_WMA_FREQS 0
#define CONFIG_WMV2DSP 0
#define CONFIG_NULL_BSF 1
+#define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
#define CONFIG_VP8_DECODER 1
#define CONFIG_VP9_DECODER 1
#define CONFIG_FLAC_DECODER 1
-#define CONFIG_FLAC_PARSER 0
#define CONFIG_VP8_PARSER 1
#define CONFIG_VP9_PARSER 1
#endif /* FFMPEG_CONFIG_H */
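
For context on the block of CONFIG_* toggles above: these generated headers gate code purely through the C preprocessor, so a value flipped from 0 to 1 (such as CONFIG_IDCTDSP or CONFIG_FAANDCT) pulls the corresponding DSP code into the build, while a value forced to 0 (such as CONFIG_CUDA or CONFIG_CUVID) drops those paths at compile time. A minimal, self-contained sketch of that pattern follows; the two macro values mirror the diff above, and everything else is a hypothetical stand-in rather than code from this tree.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

#define CONFIG_IDCTDSP 1   /* set to 1 by this update */
#define CONFIG_CUDA    0   /* set to 0 by this update */

int main(void)
{
#if CONFIG_IDCTDSP
    puts("IDCT DSP helpers compiled in");
#endif
#if CONFIG_CUDA
    puts("CUDA paths compiled in");   /* excluded by the preprocessor */
#endif
    return 0;
}
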
diff --git a/media/ffvpx/config_win64.asm b/media/ffvpx/config_win64.asm
index 58c61e970..fecfee498 100644
--- a/media/ffvpx/config_win64.asm
+++ b/media/ffvpx/config_win64.asm
@@ -1,3 +1,4 @@
+; Automatically generated by configure - do not modify!
%define ARCH_AARCH64 0
%define ARCH_ALPHA 0
%define ARCH_ARM 0
@@ -41,6 +42,7 @@
%define HAVE_AMD3DNOWEXT 1
%define HAVE_AVX 1
%define HAVE_AVX2 1
+%define HAVE_AVX512 1
%define HAVE_FMA3 1
%define HAVE_FMA4 1
%define HAVE_MMX 1
@@ -85,6 +87,7 @@
%define HAVE_AMD3DNOWEXT_EXTERNAL 1
%define HAVE_AVX_EXTERNAL 1
%define HAVE_AVX2_EXTERNAL 1
+%define HAVE_AVX512_EXTERNAL 0
%define HAVE_FMA3_EXTERNAL 1
%define HAVE_FMA4_EXTERNAL 1
%define HAVE_MMX_EXTERNAL 1
@@ -129,6 +132,7 @@
%define HAVE_AMD3DNOWEXT_INLINE 0
%define HAVE_AVX_INLINE 0
%define HAVE_AVX2_INLINE 0
+%define HAVE_AVX512_INLINE 0
%define HAVE_FMA3_INLINE 0
%define HAVE_FMA4_INLINE 0
%define HAVE_MMX_INLINE 0
@@ -156,22 +160,17 @@
%define HAVE_MMI_INLINE 0
%define HAVE_ALIGNED_STACK 1
%define HAVE_FAST_64BIT 1
-%define HAVE_FAST_CLZ 0
+%define HAVE_FAST_CLZ 1
%define HAVE_FAST_CMOV 1
-%define HAVE_LOCAL_ALIGNED_8 1
-%define HAVE_LOCAL_ALIGNED_16 1
-%define HAVE_LOCAL_ALIGNED_32 1
+%define HAVE_LOCAL_ALIGNED 1
%define HAVE_SIMD_ALIGN_16 1
%define HAVE_SIMD_ALIGN_32 1
-%define HAVE_ATOMICS_GCC 0
-%define HAVE_ATOMICS_SUNCC 0
-%define HAVE_ATOMICS_WIN32 1
+%define HAVE_SIMD_ALIGN_64 1
%define HAVE_ATOMIC_CAS_PTR 0
%define HAVE_MACHINE_RW_BARRIER 0
%define HAVE_MEMORYBARRIER 1
%define HAVE_MM_EMPTY 0
%define HAVE_RDTSC 1
-%define HAVE_SARESTART 0
%define HAVE_SEM_TIMEDWAIT 0
%define HAVE_SYNC_VAL_COMPARE_AND_SWAP 0
%define HAVE_CABS 0
@@ -181,13 +180,11 @@
%define HAVE_X86ASM 1
%define HAVE_BIGENDIAN 0
%define HAVE_FAST_UNALIGNED 1
-%define HAVE_ALTIVEC_H 0
%define HAVE_ARPA_INET_H 0
%define HAVE_ASM_TYPES_H 0
%define HAVE_CDIO_PARANOIA_H 0
%define HAVE_CDIO_PARANOIA_PARANOIA_H 0
%define HAVE_CUDA_H 0
-%define HAVE_D3D11_H 1
%define HAVE_DISPATCH_DISPATCH_H 0
%define HAVE_DEV_BKTR_IOCTL_BT848_H 0
%define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -196,26 +193,17 @@
%define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
%define HAVE_DIRECT_H 1
%define HAVE_DIRENT_H 0
-%define HAVE_DLFCN_H 0
%define HAVE_DXGIDEBUG_H 1
%define HAVE_DXVA_H 1
%define HAVE_ES2_GL_H 0
%define HAVE_GSM_H 0
%define HAVE_IO_H 1
-%define HAVE_MACH_MACH_TIME_H 0
+%define HAVE_LINUX_PERF_EVENT_H 0
%define HAVE_MACHINE_IOCTL_BT848_H 0
%define HAVE_MACHINE_IOCTL_METEOR_H 0
%define HAVE_OPENCV2_CORE_CORE_C_H 0
-%define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-%define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-%define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
%define HAVE_OPENGL_GL3_H 0
%define HAVE_POLL_H 0
-%define HAVE_SOUNDCARD_H 0
-%define HAVE_STDATOMIC_H 0
-%define HAVE_SYS_MMAN_H 0
%define HAVE_SYS_PARAM_H 0
%define HAVE_SYS_RESOURCE_H 0
%define HAVE_SYS_SELECT_H 0
@@ -259,16 +247,19 @@
%define HAVE_SINF 1
%define HAVE_TRUNC 1
%define HAVE_TRUNCF 1
+%define HAVE_DOS_PATHS 1
+%define HAVE_LIBC_MSVCRT 1
+%define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+%define HAVE_SECTION_DATA_REL_RO 0
+%define HAVE_THREADS 1
+%define HAVE_UWP 0
+%define HAVE_WINRT 0
%define HAVE_ACCESS 1
%define HAVE_ALIGNED_MALLOC 1
%define HAVE_CLOCK_GETTIME 0
%define HAVE_CLOSESOCKET 1
%define HAVE_COMMANDLINETOARGVW 1
-%define HAVE_COTASKMEMFREE 1
-%define HAVE_CRYPTGENRANDOM 1
%define HAVE_FCNTL 0
-%define HAVE_FLT_LIM 1
-%define HAVE_FORK 0
%define HAVE_GETADDRINFO 1
%define HAVE_GETHRTIME 0
%define HAVE_GETOPT 0
@@ -283,9 +274,7 @@
%define HAVE_GMTIME_R 0
%define HAVE_INET_ATON 0
%define HAVE_ISATTY 1
-%define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
%define HAVE_KBHIT 1
-%define HAVE_LOADLIBRARY 1
%define HAVE_LSTAT 0
%define HAVE_LZO1X_999_COMPRESS 0
%define HAVE_MACH_ABSOLUTE_TIME 0
@@ -297,6 +286,7 @@
%define HAVE_PEEKNAMEDPIPE 1
%define HAVE_PTHREAD_CANCEL 0
%define HAVE_SCHED_GETAFFINITY 0
+%define HAVE_SECITEMIMPORT 0
%define HAVE_SETCONSOLETEXTATTRIBUTE 1
%define HAVE_SETCONSOLECTRLHANDLER 1
%define HAVE_SETMODE 1
@@ -309,16 +299,19 @@
%define HAVE_UTGETOSTYPEFROMSTRING 0
%define HAVE_VIRTUALALLOC 1
%define HAVE_WGLGETPROCADDRESS 0
+%define HAVE_BCRYPT 1
+%define HAVE_VAAPI_DRM 0
+%define HAVE_VAAPI_X11 0
+%define HAVE_VDPAU_X11 0
%define HAVE_PTHREADS 0
%define HAVE_OS2THREADS 0
%define HAVE_W32THREADS 1
+%define HAVE_AS_ARCH_DIRECTIVE 0
%define HAVE_AS_DN_DIRECTIVE 0
%define HAVE_AS_FPU_DIRECTIVE 0
%define HAVE_AS_FUNC 0
%define HAVE_AS_OBJECT_ARCH 0
%define HAVE_ASM_MOD_Q 0
-%define HAVE_ATTRIBUTE_MAY_ALIAS 0
-%define HAVE_ATTRIBUTE_PACKED 0
%define HAVE_BLOCKS_EXTENSION 0
%define HAVE_EBP_AVAILABLE 0
%define HAVE_EBX_AVAILABLE 0
@@ -335,7 +328,6 @@
%define HAVE_VFP_ARGS 0
%define HAVE_XFORM_ASM 0
%define HAVE_XMM_CLOBBERS 0
-%define HAVE_CONDITION_VARIABLE_PTR 1
%define HAVE_KCMVIDEOCODECTYPE_HEVC 0
%define HAVE_SOCKLEN_T 1
%define HAVE_STRUCT_ADDRINFO 1
@@ -343,7 +335,7 @@
%define HAVE_STRUCT_IP_MREQ_SOURCE 1
%define HAVE_STRUCT_IPV6_MREQ 1
%define HAVE_STRUCT_MSGHDR_MSG_FLAGS 0
-%define HAVE_STRUCT_POLLFD 0
+%define HAVE_STRUCT_POLLFD 1
%define HAVE_STRUCT_RUSAGE_RU_MAXRSS 0
%define HAVE_STRUCT_SCTP_EVENT_SUBSCRIBE 0
%define HAVE_STRUCT_SOCKADDR_IN6 1
@@ -351,22 +343,17 @@
%define HAVE_STRUCT_SOCKADDR_STORAGE 1
%define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 0
%define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 0
-%define HAVE_ATOMICS_NATIVE 1
-%define HAVE_DOS_PATHS 1
-%define HAVE_LIBC_MSVCRT 1
%define HAVE_MAKEINFO 1
%define HAVE_MAKEINFO_HTML 0
-%define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+%define HAVE_OPENCL_D3D11 0
+%define HAVE_OPENCL_DRM_ARM 0
+%define HAVE_OPENCL_DRM_BEIGNET 0
+%define HAVE_OPENCL_DXVA2 0
+%define HAVE_OPENCL_VAAPI_BEIGNET 0
+%define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
%define HAVE_PERL 1
%define HAVE_POD2MAN 1
-%define HAVE_SECTION_DATA_REL_RO 0
%define HAVE_TEXI2HTML 0
-%define HAVE_THREADS 1
-%define HAVE_UWP 0
-%define HAVE_VAAPI_DRM 0
-%define HAVE_VAAPI_X11 0
-%define HAVE_VDPAU_X11 0
-%define HAVE_WINRT 0
%define CONFIG_DOC 0
%define CONFIG_HTMLPAGES 0
%define CONFIG_MANPAGES 1
@@ -393,27 +380,12 @@
%define CONFIG_SCALING_VIDEO_EXAMPLE 0
%define CONFIG_TRANSCODE_AAC_EXAMPLE 0
%define CONFIG_TRANSCODING_EXAMPLE 0
-%define CONFIG_ALSA 0
-%define CONFIG_APPKIT 0
-%define CONFIG_AVFOUNDATION 0
-%define CONFIG_BZLIB 0
-%define CONFIG_COREIMAGE 0
-%define CONFIG_ICONV 0
-%define CONFIG_JACK 0
-%define CONFIG_LIBXCB 0
-%define CONFIG_LIBXCB_SHM 0
-%define CONFIG_LIBXCB_SHAPE 0
-%define CONFIG_LIBXCB_XFIXES 0
-%define CONFIG_LZMA 0
-%define CONFIG_SCHANNEL 1
-%define CONFIG_SDL2 0
-%define CONFIG_SECURETRANSPORT 0
-%define CONFIG_SNDIO 0
-%define CONFIG_XLIB 1
-%define CONFIG_ZLIB 0
+%define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+%define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
%define CONFIG_AVISYNTH 0
%define CONFIG_FREI0R 0
%define CONFIG_LIBCDIO 0
+%define CONFIG_LIBDAVS2 0
%define CONFIG_LIBRUBBERBAND 0
%define CONFIG_LIBVIDSTAB 0
%define CONFIG_LIBX264 0
@@ -424,10 +396,14 @@
%define CONFIG_LIBNDI_NEWTEK 0
%define CONFIG_LIBFDK_AAC 0
%define CONFIG_OPENSSL 0
+%define CONFIG_LIBTLS 0
%define CONFIG_GMP 0
+%define CONFIG_LIBLENSFUN 0
%define CONFIG_LIBOPENCORE_AMRNB 0
%define CONFIG_LIBOPENCORE_AMRWB 0
+%define CONFIG_LIBVMAF 0
%define CONFIG_LIBVO_AMRWBENC 0
+%define CONFIG_MBEDTLS 0
%define CONFIG_RKMPP 0
%define CONFIG_LIBSMBCLIENT 0
%define CONFIG_CHROMAPRINT 0
@@ -435,11 +411,13 @@
%define CONFIG_GNUTLS 0
%define CONFIG_JNI 0
%define CONFIG_LADSPA 0
+%define CONFIG_LIBAOM 0
%define CONFIG_LIBASS 0
%define CONFIG_LIBBLURAY 0
%define CONFIG_LIBBS2B 0
%define CONFIG_LIBCACA 0
%define CONFIG_LIBCELT 0
+%define CONFIG_LIBCODEC2 0
%define CONFIG_LIBDC1394 0
%define CONFIG_LIBDRM 0
%define CONFIG_LIBFLITE 0
@@ -450,6 +428,7 @@
%define CONFIG_LIBGSM 0
%define CONFIG_LIBIEC61883 0
%define CONFIG_LIBILBC 0
+%define CONFIG_LIBJACK 0
%define CONFIG_LIBKVAZAAR 0
%define CONFIG_LIBMODPLUG 0
%define CONFIG_LIBMP3LAME 0
@@ -467,12 +446,13 @@
%define CONFIG_LIBSNAPPY 0
%define CONFIG_LIBSOXR 0
%define CONFIG_LIBSPEEX 0
+%define CONFIG_LIBSRT 0
%define CONFIG_LIBSSH 0
+%define CONFIG_LIBTENSORFLOW 0
%define CONFIG_LIBTESSERACT 0
%define CONFIG_LIBTHEORA 0
%define CONFIG_LIBTWOLAME 0
%define CONFIG_LIBV4L2 0
-%define CONFIG_LIBVMAF 0
%define CONFIG_LIBVORBIS 0
%define CONFIG_LIBVPX 0
%define CONFIG_LIBWAVPACK 0
@@ -481,28 +461,49 @@
%define CONFIG_LIBZIMG 0
%define CONFIG_LIBZMQ 0
%define CONFIG_LIBZVBI 0
+%define CONFIG_LV2 0
%define CONFIG_MEDIACODEC 0
%define CONFIG_OPENAL 0
-%define CONFIG_OPENCL 0
%define CONFIG_OPENGL 0
+%define CONFIG_VAPOURSYNTH 0
+%define CONFIG_ALSA 0
+%define CONFIG_APPKIT 0
+%define CONFIG_AVFOUNDATION 0
+%define CONFIG_BZLIB 0
+%define CONFIG_COREIMAGE 0
+%define CONFIG_ICONV 0
+%define CONFIG_LIBXCB 0
+%define CONFIG_LIBXCB_SHM 0
+%define CONFIG_LIBXCB_SHAPE 0
+%define CONFIG_LIBXCB_XFIXES 0
+%define CONFIG_LZMA 0
+%define CONFIG_SCHANNEL 1
+%define CONFIG_SDL2 0
+%define CONFIG_SECURETRANSPORT 0
+%define CONFIG_SNDIO 0
+%define CONFIG_XLIB 0
+%define CONFIG_ZLIB 0
+%define CONFIG_CUDA_SDK 0
+%define CONFIG_LIBNPP 0
+%define CONFIG_LIBMFX 0
+%define CONFIG_MMAL 0
+%define CONFIG_OMX 0
+%define CONFIG_OPENCL 0
+%define CONFIG_AMF 0
%define CONFIG_AUDIOTOOLBOX 0
%define CONFIG_CRYSTALHD 0
-%define CONFIG_CUDA 1
-%define CONFIG_CUVID 1
+%define CONFIG_CUDA 0
+%define CONFIG_CUVID 0
%define CONFIG_D3D11VA 0
%define CONFIG_DXVA2 0
-%define CONFIG_NVENC 1
+%define CONFIG_FFNVCODEC 0
+%define CONFIG_NVDEC 0
+%define CONFIG_NVENC 0
%define CONFIG_VAAPI 0
-%define CONFIG_VDA 0
%define CONFIG_VDPAU 0
%define CONFIG_VIDEOTOOLBOX 0
%define CONFIG_V4L2_M2M 0
%define CONFIG_XVMC 0
-%define CONFIG_CUDA_SDK 0
-%define CONFIG_LIBNPP 0
-%define CONFIG_LIBMFX 0
-%define CONFIG_MMAL 0
-%define CONFIG_OMX 0
%define CONFIG_FTRAPV 0
%define CONFIG_GRAY 0
%define CONFIG_HARDCODED_TABLES 0
@@ -516,18 +517,17 @@
%define CONFIG_GPL 0
%define CONFIG_NONFREE 0
%define CONFIG_VERSION3 0
-%define CONFIG_AVCODEC 1
%define CONFIG_AVDEVICE 0
%define CONFIG_AVFILTER 0
+%define CONFIG_SWSCALE 0
+%define CONFIG_POSTPROC 0
%define CONFIG_AVFORMAT 0
+%define CONFIG_AVCODEC 1
+%define CONFIG_SWRESAMPLE 0
%define CONFIG_AVRESAMPLE 0
%define CONFIG_AVUTIL 1
-%define CONFIG_POSTPROC 0
-%define CONFIG_SWRESAMPLE 0
-%define CONFIG_SWSCALE 0
%define CONFIG_FFPLAY 0
%define CONFIG_FFPROBE 0
-%define CONFIG_FFSERVER 0
%define CONFIG_FFMPEG 0
%define CONFIG_DCT 0
%define CONFIG_DWT 0
@@ -553,31 +553,29 @@
%define CONFIG_XMM_CLOBBER_TEST 0
%define CONFIG_BSFS 1
%define CONFIG_DECODERS 1
-%define CONFIG_ENCODERS 0
-%define CONFIG_HWACCELS 0
%define CONFIG_PARSERS 1
-%define CONFIG_INDEVS 0
-%define CONFIG_OUTDEVS 0
-%define CONFIG_FILTERS 0
-%define CONFIG_DEMUXERS 0
-%define CONFIG_MUXERS 0
-%define CONFIG_PROTOCOLS 0
%define CONFIG_AANDCTTABLES 0
%define CONFIG_AC3DSP 0
+%define CONFIG_ADTS_HEADER 0
%define CONFIG_AUDIO_FRAME_QUEUE 0
%define CONFIG_AUDIODSP 0
%define CONFIG_BLOCKDSP 0
%define CONFIG_BSWAPDSP 0
%define CONFIG_CABAC 0
+%define CONFIG_CBS 0
+%define CONFIG_CBS_H264 0
+%define CONFIG_CBS_H265 0
+%define CONFIG_CBS_MPEG2 0
+%define CONFIG_CBS_VP9 0
%define CONFIG_DIRAC_PARSE 0
+%define CONFIG_DNN 0
%define CONFIG_DVPROFILE 0
%define CONFIG_EXIF 0
-%define CONFIG_FAANDCT 0
-%define CONFIG_FAANIDCT 0
-%define CONFIG_FDCTDSP 0
+%define CONFIG_FAANDCT 1
+%define CONFIG_FAANIDCT 1
+%define CONFIG_FDCTDSP 1
%define CONFIG_FLACDSP 1
%define CONFIG_FMTCONVERT 0
-%define CONFIG_FRAME_THREAD_ENCODER 0
%define CONFIG_G722DSP 0
%define CONFIG_GOLOMB 0
%define CONFIG_GPLV3 0
@@ -592,7 +590,7 @@
%define CONFIG_HUFFMAN 0
%define CONFIG_HUFFYUVDSP 0
%define CONFIG_HUFFYUVENCDSP 0
-%define CONFIG_IDCTDSP 0
+%define CONFIG_IDCTDSP 1
%define CONFIG_IIRFILTER 0
%define CONFIG_MDCT15 0
%define CONFIG_INTRAX8 0
@@ -619,6 +617,7 @@
%define CONFIG_QSV 0
%define CONFIG_QSVDEC 0
%define CONFIG_QSVENC 0
+%define CONFIG_QSVVPP 0
%define CONFIG_RANGECODER 0
%define CONFIG_RIFFDEC 0
%define CONFIG_RIFFENC 0
@@ -642,9 +641,9 @@
%define CONFIG_WMA_FREQS 0
%define CONFIG_WMV2DSP 0
%define CONFIG_NULL_BSF 1
+%define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
%define CONFIG_VP8_DECODER 1
%define CONFIG_VP9_DECODER 1
%define CONFIG_FLAC_DECODER 1
-%define CONFIG_FLAC_PARSER 0
%define CONFIG_VP8_PARSER 1
%define CONFIG_VP9_PARSER 1
diff --git a/media/ffvpx/config_win64.h b/media/ffvpx/config_win64.h
index b74b8d507..cd8f991d8 100644
--- a/media/ffvpx/config_win64.h
+++ b/media/ffvpx/config_win64.h
@@ -1,12 +1,12 @@
/* Automatically generated by configure - do not modify! */
#ifndef FFMPEG_CONFIG_H
#define FFMPEG_CONFIG_H
-#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-yasm --toolchain=msvc"
+#define FFMPEG_CONFIGURATION "--disable-everything --disable-protocols --disable-demuxers --disable-muxers --disable-filters --disable-programs --disable-doc --disable-parsers --enable-parser=vp8 --enable-parser=vp9 --enable-decoder=vp8 --enable-decoder=vp9 --disable-static --enable-shared --disable-debug --disable-sdl2 --disable-libxcb --disable-securetransport --disable-iconv --disable-swresample --disable-swscale --disable-avdevice --disable-avfilter --disable-avformat --disable-d3d11va --disable-dxva2 --disable-vaapi --disable-vdpau --disable-videotoolbox --enable-decoder=flac --enable-asm --enable-x86asm --toolchain=msvc --disable-cuda --disable-cuvid"
#define FFMPEG_LICENSE "LGPL version 2.1 or later"
-#define CONFIG_THIS_YEAR 2017
+#define CONFIG_THIS_YEAR 2018
#define FFMPEG_DATADIR "/usr/local/share/ffmpeg"
#define AVCONV_DATADIR "/usr/local/share/ffmpeg"
-#define CC_IDENT "Microsoft (R) C/C++ Optimizing Compiler Version 19.11.25508.2 for x64"
+#define CC_IDENT "Microsoft (R) C/C++ Optimizing Compiler Version 19.15.26726 for x64"
#define av_restrict __restrict
#define EXTERN_PREFIX ""
#define EXTERN_ASM
@@ -57,6 +57,7 @@
#define HAVE_AMD3DNOWEXT 1
#define HAVE_AVX 1
#define HAVE_AVX2 1
+#define HAVE_AVX512 1
#define HAVE_FMA3 1
#define HAVE_FMA4 1
#define HAVE_MMX 1
@@ -101,6 +102,7 @@
#define HAVE_AMD3DNOWEXT_EXTERNAL 1
#define HAVE_AVX_EXTERNAL 1
#define HAVE_AVX2_EXTERNAL 1
+#define HAVE_AVX512_EXTERNAL 0
#define HAVE_FMA3_EXTERNAL 1
#define HAVE_FMA4_EXTERNAL 1
#define HAVE_MMX_EXTERNAL 1
@@ -145,6 +147,7 @@
#define HAVE_AMD3DNOWEXT_INLINE 0
#define HAVE_AVX_INLINE 0
#define HAVE_AVX2_INLINE 0
+#define HAVE_AVX512_INLINE 0
#define HAVE_FMA3_INLINE 0
#define HAVE_FMA4_INLINE 0
#define HAVE_MMX_INLINE 0
@@ -174,20 +177,15 @@
#define HAVE_FAST_64BIT 1
#define HAVE_FAST_CLZ 0
#define HAVE_FAST_CMOV 1
-#define HAVE_LOCAL_ALIGNED_8 1
-#define HAVE_LOCAL_ALIGNED_16 1
-#define HAVE_LOCAL_ALIGNED_32 1
+#define HAVE_LOCAL_ALIGNED 1
#define HAVE_SIMD_ALIGN_16 1
#define HAVE_SIMD_ALIGN_32 1
-#define HAVE_ATOMICS_GCC 0
-#define HAVE_ATOMICS_SUNCC 0
-#define HAVE_ATOMICS_WIN32 1
+#define HAVE_SIMD_ALIGN_64 1
#define HAVE_ATOMIC_CAS_PTR 0
#define HAVE_MACHINE_RW_BARRIER 0
#define HAVE_MEMORYBARRIER 1
#define HAVE_MM_EMPTY 0
#define HAVE_RDTSC 1
-#define HAVE_SARESTART 0
#define HAVE_SEM_TIMEDWAIT 0
#define HAVE_SYNC_VAL_COMPARE_AND_SWAP 0
#define HAVE_CABS 0
@@ -197,13 +195,11 @@
#define HAVE_X86ASM 1
#define HAVE_BIGENDIAN 0
#define HAVE_FAST_UNALIGNED 1
-#define HAVE_ALTIVEC_H 0
#define HAVE_ARPA_INET_H 0
#define HAVE_ASM_TYPES_H 0
#define HAVE_CDIO_PARANOIA_H 0
#define HAVE_CDIO_PARANOIA_PARANOIA_H 0
#define HAVE_CUDA_H 0
-#define HAVE_D3D11_H 1
#define HAVE_DISPATCH_DISPATCH_H 0
#define HAVE_DEV_BKTR_IOCTL_BT848_H 0
#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
@@ -212,26 +208,17 @@
#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
#define HAVE_DIRECT_H 1
#define HAVE_DIRENT_H 0
-#define HAVE_DLFCN_H 0
#define HAVE_DXGIDEBUG_H 1
#define HAVE_DXVA_H 1
#define HAVE_ES2_GL_H 0
#define HAVE_GSM_H 0
#define HAVE_IO_H 1
-#define HAVE_MACH_MACH_TIME_H 0
+#define HAVE_LINUX_PERF_EVENT_H 0
#define HAVE_MACHINE_IOCTL_BT848_H 0
#define HAVE_MACHINE_IOCTL_METEOR_H 0
#define HAVE_OPENCV2_CORE_CORE_C_H 0
-#define HAVE_OPENJPEG_2_3_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_2_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_1_OPENJPEG_H 0
-#define HAVE_OPENJPEG_2_0_OPENJPEG_H 0
-#define HAVE_OPENJPEG_1_5_OPENJPEG_H 0
#define HAVE_OPENGL_GL3_H 0
#define HAVE_POLL_H 0
-#define HAVE_SOUNDCARD_H 0
-#define HAVE_STDATOMIC_H 0
-#define HAVE_SYS_MMAN_H 0
#define HAVE_SYS_PARAM_H 0
#define HAVE_SYS_RESOURCE_H 0
#define HAVE_SYS_SELECT_H 0
@@ -275,16 +262,19 @@
#define HAVE_SINF 1
#define HAVE_TRUNC 1
#define HAVE_TRUNCF 1
+#define HAVE_DOS_PATHS 1
+#define HAVE_LIBC_MSVCRT 1
+#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_SECTION_DATA_REL_RO 0
+#define HAVE_THREADS 1
+#define HAVE_UWP 0
+#define HAVE_WINRT 0
#define HAVE_ACCESS 1
#define HAVE_ALIGNED_MALLOC 1
#define HAVE_CLOCK_GETTIME 0
#define HAVE_CLOSESOCKET 1
#define HAVE_COMMANDLINETOARGVW 1
-#define HAVE_COTASKMEMFREE 1
-#define HAVE_CRYPTGENRANDOM 1
#define HAVE_FCNTL 0
-#define HAVE_FLT_LIM 1
-#define HAVE_FORK 0
#define HAVE_GETADDRINFO 1
#define HAVE_GETHRTIME 0
#define HAVE_GETOPT 0
@@ -299,9 +289,7 @@
#define HAVE_GMTIME_R 0
#define HAVE_INET_ATON 0
#define HAVE_ISATTY 1
-#define HAVE_JACK_PORT_GET_LATENCY_RANGE 0
#define HAVE_KBHIT 1
-#define HAVE_LOADLIBRARY 1
#define HAVE_LSTAT 0
#define HAVE_LZO1X_999_COMPRESS 0
#define HAVE_MACH_ABSOLUTE_TIME 0
@@ -313,6 +301,7 @@
#define HAVE_PEEKNAMEDPIPE 1
#define HAVE_PTHREAD_CANCEL 0
#define HAVE_SCHED_GETAFFINITY 0
+#define HAVE_SECITEMIMPORT 0
#define HAVE_SETCONSOLETEXTATTRIBUTE 1
#define HAVE_SETCONSOLECTRLHANDLER 1
#define HAVE_SETMODE 1
@@ -320,20 +309,24 @@
#define HAVE_SLEEP 1
#define HAVE_STRERROR_R 0
#define HAVE_SYSCONF 0
+#define HAVE_SYSCTL 0
#define HAVE_USLEEP 0
#define HAVE_UTGETOSTYPEFROMSTRING 0
#define HAVE_VIRTUALALLOC 1
#define HAVE_WGLGETPROCADDRESS 0
+#define HAVE_BCRYPT 1
+#define HAVE_VAAPI_DRM 0
+#define HAVE_VAAPI_X11 0
+#define HAVE_VDPAU_X11 0
#define HAVE_PTHREADS 0
#define HAVE_OS2THREADS 0
#define HAVE_W32THREADS 1
+#define HAVE_AS_ARCH_DIRECTIVE 0
#define HAVE_AS_DN_DIRECTIVE 0
#define HAVE_AS_FPU_DIRECTIVE 0
#define HAVE_AS_FUNC 0
#define HAVE_AS_OBJECT_ARCH 0
#define HAVE_ASM_MOD_Q 0
-#define HAVE_ATTRIBUTE_MAY_ALIAS 0
-#define HAVE_ATTRIBUTE_PACKED 0
#define HAVE_BLOCKS_EXTENSION 0
#define HAVE_EBP_AVAILABLE 0
#define HAVE_EBX_AVAILABLE 0
@@ -350,7 +343,6 @@
#define HAVE_VFP_ARGS 0
#define HAVE_XFORM_ASM 0
#define HAVE_XMM_CLOBBERS 0
-#define HAVE_CONDITION_VARIABLE_PTR 1
#define HAVE_KCMVIDEOCODECTYPE_HEVC 0
#define HAVE_SOCKLEN_T 1
#define HAVE_STRUCT_ADDRINFO 1
@@ -358,7 +350,7 @@
#define HAVE_STRUCT_IP_MREQ_SOURCE 1
#define HAVE_STRUCT_IPV6_MREQ 1
#define HAVE_STRUCT_MSGHDR_MSG_FLAGS 0
-#define HAVE_STRUCT_POLLFD 0
+#define HAVE_STRUCT_POLLFD 1
#define HAVE_STRUCT_RUSAGE_RU_MAXRSS 0
#define HAVE_STRUCT_SCTP_EVENT_SUBSCRIBE 0
#define HAVE_STRUCT_SOCKADDR_IN6 1
@@ -366,22 +358,17 @@
#define HAVE_STRUCT_SOCKADDR_STORAGE 1
#define HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC 0
#define HAVE_STRUCT_V4L2_FRMIVALENUM_DISCRETE 0
-#define HAVE_ATOMICS_NATIVE 1
-#define HAVE_DOS_PATHS 1
-#define HAVE_LIBC_MSVCRT 1
#define HAVE_MAKEINFO 1
#define HAVE_MAKEINFO_HTML 0
-#define HAVE_MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS 0
+#define HAVE_OPENCL_D3D11 0
+#define HAVE_OPENCL_DRM_ARM 0
+#define HAVE_OPENCL_DRM_BEIGNET 0
+#define HAVE_OPENCL_DXVA2 0
+#define HAVE_OPENCL_VAAPI_BEIGNET 0
+#define HAVE_OPENCL_VAAPI_INTEL_MEDIA 0
#define HAVE_PERL 1
#define HAVE_POD2MAN 1
-#define HAVE_SECTION_DATA_REL_RO 0
#define HAVE_TEXI2HTML 0
-#define HAVE_THREADS 1
-#define HAVE_UWP 0
-#define HAVE_VAAPI_DRM 0
-#define HAVE_VAAPI_X11 0
-#define HAVE_VDPAU_X11 0
-#define HAVE_WINRT 0
#define CONFIG_DOC 0
#define CONFIG_HTMLPAGES 0
#define CONFIG_MANPAGES 1
@@ -408,27 +395,12 @@
#define CONFIG_SCALING_VIDEO_EXAMPLE 0
#define CONFIG_TRANSCODE_AAC_EXAMPLE 0
#define CONFIG_TRANSCODING_EXAMPLE 0
-#define CONFIG_ALSA 0
-#define CONFIG_APPKIT 0
-#define CONFIG_AVFOUNDATION 0
-#define CONFIG_BZLIB 0
-#define CONFIG_COREIMAGE 0
-#define CONFIG_ICONV 0
-#define CONFIG_JACK 0
-#define CONFIG_LIBXCB 0
-#define CONFIG_LIBXCB_SHM 0
-#define CONFIG_LIBXCB_SHAPE 0
-#define CONFIG_LIBXCB_XFIXES 0
-#define CONFIG_LZMA 0
-#define CONFIG_SCHANNEL 1
-#define CONFIG_SDL2 0
-#define CONFIG_SECURETRANSPORT 0
-#define CONFIG_SNDIO 0
-#define CONFIG_XLIB 1
-#define CONFIG_ZLIB 0
+#define CONFIG_VAAPI_ENCODE_EXAMPLE 0
+#define CONFIG_VAAPI_TRANSCODE_EXAMPLE 0
#define CONFIG_AVISYNTH 0
#define CONFIG_FREI0R 0
#define CONFIG_LIBCDIO 0
+#define CONFIG_LIBDAVS2 0
#define CONFIG_LIBRUBBERBAND 0
#define CONFIG_LIBVIDSTAB 0
#define CONFIG_LIBX264 0
@@ -439,10 +411,14 @@
#define CONFIG_LIBNDI_NEWTEK 0
#define CONFIG_LIBFDK_AAC 0
#define CONFIG_OPENSSL 0
+#define CONFIG_LIBTLS 0
#define CONFIG_GMP 0
+#define CONFIG_LIBLENSFUN 0
#define CONFIG_LIBOPENCORE_AMRNB 0
#define CONFIG_LIBOPENCORE_AMRWB 0
+#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVO_AMRWBENC 0
+#define CONFIG_MBEDTLS 0
#define CONFIG_RKMPP 0
#define CONFIG_LIBSMBCLIENT 0
#define CONFIG_CHROMAPRINT 0
@@ -450,11 +426,13 @@
#define CONFIG_GNUTLS 0
#define CONFIG_JNI 0
#define CONFIG_LADSPA 0
+#define CONFIG_LIBAOM 0
#define CONFIG_LIBASS 0
#define CONFIG_LIBBLURAY 0
#define CONFIG_LIBBS2B 0
#define CONFIG_LIBCACA 0
#define CONFIG_LIBCELT 0
+#define CONFIG_LIBCODEC2 0
#define CONFIG_LIBDC1394 0
#define CONFIG_LIBDRM 0
#define CONFIG_LIBFLITE 0
@@ -465,6 +443,7 @@
#define CONFIG_LIBGSM 0
#define CONFIG_LIBIEC61883 0
#define CONFIG_LIBILBC 0
+#define CONFIG_LIBJACK 0
#define CONFIG_LIBKVAZAAR 0
#define CONFIG_LIBMODPLUG 0
#define CONFIG_LIBMP3LAME 0
@@ -482,12 +461,13 @@
#define CONFIG_LIBSNAPPY 0
#define CONFIG_LIBSOXR 0
#define CONFIG_LIBSPEEX 0
+#define CONFIG_LIBSRT 0
#define CONFIG_LIBSSH 0
+#define CONFIG_LIBTENSORFLOW 0
#define CONFIG_LIBTESSERACT 0
#define CONFIG_LIBTHEORA 0
#define CONFIG_LIBTWOLAME 0
#define CONFIG_LIBV4L2 0
-#define CONFIG_LIBVMAF 0
#define CONFIG_LIBVORBIS 0
#define CONFIG_LIBVPX 0
#define CONFIG_LIBWAVPACK 0
@@ -496,28 +476,49 @@
#define CONFIG_LIBZIMG 0
#define CONFIG_LIBZMQ 0
#define CONFIG_LIBZVBI 0
+#define CONFIG_LV2 0
#define CONFIG_MEDIACODEC 0
#define CONFIG_OPENAL 0
-#define CONFIG_OPENCL 0
#define CONFIG_OPENGL 0
+#define CONFIG_VAPOURSYNTH 0
+#define CONFIG_ALSA 0
+#define CONFIG_APPKIT 0
+#define CONFIG_AVFOUNDATION 0
+#define CONFIG_BZLIB 0
+#define CONFIG_COREIMAGE 0
+#define CONFIG_ICONV 0
+#define CONFIG_LIBXCB 0
+#define CONFIG_LIBXCB_SHM 0
+#define CONFIG_LIBXCB_SHAPE 0
+#define CONFIG_LIBXCB_XFIXES 0
+#define CONFIG_LZMA 0
+#define CONFIG_SCHANNEL 1
+#define CONFIG_SDL2 0
+#define CONFIG_SECURETRANSPORT 0
+#define CONFIG_SNDIO 0
+#define CONFIG_XLIB 0
+#define CONFIG_ZLIB 0
+#define CONFIG_CUDA_SDK 0
+#define CONFIG_LIBNPP 0
+#define CONFIG_LIBMFX 0
+#define CONFIG_MMAL 0
+#define CONFIG_OMX 0
+#define CONFIG_OPENCL 0
+#define CONFIG_AMF 0
#define CONFIG_AUDIOTOOLBOX 0
#define CONFIG_CRYSTALHD 0
-#define CONFIG_CUDA 1
-#define CONFIG_CUVID 1
+#define CONFIG_CUDA 0
+#define CONFIG_CUVID 0
#define CONFIG_D3D11VA 0
#define CONFIG_DXVA2 0
-#define CONFIG_NVENC 1
+#define CONFIG_FFNVCODEC 0
+#define CONFIG_NVDEC 0
+#define CONFIG_NVENC 0
#define CONFIG_VAAPI 0
-#define CONFIG_VDA 0
#define CONFIG_VDPAU 0
#define CONFIG_VIDEOTOOLBOX 0
#define CONFIG_V4L2_M2M 0
#define CONFIG_XVMC 0
-#define CONFIG_CUDA_SDK 0
-#define CONFIG_LIBNPP 0
-#define CONFIG_LIBMFX 0
-#define CONFIG_MMAL 0
-#define CONFIG_OMX 0
#define CONFIG_FTRAPV 0
#define CONFIG_GRAY 0
#define CONFIG_HARDCODED_TABLES 0
@@ -531,18 +532,17 @@
#define CONFIG_GPL 0
#define CONFIG_NONFREE 0
#define CONFIG_VERSION3 0
-#define CONFIG_AVCODEC 1
#define CONFIG_AVDEVICE 0
#define CONFIG_AVFILTER 0
+#define CONFIG_SWSCALE 0
+#define CONFIG_POSTPROC 0
#define CONFIG_AVFORMAT 0
+#define CONFIG_AVCODEC 1
+#define CONFIG_SWRESAMPLE 0
#define CONFIG_AVRESAMPLE 0
#define CONFIG_AVUTIL 1
-#define CONFIG_POSTPROC 0
-#define CONFIG_SWRESAMPLE 0
-#define CONFIG_SWSCALE 0
#define CONFIG_FFPLAY 0
#define CONFIG_FFPROBE 0
-#define CONFIG_FFSERVER 0
#define CONFIG_FFMPEG 0
#define CONFIG_DCT 0
#define CONFIG_DWT 0
@@ -568,31 +568,29 @@
#define CONFIG_XMM_CLOBBER_TEST 0
#define CONFIG_BSFS 1
#define CONFIG_DECODERS 1
-#define CONFIG_ENCODERS 0
-#define CONFIG_HWACCELS 0
#define CONFIG_PARSERS 1
-#define CONFIG_INDEVS 0
-#define CONFIG_OUTDEVS 0
-#define CONFIG_FILTERS 0
-#define CONFIG_DEMUXERS 0
-#define CONFIG_MUXERS 0
-#define CONFIG_PROTOCOLS 0
#define CONFIG_AANDCTTABLES 0
#define CONFIG_AC3DSP 0
+#define CONFIG_ADTS_HEADER 0
#define CONFIG_AUDIO_FRAME_QUEUE 0
#define CONFIG_AUDIODSP 0
#define CONFIG_BLOCKDSP 0
#define CONFIG_BSWAPDSP 0
#define CONFIG_CABAC 0
+#define CONFIG_CBS 0
+#define CONFIG_CBS_H264 0
+#define CONFIG_CBS_H265 0
+#define CONFIG_CBS_MPEG2 0
+#define CONFIG_CBS_VP9 0
#define CONFIG_DIRAC_PARSE 0
+#define CONFIG_DNN 0
#define CONFIG_DVPROFILE 0
#define CONFIG_EXIF 0
-#define CONFIG_FAANDCT 0
-#define CONFIG_FAANIDCT 0
-#define CONFIG_FDCTDSP 0
+#define CONFIG_FAANDCT 1
+#define CONFIG_FAANIDCT 1
+#define CONFIG_FDCTDSP 1
#define CONFIG_FLACDSP 1
#define CONFIG_FMTCONVERT 0
-#define CONFIG_FRAME_THREAD_ENCODER 0
#define CONFIG_G722DSP 0
#define CONFIG_GOLOMB 0
#define CONFIG_GPLV3 0
@@ -607,7 +605,7 @@
#define CONFIG_HUFFMAN 0
#define CONFIG_HUFFYUVDSP 0
#define CONFIG_HUFFYUVENCDSP 0
-#define CONFIG_IDCTDSP 0
+#define CONFIG_IDCTDSP 1
#define CONFIG_IIRFILTER 0
#define CONFIG_MDCT15 0
#define CONFIG_INTRAX8 0
@@ -634,6 +632,7 @@
#define CONFIG_QSV 0
#define CONFIG_QSVDEC 0
#define CONFIG_QSVENC 0
+#define CONFIG_QSVVPP 0
#define CONFIG_RANGECODER 0
#define CONFIG_RIFFDEC 0
#define CONFIG_RIFFENC 0
@@ -657,10 +656,10 @@
#define CONFIG_WMA_FREQS 0
#define CONFIG_WMV2DSP 0
#define CONFIG_NULL_BSF 1
+#define CONFIG_VP9_SUPERFRAME_SPLIT_BSF 1
#define CONFIG_VP8_DECODER 1
#define CONFIG_VP9_DECODER 1
#define CONFIG_FLAC_DECODER 1
-#define CONFIG_FLAC_PARSER 0
#define CONFIG_VP8_PARSER 1
#define CONFIG_VP9_PARSER 1
#endif /* FFMPEG_CONFIG_H */
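
A practical note on the FFMPEG_CONFIGURATION changes above (the added --disable-cuda --disable-cuvid flags and the yasm-to-x86asm switch): the configuration string baked into these headers is also exposed at runtime by libavcodec, so a build can be spot-checked without reading the generated headers. A small sketch, assuming the program is linked against the bundled libavcodec:

/* Sketch only: read back the configure line a libavcodec build was made with. */
#include <stdio.h>
#include <string.h>
#include <libavcodec/avcodec.h>

int main(void)
{
    const char *cfg = avcodec_configuration();
    printf("built with: %s\n", cfg);
    printf("CUDA disabled: %s\n",
           strstr(cfg, "--disable-cuda") ? "yes" : "no");
    return 0;
}
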
diff --git a/media/ffvpx/defaults_disabled.asm b/media/ffvpx/defaults_disabled.asm
index a9fde4c52..220a11f3d 100644
--- a/media/ffvpx/defaults_disabled.asm
+++ b/media/ffvpx/defaults_disabled.asm
@@ -1,402 +1,40 @@
-%define CONFIG_ENCODERS 0
-%define CONFIG_HWACCELS 0
-%define CONFIG_INDEVS 0
-%define CONFIG_OUTDEVS 0
-%define CONFIG_FILTERS 0
-%define CONFIG_DEMUXERS 0
-%define CONFIG_MUXERS 0
-%define CONFIG_PROTOCOLS 0
-%define CONFIG_FRAME_THREAD_ENCODER 0
-%define CONFIG_AASC_DECODER 0
-%define CONFIG_AIC_DECODER 0
-%define CONFIG_ALIAS_PIX_DECODER 0
-%define CONFIG_AMV_DECODER 0
-%define CONFIG_ANM_DECODER 0
-%define CONFIG_ANSI_DECODER 0
-%define CONFIG_APNG_DECODER 0
-%define CONFIG_ASV1_DECODER 0
-%define CONFIG_ASV2_DECODER 0
-%define CONFIG_AURA_DECODER 0
-%define CONFIG_AURA2_DECODER 0
-%define CONFIG_AVRP_DECODER 0
-%define CONFIG_AVRN_DECODER 0
-%define CONFIG_AVS_DECODER 0
-%define CONFIG_AVUI_DECODER 0
-%define CONFIG_AYUV_DECODER 0
-%define CONFIG_BETHSOFTVID_DECODER 0
-%define CONFIG_BFI_DECODER 0
-%define CONFIG_BINK_DECODER 0
-%define CONFIG_BMP_DECODER 0
-%define CONFIG_BMV_VIDEO_DECODER 0
-%define CONFIG_BRENDER_PIX_DECODER 0
-%define CONFIG_C93_DECODER 0
-%define CONFIG_CAVS_DECODER 0
-%define CONFIG_CDGRAPHICS_DECODER 0
-%define CONFIG_CDXL_DECODER 0
-%define CONFIG_CFHD_DECODER 0
-%define CONFIG_CINEPAK_DECODER 0
-%define CONFIG_CLEARVIDEO_DECODER 0
-%define CONFIG_CLJR_DECODER 0
-%define CONFIG_CLLC_DECODER 0
-%define CONFIG_COMFORTNOISE_DECODER 0
-%define CONFIG_CPIA_DECODER 0
-%define CONFIG_CSCD_DECODER 0
-%define CONFIG_CYUV_DECODER 0
-%define CONFIG_DDS_DECODER 0
-%define CONFIG_DFA_DECODER 0
-%define CONFIG_DIRAC_DECODER 0
-%define CONFIG_DNXHD_DECODER 0
-%define CONFIG_DPX_DECODER 0
-%define CONFIG_DSICINVIDEO_DECODER 0
-%define CONFIG_DVAUDIO_DECODER 0
-%define CONFIG_DVVIDEO_DECODER 0
-%define CONFIG_DXA_DECODER 0
-%define CONFIG_DXTORY_DECODER 0
-%define CONFIG_DXV_DECODER 0
-%define CONFIG_EACMV_DECODER 0
-%define CONFIG_EAMAD_DECODER 0
-%define CONFIG_EATGQ_DECODER 0
-%define CONFIG_EATGV_DECODER 0
-%define CONFIG_EATQI_DECODER 0
-%define CONFIG_EIGHTBPS_DECODER 0
-%define CONFIG_EIGHTSVX_EXP_DECODER 0
-%define CONFIG_EIGHTSVX_FIB_DECODER 0
-%define CONFIG_ESCAPE124_DECODER 0
-%define CONFIG_ESCAPE130_DECODER 0
-%define CONFIG_EXR_DECODER 0
-%define CONFIG_FFV1_DECODER 0
-%define CONFIG_FFVHUFF_DECODER 0
-%define CONFIG_FIC_DECODER 0
-%define CONFIG_FITS_DECODER 0
-%define CONFIG_FLASHSV_DECODER 0
-%define CONFIG_FLASHSV2_DECODER 0
-%define CONFIG_FLIC_DECODER 0
-%define CONFIG_FLV_DECODER 0
-%define CONFIG_FMVC_DECODER 0
-%define CONFIG_FOURXM_DECODER 0
-%define CONFIG_FRAPS_DECODER 0
-%define CONFIG_FRWU_DECODER 0
-%define CONFIG_G2M_DECODER 0
-%define CONFIG_GDV_DECODER 0
-%define CONFIG_GIF_DECODER 0
-%define CONFIG_H261_DECODER 0
-%define CONFIG_H263_DECODER 0
-%define CONFIG_H263I_DECODER 0
-%define CONFIG_H263P_DECODER 0
-%define CONFIG_H263_V4L2M2M_DECODER 0
-%define CONFIG_H264_DECODER 0
-%define CONFIG_H264_CRYSTALHD_DECODER 0
-%define CONFIG_H264_V4L2M2M_DECODER 0
-%define CONFIG_H264_MEDIACODEC_DECODER 0
-%define CONFIG_H264_MMAL_DECODER 0
-%define CONFIG_H264_QSV_DECODER 0
-%define CONFIG_H264_RKMPP_DECODER 0
-%define CONFIG_H264_VDA_DECODER 0
-%define CONFIG_H264_VDPAU_DECODER 0
-%define CONFIG_HAP_DECODER 0
-%define CONFIG_HEVC_DECODER 0
-%define CONFIG_HEVC_QSV_DECODER 0
-%define CONFIG_HEVC_RKMPP_DECODER 0
-%define CONFIG_HEVC_V4L2M2M_DECODER 0
-%define CONFIG_HNM4_VIDEO_DECODER 0
-%define CONFIG_HQ_HQA_DECODER 0
-%define CONFIG_HQX_DECODER 0
-%define CONFIG_HUFFYUV_DECODER 0
-%define CONFIG_IDCIN_DECODER 0
-%define CONFIG_IFF_ILBM_DECODER 0
-%define CONFIG_INDEO2_DECODER 0
-%define CONFIG_INDEO3_DECODER 0
-%define CONFIG_INDEO4_DECODER 0
-%define CONFIG_INDEO5_DECODER 0
-%define CONFIG_INTERPLAY_VIDEO_DECODER 0
-%define CONFIG_JPEG2000_DECODER 0
-%define CONFIG_JPEGLS_DECODER 0
-%define CONFIG_JV_DECODER 0
-%define CONFIG_KGV1_DECODER 0
-%define CONFIG_KMVC_DECODER 0
-%define CONFIG_LAGARITH_DECODER 0
-%define CONFIG_LOCO_DECODER 0
-%define CONFIG_M101_DECODER 0
-%define CONFIG_MAGICYUV_DECODER 0
-%define CONFIG_MDEC_DECODER 0
-%define CONFIG_MIMIC_DECODER 0
-%define CONFIG_MJPEG_DECODER 0
-%define CONFIG_MJPEGB_DECODER 0
-%define CONFIG_MMVIDEO_DECODER 0
-%define CONFIG_MOTIONPIXELS_DECODER 0
-%define CONFIG_MPEG_XVMC_DECODER 0
-%define CONFIG_MPEG1VIDEO_DECODER 0
-%define CONFIG_MPEG2VIDEO_DECODER 0
-%define CONFIG_MPEG4_DECODER 0
-%define CONFIG_MPEG4_CRYSTALHD_DECODER 0
-%define CONFIG_MPEG4_V4L2M2M_DECODER 0
-%define CONFIG_MPEG4_MMAL_DECODER 0
-%define CONFIG_MPEG4_VDPAU_DECODER 0
-%define CONFIG_MPEGVIDEO_DECODER 0
-%define CONFIG_MPEG_VDPAU_DECODER 0
-%define CONFIG_MPEG1_VDPAU_DECODER 0
-%define CONFIG_MPEG1_V4L2M2M_DECODER 0
-%define CONFIG_MPEG2_MMAL_DECODER 0
-%define CONFIG_MPEG2_CRYSTALHD_DECODER 0
-%define CONFIG_MPEG2_V4L2M2M_DECODER 0
-%define CONFIG_MPEG2_QSV_DECODER 0
-%define CONFIG_MPEG2_MEDIACODEC_DECODER 0
-%define CONFIG_MSA1_DECODER 0
-%define CONFIG_MSCC_DECODER 0
-%define CONFIG_MSMPEG4V1_DECODER 0
-%define CONFIG_MSMPEG4V2_DECODER 0
-%define CONFIG_MSMPEG4V3_DECODER 0
-%define CONFIG_MSMPEG4_CRYSTALHD_DECODER 0
-%define CONFIG_MSRLE_DECODER 0
-%define CONFIG_MSS1_DECODER 0
-%define CONFIG_MSS2_DECODER 0
-%define CONFIG_MSVIDEO1_DECODER 0
-%define CONFIG_MSZH_DECODER 0
-%define CONFIG_MTS2_DECODER 0
-%define CONFIG_MVC1_DECODER 0
-%define CONFIG_MVC2_DECODER 0
-%define CONFIG_MXPEG_DECODER 0
-%define CONFIG_NUV_DECODER 0
-%define CONFIG_PAF_VIDEO_DECODER 0
-%define CONFIG_PAM_DECODER 0
-%define CONFIG_PBM_DECODER 0
-%define CONFIG_PCX_DECODER 0
-%define CONFIG_PGM_DECODER 0
-%define CONFIG_PGMYUV_DECODER 0
-%define CONFIG_PICTOR_DECODER 0
-%define CONFIG_PIXLET_DECODER 0
-%define CONFIG_PNG_DECODER 0
-%define CONFIG_PPM_DECODER 0
-%define CONFIG_PRORES_DECODER 0
-%define CONFIG_PRORES_LGPL_DECODER 0
-%define CONFIG_PSD_DECODER 0
-%define CONFIG_PTX_DECODER 0
-%define CONFIG_QDRAW_DECODER 0
-%define CONFIG_QPEG_DECODER 0
-%define CONFIG_QTRLE_DECODER 0
-%define CONFIG_R10K_DECODER 0
-%define CONFIG_R210_DECODER 0
-%define CONFIG_RAWVIDEO_DECODER 0
-%define CONFIG_RL2_DECODER 0
-%define CONFIG_ROQ_DECODER 0
-%define CONFIG_RPZA_DECODER 0
-%define CONFIG_RSCC_DECODER 0
-%define CONFIG_RV10_DECODER 0
-%define CONFIG_RV20_DECODER 0
-%define CONFIG_RV30_DECODER 0
-%define CONFIG_RV40_DECODER 0
-%define CONFIG_S302M_DECODER 0
-%define CONFIG_SANM_DECODER 0
-%define CONFIG_SCPR_DECODER 0
-%define CONFIG_SCREENPRESSO_DECODER 0
-%define CONFIG_SDX2_DPCM_DECODER 0
-%define CONFIG_SGI_DECODER 0
-%define CONFIG_SGIRLE_DECODER 0
-%define CONFIG_SHEERVIDEO_DECODER 0
-%define CONFIG_SMACKER_DECODER 0
-%define CONFIG_SMC_DECODER 0
-%define CONFIG_SMVJPEG_DECODER 0
-%define CONFIG_SNOW_DECODER 0
-%define CONFIG_SP5X_DECODER 0
-%define CONFIG_SPEEDHQ_DECODER 0
-%define CONFIG_SRGC_DECODER 0
-%define CONFIG_SUNRAST_DECODER 0
-%define CONFIG_SVQ1_DECODER 0
-%define CONFIG_SVQ3_DECODER 0
-%define CONFIG_TARGA_DECODER 0
-%define CONFIG_TARGA_Y216_DECODER 0
-%define CONFIG_TDSC_DECODER 0
-%define CONFIG_THEORA_DECODER 0
-%define CONFIG_THP_DECODER 0
-%define CONFIG_TIERTEXSEQVIDEO_DECODER 0
-%define CONFIG_TIFF_DECODER 0
-%define CONFIG_TMV_DECODER 0
-%define CONFIG_TRUEMOTION1_DECODER 0
-%define CONFIG_TRUEMOTION2_DECODER 0
-%define CONFIG_TRUEMOTION2RT_DECODER 0
-%define CONFIG_TSCC_DECODER 0
-%define CONFIG_TSCC2_DECODER 0
-%define CONFIG_TXD_DECODER 0
-%define CONFIG_ULTI_DECODER 0
-%define CONFIG_UTVIDEO_DECODER 0
-%define CONFIG_V210_DECODER 0
-%define CONFIG_V210X_DECODER 0
-%define CONFIG_V308_DECODER 0
-%define CONFIG_V408_DECODER 0
-%define CONFIG_V410_DECODER 0
-%define CONFIG_VB_DECODER 0
-%define CONFIG_VBLE_DECODER 0
-%define CONFIG_VC1_DECODER 0
-%define CONFIG_VC1_CRYSTALHD_DECODER 0
-%define CONFIG_VC1_VDPAU_DECODER 0
-%define CONFIG_VC1IMAGE_DECODER 0
-%define CONFIG_VC1_MMAL_DECODER 0
-%define CONFIG_VC1_QSV_DECODER 0
-%define CONFIG_VC1_V4L2M2M_DECODER 0
-%define CONFIG_VCR1_DECODER 0
-%define CONFIG_VMDVIDEO_DECODER 0
-%define CONFIG_VMNC_DECODER 0
-%define CONFIG_VP3_DECODER 0
-%define CONFIG_VP5_DECODER 0
-%define CONFIG_VP6_DECODER 0
-%define CONFIG_VP6A_DECODER 0
-%define CONFIG_VP6F_DECODER 0
-%define CONFIG_VP7_DECODER 0
-%define CONFIG_VP8_RKMPP_DECODER 0
-%define CONFIG_VP8_V4L2M2M_DECODER 0
-%define CONFIG_VP9_RKMPP_DECODER 0
-%define CONFIG_VP9_V4L2M2M_DECODER 0
-%define CONFIG_VQA_DECODER 0
-%define CONFIG_BITPACKED_DECODER 0
-%define CONFIG_WEBP_DECODER 0
-%define CONFIG_WRAPPED_AVFRAME_DECODER 0
-%define CONFIG_WMV1_DECODER 0
-%define CONFIG_WMV2_DECODER 0
-%define CONFIG_WMV3_DECODER 0
-%define CONFIG_WMV3_CRYSTALHD_DECODER 0
-%define CONFIG_WMV3_VDPAU_DECODER 0
-%define CONFIG_WMV3IMAGE_DECODER 0
-%define CONFIG_WNV1_DECODER 0
-%define CONFIG_XAN_WC3_DECODER 0
-%define CONFIG_XAN_WC4_DECODER 0
-%define CONFIG_XBM_DECODER 0
-%define CONFIG_XFACE_DECODER 0
-%define CONFIG_XL_DECODER 0
-%define CONFIG_XPM_DECODER 0
-%define CONFIG_XWD_DECODER 0
-%define CONFIG_Y41P_DECODER 0
-%define CONFIG_YLC_DECODER 0
-%define CONFIG_YOP_DECODER 0
-%define CONFIG_YUV4_DECODER 0
-%define CONFIG_ZERO12V_DECODER 0
-%define CONFIG_ZEROCODEC_DECODER 0
-%define CONFIG_ZLIB_DECODER 0
-%define CONFIG_ZMBV_DECODER 0
+%define CONFIG_A64MULTI5_ENCODER 0
+%define CONFIG_A64MULTI_ENCODER 0
+%define CONFIG_A64_MUXER 0
+%define CONFIG_AAC_ADTSTOASC_BSF 0
+%define CONFIG_AAC_AT_DECODER 0
+%define CONFIG_AAC_AT_ENCODER 0
%define CONFIG_AAC_DECODER 0
+%define CONFIG_AAC_DEMUXER 0
+%define CONFIG_AAC_ENCODER 0
%define CONFIG_AAC_FIXED_DECODER 0
%define CONFIG_AAC_LATM_DECODER 0
+%define CONFIG_AAC_LATM_PARSER 0
+%define CONFIG_AAC_PARSER 0
+%define CONFIG_AASC_DECODER 0
+%define CONFIG_AA_DEMUXER 0
+%define CONFIG_ABENCH_FILTER 0
+%define CONFIG_ABITSCOPE_FILTER 0
+%define CONFIG_AC3_AT_DECODER 0
%define CONFIG_AC3_DECODER 0
+%define CONFIG_AC3_DEMUXER 0
+%define CONFIG_AC3_ENCODER 0
%define CONFIG_AC3_FIXED_DECODER 0
-%define CONFIG_ALAC_DECODER 0
-%define CONFIG_ALS_DECODER 0
-%define CONFIG_AMRNB_DECODER 0
-%define CONFIG_AMRWB_DECODER 0
-%define CONFIG_APE_DECODER 0
-%define CONFIG_ATRAC1_DECODER 0
-%define CONFIG_ATRAC3_DECODER 0
-%define CONFIG_ATRAC3AL_DECODER 0
-%define CONFIG_ATRAC3P_DECODER 0
-%define CONFIG_ATRAC3PAL_DECODER 0
-%define CONFIG_BINKAUDIO_DCT_DECODER 0
-%define CONFIG_BINKAUDIO_RDFT_DECODER 0
-%define CONFIG_BMV_AUDIO_DECODER 0
-%define CONFIG_COOK_DECODER 0
-%define CONFIG_DCA_DECODER 0
-%define CONFIG_DOLBY_E_DECODER 0
-%define CONFIG_DSD_LSBF_DECODER 0
-%define CONFIG_DSD_MSBF_DECODER 0
-%define CONFIG_DSD_LSBF_PLANAR_DECODER 0
-%define CONFIG_DSD_MSBF_PLANAR_DECODER 0
-%define CONFIG_DSICINAUDIO_DECODER 0
-%define CONFIG_DSS_SP_DECODER 0
-%define CONFIG_DST_DECODER 0
-%define CONFIG_EAC3_DECODER 0
-%define CONFIG_EVRC_DECODER 0
-%define CONFIG_FFWAVESYNTH_DECODER 0
-%define CONFIG_G723_1_DECODER 0
-%define CONFIG_G729_DECODER 0
-%define CONFIG_GSM_DECODER 0
-%define CONFIG_GSM_MS_DECODER 0
-%define CONFIG_IAC_DECODER 0
-%define CONFIG_IMC_DECODER 0
-%define CONFIG_INTERPLAY_ACM_DECODER 0
-%define CONFIG_MACE3_DECODER 0
-%define CONFIG_MACE6_DECODER 0
-%define CONFIG_METASOUND_DECODER 0
-%define CONFIG_MLP_DECODER 0
-%define CONFIG_MP1_DECODER 0
-%define CONFIG_MP1FLOAT_DECODER 0
-%define CONFIG_MP2_DECODER 0
-%define CONFIG_MP2FLOAT_DECODER 0
-%define CONFIG_MP3_DECODER 0
-%define CONFIG_MP3FLOAT_DECODER 0
-%define CONFIG_MP3ADU_DECODER 0
-%define CONFIG_MP3ADUFLOAT_DECODER 0
-%define CONFIG_MP3ON4_DECODER 0
-%define CONFIG_MP3ON4FLOAT_DECODER 0
-%define CONFIG_MPC7_DECODER 0
-%define CONFIG_MPC8_DECODER 0
-%define CONFIG_NELLYMOSER_DECODER 0
-%define CONFIG_ON2AVC_DECODER 0
-%define CONFIG_OPUS_DECODER 0
-%define CONFIG_PAF_AUDIO_DECODER 0
-%define CONFIG_QCELP_DECODER 0
-%define CONFIG_QDM2_DECODER 0
-%define CONFIG_QDMC_DECODER 0
-%define CONFIG_RA_144_DECODER 0
-%define CONFIG_RA_288_DECODER 0
-%define CONFIG_RALF_DECODER 0
-%define CONFIG_SHORTEN_DECODER 0
-%define CONFIG_SIPR_DECODER 0
-%define CONFIG_SMACKAUD_DECODER 0
-%define CONFIG_SONIC_DECODER 0
-%define CONFIG_TAK_DECODER 0
-%define CONFIG_TRUEHD_DECODER 0
-%define CONFIG_TRUESPEECH_DECODER 0
-%define CONFIG_TTA_DECODER 0
-%define CONFIG_TWINVQ_DECODER 0
-%define CONFIG_VMDAUDIO_DECODER 0
-%define CONFIG_VORBIS_DECODER 0
-%define CONFIG_WAVPACK_DECODER 0
-%define CONFIG_WMALOSSLESS_DECODER 0
-%define CONFIG_WMAPRO_DECODER 0
-%define CONFIG_WMAV1_DECODER 0
-%define CONFIG_WMAV2_DECODER 0
-%define CONFIG_WMAVOICE_DECODER 0
-%define CONFIG_WS_SND1_DECODER 0
-%define CONFIG_XMA1_DECODER 0
-%define CONFIG_XMA2_DECODER 0
-%define CONFIG_PCM_ALAW_DECODER 0
-%define CONFIG_PCM_BLURAY_DECODER 0
-%define CONFIG_PCM_DVD_DECODER 0
-%define CONFIG_PCM_F16LE_DECODER 0
-%define CONFIG_PCM_F24LE_DECODER 0
-%define CONFIG_PCM_F32BE_DECODER 0
-%define CONFIG_PCM_F32LE_DECODER 0
-%define CONFIG_PCM_F64BE_DECODER 0
-%define CONFIG_PCM_F64LE_DECODER 0
-%define CONFIG_PCM_LXF_DECODER 0
-%define CONFIG_PCM_MULAW_DECODER 0
-%define CONFIG_PCM_S8_DECODER 0
-%define CONFIG_PCM_S8_PLANAR_DECODER 0
-%define CONFIG_PCM_S16BE_DECODER 0
-%define CONFIG_PCM_S16BE_PLANAR_DECODER 0
-%define CONFIG_PCM_S16LE_DECODER 0
-%define CONFIG_PCM_S16LE_PLANAR_DECODER 0
-%define CONFIG_PCM_S24BE_DECODER 0
-%define CONFIG_PCM_S24DAUD_DECODER 0
-%define CONFIG_PCM_S24LE_DECODER 0
-%define CONFIG_PCM_S24LE_PLANAR_DECODER 0
-%define CONFIG_PCM_S32BE_DECODER 0
-%define CONFIG_PCM_S32LE_DECODER 0
-%define CONFIG_PCM_S32LE_PLANAR_DECODER 0
-%define CONFIG_PCM_S64BE_DECODER 0
-%define CONFIG_PCM_S64LE_DECODER 0
-%define CONFIG_PCM_U8_DECODER 0
-%define CONFIG_PCM_U16BE_DECODER 0
-%define CONFIG_PCM_U16LE_DECODER 0
-%define CONFIG_PCM_U24BE_DECODER 0
-%define CONFIG_PCM_U24LE_DECODER 0
-%define CONFIG_PCM_U32BE_DECODER 0
-%define CONFIG_PCM_U32LE_DECODER 0
-%define CONFIG_PCM_ZORK_DECODER 0
-%define CONFIG_GREMLIN_DPCM_DECODER 0
-%define CONFIG_INTERPLAY_DPCM_DECODER 0
-%define CONFIG_ROQ_DPCM_DECODER 0
-%define CONFIG_SOL_DPCM_DECODER 0
-%define CONFIG_XAN_DPCM_DECODER 0
+%define CONFIG_AC3_FIXED_ENCODER 0
+%define CONFIG_AC3_MUXER 0
+%define CONFIG_AC3_PARSER 0
+%define CONFIG_ACM_DEMUXER 0
+%define CONFIG_ACOMPRESSOR_FILTER 0
+%define CONFIG_ACONTRAST_FILTER 0
+%define CONFIG_ACOPY_FILTER 0
+%define CONFIG_ACROSSFADE_FILTER 0
+%define CONFIG_ACRUSHER_FILTER 0
+%define CONFIG_ACT_DEMUXER 0
+%define CONFIG_ADELAY_FILTER 0
+%define CONFIG_ADF_DEMUXER 0
%define CONFIG_ADPCM_4XM_DECODER 0
%define CONFIG_ADPCM_ADX_DECODER 0
+%define CONFIG_ADPCM_ADX_ENCODER 0
%define CONFIG_ADPCM_AFC_DECODER 0
%define CONFIG_ADPCM_AICA_DECODER 0
%define CONFIG_ADPCM_CT_DECODER 0
@@ -408,8 +46,11 @@
%define CONFIG_ADPCM_EA_R3_DECODER 0
%define CONFIG_ADPCM_EA_XAS_DECODER 0
%define CONFIG_ADPCM_G722_DECODER 0
-%define CONFIG_ADPCM_G726_DECODER 0
+%define CONFIG_ADPCM_G722_ENCODER 0
%define CONFIG_ADPCM_G726LE_DECODER 0
+%define CONFIG_ADPCM_G726LE_ENCODER 0
+%define CONFIG_ADPCM_G726_DECODER 0
+%define CONFIG_ADPCM_G726_ENCODER 0
%define CONFIG_ADPCM_IMA_AMV_DECODER 0
%define CONFIG_ADPCM_IMA_APC_DECODER 0
%define CONFIG_ADPCM_IMA_DAT4_DECODER 0
@@ -419,604 +60,114 @@
%define CONFIG_ADPCM_IMA_EA_SEAD_DECODER 0
%define CONFIG_ADPCM_IMA_ISS_DECODER 0
%define CONFIG_ADPCM_IMA_OKI_DECODER 0
+%define CONFIG_ADPCM_IMA_QT_AT_DECODER 0
%define CONFIG_ADPCM_IMA_QT_DECODER 0
+%define CONFIG_ADPCM_IMA_QT_ENCODER 0
%define CONFIG_ADPCM_IMA_RAD_DECODER 0
%define CONFIG_ADPCM_IMA_SMJPEG_DECODER 0
%define CONFIG_ADPCM_IMA_WAV_DECODER 0
+%define CONFIG_ADPCM_IMA_WAV_ENCODER 0
%define CONFIG_ADPCM_IMA_WS_DECODER 0
%define CONFIG_ADPCM_MS_DECODER 0
+%define CONFIG_ADPCM_MS_ENCODER 0
%define CONFIG_ADPCM_MTAF_DECODER 0
%define CONFIG_ADPCM_PSX_DECODER 0
%define CONFIG_ADPCM_SBPRO_2_DECODER 0
%define CONFIG_ADPCM_SBPRO_3_DECODER 0
%define CONFIG_ADPCM_SBPRO_4_DECODER 0
%define CONFIG_ADPCM_SWF_DECODER 0
+%define CONFIG_ADPCM_SWF_ENCODER 0
%define CONFIG_ADPCM_THP_DECODER 0
%define CONFIG_ADPCM_THP_LE_DECODER 0
%define CONFIG_ADPCM_VIMA_DECODER 0
%define CONFIG_ADPCM_XA_DECODER 0
%define CONFIG_ADPCM_YAMAHA_DECODER 0
-%define CONFIG_SSA_DECODER 0
-%define CONFIG_ASS_DECODER 0
-%define CONFIG_CCAPTION_DECODER 0
-%define CONFIG_DVBSUB_DECODER 0
-%define CONFIG_DVDSUB_DECODER 0
-%define CONFIG_JACOSUB_DECODER 0
-%define CONFIG_MICRODVD_DECODER 0
-%define CONFIG_MOVTEXT_DECODER 0
-%define CONFIG_MPL2_DECODER 0
-%define CONFIG_PGSSUB_DECODER 0
-%define CONFIG_PJS_DECODER 0
-%define CONFIG_REALTEXT_DECODER 0
-%define CONFIG_SAMI_DECODER 0
-%define CONFIG_SRT_DECODER 0
-%define CONFIG_STL_DECODER 0
-%define CONFIG_SUBRIP_DECODER 0
-%define CONFIG_SUBVIEWER_DECODER 0
-%define CONFIG_SUBVIEWER1_DECODER 0
-%define CONFIG_TEXT_DECODER 0
-%define CONFIG_VPLAYER_DECODER 0
-%define CONFIG_WEBVTT_DECODER 0
-%define CONFIG_XSUB_DECODER 0
-%define CONFIG_AAC_AT_DECODER 0
-%define CONFIG_AC3_AT_DECODER 0
-%define CONFIG_ADPCM_IMA_QT_AT_DECODER 0
-%define CONFIG_ALAC_AT_DECODER 0
-%define CONFIG_AMR_NB_AT_DECODER 0
-%define CONFIG_EAC3_AT_DECODER 0
-%define CONFIG_GSM_MS_AT_DECODER 0
-%define CONFIG_ILBC_AT_DECODER 0
-%define CONFIG_MP1_AT_DECODER 0
-%define CONFIG_MP2_AT_DECODER 0
-%define CONFIG_MP3_AT_DECODER 0
-%define CONFIG_PCM_ALAW_AT_DECODER 0
-%define CONFIG_PCM_MULAW_AT_DECODER 0
-%define CONFIG_QDMC_AT_DECODER 0
-%define CONFIG_QDM2_AT_DECODER 0
-%define CONFIG_LIBCELT_DECODER 0
-%define CONFIG_LIBFDK_AAC_DECODER 0
-%define CONFIG_LIBGSM_DECODER 0
-%define CONFIG_LIBGSM_MS_DECODER 0
-%define CONFIG_LIBILBC_DECODER 0
-%define CONFIG_LIBOPENCORE_AMRNB_DECODER 0
-%define CONFIG_LIBOPENCORE_AMRWB_DECODER 0
-%define CONFIG_LIBOPENJPEG_DECODER 0
-%define CONFIG_LIBOPUS_DECODER 0
-%define CONFIG_LIBRSVG_DECODER 0
-%define CONFIG_LIBSPEEX_DECODER 0
-%define CONFIG_LIBVORBIS_DECODER 0
-%define CONFIG_LIBVPX_VP8_DECODER 0
-%define CONFIG_LIBVPX_VP9_DECODER 0
-%define CONFIG_LIBZVBI_TELETEXT_DECODER 0
-%define CONFIG_BINTEXT_DECODER 0
-%define CONFIG_XBIN_DECODER 0
-%define CONFIG_IDF_DECODER 0
-%define CONFIG_LIBOPENH264_DECODER 0
-%define CONFIG_H264_CUVID_DECODER 0
-%define CONFIG_HEVC_CUVID_DECODER 0
-%define CONFIG_HEVC_MEDIACODEC_DECODER 0
-%define CONFIG_MJPEG_CUVID_DECODER 0
-%define CONFIG_MPEG1_CUVID_DECODER 0
-%define CONFIG_MPEG2_CUVID_DECODER 0
-%define CONFIG_MPEG4_CUVID_DECODER 0
-%define CONFIG_MPEG4_MEDIACODEC_DECODER 0
-%define CONFIG_VC1_CUVID_DECODER 0
-%define CONFIG_VP8_CUVID_DECODER 0
-%define CONFIG_VP8_MEDIACODEC_DECODER 0
-%define CONFIG_VP8_QSV_DECODER 0
-%define CONFIG_VP9_CUVID_DECODER 0
-%define CONFIG_VP9_MEDIACODEC_DECODER 0
-%define CONFIG_AA_DEMUXER 0
-%define CONFIG_AAC_DEMUXER 0
-%define CONFIG_AC3_DEMUXER 0
-%define CONFIG_ACM_DEMUXER 0
-%define CONFIG_ACT_DEMUXER 0
-%define CONFIG_ADF_DEMUXER 0
+%define CONFIG_ADPCM_YAMAHA_ENCODER 0
%define CONFIG_ADP_DEMUXER 0
+%define CONFIG_ADRAWGRAPH_FILTER 0
%define CONFIG_ADS_DEMUXER 0
+%define CONFIG_ADTS_MUXER 0
%define CONFIG_ADX_DEMUXER 0
+%define CONFIG_ADX_MUXER 0
+%define CONFIG_ADX_PARSER 0
%define CONFIG_AEA_DEMUXER 0
-%define CONFIG_AFC_DEMUXER 0
-%define CONFIG_AIFF_DEMUXER 0
-%define CONFIG_AIX_DEMUXER 0
-%define CONFIG_AMR_DEMUXER 0
-%define CONFIG_ANM_DEMUXER 0
-%define CONFIG_APC_DEMUXER 0
-%define CONFIG_APE_DEMUXER 0
-%define CONFIG_APNG_DEMUXER 0
-%define CONFIG_AQTITLE_DEMUXER 0
-%define CONFIG_ASF_DEMUXER 0
-%define CONFIG_ASF_O_DEMUXER 0
-%define CONFIG_ASS_DEMUXER 0
-%define CONFIG_AST_DEMUXER 0
-%define CONFIG_AU_DEMUXER 0
-%define CONFIG_AVI_DEMUXER 0
-%define CONFIG_AVISYNTH_DEMUXER 0
-%define CONFIG_AVR_DEMUXER 0
-%define CONFIG_AVS_DEMUXER 0
-%define CONFIG_BETHSOFTVID_DEMUXER 0
-%define CONFIG_BFI_DEMUXER 0
-%define CONFIG_BINTEXT_DEMUXER 0
-%define CONFIG_BINK_DEMUXER 0
-%define CONFIG_BIT_DEMUXER 0
-%define CONFIG_BMV_DEMUXER 0
-%define CONFIG_BFSTM_DEMUXER 0
-%define CONFIG_BRSTM_DEMUXER 0
-%define CONFIG_BOA_DEMUXER 0
-%define CONFIG_C93_DEMUXER 0
-%define CONFIG_CAF_DEMUXER 0
-%define CONFIG_CAVSVIDEO_DEMUXER 0
-%define CONFIG_CDG_DEMUXER 0
-%define CONFIG_CDXL_DEMUXER 0
-%define CONFIG_CINE_DEMUXER 0
-%define CONFIG_CONCAT_DEMUXER 0
-%define CONFIG_DASH_DEMUXER 0
-%define CONFIG_DATA_DEMUXER 0
-%define CONFIG_DAUD_DEMUXER 0
-%define CONFIG_DCSTR_DEMUXER 0
-%define CONFIG_DFA_DEMUXER 0
-%define CONFIG_DIRAC_DEMUXER 0
-%define CONFIG_DNXHD_DEMUXER 0
-%define CONFIG_DSF_DEMUXER 0
-%define CONFIG_DSICIN_DEMUXER 0
-%define CONFIG_DSS_DEMUXER 0
-%define CONFIG_DTS_DEMUXER 0
-%define CONFIG_DTSHD_DEMUXER 0
-%define CONFIG_DV_DEMUXER 0
-%define CONFIG_DVBSUB_DEMUXER 0
-%define CONFIG_DVBTXT_DEMUXER 0
-%define CONFIG_DXA_DEMUXER 0
-%define CONFIG_EA_DEMUXER 0
-%define CONFIG_EA_CDATA_DEMUXER 0
-%define CONFIG_EAC3_DEMUXER 0
-%define CONFIG_EPAF_DEMUXER 0
-%define CONFIG_FFM_DEMUXER 0
-%define CONFIG_FFMETADATA_DEMUXER 0
-%define CONFIG_FILMSTRIP_DEMUXER 0
-%define CONFIG_FITS_DEMUXER 0
-%define CONFIG_FLAC_DEMUXER 0
-%define CONFIG_FLIC_DEMUXER 0
-%define CONFIG_FLV_DEMUXER 0
-%define CONFIG_LIVE_FLV_DEMUXER 0
-%define CONFIG_FOURXM_DEMUXER 0
-%define CONFIG_FRM_DEMUXER 0
-%define CONFIG_FSB_DEMUXER 0
-%define CONFIG_G722_DEMUXER 0
-%define CONFIG_G723_1_DEMUXER 0
-%define CONFIG_G726_DEMUXER 0
-%define CONFIG_G726LE_DEMUXER 0
-%define CONFIG_G729_DEMUXER 0
-%define CONFIG_GDV_DEMUXER 0
-%define CONFIG_GENH_DEMUXER 0
-%define CONFIG_GIF_DEMUXER 0
-%define CONFIG_GSM_DEMUXER 0
-%define CONFIG_GXF_DEMUXER 0
-%define CONFIG_H261_DEMUXER 0
-%define CONFIG_H263_DEMUXER 0
-%define CONFIG_H264_DEMUXER 0
-%define CONFIG_HEVC_DEMUXER 0
-%define CONFIG_HLS_DEMUXER 0
-%define CONFIG_HNM_DEMUXER 0
-%define CONFIG_ICO_DEMUXER 0
-%define CONFIG_IDCIN_DEMUXER 0
-%define CONFIG_IDF_DEMUXER 0
-%define CONFIG_IFF_DEMUXER 0
-%define CONFIG_ILBC_DEMUXER 0
-%define CONFIG_IMAGE2_DEMUXER 0
-%define CONFIG_IMAGE2PIPE_DEMUXER 0
-%define CONFIG_IMAGE2_ALIAS_PIX_DEMUXER 0
-%define CONFIG_IMAGE2_BRENDER_PIX_DEMUXER 0
-%define CONFIG_INGENIENT_DEMUXER 0
-%define CONFIG_IPMOVIE_DEMUXER 0
-%define CONFIG_IRCAM_DEMUXER 0
-%define CONFIG_ISS_DEMUXER 0
-%define CONFIG_IV8_DEMUXER 0
-%define CONFIG_IVF_DEMUXER 0
-%define CONFIG_IVR_DEMUXER 0
-%define CONFIG_JACOSUB_DEMUXER 0
-%define CONFIG_JV_DEMUXER 0
-%define CONFIG_LMLM4_DEMUXER 0
-%define CONFIG_LOAS_DEMUXER 0
-%define CONFIG_LRC_DEMUXER 0
-%define CONFIG_LVF_DEMUXER 0
-%define CONFIG_LXF_DEMUXER 0
-%define CONFIG_M4V_DEMUXER 0
-%define CONFIG_MATROSKA_DEMUXER 0
-%define CONFIG_MGSTS_DEMUXER 0
-%define CONFIG_MICRODVD_DEMUXER 0
-%define CONFIG_MJPEG_DEMUXER 0
-%define CONFIG_MJPEG_2000_DEMUXER 0
-%define CONFIG_MLP_DEMUXER 0
-%define CONFIG_MLV_DEMUXER 0
-%define CONFIG_MM_DEMUXER 0
-%define CONFIG_MMF_DEMUXER 0
-%define CONFIG_MOV_DEMUXER 0
-%define CONFIG_MP3_DEMUXER 0
-%define CONFIG_MPC_DEMUXER 0
-%define CONFIG_MPC8_DEMUXER 0
-%define CONFIG_MPEGPS_DEMUXER 0
-%define CONFIG_MPEGTS_DEMUXER 0
-%define CONFIG_MPEGTSRAW_DEMUXER 0
-%define CONFIG_MPEGVIDEO_DEMUXER 0
-%define CONFIG_MPJPEG_DEMUXER 0
-%define CONFIG_MPL2_DEMUXER 0
-%define CONFIG_MPSUB_DEMUXER 0
-%define CONFIG_MSF_DEMUXER 0
-%define CONFIG_MSNWC_TCP_DEMUXER 0
-%define CONFIG_MTAF_DEMUXER 0
-%define CONFIG_MTV_DEMUXER 0
-%define CONFIG_MUSX_DEMUXER 0
-%define CONFIG_MV_DEMUXER 0
-%define CONFIG_MVI_DEMUXER 0
-%define CONFIG_MXF_DEMUXER 0
-%define CONFIG_MXG_DEMUXER 0
-%define CONFIG_NC_DEMUXER 0
-%define CONFIG_NISTSPHERE_DEMUXER 0
-%define CONFIG_NSV_DEMUXER 0
-%define CONFIG_NUT_DEMUXER 0
-%define CONFIG_NUV_DEMUXER 0
-%define CONFIG_OGG_DEMUXER 0
-%define CONFIG_OMA_DEMUXER 0
-%define CONFIG_PAF_DEMUXER 0
-%define CONFIG_PCM_ALAW_DEMUXER 0
-%define CONFIG_PCM_MULAW_DEMUXER 0
-%define CONFIG_PCM_F64BE_DEMUXER 0
-%define CONFIG_PCM_F64LE_DEMUXER 0
-%define CONFIG_PCM_F32BE_DEMUXER 0
-%define CONFIG_PCM_F32LE_DEMUXER 0
-%define CONFIG_PCM_S32BE_DEMUXER 0
-%define CONFIG_PCM_S32LE_DEMUXER 0
-%define CONFIG_PCM_S24BE_DEMUXER 0
-%define CONFIG_PCM_S24LE_DEMUXER 0
-%define CONFIG_PCM_S16BE_DEMUXER 0
-%define CONFIG_PCM_S16LE_DEMUXER 0
-%define CONFIG_PCM_S8_DEMUXER 0
-%define CONFIG_PCM_U32BE_DEMUXER 0
-%define CONFIG_PCM_U32LE_DEMUXER 0
-%define CONFIG_PCM_U24BE_DEMUXER 0
-%define CONFIG_PCM_U24LE_DEMUXER 0
-%define CONFIG_PCM_U16BE_DEMUXER 0
-%define CONFIG_PCM_U16LE_DEMUXER 0
-%define CONFIG_PCM_U8_DEMUXER 0
-%define CONFIG_PJS_DEMUXER 0
-%define CONFIG_PMP_DEMUXER 0
-%define CONFIG_PVA_DEMUXER 0
-%define CONFIG_PVF_DEMUXER 0
-%define CONFIG_QCP_DEMUXER 0
-%define CONFIG_R3D_DEMUXER 0
-%define CONFIG_RAWVIDEO_DEMUXER 0
-%define CONFIG_REALTEXT_DEMUXER 0
-%define CONFIG_REDSPARK_DEMUXER 0
-%define CONFIG_RL2_DEMUXER 0
-%define CONFIG_RM_DEMUXER 0
-%define CONFIG_ROQ_DEMUXER 0
-%define CONFIG_RPL_DEMUXER 0
-%define CONFIG_RSD_DEMUXER 0
-%define CONFIG_RSO_DEMUXER 0
-%define CONFIG_RTP_DEMUXER 0
-%define CONFIG_RTSP_DEMUXER 0
-%define CONFIG_S337M_DEMUXER 0
-%define CONFIG_SAMI_DEMUXER 0
-%define CONFIG_SAP_DEMUXER 0
-%define CONFIG_SBG_DEMUXER 0
-%define CONFIG_SCC_DEMUXER 0
-%define CONFIG_SDP_DEMUXER 0
-%define CONFIG_SDR2_DEMUXER 0
-%define CONFIG_SDS_DEMUXER 0
-%define CONFIG_SDX_DEMUXER 0
-%define CONFIG_SEGAFILM_DEMUXER 0
-%define CONFIG_SHORTEN_DEMUXER 0
-%define CONFIG_SIFF_DEMUXER 0
-%define CONFIG_SLN_DEMUXER 0
-%define CONFIG_SMACKER_DEMUXER 0
-%define CONFIG_SMJPEG_DEMUXER 0
-%define CONFIG_SMUSH_DEMUXER 0
-%define CONFIG_SOL_DEMUXER 0
-%define CONFIG_SOX_DEMUXER 0
-%define CONFIG_SPDIF_DEMUXER 0
-%define CONFIG_SRT_DEMUXER 0
-%define CONFIG_STR_DEMUXER 0
-%define CONFIG_STL_DEMUXER 0
-%define CONFIG_SUBVIEWER1_DEMUXER 0
-%define CONFIG_SUBVIEWER_DEMUXER 0
-%define CONFIG_SUP_DEMUXER 0
-%define CONFIG_SVAG_DEMUXER 0
-%define CONFIG_SWF_DEMUXER 0
-%define CONFIG_TAK_DEMUXER 0
-%define CONFIG_TEDCAPTIONS_DEMUXER 0
-%define CONFIG_THP_DEMUXER 0
-%define CONFIG_THREEDOSTR_DEMUXER 0
-%define CONFIG_TIERTEXSEQ_DEMUXER 0
-%define CONFIG_TMV_DEMUXER 0
-%define CONFIG_TRUEHD_DEMUXER 0
-%define CONFIG_TTA_DEMUXER 0
-%define CONFIG_TXD_DEMUXER 0
-%define CONFIG_TTY_DEMUXER 0
-%define CONFIG_V210_DEMUXER 0
-%define CONFIG_V210X_DEMUXER 0
-%define CONFIG_VAG_DEMUXER 0
-%define CONFIG_VC1_DEMUXER 0
-%define CONFIG_VC1T_DEMUXER 0
-%define CONFIG_VIVO_DEMUXER 0
-%define CONFIG_VMD_DEMUXER 0
-%define CONFIG_VOBSUB_DEMUXER 0
-%define CONFIG_VOC_DEMUXER 0
-%define CONFIG_VPK_DEMUXER 0
-%define CONFIG_VPLAYER_DEMUXER 0
-%define CONFIG_VQF_DEMUXER 0
-%define CONFIG_W64_DEMUXER 0
-%define CONFIG_WAV_DEMUXER 0
-%define CONFIG_WC3_DEMUXER 0
-%define CONFIG_WEBM_DASH_MANIFEST_DEMUXER 0
-%define CONFIG_WEBVTT_DEMUXER 0
-%define CONFIG_WSAUD_DEMUXER 0
-%define CONFIG_WSD_DEMUXER 0
-%define CONFIG_WSVQA_DEMUXER 0
-%define CONFIG_WTV_DEMUXER 0
-%define CONFIG_WVE_DEMUXER 0
-%define CONFIG_WV_DEMUXER 0
-%define CONFIG_XA_DEMUXER 0
-%define CONFIG_XBIN_DEMUXER 0
-%define CONFIG_XMV_DEMUXER 0
-%define CONFIG_XVAG_DEMUXER 0
-%define CONFIG_XWMA_DEMUXER 0
-%define CONFIG_YOP_DEMUXER 0
-%define CONFIG_YUV4MPEGPIPE_DEMUXER 0
-%define CONFIG_IMAGE_BMP_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_DDS_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_DPX_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_EXR_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_J2K_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_JPEG_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_JPEGLS_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PAM_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PBM_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PCX_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PGMYUV_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PGM_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PICTOR_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PNG_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PPM_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_PSD_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_QDRAW_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_SGI_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_SVG_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_SUNRAST_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_TIFF_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_WEBP_PIPE_DEMUXER 0
-%define CONFIG_IMAGE_XPM_PIPE_DEMUXER 0
-%define CONFIG_LIBGME_DEMUXER 0
-%define CONFIG_LIBMODPLUG_DEMUXER 0
-%define CONFIG_LIBOPENMPT_DEMUXER 0
-%define CONFIG_A64MULTI_ENCODER 0
-%define CONFIG_A64MULTI5_ENCODER 0
-%define CONFIG_ALIAS_PIX_ENCODER 0
-%define CONFIG_AMV_ENCODER 0
-%define CONFIG_APNG_ENCODER 0
-%define CONFIG_ASV1_ENCODER 0
-%define CONFIG_ASV2_ENCODER 0
-%define CONFIG_AVRP_ENCODER 0
-%define CONFIG_AVUI_ENCODER 0
-%define CONFIG_AYUV_ENCODER 0
-%define CONFIG_BMP_ENCODER 0
-%define CONFIG_CINEPAK_ENCODER 0
-%define CONFIG_CLJR_ENCODER 0
-%define CONFIG_COMFORTNOISE_ENCODER 0
-%define CONFIG_DNXHD_ENCODER 0
-%define CONFIG_DPX_ENCODER 0
-%define CONFIG_DVVIDEO_ENCODER 0
-%define CONFIG_FFV1_ENCODER 0
-%define CONFIG_FFVHUFF_ENCODER 0
-%define CONFIG_FITS_ENCODER 0
-%define CONFIG_FLASHSV_ENCODER 0
-%define CONFIG_FLASHSV2_ENCODER 0
-%define CONFIG_FLV_ENCODER 0
-%define CONFIG_GIF_ENCODER 0
-%define CONFIG_H261_ENCODER 0
-%define CONFIG_H263_ENCODER 0
-%define CONFIG_H263P_ENCODER 0
-%define CONFIG_HAP_ENCODER 0
-%define CONFIG_HUFFYUV_ENCODER 0
-%define CONFIG_JPEG2000_ENCODER 0
-%define CONFIG_JPEGLS_ENCODER 0
-%define CONFIG_LJPEG_ENCODER 0
-%define CONFIG_MJPEG_ENCODER 0
-%define CONFIG_MPEG1VIDEO_ENCODER 0
-%define CONFIG_MPEG2VIDEO_ENCODER 0
-%define CONFIG_MPEG4_ENCODER 0
-%define CONFIG_MSMPEG4V2_ENCODER 0
-%define CONFIG_MSMPEG4V3_ENCODER 0
-%define CONFIG_MSVIDEO1_ENCODER 0
-%define CONFIG_PAM_ENCODER 0
-%define CONFIG_PBM_ENCODER 0
-%define CONFIG_PCX_ENCODER 0
-%define CONFIG_PGM_ENCODER 0
-%define CONFIG_PGMYUV_ENCODER 0
-%define CONFIG_PNG_ENCODER 0
-%define CONFIG_PPM_ENCODER 0
-%define CONFIG_PRORES_ENCODER 0
-%define CONFIG_PRORES_AW_ENCODER 0
-%define CONFIG_PRORES_KS_ENCODER 0
-%define CONFIG_QTRLE_ENCODER 0
-%define CONFIG_R10K_ENCODER 0
-%define CONFIG_R210_ENCODER 0
-%define CONFIG_RAWVIDEO_ENCODER 0
-%define CONFIG_ROQ_ENCODER 0
-%define CONFIG_RV10_ENCODER 0
-%define CONFIG_RV20_ENCODER 0
-%define CONFIG_S302M_ENCODER 0
-%define CONFIG_SGI_ENCODER 0
-%define CONFIG_SNOW_ENCODER 0
-%define CONFIG_SUNRAST_ENCODER 0
-%define CONFIG_SVQ1_ENCODER 0
-%define CONFIG_TARGA_ENCODER 0
-%define CONFIG_TIFF_ENCODER 0
-%define CONFIG_UTVIDEO_ENCODER 0
-%define CONFIG_V210_ENCODER 0
-%define CONFIG_V308_ENCODER 0
-%define CONFIG_V408_ENCODER 0
-%define CONFIG_V410_ENCODER 0
-%define CONFIG_VC2_ENCODER 0
-%define CONFIG_WRAPPED_AVFRAME_ENCODER 0
-%define CONFIG_WMV1_ENCODER 0
-%define CONFIG_WMV2_ENCODER 0
-%define CONFIG_XBM_ENCODER 0
-%define CONFIG_XFACE_ENCODER 0
-%define CONFIG_XWD_ENCODER 0
-%define CONFIG_Y41P_ENCODER 0
-%define CONFIG_YUV4_ENCODER 0
-%define CONFIG_ZLIB_ENCODER 0
-%define CONFIG_ZMBV_ENCODER 0
-%define CONFIG_AAC_ENCODER 0
-%define CONFIG_AC3_ENCODER 0
-%define CONFIG_AC3_FIXED_ENCODER 0
-%define CONFIG_ALAC_ENCODER 0
-%define CONFIG_DCA_ENCODER 0
-%define CONFIG_EAC3_ENCODER 0
-%define CONFIG_FLAC_ENCODER 0
-%define CONFIG_G723_1_ENCODER 0
-%define CONFIG_MLP_ENCODER 0
-%define CONFIG_MP2_ENCODER 0
-%define CONFIG_MP2FIXED_ENCODER 0
-%define CONFIG_NELLYMOSER_ENCODER 0
-%define CONFIG_OPUS_ENCODER 0
-%define CONFIG_RA_144_ENCODER 0
-%define CONFIG_SONIC_ENCODER 0
-%define CONFIG_SONIC_LS_ENCODER 0
-%define CONFIG_TRUEHD_ENCODER 0
-%define CONFIG_TTA_ENCODER 0
-%define CONFIG_VORBIS_ENCODER 0
-%define CONFIG_WAVPACK_ENCODER 0
-%define CONFIG_WMAV1_ENCODER 0
-%define CONFIG_WMAV2_ENCODER 0
-%define CONFIG_PCM_ALAW_ENCODER 0
-%define CONFIG_PCM_F32BE_ENCODER 0
-%define CONFIG_PCM_F32LE_ENCODER 0
-%define CONFIG_PCM_F64BE_ENCODER 0
-%define CONFIG_PCM_F64LE_ENCODER 0
-%define CONFIG_PCM_MULAW_ENCODER 0
-%define CONFIG_PCM_S8_ENCODER 0
-%define CONFIG_PCM_S8_PLANAR_ENCODER 0
-%define CONFIG_PCM_S16BE_ENCODER 0
-%define CONFIG_PCM_S16BE_PLANAR_ENCODER 0
-%define CONFIG_PCM_S16LE_ENCODER 0
-%define CONFIG_PCM_S16LE_PLANAR_ENCODER 0
-%define CONFIG_PCM_S24BE_ENCODER 0
-%define CONFIG_PCM_S24DAUD_ENCODER 0
-%define CONFIG_PCM_S24LE_ENCODER 0
-%define CONFIG_PCM_S24LE_PLANAR_ENCODER 0
-%define CONFIG_PCM_S32BE_ENCODER 0
-%define CONFIG_PCM_S32LE_ENCODER 0
-%define CONFIG_PCM_S32LE_PLANAR_ENCODER 0
-%define CONFIG_PCM_S64BE_ENCODER 0
-%define CONFIG_PCM_S64LE_ENCODER 0
-%define CONFIG_PCM_U8_ENCODER 0
-%define CONFIG_PCM_U16BE_ENCODER 0
-%define CONFIG_PCM_U16LE_ENCODER 0
-%define CONFIG_PCM_U24BE_ENCODER 0
-%define CONFIG_PCM_U24LE_ENCODER 0
-%define CONFIG_PCM_U32BE_ENCODER 0
-%define CONFIG_PCM_U32LE_ENCODER 0
-%define CONFIG_ROQ_DPCM_ENCODER 0
-%define CONFIG_ADPCM_ADX_ENCODER 0
-%define CONFIG_ADPCM_G722_ENCODER 0
-%define CONFIG_ADPCM_G726_ENCODER 0
-%define CONFIG_ADPCM_G726LE_ENCODER 0
-%define CONFIG_ADPCM_IMA_QT_ENCODER 0
-%define CONFIG_ADPCM_IMA_WAV_ENCODER 0
-%define CONFIG_ADPCM_MS_ENCODER 0
-%define CONFIG_ADPCM_SWF_ENCODER 0
-%define CONFIG_ADPCM_YAMAHA_ENCODER 0
-%define CONFIG_SSA_ENCODER 0
-%define CONFIG_ASS_ENCODER 0
-%define CONFIG_DVBSUB_ENCODER 0
-%define CONFIG_DVDSUB_ENCODER 0
-%define CONFIG_MOVTEXT_ENCODER 0
-%define CONFIG_SRT_ENCODER 0
-%define CONFIG_SUBRIP_ENCODER 0
-%define CONFIG_TEXT_ENCODER 0
-%define CONFIG_WEBVTT_ENCODER 0
-%define CONFIG_XSUB_ENCODER 0
-%define CONFIG_AAC_AT_ENCODER 0
-%define CONFIG_ALAC_AT_ENCODER 0
-%define CONFIG_ILBC_AT_ENCODER 0
-%define CONFIG_PCM_ALAW_AT_ENCODER 0
-%define CONFIG_PCM_MULAW_AT_ENCODER 0
-%define CONFIG_LIBFDK_AAC_ENCODER 0
-%define CONFIG_LIBGSM_ENCODER 0
-%define CONFIG_LIBGSM_MS_ENCODER 0
-%define CONFIG_LIBILBC_ENCODER 0
-%define CONFIG_LIBMP3LAME_ENCODER 0
-%define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
-%define CONFIG_LIBOPENJPEG_ENCODER 0
-%define CONFIG_LIBOPUS_ENCODER 0
-%define CONFIG_LIBSHINE_ENCODER 0
-%define CONFIG_LIBSPEEX_ENCODER 0
-%define CONFIG_LIBTHEORA_ENCODER 0
-%define CONFIG_LIBTWOLAME_ENCODER 0
-%define CONFIG_LIBVO_AMRWBENC_ENCODER 0
-%define CONFIG_LIBVORBIS_ENCODER 0
-%define CONFIG_LIBVPX_VP8_ENCODER 0
-%define CONFIG_LIBVPX_VP9_ENCODER 0
-%define CONFIG_LIBWAVPACK_ENCODER 0
-%define CONFIG_LIBWEBP_ANIM_ENCODER 0
-%define CONFIG_LIBWEBP_ENCODER 0
-%define CONFIG_LIBX262_ENCODER 0
-%define CONFIG_LIBX264_ENCODER 0
-%define CONFIG_LIBX264RGB_ENCODER 0
-%define CONFIG_LIBX265_ENCODER 0
-%define CONFIG_LIBXAVS_ENCODER 0
-%define CONFIG_LIBXVID_ENCODER 0
-%define CONFIG_H263_V4L2M2M_ENCODER 0
-%define CONFIG_LIBOPENH264_ENCODER 0
-%define CONFIG_H264_NVENC_ENCODER 0
-%define CONFIG_H264_OMX_ENCODER 0
-%define CONFIG_H264_QSV_ENCODER 0
-%define CONFIG_H264_V4L2M2M_ENCODER 0
-%define CONFIG_H264_VAAPI_ENCODER 0
-%define CONFIG_H264_VIDEOTOOLBOX_ENCODER 0
-%define CONFIG_NVENC_ENCODER 0
-%define CONFIG_NVENC_H264_ENCODER 0
-%define CONFIG_NVENC_HEVC_ENCODER 0
-%define CONFIG_HEVC_NVENC_ENCODER 0
-%define CONFIG_HEVC_QSV_ENCODER 0
-%define CONFIG_HEVC_V4L2M2M_ENCODER 0
-%define CONFIG_HEVC_VAAPI_ENCODER 0
-%define CONFIG_LIBKVAZAAR_ENCODER 0
-%define CONFIG_MJPEG_VAAPI_ENCODER 0
-%define CONFIG_MPEG2_QSV_ENCODER 0
-%define CONFIG_MPEG2_VAAPI_ENCODER 0
-%define CONFIG_MPEG4_V4L2M2M_ENCODER 0
-%define CONFIG_VP8_V4L2M2M_ENCODER 0
-%define CONFIG_VP8_VAAPI_ENCODER 0
-%define CONFIG_VP9_VAAPI_ENCODER 0
-%define CONFIG_ABENCH_FILTER 0
-%define CONFIG_ACOMPRESSOR_FILTER 0
-%define CONFIG_ACOPY_FILTER 0
-%define CONFIG_ACROSSFADE_FILTER 0
-%define CONFIG_ACRUSHER_FILTER 0
-%define CONFIG_ADELAY_FILTER 0
%define CONFIG_AECHO_FILTER 0
%define CONFIG_AEMPHASIS_FILTER 0
+%define CONFIG_AEVALSRC_FILTER 0
%define CONFIG_AEVAL_FILTER 0
%define CONFIG_AFADE_FILTER 0
+%define CONFIG_AFC_DEMUXER 0
%define CONFIG_AFFTFILT_FILTER 0
+%define CONFIG_AFIFO_FILTER 0
%define CONFIG_AFIR_FILTER 0
%define CONFIG_AFORMAT_FILTER 0
%define CONFIG_AGATE_FILTER 0
+%define CONFIG_AHISTOGRAM_FILTER 0
+%define CONFIG_AIC_DECODER 0
+%define CONFIG_AIFF_DEMUXER 0
+%define CONFIG_AIFF_MUXER 0
+%define CONFIG_AIIR_FILTER 0
%define CONFIG_AINTERLEAVE_FILTER 0
+%define CONFIG_AIX_DEMUXER 0
+%define CONFIG_ALAC_AT_DECODER 0
+%define CONFIG_ALAC_AT_ENCODER 0
+%define CONFIG_ALAC_DECODER 0
+%define CONFIG_ALAC_ENCODER 0
+%define CONFIG_ALIAS_PIX_DECODER 0
+%define CONFIG_ALIAS_PIX_ENCODER 0
%define CONFIG_ALIMITER_FILTER 0
%define CONFIG_ALLPASS_FILTER 0
+%define CONFIG_ALLRGB_FILTER 0
+%define CONFIG_ALLYUV_FILTER 0
%define CONFIG_ALOOP_FILTER 0
+%define CONFIG_ALPHAEXTRACT_FILTER 0
+%define CONFIG_ALPHAMERGE_FILTER 0
+%define CONFIG_ALSA_INDEV 0
+%define CONFIG_ALSA_OUTDEV 0
+%define CONFIG_ALS_DECODER 0
%define CONFIG_AMERGE_FILTER 0
%define CONFIG_AMETADATA_FILTER 0
%define CONFIG_AMIX_FILTER 0
+%define CONFIG_AMOVIE_FILTER 0
+%define CONFIG_AMRNB_DECODER 0
+%define CONFIG_AMRNB_DEMUXER 0
+%define CONFIG_AMRWB_DECODER 0
+%define CONFIG_AMRWB_DEMUXER 0
+%define CONFIG_AMR_DEMUXER 0
+%define CONFIG_AMR_MUXER 0
+%define CONFIG_AMR_NB_AT_DECODER 0
+%define CONFIG_AMV_DECODER 0
+%define CONFIG_AMV_ENCODER 0
+%define CONFIG_ANDROID_CAMERA_INDEV 0
%define CONFIG_ANEQUALIZER_FILTER 0
+%define CONFIG_ANM_DECODER 0
+%define CONFIG_ANM_DEMUXER 0
+%define CONFIG_ANOISESRC_FILTER 0
+%define CONFIG_ANSI_DECODER 0
+%define CONFIG_ANULLSINK_FILTER 0
+%define CONFIG_ANULLSRC_FILTER 0
%define CONFIG_ANULL_FILTER 0
%define CONFIG_APAD_FILTER 0
+%define CONFIG_APC_DEMUXER 0
%define CONFIG_APERMS_FILTER 0
+%define CONFIG_APE_DECODER 0
+%define CONFIG_APE_DEMUXER 0
+%define CONFIG_APHASEMETER_FILTER 0
%define CONFIG_APHASER_FILTER 0
+%define CONFIG_APNG_DECODER 0
+%define CONFIG_APNG_DEMUXER 0
+%define CONFIG_APNG_ENCODER 0
+%define CONFIG_APNG_MUXER 0
+%define CONFIG_APTX_DECODER 0
+%define CONFIG_APTX_DEMUXER 0
+%define CONFIG_APTX_ENCODER 0
+%define CONFIG_APTX_HD_DECODER 0
+%define CONFIG_APTX_HD_DEMUXER 0
+%define CONFIG_APTX_HD_ENCODER 0
+%define CONFIG_APTX_HD_MUXER 0
+%define CONFIG_APTX_MUXER 0
%define CONFIG_APULSATOR_FILTER 0
+%define CONFIG_AQTITLE_DEMUXER 0
%define CONFIG_AREALTIME_FILTER 0
%define CONFIG_ARESAMPLE_FILTER 0
%define CONFIG_AREVERSE_FILTER 0
@@ -1026,81 +177,131 @@
%define CONFIG_ASETPTS_FILTER 0
%define CONFIG_ASETRATE_FILTER 0
%define CONFIG_ASETTB_FILTER 0
+%define CONFIG_ASF_DEMUXER 0
+%define CONFIG_ASF_MUXER 0
+%define CONFIG_ASF_O_DEMUXER 0
+%define CONFIG_ASF_STREAM_MUXER 0
%define CONFIG_ASHOWINFO_FILTER 0
%define CONFIG_ASIDEDATA_FILTER 0
%define CONFIG_ASPLIT_FILTER 0
+%define CONFIG_ASS_DECODER 0
+%define CONFIG_ASS_DEMUXER 0
+%define CONFIG_ASS_ENCODER 0
+%define CONFIG_ASS_FILTER 0
+%define CONFIG_ASS_MUXER 0
%define CONFIG_ASTATS_FILTER 0
%define CONFIG_ASTREAMSELECT_FILTER 0
+%define CONFIG_AST_DEMUXER 0
+%define CONFIG_AST_MUXER 0
+%define CONFIG_ASV1_DECODER 0
+%define CONFIG_ASV1_ENCODER 0
+%define CONFIG_ASV2_DECODER 0
+%define CONFIG_ASV2_ENCODER 0
+%define CONFIG_ASYNC_PROTOCOL 0
+%define CONFIG_ATADENOISE_FILTER 0
%define CONFIG_ATEMPO_FILTER 0
+%define CONFIG_ATRAC1_DECODER 0
+%define CONFIG_ATRAC3AL_DECODER 0
+%define CONFIG_ATRAC3PAL_DECODER 0
+%define CONFIG_ATRAC3P_DECODER 0
+%define CONFIG_ATRAC3_DECODER 0
%define CONFIG_ATRIM_FILTER 0
+%define CONFIG_AURA2_DECODER 0
+%define CONFIG_AURA_DECODER 0
+%define CONFIG_AU_DEMUXER 0
+%define CONFIG_AU_MUXER 0
+%define CONFIG_AVECTORSCOPE_FILTER 0
+%define CONFIG_AVFOUNDATION_INDEV 0
+%define CONFIG_AVGBLUR_FILTER 0
+%define CONFIG_AVGBLUR_OPENCL_FILTER 0
+%define CONFIG_AVISYNTH_DEMUXER 0
+%define CONFIG_AVI_DEMUXER 0
+%define CONFIG_AVI_MUXER 0
+%define CONFIG_AVM2_MUXER 0
+%define CONFIG_AVRN_DECODER 0
+%define CONFIG_AVRP_DECODER 0
+%define CONFIG_AVRP_ENCODER 0
+%define CONFIG_AVR_DEMUXER 0
+%define CONFIG_AVS_DECODER 0
+%define CONFIG_AVS_DEMUXER 0
+%define CONFIG_AVUI_DECODER 0
+%define CONFIG_AVUI_ENCODER 0
+%define CONFIG_AYUV_DECODER 0
+%define CONFIG_AYUV_ENCODER 0
%define CONFIG_AZMQ_FILTER 0
%define CONFIG_BANDPASS_FILTER 0
%define CONFIG_BANDREJECT_FILTER 0
%define CONFIG_BASS_FILTER 0
-%define CONFIG_BIQUAD_FILTER 0
-%define CONFIG_BS2B_FILTER 0
-%define CONFIG_CHANNELMAP_FILTER 0
-%define CONFIG_CHANNELSPLIT_FILTER 0
-%define CONFIG_CHORUS_FILTER 0
-%define CONFIG_COMPAND_FILTER 0
-%define CONFIG_COMPENSATIONDELAY_FILTER 0
-%define CONFIG_CROSSFEED_FILTER 0
-%define CONFIG_CRYSTALIZER_FILTER 0
-%define CONFIG_DCSHIFT_FILTER 0
-%define CONFIG_DYNAUDNORM_FILTER 0
-%define CONFIG_EARWAX_FILTER 0
-%define CONFIG_EBUR128_FILTER 0
-%define CONFIG_EQUALIZER_FILTER 0
-%define CONFIG_EXTRASTEREO_FILTER 0
-%define CONFIG_FIREQUALIZER_FILTER 0
-%define CONFIG_FLANGER_FILTER 0
-%define CONFIG_HAAS_FILTER 0
-%define CONFIG_HDCD_FILTER 0
-%define CONFIG_HEADPHONE_FILTER 0
-%define CONFIG_HIGHPASS_FILTER 0
-%define CONFIG_JOIN_FILTER 0
-%define CONFIG_LADSPA_FILTER 0
-%define CONFIG_LOUDNORM_FILTER 0
-%define CONFIG_LOWPASS_FILTER 0
-%define CONFIG_PAN_FILTER 0
-%define CONFIG_REPLAYGAIN_FILTER 0
-%define CONFIG_RESAMPLE_FILTER 0
-%define CONFIG_RUBBERBAND_FILTER 0
-%define CONFIG_SIDECHAINCOMPRESS_FILTER 0
-%define CONFIG_SIDECHAINGATE_FILTER 0
-%define CONFIG_SILENCEDETECT_FILTER 0
-%define CONFIG_SILENCEREMOVE_FILTER 0
-%define CONFIG_SOFALIZER_FILTER 0
-%define CONFIG_STEREOTOOLS_FILTER 0
-%define CONFIG_STEREOWIDEN_FILTER 0
-%define CONFIG_SUPEREQUALIZER_FILTER 0
-%define CONFIG_SURROUND_FILTER 0
-%define CONFIG_TREBLE_FILTER 0
-%define CONFIG_TREMOLO_FILTER 0
-%define CONFIG_VIBRATO_FILTER 0
-%define CONFIG_VOLUME_FILTER 0
-%define CONFIG_VOLUMEDETECT_FILTER 0
-%define CONFIG_AEVALSRC_FILTER 0
-%define CONFIG_ANOISESRC_FILTER 0
-%define CONFIG_ANULLSRC_FILTER 0
-%define CONFIG_FLITE_FILTER 0
-%define CONFIG_SINE_FILTER 0
-%define CONFIG_ANULLSINK_FILTER 0
-%define CONFIG_ALPHAEXTRACT_FILTER 0
-%define CONFIG_ALPHAMERGE_FILTER 0
-%define CONFIG_ASS_FILTER 0
-%define CONFIG_ATADENOISE_FILTER 0
-%define CONFIG_AVGBLUR_FILTER 0
%define CONFIG_BBOX_FILTER 0
%define CONFIG_BENCH_FILTER 0
+%define CONFIG_BETHSOFTVID_DECODER 0
+%define CONFIG_BETHSOFTVID_DEMUXER 0
+%define CONFIG_BFI_DECODER 0
+%define CONFIG_BFI_DEMUXER 0
+%define CONFIG_BFSTM_DEMUXER 0
+%define CONFIG_BINKAUDIO_DCT_DECODER 0
+%define CONFIG_BINKAUDIO_RDFT_DECODER 0
+%define CONFIG_BINK_DECODER 0
+%define CONFIG_BINK_DEMUXER 0
+%define CONFIG_BINTEXT_DECODER 0
+%define CONFIG_BINTEXT_DEMUXER 0
+%define CONFIG_BIQUAD_FILTER 0
+%define CONFIG_BITPACKED_DECODER 0
%define CONFIG_BITPLANENOISE_FILTER 0
+%define CONFIG_BIT_DEMUXER 0
+%define CONFIG_BIT_MUXER 0
+%define CONFIG_BKTR_INDEV 0
%define CONFIG_BLACKDETECT_FILTER 0
%define CONFIG_BLACKFRAME_FILTER 0
%define CONFIG_BLEND_FILTER 0
+%define CONFIG_BLURAY_PROTOCOL 0
+%define CONFIG_BMP_DECODER 0
+%define CONFIG_BMP_ENCODER 0
+%define CONFIG_BMP_PARSER 0
+%define CONFIG_BMV_AUDIO_DECODER 0
+%define CONFIG_BMV_DEMUXER 0
+%define CONFIG_BMV_VIDEO_DECODER 0
+%define CONFIG_BOA_DEMUXER 0
%define CONFIG_BOXBLUR_FILTER 0
+%define CONFIG_BRENDER_PIX_DECODER 0
+%define CONFIG_BRSTM_DEMUXER 0
+%define CONFIG_BS2B_FILTER 0
%define CONFIG_BWDIF_FILTER 0
+%define CONFIG_C93_DECODER 0
+%define CONFIG_C93_DEMUXER 0
+%define CONFIG_CACA_OUTDEV 0
+%define CONFIG_CACHE_PROTOCOL 0
+%define CONFIG_CAF_DEMUXER 0
+%define CONFIG_CAF_MUXER 0
+%define CONFIG_CAVSVIDEO_DEMUXER 0
+%define CONFIG_CAVSVIDEO_MUXER 0
+%define CONFIG_CAVSVIDEO_PARSER 0
+%define CONFIG_CAVS_DECODER 0
+%define CONFIG_CCAPTION_DECODER 0
+%define CONFIG_CDGRAPHICS_DECODER 0
+%define CONFIG_CDG_DEMUXER 0
+%define CONFIG_CDXL_DECODER 0
+%define CONFIG_CDXL_DEMUXER 0
+%define CONFIG_CELLAUTO_FILTER 0
+%define CONFIG_CFHD_DECODER 0
+%define CONFIG_CHANNELMAP_FILTER 0
+%define CONFIG_CHANNELSPLIT_FILTER 0
+%define CONFIG_CHOMP_BSF 0
+%define CONFIG_CHORUS_FILTER 0
%define CONFIG_CHROMAKEY_FILTER 0
+%define CONFIG_CHROMAPRINT_MUXER 0
%define CONFIG_CIESCOPE_FILTER 0
+%define CONFIG_CINEPAK_DECODER 0
+%define CONFIG_CINEPAK_ENCODER 0
+%define CONFIG_CINE_DEMUXER 0
+%define CONFIG_CLEARVIDEO_DECODER 0
+%define CONFIG_CLJR_DECODER 0
+%define CONFIG_CLJR_ENCODER 0
+%define CONFIG_CLLC_DECODER 0
+%define CONFIG_CODEC2RAW_DEMUXER 0
+%define CONFIG_CODEC2RAW_MUXER 0
+%define CONFIG_CODEC2_DEMUXER 0
+%define CONFIG_CODEC2_MUXER 0
%define CONFIG_CODECVIEW_FILTER 0
%define CONFIG_COLORBALANCE_FILTER 0
%define CONFIG_COLORCHANNELMIXER_FILTER 0
@@ -1108,567 +309,1426 @@
%define CONFIG_COLORLEVELS_FILTER 0
%define CONFIG_COLORMATRIX_FILTER 0
%define CONFIG_COLORSPACE_FILTER 0
+%define CONFIG_COLOR_FILTER 0
+%define CONFIG_COMFORTNOISE_DECODER 0
+%define CONFIG_COMFORTNOISE_ENCODER 0
+%define CONFIG_COMPAND_FILTER 0
+%define CONFIG_COMPENSATIONDELAY_FILTER 0
+%define CONFIG_CONCAT_DEMUXER 0
+%define CONFIG_CONCAT_FILTER 0
+%define CONFIG_CONCAT_PROTOCOL 0
%define CONFIG_CONVOLUTION_FILTER 0
+%define CONFIG_CONVOLUTION_OPENCL_FILTER 0
%define CONFIG_CONVOLVE_FILTER 0
+%define CONFIG_COOK_DECODER 0
+%define CONFIG_COOK_PARSER 0
%define CONFIG_COPY_FILTER 0
+%define CONFIG_COREIMAGESRC_FILTER 0
%define CONFIG_COREIMAGE_FILTER 0
%define CONFIG_COVER_RECT_FILTER 0
-%define CONFIG_CROP_FILTER 0
+%define CONFIG_CPIA_DECODER 0
+%define CONFIG_CRC_MUXER 0
%define CONFIG_CROPDETECT_FILTER 0
+%define CONFIG_CROP_FILTER 0
+%define CONFIG_CROSSFEED_FILTER 0
+%define CONFIG_CRYPTO_PROTOCOL 0
+%define CONFIG_CRYSTALIZER_FILTER 0
+%define CONFIG_CSCD_DECODER 0
%define CONFIG_CURVES_FILTER 0
+%define CONFIG_CYUV_DECODER 0
+%define CONFIG_DASH_DEMUXER 0
+%define CONFIG_DASH_MUXER 0
%define CONFIG_DATASCOPE_FILTER 0
+%define CONFIG_DATA_DEMUXER 0
+%define CONFIG_DATA_MUXER 0
+%define CONFIG_DATA_PROTOCOL 0
+%define CONFIG_DAUD_DEMUXER 0
+%define CONFIG_DAUD_MUXER 0
+%define CONFIG_DCA_CORE_BSF 0
+%define CONFIG_DCA_DECODER 0
+%define CONFIG_DCA_ENCODER 0
+%define CONFIG_DCA_PARSER 0
+%define CONFIG_DCSHIFT_FILTER 0
+%define CONFIG_DCSTR_DEMUXER 0
%define CONFIG_DCTDNOIZ_FILTER 0
+%define CONFIG_DDS_DECODER 0
%define CONFIG_DEBAND_FILTER 0
%define CONFIG_DECIMATE_FILTER 0
+%define CONFIG_DECKLINK_INDEV 0
+%define CONFIG_DECKLINK_OUTDEV 0
+%define CONFIG_DECONVOLVE_FILTER 0
%define CONFIG_DEFLATE_FILTER 0
%define CONFIG_DEFLICKER_FILTER 0
%define CONFIG_DEINTERLACE_QSV_FILTER 0
%define CONFIG_DEINTERLACE_VAAPI_FILTER 0
%define CONFIG_DEJUDDER_FILTER 0
%define CONFIG_DELOGO_FILTER 0
+%define CONFIG_DEMUXERS 0
+%define CONFIG_DENOISE_VAAPI_FILTER 0
%define CONFIG_DESHAKE_FILTER 0
%define CONFIG_DESPILL_FILTER 0
%define CONFIG_DETELECINE_FILTER 0
+%define CONFIG_DFA_DECODER 0
+%define CONFIG_DFA_DEMUXER 0
%define CONFIG_DILATION_FILTER 0
+%define CONFIG_DIRAC_DECODER 0
+%define CONFIG_DIRAC_DEMUXER 0
+%define CONFIG_DIRAC_MUXER 0
+%define CONFIG_DIRAC_PARSER 0
%define CONFIG_DISPLACE_FILTER 0
+%define CONFIG_DNXHD_DECODER 0
+%define CONFIG_DNXHD_DEMUXER 0
+%define CONFIG_DNXHD_ENCODER 0
+%define CONFIG_DNXHD_MUXER 0
+%define CONFIG_DNXHD_PARSER 0
+%define CONFIG_DOLBY_E_DECODER 0
%define CONFIG_DOUBLEWEAVE_FILTER 0
+%define CONFIG_DPX_DECODER 0
+%define CONFIG_DPX_ENCODER 0
+%define CONFIG_DPX_PARSER 0
%define CONFIG_DRAWBOX_FILTER 0
%define CONFIG_DRAWGRAPH_FILTER 0
%define CONFIG_DRAWGRID_FILTER 0
%define CONFIG_DRAWTEXT_FILTER 0
+%define CONFIG_DRMETER_FILTER 0
+%define CONFIG_DSD_LSBF_DECODER 0
+%define CONFIG_DSD_LSBF_PLANAR_DECODER 0
+%define CONFIG_DSD_MSBF_DECODER 0
+%define CONFIG_DSD_MSBF_PLANAR_DECODER 0
+%define CONFIG_DSF_DEMUXER 0
+%define CONFIG_DSHOW_INDEV 0
+%define CONFIG_DSICINAUDIO_DECODER 0
+%define CONFIG_DSICINVIDEO_DECODER 0
+%define CONFIG_DSICIN_DEMUXER 0
+%define CONFIG_DSS_DEMUXER 0
+%define CONFIG_DSS_SP_DECODER 0
+%define CONFIG_DST_DECODER 0
+%define CONFIG_DTSHD_DEMUXER 0
+%define CONFIG_DTS_DEMUXER 0
+%define CONFIG_DTS_MUXER 0
+%define CONFIG_DUMP_EXTRADATA_BSF 0
+%define CONFIG_DVAUDIO_DECODER 0
+%define CONFIG_DVAUDIO_PARSER 0
+%define CONFIG_DVBSUB_DECODER 0
+%define CONFIG_DVBSUB_DEMUXER 0
+%define CONFIG_DVBSUB_ENCODER 0
+%define CONFIG_DVBSUB_PARSER 0
+%define CONFIG_DVBTXT_DEMUXER 0
+%define CONFIG_DVDSUB_DECODER 0
+%define CONFIG_DVDSUB_ENCODER 0
+%define CONFIG_DVDSUB_PARSER 0
+%define CONFIG_DVD_NAV_PARSER 0
+%define CONFIG_DVVIDEO_DECODER 0
+%define CONFIG_DVVIDEO_ENCODER 0
+%define CONFIG_DV_DEMUXER 0
+%define CONFIG_DV_MUXER 0
+%define CONFIG_DXA_DECODER 0
+%define CONFIG_DXA_DEMUXER 0
+%define CONFIG_DXTORY_DECODER 0
+%define CONFIG_DXV_DECODER 0
+%define CONFIG_DYNAUDNORM_FILTER 0
+%define CONFIG_EAC3_AT_DECODER 0
+%define CONFIG_EAC3_CORE_BSF 0
+%define CONFIG_EAC3_DECODER 0
+%define CONFIG_EAC3_DEMUXER 0
+%define CONFIG_EAC3_ENCODER 0
+%define CONFIG_EAC3_MUXER 0
+%define CONFIG_EACMV_DECODER 0
+%define CONFIG_EAMAD_DECODER 0
+%define CONFIG_EARWAX_FILTER 0
+%define CONFIG_EATGQ_DECODER 0
+%define CONFIG_EATGV_DECODER 0
+%define CONFIG_EATQI_DECODER 0
+%define CONFIG_EA_CDATA_DEMUXER 0
+%define CONFIG_EA_DEMUXER 0
+%define CONFIG_EBUR128_FILTER 0
%define CONFIG_EDGEDETECT_FILTER 0
+%define CONFIG_EIGHTBPS_DECODER 0
+%define CONFIG_EIGHTSVX_EXP_DECODER 0
+%define CONFIG_EIGHTSVX_FIB_DECODER 0
%define CONFIG_ELBG_FILTER 0
+%define CONFIG_ENCODERS 0
+%define CONFIG_ENTROPY_FILTER 0
+%define CONFIG_EPAF_DEMUXER 0
+%define CONFIG_EQUALIZER_FILTER 0
%define CONFIG_EQ_FILTER 0
%define CONFIG_EROSION_FILTER 0
+%define CONFIG_ESCAPE124_DECODER 0
+%define CONFIG_ESCAPE130_DECODER 0
+%define CONFIG_EVRC_DECODER 0
+%define CONFIG_EXR_DECODER 0
%define CONFIG_EXTRACTPLANES_FILTER 0
+%define CONFIG_EXTRACT_EXTRADATA_BSF 0
+%define CONFIG_EXTRASTEREO_FILTER 0
+%define CONFIG_F4V_MUXER 0
%define CONFIG_FADE_FILTER 0
+%define CONFIG_FBDEV_INDEV 0
+%define CONFIG_FBDEV_OUTDEV 0
+%define CONFIG_FFMETADATA_DEMUXER 0
+%define CONFIG_FFMETADATA_MUXER 0
+%define CONFIG_FFRTMPCRYPT_PROTOCOL 0
+%define CONFIG_FFRTMPHTTP_PROTOCOL 0
%define CONFIG_FFTFILT_FILTER 0
-%define CONFIG_FIELD_FILTER 0
+%define CONFIG_FFV1_DECODER 0
+%define CONFIG_FFV1_ENCODER 0
+%define CONFIG_FFVHUFF_DECODER 0
+%define CONFIG_FFVHUFF_ENCODER 0
+%define CONFIG_FFWAVESYNTH_DECODER 0
+%define CONFIG_FIC_DECODER 0
%define CONFIG_FIELDHINT_FILTER 0
%define CONFIG_FIELDMATCH_FILTER 0
%define CONFIG_FIELDORDER_FILTER 0
+%define CONFIG_FIELD_FILTER 0
+%define CONFIG_FIFO_FILTER 0
+%define CONFIG_FIFO_MUXER 0
+%define CONFIG_FIFO_TEST_MUXER 0
+%define CONFIG_FILE_PROTOCOL 0
+%define CONFIG_FILLBORDERS_FILTER 0
+%define CONFIG_FILMSTRIP_DEMUXER 0
+%define CONFIG_FILMSTRIP_MUXER 0
+%define CONFIG_FILTERS 0
+%define CONFIG_FILTER_UNITS_BSF 0
%define CONFIG_FIND_RECT_FILTER 0
+%define CONFIG_FIREQUALIZER_FILTER 0
+%define CONFIG_FITS_DECODER 0
+%define CONFIG_FITS_DEMUXER 0
+%define CONFIG_FITS_ENCODER 0
+%define CONFIG_FITS_MUXER 0
+%define CONFIG_FLAC_DEMUXER 0
+%define CONFIG_FLAC_ENCODER 0
+%define CONFIG_FLAC_MUXER 0
+%define CONFIG_FLAC_PARSER 0
+%define CONFIG_FLANGER_FILTER 0
+%define CONFIG_FLASHSV2_DECODER 0
+%define CONFIG_FLASHSV2_ENCODER 0
+%define CONFIG_FLASHSV_DECODER 0
+%define CONFIG_FLASHSV_ENCODER 0
+%define CONFIG_FLIC_DECODER 0
+%define CONFIG_FLIC_DEMUXER 0
+%define CONFIG_FLITE_FILTER 0
%define CONFIG_FLOODFILL_FILTER 0
+%define CONFIG_FLV_DECODER 0
+%define CONFIG_FLV_DEMUXER 0
+%define CONFIG_FLV_ENCODER 0
+%define CONFIG_FLV_MUXER 0
+%define CONFIG_FMVC_DECODER 0
%define CONFIG_FORMAT_FILTER 0
+%define CONFIG_FOURXM_DECODER 0
+%define CONFIG_FOURXM_DEMUXER 0
%define CONFIG_FPS_FILTER 0
+%define CONFIG_FRAMECRC_MUXER 0
+%define CONFIG_FRAMEHASH_MUXER 0
+%define CONFIG_FRAMEMD5_MUXER 0
%define CONFIG_FRAMEPACK_FILTER 0
%define CONFIG_FRAMERATE_FILTER 0
%define CONFIG_FRAMESTEP_FILTER 0
+%define CONFIG_FRAME_THREAD_ENCODER 0
+%define CONFIG_FRAPS_DECODER 0
%define CONFIG_FREI0R_FILTER 0
+%define CONFIG_FREI0R_SRC_FILTER 0
+%define CONFIG_FRM_DEMUXER 0
+%define CONFIG_FRWU_DECODER 0
+%define CONFIG_FSB_DEMUXER 0
%define CONFIG_FSPP_FILTER 0
+%define CONFIG_FTP_PROTOCOL 0
+%define CONFIG_G2M_DECODER 0
+%define CONFIG_G722_DEMUXER 0
+%define CONFIG_G722_MUXER 0
+%define CONFIG_G723_1_DECODER 0
+%define CONFIG_G723_1_DEMUXER 0
+%define CONFIG_G723_1_ENCODER 0
+%define CONFIG_G723_1_MUXER 0
+%define CONFIG_G726LE_DEMUXER 0
+%define CONFIG_G726LE_MUXER 0
+%define CONFIG_G726_DEMUXER 0
+%define CONFIG_G726_MUXER 0
+%define CONFIG_G729_DECODER 0
+%define CONFIG_G729_DEMUXER 0
+%define CONFIG_G729_PARSER 0
%define CONFIG_GBLUR_FILTER 0
+%define CONFIG_GDIGRAB_INDEV 0
+%define CONFIG_GDV_DECODER 0
+%define CONFIG_GDV_DEMUXER 0
+%define CONFIG_GENH_DEMUXER 0
%define CONFIG_GEQ_FILTER 0
+%define CONFIG_GIF_DECODER 0
+%define CONFIG_GIF_DEMUXER 0
+%define CONFIG_GIF_ENCODER 0
+%define CONFIG_GIF_MUXER 0
+%define CONFIG_GOPHER_PROTOCOL 0
%define CONFIG_GRADFUN_FILTER 0
+%define CONFIG_GREMLIN_DPCM_DECODER 0
+%define CONFIG_GSM_DECODER 0
+%define CONFIG_GSM_DEMUXER 0
+%define CONFIG_GSM_MS_AT_DECODER 0
+%define CONFIG_GSM_MS_DECODER 0
+%define CONFIG_GSM_MUXER 0
+%define CONFIG_GSM_PARSER 0
+%define CONFIG_GXF_DEMUXER 0
+%define CONFIG_GXF_MUXER 0
+%define CONFIG_H261_DECODER 0
+%define CONFIG_H261_DEMUXER 0
+%define CONFIG_H261_ENCODER 0
+%define CONFIG_H261_MUXER 0
+%define CONFIG_H261_PARSER 0
+%define CONFIG_H263I_DECODER 0
+%define CONFIG_H263P_DECODER 0
+%define CONFIG_H263P_ENCODER 0
+%define CONFIG_H263_DECODER 0
+%define CONFIG_H263_DEMUXER 0
+%define CONFIG_H263_ENCODER 0
+%define CONFIG_H263_MUXER 0
+%define CONFIG_H263_PARSER 0
+%define CONFIG_H263_V4L2M2M_DECODER 0
+%define CONFIG_H263_V4L2M2M_ENCODER 0
+%define CONFIG_H263_VAAPI_HWACCEL 0
+%define CONFIG_H263_VIDEOTOOLBOX_HWACCEL 0
+%define CONFIG_H264_AMF_ENCODER 0
+%define CONFIG_H264_CRYSTALHD_DECODER 0
+%define CONFIG_H264_CUVID_DECODER 0
+%define CONFIG_H264_D3D11VA2_HWACCEL 0
+%define CONFIG_H264_D3D11VA_HWACCEL 0
+%define CONFIG_H264_DECODER 0
+%define CONFIG_H264_DEMUXER 0
+%define CONFIG_H264_DXVA2_HWACCEL 0
+%define CONFIG_H264_MEDIACODEC_DECODER 0
+%define CONFIG_H264_METADATA_BSF 0
+%define CONFIG_H264_MMAL_DECODER 0
+%define CONFIG_H264_MP4TOANNEXB_BSF 0
+%define CONFIG_H264_MUXER 0
+%define CONFIG_H264_NVDEC_HWACCEL 0
+%define CONFIG_H264_NVENC_ENCODER 0
+%define CONFIG_H264_OMX_ENCODER 0
+%define CONFIG_H264_PARSER 0
+%define CONFIG_H264_QSV_DECODER 0
+%define CONFIG_H264_QSV_ENCODER 0
+%define CONFIG_H264_REDUNDANT_PPS_BSF 0
+%define CONFIG_H264_RKMPP_DECODER 0
+%define CONFIG_H264_V4L2M2M_DECODER 0
+%define CONFIG_H264_V4L2M2M_ENCODER 0
+%define CONFIG_H264_VAAPI_ENCODER 0
+%define CONFIG_H264_VAAPI_HWACCEL 0
+%define CONFIG_H264_VDPAU_HWACCEL 0
+%define CONFIG_H264_VIDEOTOOLBOX_ENCODER 0
+%define CONFIG_H264_VIDEOTOOLBOX_HWACCEL 0
+%define CONFIG_HAAS_FILTER 0
+%define CONFIG_HALDCLUTSRC_FILTER 0
%define CONFIG_HALDCLUT_FILTER 0
+%define CONFIG_HAPQA_EXTRACT_BSF 0
+%define CONFIG_HAP_DECODER 0
+%define CONFIG_HAP_ENCODER 0
+%define CONFIG_HASH_MUXER 0
+%define CONFIG_HDCD_FILTER 0
+%define CONFIG_HDS_MUXER 0
+%define CONFIG_HEADPHONE_FILTER 0
+%define CONFIG_HEVC_AMF_ENCODER 0
+%define CONFIG_HEVC_CUVID_DECODER 0
+%define CONFIG_HEVC_D3D11VA2_HWACCEL 0
+%define CONFIG_HEVC_D3D11VA_HWACCEL 0
+%define CONFIG_HEVC_DECODER 0
+%define CONFIG_HEVC_DEMUXER 0
+%define CONFIG_HEVC_DXVA2_HWACCEL 0
+%define CONFIG_HEVC_MEDIACODEC_DECODER 0
+%define CONFIG_HEVC_METADATA_BSF 0
+%define CONFIG_HEVC_MP4TOANNEXB_BSF 0
+%define CONFIG_HEVC_MUXER 0
+%define CONFIG_HEVC_NVDEC_HWACCEL 0
+%define CONFIG_HEVC_NVENC_ENCODER 0
+%define CONFIG_HEVC_PARSER 0
+%define CONFIG_HEVC_QSV_DECODER 0
+%define CONFIG_HEVC_QSV_ENCODER 0
+%define CONFIG_HEVC_RKMPP_DECODER 0
+%define CONFIG_HEVC_V4L2M2M_DECODER 0
+%define CONFIG_HEVC_V4L2M2M_ENCODER 0
+%define CONFIG_HEVC_VAAPI_ENCODER 0
+%define CONFIG_HEVC_VAAPI_HWACCEL 0
+%define CONFIG_HEVC_VDPAU_HWACCEL 0
+%define CONFIG_HEVC_VIDEOTOOLBOX_ENCODER 0
+%define CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL 0
%define CONFIG_HFLIP_FILTER 0
+%define CONFIG_HIGHPASS_FILTER 0
+%define CONFIG_HILBERT_FILTER 0
%define CONFIG_HISTEQ_FILTER 0
%define CONFIG_HISTOGRAM_FILTER 0
+%define CONFIG_HLS_DEMUXER 0
+%define CONFIG_HLS_MUXER 0
+%define CONFIG_HLS_PROTOCOL 0
+%define CONFIG_HNM4_VIDEO_DECODER 0
+%define CONFIG_HNM_DEMUXER 0
%define CONFIG_HQDN3D_FILTER 0
+%define CONFIG_HQX_DECODER 0
%define CONFIG_HQX_FILTER 0
+%define CONFIG_HQ_HQA_DECODER 0
%define CONFIG_HSTACK_FILTER 0
+%define CONFIG_HTTPPROXY_PROTOCOL 0
+%define CONFIG_HTTPS_PROTOCOL 0
+%define CONFIG_HTTP_PROTOCOL 0
%define CONFIG_HUE_FILTER 0
+%define CONFIG_HUFFYUV_DECODER 0
+%define CONFIG_HUFFYUV_ENCODER 0
+%define CONFIG_HWACCELS 0
%define CONFIG_HWDOWNLOAD_FILTER 0
%define CONFIG_HWMAP_FILTER 0
-%define CONFIG_HWUPLOAD_FILTER 0
%define CONFIG_HWUPLOAD_CUDA_FILTER 0
+%define CONFIG_HWUPLOAD_FILTER 0
%define CONFIG_HYSTERESIS_FILTER 0
+%define CONFIG_IAC_DECODER 0
+%define CONFIG_ICECAST_PROTOCOL 0
+%define CONFIG_ICO_DEMUXER 0
+%define CONFIG_ICO_MUXER 0
+%define CONFIG_IDCIN_DECODER 0
+%define CONFIG_IDCIN_DEMUXER 0
%define CONFIG_IDET_FILTER 0
+%define CONFIG_IDF_DECODER 0
+%define CONFIG_IDF_DEMUXER 0
+%define CONFIG_IEC61883_INDEV 0
+%define CONFIG_IFF_DEMUXER 0
+%define CONFIG_IFF_ILBM_DECODER 0
+%define CONFIG_ILBC_AT_DECODER 0
+%define CONFIG_ILBC_AT_ENCODER 0
+%define CONFIG_ILBC_DEMUXER 0
+%define CONFIG_ILBC_MUXER 0
%define CONFIG_IL_FILTER 0
+%define CONFIG_IMAGE2PIPE_DEMUXER 0
+%define CONFIG_IMAGE2PIPE_MUXER 0
+%define CONFIG_IMAGE2_ALIAS_PIX_DEMUXER 0
+%define CONFIG_IMAGE2_BRENDER_PIX_DEMUXER 0
+%define CONFIG_IMAGE2_DEMUXER 0
+%define CONFIG_IMAGE2_MUXER 0
+%define CONFIG_IMAGE_BMP_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_DDS_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_DPX_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_EXR_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_J2K_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_JPEGLS_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_JPEG_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PAM_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PBM_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PCX_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PGMYUV_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PGM_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PICTOR_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PNG_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PPM_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_PSD_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_QDRAW_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_SGI_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_SUNRAST_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_SVG_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_TIFF_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_WEBP_PIPE_DEMUXER 0
+%define CONFIG_IMAGE_XPM_PIPE_DEMUXER 0
+%define CONFIG_IMC_DECODER 0
+%define CONFIG_IMX_DUMP_HEADER_BSF 0
+%define CONFIG_INDEO2_DECODER 0
+%define CONFIG_INDEO3_DECODER 0
+%define CONFIG_INDEO4_DECODER 0
+%define CONFIG_INDEO5_DECODER 0
+%define CONFIG_INDEVS 0
%define CONFIG_INFLATE_FILTER 0
+%define CONFIG_INGENIENT_DEMUXER 0
%define CONFIG_INTERLACE_FILTER 0
%define CONFIG_INTERLEAVE_FILTER 0
+%define CONFIG_INTERPLAY_ACM_DECODER 0
+%define CONFIG_INTERPLAY_DPCM_DECODER 0
+%define CONFIG_INTERPLAY_VIDEO_DECODER 0
+%define CONFIG_IPMOVIE_DEMUXER 0
+%define CONFIG_IPOD_MUXER 0
+%define CONFIG_IRCAM_DEMUXER 0
+%define CONFIG_IRCAM_MUXER 0
+%define CONFIG_ISMV_MUXER 0
+%define CONFIG_ISS_DEMUXER 0
+%define CONFIG_IV8_DEMUXER 0
+%define CONFIG_IVF_DEMUXER 0
+%define CONFIG_IVF_MUXER 0
+%define CONFIG_IVR_DEMUXER 0
+%define CONFIG_JACK_INDEV 0
+%define CONFIG_JACOSUB_DECODER 0
+%define CONFIG_JACOSUB_DEMUXER 0
+%define CONFIG_JACOSUB_MUXER 0
+%define CONFIG_JOIN_FILTER 0
+%define CONFIG_JPEG2000_DECODER 0
+%define CONFIG_JPEG2000_ENCODER 0
+%define CONFIG_JPEGLS_DECODER 0
+%define CONFIG_JPEGLS_ENCODER 0
+%define CONFIG_JV_DECODER 0
+%define CONFIG_JV_DEMUXER 0
%define CONFIG_KERNDEINT_FILTER 0
+%define CONFIG_KGV1_DECODER 0
+%define CONFIG_KMSGRAB_INDEV 0
+%define CONFIG_KMVC_DECODER 0
+%define CONFIG_LADSPA_FILTER 0
+%define CONFIG_LAGARITH_DECODER 0
+%define CONFIG_LATM_MUXER 0
+%define CONFIG_LAVFI_INDEV 0
%define CONFIG_LENSCORRECTION_FILTER 0
+%define CONFIG_LIBAOM_AV1_DECODER 0
+%define CONFIG_LIBAOM_AV1_ENCODER 0
+%define CONFIG_LIBCDIO_INDEV 0
+%define CONFIG_LIBCELT_DECODER 0
+%define CONFIG_LIBCODEC2_DECODER 0
+%define CONFIG_LIBCODEC2_ENCODER 0
+%define CONFIG_LIBDC1394_INDEV 0
+%define CONFIG_LIBFDK_AAC_DECODER 0
+%define CONFIG_LIBFDK_AAC_ENCODER 0
+%define CONFIG_LIBGME_DEMUXER 0
+%define CONFIG_LIBGSM_DECODER 0
+%define CONFIG_LIBGSM_ENCODER 0
+%define CONFIG_LIBGSM_MS_DECODER 0
+%define CONFIG_LIBGSM_MS_ENCODER 0
+%define CONFIG_LIBILBC_DECODER 0
+%define CONFIG_LIBILBC_ENCODER 0
+%define CONFIG_LIBKVAZAAR_ENCODER 0
+%define CONFIG_LIBMODPLUG_DEMUXER 0
+%define CONFIG_LIBMP3LAME_ENCODER 0
+%define CONFIG_LIBNDI_NEWTEK_INDEV 0
+%define CONFIG_LIBNDI_NEWTEK_OUTDEV 0
+%define CONFIG_LIBOPENCORE_AMRNB_DECODER 0
+%define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
+%define CONFIG_LIBOPENCORE_AMRWB_DECODER 0
+%define CONFIG_LIBOPENH264_DECODER 0
+%define CONFIG_LIBOPENH264_ENCODER 0
+%define CONFIG_LIBOPENJPEG_DECODER 0
+%define CONFIG_LIBOPENJPEG_ENCODER 0
+%define CONFIG_LIBOPENMPT_DEMUXER 0
+%define CONFIG_LIBOPUS_DECODER 0
+%define CONFIG_LIBOPUS_ENCODER 0
+%define CONFIG_LIBRSVG_DECODER 0
+%define CONFIG_LIBRTMPE_PROTOCOL 0
+%define CONFIG_LIBRTMPS_PROTOCOL 0
+%define CONFIG_LIBRTMPTE_PROTOCOL 0
+%define CONFIG_LIBRTMPT_PROTOCOL 0
+%define CONFIG_LIBRTMP_PROTOCOL 0
+%define CONFIG_LIBSHINE_ENCODER 0
+%define CONFIG_LIBSMBCLIENT_PROTOCOL 0
+%define CONFIG_LIBSPEEX_DECODER 0
+%define CONFIG_LIBSPEEX_ENCODER 0
+%define CONFIG_LIBSRT_PROTOCOL 0
+%define CONFIG_LIBSSH_PROTOCOL 0
+%define CONFIG_LIBTHEORA_ENCODER 0
+%define CONFIG_LIBTWOLAME_ENCODER 0
%define CONFIG_LIBVMAF_FILTER 0
+%define CONFIG_LIBVORBIS_DECODER 0
+%define CONFIG_LIBVORBIS_ENCODER 0
+%define CONFIG_LIBVO_AMRWBENC_ENCODER 0
+%define CONFIG_LIBVPX_VP8_DECODER 0
+%define CONFIG_LIBVPX_VP8_ENCODER 0
+%define CONFIG_LIBVPX_VP9_DECODER 0
+%define CONFIG_LIBVPX_VP9_ENCODER 0
+%define CONFIG_LIBWAVPACK_ENCODER 0
+%define CONFIG_LIBWEBP_ANIM_ENCODER 0
+%define CONFIG_LIBWEBP_ENCODER 0
+%define CONFIG_LIBX262_ENCODER 0
+%define CONFIG_LIBX264RGB_ENCODER 0
+%define CONFIG_LIBX264_ENCODER 0
+%define CONFIG_LIBX265_ENCODER 0
+%define CONFIG_LIBXAVS_ENCODER 0
+%define CONFIG_LIBXVID_ENCODER 0
+%define CONFIG_LIBZVBI_TELETEXT_DECODER 0
+%define CONFIG_LIFE_FILTER 0
%define CONFIG_LIMITER_FILTER 0
+%define CONFIG_LIVE_FLV_DEMUXER 0
+%define CONFIG_LJPEG_ENCODER 0
+%define CONFIG_LMLM4_DEMUXER 0
+%define CONFIG_LOAS_DEMUXER 0
+%define CONFIG_LOCO_DECODER 0
%define CONFIG_LOOP_FILTER 0
+%define CONFIG_LOUDNORM_FILTER 0
+%define CONFIG_LOWPASS_FILTER 0
+%define CONFIG_LRC_DEMUXER 0
+%define CONFIG_LRC_MUXER 0
%define CONFIG_LUMAKEY_FILTER 0
-%define CONFIG_LUT_FILTER 0
%define CONFIG_LUT2_FILTER 0
%define CONFIG_LUT3D_FILTER 0
%define CONFIG_LUTRGB_FILTER 0
%define CONFIG_LUTYUV_FILTER 0
+%define CONFIG_LUT_FILTER 0
+%define CONFIG_LV2_FILTER 0
+%define CONFIG_LVF_DEMUXER 0
+%define CONFIG_LXF_DEMUXER 0
+%define CONFIG_M101_DECODER 0
+%define CONFIG_M4V_DEMUXER 0
+%define CONFIG_M4V_MUXER 0
+%define CONFIG_MACE3_DECODER 0
+%define CONFIG_MACE6_DECODER 0
+%define CONFIG_MAGICYUV_DECODER 0
+%define CONFIG_MAGICYUV_ENCODER 0
+%define CONFIG_MANDELBROT_FILTER 0
%define CONFIG_MASKEDCLAMP_FILTER 0
%define CONFIG_MASKEDMERGE_FILTER 0
+%define CONFIG_MATROSKA_AUDIO_MUXER 0
+%define CONFIG_MATROSKA_DEMUXER 0
+%define CONFIG_MATROSKA_MUXER 0
%define CONFIG_MCDEINT_FILTER 0
+%define CONFIG_MCOMPAND_FILTER 0
+%define CONFIG_MD5_MUXER 0
+%define CONFIG_MD5_PROTOCOL 0
+%define CONFIG_MDEC_DECODER 0
%define CONFIG_MERGEPLANES_FILTER 0
%define CONFIG_MESTIMATE_FILTER 0
%define CONFIG_METADATA_FILTER 0
+%define CONFIG_METASOUND_DECODER 0
+%define CONFIG_MGSTS_DEMUXER 0
+%define CONFIG_MICRODVD_DECODER 0
+%define CONFIG_MICRODVD_DEMUXER 0
+%define CONFIG_MICRODVD_MUXER 0
%define CONFIG_MIDEQUALIZER_FILTER 0
+%define CONFIG_MIMIC_DECODER 0
%define CONFIG_MINTERPOLATE_FILTER 0
+%define CONFIG_MIX_FILTER 0
+%define CONFIG_MJPEG2JPEG_BSF 0
+%define CONFIG_MJPEGA_DUMP_HEADER_BSF 0
+%define CONFIG_MJPEGB_DECODER 0
+%define CONFIG_MJPEG_2000_DEMUXER 0
+%define CONFIG_MJPEG_CUVID_DECODER 0
+%define CONFIG_MJPEG_DECODER 0
+%define CONFIG_MJPEG_DEMUXER 0
+%define CONFIG_MJPEG_ENCODER 0
+%define CONFIG_MJPEG_MUXER 0
+%define CONFIG_MJPEG_NVDEC_HWACCEL 0
+%define CONFIG_MJPEG_PARSER 0
+%define CONFIG_MJPEG_QSV_ENCODER 0
+%define CONFIG_MJPEG_VAAPI_ENCODER 0
+%define CONFIG_MJPEG_VAAPI_HWACCEL 0
+%define CONFIG_MKVTIMESTAMP_V2_MUXER 0
+%define CONFIG_MLP_DECODER 0
+%define CONFIG_MLP_DEMUXER 0
+%define CONFIG_MLP_ENCODER 0
+%define CONFIG_MLP_MUXER 0
+%define CONFIG_MLP_PARSER 0
+%define CONFIG_MLV_DEMUXER 0
+%define CONFIG_MMF_DEMUXER 0
+%define CONFIG_MMF_MUXER 0
+%define CONFIG_MMSH_PROTOCOL 0
+%define CONFIG_MMST_PROTOCOL 0
+%define CONFIG_MMVIDEO_DECODER 0
+%define CONFIG_MM_DEMUXER 0
+%define CONFIG_MOTIONPIXELS_DECODER 0
+%define CONFIG_MOV2TEXTSUB_BSF 0
+%define CONFIG_MOVIE_FILTER 0
+%define CONFIG_MOVTEXT_DECODER 0
+%define CONFIG_MOVTEXT_ENCODER 0
+%define CONFIG_MOV_DEMUXER 0
+%define CONFIG_MOV_MUXER 0
+%define CONFIG_MP1FLOAT_DECODER 0
+%define CONFIG_MP1_AT_DECODER 0
+%define CONFIG_MP1_DECODER 0
+%define CONFIG_MP2FIXED_ENCODER 0
+%define CONFIG_MP2FLOAT_DECODER 0
+%define CONFIG_MP2_AT_DECODER 0
+%define CONFIG_MP2_DECODER 0
+%define CONFIG_MP2_ENCODER 0
+%define CONFIG_MP2_MUXER 0
+%define CONFIG_MP3ADUFLOAT_DECODER 0
+%define CONFIG_MP3ADU_DECODER 0
+%define CONFIG_MP3FLOAT_DECODER 0
+%define CONFIG_MP3ON4FLOAT_DECODER 0
+%define CONFIG_MP3ON4_DECODER 0
+%define CONFIG_MP3_AT_DECODER 0
+%define CONFIG_MP3_DECODER 0
+%define CONFIG_MP3_DEMUXER 0
+%define CONFIG_MP3_HEADER_DECOMPRESS_BSF 0
+%define CONFIG_MP3_MUXER 0
+%define CONFIG_MP4_MUXER 0
+%define CONFIG_MPC7_DECODER 0
+%define CONFIG_MPC8_DECODER 0
+%define CONFIG_MPC8_DEMUXER 0
+%define CONFIG_MPC_DEMUXER 0
%define CONFIG_MPDECIMATE_FILTER 0
+%define CONFIG_MPEG1SYSTEM_MUXER 0
+%define CONFIG_MPEG1VCD_MUXER 0
+%define CONFIG_MPEG1VIDEO_DECODER 0
+%define CONFIG_MPEG1VIDEO_ENCODER 0
+%define CONFIG_MPEG1VIDEO_MUXER 0
+%define CONFIG_MPEG1_CUVID_DECODER 0
+%define CONFIG_MPEG1_NVDEC_HWACCEL 0
+%define CONFIG_MPEG1_V4L2M2M_DECODER 0
+%define CONFIG_MPEG1_VDPAU_HWACCEL 0
+%define CONFIG_MPEG1_VIDEOTOOLBOX_HWACCEL 0
+%define CONFIG_MPEG1_XVMC_HWACCEL 0
+%define CONFIG_MPEG2DVD_MUXER 0
+%define CONFIG_MPEG2SVCD_MUXER 0
+%define CONFIG_MPEG2VIDEO_DECODER 0
+%define CONFIG_MPEG2VIDEO_ENCODER 0
+%define CONFIG_MPEG2VIDEO_MUXER 0
+%define CONFIG_MPEG2VOB_MUXER 0
+%define CONFIG_MPEG2_CRYSTALHD_DECODER 0
+%define CONFIG_MPEG2_CUVID_DECODER 0
+%define CONFIG_MPEG2_D3D11VA2_HWACCEL 0
+%define CONFIG_MPEG2_D3D11VA_HWACCEL 0
+%define CONFIG_MPEG2_DXVA2_HWACCEL 0
+%define CONFIG_MPEG2_MEDIACODEC_DECODER 0
+%define CONFIG_MPEG2_METADATA_BSF 0
+%define CONFIG_MPEG2_MMAL_DECODER 0
+%define CONFIG_MPEG2_NVDEC_HWACCEL 0
+%define CONFIG_MPEG2_QSV_DECODER 0
+%define CONFIG_MPEG2_QSV_ENCODER 0
+%define CONFIG_MPEG2_V4L2M2M_DECODER 0
+%define CONFIG_MPEG2_VAAPI_ENCODER 0
+%define CONFIG_MPEG2_VAAPI_HWACCEL 0
+%define CONFIG_MPEG2_VDPAU_HWACCEL 0
+%define CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL 0
+%define CONFIG_MPEG2_XVMC_HWACCEL 0
+%define CONFIG_MPEG4VIDEO_PARSER 0
+%define CONFIG_MPEG4_CRYSTALHD_DECODER 0
+%define CONFIG_MPEG4_CUVID_DECODER 0
+%define CONFIG_MPEG4_DECODER 0
+%define CONFIG_MPEG4_ENCODER 0
+%define CONFIG_MPEG4_MEDIACODEC_DECODER 0
+%define CONFIG_MPEG4_MMAL_DECODER 0
+%define CONFIG_MPEG4_NVDEC_HWACCEL 0
+%define CONFIG_MPEG4_UNPACK_BFRAMES_BSF 0
+%define CONFIG_MPEG4_V4L2M2M_DECODER 0
+%define CONFIG_MPEG4_V4L2M2M_ENCODER 0
+%define CONFIG_MPEG4_VAAPI_HWACCEL 0
+%define CONFIG_MPEG4_VDPAU_HWACCEL 0
+%define CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL 0
+%define CONFIG_MPEGAUDIO_PARSER 0
+%define CONFIG_MPEGPS_DEMUXER 0
+%define CONFIG_MPEGTSRAW_DEMUXER 0
+%define CONFIG_MPEGTS_DEMUXER 0
+%define CONFIG_MPEGTS_MUXER 0
+%define CONFIG_MPEGVIDEO_DECODER 0
+%define CONFIG_MPEGVIDEO_DEMUXER 0
+%define CONFIG_MPEGVIDEO_PARSER 0
+%define CONFIG_MPJPEG_DEMUXER 0
+%define CONFIG_MPJPEG_MUXER 0
+%define CONFIG_MPL2_DECODER 0
+%define CONFIG_MPL2_DEMUXER 0
+%define CONFIG_MPSUB_DEMUXER 0
+%define CONFIG_MPTESTSRC_FILTER 0
+%define CONFIG_MSA1_DECODER 0
+%define CONFIG_MSCC_DECODER 0
+%define CONFIG_MSF_DEMUXER 0
+%define CONFIG_MSMPEG4V1_DECODER 0
+%define CONFIG_MSMPEG4V2_DECODER 0
+%define CONFIG_MSMPEG4V2_ENCODER 0
+%define CONFIG_MSMPEG4V3_DECODER 0
+%define CONFIG_MSMPEG4V3_ENCODER 0
+%define CONFIG_MSMPEG4_CRYSTALHD_DECODER 0
+%define CONFIG_MSNWC_TCP_DEMUXER 0
+%define CONFIG_MSRLE_DECODER 0
+%define CONFIG_MSS1_DECODER 0
+%define CONFIG_MSS2_DECODER 0
+%define CONFIG_MSVIDEO1_DECODER 0
+%define CONFIG_MSVIDEO1_ENCODER 0
+%define CONFIG_MSZH_DECODER 0
+%define CONFIG_MTAF_DEMUXER 0
+%define CONFIG_MTS2_DECODER 0
+%define CONFIG_MTV_DEMUXER 0
+%define CONFIG_MUSX_DEMUXER 0
+%define CONFIG_MUXERS 0
+%define CONFIG_MVC1_DECODER 0
+%define CONFIG_MVC2_DECODER 0
+%define CONFIG_MVI_DEMUXER 0
+%define CONFIG_MV_DEMUXER 0
+%define CONFIG_MXF_D10_MUXER 0
+%define CONFIG_MXF_DEMUXER 0
+%define CONFIG_MXF_MUXER 0
+%define CONFIG_MXF_OPATOM_MUXER 0
+%define CONFIG_MXG_DEMUXER 0
+%define CONFIG_MXPEG_DECODER 0
+%define CONFIG_NC_DEMUXER 0
%define CONFIG_NEGATE_FILTER 0
+%define CONFIG_NELLYMOSER_DECODER 0
+%define CONFIG_NELLYMOSER_ENCODER 0
+%define CONFIG_NISTSPHERE_DEMUXER 0
%define CONFIG_NLMEANS_FILTER 0
%define CONFIG_NNEDI_FILTER 0
%define CONFIG_NOFORMAT_FILTER 0
+%define CONFIG_NOISE_BSF 0
%define CONFIG_NOISE_FILTER 0
+%define CONFIG_NORMALIZE_FILTER 0
+%define CONFIG_NSP_DEMUXER 0
+%define CONFIG_NSV_DEMUXER 0
+%define CONFIG_NULLSINK_FILTER 0
+%define CONFIG_NULLSRC_FILTER 0
%define CONFIG_NULL_FILTER 0
+%define CONFIG_NULL_MUXER 0
+%define CONFIG_NUT_DEMUXER 0
+%define CONFIG_NUT_MUXER 0
+%define CONFIG_NUV_DECODER 0
+%define CONFIG_NUV_DEMUXER 0
+%define CONFIG_NVENC_ENCODER 0
+%define CONFIG_NVENC_H264_ENCODER 0
+%define CONFIG_NVENC_HEVC_ENCODER 0
%define CONFIG_OCR_FILTER 0
%define CONFIG_OCV_FILTER 0
+%define CONFIG_OGA_MUXER 0
+%define CONFIG_OGG_DEMUXER 0
+%define CONFIG_OGG_MUXER 0
+%define CONFIG_OGV_MUXER 0
+%define CONFIG_OMA_DEMUXER 0
+%define CONFIG_OMA_MUXER 0
+%define CONFIG_ON2AVC_DECODER 0
+%define CONFIG_OPENAL_INDEV 0
+%define CONFIG_OPENCLSRC_FILTER 0
+%define CONFIG_OPENGL_OUTDEV 0
+%define CONFIG_OPUS_DECODER 0
+%define CONFIG_OPUS_ENCODER 0
+%define CONFIG_OPUS_MUXER 0
+%define CONFIG_OPUS_PARSER 0
%define CONFIG_OSCILLOSCOPE_FILTER 0
+%define CONFIG_OSS_INDEV 0
+%define CONFIG_OSS_OUTDEV 0
+%define CONFIG_OUTDEVS 0
%define CONFIG_OVERLAY_FILTER 0
+%define CONFIG_OVERLAY_OPENCL_FILTER 0
+%define CONFIG_OVERLAY_QSV_FILTER 0
%define CONFIG_OWDENOISE_FILTER 0
%define CONFIG_PAD_FILTER 0
+%define CONFIG_PAF_AUDIO_DECODER 0
+%define CONFIG_PAF_DEMUXER 0
+%define CONFIG_PAF_VIDEO_DECODER 0
%define CONFIG_PALETTEGEN_FILTER 0
%define CONFIG_PALETTEUSE_FILTER 0
+%define CONFIG_PAM_DECODER 0
+%define CONFIG_PAM_ENCODER 0
+%define CONFIG_PAN_FILTER 0
+%define CONFIG_PBM_DECODER 0
+%define CONFIG_PBM_ENCODER 0
+%define CONFIG_PCM_ALAW_AT_DECODER 0
+%define CONFIG_PCM_ALAW_AT_ENCODER 0
+%define CONFIG_PCM_ALAW_DECODER 0
+%define CONFIG_PCM_ALAW_DEMUXER 0
+%define CONFIG_PCM_ALAW_ENCODER 0
+%define CONFIG_PCM_ALAW_MUXER 0
+%define CONFIG_PCM_BLURAY_DECODER 0
+%define CONFIG_PCM_DVD_DECODER 0
+%define CONFIG_PCM_F16LE_DECODER 0
+%define CONFIG_PCM_F24LE_DECODER 0
+%define CONFIG_PCM_F32BE_DECODER 0
+%define CONFIG_PCM_F32BE_DEMUXER 0
+%define CONFIG_PCM_F32BE_ENCODER 0
+%define CONFIG_PCM_F32BE_MUXER 0
+%define CONFIG_PCM_F32LE_DECODER 0
+%define CONFIG_PCM_F32LE_DEMUXER 0
+%define CONFIG_PCM_F32LE_ENCODER 0
+%define CONFIG_PCM_F32LE_MUXER 0
+%define CONFIG_PCM_F64BE_DECODER 0
+%define CONFIG_PCM_F64BE_DEMUXER 0
+%define CONFIG_PCM_F64BE_ENCODER 0
+%define CONFIG_PCM_F64BE_MUXER 0
+%define CONFIG_PCM_F64LE_DECODER 0
+%define CONFIG_PCM_F64LE_DEMUXER 0
+%define CONFIG_PCM_F64LE_ENCODER 0
+%define CONFIG_PCM_F64LE_MUXER 0
+%define CONFIG_PCM_LXF_DECODER 0
+%define CONFIG_PCM_MULAW_AT_DECODER 0
+%define CONFIG_PCM_MULAW_AT_ENCODER 0
+%define CONFIG_PCM_MULAW_DECODER 0
+%define CONFIG_PCM_MULAW_DEMUXER 0
+%define CONFIG_PCM_MULAW_ENCODER 0
+%define CONFIG_PCM_MULAW_MUXER 0
+%define CONFIG_PCM_S16BE_DECODER 0
+%define CONFIG_PCM_S16BE_DEMUXER 0
+%define CONFIG_PCM_S16BE_ENCODER 0
+%define CONFIG_PCM_S16BE_MUXER 0
+%define CONFIG_PCM_S16BE_PLANAR_DECODER 0
+%define CONFIG_PCM_S16BE_PLANAR_ENCODER 0
+%define CONFIG_PCM_S16LE_DECODER 0
+%define CONFIG_PCM_S16LE_DEMUXER 0
+%define CONFIG_PCM_S16LE_ENCODER 0
+%define CONFIG_PCM_S16LE_MUXER 0
+%define CONFIG_PCM_S16LE_PLANAR_DECODER 0
+%define CONFIG_PCM_S16LE_PLANAR_ENCODER 0
+%define CONFIG_PCM_S24BE_DECODER 0
+%define CONFIG_PCM_S24BE_DEMUXER 0
+%define CONFIG_PCM_S24BE_ENCODER 0
+%define CONFIG_PCM_S24BE_MUXER 0
+%define CONFIG_PCM_S24DAUD_DECODER 0
+%define CONFIG_PCM_S24DAUD_ENCODER 0
+%define CONFIG_PCM_S24LE_DECODER 0
+%define CONFIG_PCM_S24LE_DEMUXER 0
+%define CONFIG_PCM_S24LE_ENCODER 0
+%define CONFIG_PCM_S24LE_MUXER 0
+%define CONFIG_PCM_S24LE_PLANAR_DECODER 0
+%define CONFIG_PCM_S24LE_PLANAR_ENCODER 0
+%define CONFIG_PCM_S32BE_DECODER 0
+%define CONFIG_PCM_S32BE_DEMUXER 0
+%define CONFIG_PCM_S32BE_ENCODER 0
+%define CONFIG_PCM_S32BE_MUXER 0
+%define CONFIG_PCM_S32LE_DECODER 0
+%define CONFIG_PCM_S32LE_DEMUXER 0
+%define CONFIG_PCM_S32LE_ENCODER 0
+%define CONFIG_PCM_S32LE_MUXER 0
+%define CONFIG_PCM_S32LE_PLANAR_DECODER 0
+%define CONFIG_PCM_S32LE_PLANAR_ENCODER 0
+%define CONFIG_PCM_S64BE_DECODER 0
+%define CONFIG_PCM_S64BE_ENCODER 0
+%define CONFIG_PCM_S64LE_DECODER 0
+%define CONFIG_PCM_S64LE_ENCODER 0
+%define CONFIG_PCM_S8_DECODER 0
+%define CONFIG_PCM_S8_DEMUXER 0
+%define CONFIG_PCM_S8_ENCODER 0
+%define CONFIG_PCM_S8_MUXER 0
+%define CONFIG_PCM_S8_PLANAR_DECODER 0
+%define CONFIG_PCM_S8_PLANAR_ENCODER 0
+%define CONFIG_PCM_U16BE_DECODER 0
+%define CONFIG_PCM_U16BE_DEMUXER 0
+%define CONFIG_PCM_U16BE_ENCODER 0
+%define CONFIG_PCM_U16BE_MUXER 0
+%define CONFIG_PCM_U16LE_DECODER 0
+%define CONFIG_PCM_U16LE_DEMUXER 0
+%define CONFIG_PCM_U16LE_ENCODER 0
+%define CONFIG_PCM_U16LE_MUXER 0
+%define CONFIG_PCM_U24BE_DECODER 0
+%define CONFIG_PCM_U24BE_DEMUXER 0
+%define CONFIG_PCM_U24BE_ENCODER 0
+%define CONFIG_PCM_U24BE_MUXER 0
+%define CONFIG_PCM_U24LE_DECODER 0
+%define CONFIG_PCM_U24LE_DEMUXER 0
+%define CONFIG_PCM_U24LE_ENCODER 0
+%define CONFIG_PCM_U24LE_MUXER 0
+%define CONFIG_PCM_U32BE_DECODER 0
+%define CONFIG_PCM_U32BE_DEMUXER 0
+%define CONFIG_PCM_U32BE_ENCODER 0
+%define CONFIG_PCM_U32BE_MUXER 0
+%define CONFIG_PCM_U32LE_DECODER 0
+%define CONFIG_PCM_U32LE_DEMUXER 0
+%define CONFIG_PCM_U32LE_ENCODER 0
+%define CONFIG_PCM_U32LE_MUXER 0
+%define CONFIG_PCM_U8_DECODER 0
+%define CONFIG_PCM_U8_DEMUXER 0
+%define CONFIG_PCM_U8_ENCODER 0
+%define CONFIG_PCM_U8_MUXER 0
+%define CONFIG_PCM_ZORK_DECODER 0
+%define CONFIG_PCX_DECODER 0
+%define CONFIG_PCX_ENCODER 0
%define CONFIG_PERMS_FILTER 0
%define CONFIG_PERSPECTIVE_FILTER 0
+%define CONFIG_PGMYUV_DECODER 0
+%define CONFIG_PGMYUV_ENCODER 0
+%define CONFIG_PGM_DECODER 0
+%define CONFIG_PGM_ENCODER 0
+%define CONFIG_PGSSUB_DECODER 0
%define CONFIG_PHASE_FILTER 0
+%define CONFIG_PICTOR_DECODER 0
+%define CONFIG_PIPE_PROTOCOL 0
%define CONFIG_PIXDESCTEST_FILTER 0
+%define CONFIG_PIXLET_DECODER 0
%define CONFIG_PIXSCOPE_FILTER 0
-%define CONFIG_PP_FILTER 0
+%define CONFIG_PJS_DECODER 0
+%define CONFIG_PJS_DEMUXER 0
+%define CONFIG_PMP_DEMUXER 0
+%define CONFIG_PNG_DECODER 0
+%define CONFIG_PNG_ENCODER 0
+%define CONFIG_PNG_PARSER 0
+%define CONFIG_PNM_PARSER 0
%define CONFIG_PP7_FILTER 0
+%define CONFIG_PPM_DECODER 0
+%define CONFIG_PPM_ENCODER 0
+%define CONFIG_PP_FILTER 0
%define CONFIG_PREMULTIPLY_FILTER 0
%define CONFIG_PREWITT_FILTER 0
+%define CONFIG_PROCAMP_VAAPI_FILTER 0
+%define CONFIG_PROGRAM_OPENCL_FILTER 0
+%define CONFIG_PROMPEG_PROTOCOL 0
+%define CONFIG_PRORES_AW_ENCODER 0
+%define CONFIG_PRORES_DECODER 0
+%define CONFIG_PRORES_ENCODER 0
+%define CONFIG_PRORES_KS_ENCODER 0
+%define CONFIG_PRORES_LGPL_DECODER 0
+%define CONFIG_PROTOCOLS 0
+%define CONFIG_PSD_DECODER 0
%define CONFIG_PSEUDOCOLOR_FILTER 0
%define CONFIG_PSNR_FILTER 0
+%define CONFIG_PSP_MUXER 0
+%define CONFIG_PTX_DECODER 0
%define CONFIG_PULLUP_FILTER 0
+%define CONFIG_PULSE_INDEV 0
+%define CONFIG_PULSE_OUTDEV 0
+%define CONFIG_PVA_DEMUXER 0
+%define CONFIG_PVF_DEMUXER 0
+%define CONFIG_QCELP_DECODER 0
+%define CONFIG_QCP_DEMUXER 0
+%define CONFIG_QDM2_AT_DECODER 0
+%define CONFIG_QDM2_DECODER 0
+%define CONFIG_QDMC_AT_DECODER 0
+%define CONFIG_QDMC_DECODER 0
+%define CONFIG_QDRAW_DECODER 0
+%define CONFIG_QPEG_DECODER 0
%define CONFIG_QP_FILTER 0
+%define CONFIG_QTRLE_DECODER 0
+%define CONFIG_QTRLE_ENCODER 0
+%define CONFIG_R10K_DECODER 0
+%define CONFIG_R10K_ENCODER 0
+%define CONFIG_R210_DECODER 0
+%define CONFIG_R210_ENCODER 0
+%define CONFIG_R3D_DEMUXER 0
+%define CONFIG_RALF_DECODER 0
%define CONFIG_RANDOM_FILTER 0
+%define CONFIG_RAWVIDEO_DECODER 0
+%define CONFIG_RAWVIDEO_DEMUXER 0
+%define CONFIG_RAWVIDEO_ENCODER 0
+%define CONFIG_RAWVIDEO_MUXER 0
+%define CONFIG_RA_144_DECODER 0
+%define CONFIG_RA_144_ENCODER 0
+%define CONFIG_RA_288_DECODER 0
%define CONFIG_READEIA608_FILTER 0
%define CONFIG_READVITC_FILTER 0
+%define CONFIG_REALTEXT_DECODER 0
+%define CONFIG_REALTEXT_DEMUXER 0
%define CONFIG_REALTIME_FILTER 0
+%define CONFIG_REDSPARK_DEMUXER 0
%define CONFIG_REMAP_FILTER 0
%define CONFIG_REMOVEGRAIN_FILTER 0
%define CONFIG_REMOVELOGO_FILTER 0
+%define CONFIG_REMOVE_EXTRADATA_BSF 0
%define CONFIG_REPEATFIELDS_FILTER 0
+%define CONFIG_REPLAYGAIN_FILTER 0
+%define CONFIG_RESAMPLE_FILTER 0
%define CONFIG_REVERSE_FILTER 0
+%define CONFIG_RGBTESTSRC_FILTER 0
+%define CONFIG_RL2_DECODER 0
+%define CONFIG_RL2_DEMUXER 0
+%define CONFIG_RM_DEMUXER 0
+%define CONFIG_RM_MUXER 0
%define CONFIG_ROBERTS_FILTER 0
+%define CONFIG_ROQ_DECODER 0
+%define CONFIG_ROQ_DEMUXER 0
+%define CONFIG_ROQ_DPCM_DECODER 0
+%define CONFIG_ROQ_DPCM_ENCODER 0
+%define CONFIG_ROQ_ENCODER 0
+%define CONFIG_ROQ_MUXER 0
%define CONFIG_ROTATE_FILTER 0
+%define CONFIG_RPL_DEMUXER 0
+%define CONFIG_RPZA_DECODER 0
+%define CONFIG_RSCC_DECODER 0
+%define CONFIG_RSD_DEMUXER 0
+%define CONFIG_RSO_DEMUXER 0
+%define CONFIG_RSO_MUXER 0
+%define CONFIG_RTMPE_PROTOCOL 0
+%define CONFIG_RTMPS_PROTOCOL 0
+%define CONFIG_RTMPTE_PROTOCOL 0
+%define CONFIG_RTMPTS_PROTOCOL 0
+%define CONFIG_RTMPT_PROTOCOL 0
+%define CONFIG_RTMP_PROTOCOL 0
+%define CONFIG_RTP_DEMUXER 0
+%define CONFIG_RTP_MPEGTS_MUXER 0
+%define CONFIG_RTP_MUXER 0
+%define CONFIG_RTP_PROTOCOL 0
+%define CONFIG_RTSP_DEMUXER 0
+%define CONFIG_RTSP_MUXER 0
+%define CONFIG_RUBBERBAND_FILTER 0
+%define CONFIG_RV10_DECODER 0
+%define CONFIG_RV10_ENCODER 0
+%define CONFIG_RV20_DECODER 0
+%define CONFIG_RV20_ENCODER 0
+%define CONFIG_RV30_DECODER 0
+%define CONFIG_RV30_PARSER 0
+%define CONFIG_RV40_DECODER 0
+%define CONFIG_RV40_PARSER 0
+%define CONFIG_S302M_DECODER 0
+%define CONFIG_S302M_ENCODER 0
+%define CONFIG_S337M_DEMUXER 0
%define CONFIG_SAB_FILTER 0
-%define CONFIG_SCALE_FILTER 0
+%define CONFIG_SAMI_DECODER 0
+%define CONFIG_SAMI_DEMUXER 0
+%define CONFIG_SANM_DECODER 0
+%define CONFIG_SAP_DEMUXER 0
+%define CONFIG_SAP_MUXER 0
+%define CONFIG_SBC_DECODER 0
+%define CONFIG_SBC_DEMUXER 0
+%define CONFIG_SBC_ENCODER 0
+%define CONFIG_SBC_MUXER 0
+%define CONFIG_SBC_PARSER 0
+%define CONFIG_SBG_DEMUXER 0
+%define CONFIG_SCALE2REF_FILTER 0
%define CONFIG_SCALE_CUDA_FILTER 0
+%define CONFIG_SCALE_FILTER 0
%define CONFIG_SCALE_NPP_FILTER 0
%define CONFIG_SCALE_QSV_FILTER 0
%define CONFIG_SCALE_VAAPI_FILTER 0
-%define CONFIG_SCALE2REF_FILTER 0
-%define CONFIG_SELECT_FILTER 0
+%define CONFIG_SCC_DEMUXER 0
+%define CONFIG_SCC_MUXER 0
+%define CONFIG_SCPR_DECODER 0
+%define CONFIG_SCREENPRESSO_DECODER 0
+%define CONFIG_SCTP_PROTOCOL 0
+%define CONFIG_SDL2_OUTDEV 0
+%define CONFIG_SDP_DEMUXER 0
+%define CONFIG_SDR2_DEMUXER 0
+%define CONFIG_SDS_DEMUXER 0
+%define CONFIG_SDX2_DPCM_DECODER 0
+%define CONFIG_SDX_DEMUXER 0
+%define CONFIG_SEGAFILM_DEMUXER 0
+%define CONFIG_SEGAFILM_MUXER 0
+%define CONFIG_SEGMENT_MUXER 0
%define CONFIG_SELECTIVECOLOR_FILTER 0
+%define CONFIG_SELECT_FILTER 0
%define CONFIG_SENDCMD_FILTER 0
%define CONFIG_SEPARATEFIELDS_FILTER 0
%define CONFIG_SETDAR_FILTER 0
%define CONFIG_SETFIELD_FILTER 0
%define CONFIG_SETPTS_FILTER 0
+%define CONFIG_SETRANGE_FILTER 0
%define CONFIG_SETSAR_FILTER 0
%define CONFIG_SETTB_FILTER 0
+%define CONFIG_SGIRLE_DECODER 0
+%define CONFIG_SGI_DECODER 0
+%define CONFIG_SGI_ENCODER 0
+%define CONFIG_SHARPNESS_VAAPI_FILTER 0
+%define CONFIG_SHEERVIDEO_DECODER 0
+%define CONFIG_SHORTEN_DECODER 0
+%define CONFIG_SHORTEN_DEMUXER 0
+%define CONFIG_SHOWCQT_FILTER 0
+%define CONFIG_SHOWFREQS_FILTER 0
%define CONFIG_SHOWINFO_FILTER 0
%define CONFIG_SHOWPALETTE_FILTER 0
+%define CONFIG_SHOWSPECTRUMPIC_FILTER 0
+%define CONFIG_SHOWSPECTRUM_FILTER 0
+%define CONFIG_SHOWVOLUME_FILTER 0
+%define CONFIG_SHOWWAVESPIC_FILTER 0
+%define CONFIG_SHOWWAVES_FILTER 0
%define CONFIG_SHUFFLEFRAMES_FILTER 0
%define CONFIG_SHUFFLEPLANES_FILTER 0
+%define CONFIG_SIDECHAINCOMPRESS_FILTER 0
+%define CONFIG_SIDECHAINGATE_FILTER 0
%define CONFIG_SIDEDATA_FILTER 0
+%define CONFIG_SIFF_DEMUXER 0
%define CONFIG_SIGNALSTATS_FILTER 0
%define CONFIG_SIGNATURE_FILTER 0
+%define CONFIG_SILENCEDETECT_FILTER 0
+%define CONFIG_SILENCEREMOVE_FILTER 0
+%define CONFIG_SINE_FILTER 0
+%define CONFIG_SINGLEJPEG_MUXER 0
+%define CONFIG_SIPR_DECODER 0
+%define CONFIG_SIPR_PARSER 0
+%define CONFIG_SLN_DEMUXER 0
+%define CONFIG_SMACKAUD_DECODER 0
+%define CONFIG_SMACKER_DECODER 0
+%define CONFIG_SMACKER_DEMUXER 0
%define CONFIG_SMARTBLUR_FILTER 0
+%define CONFIG_SMC_DECODER 0
+%define CONFIG_SMJPEG_DEMUXER 0
+%define CONFIG_SMJPEG_MUXER 0
+%define CONFIG_SMOOTHSTREAMING_MUXER 0
+%define CONFIG_SMPTEBARS_FILTER 0
+%define CONFIG_SMPTEHDBARS_FILTER 0
+%define CONFIG_SMUSH_DEMUXER 0
+%define CONFIG_SMVJPEG_DECODER 0
+%define CONFIG_SNDIO_INDEV 0
+%define CONFIG_SNDIO_OUTDEV 0
+%define CONFIG_SNOW_DECODER 0
+%define CONFIG_SNOW_ENCODER 0
%define CONFIG_SOBEL_FILTER 0
+%define CONFIG_SOFALIZER_FILTER 0
+%define CONFIG_SOL_DEMUXER 0
+%define CONFIG_SOL_DPCM_DECODER 0
+%define CONFIG_SONIC_DECODER 0
+%define CONFIG_SONIC_ENCODER 0
+%define CONFIG_SONIC_LS_ENCODER 0
+%define CONFIG_SOX_DEMUXER 0
+%define CONFIG_SOX_MUXER 0
+%define CONFIG_SP5X_DECODER 0
+%define CONFIG_SPDIF_DEMUXER 0
+%define CONFIG_SPDIF_MUXER 0
+%define CONFIG_SPECTRUMSYNTH_FILTER 0
+%define CONFIG_SPEEDHQ_DECODER 0
%define CONFIG_SPLIT_FILTER 0
%define CONFIG_SPP_FILTER 0
+%define CONFIG_SPX_MUXER 0
+%define CONFIG_SRGC_DECODER 0
+%define CONFIG_SRTP_PROTOCOL 0
+%define CONFIG_SRT_DECODER 0
+%define CONFIG_SRT_DEMUXER 0
+%define CONFIG_SRT_ENCODER 0
+%define CONFIG_SRT_MUXER 0
+%define CONFIG_SSA_DECODER 0
+%define CONFIG_SSA_ENCODER 0
%define CONFIG_SSIM_FILTER 0
%define CONFIG_STEREO3D_FILTER 0
+%define CONFIG_STEREOTOOLS_FILTER 0
+%define CONFIG_STEREOWIDEN_FILTER 0
+%define CONFIG_STL_DECODER 0
+%define CONFIG_STL_DEMUXER 0
%define CONFIG_STREAMSELECT_FILTER 0
+%define CONFIG_STREAM_SEGMENT_MUXER 0
+%define CONFIG_STR_DEMUXER 0
+%define CONFIG_SUBFILE_PROTOCOL 0
+%define CONFIG_SUBRIP_DECODER 0
+%define CONFIG_SUBRIP_ENCODER 0
%define CONFIG_SUBTITLES_FILTER 0
+%define CONFIG_SUBVIEWER1_DECODER 0
+%define CONFIG_SUBVIEWER1_DEMUXER 0
+%define CONFIG_SUBVIEWER_DECODER 0
+%define CONFIG_SUBVIEWER_DEMUXER 0
+%define CONFIG_SUNRAST_DECODER 0
+%define CONFIG_SUNRAST_ENCODER 0
%define CONFIG_SUPER2XSAI_FILTER 0
+%define CONFIG_SUPEREQUALIZER_FILTER 0
+%define CONFIG_SUP_DEMUXER 0
+%define CONFIG_SUP_MUXER 0
+%define CONFIG_SURROUND_FILTER 0
+%define CONFIG_SVAG_DEMUXER 0
+%define CONFIG_SVQ1_DECODER 0
+%define CONFIG_SVQ1_ENCODER 0
+%define CONFIG_SVQ3_DECODER 0
%define CONFIG_SWAPRECT_FILTER 0
%define CONFIG_SWAPUV_FILTER 0
+%define CONFIG_SWF_DEMUXER 0
+%define CONFIG_SWF_MUXER 0
+%define CONFIG_TAK_DECODER 0
+%define CONFIG_TAK_DEMUXER 0
+%define CONFIG_TAK_PARSER 0
+%define CONFIG_TARGA_DECODER 0
+%define CONFIG_TARGA_ENCODER 0
+%define CONFIG_TARGA_Y216_DECODER 0
%define CONFIG_TBLEND_FILTER 0
+%define CONFIG_TCP_PROTOCOL 0
+%define CONFIG_TDSC_DECODER 0
+%define CONFIG_TEDCAPTIONS_DEMUXER 0
+%define CONFIG_TEE_MUXER 0
+%define CONFIG_TEE_PROTOCOL 0
%define CONFIG_TELECINE_FILTER 0
+%define CONFIG_TESTSRC2_FILTER 0
+%define CONFIG_TESTSRC_FILTER 0
+%define CONFIG_TEXT2MOVSUB_BSF 0
+%define CONFIG_TEXT_DECODER 0
+%define CONFIG_TEXT_ENCODER 0
+%define CONFIG_TG2_MUXER 0
+%define CONFIG_TGP_MUXER 0
+%define CONFIG_THEORA_DECODER 0
+%define CONFIG_THP_DECODER 0
+%define CONFIG_THP_DEMUXER 0
+%define CONFIG_THREEDOSTR_DEMUXER 0
%define CONFIG_THRESHOLD_FILTER 0
-%define CONFIG_THUMBNAIL_FILTER 0
%define CONFIG_THUMBNAIL_CUDA_FILTER 0
+%define CONFIG_THUMBNAIL_FILTER 0
+%define CONFIG_TIERTEXSEQVIDEO_DECODER 0
+%define CONFIG_TIERTEXSEQ_DEMUXER 0
+%define CONFIG_TIFF_DECODER 0
+%define CONFIG_TIFF_ENCODER 0
%define CONFIG_TILE_FILTER 0
%define CONFIG_TINTERLACE_FILTER 0
+%define CONFIG_TLS_PROTOCOL 0
%define CONFIG_TLUT2_FILTER 0
+%define CONFIG_TMV_DECODER 0
+%define CONFIG_TMV_DEMUXER 0
%define CONFIG_TONEMAP_FILTER 0
+%define CONFIG_TRACE_HEADERS_BSF 0
%define CONFIG_TRANSPOSE_FILTER 0
+%define CONFIG_TREBLE_FILTER 0
+%define CONFIG_TREMOLO_FILTER 0
%define CONFIG_TRIM_FILTER 0
+%define CONFIG_TRUEHD_DECODER 0
+%define CONFIG_TRUEHD_DEMUXER 0
+%define CONFIG_TRUEHD_ENCODER 0
+%define CONFIG_TRUEHD_MUXER 0
+%define CONFIG_TRUEMOTION1_DECODER 0
+%define CONFIG_TRUEMOTION2RT_DECODER 0
+%define CONFIG_TRUEMOTION2_DECODER 0
+%define CONFIG_TRUESPEECH_DECODER 0
+%define CONFIG_TSCC2_DECODER 0
+%define CONFIG_TSCC_DECODER 0
+%define CONFIG_TTA_DECODER 0
+%define CONFIG_TTA_DEMUXER 0
+%define CONFIG_TTA_ENCODER 0
+%define CONFIG_TTA_MUXER 0
+%define CONFIG_TTY_DEMUXER 0
+%define CONFIG_TWINVQ_DECODER 0
+%define CONFIG_TXD_DECODER 0
+%define CONFIG_TXD_DEMUXER 0
+%define CONFIG_TY_DEMUXER 0
+%define CONFIG_UDPLITE_PROTOCOL 0
+%define CONFIG_UDP_PROTOCOL 0
+%define CONFIG_ULTI_DECODER 0
+%define CONFIG_UNCODEDFRAMECRC_MUXER 0
+%define CONFIG_UNIX_PROTOCOL 0
%define CONFIG_UNPREMULTIPLY_FILTER 0
%define CONFIG_UNSHARP_FILTER 0
+%define CONFIG_UNSHARP_OPENCL_FILTER 0
%define CONFIG_USPP_FILTER 0
+%define CONFIG_UTVIDEO_DECODER 0
+%define CONFIG_UTVIDEO_ENCODER 0
+%define CONFIG_V210X_DECODER 0
+%define CONFIG_V210X_DEMUXER 0
+%define CONFIG_V210_DECODER 0
+%define CONFIG_V210_DEMUXER 0
+%define CONFIG_V210_ENCODER 0
+%define CONFIG_V308_DECODER 0
+%define CONFIG_V308_ENCODER 0
+%define CONFIG_V408_DECODER 0
+%define CONFIG_V408_ENCODER 0
+%define CONFIG_V410_DECODER 0
+%define CONFIG_V410_ENCODER 0
+%define CONFIG_V4L2_INDEV 0
+%define CONFIG_V4L2_OUTDEV 0
%define CONFIG_VAGUEDENOISER_FILTER 0
+%define CONFIG_VAG_DEMUXER 0
+%define CONFIG_VBLE_DECODER 0
+%define CONFIG_VB_DECODER 0
+%define CONFIG_VC1IMAGE_DECODER 0
+%define CONFIG_VC1T_DEMUXER 0
+%define CONFIG_VC1T_MUXER 0
+%define CONFIG_VC1_CRYSTALHD_DECODER 0
+%define CONFIG_VC1_CUVID_DECODER 0
+%define CONFIG_VC1_D3D11VA2_HWACCEL 0
+%define CONFIG_VC1_D3D11VA_HWACCEL 0
+%define CONFIG_VC1_DECODER 0
+%define CONFIG_VC1_DEMUXER 0
+%define CONFIG_VC1_DXVA2_HWACCEL 0
+%define CONFIG_VC1_MMAL_DECODER 0
+%define CONFIG_VC1_MUXER 0
+%define CONFIG_VC1_NVDEC_HWACCEL 0
+%define CONFIG_VC1_PARSER 0
+%define CONFIG_VC1_QSV_DECODER 0
+%define CONFIG_VC1_V4L2M2M_DECODER 0
+%define CONFIG_VC1_VAAPI_HWACCEL 0
+%define CONFIG_VC1_VDPAU_HWACCEL 0
+%define CONFIG_VC2_ENCODER 0
+%define CONFIG_VCR1_DECODER 0
%define CONFIG_VECTORSCOPE_FILTER 0
%define CONFIG_VFLIP_FILTER 0
+%define CONFIG_VFRDET_FILTER 0
+%define CONFIG_VFWCAP_INDEV 0
+%define CONFIG_VIBRATO_FILTER 0
%define CONFIG_VIDSTABDETECT_FILTER 0
%define CONFIG_VIDSTABTRANSFORM_FILTER 0
%define CONFIG_VIGNETTE_FILTER 0
+%define CONFIG_VIVO_DEMUXER 0
%define CONFIG_VMAFMOTION_FILTER 0
-%define CONFIG_VSTACK_FILTER 0
-%define CONFIG_W3FDIF_FILTER 0
-%define CONFIG_WAVEFORM_FILTER 0
-%define CONFIG_WEAVE_FILTER 0
-%define CONFIG_XBR_FILTER 0
-%define CONFIG_YADIF_FILTER 0
-%define CONFIG_ZMQ_FILTER 0
-%define CONFIG_ZOOMPAN_FILTER 0
-%define CONFIG_ZSCALE_FILTER 0
-%define CONFIG_ALLRGB_FILTER 0
-%define CONFIG_ALLYUV_FILTER 0
-%define CONFIG_CELLAUTO_FILTER 0
-%define CONFIG_COLOR_FILTER 0
-%define CONFIG_COREIMAGESRC_FILTER 0
-%define CONFIG_FREI0R_SRC_FILTER 0
-%define CONFIG_HALDCLUTSRC_FILTER 0
-%define CONFIG_LIFE_FILTER 0
-%define CONFIG_MANDELBROT_FILTER 0
-%define CONFIG_MPTESTSRC_FILTER 0
-%define CONFIG_NULLSRC_FILTER 0
-%define CONFIG_RGBTESTSRC_FILTER 0
-%define CONFIG_SMPTEBARS_FILTER 0
-%define CONFIG_SMPTEHDBARS_FILTER 0
-%define CONFIG_TESTSRC_FILTER 0
-%define CONFIG_TESTSRC2_FILTER 0
-%define CONFIG_YUVTESTSRC_FILTER 0
-%define CONFIG_NULLSINK_FILTER 0
-%define CONFIG_ABITSCOPE_FILTER 0
-%define CONFIG_ADRAWGRAPH_FILTER 0
-%define CONFIG_AHISTOGRAM_FILTER 0
-%define CONFIG_APHASEMETER_FILTER 0
-%define CONFIG_AVECTORSCOPE_FILTER 0
-%define CONFIG_CONCAT_FILTER 0
-%define CONFIG_SHOWCQT_FILTER 0
-%define CONFIG_SHOWFREQS_FILTER 0
-%define CONFIG_SHOWSPECTRUM_FILTER 0
-%define CONFIG_SHOWSPECTRUMPIC_FILTER 0
-%define CONFIG_SHOWVOLUME_FILTER 0
-%define CONFIG_SHOWWAVES_FILTER 0
-%define CONFIG_SHOWWAVESPIC_FILTER 0
-%define CONFIG_SPECTRUMSYNTH_FILTER 0
-%define CONFIG_AMOVIE_FILTER 0
-%define CONFIG_MOVIE_FILTER 0
-%define CONFIG_H263_VAAPI_HWACCEL 0
-%define CONFIG_H263_VIDEOTOOLBOX_HWACCEL 0
-%define CONFIG_H264_CUVID_HWACCEL 0
-%define CONFIG_H264_D3D11VA_HWACCEL 0
-%define CONFIG_H264_D3D11VA2_HWACCEL 0
-%define CONFIG_H264_DXVA2_HWACCEL 0
-%define CONFIG_H264_MEDIACODEC_HWACCEL 0
-%define CONFIG_H264_MMAL_HWACCEL 0
-%define CONFIG_H264_QSV_HWACCEL 0
-%define CONFIG_H264_VAAPI_HWACCEL 0
-%define CONFIG_H264_VDA_HWACCEL 0
-%define CONFIG_H264_VDA_OLD_HWACCEL 0
-%define CONFIG_H264_VDPAU_HWACCEL 0
-%define CONFIG_H264_VIDEOTOOLBOX_HWACCEL 0
-%define CONFIG_HEVC_CUVID_HWACCEL 0
-%define CONFIG_HEVC_D3D11VA_HWACCEL 0
-%define CONFIG_HEVC_D3D11VA2_HWACCEL 0
-%define CONFIG_HEVC_DXVA2_HWACCEL 0
-%define CONFIG_HEVC_MEDIACODEC_HWACCEL 0
-%define CONFIG_HEVC_QSV_HWACCEL 0
-%define CONFIG_HEVC_VAAPI_HWACCEL 0
-%define CONFIG_HEVC_VDPAU_HWACCEL 0
-%define CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL 0
-%define CONFIG_MJPEG_CUVID_HWACCEL 0
-%define CONFIG_MPEG1_CUVID_HWACCEL 0
-%define CONFIG_MPEG1_XVMC_HWACCEL 0
-%define CONFIG_MPEG1_VDPAU_HWACCEL 0
-%define CONFIG_MPEG1_VIDEOTOOLBOX_HWACCEL 0
-%define CONFIG_MPEG2_CUVID_HWACCEL 0
-%define CONFIG_MPEG2_XVMC_HWACCEL 0
-%define CONFIG_MPEG2_D3D11VA_HWACCEL 0
-%define CONFIG_MPEG2_D3D11VA2_HWACCEL 0
-%define CONFIG_MPEG2_DXVA2_HWACCEL 0
-%define CONFIG_MPEG2_MMAL_HWACCEL 0
-%define CONFIG_MPEG2_QSV_HWACCEL 0
-%define CONFIG_MPEG2_VAAPI_HWACCEL 0
-%define CONFIG_MPEG2_VDPAU_HWACCEL 0
-%define CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL 0
-%define CONFIG_MPEG2_MEDIACODEC_HWACCEL 0
-%define CONFIG_MPEG4_CUVID_HWACCEL 0
-%define CONFIG_MPEG4_MEDIACODEC_HWACCEL 0
-%define CONFIG_MPEG4_MMAL_HWACCEL 0
-%define CONFIG_MPEG4_VAAPI_HWACCEL 0
-%define CONFIG_MPEG4_VDPAU_HWACCEL 0
-%define CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL 0
-%define CONFIG_VC1_CUVID_HWACCEL 0
-%define CONFIG_VC1_D3D11VA_HWACCEL 0
-%define CONFIG_VC1_D3D11VA2_HWACCEL 0
-%define CONFIG_VC1_DXVA2_HWACCEL 0
-%define CONFIG_VC1_VAAPI_HWACCEL 0
-%define CONFIG_VC1_VDPAU_HWACCEL 0
-%define CONFIG_VC1_MMAL_HWACCEL 0
-%define CONFIG_VC1_QSV_HWACCEL 0
-%define CONFIG_VP8_CUVID_HWACCEL 0
-%define CONFIG_VP8_MEDIACODEC_HWACCEL 0
-%define CONFIG_VP8_QSV_HWACCEL 0
-%define CONFIG_VP9_CUVID_HWACCEL 0
-%define CONFIG_VP9_D3D11VA_HWACCEL 0
+%define CONFIG_VMDAUDIO_DECODER 0
+%define CONFIG_VMDVIDEO_DECODER 0
+%define CONFIG_VMD_DEMUXER 0
+%define CONFIG_VMNC_DECODER 0
+%define CONFIG_VOBSUB_DEMUXER 0
+%define CONFIG_VOC_DEMUXER 0
+%define CONFIG_VOC_MUXER 0
+%define CONFIG_VOLUMEDETECT_FILTER 0
+%define CONFIG_VOLUME_FILTER 0
+%define CONFIG_VORBIS_DECODER 0
+%define CONFIG_VORBIS_ENCODER 0
+%define CONFIG_VORBIS_PARSER 0
+%define CONFIG_VP3_DECODER 0
+%define CONFIG_VP3_PARSER 0
+%define CONFIG_VP5_DECODER 0
+%define CONFIG_VP6A_DECODER 0
+%define CONFIG_VP6F_DECODER 0
+%define CONFIG_VP6_DECODER 0
+%define CONFIG_VP7_DECODER 0
+%define CONFIG_VP8_CUVID_DECODER 0
+%define CONFIG_VP8_MEDIACODEC_DECODER 0
+%define CONFIG_VP8_NVDEC_HWACCEL 0
+%define CONFIG_VP8_QSV_DECODER 0
+%define CONFIG_VP8_RKMPP_DECODER 0
+%define CONFIG_VP8_V4L2M2M_DECODER 0
+%define CONFIG_VP8_V4L2M2M_ENCODER 0
+%define CONFIG_VP8_VAAPI_ENCODER 0
+%define CONFIG_VP8_VAAPI_HWACCEL 0
+%define CONFIG_VP9_CUVID_DECODER 0
%define CONFIG_VP9_D3D11VA2_HWACCEL 0
+%define CONFIG_VP9_D3D11VA_HWACCEL 0
%define CONFIG_VP9_DXVA2_HWACCEL 0
-%define CONFIG_VP9_MEDIACODEC_HWACCEL 0
+%define CONFIG_VP9_MEDIACODEC_DECODER 0
+%define CONFIG_VP9_NVDEC_HWACCEL 0
+%define CONFIG_VP9_RAW_REORDER_BSF 0
+%define CONFIG_VP9_RKMPP_DECODER 0
+%define CONFIG_VP9_SUPERFRAME_BSF 0
+%define CONFIG_VP9_V4L2M2M_DECODER 0
+%define CONFIG_VP9_VAAPI_ENCODER 0
%define CONFIG_VP9_VAAPI_HWACCEL 0
-%define CONFIG_WMV3_D3D11VA_HWACCEL 0
-%define CONFIG_WMV3_D3D11VA2_HWACCEL 0
-%define CONFIG_WMV3_DXVA2_HWACCEL 0
-%define CONFIG_WMV3_VAAPI_HWACCEL 0
-%define CONFIG_WMV3_VDPAU_HWACCEL 0
-%define CONFIG_ALSA_INDEV 0
-%define CONFIG_AVFOUNDATION_INDEV 0
-%define CONFIG_BKTR_INDEV 0
-%define CONFIG_DECKLINK_INDEV 0
-%define CONFIG_LIBNDI_NEWTEK_INDEV 0
-%define CONFIG_DSHOW_INDEV 0
-%define CONFIG_FBDEV_INDEV 0
-%define CONFIG_GDIGRAB_INDEV 0
-%define CONFIG_IEC61883_INDEV 0
-%define CONFIG_JACK_INDEV 0
-%define CONFIG_KMSGRAB_INDEV 0
-%define CONFIG_LAVFI_INDEV 0
-%define CONFIG_OPENAL_INDEV 0
-%define CONFIG_OSS_INDEV 0
-%define CONFIG_PULSE_INDEV 0
-%define CONFIG_SNDIO_INDEV 0
-%define CONFIG_V4L2_INDEV 0
-%define CONFIG_VFWCAP_INDEV 0
-%define CONFIG_XCBGRAB_INDEV 0
-%define CONFIG_LIBCDIO_INDEV 0
-%define CONFIG_LIBDC1394_INDEV 0
-%define CONFIG_A64_MUXER 0
-%define CONFIG_AC3_MUXER 0
-%define CONFIG_ADTS_MUXER 0
-%define CONFIG_ADX_MUXER 0
-%define CONFIG_AIFF_MUXER 0
-%define CONFIG_AMR_MUXER 0
-%define CONFIG_APNG_MUXER 0
-%define CONFIG_ASF_MUXER 0
-%define CONFIG_ASS_MUXER 0
-%define CONFIG_AST_MUXER 0
-%define CONFIG_ASF_STREAM_MUXER 0
-%define CONFIG_AU_MUXER 0
-%define CONFIG_AVI_MUXER 0
-%define CONFIG_AVM2_MUXER 0
-%define CONFIG_BIT_MUXER 0
-%define CONFIG_CAF_MUXER 0
-%define CONFIG_CAVSVIDEO_MUXER 0
-%define CONFIG_CRC_MUXER 0
-%define CONFIG_DASH_MUXER 0
-%define CONFIG_DATA_MUXER 0
-%define CONFIG_DAUD_MUXER 0
-%define CONFIG_DIRAC_MUXER 0
-%define CONFIG_DNXHD_MUXER 0
-%define CONFIG_DTS_MUXER 0
-%define CONFIG_DV_MUXER 0
-%define CONFIG_EAC3_MUXER 0
-%define CONFIG_F4V_MUXER 0
-%define CONFIG_FFM_MUXER 0
-%define CONFIG_FFMETADATA_MUXER 0
-%define CONFIG_FIFO_MUXER 0
-%define CONFIG_FILMSTRIP_MUXER 0
-%define CONFIG_FITS_MUXER 0
-%define CONFIG_FLAC_MUXER 0
-%define CONFIG_FLV_MUXER 0
-%define CONFIG_FRAMECRC_MUXER 0
-%define CONFIG_FRAMEHASH_MUXER 0
-%define CONFIG_FRAMEMD5_MUXER 0
-%define CONFIG_G722_MUXER 0
-%define CONFIG_G723_1_MUXER 0
-%define CONFIG_G726_MUXER 0
-%define CONFIG_G726LE_MUXER 0
-%define CONFIG_GIF_MUXER 0
-%define CONFIG_GSM_MUXER 0
-%define CONFIG_GXF_MUXER 0
-%define CONFIG_H261_MUXER 0
-%define CONFIG_H263_MUXER 0
-%define CONFIG_H264_MUXER 0
-%define CONFIG_HASH_MUXER 0
-%define CONFIG_HDS_MUXER 0
-%define CONFIG_HEVC_MUXER 0
-%define CONFIG_HLS_MUXER 0
-%define CONFIG_ICO_MUXER 0
-%define CONFIG_ILBC_MUXER 0
-%define CONFIG_IMAGE2_MUXER 0
-%define CONFIG_IMAGE2PIPE_MUXER 0
-%define CONFIG_IPOD_MUXER 0
-%define CONFIG_IRCAM_MUXER 0
-%define CONFIG_ISMV_MUXER 0
-%define CONFIG_IVF_MUXER 0
-%define CONFIG_JACOSUB_MUXER 0
-%define CONFIG_LATM_MUXER 0
-%define CONFIG_LRC_MUXER 0
-%define CONFIG_M4V_MUXER 0
-%define CONFIG_MD5_MUXER 0
-%define CONFIG_MATROSKA_MUXER 0
-%define CONFIG_MATROSKA_AUDIO_MUXER 0
-%define CONFIG_MICRODVD_MUXER 0
-%define CONFIG_MJPEG_MUXER 0
-%define CONFIG_MLP_MUXER 0
-%define CONFIG_MMF_MUXER 0
-%define CONFIG_MOV_MUXER 0
-%define CONFIG_MP2_MUXER 0
-%define CONFIG_MP3_MUXER 0
-%define CONFIG_MP4_MUXER 0
-%define CONFIG_MPEG1SYSTEM_MUXER 0
-%define CONFIG_MPEG1VCD_MUXER 0
-%define CONFIG_MPEG1VIDEO_MUXER 0
-%define CONFIG_MPEG2DVD_MUXER 0
-%define CONFIG_MPEG2SVCD_MUXER 0
-%define CONFIG_MPEG2VIDEO_MUXER 0
-%define CONFIG_MPEG2VOB_MUXER 0
-%define CONFIG_MPEGTS_MUXER 0
-%define CONFIG_MPJPEG_MUXER 0
-%define CONFIG_MXF_MUXER 0
-%define CONFIG_MXF_D10_MUXER 0
-%define CONFIG_MXF_OPATOM_MUXER 0
-%define CONFIG_NULL_MUXER 0
-%define CONFIG_NUT_MUXER 0
-%define CONFIG_OGA_MUXER 0
-%define CONFIG_OGG_MUXER 0
-%define CONFIG_OGV_MUXER 0
-%define CONFIG_OMA_MUXER 0
-%define CONFIG_OPUS_MUXER 0
-%define CONFIG_PCM_ALAW_MUXER 0
-%define CONFIG_PCM_MULAW_MUXER 0
-%define CONFIG_PCM_F64BE_MUXER 0
-%define CONFIG_PCM_F64LE_MUXER 0
-%define CONFIG_PCM_F32BE_MUXER 0
-%define CONFIG_PCM_F32LE_MUXER 0
-%define CONFIG_PCM_S32BE_MUXER 0
-%define CONFIG_PCM_S32LE_MUXER 0
-%define CONFIG_PCM_S24BE_MUXER 0
-%define CONFIG_PCM_S24LE_MUXER 0
-%define CONFIG_PCM_S16BE_MUXER 0
-%define CONFIG_PCM_S16LE_MUXER 0
-%define CONFIG_PCM_S8_MUXER 0
-%define CONFIG_PCM_U32BE_MUXER 0
-%define CONFIG_PCM_U32LE_MUXER 0
-%define CONFIG_PCM_U24BE_MUXER 0
-%define CONFIG_PCM_U24LE_MUXER 0
-%define CONFIG_PCM_U16BE_MUXER 0
-%define CONFIG_PCM_U16LE_MUXER 0
-%define CONFIG_PCM_U8_MUXER 0
-%define CONFIG_PSP_MUXER 0
-%define CONFIG_RAWVIDEO_MUXER 0
-%define CONFIG_RM_MUXER 0
-%define CONFIG_ROQ_MUXER 0
-%define CONFIG_RSO_MUXER 0
-%define CONFIG_RTP_MUXER 0
-%define CONFIG_RTP_MPEGTS_MUXER 0
-%define CONFIG_RTSP_MUXER 0
-%define CONFIG_SAP_MUXER 0
-%define CONFIG_SCC_MUXER 0
-%define CONFIG_SEGMENT_MUXER 0
-%define CONFIG_STREAM_SEGMENT_MUXER 0
-%define CONFIG_SINGLEJPEG_MUXER 0
-%define CONFIG_SMJPEG_MUXER 0
-%define CONFIG_SMOOTHSTREAMING_MUXER 0
-%define CONFIG_SOX_MUXER 0
-%define CONFIG_SPX_MUXER 0
-%define CONFIG_SPDIF_MUXER 0
-%define CONFIG_SRT_MUXER 0
-%define CONFIG_SUP_MUXER 0
-%define CONFIG_SWF_MUXER 0
-%define CONFIG_TEE_MUXER 0
-%define CONFIG_TG2_MUXER 0
-%define CONFIG_TGP_MUXER 0
-%define CONFIG_MKVTIMESTAMP_V2_MUXER 0
-%define CONFIG_TRUEHD_MUXER 0
-%define CONFIG_TTA_MUXER 0
-%define CONFIG_UNCODEDFRAMECRC_MUXER 0
-%define CONFIG_VC1_MUXER 0
-%define CONFIG_VC1T_MUXER 0
-%define CONFIG_VOC_MUXER 0
+%define CONFIG_VPK_DEMUXER 0
+%define CONFIG_VPLAYER_DECODER 0
+%define CONFIG_VPLAYER_DEMUXER 0
+%define CONFIG_VPP_QSV_FILTER 0
+%define CONFIG_VQA_DECODER 0
+%define CONFIG_VQF_DEMUXER 0
+%define CONFIG_VSTACK_FILTER 0
+%define CONFIG_W3FDIF_FILTER 0
+%define CONFIG_W64_DEMUXER 0
%define CONFIG_W64_MUXER 0
+%define CONFIG_WAVEFORM_FILTER 0
+%define CONFIG_WAVPACK_DECODER 0
+%define CONFIG_WAVPACK_ENCODER 0
+%define CONFIG_WAV_DEMUXER 0
%define CONFIG_WAV_MUXER 0
-%define CONFIG_WEBM_MUXER 0
-%define CONFIG_WEBM_DASH_MANIFEST_MUXER 0
+%define CONFIG_WC3_DEMUXER 0
+%define CONFIG_WEAVE_FILTER 0
%define CONFIG_WEBM_CHUNK_MUXER 0
+%define CONFIG_WEBM_DASH_MANIFEST_DEMUXER 0
+%define CONFIG_WEBM_DASH_MANIFEST_MUXER 0
+%define CONFIG_WEBM_MUXER 0
+%define CONFIG_WEBP_DECODER 0
%define CONFIG_WEBP_MUXER 0
+%define CONFIG_WEBVTT_DECODER 0
+%define CONFIG_WEBVTT_DEMUXER 0
+%define CONFIG_WEBVTT_ENCODER 0
%define CONFIG_WEBVTT_MUXER 0
+%define CONFIG_WMALOSSLESS_DECODER 0
+%define CONFIG_WMAPRO_DECODER 0
+%define CONFIG_WMAV1_DECODER 0
+%define CONFIG_WMAV1_ENCODER 0
+%define CONFIG_WMAV2_DECODER 0
+%define CONFIG_WMAV2_ENCODER 0
+%define CONFIG_WMAVOICE_DECODER 0
+%define CONFIG_WMV1_DECODER 0
+%define CONFIG_WMV1_ENCODER 0
+%define CONFIG_WMV2_DECODER 0
+%define CONFIG_WMV2_ENCODER 0
+%define CONFIG_WMV3IMAGE_DECODER 0
+%define CONFIG_WMV3_CRYSTALHD_DECODER 0
+%define CONFIG_WMV3_D3D11VA2_HWACCEL 0
+%define CONFIG_WMV3_D3D11VA_HWACCEL 0
+%define CONFIG_WMV3_DECODER 0
+%define CONFIG_WMV3_DXVA2_HWACCEL 0
+%define CONFIG_WMV3_NVDEC_HWACCEL 0
+%define CONFIG_WMV3_VAAPI_HWACCEL 0
+%define CONFIG_WMV3_VDPAU_HWACCEL 0
+%define CONFIG_WNV1_DECODER 0
+%define CONFIG_WRAPPED_AVFRAME_DECODER 0
+%define CONFIG_WRAPPED_AVFRAME_ENCODER 0
+%define CONFIG_WSAUD_DEMUXER 0
+%define CONFIG_WSD_DEMUXER 0
+%define CONFIG_WSVQA_DEMUXER 0
+%define CONFIG_WS_SND1_DECODER 0
+%define CONFIG_WTV_DEMUXER 0
%define CONFIG_WTV_MUXER 0
+%define CONFIG_WVE_DEMUXER 0
+%define CONFIG_WV_DEMUXER 0
%define CONFIG_WV_MUXER 0
-%define CONFIG_YUV4MPEGPIPE_MUXER 0
-%define CONFIG_CHROMAPRINT_MUXER 0
-%define CONFIG_ALSA_OUTDEV 0
-%define CONFIG_CACA_OUTDEV 0
-%define CONFIG_DECKLINK_OUTDEV 0
-%define CONFIG_LIBNDI_NEWTEK_OUTDEV 0
-%define CONFIG_FBDEV_OUTDEV 0
-%define CONFIG_OPENGL_OUTDEV 0
-%define CONFIG_OSS_OUTDEV 0
-%define CONFIG_PULSE_OUTDEV 0
-%define CONFIG_SDL2_OUTDEV 0
-%define CONFIG_SNDIO_OUTDEV 0
-%define CONFIG_V4L2_OUTDEV 0
-%define CONFIG_XV_OUTDEV 0
-%define CONFIG_AAC_PARSER 0
-%define CONFIG_AAC_LATM_PARSER 0
-%define CONFIG_AC3_PARSER 0
-%define CONFIG_ADX_PARSER 0
-%define CONFIG_BMP_PARSER 0
-%define CONFIG_CAVSVIDEO_PARSER 0
-%define CONFIG_COOK_PARSER 0
-%define CONFIG_DCA_PARSER 0
-%define CONFIG_DIRAC_PARSER 0
-%define CONFIG_DNXHD_PARSER 0
-%define CONFIG_DPX_PARSER 0
-%define CONFIG_DVAUDIO_PARSER 0
-%define CONFIG_DVBSUB_PARSER 0
-%define CONFIG_DVDSUB_PARSER 0
-%define CONFIG_DVD_NAV_PARSER 0
-%define CONFIG_G729_PARSER 0
-%define CONFIG_GSM_PARSER 0
-%define CONFIG_H261_PARSER 0
-%define CONFIG_H263_PARSER 0
-%define CONFIG_H264_PARSER 0
-%define CONFIG_HEVC_PARSER 0
-%define CONFIG_MJPEG_PARSER 0
-%define CONFIG_MLP_PARSER 0
-%define CONFIG_MPEG4VIDEO_PARSER 0
-%define CONFIG_MPEGAUDIO_PARSER 0
-%define CONFIG_MPEGVIDEO_PARSER 0
-%define CONFIG_OPUS_PARSER 0
-%define CONFIG_PNG_PARSER 0
-%define CONFIG_PNM_PARSER 0
-%define CONFIG_RV30_PARSER 0
-%define CONFIG_RV40_PARSER 0
-%define CONFIG_SIPR_PARSER 0
-%define CONFIG_TAK_PARSER 0
-%define CONFIG_VC1_PARSER 0
-%define CONFIG_VORBIS_PARSER 0
-%define CONFIG_VP3_PARSER 0
+%define CONFIG_XAN_DPCM_DECODER 0
+%define CONFIG_XAN_WC3_DECODER 0
+%define CONFIG_XAN_WC4_DECODER 0
+%define CONFIG_XA_DEMUXER 0
+%define CONFIG_XBIN_DECODER 0
+%define CONFIG_XBIN_DEMUXER 0
+%define CONFIG_XBM_DECODER 0
+%define CONFIG_XBM_ENCODER 0
+%define CONFIG_XBR_FILTER 0
+%define CONFIG_XCBGRAB_INDEV 0
+%define CONFIG_XFACE_DECODER 0
+%define CONFIG_XFACE_ENCODER 0
+%define CONFIG_XL_DECODER 0
+%define CONFIG_XMA1_DECODER 0
+%define CONFIG_XMA2_DECODER 0
%define CONFIG_XMA_PARSER 0
-%define CONFIG_ASYNC_PROTOCOL 0
-%define CONFIG_BLURAY_PROTOCOL 0
-%define CONFIG_CACHE_PROTOCOL 0
-%define CONFIG_CONCAT_PROTOCOL 0
-%define CONFIG_CRYPTO_PROTOCOL 0
-%define CONFIG_DATA_PROTOCOL 0
-%define CONFIG_FFRTMPCRYPT_PROTOCOL 0
-%define CONFIG_FFRTMPHTTP_PROTOCOL 0
-%define CONFIG_FILE_PROTOCOL 0
-%define CONFIG_FTP_PROTOCOL 0
-%define CONFIG_GOPHER_PROTOCOL 0
-%define CONFIG_HLS_PROTOCOL 0
-%define CONFIG_HTTP_PROTOCOL 0
-%define CONFIG_HTTPPROXY_PROTOCOL 0
-%define CONFIG_HTTPS_PROTOCOL 0
-%define CONFIG_ICECAST_PROTOCOL 0
-%define CONFIG_MMSH_PROTOCOL 0
-%define CONFIG_MMST_PROTOCOL 0
-%define CONFIG_MD5_PROTOCOL 0
-%define CONFIG_PIPE_PROTOCOL 0
-%define CONFIG_PROMPEG_PROTOCOL 0
-%define CONFIG_RTMP_PROTOCOL 0
-%define CONFIG_RTMPE_PROTOCOL 0
-%define CONFIG_RTMPS_PROTOCOL 0
-%define CONFIG_RTMPT_PROTOCOL 0
-%define CONFIG_RTMPTE_PROTOCOL 0
-%define CONFIG_RTMPTS_PROTOCOL 0
-%define CONFIG_RTP_PROTOCOL 0
-%define CONFIG_SCTP_PROTOCOL 0
-%define CONFIG_SRTP_PROTOCOL 0
-%define CONFIG_SUBFILE_PROTOCOL 0
-%define CONFIG_TEE_PROTOCOL 0
-%define CONFIG_TCP_PROTOCOL 0
-%define CONFIG_TLS_GNUTLS_PROTOCOL 0
-%define CONFIG_TLS_SCHANNEL_PROTOCOL 0
-%define CONFIG_TLS_SECURETRANSPORT_PROTOCOL 0
-%define CONFIG_TLS_OPENSSL_PROTOCOL 0
-%define CONFIG_UDP_PROTOCOL 0
-%define CONFIG_UDPLITE_PROTOCOL 0
-%define CONFIG_UNIX_PROTOCOL 0
-%define CONFIG_LIBRTMP_PROTOCOL 0
-%define CONFIG_LIBRTMPE_PROTOCOL 0
-%define CONFIG_LIBRTMPS_PROTOCOL 0
-%define CONFIG_LIBRTMPT_PROTOCOL 0
-%define CONFIG_LIBRTMPTE_PROTOCOL 0
-%define CONFIG_LIBSSH_PROTOCOL 0
-%define CONFIG_LIBSMBCLIENT_PROTOCOL 0
+%define CONFIG_XMV_DEMUXER 0
+%define CONFIG_XPM_DECODER 0
+%define CONFIG_XSUB_DECODER 0
+%define CONFIG_XSUB_ENCODER 0
+%define CONFIG_XVAG_DEMUXER 0
+%define CONFIG_XV_OUTDEV 0
+%define CONFIG_XWD_DECODER 0
+%define CONFIG_XWD_ENCODER 0
+%define CONFIG_XWMA_DEMUXER 0
+%define CONFIG_Y41P_DECODER 0
+%define CONFIG_Y41P_ENCODER 0
+%define CONFIG_YADIF_FILTER 0
+%define CONFIG_YLC_DECODER 0
+%define CONFIG_YOP_DECODER 0
+%define CONFIG_YOP_DEMUXER 0
+%define CONFIG_YUV4MPEGPIPE_DEMUXER 0
+%define CONFIG_YUV4MPEGPIPE_MUXER 0
+%define CONFIG_YUV4_DECODER 0
+%define CONFIG_YUV4_ENCODER 0
+%define CONFIG_YUVTESTSRC_FILTER 0
+%define CONFIG_ZERO12V_DECODER 0
+%define CONFIG_ZEROCODEC_DECODER 0
+%define CONFIG_ZLIB_DECODER 0
+%define CONFIG_ZLIB_ENCODER 0
+%define CONFIG_ZMBV_DECODER 0
+%define CONFIG_ZMBV_ENCODER 0
+%define CONFIG_ZMQ_FILTER 0
+%define CONFIG_ZOOMPAN_FILTER 0
+%define CONFIG_ZSCALE_FILTER 0
diff --git a/media/ffvpx/defaults_disabled.h b/media/ffvpx/defaults_disabled.h
index e526aa9e7..7f5e23b96 100644
--- a/media/ffvpx/defaults_disabled.h
+++ b/media/ffvpx/defaults_disabled.h
@@ -1,402 +1,40 @@
-#define CONFIG_ENCODERS 0
-#define CONFIG_HWACCELS 0
-#define CONFIG_INDEVS 0
-#define CONFIG_OUTDEVS 0
-#define CONFIG_FILTERS 0
-#define CONFIG_DEMUXERS 0
-#define CONFIG_MUXERS 0
-#define CONFIG_PROTOCOLS 0
-#define CONFIG_FRAME_THREAD_ENCODER 0
-#define CONFIG_AASC_DECODER 0
-#define CONFIG_AIC_DECODER 0
-#define CONFIG_ALIAS_PIX_DECODER 0
-#define CONFIG_AMV_DECODER 0
-#define CONFIG_ANM_DECODER 0
-#define CONFIG_ANSI_DECODER 0
-#define CONFIG_APNG_DECODER 0
-#define CONFIG_ASV1_DECODER 0
-#define CONFIG_ASV2_DECODER 0
-#define CONFIG_AURA_DECODER 0
-#define CONFIG_AURA2_DECODER 0
-#define CONFIG_AVRP_DECODER 0
-#define CONFIG_AVRN_DECODER 0
-#define CONFIG_AVS_DECODER 0
-#define CONFIG_AVUI_DECODER 0
-#define CONFIG_AYUV_DECODER 0
-#define CONFIG_BETHSOFTVID_DECODER 0
-#define CONFIG_BFI_DECODER 0
-#define CONFIG_BINK_DECODER 0
-#define CONFIG_BMP_DECODER 0
-#define CONFIG_BMV_VIDEO_DECODER 0
-#define CONFIG_BRENDER_PIX_DECODER 0
-#define CONFIG_C93_DECODER 0
-#define CONFIG_CAVS_DECODER 0
-#define CONFIG_CDGRAPHICS_DECODER 0
-#define CONFIG_CDXL_DECODER 0
-#define CONFIG_CFHD_DECODER 0
-#define CONFIG_CINEPAK_DECODER 0
-#define CONFIG_CLEARVIDEO_DECODER 0
-#define CONFIG_CLJR_DECODER 0
-#define CONFIG_CLLC_DECODER 0
-#define CONFIG_COMFORTNOISE_DECODER 0
-#define CONFIG_CPIA_DECODER 0
-#define CONFIG_CSCD_DECODER 0
-#define CONFIG_CYUV_DECODER 0
-#define CONFIG_DDS_DECODER 0
-#define CONFIG_DFA_DECODER 0
-#define CONFIG_DIRAC_DECODER 0
-#define CONFIG_DNXHD_DECODER 0
-#define CONFIG_DPX_DECODER 0
-#define CONFIG_DSICINVIDEO_DECODER 0
-#define CONFIG_DVAUDIO_DECODER 0
-#define CONFIG_DVVIDEO_DECODER 0
-#define CONFIG_DXA_DECODER 0
-#define CONFIG_DXTORY_DECODER 0
-#define CONFIG_DXV_DECODER 0
-#define CONFIG_EACMV_DECODER 0
-#define CONFIG_EAMAD_DECODER 0
-#define CONFIG_EATGQ_DECODER 0
-#define CONFIG_EATGV_DECODER 0
-#define CONFIG_EATQI_DECODER 0
-#define CONFIG_EIGHTBPS_DECODER 0
-#define CONFIG_EIGHTSVX_EXP_DECODER 0
-#define CONFIG_EIGHTSVX_FIB_DECODER 0
-#define CONFIG_ESCAPE124_DECODER 0
-#define CONFIG_ESCAPE130_DECODER 0
-#define CONFIG_EXR_DECODER 0
-#define CONFIG_FFV1_DECODER 0
-#define CONFIG_FFVHUFF_DECODER 0
-#define CONFIG_FIC_DECODER 0
-#define CONFIG_FITS_DECODER 0
-#define CONFIG_FLASHSV_DECODER 0
-#define CONFIG_FLASHSV2_DECODER 0
-#define CONFIG_FLIC_DECODER 0
-#define CONFIG_FLV_DECODER 0
-#define CONFIG_FMVC_DECODER 0
-#define CONFIG_FOURXM_DECODER 0
-#define CONFIG_FRAPS_DECODER 0
-#define CONFIG_FRWU_DECODER 0
-#define CONFIG_G2M_DECODER 0
-#define CONFIG_GDV_DECODER 0
-#define CONFIG_GIF_DECODER 0
-#define CONFIG_H261_DECODER 0
-#define CONFIG_H263_DECODER 0
-#define CONFIG_H263I_DECODER 0
-#define CONFIG_H263P_DECODER 0
-#define CONFIG_H263_V4L2M2M_DECODER 0
-#define CONFIG_H264_DECODER 0
-#define CONFIG_H264_CRYSTALHD_DECODER 0
-#define CONFIG_H264_V4L2M2M_DECODER 0
-#define CONFIG_H264_MEDIACODEC_DECODER 0
-#define CONFIG_H264_MMAL_DECODER 0
-#define CONFIG_H264_QSV_DECODER 0
-#define CONFIG_H264_RKMPP_DECODER 0
-#define CONFIG_H264_VDA_DECODER 0
-#define CONFIG_H264_VDPAU_DECODER 0
-#define CONFIG_HAP_DECODER 0
-#define CONFIG_HEVC_DECODER 0
-#define CONFIG_HEVC_QSV_DECODER 0
-#define CONFIG_HEVC_RKMPP_DECODER 0
-#define CONFIG_HEVC_V4L2M2M_DECODER 0
-#define CONFIG_HNM4_VIDEO_DECODER 0
-#define CONFIG_HQ_HQA_DECODER 0
-#define CONFIG_HQX_DECODER 0
-#define CONFIG_HUFFYUV_DECODER 0
-#define CONFIG_IDCIN_DECODER 0
-#define CONFIG_IFF_ILBM_DECODER 0
-#define CONFIG_INDEO2_DECODER 0
-#define CONFIG_INDEO3_DECODER 0
-#define CONFIG_INDEO4_DECODER 0
-#define CONFIG_INDEO5_DECODER 0
-#define CONFIG_INTERPLAY_VIDEO_DECODER 0
-#define CONFIG_JPEG2000_DECODER 0
-#define CONFIG_JPEGLS_DECODER 0
-#define CONFIG_JV_DECODER 0
-#define CONFIG_KGV1_DECODER 0
-#define CONFIG_KMVC_DECODER 0
-#define CONFIG_LAGARITH_DECODER 0
-#define CONFIG_LOCO_DECODER 0
-#define CONFIG_M101_DECODER 0
-#define CONFIG_MAGICYUV_DECODER 0
-#define CONFIG_MDEC_DECODER 0
-#define CONFIG_MIMIC_DECODER 0
-#define CONFIG_MJPEG_DECODER 0
-#define CONFIG_MJPEGB_DECODER 0
-#define CONFIG_MMVIDEO_DECODER 0
-#define CONFIG_MOTIONPIXELS_DECODER 0
-#define CONFIG_MPEG_XVMC_DECODER 0
-#define CONFIG_MPEG1VIDEO_DECODER 0
-#define CONFIG_MPEG2VIDEO_DECODER 0
-#define CONFIG_MPEG4_DECODER 0
-#define CONFIG_MPEG4_CRYSTALHD_DECODER 0
-#define CONFIG_MPEG4_V4L2M2M_DECODER 0
-#define CONFIG_MPEG4_MMAL_DECODER 0
-#define CONFIG_MPEG4_VDPAU_DECODER 0
-#define CONFIG_MPEGVIDEO_DECODER 0
-#define CONFIG_MPEG_VDPAU_DECODER 0
-#define CONFIG_MPEG1_VDPAU_DECODER 0
-#define CONFIG_MPEG1_V4L2M2M_DECODER 0
-#define CONFIG_MPEG2_MMAL_DECODER 0
-#define CONFIG_MPEG2_CRYSTALHD_DECODER 0
-#define CONFIG_MPEG2_V4L2M2M_DECODER 0
-#define CONFIG_MPEG2_QSV_DECODER 0
-#define CONFIG_MPEG2_MEDIACODEC_DECODER 0
-#define CONFIG_MSA1_DECODER 0
-#define CONFIG_MSCC_DECODER 0
-#define CONFIG_MSMPEG4V1_DECODER 0
-#define CONFIG_MSMPEG4V2_DECODER 0
-#define CONFIG_MSMPEG4V3_DECODER 0
-#define CONFIG_MSMPEG4_CRYSTALHD_DECODER 0
-#define CONFIG_MSRLE_DECODER 0
-#define CONFIG_MSS1_DECODER 0
-#define CONFIG_MSS2_DECODER 0
-#define CONFIG_MSVIDEO1_DECODER 0
-#define CONFIG_MSZH_DECODER 0
-#define CONFIG_MTS2_DECODER 0
-#define CONFIG_MVC1_DECODER 0
-#define CONFIG_MVC2_DECODER 0
-#define CONFIG_MXPEG_DECODER 0
-#define CONFIG_NUV_DECODER 0
-#define CONFIG_PAF_VIDEO_DECODER 0
-#define CONFIG_PAM_DECODER 0
-#define CONFIG_PBM_DECODER 0
-#define CONFIG_PCX_DECODER 0
-#define CONFIG_PGM_DECODER 0
-#define CONFIG_PGMYUV_DECODER 0
-#define CONFIG_PICTOR_DECODER 0
-#define CONFIG_PIXLET_DECODER 0
-#define CONFIG_PNG_DECODER 0
-#define CONFIG_PPM_DECODER 0
-#define CONFIG_PRORES_DECODER 0
-#define CONFIG_PRORES_LGPL_DECODER 0
-#define CONFIG_PSD_DECODER 0
-#define CONFIG_PTX_DECODER 0
-#define CONFIG_QDRAW_DECODER 0
-#define CONFIG_QPEG_DECODER 0
-#define CONFIG_QTRLE_DECODER 0
-#define CONFIG_R10K_DECODER 0
-#define CONFIG_R210_DECODER 0
-#define CONFIG_RAWVIDEO_DECODER 0
-#define CONFIG_RL2_DECODER 0
-#define CONFIG_ROQ_DECODER 0
-#define CONFIG_RPZA_DECODER 0
-#define CONFIG_RSCC_DECODER 0
-#define CONFIG_RV10_DECODER 0
-#define CONFIG_RV20_DECODER 0
-#define CONFIG_RV30_DECODER 0
-#define CONFIG_RV40_DECODER 0
-#define CONFIG_S302M_DECODER 0
-#define CONFIG_SANM_DECODER 0
-#define CONFIG_SCPR_DECODER 0
-#define CONFIG_SCREENPRESSO_DECODER 0
-#define CONFIG_SDX2_DPCM_DECODER 0
-#define CONFIG_SGI_DECODER 0
-#define CONFIG_SGIRLE_DECODER 0
-#define CONFIG_SHEERVIDEO_DECODER 0
-#define CONFIG_SMACKER_DECODER 0
-#define CONFIG_SMC_DECODER 0
-#define CONFIG_SMVJPEG_DECODER 0
-#define CONFIG_SNOW_DECODER 0
-#define CONFIG_SP5X_DECODER 0
-#define CONFIG_SPEEDHQ_DECODER 0
-#define CONFIG_SRGC_DECODER 0
-#define CONFIG_SUNRAST_DECODER 0
-#define CONFIG_SVQ1_DECODER 0
-#define CONFIG_SVQ3_DECODER 0
-#define CONFIG_TARGA_DECODER 0
-#define CONFIG_TARGA_Y216_DECODER 0
-#define CONFIG_TDSC_DECODER 0
-#define CONFIG_THEORA_DECODER 0
-#define CONFIG_THP_DECODER 0
-#define CONFIG_TIERTEXSEQVIDEO_DECODER 0
-#define CONFIG_TIFF_DECODER 0
-#define CONFIG_TMV_DECODER 0
-#define CONFIG_TRUEMOTION1_DECODER 0
-#define CONFIG_TRUEMOTION2_DECODER 0
-#define CONFIG_TRUEMOTION2RT_DECODER 0
-#define CONFIG_TSCC_DECODER 0
-#define CONFIG_TSCC2_DECODER 0
-#define CONFIG_TXD_DECODER 0
-#define CONFIG_ULTI_DECODER 0
-#define CONFIG_UTVIDEO_DECODER 0
-#define CONFIG_V210_DECODER 0
-#define CONFIG_V210X_DECODER 0
-#define CONFIG_V308_DECODER 0
-#define CONFIG_V408_DECODER 0
-#define CONFIG_V410_DECODER 0
-#define CONFIG_VB_DECODER 0
-#define CONFIG_VBLE_DECODER 0
-#define CONFIG_VC1_DECODER 0
-#define CONFIG_VC1_CRYSTALHD_DECODER 0
-#define CONFIG_VC1_VDPAU_DECODER 0
-#define CONFIG_VC1IMAGE_DECODER 0
-#define CONFIG_VC1_MMAL_DECODER 0
-#define CONFIG_VC1_QSV_DECODER 0
-#define CONFIG_VC1_V4L2M2M_DECODER 0
-#define CONFIG_VCR1_DECODER 0
-#define CONFIG_VMDVIDEO_DECODER 0
-#define CONFIG_VMNC_DECODER 0
-#define CONFIG_VP3_DECODER 0
-#define CONFIG_VP5_DECODER 0
-#define CONFIG_VP6_DECODER 0
-#define CONFIG_VP6A_DECODER 0
-#define CONFIG_VP6F_DECODER 0
-#define CONFIG_VP7_DECODER 0
-#define CONFIG_VP8_RKMPP_DECODER 0
-#define CONFIG_VP8_V4L2M2M_DECODER 0
-#define CONFIG_VP9_RKMPP_DECODER 0
-#define CONFIG_VP9_V4L2M2M_DECODER 0
-#define CONFIG_VQA_DECODER 0
-#define CONFIG_BITPACKED_DECODER 0
-#define CONFIG_WEBP_DECODER 0
-#define CONFIG_WRAPPED_AVFRAME_DECODER 0
-#define CONFIG_WMV1_DECODER 0
-#define CONFIG_WMV2_DECODER 0
-#define CONFIG_WMV3_DECODER 0
-#define CONFIG_WMV3_CRYSTALHD_DECODER 0
-#define CONFIG_WMV3_VDPAU_DECODER 0
-#define CONFIG_WMV3IMAGE_DECODER 0
-#define CONFIG_WNV1_DECODER 0
-#define CONFIG_XAN_WC3_DECODER 0
-#define CONFIG_XAN_WC4_DECODER 0
-#define CONFIG_XBM_DECODER 0
-#define CONFIG_XFACE_DECODER 0
-#define CONFIG_XL_DECODER 0
-#define CONFIG_XPM_DECODER 0
-#define CONFIG_XWD_DECODER 0
-#define CONFIG_Y41P_DECODER 0
-#define CONFIG_YLC_DECODER 0
-#define CONFIG_YOP_DECODER 0
-#define CONFIG_YUV4_DECODER 0
-#define CONFIG_ZERO12V_DECODER 0
-#define CONFIG_ZEROCODEC_DECODER 0
-#define CONFIG_ZLIB_DECODER 0
-#define CONFIG_ZMBV_DECODER 0
+#define CONFIG_A64MULTI5_ENCODER 0
+#define CONFIG_A64MULTI_ENCODER 0
+#define CONFIG_A64_MUXER 0
+#define CONFIG_AAC_ADTSTOASC_BSF 0
+#define CONFIG_AAC_AT_DECODER 0
+#define CONFIG_AAC_AT_ENCODER 0
#define CONFIG_AAC_DECODER 0
+#define CONFIG_AAC_DEMUXER 0
+#define CONFIG_AAC_ENCODER 0
#define CONFIG_AAC_FIXED_DECODER 0
#define CONFIG_AAC_LATM_DECODER 0
+#define CONFIG_AAC_LATM_PARSER 0
+#define CONFIG_AAC_PARSER 0
+#define CONFIG_AASC_DECODER 0
+#define CONFIG_AA_DEMUXER 0
+#define CONFIG_ABENCH_FILTER 0
+#define CONFIG_ABITSCOPE_FILTER 0
+#define CONFIG_AC3_AT_DECODER 0
#define CONFIG_AC3_DECODER 0
+#define CONFIG_AC3_DEMUXER 0
+#define CONFIG_AC3_ENCODER 0
#define CONFIG_AC3_FIXED_DECODER 0
-#define CONFIG_ALAC_DECODER 0
-#define CONFIG_ALS_DECODER 0
-#define CONFIG_AMRNB_DECODER 0
-#define CONFIG_AMRWB_DECODER 0
-#define CONFIG_APE_DECODER 0
-#define CONFIG_ATRAC1_DECODER 0
-#define CONFIG_ATRAC3_DECODER 0
-#define CONFIG_ATRAC3AL_DECODER 0
-#define CONFIG_ATRAC3P_DECODER 0
-#define CONFIG_ATRAC3PAL_DECODER 0
-#define CONFIG_BINKAUDIO_DCT_DECODER 0
-#define CONFIG_BINKAUDIO_RDFT_DECODER 0
-#define CONFIG_BMV_AUDIO_DECODER 0
-#define CONFIG_COOK_DECODER 0
-#define CONFIG_DCA_DECODER 0
-#define CONFIG_DOLBY_E_DECODER 0
-#define CONFIG_DSD_LSBF_DECODER 0
-#define CONFIG_DSD_MSBF_DECODER 0
-#define CONFIG_DSD_LSBF_PLANAR_DECODER 0
-#define CONFIG_DSD_MSBF_PLANAR_DECODER 0
-#define CONFIG_DSICINAUDIO_DECODER 0
-#define CONFIG_DSS_SP_DECODER 0
-#define CONFIG_DST_DECODER 0
-#define CONFIG_EAC3_DECODER 0
-#define CONFIG_EVRC_DECODER 0
-#define CONFIG_FFWAVESYNTH_DECODER 0
-#define CONFIG_G723_1_DECODER 0
-#define CONFIG_G729_DECODER 0
-#define CONFIG_GSM_DECODER 0
-#define CONFIG_GSM_MS_DECODER 0
-#define CONFIG_IAC_DECODER 0
-#define CONFIG_IMC_DECODER 0
-#define CONFIG_INTERPLAY_ACM_DECODER 0
-#define CONFIG_MACE3_DECODER 0
-#define CONFIG_MACE6_DECODER 0
-#define CONFIG_METASOUND_DECODER 0
-#define CONFIG_MLP_DECODER 0
-#define CONFIG_MP1_DECODER 0
-#define CONFIG_MP1FLOAT_DECODER 0
-#define CONFIG_MP2_DECODER 0
-#define CONFIG_MP2FLOAT_DECODER 0
-#define CONFIG_MP3_DECODER 0
-#define CONFIG_MP3FLOAT_DECODER 0
-#define CONFIG_MP3ADU_DECODER 0
-#define CONFIG_MP3ADUFLOAT_DECODER 0
-#define CONFIG_MP3ON4_DECODER 0
-#define CONFIG_MP3ON4FLOAT_DECODER 0
-#define CONFIG_MPC7_DECODER 0
-#define CONFIG_MPC8_DECODER 0
-#define CONFIG_NELLYMOSER_DECODER 0
-#define CONFIG_ON2AVC_DECODER 0
-#define CONFIG_OPUS_DECODER 0
-#define CONFIG_PAF_AUDIO_DECODER 0
-#define CONFIG_QCELP_DECODER 0
-#define CONFIG_QDM2_DECODER 0
-#define CONFIG_QDMC_DECODER 0
-#define CONFIG_RA_144_DECODER 0
-#define CONFIG_RA_288_DECODER 0
-#define CONFIG_RALF_DECODER 0
-#define CONFIG_SHORTEN_DECODER 0
-#define CONFIG_SIPR_DECODER 0
-#define CONFIG_SMACKAUD_DECODER 0
-#define CONFIG_SONIC_DECODER 0
-#define CONFIG_TAK_DECODER 0
-#define CONFIG_TRUEHD_DECODER 0
-#define CONFIG_TRUESPEECH_DECODER 0
-#define CONFIG_TTA_DECODER 0
-#define CONFIG_TWINVQ_DECODER 0
-#define CONFIG_VMDAUDIO_DECODER 0
-#define CONFIG_VORBIS_DECODER 0
-#define CONFIG_WAVPACK_DECODER 0
-#define CONFIG_WMALOSSLESS_DECODER 0
-#define CONFIG_WMAPRO_DECODER 0
-#define CONFIG_WMAV1_DECODER 0
-#define CONFIG_WMAV2_DECODER 0
-#define CONFIG_WMAVOICE_DECODER 0
-#define CONFIG_WS_SND1_DECODER 0
-#define CONFIG_XMA1_DECODER 0
-#define CONFIG_XMA2_DECODER 0
-#define CONFIG_PCM_ALAW_DECODER 0
-#define CONFIG_PCM_BLURAY_DECODER 0
-#define CONFIG_PCM_DVD_DECODER 0
-#define CONFIG_PCM_F16LE_DECODER 0
-#define CONFIG_PCM_F24LE_DECODER 0
-#define CONFIG_PCM_F32BE_DECODER 0
-#define CONFIG_PCM_F32LE_DECODER 0
-#define CONFIG_PCM_F64BE_DECODER 0
-#define CONFIG_PCM_F64LE_DECODER 0
-#define CONFIG_PCM_LXF_DECODER 0
-#define CONFIG_PCM_MULAW_DECODER 0
-#define CONFIG_PCM_S8_DECODER 0
-#define CONFIG_PCM_S8_PLANAR_DECODER 0
-#define CONFIG_PCM_S16BE_DECODER 0
-#define CONFIG_PCM_S16BE_PLANAR_DECODER 0
-#define CONFIG_PCM_S16LE_DECODER 0
-#define CONFIG_PCM_S16LE_PLANAR_DECODER 0
-#define CONFIG_PCM_S24BE_DECODER 0
-#define CONFIG_PCM_S24DAUD_DECODER 0
-#define CONFIG_PCM_S24LE_DECODER 0
-#define CONFIG_PCM_S24LE_PLANAR_DECODER 0
-#define CONFIG_PCM_S32BE_DECODER 0
-#define CONFIG_PCM_S32LE_DECODER 0
-#define CONFIG_PCM_S32LE_PLANAR_DECODER 0
-#define CONFIG_PCM_S64BE_DECODER 0
-#define CONFIG_PCM_S64LE_DECODER 0
-#define CONFIG_PCM_U8_DECODER 0
-#define CONFIG_PCM_U16BE_DECODER 0
-#define CONFIG_PCM_U16LE_DECODER 0
-#define CONFIG_PCM_U24BE_DECODER 0
-#define CONFIG_PCM_U24LE_DECODER 0
-#define CONFIG_PCM_U32BE_DECODER 0
-#define CONFIG_PCM_U32LE_DECODER 0
-#define CONFIG_PCM_ZORK_DECODER 0
-#define CONFIG_GREMLIN_DPCM_DECODER 0
-#define CONFIG_INTERPLAY_DPCM_DECODER 0
-#define CONFIG_ROQ_DPCM_DECODER 0
-#define CONFIG_SOL_DPCM_DECODER 0
-#define CONFIG_XAN_DPCM_DECODER 0
+#define CONFIG_AC3_FIXED_ENCODER 0
+#define CONFIG_AC3_MUXER 0
+#define CONFIG_AC3_PARSER 0
+#define CONFIG_ACM_DEMUXER 0
+#define CONFIG_ACOMPRESSOR_FILTER 0
+#define CONFIG_ACONTRAST_FILTER 0
+#define CONFIG_ACOPY_FILTER 0
+#define CONFIG_ACROSSFADE_FILTER 0
+#define CONFIG_ACRUSHER_FILTER 0
+#define CONFIG_ACT_DEMUXER 0
+#define CONFIG_ADELAY_FILTER 0
+#define CONFIG_ADF_DEMUXER 0
#define CONFIG_ADPCM_4XM_DECODER 0
#define CONFIG_ADPCM_ADX_DECODER 0
+#define CONFIG_ADPCM_ADX_ENCODER 0
#define CONFIG_ADPCM_AFC_DECODER 0
#define CONFIG_ADPCM_AICA_DECODER 0
#define CONFIG_ADPCM_CT_DECODER 0
@@ -408,8 +46,11 @@
#define CONFIG_ADPCM_EA_R3_DECODER 0
#define CONFIG_ADPCM_EA_XAS_DECODER 0
#define CONFIG_ADPCM_G722_DECODER 0
-#define CONFIG_ADPCM_G726_DECODER 0
+#define CONFIG_ADPCM_G722_ENCODER 0
#define CONFIG_ADPCM_G726LE_DECODER 0
+#define CONFIG_ADPCM_G726LE_ENCODER 0
+#define CONFIG_ADPCM_G726_DECODER 0
+#define CONFIG_ADPCM_G726_ENCODER 0
#define CONFIG_ADPCM_IMA_AMV_DECODER 0
#define CONFIG_ADPCM_IMA_APC_DECODER 0
#define CONFIG_ADPCM_IMA_DAT4_DECODER 0
@@ -419,604 +60,114 @@
#define CONFIG_ADPCM_IMA_EA_SEAD_DECODER 0
#define CONFIG_ADPCM_IMA_ISS_DECODER 0
#define CONFIG_ADPCM_IMA_OKI_DECODER 0
+#define CONFIG_ADPCM_IMA_QT_AT_DECODER 0
#define CONFIG_ADPCM_IMA_QT_DECODER 0
+#define CONFIG_ADPCM_IMA_QT_ENCODER 0
#define CONFIG_ADPCM_IMA_RAD_DECODER 0
#define CONFIG_ADPCM_IMA_SMJPEG_DECODER 0
#define CONFIG_ADPCM_IMA_WAV_DECODER 0
+#define CONFIG_ADPCM_IMA_WAV_ENCODER 0
#define CONFIG_ADPCM_IMA_WS_DECODER 0
#define CONFIG_ADPCM_MS_DECODER 0
+#define CONFIG_ADPCM_MS_ENCODER 0
#define CONFIG_ADPCM_MTAF_DECODER 0
#define CONFIG_ADPCM_PSX_DECODER 0
#define CONFIG_ADPCM_SBPRO_2_DECODER 0
#define CONFIG_ADPCM_SBPRO_3_DECODER 0
#define CONFIG_ADPCM_SBPRO_4_DECODER 0
#define CONFIG_ADPCM_SWF_DECODER 0
+#define CONFIG_ADPCM_SWF_ENCODER 0
#define CONFIG_ADPCM_THP_DECODER 0
#define CONFIG_ADPCM_THP_LE_DECODER 0
#define CONFIG_ADPCM_VIMA_DECODER 0
#define CONFIG_ADPCM_XA_DECODER 0
#define CONFIG_ADPCM_YAMAHA_DECODER 0
-#define CONFIG_SSA_DECODER 0
-#define CONFIG_ASS_DECODER 0
-#define CONFIG_CCAPTION_DECODER 0
-#define CONFIG_DVBSUB_DECODER 0
-#define CONFIG_DVDSUB_DECODER 0
-#define CONFIG_JACOSUB_DECODER 0
-#define CONFIG_MICRODVD_DECODER 0
-#define CONFIG_MOVTEXT_DECODER 0
-#define CONFIG_MPL2_DECODER 0
-#define CONFIG_PGSSUB_DECODER 0
-#define CONFIG_PJS_DECODER 0
-#define CONFIG_REALTEXT_DECODER 0
-#define CONFIG_SAMI_DECODER 0
-#define CONFIG_SRT_DECODER 0
-#define CONFIG_STL_DECODER 0
-#define CONFIG_SUBRIP_DECODER 0
-#define CONFIG_SUBVIEWER_DECODER 0
-#define CONFIG_SUBVIEWER1_DECODER 0
-#define CONFIG_TEXT_DECODER 0
-#define CONFIG_VPLAYER_DECODER 0
-#define CONFIG_WEBVTT_DECODER 0
-#define CONFIG_XSUB_DECODER 0
-#define CONFIG_AAC_AT_DECODER 0
-#define CONFIG_AC3_AT_DECODER 0
-#define CONFIG_ADPCM_IMA_QT_AT_DECODER 0
-#define CONFIG_ALAC_AT_DECODER 0
-#define CONFIG_AMR_NB_AT_DECODER 0
-#define CONFIG_EAC3_AT_DECODER 0
-#define CONFIG_GSM_MS_AT_DECODER 0
-#define CONFIG_ILBC_AT_DECODER 0
-#define CONFIG_MP1_AT_DECODER 0
-#define CONFIG_MP2_AT_DECODER 0
-#define CONFIG_MP3_AT_DECODER 0
-#define CONFIG_PCM_ALAW_AT_DECODER 0
-#define CONFIG_PCM_MULAW_AT_DECODER 0
-#define CONFIG_QDMC_AT_DECODER 0
-#define CONFIG_QDM2_AT_DECODER 0
-#define CONFIG_LIBCELT_DECODER 0
-#define CONFIG_LIBFDK_AAC_DECODER 0
-#define CONFIG_LIBGSM_DECODER 0
-#define CONFIG_LIBGSM_MS_DECODER 0
-#define CONFIG_LIBILBC_DECODER 0
-#define CONFIG_LIBOPENCORE_AMRNB_DECODER 0
-#define CONFIG_LIBOPENCORE_AMRWB_DECODER 0
-#define CONFIG_LIBOPENJPEG_DECODER 0
-#define CONFIG_LIBOPUS_DECODER 0
-#define CONFIG_LIBRSVG_DECODER 0
-#define CONFIG_LIBSPEEX_DECODER 0
-#define CONFIG_LIBVORBIS_DECODER 0
-#define CONFIG_LIBVPX_VP8_DECODER 0
-#define CONFIG_LIBVPX_VP9_DECODER 0
-#define CONFIG_LIBZVBI_TELETEXT_DECODER 0
-#define CONFIG_BINTEXT_DECODER 0
-#define CONFIG_XBIN_DECODER 0
-#define CONFIG_IDF_DECODER 0
-#define CONFIG_LIBOPENH264_DECODER 0
-#define CONFIG_H264_CUVID_DECODER 0
-#define CONFIG_HEVC_CUVID_DECODER 0
-#define CONFIG_HEVC_MEDIACODEC_DECODER 0
-#define CONFIG_MJPEG_CUVID_DECODER 0
-#define CONFIG_MPEG1_CUVID_DECODER 0
-#define CONFIG_MPEG2_CUVID_DECODER 0
-#define CONFIG_MPEG4_CUVID_DECODER 0
-#define CONFIG_MPEG4_MEDIACODEC_DECODER 0
-#define CONFIG_VC1_CUVID_DECODER 0
-#define CONFIG_VP8_CUVID_DECODER 0
-#define CONFIG_VP8_MEDIACODEC_DECODER 0
-#define CONFIG_VP8_QSV_DECODER 0
-#define CONFIG_VP9_CUVID_DECODER 0
-#define CONFIG_VP9_MEDIACODEC_DECODER 0
-#define CONFIG_AA_DEMUXER 0
-#define CONFIG_AAC_DEMUXER 0
-#define CONFIG_AC3_DEMUXER 0
-#define CONFIG_ACM_DEMUXER 0
-#define CONFIG_ACT_DEMUXER 0
-#define CONFIG_ADF_DEMUXER 0
+#define CONFIG_ADPCM_YAMAHA_ENCODER 0
#define CONFIG_ADP_DEMUXER 0
+#define CONFIG_ADRAWGRAPH_FILTER 0
#define CONFIG_ADS_DEMUXER 0
+#define CONFIG_ADTS_MUXER 0
#define CONFIG_ADX_DEMUXER 0
+#define CONFIG_ADX_MUXER 0
+#define CONFIG_ADX_PARSER 0
#define CONFIG_AEA_DEMUXER 0
-#define CONFIG_AFC_DEMUXER 0
-#define CONFIG_AIFF_DEMUXER 0
-#define CONFIG_AIX_DEMUXER 0
-#define CONFIG_AMR_DEMUXER 0
-#define CONFIG_ANM_DEMUXER 0
-#define CONFIG_APC_DEMUXER 0
-#define CONFIG_APE_DEMUXER 0
-#define CONFIG_APNG_DEMUXER 0
-#define CONFIG_AQTITLE_DEMUXER 0
-#define CONFIG_ASF_DEMUXER 0
-#define CONFIG_ASF_O_DEMUXER 0
-#define CONFIG_ASS_DEMUXER 0
-#define CONFIG_AST_DEMUXER 0
-#define CONFIG_AU_DEMUXER 0
-#define CONFIG_AVI_DEMUXER 0
-#define CONFIG_AVISYNTH_DEMUXER 0
-#define CONFIG_AVR_DEMUXER 0
-#define CONFIG_AVS_DEMUXER 0
-#define CONFIG_BETHSOFTVID_DEMUXER 0
-#define CONFIG_BFI_DEMUXER 0
-#define CONFIG_BINTEXT_DEMUXER 0
-#define CONFIG_BINK_DEMUXER 0
-#define CONFIG_BIT_DEMUXER 0
-#define CONFIG_BMV_DEMUXER 0
-#define CONFIG_BFSTM_DEMUXER 0
-#define CONFIG_BRSTM_DEMUXER 0
-#define CONFIG_BOA_DEMUXER 0
-#define CONFIG_C93_DEMUXER 0
-#define CONFIG_CAF_DEMUXER 0
-#define CONFIG_CAVSVIDEO_DEMUXER 0
-#define CONFIG_CDG_DEMUXER 0
-#define CONFIG_CDXL_DEMUXER 0
-#define CONFIG_CINE_DEMUXER 0
-#define CONFIG_CONCAT_DEMUXER 0
-#define CONFIG_DASH_DEMUXER 0
-#define CONFIG_DATA_DEMUXER 0
-#define CONFIG_DAUD_DEMUXER 0
-#define CONFIG_DCSTR_DEMUXER 0
-#define CONFIG_DFA_DEMUXER 0
-#define CONFIG_DIRAC_DEMUXER 0
-#define CONFIG_DNXHD_DEMUXER 0
-#define CONFIG_DSF_DEMUXER 0
-#define CONFIG_DSICIN_DEMUXER 0
-#define CONFIG_DSS_DEMUXER 0
-#define CONFIG_DTS_DEMUXER 0
-#define CONFIG_DTSHD_DEMUXER 0
-#define CONFIG_DV_DEMUXER 0
-#define CONFIG_DVBSUB_DEMUXER 0
-#define CONFIG_DVBTXT_DEMUXER 0
-#define CONFIG_DXA_DEMUXER 0
-#define CONFIG_EA_DEMUXER 0
-#define CONFIG_EA_CDATA_DEMUXER 0
-#define CONFIG_EAC3_DEMUXER 0
-#define CONFIG_EPAF_DEMUXER 0
-#define CONFIG_FFM_DEMUXER 0
-#define CONFIG_FFMETADATA_DEMUXER 0
-#define CONFIG_FILMSTRIP_DEMUXER 0
-#define CONFIG_FITS_DEMUXER 0
-#define CONFIG_FLAC_DEMUXER 0
-#define CONFIG_FLIC_DEMUXER 0
-#define CONFIG_FLV_DEMUXER 0
-#define CONFIG_LIVE_FLV_DEMUXER 0
-#define CONFIG_FOURXM_DEMUXER 0
-#define CONFIG_FRM_DEMUXER 0
-#define CONFIG_FSB_DEMUXER 0
-#define CONFIG_G722_DEMUXER 0
-#define CONFIG_G723_1_DEMUXER 0
-#define CONFIG_G726_DEMUXER 0
-#define CONFIG_G726LE_DEMUXER 0
-#define CONFIG_G729_DEMUXER 0
-#define CONFIG_GDV_DEMUXER 0
-#define CONFIG_GENH_DEMUXER 0
-#define CONFIG_GIF_DEMUXER 0
-#define CONFIG_GSM_DEMUXER 0
-#define CONFIG_GXF_DEMUXER 0
-#define CONFIG_H261_DEMUXER 0
-#define CONFIG_H263_DEMUXER 0
-#define CONFIG_H264_DEMUXER 0
-#define CONFIG_HEVC_DEMUXER 0
-#define CONFIG_HLS_DEMUXER 0
-#define CONFIG_HNM_DEMUXER 0
-#define CONFIG_ICO_DEMUXER 0
-#define CONFIG_IDCIN_DEMUXER 0
-#define CONFIG_IDF_DEMUXER 0
-#define CONFIG_IFF_DEMUXER 0
-#define CONFIG_ILBC_DEMUXER 0
-#define CONFIG_IMAGE2_DEMUXER 0
-#define CONFIG_IMAGE2PIPE_DEMUXER 0
-#define CONFIG_IMAGE2_ALIAS_PIX_DEMUXER 0
-#define CONFIG_IMAGE2_BRENDER_PIX_DEMUXER 0
-#define CONFIG_INGENIENT_DEMUXER 0
-#define CONFIG_IPMOVIE_DEMUXER 0
-#define CONFIG_IRCAM_DEMUXER 0
-#define CONFIG_ISS_DEMUXER 0
-#define CONFIG_IV8_DEMUXER 0
-#define CONFIG_IVF_DEMUXER 0
-#define CONFIG_IVR_DEMUXER 0
-#define CONFIG_JACOSUB_DEMUXER 0
-#define CONFIG_JV_DEMUXER 0
-#define CONFIG_LMLM4_DEMUXER 0
-#define CONFIG_LOAS_DEMUXER 0
-#define CONFIG_LRC_DEMUXER 0
-#define CONFIG_LVF_DEMUXER 0
-#define CONFIG_LXF_DEMUXER 0
-#define CONFIG_M4V_DEMUXER 0
-#define CONFIG_MATROSKA_DEMUXER 0
-#define CONFIG_MGSTS_DEMUXER 0
-#define CONFIG_MICRODVD_DEMUXER 0
-#define CONFIG_MJPEG_DEMUXER 0
-#define CONFIG_MJPEG_2000_DEMUXER 0
-#define CONFIG_MLP_DEMUXER 0
-#define CONFIG_MLV_DEMUXER 0
-#define CONFIG_MM_DEMUXER 0
-#define CONFIG_MMF_DEMUXER 0
-#define CONFIG_MOV_DEMUXER 0
-#define CONFIG_MP3_DEMUXER 0
-#define CONFIG_MPC_DEMUXER 0
-#define CONFIG_MPC8_DEMUXER 0
-#define CONFIG_MPEGPS_DEMUXER 0
-#define CONFIG_MPEGTS_DEMUXER 0
-#define CONFIG_MPEGTSRAW_DEMUXER 0
-#define CONFIG_MPEGVIDEO_DEMUXER 0
-#define CONFIG_MPJPEG_DEMUXER 0
-#define CONFIG_MPL2_DEMUXER 0
-#define CONFIG_MPSUB_DEMUXER 0
-#define CONFIG_MSF_DEMUXER 0
-#define CONFIG_MSNWC_TCP_DEMUXER 0
-#define CONFIG_MTAF_DEMUXER 0
-#define CONFIG_MTV_DEMUXER 0
-#define CONFIG_MUSX_DEMUXER 0
-#define CONFIG_MV_DEMUXER 0
-#define CONFIG_MVI_DEMUXER 0
-#define CONFIG_MXF_DEMUXER 0
-#define CONFIG_MXG_DEMUXER 0
-#define CONFIG_NC_DEMUXER 0
-#define CONFIG_NISTSPHERE_DEMUXER 0
-#define CONFIG_NSV_DEMUXER 0
-#define CONFIG_NUT_DEMUXER 0
-#define CONFIG_NUV_DEMUXER 0
-#define CONFIG_OGG_DEMUXER 0
-#define CONFIG_OMA_DEMUXER 0
-#define CONFIG_PAF_DEMUXER 0
-#define CONFIG_PCM_ALAW_DEMUXER 0
-#define CONFIG_PCM_MULAW_DEMUXER 0
-#define CONFIG_PCM_F64BE_DEMUXER 0
-#define CONFIG_PCM_F64LE_DEMUXER 0
-#define CONFIG_PCM_F32BE_DEMUXER 0
-#define CONFIG_PCM_F32LE_DEMUXER 0
-#define CONFIG_PCM_S32BE_DEMUXER 0
-#define CONFIG_PCM_S32LE_DEMUXER 0
-#define CONFIG_PCM_S24BE_DEMUXER 0
-#define CONFIG_PCM_S24LE_DEMUXER 0
-#define CONFIG_PCM_S16BE_DEMUXER 0
-#define CONFIG_PCM_S16LE_DEMUXER 0
-#define CONFIG_PCM_S8_DEMUXER 0
-#define CONFIG_PCM_U32BE_DEMUXER 0
-#define CONFIG_PCM_U32LE_DEMUXER 0
-#define CONFIG_PCM_U24BE_DEMUXER 0
-#define CONFIG_PCM_U24LE_DEMUXER 0
-#define CONFIG_PCM_U16BE_DEMUXER 0
-#define CONFIG_PCM_U16LE_DEMUXER 0
-#define CONFIG_PCM_U8_DEMUXER 0
-#define CONFIG_PJS_DEMUXER 0
-#define CONFIG_PMP_DEMUXER 0
-#define CONFIG_PVA_DEMUXER 0
-#define CONFIG_PVF_DEMUXER 0
-#define CONFIG_QCP_DEMUXER 0
-#define CONFIG_R3D_DEMUXER 0
-#define CONFIG_RAWVIDEO_DEMUXER 0
-#define CONFIG_REALTEXT_DEMUXER 0
-#define CONFIG_REDSPARK_DEMUXER 0
-#define CONFIG_RL2_DEMUXER 0
-#define CONFIG_RM_DEMUXER 0
-#define CONFIG_ROQ_DEMUXER 0
-#define CONFIG_RPL_DEMUXER 0
-#define CONFIG_RSD_DEMUXER 0
-#define CONFIG_RSO_DEMUXER 0
-#define CONFIG_RTP_DEMUXER 0
-#define CONFIG_RTSP_DEMUXER 0
-#define CONFIG_S337M_DEMUXER 0
-#define CONFIG_SAMI_DEMUXER 0
-#define CONFIG_SAP_DEMUXER 0
-#define CONFIG_SBG_DEMUXER 0
-#define CONFIG_SCC_DEMUXER 0
-#define CONFIG_SDP_DEMUXER 0
-#define CONFIG_SDR2_DEMUXER 0
-#define CONFIG_SDS_DEMUXER 0
-#define CONFIG_SDX_DEMUXER 0
-#define CONFIG_SEGAFILM_DEMUXER 0
-#define CONFIG_SHORTEN_DEMUXER 0
-#define CONFIG_SIFF_DEMUXER 0
-#define CONFIG_SLN_DEMUXER 0
-#define CONFIG_SMACKER_DEMUXER 0
-#define CONFIG_SMJPEG_DEMUXER 0
-#define CONFIG_SMUSH_DEMUXER 0
-#define CONFIG_SOL_DEMUXER 0
-#define CONFIG_SOX_DEMUXER 0
-#define CONFIG_SPDIF_DEMUXER 0
-#define CONFIG_SRT_DEMUXER 0
-#define CONFIG_STR_DEMUXER 0
-#define CONFIG_STL_DEMUXER 0
-#define CONFIG_SUBVIEWER1_DEMUXER 0
-#define CONFIG_SUBVIEWER_DEMUXER 0
-#define CONFIG_SUP_DEMUXER 0
-#define CONFIG_SVAG_DEMUXER 0
-#define CONFIG_SWF_DEMUXER 0
-#define CONFIG_TAK_DEMUXER 0
-#define CONFIG_TEDCAPTIONS_DEMUXER 0
-#define CONFIG_THP_DEMUXER 0
-#define CONFIG_THREEDOSTR_DEMUXER 0
-#define CONFIG_TIERTEXSEQ_DEMUXER 0
-#define CONFIG_TMV_DEMUXER 0
-#define CONFIG_TRUEHD_DEMUXER 0
-#define CONFIG_TTA_DEMUXER 0
-#define CONFIG_TXD_DEMUXER 0
-#define CONFIG_TTY_DEMUXER 0
-#define CONFIG_V210_DEMUXER 0
-#define CONFIG_V210X_DEMUXER 0
-#define CONFIG_VAG_DEMUXER 0
-#define CONFIG_VC1_DEMUXER 0
-#define CONFIG_VC1T_DEMUXER 0
-#define CONFIG_VIVO_DEMUXER 0
-#define CONFIG_VMD_DEMUXER 0
-#define CONFIG_VOBSUB_DEMUXER 0
-#define CONFIG_VOC_DEMUXER 0
-#define CONFIG_VPK_DEMUXER 0
-#define CONFIG_VPLAYER_DEMUXER 0
-#define CONFIG_VQF_DEMUXER 0
-#define CONFIG_W64_DEMUXER 0
-#define CONFIG_WAV_DEMUXER 0
-#define CONFIG_WC3_DEMUXER 0
-#define CONFIG_WEBM_DASH_MANIFEST_DEMUXER 0
-#define CONFIG_WEBVTT_DEMUXER 0
-#define CONFIG_WSAUD_DEMUXER 0
-#define CONFIG_WSD_DEMUXER 0
-#define CONFIG_WSVQA_DEMUXER 0
-#define CONFIG_WTV_DEMUXER 0
-#define CONFIG_WVE_DEMUXER 0
-#define CONFIG_WV_DEMUXER 0
-#define CONFIG_XA_DEMUXER 0
-#define CONFIG_XBIN_DEMUXER 0
-#define CONFIG_XMV_DEMUXER 0
-#define CONFIG_XVAG_DEMUXER 0
-#define CONFIG_XWMA_DEMUXER 0
-#define CONFIG_YOP_DEMUXER 0
-#define CONFIG_YUV4MPEGPIPE_DEMUXER 0
-#define CONFIG_IMAGE_BMP_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_DDS_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_DPX_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_EXR_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_J2K_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_JPEG_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_JPEGLS_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PAM_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PBM_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PCX_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PGMYUV_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PGM_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PICTOR_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PNG_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PPM_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_PSD_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_QDRAW_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_SGI_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_SVG_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_SUNRAST_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_TIFF_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_WEBP_PIPE_DEMUXER 0
-#define CONFIG_IMAGE_XPM_PIPE_DEMUXER 0
-#define CONFIG_LIBGME_DEMUXER 0
-#define CONFIG_LIBMODPLUG_DEMUXER 0
-#define CONFIG_LIBOPENMPT_DEMUXER 0
-#define CONFIG_A64MULTI_ENCODER 0
-#define CONFIG_A64MULTI5_ENCODER 0
-#define CONFIG_ALIAS_PIX_ENCODER 0
-#define CONFIG_AMV_ENCODER 0
-#define CONFIG_APNG_ENCODER 0
-#define CONFIG_ASV1_ENCODER 0
-#define CONFIG_ASV2_ENCODER 0
-#define CONFIG_AVRP_ENCODER 0
-#define CONFIG_AVUI_ENCODER 0
-#define CONFIG_AYUV_ENCODER 0
-#define CONFIG_BMP_ENCODER 0
-#define CONFIG_CINEPAK_ENCODER 0
-#define CONFIG_CLJR_ENCODER 0
-#define CONFIG_COMFORTNOISE_ENCODER 0
-#define CONFIG_DNXHD_ENCODER 0
-#define CONFIG_DPX_ENCODER 0
-#define CONFIG_DVVIDEO_ENCODER 0
-#define CONFIG_FFV1_ENCODER 0
-#define CONFIG_FFVHUFF_ENCODER 0
-#define CONFIG_FITS_ENCODER 0
-#define CONFIG_FLASHSV_ENCODER 0
-#define CONFIG_FLASHSV2_ENCODER 0
-#define CONFIG_FLV_ENCODER 0
-#define CONFIG_GIF_ENCODER 0
-#define CONFIG_H261_ENCODER 0
-#define CONFIG_H263_ENCODER 0
-#define CONFIG_H263P_ENCODER 0
-#define CONFIG_HAP_ENCODER 0
-#define CONFIG_HUFFYUV_ENCODER 0
-#define CONFIG_JPEG2000_ENCODER 0
-#define CONFIG_JPEGLS_ENCODER 0
-#define CONFIG_LJPEG_ENCODER 0
-#define CONFIG_MJPEG_ENCODER 0
-#define CONFIG_MPEG1VIDEO_ENCODER 0
-#define CONFIG_MPEG2VIDEO_ENCODER 0
-#define CONFIG_MPEG4_ENCODER 0
-#define CONFIG_MSMPEG4V2_ENCODER 0
-#define CONFIG_MSMPEG4V3_ENCODER 0
-#define CONFIG_MSVIDEO1_ENCODER 0
-#define CONFIG_PAM_ENCODER 0
-#define CONFIG_PBM_ENCODER 0
-#define CONFIG_PCX_ENCODER 0
-#define CONFIG_PGM_ENCODER 0
-#define CONFIG_PGMYUV_ENCODER 0
-#define CONFIG_PNG_ENCODER 0
-#define CONFIG_PPM_ENCODER 0
-#define CONFIG_PRORES_ENCODER 0
-#define CONFIG_PRORES_AW_ENCODER 0
-#define CONFIG_PRORES_KS_ENCODER 0
-#define CONFIG_QTRLE_ENCODER 0
-#define CONFIG_R10K_ENCODER 0
-#define CONFIG_R210_ENCODER 0
-#define CONFIG_RAWVIDEO_ENCODER 0
-#define CONFIG_ROQ_ENCODER 0
-#define CONFIG_RV10_ENCODER 0
-#define CONFIG_RV20_ENCODER 0
-#define CONFIG_S302M_ENCODER 0
-#define CONFIG_SGI_ENCODER 0
-#define CONFIG_SNOW_ENCODER 0
-#define CONFIG_SUNRAST_ENCODER 0
-#define CONFIG_SVQ1_ENCODER 0
-#define CONFIG_TARGA_ENCODER 0
-#define CONFIG_TIFF_ENCODER 0
-#define CONFIG_UTVIDEO_ENCODER 0
-#define CONFIG_V210_ENCODER 0
-#define CONFIG_V308_ENCODER 0
-#define CONFIG_V408_ENCODER 0
-#define CONFIG_V410_ENCODER 0
-#define CONFIG_VC2_ENCODER 0
-#define CONFIG_WRAPPED_AVFRAME_ENCODER 0
-#define CONFIG_WMV1_ENCODER 0
-#define CONFIG_WMV2_ENCODER 0
-#define CONFIG_XBM_ENCODER 0
-#define CONFIG_XFACE_ENCODER 0
-#define CONFIG_XWD_ENCODER 0
-#define CONFIG_Y41P_ENCODER 0
-#define CONFIG_YUV4_ENCODER 0
-#define CONFIG_ZLIB_ENCODER 0
-#define CONFIG_ZMBV_ENCODER 0
-#define CONFIG_AAC_ENCODER 0
-#define CONFIG_AC3_ENCODER 0
-#define CONFIG_AC3_FIXED_ENCODER 0
-#define CONFIG_ALAC_ENCODER 0
-#define CONFIG_DCA_ENCODER 0
-#define CONFIG_EAC3_ENCODER 0
-#define CONFIG_FLAC_ENCODER 0
-#define CONFIG_G723_1_ENCODER 0
-#define CONFIG_MLP_ENCODER 0
-#define CONFIG_MP2_ENCODER 0
-#define CONFIG_MP2FIXED_ENCODER 0
-#define CONFIG_NELLYMOSER_ENCODER 0
-#define CONFIG_OPUS_ENCODER 0
-#define CONFIG_RA_144_ENCODER 0
-#define CONFIG_SONIC_ENCODER 0
-#define CONFIG_SONIC_LS_ENCODER 0
-#define CONFIG_TRUEHD_ENCODER 0
-#define CONFIG_TTA_ENCODER 0
-#define CONFIG_VORBIS_ENCODER 0
-#define CONFIG_WAVPACK_ENCODER 0
-#define CONFIG_WMAV1_ENCODER 0
-#define CONFIG_WMAV2_ENCODER 0
-#define CONFIG_PCM_ALAW_ENCODER 0
-#define CONFIG_PCM_F32BE_ENCODER 0
-#define CONFIG_PCM_F32LE_ENCODER 0
-#define CONFIG_PCM_F64BE_ENCODER 0
-#define CONFIG_PCM_F64LE_ENCODER 0
-#define CONFIG_PCM_MULAW_ENCODER 0
-#define CONFIG_PCM_S8_ENCODER 0
-#define CONFIG_PCM_S8_PLANAR_ENCODER 0
-#define CONFIG_PCM_S16BE_ENCODER 0
-#define CONFIG_PCM_S16BE_PLANAR_ENCODER 0
-#define CONFIG_PCM_S16LE_ENCODER 0
-#define CONFIG_PCM_S16LE_PLANAR_ENCODER 0
-#define CONFIG_PCM_S24BE_ENCODER 0
-#define CONFIG_PCM_S24DAUD_ENCODER 0
-#define CONFIG_PCM_S24LE_ENCODER 0
-#define CONFIG_PCM_S24LE_PLANAR_ENCODER 0
-#define CONFIG_PCM_S32BE_ENCODER 0
-#define CONFIG_PCM_S32LE_ENCODER 0
-#define CONFIG_PCM_S32LE_PLANAR_ENCODER 0
-#define CONFIG_PCM_S64BE_ENCODER 0
-#define CONFIG_PCM_S64LE_ENCODER 0
-#define CONFIG_PCM_U8_ENCODER 0
-#define CONFIG_PCM_U16BE_ENCODER 0
-#define CONFIG_PCM_U16LE_ENCODER 0
-#define CONFIG_PCM_U24BE_ENCODER 0
-#define CONFIG_PCM_U24LE_ENCODER 0
-#define CONFIG_PCM_U32BE_ENCODER 0
-#define CONFIG_PCM_U32LE_ENCODER 0
-#define CONFIG_ROQ_DPCM_ENCODER 0
-#define CONFIG_ADPCM_ADX_ENCODER 0
-#define CONFIG_ADPCM_G722_ENCODER 0
-#define CONFIG_ADPCM_G726_ENCODER 0
-#define CONFIG_ADPCM_G726LE_ENCODER 0
-#define CONFIG_ADPCM_IMA_QT_ENCODER 0
-#define CONFIG_ADPCM_IMA_WAV_ENCODER 0
-#define CONFIG_ADPCM_MS_ENCODER 0
-#define CONFIG_ADPCM_SWF_ENCODER 0
-#define CONFIG_ADPCM_YAMAHA_ENCODER 0
-#define CONFIG_SSA_ENCODER 0
-#define CONFIG_ASS_ENCODER 0
-#define CONFIG_DVBSUB_ENCODER 0
-#define CONFIG_DVDSUB_ENCODER 0
-#define CONFIG_MOVTEXT_ENCODER 0
-#define CONFIG_SRT_ENCODER 0
-#define CONFIG_SUBRIP_ENCODER 0
-#define CONFIG_TEXT_ENCODER 0
-#define CONFIG_WEBVTT_ENCODER 0
-#define CONFIG_XSUB_ENCODER 0
-#define CONFIG_AAC_AT_ENCODER 0
-#define CONFIG_ALAC_AT_ENCODER 0
-#define CONFIG_ILBC_AT_ENCODER 0
-#define CONFIG_PCM_ALAW_AT_ENCODER 0
-#define CONFIG_PCM_MULAW_AT_ENCODER 0
-#define CONFIG_LIBFDK_AAC_ENCODER 0
-#define CONFIG_LIBGSM_ENCODER 0
-#define CONFIG_LIBGSM_MS_ENCODER 0
-#define CONFIG_LIBILBC_ENCODER 0
-#define CONFIG_LIBMP3LAME_ENCODER 0
-#define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
-#define CONFIG_LIBOPENJPEG_ENCODER 0
-#define CONFIG_LIBOPUS_ENCODER 0
-#define CONFIG_LIBSHINE_ENCODER 0
-#define CONFIG_LIBSPEEX_ENCODER 0
-#define CONFIG_LIBTHEORA_ENCODER 0
-#define CONFIG_LIBTWOLAME_ENCODER 0
-#define CONFIG_LIBVO_AMRWBENC_ENCODER 0
-#define CONFIG_LIBVORBIS_ENCODER 0
-#define CONFIG_LIBVPX_VP8_ENCODER 0
-#define CONFIG_LIBVPX_VP9_ENCODER 0
-#define CONFIG_LIBWAVPACK_ENCODER 0
-#define CONFIG_LIBWEBP_ANIM_ENCODER 0
-#define CONFIG_LIBWEBP_ENCODER 0
-#define CONFIG_LIBX262_ENCODER 0
-#define CONFIG_LIBX264_ENCODER 0
-#define CONFIG_LIBX264RGB_ENCODER 0
-#define CONFIG_LIBX265_ENCODER 0
-#define CONFIG_LIBXAVS_ENCODER 0
-#define CONFIG_LIBXVID_ENCODER 0
-#define CONFIG_H263_V4L2M2M_ENCODER 0
-#define CONFIG_LIBOPENH264_ENCODER 0
-#define CONFIG_H264_NVENC_ENCODER 0
-#define CONFIG_H264_OMX_ENCODER 0
-#define CONFIG_H264_QSV_ENCODER 0
-#define CONFIG_H264_V4L2M2M_ENCODER 0
-#define CONFIG_H264_VAAPI_ENCODER 0
-#define CONFIG_H264_VIDEOTOOLBOX_ENCODER 0
-#define CONFIG_NVENC_ENCODER 0
-#define CONFIG_NVENC_H264_ENCODER 0
-#define CONFIG_NVENC_HEVC_ENCODER 0
-#define CONFIG_HEVC_NVENC_ENCODER 0
-#define CONFIG_HEVC_QSV_ENCODER 0
-#define CONFIG_HEVC_V4L2M2M_ENCODER 0
-#define CONFIG_HEVC_VAAPI_ENCODER 0
-#define CONFIG_LIBKVAZAAR_ENCODER 0
-#define CONFIG_MJPEG_VAAPI_ENCODER 0
-#define CONFIG_MPEG2_QSV_ENCODER 0
-#define CONFIG_MPEG2_VAAPI_ENCODER 0
-#define CONFIG_MPEG4_V4L2M2M_ENCODER 0
-#define CONFIG_VP8_V4L2M2M_ENCODER 0
-#define CONFIG_VP8_VAAPI_ENCODER 0
-#define CONFIG_VP9_VAAPI_ENCODER 0
-#define CONFIG_ABENCH_FILTER 0
-#define CONFIG_ACOMPRESSOR_FILTER 0
-#define CONFIG_ACOPY_FILTER 0
-#define CONFIG_ACROSSFADE_FILTER 0
-#define CONFIG_ACRUSHER_FILTER 0
-#define CONFIG_ADELAY_FILTER 0
#define CONFIG_AECHO_FILTER 0
#define CONFIG_AEMPHASIS_FILTER 0
+#define CONFIG_AEVALSRC_FILTER 0
#define CONFIG_AEVAL_FILTER 0
#define CONFIG_AFADE_FILTER 0
+#define CONFIG_AFC_DEMUXER 0
#define CONFIG_AFFTFILT_FILTER 0
+#define CONFIG_AFIFO_FILTER 0
#define CONFIG_AFIR_FILTER 0
#define CONFIG_AFORMAT_FILTER 0
#define CONFIG_AGATE_FILTER 0
+#define CONFIG_AHISTOGRAM_FILTER 0
+#define CONFIG_AIC_DECODER 0
+#define CONFIG_AIFF_DEMUXER 0
+#define CONFIG_AIFF_MUXER 0
+#define CONFIG_AIIR_FILTER 0
#define CONFIG_AINTERLEAVE_FILTER 0
+#define CONFIG_AIX_DEMUXER 0
+#define CONFIG_ALAC_AT_DECODER 0
+#define CONFIG_ALAC_AT_ENCODER 0
+#define CONFIG_ALAC_DECODER 0
+#define CONFIG_ALAC_ENCODER 0
+#define CONFIG_ALIAS_PIX_DECODER 0
+#define CONFIG_ALIAS_PIX_ENCODER 0
#define CONFIG_ALIMITER_FILTER 0
#define CONFIG_ALLPASS_FILTER 0
+#define CONFIG_ALLRGB_FILTER 0
+#define CONFIG_ALLYUV_FILTER 0
#define CONFIG_ALOOP_FILTER 0
+#define CONFIG_ALPHAEXTRACT_FILTER 0
+#define CONFIG_ALPHAMERGE_FILTER 0
+#define CONFIG_ALSA_INDEV 0
+#define CONFIG_ALSA_OUTDEV 0
+#define CONFIG_ALS_DECODER 0
#define CONFIG_AMERGE_FILTER 0
#define CONFIG_AMETADATA_FILTER 0
#define CONFIG_AMIX_FILTER 0
+#define CONFIG_AMOVIE_FILTER 0
+#define CONFIG_AMRNB_DECODER 0
+#define CONFIG_AMRNB_DEMUXER 0
+#define CONFIG_AMRWB_DECODER 0
+#define CONFIG_AMRWB_DEMUXER 0
+#define CONFIG_AMR_DEMUXER 0
+#define CONFIG_AMR_MUXER 0
+#define CONFIG_AMR_NB_AT_DECODER 0
+#define CONFIG_AMV_DECODER 0
+#define CONFIG_AMV_ENCODER 0
+#define CONFIG_ANDROID_CAMERA_INDEV 0
#define CONFIG_ANEQUALIZER_FILTER 0
+#define CONFIG_ANM_DECODER 0
+#define CONFIG_ANM_DEMUXER 0
+#define CONFIG_ANOISESRC_FILTER 0
+#define CONFIG_ANSI_DECODER 0
+#define CONFIG_ANULLSINK_FILTER 0
+#define CONFIG_ANULLSRC_FILTER 0
#define CONFIG_ANULL_FILTER 0
#define CONFIG_APAD_FILTER 0
+#define CONFIG_APC_DEMUXER 0
#define CONFIG_APERMS_FILTER 0
+#define CONFIG_APE_DECODER 0
+#define CONFIG_APE_DEMUXER 0
+#define CONFIG_APHASEMETER_FILTER 0
#define CONFIG_APHASER_FILTER 0
+#define CONFIG_APNG_DECODER 0
+#define CONFIG_APNG_DEMUXER 0
+#define CONFIG_APNG_ENCODER 0
+#define CONFIG_APNG_MUXER 0
+#define CONFIG_APTX_DECODER 0
+#define CONFIG_APTX_DEMUXER 0
+#define CONFIG_APTX_ENCODER 0
+#define CONFIG_APTX_HD_DECODER 0
+#define CONFIG_APTX_HD_DEMUXER 0
+#define CONFIG_APTX_HD_ENCODER 0
+#define CONFIG_APTX_HD_MUXER 0
+#define CONFIG_APTX_MUXER 0
#define CONFIG_APULSATOR_FILTER 0
+#define CONFIG_AQTITLE_DEMUXER 0
#define CONFIG_AREALTIME_FILTER 0
#define CONFIG_ARESAMPLE_FILTER 0
#define CONFIG_AREVERSE_FILTER 0
@@ -1026,81 +177,131 @@
#define CONFIG_ASETPTS_FILTER 0
#define CONFIG_ASETRATE_FILTER 0
#define CONFIG_ASETTB_FILTER 0
+#define CONFIG_ASF_DEMUXER 0
+#define CONFIG_ASF_MUXER 0
+#define CONFIG_ASF_O_DEMUXER 0
+#define CONFIG_ASF_STREAM_MUXER 0
#define CONFIG_ASHOWINFO_FILTER 0
#define CONFIG_ASIDEDATA_FILTER 0
#define CONFIG_ASPLIT_FILTER 0
+#define CONFIG_ASS_DECODER 0
+#define CONFIG_ASS_DEMUXER 0
+#define CONFIG_ASS_ENCODER 0
+#define CONFIG_ASS_FILTER 0
+#define CONFIG_ASS_MUXER 0
#define CONFIG_ASTATS_FILTER 0
#define CONFIG_ASTREAMSELECT_FILTER 0
+#define CONFIG_AST_DEMUXER 0
+#define CONFIG_AST_MUXER 0
+#define CONFIG_ASV1_DECODER 0
+#define CONFIG_ASV1_ENCODER 0
+#define CONFIG_ASV2_DECODER 0
+#define CONFIG_ASV2_ENCODER 0
+#define CONFIG_ASYNC_PROTOCOL 0
+#define CONFIG_ATADENOISE_FILTER 0
#define CONFIG_ATEMPO_FILTER 0
+#define CONFIG_ATRAC1_DECODER 0
+#define CONFIG_ATRAC3AL_DECODER 0
+#define CONFIG_ATRAC3PAL_DECODER 0
+#define CONFIG_ATRAC3P_DECODER 0
+#define CONFIG_ATRAC3_DECODER 0
#define CONFIG_ATRIM_FILTER 0
+#define CONFIG_AURA2_DECODER 0
+#define CONFIG_AURA_DECODER 0
+#define CONFIG_AU_DEMUXER 0
+#define CONFIG_AU_MUXER 0
+#define CONFIG_AVECTORSCOPE_FILTER 0
+#define CONFIG_AVFOUNDATION_INDEV 0
+#define CONFIG_AVGBLUR_FILTER 0
+#define CONFIG_AVGBLUR_OPENCL_FILTER 0
+#define CONFIG_AVISYNTH_DEMUXER 0
+#define CONFIG_AVI_DEMUXER 0
+#define CONFIG_AVI_MUXER 0
+#define CONFIG_AVM2_MUXER 0
+#define CONFIG_AVRN_DECODER 0
+#define CONFIG_AVRP_DECODER 0
+#define CONFIG_AVRP_ENCODER 0
+#define CONFIG_AVR_DEMUXER 0
+#define CONFIG_AVS_DECODER 0
+#define CONFIG_AVS_DEMUXER 0
+#define CONFIG_AVUI_DECODER 0
+#define CONFIG_AVUI_ENCODER 0
+#define CONFIG_AYUV_DECODER 0
+#define CONFIG_AYUV_ENCODER 0
#define CONFIG_AZMQ_FILTER 0
#define CONFIG_BANDPASS_FILTER 0
#define CONFIG_BANDREJECT_FILTER 0
#define CONFIG_BASS_FILTER 0
-#define CONFIG_BIQUAD_FILTER 0
-#define CONFIG_BS2B_FILTER 0
-#define CONFIG_CHANNELMAP_FILTER 0
-#define CONFIG_CHANNELSPLIT_FILTER 0
-#define CONFIG_CHORUS_FILTER 0
-#define CONFIG_COMPAND_FILTER 0
-#define CONFIG_COMPENSATIONDELAY_FILTER 0
-#define CONFIG_CROSSFEED_FILTER 0
-#define CONFIG_CRYSTALIZER_FILTER 0
-#define CONFIG_DCSHIFT_FILTER 0
-#define CONFIG_DYNAUDNORM_FILTER 0
-#define CONFIG_EARWAX_FILTER 0
-#define CONFIG_EBUR128_FILTER 0
-#define CONFIG_EQUALIZER_FILTER 0
-#define CONFIG_EXTRASTEREO_FILTER 0
-#define CONFIG_FIREQUALIZER_FILTER 0
-#define CONFIG_FLANGER_FILTER 0
-#define CONFIG_HAAS_FILTER 0
-#define CONFIG_HDCD_FILTER 0
-#define CONFIG_HEADPHONE_FILTER 0
-#define CONFIG_HIGHPASS_FILTER 0
-#define CONFIG_JOIN_FILTER 0
-#define CONFIG_LADSPA_FILTER 0
-#define CONFIG_LOUDNORM_FILTER 0
-#define CONFIG_LOWPASS_FILTER 0
-#define CONFIG_PAN_FILTER 0
-#define CONFIG_REPLAYGAIN_FILTER 0
-#define CONFIG_RESAMPLE_FILTER 0
-#define CONFIG_RUBBERBAND_FILTER 0
-#define CONFIG_SIDECHAINCOMPRESS_FILTER 0
-#define CONFIG_SIDECHAINGATE_FILTER 0
-#define CONFIG_SILENCEDETECT_FILTER 0
-#define CONFIG_SILENCEREMOVE_FILTER 0
-#define CONFIG_SOFALIZER_FILTER 0
-#define CONFIG_STEREOTOOLS_FILTER 0
-#define CONFIG_STEREOWIDEN_FILTER 0
-#define CONFIG_SUPEREQUALIZER_FILTER 0
-#define CONFIG_SURROUND_FILTER 0
-#define CONFIG_TREBLE_FILTER 0
-#define CONFIG_TREMOLO_FILTER 0
-#define CONFIG_VIBRATO_FILTER 0
-#define CONFIG_VOLUME_FILTER 0
-#define CONFIG_VOLUMEDETECT_FILTER 0
-#define CONFIG_AEVALSRC_FILTER 0
-#define CONFIG_ANOISESRC_FILTER 0
-#define CONFIG_ANULLSRC_FILTER 0
-#define CONFIG_FLITE_FILTER 0
-#define CONFIG_SINE_FILTER 0
-#define CONFIG_ANULLSINK_FILTER 0
-#define CONFIG_ALPHAEXTRACT_FILTER 0
-#define CONFIG_ALPHAMERGE_FILTER 0
-#define CONFIG_ASS_FILTER 0
-#define CONFIG_ATADENOISE_FILTER 0
-#define CONFIG_AVGBLUR_FILTER 0
#define CONFIG_BBOX_FILTER 0
#define CONFIG_BENCH_FILTER 0
+#define CONFIG_BETHSOFTVID_DECODER 0
+#define CONFIG_BETHSOFTVID_DEMUXER 0
+#define CONFIG_BFI_DECODER 0
+#define CONFIG_BFI_DEMUXER 0
+#define CONFIG_BFSTM_DEMUXER 0
+#define CONFIG_BINKAUDIO_DCT_DECODER 0
+#define CONFIG_BINKAUDIO_RDFT_DECODER 0
+#define CONFIG_BINK_DECODER 0
+#define CONFIG_BINK_DEMUXER 0
+#define CONFIG_BINTEXT_DECODER 0
+#define CONFIG_BINTEXT_DEMUXER 0
+#define CONFIG_BIQUAD_FILTER 0
+#define CONFIG_BITPACKED_DECODER 0
#define CONFIG_BITPLANENOISE_FILTER 0
+#define CONFIG_BIT_DEMUXER 0
+#define CONFIG_BIT_MUXER 0
+#define CONFIG_BKTR_INDEV 0
#define CONFIG_BLACKDETECT_FILTER 0
#define CONFIG_BLACKFRAME_FILTER 0
#define CONFIG_BLEND_FILTER 0
+#define CONFIG_BLURAY_PROTOCOL 0
+#define CONFIG_BMP_DECODER 0
+#define CONFIG_BMP_ENCODER 0
+#define CONFIG_BMP_PARSER 0
+#define CONFIG_BMV_AUDIO_DECODER 0
+#define CONFIG_BMV_DEMUXER 0
+#define CONFIG_BMV_VIDEO_DECODER 0
+#define CONFIG_BOA_DEMUXER 0
#define CONFIG_BOXBLUR_FILTER 0
+#define CONFIG_BRENDER_PIX_DECODER 0
+#define CONFIG_BRSTM_DEMUXER 0
+#define CONFIG_BS2B_FILTER 0
#define CONFIG_BWDIF_FILTER 0
+#define CONFIG_C93_DECODER 0
+#define CONFIG_C93_DEMUXER 0
+#define CONFIG_CACA_OUTDEV 0
+#define CONFIG_CACHE_PROTOCOL 0
+#define CONFIG_CAF_DEMUXER 0
+#define CONFIG_CAF_MUXER 0
+#define CONFIG_CAVSVIDEO_DEMUXER 0
+#define CONFIG_CAVSVIDEO_MUXER 0
+#define CONFIG_CAVSVIDEO_PARSER 0
+#define CONFIG_CAVS_DECODER 0
+#define CONFIG_CCAPTION_DECODER 0
+#define CONFIG_CDGRAPHICS_DECODER 0
+#define CONFIG_CDG_DEMUXER 0
+#define CONFIG_CDXL_DECODER 0
+#define CONFIG_CDXL_DEMUXER 0
+#define CONFIG_CELLAUTO_FILTER 0
+#define CONFIG_CFHD_DECODER 0
+#define CONFIG_CHANNELMAP_FILTER 0
+#define CONFIG_CHANNELSPLIT_FILTER 0
+#define CONFIG_CHOMP_BSF 0
+#define CONFIG_CHORUS_FILTER 0
#define CONFIG_CHROMAKEY_FILTER 0
+#define CONFIG_CHROMAPRINT_MUXER 0
#define CONFIG_CIESCOPE_FILTER 0
+#define CONFIG_CINEPAK_DECODER 0
+#define CONFIG_CINEPAK_ENCODER 0
+#define CONFIG_CINE_DEMUXER 0
+#define CONFIG_CLEARVIDEO_DECODER 0
+#define CONFIG_CLJR_DECODER 0
+#define CONFIG_CLJR_ENCODER 0
+#define CONFIG_CLLC_DECODER 0
+#define CONFIG_CODEC2RAW_DEMUXER 0
+#define CONFIG_CODEC2RAW_MUXER 0
+#define CONFIG_CODEC2_DEMUXER 0
+#define CONFIG_CODEC2_MUXER 0
#define CONFIG_CODECVIEW_FILTER 0
#define CONFIG_COLORBALANCE_FILTER 0
#define CONFIG_COLORCHANNELMIXER_FILTER 0
@@ -1108,567 +309,1426 @@
#define CONFIG_COLORLEVELS_FILTER 0
#define CONFIG_COLORMATRIX_FILTER 0
#define CONFIG_COLORSPACE_FILTER 0
+#define CONFIG_COLOR_FILTER 0
+#define CONFIG_COMFORTNOISE_DECODER 0
+#define CONFIG_COMFORTNOISE_ENCODER 0
+#define CONFIG_COMPAND_FILTER 0
+#define CONFIG_COMPENSATIONDELAY_FILTER 0
+#define CONFIG_CONCAT_DEMUXER 0
+#define CONFIG_CONCAT_FILTER 0
+#define CONFIG_CONCAT_PROTOCOL 0
#define CONFIG_CONVOLUTION_FILTER 0
+#define CONFIG_CONVOLUTION_OPENCL_FILTER 0
#define CONFIG_CONVOLVE_FILTER 0
+#define CONFIG_COOK_DECODER 0
+#define CONFIG_COOK_PARSER 0
#define CONFIG_COPY_FILTER 0
+#define CONFIG_COREIMAGESRC_FILTER 0
#define CONFIG_COREIMAGE_FILTER 0
#define CONFIG_COVER_RECT_FILTER 0
-#define CONFIG_CROP_FILTER 0
+#define CONFIG_CPIA_DECODER 0
+#define CONFIG_CRC_MUXER 0
#define CONFIG_CROPDETECT_FILTER 0
+#define CONFIG_CROP_FILTER 0
+#define CONFIG_CROSSFEED_FILTER 0
+#define CONFIG_CRYPTO_PROTOCOL 0
+#define CONFIG_CRYSTALIZER_FILTER 0
+#define CONFIG_CSCD_DECODER 0
#define CONFIG_CURVES_FILTER 0
+#define CONFIG_CYUV_DECODER 0
+#define CONFIG_DASH_DEMUXER 0
+#define CONFIG_DASH_MUXER 0
#define CONFIG_DATASCOPE_FILTER 0
+#define CONFIG_DATA_DEMUXER 0
+#define CONFIG_DATA_MUXER 0
+#define CONFIG_DATA_PROTOCOL 0
+#define CONFIG_DAUD_DEMUXER 0
+#define CONFIG_DAUD_MUXER 0
+#define CONFIG_DCA_CORE_BSF 0
+#define CONFIG_DCA_DECODER 0
+#define CONFIG_DCA_ENCODER 0
+#define CONFIG_DCA_PARSER 0
+#define CONFIG_DCSHIFT_FILTER 0
+#define CONFIG_DCSTR_DEMUXER 0
#define CONFIG_DCTDNOIZ_FILTER 0
+#define CONFIG_DDS_DECODER 0
#define CONFIG_DEBAND_FILTER 0
#define CONFIG_DECIMATE_FILTER 0
+#define CONFIG_DECKLINK_INDEV 0
+#define CONFIG_DECKLINK_OUTDEV 0
+#define CONFIG_DECONVOLVE_FILTER 0
#define CONFIG_DEFLATE_FILTER 0
#define CONFIG_DEFLICKER_FILTER 0
#define CONFIG_DEINTERLACE_QSV_FILTER 0
#define CONFIG_DEINTERLACE_VAAPI_FILTER 0
#define CONFIG_DEJUDDER_FILTER 0
#define CONFIG_DELOGO_FILTER 0
+#define CONFIG_DEMUXERS 0
+#define CONFIG_DENOISE_VAAPI_FILTER 0
#define CONFIG_DESHAKE_FILTER 0
#define CONFIG_DESPILL_FILTER 0
#define CONFIG_DETELECINE_FILTER 0
+#define CONFIG_DFA_DECODER 0
+#define CONFIG_DFA_DEMUXER 0
#define CONFIG_DILATION_FILTER 0
+#define CONFIG_DIRAC_DECODER 0
+#define CONFIG_DIRAC_DEMUXER 0
+#define CONFIG_DIRAC_MUXER 0
+#define CONFIG_DIRAC_PARSER 0
#define CONFIG_DISPLACE_FILTER 0
+#define CONFIG_DNXHD_DECODER 0
+#define CONFIG_DNXHD_DEMUXER 0
+#define CONFIG_DNXHD_ENCODER 0
+#define CONFIG_DNXHD_MUXER 0
+#define CONFIG_DNXHD_PARSER 0
+#define CONFIG_DOLBY_E_DECODER 0
#define CONFIG_DOUBLEWEAVE_FILTER 0
+#define CONFIG_DPX_DECODER 0
+#define CONFIG_DPX_ENCODER 0
+#define CONFIG_DPX_PARSER 0
#define CONFIG_DRAWBOX_FILTER 0
#define CONFIG_DRAWGRAPH_FILTER 0
#define CONFIG_DRAWGRID_FILTER 0
#define CONFIG_DRAWTEXT_FILTER 0
+#define CONFIG_DRMETER_FILTER 0
+#define CONFIG_DSD_LSBF_DECODER 0
+#define CONFIG_DSD_LSBF_PLANAR_DECODER 0
+#define CONFIG_DSD_MSBF_DECODER 0
+#define CONFIG_DSD_MSBF_PLANAR_DECODER 0
+#define CONFIG_DSF_DEMUXER 0
+#define CONFIG_DSHOW_INDEV 0
+#define CONFIG_DSICINAUDIO_DECODER 0
+#define CONFIG_DSICINVIDEO_DECODER 0
+#define CONFIG_DSICIN_DEMUXER 0
+#define CONFIG_DSS_DEMUXER 0
+#define CONFIG_DSS_SP_DECODER 0
+#define CONFIG_DST_DECODER 0
+#define CONFIG_DTSHD_DEMUXER 0
+#define CONFIG_DTS_DEMUXER 0
+#define CONFIG_DTS_MUXER 0
+#define CONFIG_DUMP_EXTRADATA_BSF 0
+#define CONFIG_DVAUDIO_DECODER 0
+#define CONFIG_DVAUDIO_PARSER 0
+#define CONFIG_DVBSUB_DECODER 0
+#define CONFIG_DVBSUB_DEMUXER 0
+#define CONFIG_DVBSUB_ENCODER 0
+#define CONFIG_DVBSUB_PARSER 0
+#define CONFIG_DVBTXT_DEMUXER 0
+#define CONFIG_DVDSUB_DECODER 0
+#define CONFIG_DVDSUB_ENCODER 0
+#define CONFIG_DVDSUB_PARSER 0
+#define CONFIG_DVD_NAV_PARSER 0
+#define CONFIG_DVVIDEO_DECODER 0
+#define CONFIG_DVVIDEO_ENCODER 0
+#define CONFIG_DV_DEMUXER 0
+#define CONFIG_DV_MUXER 0
+#define CONFIG_DXA_DECODER 0
+#define CONFIG_DXA_DEMUXER 0
+#define CONFIG_DXTORY_DECODER 0
+#define CONFIG_DXV_DECODER 0
+#define CONFIG_DYNAUDNORM_FILTER 0
+#define CONFIG_EAC3_AT_DECODER 0
+#define CONFIG_EAC3_CORE_BSF 0
+#define CONFIG_EAC3_DECODER 0
+#define CONFIG_EAC3_DEMUXER 0
+#define CONFIG_EAC3_ENCODER 0
+#define CONFIG_EAC3_MUXER 0
+#define CONFIG_EACMV_DECODER 0
+#define CONFIG_EAMAD_DECODER 0
+#define CONFIG_EARWAX_FILTER 0
+#define CONFIG_EATGQ_DECODER 0
+#define CONFIG_EATGV_DECODER 0
+#define CONFIG_EATQI_DECODER 0
+#define CONFIG_EA_CDATA_DEMUXER 0
+#define CONFIG_EA_DEMUXER 0
+#define CONFIG_EBUR128_FILTER 0
#define CONFIG_EDGEDETECT_FILTER 0
+#define CONFIG_EIGHTBPS_DECODER 0
+#define CONFIG_EIGHTSVX_EXP_DECODER 0
+#define CONFIG_EIGHTSVX_FIB_DECODER 0
#define CONFIG_ELBG_FILTER 0
+#define CONFIG_ENCODERS 0
+#define CONFIG_ENTROPY_FILTER 0
+#define CONFIG_EPAF_DEMUXER 0
+#define CONFIG_EQUALIZER_FILTER 0
#define CONFIG_EQ_FILTER 0
#define CONFIG_EROSION_FILTER 0
+#define CONFIG_ESCAPE124_DECODER 0
+#define CONFIG_ESCAPE130_DECODER 0
+#define CONFIG_EVRC_DECODER 0
+#define CONFIG_EXR_DECODER 0
#define CONFIG_EXTRACTPLANES_FILTER 0
+#define CONFIG_EXTRACT_EXTRADATA_BSF 0
+#define CONFIG_EXTRASTEREO_FILTER 0
+#define CONFIG_F4V_MUXER 0
#define CONFIG_FADE_FILTER 0
+#define CONFIG_FBDEV_INDEV 0
+#define CONFIG_FBDEV_OUTDEV 0
+#define CONFIG_FFMETADATA_DEMUXER 0
+#define CONFIG_FFMETADATA_MUXER 0
+#define CONFIG_FFRTMPCRYPT_PROTOCOL 0
+#define CONFIG_FFRTMPHTTP_PROTOCOL 0
#define CONFIG_FFTFILT_FILTER 0
-#define CONFIG_FIELD_FILTER 0
+#define CONFIG_FFV1_DECODER 0
+#define CONFIG_FFV1_ENCODER 0
+#define CONFIG_FFVHUFF_DECODER 0
+#define CONFIG_FFVHUFF_ENCODER 0
+#define CONFIG_FFWAVESYNTH_DECODER 0
+#define CONFIG_FIC_DECODER 0
#define CONFIG_FIELDHINT_FILTER 0
#define CONFIG_FIELDMATCH_FILTER 0
#define CONFIG_FIELDORDER_FILTER 0
+#define CONFIG_FIELD_FILTER 0
+#define CONFIG_FIFO_FILTER 0
+#define CONFIG_FIFO_MUXER 0
+#define CONFIG_FIFO_TEST_MUXER 0
+#define CONFIG_FILE_PROTOCOL 0
+#define CONFIG_FILLBORDERS_FILTER 0
+#define CONFIG_FILMSTRIP_DEMUXER 0
+#define CONFIG_FILMSTRIP_MUXER 0
+#define CONFIG_FILTERS 0
+#define CONFIG_FILTER_UNITS_BSF 0
#define CONFIG_FIND_RECT_FILTER 0
+#define CONFIG_FIREQUALIZER_FILTER 0
+#define CONFIG_FITS_DECODER 0
+#define CONFIG_FITS_DEMUXER 0
+#define CONFIG_FITS_ENCODER 0
+#define CONFIG_FITS_MUXER 0
+#define CONFIG_FLAC_DEMUXER 0
+#define CONFIG_FLAC_ENCODER 0
+#define CONFIG_FLAC_MUXER 0
+#define CONFIG_FLAC_PARSER 0
+#define CONFIG_FLANGER_FILTER 0
+#define CONFIG_FLASHSV2_DECODER 0
+#define CONFIG_FLASHSV2_ENCODER 0
+#define CONFIG_FLASHSV_DECODER 0
+#define CONFIG_FLASHSV_ENCODER 0
+#define CONFIG_FLIC_DECODER 0
+#define CONFIG_FLIC_DEMUXER 0
+#define CONFIG_FLITE_FILTER 0
#define CONFIG_FLOODFILL_FILTER 0
+#define CONFIG_FLV_DECODER 0
+#define CONFIG_FLV_DEMUXER 0
+#define CONFIG_FLV_ENCODER 0
+#define CONFIG_FLV_MUXER 0
+#define CONFIG_FMVC_DECODER 0
#define CONFIG_FORMAT_FILTER 0
+#define CONFIG_FOURXM_DECODER 0
+#define CONFIG_FOURXM_DEMUXER 0
#define CONFIG_FPS_FILTER 0
+#define CONFIG_FRAMECRC_MUXER 0
+#define CONFIG_FRAMEHASH_MUXER 0
+#define CONFIG_FRAMEMD5_MUXER 0
#define CONFIG_FRAMEPACK_FILTER 0
#define CONFIG_FRAMERATE_FILTER 0
#define CONFIG_FRAMESTEP_FILTER 0
+#define CONFIG_FRAME_THREAD_ENCODER 0
+#define CONFIG_FRAPS_DECODER 0
#define CONFIG_FREI0R_FILTER 0
+#define CONFIG_FREI0R_SRC_FILTER 0
+#define CONFIG_FRM_DEMUXER 0
+#define CONFIG_FRWU_DECODER 0
+#define CONFIG_FSB_DEMUXER 0
#define CONFIG_FSPP_FILTER 0
+#define CONFIG_FTP_PROTOCOL 0
+#define CONFIG_G2M_DECODER 0
+#define CONFIG_G722_DEMUXER 0
+#define CONFIG_G722_MUXER 0
+#define CONFIG_G723_1_DECODER 0
+#define CONFIG_G723_1_DEMUXER 0
+#define CONFIG_G723_1_ENCODER 0
+#define CONFIG_G723_1_MUXER 0
+#define CONFIG_G726LE_DEMUXER 0
+#define CONFIG_G726LE_MUXER 0
+#define CONFIG_G726_DEMUXER 0
+#define CONFIG_G726_MUXER 0
+#define CONFIG_G729_DECODER 0
+#define CONFIG_G729_DEMUXER 0
+#define CONFIG_G729_PARSER 0
#define CONFIG_GBLUR_FILTER 0
+#define CONFIG_GDIGRAB_INDEV 0
+#define CONFIG_GDV_DECODER 0
+#define CONFIG_GDV_DEMUXER 0
+#define CONFIG_GENH_DEMUXER 0
#define CONFIG_GEQ_FILTER 0
+#define CONFIG_GIF_DECODER 0
+#define CONFIG_GIF_DEMUXER 0
+#define CONFIG_GIF_ENCODER 0
+#define CONFIG_GIF_MUXER 0
+#define CONFIG_GOPHER_PROTOCOL 0
#define CONFIG_GRADFUN_FILTER 0
+#define CONFIG_GREMLIN_DPCM_DECODER 0
+#define CONFIG_GSM_DECODER 0
+#define CONFIG_GSM_DEMUXER 0
+#define CONFIG_GSM_MS_AT_DECODER 0
+#define CONFIG_GSM_MS_DECODER 0
+#define CONFIG_GSM_MUXER 0
+#define CONFIG_GSM_PARSER 0
+#define CONFIG_GXF_DEMUXER 0
+#define CONFIG_GXF_MUXER 0
+#define CONFIG_H261_DECODER 0
+#define CONFIG_H261_DEMUXER 0
+#define CONFIG_H261_ENCODER 0
+#define CONFIG_H261_MUXER 0
+#define CONFIG_H261_PARSER 0
+#define CONFIG_H263I_DECODER 0
+#define CONFIG_H263P_DECODER 0
+#define CONFIG_H263P_ENCODER 0
+#define CONFIG_H263_DECODER 0
+#define CONFIG_H263_DEMUXER 0
+#define CONFIG_H263_ENCODER 0
+#define CONFIG_H263_MUXER 0
+#define CONFIG_H263_PARSER 0
+#define CONFIG_H263_V4L2M2M_DECODER 0
+#define CONFIG_H263_V4L2M2M_ENCODER 0
+#define CONFIG_H263_VAAPI_HWACCEL 0
+#define CONFIG_H263_VIDEOTOOLBOX_HWACCEL 0
+#define CONFIG_H264_AMF_ENCODER 0
+#define CONFIG_H264_CRYSTALHD_DECODER 0
+#define CONFIG_H264_CUVID_DECODER 0
+#define CONFIG_H264_D3D11VA2_HWACCEL 0
+#define CONFIG_H264_D3D11VA_HWACCEL 0
+#define CONFIG_H264_DECODER 0
+#define CONFIG_H264_DEMUXER 0
+#define CONFIG_H264_DXVA2_HWACCEL 0
+#define CONFIG_H264_MEDIACODEC_DECODER 0
+#define CONFIG_H264_METADATA_BSF 0
+#define CONFIG_H264_MMAL_DECODER 0
+#define CONFIG_H264_MP4TOANNEXB_BSF 0
+#define CONFIG_H264_MUXER 0
+#define CONFIG_H264_NVDEC_HWACCEL 0
+#define CONFIG_H264_NVENC_ENCODER 0
+#define CONFIG_H264_OMX_ENCODER 0
+#define CONFIG_H264_PARSER 0
+#define CONFIG_H264_QSV_DECODER 0
+#define CONFIG_H264_QSV_ENCODER 0
+#define CONFIG_H264_REDUNDANT_PPS_BSF 0
+#define CONFIG_H264_RKMPP_DECODER 0
+#define CONFIG_H264_V4L2M2M_DECODER 0
+#define CONFIG_H264_V4L2M2M_ENCODER 0
+#define CONFIG_H264_VAAPI_ENCODER 0
+#define CONFIG_H264_VAAPI_HWACCEL 0
+#define CONFIG_H264_VDPAU_HWACCEL 0
+#define CONFIG_H264_VIDEOTOOLBOX_ENCODER 0
+#define CONFIG_H264_VIDEOTOOLBOX_HWACCEL 0
+#define CONFIG_HAAS_FILTER 0
+#define CONFIG_HALDCLUTSRC_FILTER 0
#define CONFIG_HALDCLUT_FILTER 0
+#define CONFIG_HAPQA_EXTRACT_BSF 0
+#define CONFIG_HAP_DECODER 0
+#define CONFIG_HAP_ENCODER 0
+#define CONFIG_HASH_MUXER 0
+#define CONFIG_HDCD_FILTER 0
+#define CONFIG_HDS_MUXER 0
+#define CONFIG_HEADPHONE_FILTER 0
+#define CONFIG_HEVC_AMF_ENCODER 0
+#define CONFIG_HEVC_CUVID_DECODER 0
+#define CONFIG_HEVC_D3D11VA2_HWACCEL 0
+#define CONFIG_HEVC_D3D11VA_HWACCEL 0
+#define CONFIG_HEVC_DECODER 0
+#define CONFIG_HEVC_DEMUXER 0
+#define CONFIG_HEVC_DXVA2_HWACCEL 0
+#define CONFIG_HEVC_MEDIACODEC_DECODER 0
+#define CONFIG_HEVC_METADATA_BSF 0
+#define CONFIG_HEVC_MP4TOANNEXB_BSF 0
+#define CONFIG_HEVC_MUXER 0
+#define CONFIG_HEVC_NVDEC_HWACCEL 0
+#define CONFIG_HEVC_NVENC_ENCODER 0
+#define CONFIG_HEVC_PARSER 0
+#define CONFIG_HEVC_QSV_DECODER 0
+#define CONFIG_HEVC_QSV_ENCODER 0
+#define CONFIG_HEVC_RKMPP_DECODER 0
+#define CONFIG_HEVC_V4L2M2M_DECODER 0
+#define CONFIG_HEVC_V4L2M2M_ENCODER 0
+#define CONFIG_HEVC_VAAPI_ENCODER 0
+#define CONFIG_HEVC_VAAPI_HWACCEL 0
+#define CONFIG_HEVC_VDPAU_HWACCEL 0
+#define CONFIG_HEVC_VIDEOTOOLBOX_ENCODER 0
+#define CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL 0
#define CONFIG_HFLIP_FILTER 0
+#define CONFIG_HIGHPASS_FILTER 0
+#define CONFIG_HILBERT_FILTER 0
#define CONFIG_HISTEQ_FILTER 0
#define CONFIG_HISTOGRAM_FILTER 0
+#define CONFIG_HLS_DEMUXER 0
+#define CONFIG_HLS_MUXER 0
+#define CONFIG_HLS_PROTOCOL 0
+#define CONFIG_HNM4_VIDEO_DECODER 0
+#define CONFIG_HNM_DEMUXER 0
#define CONFIG_HQDN3D_FILTER 0
+#define CONFIG_HQX_DECODER 0
#define CONFIG_HQX_FILTER 0
+#define CONFIG_HQ_HQA_DECODER 0
#define CONFIG_HSTACK_FILTER 0
+#define CONFIG_HTTPPROXY_PROTOCOL 0
+#define CONFIG_HTTPS_PROTOCOL 0
+#define CONFIG_HTTP_PROTOCOL 0
#define CONFIG_HUE_FILTER 0
+#define CONFIG_HUFFYUV_DECODER 0
+#define CONFIG_HUFFYUV_ENCODER 0
+#define CONFIG_HWACCELS 0
#define CONFIG_HWDOWNLOAD_FILTER 0
#define CONFIG_HWMAP_FILTER 0
-#define CONFIG_HWUPLOAD_FILTER 0
#define CONFIG_HWUPLOAD_CUDA_FILTER 0
+#define CONFIG_HWUPLOAD_FILTER 0
#define CONFIG_HYSTERESIS_FILTER 0
+#define CONFIG_IAC_DECODER 0
+#define CONFIG_ICECAST_PROTOCOL 0
+#define CONFIG_ICO_DEMUXER 0
+#define CONFIG_ICO_MUXER 0
+#define CONFIG_IDCIN_DECODER 0
+#define CONFIG_IDCIN_DEMUXER 0
#define CONFIG_IDET_FILTER 0
+#define CONFIG_IDF_DECODER 0
+#define CONFIG_IDF_DEMUXER 0
+#define CONFIG_IEC61883_INDEV 0
+#define CONFIG_IFF_DEMUXER 0
+#define CONFIG_IFF_ILBM_DECODER 0
+#define CONFIG_ILBC_AT_DECODER 0
+#define CONFIG_ILBC_AT_ENCODER 0
+#define CONFIG_ILBC_DEMUXER 0
+#define CONFIG_ILBC_MUXER 0
#define CONFIG_IL_FILTER 0
+#define CONFIG_IMAGE2PIPE_DEMUXER 0
+#define CONFIG_IMAGE2PIPE_MUXER 0
+#define CONFIG_IMAGE2_ALIAS_PIX_DEMUXER 0
+#define CONFIG_IMAGE2_BRENDER_PIX_DEMUXER 0
+#define CONFIG_IMAGE2_DEMUXER 0
+#define CONFIG_IMAGE2_MUXER 0
+#define CONFIG_IMAGE_BMP_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_DDS_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_DPX_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_EXR_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_J2K_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_JPEGLS_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_JPEG_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PAM_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PBM_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PCX_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PGMYUV_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PGM_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PICTOR_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PNG_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PPM_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_PSD_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_QDRAW_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_SGI_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_SUNRAST_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_SVG_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_TIFF_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_WEBP_PIPE_DEMUXER 0
+#define CONFIG_IMAGE_XPM_PIPE_DEMUXER 0
+#define CONFIG_IMC_DECODER 0
+#define CONFIG_IMX_DUMP_HEADER_BSF 0
+#define CONFIG_INDEO2_DECODER 0
+#define CONFIG_INDEO3_DECODER 0
+#define CONFIG_INDEO4_DECODER 0
+#define CONFIG_INDEO5_DECODER 0
+#define CONFIG_INDEVS 0
#define CONFIG_INFLATE_FILTER 0
+#define CONFIG_INGENIENT_DEMUXER 0
#define CONFIG_INTERLACE_FILTER 0
#define CONFIG_INTERLEAVE_FILTER 0
+#define CONFIG_INTERPLAY_ACM_DECODER 0
+#define CONFIG_INTERPLAY_DPCM_DECODER 0
+#define CONFIG_INTERPLAY_VIDEO_DECODER 0
+#define CONFIG_IPMOVIE_DEMUXER 0
+#define CONFIG_IPOD_MUXER 0
+#define CONFIG_IRCAM_DEMUXER 0
+#define CONFIG_IRCAM_MUXER 0
+#define CONFIG_ISMV_MUXER 0
+#define CONFIG_ISS_DEMUXER 0
+#define CONFIG_IV8_DEMUXER 0
+#define CONFIG_IVF_DEMUXER 0
+#define CONFIG_IVF_MUXER 0
+#define CONFIG_IVR_DEMUXER 0
+#define CONFIG_JACK_INDEV 0
+#define CONFIG_JACOSUB_DECODER 0
+#define CONFIG_JACOSUB_DEMUXER 0
+#define CONFIG_JACOSUB_MUXER 0
+#define CONFIG_JOIN_FILTER 0
+#define CONFIG_JPEG2000_DECODER 0
+#define CONFIG_JPEG2000_ENCODER 0
+#define CONFIG_JPEGLS_DECODER 0
+#define CONFIG_JPEGLS_ENCODER 0
+#define CONFIG_JV_DECODER 0
+#define CONFIG_JV_DEMUXER 0
#define CONFIG_KERNDEINT_FILTER 0
+#define CONFIG_KGV1_DECODER 0
+#define CONFIG_KMSGRAB_INDEV 0
+#define CONFIG_KMVC_DECODER 0
+#define CONFIG_LADSPA_FILTER 0
+#define CONFIG_LAGARITH_DECODER 0
+#define CONFIG_LATM_MUXER 0
+#define CONFIG_LAVFI_INDEV 0
#define CONFIG_LENSCORRECTION_FILTER 0
+#define CONFIG_LIBAOM_AV1_DECODER 0
+#define CONFIG_LIBAOM_AV1_ENCODER 0
+#define CONFIG_LIBCDIO_INDEV 0
+#define CONFIG_LIBCELT_DECODER 0
+#define CONFIG_LIBCODEC2_DECODER 0
+#define CONFIG_LIBCODEC2_ENCODER 0
+#define CONFIG_LIBDC1394_INDEV 0
+#define CONFIG_LIBFDK_AAC_DECODER 0
+#define CONFIG_LIBFDK_AAC_ENCODER 0
+#define CONFIG_LIBGME_DEMUXER 0
+#define CONFIG_LIBGSM_DECODER 0
+#define CONFIG_LIBGSM_ENCODER 0
+#define CONFIG_LIBGSM_MS_DECODER 0
+#define CONFIG_LIBGSM_MS_ENCODER 0
+#define CONFIG_LIBILBC_DECODER 0
+#define CONFIG_LIBILBC_ENCODER 0
+#define CONFIG_LIBKVAZAAR_ENCODER 0
+#define CONFIG_LIBMODPLUG_DEMUXER 0
+#define CONFIG_LIBMP3LAME_ENCODER 0
+#define CONFIG_LIBNDI_NEWTEK_INDEV 0
+#define CONFIG_LIBNDI_NEWTEK_OUTDEV 0
+#define CONFIG_LIBOPENCORE_AMRNB_DECODER 0
+#define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
+#define CONFIG_LIBOPENCORE_AMRWB_DECODER 0
+#define CONFIG_LIBOPENH264_DECODER 0
+#define CONFIG_LIBOPENH264_ENCODER 0
+#define CONFIG_LIBOPENJPEG_DECODER 0
+#define CONFIG_LIBOPENJPEG_ENCODER 0
+#define CONFIG_LIBOPENMPT_DEMUXER 0
+#define CONFIG_LIBOPUS_DECODER 0
+#define CONFIG_LIBOPUS_ENCODER 0
+#define CONFIG_LIBRSVG_DECODER 0
+#define CONFIG_LIBRTMPE_PROTOCOL 0
+#define CONFIG_LIBRTMPS_PROTOCOL 0
+#define CONFIG_LIBRTMPTE_PROTOCOL 0
+#define CONFIG_LIBRTMPT_PROTOCOL 0
+#define CONFIG_LIBRTMP_PROTOCOL 0
+#define CONFIG_LIBSHINE_ENCODER 0
+#define CONFIG_LIBSMBCLIENT_PROTOCOL 0
+#define CONFIG_LIBSPEEX_DECODER 0
+#define CONFIG_LIBSPEEX_ENCODER 0
+#define CONFIG_LIBSRT_PROTOCOL 0
+#define CONFIG_LIBSSH_PROTOCOL 0
+#define CONFIG_LIBTHEORA_ENCODER 0
+#define CONFIG_LIBTWOLAME_ENCODER 0
#define CONFIG_LIBVMAF_FILTER 0
+#define CONFIG_LIBVORBIS_DECODER 0
+#define CONFIG_LIBVORBIS_ENCODER 0
+#define CONFIG_LIBVO_AMRWBENC_ENCODER 0
+#define CONFIG_LIBVPX_VP8_DECODER 0
+#define CONFIG_LIBVPX_VP8_ENCODER 0
+#define CONFIG_LIBVPX_VP9_DECODER 0
+#define CONFIG_LIBVPX_VP9_ENCODER 0
+#define CONFIG_LIBWAVPACK_ENCODER 0
+#define CONFIG_LIBWEBP_ANIM_ENCODER 0
+#define CONFIG_LIBWEBP_ENCODER 0
+#define CONFIG_LIBX262_ENCODER 0
+#define CONFIG_LIBX264RGB_ENCODER 0
+#define CONFIG_LIBX264_ENCODER 0
+#define CONFIG_LIBX265_ENCODER 0
+#define CONFIG_LIBXAVS_ENCODER 0
+#define CONFIG_LIBXVID_ENCODER 0
+#define CONFIG_LIBZVBI_TELETEXT_DECODER 0
+#define CONFIG_LIFE_FILTER 0
#define CONFIG_LIMITER_FILTER 0
+#define CONFIG_LIVE_FLV_DEMUXER 0
+#define CONFIG_LJPEG_ENCODER 0
+#define CONFIG_LMLM4_DEMUXER 0
+#define CONFIG_LOAS_DEMUXER 0
+#define CONFIG_LOCO_DECODER 0
#define CONFIG_LOOP_FILTER 0
+#define CONFIG_LOUDNORM_FILTER 0
+#define CONFIG_LOWPASS_FILTER 0
+#define CONFIG_LRC_DEMUXER 0
+#define CONFIG_LRC_MUXER 0
#define CONFIG_LUMAKEY_FILTER 0
-#define CONFIG_LUT_FILTER 0
#define CONFIG_LUT2_FILTER 0
#define CONFIG_LUT3D_FILTER 0
#define CONFIG_LUTRGB_FILTER 0
#define CONFIG_LUTYUV_FILTER 0
+#define CONFIG_LUT_FILTER 0
+#define CONFIG_LV2_FILTER 0
+#define CONFIG_LVF_DEMUXER 0
+#define CONFIG_LXF_DEMUXER 0
+#define CONFIG_M101_DECODER 0
+#define CONFIG_M4V_DEMUXER 0
+#define CONFIG_M4V_MUXER 0
+#define CONFIG_MACE3_DECODER 0
+#define CONFIG_MACE6_DECODER 0
+#define CONFIG_MAGICYUV_DECODER 0
+#define CONFIG_MAGICYUV_ENCODER 0
+#define CONFIG_MANDELBROT_FILTER 0
#define CONFIG_MASKEDCLAMP_FILTER 0
#define CONFIG_MASKEDMERGE_FILTER 0
+#define CONFIG_MATROSKA_AUDIO_MUXER 0
+#define CONFIG_MATROSKA_DEMUXER 0
+#define CONFIG_MATROSKA_MUXER 0
#define CONFIG_MCDEINT_FILTER 0
+#define CONFIG_MCOMPAND_FILTER 0
+#define CONFIG_MD5_MUXER 0
+#define CONFIG_MD5_PROTOCOL 0
+#define CONFIG_MDEC_DECODER 0
#define CONFIG_MERGEPLANES_FILTER 0
#define CONFIG_MESTIMATE_FILTER 0
#define CONFIG_METADATA_FILTER 0
+#define CONFIG_METASOUND_DECODER 0
+#define CONFIG_MGSTS_DEMUXER 0
+#define CONFIG_MICRODVD_DECODER 0
+#define CONFIG_MICRODVD_DEMUXER 0
+#define CONFIG_MICRODVD_MUXER 0
#define CONFIG_MIDEQUALIZER_FILTER 0
+#define CONFIG_MIMIC_DECODER 0
#define CONFIG_MINTERPOLATE_FILTER 0
+#define CONFIG_MIX_FILTER 0
+#define CONFIG_MJPEG2JPEG_BSF 0
+#define CONFIG_MJPEGA_DUMP_HEADER_BSF 0
+#define CONFIG_MJPEGB_DECODER 0
+#define CONFIG_MJPEG_2000_DEMUXER 0
+#define CONFIG_MJPEG_CUVID_DECODER 0
+#define CONFIG_MJPEG_DECODER 0
+#define CONFIG_MJPEG_DEMUXER 0
+#define CONFIG_MJPEG_ENCODER 0
+#define CONFIG_MJPEG_MUXER 0
+#define CONFIG_MJPEG_NVDEC_HWACCEL 0
+#define CONFIG_MJPEG_PARSER 0
+#define CONFIG_MJPEG_QSV_ENCODER 0
+#define CONFIG_MJPEG_VAAPI_ENCODER 0
+#define CONFIG_MJPEG_VAAPI_HWACCEL 0
+#define CONFIG_MKVTIMESTAMP_V2_MUXER 0
+#define CONFIG_MLP_DECODER 0
+#define CONFIG_MLP_DEMUXER 0
+#define CONFIG_MLP_ENCODER 0
+#define CONFIG_MLP_MUXER 0
+#define CONFIG_MLP_PARSER 0
+#define CONFIG_MLV_DEMUXER 0
+#define CONFIG_MMF_DEMUXER 0
+#define CONFIG_MMF_MUXER 0
+#define CONFIG_MMSH_PROTOCOL 0
+#define CONFIG_MMST_PROTOCOL 0
+#define CONFIG_MMVIDEO_DECODER 0
+#define CONFIG_MM_DEMUXER 0
+#define CONFIG_MOTIONPIXELS_DECODER 0
+#define CONFIG_MOV2TEXTSUB_BSF 0
+#define CONFIG_MOVIE_FILTER 0
+#define CONFIG_MOVTEXT_DECODER 0
+#define CONFIG_MOVTEXT_ENCODER 0
+#define CONFIG_MOV_DEMUXER 0
+#define CONFIG_MOV_MUXER 0
+#define CONFIG_MP1FLOAT_DECODER 0
+#define CONFIG_MP1_AT_DECODER 0
+#define CONFIG_MP1_DECODER 0
+#define CONFIG_MP2FIXED_ENCODER 0
+#define CONFIG_MP2FLOAT_DECODER 0
+#define CONFIG_MP2_AT_DECODER 0
+#define CONFIG_MP2_DECODER 0
+#define CONFIG_MP2_ENCODER 0
+#define CONFIG_MP2_MUXER 0
+#define CONFIG_MP3ADUFLOAT_DECODER 0
+#define CONFIG_MP3ADU_DECODER 0
+#define CONFIG_MP3FLOAT_DECODER 0
+#define CONFIG_MP3ON4FLOAT_DECODER 0
+#define CONFIG_MP3ON4_DECODER 0
+#define CONFIG_MP3_AT_DECODER 0
+#define CONFIG_MP3_DECODER 0
+#define CONFIG_MP3_DEMUXER 0
+#define CONFIG_MP3_HEADER_DECOMPRESS_BSF 0
+#define CONFIG_MP3_MUXER 0
+#define CONFIG_MP4_MUXER 0
+#define CONFIG_MPC7_DECODER 0
+#define CONFIG_MPC8_DECODER 0
+#define CONFIG_MPC8_DEMUXER 0
+#define CONFIG_MPC_DEMUXER 0
#define CONFIG_MPDECIMATE_FILTER 0
+#define CONFIG_MPEG1SYSTEM_MUXER 0
+#define CONFIG_MPEG1VCD_MUXER 0
+#define CONFIG_MPEG1VIDEO_DECODER 0
+#define CONFIG_MPEG1VIDEO_ENCODER 0
+#define CONFIG_MPEG1VIDEO_MUXER 0
+#define CONFIG_MPEG1_CUVID_DECODER 0
+#define CONFIG_MPEG1_NVDEC_HWACCEL 0
+#define CONFIG_MPEG1_V4L2M2M_DECODER 0
+#define CONFIG_MPEG1_VDPAU_HWACCEL 0
+#define CONFIG_MPEG1_VIDEOTOOLBOX_HWACCEL 0
+#define CONFIG_MPEG1_XVMC_HWACCEL 0
+#define CONFIG_MPEG2DVD_MUXER 0
+#define CONFIG_MPEG2SVCD_MUXER 0
+#define CONFIG_MPEG2VIDEO_DECODER 0
+#define CONFIG_MPEG2VIDEO_ENCODER 0
+#define CONFIG_MPEG2VIDEO_MUXER 0
+#define CONFIG_MPEG2VOB_MUXER 0
+#define CONFIG_MPEG2_CRYSTALHD_DECODER 0
+#define CONFIG_MPEG2_CUVID_DECODER 0
+#define CONFIG_MPEG2_D3D11VA2_HWACCEL 0
+#define CONFIG_MPEG2_D3D11VA_HWACCEL 0
+#define CONFIG_MPEG2_DXVA2_HWACCEL 0
+#define CONFIG_MPEG2_MEDIACODEC_DECODER 0
+#define CONFIG_MPEG2_METADATA_BSF 0
+#define CONFIG_MPEG2_MMAL_DECODER 0
+#define CONFIG_MPEG2_NVDEC_HWACCEL 0
+#define CONFIG_MPEG2_QSV_DECODER 0
+#define CONFIG_MPEG2_QSV_ENCODER 0
+#define CONFIG_MPEG2_V4L2M2M_DECODER 0
+#define CONFIG_MPEG2_VAAPI_ENCODER 0
+#define CONFIG_MPEG2_VAAPI_HWACCEL 0
+#define CONFIG_MPEG2_VDPAU_HWACCEL 0
+#define CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL 0
+#define CONFIG_MPEG2_XVMC_HWACCEL 0
+#define CONFIG_MPEG4VIDEO_PARSER 0
+#define CONFIG_MPEG4_CRYSTALHD_DECODER 0
+#define CONFIG_MPEG4_CUVID_DECODER 0
+#define CONFIG_MPEG4_DECODER 0
+#define CONFIG_MPEG4_ENCODER 0
+#define CONFIG_MPEG4_MEDIACODEC_DECODER 0
+#define CONFIG_MPEG4_MMAL_DECODER 0
+#define CONFIG_MPEG4_NVDEC_HWACCEL 0
+#define CONFIG_MPEG4_UNPACK_BFRAMES_BSF 0
+#define CONFIG_MPEG4_V4L2M2M_DECODER 0
+#define CONFIG_MPEG4_V4L2M2M_ENCODER 0
+#define CONFIG_MPEG4_VAAPI_HWACCEL 0
+#define CONFIG_MPEG4_VDPAU_HWACCEL 0
+#define CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL 0
+#define CONFIG_MPEGAUDIO_PARSER 0
+#define CONFIG_MPEGPS_DEMUXER 0
+#define CONFIG_MPEGTSRAW_DEMUXER 0
+#define CONFIG_MPEGTS_DEMUXER 0
+#define CONFIG_MPEGTS_MUXER 0
+#define CONFIG_MPEGVIDEO_DECODER 0
+#define CONFIG_MPEGVIDEO_DEMUXER 0
+#define CONFIG_MPEGVIDEO_PARSER 0
+#define CONFIG_MPJPEG_DEMUXER 0
+#define CONFIG_MPJPEG_MUXER 0
+#define CONFIG_MPL2_DECODER 0
+#define CONFIG_MPL2_DEMUXER 0
+#define CONFIG_MPSUB_DEMUXER 0
+#define CONFIG_MPTESTSRC_FILTER 0
+#define CONFIG_MSA1_DECODER 0
+#define CONFIG_MSCC_DECODER 0
+#define CONFIG_MSF_DEMUXER 0
+#define CONFIG_MSMPEG4V1_DECODER 0
+#define CONFIG_MSMPEG4V2_DECODER 0
+#define CONFIG_MSMPEG4V2_ENCODER 0
+#define CONFIG_MSMPEG4V3_DECODER 0
+#define CONFIG_MSMPEG4V3_ENCODER 0
+#define CONFIG_MSMPEG4_CRYSTALHD_DECODER 0
+#define CONFIG_MSNWC_TCP_DEMUXER 0
+#define CONFIG_MSRLE_DECODER 0
+#define CONFIG_MSS1_DECODER 0
+#define CONFIG_MSS2_DECODER 0
+#define CONFIG_MSVIDEO1_DECODER 0
+#define CONFIG_MSVIDEO1_ENCODER 0
+#define CONFIG_MSZH_DECODER 0
+#define CONFIG_MTAF_DEMUXER 0
+#define CONFIG_MTS2_DECODER 0
+#define CONFIG_MTV_DEMUXER 0
+#define CONFIG_MUSX_DEMUXER 0
+#define CONFIG_MUXERS 0
+#define CONFIG_MVC1_DECODER 0
+#define CONFIG_MVC2_DECODER 0
+#define CONFIG_MVI_DEMUXER 0
+#define CONFIG_MV_DEMUXER 0
+#define CONFIG_MXF_D10_MUXER 0
+#define CONFIG_MXF_DEMUXER 0
+#define CONFIG_MXF_MUXER 0
+#define CONFIG_MXF_OPATOM_MUXER 0
+#define CONFIG_MXG_DEMUXER 0
+#define CONFIG_MXPEG_DECODER 0
+#define CONFIG_NC_DEMUXER 0
#define CONFIG_NEGATE_FILTER 0
+#define CONFIG_NELLYMOSER_DECODER 0
+#define CONFIG_NELLYMOSER_ENCODER 0
+#define CONFIG_NISTSPHERE_DEMUXER 0
#define CONFIG_NLMEANS_FILTER 0
#define CONFIG_NNEDI_FILTER 0
#define CONFIG_NOFORMAT_FILTER 0
+#define CONFIG_NOISE_BSF 0
#define CONFIG_NOISE_FILTER 0
+#define CONFIG_NORMALIZE_FILTER 0
+#define CONFIG_NSP_DEMUXER 0
+#define CONFIG_NSV_DEMUXER 0
+#define CONFIG_NULLSINK_FILTER 0
+#define CONFIG_NULLSRC_FILTER 0
#define CONFIG_NULL_FILTER 0
+#define CONFIG_NULL_MUXER 0
+#define CONFIG_NUT_DEMUXER 0
+#define CONFIG_NUT_MUXER 0
+#define CONFIG_NUV_DECODER 0
+#define CONFIG_NUV_DEMUXER 0
+#define CONFIG_NVENC_ENCODER 0
+#define CONFIG_NVENC_H264_ENCODER 0
+#define CONFIG_NVENC_HEVC_ENCODER 0
#define CONFIG_OCR_FILTER 0
#define CONFIG_OCV_FILTER 0
+#define CONFIG_OGA_MUXER 0
+#define CONFIG_OGG_DEMUXER 0
+#define CONFIG_OGG_MUXER 0
+#define CONFIG_OGV_MUXER 0
+#define CONFIG_OMA_DEMUXER 0
+#define CONFIG_OMA_MUXER 0
+#define CONFIG_ON2AVC_DECODER 0
+#define CONFIG_OPENAL_INDEV 0
+#define CONFIG_OPENCLSRC_FILTER 0
+#define CONFIG_OPENGL_OUTDEV 0
+#define CONFIG_OPUS_DECODER 0
+#define CONFIG_OPUS_ENCODER 0
+#define CONFIG_OPUS_MUXER 0
+#define CONFIG_OPUS_PARSER 0
#define CONFIG_OSCILLOSCOPE_FILTER 0
+#define CONFIG_OSS_INDEV 0
+#define CONFIG_OSS_OUTDEV 0
+#define CONFIG_OUTDEVS 0
#define CONFIG_OVERLAY_FILTER 0
+#define CONFIG_OVERLAY_OPENCL_FILTER 0
+#define CONFIG_OVERLAY_QSV_FILTER 0
#define CONFIG_OWDENOISE_FILTER 0
#define CONFIG_PAD_FILTER 0
+#define CONFIG_PAF_AUDIO_DECODER 0
+#define CONFIG_PAF_DEMUXER 0
+#define CONFIG_PAF_VIDEO_DECODER 0
#define CONFIG_PALETTEGEN_FILTER 0
#define CONFIG_PALETTEUSE_FILTER 0
+#define CONFIG_PAM_DECODER 0
+#define CONFIG_PAM_ENCODER 0
+#define CONFIG_PAN_FILTER 0
+#define CONFIG_PBM_DECODER 0
+#define CONFIG_PBM_ENCODER 0
+#define CONFIG_PCM_ALAW_AT_DECODER 0
+#define CONFIG_PCM_ALAW_AT_ENCODER 0
+#define CONFIG_PCM_ALAW_DECODER 0
+#define CONFIG_PCM_ALAW_DEMUXER 0
+#define CONFIG_PCM_ALAW_ENCODER 0
+#define CONFIG_PCM_ALAW_MUXER 0
+#define CONFIG_PCM_BLURAY_DECODER 0
+#define CONFIG_PCM_DVD_DECODER 0
+#define CONFIG_PCM_F16LE_DECODER 0
+#define CONFIG_PCM_F24LE_DECODER 0
+#define CONFIG_PCM_F32BE_DECODER 0
+#define CONFIG_PCM_F32BE_DEMUXER 0
+#define CONFIG_PCM_F32BE_ENCODER 0
+#define CONFIG_PCM_F32BE_MUXER 0
+#define CONFIG_PCM_F32LE_DECODER 0
+#define CONFIG_PCM_F32LE_DEMUXER 0
+#define CONFIG_PCM_F32LE_ENCODER 0
+#define CONFIG_PCM_F32LE_MUXER 0
+#define CONFIG_PCM_F64BE_DECODER 0
+#define CONFIG_PCM_F64BE_DEMUXER 0
+#define CONFIG_PCM_F64BE_ENCODER 0
+#define CONFIG_PCM_F64BE_MUXER 0
+#define CONFIG_PCM_F64LE_DECODER 0
+#define CONFIG_PCM_F64LE_DEMUXER 0
+#define CONFIG_PCM_F64LE_ENCODER 0
+#define CONFIG_PCM_F64LE_MUXER 0
+#define CONFIG_PCM_LXF_DECODER 0
+#define CONFIG_PCM_MULAW_AT_DECODER 0
+#define CONFIG_PCM_MULAW_AT_ENCODER 0
+#define CONFIG_PCM_MULAW_DECODER 0
+#define CONFIG_PCM_MULAW_DEMUXER 0
+#define CONFIG_PCM_MULAW_ENCODER 0
+#define CONFIG_PCM_MULAW_MUXER 0
+#define CONFIG_PCM_S16BE_DECODER 0
+#define CONFIG_PCM_S16BE_DEMUXER 0
+#define CONFIG_PCM_S16BE_ENCODER 0
+#define CONFIG_PCM_S16BE_MUXER 0
+#define CONFIG_PCM_S16BE_PLANAR_DECODER 0
+#define CONFIG_PCM_S16BE_PLANAR_ENCODER 0
+#define CONFIG_PCM_S16LE_DECODER 0
+#define CONFIG_PCM_S16LE_DEMUXER 0
+#define CONFIG_PCM_S16LE_ENCODER 0
+#define CONFIG_PCM_S16LE_MUXER 0
+#define CONFIG_PCM_S16LE_PLANAR_DECODER 0
+#define CONFIG_PCM_S16LE_PLANAR_ENCODER 0
+#define CONFIG_PCM_S24BE_DECODER 0
+#define CONFIG_PCM_S24BE_DEMUXER 0
+#define CONFIG_PCM_S24BE_ENCODER 0
+#define CONFIG_PCM_S24BE_MUXER 0
+#define CONFIG_PCM_S24DAUD_DECODER 0
+#define CONFIG_PCM_S24DAUD_ENCODER 0
+#define CONFIG_PCM_S24LE_DECODER 0
+#define CONFIG_PCM_S24LE_DEMUXER 0
+#define CONFIG_PCM_S24LE_ENCODER 0
+#define CONFIG_PCM_S24LE_MUXER 0
+#define CONFIG_PCM_S24LE_PLANAR_DECODER 0
+#define CONFIG_PCM_S24LE_PLANAR_ENCODER 0
+#define CONFIG_PCM_S32BE_DECODER 0
+#define CONFIG_PCM_S32BE_DEMUXER 0
+#define CONFIG_PCM_S32BE_ENCODER 0
+#define CONFIG_PCM_S32BE_MUXER 0
+#define CONFIG_PCM_S32LE_DECODER 0
+#define CONFIG_PCM_S32LE_DEMUXER 0
+#define CONFIG_PCM_S32LE_ENCODER 0
+#define CONFIG_PCM_S32LE_MUXER 0
+#define CONFIG_PCM_S32LE_PLANAR_DECODER 0
+#define CONFIG_PCM_S32LE_PLANAR_ENCODER 0
+#define CONFIG_PCM_S64BE_DECODER 0
+#define CONFIG_PCM_S64BE_ENCODER 0
+#define CONFIG_PCM_S64LE_DECODER 0
+#define CONFIG_PCM_S64LE_ENCODER 0
+#define CONFIG_PCM_S8_DECODER 0
+#define CONFIG_PCM_S8_DEMUXER 0
+#define CONFIG_PCM_S8_ENCODER 0
+#define CONFIG_PCM_S8_MUXER 0
+#define CONFIG_PCM_S8_PLANAR_DECODER 0
+#define CONFIG_PCM_S8_PLANAR_ENCODER 0
+#define CONFIG_PCM_U16BE_DECODER 0
+#define CONFIG_PCM_U16BE_DEMUXER 0
+#define CONFIG_PCM_U16BE_ENCODER 0
+#define CONFIG_PCM_U16BE_MUXER 0
+#define CONFIG_PCM_U16LE_DECODER 0
+#define CONFIG_PCM_U16LE_DEMUXER 0
+#define CONFIG_PCM_U16LE_ENCODER 0
+#define CONFIG_PCM_U16LE_MUXER 0
+#define CONFIG_PCM_U24BE_DECODER 0
+#define CONFIG_PCM_U24BE_DEMUXER 0
+#define CONFIG_PCM_U24BE_ENCODER 0
+#define CONFIG_PCM_U24BE_MUXER 0
+#define CONFIG_PCM_U24LE_DECODER 0
+#define CONFIG_PCM_U24LE_DEMUXER 0
+#define CONFIG_PCM_U24LE_ENCODER 0
+#define CONFIG_PCM_U24LE_MUXER 0
+#define CONFIG_PCM_U32BE_DECODER 0
+#define CONFIG_PCM_U32BE_DEMUXER 0
+#define CONFIG_PCM_U32BE_ENCODER 0
+#define CONFIG_PCM_U32BE_MUXER 0
+#define CONFIG_PCM_U32LE_DECODER 0
+#define CONFIG_PCM_U32LE_DEMUXER 0
+#define CONFIG_PCM_U32LE_ENCODER 0
+#define CONFIG_PCM_U32LE_MUXER 0
+#define CONFIG_PCM_U8_DECODER 0
+#define CONFIG_PCM_U8_DEMUXER 0
+#define CONFIG_PCM_U8_ENCODER 0
+#define CONFIG_PCM_U8_MUXER 0
+#define CONFIG_PCM_ZORK_DECODER 0
+#define CONFIG_PCX_DECODER 0
+#define CONFIG_PCX_ENCODER 0
#define CONFIG_PERMS_FILTER 0
#define CONFIG_PERSPECTIVE_FILTER 0
+#define CONFIG_PGMYUV_DECODER 0
+#define CONFIG_PGMYUV_ENCODER 0
+#define CONFIG_PGM_DECODER 0
+#define CONFIG_PGM_ENCODER 0
+#define CONFIG_PGSSUB_DECODER 0
#define CONFIG_PHASE_FILTER 0
+#define CONFIG_PICTOR_DECODER 0
+#define CONFIG_PIPE_PROTOCOL 0
#define CONFIG_PIXDESCTEST_FILTER 0
+#define CONFIG_PIXLET_DECODER 0
#define CONFIG_PIXSCOPE_FILTER 0
-#define CONFIG_PP_FILTER 0
+#define CONFIG_PJS_DECODER 0
+#define CONFIG_PJS_DEMUXER 0
+#define CONFIG_PMP_DEMUXER 0
+#define CONFIG_PNG_DECODER 0
+#define CONFIG_PNG_ENCODER 0
+#define CONFIG_PNG_PARSER 0
+#define CONFIG_PNM_PARSER 0
#define CONFIG_PP7_FILTER 0
+#define CONFIG_PPM_DECODER 0
+#define CONFIG_PPM_ENCODER 0
+#define CONFIG_PP_FILTER 0
#define CONFIG_PREMULTIPLY_FILTER 0
#define CONFIG_PREWITT_FILTER 0
+#define CONFIG_PROCAMP_VAAPI_FILTER 0
+#define CONFIG_PROGRAM_OPENCL_FILTER 0
+#define CONFIG_PROMPEG_PROTOCOL 0
+#define CONFIG_PRORES_AW_ENCODER 0
+#define CONFIG_PRORES_DECODER 0
+#define CONFIG_PRORES_ENCODER 0
+#define CONFIG_PRORES_KS_ENCODER 0
+#define CONFIG_PRORES_LGPL_DECODER 0
+#define CONFIG_PROTOCOLS 0
+#define CONFIG_PSD_DECODER 0
#define CONFIG_PSEUDOCOLOR_FILTER 0
#define CONFIG_PSNR_FILTER 0
+#define CONFIG_PSP_MUXER 0
+#define CONFIG_PTX_DECODER 0
#define CONFIG_PULLUP_FILTER 0
+#define CONFIG_PULSE_INDEV 0
+#define CONFIG_PULSE_OUTDEV 0
+#define CONFIG_PVA_DEMUXER 0
+#define CONFIG_PVF_DEMUXER 0
+#define CONFIG_QCELP_DECODER 0
+#define CONFIG_QCP_DEMUXER 0
+#define CONFIG_QDM2_AT_DECODER 0
+#define CONFIG_QDM2_DECODER 0
+#define CONFIG_QDMC_AT_DECODER 0
+#define CONFIG_QDMC_DECODER 0
+#define CONFIG_QDRAW_DECODER 0
+#define CONFIG_QPEG_DECODER 0
#define CONFIG_QP_FILTER 0
+#define CONFIG_QTRLE_DECODER 0
+#define CONFIG_QTRLE_ENCODER 0
+#define CONFIG_R10K_DECODER 0
+#define CONFIG_R10K_ENCODER 0
+#define CONFIG_R210_DECODER 0
+#define CONFIG_R210_ENCODER 0
+#define CONFIG_R3D_DEMUXER 0
+#define CONFIG_RALF_DECODER 0
#define CONFIG_RANDOM_FILTER 0
+#define CONFIG_RAWVIDEO_DECODER 0
+#define CONFIG_RAWVIDEO_DEMUXER 0
+#define CONFIG_RAWVIDEO_ENCODER 0
+#define CONFIG_RAWVIDEO_MUXER 0
+#define CONFIG_RA_144_DECODER 0
+#define CONFIG_RA_144_ENCODER 0
+#define CONFIG_RA_288_DECODER 0
#define CONFIG_READEIA608_FILTER 0
#define CONFIG_READVITC_FILTER 0
+#define CONFIG_REALTEXT_DECODER 0
+#define CONFIG_REALTEXT_DEMUXER 0
#define CONFIG_REALTIME_FILTER 0
+#define CONFIG_REDSPARK_DEMUXER 0
#define CONFIG_REMAP_FILTER 0
#define CONFIG_REMOVEGRAIN_FILTER 0
#define CONFIG_REMOVELOGO_FILTER 0
+#define CONFIG_REMOVE_EXTRADATA_BSF 0
#define CONFIG_REPEATFIELDS_FILTER 0
+#define CONFIG_REPLAYGAIN_FILTER 0
+#define CONFIG_RESAMPLE_FILTER 0
#define CONFIG_REVERSE_FILTER 0
+#define CONFIG_RGBTESTSRC_FILTER 0
+#define CONFIG_RL2_DECODER 0
+#define CONFIG_RL2_DEMUXER 0
+#define CONFIG_RM_DEMUXER 0
+#define CONFIG_RM_MUXER 0
#define CONFIG_ROBERTS_FILTER 0
+#define CONFIG_ROQ_DECODER 0
+#define CONFIG_ROQ_DEMUXER 0
+#define CONFIG_ROQ_DPCM_DECODER 0
+#define CONFIG_ROQ_DPCM_ENCODER 0
+#define CONFIG_ROQ_ENCODER 0
+#define CONFIG_ROQ_MUXER 0
#define CONFIG_ROTATE_FILTER 0
+#define CONFIG_RPL_DEMUXER 0
+#define CONFIG_RPZA_DECODER 0
+#define CONFIG_RSCC_DECODER 0
+#define CONFIG_RSD_DEMUXER 0
+#define CONFIG_RSO_DEMUXER 0
+#define CONFIG_RSO_MUXER 0
+#define CONFIG_RTMPE_PROTOCOL 0
+#define CONFIG_RTMPS_PROTOCOL 0
+#define CONFIG_RTMPTE_PROTOCOL 0
+#define CONFIG_RTMPTS_PROTOCOL 0
+#define CONFIG_RTMPT_PROTOCOL 0
+#define CONFIG_RTMP_PROTOCOL 0
+#define CONFIG_RTP_DEMUXER 0
+#define CONFIG_RTP_MPEGTS_MUXER 0
+#define CONFIG_RTP_MUXER 0
+#define CONFIG_RTP_PROTOCOL 0
+#define CONFIG_RTSP_DEMUXER 0
+#define CONFIG_RTSP_MUXER 0
+#define CONFIG_RUBBERBAND_FILTER 0
+#define CONFIG_RV10_DECODER 0
+#define CONFIG_RV10_ENCODER 0
+#define CONFIG_RV20_DECODER 0
+#define CONFIG_RV20_ENCODER 0
+#define CONFIG_RV30_DECODER 0
+#define CONFIG_RV30_PARSER 0
+#define CONFIG_RV40_DECODER 0
+#define CONFIG_RV40_PARSER 0
+#define CONFIG_S302M_DECODER 0
+#define CONFIG_S302M_ENCODER 0
+#define CONFIG_S337M_DEMUXER 0
#define CONFIG_SAB_FILTER 0
-#define CONFIG_SCALE_FILTER 0
+#define CONFIG_SAMI_DECODER 0
+#define CONFIG_SAMI_DEMUXER 0
+#define CONFIG_SANM_DECODER 0
+#define CONFIG_SAP_DEMUXER 0
+#define CONFIG_SAP_MUXER 0
+#define CONFIG_SBC_DECODER 0
+#define CONFIG_SBC_DEMUXER 0
+#define CONFIG_SBC_ENCODER 0
+#define CONFIG_SBC_MUXER 0
+#define CONFIG_SBC_PARSER 0
+#define CONFIG_SBG_DEMUXER 0
+#define CONFIG_SCALE2REF_FILTER 0
#define CONFIG_SCALE_CUDA_FILTER 0
+#define CONFIG_SCALE_FILTER 0
#define CONFIG_SCALE_NPP_FILTER 0
#define CONFIG_SCALE_QSV_FILTER 0
#define CONFIG_SCALE_VAAPI_FILTER 0
-#define CONFIG_SCALE2REF_FILTER 0
-#define CONFIG_SELECT_FILTER 0
+#define CONFIG_SCC_DEMUXER 0
+#define CONFIG_SCC_MUXER 0
+#define CONFIG_SCPR_DECODER 0
+#define CONFIG_SCREENPRESSO_DECODER 0
+#define CONFIG_SCTP_PROTOCOL 0
+#define CONFIG_SDL2_OUTDEV 0
+#define CONFIG_SDP_DEMUXER 0
+#define CONFIG_SDR2_DEMUXER 0
+#define CONFIG_SDS_DEMUXER 0
+#define CONFIG_SDX2_DPCM_DECODER 0
+#define CONFIG_SDX_DEMUXER 0
+#define CONFIG_SEGAFILM_DEMUXER 0
+#define CONFIG_SEGAFILM_MUXER 0
+#define CONFIG_SEGMENT_MUXER 0
#define CONFIG_SELECTIVECOLOR_FILTER 0
+#define CONFIG_SELECT_FILTER 0
#define CONFIG_SENDCMD_FILTER 0
#define CONFIG_SEPARATEFIELDS_FILTER 0
#define CONFIG_SETDAR_FILTER 0
#define CONFIG_SETFIELD_FILTER 0
#define CONFIG_SETPTS_FILTER 0
+#define CONFIG_SETRANGE_FILTER 0
#define CONFIG_SETSAR_FILTER 0
#define CONFIG_SETTB_FILTER 0
+#define CONFIG_SGIRLE_DECODER 0
+#define CONFIG_SGI_DECODER 0
+#define CONFIG_SGI_ENCODER 0
+#define CONFIG_SHARPNESS_VAAPI_FILTER 0
+#define CONFIG_SHEERVIDEO_DECODER 0
+#define CONFIG_SHORTEN_DECODER 0
+#define CONFIG_SHORTEN_DEMUXER 0
+#define CONFIG_SHOWCQT_FILTER 0
+#define CONFIG_SHOWFREQS_FILTER 0
#define CONFIG_SHOWINFO_FILTER 0
#define CONFIG_SHOWPALETTE_FILTER 0
+#define CONFIG_SHOWSPECTRUMPIC_FILTER 0
+#define CONFIG_SHOWSPECTRUM_FILTER 0
+#define CONFIG_SHOWVOLUME_FILTER 0
+#define CONFIG_SHOWWAVESPIC_FILTER 0
+#define CONFIG_SHOWWAVES_FILTER 0
#define CONFIG_SHUFFLEFRAMES_FILTER 0
#define CONFIG_SHUFFLEPLANES_FILTER 0
+#define CONFIG_SIDECHAINCOMPRESS_FILTER 0
+#define CONFIG_SIDECHAINGATE_FILTER 0
#define CONFIG_SIDEDATA_FILTER 0
+#define CONFIG_SIFF_DEMUXER 0
#define CONFIG_SIGNALSTATS_FILTER 0
#define CONFIG_SIGNATURE_FILTER 0
+#define CONFIG_SILENCEDETECT_FILTER 0
+#define CONFIG_SILENCEREMOVE_FILTER 0
+#define CONFIG_SINE_FILTER 0
+#define CONFIG_SINGLEJPEG_MUXER 0
+#define CONFIG_SIPR_DECODER 0
+#define CONFIG_SIPR_PARSER 0
+#define CONFIG_SLN_DEMUXER 0
+#define CONFIG_SMACKAUD_DECODER 0
+#define CONFIG_SMACKER_DECODER 0
+#define CONFIG_SMACKER_DEMUXER 0
#define CONFIG_SMARTBLUR_FILTER 0
+#define CONFIG_SMC_DECODER 0
+#define CONFIG_SMJPEG_DEMUXER 0
+#define CONFIG_SMJPEG_MUXER 0
+#define CONFIG_SMOOTHSTREAMING_MUXER 0
+#define CONFIG_SMPTEBARS_FILTER 0
+#define CONFIG_SMPTEHDBARS_FILTER 0
+#define CONFIG_SMUSH_DEMUXER 0
+#define CONFIG_SMVJPEG_DECODER 0
+#define CONFIG_SNDIO_INDEV 0
+#define CONFIG_SNDIO_OUTDEV 0
+#define CONFIG_SNOW_DECODER 0
+#define CONFIG_SNOW_ENCODER 0
#define CONFIG_SOBEL_FILTER 0
+#define CONFIG_SOFALIZER_FILTER 0
+#define CONFIG_SOL_DEMUXER 0
+#define CONFIG_SOL_DPCM_DECODER 0
+#define CONFIG_SONIC_DECODER 0
+#define CONFIG_SONIC_ENCODER 0
+#define CONFIG_SONIC_LS_ENCODER 0
+#define CONFIG_SOX_DEMUXER 0
+#define CONFIG_SOX_MUXER 0
+#define CONFIG_SP5X_DECODER 0
+#define CONFIG_SPDIF_DEMUXER 0
+#define CONFIG_SPDIF_MUXER 0
+#define CONFIG_SPECTRUMSYNTH_FILTER 0
+#define CONFIG_SPEEDHQ_DECODER 0
#define CONFIG_SPLIT_FILTER 0
#define CONFIG_SPP_FILTER 0
+#define CONFIG_SPX_MUXER 0
+#define CONFIG_SRGC_DECODER 0
+#define CONFIG_SRTP_PROTOCOL 0
+#define CONFIG_SRT_DECODER 0
+#define CONFIG_SRT_DEMUXER 0
+#define CONFIG_SRT_ENCODER 0
+#define CONFIG_SRT_MUXER 0
+#define CONFIG_SSA_DECODER 0
+#define CONFIG_SSA_ENCODER 0
#define CONFIG_SSIM_FILTER 0
#define CONFIG_STEREO3D_FILTER 0
+#define CONFIG_STEREOTOOLS_FILTER 0
+#define CONFIG_STEREOWIDEN_FILTER 0
+#define CONFIG_STL_DECODER 0
+#define CONFIG_STL_DEMUXER 0
#define CONFIG_STREAMSELECT_FILTER 0
+#define CONFIG_STREAM_SEGMENT_MUXER 0
+#define CONFIG_STR_DEMUXER 0
+#define CONFIG_SUBFILE_PROTOCOL 0
+#define CONFIG_SUBRIP_DECODER 0
+#define CONFIG_SUBRIP_ENCODER 0
#define CONFIG_SUBTITLES_FILTER 0
+#define CONFIG_SUBVIEWER1_DECODER 0
+#define CONFIG_SUBVIEWER1_DEMUXER 0
+#define CONFIG_SUBVIEWER_DECODER 0
+#define CONFIG_SUBVIEWER_DEMUXER 0
+#define CONFIG_SUNRAST_DECODER 0
+#define CONFIG_SUNRAST_ENCODER 0
#define CONFIG_SUPER2XSAI_FILTER 0
+#define CONFIG_SUPEREQUALIZER_FILTER 0
+#define CONFIG_SUP_DEMUXER 0
+#define CONFIG_SUP_MUXER 0
+#define CONFIG_SURROUND_FILTER 0
+#define CONFIG_SVAG_DEMUXER 0
+#define CONFIG_SVQ1_DECODER 0
+#define CONFIG_SVQ1_ENCODER 0
+#define CONFIG_SVQ3_DECODER 0
#define CONFIG_SWAPRECT_FILTER 0
#define CONFIG_SWAPUV_FILTER 0
+#define CONFIG_SWF_DEMUXER 0
+#define CONFIG_SWF_MUXER 0
+#define CONFIG_TAK_DECODER 0
+#define CONFIG_TAK_DEMUXER 0
+#define CONFIG_TAK_PARSER 0
+#define CONFIG_TARGA_DECODER 0
+#define CONFIG_TARGA_ENCODER 0
+#define CONFIG_TARGA_Y216_DECODER 0
#define CONFIG_TBLEND_FILTER 0
+#define CONFIG_TCP_PROTOCOL 0
+#define CONFIG_TDSC_DECODER 0
+#define CONFIG_TEDCAPTIONS_DEMUXER 0
+#define CONFIG_TEE_MUXER 0
+#define CONFIG_TEE_PROTOCOL 0
#define CONFIG_TELECINE_FILTER 0
+#define CONFIG_TESTSRC2_FILTER 0
+#define CONFIG_TESTSRC_FILTER 0
+#define CONFIG_TEXT2MOVSUB_BSF 0
+#define CONFIG_TEXT_DECODER 0
+#define CONFIG_TEXT_ENCODER 0
+#define CONFIG_TG2_MUXER 0
+#define CONFIG_TGP_MUXER 0
+#define CONFIG_THEORA_DECODER 0
+#define CONFIG_THP_DECODER 0
+#define CONFIG_THP_DEMUXER 0
+#define CONFIG_THREEDOSTR_DEMUXER 0
#define CONFIG_THRESHOLD_FILTER 0
-#define CONFIG_THUMBNAIL_FILTER 0
#define CONFIG_THUMBNAIL_CUDA_FILTER 0
+#define CONFIG_THUMBNAIL_FILTER 0
+#define CONFIG_TIERTEXSEQVIDEO_DECODER 0
+#define CONFIG_TIERTEXSEQ_DEMUXER 0
+#define CONFIG_TIFF_DECODER 0
+#define CONFIG_TIFF_ENCODER 0
#define CONFIG_TILE_FILTER 0
#define CONFIG_TINTERLACE_FILTER 0
+#define CONFIG_TLS_PROTOCOL 0
#define CONFIG_TLUT2_FILTER 0
+#define CONFIG_TMV_DECODER 0
+#define CONFIG_TMV_DEMUXER 0
#define CONFIG_TONEMAP_FILTER 0
+#define CONFIG_TRACE_HEADERS_BSF 0
#define CONFIG_TRANSPOSE_FILTER 0
+#define CONFIG_TREBLE_FILTER 0
+#define CONFIG_TREMOLO_FILTER 0
#define CONFIG_TRIM_FILTER 0
+#define CONFIG_TRUEHD_DECODER 0
+#define CONFIG_TRUEHD_DEMUXER 0
+#define CONFIG_TRUEHD_ENCODER 0
+#define CONFIG_TRUEHD_MUXER 0
+#define CONFIG_TRUEMOTION1_DECODER 0
+#define CONFIG_TRUEMOTION2RT_DECODER 0
+#define CONFIG_TRUEMOTION2_DECODER 0
+#define CONFIG_TRUESPEECH_DECODER 0
+#define CONFIG_TSCC2_DECODER 0
+#define CONFIG_TSCC_DECODER 0
+#define CONFIG_TTA_DECODER 0
+#define CONFIG_TTA_DEMUXER 0
+#define CONFIG_TTA_ENCODER 0
+#define CONFIG_TTA_MUXER 0
+#define CONFIG_TTY_DEMUXER 0
+#define CONFIG_TWINVQ_DECODER 0
+#define CONFIG_TXD_DECODER 0
+#define CONFIG_TXD_DEMUXER 0
+#define CONFIG_TY_DEMUXER 0
+#define CONFIG_UDPLITE_PROTOCOL 0
+#define CONFIG_UDP_PROTOCOL 0
+#define CONFIG_ULTI_DECODER 0
+#define CONFIG_UNCODEDFRAMECRC_MUXER 0
+#define CONFIG_UNIX_PROTOCOL 0
#define CONFIG_UNPREMULTIPLY_FILTER 0
#define CONFIG_UNSHARP_FILTER 0
+#define CONFIG_UNSHARP_OPENCL_FILTER 0
#define CONFIG_USPP_FILTER 0
+#define CONFIG_UTVIDEO_DECODER 0
+#define CONFIG_UTVIDEO_ENCODER 0
+#define CONFIG_V210X_DECODER 0
+#define CONFIG_V210X_DEMUXER 0
+#define CONFIG_V210_DECODER 0
+#define CONFIG_V210_DEMUXER 0
+#define CONFIG_V210_ENCODER 0
+#define CONFIG_V308_DECODER 0
+#define CONFIG_V308_ENCODER 0
+#define CONFIG_V408_DECODER 0
+#define CONFIG_V408_ENCODER 0
+#define CONFIG_V410_DECODER 0
+#define CONFIG_V410_ENCODER 0
+#define CONFIG_V4L2_INDEV 0
+#define CONFIG_V4L2_OUTDEV 0
#define CONFIG_VAGUEDENOISER_FILTER 0
+#define CONFIG_VAG_DEMUXER 0
+#define CONFIG_VBLE_DECODER 0
+#define CONFIG_VB_DECODER 0
+#define CONFIG_VC1IMAGE_DECODER 0
+#define CONFIG_VC1T_DEMUXER 0
+#define CONFIG_VC1T_MUXER 0
+#define CONFIG_VC1_CRYSTALHD_DECODER 0
+#define CONFIG_VC1_CUVID_DECODER 0
+#define CONFIG_VC1_D3D11VA2_HWACCEL 0
+#define CONFIG_VC1_D3D11VA_HWACCEL 0
+#define CONFIG_VC1_DECODER 0
+#define CONFIG_VC1_DEMUXER 0
+#define CONFIG_VC1_DXVA2_HWACCEL 0
+#define CONFIG_VC1_MMAL_DECODER 0
+#define CONFIG_VC1_MUXER 0
+#define CONFIG_VC1_NVDEC_HWACCEL 0
+#define CONFIG_VC1_PARSER 0
+#define CONFIG_VC1_QSV_DECODER 0
+#define CONFIG_VC1_V4L2M2M_DECODER 0
+#define CONFIG_VC1_VAAPI_HWACCEL 0
+#define CONFIG_VC1_VDPAU_HWACCEL 0
+#define CONFIG_VC2_ENCODER 0
+#define CONFIG_VCR1_DECODER 0
#define CONFIG_VECTORSCOPE_FILTER 0
#define CONFIG_VFLIP_FILTER 0
+#define CONFIG_VFRDET_FILTER 0
+#define CONFIG_VFWCAP_INDEV 0
+#define CONFIG_VIBRATO_FILTER 0
#define CONFIG_VIDSTABDETECT_FILTER 0
#define CONFIG_VIDSTABTRANSFORM_FILTER 0
#define CONFIG_VIGNETTE_FILTER 0
+#define CONFIG_VIVO_DEMUXER 0
#define CONFIG_VMAFMOTION_FILTER 0
-#define CONFIG_VSTACK_FILTER 0
-#define CONFIG_W3FDIF_FILTER 0
-#define CONFIG_WAVEFORM_FILTER 0
-#define CONFIG_WEAVE_FILTER 0
-#define CONFIG_XBR_FILTER 0
-#define CONFIG_YADIF_FILTER 0
-#define CONFIG_ZMQ_FILTER 0
-#define CONFIG_ZOOMPAN_FILTER 0
-#define CONFIG_ZSCALE_FILTER 0
-#define CONFIG_ALLRGB_FILTER 0
-#define CONFIG_ALLYUV_FILTER 0
-#define CONFIG_CELLAUTO_FILTER 0
-#define CONFIG_COLOR_FILTER 0
-#define CONFIG_COREIMAGESRC_FILTER 0
-#define CONFIG_FREI0R_SRC_FILTER 0
-#define CONFIG_HALDCLUTSRC_FILTER 0
-#define CONFIG_LIFE_FILTER 0
-#define CONFIG_MANDELBROT_FILTER 0
-#define CONFIG_MPTESTSRC_FILTER 0
-#define CONFIG_NULLSRC_FILTER 0
-#define CONFIG_RGBTESTSRC_FILTER 0
-#define CONFIG_SMPTEBARS_FILTER 0
-#define CONFIG_SMPTEHDBARS_FILTER 0
-#define CONFIG_TESTSRC_FILTER 0
-#define CONFIG_TESTSRC2_FILTER 0
-#define CONFIG_YUVTESTSRC_FILTER 0
-#define CONFIG_NULLSINK_FILTER 0
-#define CONFIG_ABITSCOPE_FILTER 0
-#define CONFIG_ADRAWGRAPH_FILTER 0
-#define CONFIG_AHISTOGRAM_FILTER 0
-#define CONFIG_APHASEMETER_FILTER 0
-#define CONFIG_AVECTORSCOPE_FILTER 0
-#define CONFIG_CONCAT_FILTER 0
-#define CONFIG_SHOWCQT_FILTER 0
-#define CONFIG_SHOWFREQS_FILTER 0
-#define CONFIG_SHOWSPECTRUM_FILTER 0
-#define CONFIG_SHOWSPECTRUMPIC_FILTER 0
-#define CONFIG_SHOWVOLUME_FILTER 0
-#define CONFIG_SHOWWAVES_FILTER 0
-#define CONFIG_SHOWWAVESPIC_FILTER 0
-#define CONFIG_SPECTRUMSYNTH_FILTER 0
-#define CONFIG_AMOVIE_FILTER 0
-#define CONFIG_MOVIE_FILTER 0
-#define CONFIG_H263_VAAPI_HWACCEL 0
-#define CONFIG_H263_VIDEOTOOLBOX_HWACCEL 0
-#define CONFIG_H264_CUVID_HWACCEL 0
-#define CONFIG_H264_D3D11VA_HWACCEL 0
-#define CONFIG_H264_D3D11VA2_HWACCEL 0
-#define CONFIG_H264_DXVA2_HWACCEL 0
-#define CONFIG_H264_MEDIACODEC_HWACCEL 0
-#define CONFIG_H264_MMAL_HWACCEL 0
-#define CONFIG_H264_QSV_HWACCEL 0
-#define CONFIG_H264_VAAPI_HWACCEL 0
-#define CONFIG_H264_VDA_HWACCEL 0
-#define CONFIG_H264_VDA_OLD_HWACCEL 0
-#define CONFIG_H264_VDPAU_HWACCEL 0
-#define CONFIG_H264_VIDEOTOOLBOX_HWACCEL 0
-#define CONFIG_HEVC_CUVID_HWACCEL 0
-#define CONFIG_HEVC_D3D11VA_HWACCEL 0
-#define CONFIG_HEVC_D3D11VA2_HWACCEL 0
-#define CONFIG_HEVC_DXVA2_HWACCEL 0
-#define CONFIG_HEVC_MEDIACODEC_HWACCEL 0
-#define CONFIG_HEVC_QSV_HWACCEL 0
-#define CONFIG_HEVC_VAAPI_HWACCEL 0
-#define CONFIG_HEVC_VDPAU_HWACCEL 0
-#define CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL 0
-#define CONFIG_MJPEG_CUVID_HWACCEL 0
-#define CONFIG_MPEG1_CUVID_HWACCEL 0
-#define CONFIG_MPEG1_XVMC_HWACCEL 0
-#define CONFIG_MPEG1_VDPAU_HWACCEL 0
-#define CONFIG_MPEG1_VIDEOTOOLBOX_HWACCEL 0
-#define CONFIG_MPEG2_CUVID_HWACCEL 0
-#define CONFIG_MPEG2_XVMC_HWACCEL 0
-#define CONFIG_MPEG2_D3D11VA_HWACCEL 0
-#define CONFIG_MPEG2_D3D11VA2_HWACCEL 0
-#define CONFIG_MPEG2_DXVA2_HWACCEL 0
-#define CONFIG_MPEG2_MMAL_HWACCEL 0
-#define CONFIG_MPEG2_QSV_HWACCEL 0
-#define CONFIG_MPEG2_VAAPI_HWACCEL 0
-#define CONFIG_MPEG2_VDPAU_HWACCEL 0
-#define CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL 0
-#define CONFIG_MPEG2_MEDIACODEC_HWACCEL 0
-#define CONFIG_MPEG4_CUVID_HWACCEL 0
-#define CONFIG_MPEG4_MEDIACODEC_HWACCEL 0
-#define CONFIG_MPEG4_MMAL_HWACCEL 0
-#define CONFIG_MPEG4_VAAPI_HWACCEL 0
-#define CONFIG_MPEG4_VDPAU_HWACCEL 0
-#define CONFIG_MPEG4_VIDEOTOOLBOX_HWACCEL 0
-#define CONFIG_VC1_CUVID_HWACCEL 0
-#define CONFIG_VC1_D3D11VA_HWACCEL 0
-#define CONFIG_VC1_D3D11VA2_HWACCEL 0
-#define CONFIG_VC1_DXVA2_HWACCEL 0
-#define CONFIG_VC1_VAAPI_HWACCEL 0
-#define CONFIG_VC1_VDPAU_HWACCEL 0
-#define CONFIG_VC1_MMAL_HWACCEL 0
-#define CONFIG_VC1_QSV_HWACCEL 0
-#define CONFIG_VP8_CUVID_HWACCEL 0
-#define CONFIG_VP8_MEDIACODEC_HWACCEL 0
-#define CONFIG_VP8_QSV_HWACCEL 0
-#define CONFIG_VP9_CUVID_HWACCEL 0
-#define CONFIG_VP9_D3D11VA_HWACCEL 0
+#define CONFIG_VMDAUDIO_DECODER 0
+#define CONFIG_VMDVIDEO_DECODER 0
+#define CONFIG_VMD_DEMUXER 0
+#define CONFIG_VMNC_DECODER 0
+#define CONFIG_VOBSUB_DEMUXER 0
+#define CONFIG_VOC_DEMUXER 0
+#define CONFIG_VOC_MUXER 0
+#define CONFIG_VOLUMEDETECT_FILTER 0
+#define CONFIG_VOLUME_FILTER 0
+#define CONFIG_VORBIS_DECODER 0
+#define CONFIG_VORBIS_ENCODER 0
+#define CONFIG_VORBIS_PARSER 0
+#define CONFIG_VP3_DECODER 0
+#define CONFIG_VP3_PARSER 0
+#define CONFIG_VP5_DECODER 0
+#define CONFIG_VP6A_DECODER 0
+#define CONFIG_VP6F_DECODER 0
+#define CONFIG_VP6_DECODER 0
+#define CONFIG_VP7_DECODER 0
+#define CONFIG_VP8_CUVID_DECODER 0
+#define CONFIG_VP8_MEDIACODEC_DECODER 0
+#define CONFIG_VP8_NVDEC_HWACCEL 0
+#define CONFIG_VP8_QSV_DECODER 0
+#define CONFIG_VP8_RKMPP_DECODER 0
+#define CONFIG_VP8_V4L2M2M_DECODER 0
+#define CONFIG_VP8_V4L2M2M_ENCODER 0
+#define CONFIG_VP8_VAAPI_ENCODER 0
+#define CONFIG_VP8_VAAPI_HWACCEL 0
+#define CONFIG_VP9_CUVID_DECODER 0
#define CONFIG_VP9_D3D11VA2_HWACCEL 0
+#define CONFIG_VP9_D3D11VA_HWACCEL 0
#define CONFIG_VP9_DXVA2_HWACCEL 0
-#define CONFIG_VP9_MEDIACODEC_HWACCEL 0
+#define CONFIG_VP9_MEDIACODEC_DECODER 0
+#define CONFIG_VP9_NVDEC_HWACCEL 0
+#define CONFIG_VP9_RAW_REORDER_BSF 0
+#define CONFIG_VP9_RKMPP_DECODER 0
+#define CONFIG_VP9_SUPERFRAME_BSF 0
+#define CONFIG_VP9_V4L2M2M_DECODER 0
+#define CONFIG_VP9_VAAPI_ENCODER 0
#define CONFIG_VP9_VAAPI_HWACCEL 0
-#define CONFIG_WMV3_D3D11VA_HWACCEL 0
-#define CONFIG_WMV3_D3D11VA2_HWACCEL 0
-#define CONFIG_WMV3_DXVA2_HWACCEL 0
-#define CONFIG_WMV3_VAAPI_HWACCEL 0
-#define CONFIG_WMV3_VDPAU_HWACCEL 0
-#define CONFIG_ALSA_INDEV 0
-#define CONFIG_AVFOUNDATION_INDEV 0
-#define CONFIG_BKTR_INDEV 0
-#define CONFIG_DECKLINK_INDEV 0
-#define CONFIG_LIBNDI_NEWTEK_INDEV 0
-#define CONFIG_DSHOW_INDEV 0
-#define CONFIG_FBDEV_INDEV 0
-#define CONFIG_GDIGRAB_INDEV 0
-#define CONFIG_IEC61883_INDEV 0
-#define CONFIG_JACK_INDEV 0
-#define CONFIG_KMSGRAB_INDEV 0
-#define CONFIG_LAVFI_INDEV 0
-#define CONFIG_OPENAL_INDEV 0
-#define CONFIG_OSS_INDEV 0
-#define CONFIG_PULSE_INDEV 0
-#define CONFIG_SNDIO_INDEV 0
-#define CONFIG_V4L2_INDEV 0
-#define CONFIG_VFWCAP_INDEV 0
-#define CONFIG_XCBGRAB_INDEV 0
-#define CONFIG_LIBCDIO_INDEV 0
-#define CONFIG_LIBDC1394_INDEV 0
-#define CONFIG_A64_MUXER 0
-#define CONFIG_AC3_MUXER 0
-#define CONFIG_ADTS_MUXER 0
-#define CONFIG_ADX_MUXER 0
-#define CONFIG_AIFF_MUXER 0
-#define CONFIG_AMR_MUXER 0
-#define CONFIG_APNG_MUXER 0
-#define CONFIG_ASF_MUXER 0
-#define CONFIG_ASS_MUXER 0
-#define CONFIG_AST_MUXER 0
-#define CONFIG_ASF_STREAM_MUXER 0
-#define CONFIG_AU_MUXER 0
-#define CONFIG_AVI_MUXER 0
-#define CONFIG_AVM2_MUXER 0
-#define CONFIG_BIT_MUXER 0
-#define CONFIG_CAF_MUXER 0
-#define CONFIG_CAVSVIDEO_MUXER 0
-#define CONFIG_CRC_MUXER 0
-#define CONFIG_DASH_MUXER 0
-#define CONFIG_DATA_MUXER 0
-#define CONFIG_DAUD_MUXER 0
-#define CONFIG_DIRAC_MUXER 0
-#define CONFIG_DNXHD_MUXER 0
-#define CONFIG_DTS_MUXER 0
-#define CONFIG_DV_MUXER 0
-#define CONFIG_EAC3_MUXER 0
-#define CONFIG_F4V_MUXER 0
-#define CONFIG_FFM_MUXER 0
-#define CONFIG_FFMETADATA_MUXER 0
-#define CONFIG_FIFO_MUXER 0
-#define CONFIG_FILMSTRIP_MUXER 0
-#define CONFIG_FITS_MUXER 0
-#define CONFIG_FLAC_MUXER 0
-#define CONFIG_FLV_MUXER 0
-#define CONFIG_FRAMECRC_MUXER 0
-#define CONFIG_FRAMEHASH_MUXER 0
-#define CONFIG_FRAMEMD5_MUXER 0
-#define CONFIG_G722_MUXER 0
-#define CONFIG_G723_1_MUXER 0
-#define CONFIG_G726_MUXER 0
-#define CONFIG_G726LE_MUXER 0
-#define CONFIG_GIF_MUXER 0
-#define CONFIG_GSM_MUXER 0
-#define CONFIG_GXF_MUXER 0
-#define CONFIG_H261_MUXER 0
-#define CONFIG_H263_MUXER 0
-#define CONFIG_H264_MUXER 0
-#define CONFIG_HASH_MUXER 0
-#define CONFIG_HDS_MUXER 0
-#define CONFIG_HEVC_MUXER 0
-#define CONFIG_HLS_MUXER 0
-#define CONFIG_ICO_MUXER 0
-#define CONFIG_ILBC_MUXER 0
-#define CONFIG_IMAGE2_MUXER 0
-#define CONFIG_IMAGE2PIPE_MUXER 0
-#define CONFIG_IPOD_MUXER 0
-#define CONFIG_IRCAM_MUXER 0
-#define CONFIG_ISMV_MUXER 0
-#define CONFIG_IVF_MUXER 0
-#define CONFIG_JACOSUB_MUXER 0
-#define CONFIG_LATM_MUXER 0
-#define CONFIG_LRC_MUXER 0
-#define CONFIG_M4V_MUXER 0
-#define CONFIG_MD5_MUXER 0
-#define CONFIG_MATROSKA_MUXER 0
-#define CONFIG_MATROSKA_AUDIO_MUXER 0
-#define CONFIG_MICRODVD_MUXER 0
-#define CONFIG_MJPEG_MUXER 0
-#define CONFIG_MLP_MUXER 0
-#define CONFIG_MMF_MUXER 0
-#define CONFIG_MOV_MUXER 0
-#define CONFIG_MP2_MUXER 0
-#define CONFIG_MP3_MUXER 0
-#define CONFIG_MP4_MUXER 0
-#define CONFIG_MPEG1SYSTEM_MUXER 0
-#define CONFIG_MPEG1VCD_MUXER 0
-#define CONFIG_MPEG1VIDEO_MUXER 0
-#define CONFIG_MPEG2DVD_MUXER 0
-#define CONFIG_MPEG2SVCD_MUXER 0
-#define CONFIG_MPEG2VIDEO_MUXER 0
-#define CONFIG_MPEG2VOB_MUXER 0
-#define CONFIG_MPEGTS_MUXER 0
-#define CONFIG_MPJPEG_MUXER 0
-#define CONFIG_MXF_MUXER 0
-#define CONFIG_MXF_D10_MUXER 0
-#define CONFIG_MXF_OPATOM_MUXER 0
-#define CONFIG_NULL_MUXER 0
-#define CONFIG_NUT_MUXER 0
-#define CONFIG_OGA_MUXER 0
-#define CONFIG_OGG_MUXER 0
-#define CONFIG_OGV_MUXER 0
-#define CONFIG_OMA_MUXER 0
-#define CONFIG_OPUS_MUXER 0
-#define CONFIG_PCM_ALAW_MUXER 0
-#define CONFIG_PCM_MULAW_MUXER 0
-#define CONFIG_PCM_F64BE_MUXER 0
-#define CONFIG_PCM_F64LE_MUXER 0
-#define CONFIG_PCM_F32BE_MUXER 0
-#define CONFIG_PCM_F32LE_MUXER 0
-#define CONFIG_PCM_S32BE_MUXER 0
-#define CONFIG_PCM_S32LE_MUXER 0
-#define CONFIG_PCM_S24BE_MUXER 0
-#define CONFIG_PCM_S24LE_MUXER 0
-#define CONFIG_PCM_S16BE_MUXER 0
-#define CONFIG_PCM_S16LE_MUXER 0
-#define CONFIG_PCM_S8_MUXER 0
-#define CONFIG_PCM_U32BE_MUXER 0
-#define CONFIG_PCM_U32LE_MUXER 0
-#define CONFIG_PCM_U24BE_MUXER 0
-#define CONFIG_PCM_U24LE_MUXER 0
-#define CONFIG_PCM_U16BE_MUXER 0
-#define CONFIG_PCM_U16LE_MUXER 0
-#define CONFIG_PCM_U8_MUXER 0
-#define CONFIG_PSP_MUXER 0
-#define CONFIG_RAWVIDEO_MUXER 0
-#define CONFIG_RM_MUXER 0
-#define CONFIG_ROQ_MUXER 0
-#define CONFIG_RSO_MUXER 0
-#define CONFIG_RTP_MUXER 0
-#define CONFIG_RTP_MPEGTS_MUXER 0
-#define CONFIG_RTSP_MUXER 0
-#define CONFIG_SAP_MUXER 0
-#define CONFIG_SCC_MUXER 0
-#define CONFIG_SEGMENT_MUXER 0
-#define CONFIG_STREAM_SEGMENT_MUXER 0
-#define CONFIG_SINGLEJPEG_MUXER 0
-#define CONFIG_SMJPEG_MUXER 0
-#define CONFIG_SMOOTHSTREAMING_MUXER 0
-#define CONFIG_SOX_MUXER 0
-#define CONFIG_SPX_MUXER 0
-#define CONFIG_SPDIF_MUXER 0
-#define CONFIG_SRT_MUXER 0
-#define CONFIG_SUP_MUXER 0
-#define CONFIG_SWF_MUXER 0
-#define CONFIG_TEE_MUXER 0
-#define CONFIG_TG2_MUXER 0
-#define CONFIG_TGP_MUXER 0
-#define CONFIG_MKVTIMESTAMP_V2_MUXER 0
-#define CONFIG_TRUEHD_MUXER 0
-#define CONFIG_TTA_MUXER 0
-#define CONFIG_UNCODEDFRAMECRC_MUXER 0
-#define CONFIG_VC1_MUXER 0
-#define CONFIG_VC1T_MUXER 0
-#define CONFIG_VOC_MUXER 0
+#define CONFIG_VPK_DEMUXER 0
+#define CONFIG_VPLAYER_DECODER 0
+#define CONFIG_VPLAYER_DEMUXER 0
+#define CONFIG_VPP_QSV_FILTER 0
+#define CONFIG_VQA_DECODER 0
+#define CONFIG_VQF_DEMUXER 0
+#define CONFIG_VSTACK_FILTER 0
+#define CONFIG_W3FDIF_FILTER 0
+#define CONFIG_W64_DEMUXER 0
#define CONFIG_W64_MUXER 0
+#define CONFIG_WAVEFORM_FILTER 0
+#define CONFIG_WAVPACK_DECODER 0
+#define CONFIG_WAVPACK_ENCODER 0
+#define CONFIG_WAV_DEMUXER 0
#define CONFIG_WAV_MUXER 0
-#define CONFIG_WEBM_MUXER 0
-#define CONFIG_WEBM_DASH_MANIFEST_MUXER 0
+#define CONFIG_WC3_DEMUXER 0
+#define CONFIG_WEAVE_FILTER 0
#define CONFIG_WEBM_CHUNK_MUXER 0
+#define CONFIG_WEBM_DASH_MANIFEST_DEMUXER 0
+#define CONFIG_WEBM_DASH_MANIFEST_MUXER 0
+#define CONFIG_WEBM_MUXER 0
+#define CONFIG_WEBP_DECODER 0
#define CONFIG_WEBP_MUXER 0
+#define CONFIG_WEBVTT_DECODER 0
+#define CONFIG_WEBVTT_DEMUXER 0
+#define CONFIG_WEBVTT_ENCODER 0
#define CONFIG_WEBVTT_MUXER 0
+#define CONFIG_WMALOSSLESS_DECODER 0
+#define CONFIG_WMAPRO_DECODER 0
+#define CONFIG_WMAV1_DECODER 0
+#define CONFIG_WMAV1_ENCODER 0
+#define CONFIG_WMAV2_DECODER 0
+#define CONFIG_WMAV2_ENCODER 0
+#define CONFIG_WMAVOICE_DECODER 0
+#define CONFIG_WMV1_DECODER 0
+#define CONFIG_WMV1_ENCODER 0
+#define CONFIG_WMV2_DECODER 0
+#define CONFIG_WMV2_ENCODER 0
+#define CONFIG_WMV3IMAGE_DECODER 0
+#define CONFIG_WMV3_CRYSTALHD_DECODER 0
+#define CONFIG_WMV3_D3D11VA2_HWACCEL 0
+#define CONFIG_WMV3_D3D11VA_HWACCEL 0
+#define CONFIG_WMV3_DECODER 0
+#define CONFIG_WMV3_DXVA2_HWACCEL 0
+#define CONFIG_WMV3_NVDEC_HWACCEL 0
+#define CONFIG_WMV3_VAAPI_HWACCEL 0
+#define CONFIG_WMV3_VDPAU_HWACCEL 0
+#define CONFIG_WNV1_DECODER 0
+#define CONFIG_WRAPPED_AVFRAME_DECODER 0
+#define CONFIG_WRAPPED_AVFRAME_ENCODER 0
+#define CONFIG_WSAUD_DEMUXER 0
+#define CONFIG_WSD_DEMUXER 0
+#define CONFIG_WSVQA_DEMUXER 0
+#define CONFIG_WS_SND1_DECODER 0
+#define CONFIG_WTV_DEMUXER 0
#define CONFIG_WTV_MUXER 0
+#define CONFIG_WVE_DEMUXER 0
+#define CONFIG_WV_DEMUXER 0
#define CONFIG_WV_MUXER 0
-#define CONFIG_YUV4MPEGPIPE_MUXER 0
-#define CONFIG_CHROMAPRINT_MUXER 0
-#define CONFIG_ALSA_OUTDEV 0
-#define CONFIG_CACA_OUTDEV 0
-#define CONFIG_DECKLINK_OUTDEV 0
-#define CONFIG_LIBNDI_NEWTEK_OUTDEV 0
-#define CONFIG_FBDEV_OUTDEV 0
-#define CONFIG_OPENGL_OUTDEV 0
-#define CONFIG_OSS_OUTDEV 0
-#define CONFIG_PULSE_OUTDEV 0
-#define CONFIG_SDL2_OUTDEV 0
-#define CONFIG_SNDIO_OUTDEV 0
-#define CONFIG_V4L2_OUTDEV 0
-#define CONFIG_XV_OUTDEV 0
-#define CONFIG_AAC_PARSER 0
-#define CONFIG_AAC_LATM_PARSER 0
-#define CONFIG_AC3_PARSER 0
-#define CONFIG_ADX_PARSER 0
-#define CONFIG_BMP_PARSER 0
-#define CONFIG_CAVSVIDEO_PARSER 0
-#define CONFIG_COOK_PARSER 0
-#define CONFIG_DCA_PARSER 0
-#define CONFIG_DIRAC_PARSER 0
-#define CONFIG_DNXHD_PARSER 0
-#define CONFIG_DPX_PARSER 0
-#define CONFIG_DVAUDIO_PARSER 0
-#define CONFIG_DVBSUB_PARSER 0
-#define CONFIG_DVDSUB_PARSER 0
-#define CONFIG_DVD_NAV_PARSER 0
-#define CONFIG_G729_PARSER 0
-#define CONFIG_GSM_PARSER 0
-#define CONFIG_H261_PARSER 0
-#define CONFIG_H263_PARSER 0
-#define CONFIG_H264_PARSER 0
-#define CONFIG_HEVC_PARSER 0
-#define CONFIG_MJPEG_PARSER 0
-#define CONFIG_MLP_PARSER 0
-#define CONFIG_MPEG4VIDEO_PARSER 0
-#define CONFIG_MPEGAUDIO_PARSER 0
-#define CONFIG_MPEGVIDEO_PARSER 0
-#define CONFIG_OPUS_PARSER 0
-#define CONFIG_PNG_PARSER 0
-#define CONFIG_PNM_PARSER 0
-#define CONFIG_RV30_PARSER 0
-#define CONFIG_RV40_PARSER 0
-#define CONFIG_SIPR_PARSER 0
-#define CONFIG_TAK_PARSER 0
-#define CONFIG_VC1_PARSER 0
-#define CONFIG_VORBIS_PARSER 0
-#define CONFIG_VP3_PARSER 0
+#define CONFIG_XAN_DPCM_DECODER 0
+#define CONFIG_XAN_WC3_DECODER 0
+#define CONFIG_XAN_WC4_DECODER 0
+#define CONFIG_XA_DEMUXER 0
+#define CONFIG_XBIN_DECODER 0
+#define CONFIG_XBIN_DEMUXER 0
+#define CONFIG_XBM_DECODER 0
+#define CONFIG_XBM_ENCODER 0
+#define CONFIG_XBR_FILTER 0
+#define CONFIG_XCBGRAB_INDEV 0
+#define CONFIG_XFACE_DECODER 0
+#define CONFIG_XFACE_ENCODER 0
+#define CONFIG_XL_DECODER 0
+#define CONFIG_XMA1_DECODER 0
+#define CONFIG_XMA2_DECODER 0
#define CONFIG_XMA_PARSER 0
-#define CONFIG_ASYNC_PROTOCOL 0
-#define CONFIG_BLURAY_PROTOCOL 0
-#define CONFIG_CACHE_PROTOCOL 0
-#define CONFIG_CONCAT_PROTOCOL 0
-#define CONFIG_CRYPTO_PROTOCOL 0
-#define CONFIG_DATA_PROTOCOL 0
-#define CONFIG_FFRTMPCRYPT_PROTOCOL 0
-#define CONFIG_FFRTMPHTTP_PROTOCOL 0
-#define CONFIG_FILE_PROTOCOL 0
-#define CONFIG_FTP_PROTOCOL 0
-#define CONFIG_GOPHER_PROTOCOL 0
-#define CONFIG_HLS_PROTOCOL 0
-#define CONFIG_HTTP_PROTOCOL 0
-#define CONFIG_HTTPPROXY_PROTOCOL 0
-#define CONFIG_HTTPS_PROTOCOL 0
-#define CONFIG_ICECAST_PROTOCOL 0
-#define CONFIG_MMSH_PROTOCOL 0
-#define CONFIG_MMST_PROTOCOL 0
-#define CONFIG_MD5_PROTOCOL 0
-#define CONFIG_PIPE_PROTOCOL 0
-#define CONFIG_PROMPEG_PROTOCOL 0
-#define CONFIG_RTMP_PROTOCOL 0
-#define CONFIG_RTMPE_PROTOCOL 0
-#define CONFIG_RTMPS_PROTOCOL 0
-#define CONFIG_RTMPT_PROTOCOL 0
-#define CONFIG_RTMPTE_PROTOCOL 0
-#define CONFIG_RTMPTS_PROTOCOL 0
-#define CONFIG_RTP_PROTOCOL 0
-#define CONFIG_SCTP_PROTOCOL 0
-#define CONFIG_SRTP_PROTOCOL 0
-#define CONFIG_SUBFILE_PROTOCOL 0
-#define CONFIG_TEE_PROTOCOL 0
-#define CONFIG_TCP_PROTOCOL 0
-#define CONFIG_TLS_GNUTLS_PROTOCOL 0
-#define CONFIG_TLS_SCHANNEL_PROTOCOL 0
-#define CONFIG_TLS_SECURETRANSPORT_PROTOCOL 0
-#define CONFIG_TLS_OPENSSL_PROTOCOL 0
-#define CONFIG_UDP_PROTOCOL 0
-#define CONFIG_UDPLITE_PROTOCOL 0
-#define CONFIG_UNIX_PROTOCOL 0
-#define CONFIG_LIBRTMP_PROTOCOL 0
-#define CONFIG_LIBRTMPE_PROTOCOL 0
-#define CONFIG_LIBRTMPS_PROTOCOL 0
-#define CONFIG_LIBRTMPT_PROTOCOL 0
-#define CONFIG_LIBRTMPTE_PROTOCOL 0
-#define CONFIG_LIBSSH_PROTOCOL 0
-#define CONFIG_LIBSMBCLIENT_PROTOCOL 0
+#define CONFIG_XMV_DEMUXER 0
+#define CONFIG_XPM_DECODER 0
+#define CONFIG_XSUB_DECODER 0
+#define CONFIG_XSUB_ENCODER 0
+#define CONFIG_XVAG_DEMUXER 0
+#define CONFIG_XV_OUTDEV 0
+#define CONFIG_XWD_DECODER 0
+#define CONFIG_XWD_ENCODER 0
+#define CONFIG_XWMA_DEMUXER 0
+#define CONFIG_Y41P_DECODER 0
+#define CONFIG_Y41P_ENCODER 0
+#define CONFIG_YADIF_FILTER 0
+#define CONFIG_YLC_DECODER 0
+#define CONFIG_YOP_DECODER 0
+#define CONFIG_YOP_DEMUXER 0
+#define CONFIG_YUV4MPEGPIPE_DEMUXER 0
+#define CONFIG_YUV4MPEGPIPE_MUXER 0
+#define CONFIG_YUV4_DECODER 0
+#define CONFIG_YUV4_ENCODER 0
+#define CONFIG_YUVTESTSRC_FILTER 0
+#define CONFIG_ZERO12V_DECODER 0
+#define CONFIG_ZEROCODEC_DECODER 0
+#define CONFIG_ZLIB_DECODER 0
+#define CONFIG_ZLIB_ENCODER 0
+#define CONFIG_ZMBV_DECODER 0
+#define CONFIG_ZMBV_ENCODER 0
+#define CONFIG_ZMQ_FILTER 0
+#define CONFIG_ZOOMPAN_FILTER 0
+#define CONFIG_ZSCALE_FILTER 0
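The generated header above forces every listed component flag to 0, so those codecs, (de)muxers, filters, protocols and devices are compiled out of the in-tree ffvpx build. As a rough sketch only of how such flags are consumed (the expansion below simply mirrors what the REGISTER_DECODER(VP8, vp8) line removed from allcodecs.c further down used to do; it is not code from this patch, and the helper name is made up for illustration):

    #include "config.h"    /* generated CONFIG_* component flags */
    #include "avcodec.h"   /* AVCodec, avcodec_register() */

    /* Same shape as the old REGISTER_DECODER macro: when CONFIG_VP8_DECODER
     * is 0 the branch is dead code, the compiler eliminates it, and
     * ff_vp8_decoder is never referenced or linked in. */
    static void register_vp8_if_enabled(void)
    {
        extern AVCodec ff_vp8_decoder;
        if (CONFIG_VP8_DECODER)
            avcodec_register(&ff_vp8_decoder);
    }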
diff --git a/media/ffvpx/ffvpxcommon.mozbuild b/media/ffvpx/ffvpxcommon.mozbuild
index 620158694..b6230bb9f 100644
--- a/media/ffvpx/ffvpxcommon.mozbuild
+++ b/media/ffvpx/ffvpxcommon.mozbuild
@@ -21,7 +21,7 @@ if CONFIG['FFVPX_ASFLAGS']:
else:
ASFLAGS += ['-Pconfig_win64.asm']
elif CONFIG['OS_ARCH'] == 'Darwin':
- # 32/64-bit macosx assemblers need to prefix symbols with an underscore.
+ # 64-bit macosx assemblers need to prefix symbols with an underscore.
ASFLAGS += [
'-Pconfig_darwin64.asm',
'-DPREFIX'
diff --git a/media/ffvpx/libavcodec/allcodecs.c b/media/ffvpx/libavcodec/allcodecs.c
index 4f34312e6..4d4ef530e 100644
--- a/media/ffvpx/libavcodec/allcodecs.c
+++ b/media/ffvpx/libavcodec/allcodecs.c
@@ -29,732 +29,865 @@
#include "avcodec.h"
#include "version.h"
-#define REGISTER_HWACCEL(X, x) \
- { \
- extern AVHWAccel ff_##x##_hwaccel; \
- if (CONFIG_##X##_HWACCEL) \
- av_register_hwaccel(&ff_##x##_hwaccel); \
+extern AVCodec ff_a64multi_encoder;
+extern AVCodec ff_a64multi5_encoder;
+extern AVCodec ff_aasc_decoder;
+extern AVCodec ff_aic_decoder;
+extern AVCodec ff_alias_pix_encoder;
+extern AVCodec ff_alias_pix_decoder;
+extern AVCodec ff_amv_encoder;
+extern AVCodec ff_amv_decoder;
+extern AVCodec ff_anm_decoder;
+extern AVCodec ff_ansi_decoder;
+extern AVCodec ff_apng_encoder;
+extern AVCodec ff_apng_decoder;
+extern AVCodec ff_asv1_encoder;
+extern AVCodec ff_asv1_decoder;
+extern AVCodec ff_asv2_encoder;
+extern AVCodec ff_asv2_decoder;
+extern AVCodec ff_aura_decoder;
+extern AVCodec ff_aura2_decoder;
+extern AVCodec ff_avrp_encoder;
+extern AVCodec ff_avrp_decoder;
+extern AVCodec ff_avrn_decoder;
+extern AVCodec ff_avs_decoder;
+extern AVCodec ff_avui_encoder;
+extern AVCodec ff_avui_decoder;
+extern AVCodec ff_ayuv_encoder;
+extern AVCodec ff_ayuv_decoder;
+extern AVCodec ff_bethsoftvid_decoder;
+extern AVCodec ff_bfi_decoder;
+extern AVCodec ff_bink_decoder;
+extern AVCodec ff_bmp_encoder;
+extern AVCodec ff_bmp_decoder;
+extern AVCodec ff_bmv_video_decoder;
+extern AVCodec ff_brender_pix_decoder;
+extern AVCodec ff_c93_decoder;
+extern AVCodec ff_cavs_decoder;
+extern AVCodec ff_cdgraphics_decoder;
+extern AVCodec ff_cdxl_decoder;
+extern AVCodec ff_cfhd_decoder;
+extern AVCodec ff_cinepak_encoder;
+extern AVCodec ff_cinepak_decoder;
+extern AVCodec ff_clearvideo_decoder;
+extern AVCodec ff_cljr_encoder;
+extern AVCodec ff_cljr_decoder;
+extern AVCodec ff_cllc_decoder;
+extern AVCodec ff_comfortnoise_encoder;
+extern AVCodec ff_comfortnoise_decoder;
+extern AVCodec ff_cpia_decoder;
+extern AVCodec ff_cscd_decoder;
+extern AVCodec ff_cyuv_decoder;
+extern AVCodec ff_dds_decoder;
+extern AVCodec ff_dfa_decoder;
+extern AVCodec ff_dirac_decoder;
+extern AVCodec ff_dnxhd_encoder;
+extern AVCodec ff_dnxhd_decoder;
+extern AVCodec ff_dpx_encoder;
+extern AVCodec ff_dpx_decoder;
+extern AVCodec ff_dsicinvideo_decoder;
+extern AVCodec ff_dvaudio_decoder;
+extern AVCodec ff_dvvideo_encoder;
+extern AVCodec ff_dvvideo_decoder;
+extern AVCodec ff_dxa_decoder;
+extern AVCodec ff_dxtory_decoder;
+extern AVCodec ff_dxv_decoder;
+extern AVCodec ff_eacmv_decoder;
+extern AVCodec ff_eamad_decoder;
+extern AVCodec ff_eatgq_decoder;
+extern AVCodec ff_eatgv_decoder;
+extern AVCodec ff_eatqi_decoder;
+extern AVCodec ff_eightbps_decoder;
+extern AVCodec ff_eightsvx_exp_decoder;
+extern AVCodec ff_eightsvx_fib_decoder;
+extern AVCodec ff_escape124_decoder;
+extern AVCodec ff_escape130_decoder;
+extern AVCodec ff_exr_decoder;
+extern AVCodec ff_ffv1_encoder;
+extern AVCodec ff_ffv1_decoder;
+extern AVCodec ff_ffvhuff_encoder;
+extern AVCodec ff_ffvhuff_decoder;
+extern AVCodec ff_fic_decoder;
+extern AVCodec ff_fits_encoder;
+extern AVCodec ff_fits_decoder;
+extern AVCodec ff_flashsv_encoder;
+extern AVCodec ff_flashsv_decoder;
+extern AVCodec ff_flashsv2_encoder;
+extern AVCodec ff_flashsv2_decoder;
+extern AVCodec ff_flic_decoder;
+extern AVCodec ff_flv_encoder;
+extern AVCodec ff_flv_decoder;
+extern AVCodec ff_fmvc_decoder;
+extern AVCodec ff_fourxm_decoder;
+extern AVCodec ff_fraps_decoder;
+extern AVCodec ff_frwu_decoder;
+extern AVCodec ff_g2m_decoder;
+extern AVCodec ff_gdv_decoder;
+extern AVCodec ff_gif_encoder;
+extern AVCodec ff_gif_decoder;
+extern AVCodec ff_h261_encoder;
+extern AVCodec ff_h261_decoder;
+extern AVCodec ff_h263_encoder;
+extern AVCodec ff_h263_decoder;
+extern AVCodec ff_h263i_decoder;
+extern AVCodec ff_h263p_encoder;
+extern AVCodec ff_h263p_decoder;
+extern AVCodec ff_h263_v4l2m2m_decoder;
+extern AVCodec ff_h264_decoder;
+extern AVCodec ff_h264_crystalhd_decoder;
+extern AVCodec ff_h264_v4l2m2m_decoder;
+extern AVCodec ff_h264_mediacodec_decoder;
+extern AVCodec ff_h264_mmal_decoder;
+extern AVCodec ff_h264_qsv_decoder;
+extern AVCodec ff_h264_rkmpp_decoder;
+extern AVCodec ff_hap_encoder;
+extern AVCodec ff_hap_decoder;
+extern AVCodec ff_hevc_decoder;
+extern AVCodec ff_hevc_qsv_decoder;
+extern AVCodec ff_hevc_rkmpp_decoder;
+extern AVCodec ff_hevc_v4l2m2m_decoder;
+extern AVCodec ff_hnm4_video_decoder;
+extern AVCodec ff_hq_hqa_decoder;
+extern AVCodec ff_hqx_decoder;
+extern AVCodec ff_huffyuv_encoder;
+extern AVCodec ff_huffyuv_decoder;
+extern AVCodec ff_idcin_decoder;
+extern AVCodec ff_iff_ilbm_decoder;
+extern AVCodec ff_indeo2_decoder;
+extern AVCodec ff_indeo3_decoder;
+extern AVCodec ff_indeo4_decoder;
+extern AVCodec ff_indeo5_decoder;
+extern AVCodec ff_interplay_video_decoder;
+extern AVCodec ff_jpeg2000_encoder;
+extern AVCodec ff_jpeg2000_decoder;
+extern AVCodec ff_jpegls_encoder;
+extern AVCodec ff_jpegls_decoder;
+extern AVCodec ff_jv_decoder;
+extern AVCodec ff_kgv1_decoder;
+extern AVCodec ff_kmvc_decoder;
+extern AVCodec ff_lagarith_decoder;
+extern AVCodec ff_ljpeg_encoder;
+extern AVCodec ff_loco_decoder;
+extern AVCodec ff_m101_decoder;
+extern AVCodec ff_magicyuv_encoder;
+extern AVCodec ff_magicyuv_decoder;
+extern AVCodec ff_mdec_decoder;
+extern AVCodec ff_mimic_decoder;
+extern AVCodec ff_mjpeg_encoder;
+extern AVCodec ff_mjpeg_decoder;
+extern AVCodec ff_mjpegb_decoder;
+extern AVCodec ff_mmvideo_decoder;
+extern AVCodec ff_motionpixels_decoder;
+extern AVCodec ff_mpeg1video_encoder;
+extern AVCodec ff_mpeg1video_decoder;
+extern AVCodec ff_mpeg2video_encoder;
+extern AVCodec ff_mpeg2video_decoder;
+extern AVCodec ff_mpeg4_encoder;
+extern AVCodec ff_mpeg4_decoder;
+extern AVCodec ff_mpeg4_crystalhd_decoder;
+extern AVCodec ff_mpeg4_v4l2m2m_decoder;
+extern AVCodec ff_mpeg4_mmal_decoder;
+extern AVCodec ff_mpegvideo_decoder;
+extern AVCodec ff_mpeg1_v4l2m2m_decoder;
+extern AVCodec ff_mpeg2_mmal_decoder;
+extern AVCodec ff_mpeg2_crystalhd_decoder;
+extern AVCodec ff_mpeg2_v4l2m2m_decoder;
+extern AVCodec ff_mpeg2_qsv_decoder;
+extern AVCodec ff_mpeg2_mediacodec_decoder;
+extern AVCodec ff_msa1_decoder;
+extern AVCodec ff_mscc_decoder;
+extern AVCodec ff_msmpeg4v1_decoder;
+extern AVCodec ff_msmpeg4v2_encoder;
+extern AVCodec ff_msmpeg4v2_decoder;
+extern AVCodec ff_msmpeg4v3_encoder;
+extern AVCodec ff_msmpeg4v3_decoder;
+extern AVCodec ff_msmpeg4_crystalhd_decoder;
+extern AVCodec ff_msrle_decoder;
+extern AVCodec ff_mss1_decoder;
+extern AVCodec ff_mss2_decoder;
+extern AVCodec ff_msvideo1_encoder;
+extern AVCodec ff_msvideo1_decoder;
+extern AVCodec ff_mszh_decoder;
+extern AVCodec ff_mts2_decoder;
+extern AVCodec ff_mvc1_decoder;
+extern AVCodec ff_mvc2_decoder;
+extern AVCodec ff_mxpeg_decoder;
+extern AVCodec ff_nuv_decoder;
+extern AVCodec ff_paf_video_decoder;
+extern AVCodec ff_pam_encoder;
+extern AVCodec ff_pam_decoder;
+extern AVCodec ff_pbm_encoder;
+extern AVCodec ff_pbm_decoder;
+extern AVCodec ff_pcx_encoder;
+extern AVCodec ff_pcx_decoder;
+extern AVCodec ff_pgm_encoder;
+extern AVCodec ff_pgm_decoder;
+extern AVCodec ff_pgmyuv_encoder;
+extern AVCodec ff_pgmyuv_decoder;
+extern AVCodec ff_pictor_decoder;
+extern AVCodec ff_pixlet_decoder;
+extern AVCodec ff_png_encoder;
+extern AVCodec ff_png_decoder;
+extern AVCodec ff_ppm_encoder;
+extern AVCodec ff_ppm_decoder;
+extern AVCodec ff_prores_encoder;
+extern AVCodec ff_prores_decoder;
+extern AVCodec ff_prores_aw_encoder;
+extern AVCodec ff_prores_ks_encoder;
+extern AVCodec ff_prores_lgpl_decoder;
+extern AVCodec ff_psd_decoder;
+extern AVCodec ff_ptx_decoder;
+extern AVCodec ff_qdraw_decoder;
+extern AVCodec ff_qpeg_decoder;
+extern AVCodec ff_qtrle_encoder;
+extern AVCodec ff_qtrle_decoder;
+extern AVCodec ff_r10k_encoder;
+extern AVCodec ff_r10k_decoder;
+extern AVCodec ff_r210_encoder;
+extern AVCodec ff_r210_decoder;
+extern AVCodec ff_rawvideo_encoder;
+extern AVCodec ff_rawvideo_decoder;
+extern AVCodec ff_rl2_decoder;
+extern AVCodec ff_roq_encoder;
+extern AVCodec ff_roq_decoder;
+extern AVCodec ff_rpza_decoder;
+extern AVCodec ff_rscc_decoder;
+extern AVCodec ff_rv10_encoder;
+extern AVCodec ff_rv10_decoder;
+extern AVCodec ff_rv20_encoder;
+extern AVCodec ff_rv20_decoder;
+extern AVCodec ff_rv30_decoder;
+extern AVCodec ff_rv40_decoder;
+extern AVCodec ff_s302m_encoder;
+extern AVCodec ff_s302m_decoder;
+extern AVCodec ff_sanm_decoder;
+extern AVCodec ff_scpr_decoder;
+extern AVCodec ff_screenpresso_decoder;
+extern AVCodec ff_sdx2_dpcm_decoder;
+extern AVCodec ff_sgi_encoder;
+extern AVCodec ff_sgi_decoder;
+extern AVCodec ff_sgirle_decoder;
+extern AVCodec ff_sheervideo_decoder;
+extern AVCodec ff_smacker_decoder;
+extern AVCodec ff_smc_decoder;
+extern AVCodec ff_smvjpeg_decoder;
+extern AVCodec ff_snow_encoder;
+extern AVCodec ff_snow_decoder;
+extern AVCodec ff_sp5x_decoder;
+extern AVCodec ff_speedhq_decoder;
+extern AVCodec ff_srgc_decoder;
+extern AVCodec ff_sunrast_encoder;
+extern AVCodec ff_sunrast_decoder;
+extern AVCodec ff_svq1_encoder;
+extern AVCodec ff_svq1_decoder;
+extern AVCodec ff_svq3_decoder;
+extern AVCodec ff_targa_encoder;
+extern AVCodec ff_targa_decoder;
+extern AVCodec ff_targa_y216_decoder;
+extern AVCodec ff_tdsc_decoder;
+extern AVCodec ff_theora_decoder;
+extern AVCodec ff_thp_decoder;
+extern AVCodec ff_tiertexseqvideo_decoder;
+extern AVCodec ff_tiff_encoder;
+extern AVCodec ff_tiff_decoder;
+extern AVCodec ff_tmv_decoder;
+extern AVCodec ff_truemotion1_decoder;
+extern AVCodec ff_truemotion2_decoder;
+extern AVCodec ff_truemotion2rt_decoder;
+extern AVCodec ff_tscc_decoder;
+extern AVCodec ff_tscc2_decoder;
+extern AVCodec ff_txd_decoder;
+extern AVCodec ff_ulti_decoder;
+extern AVCodec ff_utvideo_encoder;
+extern AVCodec ff_utvideo_decoder;
+extern AVCodec ff_v210_encoder;
+extern AVCodec ff_v210_decoder;
+extern AVCodec ff_v210x_decoder;
+extern AVCodec ff_v308_encoder;
+extern AVCodec ff_v308_decoder;
+extern AVCodec ff_v408_encoder;
+extern AVCodec ff_v408_decoder;
+extern AVCodec ff_v410_encoder;
+extern AVCodec ff_v410_decoder;
+extern AVCodec ff_vb_decoder;
+extern AVCodec ff_vble_decoder;
+extern AVCodec ff_vc1_decoder;
+extern AVCodec ff_vc1_crystalhd_decoder;
+extern AVCodec ff_vc1image_decoder;
+extern AVCodec ff_vc1_mmal_decoder;
+extern AVCodec ff_vc1_qsv_decoder;
+extern AVCodec ff_vc1_v4l2m2m_decoder;
+extern AVCodec ff_vc2_encoder;
+extern AVCodec ff_vcr1_decoder;
+extern AVCodec ff_vmdvideo_decoder;
+extern AVCodec ff_vmnc_decoder;
+extern AVCodec ff_vp3_decoder;
+extern AVCodec ff_vp5_decoder;
+extern AVCodec ff_vp6_decoder;
+extern AVCodec ff_vp6a_decoder;
+extern AVCodec ff_vp6f_decoder;
+extern AVCodec ff_vp7_decoder;
+extern AVCodec ff_vp8_decoder;
+extern AVCodec ff_vp8_rkmpp_decoder;
+extern AVCodec ff_vp8_v4l2m2m_decoder;
+extern AVCodec ff_vp9_decoder;
+extern AVCodec ff_vp9_rkmpp_decoder;
+extern AVCodec ff_vp9_v4l2m2m_decoder;
+extern AVCodec ff_vqa_decoder;
+extern AVCodec ff_bitpacked_decoder;
+extern AVCodec ff_webp_decoder;
+extern AVCodec ff_wrapped_avframe_encoder;
+extern AVCodec ff_wrapped_avframe_decoder;
+extern AVCodec ff_wmv1_encoder;
+extern AVCodec ff_wmv1_decoder;
+extern AVCodec ff_wmv2_encoder;
+extern AVCodec ff_wmv2_decoder;
+extern AVCodec ff_wmv3_decoder;
+extern AVCodec ff_wmv3_crystalhd_decoder;
+extern AVCodec ff_wmv3image_decoder;
+extern AVCodec ff_wnv1_decoder;
+extern AVCodec ff_xan_wc3_decoder;
+extern AVCodec ff_xan_wc4_decoder;
+extern AVCodec ff_xbm_encoder;
+extern AVCodec ff_xbm_decoder;
+extern AVCodec ff_xface_encoder;
+extern AVCodec ff_xface_decoder;
+extern AVCodec ff_xl_decoder;
+extern AVCodec ff_xpm_decoder;
+extern AVCodec ff_xwd_encoder;
+extern AVCodec ff_xwd_decoder;
+extern AVCodec ff_y41p_encoder;
+extern AVCodec ff_y41p_decoder;
+extern AVCodec ff_ylc_decoder;
+extern AVCodec ff_yop_decoder;
+extern AVCodec ff_yuv4_encoder;
+extern AVCodec ff_yuv4_decoder;
+extern AVCodec ff_zero12v_decoder;
+extern AVCodec ff_zerocodec_decoder;
+extern AVCodec ff_zlib_encoder;
+extern AVCodec ff_zlib_decoder;
+extern AVCodec ff_zmbv_encoder;
+extern AVCodec ff_zmbv_decoder;
+
+/* audio codecs */
+extern AVCodec ff_aac_encoder;
+extern AVCodec ff_aac_decoder;
+extern AVCodec ff_aac_fixed_decoder;
+extern AVCodec ff_aac_latm_decoder;
+extern AVCodec ff_ac3_encoder;
+extern AVCodec ff_ac3_decoder;
+extern AVCodec ff_ac3_fixed_encoder;
+extern AVCodec ff_ac3_fixed_decoder;
+extern AVCodec ff_alac_encoder;
+extern AVCodec ff_alac_decoder;
+extern AVCodec ff_als_decoder;
+extern AVCodec ff_amrnb_decoder;
+extern AVCodec ff_amrwb_decoder;
+extern AVCodec ff_ape_decoder;
+extern AVCodec ff_aptx_encoder;
+extern AVCodec ff_aptx_decoder;
+extern AVCodec ff_aptx_hd_encoder;
+extern AVCodec ff_aptx_hd_decoder;
+extern AVCodec ff_atrac1_decoder;
+extern AVCodec ff_atrac3_decoder;
+extern AVCodec ff_atrac3al_decoder;
+extern AVCodec ff_atrac3p_decoder;
+extern AVCodec ff_atrac3pal_decoder;
+extern AVCodec ff_binkaudio_dct_decoder;
+extern AVCodec ff_binkaudio_rdft_decoder;
+extern AVCodec ff_bmv_audio_decoder;
+extern AVCodec ff_cook_decoder;
+extern AVCodec ff_dca_encoder;
+extern AVCodec ff_dca_decoder;
+extern AVCodec ff_dolby_e_decoder;
+extern AVCodec ff_dsd_lsbf_decoder;
+extern AVCodec ff_dsd_msbf_decoder;
+extern AVCodec ff_dsd_lsbf_planar_decoder;
+extern AVCodec ff_dsd_msbf_planar_decoder;
+extern AVCodec ff_dsicinaudio_decoder;
+extern AVCodec ff_dss_sp_decoder;
+extern AVCodec ff_dst_decoder;
+extern AVCodec ff_eac3_encoder;
+extern AVCodec ff_eac3_decoder;
+extern AVCodec ff_evrc_decoder;
+extern AVCodec ff_ffwavesynth_decoder;
+extern AVCodec ff_flac_encoder;
+extern AVCodec ff_flac_decoder;
+extern AVCodec ff_g723_1_encoder;
+extern AVCodec ff_g723_1_decoder;
+extern AVCodec ff_g729_decoder;
+extern AVCodec ff_gsm_decoder;
+extern AVCodec ff_gsm_ms_decoder;
+extern AVCodec ff_iac_decoder;
+extern AVCodec ff_imc_decoder;
+extern AVCodec ff_interplay_acm_decoder;
+extern AVCodec ff_mace3_decoder;
+extern AVCodec ff_mace6_decoder;
+extern AVCodec ff_metasound_decoder;
+extern AVCodec ff_mlp_encoder;
+extern AVCodec ff_mlp_decoder;
+extern AVCodec ff_mp1_decoder;
+extern AVCodec ff_mp1float_decoder;
+extern AVCodec ff_mp2_encoder;
+extern AVCodec ff_mp2_decoder;
+extern AVCodec ff_mp2float_decoder;
+extern AVCodec ff_mp2fixed_encoder;
+extern AVCodec ff_mp3float_decoder;
+extern AVCodec ff_mp3_decoder;
+extern AVCodec ff_mp3adufloat_decoder;
+extern AVCodec ff_mp3adu_decoder;
+extern AVCodec ff_mp3on4float_decoder;
+extern AVCodec ff_mp3on4_decoder;
+extern AVCodec ff_mpc7_decoder;
+extern AVCodec ff_mpc8_decoder;
+extern AVCodec ff_nellymoser_encoder;
+extern AVCodec ff_nellymoser_decoder;
+extern AVCodec ff_on2avc_decoder;
+extern AVCodec ff_opus_encoder;
+extern AVCodec ff_opus_decoder;
+extern AVCodec ff_paf_audio_decoder;
+extern AVCodec ff_qcelp_decoder;
+extern AVCodec ff_qdm2_decoder;
+extern AVCodec ff_qdmc_decoder;
+extern AVCodec ff_ra_144_encoder;
+extern AVCodec ff_ra_144_decoder;
+extern AVCodec ff_ra_288_decoder;
+extern AVCodec ff_ralf_decoder;
+extern AVCodec ff_sbc_encoder;
+extern AVCodec ff_sbc_decoder;
+extern AVCodec ff_shorten_decoder;
+extern AVCodec ff_sipr_decoder;
+extern AVCodec ff_smackaud_decoder;
+extern AVCodec ff_sonic_encoder;
+extern AVCodec ff_sonic_decoder;
+extern AVCodec ff_sonic_ls_encoder;
+extern AVCodec ff_tak_decoder;
+extern AVCodec ff_truehd_encoder;
+extern AVCodec ff_truehd_decoder;
+extern AVCodec ff_truespeech_decoder;
+extern AVCodec ff_tta_encoder;
+extern AVCodec ff_tta_decoder;
+extern AVCodec ff_twinvq_decoder;
+extern AVCodec ff_vmdaudio_decoder;
+extern AVCodec ff_vorbis_encoder;
+extern AVCodec ff_vorbis_decoder;
+extern AVCodec ff_wavpack_encoder;
+extern AVCodec ff_wavpack_decoder;
+extern AVCodec ff_wmalossless_decoder;
+extern AVCodec ff_wmapro_decoder;
+extern AVCodec ff_wmav1_encoder;
+extern AVCodec ff_wmav1_decoder;
+extern AVCodec ff_wmav2_encoder;
+extern AVCodec ff_wmav2_decoder;
+extern AVCodec ff_wmavoice_decoder;
+extern AVCodec ff_ws_snd1_decoder;
+extern AVCodec ff_xma1_decoder;
+extern AVCodec ff_xma2_decoder;
+
+/* PCM codecs */
+extern AVCodec ff_pcm_alaw_encoder;
+extern AVCodec ff_pcm_alaw_decoder;
+extern AVCodec ff_pcm_bluray_decoder;
+extern AVCodec ff_pcm_dvd_decoder;
+extern AVCodec ff_pcm_f16le_decoder;
+extern AVCodec ff_pcm_f24le_decoder;
+extern AVCodec ff_pcm_f32be_encoder;
+extern AVCodec ff_pcm_f32be_decoder;
+extern AVCodec ff_pcm_f32le_encoder;
+extern AVCodec ff_pcm_f32le_decoder;
+extern AVCodec ff_pcm_f64be_encoder;
+extern AVCodec ff_pcm_f64be_decoder;
+extern AVCodec ff_pcm_f64le_encoder;
+extern AVCodec ff_pcm_f64le_decoder;
+extern AVCodec ff_pcm_lxf_decoder;
+extern AVCodec ff_pcm_mulaw_encoder;
+extern AVCodec ff_pcm_mulaw_decoder;
+extern AVCodec ff_pcm_s8_encoder;
+extern AVCodec ff_pcm_s8_decoder;
+extern AVCodec ff_pcm_s8_planar_encoder;
+extern AVCodec ff_pcm_s8_planar_decoder;
+extern AVCodec ff_pcm_s16be_encoder;
+extern AVCodec ff_pcm_s16be_decoder;
+extern AVCodec ff_pcm_s16be_planar_encoder;
+extern AVCodec ff_pcm_s16be_planar_decoder;
+extern AVCodec ff_pcm_s16le_encoder;
+extern AVCodec ff_pcm_s16le_decoder;
+extern AVCodec ff_pcm_s16le_planar_encoder;
+extern AVCodec ff_pcm_s16le_planar_decoder;
+extern AVCodec ff_pcm_s24be_encoder;
+extern AVCodec ff_pcm_s24be_decoder;
+extern AVCodec ff_pcm_s24daud_encoder;
+extern AVCodec ff_pcm_s24daud_decoder;
+extern AVCodec ff_pcm_s24le_encoder;
+extern AVCodec ff_pcm_s24le_decoder;
+extern AVCodec ff_pcm_s24le_planar_encoder;
+extern AVCodec ff_pcm_s24le_planar_decoder;
+extern AVCodec ff_pcm_s32be_encoder;
+extern AVCodec ff_pcm_s32be_decoder;
+extern AVCodec ff_pcm_s32le_encoder;
+extern AVCodec ff_pcm_s32le_decoder;
+extern AVCodec ff_pcm_s32le_planar_encoder;
+extern AVCodec ff_pcm_s32le_planar_decoder;
+extern AVCodec ff_pcm_s64be_encoder;
+extern AVCodec ff_pcm_s64be_decoder;
+extern AVCodec ff_pcm_s64le_encoder;
+extern AVCodec ff_pcm_s64le_decoder;
+extern AVCodec ff_pcm_u8_encoder;
+extern AVCodec ff_pcm_u8_decoder;
+extern AVCodec ff_pcm_u16be_encoder;
+extern AVCodec ff_pcm_u16be_decoder;
+extern AVCodec ff_pcm_u16le_encoder;
+extern AVCodec ff_pcm_u16le_decoder;
+extern AVCodec ff_pcm_u24be_encoder;
+extern AVCodec ff_pcm_u24be_decoder;
+extern AVCodec ff_pcm_u24le_encoder;
+extern AVCodec ff_pcm_u24le_decoder;
+extern AVCodec ff_pcm_u32be_encoder;
+extern AVCodec ff_pcm_u32be_decoder;
+extern AVCodec ff_pcm_u32le_encoder;
+extern AVCodec ff_pcm_u32le_decoder;
+extern AVCodec ff_pcm_zork_decoder;
+
+/* DPCM codecs */
+extern AVCodec ff_gremlin_dpcm_decoder;
+extern AVCodec ff_interplay_dpcm_decoder;
+extern AVCodec ff_roq_dpcm_encoder;
+extern AVCodec ff_roq_dpcm_decoder;
+extern AVCodec ff_sol_dpcm_decoder;
+extern AVCodec ff_xan_dpcm_decoder;
+
+/* ADPCM codecs */
+extern AVCodec ff_adpcm_4xm_decoder;
+extern AVCodec ff_adpcm_adx_encoder;
+extern AVCodec ff_adpcm_adx_decoder;
+extern AVCodec ff_adpcm_afc_decoder;
+extern AVCodec ff_adpcm_aica_decoder;
+extern AVCodec ff_adpcm_ct_decoder;
+extern AVCodec ff_adpcm_dtk_decoder;
+extern AVCodec ff_adpcm_ea_decoder;
+extern AVCodec ff_adpcm_ea_maxis_xa_decoder;
+extern AVCodec ff_adpcm_ea_r1_decoder;
+extern AVCodec ff_adpcm_ea_r2_decoder;
+extern AVCodec ff_adpcm_ea_r3_decoder;
+extern AVCodec ff_adpcm_ea_xas_decoder;
+extern AVCodec ff_adpcm_g722_encoder;
+extern AVCodec ff_adpcm_g722_decoder;
+extern AVCodec ff_adpcm_g726_encoder;
+extern AVCodec ff_adpcm_g726_decoder;
+extern AVCodec ff_adpcm_g726le_encoder;
+extern AVCodec ff_adpcm_g726le_decoder;
+extern AVCodec ff_adpcm_ima_amv_decoder;
+extern AVCodec ff_adpcm_ima_apc_decoder;
+extern AVCodec ff_adpcm_ima_dat4_decoder;
+extern AVCodec ff_adpcm_ima_dk3_decoder;
+extern AVCodec ff_adpcm_ima_dk4_decoder;
+extern AVCodec ff_adpcm_ima_ea_eacs_decoder;
+extern AVCodec ff_adpcm_ima_ea_sead_decoder;
+extern AVCodec ff_adpcm_ima_iss_decoder;
+extern AVCodec ff_adpcm_ima_oki_decoder;
+extern AVCodec ff_adpcm_ima_qt_encoder;
+extern AVCodec ff_adpcm_ima_qt_decoder;
+extern AVCodec ff_adpcm_ima_rad_decoder;
+extern AVCodec ff_adpcm_ima_smjpeg_decoder;
+extern AVCodec ff_adpcm_ima_wav_encoder;
+extern AVCodec ff_adpcm_ima_wav_decoder;
+extern AVCodec ff_adpcm_ima_ws_decoder;
+extern AVCodec ff_adpcm_ms_encoder;
+extern AVCodec ff_adpcm_ms_decoder;
+extern AVCodec ff_adpcm_mtaf_decoder;
+extern AVCodec ff_adpcm_psx_decoder;
+extern AVCodec ff_adpcm_sbpro_2_decoder;
+extern AVCodec ff_adpcm_sbpro_3_decoder;
+extern AVCodec ff_adpcm_sbpro_4_decoder;
+extern AVCodec ff_adpcm_swf_encoder;
+extern AVCodec ff_adpcm_swf_decoder;
+extern AVCodec ff_adpcm_thp_decoder;
+extern AVCodec ff_adpcm_thp_le_decoder;
+extern AVCodec ff_adpcm_vima_decoder;
+extern AVCodec ff_adpcm_xa_decoder;
+extern AVCodec ff_adpcm_yamaha_encoder;
+extern AVCodec ff_adpcm_yamaha_decoder;
+
+/* subtitles */
+extern AVCodec ff_ssa_encoder;
+extern AVCodec ff_ssa_decoder;
+extern AVCodec ff_ass_encoder;
+extern AVCodec ff_ass_decoder;
+extern AVCodec ff_ccaption_decoder;
+extern AVCodec ff_dvbsub_encoder;
+extern AVCodec ff_dvbsub_decoder;
+extern AVCodec ff_dvdsub_encoder;
+extern AVCodec ff_dvdsub_decoder;
+extern AVCodec ff_jacosub_decoder;
+extern AVCodec ff_microdvd_decoder;
+extern AVCodec ff_movtext_encoder;
+extern AVCodec ff_movtext_decoder;
+extern AVCodec ff_mpl2_decoder;
+extern AVCodec ff_pgssub_decoder;
+extern AVCodec ff_pjs_decoder;
+extern AVCodec ff_realtext_decoder;
+extern AVCodec ff_sami_decoder;
+extern AVCodec ff_srt_encoder;
+extern AVCodec ff_srt_decoder;
+extern AVCodec ff_stl_decoder;
+extern AVCodec ff_subrip_encoder;
+extern AVCodec ff_subrip_decoder;
+extern AVCodec ff_subviewer_decoder;
+extern AVCodec ff_subviewer1_decoder;
+extern AVCodec ff_text_encoder;
+extern AVCodec ff_text_decoder;
+extern AVCodec ff_vplayer_decoder;
+extern AVCodec ff_webvtt_encoder;
+extern AVCodec ff_webvtt_decoder;
+extern AVCodec ff_xsub_encoder;
+extern AVCodec ff_xsub_decoder;
+
+/* external libraries */
+extern AVCodec ff_aac_at_encoder;
+extern AVCodec ff_aac_at_decoder;
+extern AVCodec ff_ac3_at_decoder;
+extern AVCodec ff_adpcm_ima_qt_at_decoder;
+extern AVCodec ff_alac_at_encoder;
+extern AVCodec ff_alac_at_decoder;
+extern AVCodec ff_amr_nb_at_decoder;
+extern AVCodec ff_eac3_at_decoder;
+extern AVCodec ff_gsm_ms_at_decoder;
+extern AVCodec ff_ilbc_at_encoder;
+extern AVCodec ff_ilbc_at_decoder;
+extern AVCodec ff_mp1_at_decoder;
+extern AVCodec ff_mp2_at_decoder;
+extern AVCodec ff_mp3_at_decoder;
+extern AVCodec ff_pcm_alaw_at_encoder;
+extern AVCodec ff_pcm_alaw_at_decoder;
+extern AVCodec ff_pcm_mulaw_at_encoder;
+extern AVCodec ff_pcm_mulaw_at_decoder;
+extern AVCodec ff_qdmc_at_decoder;
+extern AVCodec ff_qdm2_at_decoder;
+extern AVCodec ff_libaom_av1_decoder;
+extern AVCodec ff_libaom_av1_encoder;
+extern AVCodec ff_libcelt_decoder;
+extern AVCodec ff_libcodec2_encoder;
+extern AVCodec ff_libcodec2_decoder;
+extern AVCodec ff_libfdk_aac_encoder;
+extern AVCodec ff_libfdk_aac_decoder;
+extern AVCodec ff_libgsm_encoder;
+extern AVCodec ff_libgsm_decoder;
+extern AVCodec ff_libgsm_ms_encoder;
+extern AVCodec ff_libgsm_ms_decoder;
+extern AVCodec ff_libilbc_encoder;
+extern AVCodec ff_libilbc_decoder;
+extern AVCodec ff_libmp3lame_encoder;
+extern AVCodec ff_libopencore_amrnb_encoder;
+extern AVCodec ff_libopencore_amrnb_decoder;
+extern AVCodec ff_libopencore_amrwb_decoder;
+extern AVCodec ff_libopenjpeg_encoder;
+extern AVCodec ff_libopenjpeg_decoder;
+extern AVCodec ff_libopus_encoder;
+extern AVCodec ff_libopus_decoder;
+extern AVCodec ff_librsvg_decoder;
+extern AVCodec ff_libshine_encoder;
+extern AVCodec ff_libspeex_encoder;
+extern AVCodec ff_libspeex_decoder;
+extern AVCodec ff_libtheora_encoder;
+extern AVCodec ff_libtwolame_encoder;
+extern AVCodec ff_libvo_amrwbenc_encoder;
+extern AVCodec ff_libvorbis_encoder;
+extern AVCodec ff_libvorbis_decoder;
+extern AVCodec ff_libvpx_vp8_encoder;
+extern AVCodec ff_libvpx_vp8_decoder;
+extern AVCodec ff_libvpx_vp9_encoder;
+extern AVCodec ff_libvpx_vp9_decoder;
+extern AVCodec ff_libwavpack_encoder;
+/* preferred over libwebp */
+extern AVCodec ff_libwebp_anim_encoder;
+extern AVCodec ff_libwebp_encoder;
+extern AVCodec ff_libx262_encoder;
+extern AVCodec ff_libx264_encoder;
+extern AVCodec ff_libx264rgb_encoder;
+extern AVCodec ff_libx265_encoder;
+extern AVCodec ff_libxavs_encoder;
+extern AVCodec ff_libxvid_encoder;
+extern AVCodec ff_libzvbi_teletext_decoder;
+
+/* text */
+extern AVCodec ff_bintext_decoder;
+extern AVCodec ff_xbin_decoder;
+extern AVCodec ff_idf_decoder;
+
+/* external libraries that shouldn't be used by default if one of the
+ * above is available */
+extern AVCodec ff_h263_v4l2m2m_encoder;
+extern AVCodec ff_libopenh264_encoder;
+extern AVCodec ff_libopenh264_decoder;
+extern AVCodec ff_h264_amf_encoder;
+extern AVCodec ff_h264_cuvid_decoder;
+extern AVCodec ff_h264_nvenc_encoder;
+extern AVCodec ff_h264_omx_encoder;
+extern AVCodec ff_h264_qsv_encoder;
+extern AVCodec ff_h264_v4l2m2m_encoder;
+extern AVCodec ff_h264_vaapi_encoder;
+extern AVCodec ff_h264_videotoolbox_encoder;
+#if FF_API_NVENC_OLD_NAME
+extern AVCodec ff_nvenc_encoder;
+extern AVCodec ff_nvenc_h264_encoder;
+extern AVCodec ff_nvenc_hevc_encoder;
+#endif
+extern AVCodec ff_hevc_amf_encoder;
+extern AVCodec ff_hevc_cuvid_decoder;
+extern AVCodec ff_hevc_mediacodec_decoder;
+extern AVCodec ff_hevc_nvenc_encoder;
+extern AVCodec ff_hevc_qsv_encoder;
+extern AVCodec ff_hevc_v4l2m2m_encoder;
+extern AVCodec ff_hevc_vaapi_encoder;
+extern AVCodec ff_hevc_videotoolbox_encoder;
+extern AVCodec ff_libkvazaar_encoder;
+extern AVCodec ff_mjpeg_cuvid_decoder;
+extern AVCodec ff_mjpeg_qsv_encoder;
+extern AVCodec ff_mjpeg_vaapi_encoder;
+extern AVCodec ff_mpeg1_cuvid_decoder;
+extern AVCodec ff_mpeg2_cuvid_decoder;
+extern AVCodec ff_mpeg2_qsv_encoder;
+extern AVCodec ff_mpeg2_vaapi_encoder;
+extern AVCodec ff_mpeg4_cuvid_decoder;
+extern AVCodec ff_mpeg4_mediacodec_decoder;
+extern AVCodec ff_mpeg4_v4l2m2m_encoder;
+extern AVCodec ff_vc1_cuvid_decoder;
+extern AVCodec ff_vp8_cuvid_decoder;
+extern AVCodec ff_vp8_mediacodec_decoder;
+extern AVCodec ff_vp8_qsv_decoder;
+extern AVCodec ff_vp8_v4l2m2m_encoder;
+extern AVCodec ff_vp8_vaapi_encoder;
+extern AVCodec ff_vp9_cuvid_decoder;
+extern AVCodec ff_vp9_mediacodec_decoder;
+extern AVCodec ff_vp9_vaapi_encoder;
+
+#include "libavcodec/codec_list.c"
+
+static AVOnce av_codec_static_init = AV_ONCE_INIT;
+static void av_codec_init_static(void)
+{
+ for (int i = 0; codec_list[i]; i++) {
+ if (codec_list[i]->init_static_data)
+ codec_list[i]->init_static_data((AVCodec*)codec_list[i]);
}
+}
+
+const AVCodec *av_codec_iterate(void **opaque)
+{
+ uintptr_t i = (uintptr_t)*opaque;
+ const AVCodec *c = codec_list[i];
-#define REGISTER_ENCODER(X, x) \
- { \
- extern AVCodec ff_##x##_encoder; \
- if (CONFIG_##X##_ENCODER) \
- avcodec_register(&ff_##x##_encoder); \
+ ff_thread_once(&av_codec_static_init, av_codec_init_static);
+
+ if (c)
+ *opaque = (void*)(i + 1);
+
+ return c;
+}
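/* A minimal usage sketch (an assumption about typical callers, not part of
 * this patch): with the table-driven codec_list above, enumeration goes
 * through av_codec_iterate() instead of the removed av_codec_next() chain.
 * count_decoders() is a hypothetical helper; av_codec_is_decoder() is an
 * existing libavcodec predicate. */
static int count_decoders(void)
{
    void *iter = NULL;
    const AVCodec *c;
    int n = 0;

    while ((c = av_codec_iterate(&iter)))
        if (av_codec_is_decoder(c))
            n++;

    return n;
}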
+
+#if FF_API_NEXT
+FF_DISABLE_DEPRECATION_WARNINGS
+static AVOnce av_codec_next_init = AV_ONCE_INIT;
+
+static void av_codec_init_next(void)
+{
+ AVCodec *prev = NULL, *p;
+ void *i = 0;
+ while ((p = (AVCodec*)av_codec_iterate(&i))) {
+ if (prev)
+ prev->next = p;
+ prev = p;
}
+}
+
+
+
+av_cold void avcodec_register(AVCodec *codec)
+{
+ ff_thread_once(&av_codec_next_init, av_codec_init_next);
+}
+
+AVCodec *av_codec_next(const AVCodec *c)
+{
+ ff_thread_once(&av_codec_next_init, av_codec_init_next);
+
+ if (c)
+ return c->next;
+ else
+ return (AVCodec*)codec_list[0];
+}
-#define REGISTER_DECODER(X, x) \
- { \
- extern AVCodec ff_##x##_decoder; \
- if (CONFIG_##X##_DECODER) \
- avcodec_register(&ff_##x##_decoder); \
+void avcodec_register_all(void)
+{
+ ff_thread_once(&av_codec_next_init, av_codec_init_next);
+}
+FF_ENABLE_DEPRECATION_WARNINGS
+#endif
+
+static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
+{
+ switch(id){
+ //This is for future deprecated codec ids; it's empty since the
+ //last major bump but will fill up again over time, so please don't remove it
+ default : return id;
}
+}
+
+static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
+{
+ const AVCodec *p, *experimental = NULL;
+ void *i = 0;
-#define REGISTER_ENCDEC(X, x) REGISTER_ENCODER(X, x); REGISTER_DECODER(X, x)
+ id = remap_deprecated_codec_id(id);
-#define REGISTER_PARSER(X, x) \
- { \
- extern AVCodecParser ff_##x##_parser; \
- if (CONFIG_##X##_PARSER) \
- av_register_codec_parser(&ff_##x##_parser); \
+ while ((p = av_codec_iterate(&i))) {
+ if (!x(p))
+ continue;
+ if (p->id == id) {
+ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
+ experimental = p;
+ } else
+ return (AVCodec*)p;
+ }
}
-static void register_all(void)
+ return (AVCodec*)experimental;
+}
+
+AVCodec *avcodec_find_encoder(enum AVCodecID id)
{
- /* hardware accelerators */
- REGISTER_HWACCEL(H263_VAAPI, h263_vaapi);
- REGISTER_HWACCEL(H263_VIDEOTOOLBOX, h263_videotoolbox);
- REGISTER_HWACCEL(H264_CUVID, h264_cuvid);
- REGISTER_HWACCEL(H264_D3D11VA, h264_d3d11va);
- REGISTER_HWACCEL(H264_D3D11VA2, h264_d3d11va2);
- REGISTER_HWACCEL(H264_DXVA2, h264_dxva2);
- REGISTER_HWACCEL(H264_MEDIACODEC, h264_mediacodec);
- REGISTER_HWACCEL(H264_MMAL, h264_mmal);
- REGISTER_HWACCEL(H264_QSV, h264_qsv);
- REGISTER_HWACCEL(H264_VAAPI, h264_vaapi);
- REGISTER_HWACCEL(H264_VDA, h264_vda);
- REGISTER_HWACCEL(H264_VDA_OLD, h264_vda_old);
- REGISTER_HWACCEL(H264_VDPAU, h264_vdpau);
- REGISTER_HWACCEL(H264_VIDEOTOOLBOX, h264_videotoolbox);
- REGISTER_HWACCEL(HEVC_CUVID, hevc_cuvid);
- REGISTER_HWACCEL(HEVC_D3D11VA, hevc_d3d11va);
- REGISTER_HWACCEL(HEVC_D3D11VA2, hevc_d3d11va2);
- REGISTER_HWACCEL(HEVC_DXVA2, hevc_dxva2);
- REGISTER_HWACCEL(HEVC_MEDIACODEC, hevc_mediacodec);
- REGISTER_HWACCEL(HEVC_QSV, hevc_qsv);
- REGISTER_HWACCEL(HEVC_VAAPI, hevc_vaapi);
- REGISTER_HWACCEL(HEVC_VDPAU, hevc_vdpau);
- REGISTER_HWACCEL(HEVC_VIDEOTOOLBOX, hevc_videotoolbox);
- REGISTER_HWACCEL(MJPEG_CUVID, mjpeg_cuvid);
- REGISTER_HWACCEL(MPEG1_CUVID, mpeg1_cuvid);
- REGISTER_HWACCEL(MPEG1_XVMC, mpeg1_xvmc);
- REGISTER_HWACCEL(MPEG1_VDPAU, mpeg1_vdpau);
- REGISTER_HWACCEL(MPEG1_VIDEOTOOLBOX, mpeg1_videotoolbox);
- REGISTER_HWACCEL(MPEG2_CUVID, mpeg2_cuvid);
- REGISTER_HWACCEL(MPEG2_XVMC, mpeg2_xvmc);
- REGISTER_HWACCEL(MPEG2_D3D11VA, mpeg2_d3d11va);
- REGISTER_HWACCEL(MPEG2_D3D11VA2, mpeg2_d3d11va2);
- REGISTER_HWACCEL(MPEG2_DXVA2, mpeg2_dxva2);
- REGISTER_HWACCEL(MPEG2_MMAL, mpeg2_mmal);
- REGISTER_HWACCEL(MPEG2_QSV, mpeg2_qsv);
- REGISTER_HWACCEL(MPEG2_VAAPI, mpeg2_vaapi);
- REGISTER_HWACCEL(MPEG2_VDPAU, mpeg2_vdpau);
- REGISTER_HWACCEL(MPEG2_VIDEOTOOLBOX, mpeg2_videotoolbox);
- REGISTER_HWACCEL(MPEG2_MEDIACODEC, mpeg2_mediacodec);
- REGISTER_HWACCEL(MPEG4_CUVID, mpeg4_cuvid);
- REGISTER_HWACCEL(MPEG4_MEDIACODEC, mpeg4_mediacodec);
- REGISTER_HWACCEL(MPEG4_MMAL, mpeg4_mmal);
- REGISTER_HWACCEL(MPEG4_VAAPI, mpeg4_vaapi);
- REGISTER_HWACCEL(MPEG4_VDPAU, mpeg4_vdpau);
- REGISTER_HWACCEL(MPEG4_VIDEOTOOLBOX, mpeg4_videotoolbox);
- REGISTER_HWACCEL(VC1_CUVID, vc1_cuvid);
- REGISTER_HWACCEL(VC1_D3D11VA, vc1_d3d11va);
- REGISTER_HWACCEL(VC1_D3D11VA2, vc1_d3d11va2);
- REGISTER_HWACCEL(VC1_DXVA2, vc1_dxva2);
- REGISTER_HWACCEL(VC1_VAAPI, vc1_vaapi);
- REGISTER_HWACCEL(VC1_VDPAU, vc1_vdpau);
- REGISTER_HWACCEL(VC1_MMAL, vc1_mmal);
- REGISTER_HWACCEL(VC1_QSV, vc1_qsv);
- REGISTER_HWACCEL(VP8_CUVID, vp8_cuvid);
- REGISTER_HWACCEL(VP8_MEDIACODEC, vp8_mediacodec);
- REGISTER_HWACCEL(VP8_QSV, vp8_qsv);
- REGISTER_HWACCEL(VP9_CUVID, vp9_cuvid);
- REGISTER_HWACCEL(VP9_D3D11VA, vp9_d3d11va);
- REGISTER_HWACCEL(VP9_D3D11VA2, vp9_d3d11va2);
- REGISTER_HWACCEL(VP9_DXVA2, vp9_dxva2);
- REGISTER_HWACCEL(VP9_MEDIACODEC, vp9_mediacodec);
- REGISTER_HWACCEL(VP9_VAAPI, vp9_vaapi);
- REGISTER_HWACCEL(WMV3_D3D11VA, wmv3_d3d11va);
- REGISTER_HWACCEL(WMV3_D3D11VA2, wmv3_d3d11va2);
- REGISTER_HWACCEL(WMV3_DXVA2, wmv3_dxva2);
- REGISTER_HWACCEL(WMV3_VAAPI, wmv3_vaapi);
- REGISTER_HWACCEL(WMV3_VDPAU, wmv3_vdpau);
-
- /* video codecs */
- REGISTER_ENCODER(A64MULTI, a64multi);
- REGISTER_ENCODER(A64MULTI5, a64multi5);
- REGISTER_DECODER(AASC, aasc);
- REGISTER_DECODER(AIC, aic);
- REGISTER_ENCDEC (ALIAS_PIX, alias_pix);
- REGISTER_ENCDEC (AMV, amv);
- REGISTER_DECODER(ANM, anm);
- REGISTER_DECODER(ANSI, ansi);
- REGISTER_ENCDEC (APNG, apng);
- REGISTER_ENCDEC (ASV1, asv1);
- REGISTER_ENCDEC (ASV2, asv2);
- REGISTER_DECODER(AURA, aura);
- REGISTER_DECODER(AURA2, aura2);
- REGISTER_ENCDEC (AVRP, avrp);
- REGISTER_DECODER(AVRN, avrn);
- REGISTER_DECODER(AVS, avs);
- REGISTER_ENCDEC (AVUI, avui);
- REGISTER_ENCDEC (AYUV, ayuv);
- REGISTER_DECODER(BETHSOFTVID, bethsoftvid);
- REGISTER_DECODER(BFI, bfi);
- REGISTER_DECODER(BINK, bink);
- REGISTER_ENCDEC (BMP, bmp);
- REGISTER_DECODER(BMV_VIDEO, bmv_video);
- REGISTER_DECODER(BRENDER_PIX, brender_pix);
- REGISTER_DECODER(C93, c93);
- REGISTER_DECODER(CAVS, cavs);
- REGISTER_DECODER(CDGRAPHICS, cdgraphics);
- REGISTER_DECODER(CDXL, cdxl);
- REGISTER_DECODER(CFHD, cfhd);
- REGISTER_ENCDEC (CINEPAK, cinepak);
- REGISTER_DECODER(CLEARVIDEO, clearvideo);
- REGISTER_ENCDEC (CLJR, cljr);
- REGISTER_DECODER(CLLC, cllc);
- REGISTER_ENCDEC (COMFORTNOISE, comfortnoise);
- REGISTER_DECODER(CPIA, cpia);
- REGISTER_DECODER(CSCD, cscd);
- REGISTER_DECODER(CYUV, cyuv);
- REGISTER_DECODER(DDS, dds);
- REGISTER_DECODER(DFA, dfa);
- REGISTER_DECODER(DIRAC, dirac);
- REGISTER_ENCDEC (DNXHD, dnxhd);
- REGISTER_ENCDEC (DPX, dpx);
- REGISTER_DECODER(DSICINVIDEO, dsicinvideo);
- REGISTER_DECODER(DVAUDIO, dvaudio);
- REGISTER_ENCDEC (DVVIDEO, dvvideo);
- REGISTER_DECODER(DXA, dxa);
- REGISTER_DECODER(DXTORY, dxtory);
- REGISTER_DECODER(DXV, dxv);
- REGISTER_DECODER(EACMV, eacmv);
- REGISTER_DECODER(EAMAD, eamad);
- REGISTER_DECODER(EATGQ, eatgq);
- REGISTER_DECODER(EATGV, eatgv);
- REGISTER_DECODER(EATQI, eatqi);
- REGISTER_DECODER(EIGHTBPS, eightbps);
- REGISTER_DECODER(EIGHTSVX_EXP, eightsvx_exp);
- REGISTER_DECODER(EIGHTSVX_FIB, eightsvx_fib);
- REGISTER_DECODER(ESCAPE124, escape124);
- REGISTER_DECODER(ESCAPE130, escape130);
- REGISTER_DECODER(EXR, exr);
- REGISTER_ENCDEC (FFV1, ffv1);
- REGISTER_ENCDEC (FFVHUFF, ffvhuff);
- REGISTER_DECODER(FIC, fic);
- REGISTER_ENCDEC (FITS, fits);
- REGISTER_ENCDEC (FLASHSV, flashsv);
- REGISTER_ENCDEC (FLASHSV2, flashsv2);
- REGISTER_DECODER(FLIC, flic);
- REGISTER_ENCDEC (FLV, flv);
- REGISTER_DECODER(FMVC, fmvc);
- REGISTER_DECODER(FOURXM, fourxm);
- REGISTER_DECODER(FRAPS, fraps);
- REGISTER_DECODER(FRWU, frwu);
- REGISTER_DECODER(G2M, g2m);
- REGISTER_DECODER(GDV, gdv);
- REGISTER_ENCDEC (GIF, gif);
- REGISTER_ENCDEC (H261, h261);
- REGISTER_ENCDEC (H263, h263);
- REGISTER_DECODER(H263I, h263i);
- REGISTER_ENCDEC (H263P, h263p);
- REGISTER_DECODER(H263_V4L2M2M, h263_v4l2m2m);
- REGISTER_DECODER(H264, h264);
- REGISTER_DECODER(H264_CRYSTALHD, h264_crystalhd);
- REGISTER_DECODER(H264_V4L2M2M, h264_v4l2m2m);
- REGISTER_DECODER(H264_MEDIACODEC, h264_mediacodec);
- REGISTER_DECODER(H264_MMAL, h264_mmal);
- REGISTER_DECODER(H264_QSV, h264_qsv);
- REGISTER_DECODER(H264_RKMPP, h264_rkmpp);
- REGISTER_DECODER(H264_VDA, h264_vda);
-#if FF_API_VDPAU
- REGISTER_DECODER(H264_VDPAU, h264_vdpau);
-#endif
- REGISTER_ENCDEC (HAP, hap);
- REGISTER_DECODER(HEVC, hevc);
- REGISTER_DECODER(HEVC_QSV, hevc_qsv);
- REGISTER_DECODER(HEVC_RKMPP, hevc_rkmpp);
- REGISTER_DECODER(HEVC_V4L2M2M, hevc_v4l2m2m);
- REGISTER_DECODER(HNM4_VIDEO, hnm4_video);
- REGISTER_DECODER(HQ_HQA, hq_hqa);
- REGISTER_DECODER(HQX, hqx);
- REGISTER_ENCDEC (HUFFYUV, huffyuv);
- REGISTER_DECODER(IDCIN, idcin);
- REGISTER_DECODER(IFF_ILBM, iff_ilbm);
- REGISTER_DECODER(INDEO2, indeo2);
- REGISTER_DECODER(INDEO3, indeo3);
- REGISTER_DECODER(INDEO4, indeo4);
- REGISTER_DECODER(INDEO5, indeo5);
- REGISTER_DECODER(INTERPLAY_VIDEO, interplay_video);
- REGISTER_ENCDEC (JPEG2000, jpeg2000);
- REGISTER_ENCDEC (JPEGLS, jpegls);
- REGISTER_DECODER(JV, jv);
- REGISTER_DECODER(KGV1, kgv1);
- REGISTER_DECODER(KMVC, kmvc);
- REGISTER_DECODER(LAGARITH, lagarith);
- REGISTER_ENCODER(LJPEG, ljpeg);
- REGISTER_DECODER(LOCO, loco);
- REGISTER_DECODER(M101, m101);
- REGISTER_DECODER(MAGICYUV, magicyuv);
- REGISTER_DECODER(MDEC, mdec);
- REGISTER_DECODER(MIMIC, mimic);
- REGISTER_ENCDEC (MJPEG, mjpeg);
- REGISTER_DECODER(MJPEGB, mjpegb);
- REGISTER_DECODER(MMVIDEO, mmvideo);
- REGISTER_DECODER(MOTIONPIXELS, motionpixels);
-#if FF_API_XVMC
- REGISTER_DECODER(MPEG_XVMC, mpeg_xvmc);
-#endif /* FF_API_XVMC */
- REGISTER_ENCDEC (MPEG1VIDEO, mpeg1video);
- REGISTER_ENCDEC (MPEG2VIDEO, mpeg2video);
- REGISTER_ENCDEC (MPEG4, mpeg4);
- REGISTER_DECODER(MPEG4_CRYSTALHD, mpeg4_crystalhd);
- REGISTER_DECODER(MPEG4_V4L2M2M, mpeg4_v4l2m2m);
- REGISTER_DECODER(MPEG4_MMAL, mpeg4_mmal);
-#if FF_API_VDPAU
- REGISTER_DECODER(MPEG4_VDPAU, mpeg4_vdpau);
-#endif
- REGISTER_DECODER(MPEGVIDEO, mpegvideo);
-#if FF_API_VDPAU
- REGISTER_DECODER(MPEG_VDPAU, mpeg_vdpau);
- REGISTER_DECODER(MPEG1_VDPAU, mpeg1_vdpau);
-#endif
- REGISTER_DECODER(MPEG1_V4L2M2M, mpeg1_v4l2m2m);
- REGISTER_DECODER(MPEG2_MMAL, mpeg2_mmal);
- REGISTER_DECODER(MPEG2_CRYSTALHD, mpeg2_crystalhd);
- REGISTER_DECODER(MPEG2_V4L2M2M, mpeg2_v4l2m2m);
- REGISTER_DECODER(MPEG2_QSV, mpeg2_qsv);
- REGISTER_DECODER(MPEG2_MEDIACODEC, mpeg2_mediacodec);
- REGISTER_DECODER(MSA1, msa1);
- REGISTER_DECODER(MSCC, mscc);
- REGISTER_DECODER(MSMPEG4V1, msmpeg4v1);
- REGISTER_ENCDEC (MSMPEG4V2, msmpeg4v2);
- REGISTER_ENCDEC (MSMPEG4V3, msmpeg4v3);
- REGISTER_DECODER(MSMPEG4_CRYSTALHD, msmpeg4_crystalhd);
- REGISTER_DECODER(MSRLE, msrle);
- REGISTER_DECODER(MSS1, mss1);
- REGISTER_DECODER(MSS2, mss2);
- REGISTER_ENCDEC (MSVIDEO1, msvideo1);
- REGISTER_DECODER(MSZH, mszh);
- REGISTER_DECODER(MTS2, mts2);
- REGISTER_DECODER(MVC1, mvc1);
- REGISTER_DECODER(MVC2, mvc2);
- REGISTER_DECODER(MXPEG, mxpeg);
- REGISTER_DECODER(NUV, nuv);
- REGISTER_DECODER(PAF_VIDEO, paf_video);
- REGISTER_ENCDEC (PAM, pam);
- REGISTER_ENCDEC (PBM, pbm);
- REGISTER_ENCDEC (PCX, pcx);
- REGISTER_ENCDEC (PGM, pgm);
- REGISTER_ENCDEC (PGMYUV, pgmyuv);
- REGISTER_DECODER(PICTOR, pictor);
- REGISTER_DECODER(PIXLET, pixlet);
- REGISTER_ENCDEC (PNG, png);
- REGISTER_ENCDEC (PPM, ppm);
- REGISTER_ENCDEC (PRORES, prores);
- REGISTER_ENCODER(PRORES_AW, prores_aw);
- REGISTER_ENCODER(PRORES_KS, prores_ks);
- REGISTER_DECODER(PRORES_LGPL, prores_lgpl);
- REGISTER_DECODER(PSD, psd);
- REGISTER_DECODER(PTX, ptx);
- REGISTER_DECODER(QDRAW, qdraw);
- REGISTER_DECODER(QPEG, qpeg);
- REGISTER_ENCDEC (QTRLE, qtrle);
- REGISTER_ENCDEC (R10K, r10k);
- REGISTER_ENCDEC (R210, r210);
- REGISTER_ENCDEC (RAWVIDEO, rawvideo);
- REGISTER_DECODER(RL2, rl2);
- REGISTER_ENCDEC (ROQ, roq);
- REGISTER_DECODER(RPZA, rpza);
- REGISTER_DECODER(RSCC, rscc);
- REGISTER_ENCDEC (RV10, rv10);
- REGISTER_ENCDEC (RV20, rv20);
- REGISTER_DECODER(RV30, rv30);
- REGISTER_DECODER(RV40, rv40);
- REGISTER_ENCDEC (S302M, s302m);
- REGISTER_DECODER(SANM, sanm);
- REGISTER_DECODER(SCPR, scpr);
- REGISTER_DECODER(SCREENPRESSO, screenpresso);
- REGISTER_DECODER(SDX2_DPCM, sdx2_dpcm);
- REGISTER_ENCDEC (SGI, sgi);
- REGISTER_DECODER(SGIRLE, sgirle);
- REGISTER_DECODER(SHEERVIDEO, sheervideo);
- REGISTER_DECODER(SMACKER, smacker);
- REGISTER_DECODER(SMC, smc);
- REGISTER_DECODER(SMVJPEG, smvjpeg);
- REGISTER_ENCDEC (SNOW, snow);
- REGISTER_DECODER(SP5X, sp5x);
- REGISTER_DECODER(SPEEDHQ, speedhq);
- REGISTER_DECODER(SRGC, srgc);
- REGISTER_ENCDEC (SUNRAST, sunrast);
- REGISTER_ENCDEC (SVQ1, svq1);
- REGISTER_DECODER(SVQ3, svq3);
- REGISTER_ENCDEC (TARGA, targa);
- REGISTER_DECODER(TARGA_Y216, targa_y216);
- REGISTER_DECODER(TDSC, tdsc);
- REGISTER_DECODER(THEORA, theora);
- REGISTER_DECODER(THP, thp);
- REGISTER_DECODER(TIERTEXSEQVIDEO, tiertexseqvideo);
- REGISTER_ENCDEC (TIFF, tiff);
- REGISTER_DECODER(TMV, tmv);
- REGISTER_DECODER(TRUEMOTION1, truemotion1);
- REGISTER_DECODER(TRUEMOTION2, truemotion2);
- REGISTER_DECODER(TRUEMOTION2RT, truemotion2rt);
- REGISTER_DECODER(TSCC, tscc);
- REGISTER_DECODER(TSCC2, tscc2);
- REGISTER_DECODER(TXD, txd);
- REGISTER_DECODER(ULTI, ulti);
- REGISTER_ENCDEC (UTVIDEO, utvideo);
- REGISTER_ENCDEC (V210, v210);
- REGISTER_DECODER(V210X, v210x);
- REGISTER_ENCDEC (V308, v308);
- REGISTER_ENCDEC (V408, v408);
- REGISTER_ENCDEC (V410, v410);
- REGISTER_DECODER(VB, vb);
- REGISTER_DECODER(VBLE, vble);
- REGISTER_DECODER(VC1, vc1);
- REGISTER_DECODER(VC1_CRYSTALHD, vc1_crystalhd);
-#if FF_API_VDPAU
- REGISTER_DECODER(VC1_VDPAU, vc1_vdpau);
-#endif
- REGISTER_DECODER(VC1IMAGE, vc1image);
- REGISTER_DECODER(VC1_MMAL, vc1_mmal);
- REGISTER_DECODER(VC1_QSV, vc1_qsv);
- REGISTER_DECODER(VC1_V4L2M2M, vc1_v4l2m2m);
- REGISTER_ENCODER(VC2, vc2);
- REGISTER_DECODER(VCR1, vcr1);
- REGISTER_DECODER(VMDVIDEO, vmdvideo);
- REGISTER_DECODER(VMNC, vmnc);
- REGISTER_DECODER(VP3, vp3);
- REGISTER_DECODER(VP5, vp5);
- REGISTER_DECODER(VP6, vp6);
- REGISTER_DECODER(VP6A, vp6a);
- REGISTER_DECODER(VP6F, vp6f);
- REGISTER_DECODER(VP7, vp7);
- REGISTER_DECODER(VP8, vp8);
- REGISTER_DECODER(VP8_RKMPP, vp8_rkmpp);
- REGISTER_DECODER(VP8_V4L2M2M, vp8_v4l2m2m);
- REGISTER_DECODER(VP9, vp9);
- REGISTER_DECODER(VP9_RKMPP, vp9_rkmpp);
- REGISTER_DECODER(VP9_V4L2M2M, vp9_v4l2m2m);
- REGISTER_DECODER(VQA, vqa);
- REGISTER_DECODER(BITPACKED, bitpacked);
- REGISTER_DECODER(WEBP, webp);
- REGISTER_ENCDEC (WRAPPED_AVFRAME, wrapped_avframe);
- REGISTER_ENCDEC (WMV1, wmv1);
- REGISTER_ENCDEC (WMV2, wmv2);
- REGISTER_DECODER(WMV3, wmv3);
- REGISTER_DECODER(WMV3_CRYSTALHD, wmv3_crystalhd);
-#if FF_API_VDPAU
- REGISTER_DECODER(WMV3_VDPAU, wmv3_vdpau);
-#endif
- REGISTER_DECODER(WMV3IMAGE, wmv3image);
- REGISTER_DECODER(WNV1, wnv1);
- REGISTER_DECODER(XAN_WC3, xan_wc3);
- REGISTER_DECODER(XAN_WC4, xan_wc4);
- REGISTER_ENCDEC (XBM, xbm);
- REGISTER_ENCDEC (XFACE, xface);
- REGISTER_DECODER(XL, xl);
- REGISTER_DECODER(XPM, xpm);
- REGISTER_ENCDEC (XWD, xwd);
- REGISTER_ENCDEC (Y41P, y41p);
- REGISTER_DECODER(YLC, ylc);
- REGISTER_DECODER(YOP, yop);
- REGISTER_ENCDEC (YUV4, yuv4);
- REGISTER_DECODER(ZERO12V, zero12v);
- REGISTER_DECODER(ZEROCODEC, zerocodec);
- REGISTER_ENCDEC (ZLIB, zlib);
- REGISTER_ENCDEC (ZMBV, zmbv);
-
- /* audio codecs */
- REGISTER_ENCDEC (AAC, aac);
- REGISTER_DECODER(AAC_FIXED, aac_fixed);
- REGISTER_DECODER(AAC_LATM, aac_latm);
- REGISTER_ENCDEC (AC3, ac3);
- REGISTER_ENCDEC (AC3_FIXED, ac3_fixed);
- REGISTER_ENCDEC (ALAC, alac);
- REGISTER_DECODER(ALS, als);
- REGISTER_DECODER(AMRNB, amrnb);
- REGISTER_DECODER(AMRWB, amrwb);
- REGISTER_DECODER(APE, ape);
- REGISTER_DECODER(ATRAC1, atrac1);
- REGISTER_DECODER(ATRAC3, atrac3);
- REGISTER_DECODER(ATRAC3AL, atrac3al);
- REGISTER_DECODER(ATRAC3P, atrac3p);
- REGISTER_DECODER(ATRAC3PAL, atrac3pal);
- REGISTER_DECODER(BINKAUDIO_DCT, binkaudio_dct);
- REGISTER_DECODER(BINKAUDIO_RDFT, binkaudio_rdft);
- REGISTER_DECODER(BMV_AUDIO, bmv_audio);
- REGISTER_DECODER(COOK, cook);
- REGISTER_ENCDEC (DCA, dca);
- REGISTER_DECODER(DOLBY_E, dolby_e);
- REGISTER_DECODER(DSD_LSBF, dsd_lsbf);
- REGISTER_DECODER(DSD_MSBF, dsd_msbf);
- REGISTER_DECODER(DSD_LSBF_PLANAR, dsd_lsbf_planar);
- REGISTER_DECODER(DSD_MSBF_PLANAR, dsd_msbf_planar);
- REGISTER_DECODER(DSICINAUDIO, dsicinaudio);
- REGISTER_DECODER(DSS_SP, dss_sp);
- REGISTER_DECODER(DST, dst);
- REGISTER_ENCDEC (EAC3, eac3);
- REGISTER_DECODER(EVRC, evrc);
- REGISTER_DECODER(FFWAVESYNTH, ffwavesynth);
- REGISTER_ENCDEC (FLAC, flac);
- REGISTER_ENCDEC (G723_1, g723_1);
- REGISTER_DECODER(G729, g729);
- REGISTER_DECODER(GSM, gsm);
- REGISTER_DECODER(GSM_MS, gsm_ms);
- REGISTER_DECODER(IAC, iac);
- REGISTER_DECODER(IMC, imc);
- REGISTER_DECODER(INTERPLAY_ACM, interplay_acm);
- REGISTER_DECODER(MACE3, mace3);
- REGISTER_DECODER(MACE6, mace6);
- REGISTER_DECODER(METASOUND, metasound);
- REGISTER_ENCDEC (MLP, mlp);
- REGISTER_DECODER(MP1, mp1);
- REGISTER_DECODER(MP1FLOAT, mp1float);
- REGISTER_ENCDEC (MP2, mp2);
- REGISTER_DECODER(MP2FLOAT, mp2float);
- REGISTER_ENCODER(MP2FIXED, mp2fixed);
- REGISTER_DECODER(MP3, mp3);
- REGISTER_DECODER(MP3FLOAT, mp3float);
- REGISTER_DECODER(MP3ADU, mp3adu);
- REGISTER_DECODER(MP3ADUFLOAT, mp3adufloat);
- REGISTER_DECODER(MP3ON4, mp3on4);
- REGISTER_DECODER(MP3ON4FLOAT, mp3on4float);
- REGISTER_DECODER(MPC7, mpc7);
- REGISTER_DECODER(MPC8, mpc8);
- REGISTER_ENCDEC (NELLYMOSER, nellymoser);
- REGISTER_DECODER(ON2AVC, on2avc);
- REGISTER_ENCDEC (OPUS, opus);
- REGISTER_DECODER(PAF_AUDIO, paf_audio);
- REGISTER_DECODER(QCELP, qcelp);
- REGISTER_DECODER(QDM2, qdm2);
- REGISTER_DECODER(QDMC, qdmc);
- REGISTER_ENCDEC (RA_144, ra_144);
- REGISTER_DECODER(RA_288, ra_288);
- REGISTER_DECODER(RALF, ralf);
- REGISTER_DECODER(SHORTEN, shorten);
- REGISTER_DECODER(SIPR, sipr);
- REGISTER_DECODER(SMACKAUD, smackaud);
- REGISTER_ENCDEC (SONIC, sonic);
- REGISTER_ENCODER(SONIC_LS, sonic_ls);
- REGISTER_DECODER(TAK, tak);
- REGISTER_ENCDEC (TRUEHD, truehd);
- REGISTER_DECODER(TRUESPEECH, truespeech);
- REGISTER_ENCDEC (TTA, tta);
- REGISTER_DECODER(TWINVQ, twinvq);
- REGISTER_DECODER(VMDAUDIO, vmdaudio);
- REGISTER_ENCDEC (VORBIS, vorbis);
- REGISTER_ENCDEC (WAVPACK, wavpack);
- REGISTER_DECODER(WMALOSSLESS, wmalossless);
- REGISTER_DECODER(WMAPRO, wmapro);
- REGISTER_ENCDEC (WMAV1, wmav1);
- REGISTER_ENCDEC (WMAV2, wmav2);
- REGISTER_DECODER(WMAVOICE, wmavoice);
- REGISTER_DECODER(WS_SND1, ws_snd1);
- REGISTER_DECODER(XMA1, xma1);
- REGISTER_DECODER(XMA2, xma2);
-
- /* PCM codecs */
- REGISTER_ENCDEC (PCM_ALAW, pcm_alaw);
- REGISTER_DECODER(PCM_BLURAY, pcm_bluray);
- REGISTER_DECODER(PCM_DVD, pcm_dvd);
- REGISTER_DECODER(PCM_F16LE, pcm_f16le);
- REGISTER_DECODER(PCM_F24LE, pcm_f24le);
- REGISTER_ENCDEC (PCM_F32BE, pcm_f32be);
- REGISTER_ENCDEC (PCM_F32LE, pcm_f32le);
- REGISTER_ENCDEC (PCM_F64BE, pcm_f64be);
- REGISTER_ENCDEC (PCM_F64LE, pcm_f64le);
- REGISTER_DECODER(PCM_LXF, pcm_lxf);
- REGISTER_ENCDEC (PCM_MULAW, pcm_mulaw);
- REGISTER_ENCDEC (PCM_S8, pcm_s8);
- REGISTER_ENCDEC (PCM_S8_PLANAR, pcm_s8_planar);
- REGISTER_ENCDEC (PCM_S16BE, pcm_s16be);
- REGISTER_ENCDEC (PCM_S16BE_PLANAR, pcm_s16be_planar);
- REGISTER_ENCDEC (PCM_S16LE, pcm_s16le);
- REGISTER_ENCDEC (PCM_S16LE_PLANAR, pcm_s16le_planar);
- REGISTER_ENCDEC (PCM_S24BE, pcm_s24be);
- REGISTER_ENCDEC (PCM_S24DAUD, pcm_s24daud);
- REGISTER_ENCDEC (PCM_S24LE, pcm_s24le);
- REGISTER_ENCDEC (PCM_S24LE_PLANAR, pcm_s24le_planar);
- REGISTER_ENCDEC (PCM_S32BE, pcm_s32be);
- REGISTER_ENCDEC (PCM_S32LE, pcm_s32le);
- REGISTER_ENCDEC (PCM_S32LE_PLANAR, pcm_s32le_planar);
- REGISTER_ENCDEC (PCM_S64BE, pcm_s64be);
- REGISTER_ENCDEC (PCM_S64LE, pcm_s64le);
- REGISTER_ENCDEC (PCM_U8, pcm_u8);
- REGISTER_ENCDEC (PCM_U16BE, pcm_u16be);
- REGISTER_ENCDEC (PCM_U16LE, pcm_u16le);
- REGISTER_ENCDEC (PCM_U24BE, pcm_u24be);
- REGISTER_ENCDEC (PCM_U24LE, pcm_u24le);
- REGISTER_ENCDEC (PCM_U32BE, pcm_u32be);
- REGISTER_ENCDEC (PCM_U32LE, pcm_u32le);
- REGISTER_DECODER(PCM_ZORK, pcm_zork);
-
- /* DPCM codecs */
- REGISTER_DECODER(GREMLIN_DPCM, gremlin_dpcm);
- REGISTER_DECODER(INTERPLAY_DPCM, interplay_dpcm);
- REGISTER_ENCDEC (ROQ_DPCM, roq_dpcm);
- REGISTER_DECODER(SOL_DPCM, sol_dpcm);
- REGISTER_DECODER(XAN_DPCM, xan_dpcm);
-
- /* ADPCM codecs */
- REGISTER_DECODER(ADPCM_4XM, adpcm_4xm);
- REGISTER_ENCDEC (ADPCM_ADX, adpcm_adx);
- REGISTER_DECODER(ADPCM_AFC, adpcm_afc);
- REGISTER_DECODER(ADPCM_AICA, adpcm_aica);
- REGISTER_DECODER(ADPCM_CT, adpcm_ct);
- REGISTER_DECODER(ADPCM_DTK, adpcm_dtk);
- REGISTER_DECODER(ADPCM_EA, adpcm_ea);
- REGISTER_DECODER(ADPCM_EA_MAXIS_XA, adpcm_ea_maxis_xa);
- REGISTER_DECODER(ADPCM_EA_R1, adpcm_ea_r1);
- REGISTER_DECODER(ADPCM_EA_R2, adpcm_ea_r2);
- REGISTER_DECODER(ADPCM_EA_R3, adpcm_ea_r3);
- REGISTER_DECODER(ADPCM_EA_XAS, adpcm_ea_xas);
- REGISTER_ENCDEC (ADPCM_G722, adpcm_g722);
- REGISTER_ENCDEC (ADPCM_G726, adpcm_g726);
- REGISTER_ENCDEC (ADPCM_G726LE, adpcm_g726le);
- REGISTER_DECODER(ADPCM_IMA_AMV, adpcm_ima_amv);
- REGISTER_DECODER(ADPCM_IMA_APC, adpcm_ima_apc);
- REGISTER_DECODER(ADPCM_IMA_DAT4, adpcm_ima_dat4);
- REGISTER_DECODER(ADPCM_IMA_DK3, adpcm_ima_dk3);
- REGISTER_DECODER(ADPCM_IMA_DK4, adpcm_ima_dk4);
- REGISTER_DECODER(ADPCM_IMA_EA_EACS, adpcm_ima_ea_eacs);
- REGISTER_DECODER(ADPCM_IMA_EA_SEAD, adpcm_ima_ea_sead);
- REGISTER_DECODER(ADPCM_IMA_ISS, adpcm_ima_iss);
- REGISTER_DECODER(ADPCM_IMA_OKI, adpcm_ima_oki);
- REGISTER_ENCDEC (ADPCM_IMA_QT, adpcm_ima_qt);
- REGISTER_DECODER(ADPCM_IMA_RAD, adpcm_ima_rad);
- REGISTER_DECODER(ADPCM_IMA_SMJPEG, adpcm_ima_smjpeg);
- REGISTER_ENCDEC (ADPCM_IMA_WAV, adpcm_ima_wav);
- REGISTER_DECODER(ADPCM_IMA_WS, adpcm_ima_ws);
- REGISTER_ENCDEC (ADPCM_MS, adpcm_ms);
- REGISTER_DECODER(ADPCM_MTAF, adpcm_mtaf);
- REGISTER_DECODER(ADPCM_PSX, adpcm_psx);
- REGISTER_DECODER(ADPCM_SBPRO_2, adpcm_sbpro_2);
- REGISTER_DECODER(ADPCM_SBPRO_3, adpcm_sbpro_3);
- REGISTER_DECODER(ADPCM_SBPRO_4, adpcm_sbpro_4);
- REGISTER_ENCDEC (ADPCM_SWF, adpcm_swf);
- REGISTER_DECODER(ADPCM_THP, adpcm_thp);
- REGISTER_DECODER(ADPCM_THP_LE, adpcm_thp_le);
- REGISTER_DECODER(ADPCM_VIMA, adpcm_vima);
- REGISTER_DECODER(ADPCM_XA, adpcm_xa);
- REGISTER_ENCDEC (ADPCM_YAMAHA, adpcm_yamaha);
-
- /* subtitles */
- REGISTER_ENCDEC (SSA, ssa);
- REGISTER_ENCDEC (ASS, ass);
- REGISTER_DECODER(CCAPTION, ccaption);
- REGISTER_ENCDEC (DVBSUB, dvbsub);
- REGISTER_ENCDEC (DVDSUB, dvdsub);
- REGISTER_DECODER(JACOSUB, jacosub);
- REGISTER_DECODER(MICRODVD, microdvd);
- REGISTER_ENCDEC (MOVTEXT, movtext);
- REGISTER_DECODER(MPL2, mpl2);
- REGISTER_DECODER(PGSSUB, pgssub);
- REGISTER_DECODER(PJS, pjs);
- REGISTER_DECODER(REALTEXT, realtext);
- REGISTER_DECODER(SAMI, sami);
- REGISTER_ENCDEC (SRT, srt);
- REGISTER_DECODER(STL, stl);
- REGISTER_ENCDEC (SUBRIP, subrip);
- REGISTER_DECODER(SUBVIEWER, subviewer);
- REGISTER_DECODER(SUBVIEWER1, subviewer1);
- REGISTER_ENCDEC (TEXT, text);
- REGISTER_DECODER(VPLAYER, vplayer);
- REGISTER_ENCDEC (WEBVTT, webvtt);
- REGISTER_ENCDEC (XSUB, xsub);
-
- /* external libraries */
- REGISTER_ENCDEC (AAC_AT, aac_at);
- REGISTER_DECODER(AC3_AT, ac3_at);
- REGISTER_DECODER(ADPCM_IMA_QT_AT, adpcm_ima_qt_at);
- REGISTER_ENCDEC (ALAC_AT, alac_at);
- REGISTER_DECODER(AMR_NB_AT, amr_nb_at);
- REGISTER_DECODER(EAC3_AT, eac3_at);
- REGISTER_DECODER(GSM_MS_AT, gsm_ms_at);
- REGISTER_ENCDEC (ILBC_AT, ilbc_at);
- REGISTER_DECODER(MP1_AT, mp1_at);
- REGISTER_DECODER(MP2_AT, mp2_at);
- REGISTER_DECODER(MP3_AT, mp3_at);
- REGISTER_ENCDEC (PCM_ALAW_AT, pcm_alaw_at);
- REGISTER_ENCDEC (PCM_MULAW_AT, pcm_mulaw_at);
- REGISTER_DECODER(QDMC_AT, qdmc_at);
- REGISTER_DECODER(QDM2_AT, qdm2_at);
- REGISTER_DECODER(LIBCELT, libcelt);
- REGISTER_ENCDEC (LIBFDK_AAC, libfdk_aac);
- REGISTER_ENCDEC (LIBGSM, libgsm);
- REGISTER_ENCDEC (LIBGSM_MS, libgsm_ms);
- REGISTER_ENCDEC (LIBILBC, libilbc);
- REGISTER_ENCODER(LIBMP3LAME, libmp3lame);
- REGISTER_ENCDEC (LIBOPENCORE_AMRNB, libopencore_amrnb);
- REGISTER_DECODER(LIBOPENCORE_AMRWB, libopencore_amrwb);
- REGISTER_ENCDEC (LIBOPENJPEG, libopenjpeg);
- REGISTER_ENCDEC (LIBOPUS, libopus);
- REGISTER_DECODER(LIBRSVG, librsvg);
- REGISTER_ENCODER(LIBSHINE, libshine);
- REGISTER_ENCDEC (LIBSPEEX, libspeex);
- REGISTER_ENCODER(LIBTHEORA, libtheora);
- REGISTER_ENCODER(LIBTWOLAME, libtwolame);
- REGISTER_ENCODER(LIBVO_AMRWBENC, libvo_amrwbenc);
- REGISTER_ENCDEC (LIBVORBIS, libvorbis);
- REGISTER_ENCDEC (LIBVPX_VP8, libvpx_vp8);
- REGISTER_ENCDEC (LIBVPX_VP9, libvpx_vp9);
- REGISTER_ENCODER(LIBWAVPACK, libwavpack);
- REGISTER_ENCODER(LIBWEBP_ANIM, libwebp_anim); /* preferred over libwebp */
- REGISTER_ENCODER(LIBWEBP, libwebp);
- REGISTER_ENCODER(LIBX262, libx262);
- REGISTER_ENCODER(LIBX264, libx264);
- REGISTER_ENCODER(LIBX264RGB, libx264rgb);
- REGISTER_ENCODER(LIBX265, libx265);
- REGISTER_ENCODER(LIBXAVS, libxavs);
- REGISTER_ENCODER(LIBXVID, libxvid);
- REGISTER_DECODER(LIBZVBI_TELETEXT, libzvbi_teletext);
-
- /* text */
- REGISTER_DECODER(BINTEXT, bintext);
- REGISTER_DECODER(XBIN, xbin);
- REGISTER_DECODER(IDF, idf);
-
- /* external libraries, that shouldn't be used by default if one of the
- * above is available */
- REGISTER_ENCODER(H263_V4L2M2M, h263_v4l2m2m);
- REGISTER_ENCDEC (LIBOPENH264, libopenh264);
- REGISTER_DECODER(H264_CUVID, h264_cuvid);
- REGISTER_ENCODER(H264_NVENC, h264_nvenc);
- REGISTER_ENCODER(H264_OMX, h264_omx);
- REGISTER_ENCODER(H264_QSV, h264_qsv);
- REGISTER_ENCODER(H264_V4L2M2M, h264_v4l2m2m);
- REGISTER_ENCODER(H264_VAAPI, h264_vaapi);
- REGISTER_ENCODER(H264_VIDEOTOOLBOX, h264_videotoolbox);
-#if FF_API_NVENC_OLD_NAME
- REGISTER_ENCODER(NVENC, nvenc);
- REGISTER_ENCODER(NVENC_H264, nvenc_h264);
- REGISTER_ENCODER(NVENC_HEVC, nvenc_hevc);
-#endif
- REGISTER_DECODER(HEVC_CUVID, hevc_cuvid);
- REGISTER_DECODER(HEVC_MEDIACODEC, hevc_mediacodec);
- REGISTER_ENCODER(HEVC_NVENC, hevc_nvenc);
- REGISTER_ENCODER(HEVC_QSV, hevc_qsv);
- REGISTER_ENCODER(HEVC_V4L2M2M, hevc_v4l2m2m);
- REGISTER_ENCODER(HEVC_VAAPI, hevc_vaapi);
- REGISTER_ENCODER(LIBKVAZAAR, libkvazaar);
- REGISTER_DECODER(MJPEG_CUVID, mjpeg_cuvid);
- REGISTER_ENCODER(MJPEG_VAAPI, mjpeg_vaapi);
- REGISTER_DECODER(MPEG1_CUVID, mpeg1_cuvid);
- REGISTER_DECODER(MPEG2_CUVID, mpeg2_cuvid);
- REGISTER_ENCODER(MPEG2_QSV, mpeg2_qsv);
- REGISTER_ENCODER(MPEG2_VAAPI, mpeg2_vaapi);
- REGISTER_DECODER(MPEG4_CUVID, mpeg4_cuvid);
- REGISTER_DECODER(MPEG4_MEDIACODEC, mpeg4_mediacodec);
- REGISTER_ENCODER(MPEG4_V4L2M2M, mpeg4_v4l2m2m);
- REGISTER_DECODER(VC1_CUVID, vc1_cuvid);
- REGISTER_DECODER(VP8_CUVID, vp8_cuvid);
- REGISTER_DECODER(VP8_MEDIACODEC, vp8_mediacodec);
- REGISTER_DECODER(VP8_QSV, vp8_qsv);
- REGISTER_ENCODER(VP8_V4L2M2M, vp8_v4l2m2m);
- REGISTER_ENCODER(VP8_VAAPI, vp8_vaapi);
- REGISTER_DECODER(VP9_CUVID, vp9_cuvid);
- REGISTER_DECODER(VP9_MEDIACODEC, vp9_mediacodec);
- REGISTER_ENCODER(VP9_VAAPI, vp9_vaapi);
-
- /* parsers */
- REGISTER_PARSER(AAC, aac);
- REGISTER_PARSER(AAC_LATM, aac_latm);
- REGISTER_PARSER(AC3, ac3);
- REGISTER_PARSER(ADX, adx);
- REGISTER_PARSER(BMP, bmp);
- REGISTER_PARSER(CAVSVIDEO, cavsvideo);
- REGISTER_PARSER(COOK, cook);
- REGISTER_PARSER(DCA, dca);
- REGISTER_PARSER(DIRAC, dirac);
- REGISTER_PARSER(DNXHD, dnxhd);
- REGISTER_PARSER(DPX, dpx);
- REGISTER_PARSER(DVAUDIO, dvaudio);
- REGISTER_PARSER(DVBSUB, dvbsub);
- REGISTER_PARSER(DVDSUB, dvdsub);
- REGISTER_PARSER(DVD_NAV, dvd_nav);
- REGISTER_PARSER(FLAC, flac);
- REGISTER_PARSER(G729, g729);
- REGISTER_PARSER(GSM, gsm);
- REGISTER_PARSER(H261, h261);
- REGISTER_PARSER(H263, h263);
- REGISTER_PARSER(H264, h264);
- REGISTER_PARSER(HEVC, hevc);
- REGISTER_PARSER(MJPEG, mjpeg);
- REGISTER_PARSER(MLP, mlp);
- REGISTER_PARSER(MPEG4VIDEO, mpeg4video);
- REGISTER_PARSER(MPEGAUDIO, mpegaudio);
- REGISTER_PARSER(MPEGVIDEO, mpegvideo);
- REGISTER_PARSER(OPUS, opus);
- REGISTER_PARSER(PNG, png);
- REGISTER_PARSER(PNM, pnm);
- REGISTER_PARSER(RV30, rv30);
- REGISTER_PARSER(RV40, rv40);
- REGISTER_PARSER(SIPR, sipr);
- REGISTER_PARSER(TAK, tak);
- REGISTER_PARSER(VC1, vc1);
- REGISTER_PARSER(VORBIS, vorbis);
- REGISTER_PARSER(VP3, vp3);
- REGISTER_PARSER(VP8, vp8);
- REGISTER_PARSER(VP9, vp9);
- REGISTER_PARSER(XMA, xma);
+ return find_codec(id, av_codec_is_encoder);
}
-void avcodec_register_all(void)
+AVCodec *avcodec_find_decoder(enum AVCodecID id)
+{
+ return find_codec(id, av_codec_is_decoder);
+}
+
+static AVCodec *find_codec_by_name(const char *name, int (*x)(const AVCodec *))
{
- static AVOnce control = AV_ONCE_INIT;
+ void *i = 0;
+ const AVCodec *p;
- ff_thread_once(&control, register_all);
+ if (!name)
+ return NULL;
+
+ while ((p = av_codec_iterate(&i))) {
+ if (!x(p))
+ continue;
+ if (strcmp(name, p->name) == 0)
+ return (AVCodec*)p;
+ }
+
+ return NULL;
+}
+
+AVCodec *avcodec_find_encoder_by_name(const char *name)
+{
+ return find_codec_by_name(name, av_codec_is_encoder);
+}
+
+AVCodec *avcodec_find_decoder_by_name(const char *name)
+{
+ return find_codec_by_name(name, av_codec_is_decoder);
}
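
/*
 * A minimal sketch (not part of this patch) of how a caller resolves codecs
 * through these helpers once the old static registration path is gone. The
 * "vp9" name is only an example and may be absent from a given ffvpx build.
 */
#include "avcodec.h"

static const AVCodec *pick_vp9_decoder(void)
{
    /* Prefer a specific named implementation, then fall back to the ID. */
    const AVCodec *dec = avcodec_find_decoder_by_name("vp9");
    if (!dec)
        dec = avcodec_find_decoder(AV_CODEC_ID_VP9);
    return dec;
}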
diff --git a/media/ffvpx/libavcodec/audioconvert.c b/media/ffvpx/libavcodec/audioconvert.c
deleted file mode 100644
index 5e46fae2d..000000000
--- a/media/ffvpx/libavcodec/audioconvert.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * audio conversion
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * audio conversion
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "libavutil/avstring.h"
-#include "libavutil/common.h"
-#include "libavutil/libm.h"
-#include "libavutil/samplefmt.h"
-#include "avcodec.h"
-#include "audioconvert.h"
-
-#if FF_API_AUDIO_CONVERT
-
-struct AVAudioConvert {
- int in_channels, out_channels;
- int fmt_pair;
-};
-
-AVAudioConvert *av_audio_convert_alloc(enum AVSampleFormat out_fmt, int out_channels,
- enum AVSampleFormat in_fmt, int in_channels,
- const float *matrix, int flags)
-{
- AVAudioConvert *ctx;
- if (in_channels!=out_channels)
- return NULL; /* FIXME: not supported */
- ctx = av_malloc(sizeof(AVAudioConvert));
- if (!ctx)
- return NULL;
- ctx->in_channels = in_channels;
- ctx->out_channels = out_channels;
- ctx->fmt_pair = out_fmt + AV_SAMPLE_FMT_NB*in_fmt;
- return ctx;
-}
-
-void av_audio_convert_free(AVAudioConvert *ctx)
-{
- av_free(ctx);
-}
-
-int av_audio_convert(AVAudioConvert *ctx,
- void * const out[6], const int out_stride[6],
- const void * const in[6], const int in_stride[6], int len)
-{
- int ch;
-
- //FIXME optimize common cases
-
- for(ch=0; ch<ctx->out_channels; ch++){
- const int is= in_stride[ch];
- const int os= out_stride[ch];
- const uint8_t *pi= in[ch];
- uint8_t *po= out[ch];
- uint8_t *end= po + os*len;
- if(!out[ch])
- continue;
-
-#define CONV(ofmt, otype, ifmt, expr)\
-if(ctx->fmt_pair == ofmt + AV_SAMPLE_FMT_NB*ifmt){\
- do{\
- *(otype*)po = expr; pi += is; po += os;\
- }while(po < end);\
-}
-
-//FIXME put things below under ifdefs so we do not waste space for cases no codec will need
-//FIXME rounding ?
-
- CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_U8 , *(const uint8_t*)pi)
- else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)<<8)
- else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)<<24)
- else CONV(AV_SAMPLE_FMT_FLT, float , AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)*(1.0 / (1<<7)))
- else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_U8 , (*(const uint8_t*)pi - 0x80)*(1.0 / (1<<7)))
- else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_S16, (*(const int16_t*)pi>>8) + 0x80)
- else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_S16, *(const int16_t*)pi)
- else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_S16, *(const int16_t*)pi<<16)
- else CONV(AV_SAMPLE_FMT_FLT, float , AV_SAMPLE_FMT_S16, *(const int16_t*)pi*(1.0 / (1<<15)))
- else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_S16, *(const int16_t*)pi*(1.0 / (1<<15)))
- else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_S32, (*(const int32_t*)pi>>24) + 0x80)
- else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_S32, *(const int32_t*)pi>>16)
- else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_S32, *(const int32_t*)pi)
- else CONV(AV_SAMPLE_FMT_FLT, float , AV_SAMPLE_FMT_S32, *(const int32_t*)pi*(1.0 / (1U<<31)))
- else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_S32, *(const int32_t*)pi*(1.0 / (1U<<31)))
- else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_FLT, av_clip_uint8( lrintf(*(const float*)pi * (1<<7)) + 0x80))
- else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, av_clip_int16( lrintf(*(const float*)pi * (1<<15))))
- else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, av_clipl_int32(llrintf(*(const float*)pi * (1U<<31))))
- else CONV(AV_SAMPLE_FMT_FLT, float , AV_SAMPLE_FMT_FLT, *(const float*)pi)
- else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_FLT, *(const float*)pi)
- else CONV(AV_SAMPLE_FMT_U8 , uint8_t, AV_SAMPLE_FMT_DBL, av_clip_uint8( lrint(*(const double*)pi * (1<<7)) + 0x80))
- else CONV(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, av_clip_int16( lrint(*(const double*)pi * (1<<15))))
- else CONV(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, av_clipl_int32(llrint(*(const double*)pi * (1U<<31))))
- else CONV(AV_SAMPLE_FMT_FLT, float , AV_SAMPLE_FMT_DBL, *(const double*)pi)
- else CONV(AV_SAMPLE_FMT_DBL, double , AV_SAMPLE_FMT_DBL, *(const double*)pi)
- else return -1;
- }
- return 0;
-}
-
-#endif /* FF_API_AUDIO_CONVERT */
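
/*
 * The FF_API_AUDIO_CONVERT code removed above (and its header, deleted just
 * below) directs callers to libswresample instead. A hedged sketch of that
 * replacement, assuming a full FFmpeg tree (libswresample is not part of the
 * ffvpx subset) and hard-coding stereo float-planar to interleaved S16 at
 * 48 kHz:
 */
#include <stdint.h>
#include "libavutil/channel_layout.h"
#include "libavutil/error.h"
#include "libavutil/samplefmt.h"
#include "libswresample/swresample.h"

static int fltp_to_s16(uint8_t **dst, const uint8_t **src, int nb_samples)
{
    SwrContext *swr = swr_alloc_set_opts(NULL,
            AV_CH_LAYOUT_STEREO, AV_SAMPLE_FMT_S16,  48000,   /* output */
            AV_CH_LAYOUT_STEREO, AV_SAMPLE_FMT_FLTP, 48000,   /* input  */
            0, NULL);
    int ret = swr ? swr_init(swr) : AVERROR(ENOMEM);
    if (ret >= 0)
        ret = swr_convert(swr, dst, nb_samples, src, nb_samples);
    swr_free(&swr);
    return ret; /* samples converted per channel, or a negative AVERROR */
}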
diff --git a/media/ffvpx/libavcodec/audioconvert.h b/media/ffvpx/libavcodec/audioconvert.h
deleted file mode 100644
index 996c3f37a..000000000
--- a/media/ffvpx/libavcodec/audioconvert.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * audio conversion
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (c) 2008 Peter Ross
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_AUDIOCONVERT_H
-#define AVCODEC_AUDIOCONVERT_H
-
-#include "version.h"
-
-/**
- * @file
- * Audio format conversion routines
- * This interface is deprecated and will be dropped in a future
- * version. You should use the libswresample library instead.
- */
-
-#if FF_API_AUDIO_CONVERT
-
-#include "libavutil/cpu.h"
-#include "avcodec.h"
-#include "libavutil/channel_layout.h"
-
-struct AVAudioConvert;
-typedef struct AVAudioConvert AVAudioConvert;
-
-/**
- * Create an audio sample format converter context
- * @param out_fmt Output sample format
- * @param out_channels Number of output channels
- * @param in_fmt Input sample format
- * @param in_channels Number of input channels
- * @param[in] matrix Channel mixing matrix (of dimension in_channel*out_channels). Set to NULL to ignore.
- * @param flags See AV_CPU_FLAG_xx
- * @return NULL on error
- * @deprecated See libswresample
- */
-
-attribute_deprecated
-AVAudioConvert *av_audio_convert_alloc(enum AVSampleFormat out_fmt, int out_channels,
- enum AVSampleFormat in_fmt, int in_channels,
- const float *matrix, int flags);
-
-/**
- * Free audio sample format converter context
- * @deprecated See libswresample
- */
-
-attribute_deprecated
-void av_audio_convert_free(AVAudioConvert *ctx);
-
-/**
- * Convert between audio sample formats
- * @param[in] out array of output buffers for each channel. set to NULL to ignore processing of the given channel.
- * @param[in] out_stride distance between consecutive output samples (measured in bytes)
- * @param[in] in array of input buffers for each channel
- * @param[in] in_stride distance between consecutive input samples (measured in bytes)
- * @param len length of audio frame size (measured in samples)
- * @deprecated See libswresample
- */
-
-attribute_deprecated
-int av_audio_convert(AVAudioConvert *ctx,
- void * const out[6], const int out_stride[6],
- const void * const in[6], const int in_stride[6], int len);
-
-#endif /* FF_API_AUDIO_CONVERT */
-
-#endif /* AVCODEC_AUDIOCONVERT_H */
diff --git a/media/ffvpx/libavcodec/avcodec.h b/media/ffvpx/libavcodec/avcodec.h
index 18c3e3ea1..fb0c6fae7 100644
--- a/media/ffvpx/libavcodec/avcodec.h
+++ b/media/ffvpx/libavcodec/avcodec.h
@@ -36,6 +36,7 @@
#include "libavutil/channel_layout.h"
#include "libavutil/dict.h"
#include "libavutil/frame.h"
+#include "libavutil/hwcontext.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "libavutil/rational.h"
@@ -217,9 +218,6 @@ enum AVCodecID {
/* video codecs */
AV_CODEC_ID_MPEG1VIDEO,
AV_CODEC_ID_MPEG2VIDEO, ///< preferred ID for MPEG-1/2 video decoding
-#if FF_API_XVMC
- AV_CODEC_ID_MPEG2VIDEO_XVMC,
-#endif /* FF_API_XVMC */
AV_CODEC_ID_H261,
AV_CODEC_ID_H263,
AV_CODEC_ID_RV10,
@@ -520,9 +518,6 @@ enum AVCodecID {
AV_CODEC_ID_ADPCM_G722,
AV_CODEC_ID_ADPCM_IMA_APC,
AV_CODEC_ID_ADPCM_VIMA,
-#if FF_API_VIMA_DECODER
- AV_CODEC_ID_VIMA = AV_CODEC_ID_ADPCM_VIMA,
-#endif
AV_CODEC_ID_ADPCM_AFC = 0x11800,
AV_CODEC_ID_ADPCM_IMA_OKI,
@@ -585,9 +580,6 @@ enum AVCodecID {
AV_CODEC_ID_MLP,
AV_CODEC_ID_GSM_MS, /* as found in WAV */
AV_CODEC_ID_ATRAC3,
-#if FF_API_VOXWARE
- AV_CODEC_ID_VOXWARE,
-#endif
AV_CODEC_ID_APE,
AV_CODEC_ID_NELLYMOSER,
AV_CODEC_ID_MUSEPACK8,
@@ -623,6 +615,7 @@ enum AVCodecID {
AV_CODEC_ID_PAF_AUDIO,
AV_CODEC_ID_ON2AVC,
AV_CODEC_ID_DSS_SP,
+ AV_CODEC_ID_CODEC2,
AV_CODEC_ID_FFWAVESYNTH = 0x15800,
AV_CODEC_ID_SONIC,
@@ -641,6 +634,9 @@ enum AVCodecID {
AV_CODEC_ID_ATRAC3AL,
AV_CODEC_ID_ATRAC3PAL,
AV_CODEC_ID_DOLBY_E,
+ AV_CODEC_ID_APTX,
+ AV_CODEC_ID_APTX_HD,
+ AV_CODEC_ID_SBC,
/* subtitle codecs */
AV_CODEC_ID_FIRST_SUBTITLE = 0x17000, ///< A dummy ID pointing at the start of subtitle codecs.
@@ -774,7 +770,7 @@ typedef struct AVCodecDescriptor {
* Note: If the first 23 bits of the additional bytes are not 0, then damaged
* MPEG bitstreams could cause overread and segfault.
*/
-#define AV_INPUT_BUFFER_PADDING_SIZE 32
+#define AV_INPUT_BUFFER_PADDING_SIZE 64
/**
* @ingroup lavc_encoding
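
/*
 * Context for the padding bump above, as a sketch rather than patch content:
 * callers that hand-allocate input buffers must append
 * AV_INPUT_BUFFER_PADDING_SIZE zeroed bytes after the payload, and simply
 * recompiling against the new header picks up the larger 64-byte margin.
 */
#include <string.h>
#include "libavcodec/avcodec.h"
#include "libavutil/mem.h"

static uint8_t *alloc_padded_input(const uint8_t *payload, int size)
{
    uint8_t *buf = av_malloc(size + AV_INPUT_BUFFER_PADDING_SIZE);
    if (!buf)
        return NULL;
    memcpy(buf, payload, size);
    memset(buf + size, 0, AV_INPUT_BUFFER_PADDING_SIZE); /* must stay zeroed */
    return buf;
}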
@@ -783,38 +779,6 @@ typedef struct AVCodecDescriptor {
*/
#define AV_INPUT_BUFFER_MIN_SIZE 16384
-#if FF_API_WITHOUT_PREFIX
-/**
- * @deprecated use AV_INPUT_BUFFER_PADDING_SIZE instead
- */
-#define FF_INPUT_BUFFER_PADDING_SIZE 32
-
-/**
- * @deprecated use AV_INPUT_BUFFER_MIN_SIZE instead
- */
-#define FF_MIN_BUFFER_SIZE 16384
-#endif /* FF_API_WITHOUT_PREFIX */
-
-/**
- * @ingroup lavc_encoding
- * motion estimation type.
- * @deprecated use codec private option instead
- */
-#if FF_API_MOTION_EST
-enum Motion_Est_ID {
- ME_ZERO = 1, ///< no search, that is use 0,0 vector whenever one is needed
- ME_FULL,
- ME_LOG,
- ME_PHODS,
- ME_EPZS, ///< enhanced predictive zonal search
- ME_X1, ///< reserved for experiments
- ME_HEX, ///< hexagon based search
- ME_UMH, ///< uneven multi-hexagon search
- ME_TESA, ///< transformed exhaustive search algorithm
- ME_ITER=50, ///< iterative search
-};
-#endif
-
/**
* @ingroup lavc_decoding
*/
@@ -853,13 +817,6 @@ typedef struct RcOverride{
float quality_factor;
} RcOverride;
-#if FF_API_MAX_BFRAMES
-/**
- * @deprecated there is no libavcodec-wide limit on the number of B-frames
- */
-#define FF_MAX_B_FRAMES 16
-#endif
-
/* encoding support
These flags can be passed in AVCodecContext.flags before initialization.
Note: Not everything is supported yet.
@@ -1031,13 +988,6 @@ typedef struct RcOverride{
*/
#define AV_CODEC_CAP_SMALL_LAST_FRAME (1 << 6)
-#if FF_API_CAP_VDPAU
-/**
- * Codec can export data for HW decoding (VDPAU).
- */
-#define AV_CODEC_CAP_HWACCEL_VDPAU (1 << 7)
-#endif
-
/**
* Codec can output multiple frames per AVPacket
* Normally demuxers return one frame at a time, demuxers which do not do
@@ -1098,231 +1048,26 @@ typedef struct RcOverride{
*/
#define AV_CODEC_CAP_LOSSLESS 0x80000000
-
-#if FF_API_WITHOUT_PREFIX
/**
- * Allow decoders to produce frames with data planes that are not aligned
- * to CPU requirements (e.g. due to cropping).
+ * Codec is backed by a hardware implementation. Typically used to
+ * identify a non-hwaccel hardware decoder. For information about hwaccels, use
+ * avcodec_get_hw_config() instead.
*/
-#define CODEC_FLAG_UNALIGNED AV_CODEC_FLAG_UNALIGNED
-#define CODEC_FLAG_QSCALE AV_CODEC_FLAG_QSCALE
-#define CODEC_FLAG_4MV AV_CODEC_FLAG_4MV
-#define CODEC_FLAG_OUTPUT_CORRUPT AV_CODEC_FLAG_OUTPUT_CORRUPT
-#define CODEC_FLAG_QPEL AV_CODEC_FLAG_QPEL
-#if FF_API_GMC
-/**
- * @deprecated use the "gmc" private option of the libxvid encoder
- */
-#define CODEC_FLAG_GMC 0x0020 ///< Use GMC.
-#endif
-#if FF_API_MV0
-/**
- * @deprecated use the flag "mv0" in the "mpv_flags" private option of the
- * mpegvideo encoders
- */
-#define CODEC_FLAG_MV0 0x0040
-#endif
-#if FF_API_INPUT_PRESERVED
-/**
- * @deprecated passing reference-counted frames to the encoders replaces this
- * flag
- */
-#define CODEC_FLAG_INPUT_PRESERVED 0x0100
-#endif
-#define CODEC_FLAG_PASS1 AV_CODEC_FLAG_PASS1
-#define CODEC_FLAG_PASS2 AV_CODEC_FLAG_PASS2
-#define CODEC_FLAG_GRAY AV_CODEC_FLAG_GRAY
-#if FF_API_EMU_EDGE
-/**
- * @deprecated edges are not used/required anymore. I.e. this flag is now always
- * set.
- */
-#define CODEC_FLAG_EMU_EDGE 0x4000
-#endif
-#define CODEC_FLAG_PSNR AV_CODEC_FLAG_PSNR
-#define CODEC_FLAG_TRUNCATED AV_CODEC_FLAG_TRUNCATED
+#define AV_CODEC_CAP_HARDWARE (1 << 18)
-#if FF_API_NORMALIZE_AQP
/**
- * @deprecated use the flag "naq" in the "mpv_flags" private option of the
- * mpegvideo encoders
+ * Codec is potentially backed by a hardware implementation, but not
+ * necessarily. This is used instead of AV_CODEC_CAP_HARDWARE, if the
+ * implementation provides some sort of internal fallback.
*/
-#define CODEC_FLAG_NORMALIZE_AQP 0x00020000
-#endif
-#define CODEC_FLAG_INTERLACED_DCT AV_CODEC_FLAG_INTERLACED_DCT
-#define CODEC_FLAG_LOW_DELAY AV_CODEC_FLAG_LOW_DELAY
-#define CODEC_FLAG_GLOBAL_HEADER AV_CODEC_FLAG_GLOBAL_HEADER
-#define CODEC_FLAG_BITEXACT AV_CODEC_FLAG_BITEXACT
-#define CODEC_FLAG_AC_PRED AV_CODEC_FLAG_AC_PRED
-#define CODEC_FLAG_LOOP_FILTER AV_CODEC_FLAG_LOOP_FILTER
-#define CODEC_FLAG_INTERLACED_ME AV_CODEC_FLAG_INTERLACED_ME
-#define CODEC_FLAG_CLOSED_GOP AV_CODEC_FLAG_CLOSED_GOP
-#define CODEC_FLAG2_FAST AV_CODEC_FLAG2_FAST
-#define CODEC_FLAG2_NO_OUTPUT AV_CODEC_FLAG2_NO_OUTPUT
-#define CODEC_FLAG2_LOCAL_HEADER AV_CODEC_FLAG2_LOCAL_HEADER
-#define CODEC_FLAG2_DROP_FRAME_TIMECODE AV_CODEC_FLAG2_DROP_FRAME_TIMECODE
-#define CODEC_FLAG2_IGNORE_CROP AV_CODEC_FLAG2_IGNORE_CROP
-
-#define CODEC_FLAG2_CHUNKS AV_CODEC_FLAG2_CHUNKS
-#define CODEC_FLAG2_SHOW_ALL AV_CODEC_FLAG2_SHOW_ALL
-#define CODEC_FLAG2_EXPORT_MVS AV_CODEC_FLAG2_EXPORT_MVS
-#define CODEC_FLAG2_SKIP_MANUAL AV_CODEC_FLAG2_SKIP_MANUAL
-
-/* Unsupported options :
- * Syntax Arithmetic coding (SAC)
- * Reference Picture Selection
- * Independent Segment Decoding */
-/* /Fx */
-/* codec capabilities */
-
-#define CODEC_CAP_DRAW_HORIZ_BAND AV_CODEC_CAP_DRAW_HORIZ_BAND ///< Decoder can use draw_horiz_band callback.
-/**
- * Codec uses get_buffer() for allocating buffers and supports custom allocators.
- * If not set, it might not use get_buffer() at all or use operations that
- * assume the buffer was allocated by avcodec_default_get_buffer.
- */
-#define CODEC_CAP_DR1 AV_CODEC_CAP_DR1
-#define CODEC_CAP_TRUNCATED AV_CODEC_CAP_TRUNCATED
-#if FF_API_XVMC
-/* Codec can export data for HW decoding. This flag indicates that
- * the codec would call get_format() with list that might contain HW accelerated
- * pixel formats (XvMC, VDPAU, VAAPI, etc). The application can pick any of them
- * including raw image format.
- * The application can use the passed context to determine bitstream version,
- * chroma format, resolution etc.
- */
-#define CODEC_CAP_HWACCEL 0x0010
-#endif /* FF_API_XVMC */
-/**
- * Encoder or decoder requires flushing with NULL input at the end in order to
- * give the complete and correct output.
- *
- * NOTE: If this flag is not set, the codec is guaranteed to never be fed with
- * with NULL data. The user can still send NULL data to the public encode
- * or decode function, but libavcodec will not pass it along to the codec
- * unless this flag is set.
- *
- * Decoders:
- * The decoder has a non-zero delay and needs to be fed with avpkt->data=NULL,
- * avpkt->size=0 at the end to get the delayed data until the decoder no longer
- * returns frames.
- *
- * Encoders:
- * The encoder needs to be fed with NULL data at the end of encoding until the
- * encoder no longer returns data.
- *
- * NOTE: For encoders implementing the AVCodec.encode2() function, setting this
- * flag also means that the encoder must set the pts and duration for
- * each output packet. If this flag is not set, the pts and duration will
- * be determined by libavcodec from the input frame.
- */
-#define CODEC_CAP_DELAY AV_CODEC_CAP_DELAY
-/**
- * Codec can be fed a final frame with a smaller size.
- * This can be used to prevent truncation of the last audio samples.
- */
-#define CODEC_CAP_SMALL_LAST_FRAME AV_CODEC_CAP_SMALL_LAST_FRAME
-#if FF_API_CAP_VDPAU
-/**
- * Codec can export data for HW decoding (VDPAU).
- */
-#define CODEC_CAP_HWACCEL_VDPAU AV_CODEC_CAP_HWACCEL_VDPAU
-#endif
-/**
- * Codec can output multiple frames per AVPacket
- * Normally demuxers return one frame at a time, demuxers which do not do
- * are connected to a parser to split what they return into proper frames.
- * This flag is reserved to the very rare category of codecs which have a
- * bitstream that cannot be split into frames without timeconsuming
- * operations like full decoding. Demuxers carrying such bitstreams thus
- * may return multiple frames in a packet. This has many disadvantages like
- * prohibiting stream copy in many cases thus it should only be considered
- * as a last resort.
- */
-#define CODEC_CAP_SUBFRAMES AV_CODEC_CAP_SUBFRAMES
-/**
- * Codec is experimental and is thus avoided in favor of non experimental
- * encoders
- */
-#define CODEC_CAP_EXPERIMENTAL AV_CODEC_CAP_EXPERIMENTAL
-/**
- * Codec should fill in channel configuration and samplerate instead of container
- */
-#define CODEC_CAP_CHANNEL_CONF AV_CODEC_CAP_CHANNEL_CONF
-#if FF_API_NEG_LINESIZES
-/**
- * @deprecated no codecs use this capability
- */
-#define CODEC_CAP_NEG_LINESIZES 0x0800
-#endif
-/**
- * Codec supports frame-level multithreading.
- */
-#define CODEC_CAP_FRAME_THREADS AV_CODEC_CAP_FRAME_THREADS
-/**
- * Codec supports slice-based (or partition-based) multithreading.
- */
-#define CODEC_CAP_SLICE_THREADS AV_CODEC_CAP_SLICE_THREADS
-/**
- * Codec supports changed parameters at any point.
- */
-#define CODEC_CAP_PARAM_CHANGE AV_CODEC_CAP_PARAM_CHANGE
-/**
- * Codec supports avctx->thread_count == 0 (auto).
- */
-#define CODEC_CAP_AUTO_THREADS AV_CODEC_CAP_AUTO_THREADS
-/**
- * Audio encoder supports receiving a different number of samples in each call.
- */
-#define CODEC_CAP_VARIABLE_FRAME_SIZE AV_CODEC_CAP_VARIABLE_FRAME_SIZE
-/**
- * Codec is intra only.
- */
-#define CODEC_CAP_INTRA_ONLY AV_CODEC_CAP_INTRA_ONLY
-/**
- * Codec is lossless.
- */
-#define CODEC_CAP_LOSSLESS AV_CODEC_CAP_LOSSLESS
-
-/**
- * HWAccel is experimental and is thus avoided in favor of non experimental
- * codecs
- */
-#define HWACCEL_CODEC_CAP_EXPERIMENTAL 0x0200
-#endif /* FF_API_WITHOUT_PREFIX */
-
-#if FF_API_MB_TYPE
-//The following defines may change, don't expect compatibility if you use them.
-#define MB_TYPE_INTRA4x4 0x0001
-#define MB_TYPE_INTRA16x16 0x0002 //FIXME H.264-specific
-#define MB_TYPE_INTRA_PCM 0x0004 //FIXME H.264-specific
-#define MB_TYPE_16x16 0x0008
-#define MB_TYPE_16x8 0x0010
-#define MB_TYPE_8x16 0x0020
-#define MB_TYPE_8x8 0x0040
-#define MB_TYPE_INTERLACED 0x0080
-#define MB_TYPE_DIRECT2 0x0100 //FIXME
-#define MB_TYPE_ACPRED 0x0200
-#define MB_TYPE_GMC 0x0400
-#define MB_TYPE_SKIP 0x0800
-#define MB_TYPE_P0L0 0x1000
-#define MB_TYPE_P1L0 0x2000
-#define MB_TYPE_P0L1 0x4000
-#define MB_TYPE_P1L1 0x8000
-#define MB_TYPE_L0 (MB_TYPE_P0L0 | MB_TYPE_P1L0)
-#define MB_TYPE_L1 (MB_TYPE_P0L1 | MB_TYPE_P1L1)
-#define MB_TYPE_L0L1 (MB_TYPE_L0 | MB_TYPE_L1)
-#define MB_TYPE_QUANT 0x00010000
-#define MB_TYPE_CBP 0x00020000
-// Note bits 24-31 are reserved for codec specific use (H.264 ref0, MPEG-1 0mv, ...)
-#endif
+#define AV_CODEC_CAP_HYBRID (1 << 19)
/**
* Pan Scan area.
* This specifies the area which should be displayed.
* Note there may be multiple such areas for one frame.
*/
-typedef struct AVPanScan{
+typedef struct AVPanScan {
/**
* id
* - encoding: Set by user.
@@ -1344,7 +1089,7 @@ typedef struct AVPanScan{
* - decoding: Set by libavcodec.
*/
int16_t position[3][2];
-}AVPanScan;
+} AVPanScan;
/**
* This structure describes the bitrate properties of an encoded bitstream. It
@@ -1384,13 +1129,6 @@ typedef struct AVCPBProperties {
uint64_t vbv_delay;
} AVCPBProperties;
-#if FF_API_QSCALE_TYPE
-#define FF_QSCALE_TYPE_MPEG1 0
-#define FF_QSCALE_TYPE_MPEG2 1
-#define FF_QSCALE_TYPE_H264 2
-#define FF_QSCALE_TYPE_VP56 3
-#endif
-
/**
* The decoder will keep a reference to the frame and may reuse it later.
*/
@@ -1518,7 +1256,7 @@ enum AVPacketSideDataType {
* u8 reason for end skip (0=padding silence, 1=convergence)
* @endcode
*/
- AV_PKT_DATA_SKIP_SAMPLES=70,
+ AV_PKT_DATA_SKIP_SAMPLES,
/**
* An AV_PKT_DATA_JP_DUALMONO side data packet indicates that
@@ -1607,7 +1345,20 @@ enum AVPacketSideDataType {
AV_PKT_DATA_A53_CC,
/**
- * The number of side data elements (in fact a bit more than it).
+ * This side data is encryption initialization data.
+ * The format is not part of ABI, use av_encryption_init_info_* methods to
+ * access.
+ */
+ AV_PKT_DATA_ENCRYPTION_INIT_INFO,
+
+ /**
+ * This side data contains encryption info for how to decrypt the packet.
+ * The format is not part of ABI, use av_encryption_info_* methods to access.
+ */
+ AV_PKT_DATA_ENCRYPTION_INFO,
+
+ /**
+ * The number of side data types.
* This is not part of the public API/ABI in the sense that it may
* change when new side data types are added.
* This must stay the last enum value.
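
/*
 * Not part of the patch: a sketch of probing for the new encryption side
 * data from the consumer side. The layout is explicitly not ABI, so real
 * code would pass the bytes to the av_encryption_info_* helpers in libavutil
 * rather than inspect them directly.
 */
static int packet_is_encrypted(AVPacket *pkt)
{
    int side_size = 0;
    return av_packet_get_side_data(pkt, AV_PKT_DATA_ENCRYPTION_INFO,
                                   &side_size) != NULL;
}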
@@ -1723,6 +1474,12 @@ typedef struct AVPacket {
* outside the packet may be followed.
*/
#define AV_PKT_FLAG_TRUSTED 0x0008
+/**
+ * The packet contains frames that can be discarded by the decoder,
+ * i.e. non-reference frames.
+ */
+#define AV_PKT_FLAG_DISPOSABLE 0x0010
+
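
/*
 * Illustration only: a playback loop that has fallen behind can shed
 * packets carrying the new flag, since no later frame references them.
 */
static int maybe_drop_packet(AVPacket *pkt, int decoder_is_late)
{
    if (decoder_is_late && (pkt->flags & AV_PKT_FLAG_DISPOSABLE)) {
        av_packet_unref(pkt); /* skip it instead of decoding */
        return 1;
    }
    return 0;
}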
enum AVSideDataParamChangeFlags {
AV_SIDE_DATA_PARAM_CHANGE_CHANNEL_COUNT = 0x0001,
@@ -1768,13 +1525,6 @@ typedef struct AVCodecContext {
enum AVMediaType codec_type; /* see AVMEDIA_TYPE_xxx */
const struct AVCodec *codec;
-#if FF_API_CODEC_NAME
- /**
- * @deprecated this field is not used for anything in libavcodec
- */
- attribute_deprecated
- char codec_name[32];
-#endif
enum AVCodecID codec_id; /* see AV_CODEC_ID_xxx */
/**
@@ -1792,14 +1542,6 @@ typedef struct AVCodecContext {
*/
unsigned int codec_tag;
-#if FF_API_STREAM_CODEC_TAG
- /**
- * @deprecated this field is unused
- */
- attribute_deprecated
- unsigned int stream_codec_tag;
-#endif
-
void *priv_data;
/**
@@ -1962,10 +1704,6 @@ typedef struct AVCodecContext {
*/
int coded_width, coded_height;
-#if FF_API_ASPECT_EXTENDED
-#define FF_ASPECT_EXTENDED 15
-#endif
-
/**
* the number of pictures in a group of pictures, or 0 for intra_only
* - encoding: Set by user.
@@ -1988,14 +1726,6 @@ typedef struct AVCodecContext {
*/
enum AVPixelFormat pix_fmt;
-#if FF_API_MOTION_EST
- /**
- * This option does nothing
- * @deprecated use codec private options instead
- */
- attribute_deprecated int me_method;
-#endif
-
/**
* If non NULL, 'draw_horiz_band' is called by the libavcodec
* decoder to draw a horizontal band. It improves cache usage. Not
@@ -2055,12 +1785,6 @@ typedef struct AVCodecContext {
*/
float b_quant_factor;
-#if FF_API_RC_STRATEGY
- /** @deprecated use codec private option instead */
- attribute_deprecated int rc_strategy;
-#define FF_RC_STRATEGY_XVID 1
-#endif
-
#if FF_API_PRIVATE_OPT
/** @deprecated use encoder private options instead */
attribute_deprecated
@@ -2254,26 +1978,6 @@ typedef struct AVCodecContext {
*/
int me_subpel_quality;
-#if FF_API_AFD
- /**
- * DTG active format information (additional aspect ratio
- * information only used in DVB MPEG-2 transport streams)
- * 0 if not set.
- *
- * - encoding: unused
- * - decoding: Set by decoder.
- * @deprecated Deprecated in favor of AVSideData
- */
- attribute_deprecated int dtg_active_format;
-#define FF_DTG_AFD_SAME 8
-#define FF_DTG_AFD_4_3 9
-#define FF_DTG_AFD_16_9 10
-#define FF_DTG_AFD_14_9 11
-#define FF_DTG_AFD_4_3_SP_14_9 13
-#define FF_DTG_AFD_16_9_SP_14_9 14
-#define FF_DTG_AFD_SP_4_3 15
-#endif /* FF_API_AFD */
-
/**
* maximum motion estimation search range in subpel units
* If 0 then no limit.
@@ -2283,19 +1987,6 @@ typedef struct AVCodecContext {
*/
int me_range;
-#if FF_API_QUANT_BIAS
- /**
- * @deprecated use encoder private option instead
- */
- attribute_deprecated int intra_quant_bias;
-#define FF_DEFAULT_QUANT_BIAS 999999
-
- /**
- * @deprecated use encoder private option instead
- */
- attribute_deprecated int inter_quant_bias;
-#endif
-
/**
* slice flags
* - encoding: unused
@@ -2306,16 +1997,6 @@ typedef struct AVCodecContext {
#define SLICE_FLAG_ALLOW_FIELD 0x0002 ///< allow draw_horiz_band() with field slices (MPEG-2 field pics)
#define SLICE_FLAG_ALLOW_PLANE 0x0004 ///< allow draw_horiz_band() with 1 component at a time (SVQ1)
-#if FF_API_XVMC
- /**
- * XVideo Motion Acceleration
- * - encoding: forbidden
- * - decoding: set by decoder
- * @deprecated XvMC doesn't need it anymore.
- */
- attribute_deprecated int xvmc_acceleration;
-#endif /* FF_API_XVMC */
-
/**
* macroblock decision mode
* - encoding: Set by user.
@@ -2350,20 +2031,6 @@ typedef struct AVCodecContext {
int noise_reduction;
#endif
-#if FF_API_MPV_OPT
- /**
- * @deprecated this field is unused
- */
- attribute_deprecated
- int me_threshold;
-
- /**
- * @deprecated this field is unused
- */
- attribute_deprecated
- int mb_threshold;
-#endif
-
/**
* precision of the intra DC coefficient - 8
* - encoding: Set by user.
@@ -2385,14 +2052,6 @@ typedef struct AVCodecContext {
*/
int skip_bottom;
-#if FF_API_MPV_OPT
- /**
- * @deprecated use encoder private options instead
- */
- attribute_deprecated
- float border_masking;
-#endif
-
/**
* minimum MB Lagrange multiplier
* - encoding: Set by user.
@@ -2447,15 +2106,6 @@ typedef struct AVCodecContext {
int chromaoffset;
#endif
-#if FF_API_UNUSED_MEMBERS
- /**
- * Multiplied by qscale for each frame and added to scene_change_score.
- * - encoding: Set by user.
- * - decoding: unused
- */
- attribute_deprecated int scenechange_factor;
-#endif
-
/**
* Note: Value depends upon the compare function used for fullpel ME.
* - encoding: Set by user.
@@ -2718,19 +2368,6 @@ typedef struct AVCodecContext {
*/
int max_qdiff;
-#if FF_API_MPV_OPT
- /**
- * @deprecated use encoder private options instead
- */
- attribute_deprecated
- float rc_qsquish;
-
- attribute_deprecated
- float rc_qmod_amp;
- attribute_deprecated
- int rc_qmod_freq;
-#endif
-
/**
* decoder bitstream buffer size
* - encoding: Set by user.
@@ -2746,14 +2383,6 @@ typedef struct AVCodecContext {
int rc_override_count;
RcOverride *rc_override;
-#if FF_API_MPV_OPT
- /**
- * @deprecated use encoder private options instead
- */
- attribute_deprecated
- const char *rc_eq;
-#endif
-
/**
* maximum bitrate
* - encoding: Set by user.
@@ -2768,17 +2397,6 @@ typedef struct AVCodecContext {
*/
int64_t rc_min_rate;
-#if FF_API_MPV_OPT
- /**
- * @deprecated use encoder private options instead
- */
- attribute_deprecated
- float rc_buffer_aggressivity;
-
- attribute_deprecated
- float rc_initial_cplx;
-#endif
-
/**
* Ratecontrol attempt to use, at maximum, <value> of what can be used without an underflow.
* - encoding: Set by user.
@@ -2805,9 +2423,6 @@ typedef struct AVCodecContext {
#define FF_CODER_TYPE_AC 1
#define FF_CODER_TYPE_RAW 2
#define FF_CODER_TYPE_RLE 3
-#if FF_API_UNUSED_MEMBERS
-#define FF_CODER_TYPE_DEFLATE 4
-#endif /* FF_API_UNUSED_MEMBERS */
/**
* @deprecated use encoder private options instead
*/
@@ -2821,20 +2436,6 @@ typedef struct AVCodecContext {
int context_model;
#endif
-#if FF_API_MPV_OPT
- /**
- * @deprecated use encoder private options instead
- */
- attribute_deprecated
- int lmin;
-
- /**
- * @deprecated use encoder private options instead
- */
- attribute_deprecated
- int lmax;
-#endif
-
#if FF_API_PRIVATE_OPT
/** @deprecated use encoder private options instead */
attribute_deprecated
@@ -2945,16 +2546,10 @@ typedef struct AVCodecContext {
*/
int workaround_bugs;
#define FF_BUG_AUTODETECT 1 ///< autodetection
-#if FF_API_OLD_MSMPEG4
-#define FF_BUG_OLD_MSMPEG4 2
-#endif
#define FF_BUG_XVID_ILACE 4
#define FF_BUG_UMP4 8
#define FF_BUG_NO_PADDING 16
#define FF_BUG_AMV 32
-#if FF_API_AC_VLC
-#define FF_BUG_AC_VLC 0 ///< Will be removed, libavcodec can now handle these non-compliant files by default.
-#endif
#define FF_BUG_QPEL_CHROMA 64
#define FF_BUG_STD_QPEL 128
#define FF_BUG_QPEL_CHROMA2 256
@@ -3015,9 +2610,6 @@ typedef struct AVCodecContext {
#define FF_DEBUG_DCT_COEFF 0x00000040
#define FF_DEBUG_SKIP 0x00000080
#define FF_DEBUG_STARTCODE 0x00000100
-#if FF_API_UNUSED_MEMBERS
-#define FF_DEBUG_PTS 0x00000200
-#endif /* FF_API_UNUSED_MEMBERS */
#define FF_DEBUG_ER 0x00000400
#define FF_DEBUG_MMCO 0x00000800
#define FF_DEBUG_BUGS 0x00001000
@@ -3079,7 +2671,7 @@ typedef struct AVCodecContext {
* - encoding: unused.
* - decoding: Set by libavcodec
*/
- struct AVHWAccel *hwaccel;
+ const struct AVHWAccel *hwaccel;
/**
* Hardware accelerator context.
@@ -3125,27 +2717,12 @@ typedef struct AVCodecContext {
#define FF_IDCT_SIMPLEMMX 3
#define FF_IDCT_ARM 7
#define FF_IDCT_ALTIVEC 8
-#if FF_API_ARCH_SH4
-#define FF_IDCT_SH4 9
-#endif
#define FF_IDCT_SIMPLEARM 10
-#if FF_API_UNUSED_MEMBERS
-#define FF_IDCT_IPP 13
-#endif /* FF_API_UNUSED_MEMBERS */
#define FF_IDCT_XVID 14
-#if FF_API_IDCT_XVIDMMX
-#define FF_IDCT_XVIDMMX 14
-#endif /* FF_API_IDCT_XVIDMMX */
#define FF_IDCT_SIMPLEARMV5TE 16
#define FF_IDCT_SIMPLEARMV6 17
-#if FF_API_ARCH_SPARC
-#define FF_IDCT_SIMPLEVIS 18
-#endif
#define FF_IDCT_FAAN 20
#define FF_IDCT_SIMPLENEON 22
-#if FF_API_ARCH_ALPHA
-#define FF_IDCT_SIMPLEALPHA 23
-#endif
#define FF_IDCT_NONE 24 /* Used by XvMC to extract IDCT coefficients with FF_IDCT_PERM_NONE */
#define FF_IDCT_SIMPLEAUTO 128
@@ -3356,6 +2933,18 @@ typedef struct AVCodecContext {
#define FF_PROFILE_HEVC_MAIN_STILL_PICTURE 3
#define FF_PROFILE_HEVC_REXT 4
+#define FF_PROFILE_AV1_MAIN 0
+#define FF_PROFILE_AV1_HIGH 1
+#define FF_PROFILE_AV1_PROFESSIONAL 2
+
+#define FF_PROFILE_MJPEG_HUFFMAN_BASELINE_DCT 0xc0
+#define FF_PROFILE_MJPEG_HUFFMAN_EXTENDED_SEQUENTIAL_DCT 0xc1
+#define FF_PROFILE_MJPEG_HUFFMAN_PROGRESSIVE_DCT 0xc2
+#define FF_PROFILE_MJPEG_HUFFMAN_LOSSLESS 0xc3
+#define FF_PROFILE_MJPEG_JPEG_LS 0xf7
+
+#define FF_PROFILE_SBC_MSBC 1
+
/**
* level
* - encoding: Set by user.
@@ -3396,15 +2985,6 @@ typedef struct AVCodecContext {
uint8_t *subtitle_header;
int subtitle_header_size;
-#if FF_API_ERROR_RATE
- /**
- * @deprecated use the 'error_rate' private AVOption of the mpegvideo
- * encoders
- */
- attribute_deprecated
- int error_rate;
-#endif
-
#if FF_API_VBV_DELAY
/**
* VBV delay coded in the last frame (in periods of a 27 MHz clock).
@@ -3516,6 +3096,7 @@ typedef struct AVCodecContext {
#define FF_SUB_CHARENC_MODE_DO_NOTHING -1 ///< do nothing (demuxer outputs a stream supposed to be already in UTF-8, or the codec is bitmap for instance)
#define FF_SUB_CHARENC_MODE_AUTOMATIC 0 ///< libavcodec will select the mode itself
#define FF_SUB_CHARENC_MODE_PRE_DECODER 1 ///< the AVPacket data needs to be recoded to UTF-8 before being fed to the decoder, requires iconv
+#define FF_SUB_CHARENC_MODE_IGNORE 2 ///< neither convert the subtitles, nor check them for valid UTF-8
/**
* Skip processing alpha if supported by codec.
@@ -3702,24 +3283,57 @@ typedef struct AVCodecContext {
* (with the display dimensions being determined by the crop_* fields).
*/
int apply_cropping;
+
+    /**
+ * Video decoding only. Sets the number of extra hardware frames which
+ * the decoder will allocate for use by the caller. This must be set
+ * before avcodec_open2() is called.
+ *
+ * Some hardware decoders require all frames that they will use for
+ * output to be defined in advance before decoding starts. For such
+ * decoders, the hardware frame pool must therefore be of a fixed size.
+ * The extra frames set here are on top of any number that the decoder
+ * needs internally in order to operate normally (for example, frames
+ * used as reference pictures).
+ */
+ int extra_hw_frames;
} AVCodecContext;
+#if FF_API_CODEC_GET_SET
+/**
+ * Accessors for some AVCodecContext fields. These used to be provided for ABI
+ * compatibility, and do not need to be used anymore.
+ */
+attribute_deprecated
AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx);
+attribute_deprecated
void av_codec_set_pkt_timebase (AVCodecContext *avctx, AVRational val);
+attribute_deprecated
const AVCodecDescriptor *av_codec_get_codec_descriptor(const AVCodecContext *avctx);
+attribute_deprecated
void av_codec_set_codec_descriptor(AVCodecContext *avctx, const AVCodecDescriptor *desc);
+attribute_deprecated
unsigned av_codec_get_codec_properties(const AVCodecContext *avctx);
+#if FF_API_LOWRES
+attribute_deprecated
int av_codec_get_lowres(const AVCodecContext *avctx);
+attribute_deprecated
void av_codec_set_lowres(AVCodecContext *avctx, int val);
+#endif
+attribute_deprecated
int av_codec_get_seek_preroll(const AVCodecContext *avctx);
+attribute_deprecated
void av_codec_set_seek_preroll(AVCodecContext *avctx, int val);
+attribute_deprecated
uint16_t *av_codec_get_chroma_intra_matrix(const AVCodecContext *avctx);
+attribute_deprecated
void av_codec_set_chroma_intra_matrix(AVCodecContext *avctx, uint16_t *val);
+#endif
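
/*
 * Sketch of the migration the note above implies: with the accessors
 * deprecated, callers read and write the corresponding AVCodecContext
 * fields directly (pkt_timebase shown here as one example).
 */
static void set_stream_timebase(AVCodecContext *avctx, AVRational tb)
{
    /* was: av_codec_set_pkt_timebase(avctx, tb); */
    avctx->pkt_timebase = tb;
}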
/**
* AVProfile.
@@ -3729,6 +3343,61 @@ typedef struct AVProfile {
const char *name; ///< short name for the profile
} AVProfile;
+enum {
+ /**
+ * The codec supports this format via the hw_device_ctx interface.
+ *
+ * When selecting this format, AVCodecContext.hw_device_ctx should
+ * have been set to a device of the specified type before calling
+ * avcodec_open2().
+ */
+ AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX = 0x01,
+ /**
+ * The codec supports this format via the hw_frames_ctx interface.
+ *
+ * When selecting this format for a decoder,
+ * AVCodecContext.hw_frames_ctx should be set to a suitable frames
+ * context inside the get_format() callback. The frames context
+ * must have been created on a device of the specified type.
+ */
+ AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX = 0x02,
+ /**
+ * The codec supports this format by some internal method.
+ *
+ * This format can be selected without any additional configuration -
+ * no device or frames context is required.
+ */
+ AV_CODEC_HW_CONFIG_METHOD_INTERNAL = 0x04,
+ /**
+ * The codec supports this format by some ad-hoc method.
+ *
+ * Additional settings and/or function calls are required. See the
+ * codec-specific documentation for details. (Methods requiring
+ * this sort of configuration are deprecated and others should be
+ * used in preference.)
+ */
+ AV_CODEC_HW_CONFIG_METHOD_AD_HOC = 0x08,
+};
+
+typedef struct AVCodecHWConfig {
+ /**
+ * A hardware pixel format which the codec can use.
+ */
+ enum AVPixelFormat pix_fmt;
+ /**
+ * Bit set of AV_CODEC_HW_CONFIG_METHOD_* flags, describing the possible
+ * setup methods which can be used with this configuration.
+ */
+ int methods;
+ /**
+ * The device type associated with the configuration.
+ *
+ * Must be set for AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX and
+ * AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX, otherwise unused.
+ */
+ enum AVHWDeviceType device_type;
+} AVCodecHWConfig;
+
typedef struct AVCodecDefault AVCodecDefault;
struct AVSubtitle;
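
/*
 * A sketch (not from the patch) of the intended use of these descriptors:
 * walk a decoder's configurations with avcodec_get_hw_config(), declared
 * further down in this header, and open a matching device through the
 * hw_device_ctx method. Error handling is reduced to the bare minimum.
 */
static int attach_hw_device(AVCodecContext *avctx, const AVCodec *codec,
                            enum AVHWDeviceType type)
{
    int i;
    for (i = 0;; i++) {
        const AVCodecHWConfig *cfg = avcodec_get_hw_config(codec, i);
        if (!cfg)
            return AVERROR(ENOSYS); /* no config for this device type */
        if ((cfg->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) &&
            cfg->device_type == type)
            break;
    }
    /* av_hwdevice_ctx_create() comes from libavutil/hwcontext.h, which this
     * header now includes. This must happen before avcodec_open2(). */
    return av_hwdevice_ctx_create(&avctx->hw_device_ctx, type, NULL, NULL, 0);
}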
@@ -3765,6 +3434,18 @@ typedef struct AVCodec {
const AVClass *priv_class; ///< AVClass for the private context
const AVProfile *profiles; ///< array of recognized profiles, or NULL if unknown, array is terminated by {FF_PROFILE_UNKNOWN}
+ /**
+ * Group name of the codec implementation.
+ * This is a short symbolic name of the wrapper backing this codec. A
+ * wrapper uses some kind of external implementation for the codec, such
+ * as an external library, or a codec implementation provided by the OS or
+ * the hardware.
+ * If this field is NULL, this is a builtin, libavcodec native codec.
+ * If non-NULL, this will be the suffix in AVCodec.name in most cases
+ * (usually AVCodec.name will be of the form "<codec_name>_<wrapper_name>").
+ */
+ const char *wrapper_name;
+
/*****************************************************************
* No fields below this line are part of the public API. They
* may not be used outside of libavcodec and can be changed and
@@ -3801,6 +3482,9 @@ typedef struct AVCodec {
/**
* Initialize codec static data, called from avcodec_register().
+ *
+ * This is not intended for time consuming operations as it is
+ * run for every codec regardless of that codec being used.
*/
void (*init_static_data)(struct AVCodec *codec);
@@ -3854,14 +3538,39 @@ typedef struct AVCodec {
* packets before decoding.
*/
const char *bsfs;
+
+ /**
+ * Array of pointers to hardware configurations supported by the codec,
+ * or NULL if no hardware supported. The array is terminated by a NULL
+ * pointer.
+ *
+ * The user can only access this field via avcodec_get_hw_config().
+ */
+ const struct AVCodecHWConfigInternal **hw_configs;
} AVCodec;
+#if FF_API_CODEC_GET_SET
+attribute_deprecated
int av_codec_get_max_lowres(const AVCodec *codec);
+#endif
struct MpegEncContext;
/**
+ * Retrieve supported hardware configurations for a codec.
+ *
+ * Values of index from zero to some maximum return the indexed configuration
+ * descriptor; all other values return NULL. If the codec does not support
+ * any hardware configurations then it will always return NULL.
+ */
+const AVCodecHWConfig *avcodec_get_hw_config(const AVCodec *codec, int index);
+
+/**
* @defgroup lavc_hwaccel AVHWAccel
+ *
+ * @note Nothing in this structure should be accessed by the user. At some
+ * point in future it will not be externally visible at all.
+ *
* @{
*/
typedef struct AVHWAccel {
@@ -3906,7 +3615,6 @@ typedef struct AVHWAccel {
* New public fields should be added right above.
*****************************************************************
*/
- struct AVHWAccel *next;
/**
* Allocate a custom buffer
@@ -3930,6 +3638,20 @@ typedef struct AVHWAccel {
int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
/**
+ * Callback for parameter data (SPS/PPS/VPS etc).
+ *
+ * Useful for hardware decoders which keep persistent state about the
+ * video parameters, and need to receive any changes to update that state.
+ *
+ * @param avctx the codec context
+ * @param type the nal unit type
+ * @param buf the nal unit data buffer
+ * @param buf_size the size of the nal unit in bytes
+ * @return zero if successful, a negative value otherwise
+ */
+ int (*decode_params)(AVCodecContext *avctx, int type, const uint8_t *buf, uint32_t buf_size);
+
+ /**
* Callback for each slice.
*
* Meaningful slice information (codec specific) is guaranteed to
@@ -4001,6 +3723,16 @@ typedef struct AVHWAccel {
* Internal hwaccel capabilities.
*/
int caps_internal;
+
+ /**
+ * Fill the given hw_frames context with current codec parameters. Called
+ * from get_format. Refer to avcodec_get_hw_frames_parameters() for
+ * details.
+ *
+ * This CAN be called before AVHWAccel.init is called, and you must assume
+ * that avctx->hwaccel_priv_data is invalid.
+ */
+ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
} AVHWAccel;
/**
@@ -4293,11 +4025,25 @@ typedef struct AVCodecParameters {
} AVCodecParameters;
/**
+ * Iterate over all registered codecs.
+ *
+ * @param opaque a pointer where libavcodec will store the iteration state. Must
+ * point to NULL to start the iteration.
+ *
+ * @return the next registered codec or NULL when the iteration is
+ * finished
+ */
+const AVCodec *av_codec_iterate(void **opaque);
+
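A minimal sketch (not part of the patch) of the replacement iteration pattern: enumerating all registered decoders with av_codec_iterate() instead of the deprecated av_codec_next(). The function name list_decoders is hypothetical.

#include <stdio.h>
#include <libavcodec/avcodec.h>

/* Print the names of all registered decoders via the new iteration API. */
static void list_decoders(void)
{
    void *iter = NULL;                 /* opaque iteration state, must start at NULL */
    const AVCodec *codec;

    while ((codec = av_codec_iterate(&iter))) {
        if (av_codec_is_decoder(codec))
            printf("%s\n", codec->name);
    }
}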
+#if FF_API_NEXT
+/**
* If c is NULL, returns the first registered codec,
* if c is non-NULL, returns the next registered codec after c,
* or NULL if c is the last one.
*/
+attribute_deprecated
AVCodec *av_codec_next(const AVCodec *c);
+#endif
/**
* Return the LIBAVCODEC_VERSION_INT constant.
@@ -4314,6 +4060,7 @@ const char *avcodec_configuration(void);
*/
const char *avcodec_license(void);
+#if FF_API_NEXT
/**
* Register the codec codec and initialize libavcodec.
*
@@ -4322,6 +4069,7 @@ const char *avcodec_license(void);
*
* @see avcodec_register_all()
*/
+attribute_deprecated
void avcodec_register(AVCodec *codec);
/**
@@ -4334,7 +4082,9 @@ void avcodec_register(AVCodec *codec);
* @see av_register_codec_parser
* @see av_register_bitstream_filter
*/
+attribute_deprecated
void avcodec_register_all(void);
+#endif
/**
* Allocate an AVCodecContext and set its fields to default values. The
@@ -4615,7 +4365,7 @@ int av_packet_from_data(AVPacket *pkt, uint8_t *data, int size);
* @warning This is a hack - the packet memory allocation stuff is broken. The
* packet is allocated if it was not really allocated.
*
- * @deprecated Use av_packet_ref
+ * @deprecated Use av_packet_ref or av_packet_make_refcounted
*/
attribute_deprecated
int av_dup_packet(AVPacket *pkt);
@@ -4787,6 +4537,33 @@ void av_packet_move_ref(AVPacket *dst, AVPacket *src);
int av_packet_copy_props(AVPacket *dst, const AVPacket *src);
/**
+ * Ensure the data described by a given packet is reference counted.
+ *
+ * @note This function does not ensure that the reference will be writable.
+ * Use av_packet_make_writable instead for that purpose.
+ *
+ * @see av_packet_ref
+ * @see av_packet_make_writable
+ *
+ * @param pkt packet whose data should be made reference counted.
+ *
+ * @return 0 on success, a negative AVERROR on error. On failure, the
+ * packet is unchanged.
+ */
+int av_packet_make_refcounted(AVPacket *pkt);
+
+/**
+ * Create a writable reference for the data described by a given packet,
+ * avoiding data copy if possible.
+ *
+ * @param pkt Packet whose data should be made writable.
+ *
+ * @return 0 on success, a negative AVERROR on failure. On failure, the
+ * packet is unchanged.
+ */
+int av_packet_make_writable(AVPacket *pkt);
+
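An illustrative sketch, not taken from the patch: wrapping a caller-owned buffer in an AVPacket and then calling av_packet_make_refcounted() so that later consumers can safely keep a reference. The helper name wrap_buffer and its parameters are assumptions.

#include <stdint.h>
#include <libavcodec/avcodec.h>

/* Wrap a caller-owned buffer in an AVPacket and make the data reference
 * counted (copied into an AVBuffer owned by the packet). */
static int wrap_buffer(AVPacket *pkt, const uint8_t *data, int size)
{
    av_init_packet(pkt);
    pkt->data = (uint8_t *)data;           /* not refcounted yet: pkt->buf == NULL */
    pkt->size = size;

    return av_packet_make_refcounted(pkt); /* allocates pkt->buf and copies the data */
}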
+/**
* Convert valid timing fields (timestamps / durations) in a packet from one
* timebase to another. Timestamps with unknown values (AV_NOPTS_VALUE) will be
* ignored.
@@ -4831,21 +4608,6 @@ AVCodec *avcodec_find_decoder_by_name(const char *name);
*/
int avcodec_default_get_buffer2(AVCodecContext *s, AVFrame *frame, int flags);
-#if FF_API_EMU_EDGE
-/**
- * Return the amount of padding in pixels which the get_buffer callback must
- * provide around the edge of the image for codecs which do not have the
- * CODEC_FLAG_EMU_EDGE flag.
- *
- * @return Required padding in pixels.
- *
- * @deprecated CODEC_FLAG_EMU_EDGE is deprecated, so this function is no longer
- * needed
- */
-attribute_deprecated
-unsigned avcodec_get_edge_width(void);
-#endif
-
/**
* Modify width and height values so that they will result in a memory
* buffer that is acceptable for the codec if you do not use any horizontal
@@ -5151,6 +4913,109 @@ int avcodec_send_frame(AVCodecContext *avctx, const AVFrame *frame);
*/
int avcodec_receive_packet(AVCodecContext *avctx, AVPacket *avpkt);
+/**
+ * Create and return a AVHWFramesContext with values adequate for hardware
+ * decoding. This is meant to get called from the get_format callback, and is
+ * a helper for preparing a AVHWFramesContext for AVCodecContext.hw_frames_ctx.
+ * This API is for decoding with certain hardware acceleration modes/APIs only.
+ *
+ * The returned AVHWFramesContext is not initialized. The caller must do this
+ * with av_hwframe_ctx_init().
+ *
+ * Calling this function is not a requirement, but makes it simpler to avoid
+ * codec or hardware API specific details when manually allocating frames.
+ *
+ * Alternatively, an API user can set AVCodecContext.hw_device_ctx, which
+ * sets up AVCodecContext.hw_frames_ctx fully automatically and makes it
+ * unnecessary to call this function or to care about AVHWFramesContext
+ * initialization at all.
+ *
+ * There are a number of requirements for calling this function:
+ *
+ * - It must be called from get_format with the same avctx parameter that was
+ * passed to get_format. Calling it outside of get_format is not allowed, and
+ * can trigger undefined behavior.
+ * - The function is not always supported (see description of return values).
+ * Even if this function returns successfully, hwaccel initialization could
+ * fail later. (The degree to which implementations check whether the stream
+ * is actually supported varies. Some do this check only after the user's
+ * get_format callback returns.)
+ * - The hw_pix_fmt must be one of the choices suggested by get_format. If the
+ * user decides to use a AVHWFramesContext prepared with this API function,
+ * the user must return the same hw_pix_fmt from get_format.
+ * - The device_ref passed to this function must support the given hw_pix_fmt.
+ * - After calling this API function, it is the user's responsibility to
+ * initialize the AVHWFramesContext (returned by the out_frames_ref parameter),
+ * and to set AVCodecContext.hw_frames_ctx to it. If done, this must be done
+ * before returning from get_format (this is implied by the normal
+ * AVCodecContext.hw_frames_ctx API rules).
+ * - The AVHWFramesContext parameters may change every time get_format is
+ * called. Also, AVCodecContext.hw_frames_ctx is reset before get_format. So
+ * you are inherently required to go through this process again on every
+ * get_format call.
+ * - It is perfectly possible to call this function without actually using
+ * the resulting AVHWFramesContext. One use-case might be trying to reuse a
+ * previously initialized AVHWFramesContext, and calling this API function
+ * only to test whether the required frame parameters have changed.
+ * - Fields that use dynamically allocated values of any kind must not be set
+ * by the user unless setting them is explicitly allowed by the documentation.
+ * If the user sets AVHWFramesContext.free and AVHWFramesContext.user_opaque,
+ * the new free callback must call the potentially set previous free callback.
+ * This API call may set any dynamically allocated fields, including the free
+ * callback.
+ *
+ * The function will set at least the following fields on AVHWFramesContext
+ * (potentially more, depending on hwaccel API):
+ *
+ * - All fields set by av_hwframe_ctx_alloc().
+ * - Set the format field to hw_pix_fmt.
+ * - Set the sw_format field to the most suited and most versatile format. (An
+ * implication is that this will prefer generic formats over opaque formats
+ * with arbitrary restrictions, if possible.)
+ * - Set the width/height fields to the coded frame size, rounded up to the
+ * API-specific minimum alignment.
+ * - Only _if_ the hwaccel requires a pre-allocated pool: set the initial_pool_size
+ *   field to the maximum number of reference surfaces possible with the codec,
+ *   plus 1 surface for the user to work with (meaning the user can safely reference
+ * at most 1 decoded surface at a time), plus additional buffering introduced
+ * by frame threading. If the hwaccel does not require pre-allocation, the
+ * field is left to 0, and the decoder will allocate new surfaces on demand
+ * during decoding.
+ * - Possibly AVHWFramesContext.hwctx fields, depending on the underlying
+ * hardware API.
+ *
+ * Essentially, out_frames_ref returns the same as av_hwframe_ctx_alloc(), but
+ * with basic frame parameters set.
+ *
+ * The function is stateless, and does not change the AVCodecContext or the
+ * device_ref AVHWDeviceContext.
+ *
+ * @param avctx The context which is currently calling get_format, and which
+ * implicitly contains all state needed for filling the returned
+ * AVHWFramesContext properly.
+ * @param device_ref A reference to the AVHWDeviceContext describing the device
+ * which will be used by the hardware decoder.
+ * @param hw_pix_fmt The hwaccel format you are going to return from get_format.
+ * @param out_frames_ref On success, set to a reference to an _uninitialized_
+ * AVHWFramesContext, created from the given device_ref.
+ * Fields will be set to values required for decoding.
+ * Not changed if an error is returned.
+ * @return zero on success, a negative value on error. The following error codes
+ * have special semantics:
+ * AVERROR(ENOENT): the decoder does not support this functionality. Setup
+ * is always manual, or it is a decoder which does not
+ * support setting AVCodecContext.hw_frames_ctx at all,
+ * or it is a software format.
+ * AVERROR(EINVAL): it is known that hardware decoding is not supported for
+ * this configuration, or the device_ref is not supported
+ * for the hwaccel referenced by hw_pix_fmt.
+ */
+int avcodec_get_hw_frames_parameters(AVCodecContext *avctx,
+ AVBufferRef *device_ref,
+ enum AVPixelFormat hw_pix_fmt,
+ AVBufferRef **out_frames_ref);
+
+
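The requirements above are easiest to see in context, so here is a hedged sketch (not part of the patch) of a get_format callback that uses this helper. It assumes the application passes its AVBufferRef device reference through avctx->opaque, and it uses AV_PIX_FMT_VAAPI purely as an example hwaccel format.

#include <libavcodec/avcodec.h>
#include <libavutil/hwcontext.h>

/* Sketch of a get_format callback preparing hw_frames_ctx with the helper. */
static enum AVPixelFormat get_hw_format(AVCodecContext *avctx,
                                        const enum AVPixelFormat *fmts)
{
    AVBufferRef *device_ref = avctx->opaque;   /* assumed: set by the application */
    const enum AVPixelFormat *p;

    for (p = fmts; *p != AV_PIX_FMT_NONE; p++) {
        AVBufferRef *frames_ref = NULL;

        if (*p != AV_PIX_FMT_VAAPI)            /* example hwaccel format only */
            continue;
        if (avcodec_get_hw_frames_parameters(avctx, device_ref, *p,
                                             &frames_ref) < 0)
            break;
        if (av_hwframe_ctx_init(frames_ref) < 0) {
            av_buffer_unref(&frames_ref);
            break;
        }
        avctx->hw_frames_ctx = frames_ref;     /* libavcodec now owns this reference */
        return *p;
    }
    /* Real code would usually fall back to a software format from fmts here. */
    return AV_PIX_FMT_NONE;
}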
/**
* @defgroup lavc_parsing Frame parsing
@@ -5346,8 +5211,21 @@ typedef struct AVCodecParser {
struct AVCodecParser *next;
} AVCodecParser;
+/**
+ * Iterate over all registered codec parsers.
+ *
+ * @param opaque a pointer where libavcodec will store the iteration state. Must
+ * point to NULL to start the iteration.
+ *
+ * @return the next registered codec parser or NULL when the iteration is
+ * finished
+ */
+const AVCodecParser *av_parser_iterate(void **opaque);
+
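For illustration only (not in the patch): the same opaque-state pattern applies to parsers, as in this small sketch that lists the codec IDs each registered parser handles.

#include <stdio.h>
#include <libavcodec/avcodec.h>

/* Enumerate registered parsers and the codec IDs they claim to handle. */
static void list_parsers(void)
{
    void *iter = NULL;
    const AVCodecParser *parser;

    while ((parser = av_parser_iterate(&iter))) {
        for (int i = 0; i < 5 && parser->codec_ids[i]; i++)
            printf("parser handles codec id %d\n", parser->codec_ids[i]);
    }
}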
+attribute_deprecated
AVCodecParser *av_parser_next(const AVCodecParser *c);
+attribute_deprecated
void av_register_codec_parser(AVCodecParser *parser);
AVCodecParserContext *av_parser_init(int codec_id);
@@ -5516,103 +5394,6 @@ int avcodec_encode_subtitle(AVCodecContext *avctx, uint8_t *buf, int buf_size,
* @}
*/
-#if FF_API_AVCODEC_RESAMPLE
-/**
- * @defgroup lavc_resample Audio resampling
- * @ingroup libavc
- * @deprecated use libswresample instead
- *
- * @{
- */
-struct ReSampleContext;
-struct AVResampleContext;
-
-typedef struct ReSampleContext ReSampleContext;
-
-/**
- * Initialize audio resampling context.
- *
- * @param output_channels number of output channels
- * @param input_channels number of input channels
- * @param output_rate output sample rate
- * @param input_rate input sample rate
- * @param sample_fmt_out requested output sample format
- * @param sample_fmt_in input sample format
- * @param filter_length length of each FIR filter in the filterbank relative to the cutoff frequency
- * @param log2_phase_count log2 of the number of entries in the polyphase filterbank
- * @param linear if 1 then the used FIR filter will be linearly interpolated
- between the 2 closest, if 0 the closest will be used
- * @param cutoff cutoff frequency, 1.0 corresponds to half the output sampling rate
- * @return allocated ReSampleContext, NULL if error occurred
- */
-attribute_deprecated
-ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
- int output_rate, int input_rate,
- enum AVSampleFormat sample_fmt_out,
- enum AVSampleFormat sample_fmt_in,
- int filter_length, int log2_phase_count,
- int linear, double cutoff);
-
-attribute_deprecated
-int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples);
-
-/**
- * Free resample context.
- *
- * @param s a non-NULL pointer to a resample context previously
- * created with av_audio_resample_init()
- */
-attribute_deprecated
-void audio_resample_close(ReSampleContext *s);
-
-
-/**
- * Initialize an audio resampler.
- * Note, if either rate is not an integer then simply scale both rates up so they are.
- * @param filter_length length of each FIR filter in the filterbank relative to the cutoff freq
- * @param log2_phase_count log2 of the number of entries in the polyphase filterbank
- * @param linear If 1 then the used FIR filter will be linearly interpolated
- between the 2 closest, if 0 the closest will be used
- * @param cutoff cutoff frequency, 1.0 corresponds to half the output sampling rate
- */
-attribute_deprecated
-struct AVResampleContext *av_resample_init(int out_rate, int in_rate, int filter_length, int log2_phase_count, int linear, double cutoff);
-
-/**
- * Resample an array of samples using a previously configured context.
- * @param src an array of unconsumed samples
- * @param consumed the number of samples of src which have been consumed are returned here
- * @param src_size the number of unconsumed samples available
- * @param dst_size the amount of space in samples available in dst
- * @param update_ctx If this is 0 then the context will not be modified, that way several channels can be resampled with the same context.
- * @return the number of samples written in dst or -1 if an error occurred
- */
-attribute_deprecated
-int av_resample(struct AVResampleContext *c, short *dst, short *src, int *consumed, int src_size, int dst_size, int update_ctx);
-
-
-/**
- * Compensate samplerate/timestamp drift. The compensation is done by changing
- * the resampler parameters, so no audible clicks or similar distortions occur
- * @param compensation_distance distance in output samples over which the compensation should be performed
- * @param sample_delta number of output samples which should be output less
- *
- * example: av_resample_compensate(c, 10, 500)
- * here instead of 510 samples only 500 samples would be output
- *
- * note, due to rounding the actual compensation might be slightly different,
- * especially if the compensation_distance is large and the in_rate used during init is small
- */
-attribute_deprecated
-void av_resample_compensate(struct AVResampleContext *c, int sample_delta, int compensation_distance);
-attribute_deprecated
-void av_resample_close(struct AVResampleContext *c);
-
-/**
- * @}
- */
-#endif
-
#if FF_API_AVPICTURE
/**
* @addtogroup lavc_picture
@@ -5753,14 +5534,6 @@ enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const en
* @}
*/
-#if FF_API_SET_DIMENSIONS
-/**
- * @deprecated this function is not supposed to be used from outside of lavc
- */
-attribute_deprecated
-void avcodec_set_dimensions(AVCodecContext *s, int width, int height);
-#endif
-
#if FF_API_TAG_STRING
/**
* Put a string representing the codec tag codec_tag in buf.
@@ -5997,84 +5770,42 @@ typedef struct AVBitStreamFilter {
#if FF_API_OLD_BSF
/**
- * Register a bitstream filter.
- *
- * The filter will be accessible to the application code through
- * av_bitstream_filter_next() or can be directly initialized with
- * av_bitstream_filter_init().
- *
- * @see avcodec_register_all()
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use the new bitstream filtering API (using AVBSFContext).
*/
attribute_deprecated
void av_register_bitstream_filter(AVBitStreamFilter *bsf);
-
/**
- * Create and initialize a bitstream filter context given a bitstream
- * filter name.
- *
- * The returned context must be freed with av_bitstream_filter_close().
- *
- * @param name the name of the bitstream filter
- * @return a bitstream filter context if a matching filter was found
- * and successfully initialized, NULL otherwise
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use av_bsf_get_by_name(), av_bsf_alloc(), and av_bsf_init()
+ * from the new bitstream filtering API (using AVBSFContext).
*/
attribute_deprecated
AVBitStreamFilterContext *av_bitstream_filter_init(const char *name);
-
/**
- * Filter bitstream.
- *
- * This function filters the buffer buf with size buf_size, and places the
- * filtered buffer in the buffer pointed to by poutbuf.
- *
- * The output buffer must be freed by the caller.
- *
- * @param bsfc bitstream filter context created by av_bitstream_filter_init()
- * @param avctx AVCodecContext accessed by the filter, may be NULL.
- * If specified, this must point to the encoder context of the
- * output stream the packet is sent to.
- * @param args arguments which specify the filter configuration, may be NULL
- * @param poutbuf pointer which is updated to point to the filtered buffer
- * @param poutbuf_size pointer which is updated to the filtered buffer size in bytes
- * @param buf buffer containing the data to filter
- * @param buf_size size in bytes of buf
- * @param keyframe set to non-zero if the buffer to filter corresponds to a key-frame packet data
- * @return >= 0 in case of success, or a negative error code in case of failure
- *
- * If the return value is positive, an output buffer is allocated and
- * is available in *poutbuf, and is distinct from the input buffer.
- *
- * If the return value is 0, the output buffer is not allocated and
- * should be considered identical to the input buffer, or in case
- * *poutbuf was set it points to the input buffer (not necessarily to
- * its starting address). A special case is if *poutbuf was set to NULL and
- * *poutbuf_size was set to 0, which indicates the packet should be dropped.
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use av_bsf_send_packet() and av_bsf_receive_packet() from the
+ * new bitstream filtering API (using AVBSFContext).
*/
attribute_deprecated
int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
AVCodecContext *avctx, const char *args,
uint8_t **poutbuf, int *poutbuf_size,
const uint8_t *buf, int buf_size, int keyframe);
-
/**
- * Release bitstream filter context.
- *
- * @param bsf the bitstream filter context created with
- * av_bitstream_filter_init(), can be NULL
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use av_bsf_free() from the new bitstream filtering API (using
+ * AVBSFContext).
*/
attribute_deprecated
void av_bitstream_filter_close(AVBitStreamFilterContext *bsf);
-
/**
- * If f is NULL, return the first registered bitstream filter,
- * if f is non-NULL, return the next registered bitstream filter
- * after f, or NULL if f is the last one.
- *
- * This function can be used to iterate over all registered bitstream
- * filters.
+ * @deprecated the old bitstream filtering API (using AVBitStreamFilterContext)
+ * is deprecated. Use av_bsf_iterate() from the new bitstream filtering API (using
+ * AVBSFContext).
*/
attribute_deprecated
-AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f);
+const AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f);
#endif
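Since the deprecated calls above now only point at the AVBSFContext API, a migration sketch may help. It is illustrative only (not part of the patch); the helper name filter_packet and its parameters are assumptions, and real code should handle AVERROR(EAGAIN)/AVERROR_EOF from av_bsf_receive_packet() in a loop.

#include <libavcodec/avcodec.h>

/* Run one packet through a named bitstream filter using the new API. */
static int filter_packet(const char *filter_name,
                         const AVCodecParameters *par,
                         AVPacket *in, AVPacket *out)
{
    const AVBitStreamFilter *f = av_bsf_get_by_name(filter_name);
    AVBSFContext *bsf = NULL;
    int ret;

    if (!f)
        return AVERROR_BSF_NOT_FOUND;
    if ((ret = av_bsf_alloc(f, &bsf)) < 0)
        return ret;
    if ((ret = avcodec_parameters_copy(bsf->par_in, par)) < 0 ||
        (ret = av_bsf_init(bsf)) < 0 ||
        (ret = av_bsf_send_packet(bsf, in)) < 0)     /* consumes *in */
        goto end;
    ret = av_bsf_receive_packet(bsf, out);           /* may return AVERROR(EAGAIN) or AVERROR_EOF */

end:
    av_bsf_free(&bsf);
    return ret;
}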
/**
@@ -6092,7 +5823,11 @@ const AVBitStreamFilter *av_bsf_get_by_name(const char *name);
* @return the next registered bitstream filter or NULL when the iteration is
* finished
*/
+const AVBitStreamFilter *av_bsf_iterate(void **opaque);
+#if FF_API_NEXT
+attribute_deprecated
const AVBitStreamFilter *av_bsf_next(void **opaque);
+#endif
/**
* Allocate a context for a given bitstream filter. The caller must fill in the
@@ -6282,51 +6017,32 @@ void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size);
*/
unsigned int av_xiphlacing(unsigned char *s, unsigned int v);
-#if FF_API_MISSING_SAMPLE
-/**
- * Log a generic warning message about a missing feature. This function is
- * intended to be used internally by FFmpeg (libavcodec, libavformat, etc.)
- * only, and would normally not be used by applications.
- * @param[in] avc a pointer to an arbitrary struct of which the first field is
- * a pointer to an AVClass struct
- * @param[in] feature string containing the name of the missing feature
- * @param[in] want_sample indicates if samples are wanted which exhibit this feature.
- * If want_sample is non-zero, additional verbiage will be added to the log
- * message which tells the user how to report samples to the development
- * mailing list.
- * @deprecated Use avpriv_report_missing_feature() instead.
- */
-attribute_deprecated
-void av_log_missing_feature(void *avc, const char *feature, int want_sample);
-
-/**
- * Log a generic warning message asking for a sample. This function is
- * intended to be used internally by FFmpeg (libavcodec, libavformat, etc.)
- * only, and would normally not be used by applications.
- * @param[in] avc a pointer to an arbitrary struct of which the first field is
- * a pointer to an AVClass struct
- * @param[in] msg string containing an optional message, or NULL if no message
- * @deprecated Use avpriv_request_sample() instead.
- */
-attribute_deprecated
-void av_log_ask_for_sample(void *avc, const char *msg, ...) av_printf_format(2, 3);
-#endif /* FF_API_MISSING_SAMPLE */
-
+#if FF_API_USER_VISIBLE_AVHWACCEL
/**
* Register the hardware accelerator hwaccel.
+ *
+ * @deprecated This function doesn't do anything.
*/
+attribute_deprecated
void av_register_hwaccel(AVHWAccel *hwaccel);
/**
* If hwaccel is NULL, returns the first registered hardware accelerator,
* if hwaccel is non-NULL, returns the next registered hardware accelerator
* after hwaccel, or NULL if hwaccel is the last one.
+ *
+ * @deprecated AVHWAccel structures contain no user-serviceable parts, so

+ * this function should not be used.
*/
+attribute_deprecated
AVHWAccel *av_hwaccel_next(const AVHWAccel *hwaccel);
+#endif
-
+#if FF_API_LOCKMGR
/**
* Lock operation used by lockmgr
+ *
+ * @deprecated Deprecated together with av_lockmgr_register().
*/
enum AVLockOp {
AV_LOCK_CREATE, ///< Create a mutex
@@ -6357,8 +6073,13 @@ enum AVLockOp {
* mechanism (i.e. do not use a single static object to
* implement your lock manager). If cb is set to NULL the
* lockmgr will be unregistered.
+ *
+ * @deprecated This function does nothing, and always returns 0. Be sure to
+ * build with thread support to get basic thread safety.
*/
+attribute_deprecated
int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op));
+#endif
/**
* Get the type of the given codec.
diff --git a/media/ffvpx/libavcodec/avcodec.symbols b/media/ffvpx/libavcodec/avcodec.symbols
index cf507f722..0b7c21cca 100644
--- a/media/ffvpx/libavcodec/avcodec.symbols
+++ b/media/ffvpx/libavcodec/avcodec.symbols
@@ -1,7 +1,3 @@
-av_bitstream_filter_close
-av_bitstream_filter_filter
-av_bitstream_filter_init
-av_bitstream_filter_next
av_codec_ffversion
av_codec_get_chroma_intra_matrix
av_codec_get_codec_descriptor
@@ -34,8 +30,6 @@ av_grow_packet
av_hwaccel_next
av_init_packet
av_lockmgr_register
-av_log_ask_for_sample
-av_log_missing_feature
av_new_packet
av_packet_copy_props
av_packet_free_side_data
@@ -57,17 +51,8 @@ av_parser_close
av_parser_init
av_parser_next
av_parser_parse2
-av_picture_copy
-av_picture_crop
-av_picture_pad
-av_qsv_alloc_context
-av_register_bitstream_filter
av_register_codec_parser
av_register_hwaccel
-av_resample
-av_resample_close
-av_resample_compensate
-av_resample_init
av_shrink_packet
av_vorbis_parse_frame
av_vorbis_parse_frame_flags
@@ -94,36 +79,23 @@ avcodec_descriptor_get_by_name
avcodec_descriptor_next
avcodec_enum_to_chroma_pos
avcodec_fill_audio_frame
-avcodec_find_best_pix_fmt2
-avcodec_find_best_pix_fmt_of_2
-avcodec_find_best_pix_fmt_of_list
avcodec_find_decoder
avcodec_find_decoder_by_name
avcodec_find_encoder
avcodec_find_encoder_by_name
avcodec_flush_buffers
avcodec_free_context
-avcodec_get_chroma_sub_sample
avcodec_get_class
avcodec_get_context_defaults3
-avcodec_get_edge_width
avcodec_get_frame_class
avcodec_get_name
-avcodec_get_pix_fmt_loss
avcodec_get_subtitle_rect_class
avcodec_get_type
avcodec_is_open
avcodec_license
avcodec_open2
-avcodec_pix_fmt_to_codec_tag
avcodec_register
avcodec_register_all
-avcodec_set_dimensions
avcodec_string
avcodec_version
-avpicture_alloc
-avpicture_fill
-avpicture_free
-avpicture_get_size
-avpicture_layout
avsubtitle_free
diff --git a/media/ffvpx/libavcodec/avpacket.c b/media/ffvpx/libavcodec/avpacket.c
index d1f4ea9eb..99a0c1383 100644
--- a/media/ffvpx/libavcodec/avpacket.c
+++ b/media/ffvpx/libavcodec/avpacket.c
@@ -479,34 +479,6 @@ int av_packet_split_side_data(AVPacket *pkt){
}
#endif
-#if FF_API_MERGE_SD
-int ff_packet_split_and_drop_side_data(AVPacket *pkt){
- if (!pkt->side_data_elems && pkt->size >12 && AV_RB64(pkt->data + pkt->size - 8) == FF_MERGE_MARKER){
- int i;
- unsigned int size;
- uint8_t *p;
-
- p = pkt->data + pkt->size - 8 - 5;
- for (i=1; ; i++){
- size = AV_RB32(p);
- if (size>INT_MAX - 5 || p - pkt->data < size)
- return 0;
- if (p[4]&128)
- break;
- if (p - pkt->data < size + 5)
- return 0;
- p-= size+5;
- if (i > AV_PKT_DATA_NB)
- return 0;
- }
- pkt->size = p - pkt->data - size;
- av_assert0(pkt->size >= 0);
- return 1;
- }
- return 0;
-}
-#endif
-
uint8_t *av_packet_pack_dictionary(AVDictionary *dict, int *size)
{
AVDictionaryEntry *t = NULL;
@@ -599,6 +571,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
dst->flags = src->flags;
dst->stream_index = src->stream_index;
+ dst->side_data = NULL;
+ dst->side_data_elems = 0;
for (i = 0; i < src->side_data_elems; i++) {
enum AVPacketSideDataType type = src->side_data[i].type;
int size = src->side_data[i].size;
@@ -678,6 +652,45 @@ void av_packet_move_ref(AVPacket *dst, AVPacket *src)
src->size = 0;
}
+int av_packet_make_refcounted(AVPacket *pkt)
+{
+ int ret;
+
+ if (pkt->buf)
+ return 0;
+
+ ret = packet_alloc(&pkt->buf, pkt->size);
+ if (ret < 0)
+ return ret;
+ if (pkt->size)
+ memcpy(pkt->buf->data, pkt->data, pkt->size);
+
+ pkt->data = pkt->buf->data;
+
+ return 0;
+}
+
+int av_packet_make_writable(AVPacket *pkt)
+{
+ AVBufferRef *buf = NULL;
+ int ret;
+
+ if (pkt->buf && av_buffer_is_writable(pkt->buf))
+ return 0;
+
+ ret = packet_alloc(&buf, pkt->size);
+ if (ret < 0)
+ return ret;
+ if (pkt->size)
+ memcpy(buf->data, pkt->data, pkt->size);
+
+ av_buffer_unref(&pkt->buf);
+ pkt->buf = buf;
+ pkt->data = buf->data;
+
+ return 0;
+}
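An illustrative counterpart to the function above (not part of the patch): av_packet_make_writable() is the call to use before editing packet payload in place, since it guarantees an exclusively owned buffer. The helper name zero_first_byte is hypothetical.

#include <libavcodec/avcodec.h>

/* In-place payload edit: ensure we own a writable copy first, so other
 * references to the same AVBuffer are not affected. */
static int zero_first_byte(AVPacket *pkt)
{
    int ret = av_packet_make_writable(pkt);
    if (ret < 0)
        return ret;
    if (pkt->size > 0)
        pkt->data[0] = 0;
    return 0;
}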
+
void av_packet_rescale_ts(AVPacket *pkt, AVRational src_tb, AVRational dst_tb)
{
if (pkt->pts != AV_NOPTS_VALUE)
diff --git a/media/ffvpx/libavcodec/bit_depth_template.c b/media/ffvpx/libavcodec/bit_depth_template.c
index 80184892f..d44d47ea4 100644
--- a/media/ffvpx/libavcodec/bit_depth_template.c
+++ b/media/ffvpx/libavcodec/bit_depth_template.c
@@ -29,6 +29,7 @@
# undef pixel2
# undef pixel4
# undef dctcoef
+# undef idctin
# undef INIT_CLIP
# undef no_rnd_avg_pixel4
# undef rnd_avg_pixel4
@@ -53,6 +54,16 @@
# define pixel4 uint64_t
# define dctcoef int32_t
+#ifdef IN_IDCT_DEPTH
+#if IN_IDCT_DEPTH == 32
+# define idctin int32_t
+#else
+# define idctin int16_t
+#endif
+#else
+# define idctin int16_t
+#endif
+
# define INIT_CLIP
# define no_rnd_avg_pixel4 no_rnd_avg64
# define rnd_avg_pixel4 rnd_avg64
@@ -71,6 +82,7 @@
# define pixel2 uint16_t
# define pixel4 uint32_t
# define dctcoef int16_t
+# define idctin int16_t
# define INIT_CLIP
# define no_rnd_avg_pixel4 no_rnd_avg32
@@ -87,7 +99,10 @@
# define CLIP(a) av_clip_uint8(a)
#endif
-#define FUNC3(a, b, c) a ## _ ## b ## c
+#define FUNC3(a, b, c) a ## _ ## b ## c
#define FUNC2(a, b, c) FUNC3(a, b, c)
#define FUNC(a) FUNC2(a, BIT_DEPTH,)
#define FUNCC(a) FUNC2(a, BIT_DEPTH, _c)
+#define FUNC4(a, b, c) a ## _int ## b ## _ ## c ## bit
+#define FUNC5(a, b, c) FUNC4(a, b, c)
+#define FUNC6(a) FUNC5(a, IN_IDCT_DEPTH, BIT_DEPTH)
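A self-contained restatement (not part of the template) of the new naming macros, showing what FUNC6() expands to for a sample IDCT input depth and bit depth; the names demo and the chosen depths are arbitrary.

#include <stdio.h>

#define BIT_DEPTH      10
#define IN_IDCT_DEPTH  32
#define FUNC4(a, b, c) a ## _int ## b ## _ ## c ## bit
#define FUNC5(a, b, c) FUNC4(a, b, c)
#define FUNC6(a)       FUNC5(a, IN_IDCT_DEPTH, BIT_DEPTH)

static void FUNC6(demo)(void) { }        /* defines demo_int32_10bit() */

int main(void)
{
    demo_int32_10bit();                  /* the pasted name is a normal identifier */
    puts("FUNC6(demo) -> demo_int32_10bit");
    return 0;
}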
diff --git a/media/ffvpx/libavcodec/bitstream_filter.c b/media/ffvpx/libavcodec/bitstream_filter.c
index 8599b90d4..ca11ed371 100644
--- a/media/ffvpx/libavcodec/bitstream_filter.c
+++ b/media/ffvpx/libavcodec/bitstream_filter.c
@@ -28,15 +28,15 @@
#if FF_API_OLD_BSF
FF_DISABLE_DEPRECATION_WARNINGS
-AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f)
+const AVBitStreamFilter *av_bitstream_filter_next(const AVBitStreamFilter *f)
{
const AVBitStreamFilter *filter = NULL;
void *opaque = NULL;
while (filter != f)
- filter = av_bsf_next(&opaque);
+ filter = av_bsf_iterate(&opaque);
- return av_bsf_next(&opaque);
+ return av_bsf_iterate(&opaque);
}
void av_register_bitstream_filter(AVBitStreamFilter *bsf)
@@ -131,7 +131,7 @@ int av_bitstream_filter_filter(AVBitStreamFilterContext *bsfc,
return ret;
}
- pkt.data = buf;
+ pkt.data = (uint8_t *)buf;
pkt.size = buf_size;
ret = av_bsf_send_packet(priv->ctx, &pkt);
diff --git a/media/ffvpx/libavcodec/bitstream_filters.c b/media/ffvpx/libavcodec/bitstream_filters.c
index ce34de640..18b698a85 100644
--- a/media/ffvpx/libavcodec/bitstream_filters.c
+++ b/media/ffvpx/libavcodec/bitstream_filters.c
@@ -28,26 +28,34 @@ extern const AVBitStreamFilter ff_aac_adtstoasc_bsf;
extern const AVBitStreamFilter ff_chomp_bsf;
extern const AVBitStreamFilter ff_dump_extradata_bsf;
extern const AVBitStreamFilter ff_dca_core_bsf;
+extern const AVBitStreamFilter ff_eac3_core_bsf;
extern const AVBitStreamFilter ff_extract_extradata_bsf;
+extern const AVBitStreamFilter ff_filter_units_bsf;
+extern const AVBitStreamFilter ff_h264_metadata_bsf;
extern const AVBitStreamFilter ff_h264_mp4toannexb_bsf;
+extern const AVBitStreamFilter ff_h264_redundant_pps_bsf;
+extern const AVBitStreamFilter ff_hapqa_extract_bsf;
+extern const AVBitStreamFilter ff_hevc_metadata_bsf;
extern const AVBitStreamFilter ff_hevc_mp4toannexb_bsf;
extern const AVBitStreamFilter ff_imx_dump_header_bsf;
extern const AVBitStreamFilter ff_mjpeg2jpeg_bsf;
extern const AVBitStreamFilter ff_mjpega_dump_header_bsf;
extern const AVBitStreamFilter ff_mp3_header_decompress_bsf;
+extern const AVBitStreamFilter ff_mpeg2_metadata_bsf;
extern const AVBitStreamFilter ff_mpeg4_unpack_bframes_bsf;
extern const AVBitStreamFilter ff_mov2textsub_bsf;
extern const AVBitStreamFilter ff_noise_bsf;
extern const AVBitStreamFilter ff_null_bsf;
extern const AVBitStreamFilter ff_remove_extradata_bsf;
extern const AVBitStreamFilter ff_text2movsub_bsf;
+extern const AVBitStreamFilter ff_trace_headers_bsf;
extern const AVBitStreamFilter ff_vp9_raw_reorder_bsf;
extern const AVBitStreamFilter ff_vp9_superframe_bsf;
extern const AVBitStreamFilter ff_vp9_superframe_split_bsf;
#include "libavcodec/bsf_list.c"
-const AVBitStreamFilter *av_bsf_next(void **opaque)
+const AVBitStreamFilter *av_bsf_iterate(void **opaque)
{
uintptr_t i = (uintptr_t)*opaque;
const AVBitStreamFilter *f = bitstream_filters[i];
@@ -58,12 +66,18 @@ const AVBitStreamFilter *av_bsf_next(void **opaque)
return f;
}
+#if FF_API_NEXT
+const AVBitStreamFilter *av_bsf_next(void **opaque) {
+ return av_bsf_iterate(opaque);
+}
+#endif
+
const AVBitStreamFilter *av_bsf_get_by_name(const char *name)
{
- int i;
+ const AVBitStreamFilter *f = NULL;
+ void *i = 0;
- for (i = 0; bitstream_filters[i]; i++) {
- const AVBitStreamFilter *f = bitstream_filters[i];
+ while ((f = av_bsf_iterate(&i))) {
if (!strcmp(f->name, name))
return f;
}
@@ -73,19 +87,20 @@ const AVBitStreamFilter *av_bsf_get_by_name(const char *name)
const AVClass *ff_bsf_child_class_next(const AVClass *prev)
{
- int i;
+ const AVBitStreamFilter *f = NULL;
+ void *i = 0;
/* find the filter that corresponds to prev */
- for (i = 0; prev && bitstream_filters[i]; i++) {
- if (bitstream_filters[i]->priv_class == prev) {
- i++;
+ while (prev && (f = av_bsf_iterate(&i))) {
+ if (f->priv_class == prev) {
break;
}
}
/* find next filter with priv options */
- for (; bitstream_filters[i]; i++)
- if (bitstream_filters[i]->priv_class)
- return bitstream_filters[i]->priv_class;
+ while ((f = av_bsf_iterate(&i))) {
+ if (f->priv_class)
+ return f->priv_class;
+ }
return NULL;
}
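An application-side sketch (not part of the patch) of the same iterator used above, listing the bitstream filters compiled into this libavcodec build; the function name is hypothetical.

#include <stdio.h>
#include <libavcodec/avcodec.h>

/* Enumerate the available bitstream filters by name. */
static void list_bitstream_filters(void)
{
    void *iter = NULL;
    const AVBitStreamFilter *f;

    while ((f = av_bsf_iterate(&iter)))
        printf("%s\n", f->name);
}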
diff --git a/media/ffvpx/libavcodec/blockdsp.h b/media/ffvpx/libavcodec/blockdsp.h
index 6e27a02ba..26fc2ea13 100644
--- a/media/ffvpx/libavcodec/blockdsp.h
+++ b/media/ffvpx/libavcodec/blockdsp.h
@@ -33,8 +33,8 @@ typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
uint8_t value, ptrdiff_t line_size, int h);
typedef struct BlockDSPContext {
- void (*clear_block)(int16_t *block /* align 16 */);
- void (*clear_blocks)(int16_t *blocks /* align 16 */);
+ void (*clear_block)(int16_t *block /* align 32 */);
+ void (*clear_blocks)(int16_t *blocks /* align 32 */);
op_fill_func fill_block_tab[2];
} BlockDSPContext;
diff --git a/media/ffvpx/libavcodec/bsf.c b/media/ffvpx/libavcodec/bsf.c
index 38b423101..bd611ea16 100644
--- a/media/ffvpx/libavcodec/bsf.c
+++ b/media/ffvpx/libavcodec/bsf.c
@@ -174,6 +174,8 @@ int av_bsf_init(AVBSFContext *ctx)
int av_bsf_send_packet(AVBSFContext *ctx, AVPacket *pkt)
{
+ int ret;
+
if (!pkt || (!pkt->data && !pkt->side_data_elems)) {
ctx->internal->eof = 1;
return 0;
@@ -188,6 +190,9 @@ int av_bsf_send_packet(AVBSFContext *ctx, AVPacket *pkt)
ctx->internal->buffer_pkt->side_data_elems)
return AVERROR(EAGAIN);
+ ret = av_packet_make_refcounted(pkt);
+ if (ret < 0)
+ return ret;
av_packet_move_ref(ctx->internal->buffer_pkt, pkt);
return 0;
diff --git a/media/ffvpx/libavcodec/bsf_list.c b/media/ffvpx/libavcodec/bsf_list.c
index d31ece942..92d9948b2 100644
--- a/media/ffvpx/libavcodec/bsf_list.c
+++ b/media/ffvpx/libavcodec/bsf_list.c
@@ -1,3 +1,6 @@
static const AVBitStreamFilter * const bitstream_filters[] = {
&ff_null_bsf,
+#if CONFIG_VP9_SUPERFRAME_SPLIT_BSF
+ &ff_vp9_superframe_split_bsf,
+#endif
NULL };
diff --git a/media/ffvpx/libavcodec/codec_desc.c b/media/ffvpx/libavcodec/codec_desc.c
index 6a13bbbf0..79552a910 100644
--- a/media/ffvpx/libavcodec/codec_desc.c
+++ b/media/ffvpx/libavcodec/codec_desc.c
@@ -46,15 +46,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_REORDER,
.profiles = NULL_IF_CONFIG_SMALL(ff_mpeg2_video_profiles),
},
-#if FF_API_XVMC
- {
- .id = AV_CODEC_ID_MPEG2VIDEO_XVMC,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "mpegvideo_xvmc",
- .long_name = NULL_IF_CONFIG_SMALL("MPEG-1/2 video XvMC (X-Video Motion Compensation)"),
- .props = AV_CODEC_PROP_LOSSY,
- },
-#endif /* FF_API_XVMC */
{
.id = AV_CODEC_ID_H261,
.type = AVMEDIA_TYPE_VIDEO,
@@ -99,6 +90,28 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_LJPEG,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "ljpeg",
+ .long_name = NULL_IF_CONFIG_SMALL("Lossless JPEG"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_SP5X,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "sp5x",
+ .long_name = NULL_IF_CONFIG_SMALL("Sunplus JPEG (SP5X)"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_JPEGLS,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "jpegls",
+ .long_name = NULL_IF_CONFIG_SMALL("JPEG-LS"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
+ AV_CODEC_PROP_LOSSLESS,
+ },
+ {
.id = AV_CODEC_ID_MPEG4,
.type = AVMEDIA_TYPE_VIDEO,
.name = "mpeg4",
@@ -170,14 +183,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_SVG,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "svg",
- .long_name = NULL_IF_CONFIG_SMALL("Scalable Vector Graphics"),
- .props = AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/svg+xml"),
- },
- {
.id = AV_CODEC_ID_SVQ1,
.type = AVMEDIA_TYPE_VIDEO,
.name = "svq1",
@@ -417,13 +422,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_SNOW,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "snow",
- .long_name = NULL_IF_CONFIG_SMALL("Snow"),
- .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_TSCC,
.type = AVMEDIA_TYPE_VIDEO,
.name = "tscc",
@@ -459,6 +457,50 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_PNG,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "png",
+ .long_name = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/png"),
+ },
+ {
+ .id = AV_CODEC_ID_PPM,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "ppm",
+ .long_name = NULL_IF_CONFIG_SMALL("PPM (Portable PixelMap) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PBM,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "pbm",
+ .long_name = NULL_IF_CONFIG_SMALL("PBM (Portable BitMap) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PGM,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "pgm",
+ .long_name = NULL_IF_CONFIG_SMALL("PGM (Portable GrayMap) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PGMYUV,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "pgmyuv",
+ .long_name = NULL_IF_CONFIG_SMALL("PGMYUV (Portable GrayMap YUV) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PAM,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "pam",
+ .long_name = NULL_IF_CONFIG_SMALL("PAM (Portable AnyMap) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/x-portable-pixmap"),
+ },
+ {
.id = AV_CODEC_ID_FFVHUFF,
.type = AVMEDIA_TYPE_VIDEO,
.name = "ffvhuff",
@@ -647,6 +689,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_TARGA,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "targa",
+ .long_name = NULL_IF_CONFIG_SMALL("Truevision Targa image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/x-targa", "image/x-tga"),
+ },
+ {
.id = AV_CODEC_ID_DSICINVIDEO,
.type = AVMEDIA_TYPE_VIDEO,
.name = "dsicinvideo",
@@ -661,6 +711,22 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_TIFF,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "tiff",
+ .long_name = NULL_IF_CONFIG_SMALL("TIFF image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/tiff"),
+ },
+ {
+ .id = AV_CODEC_ID_GIF,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "gif",
+ .long_name = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/gif"),
+ },
+ {
.id = AV_CODEC_ID_DXA,
.type = AVMEDIA_TYPE_VIDEO,
.name = "dxa",
@@ -683,6 +749,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_SGI,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "sgi",
+ .long_name = NULL_IF_CONFIG_SMALL("SGI image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
.id = AV_CODEC_ID_C93,
.type = AVMEDIA_TYPE_VIDEO,
.name = "c93",
@@ -697,6 +770,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_PTX,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "ptx",
+ .long_name = NULL_IF_CONFIG_SMALL("V.Flash PTX image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_TXD,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "txd",
+ .long_name = NULL_IF_CONFIG_SMALL("Renderware TXD (TeXture Dictionary) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+ },
+ {
.id = AV_CODEC_ID_VP6A,
.type = AVMEDIA_TYPE_VIDEO,
.name = "vp6a",
@@ -718,6 +805,21 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_PCX,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "pcx",
+ .long_name = NULL_IF_CONFIG_SMALL("PC Paintbrush PCX image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/x-pcx"),
+ },
+ {
+ .id = AV_CODEC_ID_SUNRAST,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "sunrast",
+ .long_name = NULL_IF_CONFIG_SMALL("Sun Rasterfile image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
.id = AV_CODEC_ID_INDEO4,
.type = AVMEDIA_TYPE_VIDEO,
.name = "indeo4",
@@ -753,13 +855,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_DAALA,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "daala",
- .long_name = NULL_IF_CONFIG_SMALL("Daala"),
- .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_DIRAC,
.type = AVMEDIA_TYPE_VIDEO,
.name = "dirac",
@@ -844,6 +939,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
+ .id = AV_CODEC_ID_DPX,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "dpx",
+ .long_name = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
.id = AV_CODEC_ID_MAD,
.type = AVMEDIA_TYPE_VIDEO,
.name = "mad",
@@ -921,14 +1023,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_VP9,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "vp9",
- .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
- .props = AV_CODEC_PROP_LOSSY,
- .profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
- },
- {
.id = AV_CODEC_ID_PICTOR,
.type = AVMEDIA_TYPE_VIDEO,
.name = "pictor",
@@ -936,6 +1030,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_ANSI,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "ansi",
+ .long_name = NULL_IF_CONFIG_SMALL("ASCII/ANSI art"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
.id = AV_CODEC_ID_A64_MULTI,
.type = AVMEDIA_TYPE_VIDEO,
.name = "a64_multi",
@@ -957,27 +1058,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_M101,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "m101",
- .long_name = NULL_IF_CONFIG_SMALL("Matrox Uncompressed SD"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_MVC1,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "mvc1",
- .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 1"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_MVC2,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "mvc2",
- .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 2"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
- },
- {
.id = AV_CODEC_ID_MXPEG,
.type = AVMEDIA_TYPE_VIDEO,
.name = "mxpeg",
@@ -1013,6 +1093,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_WMV3IMAGE,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "wmv3image",
+ .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_VC1IMAGE,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "vc1image",
+ .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image v2"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
.id = AV_CODEC_ID_UTVIDEO,
.type = AVMEDIA_TYPE_VIDEO,
.name = "utvideo",
@@ -1048,6 +1142,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
+ .id = AV_CODEC_ID_XWD,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "xwd",
+ .long_name = NULL_IF_CONFIG_SMALL("XWD (X Window Dump) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/x-xwindowdump"),
+ },
+ {
.id = AV_CODEC_ID_CDXL,
.type = AVMEDIA_TYPE_VIDEO,
.name = "cdxl",
@@ -1055,6 +1157,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_XBM,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "xbm",
+ .long_name = NULL_IF_CONFIG_SMALL("XBM (X BitMap) image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/x-xbitmap"),
+ },
+ {
.id = AV_CODEC_ID_ZEROCODEC,
.type = AVMEDIA_TYPE_VIDEO,
.name = "zerocodec",
@@ -1104,6 +1214,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_VP9,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "vp9",
+ .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
+ .props = AV_CODEC_PROP_LOSSY,
+ .profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
+ },
+ {
.id = AV_CODEC_ID_AIC,
.type = AVMEDIA_TYPE_VIDEO,
.name = "aic",
@@ -1111,13 +1229,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_Y41P,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "y41p",
- .long_name = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_ESCAPE130,
.type = AVMEDIA_TYPE_VIDEO,
.name = "escape130",
@@ -1125,88 +1236,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_AVRP,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "avrp",
- .long_name = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_012V,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "012v",
- .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_AVUI,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "avui",
- .long_name = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_AYUV,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "ayuv",
- .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_TARGA_Y216,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "targa_y216",
- .long_name = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_V308,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "v308",
- .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_V408,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "v408",
- .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_YUV4,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "yuv4",
- .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_AVRN,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "avrn",
- .long_name = NULL_IF_CONFIG_SMALL("Avid AVI Codec"),
- },
- {
- .id = AV_CODEC_ID_CPIA,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "cpia",
- .long_name = NULL_IF_CONFIG_SMALL("CPiA video format"),
- },
- {
- .id = AV_CODEC_ID_XFACE,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "xface",
- .long_name = NULL_IF_CONFIG_SMALL("X-face image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_SMVJPEG,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "smvjpeg",
- .long_name = NULL_IF_CONFIG_SMALL("Sigmatel Motion Video"),
- },
-
- {
.id = AV_CODEC_ID_G2M,
.type = AVMEDIA_TYPE_VIDEO,
.name = "g2m",
@@ -1214,6 +1243,15 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_WEBP,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "webp",
+ .long_name = NULL_IF_CONFIG_SMALL("WebP"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
+ AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/webp"),
+ },
+ {
.id = AV_CODEC_ID_HNM4_VIDEO,
.type = AVMEDIA_TYPE_VIDEO,
.name = "hnm4video",
@@ -1236,6 +1274,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_ALIAS_PIX,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "alias_pix",
+ .long_name = NULL_IF_CONFIG_SMALL("Alias/Wavefront PIX image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_BRENDER_PIX,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "brender_pix",
+ .long_name = NULL_IF_CONFIG_SMALL("BRender PIX image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
+ {
.id = AV_CODEC_ID_PAF_VIDEO,
.type = AVMEDIA_TYPE_VIDEO,
.name = "paf_video",
@@ -1243,6 +1295,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_EXR,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "exr",
+ .long_name = NULL_IF_CONFIG_SMALL("OpenEXR image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
+ AV_CODEC_PROP_LOSSLESS,
+ },
+ {
.id = AV_CODEC_ID_VP7,
.type = AVMEDIA_TYPE_VIDEO,
.name = "vp7",
@@ -1264,6 +1324,20 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
+ .id = AV_CODEC_ID_MVC1,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "mvc1",
+ .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 1"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_MVC2,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "mvc2",
+ .long_name = NULL_IF_CONFIG_SMALL("Silicon Graphics Motion Video Compressor 2"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+ },
+ {
.id = AV_CODEC_ID_HQX,
.type = AVMEDIA_TYPE_VIDEO,
.name = "hqx",
@@ -1271,6 +1345,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_TDSC,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "tdsc",
+ .long_name = NULL_IF_CONFIG_SMALL("TDSC"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
.id = AV_CODEC_ID_HQ_HQA,
.type = AVMEDIA_TYPE_VIDEO,
.name = "hq_hqa",
@@ -1285,6 +1366,14 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_DDS,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "dds",
+ .long_name = NULL_IF_CONFIG_SMALL("DirectDraw Surface image decoder"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
+ AV_CODEC_PROP_LOSSLESS,
+ },
+ {
.id = AV_CODEC_ID_DXV,
.type = AVMEDIA_TYPE_VIDEO,
.name = "dxv",
@@ -1299,20 +1388,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_SPEEDHQ,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "speedhq",
- .long_name = NULL_IF_CONFIG_SMALL("NewTek SpeedHQ"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_WRAPPED_AVFRAME,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "wrapped_avframe",
- .long_name = NULL_IF_CONFIG_SMALL("AVFrame to AVPacket passthrough"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_RSCC,
.type = AVMEDIA_TYPE_VIDEO,
.name = "rscc",
@@ -1320,229 +1395,155 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_MAGICYUV,
+ .id = AV_CODEC_ID_Y41P,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "magicyuv",
- .long_name = NULL_IF_CONFIG_SMALL("MagicYUV video"),
+ .name = "y41p",
+ .long_name = NULL_IF_CONFIG_SMALL("Uncompressed YUV 4:1:1 12-bit"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_TRUEMOTION2RT,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "truemotion2rt",
- .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"),
- .props = AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_CFHD,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "cfhd",
- .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"),
- .props = AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_SHEERVIDEO,
+ .id = AV_CODEC_ID_AVRP,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "sheervideo",
- .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"),
+ .name = "avrp",
+ .long_name = NULL_IF_CONFIG_SMALL("Avid 1:1 10-bit RGB Packer"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_YLC,
+ .id = AV_CODEC_ID_012V,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "ylc",
- .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
+ .name = "012v",
+ .long_name = NULL_IF_CONFIG_SMALL("Uncompressed 4:2:2 10-bit"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_PIXLET,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "pixlet",
- .long_name = NULL_IF_CONFIG_SMALL("Apple Pixlet"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_FMVC,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "fmvc",
- .long_name = NULL_IF_CONFIG_SMALL("FM Screen Capture Codec"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_SCPR,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "scpr",
- .long_name = NULL_IF_CONFIG_SMALL("ScreenPressor"),
- .props = AV_CODEC_PROP_LOSSLESS | AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_CLEARVIDEO,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "clearvideo",
- .long_name = NULL_IF_CONFIG_SMALL("Iterated Systems ClearVideo"),
- .props = AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_AV1,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "av1",
- .long_name = NULL_IF_CONFIG_SMALL("Alliance for Open Media AV1"),
- .props = AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_BITPACKED,
+ .id = AV_CODEC_ID_AVUI,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "bitpacked",
- .long_name = NULL_IF_CONFIG_SMALL("Bitpacked"),
+ .name = "avui",
+ .long_name = NULL_IF_CONFIG_SMALL("Avid Meridien Uncompressed"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_MSCC,
+ .id = AV_CODEC_ID_AYUV,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "mscc",
- .long_name = NULL_IF_CONFIG_SMALL("Mandsoft Screen Capture Codec"),
+ .name = "ayuv",
+ .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed MS 4:4:4:4"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_SRGC,
+ .id = AV_CODEC_ID_TARGA_Y216,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "srgc",
- .long_name = NULL_IF_CONFIG_SMALL("Screen Recorder Gold Codec"),
+ .name = "targa_y216",
+ .long_name = NULL_IF_CONFIG_SMALL("Pinnacle TARGA CineWave YUV16"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_GDV,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "gdv",
- .long_name = NULL_IF_CONFIG_SMALL("Gremlin Digital Video"),
- .props = AV_CODEC_PROP_LOSSY,
- },
-
- /* image codecs */
- {
- .id = AV_CODEC_ID_ALIAS_PIX,
+ .id = AV_CODEC_ID_V308,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "alias_pix",
- .long_name = NULL_IF_CONFIG_SMALL("Alias/Wavefront PIX image"),
+ .name = "v308",
+ .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:4:4"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_ANSI,
+ .id = AV_CODEC_ID_V408,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "ansi",
- .long_name = NULL_IF_CONFIG_SMALL("ASCII/ANSI art"),
- .props = AV_CODEC_PROP_LOSSY,
+ .name = "v408",
+ .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed QT 4:4:4:4"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_BRENDER_PIX,
+ .id = AV_CODEC_ID_YUV4,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "brender_pix",
- .long_name = NULL_IF_CONFIG_SMALL("BRender PIX image"),
+ .name = "yuv4",
+ .long_name = NULL_IF_CONFIG_SMALL("Uncompressed packed 4:2:0"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_DDS,
+ .id = AV_CODEC_ID_AVRN,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "dds",
- .long_name = NULL_IF_CONFIG_SMALL("DirectDraw Surface image decoder"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
- AV_CODEC_PROP_LOSSLESS,
+ .name = "avrn",
+ .long_name = NULL_IF_CONFIG_SMALL("Avid AVI Codec"),
},
{
- .id = AV_CODEC_ID_DPX,
+ .id = AV_CODEC_ID_CPIA,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "dpx",
- .long_name = NULL_IF_CONFIG_SMALL("DPX (Digital Picture Exchange) image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .name = "cpia",
+ .long_name = NULL_IF_CONFIG_SMALL("CPiA video format"),
},
{
- .id = AV_CODEC_ID_EXR,
+ .id = AV_CODEC_ID_XFACE,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "exr",
- .long_name = NULL_IF_CONFIG_SMALL("OpenEXR image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
- AV_CODEC_PROP_LOSSLESS,
+ .name = "xface",
+ .long_name = NULL_IF_CONFIG_SMALL("X-face image"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_FITS,
+ .id = AV_CODEC_ID_SNOW,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "fits",
- .long_name = NULL_IF_CONFIG_SMALL("FITS (Flexible Image Transport System)"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .name = "snow",
+ .long_name = NULL_IF_CONFIG_SMALL("Snow"),
+ .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_GIF,
+ .id = AV_CODEC_ID_SMVJPEG,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "gif",
- .long_name = NULL_IF_CONFIG_SMALL("GIF (Graphics Interchange Format)"),
- .props = AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/gif"),
+ .name = "smvjpeg",
+ .long_name = NULL_IF_CONFIG_SMALL("Sigmatel Motion Video"),
},
{
- .id = AV_CODEC_ID_JPEGLS,
+ .id = AV_CODEC_ID_APNG,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "jpegls",
- .long_name = NULL_IF_CONFIG_SMALL("JPEG-LS"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
- AV_CODEC_PROP_LOSSLESS,
+ .name = "apng",
+ .long_name = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/png"),
},
{
- .id = AV_CODEC_ID_LJPEG,
+ .id = AV_CODEC_ID_DAALA,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "ljpeg",
- .long_name = NULL_IF_CONFIG_SMALL("Lossless JPEG"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .name = "daala",
+ .long_name = NULL_IF_CONFIG_SMALL("Daala"),
+ .props = AV_CODEC_PROP_LOSSY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_PAM,
+ .id = AV_CODEC_ID_CFHD,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "pam",
- .long_name = NULL_IF_CONFIG_SMALL("PAM (Portable AnyMap) image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/x-portable-pixmap"),
+ .name = "cfhd",
+ .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"),
+ .props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_PBM,
+ .id = AV_CODEC_ID_TRUEMOTION2RT,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "pbm",
- .long_name = NULL_IF_CONFIG_SMALL("PBM (Portable BitMap) image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .name = "truemotion2rt",
+ .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"),
+ .props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_PCX,
+ .id = AV_CODEC_ID_M101,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "pcx",
- .long_name = NULL_IF_CONFIG_SMALL("PC Paintbrush PCX image"),
+ .name = "m101",
+ .long_name = NULL_IF_CONFIG_SMALL("Matrox Uncompressed SD"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/x-pcx"),
},
{
- .id = AV_CODEC_ID_PGM,
+ .id = AV_CODEC_ID_MAGICYUV,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "pgm",
- .long_name = NULL_IF_CONFIG_SMALL("PGM (Portable GrayMap) image"),
+ .name = "magicyuv",
+ .long_name = NULL_IF_CONFIG_SMALL("MagicYUV video"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_PGMYUV,
+ .id = AV_CODEC_ID_SHEERVIDEO,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "pgmyuv",
- .long_name = NULL_IF_CONFIG_SMALL("PGMYUV (Portable GrayMap YUV) image"),
+ .name = "sheervideo",
+ .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_PNG,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "png",
- .long_name = NULL_IF_CONFIG_SMALL("PNG (Portable Network Graphics) image"),
- .props = AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/png"),
- },
- {
- .id = AV_CODEC_ID_PPM,
+ .id = AV_CODEC_ID_YLC,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "ppm",
- .long_name = NULL_IF_CONFIG_SMALL("PPM (Portable PixelMap) image"),
+ .name = "ylc",
+ .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
@@ -1553,117 +1554,98 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_PTX,
+ .id = AV_CODEC_ID_PIXLET,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "ptx",
- .long_name = NULL_IF_CONFIG_SMALL("V.Flash PTX image"),
+ .name = "pixlet",
+ .long_name = NULL_IF_CONFIG_SMALL("Apple Pixlet"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_SGI,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "sgi",
- .long_name = NULL_IF_CONFIG_SMALL("SGI image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_SP5X,
+ .id = AV_CODEC_ID_SPEEDHQ,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "sp5x",
- .long_name = NULL_IF_CONFIG_SMALL("Sunplus JPEG (SP5X)"),
+ .name = "speedhq",
+ .long_name = NULL_IF_CONFIG_SMALL("NewTek SpeedHQ"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_SUNRAST,
+ .id = AV_CODEC_ID_FMVC,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "sunrast",
- .long_name = NULL_IF_CONFIG_SMALL("Sun Rasterfile image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ .name = "fmvc",
+ .long_name = NULL_IF_CONFIG_SMALL("FM Screen Capture Codec"),
+ .props = AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_TARGA,
+ .id = AV_CODEC_ID_SCPR,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "targa",
- .long_name = NULL_IF_CONFIG_SMALL("Truevision Targa image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/x-targa", "image/x-tga"),
+ .name = "scpr",
+ .long_name = NULL_IF_CONFIG_SMALL("ScreenPressor"),
+ .props = AV_CODEC_PROP_LOSSLESS | AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_TDSC,
+ .id = AV_CODEC_ID_CLEARVIDEO,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "tdsc",
- .long_name = NULL_IF_CONFIG_SMALL("TDSC"),
+ .name = "clearvideo",
+ .long_name = NULL_IF_CONFIG_SMALL("Iterated Systems ClearVideo"),
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_TIFF,
+ .id = AV_CODEC_ID_XPM,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "tiff",
- .long_name = NULL_IF_CONFIG_SMALL("TIFF image"),
+ .name = "xpm",
+ .long_name = NULL_IF_CONFIG_SMALL("XPM (X PixMap) image"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/tiff"),
- },
- {
- .id = AV_CODEC_ID_TXD,
- .type = AVMEDIA_TYPE_VIDEO,
- .name = "txd",
- .long_name = NULL_IF_CONFIG_SMALL("Renderware TXD (TeXture Dictionary) image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY,
+ .mime_types= MT("image/x-xpixmap"),
},
{
- .id = AV_CODEC_ID_VC1IMAGE,
+ .id = AV_CODEC_ID_AV1,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "vc1image",
- .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image v2"),
+ .name = "av1",
+ .long_name = NULL_IF_CONFIG_SMALL("Alliance for Open Media AV1"),
.props = AV_CODEC_PROP_LOSSY,
+ .profiles = NULL_IF_CONFIG_SMALL(ff_av1_profiles),
},
{
- .id = AV_CODEC_ID_WEBP,
+ .id = AV_CODEC_ID_BITPACKED,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "webp",
- .long_name = NULL_IF_CONFIG_SMALL("WebP"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSY |
- AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/webp"),
+ .name = "bitpacked",
+ .long_name = NULL_IF_CONFIG_SMALL("Bitpacked"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_WMV3IMAGE,
+ .id = AV_CODEC_ID_MSCC,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "wmv3image",
- .long_name = NULL_IF_CONFIG_SMALL("Windows Media Video 9 Image"),
- .props = AV_CODEC_PROP_LOSSY,
+ .name = "mscc",
+ .long_name = NULL_IF_CONFIG_SMALL("Mandsoft Screen Capture Codec"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_XBM,
+ .id = AV_CODEC_ID_SRGC,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "xbm",
- .long_name = NULL_IF_CONFIG_SMALL("XBM (X BitMap) image"),
+ .name = "srgc",
+ .long_name = NULL_IF_CONFIG_SMALL("Screen Recorder Gold Codec"),
.props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/x-xbitmap"),
},
{
- .id = AV_CODEC_ID_XPM,
+ .id = AV_CODEC_ID_SVG,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "xpm",
- .long_name = NULL_IF_CONFIG_SMALL("XPM (X PixMap) image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/x-xpixmap"),
+ .name = "svg",
+ .long_name = NULL_IF_CONFIG_SMALL("Scalable Vector Graphics"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ .mime_types= MT("image/svg+xml"),
},
{
- .id = AV_CODEC_ID_XWD,
+ .id = AV_CODEC_ID_GDV,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "xwd",
- .long_name = NULL_IF_CONFIG_SMALL("XWD (X Window Dump) image"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/x-xwindowdump"),
+ .name = "gdv",
+ .long_name = NULL_IF_CONFIG_SMALL("Gremlin Digital Video"),
+ .props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_APNG,
+ .id = AV_CODEC_ID_FITS,
.type = AVMEDIA_TYPE_VIDEO,
- .name = "apng",
- .long_name = NULL_IF_CONFIG_SMALL("APNG (Animated Portable Network Graphics) image"),
- .props = AV_CODEC_PROP_LOSSLESS,
- .mime_types= MT("image/png"),
+ .name = "fits",
+ .long_name = NULL_IF_CONFIG_SMALL("FITS (Flexible Image Transport System)"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
},
/* various PCM "codecs" */
@@ -1738,20 +1720,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_PCM_S64LE,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "pcm_s64le",
- .long_name = NULL_IF_CONFIG_SMALL("PCM signed 64-bit little-endian"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_PCM_S64BE,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "pcm_s64be",
- .long_name = NULL_IF_CONFIG_SMALL("PCM signed 64-bit big-endian"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_PCM_U32LE,
.type = AVMEDIA_TYPE_AUDIO,
.name = "pcm_u32le",
@@ -1808,13 +1776,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_PCM_S16BE_PLANAR,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "pcm_s16be_planar",
- .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit big-endian planar"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_PCM_S16LE_PLANAR,
.type = AVMEDIA_TYPE_AUDIO,
.name = "pcm_s16le_planar",
@@ -1822,20 +1783,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_PCM_S24LE_PLANAR,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "pcm_s24le_planar",
- .long_name = NULL_IF_CONFIG_SMALL("PCM signed 24-bit little-endian planar"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_PCM_S32LE_PLANAR,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "pcm_s32le_planar",
- .long_name = NULL_IF_CONFIG_SMALL("PCM signed 32-bit little-endian planar"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_PCM_DVD,
.type = AVMEDIA_TYPE_AUDIO,
.name = "pcm_dvd",
@@ -1843,20 +1790,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_PCM_F16LE,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "pcm_f16le",
- .long_name = NULL_IF_CONFIG_SMALL("PCM 16.8 floating point little-endian"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_PCM_F24LE,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "pcm_f24le",
- .long_name = NULL_IF_CONFIG_SMALL("PCM 24.0 floating point little-endian"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_PCM_F32BE,
.type = AVMEDIA_TYPE_AUDIO,
.name = "pcm_f32be",
@@ -1912,6 +1845,55 @@ static const AVCodecDescriptor codec_descriptors[] = {
.long_name = NULL_IF_CONFIG_SMALL("PCM signed 8-bit planar"),
.props = AV_CODEC_PROP_LOSSLESS,
},
+ {
+ .id = AV_CODEC_ID_PCM_S24LE_PLANAR,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "pcm_s24le_planar",
+ .long_name = NULL_IF_CONFIG_SMALL("PCM signed 24-bit little-endian planar"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PCM_S32LE_PLANAR,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "pcm_s32le_planar",
+ .long_name = NULL_IF_CONFIG_SMALL("PCM signed 32-bit little-endian planar"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PCM_S16BE_PLANAR,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "pcm_s16be_planar",
+ .long_name = NULL_IF_CONFIG_SMALL("PCM signed 16-bit big-endian planar"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PCM_S64LE,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "pcm_s64le",
+ .long_name = NULL_IF_CONFIG_SMALL("PCM signed 64-bit little-endian"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PCM_S64BE,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "pcm_s64be",
+ .long_name = NULL_IF_CONFIG_SMALL("PCM signed 64-bit big-endian"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PCM_F16LE,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "pcm_f16le",
+ .long_name = NULL_IF_CONFIG_SMALL("PCM 16.8 floating point little-endian"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_PCM_F24LE,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "pcm_f24le",
+ .long_name = NULL_IF_CONFIG_SMALL("PCM 24.0 floating point little-endian"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
/* various ADPCM codecs */
{
@@ -2048,13 +2030,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_ADPCM_THP_LE,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "adpcm_thp_le",
- .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP (Little-Endian)"),
- .props = AV_CODEC_PROP_LOSSY,
- },
- {
.id = AV_CODEC_ID_ADPCM_IMA_AMV,
.type = AVMEDIA_TYPE_AUDIO,
.name = "adpcm_ima_amv",
@@ -2132,6 +2107,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_ADPCM_VIMA,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "adpcm_vima",
+ .long_name = NULL_IF_CONFIG_SMALL("LucasArts VIMA audio"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
.id = AV_CODEC_ID_ADPCM_AFC,
.type = AVMEDIA_TYPE_AUDIO,
.name = "adpcm_afc",
@@ -2167,10 +2149,10 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_ADPCM_VIMA,
+ .id = AV_CODEC_ID_ADPCM_THP_LE,
.type = AVMEDIA_TYPE_AUDIO,
- .name = "adpcm_vima",
- .long_name = NULL_IF_CONFIG_SMALL("LucasArts VIMA audio"),
+ .name = "adpcm_thp_le",
+ .long_name = NULL_IF_CONFIG_SMALL("ADPCM Nintendo THP (Little-Endian)"),
.props = AV_CODEC_PROP_LOSSY,
},
{
@@ -2194,6 +2176,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.long_name = NULL_IF_CONFIG_SMALL("ADPCM IMA Eurocom DAT4"),
.props = AV_CODEC_PROP_LOSSY,
},
+ {
+ .id = AV_CODEC_ID_ADPCM_MTAF,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "adpcm_mtaf",
+ .long_name = NULL_IF_CONFIG_SMALL("ADPCM MTAF"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
/* AMR */
{
@@ -2499,15 +2488,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.long_name = NULL_IF_CONFIG_SMALL("ATRAC3 (Adaptive TRansform Acoustic Coding 3)"),
.props = AV_CODEC_PROP_LOSSY,
},
-#if FF_API_VOXWARE
- {
- .id = AV_CODEC_ID_VOXWARE,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "voxware",
- .long_name = NULL_IF_CONFIG_SMALL("Voxware RT29 Metasound"),
- .props = AV_CODEC_PROP_LOSSY,
- },
-#endif
{
.id = AV_CODEC_ID_APE,
.type = AVMEDIA_TYPE_AUDIO,
@@ -2565,20 +2545,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_ATRAC3PAL,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "atrac3pal",
- .long_name = NULL_IF_CONFIG_SMALL("ATRAC3+ AL (Adaptive TRansform Acoustic Coding 3+ Advanced Lossless)"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
- .id = AV_CODEC_ID_ATRAC3AL,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "atrac3al",
- .long_name = NULL_IF_CONFIG_SMALL("ATRAC3 AL (Adaptive TRansform Acoustic Coding 3 Advanced Lossless)"),
- .props = AV_CODEC_PROP_LOSSLESS,
- },
- {
.id = AV_CODEC_ID_EAC3,
.type = AVMEDIA_TYPE_AUDIO,
.name = "eac3",
@@ -2671,20 +2637,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_DSS_SP,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "dss_sp",
- .long_name = NULL_IF_CONFIG_SMALL("Digital Speech Standard - Standard Play mode (DSS SP)"),
- .props = AV_CODEC_PROP_LOSSY,
- },
- {
- .id = AV_CODEC_ID_DOLBY_E,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "dolby_e",
- .long_name = NULL_IF_CONFIG_SMALL("Dolby E"),
- .props = AV_CODEC_PROP_LOSSY,
- },
- {
.id = AV_CODEC_ID_G729,
.type = AVMEDIA_TYPE_AUDIO,
.name = "g729",
@@ -2734,24 +2686,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_FFWAVESYNTH,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "wavesynth",
- .long_name = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"),
- },
- {
- .id = AV_CODEC_ID_SONIC,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "sonic",
- .long_name = NULL_IF_CONFIG_SMALL("Sonic"),
- },
- {
- .id = AV_CODEC_ID_SONIC_LS,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "sonicls",
- .long_name = NULL_IF_CONFIG_SMALL("Sonic lossless"),
- },
- {
.id = AV_CODEC_ID_OPUS,
.type = AVMEDIA_TYPE_AUDIO,
.name = "opus",
@@ -2794,6 +2728,38 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_DSS_SP,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "dss_sp",
+ .long_name = NULL_IF_CONFIG_SMALL("Digital Speech Standard - Standard Play mode (DSS SP)"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_CODEC2,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "codec2",
+ .long_name = NULL_IF_CONFIG_SMALL("codec2 (very low bitrate speech codec)"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_FFWAVESYNTH,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "wavesynth",
+ .long_name = NULL_IF_CONFIG_SMALL("Wave synthesis pseudo-codec"),
+ },
+ {
+ .id = AV_CODEC_ID_SONIC,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "sonic",
+ .long_name = NULL_IF_CONFIG_SMALL("Sonic"),
+ },
+ {
+ .id = AV_CODEC_ID_SONIC_LS,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "sonicls",
+ .long_name = NULL_IF_CONFIG_SMALL("Sonic lossless"),
+ },
+ {
.id = AV_CODEC_ID_EVRC,
.type = AVMEDIA_TYPE_AUDIO,
.name = "evrc",
@@ -2808,13 +2774,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
- .id = AV_CODEC_ID_4GV,
- .type = AVMEDIA_TYPE_AUDIO,
- .name = "4gv",
- .long_name = NULL_IF_CONFIG_SMALL("4GV (Fourth Generation Vocoder)"),
- .props = AV_CODEC_PROP_LOSSY,
- },
- {
.id = AV_CODEC_ID_DSD_LSBF,
.type = AVMEDIA_TYPE_AUDIO,
.name = "dsd_lsbf",
@@ -2843,6 +2802,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSY,
},
{
+ .id = AV_CODEC_ID_4GV,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "4gv",
+ .long_name = NULL_IF_CONFIG_SMALL("4GV (Fourth Generation Vocoder)"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
.id = AV_CODEC_ID_INTERPLAY_ACM,
.type = AVMEDIA_TYPE_AUDIO,
.name = "interplayacm",
@@ -2871,10 +2837,45 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_LOSSLESS,
},
{
- .id = AV_CODEC_ID_ADPCM_MTAF,
+ .id = AV_CODEC_ID_ATRAC3AL,
.type = AVMEDIA_TYPE_AUDIO,
- .name = "adpcm_mtaf",
- .long_name = NULL_IF_CONFIG_SMALL("ADPCM MTAF"),
+ .name = "atrac3al",
+ .long_name = NULL_IF_CONFIG_SMALL("ATRAC3 AL (Adaptive TRansform Acoustic Coding 3 Advanced Lossless)"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_ATRAC3PAL,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "atrac3pal",
+ .long_name = NULL_IF_CONFIG_SMALL("ATRAC3+ AL (Adaptive TRansform Acoustic Coding 3+ Advanced Lossless)"),
+ .props = AV_CODEC_PROP_LOSSLESS,
+ },
+ {
+ .id = AV_CODEC_ID_DOLBY_E,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "dolby_e",
+ .long_name = NULL_IF_CONFIG_SMALL("Dolby E"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_APTX,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "aptx",
+ .long_name = NULL_IF_CONFIG_SMALL("aptX (Audio Processing Technology for Bluetooth)"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_APTX_HD,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "aptx_hd",
+ .long_name = NULL_IF_CONFIG_SMALL("aptX HD (Audio Processing Technology for Bluetooth)"),
+ .props = AV_CODEC_PROP_LOSSY,
+ },
+ {
+ .id = AV_CODEC_ID_SBC,
+ .type = AVMEDIA_TYPE_AUDIO,
+ .name = "sbc",
+ .long_name = NULL_IF_CONFIG_SMALL("SBC (low-complexity subband codec)"),
.props = AV_CODEC_PROP_LOSSY,
},
@@ -2908,13 +2909,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_BITMAP_SUB,
},
{
- .id = AV_CODEC_ID_ASS,
- .type = AVMEDIA_TYPE_SUBTITLE,
- .name = "ass",
- .long_name = NULL_IF_CONFIG_SMALL("ASS (Advanced SSA) subtitle"),
- .props = AV_CODEC_PROP_TEXT_SUB,
- },
- {
.id = AV_CODEC_ID_SSA,
.type = AVMEDIA_TYPE_SUBTITLE,
.name = "ssa",
@@ -2949,13 +2943,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_TEXT_SUB,
},
{
- .id = AV_CODEC_ID_SUBRIP,
- .type = AVMEDIA_TYPE_SUBTITLE,
- .name = "subrip",
- .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
- .props = AV_CODEC_PROP_TEXT_SUB,
- },
- {
.id = AV_CODEC_ID_MICRODVD,
.type = AVMEDIA_TYPE_SUBTITLE,
.name = "microdvd",
@@ -2963,13 +2950,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_TEXT_SUB,
},
{
- .id = AV_CODEC_ID_MPL2,
- .type = AVMEDIA_TYPE_SUBTITLE,
- .name = "mpl2",
- .long_name = NULL_IF_CONFIG_SMALL("MPL2 subtitle"),
- .props = AV_CODEC_PROP_TEXT_SUB,
- },
- {
.id = AV_CODEC_ID_EIA_608,
.type = AVMEDIA_TYPE_SUBTITLE,
.name = "eia_608",
@@ -2984,13 +2964,6 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_TEXT_SUB,
},
{
- .id = AV_CODEC_ID_PJS,
- .type = AVMEDIA_TYPE_SUBTITLE,
- .name = "pjs",
- .long_name = NULL_IF_CONFIG_SMALL("PJS (Phoenix Japanimation Society) subtitle"),
- .props = AV_CODEC_PROP_TEXT_SUB,
- },
- {
.id = AV_CODEC_ID_SAMI,
.type = AVMEDIA_TYPE_SUBTITLE,
.name = "sami",
@@ -3026,10 +2999,10 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_TEXT_SUB,
},
{
- .id = AV_CODEC_ID_VPLAYER,
+ .id = AV_CODEC_ID_SUBRIP,
.type = AVMEDIA_TYPE_SUBTITLE,
- .name = "vplayer",
- .long_name = NULL_IF_CONFIG_SMALL("VPlayer subtitle"),
+ .name = "subrip",
+ .long_name = NULL_IF_CONFIG_SMALL("SubRip subtitle"),
.props = AV_CODEC_PROP_TEXT_SUB,
},
{
@@ -3040,6 +3013,34 @@ static const AVCodecDescriptor codec_descriptors[] = {
.props = AV_CODEC_PROP_TEXT_SUB,
},
{
+ .id = AV_CODEC_ID_MPL2,
+ .type = AVMEDIA_TYPE_SUBTITLE,
+ .name = "mpl2",
+ .long_name = NULL_IF_CONFIG_SMALL("MPL2 subtitle"),
+ .props = AV_CODEC_PROP_TEXT_SUB,
+ },
+ {
+ .id = AV_CODEC_ID_VPLAYER,
+ .type = AVMEDIA_TYPE_SUBTITLE,
+ .name = "vplayer",
+ .long_name = NULL_IF_CONFIG_SMALL("VPlayer subtitle"),
+ .props = AV_CODEC_PROP_TEXT_SUB,
+ },
+ {
+ .id = AV_CODEC_ID_PJS,
+ .type = AVMEDIA_TYPE_SUBTITLE,
+ .name = "pjs",
+ .long_name = NULL_IF_CONFIG_SMALL("PJS (Phoenix Japanimation Society) subtitle"),
+ .props = AV_CODEC_PROP_TEXT_SUB,
+ },
+ {
+ .id = AV_CODEC_ID_ASS,
+ .type = AVMEDIA_TYPE_SUBTITLE,
+ .name = "ass",
+ .long_name = NULL_IF_CONFIG_SMALL("ASS (Advanced SSA) subtitle"),
+ .props = AV_CODEC_PROP_TEXT_SUB,
+ },
+ {
.id = AV_CODEC_ID_HDMV_TEXT_SUBTITLE,
.type = AVMEDIA_TYPE_SUBTITLE,
.name = "hdmv_text_subtitle",
@@ -3056,6 +3057,12 @@ static const AVCodecDescriptor codec_descriptors[] = {
.mime_types= MT("application/x-truetype-font", "application/x-font"),
},
{
+ .id = AV_CODEC_ID_SCTE_35,
+ .type = AVMEDIA_TYPE_DATA,
+ .name = "scte_35",
+ .long_name = NULL_IF_CONFIG_SMALL("SCTE 35 Message Queue"),
+ },
+ {
.id = AV_CODEC_ID_BINTEXT,
.type = AVMEDIA_TYPE_VIDEO,
.name = "bintext",
@@ -3109,23 +3116,26 @@ static const AVCodecDescriptor codec_descriptors[] = {
.mime_types= MT("application/octet-stream"),
},
{
- .id = AV_CODEC_ID_SCTE_35,
- .type = AVMEDIA_TYPE_DATA,
- .name = "scte_35",
- .long_name = NULL_IF_CONFIG_SMALL("SCTE 35 Message Queue"),
+ .id = AV_CODEC_ID_WRAPPED_AVFRAME,
+ .type = AVMEDIA_TYPE_VIDEO,
+ .name = "wrapped_avframe",
+ .long_name = NULL_IF_CONFIG_SMALL("AVFrame to AVPacket passthrough"),
+ .props = AV_CODEC_PROP_LOSSLESS,
},
-
- /* deprecated codec ids */
};
-const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id)
+static int descriptor_compare(const void *key, const void *member)
{
- int i;
+ enum AVCodecID id = *(const enum AVCodecID *) key;
+ const AVCodecDescriptor *desc = member;
- for (i = 0; i < FF_ARRAY_ELEMS(codec_descriptors); i++)
- if (codec_descriptors[i].id == id)
- return &codec_descriptors[i];
- return NULL;
+ return id - desc->id;
+}
+
+const AVCodecDescriptor *avcodec_descriptor_get(enum AVCodecID id)
+{
+ return bsearch(&id, codec_descriptors, FF_ARRAY_ELEMS(codec_descriptors),
+ sizeof(codec_descriptors[0]), descriptor_compare);
}
const AVCodecDescriptor *avcodec_descriptor_next(const AVCodecDescriptor *prev)
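The lookup above switches from a linear scan over codec_descriptors[] to bsearch(), which is also why so many descriptor entries earlier in this diff are reordered: the table now has to stay sorted by ascending AVCodecID for the binary search to find anything. A minimal standalone sketch of the same pattern, using a hypothetical three-entry table rather than the real descriptor data:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct Desc { int id; const char *name; } Desc;

    /* must stay sorted by id, exactly like codec_descriptors[] */
    static const Desc table[] = { { 1, "alpha" }, { 4, "beta" }, { 9, "gamma" } };

    static int cmp(const void *key, const void *member)
    {
        int id = *(const int *)key;
        const Desc *d = member;
        return id - d->id;        /* same shape as descriptor_compare() */
    }

    static const Desc *lookup(int id)
    {
        return bsearch(&id, table, sizeof(table) / sizeof(table[0]),
                       sizeof(table[0]), cmp);
    }

    int main(void)
    {
        const Desc *d = lookup(4);
        printf("%s\n", d ? d->name : "not found");   /* prints "beta" */
        return 0;
    }

bsearch() returns NULL for ids that are not present, so avcodec_descriptor_get() keeps its old contract without the O(n) walk.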
diff --git a/media/ffvpx/libavcodec/codec_list.c b/media/ffvpx/libavcodec/codec_list.c
new file mode 100644
index 000000000..063f8ff78
--- /dev/null
+++ b/media/ffvpx/libavcodec/codec_list.c
@@ -0,0 +1,11 @@
+static const AVCodec * const codec_list[] = {
+#if CONFIG_VP8_DECODER
+ &ff_vp8_decoder,
+#endif
+#if CONFIG_VP9_DECODER
+ &ff_vp9_decoder,
+#endif
+#if CONFIG_FLAC_DECODER
+ &ff_flac_decoder,
+#endif
+ NULL };
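codec_list.c is a new table: a NULL-terminated array of AVCodec pointers in which each entry exists only when the matching CONFIG_*_DECODER flag is enabled at build time, so this ffvpx build carries exactly the VP8, VP9 and FLAC decoders. Consumers walk the array until the sentinel; a small sketch of that iteration with a stand-in element type:

    #include <stdio.h>

    typedef struct Codec { const char *name; } Codec;

    static const Codec vp8 = { "vp8" }, vp9 = { "vp9" }, flac = { "flac" };

    /* NULL-terminated, mirroring codec_list[] */
    static const Codec *const codec_list[] = { &vp8, &vp9, &flac, NULL };

    int main(void)
    {
        for (int i = 0; codec_list[i]; i++)      /* stop at the sentinel */
            printf("%s\n", codec_list[i]->name);
        return 0;
    }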
diff --git a/media/ffvpx/libavcodec/decode.c b/media/ffvpx/libavcodec/decode.c
index fb1824be1..421a8f1a3 100644
--- a/media/ffvpx/libavcodec/decode.c
+++ b/media/ffvpx/libavcodec/decode.c
@@ -40,6 +40,7 @@
#include "avcodec.h"
#include "bytestream.h"
#include "decode.h"
+#include "hwaccel.h"
#include "internal.h"
#include "thread.h"
@@ -129,7 +130,7 @@ static int extract_packet_props(AVCodecInternal *avci, const AVPacket *pkt)
if (pkt) {
ret = av_packet_copy_props(avci->last_pkt_props, pkt);
if (!ret)
- avci->last_pkt_props->size = pkt->size; // HACK: Needed for ff_init_buffer_info().
+ avci->last_pkt_props->size = pkt->size; // HACK: Needed for ff_decode_frame_props().
}
return ret;
}
@@ -369,8 +370,7 @@ static int decode_simple_internal(AVCodecContext *avctx, AVFrame *frame)
DecodeSimpleContext *ds = &avci->ds;
AVPacket *pkt = ds->in_pkt;
// copy to ensure we do not change pkt
- AVPacket tmp;
- int got_frame, actual_got_frame, did_split;
+ int got_frame, actual_got_frame;
int ret;
if (!pkt->data && !avci->draining) {
@@ -390,31 +390,12 @@ static int decode_simple_internal(AVCodecContext *avctx, AVFrame *frame)
avctx->active_thread_type & FF_THREAD_FRAME))
return AVERROR_EOF;
- tmp = *pkt;
-#if FF_API_MERGE_SD
-FF_DISABLE_DEPRECATION_WARNINGS
- did_split = avci->compat_decode_partial_size ?
- ff_packet_split_and_drop_side_data(&tmp) :
- av_packet_split_side_data(&tmp);
-
- if (did_split) {
- ret = extract_packet_props(avctx->internal, &tmp);
- if (ret < 0)
- return ret;
-
- ret = apply_param_change(avctx, &tmp);
- if (ret < 0)
- return ret;
- }
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
-
got_frame = 0;
if (HAVE_THREADS && avctx->active_thread_type & FF_THREAD_FRAME) {
- ret = ff_thread_decode_frame(avctx, frame, &got_frame, &tmp);
+ ret = ff_thread_decode_frame(avctx, frame, &got_frame, pkt);
} else {
- ret = avctx->codec->decode(avctx, frame, &got_frame, &tmp);
+ ret = avctx->codec->decode(avctx, frame, &got_frame, pkt);
if (!(avctx->codec->caps_internal & FF_CODEC_CAP_SETS_PKT_DTS))
frame->pkt_dts = pkt->dts;
@@ -544,13 +525,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
}
}
}
-#if FF_API_MERGE_SD
- if (did_split) {
- av_packet_free_side_data(&tmp);
- if(ret == tmp.size)
- ret = pkt->size;
- }
-#endif
if (avctx->codec->type == AVMEDIA_TYPE_AUDIO &&
!avci->showed_multi_packet_warning &&
@@ -640,6 +614,28 @@ static int decode_receive_frame_internal(AVCodecContext *avctx, AVFrame *frame)
if (ret == AVERROR_EOF)
avci->draining_done = 1;
+ if (!ret) {
+ /* the only case where decode data is not set should be decoders
+ * that do not call ff_get_buffer() */
+ av_assert0((frame->private_ref && frame->private_ref->size == sizeof(FrameDecodeData)) ||
+ !(avctx->codec->capabilities & AV_CODEC_CAP_DR1));
+
+ if (frame->private_ref) {
+ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
+
+ if (fdd->post_process) {
+ ret = fdd->post_process(avctx, frame);
+ if (ret < 0) {
+ av_frame_unref(frame);
+ return ret;
+ }
+ }
+ }
+ }
+
+ /* free the per-frame decode data */
+ av_buffer_unref(&frame->private_ref);
+
return ret;
}
@@ -1004,7 +1000,6 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
AVPacket *avpkt)
{
int i, ret = 0;
- AVCodecInternal *avci = avctx->internal;
if (!avpkt->data && avpkt->size) {
av_log(avctx, AV_LOG_ERROR, "invalid packet: NULL data, size != 0\n");
@@ -1021,29 +1016,9 @@ int avcodec_decode_subtitle2(AVCodecContext *avctx, AVSubtitle *sub,
get_subtitle_defaults(sub);
if ((avctx->codec->capabilities & AV_CODEC_CAP_DELAY) || avpkt->size) {
- AVPacket pkt_recoded;
- AVPacket tmp = *avpkt;
-#if FF_API_MERGE_SD
-FF_DISABLE_DEPRECATION_WARNINGS
- int did_split = avci->compat_decode_partial_size ?
- ff_packet_split_and_drop_side_data(&tmp) :
- av_packet_split_side_data(&tmp);
- //apply_param_change(avctx, &tmp);
-
- if (did_split) {
- /* FFMIN() prevents overflow in case the packet wasn't allocated with
- * proper padding.
- * If the side data is smaller than the buffer padding size, the
- * remaining bytes should have already been filled with zeros by the
- * original packet allocation anyway. */
- memset(tmp.data + tmp.size, 0,
- FFMIN(avpkt->size - tmp.size, AV_INPUT_BUFFER_PADDING_SIZE));
- }
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
+ AVPacket pkt_recoded = *avpkt;
- pkt_recoded = tmp;
- ret = recode_subtitle(avctx, &pkt_recoded, &tmp);
+ ret = recode_subtitle(avctx, &pkt_recoded, avpkt);
if (ret < 0) {
*got_sub_ptr = 0;
} else {
@@ -1082,7 +1057,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
sub->format = 1;
for (i = 0; i < sub->num_rects; i++) {
- if (sub->rects[i]->ass && !utf8_check(sub->rects[i]->ass)) {
+ if (avctx->sub_charenc_mode != FF_SUB_CHARENC_MODE_IGNORE &&
+ sub->rects[i]->ass && !utf8_check(sub->rects[i]->ass)) {
av_log(avctx, AV_LOG_ERROR,
"Invalid UTF-8 in decoded subtitles text; "
"maybe missing -sub_charenc option\n");
@@ -1092,7 +1068,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
}
}
- if (tmp.data != pkt_recoded.data) { // did we recode?
+ if (avpkt->data != pkt_recoded.data) { // did we recode?
/* prevent from destroying side data from original packet */
pkt_recoded.side_data = NULL;
pkt_recoded.side_data_elems = 0;
@@ -1101,14 +1077,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
}
}
-#if FF_API_MERGE_SD
- if (did_split) {
- av_packet_free_side_data(&tmp);
- if(ret == tmp.size)
- ret = avpkt->size;
- }
-#endif
-
if (*got_sub_ptr)
avctx->frame_number++;
}
@@ -1116,84 +1084,238 @@ FF_ENABLE_DEPRECATION_WARNINGS
return ret;
}
-static int is_hwaccel_pix_fmt(enum AVPixelFormat pix_fmt)
+enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *avctx,
+ const enum AVPixelFormat *fmt)
{
- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
- return desc->flags & AV_PIX_FMT_FLAG_HWACCEL;
-}
+ const AVPixFmtDescriptor *desc;
+ const AVCodecHWConfig *config;
+ int i, n;
+
+ // If a device was supplied when the codec was opened, assume that the
+ // user wants to use it.
+ if (avctx->hw_device_ctx && avctx->codec->hw_configs) {
+ AVHWDeviceContext *device_ctx =
+ (AVHWDeviceContext*)avctx->hw_device_ctx->data;
+ for (i = 0;; i++) {
+ config = &avctx->codec->hw_configs[i]->public;
+ if (!config)
+ break;
+ if (!(config->methods &
+ AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX))
+ continue;
+ if (device_ctx->type != config->device_type)
+ continue;
+ for (n = 0; fmt[n] != AV_PIX_FMT_NONE; n++) {
+ if (config->pix_fmt == fmt[n])
+ return fmt[n];
+ }
+ }
+ }
+ // No device or other setup, so we have to choose from things which
+    // don't need any other external information.
+
+ // If the last element of the list is a software format, choose it
+    // (this should be the best software format if any exist).
+ for (n = 0; fmt[n] != AV_PIX_FMT_NONE; n++);
+ desc = av_pix_fmt_desc_get(fmt[n - 1]);
+ if (!(desc->flags & AV_PIX_FMT_FLAG_HWACCEL))
+ return fmt[n - 1];
+
+ // Finally, traverse the list in order and choose the first entry
+ // with no external dependencies (if there is no hardware configuration
+ // information available then this just picks the first entry).
+ for (n = 0; fmt[n] != AV_PIX_FMT_NONE; n++) {
+ for (i = 0;; i++) {
+ config = avcodec_get_hw_config(avctx->codec, i);
+ if (!config)
+ break;
+ if (config->pix_fmt == fmt[n])
+ break;
+ }
+ if (!config) {
+ // No specific config available, so the decoder must be able
+ // to handle this format without any additional setup.
+ return fmt[n];
+ }
+ if (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL) {
+ // Usable with only internal setup.
+ return fmt[n];
+ }
+ }
-enum AVPixelFormat avcodec_default_get_format(struct AVCodecContext *s, const enum AVPixelFormat *fmt)
-{
- while (*fmt != AV_PIX_FMT_NONE && is_hwaccel_pix_fmt(*fmt))
- ++fmt;
- return fmt[0];
+ // Nothing is usable, give up.
+ return AV_PIX_FMT_NONE;
}
-static AVHWAccel *find_hwaccel(enum AVCodecID codec_id,
- enum AVPixelFormat pix_fmt)
+int ff_decode_get_hw_frames_ctx(AVCodecContext *avctx,
+ enum AVHWDeviceType dev_type)
{
- AVHWAccel *hwaccel = NULL;
+ AVHWDeviceContext *device_ctx;
+ AVHWFramesContext *frames_ctx;
+ int ret;
+
+ if (!avctx->hwaccel)
+ return AVERROR(ENOSYS);
+
+ if (avctx->hw_frames_ctx)
+ return 0;
+ if (!avctx->hw_device_ctx) {
+ av_log(avctx, AV_LOG_ERROR, "A hardware frames or device context is "
+ "required for hardware accelerated decoding.\n");
+ return AVERROR(EINVAL);
+ }
+
+ device_ctx = (AVHWDeviceContext *)avctx->hw_device_ctx->data;
+ if (device_ctx->type != dev_type) {
+ av_log(avctx, AV_LOG_ERROR, "Device type %s expected for hardware "
+ "decoding, but got %s.\n", av_hwdevice_get_type_name(dev_type),
+ av_hwdevice_get_type_name(device_ctx->type));
+ return AVERROR(EINVAL);
+ }
+
+ ret = avcodec_get_hw_frames_parameters(avctx,
+                                           avctx->hw_device_ctx,
+ avctx->hwaccel->pix_fmt,
+ &avctx->hw_frames_ctx);
+ if (ret < 0)
+ return ret;
+
+ frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+
- while ((hwaccel = av_hwaccel_next(hwaccel)))
- if (hwaccel->id == codec_id
- && hwaccel->pix_fmt == pix_fmt)
- return hwaccel;
- return NULL;
+ if (frames_ctx->initial_pool_size) {
+ // We guarantee 4 base work surfaces. The function above guarantees 1
+ // (the absolute minimum), so add the missing count.
+ frames_ctx->initial_pool_size += 3;
+ }
+
+ ret = av_hwframe_ctx_init(avctx->hw_frames_ctx);
+ if (ret < 0) {
+ av_buffer_unref(&avctx->hw_frames_ctx);
+ return ret;
+ }
+
+ return 0;
}
-static int setup_hwaccel(AVCodecContext *avctx,
- const enum AVPixelFormat fmt,
- const char *name)
+int avcodec_get_hw_frames_parameters(AVCodecContext *avctx,
+ AVBufferRef *device_ref,
+ enum AVPixelFormat hw_pix_fmt,
+ AVBufferRef **out_frames_ref)
{
- AVHWAccel *hwa = find_hwaccel(avctx->codec_id, fmt);
- int ret = 0;
+ AVBufferRef *frames_ref = NULL;
+ const AVCodecHWConfigInternal *hw_config;
+ const AVHWAccel *hwa;
+ int i, ret;
- if (!hwa) {
- av_log(avctx, AV_LOG_ERROR,
- "Could not find an AVHWAccel for the pixel format: %s",
- name);
+ for (i = 0;; i++) {
+ hw_config = avctx->codec->hw_configs[i];
+ if (!hw_config)
+ return AVERROR(ENOENT);
+ if (hw_config->public.pix_fmt == hw_pix_fmt)
+ break;
+ }
+
+ hwa = hw_config->hwaccel;
+ if (!hwa || !hwa->frame_params)
return AVERROR(ENOENT);
+
+ frames_ref = av_hwframe_ctx_alloc(device_ref);
+ if (!frames_ref)
+ return AVERROR(ENOMEM);
+
+ ret = hwa->frame_params(avctx, frames_ref);
+ if (ret >= 0) {
+ AVHWFramesContext *frames_ctx = (AVHWFramesContext*)frames_ref->data;
+
+ if (frames_ctx->initial_pool_size) {
+ // If the user has requested that extra output surfaces be
+ // available then add them here.
+ if (avctx->extra_hw_frames > 0)
+ frames_ctx->initial_pool_size += avctx->extra_hw_frames;
+
+ // If frame threading is enabled then an extra surface per thread
+ // is also required.
+ if (avctx->active_thread_type & FF_THREAD_FRAME)
+ frames_ctx->initial_pool_size += avctx->thread_count;
+ }
+
+ *out_frames_ref = frames_ref;
+ } else {
+ av_buffer_unref(&frames_ref);
}
+ return ret;
+}
+
+static int hwaccel_init(AVCodecContext *avctx,
+ const AVCodecHWConfigInternal *hw_config)
+{
+ const AVHWAccel *hwaccel;
+ int err;
- if (hwa->capabilities & AV_HWACCEL_CODEC_CAP_EXPERIMENTAL &&
+ hwaccel = hw_config->hwaccel;
+ if (hwaccel->capabilities & AV_HWACCEL_CODEC_CAP_EXPERIMENTAL &&
avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL) {
av_log(avctx, AV_LOG_WARNING, "Ignoring experimental hwaccel: %s\n",
- hwa->name);
+ hwaccel->name);
return AVERROR_PATCHWELCOME;
}
- if (hwa->priv_data_size) {
- avctx->internal->hwaccel_priv_data = av_mallocz(hwa->priv_data_size);
+ if (hwaccel->priv_data_size) {
+ avctx->internal->hwaccel_priv_data =
+ av_mallocz(hwaccel->priv_data_size);
if (!avctx->internal->hwaccel_priv_data)
return AVERROR(ENOMEM);
}
- avctx->hwaccel = hwa;
- if (hwa->init) {
- ret = hwa->init(avctx);
- if (ret < 0) {
+ avctx->hwaccel = hwaccel;
+ if (hwaccel->init) {
+ err = hwaccel->init(avctx);
+ if (err < 0) {
+ av_log(avctx, AV_LOG_ERROR, "Failed setup for format %s: "
+ "hwaccel initialisation returned error.\n",
+ av_get_pix_fmt_name(hw_config->public.pix_fmt));
av_freep(&avctx->internal->hwaccel_priv_data);
avctx->hwaccel = NULL;
- return ret;
+ return err;
}
}
return 0;
}
+static void hwaccel_uninit(AVCodecContext *avctx)
+{
+ if (avctx->hwaccel && avctx->hwaccel->uninit)
+ avctx->hwaccel->uninit(avctx);
+
+ av_freep(&avctx->internal->hwaccel_priv_data);
+
+ avctx->hwaccel = NULL;
+
+ av_buffer_unref(&avctx->hw_frames_ctx);
+}
+
int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
{
const AVPixFmtDescriptor *desc;
enum AVPixelFormat *choices;
- enum AVPixelFormat ret;
- unsigned n = 0;
-
- while (fmt[n] != AV_PIX_FMT_NONE)
- ++n;
-
+ enum AVPixelFormat ret, user_choice;
+ const AVCodecHWConfigInternal *hw_config;
+ const AVCodecHWConfig *config;
+ int i, n, err;
+
+ // Find end of list.
+ for (n = 0; fmt[n] != AV_PIX_FMT_NONE; n++);
+ // Must contain at least one entry.
av_assert0(n >= 1);
- avctx->sw_pix_fmt = fmt[n - 1];
- av_assert2(!is_hwaccel_pix_fmt(avctx->sw_pix_fmt));
+ // If a software format is available, it must be the last entry.
+ desc = av_pix_fmt_desc_get(fmt[n - 1]);
+ if (desc->flags & AV_PIX_FMT_FLAG_HWACCEL) {
+ // No software format is available.
+ } else {
+ avctx->sw_pix_fmt = fmt[n - 1];
+ }
choices = av_malloc_array(n + 1, sizeof(*choices));
if (!choices)
@@ -1202,48 +1324,108 @@ int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt)
memcpy(choices, fmt, (n + 1) * sizeof(*choices));
for (;;) {
- if (avctx->hwaccel && avctx->hwaccel->uninit)
- avctx->hwaccel->uninit(avctx);
- av_freep(&avctx->internal->hwaccel_priv_data);
- avctx->hwaccel = NULL;
-
- av_buffer_unref(&avctx->hw_frames_ctx);
+ // Remove the previous hwaccel, if there was one.
+ hwaccel_uninit(avctx);
- ret = avctx->get_format(avctx, choices);
+ user_choice = avctx->get_format(avctx, choices);
+ if (user_choice == AV_PIX_FMT_NONE) {
+ // Explicitly chose nothing, give up.
+ ret = AV_PIX_FMT_NONE;
+ break;
+ }
- desc = av_pix_fmt_desc_get(ret);
+ desc = av_pix_fmt_desc_get(user_choice);
if (!desc) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid format returned by "
+ "get_format() callback.\n");
ret = AV_PIX_FMT_NONE;
break;
}
+ av_log(avctx, AV_LOG_DEBUG, "Format %s chosen by get_format().\n",
+ desc->name);
- if (!(desc->flags & AV_PIX_FMT_FLAG_HWACCEL))
- break;
-#if FF_API_CAP_VDPAU
- if (avctx->codec->capabilities&AV_CODEC_CAP_HWACCEL_VDPAU)
+ for (i = 0; i < n; i++) {
+ if (choices[i] == user_choice)
+ break;
+ }
+ if (i == n) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid return from get_format(): "
+ "%s not in possible list.\n", desc->name);
break;
-#endif
+ }
- if (avctx->hw_frames_ctx) {
- AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
- if (hw_frames_ctx->format != ret) {
- av_log(avctx, AV_LOG_ERROR, "Format returned from get_buffer() "
- "does not match the format of provided AVHWFramesContext\n");
- ret = AV_PIX_FMT_NONE;
- break;
+ if (avctx->codec->hw_configs) {
+ for (i = 0;; i++) {
+ hw_config = avctx->codec->hw_configs[i];
+ if (!hw_config)
+ break;
+ if (hw_config->public.pix_fmt == user_choice)
+ break;
}
+ } else {
+ hw_config = NULL;
}
- if (!setup_hwaccel(avctx, ret, desc->name))
+ if (!hw_config) {
+ // No config available, so no extra setup required.
+ ret = user_choice;
break;
+ }
+ config = &hw_config->public;
+
+ if (config->methods &
+ AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX &&
+ avctx->hw_frames_ctx) {
+ const AVHWFramesContext *frames_ctx =
+ (AVHWFramesContext*)avctx->hw_frames_ctx->data;
+ if (frames_ctx->format != user_choice) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid setup for format %s: "
+ "does not match the format of the provided frames "
+ "context.\n", desc->name);
+ goto try_again;
+ }
+ } else if (config->methods &
+ AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX &&
+ avctx->hw_device_ctx) {
+ const AVHWDeviceContext *device_ctx =
+ (AVHWDeviceContext*)avctx->hw_device_ctx->data;
+ if (device_ctx->type != config->device_type) {
+ av_log(avctx, AV_LOG_ERROR, "Invalid setup for format %s: "
+ "does not match the type of the provided device "
+ "context.\n", desc->name);
+ goto try_again;
+ }
+ } else if (config->methods &
+ AV_CODEC_HW_CONFIG_METHOD_INTERNAL) {
+ // Internal-only setup, no additional configuration.
+ } else if (config->methods &
+ AV_CODEC_HW_CONFIG_METHOD_AD_HOC) {
+ // Some ad-hoc configuration we can't see and can't check.
+ } else {
+ av_log(avctx, AV_LOG_ERROR, "Invalid setup for format %s: "
+ "missing configuration.\n", desc->name);
+ goto try_again;
+ }
+ if (hw_config->hwaccel) {
+ av_log(avctx, AV_LOG_DEBUG, "Format %s requires hwaccel "
+ "initialisation.\n", desc->name);
+ err = hwaccel_init(avctx, hw_config);
+ if (err < 0)
+ goto try_again;
+ }
+ ret = user_choice;
+ break;
- /* Remove failed hwaccel from choices */
- for (n = 0; choices[n] != ret; n++)
- av_assert0(choices[n] != AV_PIX_FMT_NONE);
-
- do
- choices[n] = choices[n + 1];
- while (choices[n++] != AV_PIX_FMT_NONE);
+ try_again:
+ av_log(avctx, AV_LOG_DEBUG, "Format %s not usable, retrying "
+ "get_format() without it.\n", desc->name);
+ for (i = 0; i < n; i++) {
+ if (choices[i] == user_choice)
+ break;
+ }
+ for (; i + 1 < n; i++)
+ choices[i] = choices[i + 1];
+ --n;
}
av_freep(&choices);
@@ -1432,7 +1614,7 @@ static int video_get_buffer(AVCodecContext *s, AVFrame *pic)
pic->linesize[i] = 0;
}
if (desc->flags & AV_PIX_FMT_FLAG_PAL ||
- desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)
+ ((desc->flags & FF_PSEUDOPAL) && pic->data[1]))
avpriv_set_systematic_pal2((uint32_t *)pic->data[1], pic->format);
if (s->debug & FF_DEBUG_BUFFERS)
@@ -1480,7 +1662,7 @@ static int add_metadata_from_side_data(const AVPacket *avpkt, AVFrame *frame)
return av_packet_unpack_dictionary(side_metadata, size, frame_md);
}
-int ff_init_buffer_info(AVCodecContext *avctx, AVFrame *frame)
+int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
{
const AVPacket *pkt = avctx->internal->last_pkt_props;
int i;
@@ -1588,11 +1770,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
return 0;
}
-int ff_decode_frame_props(AVCodecContext *avctx, AVFrame *frame)
-{
- return ff_init_buffer_info(avctx, frame);
-}
-
static void validate_avframe_allocation(AVCodecContext *avctx, AVFrame *frame)
{
if (avctx->codec_type == AVMEDIA_TYPE_VIDEO) {
@@ -1602,12 +1779,11 @@ static void validate_avframe_allocation(AVCodecContext *avctx, AVFrame *frame)
int flags = desc ? desc->flags : 0;
if (num_planes == 1 && (flags & AV_PIX_FMT_FLAG_PAL))
num_planes = 2;
+ if ((flags & FF_PSEUDOPAL) && frame->data[1])
+ num_planes = 2;
for (i = 0; i < num_planes; i++) {
av_assert0(frame->data[i]);
}
- // For now do not enforce anything for palette of pseudopal formats
- if (num_planes == 1 && (flags & AV_PIX_FMT_FLAG_PSEUDOPAL))
- num_planes = 2;
// For formats without data like hwaccel allow unused pointers to be non-NULL.
for (i = num_planes; num_planes > 0 && i < FF_ARRAY_ELEMS(frame->data); i++) {
if (frame->data[i])
@@ -1617,6 +1793,43 @@ static void validate_avframe_allocation(AVCodecContext *avctx, AVFrame *frame)
}
}
+static void decode_data_free(void *opaque, uint8_t *data)
+{
+ FrameDecodeData *fdd = (FrameDecodeData*)data;
+
+ if (fdd->post_process_opaque_free)
+ fdd->post_process_opaque_free(fdd->post_process_opaque);
+
+ if (fdd->hwaccel_priv_free)
+ fdd->hwaccel_priv_free(fdd->hwaccel_priv);
+
+ av_freep(&fdd);
+}
+
+int ff_attach_decode_data(AVFrame *frame)
+{
+ AVBufferRef *fdd_buf;
+ FrameDecodeData *fdd;
+
+ av_assert1(!frame->private_ref);
+ av_buffer_unref(&frame->private_ref);
+
+ fdd = av_mallocz(sizeof(*fdd));
+ if (!fdd)
+ return AVERROR(ENOMEM);
+
+ fdd_buf = av_buffer_create((uint8_t*)fdd, sizeof(*fdd), decode_data_free,
+ NULL, AV_BUFFER_FLAG_READONLY);
+ if (!fdd_buf) {
+ av_freep(&fdd);
+ return AVERROR(ENOMEM);
+ }
+
+ frame->private_ref = fdd_buf;
+
+ return 0;
+}
+
static int get_buffer_internal(AVCodecContext *avctx, AVFrame *frame, int flags)
{
const AVHWAccel *hwaccel = avctx->hwaccel;
@@ -1653,8 +1866,14 @@ static int get_buffer_internal(AVCodecContext *avctx, AVFrame *frame, int flags)
avctx->sw_pix_fmt = avctx->pix_fmt;
ret = avctx->get_buffer2(avctx, frame, flags);
- if (ret >= 0)
- validate_avframe_allocation(avctx, frame);
+ if (ret < 0)
+ goto end;
+
+ validate_avframe_allocation(avctx, frame);
+
+ ret = ff_attach_decode_data(frame);
+ if (ret < 0)
+ goto end;
end:
if (avctx->codec_type == AVMEDIA_TYPE_VIDEO && !override_dimensions &&
@@ -1663,6 +1882,9 @@ end:
frame->height = avctx->height;
}
+ if (ret < 0)
+ av_frame_unref(frame);
+
return ret;
}
@@ -1689,8 +1911,6 @@ static int reget_buffer_internal(AVCodecContext *avctx, AVFrame *frame)
av_frame_unref(frame);
}
- ff_init_buffer_info(avctx, frame);
-
if (!frame->data[0])
return ff_get_buffer(avctx, frame, AV_GET_BUFFER_FLAG_REF);
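The rewritten format negotiation in this file keys everything off avctx->hw_device_ctx and the per-codec hw_configs list: avcodec_default_get_format() prefers a hardware format matching an attached device, ff_get_format() validates whatever the get_format() callback returns, and hwaccel_init()/hwaccel_uninit() manage the hwaccel private state. From the calling application the usual shape is sketched below; VAAPI is only an assumed example device type, and error handling is trimmed.

    #include <libavcodec/avcodec.h>
    #include <libavutil/hwcontext.h>

    static enum AVPixelFormat get_hw_format(AVCodecContext *ctx,
                                            const enum AVPixelFormat *fmts)
    {
        for (const enum AVPixelFormat *p = fmts; *p != AV_PIX_FMT_NONE; p++)
            if (*p == AV_PIX_FMT_VAAPI)     /* the format our device can handle */
                return *p;
        return AV_PIX_FMT_NONE;             /* nothing usable, let lavc give up */
    }

    static int open_hw_decoder(AVCodecContext *avctx, const AVCodec *codec)
    {
        AVBufferRef *device = NULL;
        int ret = av_hwdevice_ctx_create(&device, AV_HWDEVICE_TYPE_VAAPI,
                                         NULL, NULL, 0);
        if (ret < 0)
            return ret;
        avctx->hw_device_ctx = av_buffer_ref(device);   /* decoder picks it up */
        avctx->get_format    = get_hw_format;
        ret = avcodec_open2(avctx, codec, NULL);
        av_buffer_unref(&device);
        return ret;
    }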
diff --git a/media/ffvpx/libavcodec/decode.h b/media/ffvpx/libavcodec/decode.h
index c9630228d..15271c529 100644
--- a/media/ffvpx/libavcodec/decode.h
+++ b/media/ffvpx/libavcodec/decode.h
@@ -21,9 +21,39 @@
#ifndef AVCODEC_DECODE_H
#define AVCODEC_DECODE_H
+#include "libavutil/buffer.h"
+#include "libavutil/frame.h"
+#include "libavutil/hwcontext.h"
+
#include "avcodec.h"
/**
+ * This struct stores per-frame lavc-internal data and is attached to it via
+ * private_ref.
+ */
+typedef struct FrameDecodeData {
+ /**
+ * The callback to perform some delayed processing on the frame right
+ * before it is returned to the caller.
+ *
+ * @note This code is called at some unspecified point after the frame is
+ * returned from the decoder's decode/receive_frame call. Therefore it cannot rely
+ * on AVCodecContext being in any specific state, so it does not get to
+ * access AVCodecContext directly at all. All the state it needs must be
+ * stored in the post_process_opaque object.
+ */
+ int (*post_process)(void *logctx, AVFrame *frame);
+ void *post_process_opaque;
+ void (*post_process_opaque_free)(void *opaque);
+
+ /**
+ * Per-frame private data for hwaccels.
+ */
+ void *hwaccel_priv;
+ void (*hwaccel_priv_free)(void *priv);
+} FrameDecodeData;
+
+/**
* Called by decoders to get the next packet for decoding.
*
* @param pkt An empty packet to be filled with data.
@@ -36,4 +66,14 @@ int ff_decode_get_packet(AVCodecContext *avctx, AVPacket *pkt);
void ff_decode_bsfs_uninit(AVCodecContext *avctx);
+/**
+ * Make sure avctx.hw_frames_ctx is set. If it's not set, the function will
+ * try to allocate it from hw_device_ctx. If that is not possible, an error
+ * message is printed, and an error code is returned.
+ */
+int ff_decode_get_hw_frames_ctx(AVCodecContext *avctx,
+ enum AVHWDeviceType dev_type);
+
+int ff_attach_decode_data(AVFrame *frame);
+
#endif /* AVCODEC_DECODE_H */
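The new FrameDecodeData travels with each decoded frame in frame->private_ref: ff_attach_decode_data() allocates it inside ff_get_buffer(), decode_receive_frame_internal() runs post_process right before the frame is returned, and decode_data_free() releases the opaque and hwaccel state. A self-contained sketch of that attach/run/free life cycle, using a local stand-in struct instead of the lavc internals:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct Frame Frame;
    typedef struct DecodeData {
        int   (*post_process)(void *logctx, Frame *frame);
        void   *post_process_opaque;
        void  (*post_process_opaque_free)(void *opaque);
    } DecodeData;

    struct Frame { DecodeData *private_data; };

    static int sharpen(void *logctx, Frame *frame)
    {
        (void)logctx; (void)frame;
        puts("post-processing frame");
        return 0;
    }

    int main(void)
    {
        Frame frame = { 0 };

        /* attach (what ff_attach_decode_data() does via an AVBufferRef) */
        frame.private_data = calloc(1, sizeof(*frame.private_data));
        if (!frame.private_data)
            return 1;
        frame.private_data->post_process = sharpen;

        /* run just before handing the frame back, then release the data */
        if (frame.private_data->post_process)
            frame.private_data->post_process(NULL, &frame);
        if (frame.private_data->post_process_opaque_free)
            frame.private_data->post_process_opaque_free(
                frame.private_data->post_process_opaque);
        free(frame.private_data);
        frame.private_data = NULL;
        return 0;
    }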
diff --git a/media/ffvpx/libavcodec/dummy_funcs.c b/media/ffvpx/libavcodec/dummy_funcs.c
index 21b469f7e..295ee84e1 100644
--- a/media/ffvpx/libavcodec/dummy_funcs.c
+++ b/media/ffvpx/libavcodec/dummy_funcs.c
@@ -13,20 +13,14 @@ typedef struct VP9DSPContext VP9DSPContext;
typedef struct FLACDSPContext FLACDSPContext;
AVHWAccel ff_h263_vaapi_hwaccel;
-AVHWAccel ff_h263_vdpau_hwaccel;
AVHWAccel ff_h263_videotoolbox_hwaccel;
AVHWAccel ff_h264_d3d11va_hwaccel;
AVHWAccel ff_h264_dxva2_hwaccel;
-AVHWAccel ff_h264_mmal_hwaccel;
-AVHWAccel ff_h264_qsv_hwaccel;
AVHWAccel ff_h264_vaapi_hwaccel;
-AVHWAccel ff_h264_vda_hwaccel;
-AVHWAccel ff_h264_vda_old_hwaccel;
AVHWAccel ff_h264_vdpau_hwaccel;
AVHWAccel ff_h264_videotoolbox_hwaccel;
AVHWAccel ff_hevc_d3d11va_hwaccel;
AVHWAccel ff_hevc_dxva2_hwaccel;
-AVHWAccel ff_hevc_qsv_hwaccel;
AVHWAccel ff_hevc_vaapi_hwaccel;
AVHWAccel ff_hevc_vdpau_hwaccel;
AVHWAccel ff_mpeg1_xvmc_hwaccel;
@@ -35,11 +29,9 @@ AVHWAccel ff_mpeg1_videotoolbox_hwaccel;
AVHWAccel ff_mpeg2_xvmc_hwaccel;
AVHWAccel ff_mpeg2_d3d11va_hwaccel;
AVHWAccel ff_mpeg2_dxva2_hwaccel;
-AVHWAccel ff_mpeg2_qsv_hwaccel;
AVHWAccel ff_mpeg2_vaapi_hwaccel;
AVHWAccel ff_mpeg2_vdpau_hwaccel;
AVHWAccel ff_mpeg2_videotoolbox_hwaccel;
-AVHWAccel ff_mpeg4_mmal_hwaccel;
AVHWAccel ff_mpeg4_vaapi_hwaccel;
AVHWAccel ff_mpeg4_vdpau_hwaccel;
AVHWAccel ff_mpeg4_videotoolbox_hwaccel;
@@ -47,40 +39,19 @@ AVHWAccel ff_vc1_d3d11va_hwaccel;
AVHWAccel ff_vc1_dxva2_hwaccel;
AVHWAccel ff_vc1_vaapi_hwaccel;
AVHWAccel ff_vc1_vdpau_hwaccel;
-AVHWAccel ff_vc1_qsv_hwaccel;
AVHWAccel ff_wmv3_d3d11va_hwaccel;
AVHWAccel ff_wmv3_dxva2_hwaccel;
AVHWAccel ff_wmv3_vaapi_hwaccel;
AVHWAccel ff_wmv3_vdpau_hwaccel;
-AVHWAccel ff_mpeg2_mmal_hwaccel;
-AVHWAccel ff_vc1_mmal_hwaccel;
AVHWAccel ff_vp9_d3d11va_hwaccel;
AVHWAccel ff_vp9_dxva2_hwaccel;
AVHWAccel ff_vp9_vaapi_hwaccel;
-AVHWAccel ff_vp9_cuvid_hwaccel;
-AVHWAccel ff_vp8_cuvid_hwaccel;
-AVHWAccel ff_vc1_cuvid_hwaccel;
-AVHWAccel ff_hevc_cuvid_hwaccel;
-AVHWAccel ff_h264_cuvid_hwaccel;
-/* Added by FFmpeg 3.2 */
-AVHWAccel ff_h263_cuvid_hwaccel;
-AVHWAccel ff_mjpeg_cuvid_hwaccel;
-AVHWAccel ff_mpeg1_cuvid_hwaccel;
-AVHWAccel ff_mpeg2_cuvid_hwaccel;
-AVHWAccel ff_mpeg4_cuvid_hwaccel;
-AVHWAccel ff_h264_mediacodec_hwaccel;
-AVHWAccel ff_hevc_mediacodec_hwaccel;
-AVHWAccel ff_mpeg4_mediacodec_hwaccel;
-AVHWAccel ff_vp8_mediacodec_hwaccel;
-AVHWAccel ff_vp9_mediacodec_hwaccel;
/* Added by FFmpeg 3.4 */
AVHWAccel ff_h264_d3d11va2_hwaccel;
AVHWAccel ff_hevc_d3d11va2_hwaccel;
AVHWAccel ff_hevc_videotoolbox_hwaccel;
AVHWAccel ff_mpeg2_d3d11va2_hwaccel;
-AVHWAccel ff_mpeg2_mediacodec_hwaccel;
AVHWAccel ff_vc1_d3d11va2_hwaccel;
-AVHWAccel ff_vp8_qsv_hwaccel;
AVHWAccel ff_vp9_d3d11va2_hwaccel;
AVHWAccel ff_wmv3_d3d11va2_hwaccel;
@@ -185,8 +156,6 @@ AVCodec ff_h264_decoder;
AVCodec ff_h264_crystalhd_decoder;
AVCodec ff_h264_mmal_decoder;
AVCodec ff_h264_qsv_decoder;
-AVCodec ff_h264_vda_decoder;
-AVCodec ff_h264_vdpau_decoder;
AVCodec ff_hap_encoder;
AVCodec ff_hap_decoder;
AVCodec ff_hevc_decoder;
@@ -221,7 +190,6 @@ AVCodec ff_mjpeg_decoder;
AVCodec ff_mjpegb_decoder;
AVCodec ff_mmvideo_decoder;
AVCodec ff_motionpixels_decoder;
-AVCodec ff_mpeg_xvmc_decoder;
AVCodec ff_mpeg1video_encoder;
AVCodec ff_mpeg1video_decoder;
AVCodec ff_mpeg2video_encoder;
@@ -230,10 +198,7 @@ AVCodec ff_mpeg4_encoder;
AVCodec ff_mpeg4_decoder;
AVCodec ff_mpeg4_crystalhd_decoder;
AVCodec ff_mpeg4_mmal_decoder;
-AVCodec ff_mpeg4_vdpau_decoder;
AVCodec ff_mpegvideo_decoder;
-AVCodec ff_mpeg_vdpau_decoder;
-AVCodec ff_mpeg1_vdpau_decoder;
AVCodec ff_mpeg2_crystalhd_decoder;
AVCodec ff_mpeg2_qsv_decoder;
AVCodec ff_msa1_decoder;
@@ -344,7 +309,6 @@ AVCodec ff_vb_decoder;
AVCodec ff_vble_decoder;
AVCodec ff_vc1_decoder;
AVCodec ff_vc1_crystalhd_decoder;
-AVCodec ff_vc1_vdpau_decoder;
AVCodec ff_vc1image_decoder;
AVCodec ff_vc1_qsv_decoder;
AVCodec ff_vc2_encoder;
@@ -365,7 +329,6 @@ AVCodec ff_wmv2_encoder;
AVCodec ff_wmv2_decoder;
AVCodec ff_wmv3_decoder;
AVCodec ff_wmv3_crystalhd_decoder;
-AVCodec ff_wmv3_vdpau_decoder;
AVCodec ff_wmv3image_decoder;
AVCodec ff_wnv1_decoder;
AVCodec ff_xan_wc3_decoder;
@@ -740,10 +703,8 @@ AVCodec ff_vp8_mediacodec_decoder;
AVCodec ff_mpeg4_mediacodec_decoder;
AVCodec ff_mpeg4_cuvid_decoder;
AVCodec ff_mpeg2_cuvid_decoder;
-AVCodec ff_mpeg1_cuvid_decoder;
AVCodec ff_mjpeg_cuvid_decoder;
AVCodec ff_hevc_mediacodec_decoder;
-AVCodec ff_h263_cuvid_decoder;
AVCodec ff_libopenh264_decoder;
AVCodec ff_pcm_s64le_decoder;
AVCodec ff_pcm_s64le_encoder;
@@ -800,6 +761,7 @@ AVCodec ff_vp8_v4l2m2m_encoder;
AVCodec ff_vp8_vaapi_encoder;
AVCodec ff_vp9_vaapi_encoder;
+
AVCodecParser ff_aac_parser;
AVCodecParser ff_aac_latm_parser;
AVCodecParser ff_ac3_parser;
@@ -815,6 +777,7 @@ AVCodecParser ff_dvaudio_parser;
AVCodecParser ff_dvbsub_parser;
AVCodecParser ff_dvdsub_parser;
AVCodecParser ff_dvd_nav_parser;
+AVCodecParser ff_flac_parser;
AVCodecParser ff_g729_parser;
AVCodecParser ff_gsm_parser;
AVCodecParser ff_h261_parser;
@@ -877,7 +840,9 @@ void ff_vp8dsp_init_mips(VP8DSPContext *c) {}
void ff_vp9dsp_init_mips(VP9DSPContext *dsp, int bpp) {}
void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp) {}
void ff_vp9dsp_init_arm(VP9DSPContext *dsp, int bpp) {}
+#if !defined(__arm__)
void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, int channels, int bps) {}
+#endif
#if !defined(HAVE_64BIT_BUILD)
void ff_flac_decorrelate_indep8_16_sse2(uint8_t **out, int32_t **in, int channels, int len, int shift) {}
void ff_flac_decorrelate_indep8_32_avx(uint8_t **out, int32_t **in, int channels, int len, int shift) {}
diff --git a/media/ffvpx/libavcodec/error_resilience.h b/media/ffvpx/libavcodec/error_resilience.h
index 27c200869..664a76565 100644
--- a/media/ffvpx/libavcodec/error_resilience.h
+++ b/media/ffvpx/libavcodec/error_resilience.h
@@ -20,6 +20,7 @@
#define AVCODEC_ERROR_RESILIENCE_H
#include <stdint.h>
+#include <stdatomic.h>
#include "avcodec.h"
#include "me_cmp.h"
@@ -60,7 +61,7 @@ typedef struct ERContext {
ptrdiff_t mb_stride;
ptrdiff_t b8_stride;
- volatile int error_count;
+ atomic_int error_count;
int error_occurred;
uint8_t *error_status_table;
uint8_t *er_temp_buffer;
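Switching error_count from volatile int to atomic_int matters because volatile only stops the compiler from caching the value; it does nothing for concurrent increments coming from slice or frame threads. With C11 <stdatomic.h> the update becomes an atomic read-modify-write, roughly as in this sketch:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int error_count = ATOMIC_VAR_INIT(0);

    /* may be called concurrently from several decoding threads */
    static void report_errors(int missed_mbs)
    {
        atomic_fetch_add(&error_count, missed_mbs);   /* no lost updates */
    }

    int main(void)
    {
        report_errors(3);
        report_errors(2);
        printf("%d\n", atomic_load(&error_count));    /* prints 5 */
        return 0;
    }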
diff --git a/media/ffvpx/libavcodec/flac_parser.c b/media/ffvpx/libavcodec/flac_parser.c
index 84da23f32..272128646 100644
--- a/media/ffvpx/libavcodec/flac_parser.c
+++ b/media/ffvpx/libavcodec/flac_parser.c
@@ -686,12 +686,17 @@ static int flac_parse(AVCodecParserContext *s, AVCodecContext *avctx,
}
for (curr = fpc->headers; curr; curr = curr->next) {
- if (curr->max_score > 0 &&
- (!fpc->best_header || curr->max_score > fpc->best_header->max_score)) {
+ if (!fpc->best_header || curr->max_score > fpc->best_header->max_score) {
fpc->best_header = curr;
}
}
+ if (fpc->best_header && fpc->best_header->max_score <= 0) {
+ // Only accept a bad header if there is no other option to continue
+ if (!buf_size || !buf || read_end != buf || fpc->nb_headers_buffered < FLAC_MIN_HEADERS)
+ fpc->best_header = NULL;
+ }
+
if (fpc->best_header) {
fpc->best_header_valid = 1;
if (fpc->best_header->offset > 0) {
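The parser change above relaxes header selection: the highest-scoring candidate always wins, but a candidate whose score is not positive is only kept when no further input can improve on it. Stripped of the FLAC specifics, the selection reduces to something like this sketch (end_of_input stands in for the buffer checks in the real code):

    #include <stddef.h>

    typedef struct Candidate { int max_score; struct Candidate *next; } Candidate;

    /* pick the best candidate; accept a bad one only when nothing better can arrive */
    static Candidate *pick_header(Candidate *list, int end_of_input)
    {
        Candidate *best = NULL;
        for (Candidate *c = list; c; c = c->next)
            if (!best || c->max_score > best->max_score)
                best = c;
        if (best && best->max_score <= 0 && !end_of_input)
            best = NULL;               /* keep buffering, wait for a better header */
        return best;
    }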
diff --git a/media/ffvpx/libavcodec/flacdec.c b/media/ffvpx/libavcodec/flacdec.c
index 3d41a1af7..c8eb45604 100644
--- a/media/ffvpx/libavcodec/flacdec.c
+++ b/media/ffvpx/libavcodec/flacdec.c
@@ -220,20 +220,27 @@ static int get_metadata_size(const uint8_t *buf, int buf_size)
static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order)
{
+ GetBitContext gb = s->gb;
int i, tmp, partition, method_type, rice_order;
int rice_bits, rice_esc;
int samples;
- method_type = get_bits(&s->gb, 2);
+ method_type = get_bits(&gb, 2);
+ rice_order = get_bits(&gb, 4);
+
+ samples = s->blocksize >> rice_order;
+ rice_bits = 4 + method_type;
+ rice_esc = (1 << rice_bits) - 1;
+
+ decoded += pred_order;
+ i = pred_order;
+
if (method_type > 1) {
av_log(s->avctx, AV_LOG_ERROR, "illegal residual coding method %d\n",
method_type);
return AVERROR_INVALIDDATA;
}
- rice_order = get_bits(&s->gb, 4);
-
- samples= s->blocksize >> rice_order;
if (samples << rice_order != s->blocksize) {
av_log(s->avctx, AV_LOG_ERROR, "invalid rice order: %i blocksize %i\n",
rice_order, s->blocksize);
@@ -246,21 +253,16 @@ static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order)
return AVERROR_INVALIDDATA;
}
- rice_bits = 4 + method_type;
- rice_esc = (1 << rice_bits) - 1;
-
- decoded += pred_order;
- i= pred_order;
for (partition = 0; partition < (1 << rice_order); partition++) {
- tmp = get_bits(&s->gb, rice_bits);
+ tmp = get_bits(&gb, rice_bits);
if (tmp == rice_esc) {
- tmp = get_bits(&s->gb, 5);
+ tmp = get_bits(&gb, 5);
for (; i < samples; i++)
- *decoded++ = get_sbits_long(&s->gb, tmp);
+ *decoded++ = get_sbits_long(&gb, tmp);
} else {
int real_limit = tmp ? (INT_MAX >> tmp) + 2 : INT_MAX;
for (; i < samples; i++) {
- int v = get_sr_golomb_flac(&s->gb, tmp, real_limit, 0);
+ int v = get_sr_golomb_flac(&gb, tmp, real_limit, 0);
if (v == 0x80000000){
av_log(s->avctx, AV_LOG_ERROR, "invalid residual\n");
return AVERROR_INVALIDDATA;
@@ -272,6 +274,8 @@ static int decode_residuals(FLACContext *s, int32_t *decoded, int pred_order)
i= 0;
}
+ s->gb = gb;
+
return 0;
}
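
decode_residuals() now reads through a local copy (`GetBitContext gb = s->gb;`) and writes it back only on the successful path, so an early AVERROR_INVALIDDATA return leaves s->gb untouched, and the hot Rice-decoding loops work on a struct the compiler can keep in registers instead of reloading s->gb through a pointer. A generic sketch of the same copy-in/copy-out idiom on a hypothetical cursor type (not the ffvpx bit reader):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical byte cursor, playing the role GetBitContext plays above. */
typedef struct Cursor { const uint8_t *p; size_t left; } Cursor;

static int read_byte(Cursor *c) { return c->left ? (c->left--, *c->p++) : -1; }

typedef struct Ctx { Cursor cur; } Ctx;

/* Copy the cursor in, run the hot loop on the local copy, copy it back.
 * Behaviour matches operating on ctx->cur directly, except that an early
 * exit before the copy-out leaves ctx->cur unchanged. */
static long sum_bytes(Ctx *ctx, int n)
{
    Cursor c = ctx->cur;          /* copy in  */
    long sum = 0;
    for (int i = 0; i < n; i++) {
        int v = read_byte(&c);
        if (v < 0)
            return -1;            /* bail out: ctx->cur not advanced */
        sum += v;
    }
    ctx->cur = c;                 /* copy out */
    return sum;
}
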
diff --git a/media/ffvpx/libavcodec/get_bits.h b/media/ffvpx/libavcodec/get_bits.h
index 0c7f5ff0c..56ef5f0cb 100644
--- a/media/ffvpx/libavcodec/get_bits.h
+++ b/media/ffvpx/libavcodec/get_bits.h
@@ -32,6 +32,7 @@
#include "libavutil/intreadwrite.h"
#include "libavutil/log.h"
#include "libavutil/avassert.h"
+#include "avcodec.h"
#include "mathops.h"
#include "vlc.h"
@@ -201,6 +202,13 @@ static inline int get_bits_count(const GetBitContext *s)
return s->index;
}
+/**
+ * Skips the specified number of bits.
+ * @param n the number of bits to skip,
+ * For the UNCHECKED_BITSTREAM_READER this must not cause the distance
+ * from the start to overflow int32_t. Staying within the bitstream + padding
+ * is sufficient, too.
+ */
static inline void skip_bits_long(GetBitContext *s, int n)
{
#if UNCHECKED_BITSTREAM_READER
@@ -428,7 +436,7 @@ static inline int init_get_bits(GetBitContext *s, const uint8_t *buffer,
int buffer_size;
int ret = 0;
- if (bit_size >= INT_MAX - 7 || bit_size < 0 || !buffer) {
+ if (bit_size >= INT_MAX - FFMAX(7, AV_INPUT_BUFFER_PADDING_SIZE*8) || bit_size < 0 || !buffer) {
bit_size = 0;
buffer = NULL;
ret = AVERROR_INVALIDDATA;
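
init_get_bits() now rejects any bit_size above INT_MAX minus max(7, AV_INPUT_BUFFER_PADDING_SIZE*8), so later index arithmetic that rounds up to whole bytes or steps into the padding region cannot overflow an int. The same style of up-front validation, sketched for a hypothetical reader (PADDING_BITS is an invented constant, not the ffvpx one):

#include <limits.h>
#include <stdint.h>

#define PADDING_BITS (64 * 8)   /* hypothetical padding size, in bits */

/* Reject sizes for which "bit_size + padding" or "round up to a whole
 * byte" could overflow int, before any of that arithmetic happens. */
static int validate_bit_size(const uint8_t *buf, int bit_size)
{
    int max = INT_MAX - (PADDING_BITS > 7 ? PADDING_BITS : 7);
    if (!buf || bit_size < 0 || bit_size >= max)
        return -1;   /* caller should treat this as invalid data */
    return 0;
}
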
diff --git a/media/ffvpx/libavcodec/hwaccel.h b/media/ffvpx/libavcodec/hwaccel.h
index 124fbbf1f..3aaa92571 100644
--- a/media/ffvpx/libavcodec/hwaccel.h
+++ b/media/ffvpx/libavcodec/hwaccel.h
@@ -19,6 +19,66 @@
#ifndef AVCODEC_HWACCEL_H
#define AVCODEC_HWACCEL_H
+#include "avcodec.h"
+#include "hwaccels.h"
+
+
#define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
+
+typedef struct AVCodecHWConfigInternal {
+ /**
+ * This is the structure which will be returned to the user by
+ * avcodec_get_hw_config().
+ */
+ AVCodecHWConfig public;
+ /**
+ * If this configuration uses a hwaccel, a pointer to it.
+ * If not, NULL.
+ */
+ const AVHWAccel *hwaccel;
+} AVCodecHWConfigInternal;
+
+
+// These macros are used to simplify AVCodecHWConfigInternal definitions.
+
+#define HW_CONFIG_HWACCEL(device, frames, ad_hoc, format, device_type_, name) \
+ &(const AVCodecHWConfigInternal) { \
+ .public = { \
+ .pix_fmt = AV_PIX_FMT_ ## format, \
+ .methods = (device ? AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX : 0) | \
+ (frames ? AV_CODEC_HW_CONFIG_METHOD_HW_FRAMES_CTX : 0) | \
+ (ad_hoc ? AV_CODEC_HW_CONFIG_METHOD_AD_HOC : 0), \
+ .device_type = AV_HWDEVICE_TYPE_ ## device_type_, \
+ }, \
+ .hwaccel = &name, \
+ }
+
+#define HW_CONFIG_INTERNAL(format) \
+ &(const AVCodecHWConfigInternal) { \
+ .public = { \
+ .pix_fmt = AV_PIX_FMT_ ## format, \
+ .methods = AV_CODEC_HW_CONFIG_METHOD_INTERNAL, \
+ .device_type = AV_HWDEVICE_TYPE_NONE, \
+ }, \
+ .hwaccel = NULL, \
+ }
+
+#define HWACCEL_DXVA2(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 1, DXVA2_VLD, DXVA2, ff_ ## codec ## _dxva2_hwaccel)
+#define HWACCEL_D3D11VA2(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
+#define HWACCEL_NVDEC(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
+#define HWACCEL_VAAPI(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel)
+#define HWACCEL_VDPAU(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 1, VDPAU, VDPAU, ff_ ## codec ## _vdpau_hwaccel)
+#define HWACCEL_VIDEOTOOLBOX(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 1, VIDEOTOOLBOX, VIDEOTOOLBOX, ff_ ## codec ## _videotoolbox_hwaccel)
+#define HWACCEL_D3D11VA(codec) \
+ HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel)
+#define HWACCEL_XVMC(codec) \
+ HW_CONFIG_HWACCEL(0, 0, 1, XVMC, NONE, ff_ ## codec ## _xvmc_hwaccel)
+
#endif /* AVCODEC_HWACCEL_H */
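
The new HW_CONFIG_* macros lean on C99 compound literals: at file scope, `&(const AVCodecHWConfigInternal){ ... }` yields a pointer to an anonymous const object with static storage duration, so a decoder can declare its hardware configurations as a flat array of pointers with no runtime setup. A self-contained sketch of that idiom with made-up types (HWConfig here is not the FFmpeg struct):

#include <stdio.h>

/* Hypothetical stand-in for AVCodecHWConfig(Internal). */
typedef struct HWConfig { int pix_fmt; int methods; } HWConfig;

/* Same shape as HW_CONFIG_HWACCEL: the macro expands to the address of a
 * const compound literal, so the table below is just pointers. */
#define CONFIG_ENTRY(fmt, m) \
    &(const HWConfig){ .pix_fmt = (fmt), .methods = (m) }

static const HWConfig *const hw_configs[] = {
    CONFIG_ENTRY(1, 0x1),
    CONFIG_ENTRY(2, 0x3),
    NULL,
};

int main(void)
{
    for (int i = 0; hw_configs[i]; i++)
        printf("fmt=%d methods=%#x\n", hw_configs[i]->pix_fmt,
               hw_configs[i]->methods);
    return 0;
}
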
diff --git a/media/ffvpx/libavcodec/hwaccels.h b/media/ffvpx/libavcodec/hwaccels.h
new file mode 100644
index 000000000..7d73da867
--- /dev/null
+++ b/media/ffvpx/libavcodec/hwaccels.h
@@ -0,0 +1,78 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_HWACCELS_H
+#define AVCODEC_HWACCELS_H
+
+#include "avcodec.h"
+
+extern const AVHWAccel ff_h263_vaapi_hwaccel;
+extern const AVHWAccel ff_h263_videotoolbox_hwaccel;
+extern const AVHWAccel ff_h264_d3d11va_hwaccel;
+extern const AVHWAccel ff_h264_d3d11va2_hwaccel;
+extern const AVHWAccel ff_h264_dxva2_hwaccel;
+extern const AVHWAccel ff_h264_nvdec_hwaccel;
+extern const AVHWAccel ff_h264_vaapi_hwaccel;
+extern const AVHWAccel ff_h264_vdpau_hwaccel;
+extern const AVHWAccel ff_h264_videotoolbox_hwaccel;
+extern const AVHWAccel ff_hevc_d3d11va_hwaccel;
+extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
+extern const AVHWAccel ff_hevc_dxva2_hwaccel;
+extern const AVHWAccel ff_hevc_nvdec_hwaccel;
+extern const AVHWAccel ff_hevc_vaapi_hwaccel;
+extern const AVHWAccel ff_hevc_vdpau_hwaccel;
+extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
+extern const AVHWAccel ff_mjpeg_nvdec_hwaccel;
+extern const AVHWAccel ff_mjpeg_vaapi_hwaccel;
+extern const AVHWAccel ff_mpeg1_nvdec_hwaccel;
+extern const AVHWAccel ff_mpeg1_vdpau_hwaccel;
+extern const AVHWAccel ff_mpeg1_videotoolbox_hwaccel;
+extern const AVHWAccel ff_mpeg1_xvmc_hwaccel;
+extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel;
+extern const AVHWAccel ff_mpeg2_d3d11va2_hwaccel;
+extern const AVHWAccel ff_mpeg2_nvdec_hwaccel;
+extern const AVHWAccel ff_mpeg2_dxva2_hwaccel;
+extern const AVHWAccel ff_mpeg2_vaapi_hwaccel;
+extern const AVHWAccel ff_mpeg2_vdpau_hwaccel;
+extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel;
+extern const AVHWAccel ff_mpeg2_xvmc_hwaccel;
+extern const AVHWAccel ff_mpeg4_nvdec_hwaccel;
+extern const AVHWAccel ff_mpeg4_vaapi_hwaccel;
+extern const AVHWAccel ff_mpeg4_vdpau_hwaccel;
+extern const AVHWAccel ff_mpeg4_videotoolbox_hwaccel;
+extern const AVHWAccel ff_vc1_d3d11va_hwaccel;
+extern const AVHWAccel ff_vc1_d3d11va2_hwaccel;
+extern const AVHWAccel ff_vc1_dxva2_hwaccel;
+extern const AVHWAccel ff_vc1_nvdec_hwaccel;
+extern const AVHWAccel ff_vc1_vaapi_hwaccel;
+extern const AVHWAccel ff_vc1_vdpau_hwaccel;
+extern const AVHWAccel ff_vp8_nvdec_hwaccel;
+extern const AVHWAccel ff_vp8_vaapi_hwaccel;
+extern const AVHWAccel ff_vp9_d3d11va_hwaccel;
+extern const AVHWAccel ff_vp9_d3d11va2_hwaccel;
+extern const AVHWAccel ff_vp9_dxva2_hwaccel;
+extern const AVHWAccel ff_vp9_nvdec_hwaccel;
+extern const AVHWAccel ff_vp9_vaapi_hwaccel;
+extern const AVHWAccel ff_wmv3_d3d11va_hwaccel;
+extern const AVHWAccel ff_wmv3_d3d11va2_hwaccel;
+extern const AVHWAccel ff_wmv3_dxva2_hwaccel;
+extern const AVHWAccel ff_wmv3_nvdec_hwaccel;
+extern const AVHWAccel ff_wmv3_vaapi_hwaccel;
+extern const AVHWAccel ff_wmv3_vdpau_hwaccel;
+
+#endif /* AVCODEC_HWACCELS_H */
diff --git a/media/ffvpx/libavcodec/idctdsp.h b/media/ffvpx/libavcodec/idctdsp.h
index 26221f6a9..ca21a31a0 100644
--- a/media/ffvpx/libavcodec/idctdsp.h
+++ b/media/ffvpx/libavcodec/idctdsp.h
@@ -95,6 +95,8 @@ typedef struct IDCTDSPContext {
*/
uint8_t idct_permutation[64];
enum idct_permutation_type perm_type;
+
+ int mpeg4_studio_profile;
} IDCTDSPContext;
void ff_put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
diff --git a/media/ffvpx/libavcodec/imgconvert.c b/media/ffvpx/libavcodec/imgconvert.c
index 1547f1896..1fd636c83 100644
--- a/media/ffvpx/libavcodec/imgconvert.c
+++ b/media/ffvpx/libavcodec/imgconvert.c
@@ -69,10 +69,15 @@ enum AVPixelFormat avcodec_find_best_pix_fmt_of_list(const enum AVPixelFormat *p
int i;
enum AVPixelFormat best = AV_PIX_FMT_NONE;
+ int loss;
- for(i=0; pix_fmt_list[i] != AV_PIX_FMT_NONE; i++)
- best = avcodec_find_best_pix_fmt_of_2(best, pix_fmt_list[i], src_pix_fmt, has_alpha, loss_ptr);
+ for (i=0; pix_fmt_list[i] != AV_PIX_FMT_NONE; i++) {
+ loss = loss_ptr ? *loss_ptr : 0;
+ best = avcodec_find_best_pix_fmt_of_2(best, pix_fmt_list[i], src_pix_fmt, has_alpha, &loss);
+ }
+ if (loss_ptr)
+ *loss_ptr = loss;
return best;
}
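
avcodec_find_best_pix_fmt_of_list() now reseeds a local loss mask from *loss_ptr on each iteration and stores the final value back, so the reported loss describes the format that actually won rather than the last comparison made. Assuming the public signature shown above is what this tree's headers export, a call looks roughly like this (the candidate list is an arbitrary example):

#include <libavcodec/avcodec.h>
#include <libavutil/pixdesc.h>
#include <stdio.h>

int main(void)
{
    /* Caller-supplied list of candidate output formats, NONE-terminated. */
    const enum AVPixelFormat candidates[] = {
        AV_PIX_FMT_YUV420P, AV_PIX_FMT_RGB24, AV_PIX_FMT_GRAY8,
        AV_PIX_FMT_NONE
    };
    int loss = 0;   /* 0: take every kind of loss into account */

    enum AVPixelFormat best =
        avcodec_find_best_pix_fmt_of_list(candidates, AV_PIX_FMT_YUV444P,
                                          0 /* has_alpha */, &loss);
    printf("best: %s, loss mask: %#x\n", av_get_pix_fmt_name(best), loss);
    return 0;
}
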
diff --git a/media/ffvpx/libavcodec/internal.h b/media/ffvpx/libavcodec/internal.h
index faa923c11..bb92873d7 100644
--- a/media/ffvpx/libavcodec/internal.h
+++ b/media/ffvpx/libavcodec/internal.h
@@ -76,22 +76,20 @@
#endif
-#if !FF_API_QUANT_BIAS
#define FF_DEFAULT_QUANT_BIAS 999999
-#endif
-#if !FF_API_QSCALE_TYPE
#define FF_QSCALE_TYPE_MPEG1 0
#define FF_QSCALE_TYPE_MPEG2 1
#define FF_QSCALE_TYPE_H264 2
#define FF_QSCALE_TYPE_VP56 3
-#endif
#define FF_SANE_NB_CHANNELS 64U
#define FF_SIGNBIT(x) ((x) >> CHAR_BIT * sizeof(x) - 1)
-#if HAVE_SIMD_ALIGN_32
+#if HAVE_SIMD_ALIGN_64
+# define STRIDE_ALIGN 64 /* AVX-512 */
+#elif HAVE_SIMD_ALIGN_32
# define STRIDE_ALIGN 32
#elif HAVE_SIMD_ALIGN_16
# define STRIDE_ALIGN 16
@@ -237,21 +235,8 @@ int ff_match_2uint16(const uint16_t (*tab)[2], int size, int a, int b);
unsigned int avpriv_toupper4(unsigned int x);
-/**
- * does needed setup of pkt_pts/pos and such for (re)get_buffer();
- */
-int ff_init_buffer_info(AVCodecContext *s, AVFrame *frame);
-
-
void ff_color_frame(AVFrame *frame, const int color[4]);
-extern volatile int ff_avcodec_locked;
-int ff_lock_avcodec(AVCodecContext *log_ctx, const AVCodec *codec);
-int ff_unlock_avcodec(const AVCodec *codec);
-
-int avpriv_lock_avformat(void);
-int avpriv_unlock_avformat(void);
-
/**
* Maximum size in bytes of extradata.
* This value was chosen such that every bit of the buffer is
@@ -373,14 +358,16 @@ int ff_set_sar(AVCodecContext *avctx, AVRational sar);
int ff_side_data_update_matrix_encoding(AVFrame *frame,
enum AVMatrixEncoding matrix_encoding);
-#if FF_API_MERGE_SD
-int ff_packet_split_and_drop_side_data(AVPacket *pkt);
-#endif
-
/**
* Select the (possibly hardware accelerated) pixel format.
* This is a wrapper around AVCodecContext.get_format() and should be used
* instead of calling get_format() directly.
+ *
+ * The list of pixel formats must contain at least one valid entry, and is
+ * terminated with AV_PIX_FMT_NONE. If it is possible to decode to software,
+ * the last entry in the list must be the most accurate software format.
+ * If it is not possible to decode to software, AVCodecContext.sw_pix_fmt
+ * must be set before calling this function.
*/
int ff_get_format(AVCodecContext *avctx, const enum AVPixelFormat *fmt);
@@ -417,4 +404,10 @@ int ff_alloc_a53_sei(const AVFrame *frame, size_t prefix_len,
*/
int64_t ff_guess_coded_bitrate(AVCodecContext *avctx);
+#if defined(_WIN32) && CONFIG_SHARED && !defined(BUILDING_avcodec)
+# define av_export_avcodec __declspec(dllimport)
+#else
+# define av_export_avcodec
+#endif
+
#endif /* AVCODEC_INTERNAL_H */
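
av_export_avcodec marks data symbols (for example the avpriv_pix_fmt_bps_* tables patched in raw.h below) as __declspec(dllimport) when a separate FFmpeg library is built against a shared libavcodec on Windows, and expands to nothing everywhere else; the hunk only defines the import side, and how the symbols are exported from avcodec itself is outside this diff. A condensed sketch of the general dllimport/dllexport pattern, with hypothetical names (MYLIB_*):

/* mylib_export.h -- hypothetical illustration of the same idiom. */
#if defined(_WIN32) && defined(MYLIB_SHARED)
# if defined(MYLIB_BUILDING)           /* compiling the DLL itself      */
#  define MYLIB_API __declspec(dllexport)
# else                                 /* compiling a client of the DLL */
#  define MYLIB_API __declspec(dllimport)
# endif
#else
# define MYLIB_API                     /* static or non-Windows builds  */
#endif

/* A data table owned by the DLL, declared for its clients: */
extern MYLIB_API const int mylib_magic_table[16];
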
diff --git a/media/ffvpx/libavcodec/me_cmp.h b/media/ffvpx/libavcodec/me_cmp.h
index 0dbbcbb1d..0a589e3c3 100644
--- a/media/ffvpx/libavcodec/me_cmp.h
+++ b/media/ffvpx/libavcodec/me_cmp.h
@@ -23,7 +23,7 @@
#include "avcodec.h"
-extern uint32_t ff_square_tab[512];
+extern const uint32_t ff_square_tab[512];
/* minimum alignment rules ;)
@@ -79,8 +79,6 @@ typedef struct MECmpContext {
me_cmp_func median_sad[6];
} MECmpContext;
-void ff_me_cmp_init_static(void);
-
int ff_check_alignment(void);
void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
diff --git a/media/ffvpx/libavcodec/moz.build b/media/ffvpx/libavcodec/moz.build
index 9980e1556..05217a6e4 100644
--- a/media/ffvpx/libavcodec/moz.build
+++ b/media/ffvpx/libavcodec/moz.build
@@ -12,7 +12,6 @@ if CONFIG['FFVPX_ASFLAGS']:
SharedLibrary('mozavcodec')
SOURCES += [
'allcodecs.c',
- 'audioconvert.c',
'avpacket.c',
'avpicture.c',
'bitstream.c',
@@ -41,8 +40,6 @@ SOURCES += [
'pthread_slice.c',
'qsv_api.c',
'raw.c',
- 'resample.c',
- 'resample2.c',
'reverse.c',
'utils.c',
'videodsp.c',
@@ -53,6 +50,7 @@ SOURCES += [
'vp8dsp.c',
'vp9.c',
'vp9_parser.c',
+ 'vp9_superframe_split_bsf.c',
'vp9block.c',
'vp9data.c',
'vp9dsp.c',
diff --git a/media/ffvpx/libavcodec/mpegutils.h b/media/ffvpx/libavcodec/mpegutils.h
index 9cfadfc4c..1ed21c19b 100644
--- a/media/ffvpx/libavcodec/mpegutils.h
+++ b/media/ffvpx/libavcodec/mpegutils.h
@@ -48,7 +48,6 @@
#define MAX_FCODE 7
/* MB types */
-#if !FF_API_MB_TYPE
#define MB_TYPE_INTRA4x4 (1 << 0)
#define MB_TYPE_INTRA16x16 (1 << 1) // FIXME H.264-specific
#define MB_TYPE_INTRA_PCM (1 << 2) // FIXME H.264-specific
@@ -70,7 +69,6 @@
#define MB_TYPE_L0L1 (MB_TYPE_L0 | MB_TYPE_L1)
#define MB_TYPE_QUANT (1 << 16)
#define MB_TYPE_CBP (1 << 17)
-#endif
#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 // default mb_type if there is just one type
@@ -139,4 +137,12 @@ void ff_draw_horiz_band(AVCodecContext *avctx, AVFrame *cur, AVFrame *last,
int y, int h, int picture_structure, int first_field,
int low_delay);
+/**
+ * Print debugging info for the given picture.
+ */
+void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, uint8_t *mbskip_table,
+ uint32_t *mbtype_table, int8_t *qscale_table, int16_t (*motion_val[2])[2],
+ int *low_delay,
+ int mb_width, int mb_height, int mb_stride, int quarter_sample);
+
#endif /* AVCODEC_MPEGUTILS_H */
diff --git a/media/ffvpx/libavcodec/mpegvideo.h b/media/ffvpx/libavcodec/mpegvideo.h
index e9eb633d1..541909cbb 100644
--- a/media/ffvpx/libavcodec/mpegvideo.h
+++ b/media/ffvpx/libavcodec/mpegvideo.h
@@ -45,6 +45,7 @@
#include "mpegpicture.h"
#include "mpegvideodsp.h"
#include "mpegvideoencdsp.h"
+#include "mpegvideodata.h"
#include "pixblockdsp.h"
#include "put_bits.h"
#include "ratecontrol.h"
@@ -71,6 +72,8 @@
#define SLICE_MAX_START_CODE 0x000001af
#define EXT_START_CODE 0x000001b5
#define USER_START_CODE 0x000001b2
+#define SLICE_START_CODE 0x000001b7
+
/**
* MpegEncContext.
@@ -252,9 +255,6 @@ typedef struct MpegEncContext {
int16_t (*b_field_mv_table[2][2][2])[2];///< MV table (4MV per MB) interlaced B-frame encoding
uint8_t (*p_field_select_table[2]);
uint8_t (*b_field_select_table[2][2]);
-#if FF_API_MOTION_EST
- int me_method; ///< ME algorithm
-#endif
int motion_est; ///< ME algorithm
int me_penalty_compensation;
int me_pre; ///< prepass for motion estimation
@@ -381,6 +381,8 @@ typedef struct MpegEncContext {
int custom_pcf;
/* MPEG-4 specific */
+ int studio_profile;
+ int dct_precision;
///< number of bits to represent the fractional part of time (encoder only)
int time_increment_bits;
int last_time_base;
@@ -467,6 +469,13 @@ typedef struct MpegEncContext {
int intra_vlc_format;
int alternate_scan;
int seq_disp_ext;
+ int video_format;
+#define VIDEO_FORMAT_COMPONENT 0
+#define VIDEO_FORMAT_PAL 1
+#define VIDEO_FORMAT_NTSC 2
+#define VIDEO_FORMAT_SECAM 3
+#define VIDEO_FORMAT_MAC 4
+#define VIDEO_FORMAT_UNSPECIFIED 5
int repeat_first_field;
int chroma_420_type;
int chroma_format;
@@ -497,7 +506,10 @@ typedef struct MpegEncContext {
int16_t (*block)[64]; ///< points to one of the following blocks
int16_t (*blocks)[12][64]; // for HQ mode we need to keep the best block
- int (*decode_mb)(struct MpegEncContext *s, int16_t block[6][64]); // used by some codecs to avoid a switch()
+ int (*decode_mb)(struct MpegEncContext *s, int16_t block[12][64]); // used by some codecs to avoid a switch()
+
+ int32_t (*block32)[12][64];
+
#define SLICE_OK 0
#define SLICE_ERROR -1
#define SLICE_END -2 ///<end marker found
@@ -699,10 +711,6 @@ void ff_mpeg_draw_horiz_band(MpegEncContext *s, int y, int h);
void ff_mpeg_flush(AVCodecContext *avctx);
void ff_print_debug_info(MpegEncContext *s, Picture *p, AVFrame *pict);
-void ff_print_debug_info2(AVCodecContext *avctx, AVFrame *pict, uint8_t *mbskip_table,
- uint32_t *mbtype_table, int8_t *qscale_table, int16_t (*motion_val[2])[2],
- int *low_delay,
- int mb_width, int mb_height, int mb_stride, int quarter_sample);
int ff_mpv_export_qp_table(MpegEncContext *s, AVFrame *f, Picture *p, int qp_type);
@@ -729,7 +737,8 @@ void ff_mpv_motion(MpegEncContext *s,
qpel_mc_func (*qpix_op)[16]);
static inline void ff_update_block_index(MpegEncContext *s){
- const int block_size= 8 >> s->avctx->lowres;
+ const int bytes_per_pixel = 1 + (s->avctx->bits_per_raw_sample > 8);
+ const int block_size= (8*bytes_per_pixel) >> s->avctx->lowres;
s->block_index[0]+=2;
s->block_index[1]+=2;
@@ -738,8 +747,8 @@ static inline void ff_update_block_index(MpegEncContext *s){
s->block_index[4]++;
s->block_index[5]++;
s->dest[0]+= 2*block_size;
- s->dest[1]+= block_size;
- s->dest[2]+= block_size;
+ s->dest[1]+= (2 >> s->chroma_x_shift) * block_size;
+ s->dest[2]+= (2 >> s->chroma_x_shift) * block_size;
}
static inline int get_bits_diff(MpegEncContext *s){
@@ -751,4 +760,13 @@ static inline int get_bits_diff(MpegEncContext *s){
return bits - last;
}
+static inline int mpeg_get_qscale(MpegEncContext *s)
+{
+ int qscale = get_bits(&s->gb, 5);
+ if (s->q_scale_type)
+ return ff_mpeg2_non_linear_qscale[qscale];
+ else
+ return qscale << 1;
+}
+
#endif /* AVCODEC_MPEGVIDEO_H */
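
ff_update_block_index() now scales the chroma destination step by (2 >> s->chroma_x_shift): for 4:2:0 and 4:2:2 content (chroma_x_shift == 1) the step stays at one block, while 4:4:4 (chroma_x_shift == 0) advances by two blocks, matching the two luma blocks covered, and block_size itself doubles for content above 8 bits per raw sample. A small worked example of the arithmetic for the 8-bit case (values only, not the ffvpx code):

#include <stdio.h>

/* Worked example of the dest[] increments in ff_update_block_index for
 * 8-bit content (bytes_per_pixel = 1, lowres = 0, block_size = 8). */
int main(void)
{
    const int block_size = 8;
    /* chroma_x_shift: 1 for 4:2:0 and 4:2:2, 0 for 4:4:4 */
    for (int chroma_x_shift = 1; chroma_x_shift >= 0; chroma_x_shift--) {
        int luma_step   = 2 * block_size;
        int chroma_step = (2 >> chroma_x_shift) * block_size;
        printf("chroma_x_shift=%d: luma +%d bytes, chroma +%d bytes\n",
               chroma_x_shift, luma_step, chroma_step);
    }
    return 0;
}
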
diff --git a/media/ffvpx/libavcodec/mpegvideodata.h b/media/ffvpx/libavcodec/mpegvideodata.h
new file mode 100644
index 000000000..14f4806d6
--- /dev/null
+++ b/media/ffvpx/libavcodec/mpegvideodata.h
@@ -0,0 +1,35 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_MPEGVIDEODATA_H
+#define AVCODEC_MPEGVIDEODATA_H
+
+#include <stdint.h>
+
+/* encoding scans */
+extern const uint8_t ff_alternate_horizontal_scan[64];
+extern const uint8_t ff_alternate_vertical_scan[64];
+
+extern const uint8_t ff_mpeg1_dc_scale_table[128];
+extern const uint8_t * const ff_mpeg2_dc_scale_table[4];
+
+extern const uint8_t ff_mpeg2_non_linear_qscale[32];
+
+extern const uint8_t ff_default_chroma_qscale_table[32];
+
+#endif /* AVCODEC_MPEGVIDEODATA_H */
diff --git a/media/ffvpx/libavcodec/null_bsf.c b/media/ffvpx/libavcodec/null_bsf.c
index feb71248a..24d26dfb1 100644
--- a/media/ffvpx/libavcodec/null_bsf.c
+++ b/media/ffvpx/libavcodec/null_bsf.c
@@ -24,17 +24,9 @@
#include "avcodec.h"
#include "bsf.h"
-static int null_filter(AVBSFContext *ctx, AVPacket *out)
+static int null_filter(AVBSFContext *ctx, AVPacket *pkt)
{
- AVPacket *in;
- int ret;
-
- ret = ff_bsf_get_packet(ctx, &in);
- if (ret < 0)
- return ret;
- av_packet_move_ref(out, in);
- av_packet_free(&in);
- return 0;
+ return ff_bsf_get_packet_ref(ctx, pkt);
}
const AVBitStreamFilter ff_null_bsf = {
diff --git a/media/ffvpx/libavcodec/options.c b/media/ffvpx/libavcodec/options.c
index 82e12179a..41b60521c 100644
--- a/media/ffvpx/libavcodec/options.c
+++ b/media/ffvpx/libavcodec/options.c
@@ -30,7 +30,6 @@
#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/opt.h"
-#include <float.h> /* FLT_MIN, FLT_MAX */
#include <string.h>
FF_DISABLE_DEPRECATION_WARNINGS
diff --git a/media/ffvpx/libavcodec/options_table.h b/media/ffvpx/libavcodec/options_table.h
index 2ac37c3ff..099261e16 100644
--- a/media/ffvpx/libavcodec/options_table.h
+++ b/media/ffvpx/libavcodec/options_table.h
@@ -54,26 +54,11 @@ static const AVOption avcodec_options[] = {
{"qpel", "use 1/4-pel motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_QPEL }, INT_MIN, INT_MAX, V|E, "flags"},
{"loop", "use loop filter", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_LOOP_FILTER }, INT_MIN, INT_MAX, V|E, "flags"},
{"qscale", "use fixed qscale", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_QSCALE }, INT_MIN, INT_MAX, 0, "flags"},
-#if FF_API_GMC
-{"gmc", "use gmc", 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_GMC }, INT_MIN, INT_MAX, V|E, "flags"},
-#endif
-#if FF_API_MV0
-{"mv0", "always try a mb with mv=<0,0>", 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_MV0 }, INT_MIN, INT_MAX, V|E, "flags"},
-#endif
-#if FF_API_INPUT_PRESERVED
-{"input_preserved", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_INPUT_PRESERVED }, INT_MIN, INT_MAX, 0, "flags"},
-#endif
{"pass1", "use internal 2-pass ratecontrol in first pass mode", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PASS1 }, INT_MIN, INT_MAX, 0, "flags"},
{"pass2", "use internal 2-pass ratecontrol in second pass mode", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PASS2 }, INT_MIN, INT_MAX, 0, "flags"},
{"gray", "only decode/encode grayscale", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_GRAY }, INT_MIN, INT_MAX, V|E|D, "flags"},
-#if FF_API_EMU_EDGE
-{"emu_edge", "do not draw edges", 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_EMU_EDGE }, INT_MIN, INT_MAX, 0, "flags"},
-#endif
{"psnr", "error[?] variables will be set during encoding", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_PSNR }, INT_MIN, INT_MAX, V|E, "flags"},
{"truncated", "Input bitstream might be randomly truncated", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_TRUNCATED }, INT_MIN, INT_MAX, V|D, "flags"},
-#if FF_API_NORMALIZE_AQP
-{"naq", "normalize adaptive quantization", 0, AV_OPT_TYPE_CONST, {.i64 = CODEC_FLAG_NORMALIZE_AQP }, INT_MIN, INT_MAX, V|E, "flags"},
-#endif
{"ildct", "use interlaced DCT", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_INTERLACED_DCT }, INT_MIN, INT_MAX, V|E, "flags"},
{"low_delay", "force low delay", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_LOW_DELAY }, INT_MIN, INT_MAX, V|D|E, "flags"},
{"global_header", "place global headers in extradata instead of every keyframe", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG_GLOBAL_HEADER }, INT_MIN, INT_MAX, V|A|E, "flags"},
@@ -91,21 +76,6 @@ static const AVOption avcodec_options[] = {
{"export_mvs", "export motion vectors through frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_EXPORT_MVS}, INT_MIN, INT_MAX, V|D, "flags2"},
{"skip_manual", "do not skip samples and export skip information as frame side data", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_SKIP_MANUAL}, INT_MIN, INT_MAX, V|D, "flags2"},
{"ass_ro_flush_noop", "do not reset ASS ReadOrder field on flush", 0, AV_OPT_TYPE_CONST, {.i64 = AV_CODEC_FLAG2_RO_FLUSH_NOOP}, INT_MIN, INT_MAX, S|D, "flags2"},
-#if FF_API_MOTION_EST
-{"me_method", "set motion estimation method", OFFSET(me_method), AV_OPT_TYPE_INT, {.i64 = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method"},
-{"zero", "zero motion estimation (fastest)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_ZERO }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"full", "full motion estimation (slowest)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_FULL }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"epzs", "EPZS motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"esa", "esa motion estimation (alias for full)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_FULL }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"tesa", "tesa motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_TESA }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"dia", "diamond motion estimation (alias for EPZS)", 0, AV_OPT_TYPE_CONST, {.i64 = ME_EPZS }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"log", "log motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_LOG }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"phods", "phods motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_PHODS }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"x1", "X1 motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_X1 }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"hex", "hex motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_HEX }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"umh", "umh motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_UMH }, INT_MIN, INT_MAX, V|E, "me_method" },
-{"iter", "iter motion estimation", 0, AV_OPT_TYPE_CONST, {.i64 = ME_ITER }, INT_MIN, INT_MAX, V|E, "me_method" },
-#endif
{"time_base", NULL, OFFSET(time_base), AV_OPT_TYPE_RATIONAL, {.dbl = 0}, 0, INT_MAX},
{"g", "set the group of picture (GOP) size", OFFSET(gop_size), AV_OPT_TYPE_INT, {.i64 = 12 }, INT_MIN, INT_MAX, V|E},
{"ar", "set audio sampling rate (in Hz)", OFFSET(sample_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, A|D|E},
@@ -123,9 +93,6 @@ static const AVOption avcodec_options[] = {
{"qdiff", "maximum difference between the quantizer scales (VBR)", OFFSET(max_qdiff), AV_OPT_TYPE_INT, {.i64 = 3 }, INT_MIN, INT_MAX, V|E},
{"bf", "set maximum number of B-frames between non-B-frames", OFFSET(max_b_frames), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, -1, INT_MAX, V|E},
{"b_qfactor", "QP factor between P- and B-frames", OFFSET(b_quant_factor), AV_OPT_TYPE_FLOAT, {.dbl = 1.25 }, -FLT_MAX, FLT_MAX, V|E},
-#if FF_API_RC_STRATEGY
-{"rc_strategy", "ratecontrol method", OFFSET(rc_strategy), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
-#endif
#if FF_API_PRIVATE_OPT
{"b_strategy", "strategy to choose between I/P/B-frames", OFFSET(b_frame_strategy), AV_OPT_TYPE_INT, {.i64 = 0 }, INT_MIN, INT_MAX, V|E},
{"ps", "RTP payload size in bytes", OFFSET(rtp_payload_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
@@ -144,16 +111,10 @@ static const AVOption avcodec_options[] = {
{"codec_tag", NULL, OFFSET(codec_tag), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
{"bug", "work around not autodetected encoder bugs", OFFSET(workaround_bugs), AV_OPT_TYPE_FLAGS, {.i64 = FF_BUG_AUTODETECT }, INT_MIN, INT_MAX, V|D, "bug"},
{"autodetect", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_AUTODETECT }, INT_MIN, INT_MAX, V|D, "bug"},
-#if FF_API_OLD_MSMPEG4
-{"old_msmpeg4", "some old lavc-generated MSMPEG4v3 files (no autodetection)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_OLD_MSMPEG4 }, INT_MIN, INT_MAX, V|D, "bug"},
-#endif
{"xvid_ilace", "Xvid interlacing bug (autodetected if FOURCC == XVIX)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_XVID_ILACE }, INT_MIN, INT_MAX, V|D, "bug"},
{"ump4", "(autodetected if FOURCC == UMP4)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_UMP4 }, INT_MIN, INT_MAX, V|D, "bug"},
{"no_padding", "padding bug (autodetected)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_NO_PADDING }, INT_MIN, INT_MAX, V|D, "bug"},
{"amv", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_AMV }, INT_MIN, INT_MAX, V|D, "bug"},
-#if FF_API_AC_VLC
-{"ac_vlc", "illegal VLC bug (autodetected per FOURCC)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_AC_VLC }, INT_MIN, INT_MAX, V|D, "bug"},
-#endif
{"qpel_chroma", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_QPEL_CHROMA }, INT_MIN, INT_MAX, V|D, "bug"},
{"std_qpel", "old standard qpel (autodetected per FOURCC/version)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_STD_QPEL }, INT_MIN, INT_MAX, V|D, "bug"},
{"qpel_chroma2", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_BUG_QPEL_CHROMA2 }, INT_MIN, INT_MAX, V|D, "bug"},
@@ -185,27 +146,13 @@ static const AVOption avcodec_options[] = {
#if FF_API_PRIVATE_OPT
{"mpeg_quant", "use MPEG quantizers instead of H.263", OFFSET(mpeg_quant), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
#endif
-#if FF_API_MPV_OPT
-{"qsquish", "deprecated, use encoder private options instead", OFFSET(rc_qsquish), AV_OPT_TYPE_FLOAT, {.dbl = DEFAULT }, 0, 99, V|E},
-{"rc_qmod_amp", "deprecated, use encoder private options instead", OFFSET(rc_qmod_amp), AV_OPT_TYPE_FLOAT, {.dbl = DEFAULT }, -FLT_MAX, FLT_MAX, V|E},
-{"rc_qmod_freq", "deprecated, use encoder private options instead", OFFSET(rc_qmod_freq), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
-#endif
{"rc_override_count", NULL, OFFSET(rc_override_count), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-#if FF_API_MPV_OPT
-{"rc_eq", "deprecated, use encoder private options instead", OFFSET(rc_eq), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, V|E},
-#endif
{"maxrate", "maximum bitrate (in bits/s). Used for VBV together with bufsize.", OFFSET(rc_max_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, 0, INT_MAX, V|A|E},
{"minrate", "minimum bitrate (in bits/s). Most useful in setting up a CBR encode. It is of little use otherwise.",
OFFSET(rc_min_rate), AV_OPT_TYPE_INT64, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
{"bufsize", "set ratecontrol buffer size (in bits)", OFFSET(rc_buffer_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, A|V|E},
-#if FF_API_MPV_OPT
-{"rc_buf_aggressivity", "deprecated, use encoder private options instead", OFFSET(rc_buffer_aggressivity), AV_OPT_TYPE_FLOAT, {.dbl = 1.0 }, -FLT_MAX, FLT_MAX, V|E},
-#endif
{"i_qfactor", "QP factor between P- and I-frames", OFFSET(i_quant_factor), AV_OPT_TYPE_FLOAT, {.dbl = -0.8 }, -FLT_MAX, FLT_MAX, V|E},
{"i_qoffset", "QP offset between P- and I-frames", OFFSET(i_quant_offset), AV_OPT_TYPE_FLOAT, {.dbl = 0.0 }, -FLT_MAX, FLT_MAX, V|E},
-#if FF_API_MPV_OPT
-{"rc_init_cplx", "deprecated, use encoder private options instead", OFFSET(rc_initial_cplx), AV_OPT_TYPE_FLOAT, {.dbl = DEFAULT }, -FLT_MAX, FLT_MAX, V|E},
-#endif
{"dct", "DCT algorithm", OFFSET(dct_algo), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, INT_MAX, V|E, "dct"},
{"auto", "autoselect a good one", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_AUTO }, INT_MIN, INT_MAX, V|E, "dct"},
{"fastint", "fast integer", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DCT_FASTINT }, INT_MIN, INT_MAX, V|E, "dct"},
@@ -225,19 +172,10 @@ static const AVOption avcodec_options[] = {
{"simplemmx", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"},
{"arm", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
{"altivec", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_ALTIVEC }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#if FF_API_ARCH_SH4
-{"sh4", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SH4 }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#endif
{"simplearm", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
{"simplearmv5te", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV5TE }, INT_MIN, INT_MAX, V|E|D, "idct"},
{"simplearmv6", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEARMV6 }, INT_MIN, INT_MAX, V|E|D, "idct"},
{"simpleneon", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLENEON }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#if FF_API_ARCH_ALPHA
-{"simplealpha", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_SIMPLEALPHA }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#endif
-#if FF_API_UNUSED_MEMBERS
-{"ipp", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_IPP }, INT_MIN, INT_MAX, V|E|D, "idct"},
-#endif /* FF_API_UNUSED_MEMBERS */
{"xvid", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
{"xvidmmx", "deprecated, for compatibility only", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_XVID }, INT_MIN, INT_MAX, V|E|D, "idct"},
{"faani", "floating point AAN IDCT", 0, AV_OPT_TYPE_CONST, {.i64 = FF_IDCT_FAAN }, INT_MIN, INT_MAX, V|D|E, "idct"},
@@ -269,9 +207,6 @@ static const AVOption avcodec_options[] = {
{"green_metadata", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_GREEN_MD }, INT_MIN, INT_MAX, V|D, "debug"},
{"skip", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_SKIP }, INT_MIN, INT_MAX, V|D, "debug"},
{"startcode", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_STARTCODE }, INT_MIN, INT_MAX, V|D, "debug"},
-#if FF_API_UNUSED_MEMBERS
-{"pts", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_PTS }, INT_MIN, INT_MAX, V|D, "debug"},
-#endif /* FF_API_UNUSED_MEMBERS */
{"er", "error recognition", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_ER }, INT_MIN, INT_MAX, V|D, "debug"},
{"mmco", "memory management control operations (H.264)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_MMCO }, INT_MIN, INT_MAX, V|D, "debug"},
{"bugs", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUGS }, INT_MIN, INT_MAX, V|D, "debug"},
@@ -282,12 +217,6 @@ static const AVOption avcodec_options[] = {
{"buffers", "picture buffer allocations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_BUFFERS }, INT_MIN, INT_MAX, V|D, "debug"},
{"thread_ops", "threading operations", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_THREADS }, INT_MIN, INT_MAX, V|A|D, "debug"},
{"nomc", "skip motion compensation", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_NOMC }, INT_MIN, INT_MAX, V|A|D, "debug"},
-#if FF_API_VISMV
-{"vismv", "visualize motion vectors (MVs) (deprecated)", OFFSET(debug_mv), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT }, 0, INT_MAX, V|D, "debug_mv"},
-{"pf", "forward predicted MVs of P-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_P_FOR }, INT_MIN, INT_MAX, V|D, "debug_mv"},
-{"bf", "forward predicted MVs of B-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_B_FOR }, INT_MIN, INT_MAX, V|D, "debug_mv"},
-{"bb", "backward predicted MVs of B-frames", 0, AV_OPT_TYPE_CONST, {.i64 = FF_DEBUG_VIS_MV_B_BACK }, INT_MIN, INT_MAX, V|D, "debug_mv"},
-#endif
{"cmp", "full-pel ME compare function", OFFSET(me_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
{"subcmp", "sub-pel ME compare function", OFFSET(me_sub_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
{"mbcmp", "macroblock compare function", OFFSET(mb_cmp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "cmp_func"},
@@ -318,14 +247,7 @@ static const AVOption avcodec_options[] = {
{"msad", "sum of absolute differences, median predicted", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CMP_MEDIAN_SAD }, INT_MIN, INT_MAX, V|E, "cmp_func"},
{"pre_dia_size", "diamond type & size for motion estimation pre-pass", OFFSET(pre_dia_size), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
{"subq", "sub-pel motion estimation quality", OFFSET(me_subpel_quality), AV_OPT_TYPE_INT, {.i64 = 8 }, INT_MIN, INT_MAX, V|E},
-#if FF_API_AFD
-{"dtg_active_format", NULL, OFFSET(dtg_active_format), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-#endif
{"me_range", "limit motion vectors range (1023 for DivX player)", OFFSET(me_range), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
-#if FF_API_QUANT_BIAS
-{"ibias", "intra quant bias", OFFSET(intra_quant_bias), AV_OPT_TYPE_INT, {.i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, V|E},
-{"pbias", "inter quant bias", OFFSET(inter_quant_bias), AV_OPT_TYPE_INT, {.i64 = FF_DEFAULT_QUANT_BIAS }, INT_MIN, INT_MAX, V|E},
-#endif
{"global_quality", NULL, OFFSET(global_quality), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
#if FF_API_CODER_TYPE
{"coder", NULL, OFFSET(coder_type), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E, "coder"},
@@ -333,45 +255,25 @@ static const AVOption avcodec_options[] = {
{"ac", "arithmetic coder", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CODER_TYPE_AC }, INT_MIN, INT_MAX, V|E, "coder"},
{"raw", "raw (no encoding)", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CODER_TYPE_RAW }, INT_MIN, INT_MAX, V|E, "coder"},
{"rle", "run-length coder", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CODER_TYPE_RLE }, INT_MIN, INT_MAX, V|E, "coder"},
-#if FF_API_UNUSED_MEMBERS
-{"deflate", "deflate-based coder", 0, AV_OPT_TYPE_CONST, {.i64 = FF_CODER_TYPE_DEFLATE }, INT_MIN, INT_MAX, V|E, "coder"},
-#endif /* FF_API_UNUSED_MEMBERS */
#endif /* FF_API_CODER_TYPE */
#if FF_API_PRIVATE_OPT
{"context", "context model", OFFSET(context_model), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
#endif
{"slice_flags", NULL, OFFSET(slice_flags), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-#if FF_API_XVMC
-{"xvmc_acceleration", NULL, OFFSET(xvmc_acceleration), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-#endif /* FF_API_XVMC */
{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, 0, 2, V|E, "mbd"},
{"simple", "use mbcmp", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_SIMPLE }, INT_MIN, INT_MAX, V|E, "mbd"},
{"bits", "use fewest bits", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_BITS }, INT_MIN, INT_MAX, V|E, "mbd"},
{"rd", "use best rate distortion", 0, AV_OPT_TYPE_CONST, {.i64 = FF_MB_DECISION_RD }, INT_MIN, INT_MAX, V|E, "mbd"},
-#if FF_API_STREAM_CODEC_TAG
-{"stream_codec_tag", NULL, OFFSET(stream_codec_tag), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX},
-#endif
#if FF_API_PRIVATE_OPT
{"sc_threshold", "scene change threshold", OFFSET(scenechange_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
#endif
-#if FF_API_MPV_OPT
-{"lmin", "deprecated, use encoder private options instead", OFFSET(lmin), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E},
-{"lmax", "deprecated, use encoder private options instead", OFFSET(lmax), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|E},
-#endif
#if FF_API_PRIVATE_OPT
{"nr", "noise reduction", OFFSET(noise_reduction), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
#endif
{"rc_init_occupancy", "number of bits which should be loaded into the rc buffer before decoding starts", OFFSET(rc_initial_buffer_occupancy), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
{"flags2", NULL, OFFSET(flags2), AV_OPT_TYPE_FLAGS, {.i64 = DEFAULT}, 0, UINT_MAX, V|A|E|D, "flags2"},
-#if FF_API_ERROR_RATE
-{"error", NULL, OFFSET(error_rate), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
-#endif
{"threads", "set the number of threads", OFFSET(thread_count), AV_OPT_TYPE_INT, {.i64 = 1 }, 0, INT_MAX, V|A|E|D, "threads"},
{"auto", "autodetect a suitable number of threads to use", 0, AV_OPT_TYPE_CONST, {.i64 = 0 }, INT_MIN, INT_MAX, V|E|D, "threads"},
-#if FF_API_MPV_OPT
-{"me_threshold", "motion estimation threshold", OFFSET(me_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
-{"mb_threshold", "macroblock threshold", OFFSET(mb_threshold), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
-#endif
{"dc", "intra_dc_precision", OFFSET(intra_dc_precision), AV_OPT_TYPE_INT, {.i64 = 0 }, -8, 16, V|E},
{"nssew", "nsse weight", OFFSET(nsse_weight), AV_OPT_TYPE_INT, {.i64 = 8 }, INT_MIN, INT_MAX, V|E},
{"skip_top", "number of macroblock rows at the top which are skipped", OFFSET(skip_top), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|D},
@@ -398,6 +300,7 @@ static const AVOption avcodec_options[] = {
{"mpeg4_main", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_MAIN }, INT_MIN, INT_MAX, V|E, "profile"},
{"mpeg4_asp", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_MPEG4_ADVANCED_SIMPLE }, INT_MIN, INT_MAX, V|E, "profile"},
{"main10", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_HEVC_MAIN_10 }, INT_MIN, INT_MAX, V|E, "profile"},
+{"msbc", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_PROFILE_SBC_MSBC }, INT_MIN, INT_MAX, A|E, "profile"},
{"level", NULL, OFFSET(level), AV_OPT_TYPE_INT, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "level"},
{"unknown", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_LEVEL_UNKNOWN }, INT_MIN, INT_MAX, V|A|E, "level"},
{"lowres", "decode at 1= 1/2, 2=1/4, 3=1/8 resolutions", OFFSET(lowres), AV_OPT_TYPE_INT, {.i64 = 0 }, 0, INT_MAX, V|A|D},
@@ -407,9 +310,6 @@ static const AVOption avcodec_options[] = {
{"skip_exp", "frame skip exponent", OFFSET(frame_skip_exp), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
{"skipcmp", "frame skip compare function", OFFSET(frame_skip_cmp), AV_OPT_TYPE_INT, {.i64 = FF_CMP_DCTMAX }, INT_MIN, INT_MAX, V|E, "cmp_func"},
#endif
-#if FF_API_MPV_OPT
-{"border_mask", "deprecated, use encoder private options instead", OFFSET(border_masking), AV_OPT_TYPE_FLOAT, {.dbl = DEFAULT }, -FLT_MAX, FLT_MAX, V|E},
-#endif
{"mblmin", "minimum macroblock Lagrange factor (VBR)", OFFSET(mb_lmin), AV_OPT_TYPE_INT, {.i64 = FF_QP2LAMBDA * 2 }, 1, FF_LAMBDA_MAX, V|E},
{"mblmax", "maximum macroblock Lagrange factor (VBR)", OFFSET(mb_lmax), AV_OPT_TYPE_INT, {.i64 = FF_QP2LAMBDA * 31 }, 1, FF_LAMBDA_MAX, V|E},
#if FF_API_PRIVATE_OPT
@@ -435,9 +335,6 @@ static const AVOption avcodec_options[] = {
{"chromaoffset", "chroma QP offset from luma", OFFSET(chromaoffset), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|E},
#endif
{"trellis", "rate-distortion optimal quantization", OFFSET(trellis), AV_OPT_TYPE_INT, {.i64 = DEFAULT }, INT_MIN, INT_MAX, V|A|E},
-#if FF_API_UNUSED_MEMBERS
-{"sc_factor", "multiplied by qscale for each frame and added to scene_change_score", OFFSET(scenechange_factor), AV_OPT_TYPE_INT, {.i64 = 6 }, 0, INT_MAX, V|E},
-#endif /* FF_API_UNUSED_MEMBERS */
{"mv0_threshold", NULL, OFFSET(mv0_threshold), AV_OPT_TYPE_INT, {.i64 = 256 }, 0, INT_MAX, V|E},
#if FF_API_PRIVATE_OPT
{"b_sensitivity", "adjust sensitivity of b_frame_strategy 1", OFFSET(b_sensitivity), AV_OPT_TYPE_INT, {.i64 = 40 }, 1, INT_MAX, V|E},
@@ -550,6 +447,7 @@ static const AVOption avcodec_options[] = {
{"do_nothing", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_DO_NOTHING}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
{"auto", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_AUTOMATIC}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
{"pre_decoder", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_PRE_DECODER}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
+{"ignore", NULL, 0, AV_OPT_TYPE_CONST, {.i64 = FF_SUB_CHARENC_MODE_IGNORE}, INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
#if FF_API_ASS_TIMING
{"sub_text_format", "set decoded text subtitle format", OFFSET(sub_text_format), AV_OPT_TYPE_INT, {.i64 = FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS}, 0, 1, S|D, "sub_text_format"},
#else
@@ -580,6 +478,7 @@ static const AVOption avcodec_options[] = {
{"ignore_level", "ignore level even if the codec level used is unknown or higher than the maximum supported level reported by the hardware driver", 0, AV_OPT_TYPE_CONST, { .i64 = AV_HWACCEL_FLAG_IGNORE_LEVEL }, INT_MIN, INT_MAX, V | D, "hwaccel_flags" },
{"allow_high_depth", "allow to output YUV pixel formats with a different chroma sampling than 4:2:0 and/or other than 8 bits per component", 0, AV_OPT_TYPE_CONST, {.i64 = AV_HWACCEL_FLAG_ALLOW_HIGH_DEPTH }, INT_MIN, INT_MAX, V | D, "hwaccel_flags"},
{"allow_profile_mismatch", "attempt to decode anyway if HW accelerated decoder's supported profiles do not exactly match the stream", 0, AV_OPT_TYPE_CONST, {.i64 = AV_HWACCEL_FLAG_ALLOW_PROFILE_MISMATCH }, INT_MIN, INT_MAX, V | D, "hwaccel_flags"},
+{"extra_hw_frames", "Number of extra hardware frames to allocate for the user", OFFSET(extra_hw_frames), AV_OPT_TYPE_INT, { .i64 = -1 }, -1, INT_MAX, V|D },
{NULL},
};
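
Alongside removing long-deprecated AVOptions, the table gains an "extra_hw_frames" decoder option, the "msbc" SBC profile and an "ignore" sub_charenc_mode constant. Options declared in this table are reachable through the generic AVOptions API, so, assuming the ffvpx build keeps the option and a hardware pipeline that honours it, requesting extra pool frames would look roughly like this (request_extra_hw_frames is a hypothetical helper):

#include <libavcodec/avcodec.h>
#include <libavutil/opt.h>

/* Hypothetical helper: ask the decoder to over-allocate its hardware
 * frame pool before avcodec_open2(). Returns 0 or a negative AVERROR. */
static int request_extra_hw_frames(AVCodecContext *avctx, int64_t n)
{
    return av_opt_set_int(avctx, "extra_hw_frames", n, 0);
}
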
diff --git a/media/ffvpx/libavcodec/parser.c b/media/ffvpx/libavcodec/parser.c
index 670680ea7..f43b197d5 100644
--- a/media/ffvpx/libavcodec/parser.c
+++ b/media/ffvpx/libavcodec/parser.c
@@ -25,40 +25,108 @@
#include <string.h>
#include "libavutil/avassert.h"
-#include "libavutil/atomic.h"
#include "libavutil/internal.h"
#include "libavutil/mem.h"
+#include "libavutil/thread.h"
#include "internal.h"
#include "parser.h"
-static AVCodecParser *av_first_parser = NULL;
+/* Parsers */
+extern AVCodecParser ff_aac_parser;
+extern AVCodecParser ff_aac_latm_parser;
+extern AVCodecParser ff_ac3_parser;
+extern AVCodecParser ff_adx_parser;
+extern AVCodecParser ff_bmp_parser;
+extern AVCodecParser ff_cavsvideo_parser;
+extern AVCodecParser ff_cook_parser;
+extern AVCodecParser ff_dca_parser;
+extern AVCodecParser ff_dirac_parser;
+extern AVCodecParser ff_dnxhd_parser;
+extern AVCodecParser ff_dpx_parser;
+extern AVCodecParser ff_dvaudio_parser;
+extern AVCodecParser ff_dvbsub_parser;
+extern AVCodecParser ff_dvdsub_parser;
+extern AVCodecParser ff_dvd_nav_parser;
+extern AVCodecParser ff_flac_parser;
+extern AVCodecParser ff_g729_parser;
+extern AVCodecParser ff_gsm_parser;
+extern AVCodecParser ff_h261_parser;
+extern AVCodecParser ff_h263_parser;
+extern AVCodecParser ff_h264_parser;
+extern AVCodecParser ff_hevc_parser;
+extern AVCodecParser ff_mjpeg_parser;
+extern AVCodecParser ff_mlp_parser;
+extern AVCodecParser ff_mpeg4video_parser;
+extern AVCodecParser ff_mpegaudio_parser;
+extern AVCodecParser ff_mpegvideo_parser;
+extern AVCodecParser ff_opus_parser;
+extern AVCodecParser ff_png_parser;
+extern AVCodecParser ff_pnm_parser;
+extern AVCodecParser ff_rv30_parser;
+extern AVCodecParser ff_rv40_parser;
+extern AVCodecParser ff_sbc_parser;
+extern AVCodecParser ff_sipr_parser;
+extern AVCodecParser ff_tak_parser;
+extern AVCodecParser ff_vc1_parser;
+extern AVCodecParser ff_vorbis_parser;
+extern AVCodecParser ff_vp3_parser;
+extern AVCodecParser ff_vp8_parser;
+extern AVCodecParser ff_vp9_parser;
+extern AVCodecParser ff_xma_parser;
+
+#include "libavcodec/parser_list.c"
+
+static AVOnce av_parser_next_init = AV_ONCE_INIT;
+
+static void av_parser_init_next(void)
+{
+ AVCodecParser *prev = NULL, *p;
+ int i = 0;
+ while ((p = (AVCodecParser*)parser_list[i++])) {
+ if (prev)
+ prev->next = p;
+ prev = p;
+ }
+}
AVCodecParser *av_parser_next(const AVCodecParser *p)
{
+ ff_thread_once(&av_parser_next_init, av_parser_init_next);
+
if (p)
return p->next;
else
- return av_first_parser;
+ return (AVCodecParser*)parser_list[0];
+}
+
+const AVCodecParser *av_parser_iterate(void **opaque)
+{
+ uintptr_t i = (uintptr_t)*opaque;
+ const AVCodecParser *p = parser_list[i];
+
+ if (p)
+ *opaque = (void*)(i + 1);
+
+ return p;
}
void av_register_codec_parser(AVCodecParser *parser)
{
- do {
- parser->next = av_first_parser;
- } while (parser->next != avpriv_atomic_ptr_cas((void * volatile *)&av_first_parser, parser->next, parser));
+ ff_thread_once(&av_parser_next_init, av_parser_init_next);
}
AVCodecParserContext *av_parser_init(int codec_id)
{
AVCodecParserContext *s = NULL;
- AVCodecParser *parser;
+ const AVCodecParser *parser;
+ void *i = 0;
int ret;
if (codec_id == AV_CODEC_ID_NONE)
return NULL;
- for (parser = av_first_parser; parser; parser = parser->next) {
+ while ((parser = av_parser_iterate(&i))) {
if (parser->codec_ids[0] == codec_id ||
parser->codec_ids[1] == codec_id ||
parser->codec_ids[2] == codec_id ||
@@ -72,7 +140,7 @@ found:
s = av_mallocz(sizeof(AVCodecParserContext));
if (!s)
goto err_out;
- s->parser = parser;
+ s->parser = (AVCodecParser*)parser;
s->priv_data = av_mallocz(parser->priv_data_size);
if (!s->priv_data)
goto err_out;
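
Parser registration is effectively retired here: the available parsers are a compile-time array (parser_list.c below, generated per build configuration), av_parser_iterate() walks it, and av_parser_next()/av_register_codec_parser() survive only as compatibility shims that lazily link the ->next pointers. Assuming av_parser_iterate() is exposed by the public header in this tree, enumerating the compiled-in parsers looks like this:

#include <libavcodec/avcodec.h>
#include <stdio.h>

int main(void)
{
    void *opaque = NULL;
    const AVCodecParser *p;
    int n = 0;

    /* Walks the static parser_list[] until the NULL sentinel. */
    while ((p = av_parser_iterate(&opaque))) {
        printf("parser #%d handles codec id %d\n", n, p->codec_ids[0]);
        n++;
    }
    printf("%d parsers compiled in\n", n);
    return 0;
}
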
diff --git a/media/ffvpx/libavcodec/parser_list.c b/media/ffvpx/libavcodec/parser_list.c
new file mode 100644
index 000000000..b60c60bce
--- /dev/null
+++ b/media/ffvpx/libavcodec/parser_list.c
@@ -0,0 +1,8 @@
+static const AVCodecParser * const parser_list[] = {
+#if CONFIG_VP8_PARSER
+ &ff_vp8_parser,
+#endif
+#if CONFIG_VP9_PARSER
+ &ff_vp9_parser,
+#endif
+ NULL };
diff --git a/media/ffvpx/libavcodec/profiles.c b/media/ffvpx/libavcodec/profiles.c
index 30498efed..d7dc960f3 100644
--- a/media/ffvpx/libavcodec/profiles.c
+++ b/media/ffvpx/libavcodec/profiles.c
@@ -140,4 +140,16 @@ const AVProfile ff_vp9_profiles[] = {
{ FF_PROFILE_UNKNOWN },
};
+const AVProfile ff_av1_profiles[] = {
+ { FF_PROFILE_AV1_MAIN, "Main" },
+ { FF_PROFILE_AV1_HIGH, "High" },
+ { FF_PROFILE_AV1_PROFESSIONAL, "Professional" },
+ { FF_PROFILE_UNKNOWN },
+};
+
+const AVProfile ff_sbc_profiles[] = {
+ { FF_PROFILE_SBC_MSBC, "mSBC" },
+ { FF_PROFILE_UNKNOWN },
+};
+
#endif /* !CONFIG_SMALL */
diff --git a/media/ffvpx/libavcodec/profiles.h b/media/ffvpx/libavcodec/profiles.h
index eb18b406a..9d7e211e1 100644
--- a/media/ffvpx/libavcodec/profiles.h
+++ b/media/ffvpx/libavcodec/profiles.h
@@ -31,5 +31,7 @@ extern const AVProfile ff_mpeg2_video_profiles[];
extern const AVProfile ff_mpeg4_video_profiles[];
extern const AVProfile ff_vc1_profiles[];
extern const AVProfile ff_vp9_profiles[];
+extern const AVProfile ff_av1_profiles[];
+extern const AVProfile ff_sbc_profiles[];
#endif /* AVCODEC_PROFILES_H */
diff --git a/media/ffvpx/libavcodec/pthread_frame.c b/media/ffvpx/libavcodec/pthread_frame.c
index 2c702c737..5104b1beb 100644
--- a/media/ffvpx/libavcodec/pthread_frame.c
+++ b/media/ffvpx/libavcodec/pthread_frame.c
@@ -246,7 +246,7 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src,
{
int err = 0;
- if (dst != src && (for_user || !(av_codec_get_codec_descriptor(src)->props & AV_CODEC_PROP_INTRA_ONLY))) {
+ if (dst != src && (for_user || !(src->codec_descriptor->props & AV_CODEC_PROP_INTRA_ONLY))) {
dst->time_base = src->time_base;
dst->framerate = src->framerate;
dst->width = src->width;
@@ -262,11 +262,6 @@ static int update_context_from_thread(AVCodecContext *dst, AVCodecContext *src,
dst->bits_per_coded_sample = src->bits_per_coded_sample;
dst->sample_aspect_ratio = src->sample_aspect_ratio;
-#if FF_API_AFD
-FF_DISABLE_DEPRECATION_WARNINGS
- dst->dtg_active_format = src->dtg_active_format;
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_AFD */
dst->profile = src->profile;
dst->level = src->level;
@@ -733,10 +728,6 @@ int ff_frame_thread_init(AVCodecContext *avctx)
FrameThreadContext *fctx;
int i, err = 0;
-#if HAVE_W32THREADS
- w32thread_init();
-#endif
-
if (!thread_count) {
int nb_cpus = av_cpu_count();
#if FF_API_DEBUG_MV
@@ -895,8 +886,6 @@ static int thread_get_buffer_internal(AVCodecContext *avctx, ThreadFrame *f, int
f->owner[0] = f->owner[1] = avctx;
- ff_init_buffer_info(avctx, f->f);
-
if (!(avctx->active_thread_type & FF_THREAD_FRAME))
return ff_get_buffer(avctx, f->f, flags);
diff --git a/media/ffvpx/libavcodec/pthread_slice.c b/media/ffvpx/libavcodec/pthread_slice.c
index d659f9b0b..77cfe3c9f 100644
--- a/media/ffvpx/libavcodec/pthread_slice.c
+++ b/media/ffvpx/libavcodec/pthread_slice.c
@@ -132,10 +132,6 @@ int ff_slice_thread_init(AVCodecContext *avctx)
int thread_count = avctx->thread_count;
static void (*mainfunc)(void *);
-#if HAVE_W32THREADS
- w32thread_init();
-#endif
-
// We cannot do this in the encoder init as the threads are created before
if (av_codec_is_encoder(avctx->codec) &&
avctx->codec_id == AV_CODEC_ID_MPEG1VIDEO &&
diff --git a/media/ffvpx/libavcodec/raw.h b/media/ffvpx/libavcodec/raw.h
index 24bf4cc55..28a27b1f9 100644
--- a/media/ffvpx/libavcodec/raw.h
+++ b/media/ffvpx/libavcodec/raw.h
@@ -28,6 +28,7 @@
#define AVCODEC_RAW_H
#include "avcodec.h"
+#include "internal.h"
#include "libavutil/internal.h"
typedef struct PixelFormatTag {
@@ -41,7 +42,7 @@ const struct PixelFormatTag *avpriv_get_raw_pix_fmt_tags(void);
enum AVPixelFormat avpriv_find_pix_fmt(const PixelFormatTag *tags, unsigned int fourcc);
-extern av_export const PixelFormatTag avpriv_pix_fmt_bps_avi[];
-extern av_export const PixelFormatTag avpriv_pix_fmt_bps_mov[];
+extern av_export_avcodec const PixelFormatTag avpriv_pix_fmt_bps_avi[];
+extern av_export_avcodec const PixelFormatTag avpriv_pix_fmt_bps_mov[];
#endif /* AVCODEC_RAW_H */
diff --git a/media/ffvpx/libavcodec/resample.c b/media/ffvpx/libavcodec/resample.c
deleted file mode 100644
index 4c5eb9f10..000000000
--- a/media/ffvpx/libavcodec/resample.c
+++ /dev/null
@@ -1,439 +0,0 @@
-/*
- * samplerate conversion for both audio and video
- * Copyright (c) 2000 Fabrice Bellard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * samplerate conversion for both audio and video
- */
-
-#include <string.h>
-
-#include "avcodec.h"
-#include "audioconvert.h"
-#include "libavutil/opt.h"
-#include "libavutil/mem.h"
-#include "libavutil/samplefmt.h"
-
-#if FF_API_AVCODEC_RESAMPLE
-FF_DISABLE_DEPRECATION_WARNINGS
-
-#define MAX_CHANNELS 8
-
-struct AVResampleContext;
-
-static const char *context_to_name(void *ptr)
-{
- return "audioresample";
-}
-
-static const AVOption options[] = {{NULL}};
-static const AVClass audioresample_context_class = {
- "ReSampleContext", context_to_name, options, LIBAVUTIL_VERSION_INT
-};
-
-struct ReSampleContext {
- struct AVResampleContext *resample_context;
- short *temp[MAX_CHANNELS];
- int temp_len;
- float ratio;
- /* channel convert */
- int input_channels, output_channels, filter_channels;
- AVAudioConvert *convert_ctx[2];
- enum AVSampleFormat sample_fmt[2]; ///< input and output sample format
- unsigned sample_size[2]; ///< size of one sample in sample_fmt
- short *buffer[2]; ///< buffers used for conversion to S16
- unsigned buffer_size[2]; ///< sizes of allocated buffers
-};
-
-/* n1: number of samples */
-static void stereo_to_mono(short *output, short *input, int n1)
-{
- short *p, *q;
- int n = n1;
-
- p = input;
- q = output;
- while (n >= 4) {
- q[0] = (p[0] + p[1]) >> 1;
- q[1] = (p[2] + p[3]) >> 1;
- q[2] = (p[4] + p[5]) >> 1;
- q[3] = (p[6] + p[7]) >> 1;
- q += 4;
- p += 8;
- n -= 4;
- }
- while (n > 0) {
- q[0] = (p[0] + p[1]) >> 1;
- q++;
- p += 2;
- n--;
- }
-}
-
-/* n1: number of samples */
-static void mono_to_stereo(short *output, short *input, int n1)
-{
- short *p, *q;
- int n = n1;
- int v;
-
- p = input;
- q = output;
- while (n >= 4) {
- v = p[0]; q[0] = v; q[1] = v;
- v = p[1]; q[2] = v; q[3] = v;
- v = p[2]; q[4] = v; q[5] = v;
- v = p[3]; q[6] = v; q[7] = v;
- q += 8;
- p += 4;
- n -= 4;
- }
- while (n > 0) {
- v = p[0]; q[0] = v; q[1] = v;
- q += 2;
- p += 1;
- n--;
- }
-}
-
-/*
-5.1 to stereo input: [fl, fr, c, lfe, rl, rr]
-- Left = front_left + rear_gain * rear_left + center_gain * center
-- Right = front_right + rear_gain * rear_right + center_gain * center
-Where rear_gain is usually around 0.5-1.0 and
- center_gain is almost always 0.7 (-3 dB)
-*/
-static void surround_to_stereo(short **output, short *input, int channels, int samples)
-{
- int i;
- short l, r;
-
- for (i = 0; i < samples; i++) {
- int fl,fr,c,rl,rr;
- fl = input[0];
- fr = input[1];
- c = input[2];
- // lfe = input[3];
- rl = input[4];
- rr = input[5];
-
- l = av_clip_int16(fl + (0.5 * rl) + (0.7 * c));
- r = av_clip_int16(fr + (0.5 * rr) + (0.7 * c));
-
- /* output l & r. */
- *output[0]++ = l;
- *output[1]++ = r;
-
- /* increment input. */
- input += channels;
- }
-}
-
-static void deinterleave(short **output, short *input, int channels, int samples)
-{
- int i, j;
-
- for (i = 0; i < samples; i++) {
- for (j = 0; j < channels; j++) {
- *output[j]++ = *input++;
- }
- }
-}
-
-static void interleave(short *output, short **input, int channels, int samples)
-{
- int i, j;
-
- for (i = 0; i < samples; i++) {
- for (j = 0; j < channels; j++) {
- *output++ = *input[j]++;
- }
- }
-}
-
-static void ac3_5p1_mux(short *output, short *input1, short *input2, int n)
-{
- int i;
- short l, r;
-
- for (i = 0; i < n; i++) {
- l = *input1++;
- r = *input2++;
- *output++ = l; /* left */
- *output++ = (l / 2) + (r / 2); /* center */
- *output++ = r; /* right */
- *output++ = 0; /* left surround */
- *output++ = 0; /* right surroud */
- *output++ = 0; /* low freq */
- }
-}
-
-#define SUPPORT_RESAMPLE(ch1, ch2, ch3, ch4, ch5, ch6, ch7, ch8) \
- ch8<<7 | ch7<<6 | ch6<<5 | ch5<<4 | ch4<<3 | ch3<<2 | ch2<<1 | ch1<<0
-
-static const uint8_t supported_resampling[MAX_CHANNELS] = {
- // output ch: 1 2 3 4 5 6 7 8
- SUPPORT_RESAMPLE(1, 1, 0, 0, 0, 0, 0, 0), // 1 input channel
- SUPPORT_RESAMPLE(1, 1, 0, 0, 0, 1, 0, 0), // 2 input channels
- SUPPORT_RESAMPLE(0, 0, 1, 0, 0, 0, 0, 0), // 3 input channels
- SUPPORT_RESAMPLE(0, 0, 0, 1, 0, 0, 0, 0), // 4 input channels
- SUPPORT_RESAMPLE(0, 0, 0, 0, 1, 0, 0, 0), // 5 input channels
- SUPPORT_RESAMPLE(0, 1, 0, 0, 0, 1, 0, 0), // 6 input channels
- SUPPORT_RESAMPLE(0, 0, 0, 0, 0, 0, 1, 0), // 7 input channels
- SUPPORT_RESAMPLE(0, 0, 0, 0, 0, 0, 0, 1), // 8 input channels
-};
-
-ReSampleContext *av_audio_resample_init(int output_channels, int input_channels,
- int output_rate, int input_rate,
- enum AVSampleFormat sample_fmt_out,
- enum AVSampleFormat sample_fmt_in,
- int filter_length, int log2_phase_count,
- int linear, double cutoff)
-{
- ReSampleContext *s;
-
- if (input_channels > MAX_CHANNELS) {
- av_log(NULL, AV_LOG_ERROR,
- "Resampling with input channels greater than %d is unsupported.\n",
- MAX_CHANNELS);
- return NULL;
- }
- if (!(supported_resampling[input_channels-1] & (1<<(output_channels-1)))) {
- int i;
- av_log(NULL, AV_LOG_ERROR, "Unsupported audio resampling. Allowed "
- "output channels for %d input channel%s", input_channels,
- input_channels > 1 ? "s:" : ":");
- for (i = 0; i < MAX_CHANNELS; i++)
- if (supported_resampling[input_channels-1] & (1<<i))
- av_log(NULL, AV_LOG_ERROR, " %d", i + 1);
- av_log(NULL, AV_LOG_ERROR, "\n");
- return NULL;
- }
-
- s = av_mallocz(sizeof(ReSampleContext));
- if (!s) {
- av_log(NULL, AV_LOG_ERROR, "Can't allocate memory for resample context.\n");
- return NULL;
- }
-
- s->ratio = (float)output_rate / (float)input_rate;
-
- s->input_channels = input_channels;
- s->output_channels = output_channels;
-
- s->filter_channels = s->input_channels;
- if (s->output_channels < s->filter_channels)
- s->filter_channels = s->output_channels;
-
- s->sample_fmt[0] = sample_fmt_in;
- s->sample_fmt[1] = sample_fmt_out;
- s->sample_size[0] = av_get_bytes_per_sample(s->sample_fmt[0]);
- s->sample_size[1] = av_get_bytes_per_sample(s->sample_fmt[1]);
-
- if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) {
- if (!(s->convert_ctx[0] = av_audio_convert_alloc(AV_SAMPLE_FMT_S16, 1,
- s->sample_fmt[0], 1, NULL, 0))) {
- av_log(s, AV_LOG_ERROR,
- "Cannot convert %s sample format to s16 sample format\n",
- av_get_sample_fmt_name(s->sample_fmt[0]));
- av_free(s);
- return NULL;
- }
- }
-
- if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
- if (!(s->convert_ctx[1] = av_audio_convert_alloc(s->sample_fmt[1], 1,
- AV_SAMPLE_FMT_S16, 1, NULL, 0))) {
- av_log(s, AV_LOG_ERROR,
- "Cannot convert s16 sample format to %s sample format\n",
- av_get_sample_fmt_name(s->sample_fmt[1]));
- av_audio_convert_free(s->convert_ctx[0]);
- av_free(s);
- return NULL;
- }
- }
-
- s->resample_context = av_resample_init(output_rate, input_rate,
- filter_length, log2_phase_count,
- linear, cutoff);
-
- *(const AVClass**)s->resample_context = &audioresample_context_class;
-
- return s;
-}
-
-/* resample audio. 'nb_samples' is the number of input samples */
-/* XXX: optimize it ! */
-int audio_resample(ReSampleContext *s, short *output, short *input, int nb_samples)
-{
- int i, nb_samples1;
- short *bufin[MAX_CHANNELS];
- short *bufout[MAX_CHANNELS];
- short *buftmp2[MAX_CHANNELS], *buftmp3[MAX_CHANNELS];
- short *output_bak = NULL;
- int lenout;
-
- if (s->sample_fmt[0] != AV_SAMPLE_FMT_S16) {
- int istride[1] = { s->sample_size[0] };
- int ostride[1] = { 2 };
- const void *ibuf[1] = { input };
- void *obuf[1];
- unsigned input_size = nb_samples * s->input_channels * 2;
-
- if (!s->buffer_size[0] || s->buffer_size[0] < input_size) {
- av_free(s->buffer[0]);
- s->buffer_size[0] = input_size;
- s->buffer[0] = av_malloc(s->buffer_size[0]);
- if (!s->buffer[0]) {
- av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
- return 0;
- }
- }
-
- obuf[0] = s->buffer[0];
-
- if (av_audio_convert(s->convert_ctx[0], obuf, ostride,
- ibuf, istride, nb_samples * s->input_channels) < 0) {
- av_log(s->resample_context, AV_LOG_ERROR,
- "Audio sample format conversion failed\n");
- return 0;
- }
-
- input = s->buffer[0];
- }
-
- lenout= 2*s->output_channels*nb_samples * s->ratio + 16;
-
- if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
- int out_size = lenout * av_get_bytes_per_sample(s->sample_fmt[1]) *
- s->output_channels;
- output_bak = output;
-
- if (!s->buffer_size[1] || s->buffer_size[1] < out_size) {
- av_free(s->buffer[1]);
- s->buffer_size[1] = out_size;
- s->buffer[1] = av_malloc(s->buffer_size[1]);
- if (!s->buffer[1]) {
- av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
- return 0;
- }
- }
-
- output = s->buffer[1];
- }
-
- /* XXX: move those malloc to resample init code */
- for (i = 0; i < s->filter_channels; i++) {
- bufin[i] = av_malloc_array((nb_samples + s->temp_len), sizeof(short));
- bufout[i] = av_malloc_array(lenout, sizeof(short));
-
- if (!bufin[i] || !bufout[i]) {
- av_log(s->resample_context, AV_LOG_ERROR, "Could not allocate buffer\n");
- nb_samples1 = 0;
- goto fail;
- }
-
- memcpy(bufin[i], s->temp[i], s->temp_len * sizeof(short));
- buftmp2[i] = bufin[i] + s->temp_len;
- }
-
- if (s->input_channels == 2 && s->output_channels == 1) {
- buftmp3[0] = output;
- stereo_to_mono(buftmp2[0], input, nb_samples);
- } else if (s->output_channels >= 2 && s->input_channels == 1) {
- buftmp3[0] = bufout[0];
- memcpy(buftmp2[0], input, nb_samples * sizeof(short));
- } else if (s->input_channels == 6 && s->output_channels ==2) {
- buftmp3[0] = bufout[0];
- buftmp3[1] = bufout[1];
- surround_to_stereo(buftmp2, input, s->input_channels, nb_samples);
- } else if (s->output_channels >= s->input_channels && s->input_channels >= 2) {
- for (i = 0; i < s->input_channels; i++) {
- buftmp3[i] = bufout[i];
- }
- deinterleave(buftmp2, input, s->input_channels, nb_samples);
- } else {
- buftmp3[0] = output;
- memcpy(buftmp2[0], input, nb_samples * sizeof(short));
- }
-
- nb_samples += s->temp_len;
-
- /* resample each channel */
- nb_samples1 = 0; /* avoid warning */
- for (i = 0; i < s->filter_channels; i++) {
- int consumed;
- int is_last = i + 1 == s->filter_channels;
-
- nb_samples1 = av_resample(s->resample_context, buftmp3[i], bufin[i],
- &consumed, nb_samples, lenout, is_last);
- s->temp_len = nb_samples - consumed;
- s->temp[i] = av_realloc_array(s->temp[i], s->temp_len, sizeof(short));
- memcpy(s->temp[i], bufin[i] + consumed, s->temp_len * sizeof(short));
- }
-
- if (s->output_channels == 2 && s->input_channels == 1) {
- mono_to_stereo(output, buftmp3[0], nb_samples1);
- } else if (s->output_channels == 6 && s->input_channels == 2) {
- ac3_5p1_mux(output, buftmp3[0], buftmp3[1], nb_samples1);
- } else if ((s->output_channels == s->input_channels && s->input_channels >= 2) ||
- (s->output_channels == 2 && s->input_channels == 6)) {
- interleave(output, buftmp3, s->output_channels, nb_samples1);
- }
-
- if (s->sample_fmt[1] != AV_SAMPLE_FMT_S16) {
- int istride[1] = { 2 };
- int ostride[1] = { s->sample_size[1] };
- const void *ibuf[1] = { output };
- void *obuf[1] = { output_bak };
-
- if (av_audio_convert(s->convert_ctx[1], obuf, ostride,
- ibuf, istride, nb_samples1 * s->output_channels) < 0) {
- av_log(s->resample_context, AV_LOG_ERROR,
- "Audio sample format conversion failed\n");
- return 0;
- }
- }
-
-fail:
- for (i = 0; i < s->filter_channels; i++) {
- av_free(bufin[i]);
- av_free(bufout[i]);
- }
-
- return nb_samples1;
-}
-
-void audio_resample_close(ReSampleContext *s)
-{
- int i;
- av_resample_close(s->resample_context);
- for (i = 0; i < s->filter_channels; i++)
- av_freep(&s->temp[i]);
- av_freep(&s->buffer[0]);
- av_freep(&s->buffer[1]);
- av_audio_convert_free(s->convert_ctx[0]);
- av_audio_convert_free(s->convert_ctx[1]);
- av_free(s);
-}
-
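For reference, a minimal usage sketch of the API deleted here (it was already deprecated before this removal); the filter_length/log2_phase_count/linear/cutoff values are common choices rather than anything mandated by this code, and in, out and in_samples are assumed to be provided by the caller:

    ReSampleContext *rs = av_audio_resample_init(2, 2,         /* stereo out, stereo in */
                                                 48000, 44100, /* 44.1 kHz -> 48 kHz    */
                                                 AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16,
                                                 16, 10, 0, 0.8);
    if (rs) {
        /* out must hold about in_samples * 48000 / 44100 stereo frames, plus some slack */
        int out_samples = audio_resample(rs, out, in, in_samples);
        audio_resample_close(rs);
    }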
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif
diff --git a/media/ffvpx/libavcodec/resample2.c b/media/ffvpx/libavcodec/resample2.c
deleted file mode 100644
index 56ae9f722..000000000
--- a/media/ffvpx/libavcodec/resample2.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * audio resampling
- * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-/**
- * @file
- * audio resampling
- * @author Michael Niedermayer <michaelni@gmx.at>
- */
-
-#include "libavutil/avassert.h"
-#include "avcodec.h"
-#include "libavutil/common.h"
-
-#if FF_API_AVCODEC_RESAMPLE
-
-#ifndef CONFIG_RESAMPLE_HP
-#define FILTER_SHIFT 15
-
-typedef int16_t FELEM;
-typedef int32_t FELEM2;
-typedef int64_t FELEML;
-#define FELEM_MAX INT16_MAX
-#define FELEM_MIN INT16_MIN
-#define WINDOW_TYPE 9
-#elif !defined(CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE)
-#define FILTER_SHIFT 30
-
-#define FELEM int32_t
-#define FELEM2 int64_t
-#define FELEML int64_t
-#define FELEM_MAX INT32_MAX
-#define FELEM_MIN INT32_MIN
-#define WINDOW_TYPE 12
-#else
-#define FILTER_SHIFT 0
-
-typedef double FELEM;
-typedef double FELEM2;
-typedef double FELEML;
-#define WINDOW_TYPE 24
-#endif
-
-
-typedef struct AVResampleContext{
- const AVClass *av_class;
- FELEM *filter_bank;
- int filter_length;
- int ideal_dst_incr;
- int dst_incr;
- int index;
- int frac;
- int src_incr;
- int compensation_distance;
- int phase_shift;
- int phase_mask;
- int linear;
-}AVResampleContext;
-
-/**
- * 0th order modified bessel function of the first kind.
- */
-static double bessel(double x){
- double v=1;
- double lastv=0;
- double t=1;
- int i;
-
- x= x*x/4;
- for(i=1; v != lastv; i++){
- lastv=v;
- t *= x/(i*i);
- v += t;
- }
- return v;
-}
-
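The loop above sums the power series I0(x) = sum_{k>=0} (x^2/4)^k / (k!)^2: after the x = x*x/4 step, each iteration multiplies the previous term by x/(i*i), which is exactly the ratio between consecutive terms, and the summation stops as soon as adding a term no longer changes v in double precision. The result is used below to build the Kaiser window for WINDOW_TYPE >= 2.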
-/**
- * Build a polyphase filterbank.
- * @param factor resampling factor
- * @param scale wanted sum of coefficients for each filter
- * @param type 0->cubic, 1->blackman nuttall windowed sinc, 2..16->kaiser windowed sinc beta=2..16
- * @return 0 on success, negative on error
- */
-static int build_filter(FELEM *filter, double factor, int tap_count, int phase_count, int scale, int type){
- int ph, i;
- double x, y, w;
- double *tab = av_malloc_array(tap_count, sizeof(*tab));
- const int center= (tap_count-1)/2;
-
- if (!tab)
- return AVERROR(ENOMEM);
-
- /* if upsampling, only need to interpolate, no filter */
- if (factor > 1.0)
- factor = 1.0;
-
- for(ph=0;ph<phase_count;ph++) {
- double norm = 0;
- for(i=0;i<tap_count;i++) {
- x = M_PI * ((double)(i - center) - (double)ph / phase_count) * factor;
- if (x == 0) y = 1.0;
- else y = sin(x) / x;
- switch(type){
- case 0:{
- const float d= -0.5; //first order derivative = -0.5
- x = fabs(((double)(i - center) - (double)ph / phase_count) * factor);
- if(x<1.0) y= 1 - 3*x*x + 2*x*x*x + d*( -x*x + x*x*x);
- else y= d*(-4 + 8*x - 5*x*x + x*x*x);
- break;}
- case 1:
- w = 2.0*x / (factor*tap_count) + M_PI;
- y *= 0.3635819 - 0.4891775 * cos(w) + 0.1365995 * cos(2*w) - 0.0106411 * cos(3*w);
- break;
- default:
- w = 2.0*x / (factor*tap_count*M_PI);
- y *= bessel(type*sqrt(FFMAX(1-w*w, 0)));
- break;
- }
-
- tab[i] = y;
- norm += y;
- }
-
-        /* normalize so that a uniform (DC) input keeps the same level */
- for(i=0;i<tap_count;i++) {
-#ifdef CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE
- filter[ph * tap_count + i] = tab[i] / norm;
-#else
- filter[ph * tap_count + i] = av_clip(lrintf(tab[i] * scale / norm), FELEM_MIN, FELEM_MAX);
-#endif
- }
- }
-#if 0
- {
-#define LEN 1024
- int j,k;
- double sine[LEN + tap_count];
- double filtered[LEN];
- double maxff=-2, minff=2, maxsf=-2, minsf=2;
- for(i=0; i<LEN; i++){
- double ss=0, sf=0, ff=0;
- for(j=0; j<LEN+tap_count; j++)
- sine[j]= cos(i*j*M_PI/LEN);
- for(j=0; j<LEN; j++){
- double sum=0;
- ph=0;
- for(k=0; k<tap_count; k++)
- sum += filter[ph * tap_count + k] * sine[k+j];
- filtered[j]= sum / (1<<FILTER_SHIFT);
- ss+= sine[j + center] * sine[j + center];
- ff+= filtered[j] * filtered[j];
- sf+= sine[j + center] * filtered[j];
- }
- ss= sqrt(2*ss/LEN);
- ff= sqrt(2*ff/LEN);
- sf= 2*sf/LEN;
- maxff= FFMAX(maxff, ff);
- minff= FFMIN(minff, ff);
- maxsf= FFMAX(maxsf, sf);
- minsf= FFMIN(minsf, sf);
- if(i%11==0){
- av_log(NULL, AV_LOG_ERROR, "i:%4d ss:%f ff:%13.6e-%13.6e sf:%13.6e-%13.6e\n", i, ss, maxff, minff, maxsf, minsf);
- minff=minsf= 2;
- maxff=maxsf= -2;
- }
- }
- }
-#endif
-
- av_free(tab);
- return 0;
-}
-
-AVResampleContext *av_resample_init(int out_rate, int in_rate, int filter_size, int phase_shift, int linear, double cutoff){
- AVResampleContext *c= av_mallocz(sizeof(AVResampleContext));
- double factor= FFMIN(out_rate * cutoff / in_rate, 1.0);
- int phase_count= 1<<phase_shift;
-
- if (!c)
- return NULL;
-
- c->phase_shift= phase_shift;
- c->phase_mask= phase_count-1;
- c->linear= linear;
-
- c->filter_length= FFMAX((int)ceil(filter_size/factor), 1);
- c->filter_bank= av_mallocz_array(c->filter_length, (phase_count+1)*sizeof(FELEM));
- if (!c->filter_bank)
- goto error;
- if (build_filter(c->filter_bank, factor, c->filter_length, phase_count, 1<<FILTER_SHIFT, WINDOW_TYPE))
- goto error;
- memcpy(&c->filter_bank[c->filter_length*phase_count+1], c->filter_bank, (c->filter_length-1)*sizeof(FELEM));
- c->filter_bank[c->filter_length*phase_count]= c->filter_bank[c->filter_length - 1];
-
- if(!av_reduce(&c->src_incr, &c->dst_incr, out_rate, in_rate * (int64_t)phase_count, INT32_MAX/2))
- goto error;
- c->ideal_dst_incr= c->dst_incr;
-
- c->index= -phase_count*((c->filter_length-1)/2);
-
- return c;
-error:
- av_free(c->filter_bank);
- av_free(c);
- return NULL;
-}
-
-void av_resample_close(AVResampleContext *c){
- av_freep(&c->filter_bank);
- av_freep(&c);
-}
-
-void av_resample_compensate(AVResampleContext *c, int sample_delta, int compensation_distance){
-// sample_delta += (c->ideal_dst_incr - c->dst_incr)*(int64_t)c->compensation_distance / c->ideal_dst_incr;
- c->compensation_distance= compensation_distance;
- c->dst_incr = c->ideal_dst_incr - c->ideal_dst_incr * (int64_t)sample_delta / compensation_distance;
-}
-
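A worked example of the formula above (the numbers are illustrative only): with ideal_dst_incr = 1000, sample_delta = 5 and compensation_distance = 100, dst_incr becomes 1000 - 1000*5/100 = 950; av_resample() then runs with the adjusted increment until dst_index + 1 == compensation_distance, at which point it switches back to the ideal increment.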
-int av_resample(AVResampleContext *c, short *dst, short *src, int *consumed, int src_size, int dst_size, int update_ctx){
- int dst_index, i;
- int index= c->index;
- int frac= c->frac;
- int dst_incr_frac= c->dst_incr % c->src_incr;
- int dst_incr= c->dst_incr / c->src_incr;
- int compensation_distance= c->compensation_distance;
-
- if(compensation_distance == 0 && c->filter_length == 1 && c->phase_shift==0){
- int64_t index2= ((int64_t)index)<<32;
- int64_t incr= (1LL<<32) * c->dst_incr / c->src_incr;
- dst_size= FFMIN(dst_size, (src_size-1-index) * (int64_t)c->src_incr / c->dst_incr);
-
- for(dst_index=0; dst_index < dst_size; dst_index++){
- dst[dst_index] = src[index2>>32];
- index2 += incr;
- }
- index += dst_index * dst_incr;
- index += (frac + dst_index * (int64_t)dst_incr_frac) / c->src_incr;
- frac = (frac + dst_index * (int64_t)dst_incr_frac) % c->src_incr;
- }else{
- for(dst_index=0; dst_index < dst_size; dst_index++){
- FELEM *filter= c->filter_bank + c->filter_length*(index & c->phase_mask);
- int sample_index= index >> c->phase_shift;
- FELEM2 val=0;
-
- if(sample_index < 0){
- for(i=0; i<c->filter_length; i++)
- val += src[FFABS(sample_index + i) % src_size] * filter[i];
- }else if(sample_index + c->filter_length > src_size){
- break;
- }else if(c->linear){
- FELEM2 v2=0;
- for(i=0; i<c->filter_length; i++){
- val += src[sample_index + i] * (FELEM2)filter[i];
- v2 += src[sample_index + i] * (FELEM2)filter[i + c->filter_length];
- }
- val+=(v2-val)*(FELEML)frac / c->src_incr;
- }else{
- for(i=0; i<c->filter_length; i++){
- val += src[sample_index + i] * (FELEM2)filter[i];
- }
- }
-
-#ifdef CONFIG_RESAMPLE_AUDIOPHILE_KIDDY_MODE
- dst[dst_index] = av_clip_int16(lrintf(val));
-#else
- val = (val + (1<<(FILTER_SHIFT-1)))>>FILTER_SHIFT;
- dst[dst_index] = (unsigned)(val + 32768) > 65535 ? (val>>31) ^ 32767 : val;
-#endif
-
- frac += dst_incr_frac;
- index += dst_incr;
- if(frac >= c->src_incr){
- frac -= c->src_incr;
- index++;
- }
-
- if(dst_index + 1 == compensation_distance){
- compensation_distance= 0;
- dst_incr_frac= c->ideal_dst_incr % c->src_incr;
- dst_incr= c->ideal_dst_incr / c->src_incr;
- }
- }
- }
- *consumed= FFMAX(index, 0) >> c->phase_shift;
- if(index>=0) index &= c->phase_mask;
-
- if(compensation_distance){
- compensation_distance -= dst_index;
- av_assert2(compensation_distance > 0);
- }
- if(update_ctx){
- c->frac= frac;
- c->index= index;
- c->dst_incr= dst_incr_frac + c->src_incr*dst_incr;
- c->compensation_distance= compensation_distance;
- }
-
- return dst_index;
-}
-
-#endif
diff --git a/media/ffvpx/libavcodec/thread.h b/media/ffvpx/libavcodec/thread.h
index 318619316..540135fbc 100644
--- a/media/ffvpx/libavcodec/thread.h
+++ b/media/ffvpx/libavcodec/thread.h
@@ -29,7 +29,6 @@
#include "libavutil/buffer.h"
-#include "config.h"
#include "avcodec.h"
typedef struct ThreadFrame {
diff --git a/media/ffvpx/libavcodec/utils.c b/media/ffvpx/libavcodec/utils.c
index 0c47e761f..59d41ccbb 100644
--- a/media/ffvpx/libavcodec/utils.c
+++ b/media/ffvpx/libavcodec/utils.c
@@ -26,7 +26,6 @@
*/
#include "config.h"
-#include "libavutil/atomic.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
@@ -45,8 +44,8 @@
#include "libavutil/thread.h"
#include "avcodec.h"
#include "decode.h"
+#include "hwaccel.h"
#include "libavutil/opt.h"
-#include "me_cmp.h"
#include "mpegvideo.h"
#include "thread.h"
#include "frame_thread_encoder.h"
@@ -56,6 +55,7 @@
#include "version.h"
#include <stdlib.h>
#include <stdarg.h>
+#include <stdatomic.h>
#include <limits.h>
#include <float.h>
#if CONFIG_ICONV
@@ -65,58 +65,7 @@
#include "libavutil/ffversion.h"
const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
-#if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
-static int default_lockmgr_cb(void **arg, enum AVLockOp op)
-{
- void * volatile * mutex = arg;
- int err;
-
- switch (op) {
- case AV_LOCK_CREATE:
- return 0;
- case AV_LOCK_OBTAIN:
- if (!*mutex) {
- pthread_mutex_t *tmp = av_malloc(sizeof(pthread_mutex_t));
- if (!tmp)
- return AVERROR(ENOMEM);
- if ((err = pthread_mutex_init(tmp, NULL))) {
- av_free(tmp);
- return AVERROR(err);
- }
- if (avpriv_atomic_ptr_cas(mutex, NULL, tmp)) {
- pthread_mutex_destroy(tmp);
- av_free(tmp);
- }
- }
-
- if ((err = pthread_mutex_lock(*mutex)))
- return AVERROR(err);
-
- return 0;
- case AV_LOCK_RELEASE:
- if ((err = pthread_mutex_unlock(*mutex)))
- return AVERROR(err);
-
- return 0;
- case AV_LOCK_DESTROY:
- if (*mutex)
- pthread_mutex_destroy(*mutex);
- av_free(*mutex);
- avpriv_atomic_ptr_cas(mutex, *mutex, NULL);
- return 0;
- }
- return 1;
-}
-static int (*lockmgr_cb)(void **mutex, enum AVLockOp op) = default_lockmgr_cb;
-#else
-static int (*lockmgr_cb)(void **mutex, enum AVLockOp op) = NULL;
-#endif
-
-
-volatile int ff_avcodec_locked;
-static int volatile entangled_thread_counter = 0;
-static void *codec_mutex;
-static void *avformat_mutex;
+static AVMutex codec_mutex = AV_MUTEX_INITIALIZER;
void av_fast_padded_malloc(void *ptr, unsigned int *size, size_t min_size)
{
@@ -142,30 +91,6 @@ void av_fast_padded_mallocz(void *ptr, unsigned int *size, size_t min_size)
memset(*p, 0, min_size + AV_INPUT_BUFFER_PADDING_SIZE);
}
-/* encoder management */
-static AVCodec *first_avcodec = NULL;
-static AVCodec **last_avcodec = &first_avcodec;
-
-AVCodec *av_codec_next(const AVCodec *c)
-{
- if (c)
- return c->next;
- else
- return first_avcodec;
-}
-
-static av_cold void avcodec_init(void)
-{
- static int initialized = 0;
-
- if (initialized != 0)
- return;
- initialized = 1;
-
- if (CONFIG_ME_CMP)
- ff_me_cmp_init_static();
-}
-
int av_codec_is_encoder(const AVCodec *codec)
{
return codec && (codec->encode_sub || codec->encode2 ||codec->send_frame);
@@ -176,38 +101,6 @@ int av_codec_is_decoder(const AVCodec *codec)
return codec && (codec->decode || codec->receive_frame);
}
-av_cold void avcodec_register(AVCodec *codec)
-{
- AVCodec **p;
- avcodec_init();
- p = last_avcodec;
- codec->next = NULL;
-
- while(*p || avpriv_atomic_ptr_cas((void * volatile *)p, NULL, codec))
- p = &(*p)->next;
- last_avcodec = &codec->next;
-
- if (codec->init_static_data)
- codec->init_static_data(codec);
-}
-
-#if FF_API_EMU_EDGE
-unsigned avcodec_get_edge_width(void)
-{
- return EDGE_WIDTH;
-}
-#endif
-
-#if FF_API_SET_DIMENSIONS
-void avcodec_set_dimensions(AVCodecContext *s, int width, int height)
-{
- int ret = ff_set_dimensions(s, width, height);
- if (ret < 0) {
- av_log(s, AV_LOG_WARNING, "Failed to set dimensions %d %d\n", width, height);
- }
-}
-#endif
-
int ff_set_dimensions(AVCodecContext *s, int width, int height)
{
int ret = av_image_check_size2(width, height, s->max_pixels, AV_PIX_FMT_NONE, 0, s);
@@ -419,7 +312,10 @@ void avcodec_align_dimensions2(AVCodecContext *s, int *width, int *height,
*width = FFALIGN(*width, w_align);
*height = FFALIGN(*height, h_align);
- if (s->codec_id == AV_CODEC_ID_H264 || s->lowres) {
+ if (s->codec_id == AV_CODEC_ID_H264 || s->lowres ||
+ s->codec_id == AV_CODEC_ID_VP5 || s->codec_id == AV_CODEC_ID_VP6 ||
+ s->codec_id == AV_CODEC_ID_VP6F || s->codec_id == AV_CODEC_ID_VP6A
+ ) {
// some of the optimized chroma MC reads one line too much
// which is also done in mpeg decoders with lowres > 0
*height += 2;
@@ -569,6 +465,7 @@ enum AVPixelFormat avpriv_find_pix_fmt(const PixelFormatTag *tags,
return AV_PIX_FMT_NONE;
}
+#if FF_API_CODEC_GET_SET
MAKE_ACCESSORS(AVCodecContext, codec, AVRational, pkt_timebase)
MAKE_ACCESSORS(AVCodecContext, codec, const AVCodecDescriptor *, codec_descriptor)
MAKE_ACCESSORS(AVCodecContext, codec, int, lowres)
@@ -584,6 +481,7 @@ int av_codec_get_max_lowres(const AVCodec *codec)
{
return codec->max_lowres;
}
+#endif
int avpriv_codec_get_cap_skip_frame_fill_param(const AVCodec *codec){
return !!(codec->caps_internal & FF_CODEC_CAP_SKIP_FRAME_FILL_PARAM);
@@ -612,6 +510,19 @@ static int64_t get_bit_rate(AVCodecContext *ctx)
return bit_rate;
}
+
+static void ff_lock_avcodec(AVCodecContext *log_ctx, const AVCodec *codec)
+{
+ if (!(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE) && codec->init)
+ ff_mutex_lock(&codec_mutex);
+}
+
+static void ff_unlock_avcodec(const AVCodec *codec)
+{
+ if (!(codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE) && codec->init)
+ ff_mutex_unlock(&codec_mutex);
+}
+
int attribute_align_arg ff_codec_open2_recursive(AVCodecContext *avctx, const AVCodec *codec, AVDictionary **options)
{
int ret = 0;
@@ -651,9 +562,7 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
if (options)
av_dict_copy(&tmp, *options, 0);
- ret = ff_lock_avcodec(avctx, codec);
- if (ret < 0)
- return ret;
+ ff_lock_avcodec(avctx, codec);
avctx->internal = av_mallocz(sizeof(*avctx->internal));
if (!avctx->internal) {
@@ -833,12 +742,6 @@ int attribute_align_arg avcodec_open2(AVCodecContext *avctx, const AVCodec *code
avctx->lowres = avctx->codec->max_lowres;
}
-#if FF_API_VISMV
- if (avctx->debug_mv)
- av_log(avctx, AV_LOG_WARNING, "The 'vismv' option is deprecated, "
- "see the codecview filter instead.\n");
-#endif
-
if (av_codec_is_encoder(avctx->codec)) {
int i;
#if FF_API_CODED_FRAME
@@ -1025,11 +928,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
ret=0;
-#if FF_API_AUDIOENC_DELAY
- if (av_codec_is_encoder(avctx->codec))
- avctx->delay = avctx->initial_padding;
-#endif
-
if (av_codec_is_decoder(avctx->codec)) {
if (!avctx->bit_rate)
avctx->bit_rate = get_bit_rate(avctx);
@@ -1226,71 +1124,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
return 0;
}
-static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
-{
- switch(id){
-        //This is for future deprecated codec ids; it's empty since the
-        //last major bump but will fill up again over time, please don't remove it
- default : return id;
- }
-}
-
-static AVCodec *find_encdec(enum AVCodecID id, int encoder)
-{
- AVCodec *p, *experimental = NULL;
- p = first_avcodec;
- id= remap_deprecated_codec_id(id);
- while (p) {
- if ((encoder ? av_codec_is_encoder(p) : av_codec_is_decoder(p)) &&
- p->id == id) {
- if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
- experimental = p;
- } else
- return p;
- }
- p = p->next;
- }
- return experimental;
-}
-
-AVCodec *avcodec_find_encoder(enum AVCodecID id)
-{
- return find_encdec(id, 1);
-}
-
-AVCodec *avcodec_find_encoder_by_name(const char *name)
-{
- AVCodec *p;
- if (!name)
- return NULL;
- p = first_avcodec;
- while (p) {
- if (av_codec_is_encoder(p) && strcmp(name, p->name) == 0)
- return p;
- p = p->next;
- }
- return NULL;
-}
-
-AVCodec *avcodec_find_decoder(enum AVCodecID id)
-{
- return find_encdec(id, 0);
-}
-
-AVCodec *avcodec_find_decoder_by_name(const char *name)
-{
- AVCodec *p;
- if (!name)
- return NULL;
- p = first_avcodec;
- while (p) {
- if (av_codec_is_decoder(p) && strcmp(name, p->name) == 0)
- return p;
- p = p->next;
- }
- return NULL;
-}
-
const char *avcodec_get_name(enum AVCodecID id)
{
const AVCodecDescriptor *cd;
@@ -1836,13 +1669,13 @@ static int get_audio_frame_duration(enum AVCodecID id, int sr, int ch, int ba,
/* calc from frame_bytes, channels, and bits_per_coded_sample */
switch (id) {
case AV_CODEC_ID_PCM_DVD:
- if(bps<4)
+ if(bps<4 || frame_bytes<3)
return 0;
- return 2 * (frame_bytes / ((bps * 2 / 8) * ch));
+ return 2 * ((frame_bytes - 3) / ((bps * 2 / 8) * ch));
case AV_CODEC_ID_PCM_BLURAY:
- if(bps<4)
+ if(bps<4 || frame_bytes<4)
return 0;
- return frame_bytes / ((FFALIGN(ch, 2) * bps) / 8);
+ return (frame_bytes - 4) / ((FFALIGN(ch, 2) * bps) / 8);
case AV_CODEC_ID_S302M:
return 2 * (frame_bytes / ((bps + 4) / 4)) / ch;
}
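A worked example of the PCM_DVD change (the packet size is made up for illustration): with bps = 16 and ch = 2 the divisor is (16 * 2 / 8) * 2 = 8 bytes, so a 1608-byte packet now yields 2 * ((1608 - 3) / 8) = 400 samples instead of the old 2 * (1608 / 8) = 402. The 3-byte PCM_DVD header (4 bytes for PCM_BLURAY) is no longer counted as audio data, and packets shorter than the header now return 0 explicitly.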
@@ -1911,143 +1744,34 @@ int ff_match_2uint16(const uint16_t(*tab)[2], int size, int a, int b)
return i;
}
-#if FF_API_MISSING_SAMPLE
-FF_DISABLE_DEPRECATION_WARNINGS
-void av_log_missing_feature(void *avc, const char *feature, int want_sample)
+const AVCodecHWConfig *avcodec_get_hw_config(const AVCodec *codec, int index)
{
- av_log(avc, AV_LOG_WARNING, "%s is not implemented. Update your FFmpeg "
- "version to the newest one from Git. If the problem still "
- "occurs, it means that your file has a feature which has not "
- "been implemented.\n", feature);
- if(want_sample)
- av_log_ask_for_sample(avc, NULL);
+ int i;
+ if (!codec->hw_configs || index < 0)
+ return NULL;
+ for (i = 0; i <= index; i++)
+ if (!codec->hw_configs[i])
+ return NULL;
+ return &codec->hw_configs[index]->public;
}
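A caller-side sketch of the new accessor (ordinary libavcodec usage, not part of this patch; codec is an AVCodec obtained elsewhere and cfg/i are local to the example):

    const AVCodecHWConfig *cfg;
    int i;
    for (i = 0; (cfg = avcodec_get_hw_config(codec, i)); i++) {
        if ((cfg->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) &&
            cfg->device_type == AV_HWDEVICE_TYPE_VAAPI)
            break; /* cfg->pix_fmt is the format the decoder will output */
    }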
-void av_log_ask_for_sample(void *avc, const char *msg, ...)
+#if FF_API_USER_VISIBLE_AVHWACCEL
+AVHWAccel *av_hwaccel_next(const AVHWAccel *hwaccel)
{
- va_list argument_list;
-
- va_start(argument_list, msg);
-
- if (msg)
- av_vlog(avc, AV_LOG_WARNING, msg, argument_list);
- av_log(avc, AV_LOG_WARNING, "If you want to help, upload a sample "
- "of this file to ftp://upload.ffmpeg.org/incoming/ "
- "and contact the ffmpeg-devel mailing list. (ffmpeg-devel@ffmpeg.org)\n");
-
- va_end(argument_list);
+ return NULL;
}
-FF_ENABLE_DEPRECATION_WARNINGS
-#endif /* FF_API_MISSING_SAMPLE */
-
-static AVHWAccel *first_hwaccel = NULL;
-static AVHWAccel **last_hwaccel = &first_hwaccel;
void av_register_hwaccel(AVHWAccel *hwaccel)
{
- AVHWAccel **p = last_hwaccel;
- hwaccel->next = NULL;
- while(*p || avpriv_atomic_ptr_cas((void * volatile *)p, NULL, hwaccel))
- p = &(*p)->next;
- last_hwaccel = &hwaccel->next;
-}
-
-AVHWAccel *av_hwaccel_next(const AVHWAccel *hwaccel)
-{
- return hwaccel ? hwaccel->next : first_hwaccel;
}
+#endif
+#if FF_API_LOCKMGR
int av_lockmgr_register(int (*cb)(void **mutex, enum AVLockOp op))
{
- if (lockmgr_cb) {
- // There is no good way to rollback a failure to destroy the
- // mutex, so we ignore failures.
- lockmgr_cb(&codec_mutex, AV_LOCK_DESTROY);
- lockmgr_cb(&avformat_mutex, AV_LOCK_DESTROY);
- lockmgr_cb = NULL;
- codec_mutex = NULL;
- avformat_mutex = NULL;
- }
-
- if (cb) {
- void *new_codec_mutex = NULL;
- void *new_avformat_mutex = NULL;
- int err;
- if (err = cb(&new_codec_mutex, AV_LOCK_CREATE)) {
- return err > 0 ? AVERROR_UNKNOWN : err;
- }
- if (err = cb(&new_avformat_mutex, AV_LOCK_CREATE)) {
- // Ignore failures to destroy the newly created mutex.
- cb(&new_codec_mutex, AV_LOCK_DESTROY);
- return err > 0 ? AVERROR_UNKNOWN : err;
- }
- lockmgr_cb = cb;
- codec_mutex = new_codec_mutex;
- avformat_mutex = new_avformat_mutex;
- }
-
- return 0;
-}
-
-int ff_lock_avcodec(AVCodecContext *log_ctx, const AVCodec *codec)
-{
- if (codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE || !codec->init)
- return 0;
-
- if (lockmgr_cb) {
- if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_OBTAIN))
- return -1;
- }
-
- if (avpriv_atomic_int_add_and_fetch(&entangled_thread_counter, 1) != 1) {
- av_log(log_ctx, AV_LOG_ERROR,
- "Insufficient thread locking. At least %d threads are "
- "calling avcodec_open2() at the same time right now.\n",
- entangled_thread_counter);
- if (!lockmgr_cb)
- av_log(log_ctx, AV_LOG_ERROR, "No lock manager is set, please see av_lockmgr_register()\n");
- ff_avcodec_locked = 1;
- ff_unlock_avcodec(codec);
- return AVERROR(EINVAL);
- }
- av_assert0(!ff_avcodec_locked);
- ff_avcodec_locked = 1;
- return 0;
-}
-
-int ff_unlock_avcodec(const AVCodec *codec)
-{
- if (codec->caps_internal & FF_CODEC_CAP_INIT_THREADSAFE || !codec->init)
- return 0;
-
- av_assert0(ff_avcodec_locked);
- ff_avcodec_locked = 0;
- avpriv_atomic_int_add_and_fetch(&entangled_thread_counter, -1);
- if (lockmgr_cb) {
- if ((*lockmgr_cb)(&codec_mutex, AV_LOCK_RELEASE))
- return -1;
- }
-
- return 0;
-}
-
-int avpriv_lock_avformat(void)
-{
- if (lockmgr_cb) {
- if ((*lockmgr_cb)(&avformat_mutex, AV_LOCK_OBTAIN))
- return -1;
- }
- return 0;
-}
-
-int avpriv_unlock_avformat(void)
-{
- if (lockmgr_cb) {
- if ((*lockmgr_cb)(&avformat_mutex, AV_LOCK_RELEASE))
- return -1;
- }
return 0;
}
+#endif
unsigned int avpriv_toupper4(unsigned int x)
{
diff --git a/media/ffvpx/libavcodec/version.h b/media/ffvpx/libavcodec/version.h
index 10d9ac4eb..6895f1a46 100644
--- a/media/ffvpx/libavcodec/version.h
+++ b/media/ffvpx/libavcodec/version.h
@@ -27,8 +27,8 @@
#include "libavutil/version.h"
-#define LIBAVCODEC_VERSION_MAJOR 57
-#define LIBAVCODEC_VERSION_MINOR 107
+#define LIBAVCODEC_VERSION_MAJOR 58
+#define LIBAVCODEC_VERSION_MINOR 18
#define LIBAVCODEC_VERSION_MICRO 100
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
@@ -51,136 +51,18 @@
* at once through the bump. This improves the git bisect-ability of the change.
*/
-#ifndef FF_API_VIMA_DECODER
-#define FF_API_VIMA_DECODER (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_AUDIO_CONVERT
-#define FF_API_AUDIO_CONVERT (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_AVCODEC_RESAMPLE
-#define FF_API_AVCODEC_RESAMPLE FF_API_AUDIO_CONVERT
-#endif
-#ifndef FF_API_MISSING_SAMPLE
-#define FF_API_MISSING_SAMPLE (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
#ifndef FF_API_LOWRES
-#define FF_API_LOWRES (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_CAP_VDPAU
-#define FF_API_CAP_VDPAU (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_BUFS_VDPAU
-#define FF_API_BUFS_VDPAU (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_VOXWARE
-#define FF_API_VOXWARE (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_SET_DIMENSIONS
-#define FF_API_SET_DIMENSIONS (LIBAVCODEC_VERSION_MAJOR < 58)
+#define FF_API_LOWRES (LIBAVCODEC_VERSION_MAJOR < 59)
#endif
#ifndef FF_API_DEBUG_MV
#define FF_API_DEBUG_MV (LIBAVCODEC_VERSION_MAJOR < 58)
#endif
-#ifndef FF_API_AC_VLC
-#define FF_API_AC_VLC (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_OLD_MSMPEG4
-#define FF_API_OLD_MSMPEG4 (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_ASPECT_EXTENDED
-#define FF_API_ASPECT_EXTENDED (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_ARCH_ALPHA
-#define FF_API_ARCH_ALPHA (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_XVMC
-#define FF_API_XVMC (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_ERROR_RATE
-#define FF_API_ERROR_RATE (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_QSCALE_TYPE
-#define FF_API_QSCALE_TYPE (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_MB_TYPE
-#define FF_API_MB_TYPE (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_MAX_BFRAMES
-#define FF_API_MAX_BFRAMES (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_NEG_LINESIZES
-#define FF_API_NEG_LINESIZES (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_EMU_EDGE
-#define FF_API_EMU_EDGE (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_ARCH_SH4
-#define FF_API_ARCH_SH4 (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_ARCH_SPARC
-#define FF_API_ARCH_SPARC (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_UNUSED_MEMBERS
-#define FF_API_UNUSED_MEMBERS (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_IDCT_XVIDMMX
-#define FF_API_IDCT_XVIDMMX (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_INPUT_PRESERVED
-#define FF_API_INPUT_PRESERVED (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_NORMALIZE_AQP
-#define FF_API_NORMALIZE_AQP (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_GMC
-#define FF_API_GMC (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_MV0
-#define FF_API_MV0 (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_CODEC_NAME
-#define FF_API_CODEC_NAME (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_AFD
-#define FF_API_AFD (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_VISMV
-/* XXX: don't forget to drop the -vismv documentation */
-#define FF_API_VISMV (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_AUDIOENC_DELAY
-#define FF_API_AUDIOENC_DELAY (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_VAAPI_CONTEXT
-#define FF_API_VAAPI_CONTEXT (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
-#ifndef FF_API_MERGE_SD
-#define FF_API_MERGE_SD (LIBAVCODEC_VERSION_MAJOR < 58)
-#endif
#ifndef FF_API_AVCTX_TIMEBASE
#define FF_API_AVCTX_TIMEBASE (LIBAVCODEC_VERSION_MAJOR < 59)
#endif
-#ifndef FF_API_MPV_OPT
-#define FF_API_MPV_OPT (LIBAVCODEC_VERSION_MAJOR < 59)
-#endif
-#ifndef FF_API_STREAM_CODEC_TAG
-#define FF_API_STREAM_CODEC_TAG (LIBAVCODEC_VERSION_MAJOR < 59)
-#endif
-#ifndef FF_API_QUANT_BIAS
-#define FF_API_QUANT_BIAS (LIBAVCODEC_VERSION_MAJOR < 59)
-#endif
-#ifndef FF_API_RC_STRATEGY
-#define FF_API_RC_STRATEGY (LIBAVCODEC_VERSION_MAJOR < 59)
-#endif
#ifndef FF_API_CODED_FRAME
#define FF_API_CODED_FRAME (LIBAVCODEC_VERSION_MAJOR < 59)
#endif
-#ifndef FF_API_MOTION_EST
-#define FF_API_MOTION_EST (LIBAVCODEC_VERSION_MAJOR < 59)
-#endif
-#ifndef FF_API_WITHOUT_PREFIX
-#define FF_API_WITHOUT_PREFIX (LIBAVCODEC_VERSION_MAJOR < 59)
-#endif
#ifndef FF_API_SIDEDATA_ONLY_PKT
#define FF_API_SIDEDATA_ONLY_PKT (LIBAVCODEC_VERSION_MAJOR < 59)
#endif
@@ -238,6 +120,18 @@
#ifndef FF_API_GETCHROMA
#define FF_API_GETCHROMA (LIBAVCODEC_VERSION_MAJOR < 59)
#endif
+#ifndef FF_API_CODEC_GET_SET
+#define FF_API_CODEC_GET_SET (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_USER_VISIBLE_AVHWACCEL
+#define FF_API_USER_VISIBLE_AVHWACCEL (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_LOCKMGR
+#define FF_API_LOCKMGR (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
+#ifndef FF_API_NEXT
+#define FF_API_NEXT (LIBAVCODEC_VERSION_MAJOR < 59)
+#endif
#endif /* AVCODEC_VERSION_H */
diff --git a/media/ffvpx/libavcodec/vp8.c b/media/ffvpx/libavcodec/vp8.c
index 7841a9d96..62b9f8bc2 100644
--- a/media/ffvpx/libavcodec/vp8.c
+++ b/media/ffvpx/libavcodec/vp8.c
@@ -27,6 +27,7 @@
#include "libavutil/imgutils.h"
#include "avcodec.h"
+#include "hwaccel.h"
#include "internal.h"
#include "mathops.h"
#include "rectangle.h"
@@ -72,16 +73,30 @@ static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
return ret;
- if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
- ff_thread_release_buffer(s->avctx, &f->tf);
- return AVERROR(ENOMEM);
+ if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
+ goto fail;
+ if (s->avctx->hwaccel) {
+ const AVHWAccel *hwaccel = s->avctx->hwaccel;
+ if (hwaccel->frame_priv_data_size) {
+ f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
+ if (!f->hwaccel_priv_buf)
+ goto fail;
+ f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
+ }
}
return 0;
+
+fail:
+ av_buffer_unref(&f->seg_map);
+ ff_thread_release_buffer(s->avctx, &f->tf);
+ return AVERROR(ENOMEM);
}
static void vp8_release_frame(VP8Context *s, VP8Frame *f)
{
av_buffer_unref(&f->seg_map);
+ av_buffer_unref(&f->hwaccel_priv_buf);
+ f->hwaccel_picture_private = NULL;
ff_thread_release_buffer(s->avctx, &f->tf);
}
@@ -99,6 +114,12 @@ static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
vp8_release_frame(s, dst);
return AVERROR(ENOMEM);
}
+ if (src->hwaccel_picture_private) {
+ dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
+ if (!dst->hwaccel_priv_buf)
+ return AVERROR(ENOMEM);
+ dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
+ }
return 0;
}
@@ -140,12 +161,28 @@ static VP8Frame *vp8_find_free_buffer(VP8Context *s)
av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
abort();
}
- if (frame->tf.f->data[0])
+ if (frame->tf.f->buf[0])
vp8_release_frame(s, frame);
return frame;
}
+static enum AVPixelFormat get_pixel_format(VP8Context *s)
+{
+ enum AVPixelFormat pix_fmts[] = {
+#if CONFIG_VP8_VAAPI_HWACCEL
+ AV_PIX_FMT_VAAPI,
+#endif
+#if CONFIG_VP8_NVDEC_HWACCEL
+ AV_PIX_FMT_CUDA,
+#endif
+ AV_PIX_FMT_YUV420P,
+ AV_PIX_FMT_NONE,
+ };
+
+ return ff_get_format(s->avctx, pix_fmts);
+}
+
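get_pixel_format() above passes the candidate list to ff_get_format(), which ultimately consults the application's get_format callback; a minimal application-side callback could look like the sketch below (pick_fmt is a hypothetical name, not part of this patch):

    static enum AVPixelFormat pick_fmt(AVCodecContext *avctx, const enum AVPixelFormat *fmts)
    {
        const enum AVPixelFormat *p;
        for (p = fmts; *p != AV_PIX_FMT_NONE; p++) {
            if (*p == AV_PIX_FMT_VAAPI)  /* take the hwaccel format when it is offered */
                return *p;
        }
        return AV_PIX_FMT_YUV420P;       /* otherwise fall back to software decoding */
    }
    /* ... wired up with: avctx->get_format = pick_fmt; before avcodec_open2() */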
static av_always_inline
int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
{
@@ -161,6 +198,13 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
return ret;
}
+ if (!s->actually_webp && !is_vp7) {
+ s->pix_fmt = get_pixel_format(s);
+ if (s->pix_fmt < 0)
+ return AVERROR(EINVAL);
+ avctx->pix_fmt = s->pix_fmt;
+ }
+
s->mb_width = (s->avctx->coded_width + 15) / 16;
s->mb_height = (s->avctx->coded_height + 15) / 16;
@@ -218,8 +262,9 @@ static void parse_segment_info(VP8Context *s)
int i;
s->segmentation.update_map = vp8_rac_get(c);
+ s->segmentation.update_feature_data = vp8_rac_get(c);
- if (vp8_rac_get(c)) { // update segment feature data
+ if (s->segmentation.update_feature_data) {
s->segmentation.absolute_vals = vp8_rac_get(c);
for (i = 0; i < 4; i++)
@@ -274,6 +319,7 @@ static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
int size = AV_RL24(sizes + 3 * i);
if (buf_size - size < 0)
return -1;
+ s->coeff_partition_size[i] = size;
ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
if (ret < 0)
@@ -281,7 +327,11 @@ static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
buf += size;
buf_size -= size;
}
- return ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
+
+ s->coeff_partition_size[i] = buf_size;
+ ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
+
+ return 0;
}
static void vp7_get_quants(VP8Context *s)
@@ -308,28 +358,28 @@ static void vp8_get_quants(VP8Context *s)
VP56RangeCoder *c = &s->c;
int i, base_qi;
- int yac_qi = vp8_rac_get_uint(c, 7);
- int ydc_delta = vp8_rac_get_sint(c, 4);
- int y2dc_delta = vp8_rac_get_sint(c, 4);
- int y2ac_delta = vp8_rac_get_sint(c, 4);
- int uvdc_delta = vp8_rac_get_sint(c, 4);
- int uvac_delta = vp8_rac_get_sint(c, 4);
+ s->quant.yac_qi = vp8_rac_get_uint(c, 7);
+ s->quant.ydc_delta = vp8_rac_get_sint(c, 4);
+ s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
+ s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
+ s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
+ s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
for (i = 0; i < 4; i++) {
if (s->segmentation.enabled) {
base_qi = s->segmentation.base_quant[i];
if (!s->segmentation.absolute_vals)
- base_qi += yac_qi;
+ base_qi += s->quant.yac_qi;
} else
- base_qi = yac_qi;
+ base_qi = s->quant.yac_qi;
- s->qmat[i].luma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta, 7)];
+ s->qmat[i].luma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta, 7)];
s->qmat[i].luma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi, 7)];
- s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
+ s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
/* 101581>>16 is equivalent to 155/100 */
- s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
- s->qmat[i].chroma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
- s->qmat[i].chroma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
+ s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
+ s->qmat[i].chroma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
+ s->qmat[i].chroma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
s->qmat[i].chroma_qmul[0] = FFMIN(s->qmat[i].chroma_qmul[0], 132);
@@ -606,6 +656,8 @@ static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
s->fade_present = vp8_rac_get(c);
}
+ if (c->end <= c->buffer && c->bits >= 0)
+ return AVERROR_INVALIDDATA;
/* E. Fading information for previous frame */
if (s->fade_present && vp8_rac_get(c)) {
if ((ret = vp7_fade_frame(s ,c)) < 0)
@@ -661,6 +713,8 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
buf += 3;
buf_size -= 3;
+ s->header_partition_size = header_size;
+
if (s->profile > 3)
av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
@@ -726,9 +780,11 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
s->filter.level = vp8_rac_get_uint(c, 6);
s->filter.sharpness = vp8_rac_get_uint(c, 3);
- if ((s->lf_delta.enabled = vp8_rac_get(c)))
- if (vp8_rac_get(c))
+ if ((s->lf_delta.enabled = vp8_rac_get(c))) {
+ s->lf_delta.update = vp8_rac_get(c);
+ if (s->lf_delta.update)
update_lf_deltas(s);
+ }
if (setup_partitions(s, buf, buf_size)) {
av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
@@ -768,6 +824,13 @@ static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_si
vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
}
+ // Record the entropy coder state here so that hwaccels can use it.
+ s->c.code_word = vp56_rac_renorm(&s->c);
+ s->coder_state_at_header_end.input = s->c.buffer - (-s->c.bits / 8);
+ s->coder_state_at_header_end.range = s->c.high;
+ s->coder_state_at_header_end.value = s->c.code_word >> 16;
+ s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
+
return 0;
}
@@ -2540,7 +2603,6 @@ static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
}
-
static av_always_inline
int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
AVPacket *avpkt, int is_vp7)
@@ -2550,8 +2612,6 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
enum AVDiscard skip_thresh;
VP8Frame *av_uninit(curframe), *prev_frame;
- av_assert0(avctx->pix_fmt == AV_PIX_FMT_YUVA420P || avctx->pix_fmt == AV_PIX_FMT_YUV420P);
-
if (is_vp7)
ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
else
@@ -2560,6 +2620,17 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
if (ret < 0)
goto err;
+ if (s->actually_webp) {
+ // avctx->pix_fmt already set in caller.
+ } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
+ s->pix_fmt = get_pixel_format(s);
+ if (s->pix_fmt < 0) {
+ ret = AVERROR(EINVAL);
+ goto err;
+ }
+ avctx->pix_fmt = s->pix_fmt;
+ }
+
prev_frame = s->framep[VP56_FRAME_CURRENT];
referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
@@ -2578,7 +2649,7 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
// release no longer referenced frames
for (i = 0; i < 5; i++)
- if (s->frames[i].tf.f->data[0] &&
+ if (s->frames[i].tf.f->buf[0] &&
&s->frames[i] != prev_frame &&
&s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
&s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
@@ -2631,56 +2702,70 @@ int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
s->next_framep[VP56_FRAME_CURRENT] = curframe;
- if (avctx->codec->update_thread_context)
- ff_thread_finish_setup(avctx);
+ ff_thread_finish_setup(avctx);
- s->linesize = curframe->tf.f->linesize[0];
- s->uvlinesize = curframe->tf.f->linesize[1];
+ if (avctx->hwaccel) {
+ ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
+ if (ret < 0)
+ goto err;
- memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
- /* Zero macroblock structures for top/top-left prediction
- * from outside the frame. */
- if (!s->mb_layout)
- memset(s->macroblocks + s->mb_height * 2 - 1, 0,
- (s->mb_width + 1) * sizeof(*s->macroblocks));
- if (!s->mb_layout && s->keyframe)
- memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
+ ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
+ if (ret < 0)
+ goto err;
- memset(s->ref_count, 0, sizeof(s->ref_count));
+ ret = avctx->hwaccel->end_frame(avctx);
+ if (ret < 0)
+ goto err;
- if (s->mb_layout == 1) {
- // Make sure the previous frame has read its segmentation map,
- // if we re-use the same map.
- if (prev_frame && s->segmentation.enabled &&
- !s->segmentation.update_map)
- ff_thread_await_progress(&prev_frame->tf, 1, 0);
+ } else {
+ s->linesize = curframe->tf.f->linesize[0];
+ s->uvlinesize = curframe->tf.f->linesize[1];
+
+ memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
+ /* Zero macroblock structures for top/top-left prediction
+ * from outside the frame. */
+ if (!s->mb_layout)
+ memset(s->macroblocks + s->mb_height * 2 - 1, 0,
+ (s->mb_width + 1) * sizeof(*s->macroblocks));
+ if (!s->mb_layout && s->keyframe)
+ memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
+
+ memset(s->ref_count, 0, sizeof(s->ref_count));
+
+ if (s->mb_layout == 1) {
+ // Make sure the previous frame has read its segmentation map,
+ // if we re-use the same map.
+ if (prev_frame && s->segmentation.enabled &&
+ !s->segmentation.update_map)
+ ff_thread_await_progress(&prev_frame->tf, 1, 0);
+ if (is_vp7)
+ vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
+ else
+ vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
+ }
+
+ if (avctx->active_thread_type == FF_THREAD_FRAME)
+ num_jobs = 1;
+ else
+ num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
+ s->num_jobs = num_jobs;
+ s->curframe = curframe;
+ s->prev_frame = prev_frame;
+ s->mv_bounds.mv_min.y = -MARGIN;
+ s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
+ for (i = 0; i < MAX_THREADS; i++) {
+ VP8ThreadData *td = &s->thread_data[i];
+ atomic_init(&td->thread_mb_pos, 0);
+ atomic_init(&td->wait_mb_pos, INT_MAX);
+ }
if (is_vp7)
- vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
+ avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
+ num_jobs);
else
- vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
+ avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
+ num_jobs);
}
- if (avctx->active_thread_type == FF_THREAD_FRAME)
- num_jobs = 1;
- else
- num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
- s->num_jobs = num_jobs;
- s->curframe = curframe;
- s->prev_frame = prev_frame;
- s->mv_bounds.mv_min.y = -MARGIN;
- s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
- for (i = 0; i < MAX_THREADS; i++) {
- VP8ThreadData *td = &s->thread_data[i];
- atomic_init(&td->thread_mb_pos, 0);
- atomic_init(&td->wait_mb_pos, INT_MAX);
- }
- if (is_vp7)
- avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
- num_jobs);
- else
- avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
- num_jobs);
-
ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
@@ -2750,6 +2835,7 @@ int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
s->avctx = avctx;
s->vp7 = avctx->codec->id == AV_CODEC_ID_VP7;
+ s->pix_fmt = AV_PIX_FMT_NONE;
avctx->pix_fmt = AV_PIX_FMT_YUV420P;
avctx->internal->allocate_progress = 1;
@@ -2823,13 +2909,14 @@ static int vp8_decode_update_thread_context(AVCodecContext *dst,
s->mb_height = s_src->mb_height;
}
+ s->pix_fmt = s_src->pix_fmt;
s->prob[0] = s_src->prob[!s_src->update_probabilities];
s->segmentation = s_src->segmentation;
s->lf_delta = s_src->lf_delta;
memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
- if (s_src->frames[i].tf.f->data[0]) {
+ if (s_src->frames[i].tf.f->buf[0]) {
int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
if (ret < 0)
return ret;
@@ -2876,5 +2963,14 @@ AVCodec ff_vp8_decoder = {
.flush = vp8_decode_flush,
.init_thread_copy = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
.update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
+ .hw_configs = (const AVCodecHWConfigInternal*[]) {
+#if CONFIG_VP8_VAAPI_HWACCEL
+ HWACCEL_VAAPI(vp8),
+#endif
+#if CONFIG_VP8_NVDEC_HWACCEL
+ HWACCEL_NVDEC(vp8),
+#endif
+ NULL
+ },
};
#endif /* CONFIG_VP7_DECODER */
diff --git a/media/ffvpx/libavcodec/vp8.h b/media/ffvpx/libavcodec/vp8.h
index 8263997e3..70d21e3c6 100644
--- a/media/ffvpx/libavcodec/vp8.h
+++ b/media/ffvpx/libavcodec/vp8.h
@@ -138,12 +138,18 @@ typedef struct VP8ThreadData {
typedef struct VP8Frame {
ThreadFrame tf;
AVBufferRef *seg_map;
+
+ AVBufferRef *hwaccel_priv_buf;
+ void *hwaccel_picture_private;
} VP8Frame;
#define MAX_THREADS 8
typedef struct VP8Context {
VP8ThreadData *thread_data;
AVCodecContext *avctx;
+ enum AVPixelFormat pix_fmt;
+ int actually_webp;
+
VP8Frame *framep[4];
VP8Frame *next_framep[4];
VP8Frame *curframe;
@@ -172,6 +178,7 @@ typedef struct VP8Context {
uint8_t enabled;
uint8_t absolute_vals;
uint8_t update_map;
+ uint8_t update_feature_data;
int8_t base_quant[4];
int8_t filter_level[4]; ///< base loop filter level
} segmentation;
@@ -199,8 +206,19 @@ typedef struct VP8Context {
int16_t chroma_qmul[2];
} qmat[4];
+ // Raw quantisation values, which may be needed by hwaccel decode.
+ struct {
+ int yac_qi;
+ int ydc_delta;
+ int y2dc_delta;
+ int y2ac_delta;
+ int uvdc_delta;
+ int uvac_delta;
+ } quant;
+
struct {
uint8_t enabled; ///< whether each mb can have a different strength based on mode/ref
+ uint8_t update;
/**
* filter strength adjustment for the following macroblock modes:
@@ -228,6 +246,20 @@ typedef struct VP8Context {
VP56RangeCoder c; ///< header context, includes mb modes and motion vectors
+ /* This contains the entropy coder state at the end of the header
+ * block, in the form specified by the standard. For use by
+ * hwaccels, so that a hardware decoder has the information to
+ * start decoding at the macroblock layer.
+ */
+ struct {
+ const uint8_t *input;
+ uint32_t range;
+ uint32_t value;
+ int bit_count;
+ } coder_state_at_header_end;
+
+ int header_partition_size;
+
/**
* These are all of the updatable probabilities for binary decisions.
* They are only implicitly reset on keyframes, making it quite likely
@@ -265,6 +297,7 @@ typedef struct VP8Context {
*/
int num_coeff_partitions;
VP56RangeCoder coeff_partition[8];
+ int coeff_partition_size[8];
VideoDSPContext vdsp;
VP8DSPContext vp8dsp;
H264PredContext hpc;
diff --git a/media/ffvpx/libavcodec/vp8_parser.c b/media/ffvpx/libavcodec/vp8_parser.c
index 609f5077d..e2d91b271 100644
--- a/media/ffvpx/libavcodec/vp8_parser.c
+++ b/media/ffvpx/libavcodec/vp8_parser.c
@@ -28,6 +28,9 @@ static int parse(AVCodecParserContext *s,
unsigned int frame_type;
unsigned int profile;
+ *poutbuf = buf;
+ *poutbuf_size = buf_size;
+
if (buf_size < 3)
return buf_size;
diff --git a/media/ffvpx/libavcodec/vp9.c b/media/ffvpx/libavcodec/vp9.c
index f2cf19424..b1178c9c0 100644
--- a/media/ffvpx/libavcodec/vp9.c
+++ b/media/ffvpx/libavcodec/vp9.c
@@ -23,6 +23,7 @@
#include "avcodec.h"
#include "get_bits.h"
+#include "hwaccel.h"
#include "internal.h"
#include "profiles.h"
#include "thread.h"
@@ -169,7 +170,10 @@ fail:
static int update_size(AVCodecContext *avctx, int w, int h)
{
-#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + CONFIG_VP9_D3D11VA_HWACCEL * 2 + CONFIG_VP9_VAAPI_HWACCEL)
+#define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + \
+ CONFIG_VP9_D3D11VA_HWACCEL * 2 + \
+ CONFIG_VP9_NVDEC_HWACCEL + \
+ CONFIG_VP9_VAAPI_HWACCEL)
enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts;
VP9Context *s = avctx->priv_data;
uint8_t *p;
@@ -184,6 +188,7 @@ static int update_size(AVCodecContext *avctx, int w, int h)
switch (s->pix_fmt) {
case AV_PIX_FMT_YUV420P:
+ case AV_PIX_FMT_YUV420P10:
#if CONFIG_VP9_DXVA2_HWACCEL
*fmtp++ = AV_PIX_FMT_DXVA2_VLD;
#endif
@@ -191,12 +196,17 @@ static int update_size(AVCodecContext *avctx, int w, int h)
*fmtp++ = AV_PIX_FMT_D3D11VA_VLD;
*fmtp++ = AV_PIX_FMT_D3D11;
#endif
+#if CONFIG_VP9_NVDEC_HWACCEL
+ *fmtp++ = AV_PIX_FMT_CUDA;
+#endif
#if CONFIG_VP9_VAAPI_HWACCEL
*fmtp++ = AV_PIX_FMT_VAAPI;
#endif
break;
- case AV_PIX_FMT_YUV420P10:
case AV_PIX_FMT_YUV420P12:
+#if CONFIG_VP9_NVDEC_HWACCEL
+ *fmtp++ = AV_PIX_FMT_CUDA;
+#endif
#if CONFIG_VP9_VAAPI_HWACCEL
*fmtp++ = AV_PIX_FMT_VAAPI;
#endif
@@ -1787,4 +1797,23 @@ AVCodec ff_vp9_decoder = {
.init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
.update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
.profiles = NULL_IF_CONFIG_SMALL(ff_vp9_profiles),
+ .bsfs = "vp9_superframe_split",
+ .hw_configs = (const AVCodecHWConfigInternal*[]) {
+#if CONFIG_VP9_DXVA2_HWACCEL
+ HWACCEL_DXVA2(vp9),
+#endif
+#if CONFIG_VP9_D3D11VA_HWACCEL
+ HWACCEL_D3D11VA(vp9),
+#endif
+#if CONFIG_VP9_D3D11VA2_HWACCEL
+ HWACCEL_D3D11VA2(vp9),
+#endif
+#if CONFIG_VP9_NVDEC_HWACCEL
+ HWACCEL_NVDEC(vp9),
+#endif
+#if CONFIG_VP9_VAAPI_HWACCEL
+ HWACCEL_VAAPI(vp9),
+#endif
+ NULL
+ },
};
diff --git a/media/ffvpx/libavcodec/vp9_parser.c b/media/ffvpx/libavcodec/vp9_parser.c
index 9900e7ab1..9531f34a3 100644
--- a/media/ffvpx/libavcodec/vp9_parser.c
+++ b/media/ffvpx/libavcodec/vp9_parser.c
@@ -25,21 +25,19 @@
#include "libavcodec/get_bits.h"
#include "parser.h"
-typedef struct VP9ParseContext {
- int n_frames; // 1-8
- int size[8];
- int marker_size;
- int64_t pts;
-} VP9ParseContext;
-
-static int parse_frame(AVCodecParserContext *ctx, const uint8_t *buf, int size)
+static int parse(AVCodecParserContext *ctx,
+ AVCodecContext *avctx,
+ const uint8_t **out_data, int *out_size,
+ const uint8_t *data, int size)
{
- VP9ParseContext *s = ctx->priv_data;
GetBitContext gb;
- int res, profile, keyframe, invisible;
+ int res, profile, keyframe;
- if ((res = init_get_bits8(&gb, buf, size)) < 0)
- return res;
+ *out_data = data;
+ *out_size = size;
+
+ if ((res = init_get_bits8(&gb, data, size)) < 0)
+ return size; // parsers can't return errors
get_bits(&gb, 2); // frame marker
profile = get_bits1(&gb);
profile |= get_bits1(&gb) << 1;
@@ -47,10 +45,8 @@ static int parse_frame(AVCodecParserContext *ctx, const uint8_t *buf, int size)
if (get_bits1(&gb)) {
keyframe = 0;
- invisible = 0;
} else {
keyframe = !get_bits1(&gb);
- invisible = !get_bits1(&gb);
}
if (!keyframe) {
@@ -61,113 +57,10 @@ static int parse_frame(AVCodecParserContext *ctx, const uint8_t *buf, int size)
ctx->key_frame = 1;
}
- if (!invisible) {
- if (ctx->pts == AV_NOPTS_VALUE)
- ctx->pts = s->pts;
- s->pts = AV_NOPTS_VALUE;
- } else if (ctx->pts != AV_NOPTS_VALUE) {
- s->pts = ctx->pts;
- ctx->pts = AV_NOPTS_VALUE;
- }
-
- return 0;
-}
-
-static int parse(AVCodecParserContext *ctx,
- AVCodecContext *avctx,
- const uint8_t **out_data, int *out_size,
- const uint8_t *data, int size)
-{
- VP9ParseContext *s = ctx->priv_data;
- int full_size = size;
- int marker;
-
- if (size <= 0) {
- *out_size = 0;
- *out_data = data;
-
- return 0;
- }
-
- if (s->n_frames > 0) {
- int i;
- int size_sum = 0;
-
- for (i = 0; i < s->n_frames ;i++)
- size_sum += s->size[i];
- size_sum += s->marker_size;
-
- if (size_sum != size) {
- av_log(avctx, AV_LOG_ERROR, "Inconsistent input frame sizes %d %d\n",
- size_sum, size);
- s->n_frames = 0;
- }
- }
-
- if (s->n_frames > 0) {
- *out_data = data;
- *out_size = s->size[--s->n_frames];
- parse_frame(ctx, *out_data, *out_size);
-
- return s->n_frames > 0 ? *out_size : size /* i.e. include idx tail */;
- }
-
- marker = data[size - 1];
- if ((marker & 0xe0) == 0xc0) {
- int nbytes = 1 + ((marker >> 3) & 0x3);
- int n_frames = 1 + (marker & 0x7), idx_sz = 2 + n_frames * nbytes;
-
- if (size >= idx_sz && data[size - idx_sz] == marker) {
- const uint8_t *idx = data + size + 1 - idx_sz;
- int first = 1;
-
- switch (nbytes) {
-#define case_n(a, rd) \
- case a: \
- while (n_frames--) { \
- unsigned sz = rd; \
- idx += a; \
- if (sz == 0 || sz > size) { \
- s->n_frames = 0; \
- *out_size = size; \
- *out_data = data; \
- av_log(avctx, AV_LOG_ERROR, \
- "Invalid superframe packet size: %u frame size: %d\n", \
- sz, size); \
- return full_size; \
- } \
- if (first) { \
- first = 0; \
- *out_data = data; \
- *out_size = sz; \
- s->n_frames = n_frames; \
- } else { \
- s->size[n_frames] = sz; \
- } \
- data += sz; \
- size -= sz; \
- } \
- s->marker_size = size; \
- parse_frame(ctx, *out_data, *out_size); \
- return s->n_frames > 0 ? *out_size : full_size
-
- case_n(1, *idx);
- case_n(2, AV_RL16(idx));
- case_n(3, AV_RL24(idx));
- case_n(4, AV_RL32(idx));
- }
- }
- }
-
- *out_data = data;
- *out_size = size;
- parse_frame(ctx, data, size);
-
return size;
}
AVCodecParser ff_vp9_parser = {
.codec_ids = { AV_CODEC_ID_VP9 },
- .priv_data_size = sizeof(VP9ParseContext),
.parser_parse = parse,
};
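
For reference, the header probe that the slimmed-down parser above performs covers only the first few bits of the VP9 uncompressed frame header. The standalone sketch below mirrors that logic with a hypothetical MSB-first read_bit() helper instead of FFmpeg's GetBitContext, so it can be compiled in isolation; it is an illustration, not part of the patch.

#include <stdint.h>
#include <stddef.h>

/* Minimal MSB-first bit reader used only for this sketch. */
typedef struct { const uint8_t *buf; size_t size; size_t pos; } BitReader;

static int read_bit(BitReader *br)
{
    if (br->pos >= br->size * 8)
        return 0;                       /* out of data: treat as 0 */
    int bit = (br->buf[br->pos >> 3] >> (7 - (br->pos & 7))) & 1;
    br->pos++;
    return bit;
}

/* Returns 1 if the buffer starts with a VP9 keyframe header, 0 otherwise. */
static int vp9_probe_keyframe(const uint8_t *data, size_t size)
{
    BitReader br = { data, size, 0 };
    int profile;

    read_bit(&br); read_bit(&br);       /* frame marker */
    profile  = read_bit(&br);
    profile |= read_bit(&br) << 1;
    if (profile == 3)
        read_bit(&br);                  /* reserved bit for profile 3 */
    if (read_bit(&br))                  /* show_existing_frame */
        return 0;
    return !read_bit(&br);              /* frame_type: 0 = keyframe */
}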
diff --git a/media/ffvpx/libavcodec/vp9_superframe_split_bsf.c b/media/ffvpx/libavcodec/vp9_superframe_split_bsf.c
new file mode 100644
index 000000000..9c4aa33dc
--- /dev/null
+++ b/media/ffvpx/libavcodec/vp9_superframe_split_bsf.c
@@ -0,0 +1,147 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * This bitstream filter splits VP9 superframes into packets containing
+ * just one frame.
+ */
+
+#include <stddef.h>
+
+#include "avcodec.h"
+#include "bsf.h"
+#include "bytestream.h"
+#include "get_bits.h"
+
+typedef struct VP9SFSplitContext {
+ AVPacket buffer_pkt;
+
+ int nb_frames;
+ int next_frame;
+ size_t next_frame_offset;
+ int sizes[8];
+} VP9SFSplitContext;
+
+static int vp9_superframe_split_filter(AVBSFContext *ctx, AVPacket *out)
+{
+ VP9SFSplitContext *s = ctx->priv_data;
+ AVPacket *in;
+ int i, j, ret, marker;
+ int is_superframe = !!s->buffer_pkt.data;
+
+ if (!s->buffer_pkt.data) {
+ ret = ff_bsf_get_packet_ref(ctx, &s->buffer_pkt);
+ if (ret < 0)
+ return ret;
+ in = &s->buffer_pkt;
+
+ marker = in->data[in->size - 1];
+ if ((marker & 0xe0) == 0xc0) {
+ int length_size = 1 + ((marker >> 3) & 0x3);
+ int nb_frames = 1 + (marker & 0x7);
+ int idx_size = 2 + nb_frames * length_size;
+
+ if (in->size >= idx_size && in->data[in->size - idx_size] == marker) {
+ GetByteContext bc;
+ int64_t total_size = 0;
+
+ bytestream2_init(&bc, in->data + in->size + 1 - idx_size,
+ nb_frames * length_size);
+
+ for (i = 0; i < nb_frames; i++) {
+ int frame_size = 0;
+ for (j = 0; j < length_size; j++)
+ frame_size |= bytestream2_get_byte(&bc) << (j * 8);
+
+ total_size += frame_size;
+ if (frame_size < 0 || total_size > in->size - idx_size) {
+ av_log(ctx, AV_LOG_ERROR,
+ "Invalid frame size in a superframe: %d\n", frame_size);
+ ret = AVERROR(EINVAL);
+ goto fail;
+ }
+ s->sizes[i] = frame_size;
+ }
+ s->nb_frames = nb_frames;
+ s->next_frame = 0;
+ s->next_frame_offset = 0;
+ is_superframe = 1;
+ }
+ }
+ }
+
+ if (is_superframe) {
+ GetBitContext gb;
+ int profile, invisible = 0;
+
+ ret = av_packet_ref(out, &s->buffer_pkt);
+ if (ret < 0)
+ goto fail;
+
+ out->data += s->next_frame_offset;
+ out->size = s->sizes[s->next_frame];
+
+ s->next_frame_offset += out->size;
+ s->next_frame++;
+
+ if (s->next_frame >= s->nb_frames)
+ av_packet_unref(&s->buffer_pkt);
+
+ ret = init_get_bits8(&gb, out->data, out->size);
+ if (ret < 0)
+ goto fail;
+
+ get_bits(&gb, 2); // frame_marker
+ profile = get_bits1(&gb);
+ profile |= get_bits1(&gb) << 1;
+ if (profile == 3)
+ get_bits1(&gb);
+ if (!get_bits1(&gb)) {
+ get_bits1(&gb);
+ invisible = !get_bits1(&gb);
+ }
+
+ if (invisible)
+ out->pts = AV_NOPTS_VALUE;
+
+ } else {
+ av_packet_move_ref(out, &s->buffer_pkt);
+ }
+
+ return 0;
+fail:
+ if (ret < 0)
+ av_packet_unref(out);
+ av_packet_unref(&s->buffer_pkt);
+ return ret;
+}
+
+static void vp9_superframe_split_uninit(AVBSFContext *ctx)
+{
+ VP9SFSplitContext *s = ctx->priv_data;
+ av_packet_unref(&s->buffer_pkt);
+}
+
+const AVBitStreamFilter ff_vp9_superframe_split_bsf = {
+ .name = "vp9_superframe_split",
+ .priv_data_size = sizeof(VP9SFSplitContext),
+ .close = vp9_superframe_split_uninit,
+ .filter = vp9_superframe_split_filter,
+ .codec_ids = (const enum AVCodecID []){ AV_CODEC_ID_VP9, AV_CODEC_ID_NONE },
+};
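
The filter above relies on the trailing superframe index that VP9 encoders append to packets carrying more than one frame: a marker byte, nb_frames little-endian size fields of length_size bytes each, and the marker byte again. As a rough standalone sketch (vp9_parse_superframe_index() and its 8-entry sizes[] array are illustrative, not FFmpeg API), the same index can be decoded like this:

#include <stdint.h>
#include <stddef.h>

/*
 * Parse the superframe index at the end of a VP9 packet.
 * marker = 0b110xxyyy, where xx = length_size - 1 and yyy = nb_frames - 1.
 * Returns the number of frames (1..8) and fills sizes[], or 0 if the
 * packet is not a superframe or the index is inconsistent.
 */
static int vp9_parse_superframe_index(const uint8_t *data, size_t size,
                                      uint32_t sizes[8])
{
    if (size < 2)
        return 0;

    uint8_t marker = data[size - 1];
    if ((marker & 0xe0) != 0xc0)
        return 0;

    int length_size = 1 + ((marker >> 3) & 0x3);
    int nb_frames   = 1 + (marker & 0x7);
    size_t idx_size = 2 + (size_t)nb_frames * length_size;

    if (size < idx_size || data[size - idx_size] != marker)
        return 0;

    const uint8_t *idx = data + size + 1 - idx_size;  /* skip leading marker */
    uint64_t total = 0;

    for (int i = 0; i < nb_frames; i++) {
        uint32_t frame_size = 0;
        for (int j = 0; j < length_size; j++)          /* little-endian */
            frame_size |= (uint32_t)*idx++ << (j * 8);
        total += frame_size;
        if (total > size - idx_size)
            return 0;                                  /* sizes overrun payload */
        sizes[i] = frame_size;
    }
    return nb_frames;
}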
diff --git a/media/ffvpx/libavcodec/x86/constants.c b/media/ffvpx/libavcodec/x86/constants.c
index 11002ee61..4bfb78cc3 100644
--- a/media/ffvpx/libavcodec/x86/constants.c
+++ b/media/ffvpx/libavcodec/x86/constants.c
@@ -26,23 +26,23 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x000
0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL,
0x0002000200020002ULL, 0x0002000200020002ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
-DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
+DECLARE_ASM_ALIGNED(32, const ymm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL,
0x0004000400040004ULL, 0x0004000400040004ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_20) = { 0x0014001400140014ULL, 0x0014001400140014ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
+DECLARE_ASM_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
+DECLARE_ASM_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_255) = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_256) = { 0x0100010001000100ULL, 0x0100010001000100ULL,
@@ -74,7 +74,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pb_2) = { 0x0202020202020202ULL, 0x020
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL,
0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(32, const xmm_reg, ff_pb_15) = { 0x0F0F0F0F0F0F0F0FULL, 0x0F0F0F0F0F0F0F0FULL };
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL,
+ 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(32, const ymm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL,
0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
diff --git a/media/ffvpx/libavcodec/x86/constants.h b/media/ffvpx/libavcodec/x86/constants.h
index bbb0ef844..85da38b7b 100644
--- a/media/ffvpx/libavcodec/x86/constants.h
+++ b/media/ffvpx/libavcodec/x86/constants.h
@@ -57,7 +57,7 @@ extern const ymm_reg ff_pb_0;
extern const ymm_reg ff_pb_1;
extern const ymm_reg ff_pb_2;
extern const ymm_reg ff_pb_3;
-extern const xmm_reg ff_pb_80;
+extern const ymm_reg ff_pb_80;
extern const ymm_reg ff_pb_FE;
extern const uint64_t ff_pb_FC;
diff --git a/media/ffvpx/libavcodec/x86/vp8dsp.asm b/media/ffvpx/libavcodec/x86/vp8dsp.asm
index e303b8029..75de5690a 100644
--- a/media/ffvpx/libavcodec/x86/vp8dsp.asm
+++ b/media/ffvpx/libavcodec/x86/vp8dsp.asm
@@ -664,6 +664,37 @@ INIT_XMM sse2
FILTER_V 8
%macro FILTER_BILINEAR 1
+%if cpuflag(ssse3)
+cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
+ shl myd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vb_m]
+%endif
+ pxor m4, m4
+ mova m3, [bilinear_filter_vb+myq-16]
+.nextrow:
+ movh m0, [srcq+srcstrideq*0]
+ movh m1, [srcq+srcstrideq*1]
+ movh m2, [srcq+srcstrideq*2]
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ pavgw m0, m4
+ pavgw m1, m4
+%if mmsize==8
+ packuswb m0, m0
+ packuswb m1, m1
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m1
+%else
+ packuswb m0, m1
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%ifdef PIC
@@ -701,6 +732,7 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
+%endif ; cpuflag(ssse3)
lea dstq, [dstq+dststrideq*2]
lea srcq, [srcq+srcstrideq*2]
@@ -708,6 +740,37 @@ cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, p
jg .nextrow
REP_RET
+%if cpuflag(ssse3)
+cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
+ shl mxd, 4
+%ifdef PIC
+ lea picregq, [bilinear_filter_vb_m]
+%endif
+ pxor m4, m4
+ mova m2, [filter_h2_shuf]
+ mova m3, [bilinear_filter_vb+mxq-16]
+.nextrow:
+ movu m0, [srcq+srcstrideq*0]
+ movu m1, [srcq+srcstrideq*1]
+ pshufb m0, m2
+ pshufb m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ pavgw m0, m4
+ pavgw m1, m4
+%if mmsize==8
+ packuswb m0, m0
+ packuswb m1, m1
+ movh [dstq+dststrideq*0], m0
+ movh [dstq+dststrideq*1], m1
+%else
+ packuswb m0, m1
+ movh [dstq+dststrideq*0], m0
+ movhps [dstq+dststrideq*1], m0
+%endif
+%else ; cpuflag(ssse3)
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4
%ifdef PIC
@@ -746,6 +809,7 @@ cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
+%endif ; cpuflag(ssse3)
lea dstq, [dstq+dststrideq*2]
lea srcq, [srcq+srcstrideq*2]
@@ -758,85 +822,10 @@ INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
-
-%macro FILTER_BILINEAR_SSSE3 1
-cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
- shl myd, 4
-%ifdef PIC
- lea picregq, [bilinear_filter_vb_m]
-%endif
- pxor m4, m4
- mova m3, [bilinear_filter_vb+myq-16]
-.nextrow:
- movh m0, [srcq+srcstrideq*0]
- movh m1, [srcq+srcstrideq*1]
- movh m2, [srcq+srcstrideq*2]
- punpcklbw m0, m1
- punpcklbw m1, m2
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- psraw m0, 2
- psraw m1, 2
- pavgw m0, m4
- pavgw m1, m4
-%if mmsize==8
- packuswb m0, m0
- packuswb m1, m1
- movh [dstq+dststrideq*0], m0
- movh [dstq+dststrideq*1], m1
-%else
- packuswb m0, m1
- movh [dstq+dststrideq*0], m0
- movhps [dstq+dststrideq*1], m0
-%endif
-
- lea dstq, [dstq+dststrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-
-cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
- shl mxd, 4
-%ifdef PIC
- lea picregq, [bilinear_filter_vb_m]
-%endif
- pxor m4, m4
- mova m2, [filter_h2_shuf]
- mova m3, [bilinear_filter_vb+mxq-16]
-.nextrow:
- movu m0, [srcq+srcstrideq*0]
- movu m1, [srcq+srcstrideq*1]
- pshufb m0, m2
- pshufb m1, m2
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- psraw m0, 2
- psraw m1, 2
- pavgw m0, m4
- pavgw m1, m4
-%if mmsize==8
- packuswb m0, m0
- packuswb m1, m1
- movh [dstq+dststrideq*0], m0
- movh [dstq+dststrideq*1], m1
-%else
- packuswb m0, m1
- movh [dstq+dststrideq*0], m0
- movhps [dstq+dststrideq*1], m0
-%endif
-
- lea dstq, [dstq+dststrideq*2]
- lea srcq, [srcq+srcstrideq*2]
- sub heightd, 2
- jg .nextrow
- REP_RET
-%endmacro
-
INIT_MMX ssse3
-FILTER_BILINEAR_SSSE3 4
+FILTER_BILINEAR 4
INIT_XMM ssse3
-FILTER_BILINEAR_SSSE3 8
+FILTER_BILINEAR 8
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
diff --git a/media/ffvpx/libavutil/atomic.c b/media/ffvpx/libavutil/atomic.c
deleted file mode 100644
index 64cff2576..000000000
--- a/media/ffvpx/libavutil/atomic.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "atomic.h"
-
-#if !HAVE_ATOMICS_NATIVE
-
-#if HAVE_PTHREADS
-
-#include <pthread.h>
-
-static pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER;
-
-int avpriv_atomic_int_get(volatile int *ptr)
-{
- int res;
-
- pthread_mutex_lock(&atomic_lock);
- res = *ptr;
- pthread_mutex_unlock(&atomic_lock);
-
- return res;
-}
-
-void avpriv_atomic_int_set(volatile int *ptr, int val)
-{
- pthread_mutex_lock(&atomic_lock);
- *ptr = val;
- pthread_mutex_unlock(&atomic_lock);
-}
-
-int avpriv_atomic_int_add_and_fetch(volatile int *ptr, int inc)
-{
- int res;
-
- pthread_mutex_lock(&atomic_lock);
- *ptr += inc;
- res = *ptr;
- pthread_mutex_unlock(&atomic_lock);
-
- return res;
-}
-
-void *avpriv_atomic_ptr_cas(void * volatile *ptr, void *oldval, void *newval)
-{
- void *ret;
- pthread_mutex_lock(&atomic_lock);
- ret = *ptr;
- if (ret == oldval)
- *ptr = newval;
- pthread_mutex_unlock(&atomic_lock);
- return ret;
-}
-
-#elif !HAVE_THREADS
-
-int avpriv_atomic_int_get(volatile int *ptr)
-{
- return *ptr;
-}
-
-void avpriv_atomic_int_set(volatile int *ptr, int val)
-{
- *ptr = val;
-}
-
-int avpriv_atomic_int_add_and_fetch(volatile int *ptr, int inc)
-{
- *ptr += inc;
- return *ptr;
-}
-
-void *avpriv_atomic_ptr_cas(void * volatile *ptr, void *oldval, void *newval)
-{
- if (*ptr == oldval) {
- *ptr = newval;
- return oldval;
- }
- return *ptr;
-}
-
-#else /* HAVE_THREADS */
-
-/* This should never trigger, unless a new threading implementation
- * without correct atomics dependencies in configure or a corresponding
- * atomics implementation is added. */
-#error "Threading is enabled, but there is no implementation of atomic operations available"
-
-#endif /* HAVE_PTHREADS */
-
-#endif /* !HAVE_ATOMICS_NATIVE */
diff --git a/media/ffvpx/libavutil/atomic.h b/media/ffvpx/libavutil/atomic.h
deleted file mode 100644
index 15906d24c..000000000
--- a/media/ffvpx/libavutil/atomic.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_ATOMIC_H
-#define AVUTIL_ATOMIC_H
-
-#include "config.h"
-
-#if HAVE_ATOMICS_NATIVE
-
-#if HAVE_ATOMICS_GCC
-#include "atomic_gcc.h"
-#elif HAVE_ATOMICS_WIN32
-#include "atomic_win32.h"
-#elif HAVE_ATOMICS_SUNCC
-#include "atomic_suncc.h"
-#endif
-
-#else
-
-/**
- * Load the current value stored in an atomic integer.
- *
- * @param ptr atomic integer
- * @return the current value of the atomic integer
- * @note This acts as a memory barrier.
- */
-int avpriv_atomic_int_get(volatile int *ptr);
-
-/**
- * Store a new value in an atomic integer.
- *
- * @param ptr atomic integer
- * @param val the value to store in the atomic integer
- * @note This acts as a memory barrier.
- */
-void avpriv_atomic_int_set(volatile int *ptr, int val);
-
-/**
- * Add a value to an atomic integer.
- *
- * @param ptr atomic integer
- * @param inc the value to add to the atomic integer (may be negative)
- * @return the new value of the atomic integer.
- * @note This does NOT act as a memory barrier. This is primarily
- * intended for reference counting.
- */
-int avpriv_atomic_int_add_and_fetch(volatile int *ptr, int inc);
-
-/**
- * Atomic pointer compare and swap.
- *
- * @param ptr pointer to the pointer to operate on
- * @param oldval do the swap if the current value of *ptr equals to oldval
- * @param newval value to replace *ptr with
- * @return the value of *ptr before comparison
- */
-void *avpriv_atomic_ptr_cas(void * volatile *ptr, void *oldval, void *newval);
-
-#endif /* HAVE_ATOMICS_NATIVE */
-
-#endif /* AVUTIL_ATOMIC_H */
diff --git a/media/ffvpx/libavutil/atomic_gcc.h b/media/ffvpx/libavutil/atomic_gcc.h
deleted file mode 100644
index 2bb43c3ce..000000000
--- a/media/ffvpx/libavutil/atomic_gcc.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_ATOMIC_GCC_H
-#define AVUTIL_ATOMIC_GCC_H
-
-#include <stdint.h>
-
-#include "atomic.h"
-
-#define avpriv_atomic_int_get atomic_int_get_gcc
-static inline int atomic_int_get_gcc(volatile int *ptr)
-{
- __sync_synchronize();
- return *ptr;
-}
-
-#define avpriv_atomic_int_set atomic_int_set_gcc
-static inline void atomic_int_set_gcc(volatile int *ptr, int val)
-{
- *ptr = val;
- __sync_synchronize();
-}
-
-#define avpriv_atomic_int_add_and_fetch atomic_int_add_and_fetch_gcc
-static inline int atomic_int_add_and_fetch_gcc(volatile int *ptr, int inc)
-{
- return __sync_add_and_fetch(ptr, inc);
-}
-
-#define avpriv_atomic_ptr_cas atomic_ptr_cas_gcc
-static inline void *atomic_ptr_cas_gcc(void * volatile *ptr,
- void *oldval, void *newval)
-{
-#ifdef __ARMCC_VERSION
- // armcc will throw an error if ptr is not an integer type
- volatile uintptr_t *tmp = (volatile uintptr_t*)ptr;
- return (void*)__sync_val_compare_and_swap(tmp, oldval, newval);
-#else
- return __sync_val_compare_and_swap(ptr, oldval, newval);
-#endif
-}
-
-#endif /* AVUTIL_ATOMIC_GCC_H */
diff --git a/media/ffvpx/libavutil/atomic_win32.h b/media/ffvpx/libavutil/atomic_win32.h
deleted file mode 100644
index f7299336f..000000000
--- a/media/ffvpx/libavutil/atomic_win32.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVUTIL_ATOMIC_WIN32_H
-#define AVUTIL_ATOMIC_WIN32_H
-
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-
-#define avpriv_atomic_int_get atomic_int_get_win32
-static inline int atomic_int_get_win32(volatile int *ptr)
-{
- MemoryBarrier();
- return *ptr;
-}
-
-#define avpriv_atomic_int_set atomic_int_set_win32
-static inline void atomic_int_set_win32(volatile int *ptr, int val)
-{
- *ptr = val;
- MemoryBarrier();
-}
-
-#define avpriv_atomic_int_add_and_fetch atomic_int_add_and_fetch_win32
-static inline int atomic_int_add_and_fetch_win32(volatile int *ptr, int inc)
-{
- return inc + InterlockedExchangeAdd(ptr, inc);
-}
-
-#define avpriv_atomic_ptr_cas atomic_ptr_cas_win32
-static inline void *atomic_ptr_cas_win32(void * volatile *ptr,
- void *oldval, void *newval)
-{
- return InterlockedCompareExchangePointer(ptr, newval, oldval);
-}
-
-#endif /* AVUTIL_ATOMIC_WIN32_H */
diff --git a/media/ffvpx/libavutil/attributes.h b/media/ffvpx/libavutil/attributes.h
index 54d190111..ced108aa2 100644
--- a/media/ffvpx/libavutil/attributes.h
+++ b/media/ffvpx/libavutil/attributes.h
@@ -66,19 +66,19 @@
# define av_noinline
#endif
-#if AV_GCC_VERSION_AT_LEAST(3,1)
+#if AV_GCC_VERSION_AT_LEAST(3,1) || defined(__clang__)
# define av_pure __attribute__((pure))
#else
# define av_pure
#endif
-#if AV_GCC_VERSION_AT_LEAST(2,6)
+#if AV_GCC_VERSION_AT_LEAST(2,6) || defined(__clang__)
# define av_const __attribute__((const))
#else
# define av_const
#endif
-#if AV_GCC_VERSION_AT_LEAST(4,3)
+#if AV_GCC_VERSION_AT_LEAST(4,3) || defined(__clang__)
# define av_cold __attribute__((cold))
#else
# define av_cold
@@ -138,19 +138,19 @@
# define av_used
#endif
-#if AV_GCC_VERSION_AT_LEAST(3,3)
+#if AV_GCC_VERSION_AT_LEAST(3,3) || defined(__clang__)
# define av_alias __attribute__((may_alias))
#else
# define av_alias
#endif
-#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
# define av_uninit(x) x=x
#else
# define av_uninit(x) x
#endif
-#ifdef __GNUC__
+#if defined(__GNUC__) || defined(__clang__)
# define av_builtin_constant_p __builtin_constant_p
# define av_printf_format(fmtpos, attrpos) __attribute__((__format__(__printf__, fmtpos, attrpos)))
#else
@@ -158,7 +158,7 @@
# define av_printf_format(fmtpos, attrpos)
#endif
-#if AV_GCC_VERSION_AT_LEAST(2,5)
+#if AV_GCC_VERSION_AT_LEAST(2,5) || defined(__clang__)
# define av_noreturn __attribute__((noreturn))
#else
# define av_noreturn
diff --git a/media/ffvpx/libavutil/avutil.symbols b/media/ffvpx/libavutil/avutil.symbols
index ba68dc33c..ede2ff9ac 100644
--- a/media/ffvpx/libavutil/avutil.symbols
+++ b/media/ffvpx/libavutil/avutil.symbols
@@ -1,10 +1,7 @@
av_add_q
av_add_stable
-av_adler32_update
av_append_path_component
av_asprintf
-av_base64_decode
-av_base64_encode
av_basename
av_bprint_append_data
av_bprint_channel_layout
@@ -177,8 +174,10 @@ av_image_get_buffer_size
av_image_get_linesize
av_int_list_length_for_size
av_log
+#ifndef MOZ_FFVPX_FLACONLY
av_log2
av_log2_16bit
+#endif
av_log_default_callback
av_log_format_line
av_log_get_flags
@@ -257,7 +256,6 @@ av_pix_fmt_desc_get_id
av_pix_fmt_desc_next
av_pix_fmt_get_chroma_sub_sample
av_pix_fmt_swap_endianness
-av_pixelutils_get_sad_fn
av_q2intfloat
av_read_image_line
av_realloc
@@ -296,20 +294,6 @@ av_strstart
av_strtod
av_strtok
av_sub_q
-av_thread_message_queue_alloc
-av_thread_message_queue_free
-av_thread_message_queue_recv
-av_thread_message_queue_send
-av_thread_message_queue_set_err_recv
-av_thread_message_queue_set_err_send
-av_timecode_adjust_ntsc_framenum2
-av_timecode_check_frame_rate
-av_timecode_get_smpte_from_framenum
-av_timecode_init
-av_timecode_init_from_string
-av_timecode_make_mpeg_tc_string
-av_timecode_make_smpte_tc_string
-av_timecode_make_string
av_timegm
av_usleep
av_utf8_decode
@@ -320,14 +304,10 @@ av_vlog
av_write_image_line
avpriv_alloc_fixed_dsp
avpriv_float_dsp_alloc
-avpriv_frame_get_metadatap
-avpriv_get_gamma_from_trc
-avpriv_init_lls
avpriv_report_missing_feature
avpriv_request_sample
avpriv_scalarproduct_float_c
avpriv_set_systematic_pal2
-avpriv_solve_lls
avutil_configuration
avutil_license
avutil_version
@@ -337,3 +317,8 @@ avpriv_emms_asm
avpriv_slicethread_create
avpriv_slicethread_execute
avpriv_slicethread_free
+av_hwdevice_get_type_name
+av_hwframe_ctx_alloc
+av_hwframe_ctx_init
+av_malloc_array
+av_mallocz_array
diff --git a/media/ffvpx/libavutil/common.h b/media/ffvpx/libavutil/common.h
index 8142b31fd..8db029117 100644
--- a/media/ffvpx/libavutil/common.h
+++ b/media/ffvpx/libavutil/common.h
@@ -158,7 +158,7 @@ static av_always_inline av_const int64_t av_clip64_c(int64_t a, int64_t amin, in
*/
static av_always_inline av_const uint8_t av_clip_uint8_c(int a)
{
- if (a&(~0xFF)) return (-a)>>31;
+ if (a&(~0xFF)) return (~a)>>31;
else return a;
}
@@ -180,7 +180,7 @@ static av_always_inline av_const int8_t av_clip_int8_c(int a)
*/
static av_always_inline av_const uint16_t av_clip_uint16_c(int a)
{
- if (a&(~0xFFFF)) return (-a)>>31;
+ if (a&(~0xFFFF)) return (~a)>>31;
else return a;
}
@@ -228,7 +228,7 @@ static av_always_inline av_const int av_clip_intp2_c(int a, int p)
*/
static av_always_inline av_const unsigned av_clip_uintp2_c(int a, int p)
{
- if (a & ~((1<<p) - 1)) return -a >> 31 & ((1<<p) - 1);
+ if (a & ~((1<<p) - 1)) return (~a) >> 31 & ((1<<p) - 1);
else return a;
}
@@ -260,7 +260,7 @@ static av_always_inline int av_sat_add32_c(int a, int b)
*
* @param a first value
* @param b value doubled and added to a
- * @return sum with signed saturation
+ * @return sum sat(a + sat(2*b)) with signed saturation
*/
static av_always_inline int av_sat_dadd32_c(int a, int b)
{
@@ -268,6 +268,30 @@ static av_always_inline int av_sat_dadd32_c(int a, int b)
}
/**
+ * Subtract two signed 32-bit values with saturation.
+ *
+ * @param a one value
+ * @param b another value
+ * @return difference with signed saturation
+ */
+static av_always_inline int av_sat_sub32_c(int a, int b)
+{
+ return av_clipl_int32((int64_t)a - b);
+}
+
+/**
+ * Subtract a doubled value from another value with saturation at both stages.
+ *
+ * @param a first value
+ * @param b value doubled and subtracted from a
+ * @return difference sat(a - sat(2*b)) with signed saturation
+ */
+static av_always_inline int av_sat_dsub32_c(int a, int b)
+{
+ return av_sat_sub32(a, av_sat_add32(b, b));
+}
+
+/**
* Clip a float value into the amin-amax range.
* @param a value to clip
* @param amin minimum value of the clip range
@@ -513,6 +537,12 @@ static av_always_inline av_const int av_parity_c(uint32_t v)
#ifndef av_sat_dadd32
# define av_sat_dadd32 av_sat_dadd32_c
#endif
+#ifndef av_sat_sub32
+# define av_sat_sub32 av_sat_sub32_c
+#endif
+#ifndef av_sat_dsub32
+# define av_sat_dsub32 av_sat_dsub32_c
+#endif
#ifndef av_clipf
# define av_clipf av_clipf_c
#endif
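
The new av_sat_sub32_c()/av_sat_dsub32_c() helpers follow the same pattern as the existing saturating adders: widen to 64 bits, subtract, then clamp back to the 32-bit range. A minimal standalone sketch of that idea, using a local clip_int32() stand-in rather than libavutil's av_clipl_int32():

#include <stdint.h>
#include <limits.h>
#include <stdio.h>

/* Local stand-in for av_clipl_int32(): clamp a 64-bit value to int32 range. */
static int32_t clip_int32(int64_t v)
{
    if (v < INT32_MIN) return INT32_MIN;
    if (v > INT32_MAX) return INT32_MAX;
    return (int32_t)v;
}

static int32_t sat_sub32(int32_t a, int32_t b)
{
    return clip_int32((int64_t)a - b);   /* widen first, then clamp */
}

int main(void)
{
    /* INT32_MIN - 1 would overflow in plain 32-bit arithmetic;
     * the saturating version pins the result to the range limits. */
    printf("%d\n", sat_sub32(INT32_MIN, 1));   /* -2147483648 */
    printf("%d\n", sat_sub32(INT32_MAX, -1));  /*  2147483647 */
    return 0;
}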
diff --git a/media/ffvpx/libavutil/cpu.c b/media/ffvpx/libavutil/cpu.c
index c8401b825..6548cc304 100644
--- a/media/ffvpx/libavutil/cpu.c
+++ b/media/ffvpx/libavutil/cpu.c
@@ -80,7 +80,8 @@ void av_force_cpu_flags(int arg){
AV_CPU_FLAG_XOP |
AV_CPU_FLAG_FMA3 |
AV_CPU_FLAG_FMA4 |
- AV_CPU_FLAG_AVX2 ))
+ AV_CPU_FLAG_AVX2 |
+ AV_CPU_FLAG_AVX512 ))
&& !(arg & AV_CPU_FLAG_MMX)) {
av_log(NULL, AV_LOG_WARNING, "MMX implied by specified flags\n");
arg |= AV_CPU_FLAG_MMX;
@@ -126,6 +127,7 @@ int av_parse_cpu_flags(const char *s)
#define CPUFLAG_AVX2 (AV_CPU_FLAG_AVX2 | CPUFLAG_AVX)
#define CPUFLAG_BMI2 (AV_CPU_FLAG_BMI2 | AV_CPU_FLAG_BMI1)
#define CPUFLAG_AESNI (AV_CPU_FLAG_AESNI | CPUFLAG_SSE42)
+#define CPUFLAG_AVX512 (AV_CPU_FLAG_AVX512 | CPUFLAG_AVX2)
static const AVOption cpuflags_opts[] = {
{ "flags" , NULL, 0, AV_OPT_TYPE_FLAGS, { .i64 = 0 }, INT64_MIN, INT64_MAX, .unit = "flags" },
#if ARCH_PPC
@@ -154,6 +156,7 @@ int av_parse_cpu_flags(const char *s)
{ "3dnowext", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_3DNOWEXT }, .unit = "flags" },
{ "cmov", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_CMOV }, .unit = "flags" },
{ "aesni" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AESNI }, .unit = "flags" },
+ { "avx512" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX512 }, .unit = "flags" },
#elif ARCH_ARM
{ "armv5te", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_ARMV5TE }, .unit = "flags" },
{ "armv6", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_ARMV6 }, .unit = "flags" },
@@ -216,6 +219,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
{ "3dnowext", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_3DNOWEXT }, .unit = "flags" },
{ "cmov", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_CMOV }, .unit = "flags" },
{ "aesni", NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AESNI }, .unit = "flags" },
+ { "avx512" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX512 }, .unit = "flags" },
#define CPU_FLAG_P2 AV_CPU_FLAG_CMOV | AV_CPU_FLAG_MMX
#define CPU_FLAG_P3 CPU_FLAG_P2 | AV_CPU_FLAG_MMX2 | AV_CPU_FLAG_SSE
diff --git a/media/ffvpx/libavutil/cpu.h b/media/ffvpx/libavutil/cpu.h
index 9e5d40aff..8bb9eb606 100644
--- a/media/ffvpx/libavutil/cpu.h
+++ b/media/ffvpx/libavutil/cpu.h
@@ -55,6 +55,7 @@
#define AV_CPU_FLAG_FMA3 0x10000 ///< Haswell FMA3 functions
#define AV_CPU_FLAG_BMI1 0x20000 ///< Bit Manipulation Instruction Set 1
#define AV_CPU_FLAG_BMI2 0x40000 ///< Bit Manipulation Instruction Set 2
+#define AV_CPU_FLAG_AVX512 0x100000 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used
#define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard
#define AV_CPU_FLAG_VSX 0x0002 ///< ISA 2.06
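
Callers can test for the newly added flag at runtime with av_get_cpu_flags(); the small check program below is a hypothetical illustration, not part of this patch:

#include <stdio.h>
#include <libavutil/cpu.h>

int main(void)
{
    int flags = av_get_cpu_flags();

    /* AV_CPU_FLAG_AVX512 is the flag introduced above; when requested by
     * name via av_parse_cpu_flags()/av_parse_cpu_caps() it implies AVX2. */
    printf("AVX2:   %s\n", (flags & AV_CPU_FLAG_AVX2)   ? "yes" : "no");
    printf("AVX512: %s\n", (flags & AV_CPU_FLAG_AVX512) ? "yes" : "no");
    return 0;
}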
diff --git a/media/ffvpx/libavutil/cpu_internal.h b/media/ffvpx/libavutil/cpu_internal.h
index b8bf1e539..37122d1c5 100644
--- a/media/ffvpx/libavutil/cpu_internal.h
+++ b/media/ffvpx/libavutil/cpu_internal.h
@@ -19,6 +19,8 @@
#ifndef AVUTIL_CPU_INTERNAL_H
#define AVUTIL_CPU_INTERNAL_H
+#include "config.h"
+
#include "cpu.h"
#define CPUEXT_SUFFIX(flags, suffix, cpuext) \
diff --git a/media/ffvpx/libavutil/crc.c b/media/ffvpx/libavutil/crc.c
index 495732b16..c45ea63a6 100644
--- a/media/ffvpx/libavutil/crc.c
+++ b/media/ffvpx/libavutil/crc.c
@@ -20,6 +20,8 @@
#include "config.h"
+#include "thread.h"
+#include "avassert.h"
#include "bswap.h"
#include "common.h"
#include "crc.h"
@@ -50,6 +52,30 @@ static const AVCRC av_crc_table[AV_CRC_MAX][257] = {
0xDE, 0xD9, 0xD0, 0xD7, 0xC2, 0xC5, 0xCC, 0xCB, 0xE6, 0xE1, 0xE8, 0xEF,
0xFA, 0xFD, 0xF4, 0xF3, 0x01
},
+ [AV_CRC_8_EBU] = {
+ 0x00, 0x1D, 0x3A, 0x27, 0x74, 0x69, 0x4E, 0x53, 0xE8, 0xF5, 0xD2, 0xCF,
+ 0x9C, 0x81, 0xA6, 0xBB, 0xCD, 0xD0, 0xF7, 0xEA, 0xB9, 0xA4, 0x83, 0x9E,
+ 0x25, 0x38, 0x1F, 0x02, 0x51, 0x4C, 0x6B, 0x76, 0x87, 0x9A, 0xBD, 0xA0,
+ 0xF3, 0xEE, 0xC9, 0xD4, 0x6F, 0x72, 0x55, 0x48, 0x1B, 0x06, 0x21, 0x3C,
+ 0x4A, 0x57, 0x70, 0x6D, 0x3E, 0x23, 0x04, 0x19, 0xA2, 0xBF, 0x98, 0x85,
+ 0xD6, 0xCB, 0xEC, 0xF1, 0x13, 0x0E, 0x29, 0x34, 0x67, 0x7A, 0x5D, 0x40,
+ 0xFB, 0xE6, 0xC1, 0xDC, 0x8F, 0x92, 0xB5, 0xA8, 0xDE, 0xC3, 0xE4, 0xF9,
+ 0xAA, 0xB7, 0x90, 0x8D, 0x36, 0x2B, 0x0C, 0x11, 0x42, 0x5F, 0x78, 0x65,
+ 0x94, 0x89, 0xAE, 0xB3, 0xE0, 0xFD, 0xDA, 0xC7, 0x7C, 0x61, 0x46, 0x5B,
+ 0x08, 0x15, 0x32, 0x2F, 0x59, 0x44, 0x63, 0x7E, 0x2D, 0x30, 0x17, 0x0A,
+ 0xB1, 0xAC, 0x8B, 0x96, 0xC5, 0xD8, 0xFF, 0xE2, 0x26, 0x3B, 0x1C, 0x01,
+ 0x52, 0x4F, 0x68, 0x75, 0xCE, 0xD3, 0xF4, 0xE9, 0xBA, 0xA7, 0x80, 0x9D,
+ 0xEB, 0xF6, 0xD1, 0xCC, 0x9F, 0x82, 0xA5, 0xB8, 0x03, 0x1E, 0x39, 0x24,
+ 0x77, 0x6A, 0x4D, 0x50, 0xA1, 0xBC, 0x9B, 0x86, 0xD5, 0xC8, 0xEF, 0xF2,
+ 0x49, 0x54, 0x73, 0x6E, 0x3D, 0x20, 0x07, 0x1A, 0x6C, 0x71, 0x56, 0x4B,
+ 0x18, 0x05, 0x22, 0x3F, 0x84, 0x99, 0xBE, 0xA3, 0xF0, 0xED, 0xCA, 0xD7,
+ 0x35, 0x28, 0x0F, 0x12, 0x41, 0x5C, 0x7B, 0x66, 0xDD, 0xC0, 0xE7, 0xFA,
+ 0xA9, 0xB4, 0x93, 0x8E, 0xF8, 0xE5, 0xC2, 0xDF, 0x8C, 0x91, 0xB6, 0xAB,
+ 0x10, 0x0D, 0x2A, 0x37, 0x64, 0x79, 0x5E, 0x43, 0xB2, 0xAF, 0x88, 0x95,
+ 0xC6, 0xDB, 0xFC, 0xE1, 0x5A, 0x47, 0x60, 0x7D, 0x2E, 0x33, 0x14, 0x09,
+ 0x7F, 0x62, 0x45, 0x58, 0x0B, 0x16, 0x31, 0x2C, 0x97, 0x8A, 0xAD, 0xB0,
+ 0xE3, 0xFE, 0xD9, 0xC4, 0x01
+ },
[AV_CRC_16_ANSI] = {
0x0000, 0x0580, 0x0F80, 0x0A00, 0x1B80, 0x1E00, 0x1400, 0x1180,
0x3380, 0x3600, 0x3C00, 0x3980, 0x2800, 0x2D80, 0x2780, 0x2200,
@@ -291,20 +317,25 @@ static const AVCRC av_crc_table[AV_CRC_MAX][257] = {
#else
#define CRC_TABLE_SIZE 1024
#endif
-static struct {
- uint8_t le;
- uint8_t bits;
- uint32_t poly;
-} av_crc_table_params[AV_CRC_MAX] = {
- [AV_CRC_8_ATM] = { 0, 8, 0x07 },
- [AV_CRC_16_ANSI] = { 0, 16, 0x8005 },
- [AV_CRC_16_CCITT] = { 0, 16, 0x1021 },
- [AV_CRC_24_IEEE] = { 0, 24, 0x864CFB },
- [AV_CRC_32_IEEE] = { 0, 32, 0x04C11DB7 },
- [AV_CRC_32_IEEE_LE] = { 1, 32, 0xEDB88320 },
- [AV_CRC_16_ANSI_LE] = { 1, 16, 0xA001 },
-};
static AVCRC av_crc_table[AV_CRC_MAX][CRC_TABLE_SIZE];
+
+#define DECLARE_CRC_INIT_TABLE_ONCE(id, le, bits, poly) \
+static AVOnce id ## _once_control = AV_ONCE_INIT; \
+static void id ## _init_table_once(void) \
+{ \
+ av_assert0(av_crc_init(av_crc_table[id], le, bits, poly, sizeof(av_crc_table[id])) >= 0); \
+}
+
+#define CRC_INIT_TABLE_ONCE(id) ff_thread_once(&id ## _once_control, id ## _init_table_once)
+
+DECLARE_CRC_INIT_TABLE_ONCE(AV_CRC_8_ATM, 0, 8, 0x07)
+DECLARE_CRC_INIT_TABLE_ONCE(AV_CRC_8_EBU, 0, 8, 0x1D)
+DECLARE_CRC_INIT_TABLE_ONCE(AV_CRC_16_ANSI, 0, 16, 0x8005)
+DECLARE_CRC_INIT_TABLE_ONCE(AV_CRC_16_CCITT, 0, 16, 0x1021)
+DECLARE_CRC_INIT_TABLE_ONCE(AV_CRC_24_IEEE, 0, 24, 0x864CFB)
+DECLARE_CRC_INIT_TABLE_ONCE(AV_CRC_32_IEEE, 0, 32, 0x04C11DB7)
+DECLARE_CRC_INIT_TABLE_ONCE(AV_CRC_32_IEEE_LE, 1, 32, 0xEDB88320)
+DECLARE_CRC_INIT_TABLE_ONCE(AV_CRC_16_ANSI_LE, 1, 16, 0xA001)
#endif
int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
@@ -343,13 +374,17 @@ int av_crc_init(AVCRC *ctx, int le, int bits, uint32_t poly, int ctx_size)
const AVCRC *av_crc_get_table(AVCRCId crc_id)
{
#if !CONFIG_HARDCODED_TABLES
- if (!av_crc_table[crc_id][FF_ARRAY_ELEMS(av_crc_table[crc_id]) - 1])
- if (av_crc_init(av_crc_table[crc_id],
- av_crc_table_params[crc_id].le,
- av_crc_table_params[crc_id].bits,
- av_crc_table_params[crc_id].poly,
- sizeof(av_crc_table[crc_id])) < 0)
- return NULL;
+ switch (crc_id) {
+ case AV_CRC_8_ATM: CRC_INIT_TABLE_ONCE(AV_CRC_8_ATM); break;
+ case AV_CRC_8_EBU: CRC_INIT_TABLE_ONCE(AV_CRC_8_EBU); break;
+ case AV_CRC_16_ANSI: CRC_INIT_TABLE_ONCE(AV_CRC_16_ANSI); break;
+ case AV_CRC_16_CCITT: CRC_INIT_TABLE_ONCE(AV_CRC_16_CCITT); break;
+ case AV_CRC_24_IEEE: CRC_INIT_TABLE_ONCE(AV_CRC_24_IEEE); break;
+ case AV_CRC_32_IEEE: CRC_INIT_TABLE_ONCE(AV_CRC_32_IEEE); break;
+ case AV_CRC_32_IEEE_LE: CRC_INIT_TABLE_ONCE(AV_CRC_32_IEEE_LE); break;
+ case AV_CRC_16_ANSI_LE: CRC_INIT_TABLE_ONCE(AV_CRC_16_ANSI_LE); break;
+ default: av_assert0(0);
+ }
#endif
return av_crc_table[crc_id];
}
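
The CRC rewrite above replaces the lazily-checked parameter table with per-table once-initialisation via ff_thread_once(). A standalone sketch of the same pattern using plain pthread_once() (the CRC-8/ATM table here is generated locally for illustration and is not FFmpeg's AVCRC table):

#include <pthread.h>
#include <stdint.h>

static uint8_t crc8_atm_table[256];
static pthread_once_t crc8_atm_once = PTHREAD_ONCE_INIT;

/* Fill a byte-at-a-time table for CRC-8/ATM (polynomial 0x07, MSB first). */
static void crc8_atm_init_table(void)
{
    for (int i = 0; i < 256; i++) {
        unsigned c = i;
        for (int k = 0; k < 8; k++)
            c = (c & 0x80) ? ((c << 1) ^ 0x07) & 0xff : (c << 1) & 0xff;
        crc8_atm_table[i] = (uint8_t)c;
    }
}

static const uint8_t *get_crc8_atm_table(void)
{
    /* pthread_once() guarantees the init function runs exactly once,
     * even if several threads request the table concurrently. */
    pthread_once(&crc8_atm_once, crc8_atm_init_table);
    return crc8_atm_table;
}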
diff --git a/media/ffvpx/libavutil/crc.h b/media/ffvpx/libavutil/crc.h
index 2a1b0d762..47e22b4c7 100644
--- a/media/ffvpx/libavutil/crc.h
+++ b/media/ffvpx/libavutil/crc.h
@@ -53,11 +53,8 @@ typedef enum {
AV_CRC_32_IEEE,
AV_CRC_32_IEEE_LE, /*< reversed bitorder version of AV_CRC_32_IEEE */
AV_CRC_16_ANSI_LE, /*< reversed bitorder version of AV_CRC_16_ANSI */
-#if FF_API_CRC_BIG_TABLE
- AV_CRC_24_IEEE = 12,
-#else
AV_CRC_24_IEEE,
-#endif /* FF_API_CRC_BIG_TABLE */
+ AV_CRC_8_EBU,
AV_CRC_MAX, /*< Not part of public API! Do not use outside libavutil. */
}AVCRCId;
diff --git a/media/ffvpx/libavutil/dummy_funcs.c b/media/ffvpx/libavutil/dummy_funcs.c
index 3a2381074..5d1cdc819 100644
--- a/media/ffvpx/libavutil/dummy_funcs.c
+++ b/media/ffvpx/libavutil/dummy_funcs.c
@@ -5,22 +5,27 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "avutil.h"
+#include "hwcontext.h"
// cpu_internal.c
int ff_get_cpu_flags_aarch64(void) { return 0; }
+#if !defined(__arm__)
int ff_get_cpu_flags_arm(void) { return 0; }
+#endif
int ff_get_cpu_flags_ppc(void) { return 0; }
// float_dsp.c
#include "float_dsp.h"
void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp) {}
-void ff_float_dsp_init_arm(AVFloatDSPContext *fdsp) {}
void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int strict) {}
void ff_float_dsp_init_mips(AVFloatDSPContext *fdsp) {}
-
-int av_hwframe_get_buffer(struct AVBufferRef* hwframe_ref, struct AVFrame* frame, int flags) { return 0; }
+#if !defined(__arm__)
+void ff_float_dsp_init_arm(AVFloatDSPContext *fdsp) {}
+#endif
// cpu.c
size_t ff_get_cpu_max_align_aarch64() { return 0; }
-size_t ff_get_cpu_max_align_arm() { return 0; }
size_t ff_get_cpu_max_align_ppc() { return 0; }
+#if !defined(__arm__)
+size_t ff_get_cpu_max_align_arm() { return 0; }
+#endif
diff --git a/media/ffvpx/libavutil/eval.c b/media/ffvpx/libavutil/eval.c
index e5948793b..efed91b6e 100644
--- a/media/ffvpx/libavutil/eval.c
+++ b/media/ffvpx/libavutil/eval.c
@@ -57,7 +57,14 @@ typedef struct Parser {
double *var;
} Parser;
-static const AVClass eval_class = { "Eval", av_default_item_name, NULL, LIBAVUTIL_VERSION_INT, offsetof(Parser,log_offset), offsetof(Parser,log_ctx) };
+static const AVClass eval_class = {
+ .class_name = "Eval",
+ .item_name = av_default_item_name,
+ .option = NULL,
+ .version = LIBAVUTIL_VERSION_INT,
+ .log_level_offset_offset = offsetof(Parser, log_offset),
+ .parent_log_context_offset = offsetof(Parser, log_ctx),
+};
static const struct {
double bin_val;
diff --git a/media/ffvpx/libavutil/ffversion.h b/media/ffvpx/libavutil/ffversion.h
index 3da2a6bf1..9b533e039 100644
--- a/media/ffvpx/libavutil/ffversion.h
+++ b/media/ffvpx/libavutil/ffversion.h
@@ -1,5 +1,5 @@
/* Automatically generated by version.sh, do not manually edit! */
#ifndef AVUTIL_FFVERSION_H
#define AVUTIL_FFVERSION_H
-#define FFMPEG_VERSION "n3.4.2"
+#define FFMPEG_VERSION "n4.0.2"
#endif /* AVUTIL_FFVERSION_H */
diff --git a/media/ffvpx/libavutil/frame.c b/media/ffvpx/libavutil/frame.c
index d5fd2932e..00215ac29 100644
--- a/media/ffvpx/libavutil/frame.c
+++ b/media/ffvpx/libavutil/frame.c
@@ -26,11 +26,7 @@
#include "mem.h"
#include "samplefmt.h"
-
-static AVFrameSideData *frame_new_side_data(AVFrame *frame,
- enum AVFrameSideDataType type,
- AVBufferRef *buf);
-
+#if FF_API_FRAME_GET_SET
MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp)
MAKE_ACCESSORS(AVFrame, frame, int64_t, pkt_duration)
MAKE_ACCESSORS(AVFrame, frame, int64_t, pkt_pos)
@@ -42,41 +38,84 @@ MAKE_ACCESSORS(AVFrame, frame, int, decode_error_flags)
MAKE_ACCESSORS(AVFrame, frame, int, pkt_size)
MAKE_ACCESSORS(AVFrame, frame, enum AVColorSpace, colorspace)
MAKE_ACCESSORS(AVFrame, frame, enum AVColorRange, color_range)
+#endif
#define CHECK_CHANNELS_CONSISTENCY(frame) \
av_assert2(!(frame)->channel_layout || \
(frame)->channels == \
av_get_channel_layout_nb_channels((frame)->channel_layout))
-AVDictionary **avpriv_frame_get_metadatap(AVFrame *frame) {return &frame->metadata;};
-
#if FF_API_FRAME_QP
+struct qp_properties {
+ int stride;
+ int type;
+};
+
int av_frame_set_qp_table(AVFrame *f, AVBufferRef *buf, int stride, int qp_type)
{
+ struct qp_properties *p;
+ AVFrameSideData *sd;
+ AVBufferRef *ref;
+
+FF_DISABLE_DEPRECATION_WARNINGS
av_buffer_unref(&f->qp_table_buf);
f->qp_table_buf = buf;
-
-FF_DISABLE_DEPRECATION_WARNINGS
f->qscale_table = buf->data;
f->qstride = stride;
f->qscale_type = qp_type;
FF_ENABLE_DEPRECATION_WARNINGS
+ av_frame_remove_side_data(f, AV_FRAME_DATA_QP_TABLE_PROPERTIES);
+ av_frame_remove_side_data(f, AV_FRAME_DATA_QP_TABLE_DATA);
+
+ ref = av_buffer_ref(buf);
+ if (!av_frame_new_side_data_from_buf(f, AV_FRAME_DATA_QP_TABLE_DATA, ref)) {
+ av_buffer_unref(&ref);
+ return AVERROR(ENOMEM);
+ }
+
+ sd = av_frame_new_side_data(f, AV_FRAME_DATA_QP_TABLE_PROPERTIES,
+ sizeof(struct qp_properties));
+ if (!sd)
+ return AVERROR(ENOMEM);
+
+ p = (struct qp_properties *)sd->data;
+ p->stride = stride;
+ p->type = qp_type;
+
return 0;
}
int8_t *av_frame_get_qp_table(AVFrame *f, int *stride, int *type)
{
+ AVBufferRef *buf = NULL;
+
+ *stride = 0;
+ *type = 0;
+
FF_DISABLE_DEPRECATION_WARNINGS
- *stride = f->qstride;
- *type = f->qscale_type;
+ if (f->qp_table_buf) {
+ *stride = f->qstride;
+ *type = f->qscale_type;
+ buf = f->qp_table_buf;
FF_ENABLE_DEPRECATION_WARNINGS
+ } else {
+ AVFrameSideData *sd;
+ struct qp_properties *p;
+ sd = av_frame_get_side_data(f, AV_FRAME_DATA_QP_TABLE_PROPERTIES);
+ if (!sd)
+ return NULL;
+ p = (struct qp_properties *)sd->data;
+ sd = av_frame_get_side_data(f, AV_FRAME_DATA_QP_TABLE_DATA);
+ if (!sd)
+ return NULL;
+ *stride = p->stride;
+ *type = p->type;
+ buf = sd->buf;
+ }
- if (!f->qp_table_buf)
- return NULL;
-
- return f->qp_table_buf->data;
+ return buf ? buf->data : NULL;
}
#endif
@@ -208,7 +247,7 @@ static int get_video_buffer(AVFrame *frame, int align)
frame->data[i] = frame->buf[i]->data;
}
- if (desc->flags & AV_PIX_FMT_FLAG_PAL || desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL) {
+ if (desc->flags & AV_PIX_FMT_FLAG_PAL || desc->flags & FF_PSEUDOPAL) {
av_buffer_unref(&frame->buf[1]);
frame->buf[1] = av_buffer_alloc(AVPALETTE_SIZE);
if (!frame->buf[1])
@@ -356,8 +395,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
}
memcpy(sd_dst->data, sd_src->data, sd_src->size);
} else {
- sd_dst = frame_new_side_data(dst, sd_src->type, av_buffer_ref(sd_src->buf));
+ AVBufferRef *ref = av_buffer_ref(sd_src->buf);
+ sd_dst = av_frame_new_side_data_from_buf(dst, sd_src->type, ref);
if (!sd_dst) {
+ av_buffer_unref(&ref);
wipe_side_data(dst);
return AVERROR(ENOMEM);
}
@@ -383,12 +424,17 @@ FF_ENABLE_DEPRECATION_WARNINGS
#endif
av_buffer_unref(&dst->opaque_ref);
+ av_buffer_unref(&dst->private_ref);
if (src->opaque_ref) {
dst->opaque_ref = av_buffer_ref(src->opaque_ref);
if (!dst->opaque_ref)
return AVERROR(ENOMEM);
}
-
+ if (src->private_ref) {
+ dst->private_ref = av_buffer_ref(src->private_ref);
+ if (!dst->private_ref)
+ return AVERROR(ENOMEM);
+ }
return 0;
}
@@ -518,12 +564,15 @@ void av_frame_unref(AVFrame *frame)
av_freep(&frame->extended_buf);
av_dict_free(&frame->metadata);
#if FF_API_FRAME_QP
+FF_DISABLE_DEPRECATION_WARNINGS
av_buffer_unref(&frame->qp_table_buf);
+FF_ENABLE_DEPRECATION_WARNINGS
#endif
av_buffer_unref(&frame->hw_frames_ctx);
av_buffer_unref(&frame->opaque_ref);
+ av_buffer_unref(&frame->private_ref);
get_frame_defaults(frame);
}
@@ -636,9 +685,9 @@ AVBufferRef *av_frame_get_plane_buffer(AVFrame *frame, int plane)
return NULL;
}
-static AVFrameSideData *frame_new_side_data(AVFrame *frame,
- enum AVFrameSideDataType type,
- AVBufferRef *buf)
+AVFrameSideData *av_frame_new_side_data_from_buf(AVFrame *frame,
+ enum AVFrameSideDataType type,
+ AVBufferRef *buf)
{
AVFrameSideData *ret, **tmp;
@@ -646,17 +695,17 @@ static AVFrameSideData *frame_new_side_data(AVFrame *frame,
return NULL;
if (frame->nb_side_data > INT_MAX / sizeof(*frame->side_data) - 1)
- goto fail;
+ return NULL;
tmp = av_realloc(frame->side_data,
(frame->nb_side_data + 1) * sizeof(*frame->side_data));
if (!tmp)
- goto fail;
+ return NULL;
frame->side_data = tmp;
ret = av_mallocz(sizeof(*ret));
if (!ret)
- goto fail;
+ return NULL;
ret->buf = buf;
ret->data = ret->buf->data;
@@ -666,17 +715,18 @@ static AVFrameSideData *frame_new_side_data(AVFrame *frame,
frame->side_data[frame->nb_side_data++] = ret;
return ret;
-fail:
- av_buffer_unref(&buf);
- return NULL;
}
AVFrameSideData *av_frame_new_side_data(AVFrame *frame,
enum AVFrameSideDataType type,
int size)
{
-
- return frame_new_side_data(frame, type, av_buffer_alloc(size));
+ AVFrameSideData *ret;
+ AVBufferRef *buf = av_buffer_alloc(size);
+ ret = av_frame_new_side_data_from_buf(frame, type, buf);
+ if (!ret)
+ av_buffer_unref(&buf);
+ return ret;
}
AVFrameSideData *av_frame_get_side_data(const AVFrame *frame,
@@ -782,6 +832,8 @@ const char *av_frame_side_data_name(enum AVFrameSideDataType type)
case AV_FRAME_DATA_CONTENT_LIGHT_LEVEL: return "Content light level metadata";
case AV_FRAME_DATA_GOP_TIMECODE: return "GOP timecode";
case AV_FRAME_DATA_ICC_PROFILE: return "ICC profile";
+ case AV_FRAME_DATA_QP_TABLE_PROPERTIES: return "QP table properties";
+ case AV_FRAME_DATA_QP_TABLE_DATA: return "QP table data";
}
return NULL;
}
@@ -796,7 +848,7 @@ static int calc_cropping_offsets(size_t offsets[4], const AVFrame *frame,
int shift_x = (i == 1 || i == 2) ? desc->log2_chroma_w : 0;
int shift_y = (i == 1 || i == 2) ? desc->log2_chroma_h : 0;
- if (desc->flags & (AV_PIX_FMT_FLAG_PAL | AV_PIX_FMT_FLAG_PSEUDOPAL) && i == 1) {
+ if (desc->flags & (AV_PIX_FMT_FLAG_PAL | FF_PSEUDOPAL) && i == 1) {
offsets[i] = 0;
break;
}
diff --git a/media/ffvpx/libavutil/frame.h b/media/ffvpx/libavutil/frame.h
index abe4f4fd1..9d57d6ce6 100644
--- a/media/ffvpx/libavutil/frame.h
+++ b/media/ffvpx/libavutil/frame.h
@@ -141,6 +141,23 @@ enum AVFrameSideDataType {
* metadata key entry "name".
*/
AV_FRAME_DATA_ICC_PROFILE,
+
+#if FF_API_FRAME_QP
+ /**
+ * Implementation-specific description of the format of AV_FRAME_QP_TABLE_DATA.
+ * The contents of this side data are undocumented and internal; use
+ * av_frame_set_qp_table() and av_frame_get_qp_table() to access this in a
+ * meaningful way instead.
+ */
+ AV_FRAME_DATA_QP_TABLE_PROPERTIES,
+
+ /**
+ * Raw QP table data. Its format is described by
+ * AV_FRAME_DATA_QP_TABLE_PROPERTIES. Use av_frame_set_qp_table() and
+ * av_frame_get_qp_table() to access this instead.
+ */
+ AV_FRAME_DATA_QP_TABLE_DATA,
+#endif
};
enum AVActiveFormatDescription {
@@ -529,6 +546,7 @@ typedef struct AVFrame {
attribute_deprecated
int qscale_type;
+ attribute_deprecated
AVBufferRef *qp_table_buf;
#endif
/**
@@ -563,39 +581,77 @@ typedef struct AVFrame {
/**
* @}
*/
+
+ /**
+ * AVBufferRef for internal use by a single libav* library.
+ * Must not be used to transfer data between libraries.
+ * Has to be NULL when ownership of the frame leaves the respective library.
+ *
+ * Code outside the FFmpeg libs should never check or change the contents of the buffer ref.
+ *
+ * FFmpeg calls av_buffer_unref() on it when the frame is unreferenced.
+ * av_frame_copy_props() calls create a new reference with av_buffer_ref()
+ * for the target frame's private_ref field.
+ */
+ AVBufferRef *private_ref;
} AVFrame;
+#if FF_API_FRAME_GET_SET
/**
* Accessors for some AVFrame fields. These used to be provided for ABI
* compatibility, and do not need to be used anymore.
*/
+attribute_deprecated
int64_t av_frame_get_best_effort_timestamp(const AVFrame *frame);
+attribute_deprecated
void av_frame_set_best_effort_timestamp(AVFrame *frame, int64_t val);
+attribute_deprecated
int64_t av_frame_get_pkt_duration (const AVFrame *frame);
+attribute_deprecated
void av_frame_set_pkt_duration (AVFrame *frame, int64_t val);
+attribute_deprecated
int64_t av_frame_get_pkt_pos (const AVFrame *frame);
+attribute_deprecated
void av_frame_set_pkt_pos (AVFrame *frame, int64_t val);
+attribute_deprecated
int64_t av_frame_get_channel_layout (const AVFrame *frame);
+attribute_deprecated
void av_frame_set_channel_layout (AVFrame *frame, int64_t val);
+attribute_deprecated
int av_frame_get_channels (const AVFrame *frame);
+attribute_deprecated
void av_frame_set_channels (AVFrame *frame, int val);
+attribute_deprecated
int av_frame_get_sample_rate (const AVFrame *frame);
+attribute_deprecated
void av_frame_set_sample_rate (AVFrame *frame, int val);
+attribute_deprecated
AVDictionary *av_frame_get_metadata (const AVFrame *frame);
+attribute_deprecated
void av_frame_set_metadata (AVFrame *frame, AVDictionary *val);
+attribute_deprecated
int av_frame_get_decode_error_flags (const AVFrame *frame);
+attribute_deprecated
void av_frame_set_decode_error_flags (AVFrame *frame, int val);
+attribute_deprecated
int av_frame_get_pkt_size(const AVFrame *frame);
+attribute_deprecated
void av_frame_set_pkt_size(AVFrame *frame, int val);
-AVDictionary **avpriv_frame_get_metadatap(AVFrame *frame);
#if FF_API_FRAME_QP
+attribute_deprecated
int8_t *av_frame_get_qp_table(AVFrame *f, int *stride, int *type);
+attribute_deprecated
int av_frame_set_qp_table(AVFrame *f, AVBufferRef *buf, int stride, int type);
#endif
+attribute_deprecated
enum AVColorSpace av_frame_get_colorspace(const AVFrame *frame);
+attribute_deprecated
void av_frame_set_colorspace(AVFrame *frame, enum AVColorSpace val);
+attribute_deprecated
enum AVColorRange av_frame_get_color_range(const AVFrame *frame);
+attribute_deprecated
void av_frame_set_color_range(AVFrame *frame, enum AVColorRange val);
+#endif
/**
* Get the name of a colorspace.
@@ -763,6 +819,22 @@ AVFrameSideData *av_frame_new_side_data(AVFrame *frame,
int size);
/**
+ * Add a new side data to a frame from an existing AVBufferRef
+ *
+ * @param frame a frame to which the side data should be added
+ * @param type the type of the added side data
+ * @param buf an AVBufferRef to add as side data. The ownership of
+ * the reference is transferred to the frame.
+ *
+ * @return newly added side data on success, NULL on error. On failure
+ * the frame is unchanged and the AVBufferRef remains owned by
+ * the caller.
+ */
+AVFrameSideData *av_frame_new_side_data_from_buf(AVFrame *frame,
+ enum AVFrameSideDataType type,
+ AVBufferRef *buf);
+
+/**
* @return a pointer to the side data of a given type on success, NULL if there
* is no side data with such type in this frame.
*/
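
Per the doc comment above, av_frame_new_side_data_from_buf() transfers ownership of the AVBufferRef to the frame only on success; on failure the caller keeps the reference and must release it, which is also how the updated av_frame_new_side_data() now behaves internally. A short usage sketch of that contract (attach_side_data() is an illustrative helper, not FFmpeg API):

#include <libavutil/frame.h>
#include <libavutil/buffer.h>

/*
 * Allocate a buffer and attach it to a frame as side data of the given type.
 * On success the frame owns the buffer reference; on failure the reference
 * is still ours, so we unref it before reporting the error.
 */
static int attach_side_data(AVFrame *frame, enum AVFrameSideDataType type,
                            int size)
{
    AVBufferRef *buf = av_buffer_alloc(size);
    if (!buf)
        return AVERROR(ENOMEM);

    if (!av_frame_new_side_data_from_buf(frame, type, buf)) {
        av_buffer_unref(&buf);          /* failure: frame is unchanged */
        return AVERROR(ENOMEM);
    }
    return 0;                           /* success: frame owns the buffer */
}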
diff --git a/media/ffvpx/libavutil/hwcontext.c b/media/ffvpx/libavutil/hwcontext.c
new file mode 100644
index 000000000..70c556eca
--- /dev/null
+++ b/media/ffvpx/libavutil/hwcontext.c
@@ -0,0 +1,873 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "buffer.h"
+#include "common.h"
+#include "hwcontext.h"
+#include "hwcontext_internal.h"
+#include "imgutils.h"
+#include "log.h"
+#include "mem.h"
+#include "pixdesc.h"
+#include "pixfmt.h"
+
+static const HWContextType * const hw_table[] = {
+#if CONFIG_CUDA
+ &ff_hwcontext_type_cuda,
+#endif
+#if CONFIG_D3D11VA
+ &ff_hwcontext_type_d3d11va,
+#endif
+#if CONFIG_LIBDRM
+ &ff_hwcontext_type_drm,
+#endif
+#if CONFIG_DXVA2
+ &ff_hwcontext_type_dxva2,
+#endif
+#if CONFIG_OPENCL
+ &ff_hwcontext_type_opencl,
+#endif
+#if CONFIG_QSV
+ &ff_hwcontext_type_qsv,
+#endif
+#if CONFIG_VAAPI
+ &ff_hwcontext_type_vaapi,
+#endif
+#if CONFIG_VDPAU
+ &ff_hwcontext_type_vdpau,
+#endif
+#if CONFIG_VIDEOTOOLBOX
+ &ff_hwcontext_type_videotoolbox,
+#endif
+#if CONFIG_MEDIACODEC
+ &ff_hwcontext_type_mediacodec,
+#endif
+ NULL,
+};
+
+static const char *const hw_type_names[] = {
+ [AV_HWDEVICE_TYPE_CUDA] = "cuda",
+ [AV_HWDEVICE_TYPE_DRM] = "drm",
+ [AV_HWDEVICE_TYPE_DXVA2] = "dxva2",
+ [AV_HWDEVICE_TYPE_D3D11VA] = "d3d11va",
+ [AV_HWDEVICE_TYPE_OPENCL] = "opencl",
+ [AV_HWDEVICE_TYPE_QSV] = "qsv",
+ [AV_HWDEVICE_TYPE_VAAPI] = "vaapi",
+ [AV_HWDEVICE_TYPE_VDPAU] = "vdpau",
+ [AV_HWDEVICE_TYPE_VIDEOTOOLBOX] = "videotoolbox",
+ [AV_HWDEVICE_TYPE_MEDIACODEC] = "mediacodec",
+};
+
+enum AVHWDeviceType av_hwdevice_find_type_by_name(const char *name)
+{
+ int type;
+ for (type = 0; type < FF_ARRAY_ELEMS(hw_type_names); type++) {
+ if (hw_type_names[type] && !strcmp(hw_type_names[type], name))
+ return type;
+ }
+ return AV_HWDEVICE_TYPE_NONE;
+}
+
+const char *av_hwdevice_get_type_name(enum AVHWDeviceType type)
+{
+ if (type > AV_HWDEVICE_TYPE_NONE &&
+ type < FF_ARRAY_ELEMS(hw_type_names))
+ return hw_type_names[type];
+ else
+ return NULL;
+}
+
+enum AVHWDeviceType av_hwdevice_iterate_types(enum AVHWDeviceType prev)
+{
+ enum AVHWDeviceType next;
+ int i, set = 0;
+ for (i = 0; hw_table[i]; i++) {
+ if (prev != AV_HWDEVICE_TYPE_NONE && hw_table[i]->type <= prev)
+ continue;
+ if (!set || hw_table[i]->type < next) {
+ next = hw_table[i]->type;
+ set = 1;
+ }
+ }
+ return set ? next : AV_HWDEVICE_TYPE_NONE;
+}
+
+static const AVClass hwdevice_ctx_class = {
+ .class_name = "AVHWDeviceContext",
+ .item_name = av_default_item_name,
+ .version = LIBAVUTIL_VERSION_INT,
+};
+
+static void hwdevice_ctx_free(void *opaque, uint8_t *data)
+{
+ AVHWDeviceContext *ctx = (AVHWDeviceContext*)data;
+
+ /* uninit might still want access the hw context and the user
+ * free() callback might destroy it, so uninit has to be called first */
+ if (ctx->internal->hw_type->device_uninit)
+ ctx->internal->hw_type->device_uninit(ctx);
+
+ if (ctx->free)
+ ctx->free(ctx);
+
+ av_buffer_unref(&ctx->internal->source_device);
+
+ av_freep(&ctx->hwctx);
+ av_freep(&ctx->internal->priv);
+ av_freep(&ctx->internal);
+ av_freep(&ctx);
+}
+
+AVBufferRef *av_hwdevice_ctx_alloc(enum AVHWDeviceType type)
+{
+ AVHWDeviceContext *ctx;
+ AVBufferRef *buf;
+ const HWContextType *hw_type = NULL;
+ int i;
+
+ for (i = 0; hw_table[i]; i++) {
+ if (hw_table[i]->type == type) {
+ hw_type = hw_table[i];
+ break;
+ }
+ }
+ if (!hw_type)
+ return NULL;
+
+ ctx = av_mallocz(sizeof(*ctx));
+ if (!ctx)
+ return NULL;
+
+ ctx->internal = av_mallocz(sizeof(*ctx->internal));
+ if (!ctx->internal)
+ goto fail;
+
+ if (hw_type->device_priv_size) {
+ ctx->internal->priv = av_mallocz(hw_type->device_priv_size);
+ if (!ctx->internal->priv)
+ goto fail;
+ }
+
+ if (hw_type->device_hwctx_size) {
+ ctx->hwctx = av_mallocz(hw_type->device_hwctx_size);
+ if (!ctx->hwctx)
+ goto fail;
+ }
+
+ buf = av_buffer_create((uint8_t*)ctx, sizeof(*ctx),
+ hwdevice_ctx_free, NULL,
+ AV_BUFFER_FLAG_READONLY);
+ if (!buf)
+ goto fail;
+
+ ctx->type = type;
+ ctx->av_class = &hwdevice_ctx_class;
+
+ ctx->internal->hw_type = hw_type;
+
+ return buf;
+
+fail:
+ if (ctx->internal)
+ av_freep(&ctx->internal->priv);
+ av_freep(&ctx->internal);
+ av_freep(&ctx->hwctx);
+ av_freep(&ctx);
+ return NULL;
+}
+
+int av_hwdevice_ctx_init(AVBufferRef *ref)
+{
+ AVHWDeviceContext *ctx = (AVHWDeviceContext*)ref->data;
+ int ret;
+
+ if (ctx->internal->hw_type->device_init) {
+ ret = ctx->internal->hw_type->device_init(ctx);
+ if (ret < 0)
+ goto fail;
+ }
+
+ return 0;
+fail:
+ if (ctx->internal->hw_type->device_uninit)
+ ctx->internal->hw_type->device_uninit(ctx);
+ return ret;
+}
+
+static const AVClass hwframe_ctx_class = {
+ .class_name = "AVHWFramesContext",
+ .item_name = av_default_item_name,
+ .version = LIBAVUTIL_VERSION_INT,
+};
+
+static void hwframe_ctx_free(void *opaque, uint8_t *data)
+{
+ AVHWFramesContext *ctx = (AVHWFramesContext*)data;
+
+ if (ctx->internal->pool_internal)
+ av_buffer_pool_uninit(&ctx->internal->pool_internal);
+
+ if (ctx->internal->hw_type->frames_uninit)
+ ctx->internal->hw_type->frames_uninit(ctx);
+
+ if (ctx->free)
+ ctx->free(ctx);
+
+ av_buffer_unref(&ctx->internal->source_frames);
+
+ av_buffer_unref(&ctx->device_ref);
+
+ av_freep(&ctx->hwctx);
+ av_freep(&ctx->internal->priv);
+ av_freep(&ctx->internal);
+ av_freep(&ctx);
+}
+
+AVBufferRef *av_hwframe_ctx_alloc(AVBufferRef *device_ref_in)
+{
+ AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)device_ref_in->data;
+ const HWContextType *hw_type = device_ctx->internal->hw_type;
+ AVHWFramesContext *ctx;
+ AVBufferRef *buf, *device_ref = NULL;
+
+ ctx = av_mallocz(sizeof(*ctx));
+ if (!ctx)
+ return NULL;
+
+ ctx->internal = av_mallocz(sizeof(*ctx->internal));
+ if (!ctx->internal)
+ goto fail;
+
+ if (hw_type->frames_priv_size) {
+ ctx->internal->priv = av_mallocz(hw_type->frames_priv_size);
+ if (!ctx->internal->priv)
+ goto fail;
+ }
+
+ if (hw_type->frames_hwctx_size) {
+ ctx->hwctx = av_mallocz(hw_type->frames_hwctx_size);
+ if (!ctx->hwctx)
+ goto fail;
+ }
+
+ device_ref = av_buffer_ref(device_ref_in);
+ if (!device_ref)
+ goto fail;
+
+ buf = av_buffer_create((uint8_t*)ctx, sizeof(*ctx),
+ hwframe_ctx_free, NULL,
+ AV_BUFFER_FLAG_READONLY);
+ if (!buf)
+ goto fail;
+
+ ctx->av_class = &hwframe_ctx_class;
+ ctx->device_ref = device_ref;
+ ctx->device_ctx = device_ctx;
+ ctx->format = AV_PIX_FMT_NONE;
+ ctx->sw_format = AV_PIX_FMT_NONE;
+
+ ctx->internal->hw_type = hw_type;
+
+ return buf;
+
+fail:
+ if (device_ref)
+ av_buffer_unref(&device_ref);
+ if (ctx->internal)
+ av_freep(&ctx->internal->priv);
+ av_freep(&ctx->internal);
+ av_freep(&ctx->hwctx);
+ av_freep(&ctx);
+ return NULL;
+}
+
+static int hwframe_pool_prealloc(AVBufferRef *ref)
+{
+ AVHWFramesContext *ctx = (AVHWFramesContext*)ref->data;
+ AVFrame **frames;
+ int i, ret = 0;
+
+ frames = av_mallocz_array(ctx->initial_pool_size, sizeof(*frames));
+ if (!frames)
+ return AVERROR(ENOMEM);
+
+ for (i = 0; i < ctx->initial_pool_size; i++) {
+ frames[i] = av_frame_alloc();
+ if (!frames[i])
+ goto fail;
+
+ ret = av_hwframe_get_buffer(ref, frames[i], 0);
+ if (ret < 0)
+ goto fail;
+ }
+
+fail:
+ for (i = 0; i < ctx->initial_pool_size; i++)
+ av_frame_free(&frames[i]);
+ av_freep(&frames);
+
+ return ret;
+}
+
+int av_hwframe_ctx_init(AVBufferRef *ref)
+{
+ AVHWFramesContext *ctx = (AVHWFramesContext*)ref->data;
+ const enum AVPixelFormat *pix_fmt;
+ int ret;
+
+ if (ctx->internal->source_frames) {
+ /* A derived frame context is already initialised. */
+ return 0;
+ }
+
+ /* validate the pixel format */
+ for (pix_fmt = ctx->internal->hw_type->pix_fmts; *pix_fmt != AV_PIX_FMT_NONE; pix_fmt++) {
+ if (*pix_fmt == ctx->format)
+ break;
+ }
+ if (*pix_fmt == AV_PIX_FMT_NONE) {
+ av_log(ctx, AV_LOG_ERROR,
+ "The hardware pixel format '%s' is not supported by the device type '%s'\n",
+ av_get_pix_fmt_name(ctx->format), ctx->internal->hw_type->name);
+ return AVERROR(ENOSYS);
+ }
+
+ /* validate the dimensions */
+ ret = av_image_check_size(ctx->width, ctx->height, 0, ctx);
+ if (ret < 0)
+ return ret;
+
+ /* format-specific init */
+ if (ctx->internal->hw_type->frames_init) {
+ ret = ctx->internal->hw_type->frames_init(ctx);
+ if (ret < 0)
+ goto fail;
+ }
+
+ if (ctx->internal->pool_internal && !ctx->pool)
+ ctx->pool = ctx->internal->pool_internal;
+
+ /* preallocate the frames in the pool, if requested */
+ if (ctx->initial_pool_size > 0) {
+ ret = hwframe_pool_prealloc(ref);
+ if (ret < 0)
+ goto fail;
+ }
+
+ return 0;
+fail:
+ if (ctx->internal->hw_type->frames_uninit)
+ ctx->internal->hw_type->frames_uninit(ctx);
+ return ret;
+}
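av_hwframe_ctx_alloc() followed by av_hwframe_ctx_init() is how a caller sets up a frame pool on a device. A hedged sketch, assuming device_ref is an already-created device reference (see av_hwdevice_ctx_create() further down) and that the chosen formats are ones the backend actually lists in its pix_fmts table:

    AVBufferRef *frames_ref = av_hwframe_ctx_alloc(device_ref);
    if (frames_ref) {
        AVHWFramesContext *frames = (AVHWFramesContext*)frames_ref->data;
        frames->format            = AV_PIX_FMT_VAAPI; /* illustrative hw format */
        frames->sw_format         = AV_PIX_FMT_NV12;  /* illustrative sw format */
        frames->width             = 1920;
        frames->height            = 1080;
        frames->initial_pool_size = 4; /* > 0 triggers hwframe_pool_prealloc() */
        if (av_hwframe_ctx_init(frames_ref) < 0)
            av_buffer_unref(&frames_ref);
    }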
+
+int av_hwframe_transfer_get_formats(AVBufferRef *hwframe_ref,
+ enum AVHWFrameTransferDirection dir,
+ enum AVPixelFormat **formats, int flags)
+{
+ AVHWFramesContext *ctx = (AVHWFramesContext*)hwframe_ref->data;
+
+ if (!ctx->internal->hw_type->transfer_get_formats)
+ return AVERROR(ENOSYS);
+
+ return ctx->internal->hw_type->transfer_get_formats(ctx, dir, formats);
+}
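A short fragment showing how a caller queries which software formats a hardware frame can be downloaded to; hw_frame is an assumed frame whose hw_frames_ctx is set:

    enum AVPixelFormat *fmts = NULL;
    if (av_hwframe_transfer_get_formats(hw_frame->hw_frames_ctx,
                                        AV_HWFRAME_TRANSFER_DIRECTION_FROM,
                                        &fmts, 0) >= 0) {
        int i;
        for (i = 0; fmts[i] != AV_PIX_FMT_NONE; i++)
            printf("%s\n", av_get_pix_fmt_name(fmts[i]));
        av_freep(&fmts);
    }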
+
+static int transfer_data_alloc(AVFrame *dst, const AVFrame *src, int flags)
+{
+ AVHWFramesContext *ctx = (AVHWFramesContext*)src->hw_frames_ctx->data;
+ AVFrame *frame_tmp;
+ int ret = 0;
+
+ frame_tmp = av_frame_alloc();
+ if (!frame_tmp)
+ return AVERROR(ENOMEM);
+
+ /* if the format is set, use that;
+ * otherwise pick the first supported one */
+ if (dst->format >= 0) {
+ frame_tmp->format = dst->format;
+ } else {
+ enum AVPixelFormat *formats;
+
+ ret = av_hwframe_transfer_get_formats(src->hw_frames_ctx,
+ AV_HWFRAME_TRANSFER_DIRECTION_FROM,
+ &formats, 0);
+ if (ret < 0)
+ goto fail;
+ frame_tmp->format = formats[0];
+ av_freep(&formats);
+ }
+ frame_tmp->width = ctx->width;
+ frame_tmp->height = ctx->height;
+
+ ret = av_frame_get_buffer(frame_tmp, 32);
+ if (ret < 0)
+ goto fail;
+
+ ret = av_hwframe_transfer_data(frame_tmp, src, flags);
+ if (ret < 0)
+ goto fail;
+
+ frame_tmp->width = src->width;
+ frame_tmp->height = src->height;
+
+ av_frame_move_ref(dst, frame_tmp);
+
+fail:
+ av_frame_free(&frame_tmp);
+ return ret;
+}
+
+int av_hwframe_transfer_data(AVFrame *dst, const AVFrame *src, int flags)
+{
+ AVHWFramesContext *ctx;
+ int ret;
+
+ if (!dst->buf[0])
+ return transfer_data_alloc(dst, src, flags);
+
+ if (src->hw_frames_ctx) {
+ ctx = (AVHWFramesContext*)src->hw_frames_ctx->data;
+
+ ret = ctx->internal->hw_type->transfer_data_from(ctx, dst, src);
+ if (ret < 0)
+ return ret;
+ } else if (dst->hw_frames_ctx) {
+ ctx = (AVHWFramesContext*)dst->hw_frames_ctx->data;
+
+ ret = ctx->internal->hw_type->transfer_data_to(ctx, dst, src);
+ if (ret < 0)
+ return ret;
+ } else
+ return AVERROR(ENOSYS);
+
+ return 0;
+}
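A sketch of the common download path this function serves: copying a hardware frame into a freshly allocated system-memory frame. hw_frame is an assumed input with hw_frames_ctx set; leaving sw_frame->format unset lets transfer_data_alloc() above pick the first reported format:

    AVFrame *sw_frame = av_frame_alloc();
    if (sw_frame) {
        int err = av_hwframe_transfer_data(sw_frame, hw_frame, 0);
        if (err < 0)
            av_frame_free(&sw_frame);
        /* on success, sw_frame now holds CPU-accessible planes */
    }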
+
+int av_hwframe_get_buffer(AVBufferRef *hwframe_ref, AVFrame *frame, int flags)
+{
+ AVHWFramesContext *ctx = (AVHWFramesContext*)hwframe_ref->data;
+ int ret;
+
+ if (ctx->internal->source_frames) {
+ // This is a derived frame context, so we allocate in the source
+ // and map the frame immediately.
+ AVFrame *src_frame;
+
+ frame->format = ctx->format;
+ frame->hw_frames_ctx = av_buffer_ref(hwframe_ref);
+ if (!frame->hw_frames_ctx)
+ return AVERROR(ENOMEM);
+
+ src_frame = av_frame_alloc();
+ if (!src_frame)
+ return AVERROR(ENOMEM);
+
+ ret = av_hwframe_get_buffer(ctx->internal->source_frames,
+ src_frame, 0);
+ if (ret < 0) {
+ av_frame_free(&src_frame);
+ return ret;
+ }
+
+ ret = av_hwframe_map(frame, src_frame,
+ ctx->internal->source_allocation_map_flags);
+ if (ret) {
+ av_log(ctx, AV_LOG_ERROR, "Failed to map frame into derived "
+ "frame context: %d.\n", ret);
+ av_frame_free(&src_frame);
+ return ret;
+ }
+
+ // Free the source frame immediately - the mapped frame still
+ // contains a reference to it.
+ av_frame_free(&src_frame);
+
+ return 0;
+ }
+
+ if (!ctx->internal->hw_type->frames_get_buffer)
+ return AVERROR(ENOSYS);
+
+ if (!ctx->pool)
+ return AVERROR(EINVAL);
+
+ frame->hw_frames_ctx = av_buffer_ref(hwframe_ref);
+ if (!frame->hw_frames_ctx)
+ return AVERROR(ENOMEM);
+
+ ret = ctx->internal->hw_type->frames_get_buffer(ctx, frame);
+ if (ret < 0) {
+ av_buffer_unref(&frame->hw_frames_ctx);
+ return ret;
+ }
+
+ return 0;
+}
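Allocating a surface out of an initialised pool is then a two-liner; frames_ref is assumed to be a frames context reference that already passed av_hwframe_ctx_init():

    AVFrame *hw_frame = av_frame_alloc();
    if (hw_frame && av_hwframe_get_buffer(frames_ref, hw_frame, 0) < 0)
        av_frame_free(&hw_frame);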
+
+void *av_hwdevice_hwconfig_alloc(AVBufferRef *ref)
+{
+ AVHWDeviceContext *ctx = (AVHWDeviceContext*)ref->data;
+ const HWContextType *hw_type = ctx->internal->hw_type;
+
+ if (hw_type->device_hwconfig_size == 0)
+ return NULL;
+
+ return av_mallocz(hw_type->device_hwconfig_size);
+}
+
+AVHWFramesConstraints *av_hwdevice_get_hwframe_constraints(AVBufferRef *ref,
+ const void *hwconfig)
+{
+ AVHWDeviceContext *ctx = (AVHWDeviceContext*)ref->data;
+ const HWContextType *hw_type = ctx->internal->hw_type;
+ AVHWFramesConstraints *constraints;
+
+ if (!hw_type->frames_get_constraints)
+ return NULL;
+
+ constraints = av_mallocz(sizeof(*constraints));
+ if (!constraints)
+ return NULL;
+
+ constraints->min_width = constraints->min_height = 0;
+ constraints->max_width = constraints->max_height = INT_MAX;
+
+ if (hw_type->frames_get_constraints(ctx, hwconfig, constraints) >= 0) {
+ return constraints;
+ } else {
+ av_hwframe_constraints_free(&constraints);
+ return NULL;
+ }
+}
+
+void av_hwframe_constraints_free(AVHWFramesConstraints **constraints)
+{
+ if (*constraints) {
+ av_freep(&(*constraints)->valid_hw_formats);
+ av_freep(&(*constraints)->valid_sw_formats);
+ }
+ av_freep(constraints);
+}
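A sketch of querying allocation constraints before configuring a frames context; passing NULL for hwconfig asks for the generic constraints:

    AVHWFramesConstraints *cst =
        av_hwdevice_get_hwframe_constraints(device_ref, NULL);
    if (cst) {
        /* valid_hw_formats / valid_sw_formats are AV_PIX_FMT_NONE-terminated;
         * min_width..max_width and min_height..max_height bound the sizes. */
        av_hwframe_constraints_free(&cst);
    }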
+
+int av_hwdevice_ctx_create(AVBufferRef **pdevice_ref, enum AVHWDeviceType type,
+ const char *device, AVDictionary *opts, int flags)
+{
+ AVBufferRef *device_ref = NULL;
+ AVHWDeviceContext *device_ctx;
+ int ret = 0;
+
+ device_ref = av_hwdevice_ctx_alloc(type);
+ if (!device_ref) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ device_ctx = (AVHWDeviceContext*)device_ref->data;
+
+ if (!device_ctx->internal->hw_type->device_create) {
+ ret = AVERROR(ENOSYS);
+ goto fail;
+ }
+
+ ret = device_ctx->internal->hw_type->device_create(device_ctx, device,
+ opts, flags);
+ if (ret < 0)
+ goto fail;
+
+ ret = av_hwdevice_ctx_init(device_ref);
+ if (ret < 0)
+ goto fail;
+
+ *pdevice_ref = device_ref;
+ return 0;
+fail:
+ av_buffer_unref(&device_ref);
+ *pdevice_ref = NULL;
+ return ret;
+}
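This is the one-call entry point most callers use. A hedged sketch; the VAAPI type and render-node path are illustrative assumptions, and passing NULL as the device string lets the backend pick a default:

    AVBufferRef *device_ref = NULL;
    int err = av_hwdevice_ctx_create(&device_ref, AV_HWDEVICE_TYPE_VAAPI,
                                     "/dev/dri/renderD128", NULL, 0);
    if (err >= 0) {
        /* ... use the device ... */
        av_buffer_unref(&device_ref);
    }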
+
+int av_hwdevice_ctx_create_derived(AVBufferRef **dst_ref_ptr,
+ enum AVHWDeviceType type,
+ AVBufferRef *src_ref, int flags)
+{
+ AVBufferRef *dst_ref = NULL, *tmp_ref;
+ AVHWDeviceContext *dst_ctx, *tmp_ctx;
+ int ret = 0;
+
+ tmp_ref = src_ref;
+ while (tmp_ref) {
+ tmp_ctx = (AVHWDeviceContext*)tmp_ref->data;
+ if (tmp_ctx->type == type) {
+ dst_ref = av_buffer_ref(tmp_ref);
+ if (!dst_ref) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ goto done;
+ }
+ tmp_ref = tmp_ctx->internal->source_device;
+ }
+
+ dst_ref = av_hwdevice_ctx_alloc(type);
+ if (!dst_ref) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ dst_ctx = (AVHWDeviceContext*)dst_ref->data;
+
+ tmp_ref = src_ref;
+ while (tmp_ref) {
+ tmp_ctx = (AVHWDeviceContext*)tmp_ref->data;
+ if (dst_ctx->internal->hw_type->device_derive) {
+ ret = dst_ctx->internal->hw_type->device_derive(dst_ctx,
+ tmp_ctx,
+ flags);
+ if (ret == 0) {
+ dst_ctx->internal->source_device = av_buffer_ref(src_ref);
+ if (!dst_ctx->internal->source_device) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ goto done;
+ }
+ if (ret != AVERROR(ENOSYS))
+ goto fail;
+ }
+ tmp_ref = tmp_ctx->internal->source_device;
+ }
+
+ ret = AVERROR(ENOSYS);
+ goto fail;
+
+done:
+ ret = av_hwdevice_ctx_init(dst_ref);
+ if (ret < 0)
+ goto fail;
+
+ *dst_ref_ptr = dst_ref;
+ return 0;
+
+fail:
+ av_buffer_unref(&dst_ref);
+ *dst_ref_ptr = NULL;
+ return ret;
+}
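Derivation only succeeds where one of the two backends implements device_derive for the other; the OpenCL target in this sketch is purely illustrative:

    AVBufferRef *derived_dev = NULL;
    int err = av_hwdevice_ctx_create_derived(&derived_dev,
                                             AV_HWDEVICE_TYPE_OPENCL,
                                             device_ref, 0);
    /* err == AVERROR(ENOSYS) when no derivation path exists */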
+
+static void ff_hwframe_unmap(void *opaque, uint8_t *data)
+{
+ HWMapDescriptor *hwmap = (HWMapDescriptor*)data;
+ AVHWFramesContext *ctx = opaque;
+
+ if (hwmap->unmap)
+ hwmap->unmap(ctx, hwmap);
+
+ av_frame_free(&hwmap->source);
+
+ av_buffer_unref(&hwmap->hw_frames_ctx);
+
+ av_free(hwmap);
+}
+
+int ff_hwframe_map_create(AVBufferRef *hwframe_ref,
+ AVFrame *dst, const AVFrame *src,
+ void (*unmap)(AVHWFramesContext *ctx,
+ HWMapDescriptor *hwmap),
+ void *priv)
+{
+ AVHWFramesContext *ctx = (AVHWFramesContext*)hwframe_ref->data;
+ HWMapDescriptor *hwmap;
+ int ret;
+
+ hwmap = av_mallocz(sizeof(*hwmap));
+ if (!hwmap) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ hwmap->source = av_frame_alloc();
+ if (!hwmap->source) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ ret = av_frame_ref(hwmap->source, src);
+ if (ret < 0)
+ goto fail;
+
+ hwmap->hw_frames_ctx = av_buffer_ref(hwframe_ref);
+ if (!hwmap->hw_frames_ctx) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ hwmap->unmap = unmap;
+ hwmap->priv = priv;
+
+ dst->buf[0] = av_buffer_create((uint8_t*)hwmap, sizeof(*hwmap),
+ &ff_hwframe_unmap, ctx, 0);
+ if (!dst->buf[0]) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ if (hwmap) {
+ av_buffer_unref(&hwmap->hw_frames_ctx);
+ av_frame_free(&hwmap->source);
+ }
+ av_free(hwmap);
+ return ret;
+}
+
+int av_hwframe_map(AVFrame *dst, const AVFrame *src, int flags)
+{
+ AVHWFramesContext *src_frames, *dst_frames;
+ HWMapDescriptor *hwmap;
+ int ret;
+
+ if (src->hw_frames_ctx && dst->hw_frames_ctx) {
+ src_frames = (AVHWFramesContext*)src->hw_frames_ctx->data;
+ dst_frames = (AVHWFramesContext*)dst->hw_frames_ctx->data;
+
+ if ((src_frames == dst_frames &&
+ src->format == dst_frames->sw_format &&
+ dst->format == dst_frames->format) ||
+ (src_frames->internal->source_frames &&
+ src_frames->internal->source_frames->data ==
+ (uint8_t*)dst_frames)) {
+ // This is an unmap operation. We don't need to directly
+ // do anything here other than fill in the original frame,
+ // because the real unmap will be invoked when the last
+ // reference to the mapped frame disappears.
+ if (!src->buf[0]) {
+ av_log(src_frames, AV_LOG_ERROR, "Invalid mapping "
+ "found when attempting unmap.\n");
+ return AVERROR(EINVAL);
+ }
+ hwmap = (HWMapDescriptor*)src->buf[0]->data;
+ av_frame_unref(dst);
+ return av_frame_ref(dst, hwmap->source);
+ }
+ }
+
+ if (src->hw_frames_ctx) {
+ src_frames = (AVHWFramesContext*)src->hw_frames_ctx->data;
+
+ if (src_frames->format == src->format &&
+ src_frames->internal->hw_type->map_from) {
+ ret = src_frames->internal->hw_type->map_from(src_frames,
+ dst, src, flags);
+ if (ret != AVERROR(ENOSYS))
+ return ret;
+ }
+ }
+
+ if (dst->hw_frames_ctx) {
+ dst_frames = (AVHWFramesContext*)dst->hw_frames_ctx->data;
+
+ if (dst_frames->format == dst->format &&
+ dst_frames->internal->hw_type->map_to) {
+ ret = dst_frames->internal->hw_type->map_to(dst_frames,
+ dst, src, flags);
+ if (ret != AVERROR(ENOSYS))
+ return ret;
+ }
+ }
+
+ return AVERROR(ENOSYS);
+}
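A sketch of mapping a hardware surface for read access instead of copying it; dst->format chooses the mapped format, and the frames context's sw_format is the usual choice:

    AVFrame *mapped = av_frame_alloc();
    if (mapped) {
        mapped->format = frames->sw_format; /* frames: the AVHWFramesContext */
        if (av_hwframe_map(mapped, hw_frame, AV_HWFRAME_MAP_READ) < 0)
            av_frame_free(&mapped);
        /* unreferencing 'mapped' later runs ff_hwframe_unmap() via its buffer */
    }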
+
+int av_hwframe_ctx_create_derived(AVBufferRef **derived_frame_ctx,
+ enum AVPixelFormat format,
+ AVBufferRef *derived_device_ctx,
+ AVBufferRef *source_frame_ctx,
+ int flags)
+{
+ AVBufferRef *dst_ref = NULL;
+ AVHWFramesContext *dst = NULL;
+ AVHWFramesContext *src = (AVHWFramesContext*)source_frame_ctx->data;
+ int ret;
+
+ if (src->internal->source_frames) {
+ AVHWFramesContext *src_src =
+ (AVHWFramesContext*)src->internal->source_frames->data;
+ AVHWDeviceContext *dst_dev =
+ (AVHWDeviceContext*)derived_device_ctx->data;
+
+ if (src_src->device_ctx == dst_dev) {
+ // This is actually an unmapping, so we just return a
+ // reference to the source frame context.
+ *derived_frame_ctx =
+ av_buffer_ref(src->internal->source_frames);
+ if (!*derived_frame_ctx) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+ return 0;
+ }
+ }
+
+ dst_ref = av_hwframe_ctx_alloc(derived_device_ctx);
+ if (!dst_ref) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ dst = (AVHWFramesContext*)dst_ref->data;
+
+ dst->format = format;
+ dst->sw_format = src->sw_format;
+ dst->width = src->width;
+ dst->height = src->height;
+
+ dst->internal->source_frames = av_buffer_ref(source_frame_ctx);
+ if (!dst->internal->source_frames) {
+ ret = AVERROR(ENOMEM);
+ goto fail;
+ }
+
+ dst->internal->source_allocation_map_flags =
+ flags & (AV_HWFRAME_MAP_READ |
+ AV_HWFRAME_MAP_WRITE |
+ AV_HWFRAME_MAP_OVERWRITE |
+ AV_HWFRAME_MAP_DIRECT);
+
+ ret = AVERROR(ENOSYS);
+ if (src->internal->hw_type->frames_derive_from)
+ ret = src->internal->hw_type->frames_derive_from(dst, src, flags);
+ if (ret == AVERROR(ENOSYS) &&
+ dst->internal->hw_type->frames_derive_to)
+ ret = dst->internal->hw_type->frames_derive_to(dst, src, flags);
+ if (ret == AVERROR(ENOSYS))
+ ret = 0;
+ if (ret)
+ goto fail;
+
+ *derived_frame_ctx = dst_ref;
+ return 0;
+
+fail:
+ if (dst)
+ av_buffer_unref(&dst->internal->source_frames);
+ av_buffer_unref(&dst_ref);
+ return ret;
+}
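A sketch of wrapping an existing pool for a derived device, e.g. to hand frames to another API without a copy; the pixel format and mapping flags here are illustrative assumptions:

    AVBufferRef *derived_frames = NULL;
    int err = av_hwframe_ctx_create_derived(&derived_frames, AV_PIX_FMT_OPENCL,
                                            derived_dev, frames_ref,
                                            AV_HWFRAME_MAP_READ |
                                            AV_HWFRAME_MAP_WRITE);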
diff --git a/media/ffvpx/libavutil/hwcontext.h b/media/ffvpx/libavutil/hwcontext.h
index 03334e20e..f5a4b6238 100644
--- a/media/ffvpx/libavutil/hwcontext.h
+++ b/media/ffvpx/libavutil/hwcontext.h
@@ -25,15 +25,17 @@
#include "pixfmt.h"
enum AVHWDeviceType {
+ AV_HWDEVICE_TYPE_NONE,
AV_HWDEVICE_TYPE_VDPAU,
AV_HWDEVICE_TYPE_CUDA,
AV_HWDEVICE_TYPE_VAAPI,
AV_HWDEVICE_TYPE_DXVA2,
AV_HWDEVICE_TYPE_QSV,
AV_HWDEVICE_TYPE_VIDEOTOOLBOX,
- AV_HWDEVICE_TYPE_NONE,
AV_HWDEVICE_TYPE_D3D11VA,
AV_HWDEVICE_TYPE_DRM,
+ AV_HWDEVICE_TYPE_OPENCL,
+ AV_HWDEVICE_TYPE_MEDIACODEC,
};
typedef struct AVHWDeviceInternal AVHWDeviceInternal;
diff --git a/media/ffvpx/libavutil/hwcontext_internal.h b/media/ffvpx/libavutil/hwcontext_internal.h
new file mode 100644
index 000000000..332062dda
--- /dev/null
+++ b/media/ffvpx/libavutil/hwcontext_internal.h
@@ -0,0 +1,171 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_HWCONTEXT_INTERNAL_H
+#define AVUTIL_HWCONTEXT_INTERNAL_H
+
+#include <stddef.h>
+
+#include "buffer.h"
+#include "hwcontext.h"
+#include "frame.h"
+#include "pixfmt.h"
+
+typedef struct HWContextType {
+ enum AVHWDeviceType type;
+ const char *name;
+
+ /**
+ * An array of pixel formats supported by the AVHWFramesContext instances,
+ * terminated by AV_PIX_FMT_NONE.
+ */
+ const enum AVPixelFormat *pix_fmts;
+
+ /**
+ * size of the public hardware-specific context,
+ * i.e. AVHWDeviceContext.hwctx
+ */
+ size_t device_hwctx_size;
+ /**
+ * size of the private data, i.e.
+ * AVHWDeviceInternal.priv
+ */
+ size_t device_priv_size;
+
+ /**
+ * Size of the hardware-specific device configuration.
+ * (Used to query hwframe constraints.)
+ */
+ size_t device_hwconfig_size;
+
+ /**
+ * size of the public frame pool hardware-specific context,
+ * i.e. AVHWFramesContext.hwctx
+ */
+ size_t frames_hwctx_size;
+ /**
+ * size of the private data, i.e.
+ * AVHWFramesInternal.priv
+ */
+ size_t frames_priv_size;
+
+ int (*device_create)(AVHWDeviceContext *ctx, const char *device,
+ AVDictionary *opts, int flags);
+ int (*device_derive)(AVHWDeviceContext *dst_ctx,
+ AVHWDeviceContext *src_ctx, int flags);
+
+ int (*device_init)(AVHWDeviceContext *ctx);
+ void (*device_uninit)(AVHWDeviceContext *ctx);
+
+ int (*frames_get_constraints)(AVHWDeviceContext *ctx,
+ const void *hwconfig,
+ AVHWFramesConstraints *constraints);
+
+ int (*frames_init)(AVHWFramesContext *ctx);
+ void (*frames_uninit)(AVHWFramesContext *ctx);
+
+ int (*frames_get_buffer)(AVHWFramesContext *ctx, AVFrame *frame);
+ int (*transfer_get_formats)(AVHWFramesContext *ctx,
+ enum AVHWFrameTransferDirection dir,
+ enum AVPixelFormat **formats);
+ int (*transfer_data_to)(AVHWFramesContext *ctx, AVFrame *dst,
+ const AVFrame *src);
+ int (*transfer_data_from)(AVHWFramesContext *ctx, AVFrame *dst,
+ const AVFrame *src);
+
+ int (*map_to)(AVHWFramesContext *ctx, AVFrame *dst,
+ const AVFrame *src, int flags);
+ int (*map_from)(AVHWFramesContext *ctx, AVFrame *dst,
+ const AVFrame *src, int flags);
+
+ int (*frames_derive_to)(AVHWFramesContext *dst_ctx,
+ AVHWFramesContext *src_ctx, int flags);
+ int (*frames_derive_from)(AVHWFramesContext *dst_ctx,
+ AVHWFramesContext *src_ctx, int flags);
+} HWContextType;
+
+struct AVHWDeviceInternal {
+ const HWContextType *hw_type;
+ void *priv;
+
+ /**
+ * For a derived device, a reference to the original device
+ * context it was derived from.
+ */
+ AVBufferRef *source_device;
+};
+
+struct AVHWFramesInternal {
+ const HWContextType *hw_type;
+ void *priv;
+
+ AVBufferPool *pool_internal;
+
+ /**
+ * For a derived context, a reference to the original frames
+ * context it was derived from.
+ */
+ AVBufferRef *source_frames;
+ /**
+ * Flags to apply to the mapping from the source to the derived
+ * frame context when trying to allocate in the derived context.
+ */
+ int source_allocation_map_flags;
+};
+
+typedef struct HWMapDescriptor {
+ /**
+ * A reference to the original source of the mapping.
+ */
+ AVFrame *source;
+ /**
+ * A reference to the hardware frames context in which this
+ * mapping was made. May be the same as source->hw_frames_ctx,
+ * but need not be.
+ */
+ AVBufferRef *hw_frames_ctx;
+ /**
+ * Unmap function.
+ */
+ void (*unmap)(AVHWFramesContext *ctx,
+ struct HWMapDescriptor *hwmap);
+ /**
+ * Hardware-specific private data associated with the mapping.
+ */
+ void *priv;
+} HWMapDescriptor;
+
+int ff_hwframe_map_create(AVBufferRef *hwframe_ref,
+ AVFrame *dst, const AVFrame *src,
+ void (*unmap)(AVHWFramesContext *ctx,
+ HWMapDescriptor *hwmap),
+ void *priv);
+
+
+extern const HWContextType ff_hwcontext_type_cuda;
+extern const HWContextType ff_hwcontext_type_d3d11va;
+extern const HWContextType ff_hwcontext_type_drm;
+extern const HWContextType ff_hwcontext_type_dxva2;
+extern const HWContextType ff_hwcontext_type_opencl;
+extern const HWContextType ff_hwcontext_type_qsv;
+extern const HWContextType ff_hwcontext_type_vaapi;
+extern const HWContextType ff_hwcontext_type_vdpau;
+extern const HWContextType ff_hwcontext_type_videotoolbox;
+extern const HWContextType ff_hwcontext_type_mediacodec;
+
+#endif /* AVUTIL_HWCONTEXT_INTERNAL_H */
diff --git a/media/ffvpx/libavutil/imgutils.c b/media/ffvpx/libavutil/imgutils.c
index 500517880..4938a7ef6 100644
--- a/media/ffvpx/libavutil/imgutils.c
+++ b/media/ffvpx/libavutil/imgutils.c
@@ -125,7 +125,7 @@ int av_image_fill_pointers(uint8_t *data[4], enum AVPixelFormat pix_fmt, int hei
size[0] = linesizes[0] * height;
if (desc->flags & AV_PIX_FMT_FLAG_PAL ||
- desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL) {
+ desc->flags & FF_PSEUDOPAL) {
data[1] = ptr + size[0]; /* palette is stored here as 256 32 bits words */
return size[0] + 256 * 4;
}
@@ -216,7 +216,7 @@ int av_image_alloc(uint8_t *pointers[4], int linesizes[4],
av_free(buf);
return ret;
}
- if (desc->flags & AV_PIX_FMT_FLAG_PAL || desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL) {
+ if (desc->flags & AV_PIX_FMT_FLAG_PAL || (desc->flags & FF_PSEUDOPAL && pointers[1])) {
avpriv_set_systematic_pal2((uint32_t*)pointers[1], pix_fmt);
if (align < 4) {
av_log(NULL, AV_LOG_ERROR, "Formats with a palette require a minimum alignment of 4\n");
@@ -225,7 +225,7 @@ int av_image_alloc(uint8_t *pointers[4], int linesizes[4],
}
if ((desc->flags & AV_PIX_FMT_FLAG_PAL ||
- desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL) &&
+ desc->flags & FF_PSEUDOPAL) && pointers[1] &&
pointers[1] - pointers[0] > linesizes[0] * h) {
/* zero-initialize the padding before the palette */
memset(pointers[0] + linesizes[0] * h, 0,
@@ -242,9 +242,10 @@ typedef struct ImgUtils {
} ImgUtils;
static const AVClass imgutils_class = {
- .class_name = "IMGUTILS",
- .item_name = av_default_item_name,
- .version = LIBAVUTIL_VERSION_INT,
+ .class_name = "IMGUTILS",
+ .item_name = av_default_item_name,
+ .option = NULL,
+ .version = LIBAVUTIL_VERSION_INT,
.log_level_offset_offset = offsetof(ImgUtils, log_offset),
.parent_log_context_offset = offsetof(ImgUtils, log_ctx),
};
@@ -353,12 +354,13 @@ static void image_copy(uint8_t *dst_data[4], const ptrdiff_t dst_linesizes[4],
return;
if (desc->flags & AV_PIX_FMT_FLAG_PAL ||
- desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL) {
+ desc->flags & FF_PSEUDOPAL) {
copy_plane(dst_data[0], dst_linesizes[0],
src_data[0], src_linesizes[0],
width, height);
/* copy the palette */
- memcpy(dst_data[1], src_data[1], 4*256);
+ if ((desc->flags & AV_PIX_FMT_FLAG_PAL) || (dst_data[1] && src_data[1]))
+ memcpy(dst_data[1], src_data[1], 4*256);
} else {
int i, planes_nb = 0;
@@ -441,7 +443,7 @@ int av_image_get_buffer_size(enum AVPixelFormat pix_fmt,
return ret;
// do not include palette for these pseudo-paletted formats
- if (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL)
+ if (desc->flags & FF_PSEUDOPAL)
return FFALIGN(width, align) * height;
return av_image_fill_arrays(data, linesize, NULL, pix_fmt,
diff --git a/media/ffvpx/libavutil/integer.c b/media/ffvpx/libavutil/integer.c
index 6d6855fa1..890e314dc 100644
--- a/media/ffvpx/libavutil/integer.c
+++ b/media/ffvpx/libavutil/integer.c
@@ -164,41 +164,3 @@ int64_t av_i2int(AVInteger a){
}
return out;
}
-
-#ifdef TEST
-
-const uint8_t ff_log2_tab[256]={
- 0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
- 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
- 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
- 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
-};
-
-int main(void){
- int64_t a,b;
-
- for(a=7; a<256*256*256; a+=13215){
- for(b=3; b<256*256*256; b+=27118){
- AVInteger ai= av_int2i(a);
- AVInteger bi= av_int2i(b);
-
- av_assert0(av_i2int(ai) == a);
- av_assert0(av_i2int(bi) == b);
- av_assert0(av_i2int(av_add_i(ai,bi)) == a+b);
- av_assert0(av_i2int(av_sub_i(ai,bi)) == a-b);
- av_assert0(av_i2int(av_mul_i(ai,bi)) == a*b);
- av_assert0(av_i2int(av_shr_i(ai, 9)) == a>>9);
- av_assert0(av_i2int(av_shr_i(ai,-9)) == a<<9);
- av_assert0(av_i2int(av_shr_i(ai, 17)) == a>>17);
- av_assert0(av_i2int(av_shr_i(ai,-17)) == a<<17);
- av_assert0(av_log2_i(ai) == av_log2(a));
- av_assert0(av_i2int(av_div_i(ai,bi)) == a/b);
- }
- }
- return 0;
-}
-#endif
diff --git a/media/ffvpx/libavutil/internal.h b/media/ffvpx/libavutil/internal.h
index a2d73e3cc..06bd561e8 100644
--- a/media/ffvpx/libavutil/internal.h
+++ b/media/ffvpx/libavutil/internal.h
@@ -43,6 +43,7 @@
#include "cpu.h"
#include "dict.h"
#include "macros.h"
+#include "mem.h"
#include "pixfmt.h"
#include "version.h"
@@ -62,10 +63,10 @@
#endif
#endif
-#if defined(_MSC_VER) && CONFIG_SHARED
-# define av_export __declspec(dllimport)
+#if defined(_WIN32) && CONFIG_SHARED && !defined(BUILDING_avutil)
+# define av_export_avutil __declspec(dllimport)
#else
-# define av_export
+# define av_export_avutil
#endif
#if HAVE_PRAGMA_DEPRECATED
@@ -76,8 +77,8 @@
# define FF_DISABLE_DEPRECATION_WARNINGS __pragma(warning(push)) __pragma(warning(disable:4996))
# define FF_ENABLE_DEPRECATION_WARNINGS __pragma(warning(pop))
# else
-# define FF_DISABLE_DEPRECATION_WARNINGS _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
-# define FF_ENABLE_DEPRECATION_WARNINGS _Pragma("GCC diagnostic warning \"-Wdeprecated-declarations\"")
+# define FF_DISABLE_DEPRECATION_WARNINGS _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+# define FF_ENABLE_DEPRECATION_WARNINGS _Pragma("GCC diagnostic pop")
# endif
#else
# define FF_DISABLE_DEPRECATION_WARNINGS
@@ -110,24 +111,30 @@
DECLARE_ALIGNED(a, t, la_##v) s o; \
t (*v) o = la_##v
-#define LOCAL_ALIGNED(a, t, v, ...) E1(LOCAL_ALIGNED_A(a, t, v, __VA_ARGS__,,))
+#define LOCAL_ALIGNED(a, t, v, ...) LOCAL_ALIGNED_##a(t, v, __VA_ARGS__)
-#if HAVE_LOCAL_ALIGNED_8
+#if HAVE_LOCAL_ALIGNED
+# define LOCAL_ALIGNED_4(t, v, ...) E1(LOCAL_ALIGNED_D(4, t, v, __VA_ARGS__,,))
+#else
+# define LOCAL_ALIGNED_4(t, v, ...) E1(LOCAL_ALIGNED_A(4, t, v, __VA_ARGS__,,))
+#endif
+
+#if HAVE_LOCAL_ALIGNED
# define LOCAL_ALIGNED_8(t, v, ...) E1(LOCAL_ALIGNED_D(8, t, v, __VA_ARGS__,,))
#else
-# define LOCAL_ALIGNED_8(t, v, ...) LOCAL_ALIGNED(8, t, v, __VA_ARGS__)
+# define LOCAL_ALIGNED_8(t, v, ...) E1(LOCAL_ALIGNED_A(8, t, v, __VA_ARGS__,,))
#endif
-#if HAVE_LOCAL_ALIGNED_16
+#if HAVE_LOCAL_ALIGNED
# define LOCAL_ALIGNED_16(t, v, ...) E1(LOCAL_ALIGNED_D(16, t, v, __VA_ARGS__,,))
#else
-# define LOCAL_ALIGNED_16(t, v, ...) LOCAL_ALIGNED(16, t, v, __VA_ARGS__)
+# define LOCAL_ALIGNED_16(t, v, ...) E1(LOCAL_ALIGNED_A(16, t, v, __VA_ARGS__,,))
#endif
-#if HAVE_LOCAL_ALIGNED_32
+#if HAVE_LOCAL_ALIGNED
# define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_D(32, t, v, __VA_ARGS__,,))
#else
-# define LOCAL_ALIGNED_32(t, v, ...) LOCAL_ALIGNED(32, t, v, __VA_ARGS__)
+# define LOCAL_ALIGNED_32(t, v, ...) E1(LOCAL_ALIGNED_A(32, t, v, __VA_ARGS__,,))
#endif
#define FF_ALLOC_OR_GOTO(ctx, p, size, label)\
@@ -353,4 +360,13 @@ void ff_check_pixfmt_descriptors(void);
*/
int avpriv_dict_set_timestamp(AVDictionary **dict, const char *key, int64_t timestamp);
+// Helper macro for AV_PIX_FMT_FLAG_PSEUDOPAL deprecation. Code inside FFmpeg
+// should always use FF_PSEUDOPAL. Once the public API flag gets removed, all
+// code using it is dead code.
+#if FF_API_PSEUDOPAL
+#define FF_PSEUDOPAL AV_PIX_FMT_FLAG_PSEUDOPAL
+#else
+#define FF_PSEUDOPAL 0
+#endif
+
#endif /* AVUTIL_INTERNAL_H */
diff --git a/media/ffvpx/libavutil/intreadwrite.h b/media/ffvpx/libavutil/intreadwrite.h
index d54d4b91d..67c763b13 100644
--- a/media/ffvpx/libavutil/intreadwrite.h
+++ b/media/ffvpx/libavutil/intreadwrite.h
@@ -215,7 +215,7 @@ typedef union {
* by per-arch headers.
*/
-#if defined(__GNUC__) && !defined(__TI_COMPILER_VERSION__)
+#if defined(__GNUC__)
union unaligned_64 { uint64_t l; } __attribute__((packed)) av_alias;
union unaligned_32 { uint32_t l; } __attribute__((packed)) av_alias;
@@ -224,12 +224,7 @@ union unaligned_16 { uint16_t l; } __attribute__((packed)) av_alias;
# define AV_RN(s, p) (((const union unaligned_##s *) (p))->l)
# define AV_WN(s, p, v) ((((union unaligned_##s *) (p))->l) = (v))
-#elif defined(__DECC)
-
-# define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p)))
-# define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v))
-
-#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_X64)) && AV_HAVE_FAST_UNALIGNED
+#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_X64) || defined(_M_ARM64)) && AV_HAVE_FAST_UNALIGNED
# define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p)))
# define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v))
diff --git a/media/ffvpx/libavutil/log.c b/media/ffvpx/libavutil/log.c
index be806202f..9b7d48487 100644
--- a/media/ffvpx/libavutil/log.c
+++ b/media/ffvpx/libavutil/log.c
@@ -39,11 +39,9 @@
#include "common.h"
#include "internal.h"
#include "log.h"
+#include "thread.h"
-#if HAVE_PTHREADS
-#include <pthread.h>
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-#endif
+static AVMutex mutex = AV_MUTEX_INITIALIZER;
#define LINE_SZ 1024
@@ -57,7 +55,7 @@ static int av_log_level = AV_LOG_INFO;
static int flags;
#define NB_LEVELS 8
-#if defined(_WIN32) && !defined(__MINGW32CE__) && HAVE_SETCONSOLETEXTATTRIBUTE
+#if defined(_WIN32) && HAVE_SETCONSOLETEXTATTRIBUTE
#include <windows.h>
static const uint8_t color[16 + AV_CLASS_CATEGORY_NB] = {
[AV_LOG_PANIC /8] = 12,
@@ -124,7 +122,7 @@ static int use_color = -1;
static void check_color_terminal(void)
{
-#if defined(_WIN32) && !defined(__MINGW32CE__) && HAVE_SETCONSOLETEXTATTRIBUTE
+#if defined(_WIN32) && HAVE_SETCONSOLETEXTATTRIBUTE
CONSOLE_SCREEN_BUFFER_INFO con_info;
con = GetStdHandle(STD_ERROR_HANDLE);
use_color = (con != INVALID_HANDLE_VALUE) && !getenv("NO_COLOR") &&
@@ -159,7 +157,7 @@ static void colored_fputs(int level, int tint, const char *str)
if (level == AV_LOG_INFO/8) local_use_color = 0;
else local_use_color = use_color;
-#if defined(_WIN32) && !defined(__MINGW32CE__) && HAVE_SETCONSOLETEXTATTRIBUTE
+#if defined(_WIN32) && HAVE_SETCONSOLETEXTATTRIBUTE
if (local_use_color)
SetConsoleTextAttribute(con, background | color[level]);
fputs(str, stderr);
@@ -268,11 +266,11 @@ static void format_line(void *avcl, int level, const char *fmt, va_list vl,
av_bprintf(part+1, "[%s @ %p] ",
avc->item_name(avcl), avcl);
if(type) type[1] = get_category(avcl);
-
- if (flags & AV_LOG_PRINT_LEVEL)
- av_bprintf(part+2, "[%s] ", get_level_str(level));
}
+ if (*print_prefix && (level > AV_LOG_QUIET) && (flags & AV_LOG_PRINT_LEVEL))
+ av_bprintf(part+2, "[%s] ", get_level_str(level));
+
av_vbprintf(part+3, fmt, vl);
if(*part[0].str || *part[1].str || *part[2].str || *part[3].str) {
@@ -317,9 +315,7 @@ void av_log_default_callback(void* ptr, int level, const char* fmt, va_list vl)
if (level > av_log_level)
return;
-#if HAVE_PTHREADS
- pthread_mutex_lock(&mutex);
-#endif
+ ff_mutex_lock(&mutex);
format_line(ptr, level, fmt, vl, part, &print_prefix, type);
snprintf(line, sizeof(line), "%s%s%s%s", part[0].str, part[1].str, part[2].str, part[3].str);
@@ -356,9 +352,7 @@ void av_log_default_callback(void* ptr, int level, const char* fmt, va_list vl)
#endif
end:
av_bprint_finalize(part+3, NULL);
-#if HAVE_PTHREADS
- pthread_mutex_unlock(&mutex);
-#endif
+ ff_mutex_unlock(&mutex);
}
static void (*av_log_callback)(void*, int, const char*, va_list) =
diff --git a/media/ffvpx/libavutil/log.h b/media/ffvpx/libavutil/log.h
index f0a57385d..d9554e609 100644
--- a/media/ffvpx/libavutil/log.h
+++ b/media/ffvpx/libavutil/log.h
@@ -334,20 +334,6 @@ void av_log_format_line(void *ptr, int level, const char *fmt, va_list vl,
int av_log_format_line2(void *ptr, int level, const char *fmt, va_list vl,
char *line, int line_size, int *print_prefix);
-#if FF_API_DLOG
-/**
- * av_dlog macros
- * @deprecated unused
- * Useful to print debug messages that shouldn't get compiled in normally.
- */
-
-#ifdef DEBUG
-# define av_dlog(pctx, ...) av_log(pctx, AV_LOG_DEBUG, __VA_ARGS__)
-#else
-# define av_dlog(pctx, ...) do { if (0) av_log(pctx, AV_LOG_DEBUG, __VA_ARGS__); } while (0)
-#endif
-#endif /* FF_API_DLOG */
-
/**
* Skip repeated messages, this requires the user app to use av_log() instead of
* (f)printf as the 2 would otherwise interfere and lead to
diff --git a/media/ffvpx/libavutil/mem.c b/media/ffvpx/libavutil/mem.c
index 36740f115..6149755a6 100644
--- a/media/ffvpx/libavutil/mem.c
+++ b/media/ffvpx/libavutil/mem.c
@@ -61,7 +61,7 @@ void free(void *ptr);
#include "mem_internal.h"
-#define ALIGN (HAVE_AVX ? 32 : 16)
+#define ALIGN (HAVE_AVX512 ? 64 : (HAVE_AVX ? 32 : 16))
/* NOTE: if you want to override these functions with your own
* implementations (not recommended) you have to link libav* as
@@ -181,6 +181,20 @@ int av_reallocp(void *ptr, size_t size)
return 0;
}
+void *av_malloc_array(size_t nmemb, size_t size)
+{
+ if (!size || nmemb >= INT_MAX / size)
+ return NULL;
+ return av_malloc(nmemb * size);
+}
+
+void *av_mallocz_array(size_t nmemb, size_t size)
+{
+ if (!size || nmemb >= INT_MAX / size)
+ return NULL;
+ return av_mallocz(nmemb * size);
+}
+
void *av_realloc_array(void *ptr, size_t nmemb, size_t size)
{
if (!size || nmemb >= INT_MAX / size)
@@ -449,10 +463,15 @@ void av_memcpy_backptr(uint8_t *dst, int back, int cnt)
void *av_fast_realloc(void *ptr, unsigned int *size, size_t min_size)
{
- if (min_size < *size)
+ if (min_size <= *size)
return ptr;
- min_size = FFMAX(min_size + min_size / 16 + 32, min_size);
+ if (min_size > max_alloc_size - 32) {
+ *size = 0;
+ return NULL;
+ }
+
+ min_size = FFMIN(max_alloc_size - 32, FFMAX(min_size + min_size / 16 + 32, min_size));
ptr = av_realloc(ptr, min_size);
/* we could set this to the unmodified min_size but this is safer
diff --git a/media/ffvpx/libavutil/mem.h b/media/ffvpx/libavutil/mem.h
index 527cd0319..7e0b12a8a 100644
--- a/media/ffvpx/libavutil/mem.h
+++ b/media/ffvpx/libavutil/mem.h
@@ -74,6 +74,19 @@
*/
/**
+ * @def DECLARE_ASM_ALIGNED(n,t,v)
+ * Declare an aligned variable appropriate for use in inline assembly code.
+ *
+ * @code{.c}
+ * DECLARE_ASM_ALIGNED(16, uint64_t, pw_08) = UINT64_C(0x0008000800080008);
+ * @endcode
+ *
+ * @param n Minimum alignment in bytes
+ * @param t Type of the variable (or array element)
+ * @param v Name of the variable
+ */
+
+/**
* @def DECLARE_ASM_CONST(n,t,v)
* Declare a static constant aligned variable appropriate for use in inline
* assembly code.
@@ -89,25 +102,23 @@
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1110 || defined(__SUNPRO_C)
#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
+ #define DECLARE_ASM_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
#define DECLARE_ASM_CONST(n,t,v) const t __attribute__ ((aligned (n))) v
-#elif defined(__TI_COMPILER_VERSION__)
- #define DECLARE_ALIGNED(n,t,v) \
- AV_PRAGMA(DATA_ALIGN(v,n)) \
- t __attribute__((aligned(n))) v
- #define DECLARE_ASM_CONST(n,t,v) \
- AV_PRAGMA(DATA_ALIGN(v,n)) \
- static const t __attribute__((aligned(n))) v
#elif defined(__DJGPP__)
#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (FFMIN(n, 16)))) v
+ #define DECLARE_ASM_ALIGNED(n,t,v) t av_used __attribute__ ((aligned (FFMIN(n, 16)))) v
#define DECLARE_ASM_CONST(n,t,v) static const t av_used __attribute__ ((aligned (FFMIN(n, 16)))) v
#elif defined(__GNUC__) || defined(__clang__)
#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
+ #define DECLARE_ASM_ALIGNED(n,t,v) t av_used __attribute__ ((aligned (n))) v
#define DECLARE_ASM_CONST(n,t,v) static const t av_used __attribute__ ((aligned (n))) v
#elif defined(_MSC_VER)
#define DECLARE_ALIGNED(n,t,v) __declspec(align(n)) t v
+ #define DECLARE_ASM_ALIGNED(n,t,v) __declspec(align(n)) t v
#define DECLARE_ASM_CONST(n,t,v) __declspec(align(n)) static const t v
#else
#define DECLARE_ALIGNED(n,t,v) t v
+ #define DECLARE_ASM_ALIGNED(n,t,v) t v
#define DECLARE_ASM_CONST(n,t,v) static const t v
#endif
@@ -206,12 +217,7 @@ void *av_mallocz(size_t size) av_malloc_attrib av_alloc_size(1);
* be allocated
* @see av_malloc()
*/
-av_alloc_size(1, 2) static inline void *av_malloc_array(size_t nmemb, size_t size)
-{
- if (!size || nmemb >= INT_MAX / size)
- return NULL;
- return av_malloc(nmemb * size);
-}
+av_alloc_size(1, 2) void *av_malloc_array(size_t nmemb, size_t size);
/**
* Allocate a memory block for an array with av_mallocz().
@@ -226,12 +232,7 @@ av_alloc_size(1, 2) static inline void *av_malloc_array(size_t nmemb, size_t siz
* @see av_mallocz()
* @see av_malloc_array()
*/
-av_alloc_size(1, 2) static inline void *av_mallocz_array(size_t nmemb, size_t size)
-{
- if (!size || nmemb >= INT_MAX / size)
- return NULL;
- return av_mallocz(nmemb * size);
-}
+av_alloc_size(1, 2) void *av_mallocz_array(size_t nmemb, size_t size);
/**
* Non-inlined equivalent of av_mallocz_array().
diff --git a/media/ffvpx/libavutil/moz.build b/media/ffvpx/libavutil/moz.build
index 201b62fed..040ab76ba 100644
--- a/media/ffvpx/libavutil/moz.build
+++ b/media/ffvpx/libavutil/moz.build
@@ -12,7 +12,6 @@ if CONFIG['FFVPX_ASFLAGS']:
SharedLibrary('mozavutil')
SOURCES += [
'adler32.c',
- 'atomic.c',
'avstring.c',
'base64.c',
'bprint.c',
@@ -29,6 +28,7 @@ SOURCES += [
'fixed_dsp.c',
'float_dsp.c',
'frame.c',
+ 'hwcontext.c',
'imgutils.c',
'integer.c',
'intmath.c',
diff --git a/media/ffvpx/libavutil/opt.c b/media/ffvpx/libavutil/opt.c
index df88663e3..3b0aab4ee 100644
--- a/media/ffvpx/libavutil/opt.c
+++ b/media/ffvpx/libavutil/opt.c
@@ -1181,6 +1181,7 @@ static void opt_list(void *obj, void *av_log_obj, const char *unit,
av_log(av_log_obj, AV_LOG_INFO, "%c", (opt->flags & AV_OPT_FLAG_SUBTITLE_PARAM) ? 'S' : '.');
av_log(av_log_obj, AV_LOG_INFO, "%c", (opt->flags & AV_OPT_FLAG_EXPORT) ? 'X' : '.');
av_log(av_log_obj, AV_LOG_INFO, "%c", (opt->flags & AV_OPT_FLAG_READONLY) ? 'R' : '.');
+ av_log(av_log_obj, AV_LOG_INFO, "%c", (opt->flags & AV_OPT_FLAG_BSF_PARAM) ? 'B' : '.');
if (opt->help)
av_log(av_log_obj, AV_LOG_INFO, " %s", opt->help);
diff --git a/media/ffvpx/libavutil/opt.h b/media/ffvpx/libavutil/opt.h
index 0d893795d..07da68ea2 100644
--- a/media/ffvpx/libavutil/opt.h
+++ b/media/ffvpx/libavutil/opt.h
@@ -229,15 +229,15 @@ enum AVOptionType{
AV_OPT_TYPE_BINARY, ///< offset must point to a pointer immediately followed by an int for the length
AV_OPT_TYPE_DICT,
AV_OPT_TYPE_UINT64,
- AV_OPT_TYPE_CONST = 128,
- AV_OPT_TYPE_IMAGE_SIZE = MKBETAG('S','I','Z','E'), ///< offset must point to two consecutive integers
- AV_OPT_TYPE_PIXEL_FMT = MKBETAG('P','F','M','T'),
- AV_OPT_TYPE_SAMPLE_FMT = MKBETAG('S','F','M','T'),
- AV_OPT_TYPE_VIDEO_RATE = MKBETAG('V','R','A','T'), ///< offset must point to AVRational
- AV_OPT_TYPE_DURATION = MKBETAG('D','U','R',' '),
- AV_OPT_TYPE_COLOR = MKBETAG('C','O','L','R'),
- AV_OPT_TYPE_CHANNEL_LAYOUT = MKBETAG('C','H','L','A'),
- AV_OPT_TYPE_BOOL = MKBETAG('B','O','O','L'),
+ AV_OPT_TYPE_CONST,
+ AV_OPT_TYPE_IMAGE_SIZE, ///< offset must point to two consecutive integers
+ AV_OPT_TYPE_PIXEL_FMT,
+ AV_OPT_TYPE_SAMPLE_FMT,
+ AV_OPT_TYPE_VIDEO_RATE, ///< offset must point to AVRational
+ AV_OPT_TYPE_DURATION,
+ AV_OPT_TYPE_COLOR,
+ AV_OPT_TYPE_CHANNEL_LAYOUT,
+ AV_OPT_TYPE_BOOL,
};
/**
@@ -275,9 +275,6 @@ typedef struct AVOption {
int flags;
#define AV_OPT_FLAG_ENCODING_PARAM 1 ///< a generic parameter which can be set by the user for muxing or encoding
#define AV_OPT_FLAG_DECODING_PARAM 2 ///< a generic parameter which can be set by the user for demuxing or decoding
-#if FF_API_OPT_TYPE_METADATA
-#define AV_OPT_FLAG_METADATA 4 ///< some data extracted or inserted into the file like title, comment, ...
-#endif
#define AV_OPT_FLAG_AUDIO_PARAM 8
#define AV_OPT_FLAG_VIDEO_PARAM 16
#define AV_OPT_FLAG_SUBTITLE_PARAM 32
@@ -290,6 +287,7 @@ typedef struct AVOption {
* This flag only makes sense when AV_OPT_FLAG_EXPORT is also set.
*/
#define AV_OPT_FLAG_READONLY 128
+#define AV_OPT_FLAG_BSF_PARAM (1<<8) ///< a generic parameter which can be set by the user for bit stream filtering
#define AV_OPT_FLAG_FILTERING_PARAM (1<<16) ///< a generic parameter which can be set by the user for filtering
//FIXME think about enc-audio, ... style flags
diff --git a/media/ffvpx/libavutil/parseutils.c b/media/ffvpx/libavutil/parseutils.c
index be4ea1ee1..9de19d1c1 100644
--- a/media/ffvpx/libavutil/parseutils.c
+++ b/media/ffvpx/libavutil/parseutils.c
@@ -590,7 +590,7 @@ int av_parse_time(int64_t *timeval, const char *timestr, int duration)
int64_t t, now64;
time_t now;
struct tm dt = { 0 }, tmbuf;
- int today = 0, negative = 0, microseconds = 0;
+ int today = 0, negative = 0, microseconds = 0, suffix = 1000000;
int i;
static const char * const date_fmt[] = {
"%Y - %m - %d",
@@ -689,6 +689,16 @@ int av_parse_time(int64_t *timeval, const char *timestr, int duration)
if (duration) {
t = dt.tm_hour * 3600 + dt.tm_min * 60 + dt.tm_sec;
+ if (q[0] == 'm' && q[1] == 's') {
+ suffix = 1000;
+ microseconds /= 1000;
+ q += 2;
+ } else if (q[0] == 'u' && q[1] == 's') {
+ suffix = 1;
+ microseconds = 0;
+ q += 2;
+ } else if (*q == 's')
+ q++;
} else {
int is_utc = *q == 'Z' || *q == 'z';
int tzoffset = 0;
@@ -724,7 +734,7 @@ int av_parse_time(int64_t *timeval, const char *timestr, int duration)
if (*q)
return AVERROR(EINVAL);
- t *= 1000000;
+ t *= suffix;
t += microseconds;
*timeval = negative ? -t : t;
return 0;
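With the new suffix handling a duration string can carry an explicit unit; a sketch of the intended results (values in microseconds, which is what av_parse_time() has always returned for durations):

    int64_t t;
    av_parse_time(&t, "1500ms", 1); /* t == 1500000  (1.5 s)              */
    av_parse_time(&t, "1500us", 1); /* t == 1500                          */
    av_parse_time(&t, "2.5s",   1); /* t == 2500000, same as plain "2.5"  */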
diff --git a/media/ffvpx/libavutil/pixdesc.c b/media/ffvpx/libavutil/pixdesc.c
index 2cfab89c0..8ed52751c 100644
--- a/media/ffvpx/libavutil/pixdesc.c
+++ b/media/ffvpx/libavutil/pixdesc.c
@@ -257,7 +257,7 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
.comp = {
{ 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */
},
- .flags = AV_PIX_FMT_FLAG_PSEUDOPAL,
+ .flags = FF_PSEUDOPAL,
.alias = "gray8,y8",
},
[AV_PIX_FMT_MONOWHITE] = {
@@ -326,22 +326,10 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
},
.flags = AV_PIX_FMT_FLAG_PLANAR,
},
-#if FF_API_XVMC
- [AV_PIX_FMT_XVMC_MPEG2_MC] = {
- .name = "xvmcmc",
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
- [AV_PIX_FMT_XVMC_MPEG2_IDCT] = {
- .name = "xvmcidct",
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
-#endif /* FF_API_XVMC */
-#if !FF_API_XVMC
[AV_PIX_FMT_XVMC] = {
.name = "xvmc",
.flags = AV_PIX_FMT_FLAG_HWACCEL,
},
-#endif /* !FF_API_XVMC */
[AV_PIX_FMT_UYVY422] = {
.name = "uyvy422",
.nb_components = 3,
@@ -374,7 +362,7 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
{ 0, 1, 0, 3, 3, 0, 2, 1 }, /* G */
{ 0, 1, 0, 6, 2, 0, 1, 1 }, /* B */
},
- .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_PSEUDOPAL,
+ .flags = AV_PIX_FMT_FLAG_RGB | FF_PSEUDOPAL,
},
[AV_PIX_FMT_BGR4] = {
.name = "bgr4",
@@ -398,7 +386,7 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
{ 0, 1, 0, 1, 2, 0, 1, 1 }, /* G */
{ 0, 1, 0, 3, 1, 0, 0, 1 }, /* B */
},
- .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_PSEUDOPAL,
+ .flags = AV_PIX_FMT_FLAG_RGB | FF_PSEUDOPAL,
},
[AV_PIX_FMT_RGB8] = {
.name = "rgb8",
@@ -410,7 +398,7 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
{ 0, 1, 0, 3, 3, 0, 2, 1 }, /* G */
{ 0, 1, 0, 0, 3, 0, 2, 1 }, /* B */
},
- .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_PSEUDOPAL,
+ .flags = AV_PIX_FMT_FLAG_RGB | FF_PSEUDOPAL,
},
[AV_PIX_FMT_RGB4] = {
.name = "rgb4",
@@ -434,7 +422,7 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
{ 0, 1, 0, 1, 2, 0, 1, 1 }, /* G */
{ 0, 1, 0, 0, 1, 0, 0, 1 }, /* B */
},
- .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_PSEUDOPAL,
+ .flags = AV_PIX_FMT_FLAG_RGB | FF_PSEUDOPAL,
},
[AV_PIX_FMT_NV12] = {
.name = "nv12",
@@ -989,44 +977,6 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
},
.flags = AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_ALPHA,
},
-#if FF_API_VDPAU
- [AV_PIX_FMT_VDPAU_H264] = {
- .name = "vdpau_h264",
- .log2_chroma_w = 1,
- .log2_chroma_h = 1,
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
- [AV_PIX_FMT_VDPAU_MPEG1] = {
- .name = "vdpau_mpeg1",
- .log2_chroma_w = 1,
- .log2_chroma_h = 1,
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
- [AV_PIX_FMT_VDPAU_MPEG2] = {
- .name = "vdpau_mpeg2",
- .log2_chroma_w = 1,
- .log2_chroma_h = 1,
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
- [AV_PIX_FMT_VDPAU_WMV3] = {
- .name = "vdpau_wmv3",
- .log2_chroma_w = 1,
- .log2_chroma_h = 1,
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
- [AV_PIX_FMT_VDPAU_VC1] = {
- .name = "vdpau_vc1",
- .log2_chroma_w = 1,
- .log2_chroma_h = 1,
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
- [AV_PIX_FMT_VDPAU_MPEG4] = {
- .name = "vdpau_mpeg4",
- .log2_chroma_w = 1,
- .log2_chroma_h = 1,
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
-#endif
[AV_PIX_FMT_RGB48BE] = {
.name = "rgb48be",
.nb_components = 3,
@@ -1670,12 +1620,6 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
.log2_chroma_h = 1,
.flags = AV_PIX_FMT_FLAG_HWACCEL,
},
- [AV_PIX_FMT_VDA_VLD] = {
- .name = "vda_vld",
- .log2_chroma_w = 1,
- .log2_chroma_h = 1,
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
[AV_PIX_FMT_YA8] = {
.name = "ya8",
.nb_components = 2,
@@ -2029,10 +1973,6 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
},
.flags = AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_BE,
},
- [AV_PIX_FMT_VDA] = {
- .name = "vda",
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
[AV_PIX_FMT_QSV] = {
.name = "qsv",
.flags = AV_PIX_FMT_FLAG_HWACCEL,
@@ -2241,6 +2181,10 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
.name = "drm_prime",
.flags = AV_PIX_FMT_FLAG_HWACCEL,
},
+ [AV_PIX_FMT_OPENCL] = {
+ .name = "opencl",
+ .flags = AV_PIX_FMT_FLAG_HWACCEL,
+ },
};
#if FF_API_PLUS1_MINUS1
FF_ENABLE_DEPRECATION_WARNINGS
diff --git a/media/ffvpx/libavutil/pixdesc.h b/media/ffvpx/libavutil/pixdesc.h
index fc3737c4a..1ab372782 100644
--- a/media/ffvpx/libavutil/pixdesc.h
+++ b/media/ffvpx/libavutil/pixdesc.h
@@ -154,6 +154,14 @@ typedef struct AVPixFmtDescriptor {
* in some cases be simpler. Or the data can be interpreted purely based on
* the pixel format without using the palette.
* An example of a pseudo-paletted format is AV_PIX_FMT_GRAY8
+ *
+ * @deprecated This flag is deprecated, and will be removed. When it is removed,
+ * the extra palette allocation in AVFrame.data[1] is removed as well. Only
+ * actual paletted formats (as indicated by AV_PIX_FMT_FLAG_PAL) will have a
+ * palette. Starting with FFmpeg versions which have this flag deprecated, the
+ * extra "pseudo" palette is already ignored, and API users are not required to
+ * allocate a palette for AV_PIX_FMT_FLAG_PSEUDOPAL formats (it was required
+ * before the deprecation, though).
*/
#define AV_PIX_FMT_FLAG_PSEUDOPAL (1 << 6)
@@ -225,11 +233,6 @@ enum AVPixelFormat av_pix_fmt_desc_get_id(const AVPixFmtDescriptor *desc);
* Utility function to access log2_chroma_w log2_chroma_h from
* the pixel format AVPixFmtDescriptor.
*
- * See av_get_chroma_sub_sample() for a function that asserts a
- * valid pixel format instead of returning an error code.
- * Its recommended that you use avcodec_get_chroma_sub_sample unless
- * you do check the return code!
- *
* @param[in] pix_fmt the pixel format
* @param[out] h_shift store log2_chroma_w (horizontal/width shift)
* @param[out] v_shift store log2_chroma_h (vertical/height shift)
diff --git a/media/ffvpx/libavutil/pixfmt.h b/media/ffvpx/libavutil/pixfmt.h
index 24889c8e5..e184a5667 100644
--- a/media/ffvpx/libavutil/pixfmt.h
+++ b/media/ffvpx/libavutil/pixfmt.h
@@ -74,11 +74,6 @@ enum AVPixelFormat {
AV_PIX_FMT_YUVJ420P, ///< planar YUV 4:2:0, 12bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV420P and setting color_range
AV_PIX_FMT_YUVJ422P, ///< planar YUV 4:2:2, 16bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV422P and setting color_range
AV_PIX_FMT_YUVJ444P, ///< planar YUV 4:4:4, 24bpp, full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV444P and setting color_range
-#if FF_API_XVMC
- AV_PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing
- AV_PIX_FMT_XVMC_MPEG2_IDCT,
- AV_PIX_FMT_XVMC = AV_PIX_FMT_XVMC_MPEG2_IDCT,
-#endif /* FF_API_XVMC */
AV_PIX_FMT_UYVY422, ///< packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
AV_PIX_FMT_UYYVYY411, ///< packed YUV 4:1:1, 12bpp, Cb Y0 Y1 Cr Y2 Y3
AV_PIX_FMT_BGR8, ///< packed RGB 3:3:2, 8bpp, (msb)2B 3G 3R(lsb)
@@ -100,13 +95,6 @@ enum AVPixelFormat {
AV_PIX_FMT_YUV440P, ///< planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples)
AV_PIX_FMT_YUVJ440P, ///< planar YUV 4:4:0 full scale (JPEG), deprecated in favor of AV_PIX_FMT_YUV440P and setting color_range
AV_PIX_FMT_YUVA420P, ///< planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
-#if FF_API_VDPAU
- AV_PIX_FMT_VDPAU_H264,///< H.264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
- AV_PIX_FMT_VDPAU_MPEG1,///< MPEG-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
- AV_PIX_FMT_VDPAU_MPEG2,///< MPEG-2 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
- AV_PIX_FMT_VDPAU_WMV3,///< WMV3 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
- AV_PIX_FMT_VDPAU_VC1, ///< VC-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-#endif
AV_PIX_FMT_RGB48BE, ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, the 2-byte value for each R/G/B component is stored as big-endian
AV_PIX_FMT_RGB48LE, ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, the 2-byte value for each R/G/B component is stored as little-endian
@@ -142,9 +130,6 @@ enum AVPixelFormat {
AV_PIX_FMT_YUV422P16BE, ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
AV_PIX_FMT_YUV444P16LE, ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
AV_PIX_FMT_YUV444P16BE, ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
-#if FF_API_VDPAU
- AV_PIX_FMT_VDPAU_MPEG4, ///< MPEG-4 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
-#endif
AV_PIX_FMT_DXVA2_VLD, ///< HW decoding through DXVA2, Picture.data[3] contains a LPDIRECT3DSURFACE9 pointer
AV_PIX_FMT_RGB444LE, ///< packed RGB 4:4:4, 16bpp, (msb)4X 4R 4G 4B(lsb), little-endian, X=unused/undefined
@@ -176,7 +161,6 @@ enum AVPixelFormat {
AV_PIX_FMT_YUV444P10LE,///< planar YUV 4:4:4, 30bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
AV_PIX_FMT_YUV422P9BE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
AV_PIX_FMT_YUV422P9LE, ///< planar YUV 4:2:2, 18bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
- AV_PIX_FMT_VDA_VLD, ///< hardware decoding through VDA
AV_PIX_FMT_GBRP, ///< planar GBR 4:4:4 24bpp
AV_PIX_FMT_GBR24P = AV_PIX_FMT_GBRP, // alias for #AV_PIX_FMT_GBRP
AV_PIX_FMT_GBRP9BE, ///< planar GBR 4:4:4 27bpp, big-endian
@@ -221,8 +205,6 @@ enum AVPixelFormat {
AV_PIX_FMT_YVYU422, ///< packed YUV 4:2:2, 16bpp, Y0 Cr Y1 Cb
- AV_PIX_FMT_VDA, ///< HW acceleration through VDA, data[3] contains a CVPixelBufferRef
-
AV_PIX_FMT_YA16BE, ///< 16 bits gray, 16 bits alpha (big-endian)
AV_PIX_FMT_YA16LE, ///< 16 bits gray, 16 bits alpha (little-endian)
@@ -248,7 +230,7 @@ enum AVPixelFormat {
*/
AV_PIX_FMT_CUDA,
- AV_PIX_FMT_0RGB=0x123+4,///< packed RGB 8:8:8, 32bpp, XRGBXRGB... X=unused/undefined
+ AV_PIX_FMT_0RGB, ///< packed RGB 8:8:8, 32bpp, XRGBXRGB... X=unused/undefined
AV_PIX_FMT_RGB0, ///< packed RGB 8:8:8, 32bpp, RGBXRGBX... X=unused/undefined
AV_PIX_FMT_0BGR, ///< packed BGR 8:8:8, 32bpp, XBGRXBGR... X=unused/undefined
AV_PIX_FMT_BGR0, ///< packed BGR 8:8:8, 32bpp, BGRXBGRX... X=unused/undefined
@@ -283,9 +265,9 @@ enum AVPixelFormat {
AV_PIX_FMT_BAYER_GBRG16BE, ///< bayer, GBGB..(odd line), RGRG..(even line), 16-bit samples, big-endian */
AV_PIX_FMT_BAYER_GRBG16LE, ///< bayer, GRGR..(odd line), BGBG..(even line), 16-bit samples, little-endian */
AV_PIX_FMT_BAYER_GRBG16BE, ///< bayer, GRGR..(odd line), BGBG..(even line), 16-bit samples, big-endian */
-#if !FF_API_XVMC
+
AV_PIX_FMT_XVMC,///< XVideo Motion Acceleration via common packet passing
-#endif /* !FF_API_XVMC */
+
AV_PIX_FMT_YUV440P10LE, ///< planar YUV 4:4:0,20bpp, (1 Cr & Cb sample per 1x2 Y samples), little-endian
AV_PIX_FMT_YUV440P10BE, ///< planar YUV 4:4:0,20bpp, (1 Cr & Cb sample per 1x2 Y samples), big-endian
AV_PIX_FMT_YUV440P12LE, ///< planar YUV 4:4:0,24bpp, (1 Cr & Cb sample per 1x2 Y samples), little-endian
@@ -340,6 +322,13 @@ enum AVPixelFormat {
* data[0] points to an AVDRMFrameDescriptor.
*/
AV_PIX_FMT_DRM_PRIME,
+ /**
+ * Hardware surfaces for OpenCL.
+ *
+ * data[i] contain 2D image objects (typed in C as cl_mem, used
+ * in OpenCL as image2d_t) for each plane of the surface.
+ */
+ AV_PIX_FMT_OPENCL,
AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
};
diff --git a/media/ffvpx/libavutil/slicethread.c b/media/ffvpx/libavutil/slicethread.c
index c43f87a2a..dfbe551ef 100644
--- a/media/ffvpx/libavutil/slicethread.c
+++ b/media/ffvpx/libavutil/slicethread.c
@@ -99,10 +99,6 @@ int avpriv_slicethread_create(AVSliceThread **pctx, void *priv,
AVSliceThread *ctx;
int nb_workers, i;
-#if HAVE_W32THREADS
- w32thread_init();
-#endif
-
av_assert0(nb_threads >= 0);
if (!nb_threads) {
int nb_cpus = av_cpu_count();
diff --git a/media/ffvpx/libavutil/thread.h b/media/ffvpx/libavutil/thread.h
index f108e2005..cc5272d37 100644
--- a/media/ffvpx/libavutil/thread.h
+++ b/media/ffvpx/libavutil/thread.h
@@ -134,6 +134,7 @@ static inline int strict_pthread_once(pthread_once_t *once_control, void (*init_
#endif
#define AVMutex pthread_mutex_t
+#define AV_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#define ff_mutex_init pthread_mutex_init
#define ff_mutex_lock pthread_mutex_lock
@@ -148,11 +149,12 @@ static inline int strict_pthread_once(pthread_once_t *once_control, void (*init_
#else
#define AVMutex char
+#define AV_MUTEX_INITIALIZER 0
-#define ff_mutex_init(mutex, attr) (0)
-#define ff_mutex_lock(mutex) (0)
-#define ff_mutex_unlock(mutex) (0)
-#define ff_mutex_destroy(mutex) (0)
+static inline int ff_mutex_init(AVMutex *mutex, const void *attr){ return 0; }
+static inline int ff_mutex_lock(AVMutex *mutex){ return 0; }
+static inline int ff_mutex_unlock(AVMutex *mutex){ return 0; }
+static inline int ff_mutex_destroy(AVMutex *mutex){ return 0; }
#define AVOnce char
#define AV_ONCE_INIT 0
diff --git a/media/ffvpx/libavutil/timecode.c b/media/ffvpx/libavutil/timecode.c
index c0c67c847..60077ba0c 100644
--- a/media/ffvpx/libavutil/timecode.c
+++ b/media/ffvpx/libavutil/timecode.c
@@ -155,7 +155,7 @@ static int check_fps(int fps)
static int check_timecode(void *log_ctx, AVTimecode *tc)
{
if ((int)tc->fps <= 0) {
- av_log(log_ctx, AV_LOG_ERROR, "Timecode frame rate must be specified\n");
+ av_log(log_ctx, AV_LOG_ERROR, "Valid timecode frame rate must be specified. Minimum value is 1\n");
return AVERROR(EINVAL);
}
if ((tc->flags & AV_TIMECODE_FLAG_DROPFRAME) && tc->fps != 30 && tc->fps != 60) {
@@ -214,7 +214,7 @@ int av_timecode_init_from_string(AVTimecode *tc, AVRational rate, const char *st
tc->start = (hh*3600 + mm*60 + ss) * tc->fps + ff;
if (tc->flags & AV_TIMECODE_FLAG_DROPFRAME) { /* adjust frame number */
int tmins = 60*hh + mm;
- tc->start -= 2 * (tmins - tmins/10);
+ tc->start -= (tc->fps == 30 ? 2 : 4) * (tmins - tmins/10);
}
return 0;
}
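Drop-frame timecode skips 2 frame numbers per minute at 30000/1001 fps and 4 at 60000/1001 fps, except every tenth minute, which is what the factor change above encodes. A quick arithmetic check (the numbers are my own worked example, not taken from the patch):

    /* "00:10:00;00" at 30 fps drop-frame:
     *   start = (10*60) * 30 = 18000;  adjustment = 2 * (10 - 10/10) = 18
     *   18000 - 18 = 17982  ~= 600 s * 30000/1001 = 17982.02
     * at 60 fps the factor is 4: 36000 - 36 = 35964 ~= 600 s * 60000/1001 */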
diff --git a/media/ffvpx/libavutil/timer.h b/media/ffvpx/libavutil/timer.h
index f7ab455df..0bb353cfc 100644
--- a/media/ffvpx/libavutil/timer.h
+++ b/media/ffvpx/libavutil/timer.h
@@ -42,7 +42,7 @@
#include <stdint.h>
#include <inttypes.h>
-#if HAVE_MACH_MACH_TIME_H
+#if HAVE_MACH_ABSOLUTE_TIME
#include <mach/mach_time.h>
#endif
diff --git a/media/ffvpx/libavutil/utils.c b/media/ffvpx/libavutil/utils.c
index 2c170db2e..230081ea4 100644
--- a/media/ffvpx/libavutil/utils.c
+++ b/media/ffvpx/libavutil/utils.c
@@ -41,9 +41,6 @@ unsigned avutil_version(void)
if (checks_done)
return LIBAVUTIL_VERSION_INT;
-#if FF_API_VDPAU
- av_assert0(AV_PIX_FMT_VDA_VLD == 81); //check if the pix fmt enum has not had anything inserted or removed by mistake
-#endif
av_assert0(AV_SAMPLE_FMT_DBLP == 9);
av_assert0(AVMEDIA_TYPE_ATTACHMENT == 4);
av_assert0(AV_PICTURE_TYPE_BI == 7);
diff --git a/media/ffvpx/libavutil/version.h b/media/ffvpx/libavutil/version.h
index f594dc069..3a63e6355 100644
--- a/media/ffvpx/libavutil/version.h
+++ b/media/ffvpx/libavutil/version.h
@@ -78,9 +78,8 @@
* @{
*/
-
-#define LIBAVUTIL_VERSION_MAJOR 55
-#define LIBAVUTIL_VERSION_MINOR 78
+#define LIBAVUTIL_VERSION_MAJOR 56
+#define LIBAVUTIL_VERSION_MINOR 14
#define LIBAVUTIL_VERSION_MICRO 100
#define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
@@ -106,38 +105,29 @@
* @{
*/
-#ifndef FF_API_VDPAU
-#define FF_API_VDPAU (LIBAVUTIL_VERSION_MAJOR < 56)
-#endif
-#ifndef FF_API_XVMC
-#define FF_API_XVMC (LIBAVUTIL_VERSION_MAJOR < 56)
-#endif
-#ifndef FF_API_OPT_TYPE_METADATA
-#define FF_API_OPT_TYPE_METADATA (LIBAVUTIL_VERSION_MAJOR < 56)
-#endif
-#ifndef FF_API_DLOG
-#define FF_API_DLOG (LIBAVUTIL_VERSION_MAJOR < 56)
-#endif
#ifndef FF_API_VAAPI
-#define FF_API_VAAPI (LIBAVUTIL_VERSION_MAJOR < 56)
+#define FF_API_VAAPI (LIBAVUTIL_VERSION_MAJOR < 57)
#endif
#ifndef FF_API_FRAME_QP
-#define FF_API_FRAME_QP (LIBAVUTIL_VERSION_MAJOR < 56)
+#define FF_API_FRAME_QP (LIBAVUTIL_VERSION_MAJOR < 57)
#endif
#ifndef FF_API_PLUS1_MINUS1
-#define FF_API_PLUS1_MINUS1 (LIBAVUTIL_VERSION_MAJOR < 56)
+#define FF_API_PLUS1_MINUS1 (LIBAVUTIL_VERSION_MAJOR < 57)
#endif
#ifndef FF_API_ERROR_FRAME
-#define FF_API_ERROR_FRAME (LIBAVUTIL_VERSION_MAJOR < 56)
-#endif
-#ifndef FF_API_CRC_BIG_TABLE
-#define FF_API_CRC_BIG_TABLE (LIBAVUTIL_VERSION_MAJOR < 56)
+#define FF_API_ERROR_FRAME (LIBAVUTIL_VERSION_MAJOR < 57)
#endif
#ifndef FF_API_PKT_PTS
-#define FF_API_PKT_PTS (LIBAVUTIL_VERSION_MAJOR < 56)
+#define FF_API_PKT_PTS (LIBAVUTIL_VERSION_MAJOR < 57)
#endif
#ifndef FF_API_CRYPTO_SIZE_T
-#define FF_API_CRYPTO_SIZE_T (LIBAVUTIL_VERSION_MAJOR < 56)
+#define FF_API_CRYPTO_SIZE_T (LIBAVUTIL_VERSION_MAJOR < 57)
+#endif
+#ifndef FF_API_FRAME_GET_SET
+#define FF_API_FRAME_GET_SET (LIBAVUTIL_VERSION_MAJOR < 57)
+#endif
+#ifndef FF_API_PSEUDOPAL
+#define FF_API_PSEUDOPAL (LIBAVUTIL_VERSION_MAJOR < 57)
#endif
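The version bump above retires the FF_API_* guards that expired with major version 56 and moves the remaining ones to expire at 57, adding FF_API_FRAME_GET_SET and FF_API_PSEUDOPAL. Elsewhere in the tree such guards wrap deprecated declarations so they disappear automatically at the next major bump; a sketch with a hypothetical symbol name:

#include <stdint.h>
#include "libavutil/attributes.h"   /* attribute_deprecated */
#include "libavutil/version.h"

struct AVFrame;

#if FF_API_FRAME_GET_SET
/* hypothetical deprecated accessor; compiled only while the guard is 1 */
attribute_deprecated
int64_t av_frame_get_example_field(const struct AVFrame *frame);
#endif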
diff --git a/media/ffvpx/libavutil/x86/cpu.c b/media/ffvpx/libavutil/x86/cpu.c
index f33088c8c..aca893174 100644
--- a/media/ffvpx/libavutil/x86/cpu.c
+++ b/media/ffvpx/libavutil/x86/cpu.c
@@ -97,6 +97,7 @@ int ff_get_cpu_flags_x86(void)
int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
int family = 0, model = 0;
union { int i[3]; char c[12]; } vendor;
+ int xcr0_lo = 0, xcr0_hi = 0;
if (!cpuid_test())
return 0; /* CPUID not supported */
@@ -132,8 +133,8 @@ int ff_get_cpu_flags_x86(void)
/* Check OSXSAVE and AVX bits */

if ((ecx & 0x18000000) == 0x18000000) {
/* Check for OS support */
- xgetbv(0, eax, edx);
- if ((eax & 0x6) == 0x6) {
+ xgetbv(0, xcr0_lo, xcr0_hi);
+ if ((xcr0_lo & 0x6) == 0x6) {
rval |= AV_CPU_FLAG_AVX;
if (ecx & 0x00001000)
rval |= AV_CPU_FLAG_FMA3;
@@ -147,6 +148,13 @@ int ff_get_cpu_flags_x86(void)
#if HAVE_AVX2
if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x00000020))
rval |= AV_CPU_FLAG_AVX2;
+#if HAVE_AVX512 /* F, CD, BW, DQ, VL */
+ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
+ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd0030000) == 0xd0030000)
+ rval |= AV_CPU_FLAG_AVX512;
+
+ }
+#endif /* HAVE_AVX512 */
#endif /* HAVE_AVX2 */
/* BMI1/2 don't need OS support */
if (ebx & 0x00000008) {
@@ -238,6 +246,8 @@ size_t ff_get_cpu_max_align_x86(void)
{
int flags = av_get_cpu_flags();
+ if (flags & AV_CPU_FLAG_AVX512)
+ return 64;
if (flags & (AV_CPU_FLAG_AVX2 |
AV_CPU_FLAG_AVX |
AV_CPU_FLAG_XOP |
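For reference, the AVX-512 detection added above pairs a CPUID feature check with an XCR0 (xgetbv) check: bits 1-2 of XCR0 (0x6) mean the OS saves XMM/YMM state, and bits 5-7 (0xe0) mean it also saves opmask and ZMM state, which AVX-512 requires; ff_get_cpu_max_align_x86 then reports 64-byte alignment when the flag is set. A minimal sketch of just the XCR0 logic, with the mask values taken from the hunk:

#include <stdint.h>

/* Sketch only: the OS-support part of the AVX/AVX-512 check above. */
static int os_supports_avx(uint32_t xcr0_lo)
{
    return (xcr0_lo & 0x6) == 0x6;        /* XMM and YMM state enabled */
}

static int os_supports_avx512(uint32_t xcr0_lo)
{
    return os_supports_avx(xcr0_lo) &&
           (xcr0_lo & 0xe0) == 0xe0;      /* opmask + ZMM state enabled */
}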
diff --git a/media/ffvpx/libavutil/x86/cpu.h b/media/ffvpx/libavutil/x86/cpu.h
index 309b8e746..937c697fa 100644
--- a/media/ffvpx/libavutil/x86/cpu.h
+++ b/media/ffvpx/libavutil/x86/cpu.h
@@ -19,7 +19,6 @@
#ifndef AVUTIL_X86_CPU_H
#define AVUTIL_X86_CPU_H
-#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
@@ -50,6 +49,7 @@
#define X86_FMA4(flags) CPUEXT(flags, FMA4)
#define X86_AVX2(flags) CPUEXT(flags, AVX2)
#define X86_AESNI(flags) CPUEXT(flags, AESNI)
+#define X86_AVX512(flags) CPUEXT(flags, AVX512)
#define EXTERNAL_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOW)
#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AMD3DNOWEXT)
@@ -79,6 +79,7 @@
#define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX)
#define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX)
#define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
+#define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
#define INLINE_AMD3DNOW(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOW)
#define INLINE_AMD3DNOWEXT(flags) CPUEXT_SUFFIX(flags, _INLINE, AMD3DNOWEXT)
diff --git a/media/ffvpx/libavutil/x86/intmath.h b/media/ffvpx/libavutil/x86/intmath.h
index e83971c08..40743fd13 100644
--- a/media/ffvpx/libavutil/x86/intmath.h
+++ b/media/ffvpx/libavutil/x86/intmath.h
@@ -47,7 +47,8 @@ static av_always_inline av_const int ff_log2_x86(unsigned int v)
# endif
# define ff_log2_16bit av_log2
-#if defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1700))
+#if defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER >= 1700) && \
+ (defined(__BMI__) || !defined(__clang__)))
# define ff_ctz(v) _tzcnt_u32(v)
# if ARCH_X86_64
diff --git a/media/ffvpx/libavutil/x86/x86inc.asm b/media/ffvpx/libavutil/x86/x86inc.asm
index 6a054a3e0..5044ee86f 100644
--- a/media/ffvpx/libavutil/x86/x86inc.asm
+++ b/media/ffvpx/libavutil/x86/x86inc.asm
@@ -1,12 +1,12 @@
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2017 x264 project
+;* Copyright (C) 2005-2018 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Henrik Gramner <henrik@gramner.com>
;* Anton Mitrofanov <BugMaster@narod.ru>
;* Fiona Glaser <fiona@x264.com>
-;* Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
@@ -90,6 +90,10 @@
SECTION .text
%elifidn __OUTPUT_FORMAT__,coff
SECTION .text
+ %elifidn __OUTPUT_FORMAT__,win32
+ SECTION .rdata align=%1
+ %elif WIN64
+ SECTION .rdata align=%1
%else
SECTION .rodata align=%1
%endif
@@ -337,6 +341,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
+%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
+%define high_mm_regs (16*cpuflag(avx512))
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
@@ -450,15 +456,16 @@ DECLARE_REG 14, R13, 120
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
- %if xmm_regs_used > 8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
%assign %%i 8
- %rep xmm_regs_used-8
+ %rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
@@ -467,10 +474,11 @@ DECLARE_REG 14, R13, 120
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
- ASSERT xmm_regs_used <= 16
- %if xmm_regs_used > 8
+ ASSERT xmm_regs_used <= 16 + high_mm_regs
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
- %assign %%pad (xmm_regs_used-8)*16 + 32
+ %assign %%pad %%xmm_regs_on_stack*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
@@ -479,9 +487,10 @@ DECLARE_REG 14, R13, 120
%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
- %if xmm_regs_used > 8
- %assign %%i xmm_regs_used
- %rep xmm_regs_used-8
+ %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
+ %if %%xmm_regs_on_stack > 0
+ %assign %%i xmm_regs_used - high_mm_regs
+ %rep %%xmm_regs_on_stack
%assign %%i %%i-1
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
@@ -494,10 +503,10 @@ DECLARE_REG 14, R13, 120
%assign %%pad_size stack_size_padded
%endif
%endif
- %if xmm_regs_used > 7
+ %if xmm_regs_used > 7 + high_mm_regs
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
- %if xmm_regs_used > 6
+ %if xmm_regs_used > 6 + high_mm_regs
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
@@ -509,12 +518,12 @@ DECLARE_REG 14, R13, 120
%assign xmm_regs_used 0
%endmacro
-%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -538,9 +547,10 @@ DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72
-%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
+ %assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
@@ -550,7 +560,7 @@ DECLARE_REG 14, R13, 72
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -561,7 +571,7 @@ DECLARE_REG 14, R13, 72
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -606,7 +616,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
-%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@@ -617,7 +627,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
%endif
POP_IF_USED 6, 5, 4, 3
- %if mmsize == 32
+ %if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@@ -727,12 +737,22 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
- %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
%endmacro
+; Create a global symbol from a local label with the correct name mangling and type
+%macro cglobal_label 1
+ %if FORMAT_ELF
+ global current_function %+ %1:function hidden
+ %else
+ global current_function %+ %1
+ %endif
+ %1:
+%endmacro
+
%macro cextern 1
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
CAT_XDEFINE cglobaled_, %1, 1
@@ -803,10 +823,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
+%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
-%assign cpuflags_cache32 (1<<20)
-%assign cpuflags_cache64 (1<<21)
-%assign cpuflags_slowctz (1<<22)
+%assign cpuflags_cache32 (1<<21)
+%assign cpuflags_cache64 (1<<22)
%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<24)
@@ -856,11 +876,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
%endmacro
-; Merge mmx and sse*
+; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
-; (All 3 remain in sync through SWAP.)
+; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
+; (All 4 remain in sync through SWAP.)
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
@@ -870,69 +891,99 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%undef %1%2
%endmacro
+%macro DEFINE_MMREGS 1 ; mmtype
+ %assign %%prev_mmregs 0
+ %ifdef num_mmregs
+ %assign %%prev_mmregs num_mmregs
+ %endif
+
+ %assign num_mmregs 8
+ %if ARCH_X86_64 && mmsize >= 16
+ %assign num_mmregs 16
+ %if cpuflag(avx512) || mmsize == 64
+ %assign num_mmregs 32
+ %endif
+ %endif
+
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1 %+ %%i
+ CAT_XDEFINE nn%1, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %if %%prev_mmregs > num_mmregs
+ %rep %%prev_mmregs - num_mmregs
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nn %+ mmtype, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+ %xdefine mmtype %1
+%endmacro
+
+; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
+%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
+ %if ARCH_X86_64 && cpuflag(avx512)
+ %assign %%i %1
+ %rep 16-%1
+ %assign %%i_high %%i+16
+ SWAP %%i, %%i_high
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
%define mmsize 8
- %define num_mmregs 8
%define mova movq
%define movu movq
%define movh movd
%define movnta movntq
- %assign %%i 0
- %rep 8
- CAT_XDEFINE m, %%i, mm %+ %%i
- CAT_XDEFINE nnmm, %%i, %%i
- %assign %%i %%i+1
- %endrep
- %rep 8
- CAT_UNDEF m, %%i
- CAT_UNDEF nnmm, %%i
- %assign %%i %%i+1
- %endrep
INIT_CPUFLAGS %1
+ DEFINE_MMREGS mm
%endmacro
%macro INIT_XMM 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
- %define num_mmregs 8
- %if ARCH_X86_64
- %define num_mmregs 16
- %endif
%define mova movdqa
%define movu movdqu
%define movh movq
%define movnta movntdq
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, xmm %+ %%i
- CAT_XDEFINE nnxmm, %%i, %%i
- %assign %%i %%i+1
- %endrep
INIT_CPUFLAGS %1
+ DEFINE_MMREGS xmm
+ %if WIN64
+ AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
+ %endif
%endmacro
%macro INIT_YMM 0-1+
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_YMM %1
%define mmsize 32
- %define num_mmregs 8
- %if ARCH_X86_64
- %define num_mmregs 16
- %endif
%define mova movdqa
%define movu movdqu
%undef movh
%define movnta movntdq
- %assign %%i 0
- %rep num_mmregs
- CAT_XDEFINE m, %%i, ymm %+ %%i
- CAT_XDEFINE nnymm, %%i, %%i
- %assign %%i %%i+1
- %endrep
INIT_CPUFLAGS %1
+ DEFINE_MMREGS ymm
+ AVX512_MM_PERMUTATION
+%endmacro
+
+%macro INIT_ZMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_ZMM %1
+ %define mmsize 64
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ INIT_CPUFLAGS %1
+ DEFINE_MMREGS zmm
+ AVX512_MM_PERMUTATION
%endmacro
INIT_XMM
@@ -941,18 +992,26 @@ INIT_XMM
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
+ %define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
+ %define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
+ %define ymmzmm%1 ymm%1
+ %define zmmmm%1 mm%1
+ %define zmmxmm%1 xmm%1
+ %define zmmymm%1 ymm%1
+ %define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
+ %define zm%1 zmm %+ m%1
%endmacro
%assign i 0
-%rep 16
+%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
@@ -1087,12 +1146,17 @@ INIT_XMM
;=============================================================================
%assign i 0
-%rep 16
+%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
+ CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
+ CAT_XDEFINE sizeofzmm, i, 64
+ CAT_XDEFINE regnumofxmm, i, i
+ CAT_XDEFINE regnumofymm, i, i
+ CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
@@ -1209,7 +1273,7 @@ INIT_XMM
%endmacro
%endmacro
-; Instructions with both VEX and non-VEX encodings
+; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
@@ -1231,10 +1295,42 @@ AVX_INSTR blendpd, sse4, 1, 1, 0
AVX_INSTR blendps, sse4, 1, 1, 0
AVX_INSTR blendvpd, sse4 ; can't be emulated
AVX_INSTR blendvps, sse4 ; can't be emulated
+AVX_INSTR cmpeqpd, sse2, 1, 0, 1
+AVX_INSTR cmpeqps, sse, 1, 0, 1
+AVX_INSTR cmpeqsd, sse2, 1, 0, 0
+AVX_INSTR cmpeqss, sse, 1, 0, 0
+AVX_INSTR cmplepd, sse2, 1, 0, 0
+AVX_INSTR cmpleps, sse, 1, 0, 0
+AVX_INSTR cmplesd, sse2, 1, 0, 0
+AVX_INSTR cmpless, sse, 1, 0, 0
+AVX_INSTR cmpltpd, sse2, 1, 0, 0
+AVX_INSTR cmpltps, sse, 1, 0, 0
+AVX_INSTR cmpltsd, sse2, 1, 0, 0
+AVX_INSTR cmpltss, sse, 1, 0, 0
+AVX_INSTR cmpneqpd, sse2, 1, 0, 1
+AVX_INSTR cmpneqps, sse, 1, 0, 1
+AVX_INSTR cmpneqsd, sse2, 1, 0, 0
+AVX_INSTR cmpneqss, sse, 1, 0, 0
+AVX_INSTR cmpnlepd, sse2, 1, 0, 0
+AVX_INSTR cmpnleps, sse, 1, 0, 0
+AVX_INSTR cmpnlesd, sse2, 1, 0, 0
+AVX_INSTR cmpnless, sse, 1, 0, 0
+AVX_INSTR cmpnltpd, sse2, 1, 0, 0
+AVX_INSTR cmpnltps, sse, 1, 0, 0
+AVX_INSTR cmpnltsd, sse2, 1, 0, 0
+AVX_INSTR cmpnltss, sse, 1, 0, 0
+AVX_INSTR cmpordpd, sse2, 1, 0, 1
+AVX_INSTR cmpordps, sse, 1, 0, 1
+AVX_INSTR cmpordsd, sse2, 1, 0, 0
+AVX_INSTR cmpordss, sse, 1, 0, 0
AVX_INSTR cmppd, sse2, 1, 1, 0
AVX_INSTR cmpps, sse, 1, 1, 0
AVX_INSTR cmpsd, sse2, 1, 1, 0
AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR cmpunordpd, sse2, 1, 0, 1
+AVX_INSTR cmpunordps, sse, 1, 0, 1
+AVX_INSTR cmpunordsd, sse2, 1, 0, 0
+AVX_INSTR cmpunordss, sse, 1, 0, 0
AVX_INSTR comisd, sse2
AVX_INSTR comiss, sse
AVX_INSTR cvtdq2pd, sse2
@@ -1545,6 +1641,52 @@ FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
+; Macros for converting VEX instructions to equivalent EVEX ones.
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
+ %macro %1 2-7 fnord, fnord, %1, %2, %3
+ %ifidn %3, fnord
+ %define %%args %1, %2
+ %elifidn %4, fnord
+ %define %%args %1, %2, %3
+ %else
+ %define %%args %1, %2, %3, %4
+ %endif
+ %assign %%evex_required cpuflag(avx512) & %7
+ %ifnum regnumof%1
+ %if regnumof%1 >= 16 || sizeof%1 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %ifnum regnumof%2
+ %if regnumof%2 >= 16 || sizeof%2 > 32
+ %assign %%evex_required 1
+ %endif
+ %endif
+ %if %%evex_required
+ %6 %%args
+ %else
+ %5 %%args ; Prefer VEX over EVEX due to shorter instruction length
+ %endif
+ %endmacro
+%endmacro
+
+EVEX_INSTR vbroadcastf128, vbroadcastf32x4
+EVEX_INSTR vbroadcasti128, vbroadcasti32x4
+EVEX_INSTR vextractf128, vextractf32x4
+EVEX_INSTR vextracti128, vextracti32x4
+EVEX_INSTR vinsertf128, vinsertf32x4
+EVEX_INSTR vinserti128, vinserti32x4
+EVEX_INSTR vmovdqa, vmovdqa32
+EVEX_INSTR vmovdqu, vmovdqu32
+EVEX_INSTR vpand, vpandd
+EVEX_INSTR vpandn, vpandnd
+EVEX_INSTR vpor, vpord
+EVEX_INSTR vpxor, vpxord
+EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
+EVEX_INSTR vrcpss, vrcp14ss, 1
+EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
+EVEX_INSTR vrsqrtss, vrsqrt14ss, 1
+
; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
%ifdef __YASM_VER__
%if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
diff --git a/media/ffvpx/libavutil/x86/x86util.asm b/media/ffvpx/libavutil/x86/x86util.asm
index e1220dfc1..d7cd99684 100644
--- a/media/ffvpx/libavutil/x86/x86util.asm
+++ b/media/ffvpx/libavutil/x86/x86util.asm
@@ -357,7 +357,7 @@
%endif
%endmacro
-%macro ABSB 2 ; source mmreg, temp mmreg (unused for ssse3)
+%macro ABSB 2 ; source mmreg, temp mmreg (unused for SSSE3)
%if cpuflag(ssse3)
pabsb %1, %1
%else
@@ -381,7 +381,7 @@
%endif
%endmacro
-%macro ABSD2_MMX 4
+%macro ABSD2 4
pxor %3, %3
pxor %4, %4
pcmpgtd %3, %1
@@ -475,7 +475,7 @@
%else
palignr %1, %2, %3
%endif
-%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
+%else ; [dst,] src1, src2, imm, tmp
%define %%dst %1
%if %0==5
%ifnidn %1, %2
@@ -799,37 +799,47 @@
pminsw %1, %3
%endmacro
-%macro PMINSD_MMX 3 ; dst, src, tmp
+%macro PMINSD 3 ; dst, src, tmp/unused
+%if cpuflag(sse4)
+ pminsd %1, %2
+%elif cpuflag(sse2)
+ cvtdq2ps %1, %1
+ minps %1, %2
+ cvtps2dq %1, %1
+%else
mova %3, %2
pcmpgtd %3, %1
pxor %1, %2
pand %1, %3
pxor %1, %2
+%endif
%endmacro
-%macro PMAXSD_MMX 3 ; dst, src, tmp
+%macro PMAXSD 3 ; dst, src, tmp/unused
+%if cpuflag(sse4)
+ pmaxsd %1, %2
+%else
mova %3, %1
pcmpgtd %3, %2
pand %1, %3
pandn %3, %2
por %1, %3
+%endif
%endmacro
-%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
- PMINSD_MMX %1, %3, %4
- PMAXSD_MMX %1, %2, %4
-%endmacro
-
-%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
+%macro CLIPD 3-4
+%if cpuflag(sse4); src/dst, min, max, unused
+ pminsd %1, %3
+ pmaxsd %1, %2
+%elif cpuflag(sse2) ; src/dst, min (float), max (float), unused
cvtdq2ps %1, %1
minps %1, %3
maxps %1, %2
cvtps2dq %1, %1
-%endmacro
-
-%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
- pminsd %1, %3
- pmaxsd %1, %2
+%else ; src/dst, min, max, tmp
+ PMINSD %1, %3, %4
+ PMAXSD %1, %2, %4
+%endif
%endmacro
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
@@ -880,6 +890,14 @@
%endif
%endmacro
+%macro VBROADCASTI128 2 ; dst xmm/ymm, src : 128bits val
+%if mmsize > 16
+ vbroadcasti128 %1, %2
+%else
+ mova %1, %2
+%endif
+%endmacro
+
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80
diff --git a/media/libaom/README_MCP b/media/libaom/README_MCP
new file mode 100644
index 000000000..5ae9f96c4
--- /dev/null
+++ b/media/libaom/README_MCP
@@ -0,0 +1,13 @@
+This directory contains build files for the aom video
+codec reference implementation. The actual library
+source is in $TOPSRCDIR/third_party/aom/
+
+Any patches or additional configuration to be applied to the
+upstream source should be kept here in the media/libaom
+directory.
+
+The upstream aom git repository is:
+
+ https://aomedia.googlesource.com/aom
+
+The git commit ID used was 1e227d41f0616de9548a673a83a21ef990b62591.
diff --git a/media/libaom/aom_version.h b/media/libaom/aom_version.h
new file mode 100644
index 000000000..b791ef64d
--- /dev/null
+++ b/media/libaom/aom_version.h
@@ -0,0 +1,7 @@
+#define VERSION_MAJOR 0
+#define VERSION_MINOR 1
+#define VERSION_PATCH 0
+#define VERSION_EXTRA ""
+#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
+#define VERSION_STRING_NOSP "v0.1.0"
+#define VERSION_STRING " v0.1.0"
diff --git a/media/libaom/cmakeparser.py b/media/libaom/cmakeparser.py
new file mode 100644
index 000000000..cb2686a61
--- /dev/null
+++ b/media/libaom/cmakeparser.py
@@ -0,0 +1,293 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+from pyparsing import (CharsNotIn, Group, Forward, Literal, Suppress, Word,
+ QuotedString, ZeroOrMore, alphas, alphanums)
+from string import Template
+import re
+
+# Grammar for CMake
+comment = Literal('#') + ZeroOrMore(CharsNotIn('\n'))
+quoted_argument = QuotedString('\"', '\\', multiline=True)
+unquoted_argument = CharsNotIn('\n ()#\"\\')
+argument = quoted_argument | unquoted_argument | Suppress(comment)
+arguments = Forward()
+arguments << (argument | (Literal('(') + ZeroOrMore(arguments) + Literal(')')))
+identifier = Word(alphas, alphanums+'_')
+command = Group(identifier + Literal('(') + ZeroOrMore(arguments) + Literal(')'))
+file_elements = command | Suppress(comment)
+cmake = ZeroOrMore(file_elements)
+
+
+def extract_arguments(parsed):
+ """Extract the command arguments skipping the parentheses"""
+ return parsed[2:len(parsed) - 1]
+
+
+def match_block(command, parsed, start):
+ """Find the end of block starting with the command"""
+ depth = 0
+ end = start + 1
+ endcommand = 'end' + command
+ while parsed[end][0] != endcommand or depth > 0:
+ if parsed[end][0] == command:
+ depth += 1
+ elif parsed[end][0] == endcommand:
+ depth -= 1
+ end = end + 1
+ if end == len(parsed):
+ print('error: eof when trying to match block statement: %s'
+ % parsed[start])
+ return end
+
+
+def parse_if(parsed, start):
+ """Parse if/elseif/else/endif into a list of conditions and commands"""
+ depth = 0
+ conditions = []
+ condition = [extract_arguments(parsed[start])]
+ start = start + 1
+ end = start
+
+ while parsed[end][0] != 'endif' or depth > 0:
+ command = parsed[end][0]
+ if command == 'if':
+ depth += 1
+ elif command == 'else' and depth == 0:
+ condition.append(parsed[start:end])
+ conditions.append(condition)
+ start = end + 1
+ condition = [['TRUE']]
+ elif command == 'elseif' and depth == 0:
+ condition.append(parsed[start:end])
+ conditions.append(condition)
+ condition = [extract_arguments(parsed[end])]
+ start = end + 1
+ elif command == 'endif':
+ depth -= 1
+ end = end + 1
+ if end == len(parsed):
+ print('error: eof when trying to match if statement: %s'
+ % parsed[start])
+ condition.append(parsed[start:end])
+ conditions.append(condition)
+ return end, conditions
+
+
+def substs(variables, values):
+ """Substitute variables into values"""
+ new_values = []
+ for value in values:
+ t = Template(value)
+ new_value = t.safe_substitute(variables)
+
+ # Safe substitute leaves unrecognized variables in place.
+ # We replace them with the empty string.
+ new_values.append(re.sub('\$\{\w+\}', '', new_value))
+ return new_values
+
+
+def evaluate(variables, cache_variables, parsed):
+ """Evaluate a list of parsed commands, returning sources to build"""
+ i = 0
+ sources = []
+ while i < len(parsed):
+ command = parsed[i][0]
+ arguments = substs(variables, extract_arguments(parsed[i]))
+
+ if command == 'foreach':
+ end = match_block(command, parsed, i)
+ for argument in arguments[1:]:
+ # ; is also a valid divider, why have one when you can have two?
+ argument = argument.replace(';', ' ')
+ for value in argument.split():
+ variables[arguments[0]] = value
+ cont_eval, new_sources = evaluate(variables, cache_variables,
+ parsed[i+1:end])
+ sources.extend(new_sources)
+ if not cont_eval:
+ return cont_eval, sources
+ elif command == 'function':
+ # for now we just execute functions inline at point of declaration
+ # as this is sufficient to build libaom
+ pass
+ elif command == 'if':
+ i, conditions = parse_if(parsed, i)
+ for condition in conditions:
+ if evaluate_boolean(variables, condition[0]):
+ cont_eval, new_sources = evaluate(variables,
+ cache_variables,
+ condition[1])
+ sources.extend(new_sources)
+ if not cont_eval:
+ return cont_eval, sources
+ break
+ elif command == 'include':
+ if arguments:
+ try:
+ print('including: %s' % arguments[0])
+ sources.extend(parse(variables, cache_variables, arguments[0]))
+ except IOError:
+ print('warning: could not include: %s' % arguments[0])
+ elif command == 'list':
+ try:
+ action = arguments[0]
+ variable = arguments[1]
+ values = arguments[2:]
+ if action == 'APPEND':
+ if not variables.has_key(variable):
+ variables[variable] = ' '.join(values)
+ else:
+ variables[variable] += ' ' + ' '.join(values)
+ except (IndexError, KeyError):
+ pass
+ elif command == 'option':
+ variable = arguments[0]
+ value = arguments[2]
+ # Allow options to be overridden without changing CMake files
+ if not variables.has_key(variable):
+ variables[variable] = value
+ elif command == 'return':
+ return False, sources
+ elif command == 'set':
+ variable = arguments[0]
+ values = arguments[1:]
+ # CACHE variables are not set if already present
+ try:
+ cache = values.index('CACHE')
+ values = values[0:cache]
+ if not variables.has_key(variable):
+ variables[variable] = ' '.join(values)
+ cache_variables.append(variable)
+ except ValueError:
+ variables[variable] = ' '.join(values)
+ # we need to emulate the behavior of these function calls
+ # because we don't support interpreting them directly
+ # see bug 1492292
+ elif command in ['set_aom_config_var', 'set_aom_detect_var']:
+ variable = arguments[0]
+ value = arguments[1]
+ if variable not in variables:
+ variables[variable] = value
+ cache_variables.append(variable)
+ elif command == 'set_aom_option_var':
+ # option vars cannot go into cache_variables
+ variable = arguments[0]
+ value = arguments[2]
+ if variable not in variables:
+ variables[variable] = value
+ elif command == 'add_asm_library':
+ try:
+ sources.extend(variables[arguments[1]].split(' '))
+ except (IndexError, KeyError):
+ pass
+ elif command == 'add_intrinsics_object_library':
+ try:
+ sources.extend(variables[arguments[3]].split(' '))
+ except (IndexError, KeyError):
+ pass
+ elif command == 'add_library':
+ for source in arguments[1:]:
+ sources.extend(source.split(' '))
+ elif command == 'target_sources':
+ for source in arguments[1:]:
+ sources.extend(source.split(' '))
+ elif command == 'MOZDEBUG':
+ print('>>>> MOZDEBUG: %s' % ' '.join(arguments))
+ i += 1
+ return True, sources
+
+
+def evaluate_boolean(variables, arguments):
+ """Evaluate a boolean expression"""
+ if not arguments:
+ return False
+
+ argument = arguments[0]
+
+ if argument == 'NOT':
+ return not evaluate_boolean(variables, arguments[1:])
+
+ if argument == '(':
+ i = 0
+ depth = 1
+ while depth > 0 and i < len(arguments):
+ i += 1
+ if arguments[i] == '(':
+ depth += 1
+ if arguments[i] == ')':
+ depth -= 1
+ return evaluate_boolean(variables, arguments[1:i])
+
+ def evaluate_constant(argument):
+ try:
+ as_int = int(argument)
+ if as_int != 0:
+ return True
+ else:
+ return False
+ except ValueError:
+ upper = argument.upper()
+ if upper in ['ON', 'YES', 'TRUE', 'Y']:
+ return True
+ elif upper in ['OFF', 'NO', 'FALSE', 'N', 'IGNORE', '', 'NOTFOUND']:
+ return False
+ elif upper.endswith('-NOTFOUND'):
+ return False
+ return None
+
+ def lookup_variable(argument):
+ # If statements can have old-style variables which are not demarcated
+ # like ${VARIABLE}. Attempt to look up the variable both ways.
+ try:
+ if re.search('\$\{\w+\}', argument):
+ try:
+ t = Template(argument)
+ value = t.substitute(variables)
+ try:
+ # Attempt an old-style variable lookup with the
+ # substituted value.
+ return variables[value]
+ except KeyError:
+ return value
+ except ValueError:
+ # TODO: CMake supports nesting, e.g. ${${foo}}
+ return None
+ else:
+ return variables[argument]
+ except KeyError:
+ return None
+
+ lhs = lookup_variable(argument)
+ if lhs is None:
+ # variable resolution failed, treat as string
+ lhs = argument
+
+ if len(arguments) > 1:
+ op = arguments[1]
+ if op == 'AND':
+ return evaluate_constant(lhs) and evaluate_boolean(variables, arguments[2:])
+ elif op == 'MATCHES':
+ rhs = lookup_variable(arguments[2])
+ if not rhs:
+ rhs = arguments[2]
+ return not re.match(rhs, lhs) is None
+ elif op == 'OR':
+ return evaluate_constant(lhs) or evaluate_boolean(variables, arguments[2:])
+ elif op == 'STREQUAL':
+ rhs = lookup_variable(arguments[2])
+ if not rhs:
+ rhs = arguments[2]
+ return lhs == rhs
+ else:
+ lhs = evaluate_constant(lhs)
+ if lhs is None:
+ lhs = lookup_variable(argument)
+
+ return lhs
+
+
+def parse(variables, cache_variables, filename):
+ parsed = cmake.parseFile(filename)
+ cont_eval, sources = evaluate(variables, cache_variables, parsed)
+ return sources
diff --git a/media/libaom/config/aom_version.h b/media/libaom/config/aom_version.h
new file mode 100644
index 000000000..b791ef64d
--- /dev/null
+++ b/media/libaom/config/aom_version.h
@@ -0,0 +1,7 @@
+#define VERSION_MAJOR 0
+#define VERSION_MINOR 1
+#define VERSION_PATCH 0
+#define VERSION_EXTRA ""
+#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
+#define VERSION_STRING_NOSP "v0.1.0"
+#define VERSION_STRING " v0.1.0"
diff --git a/media/libaom/config/generic/config/aom_config.asm b/media/libaom/config/generic/config/aom_config.asm
new file mode 100644
index 000000000..897eeb4fd
--- /dev/null
+++ b/media/libaom/config/generic/config/aom_config.asm
@@ -0,0 +1,76 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+ARCH_ARM equ 0
+ARCH_MIPS equ 0
+ARCH_PPC equ 0
+ARCH_X86 equ 0
+ARCH_X86_64 equ 0
+CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_ACCOUNTING equ 0
+CONFIG_ANALYZER equ 0
+CONFIG_AV1_DECODER equ 1
+CONFIG_AV1_ENCODER equ 0
+CONFIG_BIG_ENDIAN equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_RD_STATS equ 0
+CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 1
+CONFIG_DIST_8X8 equ 0
+CONFIG_ENTROPY_STATS equ 0
+CONFIG_FILEOPTIONS equ 1
+CONFIG_FIX_GF_LENGTH equ 1
+CONFIG_FP_MB_STATS equ 0
+CONFIG_GCC equ 1
+CONFIG_GCOV equ 0
+CONFIG_GLOBAL_MOTION_SEARCH equ 1
+CONFIG_GPROF equ 0
+CONFIG_INSPECTION equ 0
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_INTER_STATS_ONLY equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_LOWBITDEPTH equ 1
+CONFIG_MAX_DECODE_PROFILE equ 2
+CONFIG_MISMATCH_DEBUG equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_NORMAL_TILE_MODE equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_PIC equ 0
+CONFIG_RD_DEBUG equ 0
+CONFIG_REDUCED_ENCODER_BORDER equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_SHARED equ 0
+CONFIG_SHARP_SETTINGS equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_STATIC equ 1
+CONFIG_WEBM_IO equ 0
+DECODE_HEIGHT_LIMIT equ 0
+DECODE_WIDTH_LIMIT equ 0
+HAVE_AVX equ 0
+HAVE_AVX2 equ 0
+HAVE_DSPR2 equ 0
+HAVE_FEXCEPT equ 1
+HAVE_MIPS32 equ 0
+HAVE_MIPS64 equ 0
+HAVE_MMX equ 0
+HAVE_MSA equ 0
+HAVE_NEON equ 0
+HAVE_SSE equ 0
+HAVE_SSE2 equ 0
+HAVE_SSE3 equ 0
+HAVE_SSE4_1 equ 0
+HAVE_SSE4_2 equ 0
+HAVE_SSSE3 equ 0
+HAVE_VSX equ 0
+HAVE_WXWIDGETS equ 0
diff --git a/media/libaom/config/generic/config/aom_config.h b/media/libaom/config/generic/config/aom_config.h
new file mode 100644
index 000000000..54fb243e1
--- /dev/null
+++ b/media/libaom/config/generic/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 0
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 0
+#define ARCH_X86_64 0
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 0
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 0
+#define HAVE_MSA 0
+#define HAVE_NEON 0
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_SSE4_2 0
+#define HAVE_SSSE3 0
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
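This generic aom_config.h pins every architecture and SIMD HAVE_* flag to 0 and enables only the AV1 decoder (CONFIG_AV1_DECODER 1, CONFIG_AV1_ENCODER 0), so consumers select code paths purely at compile time. A trivial sketch of how such flags are typically consumed (illustrative, not taken from the aom sources):

#include "config/aom_config.h"

#if !CONFIG_AV1_DECODER
#error "this build expects the AV1 decoder to be enabled"
#endif

#if HAVE_SSE2
/* SSE2 kernels would be registered here; never taken with this generic config */
#endif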
diff --git a/media/libaom/config/generic/config/aom_dsp_rtcd.h b/media/libaom/config/generic/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..c3b61b8a8
--- /dev/null
+++ b/media/libaom/config/generic/config/aom_dsp_rtcd.h
@@ -0,0 +1,1380 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+#define aom_blend_a64_hmask aom_blend_a64_hmask_c
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+#define aom_blend_a64_mask aom_blend_a64_mask_c
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+#define aom_blend_a64_vmask aom_blend_a64_vmask_c
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_horiz aom_convolve8_horiz_c
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_vert aom_convolve8_vert_c
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve_copy aom_convolve_copy_c
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x16 aom_dc_128_predictor_16x16_c
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x32 aom_dc_128_predictor_16x32_c
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x4 aom_dc_128_predictor_16x4_c
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x64 aom_dc_128_predictor_16x64_c
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x8 aom_dc_128_predictor_16x8_c
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x16 aom_dc_128_predictor_32x16_c
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x32 aom_dc_128_predictor_32x32_c
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x64 aom_dc_128_predictor_32x64_c
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x8 aom_dc_128_predictor_32x8_c
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x16 aom_dc_128_predictor_4x16_c
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x4 aom_dc_128_predictor_4x4_c
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x8 aom_dc_128_predictor_4x8_c
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_64x16 aom_dc_128_predictor_64x16_c
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_64x32 aom_dc_128_predictor_64x32_c
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_64x64 aom_dc_128_predictor_64x64_c
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x16 aom_dc_128_predictor_8x16_c
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x32 aom_dc_128_predictor_8x32_c
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x4 aom_dc_128_predictor_8x4_c
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x8 aom_dc_128_predictor_8x8_c
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x16 aom_dc_left_predictor_16x16_c
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x32 aom_dc_left_predictor_16x32_c
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x4 aom_dc_left_predictor_16x4_c
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x64 aom_dc_left_predictor_16x64_c
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x8 aom_dc_left_predictor_16x8_c
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x16 aom_dc_left_predictor_32x16_c
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x32 aom_dc_left_predictor_32x32_c
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x64 aom_dc_left_predictor_32x64_c
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x8 aom_dc_left_predictor_32x8_c
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x16 aom_dc_left_predictor_4x16_c
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x4 aom_dc_left_predictor_4x4_c
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x8 aom_dc_left_predictor_4x8_c
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_64x16 aom_dc_left_predictor_64x16_c
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_64x32 aom_dc_left_predictor_64x32_c
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_64x64 aom_dc_left_predictor_64x64_c
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x16 aom_dc_left_predictor_8x16_c
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x32 aom_dc_left_predictor_8x32_c
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x4 aom_dc_left_predictor_8x4_c
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x8 aom_dc_left_predictor_8x8_c
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x16 aom_dc_predictor_16x16_c
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x32 aom_dc_predictor_16x32_c
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x4 aom_dc_predictor_16x4_c
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x64 aom_dc_predictor_16x64_c
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x8 aom_dc_predictor_16x8_c
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x16 aom_dc_predictor_32x16_c
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x32 aom_dc_predictor_32x32_c
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x64 aom_dc_predictor_32x64_c
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x8 aom_dc_predictor_32x8_c
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x16 aom_dc_predictor_4x16_c
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x4 aom_dc_predictor_4x4_c
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x8 aom_dc_predictor_4x8_c
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_64x16 aom_dc_predictor_64x16_c
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_64x32 aom_dc_predictor_64x32_c
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_64x64 aom_dc_predictor_64x64_c
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x16 aom_dc_predictor_8x16_c
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x32 aom_dc_predictor_8x32_c
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x4 aom_dc_predictor_8x4_c
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x8 aom_dc_predictor_8x8_c
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x16 aom_dc_top_predictor_16x16_c
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x32 aom_dc_top_predictor_16x32_c
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x4 aom_dc_top_predictor_16x4_c
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x64 aom_dc_top_predictor_16x64_c
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x8 aom_dc_top_predictor_16x8_c
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x16 aom_dc_top_predictor_32x16_c
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x32 aom_dc_top_predictor_32x32_c
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x64 aom_dc_top_predictor_32x64_c
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x8 aom_dc_top_predictor_32x8_c
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x16 aom_dc_top_predictor_4x16_c
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x4 aom_dc_top_predictor_4x4_c
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x8 aom_dc_top_predictor_4x8_c
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_64x16 aom_dc_top_predictor_64x16_c
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_64x32 aom_dc_top_predictor_64x32_c
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_64x64 aom_dc_top_predictor_64x64_c
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x16 aom_dc_top_predictor_8x16_c
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x32 aom_dc_top_predictor_8x32_c
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x4 aom_dc_top_predictor_8x4_c
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_c
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x16 aom_h_predictor_16x16_c
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x32 aom_h_predictor_16x32_c
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x4 aom_h_predictor_16x4_c
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x64 aom_h_predictor_16x64_c
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x8 aom_h_predictor_16x8_c
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x16 aom_h_predictor_32x16_c
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x32 aom_h_predictor_32x32_c
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x64 aom_h_predictor_32x64_c
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x8 aom_h_predictor_32x8_c
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x16 aom_h_predictor_4x16_c
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x4 aom_h_predictor_4x4_c
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x8 aom_h_predictor_4x8_c
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x16 aom_h_predictor_64x16_c
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x32 aom_h_predictor_64x32_c
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x64 aom_h_predictor_64x64_c
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x16 aom_h_predictor_8x16_c
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x32 aom_h_predictor_8x32_c
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x4 aom_h_predictor_8x4_c
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x8 aom_h_predictor_8x8_c
+
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+#define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_c
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+#define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_c
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+#define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_c
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_c
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_c
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve_copy aom_highbd_convolve_copy_c
+
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x16 aom_highbd_dc_128_predictor_16x16_c
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x32 aom_highbd_dc_128_predictor_16x32_c
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x8 aom_highbd_dc_128_predictor_16x8_c
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x16 aom_highbd_dc_128_predictor_32x16_c
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x32 aom_highbd_dc_128_predictor_32x32_c
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x4 aom_highbd_dc_128_predictor_4x4_c
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x8 aom_highbd_dc_128_predictor_4x8_c
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x16 aom_highbd_dc_128_predictor_8x16_c
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x4 aom_highbd_dc_128_predictor_8x4_c
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x8 aom_highbd_dc_128_predictor_8x8_c
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x16 aom_highbd_dc_left_predictor_16x16_c
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x32 aom_highbd_dc_left_predictor_16x32_c
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x8 aom_highbd_dc_left_predictor_16x8_c
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x16 aom_highbd_dc_left_predictor_32x16_c
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x32 aom_highbd_dc_left_predictor_32x32_c
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x4 aom_highbd_dc_left_predictor_4x4_c
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x8 aom_highbd_dc_left_predictor_4x8_c
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x16 aom_highbd_dc_left_predictor_8x16_c
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x4 aom_highbd_dc_left_predictor_8x4_c
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x8 aom_highbd_dc_left_predictor_8x8_c
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x16 aom_highbd_dc_predictor_16x16_c
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x32 aom_highbd_dc_predictor_16x32_c
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x8 aom_highbd_dc_predictor_16x8_c
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x16 aom_highbd_dc_predictor_32x16_c
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x32 aom_highbd_dc_predictor_32x32_c
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x4 aom_highbd_dc_predictor_4x4_c
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x8 aom_highbd_dc_predictor_4x8_c
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x16 aom_highbd_dc_predictor_8x16_c
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x4 aom_highbd_dc_predictor_8x4_c
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x8 aom_highbd_dc_predictor_8x8_c
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x16 aom_highbd_dc_top_predictor_16x16_c
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x32 aom_highbd_dc_top_predictor_16x32_c
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x8 aom_highbd_dc_top_predictor_16x8_c
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x16 aom_highbd_dc_top_predictor_32x16_c
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x32 aom_highbd_dc_top_predictor_32x32_c
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x4 aom_highbd_dc_top_predictor_4x4_c
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x8 aom_highbd_dc_top_predictor_4x8_c
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x16 aom_highbd_dc_top_predictor_8x16_c
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x4 aom_highbd_dc_top_predictor_8x4_c
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_c
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x16 aom_highbd_h_predictor_16x16_c
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x32 aom_highbd_h_predictor_16x32_c
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x8 aom_highbd_h_predictor_16x8_c
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x16 aom_highbd_h_predictor_32x16_c
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x32 aom_highbd_h_predictor_32x32_c
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x4 aom_highbd_h_predictor_4x4_c
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x8 aom_highbd_h_predictor_4x8_c
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x16 aom_highbd_h_predictor_8x16_c
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x4 aom_highbd_h_predictor_8x4_c
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x8 aom_highbd_h_predictor_8x8_c
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_c
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_14_dual aom_highbd_lpf_horizontal_14_dual_c
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_c
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_4_dual aom_highbd_lpf_horizontal_4_dual_c
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_c
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_c
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_c
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_8_dual aom_highbd_lpf_horizontal_8_dual_c
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_c
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_14_dual aom_highbd_lpf_vertical_14_dual_c
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_c
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_4_dual aom_highbd_lpf_vertical_4_dual_c
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_c
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_c
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_c
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_8_dual aom_highbd_lpf_vertical_8_dual_c
+
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x16 aom_highbd_v_predictor_16x16_c
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x32 aom_highbd_v_predictor_16x32_c
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x8 aom_highbd_v_predictor_16x8_c
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x16 aom_highbd_v_predictor_32x16_c
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x32 aom_highbd_v_predictor_32x32_c
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x4 aom_highbd_v_predictor_4x4_c
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x8 aom_highbd_v_predictor_4x8_c
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x16 aom_highbd_v_predictor_8x16_c
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x4 aom_highbd_v_predictor_8x4_c
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_c
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+#define aom_lowbd_blend_a64_d16_mask aom_lowbd_blend_a64_d16_mask_c
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_c
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_c
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_c
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_c
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_6 aom_lpf_horizontal_6_c
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_c
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_c
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_c
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_14 aom_lpf_vertical_14_c
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_c
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_4 aom_lpf_vertical_4_c
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_c
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_6 aom_lpf_vertical_6_c
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_c
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_8 aom_lpf_vertical_8_c
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_c
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x16 aom_paeth_predictor_16x16_c
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x32 aom_paeth_predictor_16x32_c
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x4 aom_paeth_predictor_16x4_c
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x64 aom_paeth_predictor_16x64_c
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x8 aom_paeth_predictor_16x8_c
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_32x16 aom_paeth_predictor_32x16_c
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_32x32 aom_paeth_predictor_32x32_c
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_32x64 aom_paeth_predictor_32x64_c
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_32x8 aom_paeth_predictor_32x8_c
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_4x16 aom_paeth_predictor_4x16_c
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_4x4 aom_paeth_predictor_4x4_c
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_4x8 aom_paeth_predictor_4x8_c
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_64x16 aom_paeth_predictor_64x16_c
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_64x32 aom_paeth_predictor_64x32_c
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_64x64 aom_paeth_predictor_64x64_c
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_8x16 aom_paeth_predictor_8x16_c
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_8x32 aom_paeth_predictor_8x32_c
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_8x4 aom_paeth_predictor_8x4_c
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_8x8 aom_paeth_predictor_8x8_c
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x16 aom_smooth_h_predictor_16x16_c
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x32 aom_smooth_h_predictor_16x32_c
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x4 aom_smooth_h_predictor_16x4_c
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x64 aom_smooth_h_predictor_16x64_c
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x8 aom_smooth_h_predictor_16x8_c
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_32x16 aom_smooth_h_predictor_32x16_c
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_32x32 aom_smooth_h_predictor_32x32_c
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_32x64 aom_smooth_h_predictor_32x64_c
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_32x8 aom_smooth_h_predictor_32x8_c
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_4x16 aom_smooth_h_predictor_4x16_c
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_4x4 aom_smooth_h_predictor_4x4_c
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_4x8 aom_smooth_h_predictor_4x8_c
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_64x16 aom_smooth_h_predictor_64x16_c
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_64x32 aom_smooth_h_predictor_64x32_c
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_64x64 aom_smooth_h_predictor_64x64_c
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_8x16 aom_smooth_h_predictor_8x16_c
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_8x32 aom_smooth_h_predictor_8x32_c
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_8x4 aom_smooth_h_predictor_8x4_c
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_8x8 aom_smooth_h_predictor_8x8_c
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x16 aom_smooth_predictor_16x16_c
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x32 aom_smooth_predictor_16x32_c
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x4 aom_smooth_predictor_16x4_c
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x64 aom_smooth_predictor_16x64_c
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x8 aom_smooth_predictor_16x8_c
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_32x16 aom_smooth_predictor_32x16_c
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_32x32 aom_smooth_predictor_32x32_c
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_32x64 aom_smooth_predictor_32x64_c
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_32x8 aom_smooth_predictor_32x8_c
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_4x16 aom_smooth_predictor_4x16_c
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_4x4 aom_smooth_predictor_4x4_c
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_4x8 aom_smooth_predictor_4x8_c
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_64x16 aom_smooth_predictor_64x16_c
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_64x32 aom_smooth_predictor_64x32_c
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_64x64 aom_smooth_predictor_64x64_c
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_8x16 aom_smooth_predictor_8x16_c
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_8x32 aom_smooth_predictor_8x32_c
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_8x4 aom_smooth_predictor_8x4_c
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_8x8 aom_smooth_predictor_8x8_c
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x16 aom_smooth_v_predictor_16x16_c
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x32 aom_smooth_v_predictor_16x32_c
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x4 aom_smooth_v_predictor_16x4_c
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x64 aom_smooth_v_predictor_16x64_c
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x8 aom_smooth_v_predictor_16x8_c
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_32x16 aom_smooth_v_predictor_32x16_c
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_32x32 aom_smooth_v_predictor_32x32_c
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_32x64 aom_smooth_v_predictor_32x64_c
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_32x8 aom_smooth_v_predictor_32x8_c
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_4x16 aom_smooth_v_predictor_4x16_c
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_4x4 aom_smooth_v_predictor_4x4_c
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_4x8 aom_smooth_v_predictor_4x8_c
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_64x16 aom_smooth_v_predictor_64x16_c
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_64x32 aom_smooth_v_predictor_64x32_c
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_64x64 aom_smooth_v_predictor_64x64_c
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_8x16 aom_smooth_v_predictor_8x16_c
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_8x32 aom_smooth_v_predictor_8x32_c
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_8x4 aom_smooth_v_predictor_8x4_c
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_8x8 aom_smooth_v_predictor_8x8_c
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x16 aom_v_predictor_16x16_c
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x32 aom_v_predictor_16x32_c
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x4 aom_v_predictor_16x4_c
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x64 aom_v_predictor_16x64_c
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x8 aom_v_predictor_16x8_c
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x16 aom_v_predictor_32x16_c
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x32 aom_v_predictor_32x32_c
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x64 aom_v_predictor_32x64_c
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x8 aom_v_predictor_32x8_c
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x16 aom_v_predictor_4x16_c
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x4 aom_v_predictor_4x4_c
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x8 aom_v_predictor_4x8_c
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_64x16 aom_v_predictor_64x16_c
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_64x32 aom_v_predictor_64x32_c
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_64x64 aom_v_predictor_64x64_c
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x16 aom_v_predictor_8x16_c
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x32 aom_v_predictor_8x32_c
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x4 aom_v_predictor_8x4_c
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x8 aom_v_predictor_8x8_c
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_c
+
+void aom_dsp_rtcd(void);
+
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
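The generic aom_dsp_rtcd.h above is the no-SIMD half of libaom's runtime CPU detection (RTCD) scheme: every public name is #defined straight to its _c implementation, so setup_rtcd_internal() has nothing to do. In the per-architecture headers added further down (see the linux/arm aom_dsp_rtcd.h), names that have a NEON variant are instead declared as RTCD_EXTERN function pointers that setup_rtcd_internal() fills in at runtime. The following is only a sketch of how such a dispatcher is typically populated, not part of the diff; cpu_has_neon() is a hypothetical stand-in for libaom's real CPU-feature query.

/* Sketch only -- not part of the generated header above. Shows how a
 * per-architecture setup_rtcd_internal() typically wires RTCD_EXTERN
 * pointers; cpu_has_neon() stands in for the real feature query. */
#ifdef RTCD_C
static void setup_rtcd_internal(void)
{
    /* Start every dispatched symbol at its portable C implementation. */
    aom_dc_predictor_16x16 = aom_dc_predictor_16x16_c;

    /* Upgrade to the SIMD variant when the CPU supports it. */
    if (cpu_has_neon())
        aom_dc_predictor_16x16 = aom_dc_predictor_16x16_neon;
}
#endif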
diff --git a/media/libaom/config/generic/config/aom_scale_rtcd.h b/media/libaom/config/generic/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..1ee9bef3d
--- /dev/null
+++ b/media/libaom/config/generic/config/aom_scale_rtcd.h
@@ -0,0 +1,85 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/generic/config/av1_rtcd.h b/media/libaom/config/generic/config/av1_rtcd.h
new file mode 100644
index 000000000..e2fe11bbd
--- /dev/null
+++ b/media/libaom/config/generic/config/av1_rtcd.h
@@ -0,0 +1,331 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+#define apply_selfguided_restoration apply_selfguided_restoration_c
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+#define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+#define av1_build_compound_diffwtd_mask_d16 av1_build_compound_diffwtd_mask_d16_c
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_2d_copy_sr av1_convolve_2d_copy_sr_c
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_scale av1_convolve_2d_scale_c
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_2d_sr av1_convolve_2d_sr_c
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+#define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_x_sr av1_convolve_x_sr_c
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_convolve_y_sr av1_convolve_y_sr_c
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+#define av1_filter_intra_edge av1_filter_intra_edge_c
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+#define av1_filter_intra_predictor av1_filter_intra_predictor_c
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_c
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_c
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_copy_sr av1_highbd_convolve_2d_copy_sr_c
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_c
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+#define av1_highbd_convolve_horiz_rs av1_highbd_convolve_horiz_rs_c
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_c
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_4x4 av1_highbd_inv_txfm_add_4x4_c
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_8x8 av1_highbd_inv_txfm_add_8x8_c
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+#define av1_highbd_warp_affine av1_highbd_warp_affine_c
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_c
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x4 av1_inv_txfm2d_add_4x4_c
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x8 av1_inv_txfm2d_add_8x8_c
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_inv_txfm_add av1_inv_txfm_add_c
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_jnt_convolve_2d av1_jnt_convolve_2d_c
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_c
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_jnt_convolve_x av1_jnt_convolve_x_c
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_jnt_convolve_y av1_jnt_convolve_y_c
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+#define av1_selfguided_restoration av1_selfguided_restoration_c
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+#define av1_upsample_intra_edge av1_upsample_intra_edge_c
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+#define av1_warp_affine av1_warp_affine_c
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+#define av1_wiener_convolve_add_src av1_wiener_convolve_add_src_c
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+#define cdef_filter_block cdef_filter_block_c
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+#define cdef_find_dir cdef_find_dir_c
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+#define cfl_get_luma_subsampling_420_hbd cfl_get_luma_subsampling_420_hbd_c
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+#define cfl_get_luma_subsampling_420_lbd cfl_get_luma_subsampling_420_lbd_c
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+#define cfl_get_luma_subsampling_422_hbd cfl_get_luma_subsampling_422_hbd_c
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+#define cfl_get_luma_subsampling_422_lbd cfl_get_luma_subsampling_422_lbd_c
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+#define cfl_get_luma_subsampling_444_hbd cfl_get_luma_subsampling_444_hbd_c
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+#define cfl_get_luma_subsampling_444_lbd cfl_get_luma_subsampling_444_lbd_c
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+#define copy_rect8_16bit_to_16bit copy_rect8_16bit_to_16bit_c
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+#define copy_rect8_8bit_to_16bit copy_rect8_8bit_to_16bit_c
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+#define get_predict_hbd_fn get_predict_hbd_fn_c
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+#define get_predict_lbd_fn get_predict_lbd_fn_c
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+#define get_subtract_average_fn get_subtract_average_fn_c
+
+void av1_rtcd(void);
+
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
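Besides plain per-call dispatch, av1_rtcd.h above also exposes a second pattern for CfL (chroma-from-luma): getters such as cfl_get_luma_subsampling_420_lbd() and get_predict_lbd_fn() return a function pointer specialized for a given TX_SIZE, which the caller then invokes per block. A minimal usage sketch follows; it is illustrative only, assumes the av1 headers included at the top of the file are on the include path, and uses TX_4X4 purely as an example size with placeholder buffer shapes.

/* Sketch only -- illustrates the CfL getter pattern declared above.
 * Buffer shapes are placeholders chosen for a 4x4 luma block. */
#include "config/av1_rtcd.h"

static void cfl_subsample_demo(const uint8_t *luma, int luma_stride,
                               uint16_t *out_q3)
{
    /* On this generic, no-SIMD configuration the getter resolves to the
       plain-C routine (cfl_get_luma_subsampling_420_lbd_c). */
    cfl_subsample_lbd_fn subsample = cfl_get_luma_subsampling_420_lbd(TX_4X4);
    subsample(luma, luma_stride, out_q3);
}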
diff --git a/media/libaom/config/linux/arm/config/aom_config.asm b/media/libaom/config/linux/arm/config/aom_config.asm
new file mode 100644
index 000000000..f5db6f6ca
--- /dev/null
+++ b/media/libaom/config/linux/arm/config/aom_config.asm
@@ -0,0 +1,77 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+.equ ARCH_ARM, 1
+.equ ARCH_MIPS, 0
+.equ ARCH_PPC, 0
+.equ ARCH_X86, 0
+.equ ARCH_X86_64, 0
+.equ CONFIG_2PASS_PARTITION_SEARCH_LVL, 1
+.equ CONFIG_ACCOUNTING, 0
+.equ CONFIG_ANALYZER, 0
+.equ CONFIG_AV1_DECODER, 1
+.equ CONFIG_AV1_ENCODER, 0
+.equ CONFIG_BIG_ENDIAN, 0
+.equ CONFIG_BITSTREAM_DEBUG, 0
+.equ CONFIG_COEFFICIENT_RANGE_CHECKING, 0
+.equ CONFIG_COLLECT_INTER_MODE_RD_STATS, 0
+.equ CONFIG_COLLECT_RD_STATS, 0
+.equ CONFIG_DEBUG, 0
+.equ CONFIG_DENOISE, 1
+.equ CONFIG_DIST_8X8, 0
+.equ CONFIG_ENTROPY_STATS, 0
+.equ CONFIG_FILEOPTIONS, 1
+.equ CONFIG_FIX_GF_LENGTH, 1
+.equ CONFIG_FP_MB_STATS, 0
+.equ CONFIG_GCC, 1
+.equ CONFIG_GCOV, 0
+.equ CONFIG_GLOBAL_MOTION_SEARCH, 1
+.equ CONFIG_GPROF, 0
+.equ CONFIG_INSPECTION, 0
+.equ CONFIG_INTERNAL_STATS, 0
+.equ CONFIG_INTER_STATS_ONLY, 0
+.equ CONFIG_LIBYUV, 0
+.equ CONFIG_LOWBITDEPTH, 1
+.equ CONFIG_MAX_DECODE_PROFILE, 2
+.equ CONFIG_MISMATCH_DEBUG, 0
+.equ CONFIG_MULTITHREAD, 1
+.equ CONFIG_NORMAL_TILE_MODE, 0
+.equ CONFIG_OS_SUPPORT, 1
+.equ CONFIG_PIC, 1
+.equ CONFIG_RD_DEBUG, 0
+.equ CONFIG_REDUCED_ENCODER_BORDER, 0
+.equ CONFIG_RUNTIME_CPU_DETECT, 1
+.equ CONFIG_SHARED, 0
+.equ CONFIG_SHARP_SETTINGS, 0
+.equ CONFIG_SIZE_LIMIT, 0
+.equ CONFIG_SPATIAL_RESAMPLING, 1
+.equ CONFIG_STATIC, 1
+.equ CONFIG_WEBM_IO, 0
+.equ DECODE_HEIGHT_LIMIT, 0
+.equ DECODE_WIDTH_LIMIT, 0
+.equ HAVE_AVX, 0
+.equ HAVE_AVX2, 0
+.equ HAVE_DSPR2, 0
+.equ HAVE_FEXCEPT, 1
+.equ HAVE_MIPS32, 0
+.equ HAVE_MIPS64, 0
+.equ HAVE_MMX, 0
+.equ HAVE_MSA, 0
+.equ HAVE_NEON, 1
+.equ HAVE_SSE, 0
+.equ HAVE_SSE2, 0
+.equ HAVE_SSE3, 0
+.equ HAVE_SSE4_1, 0
+.equ HAVE_SSE4_2, 0
+.equ HAVE_SSSE3, 0
+.equ HAVE_VSX, 0
+.equ HAVE_WXWIDGETS, 0
+.section .note.GNU-stack,"",%progbits
\ No newline at end of file
diff --git a/media/libaom/config/linux/arm/config/aom_config.h b/media/libaom/config/linux/arm/config/aom_config.h
new file mode 100644
index 000000000..850eebfbd
--- /dev/null
+++ b/media/libaom/config/linux/arm/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 1
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 0
+#define ARCH_X86_64 0
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 1
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 0
+#define HAVE_MSA 0
+#define HAVE_NEON 1
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_SSE4_2 0
+#define HAVE_SSSE3 0
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
diff --git a/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..d80f8972b
--- /dev/null
+++ b/media/libaom/config/linux/arm/config/aom_dsp_rtcd.h
@@ -0,0 +1,1508 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+#define aom_blend_a64_mask aom_blend_a64_mask_c
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_horiz aom_convolve8_horiz_c
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve8_vert aom_convolve8_vert_c
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve_copy aom_convolve_copy_c
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x32 aom_dc_128_predictor_16x32_c
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x4 aom_dc_128_predictor_16x4_c
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x64 aom_dc_128_predictor_16x64_c
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x8 aom_dc_128_predictor_16x8_c
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x16 aom_dc_128_predictor_32x16_c
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x64 aom_dc_128_predictor_32x64_c
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x8 aom_dc_128_predictor_32x8_c
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x16 aom_dc_128_predictor_4x16_c
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x8 aom_dc_128_predictor_4x8_c
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_64x16 aom_dc_128_predictor_64x16_c
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_64x32 aom_dc_128_predictor_64x32_c
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_64x64 aom_dc_128_predictor_64x64_c
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x16 aom_dc_128_predictor_8x16_c
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x32 aom_dc_128_predictor_8x32_c
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x4 aom_dc_128_predictor_8x4_c
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x32 aom_dc_left_predictor_16x32_c
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x4 aom_dc_left_predictor_16x4_c
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x64 aom_dc_left_predictor_16x64_c
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x8 aom_dc_left_predictor_16x8_c
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x16 aom_dc_left_predictor_32x16_c
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x64 aom_dc_left_predictor_32x64_c
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x8 aom_dc_left_predictor_32x8_c
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x16 aom_dc_left_predictor_4x16_c
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x8 aom_dc_left_predictor_4x8_c
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_64x16 aom_dc_left_predictor_64x16_c
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_64x32 aom_dc_left_predictor_64x32_c
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_64x64 aom_dc_left_predictor_64x64_c
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x16 aom_dc_left_predictor_8x16_c
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x32 aom_dc_left_predictor_8x32_c
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x4 aom_dc_left_predictor_8x4_c
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x32 aom_dc_predictor_16x32_c
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x4 aom_dc_predictor_16x4_c
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x64 aom_dc_predictor_16x64_c
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x8 aom_dc_predictor_16x8_c
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x16 aom_dc_predictor_32x16_c
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x64 aom_dc_predictor_32x64_c
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x8 aom_dc_predictor_32x8_c
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x16 aom_dc_predictor_4x16_c
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x8 aom_dc_predictor_4x8_c
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_64x16 aom_dc_predictor_64x16_c
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_64x32 aom_dc_predictor_64x32_c
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_64x64 aom_dc_predictor_64x64_c
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x16 aom_dc_predictor_8x16_c
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x32 aom_dc_predictor_8x32_c
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x4 aom_dc_predictor_8x4_c
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x32 aom_dc_top_predictor_16x32_c
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x4 aom_dc_top_predictor_16x4_c
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x64 aom_dc_top_predictor_16x64_c
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x8 aom_dc_top_predictor_16x8_c
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x16 aom_dc_top_predictor_32x16_c
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x64 aom_dc_top_predictor_32x64_c
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x8 aom_dc_top_predictor_32x8_c
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x16 aom_dc_top_predictor_4x16_c
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x8 aom_dc_top_predictor_4x8_c
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_64x16 aom_dc_top_predictor_64x16_c
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_64x32 aom_dc_top_predictor_64x32_c
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_64x64 aom_dc_top_predictor_64x64_c
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x16 aom_dc_top_predictor_8x16_c
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x32 aom_dc_top_predictor_8x32_c
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x4 aom_dc_top_predictor_8x4_c
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x32 aom_h_predictor_16x32_c
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x4 aom_h_predictor_16x4_c
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x64 aom_h_predictor_16x64_c
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x8 aom_h_predictor_16x8_c
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x16 aom_h_predictor_32x16_c
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x64 aom_h_predictor_32x64_c
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x8 aom_h_predictor_32x8_c
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x16 aom_h_predictor_4x16_c
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x8 aom_h_predictor_4x8_c
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x16 aom_h_predictor_64x16_c
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x32 aom_h_predictor_64x32_c
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x64 aom_h_predictor_64x64_c
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x16 aom_h_predictor_8x16_c
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x32 aom_h_predictor_8x32_c
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x4 aom_h_predictor_8x4_c
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+#define aom_highbd_blend_a64_hmask aom_highbd_blend_a64_hmask_c
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+#define aom_highbd_blend_a64_mask aom_highbd_blend_a64_mask_c
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+#define aom_highbd_blend_a64_vmask aom_highbd_blend_a64_vmask_c
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_horiz aom_highbd_convolve8_horiz_c
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve8_vert aom_highbd_convolve8_vert_c
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define aom_highbd_convolve_copy aom_highbd_convolve_copy_c
+
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x16 aom_highbd_dc_128_predictor_16x16_c
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x32 aom_highbd_dc_128_predictor_16x32_c
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x8 aom_highbd_dc_128_predictor_16x8_c
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x16 aom_highbd_dc_128_predictor_32x16_c
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x32 aom_highbd_dc_128_predictor_32x32_c
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x4 aom_highbd_dc_128_predictor_4x4_c
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x8 aom_highbd_dc_128_predictor_4x8_c
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x16 aom_highbd_dc_128_predictor_8x16_c
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x4 aom_highbd_dc_128_predictor_8x4_c
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x8 aom_highbd_dc_128_predictor_8x8_c
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x16 aom_highbd_dc_left_predictor_16x16_c
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x32 aom_highbd_dc_left_predictor_16x32_c
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x8 aom_highbd_dc_left_predictor_16x8_c
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x16 aom_highbd_dc_left_predictor_32x16_c
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x32 aom_highbd_dc_left_predictor_32x32_c
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x4 aom_highbd_dc_left_predictor_4x4_c
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x8 aom_highbd_dc_left_predictor_4x8_c
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x16 aom_highbd_dc_left_predictor_8x16_c
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x4 aom_highbd_dc_left_predictor_8x4_c
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x8 aom_highbd_dc_left_predictor_8x8_c
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x32 aom_highbd_dc_predictor_16x32_c
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x8 aom_highbd_dc_predictor_16x8_c
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x16 aom_highbd_dc_predictor_32x16_c
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x8 aom_highbd_dc_predictor_4x8_c
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_64x64_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_64x64)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x16 aom_highbd_dc_predictor_8x16_c
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x4 aom_highbd_dc_predictor_8x4_c
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x16 aom_highbd_dc_top_predictor_16x16_c
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x32 aom_highbd_dc_top_predictor_16x32_c
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x8 aom_highbd_dc_top_predictor_16x8_c
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x16 aom_highbd_dc_top_predictor_32x16_c
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x32 aom_highbd_dc_top_predictor_32x32_c
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x4 aom_highbd_dc_top_predictor_4x4_c
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x8 aom_highbd_dc_top_predictor_4x8_c
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x16 aom_highbd_dc_top_predictor_8x16_c
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x4 aom_highbd_dc_top_predictor_8x4_c
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_c
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x16 aom_highbd_h_predictor_16x16_c
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x32 aom_highbd_h_predictor_16x32_c
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x8 aom_highbd_h_predictor_16x8_c
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x16 aom_highbd_h_predictor_32x16_c
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x32 aom_highbd_h_predictor_32x32_c
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x4 aom_highbd_h_predictor_4x4_c
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x8 aom_highbd_h_predictor_4x8_c
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x16 aom_highbd_h_predictor_8x16_c
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x4 aom_highbd_h_predictor_8x4_c
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x8 aom_highbd_h_predictor_8x8_c
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_c
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_14_dual aom_highbd_lpf_horizontal_14_dual_c
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_c
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_4_dual aom_highbd_lpf_horizontal_4_dual_c
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_c
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_c
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_c
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_8_dual aom_highbd_lpf_horizontal_8_dual_c
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_c
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_14_dual aom_highbd_lpf_vertical_14_dual_c
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_c
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_4_dual aom_highbd_lpf_vertical_4_dual_c
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_c
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_c
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_c
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_8_dual aom_highbd_lpf_vertical_8_dual_c
+
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x16 aom_highbd_v_predictor_16x16_c
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x32 aom_highbd_v_predictor_16x32_c
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x8 aom_highbd_v_predictor_16x8_c
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x16 aom_highbd_v_predictor_32x16_c
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x32 aom_highbd_v_predictor_32x32_c
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x4 aom_highbd_v_predictor_4x4_c
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x8 aom_highbd_v_predictor_4x8_c
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x16 aom_highbd_v_predictor_8x16_c
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x4 aom_highbd_v_predictor_8x4_c
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_c
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_neon(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_14_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_c
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_c
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_6_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_c
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_c
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_14_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_c
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_c
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_6_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_c
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_c
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x16 aom_paeth_predictor_16x16_c
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x32 aom_paeth_predictor_16x32_c
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x4 aom_paeth_predictor_16x4_c
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x64 aom_paeth_predictor_16x64_c
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_16x8 aom_paeth_predictor_16x8_c
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_32x16 aom_paeth_predictor_32x16_c
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_32x32 aom_paeth_predictor_32x32_c
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_32x64 aom_paeth_predictor_32x64_c
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_32x8 aom_paeth_predictor_32x8_c
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_4x16 aom_paeth_predictor_4x16_c
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_4x4 aom_paeth_predictor_4x4_c
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_4x8 aom_paeth_predictor_4x8_c
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_64x16 aom_paeth_predictor_64x16_c
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_64x32 aom_paeth_predictor_64x32_c
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_64x64 aom_paeth_predictor_64x64_c
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_8x16 aom_paeth_predictor_8x16_c
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_8x32 aom_paeth_predictor_8x32_c
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_8x4 aom_paeth_predictor_8x4_c
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_8x8 aom_paeth_predictor_8x8_c
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x16 aom_smooth_h_predictor_16x16_c
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x32 aom_smooth_h_predictor_16x32_c
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x4 aom_smooth_h_predictor_16x4_c
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x64 aom_smooth_h_predictor_16x64_c
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_16x8 aom_smooth_h_predictor_16x8_c
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_32x16 aom_smooth_h_predictor_32x16_c
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_32x32 aom_smooth_h_predictor_32x32_c
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_32x64 aom_smooth_h_predictor_32x64_c
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_32x8 aom_smooth_h_predictor_32x8_c
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_4x16 aom_smooth_h_predictor_4x16_c
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_4x4 aom_smooth_h_predictor_4x4_c
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_4x8 aom_smooth_h_predictor_4x8_c
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_64x16 aom_smooth_h_predictor_64x16_c
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_64x32 aom_smooth_h_predictor_64x32_c
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_64x64 aom_smooth_h_predictor_64x64_c
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_8x16 aom_smooth_h_predictor_8x16_c
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_8x32 aom_smooth_h_predictor_8x32_c
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_8x4 aom_smooth_h_predictor_8x4_c
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_8x8 aom_smooth_h_predictor_8x8_c
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x16 aom_smooth_predictor_16x16_c
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x32 aom_smooth_predictor_16x32_c
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x4 aom_smooth_predictor_16x4_c
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x64 aom_smooth_predictor_16x64_c
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_16x8 aom_smooth_predictor_16x8_c
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_32x16 aom_smooth_predictor_32x16_c
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_32x32 aom_smooth_predictor_32x32_c
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_32x64 aom_smooth_predictor_32x64_c
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_32x8 aom_smooth_predictor_32x8_c
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_4x16 aom_smooth_predictor_4x16_c
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_4x4 aom_smooth_predictor_4x4_c
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_4x8 aom_smooth_predictor_4x8_c
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_64x16 aom_smooth_predictor_64x16_c
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_64x32 aom_smooth_predictor_64x32_c
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_64x64 aom_smooth_predictor_64x64_c
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_8x16 aom_smooth_predictor_8x16_c
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_8x32 aom_smooth_predictor_8x32_c
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_8x4 aom_smooth_predictor_8x4_c
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_8x8 aom_smooth_predictor_8x8_c
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x16 aom_smooth_v_predictor_16x16_c
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x32 aom_smooth_v_predictor_16x32_c
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x4 aom_smooth_v_predictor_16x4_c
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x64 aom_smooth_v_predictor_16x64_c
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_16x8 aom_smooth_v_predictor_16x8_c
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_32x16 aom_smooth_v_predictor_32x16_c
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_32x32 aom_smooth_v_predictor_32x32_c
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_32x64 aom_smooth_v_predictor_32x64_c
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_32x8 aom_smooth_v_predictor_32x8_c
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_4x16 aom_smooth_v_predictor_4x16_c
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_4x4 aom_smooth_v_predictor_4x4_c
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_4x8 aom_smooth_v_predictor_4x8_c
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_64x16 aom_smooth_v_predictor_64x16_c
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_64x32 aom_smooth_v_predictor_64x32_c
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_64x64 aom_smooth_v_predictor_64x64_c
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_8x16 aom_smooth_v_predictor_8x16_c
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_8x32 aom_smooth_v_predictor_8x32_c
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_8x4 aom_smooth_v_predictor_8x4_c
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_8x8 aom_smooth_v_predictor_8x8_c
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x32 aom_v_predictor_16x32_c
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x4 aom_v_predictor_16x4_c
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x64 aom_v_predictor_16x64_c
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x8 aom_v_predictor_16x8_c
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x16 aom_v_predictor_32x16_c
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x64 aom_v_predictor_32x64_c
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x8 aom_v_predictor_32x8_c
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x16 aom_v_predictor_4x16_c
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x8 aom_v_predictor_4x8_c
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_64x16 aom_v_predictor_64x16_c
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_64x32 aom_v_predictor_64x32_c
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_64x64 aom_v_predictor_64x64_c
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x16 aom_v_predictor_8x16_c
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x32 aom_v_predictor_8x32_c
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x4 aom_v_predictor_8x4_c
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
+RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+
+void aom_dsp_rtcd(void);
+
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+#include "aom_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = aom_arm_cpu_caps();
+
+ (void)flags;
+
+ aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+ if (flags & HAS_NEON) aom_blend_a64_hmask = aom_blend_a64_hmask_neon;
+ aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+ if (flags & HAS_NEON) aom_blend_a64_vmask = aom_blend_a64_vmask_neon;
+ aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_c;
+ if (flags & HAS_NEON) aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_neon;
+ aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_c;
+ if (flags & HAS_NEON) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_neon;
+ aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_c;
+ if (flags & HAS_NEON) aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_neon;
+ aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_c;
+ if (flags & HAS_NEON) aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_neon;
+ aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_c;
+ if (flags & HAS_NEON) aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_neon;
+ aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_c;
+ if (flags & HAS_NEON) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_neon;
+ aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_c;
+ if (flags & HAS_NEON) aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_neon;
+ aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_c;
+ if (flags & HAS_NEON) aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_neon;
+ aom_dc_predictor_16x16 = aom_dc_predictor_16x16_c;
+ if (flags & HAS_NEON) aom_dc_predictor_16x16 = aom_dc_predictor_16x16_neon;
+ aom_dc_predictor_32x32 = aom_dc_predictor_32x32_c;
+ if (flags & HAS_NEON) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_neon;
+ aom_dc_predictor_4x4 = aom_dc_predictor_4x4_c;
+ if (flags & HAS_NEON) aom_dc_predictor_4x4 = aom_dc_predictor_4x4_neon;
+ aom_dc_predictor_8x8 = aom_dc_predictor_8x8_c;
+ if (flags & HAS_NEON) aom_dc_predictor_8x8 = aom_dc_predictor_8x8_neon;
+ aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_c;
+ if (flags & HAS_NEON) aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_neon;
+ aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_c;
+ if (flags & HAS_NEON) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_neon;
+ aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_c;
+ if (flags & HAS_NEON) aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_neon;
+ aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_c;
+ if (flags & HAS_NEON) aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_neon;
+ aom_h_predictor_16x16 = aom_h_predictor_16x16_c;
+ if (flags & HAS_NEON) aom_h_predictor_16x16 = aom_h_predictor_16x16_neon;
+ aom_h_predictor_32x32 = aom_h_predictor_32x32_c;
+ if (flags & HAS_NEON) aom_h_predictor_32x32 = aom_h_predictor_32x32_neon;
+ aom_h_predictor_4x4 = aom_h_predictor_4x4_c;
+ if (flags & HAS_NEON) aom_h_predictor_4x4 = aom_h_predictor_4x4_neon;
+ aom_h_predictor_8x8 = aom_h_predictor_8x8_c;
+ if (flags & HAS_NEON) aom_h_predictor_8x8 = aom_h_predictor_8x8_neon;
+ aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_c;
+ if (flags & HAS_NEON) aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_neon;
+ aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_c;
+ if (flags & HAS_NEON) aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_neon;
+ aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_c;
+ if (flags & HAS_NEON) aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_neon;
+ aom_highbd_dc_predictor_64x64 = aom_highbd_dc_predictor_64x64_c;
+ if (flags & HAS_NEON) aom_highbd_dc_predictor_64x64 = aom_highbd_dc_predictor_64x64_neon;
+ aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_c;
+ if (flags & HAS_NEON) aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_neon;
+ aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+ if (flags & HAS_NEON) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_neon;
+ aom_lpf_horizontal_14 = aom_lpf_horizontal_14_c;
+ if (flags & HAS_NEON) aom_lpf_horizontal_14 = aom_lpf_horizontal_14_neon;
+ aom_lpf_horizontal_4 = aom_lpf_horizontal_4_c;
+ if (flags & HAS_NEON) aom_lpf_horizontal_4 = aom_lpf_horizontal_4_neon;
+ aom_lpf_horizontal_6 = aom_lpf_horizontal_6_c;
+ if (flags & HAS_NEON) aom_lpf_horizontal_6 = aom_lpf_horizontal_6_neon;
+ aom_lpf_horizontal_8 = aom_lpf_horizontal_8_c;
+ if (flags & HAS_NEON) aom_lpf_horizontal_8 = aom_lpf_horizontal_8_neon;
+ aom_lpf_vertical_14 = aom_lpf_vertical_14_c;
+ if (flags & HAS_NEON) aom_lpf_vertical_14 = aom_lpf_vertical_14_neon;
+ aom_lpf_vertical_4 = aom_lpf_vertical_4_c;
+ if (flags & HAS_NEON) aom_lpf_vertical_4 = aom_lpf_vertical_4_neon;
+ aom_lpf_vertical_6 = aom_lpf_vertical_6_c;
+ if (flags & HAS_NEON) aom_lpf_vertical_6 = aom_lpf_vertical_6_neon;
+ aom_lpf_vertical_8 = aom_lpf_vertical_8_c;
+ if (flags & HAS_NEON) aom_lpf_vertical_8 = aom_lpf_vertical_8_neon;
+ aom_v_predictor_16x16 = aom_v_predictor_16x16_c;
+ if (flags & HAS_NEON) aom_v_predictor_16x16 = aom_v_predictor_16x16_neon;
+ aom_v_predictor_32x32 = aom_v_predictor_32x32_c;
+ if (flags & HAS_NEON) aom_v_predictor_32x32 = aom_v_predictor_32x32_neon;
+ aom_v_predictor_4x4 = aom_v_predictor_4x4_c;
+ if (flags & HAS_NEON) aom_v_predictor_4x4 = aom_v_predictor_4x4_neon;
+ aom_v_predictor_8x8 = aom_v_predictor_8x8_c;
+ if (flags & HAS_NEON) aom_v_predictor_8x8 = aom_v_predictor_8x8_neon;
+ av1_round_shift_array = av1_round_shift_array_c;
+ if (flags & HAS_NEON) av1_round_shift_array = av1_round_shift_array_neon;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/linux/arm/config/aom_scale_rtcd.h b/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..165d73093
--- /dev/null
+++ b/media/libaom/config/linux/arm/config/aom_scale_rtcd.h
@@ -0,0 +1,90 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+#include "aom_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = aom_arm_cpu_caps();
+
+ (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/linux/arm/config/av1_rtcd.h b/media/libaom/config/linux/arm/config/av1_rtcd.h
new file mode 100644
index 000000000..b54b4514d
--- /dev/null
+++ b/media/libaom/config/linux/arm/config/av1_rtcd.h
@@ -0,0 +1,421 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_neon(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+#define av1_build_compound_diffwtd_mask av1_build_compound_diffwtd_mask_c
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_neon(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+#define av1_build_compound_diffwtd_mask_highbd av1_build_compound_diffwtd_mask_highbd_c
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+#define av1_convolve_2d_scale av1_convolve_2d_scale_c
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+#define av1_convolve_horiz_rs av1_convolve_horiz_rs_c
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+#define av1_filter_intra_edge av1_filter_intra_edge_c
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+#define av1_filter_intra_edge_high av1_filter_intra_edge_high_c
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+#define av1_filter_intra_predictor av1_filter_intra_predictor_c
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_c
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_c
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_copy_sr av1_highbd_convolve_2d_copy_sr_c
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_scale av1_highbd_convolve_2d_scale_c
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_2d_sr av1_highbd_convolve_2d_sr_c
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+#define av1_highbd_convolve_horiz_rs av1_highbd_convolve_horiz_rs_c
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_x_sr av1_highbd_convolve_x_sr_c
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_4x4 av1_highbd_inv_txfm_add_4x4_c
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+#define av1_highbd_inv_txfm_add_8x8 av1_highbd_inv_txfm_add_8x8_c
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+#define av1_highbd_warp_affine av1_highbd_warp_affine_c
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+#define av1_highbd_wiener_convolve_add_src av1_highbd_wiener_convolve_add_src_c
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x4 av1_inv_txfm2d_add_4x4_c
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x8 av1_inv_txfm2d_add_8x8_c
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_neon(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+#define av1_upsample_intra_edge av1_upsample_intra_edge_c
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+#define av1_upsample_intra_edge_high av1_upsample_intra_edge_high_c
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_neon(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_neon(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+
+void av1_rtcd(void);
+
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+#include "aom_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = aom_arm_cpu_caps();
+
+ (void)flags;
+
+ apply_selfguided_restoration = apply_selfguided_restoration_c;
+ if (flags & HAS_NEON) apply_selfguided_restoration = apply_selfguided_restoration_neon;
+ av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+ if (flags & HAS_NEON) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_neon;
+ av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_c;
+ if (flags & HAS_NEON) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_neon;
+ av1_convolve_2d_sr = av1_convolve_2d_sr_c;
+ if (flags & HAS_NEON) av1_convolve_2d_sr = av1_convolve_2d_sr_neon;
+ av1_convolve_x_sr = av1_convolve_x_sr_c;
+ if (flags & HAS_NEON) av1_convolve_x_sr = av1_convolve_x_sr_neon;
+ av1_convolve_y_sr = av1_convolve_y_sr_c;
+ if (flags & HAS_NEON) av1_convolve_y_sr = av1_convolve_y_sr_neon;
+ av1_inv_txfm_add = av1_inv_txfm_add_c;
+ if (flags & HAS_NEON) av1_inv_txfm_add = av1_inv_txfm_add_neon;
+ av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+ if (flags & HAS_NEON) av1_jnt_convolve_2d = av1_jnt_convolve_2d_neon;
+ av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_c;
+ if (flags & HAS_NEON) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_neon;
+ av1_jnt_convolve_x = av1_jnt_convolve_x_c;
+ if (flags & HAS_NEON) av1_jnt_convolve_x = av1_jnt_convolve_x_neon;
+ av1_jnt_convolve_y = av1_jnt_convolve_y_c;
+ if (flags & HAS_NEON) av1_jnt_convolve_y = av1_jnt_convolve_y_neon;
+ av1_selfguided_restoration = av1_selfguided_restoration_c;
+ if (flags & HAS_NEON) av1_selfguided_restoration = av1_selfguided_restoration_neon;
+ av1_warp_affine = av1_warp_affine_c;
+ if (flags & HAS_NEON) av1_warp_affine = av1_warp_affine_neon;
+ av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_c;
+ if (flags & HAS_NEON) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_neon;
+ cdef_filter_block = cdef_filter_block_c;
+ if (flags & HAS_NEON) cdef_filter_block = cdef_filter_block_neon;
+ cdef_find_dir = cdef_find_dir_c;
+ if (flags & HAS_NEON) cdef_find_dir = cdef_find_dir_neon;
+ cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+ if (flags & HAS_NEON) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_neon;
+ cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+ if (flags & HAS_NEON) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_neon;
+ cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+ if (flags & HAS_NEON) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_neon;
+ cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+ if (flags & HAS_NEON) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_neon;
+ cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+ if (flags & HAS_NEON) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_neon;
+ cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+ if (flags & HAS_NEON) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_neon;
+ copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
+ if (flags & HAS_NEON) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_neon;
+ copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
+ if (flags & HAS_NEON) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_neon;
+ get_predict_hbd_fn = get_predict_hbd_fn_c;
+ if (flags & HAS_NEON) get_predict_hbd_fn = get_predict_hbd_fn_neon;
+ get_predict_lbd_fn = get_predict_lbd_fn_c;
+ if (flags & HAS_NEON) get_predict_lbd_fn = get_predict_lbd_fn_neon;
+ get_subtract_average_fn = get_subtract_average_fn_c;
+ if (flags & HAS_NEON) get_subtract_average_fn = get_subtract_average_fn_neon;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
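
[Editor's note on the generated header above: it follows libaom's runtime CPU detection (RTCD) pattern. Each kernel gets a plain-C fallback and, where available, a NEON variant, plus an RTCD_EXTERN function pointer; setup_rtcd_internal() then picks the implementation from aom_arm_cpu_caps(). As a minimal, self-contained sketch of that dispatch pattern only — the demo_* names and DEMO_HAS_SIMD flag below are placeholders, not libaom symbols — the idea looks like this:]

    #include <stdio.h>

    /* Hypothetical kernels standing in for the *_c / *_neon pairs above. */
    static void demo_kernel_c(int x)    { printf("C fallback: %d\n", x); }
    static void demo_kernel_simd(int x) { printf("SIMD path: %d\n", x); }

    /* Function pointer, analogous to the RTCD_EXTERN declarations. */
    static void (*demo_kernel)(int x);

    #define DEMO_HAS_SIMD 0x1  /* stands in for HAS_NEON */

    /* Analogous to setup_rtcd_internal(): default to C, upgrade if the CPU allows. */
    static void demo_setup_rtcd(int cpu_flags) {
      demo_kernel = demo_kernel_c;
      if (cpu_flags & DEMO_HAS_SIMD) demo_kernel = demo_kernel_simd;
    }

    int main(void) {
      demo_setup_rtcd(DEMO_HAS_SIMD);  /* in libaom this runs once via av1_rtcd() */
      demo_kernel(42);                 /* callers always go through the pointer */
      return 0;
    }

[Callers never reference a *_neon symbol directly; they always call through the pointer, so the same binary runs on CPUs with and without the extension.]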
diff --git a/media/libaom/config/linux/ia32/config/aom_config.asm b/media/libaom/config/linux/ia32/config/aom_config.asm
new file mode 100644
index 000000000..128f85ec2
--- /dev/null
+++ b/media/libaom/config/linux/ia32/config/aom_config.asm
@@ -0,0 +1,76 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+ARCH_ARM equ 0
+ARCH_MIPS equ 0
+ARCH_PPC equ 0
+ARCH_X86 equ 1
+ARCH_X86_64 equ 0
+CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_ACCOUNTING equ 0
+CONFIG_ANALYZER equ 0
+CONFIG_AV1_DECODER equ 1
+CONFIG_AV1_ENCODER equ 0
+CONFIG_BIG_ENDIAN equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_RD_STATS equ 0
+CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 1
+CONFIG_DIST_8X8 equ 0
+CONFIG_ENTROPY_STATS equ 0
+CONFIG_FILEOPTIONS equ 1
+CONFIG_FIX_GF_LENGTH equ 1
+CONFIG_FP_MB_STATS equ 0
+CONFIG_GCC equ 1
+CONFIG_GCOV equ 0
+CONFIG_GLOBAL_MOTION_SEARCH equ 1
+CONFIG_GPROF equ 0
+CONFIG_INSPECTION equ 0
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_INTER_STATS_ONLY equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_LOWBITDEPTH equ 1
+CONFIG_MAX_DECODE_PROFILE equ 2
+CONFIG_MISMATCH_DEBUG equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_NORMAL_TILE_MODE equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_PIC equ 1
+CONFIG_RD_DEBUG equ 0
+CONFIG_REDUCED_ENCODER_BORDER equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_SHARED equ 0
+CONFIG_SHARP_SETTINGS equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_STATIC equ 1
+CONFIG_WEBM_IO equ 0
+DECODE_HEIGHT_LIMIT equ 0
+DECODE_WIDTH_LIMIT equ 0
+HAVE_AVX equ 1
+HAVE_AVX2 equ 1
+HAVE_DSPR2 equ 0
+HAVE_FEXCEPT equ 1
+HAVE_MIPS32 equ 0
+HAVE_MIPS64 equ 0
+HAVE_MMX equ 1
+HAVE_MSA equ 0
+HAVE_NEON equ 0
+HAVE_SSE equ 1
+HAVE_SSE2 equ 1
+HAVE_SSE3 equ 1
+HAVE_SSE4_1 equ 1
+HAVE_SSE4_2 equ 1
+HAVE_SSSE3 equ 1
+HAVE_VSX equ 0
+HAVE_WXWIDGETS equ 0
diff --git a/media/libaom/config/linux/ia32/config/aom_config.h b/media/libaom/config/linux/ia32/config/aom_config.h
new file mode 100644
index 000000000..02f6f7243
--- /dev/null
+++ b/media/libaom/config/linux/ia32/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 0
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 1
+#define ARCH_X86_64 0
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 1
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 1
+#define HAVE_AVX2 1
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 1
+#define HAVE_MSA 0
+#define HAVE_NEON 0
+#define HAVE_SSE 1
+#define HAVE_SSE2 1
+#define HAVE_SSE3 1
+#define HAVE_SSE4_1 1
+#define HAVE_SSE4_2 1
+#define HAVE_SSSE3 1
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
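
[Editor's note: the ARCH_*/HAVE_*/CONFIG_* constants in this generated aom_config.h gate which SIMD sources are compiled and whether runtime dispatch is built (CONFIG_RUNTIME_CPU_DETECT). A small, hedged illustration of that gating — demo_pick_path() is a placeholder, not a libaom function, and it assumes this generated config directory is on the include path:]

    #include "config/aom_config.h"  /* the generated header above */

    int demo_pick_path(void) {
    #if HAVE_SSE2 && CONFIG_RUNTIME_CPU_DETECT
      /* SSE2 code is compiled in; the RTCD tables decide at runtime whether to use it. */
      return 2;
    #elif HAVE_SSE2
      /* SSE2 assumed unconditionally available at build time. */
      return 1;
    #else
      /* Portable C-only build. */
      return 0;
    #endif
    }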
diff --git a/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h b/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..5b3762f10
--- /dev/null
+++ b/media/libaom/config/linux/ia32/config/aom_dsp_rtcd.h
@@ -0,0 +1,2379 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
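+/*
+ * The aom_highbd_* declarations below are the high-bit-depth counterparts
+ * of the predictors above: sample buffers are 16-bit (declared as
+ * uint16_t, or as uint8_t pointers following the library's byte-pointer
+ * convention for high-bit-depth storage) and each function takes a
+ * trailing bit-depth argument `bd` (typically 8, 10 or 12).  Dispatch
+ * works as sketched above; entries without a SIMD variant are #define'd
+ * straight to the `_c` implementation.
+ */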
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
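+/*
+ * High-bit-depth loop-filter (deblocking) kernels.  The numeric suffix
+ * (_4, _6, _8, _14) reflects the filter length applied across a block
+ * edge, `pitch` is the stride of the 16-bit sample buffer, and
+ * blimit/limit/thresh are the per-edge threshold vectors used in the
+ * filtering decision.  The `_dual` variants filter two adjacent edge
+ * segments in one call and therefore take two sets of thresholds.
+ */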
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_6)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_6_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_6)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_6_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
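+/*
+ * The high-bit-depth Paeth and smooth intra predictors that follow have
+ * no SIMD implementations in this build, so every symbol is #define'd
+ * directly to its `_c` reference function and no RTCD pointer is needed.
+ */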
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
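+/* Note on the pattern used throughout this generated header: block sizes that
+ * have a SIMD implementation are reached through RTCD_EXTERN function pointers
+ * resolved at runtime, while sizes with only a C routine collapse to it
+ * directly via #define. */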
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_14_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_6_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_14_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_6_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+
+void aom_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+ aom_blend_a64_mask = aom_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+ aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+ aom_convolve8_horiz = aom_convolve8_horiz_c;
+ if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+ aom_convolve8_vert = aom_convolve8_vert_c;
+ if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+ aom_convolve_copy = aom_convolve_copy_c;
+ if (flags & HAS_SSE2) aom_convolve_copy = aom_convolve_copy_sse2;
+ aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_sse2;
+ aom_dc_128_predictor_16x32 = aom_dc_128_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x32 = aom_dc_128_predictor_16x32_sse2;
+ aom_dc_128_predictor_16x4 = aom_dc_128_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x4 = aom_dc_128_predictor_16x4_sse2;
+ aom_dc_128_predictor_16x64 = aom_dc_128_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x64 = aom_dc_128_predictor_16x64_sse2;
+ aom_dc_128_predictor_16x8 = aom_dc_128_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x8 = aom_dc_128_predictor_16x8_sse2;
+ aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+ aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+ aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+ aom_dc_128_predictor_32x8 = aom_dc_128_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x8 = aom_dc_128_predictor_32x8_sse2;
+ aom_dc_128_predictor_4x16 = aom_dc_128_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x16 = aom_dc_128_predictor_4x16_sse2;
+ aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_sse2;
+ aom_dc_128_predictor_4x8 = aom_dc_128_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x8 = aom_dc_128_predictor_4x8_sse2;
+ aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+ aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+ aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+ aom_dc_128_predictor_8x16 = aom_dc_128_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x16 = aom_dc_128_predictor_8x16_sse2;
+ aom_dc_128_predictor_8x32 = aom_dc_128_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x32 = aom_dc_128_predictor_8x32_sse2;
+ aom_dc_128_predictor_8x4 = aom_dc_128_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x4 = aom_dc_128_predictor_8x4_sse2;
+ aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_sse2;
+ aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_sse2;
+ aom_dc_left_predictor_16x32 = aom_dc_left_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x32 = aom_dc_left_predictor_16x32_sse2;
+ aom_dc_left_predictor_16x4 = aom_dc_left_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x4 = aom_dc_left_predictor_16x4_sse2;
+ aom_dc_left_predictor_16x64 = aom_dc_left_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x64 = aom_dc_left_predictor_16x64_sse2;
+ aom_dc_left_predictor_16x8 = aom_dc_left_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x8 = aom_dc_left_predictor_16x8_sse2;
+ aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+ aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+ aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+ aom_dc_left_predictor_32x8 = aom_dc_left_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x8 = aom_dc_left_predictor_32x8_sse2;
+ aom_dc_left_predictor_4x16 = aom_dc_left_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x16 = aom_dc_left_predictor_4x16_sse2;
+ aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_sse2;
+ aom_dc_left_predictor_4x8 = aom_dc_left_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x8 = aom_dc_left_predictor_4x8_sse2;
+ aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+ aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+ aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+ aom_dc_left_predictor_8x16 = aom_dc_left_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x16 = aom_dc_left_predictor_8x16_sse2;
+ aom_dc_left_predictor_8x32 = aom_dc_left_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x32 = aom_dc_left_predictor_8x32_sse2;
+ aom_dc_left_predictor_8x4 = aom_dc_left_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x4 = aom_dc_left_predictor_8x4_sse2;
+ aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_sse2;
+ aom_dc_predictor_16x16 = aom_dc_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x16 = aom_dc_predictor_16x16_sse2;
+ aom_dc_predictor_16x32 = aom_dc_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x32 = aom_dc_predictor_16x32_sse2;
+ aom_dc_predictor_16x4 = aom_dc_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x4 = aom_dc_predictor_16x4_sse2;
+ aom_dc_predictor_16x64 = aom_dc_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x64 = aom_dc_predictor_16x64_sse2;
+ aom_dc_predictor_16x8 = aom_dc_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x8 = aom_dc_predictor_16x8_sse2;
+ aom_dc_predictor_32x16 = aom_dc_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+ aom_dc_predictor_32x32 = aom_dc_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+ aom_dc_predictor_32x64 = aom_dc_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+ aom_dc_predictor_32x8 = aom_dc_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x8 = aom_dc_predictor_32x8_sse2;
+ aom_dc_predictor_4x16 = aom_dc_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x16 = aom_dc_predictor_4x16_sse2;
+ aom_dc_predictor_4x4 = aom_dc_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x4 = aom_dc_predictor_4x4_sse2;
+ aom_dc_predictor_4x8 = aom_dc_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x8 = aom_dc_predictor_4x8_sse2;
+ aom_dc_predictor_64x16 = aom_dc_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+ aom_dc_predictor_64x32 = aom_dc_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+ aom_dc_predictor_64x64 = aom_dc_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+ aom_dc_predictor_8x16 = aom_dc_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x16 = aom_dc_predictor_8x16_sse2;
+ aom_dc_predictor_8x32 = aom_dc_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x32 = aom_dc_predictor_8x32_sse2;
+ aom_dc_predictor_8x4 = aom_dc_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x4 = aom_dc_predictor_8x4_sse2;
+ aom_dc_predictor_8x8 = aom_dc_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x8 = aom_dc_predictor_8x8_sse2;
+ aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_sse2;
+ aom_dc_top_predictor_16x32 = aom_dc_top_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x32 = aom_dc_top_predictor_16x32_sse2;
+ aom_dc_top_predictor_16x4 = aom_dc_top_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x4 = aom_dc_top_predictor_16x4_sse2;
+ aom_dc_top_predictor_16x64 = aom_dc_top_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x64 = aom_dc_top_predictor_16x64_sse2;
+ aom_dc_top_predictor_16x8 = aom_dc_top_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x8 = aom_dc_top_predictor_16x8_sse2;
+ aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+ aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+ aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+ aom_dc_top_predictor_32x8 = aom_dc_top_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x8 = aom_dc_top_predictor_32x8_sse2;
+ aom_dc_top_predictor_4x16 = aom_dc_top_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x16 = aom_dc_top_predictor_4x16_sse2;
+ aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_sse2;
+ aom_dc_top_predictor_4x8 = aom_dc_top_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x8 = aom_dc_top_predictor_4x8_sse2;
+ aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+ aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+ aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+ aom_dc_top_predictor_8x16 = aom_dc_top_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x16 = aom_dc_top_predictor_8x16_sse2;
+ aom_dc_top_predictor_8x32 = aom_dc_top_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x32 = aom_dc_top_predictor_8x32_sse2;
+ aom_dc_top_predictor_8x4 = aom_dc_top_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x4 = aom_dc_top_predictor_8x4_sse2;
+ aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_sse2;
+ aom_h_predictor_16x16 = aom_h_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x16 = aom_h_predictor_16x16_sse2;
+ aom_h_predictor_16x32 = aom_h_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x32 = aom_h_predictor_16x32_sse2;
+ aom_h_predictor_16x4 = aom_h_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x4 = aom_h_predictor_16x4_sse2;
+ aom_h_predictor_16x64 = aom_h_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x64 = aom_h_predictor_16x64_sse2;
+ aom_h_predictor_16x8 = aom_h_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x8 = aom_h_predictor_16x8_sse2;
+ aom_h_predictor_32x16 = aom_h_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x16 = aom_h_predictor_32x16_sse2;
+ aom_h_predictor_32x32 = aom_h_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+ aom_h_predictor_32x64 = aom_h_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x64 = aom_h_predictor_32x64_sse2;
+ aom_h_predictor_32x8 = aom_h_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x8 = aom_h_predictor_32x8_sse2;
+ aom_h_predictor_4x16 = aom_h_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x16 = aom_h_predictor_4x16_sse2;
+ aom_h_predictor_4x4 = aom_h_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x4 = aom_h_predictor_4x4_sse2;
+ aom_h_predictor_4x8 = aom_h_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x8 = aom_h_predictor_4x8_sse2;
+ aom_h_predictor_64x16 = aom_h_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x16 = aom_h_predictor_64x16_sse2;
+ aom_h_predictor_64x32 = aom_h_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x32 = aom_h_predictor_64x32_sse2;
+ aom_h_predictor_64x64 = aom_h_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x64 = aom_h_predictor_64x64_sse2;
+ aom_h_predictor_8x16 = aom_h_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x16 = aom_h_predictor_8x16_sse2;
+ aom_h_predictor_8x32 = aom_h_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x32 = aom_h_predictor_8x32_sse2;
+ aom_h_predictor_8x4 = aom_h_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x4 = aom_h_predictor_8x4_sse2;
+ aom_h_predictor_8x8 = aom_h_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x8 = aom_h_predictor_8x8_sse2;
+ aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+ aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+ aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+ aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_c;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+ aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_c;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+ aom_highbd_convolve_copy = aom_highbd_convolve_copy_c;
+ if (flags & HAS_SSE2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+ aom_highbd_dc_128_predictor_16x16 = aom_highbd_dc_128_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x16 = aom_highbd_dc_128_predictor_16x16_sse2;
+ aom_highbd_dc_128_predictor_16x32 = aom_highbd_dc_128_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x32 = aom_highbd_dc_128_predictor_16x32_sse2;
+ aom_highbd_dc_128_predictor_16x8 = aom_highbd_dc_128_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x8 = aom_highbd_dc_128_predictor_16x8_sse2;
+ aom_highbd_dc_128_predictor_32x16 = aom_highbd_dc_128_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_32x16 = aom_highbd_dc_128_predictor_32x16_sse2;
+ aom_highbd_dc_128_predictor_32x32 = aom_highbd_dc_128_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_32x32 = aom_highbd_dc_128_predictor_32x32_sse2;
+ aom_highbd_dc_128_predictor_4x4 = aom_highbd_dc_128_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_4x4 = aom_highbd_dc_128_predictor_4x4_sse2;
+ aom_highbd_dc_128_predictor_4x8 = aom_highbd_dc_128_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_4x8 = aom_highbd_dc_128_predictor_4x8_sse2;
+ aom_highbd_dc_128_predictor_8x16 = aom_highbd_dc_128_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x16 = aom_highbd_dc_128_predictor_8x16_sse2;
+ aom_highbd_dc_128_predictor_8x4 = aom_highbd_dc_128_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x4 = aom_highbd_dc_128_predictor_8x4_sse2;
+ aom_highbd_dc_128_predictor_8x8 = aom_highbd_dc_128_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x8 = aom_highbd_dc_128_predictor_8x8_sse2;
+ aom_highbd_dc_left_predictor_16x16 = aom_highbd_dc_left_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x16 = aom_highbd_dc_left_predictor_16x16_sse2;
+ aom_highbd_dc_left_predictor_16x32 = aom_highbd_dc_left_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x32 = aom_highbd_dc_left_predictor_16x32_sse2;
+ aom_highbd_dc_left_predictor_16x8 = aom_highbd_dc_left_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x8 = aom_highbd_dc_left_predictor_16x8_sse2;
+ aom_highbd_dc_left_predictor_32x16 = aom_highbd_dc_left_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_32x16 = aom_highbd_dc_left_predictor_32x16_sse2;
+ aom_highbd_dc_left_predictor_32x32 = aom_highbd_dc_left_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_32x32 = aom_highbd_dc_left_predictor_32x32_sse2;
+ aom_highbd_dc_left_predictor_4x4 = aom_highbd_dc_left_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_4x4 = aom_highbd_dc_left_predictor_4x4_sse2;
+ aom_highbd_dc_left_predictor_4x8 = aom_highbd_dc_left_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_4x8 = aom_highbd_dc_left_predictor_4x8_sse2;
+ aom_highbd_dc_left_predictor_8x16 = aom_highbd_dc_left_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x16 = aom_highbd_dc_left_predictor_8x16_sse2;
+ aom_highbd_dc_left_predictor_8x4 = aom_highbd_dc_left_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x4 = aom_highbd_dc_left_predictor_8x4_sse2;
+ aom_highbd_dc_left_predictor_8x8 = aom_highbd_dc_left_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x8 = aom_highbd_dc_left_predictor_8x8_sse2;
+ aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_sse2;
+ aom_highbd_dc_predictor_16x32 = aom_highbd_dc_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x32 = aom_highbd_dc_predictor_16x32_sse2;
+ aom_highbd_dc_predictor_16x8 = aom_highbd_dc_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x8 = aom_highbd_dc_predictor_16x8_sse2;
+ aom_highbd_dc_predictor_32x16 = aom_highbd_dc_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_32x16 = aom_highbd_dc_predictor_32x16_sse2;
+ aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_sse2;
+ aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_sse2;
+ aom_highbd_dc_predictor_4x8 = aom_highbd_dc_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_4x8 = aom_highbd_dc_predictor_4x8_sse2;
+ aom_highbd_dc_predictor_8x16 = aom_highbd_dc_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x16 = aom_highbd_dc_predictor_8x16_sse2;
+ aom_highbd_dc_predictor_8x4 = aom_highbd_dc_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x4 = aom_highbd_dc_predictor_8x4_sse2;
+ aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_sse2;
+ aom_highbd_dc_top_predictor_16x16 = aom_highbd_dc_top_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x16 = aom_highbd_dc_top_predictor_16x16_sse2;
+ aom_highbd_dc_top_predictor_16x32 = aom_highbd_dc_top_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x32 = aom_highbd_dc_top_predictor_16x32_sse2;
+ aom_highbd_dc_top_predictor_16x8 = aom_highbd_dc_top_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x8 = aom_highbd_dc_top_predictor_16x8_sse2;
+ aom_highbd_dc_top_predictor_32x16 = aom_highbd_dc_top_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_32x16 = aom_highbd_dc_top_predictor_32x16_sse2;
+ aom_highbd_dc_top_predictor_32x32 = aom_highbd_dc_top_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_32x32 = aom_highbd_dc_top_predictor_32x32_sse2;
+ aom_highbd_dc_top_predictor_4x4 = aom_highbd_dc_top_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_4x4 = aom_highbd_dc_top_predictor_4x4_sse2;
+ aom_highbd_dc_top_predictor_4x8 = aom_highbd_dc_top_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_4x8 = aom_highbd_dc_top_predictor_4x8_sse2;
+ aom_highbd_dc_top_predictor_8x16 = aom_highbd_dc_top_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x16 = aom_highbd_dc_top_predictor_8x16_sse2;
+ aom_highbd_dc_top_predictor_8x4 = aom_highbd_dc_top_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x4 = aom_highbd_dc_top_predictor_8x4_sse2;
+ aom_highbd_dc_top_predictor_8x8 = aom_highbd_dc_top_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x8 = aom_highbd_dc_top_predictor_8x8_sse2;
+ aom_highbd_h_predictor_16x16 = aom_highbd_h_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x16 = aom_highbd_h_predictor_16x16_sse2;
+ aom_highbd_h_predictor_16x32 = aom_highbd_h_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x32 = aom_highbd_h_predictor_16x32_sse2;
+ aom_highbd_h_predictor_16x8 = aom_highbd_h_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x8 = aom_highbd_h_predictor_16x8_sse2;
+ aom_highbd_h_predictor_32x16 = aom_highbd_h_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_32x16 = aom_highbd_h_predictor_32x16_sse2;
+ aom_highbd_h_predictor_32x32 = aom_highbd_h_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_32x32 = aom_highbd_h_predictor_32x32_sse2;
+ aom_highbd_h_predictor_4x4 = aom_highbd_h_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_4x4 = aom_highbd_h_predictor_4x4_sse2;
+ aom_highbd_h_predictor_4x8 = aom_highbd_h_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_4x8 = aom_highbd_h_predictor_4x8_sse2;
+ aom_highbd_h_predictor_8x16 = aom_highbd_h_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x16 = aom_highbd_h_predictor_8x16_sse2;
+ aom_highbd_h_predictor_8x4 = aom_highbd_h_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x4 = aom_highbd_h_predictor_8x4_sse2;
+ aom_highbd_h_predictor_8x8 = aom_highbd_h_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x8 = aom_highbd_h_predictor_8x8_sse2;
+ aom_highbd_lpf_horizontal_14 = aom_highbd_lpf_horizontal_14_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_14 = aom_highbd_lpf_horizontal_14_sse2;
+ aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+ aom_highbd_lpf_horizontal_4 = aom_highbd_lpf_horizontal_4_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_4 = aom_highbd_lpf_horizontal_4_sse2;
+ aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+ aom_highbd_lpf_horizontal_6 = aom_highbd_lpf_horizontal_6_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_6 = aom_highbd_lpf_horizontal_6_sse2;
+ aom_highbd_lpf_horizontal_6_dual = aom_highbd_lpf_horizontal_6_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_6_dual = aom_highbd_lpf_horizontal_6_dual_sse2;
+ aom_highbd_lpf_horizontal_8 = aom_highbd_lpf_horizontal_8_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_8 = aom_highbd_lpf_horizontal_8_sse2;
+ aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+ aom_highbd_lpf_vertical_14 = aom_highbd_lpf_vertical_14_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_14 = aom_highbd_lpf_vertical_14_sse2;
+ aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+ aom_highbd_lpf_vertical_4 = aom_highbd_lpf_vertical_4_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_4 = aom_highbd_lpf_vertical_4_sse2;
+ aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+ aom_highbd_lpf_vertical_6 = aom_highbd_lpf_vertical_6_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_6 = aom_highbd_lpf_vertical_6_sse2;
+ aom_highbd_lpf_vertical_6_dual = aom_highbd_lpf_vertical_6_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_6_dual = aom_highbd_lpf_vertical_6_dual_sse2;
+ aom_highbd_lpf_vertical_8 = aom_highbd_lpf_vertical_8_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_8 = aom_highbd_lpf_vertical_8_sse2;
+ aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+ aom_highbd_v_predictor_16x16 = aom_highbd_v_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x16 = aom_highbd_v_predictor_16x16_sse2;
+ aom_highbd_v_predictor_16x32 = aom_highbd_v_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x32 = aom_highbd_v_predictor_16x32_sse2;
+ aom_highbd_v_predictor_16x8 = aom_highbd_v_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x8 = aom_highbd_v_predictor_16x8_sse2;
+ aom_highbd_v_predictor_32x16 = aom_highbd_v_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_32x16 = aom_highbd_v_predictor_32x16_sse2;
+ aom_highbd_v_predictor_32x32 = aom_highbd_v_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_32x32 = aom_highbd_v_predictor_32x32_sse2;
+ aom_highbd_v_predictor_4x4 = aom_highbd_v_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_4x4 = aom_highbd_v_predictor_4x4_sse2;
+ aom_highbd_v_predictor_4x8 = aom_highbd_v_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_4x8 = aom_highbd_v_predictor_4x8_sse2;
+ aom_highbd_v_predictor_8x16 = aom_highbd_v_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x16 = aom_highbd_v_predictor_8x16_sse2;
+ aom_highbd_v_predictor_8x4 = aom_highbd_v_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x4 = aom_highbd_v_predictor_8x4_sse2;
+ aom_highbd_v_predictor_8x8 = aom_highbd_v_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x8 = aom_highbd_v_predictor_8x8_sse2;
+ aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+ if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+ aom_lpf_horizontal_14 = aom_lpf_horizontal_14_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_14 = aom_lpf_horizontal_14_sse2;
+ aom_lpf_horizontal_14_dual = aom_lpf_horizontal_14_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_14_dual = aom_lpf_horizontal_14_dual_sse2;
+ aom_lpf_horizontal_4 = aom_lpf_horizontal_4_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_4 = aom_lpf_horizontal_4_sse2;
+ aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_sse2;
+ aom_lpf_horizontal_6 = aom_lpf_horizontal_6_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_6 = aom_lpf_horizontal_6_sse2;
+ aom_lpf_horizontal_6_dual = aom_lpf_horizontal_6_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_6_dual = aom_lpf_horizontal_6_dual_sse2;
+ aom_lpf_horizontal_8 = aom_lpf_horizontal_8_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_8 = aom_lpf_horizontal_8_sse2;
+ aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_sse2;
+ aom_lpf_vertical_14 = aom_lpf_vertical_14_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_14 = aom_lpf_vertical_14_sse2;
+ aom_lpf_vertical_14_dual = aom_lpf_vertical_14_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_14_dual = aom_lpf_vertical_14_dual_sse2;
+ aom_lpf_vertical_4 = aom_lpf_vertical_4_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_4 = aom_lpf_vertical_4_sse2;
+ aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_sse2;
+ aom_lpf_vertical_6 = aom_lpf_vertical_6_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_6 = aom_lpf_vertical_6_sse2;
+ aom_lpf_vertical_6_dual = aom_lpf_vertical_6_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_6_dual = aom_lpf_vertical_6_dual_sse2;
+ aom_lpf_vertical_8 = aom_lpf_vertical_8_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_8 = aom_lpf_vertical_8_sse2;
+ aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_sse2;
+ aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+ aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+ aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+ aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+ aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+ aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+ aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+ aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+ aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+ aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+ aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+ aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+ aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+ aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+ aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+ aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+ aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+ aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+ aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+ aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+ aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+ aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+ aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+ aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+ aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+ aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+ aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+ aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+ aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+ aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+ aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+ aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+ aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+ aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+ aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+ aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+ aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+ aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+ aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+ aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+ aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+ aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+ aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+ aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+ aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+ aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+ aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+ aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+ aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+ aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+ aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+ aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+ aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+ aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+ aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+ aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+ aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+ aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+ aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+ aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+ aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+ aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+ aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+ aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+ aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+ aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+ aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+ aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+ aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+ aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+ aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+ aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+ aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+ aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+ aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+ aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+ aom_v_predictor_16x16 = aom_v_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x16 = aom_v_predictor_16x16_sse2;
+ aom_v_predictor_16x32 = aom_v_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x32 = aom_v_predictor_16x32_sse2;
+ aom_v_predictor_16x4 = aom_v_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x4 = aom_v_predictor_16x4_sse2;
+ aom_v_predictor_16x64 = aom_v_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x64 = aom_v_predictor_16x64_sse2;
+ aom_v_predictor_16x8 = aom_v_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x8 = aom_v_predictor_16x8_sse2;
+ aom_v_predictor_32x16 = aom_v_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+ aom_v_predictor_32x32 = aom_v_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+ aom_v_predictor_32x64 = aom_v_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+ aom_v_predictor_32x8 = aom_v_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x8 = aom_v_predictor_32x8_sse2;
+ aom_v_predictor_4x16 = aom_v_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x16 = aom_v_predictor_4x16_sse2;
+ aom_v_predictor_4x4 = aom_v_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x4 = aom_v_predictor_4x4_sse2;
+ aom_v_predictor_4x8 = aom_v_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x8 = aom_v_predictor_4x8_sse2;
+ aom_v_predictor_64x16 = aom_v_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+ aom_v_predictor_64x32 = aom_v_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+ aom_v_predictor_64x64 = aom_v_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+ aom_v_predictor_8x16 = aom_v_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x16 = aom_v_predictor_8x16_sse2;
+ aom_v_predictor_8x32 = aom_v_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x32 = aom_v_predictor_8x32_sse2;
+ aom_v_predictor_8x4 = aom_v_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x4 = aom_v_predictor_8x4_sse2;
+ aom_v_predictor_8x8 = aom_v_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x8 = aom_v_predictor_8x8_sse2;
+ av1_round_shift_array = av1_round_shift_array_c;
+ if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
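
The generated header above follows libaom's RTCD (run-time CPU detection) pattern: each kernel is declared once per ISA level (_c, _sse2, _ssse3, _sse4_1, _avx2), callers always go through an RTCD_EXTERN function pointer, and setup_rtcd_internal() rebinds every pointer to the best variant advertised by x86_simd_caps(), starting from the portable C fallback. The sketch below shows the same idea with a single kernel; my_add_kernel, simd_caps() and the flag values are hypothetical stand-ins for illustration only, not libaom symbols.

/* rtcd_sketch.c - minimal sketch of the RTCD dispatch pattern used by the
 * generated headers in this patch. my_add_kernel and simd_caps() are
 * hypothetical; libaom probes the CPU with x86_simd_caps() from
 * aom_ports/x86.h. */
#include <stdint.h>
#include <stdio.h>

#define HAS_SSE2 0x01
#define HAS_AVX2 0x02

/* Per-ISA implementations of one kernel (same signature for all of them). */
static void my_add_kernel_c(const uint8_t *a, const uint8_t *b, uint8_t *out, int n) {
  for (int i = 0; i < n; ++i) out[i] = (uint8_t)(a[i] + b[i]);
}
static void my_add_kernel_sse2(const uint8_t *a, const uint8_t *b, uint8_t *out, int n) {
  my_add_kernel_c(a, b, out, n); /* stand-in: a real build would use SSE2 intrinsics */
}
static void my_add_kernel_avx2(const uint8_t *a, const uint8_t *b, uint8_t *out, int n) {
  my_add_kernel_c(a, b, out, n); /* stand-in: a real build would use AVX2 intrinsics */
}

/* The RTCD_EXTERN-style pointer: callers only ever use this name. */
static void (*my_add_kernel)(const uint8_t *a, const uint8_t *b, uint8_t *out, int n);

/* Hypothetical CPU-capability probe. */
static int simd_caps(void) { return HAS_SSE2; }

/* Mirrors setup_rtcd_internal(): bind the C fallback first, then upgrade to
 * the most capable variant the CPU supports. */
static void setup_dispatch(void) {
  int flags = simd_caps();
  my_add_kernel = my_add_kernel_c;
  if (flags & HAS_SSE2) my_add_kernel = my_add_kernel_sse2;
  if (flags & HAS_AVX2) my_add_kernel = my_add_kernel_avx2;
}

int main(void) {
  uint8_t a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40}, out[4];
  setup_dispatch();
  my_add_kernel(a, b, out, 4);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}
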
diff --git a/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h b/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..6b56795cd
--- /dev/null
+++ b/media/libaom/config/linux/ia32/config/aom_scale_rtcd.h
@@ -0,0 +1,88 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/linux/ia32/config/av1_rtcd.h b/media/libaom/config/linux/ia32/config/av1_rtcd.h
new file mode 100644
index 000000000..a2a6f14ff
--- /dev/null
+++ b/media/libaom/config/linux/ia32/config/av1_rtcd.h
@@ -0,0 +1,605 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_c
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_c
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+
+void av1_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ apply_selfguided_restoration = apply_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+ av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+ av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+ av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+ if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+ av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+ av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+ av1_convolve_2d_sr = av1_convolve_2d_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+ av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+ av1_convolve_x_sr = av1_convolve_x_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+ av1_convolve_y_sr = av1_convolve_y_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+ av1_filter_intra_edge = av1_filter_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+ av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+ av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+ av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_c;
+ if (flags & HAS_SSE2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+ av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+ av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+ av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+ av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+ av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+ av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+ av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+ av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+ av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+ av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+ av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+ av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+ av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+ av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+ av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+ av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+ av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+ av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+ if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+ av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+ av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+ av1_inv_txfm_add = av1_inv_txfm_add_c;
+ if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+ if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+ av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+ if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+ av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+ av1_jnt_convolve_x = av1_jnt_convolve_x_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+ av1_jnt_convolve_y = av1_jnt_convolve_y_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+ av1_selfguided_restoration = av1_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+ av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+ av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+ av1_warp_affine = av1_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+ av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_c;
+ if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+ if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+ cdef_filter_block = cdef_filter_block_c;
+ if (flags & HAS_SSE2) cdef_filter_block = cdef_filter_block_sse2;
+ if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+ if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+ if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+ cdef_find_dir = cdef_find_dir_c;
+ if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
+ if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+ if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+ cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+ cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+ cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+ cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+ cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+ cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+ copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
+ if (flags & HAS_SSE2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+ copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
+ if (flags & HAS_SSE2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+ get_predict_hbd_fn = get_predict_hbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+ get_predict_lbd_fn = get_predict_lbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+ get_subtract_average_fn = get_subtract_average_fn_c;
+ if (flags & HAS_SSE2) get_subtract_average_fn = get_subtract_average_fn_sse2;
+ if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
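
[Editor's note] The block above is the tail of the generated setup_rtcd_internal() for av1_rtcd.h: every dispatch pointer is first set to the portable *_c implementation and is then overwritten by progressively more capable SIMD variants whenever the corresponding bit is set in the CPU-feature flags, so the last matching assignment (the strongest supported ISA) wins. The following is a minimal, self-contained sketch of that same pattern; the names my_op, cpu_flags() and the HAS_* constants are hypothetical stand-ins, not part of libaom, whose real flags come from its ports layer and whose pointers are the RTCD_EXTERN globals declared in these headers.

    /* Hedged sketch of the runtime CPU-dispatch pattern used by the
     * generated setup_rtcd_internal() above. All identifiers here are
     * invented for illustration only. */
    #include <stdio.h>

    #define HAS_SSE2 (1 << 0)
    #define HAS_AVX2 (1 << 1)

    static int my_op_c(int x)    { return x + 1; }   /* portable fallback */
    static int my_op_sse2(int x) { return x + 1; }   /* stand-in for a SIMD body */
    static int my_op_avx2(int x) { return x + 1; }   /* stand-in for a SIMD body */

    static int (*my_op)(int);                        /* dispatch pointer */

    static int cpu_flags(void) { return HAS_SSE2; }  /* pretend CPUID result */

    static void setup_rtcd(void) {
      int flags = cpu_flags();
      my_op = my_op_c;                               /* always start at the C version */
      if (flags & HAS_SSE2) my_op = my_op_sse2;      /* upgrade in order of increasing */
      if (flags & HAS_AVX2) my_op = my_op_avx2;      /* preference; last hit wins       */
    }

    int main(void) {
      setup_rtcd();
      printf("%d\n", my_op(41));                     /* calls the selected variant */
      return 0;
    }

Because the checks run from weakest to strongest extension, the pointer always ends up on the best implementation the running CPU supports, without any indirection cost beyond a single function-pointer call.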
diff --git a/media/libaom/config/linux/x64/config/aom_config.asm b/media/libaom/config/linux/x64/config/aom_config.asm
new file mode 100644
index 000000000..4ab2dedb4
--- /dev/null
+++ b/media/libaom/config/linux/x64/config/aom_config.asm
@@ -0,0 +1,76 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+ARCH_ARM equ 0
+ARCH_MIPS equ 0
+ARCH_PPC equ 0
+ARCH_X86 equ 0
+ARCH_X86_64 equ 1
+CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_ACCOUNTING equ 0
+CONFIG_ANALYZER equ 0
+CONFIG_AV1_DECODER equ 1
+CONFIG_AV1_ENCODER equ 0
+CONFIG_BIG_ENDIAN equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_RD_STATS equ 0
+CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 1
+CONFIG_DIST_8X8 equ 0
+CONFIG_ENTROPY_STATS equ 0
+CONFIG_FILEOPTIONS equ 1
+CONFIG_FIX_GF_LENGTH equ 1
+CONFIG_FP_MB_STATS equ 0
+CONFIG_GCC equ 1
+CONFIG_GCOV equ 0
+CONFIG_GLOBAL_MOTION_SEARCH equ 1
+CONFIG_GPROF equ 0
+CONFIG_INSPECTION equ 0
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_INTER_STATS_ONLY equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_LOWBITDEPTH equ 1
+CONFIG_MAX_DECODE_PROFILE equ 2
+CONFIG_MISMATCH_DEBUG equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_NORMAL_TILE_MODE equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_PIC equ 0
+CONFIG_RD_DEBUG equ 0
+CONFIG_REDUCED_ENCODER_BORDER equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_SHARED equ 0
+CONFIG_SHARP_SETTINGS equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_STATIC equ 1
+CONFIG_WEBM_IO equ 0
+DECODE_HEIGHT_LIMIT equ 0
+DECODE_WIDTH_LIMIT equ 0
+HAVE_AVX equ 1
+HAVE_AVX2 equ 1
+HAVE_DSPR2 equ 0
+HAVE_FEXCEPT equ 1
+HAVE_MIPS32 equ 0
+HAVE_MIPS64 equ 0
+HAVE_MMX equ 1
+HAVE_MSA equ 0
+HAVE_NEON equ 0
+HAVE_SSE equ 1
+HAVE_SSE2 equ 1
+HAVE_SSE3 equ 1
+HAVE_SSE4_1 equ 1
+HAVE_SSE4_2 equ 1
+HAVE_SSSE3 equ 1
+HAVE_VSX equ 0
+HAVE_WXWIDGETS equ 0
diff --git a/media/libaom/config/linux/x64/config/aom_config.h b/media/libaom/config/linux/x64/config/aom_config.h
new file mode 100644
index 000000000..9a0be7cd1
--- /dev/null
+++ b/media/libaom/config/linux/x64/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 0
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 0
+#define ARCH_X86_64 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 0
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 1
+#define HAVE_AVX2 1
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 1
+#define HAVE_MSA 0
+#define HAVE_NEON 0
+#define HAVE_SSE 1
+#define HAVE_SSE2 1
+#define HAVE_SSE3 1
+#define HAVE_SSE4_1 1
+#define HAVE_SSE4_2 1
+#define HAVE_SSSE3 1
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
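
[Editor's note] The generated aom_config.h above freezes the linux/x64 build configuration (decoder-only, AVX2-capable toolchain, runtime CPU detection enabled). A hedged sketch of how a translation unit might consume these macros is shown below; the function name and the include path resolution are assumptions for illustration, only the macro names come from the header itself.

    /* Illustrative use of the generated config macros; assumes the
     * generated config directory is on the include path, as libaom's
     * own sources include it via "config/aom_config.h". */
    #include "config/aom_config.h"

    int describe_build(void) {
    #if CONFIG_AV1_DECODER && !CONFIG_AV1_ENCODER
      /* This tree is configured as a decoder-only build. */
    #endif
    #if HAVE_AVX2 && CONFIG_RUNTIME_CPU_DETECT
      /* AVX2 code is compiled in, but only selected at run time by the
       * setup_rtcd_internal() functions in the *_rtcd.h headers. */
    #endif
      return ARCH_X86_64;  /* 1 for this linux/x64 configuration */
    }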
diff --git a/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h b/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..2856d4ede
--- /dev/null
+++ b/media/libaom/config/linux/x64/config/aom_dsp_rtcd.h
@@ -0,0 +1,2001 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve_copy aom_convolve_copy_sse2
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x16 aom_dc_128_predictor_16x16_sse2
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x32 aom_dc_128_predictor_16x32_sse2
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x4 aom_dc_128_predictor_16x4_sse2
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x64 aom_dc_128_predictor_16x64_sse2
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x8 aom_dc_128_predictor_16x8_sse2
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x8 aom_dc_128_predictor_32x8_sse2
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x16 aom_dc_128_predictor_4x16_sse2
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x4 aom_dc_128_predictor_4x4_sse2
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x8 aom_dc_128_predictor_4x8_sse2
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x16 aom_dc_128_predictor_8x16_sse2
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x32 aom_dc_128_predictor_8x32_sse2
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x4 aom_dc_128_predictor_8x4_sse2
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x8 aom_dc_128_predictor_8x8_sse2
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x16 aom_dc_left_predictor_16x16_sse2
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x32 aom_dc_left_predictor_16x32_sse2
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x4 aom_dc_left_predictor_16x4_sse2
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x64 aom_dc_left_predictor_16x64_sse2
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x8 aom_dc_left_predictor_16x8_sse2
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x8 aom_dc_left_predictor_32x8_sse2
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x16 aom_dc_left_predictor_4x16_sse2
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x4 aom_dc_left_predictor_4x4_sse2
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x8 aom_dc_left_predictor_4x8_sse2
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x16 aom_dc_left_predictor_8x16_sse2
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x32 aom_dc_left_predictor_8x32_sse2
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x4 aom_dc_left_predictor_8x4_sse2
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x8 aom_dc_left_predictor_8x8_sse2
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x16 aom_dc_predictor_16x16_sse2
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x32 aom_dc_predictor_16x32_sse2
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x4 aom_dc_predictor_16x4_sse2
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x64 aom_dc_predictor_16x64_sse2
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x8 aom_dc_predictor_16x8_sse2
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x8 aom_dc_predictor_32x8_sse2
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x16 aom_dc_predictor_4x16_sse2
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x4 aom_dc_predictor_4x4_sse2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x8 aom_dc_predictor_4x8_sse2
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x16 aom_dc_predictor_8x16_sse2
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x32 aom_dc_predictor_8x32_sse2
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x4 aom_dc_predictor_8x4_sse2
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x8 aom_dc_predictor_8x8_sse2
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x16 aom_dc_top_predictor_16x16_sse2
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x32 aom_dc_top_predictor_16x32_sse2
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x4 aom_dc_top_predictor_16x4_sse2
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x64 aom_dc_top_predictor_16x64_sse2
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x8 aom_dc_top_predictor_16x8_sse2
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x8 aom_dc_top_predictor_32x8_sse2
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x16 aom_dc_top_predictor_4x16_sse2
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x4 aom_dc_top_predictor_4x4_sse2
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x8 aom_dc_top_predictor_4x8_sse2
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x16 aom_dc_top_predictor_8x16_sse2
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x32 aom_dc_top_predictor_8x32_sse2
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x4 aom_dc_top_predictor_8x4_sse2
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_sse2
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x16 aom_h_predictor_16x16_sse2
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x32 aom_h_predictor_16x32_sse2
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x4 aom_h_predictor_16x4_sse2
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x64 aom_h_predictor_16x64_sse2
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x8 aom_h_predictor_16x8_sse2
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x16 aom_h_predictor_32x16_sse2
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x64 aom_h_predictor_32x64_sse2
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x8 aom_h_predictor_32x8_sse2
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x16 aom_h_predictor_4x16_sse2
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x4 aom_h_predictor_4x4_sse2
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x8 aom_h_predictor_4x8_sse2
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x16 aom_h_predictor_64x16_sse2
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x32 aom_h_predictor_64x32_sse2
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x64 aom_h_predictor_64x64_sse2
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x16 aom_h_predictor_8x16_sse2
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x32 aom_h_predictor_8x32_sse2
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x4 aom_h_predictor_8x4_sse2
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x8 aom_h_predictor_8x8_sse2
+
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x16 aom_highbd_dc_128_predictor_16x16_sse2
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x32 aom_highbd_dc_128_predictor_16x32_sse2
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x8 aom_highbd_dc_128_predictor_16x8_sse2
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x16 aom_highbd_dc_128_predictor_32x16_sse2
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x32 aom_highbd_dc_128_predictor_32x32_sse2
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x4 aom_highbd_dc_128_predictor_4x4_sse2
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x8 aom_highbd_dc_128_predictor_4x8_sse2
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x16 aom_highbd_dc_128_predictor_8x16_sse2
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x4 aom_highbd_dc_128_predictor_8x4_sse2
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x8 aom_highbd_dc_128_predictor_8x8_sse2
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x16 aom_highbd_dc_left_predictor_16x16_sse2
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x32 aom_highbd_dc_left_predictor_16x32_sse2
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x8 aom_highbd_dc_left_predictor_16x8_sse2
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x16 aom_highbd_dc_left_predictor_32x16_sse2
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x32 aom_highbd_dc_left_predictor_32x32_sse2
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x4 aom_highbd_dc_left_predictor_4x4_sse2
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x8 aom_highbd_dc_left_predictor_4x8_sse2
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x16 aom_highbd_dc_left_predictor_8x16_sse2
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x4 aom_highbd_dc_left_predictor_8x4_sse2
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x8 aom_highbd_dc_left_predictor_8x8_sse2
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x16 aom_highbd_dc_predictor_16x16_sse2
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x32 aom_highbd_dc_predictor_16x32_sse2
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x8 aom_highbd_dc_predictor_16x8_sse2
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x16 aom_highbd_dc_predictor_32x16_sse2
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x32 aom_highbd_dc_predictor_32x32_sse2
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x4 aom_highbd_dc_predictor_4x4_sse2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x8 aom_highbd_dc_predictor_4x8_sse2
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x16 aom_highbd_dc_predictor_8x16_sse2
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x4 aom_highbd_dc_predictor_8x4_sse2
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x8 aom_highbd_dc_predictor_8x8_sse2
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x16 aom_highbd_dc_top_predictor_16x16_sse2
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x32 aom_highbd_dc_top_predictor_16x32_sse2
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x8 aom_highbd_dc_top_predictor_16x8_sse2
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x16 aom_highbd_dc_top_predictor_32x16_sse2
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x32 aom_highbd_dc_top_predictor_32x32_sse2
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x4 aom_highbd_dc_top_predictor_4x4_sse2
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x8 aom_highbd_dc_top_predictor_4x8_sse2
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x16 aom_highbd_dc_top_predictor_8x16_sse2
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x4 aom_highbd_dc_top_predictor_8x4_sse2
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_sse2
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x16 aom_highbd_h_predictor_16x16_sse2
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x32 aom_highbd_h_predictor_16x32_sse2
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x8 aom_highbd_h_predictor_16x8_sse2
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x16 aom_highbd_h_predictor_32x16_sse2
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x32 aom_highbd_h_predictor_32x32_sse2
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x4 aom_highbd_h_predictor_4x4_sse2
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x8 aom_highbd_h_predictor_4x8_sse2
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x16 aom_highbd_h_predictor_8x16_sse2
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x4 aom_highbd_h_predictor_8x4_sse2
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x8 aom_highbd_h_predictor_8x8_sse2
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_sse2
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_sse2
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_sse2
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_sse2
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_sse2
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_sse2
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_sse2
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x16 aom_highbd_v_predictor_16x16_sse2
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x32 aom_highbd_v_predictor_16x32_sse2
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x8 aom_highbd_v_predictor_16x8_sse2
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x16 aom_highbd_v_predictor_32x16_sse2
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x32 aom_highbd_v_predictor_32x32_sse2
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x4 aom_highbd_v_predictor_4x4_sse2
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x8 aom_highbd_v_predictor_4x8_sse2
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x16 aom_highbd_v_predictor_8x16_sse2
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x4 aom_highbd_v_predictor_8x4_sse2
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_sse2
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_sse2
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_sse2
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_sse2
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_sse2
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_6 aom_lpf_horizontal_6_sse2
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_sse2
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_sse2
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_sse2
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_14 aom_lpf_vertical_14_sse2
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_sse2
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_4 aom_lpf_vertical_4_sse2
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_sse2
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_6 aom_lpf_vertical_6_sse2
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_sse2
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_8 aom_lpf_vertical_8_sse2
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_sse2
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x16 aom_v_predictor_16x16_sse2
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x32 aom_v_predictor_16x32_sse2
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x4 aom_v_predictor_16x4_sse2
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x64 aom_v_predictor_16x64_sse2
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x8 aom_v_predictor_16x8_sse2
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x8 aom_v_predictor_32x8_sse2
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x16 aom_v_predictor_4x16_sse2
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x4 aom_v_predictor_4x4_sse2
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x8 aom_v_predictor_4x8_sse2
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x16 aom_v_predictor_8x16_sse2
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x32 aom_v_predictor_8x32_sse2
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x4 aom_v_predictor_8x4_sse2
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+
+void aom_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+ aom_blend_a64_mask = aom_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+ aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+ aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+ aom_convolve8_vert = aom_convolve8_vert_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+ aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+ aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+ aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+ aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+ aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+ aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+ aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+ aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+ aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+ aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+ aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+ aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+ aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+ aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+ aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+ aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+ aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+ aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+ aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+ aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+ aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+ aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+ aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+ aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+ aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+ aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+ aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+ aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+ aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+ aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+ aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+ aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+ aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+ aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+ aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+ aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+ aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+ aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+ if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+ aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+ aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+ aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+ aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+ aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+ aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+ aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+ aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+ aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+ aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+ aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+ aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+ aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+ aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+ aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+ aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+ aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+ aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+ aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+ aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+ aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+ aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+ aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+ aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+ aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+ aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+ aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+ aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+ aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+ aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+ aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+ aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+ aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+ aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+ aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+ aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+ aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+ aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+ aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+ aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+ aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+ aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+ aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+ aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+ aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+ aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+ aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+ aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+ aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+ aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+ aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+ aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+ aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+ aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+ aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+ aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+ aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+ aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+ aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+ aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+ aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+ aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+ aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+ aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+ aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+ aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+ aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+ aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+ aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+ aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+ aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+ aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+ aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+ aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+ aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+ aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+ aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+ aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+ aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+ aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+ aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+ aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+ av1_round_shift_array = av1_round_shift_array_c;
+ if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/linux/x64/config/aom_scale_rtcd.h b/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..6b56795cd
--- /dev/null
+++ b/media/libaom/config/linux/x64/config/aom_scale_rtcd.h
@@ -0,0 +1,88 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/linux/x64/config/av1_rtcd.h b/media/libaom/config/linux/x64/config/av1_rtcd.h
new file mode 100644
index 000000000..d27318208
--- /dev/null
+++ b/media/libaom/config/linux/x64/config/av1_rtcd.h
@@ -0,0 +1,594 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_sse2
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_sse2
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+
+void av1_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ apply_selfguided_restoration = apply_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+ av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+ av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+ av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+ if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+ av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+ av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+ av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+ av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+ av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+ av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+ av1_filter_intra_edge = av1_filter_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+ av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+ av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+ av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+ av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+ av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+ av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+ av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+ av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+ av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+ av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+ av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+ av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+ av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+ av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+ av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+ av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+ av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+ av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+ av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+ av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+ av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+ if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+ av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+ av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+ av1_inv_txfm_add = av1_inv_txfm_add_c;
+ if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+ if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+ av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+ if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+ av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+ av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+ av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+ av1_selfguided_restoration = av1_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+ av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+ av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+ av1_warp_affine = av1_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+ av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+ if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+ cdef_filter_block = cdef_filter_block_sse2;
+ if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+ if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+ if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+ cdef_find_dir = cdef_find_dir_sse2;
+ if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+ if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+ cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+ cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+ cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+ cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+ cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+ cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+ copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+ copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+ get_predict_hbd_fn = get_predict_hbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+ get_predict_lbd_fn = get_predict_lbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+ get_subtract_average_fn = get_subtract_average_fn_sse2;
+ if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
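
Editor's note: the hunk above is the tail of a generated av1_rtcd.h dispatch table. Each symbol starts at a portable C or SSE2 baseline and is then upgraded to the fastest variant the CPU's feature flags advertise (SSSE3, SSE4.1, AVX2). The sketch below is a minimal, self-contained illustration of that same function-pointer pattern; the names my_op, MY_HAS_*, detect_cpu_flags and setup_dispatch are hypothetical stand-ins, not part of this patch or of libaom's API.

/* Minimal sketch of the runtime-dispatch pattern used by the generated
 * RTCD headers above. All names here are hypothetical. */
#include <stdio.h>

#define MY_HAS_SSSE3  (1 << 0)
#define MY_HAS_SSE4_1 (1 << 1)
#define MY_HAS_AVX2   (1 << 2)

/* Per-ISA implementations of one operation (bodies are trivial on purpose). */
static int my_op_c(int x)     { return x + 1; }
static int my_op_ssse3(int x) { return x + 1; }
static int my_op_avx2(int x)  { return x + 1; }

/* The dispatch slot: a function pointer, analogous to the RTCD_EXTERN symbols. */
static int (*my_op)(int x);

/* Stand-in for a CPUID probe; here it simply claims SSSE3 and AVX2. */
static int detect_cpu_flags(void) { return MY_HAS_SSSE3 | MY_HAS_AVX2; }

/* Mirrors the shape of setup_rtcd_internal(): assign the baseline
 * unconditionally, then upgrade whenever a faster ISA bit is set. */
static void setup_dispatch(void) {
  int flags = detect_cpu_flags();
  my_op = my_op_c;
  if (flags & MY_HAS_SSSE3) my_op = my_op_ssse3;
  if (flags & MY_HAS_AVX2)  my_op = my_op_avx2;
}

int main(void) {
  setup_dispatch();
  printf("%d\n", my_op(41)); /* resolves to the AVX2 slot on this fake CPU */
  return 0;
}

Keeping the baseline assignment unconditional means the pointer is always valid even when no optional ISA bit is set, which is why every block in the generated code begins with the _c or _sse2 variant before any flag checks.
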
diff --git a/media/libaom/config/mac/x64/config/aom_config.asm b/media/libaom/config/mac/x64/config/aom_config.asm
new file mode 100644
index 000000000..4ab2dedb4
--- /dev/null
+++ b/media/libaom/config/mac/x64/config/aom_config.asm
@@ -0,0 +1,76 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+ARCH_ARM equ 0
+ARCH_MIPS equ 0
+ARCH_PPC equ 0
+ARCH_X86 equ 0
+ARCH_X86_64 equ 1
+CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_ACCOUNTING equ 0
+CONFIG_ANALYZER equ 0
+CONFIG_AV1_DECODER equ 1
+CONFIG_AV1_ENCODER equ 0
+CONFIG_BIG_ENDIAN equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_RD_STATS equ 0
+CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 1
+CONFIG_DIST_8X8 equ 0
+CONFIG_ENTROPY_STATS equ 0
+CONFIG_FILEOPTIONS equ 1
+CONFIG_FIX_GF_LENGTH equ 1
+CONFIG_FP_MB_STATS equ 0
+CONFIG_GCC equ 1
+CONFIG_GCOV equ 0
+CONFIG_GLOBAL_MOTION_SEARCH equ 1
+CONFIG_GPROF equ 0
+CONFIG_INSPECTION equ 0
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_INTER_STATS_ONLY equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_LOWBITDEPTH equ 1
+CONFIG_MAX_DECODE_PROFILE equ 2
+CONFIG_MISMATCH_DEBUG equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_NORMAL_TILE_MODE equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_PIC equ 0
+CONFIG_RD_DEBUG equ 0
+CONFIG_REDUCED_ENCODER_BORDER equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_SHARED equ 0
+CONFIG_SHARP_SETTINGS equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_STATIC equ 1
+CONFIG_WEBM_IO equ 0
+DECODE_HEIGHT_LIMIT equ 0
+DECODE_WIDTH_LIMIT equ 0
+HAVE_AVX equ 1
+HAVE_AVX2 equ 1
+HAVE_DSPR2 equ 0
+HAVE_FEXCEPT equ 1
+HAVE_MIPS32 equ 0
+HAVE_MIPS64 equ 0
+HAVE_MMX equ 1
+HAVE_MSA equ 0
+HAVE_NEON equ 0
+HAVE_SSE equ 1
+HAVE_SSE2 equ 1
+HAVE_SSE3 equ 1
+HAVE_SSE4_1 equ 1
+HAVE_SSE4_2 equ 1
+HAVE_SSSE3 equ 1
+HAVE_VSX equ 0
+HAVE_WXWIDGETS equ 0
diff --git a/media/libaom/config/mac/x64/config/aom_config.h b/media/libaom/config/mac/x64/config/aom_config.h
new file mode 100644
index 000000000..9a0be7cd1
--- /dev/null
+++ b/media/libaom/config/mac/x64/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 0
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 0
+#define ARCH_X86_64 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 0
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 1
+#define HAVE_AVX2 1
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 1
+#define HAVE_MSA 0
+#define HAVE_NEON 0
+#define HAVE_SSE 1
+#define HAVE_SSE2 1
+#define HAVE_SSE3 1
+#define HAVE_SSE4_1 1
+#define HAVE_SSE4_2 1
+#define HAVE_SSSE3 1
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
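
Editor's note: aom_config.h above freezes the build-time answers for this target (x86-64, decoder-only, AVX2-capable toolchain, runtime CPU detection enabled). A common way such HAVE_ and CONFIG_ macros are consumed, sketched below with hypothetical names (decode_row, decode_row_avx2, cpu_has_avx2), is to compile an ISA-specific path only when the corresponding HAVE_ macro is 1 and to still guard its selection at runtime. This is an illustrative sketch under those assumptions, not code from the patch.

/* Illustrative sketch only: how HAVE_ and CONFIG_ macros from a config
 * header like aom_config.h are typically consumed. All names below are
 * hypothetical; the two macros are hard-coded so the sketch stands alone. */
#define HAVE_AVX2 1
#define CONFIG_RUNTIME_CPU_DETECT 1

static void decode_row_c(void) { /* portable fallback path */ }

#if HAVE_AVX2
static void decode_row_avx2(void) { /* compiled only when the build targets AVX2 */ }
#endif

/* Hypothetical CPUID probe; real code would query the host CPU. */
static int cpu_has_avx2(void) { return 0; }

static void decode_row(void) {
#if HAVE_AVX2 && CONFIG_RUNTIME_CPU_DETECT
  if (cpu_has_avx2()) { /* the binary supports AVX2, but the host must as well */
    decode_row_avx2();
    return;
  }
#endif
  decode_row_c();
}

int main(void) {
  decode_row();
  return 0;
}

Because CONFIG_RUNTIME_CPU_DETECT is 1 in this configuration, the compile-time HAVE_ flags only bound what can be built in; the actual choice of code path is deferred to the runtime dispatch shown after the previous file.
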
diff --git a/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h b/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..2856d4ede
--- /dev/null
+++ b/media/libaom/config/mac/x64/config/aom_dsp_rtcd.h
@@ -0,0 +1,2001 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve_copy aom_convolve_copy_sse2
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x16 aom_dc_128_predictor_16x16_sse2
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x32 aom_dc_128_predictor_16x32_sse2
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x4 aom_dc_128_predictor_16x4_sse2
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x64 aom_dc_128_predictor_16x64_sse2
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x8 aom_dc_128_predictor_16x8_sse2
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x8 aom_dc_128_predictor_32x8_sse2
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x16 aom_dc_128_predictor_4x16_sse2
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x4 aom_dc_128_predictor_4x4_sse2
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x8 aom_dc_128_predictor_4x8_sse2
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x16 aom_dc_128_predictor_8x16_sse2
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x32 aom_dc_128_predictor_8x32_sse2
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x4 aom_dc_128_predictor_8x4_sse2
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x8 aom_dc_128_predictor_8x8_sse2
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x16 aom_dc_left_predictor_16x16_sse2
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x32 aom_dc_left_predictor_16x32_sse2
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x4 aom_dc_left_predictor_16x4_sse2
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x64 aom_dc_left_predictor_16x64_sse2
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x8 aom_dc_left_predictor_16x8_sse2
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x8 aom_dc_left_predictor_32x8_sse2
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x16 aom_dc_left_predictor_4x16_sse2
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x4 aom_dc_left_predictor_4x4_sse2
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x8 aom_dc_left_predictor_4x8_sse2
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x16 aom_dc_left_predictor_8x16_sse2
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x32 aom_dc_left_predictor_8x32_sse2
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x4 aom_dc_left_predictor_8x4_sse2
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x8 aom_dc_left_predictor_8x8_sse2
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x16 aom_dc_predictor_16x16_sse2
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x32 aom_dc_predictor_16x32_sse2
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x4 aom_dc_predictor_16x4_sse2
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x64 aom_dc_predictor_16x64_sse2
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x8 aom_dc_predictor_16x8_sse2
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x8 aom_dc_predictor_32x8_sse2
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x16 aom_dc_predictor_4x16_sse2
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x4 aom_dc_predictor_4x4_sse2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x8 aom_dc_predictor_4x8_sse2
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x16 aom_dc_predictor_8x16_sse2
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x32 aom_dc_predictor_8x32_sse2
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x4 aom_dc_predictor_8x4_sse2
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x8 aom_dc_predictor_8x8_sse2
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x16 aom_dc_top_predictor_16x16_sse2
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x32 aom_dc_top_predictor_16x32_sse2
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x4 aom_dc_top_predictor_16x4_sse2
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x64 aom_dc_top_predictor_16x64_sse2
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x8 aom_dc_top_predictor_16x8_sse2
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x8 aom_dc_top_predictor_32x8_sse2
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x16 aom_dc_top_predictor_4x16_sse2
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x4 aom_dc_top_predictor_4x4_sse2
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x8 aom_dc_top_predictor_4x8_sse2
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x16 aom_dc_top_predictor_8x16_sse2
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x32 aom_dc_top_predictor_8x32_sse2
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x4 aom_dc_top_predictor_8x4_sse2
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_sse2
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x16 aom_h_predictor_16x16_sse2
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x32 aom_h_predictor_16x32_sse2
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x4 aom_h_predictor_16x4_sse2
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x64 aom_h_predictor_16x64_sse2
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x8 aom_h_predictor_16x8_sse2
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x16 aom_h_predictor_32x16_sse2
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x64 aom_h_predictor_32x64_sse2
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x8 aom_h_predictor_32x8_sse2
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x16 aom_h_predictor_4x16_sse2
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x4 aom_h_predictor_4x4_sse2
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x8 aom_h_predictor_4x8_sse2
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x16 aom_h_predictor_64x16_sse2
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x32 aom_h_predictor_64x32_sse2
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x64 aom_h_predictor_64x64_sse2
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x16 aom_h_predictor_8x16_sse2
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x32 aom_h_predictor_8x32_sse2
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x4 aom_h_predictor_8x4_sse2
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x8 aom_h_predictor_8x8_sse2
+
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x16 aom_highbd_dc_128_predictor_16x16_sse2
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x32 aom_highbd_dc_128_predictor_16x32_sse2
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x8 aom_highbd_dc_128_predictor_16x8_sse2
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x16 aom_highbd_dc_128_predictor_32x16_sse2
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x32 aom_highbd_dc_128_predictor_32x32_sse2
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x4 aom_highbd_dc_128_predictor_4x4_sse2
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x8 aom_highbd_dc_128_predictor_4x8_sse2
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x16 aom_highbd_dc_128_predictor_8x16_sse2
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x4 aom_highbd_dc_128_predictor_8x4_sse2
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x8 aom_highbd_dc_128_predictor_8x8_sse2
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x16 aom_highbd_dc_left_predictor_16x16_sse2
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x32 aom_highbd_dc_left_predictor_16x32_sse2
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x8 aom_highbd_dc_left_predictor_16x8_sse2
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x16 aom_highbd_dc_left_predictor_32x16_sse2
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x32 aom_highbd_dc_left_predictor_32x32_sse2
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x4 aom_highbd_dc_left_predictor_4x4_sse2
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x8 aom_highbd_dc_left_predictor_4x8_sse2
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x16 aom_highbd_dc_left_predictor_8x16_sse2
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x4 aom_highbd_dc_left_predictor_8x4_sse2
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x8 aom_highbd_dc_left_predictor_8x8_sse2
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x16 aom_highbd_dc_predictor_16x16_sse2
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x32 aom_highbd_dc_predictor_16x32_sse2
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x8 aom_highbd_dc_predictor_16x8_sse2
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x16 aom_highbd_dc_predictor_32x16_sse2
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x32 aom_highbd_dc_predictor_32x32_sse2
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x4 aom_highbd_dc_predictor_4x4_sse2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x8 aom_highbd_dc_predictor_4x8_sse2
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x16 aom_highbd_dc_predictor_8x16_sse2
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x4 aom_highbd_dc_predictor_8x4_sse2
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x8 aom_highbd_dc_predictor_8x8_sse2
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x16 aom_highbd_dc_top_predictor_16x16_sse2
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x32 aom_highbd_dc_top_predictor_16x32_sse2
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x8 aom_highbd_dc_top_predictor_16x8_sse2
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x16 aom_highbd_dc_top_predictor_32x16_sse2
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x32 aom_highbd_dc_top_predictor_32x32_sse2
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x4 aom_highbd_dc_top_predictor_4x4_sse2
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x8 aom_highbd_dc_top_predictor_4x8_sse2
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x16 aom_highbd_dc_top_predictor_8x16_sse2
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x4 aom_highbd_dc_top_predictor_8x4_sse2
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_sse2
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x16 aom_highbd_h_predictor_16x16_sse2
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x32 aom_highbd_h_predictor_16x32_sse2
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x8 aom_highbd_h_predictor_16x8_sse2
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x16 aom_highbd_h_predictor_32x16_sse2
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x32 aom_highbd_h_predictor_32x32_sse2
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x4 aom_highbd_h_predictor_4x4_sse2
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x8 aom_highbd_h_predictor_4x8_sse2
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x16 aom_highbd_h_predictor_8x16_sse2
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x4 aom_highbd_h_predictor_8x4_sse2
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x8 aom_highbd_h_predictor_8x8_sse2
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
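+/* The *_dual loop-filter variants filter two adjacent segments in a single call,
+ * which is why the blimit/limit/thresh triplet appears twice (suffixes 0 and 1).
+ * Hedged call sketch with assumed argument names:
+ *
+ *   aom_highbd_lpf_horizontal_14_dual(s, pitch, blimit0, limit0, thresh0,
+ *                                     blimit1, limit1, thresh1, bd);
+ */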
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_sse2
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_sse2
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_sse2
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_sse2
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_sse2
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_sse2
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_sse2
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x16 aom_highbd_v_predictor_16x16_sse2
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x32 aom_highbd_v_predictor_16x32_sse2
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x8 aom_highbd_v_predictor_16x8_sse2
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x16 aom_highbd_v_predictor_32x16_sse2
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x32 aom_highbd_v_predictor_32x32_sse2
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x4 aom_highbd_v_predictor_4x4_sse2
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x8 aom_highbd_v_predictor_4x8_sse2
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x16 aom_highbd_v_predictor_8x16_sse2
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x4 aom_highbd_v_predictor_8x4_sse2
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_sse2
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_sse2
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_sse2
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_sse2
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_sse2
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_6 aom_lpf_horizontal_6_sse2
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_sse2
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_sse2
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_sse2
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_14 aom_lpf_vertical_14_sse2
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_sse2
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_4 aom_lpf_vertical_4_sse2
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_sse2
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_6 aom_lpf_vertical_6_sse2
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_sse2
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_8 aom_lpf_vertical_8_sse2
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_sse2
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x16 aom_v_predictor_16x16_sse2
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x32 aom_v_predictor_16x32_sse2
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x4 aom_v_predictor_16x4_sse2
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x64 aom_v_predictor_16x64_sse2
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x8 aom_v_predictor_16x8_sse2
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x8 aom_v_predictor_32x8_sse2
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x16 aom_v_predictor_4x16_sse2
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x4 aom_v_predictor_4x4_sse2
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x8 aom_v_predictor_4x8_sse2
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x16 aom_v_predictor_8x16_sse2
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x32 aom_v_predictor_8x32_sse2
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x4 aom_v_predictor_8x4_sse2
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+
+void aom_dsp_rtcd(void);
+
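+/* Descriptive note: the RTCD_C-guarded block below provides the run-time
+ * dispatcher that resolves the RTCD_EXTERN function pointers declared above
+ * to the best implementation supported by the host CPU (see
+ * setup_rtcd_internal()); by convention RTCD_C is defined in only one
+ * translation unit so the pointers are defined exactly once. */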
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
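+    /* Each RTCD_EXTERN pointer starts at its portable C or SSE2 baseline and
+     * is upgraded to the SSSE3/SSE4.1/AVX2 variant when x86_simd_caps()
+     * reports the corresponding HAS_* feature flag. */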
+ aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+ aom_blend_a64_mask = aom_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+ aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+ aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+ aom_convolve8_vert = aom_convolve8_vert_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+ aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+ aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+ aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+ aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+ aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+ aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+ aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+ aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+ aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+ aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+ aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+ aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+ aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+ aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+ aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+ aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+ aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+ aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+ aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+ aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+ aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+ aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+ aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+ aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+ aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+ aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+ aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+ aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+ aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+ aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+ aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+ aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+ aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+ aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+ aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+ aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+ aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+ aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+ if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+ aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+ aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+ aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+ aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+ aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+ aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+ aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+ aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+ aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+ aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+ aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+ aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+ aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+ aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+ aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+ aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+ aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+ aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+ aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+ aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+ aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+ aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+ aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+ aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+ aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+ aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+ aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+ aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+ aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+ aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+ aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+ aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+ aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+ aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+ aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+ aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+ aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+ aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+ aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+ aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+ aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+ aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+ aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+ aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+ aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+ aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+ aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+ aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+ aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+ aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+ aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+ aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+ aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+ aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+ aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+ aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+ aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+ aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+ aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+ aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+ aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+ aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+ aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+ aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+ aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+ aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+ aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+ aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+ aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+ aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+ aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+ aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+ aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+ aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+ aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+ aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+ aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+ aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+ aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+ aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+ aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+ aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+ av1_round_shift_array = av1_round_shift_array_c;
+ if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/mac/x64/config/aom_scale_rtcd.h b/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..6b56795cd
--- /dev/null
+++ b/media/libaom/config/mac/x64/config/aom_scale_rtcd.h
@@ -0,0 +1,88 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/mac/x64/config/av1_rtcd.h b/media/libaom/config/mac/x64/config/av1_rtcd.h
new file mode 100644
index 000000000..d27318208
--- /dev/null
+++ b/media/libaom/config/mac/x64/config/av1_rtcd.h
@@ -0,0 +1,594 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_sse2
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_sse2
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+
+void av1_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ apply_selfguided_restoration = apply_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+ av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+ av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+ av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+ if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+ av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+ av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+ av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+ av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+ av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+ av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+ av1_filter_intra_edge = av1_filter_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+ av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+ av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+ av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+ av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+ av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+ av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+ av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+ av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+ av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+ av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+ av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+ av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+ av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+ av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+ av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+ av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+ av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+ av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+ av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+ av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+ av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+ if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+ av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+ av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+ av1_inv_txfm_add = av1_inv_txfm_add_c;
+ if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+ if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+ av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+ if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+ av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+ av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+ av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+ av1_selfguided_restoration = av1_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+ av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+ av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+ av1_warp_affine = av1_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+ av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+ if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+ cdef_filter_block = cdef_filter_block_sse2;
+ if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+ if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+ if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+ cdef_find_dir = cdef_find_dir_sse2;
+ if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+ if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+ cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+ cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+ cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+ cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+ cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+ cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+ copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+ copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+ get_predict_hbd_fn = get_predict_hbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+ get_predict_lbd_fn = get_predict_lbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+ get_subtract_average_fn = get_subtract_average_fn_sse2;
+ if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
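
Editorial note on the generated header above: each `*_c` / `*_sse2` / `*_ssse3` / `*_avx2` declaration is a concrete implementation of the same operation, the `RTCD_EXTERN` function pointer is the dispatch slot, and `setup_rtcd_internal()` resolves every slot once, using `x86_simd_caps()` and the `HAS_*` flag bits, with later (wider) assignments overriding earlier ones. The following is a minimal, self-contained sketch of that dispatch pattern only; all names in it (my_op_*, HAS_*_SIM, simd_caps_sim, setup_dispatch) are illustrative stand-ins, not libaom symbols.

/* Sketch of the runtime-CPU-detection dispatch pattern used in the
 * generated av1_rtcd.h above. Hypothetical names throughout. */
#include <stdio.h>

#define HAS_SSE2_SIM 0x1
#define HAS_AVX2_SIM 0x2

static void my_op_c(int *x)    { *x += 1; }  /* portable fallback */
static void my_op_sse2(int *x) { *x += 2; }  /* pretend SIMD variant */
static void my_op_avx2(int *x) { *x += 4; }  /* pretend wider SIMD variant */

/* Dispatch slot, analogous to the RTCD_EXTERN function pointers above. */
static void (*my_op)(int *x);

/* Pretend capability probe standing in for x86_simd_caps(). */
static int simd_caps_sim(void) { return HAS_SSE2_SIM; }

static void setup_dispatch(void)
{
  int flags = simd_caps_sim();
  my_op = my_op_c;                               /* always-safe default    */
  if (flags & HAS_SSE2_SIM) my_op = my_op_sse2;  /* narrower SIMD first    */
  if (flags & HAS_AVX2_SIM) my_op = my_op_avx2;  /* widest available wins  */
}

int main(void)
{
  int v = 0;
  setup_dispatch();
  my_op(&v);
  printf("%d\n", v);  /* prints 2 with the simulated SSE2-only caps */
  return 0;
}

As in the generated file, the "best implementation assigned last" ordering means the pointer always ends up on the fastest variant the CPU supports, without any branching on the hot path.
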
diff --git a/media/libaom/config/win/ia32/config/aom_config.asm b/media/libaom/config/win/ia32/config/aom_config.asm
new file mode 100644
index 000000000..6ae776c7c
--- /dev/null
+++ b/media/libaom/config/win/ia32/config/aom_config.asm
@@ -0,0 +1,76 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+ARCH_ARM equ 0
+ARCH_MIPS equ 0
+ARCH_PPC equ 0
+ARCH_X86 equ 1
+ARCH_X86_64 equ 0
+CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_ACCOUNTING equ 0
+CONFIG_ANALYZER equ 0
+CONFIG_AV1_DECODER equ 1
+CONFIG_AV1_ENCODER equ 0
+CONFIG_BIG_ENDIAN equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_RD_STATS equ 0
+CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 1
+CONFIG_DIST_8X8 equ 0
+CONFIG_ENTROPY_STATS equ 0
+CONFIG_FILEOPTIONS equ 1
+CONFIG_FIX_GF_LENGTH equ 1
+CONFIG_FP_MB_STATS equ 0
+CONFIG_GCC equ 1
+CONFIG_GCOV equ 0
+CONFIG_GLOBAL_MOTION_SEARCH equ 1
+CONFIG_GPROF equ 0
+CONFIG_INSPECTION equ 0
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_INTER_STATS_ONLY equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_LOWBITDEPTH equ 1
+CONFIG_MAX_DECODE_PROFILE equ 2
+CONFIG_MISMATCH_DEBUG equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_NORMAL_TILE_MODE equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_PIC equ 0
+CONFIG_RD_DEBUG equ 0
+CONFIG_REDUCED_ENCODER_BORDER equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_SHARED equ 0
+CONFIG_SHARP_SETTINGS equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_STATIC equ 1
+CONFIG_WEBM_IO equ 0
+DECODE_HEIGHT_LIMIT equ 0
+DECODE_WIDTH_LIMIT equ 0
+HAVE_AVX equ 1
+HAVE_AVX2 equ 1
+HAVE_DSPR2 equ 0
+HAVE_FEXCEPT equ 1
+HAVE_MIPS32 equ 0
+HAVE_MIPS64 equ 0
+HAVE_MMX equ 1
+HAVE_MSA equ 0
+HAVE_NEON equ 0
+HAVE_SSE equ 1
+HAVE_SSE2 equ 1
+HAVE_SSE3 equ 1
+HAVE_SSE4_1 equ 1
+HAVE_SSE4_2 equ 1
+HAVE_SSSE3 equ 1
+HAVE_VSX equ 0
+HAVE_WXWIDGETS equ 0
diff --git a/media/libaom/config/win/ia32/config/aom_config.h b/media/libaom/config/win/ia32/config/aom_config.h
new file mode 100644
index 000000000..090a45577
--- /dev/null
+++ b/media/libaom/config/win/ia32/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 0
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 1
+#define ARCH_X86_64 0
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 0
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 1
+#define HAVE_AVX2 1
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 1
+#define HAVE_MSA 0
+#define HAVE_NEON 0
+#define HAVE_SSE 1
+#define HAVE_SSE2 1
+#define HAVE_SSE3 1
+#define HAVE_SSE4_1 1
+#define HAVE_SSE4_2 1
+#define HAVE_SSSE3 1
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
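
Editorial note on aom_config.h / aom_config.asm above: the HAVE_* macros are build-time switches that decide whether a SIMD variant is compiled into the library at all, while CONFIG_RUNTIME_CPU_DETECT enables choosing among the compiled variants at startup (the mechanism sketched earlier). A minimal sketch of how the two layers combine follows; the macro and helper names here (HAVE_AVX2_SIM, cpu_has_avx2_sim) are illustrative assumptions, not the real configuration symbols.

/* Build-time gating plus runtime selection, in miniature. */
#include <stdio.h>

#define HAVE_AVX2_SIM 1  /* stands in for HAVE_AVX2 from aom_config.h */

static int add_c(int a, int b)    { return a + b; }
#if HAVE_AVX2_SIM
static int add_avx2(int a, int b) { return a + b; }  /* compiled only when enabled */
#endif

static int cpu_has_avx2_sim(void) { return 0; }  /* pretend runtime probe */

int main(void)
{
  int (*add)(int, int) = add_c;        /* portable default */
#if HAVE_AVX2_SIM
  if (cpu_has_avx2_sim())
    add = add_avx2;                    /* runtime pick among compiled paths */
#endif
  printf("%d\n", add(2, 3));           /* prints 5 either way */
  return 0;
}

The .asm file that precedes the header simply mirrors the same flags as NASM `equ` constants so that hand-written assembly sources are gated by the identical configuration.
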
diff --git a/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h b/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..5b3762f10
--- /dev/null
+++ b/media/libaom/config/win/ia32/config/aom_dsp_rtcd.h
@@ -0,0 +1,2379 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
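+/*
+ * High bit-depth variants start here; the trailing bd argument carries the
+ * bit depth (8, 10, or 12). The blend_a64 kernels blend two sources through
+ * an alpha mask whose weights run from 0 to 64 (hence "a64"), in full-mask,
+ * horizontal-only (hmask), and vertical-only (vmask) forms, with an SSE4.1
+ * fast path where one exists.
+ */
+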
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
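+/*
+ * The intra DC predictor families below (dc_128, dc_left, dc_top, and plain
+ * dc) are instantiated for every AV1 block shape from 2x2 up to 64x64. Only
+ * the shapes with an _sse2 prototype get runtime dispatch; the remaining
+ * shapes are #defined straight to their C implementation.
+ */
+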
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
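+/*
+ * High bit-depth deblocking loop filters: horizontal and vertical edges for
+ * the 4-, 6-, 8-, and 14-pixel filter variants. The _dual versions process
+ * two filtering units per call, each with its own blimit/limit/thresh
+ * triple, and pick up AVX2 fast paths where available.
+ */
+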
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_6)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_6_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_6)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_6_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
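+/*
+ * The high bit-depth Paeth and smooth-family predictors that follow have
+ * only C implementations here, so every block size is #defined straight to
+ * its _c function and no RTCD function pointer is emitted for them.
+ */
+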
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_14_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_6_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_14_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_6_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+
+void aom_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+ aom_blend_a64_mask = aom_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+ aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+ aom_convolve8_horiz = aom_convolve8_horiz_c;
+ if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+ aom_convolve8_vert = aom_convolve8_vert_c;
+ if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+ aom_convolve_copy = aom_convolve_copy_c;
+ if (flags & HAS_SSE2) aom_convolve_copy = aom_convolve_copy_sse2;
+ aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_sse2;
+ aom_dc_128_predictor_16x32 = aom_dc_128_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x32 = aom_dc_128_predictor_16x32_sse2;
+ aom_dc_128_predictor_16x4 = aom_dc_128_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x4 = aom_dc_128_predictor_16x4_sse2;
+ aom_dc_128_predictor_16x64 = aom_dc_128_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x64 = aom_dc_128_predictor_16x64_sse2;
+ aom_dc_128_predictor_16x8 = aom_dc_128_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x8 = aom_dc_128_predictor_16x8_sse2;
+ aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+ aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+ aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+ aom_dc_128_predictor_32x8 = aom_dc_128_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x8 = aom_dc_128_predictor_32x8_sse2;
+ aom_dc_128_predictor_4x16 = aom_dc_128_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x16 = aom_dc_128_predictor_4x16_sse2;
+ aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_sse2;
+ aom_dc_128_predictor_4x8 = aom_dc_128_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x8 = aom_dc_128_predictor_4x8_sse2;
+ aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+ aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+ aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+ aom_dc_128_predictor_8x16 = aom_dc_128_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x16 = aom_dc_128_predictor_8x16_sse2;
+ aom_dc_128_predictor_8x32 = aom_dc_128_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x32 = aom_dc_128_predictor_8x32_sse2;
+ aom_dc_128_predictor_8x4 = aom_dc_128_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x4 = aom_dc_128_predictor_8x4_sse2;
+ aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_sse2;
+ aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_sse2;
+ aom_dc_left_predictor_16x32 = aom_dc_left_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x32 = aom_dc_left_predictor_16x32_sse2;
+ aom_dc_left_predictor_16x4 = aom_dc_left_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x4 = aom_dc_left_predictor_16x4_sse2;
+ aom_dc_left_predictor_16x64 = aom_dc_left_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x64 = aom_dc_left_predictor_16x64_sse2;
+ aom_dc_left_predictor_16x8 = aom_dc_left_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x8 = aom_dc_left_predictor_16x8_sse2;
+ aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+ aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+ aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+ aom_dc_left_predictor_32x8 = aom_dc_left_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x8 = aom_dc_left_predictor_32x8_sse2;
+ aom_dc_left_predictor_4x16 = aom_dc_left_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x16 = aom_dc_left_predictor_4x16_sse2;
+ aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_sse2;
+ aom_dc_left_predictor_4x8 = aom_dc_left_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x8 = aom_dc_left_predictor_4x8_sse2;
+ aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+ aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+ aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+ aom_dc_left_predictor_8x16 = aom_dc_left_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x16 = aom_dc_left_predictor_8x16_sse2;
+ aom_dc_left_predictor_8x32 = aom_dc_left_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x32 = aom_dc_left_predictor_8x32_sse2;
+ aom_dc_left_predictor_8x4 = aom_dc_left_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x4 = aom_dc_left_predictor_8x4_sse2;
+ aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_sse2;
+ aom_dc_predictor_16x16 = aom_dc_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x16 = aom_dc_predictor_16x16_sse2;
+ aom_dc_predictor_16x32 = aom_dc_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x32 = aom_dc_predictor_16x32_sse2;
+ aom_dc_predictor_16x4 = aom_dc_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x4 = aom_dc_predictor_16x4_sse2;
+ aom_dc_predictor_16x64 = aom_dc_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x64 = aom_dc_predictor_16x64_sse2;
+ aom_dc_predictor_16x8 = aom_dc_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x8 = aom_dc_predictor_16x8_sse2;
+ aom_dc_predictor_32x16 = aom_dc_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+ aom_dc_predictor_32x32 = aom_dc_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+ aom_dc_predictor_32x64 = aom_dc_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+ aom_dc_predictor_32x8 = aom_dc_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x8 = aom_dc_predictor_32x8_sse2;
+ aom_dc_predictor_4x16 = aom_dc_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x16 = aom_dc_predictor_4x16_sse2;
+ aom_dc_predictor_4x4 = aom_dc_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x4 = aom_dc_predictor_4x4_sse2;
+ aom_dc_predictor_4x8 = aom_dc_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x8 = aom_dc_predictor_4x8_sse2;
+ aom_dc_predictor_64x16 = aom_dc_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+ aom_dc_predictor_64x32 = aom_dc_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+ aom_dc_predictor_64x64 = aom_dc_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+ aom_dc_predictor_8x16 = aom_dc_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x16 = aom_dc_predictor_8x16_sse2;
+ aom_dc_predictor_8x32 = aom_dc_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x32 = aom_dc_predictor_8x32_sse2;
+ aom_dc_predictor_8x4 = aom_dc_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x4 = aom_dc_predictor_8x4_sse2;
+ aom_dc_predictor_8x8 = aom_dc_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x8 = aom_dc_predictor_8x8_sse2;
+ aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_sse2;
+ aom_dc_top_predictor_16x32 = aom_dc_top_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x32 = aom_dc_top_predictor_16x32_sse2;
+ aom_dc_top_predictor_16x4 = aom_dc_top_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x4 = aom_dc_top_predictor_16x4_sse2;
+ aom_dc_top_predictor_16x64 = aom_dc_top_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x64 = aom_dc_top_predictor_16x64_sse2;
+ aom_dc_top_predictor_16x8 = aom_dc_top_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x8 = aom_dc_top_predictor_16x8_sse2;
+ aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+ aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+ aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+ aom_dc_top_predictor_32x8 = aom_dc_top_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x8 = aom_dc_top_predictor_32x8_sse2;
+ aom_dc_top_predictor_4x16 = aom_dc_top_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x16 = aom_dc_top_predictor_4x16_sse2;
+ aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_sse2;
+ aom_dc_top_predictor_4x8 = aom_dc_top_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x8 = aom_dc_top_predictor_4x8_sse2;
+ aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+ aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+ aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+ aom_dc_top_predictor_8x16 = aom_dc_top_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x16 = aom_dc_top_predictor_8x16_sse2;
+ aom_dc_top_predictor_8x32 = aom_dc_top_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x32 = aom_dc_top_predictor_8x32_sse2;
+ aom_dc_top_predictor_8x4 = aom_dc_top_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x4 = aom_dc_top_predictor_8x4_sse2;
+ aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_sse2;
+ aom_h_predictor_16x16 = aom_h_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x16 = aom_h_predictor_16x16_sse2;
+ aom_h_predictor_16x32 = aom_h_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x32 = aom_h_predictor_16x32_sse2;
+ aom_h_predictor_16x4 = aom_h_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x4 = aom_h_predictor_16x4_sse2;
+ aom_h_predictor_16x64 = aom_h_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x64 = aom_h_predictor_16x64_sse2;
+ aom_h_predictor_16x8 = aom_h_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x8 = aom_h_predictor_16x8_sse2;
+ aom_h_predictor_32x16 = aom_h_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x16 = aom_h_predictor_32x16_sse2;
+ aom_h_predictor_32x32 = aom_h_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+ aom_h_predictor_32x64 = aom_h_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x64 = aom_h_predictor_32x64_sse2;
+ aom_h_predictor_32x8 = aom_h_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x8 = aom_h_predictor_32x8_sse2;
+ aom_h_predictor_4x16 = aom_h_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x16 = aom_h_predictor_4x16_sse2;
+ aom_h_predictor_4x4 = aom_h_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x4 = aom_h_predictor_4x4_sse2;
+ aom_h_predictor_4x8 = aom_h_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x8 = aom_h_predictor_4x8_sse2;
+ aom_h_predictor_64x16 = aom_h_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x16 = aom_h_predictor_64x16_sse2;
+ aom_h_predictor_64x32 = aom_h_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x32 = aom_h_predictor_64x32_sse2;
+ aom_h_predictor_64x64 = aom_h_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x64 = aom_h_predictor_64x64_sse2;
+ aom_h_predictor_8x16 = aom_h_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x16 = aom_h_predictor_8x16_sse2;
+ aom_h_predictor_8x32 = aom_h_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x32 = aom_h_predictor_8x32_sse2;
+ aom_h_predictor_8x4 = aom_h_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x4 = aom_h_predictor_8x4_sse2;
+ aom_h_predictor_8x8 = aom_h_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x8 = aom_h_predictor_8x8_sse2;
+ aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+ aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+ aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+ aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_c;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+ aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_c;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+ aom_highbd_convolve_copy = aom_highbd_convolve_copy_c;
+ if (flags & HAS_SSE2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+ aom_highbd_dc_128_predictor_16x16 = aom_highbd_dc_128_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x16 = aom_highbd_dc_128_predictor_16x16_sse2;
+ aom_highbd_dc_128_predictor_16x32 = aom_highbd_dc_128_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x32 = aom_highbd_dc_128_predictor_16x32_sse2;
+ aom_highbd_dc_128_predictor_16x8 = aom_highbd_dc_128_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x8 = aom_highbd_dc_128_predictor_16x8_sse2;
+ aom_highbd_dc_128_predictor_32x16 = aom_highbd_dc_128_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_32x16 = aom_highbd_dc_128_predictor_32x16_sse2;
+ aom_highbd_dc_128_predictor_32x32 = aom_highbd_dc_128_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_32x32 = aom_highbd_dc_128_predictor_32x32_sse2;
+ aom_highbd_dc_128_predictor_4x4 = aom_highbd_dc_128_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_4x4 = aom_highbd_dc_128_predictor_4x4_sse2;
+ aom_highbd_dc_128_predictor_4x8 = aom_highbd_dc_128_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_4x8 = aom_highbd_dc_128_predictor_4x8_sse2;
+ aom_highbd_dc_128_predictor_8x16 = aom_highbd_dc_128_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x16 = aom_highbd_dc_128_predictor_8x16_sse2;
+ aom_highbd_dc_128_predictor_8x4 = aom_highbd_dc_128_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x4 = aom_highbd_dc_128_predictor_8x4_sse2;
+ aom_highbd_dc_128_predictor_8x8 = aom_highbd_dc_128_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x8 = aom_highbd_dc_128_predictor_8x8_sse2;
+ aom_highbd_dc_left_predictor_16x16 = aom_highbd_dc_left_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x16 = aom_highbd_dc_left_predictor_16x16_sse2;
+ aom_highbd_dc_left_predictor_16x32 = aom_highbd_dc_left_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x32 = aom_highbd_dc_left_predictor_16x32_sse2;
+ aom_highbd_dc_left_predictor_16x8 = aom_highbd_dc_left_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x8 = aom_highbd_dc_left_predictor_16x8_sse2;
+ aom_highbd_dc_left_predictor_32x16 = aom_highbd_dc_left_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_32x16 = aom_highbd_dc_left_predictor_32x16_sse2;
+ aom_highbd_dc_left_predictor_32x32 = aom_highbd_dc_left_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_32x32 = aom_highbd_dc_left_predictor_32x32_sse2;
+ aom_highbd_dc_left_predictor_4x4 = aom_highbd_dc_left_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_4x4 = aom_highbd_dc_left_predictor_4x4_sse2;
+ aom_highbd_dc_left_predictor_4x8 = aom_highbd_dc_left_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_4x8 = aom_highbd_dc_left_predictor_4x8_sse2;
+ aom_highbd_dc_left_predictor_8x16 = aom_highbd_dc_left_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x16 = aom_highbd_dc_left_predictor_8x16_sse2;
+ aom_highbd_dc_left_predictor_8x4 = aom_highbd_dc_left_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x4 = aom_highbd_dc_left_predictor_8x4_sse2;
+ aom_highbd_dc_left_predictor_8x8 = aom_highbd_dc_left_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x8 = aom_highbd_dc_left_predictor_8x8_sse2;
+ aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_sse2;
+ aom_highbd_dc_predictor_16x32 = aom_highbd_dc_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x32 = aom_highbd_dc_predictor_16x32_sse2;
+ aom_highbd_dc_predictor_16x8 = aom_highbd_dc_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x8 = aom_highbd_dc_predictor_16x8_sse2;
+ aom_highbd_dc_predictor_32x16 = aom_highbd_dc_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_32x16 = aom_highbd_dc_predictor_32x16_sse2;
+ aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_sse2;
+ aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_sse2;
+ aom_highbd_dc_predictor_4x8 = aom_highbd_dc_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_4x8 = aom_highbd_dc_predictor_4x8_sse2;
+ aom_highbd_dc_predictor_8x16 = aom_highbd_dc_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x16 = aom_highbd_dc_predictor_8x16_sse2;
+ aom_highbd_dc_predictor_8x4 = aom_highbd_dc_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x4 = aom_highbd_dc_predictor_8x4_sse2;
+ aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_sse2;
+ aom_highbd_dc_top_predictor_16x16 = aom_highbd_dc_top_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x16 = aom_highbd_dc_top_predictor_16x16_sse2;
+ aom_highbd_dc_top_predictor_16x32 = aom_highbd_dc_top_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x32 = aom_highbd_dc_top_predictor_16x32_sse2;
+ aom_highbd_dc_top_predictor_16x8 = aom_highbd_dc_top_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x8 = aom_highbd_dc_top_predictor_16x8_sse2;
+ aom_highbd_dc_top_predictor_32x16 = aom_highbd_dc_top_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_32x16 = aom_highbd_dc_top_predictor_32x16_sse2;
+ aom_highbd_dc_top_predictor_32x32 = aom_highbd_dc_top_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_32x32 = aom_highbd_dc_top_predictor_32x32_sse2;
+ aom_highbd_dc_top_predictor_4x4 = aom_highbd_dc_top_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_4x4 = aom_highbd_dc_top_predictor_4x4_sse2;
+ aom_highbd_dc_top_predictor_4x8 = aom_highbd_dc_top_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_4x8 = aom_highbd_dc_top_predictor_4x8_sse2;
+ aom_highbd_dc_top_predictor_8x16 = aom_highbd_dc_top_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x16 = aom_highbd_dc_top_predictor_8x16_sse2;
+ aom_highbd_dc_top_predictor_8x4 = aom_highbd_dc_top_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x4 = aom_highbd_dc_top_predictor_8x4_sse2;
+ aom_highbd_dc_top_predictor_8x8 = aom_highbd_dc_top_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x8 = aom_highbd_dc_top_predictor_8x8_sse2;
+ aom_highbd_h_predictor_16x16 = aom_highbd_h_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x16 = aom_highbd_h_predictor_16x16_sse2;
+ aom_highbd_h_predictor_16x32 = aom_highbd_h_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x32 = aom_highbd_h_predictor_16x32_sse2;
+ aom_highbd_h_predictor_16x8 = aom_highbd_h_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x8 = aom_highbd_h_predictor_16x8_sse2;
+ aom_highbd_h_predictor_32x16 = aom_highbd_h_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_32x16 = aom_highbd_h_predictor_32x16_sse2;
+ aom_highbd_h_predictor_32x32 = aom_highbd_h_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_32x32 = aom_highbd_h_predictor_32x32_sse2;
+ aom_highbd_h_predictor_4x4 = aom_highbd_h_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_4x4 = aom_highbd_h_predictor_4x4_sse2;
+ aom_highbd_h_predictor_4x8 = aom_highbd_h_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_4x8 = aom_highbd_h_predictor_4x8_sse2;
+ aom_highbd_h_predictor_8x16 = aom_highbd_h_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x16 = aom_highbd_h_predictor_8x16_sse2;
+ aom_highbd_h_predictor_8x4 = aom_highbd_h_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x4 = aom_highbd_h_predictor_8x4_sse2;
+ aom_highbd_h_predictor_8x8 = aom_highbd_h_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x8 = aom_highbd_h_predictor_8x8_sse2;
+ aom_highbd_lpf_horizontal_14 = aom_highbd_lpf_horizontal_14_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_14 = aom_highbd_lpf_horizontal_14_sse2;
+ aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+ aom_highbd_lpf_horizontal_4 = aom_highbd_lpf_horizontal_4_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_4 = aom_highbd_lpf_horizontal_4_sse2;
+ aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+ aom_highbd_lpf_horizontal_6 = aom_highbd_lpf_horizontal_6_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_6 = aom_highbd_lpf_horizontal_6_sse2;
+ aom_highbd_lpf_horizontal_6_dual = aom_highbd_lpf_horizontal_6_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_6_dual = aom_highbd_lpf_horizontal_6_dual_sse2;
+ aom_highbd_lpf_horizontal_8 = aom_highbd_lpf_horizontal_8_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_8 = aom_highbd_lpf_horizontal_8_sse2;
+ aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+ aom_highbd_lpf_vertical_14 = aom_highbd_lpf_vertical_14_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_14 = aom_highbd_lpf_vertical_14_sse2;
+ aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+ aom_highbd_lpf_vertical_4 = aom_highbd_lpf_vertical_4_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_4 = aom_highbd_lpf_vertical_4_sse2;
+ aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+ aom_highbd_lpf_vertical_6 = aom_highbd_lpf_vertical_6_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_6 = aom_highbd_lpf_vertical_6_sse2;
+ aom_highbd_lpf_vertical_6_dual = aom_highbd_lpf_vertical_6_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_6_dual = aom_highbd_lpf_vertical_6_dual_sse2;
+ aom_highbd_lpf_vertical_8 = aom_highbd_lpf_vertical_8_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_8 = aom_highbd_lpf_vertical_8_sse2;
+ aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+ aom_highbd_v_predictor_16x16 = aom_highbd_v_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x16 = aom_highbd_v_predictor_16x16_sse2;
+ aom_highbd_v_predictor_16x32 = aom_highbd_v_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x32 = aom_highbd_v_predictor_16x32_sse2;
+ aom_highbd_v_predictor_16x8 = aom_highbd_v_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x8 = aom_highbd_v_predictor_16x8_sse2;
+ aom_highbd_v_predictor_32x16 = aom_highbd_v_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_32x16 = aom_highbd_v_predictor_32x16_sse2;
+ aom_highbd_v_predictor_32x32 = aom_highbd_v_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_32x32 = aom_highbd_v_predictor_32x32_sse2;
+ aom_highbd_v_predictor_4x4 = aom_highbd_v_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_4x4 = aom_highbd_v_predictor_4x4_sse2;
+ aom_highbd_v_predictor_4x8 = aom_highbd_v_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_4x8 = aom_highbd_v_predictor_4x8_sse2;
+ aom_highbd_v_predictor_8x16 = aom_highbd_v_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x16 = aom_highbd_v_predictor_8x16_sse2;
+ aom_highbd_v_predictor_8x4 = aom_highbd_v_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x4 = aom_highbd_v_predictor_8x4_sse2;
+ aom_highbd_v_predictor_8x8 = aom_highbd_v_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x8 = aom_highbd_v_predictor_8x8_sse2;
+ aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+ if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+ aom_lpf_horizontal_14 = aom_lpf_horizontal_14_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_14 = aom_lpf_horizontal_14_sse2;
+ aom_lpf_horizontal_14_dual = aom_lpf_horizontal_14_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_14_dual = aom_lpf_horizontal_14_dual_sse2;
+ aom_lpf_horizontal_4 = aom_lpf_horizontal_4_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_4 = aom_lpf_horizontal_4_sse2;
+ aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_sse2;
+ aom_lpf_horizontal_6 = aom_lpf_horizontal_6_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_6 = aom_lpf_horizontal_6_sse2;
+ aom_lpf_horizontal_6_dual = aom_lpf_horizontal_6_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_6_dual = aom_lpf_horizontal_6_dual_sse2;
+ aom_lpf_horizontal_8 = aom_lpf_horizontal_8_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_8 = aom_lpf_horizontal_8_sse2;
+ aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_sse2;
+ aom_lpf_vertical_14 = aom_lpf_vertical_14_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_14 = aom_lpf_vertical_14_sse2;
+ aom_lpf_vertical_14_dual = aom_lpf_vertical_14_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_14_dual = aom_lpf_vertical_14_dual_sse2;
+ aom_lpf_vertical_4 = aom_lpf_vertical_4_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_4 = aom_lpf_vertical_4_sse2;
+ aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_sse2;
+ aom_lpf_vertical_6 = aom_lpf_vertical_6_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_6 = aom_lpf_vertical_6_sse2;
+ aom_lpf_vertical_6_dual = aom_lpf_vertical_6_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_6_dual = aom_lpf_vertical_6_dual_sse2;
+ aom_lpf_vertical_8 = aom_lpf_vertical_8_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_8 = aom_lpf_vertical_8_sse2;
+ aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_sse2;
+ aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+ aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+ aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+ aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+ aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+ aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+ aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+ aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+ aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+ aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+ aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+ aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+ aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+ aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+ aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+ aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+ aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+ aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+ aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+ aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+ aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+ aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+ aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+ aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+ aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+ aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+ aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+ aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+ aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+ aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+ aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+ aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+ aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+ aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+ aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+ aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+ aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+ aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+ aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+ aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+ aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+ aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+ aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+ aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+ aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+ aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+ aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+ aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+ aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+ aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+ aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+ aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+ aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+ aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+ aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+ aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+ aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+ aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+ aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+ aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+ aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+ aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+ aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+ aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+ aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+ aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+ aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+ aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+ aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+ aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+ aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+ aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+ aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+ aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+ aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+ aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+ aom_v_predictor_16x16 = aom_v_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x16 = aom_v_predictor_16x16_sse2;
+ aom_v_predictor_16x32 = aom_v_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x32 = aom_v_predictor_16x32_sse2;
+ aom_v_predictor_16x4 = aom_v_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x4 = aom_v_predictor_16x4_sse2;
+ aom_v_predictor_16x64 = aom_v_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x64 = aom_v_predictor_16x64_sse2;
+ aom_v_predictor_16x8 = aom_v_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x8 = aom_v_predictor_16x8_sse2;
+ aom_v_predictor_32x16 = aom_v_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+ aom_v_predictor_32x32 = aom_v_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+ aom_v_predictor_32x64 = aom_v_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+ aom_v_predictor_32x8 = aom_v_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x8 = aom_v_predictor_32x8_sse2;
+ aom_v_predictor_4x16 = aom_v_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x16 = aom_v_predictor_4x16_sse2;
+ aom_v_predictor_4x4 = aom_v_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x4 = aom_v_predictor_4x4_sse2;
+ aom_v_predictor_4x8 = aom_v_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x8 = aom_v_predictor_4x8_sse2;
+ aom_v_predictor_64x16 = aom_v_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+ aom_v_predictor_64x32 = aom_v_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+ aom_v_predictor_64x64 = aom_v_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+ aom_v_predictor_8x16 = aom_v_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x16 = aom_v_predictor_8x16_sse2;
+ aom_v_predictor_8x32 = aom_v_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x32 = aom_v_predictor_8x32_sse2;
+ aom_v_predictor_8x4 = aom_v_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x4 = aom_v_predictor_8x4_sse2;
+ aom_v_predictor_8x8 = aom_v_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x8 = aom_v_predictor_8x8_sse2;
+ av1_round_shift_array = av1_round_shift_array_c;
+ if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
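
The generated header above wires up libaom's run-time CPU dispatch (RTCD): every aom_* entry point that has SIMD variants is declared as an RTCD_EXTERN function pointer, and setup_rtcd_internal() queries x86_simd_caps() once, then repoints each symbol from the portable _c implementation to the best _sse2/_ssse3/_sse4_1/_avx2 variant the CPU reports (entries with only a C version, such as aom_v_predictor_2x2, are instead bound at compile time with a #define). A minimal, self-contained sketch of the same pattern follows; the names my_op*, has_sse2 and setup_rtcd are illustrative and are not part of the libaom API.

/* Sketch of the RTCD dispatch pattern; names are hypothetical, not libaom's. */
#include <stdio.h>

static int has_sse2(void) { return 1; }   /* stand-in for x86_simd_caps() probing */

static void my_op_c(const unsigned char *src, unsigned char *dst, int n) {
  for (int i = 0; i < n; i++) dst[i] = src[i];   /* portable C fallback */
}
static void my_op_sse2(const unsigned char *src, unsigned char *dst, int n) {
  my_op_c(src, dst, n);                          /* would be the SIMD-optimized path */
}

/* The pointer every caller goes through, like the RTCD_EXTERN pointers above. */
static void (*my_op)(const unsigned char *src, unsigned char *dst, int n);

static void setup_rtcd(void) {
  my_op = my_op_c;                      /* default to the C implementation */
  if (has_sse2()) my_op = my_op_sse2;   /* upgrade when the CPU supports it */
}

int main(void) {
  unsigned char src[4] = {1, 2, 3, 4}, dst[4];
  setup_rtcd();
  my_op(src, dst, 4);
  printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
  return 0;
}
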
diff --git a/media/libaom/config/win/ia32/config/aom_scale_rtcd.h b/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..6b56795cd
--- /dev/null
+++ b/media/libaom/config/win/ia32/config/aom_scale_rtcd.h
@@ -0,0 +1,88 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/win/ia32/config/av1_rtcd.h b/media/libaom/config/win/ia32/config/av1_rtcd.h
new file mode 100644
index 000000000..a2a6f14ff
--- /dev/null
+++ b/media/libaom/config/win/ia32/config/av1_rtcd.h
@@ -0,0 +1,605 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_c
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_c
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+
+void av1_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ apply_selfguided_restoration = apply_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+ av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+ av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+ av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+ if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+ av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+ av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+ av1_convolve_2d_sr = av1_convolve_2d_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+ av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+ av1_convolve_x_sr = av1_convolve_x_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+ av1_convolve_y_sr = av1_convolve_y_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+ av1_filter_intra_edge = av1_filter_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+ av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+ av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+ av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_c;
+ if (flags & HAS_SSE2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+ av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+ av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+ av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+ av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+ av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+ av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+ av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+ av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+ av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+ av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+ av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+ av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+ av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+ av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+ av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+ av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+ av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+ av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+ if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+ av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+ av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+ av1_inv_txfm_add = av1_inv_txfm_add_c;
+ if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+ if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+ av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+ if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+ av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+ av1_jnt_convolve_x = av1_jnt_convolve_x_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+ av1_jnt_convolve_y = av1_jnt_convolve_y_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+ av1_selfguided_restoration = av1_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+ av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+ av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+ av1_warp_affine = av1_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+ av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_c;
+ if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+ if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+ cdef_filter_block = cdef_filter_block_c;
+ if (flags & HAS_SSE2) cdef_filter_block = cdef_filter_block_sse2;
+ if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+ if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+ if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+ cdef_find_dir = cdef_find_dir_c;
+ if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
+ if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+ if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+ cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+ cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+ cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+ cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+ cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+ cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+ copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
+ if (flags & HAS_SSE2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+ copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
+ if (flags & HAS_SSE2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+ get_predict_hbd_fn = get_predict_hbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+ get_predict_lbd_fn = get_predict_lbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+ get_subtract_average_fn = get_subtract_average_fn_c;
+ if (flags & HAS_SSE2) get_subtract_average_fn = get_subtract_average_fn_sse2;
+ if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
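The hunk above closes out a generated runtime-CPU-detection (rtcd) header: inside its setup routine every dispatchable symbol is first bound to its portable _c implementation and then rebound to progressively wider SIMD variants (SSE2, SSSE3, SSE4.1, AVX2) whenever the corresponding HAS_* flag is set, so the last matching assignment wins and the check order itself encodes the preference order. A minimal, self-contained sketch of that dispatch pattern is below; the my_kernel_* names and flag values are hypothetical stand-ins for illustration only and are not part of this diff.

#include <stdint.h>

/* Hypothetical feature bits standing in for the HAS_* flags used above. */
#define HAS_SSE2 (1 << 0)
#define HAS_AVX2 (1 << 1)

/* Portable baseline plus two SIMD specializations (bodies reduced to stubs). */
static void my_kernel_c(uint8_t *dst, const uint8_t *src, int w, int h)    { (void)dst; (void)src; (void)w; (void)h; }
static void my_kernel_sse2(uint8_t *dst, const uint8_t *src, int w, int h) { (void)dst; (void)src; (void)w; (void)h; }
static void my_kernel_avx2(uint8_t *dst, const uint8_t *src, int w, int h) { (void)dst; (void)src; (void)w; (void)h; }

/* The function pointer that callers invoke (the RTCD_EXTERN role). */
static void (*my_kernel)(uint8_t *dst, const uint8_t *src, int w, int h);

static void setup_rtcd_sketch(int flags)
{
  /* Start at the C fallback, then upgrade: later (wider) variants override earlier ones. */
  my_kernel = my_kernel_c;
  if (flags & HAS_SSE2) my_kernel = my_kernel_sse2;
  if (flags & HAS_AVX2) my_kernel = my_kernel_avx2;
}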
diff --git a/media/libaom/config/win/mingw32/config/aom_config.asm b/media/libaom/config/win/mingw32/config/aom_config.asm
new file mode 100644
index 000000000..6ae776c7c
--- /dev/null
+++ b/media/libaom/config/win/mingw32/config/aom_config.asm
@@ -0,0 +1,76 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+ARCH_ARM equ 0
+ARCH_MIPS equ 0
+ARCH_PPC equ 0
+ARCH_X86 equ 1
+ARCH_X86_64 equ 0
+CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_ACCOUNTING equ 0
+CONFIG_ANALYZER equ 0
+CONFIG_AV1_DECODER equ 1
+CONFIG_AV1_ENCODER equ 0
+CONFIG_BIG_ENDIAN equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_RD_STATS equ 0
+CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 1
+CONFIG_DIST_8X8 equ 0
+CONFIG_ENTROPY_STATS equ 0
+CONFIG_FILEOPTIONS equ 1
+CONFIG_FIX_GF_LENGTH equ 1
+CONFIG_FP_MB_STATS equ 0
+CONFIG_GCC equ 1
+CONFIG_GCOV equ 0
+CONFIG_GLOBAL_MOTION_SEARCH equ 1
+CONFIG_GPROF equ 0
+CONFIG_INSPECTION equ 0
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_INTER_STATS_ONLY equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_LOWBITDEPTH equ 1
+CONFIG_MAX_DECODE_PROFILE equ 2
+CONFIG_MISMATCH_DEBUG equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_NORMAL_TILE_MODE equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_PIC equ 0
+CONFIG_RD_DEBUG equ 0
+CONFIG_REDUCED_ENCODER_BORDER equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_SHARED equ 0
+CONFIG_SHARP_SETTINGS equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_STATIC equ 1
+CONFIG_WEBM_IO equ 0
+DECODE_HEIGHT_LIMIT equ 0
+DECODE_WIDTH_LIMIT equ 0
+HAVE_AVX equ 1
+HAVE_AVX2 equ 1
+HAVE_DSPR2 equ 0
+HAVE_FEXCEPT equ 1
+HAVE_MIPS32 equ 0
+HAVE_MIPS64 equ 0
+HAVE_MMX equ 1
+HAVE_MSA equ 0
+HAVE_NEON equ 0
+HAVE_SSE equ 1
+HAVE_SSE2 equ 1
+HAVE_SSE3 equ 1
+HAVE_SSE4_1 equ 1
+HAVE_SSE4_2 equ 1
+HAVE_SSSE3 equ 1
+HAVE_VSX equ 0
+HAVE_WXWIDGETS equ 0
diff --git a/media/libaom/config/win/mingw32/config/aom_config.h b/media/libaom/config/win/mingw32/config/aom_config.h
new file mode 100644
index 000000000..090a45577
--- /dev/null
+++ b/media/libaom/config/win/mingw32/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 0
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 1
+#define ARCH_X86_64 0
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 0
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 1
+#define HAVE_AVX2 1
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 1
+#define HAVE_MSA 0
+#define HAVE_NEON 0
+#define HAVE_SSE 1
+#define HAVE_SSE2 1
+#define HAVE_SSE3 1
+#define HAVE_SSE4_1 1
+#define HAVE_SSE4_2 1
+#define HAVE_SSSE3 1
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
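aom_config.h mirrors the .asm config above as C preprocessor constants, and these HAVE_*/CONFIG_* values shape the rtcd header that follows: with CONFIG_RUNTIME_CPU_DETECT set to 1, functions that have SIMD specializations are exposed as RTCD_EXTERN function pointers, while functions with only a C implementation collapse to a plain #define alias (as with the *_predictor_2x2 entries below). The following is a hedged sketch of how a consumer typically keys off these defines; some_kernel is a hypothetical name used for illustration, not an aom symbol.

#include <stdint.h>
#include "config/aom_config.h"

#if CONFIG_RUNTIME_CPU_DETECT
/* Runtime dispatch: the generated setup routine fills in this pointer once
 * CPU features have been probed. */
extern void (*some_kernel)(const uint8_t *src, uint8_t *dst, int w, int h);
#else
/* Detection disabled: bind the public name straight to the portable C path. */
void some_kernel_c(const uint8_t *src, uint8_t *dst, int w, int h);
#define some_kernel some_kernel_c
#endif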
diff --git a/media/libaom/config/win/mingw32/config/aom_dsp_rtcd.h b/media/libaom/config/win/mingw32/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..5b3762f10
--- /dev/null
+++ b/media/libaom/config/win/mingw32/config/aom_dsp_rtcd.h
@@ -0,0 +1,2379 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_128_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_left_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_dc_top_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_h_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_6)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_6_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_6)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_6_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8)(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_16x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_32x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_32x32)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_4x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_4x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x16)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x4)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+RTCD_EXTERN void (*aom_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_14_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_6_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_horizontal_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_horizontal_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_14)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_14_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_4)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_4_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_6)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_6_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*aom_lpf_vertical_8)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*aom_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+
+void aom_dsp_rtcd(void);
+
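+/*
+ * Illustrative note on the runtime CPU dispatch (RTCD) pattern used above:
+ * each aom_* symbol declared with RTCD_EXTERN is a function pointer that is
+ * resolved at run time.  In the translation unit that defines RTCD_C,
+ * setup_rtcd_internal() below first points every entry at its plain C
+ * implementation and then, based on the feature bits returned by
+ * x86_simd_caps(), upgrades it to the best available SIMD variant, e.g.
+ * (sketch of the pattern, mirroring the assignments further down):
+ *
+ *   aom_v_predictor_8x8 = aom_v_predictor_8x8_c;
+ *   if (flags & HAS_SSE2) aom_v_predictor_8x8 = aom_v_predictor_8x8_sse2;
+ *
+ * Callers then simply invoke aom_v_predictor_8x8(dst, stride, above, left)
+ * and transparently get the fastest implementation for the host CPU.
+ * Entries with only a C version are plain #defines and bypass dispatch.
+ */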
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+ aom_blend_a64_mask = aom_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+ aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+ aom_convolve8_horiz = aom_convolve8_horiz_c;
+ if (flags & HAS_SSE2) aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+ aom_convolve8_vert = aom_convolve8_vert_c;
+ if (flags & HAS_SSE2) aom_convolve8_vert = aom_convolve8_vert_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+ aom_convolve_copy = aom_convolve_copy_c;
+ if (flags & HAS_SSE2) aom_convolve_copy = aom_convolve_copy_sse2;
+ aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x16 = aom_dc_128_predictor_16x16_sse2;
+ aom_dc_128_predictor_16x32 = aom_dc_128_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x32 = aom_dc_128_predictor_16x32_sse2;
+ aom_dc_128_predictor_16x4 = aom_dc_128_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x4 = aom_dc_128_predictor_16x4_sse2;
+ aom_dc_128_predictor_16x64 = aom_dc_128_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x64 = aom_dc_128_predictor_16x64_sse2;
+ aom_dc_128_predictor_16x8 = aom_dc_128_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_16x8 = aom_dc_128_predictor_16x8_sse2;
+ aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+ aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+ aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+ aom_dc_128_predictor_32x8 = aom_dc_128_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_32x8 = aom_dc_128_predictor_32x8_sse2;
+ aom_dc_128_predictor_4x16 = aom_dc_128_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x16 = aom_dc_128_predictor_4x16_sse2;
+ aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x4 = aom_dc_128_predictor_4x4_sse2;
+ aom_dc_128_predictor_4x8 = aom_dc_128_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_4x8 = aom_dc_128_predictor_4x8_sse2;
+ aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+ aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+ aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+ aom_dc_128_predictor_8x16 = aom_dc_128_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x16 = aom_dc_128_predictor_8x16_sse2;
+ aom_dc_128_predictor_8x32 = aom_dc_128_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x32 = aom_dc_128_predictor_8x32_sse2;
+ aom_dc_128_predictor_8x4 = aom_dc_128_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x4 = aom_dc_128_predictor_8x4_sse2;
+ aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_128_predictor_8x8 = aom_dc_128_predictor_8x8_sse2;
+ aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x16 = aom_dc_left_predictor_16x16_sse2;
+ aom_dc_left_predictor_16x32 = aom_dc_left_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x32 = aom_dc_left_predictor_16x32_sse2;
+ aom_dc_left_predictor_16x4 = aom_dc_left_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x4 = aom_dc_left_predictor_16x4_sse2;
+ aom_dc_left_predictor_16x64 = aom_dc_left_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x64 = aom_dc_left_predictor_16x64_sse2;
+ aom_dc_left_predictor_16x8 = aom_dc_left_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_16x8 = aom_dc_left_predictor_16x8_sse2;
+ aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+ aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+ aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+ aom_dc_left_predictor_32x8 = aom_dc_left_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_32x8 = aom_dc_left_predictor_32x8_sse2;
+ aom_dc_left_predictor_4x16 = aom_dc_left_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x16 = aom_dc_left_predictor_4x16_sse2;
+ aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x4 = aom_dc_left_predictor_4x4_sse2;
+ aom_dc_left_predictor_4x8 = aom_dc_left_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_4x8 = aom_dc_left_predictor_4x8_sse2;
+ aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+ aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+ aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+ aom_dc_left_predictor_8x16 = aom_dc_left_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x16 = aom_dc_left_predictor_8x16_sse2;
+ aom_dc_left_predictor_8x32 = aom_dc_left_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x32 = aom_dc_left_predictor_8x32_sse2;
+ aom_dc_left_predictor_8x4 = aom_dc_left_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x4 = aom_dc_left_predictor_8x4_sse2;
+ aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_left_predictor_8x8 = aom_dc_left_predictor_8x8_sse2;
+ aom_dc_predictor_16x16 = aom_dc_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x16 = aom_dc_predictor_16x16_sse2;
+ aom_dc_predictor_16x32 = aom_dc_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x32 = aom_dc_predictor_16x32_sse2;
+ aom_dc_predictor_16x4 = aom_dc_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x4 = aom_dc_predictor_16x4_sse2;
+ aom_dc_predictor_16x64 = aom_dc_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x64 = aom_dc_predictor_16x64_sse2;
+ aom_dc_predictor_16x8 = aom_dc_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_16x8 = aom_dc_predictor_16x8_sse2;
+ aom_dc_predictor_32x16 = aom_dc_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+ aom_dc_predictor_32x32 = aom_dc_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+ aom_dc_predictor_32x64 = aom_dc_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+ aom_dc_predictor_32x8 = aom_dc_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_32x8 = aom_dc_predictor_32x8_sse2;
+ aom_dc_predictor_4x16 = aom_dc_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x16 = aom_dc_predictor_4x16_sse2;
+ aom_dc_predictor_4x4 = aom_dc_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x4 = aom_dc_predictor_4x4_sse2;
+ aom_dc_predictor_4x8 = aom_dc_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_4x8 = aom_dc_predictor_4x8_sse2;
+ aom_dc_predictor_64x16 = aom_dc_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+ aom_dc_predictor_64x32 = aom_dc_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+ aom_dc_predictor_64x64 = aom_dc_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+ aom_dc_predictor_8x16 = aom_dc_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x16 = aom_dc_predictor_8x16_sse2;
+ aom_dc_predictor_8x32 = aom_dc_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x32 = aom_dc_predictor_8x32_sse2;
+ aom_dc_predictor_8x4 = aom_dc_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x4 = aom_dc_predictor_8x4_sse2;
+ aom_dc_predictor_8x8 = aom_dc_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_predictor_8x8 = aom_dc_predictor_8x8_sse2;
+ aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x16 = aom_dc_top_predictor_16x16_sse2;
+ aom_dc_top_predictor_16x32 = aom_dc_top_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x32 = aom_dc_top_predictor_16x32_sse2;
+ aom_dc_top_predictor_16x4 = aom_dc_top_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x4 = aom_dc_top_predictor_16x4_sse2;
+ aom_dc_top_predictor_16x64 = aom_dc_top_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x64 = aom_dc_top_predictor_16x64_sse2;
+ aom_dc_top_predictor_16x8 = aom_dc_top_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_16x8 = aom_dc_top_predictor_16x8_sse2;
+ aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+ aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+ aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+ aom_dc_top_predictor_32x8 = aom_dc_top_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_32x8 = aom_dc_top_predictor_32x8_sse2;
+ aom_dc_top_predictor_4x16 = aom_dc_top_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x16 = aom_dc_top_predictor_4x16_sse2;
+ aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x4 = aom_dc_top_predictor_4x4_sse2;
+ aom_dc_top_predictor_4x8 = aom_dc_top_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_4x8 = aom_dc_top_predictor_4x8_sse2;
+ aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+ aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+ aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+ aom_dc_top_predictor_8x16 = aom_dc_top_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x16 = aom_dc_top_predictor_8x16_sse2;
+ aom_dc_top_predictor_8x32 = aom_dc_top_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x32 = aom_dc_top_predictor_8x32_sse2;
+ aom_dc_top_predictor_8x4 = aom_dc_top_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x4 = aom_dc_top_predictor_8x4_sse2;
+ aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_dc_top_predictor_8x8 = aom_dc_top_predictor_8x8_sse2;
+ aom_h_predictor_16x16 = aom_h_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x16 = aom_h_predictor_16x16_sse2;
+ aom_h_predictor_16x32 = aom_h_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x32 = aom_h_predictor_16x32_sse2;
+ aom_h_predictor_16x4 = aom_h_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x4 = aom_h_predictor_16x4_sse2;
+ aom_h_predictor_16x64 = aom_h_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x64 = aom_h_predictor_16x64_sse2;
+ aom_h_predictor_16x8 = aom_h_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_16x8 = aom_h_predictor_16x8_sse2;
+ aom_h_predictor_32x16 = aom_h_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x16 = aom_h_predictor_32x16_sse2;
+ aom_h_predictor_32x32 = aom_h_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+ aom_h_predictor_32x64 = aom_h_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x64 = aom_h_predictor_32x64_sse2;
+ aom_h_predictor_32x8 = aom_h_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_32x8 = aom_h_predictor_32x8_sse2;
+ aom_h_predictor_4x16 = aom_h_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x16 = aom_h_predictor_4x16_sse2;
+ aom_h_predictor_4x4 = aom_h_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x4 = aom_h_predictor_4x4_sse2;
+ aom_h_predictor_4x8 = aom_h_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_4x8 = aom_h_predictor_4x8_sse2;
+ aom_h_predictor_64x16 = aom_h_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x16 = aom_h_predictor_64x16_sse2;
+ aom_h_predictor_64x32 = aom_h_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x32 = aom_h_predictor_64x32_sse2;
+ aom_h_predictor_64x64 = aom_h_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_h_predictor_64x64 = aom_h_predictor_64x64_sse2;
+ aom_h_predictor_8x16 = aom_h_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x16 = aom_h_predictor_8x16_sse2;
+ aom_h_predictor_8x32 = aom_h_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x32 = aom_h_predictor_8x32_sse2;
+ aom_h_predictor_8x4 = aom_h_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x4 = aom_h_predictor_8x4_sse2;
+ aom_h_predictor_8x8 = aom_h_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_h_predictor_8x8 = aom_h_predictor_8x8_sse2;
+ aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+ aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+ aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+ aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_c;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+ aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_c;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+ aom_highbd_convolve_copy = aom_highbd_convolve_copy_c;
+ if (flags & HAS_SSE2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+ aom_highbd_dc_128_predictor_16x16 = aom_highbd_dc_128_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x16 = aom_highbd_dc_128_predictor_16x16_sse2;
+ aom_highbd_dc_128_predictor_16x32 = aom_highbd_dc_128_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x32 = aom_highbd_dc_128_predictor_16x32_sse2;
+ aom_highbd_dc_128_predictor_16x8 = aom_highbd_dc_128_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_16x8 = aom_highbd_dc_128_predictor_16x8_sse2;
+ aom_highbd_dc_128_predictor_32x16 = aom_highbd_dc_128_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_32x16 = aom_highbd_dc_128_predictor_32x16_sse2;
+ aom_highbd_dc_128_predictor_32x32 = aom_highbd_dc_128_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_32x32 = aom_highbd_dc_128_predictor_32x32_sse2;
+ aom_highbd_dc_128_predictor_4x4 = aom_highbd_dc_128_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_4x4 = aom_highbd_dc_128_predictor_4x4_sse2;
+ aom_highbd_dc_128_predictor_4x8 = aom_highbd_dc_128_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_4x8 = aom_highbd_dc_128_predictor_4x8_sse2;
+ aom_highbd_dc_128_predictor_8x16 = aom_highbd_dc_128_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x16 = aom_highbd_dc_128_predictor_8x16_sse2;
+ aom_highbd_dc_128_predictor_8x4 = aom_highbd_dc_128_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x4 = aom_highbd_dc_128_predictor_8x4_sse2;
+ aom_highbd_dc_128_predictor_8x8 = aom_highbd_dc_128_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_128_predictor_8x8 = aom_highbd_dc_128_predictor_8x8_sse2;
+ aom_highbd_dc_left_predictor_16x16 = aom_highbd_dc_left_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x16 = aom_highbd_dc_left_predictor_16x16_sse2;
+ aom_highbd_dc_left_predictor_16x32 = aom_highbd_dc_left_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x32 = aom_highbd_dc_left_predictor_16x32_sse2;
+ aom_highbd_dc_left_predictor_16x8 = aom_highbd_dc_left_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_16x8 = aom_highbd_dc_left_predictor_16x8_sse2;
+ aom_highbd_dc_left_predictor_32x16 = aom_highbd_dc_left_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_32x16 = aom_highbd_dc_left_predictor_32x16_sse2;
+ aom_highbd_dc_left_predictor_32x32 = aom_highbd_dc_left_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_32x32 = aom_highbd_dc_left_predictor_32x32_sse2;
+ aom_highbd_dc_left_predictor_4x4 = aom_highbd_dc_left_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_4x4 = aom_highbd_dc_left_predictor_4x4_sse2;
+ aom_highbd_dc_left_predictor_4x8 = aom_highbd_dc_left_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_4x8 = aom_highbd_dc_left_predictor_4x8_sse2;
+ aom_highbd_dc_left_predictor_8x16 = aom_highbd_dc_left_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x16 = aom_highbd_dc_left_predictor_8x16_sse2;
+ aom_highbd_dc_left_predictor_8x4 = aom_highbd_dc_left_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x4 = aom_highbd_dc_left_predictor_8x4_sse2;
+ aom_highbd_dc_left_predictor_8x8 = aom_highbd_dc_left_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_left_predictor_8x8 = aom_highbd_dc_left_predictor_8x8_sse2;
+ aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x16 = aom_highbd_dc_predictor_16x16_sse2;
+ aom_highbd_dc_predictor_16x32 = aom_highbd_dc_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x32 = aom_highbd_dc_predictor_16x32_sse2;
+ aom_highbd_dc_predictor_16x8 = aom_highbd_dc_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_16x8 = aom_highbd_dc_predictor_16x8_sse2;
+ aom_highbd_dc_predictor_32x16 = aom_highbd_dc_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_32x16 = aom_highbd_dc_predictor_32x16_sse2;
+ aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_32x32 = aom_highbd_dc_predictor_32x32_sse2;
+ aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_4x4 = aom_highbd_dc_predictor_4x4_sse2;
+ aom_highbd_dc_predictor_4x8 = aom_highbd_dc_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_4x8 = aom_highbd_dc_predictor_4x8_sse2;
+ aom_highbd_dc_predictor_8x16 = aom_highbd_dc_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x16 = aom_highbd_dc_predictor_8x16_sse2;
+ aom_highbd_dc_predictor_8x4 = aom_highbd_dc_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x4 = aom_highbd_dc_predictor_8x4_sse2;
+ aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_predictor_8x8 = aom_highbd_dc_predictor_8x8_sse2;
+ aom_highbd_dc_top_predictor_16x16 = aom_highbd_dc_top_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x16 = aom_highbd_dc_top_predictor_16x16_sse2;
+ aom_highbd_dc_top_predictor_16x32 = aom_highbd_dc_top_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x32 = aom_highbd_dc_top_predictor_16x32_sse2;
+ aom_highbd_dc_top_predictor_16x8 = aom_highbd_dc_top_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_16x8 = aom_highbd_dc_top_predictor_16x8_sse2;
+ aom_highbd_dc_top_predictor_32x16 = aom_highbd_dc_top_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_32x16 = aom_highbd_dc_top_predictor_32x16_sse2;
+ aom_highbd_dc_top_predictor_32x32 = aom_highbd_dc_top_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_32x32 = aom_highbd_dc_top_predictor_32x32_sse2;
+ aom_highbd_dc_top_predictor_4x4 = aom_highbd_dc_top_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_4x4 = aom_highbd_dc_top_predictor_4x4_sse2;
+ aom_highbd_dc_top_predictor_4x8 = aom_highbd_dc_top_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_4x8 = aom_highbd_dc_top_predictor_4x8_sse2;
+ aom_highbd_dc_top_predictor_8x16 = aom_highbd_dc_top_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x16 = aom_highbd_dc_top_predictor_8x16_sse2;
+ aom_highbd_dc_top_predictor_8x4 = aom_highbd_dc_top_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x4 = aom_highbd_dc_top_predictor_8x4_sse2;
+ aom_highbd_dc_top_predictor_8x8 = aom_highbd_dc_top_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_dc_top_predictor_8x8 = aom_highbd_dc_top_predictor_8x8_sse2;
+ aom_highbd_h_predictor_16x16 = aom_highbd_h_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x16 = aom_highbd_h_predictor_16x16_sse2;
+ aom_highbd_h_predictor_16x32 = aom_highbd_h_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x32 = aom_highbd_h_predictor_16x32_sse2;
+ aom_highbd_h_predictor_16x8 = aom_highbd_h_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_16x8 = aom_highbd_h_predictor_16x8_sse2;
+ aom_highbd_h_predictor_32x16 = aom_highbd_h_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_32x16 = aom_highbd_h_predictor_32x16_sse2;
+ aom_highbd_h_predictor_32x32 = aom_highbd_h_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_32x32 = aom_highbd_h_predictor_32x32_sse2;
+ aom_highbd_h_predictor_4x4 = aom_highbd_h_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_4x4 = aom_highbd_h_predictor_4x4_sse2;
+ aom_highbd_h_predictor_4x8 = aom_highbd_h_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_4x8 = aom_highbd_h_predictor_4x8_sse2;
+ aom_highbd_h_predictor_8x16 = aom_highbd_h_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x16 = aom_highbd_h_predictor_8x16_sse2;
+ aom_highbd_h_predictor_8x4 = aom_highbd_h_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x4 = aom_highbd_h_predictor_8x4_sse2;
+ aom_highbd_h_predictor_8x8 = aom_highbd_h_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_h_predictor_8x8 = aom_highbd_h_predictor_8x8_sse2;
+ aom_highbd_lpf_horizontal_14 = aom_highbd_lpf_horizontal_14_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_14 = aom_highbd_lpf_horizontal_14_sse2;
+ aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+ aom_highbd_lpf_horizontal_4 = aom_highbd_lpf_horizontal_4_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_4 = aom_highbd_lpf_horizontal_4_sse2;
+ aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+ aom_highbd_lpf_horizontal_6 = aom_highbd_lpf_horizontal_6_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_6 = aom_highbd_lpf_horizontal_6_sse2;
+ aom_highbd_lpf_horizontal_6_dual = aom_highbd_lpf_horizontal_6_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_6_dual = aom_highbd_lpf_horizontal_6_dual_sse2;
+ aom_highbd_lpf_horizontal_8 = aom_highbd_lpf_horizontal_8_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_8 = aom_highbd_lpf_horizontal_8_sse2;
+ aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+ aom_highbd_lpf_vertical_14 = aom_highbd_lpf_vertical_14_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_14 = aom_highbd_lpf_vertical_14_sse2;
+ aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+ aom_highbd_lpf_vertical_4 = aom_highbd_lpf_vertical_4_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_4 = aom_highbd_lpf_vertical_4_sse2;
+ aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+ aom_highbd_lpf_vertical_6 = aom_highbd_lpf_vertical_6_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_6 = aom_highbd_lpf_vertical_6_sse2;
+ aom_highbd_lpf_vertical_6_dual = aom_highbd_lpf_vertical_6_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_6_dual = aom_highbd_lpf_vertical_6_dual_sse2;
+ aom_highbd_lpf_vertical_8 = aom_highbd_lpf_vertical_8_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_8 = aom_highbd_lpf_vertical_8_sse2;
+ aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_c;
+ if (flags & HAS_SSE2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+ aom_highbd_v_predictor_16x16 = aom_highbd_v_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x16 = aom_highbd_v_predictor_16x16_sse2;
+ aom_highbd_v_predictor_16x32 = aom_highbd_v_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x32 = aom_highbd_v_predictor_16x32_sse2;
+ aom_highbd_v_predictor_16x8 = aom_highbd_v_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_16x8 = aom_highbd_v_predictor_16x8_sse2;
+ aom_highbd_v_predictor_32x16 = aom_highbd_v_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_32x16 = aom_highbd_v_predictor_32x16_sse2;
+ aom_highbd_v_predictor_32x32 = aom_highbd_v_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_32x32 = aom_highbd_v_predictor_32x32_sse2;
+ aom_highbd_v_predictor_4x4 = aom_highbd_v_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_4x4 = aom_highbd_v_predictor_4x4_sse2;
+ aom_highbd_v_predictor_4x8 = aom_highbd_v_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_4x8 = aom_highbd_v_predictor_4x8_sse2;
+ aom_highbd_v_predictor_8x16 = aom_highbd_v_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x16 = aom_highbd_v_predictor_8x16_sse2;
+ aom_highbd_v_predictor_8x4 = aom_highbd_v_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x4 = aom_highbd_v_predictor_8x4_sse2;
+ aom_highbd_v_predictor_8x8 = aom_highbd_v_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_highbd_v_predictor_8x8 = aom_highbd_v_predictor_8x8_sse2;
+ aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+ if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+ aom_lpf_horizontal_14 = aom_lpf_horizontal_14_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_14 = aom_lpf_horizontal_14_sse2;
+ aom_lpf_horizontal_14_dual = aom_lpf_horizontal_14_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_14_dual = aom_lpf_horizontal_14_dual_sse2;
+ aom_lpf_horizontal_4 = aom_lpf_horizontal_4_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_4 = aom_lpf_horizontal_4_sse2;
+ aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_4_dual = aom_lpf_horizontal_4_dual_sse2;
+ aom_lpf_horizontal_6 = aom_lpf_horizontal_6_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_6 = aom_lpf_horizontal_6_sse2;
+ aom_lpf_horizontal_6_dual = aom_lpf_horizontal_6_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_6_dual = aom_lpf_horizontal_6_dual_sse2;
+ aom_lpf_horizontal_8 = aom_lpf_horizontal_8_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_8 = aom_lpf_horizontal_8_sse2;
+ aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_horizontal_8_dual = aom_lpf_horizontal_8_dual_sse2;
+ aom_lpf_vertical_14 = aom_lpf_vertical_14_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_14 = aom_lpf_vertical_14_sse2;
+ aom_lpf_vertical_14_dual = aom_lpf_vertical_14_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_14_dual = aom_lpf_vertical_14_dual_sse2;
+ aom_lpf_vertical_4 = aom_lpf_vertical_4_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_4 = aom_lpf_vertical_4_sse2;
+ aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_4_dual = aom_lpf_vertical_4_dual_sse2;
+ aom_lpf_vertical_6 = aom_lpf_vertical_6_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_6 = aom_lpf_vertical_6_sse2;
+ aom_lpf_vertical_6_dual = aom_lpf_vertical_6_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_6_dual = aom_lpf_vertical_6_dual_sse2;
+ aom_lpf_vertical_8 = aom_lpf_vertical_8_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_8 = aom_lpf_vertical_8_sse2;
+ aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_c;
+ if (flags & HAS_SSE2) aom_lpf_vertical_8_dual = aom_lpf_vertical_8_dual_sse2;
+ aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+ aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+ aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+ aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+ aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+ aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+ aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+ aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+ aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+ aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+ aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+ aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+ aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+ aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+ aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+ aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+ aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+ aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+ aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+ aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+ aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+ aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+ aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+ aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+ aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+ aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+ aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+ aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+ aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+ aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+ aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+ aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+ aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+ aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+ aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+ aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+ aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+ aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+ aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+ aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+ aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+ aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+ aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+ aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+ aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+ aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+ aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+ aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+ aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+ aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+ aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+ aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+ aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+ aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+ aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+ aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+ aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+ aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+ aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+ aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+ aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+ aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+ aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+ aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+ aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+ aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+ aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+ aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+ aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+ aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+ aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+ aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+ aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+ aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+ aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+ aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+ aom_v_predictor_16x16 = aom_v_predictor_16x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x16 = aom_v_predictor_16x16_sse2;
+ aom_v_predictor_16x32 = aom_v_predictor_16x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x32 = aom_v_predictor_16x32_sse2;
+ aom_v_predictor_16x4 = aom_v_predictor_16x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x4 = aom_v_predictor_16x4_sse2;
+ aom_v_predictor_16x64 = aom_v_predictor_16x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x64 = aom_v_predictor_16x64_sse2;
+ aom_v_predictor_16x8 = aom_v_predictor_16x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_16x8 = aom_v_predictor_16x8_sse2;
+ aom_v_predictor_32x16 = aom_v_predictor_32x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+ aom_v_predictor_32x32 = aom_v_predictor_32x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+ aom_v_predictor_32x64 = aom_v_predictor_32x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+ aom_v_predictor_32x8 = aom_v_predictor_32x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_32x8 = aom_v_predictor_32x8_sse2;
+ aom_v_predictor_4x16 = aom_v_predictor_4x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x16 = aom_v_predictor_4x16_sse2;
+ aom_v_predictor_4x4 = aom_v_predictor_4x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x4 = aom_v_predictor_4x4_sse2;
+ aom_v_predictor_4x8 = aom_v_predictor_4x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_4x8 = aom_v_predictor_4x8_sse2;
+ aom_v_predictor_64x16 = aom_v_predictor_64x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+ aom_v_predictor_64x32 = aom_v_predictor_64x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+ aom_v_predictor_64x64 = aom_v_predictor_64x64_c;
+ if (flags & HAS_SSE2) aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+ aom_v_predictor_8x16 = aom_v_predictor_8x16_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x16 = aom_v_predictor_8x16_sse2;
+ aom_v_predictor_8x32 = aom_v_predictor_8x32_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x32 = aom_v_predictor_8x32_sse2;
+ aom_v_predictor_8x4 = aom_v_predictor_8x4_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x4 = aom_v_predictor_8x4_sse2;
+ aom_v_predictor_8x8 = aom_v_predictor_8x8_c;
+ if (flags & HAS_SSE2) aom_v_predictor_8x8 = aom_v_predictor_8x8_sse2;
+ av1_round_shift_array = av1_round_shift_array_c;
+ if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
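
The setup_rtcd_internal() body above follows libaom's run-time CPU dispatch (RTCD) pattern: each exported symbol is a function pointer that starts on the portable _c implementation and is swapped for an SSE2/SSSE3/SSE4.1/AVX2 kernel when x86_simd_caps() reports the matching capability bit. A minimal standalone sketch of the same idea is shown below; every name in it (HAS_FAST_PATH, add_c, add_fast, setup_dispatch) is a hypothetical illustration, not part of libaom.

#include <stdio.h>

/* Hypothetical capability flag, standing in for HAS_SSE2 / HAS_AVX2 above. */
#define HAS_FAST_PATH 0x01

/* Two interchangeable implementations of the same operation. */
static int add_c(int a, int b)    { return a + b; }   /* portable fallback */
static int add_fast(int a, int b) { return a + b; }   /* stands in for a SIMD kernel */

/* The dispatch pointer starts on the C fallback, like the _c assignments above. */
static int (*add_dispatch)(int, int) = add_c;

/* Mirrors setup_rtcd_internal(): pick the best implementation the CPU allows. */
static void setup_dispatch(int flags) {
  add_dispatch = add_c;
  if (flags & HAS_FAST_PATH) add_dispatch = add_fast;
}

int main(void) {
  setup_dispatch(HAS_FAST_PATH);        /* in libaom the flags come from x86_simd_caps() */
  printf("%d\n", add_dispatch(2, 3));   /* callers always go through the pointer */
  return 0;
}

Callers never name a specific kernel; they always go through the pointer, so a single binary can ship every variant and select the fastest one once at startup.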
diff --git a/media/libaom/config/win/mingw32/config/aom_scale_rtcd.h b/media/libaom/config/win/mingw32/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..6b56795cd
--- /dev/null
+++ b/media/libaom/config/win/mingw32/config/aom_scale_rtcd.h
@@ -0,0 +1,88 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/win/mingw32/config/av1_rtcd.h b/media/libaom/config/win/mingw32/config/av1_rtcd.h
new file mode 100644
index 000000000..a2a6f14ff
--- /dev/null
+++ b/media/libaom/config/win/mingw32/config/av1_rtcd.h
@@ -0,0 +1,605 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_c
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_c
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_c
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+
+void av1_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ apply_selfguided_restoration = apply_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+ av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+ av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+ av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+ if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+ av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+ av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+ av1_convolve_2d_sr = av1_convolve_2d_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+ av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+ av1_convolve_x_sr = av1_convolve_x_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+ av1_convolve_y_sr = av1_convolve_y_sr_c;
+ if (flags & HAS_SSE2) av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+ av1_filter_intra_edge = av1_filter_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+ av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+ av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+ av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_c;
+ if (flags & HAS_SSE2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+ av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+ av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+ av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+ av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+ av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+ av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+ av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+ av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+ av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+ av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+ av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+ av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+ av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+ av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+ av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+ av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+ av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+ av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+ if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+ av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+ av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+ av1_inv_txfm_add = av1_inv_txfm_add_c;
+ if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+ if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+ av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+ if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+ av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+ av1_jnt_convolve_x = av1_jnt_convolve_x_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+ av1_jnt_convolve_y = av1_jnt_convolve_y_c;
+ if (flags & HAS_SSE2) av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+ av1_selfguided_restoration = av1_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+ av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+ av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+ av1_warp_affine = av1_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+ av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_c;
+ if (flags & HAS_SSE2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+ if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+ cdef_filter_block = cdef_filter_block_c;
+ if (flags & HAS_SSE2) cdef_filter_block = cdef_filter_block_sse2;
+ if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+ if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+ if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+ cdef_find_dir = cdef_find_dir_c;
+ if (flags & HAS_SSE2) cdef_find_dir = cdef_find_dir_sse2;
+ if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+ if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+ cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+ cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+ cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+ cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+ cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+ cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+ copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_c;
+ if (flags & HAS_SSE2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+ copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_c;
+ if (flags & HAS_SSE2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+ get_predict_hbd_fn = get_predict_hbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+ get_predict_lbd_fn = get_predict_lbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+ get_subtract_average_fn = get_subtract_average_fn_c;
+ if (flags & HAS_SSE2) get_subtract_average_fn = get_subtract_average_fn_sse2;
+ if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
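
Note on the generated header above: it follows libaom's RTCD (run-time CPU detection) convention — each kernel is declared once per SIMD flavour, RTCD_EXTERN exposes a function pointer, and setup_rtcd_internal() points each pointer at the strongest variant reported by x86_simd_caps(). A minimal standalone C sketch of the same dispatch pattern, using hypothetical kernel names rather than anything from the patch, looks like this:

    /* Sketch of the RTCD dispatch pattern; names and flag values are illustrative. */
    #include <stdint.h>

    #define HAS_SSE4_1 0x01
    #define HAS_AVX2   0x02

    /* Plain-C reference implementation. */
    static void add_const_c(uint8_t *dst, int n, uint8_t v) {
      for (int i = 0; i < n; ++i) dst[i] = (uint8_t)(dst[i] + v);
    }

    /* Stand-ins for SIMD specializations, which would normally live in separate
     * translation units built with the matching instruction-set flags. */
    static void add_const_sse4_1(uint8_t *dst, int n, uint8_t v) { add_const_c(dst, n, v); }
    static void add_const_avx2(uint8_t *dst, int n, uint8_t v)   { add_const_c(dst, n, v); }

    /* Dispatched entry point, analogous to the RTCD_EXTERN pointers above. */
    static void (*add_const)(uint8_t *dst, int n, uint8_t v);

    /* Analogous to setup_rtcd_internal(): start from the C fallback and upgrade
     * to each stronger extension the CPU reports, so the last matching line wins. */
    static void setup_dispatch(int flags) {
      add_const = add_const_c;
      if (flags & HAS_SSE4_1) add_const = add_const_sse4_1;
      if (flags & HAS_AVX2)   add_const = add_const_avx2;
    }

After the setup routine has run once, every later call costs only an indirect jump, which is why the generated av1_rtcd() entry point only needs to execute once during library initialization.
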
diff --git a/media/libaom/config/win/mingw64/config/aom_config.asm b/media/libaom/config/win/mingw64/config/aom_config.asm
new file mode 100644
index 000000000..4ab2dedb4
--- /dev/null
+++ b/media/libaom/config/win/mingw64/config/aom_config.asm
@@ -0,0 +1,76 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+ARCH_ARM equ 0
+ARCH_MIPS equ 0
+ARCH_PPC equ 0
+ARCH_X86 equ 0
+ARCH_X86_64 equ 1
+CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_ACCOUNTING equ 0
+CONFIG_ANALYZER equ 0
+CONFIG_AV1_DECODER equ 1
+CONFIG_AV1_ENCODER equ 0
+CONFIG_BIG_ENDIAN equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_RD_STATS equ 0
+CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 1
+CONFIG_DIST_8X8 equ 0
+CONFIG_ENTROPY_STATS equ 0
+CONFIG_FILEOPTIONS equ 1
+CONFIG_FIX_GF_LENGTH equ 1
+CONFIG_FP_MB_STATS equ 0
+CONFIG_GCC equ 1
+CONFIG_GCOV equ 0
+CONFIG_GLOBAL_MOTION_SEARCH equ 1
+CONFIG_GPROF equ 0
+CONFIG_INSPECTION equ 0
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_INTER_STATS_ONLY equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_LOWBITDEPTH equ 1
+CONFIG_MAX_DECODE_PROFILE equ 2
+CONFIG_MISMATCH_DEBUG equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_NORMAL_TILE_MODE equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_PIC equ 0
+CONFIG_RD_DEBUG equ 0
+CONFIG_REDUCED_ENCODER_BORDER equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_SHARED equ 0
+CONFIG_SHARP_SETTINGS equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_STATIC equ 1
+CONFIG_WEBM_IO equ 0
+DECODE_HEIGHT_LIMIT equ 0
+DECODE_WIDTH_LIMIT equ 0
+HAVE_AVX equ 1
+HAVE_AVX2 equ 1
+HAVE_DSPR2 equ 0
+HAVE_FEXCEPT equ 1
+HAVE_MIPS32 equ 0
+HAVE_MIPS64 equ 0
+HAVE_MMX equ 1
+HAVE_MSA equ 0
+HAVE_NEON equ 0
+HAVE_SSE equ 1
+HAVE_SSE2 equ 1
+HAVE_SSE3 equ 1
+HAVE_SSE4_1 equ 1
+HAVE_SSE4_2 equ 1
+HAVE_SSSE3 equ 1
+HAVE_VSX equ 0
+HAVE_WXWIDGETS equ 0
diff --git a/media/libaom/config/win/mingw64/config/aom_config.h b/media/libaom/config/win/mingw64/config/aom_config.h
new file mode 100644
index 000000000..9a0be7cd1
--- /dev/null
+++ b/media/libaom/config/win/mingw64/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 0
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 0
+#define ARCH_X86_64 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 0
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 1
+#define HAVE_AVX2 1
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 1
+#define HAVE_MSA 0
+#define HAVE_NEON 0
+#define HAVE_SSE 1
+#define HAVE_SSE2 1
+#define HAVE_SSE3 1
+#define HAVE_SSE4_1 1
+#define HAVE_SSE4_2 1
+#define HAVE_SSSE3 1
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
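
Note on aom_config.h above: it freezes the build-time feature selection for this target (win/mingw64, x86-64, decoder-only, runtime CPU detection enabled), and the .asm file earlier in the patch carries the same values as equ symbols so assembly sources see an identical configuration. A hypothetical consumer would gate code on these macros at compile time, for example:

    /* Illustrative only: how generated CONFIG_* macros are typically consumed.
     * This function is a hypothetical example, not part of the patch. */
    #include "config/aom_config.h"

    int av1_decode_supported(void) {
    #if CONFIG_AV1_DECODER
      return 1;   /* this configuration compiles the AV1 decoder */
    #else
      return 0;   /* decoder support was configured out */
    #endif
    }

With CONFIG_RUNTIME_CPU_DETECT set to 1, the rtcd headers use the function-pointer indirection shown earlier; a build configured without it would instead bind each symbol directly to a single compiled-in variant.
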
diff --git a/media/libaom/config/win/mingw64/config/aom_dsp_rtcd.h b/media/libaom/config/win/mingw64/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..2856d4ede
--- /dev/null
+++ b/media/libaom/config/win/mingw64/config/aom_dsp_rtcd.h
@@ -0,0 +1,2001 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve_copy aom_convolve_copy_sse2
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x16 aom_dc_128_predictor_16x16_sse2
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x32 aom_dc_128_predictor_16x32_sse2
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x4 aom_dc_128_predictor_16x4_sse2
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x64 aom_dc_128_predictor_16x64_sse2
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x8 aom_dc_128_predictor_16x8_sse2
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x8 aom_dc_128_predictor_32x8_sse2
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x16 aom_dc_128_predictor_4x16_sse2
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x4 aom_dc_128_predictor_4x4_sse2
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x8 aom_dc_128_predictor_4x8_sse2
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x16 aom_dc_128_predictor_8x16_sse2
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x32 aom_dc_128_predictor_8x32_sse2
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x4 aom_dc_128_predictor_8x4_sse2
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x8 aom_dc_128_predictor_8x8_sse2
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x16 aom_dc_left_predictor_16x16_sse2
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x32 aom_dc_left_predictor_16x32_sse2
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x4 aom_dc_left_predictor_16x4_sse2
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x64 aom_dc_left_predictor_16x64_sse2
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x8 aom_dc_left_predictor_16x8_sse2
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x8 aom_dc_left_predictor_32x8_sse2
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x16 aom_dc_left_predictor_4x16_sse2
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x4 aom_dc_left_predictor_4x4_sse2
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x8 aom_dc_left_predictor_4x8_sse2
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x16 aom_dc_left_predictor_8x16_sse2
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x32 aom_dc_left_predictor_8x32_sse2
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x4 aom_dc_left_predictor_8x4_sse2
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x8 aom_dc_left_predictor_8x8_sse2
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x16 aom_dc_predictor_16x16_sse2
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x32 aom_dc_predictor_16x32_sse2
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x4 aom_dc_predictor_16x4_sse2
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x64 aom_dc_predictor_16x64_sse2
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x8 aom_dc_predictor_16x8_sse2
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x8 aom_dc_predictor_32x8_sse2
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x16 aom_dc_predictor_4x16_sse2
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x4 aom_dc_predictor_4x4_sse2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x8 aom_dc_predictor_4x8_sse2
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x16 aom_dc_predictor_8x16_sse2
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x32 aom_dc_predictor_8x32_sse2
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x4 aom_dc_predictor_8x4_sse2
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x8 aom_dc_predictor_8x8_sse2
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x16 aom_dc_top_predictor_16x16_sse2
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x32 aom_dc_top_predictor_16x32_sse2
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x4 aom_dc_top_predictor_16x4_sse2
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x64 aom_dc_top_predictor_16x64_sse2
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x8 aom_dc_top_predictor_16x8_sse2
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x8 aom_dc_top_predictor_32x8_sse2
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x16 aom_dc_top_predictor_4x16_sse2
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x4 aom_dc_top_predictor_4x4_sse2
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x8 aom_dc_top_predictor_4x8_sse2
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x16 aom_dc_top_predictor_8x16_sse2
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x32 aom_dc_top_predictor_8x32_sse2
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x4 aom_dc_top_predictor_8x4_sse2
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_sse2
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x16 aom_h_predictor_16x16_sse2
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x32 aom_h_predictor_16x32_sse2
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x4 aom_h_predictor_16x4_sse2
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x64 aom_h_predictor_16x64_sse2
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x8 aom_h_predictor_16x8_sse2
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x16 aom_h_predictor_32x16_sse2
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x64 aom_h_predictor_32x64_sse2
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x8 aom_h_predictor_32x8_sse2
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x16 aom_h_predictor_4x16_sse2
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x4 aom_h_predictor_4x4_sse2
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x8 aom_h_predictor_4x8_sse2
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x16 aom_h_predictor_64x16_sse2
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x32 aom_h_predictor_64x32_sse2
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x64 aom_h_predictor_64x64_sse2
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x16 aom_h_predictor_8x16_sse2
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x32 aom_h_predictor_8x32_sse2
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x4 aom_h_predictor_8x4_sse2
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x8 aom_h_predictor_8x8_sse2
+
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x16 aom_highbd_dc_128_predictor_16x16_sse2
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x32 aom_highbd_dc_128_predictor_16x32_sse2
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x8 aom_highbd_dc_128_predictor_16x8_sse2
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x16 aom_highbd_dc_128_predictor_32x16_sse2
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x32 aom_highbd_dc_128_predictor_32x32_sse2
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x4 aom_highbd_dc_128_predictor_4x4_sse2
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x8 aom_highbd_dc_128_predictor_4x8_sse2
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x16 aom_highbd_dc_128_predictor_8x16_sse2
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x4 aom_highbd_dc_128_predictor_8x4_sse2
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x8 aom_highbd_dc_128_predictor_8x8_sse2
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x16 aom_highbd_dc_left_predictor_16x16_sse2
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x32 aom_highbd_dc_left_predictor_16x32_sse2
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x8 aom_highbd_dc_left_predictor_16x8_sse2
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x16 aom_highbd_dc_left_predictor_32x16_sse2
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x32 aom_highbd_dc_left_predictor_32x32_sse2
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x4 aom_highbd_dc_left_predictor_4x4_sse2
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x8 aom_highbd_dc_left_predictor_4x8_sse2
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x16 aom_highbd_dc_left_predictor_8x16_sse2
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x4 aom_highbd_dc_left_predictor_8x4_sse2
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x8 aom_highbd_dc_left_predictor_8x8_sse2
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x16 aom_highbd_dc_predictor_16x16_sse2
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x32 aom_highbd_dc_predictor_16x32_sse2
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x8 aom_highbd_dc_predictor_16x8_sse2
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x16 aom_highbd_dc_predictor_32x16_sse2
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x32 aom_highbd_dc_predictor_32x32_sse2
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x4 aom_highbd_dc_predictor_4x4_sse2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x8 aom_highbd_dc_predictor_4x8_sse2
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x16 aom_highbd_dc_predictor_8x16_sse2
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x4 aom_highbd_dc_predictor_8x4_sse2
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x8 aom_highbd_dc_predictor_8x8_sse2
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x16 aom_highbd_dc_top_predictor_16x16_sse2
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x32 aom_highbd_dc_top_predictor_16x32_sse2
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x8 aom_highbd_dc_top_predictor_16x8_sse2
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x16 aom_highbd_dc_top_predictor_32x16_sse2
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x32 aom_highbd_dc_top_predictor_32x32_sse2
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x4 aom_highbd_dc_top_predictor_4x4_sse2
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x8 aom_highbd_dc_top_predictor_4x8_sse2
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x16 aom_highbd_dc_top_predictor_8x16_sse2
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x4 aom_highbd_dc_top_predictor_8x4_sse2
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_sse2
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x16 aom_highbd_h_predictor_16x16_sse2
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x32 aom_highbd_h_predictor_16x32_sse2
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x8 aom_highbd_h_predictor_16x8_sse2
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x16 aom_highbd_h_predictor_32x16_sse2
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x32 aom_highbd_h_predictor_32x32_sse2
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x4 aom_highbd_h_predictor_4x4_sse2
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x8 aom_highbd_h_predictor_4x8_sse2
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x16 aom_highbd_h_predictor_8x16_sse2
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x4 aom_highbd_h_predictor_8x4_sse2
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x8 aom_highbd_h_predictor_8x8_sse2
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_sse2
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_sse2
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_sse2
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_sse2
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_sse2
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_sse2
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_sse2
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x16 aom_highbd_v_predictor_16x16_sse2
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x32 aom_highbd_v_predictor_16x32_sse2
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x8 aom_highbd_v_predictor_16x8_sse2
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x16 aom_highbd_v_predictor_32x16_sse2
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x32 aom_highbd_v_predictor_32x32_sse2
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x4 aom_highbd_v_predictor_4x4_sse2
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x8 aom_highbd_v_predictor_4x8_sse2
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x16 aom_highbd_v_predictor_8x16_sse2
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x4 aom_highbd_v_predictor_8x4_sse2
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_sse2
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_sse2
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_sse2
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_sse2
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_sse2
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_6 aom_lpf_horizontal_6_sse2
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_sse2
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_sse2
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_sse2
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_14 aom_lpf_vertical_14_sse2
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_sse2
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_4 aom_lpf_vertical_4_sse2
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_sse2
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_6 aom_lpf_vertical_6_sse2
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_sse2
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_8 aom_lpf_vertical_8_sse2
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_sse2
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x16 aom_v_predictor_16x16_sse2
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x32 aom_v_predictor_16x32_sse2
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x4 aom_v_predictor_16x4_sse2
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x64 aom_v_predictor_16x64_sse2
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x8 aom_v_predictor_16x8_sse2
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x8 aom_v_predictor_32x8_sse2
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x16 aom_v_predictor_4x16_sse2
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x4 aom_v_predictor_4x4_sse2
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x8 aom_v_predictor_4x8_sse2
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x16 aom_v_predictor_8x16_sse2
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x32 aom_v_predictor_8x32_sse2
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x4 aom_v_predictor_8x4_sse2
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+
+void aom_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+ aom_blend_a64_mask = aom_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+ aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+ aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+ aom_convolve8_vert = aom_convolve8_vert_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+ aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+ aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+ aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+ aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+ aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+ aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+ aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+ aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+ aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+ aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+ aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+ aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+ aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+ aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+ aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+ aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+ aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+ aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+ aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+ aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+ aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+ aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+ aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+ aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+ aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+ aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+ aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+ aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+ aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+ aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+ aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+ aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+ aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+ aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+ aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+ aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+ aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+ aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+ if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+ aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+ aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+ aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+ aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+ aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+ aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+ aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+ aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+ aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+ aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+ aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+ aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+ aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+ aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+ aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+ aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+ aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+ aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+ aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+ aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+ aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+ aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+ aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+ aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+ aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+ aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+ aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+ aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+ aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+ aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+ aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+ aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+ aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+ aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+ aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+ aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+ aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+ aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+ aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+ aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+ aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+ aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+ aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+ aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+ aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+ aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+ aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+ aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+ aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+ aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+ aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+ aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+ aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+ aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+ aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+ aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+ aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+ aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+ aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+ aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+ aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+ aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+ aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+ aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+ aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+ aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+ aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+ aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+ aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+ aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+ aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+ aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+ aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+ aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+ aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+ aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+ aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+ aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+ aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+ aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+ aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+ aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+ av1_round_shift_array = av1_round_shift_array_c;
+ if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
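
The generated header above follows libaom's RTCD (run-time CPU detection) pattern: each routine gets a plain-C fallback, optional SIMD variants, and either a #define (when the best variant is known at build time) or an RTCD_EXTERN function pointer that setup_rtcd_internal() binds once from the flags returned by x86_simd_caps(). The short, self-contained C sketch below is editorial and not part of the patch; it only illustrates that dispatch pattern, and every name in it (my_predictor_*, MY_HAS_SSSE3, my_setup_rtcd) is a hypothetical stand-in rather than a real libaom symbol.

/* Hedged sketch of the RTCD dispatch pattern used by the generated headers:
 * a C fallback, an optional SIMD variant, a function pointer, and a one-time
 * setup that binds the pointer from detected CPU capability flags. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MY_HAS_SSSE3 0x1  /* simplified capability flag, hypothetical */

static void my_predictor_c(uint8_t *dst, ptrdiff_t stride,
                           const uint8_t *above, const uint8_t *left) {
  (void)stride; (void)above;
  dst[0] = left[0];  /* trivial placeholder work for the fallback */
}

static void my_predictor_ssse3(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  /* A real build would provide a SIMD implementation here. */
  my_predictor_c(dst, stride, above, left);
}

/* Analogue of an RTCD_EXTERN pointer declared in the header. */
static void (*my_predictor)(uint8_t *, ptrdiff_t,
                            const uint8_t *, const uint8_t *);

/* Analogue of setup_rtcd_internal(): start from the baseline and upgrade
 * whenever the corresponding capability flag is present. */
static void my_setup_rtcd(int flags) {
  my_predictor = my_predictor_c;
  if (flags & MY_HAS_SSSE3) my_predictor = my_predictor_ssse3;
}

int main(void) {
  uint8_t dst[1] = {0}, above[1] = {1}, left[1] = {2};
  my_setup_rtcd(MY_HAS_SSSE3);        /* pretend SSSE3 was detected */
  my_predictor(dst, 1, above, left);  /* dispatches to the ssse3 variant */
  printf("%d\n", dst[0]);
  return 0;
}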
diff --git a/media/libaom/config/win/mingw64/config/aom_scale_rtcd.h b/media/libaom/config/win/mingw64/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..6b56795cd
--- /dev/null
+++ b/media/libaom/config/win/mingw64/config/aom_scale_rtcd.h
@@ -0,0 +1,88 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/win/mingw64/config/av1_rtcd.h b/media/libaom/config/win/mingw64/config/av1_rtcd.h
new file mode 100644
index 000000000..d27318208
--- /dev/null
+++ b/media/libaom/config/win/mingw64/config/av1_rtcd.h
@@ -0,0 +1,594 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_sse2
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_sse2
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+
+void av1_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ apply_selfguided_restoration = apply_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+ av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+ av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+ av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+ if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+ av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+ av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+ av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+ av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+ av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+ av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+ av1_filter_intra_edge = av1_filter_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+ av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+ av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+ av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+ av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+ av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+ av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+ av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+ av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+ av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+ av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+ av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+ av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+ av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+ av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+ av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+ av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+ av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+ av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+ av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+ av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+ av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+ if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+ av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+ av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+ av1_inv_txfm_add = av1_inv_txfm_add_c;
+ if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+ if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+ av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+ if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+ av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+ av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+ av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+ av1_selfguided_restoration = av1_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+ av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+ av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+ av1_warp_affine = av1_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+ av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+ if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+ cdef_filter_block = cdef_filter_block_sse2;
+ if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+ if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+ if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+ cdef_find_dir = cdef_find_dir_sse2;
+ if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+ if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+ cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+ cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+ cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+ cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+ cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+ cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+ copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+ copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+ get_predict_hbd_fn = get_predict_hbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+ get_predict_lbd_fn = get_predict_lbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+ get_subtract_average_fn = get_subtract_average_fn_sse2;
+ if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/media/libaom/config/win/x64/config/aom_config.asm b/media/libaom/config/win/x64/config/aom_config.asm
new file mode 100644
index 000000000..4ab2dedb4
--- /dev/null
+++ b/media/libaom/config/win/x64/config/aom_config.asm
@@ -0,0 +1,76 @@
+;
+; Copyright (c) 2018, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+ARCH_ARM equ 0
+ARCH_MIPS equ 0
+ARCH_PPC equ 0
+ARCH_X86 equ 0
+ARCH_X86_64 equ 1
+CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_ACCOUNTING equ 0
+CONFIG_ANALYZER equ 0
+CONFIG_AV1_DECODER equ 1
+CONFIG_AV1_ENCODER equ 0
+CONFIG_BIG_ENDIAN equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
+CONFIG_COLLECT_INTER_MODE_RD_STATS equ 0
+CONFIG_COLLECT_RD_STATS equ 0
+CONFIG_DEBUG equ 0
+CONFIG_DENOISE equ 1
+CONFIG_DIST_8X8 equ 0
+CONFIG_ENTROPY_STATS equ 0
+CONFIG_FILEOPTIONS equ 1
+CONFIG_FIX_GF_LENGTH equ 1
+CONFIG_FP_MB_STATS equ 0
+CONFIG_GCC equ 1
+CONFIG_GCOV equ 0
+CONFIG_GLOBAL_MOTION_SEARCH equ 1
+CONFIG_GPROF equ 0
+CONFIG_INSPECTION equ 0
+CONFIG_INTERNAL_STATS equ 0
+CONFIG_INTER_STATS_ONLY equ 0
+CONFIG_LIBYUV equ 0
+CONFIG_LOWBITDEPTH equ 1
+CONFIG_MAX_DECODE_PROFILE equ 2
+CONFIG_MISMATCH_DEBUG equ 0
+CONFIG_MULTITHREAD equ 1
+CONFIG_NORMAL_TILE_MODE equ 0
+CONFIG_OS_SUPPORT equ 1
+CONFIG_PIC equ 0
+CONFIG_RD_DEBUG equ 0
+CONFIG_REDUCED_ENCODER_BORDER equ 0
+CONFIG_RUNTIME_CPU_DETECT equ 1
+CONFIG_SHARED equ 0
+CONFIG_SHARP_SETTINGS equ 0
+CONFIG_SIZE_LIMIT equ 0
+CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_STATIC equ 1
+CONFIG_WEBM_IO equ 0
+DECODE_HEIGHT_LIMIT equ 0
+DECODE_WIDTH_LIMIT equ 0
+HAVE_AVX equ 1
+HAVE_AVX2 equ 1
+HAVE_DSPR2 equ 0
+HAVE_FEXCEPT equ 1
+HAVE_MIPS32 equ 0
+HAVE_MIPS64 equ 0
+HAVE_MMX equ 1
+HAVE_MSA equ 0
+HAVE_NEON equ 0
+HAVE_SSE equ 1
+HAVE_SSE2 equ 1
+HAVE_SSE3 equ 1
+HAVE_SSE4_1 equ 1
+HAVE_SSE4_2 equ 1
+HAVE_SSSE3 equ 1
+HAVE_VSX equ 0
+HAVE_WXWIDGETS equ 0
diff --git a/media/libaom/config/win/x64/config/aom_config.h b/media/libaom/config/win/x64/config/aom_config.h
new file mode 100644
index 000000000..9a0be7cd1
--- /dev/null
+++ b/media/libaom/config/win/x64/config/aom_config.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_CONFIG_H_
+#define AOM_CONFIG_H_
+
+#define ARCH_ARM 0
+#define ARCH_MIPS 0
+#define ARCH_PPC 0
+#define ARCH_X86 0
+#define ARCH_X86_64 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_ACCOUNTING 0
+#define CONFIG_ANALYZER 0
+#define CONFIG_AV1_DECODER 1
+#define CONFIG_AV1_ENCODER 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_COLLECT_INTER_MODE_RD_STATS 0
+#define CONFIG_COLLECT_RD_STATS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_DENOISE 1
+#define CONFIG_DIST_8X8 0
+#define CONFIG_ENTROPY_STATS 0
+#define CONFIG_FILEOPTIONS 1
+#define CONFIG_FIX_GF_LENGTH 1
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_GCC 1
+#define CONFIG_GCOV 0
+#define CONFIG_GLOBAL_MOTION_SEARCH 1
+#define CONFIG_GPROF 0
+#define CONFIG_INSPECTION 0
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_INTER_STATS_ONLY 0
+#define CONFIG_LIBYUV 0
+#define CONFIG_LOWBITDEPTH 1
+#define CONFIG_MAX_DECODE_PROFILE 2
+#define CONFIG_MISMATCH_DEBUG 0
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_NORMAL_TILE_MODE 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_PIC 0
+#define CONFIG_RD_DEBUG 0
+#define CONFIG_REDUCED_ENCODER_BORDER 0
+#define CONFIG_RUNTIME_CPU_DETECT 1
+#define CONFIG_SHARED 0
+#define CONFIG_SHARP_SETTINGS 0
+#define CONFIG_SIZE_LIMIT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_STATIC 1
+#define CONFIG_WEBM_IO 0
+#define DECODE_HEIGHT_LIMIT 0
+#define DECODE_WIDTH_LIMIT 0
+#define HAVE_AVX 1
+#define HAVE_AVX2 1
+#define HAVE_DSPR2 0
+#define HAVE_FEXCEPT 1
+#define HAVE_MIPS32 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 1
+#define HAVE_MSA 0
+#define HAVE_NEON 0
+#define HAVE_SSE 1
+#define HAVE_SSE2 1
+#define HAVE_SSE3 1
+#define HAVE_SSE4_1 1
+#define HAVE_SSE4_2 1
+#define HAVE_SSSE3 1
+#define HAVE_VSX 0
+#define HAVE_WXWIDGETS 0
+#define INCLUDE_INSTALL_DIR INSTALLDIR/include
+#define INLINE inline
+#define LIB_INSTALL_DIR INSTALLDIR/lib
+#endif /* AOM_CONFIG_H_ */
diff --git a/media/libaom/config/win/x64/config/aom_dsp_rtcd.h b/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
new file mode 100644
index 000000000..2856d4ede
--- /dev/null
+++ b/media/libaom/config/win/x64/config/aom_dsp_rtcd.h
@@ -0,0 +1,2001 @@
+// This file is generated. Do not edit.
+#ifndef AOM_DSP_RTCD_H_
+#define AOM_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+RTCD_EXTERN void (*aom_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby);
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+RTCD_EXTERN void (*aom_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h);
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+RTCD_EXTERN void (*aom_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define aom_convolve_copy aom_convolve_copy_sse2
+
+void aom_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x16 aom_dc_128_predictor_16x16_sse2
+
+void aom_dc_128_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x32 aom_dc_128_predictor_16x32_sse2
+
+void aom_dc_128_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x4 aom_dc_128_predictor_16x4_sse2
+
+void aom_dc_128_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x64 aom_dc_128_predictor_16x64_sse2
+
+void aom_dc_128_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_16x8 aom_dc_128_predictor_16x8_sse2
+
+void aom_dc_128_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_2x2 aom_dc_128_predictor_2x2_c
+
+void aom_dc_128_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_32x8 aom_dc_128_predictor_32x8_sse2
+
+void aom_dc_128_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x16 aom_dc_128_predictor_4x16_sse2
+
+void aom_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x4 aom_dc_128_predictor_4x4_sse2
+
+void aom_dc_128_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_4x8 aom_dc_128_predictor_4x8_sse2
+
+void aom_dc_128_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_128_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_128_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x16 aom_dc_128_predictor_8x16_sse2
+
+void aom_dc_128_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x32 aom_dc_128_predictor_8x32_sse2
+
+void aom_dc_128_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x4 aom_dc_128_predictor_8x4_sse2
+
+void aom_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_128_predictor_8x8 aom_dc_128_predictor_8x8_sse2
+
+void aom_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x16 aom_dc_left_predictor_16x16_sse2
+
+void aom_dc_left_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x32 aom_dc_left_predictor_16x32_sse2
+
+void aom_dc_left_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x4 aom_dc_left_predictor_16x4_sse2
+
+void aom_dc_left_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x64 aom_dc_left_predictor_16x64_sse2
+
+void aom_dc_left_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_16x8 aom_dc_left_predictor_16x8_sse2
+
+void aom_dc_left_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_2x2 aom_dc_left_predictor_2x2_c
+
+void aom_dc_left_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_32x8 aom_dc_left_predictor_32x8_sse2
+
+void aom_dc_left_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x16 aom_dc_left_predictor_4x16_sse2
+
+void aom_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x4 aom_dc_left_predictor_4x4_sse2
+
+void aom_dc_left_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_4x8 aom_dc_left_predictor_4x8_sse2
+
+void aom_dc_left_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_left_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_left_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x16 aom_dc_left_predictor_8x16_sse2
+
+void aom_dc_left_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x32 aom_dc_left_predictor_8x32_sse2
+
+void aom_dc_left_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x4 aom_dc_left_predictor_8x4_sse2
+
+void aom_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_left_predictor_8x8 aom_dc_left_predictor_8x8_sse2
+
+void aom_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x16 aom_dc_predictor_16x16_sse2
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x32 aom_dc_predictor_16x32_sse2
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x4 aom_dc_predictor_16x4_sse2
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x64 aom_dc_predictor_16x64_sse2
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_16x8 aom_dc_predictor_16x8_sse2
+
+void aom_dc_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_2x2 aom_dc_predictor_2x2_c
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_32x8 aom_dc_predictor_32x8_sse2
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x16 aom_dc_predictor_4x16_sse2
+
+void aom_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x4 aom_dc_predictor_4x4_sse2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_4x8 aom_dc_predictor_4x8_sse2
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x16 aom_dc_predictor_8x16_sse2
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x32 aom_dc_predictor_8x32_sse2
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x4 aom_dc_predictor_8x4_sse2
+
+void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_predictor_8x8 aom_dc_predictor_8x8_sse2
+
+void aom_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x16 aom_dc_top_predictor_16x16_sse2
+
+void aom_dc_top_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x32 aom_dc_top_predictor_16x32_sse2
+
+void aom_dc_top_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x4 aom_dc_top_predictor_16x4_sse2
+
+void aom_dc_top_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x64 aom_dc_top_predictor_16x64_sse2
+
+void aom_dc_top_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_16x8 aom_dc_top_predictor_16x8_sse2
+
+void aom_dc_top_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_2x2 aom_dc_top_predictor_2x2_c
+
+void aom_dc_top_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_32x8 aom_dc_top_predictor_32x8_sse2
+
+void aom_dc_top_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x16 aom_dc_top_predictor_4x16_sse2
+
+void aom_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x4 aom_dc_top_predictor_4x4_sse2
+
+void aom_dc_top_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_4x8 aom_dc_top_predictor_4x8_sse2
+
+void aom_dc_top_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_dc_top_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_dc_top_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x16 aom_dc_top_predictor_8x16_sse2
+
+void aom_dc_top_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x32 aom_dc_top_predictor_8x32_sse2
+
+void aom_dc_top_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x4 aom_dc_top_predictor_8x4_sse2
+
+void aom_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_dc_top_predictor_8x8 aom_dc_top_predictor_8x8_sse2
+
+void aom_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x16 aom_h_predictor_16x16_sse2
+
+void aom_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x32 aom_h_predictor_16x32_sse2
+
+void aom_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x4 aom_h_predictor_16x4_sse2
+
+void aom_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x64 aom_h_predictor_16x64_sse2
+
+void aom_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_16x8 aom_h_predictor_16x8_sse2
+
+void aom_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_2x2 aom_h_predictor_2x2_c
+
+void aom_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x16 aom_h_predictor_32x16_sse2
+
+void aom_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x64 aom_h_predictor_32x64_sse2
+
+void aom_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_32x8 aom_h_predictor_32x8_sse2
+
+void aom_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x16 aom_h_predictor_4x16_sse2
+
+void aom_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x4 aom_h_predictor_4x4_sse2
+
+void aom_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_4x8 aom_h_predictor_4x8_sse2
+
+void aom_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x16 aom_h_predictor_64x16_sse2
+
+void aom_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x32 aom_h_predictor_64x32_sse2
+
+void aom_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_64x64 aom_h_predictor_64x64_sse2
+
+void aom_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x16 aom_h_predictor_8x16_sse2
+
+void aom_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x32 aom_h_predictor_8x32_sse2
+
+void aom_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x4 aom_h_predictor_8x4_sse2
+
+void aom_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_h_predictor_8x8 aom_h_predictor_8x8_sse2
+
+void aom_highbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd);
+#define aom_highbd_blend_a64_d16_mask aom_highbd_blend_a64_d16_mask_c
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_hmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_mask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd);
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+void aom_highbd_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+RTCD_EXTERN void (*aom_highbd_blend_a64_vmask)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd);
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_horiz)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve8_vert)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void aom_highbd_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+RTCD_EXTERN void (*aom_highbd_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+
+void aom_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x16 aom_highbd_dc_128_predictor_16x16_sse2
+
+void aom_highbd_dc_128_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x32 aom_highbd_dc_128_predictor_16x32_sse2
+
+void aom_highbd_dc_128_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x4 aom_highbd_dc_128_predictor_16x4_c
+
+void aom_highbd_dc_128_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x64 aom_highbd_dc_128_predictor_16x64_c
+
+void aom_highbd_dc_128_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_16x8 aom_highbd_dc_128_predictor_16x8_sse2
+
+void aom_highbd_dc_128_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_2x2 aom_highbd_dc_128_predictor_2x2_c
+
+void aom_highbd_dc_128_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x16 aom_highbd_dc_128_predictor_32x16_sse2
+
+void aom_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x32 aom_highbd_dc_128_predictor_32x32_sse2
+
+void aom_highbd_dc_128_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x64 aom_highbd_dc_128_predictor_32x64_c
+
+void aom_highbd_dc_128_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_32x8 aom_highbd_dc_128_predictor_32x8_c
+
+void aom_highbd_dc_128_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x16 aom_highbd_dc_128_predictor_4x16_c
+
+void aom_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x4 aom_highbd_dc_128_predictor_4x4_sse2
+
+void aom_highbd_dc_128_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_4x8 aom_highbd_dc_128_predictor_4x8_sse2
+
+void aom_highbd_dc_128_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x16 aom_highbd_dc_128_predictor_64x16_c
+
+void aom_highbd_dc_128_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x32 aom_highbd_dc_128_predictor_64x32_c
+
+void aom_highbd_dc_128_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_64x64 aom_highbd_dc_128_predictor_64x64_c
+
+void aom_highbd_dc_128_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x16 aom_highbd_dc_128_predictor_8x16_sse2
+
+void aom_highbd_dc_128_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x32 aom_highbd_dc_128_predictor_8x32_c
+
+void aom_highbd_dc_128_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x4 aom_highbd_dc_128_predictor_8x4_sse2
+
+void aom_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_128_predictor_8x8 aom_highbd_dc_128_predictor_8x8_sse2
+
+void aom_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x16 aom_highbd_dc_left_predictor_16x16_sse2
+
+void aom_highbd_dc_left_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x32 aom_highbd_dc_left_predictor_16x32_sse2
+
+void aom_highbd_dc_left_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x4 aom_highbd_dc_left_predictor_16x4_c
+
+void aom_highbd_dc_left_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x64 aom_highbd_dc_left_predictor_16x64_c
+
+void aom_highbd_dc_left_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_16x8 aom_highbd_dc_left_predictor_16x8_sse2
+
+void aom_highbd_dc_left_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_2x2 aom_highbd_dc_left_predictor_2x2_c
+
+void aom_highbd_dc_left_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x16 aom_highbd_dc_left_predictor_32x16_sse2
+
+void aom_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x32 aom_highbd_dc_left_predictor_32x32_sse2
+
+void aom_highbd_dc_left_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x64 aom_highbd_dc_left_predictor_32x64_c
+
+void aom_highbd_dc_left_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_32x8 aom_highbd_dc_left_predictor_32x8_c
+
+void aom_highbd_dc_left_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x16 aom_highbd_dc_left_predictor_4x16_c
+
+void aom_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x4 aom_highbd_dc_left_predictor_4x4_sse2
+
+void aom_highbd_dc_left_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_4x8 aom_highbd_dc_left_predictor_4x8_sse2
+
+void aom_highbd_dc_left_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x16 aom_highbd_dc_left_predictor_64x16_c
+
+void aom_highbd_dc_left_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x32 aom_highbd_dc_left_predictor_64x32_c
+
+void aom_highbd_dc_left_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_64x64 aom_highbd_dc_left_predictor_64x64_c
+
+void aom_highbd_dc_left_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x16 aom_highbd_dc_left_predictor_8x16_sse2
+
+void aom_highbd_dc_left_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x32 aom_highbd_dc_left_predictor_8x32_c
+
+void aom_highbd_dc_left_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x4 aom_highbd_dc_left_predictor_8x4_sse2
+
+void aom_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_left_predictor_8x8 aom_highbd_dc_left_predictor_8x8_sse2
+
+void aom_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x16 aom_highbd_dc_predictor_16x16_sse2
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x32 aom_highbd_dc_predictor_16x32_sse2
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x4 aom_highbd_dc_predictor_16x4_c
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x64 aom_highbd_dc_predictor_16x64_c
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_16x8 aom_highbd_dc_predictor_16x8_sse2
+
+void aom_highbd_dc_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_2x2 aom_highbd_dc_predictor_2x2_c
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x16 aom_highbd_dc_predictor_32x16_sse2
+
+void aom_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x32 aom_highbd_dc_predictor_32x32_sse2
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x64 aom_highbd_dc_predictor_32x64_c
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_32x8 aom_highbd_dc_predictor_32x8_c
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x16 aom_highbd_dc_predictor_4x16_c
+
+void aom_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x4 aom_highbd_dc_predictor_4x4_sse2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_4x8 aom_highbd_dc_predictor_4x8_sse2
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x16 aom_highbd_dc_predictor_64x16_c
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x32 aom_highbd_dc_predictor_64x32_c
+
+void aom_highbd_dc_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_64x64 aom_highbd_dc_predictor_64x64_c
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x16 aom_highbd_dc_predictor_8x16_sse2
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x32 aom_highbd_dc_predictor_8x32_c
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x4 aom_highbd_dc_predictor_8x4_sse2
+
+void aom_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_predictor_8x8 aom_highbd_dc_predictor_8x8_sse2
+
+void aom_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x16 aom_highbd_dc_top_predictor_16x16_sse2
+
+void aom_highbd_dc_top_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x32 aom_highbd_dc_top_predictor_16x32_sse2
+
+void aom_highbd_dc_top_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x4 aom_highbd_dc_top_predictor_16x4_c
+
+void aom_highbd_dc_top_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x64 aom_highbd_dc_top_predictor_16x64_c
+
+void aom_highbd_dc_top_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_16x8 aom_highbd_dc_top_predictor_16x8_sse2
+
+void aom_highbd_dc_top_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_2x2 aom_highbd_dc_top_predictor_2x2_c
+
+void aom_highbd_dc_top_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x16 aom_highbd_dc_top_predictor_32x16_sse2
+
+void aom_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x32 aom_highbd_dc_top_predictor_32x32_sse2
+
+void aom_highbd_dc_top_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x64 aom_highbd_dc_top_predictor_32x64_c
+
+void aom_highbd_dc_top_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_32x8 aom_highbd_dc_top_predictor_32x8_c
+
+void aom_highbd_dc_top_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x16 aom_highbd_dc_top_predictor_4x16_c
+
+void aom_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x4 aom_highbd_dc_top_predictor_4x4_sse2
+
+void aom_highbd_dc_top_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_4x8 aom_highbd_dc_top_predictor_4x8_sse2
+
+void aom_highbd_dc_top_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x16 aom_highbd_dc_top_predictor_64x16_c
+
+void aom_highbd_dc_top_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x32 aom_highbd_dc_top_predictor_64x32_c
+
+void aom_highbd_dc_top_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_64x64 aom_highbd_dc_top_predictor_64x64_c
+
+void aom_highbd_dc_top_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x16 aom_highbd_dc_top_predictor_8x16_sse2
+
+void aom_highbd_dc_top_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x32 aom_highbd_dc_top_predictor_8x32_c
+
+void aom_highbd_dc_top_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x4 aom_highbd_dc_top_predictor_8x4_sse2
+
+void aom_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_dc_top_predictor_8x8 aom_highbd_dc_top_predictor_8x8_sse2
+
+void aom_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x16 aom_highbd_h_predictor_16x16_sse2
+
+void aom_highbd_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x32 aom_highbd_h_predictor_16x32_sse2
+
+void aom_highbd_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x4 aom_highbd_h_predictor_16x4_c
+
+void aom_highbd_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x64 aom_highbd_h_predictor_16x64_c
+
+void aom_highbd_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_16x8 aom_highbd_h_predictor_16x8_sse2
+
+void aom_highbd_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_2x2 aom_highbd_h_predictor_2x2_c
+
+void aom_highbd_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x16 aom_highbd_h_predictor_32x16_sse2
+
+void aom_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x32 aom_highbd_h_predictor_32x32_sse2
+
+void aom_highbd_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x64 aom_highbd_h_predictor_32x64_c
+
+void aom_highbd_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_32x8 aom_highbd_h_predictor_32x8_c
+
+void aom_highbd_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x16 aom_highbd_h_predictor_4x16_c
+
+void aom_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x4 aom_highbd_h_predictor_4x4_sse2
+
+void aom_highbd_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_4x8 aom_highbd_h_predictor_4x8_sse2
+
+void aom_highbd_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x16 aom_highbd_h_predictor_64x16_c
+
+void aom_highbd_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x32 aom_highbd_h_predictor_64x32_c
+
+void aom_highbd_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_64x64 aom_highbd_h_predictor_64x64_c
+
+void aom_highbd_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x16 aom_highbd_h_predictor_8x16_sse2
+
+void aom_highbd_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x32 aom_highbd_h_predictor_8x32_c
+
+void aom_highbd_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x4 aom_highbd_h_predictor_8x4_sse2
+
+void aom_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_h_predictor_8x8 aom_highbd_h_predictor_8x8_sse2
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_14 aom_highbd_lpf_horizontal_14_sse2
+
+void aom_highbd_lpf_horizontal_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_4 aom_highbd_lpf_horizontal_4_sse2
+
+void aom_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_6 aom_highbd_lpf_horizontal_6_sse2
+
+void aom_highbd_lpf_horizontal_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_horizontal_6_dual aom_highbd_lpf_horizontal_6_dual_sse2
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_horizontal_8 aom_highbd_lpf_horizontal_8_sse2
+
+void aom_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_horizontal_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_horizontal_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_14 aom_highbd_lpf_vertical_14_sse2
+
+void aom_highbd_lpf_vertical_14_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_14_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_14_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_4 aom_highbd_lpf_vertical_4_sse2
+
+void aom_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_4_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_4_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_6 aom_highbd_lpf_vertical_6_sse2
+
+void aom_highbd_lpf_vertical_6_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_6_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define aom_highbd_lpf_vertical_6_dual aom_highbd_lpf_vertical_6_dual_sse2
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define aom_highbd_lpf_vertical_8 aom_highbd_lpf_vertical_8_sse2
+
+void aom_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void aom_highbd_lpf_vertical_8_dual_avx2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+RTCD_EXTERN void (*aom_highbd_lpf_vertical_8_dual)(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+
+void aom_highbd_paeth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x16 aom_highbd_paeth_predictor_16x16_c
+
+void aom_highbd_paeth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x32 aom_highbd_paeth_predictor_16x32_c
+
+void aom_highbd_paeth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x4 aom_highbd_paeth_predictor_16x4_c
+
+void aom_highbd_paeth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x64 aom_highbd_paeth_predictor_16x64_c
+
+void aom_highbd_paeth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_16x8 aom_highbd_paeth_predictor_16x8_c
+
+void aom_highbd_paeth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_2x2 aom_highbd_paeth_predictor_2x2_c
+
+void aom_highbd_paeth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x16 aom_highbd_paeth_predictor_32x16_c
+
+void aom_highbd_paeth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x32 aom_highbd_paeth_predictor_32x32_c
+
+void aom_highbd_paeth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x64 aom_highbd_paeth_predictor_32x64_c
+
+void aom_highbd_paeth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_32x8 aom_highbd_paeth_predictor_32x8_c
+
+void aom_highbd_paeth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x16 aom_highbd_paeth_predictor_4x16_c
+
+void aom_highbd_paeth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x4 aom_highbd_paeth_predictor_4x4_c
+
+void aom_highbd_paeth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_4x8 aom_highbd_paeth_predictor_4x8_c
+
+void aom_highbd_paeth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x16 aom_highbd_paeth_predictor_64x16_c
+
+void aom_highbd_paeth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x32 aom_highbd_paeth_predictor_64x32_c
+
+void aom_highbd_paeth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_64x64 aom_highbd_paeth_predictor_64x64_c
+
+void aom_highbd_paeth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x16 aom_highbd_paeth_predictor_8x16_c
+
+void aom_highbd_paeth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x32 aom_highbd_paeth_predictor_8x32_c
+
+void aom_highbd_paeth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x4 aom_highbd_paeth_predictor_8x4_c
+
+void aom_highbd_paeth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_paeth_predictor_8x8 aom_highbd_paeth_predictor_8x8_c
+
+void aom_highbd_smooth_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x16 aom_highbd_smooth_h_predictor_16x16_c
+
+void aom_highbd_smooth_h_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x32 aom_highbd_smooth_h_predictor_16x32_c
+
+void aom_highbd_smooth_h_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x4 aom_highbd_smooth_h_predictor_16x4_c
+
+void aom_highbd_smooth_h_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x64 aom_highbd_smooth_h_predictor_16x64_c
+
+void aom_highbd_smooth_h_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_16x8 aom_highbd_smooth_h_predictor_16x8_c
+
+void aom_highbd_smooth_h_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_2x2 aom_highbd_smooth_h_predictor_2x2_c
+
+void aom_highbd_smooth_h_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x16 aom_highbd_smooth_h_predictor_32x16_c
+
+void aom_highbd_smooth_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x32 aom_highbd_smooth_h_predictor_32x32_c
+
+void aom_highbd_smooth_h_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x64 aom_highbd_smooth_h_predictor_32x64_c
+
+void aom_highbd_smooth_h_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_32x8 aom_highbd_smooth_h_predictor_32x8_c
+
+void aom_highbd_smooth_h_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x16 aom_highbd_smooth_h_predictor_4x16_c
+
+void aom_highbd_smooth_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x4 aom_highbd_smooth_h_predictor_4x4_c
+
+void aom_highbd_smooth_h_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_4x8 aom_highbd_smooth_h_predictor_4x8_c
+
+void aom_highbd_smooth_h_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x16 aom_highbd_smooth_h_predictor_64x16_c
+
+void aom_highbd_smooth_h_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x32 aom_highbd_smooth_h_predictor_64x32_c
+
+void aom_highbd_smooth_h_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_64x64 aom_highbd_smooth_h_predictor_64x64_c
+
+void aom_highbd_smooth_h_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x16 aom_highbd_smooth_h_predictor_8x16_c
+
+void aom_highbd_smooth_h_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x32 aom_highbd_smooth_h_predictor_8x32_c
+
+void aom_highbd_smooth_h_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x4 aom_highbd_smooth_h_predictor_8x4_c
+
+void aom_highbd_smooth_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_h_predictor_8x8 aom_highbd_smooth_h_predictor_8x8_c
+
+void aom_highbd_smooth_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x16 aom_highbd_smooth_predictor_16x16_c
+
+void aom_highbd_smooth_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x32 aom_highbd_smooth_predictor_16x32_c
+
+void aom_highbd_smooth_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x4 aom_highbd_smooth_predictor_16x4_c
+
+void aom_highbd_smooth_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x64 aom_highbd_smooth_predictor_16x64_c
+
+void aom_highbd_smooth_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_16x8 aom_highbd_smooth_predictor_16x8_c
+
+void aom_highbd_smooth_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_2x2 aom_highbd_smooth_predictor_2x2_c
+
+void aom_highbd_smooth_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x16 aom_highbd_smooth_predictor_32x16_c
+
+void aom_highbd_smooth_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x32 aom_highbd_smooth_predictor_32x32_c
+
+void aom_highbd_smooth_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x64 aom_highbd_smooth_predictor_32x64_c
+
+void aom_highbd_smooth_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_32x8 aom_highbd_smooth_predictor_32x8_c
+
+void aom_highbd_smooth_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x16 aom_highbd_smooth_predictor_4x16_c
+
+void aom_highbd_smooth_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x4 aom_highbd_smooth_predictor_4x4_c
+
+void aom_highbd_smooth_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_4x8 aom_highbd_smooth_predictor_4x8_c
+
+void aom_highbd_smooth_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x16 aom_highbd_smooth_predictor_64x16_c
+
+void aom_highbd_smooth_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x32 aom_highbd_smooth_predictor_64x32_c
+
+void aom_highbd_smooth_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_64x64 aom_highbd_smooth_predictor_64x64_c
+
+void aom_highbd_smooth_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x16 aom_highbd_smooth_predictor_8x16_c
+
+void aom_highbd_smooth_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x32 aom_highbd_smooth_predictor_8x32_c
+
+void aom_highbd_smooth_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x4 aom_highbd_smooth_predictor_8x4_c
+
+void aom_highbd_smooth_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_predictor_8x8 aom_highbd_smooth_predictor_8x8_c
+
+void aom_highbd_smooth_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x16 aom_highbd_smooth_v_predictor_16x16_c
+
+void aom_highbd_smooth_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x32 aom_highbd_smooth_v_predictor_16x32_c
+
+void aom_highbd_smooth_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x4 aom_highbd_smooth_v_predictor_16x4_c
+
+void aom_highbd_smooth_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x64 aom_highbd_smooth_v_predictor_16x64_c
+
+void aom_highbd_smooth_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_16x8 aom_highbd_smooth_v_predictor_16x8_c
+
+void aom_highbd_smooth_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_2x2 aom_highbd_smooth_v_predictor_2x2_c
+
+void aom_highbd_smooth_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x16 aom_highbd_smooth_v_predictor_32x16_c
+
+void aom_highbd_smooth_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x32 aom_highbd_smooth_v_predictor_32x32_c
+
+void aom_highbd_smooth_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x64 aom_highbd_smooth_v_predictor_32x64_c
+
+void aom_highbd_smooth_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_32x8 aom_highbd_smooth_v_predictor_32x8_c
+
+void aom_highbd_smooth_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x16 aom_highbd_smooth_v_predictor_4x16_c
+
+void aom_highbd_smooth_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x4 aom_highbd_smooth_v_predictor_4x4_c
+
+void aom_highbd_smooth_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_4x8 aom_highbd_smooth_v_predictor_4x8_c
+
+void aom_highbd_smooth_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x16 aom_highbd_smooth_v_predictor_64x16_c
+
+void aom_highbd_smooth_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x32 aom_highbd_smooth_v_predictor_64x32_c
+
+void aom_highbd_smooth_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_64x64 aom_highbd_smooth_v_predictor_64x64_c
+
+void aom_highbd_smooth_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x16 aom_highbd_smooth_v_predictor_8x16_c
+
+void aom_highbd_smooth_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x32 aom_highbd_smooth_v_predictor_8x32_c
+
+void aom_highbd_smooth_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x4 aom_highbd_smooth_v_predictor_8x4_c
+
+void aom_highbd_smooth_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_smooth_v_predictor_8x8 aom_highbd_smooth_v_predictor_8x8_c
+
+void aom_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x16 aom_highbd_v_predictor_16x16_sse2
+
+void aom_highbd_v_predictor_16x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x32 aom_highbd_v_predictor_16x32_sse2
+
+void aom_highbd_v_predictor_16x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x4 aom_highbd_v_predictor_16x4_c
+
+void aom_highbd_v_predictor_16x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x64 aom_highbd_v_predictor_16x64_c
+
+void aom_highbd_v_predictor_16x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_16x8 aom_highbd_v_predictor_16x8_sse2
+
+void aom_highbd_v_predictor_2x2_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_2x2 aom_highbd_v_predictor_2x2_c
+
+void aom_highbd_v_predictor_32x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x16 aom_highbd_v_predictor_32x16_sse2
+
+void aom_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x32 aom_highbd_v_predictor_32x32_sse2
+
+void aom_highbd_v_predictor_32x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x64 aom_highbd_v_predictor_32x64_c
+
+void aom_highbd_v_predictor_32x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_32x8 aom_highbd_v_predictor_32x8_c
+
+void aom_highbd_v_predictor_4x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x16 aom_highbd_v_predictor_4x16_c
+
+void aom_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x4 aom_highbd_v_predictor_4x4_sse2
+
+void aom_highbd_v_predictor_4x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_4x8 aom_highbd_v_predictor_4x8_sse2
+
+void aom_highbd_v_predictor_64x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x16 aom_highbd_v_predictor_64x16_c
+
+void aom_highbd_v_predictor_64x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x32 aom_highbd_v_predictor_64x32_c
+
+void aom_highbd_v_predictor_64x64_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_64x64 aom_highbd_v_predictor_64x64_c
+
+void aom_highbd_v_predictor_8x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x16 aom_highbd_v_predictor_8x16_sse2
+
+void aom_highbd_v_predictor_8x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x32 aom_highbd_v_predictor_8x32_c
+
+void aom_highbd_v_predictor_8x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x4 aom_highbd_v_predictor_8x4_sse2
+
+void aom_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void aom_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define aom_highbd_v_predictor_8x8 aom_highbd_v_predictor_8x8_sse2
+
+void aom_lowbd_blend_a64_d16_mask_c(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+void aom_lowbd_blend_a64_d16_mask_avx2(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+RTCD_EXTERN void (*aom_lowbd_blend_a64_d16_mask)(uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params);
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_14 aom_lpf_horizontal_14_sse2
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_14_dual aom_lpf_horizontal_14_dual_sse2
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_4 aom_lpf_horizontal_4_sse2
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_4_dual aom_lpf_horizontal_4_dual_sse2
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_6 aom_lpf_horizontal_6_sse2
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_6_dual aom_lpf_horizontal_6_dual_sse2
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_horizontal_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_horizontal_8 aom_lpf_horizontal_8_sse2
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_horizontal_8_dual aom_lpf_horizontal_8_dual_sse2
+
+void aom_lpf_vertical_14_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_14_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_14 aom_lpf_vertical_14_sse2
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_14_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_14_dual aom_lpf_vertical_14_dual_sse2
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_4 aom_lpf_vertical_4_sse2
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_4_dual aom_lpf_vertical_4_dual_sse2
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_6_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_6 aom_lpf_vertical_6_sse2
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_6_dual aom_lpf_vertical_6_dual_sse2
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void aom_lpf_vertical_8_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define aom_lpf_vertical_8 aom_lpf_vertical_8_sse2
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define aom_lpf_vertical_8_dual aom_lpf_vertical_8_dual_sse2
+
+void aom_paeth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_paeth_predictor_2x2 aom_paeth_predictor_2x2_c
+
+void aom_paeth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_paeth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_paeth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_h_predictor_2x2 aom_smooth_h_predictor_2x2_c
+
+void aom_smooth_h_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_predictor_2x2 aom_smooth_predictor_2x2_c
+
+void aom_smooth_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_16x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_smooth_v_predictor_2x2 aom_smooth_v_predictor_2x2_c
+
+void aom_smooth_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_32x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_4x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_smooth_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_smooth_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x16 aom_v_predictor_16x16_sse2
+
+void aom_v_predictor_16x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x32 aom_v_predictor_16x32_sse2
+
+void aom_v_predictor_16x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x4 aom_v_predictor_16x4_sse2
+
+void aom_v_predictor_16x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x64 aom_v_predictor_16x64_sse2
+
+void aom_v_predictor_16x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_16x8 aom_v_predictor_16x8_sse2
+
+void aom_v_predictor_2x2_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_2x2 aom_v_predictor_2x2_c
+
+void aom_v_predictor_32x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_32x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_32x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_32x8 aom_v_predictor_32x8_sse2
+
+void aom_v_predictor_4x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x16 aom_v_predictor_4x16_sse2
+
+void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x4 aom_v_predictor_4x4_sse2
+
+void aom_v_predictor_4x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_4x8 aom_v_predictor_4x8_sse2
+
+void aom_v_predictor_64x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_64x64_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*aom_v_predictor_64x64)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+
+void aom_v_predictor_8x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x16 aom_v_predictor_8x16_sse2
+
+void aom_v_predictor_8x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x32 aom_v_predictor_8x32_sse2
+
+void aom_v_predictor_8x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x4 aom_v_predictor_8x4_sse2
+
+void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit);
+RTCD_EXTERN void (*av1_round_shift_array)(int32_t *arr, int size, int bit);
+
+void aom_dsp_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ aom_blend_a64_hmask = aom_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_hmask = aom_blend_a64_hmask_sse4_1;
+ aom_blend_a64_mask = aom_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_mask = aom_blend_a64_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_blend_a64_mask = aom_blend_a64_mask_avx2;
+ aom_blend_a64_vmask = aom_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_blend_a64_vmask = aom_blend_a64_vmask_sse4_1;
+ aom_convolve8_horiz = aom_convolve8_horiz_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_horiz = aom_convolve8_horiz_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_horiz = aom_convolve8_horiz_avx2;
+ aom_convolve8_vert = aom_convolve8_vert_sse2;
+ if (flags & HAS_SSSE3) aom_convolve8_vert = aom_convolve8_vert_ssse3;
+ if (flags & HAS_AVX2) aom_convolve8_vert = aom_convolve8_vert_avx2;
+ aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x16 = aom_dc_128_predictor_32x16_avx2;
+ aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x32 = aom_dc_128_predictor_32x32_avx2;
+ aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_32x64 = aom_dc_128_predictor_32x64_avx2;
+ aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x16 = aom_dc_128_predictor_64x16_avx2;
+ aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x32 = aom_dc_128_predictor_64x32_avx2;
+ aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_128_predictor_64x64 = aom_dc_128_predictor_64x64_avx2;
+ aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x16 = aom_dc_left_predictor_32x16_avx2;
+ aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x32 = aom_dc_left_predictor_32x32_avx2;
+ aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_32x64 = aom_dc_left_predictor_32x64_avx2;
+ aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x16 = aom_dc_left_predictor_64x16_avx2;
+ aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x32 = aom_dc_left_predictor_64x32_avx2;
+ aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_left_predictor_64x64 = aom_dc_left_predictor_64x64_avx2;
+ aom_dc_predictor_32x16 = aom_dc_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x16 = aom_dc_predictor_32x16_avx2;
+ aom_dc_predictor_32x32 = aom_dc_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x32 = aom_dc_predictor_32x32_avx2;
+ aom_dc_predictor_32x64 = aom_dc_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_32x64 = aom_dc_predictor_32x64_avx2;
+ aom_dc_predictor_64x16 = aom_dc_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x16 = aom_dc_predictor_64x16_avx2;
+ aom_dc_predictor_64x32 = aom_dc_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x32 = aom_dc_predictor_64x32_avx2;
+ aom_dc_predictor_64x64 = aom_dc_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_predictor_64x64 = aom_dc_predictor_64x64_avx2;
+ aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x16 = aom_dc_top_predictor_32x16_avx2;
+ aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x32 = aom_dc_top_predictor_32x32_avx2;
+ aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_32x64 = aom_dc_top_predictor_32x64_avx2;
+ aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x16 = aom_dc_top_predictor_64x16_avx2;
+ aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x32 = aom_dc_top_predictor_64x32_avx2;
+ aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_dc_top_predictor_64x64 = aom_dc_top_predictor_64x64_avx2;
+ aom_h_predictor_32x32 = aom_h_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_h_predictor_32x32 = aom_h_predictor_32x32_avx2;
+ aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_hmask = aom_highbd_blend_a64_hmask_sse4_1;
+ aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_mask = aom_highbd_blend_a64_mask_sse4_1;
+ aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_c;
+ if (flags & HAS_SSE4_1) aom_highbd_blend_a64_vmask = aom_highbd_blend_a64_vmask_sse4_1;
+ aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_horiz = aom_highbd_convolve8_horiz_avx2;
+ aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve8_vert = aom_highbd_convolve8_vert_avx2;
+ aom_highbd_convolve_copy = aom_highbd_convolve_copy_sse2;
+ if (flags & HAS_AVX2) aom_highbd_convolve_copy = aom_highbd_convolve_copy_avx2;
+ aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_14_dual = aom_highbd_lpf_horizontal_14_dual_avx2;
+ aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_4_dual = aom_highbd_lpf_horizontal_4_dual_avx2;
+ aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_horizontal_8_dual = aom_highbd_lpf_horizontal_8_dual_avx2;
+ aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_14_dual = aom_highbd_lpf_vertical_14_dual_avx2;
+ aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_4_dual = aom_highbd_lpf_vertical_4_dual_avx2;
+ aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_sse2;
+ if (flags & HAS_AVX2) aom_highbd_lpf_vertical_8_dual = aom_highbd_lpf_vertical_8_dual_avx2;
+ aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_c;
+ if (flags & HAS_SSE4_1) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_sse4_1;
+ if (flags & HAS_AVX2) aom_lowbd_blend_a64_d16_mask = aom_lowbd_blend_a64_d16_mask_avx2;
+ aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x16 = aom_paeth_predictor_16x16_avx2;
+ aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x32 = aom_paeth_predictor_16x32_avx2;
+ aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x4 = aom_paeth_predictor_16x4_ssse3;
+ aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x64 = aom_paeth_predictor_16x64_avx2;
+ aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_16x8 = aom_paeth_predictor_16x8_avx2;
+ aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x16 = aom_paeth_predictor_32x16_avx2;
+ aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x32 = aom_paeth_predictor_32x32_avx2;
+ aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_32x64 = aom_paeth_predictor_32x64_avx2;
+ aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_32x8 = aom_paeth_predictor_32x8_ssse3;
+ aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x16 = aom_paeth_predictor_4x16_ssse3;
+ aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x4 = aom_paeth_predictor_4x4_ssse3;
+ aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_4x8 = aom_paeth_predictor_4x8_ssse3;
+ aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x16 = aom_paeth_predictor_64x16_avx2;
+ aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x32 = aom_paeth_predictor_64x32_avx2;
+ aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_ssse3;
+ if (flags & HAS_AVX2) aom_paeth_predictor_64x64 = aom_paeth_predictor_64x64_avx2;
+ aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x16 = aom_paeth_predictor_8x16_ssse3;
+ aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x32 = aom_paeth_predictor_8x32_ssse3;
+ aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x4 = aom_paeth_predictor_8x4_ssse3;
+ aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_paeth_predictor_8x8 = aom_paeth_predictor_8x8_ssse3;
+ aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x16 = aom_smooth_h_predictor_16x16_ssse3;
+ aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x32 = aom_smooth_h_predictor_16x32_ssse3;
+ aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x4 = aom_smooth_h_predictor_16x4_ssse3;
+ aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x64 = aom_smooth_h_predictor_16x64_ssse3;
+ aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_16x8 = aom_smooth_h_predictor_16x8_ssse3;
+ aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x16 = aom_smooth_h_predictor_32x16_ssse3;
+ aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x32 = aom_smooth_h_predictor_32x32_ssse3;
+ aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x64 = aom_smooth_h_predictor_32x64_ssse3;
+ aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_32x8 = aom_smooth_h_predictor_32x8_ssse3;
+ aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x16 = aom_smooth_h_predictor_4x16_ssse3;
+ aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x4 = aom_smooth_h_predictor_4x4_ssse3;
+ aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_4x8 = aom_smooth_h_predictor_4x8_ssse3;
+ aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x16 = aom_smooth_h_predictor_64x16_ssse3;
+ aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x32 = aom_smooth_h_predictor_64x32_ssse3;
+ aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_64x64 = aom_smooth_h_predictor_64x64_ssse3;
+ aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x16 = aom_smooth_h_predictor_8x16_ssse3;
+ aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x32 = aom_smooth_h_predictor_8x32_ssse3;
+ aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x4 = aom_smooth_h_predictor_8x4_ssse3;
+ aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_h_predictor_8x8 = aom_smooth_h_predictor_8x8_ssse3;
+ aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x16 = aom_smooth_predictor_16x16_ssse3;
+ aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x32 = aom_smooth_predictor_16x32_ssse3;
+ aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x4 = aom_smooth_predictor_16x4_ssse3;
+ aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x64 = aom_smooth_predictor_16x64_ssse3;
+ aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_16x8 = aom_smooth_predictor_16x8_ssse3;
+ aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x16 = aom_smooth_predictor_32x16_ssse3;
+ aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x32 = aom_smooth_predictor_32x32_ssse3;
+ aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x64 = aom_smooth_predictor_32x64_ssse3;
+ aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_32x8 = aom_smooth_predictor_32x8_ssse3;
+ aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x16 = aom_smooth_predictor_4x16_ssse3;
+ aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x4 = aom_smooth_predictor_4x4_ssse3;
+ aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_4x8 = aom_smooth_predictor_4x8_ssse3;
+ aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x16 = aom_smooth_predictor_64x16_ssse3;
+ aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x32 = aom_smooth_predictor_64x32_ssse3;
+ aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_64x64 = aom_smooth_predictor_64x64_ssse3;
+ aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x16 = aom_smooth_predictor_8x16_ssse3;
+ aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x32 = aom_smooth_predictor_8x32_ssse3;
+ aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x4 = aom_smooth_predictor_8x4_ssse3;
+ aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_predictor_8x8 = aom_smooth_predictor_8x8_ssse3;
+ aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x16 = aom_smooth_v_predictor_16x16_ssse3;
+ aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x32 = aom_smooth_v_predictor_16x32_ssse3;
+ aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x4 = aom_smooth_v_predictor_16x4_ssse3;
+ aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x64 = aom_smooth_v_predictor_16x64_ssse3;
+ aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_16x8 = aom_smooth_v_predictor_16x8_ssse3;
+ aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x16 = aom_smooth_v_predictor_32x16_ssse3;
+ aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x32 = aom_smooth_v_predictor_32x32_ssse3;
+ aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x64 = aom_smooth_v_predictor_32x64_ssse3;
+ aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_32x8 = aom_smooth_v_predictor_32x8_ssse3;
+ aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x16 = aom_smooth_v_predictor_4x16_ssse3;
+ aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x4 = aom_smooth_v_predictor_4x4_ssse3;
+ aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_4x8 = aom_smooth_v_predictor_4x8_ssse3;
+ aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x16 = aom_smooth_v_predictor_64x16_ssse3;
+ aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x32 = aom_smooth_v_predictor_64x32_ssse3;
+ aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_64x64 = aom_smooth_v_predictor_64x64_ssse3;
+ aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x16 = aom_smooth_v_predictor_8x16_ssse3;
+ aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x32 = aom_smooth_v_predictor_8x32_ssse3;
+ aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x4 = aom_smooth_v_predictor_8x4_ssse3;
+ aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_c;
+ if (flags & HAS_SSSE3) aom_smooth_v_predictor_8x8 = aom_smooth_v_predictor_8x8_ssse3;
+ aom_v_predictor_32x16 = aom_v_predictor_32x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x16 = aom_v_predictor_32x16_avx2;
+ aom_v_predictor_32x32 = aom_v_predictor_32x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x32 = aom_v_predictor_32x32_avx2;
+ aom_v_predictor_32x64 = aom_v_predictor_32x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_32x64 = aom_v_predictor_32x64_avx2;
+ aom_v_predictor_64x16 = aom_v_predictor_64x16_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x16 = aom_v_predictor_64x16_avx2;
+ aom_v_predictor_64x32 = aom_v_predictor_64x32_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x32 = aom_v_predictor_64x32_avx2;
+ aom_v_predictor_64x64 = aom_v_predictor_64x64_sse2;
+ if (flags & HAS_AVX2) aom_v_predictor_64x64 = aom_v_predictor_64x64_avx2;
+ av1_round_shift_array = av1_round_shift_array_c;
+ if (flags & HAS_SSE4_1) av1_round_shift_array = av1_round_shift_array_sse4_1;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
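
Note: the generated header above follows libaom's RTCD (run-time CPU detection) pattern: each kernel is declared once per instruction set (_c, _sse2, _ssse3, _avx2), RTCD_EXTERN exposes a function pointer, and setup_rtcd_internal() queries x86_simd_caps() and upgrades the pointer from the baseline upward so the widest available ISA wins. The sketch below is only an illustration of that idea under hypothetical names (my_op, my_simd_caps, MY_HAS_* flags); it is not part of the diff or of libaom's API.

/* Illustrative sketch of the RTCD dispatch pattern used in the header above.
 * All names here are hypothetical; only the structure mirrors the generated code. */
#include <stdint.h>

#define MY_HAS_SSSE3 0x1
#define MY_HAS_AVX2  0x2

/* Pretend CPU probe; libaom's headers use x86_simd_caps() from aom_ports/x86.h. */
static int my_simd_caps(void) { return MY_HAS_SSSE3; }

static void my_op_c(uint8_t *dst)     { dst[0] = 1; } /* portable baseline */
static void my_op_ssse3(uint8_t *dst) { dst[0] = 2; } /* faster variant    */
static void my_op_avx2(uint8_t *dst)  { dst[0] = 3; } /* fastest variant   */

/* The RTCD_EXTERN-style pointer: callers always go through this. */
static void (*my_op)(uint8_t *dst);

/* Mirrors setup_rtcd_internal(): start at the baseline, then upgrade in
 * order so the last matching flag (the widest ISA) is the one kept. */
static void my_setup_rtcd(void) {
  int flags = my_simd_caps();
  my_op = my_op_c;
  if (flags & MY_HAS_SSSE3) my_op = my_op_ssse3;
  if (flags & MY_HAS_AVX2)  my_op = my_op_avx2;
}

int main(void) {
  uint8_t px = 0;
  my_setup_rtcd();
  my_op(&px); /* dispatches to the SSSE3 variant on this pretend CPU */
  return px == 2 ? 0 : 1;
}
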
diff --git a/media/libaom/config/win/x64/config/aom_scale_rtcd.h b/media/libaom/config/win/x64/config/aom_scale_rtcd.h
new file mode 100644
index 000000000..6b56795cd
--- /dev/null
+++ b/media/libaom/config/win/x64/config/aom_scale_rtcd.h
@@ -0,0 +1,88 @@
+// This file is generated. Do not edit.
+#ifndef AOM_SCALE_RTCD_H_
+#define AOM_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void aom_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_borders aom_extend_frame_borders_c
+
+void aom_extend_frame_borders_y_c(struct yv12_buffer_config *ybf);
+#define aom_extend_frame_borders_y aom_extend_frame_borders_y_c
+
+void aom_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_extend_frame_inner_borders aom_extend_frame_inner_borders_c
+
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_2_1_scale aom_horizontal_line_2_1_scale_c
+
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_3_scale aom_horizontal_line_5_3_scale_c
+
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define aom_horizontal_line_5_4_scale aom_horizontal_line_5_4_scale_c
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale aom_vertical_band_2_1_scale_c
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_2_1_scale_i aom_vertical_band_2_1_scale_i_c
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_3_scale aom_vertical_band_5_3_scale_c
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width);
+#define aom_vertical_band_5_4_scale aom_vertical_band_5_4_scale_c
+
+void aom_yv12_copy_frame_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes);
+#define aom_yv12_copy_frame aom_yv12_copy_frame_c
+
+void aom_yv12_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_u aom_yv12_copy_u_c
+
+void aom_yv12_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc);
+#define aom_yv12_copy_v aom_yv12_copy_v_c
+
+void aom_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define aom_yv12_copy_y aom_yv12_copy_y_c
+
+void aom_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf, const int num_planes);
+#define aom_yv12_extend_frame_borders aom_yv12_extend_frame_borders_c
+
+void aom_yv12_partial_copy_u_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_u aom_yv12_partial_copy_u_c
+
+void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_v aom_yv12_partial_copy_v_c
+
+void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend);
+#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+
+void aom_scale_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
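
Note: unlike aom_dsp_rtcd.h, the aom_scale_rtcd.h generated above carries no SIMD specializations for this target, so every public name is a plain #define alias of its _c implementation and the generated setup_rtcd_internal() body is empty apart from the (void)flags cast. A hedged, self-contained sketch of what that alias style means for callers follows; the names are hypothetical stand-ins.

/* Hypothetical illustration: with no SIMD variant, the generated header
 * simply aliases the public name to the C implementation, so the call
 * below is a direct, statically bound call with no function pointer,
 * matching lines such as "#define aom_yv12_copy_y aom_yv12_copy_y_c". */
#include <stdio.h>

static void my_scale_c(const unsigned char *src, unsigned char *dst) {
  dst[0] = src[0]; /* stand-in body */
}
#define my_scale my_scale_c

int main(void) {
  unsigned char s = 7, d = 0;
  my_scale(&s, &d); /* expands to my_scale_c(&s, &d) at compile time */
  printf("%u\n", d);
  return 0;
}
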
diff --git a/media/libaom/config/win/x64/config/av1_rtcd.h b/media/libaom/config/win/x64/config/av1_rtcd.h
new file mode 100644
index 000000000..d27318208
--- /dev/null
+++ b/media/libaom/config/win/x64/config/av1_rtcd.h
@@ -0,0 +1,594 @@
+// This file is generated. Do not edit.
+#ifndef AV1_RTCD_H_
+#define AV1_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void apply_selfguided_restoration_c(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+void apply_selfguided_restoration_avx2(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+RTCD_EXTERN void (*apply_selfguided_restoration)(const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd);
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w);
+
+void av1_build_compound_diffwtd_mask_d16_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_sse4_1(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+void av1_build_compound_diffwtd_mask_d16_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_d16)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd);
+
+void av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_ssse3(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+void av1_build_compound_diffwtd_mask_highbd_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+RTCD_EXTERN void (*av1_build_compound_diffwtd_mask_highbd)(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd);
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_copy_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_scale)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params);
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_2d_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+RTCD_EXTERN void (*av1_convolve_horiz_rs)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn);
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_x_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_convolve_y_sr)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
+#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
+
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z2 av1_dr_prediction_z2_c
+
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy);
+#define av1_dr_prediction_z3 av1_dr_prediction_z3_c
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength);
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge)(uint8_t *p, int sz, int strength);
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength);
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength);
+RTCD_EXTERN void (*av1_filter_intra_edge_high)(uint16_t *p, int sz, int strength);
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+RTCD_EXTERN void (*av1_filter_intra_predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode);
+
+void av1_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8 av1_highbd_convolve8_sse2
+
+void av1_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_horiz av1_highbd_convolve8_horiz_sse2
+
+void av1_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+void av1_highbd_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve8_vert av1_highbd_convolve8_vert_sse2
+
+void av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_sse2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_copy_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_copy_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_scale_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_scale)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_2d_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_avg av1_highbd_convolve_avg_c
+
+void av1_highbd_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps);
+#define av1_highbd_convolve_copy av1_highbd_convolve_copy_c
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_horiz_rs)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd);
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_x_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_convolve_y_sr)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
+
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z2 av1_highbd_dr_prediction_z2_c
+
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd);
+#define av1_highbd_dr_prediction_z3 av1_highbd_dr_prediction_z3_c
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_16x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_32x32)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_4x4)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x16)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_highbd_inv_txfm_add_8x8)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_16_add av1_highbd_iwht4x4_16_add_c
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_2d_copy_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_2d_copy)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_x)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_sse4_1(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+void av1_highbd_jnt_convolve_y_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+RTCD_EXTERN void (*av1_highbd_jnt_convolve_y)(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_highbd_warp_affine)(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_highbd_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+void av1_highbd_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+RTCD_EXTERN void (*av1_highbd_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps);
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x16 av1_inv_txfm2d_add_16x16_c
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x32 av1_inv_txfm2d_add_16x32_c
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x4 av1_inv_txfm2d_add_16x4_c
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x64 av1_inv_txfm2d_add_16x64_c
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_16x8 av1_inv_txfm2d_add_16x8_c
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x16 av1_inv_txfm2d_add_32x16_c
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x32 av1_inv_txfm2d_add_32x32_c
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x64 av1_inv_txfm2d_add_32x64_c
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_32x8 av1_inv_txfm2d_add_32x8_c
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x16 av1_inv_txfm2d_add_4x16_c
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_4x4)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_4x8 av1_inv_txfm2d_add_4x8_c
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x16 av1_inv_txfm2d_add_64x16_c
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x32 av1_inv_txfm2d_add_64x32_c
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_64x64 av1_inv_txfm2d_add_64x64_c
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x16 av1_inv_txfm2d_add_8x16_c
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x32 av1_inv_txfm2d_add_8x32_c
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+#define av1_inv_txfm2d_add_8x4 av1_inv_txfm2d_add_8x4_c
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+RTCD_EXTERN void (*av1_inv_txfm2d_add_8x8)(const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd);
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+RTCD_EXTERN void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_2d_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_x)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_jnt_convolve_y)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+RTCD_EXTERN int (*av1_selfguided_restoration)(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd);
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz);
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz);
+RTCD_EXTERN void (*av1_upsample_intra_edge)(uint8_t *p, int sz);
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd);
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd);
+RTCD_EXTERN void (*av1_upsample_intra_edge_high)(uint16_t *p, int sz, int bd);
+
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+RTCD_EXTERN void (*av1_warp_affine)(const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+RTCD_EXTERN void (*av1_wiener_convolve_add_src)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params);
+
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_ssse3(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_sse4_1(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+void cdef_filter_block_avx2(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+RTCD_EXTERN void (*cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift);
+
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_ssse3(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_sse4_1(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+int cdef_find_dir_avx2(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+RTCD_EXTERN int (*cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_420_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_420_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_422_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_422_lbd)(TX_SIZE tx_size);
+
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_hbd_fn (*cfl_get_luma_subsampling_444_hbd)(TX_SIZE tx_size);
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_ssse3(TX_SIZE tx_size);
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subsample_lbd_fn (*cfl_get_luma_subsampling_444_lbd)(TX_SIZE tx_size);
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+void copy_rect8_16bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h);
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_ssse3(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_sse4_1(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+void copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+RTCD_EXTERN void (*copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h);
+
+cfl_predict_hbd_fn get_predict_hbd_fn_c(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_hbd_fn (*get_predict_hbd_fn)(TX_SIZE tx_size);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_c(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size);
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_predict_lbd_fn (*get_predict_lbd_fn)(TX_SIZE tx_size);
+
+cfl_subtract_average_fn get_subtract_average_fn_c(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_sse2(TX_SIZE tx_size);
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size);
+RTCD_EXTERN cfl_subtract_average_fn (*get_subtract_average_fn)(TX_SIZE tx_size);
+
+void av1_rtcd(void);
+
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+ apply_selfguided_restoration = apply_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) apply_selfguided_restoration = apply_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) apply_selfguided_restoration = apply_selfguided_restoration_avx2;
+ av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask = av1_build_compound_diffwtd_mask_avx2;
+ av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_c;
+ if (flags & HAS_SSE4_1) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_sse4_1;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_d16 = av1_build_compound_diffwtd_mask_d16_avx2;
+ av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_c;
+ if (flags & HAS_SSSE3) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_ssse3;
+ if (flags & HAS_AVX2) av1_build_compound_diffwtd_mask_highbd = av1_build_compound_diffwtd_mask_highbd_avx2;
+ av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_copy_sr = av1_convolve_2d_copy_sr_avx2;
+ av1_convolve_2d_scale = av1_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_convolve_2d_scale = av1_convolve_2d_scale_sse4_1;
+ av1_convolve_2d_sr = av1_convolve_2d_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_2d_sr = av1_convolve_2d_sr_avx2;
+ av1_convolve_horiz_rs = av1_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_convolve_horiz_rs = av1_convolve_horiz_rs_sse4_1;
+ av1_convolve_x_sr = av1_convolve_x_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_x_sr = av1_convolve_x_sr_avx2;
+ av1_convolve_y_sr = av1_convolve_y_sr_sse2;
+ if (flags & HAS_AVX2) av1_convolve_y_sr = av1_convolve_y_sr_avx2;
+ av1_filter_intra_edge = av1_filter_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge = av1_filter_intra_edge_sse4_1;
+ av1_filter_intra_edge_high = av1_filter_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_edge_high = av1_filter_intra_edge_high_sse4_1;
+ av1_filter_intra_predictor = av1_filter_intra_predictor_c;
+ if (flags & HAS_SSE4_1) av1_filter_intra_predictor = av1_filter_intra_predictor_sse4_1;
+ av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_sse2;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_copy_sr = av1_highbd_convolve_2d_copy_sr_avx2;
+ av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_2d_scale = av1_highbd_convolve_2d_scale_sse4_1;
+ av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_2d_sr = av1_highbd_convolve_2d_sr_avx2;
+ av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_c;
+ if (flags & HAS_SSE4_1) av1_highbd_convolve_horiz_rs = av1_highbd_convolve_horiz_rs_sse4_1;
+ av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_x_sr = av1_highbd_convolve_x_sr_avx2;
+ av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_c;
+ if (flags & HAS_SSSE3) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_convolve_y_sr = av1_highbd_convolve_y_sr_avx2;
+ av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add = av1_highbd_inv_txfm_add_avx2;
+ av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x16 = av1_highbd_inv_txfm_add_16x16_sse4_1;
+ av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_16x8 = av1_highbd_inv_txfm_add_16x8_sse4_1;
+ av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_inv_txfm_add_32x32 = av1_highbd_inv_txfm_add_32x32_avx2;
+ av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_4x4 = av1_highbd_inv_txfm_add_4x4_sse4_1;
+ av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x16 = av1_highbd_inv_txfm_add_8x16_sse4_1;
+ av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_highbd_inv_txfm_add_8x8 = av1_highbd_inv_txfm_add_8x8_sse4_1;
+ av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d = av1_highbd_jnt_convolve_2d_avx2;
+ av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_2d_copy = av1_highbd_jnt_convolve_2d_copy_avx2;
+ av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_x = av1_highbd_jnt_convolve_x_avx2;
+ av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_c;
+ if (flags & HAS_SSE4_1) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_sse4_1;
+ if (flags & HAS_AVX2) av1_highbd_jnt_convolve_y = av1_highbd_jnt_convolve_y_avx2;
+ av1_highbd_warp_affine = av1_highbd_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_highbd_warp_affine = av1_highbd_warp_affine_sse4_1;
+ av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_c;
+ if (flags & HAS_SSSE3) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_ssse3;
+ if (flags & HAS_AVX2) av1_highbd_wiener_convolve_add_src = av1_highbd_wiener_convolve_add_src_avx2;
+ av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_4x4 = av1_inv_txfm2d_add_4x4_sse4_1;
+ av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_c;
+ if (flags & HAS_SSE4_1) av1_inv_txfm2d_add_8x8 = av1_inv_txfm2d_add_8x8_sse4_1;
+ av1_inv_txfm_add = av1_inv_txfm_add_c;
+ if (flags & HAS_SSSE3) av1_inv_txfm_add = av1_inv_txfm_add_ssse3;
+ if (flags & HAS_AVX2) av1_inv_txfm_add = av1_inv_txfm_add_avx2;
+ av1_jnt_convolve_2d = av1_jnt_convolve_2d_c;
+ if (flags & HAS_SSSE3) av1_jnt_convolve_2d = av1_jnt_convolve_2d_ssse3;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d = av1_jnt_convolve_2d_avx2;
+ av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_2d_copy = av1_jnt_convolve_2d_copy_avx2;
+ av1_jnt_convolve_x = av1_jnt_convolve_x_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_x = av1_jnt_convolve_x_avx2;
+ av1_jnt_convolve_y = av1_jnt_convolve_y_sse2;
+ if (flags & HAS_AVX2) av1_jnt_convolve_y = av1_jnt_convolve_y_avx2;
+ av1_selfguided_restoration = av1_selfguided_restoration_c;
+ if (flags & HAS_SSE4_1) av1_selfguided_restoration = av1_selfguided_restoration_sse4_1;
+ if (flags & HAS_AVX2) av1_selfguided_restoration = av1_selfguided_restoration_avx2;
+ av1_upsample_intra_edge = av1_upsample_intra_edge_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge = av1_upsample_intra_edge_sse4_1;
+ av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_c;
+ if (flags & HAS_SSE4_1) av1_upsample_intra_edge_high = av1_upsample_intra_edge_high_sse4_1;
+ av1_warp_affine = av1_warp_affine_c;
+ if (flags & HAS_SSE4_1) av1_warp_affine = av1_warp_affine_sse4_1;
+ av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_sse2;
+ if (flags & HAS_AVX2) av1_wiener_convolve_add_src = av1_wiener_convolve_add_src_avx2;
+ cdef_filter_block = cdef_filter_block_sse2;
+ if (flags & HAS_SSSE3) cdef_filter_block = cdef_filter_block_ssse3;
+ if (flags & HAS_SSE4_1) cdef_filter_block = cdef_filter_block_sse4_1;
+ if (flags & HAS_AVX2) cdef_filter_block = cdef_filter_block_avx2;
+ cdef_find_dir = cdef_find_dir_sse2;
+ if (flags & HAS_SSSE3) cdef_find_dir = cdef_find_dir_ssse3;
+ if (flags & HAS_SSE4_1) cdef_find_dir = cdef_find_dir_sse4_1;
+ if (flags & HAS_AVX2) cdef_find_dir = cdef_find_dir_avx2;
+ cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_hbd = cfl_get_luma_subsampling_420_hbd_avx2;
+ cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_420_lbd = cfl_get_luma_subsampling_420_lbd_avx2;
+ cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_hbd = cfl_get_luma_subsampling_422_hbd_avx2;
+ cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_422_lbd = cfl_get_luma_subsampling_422_lbd_avx2;
+ cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_hbd = cfl_get_luma_subsampling_444_hbd_avx2;
+ cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_c;
+ if (flags & HAS_SSSE3) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_ssse3;
+ if (flags & HAS_AVX2) cfl_get_luma_subsampling_444_lbd = cfl_get_luma_subsampling_444_lbd_avx2;
+ copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_16bit_to_16bit = copy_rect8_16bit_to_16bit_avx2;
+ copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse2;
+ if (flags & HAS_SSSE3) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_ssse3;
+ if (flags & HAS_SSE4_1) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_sse4_1;
+ if (flags & HAS_AVX2) copy_rect8_8bit_to_16bit = copy_rect8_8bit_to_16bit_avx2;
+ get_predict_hbd_fn = get_predict_hbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_hbd_fn = get_predict_hbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_hbd_fn = get_predict_hbd_fn_avx2;
+ get_predict_lbd_fn = get_predict_lbd_fn_c;
+ if (flags & HAS_SSSE3) get_predict_lbd_fn = get_predict_lbd_fn_ssse3;
+ if (flags & HAS_AVX2) get_predict_lbd_fn = get_predict_lbd_fn_avx2;
+ get_subtract_average_fn = get_subtract_average_fn_sse2;
+ if (flags & HAS_AVX2) get_subtract_average_fn = get_subtract_average_fn_avx2;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
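
Each per-architecture block above follows the same runtime-dispatch idiom: every hook starts at a baseline implementation (the portable C routine, or the SSE2 variant where SSE2 is assumed), and is then overwritten by progressively wider SIMD variants guarded by CPU-capability flags, so the last matching check wins. A standalone sketch of that idiom, assuming hypothetical convolve_* hooks and locally defined flag bits (in the real generated header the flags come from the aom_ports CPU probe):

#include <stdio.h>

/* Hypothetical stand-ins for the real SIMD hooks and capability bits. */
#define HAS_SSSE3 (1 << 0)
#define HAS_AVX2  (1 << 1)

static void convolve_c(void)     { puts("C fallback"); }
static void convolve_ssse3(void) { puts("SSSE3 variant"); }
static void convolve_avx2(void)  { puts("AVX2 variant"); }

static void (*convolve)(void) = convolve_c;

static void setup_dispatch(int flags)
{
  convolve = convolve_c;                            /* always reset to the baseline */
  if (flags & HAS_SSSE3) convolve = convolve_ssse3; /* upgrade when SSSE3 is available */
  if (flags & HAS_AVX2)  convolve = convolve_avx2;  /* the widest available variant wins */
}

int main(void)
{
  setup_dispatch(HAS_SSSE3 | HAS_AVX2); /* e.g. the flags a CPU probe reports */
  convolve();                           /* prints "AVX2 variant" */
  return 0;
}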
diff --git a/media/libaom/generate_sources_mozbuild.py b/media/libaom/generate_sources_mozbuild.py
new file mode 100644
index 000000000..a5a75d8be
--- /dev/null
+++ b/media/libaom/generate_sources_mozbuild.py
@@ -0,0 +1,163 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+import cmakeparser as cp
+
+import copy
+import datetime
+import os
+import re
+import subprocess
+
+AOM_DIR = '../../third_party/aom'
+
+def write_aom_config(system, arch, variables, cache_variables):
+ # read template cmake file
+ variables['year'] = datetime.datetime.now().year
+ cp.parse(variables, [], os.path.join(AOM_DIR, 'build', 'cmake',
+ 'generate_aom_config_templates.cmake'))
+
+ # filter variables
+ cache_variables = [x for x in sorted(cache_variables)
+ if x and not x.startswith((' ', 'CMAKE', 'AOM'))]
+
+ # inherit this from the mozilla build config
+ cache_variables.remove('HAVE_PTHREAD_H')
+
+ outdir = os.path.join('config', system, arch, 'config')
+ try:
+ os.makedirs(outdir)
+ except OSError:
+ pass
+
+ with open(os.path.join(outdir, 'aom_config.h'), 'w') as f:
+ header = variables['h_file_header_block']
+ f.write(header)
+ f.write('\n')
+ for var in cache_variables:
+ f.write('#define %s %s\n' % (var, variables[var]))
+ f.write('#endif /* AOM_CONFIG_H_ */\n')
+
+ with open(os.path.join(outdir, 'aom_config.asm'), 'w') as f:
+ header = variables['asm_file_header_block']
+ f.write(header)
+ f.write('\n')
+ for var in cache_variables:
+ if var in ['INCLUDE_INSTALL_DIR', 'INLINE',
+ 'LIB_INSTALL_DIR', 'RESTRICT']:
+ continue
+ if arch == 'arm':
+ f.write('.equ %s, %s\n' % (var, variables[var]))
+ else:
+ f.write('%s equ %s\n' % (var, variables[var]))
+
+ if arch == 'arm':
+ f.write('.section .note.GNU-stack,"",%progbits')
+
+
+if __name__ == '__main__':
+ import sys
+
+ shared_variables = {
+ 'CMAKE_CURRENT_SOURCE_DIR': AOM_DIR,
+ 'CONFIG_AV1_DECODER': 1,
+ 'CONFIG_AV1_ENCODER': 0,
+ 'CONFIG_COLLECT_INTER_MODE_RD_STATS': 0,
+ 'CONFIG_INSPECTION': 0,
+ 'CONFIG_INTERNAL_STATS': 0,
+ 'CONFIG_LIBYUV': 0,
+ 'CONFIG_LOWBITDEPTH': 1,
+ 'CONFIG_MULTITHREAD': 1,
+ 'CONFIG_PIC': 0,
+ 'CONFIG_WEBM_IO': 0,
+ 'CMAKE_CURRENT_BINARY_DIR': 'OBJDIR',
+ 'CMAKE_INSTALL_PREFIX': 'INSTALLDIR',
+ 'CMAKE_SYSTEM_NAME': 'Linux',
+ 'CMAKE_SYSTEM_PROCESSOR': 'x86_64',
+ 'ENABLE_EXAMPLES': 0,
+ 'ENABLE_TESTS': 0,
+ 'ENABLE_TOOLS': 0,
+ 'ENABLE_DOCS': 0,
+ 'AOM_TEST_TEST_CMAKE_': 1,  # prevent building tests
+ }
+
+ f = open('sources.mozbuild', 'wb')
+ f.write('# This file is generated. Do not edit.\n\n')
+ f.write('files = {\n')
+
+ platforms = [
+ ('armv7', 'linux', 'arm', True),
+ ('generic', '', 'generic', True),
+ ('x86', 'linux', 'ia32', True),
+ ('x86', 'win', 'mingw32', False),
+ ('x86', 'win', 'ia32', False),
+ ('x86_64', 'linux', 'x64', True),
+ ('x86_64', 'mac', 'x64', False),
+ ('x86_64', 'win', 'x64', False),
+ ('x86_64', 'win', 'mingw64', False),
+ ]
+ for cpu, system, arch, generate_sources in platforms:
+ print('Running CMake for %s (%s)' % (cpu, system))
+ variables = shared_variables.copy()
+ variables['AOM_TARGET_CPU'] = cpu
+
+ # We skip compiling test programs that detect these
+ variables['HAVE_FEXCEPT'] = 1
+ variables['INLINE'] = 'inline'
+ if cpu == 'x86' and system == 'linux':
+ variables['CONFIG_PIC'] = 1
+ if cpu == 'armv7':
+ variables['CONFIG_PIC'] = 1
+ if system == 'win' and not arch.startswith('mingw'):
+ variables['MSVC'] = 1
+
+ cache_variables = []
+ sources = cp.parse(variables, cache_variables,
+ os.path.join(AOM_DIR, 'CMakeLists.txt'))
+
+ # Disable HAVE_UNISTD_H.
+ cache_variables.remove('HAVE_UNISTD_H')
+ write_aom_config(system, arch, variables, cache_variables)
+ # Currently, the sources are the same for each supported cpu
+ # regardless of operating system / compiler. If that changes, we'll
+ # have to generate sources for each combination.
+ if generate_sources:
+ # Remove spurious sources and perl files
+ sources = filter(lambda x: x.startswith(AOM_DIR), sources)
+ sources = filter(lambda x: not x.endswith('.pl'), sources)
+
+ # Filter out exports
+ exports = filter(lambda x: re.match(os.path.join(AOM_DIR, '(aom|aom_mem|aom_ports|aom_scale)/.*h$'), x), sources)
+ exports = filter(lambda x: not re.search('(internal|src)', x), exports)
+ exports = filter(lambda x: not re.search('(emmintrin_compat.h|mem_.*|msvc.h|aom_once.h)$', x), exports)
+
+ for export in exports:
+ sources.remove(export)
+
+ # Remove header files
+ sources = sorted(filter(lambda x: not x.endswith('.h'), sources))
+
+ # The build system is unhappy if two files have the same prefix
+ # In libaom, sometimes .asm and .c files share the same prefix
+ for i in xrange(len(sources) - 1):
+ if sources[i].endswith('.asm'):
+ if os.path.splitext(sources[i])[0] == os.path.splitext(sources[i + 1])[0]:
+ old = sources[i]
+ sources[i] = sources[i].replace('.asm', '_asm.asm')
+ if not os.path.exists(sources[i]):
+ os.rename(old, sources[i])
+
+ f.write(' \'%s_EXPORTS\': [\n' % arch.upper())
+ for export in sorted(exports):
+ f.write(' \'%s\',\n' % export)
+ f.write(" ],\n")
+
+ f.write(' \'%s_SOURCES\': [\n' % arch.upper())
+ for source in sorted(sources):
+ f.write(' \'%s\',\n' % source)
+ f.write(' ],\n')
+
+ print('\n')
+
+ f.write('}\n')
+ f.close()
diff --git a/media/libaom/generate_sources_mozbuild.sh b/media/libaom/generate_sources_mozbuild.sh
new file mode 100755
index 000000000..fc12b7230
--- /dev/null
+++ b/media/libaom/generate_sources_mozbuild.sh
@@ -0,0 +1,80 @@
+#!/bin/bash -e
+#
+# Copyright (c) 2012 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+# Modified from chromium/src/third_party/libaom/generate_gni.sh
+
+# This script is used to generate sources.mozbuild and files in the
+# config/platform directories needed to build libaom.
+# Every time the libaom source code is updated, just run this script.
+#
+# Usage:
+# $ ./generate_sources_mozbuild.sh
+
+export LC_ALL=C
+BASE_DIR=$(pwd)
+LIBAOM_SRC_DIR="../../third_party/aom"
+LIBAOM_CONFIG_DIR="config"
+
+# Print license header.
+# $1 - Output base name
+function write_license {
+ echo "# This file is generated. Do not edit." >> $1
+ echo "" >> $1
+}
+
+# Generate *_rtcd.h files.
+# $1 - Header file directory.
+# $2 - Architecture.
+# $3 - Optional - any additional arguments to pass through.
+function gen_rtcd_header {
+ echo "Generate $LIBAOM_CONFIG_DIR/$1/*_rtcd.h files."
+
+ AOM_CONFIG=$BASE_DIR/$LIBAOM_CONFIG_DIR/$1/config/aom_config.h
+
+ $BASE_DIR/$LIBAOM_SRC_DIR/build/cmake/rtcd.pl \
+ --arch=$2 \
+ --sym=av1_rtcd $3 \
+ --config=$AOM_CONFIG \
+ $BASE_DIR/$LIBAOM_SRC_DIR/av1/common/av1_rtcd_defs.pl \
+ > $BASE_DIR/$LIBAOM_CONFIG_DIR/$1/config/av1_rtcd.h
+
+ $BASE_DIR/$LIBAOM_SRC_DIR/build/cmake/rtcd.pl \
+ --arch=$2 \
+ --sym=aom_scale_rtcd $3 \
+ --config=$AOM_CONFIG \
+ $BASE_DIR/$LIBAOM_SRC_DIR/aom_scale/aom_scale_rtcd.pl \
+ > $BASE_DIR/$LIBAOM_CONFIG_DIR/$1/config/aom_scale_rtcd.h
+
+ $BASE_DIR/$LIBAOM_SRC_DIR/build/cmake/rtcd.pl \
+ --arch=$2 \
+ --sym=aom_dsp_rtcd $3 \
+ --config=$AOM_CONFIG \
+ $BASE_DIR/$LIBAOM_SRC_DIR/aom_dsp/aom_dsp_rtcd_defs.pl \
+ > $BASE_DIR/$LIBAOM_CONFIG_DIR/$1/config/aom_dsp_rtcd.h
+}
+
+echo "Generating config files."
+cd $BASE_DIR
+python generate_sources_mozbuild.py
+
+# Copy aom_version.h once. The file is the same for all platforms.
+cp aom_version.h $BASE_DIR/$LIBAOM_CONFIG_DIR
+
+gen_rtcd_header linux/x64 x86_64
+gen_rtcd_header linux/ia32 x86
+gen_rtcd_header mac/x64 x86_64
+gen_rtcd_header win/x64 x86_64
+gen_rtcd_header win/ia32 x86
+gen_rtcd_header win/mingw32 x86
+gen_rtcd_header win/mingw64 x86_64
+
+gen_rtcd_header linux/arm armv7
+
+gen_rtcd_header generic generic
+
+cd $BASE_DIR/$LIBAOM_SRC_DIR
+
+cd $BASE_DIR
diff --git a/media/libaom/moz.build b/media/libaom/moz.build
new file mode 100644
index 000000000..e376878cc
--- /dev/null
+++ b/media/libaom/moz.build
@@ -0,0 +1,143 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+with Files('*'):
+ BUG_COMPONENT = ('Core', 'Audio/Video')
+
+include('sources.mozbuild')
+
+# Linux, Mac and Win share file lists for x86* but not configurations.
+if CONFIG['CPU_ARCH'] == 'x86_64':
+ EXPORTS.aom += files['X64_EXPORTS']
+ SOURCES += files['X64_SOURCES']
+ USE_YASM = True
+ if CONFIG['OS_TARGET'] == 'WINNT':
+ if CONFIG['GNU_CC']:
+ ASFLAGS += [ '-I%s/media/libaom/config/win/mingw64/' % TOPSRCDIR ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/win/mingw64/' ]
+ EXPORTS.aom += [ 'config/win/mingw64/config/aom_config.h' ]
+ else:
+ ASFLAGS += [ '-I%s/media/libaom/config/win/x64/' % TOPSRCDIR ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/win/x64/' ]
+ EXPORTS.aom += [ 'config/win/x64/config/aom_config.h' ]
+ NO_PGO = True
+ elif CONFIG['OS_TARGET'] == 'Darwin':
+ ASFLAGS += [ '-I%s/media/libaom/config/mac/x64/' % TOPSRCDIR ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/mac/x64/' ]
+ EXPORTS.aom += [ 'config/mac/x64/config/aom_config.h' ]
+ else: # Android, Linux, BSDs, etc.
+ ASFLAGS += [ '-I%s/media/libaom/config/linux/x64/' % TOPSRCDIR ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/linux/x64/' ]
+ EXPORTS.aom += [ 'config/linux/x64/config/aom_config.h' ]
+elif CONFIG['CPU_ARCH'] == 'x86':
+ EXPORTS.aom += files['IA32_EXPORTS']
+ SOURCES += files['IA32_SOURCES']
+ USE_YASM = True
+ if CONFIG['OS_TARGET'] == 'WINNT':
+ if CONFIG['GNU_CC']:
+ ASFLAGS += [ '-I%s/media/libaom/config/win/mingw32/' % TOPSRCDIR ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/win/mingw32/' ]
+ EXPORTS.aom += [ 'config/win/mingw32/config/aom_config.h' ]
+ else:
+ ASFLAGS += [ '-I%s/media/libaom/config/win/ia32/' % TOPSRCDIR ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/win/ia32/' ]
+ EXPORTS.aom += [ 'config/win/ia32/config/aom_config.h' ]
+ NO_PGO = True
+ else: # Android, Linux, BSDs, etc.
+ ASFLAGS += [ '-I%s/media/libaom/config/linux/ia32/' % TOPSRCDIR ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/linux/ia32/' ]
+ EXPORTS.aom += [ 'config/linux/ia32/config/aom_config.h' ]
+elif CONFIG['CPU_ARCH'] == 'arm':
+ EXPORTS.aom += files['ARM_EXPORTS']
+ ASFLAGS += [
+ '-I%s/media/libaom/config/linux/arm/' % TOPSRCDIR,
+ '-I%s/libaom' % OBJDIR,
+ ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/linux/arm/' ]
+ EXPORTS.aom += [ 'config/linux/arm/config/aom_config.h' ]
+
+ arm_asm_files = files['ARM_SOURCES']
+
+ if CONFIG['VPX_AS_CONVERSION']:
+ SOURCES += sorted([
+ "!%s.S" % f if f.endswith('.asm') else f for f in arm_asm_files
+ ])
+ else:
+ SOURCES += sorted(arm_asm_files)
+
+ for f in SOURCES:
+ if f.endswith('neon.c'):
+ SOURCES[f].flags += CONFIG['VPX_ASFLAGS']
+
+ if CONFIG['OS_TARGET'] == 'Android':
+ # For cpu-features.h
+ LOCAL_INCLUDES += [
+ '%%%s/sources/android/cpufeatures' % CONFIG['ANDROID_NDK'],
+ ]
+ if CONFIG['CLANG_CXX']:
+ ASFLAGS += [
+ '-no-integrated-as',
+ ]
+else:
+ # Generic C-only configuration
+ EXPORTS.aom += files['GENERIC_EXPORTS']
+ SOURCES += files['GENERIC_SOURCES']
+ ASFLAGS += [ '-I%s/media/libaom/config/generic/' % TOPSRCDIR ]
+ LOCAL_INCLUDES += [ '/media/libaom/config/generic/' ]
+ EXPORTS.aom += [ 'config/generic/config/aom_config.h' ]
+
+# We allow warnings for third-party code that can be updated from upstream.
+ALLOW_COMPILER_WARNINGS = True
+
+FINAL_LIBRARY = 'gkmedias'
+
+if CONFIG['OS_TARGET'] == 'Android':
+ # Older versions of the Android NDK don't pre-define anything to indicate
+ # the OS they're on, so do it for them.
+ DEFINES['__linux__'] = True
+
+ if not CONFIG['MOZ_WEBRTC']:
+ SOURCES += [
+ '%%%s/sources/android/cpufeatures/cpu-features.c' % CONFIG['ANDROID_NDK'],
+ ]
+
+if CONFIG['CLANG_CL'] or not CONFIG['_MSC_VER']:
+ for f in SOURCES:
+ if f.endswith('sse2.c'):
+ SOURCES[f].flags += CONFIG['SSE2_FLAGS']
+ elif f.endswith('ssse3.c'):
+ SOURCES[f].flags += ['-mssse3']
+ elif f.endswith('sse4.c'):
+ SOURCES[f].flags += ['-msse4.1']
+ elif f.endswith('sse42.c'):
+ SOURCES[f].flags += ['-msse4.2']
+ elif f.endswith('avx.c'):
+ SOURCES[f].flags += ['-mavx']
+ elif f.endswith('avx2.c'):
+ SOURCES[f].flags += ['-mavx2']
+
+# Suppress warnings in third-party code.
+if CONFIG['GNU_CC'] or CONFIG['CLANG_CL']:
+ CFLAGS += [
+ '-Wno-sign-compare',
+ '-Wno-unused-function', # so many of these warnings; just ignore them
+ ]
+if CONFIG['CLANG_CXX'] or CONFIG['CLANG_CL']:
+ CFLAGS += [
+ '-Wno-unreachable-code',
+ '-Wno-unneeded-internal-declaration',
+ ]
+
+ASFLAGS += CONFIG['VPX_ASFLAGS']
+ASFLAGS += [
+ '-I.',
+ '-I%s/third_party/aom' % TOPSRCDIR,
+]
+
+LOCAL_INCLUDES += [
+ '/media/libaom/config', # aom_version.h
+ '/third_party/aom',
+]
diff --git a/media/libaom/sources.mozbuild b/media/libaom/sources.mozbuild
new file mode 100644
index 000000000..9aff837c4
--- /dev/null
+++ b/media/libaom/sources.mozbuild
@@ -0,0 +1,533 @@
+# This file is automatically generated. Do not edit.
+
+files = {
+ 'ARM_EXPORTS': [
+ '../../third_party/aom/aom/aom.h',
+ '../../third_party/aom/aom/aom_codec.h',
+ '../../third_party/aom/aom/aom_decoder.h',
+ '../../third_party/aom/aom/aom_encoder.h',
+ '../../third_party/aom/aom/aom_frame_buffer.h',
+ '../../third_party/aom/aom/aom_image.h',
+ '../../third_party/aom/aom/aom_integer.h',
+ '../../third_party/aom/aom/aomcx.h',
+ '../../third_party/aom/aom/aomdx.h',
+ '../../third_party/aom/aom_mem/aom_mem.h',
+ '../../third_party/aom/aom_ports/aom_timer.h',
+ '../../third_party/aom/aom_ports/arm.h',
+ '../../third_party/aom/aom_ports/bitops.h',
+ '../../third_party/aom/aom_ports/mem.h',
+ '../../third_party/aom/aom_ports/sanitizer.h',
+ '../../third_party/aom/aom_ports/system_state.h',
+ '../../third_party/aom/aom_scale/aom_scale.h',
+ '../../third_party/aom/aom_scale/yv12config.h',
+ ],
+ 'ARM_SOURCES': [
+ '../../third_party/aom/aom/src/aom_codec.c',
+ '../../third_party/aom/aom/src/aom_decoder.c',
+ '../../third_party/aom/aom/src/aom_encoder.c',
+ '../../third_party/aom/aom/src/aom_image.c',
+ '../../third_party/aom/aom/src/aom_integer.c',
+ '../../third_party/aom/aom_dsp/aom_convolve.c',
+ '../../third_party/aom/aom_dsp/aom_dsp_rtcd.c',
+ '../../third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c',
+ '../../third_party/aom/aom_dsp/arm/fwd_txfm_neon.c',
+ '../../third_party/aom/aom_dsp/arm/intrapred_neon.c',
+ '../../third_party/aom/aom_dsp/arm/loopfilter_neon.c',
+ '../../third_party/aom/aom_dsp/arm/subtract_neon.c',
+ '../../third_party/aom/aom_dsp/binary_codes_reader.c',
+ '../../third_party/aom/aom_dsp/bitreader_buffer.c',
+ '../../third_party/aom/aom_dsp/bitwriter_buffer.c',
+ '../../third_party/aom/aom_dsp/blend_a64_hmask.c',
+ '../../third_party/aom/aom_dsp/blend_a64_mask.c',
+ '../../third_party/aom/aom_dsp/blend_a64_vmask.c',
+ '../../third_party/aom/aom_dsp/daalaboolreader.c',
+ '../../third_party/aom/aom_dsp/entcode.c',
+ '../../third_party/aom/aom_dsp/entdec.c',
+ '../../third_party/aom/aom_dsp/fft.c',
+ '../../third_party/aom/aom_dsp/grain_synthesis.c',
+ '../../third_party/aom/aom_dsp/intrapred.c',
+ '../../third_party/aom/aom_dsp/loopfilter.c',
+ '../../third_party/aom/aom_dsp/subtract.c',
+ '../../third_party/aom/aom_mem/aom_mem.c',
+ '../../third_party/aom/aom_ports/arm_cpudetect.c',
+ '../../third_party/aom/aom_scale/aom_scale_rtcd.c',
+ '../../third_party/aom/aom_scale/generic/aom_scale.c',
+ '../../third_party/aom/aom_scale/generic/gen_scalers.c',
+ '../../third_party/aom/aom_scale/generic/yv12config.c',
+ '../../third_party/aom/aom_scale/generic/yv12extend.c',
+ '../../third_party/aom/aom_util/aom_thread.c',
+ '../../third_party/aom/aom_util/debug_util.c',
+ '../../third_party/aom/av1/av1_dx_iface.c',
+ '../../third_party/aom/av1/common/alloccommon.c',
+ '../../third_party/aom/av1/common/arm/av1_inv_txfm_neon.c',
+ '../../third_party/aom/av1/common/arm/av1_txfm_neon.c',
+ '../../third_party/aom/av1/common/arm/blend_a64_hmask_neon.c',
+ '../../third_party/aom/av1/common/arm/blend_a64_vmask_neon.c',
+ '../../third_party/aom/av1/common/arm/cfl_neon.c',
+ '../../third_party/aom/av1/common/arm/convolve_neon.c',
+ '../../third_party/aom/av1/common/arm/jnt_convolve_neon.c',
+ '../../third_party/aom/av1/common/arm/reconinter_neon.c',
+ '../../third_party/aom/av1/common/arm/selfguided_neon.c',
+ '../../third_party/aom/av1/common/arm/warp_plane_neon.c',
+ '../../third_party/aom/av1/common/arm/wiener_convolve_neon.c',
+ '../../third_party/aom/av1/common/av1_inv_txfm1d.c',
+ '../../third_party/aom/av1/common/av1_inv_txfm2d.c',
+ '../../third_party/aom/av1/common/av1_loopfilter.c',
+ '../../third_party/aom/av1/common/av1_rtcd.c',
+ '../../third_party/aom/av1/common/av1_txfm.c',
+ '../../third_party/aom/av1/common/blockd.c',
+ '../../third_party/aom/av1/common/cdef.c',
+ '../../third_party/aom/av1/common/cdef_block.c',
+ '../../third_party/aom/av1/common/cdef_block_neon.c',
+ '../../third_party/aom/av1/common/cfl.c',
+ '../../third_party/aom/av1/common/convolve.c',
+ '../../third_party/aom/av1/common/debugmodes.c',
+ '../../third_party/aom/av1/common/entropy.c',
+ '../../third_party/aom/av1/common/entropymode.c',
+ '../../third_party/aom/av1/common/entropymv.c',
+ '../../third_party/aom/av1/common/frame_buffers.c',
+ '../../third_party/aom/av1/common/idct.c',
+ '../../third_party/aom/av1/common/mvref_common.c',
+ '../../third_party/aom/av1/common/obu_util.c',
+ '../../third_party/aom/av1/common/odintrin.c',
+ '../../third_party/aom/av1/common/pred_common.c',
+ '../../third_party/aom/av1/common/quant_common.c',
+ '../../third_party/aom/av1/common/reconinter.c',
+ '../../third_party/aom/av1/common/reconintra.c',
+ '../../third_party/aom/av1/common/resize.c',
+ '../../third_party/aom/av1/common/restoration.c',
+ '../../third_party/aom/av1/common/scale.c',
+ '../../third_party/aom/av1/common/scan.c',
+ '../../third_party/aom/av1/common/seg_common.c',
+ '../../third_party/aom/av1/common/thread_common.c',
+ '../../third_party/aom/av1/common/tile_common.c',
+ '../../third_party/aom/av1/common/timing.c',
+ '../../third_party/aom/av1/common/txb_common.c',
+ '../../third_party/aom/av1/common/warped_motion.c',
+ '../../third_party/aom/av1/decoder/decodeframe.c',
+ '../../third_party/aom/av1/decoder/decodemv.c',
+ '../../third_party/aom/av1/decoder/decoder.c',
+ '../../third_party/aom/av1/decoder/decodetxb.c',
+ '../../third_party/aom/av1/decoder/detokenize.c',
+ '../../third_party/aom/av1/decoder/dthread.c',
+ '../../third_party/aom/av1/decoder/obu.c',
+ '../../third_party/aom/av1/encoder/arm/neon/quantize_neon.c',
+ ],
+ 'GENERIC_EXPORTS': [
+ '../../third_party/aom/aom/aom.h',
+ '../../third_party/aom/aom/aom_codec.h',
+ '../../third_party/aom/aom/aom_decoder.h',
+ '../../third_party/aom/aom/aom_encoder.h',
+ '../../third_party/aom/aom/aom_frame_buffer.h',
+ '../../third_party/aom/aom/aom_image.h',
+ '../../third_party/aom/aom/aom_integer.h',
+ '../../third_party/aom/aom/aomcx.h',
+ '../../third_party/aom/aom/aomdx.h',
+ '../../third_party/aom/aom_mem/aom_mem.h',
+ '../../third_party/aom/aom_ports/aom_timer.h',
+ '../../third_party/aom/aom_ports/bitops.h',
+ '../../third_party/aom/aom_ports/mem.h',
+ '../../third_party/aom/aom_ports/sanitizer.h',
+ '../../third_party/aom/aom_ports/system_state.h',
+ '../../third_party/aom/aom_scale/aom_scale.h',
+ '../../third_party/aom/aom_scale/yv12config.h',
+ ],
+ 'GENERIC_SOURCES': [
+ '../../third_party/aom/aom/src/aom_codec.c',
+ '../../third_party/aom/aom/src/aom_decoder.c',
+ '../../third_party/aom/aom/src/aom_encoder.c',
+ '../../third_party/aom/aom/src/aom_image.c',
+ '../../third_party/aom/aom/src/aom_integer.c',
+ '../../third_party/aom/aom_dsp/aom_convolve.c',
+ '../../third_party/aom/aom_dsp/aom_dsp_rtcd.c',
+ '../../third_party/aom/aom_dsp/binary_codes_reader.c',
+ '../../third_party/aom/aom_dsp/bitreader_buffer.c',
+ '../../third_party/aom/aom_dsp/bitwriter_buffer.c',
+ '../../third_party/aom/aom_dsp/blend_a64_hmask.c',
+ '../../third_party/aom/aom_dsp/blend_a64_mask.c',
+ '../../third_party/aom/aom_dsp/blend_a64_vmask.c',
+ '../../third_party/aom/aom_dsp/daalaboolreader.c',
+ '../../third_party/aom/aom_dsp/entcode.c',
+ '../../third_party/aom/aom_dsp/entdec.c',
+ '../../third_party/aom/aom_dsp/fft.c',
+ '../../third_party/aom/aom_dsp/grain_synthesis.c',
+ '../../third_party/aom/aom_dsp/intrapred.c',
+ '../../third_party/aom/aom_dsp/loopfilter.c',
+ '../../third_party/aom/aom_dsp/subtract.c',
+ '../../third_party/aom/aom_mem/aom_mem.c',
+ '../../third_party/aom/aom_scale/aom_scale_rtcd.c',
+ '../../third_party/aom/aom_scale/generic/aom_scale.c',
+ '../../third_party/aom/aom_scale/generic/gen_scalers.c',
+ '../../third_party/aom/aom_scale/generic/yv12config.c',
+ '../../third_party/aom/aom_scale/generic/yv12extend.c',
+ '../../third_party/aom/aom_util/aom_thread.c',
+ '../../third_party/aom/aom_util/debug_util.c',
+ '../../third_party/aom/av1/av1_dx_iface.c',
+ '../../third_party/aom/av1/common/alloccommon.c',
+ '../../third_party/aom/av1/common/av1_inv_txfm1d.c',
+ '../../third_party/aom/av1/common/av1_inv_txfm2d.c',
+ '../../third_party/aom/av1/common/av1_loopfilter.c',
+ '../../third_party/aom/av1/common/av1_rtcd.c',
+ '../../third_party/aom/av1/common/av1_txfm.c',
+ '../../third_party/aom/av1/common/blockd.c',
+ '../../third_party/aom/av1/common/cdef.c',
+ '../../third_party/aom/av1/common/cdef_block.c',
+ '../../third_party/aom/av1/common/cfl.c',
+ '../../third_party/aom/av1/common/convolve.c',
+ '../../third_party/aom/av1/common/debugmodes.c',
+ '../../third_party/aom/av1/common/entropy.c',
+ '../../third_party/aom/av1/common/entropymode.c',
+ '../../third_party/aom/av1/common/entropymv.c',
+ '../../third_party/aom/av1/common/frame_buffers.c',
+ '../../third_party/aom/av1/common/idct.c',
+ '../../third_party/aom/av1/common/mvref_common.c',
+ '../../third_party/aom/av1/common/obu_util.c',
+ '../../third_party/aom/av1/common/odintrin.c',
+ '../../third_party/aom/av1/common/pred_common.c',
+ '../../third_party/aom/av1/common/quant_common.c',
+ '../../third_party/aom/av1/common/reconinter.c',
+ '../../third_party/aom/av1/common/reconintra.c',
+ '../../third_party/aom/av1/common/resize.c',
+ '../../third_party/aom/av1/common/restoration.c',
+ '../../third_party/aom/av1/common/scale.c',
+ '../../third_party/aom/av1/common/scan.c',
+ '../../third_party/aom/av1/common/seg_common.c',
+ '../../third_party/aom/av1/common/thread_common.c',
+ '../../third_party/aom/av1/common/tile_common.c',
+ '../../third_party/aom/av1/common/timing.c',
+ '../../third_party/aom/av1/common/txb_common.c',
+ '../../third_party/aom/av1/common/warped_motion.c',
+ '../../third_party/aom/av1/decoder/decodeframe.c',
+ '../../third_party/aom/av1/decoder/decodemv.c',
+ '../../third_party/aom/av1/decoder/decoder.c',
+ '../../third_party/aom/av1/decoder/decodetxb.c',
+ '../../third_party/aom/av1/decoder/detokenize.c',
+ '../../third_party/aom/av1/decoder/dthread.c',
+ '../../third_party/aom/av1/decoder/obu.c',
+ ],
+ 'IA32_EXPORTS': [
+ '../../third_party/aom/aom/aom.h',
+ '../../third_party/aom/aom/aom_codec.h',
+ '../../third_party/aom/aom/aom_decoder.h',
+ '../../third_party/aom/aom/aom_encoder.h',
+ '../../third_party/aom/aom/aom_frame_buffer.h',
+ '../../third_party/aom/aom/aom_image.h',
+ '../../third_party/aom/aom/aom_integer.h',
+ '../../third_party/aom/aom/aomcx.h',
+ '../../third_party/aom/aom/aomdx.h',
+ '../../third_party/aom/aom_mem/aom_mem.h',
+ '../../third_party/aom/aom_ports/aom_timer.h',
+ '../../third_party/aom/aom_ports/bitops.h',
+ '../../third_party/aom/aom_ports/mem.h',
+ '../../third_party/aom/aom_ports/sanitizer.h',
+ '../../third_party/aom/aom_ports/system_state.h',
+ '../../third_party/aom/aom_scale/aom_scale.h',
+ '../../third_party/aom/aom_scale/yv12config.h',
+ ],
+ 'IA32_SOURCES': [
+ '../../third_party/aom/aom/src/aom_codec.c',
+ '../../third_party/aom/aom/src/aom_decoder.c',
+ '../../third_party/aom/aom/src/aom_encoder.c',
+ '../../third_party/aom/aom/src/aom_image.c',
+ '../../third_party/aom/aom/src/aom_integer.c',
+ '../../third_party/aom/aom_dsp/aom_convolve.c',
+ '../../third_party/aom/aom_dsp/aom_dsp_rtcd.c',
+ '../../third_party/aom/aom_dsp/binary_codes_reader.c',
+ '../../third_party/aom/aom_dsp/bitreader_buffer.c',
+ '../../third_party/aom/aom_dsp/bitwriter_buffer.c',
+ '../../third_party/aom/aom_dsp/blend_a64_hmask.c',
+ '../../third_party/aom/aom_dsp/blend_a64_mask.c',
+ '../../third_party/aom/aom_dsp/blend_a64_vmask.c',
+ '../../third_party/aom/aom_dsp/daalaboolreader.c',
+ '../../third_party/aom/aom_dsp/entcode.c',
+ '../../third_party/aom/aom_dsp/entdec.c',
+ '../../third_party/aom/aom_dsp/fft.c',
+ '../../third_party/aom/aom_dsp/grain_synthesis.c',
+ '../../third_party/aom/aom_dsp/intrapred.c',
+ '../../third_party/aom/aom_dsp/loopfilter.c',
+ '../../third_party/aom/aom_dsp/subtract.c',
+ '../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
+ '../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm',
+ '../../third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c',
+ '../../third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c',
+ '../../third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c',
+ '../../third_party/aom/aom_dsp/x86/fft_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/fft_sse2.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm',
+ '../../third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c',
+ '../../third_party/aom/aom_dsp/x86/intrapred_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/intrapred_sse2.c',
+ '../../third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm',
+ '../../third_party/aom/aom_dsp/x86/intrapred_ssse3.c',
+ '../../third_party/aom/aom_dsp/x86/inv_wht_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/loopfilter_sse2.c',
+ '../../third_party/aom/aom_mem/aom_mem.c',
+ '../../third_party/aom/aom_ports/emms.asm',
+ '../../third_party/aom/aom_ports/x86_abi_support.asm',
+ '../../third_party/aom/aom_scale/aom_scale_rtcd.c',
+ '../../third_party/aom/aom_scale/generic/aom_scale.c',
+ '../../third_party/aom/aom_scale/generic/gen_scalers.c',
+ '../../third_party/aom/aom_scale/generic/yv12config.c',
+ '../../third_party/aom/aom_scale/generic/yv12extend.c',
+ '../../third_party/aom/aom_util/aom_thread.c',
+ '../../third_party/aom/aom_util/debug_util.c',
+ '../../third_party/aom/av1/av1_dx_iface.c',
+ '../../third_party/aom/av1/common/alloccommon.c',
+ '../../third_party/aom/av1/common/av1_inv_txfm1d.c',
+ '../../third_party/aom/av1/common/av1_inv_txfm2d.c',
+ '../../third_party/aom/av1/common/av1_loopfilter.c',
+ '../../third_party/aom/av1/common/av1_rtcd.c',
+ '../../third_party/aom/av1/common/av1_txfm.c',
+ '../../third_party/aom/av1/common/blockd.c',
+ '../../third_party/aom/av1/common/cdef.c',
+ '../../third_party/aom/av1/common/cdef_block.c',
+ '../../third_party/aom/av1/common/cdef_block_avx2.c',
+ '../../third_party/aom/av1/common/cdef_block_sse2.c',
+ '../../third_party/aom/av1/common/cdef_block_sse4.c',
+ '../../third_party/aom/av1/common/cdef_block_ssse3.c',
+ '../../third_party/aom/av1/common/cfl.c',
+ '../../third_party/aom/av1/common/convolve.c',
+ '../../third_party/aom/av1/common/debugmodes.c',
+ '../../third_party/aom/av1/common/entropy.c',
+ '../../third_party/aom/av1/common/entropymode.c',
+ '../../third_party/aom/av1/common/entropymv.c',
+ '../../third_party/aom/av1/common/frame_buffers.c',
+ '../../third_party/aom/av1/common/idct.c',
+ '../../third_party/aom/av1/common/mvref_common.c',
+ '../../third_party/aom/av1/common/obu_util.c',
+ '../../third_party/aom/av1/common/odintrin.c',
+ '../../third_party/aom/av1/common/pred_common.c',
+ '../../third_party/aom/av1/common/quant_common.c',
+ '../../third_party/aom/av1/common/reconinter.c',
+ '../../third_party/aom/av1/common/reconintra.c',
+ '../../third_party/aom/av1/common/resize.c',
+ '../../third_party/aom/av1/common/restoration.c',
+ '../../third_party/aom/av1/common/scale.c',
+ '../../third_party/aom/av1/common/scan.c',
+ '../../third_party/aom/av1/common/seg_common.c',
+ '../../third_party/aom/av1/common/thread_common.c',
+ '../../third_party/aom/av1/common/tile_common.c',
+ '../../third_party/aom/av1/common/timing.c',
+ '../../third_party/aom/av1/common/txb_common.c',
+ '../../third_party/aom/av1/common/warped_motion.c',
+ '../../third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c',
+ '../../third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c',
+ '../../third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c',
+ '../../third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c',
+ '../../third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c',
+ '../../third_party/aom/av1/common/x86/av1_txfm_sse4.c',
+ '../../third_party/aom/av1/common/x86/cfl_avx2.c',
+ '../../third_party/aom/av1/common/x86/cfl_sse2.c',
+ '../../third_party/aom/av1/common/x86/cfl_ssse3.c',
+ '../../third_party/aom/av1/common/x86/convolve_2d_avx2.c',
+ '../../third_party/aom/av1/common/x86/convolve_2d_sse2.c',
+ '../../third_party/aom/av1/common/x86/convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/convolve_sse2.c',
+ '../../third_party/aom/av1/common/x86/filterintra_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c',
+ '../../third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c',
+ '../../third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c',
+ '../../third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c',
+ '../../third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c',
+ '../../third_party/aom/av1/common/x86/intra_edge_sse4.c',
+ '../../third_party/aom/av1/common/x86/jnt_convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/jnt_convolve_sse2.c',
+ '../../third_party/aom/av1/common/x86/jnt_convolve_ssse3.c',
+ '../../third_party/aom/av1/common/x86/reconinter_avx2.c',
+ '../../third_party/aom/av1/common/x86/reconinter_sse4.c',
+ '../../third_party/aom/av1/common/x86/reconinter_ssse3.c',
+ '../../third_party/aom/av1/common/x86/selfguided_avx2.c',
+ '../../third_party/aom/av1/common/x86/selfguided_sse4.c',
+ '../../third_party/aom/av1/common/x86/warp_plane_sse4.c',
+ '../../third_party/aom/av1/common/x86/wiener_convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/wiener_convolve_sse2.c',
+ '../../third_party/aom/av1/decoder/decodeframe.c',
+ '../../third_party/aom/av1/decoder/decodemv.c',
+ '../../third_party/aom/av1/decoder/decoder.c',
+ '../../third_party/aom/av1/decoder/decodetxb.c',
+ '../../third_party/aom/av1/decoder/detokenize.c',
+ '../../third_party/aom/av1/decoder/dthread.c',
+ '../../third_party/aom/av1/decoder/obu.c',
+ ],
+ 'X64_EXPORTS': [
+ '../../third_party/aom/aom/aom.h',
+ '../../third_party/aom/aom/aom_codec.h',
+ '../../third_party/aom/aom/aom_decoder.h',
+ '../../third_party/aom/aom/aom_encoder.h',
+ '../../third_party/aom/aom/aom_frame_buffer.h',
+ '../../third_party/aom/aom/aom_image.h',
+ '../../third_party/aom/aom/aom_integer.h',
+ '../../third_party/aom/aom/aomcx.h',
+ '../../third_party/aom/aom/aomdx.h',
+ '../../third_party/aom/aom_mem/aom_mem.h',
+ '../../third_party/aom/aom_ports/aom_timer.h',
+ '../../third_party/aom/aom_ports/bitops.h',
+ '../../third_party/aom/aom_ports/mem.h',
+ '../../third_party/aom/aom_ports/sanitizer.h',
+ '../../third_party/aom/aom_ports/system_state.h',
+ '../../third_party/aom/aom_scale/aom_scale.h',
+ '../../third_party/aom/aom_scale/yv12config.h',
+ ],
+ 'X64_SOURCES': [
+ '../../third_party/aom/aom/src/aom_codec.c',
+ '../../third_party/aom/aom/src/aom_decoder.c',
+ '../../third_party/aom/aom/src/aom_encoder.c',
+ '../../third_party/aom/aom/src/aom_image.c',
+ '../../third_party/aom/aom/src/aom_integer.c',
+ '../../third_party/aom/aom_dsp/aom_convolve.c',
+ '../../third_party/aom/aom_dsp/aom_dsp_rtcd.c',
+ '../../third_party/aom/aom_dsp/binary_codes_reader.c',
+ '../../third_party/aom/aom_dsp/bitreader_buffer.c',
+ '../../third_party/aom/aom_dsp/bitwriter_buffer.c',
+ '../../third_party/aom/aom_dsp/blend_a64_hmask.c',
+ '../../third_party/aom/aom_dsp/blend_a64_mask.c',
+ '../../third_party/aom/aom_dsp/blend_a64_vmask.c',
+ '../../third_party/aom/aom_dsp/daalaboolreader.c',
+ '../../third_party/aom/aom_dsp/entcode.c',
+ '../../third_party/aom/aom_dsp/entdec.c',
+ '../../third_party/aom/aom_dsp/fft.c',
+ '../../third_party/aom/aom_dsp/grain_synthesis.c',
+ '../../third_party/aom/aom_dsp/intrapred.c',
+ '../../third_party/aom/aom_dsp/loopfilter.c',
+ '../../third_party/aom/aom_dsp/subtract.c',
+ '../../third_party/aom/aom_dsp/x86/aom_asm_stubs.c',
+ '../../third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm',
+ '../../third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c',
+ '../../third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c',
+ '../../third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c',
+ '../../third_party/aom/aom_dsp/x86/fft_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/fft_sse2.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm',
+ '../../third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c',
+ '../../third_party/aom/aom_dsp/x86/intrapred_avx2.c',
+ '../../third_party/aom/aom_dsp/x86/intrapred_sse2.c',
+ '../../third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm',
+ '../../third_party/aom/aom_dsp/x86/intrapred_ssse3.c',
+ '../../third_party/aom/aom_dsp/x86/inv_wht_sse2.asm',
+ '../../third_party/aom/aom_dsp/x86/loopfilter_sse2.c',
+ '../../third_party/aom/aom_mem/aom_mem.c',
+ '../../third_party/aom/aom_ports/emms.asm',
+ '../../third_party/aom/aom_scale/aom_scale_rtcd.c',
+ '../../third_party/aom/aom_scale/generic/aom_scale.c',
+ '../../third_party/aom/aom_scale/generic/gen_scalers.c',
+ '../../third_party/aom/aom_scale/generic/yv12config.c',
+ '../../third_party/aom/aom_scale/generic/yv12extend.c',
+ '../../third_party/aom/aom_util/aom_thread.c',
+ '../../third_party/aom/aom_util/debug_util.c',
+ '../../third_party/aom/av1/av1_dx_iface.c',
+ '../../third_party/aom/av1/common/alloccommon.c',
+ '../../third_party/aom/av1/common/av1_inv_txfm1d.c',
+ '../../third_party/aom/av1/common/av1_inv_txfm2d.c',
+ '../../third_party/aom/av1/common/av1_loopfilter.c',
+ '../../third_party/aom/av1/common/av1_rtcd.c',
+ '../../third_party/aom/av1/common/av1_txfm.c',
+ '../../third_party/aom/av1/common/blockd.c',
+ '../../third_party/aom/av1/common/cdef.c',
+ '../../third_party/aom/av1/common/cdef_block.c',
+ '../../third_party/aom/av1/common/cdef_block_avx2.c',
+ '../../third_party/aom/av1/common/cdef_block_sse2.c',
+ '../../third_party/aom/av1/common/cdef_block_sse4.c',
+ '../../third_party/aom/av1/common/cdef_block_ssse3.c',
+ '../../third_party/aom/av1/common/cfl.c',
+ '../../third_party/aom/av1/common/convolve.c',
+ '../../third_party/aom/av1/common/debugmodes.c',
+ '../../third_party/aom/av1/common/entropy.c',
+ '../../third_party/aom/av1/common/entropymode.c',
+ '../../third_party/aom/av1/common/entropymv.c',
+ '../../third_party/aom/av1/common/frame_buffers.c',
+ '../../third_party/aom/av1/common/idct.c',
+ '../../third_party/aom/av1/common/mvref_common.c',
+ '../../third_party/aom/av1/common/obu_util.c',
+ '../../third_party/aom/av1/common/odintrin.c',
+ '../../third_party/aom/av1/common/pred_common.c',
+ '../../third_party/aom/av1/common/quant_common.c',
+ '../../third_party/aom/av1/common/reconinter.c',
+ '../../third_party/aom/av1/common/reconintra.c',
+ '../../third_party/aom/av1/common/resize.c',
+ '../../third_party/aom/av1/common/restoration.c',
+ '../../third_party/aom/av1/common/scale.c',
+ '../../third_party/aom/av1/common/scan.c',
+ '../../third_party/aom/av1/common/seg_common.c',
+ '../../third_party/aom/av1/common/thread_common.c',
+ '../../third_party/aom/av1/common/tile_common.c',
+ '../../third_party/aom/av1/common/timing.c',
+ '../../third_party/aom/av1/common/txb_common.c',
+ '../../third_party/aom/av1/common/warped_motion.c',
+ '../../third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c',
+ '../../third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c',
+ '../../third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c',
+ '../../third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c',
+ '../../third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c',
+ '../../third_party/aom/av1/common/x86/av1_txfm_sse4.c',
+ '../../third_party/aom/av1/common/x86/cfl_avx2.c',
+ '../../third_party/aom/av1/common/x86/cfl_sse2.c',
+ '../../third_party/aom/av1/common/x86/cfl_ssse3.c',
+ '../../third_party/aom/av1/common/x86/convolve_2d_avx2.c',
+ '../../third_party/aom/av1/common/x86/convolve_2d_sse2.c',
+ '../../third_party/aom/av1/common/x86/convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/convolve_sse2.c',
+ '../../third_party/aom/av1/common/x86/filterintra_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c',
+ '../../third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c',
+ '../../third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c',
+ '../../third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c',
+ '../../third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c',
+ '../../third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c',
+ '../../third_party/aom/av1/common/x86/intra_edge_sse4.c',
+ '../../third_party/aom/av1/common/x86/jnt_convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/jnt_convolve_sse2.c',
+ '../../third_party/aom/av1/common/x86/jnt_convolve_ssse3.c',
+ '../../third_party/aom/av1/common/x86/reconinter_avx2.c',
+ '../../third_party/aom/av1/common/x86/reconinter_sse4.c',
+ '../../third_party/aom/av1/common/x86/reconinter_ssse3.c',
+ '../../third_party/aom/av1/common/x86/selfguided_avx2.c',
+ '../../third_party/aom/av1/common/x86/selfguided_sse4.c',
+ '../../third_party/aom/av1/common/x86/warp_plane_sse4.c',
+ '../../third_party/aom/av1/common/x86/wiener_convolve_avx2.c',
+ '../../third_party/aom/av1/common/x86/wiener_convolve_sse2.c',
+ '../../third_party/aom/av1/decoder/decodeframe.c',
+ '../../third_party/aom/av1/decoder/decodemv.c',
+ '../../third_party/aom/av1/decoder/decoder.c',
+ '../../third_party/aom/av1/decoder/decodetxb.c',
+ '../../third_party/aom/av1/decoder/detokenize.c',
+ '../../third_party/aom/av1/decoder/dthread.c',
+ '../../third_party/aom/av1/decoder/obu.c',
+ ],
+}
diff --git a/media/libaom/test_cmakeparser.py b/media/libaom/test_cmakeparser.py
new file mode 100644
index 000000000..af68c3313
--- /dev/null
+++ b/media/libaom/test_cmakeparser.py
@@ -0,0 +1,190 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+from pyparsing import ParseException
+import unittest
+
+import cmakeparser as cp
+
+class TestCMakeParser(unittest.TestCase):
+ def test_arguments(self):
+ self.assertEqual(cp.arguments.parseString('1').asList(), ['1'])
+ self.assertEqual(cp.arguments.parseString('(1 2)').asList(),
+ ['(', '1', '2', ')'])
+
+ def test_command(self):
+ self.assertEqual(cp.command.parseString('blah()').asList(),
+ [['blah', '(', ')']])
+ self.assertEqual(cp.command.parseString('blah(1)').asList(),
+ [['blah', '(', '1', ')']])
+ self.assertEqual(cp.command.parseString('blah(1 (2 3))').asList(),
+ [['blah', '(', '1', '(', '2', '3', ')', ')']])
+
+ def test_evaluate_boolean(self):
+ self.assertTrue(cp.evaluate_boolean({}, ['TRUE']))
+ self.assertFalse(cp.evaluate_boolean({}, ['NOT', 'TRUE']))
+ self.assertFalse(cp.evaluate_boolean({}, ['TRUE', 'AND', 'FALSE']))
+ self.assertTrue(cp.evaluate_boolean({}, ['ABC', 'MATCHES', '^AB']))
+ self.assertTrue(cp.evaluate_boolean({}, ['TRUE', 'OR', 'FALSE']))
+ self.assertTrue(cp.evaluate_boolean({}, ['ABC', 'STREQUAL', 'ABC']))
+ self.assertTrue(cp.evaluate_boolean({'ABC': '1'}, ['ABC']))
+ self.assertFalse(cp.evaluate_boolean({'ABC': '0'}, ['${ABC}']))
+ self.assertFalse(cp.evaluate_boolean({}, ['ABC']))
+ self.assertFalse(cp.evaluate_boolean({}, ['${ABC}']))
+ self.assertTrue(cp.evaluate_boolean({'YES_ABC': 1, 'VAL': 'ABC'},
+ ['YES_${VAL}']))
+ self.assertTrue(cp.evaluate_boolean({'ABC': 'DEF', 'DEF': 1},
+ ['${ABC}']))
+ self.assertTrue(cp.evaluate_boolean({}, ['FALSE', 'OR', '(', 'FALSE', 'OR', 'TRUE', ')']))
+ self.assertFalse(cp.evaluate_boolean({}, ['FALSE', 'OR', '(', 'FALSE', 'AND', 'TRUE', ')']))
+
+ def test_foreach(self):
+ s = """
+ set(STUFF A B C D E F)
+ foreach(item ${STUFF})
+ set(YES_${item} 1)
+ endforeach ()
+ """
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cp.evaluate(variables, [], parsed)
+ for k in ['A', 'B', 'C', 'D', 'E', 'F']:
+ self.assertEqual(variables['YES_%s' % k], '1')
+
+ s = """
+ set(STUFF "A;B;C;D;E;F")
+ foreach(item ${STUFF})
+ set(${item} 1)
+ endforeach ()
+ """
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cp.evaluate(variables, [], parsed)
+ for k in ['A', 'B', 'C', 'D', 'E', 'F']:
+ self.assertEqual(variables[k], '1')
+
+ s = """
+ set(STUFF D E F)
+ foreach(item A B C ${STUFF})
+ set(${item} 1)
+ endforeach ()
+ """
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cp.evaluate(variables, [], parsed)
+ for k in ['A', 'B', 'C', 'D', 'E', 'F']:
+ self.assertEqual(variables[k], '1')
+
+ def test_list(self):
+ s = 'list(APPEND TEST 1 1 2 3 5 8 13)'
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cache_variables = []
+ cp.evaluate(variables, cache_variables, parsed)
+ self.assertEqual(variables['TEST'], '1 1 2 3 5 8 13')
+ self.assertEqual(len(cache_variables), 0)
+
+ s = """
+ set(TEST 1)
+ list(APPEND TEST 1 2 3 5 8 13)
+ """
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cache_variables = []
+ cp.evaluate(variables, cache_variables, parsed)
+ self.assertEqual(variables['TEST'], '1 1 2 3 5 8 13')
+ self.assertEqual(len(cache_variables), 0)
+
+ def test_malformed_input(self):
+ self.assertEqual(len(cp.cmake.parseString('func ((A)')), 0)
+ self.assertEqual(len(cp.cmake.parseString('cmd"arg"(arg2)')), 0)
+ self.assertRaises(ParseException, cp.command.parseString, 'func ((A)')
+ self.assertRaises(ParseException, cp.identifier.parseString,
+ '-asdlf')
+ self.assertEqual(cp.identifier.parseString('asd-lf')[0], 'asd')
+ self.assertRaises(ParseException, cp.quoted_argument.parseString,
+ 'blah"')
+ s = """
+ " blah blah
+ blah
+ """
+ self.assertRaises(ParseException, cp.quoted_argument.parseString,
+ s)
+ self.assertRaises(ParseException, cp.quoted_argument.parseString,
+ 'asdlf')
+ self.assertRaises(ParseException, cp.unquoted_argument.parseString,
+ '#asdflkj')
+
+ def test_parse_if(self):
+ s = """
+ if (A)
+ B()
+ elseif (C)
+ D()
+ elseif (E)
+ F()
+ else ()
+ H()
+ endif ()
+ I()
+ """
+ parsed = cp.cmake.parseString(s)
+ self.assertEqual(len(parsed), 10)
+ end, conditions = cp.parse_if(parsed, 0)
+ self.assertEqual(parsed[end][0], 'endif')
+ self.assertEqual(len(conditions), 4)
+ self.assertEqual(conditions[0][0], ['A'])
+ self.assertEqual(conditions[0][1][0], parsed[1])
+ self.assertEqual(conditions[1][0], ['C'])
+ self.assertEqual(conditions[1][1][0], parsed[3])
+ self.assertEqual(conditions[2][0], ['E'])
+ self.assertEqual(conditions[2][1][0], parsed[5])
+ self.assertEqual(conditions[3][0], ['TRUE'])
+ self.assertEqual(conditions[3][1][0], parsed[7])
+
+ def test_return(self):
+ s = """
+ set(TEST 2)
+ if (true)
+ return()
+ endif ()
+ set(TEST 3)
+ """
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cache_variables = []
+ cp.evaluate(variables, cache_variables, parsed)
+ self.assertEqual(variables['TEST'], '2')
+ self.assertEqual(len(cache_variables), 0)
+
+ def test_set(self):
+ s = """set(TEST 2)"""
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cache_variables = []
+ cp.evaluate(variables, cache_variables, parsed)
+ self.assertEqual(variables['TEST'], '2')
+ self.assertEqual(len(cache_variables), 0)
+
+ s = """set(TEST 3 CACHE "documentation")"""
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cache_variables = []
+ cp.evaluate(variables, cache_variables, parsed)
+ self.assertEqual(variables['TEST'], '3')
+ self.assertTrue('TEST' in cache_variables)
+
+ s = """set(TEST A B C D)"""
+ parsed = cp.cmake.parseString(s)
+ variables = {}
+ cp.evaluate(variables, [], parsed)
+ self.assertEqual(variables['TEST'], 'A B C D')
+
+ s = """set(TEST ${TEST} E F G H)"""
+ parsed = cp.cmake.parseString(s)
+ cp.evaluate(variables, [], parsed)
+ self.assertEqual(variables['TEST'], 'A B C D E F G H')
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/media/libnestegg/README_MOZILLA b/media/libnestegg/README_MCP
index ede23ea14..e738fd920 100644
--- a/media/libnestegg/README_MOZILLA
+++ b/media/libnestegg/README_MCP
@@ -3,6 +3,6 @@ git repository using the update.sh script. The only changes
made were those applied by update.sh and the addition of
Makefile.in build files for the Mozilla build system.
-The nestegg git repository is: git://github.com/kinetiknz/nestegg.git
+The nestegg git repository is: https://github.com/kinetiknz/nestegg
-The git commit ID used was 1eeeccee205f7aee0386898508c1a68427e0dcc2.
+The git commit ID used was f7a0b7cedc893b6683cf15cb210b1656c086d964.
diff --git a/media/libnestegg/include/nestegg.h b/media/libnestegg/include/nestegg.h
index d91483b93..2baa50bc5 100644
--- a/media/libnestegg/include/nestegg.h
+++ b/media/libnestegg/include/nestegg.h
@@ -27,7 +27,7 @@ extern "C" {
@code
nestegg * demux_ctx;
- nestegg_init(&demux_ctx, io, NULL);
+ nestegg_init(&demux_ctx, io, NULL, -1);
nestegg_packet * pkt;
while ((r = nestegg_read_packet(demux_ctx, &pkt)) > 0) {
@@ -71,6 +71,7 @@ extern "C" {
#define NESTEGG_CODEC_VORBIS 1 /**< Track uses Xiph Vorbis codec. */
#define NESTEGG_CODEC_VP9 2 /**< Track uses Google On2 VP9 codec. */
#define NESTEGG_CODEC_OPUS 3 /**< Track uses Xiph Opus codec. */
+#define NESTEGG_CODEC_AV1 4 /**< Track uses AOMedia AV1 codec. */
#define NESTEGG_CODEC_UNKNOWN INT_MAX /**< Track uses unknown codec. */
#define NESTEGG_VIDEO_MONO 0 /**< Track is mono video. */
@@ -92,9 +93,10 @@ extern "C" {
#define NESTEGG_ENCODING_COMPRESSION 0 /**< Content encoding type is compression. */
#define NESTEGG_ENCODING_ENCRYPTION 1 /**< Content encoding type is encryption. */
-#define NESTEGG_PACKET_HAS_SIGNAL_BYTE_FALSE 0 /**< Packet does not have signal byte */
-#define NESTEGG_PACKET_HAS_SIGNAL_BYTE_UNENCRYPTED 1 /**< Packet has signal byte and is unencrypted */
-#define NESTEGG_PACKET_HAS_SIGNAL_BYTE_ENCRYPTED 2 /**< Packet has signal byte and is encrypted */
+#define NESTEGG_PACKET_HAS_SIGNAL_BYTE_FALSE 0 /**< Packet does not have signal byte */
+#define NESTEGG_PACKET_HAS_SIGNAL_BYTE_UNENCRYPTED 1 /**< Packet has signal byte and is unencrypted */
+#define NESTEGG_PACKET_HAS_SIGNAL_BYTE_ENCRYPTED 2 /**< Packet has signal byte and is encrypted */
+#define NESTEGG_PACKET_HAS_SIGNAL_BYTE_PARTITIONED 4 /**< Packet has signal byte and is partitioned */
#define NESTEGG_PACKET_HAS_KEYFRAME_FALSE 0 /**< Packet contains only keyframes. */
#define NESTEGG_PACKET_HAS_KEYFRAME_TRUE 1 /**< Packet does not contain any keyframes */
@@ -185,7 +187,7 @@ void nestegg_destroy(nestegg * context);
int nestegg_duration(nestegg * context, uint64_t * duration);
/** Query the tstamp scale of the media stream in nanoseconds.
- @note Timecodes presented by nestegg have been scaled by this value
+ @note Timestamps presented by nestegg have been scaled by this value
before presentation to the caller.
@param context Stream context initialized by #nestegg_init.
@param scale Storage for the queried scale factor.
@@ -247,6 +249,7 @@ int nestegg_track_type(nestegg * context, unsigned int track);
@param track Zero based track number.
@retval #NESTEGG_CODEC_VP8 Track codec is VP8.
@retval #NESTEGG_CODEC_VP9 Track codec is VP9.
+ @retval #NESTEGG_CODEC_AV1 Track codec is AV1.
@retval #NESTEGG_CODEC_VORBIS Track codec is Vorbis.
@retval #NESTEGG_CODEC_OPUS Track codec is Opus.
@retval #NESTEGG_CODEC_UNKNOWN Track codec is unknown.
@@ -363,7 +366,7 @@ int nestegg_packet_has_keyframe(nestegg_packet * packet);
@retval -1 Error. */
int nestegg_packet_track(nestegg_packet * packet, unsigned int * track);
-/** Query the time stamp in nanoseconds of @a packet.
+/** Query the timestamp in nanoseconds of @a packet.
@param packet Packet initialized by #nestegg_read_packet.
@param tstamp Storage for the queried timestamp in nanoseconds.
@retval 0 Success.
@@ -424,6 +427,8 @@ int nestegg_packet_discard_padding(nestegg_packet * packet,
set, encryption information not read from packet.
@retval #NESTEGG_PACKET_HAS_SIGNAL_BYTE_ENCRYPTED Encrypted bit set,
encryption infomation read from packet.
+ @retval #NESTEGG_PACKET_HAS_SIGNAL_BYTE_PARTITIONED Partitioned bit set,
+ encryption and partition information read from packet.
@retval -1 Error.*/
int nestegg_packet_encryption(nestegg_packet * packet);
@@ -439,6 +444,18 @@ int nestegg_packet_encryption(nestegg_packet * packet);
int nestegg_packet_iv(nestegg_packet * packet, unsigned char const ** iv,
size_t * length);
+/** Query the partition offsets of @a packet.
+@param packet Packet initialized by #nestegg_read_packet.
+@param partition_offsets Storage for the queried partition offsets.
+The offset data is owned by the #nestegg_packet packet.
+@param num_offsets Number of offsets returned, may be 0.
+@retval 0 Success.
+@retval -1 Error.
+*/
+int nestegg_packet_offsets(nestegg_packet * packet,
+ uint32_t const ** partition_offsets,
+ uint8_t * num_offsets);
+
/** Returns reference_block given packet
@param packet Packet initialized by #nestegg_read_packet.
@param reference_block pointer to store reference block in.
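
The header changes above only declare the new partitioned-encryption query; a minimal caller-side sketch, not part of the patch, of how a demuxer might consume it (the include path, packet setup, and error handling are assumed, and pkt is taken to come from nestegg_read_packet):

#include <stdint.h>
#include <stdio.h>
#include "nestegg.h" /* include path assumed; adjust to the in-tree layout */

static void dump_partition_offsets(nestegg_packet * pkt)
{
  uint32_t const * offsets = NULL;
  uint8_t num_offsets = 0;

  /* Only partitioned packets carry offsets; other return values mean
     unencrypted, encrypted-but-unpartitioned, or no signal byte at all. */
  if (nestegg_packet_encryption(pkt) != NESTEGG_PACKET_HAS_SIGNAL_BYTE_PARTITIONED)
    return;

  if (nestegg_packet_offsets(pkt, &offsets, &num_offsets) != 0)
    return; /* query failed */

  /* The offset array is owned by the packet; do not free it. */
  for (uint8_t i = 0; i < num_offsets; ++i)
    printf("partition %u starts at byte offset %u\n", (unsigned) i, (unsigned) offsets[i]);
}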
diff --git a/media/libnestegg/src/nestegg.c b/media/libnestegg/src/nestegg.c
index 1ca34c4f0..61c30ec6b 100644
--- a/media/libnestegg/src/nestegg.c
+++ b/media/libnestegg/src/nestegg.c
@@ -154,6 +154,7 @@ enum ebml_type_enum {
/* Track IDs */
#define TRACK_ID_VP8 "V_VP8"
#define TRACK_ID_VP9 "V_VP9"
+#define TRACK_ID_AV1 "V_AV1"
#define TRACK_ID_VORBIS "A_VORBIS"
#define TRACK_ID_OPUS "A_OPUS"
@@ -164,11 +165,16 @@ enum ebml_type_enum {
/* Packet Encryption */
#define SIGNAL_BYTE_SIZE 1
#define IV_SIZE 8
+#define NUM_PACKETS_SIZE 1
+#define PACKET_OFFSET_SIZE 4
/* Signal Byte */
#define PACKET_ENCRYPTED 1
#define ENCRYPTED_BIT_MASK (1 << 0)
+#define PACKET_PARTITIONED 2
+#define PARTITIONED_BIT_MASK (1 << 1)
+
enum vint_mask {
MASK_NONE,
MASK_FIRST_BIT
@@ -338,6 +344,8 @@ struct frame_encryption {
unsigned char * iv;
size_t length;
uint8_t signal_byte;
+ uint8_t num_partitions;
+ uint32_t * partition_offsets;
};
struct frame {
@@ -1039,14 +1047,14 @@ static int
ne_read_simple(nestegg * ctx, struct ebml_element_desc * desc, size_t length)
{
struct ebml_type * storage;
- int r;
+ int r = -1;
storage = (struct ebml_type *) (ctx->ancestor->data + desc->offset);
if (storage->read) {
- ctx->log(ctx, NESTEGG_LOG_DEBUG, "element %llx (%s) already read, skipping",
- desc->id, desc->name);
- return 0;
+ ctx->log(ctx, NESTEGG_LOG_DEBUG, "element %llx (%s) already read, skipping %u",
+ desc->id, desc->name, length);
+ return ne_io_read_skip(ctx->io, length);
}
storage->type = desc->type;
@@ -1070,7 +1078,6 @@ ne_read_simple(nestegg * ctx, struct ebml_element_desc * desc, size_t length)
case TYPE_MASTER:
case TYPE_UNKNOWN:
default:
- r = 0;
assert(0);
break;
}
@@ -1359,6 +1366,52 @@ ne_find_track_entry(nestegg * ctx, unsigned int track)
return NULL;
}
+static struct frame *
+ne_alloc_frame(void)
+{
+ struct frame * f = ne_alloc(sizeof(*f));
+
+ if (!f)
+ return NULL;
+
+ f->data = NULL;
+ f->length = 0;
+ f->frame_encryption = NULL;
+ f->next = NULL;
+
+ return f;
+}
+
+static struct frame_encryption *
+ne_alloc_frame_encryption(void)
+{
+ struct frame_encryption * f = ne_alloc(sizeof(*f));
+
+ if (!f)
+ return NULL;
+
+ f->iv = NULL;
+ f->length = 0;
+ f->signal_byte = 0;
+ f->num_partitions = 0;
+ f->partition_offsets = NULL;
+
+ return f;
+}
+
+static void
+ne_free_frame(struct frame * f)
+{
+ if (f->frame_encryption) {
+ free(f->frame_encryption->iv);
+ free(f->frame_encryption->partition_offsets);
+ }
+
+ free(f->frame_encryption);
+ free(f->data);
+ free(f);
+}
+
static int
ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_packet ** data)
{
@@ -1371,7 +1424,7 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac
uint64_t track_number, length, frame_sizes[256], cluster_tc, flags, frames, tc_scale, total,
encoding_type, encryption_algo, encryption_mode;
unsigned int i, lacing, track;
- uint8_t signal_byte, keyframe = NESTEGG_PACKET_HAS_KEYFRAME_UNKNOWN;
+ uint8_t signal_byte, keyframe = NESTEGG_PACKET_HAS_KEYFRAME_UNKNOWN, j = 0;
size_t consumed = 0, data_size, encryption_size;
*data = NULL;
@@ -1424,6 +1477,10 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac
return r;
consumed += 1;
frames += 1;
+ break;
+ default:
+ assert(0);
+ return -1;
}
if (frames > 256)
@@ -1453,6 +1510,9 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac
if (r != 1)
return r;
break;
+ default:
+ assert(0);
+ return -1;
}
/* Sanity check unlaced frame sizes against total block size. */
@@ -1490,8 +1550,11 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac
cluster_tc = ctx->cluster_timecode;
abs_timecode = timecode + cluster_tc;
- if (abs_timecode < 0)
- return -1;
+ if (abs_timecode < 0) {
+ /* Ignore the spec and clamp negative timestamps to zero. */
+ ctx->log(ctx, NESTEGG_LOG_WARNING, "ignoring negative timecode: %lld", abs_timecode);
+ abs_timecode = 0;
+ }
pkt = ne_alloc(sizeof(*pkt));
if (!pkt)
@@ -1509,7 +1572,7 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac
nestegg_free_packet(pkt);
return -1;
}
- f = ne_alloc(sizeof(*f));
+ f = ne_alloc_frame();
if (!f) {
nestegg_free_packet(pkt);
return -1;
@@ -1518,13 +1581,13 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac
if (encoding_type == NESTEGG_ENCODING_ENCRYPTION) {
r = ne_io_read(ctx->io, &signal_byte, SIGNAL_BYTE_SIZE);
if (r != 1) {
- free(f);
+ ne_free_frame(f);
nestegg_free_packet(pkt);
return r;
}
- f->frame_encryption = ne_alloc(sizeof(*f->frame_encryption));
+ f->frame_encryption = ne_alloc_frame_encryption();
if (!f->frame_encryption) {
- free(f);
+ ne_free_frame(f);
nestegg_free_packet(pkt);
return -1;
}
@@ -1532,49 +1595,70 @@ ne_read_block(nestegg * ctx, uint64_t block_id, uint64_t block_size, nestegg_pac
if ((signal_byte & ENCRYPTED_BIT_MASK) == PACKET_ENCRYPTED) {
f->frame_encryption->iv = ne_alloc(IV_SIZE);
if (!f->frame_encryption->iv) {
- free(f->frame_encryption);
- free(f);
+ ne_free_frame(f);
nestegg_free_packet(pkt);
return -1;
}
r = ne_io_read(ctx->io, f->frame_encryption->iv, IV_SIZE);
if (r != 1) {
- free(f->frame_encryption->iv);
- free(f->frame_encryption);
- free(f);
+ ne_free_frame(f);
nestegg_free_packet(pkt);
return r;
}
f->frame_encryption->length = IV_SIZE;
encryption_size = SIGNAL_BYTE_SIZE + IV_SIZE;
+
+ if ((signal_byte & PARTITIONED_BIT_MASK) == PACKET_PARTITIONED) {
+ r = ne_io_read(ctx->io, &f->frame_encryption->num_partitions, NUM_PACKETS_SIZE);
+ if (r != 1) {
+ ne_free_frame(f);
+ nestegg_free_packet(pkt);
+ return r;
+ }
+
+ encryption_size += NUM_PACKETS_SIZE + f->frame_encryption->num_partitions * PACKET_OFFSET_SIZE;
+ f->frame_encryption->partition_offsets = ne_alloc(f->frame_encryption->num_partitions * PACKET_OFFSET_SIZE);
+
+ for (j = 0; j < f->frame_encryption->num_partitions; ++j) {
+ uint64_t value = 0;
+ r = ne_read_uint(ctx->io, &value, PACKET_OFFSET_SIZE);
+ if (r != 1) {
+ break;
+ }
+
+ f->frame_encryption->partition_offsets[j] = (uint32_t) value;
+ }
+
+ /* If reading any of the partition offsets did not return 1, then fail. */
+ if (j != f->frame_encryption->num_partitions) {
+ ne_free_frame(f);
+ nestegg_free_packet(pkt);
+ return r;
+ }
+ }
} else {
- f->frame_encryption->iv = NULL;
- f->frame_encryption->length = 0;
encryption_size = SIGNAL_BYTE_SIZE;
}
} else {
- f->frame_encryption = NULL;
encryption_size = 0;
}
+ if (encryption_size > frame_sizes[i]) {
+ ne_free_frame(f);
+ nestegg_free_packet(pkt);
+ return -1;
+ }
data_size = frame_sizes[i] - encryption_size;
/* Encryption parsed */
f->data = ne_alloc(data_size);
if (!f->data) {
- if (f->frame_encryption)
- free(f->frame_encryption->iv);
- free(f->frame_encryption);
- free(f);
+ ne_free_frame(f);
nestegg_free_packet(pkt);
return -1;
}
f->length = data_size;
r = ne_io_read(ctx->io, f->data, data_size);
if (r != 1) {
- if (f->frame_encryption)
- free(f->frame_encryption->iv);
- free(f->frame_encryption);
- free(f->data);
- free(f);
+ ne_free_frame(f);
nestegg_free_packet(pkt);
return r;
}
@@ -1610,6 +1694,7 @@ ne_read_block_additions(nestegg * ctx, uint64_t block_size, struct block_additio
add_id = 1;
data = NULL;
has_data = 0;
+ data_size = 0;
r = ne_read_element(ctx, &id, &size);
if (r != 1)
return r;
@@ -1864,8 +1949,8 @@ ne_init_cue_points(nestegg * ctx, int64_t max_offset)
}
/* Three functions that implement the nestegg_io interface, operating on a
- * sniff_buffer. */
-struct sniff_buffer {
+ io_buffer. */
+struct io_buffer {
unsigned char const * buffer;
size_t length;
int64_t offset;
@@ -1874,25 +1959,26 @@ struct sniff_buffer {
static int
ne_buffer_read(void * buffer, size_t length, void * userdata)
{
- struct sniff_buffer * sb = userdata;
+ struct io_buffer * iob = userdata;
+ size_t available = iob->length - iob->offset;
- int rv = 1;
- size_t available = sb->length - sb->offset;
+ if (available == 0)
+ return 0;
if (available < length)
- return 0;
+ return -1;
- memcpy(buffer, sb->buffer + sb->offset, length);
- sb->offset += length;
+ memcpy(buffer, iob->buffer + iob->offset, length);
+ iob->offset += length;
- return rv;
+ return 1;
}
static int
ne_buffer_seek(int64_t offset, int whence, void * userdata)
{
- struct sniff_buffer * sb = userdata;
- int64_t o = sb->offset;
+ struct io_buffer * iob = userdata;
+ int64_t o = iob->offset;
switch(whence) {
case NESTEGG_SEEK_SET:
@@ -1902,22 +1988,22 @@ ne_buffer_seek(int64_t offset, int whence, void * userdata)
o += offset;
break;
case NESTEGG_SEEK_END:
- o = sb->length + offset;
+ o = iob->length + offset;
break;
}
- if (o < 0 || o > (int64_t) sb->length)
+ if (o < 0 || o > (int64_t) iob->length)
return -1;
- sb->offset = o;
+ iob->offset = o;
return 0;
}
static int64_t
ne_buffer_tell(void * userdata)
{
- struct sniff_buffer * sb = userdata;
- return sb->offset;
+ struct io_buffer * iob = userdata;
+ return iob->offset;
}
static int
@@ -1994,6 +2080,17 @@ ne_match_webm(nestegg_io io, int64_t max_offset)
return 1;
}
+static void
+ne_free_block_additions(struct block_additional * block_additional)
+{
+ while (block_additional) {
+ struct block_additional * tmp = block_additional;
+ block_additional = block_additional->next;
+ free(tmp->data);
+ free(tmp);
+ }
+}
+
int
nestegg_init(nestegg ** context, nestegg_io io, nestegg_log callback, int64_t max_offset)
{
@@ -2039,7 +2136,7 @@ nestegg_init(nestegg ** context, nestegg_io io, nestegg_log callback, int64_t ma
if (ne_get_string(ctx->ebml.doctype, &doctype) != 0)
doctype = "matroska";
- if (strcmp(doctype, "webm") != 0) {
+ if (!!strcmp(doctype, "webm") && !!strcmp(doctype, "matroska")) {
nestegg_destroy(ctx);
return -1;
}
@@ -2098,7 +2195,7 @@ nestegg_duration(nestegg * ctx, uint64_t * duration)
return -1;
if (unscaled_duration != unscaled_duration ||
- unscaled_duration < 0 || unscaled_duration > (double) UINT64_MAX ||
+ unscaled_duration < 0 || unscaled_duration >= (double) UINT64_MAX ||
(uint64_t) unscaled_duration > UINT64_MAX / tc_scale)
return -1;
@@ -2295,6 +2392,9 @@ nestegg_track_codec_id(nestegg * ctx, unsigned int track)
if (strcmp(codec_id, TRACK_ID_VP9) == 0)
return NESTEGG_CODEC_VP9;
+ if (strcmp(codec_id, TRACK_ID_AV1) == 0)
+ return NESTEGG_CODEC_AV1;
+
if (strcmp(codec_id, TRACK_ID_VORBIS) == 0)
return NESTEGG_CODEC_VORBIS;
@@ -2350,61 +2450,67 @@ nestegg_track_codec_data(nestegg * ctx, unsigned int track, unsigned int item,
{
struct track_entry * entry;
struct ebml_binary codec_private;
- uint64_t sizes[3], size, total, avail;
- unsigned char * p;
- unsigned int count, i;
*data = NULL;
*length = 0;
- count = 1;
entry = ne_find_track_entry(ctx, track);
if (!entry)
return -1;
- if (nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_VORBIS
- && nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_OPUS)
+ if (nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_VORBIS &&
+ nestegg_track_codec_id(ctx, track) != NESTEGG_CODEC_OPUS)
return -1;
if (ne_get_binary(entry->codec_private, &codec_private) != 0)
return -1;
if (nestegg_track_codec_id(ctx, track) == NESTEGG_CODEC_VORBIS) {
- p = codec_private.data;
- avail = codec_private.length;
- if (avail < 1)
- return -1;
+ uint64_t count;
+ uint64_t sizes[3];
+ size_t total;
+ unsigned char * p;
+ unsigned int i;
+ int r;
+
+ nestegg_io io;
+ struct io_buffer userdata;
+ userdata.buffer = codec_private.data;
+ userdata.length = codec_private.length;
+ userdata.offset = 0;
+
+ io.read = ne_buffer_read;
+ io.seek = ne_buffer_seek;
+ io.tell = ne_buffer_tell;
+ io.userdata = &userdata;
+
+ total = 0;
- count = *p++ + 1;
- avail -= 1;
+ r = ne_read_uint(&io, &count, 1);
+ if (r != 1)
+ return r;
+ total += 1;
+ count += 1;
- if (count > 3 || item >= count)
+ if (count > 3)
return -1;
+ r = ne_read_xiph_lacing(&io, codec_private.length, &total, count, sizes);
+ if (r != 1)
+ return r;
- total = 0;
- for (i = 0; i < count - 1; ++i) {
- size = 0;
- do {
- if (avail - total <= size) {
- return -1;
- }
- size += *p;
- avail -= 1;
- } while (*p++ == 255);
- if (avail - total < size)
- return -1;
- sizes[i] = size;
- total += size;
- }
- sizes[i] = avail - total;
+ if (item >= count)
+ return -1;
+ p = codec_private.data + total;
for (i = 0; i < item; ++i) {
p += sizes[i];
}
+ assert((size_t) (p - codec_private.data) <= codec_private.length &&
+ codec_private.length - (p - codec_private.data) >= sizes[item]);
*data = p;
*length = sizes[item];
} else {
- if (item >= count)
+ if (item >= 1)
return -1;
*data = codec_private.data;
@@ -2700,7 +2806,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
while (ne_io_tell(ctx->io) < block_group_end) {
r = ne_read_element(ctx, &id, &size);
if (r != 1) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2710,9 +2816,14 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
switch (id) {
case ID_BLOCK: {
+ if (*pkt) {
+ ctx->log(ctx, NESTEGG_LOG_DEBUG,
+ "read_packet: multiple Blocks in BlockGroup, dropping previously read Block");
+ nestegg_free_packet(*pkt);
+ }
r = ne_read_block(ctx, id, size, pkt);
if (r != 1) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2726,7 +2837,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
case ID_BLOCK_DURATION: {
r = ne_read_uint(ctx->io, &block_duration, size);
if (r != 1) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2735,7 +2846,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
}
tc_scale = ne_get_timecode_scale(ctx);
if (tc_scale == 0) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2749,7 +2860,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
case ID_DISCARD_PADDING: {
r = ne_read_int(ctx->io, &discard_padding, size);
if (r != 1) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2762,7 +2873,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
case ID_BLOCK_ADDITIONS: {
/* There should only be one BlockAdditions; treat multiple as an error. */
if (block_additional) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2771,7 +2882,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
}
r = ne_read_block_additions(ctx, size, &block_additional);
if (r != 1) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2783,7 +2894,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
case ID_REFERENCE_BLOCK: {
r = ne_read_int(ctx->io, &reference_block, size);
if (r != 1) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2800,7 +2911,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
"read_packet: unknown element %llx in BlockGroup", id);
r = ne_io_read_skip(ctx->io, size);
if (r != 1) {
- free(block_additional);
+ ne_free_block_additions(block_additional);
if (*pkt) {
nestegg_free_packet(*pkt);
*pkt = NULL;
@@ -2824,7 +2935,7 @@ nestegg_read_packet(nestegg * ctx, nestegg_packet ** pkt)
predictive frames and no keyframes */
(*pkt)->keyframe = NESTEGG_PACKET_HAS_KEYFRAME_FALSE;
} else {
- free(block_additional);
+ ne_free_block_additions(block_additional);
}
break;
}
@@ -2843,26 +2954,16 @@ void
nestegg_free_packet(nestegg_packet * pkt)
{
struct frame * frame;
- struct block_additional * block_additional;
while (pkt->frame) {
frame = pkt->frame;
pkt->frame = frame->next;
- if (frame->frame_encryption) {
- free(frame->frame_encryption->iv);
- }
- free(frame->frame_encryption);
- free(frame->data);
- free(frame);
- }
- while (pkt->block_additional) {
- block_additional = pkt->block_additional;
- pkt->block_additional = block_additional->next;
- free(block_additional->data);
- free(block_additional);
+ ne_free_frame(frame);
}
+ ne_free_block_additions(pkt->block_additional);
+
free(pkt);
}
@@ -2977,6 +3078,7 @@ nestegg_packet_encryption(nestegg_packet * pkt)
{
struct frame * f = pkt->frame;
unsigned char encrypted_bit;
+ unsigned char partitioned_bit;
if (!f->frame_encryption)
return NESTEGG_PACKET_HAS_SIGNAL_BYTE_FALSE;
@@ -2985,10 +3087,14 @@ nestegg_packet_encryption(nestegg_packet * pkt)
assert(f->next == NULL);
encrypted_bit = f->frame_encryption->signal_byte & ENCRYPTED_BIT_MASK;
+ partitioned_bit = f->frame_encryption->signal_byte & PARTITIONED_BIT_MASK;
if (encrypted_bit != PACKET_ENCRYPTED)
return NESTEGG_PACKET_HAS_SIGNAL_BYTE_UNENCRYPTED;
+ if (partitioned_bit == PACKET_PARTITIONED)
+ return NESTEGG_PACKET_HAS_SIGNAL_BYTE_PARTITIONED;
+
return NESTEGG_PACKET_HAS_SIGNAL_BYTE_ENCRYPTED;
}
@@ -3017,6 +3123,35 @@ nestegg_packet_iv(nestegg_packet * pkt, unsigned char const ** iv, size_t * leng
}
int
+nestegg_packet_offsets(nestegg_packet * pkt,
+ uint32_t const ** partition_offsets,
+ uint8_t * num_partitions)
+{
+ struct frame * f = pkt->frame;
+ unsigned char encrypted_bit;
+ unsigned char partitioned_bit;
+
+ *partition_offsets = NULL;
+ *num_partitions = 0;
+
+ if (!f->frame_encryption)
+ return -1;
+
+ /* Should never have parsed blocks with both encryption and lacing */
+ assert(f->next == NULL);
+
+ encrypted_bit = f->frame_encryption->signal_byte & ENCRYPTED_BIT_MASK;
+ partitioned_bit = f->frame_encryption->signal_byte & PARTITIONED_BIT_MASK;
+
+ if (encrypted_bit != PACKET_ENCRYPTED || partitioned_bit != PACKET_PARTITIONED)
+ return -1;
+
+ *num_partitions = f->frame_encryption->num_partitions;
+ *partition_offsets = f->frame_encryption->partition_offsets;
+ return 0;
+}
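A caller-side sketch of the new partitioned-encryption query, assuming pkt was obtained from nestegg_read_packet() (illustrative; error handling trimmed):

/* Sketch: inspect encryption metadata on a demuxed packet. */
unsigned char const * iv;
size_t iv_length;
uint32_t const * offsets;
uint8_t num_partitions;

switch (nestegg_packet_encryption(pkt)) {
case NESTEGG_PACKET_HAS_SIGNAL_BYTE_PARTITIONED:
  /* Partitioned implies encrypted: both the IV and the offsets are present. */
  if (nestegg_packet_iv(pkt, &iv, &iv_length) == 0 &&
      nestegg_packet_offsets(pkt, &offsets, &num_partitions) == 0) {
    /* Interpret the num_partitions offsets per the WebM encryption spec
       before decrypting. */
  }
  break;
case NESTEGG_PACKET_HAS_SIGNAL_BYTE_ENCRYPTED:
  if (nestegg_packet_iv(pkt, &iv, &iv_length) == 0) {
    /* Decrypt the whole frame with the 8-byte IV. */
  }
  break;
default:
  /* Unencrypted or no signal byte: use the frame data as-is. */
  break;
}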
+
+int
nestegg_has_cues(nestegg * ctx)
{
return ctx->segment.cues.cue_point.head ||
@@ -3027,7 +3162,7 @@ int
nestegg_sniff(unsigned char const * buffer, size_t length)
{
nestegg_io io;
- struct sniff_buffer userdata;
+ struct io_buffer userdata;
userdata.buffer = buffer;
userdata.length = length;
diff --git a/media/libnestegg/update.sh b/media/libnestegg/update.sh
index 6145ef064..644408e09 100755
--- a/media/libnestegg/update.sh
+++ b/media/libnestegg/update.sh
@@ -15,8 +15,8 @@ if [ -n "$rev" ]; then
version=$version-dirty
echo "WARNING: updating from a dirty git repository."
fi
- sed -i "/The git commit ID used was/ s/[0-9a-f]\+\(-dirty\)\?\./$version./" README_MOZILLA
+ sed -i "/The git commit ID used was/ s/[0-9a-f]\+\(-dirty\)\?\./$version./" README_MCP
else
- echo "Remember to update README_MOZILLA with the version details."
+ echo "Remember to update README_MCP with the version details."
fi
diff --git a/mfbt/Casting.h b/mfbt/Casting.h
index a7d0fb50d..adf2c9045 100644
--- a/mfbt/Casting.h
+++ b/mfbt/Casting.h
@@ -238,6 +238,19 @@ AssertedCast(const From aFrom)
return static_cast<To>(aFrom);
}
+/**
+ * Cast a value of integral type |From| to a value of integral type |To|,
+ * release-asserting that the cast will be a safe cast per C++ (that is, that
+ * |aFrom| is in the range of values permitted for the type |To|).
+ */
+template<typename To, typename From>
+inline To
+ReleaseAssertedCast(const From aFrom)
+{
+ MOZ_RELEASE_ASSERT((detail::IsInBounds<From, To>(aFrom)));
+ return static_cast<To>(aFrom);
+}
+
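A usage sketch for the new helper (the value is hypothetical); unlike AssertedCast, the bounds check here survives into release builds.

// Sketch only; |length| stands in for some runtime value.
uint64_t length = 1000;
uint32_t narrow = mozilla::ReleaseAssertedCast<uint32_t>(length);
// Had |length| not fit into uint32_t, MOZ_RELEASE_ASSERT would crash here
// even in a release build, instead of silently truncating.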
} // namespace mozilla
#endif /* mozilla_Casting_h */
diff --git a/mfbt/Range.h b/mfbt/Range.h
index 47d91bb0c..753fe07f8 100644
--- a/mfbt/Range.h
+++ b/mfbt/Range.h
@@ -9,6 +9,7 @@
#include "mozilla/RangedPtr.h"
#include "mozilla/TypeTraits.h"
+#include "mozilla/Span.h"
#include <stddef.h>
@@ -44,6 +45,19 @@ public:
mEnd(aOther.mEnd)
{}
+ MOZ_IMPLICIT Range(Span<T> aSpan)
+ : Range(aSpan.Elements(), aSpan.Length())
+ {
+ }
+
+ template<typename U,
+ class = typename EnableIf<IsConvertible<U (*)[], T (*)[]>::value,
+ int>::Type>
+ MOZ_IMPLICIT Range(const Span<U>& aSpan)
+ : Range(aSpan.Elements(), aSpan.Length())
+ {
+ }
+
RangedPtr<T> begin() const { return mStart; }
RangedPtr<T> end() const { return mEnd; }
size_t length() const { return mEnd - mStart; }
@@ -51,8 +65,26 @@ public:
T& operator[](size_t aOffset) const { return mStart[aOffset]; }
explicit operator bool() const { return mStart != nullptr; }
+
+ operator Span<T>() { return Span<T>(mStart.get(), length()); }
+
+ operator Span<const T>() const { return Span<T>(mStart.get(), length()); }
};
+template<class T>
+Span<T>
+MakeSpan(Range<T>& aRange)
+{
+ return aRange;
+}
+
+template<class T>
+Span<const T>
+MakeSpan(const Range<T>& aRange)
+{
+ return aRange;
+}
+
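A minimal sketch of the Range/Span interop added above (the buffer is hypothetical):

// Sketch: Range<T> and Span<T> now convert in both directions.
uint8_t buffer[16] = {};
mozilla::Range<uint8_t> range(buffer, 16);
mozilla::Span<uint8_t> span = mozilla::MakeSpan(range); // or rely on the implicit conversion
mozilla::Range<uint8_t> back(span);                     // Span converts back to Range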
} // namespace mozilla
#endif /* mozilla_Range_h */
diff --git a/mfbt/Span.h b/mfbt/Span.h
new file mode 100644
index 000000000..f031a928b
--- /dev/null
+++ b/mfbt/Span.h
@@ -0,0 +1,1041 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015 Microsoft Corporation. All rights reserved.
+//
+// This code is licensed under the MIT License (MIT).
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+// Adapted from https://github.com/Microsoft/GSL/blob/3819df6e378ffccf0e29465afe99c3b324c2aa70/include/gsl/span
+// and https://github.com/Microsoft/GSL/blob/3819df6e378ffccf0e29465afe99c3b324c2aa70/include/gsl/gsl_util
+
+#ifndef mozilla_Span_h
+#define mozilla_Span_h
+
+#include "mozilla/Array.h"
+#include "mozilla/Assertions.h"
+#include "mozilla/Casting.h"
+#include "mozilla/IntegerTypeTraits.h"
+#include "mozilla/Move.h"
+#include "mozilla/TypeTraits.h"
+#include "mozilla/UniquePtr.h"
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <iterator>
+
+// Classifications for reasons why constexpr was removed in C++14 to C++11
+// conversion. Once we upgrade compilers, we can try defining each of these
+// to constexpr to restore a category of constexprs at a time.
+#define MOZ_SPAN_ASSERTION_CONSTEXPR
+#define MOZ_SPAN_GCC_CONSTEXPR
+#define MOZ_SPAN_EXPLICITLY_DEFAULTED_CONSTEXPR
+#define MOZ_SPAN_CONSTEXPR_NOT_JUST_RETURN
+#define MOZ_SPAN_NON_CONST_CONSTEXPR
+
+#ifdef _MSC_VER
+#pragma warning(push)
+
+// turn off some warnings that are noisy about our MOZ_RELEASE_ASSERT statements
+#pragma warning(disable : 4127) // conditional expression is constant
+
+// blanket turn off warnings from CppCoreCheck for now
+// so people aren't annoyed by them when running the tool.
+// more targeted suppressions will be added in a future update to the GSL
+#pragma warning(disable : 26481 26482 26483 26485 26490 26491 26492 26493 26495)
+
+#if _MSC_VER < 1910
+#pragma push_macro("constexpr")
+#define constexpr /*constexpr*/
+
+#endif // _MSC_VER < 1910
+#endif // _MSC_VER
+
+namespace mozilla {
+
+// Stuff from gsl_util
+
+// narrow_cast(): a searchable way to do narrowing casts of values
+template<class T, class U>
+inline constexpr T
+narrow_cast(U&& u)
+{
+ return static_cast<T>(mozilla::Forward<U>(u));
+}
+
+// end gsl_util
+
+// [views.constants], constants
+// This was -1 in gsl::span, but using size_t for sizes instead of ptrdiff_t
+// and reserving a magic value that realistically doesn't occur in
+// compile-time-constant Span sizes makes things a lot less messy in terms of
+// comparison between signed and unsigned.
+constexpr const size_t dynamic_extent = mozilla::MaxValue<size_t>::value;
+
+template<class ElementType, size_t Extent = dynamic_extent>
+class Span;
+
+// implementation details
+namespace span_details {
+
+// C++14 types that we don't have because we build as C++11.
+template<class T>
+using remove_cv_t = typename mozilla::RemoveCV<T>::Type;
+template<class T>
+using remove_const_t = typename mozilla::RemoveConst<T>::Type;
+template<bool B, class T, class F>
+using conditional_t = typename mozilla::Conditional<B, T, F>::Type;
+template<class T>
+using add_pointer_t = typename mozilla::AddPointer<T>::Type;
+template<bool B, class T = void>
+using enable_if_t = typename mozilla::EnableIf<B, T>::Type;
+
+template<class T>
+struct is_span_oracle : mozilla::FalseType
+{
+};
+
+template<class ElementType, size_t Extent>
+struct is_span_oracle<mozilla::Span<ElementType, Extent>> : mozilla::TrueType
+{
+};
+
+template<class T>
+struct is_span : public is_span_oracle<remove_cv_t<T>>
+{
+};
+
+template<class T>
+struct is_std_array_oracle : mozilla::FalseType
+{
+};
+
+template<class ElementType, size_t Extent>
+struct is_std_array_oracle<std::array<ElementType, Extent>> : mozilla::TrueType
+{
+};
+
+template<class T>
+struct is_std_array : public is_std_array_oracle<remove_cv_t<T>>
+{
+};
+
+template<size_t From, size_t To>
+struct is_allowed_extent_conversion
+ : public mozilla::IntegralConstant<bool,
+ From == To ||
+ From == mozilla::dynamic_extent ||
+ To == mozilla::dynamic_extent>
+{
+};
+
+template<class From, class To>
+struct is_allowed_element_type_conversion
+ : public mozilla::IntegralConstant<bool, mozilla::IsConvertible<From (*)[], To (*)[]>::value>
+{
+};
+
+template<class Span, bool IsConst>
+class span_iterator
+{
+ using element_type_ = typename Span::element_type;
+
+public:
+ using iterator_category = std::random_access_iterator_tag;
+ using value_type = remove_const_t<element_type_>;
+ using difference_type = typename Span::index_type;
+
+ using reference = conditional_t<IsConst, const element_type_, element_type_>&;
+ using pointer = add_pointer_t<reference>;
+
+ constexpr span_iterator() : span_iterator(nullptr, 0) {}
+
+ MOZ_SPAN_ASSERTION_CONSTEXPR span_iterator(const Span* span,
+ typename Span::index_type index)
+ : span_(span)
+ , index_(index)
+ {
+ MOZ_RELEASE_ASSERT(span == nullptr ||
+ (index_ >= 0 && index <= span_->Length()));
+ }
+
+ friend class span_iterator<Span, true>;
+ constexpr MOZ_IMPLICIT span_iterator(const span_iterator<Span, false>& other)
+ : span_iterator(other.span_, other.index_)
+ {
+ }
+
+ MOZ_SPAN_EXPLICITLY_DEFAULTED_CONSTEXPR span_iterator<Span, IsConst>&
+ operator=(const span_iterator<Span, IsConst>&) = default;
+
+ MOZ_SPAN_GCC_CONSTEXPR reference operator*() const
+ {
+ MOZ_RELEASE_ASSERT(span_);
+ return (*span_)[index_];
+ }
+
+ MOZ_SPAN_GCC_CONSTEXPR pointer operator->() const
+ {
+ MOZ_RELEASE_ASSERT(span_);
+ return &((*span_)[index_]);
+ }
+
+ MOZ_SPAN_NON_CONST_CONSTEXPR span_iterator& operator++()
+ {
+ MOZ_RELEASE_ASSERT(span_ && index_ >= 0 && index_ < span_->Length());
+ ++index_;
+ return *this;
+ }
+
+ MOZ_SPAN_NON_CONST_CONSTEXPR span_iterator operator++(int)
+ {
+ auto ret = *this;
+ ++(*this);
+ return ret;
+ }
+
+ MOZ_SPAN_NON_CONST_CONSTEXPR span_iterator& operator--()
+ {
+ MOZ_RELEASE_ASSERT(span_ && index_ > 0 && index_ <= span_->Length());
+ --index_;
+ return *this;
+ }
+
+ MOZ_SPAN_NON_CONST_CONSTEXPR span_iterator operator--(int)
+ {
+ auto ret = *this;
+ --(*this);
+ return ret;
+ }
+
+ MOZ_SPAN_CONSTEXPR_NOT_JUST_RETURN span_iterator
+ operator+(difference_type n) const
+ {
+ auto ret = *this;
+ return ret += n;
+ }
+
+ MOZ_SPAN_GCC_CONSTEXPR span_iterator& operator+=(difference_type n)
+ {
+ MOZ_RELEASE_ASSERT(span_ && (index_ + n) >= 0 &&
+ (index_ + n) <= span_->Length());
+ index_ += n;
+ return *this;
+ }
+
+ MOZ_SPAN_CONSTEXPR_NOT_JUST_RETURN span_iterator
+ operator-(difference_type n) const
+ {
+ auto ret = *this;
+ return ret -= n;
+ }
+
+ MOZ_SPAN_NON_CONST_CONSTEXPR span_iterator& operator-=(difference_type n)
+
+ {
+ return *this += -n;
+ }
+
+ MOZ_SPAN_GCC_CONSTEXPR difference_type
+ operator-(const span_iterator& rhs) const
+ {
+ MOZ_RELEASE_ASSERT(span_ == rhs.span_);
+ return index_ - rhs.index_;
+ }
+
+ constexpr reference operator[](difference_type n) const
+ {
+ return *(*this + n);
+ }
+
+ constexpr friend bool operator==(const span_iterator& lhs,
+ const span_iterator& rhs)
+ {
+ return lhs.span_ == rhs.span_ && lhs.index_ == rhs.index_;
+ }
+
+ constexpr friend bool operator!=(const span_iterator& lhs,
+ const span_iterator& rhs)
+ {
+ return !(lhs == rhs);
+ }
+
+ MOZ_SPAN_GCC_CONSTEXPR friend bool operator<(const span_iterator& lhs,
+ const span_iterator& rhs)
+ {
+ MOZ_RELEASE_ASSERT(lhs.span_ == rhs.span_);
+ return lhs.index_ < rhs.index_;
+ }
+
+ MOZ_SPAN_GCC_CONSTEXPR friend bool operator<=(const span_iterator& lhs,
+ const span_iterator& rhs)
+ {
+ return !(rhs < lhs);
+ }
+
+ MOZ_SPAN_GCC_CONSTEXPR friend bool operator>(const span_iterator& lhs,
+ const span_iterator& rhs)
+ {
+ return rhs < lhs;
+ }
+
+ MOZ_SPAN_GCC_CONSTEXPR friend bool operator>=(const span_iterator& lhs,
+ const span_iterator& rhs)
+ {
+ return !(rhs > lhs);
+ }
+
+ void swap(span_iterator& rhs)
+ {
+ std::swap(index_, rhs.index_);
+ std::swap(span_, rhs.span_);
+ }
+
+protected:
+ const Span* span_;
+ size_t index_;
+};
+
+template<class Span, bool IsConst>
+inline constexpr span_iterator<Span, IsConst>
+operator+(typename span_iterator<Span, IsConst>::difference_type n,
+ const span_iterator<Span, IsConst>& rhs)
+{
+ return rhs + n;
+}
+
+template<size_t Ext>
+class extent_type
+{
+public:
+ using index_type = size_t;
+
+ static_assert(Ext >= 0, "A fixed-size Span must be >= 0 in size.");
+
+ constexpr extent_type() {}
+
+ template<index_type Other>
+ MOZ_SPAN_ASSERTION_CONSTEXPR MOZ_IMPLICIT extent_type(extent_type<Other> ext)
+ {
+ static_assert(
+ Other == Ext || Other == dynamic_extent,
+ "Mismatch between fixed-size extent and size of initializing data.");
+ MOZ_RELEASE_ASSERT(ext.size() == Ext);
+ }
+
+ MOZ_SPAN_ASSERTION_CONSTEXPR MOZ_IMPLICIT extent_type(index_type length)
+ {
+ MOZ_RELEASE_ASSERT(length == Ext);
+ }
+
+ constexpr index_type size() const { return Ext; }
+};
+
+template<>
+class extent_type<dynamic_extent>
+{
+public:
+ using index_type = size_t;
+
+ template<index_type Other>
+ explicit constexpr extent_type(extent_type<Other> ext)
+ : size_(ext.size())
+ {
+ }
+
+ explicit constexpr extent_type(index_type length)
+ : size_(length)
+ {
+ }
+
+ constexpr index_type size() const { return size_; }
+
+private:
+ index_type size_;
+};
+} // namespace span_details
+
+/**
+ * Span - slices for C++
+ *
+ * Span implements Rust's slice concept for C++. It's called "Span" instead of
+ * "Slice" to follow the naming used in C++ Core Guidelines.
+ *
+ * A Span wraps a pointer and a length that identify a non-owning view to a
+ * contiguous block of memory of objects of the same type. Various types,
+ * including (pre-decay) C arrays, XPCOM strings, nsTArray, mozilla::Array,
+ * mozilla::Range and contiguous standard-library containers, auto-convert
+ * into Spans when attempting to pass them as arguments to methods that take
+ * Spans. MakeSpan() functions can be used for explicit conversion in other
+ * contexts. (Span itself autoconverts into mozilla::Range.)
+ *
+ * Like Rust's slices, Span provides safety against out-of-bounds access by
+ * performing run-time bound checks. However, unlike Rust's slices, Span
+ * cannot provide safety against use-after-free.
+ *
+ * (Note: Span is like Rust's slice only conceptually. Due to the lack of
+ * ABI guarantees, you should still decompose spans/slices to raw pointer
+ * and length parts when crossing the FFI.)
+ *
+ * In addition to having constructors and MakeSpan() functions that take
+ * various well-known types, a Span for an arbitrary type can be constructed
+ * (via constructor or MakeSpan()) from a pointer and a length or a pointer
+ * and another pointer pointing just past the last element.
+ *
+ * A Span<const char> can be obtained for const char* pointing to a
+ * zero-terminated C string using the MakeCStringSpan() function. A
+ * corresponding implicit constructor does not exist in order to avoid
+ * accidental construction in cases where const char* does not point to a
+ * zero-terminated C string.
+ *
+ * Span has methods that follow the Mozilla naming style and methods that
+ * don't. The methods that follow the Mozilla naming style are meant to be
+ * used directly from Mozilla code. The methods that don't are meant for
+ * integration with C++11 range-based loops and with meta-programming that
+ * expects the same methods that are found on the standard-library
+ * containers. For example, to decompose a Span into its parts in Mozilla
+ * code, use Elements() and Length() (as with nsTArray) instead of data()
+ * and size() (as with std::vector).
+ *
+ * The pointer and length wrapped by a Span cannot be changed after a Span has
+ * been created. When new values are required, simply create a new Span. Span
+ * has a method called Subspan() that works analogously to the Substring()
+ * method of XPCOM strings taking a start index and an optional length. As a
+ * Mozilla extension (relative to Microsoft's gsl::span that mozilla::Span is
+ * based on), Span has methods From(start), To(end) and FromTo(start, end)
+ * that correspond to Rust's &slice[start..], &slice[..end] and
+ * &slice[start..end], respectively. (That is, the end index is the index of
+ * the first element not to be included in the new subspan.)
+ *
+ * When indicating a Span that's only read from, const goes inside the type
+ * parameter. Don't put const in front of Span. That is:
+ * size_t ReadsFromOneSpanAndWritesToAnother(Span<const uint8_t> aReadFrom,
+ * Span<uint8_t> aWrittenTo);
+ *
+ * Any Span<const T> can be viewed as Span<const uint8_t> using the function
+ * AsBytes(). Any Span<T> can be viewed as Span<uint8_t> using the function
+ * AsWritableBytes().
+ */
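To make the description above concrete, a short usage sketch (the array contents are hypothetical):

// Sketch of the API documented above.
int data[] = { 1, 2, 3, 4, 5 };
Span<int> s = MakeSpan(data);           // view over the whole array
Span<int> mid = s.Subspan(1, 3);        // {2, 3, 4}
Span<int> from = s.From(2);             // Rust's &data[2..]  -> {3, 4, 5}
Span<int> to = s.To(2);                 // Rust's &data[..2]  -> {1, 2}
Span<int> slice = s.FromTo(1, 4);       // Rust's &data[1..4] -> {2, 3, 4}
size_t n = s.Length();                  // 5; Elements() returns the pointer
Span<const uint8_t> bytes = AsBytes(s); // byte view of the same memory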
+template<class ElementType, size_t Extent>
+class Span
+{
+public:
+ // constants and types
+ using element_type = ElementType;
+ using index_type = size_t;
+ using pointer = element_type*;
+ using reference = element_type&;
+
+ using iterator =
+ span_details::span_iterator<Span<ElementType, Extent>, false>;
+ using const_iterator =
+ span_details::span_iterator<Span<ElementType, Extent>, true>;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+ constexpr static const index_type extent = Extent;
+
+ // [Span.cons], Span constructors, copy, assignment, and destructor
+ // "Dependent" is needed to make "span_details::enable_if_t<(Dependent || Extent == 0 || Extent == mozilla::MaxValue<size_t>::value)>" SFINAE,
+ // since "span_details::enable_if_t<(Extent == 0 || Extent == mozilla::MaxValue<size_t>::value)>" is ill-formed when Extent is neither of the extreme values.
+ /**
+ * Constructor with no args.
+ */
+ template<
+ bool Dependent = false,
+ class = span_details::enable_if_t<
+ (Dependent || Extent == 0 || Extent == mozilla::MaxValue<size_t>::value)>>
+ constexpr Span()
+ : storage_(nullptr, span_details::extent_type<0>())
+ {
+ }
+
+ /**
+ * Constructor for nullptr.
+ */
+ constexpr MOZ_IMPLICIT Span(std::nullptr_t) : Span() {}
+
+ /**
+ * Constructor for pointer and length.
+ */
+ constexpr Span(pointer aPtr, index_type aLength)
+ : storage_(aPtr, aLength)
+ {
+ }
+
+ /**
+ * Constructor for start pointer and pointer past end.
+ */
+ constexpr Span(pointer aStartPtr, pointer aEndPtr)
+ : storage_(aStartPtr, std::distance(aStartPtr, aEndPtr))
+ {
+ }
+
+ /**
+ * Constructor for C array.
+ */
+ template<size_t N>
+ constexpr MOZ_IMPLICIT Span(element_type (&aArr)[N])
+ : storage_(&aArr[0], span_details::extent_type<N>())
+ {
+ }
+
+ /**
+ * Constructor for std::array.
+ */
+ template<size_t N,
+ class ArrayElementType = span_details::remove_const_t<element_type>>
+ constexpr MOZ_IMPLICIT Span(std::array<ArrayElementType, N>& aArr)
+ : storage_(&aArr[0], span_details::extent_type<N>())
+ {
+ }
+
+ /**
+ * Constructor for const std::array.
+ */
+ template<size_t N>
+ constexpr MOZ_IMPLICIT Span(
+ const std::array<span_details::remove_const_t<element_type>, N>& aArr)
+ : storage_(&aArr[0], span_details::extent_type<N>())
+ {
+ }
+
+ /**
+ * Constructor for mozilla::Array.
+ */
+ template<size_t N,
+ class ArrayElementType = span_details::remove_const_t<element_type>>
+ constexpr MOZ_IMPLICIT Span(mozilla::Array<ArrayElementType, N>& aArr)
+ : storage_(&aArr[0], span_details::extent_type<N>())
+ {
+ }
+
+ /**
+ * Constructor for const mozilla::Array.
+ */
+ template<size_t N>
+ constexpr MOZ_IMPLICIT Span(
+ const mozilla::Array<span_details::remove_const_t<element_type>, N>& aArr)
+ : storage_(&aArr[0], span_details::extent_type<N>())
+ {
+ }
+
+ /**
+ * Constructor for mozilla::UniquePtr holding an array and length.
+ */
+ template<class ArrayElementType = std::add_pointer<element_type>>
+ constexpr Span(const mozilla::UniquePtr<ArrayElementType>& aPtr,
+ index_type aLength)
+ : storage_(aPtr.get(), aLength)
+ {
+ }
+
+ // NB: the SFINAE here uses .data() as an incomplete/imperfect proxy for the requirement
+ // on Container to be a contiguous sequence container.
+ /**
+ * Constructor for standard-library containers.
+ */
+ template<
+ class Container,
+ class = span_details::enable_if_t<
+ !span_details::is_span<Container>::value &&
+ !span_details::is_std_array<Container>::value &&
+ mozilla::IsConvertible<typename Container::pointer, pointer>::value &&
+ mozilla::IsConvertible<typename Container::pointer,
+ decltype(mozilla::DeclVal<Container>().data())>::value>>
+ constexpr MOZ_IMPLICIT Span(Container& cont)
+ : Span(cont.data(), ReleaseAssertedCast<index_type>(cont.size()))
+ {
+ }
+
+ /**
+ * Constructor for standard-library containers (const version).
+ */
+ template<
+ class Container,
+ class = span_details::enable_if_t<
+ mozilla::IsConst<element_type>::value &&
+ !span_details::is_span<Container>::value &&
+ mozilla::IsConvertible<typename Container::pointer, pointer>::value &&
+ mozilla::IsConvertible<typename Container::pointer,
+ decltype(mozilla::DeclVal<Container>().data())>::value>>
+ constexpr MOZ_IMPLICIT Span(const Container& cont)
+ : Span(cont.data(), ReleaseAssertedCast<index_type>(cont.size()))
+ {
+ }
+
+ /**
+ * Constructor from other Span.
+ */
+ constexpr Span(const Span& other) = default;
+
+ /**
+ * Constructor from other Span.
+ */
+ constexpr Span(Span&& other) = default;
+
+ /**
+ * Constructor from other Span with conversion of element type.
+ */
+ template<
+ class OtherElementType,
+ size_t OtherExtent,
+ class = span_details::enable_if_t<
+ span_details::is_allowed_extent_conversion<OtherExtent, Extent>::value &&
+ span_details::is_allowed_element_type_conversion<OtherElementType,
+ element_type>::value>>
+ constexpr MOZ_IMPLICIT Span(const Span<OtherElementType, OtherExtent>& other)
+ : storage_(other.data(),
+ span_details::extent_type<OtherExtent>(other.size()))
+ {
+ }
+
+ /**
+ * Constructor from other Span with conversion of element type.
+ */
+ template<
+ class OtherElementType,
+ size_t OtherExtent,
+ class = span_details::enable_if_t<
+ span_details::is_allowed_extent_conversion<OtherExtent, Extent>::value &&
+ span_details::is_allowed_element_type_conversion<OtherElementType,
+ element_type>::value>>
+ constexpr MOZ_IMPLICIT Span(Span<OtherElementType, OtherExtent>&& other)
+ : storage_(other.data(),
+ span_details::extent_type<OtherExtent>(other.size()))
+ {
+ }
+
+ ~Span() = default;
+ MOZ_SPAN_EXPLICITLY_DEFAULTED_CONSTEXPR Span& operator=(const Span& other)
+ = default;
+
+ MOZ_SPAN_EXPLICITLY_DEFAULTED_CONSTEXPR Span& operator=(Span&& other)
+ = default;
+
+ // [Span.sub], Span subviews
+ /**
+ * Subspan with first N elements with compile-time N.
+ */
+ template<size_t Count>
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, Count> First() const
+ {
+ MOZ_RELEASE_ASSERT(Count <= size());
+ return { data(), Count };
+ }
+
+ /**
+ * Subspan with last N elements with compile-time N.
+ */
+ template<size_t Count>
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, Count> Last() const
+ {
+ MOZ_RELEASE_ASSERT(Count <= size());
+ return { data() + (size() - Count), Count };
+ }
+
+ /**
+ * Subspan with compile-time start index and length.
+ */
+ template<size_t Offset, size_t Count = dynamic_extent>
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, Count> Subspan() const
+ {
+ MOZ_RELEASE_ASSERT(Offset <= size() &&
+ (Count == dynamic_extent || (Offset + Count <= size())));
+ return { data() + Offset,
+ Count == dynamic_extent ? size() - Offset : Count };
+ }
+
+ /**
+ * Subspan with first N elements with run-time N.
+ */
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, dynamic_extent> First(
+ index_type aCount) const
+ {
+ MOZ_RELEASE_ASSERT(aCount <= size());
+ return { data(), aCount };
+ }
+
+ /**
+ * Subspan with last N elements with run-time N.
+ */
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, dynamic_extent> Last(
+ index_type aCount) const
+ {
+ MOZ_RELEASE_ASSERT(aCount <= size());
+ return { data() + (size() - aCount), aCount };
+ }
+
+ /**
+ * Subspan with run-time start index and length.
+ */
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, dynamic_extent> Subspan(
+ index_type aStart,
+ index_type aLength = dynamic_extent) const
+ {
+ MOZ_RELEASE_ASSERT(aStart <= size() &&
+ (aLength == dynamic_extent ||
+ (aStart + aLength <= size())));
+ return { data() + aStart,
+ aLength == dynamic_extent ? size() - aStart : aLength };
+ }
+
+ /**
+ * Subspan with run-time start index. (Rust's &foo[start..])
+ */
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, dynamic_extent> From(
+ index_type aStart) const
+ {
+ return Subspan(aStart);
+ }
+
+ /**
+ * Subspan with run-time exclusive end index. (Rust's &foo[..end])
+ */
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, dynamic_extent> To(
+ index_type aEnd) const
+ {
+ return Subspan(0, aEnd);
+ }
+
+ /**
+ * Subspan with run-time start index and exclusive end index.
+ * (Rust's &foo[start..end])
+ */
+ MOZ_SPAN_GCC_CONSTEXPR Span<element_type, dynamic_extent> FromTo(
+ index_type aStart,
+ index_type aEnd) const
+ {
+ MOZ_RELEASE_ASSERT(aStart <= aEnd);
+ return Subspan(aStart, aEnd - aStart);
+ }
+
+ // [Span.obs], Span observers
+ /**
+ * Number of elements in the span.
+ */
+ constexpr index_type Length() const { return size(); }
+
+ /**
+ * Number of elements in the span (standard-library duck typing version).
+ */
+ constexpr index_type size() const { return storage_.size(); }
+
+ /**
+ * Size of the span in bytes.
+ */
+ constexpr index_type LengthBytes() const { return size_bytes(); }
+
+ /**
+ * Size of the span in bytes (standard-library naming style version).
+ */
+ constexpr index_type size_bytes() const
+ {
+ return size() * narrow_cast<index_type>(sizeof(element_type));
+ }
+
+ /**
+ * Checks if the length of the span is zero.
+ */
+ constexpr bool IsEmpty() const { return empty(); }
+
+ /**
+ * Checks if the length of the span is zero (standard-library duck
+ * typing version).
+ */
+ constexpr bool empty() const { return size() == 0; }
+
+ // [Span.elem], Span element access
+ MOZ_SPAN_GCC_CONSTEXPR reference operator[](index_type idx) const
+ {
+ MOZ_RELEASE_ASSERT(idx < storage_.size());
+ return data()[idx];
+ }
+
+ /**
+ * Access element of span by index (standard-library duck typing version).
+ */
+ constexpr reference at(index_type idx) const { return this->operator[](idx); }
+
+ constexpr reference operator()(index_type idx) const
+ {
+ return this->operator[](idx);
+ }
+
+ /**
+ * Pointer to the first element of the span.
+ */
+ constexpr pointer Elements() const { return data(); }
+
+ /**
+ * Pointer to the first element of the span (standard-library duck typing version).
+ */
+ constexpr pointer data() const { return storage_.data(); }
+
+ // [Span.iter], Span iterator support
+ iterator begin() const { return { this, 0 }; }
+ iterator end() const { return { this, Length() }; }
+
+ const_iterator cbegin() const { return { this, 0 }; }
+ const_iterator cend() const { return { this, Length() }; }
+
+ reverse_iterator rbegin() const
+ {
+ return reverse_iterator{ end() };
+ }
+ reverse_iterator rend() const
+ {
+ return reverse_iterator{ begin() };
+ }
+
+ const_reverse_iterator crbegin() const
+ {
+ return const_reverse_iterator{ cend() };
+ }
+ const_reverse_iterator crend() const
+ {
+ return const_reverse_iterator{ cbegin() };
+ }
+
+private:
+ // this implementation detail class lets us take advantage of the
+ // empty base class optimization to pay for only storage of a single
+ // pointer in the case of fixed-size Spans
+ template<class ExtentType>
+ class storage_type : public ExtentType
+ {
+ public:
+ template<class OtherExtentType>
+ MOZ_SPAN_ASSERTION_CONSTEXPR storage_type(pointer elements,
+ OtherExtentType ext)
+ : ExtentType(ext)
+ , data_(elements)
+ {
+ MOZ_RELEASE_ASSERT(
+ (!elements && ExtentType::size() == 0) ||
+ (elements && ExtentType::size() != mozilla::MaxValue<size_t>::value));
+ }
+
+ constexpr pointer data() const { return data_; }
+
+ private:
+ pointer data_;
+ };
+
+ storage_type<span_details::extent_type<Extent>> storage_;
+};
+
+// [Span.comparison], Span comparison operators
+template<class ElementType, size_t FirstExtent, size_t SecondExtent>
+inline constexpr bool
+operator==(const Span<ElementType, FirstExtent>& l,
+ const Span<ElementType, SecondExtent>& r)
+{
+ return (l.size() == r.size()) && std::equal(l.begin(), l.end(), r.begin());
+}
+
+template<class ElementType, size_t Extent>
+inline constexpr bool
+operator!=(const Span<ElementType, Extent>& l,
+ const Span<ElementType, Extent>& r)
+{
+ return !(l == r);
+}
+
+template<class ElementType, size_t Extent>
+inline constexpr bool
+operator<(const Span<ElementType, Extent>& l,
+ const Span<ElementType, Extent>& r)
+{
+ return std::lexicographical_compare(l.begin(), l.end(), r.begin(), r.end());
+}
+
+template<class ElementType, size_t Extent>
+inline constexpr bool
+operator<=(const Span<ElementType, Extent>& l,
+ const Span<ElementType, Extent>& r)
+{
+ return !(l > r);
+}
+
+template<class ElementType, size_t Extent>
+inline constexpr bool
+operator>(const Span<ElementType, Extent>& l,
+ const Span<ElementType, Extent>& r)
+{
+ return r < l;
+}
+
+template<class ElementType, size_t Extent>
+inline constexpr bool
+operator>=(const Span<ElementType, Extent>& l,
+ const Span<ElementType, Extent>& r)
+{
+ return !(l < r);
+}
+
+namespace span_details {
+// if we only supported compilers with good constexpr support then
+// this pair of classes could collapse down to a constexpr function
+
+// we should use a narrow_cast<> to go to size_t, but older compilers may not see it as
+// constexpr
+// and so will fail compilation of the template
+template<class ElementType, size_t Extent>
+struct calculate_byte_size
+ : mozilla::IntegralConstant<size_t,
+ static_cast<size_t>(sizeof(ElementType) *
+ static_cast<size_t>(Extent))>
+{
+};
+
+template<class ElementType>
+struct calculate_byte_size<ElementType, dynamic_extent>
+ : mozilla::IntegralConstant<size_t, dynamic_extent>
+{
+};
+}
+
+// [Span.objectrep], views of object representation
+/**
+ * View span as Span<const uint8_t>.
+ */
+template<class ElementType, size_t Extent>
+Span<const uint8_t,
+ span_details::calculate_byte_size<ElementType, Extent>::value>
+AsBytes(Span<ElementType, Extent> s)
+{
+ return { reinterpret_cast<const uint8_t*>(s.data()), s.size_bytes() };
+}
+
+/**
+ * View span as Span<uint8_t>.
+ */
+template<class ElementType,
+ size_t Extent,
+ class = span_details::enable_if_t<!mozilla::IsConst<ElementType>::value>>
+Span<uint8_t, span_details::calculate_byte_size<ElementType, Extent>::value>
+AsWritableBytes(Span<ElementType, Extent> s)
+{
+ return { reinterpret_cast<uint8_t*>(s.data()), s.size_bytes() };
+}
+
+//
+// MakeSpan() - Utility functions for creating Spans
+//
+/**
+ * Create span from pointer and length.
+ */
+template<class ElementType>
+Span<ElementType>
+MakeSpan(ElementType* aPtr, typename Span<ElementType>::index_type aLength)
+{
+ return Span<ElementType>(aPtr, aLength);
+}
+
+/**
+ * Create span from start pointer and pointer past end.
+ */
+template<class ElementType>
+Span<ElementType>
+MakeSpan(ElementType* aStartPtr, ElementType* aEndPtr)
+{
+ return Span<ElementType>(aStartPtr, aEndPtr);
+}
+
+/**
+ * Create span from C array.
+ */
+template<class ElementType, size_t N>
+Span<ElementType> MakeSpan(ElementType (&aArr)[N])
+{
+ return Span<ElementType>(aArr);
+}
+
+/**
+ * Create span from mozilla::Array.
+ */
+template<class ElementType, size_t N>
+Span<ElementType>
+MakeSpan(mozilla::Array<ElementType, N>& aArr)
+{
+ return aArr;
+}
+
+/**
+ * Create span from const mozilla::Array.
+ */
+template<class ElementType, size_t N>
+Span<const ElementType>
+MakeSpan(const mozilla::Array<ElementType, N>& arr)
+{
+ return arr;
+}
+
+/**
+ * Create span from standard-library container.
+ */
+template<class Container>
+Span<typename Container::value_type>
+MakeSpan(Container& cont)
+{
+ return Span<typename Container::value_type>(cont);
+}
+
+/**
+ * Create span from standard-library container (const version).
+ */
+template<class Container>
+Span<const typename Container::value_type>
+MakeSpan(const Container& cont)
+{
+ return Span<const typename Container::value_type>(cont);
+}
+
+/**
+ * Create span from smart pointer and length.
+ */
+template<class Ptr>
+Span<typename Ptr::element_type>
+MakeSpan(Ptr& aPtr, size_t aLength)
+{
+ return Span<typename Ptr::element_type>(aPtr, aLength);
+}
+
+/**
+ * Create span from C string.
+ */
+inline Span<const char>
+MakeCStringSpan(const char* aStr)
+{
+ return Span<const char>(aStr, std::strlen(aStr));
+}
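A one-line usage sketch (the literal is arbitrary): the resulting span covers the three characters and excludes the trailing '\0'.

Span<const char> s = MakeCStringSpan("abc"); // Length() == 3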
+
+} // namespace mozilla
+
+#ifdef _MSC_VER
+#if _MSC_VER < 1910
+#undef constexpr
+#pragma pop_macro("constexpr")
+
+#endif // _MSC_VER < 1910
+
+#pragma warning(pop)
+#endif // _MSC_VER
+
+#undef MOZ_SPAN_ASSERTION_CONSTEXPR
+#undef MOZ_SPAN_GCC_CONSTEXPR
+#undef MOZ_SPAN_EXPLICITLY_DEFAULTED_CONSTEXPR
+#undef MOZ_SPAN_CONSTEXPR_NOT_JUST_RETURN
+#undef MOZ_SPAN_NON_CONST_CONSTEXPR
+
+#endif // mozilla_Span_h
diff --git a/mfbt/moz.build b/mfbt/moz.build
index 897a686f4..ea3c3b701 100644
--- a/mfbt/moz.build
+++ b/mfbt/moz.build
@@ -82,6 +82,7 @@ EXPORTS.mozilla = [
'SegmentedVector.h',
'SHA1.h',
'SizePrintfMacros.h',
+ 'Span.h',
'SplayTree.h',
'Sprintf.h',
'StaticAnalysisFunctions.h',
diff --git a/mfbt/tests/gtest/TestSpan.cpp b/mfbt/tests/gtest/TestSpan.cpp
new file mode 100644
index 000000000..f3aa000a4
--- /dev/null
+++ b/mfbt/tests/gtest/TestSpan.cpp
@@ -0,0 +1,2079 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2015 Microsoft Corporation. All rights reserved.
+//
+// This code is licensed under the MIT License (MIT).
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+// Adapted from https://github.com/Microsoft/GSL/blob/3819df6e378ffccf0e29465afe99c3b324c2aa70/tests/Span_tests.cpp
+
+#include "gtest/gtest.h"
+
+#include "mozilla/Span.h"
+
+#include "nsString.h"
+#include "nsTArray.h"
+#include "mozilla/Range.h"
+#include "mozilla/TypeTraits.h"
+
+#define SPAN_TEST(name) TEST(SpanTest, name)
+#define CHECK_THROW(a, b)
+
+using namespace std;
+using namespace mozilla;
+
+static_assert(IsConvertible<Range<int>, Span<const int>>::value,
+ "Range should convert into const");
+static_assert(IsConvertible<Range<const int>, Span<const int>>::value,
+ "const Range should convert into const");
+static_assert(!IsConvertible<Range<const int>, Span<int>>::value,
+ "Range should not drop const in conversion");
+static_assert(IsConvertible<Span<int>, Range<const int>>::value,
+ "Span should convert into const");
+static_assert(IsConvertible<Span<const int>, Range<const int>>::value,
+ "const Span should convert into const");
+static_assert(!IsConvertible<Span<const int>, Range<int>>::value,
+ "Span should not drop const in conversion");
+static_assert(IsConvertible<Span<const int>, Span<const int>>::value,
+ "const Span should convert into const");
+static_assert(IsConvertible<Span<int>, Span<const int>>::value,
+ "Span should convert into const");
+static_assert(!IsConvertible<Span<const int>, Span<int>>::value,
+ "Span should not drop const in conversion");
+static_assert(IsConvertible<const nsTArray<int>, Span<const int>>::value,
+ "const nsTArray should convert into const");
+static_assert(IsConvertible<nsTArray<int>, Span<const int>>::value,
+ "nsTArray should convert into const");
+static_assert(!IsConvertible<const nsTArray<int>, Span<int>>::value,
+ "nsTArray should not drop const in conversion");
+static_assert(IsConvertible<nsTArray<const int>, Span<const int>>::value,
+ "nsTArray should convert into const");
+static_assert(!IsConvertible<nsTArray<const int>, Span<int>>::value,
+ "nsTArray should not drop const in conversion");
+
+namespace {
+struct BaseClass
+{
+};
+struct DerivedClass : BaseClass
+{
+};
+}
+
+void
+AssertSpanOfThreeInts(Span<const int> s)
+{
+ ASSERT_EQ(s.size(), 3U);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[1], 2);
+ ASSERT_EQ(s[2], 3);
+}
+
+void
+AssertSpanOfThreeChars(Span<const char> s)
+{
+ ASSERT_EQ(s.size(), 3U);
+ ASSERT_EQ(s[0], 'a');
+ ASSERT_EQ(s[1], 'b');
+ ASSERT_EQ(s[2], 'c');
+}
+
+void
+AssertSpanOfThreeChar16s(Span<const char16_t> s)
+{
+ ASSERT_EQ(s.size(), 3U);
+ ASSERT_EQ(s[0], 'a');
+ ASSERT_EQ(s[1], 'b');
+ ASSERT_EQ(s[2], 'c');
+}
+
+void
+AssertSpanOfThreeCharsViaString(const nsACString& aStr)
+{
+ AssertSpanOfThreeChars(aStr);
+}
+
+void
+AssertSpanOfThreeChar16sViaString(const nsAString& aStr)
+{
+ AssertSpanOfThreeChar16s(aStr);
+}
+
+SPAN_TEST(default_constructor)
+{
+ {
+ Span<int> s;
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int> cs;
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+
+ {
+ Span<int, 0> s;
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int, 0> cs;
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ Span<int, 1> s;
+ ASSERT_EQ(s.Length(), 1U);
+ ASSERT_EQ(s.data(), nullptr); // explains why it can't compile
+#endif
+ }
+
+ {
+ Span<int> s{};
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int> cs{};
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+}
+
+SPAN_TEST(size_optimization)
+{
+ {
+ Span<int> s;
+ ASSERT_EQ(sizeof(s), sizeof(int*) + sizeof(size_t));
+ }
+
+ {
+ Span<int, 0> s;
+ ASSERT_EQ(sizeof(s), sizeof(int*));
+ }
+}
+
+SPAN_TEST(from_nullptr_constructor)
+{
+ {
+ Span<int> s = nullptr;
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int> cs = nullptr;
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+
+ {
+ Span<int, 0> s = nullptr;
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int, 0> cs = nullptr;
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ Span<int, 1> s = nullptr;
+ ASSERT_EQ(s.Length(), 1U);
+ ASSERT_EQ(s.data(), nullptr); // explains why it can't compile
+#endif
+ }
+
+ {
+ Span<int> s{ nullptr };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int> cs{ nullptr };
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+
+ {
+ Span<int*> s{ nullptr };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int*> cs{ nullptr };
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+}
+
+SPAN_TEST(from_nullptr_length_constructor)
+{
+ {
+ Span<int> s{ nullptr, static_cast<Span<int>::index_type>(0) };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int> cs{ nullptr, static_cast<Span<int>::index_type>(0) };
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+
+ {
+ Span<int, 0> s{ nullptr, static_cast<Span<int>::index_type>(0) };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int, 0> cs{ nullptr, static_cast<Span<int>::index_type>(0) };
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+
+#if 0
+ {
+ auto workaround_macro = []() { Span<int, 1> s{ nullptr, static_cast<Span<int>::index_type>(0) }; };
+ CHECK_THROW(workaround_macro(), fail_fast);
+ }
+
+ {
+ auto workaround_macro = []() { Span<int> s{nullptr, 1}; };
+ CHECK_THROW(workaround_macro(), fail_fast);
+
+ auto const_workaround_macro = []() { Span<const int> cs{nullptr, 1}; };
+ CHECK_THROW(const_workaround_macro(), fail_fast);
+ }
+
+ {
+ auto workaround_macro = []() { Span<int, 0> s{nullptr, 1}; };
+ CHECK_THROW(workaround_macro(), fail_fast);
+
+ auto const_workaround_macro = []() { Span<const int, 0> s{nullptr, 1}; };
+ CHECK_THROW(const_workaround_macro(), fail_fast);
+ }
+#endif
+ {
+ Span<int*> s{ nullptr, static_cast<Span<int>::index_type>(0) };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+
+ Span<const int*> cs{ nullptr, static_cast<Span<int>::index_type>(0) };
+ ASSERT_EQ(cs.Length(), 0U);
+ ASSERT_EQ(cs.data(), nullptr);
+ }
+}
+
+SPAN_TEST(from_pointer_length_constructor)
+{
+ int arr[4] = { 1, 2, 3, 4 };
+
+ {
+ Span<int> s{ &arr[0], 2 };
+ ASSERT_EQ(s.Length(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[1], 2);
+ }
+
+ {
+ Span<int, 2> s{ &arr[0], 2 };
+ ASSERT_EQ(s.Length(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[1], 2);
+ }
+
+ {
+ int* p = nullptr;
+ Span<int> s{ p, static_cast<Span<int>::index_type>(0) };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+ }
+
+#if 0
+ {
+ int* p = nullptr;
+ auto workaround_macro = [=]() { Span<int> s{p, 2}; };
+ CHECK_THROW(workaround_macro(), fail_fast);
+ }
+#endif
+
+ {
+ auto s = MakeSpan(&arr[0], 2);
+ ASSERT_EQ(s.Length(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[1], 2);
+ }
+
+ {
+ int* p = nullptr;
+ auto s = MakeSpan(p, static_cast<Span<int>::index_type>(0));
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+ }
+
+#if 0
+ {
+ int* p = nullptr;
+ auto workaround_macro = [=]() { MakeSpan(p, 2); };
+ CHECK_THROW(workaround_macro(), fail_fast);
+ }
+#endif
+}
+
+SPAN_TEST(from_pointer_pointer_constructor)
+{
+ int arr[4] = { 1, 2, 3, 4 };
+
+ {
+ Span<int> s{ &arr[0], &arr[2] };
+ ASSERT_EQ(s.Length(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[1], 2);
+ }
+
+ {
+ Span<int, 2> s{ &arr[0], &arr[2] };
+ ASSERT_EQ(s.Length(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[1], 2);
+ }
+
+ {
+ Span<int> s{ &arr[0], &arr[0] };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<int, 0> s{ &arr[0], &arr[0] };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ // this will fail the std::distance() precondition, which asserts on MSVC debug builds
+ //{
+ // auto workaround_macro = [&]() { Span<int> s{&arr[1], &arr[0]}; };
+ // CHECK_THROW(workaround_macro(), fail_fast);
+ //}
+
+ // this will fail the std::distance() precondition, which asserts on MSVC debug builds
+ //{
+ // int* p = nullptr;
+ // auto workaround_macro = [&]() { Span<int> s{&arr[0], p}; };
+ // CHECK_THROW(workaround_macro(), fail_fast);
+ //}
+
+ {
+ int* p = nullptr;
+ Span<int> s{ p, p };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+ }
+
+ {
+ int* p = nullptr;
+ Span<int, 0> s{ p, p };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+ }
+
+ // this will fail the std::distance() precondition, which asserts on MSVC debug builds
+ //{
+ // int* p = nullptr;
+ // auto workaround_macro = [&]() { Span<int> s{&arr[0], p}; };
+ // CHECK_THROW(workaround_macro(), fail_fast);
+ //}
+
+ {
+ auto s = MakeSpan(&arr[0], &arr[2]);
+ ASSERT_EQ(s.Length(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[1], 2);
+ }
+
+ {
+ auto s = MakeSpan(&arr[0], &arr[0]);
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ int* p = nullptr;
+ auto s = MakeSpan(p, p);
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), nullptr);
+ }
+}
+
+SPAN_TEST(from_array_constructor)
+{
+ int arr[5] = { 1, 2, 3, 4, 5 };
+
+ {
+ Span<int> s{ arr };
+ ASSERT_EQ(s.Length(), 5U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<int, 5> s{ arr };
+ ASSERT_EQ(s.Length(), 5U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ int arr2d[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<int, 6> s{ arr };
+ }
+
+ {
+ Span<int, 0> s{ arr };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<int> s{ arr2d };
+ ASSERT_EQ(s.Length(), 6U);
+ ASSERT_EQ(s.data(), &arr2d[0][0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[5], 6);
+ }
+
+ {
+ Span<int, 0> s{ arr2d };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), &arr2d[0][0]);
+ }
+
+ {
+ Span<int, 6> s{ arr2d };
+ }
+#endif
+ {
+ Span<int[3]> s{ &(arr2d[0]), 1 };
+ ASSERT_EQ(s.Length(), 1U);
+ ASSERT_EQ(s.data(), &arr2d[0]);
+ }
+
+ int arr3d[2][3][2] = { { { 1, 2 }, { 3, 4 }, { 5, 6 } },
+ { { 7, 8 }, { 9, 10 }, { 11, 12 } } };
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<int> s{ arr3d };
+ ASSERT_EQ(s.Length(), 12U);
+ ASSERT_EQ(s.data(), &arr3d[0][0][0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[11], 12);
+ }
+
+ {
+ Span<int, 0> s{ arr3d };
+ ASSERT_EQ(s.Length(), 0U);
+ ASSERT_EQ(s.data(), &arr3d[0][0][0]);
+ }
+
+ {
+ Span<int, 11> s{ arr3d };
+ }
+
+ {
+ Span<int, 12> s{ arr3d };
+ ASSERT_EQ(s.Length(), 12U);
+ ASSERT_EQ(s.data(), &arr3d[0][0][0]);
+ ASSERT_EQ(s[0], 1);
+ ASSERT_EQ(s[5], 6);
+ }
+#endif
+ {
+ Span<int[3][2]> s{ &arr3d[0], 1 };
+ ASSERT_EQ(s.Length(), 1U);
+ ASSERT_EQ(s.data(), &arr3d[0]);
+ }
+
+ {
+ auto s = MakeSpan(arr);
+ ASSERT_EQ(s.Length(), 5U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ auto s = MakeSpan(&(arr2d[0]), 1);
+ ASSERT_EQ(s.Length(), 1U);
+ ASSERT_EQ(s.data(), &arr2d[0]);
+ }
+
+ {
+ auto s = MakeSpan(&arr3d[0], 1);
+ ASSERT_EQ(s.Length(), 1U);
+ ASSERT_EQ(s.data(), &arr3d[0]);
+ }
+}
+
+SPAN_TEST(from_dynamic_array_constructor)
+{
+ double(*arr)[3][4] = new double[100][3][4];
+
+ {
+ Span<double> s(&arr[0][0][0], 10);
+ ASSERT_EQ(s.Length(), 10U);
+ ASSERT_EQ(s.data(), &arr[0][0][0]);
+ }
+
+ {
+ auto s = MakeSpan(&arr[0][0][0], 10);
+ ASSERT_EQ(s.Length(), 10U);
+ ASSERT_EQ(s.data(), &arr[0][0][0]);
+ }
+
+ delete[] arr;
+}
+
+SPAN_TEST(from_std_array_constructor)
+{
+ std::array<int, 4> arr = { { 1, 2, 3, 4 } };
+
+ {
+ Span<int> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+
+ Span<const int> cs{ arr };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(cs.data(), arr.data());
+ }
+
+ {
+ Span<int, 4> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+
+ Span<const int, 4> cs{ arr };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(cs.data(), arr.data());
+ }
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<int, 2> s{ arr };
+ ASSERT_EQ(s.size(), 2U);
+ ASSERT_EQ(s.data(), arr.data());
+
+ Span<const int, 2> cs{ arr };
+ ASSERT_EQ(cs.size(), 2U);
+ ASSERT_EQ(cs.data(), arr.data());
+ }
+
+ {
+ Span<int, 0> s{ arr };
+ ASSERT_EQ(s.size(), 0U);
+ ASSERT_EQ(s.data(), arr.data());
+
+ Span<const int, 0> cs{ arr };
+ ASSERT_EQ(cs.size(), 0U);
+ ASSERT_EQ(cs.data(), arr.data());
+ }
+
+ {
+ Span<int, 5> s{ arr };
+ }
+
+ {
+ auto get_an_array = []() -> std::array<int, 4> { return { 1, 2, 3, 4 }; };
+ auto take_a_Span = [](Span<int> s) { static_cast<void>(s); };
+ // try to take a temporary std::array
+ take_a_Span(get_an_array());
+ }
+#endif
+
+ {
+ auto get_an_array = []() -> std::array<int, 4> {
+ return { { 1, 2, 3, 4 } };
+ };
+ auto take_a_Span = [](Span<const int> s) { static_cast<void>(s); };
+ // try to take a temporary std::array
+ take_a_Span(get_an_array());
+ }
+
+ {
+ auto s = MakeSpan(arr);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+ }
+}
+
+SPAN_TEST(from_const_std_array_constructor)
+{
+ const std::array<int, 4> arr = { { 1, 2, 3, 4 } };
+
+ {
+ Span<const int> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+ }
+
+ {
+ Span<const int, 4> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+ }
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<const int, 2> s{ arr };
+ ASSERT_EQ(s.size(), 2U);
+ ASSERT_EQ(s.data(), arr.data());
+ }
+
+ {
+ Span<const int, 0> s{ arr };
+ ASSERT_EQ(s.size(), 0U);
+ ASSERT_EQ(s.data(), arr.data());
+ }
+
+ {
+ Span<const int, 5> s{ arr };
+ }
+#endif
+
+ {
+ auto get_an_array = []() -> const std::array<int, 4> {
+ return { { 1, 2, 3, 4 } };
+ };
+ auto take_a_Span = [](Span<const int> s) { static_cast<void>(s); };
+ // try to take a temporary std::array
+ take_a_Span(get_an_array());
+ }
+
+ {
+ auto s = MakeSpan(arr);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+ }
+}
+
+SPAN_TEST(from_std_array_const_constructor)
+{
+ std::array<const int, 4> arr = { { 1, 2, 3, 4 } };
+
+ {
+ Span<const int> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+ }
+
+ {
+ Span<const int, 4> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+ }
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<const int, 2> s{ arr };
+ ASSERT_EQ(s.size(), 2U);
+ ASSERT_EQ(s.data(), arr.data());
+ }
+
+ {
+ Span<const int, 0> s{ arr };
+ ASSERT_EQ(s.size(), 0U);
+ ASSERT_EQ(s.data(), arr.data());
+ }
+
+ {
+ Span<const int, 5> s{ arr };
+ }
+
+ {
+ Span<int, 4> s{ arr };
+ }
+#endif
+
+ {
+ auto s = MakeSpan(arr);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.size()));
+ ASSERT_EQ(s.data(), arr.data());
+ }
+}
+
+SPAN_TEST(from_mozilla_array_constructor)
+{
+ mozilla::Array<int, 4> arr(1, 2, 3, 4);
+
+ {
+ Span<int> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+
+ Span<const int> cs{ arr };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(cs.data(), &arr[0]);
+ }
+
+ {
+ Span<int, 4> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+
+ Span<const int, 4> cs{ arr };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(cs.data(), &arr[0]);
+ }
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<int, 2> s{ arr };
+ ASSERT_EQ(s.size(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+
+ Span<const int, 2> cs{ arr };
+ ASSERT_EQ(cs.size(), 2U);
+ ASSERT_EQ(cs.data(), &arr[0]);
+ }
+
+ {
+ Span<int, 0> s{ arr };
+ ASSERT_EQ(s.size(), 0U);
+ ASSERT_EQ(s.data(), &arr[0]);
+
+ Span<const int, 0> cs{ arr };
+ ASSERT_EQ(cs.size(), 0U);
+ ASSERT_EQ(cs.data(), &arr[0]);
+ }
+
+ {
+ Span<int, 5> s{ arr };
+ }
+
+ {
+ auto get_an_array = []() -> mozilla::Array<int, 4> {
+ return { 1, 2, 3, 4 };
+ };
+ auto take_a_Span = [](Span<int> s) { static_cast<void>(s); };
+ // try to take a temporary mozilla::Array
+ take_a_Span(get_an_array());
+ }
+#endif
+
+ {
+ auto get_an_array = []() -> mozilla::Array<int, 4> {
+ return { 1, 2, 3, 4 };
+ };
+ auto take_a_Span = [](Span<const int> s) { static_cast<void>(s); };
+ // try to take a temporary mozilla::Array
+ take_a_Span(get_an_array());
+ }
+
+ {
+ auto s = MakeSpan(arr);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+}
+
+SPAN_TEST(from_const_mozilla_array_constructor)
+{
+ const mozilla::Array<int, 4> arr(1, 2, 3, 4);
+
+ {
+ Span<const int> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<const int, 4> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<const int, 2> s{ arr };
+ ASSERT_EQ(s.size(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<const int, 0> s{ arr };
+ ASSERT_EQ(s.size(), 0U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<const int, 5> s{ arr };
+ }
+#endif
+
+#if 0
+ {
+ auto get_an_array = []() -> const mozilla::Array<int, 4> {
+ return { 1, 2, 3, 4 };
+ };
+ auto take_a_Span = [](Span<const int> s) { static_cast<void>(s); };
+ // try to take a temporary mozilla::Array
+ take_a_Span(get_an_array());
+ }
+#endif
+
+ {
+ auto s = MakeSpan(arr);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+}
+
+SPAN_TEST(from_mozilla_array_const_constructor)
+{
+ mozilla::Array<const int, 4> arr(1, 2, 3, 4);
+
+ {
+ Span<const int> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<const int, 4> s{ arr };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<const int, 2> s{ arr };
+ ASSERT_EQ(s.size(), 2U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<const int, 0> s{ arr };
+ ASSERT_EQ(s.size(), 0U);
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+
+ {
+ Span<const int, 5> s{ arr };
+ }
+
+ {
+ Span<int, 4> s{ arr };
+ }
+#endif
+
+ {
+ auto s = MakeSpan(arr);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(arr.cend() - arr.cbegin()));
+ ASSERT_EQ(s.data(), &arr[0]);
+ }
+}
+
+SPAN_TEST(from_container_constructor)
+{
+ std::vector<int> v = { 1, 2, 3 };
+ const std::vector<int> cv = v;
+
+ {
+ AssertSpanOfThreeInts(v);
+
+ Span<int> s{ v };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.size()));
+ ASSERT_EQ(s.data(), v.data());
+
+ Span<const int> cs{ v };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(v.size()));
+ ASSERT_EQ(cs.data(), v.data());
+ }
+
+ std::string str = "hello";
+ const std::string cstr = "hello";
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ Span<char> s{ str };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(str.size()));
+ ASSERT_EQ(s.data(), str.data());
+#endif
+ Span<const char> cs{ str };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(str.size()));
+ ASSERT_EQ(cs.data(), str.data());
+ }
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ Span<char> s{ cstr };
+#endif
+ Span<const char> cs{ cstr };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(cstr.size()));
+ ASSERT_EQ(cs.data(), cstr.data());
+ }
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ auto get_temp_vector = []() -> std::vector<int> { return {}; };
+ auto use_Span = [](Span<int> s) { static_cast<void>(s); };
+ use_Span(get_temp_vector());
+#endif
+ }
+
+ {
+ auto get_temp_vector = []() -> std::vector<int> { return {}; };
+ auto use_Span = [](Span<const int> s) { static_cast<void>(s); };
+ use_Span(get_temp_vector());
+ }
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ auto get_temp_string = []() -> std::string { return {}; };
+ auto use_Span = [](Span<char> s) { static_cast<void>(s); };
+ use_Span(get_temp_string());
+#endif
+ }
+
+ {
+ auto get_temp_string = []() -> std::string { return {}; };
+ auto use_Span = [](Span<const char> s) { static_cast<void>(s); };
+ use_Span(get_temp_string());
+ }
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ auto get_temp_vector = []() -> const std::vector<int> { return {}; };
+ auto use_Span = [](Span<const char> s) { static_cast<void>(s); };
+ use_Span(get_temp_vector());
+#endif
+ }
+
+ {
+ auto get_temp_string = []() -> const std::string { return {}; };
+ auto use_Span = [](Span<const char> s) { static_cast<void>(s); };
+ use_Span(get_temp_string());
+ }
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ std::map<int, int> m;
+ Span<int> s{ m };
+#endif
+ }
+
+ {
+ auto s = MakeSpan(v);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.size()));
+ ASSERT_EQ(s.data(), v.data());
+
+ auto cs = MakeSpan(cv);
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(cv.size()));
+ ASSERT_EQ(cs.data(), cv.data());
+ }
+}
+
+SPAN_TEST(from_xpcom_collections)
+{
+ {
+ nsTArray<int> v;
+ v.AppendElement(1);
+ v.AppendElement(2);
+ v.AppendElement(3);
+
+ AssertSpanOfThreeInts(v);
+
+ Span<int> s{ v };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(s.data(), v.Elements());
+ ASSERT_EQ(s[2], 3);
+
+ Span<const int> cs{ v };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(cs.data(), v.Elements());
+ ASSERT_EQ(cs[2], 3);
+ }
+ {
+ nsTArray<int> v;
+ v.AppendElement(1);
+ v.AppendElement(2);
+ v.AppendElement(3);
+
+ AssertSpanOfThreeInts(v);
+
+ auto s = MakeSpan(v);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(s.data(), v.Elements());
+ ASSERT_EQ(s[2], 3);
+ }
+ {
+ AutoTArray<int, 5> v;
+ v.AppendElement(1);
+ v.AppendElement(2);
+ v.AppendElement(3);
+
+ AssertSpanOfThreeInts(v);
+
+ Span<int> s{ v };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(s.data(), v.Elements());
+ ASSERT_EQ(s[2], 3);
+
+ Span<const int> cs{ v };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(cs.data(), v.Elements());
+ ASSERT_EQ(cs[2], 3);
+ }
+ {
+ AutoTArray<int, 5> v;
+ v.AppendElement(1);
+ v.AppendElement(2);
+ v.AppendElement(3);
+
+ AssertSpanOfThreeInts(v);
+
+ auto s = MakeSpan(v);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(s.data(), v.Elements());
+ ASSERT_EQ(s[2], 3);
+ }
+ {
+ FallibleTArray<int> v;
+ *(v.AppendElement(fallible)) = 1;
+ *(v.AppendElement(fallible)) = 2;
+ *(v.AppendElement(fallible)) = 3;
+
+ AssertSpanOfThreeInts(v);
+
+ Span<int> s{ v };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(s.data(), v.Elements());
+ ASSERT_EQ(s[2], 3);
+
+ Span<const int> cs{ v };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(cs.data(), v.Elements());
+ ASSERT_EQ(cs[2], 3);
+ }
+ {
+ FallibleTArray<int> v;
+ *(v.AppendElement(fallible)) = 1;
+ *(v.AppendElement(fallible)) = 2;
+ *(v.AppendElement(fallible)) = 3;
+
+ AssertSpanOfThreeInts(v);
+
+ auto s = MakeSpan(v);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(s.data(), v.Elements());
+ ASSERT_EQ(s[2], 3);
+ }
+ {
+ nsAutoString str;
+ str.AssignLiteral("abc");
+
+ AssertSpanOfThreeChar16s(str);
+ AssertSpanOfThreeChar16sViaString(str);
+
+ Span<char16_t> s{ str };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(str.Length()));
+ ASSERT_EQ(s.data(), str.BeginWriting());
+ ASSERT_EQ(s[2], 'c');
+
+ Span<const char16_t> cs{ str };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(str.Length()));
+ ASSERT_EQ(cs.data(), str.BeginReading());
+ ASSERT_EQ(cs[2], 'c');
+ }
+ {
+ nsAutoString str;
+ str.AssignLiteral("abc");
+
+ AssertSpanOfThreeChar16s(str);
+ AssertSpanOfThreeChar16sViaString(str);
+
+ auto s = MakeSpan(str);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(str.Length()));
+ ASSERT_EQ(s.data(), str.BeginWriting());
+ ASSERT_EQ(s[2], 'c');
+ }
+ {
+ nsAutoCString str;
+ str.AssignLiteral("abc");
+
+ AssertSpanOfThreeChars(str);
+ AssertSpanOfThreeCharsViaString(str);
+
+ Span<uint8_t> s{ str };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(str.Length()));
+ ASSERT_EQ(s.data(), reinterpret_cast<uint8_t*>(str.BeginWriting()));
+ ASSERT_EQ(s[2], 'c');
+
+ Span<const uint8_t> cs{ str };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(str.Length()));
+ ASSERT_EQ(cs.data(), reinterpret_cast<const uint8_t*>(str.BeginReading()));
+ ASSERT_EQ(cs[2], 'c');
+ }
+ {
+ nsAutoCString str;
+ str.AssignLiteral("abc");
+
+ AssertSpanOfThreeChars(str);
+ AssertSpanOfThreeCharsViaString(str);
+
+ auto s = MakeSpan(str);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(str.Length()));
+ ASSERT_EQ(s.data(), str.BeginWriting());
+ ASSERT_EQ(s[2], 'c');
+ }
+ {
+ nsTArray<int> v;
+ v.AppendElement(1);
+ v.AppendElement(2);
+ v.AppendElement(3);
+
+ Range<int> r(v.Elements(), v.Length());
+
+ AssertSpanOfThreeInts(r);
+
+ Span<int> s{ r };
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(s.data(), v.Elements());
+ ASSERT_EQ(s[2], 3);
+
+ Span<const int> cs{ r };
+ ASSERT_EQ(cs.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(cs.data(), v.Elements());
+ ASSERT_EQ(cs[2], 3);
+ }
+ {
+ nsTArray<int> v;
+ v.AppendElement(1);
+ v.AppendElement(2);
+ v.AppendElement(3);
+
+ Range<int> r(v.Elements(), v.Length());
+
+ AssertSpanOfThreeInts(r);
+
+ auto s = MakeSpan(r);
+ ASSERT_EQ(s.size(), narrow_cast<size_t>(v.Length()));
+ ASSERT_EQ(s.data(), v.Elements());
+ ASSERT_EQ(s[2], 3);
+ }
+}
+
+SPAN_TEST(from_cstring)
+{
+ {
+ const char* str = "abc";
+
+ auto cs = MakeCStringSpan(str);
+ ASSERT_EQ(cs.size(), 3U);
+ ASSERT_EQ(cs.data(), str);
+ ASSERT_EQ(cs[2], 'c');
+ }
+}
+
+SPAN_TEST(from_convertible_Span_constructor)
+{
+ {
+ Span<DerivedClass> avd;
+ Span<const DerivedClass> avcd = avd;
+ static_cast<void>(avcd);
+ }
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+ Span<DerivedClass> avd;
+ Span<BaseClass> avb = avd;
+ static_cast<void>(avb);
+#endif
+ }
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<int> s;
+ Span<unsigned int> s2 = s;
+ static_cast<void>(s2);
+ }
+
+ {
+ Span<int> s;
+ Span<const unsigned int> s2 = s;
+ static_cast<void>(s2);
+ }
+
+ {
+ Span<int> s;
+ Span<short> s2 = s;
+ static_cast<void>(s2);
+ }
+#endif
+}
+
+SPAN_TEST(copy_move_and_assignment)
+{
+ Span<int> s1;
+ ASSERT_TRUE(s1.empty());
+
+ int arr[] = { 3, 4, 5 };
+
+ Span<const int> s2 = arr;
+ ASSERT_EQ(s2.Length(), 3U);
+ ASSERT_EQ(s2.data(), &arr[0]);
+
+ s2 = s1;
+ ASSERT_TRUE(s2.empty());
+
+ auto get_temp_Span = [&]() -> Span<int> { return { &arr[1], 2 }; };
+ auto use_Span = [&](Span<const int> s) {
+ ASSERT_EQ(s.Length(), 2U);
+ ASSERT_EQ(s.data(), &arr[1]);
+ };
+ use_Span(get_temp_Span());
+
+ s1 = get_temp_Span();
+ ASSERT_EQ(s1.Length(), 2U);
+ ASSERT_EQ(s1.data(), &arr[1]);
+}
+
+SPAN_TEST(first)
+{
+ int arr[5] = { 1, 2, 3, 4, 5 };
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.First<2>().Length(), 2U);
+ ASSERT_EQ(av.First(2).Length(), 2U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.First<0>().Length(), 0U);
+ ASSERT_EQ(av.First(0).Length(), 0U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.First<5>().Length(), 5U);
+ ASSERT_EQ(av.First(5).Length(), 5U);
+ }
+
+#if 0
+ {
+ Span<int, 5> av = arr;
+#ifdef CONFIRM_COMPILATION_ERRORS
+    ASSERT_EQ(av.First<6>().Length(), 6U);
+    ASSERT_EQ(av.First<-1>().Length(), -1);
+#endif
+ CHECK_THROW(av.First(6).Length(), fail_fast);
+ }
+#endif
+
+ {
+ Span<int> av;
+ ASSERT_EQ(av.First<0>().Length(), 0U);
+ ASSERT_EQ(av.First(0).Length(), 0U);
+ }
+}
+
+SPAN_TEST(last)
+{
+ int arr[5] = { 1, 2, 3, 4, 5 };
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.Last<2>().Length(), 2U);
+ ASSERT_EQ(av.Last(2).Length(), 2U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.Last<0>().Length(), 0U);
+ ASSERT_EQ(av.Last(0).Length(), 0U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.Last<5>().Length(), 5U);
+ ASSERT_EQ(av.Last(5).Length(), 5U);
+ }
+
+#if 0
+ {
+ Span<int, 5> av = arr;
+#ifdef CONFIRM_COMPILATION_ERRORS
+    ASSERT_EQ(av.Last<6>().Length(), 6U);
+#endif
+ CHECK_THROW(av.Last(6).Length(), fail_fast);
+ }
+#endif
+
+ {
+ Span<int> av;
+ ASSERT_EQ(av.Last<0>().Length(), 0U);
+ ASSERT_EQ(av.Last(0).Length(), 0U);
+ }
+}
+
+SPAN_TEST(from_to)
+{
+ int arr[5] = { 1, 2, 3, 4, 5 };
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.From(3).Length(), 2U);
+ ASSERT_EQ(av.From(2)[1], 4);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.From(5).Length(), 0U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.From(0).Length(), 5U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.To(3).Length(), 3U);
+ ASSERT_EQ(av.To(3)[1], 2);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.To(0).Length(), 0U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.To(5).Length(), 5U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.FromTo(1, 4).Length(), 3U);
+ ASSERT_EQ(av.FromTo(1, 4)[1], 3);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.FromTo(2, 2).Length(), 0U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.FromTo(0, 5).Length(), 5U);
+ }
+}
+
+SPAN_TEST(Subspan)
+{
+ int arr[5] = { 1, 2, 3, 4, 5 };
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ((av.Subspan<2, 2>().Length()), 2U);
+ ASSERT_EQ(av.Subspan(2, 2).Length(), 2U);
+ ASSERT_EQ(av.Subspan(2, 3).Length(), 3U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ((av.Subspan<0, 0>().Length()), 0U);
+ ASSERT_EQ(av.Subspan(0, 0).Length(), 0U);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ((av.Subspan<0, 5>().Length()), 5U);
+ ASSERT_EQ(av.Subspan(0, 5).Length(), 5U);
+ CHECK_THROW(av.Subspan(0, 6).Length(), fail_fast);
+ CHECK_THROW(av.Subspan(1, 5).Length(), fail_fast);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ((av.Subspan<4, 0>().Length()), 0U);
+ ASSERT_EQ(av.Subspan(4, 0).Length(), 0U);
+ ASSERT_EQ(av.Subspan(5, 0).Length(), 0U);
+ CHECK_THROW(av.Subspan(6, 0).Length(), fail_fast);
+ }
+
+ {
+ Span<int> av;
+ ASSERT_EQ((av.Subspan<0, 0>().Length()), 0U);
+ ASSERT_EQ(av.Subspan(0, 0).Length(), 0U);
+ CHECK_THROW((av.Subspan<1, 0>().Length()), fail_fast);
+ }
+
+ {
+ Span<int> av;
+ ASSERT_EQ(av.Subspan(0).Length(), 0U);
+ CHECK_THROW(av.Subspan(1).Length(), fail_fast);
+ }
+
+ {
+ Span<int> av = arr;
+ ASSERT_EQ(av.Subspan(0).Length(), 5U);
+ ASSERT_EQ(av.Subspan(1).Length(), 4U);
+ ASSERT_EQ(av.Subspan(4).Length(), 1U);
+ ASSERT_EQ(av.Subspan(5).Length(), 0U);
+ CHECK_THROW(av.Subspan(6).Length(), fail_fast);
+ auto av2 = av.Subspan(1);
+ for (int i = 0; i < 4; ++i)
+ ASSERT_EQ(av2[i], i + 2);
+ }
+
+ {
+ Span<int, 5> av = arr;
+ ASSERT_EQ(av.Subspan(0).Length(), 5U);
+ ASSERT_EQ(av.Subspan(1).Length(), 4U);
+ ASSERT_EQ(av.Subspan(4).Length(), 1U);
+ ASSERT_EQ(av.Subspan(5).Length(), 0U);
+ CHECK_THROW(av.Subspan(6).Length(), fail_fast);
+ auto av2 = av.Subspan(1);
+ for (int i = 0; i < 4; ++i)
+ ASSERT_EQ(av2[i], i + 2);
+ }
+}
+
+SPAN_TEST(at_call)
+{
+ int arr[4] = { 1, 2, 3, 4 };
+
+ {
+ Span<int> s = arr;
+ ASSERT_EQ(s.at(0), 1);
+ CHECK_THROW(s.at(5), fail_fast);
+ }
+
+ {
+ int arr2d[2] = { 1, 6 };
+ Span<int, 2> s = arr2d;
+ ASSERT_EQ(s.at(0), 1);
+ ASSERT_EQ(s.at(1), 6);
+ CHECK_THROW(s.at(2), fail_fast);
+ }
+}
+
+SPAN_TEST(operator_function_call)
+{
+ int arr[4] = { 1, 2, 3, 4 };
+
+ {
+ Span<int> s = arr;
+ ASSERT_EQ(s(0), 1);
+ CHECK_THROW(s(5), fail_fast);
+ }
+
+ {
+ int arr2d[2] = { 1, 6 };
+ Span<int, 2> s = arr2d;
+ ASSERT_EQ(s(0), 1);
+ ASSERT_EQ(s(1), 6);
+ CHECK_THROW(s(2), fail_fast);
+ }
+}
+
+SPAN_TEST(iterator_default_init)
+{
+ Span<int>::iterator it1;
+ Span<int>::iterator it2;
+ ASSERT_EQ(it1, it2);
+}
+
+SPAN_TEST(const_iterator_default_init)
+{
+ Span<int>::const_iterator it1;
+ Span<int>::const_iterator it2;
+ ASSERT_EQ(it1, it2);
+}
+
+SPAN_TEST(iterator_conversions)
+{
+ Span<int>::iterator badIt;
+ Span<int>::const_iterator badConstIt;
+ ASSERT_EQ(badIt, badConstIt);
+
+ int a[] = { 1, 2, 3, 4 };
+ Span<int> s = a;
+
+ auto it = s.begin();
+ auto cit = s.cbegin();
+
+ ASSERT_EQ(it, cit);
+ ASSERT_EQ(cit, it);
+
+ Span<int>::const_iterator cit2 = it;
+ ASSERT_EQ(cit2, cit);
+
+ Span<int>::const_iterator cit3 = it + 4;
+ ASSERT_EQ(cit3, s.cend());
+}
+
+SPAN_TEST(iterator_comparisons)
+{
+ int a[] = { 1, 2, 3, 4 };
+ {
+ Span<int> s = a;
+ Span<int>::iterator it = s.begin();
+ auto it2 = it + 1;
+ Span<int>::const_iterator cit = s.cbegin();
+
+ ASSERT_EQ(it, cit);
+ ASSERT_EQ(cit, it);
+ ASSERT_EQ(it, it);
+ ASSERT_EQ(cit, cit);
+ ASSERT_EQ(cit, s.begin());
+ ASSERT_EQ(s.begin(), cit);
+ ASSERT_EQ(s.cbegin(), cit);
+ ASSERT_EQ(it, s.begin());
+ ASSERT_EQ(s.begin(), it);
+
+ ASSERT_NE(it, it2);
+ ASSERT_NE(it2, it);
+ ASSERT_NE(it, s.end());
+ ASSERT_NE(it2, s.end());
+ ASSERT_NE(s.end(), it);
+ ASSERT_NE(it2, cit);
+ ASSERT_NE(cit, it2);
+
+ ASSERT_LT(it, it2);
+ ASSERT_LE(it, it2);
+ ASSERT_LE(it2, s.end());
+ ASSERT_LT(it, s.end());
+ ASSERT_LE(it, cit);
+ ASSERT_LE(cit, it);
+ ASSERT_LT(cit, it2);
+ ASSERT_LE(cit, it2);
+ ASSERT_LT(cit, s.end());
+ ASSERT_LE(cit, s.end());
+
+ ASSERT_GT(it2, it);
+ ASSERT_GE(it2, it);
+ ASSERT_GT(s.end(), it2);
+ ASSERT_GE(s.end(), it2);
+ ASSERT_GT(it2, cit);
+ ASSERT_GE(it2, cit);
+ }
+}
+
+SPAN_TEST(begin_end)
+{
+ {
+ int a[] = { 1, 2, 3, 4 };
+ Span<int> s = a;
+
+ Span<int>::iterator it = s.begin();
+ Span<int>::iterator it2 = std::begin(s);
+ ASSERT_EQ(it, it2);
+
+ it = s.end();
+ it2 = std::end(s);
+ ASSERT_EQ(it, it2);
+ }
+
+ {
+ int a[] = { 1, 2, 3, 4 };
+ Span<int> s = a;
+
+ auto it = s.begin();
+ auto first = it;
+ ASSERT_EQ(it, first);
+ ASSERT_EQ(*it, 1);
+
+ auto beyond = s.end();
+ ASSERT_NE(it, beyond);
+ CHECK_THROW(*beyond, fail_fast);
+
+ ASSERT_EQ(beyond - first, 4U);
+ ASSERT_EQ(first - first, 0U);
+ ASSERT_EQ(beyond - beyond, 0U);
+
+ ++it;
+ ASSERT_EQ(it - first, 1U);
+ ASSERT_EQ(*it, 2);
+ *it = 22;
+ ASSERT_EQ(*it, 22);
+ ASSERT_EQ(beyond - it, 3U);
+
+ it = first;
+ ASSERT_EQ(it, first);
+ while (it != s.end()) {
+ *it = 5;
+ ++it;
+ }
+
+ ASSERT_EQ(it, beyond);
+ ASSERT_EQ(it - beyond, 0U);
+
+ for (auto& n : s) {
+ ASSERT_EQ(n, 5);
+ }
+ }
+}
+
+SPAN_TEST(cbegin_cend)
+{
+#if 0
+ {
+ int a[] = { 1, 2, 3, 4 };
+ Span<int> s = a;
+
+ Span<int>::const_iterator cit = s.cbegin();
+ Span<int>::const_iterator cit2 = std::cbegin(s);
+    ASSERT_EQ(cit, cit2);
+
+    cit = s.cend();
+    cit2 = std::cend(s);
+    ASSERT_EQ(cit, cit2);
+ }
+#endif
+ {
+ int a[] = { 1, 2, 3, 4 };
+ Span<int> s = a;
+
+ auto it = s.cbegin();
+ auto first = it;
+ ASSERT_EQ(it, first);
+ ASSERT_EQ(*it, 1);
+
+ auto beyond = s.cend();
+ ASSERT_NE(it, beyond);
+ CHECK_THROW(*beyond, fail_fast);
+
+ ASSERT_EQ(beyond - first, 4U);
+ ASSERT_EQ(first - first, 0U);
+ ASSERT_EQ(beyond - beyond, 0U);
+
+ ++it;
+ ASSERT_EQ(it - first, 1U);
+ ASSERT_EQ(*it, 2);
+ ASSERT_EQ(beyond - it, 3U);
+
+ int last = 0;
+ it = first;
+ ASSERT_EQ(it, first);
+ while (it != s.cend()) {
+ ASSERT_EQ(*it, last + 1);
+
+ last = *it;
+ ++it;
+ }
+
+ ASSERT_EQ(it, beyond);
+ ASSERT_EQ(it - beyond, 0U);
+ }
+}
+
+SPAN_TEST(rbegin_rend)
+{
+ {
+ int a[] = { 1, 2, 3, 4 };
+ Span<int> s = a;
+
+ auto it = s.rbegin();
+ auto first = it;
+ ASSERT_EQ(it, first);
+ ASSERT_EQ(*it, 4);
+
+ auto beyond = s.rend();
+ ASSERT_NE(it, beyond);
+ CHECK_THROW(*beyond, fail_fast);
+
+ ASSERT_EQ(beyond - first, 4U);
+ ASSERT_EQ(first - first, 0U);
+ ASSERT_EQ(beyond - beyond, 0U);
+
+ ++it;
+ ASSERT_EQ(it - first, 1U);
+ ASSERT_EQ(*it, 3);
+ *it = 22;
+ ASSERT_EQ(*it, 22);
+ ASSERT_EQ(beyond - it, 3U);
+
+ it = first;
+ ASSERT_EQ(it, first);
+ while (it != s.rend()) {
+ *it = 5;
+ ++it;
+ }
+
+ ASSERT_EQ(it, beyond);
+ ASSERT_EQ(it - beyond, 0U);
+
+ for (auto& n : s) {
+ ASSERT_EQ(n, 5);
+ }
+ }
+}
+
+SPAN_TEST(crbegin_crend)
+{
+ {
+ int a[] = { 1, 2, 3, 4 };
+ Span<int> s = a;
+
+ auto it = s.crbegin();
+ auto first = it;
+ ASSERT_EQ(it, first);
+ ASSERT_EQ(*it, 4);
+
+ auto beyond = s.crend();
+ ASSERT_NE(it, beyond);
+ CHECK_THROW(*beyond, fail_fast);
+
+ ASSERT_EQ(beyond - first, 4U);
+ ASSERT_EQ(first - first, 0U);
+ ASSERT_EQ(beyond - beyond, 0U);
+
+ ++it;
+ ASSERT_EQ(it - first, 1U);
+ ASSERT_EQ(*it, 3);
+ ASSERT_EQ(beyond - it, 3U);
+
+ it = first;
+ ASSERT_EQ(it, first);
+ int last = 5;
+ while (it != s.crend()) {
+ ASSERT_EQ(*it, last - 1);
+ last = *it;
+
+ ++it;
+ }
+
+ ASSERT_EQ(it, beyond);
+ ASSERT_EQ(it - beyond, 0U);
+ }
+}
+
+SPAN_TEST(comparison_operators)
+{
+ {
+ Span<int> s1 = nullptr;
+ Span<int> s2 = nullptr;
+ ASSERT_EQ(s1, s2);
+ ASSERT_FALSE(s1 != s2);
+ ASSERT_FALSE(s1 < s2);
+ ASSERT_LE(s1, s2);
+ ASSERT_FALSE(s1 > s2);
+ ASSERT_GE(s1, s2);
+ ASSERT_EQ(s2, s1);
+ ASSERT_FALSE(s2 != s1);
+ ASSERT_FALSE(s2 < s1);
+ ASSERT_LE(s2, s1);
+ ASSERT_FALSE(s2 > s1);
+ ASSERT_GE(s2, s1);
+ }
+
+ {
+ int arr[] = { 2, 1 };
+ Span<int> s1 = arr;
+ Span<int> s2 = arr;
+
+ ASSERT_EQ(s1, s2);
+ ASSERT_FALSE(s1 != s2);
+ ASSERT_FALSE(s1 < s2);
+ ASSERT_LE(s1, s2);
+ ASSERT_FALSE(s1 > s2);
+ ASSERT_GE(s1, s2);
+ ASSERT_EQ(s2, s1);
+ ASSERT_FALSE(s2 != s1);
+ ASSERT_FALSE(s2 < s1);
+ ASSERT_LE(s2, s1);
+ ASSERT_FALSE(s2 > s1);
+ ASSERT_GE(s2, s1);
+ }
+
+ {
+ int arr[] = { 2, 1 }; // bigger
+
+ Span<int> s1 = nullptr;
+ Span<int> s2 = arr;
+
+ ASSERT_NE(s1, s2);
+ ASSERT_NE(s2, s1);
+ ASSERT_NE(s1, s2);
+ ASSERT_NE(s2, s1);
+ ASSERT_LT(s1, s2);
+ ASSERT_FALSE(s2 < s1);
+ ASSERT_LE(s1, s2);
+ ASSERT_FALSE(s2 <= s1);
+ ASSERT_GT(s2, s1);
+ ASSERT_FALSE(s1 > s2);
+ ASSERT_GE(s2, s1);
+ ASSERT_FALSE(s1 >= s2);
+ }
+
+ {
+ int arr1[] = { 1, 2 };
+ int arr2[] = { 1, 2 };
+ Span<int> s1 = arr1;
+ Span<int> s2 = arr2;
+
+ ASSERT_EQ(s1, s2);
+ ASSERT_FALSE(s1 != s2);
+ ASSERT_FALSE(s1 < s2);
+ ASSERT_LE(s1, s2);
+ ASSERT_FALSE(s1 > s2);
+ ASSERT_GE(s1, s2);
+ ASSERT_EQ(s2, s1);
+ ASSERT_FALSE(s2 != s1);
+ ASSERT_FALSE(s2 < s1);
+ ASSERT_LE(s2, s1);
+ ASSERT_FALSE(s2 > s1);
+ ASSERT_GE(s2, s1);
+ }
+
+ {
+ int arr[] = { 1, 2, 3 };
+
+ AssertSpanOfThreeInts(arr);
+
+ Span<int> s1 = { &arr[0], 2 }; // shorter
+ Span<int> s2 = arr; // longer
+
+ ASSERT_NE(s1, s2);
+ ASSERT_NE(s2, s1);
+ ASSERT_NE(s1, s2);
+ ASSERT_NE(s2, s1);
+ ASSERT_LT(s1, s2);
+ ASSERT_FALSE(s2 < s1);
+ ASSERT_LE(s1, s2);
+ ASSERT_FALSE(s2 <= s1);
+ ASSERT_GT(s2, s1);
+ ASSERT_FALSE(s1 > s2);
+ ASSERT_GE(s2, s1);
+ ASSERT_FALSE(s1 >= s2);
+ }
+
+ {
+ int arr1[] = { 1, 2 }; // smaller
+ int arr2[] = { 2, 1 }; // bigger
+
+ Span<int> s1 = arr1;
+ Span<int> s2 = arr2;
+
+ ASSERT_NE(s1, s2);
+ ASSERT_NE(s2, s1);
+ ASSERT_NE(s1, s2);
+ ASSERT_NE(s2, s1);
+ ASSERT_LT(s1, s2);
+ ASSERT_FALSE(s2 < s1);
+ ASSERT_LE(s1, s2);
+ ASSERT_FALSE(s2 <= s1);
+ ASSERT_GT(s2, s1);
+ ASSERT_FALSE(s1 > s2);
+ ASSERT_GE(s2, s1);
+ ASSERT_FALSE(s1 >= s2);
+ }
+}
+
+SPAN_TEST(as_bytes)
+{
+ int a[] = { 1, 2, 3, 4 };
+
+ {
+ Span<const int> s = a;
+ ASSERT_EQ(s.Length(), 4U);
+ Span<const uint8_t> bs = AsBytes(s);
+ ASSERT_EQ(static_cast<const void*>(bs.data()),
+ static_cast<const void*>(s.data()));
+ ASSERT_EQ(bs.Length(), s.LengthBytes());
+ }
+
+ {
+ Span<int> s;
+ auto bs = AsBytes(s);
+ ASSERT_EQ(bs.Length(), s.Length());
+ ASSERT_EQ(bs.Length(), 0U);
+ ASSERT_EQ(bs.size_bytes(), 0U);
+ ASSERT_EQ(static_cast<const void*>(bs.data()),
+ static_cast<const void*>(s.data()));
+ ASSERT_EQ(bs.data(), nullptr);
+ }
+
+ {
+ Span<int> s = a;
+ auto bs = AsBytes(s);
+ ASSERT_EQ(static_cast<const void*>(bs.data()),
+ static_cast<const void*>(s.data()));
+ ASSERT_EQ(bs.Length(), s.LengthBytes());
+ }
+}
+
+SPAN_TEST(as_writable_bytes)
+{
+ int a[] = { 1, 2, 3, 4 };
+
+ {
+#ifdef CONFIRM_COMPILATION_ERRORS
+    // you should not be able to get writable bytes for const objects
+    Span<const int> s = a;
+    ASSERT_EQ(s.Length(), 4U);
+    Span<const uint8_t> bs = AsWritableBytes(s);
+ ASSERT_EQ(static_cast<void*>(bs.data()), static_cast<void*>(s.data()));
+ ASSERT_EQ(bs.Length(), s.LengthBytes());
+#endif
+ }
+
+ {
+ Span<int> s;
+ auto bs = AsWritableBytes(s);
+ ASSERT_EQ(bs.Length(), s.Length());
+ ASSERT_EQ(bs.Length(), 0U);
+ ASSERT_EQ(bs.size_bytes(), 0U);
+ ASSERT_EQ(static_cast<void*>(bs.data()), static_cast<void*>(s.data()));
+ ASSERT_EQ(bs.data(), nullptr);
+ }
+
+ {
+ Span<int> s = a;
+ auto bs = AsWritableBytes(s);
+ ASSERT_EQ(static_cast<void*>(bs.data()), static_cast<void*>(s.data()));
+ ASSERT_EQ(bs.Length(), s.LengthBytes());
+ }
+}
+
+SPAN_TEST(fixed_size_conversions)
+{
+ int arr[] = { 1, 2, 3, 4 };
+
+  // converting to a Span from an equal-size array is ok
+ Span<int, 4> s4 = arr;
+ ASSERT_EQ(s4.Length(), 4U);
+
+  // converting to a dynamic-extent Span is always ok
+ {
+ Span<int> s = s4;
+ ASSERT_EQ(s.Length(), s4.Length());
+ static_cast<void>(s);
+ }
+
+// initialization or assignment to static Span that REDUCES size is NOT ok
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<int, 2> s = arr;
+ }
+ {
+ Span<int, 2> s2 = s4;
+ static_cast<void>(s2);
+ }
+#endif
+
+#if 0
+ // even when done dynamically
+ {
+ Span<int> s = arr;
+ auto f = [&]() {
+ Span<int, 2> s2 = s;
+ static_cast<void>(s2);
+ };
+ CHECK_THROW(f(), fail_fast);
+ }
+#endif
+
+ // but doing so explicitly is ok
+
+ // you can convert statically
+ {
+ Span<int, 2> s2 = { arr, 2 };
+ static_cast<void>(s2);
+ }
+ {
+ Span<int, 1> s1 = s4.First<1>();
+ static_cast<void>(s1);
+ }
+
+ // ...or dynamically
+ {
+ // NB: implicit conversion to Span<int,1> from Span<int>
+ Span<int, 1> s1 = s4.First(1);
+ static_cast<void>(s1);
+ }
+
+#if 0
+ // initialization or assignment to static Span that requires size INCREASE is not ok.
+ int arr2[2] = {1, 2};
+#endif
+
+#ifdef CONFIRM_COMPILATION_ERRORS
+ {
+ Span<int, 4> s3 = arr2;
+ }
+ {
+ Span<int, 2> s2 = arr2;
+ Span<int, 4> s4a = s2;
+ }
+#endif
+
+#if 0
+ {
+ auto f = [&]() {
+ Span<int, 4> _s4 = {arr2, 2};
+ static_cast<void>(_s4);
+ };
+ CHECK_THROW(f(), fail_fast);
+ }
+
+  // this should fail - we are trying to assign a small dynamic Span to a larger fixed-size one
+ Span<int> av = arr2;
+ auto f = [&]() {
+ Span<int, 4> _s4 = av;
+ static_cast<void>(_s4);
+ };
+ CHECK_THROW(f(), fail_fast);
+#endif
+}
+
+#if 0
+SPAN_TEST(interop_with_std_regex)
+{
+  char lat[] = { '1', '2', '3', '4', '5', '6', 'E', 'F', 'G' };
+  Span<char> s = lat;
+  auto f_it = s.begin() + 7;
+
+  std::match_results<Span<char>::iterator> match;
+
+  std::regex_match(s.begin(), s.end(), match, std::regex(".*"));
+  ASSERT_TRUE(match.ready());
+  ASSERT_TRUE(!match.empty());
+  ASSERT_TRUE(match[0].matched);
+  ASSERT_EQ(match[0].first, s.begin());
+  ASSERT_EQ(match[0].second, s.end());
+
+  std::regex_search(s.begin(), s.end(), match, std::regex("F"));
+  ASSERT_TRUE(match.ready());
+  ASSERT_TRUE(!match.empty());
+  ASSERT_TRUE(match[0].matched);
+  ASSERT_EQ(match[0].first, f_it);
+  ASSERT_EQ(match[0].second, (f_it + 1));
+}
+
+SPAN_TEST(interop_with_gsl_at)
+{
+  int arr[5] = { 1, 2, 3, 4, 5 };
+  Span<int> s{ arr };
+  ASSERT_EQ(at(s, 0), 1);
+  ASSERT_EQ(at(s, 1), 2);
+}
+#endif
+
+SPAN_TEST(default_constructible)
+{
+ ASSERT_TRUE((std::is_default_constructible<Span<int>>::value));
+ ASSERT_TRUE((std::is_default_constructible<Span<int, 0>>::value));
+ ASSERT_TRUE((!std::is_default_constructible<Span<int, 42>>::value));
+}
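
[Illustration only, not part of the patch: the gtest above is the reference for how mozilla::Span is meant to be consumed. A minimal usage sketch with a hypothetical helper, assuming only "mozilla/Span.h" and nsTArray.h as already included by the test:]

  #include "mozilla/Span.h"
  #include "nsTArray.h"

  // Sums the first two elements of any contiguous int storage handed in.
  // First(2) is bounds-checked; passing fewer than two elements trips the
  // same release-assert path the CHECK_THROW cases above exercise.
  static int SumFirstTwo(mozilla::Span<const int> aInts)
  {
    int sum = 0;
    for (int value : aInts.First(2)) {
      sum += value;
    }
    return sum;
  }

  // Callers can pass a C array, an nsTArray, or pointer + length without
  // copying, mirroring the constructors tested above:
  //   int arr[4] = { 1, 2, 3, 4 };
  //   SumFirstTwo(arr);
  //   SumFirstTwo(mozilla::MakeSpan(&arr[0], 4));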
diff --git a/mfbt/tests/gtest/moz.build b/mfbt/tests/gtest/moz.build
new file mode 100644
index 000000000..bd559d60b
--- /dev/null
+++ b/mfbt/tests/gtest/moz.build
@@ -0,0 +1,15 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+UNIFIED_SOURCES += [
+ 'TestSpan.cpp',
+]
+
+#LOCAL_INCLUDES += [
+# '../../base',
+#]
+
+FINAL_LIBRARY = 'xul-gtest'
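
[Usage note, not part of the patch: with TestSpan.cpp in UNIFIED_SOURCES and FINAL_LIBRARY set to 'xul-gtest', the test is linked into the gtest library like the other xul-gtest suites. On trees where the mach gtest target is available it should be runnable with a filter along the lines of `./mach gtest "*Span*"`; the exact invocation depends on the local build setup.]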
diff --git a/mfbt/tests/moz.build b/mfbt/tests/moz.build
index bd25ab1d0..e69de5d75 100644
--- a/mfbt/tests/moz.build
+++ b/mfbt/tests/moz.build
@@ -4,6 +4,11 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+if not CONFIG['JS_STANDALONE']:
+ TEST_DIRS += [
+ 'gtest',
+ ]
+
CppUnitTests([
'TestArray',
'TestArrayUtils',
diff --git a/modules/libjar/nsJARChannel.cpp b/modules/libjar/nsJARChannel.cpp
index ee60602da..2f721fa3f 100644
--- a/modules/libjar/nsJARChannel.cpp
+++ b/modules/libjar/nsJARChannel.cpp
@@ -995,6 +995,25 @@ nsJARChannel::OnStartRequest(nsIRequest *req, nsISupports *ctx)
mRequest = req;
nsresult rv = mListener->OnStartRequest(this, mListenerContext);
mRequest = nullptr;
+ NS_ENSURE_SUCCESS(rv, rv);
+
+ // Restrict loadable content types.
+ nsAutoCString contentType;
+ GetContentType(contentType);
+ auto contentPolicyType = mLoadInfo->GetExternalContentPolicyType();
+ if (contentType.Equals(APPLICATION_HTTP_INDEX_FORMAT) &&
+ contentPolicyType != nsIContentPolicy::TYPE_DOCUMENT &&
+ contentPolicyType != nsIContentPolicy::TYPE_FETCH) {
+ return NS_ERROR_CORRUPTED_CONTENT;
+ }
+ if (contentPolicyType == nsIContentPolicy::TYPE_STYLESHEET &&
+ !contentType.EqualsLiteral(TEXT_CSS)) {
+ return NS_ERROR_CORRUPTED_CONTENT;
+ }
+ if (contentPolicyType == nsIContentPolicy::TYPE_SCRIPT &&
+ !nsContentUtils::IsJavascriptMIMEType(NS_ConvertUTF8toUTF16(contentType))) {
+ return NS_ERROR_CORRUPTED_CONTENT;
+ }
return rv;
}
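
[Illustration only, not part of the patch: the checks added to OnStartRequest above boil down to a small policy predicate. Restated as a hypothetical standalone helper, using only the constants and headers nsJARChannel.cpp already pulls in:]

  // Returns true if a jar: response with this MIME type may be delivered to a
  // consumer of the given external content policy type.
  static bool IsAllowedJarContent(const nsACString& aContentType,
                                  nsContentPolicyType aPolicyType)
  {
    // http-index listings are only useful to document/fetch consumers.
    if (aContentType.Equals(APPLICATION_HTTP_INDEX_FORMAT) &&
        aPolicyType != nsIContentPolicy::TYPE_DOCUMENT &&
        aPolicyType != nsIContentPolicy::TYPE_FETCH) {
      return false;
    }
    // Stylesheet loads must actually be text/css.
    if (aPolicyType == nsIContentPolicy::TYPE_STYLESHEET &&
        !aContentType.EqualsLiteral(TEXT_CSS)) {
      return false;
    }
    // Script loads must carry a JavaScript MIME type.
    if (aPolicyType == nsIContentPolicy::TYPE_SCRIPT &&
        !nsContentUtils::IsJavascriptMIMEType(
          NS_ConvertUTF8toUTF16(aContentType))) {
      return false;
    }
    return true;
  }

  // The hunk above returns NS_ERROR_CORRUPTED_CONTENT exactly when this
  // predicate would return false.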
diff --git a/modules/libpref/init/all.js b/modules/libpref/init/all.js
index 32c59e737..957affa79 100644
--- a/modules/libpref/init/all.js
+++ b/modules/libpref/init/all.js
@@ -117,6 +117,9 @@ pref("browser.cache.compression_level", 0);
// Don't show "Open with" option on download dialog if true.
pref("browser.download.forbid_open_with", false);
+// Save download locations as a content preference
+pref("browser.download.lastDir.savePerSite", true);
+
#ifdef XP_WIN
// Save internet zone information on downloaded files:
// 0 => Never
@@ -128,8 +131,6 @@ pref("browser.download.saveZoneInformation", 2);
// Whether or not testing features are enabled.
pref("dom.quotaManager.testing", false);
-// Whether or not indexedDB is enabled.
-pref("dom.indexedDB.enabled", true);
// Whether or not indexedDB experimental features are enabled.
pref("dom.indexedDB.experimental", false);
// Enable indexedDB logging.
@@ -584,6 +585,10 @@ pref("media.mediasource.webm.enabled", true);
#endif
pref("media.mediasource.webm.audio.enabled", true);
+#ifdef MOZ_AV1
+pref("media.av1.enabled", false);
+#endif
+
// Use new MediaFormatReader architecture for plain ogg.
pref("media.flac.enabled", true);
pref("media.ogg.flac.enabled", true);
@@ -2214,9 +2219,10 @@ pref("ui.key.contentAccess", 5);
pref("ui.key.menuAccessKeyFocuses", false); // overridden below
pref("ui.key.saveLink.shift", true); // true = shift, false = meta
-// When true, overrides OS convention to lock content scrolling
+// When true, overrides Windows OS convention to lock content scrolling
// if a contextual menu is open.
-pref("ui.menu.allow_content_scroll", false);
+// XXX: Only effective on Windows for now!
+pref("ui.menu.allow_content_scroll", true);
// Disable page loading activity cursor by default.
pref("ui.use_activity_cursor", false);
@@ -2432,7 +2438,7 @@ pref("layout.word_select.stop_at_punctuation", true);
pref("layout.selection.caret_style", 0);
// pref to report CSS errors to the error console
-pref("layout.css.report_errors", true);
+pref("layout.css.report_errors", false);
// Should the :visited selector ever match (otherwise :link matches instead)?
pref("layout.css.visited_links_enabled", true);
@@ -5400,6 +5406,9 @@ pref("plugins.navigator_hide_disabled_flash", false);
// Disable browser frames by default
pref("dom.mozBrowserFramesEnabled", false);
+// Thick caret when behind CJK characters
+pref("layout.cjkthickcaret", true);
+
// Is support for 'color-adjust' CSS property enabled?
pref("layout.css.color-adjust.enabled", true);
diff --git a/netwerk/base/nsIOService.cpp b/netwerk/base/nsIOService.cpp
index e13541acf..435294315 100644
--- a/netwerk/base/nsIOService.cpp
+++ b/netwerk/base/nsIOService.cpp
@@ -789,7 +789,9 @@ nsIOService::NewChannelFromURIWithProxyFlagsInternal(nsIURI* aURI,
// creating a new channel by calling NewChannel().
if (NS_FAILED(rv)) {
rv = handler->NewChannel(aURI, getter_AddRefs(channel));
- NS_ENSURE_SUCCESS(rv, rv);
+ if (NS_FAILED(rv)) {
+ return rv;
+ }
// The protocol handler does not implement NewChannel2, so
// maybe we need to wrap the channel (see comment in MaybeWrap
// function).
diff --git a/netwerk/base/nsNetUtilInlines.h b/netwerk/base/nsNetUtilInlines.h
index 7003814d5..b831ec2e7 100644
--- a/netwerk/base/nsNetUtilInlines.h
+++ b/netwerk/base/nsNetUtilInlines.h
@@ -224,7 +224,9 @@ NS_NewChannelInternal(nsIChannel **outChannel,
aUri,
aLoadInfo,
getter_AddRefs(channel));
- NS_ENSURE_SUCCESS(rv, rv);
+ if (NS_FAILED(rv)) {
+ return rv;
+ }
if (aLoadGroup) {
rv = channel->SetLoadGroup(aLoadGroup);
diff --git a/netwerk/base/nsSocketTransportService2.cpp b/netwerk/base/nsSocketTransportService2.cpp
index 72afdc9e1..739388b0f 100644
--- a/netwerk/base/nsSocketTransportService2.cpp
+++ b/netwerk/base/nsSocketTransportService2.cpp
@@ -471,9 +471,6 @@ nsSocketTransportService::Poll(uint32_t *interval,
PRIntervalTime ts = PR_IntervalNow();
TimeStamp pollStart;
- if (mTelemetryEnabledPref) {
- pollStart = TimeStamp::NowLoRes();
- }
SOCKET_LOG((" timeout = %i milliseconds\n",
PR_IntervalToMilliseconds(pollTimeout)));
@@ -481,10 +478,6 @@ nsSocketTransportService::Poll(uint32_t *interval,
PRIntervalTime passedInterval = PR_IntervalNow() - ts;
- if (mTelemetryEnabledPref && !pollStart.IsNull()) {
- *pollDuration = TimeStamp::NowLoRes() - pollStart;
- }
-
SOCKET_LOG((" ...returned after %i milliseconds\n",
PR_IntervalToMilliseconds(passedInterval)));
@@ -858,20 +851,9 @@ nsSocketTransportService::Run()
// make sure the pseudo random number generator is seeded on this thread
srand(static_cast<unsigned>(PR_Now()));
- // For the calculation of the duration of the last cycle (i.e. the last for-loop
- // iteration before shutdown).
- TimeStamp startOfCycleForLastCycleCalc;
- int numberOfPendingEventsLastCycle;
-
- // For measuring of the poll iteration duration without time spent blocked
- // in poll().
- TimeStamp pollCycleStart;
// Time blocked in poll().
TimeDuration singlePollDuration;
- // For calculating the time needed for a new element to run.
- TimeStamp startOfIteration;
- TimeStamp startOfNextIteration;
int numberOfPendingEvents;
// If there is too many pending events queued, we will run some poll()
@@ -883,18 +865,9 @@ nsSocketTransportService::Run()
bool pendingEvents = false;
numberOfPendingEvents = 0;
- numberOfPendingEventsLastCycle = 0;
- if (mTelemetryEnabledPref) {
- startOfCycleForLastCycleCalc = TimeStamp::NowLoRes();
- startOfNextIteration = TimeStamp::NowLoRes();
- }
pollDuration = 0;
do {
- if (mTelemetryEnabledPref) {
- pollCycleStart = TimeStamp::NowLoRes();
- }
-
DoPollIteration(&singlePollDuration);
mRawThread->HasPendingEvents(&pendingEvents);
@@ -909,15 +882,6 @@ nsSocketTransportService::Run()
} else {
mServingPendingQueue = true;
}
-
- if (mTelemetryEnabledPref) {
- startOfIteration = startOfNextIteration;
- // Everything that comes after this point will
- // be served in the next iteration. If no even
- // arrives, startOfNextIteration will be reset at the
- // beginning of each for-loop.
- startOfNextIteration = TimeStamp::NowLoRes();
- }
}
TimeStamp eventQueueStart = TimeStamp::NowLoRes();
do {
diff --git a/netwerk/base/nsUDPSocket.cpp b/netwerk/base/nsUDPSocket.cpp
index 24f3954cb..445b62d9c 100644
--- a/netwerk/base/nsUDPSocket.cpp
+++ b/netwerk/base/nsUDPSocket.cpp
@@ -770,12 +770,6 @@ nsUDPSocket::CloseSocket()
      // If shutdown lasts too long, let the socket leak and do not close it.
UDPSOCKET_LOG(("Intentional leak"));
} else {
-
- PRIntervalTime closeStarted = 0;
- if (gSocketTransportService->IsTelemetryEnabledAndNotSleepPhase()) {
- closeStarted = PR_IntervalNow();
- }
-
PR_Close(mFD);
}
mFD = nullptr;
diff --git a/netwerk/cache/nsCacheEntryDescriptor.cpp b/netwerk/cache/nsCacheEntryDescriptor.cpp
index 64765f8aa..35ea22d55 100644
--- a/netwerk/cache/nsCacheEntryDescriptor.cpp
+++ b/netwerk/cache/nsCacheEntryDescriptor.cpp
@@ -43,7 +43,7 @@ public:
nsresult status = NS_OK;
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSASYNCDOOMEVENT_RUN));
+ nsCacheServiceAutoLock lock;
if (mDescriptor->mCacheEntry) {
status = nsCacheService::gService->DoomEntry_Internal(
@@ -113,7 +113,7 @@ nsCacheEntryDescriptor::GetClientID(char ** result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETCLIENTID));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
return ClientIDFromCacheKey(*(mCacheEntry->Key()), result);
@@ -124,7 +124,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetDeviceID(char ** aDeviceID)
{
NS_ENSURE_ARG_POINTER(aDeviceID);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETDEVICEID));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
const char* deviceID = mCacheEntry->GetDeviceID();
@@ -141,7 +141,7 @@ nsCacheEntryDescriptor::GetDeviceID(char ** aDeviceID)
NS_IMETHODIMP
nsCacheEntryDescriptor::GetKey(nsACString &result)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETKEY));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
return ClientKeyFromCacheKey(*(mCacheEntry->Key()), result);
@@ -152,7 +152,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetFetchCount(int32_t *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETFETCHCOUNT));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->FetchCount();
@@ -164,7 +164,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetLastFetched(uint32_t *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETLASTFETCHED));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->LastFetched();
@@ -176,7 +176,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetLastModified(uint32_t *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETLASTMODIFIED));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->LastModified();
@@ -188,7 +188,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetExpirationTime(uint32_t *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETEXPIRATIONTIME));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->ExpirationTime();
@@ -199,7 +199,7 @@ nsCacheEntryDescriptor::GetExpirationTime(uint32_t *result)
NS_IMETHODIMP
nsCacheEntryDescriptor::SetExpirationTime(uint32_t expirationTime)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_SETEXPIRATIONTIME));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
mCacheEntry->SetExpirationTime(expirationTime);
@@ -211,7 +211,7 @@ nsCacheEntryDescriptor::SetExpirationTime(uint32_t expirationTime)
NS_IMETHODIMP nsCacheEntryDescriptor::IsStreamBased(bool *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_ISSTREAMBASED));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->IsStreamData();
@@ -221,7 +221,7 @@ NS_IMETHODIMP nsCacheEntryDescriptor::IsStreamBased(bool *result)
NS_IMETHODIMP nsCacheEntryDescriptor::GetPredictedDataSize(int64_t *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETPREDICTEDDATASIZE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->PredictedDataSize();
@@ -231,7 +231,7 @@ NS_IMETHODIMP nsCacheEntryDescriptor::GetPredictedDataSize(int64_t *result)
NS_IMETHODIMP nsCacheEntryDescriptor::SetPredictedDataSize(int64_t
predictedSize)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_SETPREDICTEDDATASIZE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
mCacheEntry->SetPredictedDataSize(predictedSize);
@@ -241,7 +241,7 @@ NS_IMETHODIMP nsCacheEntryDescriptor::SetPredictedDataSize(int64_t
NS_IMETHODIMP nsCacheEntryDescriptor::GetDataSize(uint32_t *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETDATASIZE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
const char* val = mCacheEntry->GetMetaDataElement("uncompressed-len");
@@ -258,7 +258,7 @@ NS_IMETHODIMP nsCacheEntryDescriptor::GetDataSize(uint32_t *result)
NS_IMETHODIMP nsCacheEntryDescriptor::GetStorageDataSize(uint32_t *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETSTORAGEDATASIZE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->DataSize();
@@ -270,7 +270,7 @@ NS_IMETHODIMP nsCacheEntryDescriptor::GetStorageDataSize(uint32_t *result)
nsresult
nsCacheEntryDescriptor::RequestDataSizeChange(int32_t deltaSize)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_REQUESTDATASIZECHANGE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
nsresult rv;
@@ -288,7 +288,7 @@ nsCacheEntryDescriptor::RequestDataSizeChange(int32_t deltaSize)
NS_IMETHODIMP
nsCacheEntryDescriptor::SetDataSize(uint32_t dataSize)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_SETDATASIZE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
// XXX review for signed/unsigned math errors
@@ -317,7 +317,7 @@ nsCacheEntryDescriptor::OpenInputStream(uint32_t offset, nsIInputStream ** resul
nsInputStreamWrapper* cacheInput = nullptr;
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_OPENINPUTSTREAM));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
if (!mCacheEntry->IsStreamData()) return NS_ERROR_CACHE_DATA_IS_NOT_STREAM;
@@ -352,7 +352,7 @@ nsCacheEntryDescriptor::OpenOutputStream(uint32_t offset, nsIOutputStream ** res
nsOutputStreamWrapper* cacheOutput = nullptr;
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_OPENOUTPUTSTREAM));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
if (!mCacheEntry->IsStreamData()) return NS_ERROR_CACHE_DATA_IS_NOT_STREAM;
@@ -390,7 +390,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetCacheElement(nsISupports ** result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETCACHEELEMENT));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
if (mCacheEntry->IsStreamData()) return NS_ERROR_CACHE_DATA_IS_STREAM;
@@ -402,7 +402,7 @@ nsCacheEntryDescriptor::GetCacheElement(nsISupports ** result)
NS_IMETHODIMP
nsCacheEntryDescriptor::SetCacheElement(nsISupports * cacheElement)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_SETCACHEELEMENT));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
if (mCacheEntry->IsStreamData()) return NS_ERROR_CACHE_DATA_IS_STREAM;
@@ -423,7 +423,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetStoragePolicy(nsCacheStoragePolicy *result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETSTORAGEPOLICY));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->StoragePolicy();
@@ -434,7 +434,7 @@ nsCacheEntryDescriptor::GetStoragePolicy(nsCacheStoragePolicy *result)
NS_IMETHODIMP
nsCacheEntryDescriptor::SetStoragePolicy(nsCacheStoragePolicy policy)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_SETSTORAGEPOLICY));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
// XXX validate policy against session?
@@ -461,7 +461,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetFile(nsIFile ** result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETFILE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
return nsCacheService::GetFileForEntry(mCacheEntry, result);
@@ -472,7 +472,7 @@ NS_IMETHODIMP
nsCacheEntryDescriptor::GetSecurityInfo(nsISupports ** result)
{
NS_ENSURE_ARG_POINTER(result);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETSECURITYINFO));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
*result = mCacheEntry->SecurityInfo();
@@ -484,7 +484,7 @@ nsCacheEntryDescriptor::GetSecurityInfo(nsISupports ** result)
NS_IMETHODIMP
nsCacheEntryDescriptor::SetSecurityInfo(nsISupports * securityInfo)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_SETSECURITYINFO));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
mCacheEntry->SetSecurityInfo(securityInfo);
@@ -496,7 +496,7 @@ nsCacheEntryDescriptor::SetSecurityInfo(nsISupports * securityInfo)
NS_IMETHODIMP
nsCacheEntryDescriptor::Doom()
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_DOOM));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
return nsCacheService::DoomEntry(mCacheEntry);
@@ -506,7 +506,7 @@ nsCacheEntryDescriptor::Doom()
NS_IMETHODIMP
nsCacheEntryDescriptor::DoomAndFailPendingRequests(nsresult status)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_DOOMANDFAILPENDINGREQUESTS));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
return NS_ERROR_NOT_IMPLEMENTED;
@@ -544,7 +544,7 @@ nsCacheEntryDescriptor::AsyncDoom(nsICacheListener *listener)
NS_IMETHODIMP
nsCacheEntryDescriptor::MarkValid()
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_MARKVALID));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
nsresult rv = nsCacheService::ValidateEntry(mCacheEntry);
@@ -559,7 +559,7 @@ nsCacheEntryDescriptor::Close()
nsTArray<RefPtr<nsInputStreamWrapper> > inputWrappers;
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_CLOSE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
// Make sure no other stream can be opened
@@ -585,7 +585,7 @@ nsCacheEntryDescriptor::Close()
inputWrappers.Clear();
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_CLOSE));
+ nsCacheServiceAutoLock lock;
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
// XXX perhaps closing descriptors should clear/sever transports
@@ -604,7 +604,7 @@ nsCacheEntryDescriptor::GetMetaDataElement(const char *key, char **result)
NS_ENSURE_ARG_POINTER(key);
*result = nullptr;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_GETMETADATAELEMENT));
+ nsCacheServiceAutoLock lock;
NS_ENSURE_TRUE(mCacheEntry, NS_ERROR_NOT_AVAILABLE);
const char *value;
@@ -624,7 +624,7 @@ nsCacheEntryDescriptor::SetMetaDataElement(const char *key, const char *value)
{
NS_ENSURE_ARG_POINTER(key);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_SETMETADATAELEMENT));
+ nsCacheServiceAutoLock lock;
NS_ENSURE_TRUE(mCacheEntry, NS_ERROR_NOT_AVAILABLE);
// XXX allow null value, for clearing key?
@@ -639,7 +639,7 @@ nsCacheEntryDescriptor::SetMetaDataElement(const char *key, const char *value)
NS_IMETHODIMP
nsCacheEntryDescriptor::VisitMetaData(nsICacheMetaDataVisitor * visitor)
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHEENTRYDESCRIPTOR_VISITMETADATA));
+ nsCacheServiceAutoLock lock;
// XXX check callers, we're calling out of module
NS_ENSURE_ARG_POINTER(visitor);
if (!mCacheEntry) return NS_ERROR_NOT_AVAILABLE;
@@ -667,7 +667,7 @@ nsCacheEntryDescriptor::nsInputStreamWrapper::Release()
}
if (desc)
- nsCacheService::Lock(LOCK_TELEM(NSINPUTSTREAMWRAPPER_RELEASE));
+ nsCacheService::Lock();
nsrefcnt count;
NS_PRECONDITION(0 != mRefCnt, "dup release");
@@ -709,7 +709,7 @@ nsInputStreamWrapper::LazyInit()
if (!mDescriptor)
return NS_ERROR_NOT_AVAILABLE;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSINPUTSTREAMWRAPPER_LAZYINIT));
+ nsCacheServiceAutoLock lock;
nsCacheAccessMode mode;
nsresult rv = mDescriptor->GetAccessGranted(&mode);
@@ -755,7 +755,7 @@ nsInputStreamWrapper::CloseInternal()
return;
}
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSINPUTSTREAMWRAPPER_CLOSEINTERNAL));
+ nsCacheServiceAutoLock lock;
if (mDescriptor) {
mDescriptor->mInputWrappers.RemoveElement(this);
@@ -859,8 +859,7 @@ nsCacheEntryDescriptor::nsDecompressInputStreamWrapper::Release()
}
if (desc)
- nsCacheService::Lock(LOCK_TELEM(
- NSDECOMPRESSINPUTSTREAMWRAPPER_RELEASE));
+ nsCacheService::Lock();
nsrefcnt count;
NS_PRECONDITION(0 != mRefCnt, "dup release");
@@ -1049,7 +1048,7 @@ nsCacheEntryDescriptor::nsOutputStreamWrapper::Release()
}
if (desc)
- nsCacheService::Lock(LOCK_TELEM(NSOUTPUTSTREAMWRAPPER_RELEASE));
+ nsCacheService::Lock();
nsrefcnt count;
NS_PRECONDITION(0 != mRefCnt, "dup release");
@@ -1089,7 +1088,7 @@ nsOutputStreamWrapper::LazyInit()
if (!mDescriptor)
return NS_ERROR_NOT_AVAILABLE;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSOUTPUTSTREAMWRAPPER_LAZYINIT));
+ nsCacheServiceAutoLock lock;
nsCacheAccessMode mode;
nsresult rv = mDescriptor->GetAccessGranted(&mode);
@@ -1163,7 +1162,7 @@ nsOutputStreamWrapper::CloseInternal()
return;
}
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSOUTPUTSTREAMWRAPPER_CLOSEINTERNAL));
+ nsCacheServiceAutoLock lock;
if (mDescriptor) {
mDescriptor->mOutputWrapper = nullptr;
@@ -1279,7 +1278,7 @@ nsCacheEntryDescriptor::nsCompressOutputStreamWrapper::Release()
}
if (desc)
- nsCacheService::Lock(LOCK_TELEM(NSCOMPRESSOUTPUTSTREAMWRAPPER_RELEASE));
+ nsCacheService::Lock();
nsrefcnt count;
NS_PRECONDITION(0 != mRefCnt, "dup release");
diff --git a/netwerk/cache/nsCacheService.cpp b/netwerk/cache/nsCacheService.cpp
index 97b1a71c8..4b08614b8 100644
--- a/netwerk/cache/nsCacheService.cpp
+++ b/netwerk/cache/nsCacheService.cpp
@@ -211,7 +211,7 @@ public:
NS_IMETHOD Notify(nsITimer* aTimer) override {
if (nsCacheService::gService) {
- nsCacheServiceAutoLock autoLock(LOCK_TELEM(NSSETDISKSMARTSIZECALLBACK_NOTIFY));
+ nsCacheServiceAutoLock autoLock;
nsCacheService::gService->SetDiskSmartSize_Locked();
nsCacheService::gService->mSmartSizeTimer = nullptr;
}
@@ -295,7 +295,7 @@ public:
}
NS_IMETHOD Run() override
{
- nsCacheServiceAutoLock autoLock(LOCK_TELEM(NSBLOCKONCACHETHREADEVENT_RUN));
+ nsCacheServiceAutoLock autoLock;
CACHE_LOG_DEBUG(("nsBlockOnCacheThreadEvent [%p]\n", this));
nsCacheService::gService->mNotified = true;
nsCacheService::gService->mCondVar.Notify();
@@ -988,7 +988,7 @@ public:
NS_ASSERTION(mRequest->mListener,
"Sync OpenCacheEntry() posted to background thread!");
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSPROCESSREQUESTEVENT_RUN));
+ nsCacheServiceAutoLock lock;
rv = nsCacheService::gService->ProcessRequest(mRequest,
false,
nullptr);
@@ -1189,7 +1189,7 @@ nsCacheService::Shutdown()
nsCOMPtr<nsIFile> parentDir;
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SHUTDOWN));
+ nsCacheServiceAutoLock lock;
NS_ASSERTION(mInitialized,
"can't shutdown nsCacheService unless it has been initialized.");
if (!mInitialized)
@@ -1204,7 +1204,7 @@ nsCacheService::Shutdown()
UnregisterWeakMemoryReporter(this);
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SHUTDOWN));
+ nsCacheServiceAutoLock lock;
NS_ASSERTION(mInitialized, "Bad state");
mInitialized = false;
@@ -1363,7 +1363,7 @@ nsCacheService::EvictEntriesForClient(const char * clientID,
new EvictionNotifierRunnable(NS_ISUPPORTS_CAST(nsICacheService*, this));
NS_DispatchToMainThread(r);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_EVICTENTRIESFORCLIENT));
+ nsCacheServiceAutoLock lock;
nsresult res = NS_OK;
if (storagePolicy == nsICache::STORE_ANYWHERE ||
@@ -1412,7 +1412,7 @@ nsCacheService::IsStorageEnabledForPolicy(nsCacheStoragePolicy storagePolicy,
bool * result)
{
if (gService == nullptr) return NS_ERROR_NOT_AVAILABLE;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_ISSTORAGEENABLEDFORPOLICY));
+ nsCacheServiceAutoLock lock;
*result = gService->IsStorageEnabledForPolicy_Locked(storagePolicy);
return NS_OK;
@@ -1466,7 +1466,7 @@ nsresult nsCacheService::VisitEntriesInternal(nsICacheVisitor *visitor)
{
NS_ENSURE_ARG_POINTER(visitor);
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_VISITENTRIES));
+ nsCacheServiceAutoLock lock;
if (!(mEnableDiskDevice || mEnableMemoryDevice))
return NS_ERROR_NOT_AVAILABLE;
@@ -1548,7 +1548,7 @@ NS_IMETHODIMP nsCacheService::GetCacheIOTarget(nsIEventTarget * *aCacheIOTarget)
// read from the main thread without the lock. This is useful to prevent
// blocking the main thread on other cache operations.
if (!NS_IsMainThread()) {
- Lock(LOCK_TELEM(NSCACHESERVICE_GETCACHEIOTARGET));
+ Lock();
}
nsresult rv;
@@ -1940,7 +1940,7 @@ nsCacheService::ProcessRequest(nsCacheRequest * request,
// XXX this is probably wrong...
Unlock();
rv = request->WaitForValidation();
- Lock(LOCK_TELEM(NSCACHESERVICE_PROCESSREQUEST));
+ Lock();
}
PR_REMOVE_AND_INIT_LINK(request);
@@ -2052,7 +2052,7 @@ nsCacheService::OpenCacheEntry(nsCacheSession * session,
}
else {
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_OPENCACHEENTRY));
+ nsCacheServiceAutoLock lock;
rv = gService->ProcessRequest(request, true, result);
// delete requests that have completed
@@ -2353,14 +2353,14 @@ nsCacheService::OnProfileShutdown()
}
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_ONPROFILESHUTDOWN));
+ nsCacheServiceAutoLock lock;
gService->mClearingEntries = true;
gService->DoomActiveEntries(nullptr);
}
gService->CloseAllStreams();
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_ONPROFILESHUTDOWN));
+ nsCacheServiceAutoLock lock;
gService->ClearDoomList();
// Make sure to wait for any pending cache-operations before
@@ -2399,7 +2399,7 @@ nsCacheService::OnProfileChanged()
CACHE_LOG_DEBUG(("nsCacheService::OnProfileChanged"));
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_ONPROFILECHANGED));
+ nsCacheServiceAutoLock lock;
gService->mEnableDiskDevice = gService->mObserver->DiskCacheEnabled();
gService->mEnableOfflineDevice = gService->mObserver->OfflineCacheEnabled();
@@ -2453,7 +2453,7 @@ void
nsCacheService::SetDiskCacheEnabled(bool enabled)
{
if (!gService) return;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SETDISKCACHEENABLED));
+ nsCacheServiceAutoLock lock;
gService->mEnableDiskDevice = enabled;
}
@@ -2462,7 +2462,7 @@ void
nsCacheService::SetDiskCacheCapacity(int32_t capacity)
{
if (!gService) return;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SETDISKCACHECAPACITY));
+ nsCacheServiceAutoLock lock;
if (gService->mDiskDevice) {
gService->mDiskDevice->SetCapacity(capacity);
@@ -2475,7 +2475,7 @@ void
nsCacheService::SetDiskCacheMaxEntrySize(int32_t maxSize)
{
if (!gService) return;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SETDISKCACHEMAXENTRYSIZE));
+ nsCacheServiceAutoLock lock;
if (gService->mDiskDevice) {
gService->mDiskDevice->SetMaxEntrySize(maxSize);
@@ -2486,7 +2486,7 @@ void
nsCacheService::SetMemoryCacheMaxEntrySize(int32_t maxSize)
{
if (!gService) return;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SETMEMORYCACHEMAXENTRYSIZE));
+ nsCacheServiceAutoLock lock;
if (gService->mMemoryDevice) {
gService->mMemoryDevice->SetMaxEntrySize(maxSize);
@@ -2497,7 +2497,7 @@ void
nsCacheService::SetOfflineCacheEnabled(bool enabled)
{
if (!gService) return;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SETOFFLINECACHEENABLED));
+ nsCacheServiceAutoLock lock;
gService->mEnableOfflineDevice = enabled;
}
@@ -2505,7 +2505,7 @@ void
nsCacheService::SetOfflineCacheCapacity(int32_t capacity)
{
if (!gService) return;
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SETOFFLINECACHECAPACITY));
+ nsCacheServiceAutoLock lock;
if (gService->mOfflineDevice) {
gService->mOfflineDevice->SetCapacity(capacity);
@@ -2522,7 +2522,7 @@ nsCacheService::SetMemoryCache()
CACHE_LOG_DEBUG(("nsCacheService::SetMemoryCache"));
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SETMEMORYCACHE));
+ nsCacheServiceAutoLock lock;
gService->mEnableMemoryDevice = gService->mObserver->MemoryCacheEnabled();
@@ -2638,23 +2638,6 @@ nsCacheService::Lock()
}
void
-nsCacheService::Lock(mozilla::Telemetry::ID mainThreadLockerID)
-{
- mozilla::Telemetry::ID lockerID;
- mozilla::Telemetry::ID generalID;
-
- if (NS_IsMainThread()) {
- lockerID = mainThreadLockerID;
- generalID = mozilla::Telemetry::CACHE_SERVICE_LOCK_WAIT_MAINTHREAD_2;
- } else {
- lockerID = mozilla::Telemetry::HistogramCount;
- generalID = mozilla::Telemetry::CACHE_SERVICE_LOCK_WAIT_2;
- }
-
- nsCacheService::Lock();
-}
-
-void
nsCacheService::Unlock()
{
gService->mLock.AssertCurrentThreadOwns();
@@ -2933,7 +2916,7 @@ nsCacheService::CloseAllStreams()
nsTArray<RefPtr<nsCacheEntryDescriptor::nsOutputStreamWrapper> > outputs;
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_CLOSEALLSTREAMS));
+ nsCacheServiceAutoLock lock;
nsTArray<nsCacheEntry*> entries;
@@ -3056,7 +3039,7 @@ nsCacheService::LogCacheStatistics()
nsresult
nsCacheService::SetDiskSmartSize()
{
- nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SETDISKSMARTSIZE));
+ nsCacheServiceAutoLock lock;
if (!gService) return NS_ERROR_NOT_AVAILABLE;
@@ -3175,8 +3158,7 @@ nsCacheService::CollectReports(nsIHandleReportCallback* aHandleReport,
{
size_t disk = 0;
if (mDiskDevice) {
- nsCacheServiceAutoLock
- lock(LOCK_TELEM(NSCACHESERVICE_DISKDEVICEHEAPSIZE));
+ nsCacheServiceAutoLock lock;
disk = mDiskDevice->SizeOfIncludingThis(DiskCacheDeviceMallocSizeOf);
}
diff --git a/netwerk/cache/nsCacheService.h b/netwerk/cache/nsCacheService.h
index 1751d4875..95816cfe5 100644
--- a/netwerk/cache/nsCacheService.h
+++ b/netwerk/cache/nsCacheService.h
@@ -371,9 +371,6 @@ private:
* nsCacheServiceAutoLock
******************************************************************************/
-#define LOCK_TELEM(x) \
- (::mozilla::Telemetry::CACHE_SERVICE_LOCK_WAIT_MAINTHREAD_##x)
-
// Instantiate this class to acquire the cache service lock for a particular
// execution scope.
class nsCacheServiceAutoLock {
@@ -381,9 +378,6 @@ public:
nsCacheServiceAutoLock() {
nsCacheService::Lock();
}
- explicit nsCacheServiceAutoLock(mozilla::Telemetry::ID mainThreadLockerID) {
- nsCacheService::Lock(mainThreadLockerID);
- }
~nsCacheServiceAutoLock() {
nsCacheService::Unlock();
}
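
The cache-service hunks above are mechanical: every nsCacheServiceAutoLock construction drops its LOCK_TELEM(...) argument, and the telemetry-ID overload of nsCacheService::Lock() plus the LOCK_TELEM macro are deleted from nsCacheService.h. A minimal sketch of the lock guard and a typical call site after the change, shown here only for orientation and taken from the hunks above:

    // RAII guard left after the patch: take the global cache-service lock for
    // the enclosing scope, release it on destruction; no telemetry ID anywhere.
    class nsCacheServiceAutoLock {
    public:
        nsCacheServiceAutoLock()  { nsCacheService::Lock(); }
        ~nsCacheServiceAutoLock() { nsCacheService::Unlock(); }
    };

    // Call sites simply lose the argument:
    //   before: nsCacheServiceAutoLock lock(LOCK_TELEM(NSCACHESERVICE_SHUTDOWN));
    //   after:  nsCacheServiceAutoLock lock;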
diff --git a/netwerk/cache2/CacheEntry.cpp b/netwerk/cache2/CacheEntry.cpp
index b79bf7373..51e441aa7 100644
--- a/netwerk/cache2/CacheEntry.cpp
+++ b/netwerk/cache2/CacheEntry.cpp
@@ -419,11 +419,6 @@ bool CacheEntry::Load(bool aTruncate, bool aPriority)
{
mozilla::MutexAutoUnlock unlock(mLock);
- if (reportMiss) {
- CacheFileUtils::DetailedCacheHitTelemetry::AddRecord(
- CacheFileUtils::DetailedCacheHitTelemetry::MISS, mLoadStart);
- }
-
LOG((" performing load, file=%p", mFile.get()));
if (NS_SUCCEEDED(rv)) {
rv = mFile->Init(fileKey,
@@ -458,16 +453,6 @@ NS_IMETHODIMP CacheEntry::OnFileReady(nsresult aResult, bool aIsNew)
MOZ_ASSERT(!mLoadStart.IsNull());
- if (NS_SUCCEEDED(aResult)) {
- if (aIsNew) {
- CacheFileUtils::DetailedCacheHitTelemetry::AddRecord(
- CacheFileUtils::DetailedCacheHitTelemetry::MISS, mLoadStart);
- } else {
- CacheFileUtils::DetailedCacheHitTelemetry::AddRecord(
- CacheFileUtils::DetailedCacheHitTelemetry::HIT, mLoadStart);
- }
- }
-
// OnFileReady, that is the only code that can transit from LOADING
// to any follow-on state and can only be invoked ones on an entry.
// Until this moment there is no consumer that could manipulate
diff --git a/netwerk/cache2/CacheFile.cpp b/netwerk/cache2/CacheFile.cpp
index ce771c754..69fc3d33c 100644
--- a/netwerk/cache2/CacheFile.cpp
+++ b/netwerk/cache2/CacheFile.cpp
@@ -1865,33 +1865,6 @@ CacheFile::Truncate(int64_t aOffset)
return NS_OK;
}
-static uint32_t
-StatusToTelemetryEnum(nsresult aStatus)
-{
- if (NS_SUCCEEDED(aStatus)) {
- return 0;
- }
-
- switch (aStatus) {
- case NS_BASE_STREAM_CLOSED:
- return 0; // Log this as a success
- case NS_ERROR_OUT_OF_MEMORY:
- return 2;
- case NS_ERROR_FILE_DISK_FULL:
- return 3;
- case NS_ERROR_FILE_CORRUPTED:
- return 4;
- case NS_ERROR_FILE_NOT_FOUND:
- return 5;
- case NS_BINDING_ABORTED:
- return 6;
- default:
- return 1; // other error
- }
-
- NS_NOTREACHED("We should never get here");
-}
-
nsresult
CacheFile::RemoveInput(CacheFileInputStream *aInput, nsresult aStatus)
{
diff --git a/netwerk/cache2/CacheFileIOManager.cpp b/netwerk/cache2/CacheFileIOManager.cpp
index f6b499e47..25e621d12 100644
--- a/netwerk/cache2/CacheFileIOManager.cpp
+++ b/netwerk/cache2/CacheFileIOManager.cpp
@@ -3822,44 +3822,6 @@ CacheFileIOManager::CreateCacheTree()
StartRemovingTrash();
- if (!CacheObserver::CacheFSReported()) {
- uint32_t fsType = 4; // Other OS
-
-#ifdef XP_WIN
- nsAutoString target;
- nsresult rv = mCacheDirectory->GetTarget(target);
- if (NS_FAILED(rv)) {
- return NS_OK;
- }
-
- wchar_t volume_path[MAX_PATH + 1] = { 0 };
- if (!::GetVolumePathNameW(target.get(),
- volume_path,
- mozilla::ArrayLength(volume_path))) {
- return NS_OK;
- }
-
- wchar_t fsName[6] = { 0 };
- if (!::GetVolumeInformationW(volume_path, nullptr, 0, nullptr, nullptr,
- nullptr, fsName,
- mozilla::ArrayLength(fsName))) {
- return NS_OK;
- }
-
- if (wcscmp(fsName, L"NTFS") == 0) {
- fsType = 0;
- } else if (wcscmp(fsName, L"FAT32") == 0) {
- fsType = 1;
- } else if (wcscmp(fsName, L"FAT") == 0) {
- fsType = 2;
- } else {
- fsType = 3;
- }
-#endif
-
- CacheObserver::SetCacheFSReported();
- }
-
return NS_OK;
}
@@ -3905,16 +3867,6 @@ CacheFileIOManager::OpenNSPRHandle(CacheFileHandle *aHandle, bool aCreate)
LOG(("CacheFileIOManager::OpenNSPRHandle() - Successfully evicted entry"
" with hash %08x%08x%08x%08x%08x. %s to create the new file.",
LOGSHA1(&hash), NS_SUCCEEDED(rv) ? "Succeeded" : "Failed"));
-
- // Report the full size only once per session
- static bool sSizeReported = false;
- if (!sSizeReported) {
- uint32_t cacheUsage;
- if (NS_SUCCEEDED(CacheIndex::GetCacheSize(&cacheUsage))) {
- cacheUsage >>= 10;
- sSizeReported = true;
- }
- }
} else {
LOG(("CacheFileIOManager::OpenNSPRHandle() - Couldn't evict an existing"
" entry."));
diff --git a/netwerk/cache2/CacheFileUtils.cpp b/netwerk/cache2/CacheFileUtils.cpp
index fe1a53b3d..a090a9cb1 100644
--- a/netwerk/cache2/CacheFileUtils.cpp
+++ b/netwerk/cache2/CacheFileUtils.cpp
@@ -401,101 +401,6 @@ ValidityMap::operator[](uint32_t aIdx)
return mMap.ElementAt(aIdx);
}
-StaticMutex DetailedCacheHitTelemetry::sLock;
-uint32_t DetailedCacheHitTelemetry::sRecordCnt = 0;
-DetailedCacheHitTelemetry::HitRate DetailedCacheHitTelemetry::sHRStats[kNumOfRanges];
-
-DetailedCacheHitTelemetry::HitRate::HitRate()
-{
- Reset();
-}
-
-void
-DetailedCacheHitTelemetry::HitRate::AddRecord(ERecType aType)
-{
- if (aType == HIT) {
- ++mHitCnt;
- } else {
- ++mMissCnt;
- }
-}
-
-uint32_t
-DetailedCacheHitTelemetry::HitRate::GetHitRateBucket(uint32_t aNumOfBuckets) const
-{
- uint32_t bucketIdx = (aNumOfBuckets * mHitCnt) / (mHitCnt + mMissCnt);
- if (bucketIdx == aNumOfBuckets) { // make sure 100% falls into the last bucket
- --bucketIdx;
- }
-
- return bucketIdx;
-}
-
-uint32_t
-DetailedCacheHitTelemetry::HitRate::Count()
-{
- return mHitCnt + mMissCnt;
-}
-
-void
-DetailedCacheHitTelemetry::HitRate::Reset()
-{
- mHitCnt = 0;
- mMissCnt = 0;
-}
-
-// static
-void
-DetailedCacheHitTelemetry::AddRecord(ERecType aType, TimeStamp aLoadStart)
-{
- bool isUpToDate = false;
- CacheIndex::IsUpToDate(&isUpToDate);
- if (!isUpToDate) {
- // Ignore the record when the entry file count might be incorrect
- return;
- }
-
- uint32_t entryCount;
- nsresult rv = CacheIndex::GetEntryFileCount(&entryCount);
- if (NS_FAILED(rv)) {
- return;
- }
-
- uint32_t rangeIdx = entryCount / kRangeSize;
- if (rangeIdx >= kNumOfRanges) { // The last range has no upper limit.
- rangeIdx = kNumOfRanges - 1;
- }
-
- uint32_t hitMissValue = 2 * rangeIdx; // 2 values per range
- if (aType == MISS) { // The order is HIT, MISS
- ++hitMissValue;
- }
-
- StaticMutexAutoLock lock(sLock);
-
- sHRStats[rangeIdx].AddRecord(aType);
- ++sRecordCnt;
-
- if (sRecordCnt < kTotalSamplesReportLimit) {
- return;
- }
-
- sRecordCnt = 0;
-
- for (uint32_t i = 0; i < kNumOfRanges; ++i) {
- if (sHRStats[i].Count() >= kHitRateSamplesReportLimit) {
- // The telemetry enums are grouped by buckets as follows:
- // Telemetry value : 0,1,2,3, ... ,19,20,21,22, ... ,398,399
- // Hit rate bucket : 0,0,0,0, ... , 0, 1, 1, 1, ... , 19, 19
- // Cache size range: 0,1,2,3, ... ,19, 0, 1, 2, ... , 18, 19
- uint32_t bucketOffset = sHRStats[i].GetHitRateBucket(kHitRateBuckets) *
- kNumOfRanges;
-
- sHRStats[i].Reset();
- }
- }
-}
-
void
FreeBuffer(void *aBuf) {
#ifndef NS_FREE_PERMANENT_DATA
diff --git a/netwerk/cache2/CacheFileUtils.h b/netwerk/cache2/CacheFileUtils.h
index 13acfd71d..b66f0adf1 100644
--- a/netwerk/cache2/CacheFileUtils.h
+++ b/netwerk/cache2/CacheFileUtils.h
@@ -10,7 +10,6 @@
#include "nsString.h"
#include "nsTArray.h"
#include "mozilla/StaticMutex.h"
-#include "mozilla/TimeStamp.h"
class nsILoadContextInfo;
class nsACString;
@@ -90,64 +89,6 @@ private:
nsTArray<ValidityPair> mMap;
};
-
-class DetailedCacheHitTelemetry {
-public:
- enum ERecType {
- HIT = 0,
- MISS = 1
- };
-
- static void AddRecord(ERecType aType, TimeStamp aLoadStart);
-
-private:
- class HitRate {
- public:
- HitRate();
-
- void AddRecord(ERecType aType);
- // Returns the bucket index that the current hit rate falls into according
- // to the given aNumOfBuckets.
- uint32_t GetHitRateBucket(uint32_t aNumOfBuckets) const;
- uint32_t Count();
- void Reset();
-
- private:
- uint32_t mHitCnt;
- uint32_t mMissCnt;
- };
-
- // Group the hits and misses statistics by cache files count ranges (0-5000,
- // 5001-10000, ... , 95001- )
- static const uint32_t kRangeSize = 5000;
- static const uint32_t kNumOfRanges = 20;
-
- // Use the same ranges to report an average hit rate. Report the hit rates
- // (and reset the counters) every kTotalSamplesReportLimit samples.
- static const uint32_t kTotalSamplesReportLimit = 1000;
-
- // Report hit rate for a given cache size range only if it contains
- // kHitRateSamplesReportLimit or more samples. This limit should avoid
- // reporting a biased statistics.
- static const uint32_t kHitRateSamplesReportLimit = 500;
-
- // All hit rates are accumulated in a single telemetry probe, so to use
- // a sane number of enumerated values the hit rate is divided into buckets
- // instead of using a percent value. This constant defines number of buckets
- // that we divide the hit rates into. I.e. we'll report ranges 0%-5%, 5%-10%,
- // 10-%15%, ...
- static const uint32_t kHitRateBuckets = 20;
-
- // Protects sRecordCnt and sHitStats calls.
- static StaticMutex sLock;
-
- // Counter of samples that is compared against kTotalSamplesReportLimit.
- static uint32_t sRecordCnt;
-
- // Hit rate statistics for every cache size range.
- static HitRate sHRStats[kNumOfRanges];
-};
-
void
FreeBuffer(void *aBuf);
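
The deleted DetailedCacheHitTelemetry class grouped hits and misses by cache-file-count ranges of 5000 entries and reported each range's hit rate in 20 five-percent buckets. A worked restatement of the removed bucketing arithmetic, kept only to document what the probe measured (standalone helper, illustrative numbers):

    // Sketch of the deleted HitRate::GetHitRateBucket() arithmetic.
    // With 20 buckets, 75 hits and 25 misses map to (20 * 75) / 100 = 15,
    // i.e. the 75%-80% bucket; a 100% hit rate computes to 20 and is clamped
    // so it still lands in the last bucket (19).
    uint32_t HitRateBucket(uint32_t hits, uint32_t misses, uint32_t numBuckets)
    {
        uint32_t idx = (numBuckets * hits) / (hits + misses);
        if (idx == numBuckets) {
            --idx;
        }
        return idx;
    }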
diff --git a/netwerk/cache2/CacheIOThread.cpp b/netwerk/cache2/CacheIOThread.cpp
index d51f61b0f..2fbc0ccce 100644
--- a/netwerk/cache2/CacheIOThread.cpp
+++ b/netwerk/cache2/CacheIOThread.cpp
@@ -18,54 +18,6 @@
namespace mozilla {
namespace net {
-namespace { // anon
-
-class CacheIOTelemetry
-{
-public:
- typedef CacheIOThread::EventQueue::size_type size_type;
- static size_type mMinLengthToReport[CacheIOThread::LAST_LEVEL];
- static void Report(uint32_t aLevel, size_type aLength);
-};
-
-static CacheIOTelemetry::size_type const kGranularity = 30;
-
-CacheIOTelemetry::size_type
-CacheIOTelemetry::mMinLengthToReport[CacheIOThread::LAST_LEVEL] = {
- kGranularity, kGranularity, kGranularity, kGranularity,
- kGranularity, kGranularity, kGranularity, kGranularity
-};
-
-// static
-void CacheIOTelemetry::Report(uint32_t aLevel, CacheIOTelemetry::size_type aLength)
-{
- if (mMinLengthToReport[aLevel] > aLength) {
- return;
- }
-
- static Telemetry::ID telemetryID[] = {
- Telemetry::HTTP_CACHE_IO_QUEUE_2_OPEN_PRIORITY,
- Telemetry::HTTP_CACHE_IO_QUEUE_2_READ_PRIORITY,
- Telemetry::HTTP_CACHE_IO_QUEUE_2_MANAGEMENT,
- Telemetry::HTTP_CACHE_IO_QUEUE_2_OPEN,
- Telemetry::HTTP_CACHE_IO_QUEUE_2_READ,
- Telemetry::HTTP_CACHE_IO_QUEUE_2_WRITE_PRIORITY,
- Telemetry::HTTP_CACHE_IO_QUEUE_2_WRITE,
- Telemetry::HTTP_CACHE_IO_QUEUE_2_INDEX,
- Telemetry::HTTP_CACHE_IO_QUEUE_2_EVICT
- };
-
- // Each bucket is a multiply of kGranularity (30, 60, 90..., 300+)
- aLength = (aLength / kGranularity);
- // Next time report only when over the current length + kGranularity
- mMinLengthToReport[aLevel] = (aLength + 1) * kGranularity;
-
- // 10 is number of buckets we have in each probe
- aLength = std::min<size_type>(aLength, 10);
-}
-
-} // anon
-
namespace detail {
/**
@@ -525,7 +477,6 @@ void CacheIOThread::LoopOneLevel(uint32_t aLevel)
mCurrentlyExecutingLevel = aLevel;
bool returnEvents = false;
- bool reportTelementry = true;
EventQueue::size_type index;
{
@@ -539,11 +490,6 @@ void CacheIOThread::LoopOneLevel(uint32_t aLevel)
break;
}
- if (reportTelementry) {
- reportTelementry = false;
- CacheIOTelemetry::Report(aLevel, length);
- }
-
// Drop any previous flagging, only an event on the current level may set
// this flag.
mRerunCurrentEvent = false;
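
The removed CacheIOTelemetry reported the event-queue length per I/O level, rounded down to 30-event buckets, capped at 10 buckets, and reported again only once the queue grew past the next 30-event boundary. The removed granularity math, restated for reference with illustrative local names:

    // Removed bucketing: queue length 95 -> bucket 95 / 30 = 3, next report
    // only once the queue reaches (3 + 1) * 30 = 120; buckets were capped at
    // 10, so everything at 300+ events fell into the last bucket.
    size_t bucket = std::min<size_t>(length / 30, 10);
    size_t nextReportAt = (length / 30 + 1) * 30;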
diff --git a/netwerk/cache2/CacheIndex.cpp b/netwerk/cache2/CacheIndex.cpp
index 1009aeaf8..2c6451d72 100644
--- a/netwerk/cache2/CacheIndex.cpp
+++ b/netwerk/cache2/CacheIndex.cpp
@@ -20,7 +20,6 @@
#include "nsITimer.h"
#include "mozilla/AutoRestore.h"
#include <algorithm>
-#include "mozilla/Telemetry.h"
#include "mozilla/Unused.h"
@@ -1295,29 +1294,6 @@ CacheIndex::GetCacheSize(uint32_t *_retval)
// static
nsresult
-CacheIndex::GetEntryFileCount(uint32_t *_retval)
-{
- LOG(("CacheIndex::GetEntryFileCount()"));
-
- StaticMutexAutoLock lock(sLock);
-
- RefPtr<CacheIndex> index = gInstance;
-
- if (!index) {
- return NS_ERROR_NOT_INITIALIZED;
- }
-
- if (!index->IsIndexUsable()) {
- return NS_ERROR_NOT_AVAILABLE;
- }
-
- *_retval = index->mIndexStats.ActiveEntriesCount();
- LOG(("CacheIndex::GetEntryFileCount() - returning %u", *_retval));
- return NS_OK;
-}
-
-// static
-nsresult
CacheIndex::GetCacheStats(nsILoadContextInfo *aInfo, uint32_t *aSize, uint32_t *aCount)
{
LOG(("CacheIndex::GetCacheStats() [info=%p]", aInfo));
@@ -3167,11 +3143,6 @@ CacheIndex::ChangeState(EState aNewState)
return;
}
- if ((mState == READING || mState == BUILDING || mState == UPDATING) &&
- aNewState == READY) {
- ReportHashStats();
- }
-
// Try to evict entries over limit everytime we're leaving state READING,
// BUILDING or UPDATING, but not during shutdown or when removing all
// entries.
@@ -3720,75 +3691,6 @@ CacheIndex::SizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf)
return mallocSizeOf(gInstance) + SizeOfExcludingThis(mallocSizeOf);
}
-namespace {
-
-class HashComparator
-{
-public:
- bool Equals(CacheIndexRecord* a, CacheIndexRecord* b) const {
- return memcmp(&a->mHash, &b->mHash, sizeof(SHA1Sum::Hash)) == 0;
- }
- bool LessThan(CacheIndexRecord* a, CacheIndexRecord* b) const {
- return memcmp(&a->mHash, &b->mHash, sizeof(SHA1Sum::Hash)) < 0;
- }
-};
-
-void
-ReportHashSizeMatch(const SHA1Sum::Hash *aHash1, const SHA1Sum::Hash *aHash2)
-{
- const uint32_t *h1 = reinterpret_cast<const uint32_t *>(aHash1);
- const uint32_t *h2 = reinterpret_cast<const uint32_t *>(aHash2);
-
- for (uint32_t i = 0; i < 5; ++i) {
- if (h1[i] != h2[i]) {
- uint32_t bitsDiff = h1[i] ^ h2[i];
- bitsDiff = NetworkEndian::readUint32(&bitsDiff);
-
- // count leading zeros in bitsDiff
- static const uint8_t debruijn32[32] =
- { 0, 31, 9, 30, 3, 8, 13, 29, 2, 5, 7, 21, 12, 24, 28, 19,
- 1, 10, 4, 14, 6, 22, 25, 20, 11, 15, 23, 26, 16, 27, 17, 18};
-
- bitsDiff |= bitsDiff>>1;
- bitsDiff |= bitsDiff>>2;
- bitsDiff |= bitsDiff>>4;
- bitsDiff |= bitsDiff>>8;
- bitsDiff |= bitsDiff>>16;
- bitsDiff++;
-
- uint8_t hashSizeMatch = debruijn32[bitsDiff*0x076be629>>27] + (i<<5);
-
- return;
- }
- }
-
- MOZ_ASSERT(false, "Found a collision in the index!");
-}
-
-} // namespace
-
-void
-CacheIndex::ReportHashStats()
-{
- // We're gathering the hash stats only once, exclude too small caches.
- if (CacheObserver::HashStatsReported() || mFrecencyArray.Length() < 15000) {
- return;
- }
-
- nsTArray<CacheIndexRecord *> records;
- for (auto iter = mFrecencyArray.Iter(); !iter.Done(); iter.Next()) {
- records.AppendElement(iter.Get());
- }
-
- records.Sort(HashComparator());
-
- for (uint32_t i = 1; i < records.Length(); i++) {
- ReportHashSizeMatch(&records[i-1]->mHash, &records[i]->mHash);
- }
-
- CacheObserver::SetHashStatsReported();
-}
-
// static
void
CacheIndex::OnAsyncEviction(bool aEvicting)
diff --git a/netwerk/cache2/CacheIndex.h b/netwerk/cache2/CacheIndex.h
index dc72c346f..acb3bdd25 100644
--- a/netwerk/cache2/CacheIndex.h
+++ b/netwerk/cache2/CacheIndex.h
@@ -662,9 +662,6 @@ public:
// Returns cache size in kB.
static nsresult GetCacheSize(uint32_t *_retval);
- // Returns number of entry files in the cache
- static nsresult GetEntryFileCount(uint32_t *_retval);
-
// Synchronously returns the disk occupation and number of entries per-context.
// Callable on any thread.
static nsresult GetCacheStats(nsILoadContextInfo *aInfo, uint32_t *aSize, uint32_t *aCount);
@@ -918,8 +915,6 @@ private:
// Memory reporting (private part)
size_t SizeOfExcludingThisInternal(mozilla::MallocSizeOf mallocSizeOf) const;
- void ReportHashStats();
-
static mozilla::StaticRefPtr<CacheIndex> gInstance;
static StaticMutex sLock;
diff --git a/netwerk/cache2/CacheObserver.cpp b/netwerk/cache2/CacheObserver.cpp
index 6b6d89395..32e0dff95 100644
--- a/netwerk/cache2/CacheObserver.cpp
+++ b/netwerk/cache2/CacheObserver.cpp
@@ -86,12 +86,6 @@ bool CacheObserver::sSanitizeOnShutdown = kDefaultSanitizeOnShutdown;
static bool kDefaultClearCacheOnShutdown = false;
bool CacheObserver::sClearCacheOnShutdown = kDefaultClearCacheOnShutdown;
-static bool kDefaultCacheFSReported = false;
-bool CacheObserver::sCacheFSReported = kDefaultCacheFSReported;
-
-static bool kDefaultHashStatsReported = false;
-bool CacheObserver::sHashStatsReported = kDefaultHashStatsReported;
-
static uint32_t const kDefaultMaxShutdownIOLag = 2; // seconds
Atomic<uint32_t, Relaxed> CacheObserver::sMaxShutdownIOLag(kDefaultMaxShutdownIOLag);
@@ -331,58 +325,6 @@ CacheObserver::StoreDiskCacheCapacity()
}
// static
-void
-CacheObserver::SetCacheFSReported()
-{
- sCacheFSReported = true;
-
- if (!sSelf) {
- return;
- }
-
- if (NS_IsMainThread()) {
- sSelf->StoreCacheFSReported();
- } else {
- nsCOMPtr<nsIRunnable> event =
- NewRunnableMethod(sSelf, &CacheObserver::StoreCacheFSReported);
- NS_DispatchToMainThread(event);
- }
-}
-
-void
-CacheObserver::StoreCacheFSReported()
-{
- mozilla::Preferences::SetInt("browser.cache.disk.filesystem_reported",
- sCacheFSReported);
-}
-
-// static
-void
-CacheObserver::SetHashStatsReported()
-{
- sHashStatsReported = true;
-
- if (!sSelf) {
- return;
- }
-
- if (NS_IsMainThread()) {
- sSelf->StoreHashStatsReported();
- } else {
- nsCOMPtr<nsIRunnable> event =
- NewRunnableMethod(sSelf, &CacheObserver::StoreHashStatsReported);
- NS_DispatchToMainThread(event);
- }
-}
-
-void
-CacheObserver::StoreHashStatsReported()
-{
- mozilla::Preferences::SetInt("browser.cache.disk.hashstats_reported",
- sHashStatsReported);
-}
-
-// static
void CacheObserver::ParentDirOverride(nsIFile** aDir)
{
if (NS_WARN_IF(!aDir))
diff --git a/netwerk/cache2/CacheObserver.h b/netwerk/cache2/CacheObserver.h
index 62e5bbc6e..ccdd89030 100644
--- a/netwerk/cache2/CacheObserver.h
+++ b/netwerk/cache2/CacheObserver.h
@@ -61,12 +61,6 @@ class CacheObserver : public nsIObserver
{ return sHalfLifeExperiment; }
static bool ClearCacheOnShutdown()
{ return sSanitizeOnShutdown && sClearCacheOnShutdown; }
- static bool CacheFSReported()
- { return sCacheFSReported; }
- static void SetCacheFSReported();
- static bool HashStatsReported()
- { return sHashStatsReported; }
- static void SetHashStatsReported();
static void ParentDirOverride(nsIFile ** aDir);
static bool EntryIsTooBig(int64_t aSize, bool aUsingDisk);
@@ -82,8 +76,6 @@ private:
static CacheObserver* sSelf;
void StoreDiskCacheCapacity();
- void StoreCacheFSReported();
- void StoreHashStatsReported();
void AttachToPreferences();
static uint32_t sUseNewCache;
@@ -106,8 +98,6 @@ private:
static int32_t sHalfLifeExperiment;
static bool sSanitizeOnShutdown;
static bool sClearCacheOnShutdown;
- static bool sCacheFSReported;
- static bool sHashStatsReported;
static Atomic<uint32_t, Relaxed> sMaxShutdownIOLag;
static Atomic<PRIntervalTime> sShutdownDemandedTime;
diff --git a/netwerk/cookie/nsCookieService.cpp b/netwerk/cookie/nsCookieService.cpp
index ea54dbd61..7bc5abcd1 100644
--- a/netwerk/cookie/nsCookieService.cpp
+++ b/netwerk/cookie/nsCookieService.cpp
@@ -3311,13 +3311,6 @@ nsCookieService::SetCookieInternal(nsIURI *aHostURI,
// so we can handle them separately.
bool newCookie = ParseAttributes(aCookieHeader, cookieAttributes);
- // Collect telemetry on how often secure cookies are set from non-secure
- // origins, and vice-versa.
- //
- // 0 = nonsecure and "http:"
- // 1 = nonsecure and "https:"
- // 2 = secure and "http:"
- // 3 = secure and "https:"
bool isHTTPS;
nsresult rv = aHostURI->SchemeIs("https", &isHTTPS);
diff --git a/netwerk/protocol/http/Http2Compression.cpp b/netwerk/protocol/http/Http2Compression.cpp
index 64fd05a17..9206f8b4c 100644
--- a/netwerk/protocol/http/Http2Compression.cpp
+++ b/netwerk/protocol/http/Http2Compression.cpp
@@ -402,7 +402,7 @@ Http2Decompressor::DecodeHeaderBlock(const uint8_t *data, uint32_t datalen,
nsresult rv = NS_OK;
nsresult softfail_rv = NS_OK;
- while (NS_SUCCEEDED(rv) && (mOffset < datalen)) {
+ while (NS_SUCCEEDED(rv) && (mOffset < mDataLen)) {
bool modifiesTable = true;
if (mData[mOffset] & 0x80) {
rv = DoIndexed();
@@ -684,6 +684,11 @@ nsresult
Http2Decompressor::DecodeFinalHuffmanCharacter(const HuffmanIncomingTable *table,
uint8_t &c, uint8_t &bitsLeft)
{
+ MOZ_ASSERT(mOffset <= mDataLen);
+ if (mOffset > mDataLen) {
+ NS_WARNING("DecodeFinalHuffmanCharacter trying to read beyond end of buffer");
+ return NS_ERROR_FAILURE;
+ }
uint8_t mask = (1 << bitsLeft) - 1;
uint8_t idx = mData[mOffset - 1] & mask;
idx <<= (8 - bitsLeft);
@@ -721,6 +726,7 @@ Http2Decompressor::DecodeFinalHuffmanCharacter(const HuffmanIncomingTable *table
uint8_t
Http2Decompressor::ExtractByte(uint8_t bitsLeft, uint32_t &bytesConsumed)
{
+ MOZ_DIAGNOSTIC_ASSERT(mOffset < mDataLen);
uint8_t rv;
if (bitsLeft) {
@@ -750,8 +756,8 @@ Http2Decompressor::DecodeHuffmanCharacter(const HuffmanIncomingTable *table,
uint8_t idx = ExtractByte(bitsLeft, bytesConsumed);
if (table->IndexHasANextTable(idx)) {
- if (bytesConsumed >= mDataLen) {
- if (!bitsLeft || (bytesConsumed > mDataLen)) {
+ if (mOffset >= mDataLen) {
+ if (!bitsLeft || (mOffset > mDataLen)) {
// TODO - does this get me into trouble in the new world?
// No info left in input to try to consume, we're done
LOG(("DecodeHuffmanCharacter all out of bits to consume, can't chain"));
@@ -892,6 +898,13 @@ Http2Decompressor::DoLiteralInternal(nsACString &name, nsACString &value,
return rv;
}
+ // sanity check
+ if (mOffset >= mDataLen) {
+ NS_WARNING("Http2 Decompressor ran out of data");
+ // This is session-fatal
+ return NS_ERROR_FAILURE;
+ }
+
bool isHuffmanEncoded;
if (!index) {
@@ -919,6 +932,13 @@ Http2Decompressor::DoLiteralInternal(nsACString &name, nsACString &value,
return rv;
}
+ // sanity check
+ if (mOffset >= mDataLen) {
+ NS_WARNING("Http2 Decompressor ran out of data");
+ // This is session-fatal
+ return NS_ERROR_FAILURE;
+ }
+
// now the value
uint32_t valueLen;
isHuffmanEncoded = mData[mOffset] & (1 << 7);
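
The Http2Compression.cpp hunks harden the HPACK decoder against reading past the end of the header block: the outer DecodeHeaderBlock loop now tests mOffset against mDataLen, DecodeFinalHuffmanCharacter and ExtractByte gain in-range assertions, DecodeHuffmanCharacter chains on mOffset rather than bytesConsumed, and DoLiteralInternal treats running out of data before a length byte as session-fatal. The guard they all share, pulled out as a hypothetical helper (the function name is not in the patch):

    // Never dereference mData[mOffset] without proving the offset is in range.
    static nsresult CheckedByteAt(const uint8_t *mData, uint32_t mOffset,
                                  uint32_t mDataLen, uint8_t &out)
    {
        if (mOffset >= mDataLen) {
            NS_WARNING("Http2 Decompressor ran out of data");
            return NS_ERROR_FAILURE;  // callers treat this as session-fatal
        }
        out = mData[mOffset];
        return NS_OK;
    }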
diff --git a/netwerk/sctp/datachannel/DataChannel.cpp b/netwerk/sctp/datachannel/DataChannel.cpp
index f2a91c589..ebc430f8c 100644
--- a/netwerk/sctp/datachannel/DataChannel.cpp
+++ b/netwerk/sctp/datachannel/DataChannel.cpp
@@ -276,6 +276,7 @@ DataChannelConnection::Destroy()
LOG(("Deregistered %p from the SCTP stack.", static_cast<void *>(this)));
}
+ mListener = nullptr;
// Finish Destroy on STS thread to avoid bug 876167 - once that's fixed,
// the usrsctp_close() calls can move back here (and just proxy the
// disconnect_all())
diff --git a/netwerk/streamconv/converters/nsIndexedToHTML.cpp b/netwerk/streamconv/converters/nsIndexedToHTML.cpp
index 0414c4841..29fea8bfb 100644
--- a/netwerk/streamconv/converters/nsIndexedToHTML.cpp
+++ b/netwerk/streamconv/converters/nsIndexedToHTML.cpp
@@ -146,7 +146,14 @@ nsIndexedToHTML::DoOnStartRequest(nsIRequest* request, nsISupports *aContext,
nsAutoCString baseUri, titleUri;
rv = uri->GetAsciiSpec(baseUri);
if (NS_FAILED(rv)) return rv;
- titleUri = baseUri;
+
+ nsCOMPtr<nsIURI> titleURL;
+ rv = uri->Clone(getter_AddRefs(titleURL));
+ if (NS_FAILED(rv)) titleURL = uri;
+ rv = titleURL->SetQuery(EmptyCString());
+ if (NS_FAILED(rv)) titleURL = uri;
+ rv = titleURL->SetRef(EmptyCString());
+ if (NS_FAILED(rv)) titleURL = uri;
nsCString parentStr;
@@ -170,16 +177,14 @@ nsIndexedToHTML::DoOnStartRequest(nsIRequest* request, nsISupports *aContext,
// that - see above
nsAutoCString pw;
- rv = uri->GetPassword(pw);
+ rv = titleURL->GetPassword(pw);
if (NS_FAILED(rv)) return rv;
if (!pw.IsEmpty()) {
nsCOMPtr<nsIURI> newUri;
- rv = uri->Clone(getter_AddRefs(newUri));
+ rv = titleURL->Clone(getter_AddRefs(newUri));
if (NS_FAILED(rv)) return rv;
rv = newUri->SetPassword(EmptyCString());
if (NS_FAILED(rv)) return rv;
- rv = newUri->GetAsciiSpec(titleUri);
- if (NS_FAILED(rv)) return rv;
}
nsAutoCString path;
@@ -247,6 +252,11 @@ nsIndexedToHTML::DoOnStartRequest(nsIRequest* request, nsISupports *aContext,
}
}
+ rv = titleURL->GetAsciiSpec(titleUri);
+ if (NS_FAILED(rv)) {
+ return rv;
+ }
+
buffer.AppendLiteral("<style type=\"text/css\">\n"
":root {\n"
" font-family: sans-serif;\n"
diff --git a/old-configure.in b/old-configure.in
index 80597eb73..d0b17835c 100644
--- a/old-configure.in
+++ b/old-configure.in
@@ -2247,6 +2247,7 @@ MOZ_PLACES=1
MOZ_SERVICES_HEALTHREPORT=1
MOZ_SERVICES_SYNC=1
MOZ_SERVICES_CLOUDSYNC=1
+MOZ_USERINFO=1
case "$target_os" in
mingw*)
@@ -4587,6 +4588,20 @@ fi
AC_SUBST(MOZ_DEVTOOLS)
dnl ========================================================
+dnl = Disable nsUserInfo
+dnl ========================================================
+MOZ_ARG_DISABLE_BOOL(userinfo,
+[ --disable-userinfo Disable nsUserInfo (default=enabled)],
+ MOZ_USERINFO=,
+ MOZ_USERINFO=1)
+
+if test -n "$MOZ_USERINFO"; then
+ AC_DEFINE(MOZ_USERINFO)
+fi
+
+AC_SUBST(MOZ_USERINFO)
+
+dnl ========================================================
dnl = Define default location for MOZILLA_FIVE_HOME
dnl ========================================================
MOZ_ARG_WITH_STRING(default-mozilla-five-home,
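
old-configure.in gains a MOZ_USERINFO build option: enabled by default, switchable off with --disable-userinfo, and exported via AC_DEFINE and AC_SUBST so both the preprocessor and build files can test it. A hypothetical consumer would guard the nsUserInfo code roughly like this (the guard itself is not part of this patch):

    // Compiled only when the build was configured without --disable-userinfo
    // (e.g. ac_add_options --disable-userinfo in a mozconfig turns it off).
    #ifdef MOZ_USERINFO
    // ... nsUserInfo implementation ...
    #endif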
diff --git a/security/manager/ssl/TransportSecurityInfo.cpp b/security/manager/ssl/TransportSecurityInfo.cpp
index 0e2238ad0..3c7023302 100644
--- a/security/manager/ssl/TransportSecurityInfo.cpp
+++ b/security/manager/ssl/TransportSecurityInfo.cpp
@@ -429,7 +429,12 @@ TransportSecurityInfo::Read(nsIObjectInputStream* stream)
if (NS_FAILED(rv)) {
return rv;
}
- mSSLStatus = BitwiseCast<nsSSLStatus*, nsISupports*>(supports.get());
+ nsCOMPtr<nsISSLStatus> castGuard(do_QueryInterface(supports));
+ if (castGuard) {
+ mSSLStatus = BitwiseCast<nsSSLStatus*, nsISSLStatus*>(castGuard.get());
+ } else {
+ mSSLStatus = nullptr;
+ }
nsCOMPtr<nsISupports> failedCertChainSupports;
rv = NS_ReadOptionalObject(stream, true, getter_AddRefs(failedCertChainSupports));
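
TransportSecurityInfo::Read no longer BitwiseCasts the freshly deserialized nsISupports straight to nsSSLStatus; it first QIs to nsISSLStatus and stores null when the object is of an unexpected type. The verify-then-downcast pattern, written as a ternary here where the hunk uses an if/else:

    // A deserialized stream can hand back any nsISupports; prove it really
    // implements nsISSLStatus before treating it as an nsSSLStatus.
    nsCOMPtr<nsISSLStatus> castGuard(do_QueryInterface(supports));
    mSSLStatus = castGuard
                   ? BitwiseCast<nsSSLStatus*, nsISSLStatus*>(castGuard.get())
                   : nullptr;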
diff --git a/security/manager/ssl/nsNSSCallbacks.cpp b/security/manager/ssl/nsNSSCallbacks.cpp
index 6bac59f51..daabca591 100644
--- a/security/manager/ssl/nsNSSCallbacks.cpp
+++ b/security/manager/ssl/nsNSSCallbacks.cpp
@@ -1277,7 +1277,6 @@ void HandshakeCallback(PRFileDesc* fd, void* client_data) {
infoObject->GetPort(),
versions.max);
- bool usesFallbackCipher = false;
SSLChannelInfo channelInfo;
rv = SSL_GetChannelInfo(fd, &channelInfo, sizeof(channelInfo));
MOZ_ASSERT(rv == SECSuccess);
@@ -1296,8 +1295,6 @@ void HandshakeCallback(PRFileDesc* fd, void* client_data) {
sizeof cipherInfo);
MOZ_ASSERT(rv == SECSuccess);
if (rv == SECSuccess) {
- usesFallbackCipher = channelInfo.keaType == ssl_kea_dh;
-
MOZ_ASSERT(infoObject->GetKEAUsed() == channelInfo.keaType);
if (infoObject->IsFullHandshake()) {
@@ -1372,15 +1369,6 @@ void HandshakeCallback(PRFileDesc* fd, void* client_data) {
} else {
state = nsIWebProgressListener::STATE_IS_SECURE |
nsIWebProgressListener::STATE_SECURE_HIGH;
- if (!usesFallbackCipher) {
- SSLVersionRange defVersion;
- rv = SSL_VersionRangeGetDefault(ssl_variant_stream, &defVersion);
- if (rv == SECSuccess && versions.max >= defVersion.max) {
- // we know this site no longer requires a fallback cipher
- ioLayerHelpers.removeInsecureFallbackSite(infoObject->GetHostName(),
- infoObject->GetPort());
- }
- }
}
if (status->HasServerCert()) {
diff --git a/security/manager/ssl/nsNSSCertificate.cpp b/security/manager/ssl/nsNSSCertificate.cpp
index 12fca5065..f6685e89a 100644
--- a/security/manager/ssl/nsNSSCertificate.cpp
+++ b/security/manager/ssl/nsNSSCertificate.cpp
@@ -1208,6 +1208,10 @@ void nsNSSCertList::destructorSafeDestroyNSSReference()
NS_IMETHODIMP
nsNSSCertList::AddCert(nsIX509Cert* aCert)
{
+ if (!aCert) {
+ return NS_ERROR_INVALID_ARG;
+ }
+
nsNSSShutDownPreventionLock locker;
if (isAlreadyShutDown()) {
return NS_ERROR_NOT_AVAILABLE;
@@ -1369,17 +1373,20 @@ nsNSSCertList::Read(nsIObjectInputStream* aStream)
nsCOMPtr<nsISupports> certSupports;
rv = aStream->ReadObject(true, getter_AddRefs(certSupports));
if (NS_FAILED(rv)) {
- break;
+ return rv;
}
nsCOMPtr<nsIX509Cert> cert = do_QueryInterface(certSupports);
+ if (!cert) {
+ return NS_ERROR_UNEXPECTED;
+ }
rv = AddCert(cert);
if (NS_FAILED(rv)) {
- break;
+ return rv;
}
}
- return rv;
+ return NS_OK;
}
NS_IMETHODIMP
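
nsNSSCertList deserialization gets stricter: AddCert now rejects a null certificate, and the read loop returns on the first failure, rejects deserialized objects that are not nsIX509Cert, and ends with an explicit NS_OK instead of returning whatever rv last held after a break. The loop body after the patch, with the old behavior noted inline (only the body is shown; the loop header is unchanged and not part of the hunk):

    nsCOMPtr<nsISupports> certSupports;
    rv = aStream->ReadObject(true, getter_AddRefs(certSupports));
    if (NS_FAILED(rv)) {
        return rv;                   // was: break
    }
    nsCOMPtr<nsIX509Cert> cert = do_QueryInterface(certSupports);
    if (!cert) {
        return NS_ERROR_UNEXPECTED;  // new: wrong-type object is an error
    }
    rv = AddCert(cert);
    if (NS_FAILED(rv)) {
        return rv;                   // was: break
    }
    // ...and after the loop:
    return NS_OK;                    // was: return rv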
diff --git a/security/manager/ssl/nsNSSIOLayer.cpp b/security/manager/ssl/nsNSSIOLayer.cpp
index 93fca396b..d2549c52d 100644
--- a/security/manager/ssl/nsNSSIOLayer.cpp
+++ b/security/manager/ssl/nsNSSIOLayer.cpp
@@ -1916,59 +1916,12 @@ nsConvertCANamesToStrings(const UniquePLArenaPool& arena, char** caNameStrings,
}
SECItem* dername;
- SECStatus rv;
- int headerlen;
- uint32_t contentlen;
- SECItem newitem;
int n;
char* namestring;
for (n = 0; n < caNames->nnames; n++) {
- newitem.data = nullptr;
dername = &caNames->names[n];
- rv = DER_Lengths(dername, &headerlen, &contentlen);
-
- if (rv != SECSuccess) {
- goto loser;
- }
-
- if (headerlen + contentlen != dername->len) {
- // This must be from an enterprise 2.x server, which sent
- // incorrectly formatted der without the outer wrapper of type and
- // length. Fix it up by adding the top level header.
- if (dername->len <= 127) {
- newitem.data = (unsigned char*) PR_Malloc(dername->len + 2);
- if (!newitem.data) {
- goto loser;
- }
- newitem.data[0] = (unsigned char) 0x30;
- newitem.data[1] = (unsigned char) dername->len;
- (void) memcpy(&newitem.data[2], dername->data, dername->len);
- } else if (dername->len <= 255) {
- newitem.data = (unsigned char*) PR_Malloc(dername->len + 3);
- if (!newitem.data) {
- goto loser;
- }
- newitem.data[0] = (unsigned char) 0x30;
- newitem.data[1] = (unsigned char) 0x81;
- newitem.data[2] = (unsigned char) dername->len;
- (void) memcpy(&newitem.data[3], dername->data, dername->len);
- } else {
- // greater than 256, better be less than 64k
- newitem.data = (unsigned char*) PR_Malloc(dername->len + 4);
- if (!newitem.data) {
- goto loser;
- }
- newitem.data[0] = (unsigned char) 0x30;
- newitem.data[1] = (unsigned char) 0x82;
- newitem.data[2] = (unsigned char) ((dername->len >> 8) & 0xff);
- newitem.data[3] = (unsigned char) (dername->len & 0xff);
- memcpy(&newitem.data[4], dername->data, dername->len);
- }
- dername = &newitem;
- }
-
namestring = CERT_DerNameToAscii(dername);
if (!namestring) {
// XXX - keep going until we fail to convert the name
@@ -1977,21 +1930,12 @@ nsConvertCANamesToStrings(const UniquePLArenaPool& arena, char** caNameStrings,
caNameStrings[n] = PORT_ArenaStrdup(arena.get(), namestring);
PR_Free(namestring);
if (!caNameStrings[n]) {
- goto loser;
+ return SECFailure;
}
}
-
- if (newitem.data) {
- PR_Free(newitem.data);
- }
}
return SECSuccess;
-loser:
- if (newitem.data) {
- PR_Free(newitem.data);
- }
- return SECFailure;
}
// Possible behaviors for choosing a cert for client auth.
diff --git a/security/manager/ssl/nsSTSPreloadList.errors b/security/manager/ssl/nsSTSPreloadList.errors
index 48a20f39b..1b7c90ede 100644
--- a/security/manager/ssl/nsSTSPreloadList.errors
+++ b/security/manager/ssl/nsSTSPreloadList.errors
@@ -1,16 +1,29 @@
0-1.party: could not connect to host
0.me.uk: did not receive HSTS header
00001.am: max-age too low: 129600
-00002.am: max-age too low: 129600
0005.com: could not connect to host
0005aa.com: could not connect to host
0005pay.com: did not receive HSTS header
+00100010.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00120012.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00130013.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00140014.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00150015.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00160016.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00180018.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00190019.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
00220022.net: could not connect to host
+00330033.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00440044.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00550055.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+00660066.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
007-preisvergleich.de: could not connect to host
+00770077.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
00778899.com: did not receive HSTS header
007kf.com: could not connect to host
007sascha.de: did not receive HSTS header
00880088.net: could not connect to host
+00990099.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
00wbf.com: could not connect to host
01100010011001010111001101110100.com: could not connect to host
013028.com: did not receive HSTS header
@@ -18,22 +31,25 @@
016098.com: did not receive HSTS header
016298.com: did not receive HSTS header
016328.com: did not receive HSTS header
-019328.com: did not receive HSTS header
+019328.com: could not connect to host
019398.com: did not receive HSTS header
+01smh.com: could not connect to host
020wifi.nl: could not connect to host
0222.mg: did not receive HSTS header
0222aa.com: could not connect to host
-023838.com: did not receive HSTS header
+023838.com: could not connect to host
028718.com: did not receive HSTS header
-029978.com: did not receive HSTS header
+029978.com: could not connect to host
029inno.com: could not connect to host
02dl.net: could not connect to host
+02smh.com: could not connect to host
03-09-2016.wedding: could not connect to host
+0311buy.cn: did not receive HSTS header
040fit.nl: did not receive HSTS header
040fitvitality.nl: did not receive HSTS header
048.ag: could not connect to host
+04sun.com: could not connect to host
050508.com: could not connect to host
-0513c.com: did not receive HSTS header
055268.com: did not receive HSTS header
066318.com: could not connect to host
066538.com: did not receive HSTS header
@@ -41,12 +57,14 @@
066928.com: could not connect to host
066938.com: could not connect to host
070709.net: could not connect to host
+07733.win: did not receive HSTS header
078805.com: did not receive HSTS header
078810.com: did not receive HSTS header
078820.com: did not receive HSTS header
078860.com: did not receive HSTS header
078890.com: did not receive HSTS header
081638.com: did not receive HSTS header
+083962.com: could not connect to host
086628.com: did not receive HSTS header
0c.eu: did not receive HSTS header
0cdn.ga: could not connect to host
@@ -72,13 +90,11 @@
0x90.fi: could not connect to host
0x90.in: could not connect to host
0xa.in: could not connect to host
-0xaa55.me: could not connect to host
0xb612.org: could not connect to host
0xcafec0.de: did not receive HSTS header
0xf00.ch: could not connect to host
1.0.0.1: max-age too low: 0
1000hats.com: did not receive HSTS header
-1000serien.com: could not connect to host
1001.best: could not connect to host
1001firms.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
100onrainkajino.com: could not connect to host
@@ -87,20 +103,29 @@
1022996493.rsc.cdn77.org: could not connect to host
1091.jp: could not connect to host
10gbit.ovh: could not connect to host
+10seos.com: did not receive HSTS header
10tacle.io: could not connect to host
10v2.com: did not receive HSTS header
10x.ooo: could not connect to host
+1100.so: could not connect to host
+110110110.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
1116pay.com: did not receive HSTS header
+112112112.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+113113113.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+118118118.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
11bt.cc: did not receive HSTS header
11recruitment.com.au: did not receive HSTS header
+11scc.com: did not receive HSTS header
120dayweightloss.com: could not connect to host
123110.com: could not connect to host
123movies.fyi: did not receive HSTS header
123share.org: did not receive HSTS header
123termpapers.com: could not connect to host
+123test.com: did not receive HSTS header
123test.de: did not receive HSTS header
123test.es: did not receive HSTS header
123test.fr: did not receive HSTS header
+123test.nl: did not receive HSTS header
126ium.moe: could not connect to host
127011-networks.ch: could not connect to host
12vpn.org: could not connect to host
@@ -112,12 +137,24 @@
1391kj.com: did not receive HSTS header
1396.cc: could not connect to host
1396.net: did not receive HSTS header
+1481481.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481481.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481482.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481482.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481483.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481483.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481485.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481485.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481486.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+1481486.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
1536.cf: could not connect to host
-160887.com: did not receive HSTS header
16164f.com: could not connect to host
163pwd.com: could not connect to host
+166166.com: could not connect to host
1689886.com: did not receive HSTS header
168bet9.com: could not connect to host
+168bo9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+168bo9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
168esb.com: could not connect to host
16deza.com: did not receive HSTS header
16packets.com: could not connect to host
@@ -126,18 +163,23 @@
173vpnv.com: could not connect to host
174.net.nz: did not receive HSTS header
174343.com: could not connect to host
+1750studios.com: could not connect to host
17hats.com: did not receive HSTS header
188522.com: did not receive HSTS header
18888msc.com: could not connect to host
1888zr.com: could not connect to host
188betwarriors.co.uk: could not connect to host
+188da.com: could not connect to host
188trafalgar.ca: did not receive HSTS header
+189dv.com: could not connect to host
1912x.com: could not connect to host
19216811.online: could not connect to host
1921958389.rsc.cdn77.org: could not connect to host
-195gm.com: did not receive HSTS header
+195gm.com: could not connect to host
1a-jva.de: could not connect to host
+1aim.com: did not receive HSTS header
1atic.com: could not connect to host
+1b1.pl: could not connect to host
1co-jp.net: did not receive HSTS header
1cover.com: could not connect to host
1day1ac.red: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -145,10 +187,12 @@
1gsoft.com: could not connect to host
1item.co.il: did not receive HSTS header
1k8b.com: could not connect to host
+1m.duckdns.org: could not connect to host
1nian.vip: could not connect to host
-1pw.ca: did not receive HSTS header
+1q2w.nl: could not connect to host
1q365a.com: could not connect to host
1s.tn: could not connect to host
+1salland.nl: could not connect to host
1st4abounce.co.uk: did not receive HSTS header
1stcapital.com.sg: did not receive HSTS header
1ststop.co.uk: did not receive HSTS header
@@ -157,21 +201,23 @@
1xcess.com: did not receive HSTS header
1years.cc: could not connect to host
2-cpu.de: could not connect to host
+200fcw.com: could not connect to host
2018.wales: could not connect to host
20188088.com: did not receive HSTS header
+2048-spiel.de: could not connect to host
2048game.co.uk: could not connect to host
206rc.net: max-age too low: 2592000
-208.es: did not receive HSTS header
+208.es: could not connect to host
20hs.cn: did not receive HSTS header
20zq.com: could not connect to host
+21.co.uk: did not receive HSTS header
21lg.co: could not connect to host
21stnc.com: could not connect to host
22bt.cc: did not receive HSTS header
+22scc.com: did not receive HSTS header
2333.press: could not connect to host
-2333666.xyz: did not receive HSTS header
-233ss.net: did not receive HSTS header
-245meadowvistaway.com: did not receive HSTS header
247a.co.uk: could not connect to host
+247exchange.com: could not connect to host
247quickbooks.com: did not receive HSTS header
2488.ch: did not receive HSTS header
249cq.com: could not connect to host
@@ -180,32 +226,35 @@
24kbet.com: could not connect to host
24pcr.com: could not connect to host
24sihu.com: could not connect to host
-2566335.xyz: could not connect to host
+2566335.xyz: did not receive HSTS header
256k.me: could not connect to host
+258da.com: did not receive HSTS header
25daysof.io: could not connect to host
-260887.com: did not receive HSTS header
27728522.com: could not connect to host
2859cc.com: could not connect to host
+286.com: did not receive HSTS header
+288da.com: did not receive HSTS header
29227.com: could not connect to host
+298da.com: did not receive HSTS header
2acbi-asso.fr: did not receive HSTS header
2b3b.com: could not connect to host
+2bad2c0.de: could not connect to host
2bitout.com: could not connect to host
2bizi.ru: could not connect to host
+2bouncy.com: did not receive HSTS header
2brokegirls.org: could not connect to host
2carpros.com: did not receive HSTS header
-2fl.me: did not receive HSTS header
-2gen.com: could not connect to host
+2fl.me: could not connect to host
2intermediate.co.uk: did not receive HSTS header
-2mir.com: could not connect to host
2or3.tk: could not connect to host
2smart4food.com: could not connect to host
-2ss.jp: could not connect to host
+2ss.jp: did not receive HSTS header
300651.ru: did not receive HSTS header
300mbmovie24.com: could not connect to host
300mbmovies4u.cc: could not connect to host
301.website: could not connect to host
302.nyc: could not connect to host
-30yearmortgagerates.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+30yearmortgagerates.net: could not connect to host
3133780x.com: did not receive HSTS header
314166.com: could not connect to host
314chan.org: could not connect to host
@@ -214,20 +263,22 @@
330.net: could not connect to host
338da.com: could not connect to host
33drugstore.com: could not connect to host
+33scc.com: did not receive HSTS header
341.mg: could not connect to host
34oztonic.eu: did not receive HSTS header
-3555500.com: could not connect to host
+3555500.com: did not receive HSTS header
3555aa.com: could not connect to host
35792.de: could not connect to host
360gradus.com: did not receive HSTS header
+360woodworking.com: could not connect to host
365.or.jp: could not connect to host
-365beautyworld.com: could not connect to host
365maya.com: did not receive HSTS header
368mibn.com: could not connect to host
3778vip.com: did not receive HSTS header
3778xl.com: could not connect to host
3839.ca: could not connect to host
38888msc.com: could not connect to host
+388da.com: could not connect to host
38blog.com: could not connect to host
38sihu.com: could not connect to host
3candy.com: could not connect to host
@@ -243,8 +294,9 @@
3sreporting.com: did not receive HSTS header
3vlnaeet.cz: could not connect to host
3wecommerce.com.br: could not connect to host
+3weekdietworks.com: did not receive HSTS header
3xx.link: could not connect to host
-40-grad.de: did not receive HSTS header
+40-grad.de: max-age too low: 2628000
4036aa.com: did not receive HSTS header
4036bb.com: did not receive HSTS header
4036cc.com: did not receive HSTS header
@@ -260,25 +312,18 @@
42entrepreneurs.fr: did not receive HSTS header
42ms.org: could not connect to host
42t.ru: could not connect to host
+439050.com: could not connect to host
439191.com: did not receive HSTS header
-440887.com: did not receive HSTS header
440hz-radio.de: did not receive HSTS header
440hz.radio: did not receive HSTS header
-442887.com: did not receive HSTS header
-443887.com: did not receive HSTS header
-444887.com: did not receive HSTS header
-4455software.com: could not connect to host
-445887.com: did not receive HSTS header
+4455software.com: did not receive HSTS header
+448da.com: did not receive HSTS header
44957.com: could not connect to host
+44scc.com: did not receive HSTS header
44sec.com: could not connect to host
4500.co.il: did not receive HSTS header
4679.space: did not receive HSTS header
-4706666.com: did not receive HSTS header
-4716666.com: did not receive HSTS header
-4726666.com: did not receive HSTS header
-4756666.com: did not receive HSTS header
-4786666.com: did not receive HSTS header
-478933.com: did not receive HSTS header
+478933.com: could not connect to host
47tech.com: could not connect to host
4997777.com: could not connect to host
4azino777.ru: did not receive HSTS header
@@ -294,32 +339,41 @@
4sqsu.eu: could not connect to host
4w-performers.link: could not connect to host
4web-hosting.com: could not connect to host
-4winds.pt: could not connect to host
+4winds.pt: did not receive HSTS header
5000yz.com: could not connect to host
-500103.com: could not connect to host
-500108.com: could not connect to host
+500103.com: did not receive HSTS header
+500108.com: did not receive HSTS header
+500fcw.com: could not connect to host
+50ma.xyz: could not connect to host
50millionablaze.org: could not connect to host
513vpn.net: could not connect to host
517vpn.cn: could not connect to host
518maicai.com: could not connect to host
-51aifuli.com: could not connect to host
+5214889.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+5214889.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
52b9.com: could not connect to host
52b9.net: could not connect to host
52kb.net: could not connect to host
52kb1.com: could not connect to host
-52neptune.com: could not connect to host
+52neptune.com: did not receive HSTS header
+5310899.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+5310899.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+5364.com: could not connect to host
540.co: did not receive HSTS header
5432.cc: did not receive HSTS header
54bf.com: could not connect to host
555fl.com: max-age too low: 129600
555xl.com: could not connect to host
55bt.cc: did not receive HSTS header
+55scc.com: did not receive HSTS header
56877.com: could not connect to host
56ct.com: could not connect to host
57aromas.com: did not receive HSTS header
+598598598.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
5chat.it: could not connect to host
5ece.de: could not connect to host
5piecesofadvice.com: could not connect to host
+5w5.la: did not receive HSTS header
605508.cc: could not connect to host
605508.com: could not connect to host
60ych.net: did not receive HSTS header
@@ -328,19 +382,20 @@
645ds.cn: did not receive HSTS header
645ds.com: did not receive HSTS header
64616e.xyz: could not connect to host
+64970.com: did not receive HSTS header
64bitgaming.de: could not connect to host
-64bitservers.net: could not connect to host
-660011.com: could not connect to host
-6677.us: did not receive HSTS header
+660011.com: did not receive HSTS header
+66205.net: did not receive HSTS header
+668da.com: did not receive HSTS header
67899876.com: did not receive HSTS header
-68277.me: could not connect to host
688da.com: could not connect to host
692b8c32.de: could not connect to host
69mentor.com: could not connect to host
69square.com: could not connect to host
6ird.com: did not receive HSTS header
+6w6.la: did not receive HSTS header
6z3.net: could not connect to host
-7045h.com: could not connect to host
+7045.com: could not connect to host
7183.org: could not connect to host
721av.com: could not connect to host
724go.com: could not connect to host
@@ -348,21 +403,25 @@
72ty.com: could not connect to host
72ty.net: could not connect to host
73223.com: did not receive HSTS header
-73info.com: did not receive HSTS header
+7570.com: could not connect to host
771122.tv: did not receive HSTS header
7717a.com: did not receive HSTS header
+772244.net: did not receive HSTS header
776573.net: did not receive HSTS header
7777av.co: could not connect to host
77890k.com: could not connect to host
+778da.com: did not receive HSTS header
77book.cn: could not connect to host
+788da.com: did not receive HSTS header
789zr.com: max-age too low: 86400
7f-wgg.cf: could not connect to host
7links.com.br: did not receive HSTS header
7nw.eu: could not connect to host
7thheavenrestaurant.com: could not connect to host
8.net.co: could not connect to host
+80036.com: could not connect to host
8003pay.com: could not connect to host
-808.lv: did not receive HSTS header
+808.lv: could not connect to host
808phone.net: could not connect to host
81818app.com: could not connect to host
81uc.com: could not connect to host
@@ -378,27 +437,44 @@
8722.com: did not receive HSTS header
87577.com: could not connect to host
88.to: could not connect to host
-887.ag: did not receive HSTS header
-8884553.com: could not connect to host
8887999.com: could not connect to host
8888av.co: could not connect to host
+8888esb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
888azino.com: did not receive HSTS header
888lu.co: could not connect to host
+888msc.vip: did not receive HSTS header
88d.com: could not connect to host
88laohu.cc: could not connect to host
88laohu.com: could not connect to host
+8901178.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8901178.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8910899.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8910899.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8917168.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8917168.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8917818.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8917818.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8951889.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8951889.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
8989k3.com: could not connect to host
+8992088.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+8992088.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
89955.com: could not connect to host
899699.com: did not receive HSTS header
89he.com: could not connect to host
8azino777.ru: did not receive HSTS header
8ballbombom.uk: could not connect to host
+8da2017.com: did not receive HSTS header
8da2018.com: could not connect to host
+8dabet.com: could not connect to host
8mpay.com: did not receive HSTS header
8t88.biz: could not connect to host
+8ung.online: could not connect to host
+8xx.bet: could not connect to host
+8xx.io: could not connect to host
+8xx888.com: could not connect to host
90smthng.com: could not connect to host
91-freedom.com: could not connect to host
-910kj.com: did not receive HSTS header
9118b.com: could not connect to host
911911.pw: could not connect to host
915ers.com: did not receive HSTS header
@@ -409,22 +485,27 @@
92bmh.com: did not receive HSTS header
9454.com: max-age too low: 86400
94cs.cn: did not receive HSTS header
+9500years.com: max-age too low: 0
95778.com: could not connect to host
960news.ca: could not connect to host
9617818.com: could not connect to host
9617818.net: could not connect to host
9651678.ru: could not connect to host
+9696178.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+9696178.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
9822.com: did not receive HSTS header
9822.info: did not receive HSTS header
987987.com: did not receive HSTS header
9906753.net: did not receive HSTS header
99511.fi: did not receive HSTS header
99buffets.com: could not connect to host
+9bingo.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
9iwan.net: did not receive HSTS header
9jadirect.com: could not connect to host
9point6.com: could not connect to host
9ss6.com: could not connect to host
9vies.ca: could not connect to host
+9won.kr: could not connect to host
a-intel.com: could not connect to host
a-ix.net: could not connect to host
a-plus.space: could not connect to host
@@ -434,42 +515,46 @@ a-theme.com: could not connect to host
a1-autopartsglasgow.com: could not connect to host
a1798.com: could not connect to host
a200k.xyz: did not receive HSTS header
+a2a.net: could not connect to host
a2c-co.net: could not connect to host
-a2it.gr: did not receive HSTS header
+a2it.gr: max-age too low: 0
a3workshop.swiss: could not connect to host
-a8q.org: could not connect to host
a9c.co: could not connect to host
+aa43d.cn: could not connect to host
+aa6688.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
aa7733.com: could not connect to host
aaeblog.com: did not receive HSTS header
aaeblog.net: did not receive HSTS header
aaeblog.org: did not receive HSTS header
aaoo.net: could not connect to host
aapp.space: could not connect to host
+aardvarksolutions.co.za: did not receive HSTS header
aariefhaafiz.com: could not connect to host
aaron-gustafson.com: did not receive HSTS header
-aaronburt.co.uk: could not connect to host
-aaronhorler.com.au: could not connect to host
aaronmcguire.me: did not receive HSTS header
aarvinproperties.com: could not connect to host
+ab-bauservice-berlin.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+abacus-events.co.uk: did not receive HSTS header
+abacustech.net: did not receive HSTS header
+abacustech.org: did not receive HSTS header
abareplace.com: did not receive HSTS header
abasky.net: could not connect to host
-abcdef.be: could not connect to host
abcdentalcare.com: did not receive HSTS header
abcdobebe.com: did not receive HSTS header
abchelp.net: did not receive HSTS header
abearofsoap.com: could not connect to host
abecodes.net: could not connect to host
abeontech.com: could not connect to host
+aberdeenalmeras.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
abi-fvs.de: could not connect to host
abigailstark.com: could not connect to host
abilitylist.org: did not receive HSTS header
-abilma.com: could not connect to host
abioniere.de: could not connect to host
ablogagency.net: could not connect to host
abloop.com: could not connect to host
abmahnhelfer.de: did not receive HSTS header
abnarnro.com: could not connect to host
-abolition.co: could not connect to host
+abolition.co: did not receive HSTS header
abosav.com: did not receive HSTS header
abou.to: could not connect to host
about.ge: did not receive HSTS header
@@ -477,7 +562,8 @@ aboutassistedliving.org: did not receive HSTS header
aboutmyip.info: did not receive HSTS header
aboutmyproperty.ca: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
aboutyou-deals.de: could not connect to host
-abracadabra.co.jp: max-age too low: 0
+abraxan.pro: could not connect to host
+absimple.ca: did not receive HSTS header
absinthium.ch: could not connect to host
absolutewaterproofingsolutions.com: did not receive HSTS header
abstractbarista.com: could not connect to host
@@ -486,7 +572,8 @@ abt.de: did not receive HSTS header
abtom.de: did not receive HSTS header
abury.fr: did not receive HSTS header
abury.me: did not receive HSTS header
-abyssgaming.eu: did not receive HSTS header
+abyssgaming.eu: could not connect to host
+ac-epmservices.com: could not connect to host
ac.milan.it: did not receive HSTS header
acabadosboston.com: could not connect to host
academialowcost.com.br: did not receive HSTS header
@@ -496,6 +583,7 @@ acadianapatios.com: did not receive HSTS header
acai51.net: could not connect to host
acaonegocios.com.br: could not connect to host
acbc.ie: max-age too low: 0
+accadoro.it: did not receive HSTS header
accbay.com: could not connect to host
accelerate.network: could not connect to host
accelerole.com: did not receive HSTS header
@@ -504,7 +592,7 @@ accelight.jp: did not receive HSTS header
access-sofia.org: did not receive HSTS header
accolade.com.br: could not connect to host
accoun.technology: could not connect to host
-accounts-p.com: could not connect to host
+accounts-p.com: did not receive HSTS header
accountsuspended.club: could not connect to host
accwing.com: could not connect to host
aceadvisory.biz: did not receive HSTS header
@@ -520,7 +608,6 @@ acheritage.co.uk: did not receive HSTS header
achmadfamily.com: could not connect to host
achow101.com: did not receive HSTS header
achterhoekseveiligheidsbeurs.nl: could not connect to host
-achterstieg.dedyn.io: could not connect to host
acisonline.net: did not receive HSTS header
acksoft.fr: did not receive HSTS header
acksoftdemo.fr: did not receive HSTS header
@@ -528,9 +615,11 @@ acoffeeshops.com: could not connect to host
acorns.com: did not receive HSTS header
acpinformatique.fr: could not connect to host
acr.im: could not connect to host
+acraft.org: could not connect to host
acrepairdrippingsprings.com: could not connect to host
acritelli.com: did not receive HSTS header
acrossgw.com: could not connect to host
+acsihostingsolutions.com: did not receive HSTS header
acslimited.co.uk: did not receive HSTS header
actilove.ch: could not connect to host
actiontowingroundrock.com: could not connect to host
@@ -544,8 +633,9 @@ actu-film.com: max-age too low: 0
actu-medias.com: could not connect to host
actualite-videos.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
acuve.jp: could not connect to host
-acy.com: could not connect to host
+acyume.com: did not receive HSTS header
ad-disruptio.fr: could not connect to host
+ad13.in: could not connect to host
ada.is: max-age too low: 2592000
adajwells.me: could not connect to host
adambryant.ca: could not connect to host
@@ -553,14 +643,12 @@ adamcoffee.net: could not connect to host
adamdixon.co.uk: could not connect to host
adamricheimer.com: could not connect to host
adamsfoundationrepair.com: did not receive HSTS header
-adamwallington.co.uk: could not connect to host
adamwk.com: did not receive HSTS header
adastra.re: could not connect to host
adblock.ovh: could not connect to host
adboos.com: could not connect to host
addaxpetroleum.com: could not connect to host
addcrazy.com: did not receive HSTS header
-addiko.net: could not connect to host
addvocate.com: could not connect to host
adec-emsa.ae: could not connect to host
adelaides.com: did not receive HSTS header
@@ -569,7 +657,6 @@ adelinlydia-coach.com: did not receive HSTS header
adequatetechnology.com: could not connect to host
aderal.io: could not connect to host
adesa-asesoria.com: did not receive HSTS header
-adevel.eu: could not connect to host
adfa-1.com: could not connect to host
adhigamindia.com: could not connect to host
adhosting.nl: did not receive HSTS header
@@ -581,10 +668,8 @@ adlerweb.info: did not receive HSTS header
admin-forms.co.uk: did not receive HSTS header
admin-numerique.com: did not receive HSTS header
admin.google.com: did not receive HSTS header (error ignored - included regardless)
-admins.tech: could not connect to host
adminwerk.com: did not receive HSTS header
adminwerk.net: did not receive HSTS header
-admirable.one: did not receive HSTS header
admiral.dp.ua: did not receive HSTS header
admitcard.co.in: could not connect to host
admsel.ec: could not connect to host
@@ -593,6 +678,7 @@ adoge.me: could not connect to host
adonairelogios.com.br: could not connect to host
adoniscabaret.co.uk: could not connect to host
adopteunsiteflash.com: could not connect to host
+adora-illustrations.fr: did not receive HSTS header
adprospb.com: did not receive HSTS header
adquisitio.de: could not connect to host
adquisitio.in: could not connect to host
@@ -615,7 +701,7 @@ advancedplasticsurgerycenter.com: did not receive HSTS header
advancedseotool.it: did not receive HSTS header
advancedstudio.ro: could not connect to host
advancedwriters.com: could not connect to host
-advantagemechanicalinc.com: did not receive HSTS header
+advantagemechanicalinc.com: could not connect to host
adventistdeploy.org: could not connect to host
adventures.is: did not receive HSTS header
adver.top: could not connect to host
@@ -623,6 +709,7 @@ advertisemant.com: could not connect to host
adviespuntklokkenluiders.nl: could not connect to host
adzie.xyz: could not connect to host
adzuna.co.uk: did not receive HSTS header
+aegialis.com: did not receive HSTS header
aelurus.com: could not connect to host
aemoria.com: could not connect to host
aeon.wiki: could not connect to host
@@ -634,6 +721,7 @@ aes256.ru: could not connect to host
aesthetics-blog.com: did not receive HSTS header
aesym.de: could not connect to host
aether.pw: could not connect to host
+aethonan.pro: could not connect to host
aevpn.net: could not connect to host
aevpn.org: could not connect to host
aeyoun.com: did not receive HSTS header
@@ -664,12 +752,15 @@ agenciagriff.com: did not receive HSTS header
agencymanager.be: could not connect to host
agentseeker.ca: could not connect to host
agevio.com: could not connect to host
+agiairini.cz: could not connect to host
+agilebits.net: could not connect to host
agileecommerce.com.br: could not connect to host
agingstop.net: could not connect to host
-agonswim.com: did not receive HSTS header
+agonswim.com: could not connect to host
agoravm.tk: could not connect to host
agowa.eu: did not receive HSTS header
agowa338.de: did not receive HSTS header
+agracan.com: could not connect to host
agrafix.design: did not receive HSTS header
agrarking.com: could not connect to host
agrias.com.br: did not receive HSTS header
@@ -682,7 +773,8 @@ agroyard.com.ua: could not connect to host
agtv.com.br: did not receive HSTS header
ahabingo.com: did not receive HSTS header
ahelos.tk: could not connect to host
-ahlz.sk: could not connect to host
+ahiru3.com: did not receive HSTS header
+ahkubiak.ovh: could not connect to host
aholic.co: did not receive HSTS header
ahoynetwork.com: could not connect to host
ahri.ovh: could not connect to host
@@ -692,17 +784,18 @@ ahwatukeefoothillsmontessori.com: did not receive HSTS header
ai1989.com: could not connect to host
aibaoyou.com: could not connect to host
aibsoftware.mx: could not connect to host
-aicial.com: did not receive HSTS header
+aicial.com: could not connect to host
aicial.com.au: could not connect to host
aid-web.ch: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
aidanwoods.com: did not receive HSTS header
aide-admin.com: did not receive HSTS header
+aide-valais.ch: could not connect to host
aidikofflaw.com: did not receive HSTS header
aiesecarad.ro: could not connect to host
aifreeze.ru: could not connect to host
aify.eu: could not connect to host
aikenorganics.com: could not connect to host
-aim-consultants.com: could not connect to host
+aim-consultants.com: did not receive HSTS header
aimerworld.com: did not receive HSTS header
aimrom.org: could not connect to host
ainrb.com: could not connect to host
@@ -710,16 +803,20 @@ aioboot.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERRO
aip-marine.com: could not connect to host
aiphyron.com: could not connect to host
aiponne.com: could not connect to host
+airbly.com: did not receive HSTS header
airconsalberton.co.za: did not receive HSTS header
airconsboksburg.co.za: did not receive HSTS header
airconsfourways.co.za: did not receive HSTS header
airconsmidrand.co.za: did not receive HSTS header
-airconssandton.co.za: did not receive HSTS header
airedaleterrier.com.br: could not connect to host
airfax.io: could not connect to host
+airhorn.de: did not receive HSTS header
+airicy.com: did not receive HSTS header
airlea.com: could not connect to host
airlinecheckins.com: did not receive HSTS header
+airlinesettlement.com: did not receive HSTS header
airmazinginflatables.com: could not connect to host
+airportlimototoronto.com: did not receive HSTS header
airproto.com: did not receive HSTS header
airsick.guide: did not receive HSTS header
airtimefranchise.com: did not receive HSTS header
@@ -728,8 +825,9 @@ aisle3.space: could not connect to host
aiticon.de: did not receive HSTS header
aivene.com: could not connect to host
aiw-thkoeln.online: could not connect to host
+aixxe.net: did not receive HSTS header
ajetaci.cz: could not connect to host
-ajibot.com: did not receive HSTS header
+ajibot.com: could not connect to host
ajmahal.com: could not connect to host
ajouin.com: could not connect to host
ajw-group.com: did not receive HSTS header
@@ -744,16 +842,15 @@ akhras.at: did not receive HSTS header
akiba-server.info: could not connect to host
akihiro.xyz: could not connect to host
akita-stream.com: could not connect to host
-akj.io: did not receive HSTS header
akkadia.cc: could not connect to host
+akkeylab.com: could not connect to host
akoch.net: could not connect to host
akombakom.net: could not connect to host
-akoww.de: could not connect to host
akracing.se: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
akselimedia.fi: could not connect to host
akstudentsfirst.org: could not connect to host
aktan.com.br: could not connect to host
-aktivist.in: did not receive HSTS header
+aktivist.in: could not connect to host
akul.co.in: could not connect to host
al-f.net: could not connect to host
al-shami.net: could not connect to host
@@ -770,6 +867,7 @@ alauda-home.de: could not connect to host
alaundeil.xyz: could not connect to host
albanien.guide: could not connect to host
alberguecimballa.es: could not connect to host
+albertbogdanowicz.pl: could not connect to host
albertify.xyz: could not connect to host
albertonplumber24-7.co.za: did not receive HSTS header
albertopimienta.com: did not receive HSTS header
@@ -780,20 +878,23 @@ alcazaar.com: could not connect to host
alchemia.co.il: did not receive HSTS header
alcorao.org: could not connect to host
aldes.co.za: did not receive HSTS header
+aldred.cloud: could not connect to host
aleax.me: could not connect to host
alecvannoten.be: did not receive HSTS header
+aledg.cl: could not connect to host
alenan.org: could not connect to host
aleph.land: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
alertaenlinea.gov: did not receive HSTS header
alessandro.pw: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+alessandroonline.com.br: did not receive HSTS header
+alessandroz.ddns.net: could not connect to host
alessandroz.pro: could not connect to host
alethearose.com: did not receive HSTS header
alexandernorth.ch: could not connect to host
alexandre.sh: did not receive HSTS header
-alexandros.io: could not connect to host
alexdodge.ca: did not receive HSTS header
-alexei.su: could not connect to host
alexfisherhealth.com.au: did not receive HSTS header
+alexgebhard.com: could not connect to host
alexhaydock.co.uk: did not receive HSTS header
alexischaussy.xyz: could not connect to host
alexismeza.com: could not connect to host
@@ -815,18 +916,19 @@ alghaib.com: could not connect to host
alibababee.com: did not receive HSTS header
alicialab.org: could not connect to host
alien.bz: did not receive HSTS header
-alignrs.com: could not connect to host
-alikulov.me: did not receive HSTS header
alilialili.ga: could not connect to host
alinemaciel.adm.br: could not connect to host
+alinode.com: could not connect to host
+alistairholland.me: did not receive HSTS header
alistairpialek.com: max-age too low: 86400
+alisync.com: could not connect to host
alittlebitcheeky.com: did not receive HSTS header
aliwebstore.com: could not connect to host
aljammaz.holdings: could not connect to host
aljmz.com: did not receive HSTS header
alkami.com: max-age too low: 0
alkamitech.com: max-age too low: 0
-alkel.info: did not receive HSTS header
+alkel.info: could not connect to host
all-subtitles.com: could not connect to host
all.tf: could not connect to host
all4os.com: did not receive HSTS header
@@ -835,9 +937,9 @@ alldaymonitoring.com: could not connect to host
alldm.ru: could not connect to host
allegro-inc.com: did not receive HSTS header
allerbestefreunde.de: did not receive HSTS header
-allfreelancers.su: did not receive HSTS header
allgrass.es: did not receive HSTS header
allgrass.net: did not receive HSTS header
+allhard.org: could not connect to host
alliance-compacts.com: did not receive HSTS header
allinnote.com: could not connect to host
allinonecyprus.com: did not receive HSTS header
@@ -860,12 +962,15 @@ allstorebrasil.com.br: could not connect to host
alltheducks.com: max-age too low: 43200
allthingsblogging.com: could not connect to host
allthingsfpl.com: could not connect to host
-allvips.ru: could not connect to host
+alltubedownload.net: could not connect to host
almagalla.com: could not connect to host
almatinki.com: could not connect to host
aloalabs.com: did not receive HSTS header
alocato.com: could not connect to host
+alorenzi.eu: did not receive HSTS header
+alp.net.cn: could not connect to host
alparque.com: did not receive HSTS header
+alpe-d-or.dyn-o-saur.com: could not connect to host
alpha.irccloud.com: could not connect to host
alphabit-secure.com: could not connect to host
alphabuild.io: could not connect to host
@@ -887,33 +992,31 @@ altered.network: could not connect to host
altfire.ca: could not connect to host
altiacaselight.com: could not connect to host
altitudemoversdenver.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-altoneum.com: could not connect to host
+altoneum.com: did not receive HSTS header
altporn.xyz: could not connect to host
altruistgroup.net: max-age too low: 300
aluminium-scaffolding.co.uk: could not connect to host
-alunjam.es: could not connect to host
-alunonaescola.com.br: could not connect to host
+alunjam.es: did not receive HSTS header
+alunonaescola.com.br: did not receive HSTS header
aluoblog.pw: could not connect to host
aluoblog.top: could not connect to host
alusta.co: could not connect to host
+alvis-audio.com: did not receive HSTS header
alvn.ga: could not connect to host
am8888.top: could not connect to host
amaderelectronics.com: max-age too low: 2592000
amadilo.de: could not connect to host
amadoraslindas.com: could not connect to host
amaforums.org: did not receive HSTS header
-amagical.net: did not receive HSTS header
amandaonishi.com: could not connect to host
amaranthus.com.ph: could not connect to host
-amartinz.at: could not connect to host
amateri.com: could not connect to host
-amateurvoicetalent.com: could not connect to host
amatzen.dk: did not receive HSTS header
amavis.org: did not receive HSTS header
amazing-gaming.fr: could not connect to host
amazingbouncycastles.co.uk: did not receive HSTS header
amazingfloridagulfhomes.com: did not receive HSTS header
-ambiancestudio.ro: could not connect to host
+ambiancestudio.ro: did not receive HSTS header
ambrosius.io: could not connect to host
amcvega.com: did not receive HSTS header
amdouglas.uk: could not connect to host
@@ -927,6 +1030,7 @@ americanoutlawjeepparts.com: did not receive HSTS header
americansforcommunitydevelopment.org: did not receive HSTS header
americansportsinstitute.org: did not receive HSTS header
americanworkwear.nl: did not receive HSTS header
+ameschristian.net: did not receive HSTS header
amethystcards.co.uk: could not connect to host
ameza.co.uk: could not connect to host
ameza.com.mx: could not connect to host
@@ -942,10 +1046,11 @@ amilx.org: could not connect to host
amimoto-ami.com: did not receive HSTS header
amin.ga: did not receive HSTS header
amin.one: could not connect to host
+amisharingstuff.com: could not connect to host
amishsecurity.com: could not connect to host
amitse.com: did not receive HSTS header
amitube.com: did not receive HSTS header
-amleeds.co.uk: could not connect to host
+amleeds.co.uk: did not receive HSTS header
amlvfs.net: could not connect to host
ammoulianiapartments.com: did not receive HSTS header
amo-entreprise-et-commerce.fr: could not connect to host
@@ -956,7 +1061,7 @@ ampledesigners.com: could not connect to host
ampleinfographics.com: could not connect to host
amri.nl: did not receive HSTS header
amtentertainments.co.uk: could not connect to host
-amua.fr: could not connect to host
+amua.fr: did not receive HSTS header
amunoz.org: could not connect to host
amv-crm.ru: could not connect to host
anabol.nl: could not connect to host
@@ -964,7 +1069,7 @@ anacruz.es: did not receive HSTS header
anadoluefessk.org: did not receive HSTS header
anadoluefessporkulubu.org: could not connect to host
anagra.ms: could not connect to host
-anaiscoachpersonal.es: did not receive HSTS header
+anaiscoachpersonal.es: could not connect to host
anaisypirueta.es: did not receive HSTS header
anakros.me: could not connect to host
analangelsteen.com: could not connect to host
@@ -980,10 +1085,10 @@ anchorgrounds.com: did not receive HSTS header
anchorinmarinainc.com: did not receive HSTS header
ancient-gates.de: could not connect to host
ancientkarma.com: could not connect to host
+ancientnorth.com: did not receive HSTS header
andbraiz.com: did not receive HSTS header
andere-gedanken.net: max-age too low: 10
anderslind.dk: could not connect to host
-andoms.fi: did not receive HSTS header
andreagobetti.com: did not receive HSTS header
andreas-kluge.eu: could not connect to host
andreasanti.net: did not receive HSTS header
@@ -991,17 +1096,21 @@ andreasbasurto.com: could not connect to host
andreasbreitenlohner.de: max-age too low: 600000
andreasfritz-fotografie.de: could not connect to host
andreaskluge.eu: could not connect to host
+andreasr.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
andreastoneman.com: could not connect to host
-andrei-coman.com: could not connect to host
+andrei-coman.com: did not receive HSTS header
andreigec.net: did not receive HSTS header
+andrejbenz.com: could not connect to host
+andrepicard.de: could not connect to host
andrerose.ca: did not receive HSTS header
-andrew.fi: could not connect to host
-andrewbroekman.com: did not receive HSTS header
+andrespaz.com: could not connect to host
+andrewbroekman.com: could not connect to host
andrewdavidwong.com: did not receive HSTS header
andrewdaws.co: could not connect to host
andrewdaws.info: could not connect to host
andrewdaws.me: could not connect to host
andrewdaws.tv: could not connect to host
+andrewhowden.com: did not receive HSTS header
andrewmichaud.beer: could not connect to host
andrewrdaws.com: could not connect to host
andrewregan.me: could not connect to host
@@ -1016,9 +1125,8 @@ androidsphone.com: did not receive HSTS header
androled.fr: max-age too low: 5184000
andronika.net: could not connect to host
androoz.se: could not connect to host
-anduril.de: did not receive HSTS header
-anduril.eu: did not receive HSTS header
andyclark.io: could not connect to host
+andycloud.dynu.net: could not connect to host
andycraftz.eu: did not receive HSTS header
andymartin.cc: could not connect to host
andymelichar.com: max-age too low: 0
@@ -1034,14 +1142,19 @@ angeloroberto.ch: did not receive HSTS header
angeloventuri.com: did not receive HSTS header
angervillelorcher.fr: did not receive HSTS header
anghami.com: did not receive HSTS header
+anglertanke.de: could not connect to host
anglictinatabor.cz: could not connect to host
-angrut.com: did not receive HSTS header
+angrut.com: could not connect to host
angry-monk.com: could not connect to host
angrydragonproductions.com: could not connect to host
+angrylab.com: did not receive HSTS header
angryroute.com: could not connect to host
-animal-nature-human.com: could not connect to host
+anguiao.com: did not receive HSTS header
+aniaimichal.eu: could not connect to host
+anim.ee: could not connect to host
animalnet.de: max-age too low: 7776000
animalstropic.com: could not connect to host
+animatelluris.nl: max-age too low: 300
anime1.top: could not connect to host
anime1video.tk: could not connect to host
animeday.ml: could not connect to host
@@ -1059,16 +1172,22 @@ ankakaak.com: could not connect to host
ankaraprofesyonelnakliyat.com: did not receive HSTS header
ankaraprofesyonelnakliyat.com.tr: did not receive HSTS header
ankitha.in: max-age too low: 0
+ankya9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+anlp.top: could not connect to host
annabellaw.com: did not receive HSTS header
annahmeschluss.de: did not receive HSTS header
+annangela.moe: did not receive HSTS header
annarbor.group: did not receive HSTS header
annetaan.fi: could not connect to host
annevankesteren.com: could not connect to host
annevankesteren.org: could not connect to host
+annicascakes.nl: could not connect to host
+annrusnak.com: did not receive HSTS header
annsbouncycastles.com: could not connect to host
anomaly.ws: did not receive HSTS header
anonboards.com: could not connect to host
anonrea.ch: could not connect to host
+anonukradio.org: could not connect to host
anonymo.co.uk: could not connect to host
anonymo.uk: could not connect to host
anonymousstatecollegelulzsec.com: could not connect to host
@@ -1085,6 +1204,7 @@ antecim.fr: could not connect to host
antenasmundosat.com.br: did not receive HSTS header
anthenor.co.uk: could not connect to host
anthony-rouanet.com: could not connect to host
+anthonyaires.com: did not receive HSTS header
anthonyavon.com: could not connect to host
anthonyloop.com: did not receive HSTS header
anthro.id: did not receive HSTS header
@@ -1093,6 +1213,7 @@ antimatiere.space: could not connect to host
antimine.kr: could not connect to host
antipa.ch: could not connect to host
antirayapmalang.com: did not receive HSTS header
+antocom.com: could not connect to host
antoine-roux.fr: did not receive HSTS header
antoinebetas.be: max-age too low: 0
antoined.fr: did not receive HSTS header
@@ -1100,6 +1221,7 @@ antoinemary.io: did not receive HSTS header
antoineschaller.ch: did not receive HSTS header
antoniomarques.eu: did not receive HSTS header
antoniorequena.com.ve: could not connect to host
+antons.io: did not receive HSTS header
antraxx.ee: could not connect to host
antscript.com: did not receive HSTS header
anttitenhunen.com: could not connect to host
@@ -1119,7 +1241,6 @@ aooobo.com: could not connect to host
aov.io: could not connect to host
aovcentrum.nl: did not receive HSTS header
aozora.moe: could not connect to host
-apachelounge.com: did not receive HSTS header
apadrinaunolivo.org: did not receive HSTS header
apaginastore.com.br: could not connect to host
apeasternpower.com: could not connect to host
@@ -1128,6 +1249,7 @@ aperture-laboratories.science: did not receive HSTS header
api.mega.co.nz: could not connect to host
apibot.de: could not connect to host
apience.com: did not receive HSTS header
+apila.us: could not connect to host
apis.blue: could not connect to host
apis.google.com: did not receive HSTS header (error ignored - included regardless)
apis.world: could not connect to host
@@ -1139,8 +1261,10 @@ apl2bits.net: did not receive HSTS header
apm.com.tw: did not receive HSTS header
apmg-certified.com: did not receive HSTS header
apmg-cyber.com: did not receive HSTS header
+apmpproject.org: did not receive HSTS header
apnakliyat.com: did not receive HSTS header
apolloyl.com: could not connect to host
+apollyon.work: could not connect to host
aponkral.site: could not connect to host
aponkralsunucu.com: could not connect to host
aponow.de: did not receive HSTS header
@@ -1150,6 +1274,7 @@ app-arena.com: did not receive HSTS header
app.manilla.com: could not connect to host
apparels24.com: did not receive HSTS header
appart.ninja: could not connect to host
+appcoins.io: did not receive HSTS header
appdb.cc: did not receive HSTS header
appdrinks.com: could not connect to host
appeldorn.me: did not receive HSTS header
@@ -1158,7 +1283,7 @@ appimlab.it: could not connect to host
apple-watch-zubehoer.de: could not connect to host
apple.ax: could not connect to host
applejacks-bouncy-castles.co.uk: could not connect to host
-applewatch.co.nz: did not receive HSTS header
+applewatch.co.nz: could not connect to host
applez.xyz: could not connect to host
appliancerepairlosangeles.com: did not receive HSTS header
applic8.com: did not receive HSTS header
@@ -1166,16 +1291,21 @@ apply55gx.com: could not connect to host
appointed.at: did not receive HSTS header
appraisal-comps.com: could not connect to host
appreciationkards.com: did not receive HSTS header
+apprenticeships.gov: did not receive HSTS header
approlys.fr: did not receive HSTS header
apps-for-fishing.com: could not connect to host
apps4all.sytes.net: could not connect to host
appsbystudio.co.uk: did not receive HSTS header
appsdash.io: could not connect to host
+appson.co.uk: did not receive HSTS header
apptoutou.com: could not connect to host
appuro.com: did not receive HSTS header
appxcrypto.com: did not receive HSTS header
+aproposcomputing.com: could not connect to host
aprpullmanportermuseum.org: did not receive HSTS header
+aprr.org: could not connect to host
aptitude9.com: could not connect to host
+aqilacademy.com.au: could not connect to host
aqqrate.com: could not connect to host
aquariumaccessories.shop: could not connect to host
aquilaguild.com: could not connect to host
@@ -1187,8 +1317,9 @@ arabsexi.info: could not connect to host
aradulconteaza.ro: could not connect to host
aran.me.uk: could not connect to host
aranel.me: could not connect to host
+arawaza.info: could not connect to host
arboineuropa.nl: did not receive HSTS header
-arbu.eu: max-age too low: 2419200
+arboleda-hurtado.com: could not connect to host
arcadiaeng.com: did not receive HSTS header
arcbit.io: could not connect to host
archii.ca: did not receive HSTS header
@@ -1199,33 +1330,37 @@ archsec.info: could not connect to host
arckr.com: could not connect to host
arctica.io: did not receive HSTS header
ardao.me: could not connect to host
-ardor.noip.me: could not connect to host
-ardorlabs.se: could not connect to host
+ardorlabs.se: did not receive HSTS header
area3.org: could not connect to host
areallyneatwebsite.com: could not connect to host
-arent.kz: could not connect to host
+arent.kz: did not receive HSTS header
arenzanaphotography.com: could not connect to host
-arethsu.se: could not connect to host
+areqgaming.com: could not connect to host
arewedubstepyet.com: did not receive HSTS header
areyouever.me: could not connect to host
argennon.xyz: could not connect to host
argh.io: could not connect to host
arguggi.co.uk: could not connect to host
+ariaartgallery.com: did not receive HSTS header
ariacreations.net: did not receive HSTS header
-ariege-pyrenees.net: did not receive HSTS header
arifburhan.online: could not connect to host
arifp.me: could not connect to host
arinflatablefun.co.uk: could not connect to host
arislight.com: could not connect to host
-aristocrates.co: could not connect to host
+aristilabs.com: did not receive HSTS header
aristocratps.com: did not receive HSTS header
arithxu.com: did not receive HSTS header
+arizer.com: did not receive HSTS header
arka.gq: did not receive HSTS header
arknodejs.com: could not connect to host
arlen.io: could not connect to host
arlen.se: could not connect to host
+arlet.click: could not connect to host
arlingtonwine.net: could not connect to host
+arm-host.com: did not receive HSTS header
+arm.gov: could not connect to host
armazemdaminiatura.com.br: could not connect to host
+armeni-jewellery.gr: did not receive HSTS header
armenians.online: could not connect to host
armingrodon.de: did not receive HSTS header
armodec.com: did not receive HSTS header
@@ -1236,6 +1371,7 @@ armory.supplies: could not connect to host
armsday.com: could not connect to host
armyofbane.com: did not receive HSTS header
armytricka.cz: did not receive HSTS header
+arnaudminable.net: could not connect to host
arne-petersen.net: did not receive HSTS header
arnesolutions.com: could not connect to host
aromaclub.nl: did not receive HSTS header
@@ -1251,24 +1387,16 @@ arrowgrove.com: could not connect to host
ars-design.net: could not connect to host
arsenal.ru: could not connect to host
arsk1.com: could not connect to host
-arswb.men: could not connect to host
art2web.net: could not connect to host
+artansoft.com: could not connect to host
artaronquieres.com: did not receive HSTS header
artartefatos.com.br: could not connect to host
artbytik.ru: did not receive HSTS header
-artea.ga: could not connect to host
-arteaga.co.uk: could not connect to host
-arteaga.eu: could not connect to host
-arteaga.me: could not connect to host
-arteaga.tech: could not connect to host
-arteaga.uk: could not connect to host
-arteaga.xyz: could not connect to host
artegusto.ru: did not receive HSTS header
artemicroway.com.br: could not connect to host
arteseideias.com.pt: did not receive HSTS header
artesupra.com: did not receive HSTS header
arthan.me: could not connect to host
-arthur.cn: could not connect to host
arti-group.ml: max-age too low: 2592000
articaexports.com: could not connect to host
artifex21.com: did not receive HSTS header
@@ -1297,9 +1425,10 @@ asana.studio: did not receive HSTS header
asasuou.pw: could not connect to host
asc16.com: could not connect to host
aschaefer.net: could not connect to host
-asciitable.tips: could not connect to host
+ascii.moe: could not connect to host
asdpress.cn: could not connect to host
asepms.com: max-age too low: 7776000
+asge-handel.de: did not receive HSTS header
ashlane-cottages.com: could not connect to host
ashleakunowski.com: could not connect to host
ashleyadum.com: could not connect to host
@@ -1308,17 +1437,19 @@ ashleymedway.com: could not connect to host
asian-archi.com.tw: did not receive HSTS header
asianbet77.co: did not receive HSTS header
asianbet77.net: did not receive HSTS header
-asianodor.com: could not connect to host
-asiesvenezuela.com: did not receive HSTS header
+asiesvenezuela.com: could not connect to host
asisee.co.il: could not connect to host
+asisee.photography: could not connect to host
ask.pe: could not connect to host
askfit.cz: did not receive HSTS header
askmagicconch.com: could not connect to host
asm-x.com: could not connect to host
asmik-armenie.com: did not receive HSTS header
+asmm.cc: did not receive HSTS header
asmui.ga: could not connect to host
-asmui.ml: could not connect to host
+asmui.ml: did not receive HSTS header
asoftwareco.com: did not receive HSTS header
+aspargesgaarden.no: could not connect to host
asphaltfruehling.de: could not connect to host
asral7.com: could not connect to host
asryflorist.com: could not connect to host
@@ -1330,12 +1461,14 @@ asset-alive.com: did not receive HSTS header
asset-alive.net: could not connect to host
assetsupervision.com: could not connect to host
assindia.nl: could not connect to host
+assistance-personnes-agees.ch: could not connect to host
assistcart.com: could not connect to host
assurancesmons.be: did not receive HSTS header
astaninki.com: could not connect to host
asthon.cn: could not connect to host
astraalivankila.net: could not connect to host
astral.gq: did not receive HSTS header
+astral.org.pl: could not connect to host
astrath.net: could not connect to host
astrea-voetbal-groningen.nl: could not connect to host
astrolpost.com: could not connect to host
@@ -1346,10 +1479,8 @@ astutr.co: could not connect to host
asuhe.cc: could not connect to host
asuhe.win: did not receive HSTS header
asuhe.xyz: could not connect to host
-async.be: could not connect to host
+async.be: max-age too low: 0
at1.co: could not connect to host
-ataber.pw: could not connect to host
-atacadodesandalias.com.br: could not connect to host
atacadooptico.com.br: could not connect to host
atavio.at: could not connect to host
atavio.ch: could not connect to host
@@ -1360,14 +1491,17 @@ atelier-rk.com: did not receive HSTS header
atelier-viennois-cannes.fr: could not connect to host
ateliernihongo.ch: did not receive HSTS header
ateliersantgervasi.com: did not receive HSTS header
-ath0.org: max-age too low: 600
+atg.soy: could not connect to host
athaliasoft.com: could not connect to host
athenelive.com: could not connect to host
athensbusinessresources.us: could not connect to host
+atheoryofchange.com: could not connect to host
athi.pl: did not receive HSTS header
athul.xyz: could not connect to host
atk.me: could not connect to host
atkdesign.pt: did not receive HSTS header
+atlantaspringroll.com: could not connect to host
+atlantichomes.com.au: could not connect to host
atlas-5.site: could not connect to host
atlas-staging.ml: could not connect to host
atlas.co: did not receive HSTS header
@@ -1382,24 +1516,27 @@ atop.io: could not connect to host
atracaosexshop.com.br: could not connect to host
atrevillot.com: could not connect to host
attic118.com: could not connect to host
+attilagyorffy.com: could not connect to host
attimidesigns.com: did not receive HSTS header
attogproductions.com: did not receive HSTS header
au-pair24.de: did not receive HSTS header
au.search.yahoo.com: max-age too low: 172800
+au2pb.net: could not connect to host
aubiosales.com: could not connect to host
aucubin.moe: could not connect to host
-audiense.com: did not receive HSTS header
+audioonly.stream: could not connect to host
audiovisualdevices.com.au: did not receive HSTS header
audividi.shop: did not receive HSTS header
aufmerksamkeitsstudie.com: could not connect to host
aufprise.de: did not receive HSTS header
-augaware.org: could not connect to host
+augaware.org: did not receive HSTS header
augenblicke-blog.de: could not connect to host
augias.org: could not connect to host
augix.net: could not connect to host
augrandinquisiteur.com: did not receive HSTS header
aujapan.ru: could not connect to host
auntieme.com: did not receive HSTS header
+aur.rocks: did not receive HSTS header
aurainfosec.com: did not receive HSTS header
aurainfosec.com.au: did not receive HSTS header
auraredeye.com: could not connect to host
@@ -1425,7 +1562,6 @@ auth.mail.ru: did not receive HSTS header
authenitech.com: did not receive HSTS header
authentication.io: could not connect to host
authint.com: could not connect to host
-authland.com: could not connect to host
author24.ru: did not receive HSTS header
authoritynutrition.com: did not receive HSTS header
authorsguild.in: did not receive HSTS header
@@ -1449,6 +1585,7 @@ autostock.me: could not connect to host
autostop-occasions.be: could not connect to host
autotsum.com: could not connect to host
autoxy.it: did not receive HSTS header
+autozane.com: could not connect to host
autumnwindsagility.com: could not connect to host
auverbox.ovh: could not connect to host
auvious.com: did not receive HSTS header
@@ -1464,16 +1601,16 @@ avastantivirus.ro: did not receive HSTS header
avdelivers.com: did not receive HSTS header
avdh.top: could not connect to host
avec-ou-sans-ordonnance.fr: could not connect to host
-aveling-adventure.co.uk: did not receive HSTS header
+aveling-adventure.co.uk: could not connect to host
avg.club: did not receive HSTS header
+avi9526.pp.ua: could not connect to host
aviacao.pt: did not receive HSTS header
avidcruiser.com: did not receive HSTS header
aviodeals.com: could not connect to host
-avitres.com: could not connect to host
+avitres.com: did not receive HSTS header
avmemo.com: could not connect to host
avmo.pw: could not connect to host
avmoo.com: could not connect to host
-avnet.ws: could not connect to host
avonlearningcampus.com: could not connect to host
avso.pw: could not connect to host
avspot.net: could not connect to host
@@ -1485,6 +1622,7 @@ awccanadianpharmacy.com: could not connect to host
awei.pub: could not connect to host
awf0.xyz: could not connect to host
awg-mode.de: did not receive HSTS header
+awin.la: did not receive HSTS header
aww.moe: did not receive HSTS header
awxg.eu.org: could not connect to host
awxg.org: could not connect to host
@@ -1495,23 +1633,24 @@ axel-fischer.science: could not connect to host
axelchv.fr: could not connect to host
axem.co.jp: did not receive HSTS header
axeny.com: did not receive HSTS header
-axfr.it: could not connect to host
axg.io: did not receive HSTS header
axialsports.com: did not receive HSTS header
axiumacademy.com: did not receive HSTS header
-axolotlfarm.org: could not connect to host
+axka.com: could not connect to host
axolsoft.com: max-age too low: 10540800
+axtudo.com: did not receive HSTS header
axtux.tk: could not connect to host
axxial.tk: could not connect to host
ayahuascaadvisor.com: could not connect to host
ayamchikchik.com: could not connect to host
ayatk.com: did not receive HSTS header
-aymericlagier.com: could not connect to host
+ayesh.win: could not connect to host
ayon.group: could not connect to host
ayor.jp: could not connect to host
ayor.tech: could not connect to host
ayuru.info: could not connect to host
az-vinyl-boden.de: could not connect to host
+azabani.com: did not receive HSTS header
azamra.com: did not receive HSTS header
azia.info: could not connect to host
azino777.ru: could not connect to host
@@ -1524,29 +1663,126 @@ b-entropy.com: could not connect to host
b-pi.duckdns.org: could not connect to host
b-rickroll-e.pw: could not connect to host
b-space.de: could not connect to host
+b0618.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b0618.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b0868.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b0868.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b1.work: could not connect to host
b1236.com: could not connect to host
-b2and.com: could not connect to host
+b1758.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b1758.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b1768.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b1768.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b1788.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b2486.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b2486.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b2b-nestle.com.br: could not connect to host
b2bpromoteit.com: did not receive HSTS header
b3orion.com: could not connect to host
b422edu.com: could not connect to host
+b5189.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b5189.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b5289.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b5289.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b5989.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b5989.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b61688.com: could not connect to host
+b8591.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b8591.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b8979.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b8979.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b8a.me: could not connect to host
+b9018.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9018.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9108.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9108.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9110.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9110.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9112.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9112.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b911gt.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b911gt.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9168.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b91688.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b91688.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b91688.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b91688.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9175.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9175.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9258.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9258.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9318.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9318.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9418.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9418.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9428.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9428.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9453.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9453.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9468.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9468.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9488.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9488.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9498.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9498.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9518.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9518.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9518.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9518.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b9520.com: could not connect to host
+b9528.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9528.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9538.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9538.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b9568.com: could not connect to host
+b9586.net: could not connect to host
+b9588.net: could not connect to host
+b95888.net: could not connect to host
+b9589.net: could not connect to host
+b9598.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9598.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9658.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9658.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b96899.com: could not connect to host
+b9758.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9758.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9818.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9818.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9858.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9858.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9880.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9883.net: could not connect to host
+b9884.net: could not connect to host
+b9885.net: could not connect to host
b9886.com: could not connect to host
+b9886.net: could not connect to host
+b9887.net: could not connect to host
+b9888.net: could not connect to host
b98886.com: could not connect to host
+b9889.net: could not connect to host
+b9920.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b9930.com: could not connect to host
+b9948.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9948.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b99520.com: could not connect to host
-b9970.com: could not connect to host
+b9960.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9970.com: did not receive HSTS header
b9980.com: could not connect to host
b99881.com: could not connect to host
b99882.com: could not connect to host
b99883.com: could not connect to host
b99885.com: could not connect to host
+b99886.com: could not connect to host
+b9best.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9best.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9king.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9king.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9king.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+b9winner.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
b9winner.com: could not connect to host
+b9winner.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
babelfisch.eu: could not connect to host
+babsbibs.com: could not connect to host
babursahvizeofisi.com: could not connect to host
baby-click.de: could not connect to host
babybee.ie: could not connect to host
@@ -1558,8 +1794,10 @@ babyliss-pro.net: did not receive HSTS header
babysaying.me: could not connect to host
babystep.tv: did not receive HSTS header
bacchanallia.com: could not connect to host
+bacgrouppublishing.com: could not connect to host
bacimg.com: did not receive HSTS header
back-bone.nl: did not receive HSTS header
+backeby.eu: could not connect to host
backenmachtgluecklich.de: max-age too low: 2592000
backgroundchecks.online: did not receive HSTS header
backgroundz.net: could not connect to host
@@ -1579,8 +1817,8 @@ badgirlsbible.com: could not connect to host
badkamergigant.com: could not connect to host
badlink.org: could not connect to host
baff.lu: could not connect to host
+baffinlee.com: could not connect to host
bagiobella.com: max-age too low: 0
-baglu.com: did not receive HSTS header
baiduaccount.com: could not connect to host
baildonhottubs.co.uk: could not connect to host
bair.io: could not connect to host
@@ -1596,19 +1834,17 @@ bakkerdesignandbuild.com: did not receive HSTS header
bakxnet.com: could not connect to host
balatoni-nyar.hu: did not receive HSTS header
balcan-underground.net: could not connect to host
-baldur.cc: could not connect to host
baldwinkoo.com: could not connect to host
baleares.party: could not connect to host
balenciaspa.com: did not receive HSTS header
balihai.com: did not receive HSTS header
balilingo.ooo: could not connect to host
-ballarin.cc: could not connect to host
ballbusting-cbt.com: could not connect to host
+ballitolocksmith.com: could not connect to host
balloonphp.com: could not connect to host
balnearionaturaspa.com: did not receive HSTS header
balonmano.co: could not connect to host
bals.org: did not receive HSTS header
-balticer.de: did not receive HSTS header
bambambaby.com.br: could not connect to host
bamtoki.com: could not connect to host
bamtoki.se: could not connect to host
@@ -1616,29 +1852,31 @@ bananabandy.com: could not connect to host
bananensap.nl: did not receive HSTS header
bananium.fr: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
banbanchs.com: could not connect to host
-banburybid.com: could not connect to host
banchethai.com: could not connect to host
bandally.net: could not connect to host
bandar303.cc: did not receive HSTS header
bandar303.id: did not receive HSTS header
bandar303.win: did not receive HSTS header
bandarifamily.com: could not connect to host
-bandb.xyz: did not receive HSTS header
-bandrcrafts.com: could not connect to host
+bandb.xyz: could not connect to host
+bandrcrafts.com: did not receive HSTS header
banduhn.com: did not receive HSTS header
-bangdream.ga: could not connect to host
-bangzafran.com: did not receive HSTS header
+bangzafran.com: could not connect to host
bank: could not connect to host
bankcircle.co.in: could not connect to host
-bankersonline.com: did not receive HSTS header
bankitt.network: could not connect to host
bankmilhas.com.br: did not receive HSTS header
+banknet.gov: did not receive HSTS header
bankofrealty.review: could not connect to host
banksaround.com: did not receive HSTS header
bannisbierblog.de: could not connect to host
+banoviny.sk: did not receive HSTS header
banqingdiao.com: could not connect to host
banri.me: could not connect to host
banxehoi.com: did not receive HSTS header
+bao-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bao-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+baodan666.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
baosuckhoedoisong.net: could not connect to host
baptistboard.com: did not receive HSTS header
baptiste-destombes.fr: did not receive HSTS header
@@ -1646,8 +1884,9 @@ baraxolka.ru: could not connect to host
barcouniforms.com: did not receive HSTS header
bardiel.de: max-age too low: 0
barely.sexy: could not connect to host
-bargainmovingcompany.com: could not connect to host
+bargainmovingcompany.com: did not receive HSTS header
bariller.fr: did not receive HSTS header
+baris-sagdic.com: could not connect to host
barisi.me: could not connect to host
barnrats.com: could not connect to host
baropkamp.be: did not receive HSTS header
@@ -1655,7 +1894,7 @@ barprive.com: could not connect to host
barqo.co: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
barracuda.blog: could not connect to host
barrelhead.org: could not connect to host
-barrett.ag: could not connect to host
+barrett.ag: did not receive HSTS header
barrut.me: did not receive HSTS header
barshout.co.uk: could not connect to host
barss.io: could not connect to host
@@ -1664,6 +1903,7 @@ bartelldrugs.com: did not receive HSTS header
barunisystems.com: could not connect to host
bascht.com: did not receive HSTS header
basculasconfiables.com: could not connect to host
+basercap.co.ke: could not connect to host
bashc.at: could not connect to host
bashcode.ninja: could not connect to host
basicsolutionsus.com: could not connect to host
@@ -1674,12 +1914,20 @@ basketsbymaurice.com: did not receive HSTS header
baskettemple.com: did not receive HSTS header
basnieuwenhuizen.nl: did not receive HSTS header
bassh.net: did not receive HSTS header
+bastadigital.com: did not receive HSTS header
+bastivmobile.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bat909.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bat909.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bat9vip.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bat9vip.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
batfoundry.com: could not connect to host
-batonger.com: did not receive HSTS header
+batonger.com: could not connect to host
batten.eu.org: could not connect to host
batteryservice.ru: did not receive HSTS header
+batvip9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
baud.ninja: could not connect to host
-baudairenergyservices.com: could not connect to host
+baudairenergyservices.com: did not receive HSTS header
+bauen-mit-ziegel.de: max-age too low: 604800
baum.ga: did not receive HSTS header
baumstark.ca: could not connect to host
bayinstruments.com: could not connect to host
@@ -1694,6 +1942,8 @@ bbj.io: did not receive HSTS header
bbkanews.com: did not receive HSTS header
bblovess.cn: could not connect to host
bbrinck.eu: could not connect to host
+bbswin9.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bbswin9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
bbw-wrestling.com: could not connect to host
bbwdom.xyz: could not connect to host
bbwf.de: did not receive HSTS header
@@ -1701,6 +1951,9 @@ bbwfacesitting.us: could not connect to host
bbwfacesitting.xyz: could not connect to host
bbwfight.xyz: could not connect to host
bbwteens.org: could not connect to host
+bbxin9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bbxin9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bc-personal.ch: did not receive HSTS header
bc416.com: did not receive HSTS header
bc418.com: did not receive HSTS header
bc419.com: did not receive HSTS header
@@ -1711,18 +1964,30 @@ bcheng.cf: did not receive HSTS header
bckp.de: could not connect to host
bcm.com.au: did not receive HSTS header
bcmlu.org: could not connect to host
+bcnet.com.hk: could not connect to host
bcnet.hk: could not connect to host
bcodeur.com: did not receive HSTS header
bcradio.org: could not connect to host
bcsytv.com: could not connect to host
bcweightlifting.ca: could not connect to host
-bdata.cl: could not connect to host
+bdata.cl: did not receive HSTS header
bddemir.com: could not connect to host
bde-epitech.fr: could not connect to host
bdenzer.com: did not receive HSTS header
bdenzer.xyz: could not connect to host
bdsmxxxpics.com: could not connect to host
-be-real.life: did not receive HSTS header
+be9418.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be9418.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be9418.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be9418.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be9458.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be9458.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be9458.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be9458.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be958.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be958.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be958.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+be958.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
be9966.com: could not connect to host
beach-inspector.com: did not receive HSTS header
beachi.es: could not connect to host
@@ -1742,13 +2007,14 @@ bebeefy.uk: could not connect to host
bebesurdoue.com: could not connect to host
beccajoshwedding.com: could not connect to host
becklove.cn: could not connect to host
+becoast.fr: did not receive HSTS header
becubed.co: could not connect to host
bedabox.com: did not receive HSTS header
bedeta.de: could not connect to host
bedouille.com: could not connect to host
bedreid.dk: did not receive HSTS header
bedrijvenadministratie.nl: could not connect to host
-beelen.fr: could not connect to host
+bee.supply: could not connect to host
beerboutique.com.br: could not connect to host
beermedlar.com: could not connect to host
beersandco.ch: could not connect to host
@@ -1757,11 +2023,13 @@ beetleroadstories.com: could not connect to host
beforesunrise.de: did not receive HSTS header
befundup.com: could not connect to host
begcykel.com: did not receive HSTS header
+begoodny.co.il: max-age too low: 7889238
behere.be: could not connect to host
beier.io: could not connect to host
beikeil.de: did not receive HSTS header
beingmad.org: did not receive HSTS header
belairsewvac.com: could not connect to host
+belcompany.nl: could not connect to host
belewpictures.com: could not connect to host
belgien.guide: could not connect to host
belize-firmengruendung.com: could not connect to host
@@ -1771,11 +2039,12 @@ belltower.io: did not receive HSTS header
belmontprom.com: could not connect to host
belpbleibtbelp.ch: could not connect to host
belwederczykow.eu: could not connect to host
-bemcorp.de: did not receive HSTS header
-bemvindoaolar.com.br: did not receive HSTS header
+bemvindoaolar.com.br: could not connect to host
bemyvictim.com: max-age too low: 2678400
+benchcast.com: could not connect to host
+bencorby.com: could not connect to host
bendechrai.com: did not receive HSTS header
-benedikt-tuchen.de: did not receive HSTS header
+benedikt-tuchen.de: could not connect to host
benediktdichgans.de: did not receive HSTS header
beneffy.com: did not receive HSTS header
benevisim.com: could not connect to host
@@ -1806,7 +2075,6 @@ beretech.fr: could not connect to host
berger.work: could not connect to host
bergfex.at: did not receive HSTS header
bergland-seefeld.at: did not receive HSTS header
-bergstoneware.com: could not connect to host
berhampore-gateway.tk: could not connect to host
berlatih.com: did not receive HSTS header
berlin-kohlefrei.de: could not connect to host
@@ -1814,6 +2082,8 @@ berlinleaks.com: could not connect to host
bermytraq.bm: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
bernexskiclub.ch: did not receive HSTS header
bernieware.de: could not connect to host
+berr.yt: could not connect to host
+berry.cat: could not connect to host
berrymark.be: did not receive HSTS header
berseb.se: could not connect to host
berthelier.me: could not connect to host
@@ -1827,29 +2097,44 @@ besixdouze.world: could not connect to host
beslider.com: could not connect to host
besnik.de: could not connect to host
besola.de: could not connect to host
+bespaarnu.click: could not connect to host
+best-of-bounce.co.uk: could not connect to host
best-wedding-quotes.com: could not connect to host
bestattorney.com: did not receive HSTS header
bestbeards.ca: could not connect to host
bestbestbitcoin.com: could not connect to host
bestbonuses.co.uk: did not receive HSTS header
bestellipticalmachinereview.info: could not connect to host
+bestesb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bestesb.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
bestfitnesswatchreview.info: could not connect to host
besthost.cz: did not receive HSTS header
besthotsales.com: could not connect to host
+bestiahosting.com: could not connect to host
bestleftwild.com: could not connect to host
bestmodels.su: did not receive HSTS header
bestof1001.de: could not connect to host
bestorangeseo.com: could not connect to host
bestpaintings.nl: did not receive HSTS header
-bestschools.top: did not receive HSTS header
+bestparking.xyz: could not connect to host
+bestpig.fr: could not connect to host
bestwarezone.com: could not connect to host
+bet-99.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bet-99.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bet-99.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bet168wy.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bet168wy.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bet909.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
bet990.com: could not connect to host
+bet9bet9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
betaclean.fr: did not receive HSTS header
betafive.net: could not connect to host
betakah.net: could not connect to host
betamint.org: did not receive HSTS header
-betcafearena.ro: did not receive HSTS header
+betcafearena.ro: could not connect to host
betformular.com: could not connect to host
+betgo9.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bethanyduke.com: could not connect to host
bethditto.com: did not receive HSTS header
betkoo.com: could not connect to host
betnet.fr: could not connect to host
@@ -1859,7 +2144,11 @@ bets.de: did not receive HSTS header
betshoot.com: could not connect to host
betsonlinefree.com.au: could not connect to host
betterlifemakers.com: max-age too low: 200
+bettflaschen.ch: did not receive HSTS header
bettween.com: did not receive HSTS header
+between.be: could not connect to host
+betwin9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+betwin9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
betz.ro: could not connect to host
beulahtabernacle.com: could not connect to host
bevapehappy.com: did not receive HSTS header
@@ -1870,7 +2159,7 @@ bexit-security.eu: could not connect to host
bexit-security.nl: could not connect to host
bexithosting.nl: could not connect to host
bey.io: could not connect to host
-beylikduzum.com: did not receive HSTS header
+beylikduzum.com: could not connect to host
beyond-edge.com: could not connect to host
beyuna.co.uk: did not receive HSTS header
beyuna.eu: did not receive HSTS header
@@ -1894,7 +2183,7 @@ bgenlisted.com: could not connect to host
bgfashion.net: could not connect to host
bgneuesheim.de: did not receive HSTS header
bhatia.at: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-biancolievito.it: did not receive HSTS header
+bhosted.nl: did not receive HSTS header
bianinapiccanovias.com: could not connect to host
biaoqingfuhao.net: did not receive HSTS header
biaoqingfuhao.org: did not receive HSTS header
@@ -1908,11 +2197,13 @@ bidon.ca: did not receive HSTS header
bidorbuy.co.ke: did not receive HSTS header
bieberium.de: could not connect to host
biego.cn: did not receive HSTS header
+biehl.co: did not receive HSTS header
bielsa.me: did not receive HSTS header
bienenblog.cc: could not connect to host
bier.jp: did not receive HSTS header
bierbringer.at: could not connect to host
bierochs.org: could not connect to host
+biftin.net: could not connect to host
big-black.de: did not receive HSTS header
bigbbqbrush.bid: could not connect to host
bigbounceentertainment.co.uk: could not connect to host
@@ -1920,11 +2211,13 @@ bigbrownpromotions.com.au: did not receive HSTS header
bigcorporateevents.com: could not connect to host
bigerbio.com: could not connect to host
bigfunbouncycastles.com: could not connect to host
+bigjohn.ru: did not receive HSTS header
biglagoonrentals.com: did not receive HSTS header
bigshinylock.minazo.net: could not connect to host
bigshort.org: could not connect to host
biguixhe.net: could not connect to host
bijoux.com.br: could not connect to host
+bijouxbrasil.com.br: did not receive HSTS header
bijouxdegriffe.com.br: could not connect to host
bijugeral.com.br: could not connect to host
bikelifetvkidsquads.co.uk: could not connect to host
@@ -1939,13 +2232,16 @@ billdestler.com: did not receive HSTS header
billigssl.dk: did not receive HSTS header
billkiss.com: could not connect to host
billninja.com: did not receive HSTS header
+billpro.com.au: could not connect to host
billrobinson.io: could not connect to host
billrusling.com: could not connect to host
binam.center: could not connect to host
binarization.net: could not connect to host
binarization.org: did not receive HSTS header
binaryabstraction.com: could not connect to host
-binaryfigments.com: max-age too low: 86400
+binaryfigments.com: max-age too low: 7776000
+binbin9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+binbin9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
binderapp.net: could not connect to host
bingcheung.com: could not connect to host
bingcheung.org: could not connect to host
@@ -1954,12 +2250,9 @@ bingo9.net: could not connect to host
bingofriends.com: could not connect to host
bingostars.com: did not receive HSTS header
binimo.com: could not connect to host
-binsp.net: could not connect to host
-biocrafting.net: could not connect to host
+biocrafting.net: did not receive HSTS header
bioespuna.eu: did not receive HSTS header
biofam.ru: did not receive HSTS header
-bioknowme.com: did not receive HSTS header
-biologis.ch: could not connect to host
biomax-mep.com.br: did not receive HSTS header
bionicspirit.com: did not receive HSTS header
biophysik-ssl.de: did not receive HSTS header
@@ -1971,17 +2264,18 @@ bip.gov.sa: could not connect to host
birdandbranchnyc.com: max-age too low: 43200
birkengarten.ch: could not connect to host
birkman.com: did not receive HSTS header
-bischoff-mathey.family: could not connect to host
biscuits-rec.com: could not connect to host
biscuits-shop.com: could not connect to host
bismarck.moe: did not receive HSTS header
bisterfeldt.com: did not receive HSTS header
biswas.me: could not connect to host
+bit.voyage: did not receive HSTS header
bitace.com: did not receive HSTS header
bitbit.org: did not receive HSTS header
bitbr.net: could not connect to host
bitcantor.com: did not receive HSTS header
bitchan.it: could not connect to host
+bitclubfun.com: did not receive HSTS header
bitcoin-casino-no-deposit-bonus.com: max-age too low: 0
bitcoin-class.com: could not connect to host
bitcoin-daijin.com: could not connect to host
@@ -1994,6 +2288,7 @@ bitcoinprivacy.net: did not receive HSTS header
bitcoinworld.me: could not connect to host
bitconcepts.co.uk: could not connect to host
bitedge.com: did not receive HSTS header
+bitenose.com: could not connect to host
bitenose.net: could not connect to host
bitenose.org: could not connect to host
biteoftech.com: did not receive HSTS header
@@ -2001,9 +2296,13 @@ bitf.ly: could not connect to host
bitfactory.ws: could not connect to host
bitfarm-archiv.com: did not receive HSTS header
bitfarm-archiv.de: did not receive HSTS header
+bithap.com: could not connect to host
bitheus.com: could not connect to host
bithosting.io: did not receive HSTS header
-bitk.uk: could not connect to host
+bitk.co: did not receive HSTS header
+bitk.co.uk: did not receive HSTS header
+bitk.eu: did not receive HSTS header
+bitk.uk: did not receive HSTS header
bitmain.com.ua: could not connect to host
bitmaincare.com.ua: could not connect to host
bitmaincare.ru: could not connect to host
@@ -2011,17 +2310,17 @@ bitmainwarranty.com.ua: could not connect to host
bitmainwarranty.ru: could not connect to host
bitmex.com: did not receive HSTS header
bitmexin.com: could not connect to host
+bitmon.net: could not connect to host
bitnet.io: did not receive HSTS header
+bitok.com: did not receive HSTS header
bitplay.space: could not connect to host
bitpod.de: could not connect to host
-bitpoll.de: could not connect to host
-bitpoll.org: could not connect to host
bitrage.de: could not connect to host
bitraum.io: could not connect to host
bitroll.com: could not connect to host
bitsafe.systems: did not receive HSTS header
bitsensor.io: did not receive HSTS header
-bitstep.ca: could not connect to host
+bitshaker.net: did not receive HSTS header
bittervault.xyz: could not connect to host
bituptick.com: did not receive HSTS header
bitvegas.com: did not receive HSTS header
@@ -2035,11 +2334,14 @@ bizon.sk: did not receive HSTS header
bizpare.com: did not receive HSTS header
bizzartech.com: did not receive HSTS header
bizzybeebouncers.co.uk: could not connect to host
-bjgongyi.com: could not connect to host
+bjgongyi.com: did not receive HSTS header
+bjl5689.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bjl5689.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
bjrn.io: could not connect to host
bjtxl.cn: could not connect to host
bk-wife.com: could not connect to host
bkb-skandal.ch: could not connect to host
+bkhayes.com: did not receive HSTS header
black-armada.com: could not connect to host
black-armada.com.pl: could not connect to host
black-armada.pl: could not connect to host
@@ -2051,28 +2353,36 @@ blackburn.link: could not connect to host
blackdesertsp.com: could not connect to host
blackdiam.net: did not receive HSTS header
blacklane.com: did not receive HSTS header
-blacklightparty.be: could not connect to host
blackly.uk: max-age too low: 0
blackmagic.sk: could not connect to host
blackmirror.com.au: did not receive HSTS header
-blackpayment.ru: did not receive HSTS header
+blackpayment.ru: could not connect to host
blackphantom.de: could not connect to host
blackscreen.me: could not connect to host
+blackscytheconsulting.com: could not connect to host
blackunicorn.wtf: could not connect to host
bladesmith.io: did not receive HSTS header
-blakerandall.xyz: could not connect to host
+blakerandall.xyz: did not receive HSTS header
blantik.net: could not connect to host
blarg.co: could not connect to host
blauwwit.be: did not receive HSTS header
blazeit.io: could not connect to host
+blechpirat.name: could not connect to host
bleep.zone: could not connect to host
blendlecdn.com: could not connect to host
blenheimchalcot.com: did not receive HSTS header
+blessedearth.com.au: max-age too low: 7889238
blessnet.jp: did not receive HSTS header
+bleutecmedia.com: did not receive HSTS header
blha303.com.au: could not connect to host
-blikund.swedbank.se: did not receive HSTS header
+bliesekow.net: could not connect to host
+bliker.ga: could not connect to host
blindaryproduction.tk: could not connect to host
blindsexdate.nl: did not receive HSTS header
+bling9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bling999.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bling999.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bling999.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
blinkenlight.co.uk: could not connect to host
blinkenlight.com.au: could not connect to host
blmiller.com: did not receive HSTS header
@@ -2092,7 +2402,6 @@ bloglikepro.com: could not connect to host
blognone.com: did not receive HSTS header
blognr.com: could not connect to host
blogonblogspot.com: did not receive HSTS header
-blok56.nl: did not receive HSTS header
blokino.org: did not receive HSTS header
blokuhaka.fr: did not receive HSTS header
bloodyexcellent.com: did not receive HSTS header
@@ -2106,17 +2415,19 @@ blucas.org: did not receive HSTS header
blue17.co.uk: did not receive HSTS header
bluebill.net: did not receive HSTS header
bluecon.eu: did not receive HSTS header
-bluedata.ltd: could not connect to host
bluefinger.nl: did not receive HSTS header
blueglobalmedia.com: could not connect to host
-bluehawk.cloud: did not receive HSTS header
+bluehawk.cloud: could not connect to host
+bluehelixmusic.com: could not connect to host
blueliv.com: did not receive HSTS header
bluemoonroleplaying.com: could not connect to host
+bluepearl.tk: could not connect to host
bluepoint.foundation: could not connect to host
bluepoint.institute: could not connect to host
+blueprintloans.co.uk: did not receive HSTS header
blueridgesecuritycameras.com: did not receive HSTS header
bluescloud.xyz: could not connect to host
-bluesecure.com.br: did not receive HSTS header
+bluesecure.com.br: could not connect to host
bluetenmeer.com: did not receive HSTS header
bluezonehealth.co.uk: did not receive HSTS header
blui.cf: max-age too low: 1209600
@@ -2124,6 +2435,7 @@ bluketing.com: did not receive HSTS header
blumen-binder.ch: did not receive HSTS header
blumen-garage.de: could not connect to host
blumenwiese.xyz: did not receive HSTS header
+blundell.wedding: could not connect to host
blunderify.se: did not receive HSTS header
bluop.com: did not receive HSTS header
bluserv.net: could not connect to host
@@ -2132,7 +2444,6 @@ blutopia.xyz: did not receive HSTS header
blutroyal.de: could not connect to host
blvdmb.com: did not receive HSTS header
bm-i.ch: could not connect to host
-bm-immo.ch: could not connect to host
bm-trading.nl: did not receive HSTS header
bmet.de: did not receive HSTS header
bmoattachments.org: did not receive HSTS header
@@ -2140,8 +2451,17 @@ bnb-buddy.nl: could not connect to host
bnboy.cn: could not connect to host
bngsecure.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
bnhlibrary.com: did not receive HSTS header
+bo1689.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo1689.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo9club.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo9club.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo9club.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo9fun.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo9fun.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo9game.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo9game.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+bo9king.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
board-buy.ru: could not connect to host
-bobaobei.net: could not connect to host
bobaobei.org: could not connect to host
bobep.ru: could not connect to host
boboates.com: did not receive HSTS header
@@ -2154,14 +2474,13 @@ bodyweightsolution.com: could not connect to host
boel073.nl: did not receive HSTS header
boensou.com: did not receive HSTS header
bohaishibei.com: did not receive HSTS header
-bohan.co: could not connect to host
bohan.life: could not connect to host
bohyn.cz: could not connect to host
boiadeirodeberna.com: could not connect to host
boilesen.com: did not receive HSTS header
bokeyy.com: could not connect to host
bolainfoasia.com: did not receive HSTS header
-bolivarfm.com.ve: did not receive HSTS header
+bolivarfm.com.ve: could not connect to host
boltdata.io: could not connect to host
boltn.uk: did not receive HSTS header
bolwerk.com.br: did not receive HSTS header
@@ -2172,6 +2491,7 @@ bonapp.restaurant: could not connect to host
bondagefetishstore.com: could not connect to host
bondtofte.dk: did not receive HSTS header
boneko.de: did not receive HSTS header
+bonesserver.com: could not connect to host
bonigo.de: did not receive HSTS header
bonitabrazilian.co.nz: did not receive HSTS header
bonnin.fr: did not receive HSTS header
@@ -2186,13 +2506,16 @@ booked.holiday: could not connect to host
bookingentertainment.com: did not receive HSTS header
bookmakersfreebets.com.au: could not connect to host
bookofraonlinecasinos.com: could not connect to host
+bookourdjs.com: could not connect to host
bookreport.ga: could not connect to host
+bookshopofindia.com: did not receive HSTS header
bookwitty.social: could not connect to host
boomerang.com: did not receive HSTS header
boomsaki.com: did not receive HSTS header
boomsakis.com: did not receive HSTS header
boosterlearnpro.com: did not receive HSTS header
-boote.wien: did not receive HSTS header
+boostgame.win: could not connect to host
+boote.wien: could not connect to host
booter.es: could not connect to host
booth.in.th: could not connect to host
bootikexpress.fr: did not receive HSTS header
@@ -2206,37 +2529,43 @@ boringsecurity.net: could not connect to host
boris.one: could not connect to host
borisavstankovic.rs: could not connect to host
borisbesemer.com: could not connect to host
-born-to-learn.com: did not receive HSTS header
+born-to-learn.com: could not connect to host
borrelioz.com: did not receive HSTS header
borscheid-wenig.com: did not receive HSTS header
boschee.net: could not connect to host
+bosworthdental.co.uk: did not receive HSTS header
botlab.ch: could not connect to host
botmanager.pl: could not connect to host
botox.bz: did not receive HSTS header
bots.cat: could not connect to host
+botserver.de: could not connect to host
boueki.jp: did not receive HSTS header
boueki.org: did not receive HSTS header
bouk.co: could not connect to host
bounce-r-us.co.uk: did not receive HSTS header
+bouncebeyondcastles.co.uk: did not receive HSTS header
bounceboxspc.com: did not receive HSTS header
bouncecoffee.com: did not receive HSTS header
bouncehighpeak.co.uk: could not connect to host
bouncelanduk.co.uk: did not receive HSTS header
bouncemasters.co.uk: could not connect to host
+bouncewithbovells.com: could not connect to host
bouncing4joy.co.uk: could not connect to host
bouncingbuzzybees.co.uk: could not connect to host
bouncycastleandparty.co.uk: could not connect to host
-bouncycastlehireauckland.co.nz: could not connect to host
bouncycastlehiremedway.com: did not receive HSTS header
+bouncycastles.me: could not connect to host
+bouncymadness.com: did not receive HSTS header
bouwbedrijfpurmerend.nl: did not receive HSTS header
bowlsheet.com: did not receive HSTS header
-bownty.pt: could not connect to host
+bownty.pt: max-age too low: 0
boxcryptor.com: did not receive HSTS header
boxdevigneron.fr: could not connect to host
boxing-austria.eu: did not receive HSTS header
boxintense.com: did not receive HSTS header
boxit.es: did not receive HSTS header
boxlitepackaging.com: did not receive HSTS header
+boxmoe.cn: did not receive HSTS header
boxview.com: could not connect to host
boyan.in: could not connect to host
boyfriendhusband.men: did not receive HSTS header
@@ -2268,7 +2597,9 @@ brand-foo.com: did not receive HSTS header
brand-foo.jp: did not receive HSTS header
brand-foo.net: did not receive HSTS header
brandnewdays.nl: could not connect to host
+brando753.xyz: could not connect to host
brandon.so: could not connect to host
+brandonlui.ml: could not connect to host
brandons.site: could not connect to host
brandontaylor-black.com: could not connect to host
brandred.net: could not connect to host
@@ -2285,6 +2616,7 @@ brenden.net.au: could not connect to host
bress.cloud: could not connect to host
brettcornwall.com: did not receive HSTS header
brettpemberton.xyz: did not receive HSTS header
+bretz-hufer.de: did not receive HSTS header
brfvh24.se: could not connect to host
brianpcurran.com: did not receive HSTS header
brickoo.com: could not connect to host
@@ -2292,7 +2624,6 @@ brickwerks.io: could not connect to host
brickyardbuffalo.com: did not receive HSTS header
bridgeout.com: could not connect to host
bridholm.se: could not connect to host
-briggsleroux.com: did not receive HSTS header
brightfuturemadebyme.com: could not connect to host
brightstarkids.co.uk: did not receive HSTS header
brightstarkids.com.au: did not receive HSTS header
@@ -2306,6 +2637,7 @@ brinkhu.is: could not connect to host
brinkmann.one: could not connect to host
brinquedoseducativos.art.br: did not receive HSTS header
brio-ukraine.store: could not connect to host
+britishchronicles.com: could not connect to host
britishmeat.com: could not connect to host
britzer-toner.de: did not receive HSTS header
brivadois.ovh: did not receive HSTS header
@@ -2317,43 +2649,45 @@ broken-oak.com: could not connect to host
brookechase.com: did not receive HSTS header
brookframework.org: could not connect to host
brossman.it: could not connect to host
-brouwerijkoelit.nl: did not receive HSTS header
+brouwerijkoelit.nl: could not connect to host
brownlawoffice.us: did not receive HSTS header
browserid.org: could not connect to host
brplusdigital.com: could not connect to host
brrd.io: could not connect to host
-brrr.fr: could not connect to host
-brunix.net: could not connect to host
+brunix.net: did not receive HSTS header
brunoonline.co.uk: could not connect to host
-brunoramos.com: could not connect to host
-brunoramos.org: could not connect to host
-brunosouza.org: could not connect to host
bryancastillo.site: could not connect to host
+bryankaplan.com: could not connect to host
bryanshearer.accountant: did not receive HSTS header
bryn.xyz: could not connect to host
brynnan.nl: could not connect to host
brztec.com: did not receive HSTS header
+bs.sb: could not connect to host
bsagan.fr: did not receive HSTS header
bsalyzer.com: could not connect to host
bsc01.dyndns.org: could not connect to host
+bsd.com.ro: could not connect to host
bsdtips.com: could not connect to host
bsdug.org: could not connect to host
+bsg-aok-muenchen.de: did not receive HSTS header
bsklabels.com: did not receive HSTS header
bsktweetup.info: could not connect to host
bsohoekvanholland.nl: could not connect to host
bsuess.de: could not connect to host
bt78.cn: did not receive HSTS header
-bt85.cn: did not receive HSTS header
+bt85.cn: could not connect to host
bt9.cc: did not receive HSTS header
bt96.cn: did not receive HSTS header
bt995.com: did not receive HSTS header
btaoke.com: could not connect to host
-btc-e.com: did not receive HSTS header
+btc-e.com: could not connect to host
btcdlc.com: could not connect to host
btcgo.nl: did not receive HSTS header
btcp.space: could not connect to host
-btcpot.ltd: did not receive HSTS header
-btku.org: could not connect to host
+btcpot.ltd: could not connect to host
+btrb.ml: could not connect to host
+btserv.de: did not receive HSTS header
+btth.live: could not connect to host
btxiaobai.com: did not receive HSTS header
bubba.cc: could not connect to host
buben.tech: did not receive HSTS header
@@ -2361,15 +2695,18 @@ bubulazi.com: did not receive HSTS header
bubulazy.com: did not receive HSTS header
buchheld.at: could not connect to host
buchverlag-scholz.de: did not receive HSTS header
+buck.com: could not connect to host
bucket.tk: could not connect to host
buckmulligans.com: did not receive HSTS header
+budaev-shop.ru: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
buddhistische-weisheiten.org: could not connect to host
budgetenergievriendenvoordeel.nl: could not connect to host
budgetthostels.nl: did not receive HSTS header
budskap.eu: could not connect to host
buenosairesestetica.com.ar: could not connect to host
buenotour.ru: did not receive HSTS header
-buettgens.net: max-age too low: 2592000
+buergerdialog.net: could not connect to host
+buergerhaushalt.com: did not receive HSTS header
buffalodrinkinggame.beer: did not receive HSTS header
bugtrack.co.uk: did not receive HSTS header
bugtrack.io: could not connect to host
@@ -2397,6 +2734,7 @@ bullbits.com: max-age too low: 0
bulldoghire.co.uk: did not receive HSTS header
bulletbabu.com: could not connect to host
bulletpoint.cz: could not connect to host
+bullpay.com: did not receive HSTS header
bullterrier.me: could not connect to host
bulmafox.com: could not connect to host
bulmastife.com.br: could not connect to host
@@ -2412,24 +2750,27 @@ burckardtnet.de: did not receive HSTS header
bureaubolster.nl: did not receive HSTS header
bureaugravity.com: did not receive HSTS header
burian-server.cz: could not connect to host
+buricloud.fr: could not connect to host
burlesquemakeup.com: did not receive HSTS header
burningcrash.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
burpsuite.site: could not connect to host
burroughsid.com: could not connect to host
burrow.ovh: could not connect to host
burrowingsec.com: could not connect to host
+bursa3bydgoszcz.pl: did not receive HSTS header
burtrum.top: could not connect to host
buryat-mongol.cf: could not connect to host
buryit.net: did not receive HSTS header
+busanhs.bid: could not connect to host
+buserror.cn: could not connect to host
bush41.org: did not receive HSTS header
+bushcraftfriends.com: could not connect to host
business.lookout.com: could not connect to host
business.medbank.com.mt: did not receive HSTS header
-businessadviceperth.com.au: did not receive HSTS header
businessamongus.com: could not connect to host
businessetmarketing.com: could not connect to host
businessfurs.info: could not connect to host
businesshosting.nl: did not receive HSTS header
-businessloanconnection.org: did not receive HSTS header
businessmodeler.se: could not connect to host
bustabit.com: could not connect to host
bustimes.org.uk: did not receive HSTS header
@@ -2438,20 +2779,31 @@ butchersworkshop.com: did not receive HSTS header
butian518.com: did not receive HSTS header
butt.repair: could not connect to host
buttercoin.com: could not connect to host
+buttercupstraining.co.uk: did not receive HSTS header
butterfieldstraining.com: could not connect to host
buttermilk.cf: could not connect to host
buturyu.org: did not receive HSTS header
buvinghausen.com: max-age too low: 86400
+buy-thing.com: did not receive HSTS header
+buyaccessible.gov: did not receive HSTS header
buybaby.eu: could not connect to host
-buybike.shop: could not connect to host
+buybike.shop: did not receive HSTS header
+buycarpet.shop: did not receive HSTS header
+buycook.shop: max-age too low: 2592000
buydesired.com: did not receive HSTS header
buyessay.org: could not connect to host
buyessays.net: could not connect to host
+buyessayscheap.com: could not connect to host
buyfox.de: could not connect to host
buyharpoon.com: could not connect to host
+buyhealth.shop: did not receive HSTS header
buyingsellingflorida.com: could not connect to host
+buyjewel.shop: did not receive HSTS header
buynowdepot.com: did not receive HSTS header
+buyplussize.shop: did not receive HSTS header
+buyprofessional.shop: max-age too low: 2592000
buyshoe.org: could not connect to host
+buywine.shop: did not receive HSTS header
buywood.shop: could not connect to host
buzzconcert.com: did not receive HSTS header
buzzdeck.com: did not receive HSTS header
@@ -2467,18 +2819,15 @@ bwin8603.com: did not receive HSTS header
bwin8604.com: did not receive HSTS header
bwin8605.com: did not receive HSTS header
bwin8606.com: did not receive HSTS header
-bwwb.nu: could not connect to host
bx-web.com: did not receive HSTS header
-bxdev.me: could not connect to host
by1896.com: could not connect to host
by1898.com: could not connect to host
by1899.com: could not connect to host
by4cqb.cn: could not connect to host
-by77.com: did not receive HSTS header
+by77.com: could not connect to host
by777.com: did not receive HSTS header
+bydisk.com: could not connect to host
byji.com: could not connect to host
-byken.cn: did not receive HSTS header
-bynet.cz: could not connect to host
bypass.kr: could not connect to host
bypassed.bid: could not connect to host
bypassed.cc: could not connect to host
@@ -2494,23 +2843,28 @@ bypassed.press: could not connect to host
bypassed.pw: could not connect to host
bypassed.rocks: could not connect to host
bypassed.site: could not connect to host
-bypassed.st: did not receive HSTS header
+bypassed.st: could not connect to host
bypassed.today: could not connect to host
bypassed.works: could not connect to host
bypassed.world: could not connect to host
bypro.xyz: could not connect to host
-byronwade.com: max-age too low: 7889238
+byronkg.us: could not connect to host
+byronprivaterehab.com.au: did not receive HSTS header
+byronr.com: did not receive HSTS header
+byronwade.com: did not receive HSTS header
byte.chat: did not receive HSTS header
byte.wtf: did not receive HSTS header
-bytelog.org: could not connect to host
+bytelog.org: did not receive HSTS header
+bytepen.com: could not connect to host
bytesatwork.eu: could not connect to host
byteshift.ca: could not connect to host
bytesofcode.de: could not connect to host
bytesund.biz: could not connect to host
byteturtle.eu: did not receive HSTS header
byurudraw.pics: could not connect to host
-bzhub.bid: did not receive HSTS header
+bywin9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
c-rickroll-v.pw: could not connect to host
+c0rn3j.com: could not connect to host
c12discountonline.com: did not receive HSTS header
c16t.uk: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
c1yd3i.me: could not connect to host
@@ -2520,42 +2874,45 @@ c3-compose.com: could not connect to host
c3.pm: could not connect to host
c3b.info: could not connect to host
c3bbs.com: could not connect to host
+c3hv.cn: could not connect to host
c3ie.com: did not receive HSTS header
c4.hk: could not connect to host
+c5h8no4na.net: could not connect to host
cabsites.com: could not connect to host
cabusar.fr: could not connect to host
+cachethome.com: could not connect to host
cachethq.io: did not receive HSTS header
caconnect.org: could not connect to host
cadao.me: did not receive HSTS header
-cadburymovies.in.net: could not connect to host
+cadburymovies.in.net: did not receive HSTS header
+cadcreations.co.ke: could not connect to host
cadenadg.gr: did not receive HSTS header
caerostris.com: could not connect to host
caesreon.com: could not connect to host
cafe-murr.de: did not receive HSTS header
cafe-scientifique.org.ec: could not connect to host
-cafe-service.ru: could not connect to host
cafechesscourt.com: could not connect to host
cafefresco.pe: did not receive HSTS header
-caferagazzi.de: did not receive HSTS header
-cafesg.net: could not connect to host
+cafesg.net: did not receive HSTS header
caibi.io: could not connect to host
caim.cz: did not receive HSTS header
caipai.fm: could not connect to host
cairnterrier.com.br: could not connect to host
cais.de: did not receive HSTS header
-cajapopcorn.com: could not connect to host
-cake-time.co.uk: could not connect to host
+caizx.com: did not receive HSTS header
+cajapopcorn.com: max-age too low: 0
cake.care: could not connect to host
cal.goip.de: could not connect to host
calcularpagerank.com.br: could not connect to host
calculatoaresecondhand.xyz: could not connect to host
-calebmorris.com: did not receive HSTS header
+caldecotevillagehall.co.uk: could not connect to host
+calebmorris.com: max-age too low: 60
calgaryconstructionjobs.com: did not receive HSTS header
callabs.net: could not connect to host
callanbryant.co.uk: did not receive HSTS header
calleveryday.com: could not connect to host
callision.com: did not receive HSTS header
-callmereda.com: did not receive HSTS header
+callmereda.com: could not connect to host
callsigns.ca: could not connect to host
calltrackingreports.com: could not connect to host
calomel.org: max-age too low: 2764800
@@ -2564,23 +2921,21 @@ caltonnutrition.com: did not receive HSTS header
calvin.me: did not receive HSTS header
calypso-tour.net: could not connect to host
calypsogames.net: could not connect to host
-calyxinstitute.org: could not connect to host
-camashop.de: did not receive HSTS header
camaya.net: did not receive HSTS header
cambridgeanalytica.net: could not connect to host
cambridgeanalytica.org: did not receive HSTS header
-camda.online: could not connect to host
camisadotorcedor.com.br: could not connect to host
camjackson.net: did not receive HSTS header
cammarkets.com: could not connect to host
-camomile.desi: could not connect to host
campaignelves.com: did not receive HSTS header
campbellsoftware.co.uk: could not connect to host
+campeoesdofutebol.com.br: did not receive HSTS header
campfire.co.il: did not receive HSTS header
campfourpaws.com: did not receive HSTS header
campingcarlovers.com: could not connect to host
campingdreams.com: did not receive HSTS header
campus-cybersecurity.team: did not receive HSTS header
+campusportalng.com: did not receive HSTS header
camsanalytics.com: could not connect to host
camshowhub.com: could not connect to host
canadiangamblingchoice.com: did not receive HSTS header
@@ -2591,28 +2946,33 @@ candicontrols.com: did not receive HSTS header
candratech.com: could not connect to host
candygirl.shop: could not connect to host
candykidsentertainment.co.uk: did not receive HSTS header
+candylion.rocks: could not connect to host
+canerkorkmaz.com: could not connect to host
canifis.net: did not receive HSTS header
cannarobotics.com: could not connect to host
canterbury.ws: could not connect to host
+canva-dev.com: could not connect to host
+canyons.media: did not receive HSTS header
canyonshoa.com: did not receive HSTS header
caodecristachines.com.br: could not connect to host
caoyu.info: did not receive HSTS header
+capacent.is: did not receive HSTS header
capacitacionyautoempleo.com: did not receive HSTS header
capecycles.co.za: did not receive HSTS header
capeyorkfire.com.au: did not receive HSTS header
-capitalonecardservice.com: could not connect to host
-captainark.net: could not connect to host
+capitalonecardservice.com: did not receive HSTS header
+capitaltg.com: did not receive HSTS header
+capogna.com: did not receive HSTS header
+captalize.com: could not connect to host
captchatheprize.com: could not connect to host
captianseb.de: could not connect to host
captivatedbytabrett.com: could not connect to host
captivationscience.com: could not connect to host
-captivationtheory.com: could not connect to host
capturethepen.co.uk: could not connect to host
car-navi.ph: did not receive HSTS header
car-rental24.com: did not receive HSTS header
-car-shop.top: did not receive HSTS header
carano-service.de: did not receive HSTS header
-caraudio69.cz: could not connect to host
+caraudio69.cz: did not receive HSTS header
card-cashing.com: max-age too low: 0
card-toka.jp: could not connect to host
cardloan-manual.net: could not connect to host
@@ -2621,13 +2981,14 @@ cardstream.com: did not receive HSTS header
cardurl.com: did not receive HSTS header
cardwars.hu: could not connect to host
careeraid.in: could not connect to host
-careerstuds.com: could not connect to host
+careerstuds.com: did not receive HSTS header
careplasticsurgery.com: did not receive HSTS header
carey.bio: did not receive HSTS header
-carif-idf.net: did not receive HSTS header
-carif-idf.org: did not receive HSTS header
+carey.li: did not receive HSTS header
+carif-idf.net: could not connect to host
+carif-idf.org: could not connect to host
+carlandfaith.com: could not connect to host
carlgo11.com: did not receive HSTS header
-carlo.mx: did not receive HSTS header
carlolly.co.uk: could not connect to host
carlosalves.info: could not connect to host
carloshmm.com: could not connect to host
@@ -2635,18 +2996,17 @@ carloshmm.stream: could not connect to host
carlovanwyk.com: could not connect to host
carlsbouncycastlesandhottubs.co.uk: did not receive HSTS header
carlscatering.com: did not receive HSTS header
-carol-lambert.com: could not connect to host
caroli.biz: could not connect to host
+caroli.info: could not connect to host
carpliyz.com: did not receive HSTS header
carrando.de: could not connect to host
carredejardin.com: could not connect to host
carroarmato0.be: did not receive HSTS header
carsforbackpackers.com: could not connect to host
-carsten.pw: could not connect to host
+carsten.pw: did not receive HSTS header
carstenfeuls.de: did not receive HSTS header
carterorland.com: could not connect to host
cartesunicef.be: did not receive HSTS header
-carun.us: did not receive HSTS header
carwashvapeur.be: could not connect to host
casadellecose.com: did not receive HSTS header
casajardininsecticidas.com: did not receive HSTS header
@@ -2660,17 +3020,22 @@ cashfortulsahouses.com: could not connect to host
cashless.fr: did not receive HSTS header
cashmyphone.ch: could not connect to host
cashsector.ga: could not connect to host
+casino-cash-flow.su: could not connect to host
+casino-cashflow.ru: could not connect to host
+casinocashflow.ru: could not connect to host
+casinolegal.pt: did not receive HSTS header
casinolistings.com: could not connect to host
casinoluck.com: could not connect to host
casinoreal.com: could not connect to host
casinostest.com: could not connect to host
casionova.org: did not receive HSTS header
casioshop.eu: did not receive HSTS header
+casjay.com: did not receive HSTS header
casovi.cf: could not connect to host
-caspicards.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
castagnonavocats.com: did not receive HSTS header
+castlejackpot.com: did not receive HSTS header
cata.ga: could not connect to host
-catalin.pw: did not receive HSTS header
+catalin.pw: could not connect to host
catarsisvr.com: could not connect to host
catcontent.cloud: could not connect to host
caterkids.com: did not receive HSTS header
@@ -2691,24 +3056,23 @@ caveclan.org: did not receive HSTS header
cavedevs.de: could not connect to host
cavedroid.xyz: could not connect to host
cavern.tv: did not receive HSTS header
-cayafashion.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-cayounglab.co.jp: did not receive HSTS header
-cazes.info: did not receive HSTS header
+cayafashion.de: did not receive HSTS header
cbamo.org: did not receive HSTS header
-cbengineeringinc.com: max-age too low: 86400
-cbhq.net: could not connect to host
cbi-epa.gov: could not connect to host
cc2729.com: did not receive HSTS header
ccayearbook.com: could not connect to host
ccblog.de: did not receive HSTS header
-ccl-sti.ch: could not connect to host
+ccja.ro: did not receive HSTS header
+ccl-sti.ch: did not receive HSTS header
ccretreatandfarm.com: did not receive HSTS header
cctech.ph: could not connect to host
cctld.com: could not connect to host
ccv.eu: did not receive HSTS header
cd0.us: could not connect to host
cdcpartners.gov: could not connect to host
+cdeck.net: could not connect to host
cdkeyworld.de: did not receive HSTS header
+cdlcenter.com: could not connect to host
cdmhp.org.nz: could not connect to host
cdmon.tech: could not connect to host
cdn.sx.cn: could not connect to host
@@ -2727,10 +3091,12 @@ cekaja.com: did not receive HSTS header
celebphotos.blog: could not connect to host
celec.gob.ec: could not connect to host
celeirorural.com.br: did not receive HSTS header
+celeraindustries.tk: did not receive HSTS header
celigo.com: did not receive HSTS header
celina-reads.de: could not connect to host
cellartracker.com: could not connect to host
cellsites.nz: could not connect to host
+cem.pw: did not receive HSTS header
cencalvia.org: could not connect to host
centennialrewards.com: did not receive HSTS header
centerforpolicy.org: could not connect to host
@@ -2744,10 +3110,11 @@ centralvoice.org: could not connect to host
centralync.com: could not connect to host
centrepoint-community.com: could not connect to host
centricbeats.com: did not receive HSTS header
-centrodoinstalador.com.br: did not receive HSTS header
centrolavoro.org: did not receive HSTS header
centsforchange.net: could not connect to host
+century-group.com: could not connect to host
ceoimon.com: did not receive HSTS header
+ceoptique.com: could not connect to host
cercevelet.com: did not receive HSTS header
ceres1.space: did not receive HSTS header
ceresia.ch: could not connect to host
@@ -2755,23 +3122,26 @@ ceritamalam.net: could not connect to host
cerize.love: could not connect to host
cernega.ro: did not receive HSTS header
cerpa.com.br: did not receive HSTS header
+cerstve-korenie.sk: did not receive HSTS header
+cerstvekorenie.sk: did not receive HSTS header
cert.se: max-age too low: 2628001
certcenter.fr: could not connect to host
-certifi.io: could not connect to host
+certifi.io: did not receive HSTS header
certifix.eu: did not receive HSTS header
certly.io: could not connect to host
certmgr.org: could not connect to host
ceruleanmainbeach.com.au: did not receive HSTS header
-cervejista.com: could not connect to host
cesal.net: could not connect to host
cesidianroot.eu: could not connect to host
+cespri.com.pe: did not receive HSTS header
+ceta.one: did not receive HSTS header
cevrimici.com: could not connect to host
cf-tm.net: could not connect to host
cf11.de: did not receive HSTS header
cfcnexus.org: could not connect to host
cfcproperties.com: did not receive HSTS header
cfetengineering.com: could not connect to host
-cfneia.org: could not connect to host
+cfneia.org: did not receive HSTS header
cfoitplaybook.com: could not connect to host
cfsh.tk: could not connect to host
cganx.org: could not connect to host
@@ -2782,8 +3152,8 @@ chabaojia.com: did not receive HSTS header
chadklass.com: could not connect to host
chahub.com: could not connect to host
chainmonitor.com: could not connect to host
+chairinstitute.com: did not receive HSTS header
chaldeen.pro: did not receive HSTS header
-chalker.io: could not connect to host
challengeskins.com: could not connect to host
chameleon-ents.co.uk: could not connect to host
chameth.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -2796,14 +3166,14 @@ chandlerredding.com: could not connect to host
changelab.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
changetip.com: did not receive HSTS header
channelcards.com: did not receive HSTS header
-channellife.asia: did not receive HSTS header
+channellife.asia: could not connect to host
channellife.co.nz: did not receive HSTS header
channellife.com.au: did not receive HSTS header
channyc.com: did not receive HSTS header
-chanoyu-gakkai.jp: could not connect to host
-chanshiyu.com: did not receive HSTS header
chaos.fail: could not connect to host
+chaoscastles.co.uk: did not receive HSTS header
chaospott.de: did not receive HSTS header
+chaoswars.ddns.net: could not connect to host
chaoswebs.net: did not receive HSTS header
chaouby.com: could not connect to host
charakato.com: could not connect to host
@@ -2811,25 +3181,29 @@ charge.co: could not connect to host
chargejuice.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
charityclear.com: could not connect to host
charitystreet.co.uk: could not connect to host
+charl.eu: could not connect to host
+charlesjay.com: could not connect to host
+charlestonfacialplastic.com: did not receive HSTS header
charlestonsecuritysystems.net: did not receive HSTS header
charliemcneive.com: could not connect to host
charlimarie.com: did not receive HSTS header
-charlipopkids.com.au: could not connect to host
+charlipopkids.com.au: did not receive HSTS header
charnleyhouse.co.uk: did not receive HSTS header
charonsecurity.com: could not connect to host
charp.eu: could not connect to host
chartstoffarm.de: did not receive HSTS header
+chasafilli.ch: could not connect to host
chaseganey.com: did not receive HSTS header
chasing-coins.com: did not receive HSTS header
-chaska.co.za: did not receive HSTS header
+chaska.co.za: could not connect to host
chasse-et-plaisir.com: did not receive HSTS header
chat-porc.eu: did not receive HSTS header
chatbot.me: did not receive HSTS header
+chatbot.one: could not connect to host
chatbots.email: could not connect to host
-chateau-belvoir.com: did not receive HSTS header
+chateau-belvoir.com: could not connect to host
chateauconstellation.ch: did not receive HSTS header
chateaudevaugrigneuse.com: did not receive HSTS header
-chatint.com: did not receive HSTS header
chatnbook.com: could not connect to host
chatup.cf: could not connect to host
chatxp.com: could not connect to host
@@ -2846,9 +3220,11 @@ cheazey.net: did not receive HSTS header
chebedara.com: could not connect to host
chebwebb.com: could not connect to host
checkhost.org: could not connect to host
+checkmateshoes.com: did not receive HSTS header
checkmatewebsolutions.com: max-age too low: 0
checkout.google.com: could not connect to host (error ignored - included regardless)
-checkyourmeds.com: could not connect to host
+checkyourmath.com: could not connect to host
+checkyourmeds.com: did not receive HSTS header
cheekylittlerascals.co.uk: did not receive HSTS header
cheerflow.com: could not connect to host
cheesefusion.com: could not connect to host
@@ -2856,7 +3232,6 @@ cheesetart.my: could not connect to host
cheesypicsbooths.co.uk: could not connect to host
cheetah85.de: could not connect to host
chefgalles.com.br: could not connect to host
-chehalemgroup.com: did not receive HSTS header
chejianer.cn: could not connect to host
chellame.com: could not connect to host
chellame.fr: could not connect to host
@@ -2867,13 +3242,14 @@ chengtongled.com: did not receive HSTS header
chensir.net: could not connect to host
chepaofen.com: did not receive HSTS header
cherekerry.com: could not connect to host
-cherrydropscandycarts.co.uk: did not receive HSTS header
+cherrydropscandycarts.co.uk: could not connect to host
cherylsoleway.com: did not receive HSTS header
chessreporter.nl: did not receive HSTS header
chesterbrass.uk: did not receive HSTS header
chiamata-aiuto.ch: could not connect to host
chiaramail.com: could not connect to host
chib.chat: could not connect to host
+chicagolug.org: could not connect to host
chicorycom.net: could not connect to host
chihiro.xyz: could not connect to host
chijiokeindustries.co.uk: could not connect to host
@@ -2881,12 +3257,13 @@ chikan-beacon.net: could not connect to host
chikatomo-ryugaku.com: did not receive HSTS header
chikory.com: could not connect to host
childcaresolutionscny.org: did not receive HSTS header
-childrendeservebetter.org: did not receive HSTS header
-childrens-room.com: did not receive HSTS header
-chilli943.info: did not receive HSTS header
+childrendeservebetter.org: could not connect to host
+chilli943.info: could not connect to host
chimparoo.ca: did not receive HSTS header
china-dhl.org: could not connect to host
china-line.org: could not connect to host
+chinacdn.org: could not connect to host
+chinawhale.com: could not connect to host
chinternet.xyz: could not connect to host
chiphell.com: did not receive HSTS header
chirgui.eu: could not connect to host
@@ -2896,23 +3273,22 @@ chloeallison.co.uk: could not connect to host
chloehorler.com: could not connect to host
chlouis.net: could not connect to host
chm.vn: did not receive HSTS header
-chmielarz.it: could not connect to host
chocolat-suisse.ch: could not connect to host
chodobien.com: could not connect to host
chodocu.com: did not receive HSTS header
choe.fi: could not connect to host
choiralberta.ca: did not receive HSTS header
-chonghe.org: did not receive HSTS header
+chollima.pro: could not connect to host
chontalpa.pw: could not connect to host
chopperforums.com: could not connect to host
chordso.com: did not receive HSTS header
chorkley.me: could not connect to host
+chorleiterverband.de: did not receive HSTS header
choruscrowd.com: could not connect to host
chotu.net: could not connect to host
+chr0me.sh: could not connect to host
chris-web.info: could not connect to host
chrisandsarahinasia.com: could not connect to host
-chrisb.me: did not receive HSTS header
-chrisb.xyz: did not receive HSTS header
chrisbrakebill.com: did not receive HSTS header
chrisbrown.id.au: could not connect to host
chrisebert.net: could not connect to host
@@ -2921,7 +3297,8 @@ chrisfinazzo.com: did not receive HSTS header
chriskirchner.de: did not receive HSTS header
chriskyrouac.com: could not connect to host
chrisopperwall.com: did not receive HSTS header
-chrisself.xyz: could not connect to host
+chrisself.xyz: max-age too low: 0
+christerwaren.fi: could not connect to host
christiaandruif.nl: could not connect to host
christianbro.gq: could not connect to host
christianhoffmann.info: could not connect to host
@@ -2931,17 +3308,19 @@ christianscholz.eu: did not receive HSTS header
christina-quast.de: did not receive HSTS header
christophebarbezat.ch: could not connect to host
christophercolumbusfoundation.gov: could not connect to host
+christopherpritchard.co.uk: could not connect to host
christophersole.com: could not connect to host
christophheich.me: did not receive HSTS header
+christophkreileder.com: could not connect to host
chrisupjohn.com: could not connect to host
chrisupjohn.xyz: could not connect to host
chrisvicmall.com: did not receive HSTS header
-chriswbarry.com: did not receive HSTS header
chromaryu.net: could not connect to host
chromaxa.com: could not connect to host
chrome: could not connect to host
chrome-devtools-frontend.appspot.com: did not receive HSTS header (error ignored - included regardless)
chrome.google.com: did not receive HSTS header (error ignored - included regardless)
+chronic101.xyz: could not connect to host
chronogram.me: did not receive HSTS header
chronoproject.com: did not receive HSTS header
chrst.ph: could not connect to host
@@ -2952,14 +3331,15 @@ chua.family: did not receive HSTS header
chuckame.fr: did not receive HSTS header
chulado.com: did not receive HSTS header
chundelac.com: could not connect to host
-churchux.co: did not receive HSTS header
+churchux.co: could not connect to host
churchwebcanada.ca: did not receive HSTS header
churchwebsupport.com: did not receive HSTS header
churrasqueirafacil.com.br: could not connect to host
+chxdf.net: could not connect to host
ci-labo.com.tw: max-age too low: 7889238
ci-suite.com: could not connect to host
cianmawhinney.xyz: could not connect to host
-cidadedopoker.com.br: did not receive HSTS header
+cidadedopoker.com.br: could not connect to host
ciderclub.com: could not connect to host
cidr.ml: could not connect to host
cienbeaute-lidl.fr: could not connect to host
@@ -2970,26 +3350,30 @@ ciicutini.ro: did not receive HSTS header
cim2b.de: could not connect to host
cimalando.eu: could not connect to host
cinartelorgu.com: did not receive HSTS header
+cindey.io: could not connect to host
cinefilia.tk: could not connect to host
-cinelite.club: could not connect to host
+cinelite.club: did not receive HSTS header
cinema5.ru: did not receive HSTS header
cinemaclub.co: could not connect to host
+ciner.is: could not connect to host
cinerama.com.br: did not receive HSTS header
cintdirect.com: could not connect to host
-cioconference.co.nz: did not receive HSTS header
+cioconference.co.nz: could not connect to host
cipher.co.th: did not receive HSTS header
cipher.land: could not connect to host
-cipherboy.com: could not connect to host
cipherli.st: did not receive HSTS header
ciplanutrition.com: could not connect to host
cipriano.nl: did not receive HSTS header
cira.email: could not connect to host
circ-logic.com: did not receive HSTS header
circlebox.rocks: could not connect to host
+cirfi.com: could not connect to host
cirrohost.com: did not receive HSTS header
ciscohomeanalytics.com: could not connect to host
ciscommerce.net: could not connect to host
+citationgurus.com: could not connect to host
citiagent.cz: could not connect to host
+citizen-cam.de: did not receive HSTS header
citra-emu.org: did not receive HSTS header
citroner.blog: could not connect to host
citybusexpress.com: did not receive HSTS header
@@ -3006,12 +3390,14 @@ cjessett.com: max-age too low: 0
cjtkfan.club: could not connect to host
ckcameron.net: could not connect to host
ckp.io: could not connect to host
+clad.cf: could not connect to host
+claibornecountytn.gov: could not connect to host
claimit.ml: could not connect to host
clan-ww.com: did not receive HSTS header
clapping-rhymes.com: could not connect to host
clara-baumert.de: could not connect to host
claralabs.com: did not receive HSTS header
-claretandbanter.uk: could not connect to host
+claretandbanter.uk: did not receive HSTS header
clarity-c2ced.appspot.com: did not receive HSTS header
claritysrv.com: did not receive HSTS header
clarksgaragedoorrepair.com: did not receive HSTS header
@@ -3026,6 +3412,7 @@ claudearpel.fr: did not receive HSTS header
claudio4.com: did not receive HSTS header
claytoncondon.com: could not connect to host
clcleaningco.com: could not connect to host
+cleanbeautymarket.com.au: did not receive HSTS header
cleanexperts.co.uk: could not connect to host
cleaningsquad.ca: did not receive HSTS header
cleanmta.com: could not connect to host
@@ -3035,6 +3422,7 @@ clearc.tk: could not connect to host
clearchatsandbox.com: could not connect to host
clearsky.me: did not receive HSTS header
clearviewwealthprojector.com.au: could not connect to host
+clementfevrier.fr: could not connect to host
clemovementlaw.com: could not connect to host
clerkendweller.uk: could not connect to host
clevelandokla.com: could not connect to host
@@ -3054,16 +3442,17 @@ clicktenisdemesa.com.br: did not receive HSTS header
clicn.bio: could not connect to host
clicnbio.com: could not connect to host
cliftons.com: did not receive HSTS header
+climaencusco.com: could not connect to host
clinia.ca: did not receive HSTS header
clinicaferrusbratos.com: did not receive HSTS header
clinicasilos.com: did not receive HSTS header
cliniko.com: did not receive HSTS header
-cliniquecomplementaire.com: could not connect to host
clintonbloodworth.com: could not connect to host
clintonbloodworth.io: could not connect to host
clintwilson.technology: max-age too low: 2592000
+clip.ovh: did not receive HSTS header
clipped4u.com: could not connect to host
-clod-hacking.com: could not connect to host
+clnet.com.au: did not receive HSTS header
cloghercastles.co.uk: did not receive HSTS header
clorik.com: could not connect to host
closient.com: did not receive HSTS header
@@ -3072,6 +3461,7 @@ cloud-crowd.com.au: did not receive HSTS header
cloud-project.com: could not connect to host
cloud.wtf: could not connect to host
cloud2go.de: did not receive HSTS header
+cloud58.org: did not receive HSTS header
cloudapi.vc: could not connect to host
cloudbased.info: did not receive HSTS header
cloudbasedsite.com: did not receive HSTS header
@@ -3085,11 +3475,12 @@ cloudimag.es: could not connect to host
cloudimproved.com: could not connect to host
cloudimprovedtest.com: could not connect to host
cloudlink.club: could not connect to host
-cloudmigrator365.com: did not receive HSTS header
+cloudmigrator365.com: could not connect to host
cloudns.com.au: could not connect to host
cloudopt.net: did not receive HSTS header
cloudpagesforwork.com: did not receive HSTS header
cloudpebble.net: did not receive HSTS header
+cloudpengu.in: could not connect to host
clouds.webcam: could not connect to host
cloudspotterapp.com: did not receive HSTS header
cloudstoragemaus.com: could not connect to host
@@ -3099,14 +3490,13 @@ cloudteam.de: did not receive HSTS header
cloudwalk.io: did not receive HSTS header
cloudwarez.xyz: could not connect to host
clounix.online: could not connect to host
-clovissantos.com: could not connect to host
+clovissantos.com: did not receive HSTS header
clowde.in: could not connect to host
clownaroundbouncycastles.co.uk: did not receive HSTS header
clownish.co.il: could not connect to host
clsimplex.com: did not receive HSTS header
clubcall.com: did not receive HSTS header
clubdeslecteurs.net: could not connect to host
-clubmate.rocks: could not connect to host
clubmix.co.kr: could not connect to host
cluefulca.com: could not connect to host
cluefulca.net: could not connect to host
@@ -3117,6 +3507,7 @@ clvrwebdesign.com: did not receive HSTS header
clvs7.com: did not receive HSTS header
clycat.ru: could not connect to host
clywedogmaths.co.uk: could not connect to host
+cm3.pw: could not connect to host
cmangos.net: did not receive HSTS header
cmc-versand.de: did not receive HSTS header
cmcc.network: could not connect to host
@@ -3125,32 +3516,32 @@ cmdtelecom.net.br: did not receive HSTS header
cmitao.com: could not connect to host
cmpr.es: could not connect to host
cmrss.com: could not connect to host
-cms-weble.jp: could not connect to host
cmsbattle.com: could not connect to host
cmscafe.ru: did not receive HSTS header
cmskh.co.uk: could not connect to host
cmso-cal.com: could not connect to host
cmweller.com: could not connect to host
+cnam.net: did not receive HSTS header
cnaprograms.online: could not connect to host
cncfraises.fr: did not receive HSTS header
+cncmachinemetal.com: did not receive HSTS header
cncn.us: did not receive HSTS header
cnetw.xyz: could not connect to host
cnitdog.com: could not connect to host
+cnlau.com: did not receive HSTS header
cnlic.com: could not connect to host
cnrd.me: did not receive HSTS header
cnsyear.com: did not receive HSTS header
cnwage.com: could not connect to host
cnwarn.com: could not connect to host
-co-driversphoto.se: did not receive HSTS header
+co-driversphoto.se: could not connect to host
co-yutaka.com: could not connect to host
coach-sportif.paris: did not receive HSTS header
coachingconsultancy.com: did not receive HSTS header
-coam.co: could not connect to host
-coathangerstrangla.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-coathangerstrangler.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
cobaltlp.com: could not connect to host
cobcode.com: could not connect to host
cobrax.net: could not connect to host
+cocaine-import.agency: could not connect to host
coccinellaskitchen.com: could not connect to host
coccinellaskitchen.de: could not connect to host
coccinellaskitchen.it: could not connect to host
@@ -3162,8 +3553,6 @@ cocktailfuture.fr: could not connect to host
coco-cool.fr: could not connect to host
cocodemy.com: did not receive HSTS header
cocolovesdaddy.com: could not connect to host
-codabix.com: did not receive HSTS header
-codabix.de: did not receive HSTS header
codabix.net: could not connect to host
code-35.com: could not connect to host
code-digsite.com: could not connect to host
@@ -3172,9 +3561,9 @@ code.google.com: did not receive HSTS header (error ignored - included regardles
codealkemy.co: could not connect to host
codeco.pw: could not connect to host
codecontrollers.de: could not connect to host
-codedelarouteenligne.fr: did not receive HSTS header
codeforce.io: could not connect to host
-codeforhakodate.org: did not receive HSTS header
+codeforhakodate.org: could not connect to host
+codejunkie.de: could not connect to host
codelayer.ca: could not connect to host
codelitmus.com: did not receive HSTS header
codeloop.pw: could not connect to host
@@ -3189,30 +3578,34 @@ codeproxy.ddns.net: could not connect to host
codepx.com: did not receive HSTS header
codercy.com: could not connect to host
coderhangout.com: could not connect to host
+codersbistro.com: did not receive HSTS header
codewiththepros.org: could not connect to host
codewiz.xyz: could not connect to host
-codigosddd.com.br: did not receive HSTS header
coecrafters.com: could not connect to host
coffeedino.com: did not receive HSTS header
coffeeetc.co.uk: could not connect to host
coffeestrategies.com: max-age too low: 5184000
cogniflex.com: did not receive HSTS header
+cognixia.com: did not receive HSTS header
cogumelosmagicos.org: could not connect to host
cohesive.io: did not receive HSTS header
coin-exchange.cz: could not connect to host
coindam.com: could not connect to host
coinessa.com: could not connect to host
+coinjar-sandbox.com: could not connect to host
colarelli.ch: could not connect to host
coldaddy.com: could not connect to host
coldlostsick.net: could not connect to host
coldwatericecream.com: did not receive HSTS header
colearnr.com: could not connect to host
+colincampbell.me: could not connect to host
collablynk.com: could not connect to host
collabra.email: did not receive HSTS header
collard.tk: could not connect to host
collbox.co: did not receive HSTS header
-collectosaurus.com: did not receive HSTS header
+collectosaurus.com: could not connect to host
colleencornez.com: could not connect to host
+collegepaperworld.com: could not connect to host
collegepulse.org: could not connect to host
collies.eu: max-age too low: 3
collinghammethodist.org.uk: did not receive HSTS header
@@ -3222,20 +3615,25 @@ collinsartworks.com: did not receive HSTS header
collision.fyi: could not connect to host
colmexpro.com: did not receive HSTS header
colognegaming.net: could not connect to host
+cololi.moe: max-age too low: 2592000
coloradocomputernetworking.net: could not connect to host
colorcentertoner.com.br: did not receive HSTS header
coloringnotebook.com: could not connect to host
colorlib.com: did not receive HSTS header
colorunhas.com.br: did not receive HSTS header
+com-news.io: could not connect to host
com.cc: could not connect to host
combatshield.cz: did not receive HSTS header
comchezmeme.com: could not connect to host
comdotgame.com: could not connect to host
-comefollowme2016.com: could not connect to host
+comefollowme2016.com: did not receive HSTS header
comeoncolleen.com: did not receive HSTS header
comercialtrading.eu: could not connect to host
-cometonovascotia.ca: could not connect to host
+cometbot.cf: could not connect to host
cometrueunlimited.com: could not connect to host
+comevius.com: could not connect to host
+comevius.org: could not connect to host
+comevius.xyz: could not connect to host
comfortdom.ua: did not receive HSTS header
comfortticket.de: did not receive HSTS header
comfy.cafe: could not connect to host
@@ -3252,13 +3650,10 @@ commerciallocker.com: could not connect to host
commercialplanet.eu: could not connect to host
commune-preuilly.fr: did not receive HSTS header
community-cupboard.org: did not receive HSTS header
-comocurarlagastritis24.online: did not receive HSTS header
-comocurarlashemorroides.org: could not connect to host
+comocurarlashemorroides.org: did not receive HSTS header
comocurarlashemorroidesya.com: did not receive HSTS header
-comoimportar.net: did not receive HSTS header
+comodo.nl: could not connect to host
comorecuperaratumujerpdf.com: could not connect to host
-comosecarabarriga.net: did not receive HSTS header
-comoseduzir.net: did not receive HSTS header
comotalk.com: could not connect to host
compalytics.com: could not connect to host
comparamejor.com: did not receive HSTS header
@@ -3276,11 +3671,13 @@ compredietlight.com.br: did not receive HSTS header
comprefitasadere.com.br: could not connect to host
comprehensiveihc.com: could not connect to host
compromised.com: could not connect to host
+compros.me: could not connect to host
compsmag.com: did not receive HSTS header
-comptrollerofthecurrency.gov: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+comptrollerofthecurrency.gov: did not receive HSTS header
compucorner.com.mx: could not connect to host
-computeremergency.com.au: did not receive HSTS header
+compusolve.nl: could not connect to host
computertal.de: could not connect to host
+comssa.org.au: did not receive HSTS header
comyuno.com: did not receive HSTS header
concentrade.de: did not receive HSTS header
conceptatelier.de: could not connect to host
@@ -3288,11 +3685,13 @@ conception.sk: could not connect to host
concerts-metal.ch: did not receive HSTS header
conclave.global: could not connect to host
concord-group.co.jp: did not receive HSTS header
-concretehermit.com: did not receive HSTS header
conectalmeria.com: did not receive HSTS header
+confidential.network: could not connect to host
confirm365.com: could not connect to host
-conflux.tw: could not connect to host
+conflux.tw: did not receive HSTS header
conformal.com: could not connect to host
+confuddledpenguin.com: did not receive HSTS header
+cong5.net: max-age too low: 0
congz.me: could not connect to host
conkret.ch: could not connect to host
conkret.co.uk: could not connect to host
@@ -3305,7 +3704,7 @@ connected-verhuurservice.nl: did not receive HSTS header
connectfss.com: could not connect to host
connectingconcepts.com: did not receive HSTS header
conniesacademy.com: could not connect to host
-connorsmith.co: could not connect to host
+conpins.nl: could not connect to host
conrad.am: could not connect to host
consciousandglamorous.com: could not connect to host
consciousbrand.org.au: could not connect to host
@@ -3315,8 +3714,7 @@ conseil-gli.fr: did not receive HSTS header
consejosdehogar.com: did not receive HSTS header
console.python.org: did not receive HSTS header
console.support: did not receive HSTS header
-construct-trust.com: did not receive HSTS header
-constructive.men: could not connect to host
+construct-trust.com: could not connect to host
consultcelerity.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
consultingroupitaly.com: did not receive HSTS header
consultorcr.net: did not receive HSTS header
@@ -3332,7 +3730,9 @@ continuation.io: could not connect to host
continuumgaming.com: could not connect to host
contraout.com: could not connect to host
controlcenter.gigahost.dk: did not receive HSTS header
+contxt-agentur.de: did not receive HSTS header
convergemagazine.com: did not receive HSTS header
+converter.ml: could not connect to host
convertimg.com: could not connect to host
convoitises.com: did not receive HSTS header
cooink.net: could not connect to host
@@ -3342,9 +3742,9 @@ coolaj86.com: did not receive HSTS header
coolbutbroken.com: did not receive HSTS header
coolchevy.org.ua: did not receive HSTS header
coole-meister.de: could not connect to host
-coolerssr.space: could not connect to host
cooljs.me: could not connect to host
coolkidsbouncycastles.co.uk: did not receive HSTS header
+coolrc.me: did not receive HSTS header
coolvox.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
coonelnel.net: did not receive HSTS header
cooxa.com: could not connect to host
@@ -3356,7 +3756,7 @@ cor-ser.es: could not connect to host
coralproject.net: did not receive HSTS header
coralrosado.com.br: did not receive HSTS header
coramcdaniel.com: did not receive HSTS header
-corbinhesse.com: could not connect to host
+corbinhesse.com: did not receive HSTS header
corderoscleaning.com: did not receive HSTS header
cordial-restaurant.com: did not receive HSTS header
core4system.de: could not connect to host
@@ -3375,6 +3775,7 @@ cormilu.com.br: did not receive HSTS header
cornishcamels.com: did not receive HSTS header
cornmachine.com: did not receive HSTS header
coroasdefloresonline.com.br: could not connect to host
+coropiacere.org: could not connect to host
corozanu.ro: did not receive HSTS header
corpoatletico.com.br: could not connect to host
corporateencryption.com: could not connect to host
@@ -3385,12 +3786,11 @@ correiodovale.com.br: did not receive HSTS header
corruption-mc.net: could not connect to host
corruption-rsps.net: could not connect to host
corruption-server.net: could not connect to host
-corzntin.fr: could not connect to host
cosmeticosnet.com.br: did not receive HSTS header
cosmiatria.pe: could not connect to host
+cosmic-os.org: could not connect to host
cosmoluziluminacion.com: did not receive HSTS header
-cosmoss-departure.com: did not receive HSTS header
-cosplayer.com: could not connect to host
+cosmoss-departure.com: could not connect to host
costow.club: did not receive HSTS header
cotonea.de: did not receive HSTS header
cougarsland.com: did not receive HSTS header
@@ -3400,8 +3800,8 @@ count.sh: could not connect to host
countryoutlaws.ca: did not receive HSTS header
coup-dun-soir.ch: could not connect to host
couponcodeq.com: could not connect to host
-cour4g3.me: could not connect to host
couragewhispers.ca: could not connect to host
+coursables.com: did not receive HSTS header
coursdeprogrammation.com: could not connect to host
course.pp.ua: did not receive HSTS header
course.rs: could not connect to host
@@ -3413,12 +3813,12 @@ covenantbank.net: could not connect to host
coverdat.com: could not connect to host
coverduck.ru: could not connect to host
coworkingmanifesto.com: did not receive HSTS header
+cozitop.com.br: could not connect to host
cozmaadrian.ro: could not connect to host
cozy.io: did not receive HSTS header
cozycloud.cc: did not receive HSTS header
cpaneltips.com: could not connect to host
cpbanq.com: could not connect to host
-cptoon.com: could not connect to host
cpuvinf.eu.org: could not connect to host
cqchome.com: did not receive HSTS header
cracking.org: did not receive HSTS header
@@ -3442,23 +3842,27 @@ cravelyrics.com: could not connect to host
crazifyngers.com: could not connect to host
crazy-crawler.de: did not receive HSTS header
crazycen.com: could not connect to host
-crazycraftland.de: did not receive HSTS header
-crazycraftland.net: did not receive HSTS header
+crazycraftland.de: could not connect to host
+crazycraftland.net: could not connect to host
crazyfamily11.de: did not receive HSTS header
crazyhotseeds.com: did not receive HSTS header
crazyker.com: did not receive HSTS header
crbug.com: did not receive HSTS header (error ignored - included regardless)
+crc-online.nl: did not receive HSTS header
creaescola.com: did not receive HSTS header
creamybuild.com: could not connect to host
create-ls.jp: could not connect to host
create-test-publish.co.uk: could not connect to host
+createcos.com: could not connect to host
creativeapple.ltd: did not receive HSTS header
creativeartifice.com: did not receive HSTS header
+creativecommons.cl: did not receive HSTS header
creativecommonscatpictures.com: could not connect to host
creativefolks.co.uk: did not receive HSTS header
creativephysics.ml: could not connect to host
creativeplayuk.com: did not receive HSTS header
creato.top: could not connect to host
+creators.co: could not connect to host
crecips.com: could not connect to host
crecket.me: could not connect to host
credia.jp: did not receive HSTS header
@@ -3470,48 +3874,54 @@ crestoncottage.com: could not connect to host
crewplanner.eu: did not receive HSTS header
crge.eu: max-age too low: 0
crimewatch.net.za: could not connect to host
+crip-usk.ba: could not connect to host
crisissurvivalspecialists.com: could not connect to host
cristianhares.com: could not connect to host
-critcola.com: could not connect to host
criticalaim.com: could not connect to host
crizk.com: could not connect to host
crl-autos.com: could not connect to host
-crmdemo.website: did not receive HSTS header
+crmdemo.website: could not connect to host
+croceverdevb.it: did not receive HSTS header
crockett.io: did not receive HSTS header
croco.vision: did not receive HSTS header
croeder.net: could not connect to host
croisieres.discount: did not receive HSTS header
+cromefire.myds.me: could not connect to host
cromosomax.com: could not connect to host
+cronoscentral.be: did not receive HSTS header
croods-mt2.fr: did not receive HSTS header
croome.no-ip.org: could not connect to host
crop-alert.com: could not connect to host
crosbug.com: did not receive HSTS header (error ignored - included regardless)
crosspeakoms.com: did not receive HSTS header
crosssec.com: did not receive HSTS header
-crowd.supply: did not receive HSTS header
+crow.tw: could not connect to host
crowdcurity.com: did not receive HSTS header
crowdjuris.com: could not connect to host
-crowdwis.com: could not connect to host
+crowdwis.com: did not receive HSTS header
crownbouncycastlehire.co.uk: did not receive HSTS header
crownruler.com: did not receive HSTS header
crox.co: could not connect to host
crrev.com: did not receive HSTS header (error ignored - included regardless)
-crt.cloud: could not connect to host
+crt2014-2024review.gov: could not connect to host
crtvmgmt.com: could not connect to host
crudysql.com: could not connect to host
crufad.org: did not receive HSTS header
cruikshank.com.au: did not receive HSTS header
crushroom.com: max-age too low: 43200
+cruzadobalcazarabogados.com: could not connect to host
cruzeiropedia.org: did not receive HSTS header
cruzr.xyz: could not connect to host
crypalert.com: could not connect to host
crypt.guru: did not receive HSTS header
cryptify.eu: could not connect to host
-crypto-armory.com: could not connect to host
+crypto-navi.org: did not receive HSTS header
+crypto.tube: did not receive HSTS header
cryptobells.com: did not receive HSTS header
cryptobin.org: could not connect to host
cryptocaseproject.com: could not connect to host
cryptodash.net: could not connect to host
+cryptodyno.ninja: could not connect to host
cryptoisnotacrime.org: could not connect to host
cryptojar.io: could not connect to host
cryptolab.pro: could not connect to host
@@ -3519,44 +3929,50 @@ cryptolab.tk: did not receive HSTS header
cryptoparty.dk: could not connect to host
cryptopartyatx.org: could not connect to host
cryptopartynewcastle.org: could not connect to host
+cryptopro.shop: could not connect to host
cryptopush.com: did not receive HSTS header
crysadm.com: could not connect to host
crystalclassics.co.uk: did not receive HSTS header
crystalmate.eu: did not receive HSTS header
cs-colorscreed-betongulve.dk: could not connect to host
cs-ubladego.pl: could not connect to host
-csacongress.org: did not receive HSTS header
+csacongress.org: max-age too low: 2592000
csapak.com: did not receive HSTS header
csawctf.poly.edu: could not connect to host
cscau.com: did not receive HSTS header
csehnyelv.hu: could not connect to host
cselzer.com: did not receive HSTS header
cser.me: could not connect to host
+csfloors.co.uk: could not connect to host
csfs.org.uk: could not connect to host
csgf.ru: did not receive HSTS header
csgo.help: could not connect to host
csgo77.com: could not connect to host
csgodicegame.com: could not connect to host
csgoelemental.com: could not connect to host
-csgogamers.com: did not receive HSTS header
+csgogamers.com: could not connect to host
csgohandouts.com: did not receive HSTS header
csgokings.eu: could not connect to host
csgoshifter.com: could not connect to host
+csgotwister.com: could not connect to host
+cshopify.com: could not connect to host
csilies.de: could not connect to host
csinfo.us: could not connect to host
+cskdoc.com: did not receive HSTS header
csohack.tk: could not connect to host
cspbuilder.info: did not receive HSTS header
+csru.net: could not connect to host
cssps.org: could not connect to host
cssu.in: did not receive HSTS header
csvape.com: did not receive HSTS header
cswarzone.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
ct-status.org: could not connect to host
ct-watches.dk: did not receive HSTS header
-ctliu.com: could not connect to host
+cthomas.work: could not connect to host
ctrl.blog: did not receive HSTS header
-ctyi.me: did not receive HSTS header
+ctyi.me: could not connect to host
cuanhua3s.com: did not receive HSTS header
-cubecart.net: could not connect to host
+cubecart.net: did not receive HSTS header
cubecraftstore.com: could not connect to host
cubecraftstore.net: could not connect to host
cubela.tech: could not connect to host
@@ -3568,8 +3984,8 @@ cuecamania.com.br: could not connect to host
cujanovic.com: did not receive HSTS header
cujba.com: could not connect to host
culinae.nl: could not connect to host
-culture-school.top: did not receive HSTS header
cultureelbeleggen.nl: did not receive HSTS header
+cultureroll.com: could not connect to host
cumparama.com: did not receive HSTS header
cumshots-video.ru: could not connect to host
cunha.be: could not connect to host
@@ -3581,45 +3997,55 @@ cuongthach.com: did not receive HSTS header
cuonic.com: could not connect to host
cupcake.io: did not receive HSTS header
cupcake.is: did not receive HSTS header
+cupidosshop.com: could not connect to host
cupofarchitects.net: could not connect to host
curacao-license.com: could not connect to host
curarnosensalud.com: could not connect to host
+curia.fi: could not connect to host
curiouscat.me: max-age too low: 2592000
curlyroots.com: did not receive HSTS header
current.com: did not receive HSTS header
curroapp.com: could not connect to host
cursosdnc.cl: did not receive HSTS header
+cursosgratuitos.com.br: did not receive HSTS header
curveweb.co.uk: did not receive HSTS header
cusfit.com: did not receive HSTS header
custe.rs: could not connect to host
+custerweb.com: could not connect to host
customadesign.com: did not receive HSTS header
customd.com: did not receive HSTS header
customfilmworks.com: could not connect to host
customizeyourshower.com: could not connect to host
custompapers.com: could not connect to host
customromlist.com: could not connect to host
+customshort.link: could not connect to host
customwritings.com: could not connect to host
cutelariafiveladeouro.com.br: did not receive HSTS header
cutorrent.com: could not connect to host
cuvva.insure: did not receive HSTS header
+cuxpool.club: could not connect to host
cvjm-memmingen.de: did not receive HSTS header
+cvps.top: did not receive HSTS header
cvsoftub.com: did not receive HSTS header
cvtparking.co.uk: did not receive HSTS header
+cvv.cn: could not connect to host
cw-bw.de: could not connect to host
cwage.com: could not connect to host
cwbw.network: did not receive HSTS header
cwilson.ga: could not connect to host
+cwinfo.fi: could not connect to host
cy.technology: did not receive HSTS header
cyanogenmod.xxx: could not connect to host
cybbh.space: could not connect to host
cyber-computer.club: could not connect to host
cyber-konzept.de: did not receive HSTS header
-cyber-perikarp.eu: did not receive HSTS header
+cyber-perikarp.eu: could not connect to host
+cyber.cafe: could not connect to host
cybercecurity.com: did not receive HSTS header
cybercloud.cc: did not receive HSTS header
-cyberdos.de: did not receive HSTS header
+cybercymru.co.uk: did not receive HSTS header
cyberdyne-industries.net: could not connect to host
-cyberlab.kiev.ua: did not receive HSTS header
+cyberfrancais.ro: did not receive HSTS header
cyberlab.team: did not receive HSTS header
cyberpeace.nl: could not connect to host
cyberprey.com: did not receive HSTS header
@@ -3627,15 +4053,18 @@ cyberpunk.ca: could not connect to host
cybersantri.com: could not connect to host
cyberserver.org: could not connect to host
cybershambles.com: could not connect to host
+cybersmart.co.uk: did not receive HSTS header
cyberspace.today: could not connect to host
+cybertorsk.org: could not connect to host
cyclehackluxembourgcity.lu: could not connect to host
cyclingjunkies.com: could not connect to host
cydia-search.io: could not connect to host
cyelint.com: could not connect to host
-cygu.ch: could not connect to host
+cygu.ch: did not receive HSTS header
cymtech.net: could not connect to host
cynoshair.com: could not connect to host
cyoda.com: did not receive HSTS header
+cypherpunk.at: could not connect to host
cypherpunk.com: could not connect to host
cypherpunk.ws: could not connect to host
cyphertite.com: could not connect to host
@@ -3648,45 +4077,51 @@ d-academia.com: did not receive HSTS header
d-macindustries.com: did not receive HSTS header
d-rickroll-e.pw: could not connect to host
d-toys.com.ua: could not connect to host
+d.rip: max-age too low: 900
d00r.de: did not receive HSTS header
d0xq.net: could not connect to host
d1ves.io: did not receive HSTS header
+d3njjcbhbojbot.cloudfront.net: did not receive HSTS header
d3x.pw: could not connect to host
d4rkdeagle.tk: could not connect to host
-d4wson.com: could not connect to host
d8studio.net: could not connect to host
+da.hn: could not connect to host
da8.cc: could not connect to host
dabblegoat.com: could not connect to host
dabbot.org: did not receive HSTS header
dad256.tk: could not connect to host
dadtheimpaler.com: could not connect to host
+daemonslayer.net: could not connect to host
dah5.com: did not receive HSTS header
dahl-pind.dk: did not receive HSTS header
+dahlberg.cologne: could not connect to host
dai-rin.co.jp: could not connect to host
dailybunda.com: did not receive HSTS header
dailystormerpodcasts.com: could not connect to host
dailytopix.com: could not connect to host
daimadi.com: could not connect to host
-daisuki.pw: could not connect to host
-daiyuu.jp: could not connect to host
+daisuki.pw: did not receive HSTS header
+daiwai.de: did not receive HSTS header
dakerealestate.com: did not receive HSTS header
dakl-shop.de: did not receive HSTS header
dakotasilencer.com: did not receive HSTS header
dakrib.net: could not connect to host
daku.gdn: could not connect to host
+dalek.co.nz: did not receive HSTS header
dalfiume.it: did not receive HSTS header
dalingk.co: could not connect to host
daltonedwards.me: could not connect to host
dam74.com.ar: could not connect to host
damianuv-blog.cz: did not receive HSTS header
+damicris.ro: could not connect to host
damjanovic.work: could not connect to host
dan.org.nz: could not connect to host
danbarrett.com.au: could not connect to host
dancebuzz.co.uk: did not receive HSTS header
dancerdates.net: did not receive HSTS header
dandymrsb.com: could not connect to host
-dane-bre.net: max-age too low: 172800
dango.in: could not connect to host
+daniel-du.com: could not connect to host
daniel-mosquera.com: could not connect to host
daniel-seifert.com: max-age too low: 600000
daniel-stahl.net: could not connect to host
@@ -3694,36 +4129,33 @@ daniel-steuer.de: could not connect to host
danielcowie.me: could not connect to host
danieldk.eu: did not receive HSTS header
danielgraziano.ca: could not connect to host
-danielheal.net: could not connect to host
danieliancu.com: could not connect to host
danielkratz.com: max-age too low: 172800
-danielmarquard.com: could not connect to host
-danielt.co.uk: could not connect to host
-danielthompson.info: could not connect to host
+danielt.co.uk: did not receive HSTS header
danielverlaan.nl: could not connect to host
danielworthy.com: did not receive HSTS header
danielzuzevich.com: could not connect to host
danijobs.com: could not connect to host
danishenanigans.com: could not connect to host
dankeblog.com: could not connect to host
+dankredues.com: could not connect to host
danmark.guide: did not receive HSTS header
dannycrichton.com: did not receive HSTS header
danova.de: did not receive HSTS header
danrl.de: could not connect to host
-dansage.co: could not connect to host
danskringsporta.be: did not receive HSTS header
danwillenberg.com: did not receive HSTS header
daolerp.xyz: could not connect to host
+dapim.co.il: did not receive HSTS header
dargasia.is: could not connect to host
darinjohnson.ca: did not receive HSTS header
dario.im: did not receive HSTS header
-dariosirangelo.me: did not receive HSTS header
dark-x.cf: could not connect to host
-dark.ninja: could not connect to host
darkanzali.pl: max-age too low: 0
darkdestiny.ch: could not connect to host
+darkfire.ch: could not connect to host
darkfriday.ddns.net: could not connect to host
-darkhole.cn: could not connect to host
+darkhole.cn: did not receive HSTS header
darkishgreen.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
darkkeepers.dk: max-age too low: 172800
darknebula.space: could not connect to host
@@ -3737,6 +4169,8 @@ darlastudio66.com: did not receive HSTS header
darlo.co.uk: could not connect to host
darrenellis.xyz: could not connect to host
darrenm.net: could not connect to host
+dart-tanke.com: could not connect to host
+dart-tanke.de: could not connect to host
das-tyrol.at: did not receive HSTS header
dash-board.jp: did not receive HSTS header
dash.rocks: did not receive HSTS header
@@ -3760,6 +4194,7 @@ datahoarderschool.club: did not receive HSTS header
dataisme.com: did not receive HSTS header
datajapan.co.jp: did not receive HSTS header
datamatic.ru: could not connect to host
+datapun.ch: did not receive HSTS header
dataretention.solutions: could not connect to host
datasharesystem.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
datasnitch.co.uk: could not connect to host
@@ -3778,6 +4213,8 @@ datortipsen.se: did not receive HSTS header
datsound.ru: did not receive HSTS header
datsumou-q.com: did not receive HSTS header
daverandom.com: could not connect to host
+davewut.ca: did not receive HSTS header
+david-mallett.com: did not receive HSTS header
davidandkailey.com: could not connect to host
davidbrito.tech: could not connect to host
davidglidden.eu: did not receive HSTS header
@@ -3791,26 +4228,28 @@ davidscherzer.at: could not connect to host
davimun.org: could not connect to host
davros.eu: could not connect to host
davros.ru: could not connect to host
-daw.nz: could not connect to host
dawnofeden.org: did not receive HSTS header
dawnson.is: could not connect to host
dawnsonb.com: could not connect to host
-day.vip: did not receive HSTS header
+day.vip: could not connect to host
daylightcompany.com: did not receive HSTS header
days.one: could not connect to host
daytonaseaside.com: did not receive HSTS header
+db-sanity.com: could not connect to host
db.gy: could not connect to host
+dbcom.ru: could not connect to host
dbjc.duckdns.org: could not connect to host
dblx.io: could not connect to host
dbox.ga: could not connect to host
dbpmedia.se: did not receive HSTS header
-dbx.ovh: did not receive HSTS header
+dbx.ovh: could not connect to host
dbyz.co.uk: max-age too low: 43200
dcaracing.nl: could not connect to host
dcc.moe: could not connect to host
dccode.gov: could not connect to host
dccoffeeproducts.com: did not receive HSTS header
dccraft.net: could not connect to host
+dcl.re: could not connect to host
dctxf.com: did not receive HSTS header
dcuofriends.net: could not connect to host
dcw.io: did not receive HSTS header
@@ -3820,12 +4259,13 @@ dden.website: could not connect to host
dden.xyz: could not connect to host
ddmeportal.com: could not connect to host
ddns-anbieter.de: could not connect to host
-ddocu.me: did not receive HSTS header
+ddocu.me: could not connect to host
ddos-mitigation.co.uk: could not connect to host
ddos-mitigation.info: could not connect to host
de-servers.de: could not connect to host
deadmann.com: could not connect to host
deadsoul.net: could not connect to host
+deai-life.biz: could not connect to host
debank.tv: did not receive HSTS header
debatch.se: could not connect to host
debian-vhost.de: could not connect to host
@@ -3852,10 +4292,8 @@ decormiernissanparts.com: could not connect to host
decoyrouting.com: could not connect to host
dedeo.tk: could not connect to host
dedicatutiempo.es: could not connect to host
-dedietrich-asia.com: could not connect to host
-deep.club: could not connect to host
+dedietrich-asia.com: did not receive HSTS header
deep.social: did not receive HSTS header
-deeparamaraj.com: did not receive HSTS header
deepcovelabs.net: could not connect to host
deepcreampie.com: could not connect to host
deepearth.uk: could not connect to host
@@ -3863,29 +4301,33 @@ deeprecce.com: could not connect to host
deeprecce.link: could not connect to host
deeprecce.tech: could not connect to host
deeps.cat: could not connect to host
-deeps.me: did not receive HSTS header
+deeps.me: could not connect to host
+deepsouthsounds.com: did not receive HSTS header
deepvalley.tech: could not connect to host
deepvision.com.ua: did not receive HSTS header
deer.team: could not connect to host
deetz.nl: did not receive HSTS header
deetzen.de: did not receive HSTS header
-defi-metier.com: did not receive HSTS header
-defi-metier.fr: did not receive HSTS header
+deezeno.com: could not connect to host
+defi-metier.com: could not connect to host
+defi-metier.fr: could not connect to host
defi-metier.org: could not connect to host
-defi-metiers.com: did not receive HSTS header
+defi-metiers.com: could not connect to host
defi-metiers.fr: did not receive HSTS header
-defi-metiers.org: did not receive HSTS header
+defi-metiers.org: could not connect to host
defiler.tk: could not connect to host
defimetier.fr: could not connect to host
-defimetier.org: did not receive HSTS header
-defimetiers.com: did not receive HSTS header
-defimetiers.fr: did not receive HSTS header
+defimetier.org: could not connect to host
+defimetiers.com: could not connect to host
+defimetiers.fr: could not connect to host
degroetenvanrosaline.nl: could not connect to host
deight.co: could not connect to host
deight.in: could not connect to host
dekasan.ru: could not connect to host
delandalucia.com: did not receive HSTS header
delayrefunds.co.uk: could not connect to host
+delcopa.gov: could not connect to host
+delf.co.jp: did not receive HSTS header
deliberatedigital.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
deliver.moe: did not receive HSTS header
deliverance.co.uk: could not connect to host
@@ -3894,13 +4336,10 @@ deltaconcepts.de: could not connect to host
delvj.org: could not connect to host
demandware.com: did not receive HSTS header
demarche-expresse.com: did not receive HSTS header
-demarle.ch: could not connect to host
demdis.org: could not connect to host
demilitarized.ninja: could not connect to host
demo-server.us: could not connect to host
demo.sb: could not connect to host
-demo.swedbank.se: did not receive HSTS header
-demo9.ovh: did not receive HSTS header
democracy.io: did not receive HSTS header
democraticdifference.com: could not connect to host
demomanca.com: did not receive HSTS header
@@ -3914,16 +4353,16 @@ dentaldomain.org: did not receive HSTS header
dentaldomain.ph: did not receive HSTS header
denvercybersecurity.com: did not receive HSTS header
denverphilharmonic.org: did not receive HSTS header
-denverprophit.us: did not receive HSTS header
+denverprophit.us: could not connect to host
depaco.com: did not receive HSTS header
deped.blog: could not connect to host
depedshs.com: could not connect to host
+depedtayo.com: did not receive HSTS header
depedtayo.ph: could not connect to host
depijl-mz.nl: did not receive HSTS header
depixion.agency: could not connect to host
depo.space: could not connect to host
deprobe.pro: could not connect to host
-depth-co.jp: could not connect to host
dequehablamos.es: could not connect to host
derbyshiredotnet.co.uk: did not receive HSTS header
derchris.me: could not connect to host
@@ -3934,10 +4373,11 @@ derivativeshub.pro: could not connect to host
derive.cc: could not connect to host
dermacarecomplex.com: could not connect to host
derpumpkinfuhrer.com: could not connect to host
-derrickemery.com: did not receive HSTS header
+derrickemery.com: could not connect to host
derwaldschrat.net: did not receive HSTS header
derwolfe.net: did not receive HSTS header
desiccantpackets.com: did not receive HSTS header
+design-fu.com: did not receive HSTS header
designandmore.it: did not receive HSTS header
designanyware.com.br: could not connect to host
designgears.com: did not receive HSTS header
@@ -3948,9 +4388,11 @@ despora.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR
despotika.de: could not connect to host
desserteagleselvenar.tk: could not connect to host
destinationbijoux.fr: could not connect to host
+destinopiriapolis.com: did not receive HSTS header
destom.be: could not connect to host
desveja.com.br: could not connect to host
detalhecomercio.com.br: could not connect to host
+detalyedesigngroup.com: could not connect to host
detechnologiecooperatie.nl: did not receive HSTS header
detecte-fuite.ch: could not connect to host
detecte.ch: could not connect to host
@@ -3961,8 +4403,8 @@ dethikiemtra.com: did not receive HSTS header
detroitrocs.org: did not receive HSTS header
detteflies.com: max-age too low: 7889238
detutorial.com: max-age too low: 36000
-deusu.de: could not connect to host
-deusu.org: could not connect to host
+deusu.de: did not receive HSTS header
+deusu.org: did not receive HSTS header
deux.solutions: could not connect to host
deuxsol.co: could not connect to host
deuxsol.com: could not connect to host
@@ -3972,21 +4414,22 @@ dev: could not connect to host
dev-aegon.azurewebsites.net: did not receive HSTS header
dev-bluep.pantheonsite.io: did not receive HSTS header
dev-talk.eu: did not receive HSTS header
-dev-talk.net: did not receive HSTS header
+dev-talk.net: could not connect to host
devafterdark.com: could not connect to host
devdesco.com: could not connect to host
devdom.io: max-age too low: 172800
-develop.cool: did not receive HSTS header
+devdoodle.net: could not connect to host
develop.fitness: could not connect to host
-developermail.io: did not receive HSTS header
developersclub.website: could not connect to host
-developyourelement.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+devenney.io: did not receive HSTS header
devh.de: could not connect to host
+deviltracks.net: could not connect to host
devin-balimuhac.de: did not receive HSTS header
devincrow.me: could not connect to host
devinpacker.com: could not connect to host
devisonline.ch: could not connect to host
devistravaux.org: did not receive HSTS header
+devjack.de: could not connect to host
devlux.ch: did not receive HSTS header
devmsg.com: could not connect to host
devnsec.com: could not connect to host
@@ -3994,10 +4437,14 @@ devnull.team: could not connect to host
devopps.me: did not receive HSTS header
devops.moe: could not connect to host
devopsconnected.com: could not connect to host
+devpgsv.com: could not connect to host
devtestfan1.gov: could not connect to host
devtub.com: could not connect to host
devuan.org: did not receive HSTS header
+dewebwerf.nl: did not receive HSTS header
dewin.io: could not connect to host
+dfekt.no: could not connect to host
+dfektlan.no: could not connect to host
dfixit.com: could not connect to host
dfrance.com.br: did not receive HSTS header
dfviana.com.br: max-age too low: 2592000
@@ -4013,9 +4460,10 @@ dhub.xyz: could not connect to host
dhxxls.com: could not connect to host
diablotine.rocks: could not connect to host
diabolic.chat: could not connect to host
+diagnocentro.cl: could not connect to host
diagnosia.com: did not receive HSTS header
diagonale-deco.fr: did not receive HSTS header
-dialoegue.com: did not receive HSTS header
+dialectic-og.com: could not connect to host
diamondcare.com.br: did not receive HSTS header
diamondpkg.org: could not connect to host
diamondt.us: did not receive HSTS header
@@ -4029,15 +4477,15 @@ dicgaming.net: could not connect to host
dichgans-besserer.de: did not receive HSTS header
dichvudangkygiayphep.com: could not connect to host
dicio.com.br: did not receive HSTS header
-dicionariofinanceiro.com: did not receive HSTS header
-dicionariopopular.com: did not receive HSTS header
dick.red: could not connect to host
+dickord.club: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
didierlaumen.be: did not receive HSTS header
die-besten-weisheiten.de: could not connect to host
die-gruenen-teufel.de: could not connect to host
dieb.photo: could not connect to host
diejanssens.net: did not receive HSTS header
-diemogebhardt.com: could not connect to host
+diemogebhardt.com: did not receive HSTS header
+dierabenmutti.de: max-age too low: 7776000
dierencompleet.nl: did not receive HSTS header
dierenkruiden.nl: did not receive HSTS header
dieser.me: could not connect to host
@@ -4046,14 +4494,14 @@ diewebstube.de: could not connect to host
diezel.com: could not connect to host
diferenca.com: did not receive HSTS header
diggable.co: max-age too low: 2592000
-digihyp.ch: could not connect to host
+digihyp.ch: did not receive HSTS header
digikol.net: could not connect to host
diginota.com: did not receive HSTS header
digired.xyz: could not connect to host
digital1world.com: did not receive HSTS header
digitalbank.kz: could not connect to host
digitalcloud.ovh: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-digitalcraftmarketing.co.uk: did not receive HSTS header
+digitalcuko.com: did not receive HSTS header
digitaldaddy.net: did not receive HSTS header
digitalero.rip: did not receive HSTS header
digitalewelten.de: could not connect to host
@@ -4062,6 +4510,7 @@ digitalhurricane.io: could not connect to host
digitalimpostor.co.uk: could not connect to host
digitaljungle.net: could not connect to host
digitallocker.com: did not receive HSTS header
+digitalmaniac.co.uk: could not connect to host
digitalnonplus.com: could not connect to host
digitalquery.com: did not receive HSTS header
digitalriver.tk: did not receive HSTS header
@@ -4073,38 +4522,39 @@ diguass.us: could not connect to host
dijks.com: could not connect to host
dikshant.net: could not connect to host
diletec.com.br: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+dilichen.fr: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
dillynbarber.com: did not receive HSTS header
dim.lighting: could not connect to host
+dimensionen.de: did not receive HSTS header
dimes.com.tr: did not receive HSTS header
dimitrisotiropoulosbooks.com: max-age too low: 7889238
+dimseklubben.dk: could not connect to host
din-tools.com: did not receive HSTS header
dinamoelektrik.com: could not connect to host
dingcc.com: could not connect to host
dingcc.org: could not connect to host
dingcc.xyz: could not connect to host
-dinge.xyz: could not connect to host
dingelbob-schuhcreme.gq: could not connect to host
dingss.com: could not connect to host
-dinheirolucrar.com: did not receive HSTS header
dinkum.online: could not connect to host
dinotv.at: could not connect to host
dintillat.fr: could not connect to host
dinube.com: did not receive HSTS header
dionysus.se: could not connect to host
dipconsultants.com: could not connect to host
-direct2uk.com: could not connect to host
directhskincream.com: could not connect to host
directinsure.in: did not receive HSTS header
+directme.ga: could not connect to host
directorinegocis.cat: could not connect to host
directtwo.solutions: could not connect to host
directtwosolutions.org: could not connect to host
directwatertanks.co.uk: did not receive HSTS header
direnv.net: did not receive HSTS header
direwolfsoftware.ca: could not connect to host
-dirk-weise.de: could not connect to host
dirkwolf.de: could not connect to host
dirtycat.ru: could not connect to host
disadattamentolavorativo.it: could not connect to host
+discipul.nl: did not receive HSTS header
disclosure.io: did not receive HSTS header
disco-crazy-world.de: could not connect to host
discord-chan.net: could not connect to host
@@ -4113,7 +4563,7 @@ discountmetaux.fr: did not receive HSTS header
discover-mercure.com: could not connect to host
discoveringdocker.com: could not connect to host
discoverrsv.com: did not receive HSTS header
-discoverwellness.center: did not receive HSTS header
+discoverwellness.center: could not connect to host
discovery.lookout.com: did not receive HSTS header
discoveryballoon.org: could not connect to host
disking.co.uk: did not receive HSTS header
@@ -4132,23 +4582,29 @@ ditrutoancau.vn: could not connect to host
dittvertshus.no: could not connect to host
diva-ey.com: could not connect to host
divegearexpress.com.cn: did not receive HSTS header
+divenwa.com: did not receive HSTS header
diversity-spielzeug.de: did not receive HSTS header
divvi.co.nz: did not receive HSTS header
+divvymonkey.com: did not receive HSTS header
divvyradio.com: did not receive HSTS header
dixiediner.com: did not receive HSTS header
dixmag.com: could not connect to host
-diz.in.ua: did not receive HSTS header
+diz.in.ua: could not connect to host
dizihocasi.com: could not connect to host
dizorg.net: could not connect to host
dj4et.de: could not connect to host
+djieno.com: could not connect to host
+djsk.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
djul.net: could not connect to host
djxmmx.net: did not receive HSTS header
+dkn.go.id: did not receive HSTS header
dkniss.de: could not connect to host
-dko-steiermark.ml: did not receive HSTS header
dl.google.com: did not receive HSTS header (error ignored - included regardless)
dlbouncers.co.uk: could not connect to host
dlc.viasinc.com: could not connect to host
+dlcwilson.com: could not connect to host
dlemper.de: did not receive HSTS header
+dlouwrink.nl: could not connect to host
dlyl888.com: could not connect to host
dmarketer.com: did not receive HSTS header
dmcastles.com: did not receive HSTS header
@@ -4162,7 +4618,9 @@ dmlogic.com: could not connect to host
dmtry.me: did not receive HSTS header
dmwall.cn: could not connect to host
dmz.ninja: could not connect to host
+dnfc.rocks: could not connect to host
dnmaze.com: could not connect to host
+dns-manager.info: did not receive HSTS header
dns.google.com: did not receive HSTS header (error ignored - included regardless)
dnsbird.net: could not connect to host
dnsbird.org: could not connect to host
@@ -4175,14 +4633,16 @@ doak.io: did not receive HSTS header
dobet.in: could not connect to host
doc-justice.com: did not receive HSTS header
docid.io: could not connect to host
+dockerm.com: could not connect to host
dockerturkiye.com: could not connect to host
docket.news: could not connect to host
doclassworks.com: could not connect to host
doclot.io: could not connect to host
docplexus.in: did not receive HSTS header
+docplexus.org: did not receive HSTS header
docset.io: could not connect to host
docufiel.com: could not connect to host
-doculus.io: did not receive HSTS header
+doculus.io: could not connect to host
documentations-sociales.com: could not connect to host
docxtemplater.com: did not receive HSTS header
doesmycodehavebugs.today: could not connect to host
@@ -4190,6 +4650,7 @@ doeswindowssuckforeveryoneorjustme.com: could not connect to host
dogbox.se: could not connect to host
dogcratereview.info: could not connect to host
dogespeed.ga: could not connect to host
+dogfi.sh: could not connect to host
doggieholic.net: could not connect to host
dognlife.com: could not connect to host
dogoodbehappyllc.com: did not receive HSTS header
@@ -4197,9 +4658,10 @@ dogprograms.net: could not connect to host
dohosting.ru: could not connect to host
dojifish.space: could not connect to host
dojin.nagoya: could not connect to host
-dokan-e.com: could not connect to host
dokan.online: did not receive HSTS header
doked.io: could not connect to host
+dokspot.cf: could not connect to host
+dokspot.ga: could not connect to host
dolarcanadense.com.br: could not connect to host
dolevik.com: could not connect to host
dollarstore24.com: could not connect to host
@@ -4208,7 +4670,7 @@ dolphin-cloud.com: could not connect to host
dolphin-hosting.com: could not connect to host
dolphincorp.co.uk: could not connect to host
dolphinswithlasers.com: could not connect to host
-dolt.xyz: did not receive HSTS header
+dolt.xyz: could not connect to host
domaine-aigoual-cevennes.com: did not receive HSTS header
domainelaremejeanne.com: did not receive HSTS header
domaris.de: did not receive HSTS header
@@ -4219,17 +4681,19 @@ domfee.com: could not connect to host
dominikanskarepubliken.guide: could not connect to host
dominioanimal.com: could not connect to host
dominique-mueller.de: could not connect to host
+domytermpaper.com: could not connect to host
don.yokohama: could not connect to host
donateway.com: did not receive HSTS header
dong8.top: could not connect to host
+dongjingre.net: could not connect to host
donhoward.org: did not receive HSTS header
+donlydental.ca: did not receive HSTS header
donmez.uk: could not connect to host
donmez.ws: could not connect to host
-donner-reuschel.de: did not receive HSTS header
-donotcall.gov: did not receive HSTS header
donotspampls.me: could not connect to host
donotspellitgav.in: did not receive HSTS header
donpaginasweb.com: did not receive HSTS header
+donsbach-edv.de: did not receive HSTS header
donthedragonwilson.com: could not connect to host
donttrustrobots.nl: could not connect to host
donzelot.co.uk: did not receive HSTS header
@@ -4239,6 +4703,7 @@ doodlefinder.de: max-age too low: 600000
dooku.cz: could not connect to host
doomleika.com: did not receive HSTS header
doooonoooob.com: could not connect to host
+doopdidoop.com: did not receive HSTS header
door.cards: could not connect to host
dopost.it: could not connect to host
doriginal.es: did not receive HSTS header
@@ -4247,14 +4712,13 @@ dormebebe.com.br: could not connect to host
dosipe.com: could not connect to host
doska.kz: could not connect to host
dostavkakurierom.ru: could not connect to host
+dot.ro: did not receive HSTS header
dotadata.me: could not connect to host
dotb.dn.ua: did not receive HSTS header
-dotbrick.co.th: did not receive HSTS header
+dotbrick.co.th: could not connect to host
dotkod.com: could not connect to host
dotnetsandbox.ca: could not connect to host
-dotrox.net: could not connect to host
dotspaperie.com: could not connect to host
-doubleaste.com: did not receive HSTS header
doublethink.online: could not connect to host
doubleyummy.uk: did not receive HSTS header
dougferris.id.au: could not connect to host
@@ -4268,16 +4732,20 @@ dovetailnow.com: could not connect to host
dowc.org: did not receive HSTS header
download.jitsi.org: did not receive HSTS header
downsouthweddings.com.au: did not receive HSTS header
+doxcelerate.com: could not connect to host
doyoulyft.com: could not connect to host
dpangerl.de: did not receive HSTS header
dps.srl: did not receive HSTS header
dpsart.it: did not receive HSTS header
dr-knirr.de: could not connect to host
dr2dr.ca: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+drabben.be: did not receive HSTS header
drabbin.com: could not connect to host
+dragfiles.com: could not connect to host
draghive.club: did not receive HSTS header
draghive.net: could not connect to host
draghive.photos: did not receive HSTS header
+dragon-aspect.com: could not connect to host
dragoncityhack.tips: could not connect to host
dragonisles.net: could not connect to host
dragons-of-highlands.cz: did not receive HSTS header
@@ -4291,6 +4759,7 @@ drakfot.se: could not connect to host
dralexjimenez.com: did not receive HSTS header
drastosasports.com.br: could not connect to host
drawvesly.ovh: did not receive HSTS header
+drbarnabus.com: could not connect to host
drdevil.ru: could not connect to host
dreadbyte.com: could not connect to host
dreadd.org: could not connect to host
@@ -4298,26 +4767,26 @@ dreamaholic.club: could not connect to host
dreamcatcherblog.de: could not connect to host
dreaming.solutions: could not connect to host
dreamlighteyeserum.com: could not connect to host
-dreamof.net: could not connect to host
dreamsforabetterworld.com.au: did not receive HSTS header
dreax.win: could not connect to host
dredgepress.com: did not receive HSTS header
dreischneidiger.de: could not connect to host
dreizwosechs.de: could not connect to host
drewgle.net: could not connect to host
-drhopeson.com: could not connect to host
+drhopeson.com: did not receive HSTS header
drillnation.com.au: could not connect to host
drinknaturespower.com: could not connect to host
drinkvabeer.com: could not connect to host
+dripdoctors.com: did not receive HSTS header
drishti.guru: could not connect to host
drive.xyz: could not connect to host
+drivercopilot.com: did not receive HSTS header
drivewithstatetransit.com.au: did not receive HSTS header
driving-lessons.co.uk: could not connect to host
drixn.cn: could not connect to host
drixn.com: could not connect to host
drixn.info: could not connect to host
drixn.net: could not connect to host
-drizz.com.br: could not connect to host
drlazarina.net: did not receive HSTS header
drobniuch.pl: could not connect to host
drogoz.moe: could not connect to host
@@ -4343,8 +4812,10 @@ drpure.pw: did not receive HSTS header
drtroyhendrickson.com: could not connect to host
drtti.io: could not connect to host
drturner.com.au: did not receive HSTS header
+drubn.de: could not connect to host
drugagodba.si: did not receive HSTS header
drumbandesperanto.nl: could not connect to host
+drump-truck.com: did not receive HSTS header
drupal123.com: could not connect to host
druznek.rocks: could not connect to host
druznek.xyz: could not connect to host
@@ -4353,15 +4824,13 @@ drybasementkansas.com: did not receive HSTS header
drycreekapiary.com: could not connect to host
ds-christiansen.de: could not connect to host
dshiv.io: could not connect to host
-dsne.com.mx: did not receive HSTS header
dsouzamusic.com: could not connect to host
-dstvinstallrandburg.co.za: did not receive HSTS header
dsuinnovation.com: could not connect to host
-dsyunmall.com: could not connect to host
-dtp-mstdn.jp: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+dsyunmall.com: did not receive HSTS header
dtub.co: could not connect to host
-dualascent.com: did not receive HSTS header
+dualias.xyz: could not connect to host
duan.li: could not connect to host
+dubaosheng.com: could not connect to host
dubik.su: did not receive HSTS header
duckyubuntu.tk: could not connect to host
ducohosting.com: did not receive HSTS header
@@ -4387,9 +4856,11 @@ duocircle.com: did not receive HSTS header
duole30.com: could not connect to host
duongpho.com: did not receive HSTS header
durangoenergyllc.com: could not connect to host
+durchblick-shop.de: could not connect to host
+durexwinkel.nl: could not connect to host
dushu.cat: could not connect to host
duskopy.top: could not connect to host
-dutchessuganda.com: could not connect to host
+dutchessuganda.com: did not receive HSTS header
dutchrank.com: did not receive HSTS header
dutyfreeonboard.com: did not receive HSTS header
duuu.ch: could not connect to host
@@ -4399,18 +4870,22 @@ dwellstudio.com: did not receive HSTS header
dwhd.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
dwnld.me: could not connect to host
dycem-ns.com: did not receive HSTS header
+dycoa.com: could not connect to host
dycontrol.de: could not connect to host
+dylancl.cf: could not connect to host
dylanscott.com.au: did not receive HSTS header
dynamic-innovations.net: could not connect to host
dynamic-networks.be: could not connect to host
dynamize.solutions: did not receive HSTS header
dyncdn.me: could not connect to host
dynts.pro: could not connect to host
-dyz.pw: did not receive HSTS header
+dyz.pw: could not connect to host
dziekonski.com: could not connect to host
dzimejl.sk: did not receive HSTS header
dzlibs.io: could not connect to host
-dzsibi.com: could not connect to host
+dzndk.com: could not connect to host
+dzndk.net: could not connect to host
+dzndk.org: could not connect to host
dzytdl.com: did not receive HSTS header
e-aut.net: could not connect to host
e-baraxolka.ru: could not connect to host
@@ -4419,21 +4894,28 @@ e-isfa.eu: did not receive HSTS header
e-mak.eu: could not connect to host
e-migration.ch: could not connect to host
e-newshub.com: could not connect to host
+e-planetelec.fr: did not receive HSTS header
e-pokupki.eu: did not receive HSTS header
e-rickroll-r.pw: could not connect to host
e-sa.com: did not receive HSTS header
-e-speak24.pl: could not connect to host
e-vau.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
e-vo-linka.cz: did not receive HSTS header
e-wishlist.net: could not connect to host
-e024.org: did not receive HSTS header
+e024.org: could not connect to host
+e1488.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
e191.com: could not connect to host
e30gruppe.com: did not receive HSTS header
e3amn2l.com: could not connect to host
e3kids.com: did not receive HSTS header
e3q.de: could not connect to host
-e505.net: did not receive HSTS header
+e505.net: could not connect to host
e51888.com: did not receive HSTS header
+e52888.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+e52888.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+e53888.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+e53888.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+e59888.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+e59888.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
eagle-aluminum.com: did not receive HSTS header
eagle-yard.de: could not connect to host
eagleridgecampground.com: could not connect to host
@@ -4444,16 +4926,23 @@ earlybirdsnacks.com: could not connect to host
earth-people.org: could not connect to host
earthrise16.com: could not connect to host
easew.com: could not connect to host
+east-line.su: could not connect to host
+eastcoastbubbleandbounce.co.uk: could not connect to host
eastcoastinflatables.co.uk: did not receive HSTS header
easthokkaido-5airport.jp: did not receive HSTS header
eastmidlandsstargazers.org.uk: did not receive HSTS header
eastmontgroup.com: did not receive HSTS header
+eastpeoria-il.gov: could not connect to host
easy-factures.fr: could not connect to host
easychiller.org: could not connect to host
easykonto.de: could not connect to host
+easykraamzorg.nl: did not receive HSTS header
easyplane.it: did not receive HSTS header
+easypv.ch: could not connect to host
easyreal.ru: could not connect to host
+easyschools.org: did not receive HSTS header
easysimplecrm.com: could not connect to host
+eat-mine.ml: could not connect to host
eat-the-world.ch: could not connect to host
eat4happiness.com: did not receive HSTS header
eatfitoutlet.com.br: could not connect to host
@@ -4471,19 +4960,19 @@ ebolsas.com.br: did not receive HSTS header
ebooksgratuits.org: could not connect to host
ebop.ch: could not connect to host
ebp2p.com: did not receive HSTS header
-ebpglobal.com: could not connect to host
ebraph.com: could not connect to host
ebrowz.com: could not connect to host
ecake.in: could not connect to host
ecc-kaufbeuren.de: could not connect to host
eccux.com: could not connect to host
+ecelembrou.ovh: could not connect to host
ecfs.link: could not connect to host
ecg.fr: could not connect to host
echipstore.com: did not receive HSTS header
echo.cc: could not connect to host
-echoactive.com: could not connect to host
+echoactive.com: max-age too low: 7776000
echomanchester.net: could not connect to host
-eckro.com: did not receive HSTS header
+eckro.com: could not connect to host
ecole-en-danger.fr: could not connect to host
ecole-iaf.fr: could not connect to host
ecole-maternelle-saint-joseph.be: could not connect to host
@@ -4492,7 +4981,7 @@ ecology-21.ru: did not receive HSTS header
ecomlane.com: could not connect to host
ecomparemo.com: did not receive HSTS header
econativa.pt: could not connect to host
-economy.st: could not connect to host
+economy.st: did not receive HSTS header
economycarrentalscyprus.com: could not connect to host
ecorus.eu: did not receive HSTS header
ecosoftconsult.com: could not connect to host
@@ -4517,12 +5006,13 @@ edh.email: did not receive HSTS header
edhrealtor.com: did not receive HSTS header
edilservizi.it: did not receive HSTS header
edilservizivco.it: did not receive HSTS header
-edisonchee.com: could not connect to host
+edisonchee.com: did not receive HSTS header
edissecurity.sk: did not receive HSTS header
edition-pommern.com: did not receive HSTS header
editoraacademiacrista.com.br: could not connect to host
edix.ru: could not connect to host
edk.com.tr: did not receive HSTS header
+edpubs.gov: could not connect to host
edsh.de: did not receive HSTS header
eduardnikolenko.com: could not connect to host
eduardnikolenko.ru: could not connect to host
@@ -4533,9 +5023,8 @@ educatoys.com.br: could not connect to host
educatweb.de: did not receive HSTS header
educnum.fr: did not receive HSTS header
educourse.ga: could not connect to host
-eduif.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+eduif.nl: could not connect to host
eduvance.in: did not receive HSTS header
-edvgarbe.de: could not connect to host
ee-terminals.com: could not connect to host
eeb98.com: could not connect to host
eeetrust.org: could not connect to host
@@ -4552,20 +5041,21 @@ eengezinswoning-in-zuid-holland-kopen.nl: could not connect to host
eengezinswoning-in-zuidplas-kopen.nl: could not connect to host
eengezinswoning-in-zwartewaterland-kopen.nl: could not connect to host
eengezinswoningverkopen.nl: could not connect to host
+eengoedenotaris.nl: did not receive HSTS header
eenhoorn.ga: could not connect to host
eeqj.com: did not receive HSTS header
eesistumine2017.ee: could not connect to host
eez.ee: could not connect to host
+ef-georgia.org: could not connect to host
effectiveosgi.com: could not connect to host
effectivepapers.com: could not connect to host
-efficienthealth.com: did not receive HSTS header
-effortlesshr.com: did not receive HSTS header
-efinity.io: did not receive HSTS header
-eftcorp.biz: max-age too low: 0
-egarden.it: did not receive HSTS header
+efficienthealth.com: could not connect to host
+eftcorp.biz: did not receive HSTS header
+egbert.net: could not connect to host
egfl.org.uk: did not receive HSTS header
egge.com: max-age too low: 0
egit.co: could not connect to host
+eglek.com: did not receive HSTS header
ego-world.org: did not receive HSTS header
egupova.ru: did not receive HSTS header
ehealthcounselor.com: could not connect to host
@@ -4580,7 +5070,7 @@ eifelindex.de: did not receive HSTS header
eiga-movie.com: max-age too low: 0
eigenbubi.de: could not connect to host
eightyfour.ca: could not connect to host
-eigo.work: could not connect to host
+eigo.work: did not receive HSTS header
eimanavicius.lt: did not receive HSTS header
einar.io: max-age too low: 86400
einfachmaldiefressehalten.de: could not connect to host
@@ -4590,7 +5080,8 @@ einsatzstiefel.info: could not connect to host
einsit.com: could not connect to host
einsitapis.com: could not connect to host
ejgconsultancy.co.uk: did not receive HSTS header
-ejusu.com: did not receive HSTS header
+ejuicelab.co.uk: did not receive HSTS header
+ejusu.com: could not connect to host
ek.network: could not connect to host
ekbanden.nl: could not connect to host
ekobudisantoso.net: could not connect to host
@@ -4602,11 +5093,14 @@ elan-organics.com: did not receive HSTS header
elanguest.pl: could not connect to host
elanguest.ro: could not connect to host
elanguest.ru: could not connect to host
-elastic7.uk: could not connect to host
+elarvee.xyz: could not connect to host
elaxy-online.de: could not connect to host
elbaal.gov: did not receive HSTS header
elblein.de: did not receive HSTS header
-elderoost.com: could not connect to host
+elbohlyart.com: did not receive HSTS header
+eldietista.es: could not connect to host
+eldisagjapi.com: could not connect to host
+eldisagjapi.de: could not connect to host
elearningpilot.com: did not receive HSTS header
electicofficial.com: did not receive HSTS header
electricalcontrolpanels.co.uk: could not connect to host
@@ -4625,22 +5119,21 @@ elementalict.com: did not receive HSTS header
elementalrobotics.com: could not connect to host
elemenx.com: did not receive HSTS header
elemprendedor.com.ve: could not connect to host
-elena-baykova.ru: did not receive HSTS header
elenag.ga: could not connect to host
elenagherta.ga: could not connect to host
elenoon.ir: max-age too low: 1
elenorsmadness.org: could not connect to host
eleonorengland.com: did not receive HSTS header
+elestanteliterario.com: did not receive HSTS header
eletesstilus.hu: could not connect to host
elevateandprosper.com: could not connect to host
-elexel.ru: could not connect to host
elgacien.de: could not connect to host
elguillatun.cl: did not receive HSTS header
elhall.pro: did not receive HSTS header
elhall.ru: did not receive HSTS header
-elias-nicolas.com: could not connect to host
eliasojala.me: did not receive HSTS header
elimdengelen.com: did not receive HSTS header
+eline168.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
eliott.be: could not connect to host
elistor6100.xyz: did not receive HSTS header
elite-box.com: did not receive HSTS header
@@ -4660,9 +5153,7 @@ elnutricionista.es: could not connect to host
elo.fyi: could not connect to host
elohna.ch: did not receive HSTS header
elonbase.com: could not connect to host
-elpado.de: could not connect to host
elpay.kz: did not receive HSTS header
-elpo.net: did not receive HSTS header
elpo.xyz: could not connect to host
elsamakhin.com: could not connect to host
elsemanario.com: did not receive HSTS header
@@ -4671,7 +5162,8 @@ elsitar.com: could not connect to host
elsword.moe: could not connect to host
eltransportquevolem.org: could not connect to host
eltrox.me: could not connect to host
-elyisus.info: could not connect to host
+eluft.de: could not connect to host
+elyisus.info: did not receive HSTS header
elytronsecurity.com: did not receive HSTS header
email.lookout.com: could not connect to host
email2rss.net: could not connect to host
@@ -4683,7 +5175,7 @@ embellir-aroma.com: could not connect to host
embellir-kyujin.com: could not connect to host
embracethedarkness.co.uk: could not connect to host
embroidered-stuff.com: could not connect to host
-embudospro.net: did not receive HSTS header
+embudospro.net: could not connect to host
emeldi-commerce.com: max-age too low: 0
emergencyessay.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
emergencymedicinefoundations.com: did not receive HSTS header
@@ -4691,16 +5183,16 @@ emergentvisiontec.com: did not receive HSTS header
emesolutions.net: did not receive HSTS header
emiele.com.br: could not connect to host
emilyhorsman.com: could not connect to host
-emilyjohnson.ga: could not connect to host
emilyshepherd.me: did not receive HSTS header
eminhuseynov.com: could not connect to host
eminovic.me: could not connect to host
emjainteractive.com: did not receive HSTS header
emjimadhu.com: could not connect to host
+emma-o.com: could not connect to host
emmable.com: could not connect to host
emmaliddell.com: did not receive HSTS header
emmanuelle-et-julien.ch: could not connect to host
-emmdy.com: did not receive HSTS header
+emmdy.com: could not connect to host
emmehair.com: could not connect to host
emnitech.com: could not connect to host
emojiengine.com: did not receive HSTS header
@@ -4715,6 +5207,7 @@ employeestore.org: did not receive HSTS header
emporiovinareal.com.br: could not connect to host
empty-r.com: could not connect to host
emptypath.com: did not receive HSTS header
+emupedia.net: did not receive HSTS header
emyself.info: could not connect to host
emyself.org: did not receive HSTS header
en4u.org: could not connect to host
@@ -4733,6 +5226,7 @@ endangeredwatch.com: could not connect to host
endlessdark.net: max-age too low: 600
endlesshorizon.net: could not connect to host
endlesstone.com: did not receive HSTS header
+endofinternet.goip.de: could not connect to host
endofnet.org: could not connect to host
endohaus.ca: could not connect to host
endohaus.com: could not connect to host
@@ -4741,10 +5235,12 @@ endohaus.us: could not connect to host
endspamwith.us: could not connect to host
enecoshop.nl: did not receive HSTS header
enefan.jp: could not connect to host
+enelacto.com: did not receive HSTS header
energethik-tulln.at: did not receive HSTS header
enersaveapp.org: could not connect to host
enersec.co.uk: could not connect to host
enfoqueseguro.com: did not receive HSTS header
+engineowning.com: did not receive HSTS header
enginx.cn: could not connect to host
englerts.de: did not receive HSTS header
englishclub.com: did not receive HSTS header
@@ -4755,23 +5251,25 @@ enigmail.net: did not receive HSTS header
enjen.net: did not receive HSTS header
enjoymayfield.com: max-age too low: 0
enjoystudio.ro: did not receive HSTS header
-enlatte.com: did not receive HSTS header
-enlightened.si: could not connect to host
+enlatte.com: could not connect to host
+enlazaresbueno.cl: did not receive HSTS header
+enlightened.si: did not receive HSTS header
enoou.com: could not connect to host
enpalmademallorca.info: could not connect to host
ensemble-vos-idees.fr: could not connect to host
enskat.de: could not connect to host
enskatson-sippe.de: could not connect to host
+ensured.com: could not connect to host
+ensured.nl: could not connect to host
entaurus.com: could not connect to host
enteente.club: could not connect to host
enteente.com: could not connect to host
enteente.space: could not connect to host
enteente.xyz: could not connect to host
enterdev.co: did not receive HSTS header
-enterprisecarclub.co.uk: did not receive HSTS header
+enterprisecarclub.co.uk: could not connect to host
enterprisechannel.asia: did not receive HSTS header
enterprivacy.com: did not receive HSTS header
-entersynapse.com: could not connect to host
entheorie.net: did not receive HSTS header
entourneebeetle.com: could not connect to host
entrepreneur.or.id: could not connect to host
@@ -4780,15 +5278,20 @@ enumify.com: could not connect to host
envelope.co.nz: could not connect to host
enviam.de: did not receive HSTS header
enviapresentes.com.br: could not connect to host
-enviatufoto.com: max-age too low: 604800
environment.ai: could not connect to host
+envoutement-desenvoutement.com: did not receive HSTS header
envoyglobal.com: did not receive HSTS header
envoyworld.com: did not receive HSTS header
envygeeks.com: could not connect to host
+envygeeks.io: did not receive HSTS header
eol34.com: could not connect to host
eoldb.org: could not connect to host
eolme.ml: could not connect to host
+eonet.cc: did not receive HSTS header
+eos-classic.io: could not connect to host
+eosol.zone: could not connect to host
epanurse.com: could not connect to host
+epave.paris: could not connect to host
epaygateway.net: could not connect to host
ephe.be: could not connect to host
ephry.com: could not connect to host
@@ -4796,29 +5299,36 @@ epicmc.games: could not connect to host
epitesz.co: did not receive HSTS header
eposcloud.net: could not connect to host
eposmidlands.co.uk: could not connect to host
-eposnewport.co.uk: could not connect to host
+eposnewport.co.uk: did not receive HSTS header
eposnottingham.co.uk: could not connect to host
eposreading.co.uk: could not connect to host
eposreview.co.uk: could not connect to host
-epossurrey.co.uk: could not connect to host
+epossurrey.co.uk: did not receive HSTS header
epossussex.co.uk: could not connect to host
eposwales.co.uk: could not connect to host
epoxate.com: could not connect to host
eprofitacademy.com: did not receive HSTS header
+epulsar.ru: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+epvin.com: could not connect to host
eq8.net.au: could not connect to host
eqib.nl: did not receive HSTS header
eqim.me: could not connect to host
eqorg.com: could not connect to host
+equallyy.com: could not connect to host
equalparts.eu: could not connect to host
equate.net.au: did not receive HSTS header
equatetechnologies.com.au: did not receive HSTS header
equilibre-yoga-jennifer-will.com: could not connect to host
+equilime.com: did not receive HSTS header
equippers.de: did not receive HSTS header
+equipsupply.com: did not receive HSTS header
equitee.co: did not receive HSTS header
equityflows.com: did not receive HSTS header
er-music.com: could not connect to host
erad.fr: could not connect to host
+erawanarifnugroho.com: did not receive HSTS header
erclab.kr: could not connect to host
+erecciontotalal100.com: could not connect to host
erepublik-deutschland.de: did not receive HSTS header
eressea.xyz: could not connect to host
ericbond.net: could not connect to host
@@ -4830,58 +5340,142 @@ eriel.com.br: could not connect to host
erikwagner.de: did not receive HSTS header
erinlin.com: did not receive HSTS header
eriser.fr: did not receive HSTS header
+ernaehrungsberatung-rapperswil.ch: did not receive HSTS header
ernaehrungsberatung-zurich.ch: could not connect to host
ernesto.at: could not connect to host
eroimatome.com: could not connect to host
eromixx.com: could not connect to host
+eromon.net: could not connect to host
erotalia.es: could not connect to host
erotic4me.ch: did not receive HSTS header
+eroticforce.com: could not connect to host
erotische-aanbiedingen.nl: could not connect to host
erotpo.cz: could not connect to host
+erpiv.com: could not connect to host
errolz.com: did not receive HSTS header
errors.zenpayroll.com: could not connect to host
erspro.net: could not connect to host
-eru.me: did not receive HSTS header
-erverydown.ml: could not connect to host
+ervaarjapan.nl: did not receive HSTS header
erwinvanlonden.net: did not receive HSTS header
+es888.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
es8888.net: could not connect to host
es888999.com: could not connect to host
+es999.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+es9999.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esafar.cz: did not receive HSTS header
+esb-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb-top.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb-top.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb111.com: could not connect to host
esb111.net: could not connect to host
esb112.com: could not connect to host
esb112.net: could not connect to host
+esb116.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb1314.net: could not connect to host
esb1668.com: could not connect to host
+esb168168.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb168168.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb168168.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb168168.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1688.biz: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1688.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1688.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1688.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1688.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb16888.com: could not connect to host
+esb1711.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1711.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1788.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1788.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1788.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb1788.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb17888.com: could not connect to host
+esb2013.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb2013.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb2099.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb2099.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb222.net: could not connect to host
+esb258.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb325.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb325.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb333.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb336.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb369.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb433.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb518.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb553.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb555.biz: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb555.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb555.com: could not connect to host
esb556.com: could not connect to host
+esb5889.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb5889.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb6.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb666.com: could not connect to host
esb666.net: could not connect to host
esb66666.com: could not connect to host
+esb677.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb688.com: could not connect to host
esb68888.com: could not connect to host
+esb775.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb777.biz: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb777.cc: could not connect to host
esb777.com: could not connect to host
+esb777.me: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb777.net: could not connect to host
+esb777.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb777.us: could not connect to host
-esb9588.info: did not receive HSTS header
+esb886.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb888.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb8886.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb9527.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb9588.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb9588.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esb9588.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb999.biz: could not connect to host
esb999.com: could not connect to host
esb999.info: could not connect to host
+esb999.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esb999.us: could not connect to host
esba11.cc: could not connect to host
+esba11.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esba11.in: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esba11.net: could not connect to host
esba11.us: could not connect to host
+esball-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball.bz: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esball.in: could not connect to host
+esball.me: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball.mx: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball.online: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball.tv: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball.win: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball.ws: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball518.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball518.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball518.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esball518.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esball888.com: could not connect to host
esball888.net: could not connect to host
+esballs.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbbon.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbbon.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbfun.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbfun.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbgood.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbin.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbjon.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbjon.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbm4.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esbm5.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esbuilders.co.nz: did not receive HSTS header
escalate.eu: could not connect to host
escapees.com: did not receive HSTS header
escolaengenharia.com.br: did not receive HSTS header
+escort-byuro.net: could not connect to host
escort-fashion.com: could not connect to host
escortdisplay.com: could not connect to host
escortshotsexy.com: could not connect to host
@@ -4891,15 +5485,18 @@ esec.rs: did not receive HSTS header
eseth.de: did not receive HSTS header
esh.ink: could not connect to host
eshepperd.com: did not receive HSTS header
+eshobe.com: did not receive HSTS header
eshtapay.com: could not connect to host
esko.bar: could not connect to host
esln.org: did not receive HSTS header
+esmoney.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+esmoney.me: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
esn-ypci.com: did not receive HSTS header
esocweb.com: could not connect to host
esoterik.link: could not connect to host
esp-berlin.de: could not connect to host
esp-desarrolladores.com: could not connect to host
-esp.community: did not receive HSTS header
+esp.community: could not connect to host
esp8285.store: could not connect to host
espacemontmorency.com: did not receive HSTS header
especificosba.com.mx: could not connect to host
@@ -4914,6 +5511,7 @@ essayhave.com: could not connect to host
essaylib.com: could not connect to host
essayscam.org: could not connect to host
essayshark.com: could not connect to host
+essaywebsite.com: did not receive HSTS header
essenceofvitalitydetox.com: could not connect to host
essential12.com: could not connect to host
essentialoilsimports.com: could not connect to host
@@ -4927,9 +5525,11 @@ estaleiro.org: could not connect to host
estan.cn: could not connect to host
estebanborges.com: did not receive HSTS header
estespr.com: did not receive HSTS header
+estetistarimini.it: did not receive HSTS header
estilosapeca.com: could not connect to host
estland.guide: could not connect to host
estoqueinformatica.com.br: could not connect to host
+estudio21pattern.com: could not connect to host
estudioamazonico.com: could not connect to host
et-buchholz.de: could not connect to host
et180.com: could not connect to host
@@ -4941,7 +5541,7 @@ etenendrinken.nu: could not connect to host
eternalsymbols.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
eternitylove.us: could not connect to host
eth9.net: could not connect to host
-etha.nz: max-age too low: 0
+ethandelany.me: did not receive HSTS header
ethanfaust.com: did not receive HSTS header
ethanlew.is: could not connect to host
ethantskinner.com: did not receive HSTS header
@@ -4953,6 +5553,7 @@ ethicaltek.com: could not connect to host
ethil-faer.fr: could not connect to host
ethiobaba.com: could not connect to host
etidni.help: did not receive HSTS header
+etincelle.ml: could not connect to host
etk2000.com: did not receive HSTS header
etmirror.top: could not connect to host
etmirror.xyz: could not connect to host
@@ -4961,11 +5562,11 @@ etproxy.tech: could not connect to host
ets2mp.de: did not receive HSTS header
etsysecure.com: could not connect to host
ettebiz.com: max-age too low: 0
-etula.ga: could not connect to host
+etula.ga: did not receive HSTS header
etula.me: could not connect to host
etys.no: did not receive HSTS header
euanbaines.com: did not receive HSTS header
-euanbarrett.com: could not connect to host
+eucl3d.com: did not receive HSTS header
euclideanpostulates.xyz: could not connect to host
eucollegetours.com: could not connect to host
euexia.fr: could not connect to host
@@ -4981,29 +5582,43 @@ eupresidency2018.com: could not connect to host
euren.se: could not connect to host
eurocamping.se: could not connect to host
euroescortguide.com: could not connect to host
+europapier.at: did not receive HSTS header
+europapier.ba: did not receive HSTS header
+europapier.bg: did not receive HSTS header
+europapier.com: did not receive HSTS header
+europapier.cz: did not receive HSTS header
+europapier.hr: did not receive HSTS header
+europapier.rs: did not receive HSTS header
+europapier.si: did not receive HSTS header
+europapier.ua: did not receive HSTS header
+europeanpreppers.com: could not connect to host
+euroservice.com.gr: did not receive HSTS header
euroshop24.net: could not connect to host
-euroskano.nl: did not receive HSTS header
eurospecautowerks.com: did not receive HSTS header
eurostrategy.vn.ua: could not connect to host
-euteamo.cn: did not receive HSTS header
+euvo.tk: could not connect to host
+evades.io: did not receive HSTS header
evanhandgraaf.nl: did not receive HSTS header
evankurniawan.com: did not receive HSTS header
+evanreev.es: could not connect to host
+evansville-wy.gov: could not connect to host
+evantage.org: could not connect to host
evasion-energie.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-evasioncreole.com: could not connect to host
evdenevenakliyatankara.pw: could not connect to host
evecalm.com: did not receive HSTS header
evedanjailbreak.com: could not connect to host
evegalaxy.net: could not connect to host
+evemodx.com: did not receive HSTS header
evenstar-gaming.com: could not connect to host
event64.ru: did not receive HSTS header
eventmake.es: could not connect to host
eventplace.me: did not receive HSTS header
events12.com: did not receive HSTS header
-eventsafrica.net: could not connect to host
+eventsafrica.net: did not receive HSTS header
+everitoken.io: did not receive HSTS header
everyarti.st: could not connect to host
everybooks.com: could not connect to host
everydaytherich.com: max-age too low: 7776000
-everydaywot.com: could not connect to host
everygayporn.xyz: could not connect to host
everylab.org: could not connect to host
everymove.org: could not connect to host
@@ -5012,9 +5627,12 @@ everytruckjob.com: did not receive HSTS header
eveseat.net: could not connect to host
eveshaiwu.com: could not connect to host
evi.be: did not receive HSTS header
+evilbeasts.ru: could not connect to host
evileden.com: could not connect to host
evilnerd.de: did not receive HSTS header
+evilness.nl: could not connect to host
evilsay.com: could not connect to host
+evilvolcanolairs.com: did not receive HSTS header
evin.ml: could not connect to host
evio.com: did not receive HSTS header
evites.me: could not connect to host
@@ -5026,57 +5644,64 @@ evowl.com: could not connect to host
ewallet-optimizer.com: did not receive HSTS header
ewex.org: could not connect to host
eworksmedia.com: could not connect to host
+ewuchuan.com: could not connect to host
exampleessays.com: could not connect to host
excelgum.ca: did not receive HSTS header
exceptionalbits.com: could not connect to host
exceptionalservices.us: could not connect to host
-excessamerica.com: could not connect to host
exchangecoordinator.com: could not connect to host
exchangeworks.co: did not receive HSTS header
+exebouncycastles.co.uk: did not receive HSTS header
exembit.com: did not receive HSTS header
exfiles.cz: did not receive HSTS header
exgaywatch.com: could not connect to host
exgravitus.com: could not connect to host
exno.co: could not connect to host
-exo.do: could not connect to host
+exo.do: max-age too low: 0
+exocen.com: could not connect to host
exoticads.com: could not connect to host
exousiakaidunamis.xyz: could not connect to host
+expancio.com: max-age too low: 0
expanddigital.media: did not receive HSTS header
expatads.com: could not connect to host
expatriate.pl: did not receive HSTS header
expecting.com.br: could not connect to host
experticon.com: did not receive HSTS header
expertmile.com: did not receive HSTS header
+experts-en-gestion.fr: did not receive HSTS header
explodingcamera.com: did not receive HSTS header
exploit-db.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-exploit.cz: did not receive HSTS header
expo-designers.com: did not receive HSTS header
expokohler.com: could not connect to host
expoort.com.br: could not connect to host
+exporo.de: did not receive HSTS header
expoundite.net: did not receive HSTS header
expowerhps.com: did not receive HSTS header
-expressemotion.net: could not connect to host
expressfinance.co.za: did not receive HSTS header
+extendwings.com: could not connect to host
exteriorservices.io: could not connect to host
-extramoney.cash: did not receive HSTS header
+extramoney.cash: could not connect to host
extrathemeshowcase.net: could not connect to host
extratorrent.cool: did not receive HSTS header
-extratorrent.fyi: could not connect to host
-extratorrent.red: could not connect to host
+extratorrent.fyi: max-age too low: 0
+extratorrent.red: max-age too low: 0
extratorrent.world: could not connect to host
extratorrentlive.xyz: could not connect to host
extratorrents.tech: could not connect to host
extreemhost.nl: did not receive HSTS header
extremenetworking.net: could not connect to host
+extremeservicesandrestoration.com: could not connect to host
exy.pw: could not connect to host
+eyasc.nl: did not receive HSTS header
eyedarts.com: did not receive HSTS header
eyeglassuniverse.com: did not receive HSTS header
eyenote.gov: did not receive HSTS header
eyes-of-universe.eu: did not receive HSTS header
eyesoccer-didikh.rhcloud.com: could not connect to host
+eyesonly.cc: did not receive HSTS header
eytosh.net: could not connect to host
ez.fi: could not connect to host
-ezgamble.com: could not connect to host
+ezgamble.com: did not receive HSTS header
ezimoeko.net: could not connect to host
ezmod.org: could not connect to host
eznfe.com: could not connect to host
@@ -5087,21 +5712,22 @@ f-rickroll-g.pw: could not connect to host
f-s-u.co.uk: could not connect to host
f00.ca: did not receive HSTS header
f1bigpicture.com: could not connect to host
-f2e.io: did not receive HSTS header
f2f.cash: could not connect to host
f42.net: could not connect to host
f5movies.top: could not connect to host
f8842.com: could not connect to host
-f9digital.com: did not receive HSTS header
+f9digital.com: max-age too low: 2592000
faber.io: could not connect to host
faberusa.com: did not receive HSTS header
fabhub.io: could not connect to host
fabian-kluge.de: could not connect to host
+fabianasantiago.com: could not connect to host
fabianfischer.de: did not receive HSTS header
-fabianmunoz.com: did not receive HSTS header
+fabianmunoz.com: could not connect to host
fabienbaker.com: could not connect to host
fabled.com: did not receive HSTS header
fabriko.fr: did not receive HSTS header
+fabriziorocca.com: could not connect to host
fabrysociety.org: could not connect to host
fabulouslyyouthfulskin.com: could not connect to host
fabulouslyyouthfulskineyeserum.com: could not connect to host
@@ -5109,10 +5735,10 @@ facebattle.com: could not connect to host
facebook.ax: could not connect to host
facebooktsukaikata.net: did not receive HSTS header
facepalmsecurity.com: could not connect to host
+facepunch.org: could not connect to host
facesnf.com: could not connect to host
fachschaft-informatik.de: did not receive HSTS header
facilitrak.com: could not connect to host
-factor.cc: did not receive HSTS header
factorable.net: did not receive HSTS header
factorygw.com: did not receive HSTS header
factorypartsdirect.com: could not connect to host
@@ -5124,7 +5750,6 @@ faesser.com: did not receive HSTS header
fafatiger.com: could not connect to host
fag.wtf: could not connect to host
fahmed.de: did not receive HSTS header
-faidanoi.it: did not receive HSTS header
failproof.be: max-age too low: 604800
faircom.co.za: did not receive HSTS header
fairkey.dk: did not receive HSTS header
@@ -5133,19 +5758,17 @@ faisalshuvo.com: did not receive HSTS header
faizan.net: did not receive HSTS header
faizan.xyz: did not receive HSTS header
fakeletters.org: could not connect to host
-fakerli.com: could not connect to host
faktura.pl: did not receive HSTS header
falcibiosystems.org: did not receive HSTS header
falconwiz.com: did not receive HSTS header
falkp.no: did not receive HSTS header
falkus.net: could not connect to host
-falldennismarketing.com: max-age too low: 2592000
fallenangeldrinks.eu: could not connect to host
fallenangelspirits.uk: could not connect to host
fallingapart.de: could not connect to host
-fallofthecitadel.com: did not receive HSTS header
false.in.net: could not connect to host
faluninfo.ba: did not receive HSTS header
+famdouma.nl: could not connect to host
fame-agency.net: could not connect to host
famep.gov: could not connect to host
famer.me: could not connect to host
@@ -5153,16 +5776,18 @@ fameuxhosting.co.uk: could not connect to host
familie-sander.rocks: max-age too low: 600
familie-sprink.de: could not connect to host
familie-zimmermann.at: could not connect to host
+familletouret.fr: did not receive HSTS header
famio.cn: did not receive HSTS header
-fan.gov: could not connect to host
-fancy-bridge.com: could not connect to host
fanflow.com: did not receive HSTS header
+fanhouwan.com: did not receive HSTS header
fansmade.art: could not connect to host
fant.dk: did not receive HSTS header
fantasticgardenersmelbourne.com.au: did not receive HSTS header
fantasticpestcontrolmelbourne.com.au: did not receive HSTS header
fantasyfootballpundit.com: did not receive HSTS header
+fantasyprojections.com: could not connect to host
fanyl.cn: could not connect to host
+fap.no: could not connect to host
faq.lookout.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
faraonplay5.com: could not connect to host
faraonplay7.com: could not connect to host
@@ -5175,7 +5800,7 @@ farm24.co.uk: could not connect to host
farmacia.pt: did not receive HSTS header
farmaciaformula.com.br: could not connect to host
farmaciamedicom.com.br: could not connect to host
-fascia.fit: could not connect to host
+farmmaximizer.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
fashion.net: did not receive HSTS header
fashioncare.cz: did not receive HSTS header
fashiondays.bg: max-age too low: 0
@@ -5189,8 +5814,10 @@ fastbackmbg.be: could not connect to host
fastbackmbm.be: could not connect to host
fastcomcorp.com: did not receive HSTS header
fastcomcorp.net: did not receive HSTS header
+fastconfirm.com: could not connect to host
fastograph.com: could not connect to host
fastopen.ml: could not connect to host
+fastwebsites.com.br: did not receive HSTS header
fastworx.com: did not receive HSTS header
fatdoge.cn: did not receive HSTS header
fatgeekflix.net: could not connect to host
@@ -5198,18 +5825,22 @@ fatherhood.gov: did not receive HSTS header
fatlossguide.xyz: could not connect to host
fator25.com.br: could not connect to host
fatox.de: could not connect to host
+fattorino.it: did not receive HSTS header
fatwin.pw: could not connect to host
fatzebra.com.au: max-age too low: 0
favorit.club: did not receive HSTS header
fawkex.me: could not connect to host
faxreader.net: could not connect to host
fayolle.info: did not receive HSTS header
-fbi.pw: did not receive HSTS header
+fbf.gov: did not receive HSTS header
+fbi.pw: could not connect to host
fbook.top: could not connect to host
fbox.li: could not connect to host
fcapartsdb.com: could not connect to host
+fcitasc.com: could not connect to host
fcp.cn: could not connect to host
fdj.im: could not connect to host
+fdm.ro: did not receive HSTS header
fdt.name: did not receive HSTS header
feard.space: could not connect to host
fed51.com: could not connect to host
@@ -5221,7 +5852,7 @@ fedo.moe: could not connect to host
feedstringer.com: could not connect to host
feedthebot.com: did not receive HSTS header
feegg.com.br: could not connect to host
-feelgood-workouts.de: did not receive HSTS header
+feeriedesign-event.com: could not connect to host
fefore.com: did not receive HSTS header
fegans.org.uk: did not receive HSTS header
feirlane.org: could not connect to host
@@ -5229,12 +5860,16 @@ feisbed.com: could not connect to host
feist.io: could not connect to host
feitobrasilcosmeticos.com.br: did not receive HSTS header
felger-times.fr: could not connect to host
+felgitscher.xyz: max-age too low: 2592000
feliwyn.fr: did not receive HSTS header
felixhefner.de: did not receive HSTS header
+felixqu.com: did not receive HSTS header
felixrr.pro: could not connect to host
femaledom.xyz: could not connect to host
femdombbw.com: could not connect to host
feminists.co: could not connect to host
+feng-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+feng-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
fengyadi.com: could not connect to host
fenixhost.com.br: could not connect to host
fenno.net: could not connect to host
@@ -5259,15 +5894,15 @@ fetlife.com: could not connect to host
fettbrot.tk: did not receive HSTS header
feudaltactics.com: could not connect to host
feuerwehr-dachaufsetzer.de: could not connect to host
-fexmen.com: could not connect to host
+fexmen.com: did not receive HSTS header
ff-bg.xyz: could not connect to host
ffh.me: could not connect to host
ffl123.com: did not receive HSTS header
fgequipamentos.com.br: did not receive HSTS header
fhsseniormens.club: could not connect to host
-fi-sanki.co.jp: did not receive HSTS header
+fi-sanki.co.jp: could not connect to host
fibrasynormasdecolombia.com: did not receive HSTS header
-ficklenote.net: could not connect to host
+ficklenote.net: did not receive HSTS header
fics-twosigma.com: could not connect to host
fid.to: could not connect to host
fidel.uk: did not receive HSTS header
@@ -5275,20 +5910,21 @@ fideleslaici.com: did not receive HSTS header
fieldclockapp.com: did not receive HSTS header
fieldtalk.co.uk: could not connect to host
fiendishmasterplan.com: did not receive HSTS header
-fierman.eu: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-fierman.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-fierman.us: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+fierman.eu: could not connect to host
+fierman.net: could not connect to host
+fierman.us: could not connect to host
fiftyshadesofluca.ml: could not connect to host
fig.co: did not receive HSTS header
fig.ms: could not connect to host
fightr.co: could not connect to host
figura.cz: did not receive HSTS header
figura.im: did not receive HSTS header
-figuurzagers.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+figuurzagers.nl: could not connect to host
fiksel.info: could not connect to host
fikt.space: could not connect to host
+filamentia.nl: could not connect to host
filebox.moe: could not connect to host
-filebox.space: did not receive HSTS header
+filebox.space: could not connect to host
filedir.com: did not receive HSTS header
fileio.io: could not connect to host
fileon.com: could not connect to host
@@ -5305,11 +5941,13 @@ filme-online.eu.com: did not receive HSTS header
filmesubtitrate2017.online: could not connect to host
filo.xyz: did not receive HSTS header
filoitoupediou.gr: did not receive HSTS header
+filterflasche-kaufen.de: did not receive HSTS header
finalgear.com: could not connect to host
finalvpn.com: did not receive HSTS header
financier.io: did not receive HSTS header
financieringsportaal.nl: did not receive HSTS header
finanzkontor.net: could not connect to host
+find-your-happy-place.de: did not receive HSTS header
findigo.fish: could not connect to host
findingmyname.com: max-age too low: 2629746
findmybottleshop.com.au: could not connect to host
@@ -5321,10 +5959,9 @@ finewineonline.com: could not connect to host
fingent.com: did not receive HSTS header
fingerscrossed.style: could not connect to host
finiteheap.com: did not receive HSTS header
-finn.io: did not receive HSTS header
+finkenberger.org: did not receive HSTS header
finstererlebnis.de: could not connect to host
finsterlebnis.de: did not receive HSTS header
-fintandunleavy.com: could not connect to host
fiodental.com.br: did not receive HSTS header
fiork.com: did not receive HSTS header
fire-wolf.com: could not connect to host
@@ -5333,31 +5970,30 @@ firebaseio-demo.com: could not connect to host
firebaseio.com: could not connect to host (error ignored - included regardless)
firebird.io: did not receive HSTS header
firefall.rocks: could not connect to host
-firehost.com: could not connect to host
+firehost.com: did not receive HSTS header
fireinthedeep.com: could not connect to host
firemail.io: could not connect to host
-firenza.org: did not receive HSTS header
fireorbit.de: did not receive HSTS header
firepeak.ru: could not connect to host
fireworkcoaching.com: did not receive HSTS header
firexarxa.de: could not connect to host
-firmament.space: could not connect to host
-firmenverzeichnis.nu: did not receive HSTS header
+firmale.com: could not connect to host
+firmenverzeichnis.nu: could not connect to host
first-time-offender.com: could not connect to host
firstchoicepool.com: did not receive HSTS header
firstdogonthemoon.com.au: did not receive HSTS header
firstforex.co.uk: did not receive HSTS header
firstlook.org: did not receive HSTS header
-fischers.cc: could not connect to host
fiscoeconti.it: did not receive HSTS header
-fishfinders.info: did not receive HSTS header
fiskestang.com: did not receive HSTS header
fit4medien.de: did not receive HSTS header
fitbylo.com: could not connect to host
fitea.cz: could not connect to host
+fitfitup.com: did not receive HSTS header
fitiapp.com: could not connect to host
fitnesswerk.de: could not connect to host
fitqbe.com: did not receive HSTS header
+fitseven.ru: did not receive HSTS header
fitshop.com.br: could not connect to host
fitsw.com: did not receive HSTS header
fiuxy.org: could not connect to host
@@ -5389,7 +6025,8 @@ fl0666.com: did not receive HSTS header
fl0777.com: did not receive HSTS header
fl0888.com: did not receive HSTS header
fl0999.com: did not receive HSTS header
-flagfic.com: could not connect to host
+flacandmp3.ml: could not connect to host
+flagfic.com: did not receive HSTS header
flags.ninja: could not connect to host
flair.co: max-age too low: 7889238
flairbros.at: could not connect to host
@@ -5402,15 +6039,11 @@ flareon.net: could not connect to host
flaretechnologies.io: could not connect to host
flashbaggie.com: could not connect to host
flatbellyreview.com: max-age too low: 2592000
+flatlandchurch.com: did not receive HSTS header
flawcheck.com: could not connect to host
flc111.com: did not receive HSTS header
flc999.com: max-age too low: 129600
-fleetcor.at: could not connect to host
-fleetcor.de: could not connect to host
-fleetcor.fr: could not connect to host
-fleetcor.hu: could not connect to host
-fleetcor.nl: could not connect to host
-fleetcor.pl: could not connect to host
+fleamarketgoods.com: did not receive HSTS header
flemingtonaudiparts.com: could not connect to host
fleurette.me: could not connect to host
fleursdesoleil.fr: did not receive HSTS header
@@ -5418,14 +6051,16 @@ flexdrukker.nl: could not connect to host
flexinvesting.fi: could not connect to host
fliexer.com: could not connect to host
flightschoolusa.com: did not receive HSTS header
+flikmsg.co: could not connect to host
fling.dating: could not connect to host
flipagram.com: did not receive HSTS header
flipbell.com: did not receive HSTS header
flipkey.com: did not receive HSTS header
flirchi.com: did not receive HSTS header
+flirtycourts.com: did not receive HSTS header
flixtor.net: could not connect to host
flkrpxl.com: max-age too low: 86400
-floless.co.uk: could not connect to host
+flood.io: did not receive HSTS header
floorball-haunwoehr.de: did not receive HSTS header
flopy.club: could not connect to host
florafiora.com.br: did not receive HSTS header
@@ -5460,24 +6095,31 @@ flybunnyfly.dk: did not receive HSTS header
flygpost.com: did not receive HSTS header
flyingdoggy.net: could not connect to host
flyingspaghettimonsterdonationsfund.nl: could not connect to host
+flyingyoung.top: did not receive HSTS header
flyp.me: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
flyspace.ga: did not receive HSTS header
+flyspace.ml: did not receive HSTS header
flyss.net: could not connect to host
fm83.nl: could not connect to host
fm992.com: could not connect to host
+fmapplication.com: could not connect to host
+fmi.gov: did not receive HSTS header
+fmovies.fyi: did not receive HSTS header
fmovies.life: could not connect to host
fnfpt.co.uk: could not connect to host
fniephaus.com: did not receive HSTS header
-fnncat.com: could not connect to host
+fnncat.com: did not receive HSTS header
fnvsecurity.com: could not connect to host
fobc-usa.org: did not receive HSTS header
focalforest.com: could not connect to host
-foerster-kunststoff.de: did not receive HSTS header
+foerster-kunststoff.de: could not connect to host
fognini-depablo.eu: could not connect to host
fohome.ca: could not connect to host
fokan.ch: did not receive HSTS header
+fol.tf: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
foliekonsulenten.dk: did not receive HSTS header
-folioapp.io: did not receive HSTS header
+folioapp.io: could not connect to host
+folkfests.org: did not receive HSTS header
fondanastasia.ru: did not receive HSTS header
fondy.ru: did not receive HSTS header
foneo.com: could not connect to host
@@ -5496,7 +6138,7 @@ foodserve.in: did not receive HSTS header
footballmapped.com: could not connect to host
footlegende.fr: did not receive HSTS header
forafifty.co.za: could not connect to host
-foraje-profesionale.ro: did not receive HSTS header
+foraje-profesionale.ro: could not connect to host
forbid.life: could not connect to host
forbiddenhistory.info: could not connect to host
forbook.net: did not receive HSTS header
@@ -5511,15 +6153,19 @@ foreveryoung.pt: did not receive HSTS header
forex-dan.com: did not receive HSTS header
forex-plus.com: did not receive HSTS header
forgix.com: could not connect to host
+forglemmigej.net: could not connect to host
forlagetmarx.dk: did not receive HSTS header
formadmin.com: did not receive HSTS header
formaliteo.com: did not receive HSTS header
+formasdemaquillarse.com: did not receive HSTS header
formazioneopen.it: could not connect to host
formersessalaries.com: did not receive HSTS header
formula.cf: could not connect to host
forplanetsake.com: could not connect to host
forplayers.pl: could not connect to host
+forquilhinhanoticias.com.br: did not receive HSTS header
forsyththeatre.com: could not connect to host
+fortoglethorpega.gov: could not connect to host
fortricks.in: did not receive HSTS header
fortuna-loessnitz.de: could not connect to host
fortuna-s.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -5532,43 +6178,43 @@ foryoucosmeticos.com.br: could not connect to host
fossewayflowers.co.uk: could not connect to host
fossewayflowers.com: could not connect to host
fossewaygardencentre.co.uk: did not receive HSTS header
-fossgruppen.se: could not connect to host
+fossgruppen.se: did not receive HSTS header
+fossguard.com: could not connect to host
fotiu.com: could not connect to host
fotoallerlei.com: did not receive HSTS header
fotocerita.net: could not connect to host
-fotofaerie.net: could not connect to host
fotogiraffe.ru: did not receive HSTS header
fotografosexpertos.com: did not receive HSTS header
-fotonjan.com: could not connect to host
fotopasja.info: could not connect to host
fotostravestisbr.com: could not connect to host
fourchin.net: could not connect to host
fourwheelpartloanssimple.com: did not receive HSTS header
foxdev.io: could not connect to host
-foxelbox.com: could not connect to host
+foxelbox.com: did not receive HSTS header
foxes.no: could not connect to host
foxley-farm.co.uk: did not receive HSTS header
foxley-seeds.co.uk: did not receive HSTS header
foxleyseeds.co.uk: could not connect to host
foxmay.co.uk: could not connect to host
-foxtrot.pw: could not connect to host
+foxterrier.com.br: could not connect to host
+foxtrot.pw: did not receive HSTS header
foxyslut.com: could not connect to host
foyale.io: could not connect to host
fpki.sh: could not connect to host
-fptravelling.com: could not connect to host
fr0zenbits.io: could not connect to host
fr33d0m.link: could not connect to host
fragilesolar.cf: could not connect to host
-fragnic.com: could not connect to host
+fragnic.com: did not receive HSTS header
fralef.me: did not receive HSTS header
francesca-and-lucas.com: did not receive HSTS header
-francescoservida.ch: could not connect to host
francevpn.xyz: could not connect to host
+franckgirard.net: could not connect to host
francois-vidit.com: did not receive HSTS header
frangor.info: did not receive HSTS header
frankedier.com: did not receive HSTS header
-frankl.in: did not receive HSTS header
+frankfurt-am-start.de: did not receive HSTS header
franklinhua.com: could not connect to host
+fransallen.com: did not receive HSTS header
franta.biz: did not receive HSTS header
franta.email: did not receive HSTS header
franzt.de: could not connect to host
@@ -5579,13 +6225,14 @@ frasys.cloud: max-age too low: 2592000
frasys.io: could not connect to host
fraudempire.com: could not connect to host
freakyamazing.com: could not connect to host
-freakyaweso.me: could not connect to host
+freakyaweso.me: max-age too low: 86400
freakyawesome.band: could not connect to host
freakyawesome.blog: could not connect to host
freakyawesome.ca: could not connect to host
freakyawesome.club: could not connect to host
freakyawesome.co: could not connect to host
freakyawesome.co.uk: could not connect to host
+freakyawesome.com: could not connect to host
freakyawesome.company: could not connect to host
freakyawesome.email: could not connect to host
freakyawesome.events: could not connect to host
@@ -5645,9 +6292,8 @@ freakyawesometheme.com: could not connect to host
freakyawesomethemes.com: could not connect to host
freakyawesomewp.com: could not connect to host
freddythechick.uk: could not connect to host
-fredericcote.com: could not connect to host
+fredericcote.com: did not receive HSTS header
fredliang.cn: could not connect to host
-fredtec.ru: could not connect to host
free8.xyz: could not connect to host
freeasinlliure.org: did not receive HSTS header
freeassangenow.org: did not receive HSTS header
@@ -5655,8 +6301,6 @@ freeben666.fr: could not connect to host
freeblog.me: could not connect to host
freebookmakerbets.com.au: did not receive HSTS header
freebus.org: could not connect to host
-freecam2cam.site: could not connect to host
-freecookies.nl: did not receive HSTS header
freedomrealtyoftexas.com: did not receive HSTS header
freedomvote.nl: could not connect to host
freeexampapers.com: could not connect to host
@@ -5681,6 +6325,8 @@ frenzel.dk: could not connect to host
freqlabs.com: did not receive HSTS header
freshfind.xyz: could not connect to host
freshlymind.com: did not receive HSTS header
+freshmaza.io: did not receive HSTS header
+frettboard.com: did not receive HSTS header
frezbo.com: could not connect to host
frforms.com: did not receive HSTS header
frickelboxx.de: could not connect to host
@@ -5689,37 +6335,38 @@ fridaperfumaria.com.br: could not connect to host
friedhelm-wolf.de: could not connect to host
friendica.ch: could not connect to host
friendlyfiregameshow.com: could not connect to host
+friendship-quotes.co.uk: could not connect to host
+friller.com.au: did not receive HSTS header
frimons.com: max-age too low: 7889238
fringeintravel.com: did not receive HSTS header
+fritteli.ch: did not receive HSTS header
frodriguez.xyz: could not connect to host
froehlich.it: did not receive HSTS header
froggstack.de: could not connect to host
frolov.net: could not connect to host
-from-the-net.com: could not connect to host
fromix.de: could not connect to host
fromlemaytoz.com: could not connect to host
front-end.dog: could not connect to host
frontisme.nl: did not receive HSTS header
frontline.cloud: did not receive HSTS header
frontline6.com: did not receive HSTS header
-frontlinemessenger.com: did not receive HSTS header
frontmin.com: did not receive HSTS header
frost-ci.xyz: could not connect to host
-frostbytes.net: could not connect to host
+frostbytes.net: did not receive HSTS header
frosty-gaming.xyz: could not connect to host
frp-roleplay.de: could not connect to host
frprn.com: could not connect to host
frprn.xxx: could not connect to host
frsis2017.com: could not connect to host
+frugal-millennial.com: did not receive HSTS header
fruitscale.com: could not connect to host
fruitusers.com: could not connect to host
frumious.fyi: could not connect to host
frusky.net: could not connect to host
-fs-gamenet.de: could not connect to host
-fsf.moe: did not receive HSTS header
+fs-gamenet.de: did not receive HSTS header
+fsf.moe: could not connect to host
fsfi.is: could not connect to host
fsinf.at: did not receive HSTS header
-fsk.fo: did not receive HSTS header
fspphoto.com: could not connect to host
fsradio.eu: did not receive HSTS header
fsrs.gov: could not connect to host
@@ -5731,6 +6378,8 @@ ftctele.com: could not connect to host
fteproxy.org: did not receive HSTS header
ftgho.com: could not connect to host
ftpi.ml: could not connect to host
+fu-li88.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+fu-li88.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
fuchsy.com: could not connect to host
fuckbilibili.com: could not connect to host
fuckcf.cf: could not connect to host
@@ -5741,6 +6390,7 @@ fuelministry.com: did not receive HSTS header
fugle.de: could not connect to host
fuitedeau.ch: could not connect to host
fujianshipbuilding.com: could not connect to host
+fujiorganics.com: did not receive HSTS header
fukuko.biz: could not connect to host
fukuko.xyz: could not connect to host
fukuoka-cityliner.jp: did not receive HSTS header
@@ -5750,7 +6400,7 @@ fulilingyu.info: could not connect to host
fuliydys.com: could not connect to host
fullpackage.co.uk: did not receive HSTS header
fulltxt.ml: could not connect to host
-fullytrained.co.uk: could not connect to host
+fullytrained.co.uk: did not receive HSTS header
fumiware.com: could not connect to host
fun25.tk: could not connect to host
fun9.cc: could not connect to host
@@ -5771,21 +6421,23 @@ funnyang.com: could not connect to host
funrun.com: did not receive HSTS header
funtastic-event-hire.co.uk: did not receive HSTS header
funtastic.ie: could not connect to host
-funtimebourne.co.uk: could not connect to host
-fuorifuocogenova.it: did not receive HSTS header
+funtimebourne.co.uk: did not receive HSTS header
+fuorifuocogenova.it: could not connect to host
furi.ga: could not connect to host
furiffic.com: did not receive HSTS header
-furikake.xyz: did not receive HSTS header
furnation.com: could not connect to host
furnishedproperty.com.au: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+furnitureconcept.co.uk: could not connect to host
furry.agency: could not connect to host
furry.be: did not receive HSTS header
+furry.zone: did not receive HSTS header
+fursuitbutts.com: could not connect to host
+furtherfood.com: did not receive HSTS header
furtivelook.com: did not receive HSTS header
-fusa-miyamoto.jp: could not connect to host
fusedrops.com: did not receive HSTS header
fusionmate.com: could not connect to host
-fuskator.com: could not connect to host
fussell.io: could not connect to host
+futa.agency: could not connect to host
futbol11.com: did not receive HSTS header
futbolvivo.tv: did not receive HSTS header
futos.de: could not connect to host
@@ -5801,16 +6453,18 @@ fuzoku-sodan.com: could not connect to host
fuzoku.jp: could not connect to host
fwei.tk: did not receive HSTS header
fws.gov: did not receive HSTS header
-fwww7.com: did not receive HSTS header
+fwww7.com: could not connect to host
fxgame.online: could not connect to host
fxpig-ib.com: could not connect to host
+fxtalk.cn: could not connect to host
fxwebstudio.com.au: max-age too low: 0
fyodorpi.com: did not receive HSTS header
fyol.pw: could not connect to host
fysiohaenraets.nl: did not receive HSTS header
fzn.io: did not receive HSTS header
-fzslm.me: could not connect to host
+fzslm.me: did not receive HSTS header
g-i-s.vn: did not receive HSTS header
+g-marketing.ro: did not receive HSTS header
g-rickroll-o.pw: could not connect to host
g01.in.ua: could not connect to host
g1jeu.com: could not connect to host
@@ -5832,20 +6486,22 @@ gablaxian.com: max-age too low: 2592000
gabriele-kluge.de: could not connect to host
gaelleetarnaud.com: did not receive HSTS header
gafachi.com: could not connect to host
-gagor.pl: could not connect to host
gaichanh.com: did not receive HSTS header
gailfellowsphotography.com: could not connect to host
gainesvillegoneaustin.org: did not receive HSTS header
gaiserik.com: did not receive HSTS header
gaite.me: did not receive HSTS header
gajas18.com: could not connect to host
+gakkainavi.net: could not connect to host
gakkainavi4.com: could not connect to host
galardi.org: could not connect to host
galena.io: could not connect to host
galenskap.eu: could not connect to host
galeriadobimba.com.br: could not connect to host
+galerieautodirect.com: did not receive HSTS header
galgoafegao.com.br: could not connect to host
galgoingles.com.br: could not connect to host
+galgopersa.com.br: could not connect to host
gali.review: did not receive HSTS header
galileomtz.com: did not receive HSTS header
gallery44.org: did not receive HSTS header
@@ -5855,6 +6511,7 @@ gambitcloud.net: could not connect to host
game-files.net: did not receive HSTS header
game-gentle.com: could not connect to host
game.yt: could not connect to host
+game88city.com: could not connect to host
gamebits.net: did not receive HSTS header
gamecave.de: could not connect to host
gamecdn.com: could not connect to host
@@ -5866,6 +6523,7 @@ gamek.es: could not connect to host
gamenected.com: could not connect to host
gamenected.de: could not connect to host
gameofbay.org: could not connect to host
+gameofpwnz.com: did not receive HSTS header
gamepad.vg: could not connect to host
gamepader.com: could not connect to host
gameparade.de: could not connect to host
@@ -5876,7 +6534,6 @@ gamers-life.fr: could not connect to host
gamerslair.org: did not receive HSTS header
gamerz-point.de: could not connect to host
gamerz-stream.com: did not receive HSTS header
-gamesdepartment.co.uk: could not connect to host
gameserver-sponsor.de: did not receive HSTS header
gamesurferapp.com: could not connect to host
gameswitchers.uk: could not connect to host
@@ -5888,30 +6545,35 @@ gamingreinvented.com: did not receive HSTS header
gamoice.com: did not receive HSTS header
gampenhof.de: could not connect to host
gangnam-club.com: could not connect to host
-gangnam-karaoke.com: could not connect to host
+gangnam-karaoke.com: did not receive HSTS header
ganhonet.com.br: did not receive HSTS header
+ganyouxuan.com: could not connect to host
ganzgraph.de: did not receive HSTS header
gaon.network: could not connect to host
-gaphag.ddns.net: could not connect to host
-gar-nich.net: could not connect to host
+gaptek.id: did not receive HSTS header
garage-abri-chalet.fr: did not receive HSTS header
garage-door.pro: could not connect to host
garageon.net: did not receive HSTS header
+garbage-juice.com: could not connect to host
garciamartin.me: could not connect to host
garcinia--cambogia.com: could not connect to host
garciniacambogiareviewed.co: did not receive HSTS header
+garden-life.org: could not connect to host
garden.trade: could not connect to host
gardencarezone.com: did not receive HSTS header
+garethkirk.com: could not connect to host
+garethkirkreviews.com: could not connect to host
garfieldairlines.net: did not receive HSTS header
garten-bau.ch: did not receive HSTS header
garten-diy.de: could not connect to host
+gartenhauszentrum.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
gasbarkenora.com: could not connect to host
gasnews.net: could not connect to host
gasser-daniel.ch: did not receive HSTS header
gastauftritt.net: did not receive HSTS header
gastritisolucion.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
gatapro.net: could not connect to host
-gatemotorsumhlanga.co.za: could not connect to host
+gatemotorsumhlanga.co.za: did not receive HSTS header
gatemoves.com: could not connect to host
gateworld.fr: did not receive HSTS header
gatilagata.com.br: could not connect to host
@@ -5920,36 +6582,45 @@ gatorsa.es: could not connect to host
gaussorgues.me: could not connect to host
gautham.pro: could not connect to host
gavick.com: did not receive HSTS header
+gay-jays.com: could not connect to host
gay-sissies.com: could not connect to host
-gaycc.cc: could not connect to host
+gaya-sa.org: did not receive HSTS header
+gaycc.cc: did not receive HSTS header
gaygeeks.de: could not connect to host
+gayjays.com: could not connect to host
gaysfisting.com: could not connect to host
gaytorrent.ru: could not connect to host
gayxsite.com: could not connect to host
+gazee.net: did not receive HSTS header
gazflynn.com: did not receive HSTS header
gbit.xyz: could not connect to host
gc.net: could not connect to host
+gccm-events.com: did not receive HSTS header
gchoic.com: max-age too low: 7889238
gchp.ie: did not receive HSTS header
+gcodetools.com: could not connect to host
gdegem.org: did not receive HSTS header
gdevpenze.ru: could not connect to host
gdprhallofshame.com: could not connect to host
gdutnic.com: could not connect to host
gdz-otvety.com: could not connect to host
-gdz.tv: could not connect to host
gear-acquisition-syndrome.community: could not connect to host
+gearseo.com.br: did not receive HSTS header
geaskb.nl: could not connect to host
geblitzt.de: did not receive HSTS header
gedankenbude.info: could not connect to host
+gedankenworks.com: could not connect to host
geekbaba.com: could not connect to host
-geekcast.co.uk: did not receive HSTS header
-geekchimp.com: did not receive HSTS header
-geekdt.com: could not connect to host
+geekcast.co.uk: could not connect to host
+geekchimp.com: could not connect to host
+geekdt.com: did not receive HSTS header
geekmind.org: max-age too low: 172800
geeks.berlin: could not connect to host
geeks.lgbt: could not connect to host
+geeks.one: did not receive HSTS header
geektimes.com: did not receive HSTS header
geeky.software: could not connect to host
+geekystudios.us: could not connect to host
geemo.top: could not connect to host
gehrke.nrw: could not connect to host
geigr.de: could not connect to host
@@ -5970,33 +6641,36 @@ genneve.com: did not receive HSTS header
genoog.com: could not connect to host
genossen.ru: could not connect to host
genshiken.org: could not connect to host
-genuu.com: could not connect to host
+gensokyo.chat: could not connect to host
+genuu.com: did not receive HSTS header
genuxation.com: could not connect to host
genxbeats.com: did not receive HSTS header
genyaa.com: could not connect to host
genyhitch.com: did not receive HSTS header
+geocommunicator.gov: could not connect to host
geoffanderinmyers.com: did not receive HSTS header
geoffdev.com: could not connect to host
geoffmyers.com: did not receive HSTS header
geoffreyrichard.com: could not connect to host
geopals.net: did not receive HSTS header
-georgeperez.me: could not connect to host
+georgeperez.me: did not receive HSTS header
georgesonarthurs.com.au: did not receive HSTS header
-gerardobsd.com: could not connect to host
gereja.ga: max-age too low: 1209600
gerencianet.com.br: did not receive HSTS header
gereon.ch: could not connect to host
geri.be: could not connect to host
-germanticz.de: could not connect to host
+germansoldiers.net: could not connect to host
gers-authentique.com: could not connect to host
gerum.dynv6.net: did not receive HSTS header
geschenkly.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
geschmackspiloten.de: did not receive HSTS header
gesiwista.net: did not receive HSTS header
+gestorehotel.com: did not receive HSTS header
gesunde-smoothies.de: did not receive HSTS header
gesundes-im-napf.de: did not receive HSTS header
get-asterisk.ru: could not connect to host
get-cctv.com: could not connect to host
+get-link.info: could not connect to host
get.zenpayroll.com: did not receive HSTS header
getable.com: did not receive HSTS header
getblys.com.au: did not receive HSTS header
@@ -6028,7 +6702,7 @@ getlittleapps.com: could not connect to host
getlolaccount.com: did not receive HSTS header
getmassage.com.ng: could not connect to host
getmondo.co.uk: could not connect to host
-geto.ml: could not connect to host
+geto.ml: did not receive HSTS header
getpake.com: could not connect to host
getpop.org: did not receive HSTS header
getpost.online: did not receive HSTS header
@@ -6040,15 +6714,19 @@ getsetupfile.com: did not receive HSTS header
getshifter.io: did not receive HSTS header
getspeaker.com: did not receive HSTS header
getspire.com: could not connect to host
+getsubs.net: could not connect to host
getwarden.net: could not connect to host
getwashdaddy.com: could not connect to host
+getweloop.io: did not receive HSTS header
getyourphix.tk: could not connect to host
gevaulug.fr: could not connect to host
gfbouncycastles.co.uk: did not receive HSTS header
gfhgiro.nl: did not receive HSTS header
+gflclan.ru: could not connect to host
gfm.tech: could not connect to host
gfoss.gr: could not connect to host
gfw.moe: could not connect to host
+gfwno.win: did not receive HSTS header
gfwsb.ml: could not connect to host
gglks.com: could not connect to host
ggobbo.com: could not connect to host
@@ -6061,24 +6739,27 @@ gheorghe-sarcov.ga: could not connect to host
gheorghesarcov.ga: could not connect to host
gheorghesarcov.tk: could not connect to host
ghi.gov: could not connect to host
-ghibli.studio: did not receive HSTS header
+ghibli.studio: could not connect to host
+ghid-pitesti.ro: did not receive HSTS header
ghkim.net: could not connect to host
-ghuntley.com: max-age too low: 0
-giakki.eu: could not connect to host
+ghowell.io: could not connect to host
gianlucapartengo.photography: did not receive HSTS header
giant-powerfit.co.uk: did not receive HSTS header
gibraltar-firma.com: did not receive HSTS header
giddyaunt.net: could not connect to host
gidea.nu: could not connect to host
+giduv.com: did not receive HSTS header
giegler.software: did not receive HSTS header
+giftbg.org: did not receive HSTS header
giftgofers.com: max-age too low: 2592000
giftservices.nl: did not receive HSTS header
gifzilla.net: could not connect to host
gigacloud.org: could not connect to host
-gigawattz.com: could not connect to host
+gigawattz.com: did not receive HSTS header
gigiscloud.servebeer.com: could not connect to host
gigolodavid.be: could not connect to host
gilcloud.com: could not connect to host
+gilescountytn.gov: did not receive HSTS header
gilgaz.com: did not receive HSTS header
gillet-cros.fr: could not connect to host
gilly.berlin: did not receive HSTS header
@@ -6095,6 +6776,9 @@ gip-carif-idf.net: could not connect to host
gip-carif-idf.org: could not connect to host
gipsamsfashion.com: could not connect to host
gipsic.com: did not receive HSTS header
+girlsgonesporty.com: could not connect to host
+girlsnet.work: could not connect to host
+girsa.org: could not connect to host
gis3m.org: did not receive HSTS header
gisac.org: did not receive HSTS header
gistfy.com: could not connect to host
@@ -6108,6 +6792,7 @@ giverang.com: could not connect to host
gix.net.pl: could not connect to host
gixtools.co.uk: could not connect to host
gixtools.uk: could not connect to host
+gizmo.ovh: could not connect to host
gizzo.sk: could not connect to host
glabiatoren-kst.de: could not connect to host
gladystudio.com: did not receive HSTS header
@@ -6116,43 +6801,51 @@ glasslikes.com: did not receive HSTS header
glbg.eu: did not receive HSTS header
gle: could not connect to host
glenavy.tk: could not connect to host
+glencambria.com: could not connect to host
+glencoveny.gov: could not connect to host
glentakahashi.com: could not connect to host
glicerina.online: did not receive HSTS header
glittersjabloon.nl: did not receive HSTS header
glitzmirror.com: could not connect to host
glnpo.gov: could not connect to host
globalado.com: could not connect to host
-globalbridge-japan.com: did not receive HSTS header
+globalbridge-japan.com: could not connect to host
globalelite.black: did not receive HSTS header
globalexpert.co.nz: could not connect to host
+globalgivingtime.com: could not connect to host
globalinsights.xyz: could not connect to host
globalinstitutefortraining.org.au: did not receive HSTS header
globalittech.com: could not connect to host
+globalmoneyapp.com: could not connect to host
globalmusic.ga: could not connect to host
globalnewsdaily.cf: could not connect to host
globalnomadvintage.com: could not connect to host
globalperspectivescanada.com: could not connect to host
+globalresistancecorporation.com: could not connect to host
globalsites.nl: did not receive HSTS header
globaltennis.ca: could not connect to host
globalvisions-events.ch: could not connect to host
globalvisions-events.com: could not connect to host
+globeinform.com: did not receive HSTS header
globuli-info.de: could not connect to host
gloomyspark.com: could not connect to host
glotter.com: did not receive HSTS header
gloucesterphotographer.com: did not receive HSTS header
glubbforum.de: did not receive HSTS header
-glutenfreiheit.at: did not receive HSTS header
+glutenfreiheit.at: could not connect to host
glws.org: did not receive HSTS header
-glyph.ws: could not connect to host
gm-assicurazioni.it: could not connect to host
gmail.com: did not receive HSTS header (error ignored - included regardless)
+gmantra.org: max-age too low: 7776000
gmanukyan.com: could not connect to host
gmat.ovh: could not connect to host
gmoes.at: did not receive HSTS header
-gmslparking.co.uk: did not receive HSTS header
+gmplab.com: could not connect to host
gnaptracker.tk: could not connect to host
gnom.me: could not connect to host
gnosticjade.net: did not receive HSTS header
+gnwp.eu: could not connect to host
+gnylf.com: could not connect to host
go.ax: did not receive HSTS header
go2sh.de: did not receive HSTS header
go4it.solutions: did not receive HSTS header
@@ -6160,16 +6853,23 @@ goabonga.com: could not connect to host
goalsetup.com: did not receive HSTS header
goaltree.ch: did not receive HSTS header
goapunks.net: did not receive HSTS header
-goarmy.eu: could not connect to host
goat.chat: did not receive HSTS header
goat.xyz: could not connect to host
+goatbot.xyz: could not connect to host
goben.ch: could not connect to host
goblins.net: did not receive HSTS header
goblinsatwork.com: could not connect to host
+goblintears.com: could not connect to host
gocardless.com: did not receive HSTS header
+god-esb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+godbo9.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+godbo9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+godbo9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+godesb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
godrealms.com: could not connect to host
goedeke.ml: could not connect to host
goerner.me: did not receive HSTS header
+goesta-hallenbau.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
goge.site: could not connect to host
gogenenglish.com: could not connect to host
gogetssl.com: did not receive HSTS header
@@ -6190,11 +6890,13 @@ goldwaterscholarship.gov: could not connect to host
golearn.gov: could not connect to host
golfburn.com: could not connect to host
golocal-media.de: could not connect to host
+gomiblog.com: did not receive HSTS header
gong8.win: could not connect to host
-gongjianwei.com: could not connect to host
gonkar.com: did not receive HSTS header
+gonzalesca.gov: did not receive HSTS header
gonzalosanchez.mx: did not receive HSTS header
goodeats.nyc: did not receive HSTS header
+goodfeels.net: could not connect to host
goodfurday.ca: could not connect to host
goodmengroup.de: did not receive HSTS header
goods-memo.net: did not receive HSTS header
@@ -6213,6 +6915,9 @@ goozz.nl: did not receive HSTS header
gopay.cz: did not receive HSTS header
gopokego.cz: could not connect to host
goranrango.ch: could not connect to host
+gordonobrecht.com: did not receive HSTS header
+gorf.chat: could not connect to host
+gorf.club: could not connect to host
gorgiaxx.com: could not connect to host
gorilla-gym.site: could not connect to host
gorillow.com: could not connect to host
@@ -6221,6 +6926,8 @@ gosharewood.com: did not receive HSTS header
goshop.cz: did not receive HSTS header
goshow.tv: could not connect to host
gostream.asia: could not connect to host
+gosuland.org: did not receive HSTS header
+got-tty.de: did not receive HSTS header
gotgenes.com: could not connect to host
goto.google.com: did not receive HSTS header (error ignored - included regardless)
gotobrno.cz: did not receive HSTS header
@@ -6233,11 +6940,13 @@ goubi.me: did not receive HSTS header
goujianwen.com: did not receive HSTS header
goukon.ru: could not connect to host
gourmettia.com: did not receive HSTS header
-gouthro-goteborg.se: did not receive HSTS header
+gouthro-goteborg.se: could not connect to host
gouv.ovh: did not receive HSTS header
gov.ax: could not connect to host
goverage.org: could not connect to host
govillemo.ca: did not receive HSTS header
+gowin9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+gowin9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
gozadentro.com: could not connect to host
gozel.com.tr: did not receive HSTS header
gpalabs.com: could not connect to host
@@ -6245,6 +6954,7 @@ gparent.org: did not receive HSTS header
gpga.cf: could not connect to host
gplintegratedit.com: could not connect to host
gpo.gov: did not receive HSTS header
+gps.com.br: could not connect to host
gpstuner.com: did not receive HSTS header
graavaapi.elasticbeanstalk.com: could not connect to host
grabi.ga: could not connect to host
@@ -6258,21 +6968,24 @@ gradenotify.com: could not connect to host
grads360.org: could not connect to host
gradsm-ci.net: could not connect to host
grafitec.ru: did not receive HSTS header
+grafmurr.de: could not connect to host
graftworld.pw: could not connect to host
+grahamofthewheels.com: did not receive HSTS header
+grana.com: did not receive HSTS header
grandchamproofing.com: did not receive HSTS header
-grande.coffee: could not connect to host
grandlinecsk.ru: did not receive HSTS header
grandmascookieblog.com: did not receive HSTS header
grandmasfridge.org: did not receive HSTS header
grandwailea.com: did not receive HSTS header
+granian.pro: could not connect to host
grantedby.me: max-age too low: 0
granth.io: could not connect to host
graph.no: did not receive HSTS header
+graphified.nl: did not receive HSTS header
graphite.org.uk: could not connect to host
graphsearchengine.com: could not connect to host
gratis-app.com: did not receive HSTS header
gratisonlinesex.com: could not connect to host
-gravitation.pro: did not receive HSTS header
gravito.nl: did not receive HSTS header
gravity-net.de: could not connect to host
graycell.net: could not connect to host
@@ -6282,6 +6995,7 @@ great.nagoya: could not connect to host
greatergoodoffers.com: did not receive HSTS header
greatfire.kr: could not connect to host
greatideahub.com: did not receive HSTS header
+greatlengthshairextensionssalon.com: did not receive HSTS header
greatnet.de: did not receive HSTS header
greatsong.net: did not receive HSTS header
greencardtalent.com: could not connect to host
@@ -6299,6 +7013,8 @@ greenvpn.ltd: could not connect to host
greenvpn.pro: did not receive HSTS header
greggsfoundation.org.uk: could not connect to host
gregmartyn.com: could not connect to host
+gregmarziomedia.co.za: did not receive HSTS header
+gregmilton.org: could not connect to host
gregorytlee.me: could not connect to host
grekland.guide: could not connect to host
gremots.com: could not connect to host
@@ -6309,6 +7025,7 @@ grettogeek.com: did not receive HSTS header
greuel.online: could not connect to host
greve.xyz: could not connect to host
grevesgarten.de: could not connect to host
+greyhash.se: could not connect to host
greyline.se: could not connect to host
grian-bam.at: did not receive HSTS header
gribani.com: could not connect to host
@@ -6317,10 +7034,12 @@ gridle.io: did not receive HSTS header
grifomarchetti.com: did not receive HSTS header
grigalanzsoftware.com: could not connect to host
grillinfools.com: did not receive HSTS header
+gripnijmegen.rip: could not connect to host
gripopgriep.net: could not connect to host
gritte.net: could not connect to host
griyo.online: could not connect to host
groben-itsolutions.de: could not connect to host
+groenders.nl: could not connect to host
groenewoud.run: could not connect to host
groenteclub.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
groentefruitzeep.com: could not connect to host
@@ -6337,21 +7056,20 @@ grow-shop.lt: could not connect to host
grow-shop.lv: could not connect to host
growingmetrics.com: could not connect to host
grozip.com: did not receive HSTS header
-grozter.se: did not receive HSTS header
gruelang.org: could not connect to host
gruenderwoche-dresden.de: did not receive HSTS header
grumples.biz: did not receive HSTS header
grunex.com: did not receive HSTS header
grupopgn.com.br: could not connect to host
-grusig-geil.ch: could not connect to host
+gruppoipl.it: did not receive HSTS header
gryffin.ga: could not connect to host
gryffin.ml: could not connect to host
gryffin.tk: could not connect to host
+grytics.com: did not receive HSTS header
gs-net.at: could not connect to host
gsm-map.com: could not connect to host
gsmkungen.com: could not connect to host
gsnort.com: did not receive HSTS header
-gsoc.se: could not connect to host
gtamodshop.org: could not connect to host
gtanda.tk: could not connect to host
gtech.work: did not receive HSTS header
@@ -6365,7 +7083,6 @@ gudrun.ml: could not connect to host
guelphhydropool.com: could not connect to host
guendra.dedyn.io: could not connect to host
guentherhouse.com: did not receive HSTS header
-guenthernoack.de: could not connect to host
guffrits.com: could not connect to host
gugaltika-ipb.org: could not connect to host
guge.gq: could not connect to host
@@ -6377,17 +7094,16 @@ guidesetc.com: did not receive HSTS header
guilde-vindicta.fr: could not connect to host
guildgearscore.cf: could not connect to host
guillaume-leduc.fr: did not receive HSTS header
+guillaumecote.me: could not connect to host
guillaumematheron.fr: did not receive HSTS header
-guillaumeperrin.io: could not connect to host
guiltypleasuresroleplaying.com: did not receive HSTS header
-guinea-pig.co: could not connect to host
+guinea-pig.co: did not receive HSTS header
guineafruitcorp.com: could not connect to host
gulch.in.ua: could not connect to host
gulenet.com: could not connect to host
gulfcoast-sandbox.com: could not connect to host
gulitsky.me: could not connect to host
gulleyperformancecenter.com: did not receive HSTS header
-gulshankumar.net: did not receive HSTS header
gumannp.de: did not receive HSTS header
gummibande.noip.me: could not connect to host
gunhunter.com: could not connect to host
@@ -6396,6 +7112,7 @@ gunnarhafdal.com: did not receive HSTS header
gunnaro.com: could not connect to host
guntbert.net: could not connect to host
guoqiang.info: did not receive HSTS header
+guphi.net: did not receive HSTS header
gurochan.ch: could not connect to host
gurom.lv: could not connect to host
gurubetng.com: did not receive HSTS header
@@ -6409,6 +7126,7 @@ gussi.is: did not receive HSTS header
guthabenkarten-billiger.de: could not connect to host
guts.me: did not receive HSTS header
guts.moe: did not receive HSTS header
+guvernalternativa.ro: could not connect to host
guyot-tech.com: did not receive HSTS header
gvchannel.xyz: could not connect to host
gvi.be: did not receive HSTS header
@@ -6418,25 +7136,26 @@ gvt2.com: could not connect to host (error ignored - included regardless)
gvt3.com: could not connect to host (error ignored - included regardless)
gw2oracle.com: could not connect to host
gw2reload.eu: could not connect to host
-gwa-verwaltung.de: could not connect to host
gwijaya.com: could not connect to host
gwtest.us: could not connect to host
gxgx.org: could not connect to host
gxlrx.net: could not connect to host
gyboche.com: could not connect to host
gyboche.science: could not connect to host
+gycis.me: did not receive HSTS header
gylauto.fr: could not connect to host
-gymjp.com: did not receive HSTS header
gypsycatdreams.com: could not connect to host
gypthecat.com: did not receive HSTS header
gyz.io: did not receive HSTS header
+gzitech.com: could not connect to host
+gzitech.net: could not connect to host
+gzitech.org: could not connect to host
gzpblog.com: could not connect to host
h-og.com: could not connect to host
h-rickroll-n.pw: could not connect to host
h2cdn.cloud: could not connect to host
h2check.org: could not connect to host
h3x.jp: could not connect to host
-ha6.ru: could not connect to host
haarkliniek.com: did not receive HSTS header
habbixed.tk: could not connect to host
habbo.life: could not connect to host
@@ -6444,15 +7163,16 @@ habbotalk.nl: could not connect to host
habeo.si: could not connect to host
hablemosdetecnologia.com.ve: could not connect to host
hac30.com: could not connect to host
-hack.club: could not connect to host
+hack.cz: could not connect to host
hack.li: could not connect to host
+hackattack.com: did not receive HSTS header
hackbubble.me: could not connect to host
-hacker.club: could not connect to host
hacker.deals: could not connect to host
hacker8.cn: could not connect to host
hackercat.ninja: max-age too low: 2592000
-hackerforever.com: could not connect to host
-hackerlite.xyz: did not receive HSTS header
+hackerchai.com: could not connect to host
+hackerco.com: did not receive HSTS header
+hackerforever.com: did not receive HSTS header
hackerone-ext-adroll.com: could not connect to host
hackerspace-ntnu.no: did not receive HSTS header
hackest.org: did not receive HSTS header
@@ -6461,7 +7181,6 @@ hackit.im: could not connect to host
hackmeplz.com: could not connect to host
hackroyale.xyz: could not connect to host
hacksnack.io: could not connect to host
-hackthissite.org: could not connect to host
hackyourfaceoff.com: could not connect to host
hackzogtum-coburg.de: did not receive HSTS header
hadaf.pro: could not connect to host
@@ -6472,20 +7191,21 @@ haehnlein.at: could not connect to host
haemmerle.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
haf.gr: could not connect to host
hafoda.com: did not receive HSTS header
-haggeluring.su: could not connect to host
hahayidu.org: could not connect to host
hail2u.net: did not receive HSTS header
hainoni.com: did not receive HSTS header
-hairlossstop.net: did not receive HSTS header
+hairlossstop.net: could not connect to host
haitschi.com: could not connect to host
haitschi.de: did not receive HSTS header
haitschi.net: could not connect to host
haitschi.org: could not connect to host
+haizum.pro: could not connect to host
haktec.de: did not receive HSTS header
haku.moe: could not connect to host
hakugin.me: could not connect to host
hakugin.org: could not connect to host
hakurei.moe: could not connect to host
+hal-9th.space: could not connect to host
halcyonsbastion.com: could not connect to host
half-logic.eu.org: could not connect to host
halfwaythere.eu: could not connect to host
@@ -6499,7 +7219,6 @@ hamking.tk: could not connect to host
hammamsayad.com: could not connect to host
hamon.cc: did not receive HSTS header
hamu.blue: could not connect to host
-hanakaraku.com: could not connect to host
hancatemc.com: did not receive HSTS header
hancc.net: could not connect to host
handenafvanhetmedischdossier.nl: could not connect to host
@@ -6518,6 +7237,7 @@ hanksservice.com: could not connect to host
hannes-speelgoedencadeautjes.nl: did not receive HSTS header
hans-natur.de: did not receive HSTS header
hansch.ventures: could not connect to host
+hanxv.pw: did not receive HSTS header
hanys.xyz: could not connect to host
hanzcollection.online: could not connect to host
haobo111.com: could not connect to host
@@ -6529,8 +7249,10 @@ haobo5555.com: could not connect to host
haobo6666.com: could not connect to host
haobo7777.com: could not connect to host
haomwei.com: could not connect to host
-haoqi.men: could not connect to host
haoyugao.com: could not connect to host
+hapijs.cn: could not connect to host
+hapissl.com: could not connect to host
+hapivm.com: could not connect to host
happist.com: did not receive HSTS header
happix.nl: did not receive HSTS header
happyfabric.me: did not receive HSTS header
@@ -6539,20 +7261,20 @@ happyheartsabode.com: did not receive HSTS header
happytiger.eu: could not connect to host
hapsfordmill.co.uk: could not connect to host
hapvm.com: could not connect to host
-haqaza.com.br: did not receive HSTS header
+haqaza.com.br: could not connect to host
harambe.site: could not connect to host
-harbourweb.net: could not connect to host
-hardeman.nu: could not connect to host
+harbourweb.net: did not receive HSTS header
hardline.xyz: could not connect to host
hardtime.ru: could not connect to host
hardyboyplant.com: did not receive HSTS header
-harekaze.info: could not connect to host
haribosupermix.com: could not connect to host
hariome.com: did not receive HSTS header
+haritsa.co.id: could not connect to host
harlentimberproducts.co.uk: did not receive HSTS header
harmonycosmetic.com: max-age too low: 300
harrisonsdirect.co.uk: did not receive HSTS header
harristony.com: could not connect to host
+harry-baker.com: could not connect to host
harryharrison.co: did not receive HSTS header
harrypottereditor.com: could not connect to host
harrypottereditor.net: could not connect to host
@@ -6560,7 +7282,7 @@ harschnitz.nl: did not receive HSTS header
hartlep.eu: could not connect to host
hartmancpa.com: did not receive HSTS header
harvestrenewal.org: did not receive HSTS header
-harveymilton.com: did not receive HSTS header
+harveymilton.com: max-age too low: 0
harz.cloud: could not connect to host
has.vision: could not connect to host
hasabig.wang: could not connect to host
@@ -6569,7 +7291,6 @@ haschrebellen.de: could not connect to host
hash-list.com: could not connect to host
hashiconf.com: did not receive HSTS header
hashidays.com: did not receive HSTS header
-hashimah.ca: could not connect to host
hashplex.com: could not connect to host
hasinase.de: could not connect to host
haste.ch: could not connect to host
@@ -6579,16 +7300,16 @@ hatethe.uk: could not connect to host
hatoko.net: could not connect to host
haufschild.de: could not connect to host
haurumcraft.net: could not connect to host
-hausarzt-stader-str.de: did not receive HSTS header
+hausarzt-stader-str.de: could not connect to host
hauswarteam.com: could not connect to host
hav.com: could not connect to host
haveeruexaminer.com: could not connect to host
-haven-moon.com: could not connect to host
haven-staging.cloud: could not connect to host
haven.cloud: did not receive HSTS header
havenmoon.com: could not connect to host
havenswift-hosting.co.uk: did not receive HSTS header
hawk-la.com: could not connect to host
+hawkofgeorgia.com: could not connect to host
hawthornharpist.com: could not connect to host
haxoff.com: did not receive HSTS header
haxon.me: could not connect to host
@@ -6599,6 +7320,8 @@ hazcod.com: could not connect to host
haze-productions.com: could not connect to host
haze.network: did not receive HSTS header
haze.sucks: could not connect to host
+hazeltime.com: could not connect to host
+hazeltime.se: did not receive HSTS header
hazyrom.net: could not connect to host
hb1111.com: could not connect to host
hb3333.com: could not connect to host
@@ -6622,10 +7345,9 @@ hcs-company.com: did not receive HSTS header
hcs-company.nl: did not receive HSTS header
hcstr.com: did not receive HSTS header
hd1tj.org: did not receive HSTS header
-hda.me: could not connect to host
+hda.me: did not receive HSTS header
hdm.io: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
hdrboundless.com: could not connect to host
-hdritalyphotos.com: did not receive HSTS header
hdserver.info: did not receive HSTS header
hdsmigrationtool.com: could not connect to host
hduin.xyz: could not connect to host
@@ -6634,20 +7356,24 @@ hdy.nz: could not connect to host
head-shop.lt: could not connect to host
head-shop.lv: could not connect to host
headmates.xyz: could not connect to host
+health-match.com.au: could not connect to host
healthcare6.com: did not receive HSTS header
healthjoy.com: did not receive HSTS header
+healthlabs.com: did not receive HSTS header
+healthmatchapp.com: could not connect to host
healthyandnaturalliving.com: could not connect to host
healthycod.in: could not connect to host
-healtious.com: could not connect to host
+healtious.com: did not receive HSTS header
hearingshofar.com: could not connect to host
-heart.ge: did not receive HSTS header
+heart.ge: could not connect to host
heartlandrentals.com: did not receive HSTS header
+heartsucker.com: could not connect to host
hearty.cf: did not receive HSTS header
hearty.ink: could not connect to host
hearty.space: could not connect to host
hearty.tech: could not connect to host
-hearty.tw: did not receive HSTS header
heartyapp.com: could not connect to host
+heartyapp.tw: could not connect to host
heartyme.net: could not connect to host
heathmanners.com: could not connect to host
heavenlyseals.com: could not connect to host
@@ -6655,13 +7381,17 @@ heavenlysmokenc.com: could not connect to host
heavystresser.com: could not connect to host
heayao.com: could not connect to host
hebaus.com: could not connect to host
+hebriff.com: could not connect to host
hectorj.net: could not connect to host
hedweb.com: could not connect to host
+heeler.blue: could not connect to host
+heeler.red: could not connect to host
heidilein.info: did not receive HSTS header
heimnetze.org: could not connect to host
heisenberg.co: could not connect to host
hejsupport.se: could not connect to host
hekeki.com: could not connect to host
+helber-it-services.de: could not connect to host
hele.cz: did not receive HSTS header
helgakristoffer.com: could not connect to host
helgakristoffer.wedding: could not connect to host
@@ -6671,11 +7401,11 @@ hellenicaward.com: did not receive HSTS header
hello-nestor.com: did not receive HSTS header
helloanselm.com: did not receive HSTS header
hellofilters.com: could not connect to host
-hellomouse.cf: could not connect to host
+hellomouse.cf: did not receive HSTS header
+hellomouse.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
hellomouse.tk: could not connect to host
hellotandem.com: could not connect to host
hellothought.net: could not connect to host
-helloworldhost.com: did not receive HSTS header
hellscanyonraft.com: did not receive HSTS header
helpadmin.net: could not connect to host
helpantiaging.com: could not connect to host
@@ -6686,25 +7416,27 @@ helpfixe.com: did not receive HSTS header
helpflux.com: did not receive HSTS header
helpfute.com: did not receive HSTS header
helpgerer.com: did not receive HSTS header
-helpium.de: could not connect to host
+helpium.de: did not receive HSTS header
helpmebuild.com: did not receive HSTS header
helppresta.com: did not receive HSTS header
helpverif.com: did not receive HSTS header
helpwithmybank.gov: did not receive HSTS header
helsingfors.guide: could not connect to host
+helup.com: did not receive HSTS header
hemlockhillscabinrentals.com: did not receive HSTS header
hencagon.com: could not connect to host
hendersonrealestatepros.com: did not receive HSTS header
-hendric.us: did not receive HSTS header
henhenlu.com: could not connect to host
+henkbrink.com: did not receive HSTS header
henningkerstan.org: did not receive HSTS header
henriknoerr.com: could not connect to host
+henrock.net: could not connect to host
hentai.design: did not receive HSTS header
hentaimaster.net: could not connect to host
hentaiz.net: could not connect to host
hepteract.us: could not connect to host
heptner24.de: could not connect to host
-her25.com: did not receive HSTS header
+herbandpat.org: could not connect to host
herbertmouwen.nl: could not connect to host
here.ml: could not connect to host
here4funpartysolutions.ie: did not receive HSTS header
@@ -6712,27 +7444,28 @@ heribe-maruo.com: did not receive HSTS header
heritagedentistry.ca: did not receive HSTS header
hermes-servizi.it: could not connect to host
heroin.org.uk: could not connect to host
-herpaderp.net: did not receive HSTS header
+herpaderp.net: could not connect to host
+herr-webdesign.de: could not connect to host
herramientasbazarot.com: did not receive HSTS header
herrenfahrt.com: did not receive HSTS header
+herrtxbias.org: could not connect to host
hetmeisjeachterpauw.nl: could not connect to host
hetmer.com: did not receive HSTS header
hetmer.cz: did not receive HSTS header
-hetmer.net: did not receive HSTS header
+hetmer.net: could not connect to host
heutger.net: did not receive HSTS header
-heverhagen.rocks: could not connect to host
hex2013.com: did not receive HSTS header
hexacon.io: could not connect to host
hexadecimal.tech: could not connect to host
hexe.net: did not receive HSTS header
hexhu.com: could not connect to host
-hexo.io: did not receive HSTS header
hexobind.com: could not connect to host
-hexstream.net: did not receive HSTS header
+heyfringe.com: could not connect to host
heyguevara.com: did not receive HSTS header
+heyjournal.com: could not connect to host
heywoodtown.co.uk: did not receive HSTS header
hfbg.nl: did not receive HSTS header
-hfcbank.com.gh: could not connect to host
+hfcbank.com.gh: did not receive HSTS header
hfi.me: did not receive HSTS header
hflsdev.org: could not connect to host
hfu.io: could not connect to host
@@ -6740,6 +7473,7 @@ hg525.com: max-age too low: 86400
hg71839.com: could not connect to host
hg881.com: could not connect to host
hgfa.fi: could not connect to host
+hgvnet.de: could not connect to host
hi808.net: did not receive HSTS header
hialatv.com: could not connect to host
hibilog.com: could not connect to host
@@ -6751,9 +7485,10 @@ hidedd.com: could not connect to host
hideftv.deals: could not connect to host
hideout.agency: could not connect to host
hidrofire.com: did not receive HSTS header
-hiexmerida-mailing.com: did not receive HSTS header
+hiexmerida-mailing.com: could not connect to host
hig.gov: could not connect to host
highgrove.org.uk: could not connect to host
+highland-webcams.com: could not connect to host
highlandparkcog.org: did not receive HSTS header
highperformancehvac.com: did not receive HSTS header
highseer.com: did not receive HSTS header
@@ -6765,12 +7500,11 @@ highvelocitydesign.com: could not connect to host
higp.de: did not receive HSTS header
hiisukun.com: could not connect to host
hiitcentre.com: did not receive HSTS header
-hijoan.com: could not connect to host
hik-cloud.com: did not receive HSTS header
hikagestudios.com: did not receive HSTS header
hikariempire.com: could not connect to host
-hikinggearlab.com: did not receive HSTS header
hilaolu.com: could not connect to host
+hilaolu.studio: did not receive HSTS header
hilariousbeer.com.mx: could not connect to host
hilinemerchandising.com: did not receive HSTS header
hill.selfip.net: could not connect to host
@@ -6784,30 +7518,35 @@ hinkel-sohn.de: did not receive HSTS header
hinrich.de: did not receive HSTS header
hintergedanken.com: could not connect to host
hintermeier-rae.at: did not receive HSTS header
+hiojbk.com: could not connect to host
hipercultura.com: did not receive HSTS header
-hiphop.ren: could not connect to host
hiphopconvention.nl: could not connect to host
+hipi.jp: could not connect to host
hipnos.net: did not receive HSTS header
hipnoseinstitute.org: did not receive HSTS header
hiraku.me: did not receive HSTS header
+hirefitness.co.uk: did not receive HSTS header
hireprofs.com: could not connect to host
hiresuccessstaffing.com: did not receive HSTS header
hiretech.com: did not receive HSTS header
hirevets.gov: did not receive HSTS header
hirokilog.com: could not connect to host
hisingenrunt.se: did not receive HSTS header
+hisnet.de: could not connect to host
histoire-theatre.com: did not receive HSTS header
history.pe: could not connect to host
hitchunion.org: could not connect to host
-hitoy.org: did not receive HSTS header
+hitoy.org: could not connect to host
+hitrek.ml: could not connect to host
hittipps.com: could not connect to host
hivatal-info.hu: could not connect to host
-hiyuki2578.net: could not connect to host
+hj2999.com: did not receive HSTS header
hjes.com.ve: could not connect to host
hjf-immobilien.de: did not receive HSTS header
hjkhs.cn: did not receive HSTS header
hknet.at: did not receive HSTS header
hlacosedora.com: max-age too low: 7889238
+hloe0xff.ru: could not connect to host
hlpublicidad.com: could not connect to host
hlyue.com: did not receive HSTS header
hm1ch.com: could not connect to host
@@ -6819,20 +7558,19 @@ hobaugh.social: could not connect to host
hobby-gamerz-community.de: did not receive HSTS header
hocassian.cn: did not receive HSTS header
hochzeitshelferlein.de: did not receive HSTS header
+hockey.academy: did not receive HSTS header
hodamakade.com: could not connect to host
hodne.io: could not connect to host
hoekwoningverkopen.nl: could not connect to host
hoelty.network: could not connect to host
hoerbuecher-und-hoerspiele.de: could not connect to host
-hoeveiligismijn.nl: did not receive HSTS header
hoffens.se: could not connect to host
hofiprojekt.cz: did not receive HSTS header
hogar123.es: could not connect to host
+hogepad.com: could not connect to host
hoiku-map.tokyo: could not connect to host
hoiku-navi.com: did not receive HSTS header
-hoikuen-now.top: did not receive HSTS header
hokepon.com: did not receive HSTS header
-hokioisecurity.com: did not receive HSTS header
holgerlehner.com: could not connect to host
holidayincotswolds.co.uk: could not connect to host
holifestival-freyung.de: could not connect to host
@@ -6851,15 +7589,19 @@ home-craft.de: could not connect to host
home-v.ind.in: could not connect to host
home-work-jobs.com: did not receive HSTS header
homeandyarddetailing.com: could not connect to host
+homecarpetcleaning.co.uk: could not connect to host
homeclouding.de: could not connect to host
homecoming.city: could not connect to host
homedna.com: did not receive HSTS header
homeexx.com: did not receive HSTS header
+homehunting.pt: did not receive HSTS header
+homeoesp.org: did not receive HSTS header
homeownersassociationmanagementla.com: did not receive HSTS header
homeremodelingcontractorsca.com: did not receive HSTS header
homesandal.com: did not receive HSTS header
homeseller.co.uk: could not connect to host
homesfordinner.ca: could not connect to host
+homestay.id: did not receive HSTS header
homeyantra.com: did not receive HSTS header
homezhi.com.tw: could not connect to host
homoglyph.net: could not connect to host
@@ -6871,7 +7613,7 @@ hongzu.cc: could not connect to host
hongzuwang.com: could not connect to host
hongzuzhibo.com: could not connect to host
honkhonk.net: could not connect to host
-honkion.net: could not connect to host
+honoka.tech: could not connect to host
honoo.com: could not connect to host
hoodiecrow.com: could not connect to host
hoodoo.io: could not connect to host
@@ -6879,21 +7621,19 @@ hoodoo.tech: could not connect to host
hookandloom.com: did not receive HSTS header
hookbin.com: could not connect to host
hoopsacademyusa.com: could not connect to host
-hooray.beer: could not connect to host
-hoovism.com: did not receive HSTS header
hopesb.org: did not receive HSTS header
hopewellproperties.co.uk: did not receive HSTS header
hopglass.eu: could not connect to host
hopglass.net: could not connect to host
-hoponmedia.de: could not connect to host
hopzone.net: could not connect to host
horace.li: did not receive HSTS header
+horackova.info: could not connect to host
horisonttimedia.fi: did not receive HSTS header
horizonmoto.fr: did not receive HSTS header
+horkel.cf: could not connect to host
horning.co: did not receive HSTS header
horosho.in: could not connect to host
horrendous-servers.com: could not connect to host
-horror-forum.de: could not connect to host
horrorserv.com: could not connect to host
horseboners.xxx: could not connect to host
hortifarm.ro: did not receive HSTS header
@@ -6911,7 +7651,6 @@ hostinaus.com.au: did not receive HSTS header
hostingfirst.nl: could not connect to host
hostingfj.com: could not connect to host
hostisan.com: could not connect to host
-hostworkz.com: could not connect to host
hosyaku.gr.jp: did not receive HSTS header
hot-spa.ch: did not receive HSTS header
hotartup.com: could not connect to host
@@ -6921,9 +7660,11 @@ hotel-huberhof.at: did not receive HSTS header
hotel-tongruben.de: max-age too low: 0
hotelaustria-wien.at: did not receive HSTS header
hotelmadhuwanvihar.com: could not connect to host
-hotelvictoriaoax-mailing.com: did not receive HSTS header
-hotelvillahermosa-mailing.com: did not receive HSTS header
+hotelvictoriaoax-mailing.com: could not connect to host
+hotelvillahermosa-mailing.com: could not connect to host
hotelvue.nl: could not connect to host
+hotesb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+hotesb.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
hotjuice.com: could not connect to host
hotornot.com: could not connect to host
hotpoint-training.com: did not receive HSTS header
@@ -6938,11 +7679,12 @@ howfargames.com: could not connect to host
howrandom.org: could not connect to host
howtocuremysciatica.com: could not connect to host
howtofreelance.com: did not receive HSTS header
+howtoinstall.co: did not receive HSTS header
hozinga.de: could not connect to host
hpctecnologias.com: did not receive HSTS header
hpeditor.tk: could not connect to host
-hpepub.asia: did not receive HSTS header
-hpepub.org: could not connect to host
+hpepub.asia: could not connect to host
+hpepub.org: did not receive HSTS header
hppub.info: could not connect to host
hppub.org: could not connect to host
hppub.site: could not connect to host
@@ -6954,8 +7696,11 @@ hr-tech.store: could not connect to host
hr98.tk: could not connect to host
hrackydomino.cz: did not receive HSTS header
hrfhomelottery.com: did not receive HSTS header
+hrk.io: did not receive HSTS header
+hrobert.hu: could not connect to host
hrtech.store: could not connect to host
hrtraining.com.au: did not receive HSTS header
+hru.gov: could not connect to host
hschen.top: could not connect to host
hserver.top: could not connect to host
hsir.me: could not connect to host
@@ -6972,37 +7717,47 @@ https.ps: could not connect to host
https.ren: could not connect to host
httpstatuscode418.xyz: could not connect to host
httptest.net: could not connect to host
-hu8hu8.com: could not connect to host
-huang-haitao.com: did not receive HSTS header
+hua-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+hua-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+hua-li88.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+hua-li88.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
huang.nu: could not connect to host
huangguancq.com: could not connect to host
huangh.com: could not connect to host
+huangting.me: did not receive HSTS header
huangzenghao.com: could not connect to host
huarongdao.com: did not receive HSTS header
+huaxueba.com: could not connect to host
hubert.systems: did not receive HSTS header
-hubertmoszka.pl: could not connect to host
+hubertmoszka.pl: max-age too low: 0
+hubrecht.at: could not connect to host
hubrick.com: could not connect to host
hudhaifahgoga.co.za: could not connect to host
hudingyuan.cn: could not connect to host
hugocollignon.fr: could not connect to host
+hui-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+hui-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+huihui.moe: did not receive HSTS header
huiser.nl: could not connect to host
hukaloh.com: could not connect to host
hukkatavara.com: could not connect to host
+hulsoft.co.uk: could not connect to host
humanexperiments.com: could not connect to host
+humankode.com: did not receive HSTS header
+humblebee.es: could not connect to host
humblefinances.com: could not connect to host
humeurs.net: could not connect to host
-humorce.com: did not receive HSTS header
humortuga.pt: did not receive HSTS header
hump.dk: could not connect to host
humpi.at: could not connect to host
humpteedumptee.in: did not receive HSTS header
-hunger.im: could not connect to host
hunqz.com: could not connect to host
-huntshomeinspections.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+hunter-read.com: could not connect to host
huodongweb.com: could not connect to host
huoduan.com: did not receive HSTS header
huongquynh.com: could not connect to host
hup.blue: could not connect to host
+hurricanelabs.com: did not receive HSTS header
huskybutt.dog: could not connect to host
huskyduvercors.com: did not receive HSTS header
hustle.com: did not receive HSTS header
@@ -7010,32 +7765,34 @@ hustle.life: did not receive HSTS header
huwjones.me: could not connect to host
huzu.com: did not receive HSTS header
huzurmetal.net: could not connect to host
-hwcine.com: did not receive HSTS header
+hwcine.com: could not connect to host
hwpkasse.de: max-age too low: 2592000
hyakumachi.com: did not receive HSTS header
hyatt.com: did not receive HSTS header
hybridiyhdistys.fi: could not connect to host
hybridklubben.fi: could not connect to host
+hybridragon.net: could not connect to host
hybula.nl: could not connect to host
hydai.co: could not connect to host
hydra.ws: could not connect to host
hydra.zone: could not connect to host
hydrabit.nl: did not receive HSTS header
hydrante.ch: could not connect to host
+hydrocloud.net: could not connect to host
hydrodipcenter.nl: did not receive HSTS header
hydronium.cf: could not connect to host
hydronium.ga: could not connect to host
hydronium.me: could not connect to host
hydronium.ml: could not connect to host
hydronium.tk: could not connect to host
-hydronyx.me: could not connect to host
+hydronyx.me: did not receive HSTS header
hydrosight.com: did not receive HSTS header
hyeok.org: did not receive HSTS header
+hylians.com: could not connect to host
hymerscollege.co.uk: max-age too low: 43200
hypa.net.au: did not receive HSTS header
hyper-matrix.org: could not connect to host
hyper69.com: could not connect to host
-hyperactive.am: could not connect to host
hyperporn.net: could not connect to host
hyperreal.info: could not connect to host
hypnoresults.com.au: did not receive HSTS header
@@ -7044,7 +7801,6 @@ hypotheques24.ch: could not connect to host
hysg.me: could not connect to host
hyvive.com: could not connect to host
hzh.pub: did not receive HSTS header
-hztgzz.com: could not connect to host
i--b.com: did not receive HSTS header
i-jp.net: could not connect to host
i-partners.sk: could not connect to host
@@ -7054,20 +7810,21 @@ i10z.com: could not connect to host
i28s.com: did not receive HSTS header
i496.eu: could not connect to host
i4m1k0su.com: could not connect to host
-i95.me: could not connect to host
i9multiequipamentos.com.br: did not receive HSTS header
ia1000.com: could not connect to host
-iacono.com.br: did not receive HSTS header
iadttaveras.com: could not connect to host
iain.tech: did not receive HSTS header
iamokay.nl: did not receive HSTS header
iamreubin.co.uk: did not receive HSTS header
iamsoareyou.se: could not connect to host
iamveto.com: did not receive HSTS header
+ian.sh: did not receive HSTS header
+ianvisits.co.uk: did not receive HSTS header
iapws.com: did not receive HSTS header
iban.is: could not connect to host
ibarf.nl: did not receive HSTS header
ibase.com: did not receive HSTS header
+ibeep.com: could not connect to host
ibenchu.com: did not receive HSTS header
ibestreview.com: did not receive HSTS header
ibizatopcharter.com: did not receive HSTS header
@@ -7075,11 +7832,13 @@ ibna.online: could not connect to host
ibnuwebhost.com: could not connect to host
ibnw.de: did not receive HSTS header
ibox.ovh: did not receive HSTS header
+ibpegasus.tk: could not connect to host
ibps.blog: did not receive HSTS header
ibpsrecruitment.co.in: could not connect to host
ibron.co: could not connect to host
ibsafrica.co.za: could not connect to host
ibsglobal.co.za: could not connect to host
+icabanken.se: did not receive HSTS header
icaforsakring.se: did not receive HSTS header
icasnetwork.com: did not receive HSTS header
ice.yt: could not connect to host
@@ -7092,18 +7851,20 @@ icepink.com.br: could not connect to host
icewoman.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
icfl.com.br: could not connect to host
ich-find-den-g.net: could not connect to host
+ich-hab-die-schnauze-voll-von-der-suche-nach-ner-kurzen-domain.de: could not connect to host
ich-mach-druck.eu: did not receive HSTS header
ichnichtskaufmann.de: could not connect to host
ichoosebtec.com: did not receive HSTS header
ichronos.net: did not receive HSTS header
icity.ly: did not receive HSTS header
+ickerseashop.com: could not connect to host
icloud.net: could not connect to host
-icmshoptrend.com: did not receive HSTS header
icnsoft.ga: did not receive HSTS header
+icnsoft.me: did not receive HSTS header
+icnsoft.org: could not connect to host
icntorrent.download: could not connect to host
ico500.com: did not receive HSTS header
-icondoom.nl: did not receive HSTS header
-icowhitepapers.co: did not receive HSTS header
+icondoom.nl: could not connect to host
icpc2016.in.th: could not connect to host
icreative.nl: did not receive HSTS header
ictinforensics.org: could not connect to host
@@ -7124,15 +7885,14 @@ idealmykonos.com: did not receive HSTS header
idealvenir.com: did not receive HSTS header
ideapaisajistas.es: did not receive HSTS header
ideaplus.me: could not connect to host
-ideasenfoto.com: max-age too low: 604800
ideasmeetingpoint.com: could not connect to host
ideation-inc.co.jp: could not connect to host
idedr.com: could not connect to host
+ideiasefinancas.com.br: could not connect to host
idemo.in: could not connect to host
identity-hash.online: could not connect to host
identitylabs.uk: could not connect to host
identitysandbox.gov: could not connect to host
-identitytheft.gov: did not receive HSTS header
idgsupply.com: did not receive HSTS header
idid.tk: could not connect to host
idinby.dk: did not receive HSTS header
@@ -7152,6 +7912,7 @@ ievgenialehner.com: did not receive HSTS header
iexpert9.com: did not receive HSTS header
if0.ru: could not connect to host
ifad.org: did not receive HSTS header
+ifamily.top: did not receive HSTS header
ifan.ch: could not connect to host
ifastuniversity.com: did not receive HSTS header
ifcfg.me: could not connect to host
@@ -7161,17 +7922,18 @@ ifreetion.cn: did not receive HSTS header
ifx.ee: could not connect to host
ifxnet.com: could not connect to host
ifxor.com: could not connect to host
-iga-semi.jp: could not connect to host
igamingforums.com: could not connect to host
-igd.chat: did not receive HSTS header
+igaryhe.io: did not receive HSTS header
+igd.chat: could not connect to host
igforums.com: could not connect to host
igi.codes: did not receive HSTS header
igiftcards.nl: did not receive HSTS header
ignatisd.gr: did not receive HSTS header
igule.net: could not connect to host
iha6.com: could not connect to host
+ihc.im: did not receive HSTS header
+ihls.xyz: did not receive HSTS header
ihongzu.com: could not connect to host
-ihotel.io: did not receive HSTS header
ihrlotto.de: could not connect to host
ihrnationalrat.ch: could not connect to host
ihsbsd.me: could not connect to host
@@ -7183,46 +7945,43 @@ iilin.com: did not receive HSTS header
iispeed.com: did not receive HSTS header
ijn-dd.nl: could not connect to host
ijoda.com: could not connect to host
-ik-life.com: did not receive HSTS header
-ike.io: did not receive HSTS header
+ike.io: could not connect to host
ikenmeyer.com: could not connect to host
ikenmeyer.eu: could not connect to host
ikocik.sk: could not connect to host
ikon.name: could not connect to host
-ikraenglish.com: could not connect to host
+ikudo.top: could not connect to host
ikwilguidobellen.nl: could not connect to host
ikzoekeengoedkopeauto.nl: could not connect to host
ikzoekjeugdhulp.nl: did not receive HSTS header
ilbuongiorno.it: did not receive HSTS header
ildomani.it: did not receive HSTS header
ileat.com: could not connect to host
+ilemonrain.com: could not connect to host
ilgi.work: could not connect to host
ilii.me: could not connect to host
-ilikerainbows.co: could not connect to host
+ilikerainbows.co: did not receive HSTS header
ilikerainbows.co.uk: could not connect to host
ilikfreshweedstores.com: did not receive HSTS header
ilmconpm.de: could not connect to host
iloilofit.org: did not receive HSTS header
ilona.graphics: did not receive HSTS header
-iltec-prom.ru: could not connect to host
-iltec.ru: could not connect to host
+iltisim.ch: did not receive HSTS header
iluvscotland.co.uk: did not receive HSTS header
im-design.com.ua: did not receive HSTS header
+imadalin.ro: could not connect to host
image.tf: could not connect to host
imagecurl.com: could not connect to host
imagecurl.org: could not connect to host
imaginarymakings.me: could not connect to host
imakepoems.net: could not connect to host
imanhearts.com: did not receive HSTS header
-imanolbarba.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
imanudin.net: did not receive HSTS header
imbrian.org: could not connect to host
-ime.moe: could not connect to host
imed.com.pt: did not receive HSTS header
imed.pt: did not receive HSTS header
imedi.it: could not connect to host
imfromthefuture.com: did not receive HSTS header
-img.ovh: could not connect to host
imgencrypt.com: could not connect to host
imgul.net: could not connect to host
imguoguo.com: could not connect to host
@@ -7236,6 +7995,7 @@ immanuel60.hu: did not receive HSTS header
immaternity.com: could not connect to host
immersionwealth.com: could not connect to host
immo-vk.de: did not receive HSTS header
+immobiliarecapitani.com: did not receive HSTS header
immobilien-wallat.de: could not connect to host
immoprotect.ca: did not receive HSTS header
immortals-co.com: did not receive HSTS header
@@ -7249,7 +8009,7 @@ immunicity.info: could not connect to host
immunicity.online: could not connect to host
immunicity.press: could not connect to host
immunicity.rocks: could not connect to host
-immunicity.st: did not receive HSTS header
+immunicity.st: could not connect to host
immunicity.today: could not connect to host
immunicity.top: could not connect to host
immunicity.win: could not connect to host
@@ -7264,7 +8024,6 @@ imoto.me: could not connect to host
imperdintechnologies.com: could not connect to host
imperialonlinestore.com: did not receive HSTS header
imperialwebsolutions.com: did not receive HSTS header
-implicitdenial.com: could not connect to host
imprenta-es.com: did not receive HSTS header
improvingwp.com: could not connect to host
impulse-clan.de: could not connect to host
@@ -7278,16 +8037,23 @@ inbox.li: did not receive HSTS header
inboxen.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
incendiary-arts.com: could not connect to host
inceptionradionetwork.com: could not connect to host
+incestporn.tv: could not connect to host
inchomatic.com: did not receive HSTS header
+incompliance.de: did not receive HSTS header
increasetestosteronelevels.org: could not connect to host
inderagamono.net: could not connect to host
indesit-training.com: did not receive HSTS header
+indexyz.me: could not connect to host
+indianaantlersupply.com: could not connect to host
+indiawise.co.uk: could not connect to host
indiecert.net: did not receive HSTS header
indieethos.com: did not receive HSTS header
indiemods.com: could not connect to host
indien.guide: could not connect to host
indilens.com: did not receive HSTS header
+indiraactive.com: could not connect to host
indiroyunu.com: did not receive HSTS header
+indochina.io: could not connect to host
indoorskiassen.nl: did not receive HSTS header
indostar303.com: did not receive HSTS header
indredouglas.me: could not connect to host
@@ -7296,6 +8062,7 @@ industreiler.com.br: could not connect to host
industriasrenova.com: could not connect to host
industrybazar.com: max-age too low: 2592000
ineed.com.mt: could not connect to host
+inetpub.cn: could not connect to host
inexlog.fr: could not connect to host
inexpensivecomputers.net: could not connect to host
infcof.com: did not receive HSTS header
@@ -7325,13 +8092,14 @@ infos-generation.com: did not receive HSTS header
infosec.rip: could not connect to host
infosoph.org: could not connect to host
infotics.es: did not receive HSTS header
-infovae-idf.com: did not receive HSTS header
+infovae-idf.com: could not connect to host
infoworm.org: could not connect to host
infradio.am: could not connect to host
infranix.eu: max-age too low: 7360000
-infrarank.com: could not connect to host
infruction.com: could not connect to host
infura.co.th: could not connect to host
+ing89.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+ing89.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
ingalabs.hu: could not connect to host
ingalls.run: could not connect to host
ingesol.fr: did not receive HSTS header
@@ -7351,12 +8119,12 @@ injust.tk: could not connect to host
inkbunny.net: could not connect to host
inked-guy.de: could not connect to host
inkedguy.de: could not connect to host
-inkhor.se: could not connect to host
inkstory.gr: did not receive HSTS header
inksupply.com: did not receive HSTS header
inku.ovh: did not receive HSTS header
inkvisual.tk: could not connect to host
inleaked.com: could not connect to host
+inme.ga: did not receive HSTS header
innerform.com: could not connect to host
innit.be: could not connect to host
innobatics.com: did not receive HSTS header
@@ -7364,10 +8132,10 @@ innophate-security.nl: could not connect to host
innovamag.ca: did not receive HSTS header
innovativebuildingsolutions.co.za: could not connect to host
innovativeideaz.org: could not connect to host
-innoventure.de: did not receive HSTS header
-inobun.jp: could not connect to host
+innoventure.de: could not connect to host
inondation.ch: could not connect to host
inorder.website: could not connect to host
+inovatec.com: did not receive HSTS header
inox.io: did not receive HSTS header
inoxio.com: did not receive HSTS header
inoxio.de: did not receive HSTS header
@@ -7377,10 +8145,10 @@ insane-bullets.com: could not connect to host
insane.zone: could not connect to host
inschrijfformulier.com: could not connect to host
inscript.pl: did not receive HSTS header
-insho.fashion: did not receive HSTS header
insideofgaming.de: could not connect to host
insite-feedback.com: could not connect to host
insouciant.org: could not connect to host
+inspirationalquotesuk.co.uk: could not connect to host
inspirationconcepts.nl: did not receive HSTS header
inspire-av.com: did not receive HSTS header
inspiroinc.com: could not connect to host
@@ -7392,23 +8160,28 @@ instantdev.io: could not connect to host
instantsubs.de: did not receive HSTS header
instaquiz.ru: could not connect to host
instasex.ch: could not connect to host
+instawi.com: could not connect to host
institutoflordelavida.com: could not connect to host
institutulcultural.ro: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
instruktor.io: could not connect to host
insurance: could not connect to host
insurethebox.tk: could not connect to host
int-ext-design.fr: could not connect to host
+int-ma.in: did not receive HSTS header
integrationinc.com: did not receive HSTS header
+integraxor.com.tw: did not receive HSTS header
integrityingovernmentidaho.com: could not connect to host
intel.gov: did not receive HSTS header
intel.li: could not connect to host
intelbet.es: did not receive HSTS header
intelbet.ro: did not receive HSTS header
+intelhost.net: max-age too low: 0
intelldynamics.com: could not connect to host
-intensifyrsvp.com.au: could not connect to host
+intelliance.eu: could not connect to host
interboursegeneva.ch: did not receive HSTS header
+interchanges.io: max-age too low: 0
interference.io: did not receive HSTS header
-interfug.de: did not receive HSTS header
+interfloraservices.co.uk: could not connect to host
intergenx.co.uk: could not connect to host
intergenx.com: could not connect to host
intergenx.org: could not connect to host
@@ -7416,7 +8189,6 @@ intergenx.org.uk: could not connect to host
interhosts.co.za: could not connect to host
interim-cto.de: could not connect to host
interiorcheapo.com: could not connect to host
-interiorprofesional.com.ar: max-age too low: 604800
interiortradingco.com.au: could not connect to host
interleucina.org: did not receive HSTS header
interlocal.co.uk: could not connect to host
@@ -7426,7 +8198,6 @@ intermezzo-emmerich.nl: did not receive HSTS header
internacao.com: did not receive HSTS header
internaldh.com: could not connect to host
internaut.co.za: did not receive HSTS header
-internet-pornografie.de: did not receive HSTS header
internetbugbounty.org: did not receive HSTS header
internetcasinos.de: could not connect to host
internetcensus.org: could not connect to host
@@ -7439,15 +8210,16 @@ intersectraven.net: did not receive HSTS header
interspot.nl: could not connect to host
interstellarhyperdrive.com: did not receive HSTS header
interviewpipeline.co.uk: could not connect to host
-intervisteperstrada.com: could not connect to host
+intervisteperstrada.com: did not receive HSTS header
intexplore.org: could not connect to host
intim-uslugi-kazan.net: could not connect to host
intimateperrierjouet.com: could not connect to host
intimici.com.br: could not connect to host
intimtoy.com.ua: could not connect to host
intranetsec.fr: could not connect to host
+intreaba.xyz: max-age too low: 2592000
introvertedtravel.space: max-age too low: 0
-intrp.net: could not connect to host
+intrp.net: did not receive HSTS header
invenio.software: could not connect to host
inverselink.com: could not connect to host
investcountry.com: could not connect to host
@@ -7467,9 +8239,9 @@ ioiart.eu: could not connect to host
iolife.dk: could not connect to host
ionas-law.ro: did not receive HSTS header
ionc.ca: could not connect to host
+ionote.me: did not receive HSTS header
iop.intuit.com: max-age too low: 86400
iora.fr: could not connect to host
-iosmods.com: did not receive HSTS header
iostips.ru: could not connect to host
iotfen.com: could not connect to host
iotsms.io: could not connect to host
@@ -7480,14 +8252,13 @@ ipbill.org.uk: could not connect to host
ipcfg.me: could not connect to host
ipfp.pl: did not receive HSTS header
iphonechina.net: could not connect to host
-iplantom.com: could not connect to host
+ipid.me: could not connect to host
iplife.cn: could not connect to host
iplog.info: could not connect to host
ipmimagazine.com: did not receive HSTS header
ipmotion.ca: could not connect to host
ipnetworking.net: could not connect to host
ipo-times.com: could not connect to host
-ipop.gr: did not receive HSTS header
iprice.co.id: did not receive HSTS header
iprice.hk: did not receive HSTS header
iprice.my: did not receive HSTS header
@@ -7495,7 +8266,7 @@ iprice.ph: did not receive HSTS header
iprice.sg: did not receive HSTS header
iprice.vn: did not receive HSTS header
ipricethailand.com: did not receive HSTS header
-iprody.com: did not receive HSTS header
+iprody.com: could not connect to host
ipsilon-project.org: did not receive HSTS header
iptel.ro: could not connect to host
ipuservicedesign.com: could not connect to host
@@ -7507,11 +8278,14 @@ ipvsec.nl: could not connect to host
iqcn.co: could not connect to host
iqualtech.com: max-age too low: 7889238
ir-saitama.com: could not connect to host
-iran-poll.org: could not connect to host
+iran-poll.org: max-age too low: 0
+irandp.net: did not receive HSTS header
iranianlawschool.com: could not connect to host
iraqidinar.org: did not receive HSTS header
-irazimina.ru: could not connect to host
+irasandi.com: could not connect to host
+irazimina.ru: did not receive HSTS header
irccloud.com: did not receive HSTS header
+iready.ro: could not connect to host
irelandesign.com: could not connect to host
irinkeby.nu: could not connect to host
irische-segenswuensche.info: could not connect to host
@@ -7522,6 +8296,7 @@ irmag.ru: did not receive HSTS header
irmtrudjurke.de: did not receive HSTS header
irodorinet.com: max-age too low: 0
iron-guard.net: did not receive HSTS header
+irondaleirregulars.com: did not receive HSTS header
irstaxforumsonline.com: did not receive HSTS header
irugs.ch: did not receive HSTS header
irugs.co.uk: did not receive HSTS header
@@ -7530,12 +8305,13 @@ irukandjilabs.com: could not connect to host
irun-telecom.co.uk: could not connect to host
irvinepa.org: max-age too low: 10540800
is-a-furry.org: did not receive HSTS header
+isaackabel.cf: could not connect to host
+isastylish.com: could not connect to host
ischool.co.jp: did not receive HSTS header
isdf.me: could not connect to host
isdown.cz: could not connect to host
isef-eg.com: did not receive HSTS header
iserv.fr: did not receive HSTS header
-iseulde.com: did not receive HSTS header
isfriday.com: could not connect to host
ishadowsocks.ltd: could not connect to host
ishillaryclintoninprisonyet.com: could not connect to host
@@ -7546,57 +8322,66 @@ isisfighters.info: could not connect to host
isitamor.pm: could not connect to host
isitnuclearwaryet.com: could not connect to host
iskai.net: did not receive HSTS header
-islam.si: could not connect to host
+iskkk.com: could not connect to host
+iskkk.net: could not connect to host
islandinthenet.com: did not receive HSTS header
islandoilsupply.com: max-age too low: 300
islandpumpandtank.com: did not receive HSTS header
islandzero.net: did not receive HSTS header
-islazia.fr: could not connect to host
+islazia.fr: did not receive HSTS header
+islief.com: could not connect to host
isntall.us: did not receive HSTS header
-isocom.eu: could not connect to host
isoface33.fr: did not receive HSTS header
isogen5.com: could not connect to host
isogram.nl: did not receive HSTS header
isoroc-nidzica.pl: could not connect to host
ispringcloud.ru: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+ispweb.es: did not receive HSTS header
+israkurort.com: could not connect to host
issala.org: did not receive HSTS header
isscouncil.com: could not connect to host
isslshop.com: could not connect to host
-istanbul.systems: could not connect to host
istanbultravelguide.info: could not connect to host
istaspirtslietas.lv: did not receive HSTS header
istgame.com: did not receive HSTS header
isthefieldcontrolsystemdown.com: could not connect to host
istherrienstillcoach.com: could not connect to host
isv.online: did not receive HSTS header
+isyu.xyz: could not connect to host
+iszy.me: could not connect to host
it-cave.com: could not connect to host
+it-enthusiasts.tech: could not connect to host
it-go.net: did not receive HSTS header
it-labor.info: did not receive HSTS header
it-schwerin.de: could not connect to host
-it-sysoft.com: could not connect to host
itad.top: could not connect to host
itbrief.co.nz: did not receive HSTS header
itbrief.com.au: did not receive HSTS header
itchimes.com: did not receive HSTS header
itchybrainscentral.com: could not connect to host
+itds-consulting.com: could not connect to host
+itds-consulting.cz: could not connect to host
+itds-consulting.eu: could not connect to host
itechgeek.com: max-age too low: 0
items.lv: did not receive HSTS header
itemton.com: could not connect to host
itfaq.nl: did not receive HSTS header
itfensi.net: max-age too low: 6307200
itforcc.com: did not receive HSTS header
+ithakama.com: could not connect to host
itilo.de: did not receive HSTS header
itinsight.hu: did not receive HSTS header
itiomassagem.com.br: did not receive HSTS header
itisjustnot.cricket: could not connect to host
itmanie.cz: could not connect to host
itnews-bg.com: could not connect to host
+itogoyomi.com: did not receive HSTS header
itos.asia: did not receive HSTS header
itos.pl: did not receive HSTS header
itpol.dk: did not receive HSTS header
itpro-mg.de: could not connect to host
itproject.guru: did not receive HSTS header
-itrack.in.th: could not connect to host
+itrack.in.th: did not receive HSTS header
itriskltd.com: did not receive HSTS header
its-schindler.de: could not connect to host
its-v.de: could not connect to host
@@ -7614,14 +8399,13 @@ ittop-gabon.com: could not connect to host
itu2015.de: could not connect to host
ius.io: did not receive HSTS header
iuscommunity.org: did not receive HSTS header
-ivanboi.com: could not connect to host
-ivanilla.org: could not connect to host
ivanpolchenko.com: could not connect to host
ivi-co.com: max-age too low: 0
ivi-fertility.com: max-age too low: 0
ivi.es: max-age too low: 0
ivk.website: could not connect to host
ivklombard.ru: did not receive HSTS header
+ivxv.ee: could not connect to host
ivyshop.com.br: could not connect to host
iwannarefill.com: could not connect to host
iwex.swiss: could not connect to host
@@ -7632,7 +8416,7 @@ iww.mx: could not connect to host
ix8.ru: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
ixec2.tk: could not connect to host
ixh.me: did not receive HSTS header
-iyinolaashafa.com: could not connect to host
+ixio.cz: could not connect to host
izdiwho.com: could not connect to host
izolight.ch: could not connect to host
izonemart.com: did not receive HSTS header
@@ -7643,20 +8427,24 @@ j-lsolutions.com: could not connect to host
j-rickroll-a.pw: could not connect to host
j0ng.xyz: could not connect to host
j15t98j.co.uk: did not receive HSTS header
+j2ee.cz: could not connect to host
j8y.de: did not receive HSTS header
+ja-publications.agency: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
ja-publications.com: did not receive HSTS header
jaan.su: could not connect to host
jaaxypro.com: could not connect to host
+jability.ovh: did not receive HSTS header
jackalworks.com: could not connect to host
-jackdoan.com: did not receive HSTS header
+jackdoan.com: could not connect to host
jackfahnestock.com: could not connect to host
jackops.com: could not connect to host
+jackrusselterrier.com.br: could not connect to host
+jackyyf.com: could not connect to host
jacobparry.ca: max-age too low: 0
-jacobsenarquitetura.com: max-age too low: 5184000
-jadopado.com: could not connect to host
+jadara.info: could not connect to host
jaepinformatica.com: did not receive HSTS header
jagido.de: did not receive HSTS header
-jahliveradio.com: could not connect to host
+jahliveradio.com: did not receive HSTS header
jaimechanaga.com: could not connect to host
jaion.ml: could not connect to host
jak-na-les.cz: could not connect to host
@@ -7665,13 +8453,12 @@ jakewalker.xyz: did not receive HSTS header
jakincode.army: could not connect to host
jaksel.id: could not connect to host
jaksi.io: did not receive HSTS header
-jakubtopic.cz: could not connect to host
jamanji.com.ng: could not connect to host
jamaware.org: did not receive HSTS header
+jamberry.com.mx: could not connect to host
james-parker.com: did not receive HSTS header
james.je: could not connect to host
jamesandanneke.com: did not receive HSTS header
-jamesandpame.la: could not connect to host
jamesbradach.com: did not receive HSTS header
jamesburton.london: could not connect to host
jamesbywater.co.uk: could not connect to host
@@ -7688,11 +8475,12 @@ jamesf.xyz: could not connect to host
jamesforman.co.nz: did not receive HSTS header
jameshale.me: did not receive HSTS header
jamesheald.com: could not connect to host
-jamesl.ml: could not connect to host
+jamesl.ml: did not receive HSTS header
jamesmaurer.com: did not receive HSTS header
jamesrains.com: could not connect to host
jameswarp.com: could not connect to host
jami.am: max-age too low: 0
+jamie-read-photography.com: could not connect to host
jamourtney.com: could not connect to host
jamyeprice.com: did not receive HSTS header
jan-cermak.cz: did not receive HSTS header
@@ -7706,19 +8494,17 @@ janheidler.dynv6.net: could not connect to host
janhermann.cz: did not receive HSTS header
janking.de: could not connect to host
janmachynka.cz: could not connect to host
+janmg.com: could not connect to host
janosh.com: did not receive HSTS header
janssen.fm: could not connect to host
janus-engineering.de: did not receive HSTS header
-janvari.com: could not connect to host
-janvaribalint.com: could not connect to host
janverlaan.nl: did not receive HSTS header
jap-nope.de: did not receive HSTS header
-japan4you.org: could not connect to host
+japan4you.org: did not receive HSTS header
japanbaths.com: could not connect to host
japaneseemoticons.org: did not receive HSTS header
japanesenames.biz: did not receive HSTS header
japangids.nl: could not connect to host
-japanphilosophy.com: did not receive HSTS header
japansm.com: could not connect to host
japanwide.net: could not connect to host
japaripark.com: could not connect to host
@@ -7727,20 +8513,24 @@ japlex.com: could not connect to host
jaqen.ch: could not connect to host
jardins-utopie.net: could not connect to host
jaredbates.net: did not receive HSTS header
+jaredfraser.com: could not connect to host
+jarivisual.com: did not receive HSTS header
jarl.ninja: could not connect to host
jarnail.ca: could not connect to host
jaroslavtrsek.cz: did not receive HSTS header
jarrodcastaing.com: did not receive HSTS header
jarrodcastaing.com.au: did not receive HSTS header
-jarsater.com: could not connect to host
+jarsater.com: did not receive HSTS header
jartza.org: could not connect to host
jasmineconseil.com: did not receive HSTS header
jasoncosper.com: did not receive HSTS header
+jasonian-photo.com: could not connect to host
jasonradin.com: did not receive HSTS header
jasonrobinson.me: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
jasonroe.me: did not receive HSTS header
jasonwindholz.com: could not connect to host
jastoria.pl: did not receive HSTS header
+jastrow.me: did not receive HSTS header
jateng.press: could not connect to host
jav-collective.com: could not connect to host
java-board.com: could not connect to host
@@ -7755,7 +8545,6 @@ jawn.ca: could not connect to host
jawnelodzkie.org.pl: could not connect to host
jaxageto.de: did not receive HSTS header
jayblock.com: did not receive HSTS header
-jayf.de: could not connect to host
jayharris.ca: could not connect to host
jaylen.com.ar: did not receive HSTS header
jayna.design: did not receive HSTS header
@@ -7766,7 +8555,7 @@ jazzinutrecht.info: could not connect to host
jballelectronics.com: did not receive HSTS header
jbelien.be: did not receive HSTS header
jbelien.photography: did not receive HSTS header
-jbfp.dk: could not connect to host
+jbfp.dk: did not receive HSTS header
jbj.co.uk: did not receive HSTS header
jbn.mx: could not connect to host
jbrowndesign.me: did not receive HSTS header
@@ -7779,7 +8568,7 @@ jcolideles.com: could not connect to host
jcom-communication-system.biz: could not connect to host
jcor.me: could not connect to host
jcoscia.com: could not connect to host
-jcraft.us: could not connect to host
+jcraft.us: did not receive HSTS header
jctf.io: could not connect to host
jcyz.cf: could not connect to host
jdav-leipzig.de: could not connect to host
@@ -7791,19 +8580,18 @@ jdsf.tk: did not receive HSTS header
jean-remy.ch: could not connect to host
jebengotai.com: did not receive HSTS header
jecho.cn: could not connect to host
-jeepeg.com: did not receive HSTS header
+jeepmafia.com: did not receive HSTS header
jeff.forsale: could not connect to host
jeff.is: did not receive HSTS header
jeff393.com: could not connect to host
jeffersonregan.org: could not connect to host
jeffhuxley.com: could not connect to host
jeffreymagee.com: did not receive HSTS header
-jehovahsays.net: could not connect to host
jeil-makes.co.kr: could not connect to host
-jekkt.com: max-age too low: 604800
jellow.nl: did not receive HSTS header
jemoticons.com: did not receive HSTS header
jenjoit.de: could not connect to host
+jennedebleser.com: did not receive HSTS header
jenniferchan.id.au: could not connect to host
jennifercherniack.com: did not receive HSTS header
jennybeaned.com: did not receive HSTS header
@@ -7814,9 +8602,9 @@ jenssen.org: did not receive HSTS header
jeremyc.ca: could not connect to host
jeremye77.com: did not receive HSTS header
jeremymade.com: did not receive HSTS header
-jeremyness.com: could not connect to host
jeremywagner.me: did not receive HSTS header
jermann.biz: did not receive HSTS header
+jeroenensanne.wedding: could not connect to host
jeroenvanderwal.nl: did not receive HSTS header
jeroldirvin.com: did not receive HSTS header
jerrypau.ca: could not connect to host
@@ -7825,8 +8613,11 @@ jessevictors.com: could not connect to host
jessicah.org: could not connect to host
jesuisformidable.nl: could not connect to host
jesuslucas.com: did not receive HSTS header
-jet-code.com: could not connect to host
+jet-code.com: did not receive HSTS header
+jetbrains.pw: could not connect to host
+jetflex.de: did not receive HSTS header
jetlagphotography.com: could not connect to host
+jetmirshatri.com: did not receive HSTS header
jeton.com: did not receive HSTS header
jetsetcharge.com: could not connect to host
jetsetpay.com: could not connect to host
@@ -7838,8 +8629,8 @@ jewellerydesignstore.com: could not connect to host
jewellerymarvels.com: did not receive HSTS header
jez.nl: could not connect to host
jfmel.com: did not receive HSTS header
-jfmhero.me: could not connect to host
-jfnllc.com: did not receive HSTS header
+jfnllc.com: could not connect to host
+jfr.im: did not receive HSTS header
jfx.space: did not receive HSTS header
jh-media.eu: could not connect to host
jhburton.co.uk: could not connect to host
@@ -7847,6 +8638,7 @@ jhburton.uk: could not connect to host
jhcommunitysports.co.uk: could not connect to host
jhejderup.me: could not connect to host
jhermsmeier.de: could not connect to host
+jhw-profiles.de: did not receive HSTS header
jia1hao.com: could not connect to host
jiaidu.com: could not connect to host
jiangzequn.com: could not connect to host
@@ -7862,8 +8654,10 @@ jimas.eu: did not receive HSTS header
jimenacocina.com: did not receive HSTS header
jimgao.tk: did not receive HSTS header
jimmehcai.com: could not connect to host
-jimmycai.org: could not connect to host
jimmynelson.com: did not receive HSTS header
+jinancy.fr: could not connect to host
+jing-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+jing-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
jingyuesi.com: could not connect to host
jinliming.ml: could not connect to host
jinmaguoji.com: could not connect to host
@@ -7873,6 +8667,7 @@ jirav.io: could not connect to host
jisaku-homepage.com: did not receive HSTS header
jitlab.org: could not connect to host
jitsi.org: did not receive HSTS header
+jiveiaktivno.bg: did not receive HSTS header
jiyue.com: did not receive HSTS header
jiyuu-ni.com: could not connect to host
jiyuu-ni.net: could not connect to host
@@ -7880,15 +8675,17 @@ jjf.org.au: did not receive HSTS header
jka.io: did not receive HSTS header
jkb.pics: could not connect to host
jkbuster.com: could not connect to host
+jkinteriorspa.com: could not connect to host
jkng.eu: could not connect to host
jko.works: could not connect to host
jkuvw.xyz: could not connect to host
+jldp.org: did not receive HSTS header
jlhmedia.com: did not receive HSTS header
jm06.com: did not receive HSTS header
jm22.com: could not connect to host
jmb.lc: could not connect to host
-jmdekker.it: could not connect to host
-jmssg.jp: could not connect to host
+jmotion.co.uk: did not receive HSTS header
+jmpmotorsport.co.uk: did not receive HSTS header
jmvbmx.ch: could not connect to host
jn1.me: did not receive HSTS header
jncde.de: did not receive HSTS header
@@ -7897,10 +8694,10 @@ jncie.eu: did not receive HSTS header
jncip.de: did not receive HSTS header
joakimalgroy.com: could not connect to host
joaquimgoliveira.pt: could not connect to host
-job-offer.de: could not connect to host
jobers.ch: did not receive HSTS header
jobers.pt: did not receive HSTS header
jobflyapp.com: could not connect to host
+joblab.com.ua: did not receive HSTS header
jobmedic.com: could not connect to host
jobshq.com: did not receive HSTS header
jobss.co.uk: did not receive HSTS header
@@ -7914,10 +8711,11 @@ jodel.ninja: could not connect to host
joe-pagan.com: did not receive HSTS header
joearodriguez.com: could not connect to host
joecod.es: could not connect to host
+joefixit.co.uk: could not connect to host
joelgonewild.com: did not receive HSTS header
-joepitt.co.uk: could not connect to host
joerg-wellpott.de: did not receive HSTS header
joetyson.io: could not connect to host
+johand.io: could not connect to host
johannaojanen.com: could not connect to host
johannes-bugenhagen.de: did not receive HSTS header
johannes-sprink.de: could not connect to host
@@ -7926,13 +8724,15 @@ johncardell.com: did not receive HSTS header
johners.me: could not connect to host
johngaltgroup.com: did not receive HSTS header
johnhgaunt.com: did not receive HSTS header
+johnmorganpartnership.co.uk: did not receive HSTS header
johnrom.com: could not connect to host
-johnsanchez.io: could not connect to host
+johnsiu.com: did not receive HSTS header
johntomasowa.com: could not connect to host
johnverkerk.com: could not connect to host
joinamericacorps.gov: could not connect to host
jointoweb.com: could not connect to host
-jomp16.tk: could not connect to host
+jomp16.tk: did not receive HSTS header
+jonarcher.info: did not receive HSTS header
jonas-keidel.de: did not receive HSTS header
jonasgroth.se: did not receive HSTS header
jonathan.ir: could not connect to host
@@ -7940,19 +8740,17 @@ jonathandowning.uk: did not receive HSTS header
jonathanmassacand.ch: could not connect to host
jonathanreyes.com: did not receive HSTS header
jonathansanchez.pro: could not connect to host
-jonathanselea.se: did not receive HSTS header
+jonesopolis.xyz: could not connect to host
jonfor.net: could not connect to host
jongha.me: could not connect to host
jonn.me: could not connect to host
jonnichols.info: could not connect to host
-jons.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
jonsno.ws: could not connect to host
joostbovee.nl: did not receive HSTS header
jooto.com: did not receive HSTS header
jopl.org: did not receive HSTS header
jordankirby.co.uk: could not connect to host
jordanp.engineer: could not connect to host
-jordanscorporatelaw.com: could not connect to host
jordanstrustcompany.cn: could not connect to host
jordanstrustcompany.ru: could not connect to host
jordikroon.nl: could not connect to host
@@ -7962,23 +8760,24 @@ jornadasciberdefensa2016.es: could not connect to host
jorovik.com: did not receive HSTS header
jorrit.info: max-age too low: 0
josahrens.me: could not connect to host
-josc.com.au: could not connect to host
+jose.eti.br: did not receive HSTS header
joseaveleira.es: did not receive HSTS header
josecage.com: could not connect to host
-josephre.es: did not receive HSTS header
+josericaurte.com: could not connect to host
joshi.su: could not connect to host
joshplant.co.uk: could not connect to host
joshstroup.me: could not connect to host
joto.de: did not receive HSTS header
jotpics.com: could not connect to host
jottit.com: could not connect to host
+jouetspetitechanson.com: could not connect to host
journalof.tech: could not connect to host
joworld.net: could not connect to host
joyceclerkx.com: could not connect to host
joyjohnston.ca: did not receive HSTS header
joyqi.com: did not receive HSTS header
jpaglier.com: could not connect to host
-jpbike.cz: did not receive HSTS header
+jpbike.cz: could not connect to host
jpcrochetapparel.com: could not connect to host
jpeaches.xyz: could not connect to host
jpgangbang.com: could not connect to host
@@ -7987,30 +8786,30 @@ jptun.com: could not connect to host
jrgold.me: could not connect to host
jrmd.io: could not connect to host
jrvar.com: did not receive HSTS header
-js88.sg: did not receive HSTS header
+js88.sg: could not connect to host
jsanders.us: did not receive HSTS header
jsbentertainment.nl: could not connect to host
jsbevents.nl: could not connect to host
jsblights.nl: could not connect to host
-jsc7776.com: did not receive HSTS header
+jsc7776.com: could not connect to host
+jsdelivr.net: could not connect to host
jsg-technologies.de: did not receive HSTS header
jsjyhzy.cc: could not connect to host
+jslidong.top: did not receive HSTS header
json-viewer.com: did not receive HSTS header
jstelecom.com.br: did not receive HSTS header
+jsuse.xyz: did not receive HSTS header
jsvr.tk: could not connect to host
+jsxc.ch: could not connect to host
ju1ro.de: did not receive HSTS header
jualautoclave.com: did not receive HSTS header
jualssh.com: could not connect to host
juandesouza.com: did not receive HSTS header
juanhub.com: did not receive HSTS header
juchheim-methode.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-judge2020.com: did not receive HSTS header
juiced.gs: did not receive HSTS header
juka.pp.ua: could not connect to host
-juku-info.top: did not receive HSTS header
-juku-wing.jp: could not connect to host
julegoerke.de: did not receive HSTS header
-julenlanda.com: could not connect to host
juliamweber.de: could not connect to host
julian-kipka.de: did not receive HSTS header
julian-witusch.de: could not connect to host
@@ -8037,23 +8836,25 @@ junglegoat.xyz: did not receive HSTS header
juniwalk.cz: could not connect to host
junjung.me: could not connect to host
junqtion.com: could not connect to host
-jupp0r.de: could not connect to host
+jupp0r.de: did not receive HSTS header
juridiqueo.com: did not receive HSTS header
juristeo.com: did not receive HSTS header
jurke.com: did not receive HSTS header
jurko.cz: could not connect to host
+jurriaan.ninja: could not connect to host
just-english.online: did not receive HSTS header
just-pools.co.za: could not connect to host
just2trade.com: did not receive HSTS header
+justbelieverecovery.com: did not receive HSTS header
justiceforfathers.com: did not receive HSTS header
justiceo.org: did not receive HSTS header
+justinellingwood.com: could not connect to host
justinlemay.com: could not connect to host
justinrudio.com: did not receive HSTS header
justlikethat.hosting: did not receive HSTS header
justmy.website: did not receive HSTS header
justnaw.co.uk: could not connect to host
justonce.net: could not connect to host
-justsome.info: did not receive HSTS header
justudin.com: did not receive HSTS header
justwood.cz: did not receive HSTS header
justzz.xyz: could not connect to host
@@ -8062,45 +8863,45 @@ juul.xyz: could not connect to host
juventusclublugano.ch: could not connect to host
juventusmania1897.com: could not connect to host
juwairen.cn: could not connect to host
+juzgalo.com: did not receive HSTS header
jvn.com: did not receive HSTS header
jvoice.net: did not receive HSTS header
jwilsson.me: could not connect to host
jwolt-lx.com: could not connect to host
jxir.de: could not connect to host
-jxm.in: could not connect to host
jysperm.me: did not receive HSTS header
jznet.org: could not connect to host
k-dev.de: could not connect to host
k-rickroll-g.pw: could not connect to host
-k-wallet.com: did not receive HSTS header
+k-wallet.com: could not connect to host
k1cp.com: could not connect to host
-k33k00.com: could not connect to host
+k33k00.com: did not receive HSTS header
k38.cc: max-age too low: 3600
ka-clan.com: could not connect to host
kaanduman.com: could not connect to host
kaasbijwijn.nl: did not receive HSTS header
-kabarlinux.id: could not connect to host
-kabat-fans.cz: did not receive HSTS header
kabinapp.com: did not receive HSTS header
kabuabc.com: could not connect to host
-kabus.org: could not connect to host
kackscharf.de: could not connect to host
kadioglumakina.com.tr: did not receive HSTS header
kadmec.com: did not receive HSTS header
kaela.design: could not connect to host
kahopoon.net: could not connect to host
kai.cool: did not receive HSTS header
+kaibol.com: could not connect to host
kaika-facilitymanagement.de: could not connect to host
kaika-hms.de: did not receive HSTS header
kainetsoft.com: could not connect to host
kainz.bayern: could not connect to host
kainz.be: could not connect to host
-kaisers.de: did not receive HSTS header
+kaisers.de: could not connect to host
kaiyuewu.com: could not connect to host
kajlovo.cz: could not connect to host
kakaomilchkuh.de: did not receive HSTS header
kaketalk.com: did not receive HSTS header
-kakie-gobocha.jp: could not connect to host
+kakoo-media.nl: could not connect to host
+kakoo.nl: could not connect to host
+kakoomedia.nl: could not connect to host
kalami.nl: could not connect to host
kaleidomarketing.com: did not receive HSTS header
kaleidoskop-freiburg.de: did not receive HSTS header
@@ -8108,17 +8909,17 @@ kalender.goip.de: could not connect to host
kalilinux.tech: could not connect to host
kaloix.de: could not connect to host
kamalame.co: could not connect to host
+kamatajisyaku.tokyo.jp: did not receive HSTS header
kambodja.guide: could not connect to host
kamcvicit.sk: could not connect to host
kamikano.com: could not connect to host
-kamikatse.net: could not connect to host
kamitech.ch: could not connect to host
-kanaanonline.org: max-age too low: 86400
kanada.guide: could not connect to host
kanagawachuo-hospital.jp: did not receive HSTS header
kanar.nl: could not connect to host
kancolle.me: could not connect to host
kandec.co.jp: did not receive HSTS header
+kaneisdi.com: did not receive HSTS header
kaneo-gmbh.de: did not receive HSTS header
kanganer.com: could not connect to host
kangzaber.com: could not connect to host
@@ -8129,27 +8930,29 @@ kanr.in: could not connect to host
kanscooking.org: could not connect to host
kantorad.io: could not connect to host
kantv1.com: could not connect to host
-kanuvu.de: did not receive HSTS header
+kanzakiranko.jp: could not connect to host
kanzlei-wirtschaftsrecht.berlin: max-age too low: 600000
+kanzshop.com: could not connect to host
kaohub.com: could not connect to host
kaomojis.net: did not receive HSTS header
-kaotik4266.com: could not connect to host
+kapiorr.duckdns.org: could not connect to host
kaplatz.is: could not connect to host
kapo.info: could not connect to host
kappit.dk: could not connect to host
kapucini.si: max-age too low: 0
kaputt.com: could not connect to host
kapverde.guide: could not connect to host
+karaface.com: could not connect to host
karamna.com: could not connect to host
-karamomo.net: did not receive HSTS header
karanastic.com: did not receive HSTS header
+karanlyons.com: could not connect to host
karaoketonight.com: could not connect to host
-karatekit.co.uk: did not receive HSTS header
-karatorian.org: did not receive HSTS header
+karatekit.co.uk: could not connect to host
+karatorian.org: could not connect to host
karjala-ski.ru: could not connect to host
+karlis-kavacis.id.lv: did not receive HSTS header
karloskontana.tk: could not connect to host
karlproctor.co.uk: could not connect to host
-karmaflux.com: did not receive HSTS header
karpanhellas.com: could not connect to host
kars.ooo: could not connect to host
karting34.com: did not receive HSTS header
@@ -8158,7 +8961,7 @@ kashdash.ca: could not connect to host
kashis.com.au: max-age too low: 0
kat.al: max-age too low: 0
katalogakci.cz: did not receive HSTS header
-katata-kango.ac.jp: could not connect to host
+kathrinbaumannphotography.com: did not receive HSTS header
kati0.com: could not connect to host
katiaetdavid.fr: could not connect to host
katja-nikolic-design.de: could not connect to host
@@ -8173,23 +8976,28 @@ kattenfun.be: did not receive HSTS header
kattenfun.nl: did not receive HSTS header
katthewaffle.fr: could not connect to host
katzen.me: could not connect to host
+katzspeech.com: did not receive HSTS header
kaufkraftkiel.de: could not connect to host
kauperwood.ovh: could not connect to host
kauplusprofesional.com: did not receive HSTS header
kausch.at: could not connect to host
kausta.me: could not connect to host
+kavik.no: could not connect to host
kavinvin.me: could not connect to host
-kawaiii.link: did not receive HSTS header
kawaiiku.com: could not connect to host
kawaiiku.de: could not connect to host
kaydan.io: could not connect to host
kayipmurekkep.com: could not connect to host
+kayleen.net: could not connect to host
kayon.cf: could not connect to host
-kazamasion.com: did not receive HSTS header
+kaysis.gov.tr: did not receive HSTS header
+kazamasion.com: could not connect to host
kazanasolutions.de: could not connect to host
kazenojiyu.fr: did not receive HSTS header
+kb3.net: did not receive HSTS header
kbfl.org: could not connect to host
kcluster.io: could not connect to host
+kcptun.com: could not connect to host
kd-plus.pp.ua: could not connect to host
kdata.it: did not receive HSTS header
kdbx.online: could not connect to host
@@ -8207,6 +9015,7 @@ keepflow.io: could not connect to host
keepmanager.org: could not connect to host
keeprubyweird.com: did not receive HSTS header
kefaloniatoday.com: did not receive HSTS header
+keifel.de: could not connect to host
keihin-chaplin.jp: did not receive HSTS header
kein-fidget-spinner-werden.de: could not connect to host
kejibot.com: could not connect to host
@@ -8223,19 +9032,21 @@ kenderhaz-magyarorszag.hu: did not receive HSTS header
kenderhazmagyarorszag.hu: did not receive HSTS header
kenkoelectric.com: did not receive HSTS header
kenman.dk: max-age too low: 2592000
+kennedy.ie: could not connect to host
kennynet.co.uk: could not connect to host
-kensparkesphotography.com: did not receive HSTS header
kentacademiestrust.org.uk: did not receive HSTS header
kepler-seminar.de: did not receive HSTS header
+keralit.nl: did not receive HSTS header
kerangalam.com: did not receive HSTS header
kerksanders.nl: could not connect to host
kermadec.blog: could not connect to host
kermadec.com: did not receive HSTS header
-kermadec.net: could not connect to host
+kermadec.net: did not receive HSTS header
kernelmode.io: did not receive HSTS header
kernl.us: did not receive HSTS header
keshausconsulting.com: could not connect to host
keskeces.com: did not receive HSTS header
+kessel-runners.com: could not connect to host
kesteren.com: could not connect to host
kevindekoninck.com: could not connect to host
kevinfoley.cc: could not connect to host
@@ -8245,7 +9056,6 @@ kevinroebert.de: did not receive HSTS header
kevlar.pw: did not receive HSTS header
kewego.co.uk: could not connect to host
keymaster.lookout.com: did not receive HSTS header
-keys.jp: could not connect to host
keyserver.sexy: could not connect to host
kfbrussels.be: could not connect to host
kg-rating.com: could not connect to host
@@ -8258,22 +9068,25 @@ khosla.uk: could not connect to host
ki-on.net: did not receive HSTS header
kiaka.co: could not connect to host
kialo.com: did not receive HSTS header
+kiapps.ovh: could not connect to host
kickass-proxies.org: could not connect to host
kickass.al: could not connect to host
kickasstorrents.gq: could not connect to host
kickerplaza.nl: did not receive HSTS header
-kid-dachau.de: did not receive HSTS header
+kid-dachau.de: max-age too low: 600000
kidbacker.com: could not connect to host
kidkat.cn: could not connect to host
-kiehls.pt: could not connect to host
+kidswallstickers.com.au: could not connect to host
kiel-media.de: did not receive HSTS header
kielderweather.org.uk: did not receive HSTS header
+kielwi.gov: could not connect to host
kienlen.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
kieranweightman.me: could not connect to host
kiesuwcursus.nl: did not receive HSTS header
kievradio.com: could not connect to host
kikuzuki.org: could not connect to host
kiladera.be: did not receive HSTS header
+kilerd.me: did not receive HSTS header
kill-paff.com: did not receive HSTS header
kimana.pe: could not connect to host
kimberg.co.uk: did not receive HSTS header
@@ -8282,7 +9095,7 @@ kimpost.org: could not connect to host
kimscrazeecastles.co.uk: did not receive HSTS header
kina.guide: could not connect to host
kinderbuecher-kostenlos.de: did not receive HSTS header
-kinderjugendfreizeitverein.de: could not connect to host
+kinderjugendfreizeitverein.de: did not receive HSTS header
kinderly.co.uk: did not receive HSTS header
kinderopvangengeltjes.nl: did not receive HSTS header
kinderopvangzevenbergen.nl: did not receive HSTS header
@@ -8291,6 +9104,7 @@ kindlyfire.com: could not connect to host
kindof.ninja: could not connect to host
kinepolis-studio.ga: could not connect to host
kineto.space: could not connect to host
+king-henris-castles.co.uk: did not receive HSTS header
kingclass.cn: could not connect to host
kingdomcrc.org: did not receive HSTS header
kingmanhall.org: could not connect to host
@@ -8298,7 +9112,9 @@ kingpincages.com: could not connect to host
kingqueen.org.uk: did not receive HSTS header
kinkdr.com: could not connect to host
kinmunity.com: did not receive HSTS header
+kinnettmemorial.org: did not receive HSTS header
kinnon.enterprises: could not connect to host
+kinomoto.me: could not connect to host
kinow.com: did not receive HSTS header
kinsmenhomelottery.com: did not receive HSTS header
kintoandar.com: max-age too low: 0
@@ -8308,17 +9124,16 @@ kionetworks.com: did not receive HSTS header
kipin.fr: did not receive HSTS header
kipira.com: could not connect to host
kiraboshi.xyz: could not connect to host
-kirainmoe.com: did not receive HSTS header
kirara.eu: could not connect to host
-kirill.ws: could not connect to host
-kirito.kr: could not connect to host
+kircp.com: could not connect to host
+kirito.kr: did not receive HSTS header
kirkforsenate.com: could not connect to host
kirkpatrickdavis.com: could not connect to host
-kirstin-peters.de: could not connect to host
kisa.io: could not connect to host
-kisalt.im: could not connect to host
kiss-register.org: could not connect to host
kissart.net: could not connect to host
+kissesb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+kissesb.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
kisskiss.ch: could not connect to host
kisstyle.ru: did not receive HSTS header
kisun.co.jp: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -8329,6 +9144,7 @@ kitashop.com.br: could not connect to host
kitatec.com.br: could not connect to host
kitchenaccessories.pro: did not receive HSTS header
kitchenchaos.de: could not connect to host
+kitegarage.eu: did not receive HSTS header
kiteschoolamsterdam.nl: could not connect to host
kitestar.co.uk: did not receive HSTS header
kitk.at: could not connect to host
@@ -8339,11 +9155,11 @@ kiwiirc.com: max-age too low: 5256000
kiwipayment.com: could not connect to host
kiwipayments.com: could not connect to host
kiwiplace.com: could not connect to host
-kix.moe: did not receive HSTS header
kiyo.space: could not connect to host
kizil.net: could not connect to host
kj1391.com: did not receive HSTS header
kjaermaxi.me: did not receive HSTS header
+kjellvn.net: could not connect to host
kjg-bachrain.de: could not connect to host
kjoglum.me: could not connect to host
kkull.tv: could not connect to host
@@ -8369,6 +9185,7 @@ klingsundet.no: did not receive HSTS header
kliqsd.com: could not connect to host
kloentrup.de: max-age too low: 604800
klunkergarten.org: could not connect to host
+km-net.pl: did not receive HSTS header
kmdev.me: did not receive HSTS header
knapen.io: max-age too low: 604800
knccloud.com: could not connect to host
@@ -8376,29 +9193,30 @@ kngk-azs.ru: could not connect to host
knigadel.com: did not receive HSTS header
knightsbridgegroup.org: could not connect to host
knightsweep.com: could not connect to host
+kniwweler.com: could not connect to host
knnet.ch: could not connect to host
knowdebt.org: did not receive HSTS header
knowledgehook.com: did not receive HSTS header
knowledgesnap.com: could not connect to host
knowledgesnapsites.com: could not connect to host
knownsec.cf: could not connect to host
+knuckles.tk: could not connect to host
kobieta.guru: could not connect to host
-kobolya.hu: did not receive HSTS header
koddsson.com: did not receive HSTS header
kodexplorer.ml: could not connect to host
kodiaklabs.org: could not connect to host
kodokushi.fr: could not connect to host
koen.io: max-age too low: 86400
-koenrouwhorst.nl: did not receive HSTS header
koenvdheuvel.me: could not connect to host
+koerper-wie-seele.de: did not receive HSTS header
koerperimpuls.ch: did not receive HSTS header
koez-mangal.ch: could not connect to host
koezmangal.ch: could not connect to host
koik.io: could not connect to host
koirala.net: could not connect to host
kokenmetaanbiedingen.nl: could not connect to host
+kokoiroworks.com: could not connect to host
kola-entertainments.de: did not receive HSTS header
-kolaykaydet.com: could not connect to host
kolbeck.tk: could not connect to host
kollawat.me: could not connect to host
kolozsvaricsuhe.hu: did not receive HSTS header
@@ -8407,7 +9225,7 @@ kompetenzwerft.de: did not receive HSTS header
konata.us: could not connect to host
kongbaofang.com: could not connect to host
konicaprinterdriver.com: could not connect to host
-koningskwartiertje.nl: could not connect to host
+konings.it: could not connect to host
konkurs.ba: could not connect to host
kontakthuman.hu: did not receive HSTS header
kontaxis.network: could not connect to host
@@ -8415,10 +9233,10 @@ konventseliten.se: could not connect to host
koop-bremen.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
koopjesnel.nl: could not connect to host
koordinate.net: could not connect to host
+kopio.jp: did not receive HSTS header
koppelvlak.net: could not connect to host
kopular.com: could not connect to host
kori.ml: did not receive HSTS header
-korinar.com: could not connect to host
koriyoukai.net: did not receive HSTS header
kornersafe.com: did not receive HSTS header
korni22.org: could not connect to host
@@ -8430,22 +9248,28 @@ kotomei.moe: could not connect to host
kotonehoko.net: could not connect to host
kotorimusic.ga: could not connect to host
kotovstyle.ru: could not connect to host
+kottur.is: could not connect to host
koukni.cz: did not receive HSTS header
kourpe.online: could not connect to host
kousaku.jp: could not connect to host
-kouten-jp.com: could not connect to host
kozmik.co: could not connect to host
kpdyer.com: did not receive HSTS header
kpebetka.net: did not receive HSTS header
kpn-dnssec.com: could not connect to host
kprog.net: did not receive HSTS header
kpvpn.com: could not connect to host
+kradalby.no: could not connect to host
kraigwalker.com: could not connect to host
+kraiwan.com: did not receive HSTS header
+krampus-fischamend.at: could not connect to host
krasavchik.by: could not connect to host
krasota.ru: did not receive HSTS header
-krausen.ca: did not receive HSTS header
+krausen.ca: could not connect to host
+krausoft.hu: could not connect to host
kravelindo-adventure.com: could not connect to host
+kraynik.com: could not connect to host
krayx.com: could not connect to host
+krc.link: could not connect to host
kream.io: did not receive HSTS header
kreavis.com: did not receive HSTS header
kreb.io: could not connect to host
@@ -8464,6 +9288,9 @@ krizek.cc: did not receive HSTS header
krizevackapajdasija.hr: could not connect to host
krmela.com: did not receive HSTS header
kroetenfuchs.de: could not connect to host
+krokodent.de: did not receive HSTS header
+kromonos.net: could not connect to host
+kronych.cz: could not connect to host
kroodle.nl: did not receive HSTS header
krouzkyliduska.cz: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
kruegerrand-wert.de: did not receive HSTS header
@@ -8471,24 +9298,21 @@ krunut.com: did not receive HSTS header
kryha.io: did not receive HSTS header
krypteia.org: could not connect to host
kryptomech.com: could not connect to host
+ks88.com: could not connect to host
ksfh-mail.de: could not connect to host
-ksham.net: could not connect to host
ksk-agentur.de: did not receive HSTS header
kstan.me: could not connect to host
kswcosmetics.com: could not connect to host
kswriter.com: could not connect to host
kteen.info: did not receive HSTS header
ktube.yt: could not connect to host
-ku.io: did not receive HSTS header
kuba.guide: could not connect to host
kubiwa.net: could not connect to host
kubusadvocaten.nl: could not connect to host
-kuchenfeelisa.de: did not receive HSTS header
kuchenschock.de: did not receive HSTS header
-kucheryavenkovn.ru: could not connect to host
+kucheryavenkovn.ru: did not receive HSTS header
kucom.it: did not receive HSTS header
kuechenplan.online: could not connect to host
-kuehnel.org: max-age too low: 604800
kueulangtahunanak.net: could not connect to host
kuko-crews.org: could not connect to host
kultmobil.se: did not receive HSTS header
@@ -8496,6 +9320,7 @@ kum.com: could not connect to host
kummerlaender.eu: did not receive HSTS header
kundenerreichen.com: did not receive HSTS header
kundenerreichen.de: did not receive HSTS header
+kundo.se: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
kunstschule-krabax.de: did not receive HSTS header
kuops.com: did not receive HSTS header
kupdokuchyne.cz: could not connect to host
@@ -8507,17 +9332,20 @@ kuro346.moe: could not connect to host
kuroisalva.xyz: did not receive HSTS header
kursprogramisty.pl: could not connect to host
kurtmclester.com: could not connect to host
+kurumi.io: did not receive HSTS header
kurz.pw: could not connect to host
kurzonline.com.br: could not connect to host
kuwago.io: could not connect to host
+kuzdrowiu24.pl: did not receive HSTS header
kvt.berlin: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-kweddingplanning.com: max-age too low: 0
kwidz.fr: did not receive HSTS header
kwikmed.eu: could not connect to host
+kwiknews.com: did not receive HSTS header
kwipi.com: did not receive HSTS header
+kwmr.me: did not receive HSTS header
kwok.tv: could not connect to host
kwondratsch.com: could not connect to host
-kxind.cn: did not receive HSTS header
+kxind.cn: could not connect to host
kyanite.co: could not connect to host
kyberna.xyz: could not connect to host
kykoonn.net: did not receive HSTS header
@@ -8525,12 +9353,13 @@ kylapps.com: did not receive HSTS header
kyle.place: could not connect to host
kylebaldw.in: did not receive HSTS header
kylerwood.com: could not connect to host
+kyliehunt.com: did not receive HSTS header
kylling.io: could not connect to host
kymo.org: did not receive HSTS header
+kynaston.org.uk: could not connect to host
kyochon.fr: could not connect to host
kyonagashima.com: did not receive HSTS header
-kyoto-k9.com: could not connect to host
-kyoto-tomoshibi.jp: could not connect to host
+kyoto-k9.com: did not receive HSTS header
kyouko.nl: could not connect to host
kyujin-office.net: could not connect to host
kzjnet.com: could not connect to host
@@ -8543,12 +9372,12 @@ la-grande-jaugue.fr: did not receive HSTS header
la-retraite-info.com: did not receive HSTS header
la-serendipite.fr: did not receive HSTS header
labaia.info: could not connect to host
+laballoons.com: max-age too low: 7889238
labella-umbrella.com: did not receive HSTS header
labelleza.com.br: could not connect to host
labfox.de: did not receive HSTS header
labiblioafronebrulepas.com: could not connect to host
labina.com.tr: did not receive HSTS header
-labms.com.au: could not connect to host
laboiteanem.fr: could not connect to host
laboiteapc.fr: did not receive HSTS header
labordata.io: could not connect to host
@@ -8558,7 +9387,7 @@ laboutiquemarocaineduconvoyeur.ma: could not connect to host
labrador-retrievers.com.au: did not receive HSTS header
labrasaq8.com: did not receive HSTS header
labs.directory: could not connect to host
-labs.moscow: could not connect to host
+labs.moscow: did not receive HSTS header
lacarpesaintaubinoise.fr: did not receive HSTS header
lacasa.fr: could not connect to host
lacasseroy.com: could not connect to host
@@ -8567,16 +9396,20 @@ lacentral.com: did not receive HSTS header
lacledeslan.ninja: could not connect to host
lacuevadechauvet.com: did not receive HSTS header
ladadate.com: could not connect to host
+ladybugjam.com: could not connect to host
ladylikeit.com: could not connect to host
ladylucks.co.uk: could not connect to host
laemen.com: did not receive HSTS header
laemen.nl: could not connect to host
laf.in.net: could not connect to host
lafamillemusique.fr: did not receive HSTS header
+lafeemam.fr: could not connect to host
laforetenchantee.ch: could not connect to host
lafr4nc3.xyz: could not connect to host
lagalerievirtuelle.fr: did not receive HSTS header
lagier.xyz: could not connect to host
+laglab.org: could not connect to host
+lagodny.eu: could not connect to host
lagoza.name: could not connect to host
laharilais.fr: did not receive HSTS header
lainchan.org: did not receive HSTS header
@@ -8588,6 +9421,7 @@ lakehavasuhouserentals.com: could not connect to host
lakewoodcomputerservices.com: could not connect to host
lalajj.com: could not connect to host
laltroweb.it: did not receive HSTS header
+lamafioso.com: could not connect to host
lamaland.ru: did not receive HSTS header
lambda-complex.org: could not connect to host
lambdafive.co.uk: could not connect to host
@@ -8595,7 +9429,6 @@ lamomebijou.paris: did not receive HSTS header
lampl.info: could not connect to host
lamtv.com.mx: could not connect to host
lan2k.org: max-age too low: 86400
-lana.swedbank.se: did not receive HSTS header
lanauzedesigns.com: did not receive HSTS header
lanboll.com: could not connect to host
lanbyte.se: did not receive HSTS header
@@ -8608,7 +9441,6 @@ landgoedverkopen.nl: could not connect to host
landhuisverkopen.nl: could not connect to host
landscape.canonical.com: max-age too low: 2592000
landscapingmedic.com: did not receive HSTS header
-langatang.com: could not connect to host
langenbach.rocks: could not connect to host
langendorf-ernaehrung-training.de: could not connect to host
langendries.eu: did not receive HSTS header
@@ -8620,11 +9452,14 @@ lanzainc.xyz: could not connect to host
laobox.fr: could not connect to host
laohei.org: could not connect to host
laospage.com: did not receive HSTS header
+lapakus.com: could not connect to host
+laperfumista.es: could not connect to host
lapetition.be: could not connect to host
laplaceduvillage.net: could not connect to host
laquack.com: could not connect to host
lared.ovh: did not receive HSTS header
laredsemanario.com: could not connect to host
+larky.top: could not connect to host
larsgujord.no: did not receive HSTS header
larsmerke.de: did not receive HSTS header
lasepiataca.com: did not receive HSTS header
@@ -8632,6 +9467,7 @@ lasercloud.ml: could not connect to host
lashstuff.com: did not receive HSTS header
lasnaves.com: did not receive HSTS header
lasst-uns-beten.de: could not connect to host
+lastharo.com: could not connect to host
latable-bowling-vire.fr: did not receive HSTS header
latabledebry.be: could not connect to host
latamarissiere.eu: could not connect to host
@@ -8643,7 +9479,7 @@ lathamlabs.com: could not connect to host
lathamlabs.net: could not connect to host
lathamlabs.org: could not connect to host
lathen-wahn.de: did not receive HSTS header
-latinred.com: could not connect to host
+latinred.com: did not receive HSTS header
latitude42technology.com: did not receive HSTS header
latour-managedcare.ch: could not connect to host
latus.xyz: could not connect to host
@@ -8655,16 +9491,17 @@ laurelspaandlash.com: did not receive HSTS header
laureltv.org: did not receive HSTS header
laurent-e-levy.com: did not receive HSTS header
lausitzer-widerstand.de: could not connect to host
-lavabit.no: could not connect to host
lavapot.com: did not receive HSTS header
laventainnhotel-mailing.com: could not connect to host
lavine.ch: did not receive HSTS header
lavito.cz: could not connect to host
+lavval.com: could not connect to host
lawly.org: could not connect to host
lawrence-institute.com: could not connect to host
laxatus.com: could not connect to host
laxiongames.es: did not receive HSTS header
layer8.tk: could not connect to host
+layfully.me: could not connect to host
laymans911.info: could not connect to host
lazapateriahandmade.pe: did not receive HSTS header
lazerus.net: could not connect to host
@@ -8682,17 +9519,19 @@ ldcraft.pw: could not connect to host
leadbook.ru: max-age too low: 604800
leadership9.com: could not connect to host
leadgenie.me: could not connect to host
+leadinfo.com: did not receive HSTS header
+leadstart.org: did not receive HSTS header
leakedminecraft.net: could not connect to host
leakreporter.net: could not connect to host
leaks.directory: could not connect to host
leanclub.org: could not connect to host
leaodarodesia.com.br: could not connect to host
leardev.de: did not receive HSTS header
-learn-smart.uk: did not receive HSTS header
learnedhacker.com: could not connect to host
learnedovo.com: did not receive HSTS header
learnfrenchfluently.com: could not connect to host
learningorder.com: could not connect to host
+learntale.com: could not connect to host
lebal.se: could not connect to host
lebosse.me: could not connect to host
lebrun.org: could not connect to host
@@ -8703,9 +9542,13 @@ ledgerscope.net: could not connect to host
ledhouse.sk: did not receive HSTS header
ledlampor365.se: could not connect to host
ledshop.mx: did not receive HSTS header
+leebiblestudycenter.co.uk: could not connect to host
+leebiblestudycenter.com: could not connect to host
+leebiblestudycentre.com: could not connect to host
leebiblestudycentre.net: could not connect to host
leebiblestudycentre.org: could not connect to host
leefindlow.com: could not connect to host
+leegyuho.com: could not connect to host
leelou.wedding: could not connect to host
leen.io: could not connect to host
leerkotte.eu: could not connect to host
@@ -8715,22 +9558,27 @@ legalcontrol.info: could not connect to host
legaleus.co.uk: could not connect to host
legalisepeacebloom.com: could not connect to host
legalrobot-uat.com: could not connect to host
+legalsen.com: did not receive HSTS header
legaltip.eu: could not connect to host
legarage.org: did not receive HSTS header
+legatofmrc.fr: could not connect to host
legavenue.com.br: did not receive HSTS header
legendary.camera: did not receive HSTS header
legitaxi.com: did not receive HSTS header
legymnase.eu: did not receive HSTS header
lehtinen.xyz: could not connect to host
+leigh.life: did not receive HSTS header
leighneithardt.com: could not connect to host
+leiming.co: could not connect to host
leinir.dk: did not receive HSTS header
+leition.com: did not receive HSTS header
+leitionusercontent.com: did not receive HSTS header
leitner.com.au: did not receive HSTS header
lelehei.com: could not connect to host
lellyboi.ml: could not connect to host
lelongbank.com: did not receive HSTS header
lelubre.info: did not receive HSTS header
lemon.co: could not connect to host
-lemonrockbiketours.com: did not receive HSTS header
lemp.io: did not receive HSTS header
lenders.direct: could not connect to host
lengyelnyelvoktatas.hu: could not connect to host
@@ -8745,18 +9593,19 @@ lenovogaming.com: could not connect to host
lentri.com: did not receive HSTS header
lenzw.de: did not receive HSTS header
leob.in: could not connect to host
+leochedibracchio.com: did not receive HSTS header
leon-jaekel.com: could not connect to host
leonardcamacho.me: could not connect to host
+leonhooijer.nl: could not connect to host
leonmahler.consulting: did not receive HSTS header
leopold.email: could not connect to host
-leopoldina.net: did not receive HSTS header
leopotamgroup.com: could not connect to host
leovanna.co.uk: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
lepiquillo.fr: did not receive HSTS header
lepont.pl: could not connect to host
lerasenglish.com: max-age too low: 0
lerlivros.online: could not connect to host
-lerner.moscow: could not connect to host
+lerner.moscow: did not receive HSTS header
les-corsaires.net: could not connect to host
les-voitures-electriques.com: max-age too low: 2592000
lesbiansslaves.com: could not connect to host
@@ -8764,15 +9613,15 @@ lesbofight.com: could not connect to host
lescomptoirsdepierrot.com: could not connect to host
lesdouceursdeliyana.com: could not connect to host
lesecuadors.com: did not receive HSTS header
-lesformations.net: did not receive HSTS header
+lesformations.net: could not connect to host
lesh.eu: could not connect to host
+lesharris.com: could not connect to host
lesjardinsdubanchet.fr: could not connect to host
lesliekearney.com: did not receive HSTS header
lesperlesdunet.fr: could not connect to host
lesquatredauphins.fr: did not receive HSTS header
lesquerda.cat: did not receive HSTS header
lessing.consulting: did not receive HSTS header
-let-go.cc: could not connect to host
letempsdunefleur.be: could not connect to host
leter.io: did not receive HSTS header
lethbridgecoffee.com: did not receive HSTS header
@@ -8799,15 +9648,15 @@ lexpartsofac.com: could not connect to host
lez-cuties.com: could not connect to host
lezdomsm.com: could not connect to host
lfaz.org: could not connect to host
-lfullerdesign.com: could not connect to host
lg21.co: could not connect to host
-lgbtqventures.com: could not connect to host
+lgbtqventures.com: did not receive HSTS header
+lgbtventures.com: did not receive HSTS header
lgiswa.com.au: did not receive HSTS header
lgrs.com.au: did not receive HSTS header
lgsg.us: could not connect to host
lgts.se: could not connect to host
lhasaapso.com.br: could not connect to host
-lheinrich.com: could not connect to host
+lheinrich.com: did not receive HSTS header
lheinrich.de: did not receive HSTS header
lheinrich.org: could not connect to host
lhsj28.com: could not connect to host
@@ -8817,6 +9666,10 @@ liaillustr.at: did not receive HSTS header
liam-is-a-nig.ga: could not connect to host
liam-w.com: could not connect to host
liamjack.fr: could not connect to host
+lian-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+lian-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+liang-li88.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+liang-li88.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
liangbp.com: could not connect to host
lianwen.kim: could not connect to host
lianye.in: could not connect to host
@@ -8834,6 +9687,7 @@ libfte.org: did not receive HSTS header
librairie-asie.com: did not receive HSTS header
library.linode.com: did not receive HSTS header
librechan.net: could not connect to host
+librends.org: could not connect to host
libricks.fr: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
librisulibri.it: did not receive HSTS header
licence-registry.com: could not connect to host
@@ -8848,7 +9702,6 @@ liebach.me: did not receive HSTS header
liebestarot.at: did not receive HSTS header
lied8.eu: could not connect to host
liehuojun.com: could not connect to host
-liemen.net: did not receive HSTS header
lietaer.eu: did not receive HSTS header
life-time.nl: did not receive HSTS header
lifecoach.tw: did not receive HSTS header
@@ -8859,41 +9712,51 @@ lifeinsurances.pro: did not receive HSTS header
lifeinsurances24.com: did not receive HSTS header
lifemarque.co.uk: did not receive HSTS header
lifenexto.com: could not connect to host
-lifeng.us: did not receive HSTS header
+lifeng.us: could not connect to host
+lifequotes-uk.co.uk: could not connect to host
lifeskillsdirect.com: did not receive HSTS header
lifestyler.me: could not connect to host
lifetimemoneymachine.com: did not receive HSTS header
lifeventure.co.uk: did not receive HSTS header
lightarmory.com: could not connect to host
lightcloud.com: did not receive HSTS header
+lightdark.xyz: could not connect to host
+lighthouseinstruments.com: did not receive HSTS header
lightning-ashe.com: did not receive HSTS header
lightnovelsekai.com: could not connect to host
lightpaste.com: could not connect to host
lighttherapydevice.com: did not receive HSTS header
+lighttp.com: could not connect to host
lightworx.io: could not connect to host
lignemalin.com: could not connect to host
lignemax.com: did not receive HSTS header
lignenet.com: did not receive HSTS header
+lijero.co: could not connect to host
like.lgbt: could not connect to host
+likenewhearing.com.au: could not connect to host
likenosis.com: could not connect to host
lila.pink: did not receive HSTS header
-lilapmedia.com: could not connect to host
-liliang13.com: could not connect to host
+lilapmedia.com: did not receive HSTS header
lilismartinis.com: could not connect to host
lillpopp.eu: max-age too low: 10
lilpwny.com: could not connect to host
lilycms.com: could not connect to host
lilygreen.co.za: did not receive HSTS header
limalama.eu: max-age too low: 1
+limeres.com: could not connect to host
limeyeti.com: could not connect to host
limiteddata.co.uk: could not connect to host
limitget.com: did not receive HSTS header
limodo-shop.de: did not receive HSTS header
limpens.net: did not receive HSTS header
limpido.it: could not connect to host
+limunana.com: could not connect to host
lincsbouncycastlehire.co.uk: did not receive HSTS header
lindberg.io: did not receive HSTS header
+linden.me: did not receive HSTS header
+lindnerhof.info: did not receive HSTS header
lineauniformes.com.br: could not connect to host
+linernotekids.com: did not receive HSTS header
linext.cn: could not connect to host
lingerie.net.br: did not receive HSTS header
lingerielovers.com.br: did not receive HSTS header
@@ -8925,6 +9788,7 @@ linuxeyecandy.com: could not connect to host
linuxfixed.it: could not connect to host
linuxforyou.com: could not connect to host
linuxgeek.ro: could not connect to host
+linuxincluded.com: did not receive HSTS header
linuxmint.cz: could not connect to host
linuxmonitoring.net: could not connect to host
linvx.org: did not receive HSTS header
@@ -8938,6 +9802,7 @@ liquorsanthe.in: could not connect to host
lisaco.de: could not connect to host
lisbongold.com: did not receive HSTS header
lisgade.dk: could not connect to host
+lisieuxarquitetura.com.br: could not connect to host
lisowski-photography.com: could not connect to host
lissabon.guide: could not connect to host
listafirmelor.com: could not connect to host
@@ -8969,14 +9834,12 @@ livrariahugodesaovitor.com.br: could not connect to host
lixiang.one: could not connect to host
lixiaojiang.ga: could not connect to host
lixingcong.com: could not connect to host
-liyang.pro: did not receive HSTS header
+liyinjia.com: could not connect to host
lizzythepooch.com: did not receive HSTS header
lkiserver.com: could not connect to host
llamasweet.tech: could not connect to host
lll.st: could not connect to host
-llslb.com: could not connect to host
llvm.us: could not connect to host
-lmcm.io: could not connect to host
lmrcouncil.gov: could not connect to host
ln.io: could not connect to host
lnbeauty.ru: max-age too low: 0
@@ -8984,6 +9847,7 @@ lnoldan.com: could not connect to host
loacg.com: did not receive HSTS header
loadingdeck.com: did not receive HSTS header
loadso.me: could not connect to host
+loadtraining.com: did not receive HSTS header
loafbox.com: could not connect to host
loafhead.me: could not connect to host
loanmatch.sg: could not connect to host
@@ -8994,6 +9858,7 @@ lobosdomain.ddns.net: could not connect to host
lobosdomain.no-ip.info: could not connect to host
lobste.rs: did not receive HSTS header
locais.org: could not connect to host
+localblitz.com: did not receive HSTS header
localchum.com: could not connect to host
localdata.us: did not receive HSTS header
localdrive.me: could not connect to host
@@ -9009,12 +9874,15 @@ locationvoiturepaysbas.com: could not connect to host
locationvoituresuede.com: could not connect to host
locchat.com: could not connect to host
locker3.com: could not connect to host
+locksmith-durbannorth.co.za: could not connect to host
+locksmithdearborn.com: did not receive HSTS header
+locksmithhillcrest.co.za: could not connect to host
locksmithrandburg24-7.co.za: could not connect to host
+locksmithsbluff.com: could not connect to host
locksport.org.nz: could not connect to host
locktheirphone.com: could not connect to host
lockyourcomputer.pw: could not connect to host
locomotive.ca: did not receive HSTS header
-locomotive.net.br: could not connect to host
locvis.ru: did not receive HSTS header
lode.li: could not connect to host
lodgesdureynou.fr: did not receive HSTS header
@@ -9023,10 +9891,10 @@ loftboard.eu: could not connect to host
log2n.uk: could not connect to host
logario.com.br: could not connect to host
logcat.info: could not connect to host
-logfile.ch: did not receive HSTS header
+logfro.de: max-age too low: 0
logic8.ml: could not connect to host
logicaladvertising.com: could not connect to host
-logicoma.com: could not connect to host
+logicchen.com: could not connect to host
logicsale.com: did not receive HSTS header
logicsale.de: did not receive HSTS header
logicsale.fr: did not receive HSTS header
@@ -9039,29 +9907,32 @@ loginseite.com: could not connect to host
logistify.com.mx: did not receive HSTS header
lognot.net: could not connect to host
logymedia.com: could not connect to host
+lohl1kohl.de: could not connect to host
loisircreatif.net: did not receive HSTS header
-lojadocristaozinho.com.br: did not receive HSTS header
+lojadocristaozinho.com.br: could not connect to host
lojadoprazer.com.br: could not connect to host
lojahunamarcenaria.com.br: could not connect to host
+lojamulticapmais.com.br: did not receive HSTS header
lojashowdecozinha.com.br: could not connect to host
lojasviavento.com.br: could not connect to host
lojavalcapelli.com.br: could not connect to host
-loli.bz: did not receive HSTS header
-loli.vip: could not connect to host
+lolhax.org: could not connect to host
+loli.bz: could not connect to host
+loli.com: could not connect to host
+loli.vip: did not receive HSTS header
lolicon.info: could not connect to host
lolicore.ch: could not connect to host
lolidunno.com: could not connect to host
lolis.stream: could not connect to host
lollaconcept.com.br: could not connect to host
-lonal.com: could not connect to host
lonasdigital.com: did not receive HSTS header
lonbali.com: did not receive HSTS header
londoncalling.co: did not receive HSTS header
-londonkan.jp: could not connect to host
londonlanguageexchange.com: could not connect to host
londonseedcentre.co.uk: could not connect to host
lonerwolf.com: did not receive HSTS header
longboarding-ulm.de: could not connect to host
+longma.pw: could not connect to host
longtaitouwang.com: did not receive HSTS header
look-at-my.site: could not connect to host
lookout.com: did not receive HSTS header
@@ -9070,11 +9941,11 @@ lookupclose.com: did not receive HSTS header
looneymooney.com: could not connect to host
loongsg.xyz: could not connect to host
loperetti.ch: could not connect to host
-loposchokk.com: could not connect to host
loqyu.co: could not connect to host
lordgun.com: could not connect to host
lordjevington.co.uk: did not receive HSTS header
losebellyfat.pro: could not connect to host
+losrascadoresparagatos.com: did not receive HSTS header
loss.no: could not connect to host
lostandcash.com: could not connect to host
lostarq.com: could not connect to host
@@ -9092,10 +9963,15 @@ lotuscloud.de: did not receive HSTS header
lotuscloud.org: could not connect to host
louduniverse.net: did not receive HSTS header
louiewatch.com: could not connect to host
+louisvillevmug.info: could not connect to host
love-schna.jp: could not connect to host
love4taylor.eu.org: could not connect to host
loveable.de: could not connect to host
+loveamber.me: could not connect to host
loveandloyalty.se: could not connect to host
+lovebo9.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+lovebo9.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+lovelens.ch: could not connect to host
lovelifelovelive.com: could not connect to host
lovelive-anime.tk: could not connect to host
lovelive-anime.tokyo: could not connect to host
@@ -9112,8 +9988,8 @@ lovingpenguin.com: did not receive HSTS header
lowhangingfruitgrabber.com: could not connect to host
lowt.us: could not connect to host
lowtherpavilion.co.uk: did not receive HSTS header
+loxal.org: could not connect to host
loxis.be: did not receive HSTS header
-loyaleco.it: could not connect to host
loyaltech.ch: could not connect to host
lpacademy.com.br: could not connect to host
lpak.nl: could not connect to host
@@ -9129,10 +10005,12 @@ lsp-sports.de: did not receive HSTS header
lstma.com: could not connect to host
lsvih.com: did not receive HSTS header
lswim.com: did not receive HSTS header
+lsws.de: could not connect to host
+lsys.ac: could not connect to host
lszj.com: could not connect to host
ltba.org: could not connect to host
ltbytes.com: could not connect to host
-ltechnologygroup.com: did not receive HSTS header
+ltechnologygroup.com: could not connect to host
ltransferts.com: could not connect to host
ltu.social: could not connect to host
luan.ma: did not receive HSTS header
@@ -9146,10 +10024,9 @@ lucidlogs.com: could not connect to host
luckydog.pw: could not connect to host
luckystarfishing.com: did not receive HSTS header
luclu7.pw: could not connect to host
-lucysan.net: could not connect to host
-ludum.pl: could not connect to host
ludwig.click: did not receive HSTS header
lufthansaexperts.com: max-age too low: 2592000
+luganskservers.net: could not connect to host
luis-checa.com: could not connect to host
luisv.me: could not connect to host
luk.photo: could not connect to host
@@ -9184,32 +10061,34 @@ luripump.se: could not connect to host
lusis.fr: did not receive HSTS header
lusis.net: could not connect to host
lustrumxi.nl: could not connect to host
+luteijn.biz: did not receive HSTS header
luther.fi: could not connect to host
luxe-it.co.uk: could not connect to host
luxinmo.com: did not receive HSTS header
+luxofit.de: did not receive HSTS header
luxonetwork.com: could not connect to host
luxus-russen.de: could not connect to host
luzeshomologadas.com.br: could not connect to host
-luzfaltex.com: did not receive HSTS header
lwhate.com: could not connect to host
+lycly.me: could not connect to host
lycly.top: could not connect to host
lydia-und-simon.de: could not connect to host
lydiagorstein.com: did not receive HSTS header
lylares.com: did not receive HSTS header
lynkos.com: did not receive HSTS header
lyonelkaufmann.ch: did not receive HSTS header
-lyonl.com: did not receive HSTS header
+lyonl.com: could not connect to host
lyscnd.com: could not connect to host
lysergion.com: could not connect to host
lyuba.fr: could not connect to host
+lyuly.com: did not receive HSTS header
lz.sb: could not connect to host
lzahq.tech: did not receive HSTS header
lzkill.com: did not receive HSTS header
lzqii.cn: could not connect to host
-lzzr.me: did not receive HSTS header
m-ali.xyz: did not receive HSTS header
+m-edmondson.co.uk: did not receive HSTS header
m-generator.com: could not connect to host
-m-idea.jp: could not connect to host
m-rickroll-v.pw: could not connect to host
m-warrior.tk: could not connect to host
m.gparent.org: could not connect to host
@@ -9226,7 +10105,7 @@ maartenprovo.be: did not receive HSTS header
maartenterpstra.xyz: could not connect to host
mac-torrents.me: did not receive HSTS header
mac-world.pl: did not receive HSTS header
-macandtonic.com: did not receive HSTS header
+macandtonic.com: could not connect to host
macbolo.com: could not connect to host
macchaberrycream.com: could not connect to host
macchedil.com: did not receive HSTS header
@@ -9234,22 +10113,23 @@ macdj.tk: could not connect to host
macedopesca.com.br: did not receive HSTS header
macgeneral.de: did not receive HSTS header
mach1club.com: did not receive HSTS header
+machbach.net: could not connect to host
machinelearningjavascript.com: could not connect to host
mack.space: could not connect to host
macleodnc.com: did not receive HSTS header
macsandcheesedreams.com: could not connect to host
-macstore.pe: did not receive HSTS header
macustar.eu: did not receive HSTS header
-madandpissedoff.com: did not receive HSTS header
+madandpissedoff.com: could not connect to host
madcatdesign.de: did not receive HSTS header
maddin.ga: could not connect to host
madebyfalcon.co.uk: did not receive HSTS header
madebymagnitude.com: did not receive HSTS header
madeglobal.com: did not receive HSTS header
-madeinorder.com: did not receive HSTS header
+madeinorder.com: could not connect to host
madeintucson.org: could not connect to host
mademoiselle-emma.be: could not connect to host
mademoiselle-emma.fr: could not connect to host
+maderasbrown.com: could not connect to host
maderwin.com: did not receive HSTS header
madesoftware.com.br: could not connect to host
madnetwork.org: could not connect to host
@@ -9257,17 +10137,19 @@ madokami.net: could not connect to host
madpeople.net: max-age too low: 2592000
madrants.net: could not connect to host
madweb.design: did not receive HSTS header
+maerzpa.de: did not receive HSTS header
mafamane.com: could not connect to host
mafiareturns.com: max-age too low: 2592000
magazinedabeleza.net: could not connect to host
magebankin.com: did not receive HSTS header
magenx.com: did not receive HSTS header
-magi.systems: could not connect to host
magia360.com: did not receive HSTS header
magicball.co: could not connect to host
+magieamour.com: did not receive HSTS header
magieblanche.fr: did not receive HSTS header
magnacumlaude.co: could not connect to host
magneticanvil.com: did not receive HSTS header
+magosmedellin.com: could not connect to host
magyarokegyhelyen.hu: did not receive HSTS header
mahamed91.pw: could not connect to host
mahansexcavating.com: did not receive HSTS header
@@ -9283,6 +10165,7 @@ mailgarant.nl: could not connect to host
mailhost.it: could not connect to host
mailing-femprendedores.com: did not receive HSTS header
mailing-jbgg.com: could not connect to host
+maillink.store: did not receive HSTS header
mailon.ga: could not connect to host
mailpenny.com: could not connect to host
main-street-seo.com: did not receive HSTS header
@@ -9290,8 +10173,6 @@ main-unit.com: could not connect to host
maintainerheaven.ch: could not connect to host
maisalto.ind.br: could not connect to host
maitriser-son-stress.com: could not connect to host
-majkl.xyz: could not connect to host
-majkl578.cz: could not connect to host
majncloud.tk: could not connect to host
make-pizza.info: could not connect to host
makedonien.guide: could not connect to host
@@ -9300,10 +10181,10 @@ makeitdynamic.com: could not connect to host
makemejob.com: could not connect to host
makemyvape.co.uk: max-age too low: 7889238
makerstuff.net: did not receive HSTS header
-makeshiftco.de: did not receive HSTS header
+makeshiftco.de: could not connect to host
makeuplove.nl: could not connect to host
-makeyourlaws.org: could not connect to host
-makinen.ru: could not connect to host
+makeyourlaws.org: did not receive HSTS header
+makino.games: could not connect to host
malamutedoalasca.com.br: could not connect to host
maldiverna.guide: could not connect to host
maleexcel.com: did not receive HSTS header
@@ -9315,21 +10196,18 @@ malgraph.net: could not connect to host
malibubeachrecoverycenter.com: could not connect to host
maljaars-media.nl: could not connect to host
malkaso.com.ua: could not connect to host
-mallner.me: could not connect to host
malmstroms-co.se: could not connect to host
malone.link: could not connect to host
-malte-kiefer.de: did not receive HSTS header
maltes.website: could not connect to host
malvy.kiev.ua: could not connect to host
+malwareverse.us: did not receive HSTS header
malwre.io: could not connect to host
maly.io: did not receive HSTS header
malya.fr: could not connect to host
-mamacobaby.com: could not connect to host
-mamadoma.com.ua: could not connect to host
+mamacobaby.com: did not receive HSTS header
mamaison.io: could not connect to host
mamastore.eu: could not connect to host
mamaxi.org: did not receive HSTS header
-mammeitalianeavienna.com: could not connect to host
mammothmail.com: could not connect to host
mammothmail.net: could not connect to host
mammothmail.org: could not connect to host
@@ -9349,8 +10227,10 @@ manalu.cz: could not connect to host
manantial.mx: could not connect to host
manav-it.de: could not connect to host
mandala-ausmalbilder.de: did not receive HSTS header
+mandanudes.ae: could not connect to host
mandm.servebeer.com: could not connect to host
mandpress.com: did not receive HSTS header
+mangapoi.com: could not connect to host
mangazuki.co: could not connect to host
maniadeprazer.com.br: could not connect to host
manifestbin.com: did not receive HSTS header
@@ -9362,6 +10242,7 @@ manns-solutions.com: did not receive HSTS header
manns-solutions.ru: did not receive HSTS header
mannsolutions.co.uk: did not receive HSTS header
manojsharan.me: could not connect to host
+manonamission.de: did not receive HSTS header
manova.cz: could not connect to host
mansfieldplacevt.com: did not receive HSTS header
manshop24.com: could not connect to host
@@ -9374,36 +10255,35 @@ maomaobt.com: did not receive HSTS header
maomaofuli.vip: could not connect to host
maosi.xin: could not connect to host
maple5.com: did not receive HSTS header
-maplenorth.co: did not receive HSTS header
+maplenorth.co: could not connect to host
mapresidentielle.fr: did not receive HSTS header
+mapservices.nl: did not receive HSTS header
maquillage-permanent-tatoo.com: did not receive HSTS header
maranatha.pl: did not receive HSTS header
marbinvest.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-marc-schlagenhauf.de: could not connect to host
marcaixala.me: could not connect to host
-marcberman.co: could not connect to host
+marcaudefroy.com: could not connect to host
+marcberman.co: did not receive HSTS header
marcbuehlmann.com: did not receive HSTS header
marcelmarnitz.com: could not connect to host
marcelparra.com: could not connect to host
-marcelwiedemeier.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
marchagen.nl: did not receive HSTS header
marche-nordic-jorat.ch: could not connect to host
-marchhappy.tech: did not receive HSTS header
-marcianoandtopazio.com: could not connect to host
marco-kretz.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
marco01809.net: could not connect to host
marcoececilia.it: did not receive HSTS header
marcofinke.de: could not connect to host
+marcohager.de: could not connect to host
marcontrol.com: did not receive HSTS header
-marcoslater.com: did not receive HSTS header
marcosteixeira.tk: could not connect to host
marcschlagenhauf.de: could not connect to host
marcus-scheffler.com: did not receive HSTS header
marcush.de: could not connect to host
-marcusserver.synology.me: could not connect to host
+marcusserver.synology.me: did not receive HSTS header
mardelcupon.com: could not connect to host
mare92.cz: could not connect to host
mareklecian.cz: did not receive HSTS header
+margan.ch: could not connect to host
margaretrosefashions.co.uk: could not connect to host
mariacristinadoces.com.br: did not receive HSTS header
mariannematthew.com: could not connect to host
@@ -9411,10 +10291,11 @@ marianwehlus.de: did not receive HSTS header
mariaolesen.dk: could not connect to host
marie-curie.fr: could not connect to host
marie-elisabeth.dk: did not receive HSTS header
-marie-en-provence.com: did not receive HSTS header
+marie-en-provence.com: could not connect to host
marie.club: could not connect to host
marienschule-sundern.de: did not receive HSTS header
mariusschulte.de: did not receive HSTS header
+marix.ro: could not connect to host
mark-a-hydrant.com: did not receive HSTS header
mark-armstrong-gaming.com: could not connect to host
markayapilandirma.com: could not connect to host
@@ -9439,10 +10320,13 @@ markusueberallconsulting.de: could not connect to host
markusweimar.de: did not receive HSTS header
marlen.cz: did not receive HSTS header
marleyresort.com: did not receive HSTS header
+marlonschultz.de: did not receive HSTS header
marqperso.ch: could not connect to host
marquepersonnelle.ch: could not connect to host
marriottvetcareers.com: could not connect to host
marsatapp.com: could not connect to host
+marsble.com: did not receive HSTS header
+marshallford.me: could not connect to host
marshut.net: could not connect to host
martialc.be: could not connect to host
martiert.com: could not connect to host
@@ -9456,18 +10340,19 @@ martinkup.cz: did not receive HSTS header
martinp.no: could not connect to host
martinrogalla.com: did not receive HSTS header
martins.im: could not connect to host
-marumagic.com: did not receive HSTS header
-marvinkeller.de: could not connect to host
+marumagic.com: could not connect to host
+marvinkeller.de: did not receive HSTS header
marxist.party: could not connect to host
marykshoup.com: could not connect to host
masa-hou.com: did not receive HSTS header
masa-yoga.com: did not receive HSTS header
masa.li: could not connect to host
masaze-hanka.cz: could not connect to host
-maservant.net: could not connect to host
+mascorazon.com: could not connect to host
mashek.net: could not connect to host
mashnew.com: could not connect to host
masjidtawheed.net: did not receive HSTS header
+maskice.hr: did not receive HSTS header
maskinkultur.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
maskt.pw: could not connect to host
massagelimaperu.com: did not receive HSTS header
@@ -9475,7 +10360,8 @@ massivum.de: did not receive HSTS header
massot.eu: did not receive HSTS header
mastd.fr: could not connect to host
mastd.onl: could not connect to host
-masteragenasia.com: did not receive HSTS header
+mastepinnelaand.nl: did not receive HSTS header
+masteragenasia.com: could not connect to host
masteragenasia.net: did not receive HSTS header
masterapi.ninja: did not receive HSTS header
masterhaus.bg: did not receive HSTS header
@@ -9494,47 +10380,41 @@ mastodon.fun: could not connect to host
mastodon.my: could not connect to host
mastodon.org.uk: did not receive HSTS header
mastodon.pl: did not receive HSTS header
-mastodon.top: did not receive HSTS header
mastodones.club: could not connect to host
masty.nl: could not connect to host
masumreza.tk: could not connect to host
mat99.dk: could not connect to host
matarrosabierzo.com: could not connect to host
-matatall.com: did not receive HSTS header
-maternalsafety.org: did not receive HSTS header
mateusmeyer.com.br: could not connect to host
mateuszpilszek.pl: could not connect to host
mathers.ovh: did not receive HSTS header
mathias.re: did not receive HSTS header
-mathieui.net: could not connect to host
mathijskingma.nl: could not connect to host
-mathis.com.tr: did not receive HSTS header
matillat.ovh: did not receive HSTS header
matlabjo.ir: could not connect to host
matomeplus.co: could not connect to host
-matratzentester.com: did not receive HSTS header
matrict.com: could not connect to host
matrip.de: could not connect to host
matrix.ac: could not connect to host
matrixcheats.net: could not connect to host
matsuz.com: did not receive HSTS header
-matt-brooks.com: could not connect to host
matt.tf: did not receive HSTS header
mattandreko.com: did not receive HSTS header
mattberryman.com: did not receive HSTS header
matterconcern.com: could not connect to host
matthew-carson.info: could not connect to host
matthewemes.com: did not receive HSTS header
-matthewgrow.com: did not receive HSTS header
matthewprenger.com: could not connect to host
matthewtester.com: did not receive HSTS header
matthiassteen.be: could not connect to host
matthiasweiler.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-mattia98.org: could not connect to host
+mattia98.org: did not receive HSTS header
mattisam.com: did not receive HSTS header
mattressinsider.com: max-age too low: 3153600
mattwb65.com: did not receive HSTS header
+mattwservices.co.uk: max-age too low: 2592000
matty.digital: did not receive HSTS header
+matze.co: did not receive HSTS header
matze.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
maultrom.ml: could not connect to host
maupiknik.com: did not receive HSTS header
@@ -9543,25 +10423,26 @@ maurus-automation.de: did not receive HSTS header
mausi.co: did not receive HSTS header
mavisang.cf: could not connect to host
mawe.red: could not connect to host
-maxhoechtl.at: could not connect to host
+max-mad.com: could not connect to host
+maxfox.me: could not connect to host
maxhorvath.com: could not connect to host
maxibanki.ovh: could not connect to host
maxicore.co.za: could not connect to host
maxima.at: did not receive HSTS header
maximelouet.me: did not receive HSTS header
-maximov.space: did not receive HSTS header
+maximov.space: could not connect to host
maxkeller.io: did not receive HSTS header
maxmachine.ind.br: could not connect to host
+maxr1998.de: could not connect to host
maxserver.com: did not receive HSTS header
+maya-ro.com: could not connect to host
maya.mg: could not connect to host
maybeul.com: could not connect to host
+maydex.info: could not connect to host
maynardnetworks.com: could not connect to host
mayoristassexshop.com: did not receive HSTS header
-maze.design: did not receive HSTS header
-mazurlabs.tk: could not connect to host
mazyun.com: did not receive HSTS header
mazz-tech.com: could not connect to host
-mbanq.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
mbconsultancy.nu: did not receive HSTS header
mbdrogenbos-usedcars.be: could not connect to host
mbwemmel-usedcars.be: could not connect to host
@@ -9571,7 +10452,6 @@ mcadmin.net: could not connect to host
mcard.vn: did not receive HSTS header
mcb-bank.com: did not receive HSTS header
mcc.re: could not connect to host
-mccarty.io: could not connect to host
mccordworks.com: did not receive HSTS header
mcdanieldevelopmentservices.com: could not connect to host
mcdonalds.ru: did not receive HSTS header
@@ -9582,8 +10462,10 @@ mcideas.tk: could not connect to host
mcjackk77.com: could not connect to host
mckinley1.com: could not connect to host
mckinleytk.com: could not connect to host
+mcl.gg: did not receive HSTS header
mclab.su: max-age too low: 2592000
mclist.it: could not connect to host
+mclyr.com: could not connect to host
mcnoobs.pro: could not connect to host
mcooperlaw.com: did not receive HSTS header
mcqyy.com: could not connect to host
@@ -9598,10 +10480,9 @@ mdfnet.se: did not receive HSTS header
mdscomp.net: did not receive HSTS header
mdwftw.com: could not connect to host
me-dc.com: could not connect to host
-meadowfenfarm.com: could not connect to host
meadowviewfarms.org: could not connect to host
mealz.com: did not receive HSTS header
-meanevo.com: could not connect to host
+meanevo.com: did not receive HSTS header
measuretwice.com: did not receive HSTS header
meat-education.com: could not connect to host
meathealth.com: could not connect to host
@@ -9610,9 +10491,10 @@ mecanicadom.com: could not connect to host
mecenat-cassous.com: did not receive HSTS header
mechok.ru: could not connect to host
medallia.io: could not connect to host
+meddatix.com: could not connect to host
media-access.online: did not receive HSTS header
media-courses.com: could not connect to host
-mediacru.sh: could not connect to host
+mediacru.sh: max-age too low: 0
mediadandy.com: could not connect to host
mediafinancelab.org: could not connect to host
mediamag.am: max-age too low: 0
@@ -9622,7 +10504,7 @@ medicinskavranje.edu.rs: could not connect to host
medienservice-fritz.de: did not receive HSTS header
medifab.online: did not receive HSTS header
medirich.co: could not connect to host
-meditek-dv.ru: could not connect to host
+meditek-dv.ru: did not receive HSTS header
mediter-simplement.com: did not receive HSTS header
mediterenopmaandag.nl: did not receive HSTS header
mediumraw.org: did not receive HSTS header
@@ -9631,27 +10513,24 @@ medm-test.com: could not connect to host
medpot.net: did not receive HSTS header
medsindex.com: max-age too low: 2592000
medstreaming.com: did not receive HSTS header
+medtankers.management: did not receive HSTS header
medy-me.com: could not connect to host
medzinenews.com: did not receive HSTS header
meedoenzaanstad.nl: did not receive HSTS header
meetfinch.com: could not connect to host
-meetmibaby.co.uk: could not connect to host
-meetscompany.jp: could not connect to host
megadrol.com: could not connect to host
+megaflix.nl: could not connect to host
megakiste.de: could not connect to host
megam.host: could not connect to host
megashur.se: did not receive HSTS header
-megasystem.cl: did not receive HSTS header
+megasystem.cl: could not connect to host
meghudson.com: could not connect to host
-mego.cloud: could not connect to host
meifrench.com: could not connect to host
-meiju.video: could not connect to host
meincloudspeicher.de: could not connect to host
meine-reise-gut-versichert.de: did not receive HSTS header
meinebo.it: could not connect to host
meisterritter.de: did not receive HSTS header
meizufans.eu: could not connect to host
-mekongeye.com: could not connect to host
melakaltenegger.at: did not receive HSTS header
melangebrasil.com: could not connect to host
melaniebilodeau.com: did not receive HSTS header
@@ -9670,23 +10549,29 @@ melvinlammerts.nl: could not connect to host
melvinlow.com: did not receive HSTS header
memberpress.com: did not receive HSTS header
members.mayfirst.org: did not receive HSTS header
+membersonline.org: did not receive HSTS header
memdoc.org: could not connect to host
memeblast.ninja: could not connect to host
memepasmal.org: could not connect to host
-memes.nz: could not connect to host
+memes.nz: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
memetrash.co.uk: could not connect to host
+memind.net: could not connect to host
memory-plus-180.com: could not connect to host
memorygame.io: did not receive HSTS header
memorytrace.space: could not connect to host
menaraannonces.com: could not connect to host
menchez.me: could not connect to host
-mennace.com: did not receive HSTS header
+menhera.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
menotag.com: did not receive HSTS header
mensachterdepatient.nl: max-age too low: 2592000
mensmaximus.de: did not receive HSTS header
+mentalhealth.gov: did not receive HSTS header
mentax.net: did not receive HSTS header
+mentesemprendedoras.net: could not connect to host
menthix.net: could not connect to host
+menu.fyi: could not connect to host
menudrivetest.com: could not connect to host
+menuel.me: did not receive HSTS header
menuiserie-berard.com: did not receive HSTS header
menzaijia.com: could not connect to host
meo.de: could not connect to host
@@ -9699,10 +10584,11 @@ mercedes-benz-usedcars.be: could not connect to host
mercury-studio.com: did not receive HSTS header
mereckas.com: could not connect to host
meredithkm.info: did not receive HSTS header
-meremobil.dk: did not receive HSTS header
+merenita.eu: did not receive HSTS header
mergozzo.com: did not receive HSTS header
merimatka.fi: could not connect to host
meritz.rocks: could not connect to host
+merloat.club: could not connect to host
mersinunivercity.com: could not connect to host
merson.me: could not connect to host
mertak.cz: did not receive HSTS header
@@ -9710,16 +10596,14 @@ meshlab.co: could not connect to host
meshotes.com: max-age too low: 8640000
meskdeals.com: could not connect to host
mesmoque.com: could not connect to host
+messagescelestes-archives.ca: did not receive HSTS header
messagescelestes.ca: did not receive HSTS header
-meta-word.com: could not connect to host
metadistribution.com: did not receive HSTS header
-metaether.net: could not connect to host
metagrader.com: could not connect to host
metalsculpture.co.uk: max-age too low: 0
+metanic.org: did not receive HSTS header
+metaregistrar.com: max-age too low: 0
metasyntactic.xyz: could not connect to host
-metaword.com: could not connect to host
-metaword.net: could not connect to host
-metaword.org: could not connect to host
metebalci.com: did not receive HSTS header
meteosky.net: could not connect to host
meter.md: could not connect to host
@@ -9735,20 +10619,22 @@ metzgerei-birkenhof.de: could not connect to host
meu-smartphone.com: did not receive HSTS header
meucosmetico.com.br: could not connect to host
meuemail.pro: could not connect to host
-meupedido.online: could not connect to host
+meupedido.online: did not receive HSTS header
meusigno.com: could not connect to host
mexbt.com: could not connect to host
mexicanbusinessweb.mx: did not receive HSTS header
-mexicansbook.ru: could not connect to host
+mexicansbook.ru: did not receive HSTS header
mexior.nl: could not connect to host
meyeraviation.com: could not connect to host
mfcatalin.com: could not connect to host
mfedderke.com: could not connect to host
mfgod.com: did not receive HSTS header
mfiles.pl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+mfpccprod.com: could not connect to host
mfrsgb45.org: did not receive HSTS header
mft.global: could not connect to host
mfxer.com: could not connect to host
+mgcraft.net: did not receive HSTS header
mgdigital.fr: did not receive HSTS header
mgiay.com: did not receive HSTS header
mgoessel.de: did not receive HSTS header
@@ -9764,15 +10650,18 @@ mia.to: could not connect to host
miamicityballet.org: did not receive HSTS header
mianfei-vpn.com: could not connect to host
miboulot.com: could not connect to host
-michael-schilling.de: did not receive HSTS header
+micaiahparker.com: could not connect to host
+micasamgmt.com: did not receive HSTS header
michaeldemuth.com: could not connect to host
michaelfitzpatrickruth.com: did not receive HSTS header
+michaelizquierdo.com: max-age too low: 0
michaelklos.nl: could not connect to host
michaelmorpurgo.com: did not receive HSTS header
michaeln.net: could not connect to host
+michaels-homepage-service.de: could not connect to host
michaelscrivo.com: did not receive HSTS header
-michaelsulzer.com: did not receive HSTS header
-michaelsulzer.eu: did not receive HSTS header
+michaelsulzer.com: could not connect to host
+michaelsulzer.eu: could not connect to host
michaelwaite.org: could not connect to host
michal-kral.cz: could not connect to host
michalborka.cz: could not connect to host
@@ -9780,13 +10669,13 @@ michalkral.tk: could not connect to host
michalvasicek.cz: did not receive HSTS header
michasfahrschule.com: could not connect to host
michel.pt: did not receive HSTS header
+michele.ml: could not connect to host
michelledonelan.co.uk: did not receive HSTS header
michiganmetalartwork.com: max-age too low: 7889238
mico.world: could not connect to host
miconware.de: could not connect to host
micro-dv.ru: could not connect to host
micro-rain-systems.com: did not receive HSTS header
-microbiote-insectes-vecteurs.group: did not receive HSTS header
microblading.pe: could not connect to host
microdesic.com: could not connect to host
microme.ga: could not connect to host
@@ -9812,9 +10701,10 @@ miguksaram.com: could not connect to host
mijn-email.org: could not connect to host
mijndiad.nl: did not receive HSTS header
mijnkredietpaspoort.nl: could not connect to host
+mijnsite.ovh: could not connect to host
mika.cat: could not connect to host
mikadesign.se: did not receive HSTS header
-mikaela.info: did not receive HSTS header
+mikaela.info: max-age too low: 0
mikaelemilsson.net: did not receive HSTS header
mikeburns.com: could not connect to host
mikedugan.org: did not receive HSTS header
@@ -9826,11 +10716,10 @@ mikes.tk: could not connect to host
mikeybot.com: could not connect to host
mikii.club: could not connect to host
mikk.cz: could not connect to host
-mikkelscheike.com: could not connect to host
-mikkelvej.dk: could not connect to host
+mikori.sk: did not receive HSTS header
mikro-inwestycje.co.uk: did not receive HSTS header
miku.be: could not connect to host
-miku.hatsune.my: could not connect to host
+miku.hatsune.my: did not receive HSTS header
mikusinec.com: could not connect to host
milahendri.com: did not receive HSTS header
milang.xyz: could not connect to host
@@ -9840,13 +10729,17 @@ milesgeek.com: did not receive HSTS header
military-portal.cz: did not receive HSTS header
militarycarlot.com: did not receive HSTS header
militaryconsumer.gov: did not receive HSTS header
+milkingit.net: could not connect to host
millibitcoin.jp: could not connect to host
millionairessecrets.com: could not connect to host
+millions25.com: could not connect to host
+millions26.com: could not connect to host
+millions27.com: could not connect to host
millstep.de: did not receive HSTS header
milonga.tips: could not connect to host
mim.properties: could not connect to host
mimbeim.com: did not receive HSTS header
-mimm.gov: could not connect to host
+mimm.gov: did not receive HSTS header
mimoderoupa.pt: could not connect to host
min.kiwi: could not connect to host
minacssas.com: could not connect to host
@@ -9855,7 +10748,7 @@ mind.sh: did not receive HSTS header
mindbodycontinuum.com: could not connect to host
mindcell.no: could not connect to host
mindcraft.ga: could not connect to host
-mindturbo.com: did not receive HSTS header
+mindwork.space: could not connect to host
mine.world: could not connect to host
minecraft-forum.cf: could not connect to host
minecraft-forum.ga: could not connect to host
@@ -9877,7 +10770,6 @@ mingkyaa.com: could not connect to host
mingo.nl: max-age too low: 2592000
mingy.ddns.net: could not connect to host
mingyueli.com: could not connect to host
-minh.at: did not receive HSTS header
minhanossasenhora.com.br: could not connect to host
mini-piraten.de: did not receive HSTS header
minikneet.nl: did not receive HSTS header
@@ -9891,8 +10783,8 @@ minnesotadata.com: could not connect to host
minor.news: could not connect to host
minora.io: could not connect to host
minoris.se: did not receive HSTS header
+minorshadows.net: did not receive HSTS header
mintea-noua.ro: could not connect to host
-minu.link: could not connect to host
mipiaci.co.nz: did not receive HSTS header
mipiaci.com.au: did not receive HSTS header
miragrow.com: could not connect to host
@@ -9907,31 +10799,33 @@ miruc.co: did not receive HSTS header
mirucon.com: did not receive HSTS header
misconfigured.io: could not connect to host
miscreant.me: could not connect to host
-misericordiasegrate.org: could not connect to host
+misericordiasegrate.org: did not receive HSTS header
misgluteosperfectos.com: did not receive HSTS header
-misiondelosangeles-mailing.com: did not receive HSTS header
+misiondelosangeles-mailing.com: could not connect to host
misiru.jp: could not connect to host
-missguidedus.com: did not receive HSTS header
missrain.tw: could not connect to host
missycosmeticos.com.br: could not connect to host
mist.ink: could not connect to host
-mister.hosting: could not connect to host
+mister.hosting: did not receive HSTS header
misterl.net: did not receive HSTS header
misuzu.moe: could not connect to host
mitarbeiter-pc.de: did not receive HSTS header
mitchellrenouf.ca: could not connect to host
mitior.net: could not connect to host
mitm-software.badssl.com: could not connect to host
+mitsign.com: could not connect to host
mittenhacks.com: could not connect to host
+mityinc.com: did not receive HSTS header
miukimodafeminina.com: could not connect to host
mivcon.net: could not connect to host
-mixer.cash: did not receive HSTS header
+mixer.cash: could not connect to host
miya.io: could not connect to host
miyako-kyoto.jp: could not connect to host
miyoshi-kikaku.co.jp: could not connect to host
mizd.at: could not connect to host
mizi.name: could not connect to host
-mjcaffarattilaw.com: could not connect to host
+mjcaffarattilaw.com: did not receive HSTS header
+mjec.net: could not connect to host
mjhsc.nl: did not receive HSTS header
mk-dizajn.com: could not connect to host
mkacg.com: could not connect to host
@@ -9939,9 +10833,10 @@ mkakh.xyz: could not connect to host
mkfs.be: could not connect to host
mkfs.fr: could not connect to host
mkg-palais-hanau.de: did not receive HSTS header
-mkg-wiebelskirchen.de: could not connect to host
+mkie.cf: could not connect to host
mkp-deutschland.de: did not receive HSTS header
mkplay.io: could not connect to host
+mkuznets.com: could not connect to host
mkw.st: could not connect to host
mlcambiental.com.br: did not receive HSTS header
mlcdn.co: could not connect to host
@@ -9952,21 +10847,25 @@ mlpvc-rr.ml: did not receive HSTS header
mlrslateroofing.com.au: did not receive HSTS header
mlsrv.de: could not connect to host
mm-wife.com: could not connect to host
+mmaps.ddns.net: could not connect to host
mmarnitz.de: could not connect to host
mmcc.pe: did not receive HSTS header
mmgazhomeloans.com: did not receive HSTS header
mmilog.hu: could not connect to host
mmmm.com: could not connect to host
mmstick.tk: could not connect to host
+mnd.sc: could not connect to host
mnec.io: could not connect to host
mneeb.de: could not connect to host
mnemotiv.com: could not connect to host
mnetworkingsolutions.co.uk: could not connect to host
mnmt.no: did not receive HSTS header
mnwt.nl: could not connect to host
+mo3.club: could not connect to host
moar.so: did not receive HSTS header
moas.design: did not receive HSTS header
moas.photos: did not receive HSTS header
+mobag.ru: did not receive HSTS header
mobaircon.com: did not receive HSTS header
mobile-gesundheit.org: could not connect to host
mobile.eti.br: could not connect to host
@@ -9991,6 +10890,7 @@ mockmyapp.com: could not connect to host
mocloud.eu: could not connect to host
mocloud.win: could not connect to host
mocsuite.club: could not connect to host
+mocurio.com: could not connect to host
modalrakyat.com: could not connect to host
modalrakyat.id: did not receive HSTS header
modaperuimport.com: could not connect to host
@@ -9999,8 +10899,8 @@ modded-minecraft-server-list.com: could not connect to host
moddedark.com: could not connect to host
mode-marine.com: could not connect to host
modecaso.com: could not connect to host
-modehaus-marionk.de: could not connect to host
model9.io: did not receive HSTS header
+modeldimension.com: could not connect to host
modelsclub.org.ua: could not connect to host
modemagazines.co.uk: could not connect to host
moderatortv.de: did not receive HSTS header
@@ -10010,19 +10910,17 @@ mododo.de: could not connect to host
modx.by: max-age too low: 31536
modx.io: could not connect to host
modydev.club: could not connect to host
-moe-max.jp: could not connect to host
moe.pe: could not connect to host
+moe.wtf: could not connect to host
moe4sale.in: did not receive HSTS header
moebel-nagel.de: did not receive HSTS header
moebel-vergleichen.com: did not receive HSTS header
moefi.xyz: could not connect to host
moegirl.org: did not receive HSTS header
-moehrke.cc: could not connect to host
moellers.it: could not connect to host
moeloli.pw: did not receive HSTS header
moelord.org: could not connect to host
moen.io: did not receive HSTS header
-moeqing.net: could not connect to host
moevenpick-cafe.com: did not receive HSTS header
moeyun.net: could not connect to host
mogry.net: did not receive HSTS header
@@ -10033,10 +10931,8 @@ moitur.com: did not receive HSTS header
mojapraca.sk: did not receive HSTS header
mojefilmy.xyz: could not connect to host
mojizuri.jp: max-age too low: 86400
-mojnet.eu: could not connect to host
-mojnet.net: could not connect to host
-mojoco.co.za: could not connect to host
mokadev.com: did not receive HSTS header
+mokken-fabriek.nl: did not receive HSTS header
molokai.org: could not connect to host
mols.me: could not connect to host
momento.co.id: did not receive HSTS header
@@ -10046,14 +10942,12 @@ mommelonline.de: could not connect to host
momoka.moe: could not connect to host
mon-a-lisa.com: did not receive HSTS header
mon-mobile.com: did not receive HSTS header
-mona-antenna.com: did not receive HSTS header
mona.lu: could not connect to host
monalisa.wtf: could not connect to host
monarca.systems: could not connect to host
monasterialis.eu: could not connect to host
monautoneuve.fr: did not receive HSTS header
mondar.io: could not connect to host
-mondedie.fr: could not connect to host
mondopoint.com: did not receive HSTS header
mondwandler.de: could not connect to host
moneoci.com.br: could not connect to host
@@ -10062,18 +10956,17 @@ moneycrownmedia.com: could not connect to host
moneyfactory.gov: did not receive HSTS header
mongla168.net: could not connect to host
mongla88.net: could not connect to host
-monicabeckstrom.no: could not connect to host
+monicabeckstrom.no: did not receive HSTS header
monika-sokol.de: did not receive HSTS header
+monique.io: could not connect to host
monitaure.io: could not connect to host
-monitman.com: did not receive HSTS header
monitman.solutions: could not connect to host
monitorchain.com: did not receive HSTS header
monitori.ng: could not connect to host
-monkieteel.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+monkieteel.nl: max-age too low: 2592000
monochrometoys.com: could not connect to host
-monodukuri.cafe: did not receive HSTS header
-monodzukuri.cafe: did not receive HSTS header
-monokoo.com: could not connect to host
+monodukuri.cafe: could not connect to host
+monodzukuri.cafe: could not connect to host
monotsuku.com: could not connect to host
monozukuri.cafe: did not receive HSTS header
montanacures.org: could not connect to host
@@ -10081,18 +10974,18 @@ montanana.com: did not receive HSTS header
montand.com: did not receive HSTS header
monteurzimmerfrei.de: could not connect to host
montonicms.com: could not connect to host
-moo.pet: did not receive HSTS header
+moo.pet: could not connect to host
moobo.xyz: did not receive HSTS header
moodifiers.com: could not connect to host
moon.lc: could not connect to host
-moonchart.co.uk: did not receive HSTS header
+moonagic.io: could not connect to host
moonless.net: could not connect to host
moonloupe.com: could not connect to host
+moonrhythm.info: could not connect to host
moonrhythm.io: did not receive HSTS header
moonysbouncycastles.co.uk: could not connect to host
moosemanstudios.com: could not connect to host
moov.is: could not connect to host
-moovablestorage.com: could not connect to host
moparcraft.com: could not connect to host
moparcraft.org: could not connect to host
moparisthebest.biz: could not connect to host
@@ -10102,29 +10995,33 @@ mopsuite.club: could not connect to host
mor.cloud: could not connect to host
mor.gl: could not connect to host
mordrum.com: could not connect to host
+morenci.ch: could not connect to host
+morepopcorn.co.nz: did not receive HSTS header
moreserviceleads.com: did not receive HSTS header
+morespacestorage.com.au: did not receive HSTS header
morethanadream.lv: could not connect to host
morfitronik.pl: could not connect to host
morganestes.com: max-age too low: 0
morganino.eu: could not connect to host
morningcalculation.com: could not connect to host
morninglory.com: did not receive HSTS header
-mornings.com: could not connect to host
+mornings.com: did not receive HSTS header
morotech.com.br: did not receive HSTS header
morpheusx.at: could not connect to host
morpheusxaut.net: could not connect to host
morpork.xyz: could not connect to host
+morrodafumacanoticias.com.br: did not receive HSTS header
morz.org: max-age too low: 0
mosaique-lachenaie.fr: could not connect to host
moskva.guide: did not receive HSTS header
moso.io: did not receive HSTS header
mostlyharmless.at: could not connect to host
+mostlyinfinite.com: did not receive HSTS header
mostwuat.com: could not connect to host
motherbase.io: could not connect to host
motherboard.services: could not connect to host
motionfreight.com: could not connect to host
motionpicturesolutions.com: did not receive HSTS header
-motocollection.pl: did not receive HSTS header
motocyklovedily.cz: did not receive HSTS header
motomorgen.com: could not connect to host
motorbiketourhanoi.com: could not connect to host
@@ -10136,26 +11033,28 @@ motovio.de: did not receive HSTS header
motransportinfo.com: did not receive HSTS header
mottvd.com: could not connect to host
moube.fr: could not connect to host
+moucloud.cn: did not receive HSTS header
moudicat.com: max-age too low: 6307200
moula.com.au: did not receive HSTS header
moumaobuchiyu.com: could not connect to host
mountainadventureseminars.com: did not receive HSTS header
mountainmusicpromotions.com: did not receive HSTS header
movabletype.net: max-age too low: 3600
+moveek.com: did not receive HSTS header
moveltix.net: could not connect to host
movepin.com: could not connect to host
-movie4k.fyi: could not connect to host
+movie4k.fyi: max-age too low: 0
movie4k.life: could not connect to host
movie4kto.site: could not connect to host
moviedollars.com: could not connect to host
movienang.com: max-age too low: 0
moviesabout.net: could not connect to host
-moviespur.info: did not receive HSTS header
moving-pixtures.de: could not connect to host
movingoklahoma.org: could not connect to host
movio.ga: could not connect to host
mowalls.net: could not connect to host
moy-gorod.od.ua: did not receive HSTS header
+moysovet.info: did not receive HSTS header
moyu.host: did not receive HSTS header
mozart-game.cz: could not connect to host
mozartgame.cz: could not connect to host
@@ -10167,10 +11066,14 @@ mp3donusturucu.com: did not receive HSTS header
mp3donusturucu.net: did not receive HSTS header
mp3gratuiti.com: could not connect to host
mp3juices.is: could not connect to host
+mpe.org: did not receive HSTS header
mpi-sa.fr: did not receive HSTS header
mpkossen.com: did not receive HSTS header
mpn.poker: did not receive HSTS header
+mpreserver.com: could not connect to host
mpserver12.org: could not connect to host
+mpu-giessen.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+mqas.net: could not connect to host
mr-coffee.net: could not connect to host
mr-hosting.com: could not connect to host
mrafrohead.com: could not connect to host
@@ -10181,17 +11084,16 @@ mrdleisure.co.uk: did not receive HSTS header
mredsanders.net: did not receive HSTS header
mrettich.org: did not receive HSTS header
mrhc.ru: could not connect to host
-mrhee.com: did not receive HSTS header
mrizzio.com: could not connect to host
mrksk.com: could not connect to host
mrleonardo.com: did not receive HSTS header
mrliu.me: could not connect to host
+mrmoregame.de: could not connect to host
mrnh.tk: could not connect to host
mrnonz.com: max-age too low: 0
mrparker.pw: did not receive HSTS header
mrpopat.in: did not receive HSTS header
mrpropop.com: max-age too low: 0
-mrs-shop.com: did not receive HSTS header
mruganiepodspacja.pl: could not connect to host
msc-seereisen.net: could not connect to host
msgallery.tk: could not connect to host
@@ -10205,6 +11107,7 @@ mszaki.com: did not receive HSTS header
mt.me.uk: could not connect to host
mtamaki.com: could not connect to host
mtau.com: max-age too low: 2592000
+mtb.wtf: could not connect to host
mtcgf.com: did not receive HSTS header
mtd.ovh: could not connect to host
mtdn.jp: could not connect to host
@@ -10212,6 +11115,8 @@ mtfgnettoyage.fr: could not connect to host
mtg-esport.de: did not receive HSTS header
mtirc.co: could not connect to host
mtn.cc: could not connect to host
+mtr.md: could not connect to host
+mtravelers.net: could not connect to host
mu3on.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
muchohentai.com: could not connect to host
muffet.pw: could not connect to host
@@ -10221,19 +11126,18 @@ mujadin.se: did not receive HSTS header
mulenvo.com: did not receive HSTS header
mulheres18.com: could not connect to host
mullen.net.au: did not receive HSTS header
-multiterm.org: did not receive HSTS header
+multiterm.org: could not connect to host
multivpn.cn.com: could not connect to host
multivpn.com.de: could not connect to host
multivpn.com.ua: could not connect to host
multivpn.fr: could not connect to host
multiworldsoftware.com: did not receive HSTS header
+multizone.games: could not connect to host
mumei.space: did not receive HSTS header
mundoadulto.com.br: did not receive HSTS header
mundoalpha.com.br: did not receive HSTS header
-mundodapoesia.com: did not receive HSTS header
munecoscabezones.com: did not receive HSTS header
-munich-rage.de: did not receive HSTS header
-munirajiwa.com: could not connect to host
+munich-rage.de: could not connect to host
munkiepus.com: did not receive HSTS header
munpanel.com: could not connect to host
munrabi.com: could not connect to host
@@ -10250,14 +11154,16 @@ murraycolin.org: could not connect to host
murrayrun.com: did not receive HSTS header
mursu.directory: could not connect to host
murz.tv: could not connect to host
-murzik.space: could not connect to host
muscleangels.com: could not connect to host
+musearchengine.com: could not connect to host
museminder2.com: did not receive HSTS header
-museumstreak.com: did not receive HSTS header
+museumstreak.com: could not connect to host
musewearflipflops.com: could not connect to host
mushman.tk: could not connect to host
mushroomandfern.com: could not connect to host
musi.cx: could not connect to host
+musicaconleali.it: did not receive HSTS header
+musiccitycats.com: did not receive HSTS header
musikkfondene.no: did not receive HSTS header
musikzug-bookholzberg.de: did not receive HSTS header
muslimbanter.co.za: could not connect to host
@@ -10271,6 +11177,7 @@ mvanmarketing.nl: did not receive HSTS header
mvnet.com.br: did not receive HSTS header
mvsecurity.nl: could not connect to host
mwalz.com: could not connect to host
+mwohlfarth.de: did not receive HSTS header
mxawei.cn: could not connect to host
mxlife.org: could not connect to host
my-demo.co: could not connect to host
@@ -10290,13 +11197,13 @@ mybboard.pl: could not connect to host
mybudget.xyz: could not connect to host
mybuilderinlondon.co.uk: did not receive HSTS header
mybusiness.cm: did not receive HSTS header
-mycamda.com: could not connect to host
mychocolateweightloss.com: could not connect to host
myclientsplus.com: did not receive HSTS header
mycollab.net: could not connect to host
mycolorado.gov: could not connect to host
mycontrolmonitor.com: could not connect to host
mycoted.com: did not receive HSTS header
+mycuco.it: did not receive HSTS header
myday.eu.com: did not receive HSTS header
mydeos.com: could not connect to host
mydigipass.com: did not receive HSTS header
@@ -10306,11 +11213,12 @@ mydnatest.com: did not receive HSTS header
mydriversedge.com: did not receive HSTS header
myepass.bg: could not connect to host
myepass.de: could not connect to host
+myessaygeek.com: could not connect to host
myfappening.org: could not connect to host
myfdic.gov: could not connect to host
+myfishpalace.at: could not connect to host
myfunworld.de: could not connect to host
-mygalgame.com: could not connect to host
-mygate.at: did not receive HSTS header
+mygalgame.com: did not receive HSTS header
mygaysitges.com: could not connect to host
mygivingcircle.org: did not receive HSTS header
mygooder.com: did not receive HSTS header
@@ -10319,25 +11227,22 @@ mygpsite.com: did not receive HSTS header
mygreatjob.eu: could not connect to host
myhair.asia: did not receive HSTS header
myhloli.com: did not receive HSTS header
-myhostname.net: did not receive HSTS header
myicare.org: did not receive HSTS header
myiocc.org: did not receive HSTS header
myip.tech: max-age too low: 2592000
-myipaddr.de: did not receive HSTS header
+myjumpsuit.de: did not receive HSTS header
mykolab.com: did not receive HSTS header
mykreuzfahrt.de: could not connect to host
mylene-chandelier.me: did not receive HSTS header
mylighthost.com: did not receive HSTS header
mylocalsearch.co.uk: did not receive HSTS header
mylotto.co.nz: could not connect to host
-mymadina.com: could not connect to host
mymp3singer.co: could not connect to host
mymp3singer.net: could not connect to host
mymp3singer.site: could not connect to host
mymsr.de: did not receive HSTS header
myndcommunication.com: could not connect to host
mynetblog.com: did not receive HSTS header
-mynetworkingbuddy.com: could not connect to host
mynewleaf.co: did not receive HSTS header
mynewselfbariatrics.com: did not receive HSTS header
myni.io: could not connect to host
@@ -10355,6 +11260,7 @@ mypanier.com: max-age too low: 7889238
mypaperwriter.com: could not connect to host
myparfumerie.at: did not receive HSTS header
mypension.ca: could not connect to host
+myperfumecollection.com: did not receive HSTS header
myphonebox.de: could not connect to host
myptsite.com: could not connect to host
myqdu.cn: could not connect to host
@@ -10365,25 +11271,30 @@ myrig.ru: did not receive HSTS header
myrsa.in: did not receive HSTS header
myruststats.com: could not connect to host
mysa.is: could not connect to host
+mysber.ru: did not receive HSTS header
mysecretrewards.com: could not connect to host
+mysongbird.xyz: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
myspa.asia: did not receive HSTS header
mystery-science-theater-3000.de: did not receive HSTS header
mysteryblog.de: did not receive HSTS header
+mysterysear.ch: could not connect to host
mystown.org: could not connect to host
mystudy.me: could not connect to host
mytc.fr: could not connect to host
-mythemeshop.com: did not receive HSTS header
mythlogic.com: did not receive HSTS header
mythslegendscollection.com: did not receive HSTS header
mytravelblog.de: could not connect to host
mywallets.io: could not connect to host
myweb360.de: did not receive HSTS header
+mywebinar.io: could not connect to host
+mywebpanel.nl: did not receive HSTS header
myxbox.gr: max-age too low: 0
-myzone.com: did not receive HSTS header
+myzone.com: max-age too low: 0
+mziulu.me: could not connect to host
mzlog.win: could not connect to host
mzorn.photography: could not connect to host
n-rickroll-e.pw: could not connect to host
-n-x.info: did not receive HSTS header
+n-x.info: could not connect to host
n0099.cf: did not receive HSTS header
n0rm.ru: could not connect to host
n0s.de: did not receive HSTS header
@@ -10392,32 +11303,29 @@ n2x.in: could not connect to host
n3twork.net: could not connect to host
n4l.pw: could not connect to host
n64chan.me: did not receive HSTS header
-n8ch.net: could not connect to host
+n7.education: did not receive HSTS header
na.hn: did not receive HSTS header
naano.org: could not connect to host
nabru.co.uk: did not receive HSTS header
nabu-bad-nauheim.de: did not receive HSTS header
nabytko.cz: could not connect to host
-nacktetatsachen.at: did not receive HSTS header
nadia.pt: could not connect to host
nagaragem.com.br: did not receive HSTS header
nagios.by: did not receive HSTS header
nagoya-kyuyo.com: could not connect to host
naiaspa.fr: did not receive HSTS header
naiharngym.com: did not receive HSTS header
-nailedithomebuilders.com: could not connect to host
+nailedithomebuilders.com: max-age too low: 300
nais.me: did not receive HSTS header
najedlo.sk: could not connect to host
-nakada4610.com: could not connect to host
nakamastreamingcommunity.com: could not connect to host
-nakanishi-paint.com: could not connect to host
nakhonidc.com: could not connect to host
nakitbonus2.com: could not connect to host
nakliyatsirketi.biz: could not connect to host
nakuro.de: could not connect to host
nalao-company.com: did not receive HSTS header
nalifornia.com: could not connect to host
-nalinux.cz: did not receive HSTS header
+nalinux.cz: could not connect to host
nallon.com.br: could not connect to host
nalukfitness.com.br: could not connect to host
namacindia.com: did not receive HSTS header
@@ -10427,14 +11335,15 @@ nameme.xyz: could not connect to host
nametaken-cloud.duckdns.org: could not connect to host
namethatbone.com: could not connect to host
namethatporn.com: could not connect to host
+nami.exchange: could not connect to host
namikawatetsuji.jp: could not connect to host
-namorico.me: could not connect to host
+namorico.me: max-age too low: 0
+namuwikiusercontent.com: could not connect to host
nan.ci: did not receive HSTS header
nan.zone: could not connect to host
nanami.moe: could not connect to host
nanderson.me: could not connect to host
nanfangstone.com: could not connect to host
-nani.io: did not receive HSTS header
naniki.co.uk: could not connect to host
nanogeneinc.com: could not connect to host
nanokamo.com: did not receive HSTS header
@@ -10445,27 +11354,32 @@ naoar.com: could not connect to host
naphex.rocks: could not connect to host
napisynapomniky.cz: did not receive HSTS header
narach.com: did not receive HSTS header
-nargele.eu: could not connect to host
+nargele.eu: did not receive HSTS header
narko.space: could not connect to host
narodniki.com: did not receive HSTS header
narviz.com: did not receive HSTS header
nasarawanewsonline.com: could not connect to host
+nasme.tk: could not connect to host
nasmocopati.com: did not receive HSTS header
-nasralmabrooka.com: could not connect to host
+nasralmabrooka.com: did not receive HSTS header
nastysclaw.com: could not connect to host
natalia-fadeeva.ru: could not connect to host
natalia.io: did not receive HSTS header
natalieandjoshua.com: could not connect to host
natalt.org: did not receive HSTS header
natalydanilova.com: max-age too low: 300
+natanaelys.com: could not connect to host
nataniel-perissier.fr: could not connect to host
nate.sh: could not connect to host
natenom.com: max-age too low: 7200
natenom.de: max-age too low: 7200
natenom.name: max-age too low: 7200
nathanmfarrugia.com: did not receive HSTS header
+nationalbank.gov: did not receive HSTS header
+nationalbanknet.gov: did not receive HSTS header
nationalmall.gov: could not connect to host
nationwidevehiclecontracts.co.uk: did not receive HSTS header
+natropie.pl: could not connect to host
natur-udvar.hu: could not connect to host
natural-progesterone.net: could not connect to host
naturalcommission.com: could not connect to host
@@ -10475,16 +11389,20 @@ natuterra.com.br: could not connect to host
natuurbehangnederland.nl: could not connect to host
nauck.org: did not receive HSTS header
naudles.me: could not connect to host
+naughty.audio: could not connect to host
nav.jobs: could not connect to host
naval.tf: could not connect to host
+navegos.net: did not receive HSTS header
naviaddress.io: did not receive HSTS header
-navigate-it-services.de: did not receive HSTS header
naviteq.eu: could not connect to host
+navitime.me: did not receive HSTS header
navjobs.com: could not connect to host
nawroth.info: could not connect to host
nax.io: did not receive HSTS header
nay.moe: did not receive HSTS header
nba2kqq.com: could not connect to host
+nba669.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+nba686.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
nbb.io: could not connect to host
nbg-ha.de: could not connect to host
nbis.gov: could not connect to host
@@ -10492,36 +11410,40 @@ nbl.org.tw: could not connect to host
nbrown.us: could not connect to host
nbtparse.org: could not connect to host
nc2c.com: could not connect to host
-nc99.co: could not connect to host
ncc60205.info: could not connect to host
ncdesigns-studio.com: could not connect to host
nchristo.com: did not receive HSTS header
ncloud.freeddns.org: could not connect to host
+nclvle.co.uk: did not receive HSTS header
ncpc.gov: could not connect to host
ncpw.gov: did not receive HSTS header
ncrmnt.org: did not receive HSTS header
nct.org.uk: did not receive HSTS header
-nctx.co.uk: did not receive HSTS header
ndmath.club: could not connect to host
ndtblog.com: could not connect to host
ndtmarket.place: could not connect to host
-ne1home.dyndns.org: could not connect to host
+ne1home.dyndns.org: did not receive HSTS header
neap.io: could not connect to host
near.st: did not receive HSTS header
nearbiwa.com: did not receive HSTS header
nearon.nl: could not connect to host
neavision.de: did not receive HSTS header
+nebula.exchange: did not receive HSTS header
nebulousenhanced.com: could not connect to host
necesitodinero.org: could not connect to host
necio.ca: could not connect to host
nedcf.org.uk: could not connect to host
nediyor.com: did not receive HSTS header
nedwave.com: did not receive HSTS header
-nedzad.me: could not connect to host
-neels.ch: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+nedzad.me: max-age too low: 0
+needle.net.nz: could not connect to host
+needle.nz: could not connect to host
+neels.ch: did not receive HSTS header
neer.io: could not connect to host
+neet-investor.biz: could not connect to host
neftaly.com: did not receive HSTS header
neftebitum-kngk.ru: did not receive HSTS header
+negativecurvature.net: could not connect to host
negativzinsen.info: did not receive HSTS header
negraelinda.com: did not receive HSTS header
neilgreen.net: did not receive HSTS header
@@ -10535,15 +11457,16 @@ nellen.it: did not receive HSTS header
nemanja.top: did not receive HSTS header
nemno.de: could not connect to host
nemovement.org: could not connect to host
-nemplex.win: could not connect to host
neoani.me: did not receive HSTS header
+neocoding.com: did not receive HSTS header
neocyd.com: could not connect to host
+neodrive.ch: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
neofelhz.space: could not connect to host
neojames.me: could not connect to host
neonisi.com: could not connect to host
neonnuke.tech: did not receive HSTS header
neosolution.ca: did not receive HSTS header
-nephy.jp: could not connect to host
+nephos.xyz: could not connect to host
nercp.org.uk: did not receive HSTS header
nerd42.de: could not connect to host
nerdbox.cc: did not receive HSTS header
@@ -10552,6 +11475,8 @@ nerfroute.com: could not connect to host
neris.io: could not connect to host
neriumhcp.com: did not receive HSTS header
nesantuoka.lt: could not connect to host
+nesbase.com: could not connect to host
+nestedquotes.ca: could not connect to host
nestone.ru: could not connect to host
net-navi.cc: did not receive HSTS header
net-rencontre.com: did not receive HSTS header
@@ -10559,9 +11484,9 @@ net2o.com: did not receive HSTS header
net2o.de: did not receive HSTS header
net2o.net: did not receive HSTS header
net4it.de: did not receive HSTS header
+netba.net: could not connect to host
netbox.cc: could not connect to host
-netbrief.ml: did not receive HSTS header
-netbuzz.ru: could not connect to host
+netbrief.ml: could not connect to host
netde.jp: could not connect to host
netdego.jp: could not connect to host
netfs.pl: did not receive HSTS header
@@ -10569,6 +11494,7 @@ netguide.co.nz: did not receive HSTS header
netherwind.eu: did not receive HSTS header
netlilo.com: could not connect to host
netloanusa.com: could not connect to host
+netlocal.ru: could not connect to host
netmagik.com: did not receive HSTS header
netprofile.com.au: did not receive HSTS header
netresourcedesign.com: could not connect to host
@@ -10579,19 +11505,20 @@ netsparkercloud.com: did not receive HSTS header
netsystems.pro: could not connect to host
nettacompany.com.tr: did not receive HSTS header
nettefoundation.com: could not connect to host
+nettplusultra-rhone.fr: did not receive HSTS header
+networkmon.net: could not connect to host
networx-online.de: could not connect to host
netzbit.de: could not connect to host
netzpolitik.org: max-age too low: 2592000
netztest.at: did not receive HSTS header
netzvieh.de: could not connect to host
-netzzwerg4u.de: did not receive HSTS header
+netzzwerg4u.de: could not connect to host
neuch.info: did not receive HSTS header
neueonlinecasino2016.com: could not connect to host
neuralgic.net: could not connect to host
neuro-plus-100.com: could not connect to host
neuronfactor.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
neutralvehicle.com: did not receive HSTS header
-neva.li: could not connect to host
nevadafiber.net: could not connect to host
never-afk.de: did not receive HSTS header
neveta.com: could not connect to host
@@ -10601,6 +11528,7 @@ newbieboss.com: did not receive HSTS header
newbownerton.xyz: could not connect to host
newchance.store: could not connect to host
newcityinfo.info: could not connect to host
+newcreamforface.com: could not connect to host
newdeveloper.download: could not connect to host
newedivideo.it: could not connect to host
newfacialbeautycream.com: could not connect to host
@@ -10608,11 +11536,13 @@ newgenerationplus.org: could not connect to host
newhdmovies.io: could not connect to host
newline.online: did not receive HSTS header
newlooknow.com: did not receive HSTS header
-newmelalife.com: did not receive HSTS header
newparadigmventures.net: did not receive HSTS header
-newpoke.net: did not receive HSTS header
+newpathintegratedtherapy.com: did not receive HSTS header
+newpoke.net: could not connect to host
newportpropertygroup.com: could not connect to host
news4c.com: did not receive HSTS header
+newsaboutgames.de: could not connect to host
+newserumforskin.com: could not connect to host
newsquantified.com: max-age too low: 0
newstarnootropics.com: could not connect to host
newtnote.com: could not connect to host
@@ -10621,24 +11551,27 @@ newtonwarp.com: could not connect to host
nexgeneration-solutions.com: could not connect to host
nexlab.org: did not receive HSTS header
next-taxi.ru: could not connect to host
-next176.sk: did not receive HSTS header
next47.com: did not receive HSTS header
nextcloud.li: could not connect to host
nextcloud.org: could not connect to host
-nextend.org: could not connect to host
+nextend.net: could not connect to host
+nextend.org: did not receive HSTS header
nexth.de: could not connect to host
nexth.net: did not receive HSTS header
nexth.us: could not connect to host
nexthop.co.jp: could not connect to host
nexthop.co.th: did not receive HSTS header
+nextlevel-it.co.uk: could not connect to host
nextpages.de: could not connect to host
nextproject.us: could not connect to host
nextshutter.com: did not receive HSTS header
nexusbyte.de: could not connect to host
-nexuscorporation.in: could not connect to host
+nexuscorporation.in: did not receive HSTS header
nfhome.be: did not receive HSTS header
+nfls.io: did not receive HSTS header
nfluence.org: could not connect to host
nfo.so: could not connect to host
+nfrost.me: could not connect to host
ng-firewall.com: did not receive HSTS header
ng-security.com: could not connect to host
ngiemboon.net: could not connect to host
@@ -10655,7 +11588,8 @@ nhus.de: max-age too low: 172800
niallator.com: could not connect to host
nibiisclaim.com: could not connect to host
nicestresser.fr: could not connect to host
-nicholasperkins.io: could not connect to host
+nickcleans.co.uk: could not connect to host
+nicktheitguy.com: could not connect to host
nicky.io: did not receive HSTS header
nico.one: could not connect to host
nicoborghuis.nl: could not connect to host
@@ -10666,10 +11600,11 @@ nicolasdutour.com: did not receive HSTS header
nicolasklotz.de: did not receive HSTS header
nicoleoquendo.com: max-age too low: 2592000
niconiconi.xyz: could not connect to host
+nicoobook.com: did not receive HSTS header
+nicoobook.net: did not receive HSTS header
nicorevin.ru: could not connect to host
nidux.com: did not receive HSTS header
-niduxcomercial.com: could not connect to host
-niedersetz.de: did not receive HSTS header
+niedersetz.de: could not connect to host
nien.chat: could not connect to host
nien.com.tw: could not connect to host
nienfun.com: could not connect to host
@@ -10679,10 +11614,13 @@ nifpnet.nl: could not connect to host
nifume.com: could not connect to host
niggo.eu: could not connect to host
nightsnack.cf: could not connect to host
+nightwinds.tk: could not connect to host
niho.jp: did not receive HSTS header
-nikcub.com: could not connect to host
+nikcub.com: did not receive HSTS header
+niki.ai: did not receive HSTS header
nikksno.io: could not connect to host
niklas.host: could not connect to host
+niklasanderson.com: could not connect to host
niklaslindblad.se: did not receive HSTS header
nikobradshaw.com: could not connect to host
nikolaichik.photo: did not receive HSTS header
@@ -10697,13 +11635,15 @@ ninhs.org: could not connect to host
ninjan.co: did not receive HSTS header
ninjaspiders.com: could not connect to host
ninofink.com: could not connect to host
+ninreiei.jp: could not connect to host
+ninverse.com: did not receive HSTS header
niouininon.eu: could not connect to host
nippler.org: could not connect to host
nippombashi.net: did not receive HSTS header
nippon.fr: could not connect to host
-nipponcareers.com: did not receive HSTS header
+nipponcareers.com: could not connect to host
nirada.info: could not connect to host
-nirjharstudio.com: could not connect to host
+nirjharstudio.com: did not receive HSTS header
nirna.io: did not receive HSTS header
nirvanashop.com: could not connect to host
nishaswonderland.be: did not receive HSTS header
@@ -10711,6 +11651,7 @@ nishaswonderland.nl: did not receive HSTS header
nishikino-maki.com: could not connect to host
nishisbma.com: could not connect to host
nitaonline.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+niva.synology.me: could not connect to host
niveldron.com: could not connect to host
nixien.fr: could not connect to host
nixmag.net: could not connect to host
@@ -10720,13 +11661,15 @@ nkautoservice.nl: did not receive HSTS header
nkb.in.th: could not connect to host
nlegall.fr: did not receive HSTS header
nll.fi: could not connect to host
+nlrb.gov: did not receive HSTS header
nmadda.com: did not receive HSTS header
nmctest.net: could not connect to host
nmgb.ga: could not connect to host
nmgb.ml: could not connect to host
nmsnj.com: did not receive HSTS header
nmueller.at: could not connect to host
-nnote.net: could not connect to host
+nn78.com: did not receive HSTS header
+nnote.net: did not receive HSTS header
nnya.cat: could not connect to host
no17sifangjie.cc: could not connect to host
noc.wang: could not connect to host
@@ -10736,13 +11679,15 @@ noctinus.tk: could not connect to host
node-core-app.com: could not connect to host
nodebrewery.com: could not connect to host
nodechate.xyz: could not connect to host
-nodecompat.com: could not connect to host
+nodecompat.com: did not receive HSTS header
nodefiles.com: could not connect to host
nodefoo.com: could not connect to host
+nodelab-it.de: did not receive HSTS header
nodepanel.net: did not receive HSTS header
nodepositcasinouk.com: did not receive HSTS header
nodeselect.com: could not connect to host
nodespin.com: did not receive HSTS header
+nodesturut.cl: did not receive HSTS header
nodetemple.com: could not connect to host
nodi.at: did not receive HSTS header
nodum.io: did not receive HSTS header
@@ -10759,9 +11704,11 @@ nolberg.net: did not receive HSTS header
nolimits.net.nz: could not connect to host
nolimitsbook.de: did not receive HSTS header
nolte.work: could not connect to host
-nomagic.software: did not receive HSTS header
+nomagic.software: could not connect to host
+nomoondev.azurewebsites.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
nomorebytes.de: could not connect to host
nonemu.ninja: could not connect to host
+noobunbox.net: did not receive HSTS header
noodlecrave.com: did not receive HSTS header
noodlesandwich.com: did not receive HSTS header
noodleyum.com: did not receive HSTS header
@@ -10770,25 +11717,26 @@ nootropicsource.com: did not receive HSTS header
nope.website: could not connect to host
nopex.no: could not connect to host
nopol.de: could not connect to host
-norad.sytes.net: could not connect to host
norandom.com: could not connect to host
norb.at: could not connect to host
norden.eu.org: could not connect to host
nordic-survival.de: did not receive HSTS header
nordiccasinocommunity.com: did not receive HSTS header
nordlicht.photography: did not receive HSTS header
-nordseeblicke.de: did not receive HSTS header
noref.tk: could not connect to host
norge.guide: could not connect to host
normalady.com: could not connect to host
normanschwaneberg.de: did not receive HSTS header
+norrkemi.se: could not connect to host
north.supply: could not connect to host
northcutt.com: did not receive HSTS header
northernmuscle.ca: could not connect to host
northpennvwparts.com: max-age too low: 604800
+northwest-events.co.uk: could not connect to host
northwoodsfish.com: could not connect to host
nosbenevolesontdutalent.com: could not connect to host
nosecretshop.com: could not connect to host
+nosproduitsdequalite.fr: did not receive HSTS header
nossasenhoradaconceicao.com.br: did not receive HSTS header
nostraspace.com: could not connect to host
nosx.tk: could not connect to host
@@ -10797,6 +11745,7 @@ nota-web.com: could not connect to host
notablog.xyz: did not receive HSTS header
notadd.io: could not connect to host
notadd.store: could not connect to host
+notalone.gov: could not connect to host
notarankastojkovic.me: could not connect to host
notarobot.fr: did not receive HSTS header
notboring.co.uk: could not connect to host
@@ -10813,6 +11762,7 @@ notjustbitchy.com: could not connect to host
notonprem.com: could not connect to host
nottheonion.net: did not receive HSTS header
nottori.com: could not connect to host
+nottres.com: did not receive HSTS header
notypiesni.sk: did not receive HSTS header
nou.si: did not receive HSTS header
nouma.fr: did not receive HSTS header
@@ -10823,7 +11773,6 @@ novaco.in: max-age too low: 3600
novacraft.me: could not connect to host
novaopcaofestas.com.br: could not connect to host
novaorbis.org: could not connect to host
-novascan.net: could not connect to host
novatrucking.de: could not connect to host
novavoidhowl.com: did not receive HSTS header
novelabs.de: could not connect to host
@@ -10831,21 +11780,22 @@ novelabs.eu: could not connect to host
novelshouse.com: could not connect to host
novfishing.ru: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
novinhabucetuda.com: could not connect to host
-novinivo.com: did not receive HSTS header
novtest.ru: did not receive HSTS header
nowcost.com: could not connect to host
noworrywp.com: could not connect to host
nowprotein.com: did not receive HSTS header
nowremindme.com: could not connect to host
+noxi.ga: could not connect to host
nozoe.jp: could not connect to host
-npm.li: did not receive HSTS header
-npol.de: could not connect to host
+npm.li: could not connect to host
+npol.de: did not receive HSTS header
npool.org: could not connect to host
nq7.pl: could not connect to host
+nqesh.com: could not connect to host
nrc-gateway.gov: could not connect to host
nrechn.de: could not connect to host
nrizzio.me: could not connect to host
-nrnjn.xyz: could not connect to host
+nrnjn.xyz: did not receive HSTS header
nrvn.cc: did not receive HSTS header
nrvnastudios.com: could not connect to host
nsbfalconacademy.org: could not connect to host
@@ -10858,8 +11808,8 @@ nstyleintl.ca: did not receive HSTS header
nsure.us: could not connect to host
nsweb.solutions: could not connect to host
ntbs.pro: could not connect to host
-ntia.gov: could not connect to host
ntse.xyz: could not connect to host
+nu-pogodi.net: could not connect to host
nu3.at: did not receive HSTS header
nu3.ch: did not receive HSTS header
nu3.co.uk: could not connect to host
@@ -10887,8 +11837,9 @@ nullpro.com: could not connect to host
numericacu.com: did not receive HSTS header
numero-di-telefono.it: could not connect to host
numista.com: did not receive HSTS header
+numm.fr: did not receive HSTS header
nuovamoda.al: could not connect to host
-nup.pw: could not connect to host
+nup.pw: max-age too low: 0
nupef.org.br: did not receive HSTS header
nurserybook.co: did not receive HSTS header
nurture.be: did not receive HSTS header
@@ -10924,19 +11875,17 @@ nyuusannkinn.com: did not receive HSTS header
nz.search.yahoo.com: max-age too low: 172800
nzbs.io: could not connect to host
nzmk.cz: could not connect to host
-nzquakes.maori.nz: could not connect to host
+nzquakes.maori.nz: did not receive HSTS header
o-rickroll-y.pw: could not connect to host
o0o.one: did not receive HSTS header
oaksbloom.com: could not connect to host
oasis-conference.org.nz: could not connect to host
-oasis.mobi: could not connect to host
-oasisim.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+oasis.mobi: did not receive HSTS header
obdolbacca.ru: could not connect to host
oben.pl: did not receive HSTS header
oberam.de: could not connect to host
oberhof.co: could not connect to host
oberhofjuice.com: could not connect to host
-oberoi.de: max-age too low: 600000
objectif-leger.com: did not receive HSTS header
oblikdom.pro: did not receive HSTS header
oblikdom.ru: did not receive HSTS header
@@ -10948,18 +11897,17 @@ obsydian.org: could not connect to host
oc-minecraft.com: could not connect to host
ocad.com.au: did not receive HSTS header
ocapic.com: could not connect to host
-occ.gov: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+occ.gov: did not receive HSTS header
occasion-impro.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
occupymedia.org: could not connect to host
ochaken.cf: could not connect to host
ocmeulebeke.be: did not receive HSTS header
ocrami.us: did not receive HSTS header
-octal.es: could not connect to host
octanio.com: could not connect to host
octo.im: could not connect to host
octocat.ninja: could not connect to host
octod.tk: could not connect to host
-octofox.de: did not receive HSTS header
+octohost.net: did not receive HSTS header
oddmouse.com: could not connect to host
odin.xxx: could not connect to host
odinkapital.no: did not receive HSTS header
@@ -10979,24 +11927,24 @@ offerstone.cl: could not connect to host
offgames.pro: could not connect to host
office-ruru.com: could not connect to host
officeclub.com.mx: did not receive HSTS header
+officeprint.co.th: could not connect to host
offshore-firma.org: could not connect to host
offshore-unternehmen.com: could not connect to host
offshorefirma-gruenden.com: could not connect to host
-offshoremarineparts.com: did not receive HSTS header
offtherails.ie: could not connect to host
-ofggolf.com: could not connect to host
oficinadocelular.com.br: could not connect to host
ofo2.com: could not connect to host
oganek.ie: could not connect to host
-oganime.com: did not receive HSTS header
+oganime.com: could not connect to host
oggw.us: could not connect to host
-ogkw.de: could not connect to host
+ogkw.de: did not receive HSTS header
ogogoshop.com: could not connect to host
ogrodywstudniach.pl: did not receive HSTS header
ohayosoro.me: could not connect to host
+ohhdeertrade.com: did not receive HSTS header
ohm2013.org: did not receive HSTS header
ohma.ga: did not receive HSTS header
-ohnemusik.com: could not connect to host
+ohnemusik.com: max-age too low: 0
ohohrazi.com: did not receive HSTS header
ohreally.de: could not connect to host
ohsocool.org: did not receive HSTS header
@@ -11004,14 +11952,14 @@ oiepoie.nl: could not connect to host
oinky.ddns.net: could not connect to host
oishioffice.com: did not receive HSTS header
ojbk.eu: could not connect to host
-ojeremy.com: could not connect to host
+ojeremy.com: did not receive HSTS header
ojls.co: could not connect to host
-okad-center.de: could not connect to host
-okad.de: could not connect to host
-okad.eu: could not connect to host
+okad-center.de: did not receive HSTS header
+okad.de: did not receive HSTS header
+okad.eu: did not receive HSTS header
okane.love: could not connect to host
okashi.me: could not connect to host
-okaz.de: could not connect to host
+okaz.de: did not receive HSTS header
oklahomamoversassociation.org: could not connect to host
oklahomanotepro.com: could not connect to host
okok-rent.com: could not connect to host
@@ -11022,16 +11970,18 @@ olcso-vps-szerver.hu: could not connect to host
oldandyounglesbians.us: could not connect to host
oldschool-criminal.com: did not receive HSTS header
oldtimer-trifft-flugplatz.de: did not receive HSTS header
+olifant.fr: did not receive HSTS header
oliverdunk.com: did not receive HSTS header
+olivlabs.com: could not connect to host
ollehbizev.co.kr: could not connect to host
ols.io: did not receive HSTS header
olswangtrainees.com: could not connect to host
olympe-transport.fr: did not receive HSTS header
omacostudio.com: could not connect to host
-omar.yt: could not connect to host
-omarh.net: could not connect to host
+omarsuniagamusic.ga: did not receive HSTS header
omeuanimal.com: did not receive HSTS header
omgaanmetidealen.com: could not connect to host
+omifind.com: did not receive HSTS header
ominto.com: did not receive HSTS header
omise.co: did not receive HSTS header
ommahpost.com: did not receive HSTS header
@@ -11043,6 +11993,7 @@ omquote.gq: could not connect to host
omskit.ru: did not receive HSTS header
omyogarishikesh.com: did not receive HSTS header
on-te.ch: did not receive HSTS header
+onahonavi.com: could not connect to host
one-pe.com: did not receive HSTS header
onearth.one: did not receive HSTS header
oneb4nk.com: could not connect to host
@@ -11060,7 +12011,6 @@ onepluscamps.com: did not receive HSTS header
onepopstore.com: could not connect to host
onespiritinc.com: did not receive HSTS header
onet.space: could not connect to host
-onetech.it: did not receive HSTS header
onetly.com: could not connect to host
onetwentyseven001.com: did not receive HSTS header
onewebdev.info: could not connect to host
@@ -11070,15 +12020,16 @@ onguardonline.gov: did not receive HSTS header
oniichan.us: did not receive HSTS header
onionbot.ga: could not connect to host
onioncloud.org: could not connect to host
+onionplay.live: could not connect to host
onionsburg.com: could not connect to host
online-casino.eu: did not receive HSTS header
online-scene.com: did not receive HSTS header
-online-wetten.de: did not receive HSTS header
-online.swedbank.se: did not receive HSTS header
+online-wetten.de: could not connect to host
onlinebiller.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
onlinebillingform.com: could not connect to host
onlinecasinobluebook.com: could not connect to host
onlinecompliance.org: did not receive HSTS header
+onlinecorners.com: did not receive HSTS header
onlinedemo.hu: could not connect to host
onlinedeposit.us: could not connect to host
onlinekasino.de: did not receive HSTS header
@@ -11089,11 +12040,14 @@ onlinespielothek.com: did not receive HSTS header
onlinewetten.de: could not connect to host
only-roses.co.uk: did not receive HSTS header
only-roses.com: max-age too low: 2592000
+onlyesb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+onlyesb.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
onlyshopstation.com: did not receive HSTS header
onlyzero.net: could not connect to host
onmuvo.com: could not connect to host
onmyoji.biz: could not connect to host
onnee.ch: could not connect to host
+onnext.cc: did not receive HSTS header
ononpay.com: did not receive HSTS header
onovlena.dn.ua: could not connect to host
onpatient.com: did not receive HSTS header
@@ -11106,33 +12060,37 @@ ontheboard.com: did not receive HSTS header
onthecheap.store: could not connect to host
ontheten.org: did not receive HSTS header
ontimestamp.com: did not receive HSTS header
-onviga.de: could not connect to host
onwie.com: could not connect to host
onwie.fr: could not connect to host
onyxwall.com: could not connect to host
onyxwall.link: could not connect to host
onyxwall.net: could not connect to host
-oo.edu.rs: did not receive HSTS header
+oo.edu.rs: could not connect to host
ooeste.com: could not connect to host
ookjesprookje.nl: could not connect to host
ooooush.co.uk: could not connect to host
oopsmycase.com: could not connect to host
oopsorup.com: could not connect to host
+oosoo.org: could not connect to host
oost.io: could not connect to host
opatut.de: did not receive HSTS header
opcaobolsas.com.br: could not connect to host
-open-domotics.info: could not connect to host
open-future.be: did not receive HSTS header
open-mx.de: could not connect to host
open-to-repair.fr: max-age too low: 86400
+openacademies.com: could not connect to host
+openas.org: did not receive HSTS header
+openbankproject.com: did not receive HSTS header
openclub24.ru: could not connect to host
openconcept.no: did not receive HSTS header
openconnect.com.au: could not connect to host
opendesk.cc: did not receive HSTS header
openevic.info: could not connect to host
opengateway.fr: did not receive HSTS header
+openiocdb.com: could not connect to host
openmetals.com: could not connect to host
openmind-shop.de: did not receive HSTS header
+openmirrors.cf: could not connect to host
openpriv.pw: could not connect to host
openprovider.nl: did not receive HSTS header
openrainbow.org: could not connect to host
@@ -11147,12 +12105,13 @@ opentexon.com: did not receive HSTS header
openxmpp.com: could not connect to host
operad.fr: could not connect to host
opiates.net: did not receive HSTS header
-opic.gov: could not connect to host
opim.ca: did not receive HSTS header
opinion8td.com: did not receive HSTS header
opinionicentrifuga.it: could not connect to host
opinionipannolini.it: could not connect to host
opioids.com: could not connect to host
+oportunidadesemfoco.com.br: could not connect to host
+opp.ag: did not receive HSTS header
oppag.com.br: did not receive HSTS header
opperwall.net: could not connect to host
opposer.me: could not connect to host
@@ -11170,9 +12129,7 @@ optumrxhealthstore.com: could not connect to host
opunch.org: did not receive HSTS header
oracaodocredo.com.br: could not connect to host
orangekey.tk: could not connect to host
-orangenuts.in: could not connect to host
oranges.tokyo: could not connect to host
-orangetravel.eu: could not connect to host
oranic.com: did not receive HSTS header
orbiosales.com: could not connect to host
orbitcom.de: did not receive HSTS header
@@ -11182,11 +12139,12 @@ orcahq.com: did not receive HSTS header
order.one: could not connect to host
ordereat.fr: could not connect to host
orderlounge.de: did not receive HSTS header
-oref-idf.com: did not receive HSTS header
-oref-idf.net: did not receive HSTS header
-oref-idf.org: did not receive HSTS header
+oref-idf.com: could not connect to host
+oref-idf.net: could not connect to host
+oref-idf.org: could not connect to host
oregonmu.org: did not receive HSTS header
-orelavtomaster.ru: did not receive HSTS header
+oreka.online: could not connect to host
+orelavtomaster.ru: could not connect to host
orfeo-engineering.ch: could not connect to host
organic-superfood.net: could not connect to host
organicae.com: did not receive HSTS header
@@ -11198,22 +12156,26 @@ orionfcu.com: did not receive HSTS header
orionrebellion.com: did not receive HSTS header
orleika.ml: could not connect to host
oroweatorganic.com: could not connect to host
+ortho-graz.at: max-age too low: 86400
orthodoxy.lt: did not receive HSTS header
+ortodonciaian.com: did not receive HSTS header
orui.com.br: could not connect to host
-orum.in: could not connect to host
osaiyuwu.com: could not connect to host
oscarmashauri.com: did not receive HSTS header
oscillation-services.fr: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
oscloud.com.ua: could not connect to host
oscreen.me: could not connect to host
oscreen.org: could not connect to host
-oscsdp.cz: could not connect to host
+oscsdp.cz: did not receive HSTS header
osdls.gov: did not receive HSTS header
+osereso.tn: could not connect to host
osha-kimi.com: did not receive HSTS header
oshanko.de: could not connect to host
oshinagaki.jp: could not connect to host
oslfoundation.org: did not receive HSTS header
+osmestres.com: did not receive HSTS header
osp.cx: could not connect to host
+osquery.io: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
ossan-kobe-gourmet.com: did not receive HSTS header
ossbinaries.com: could not connect to host
osteammate.com: could not connect to host
@@ -11221,35 +12183,41 @@ ostendorf.com: did not receive HSTS header
osticketawesome.com: did not receive HSTS header
ostrov8.com: could not connect to host
oswaldmattgroup.com: did not receive HSTS header
+otako.pl: did not receive HSTS header
otakuworld.de: could not connect to host
-other98.com: did not receive HSTS header
+otakuyun.com: did not receive HSTS header
+otchecker.com: could not connect to host
othercode.nl: could not connect to host
otherkinforum.com: could not connect to host
-othermedia.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-otherstuff.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+othermedia.cc: could not connect to host
+otherstuff.nl: could not connect to host
otichi.com: did not receive HSTS header
otinane.eu: could not connect to host
otmns.net: could not connect to host
+otmo7.com: could not connect to host
otokonna.com: could not connect to host
otrsdemo.hu: did not receive HSTS header
+ottoproject.io: max-age too low: 900
ottospora.nl: could not connect to host
ourbank.com: max-age too low: 2592000
ourchoice2016.com: could not connect to host
-ourls.win: could not connect to host
-ourmaster.org: did not receive HSTS header
outdooradventures.pro: could not connect to host
outdoorproducts.com: max-age too low: 7889238
outreachbuddy.com: could not connect to host
outsider.im: could not connect to host
-outurnate.com: could not connect to host
+outurnate.com: did not receive HSTS header
ouvirmusica.com.br: did not receive HSTS header
ovenapp.io: did not receive HSTS header
over25tips.com: did not receive HSTS header
override.io: could not connect to host
-overrustle.com: could not connect to host
+overrustle.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
oversight.io: could not connect to host
+overstappen.nl: did not receive HSTS header
+overture.london: did not receive HSTS header
+ovpn.to: could not connect to host
ovuscloud.de: could not connect to host
ovwane.com: could not connect to host
+owall.ml: did not receive HSTS header
owennelson.me: max-age too low: 2592000
owensmith.website: could not connect to host
owlscrap.ru: could not connect to host
@@ -11260,7 +12228,8 @@ ownspec.com: could not connect to host
owothisdiz.pw: could not connect to host
oxanababy.com: could not connect to host
oxro.co: did not receive HSTS header
-oxro.io: did not receive HSTS header
+oxro.io: could not connect to host
+oxygaming.com: could not connect to host
oxygenabsorbers.com: did not receive HSTS header
oxymc.com: did not receive HSTS header
oxynux.fr: could not connect to host
@@ -11271,53 +12240,54 @@ ozoz.cc: could not connect to host
p-pc.de: could not connect to host
p-rickroll-o.pw: could not connect to host
p.linode.com: could not connect to host
+p1984.nl: could not connect to host
p1c.pw: could not connect to host
-p22.co: did not receive HSTS header
p2av.com: could not connect to host
p3.marketing: did not receive HSTS header
p3in.com: could not connect to host
p3ter.fr: did not receive HSTS header
p8r.de: did not receive HSTS header
paavolastudio.com: did not receive HSTS header
-pablo.im: could not connect to host
-pablo.scot: could not connect to host
-pablo.sh: could not connect to host
-pabloarteaga.co.uk: could not connect to host
-pabloarteaga.com: could not connect to host
-pabloarteaga.com.es: could not connect to host
-pabloarteaga.es: could not connect to host
-pabloarteaga.eu: could not connect to host
-pabloarteaga.info: could not connect to host
-pabloarteaga.me: could not connect to host
-pabloarteaga.name: could not connect to host
-pabloarteaga.net: could not connect to host
-pabloarteaga.nom.es: could not connect to host
-pabloarteaga.org: could not connect to host
-pabloarteaga.science: could not connect to host
-pabloarteaga.tech: could not connect to host
-pabloarteaga.uk: could not connect to host
-pabloarteaga.xyz: could not connect to host
+pablo.im: did not receive HSTS header
+pablo.scot: did not receive HSTS header
+pablo.sh: did not receive HSTS header
+pabloarteaga.co.uk: did not receive HSTS header
+pabloarteaga.com: did not receive HSTS header
+pabloarteaga.com.es: did not receive HSTS header
+pabloarteaga.es: did not receive HSTS header
+pabloarteaga.eu: did not receive HSTS header
+pabloarteaga.info: did not receive HSTS header
+pabloarteaga.me: did not receive HSTS header
+pabloarteaga.name: did not receive HSTS header
+pabloarteaga.net: did not receive HSTS header
+pabloarteaga.nom.es: did not receive HSTS header
+pabloarteaga.org: did not receive HSTS header
+pabloarteaga.science: did not receive HSTS header
+pabloarteaga.tech: did not receive HSTS header
+pabloarteaga.uk: did not receive HSTS header
+pabloarteaga.xyz: did not receive HSTS header
pablocamino.tk: could not connect to host
-pablofain.com: did not receive HSTS header
pablorey-art.com: did not receive HSTS header
pachaiyappas.org: did not receive HSTS header
packair.com: did not receive HSTS header
+packer.io: did not receive HSTS header
packetapp.ru: could not connect to host
packetcrash.net: could not connect to host
packlane.com: did not receive HSTS header
+packshot-creator.com: did not receive HSTS header
pacnetwork.io: could not connect to host
pacoda.de: could not connect to host
pactf-flag-4boxdpa21ogonzkcrs9p.com: could not connect to host
pactocore.org: could not connect to host
padeoe.com: did not receive HSTS header
pader-deko.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-padovani.de: could not connect to host
paestbin.com: could not connect to host
page: could not connect to host
+pagedesignshop.com: did not receive HSTS header
pagerate.io: could not connect to host
pages-tocaven.com: could not connect to host
pagetoimage.com: could not connect to host
-paginapolitica.ro: could not connect to host
+paginapolitica.ro: did not receive HSTS header
pagure.org: could not connect to host
pahnid.com: could not connect to host
paigeglass.com: did not receive HSTS header
@@ -11333,12 +12303,12 @@ paket.io: could not connect to host
paket.ml: did not receive HSTS header
paketkreditsuzuki.com: could not connect to host
paku.me: could not connect to host
-palariviera.com: could not connect to host
palationtrade.com: could not connect to host
palawan.jp: could not connect to host
palazzotalamo.it: did not receive HSTS header
+paleolowcarb.de: did not receive HSTS header
paleosquawk.com: could not connect to host
-pallet.io: did not receive HSTS header
+pallet.io: could not connect to host
palmer.im: could not connect to host
pammbook.com: did not receive HSTS header
pamplona.tv: could not connect to host
@@ -11348,10 +12318,14 @@ panama-gbs.com: did not receive HSTS header
panamaequity.com: did not receive HSTS header
panamateakforestry.com: did not receive HSTS header
panascais.io: could not connect to host
+panascais.net: did not receive HSTS header
pandapsy.com: could not connect to host
panelomix.net: did not receive HSTS header
pangci.xyz: could not connect to host
panicparts.com: max-age too low: 10540800
+panjee.com: did not receive HSTS header
+panjee.fr: did not receive HSTS header
+panlex.org: did not receive HSTS header
panni.me: could not connect to host
panoranordic.net: could not connect to host
panos.io: did not receive HSTS header
@@ -11367,20 +12341,23 @@ papercrunch.io: could not connect to host
paperhaven.com.au: max-age too low: 7889238
papermasters.com: could not connect to host
papersmart.net: could not connect to host
+paperwallets.io: could not connect to host
paperwork.co.za: could not connect to host
papierniak.net: could not connect to host
+papotage.net: could not connect to host
papygeek.com: could not connect to host
parabhairavayoga.com: did not receive HSTS header
paradiesgirls.ch: could not connect to host
paradise-engineers.com: could not connect to host
-paragon.edu: did not receive HSTS header
+paragon.edu: could not connect to host
parakranov.ru: did not receive HSTS header
paranormalweirdo.com: could not connect to host
+paranoxer.hu: could not connect to host
parav.xyz: did not receive HSTS header
pardnoy.com: could not connect to host
parent5446.us: could not connect to host
parentmail.co.uk: did not receive HSTS header
-parfum-baza.ru: could not connect to host
+parfum-baza.ru: did not receive HSTS header
paris-cyber.fr: did not receive HSTS header
parisdimanche.com: did not receive HSTS header
parishome.jp: could not connect to host
@@ -11394,11 +12371,18 @@ parksubaruoemparts.com: could not connect to host
parkwithark.com: could not connect to host
parodybit.net: did not receive HSTS header
parpaing-paillette.net: could not connect to host
-parteaga.com: could not connect to host
-parteaga.net: could not connect to host
+parquet-lascazes.fr: did not receive HSTS header
+partage.ovh: did not receive HSTS header
+parteaga.com: did not receive HSTS header
+parteaga.net: did not receive HSTS header
+parthkolekar.me: could not connect to host
+participatorybudgeting.de: did not receive HSTS header
+participatorybudgeting.info: did not receive HSTS header
particonpsplus.it: could not connect to host
partirkyoto.jp: did not receive HSTS header
-partnercardservices.com: could not connect to host
+partiwatch.com: could not connect to host
+partnerbeam.com: could not connect to host
+partnercardservices.com: did not receive HSTS header
partnerwerk.de: did not receive HSTS header
partyhaus.ovh: could not connect to host
partyhireformby.co.uk: did not receive HSTS header
@@ -11410,15 +12394,13 @@ partyvan.moe: could not connect to host
partyvan.nl: could not connect to host
partyvan.se: could not connect to host
pascalchristen.ch: did not receive HSTS header
-pasportaservo.org: did not receive HSTS header
+pasportaservo.org: could not connect to host
passpilot.co.uk: did not receive HSTS header
-passumpsicbank.com: did not receive HSTS header
passwd.io: did not receive HSTS header
password.codes: could not connect to host
passwordbox.com: did not receive HSTS header
passwordrevelator.net: did not receive HSTS header
passwordscon.com: could not connect to host
-pasta-factory.co.il: could not connect to host
pastaf.com: could not connect to host
pastdream.xyz: could not connect to host
paste.linode.com: could not connect to host
@@ -11434,6 +12416,7 @@ patfs.com: did not receive HSTS header
pathwaytofaith.com: could not connect to host
patientinsight.net: could not connect to host
patriaco.net: did not receive HSTS header
+patric-lenhart.de: could not connect to host
patrick.dark.name: could not connect to host
patrickbusch.net: could not connect to host
patrickneuro.de: could not connect to host
@@ -11444,7 +12427,6 @@ patterson.mp: could not connect to host
paul-kerebel.pro: could not connect to host
paul-schmidt.de: max-age too low: 0
paulbunyanmls.com: did not receive HSTS header
-paulchen.at: did not receive HSTS header
paulproell.at: did not receive HSTS header
paulrudge.codes: could not connect to host
paulshir.com: could not connect to host
@@ -11452,7 +12434,6 @@ paulshir.is: could not connect to host
paulyang.cn: did not receive HSTS header
paveljanda.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
pavelkahouseforcisco.com: did not receive HSTS header
-pavelstriz.cz: could not connect to host
pawsru.org: could not connect to host
paxdei.com.br: could not connect to host
paxwinkel.nl: could not connect to host
@@ -11468,6 +12449,7 @@ paymon.tj: could not connect to host
paypod.org: could not connect to host
payroll.ch: could not connect to host
paytwopay.com: could not connect to host
+payzwin.com: did not receive HSTS header
pb-design.ch: could not connect to host
pbapp.net: did not receive HSTS header
pbbr.com: did not receive HSTS header
@@ -11479,9 +12461,11 @@ pbytes.com: could not connect to host
pc-nf.de: did not receive HSTS header
pc-tweak.de: did not receive HSTS header
pcat.io: could not connect to host
+pcbricole.fr: could not connect to host
pcfun.net: did not receive HSTS header
pchax.net: could not connect to host
pchospital.cc: could not connect to host
+pcmedia.co.nz: did not receive HSTS header
pcvirusclear.com: could not connect to host
pdamsidoarjo.co.id: could not connect to host
pdevio.com: could not connect to host
@@ -11494,15 +12478,18 @@ peakapp.nl: could not connect to host
pebblesdemo.com: could not connect to host
pecot.fr: did not receive HSTS header
peekops.com: could not connect to host
+peen.ch: could not connect to host
peerherrmann.de: could not connect to host
-peerless.ae: could not connect to host
+peerless.ae: did not receive HSTS header
peinard.net: did not receive HSTS header
+peintrenomade.com: did not receive HSTS header
peirong.me: could not connect to host
peissen.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
pekkapikkarainen.fi: did not receive HSTS header
pekkarik.ru: could not connect to host
peliculasaudiolatinoonline.com: could not connect to host
peliseries24.com: could not connect to host
+peliweb.com: did not receive HSTS header
pemagrid.org: could not connect to host
pemberton.at: did not receive HSTS header
penablog.com: did not receive HSTS header
@@ -11520,11 +12507,13 @@ pensiunealido.ro: could not connect to host
pentagram.me: max-age too low: 2592000
pentano.net: could not connect to host
people-mozilla.org: could not connect to host
+peoplerange.com: did not receive HSTS header
+peoplesbankal.com: did not receive HSTS header
peperiot.com: did not receive HSTS header
pepper.dog: could not connect to host
pepperhead.com: did not receive HSTS header
pepperworldhotshop.de: did not receive HSTS header
-pepsicoemployeepreferencesurvey.com: could not connect to host
+pepsicoemployeepreferencesurvey.com: did not receive HSTS header
per-pedes.at: did not receive HSTS header
perdel.cn: could not connect to host
pereuda.com: could not connect to host
@@ -11533,13 +12522,14 @@ perfect-radiant-wrinkles.com: could not connect to host
perfectionis.me: could not connect to host
perfectionunite.com: could not connect to host
perfectseourl.com: did not receive HSTS header
+performancesantafe.org: did not receive HSTS header
performaride.com.au: did not receive HSTS header
performaterm.ro: could not connect to host
performous.org: did not receive HSTS header
perfumista.vn: did not receive HSTS header
periodismoactual.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
periscopeliveweb.com: could not connect to host
-perlwork.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+perlwork.nl: could not connect to host
pernatie.ru: could not connect to host
peromsik.com: did not receive HSTS header
perplex.nl: did not receive HSTS header
@@ -11557,11 +12547,13 @@ persson.im: could not connect to host
perthdevicelab.com: did not receive HSTS header
pestalozzishop.com.br: could not connect to host
pesto.video: could not connect to host
-pet-life.top: did not receive HSTS header
+pesyun.cn: max-age too low: 3600
pet-nsk.ru: could not connect to host
petangen.se: could not connect to host
petchart.net: could not connect to host
+peteboc.com: max-age too low: 0
peterfolta.net: could not connect to host
+peterkshultz.com: could not connect to host
petermazur.com: did not receive HSTS header
peternagy.ie: did not receive HSTS header
peters.consulting: could not connect to host
@@ -11571,10 +12563,7 @@ pethub.com: did not receive HSTS header
petit.site: could not connect to host
petlife.od.ua: could not connect to host
petplum.com: could not connect to host
-petrachuk.ru: could not connect to host
-petravdbos.nl: could not connect to host
petrkrapek.cz: did not receive HSTS header
-petrolplus.ru: max-age too low: 7776000
petrovsky.pro: could not connect to host
petsittersservices.com: could not connect to host
pettsy.com: did not receive HSTS header
@@ -11595,34 +12584,34 @@ pgpm.io: could not connect to host
pgregg.com: did not receive HSTS header
pgtb.be: could not connect to host
phalconist.com: could not connect to host
-pharmacie-fr.org: did not receive HSTS header
pharmgkb.org: could not connect to host
+pharynx.nl: could not connect to host
phcmembers.com: did not receive HSTS header
phcnetworks.net: did not receive HSTS header
phdsupply.com: could not connect to host
phdwuda.com: could not connect to host
-phelx.de: could not connect to host
-phenomeno-porto.com: could not connect to host
-phenomeno.nl: could not connect to host
-phenomenoporto.com: could not connect to host
-phenomenoporto.nl: could not connect to host
+phenomeno-porto.com: did not receive HSTS header
+phenomeno.nl: did not receive HSTS header
+phenomenoporto.com: did not receive HSTS header
+phenomenoporto.nl: did not receive HSTS header
philadelphiacandies.com: did not receive HSTS header
philadelphiadancefoundation.org: could not connect to host
philipmordue.co.uk: could not connect to host
philippa.cool: could not connect to host
phillippi.me: could not connect to host
phillmoore.com: did not receive HSTS header
-phillprice.com: could not connect to host
+phillprice.com: did not receive HSTS header
philonas.net: did not receive HSTS header
philpropertygroup.com: could not connect to host
phippsreporting.com: did not receive HSTS header
-phishing.rs: did not receive HSTS header
phoebe.co.nz: did not receive HSTS header
phoenicis.com.ua: did not receive HSTS header
phoenix.dj: did not receive HSTS header
+phoenixlogan.com: could not connect to host
phonenumberinfo.co.uk: could not connect to host
phongmay24h.com: could not connect to host
phood.be: could not connect to host
+photo.org.il: could not connect to host
photoblogverona.com: could not connect to host
photoboothpartyhire.co.uk: did not receive HSTS header
photographyforchange.com: could not connect to host
@@ -11631,11 +12620,12 @@ photops.fr: could not connect to host
photosoftware.nl: could not connect to host
phototag.org: did not receive HSTS header
php-bach.org: could not connect to host
+phpdistribution.com: did not receive HSTS header
phperformances.fr: did not receive HSTS header
phpfashion.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-phpinfo.in.th: could not connect to host
phr34kz.pw: did not receive HSTS header
phrasing.me: could not connect to host
+phryneas.de: did not receive HSTS header
phumin.in.th: did not receive HSTS header
phuong.faith: could not connect to host
pi-eng.fr: did not receive HSTS header
@@ -11643,11 +12633,12 @@ pianetaottica.eu: could not connect to host
pianetaottica.info: could not connect to host
pianetaottica.net: could not connect to host
pianetaottica.org: could not connect to host
-pias-button.net: did not receive HSTS header
+pias-button.net: could not connect to host
piasto.com.cy: could not connect to host
-piatanoua.md: could not connect to host
+piatanoua.md: did not receive HSTS header
picallo.es: could not connect to host
picardiascr.com: could not connect to host
+pickersurvey.org: could not connect to host
pickr.co: could not connect to host
picone.com.au: could not connect to host
picotronic.biz: could not connect to host
@@ -11657,17 +12648,18 @@ picshare.nz: could not connect to host
pidatacenters.com: did not receive HSTS header
pidomex.com: did not receive HSTS header
piedfeed.com: did not receive HSTS header
+pieinsurance.com: did not receive HSTS header
piekacz.co.uk: could not connect to host
+pieperhome.de: did not receive HSTS header
pierrejeansuau.fr: could not connect to host
pieterjangeeroms.me: could not connect to host
piggott.me.uk: did not receive HSTS header
-pigritia.de: could not connect to host
piils.fr: did not receive HSTS header
+pikalongwar.com: did not receive HSTS header
pikmy.com: could not connect to host
pilgermaske.org: did not receive HSTS header
piligrimname.com: could not connect to host
pillowandpepper.com: did not receive HSTS header
-pilotandy.com: could not connect to host
pilotcrowd.nl: did not receive HSTS header
pimpmymac.ru: did not receive HSTS header
pimpmypaper.com: could not connect to host
@@ -11677,31 +12669,35 @@ pin.net.au: did not receive HSTS header
pineapplesapp.com: did not receive HSTS header
pinebaylibrary.org: could not connect to host
pinesandneedles.com: max-age too low: 7889238
+pinigseu.xyz: could not connect to host
+pinkcasino.co.uk: did not receive HSTS header
pinkfis.ch: did not receive HSTS header
pinkhq.com: did not receive HSTS header
pinkinked.com: could not connect to host
-pinkyf.com: did not receive HSTS header
pinoylinux.org: did not receive HSTS header
+pinpointengineer.co.uk: could not connect to host
pintoselectrician.co.za: did not receive HSTS header
pioche.ovh: did not receive HSTS header
pippen.io: could not connect to host
pips.rocks: could not connect to host
pir9.com: did not receive HSTS header
+piranil.com: did not receive HSTS header
pirata.ga: did not receive HSTS header
pirateahoy.eu: could not connect to host
+piratebay.ml: could not connect to host
piratebit.tech: could not connect to host
piratedb.com: could not connect to host
piratedot.com: could not connect to host
piratelist.online: could not connect to host
piratenlogin.de: could not connect to host
piratepay.io: could not connect to host
-piratepay.ir: did not receive HSTS header
-pirateproxy.pe: could not connect to host
+piratepay.ir: could not connect to host
+pirateproxy.pe: max-age too low: 0
pirateproxy.sx: did not receive HSTS header
pirateproxy.vip: could not connect to host
pirati.cz: max-age too low: 604800
piratte.net: did not receive HSTS header
-pirganj24.com: did not receive HSTS header
+pirganj24.com: could not connect to host
pirlitu.com: did not receive HSTS header
pisexy.me: did not receive HSTS header
pisidia.de: could not connect to host
@@ -11719,17 +12715,19 @@ pixelesque.uk: could not connect to host
pixelgliders.de: could not connect to host
pixelhero.co.uk: did not receive HSTS header
pixelpoint.io: did not receive HSTS header
+pixelrain.info: could not connect to host
pixi.chat: could not connect to host
pixi.me: did not receive HSTS header
-pixlfox.com: could not connect to host
+pizala.de: could not connect to host
pizzadoc.ch: could not connect to host
-pj00100.com: did not receive HSTS header
+pj00100.com: could not connect to host
pj00200.com: did not receive HSTS header
pj00300.com: did not receive HSTS header
pj00400.com: did not receive HSTS header
pj00600.com: did not receive HSTS header
pj00700.com: did not receive HSTS header
-pj00800.com: could not connect to host
+pj00800.com: did not receive HSTS header
+pj009.com: did not receive HSTS header
pj00900.com: did not receive HSTS header
pj02.com: did not receive HSTS header
pj83.duckdns.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -11738,33 +12736,42 @@ pjbet.mg: could not connect to host
pjsec.tk: could not connect to host
pkautodesign.com: did not receive HSTS header
pkbjateng.or.id: could not connect to host
-pko.ch: could not connect to host
+pko.ch: did not receive HSTS header
pkschat.com: could not connect to host
+pksps.com: could not connect to host
plaasprodukte.com: could not connect to host
placefade.com: could not connect to host
placehold.co: did not receive HSTS header
placollection.org: could not connect to host
plaettliaktion.ch: did not receive HSTS header
plagiarismcheck.org: max-age too low: 604800
+plaintech.net.au: could not connect to host
+plaintray.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
plakbak.nl: could not connect to host
planbox.info: could not connect to host
planeexplanation.com: could not connect to host
+planetbeauty.com: did not receive HSTS header
planete-secu.com: could not connect to host
planetromeo.com: could not connect to host
+planformation.com: did not receive HSTS header
planktonholland.com: did not receive HSTS header
planpharmacy.com: could not connect to host
plant.ml: could not connect to host
plantroon.com: did not receive HSTS header
plass.hamburg: could not connect to host
+plasti-pac.ch: did not receive HSTS header
+plasticsurgeryservices.com: did not receive HSTS header
plasvilledescartaveis.com.br: could not connect to host
platform.lookout.com: could not connect to host
+platinumpeek.com: did not receive HSTS header
platomania.eu: did not receive HSTS header
play: could not connect to host
play.google.com: did not receive HSTS header (error ignored - included regardless)
playdreamcraft.com.br: did not receive HSTS header
playerhunter.com: did not receive HSTS header
-playflick.com: did not receive HSTS header
+playflick.com: could not connect to host
playkh.com: did not receive HSTS header
+playkinder.com: did not receive HSTS header
playmaker.io: did not receive HSTS header
playmaza.live: did not receive HSTS header
playmfe.com: could not connect to host
@@ -11772,8 +12779,8 @@ playsoundevents.be: could not connect to host
playsource.co: could not connect to host
playwhyyza.com: could not connect to host
playyou.be: could not connect to host
-please-deny.me: did not receive HSTS header
-pleasure-science.com: could not connect to host
+please-deny.me: could not connect to host
+pleaseuseansnisupportedbrowser.ml: could not connect to host
pleasure.forsale: could not connect to host
plen.io: could not connect to host
plexpy13.ddns.net: could not connect to host
@@ -11791,8 +12798,9 @@ ploup.net: could not connect to host
pluff.nl: did not receive HSTS header
plugboard.xyz: could not connect to host
pluggedhead.com: did not receive HSTS header
-plumbingboksburg.co.za: could not connect to host
+plumbingboksburg.co.za: did not receive HSTS header
plumbingman.com.au: did not receive HSTS header
+plumplat.com: did not receive HSTS header
plus-digital.net: did not receive HSTS header
plus-u.com.au: did not receive HSTS header
plus.sandbox.google.com: did not receive HSTS header (error ignored - included regardless)
@@ -11800,11 +12808,12 @@ plus1s.tk: could not connect to host
plushev.com: did not receive HSTS header
plussizereviews.com: could not connect to host
plut.org: did not receive HSTS header
-pluth.org: did not receive HSTS header
+plymouthglassgallery.com: did not receive HSTS header
plymouthsoftplay.co.uk: could not connect to host
pm13-media.cz: could not connect to host
pmac.pt: could not connect to host
pmbremer.de: could not connect to host
+pmbtf.com: could not connect to host
pmctire.com: did not receive HSTS header
pmemanager.fr: did not receive HSTS header
pmessage.ch: could not connect to host
@@ -11822,6 +12831,8 @@ pocloud.homelinux.net: could not connect to host
pocobelli.ch: did not receive HSTS header
podcast.style: could not connect to host
podiumsdiskussion.org: did not receive HSTS header
+poed.com.au: could not connect to host
+poeg.cz: did not receive HSTS header
pogoswine.com: could not connect to host
pogs.us: could not connect to host
poiema.com.sg: did not receive HSTS header
@@ -11836,6 +12847,7 @@ pokomichi.com: did not receive HSTS header
pol-expo.ru: could not connect to host
pol.in.th: could not connect to host
polandb2b.directory: could not connect to host
+polar-baer.com: could not connect to host
polarityschule.com: did not receive HSTS header
pole.net.nz: did not receive HSTS header
poleartschool.com: could not connect to host
@@ -11853,12 +12865,12 @@ polsport.live: did not receive HSTS header
polycoise.com: could not connect to host
polycrypt.us: could not connect to host
polypho.nyc: could not connect to host
-polysage.org: could not connect to host
+polysage.org: did not receive HSTS header
polytechecosystem.vc: could not connect to host
pomardaserra.com: could not connect to host
pomfe.co: could not connect to host
pompefunebrilariviera.it: could not connect to host
-pompompoes.com: did not receive HSTS header
+pompompoes.com: could not connect to host
pondof.fish: could not connect to host
poneytelecom.org: could not connect to host
ponteencima.com: could not connect to host
@@ -11876,21 +12888,29 @@ popi.se: did not receive HSTS header
popkins.cf: could not connect to host
popkins.ga: could not connect to host
popkins.gq: could not connect to host
-popkins.ml: could not connect to host
+popkins.ml: did not receive HSTS header
popkins.tk: could not connect to host
popupsoftplay.com: could not connect to host
poris.web.id: could not connect to host
porn77.info: could not connect to host
+pornalpha.com: could not connect to host
pornbay.org: could not connect to host
pornblog.org: could not connect to host
-porncandi.com: could not connect to host
+pornimg.net: could not connect to host
+pornless.biz: could not connect to host
+pornmax.net: could not connect to host
porno-gif.ru: could not connect to host
pornohub.su: could not connect to host
pornolab-net.appspot.com: could not connect to host
pornoserver.eu: could not connect to host
+pornport.org: could not connect to host
+pornsocket.com: could not connect to host
pornstars.me: did not receive HSTS header
+pornteddy.com: could not connect to host
+pornultra.net: could not connect to host
porschen.fr: could not connect to host
port.social: could not connect to host
+portalcarapicuiba.com: did not receive HSTS header
portale-randkowe.pl: did not receive HSTS header
portalhubnuti.cz: did not receive HSTS header
portalisapres.cl: could not connect to host
@@ -11899,9 +12919,12 @@ portalmundo.xyz: could not connect to host
portalplatform.net: could not connect to host
portaluniversalista.org: did not receive HSTS header
portalveneza.com.br: did not receive HSTS header
+portalzine.de: did not receive HSTS header
portefeuillesignalen.nl: did not receive HSTS header
+portraitsystem.biz: did not receive HSTS header
poshpak.com: max-age too low: 86400
positivesobrietyinstitute.com: did not receive HSTS header
+post.we.bs: did not receive HSTS header
post4me.at: could not connect to host
postback.io: did not receive HSTS header
postcardpayment.com: could not connect to host
@@ -11910,6 +12933,7 @@ posters.win: could not connect to host
postscheduler.org: could not connect to host
posylka.de: did not receive HSTS header
potatoheads.net: could not connect to host
+potatron.tech: could not connect to host
potbar.com: could not connect to host
potbox.com: could not connect to host
potenzmittelblog.info: could not connect to host
@@ -11920,16 +12944,17 @@ potsky.com: did not receive HSTS header
pouet.it: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
pouets.ovh: could not connect to host
poupatempo.org: did not receive HSTS header
+pour-la-culture-aulnay.fr: could not connect to host
pourmesloisirs.com: could not connect to host
pourout.org: did not receive HSTS header
poussinooz.fr: could not connect to host
povitria.net: could not connect to host
powaclub.com: max-age too low: 86400
-powdersnow.top: did not receive HSTS header
power-l.ch: did not receive HSTS header
power-of-interest.com: did not receive HSTS header
power99press.com: could not connect to host
powerb.ch: did not receive HSTS header
+powerdent.net.br: could not connect to host
powerentertainment.tv: could not connect to host
poweroff.win: could not connect to host
powerplannerapp.com: could not connect to host
@@ -11937,24 +12962,27 @@ powersergunited.com: could not connect to host
powersergunited.org: could not connect to host
powershellmagic.com: could not connect to host
powershift.ne.jp: did not receive HSTS header
+powertothebuilder.com: could not connect to host
powerxequality.com: did not receive HSTS header
poy-tech.com: could not connect to host
pozniak.at: did not receive HSTS header
pozyczka-bez-zaswiadczen.pl: did not receive HSTS header
pozytywnyplan.pl: could not connect to host
+pozzitiv.ro: could not connect to host
pozzo-balbi.com: did not receive HSTS header
ppembed.com: did not receive HSTS header
+ppoou.co.uk: could not connect to host
+ppoozl.com: could not connect to host
pppo.gov: could not connect to host
ppr-truby.ru: could not connect to host
ppsvcs2.com: did not receive HSTS header
ppuu.org: did not receive HSTS header
ppy3.com: did not receive HSTS header
-practicallabs.com: could not connect to host
-practicalprogrammer.tech: could not connect to host
practodev.com: could not connect to host
pratinav.xyz: could not connect to host
prattpokemon.com: could not connect to host
-praxis-research.info: did not receive HSTS header
+praxis-research.info: could not connect to host
+prazeresdavida.com.br: could not connect to host
precedecaritas.com.br: could not connect to host
precisionaeroimaging.com: did not receive HSTS header
prediksisydney.com: could not connect to host
@@ -11966,7 +12994,6 @@ prego-shop.de: could not connect to host
preio.cn: could not connect to host
preisser-it.de: did not receive HSTS header
preisser.it: did not receive HSTS header
-preissler.co.uk: could not connect to host
prekladysanca.cz: could not connect to host
prelist.org: did not receive HSTS header
premaritalsex.info: could not connect to host
@@ -11974,43 +13001,47 @@ premioambiente.it: did not receive HSTS header
premiumzweirad.de: max-age too low: 7776000
prepaidgirl.com: could not connect to host
prepandgo-euro.com: could not connect to host
+preposted.com: could not connect to host
preppertactics.com: did not receive HSTS header
preprodfan.gov: could not connect to host
prescriptionrex.com: did not receive HSTS header
+presentesdegrife.com.br: could not connect to host
presidentials2016.com: could not connect to host
press-anime-nenkan.com: did not receive HSTS header
press-presse.ca: did not receive HSTS header
-pressenews.net: did not receive HSTS header
+pressakey.de: did not receive HSTS header
+pressenews.net: could not connect to host
pressfreedomfoundation.org: did not receive HSTS header
+prestburyscouts.org.uk: did not receive HSTS header
prestigeeventshire.co.uk: could not connect to host
prestonapp.com: could not connect to host
prettygrouse.com: did not receive HSTS header
prettyphotoart.de: did not receive HSTS header
prettytunesapp.com: could not connect to host
+pretwolk.nl: could not connect to host
pretzlaff.info: did not receive HSTS header
preworkout.me: could not connect to host
prgslab.net: could not connect to host
priceholic.com: could not connect to host
-pridetechdesign.com: did not receive HSTS header
pridoc.se: did not receive HSTS header
prifo.se: could not connect to host
prijsvergelijken.ml: could not connect to host
prilock.com: did not receive HSTS header
primecaplending.com: could not connect to host
+primewho.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
primordialsnooze.com: could not connect to host
primotiles.co.uk: did not receive HSTS header
-prinbanat.ngo: did not receive HSTS header
-princeofwhales.com: did not receive HSTS header
-princesparktouch.com: did not receive HSTS header
+prinbanat.ngo: could not connect to host
+princeagency.com: could not connect to host
princessbackpack.de: could not connect to host
princessmargaretlotto.com: did not receive HSTS header
+principalsexam.com: could not connect to host
+principalstest.ph: could not connect to host
prinesdoma.at: did not receive HSTS header
printerest.io: could not connect to host
printersonline.be: did not receive HSTS header
printery.be: could not connect to host
printexpress.cloud: could not connect to host
-printfn.com: did not receive HSTS header
-printler.com: did not receive HSTS header
priolkar.com: could not connect to host
prism-communication.com: could not connect to host
pristineevents.co.uk: did not receive HSTS header
@@ -12021,20 +13052,29 @@ privacynow.eu: did not receive HSTS header
privacyrup.net: could not connect to host
privategiant.com: could not connect to host
privatstunden.express: could not connect to host
+privcloud.cc: could not connect to host
privcloud.org: could not connect to host
privilegevisa.fr: could not connect to host
+privu.me: could not connect to host
privytime.com: could not connect to host
-prmte.com: max-age too low: 2592000
+prmte.com: did not receive HSTS header
prnt.li: did not receive HSTS header
+pro-esb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+pro-esb.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
pro-image.de: did not receive HSTS header
pro-zone.com: could not connect to host
proact-it.co.uk: could not connect to host
proactive.run: could not connect to host
+procens.us: could not connect to host
procode.gq: could not connect to host
+procrastinatingengineer.co.uk: could not connect to host
prodpad.com: did not receive HSTS header
produccioneskm.cl: did not receive HSTS header
-productgap.com: could not connect to host
+productgap.com: did not receive HSTS header
+productived.net: did not receive HSTS header
producto8.com: did not receive HSTS header
+proesb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+proesb.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
profi-durchgangsmelder.de: did not receive HSTS header
profinetz.de: could not connect to host
profivps.com: could not connect to host
@@ -12045,7 +13085,6 @@ profusion.io: could not connect to host
progblog.net: max-age too low: 0
progolfjourney.com: could not connect to host
program-and.work: could not connect to host
-programmaticmagic.com: could not connect to host
programmingstudent.com: could not connect to host
progress-technologies.com: could not connect to host
progressivecfo.co.nz: could not connect to host
@@ -12058,17 +13097,19 @@ project-splash.com: max-age too low: 0
project-stats.com: could not connect to host
projectascension.io: could not connect to host
projectasterk.com: could not connect to host
+projectcastle.tech: did not receive HSTS header
projectdp.net: could not connect to host
projectherogames.xyz: could not connect to host
projectl1b1t1na.tk: could not connect to host
projectmercury.space: could not connect to host
projectte.ch: could not connect to host
projectvault.ovh: did not receive HSTS header
-projectx.top: did not receive HSTS header
+projectx.top: could not connect to host
projekt-umbriel.de: could not connect to host
projektik.cz: did not receive HSTS header
projektzentrisch.de: could not connect to host
projetoresecia.com: could not connect to host
+prok.pw: did not receive HSTS header
prokop.ovh: could not connect to host
promarketer.net: did not receive HSTS header
promecon-gmbh.de: did not receive HSTS header
@@ -12082,6 +13123,7 @@ prontocleaners.co.uk: could not connect to host
prontolight.com: did not receive HSTS header
prontomovers.co.uk: could not connect to host
propactrading.com: could not connect to host
+propagandism.org: did not receive HSTS header
propershave.com: could not connect to host
proplan.co.il: did not receive HSTS header
propmag.co: could not connect to host
@@ -12107,13 +13149,14 @@ proxbox.net: did not receive HSTS header
proxi.cf: could not connect to host
proximato.com: could not connect to host
proxybay.al: could not connect to host
-proxybay.club: could not connect to host
+proxybay.club: max-age too low: 0
proxybay.info: did not receive HSTS header
proxybay.top: could not connect to host
proxydesk.eu: could not connect to host
proxydesk.net: could not connect to host
proxyowl.pw: could not connect to host
proxyportal.me: could not connect to host
+proxyportal.net: could not connect to host
proxyportal.org: did not receive HSTS header
proxyrox.com: could not connect to host
proxyweb.us: did not receive HSTS header
@@ -12123,6 +13166,7 @@ prstatic.com: could not connect to host
pruikshop.nl: could not connect to host
prxio.date: could not connect to host
prxio.site: could not connect to host
+prytkov.com: could not connect to host
ps-qa.com: could not connect to host
ps-x.ru: could not connect to host
pscleaningsolutions.co.uk: could not connect to host
@@ -12135,20 +13179,21 @@ psncardplus.com: could not connect to host
psncardplus.dk: could not connect to host
psncardplus.nl: could not connect to host
psncardplus.se: could not connect to host
+pstrozniak.com: could not connect to host
pstudio.me: max-age too low: 0
psw.academy: could not connect to host
psw.consulting: could not connect to host
psychiatrie-betreuung.ch: did not receive HSTS header
-psylab.re: could not connect to host
-psylab.vip: could not connect to host
psynapse.net.au: could not connect to host
ptn.moscow: could not connect to host
ptonet.com: could not connect to host
ptrujillo.com: did not receive HSTS header
pub-online.ro: could not connect to host
+pubi.me: could not connect to host
pubkey.is: could not connect to host
publications.qld.gov.au: did not receive HSTS header
publicidadnovagrass.com.mx: could not connect to host
+publick.net: did not receive HSTS header
publicspeakingcamps.com: could not connect to host
publimepa.it: could not connect to host
publishingshack.com: did not receive HSTS header
@@ -12170,13 +13215,13 @@ punchr-kamikazee.rhcloud.com: could not connect to host
punchunique.com: did not receive HSTS header
punkdns.top: could not connect to host
puppydns.com: did not receive HSTS header
-purahealthyliving.com: could not connect to host
+purahealthyliving.com: did not receive HSTS header
purbd.com: did not receive HSTS header
pureessentialoil.biz: max-age too low: 300
pureholisticliving.me: could not connect to host
purewebmasters.com: could not connect to host
+purikore.com: could not connect to host
purplehippie.in: did not receive HSTS header
-purplez.pw: did not receive HSTS header
purpoz.com.br: could not connect to host
purpspc.com: could not connect to host
purrfectcams.com: did not receive HSTS header
@@ -12210,11 +13255,14 @@ q2.si: did not receive HSTS header
q8mp3.me: did not receive HSTS header
qadmium.tk: could not connect to host
qamrulhaque.com: could not connect to host
+qaz.cloud: did not receive HSTS header
qazcloud.com: could not connect to host
+qbeing.info: could not connect to host
qbik.de: did not receive HSTS header
qbin.io: did not receive HSTS header
qbnt.ca: could not connect to host
-qccqld.org.au: could not connect to host
+qbus.pl: could not connect to host
+qccqld.org.au: did not receive HSTS header
qe2homelottery.com: did not receive HSTS header
qensio.com: did not receive HSTS header
qforum.org: could not connect to host
@@ -12222,22 +13270,26 @@ qi0.de: did not receive HSTS header
qiannews.net: could not connect to host
qifu.org.cn: could not connect to host
qimiao.io: did not receive HSTS header
-qingxuan.info: could not connect to host
+qingpat.com: could not connect to host
+qingxuan.info: did not receive HSTS header
qinxi1992.com: could not connect to host
qionglu.pw: could not connect to host
qipp.com: did not receive HSTS header
qirinus.com: did not receive HSTS header
qiuxian.ddns.net: could not connect to host
qixxit.de: did not receive HSTS header
+qkka.org: did not receive HSTS header
qldconservation.org: could not connect to host
qnatek.org: could not connect to host
qonqa.de: did not receive HSTS header
qoohoot.com: did not receive HSTS header
-qop.io: could not connect to host
+qop.io: did not receive HSTS header
qoqo.us: did not receive HSTS header
qorm.co.uk: could not connect to host
qqj.net: could not connect to host
qqq.gg: could not connect to host
+qqrss.com: could not connect to host
+qqvips.com: could not connect to host
qqvrsmart.cn: could not connect to host
qrara.net: did not receive HSTS header
qredo.com: did not receive HSTS header
@@ -12245,13 +13297,17 @@ qrforex.com: did not receive HSTS header
qrlending.com: could not connect to host
qrlfinancial.com: could not connect to host
qswoo.org: could not connect to host
+qto.com: could not connect to host
+qto.net: could not connect to host
+qto.org: could not connect to host
+quaedam.org: could not connect to host
quail.solutions: could not connect to host
quakerlens.com: did not receive HSTS header
quality1.com.br: did not receive HSTS header
qualityology.com: did not receive HSTS header
quanglepro.com: could not connect to host
quangngaimedia.com: did not receive HSTS header
-quanjinlong.cn: did not receive HSTS header
+quanjinlong.cn: could not connect to host
quantacloud.ch: could not connect to host
quantenteranik.eu: could not connect to host
quantor.dk: did not receive HSTS header
@@ -12261,13 +13317,17 @@ quantum-lviv.pp.ua: could not connect to host
quantumcore.cn: could not connect to host
quantumcourse.org: did not receive HSTS header
quanwuji.com: could not connect to host
+quanyin.eu.org: could not connect to host
quarryhillrentals.com: did not receive HSTS header
+quarus.net: could not connect to host
quebecmailbox.com: could not connect to host
queenbrownie.com.br: could not connect to host
queenshaflo.com: could not connect to host
quelmandataire.fr: did not receive HSTS header
+querkommentar.de: could not connect to host
queroreceitasoberana.com.br: did not receive HSTS header
queryplayground.com: could not connect to host
+questionable.host: could not connect to host
questions-admin.com: did not receive HSTS header
questionyu.com: did not receive HSTS header
questsandrewards.com: could not connect to host
@@ -12275,20 +13335,21 @@ quic.fr: did not receive HSTS header
quickandroid.tools: could not connect to host
quickpayservice.com: could not connect to host
quietus.gq: could not connect to host
+quikrmovies.to: could not connect to host
quikstorhawaii.com: max-age too low: 300
quimsertek.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
quizionic.com: could not connect to host
quizl.io: did not receive HSTS header
quizmemes.org: could not connect to host
-quizstore.net: could not connect to host
quotehex.com: could not connect to host
quotemaster.co.za: could not connect to host
quranserver.net: could not connect to host
qwallet.ca: did not receive HSTS header
qwaser.fr: could not connect to host
+qweepi.de: could not connect to host
qwertyatom100.me: could not connect to host
qwilink.me: did not receive HSTS header
-qybot.cc: could not connect to host
+qybot.cc: did not receive HSTS header
r-ay.club: did not receive HSTS header
r-core.org: could not connect to host
r-core.ru: could not connect to host
@@ -12296,7 +13357,7 @@ r-cut.fr: could not connect to host
r-rickroll-u.pw: could not connect to host
r10n.com: did not receive HSTS header
r15.me: did not receive HSTS header
-r18.moe: did not receive HSTS header
+r18.moe: could not connect to host
raajheshkannaa.com: could not connect to host
rabbitvcactus.eu: did not receive HSTS header
rabota-x.ru: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -12307,7 +13368,8 @@ raconteur.net: did not receive HSTS header
rad-route.de: could not connect to host
raddavarden.nu: could not connect to host
radicaleducation.net: could not connect to host
-radioheteroglossia.com: did not receive HSTS header
+radioafibra.com.br: could not connect to host
+radiocomsaocarlos.com.br: could not connect to host
radtke.bayern: did not receive HSTS header
rafaelcz.de: could not connect to host
ragingserenity.com: did not receive HSTS header
@@ -12316,32 +13378,29 @@ rahadiana.com: could not connect to host
rahamasin.eu: could not connect to host
rai-co.net: did not receive HSTS header
raiblockscommunity.net: could not connect to host
-raidensnakesden.co.uk: could not connect to host
-raidensnakesden.com: could not connect to host
-raidensnakesden.net: could not connect to host
raidstone.com: could not connect to host
raidstone.rocks: could not connect to host
raiffeisen-kosovo.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-railgun.com.cn: could not connect to host
+railjob.cn: could not connect to host
railyardurgentcare.com: did not receive HSTS header
rainbin.com: could not connect to host
rainbowbarracuda.com: could not connect to host
-rainel.at: could not connect to host
+raipet.no-ip.biz: could not connect to host
raisecorp.com: could not connect to host
raitza.de: could not connect to host
-rajkapoordas.com: could not connect to host
rakugaki.cn: could not connect to host
+ralfs-zusizone.de: could not connect to host
ramarka.de: could not connect to host
ramatola.uk: could not connect to host
rambii.de: could not connect to host
ramblingrf.tech: could not connect to host
ramezanloo.com: did not receive HSTS header
+ramitmittal.com: could not connect to host
ramon-c.nl: could not connect to host
ramonj.nl: could not connect to host
randomcage.com: did not receive HSTS header
randomcloud.net: could not connect to host
randomhero.cloud: could not connect to host
-randomquotesapp.com: could not connect to host
randomwinpicker.de: could not connect to host
randy.pw: could not connect to host
ranegroup.hosting: could not connect to host
@@ -12360,6 +13419,7 @@ rapidthunder.io: could not connect to host
rasing.me: could not connect to host
raspass.me: did not receive HSTS header
raspberry.us: could not connect to host
+raspberryultradrops.com: did not receive HSTS header
rastreador.com.es: did not receive HSTS header
rastreie.net: did not receive HSTS header
ratajczak.fr: could not connect to host
@@ -12369,26 +13429,28 @@ rationem.nl: did not receive HSTS header
ratuseks.com: could not connect to host
ratuseks.net: could not connect to host
ratuseks.us: could not connect to host
-rauchenwald.net: could not connect to host
raucris.ro: could not connect to host
raulfraile.net: could not connect to host
+rautermods.net: could not connect to host
ravage.fm: did not receive HSTS header
raven.lipetsk.ru: could not connect to host
ravengergaming.ga: could not connect to host
ravengergaming.net: could not connect to host
ravenx.me: could not connect to host
-ravkr.duckdns.org: did not receive HSTS header
ravse.dk: could not connect to host
-raw-diets.com: did not receive HSTS header
+raw-diets.com: could not connect to host
rawet.se: could not connect to host
rawoil.com: could not connect to host
rawr.sexy: could not connect to host
rawstorieslondon.com: could not connect to host
-rayanitco.com: did not receive HSTS header
raycarruthersphotography.co.uk: could not connect to host
raydan.space: could not connect to host
raydobe.me: could not connect to host
raytron.org: could not connect to host
+raywin168.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+raywin168.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+raywin88.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+razberry.kr: could not connect to host
razlaw.name: did not receive HSTS header
razzolini.com.br: could not connect to host
rb-china.net: could not connect to host
@@ -12396,17 +13458,20 @@ rbhighinc.org: could not connect to host
rbose.org: could not connect to host
rbqcloud.com: could not connect to host
rbti.me: could not connect to host
+rbtvshitstorm.is: did not receive HSTS header
rbxcatalog.com: could not connect to host
rc4.io: could not connect to host
rc7.ch: could not connect to host
rcafox.com: could not connect to host
rcoliveira.com: could not connect to host
+rcorporation.be: did not receive HSTS header
rcpcbd.com: could not connect to host
+rcraigmurphy.net: could not connect to host
rcvd.io: did not receive HSTS header
rcx.io: could not connect to host
rdfz.tech: could not connect to host
-rdh.asia: did not receive HSTS header
rdns.im: did not receive HSTS header
+rdplumbingsolutions.com.au: did not receive HSTS header
rdyrda.fr: could not connect to host
re-customer.net: could not connect to host
re-wilding.com: could not connect to host
@@ -12414,6 +13479,7 @@ reachr.com: could not connect to host
reactdatepicker.com: did not receive HSTS header
reactor92.com: could not connect to host
reader.ga: could not connect to host
+readify.com.au: did not receive HSTS header
readism.io: could not connect to host
readityourself.net: could not connect to host
readmeeatmedrinkme.com: did not receive HSTS header
@@ -12427,14 +13493,14 @@ real-bits.com: could not connect to host
real-compare.com: did not receive HSTS header
realcli.com: could not connect to host
realfamilyincest.com: could not connect to host
+realgarant-shop.de: did not receive HSTS header
realhost.name: could not connect to host
-realloc.me: could not connect to host
-really.io: could not connect to host
+realincest.tv: could not connect to host
+really.io: did not receive HSTS header
reallyreally.io: did not receive HSTS header
realmic.net: could not connect to host
realmofespionage.com: could not connect to host
realnewhomes.com: could not connect to host
-realoteam.ddns.net: could not connect to host
realraghavgupta.com: could not connect to host
realwoo.com: did not receive HSTS header
reapdrive.net: did not receive HSTS header
@@ -12442,14 +13508,16 @@ reaper.rip: could not connect to host
reardenporn.com: could not connect to host
rebekaesgabor.online: could not connect to host
rebootmc.com: could not connect to host
-receitas-de-bolos.pt: did not receive HSTS header
+receitas-de-bolos.pt: could not connect to host
receitasdebacalhau.pt: could not connect to host
+recetasdecocinaideal.com: did not receive HSTS header
recetasfacilesdehacer.com: did not receive HSTS header
rechat.com: did not receive HSTS header
rechenwerk.net: could not connect to host
recht-freundlich.de: did not receive HSTS header
rechtenliteratuurleiden.nl: could not connect to host
recompiled.org: max-age too low: 7776000
+recreation.gov: did not receive HSTS header
recruitsecuritytraining.co.uk: could not connect to host
recruitsecuritytraining.com: could not connect to host
rectoraudiparts.com: could not connect to host
@@ -12467,7 +13535,6 @@ redirectman.com: could not connect to host
redizoo.com: did not receive HSTS header
redlatam.org: did not receive HSTS header
redmbk.com: did not receive HSTS header
-redmind.se: could not connect to host
redneck-gaming.de: did not receive HSTS header
redner.cc: did not receive HSTS header
rednertv.de: did not receive HSTS header
@@ -12477,21 +13544,19 @@ redperegrine.com: did not receive HSTS header
redporno.cz: could not connect to host
redports.org: could not connect to host
redra.ws: did not receive HSTS header
-redsquirrelcampsite.co.uk: could not connect to host
redstarsurf.com: did not receive HSTS header
reducerin.ro: did not receive HSTS header
redy.host: did not receive HSTS header
-reedyforkfarm.com: did not receive HSTS header
+reepay.com: did not receive HSTS header
reeson.at: could not connect to host
reeson.de: could not connect to host
reeson.info: could not connect to host
reeson.org: could not connect to host
+reevoo.com: did not receive HSTS header
ref1oct.nl: could not connect to host
-refactor.zone: did not receive HSTS header
referenten.org: did not receive HSTS header
refitplanner.com: did not receive HSTS header
reflecton.io: could not connect to host
-reforesttheplanet.com: could not connect to host
reformatreality.com: could not connect to host
refreshingserum.com: could not connect to host
reg.ru: did not receive HSTS header
@@ -12507,10 +13572,11 @@ regionalcoalition.org: did not receive HSTS header
regionale.org: did not receive HSTS header
register.gov.uk: did not receive HSTS header
registertovoteflorida.gov: did not receive HSTS header
+registryplus.nl: did not receive HSTS header
regoul.com: did not receive HSTS header
regsec.com: could not connect to host
rehabthailand.nl: could not connect to host
-rei.ki: did not receive HSTS header
+rei.codes: did not receive HSTS header
reic.me: could not connect to host
reidascuecas.com.br: could not connect to host
reignsphere.net: could not connect to host
@@ -12518,8 +13584,9 @@ reikiqueen.uk: could not connect to host
reinaertvandecruys.com: could not connect to host
reinaertvandecruys.me: could not connect to host
reineberthe.ch: could not connect to host
+reinoldus.ddns.net: could not connect to host
reismil.ch: could not connect to host
-reisyukaku.org: did not receive HSTS header
+reisyukaku.org: could not connect to host
reithguard-it.de: did not receive HSTS header
rejo.in: could not connect to host
rejushiiplotter.ru: could not connect to host
@@ -12528,13 +13595,14 @@ relatic.net: could not connect to host
relayawards.com: could not connect to host
reldoc.com.mx: did not receive HSTS header
reliable-mail.de: could not connect to host
+reliant3sixty.com: could not connect to host
religiousforums.com: did not receive HSTS header
relisten.nl: did not receive HSTS header
-relsak.cz: could not connect to host
rem.pe: did not receive HSTS header
rema.site: did not receive HSTS header
remain.london: could not connect to host
remedica.fr: could not connect to host
+remedioskaseros.com: did not receive HSTS header
remedium.de: could not connect to host
remedyrehab.com: did not receive HSTS header
rememberthis.co.za: could not connect to host
@@ -12542,15 +13610,16 @@ remodela.com.ve: could not connect to host
remodelwithlegacy.com: did not receive HSTS header
remonttitekniikka.fi: could not connect to host
remotestance.com: did not receive HSTS header
-remrol.ru: could not connect to host
rencaijia.com: did not receive HSTS header
rencontres-erotiques.com: did not receive HSTS header
+reneclemens.nl: max-age too low: 300
rengarenkblog.com: could not connect to host
renideo.fr: could not connect to host
renkhosting.com: could not connect to host
renlong.org: did not receive HSTS header
rennfire.org: could not connect to host
renrenss.com: could not connect to host
+rent-a-c.io: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
rentacarcluj.xyz: did not receive HSTS header
rentalmed.com.br: did not receive HSTS header
rentbrowser.com: could not connect to host
@@ -12558,8 +13627,6 @@ rentbrowsertrain.me: could not connect to host
rentcarassist.com: could not connect to host
renteater.com: could not connect to host
rentex.com: did not receive HSTS header
-renxinge.cn: did not receive HSTS header
-reparo.pe: did not receive HSTS header
repex.co.il: could not connect to host
replaceits.me: could not connect to host
replacemychina.com: could not connect to host
@@ -12577,12 +13644,14 @@ reporturl.com: did not receive HSTS header
reporturl.io: did not receive HSTS header
reposaarenkuva.fi: could not connect to host
reprolife.co.uk: did not receive HSTS header
-reptilauksjonen.no: did not receive HSTS header
+reptilauksjonen.no: could not connect to host
republicmo.gov: could not connect to host
repustate.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
reqognize.com: could not connect to host
+request-trent.com: could not connect to host
res-rheingau.de: did not receive HSTS header
res42.com: could not connect to host
+resc.la: could not connect to host
research.md: could not connect to host
reseponline.info: did not receive HSTS header
reserve-online.net: did not receive HSTS header
@@ -12591,7 +13660,6 @@ residentsinsurance.co.uk: did not receive HSTS header
resl20.servehttp.com: could not connect to host
resoundpro.ca: could not connect to host
respice.xyz: could not connect to host
-ressl.ch: could not connect to host
ressos.com: did not receive HSTS header
restaurace-klokocka.cz: did not receive HSTS header
restaurant-mangal.ch: could not connect to host
@@ -12601,10 +13669,10 @@ restaurantmangal.ch: could not connect to host
restchart.com: did not receive HSTS header
restioson.me: could not connect to host
restopro.nyc: did not receive HSTS header
-restoreresearchstudy.com: did not receive HSTS header
+restoreresearchstudy.com: could not connect to host
resultsdate.news: could not connect to host
reth.ch: could not connect to host
-reto.io: could not connect to host
+retireyourpassword.org: did not receive HSTS header
retogroup.com: could not connect to host
retropage.co: did not receive HSTS header
retrowave.eu: could not connect to host
@@ -12620,6 +13688,7 @@ reverie.pw: could not connect to host
review.info: did not receive HSTS header
reviewbestseller.com: did not receive HSTS header
reviewjust.com: did not receive HSTS header
+reviewspedia.org: did not receive HSTS header
revistapequenosolhares.com.br: could not connect to host
revolutionhive.com: could not connect to host
revtut.net: could not connect to host
@@ -12629,11 +13698,12 @@ rexhockingkelpies.com.au: did not receive HSTS header
reykjavik.guide: could not connect to host
rezun.cloud: did not receive HSTS header
rf.tn: could not connect to host
-rfxanalyst.com: could not connect to host
+rfeif.org: could not connect to host
rgservers.com: did not receive HSTS header
rhapsodhy.hu: could not connect to host
rhdigital.pro: could not connect to host
rhering.de: could not connect to host
+rhetthenckel.com: max-age too low: 0
rhiskiapril.com: did not receive HSTS header
rhodes.ml: could not connect to host
rhodesianridgeback.com.br: could not connect to host
@@ -12643,6 +13713,7 @@ ribopierre.fr: could not connect to host
riceglue.com: could not connect to host
richamorindonesia.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
richardb.me: could not connect to host
+richardcrosby.co.uk: did not receive HSTS header
richardhicks.us: could not connect to host
richeza.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
richie.link: did not receive HSTS header
@@ -12653,6 +13724,7 @@ richsiciliano.com: did not receive HSTS header
richterphilipp.com: could not connect to host
rickmartensen.nl: did not receive HSTS header
ricknox.com: did not receive HSTS header
+ricky.capital: did not receive HSTS header
rid-wan.com: could not connect to host
rideaudiscount.com: did not receive HSTS header
rideforwade.com: could not connect to host
@@ -12663,6 +13735,7 @@ ridingoklahoma.com: could not connect to host
ridwan.co: could not connect to host
rienasemettre.fr: did not receive HSTS header
riesenmagnete.de: could not connect to host
+riester.pl: did not receive HSTS header
rievo.net: did not receive HSTS header
right-to-love.name: did not receive HSTS header
right2.org: could not connect to host
@@ -12674,10 +13747,10 @@ rika.me: could not connect to host
rincon-nsn.gov: could not connect to host
ring0.xyz: did not receive HSTS header
ringh.am: could not connect to host
-rinj.se: could not connect to host
+rinj.se: did not receive HSTS header
rionewyork.com.br: could not connect to host
-rioshop.com.br: could not connect to host
ripa.io: did not receive HSTS header
+ripple.com: did not receive HSTS header
rippleunion.com: could not connect to host
risi-china.com: could not connect to host
risingsun.red: could not connect to host
@@ -12704,54 +13777,58 @@ rme.li: did not receive HSTS header
rmit.me: could not connect to host
rmk.si: could not connect to host
rmsides.com: did not receive HSTS header
+rmstudio.tw: could not connect to host
roadfeast.com: could not connect to host
roan24.pl: did not receive HSTS header
-rob.uk.com: could not connect to host
+rob.uk.com: did not receive HSTS header
robertabittle.com: could not connect to host
-roberto-webhosting.nl: did not receive HSTS header
+roberto-webhosting.nl: could not connect to host
robertocasares.no-ip.biz: could not connect to host
robi-net.it: could not connect to host
-robigalia.org: did not receive HSTS header
robinvdmarkt.nl: could not connect to host
-robjager-fotografie.nl: could not connect to host
robomonkey.org: could not connect to host
robteix.com: did not receive HSTS header
robtex.com: did not receive HSTS header
robtex.net: did not receive HSTS header
robtex.org: did not receive HSTS header
robust.ga: could not connect to host
+roc.net.au: could not connect to host
rochman.id: did not receive HSTS header
+rocket-wars.de: did not receive HSTS header
rocketnet.ml: could not connect to host
rockeyscrivo.com: did not receive HSTS header
rocksberg.net: could not connect to host
rockz.io: did not receive HSTS header
rodarion.pl: could not connect to host
rodehutskors.net: could not connect to host
+rodinneodpoledne2018.cz: did not receive HSTS header
rodney.id.au: did not receive HSTS header
rodneybrooksjr.com: did not receive HSTS header
-rodomonte.org: did not receive HSTS header
rodosto.com: did not receive HSTS header
+rody-design.com: could not connect to host
roelbazuin.com: did not receive HSTS header
roelf.org: did not receive HSTS header
roeper.party: could not connect to host
roesemann.email: could not connect to host
+roffe.nu: did not receive HSTS header
rofrank.space: could not connect to host
rogeiro.net: could not connect to host
+roger101.com: could not connect to host
rogerdat.ovh: could not connect to host
roguefortgame.com: could not connect to host
rohanbassett.com: could not connect to host
rohankrishnadev.in: could not connect to host
+rohlik.cz: did not receive HSTS header
+roiscroll.com: did not receive HSTS header
roketix.co.uk: did not receive HSTS header
rolandkolodziej.com: max-age too low: 86400
-rolandreed.cn: did not receive HSTS header
rolandslate.com: did not receive HSTS header
rolemaster.net: could not connect to host
rolroer.co.za: could not connect to host
romaimperator.com: did not receive HSTS header
-romainmuller.xyz: did not receive HSTS header
romans-place.me.uk: could not connect to host
+romantic-quotes.co.uk: could not connect to host
romanticschemermovie.com: could not connect to host
-romar-bos.nl: did not receive HSTS header
romeoferraris.com: did not receive HSTS header
romleg.cf: could not connect to host
roms.fun: could not connect to host
@@ -12762,11 +13839,12 @@ rondreis-planner.nl: could not connect to host
ronghexx.com: could not connect to host
ronvandordt.info: did not receive HSTS header
ronwo.de: max-age too low: 1
-roo.ie: could not connect to host
+roo.ie: did not receive HSTS header
rool.me: did not receive HSTS header
roolevoi.ru: could not connect to host
room-checkin24.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
roosteroriginals.com: could not connect to host
+roosterpgplus.nl: did not receive HSTS header
rootbsd.at: could not connect to host
rootforum.org: did not receive HSTS header
rootrelativity.com: could not connect to host
@@ -12774,7 +13852,10 @@ rootservice.org: did not receive HSTS header
rootwpn.com: could not connect to host
rop.io: did not receive HSTS header
roquecenter.org: did not receive HSTS header
+roromendut.online: could not connect to host
rorymcdaniel.com: did not receive HSTS header
+rosetiger.life: could not connect to host
+rosevillefacialplasticsurgery.com: did not receive HSTS header
rosewoodranch.com: did not receive HSTS header
rosi-royal.com: could not connect to host
rospa100.com: did not receive HSTS header
@@ -12798,23 +13879,26 @@ royal876.com: could not connect to host
royalhop.co: could not connect to host
royalpub.net: did not receive HSTS header
royalsignaturecruise.com: could not connect to host
+royaltube.net: could not connect to host
roychan.org: max-age too low: 0
royzez.com: could not connect to host
rozalisbengal.ro: could not connect to host
-rozalynne-dawn.ga: could not connect to host
rozeapp.nl: could not connect to host
rpasafrica.com: could not connect to host
rr.in.th: could not connect to host
+rring.me: could not connect to host
+rritv.com: could not connect to host
rrke.cc: did not receive HSTS header
rrom.me: did not receive HSTS header
rs-devdemo.host: could not connect to host
rsajeey.info: could not connect to host
rsampaio.info: did not receive HSTS header
rsf.io: could not connect to host
-rsi.im: did not receive HSTS header
+rsi.im: could not connect to host
rskuipers.com: did not receive HSTS header
rsldb.com: could not connect to host
rsmaps.org: could not connect to host
+rsmmail.com: did not receive HSTS header
rsships.com: could not connect to host
rstraining.co.uk: did not receive HSTS header
rstsecuritygroup.co.uk: could not connect to host
@@ -12822,18 +13906,21 @@ rtc.fun: could not connect to host
rtho.me: did not receive HSTS header
rtvi.com: did not receive HSTS header
ru-music.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+ruarua.ml: could not connect to host
rubbereggs.ca: could not connect to host
rubbix.net: could not connect to host
rubecodeberg.com: could not connect to host
rubendv.be: did not receive HSTS header
rubens.cloud: did not receive HSTS header
-rubenschulz.nl: could not connect to host
+rubenschulz.nl: did not receive HSTS header
rubi-ka.net: max-age too low: 0
ruborr.se: did not receive HSTS header
rubysecurity.org: did not receive HSTS header
rubyshop.nl: could not connect to host
+rudelune.fr: could not connect to host
rudeotter.com: did not receive HSTS header
-rue-de-la-vieille.fr: max-age too low: 0
+ruderverein-gelsenkirchen.de: did not receive HSTS header
+rue-de-la-vieille.fr: did not receive HSTS header
ruflay.ru: could not connect to host
rugirlfriend.com: could not connect to host
rugs.ca: did not receive HSTS header
@@ -12844,16 +13931,17 @@ ruitershoponline.nl: did not receive HSTS header
ruja.dk: did not receive HSTS header
rukhaiyar.com: could not connect to host
rullzer.com: did not receive HSTS header
-rumlager.de: max-age too low: 600000
rummel-platz.de: could not connect to host
rumoterra.com.br: could not connect to host
run-forrest.run: could not connect to host
runawebinar.nl: could not connect to host
runcarina.com: could not connect to host
rundumcolumn.xyz: could not connect to host
+runefake.com: could not connect to host
runhardt.eu: did not receive HSTS header
runtl.com: did not receive HSTS header
runtondev.com: did not receive HSTS header
+ruobiyi.com: could not connect to host
ruqu.nl: could not connect to host
rusadmin.biz: did not receive HSTS header
rushball.net: could not connect to host
@@ -12861,16 +13949,20 @@ rusl.me: could not connect to host
russmarshall.com: could not connect to host
rustbyexample.com: did not receive HSTS header
rustfanatic.com: did not receive HSTS header
+rustralasia.net: max-age too low: 0
+rusxakep.com: could not connect to host
ruurdboomsma.nl: could not connect to host
ruxit.com: did not receive HSTS header
+rva.gov: could not connect to host
rvg.zone: could not connect to host
rvolve.net: could not connect to host
rw-solutions.tech: could not connect to host
rwanderlust.com: did not receive HSTS header
-rxgroup.io: could not connect to host
+rxbn.de: could not connect to host
rxprep.com: did not receive HSTS header
rxt.social: could not connect to host
rxv.cc: could not connect to host
+ryanroberts.co.uk: could not connect to host
ryanteck.uk: did not receive HSTS header
rybox.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
ryejuice.sytes.net: could not connect to host
@@ -12888,14 +13980,17 @@ s1mplescripts.de: could not connect to host
s1ris.org: did not receive HSTS header
s3cases.com: did not receive HSTS header
s3n.se: could not connect to host
+s4media.org: did not receive HSTS header
saabwa.org: could not connect to host
-saba-piserver.info: could not connect to host
sabatek.pl: did not receive HSTS header
sac-shop.com: did not receive HSTS header
sachk.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
saco-ceso.com: could not connect to host
sadiejanehair.com: could not connect to host
+sadsu.com: did not receive HSTS header
+saenforcement.agency: could not connect to host
safari-afrique.com: did not receive HSTS header
+safe.space: could not connect to host
safedevice.net: did not receive HSTS header
safelist.eu: did not receive HSTS header
safemovescheme.co.uk: could not connect to host
@@ -12903,18 +13998,18 @@ safemt.gov: could not connect to host
safepay.io: could not connect to host
saferedirect.link: could not connect to host
saferedirectlink.com: could not connect to host
-safersurfing.eu: did not receive HSTS header
safesecret.info: did not receive HSTS header
-safetyrisk.net: did not receive HSTS header
safewings-nh.nl: could not connect to host
safing.me: could not connect to host
safnah.com: did not receive HSTS header
sagarhandicraft.com: could not connect to host
+sagemontchurch.org: did not receive HSTS header
sageth.com: could not connect to host
sah3.net: could not connect to host
sail-nyc.com: did not receive HSTS header
saint-astier-triathlon.com: did not receive HSTS header
saintjohnlutheran.church: did not receive HSTS header
+saintmichelqud.com: did not receive HSTS header
saintw.com: could not connect to host
sairai.bid: could not connect to host
saiyasu-search.com: did not receive HSTS header
@@ -12923,29 +14018,33 @@ sakib.ninja: did not receive HSTS header
sakurabuff.com: could not connect to host
salaervergleich.com: did not receive HSTS header
sale.sh: could not connect to host
+saleaks.org: could not connect to host
salearnership.co.za: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-saleslift.pl: did not receive HSTS header
+saleslift.pl: could not connect to host
salishseawhalewatching.ca: could not connect to host
+salixcode.com: could not connect to host
sallysubs.com: could not connect to host
salmo23.com.br: could not connect to host
salon-claudia.ch: could not connect to host
salonestella.it: could not connect to host
-salrosadohimalaia.com: did not receive HSTS header
salserocafe.com: did not receive HSTS header
salserototal.com: could not connect to host
saltedskies.com: could not connect to host
saltra.online: could not connect to host
+saltro.nl: did not receive HSTS header
salvaalocombia.com: could not connect to host
salverainha.org: could not connect to host
salzamt.tk: could not connect to host
samanthahumphreysstudio.com: did not receive HSTS header
samaritan.tech: could not connect to host
samaritansnet.org: did not receive HSTS header
-samenwerkingsportaal.tk: did not receive HSTS header
+samel.de: could not connect to host
sametovymesic.cz: could not connect to host
+samin.tk: could not connect to host
saml2.com: could not connect to host
samlamac.com: could not connect to host
samm.com.au: did not receive HSTS header
+sammenlignakasser.dk: did not receive HSTS header
sammyjohnson.com: could not connect to host
samp.im: could not connect to host
sampcup.com: could not connect to host
@@ -12955,20 +14054,22 @@ samsen.club: could not connect to host
samsonova.de: could not connect to host
samsungxoa.com: could not connect to host
samvanderkris.com: could not connect to host
-samvanderkris.xyz: did not receive HSTS header
sanael.net: could not connect to host
+sanalbayrak.com: could not connect to host
sanandreasstories.com: did not receive HSTS header
sanasalud.org: could not connect to host
sanatfilan.com: did not receive HSTS header
+sanatorionosti.com.ar: could not connect to host
sanatrans.com: could not connect to host
-sand-islets.de: did not receive HSTS header
+sanchez.adv.br: could not connect to host
sanderknape.com: did not receive HSTS header
sandviks.com: did not receive HSTS header
sanguoxiu.com: could not connect to host
sanhei.ch: did not receive HSTS header
-sanik.my: could not connect to host
+sanik.my: did not receive HSTS header
+sanmuding.com: could not connect to host
sanradon.by: did not receive HSTS header
-sansage.com.br: could not connect to host
+sansage.com.br: did not receive HSTS header
sansdev.com: could not connect to host
sansemea.com: did not receive HSTS header
santi.eu: did not receive HSTS header
@@ -12979,11 +14080,13 @@ santmark.fi: could not connect to host
santmark.info: could not connect to host
santmark.net: could not connect to host
santmark.org: could not connect to host
-santodomingocg.org: could not connect to host
+santodomingocg.org: did not receive HSTS header
santorinibbs.com: did not receive HSTS header
santouri.be: could not connect to host
+saol.eu: could not connect to host
saotn.org: did not receive HSTS header
sapereaude.com.pl: did not receive HSTS header
+sapphireblue.me: could not connect to host
sapporobeer.com: could not connect to host
sapuncheta.com: could not connect to host
saq.com: could not connect to host
@@ -13001,10 +14104,9 @@ sarkarikhoj.com: could not connect to host
sarkarischeme.in: could not connect to host
sarkisozleri.us: could not connect to host
sarndipity.com: could not connect to host
-saro.me: did not receive HSTS header
saruwebshop.co.za: could not connect to host
+sasanika.org: did not receive HSTS header
sat.rent: did not receive HSTS header
-sat7a-riyadh.com: did not receive HSTS header
satanichia.moe: could not connect to host
sativatunja.com: could not connect to host
satmep.com: did not receive HSTS header
@@ -13012,21 +14114,21 @@ satoshicrypt.com: did not receive HSTS header
satragreen.com: did not receive HSTS header
satrent.com: did not receive HSTS header
satrent.se: did not receive HSTS header
-satriyowibowo.my.id: did not receive HSTS header
+satriyowibowo.my.id: could not connect to host
satsang-uwe.de: did not receive HSTS header
satsukii.moe: did not receive HSTS header
sattamatkadpboss.mobi: could not connect to host
saturne.tk: could not connect to host
-saturngames.co.uk: could not connect to host
saucyfox.net: did not receive HSTS header
saudeeconforto.com.br: did not receive HSTS header
sauenytt.no: could not connect to host
-sauerbrey.eu: did not receive HSTS header
saumon.io: did not receive HSTS header
saumon.xyz: could not connect to host
saunasandstuff.ca: did not receive HSTS header
saunasandstuff.com: did not receive HSTS header
+saurel.me: could not connect to host
savacloud.com: did not receive HSTS header
+savannahtasteexperience.com: did not receive HSTS header
save.gov: could not connect to host
saveaward.gov: could not connect to host
savecashindia.com: did not receive HSTS header
@@ -13048,8 +14150,12 @@ sbobetfun.com: did not receive HSTS header
sbox-archives.com: could not connect to host
sby.de: did not receive HSTS header
sc4le.com: could not connect to host
+scaffoldhirefourways.co.za: did not receive HSTS header
+scaffoldhirerandburg.co.za: did not receive HSTS header
+scaffoldhiresandton.co.za: did not receive HSTS header
scala.click: did not receive HSTS header
scannabi.com: could not connect to host
+sceptique.eu: did not receive HSTS header
sch44r0n.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
schaafenstrasse.koeln: could not connect to host
schachburg.de: did not receive HSTS header
@@ -13059,24 +14165,24 @@ schau-rein.co.at: did not receive HSTS header
schauer.so: could not connect to host
schd.io: did not receive HSTS header
schermreparatierotterdam.nl: did not receive HSTS header
+schippers-it.nl: did not receive HSTS header
schlabbi.com: did not receive HSTS header
-schlagenhauf.info: could not connect to host
-schlagma.de: could not connect to host
+schmelzle.io: could not connect to host
schmidttulskie.de: could not connect to host
schmitt.ovh: could not connect to host
schmitt.ws: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-schnapke.name: could not connect to host
-schneider-electric.tg: could not connect to host
+schmitz.link: could not connect to host
+schneider-electric.tg: did not receive HSTS header
schnell-abnehmen.tips: did not receive HSTS header
-schnell-gold.com: did not receive HSTS header
+schnell-gold.com: could not connect to host
scholl.io: could not connect to host
+school.in.th: could not connect to host
schooli.io: could not connect to host
schooltrends.co.uk: did not receive HSTS header
schoolze.com: did not receive HSTS header
schoop.me: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
schorel.ovh: could not connect to host
schraebanowicz.net: did not receive HSTS header
-schreck-thomas.de: could not connect to host
schreiber-netzwerk.eu: did not receive HSTS header
schreibnacht.de: did not receive HSTS header
schreinerei-wortmann.de: did not receive HSTS header
@@ -13089,8 +14195,10 @@ schulterglatzen-altenwalde.de: could not connect to host
schur-it.de: could not connect to host
schwarzkopfforyou.de: did not receive HSTS header
schwarzwaldcon.de: could not connect to host
+schwedenhaus.ag: did not receive HSTS header
schweiz.guide: could not connect to host
-schweizerbolzonello.net: could not connect to host
+schweizerbolzonello.net: did not receive HSTS header
+schwerkraftlabor.de: did not receive HSTS header
schwetz.net: could not connect to host
sci-internet.tk: could not connect to host
scib.tk: could not connect to host
@@ -13098,7 +14206,6 @@ scicasts.com: max-age too low: 7776000
science-anatomie.com: did not receive HSTS header
scienceathome.org: did not receive HSTS header
sciencemonster.co.uk: could not connect to host
-scintillating.stream: could not connect to host
scionasset.com: did not receive HSTS header
scivillage.com: did not receive HSTS header
sclgroup.cc: did not receive HSTS header
@@ -13107,6 +14214,7 @@ scooshonline.co.uk: did not receive HSTS header
scopea.fr: max-age too low: 0
score-savers.com: max-age too low: 10540800
scores4schools.com: could not connect to host
+scorobudem.ru: could not connect to host
scotbirchfield.com: did not receive HSTS header
scottainslie.me.uk: could not connect to host
scottdial.com: did not receive HSTS header
@@ -13139,12 +14247,12 @@ sdhmanagementgroup.com: could not connect to host
sdia.ru: could not connect to host
sdl-corporatesite-staging.azurewebsites.net: did not receive HSTS header
sdmoscow.ru: could not connect to host
+sdocast.com: could not connect to host
sdrobs.com: did not receive HSTS header
sdsl-speedtest.de: could not connect to host
se7ensins.com: did not receive HSTS header
sea-godzilla.com: could not connect to host
-seadus.ee: could not connect to host
-seanationals.org: did not receive HSTS header
+seanationals.org: max-age too low: 1
seanchaidh.org: could not connect to host
seans.cc: did not receive HSTS header
seanstrout.com: did not receive HSTS header
@@ -13153,17 +14261,19 @@ searchgov.gov.il: did not receive HSTS header
searchshops.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
searx.pw: could not connect to host
sebastian-bair.de: could not connect to host
+sebastian-lutsch.de: could not connect to host
sebastian-schmidt.me: did not receive HSTS header
sebastianhampl.de: could not connect to host
sebastianpedersen.com: did not receive HSTS header
sebastiensenechal.com: did not receive HSTS header
-sebi.cf: could not connect to host
sebster.com: did not receive HSTS header
sec44.com: could not connect to host
sec44.net: could not connect to host
sec44.org: could not connect to host
+sec4share.me: did not receive HSTS header
secandtech.com: could not connect to host
secanje.nl: did not receive HSTS header
+secboom.com: could not connect to host
seccomp.ru: did not receive HSTS header
seceye.cn: could not connect to host
secitem.at: did not receive HSTS header
@@ -13184,7 +14294,7 @@ secretnation.net: did not receive HSTS header
secretofanah.com: could not connect to host
secretpanties.com: could not connect to host
sectest.ml: could not connect to host
-sectia22.ro: did not receive HSTS header
+sectia22.ro: could not connect to host
secur3.us: did not receive HSTS header
secure-automotive-cloud.com: could not connect to host
secure-automotive-cloud.org: could not connect to host
@@ -13202,13 +14312,14 @@ securita.eu: did not receive HSTS header
security-carpet.com: could not connect to host
security-thoughts.org: could not connect to host
security.google.com: did not receive HSTS header (error ignored - included regardless)
+security.love: could not connect to host
security.xn--q9jyb4c: could not connect to host
securityarena.com: could not connect to host
securitybrief.asia: did not receive HSTS header
securitybrief.co.nz: did not receive HSTS header
securitybrief.com.au: did not receive HSTS header
securitybrief.eu: did not receive HSTS header
-securitybsides.pl: did not receive HSTS header
+securitybsides.pl: could not connect to host
securityglance.com: could not connect to host
securityinet.biz: did not receive HSTS header
securityinet.net: did not receive HSTS header
@@ -13221,6 +14332,7 @@ securitywatch.co.nz: did not receive HSTS header
securiviera.ch: did not receive HSTS header
securon.io: could not connect to host
securoswiss.ch: could not connect to host
+secwise.nl: did not receive HSTS header
sedoexpert.nl: could not connect to host
sedoexperts.nl: could not connect to host
sedrubal.de: could not connect to host
@@ -13233,9 +14345,11 @@ seehimnaked.com: could not connect to host
seehimnude.com: could not connect to host
seehisnudes.com: could not connect to host
seele.ca: could not connect to host
+seemeasaperson.com: did not receive HSTS header
seen.life: could not connect to host
sehenderson.com: did not receive HSTS header
seida.at: could not connect to host
+seiko-dojo.com: could not connect to host
seiler-bad.de: did not receive HSTS header
seizoushokoyuubangou.com: did not receive HSTS header
sektor.team: could not connect to host
@@ -13258,13 +14372,11 @@ semantheme.fr: did not receive HSTS header
semen3325.xyz: could not connect to host
semenkovich.com: did not receive HSTS header
sementes.gratis: could not connect to host
-semjonov.de: could not connect to host
semps-servers.de: could not connect to host
sendash.com: did not receive HSTS header
sendmeback.de: did not receive HSTS header
senedirect.com: could not connect to host
-senmonsyoku.top: did not receive HSTS header
-sens2lavie.com: did not receive HSTS header
+senemusique.com: did not receive HSTS header
senseofnumber.co.uk: did not receive HSTS header
senseye.io: did not receive HSTS header
sensiblemn.org: could not connect to host
@@ -13275,13 +14387,13 @@ seo-lagniappe.com: did not receive HSTS header
seoarchive.org: could not connect to host
seobot.com.au: could not connect to host
seohochschule.de: could not connect to host
+seokay.com: did not receive HSTS header
seolaba.io: could not connect to host
seomarketingdeals.com: did not receive HSTS header
seomen.biz: could not connect to host
seomobo.com: could not connect to host
seosanantonioinc.com: did not receive HSTS header
seoscribe.net: could not connect to host
-seosec.xyz: did not receive HSTS header
seotronix.net: did not receive HSTS header
seowarp.net: did not receive HSTS header
sep23.ru: could not connect to host
@@ -13289,13 +14401,13 @@ sepakbola.win: could not connect to host
sephr.com: did not receive HSTS header
sepie.gob.es: did not receive HSTS header
seq.tf: did not receive HSTS header
-sequatchiecounty-tn.gov: could not connect to host
+sequatchiecountytn.gov: could not connect to host
serafin.tech: could not connect to host
serathius.ovh: could not connect to host
serbien.guide: could not connect to host
serenitycreams.com: did not receive HSTS header
serfdom.io: did not receive HSTS header
-sergeyreznikov.com: could not connect to host
+sergivb01.me: did not receive HSTS header
serized.pw: could not connect to host
serkaneles.com: did not receive HSTS header
servecrypt.com: could not connect to host
@@ -13308,10 +14420,9 @@ servercode.ca: did not receive HSTS header
serverdensity.io: did not receive HSTS header
servergno.me: did not receive HSTS header
serverlauget.no: could not connect to host
-serverlog.net: could not connect to host
servermonkey.nl: could not connect to host
servfefe.com: could not connect to host
-service.gov.uk: could not connect to host
+service-wueste-vodafone.tk: could not connect to host
servicevie.com: did not receive HSTS header
servpanel.de: did not receive HSTS header
servu.de: did not receive HSTS header
@@ -13324,8 +14435,10 @@ sesha.co.za: could not connect to host
sethoedjo.com: could not connect to host
setkit.net: could not connect to host
setuid.de: could not connect to host
-setuid.io: did not receive HSTS header
+sevenet.pl: did not receive HSTS header
sevenhearts.online: could not connect to host
+sevsey.ru: could not connect to host
+sevsopr.ru: could not connect to host
sex-education.com: could not connect to host
sexgarage.de: could not connect to host
sexocomgravidas.com: could not connect to host
@@ -13333,6 +14446,7 @@ sexoyrelax.com: could not connect to host
sexpay.net: could not connect to host
sexplicit.co.uk: did not receive HSTS header
sexshopfacil.com.br: could not connect to host
+sexshopnet.com.br: could not connect to host
sexshopsgay.com: did not receive HSTS header
sexwork.net: could not connect to host
sexymassageoil.com: could not connect to host
@@ -13345,9 +14459,11 @@ sfhobbies.com.br: could not connect to host
sfsltd.com: did not receive HSTS header
sgovaard.nl: did not receive HSTS header
sgthotshot.com: could not connect to host
+sgtsnookums.net: could not connect to host
sh11.pp.ua: did not receive HSTS header
sh4y.com: could not connect to host
sha2017.org: did not receive HSTS header
+shaaaaaaaaaaaaa.com: did not receive HSTS header
shaamrelief.org: did not receive HSTS header
shadiku.com: could not connect to host
shadow-socks.net: could not connect to host
@@ -13356,6 +14472,7 @@ shadow-socks.pro: could not connect to host
shadowguardian507-irl.tk: did not receive HSTS header
shadowguardian507.tk: did not receive HSTS header
shadowmorph.info: did not receive HSTS header
+shadowping.com: could not connect to host
shadowplus.net: could not connect to host
shadowrocket.net: could not connect to host
shadowroket.com: did not receive HSTS header
@@ -13375,18 +14492,16 @@ shagi29.ru: did not receive HSTS header
shahbeat.com: could not connect to host
shaitan.eu: could not connect to host
shakebox.de: could not connect to host
-shaken-kyoto.jp: could not connect to host
-shamka.ru: could not connect to host
shandonsg.co.uk: could not connect to host
shanekoster.net: did not receive HSTS header
-shanesage.com: could not connect to host
+shanesage.com: did not receive HSTS header
shang-yu.cn: could not connect to host
shanxiapark.com: could not connect to host
shanyhs.com: did not receive HSTS header
shapesedinburgh.co.uk: did not receive HSTS header
shardsoft.com: could not connect to host
+shareeri.com: could not connect to host
shareimg.xyz: could not connect to host
-sharejoy.cn: did not receive HSTS header
sharemessage.net: could not connect to host
shareoine.com: did not receive HSTS header
sharepass.pw: could not connect to host
@@ -13394,10 +14509,16 @@ sharepic.xyz: could not connect to host
sharesplitter.com: could not connect to host
shareworx.net: could not connect to host
sharezen.de: could not connect to host
+shariahlawcenter.com: could not connect to host
+shariahlawcenter.org: could not connect to host
+sharialawcenter.com: could not connect to host
+sharialawcenter.org: could not connect to host
sharingcode.com: did not receive HSTS header
sharpe-practice.co.uk: could not connect to host
-sharvey.ca: could not connect to host
-shatorin.com: could not connect to host
+sharperedge.pw: could not connect to host
+sharperedgecomputers.com: could not connect to host
+shasso.com: did not receive HSTS header
+shatorin.com: did not receive HSTS header
shauncrowley.co.uk: could not connect to host
shaunwheelhou.se: could not connect to host
shavingks.com: could not connect to host
@@ -13405,20 +14526,19 @@ shawnbsmith.me: did not receive HSTS header
shawnh.net: could not connect to host
shawnstarrcustomhomes.com: did not receive HSTS header
shawnwilson.info: could not connect to host
-shellj.me: did not receive HSTS header
+shazbots.org: could not connect to host
shellsec.pw: did not receive HSTS header
shemissed.me: did not receive HSTS header
+shena.co.uk: could not connect to host
shentengtu.idv.tw: could not connect to host
shep.co.il: did not receive HSTS header
sheratan.web.id: could not connect to host
-shereallyheals.com: could not connect to host
+shereallyheals.com: did not receive HSTS header
shermantank.biz: did not receive HSTS header
shervik.ga: could not connect to host
shethbox.com: could not connect to host
shevronpatriot.ru: did not receive HSTS header
sheying.tm: could not connect to host
-shg-pornographieabhaengigkeit.de: did not receive HSTS header
-shh.sh: could not connect to host
shiatsu-institut.ch: could not connect to host
shibainu.com.br: could not connect to host
shibe.club: could not connect to host
@@ -13428,11 +14548,9 @@ shiftnrg.org: did not receive HSTS header
shiftplanning.com: did not receive HSTS header
shiinko.com: could not connect to host
shikinobi.com: did not receive HSTS header
-shimo.im: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
shindorei.fr: could not connect to host
shinebijoux.com.br: could not connect to host
shinju.moe: could not connect to host
-shinko-osaka.jp: could not connect to host
shinobi-fansub.ro: could not connect to host
shiona.xyz: could not connect to host
shipinsight.com: did not receive HSTS header
@@ -13441,7 +14559,6 @@ shipping24h.com: could not connect to host
shippingbo.com: did not receive HSTS header
shiroki-k.net: could not connect to host
shirosaki.org: could not connect to host
-shiseki.top: did not receive HSTS header
shishkin.link: did not receive HSTS header
shitfest.info: did not receive HSTS header
shitposting.life: could not connect to host
@@ -13450,8 +14567,10 @@ shlemenkov.by: could not connect to host
shm-forum.org.uk: could not connect to host
sho-tanaka.jp: did not receive HSTS header
shocksrv.com: did not receive HSTS header
+shoemuse.com: did not receive HSTS header
shooshosha.com: could not connect to host
shootpooloklahoma.com: could not connect to host
+shop.fr: max-age too low: 0
shopdopastor.com.br: could not connect to host
shopherbal.co.za: could not connect to host
shopods.com: did not receive HSTS header
@@ -13464,24 +14583,24 @@ shops.neonisi.com: could not connect to host
shortpath.com: could not connect to host
shortr.li: could not connect to host
shota.party: could not connect to host
+shota.vip: could not connect to host
shotpixonline.com.br: did not receive HSTS header
show-stream.tv: could not connect to host
showdepiscinas.com.br: did not receive HSTS header
shower.im: did not receive HSTS header
showkeeper.tv: did not receive HSTS header
showroom.de: did not receive HSTS header
-showroom113.ru: could not connect to host
-shoxmusic.net: did not receive HSTS header
shred.ch: could not connect to host
shredoptics.ch: could not connect to host
-shrug.ml: could not connect to host
shtorku.com: could not connect to host
shu-kin.net: did not receive HSTS header
shukatsu-note.com: could not connect to host
shurita.org: could not connect to host
+shuvo.rocks: did not receive HSTS header
shuzicai.cn: could not connect to host
shv25.se: could not connect to host
shwongacc.com: could not connect to host
+shybynature.com: did not receive HSTS header
shymeck.pw: could not connect to host
shypp.it: could not connect to host
shyrydan.es: could not connect to host
@@ -13490,27 +14609,29 @@ siammedia.co: could not connect to host
siamojo.com: could not connect to host
sianimacion.com: could not connect to host
siao-mei.com: did not receive HSTS header
+siberkulupler.com: could not connect to host
sichere-kartenakzeptanz.de: could not connect to host
siciliadigitale.pro: could not connect to host
sicklepod.com: could not connect to host
sictame-tigf.org: did not receive HSTS header
-sidpod.ru: could not connect to host
+sideropolisnoticias.com.br: did not receive HSTS header
siebens.net: could not connect to host
sieh.es: did not receive HSTS header
+sieulog.com: could not connect to host
sifls.com: could not connect to host
sifreuret.com: could not connect to host
signere.com: could not connect to host
signere.no: did not receive HSTS header
signoracle.com: could not connect to host
signosquecombinam.com.br: could not connect to host
+signsdance.uk: could not connect to host
sigsegv.run: did not receive HSTS header
sihaizixun.net: could not connect to host
siikarantacamping.fi: did not receive HSTS header
sijimi.cn: could not connect to host
-sijmenschoon.nl: could not connect to host
+sijmenschoon.nl: did not receive HSTS header
sikatehtaat.fi: could not connect to host
siku.pro: could not connect to host
-silent.live: could not connect to host
silentcircle.com: did not receive HSTS header
silentcircle.org: could not connect to host
silentexplosion.de: could not connect to host
@@ -13519,9 +14640,12 @@ silentmode.com: max-age too low: 0
silicagelpackets.ca: did not receive HSTS header
silke-hunde.de: did not receive HSTS header
silqueskineyeserum.com: could not connect to host
+silvacor-ziegel.de: max-age too low: 604800
silver-drachenkrieger.de: did not receive HSTS header
+silverartcollector.com: did not receive HSTS header
silverback.is: did not receive HSTS header
silvergoldbull.ba: could not connect to host
+silvergoldbull.lt: could not connect to host
silvergoldbull.md: could not connect to host
silvergoldbull.ph: could not connect to host
silverhome.ninja: could not connect to host
@@ -13529,11 +14653,13 @@ silverpvp.com: could not connect to host
silverstartup.sk: could not connect to host
silviamacallister.com: did not receive HSTS header
silvistefi.com: could not connect to host
+sim-sim.appspot.com: did not receive HSTS header
+sim4seed.org: could not connect to host
simbast.com: could not connect to host
simbihaiti.com: max-age too low: 7889238
simbol.id: could not connect to host
simbolo.co.uk: could not connect to host
-simccorp.com: did not receive HSTS header
+simccorp.com: could not connect to host
simeon.us: max-age too low: 2592000
simfri.com: could not connect to host
simha.online: could not connect to host
@@ -13545,6 +14671,7 @@ simon-pokorny.com: did not receive HSTS header
simon.butcher.name: max-age too low: 2629743
simon.lc: did not receive HSTS header
simongong.net: did not receive HSTS header
+simonkjellberg.se: did not receive HSTS header
simonsaxon.com: did not receive HSTS header
simonschmitt.ch: could not connect to host
simonsmh.cc: did not receive HSTS header
@@ -13552,7 +14679,6 @@ simpan.id: could not connect to host
simpeo.fr: did not receive HSTS header
simpeo.org: did not receive HSTS header
simpleai.net: max-age too low: 600
-simplecoding.click: did not receive HSTS header
simplefraud.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
simplelearner.com: could not connect to host
simplepractice.com: did not receive HSTS header
@@ -13579,11 +14705,10 @@ sinneserweiterung.de: could not connect to host
sinon.org: did not receive HSTS header
sinoscandinavia.se: could not connect to host
sinosky.org: did not receive HSTS header
-sinsojb.me: did not receive HSTS header
+sinsojb.me: could not connect to host
sintesysglobal.com: did not receive HSTS header
sinusbot.online: did not receive HSTS header
sion.moe: did not receive HSTS header
-sipc.org: did not receive HSTS header
sipsik.net: did not receive HSTS header
siqi.wang: could not connect to host
sirburton.com: did not receive HSTS header
@@ -13611,20 +14736,22 @@ sittinginoblivion.com: did not receive HSTS header
sizingservers.be: did not receive HSTS header
sizzle.co.uk: did not receive HSTS header
sja-se-training.com: could not connect to host
-sjdaws.com: could not connect to host
sjdtaxi.com: did not receive HSTS header
sjhyl11.com: could not connect to host
sjsc.fr: did not receive HSTS header
+sjsmith.id.au: did not receive HSTS header
+sjzebs.com: did not receive HSTS header
+sjzget.com: did not receive HSTS header
+sjzybs.com: did not receive HSTS header
skandiabanken.no: did not receive HSTS header
skaraborgsassistans.com: did not receive HSTS header
skarox.com: could not connect to host
-skarox.ee: could not connect to host
skarox.eu: could not connect to host
skarox.net: could not connect to host
skarox.ru: could not connect to host
skates.guru: did not receive HSTS header
skday.com: did not receive HSTS header
-sketchywebsite.net: max-age too low: 0
+sketchywebsite.net: did not receive HSTS header
ski-insurance.com.au: did not receive HSTS header
skidstresser.com: could not connect to host
skiinstructor.services: did not receive HSTS header
@@ -13632,6 +14759,8 @@ skilldetector.com: could not connect to host
skillproxy.com: could not connect to host
skillproxy.net: could not connect to host
skillproxy.org: could not connect to host
+skills2services.com: did not receive HSTS header
+skimming.net: did not receive HSTS header
skinbet.co: could not connect to host
skinmarket.co: could not connect to host
skischuleulm.de: did not receive HSTS header
@@ -13646,10 +14775,10 @@ skotty.io: did not receive HSTS header
skpdev.net: could not connect to host
skrimix.tk: could not connect to host
skrivande.co: could not connect to host
-skullhouse.nyc: did not receive HSTS header
+skullhouse.nyc: could not connect to host
sky-aroma.com: could not connect to host
skyasker.cn: could not connect to host
-skyasker.com: did not receive HSTS header
+skyasker.com: could not connect to host
skybloom.com: could not connect to host
skybound.link: did not receive HSTS header
skyflix.me: could not connect to host
@@ -13660,6 +14789,7 @@ skylocker.nl: could not connect to host
skyoy.com: did not receive HSTS header
skypeassets.com: could not connect to host
skypoker.com: could not connect to host
+skyris.co: could not connect to host
skyrunners.ch: could not connect to host
skytec.host: did not receive HSTS header
skyvault.io: could not connect to host
@@ -13667,7 +14797,6 @@ skyveo.ml: did not receive HSTS header
skyway.capital: did not receive HSTS header
skyworldserver.ddns.net: could not connect to host
sl1pkn07.wtf: max-age too low: 2592000
-slapen17.nl: could not connect to host
slaps.be: could not connect to host
slash-dev.de: did not receive HSTS header
slash64.co.uk: could not connect to host
@@ -13685,32 +14814,32 @@ sleep10.com: could not connect to host
sleepstar.com.mt: did not receive HSTS header
sliceone.com: could not connect to host
slicketl.com: did not receive HSTS header
+slicss.com: could not connect to host
slightfuture.click: could not connect to host
slightfuture.com: did not receive HSTS header
+slimk1nd.nl: could not connect to host
slimmerbouwen.be: did not receive HSTS header
slingo.com: did not receive HSTS header
slix.io: could not connect to host
sln.cloud: could not connect to host
-slo-net.net: could not connect to host
slope.haus: could not connect to host
+slotboss.co.uk: did not receive HSTS header
slovakiana.sk: did not receive HSTS header
slovenskycestovatel.sk: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
slovoice.org: could not connect to host
slowfood.es: did not receive HSTS header
slowsociety.org: could not connect to host
slse.ca: max-age too low: 0
-sluimann.de: could not connect to host
sluplift.com: did not receive HSTS header
slycurity.de: could not connect to host
slytech.ch: could not connect to host
smallcdn.rocks: could not connect to host
smallchat.nl: could not connect to host
-smalldata.tech: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-smallpath.me: could not connect to host
smallplanet.ch: did not receive HSTS header
smallshopit.com: did not receive HSTS header
smart-mirror.de: did not receive HSTS header
smart-ov.nl: could not connect to host
+smartass.space: could not connect to host
smartbiz.vn: could not connect to host
smartboleta.com: did not receive HSTS header
smartbuyelectric.com: could not connect to host
@@ -13723,21 +14852,22 @@ smartlend.se: [Exception... "Component returned failure code: 0x80004005 (NS_ERR
smartofficesandsmarthomes.com: did not receive HSTS header
smartofficeusa.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
smartphone.continental.com: could not connect to host
+smartrade.tech: did not receive HSTS header
smartrak.co.nz: did not receive HSTS header
smartshoppers.es: did not receive HSTS header
+smartwank.com: could not connect to host
smartwritingservice.com: could not connect to host
smcbox.com: did not receive HSTS header
smdev.fr: could not connect to host
smet.us: could not connect to host
smexpt.com: did not receive HSTS header
smi-a.me: could not connect to host
-smiatek.name: could not connect to host
smileawei.com: could not connect to host
-smimea.com: could not connect to host
+smimea.com: did not receive HSTS header
smirkingwhorefromhighgarden.pro: could not connect to host
-smith.is: did not receive HSTS header
+smithchow.com: could not connect to host
smittix.co.uk: did not receive HSTS header
-smkn1lengkong.sch.id: could not connect to host
+smkn1lengkong.sch.id: did not receive HSTS header
smksi2.com: could not connect to host
smksultanismail2.com: could not connect to host
sml.lc: could not connect to host
@@ -13752,19 +14882,18 @@ smsben.cn: did not receive HSTS header
smsben.com: did not receive HSTS header
smspodmena.ru: could not connect to host
smtp.bz: did not receive HSTS header
+smtpdev.com: could not connect to host
smuhelper.cn: could not connect to host
smusg.com: could not connect to host
snafarms.com: did not receive HSTS header
snailing.org: could not connect to host
snake.dog: could not connect to host
snakehosting.dk: did not receive HSTS header
-snapappts.com: could not connect to host
-snaptools.io: could not connect to host
snapworks.net: did not receive HSTS header
snarf.in: could not connect to host
-sneak.berlin: did not receive HSTS header
sneaker.date: could not connect to host
sneed.company: could not connect to host
+sneezry.com: did not receive HSTS header
snekchat.moe: could not connect to host
snelwerk.be: could not connect to host
sng.my: could not connect to host
@@ -13774,20 +14903,18 @@ sniderman.xyz: could not connect to host
snip.host: could not connect to host
snippet.host: could not connect to host
snod.land: did not receive HSTS header
+snoot.club: did not receive HSTS header
snoozedds.com: max-age too low: 600
snoqualmiefiber.org: could not connect to host
snovey.com: could not connect to host
+snow-online.de: could not connect to host
snowdy.eu: could not connect to host
snowdy.link: could not connect to host
-snowplane.net: did not receive HSTS header
-snowraven.de: did not receive HSTS header
snowyluma.com: could not connect to host
-snrat.com: did not receive HSTS header
so-healthy.co.uk: did not receive HSTS header
sobabox.ru: could not connect to host
sobinski.pl: did not receive HSTS header
soboleva-pr.com.ua: could not connect to host
-sobreporcentagem.com: did not receive HSTS header
socal-babes.com: could not connect to host
soccergif.com: could not connect to host
soci.ml: could not connect to host
@@ -13799,20 +14926,20 @@ socialfacecook.com: could not connect to host
socialgrowing.cl: did not receive HSTS header
socialhead.io: could not connect to host
socialhub.com: did not receive HSTS header
-socialprize.com: could not connect to host
+socialprize.com: did not receive HSTS header
socialspirit.com.br: did not receive HSTS header
socialworkout.com: could not connect to host
socialworkout.net: could not connect to host
socialworkout.org: could not connect to host
socialworkout.tv: could not connect to host
-socketize.com: could not connect to host
+socketize.com: did not receive HSTS header
sockeye.cc: could not connect to host
socomponents.co.uk: could not connect to host
sodacore.com: could not connect to host
soe-server.com: could not connect to host
softballsavings.com: did not receive HSTS header
softclean.pt: did not receive HSTS header
-softplaynation.co.uk: could not connect to host
+softplaynation.co.uk: did not receive HSTS header
sogeek.me: could not connect to host
sogravatas.net.br: could not connect to host
sojingle.net: could not connect to host
@@ -13825,7 +14952,6 @@ soldbygold.net: did not receive HSTS header
solentes.com.br: could not connect to host
solidfuelappliancespares.co.uk: did not receive HSTS header
solidimage.com.br: could not connect to host
-solidtuesday.com: could not connect to host
solidus.systems: did not receive HSTS header
solidwebnetworks.co.uk: did not receive HSTS header
solinter.com.br: did not receive HSTS header
@@ -13834,10 +14960,12 @@ soljem.com: did not receive HSTS header
soll-i.ch: did not receive HSTS header
solosmusic.xyz: could not connect to host
solsystems.ru: did not receive HSTS header
+solus-project.com: could not connect to host
solutive.fi: did not receive HSTS header
solymar.co: could not connect to host
some.rip: max-age too low: 6307200
somebodycares.org: did not receive HSTS header
+somepills.com: did not receive HSTS header
someshit.xyz: could not connect to host
something-else.cf: could not connect to host
somethingnew.xyz: could not connect to host
@@ -13848,18 +14976,17 @@ sonerezh.bzh: did not receive HSTS header
sonialive.com: did not receive HSTS header
sonic.network: did not receive HSTS header
sonicrainboom.rocks: could not connect to host
+sonix.dk: could not connect to host
sonja-daniels.com: could not connect to host
sonja-kowa.de: could not connect to host
sonyforum.no: did not receive HSTS header
soobi.org: did not receive HSTS header
soondy.com: did not receive HSTS header
-soontm.de: could not connect to host
soply.com: could not connect to host
soporte.cc: could not connect to host
sorenam.com: could not connect to host
sorensen-online.com: could not connect to host
-sorex.photo: max-age too low: 0
-sorincocorada.ro: could not connect to host
+sorex.photo: did not receive HSTS header
sorinmuntean.ro: did not receive HSTS header
sortaweird.net: could not connect to host
sortingwizard.com: could not connect to host
@@ -13867,21 +14994,20 @@ soruly.moe: did not receive HSTS header
sos.de: did not receive HSTS header
sosaka.ml: could not connect to host
sosecu.red: could not connect to host
-sosiolog.com: could not connect to host
+sosiolog.com: did not receive HSTS header
sosko.in.rs: could not connect to host
sotiran.com: did not receive HSTS header
sotor.de: did not receive HSTS header
soucorneteiro.com.br: could not connect to host
-sougi-review.top: did not receive HSTS header
soulcraft.bz: could not connect to host
soulema.com: could not connect to host
soulfulglamour.uk: could not connect to host
soulsteer.com: could not connect to host
soundbytemedia.com: did not receive HSTS header
+soundedj.com.br: could not connect to host
soundforsound.co.uk: did not receive HSTS header
soundgasm.net: could not connect to host
soundsecurity.io: could not connect to host
-souqtajmeel.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
sourcecode.love: could not connect to host
sourcelair.com: did not receive HSTS header
sourcitec.com: did not receive HSTS header
@@ -13889,6 +15015,7 @@ sous-surveillance.net: could not connect to host
southcoastkitesurf.co.uk: did not receive HSTS header
southcoastswords.com: did not receive HSTS header
southernjamusa.com: did not receive HSTS header
+southernlights.xyz: could not connect to host
southgale.condos: could not connect to host
southside-crew.club: could not connect to host
southworcestershiregpservices.co.uk: could not connect to host
@@ -13900,7 +15027,6 @@ souyar.us: could not connect to host
sovereignshare.com: could not connect to host
sown.dyndns.org: could not connect to host
sowncloud.de: could not connect to host
-soz6.com: did not receive HSTS header
sp.rw: could not connect to host
spacecafe.org: did not receive HSTS header
spacedust.xyz: could not connect to host
@@ -13918,6 +15044,8 @@ sparelib.com: max-age too low: 3650
spark.team: could not connect to host
sparkbase.cn: could not connect to host
sparklingsparklers.com: did not receive HSTS header
+sparkresearch.net: could not connect to host
+sparkreviewcenter.com: could not connect to host
sparkwood.org: could not connect to host
sparmedo.de: did not receive HSTS header
sparsa.army: could not connect to host
@@ -13925,13 +15053,12 @@ sparta-trade.com: could not connect to host
spartantheatre.org: could not connect to host
spauted.com: could not connect to host
spawn.cz: could not connect to host
-spcx.eu: did not receive HSTS header
+spcx.eu: could not connect to host
spdysync.com: could not connect to host
specialedesigns.com: could not connect to host
specialistnow.com.au: did not receive HSTS header
spectreattack.com: did not receive HSTS header
speculor.net: could not connect to host
-spedition-transport-umzug.de: could not connect to host
spedplus.com.br: did not receive HSTS header
speed-mailer.com: could not connect to host
speedcounter.net: could not connect to host
@@ -13942,6 +15069,7 @@ speedyprep.com: did not receive HSTS header
speidel.com.tr: did not receive HSTS header
spencerbaer.com: could not connect to host
spendwise.com.au: could not connect to host
+sperohub.com: could not connect to host
sperohub.io: could not connect to host
sperohub.lt: did not receive HSTS header
sphinx.network: could not connect to host
@@ -13949,19 +15077,23 @@ spicydog.tk: could not connect to host
spicywombat.com: could not connect to host
spiegels.nl: did not receive HSTS header
spielcasinos.com: did not receive HSTS header
-spikeykc.me: did not receive HSTS header
+spikeykc.me: could not connect to host
+spillersfamily.net: could not connect to host
+spillmaker.no: did not receive HSTS header
spilsbury.io: could not connect to host
spineandscoliosis.com: did not receive HSTS header
spinner.dnshome.de: could not connect to host
+spinor.im: could not connect to host
spirit-dev.net: max-age too low: 0
spirit-hunters-germany.de: did not receive HSTS header
spiritbionic.ro: could not connect to host
spiritfanfics.com: did not receive HSTS header
-spisoggrin.dk: did not receive HSTS header
+spisoggrin.dk: could not connect to host
spitefultowel.com: did not receive HSTS header
spitfireuav.com: could not connect to host
spititout.it: could not connect to host
split.is: could not connect to host
+splunk.zone: could not connect to host
spokonline.com: could not connect to host
spon.cz: did not receive HSTS header
sponsorowani.pl: did not receive HSTS header
@@ -13969,6 +15101,7 @@ sponsortobias.com: could not connect to host
spontex.org: did not receive HSTS header
spookyinternet.com: could not connect to host
sporara.com: did not receive HSTS header
+sport-socken.net: did not receive HSTS header
sport247.bet: max-age too low: 2592000
sportchirp-internal.azurewebsites.net: did not receive HSTS header
sportflash.info: did not receive HSTS header
@@ -13978,19 +15111,19 @@ sportingoods.com.br: could not connect to host
sportressofblogitude.com: did not receive HSTS header
sportscollection.com.br: could not connect to host
sportsmanadvisor.com: could not connect to host
-sportwette.eu: did not receive HSTS header
spot-events.com: could not connect to host
spotifyripper.tk: could not connect to host
spotlightsrule.com: could not connect to host
spotlightsrule.ddns.net: could not connect to host
+spr.id.au: could not connect to host
spreadsheets.google.com: did not receive HSTS header (error ignored - included regardless)
spresso.me: did not receive HSTS header
+spricknet.de: could not connect to host
sprigings.com: did not receive HSTS header
springsoffthegrid.com: could not connect to host
sprint.ml: did not receive HSTS header
sprk.fitness: did not receive HSTS header
sproing.ca: max-age too low: 0
-spron.in: could not connect to host
sproutconnections.com: could not connect to host
sprueche-zum-valentinstag.de: could not connect to host
sprueche-zur-geburt.info: could not connect to host
@@ -13999,10 +15132,12 @@ sprueche-zur-konfirmation.de: did not receive HSTS header
sprutech.de: could not connect to host
spykedigital.com: could not connect to host
sqetsa.com: did not receive HSTS header
+sqkaccountancy.co.uk: did not receive HSTS header
sqshq.de: did not receive HSTS header
squaddraft.com: did not receive HSTS header
+squadlinx.com: did not receive HSTS header
square.gs: could not connect to host
-squarelab.it: did not receive HSTS header
+squareonebgc.com.ph: could not connect to host
squatldf.org: could not connect to host
squids.space: could not connect to host
squirtlesbians.net: could not connect to host
@@ -14020,36 +15155,36 @@ srvonfire.com: could not connect to host
ss-free.net: could not connect to host
ss-x.ru: could not connect to host
ss.wtf: could not connect to host
+ssc8689.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+ssc8689.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
ssco.xyz: did not receive HSTS header
ssconn.com: could not connect to host
ssh.nu: could not connect to host
sshool.at: could not connect to host
ssl.panoramio.com: could not connect to host
ssl.rip: could not connect to host
-sslpoint.com: did not receive HSTS header
sslzilla.de: did not receive HSTS header
ssn1.ru: did not receive HSTS header
-sspanda.com: could not connect to host
+sspanda.com: did not receive HSTS header
ssrvpn.tech: could not connect to host
sss3s.com: could not connect to host
+sstewartgallus.com: could not connect to host
ssworld.ga: could not connect to host
staack.com: could not connect to host
stabletoken.com: could not connect to host
staceyhankeinc.com: did not receive HSTS header
stackfiles.io: could not connect to host
stackhub.cc: could not connect to host
+stacktile.io: could not connect to host
stadionmanager.com: could not connect to host
stadjerspasonline.nl: could not connect to host
-stadtbauwerk.at: did not receive HSTS header
-stadtbuecherei-bad-wurzach.de: max-age too low: 0
stadtgartenla.com: could not connect to host
staffjoy.com: did not receive HSTS header
staffjoystaging.com: could not connect to host
-stagingjobshq.com: did not receive HSTS header
-stahl.xyz: could not connect to host
-stalder.work: could not connect to host
+stagingjobshq.com: could not connect to host
+stahl.xyz: did not receive HSTS header
+stakestrategy.com: could not connect to host
stalkerhispano.com: max-age too low: 0
-stalkerteam.pl: did not receive HSTS header
stalkthe.net: could not connect to host
stall-zur-linde.de: did not receive HSTS header
stalschermer.nl: could not connect to host
@@ -14061,17 +15196,20 @@ standingmist.com: did not receive HSTS header
standoutbooks.com: did not receive HSTS header
standuppaddlesports.com.au: did not receive HSTS header
stannahtrapliften.nl: did not receive HSTS header
+star-citizen.wiki: did not receive HSTS header
star-killer.net: could not connect to host
star-stuff.de: did not receive HSTS header
-star.do: did not receive HSTS header
+star.do: could not connect to host
starandshield.com: did not receive HSTS header
starapple.nl: did not receive HSTS header
starcafe.me: could not connect to host
stardeeps.net: max-age too low: 0
+stardust-entertainments.co.uk: did not receive HSTS header
starease.net: could not connect to host
starfeeling.net: could not connect to host
stargatepartners.com: did not receive HSTS header
starklane.com: max-age too low: 300
+starlightentertainmentdevon.co.uk: did not receive HSTS header
starmusic.ga: could not connect to host
starplatinum.jp: could not connect to host
starquake.nl: could not connect to host
@@ -14082,7 +15220,9 @@ startuplevel.com: could not connect to host
startuponcloud.com: max-age too low: 2678400
startuppeople.co.uk: could not connect to host
startupum.ru: could not connect to host
+starwatches.eu: could not connect to host
stash.ai: did not receive HSTS header
+stassi.ch: did not receive HSTS header
state-of-body-and-mind.com: could not connect to host
state-sponsored-actors.net: could not connect to host
statementinsertsforless.com: did not receive HSTS header
@@ -14101,7 +15241,7 @@ status-sprueche.de: could not connect to host
status.coffee: could not connect to host
statusbot.io: could not connect to host
statuschecks.net: could not connect to host
-stavebnice.net: did not receive HSTS header
+stavebnice.net: could not connect to host
staxflax.tk: could not connect to host
stayokhotelscdc-mailing.com: could not connect to host
stcable.net: did not receive HSTS header
@@ -14109,14 +15249,13 @@ stcomex.com: did not receive HSTS header
stdev.org: could not connect to host
steamhours.com: could not connect to host
steampunkrobot.com: did not receive HSTS header
-stedbg.net: could not connect to host
steelbea.ms: could not connect to host
steelrhino.co: could not connect to host
steem.io: did not receive HSTS header
steenackers.be: did not receive HSTS header
stefanweiser.de: did not receive HSTS header
-stefany.eu: could not connect to host
steffi-in-australien.com: could not connect to host
+stellarvale.net: could not connect to host
stem.is: did not receive HSTS header
stepbystep3d.com: did not receive HSTS header
steph-autoecole.ch: did not receive HSTS header
@@ -14131,25 +15270,26 @@ sterjoski.com: did not receive HSTS header
stesti.cz: could not connect to host
stevechekblain.win: could not connect to host
stevengoodpaster.com: could not connect to host
+stevenkwan.me: could not connect to host
stevensheffey.me: could not connect to host
stevensononthe.net: did not receive HSTS header
-steventruesdell.com: could not connect to host
stewartremodelingadvantage.com: could not connect to host
stewonet.nl: did not receive HSTS header
stge.uk: could not connect to host
sticklerjs.org: could not connect to host
stickmy.cn: could not connect to host
stickswag.cf: could not connect to host
+stickswag.eu: could not connect to host
+stiffordacademy.org.uk: could not connect to host
stig.io: did not receive HSTS header
stiger.me: could not connect to host
stigroom.com: could not connect to host
-stigviewer.com: did not receive HSTS header
stijnbelmans.be: max-age too low: 604800
+stikkie.me: could not connect to host
stilettomoda.com.br: could not connect to host
stillblackhat.id: could not connect to host
stillyarts.com: did not receive HSTS header
stinkytrashhound.com: could not connect to host
-stirling.co: could not connect to host
stirlingpoon.net: could not connect to host
stirlingpoon.xyz: could not connect to host
stitthappens.com: did not receive HSTS header
@@ -14159,8 +15299,10 @@ stkbn.com: could not connect to host
stkeverneparishcouncil.org.uk: did not receive HSTS header
stl.news: max-age too low: 0
stlucasmuseum.org: did not receive HSTS header
+stm32f4.jp: could not connect to host
stmbgr.com: could not connect to host
stn.me.uk: did not receive HSTS header
+stnl.de: could not connect to host
stockseyeserum.com: could not connect to host
stocktrade.de: could not connect to host
stoffe-monster.de: did not receive HSTS header
@@ -14170,12 +15312,16 @@ stoick.me: could not connect to host
stoinov.com: could not connect to host
stole-my.bike: could not connect to host
stole-my.tv: could not connect to host
+stolkschepen.nl: did not receive HSTS header
stomadental.com: did not receive HSTS header
stonecutterscommunity.com: could not connect to host
stonemain.eu: could not connect to host
stonemanbrasil.com.br: could not connect to host
+stonewuu.com: could not connect to host
stopakwardhandshakes.org: could not connect to host
+stopbreakupnow.org: could not connect to host
stopwoodfin.org: could not connect to host
+storageshedsnc.com: did not receive HSTS header
storbritannien.guide: could not connect to host
store-host.com: did not receive HSTS header
store10.de: could not connect to host
@@ -14186,20 +15332,23 @@ storiesofhealth.org: did not receive HSTS header
stormhub.org: could not connect to host
stormwatcher.org: could not connect to host
stormyyd.com: max-age too low: 0
+storytea.top: did not receive HSTS header
stpatricksguild.com: did not receive HSTS header
stqry.com: did not receive HSTS header
str0.at: did not receive HSTS header
strangeplace.net: did not receive HSTS header
-strangescout.me: did not receive HSTS header
+strangescout.me: could not connect to host
strasweb.fr: did not receive HSTS header
strbt.de: could not connect to host
strchr.com: did not receive HSTS header
stream-ing.xyz: could not connect to host
stream.pub: could not connect to host
+streamblur.net: could not connect to host
streamdesk.ca: did not receive HSTS header
streamer.tips: did not receive HSTS header
streamingeverywhere.com: could not connect to host
streamingmagazin.de: could not connect to host
+streamlineautogroup.com: could not connect to host
streampanel.net: did not receive HSTS header
streams.dyndns.org: could not connect to host
streamthemeeting.com: did not receive HSTS header
@@ -14207,16 +15356,18 @@ streamzilla.com: did not receive HSTS header
strehl.tk: could not connect to host
strelitzia02.com: could not connect to host
stressfreehousehold.com: could not connect to host
+stretchpc.com: could not connect to host
strictlysudo.com: could not connect to host
strife.tk: could not connect to host
strila.me: could not connect to host
striptizer.tk: could not connect to host
-strobeto.de: did not receive HSTS header
+strming.com: could not connect to host
stroeercrm.de: could not connect to host
strongest-privacy.com: could not connect to host
struxureon.com: did not receive HSTS header
stuartbaxter.co: could not connect to host
stubbings.eu: could not connect to host
+stucorweb.com: could not connect to host
student-scientist.org: did not receive HSTS header
student.andover.edu: could not connect to host
studentrdh.com: did not receive HSTS header
@@ -14229,23 +15380,26 @@ studio-panic.com: did not receive HSTS header
studiocn.cn: could not connect to host
studiodoprazer.com.br: could not connect to host
studiozelden.com: did not receive HSTS header
+studisys.net: could not connect to host
+studlan.no: could not connect to host
studport.rv.ua: max-age too low: 604800
studyabroadstation.com: could not connect to host
studybay.com: could not connect to host
studydrive.net: did not receive HSTS header
studyhub.cf: did not receive HSTS header
studying-neet.com: could not connect to host
+studytale.com: could not connect to host
stuff-fibre.co.nz: did not receive HSTS header
stugb.de: did not receive HSTS header
stumeta2018.de: could not connect to host
stupidstatetricks.com: could not connect to host
+sturbi.de: did not receive HSTS header
sturbock.me: did not receive HSTS header
sturdio.com.br: could not connect to host
sturge.co.uk: did not receive HSTS header
stuudium.life: could not connect to host
stylenda.com: could not connect to host
stylle.me: could not connect to host
-styloeart.com: could not connect to host
stytt.com: did not receive HSTS header
suaraangin.com: could not connect to host
suareforma.com: could not connect to host
@@ -14254,6 +15408,8 @@ subbing.work: could not connect to host
subdimension.org: could not connect to host
subeesu.com: could not connect to host
subhacker.net: could not connect to host
+subjektzentrisch.de: could not connect to host
+sublevel.net: did not receive HSTS header
subrain.com: did not receive HSTS header
subrosa.io: could not connect to host
subsys.no: did not receive HSTS header
@@ -14267,7 +15423,6 @@ succubus.xxx: could not connect to host
suche.org: could not connect to host
suchprogrammer.net: did not receive HSTS header
sudo.im: could not connect to host
-sudo.li: did not receive HSTS header
sudosu.fr: could not connect to host
suempresa.cloud: could not connect to host
suffts.de: could not connect to host
@@ -14281,6 +15436,7 @@ summer.ga: could not connect to host
summitbankofkc.com: did not receive HSTS header
summitmasters.net: did not receive HSTS header
sumoscout.de: did not receive HSTS header
+sun-wellness-online.com.vn: did not receive HSTS header
sun.re: could not connect to host
suncountrymarine.com: did not receive HSTS header
sundaycooks.com: max-age too low: 2592000
@@ -14290,6 +15446,7 @@ sunfireshop.com.br: could not connect to host
sunlandsg.vn: did not receive HSTS header
sunnyfruit.ru: could not connect to host
sunriseafricarelief.com: did not receive HSTS header
+sunset.im: did not receive HSTS header
sunshinepress.org: could not connect to host
sunxchina.com: could not connect to host
sunyanzi.tk: could not connect to host
@@ -14305,20 +15462,21 @@ super-ripped-power.com: could not connect to host
super-slim-coffee.com: could not connect to host
superbabysitting.ch: could not connect to host
superbike.tw: could not connect to host
+superbowlkneel.com: could not connect to host
superbshare.com: could not connect to host
supercastlessouthsydney.com.au: could not connect to host
supercreepsvideo.com: did not receive HSTS header
superiorfloridavacation.com: could not connect to host
superklima.ro: did not receive HSTS header
-superlandnetwork.de: did not receive HSTS header
+superlandnetwork.de: could not connect to host
superlentes.com.br: could not connect to host
+supermarx.nl: could not connect to host
supernovabrasil.com.br: did not receive HSTS header
supernt.lt: could not connect to host
superpase.com: could not connect to host
supersahnetorten.de: could not connect to host
supersalescontest.nl: did not receive HSTS header
superschnappchen.de: could not connect to host
-supersec.es: could not connect to host
supersecurefancydomain.com: could not connect to host
supertramp-dafonseca.com: did not receive HSTS header
superuser.fi: could not connect to host
@@ -14335,12 +15493,16 @@ surfeasy.com: did not receive HSTS header
surfone-leucate.com: did not receive HSTS header
surgiclinic.gr: did not receive HSTS header
surkatty.org: did not receive HSTS header
+surveyinstrumentsales.com: max-age too low: 84600
+survivalistplanet.com: could not connect to host
survivebox.fr: did not receive HSTS header
susastudentenjobs.de: could not connect to host
susconam.org: could not connect to host
suseasky.com: did not receive HSTS header
+sushi.roma.it: did not receive HSTS header
sushifrick.de: could not connect to host
sushiwereld.be: did not receive HSTS header
+susoccm.org: did not receive HSTS header
suspiciousdarknet.xyz: could not connect to host
sussexwebdesigns.com: could not connect to host
sussexwebsites.info: could not connect to host
@@ -14348,22 +15510,24 @@ sustainability.gov: did not receive HSTS header
suts.co.uk: could not connect to host
suttonbouncycastles.co.uk: could not connect to host
suvidhaapay.com: could not connect to host
-suzukikazuki.com: max-age too low: 0
+suzukikazuki.com: did not receive HSTS header
suzukikenichi.com: did not receive HSTS header
svadobkajuvi.sk: did not receive HSTS header
svarovani.tk: could not connect to host
svatba-frantovi.cz: could not connect to host
sve-hosting.nl: could not connect to host
-svenluijten.com: did not receive HSTS header
+svenbacia.me: could not connect to host
svenskacasino.com: did not receive HSTS header
svenskaservern.se: could not connect to host
svetdrzaku.cz: did not receive HSTS header
svetjakonadlani.cz: did not receive HSTS header
+svetzitrka.cz: did not receive HSTS header
sviz.pro: could not connect to host
svj-stochovska.cz: could not connect to host
svjvn.cz: could not connect to host
swacp.com: could not connect to host
swaggerdile.com: could not connect to host
+swagsocial.net: could not connect to host
swaleacademiestrust.org.uk: max-age too low: 2592000
swallsoft.co.uk: could not connect to host
swallsoft.com: could not connect to host
@@ -14382,7 +15546,9 @@ swiftcrypto.com: could not connect to host
swiftpk.net: could not connect to host
swiggy.com: did not receive HSTS header
swimming.ca: did not receive HSTS header
+swingular.com: could not connect to host
swissentreprises.ch: could not connect to host
+swissfreshaircan.com: could not connect to host
swisstranslate.ch: did not receive HSTS header
swisstranslate.fr: did not receive HSTS header
swisswebhelp.ch: could not connect to host
@@ -14393,22 +15559,21 @@ swordfighting.net: could not connect to host
swu.party: could not connect to host
sx3.no: could not connect to host
sxbk.pw: could not connect to host
-sy-anduril.de: did not receive HSTS header
syam.cc: could not connect to host
+syamuwatching.xyz: could not connect to host
sychov.pro: could not connect to host
sydgrabber.tk: could not connect to host
sykl.us: could not connect to host
sylvaincombe.net: could not connect to host
sylvangarden.org: could not connect to host
-sylvanorder.com: could not connect to host
+sylvanorder.com: did not receive HSTS header
symetria.io: max-age too low: 2592000
-synackr.com: did not receive HSTS header
+synackr.com: could not connect to host
synapticconsulting.co.uk: could not connect to host
syncaddict.net: could not connect to host
syncappate.com: could not connect to host
syncclinicalstudy.com: could not connect to host
syncer.jp: did not receive HSTS header
-synchrocube.com: could not connect to host
synchtu.be: could not connect to host
syncmylife.net: could not connect to host
syncserve.net: did not receive HSTS header
@@ -14431,7 +15596,9 @@ syss.de: did not receive HSTS header
systea.net: could not connect to host
system-online.cz: did not receive HSTS header
systemd.me: could not connect to host
+sytk.me: could not connect to host
syy.hk: did not receive HSTS header
+szagun.net: did not receive HSTS header
szaszm.tk: could not connect to host
szerbnyelvkonyv.hu: could not connect to host
szlovaknyelv.hu: could not connect to host
@@ -14450,7 +15617,7 @@ taabe.xyz: could not connect to host
taartenfeesies.nl: did not receive HSTS header
tab.watch: did not receive HSTS header
taberu-fujitsubo.com: did not receive HSTS header
-tabino.top: did not receive HSTS header
+tabhui.com: did not receive HSTS header
tabitatsu.jp: did not receive HSTS header
tabla-periodica.com: could not connect to host
tachyonapp.com: could not connect to host
@@ -14462,21 +15629,21 @@ tadigitalstore.com: could not connect to host
tafoma.com: did not receive HSTS header
tageau.com: could not connect to host
tagesmutter-in-bilm.de: did not receive HSTS header
+tagesmutter-zwitscherlinge.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
tahakomat.cz: could not connect to host
tahf.net: could not connect to host
+tai-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+tai-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
taichi-jade.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
taidu.news: could not connect to host
tailandfur.com: did not receive HSTS header
tailify.com: did not receive HSTS header
-tailpuff.net: did not receive HSTS header
tails.com.ar: could not connect to host
taim.io: could not connect to host
takebackyourstate.com: could not connect to host
takebackyourstate.net: could not connect to host
takebackyourstate.org: could not connect to host
takebonus.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-takedownthissite.com: could not connect to host
-takemoto-ped.com: could not connect to host
takinet.kr: could not connect to host
takusan.ru: could not connect to host
talenthero.io: did not receive HSTS header
@@ -14492,10 +15659,10 @@ talktwincities.com: could not connect to host
tallr.se: could not connect to host
tallshoe.com: could not connect to host
talsi.eu: could not connect to host
-tam7t.com: did not receive HSTS header
-tamaraboutique.com: could not connect to host
+tam7t.com: could not connect to host
tamersunion.org: did not receive HSTS header
tamex.xyz: could not connect to host
+tamriel-rebuilt.org: could not connect to host
tandarts-haarlem.nl: did not receive HSTS header
tandblekningidag.com: could not connect to host
tangerine.ga: could not connect to host
@@ -14512,11 +15679,12 @@ tantotiempo.de: did not receive HSTS header
tanze-jetzt.de: could not connect to host
taotuba.net: did not receive HSTS header
taozj.org: did not receive HSTS header
+tapakgram.com: did not receive HSTS header
tapestries.tk: could not connect to host
tapfinder.ca: could not connect to host
tapka.cz: did not receive HSTS header
tappublisher.com: did not receive HSTS header
-taranis.re: could not connect to host
+tarantul.org.ua: could not connect to host
taravancil.com: did not receive HSTS header
tarek.link: could not connect to host
targaryen.house: could not connect to host
@@ -14526,7 +15694,7 @@ tartaros.fi: could not connect to host
taskstats.com: could not connect to host
tasmansecurity.com: could not connect to host
tassup.com: could not connect to host
-tasta.ro: did not receive HSTS header
+tasta.ro: could not connect to host
tasticfilm.com: did not receive HSTS header
tastyyy.co: could not connect to host
tasyacherry-anal.com: could not connect to host
@@ -14555,7 +15723,7 @@ tcao.info: could not connect to host
tcby45.xyz: could not connect to host
tcl.ath.cx: did not receive HSTS header
tcp.expert: did not receive HSTS header
-tcspartner.net: did not receive HSTS header
+tcptun.com: could not connect to host
tcwebvn.com: could not connect to host
tdelmas.eu: could not connect to host
tdelmas.ovh: could not connect to host
@@ -14579,14 +15747,14 @@ teambeoplay.co.uk: did not receive HSTS header
teamblueridge.org: could not connect to host
teamdaylo.xyz: could not connect to host
teamhood.io: did not receive HSTS header
+teamnetsol.com: did not receive HSTS header
teampoint.cz: could not connect to host
teams.microsoft.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
teamsocial.co: did not receive HSTS header
-teamup.rocks: did not receive HSTS header
teamx-gaming.de: could not connect to host
teamzeus.cz: could not connect to host
teaparty.id: could not connect to host
-tearoy.faith: did not receive HSTS header
+tearoy.faith: could not connect to host
teasenetwork.com: could not connect to host
tebieer.com: could not connect to host
tech-blog.fr: did not receive HSTS header
@@ -14611,25 +15779,30 @@ techmasters.andover.edu: could not connect to host
techmatehq.com: could not connect to host
technicalforensic.com: could not connect to host
technicalpenguins.com: did not receive HSTS header
+technicalsystemsprocessing.com: did not receive HSTS header
+techniclab.net: could not connect to host
techniclab.org: could not connect to host
techniclab.ru: could not connect to host
technikrom.org: did not receive HSTS header
technogroup.cz: did not receive HSTS header
technosavvyport.com: did not receive HSTS header
technosuport.com: did not receive HSTS header
+technoswag.ca: could not connect to host
technotonic.com.au: did not receive HSTS header
techpointed.com: could not connect to host
techpro.net.br: did not receive HSTS header
techproud.com: did not receive HSTS header
techreview.link: could not connect to host
-techtoy.store: could not connect to host
+techtoy.store: did not receive HSTS header
techtrackerpro.com: could not connect to host
techtraveller.com.au: did not receive HSTS header
+techtuts.info: could not connect to host
techunit.org: could not connect to host
tecit.ch: could not connect to host
tecnidev.com: could not connect to host
tecnimotos.com: did not receive HSTS header
tecnogaming.com: did not receive HSTS header
+tecnologino.com: could not connect to host
tecture.de: did not receive HSTS header
tedovo.com: could not connect to host
tedxkmitl.com: could not connect to host
@@ -14638,6 +15811,7 @@ teedb.de: could not connect to host
teehaus-shila.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
teenerotic.net: could not connect to host
teeplelaw.com: did not receive HSTS header
+teesypeesy.com: did not receive HSTS header
tefl.io: did not receive HSTS header
tegelsensanitaironline.nl: did not receive HSTS header
tehotuotanto.net: did not receive HSTS header
@@ -14648,9 +15822,9 @@ teknologi.or.id: max-age too low: 36000
teknotes.co.uk: could not connect to host
tekshrek.com: did not receive HSTS header
teksuperior.com: could not connect to host
-tektoria.de: could not connect to host
-tekuteku.jp: could not connect to host
+tektoria.de: did not receive HSTS header
tel-dithmarschen.de: did not receive HSTS header
+tele-assistance.ch: could not connect to host
teleallarme.ch: could not connect to host
telecharger-itunes.com: could not connect to host
telecharger-open-office.com: could not connect to host
@@ -14658,6 +15832,7 @@ telecharger-winrar.com: could not connect to host
telefisk.org: did not receive HSTS header
telefonnummer.online: could not connect to host
telefonogratuito.com: did not receive HSTS header
+telefonsinyalguclendirici.com: did not receive HSTS header
telefoonnummerinfo.nl: could not connect to host
telekollektiv.org: could not connect to host
telescam.com: could not connect to host
@@ -14665,7 +15840,7 @@ teleshop.be: could not connect to host
teletechnology.in: did not receive HSTS header
teletra.ru: could not connect to host
telfordwhitehouse.co.uk: did not receive HSTS header
-tellingua.com: did not receive HSTS header
+tellingua.com: could not connect to host
teltonica.com: did not receive HSTS header
telugu4u.net: could not connect to host
temasa.net: did not receive HSTS header
@@ -14676,25 +15851,24 @@ tempo.co: did not receive HSTS header
tempodecolheita.com.br: could not connect to host
tempus-aquilae.de: could not connect to host
ten-cafe.com: could not connect to host
-tenberg.com: could not connect to host
tendertool.nl: could not connect to host
tendoryu-aikido.org: did not receive HSTS header
-tenerife-villas.com: did not receive HSTS header
+tenerife-villas.com: max-age too low: 2592000
tengu.cloud: could not connect to host
tenispopular.com: could not connect to host
+tenma.pro: could not connect to host
tenni.xyz: could not connect to host
tennisapp.org: could not connect to host
tennispensacola.com: could not connect to host
tensei-slime.com: did not receive HSTS header
tensionup.com: could not connect to host
+tent.io: could not connect to host
tentins.com: could not connect to host
teodio.cl: did not receive HSTS header
-teoleonie.com: did not receive HSTS header
teos.online: could not connect to host
teoskanta.fi: could not connect to host
teranga.ch: did not receive HSTS header
tercerapuertoaysen.cl: could not connect to host
-termino.eu: did not receive HSTS header
terra-x.net: could not connect to host
terra.by: did not receive HSTS header
terrax.berlin: could not connect to host
@@ -14702,16 +15876,22 @@ terrax.info: did not receive HSTS header
terrax.net: could not connect to host
terrazoo.de: did not receive HSTS header
teru.com.br: could not connect to host
+test-aankoop.be: did not receive HSTS header
+test-achats.be: did not receive HSTS header
test-dns.eu: could not connect to host
test02.dk: did not receive HSTS header
+testadren.com: could not connect to host
testadron.com: could not connect to host
testandroid.xyz: could not connect to host
testbawks.com: could not connect to host
testbirds.cz: could not connect to host
testbirds.sk: could not connect to host
testdomain.ovh: could not connect to host
+testi.info: max-age too low: 10518975
testnode.xyz: could not connect to host
+testosterone-complex.com: could not connect to host
testovaci.ml: could not connect to host
+testpornsite.com: could not connect to host
tetrafinancial-commercial-business-equipment-financing.com: did not receive HSTS header
tetrafinancial-energy-mining-equipment-financing.com: did not receive HSTS header
tetrafinancial-healthcare-medical-equipment-financing.com: did not receive HSTS header
@@ -14719,13 +15899,14 @@ tetrafinancial-manufacturing-industrial-equipment-financing.com: did not receive
tetrafinancial-news.com: did not receive HSTS header
tetrafinancial-technology-equipment-software-financing.com: did not receive HSTS header
tetramax.eu: did not receive HSTS header
-tetrarch.co: could not connect to host
tetsai.com: could not connect to host
teufelsystem.de: could not connect to host
teuniz.nl: did not receive HSTS header
texte-zur-taufe.de: did not receive HSTS header
textoplano.xyz: could not connect to host
textracer.dk: could not connect to host
+tezcam.tk: could not connect to host
+tf-network.de: did not receive HSTS header
tf2stadium.com: did not receive HSTS header
tfcoms-sp-tracker-client.azurewebsites.net: could not connect to host
tffans.com: could not connect to host
@@ -14737,33 +15918,31 @@ th-bl.de: did not receive HSTS header
th3nd.com: could not connect to host
thackert.myfirewall.org: could not connect to host
thagki9.com: did not receive HSTS header
-thai.land: did not receive HSTS header
+thai.land: could not connect to host
thaianthro.com: max-age too low: 0
thaigirls.xyz: could not connect to host
thaihostcool.com: did not receive HSTS header
-thailandpropertylisting.com: did not receive HSTS header
thailandpropertylistings.com: did not receive HSTS header
thalmann.fr: did not receive HSTS header
thalskarth.com: did not receive HSTS header
thatgudstuff.com: could not connect to host
thatpodcast.io: did not receive HSTS header
-thatvizsla.life: did not receive HSTS header
+thatvizsla.life: could not connect to host
the-construct.com: could not connect to host
the-delta.net.eu.org: could not connect to host
the-digitale.com: did not receive HSTS header
the-earth-yui.net: could not connect to host
the-finance-blog.com: could not connect to host
the-gist.io: could not connect to host
-the-paddies.de: did not receive HSTS header
+the-paddies.de: could not connect to host
the-sky-of-valkyries.com: could not connect to host
the.ie: max-age too low: 0
the420vape.org: could not connect to host
theamateurs.net: did not receive HSTS header
-theamp.com: did not receive HSTS header
+theamp.com: could not connect to host
theater.cf: could not connect to host
theavenuegallery.com: did not receive HSTS header
thebakingclass.com: max-age too low: 60
-thebarneystyle.com: did not receive HSTS header
thebasementguys.com: could not connect to host
thebeautifulmusic.net: did not receive HSTS header
thebeginningisnye.com: could not connect to host
@@ -14776,7 +15955,7 @@ thebuffalotavern.com: could not connect to host
thecandidforum.com: could not connect to host
thecapitalbank.com: did not receive HSTS header
thecharlestonwaldorf.com: did not receive HSTS header
-theciderlink.com.au: did not receive HSTS header
+theciderlink.com.au: could not connect to host
thecitizens.com: did not receive HSTS header
theclementinebutchers.com: could not connect to host
theclimbingunit.com: did not receive HSTS header
@@ -14786,11 +15965,15 @@ theclubjersey.com: did not receive HSTS header
thecodeninja.net: did not receive HSTS header
thecoffeehouse.xyz: could not connect to host
thecoffeepod.co.uk: did not receive HSTS header
+thecozycastle.com: did not receive HSTS header
thecskr.in: did not receive HSTS header
thecsw.com: did not receive HSTS header
+thecuriousdev.com: did not receive HSTS header
thedailyupvote.com: could not connect to host
thedarkartsandcrafts.com: could not connect to host
+thedebug.life: did not receive HSTS header
thedevilwearswibra.nl: did not receive HSTS header
+thedom.site: could not connect to host
thedominatorsclan.com: did not receive HSTS header
thedrinks.co: did not receive HSTS header
thedrop.pw: did not receive HSTS header
@@ -14798,13 +15981,14 @@ thedrunkencabbage.com: could not connect to host
thedystance.com: could not connect to host
theel0ja.info: did not receive HSTS header
theelitebuzz.com: could not connect to host
-theendofzion.com: could not connect to host
+theendofzion.com: did not receive HSTS header
theepankar.com: could not connect to host
theescapistswiki.com: could not connect to host
theevergreen.me: could not connect to host
theexpatriate.de: could not connect to host
theeyeopener.com: did not receive HSTS header
thefarbeyond.com: could not connect to host
+thefasterweb.com: did not receive HSTS header
thefootballanalyst.com: did not receive HSTS header
thefox.co: did not receive HSTS header
thefox.com.fr: could not connect to host
@@ -14819,18 +16003,16 @@ thegoldregister.co.uk: could not connect to host
thegraciousgourmet.com: did not receive HSTS header
thegreens.us: could not connect to host
thegreenvpn.com: could not connect to host
+thegym.org: did not receive HSTS header
thehiddenbay.cc: could not connect to host
thehiddenbay.eu: max-age too low: 0
-thehiddenbay.me: could not connect to host
+thehiddenbay.me: max-age too low: 0
thehiddenbay.net: could not connect to host
thehighersideclothing.com: did not receive HSTS header
thehistory.me: could not connect to host
-thehonorguard.org: did not receive HSTS header
thehoopsarchive.com: could not connect to host
theimagesalon.com: max-age too low: 43200
theinvisibletrailer.com: could not connect to host
-theitsage.com: did not receive HSTS header
-thej0lt.com: could not connect to host
thejobauction.com: did not receive HSTS header
thejserver.de: could not connect to host
thekrewserver.com: did not receive HSTS header
@@ -14841,13 +16023,14 @@ thelostyankee.com: could not connect to host
themadmechanic.net: could not connect to host
themanufacturingmarketingagency.com: could not connect to host
themarble.co: could not connect to host
+themaster.site: did not receive HSTS header
themathbehindthe.science: could not connect to host
themathematician.uk: could not connect to host
themeaudit.com: could not connect to host
+themenzentrisch.de: could not connect to host
themerchandiser.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
themesurgeons.net: could not connect to host
themicrocapital.com: could not connect to host
-themilanlife.com: could not connect to host
themoderate.xyz: could not connect to host
thenextstep.events: could not connect to host
thenichecast.com: could not connect to host
@@ -14859,7 +16042,6 @@ theokonst.tk: did not receive HSTS header
theosblog.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
theosophie-afrique.org: could not connect to host
theoverfly.co: could not connect to host
-thepaffy.de: could not connect to host
thepartywarehouse.co.uk: did not receive HSTS header
thepcweb.tk: could not connect to host
thepiratebay.al: could not connect to host
@@ -14869,7 +16051,8 @@ theposhfudgecompany.co.uk: could not connect to host
theprincegame.com: could not connect to host
theprivacysolution.com: could not connect to host
thequillmagazine.org: could not connect to host
-therise.ca: max-age too low: 300
+therewill.be: could not connect to host
+therise.ca: could not connect to host
thermique.ch: could not connect to host
theroamingnotary.com: did not receive HSTS header
therockawaysny.com: did not receive HSTS header
@@ -14880,11 +16063,15 @@ thesehighsandlows.com: could not connect to host
theserver201.tk: could not connect to host
theshadestore.com: max-age too low: 10368000
thesled.net: could not connect to host
+thesnellvilledentist.com: did not receive HSTS header
thesplit.is: could not connect to host
thestack.xyz: could not connect to host
thestagchorleywood.co.uk: did not receive HSTS header
-thestonegroup.de: could not connect to host
+thesteins.org: could not connect to host
+thestonegroup.de: did not receive HSTS header
thestoritplace.com: max-age too low: 0
+thestral.pro: could not connect to host
+thestralbot.com: could not connect to host
thetapirsmouth.com: could not connect to host
thethirdroad.com: did not receive HSTS header
thetradinghall.com: could not connect to host
@@ -14895,17 +16082,22 @@ thevintagenews.com: did not receive HSTS header
thevoid.one: could not connect to host
thewallset.com: could not connect to host
thewaxhouse.shop: did not receive HSTS header
+thewayofthedojo.com: did not receive HSTS header
thewebfellas.com: did not receive HSTS header
thewego.com: could not connect to host
theweilai.com: could not connect to host
thewhiterabbit.space: could not connect to host
+thewindow.com: could not connect to host
theworkingeye.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-thewp.pro: could not connect to host
+theyachtteam.com: could not connect to host
+thezero.org: could not connect to host
thezonders.com: did not receive HSTS header
thgros.fr: could not connect to host
thibautcharles.net: did not receive HSTS header
thienteakee.com: did not receive HSTS header
thierfreund.de: did not receive HSTS header
+thijsbekke.nl: could not connect to host
+thingies.site: could not connect to host
thinkcash.nl: could not connect to host
thinkcoding.de: could not connect to host
thinkcoding.org: could not connect to host
@@ -14914,6 +16106,7 @@ thinklikeanentrepreneur.com: did not receive HSTS header
thinkswap.com: did not receive HSTS header
thinlyveiledcontempt.com: could not connect to host
thirdpartytrade.com: did not receive HSTS header
+thirdworld.moe: could not connect to host
thirty5.net: did not receive HSTS header
thirtyspot.com: could not connect to host
thisisacompletetest.ga: could not connect to host
@@ -14933,27 +16126,24 @@ thomasnet.fr: could not connect to host
thomasscholz.com: max-age too low: 2592000
thomasschweizer.net: could not connect to host
thomasvt.xyz: max-age too low: 2592000
-thompsonfamily.cloud: could not connect to host
+thomspooren.nl: could not connect to host
thorbis.com: could not connect to host
thorbiswebsitedesign.com: could not connect to host
thorgames.nl: did not receive HSTS header
thorncreek.net: did not receive HSTS header
thot.space: could not connect to host
-thoughtsynth.com: could not connect to host
-thoughtsynth.net: could not connect to host
-thoughtsynth.org: could not connect to host
+thoughtlessleaders.online: could not connect to host
threatcentral.io: could not connect to host
-threebrothersbrewing.com: max-age too low: 2592000
+threebrothersbrewing.com: could not connect to host
threebulls.be: did not receive HSTS header
-threit.de: did not receive HSTS header
thriveapproach.co.uk: did not receive HSTS header
thrivewellnesshub.co.za: did not receive HSTS header
throughthelookingglasslens.co.uk: could not connect to host
thrx.net: did not receive HSTS header
thumbtack.com: did not receive HSTS header
thundercampaign.com: could not connect to host
+thuviensoft.com: could not connect to host
thuviensoft.net: could not connect to host
-thuybich.com: did not receive HSTS header
thyrex.fr: could not connect to host
ti-js.com: could not connect to host
ti.blog.br: did not receive HSTS header
@@ -14971,24 +16161,28 @@ tickreport.com: did not receive HSTS header
ticktock.today: could not connect to host
tictactux.de: could not connect to host
tidmore.us: could not connect to host
-tie-online.org: did not receive HSTS header
+tie-online.org: could not connect to host
tiendafetichista.com: could not connect to host
tiendschuurstraat.nl: could not connect to host
tiensnet.com: could not connect to host
tierarztpraxis-illerwinkel.de: did not receive HSTS header
+tiernanx.com: could not connect to host
+tierraprohibida.net: did not receive HSTS header
tierrarp.com: could not connect to host
tiffanytravels.com: did not receive HSTS header
+tiggi.pw: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
tightlineproductions.com: did not receive HSTS header
tigit.co.nz: could not connect to host
-tiki-god.co.uk: could not connect to host
tikutiku.pl: could not connect to host
tildebot.com: could not connect to host
+tiledailyshop.com: did not receive HSTS header
tilient.eu: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
tilkah.com.au: could not connect to host
tillcraft.com: could not connect to host
timbeilby.com: could not connect to host
timbuktutimber.com: did not receive HSTS header
timcamara.com: could not connect to host
+timchanhxe.com: did not receive HSTS header
timdebruijn.nl: did not receive HSTS header
time-river.xyz: could not connect to host
timeatlas.com: did not receive HSTS header
@@ -15002,24 +16196,26 @@ timeserver2.de: could not connect to host
timeserver3.de: could not connect to host
timestamp.io: did not receive HSTS header
timestamp.uk: could not connect to host
+timetab.org: could not connect to host
timhieubenh.net: could not connect to host
timhjalpen.se: could not connect to host
timklefisch.de: did not receive HSTS header
+timmy.im: could not connect to host
timmy.ws: could not connect to host
-timotrans.de: could not connect to host
-timotrans.eu: could not connect to host
+timotrans.de: did not receive HSTS header
+timotrans.eu: did not receive HSTS header
timowi.de: could not connect to host
timowi.net: could not connect to host
-timroes.de: did not receive HSTS header
+timroes.de: could not connect to host
timschubert.net: max-age too low: 172800
-timtj.ca: could not connect to host
timvandekamp.nl: did not receive HSTS header
-timweb.ca: could not connect to host
timwhite.io: did not receive HSTS header
timwittenberg.com: could not connect to host
-tinchbear.xyz: did not receive HSTS header
+tinchbear.xyz: could not connect to host
tindewen.net: could not connect to host
tink.network: could not connect to host
+tinkerers-trunk.co.za: did not receive HSTS header
+tioat.net: could not connect to host
tipiakers.club: could not connect to host
tipps-fuer-den-haushalt.de: could not connect to host
tippspiel.cc: could not connect to host
@@ -15042,11 +16238,12 @@ tju.me: could not connect to host
tkappertjedemetamorfose.nl: could not connect to host
tkarstens.de: did not receive HSTS header
tkhw.tk: could not connect to host
+tkirch.de: could not connect to host
+tkjg.fi: could not connect to host
+tkn.tokyo: could not connect to host
tkonstantopoulos.tk: could not connect to host
tkts.cl: could not connect to host
-tlach.cz: did not receive HSTS header
tlcdn.net: could not connect to host
-tlcnet.info: could not connect to host
tlo.hosting: could not connect to host
tlo.link: could not connect to host
tlo.network: could not connect to host
@@ -15056,7 +16253,10 @@ tlshost.net: could not connect to host
tm-solutions.eu: could not connect to host
tm.id.au: did not receive HSTS header
tmaward.net: could not connect to host
+tmconnects.com: could not connect to host
+tmdc.ddns.net: could not connect to host
tmhlive.com: could not connect to host
+tmhr.moe: could not connect to host
tmin.cf: could not connect to host
tmitchell.io: could not connect to host
tmprod.com: did not receive HSTS header
@@ -15066,8 +16266,10 @@ tncnanet.com.br: could not connect to host
tno.io: could not connect to host
to2mbn.org: could not connect to host
tobaby.com.br: could not connect to host
+tobacco.gov: could not connect to host
tobaccore.eu: could not connect to host
tobaccore.sk: could not connect to host
+tobedo.net: could not connect to host
tobias-bielefeld.de: did not receive HSTS header
tobiasbergius.se: could not connect to host
tobiasmathes.com: could not connect to host
@@ -15076,11 +16278,11 @@ tobiasofficial.at: could not connect to host
tobiassachs.cf: could not connect to host
tobiassachs.tk: could not connect to host
tobis-webservice.de: did not receive HSTS header
-tobisworld.ch: could not connect to host
tobyx.is: could not connect to host
todesschaf.org: could not connect to host
todo.is: could not connect to host
todobazar.es: could not connect to host
+todocracy.com: could not connect to host
todokete.ga: could not connect to host
todoscomciro.com: did not receive HSTS header
todosrv.com: could not connect to host
@@ -15102,18 +16304,21 @@ tokoindo.top: could not connect to host
tokoone.com: did not receive HSTS header
tokotamz.net: could not connect to host
tokotimbangandigitalmurah.web.id: did not receive HSTS header
-tokototech.com: could not connect to host
tokoyo.biz: could not connect to host
tollmanz.com: did not receive HSTS header
tollsjekk.no: could not connect to host
tolud.com: could not connect to host
+tom-maxwell.com: did not receive HSTS header
tom.run: did not receive HSTS header
tomandshirley.com: could not connect to host
-tomaz.eu: could not connect to host
+tomberek.info: did not receive HSTS header
tomcort.com: could not connect to host
+tomdudfield.com: did not receive HSTS header
tomeara.net: could not connect to host
+tomershemesh.me: could not connect to host
tomevans.io: did not receive HSTS header
tomharling.co.uk: could not connect to host
+tomiler.com: could not connect to host
tomlankhorst.nl: did not receive HSTS header
tomli.me: could not connect to host
tommounsey.com: did not receive HSTS header
@@ -15121,17 +16326,18 @@ tommsy.com: did not receive HSTS header
tommy-bordas.fr: did not receive HSTS header
tommyads.com: could not connect to host
tommyweber.de: did not receive HSTS header
+tomoyaf.com: did not receive HSTS header
tomphill.co.uk: could not connect to host
tomy.icu: could not connect to host
tonburi.jp: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
tongmu.me: could not connect to host
+tonguetechnology.com: could not connect to host
toniharant.de: could not connect to host
-tono.us: could not connect to host
toomanypillows.com: could not connect to host
-top-solar-info.de: could not connect to host
+top-esb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
top-stage.net: could not connect to host
top10mountainbikes.info: could not connect to host
-top9.fr: did not receive HSTS header
+top9.fr: could not connect to host
topanlage.de: could not connect to host
topbargains.com.au: did not receive HSTS header
topbestsellerproduct.com: did not receive HSTS header
@@ -15139,16 +16345,19 @@ topbilan.com: did not receive HSTS header
topdeskdev.net: could not connect to host
topdetoxcleanse.com: could not connect to host
topdevbox.net: could not connect to host
+topesb.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+topkek.ml: could not connect to host
topmarine.se: did not receive HSTS header
topnewstoday.org: could not connect to host
topnotchendings.com: could not connect to host
topnovini.com: did not receive HSTS header
toppik.com.br: could not connect to host
+toppointrea.com: could not connect to host
topservercccam.com: did not receive HSTS header
topshelfguild.com: could not connect to host
-topshoptools.com: could not connect to host
toptenthebest.com: did not receive HSTS header
toptranslation.com: did not receive HSTS header
+topwin.la: could not connect to host
topyx.com: did not receive HSTS header
tor2web.org: could not connect to host
torbay.ga: could not connect to host
@@ -15158,7 +16367,7 @@ toretfaction.net: could not connect to host
torlock.download: could not connect to host
torproject.org.uk: could not connect to host
torproject.ovh: could not connect to host
-torrentdownloads.bid: could not connect to host
+torrentdownloads.bid: max-age too low: 0
torrentgamesps2.info: could not connect to host
torrenttop100.net: could not connect to host
torrentz.website: could not connect to host
@@ -15168,7 +16377,6 @@ torv.rocks: did not receive HSTS header
tosecure.link: could not connect to host
toshnix.com: could not connect to host
toshub.com: could not connect to host
-totaku.ru: could not connect to host
totalle.com.br: could not connect to host
totallynotaserver.com: could not connect to host
totalsystemcare.com: could not connect to host
@@ -15185,30 +16393,29 @@ touchinformatica.com: did not receive HSTS header
touchpointidg.us: could not connect to host
touchscreen-handy.de: did not receive HSTS header
touchstonefms.co.uk: did not receive HSTS header
+touchtable.nl: did not receive HSTS header
+tougetu.com: could not connect to host
+touhou.cc: did not receive HSTS header
touray-enterprise.ch: could not connect to host
-tournaire.fr: did not receive HSTS header
tourpeer.com: did not receive HSTS header
toursandtransfers.it: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
tousproducteurs.fr: did not receive HSTS header
towaway.ru: could not connect to host
-town-farm.surrey.sch.uk: max-age too low: 0
-townofbridgewater.ca: could not connect to host
tox.im: did not receive HSTS header
toxicboot.com: could not connect to host
toxicip.com: could not connect to host
toxme.se: did not receive HSTS header
toymania.de: could not connect to host
toyotamotala.se: could not connect to host
-tpansino.com: could not connect to host
tpbcdn.com: could not connect to host
-tpblist.xyz: could not connect to host
+tpblist.xyz: max-age too low: 0
tpbunblocked.org: could not connect to host
tpe-edu.com: could not connect to host
tpms4u.at: did not receive HSTS header
tppdebate.org: did not receive HSTS header
trabajarenperu.com: did not receive HSTS header
+tracalada.cl: did not receive HSTS header
tracetracker.com: did not receive HSTS header
-tracetracker.no: did not receive HSTS header
tracewind.top: could not connect to host
track.plus: could not connect to host
trackdays4fun.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -15216,6 +16423,8 @@ tracker-gps.ch: could not connect to host
trackmeet.io: did not receive HSTS header
tracktivity.com.au: did not receive HSTS header
trade-smart.ru: could not connect to host
+tradedesk.co.za: could not connect to host
+tradernet.com: could not connect to host
tradietrove.com.au: did not receive HSTS header
trading-analytics.com: could not connect to host
tradingbhavishya.com: did not receive HSTS header
@@ -15225,44 +16434,50 @@ tradingrooms.com: did not receive HSTS header
traditional-knowledge.tk: did not receive HSTS header
traeningsprojekt.dk: did not receive HSTS header
trafficquality.org: could not connect to host
-traffictigers.com: did not receive HSTS header
+traffictigers.com: could not connect to host
traforet.win: could not connect to host
train-track.co.uk: did not receive HSTS header
traindb.nl: did not receive HSTS header
-trainhorns.us: did not receive HSTS header
training4girls.ru: could not connect to host
traininglist.org: could not connect to host
trainingproviderresults.gov: could not connect to host
+trainings-handschuhe-test.de: could not connect to host
trainline.dk: could not connect to host
trainline.io: could not connect to host
trainline.se: could not connect to host
trainut.com: could not connect to host
trakfusion.com: could not connect to host
+trancendances.fr: could not connect to host
tranos.de: did not receive HSTS header
transbike.es: did not receive HSTS header
transcendmotor.sg: could not connect to host
+transcricentro.pt: could not connect to host
transcriptionwave.com: did not receive HSTS header
transdirect.com.au: did not receive HSTS header
transformify.org: did not receive HSTS header
transgendernetwerk.nl: did not receive HSTS header
transl8.eu: did not receive HSTS header
translate.googleapis.com: did not receive HSTS header (error ignored - included regardless)
+translateblender.ru: could not connect to host
transmithe.net: could not connect to host
transportal.sk: did not receive HSTS header
transsexualpantyhose.com: could not connect to host
-tratamentoparacelulite.biz: did not receive HSTS header
-tratamentoparacelulite.net: did not receive HSTS header
+tratamentoparacelulite.biz: could not connect to host
trauertexte.info: could not connect to host
traumhuetten.de: did not receive HSTS header
travality.ru: could not connect to host
+travel-dealz.de: did not receive HSTS header
travel-kuban.ru: did not receive HSTS header
travel1x1.com: did not receive HSTS header
traveling-thailand.info: could not connect to host
travelinsightswriter.com: could not connect to host
travelling.expert: could not connect to host
+travelmyth.ie: did not receive HSTS header
+travelpricecheck.com: max-age too low: 0
travotion.com: could not connect to host
trazosdearte.com: did not receive HSTS header
treasuredinheritanceministry.com: did not receive HSTS header
+treatment.org: could not connect to host
treatprostatewithhifu.com: could not connect to host
treeby.net: could not connect to host
treehousebydesign.com: did not receive HSTS header
@@ -15270,16 +16485,22 @@ treeremovaljohannesburg.co.za: could not connect to host
treino.blog.br: could not connect to host
treker.us: could not connect to host
trell.co.in: did not receive HSTS header
+tremolosoftware.com: did not receive HSTS header
tremoureux.fr: could not connect to host
trendberry.ru: could not connect to host
trendingpulse.com: could not connect to host
trendisland.de: did not receive HSTS header
trendydips.com: could not connect to host
+trentmaydew.com: could not connect to host
trewe.eu: could not connect to host
triadwars.com: did not receive HSTS header
triageo.com.au: could not connect to host
trialmock.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+trianon.xyz: could not connect to host
+tributh.cf: could not connect to host
+tributh.net: could not connect to host
trickedguys.com: could not connect to host
+tricks.clothing: did not receive HSTS header
triddi.com: could not connect to host
tridimage.com: did not receive HSTS header
trigular.de: could not connect to host
@@ -15297,18 +16518,20 @@ trixati.org.ua: did not receive HSTS header
trixies-wish.nz: could not connect to host
trixy.com.br: could not connect to host
trizone.com.au: did not receive HSTS header
-troi.de: did not receive HSTS header
+troisdorf-gestalten.de: did not receive HSTS header
trollme.me: could not connect to host
trollscave.xyz: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+trondelan.no: could not connect to host
tronflix.com: did not receive HSTS header
troo.ly: could not connect to host
trouter.io: could not connect to host
trouver-son-chemin.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
trpg.wiki: could not connect to host
-truckgpsreviews.com: did not receive HSTS header
true.ink: did not receive HSTS header
truebred-labradors.com: did not receive HSTS header
+trueessayhelp.co.uk: did not receive HSTS header
truejob.com: did not receive HSTS header
+truessl.shop: could not connect to host
trumeet.top: did not receive HSTS header
trunkjunk.co: could not connect to host
trush.in: could not connect to host
@@ -15327,6 +16550,7 @@ ts2.se: could not connect to host
ts3-dns.me: could not connect to host
ts3.consulting: could not connect to host
tsaro.io: could not connect to host
+tscqmalawi.info: did not receive HSTS header
tsdom.net: could not connect to host
tsecy.com: could not connect to host
tsgbit.net: could not connect to host
@@ -15340,11 +16564,11 @@ tsumegumi.net: could not connect to host
tsumi.moe: could not connect to host
tsura.org: could not connect to host
tsurezurematome.ga: could not connect to host
-tsurimap.com: could not connect to host
-tsutsumi-kogyo.jp: could not connect to host
+tt.dog: could not connect to host
ttackmedical.com.br: could not connect to host
-ttb.gov: did not receive HSTS header
+ttrade.ga: could not connect to host
tts.co.nz: did not receive HSTS header
+ttspttsp.com: could not connect to host
tty.space: could not connect to host
ttz.im: could not connect to host
tuamoronline.com: could not connect to host
@@ -15360,7 +16584,6 @@ tudorapido.com.br: did not receive HSTS header
tueche.com.ar: did not receive HSTS header
tufilo.com: could not connect to host
tugers.com: did not receive HSTS header
-tuja.hu: could not connect to host
tulenceria.es: could not connect to host
tulsameetingroom.com: could not connect to host
tunca.it: did not receive HSTS header
@@ -15379,7 +16602,7 @@ turtlementors.com: could not connect to host
turtles.ga: could not connect to host
tusb.ml: did not receive HSTS header
tussengelegenwoningverkopen.nl: could not connect to host
-tuthowto.com: did not receive HSTS header
+tuthowto.com: could not connect to host
tutiendaroja.com: did not receive HSTS header
tutiendarosa.com: did not receive HSTS header
tutorio.ga: could not connect to host
@@ -15387,14 +16610,17 @@ tutu.ro: could not connect to host
tuturulianda.com: did not receive HSTS header
tuvalie.com: did not receive HSTS header
tuxhound.org: could not connect to host
+tuxrtfm.com: could not connect to host
tv.search.yahoo.com: could not connect to host
tvc.red: could not connect to host
tverdohleb.com: could not connect to host
tvoru.com.ua: did not receive HSTS header
-tvtubeflix.com: could not connect to host
+tvs-virtual.cz: did not receive HSTS header
+tvtubeflix.com: did not receive HSTS header
tvz-materijali.com: could not connect to host
tw2-tools.ga: could not connect to host
twarog.cc: could not connect to host
+tweakersbadge.nl: could not connect to host
twee-onder-een-kap-woning-in-alphen-aan-den-rijn-kopen.nl: could not connect to host
twee-onder-een-kap-woning-in-brielle-kopen.nl: could not connect to host
twee-onder-een-kap-woning-in-de-friese-meren-kopen.nl: could not connect to host
@@ -15413,6 +16639,7 @@ twelve.rocks: could not connect to host
twelve.today: could not connect to host
twelverocks.com: could not connect to host
twem.ddns.net: could not connect to host
+twilightcookies.ca: could not connect to host
twillionmas.com: could not connect to host
twinkieman.com: could not connect to host
twinkseason.ca: could not connect to host
@@ -15423,9 +16650,12 @@ twinkseason.org: could not connect to host
twinkseason.xyz: could not connect to host
twiri.net: could not connect to host
twist.party: could not connect to host
-twisted-brains.org: could not connect to host
+twistapp.com: did not receive HSTS header
+twistdevelopment.co.uk: could not connect to host
+twittelzie.nl: could not connect to host
twitter.ax: could not connect to host
twogo.com: did not receive HSTS header
+twojfaktum.pl: could not connect to host
twolanedesign.com: did not receive HSTS header
twolinepassbrewing.com: could not connect to host
twolivelife.com: could not connect to host
@@ -15451,7 +16681,9 @@ tylian.net: max-age too low: 0
type1joe.com: could not connect to host
type1joe.net: could not connect to host
type1joe.org: could not connect to host
+typehub.net: could not connect to host
typeofweb.com: did not receive HSTS header
+typeonejoe.com: could not connect to host
typeonejoe.net: could not connect to host
typeonejoe.org: could not connect to host
typingrevolution.com: did not receive HSTS header
@@ -15459,12 +16691,11 @@ tyreis.com: did not receive HSTS header
tyrelius.com: could not connect to host
tyroproducts.eu: did not receive HSTS header
tyskland.guide: could not connect to host
-tysye.ca: could not connect to host
-tyuo-keibi.co.jp: did not receive HSTS header
tz56789.com: did not receive HSTS header
tzappa.net: could not connect to host
tzwe.com: could not connect to host
u-master.net: did not receive HSTS header
+u-tokyo.club: could not connect to host
u175.com: could not connect to host
uadp.pw: could not connect to host
uahs.org.uk: did not receive HSTS header
@@ -15472,32 +16703,36 @@ ubalert.com: could not connect to host
uber.com.au: did not receive HSTS header
ubercalculator.com: did not receive HSTS header
uberfunction.com: did not receive HSTS header
+ubertt.org: could not connect to host
ubicloud.de: could not connect to host
ubicv.com: could not connect to host
ublox.com: did not receive HSTS header
ubtce.com: could not connect to host
ubuntuhot.com: did not receive HSTS header
uc.ac.id: did not receive HSTS header
+uchiha.ml: could not connect to host
uclanmasterplan.co.uk: did not receive HSTS header
+uclf.de: could not connect to host
udbhav.me: could not connect to host
-ueba1085.jp: could not connect to host
uefeng.com: did not receive HSTS header
uega.net: did not receive HSTS header
uerdingen.info: did not receive HSTS header
uesociedadlimitada.com: could not connect to host
ueu.me: could not connect to host
+uevan.com: did not receive HSTS header
ufgaming.com: did not receive HSTS header
uflixit.com: did not receive HSTS header
-ufo.moe: did not receive HSTS header
+ufo.moe: could not connect to host
ufotable.uk: could not connect to host
ugcdn.com: could not connect to host
+uggedal.com: could not connect to host
ugisgutless.com: could not connect to host
ugo.ninja: could not connect to host
ugosadventures.com: could not connect to host
uhasseltctf.ga: could not connect to host
-uhlhosting.ch: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-uhm.io: did not receive HSTS header
-uhuru-market.com: could not connect to host
+uhm.io: could not connect to host
+uhuru-market.com: did not receive HSTS header
+uicchy.com: could not connect to host
uitslagensoftware.nl: did not receive HSTS header
ukas.com: could not connect to host
ukdropshipment.co.uk: did not receive HSTS header
@@ -15507,7 +16742,7 @@ ukkeyholdingcompany.co.uk: could not connect to host
ukrgadget.com: could not connect to host
ulabox.cat: did not receive HSTS header
ulabox.es: did not receive HSTS header
-ulalau.com: could not connect to host
+ulalau.com: did not receive HSTS header
ullamodaintima.com.br: could not connect to host
ulmo.dk: could not connect to host
ulti.gq: did not receive HSTS header
@@ -15515,6 +16750,8 @@ ultimate-garcinia-plus.com: could not connect to host
ultimate-glow-skin.com: could not connect to host
ultimate-memoryplus.com: could not connect to host
ultimate-neuroplus.com: could not connect to host
+ultramax.biz: could not connect to host
+ultraporn.biz: could not connect to host
ultrasteam.net: could not connect to host
ultros.io: did not receive HSTS header
umaimise.info: did not receive HSTS header
@@ -15523,7 +16760,8 @@ umbriel.fr: did not receive HSTS header
umgardi.ca: could not connect to host
umidev.com: could not connect to host
umie.cc: did not receive HSTS header
-ump45.moe: did not receive HSTS header
+ump45.moe: could not connect to host
+umsolugar.com.br: did not receive HSTS header
unapolegetic.co: did not receive HSTS header
unart.info: could not connect to host
unbanthe.net: could not connect to host
@@ -15535,46 +16773,52 @@ unblocked.faith: could not connect to host
unblocked.host: could not connect to host
unblocked.party: could not connect to host
unblocked.sh: could not connect to host
-unblocked.st: did not receive HSTS header
+unblocked.st: could not connect to host
unblocked.today: could not connect to host
unblocked.vc: could not connect to host
unblocked.win: could not connect to host
unblocked.works: could not connect to host
unblocked.world: could not connect to host
unblockedall.site: could not connect to host
-unblockedbay.info: could not connect to host
-unblockerproxy.site: could not connect to host
-unblockerproxy.top: could not connect to host
+unblockedbay.info: max-age too low: 0
+unblockerproxy.site: did not receive HSTS header
+unblockerproxy.top: did not receive HSTS header
unblockmy.party: could not connect to host
unblockmy.tech: could not connect to host
unblockmy.xyz: could not connect to host
-unblockmyproxy.site: could not connect to host
+unblockmyproxy.site: did not receive HSTS header
unblockthe.site: could not connect to host
unblockthe.top: could not connect to host
unccdesign.club: could not connect to host
unclegen.xyz: could not connect to host
-undeadbrains.de: could not connect to host
+undecidable.de: could not connect to host
under30stravelinsurance.com.au: did not receive HSTS header
undercovercondoms.com: could not connect to host
underkin.com: could not connect to host
+underwearoffer.com: did not receive HSTS header
unefuite.ch: could not connect to host
unfiltered.nyc: could not connect to host
+unfuddle.cn: could not connect to host
ungern.guide: could not connect to host
unhu.fr: could not connect to host
uni-games.com: could not connect to host
uni2share.com: could not connect to host
+unicefcards.at: did not receive HSTS header
unicefkaarten.be: did not receive HSTS header
unicefkort.dk: did not receive HSTS header
unicooo.com: could not connect to host
unicorn.li: could not connect to host
unicorncloud.org: could not connect to host
unifiednetwork.me: could not connect to host
+uniformebateriasheliar.com.br: could not connect to host
uniformecomgas.com.br: could not connect to host
uniformehope.com.br: did not receive HSTS header
uniformehumboldt.com.br: did not receive HSTS header
uniformespousoalegre.com.br: did not receive HSTS header
unikitty-on-tour.com: could not connect to host
unikrn.com: could not connect to host
+uninet.cf: could not connect to host
+uniojeda.ml: did not receive HSTS header
unionstationapp.com: could not connect to host
unirenter.ru: did not receive HSTS header
unison.com: did not receive HSTS header
@@ -15586,19 +16830,26 @@ university4industry.com: did not receive HSTS header
universogay.com: could not connect to host
univstore.win: could not connect to host
univz.com: could not connect to host
+unix.se: did not receive HSTS header
unixtime.pro: could not connect to host
unknownbreakup.com: max-age too low: 2592000
unknownphenomena.net: could not connect to host
+unlogis.ch: could not connect to host
unmanaged.space: could not connect to host
+uno.fi: did not receive HSTS header
unplugg3r.dk: could not connect to host
-unpossible.xyz: could not connect to host
unravel.ie: could not connect to host
+unripple.com: could not connect to host
+unruh.fr: did not receive HSTS header
+uns.vn: could not connect to host
unschoolrules.com: did not receive HSTS header
unstockd.org: could not connect to host
unsupervised.ca: did not receive HSTS header
unsystem.net: could not connect to host
unterkunft.guru: did not receive HSTS header
+unterschicht.tv: could not connect to host
untoldstory.eu: did not receive HSTS header
+unveiledgnosis.com: did not receive HSTS header
unwiredbrain.com: could not connect to host
unwomen.is: did not receive HSTS header
unyq.me: did not receive HSTS header
@@ -15607,6 +16858,7 @@ uow.ninja: could not connect to host
up1.ca: could not connect to host
upaknship.com: did not receive HSTS header
upandclear.org: max-age too low: 0
+upay.ru: could not connect to host
upboard.jp: could not connect to host
upldr.pw: could not connect to host
uploadbro.com: could not connect to host
@@ -15616,12 +16868,14 @@ uprotect.it: could not connect to host
upstats.eu: could not connect to host
uptakedigital.com.au: max-age too low: 2592000
uptic.net: did not receive HSTS header
+uptogood.org: could not connect to host
upupming.site: did not receive HSTS header
ur-lauber.de: did not receive HSTS header
urban-garden.lt: could not connect to host
urban-garden.lv: could not connect to host
urbanmic.com: could not connect to host
urbanstylestaging.com: did not receive HSTS header
+urbansurvival.com: did not receive HSTS header
urbpic.com: could not connect to host
urcentral.org: could not connect to host
url.cab: could not connect to host
@@ -15643,18 +16897,24 @@ uscurrency.gov: did not receive HSTS header
use.ci: could not connect to host
used-in.jp: could not connect to host
usedesk.ru: did not receive HSTS header
+usedoor.jp: did not receive HSTS header
useevlo.com.br: could not connect to host
user-new.com: did not receive HSTS header
-usercare.com: did not receive HSTS header
+usercare.com: could not connect to host
useresponse.com: did not receive HSTS header
-userify.com: max-age too low: 0
+userify.com: did not receive HSTS header
uslab.io: could not connect to host
+usleep.net: could not connect to host
usparklodging.com: did not receive HSTS header
usportsgo.com: could not connect to host
usr.nz: did not receive HSTS header
+usuluddin.ga: did not receive HSTS header
utdscanner.com: did not receive HSTS header
+uteam.it: could not connect to host
+utilio.nl: max-age too low: 2592000
utilitronium-shockwave.com: could not connect to host
-utleieplassen.no: could not connect to host
+utitreatment.com: did not receive HSTS header
+utleieplassen.no: did not receive HSTS header
utopiagalaxy.space: could not connect to host
utopialgb.org.uk: could not connect to host
utopian-surgery.com: could not connect to host
@@ -15663,9 +16923,11 @@ utopianhomespa.com: did not receive HSTS header
utopianrealms.org: did not receive HSTS header
utopians.dk: did not receive HSTS header
uttnetgroup.fr: could not connect to host
+utube.tw: could not connect to host
utumno.ch: could not connect to host
utvbloggen.se: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
-uvarov.pw: did not receive HSTS header
+uvarov.pw: could not connect to host
+uvolejniku.cz: did not receive HSTS header
uwesander.de: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
uwfreelanceopticien.nl: could not connect to host
uwimonacs.org.jm: did not receive HSTS header
@@ -15683,12 +16945,12 @@ v12.co.uk: did not receive HSTS header
v1sit0r.ru: could not connect to host
v2.pw: did not receive HSTS header
v2ex.us: could not connect to host
-v4s.ro: did not receive HSTS header
v4veedu.com: could not connect to host
v5wz.com: did not receive HSTS header
-v5xp.com: did not receive HSTS header
+v5xp.com: could not connect to host
v7.cl: could not connect to host
v789xl.com: did not receive HSTS header
+va.gov: did not receive HSTS header
vaaddress.co: could not connect to host
vaalmarketplace.co.za: could not connect to host
vacationality.com: could not connect to host
@@ -15702,16 +16964,17 @@ vaddder.com: could not connect to host
vadennissanofhinesvilleparts.com: could not connect to host
vadik.me: could not connect to host
vadodesign.nl: did not receive HSTS header
-vagaerg.com: could not connect to host
-vagaerg.net: could not connect to host
+vagaerg.com: did not receive HSTS header
+vagaerg.net: did not receive HSTS header
vaibhavchatarkar.com: could not connect to host
val-sec.com: could not connect to host
valaeris.de: did not receive HSTS header
+valbonne-consulting.com: did not receive HSTS header
valecnatechnika.cz: could not connect to host
valenhub.com: could not connect to host
valenhub.es: could not connect to host
valenscaelum.com: could not connect to host
-valesdev.com: did not receive HSTS header
+valesdev.com: max-age too low: 0
valethound.com: could not connect to host
valhallacostarica.com: could not connect to host
valhallamovement.com: did not receive HSTS header
@@ -15732,13 +16995,16 @@ vanessabalibridal.com: could not connect to host
vanestack.com: could not connect to host
vanetv.com: could not connect to host
vangeluwedeberlaere.be: did not receive HSTS header
-vanitas.xyz: did not receive HSTS header
+vanhaos.com: could not connect to host
+vanitas.xyz: could not connect to host
vanitynailworkz.com: could not connect to host
vanlaanen.com: did not receive HSTS header
vansieleghem.com: could not connect to host
+vantaio.com: did not receive HSTS header
vapecraftinc.com: did not receive HSTS header
vapemania.eu: could not connect to host
vapeshopsupply.com: max-age too low: 7889238
+vaporpunk.space: did not receive HSTS header
varela-electricite.fr: could not connect to host
variablyconstant.com: could not connect to host
varta.io: could not connect to host
@@ -15747,8 +17013,12 @@ vasanth.org: could not connect to host
vase-eroticke-povidky.cz: could not connect to host
vastgoedcultuurfonds.nl: did not receive HSTS header
vastkustenrunt.se: did not receive HSTS header
+vatsalyagoel.com: did not receive HSTS header
+vatsim-uk.co.uk: did not receive HSTS header
+vatsim.uk: did not receive HSTS header
vavai.net: did not receive HSTS header
vavouchers.com: could not connect to host
+vawltstorage.com: did not receive HSTS header
vayaport.com: could not connect to host
vbest.net: could not connect to host
vbhelp.org: did not receive HSTS header
@@ -15756,28 +17026,30 @@ vbulletin-russia.com: could not connect to host
vbulletinrussia.com: could not connect to host
vcdn.xyz: could not connect to host
vcdove.com: could not connect to host
+vcelin-na-doliku.cz: could not connect to host
vconcept.ch: could not connect to host
vconcept.me: could not connect to host
vcr.re: could not connect to host
+vdanker.net: could not connect to host
vdhco.be: did not receive HSTS header
vdownloader.com: could not connect to host
vdrpro.com: could not connect to host
veblen.com: did not receive HSTS header
vechkasov.ru: could not connect to host
vectro.me: could not connect to host
-vedatkamer.com: did not receive HSTS header
+vedatkamer.com: could not connect to host
vega-motor.com.ua: did not receive HSTS header
vega-rumia.com.pl: max-age too low: 2592000
vega.dyndns.info: could not connect to host
-vegalayer.com: did not receive HSTS header
-vegalengd.com: did not receive HSTS header
+vegalayer.com: could not connect to host
+vegane-proteine.com: could not connect to host
vegangaymer.blog: could not connect to host
veganosonline.com: could not connect to host
vegasdocs.com: did not receive HSTS header
veggiefasting.com: could not connect to host
veggiesbourg.fr: did not receive HSTS header
vegis.ro: did not receive HSTS header
-veglog.com: did not receive HSTS header
+veglog.com: could not connect to host
vehent.org: did not receive HSTS header
vehicleuplift.co.uk: did not receive HSTS header
vekenz.com: could not connect to host
@@ -15802,10 +17074,10 @@ veraandsteve.date: could not connect to host
verdeandco.co.uk: could not connect to host
verifiedinvesting.com: could not connect to host
verifikatorindonesia.com: could not connect to host
-veriny.tf: could not connect to host
veriomed.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
veristor.com: did not receive HSTS header
verliefde-jongens.nl: could not connect to host
+vermogeninkaart.nl: could not connect to host
vermontcareergateway.org: could not connect to host
vernonfishandgame.ca: did not receive HSTS header
versbeton.nl: max-age too low: 864000
@@ -15826,15 +17098,13 @@ vfree.org: could not connect to host
vgatest.nl: could not connect to host
vglimg.com: could not connect to host
vhost.co.id: could not connect to host
-via-shire-krug.ru: could not connect to host
viabemestar.com.br: could not connect to host
viadeux.com: did not receive HSTS header
vialibido.com.br: could not connect to host
-viasinc.com: did not receive HSTS header
+viato.fr: could not connect to host
vibrashop.com.br: did not receive HSTS header
vicenage.com: could not connect to host
viceversa.xyz: did not receive HSTS header
-vicianovi.cz: could not connect to host
viciousviscosity.xyz: could not connect to host
victorenxovais.com.br: could not connect to host
victoriapemberton.com: did not receive HSTS header
@@ -15844,34 +17114,39 @@ vidb.me: could not connect to host
vidbuchanan.co.uk: did not receive HSTS header
viddiaz.com: did not receive HSTS header
videnskabsklubben.dk: did not receive HSTS header
+videoload.co: could not connect to host
videomuz.com: could not connect to host
videorullen.se: could not connect to host
videosxgays.com: could not connect to host
videotogel.net: could not connect to host
videoueberwachung-set.de: did not receive HSTS header
vider.ga: could not connect to host
-vidid.net: could not connect to host
+vidid.net: did not receive HSTS header
vidiproject.com: did not receive HSTS header
viditut.com: could not connect to host
vidkovaomara.si: could not connect to host
vidlyoficial.com: could not connect to host
vidz.ga: could not connect to host
vieaw.com: could not connect to host
-viennan.net: could not connect to host
+viennan.net: did not receive HSTS header
vietnam-lifer.com: could not connect to host
vietnamchevrolet.net: did not receive HSTS header
vietnamphotographytours.com: did not receive HSTS header
+vieux.pro: could not connect to host
viewsea.com: max-age too low: 0
vigilo.cf: could not connect to host
vigilo.ga: could not connect to host
+vigo-krankenversicherung.de: could not connect to host
viikko.eu: could not connect to host
vijos.org: did not receive HSTS header
vikasbabyworld.de: could not connect to host
-viktor-machnik.de: did not receive HSTS header
+viktor-machnik.de: could not connect to host
viktorsvantesson.net: did not receive HSTS header
+vilabiamodas.com.br: could not connect to host
viladochurrasco.com.br: could not connect to host
vilaydin.com: did not receive HSTS header
vilight.com.br: could not connect to host
+villa-anna-cilento.de: could not connect to host
villa-bellarte.de: did not receive HSTS header
villacarmela.com.br: did not receive HSTS header
villainsclothing.com.au: could not connect to host
@@ -15891,10 +17166,12 @@ vinbet666.com: could not connect to host
vinbet888.com: could not connect to host
vincentkooijman.at: did not receive HSTS header
vincentkooijman.nl: did not receive HSTS header
+vineright.com: did not receive HSTS header
vinesauce.info: could not connect to host
vinetalk.net: could not connect to host
vinicius.sl: could not connect to host
viniferawineclub.com: did not receive HSTS header
+vinihk.com: did not receive HSTS header
vinogradovka.com: did not receive HSTS header
vio.no: did not receive HSTS header
violenceinterrupted.org: did not receive HSTS header
@@ -15902,22 +17179,28 @@ violet-letter.delivery: could not connect to host
violetraven.co.uk: could not connect to host
viosey.com: could not connect to host
vioye.com: could not connect to host
+vip-9649.com: could not connect to host
+vip9649.com: could not connect to host
viperdns.com: could not connect to host
+vipesball.cc: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+vipesball.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+vipesball.me: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
vipesball.net: could not connect to host
viphospitality.se: could not connect to host
viplentes.com.br: did not receive HSTS header
vipmusic.ga: could not connect to host
vipnettikasinoklubi.com: did not receive HSTS header
viral8.jp: could not connect to host
+viralboombox.xyz: could not connect to host
virginiacrimeanalysisnetwork.org: did not receive HSTS header
viris.si: max-age too low: 536000
virtualhealth.com: did not receive HSTS header
virtualstrongbox.ca: did not receive HSTS header
-virtusaero.com: could not connect to host
visa-shinsei.com: did not receive HSTS header
visanhigia.com: could not connect to host
viserproject.com: did not receive HSTS header
vision-painting.com: did not receive HSTS header
+visiondigitalsog.com: could not connect to host
visiongamestudios.com: could not connect to host
visionthroughknowledge.com: could not connect to host
visiontree-beta.eu: could not connect to host
@@ -15930,6 +17213,7 @@ vissersgrootboek.nl: did not receive HSTS header
vistarait.com: could not connect to host
visualvotes.co.uk: could not connect to host
vitagenda.nl: could not connect to host
+vital-tel.co.uk: did not receive HSTS header
vitalamin.at: could not connect to host
vitalamin.ch: could not connect to host
vitalita.cz: did not receive HSTS header
@@ -15940,20 +17224,20 @@ vitapingu.de: could not connect to host
vitta.me: did not receive HSTS header
vitzro.kr: could not connect to host
viva-french.com: did not receive HSTS header
-vivanosports.com.br: did not receive HSTS header
vivasports.com.br: could not connect to host
+viveconsalud.club: could not connect to host
vivocloud.com: could not connect to host
vivoregularizafacil.com.br: did not receive HSTS header
vivoseg.com: could not connect to host
-vivremoinscher.fr: did not receive HSTS header
+vivremoinscher.fr: could not connect to host
viza.io: could not connect to host
vizeat.com: did not receive HSTS header
-vkino.com: could not connect to host
vkulagin.ru: could not connect to host
vladimiroff.org: did not receive HSTS header
vldkn.net: could not connect to host
vleij.family: could not connect to host
vlogge.com: did not receive HSTS header
+vlsk.eu: could not connect to host
vlzbazar.ru: could not connect to host
vmrdev.com: could not connect to host
vmstan.com: did not receive HSTS header
@@ -15961,10 +17245,17 @@ vndb.org: could not connect to host
vocab.guru: could not connect to host
vocalsynth.space: could not connect to host
voceinveste.com: did not receive HSTS header
+vodpay.com: could not connect to host
+vodpay.net: could not connect to host
+vodpay.org: could not connect to host
vogt.tech: could not connect to host
voicesuk.co.uk: did not receive HSTS header
-void-it.nl: could not connect to host
+void-it.nl: did not receive HSTS header
+voidark.com: could not connect to host
voidi.ca: could not connect to host
+voidpay.net: could not connect to host
+voidpay.org: could not connect to host
+voids.org: could not connect to host
voidserv.net: could not connect to host
voidshift.com: could not connect to host
voilo.club: could not connect to host
@@ -15972,16 +17263,22 @@ voilodaisuki.club: could not connect to host
voipkb.com: did not receive HSTS header
voiro.club: could not connect to host
voirodaisuki.club: could not connect to host
+vokalsystem.com: did not receive HSTS header
volatimer.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
volbyzive.cz: did not receive HSTS header
volcain.io: could not connect to host
volcrado.com: could not connect to host
volkden.com: could not connect to host
+volkerwesselswave.nl: did not receive HSTS header
volkswurst.de: did not receive HSTS header
voltimax.com: did not receive HSTS header
voltotc.com: did not receive HSTS header
voluptueuse.com: did not receive HSTS header
volvipress.gr: did not receive HSTS header
+von-lien-aluprofile.de: did not receive HSTS header
+von-lien-dachrinnen.de: did not receive HSTS header
+von-lien-lichtplatten.de: did not receive HSTS header
+von-lien-profilbleche.de: did not receive HSTS header
vonavy-cukor.sk: could not connect to host
vonavycukor.sk: could not connect to host
vonedelmann.de: did not receive HSTS header
@@ -15991,11 +17288,12 @@ vooreenveiligthuis.nl: did not receive HSTS header
voorjou.com: did not receive HSTS header
vorangerie.com: could not connect to host
vorderklier.de: could not connect to host
-vorm2.com: did not receive HSTS header
+vorkbaard.nl: did not receive HSTS header
vortexhobbies.com: did not receive HSTS header
vosjesweb.nl: could not connect to host
votercircle.com: did not receive HSTS header
voterstartingpoint.uk: did not receive HSTS header
+votewa.gov: could not connect to host
votresiteweb.ch: could not connect to host
vow.vn: could not connect to host
vowsy.club: did not receive HSTS header
@@ -16009,6 +17307,8 @@ vpnhot.com: could not connect to host
vpnzoom.com: did not receive HSTS header
vps-szerver-berles.hu: could not connect to host
vpsmojo.com: could not connect to host
+vqporn.com: could not connect to host
+vranjske.co.rs: could not connect to host
vratny.space: could not connect to host
vriendenvoordeel.com: did not receive HSTS header
vrijstaandhuis-in-alphen-aan-den-rijn-kopen.nl: could not connect to host
@@ -16026,43 +17326,55 @@ vrijstaandhuisverkopen.nl: could not connect to host
vrlaid.com: could not connect to host
vrobert.fr: could not connect to host
vrsgames.com.mx: did not receive HSTS header
-vrtak-cz.net: could not connect to host
+vrtouring.org: could not connect to host
vrzl.pro: could not connect to host
vsamsonov.com: could not connect to host
vsc-don-stocksport.de: did not receive HSTS header
-vsestiralnie.com: did not receive HSTS header
+vtuber-schedule.info: could not connect to host
vucdn.com: could not connect to host
-vulndetect.com: did not receive HSTS header
vulnerabilities.io: could not connect to host
vuosaarenmontessoritalo.fi: did not receive HSTS header
vvl.me: did not receive HSTS header
+vvzero.cf: could not connect to host
+vw-touranclub.cz: could not connect to host
vwoforangeparts.com: could not connect to host
vwt-event.nl: could not connect to host
vxapps.com: could not connect to host
vxml.club: could not connect to host
+vxz.me: could not connect to host
vykup-car.ru: could not connect to host
vynedmusic.com: could not connect to host
vyshivanochka.in.ua: could not connect to host
vysvetluju.cz: could not connect to host
vyvybean.cf: could not connect to host
vyvygen.com: did not receive HSTS header
+vyzner.cz: could not connect to host
vzk.io: could not connect to host
w10club.com: could not connect to host
w2gshop.com.br: could not connect to host
-w4.no: did not receive HSTS header
w4a.fr: could not connect to host
w4b.in: could not connect to host
w4xzr.top: could not connect to host
w4xzr.xyz: could not connect to host
w9rld.com: did not receive HSTS header
wabifoggynuts.com: could not connect to host
+wachter.biz: could not connect to host
wachtwoordencheck.nl: could not connect to host
+wadvisor.com: could not connect to host
waelti.xxx: could not connect to host
+wafa4hw.com: could not connect to host
wafairhaven.com.au: did not receive HSTS header
wafni.com: could not connect to host
wai-in.com: could not connect to host
+wai-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+wail.net: could not connect to host
wait.moe: could not connect to host
waixingrenfuli7.vip: could not connect to host
+waka-mono.com: could not connect to host
+waka168.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+waka168.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+waka88.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+waka88.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
wakapp.de: could not connect to host
wakened.net: did not receive HSTS header
waldkinder-ilmenau.de: did not receive HSTS header
@@ -16083,27 +17395,30 @@ wanda79.com: could not connect to host
wanda96.com: could not connect to host
wanda97.com: could not connect to host
wanda98.com: could not connect to host
+wandercue.com: did not receive HSTS header
wangjiatun.com.tw: could not connect to host
wangkezun.com: could not connect to host
wangqiliang.xn--fiqs8s: could not connect to host
wangql.cn: could not connect to host
wanquanojbk.com: did not receive HSTS header
wantshow.com.br: did not receive HSTS header
+wanvi.net: did not receive HSTS header
wanybug.cn: could not connect to host
wapgu.cc: could not connect to host
wapjt.cn: could not connect to host
-wapking.co: could not connect to host
wapking.live: could not connect to host
wapt.fr: did not receive HSTS header
warandpeace.xyz: could not connect to host
warcraftjournal.org: could not connect to host
wardsegers.be: did not receive HSTS header
-warehost.de: did not receive HSTS header
+warehost.de: could not connect to host
warekon.com: could not connect to host
warekon.dk: could not connect to host
+warenmedia.com: could not connect to host
warezaddict.com: could not connect to host
warhistoryonline.com: did not receive HSTS header
warlions.info: could not connect to host
+warmestwishes.ca: could not connect to host
warnings.xyz: could not connect to host
warped.com: did not receive HSTS header
warren.sh: could not connect to host
@@ -16112,7 +17427,6 @@ warsentech.com: did not receive HSTS header
warumsuchen.at: did not receive HSTS header
wasatchconstables.com: did not receive HSTS header
wasatchcrest.com: did not receive HSTS header
-wasfuereintheater.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
wasserburg.dk: did not receive HSTS header
wassim.is: did not receive HSTS header
watashi.bid: could not connect to host
@@ -16126,6 +17440,7 @@ watsonhall.uk: could not connect to host
wattechweb.com: did not receive HSTS header
wavefloatrooms.com: did not receive HSTS header
wavefrontsystemstech.com: could not connect to host
+wavesoftime.com: could not connect to host
waxlrs.com: could not connect to host
waylaydesign.com: did not receive HSTS header
waylee.net: did not receive HSTS header
@@ -16133,13 +17448,18 @@ wbit.co.il: did not receive HSTS header
wbut.ml: could not connect to host
wdesk.com: did not receive HSTS header
wdmg.com.ua: max-age too low: 604800
-wdt.io: did not receive HSTS header
+wdrl.info: did not receive HSTS header
+wdt.io: could not connect to host
we.serveftp.net: could not connect to host
+wealthcentral.com.au: did not receive HSTS header
wealthformyhealth.com: did not receive HSTS header
wear2work.nl: could not connect to host
+wearedisneyland.com: did not receive HSTS header
weareincognito.org: could not connect to host
wearewithyou.org: could not connect to host
+weather-and-climate.com: did not receive HSTS header
weaverhairextensions.nl: could not connect to host
+web-adminy.co.uk: could not connect to host
web-advisor.co.uk: could not connect to host
web-demarche.com: could not connect to host
web-industry.fr: could not connect to host
@@ -16148,11 +17468,12 @@ web-vision.de: did not receive HSTS header
web4all.fr: did not receive HSTS header
web4pro.fr: could not connect to host
webandwords.com.au: could not connect to host
-webanker.sh: did not receive HSTS header
+webanker.sh: could not connect to host
webapky.cz: could not connect to host
webapps.directory: could not connect to host
webart-factory.de: could not connect to host
webassadors.com: could not connect to host
+webauthority.co.uk: did not receive HSTS header
webbuzz.com.au: did not receive HSTS header
webbx.se: did not receive HSTS header
webchat.domains: did not receive HSTS header
@@ -16161,20 +17482,20 @@ webdesign-kronberg.de: did not receive HSTS header
webdesignssussex.co.uk: could not connect to host
webdev-quiz.de: did not receive HSTS header
webdev.mobi: could not connect to host
-webdevxp.com: could not connect to host
webdosh.com: did not receive HSTS header
webeconomia.it: did not receive HSTS header
webelement.sk: did not receive HSTS header
weberjulia.com: could not connect to host
webfronten.dk: did not receive HSTS header
-webgaff.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+webgaff.com: could not connect to host
+webgap.me: did not receive HSTS header
webgreat.de: max-age too low: 3600
webhackspro.com: could not connect to host
webhelyesarcu.hu: did not receive HSTS header
webhosting4.net: did not receive HSTS header
webhostingpros.ml: could not connect to host
-webies.ro: did not receive HSTS header
webless.com: could not connect to host
+weblogic.pl: did not receive HSTS header
webm.to: could not connect to host
webmail.mayfirst.org: did not receive HSTS header
webmaniabr.com: did not receive HSTS header
@@ -16194,57 +17515,58 @@ webnosql.com: could not connect to host
webperformance.ru: could not connect to host
webproshosting.tk: could not connect to host
webpublica.pt: could not connect to host
-webqueens.com: could not connect to host
-webreslist.com: could not connect to host
+webreslist.com: did not receive HSTS header
websandbox.uk: could not connect to host
websectools.com: could not connect to host
webseo.de: did not receive HSTS header
websiteadvice.com.au: did not receive HSTS header
websitedesign.bg: did not receive HSTS header
websitesabq.com: did not receive HSTS header
-websmartmedia.co.uk: did not receive HSTS header
webspotter.nl: could not connect to host
webstationservice.fr: could not connect to host
-webstellung.com: could not connect to host
+webstellung.com: did not receive HSTS header
webstory.xyz: could not connect to host
-webswitch.io: could not connect to host
+webswitch.io: did not receive HSTS header
webtar.info: could not connect to host
webtech.com.br: could not connect to host
webtechgadgetry.com: could not connect to host
webtek.nu: could not connect to host
-webthings.com.br: did not receive HSTS header
+webthings.com.br: could not connect to host
webtiles.co.uk: could not connect to host
webtobesocial.de: could not connect to host
webukhost.com: could not connect to host
webuni.hu: did not receive HSTS header
webveloper.com: did not receive HSTS header
-webwork.pw: could not connect to host
+webwolf.co.za: could not connect to host
+webwork.pw: did not receive HSTS header
webypass.xyz: could not connect to host
-webz.one: could not connect to host
webzanem.com: could not connect to host
wecanfindit.co.za: could not connect to host
wecanvisit.com: could not connect to host
wedding-m.jp: did not receive HSTS header
+weddingalbumsdesign.com: did not receive HSTS header
weddingenvelopes.co.uk: did not receive HSTS header
weddingibiza.nl: could not connect to host
wedotrains.club: did not receive HSTS header
weebsr.us: could not connect to host
weed.ren: could not connect to host
-weedlandia.org: did not receive HSTS header
+weedcircles.com: did not receive HSTS header
+weedlandia.org: could not connect to host
weekly.fyi: could not connect to host
-weerstationgiethoorn.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
wegenaer.nl: could not connect to host
+wegner.no: could not connect to host
weicn.org: did not receive HSTS header
-weightreviews.com: did not receive HSTS header
+weightreviews.com: could not connect to host
weiji.ga: could not connect to host
weiler.xyz: could not connect to host
+weimaraner.com.br: could not connect to host
weinhandel-preissler.de: could not connect to host
weirdserver.com: could not connect to host
weiyuz.com: max-age too low: 6585555
weizenke.im: could not connect to host
wejumall.com: could not connect to host
wekibe.de: could not connect to host
-welby.cat: did not receive HSTS header
+welby.cat: could not connect to host
welches-kinderfahrrad.de: could not connect to host
welkers.org: could not connect to host
wellastore.ru: could not connect to host
@@ -16253,7 +17575,7 @@ welldrake.com: could not connect to host
wellmarts.com: did not receive HSTS header
wellness.so: could not connect to host
wellopp.com: did not receive HSTS header
-wellproducedwines.com: did not receive HSTS header
+wellproducedwines.com: could not connect to host
wellsplasticsurgery.com: did not receive HSTS header
wellspringcamps.com: did not receive HSTS header
welovejobs.com: did not receive HSTS header
@@ -16263,8 +17585,11 @@ welpy.com: could not connect to host
weltentreff.com: could not connect to host
weltmeisterschaft.net: could not connect to host
weme.eu: could not connect to host
+wen-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+wen-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+wenchieh.com: could not connect to host
wendalyncheng.com: did not receive HSTS header
-wendigo.pl: could not connect to host
+wener.me: could not connect to host
wengebowuguan.com: could not connect to host
wenode.net: did not receive HSTS header
wentu.ml: could not connect to host
@@ -16277,6 +17602,7 @@ werken-bij-inwork.nl: could not connect to host
werkenbijkfc.nl: did not receive HSTS header
werkplaatsoost.nl: did not receive HSTS header
werkruimtebottendaal.nl: could not connect to host
+werkz.io: could not connect to host
werner-schaeffer.de: did not receive HSTS header
wernerschaeffer.de: did not receive HSTS header
wes-dev.com: did not receive HSTS header
@@ -16286,6 +17612,7 @@ wespeakgeek.co.za: could not connect to host
westcoastaggregate.com: could not connect to host
westendzone.com: could not connect to host
westerhoud.nl: did not receive HSTS header
+westhighlandwhiteterrier.com.br: could not connect to host
westlinwinds.com: could not connect to host
westsussexconnecttosupport.org: could not connect to host
westtulsa.com: could not connect to host
@@ -16294,13 +17621,12 @@ wetherbyweather.org.uk: did not receive HSTS header
wetoxic.com: could not connect to host
wettbonus.info: max-age too low: 0
wettbuero.de: did not receive HSTS header
-wetten.eu: did not receive HSTS header
wettertoertchen.com: could not connect to host
wetthost.com: could not connect to host
wetttipps.com: could not connect to host
wetttipps.de: could not connect to host
wevahoo.com: could not connect to host
-wevg.org: could not connect to host
+wevg.org: did not receive HSTS header
wevolver.com: did not receive HSTS header
wewillgo.com: could not connect to host
wewillgo.org: did not receive HSTS header
@@ -16311,9 +17637,11 @@ wf-training-master.appspot.com: did not receive HSTS header (error ignored - inc
wftda.com: did not receive HSTS header
wg-tools.de: could not connect to host
whanau.org: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+whatanime.ga: could not connect to host
whatisl.ovh: could not connect to host
whats.io: could not connect to host
whatsstalk.me: could not connect to host
+whatsupdeco.com: did not receive HSTS header
whatsyouroffer.co.uk: did not receive HSTS header
wheelwright.org: did not receive HSTS header
when-release.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -16325,16 +17653,21 @@ wheresben.today: could not connect to host
whilsttraveling.com: could not connect to host
whisker.network: could not connect to host
whiskyglazen.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+whisperinghoperanch.org: did not receive HSTS header
whistler-transfers.com: did not receive HSTS header
-whitehat.id: did not receive HSTS header
+whitehat.id: could not connect to host
whiterabbit.org: did not receive HSTS header
whiterabbitcakery.com: could not connect to host
+whiteready.it: did not receive HSTS header
whiteroom.agency: did not receive HSTS header
whitestagforge.com: did not receive HSTS header
whoclicks.net: could not connect to host
whoisamitsingh.com: did not receive HSTS header
whoisapi.online: could not connect to host
+whoiscuter.ml: could not connect to host
+whoiscutest.ml: could not connect to host
wholebites.com: max-age too low: 7889238
+wholelotofbounce.co.uk: did not receive HSTS header
wholikes.us: could not connect to host
whoneedstobeprimaried.today: could not connect to host
whoownsmyavailability.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
@@ -16355,7 +17688,7 @@ wiiforum.no: did not receive HSTS header
wiire.me: could not connect to host
wikiclash.info: could not connect to host
wikipeter.nl: did not receive HSTS header
-wikisports.eu: could not connect to host
+wikisports.eu: did not receive HSTS header
wild-emotion-events.de: could not connect to host
wildbee.org: could not connect to host
wildbirds.dk: did not receive HSTS header
@@ -16367,11 +17700,11 @@ wilhelm-nathan.de: could not connect to host
willcipriano.com: could not connect to host
willeminfo.ch: did not receive HSTS header
willemsjort.be: did not receive HSTS header
+william.gg: did not receive HSTS header
william.si: did not receive HSTS header
williamboundsltd.com: could not connect to host
-williamle.com: did not receive HSTS header
williamsapiens.com: could not connect to host
-williamsonshore.com: max-age too low: 0
+williamsflintlocks.com: did not receive HSTS header
williamtm.design: could not connect to host
willkommen-fuerstenberg.de: could not connect to host
willosagiede.com: did not receive HSTS header
@@ -16379,15 +17712,18 @@ wilsonovi.com: could not connect to host
winaes.com: did not receive HSTS header
winclient.cn: could not connect to host
windholz.us: could not connect to host
+windows10insider.com: did not receive HSTS header
windowsforum.com: did not receive HSTS header
windowstech.it: did not receive HSTS header
windowwellexperts.com: did not receive HSTS header
+windrunner.se: could not connect to host
winds.cf: could not connect to host
windwoodmedia.com: could not connect to host
windwoodweb.com: could not connect to host
wine-importer.ru: did not receive HSTS header
winebid.com: could not connect to host
winecodeavocado.com: could not connect to host
+wineonthewall.com: max-age too low: 300
wineworksonline.com: could not connect to host
winfield.me.uk: did not receive HSTS header
winfieldchen.me: did not receive HSTS header
@@ -16397,6 +17733,7 @@ wingumd.net: could not connect to host
winnersports.co: could not connect to host
winpack.cf: could not connect to host
winpack.eu.org: could not connect to host
+winportal.cz: did not receive HSTS header
winsec.nl: could not connect to host
winshiplending.com: could not connect to host
winsufi.biz: could not connect to host
@@ -16406,18 +17743,21 @@ wipply.com: could not connect to host
wirbatz.org: did not receive HSTS header
wirc.gr: could not connect to host
wiredcut.com: did not receive HSTS header
+wireframesoftware.com: could not connect to host
wireless-emergency-stop.com: did not receive HSTS header
-wirelesswatch.com.au: did not receive HSTS header
+wirelesswatch.com.au: could not connect to host
wiretrip.io: did not receive HSTS header
wirkaufendeinau.to: could not connect to host
wisak.eu: could not connect to host
wisdomize.me: could not connect to host
+wiseflat.com: did not receive HSTS header
wiseloan.com: did not receive HSTS header
wishcert.com: could not connect to host
wishesbee.com: could not connect to host
wissl.org: could not connect to host
witae.com: could not connect to host
withgoogle.com: did not receive HSTS header (error ignored - included regardless)
+withlocals.com: did not receive HSTS header
withmy.beer: could not connect to host
withoutacrystalball.com: did not receive HSTS header
withustrading.com: did not receive HSTS header
@@ -16440,17 +17780,21 @@ wmoda.com.br: could not connect to host
wnmed.com.au: did not receive HSTS header
wnmm.nl: could not connect to host
wnnc.co.uk: could not connect to host
+woaiuhd.com: could not connect to host
wobblylang.org: could not connect to host
wochenentwicklung.com: did not receive HSTS header
+wochennummern.de: could not connect to host
wod-stavby.cz: could not connect to host
wodice.com: could not connect to host
wohnungsbau-ludwigsburg.de: did not receive HSTS header
+woi.vision: could not connect to host
woima.fi: max-age too low: 604800
wokeai.net: could not connect to host
woktoss.com: could not connect to host
wolfemg.com: could not connect to host
wolfenland.net: did not receive HSTS header
wolfesden.com: could not connect to host
+wolfgang-kloke.de: could not connect to host
wolfram.io: could not connect to host
wolkenspeicher.org: could not connect to host
wollekorb.de: could not connect to host
@@ -16471,10 +17815,10 @@ wootton95.com: could not connect to host
wooviet.com: could not connect to host
wopen.org: could not connect to host
wordbits.net: did not receive HSTS header
-wordlessecho.com: could not connect to host
+wordlessecho.com: did not receive HSTS header
wordplay.one: could not connect to host
wordpress-test.site: could not connect to host
-wordpresspro.cl: did not receive HSTS header
+wordpresspro.cl: could not connect to host
wordsofamaster.com: could not connect to host
work-and-jockel.de: did not receive HSTS header
workemy.com: could not connect to host
@@ -16488,13 +17832,16 @@ worldchess.london: could not connect to host
worldfree4.org: did not receive HSTS header
worldlist.org: could not connect to host
worldpovertysolutions.org: did not receive HSTS header
+worldrecipes.eu: did not receive HSTS header
worldsbeststory.com: did not receive HSTS header
worldwhisperer.net: could not connect to host
wormholevpn.net: could not connect to host
worshapp.com: did not receive HSTS header
+woshiluo.site: could not connect to host
wow-travel.eu: could not connect to host
wow202y5.com: did not receive HSTS header
wowapi.org: could not connect to host
+wowhelp.it: could not connect to host
wowinvasion.com: did not receive HSTS header
wp-fastsearch.de: could not connect to host
wp-rescue.com.au: could not connect to host
@@ -16527,34 +17874,40 @@ wrfu.co.nz: did not receive HSTS header
wriedts.de: did not receive HSTS header
wrightdoumawedding.com: could not connect to host
writeapp.me: did not receive HSTS header
+writemyessay.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
writemytermpapers.com: could not connect to host
+writepro.net: could not connect to host
writing-expert.com: could not connect to host
wrldevelopment.com: did not receive HSTS header
wroffle.com: did not receive HSTS header
wrwg.ca: could not connect to host
ws-meca.com: did not receive HSTS header
wsb-immo.at: could not connect to host
+wscbiolo.id: could not connect to host
wsdcap.com: could not connect to host
wsor.group: did not receive HSTS header
wss.com.ve: could not connect to host
wsscompany.com.ve: could not connect to host
+wssv.ch: could not connect to host
wsup.social: could not connect to host
-wtwk.com: could not connect to host
+wtwk.com: did not receive HSTS header
wubify.com: did not receive HSTS header
-wubocong.com: could not connect to host
+wubocong.com: did not receive HSTS header
wubthecaptain.eu: could not connect to host
wuchipc.com: could not connect to host
-wufupay.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+wufupay.com: could not connect to host
wuhengmin.com: could not connect to host
wulpi.it: did not receive HSTS header
-wumai.cloud: did not receive HSTS header
+wumai.cloud: could not connect to host
wumbo.kiwi: could not connect to host
wundtherapie-schulung.de: could not connect to host
wurzelzwerg.net: could not connect to host
wusx.club: could not connect to host
-wutianxian.com: did not receive HSTS header
+wutianxian.com: could not connect to host
wvr-law.de: did not receive HSTS header
wvw698.com: max-age too low: 2592000
+wwjd.dynu.net: could not connect to host
+wwv-8522.com: could not connect to host
www-001133.com: could not connect to host
www-0385.com: could not connect to host
www-1116.com: could not connect to host
@@ -16564,12 +17917,13 @@ www-39988.com: did not receive HSTS header
www-507.net: could not connect to host
www-62755.com: did not receive HSTS header
www-66136.com: did not receive HSTS header
-www-68277.com: could not connect to host
www-746.com: could not connect to host
+www-7570.com: could not connect to host
www-771122.com: did not receive HSTS header
-www-8003.com: could not connect to host
+www-8003.com: did not receive HSTS header
www-88599.com: did not receive HSTS header
www-8887999.com: could not connect to host
+www-9649.com: could not connect to host
www-9995.com: could not connect to host
www-djbet.com: could not connect to host
www-jinshavip.com: could not connect to host
@@ -16584,11 +17938,11 @@ www.amazon.de: [Exception... "Component returned failure code: 0x80004005 (NS_ER
www.amazon.fr: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
www.amazon.it: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
www.amazon.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+www.captaintrain.com: did not receive HSTS header
www.cueup.com: could not connect to host
www.cyveillance.com: did not receive HSTS header
www.developer.mydigipass.com: could not connect to host
www.elanex.biz: did not receive HSTS header
-www.gamesdepartment.co.uk: could not connect to host
www.gmail.com: did not receive HSTS header (error ignored - included regardless)
www.googlemail.com: did not receive HSTS header (error ignored - included regardless)
www.gpo.gov: did not receive HSTS header
@@ -16605,21 +17959,22 @@ www.rme.li: did not receive HSTS header
www.sandbox.mydigipass.com: could not connect to host
www.simbolo.co.uk: could not connect to host
www.surfeasy.com: did not receive HSTS header
-www.viasinc.com: did not receive HSTS header
www.zenpayroll.com: did not receive HSTS header
www3.info: could not connect to host
+www68277.com: could not connect to host
+wwww.is: could not connect to host
+wwww.me.uk: did not receive HSTS header
wxrlab.com: could not connect to host
wxukang.cn: could not connect to host
wxyz.buzz: could not connect to host
+wxzm.sx: could not connect to host
wy6.org: did not receive HSTS header
wybmabiity.com: could not connect to host
wygluszanie.eu: could not connect to host
-wylog.ph: could not connect to host
+wylog.ph: did not receive HSTS header
wyu.cc: could not connect to host
wyzphoto.nl: did not receive HSTS header
wyzwaniemilosci.com: could not connect to host
-wzrd.in: did not receive HSTS header
-x-iweb.ru: did not receive HSTS header
x-pertservice.com: did not receive HSTS header
x-power-detox.com: could not connect to host
x-ripped-hd.com: could not connect to host
@@ -16630,14 +17985,13 @@ x2w.io: could not connect to host
x3led.com: could not connect to host
x509.pub: could not connect to host
x509.pw: could not connect to host
-x64architecture.com: could not connect to host
x69.biz: could not connect to host
x69x.net: could not connect to host
xanderweaver.com: did not receive HSTS header
xandocs.com: could not connect to host
xat.re: did not receive HSTS header
+xavier.is: could not connect to host
xavierbarroso.com: did not receive HSTS header
-xawen.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
xbc.nz: could not connect to host
xbind.io: could not connect to host
xchangeinfo.com: could not connect to host
@@ -16659,8 +18013,10 @@ xeonlab.com: could not connect to host
xeonlab.de: could not connect to host
xett.com: could not connect to host
xfive.de: could not connect to host
+xfrag-networks.com: did not receive HSTS header
xg3n1us.de: did not receive HSTS header
xgusto.com: did not receive HSTS header
+xhadius.de: could not connect to host
xia100.xyz: could not connect to host
xiangqiushi.com: did not receive HSTS header
xianguocy.com: could not connect to host
@@ -16668,31 +18024,34 @@ xiaody.me: could not connect to host
xiaolan.me: could not connect to host
xiaolvmu.com: could not connect to host
xiaolvmu.me: could not connect to host
-xiaomionline24.pl: could not connect to host
xiaoxiao.im: could not connect to host
xiaxuejin.cn: could not connect to host
+xiazhanjian.com: could not connect to host
+xice.cf: could not connect to host
xilegames.com: could not connect to host
-xiliant.com: did not receive HSTS header
ximage.me: could not connect to host
ximens.me: could not connect to host
+xin-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+xin-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
xinbiji.cn: could not connect to host
xinex.cz: did not receive HSTS header
+xing-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
xing.ml: could not connect to host
xinghuokeji.xin: could not connect to host
xingiahanvisa.net: did not receive HSTS header
xinnixwebshop.be: did not receive HSTS header
-xiqi.us: could not connect to host
+xiqi.us: did not receive HSTS header
xirion.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
xisa.it: could not connect to host
xivpn.com: could not connect to host
xiyu.it: did not receive HSTS header
xiyu.moe: did not receive HSTS header
-xjoi.net: could not connect to host
+xjoi.net: did not receive HSTS header
xlaff.com: could not connect to host
-xlboo.com: did not receive HSTS header
+xlboo.com: could not connect to host
xlfblog.com: did not receive HSTS header
xlinar.com: could not connect to host
-xmerak.com: could not connect to host
+xmerak.com: did not receive HSTS header
xmiui.com: could not connect to host
xmonk.org: did not receive HSTS header
xmr.my: could not connect to host
@@ -16728,12 +18087,14 @@ xn--cckvb1cwa0c5br5e2d2711k.net: could not connect to host
xn--datenrettung-mnchen-jbc.com: did not receive HSTS header
xn--dckya4a0bya6x.com: could not connect to host
xn--dckya4a0bya6x.jp: could not connect to host
+xn--dk8haaa.ws: could not connect to host
+xn--dmontaa-9za.com: did not receive HSTS header
xn--e--0g4aiy1b8rmfg3o.jp: could not connect to host
xn--e--4h4axau6ld4lna0g.com: could not connect to host
xn--e--ig4a4c3f6bvc5et632i.com: could not connect to host
xn--e--k83a5h244w54gttk.xyz: could not connect to host
xn--ekr87w7se89ay98ezcs.biz: did not receive HSTS header
-xn--gfrr-7qa.li: could not connect to host
+xn--gfrrli-yxa.ch: could not connect to host
xn--gmq92k.nagoya: could not connect to host
xn--grnderlehrstuhl-0vb.de: could not connect to host
xn--hfk-allgu-schwaben-stb.de: could not connect to host
@@ -16748,7 +18109,7 @@ xn--lgb3a8bcpn.gq: could not connect to host
xn--lgb3a8bcpn.ml: could not connect to host
xn--lna-2000-9za.nu: could not connect to host
xn--lna-4000-9za.nu: could not connect to host
-xn--lnakuten-9za.com: did not receive HSTS header
+xn--lnakuten-9za.com: max-age too low: 10368000
xn--ls8hi7a.tk: could not connect to host
xn--maraa-rta.org: could not connect to host
xn--mensenges-o1a8c.gq: could not connect to host
@@ -16765,11 +18126,9 @@ xn--qckqc0nxbyc4cdb4527err7c.biz: did not receive HSTS header
xn--qckyd1cu698a35zarib.xyz: could not connect to host
xn--r77hya.ga: could not connect to host
xn--rt-cja.eu: could not connect to host
-xn--sdkwa9azd389v01ya.com: did not receive HSTS header
+xn--sdkwa9azd389v01ya.com: could not connect to host
xn--srenpind-54a.dk: could not connect to host
-xn--sz8h.ml: could not connect to host
xn--t8j2a3042d.xyz: could not connect to host
-xn--t8j4aa4nyhxa7duezbl49aqg5546e264d.net: could not connect to host
xn--tda.ml: could not connect to host
xn--thorme-6uaf.ca: could not connect to host
xn--u9jy16ncfao19mo8i.nagoya: could not connect to host
@@ -16782,7 +18141,6 @@ xn--vck8crcu789ajtaj92eura.xyz: could not connect to host
xn--w22a.jp: could not connect to host
xn--werner-schffer-fib.de: did not receive HSTS header
xn--wmq.jp: could not connect to host
-xn--wq9h.ml: could not connect to host
xn--xdtx3pfzbiw3ar8e7yedqrhui.com: could not connect to host
xn--xz1a.jp: could not connect to host
xn--y8j2eb5631a4qf5n0h.com: could not connect to host
@@ -16792,6 +18150,7 @@ xn--ykrp42k.com: could not connect to host
xn--yoamomisuasbcn-ynb.com: could not connect to host
xn--zck9a4b352yuua.jp: did not receive HSTS header
xng.io: did not receive HSTS header
+xnu.kr: could not connect to host
xobox.me: could not connect to host
xoda.pw: could not connect to host
xoffy.com: did not receive HSTS header
@@ -16799,6 +18158,7 @@ xom.party: could not connect to host
xombra.com: could not connect to host
xor-a.net: could not connect to host
xotika.tv: could not connect to host
+xpbytes.com: did not receive HSTS header
xpenology-fr.net: could not connect to host
xperiacodes.com: could not connect to host
xpi.fr: could not connect to host
@@ -16806,9 +18166,10 @@ xpj.bet: did not receive HSTS header
xpj.sx: could not connect to host
xpjcunkuan.com: could not connect to host
xpressprint.com.br: max-age too low: 90
-xpwn.cz: could not connect to host
-xq55.com: did not receive HSTS header
+xpwn.cz: did not receive HSTS header
xqin.net: could not connect to host
+xrippedhd.com: could not connect to host
+xroot.org: did not receive HSTS header
xrp.pw: could not connect to host
xscancun.com: could not connect to host
xscapers.com: did not receive HSTS header
@@ -16817,21 +18178,25 @@ xsyds.cn: did not receive HSTS header
xt.om: did not receive HSTS header
xtenz.xyz: could not connect to host
xtom.email: could not connect to host
+xtom.io: could not connect to host
xtream-hosting.com: could not connect to host
xtream-hosting.de: could not connect to host
xtream-hosting.eu: could not connect to host
xtreamhosting.eu: could not connect to host
+xtremegaming.it: could not connect to host
xtrim.ru: did not receive HSTS header
xtzone.be: could not connect to host
+xuan-li88.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+xuan-li88.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
xuanmeishe.top: could not connect to host
xuexb.com: did not receive HSTS header
xujan.com: could not connect to host
xuntaosms.com: could not connect to host
xupeng.me: did not receive HSTS header
-xuyh0120.win: did not receive HSTS header
-xxbase.com: did not receive HSTS header
+xxbase.com: could not connect to host
xxx3dbdsm.com: could not connect to host
xxxladyboysporn.com: could not connect to host
+xxxred.net: could not connect to host
xy1919.com: could not connect to host
xy6161.com: could not connect to host
xy6262.com: could not connect to host
@@ -16842,7 +18207,9 @@ xy7373.com: could not connect to host
xyndrac.net: max-age too low: 2592000
xynex.us: could not connect to host
xynta.ch: could not connect to host
+xyyp.mn: could not connect to host
xzoneadventure.com: did not receive HSTS header
+xzy.one: did not receive HSTS header
y-o-w.com: did not receive HSTS header
y-s.pw: could not connect to host
y3451.com: could not connect to host
@@ -16850,20 +18217,22 @@ yaay.com.br: could not connect to host
yabrt.cn: could not connect to host
yaccin.com: could not connect to host
yachts-magazine.com: did not receive HSTS header
+yafull.com: could not connect to host
yagi2.com: did not receive HSTS header
yahoo.ax: could not connect to host
yalla.jp: did not receive HSTS header
yamamo10.com: could not connect to host
yameveo.com: did not receive HSTS header
-yangmaodang.org: did not receive HSTS header
+yannikhenke.de: could not connect to host
yanwh.xyz: did not receive HSTS header
+yao-in.com: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+yao-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
yaoidreams.com: could not connect to host
yaporn.tv: could not connect to host
-yarchives.jp: could not connect to host
+yarchives.jp: max-age too low: 0
yard-fu.com: could not connect to host
yardbird.us: could not connect to host
yarnhookup.com: did not receive HSTS header
-yarogneva.ru: could not connect to host
yasinaydin.net: did not receive HSTS header
yasutomonodokoiko.com: did not receive HSTS header
yaucy.win: could not connect to host
@@ -16873,7 +18242,6 @@ ycaaz.com: did not receive HSTS header
ycc.wtf: could not connect to host
ycm2.wtf: could not connect to host
ydy.jp: could not connect to host
-yecl.net: did not receive HSTS header
yello.website: could not connect to host
yellowcar.website: could not connect to host
yemalu.com: could not connect to host
@@ -16884,7 +18252,6 @@ yenniferallulli.de: could not connect to host
yenniferallulli.es: did not receive HSTS header
yenniferallulli.moda: could not connect to host
yenniferallulli.nl: could not connect to host
-yenpape.com: did not receive HSTS header
yepbitcoin.com: could not connect to host
yesdevnull.net: did not receive HSTS header
yesfone.com.br: could not connect to host
@@ -16896,9 +18263,12 @@ ygcdyf.com: did not receive HSTS header
yggdar.ga: could not connect to host
yh35.net: max-age too low: 86400
yhori.xyz: could not connect to host
-yhrd.org: did not receive HSTS header
+yhwj.top: could not connect to host
yibaoweilong.top: could not connect to host
yibin0831.com: could not connect to host
+yiffy.tips: did not receive HSTS header
+yiffy.zone: did not receive HSTS header
+yigujin.cn: could not connect to host
yikzu.cn: could not connect to host
yin.roma.it: did not receive HSTS header
yin8888.tv: did not receive HSTS header
@@ -16906,12 +18276,11 @@ ying299.com: could not connect to host
ying299.net: could not connect to host
yinga.ga: did not receive HSTS header
yingsuo.ltd: could not connect to host
-yingyj.com: did not receive HSTS header
+yingyj.com: could not connect to host
yinhe12.net: did not receive HSTS header
-yipingguo.com: did not receive HSTS header
+yipingguo.com: could not connect to host
yippie.nl: could not connect to host
yizhu.com: could not connect to host
-yjsoft.me: could not connect to host
ylilauta.org: could not connect to host
ylk.io: could not connect to host
ynode.co: did not receive HSTS header
@@ -16925,7 +18294,6 @@ yogatrainingrishikesh.com: could not connect to host
yogeshbeniwal.com: did not receive HSTS header
yogoeasy.com: did not receive HSTS header
yohanesmario.com: did not receive HSTS header
-yoimise.net: did not receive HSTS header
yoiyado.info: did not receive HSTS header
yokeepo.com: could not connect to host
yolo-csgo.com: could not connect to host
@@ -16934,6 +18302,7 @@ yoloprod.fr: could not connect to host
yoloseo.com: could not connect to host
yomena.in: could not connect to host
yomepre.com: could not connect to host
+yongbin.org: did not receive HSTS header
yopers.com: did not receive HSTS header
yorkshireterrier.com.br: could not connect to host
yorname.ml: did not receive HSTS header
@@ -16941,9 +18310,8 @@ yoru.me: could not connect to host
yotilabs.com: could not connect to host
youcaitian.com: did not receive HSTS header
youcancraft.de: could not connect to host
-youcanfuckoff.xyz: could not connect to host
youcontrol.ru: could not connect to host
-youdowell.com: could not connect to host
+youdowell.com: did not receive HSTS header
youfencun.com: did not receive HSTS header
youjizz.bz: could not connect to host
youlend.com: did not receive HSTS header
@@ -16953,12 +18321,12 @@ youmonit.me: could not connect to host
youngandunited.nl: did not receive HSTS header
younl.net: could not connect to host
youon.tokyo: did not receive HSTS header
-youran.me: could not connect to host
yourbapp.ch: could not connect to host
yourgame.co.il: did not receive HSTS header
+yourhair.net: could not connect to host
youri.me: could not connect to host
yourlovesong.com.mx: could not connect to host
-yourname.xyz: could not connect to host
+yourname.xyz: did not receive HSTS header
yoursbookstore.jp: max-age too low: 0
yoursecondphone.co: could not connect to host
yourself.today: could not connect to host
@@ -16967,18 +18335,23 @@ yourtrainingsolutions.com: did not receive HSTS header
youruseragent.info: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
yourznc.com: could not connect to host
yousite.by: could not connect to host
-youth2009.org: did not receive HSTS header
+youth2009.org: max-age too low: 2592000
youtube: could not connect to host
youtubeviews.ml: could not connect to host
youwatchporn.com: could not connect to host
youyoulemon.com: could not connect to host
ypcs.fi: did not receive HSTS header
+ypiresia.fr: could not connect to host
yryz.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
ysicing.net: could not connect to host
-yslbeauty.com: did not receive HSTS header
yspeo.com: max-age too low: 2592000
+ysx.me.uk: did not receive HSTS header
+ytb.zone: did not receive HSTS header
+ytbmp3.com: did not receive HSTS header
+ytbmp4.com: did not receive HSTS header
ytcuber.xyz: could not connect to host
ythyth.com: max-age too low: 2592000
+ytpak.com: could not connect to host
ytvwld.de: did not receive HSTS header
yu7.jp: did not receive HSTS header
yuanbenlian.com: did not receive HSTS header
@@ -16990,6 +18363,7 @@ yugege.cf: could not connect to host
yuhen.ru: did not receive HSTS header
yui.cat: did not receive HSTS header
yuka.one: could not connect to host
+yuki-portfolio.com: did not receive HSTS header
yukijinji.moe: did not receive HSTS header
yukiminami.net: could not connect to host
yukimochi.com: could not connect to host
@@ -17013,6 +18387,7 @@ yuqi.me: could not connect to host
yurinet.org: could not connect to host
yuriykuzmin.com: did not receive HSTS header
yutabon.com: could not connect to host
+yutaron.tokyo: did not receive HSTS header
yuushou.com: could not connect to host
yux.fr: could not connect to host
yux.io: did not receive HSTS header
@@ -17020,26 +18395,31 @@ yuxingxin.com: did not receive HSTS header
yuzu.tk: did not receive HSTS header
ywei.org: could not connect to host
ywyz.tech: could not connect to host
+yya.bid: could not connect to host
+yya.men: could not connect to host
+yyrss.com: could not connect to host
z-coder.com: could not connect to host
+z-to-a.com: did not receive HSTS header
z0rro.net: could not connect to host
z33.ch: did not receive HSTS header
z33.co: could not connect to host
z3liff.com: could not connect to host
z3liff.net: could not connect to host
+zaalleatherwear.nl: did not receive HSTS header
zacharopoulos.me: could not connect to host
zachbolinger.com: could not connect to host
+zachpeters.org: did not receive HSTS header
zadieheimlich.com: did not receive HSTS header
+zadroweb.com: did not receive HSTS header
zaem.tv: could not connect to host
zahnrechner-staging.azurewebsites.net: could not connect to host
zahyantechnologies.com: did not receive HSTS header
-zaidan.de: could not connect to host
-zaidan.eu: could not connect to host
-zaidanfood.com: could not connect to host
-zaidanfood.eu: could not connect to host
-zaidanlebensmittelhandel.de: could not connect to host
-zakoncontrol.com: did not receive HSTS header
+zaidan.de: did not receive HSTS header
+zaidan.eu: did not receive HSTS header
+zaidanfood.com: did not receive HSTS header
+zaidanfood.eu: did not receive HSTS header
+zaidanlebensmittelhandel.de: did not receive HSTS header
zalan.do: could not connect to host
-zalohovaniburian.cz: could not connect to host
zamis.net: did not receive HSTS header
zamorano.edu: could not connect to host
zamos.ru: max-age too low: 0
@@ -17049,11 +18429,16 @@ zaoext.com: could not connect to host
zaoshanghao-dajia.rhcloud.com: could not connect to host
zap.yt: could not connect to host
zapatoshechoamano.pe: did not receive HSTS header
+zaptan.info: did not receive HSTS header
+zaptan.net: did not receive HSTS header
+zaptan.org: did not receive HSTS header
+zaptan.us: did not receive HSTS header
zargaripour.com: did not receive HSTS header
zarooba.com: could not connect to host
zavca.com: did not receive HSTS header
zbasenem.pl: did not receive HSTS header
zbchen.com: could not connect to host
+zberger.com: could not connect to host
zbigniewgalucki.eu: did not receive HSTS header
zbp.at: did not receive HSTS header
zby.io: could not connect to host
@@ -17061,7 +18446,7 @@ zdravotnickasluzba.eu: could not connect to host
zdrowiepaleo.pl: did not receive HSTS header
zdx.ch: max-age too low: 0
zeb.fun: could not connect to host
-zebibyte.cn: did not receive HSTS header
+zebibyte.cn: could not connect to host
zebrababy.cn: could not connect to host
zebry.nl: did not receive HSTS header
zecrypto.com: could not connect to host
@@ -17073,10 +18458,11 @@ zehntner.ch: max-age too low: 3600
zeitzer-turngala.de: could not connect to host
zelfmoord.ga: could not connect to host
zelfstandigemakelaars.net: could not connect to host
-zellari.ru: could not connect to host
+zellari.ru: did not receive HSTS header
zeloz.xyz: could not connect to host
-zenghx.tk: could not connect to host
+zenfusion.fr: could not connect to host
zenhaiku.com: could not connect to host
+zenics.co.uk: did not receive HSTS header
zeno-system.com: did not receive HSTS header
zenpayroll.com: did not receive HSTS header
zentience.dk: did not receive HSTS header
@@ -17088,7 +18474,6 @@ zentralwolke.de: did not receive HSTS header
zenvite.com: could not connect to host
zenwears.com: could not connect to host
zenycosta.com: could not connect to host
-zeparadox.com: did not receive HSTS header
zepect.com: did not receive HSTS header
zera.com.au: could not connect to host
zerekin.net: max-age too low: 86400
@@ -17097,10 +18482,10 @@ zero-x-baadf00d.com: could not connect to host
zerocool.io: could not connect to host
zeroday.sk: did not receive HSTS header
zerofox.gq: could not connect to host
-zerolab.org: could not connect to host
zeroling.com: could not connect to host
zeroml.ml: could not connect to host
zerosource.net: could not connect to host
+zerowastesonoma.gov: could not connect to host
zerudi.com: did not receive HSTS header
zetadisseny.es: did not receive HSTS header
zeto365.pl: did not receive HSTS header
@@ -17126,14 +18511,13 @@ zhihua-lai.com: did not receive HSTS header
zhiin.net: could not connect to host
zhikin.com: could not connect to host
zhimajk.com: could not connect to host
-zhome.info: could not connect to host
+zhiwei.me: did not receive HSTS header
zhoujiashu.com: could not connect to host
zhuji.com: could not connect to host
zhuji.com.cn: could not connect to host
zhuji5.com: could not connect to host
zhujicaihong.com: could not connect to host
zhuweiyou.com: did not receive HSTS header
-zi.is: could not connect to host
zi0r.com: did not receive HSTS header
zian.online: could not connect to host
zicklam.com: could not connect to host
@@ -17145,19 +18529,19 @@ zinenapse.info: could not connect to host
zippy-download.com: could not connect to host
zippy-download.de: could not connect to host
zirtue.io: could not connect to host
+zitrone44.de: did not receive HSTS header
zivagold.com: did not receive HSTS header
-zivver.com: could not connect to host
zivy-ruzenec.cz: could not connect to host
zixo.sk: could not connect to host
ziyuanabc.xyz: could not connect to host
-ziz.exchange: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+ziz.exchange: could not connect to host
zizoo.com: did not receive HSTS header
-zju.tv: could not connect to host
+zju.tv: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
zjubtv.com: could not connect to host
zjutv.com: could not connect to host
zkillboard.com: did not receive HSTS header
zking.ga: could not connect to host
-zlc1994.com: did not receive HSTS header
+zlc1994.com: could not connect to host
zlcp.com: could not connect to host
zmsastro.co.za: could not connect to host
zmscable.com: did not receive HSTS header
@@ -17191,26 +18575,30 @@ zooxdata.com: could not connect to host
zorki.nl: did not receive HSTS header
zortium.report: could not connect to host
zorz.info: could not connect to host
-zouyaoji.top: did not receive HSTS header
zoznamrealit.sk: did not receive HSTS header
zpy.fun: could not connect to host
zq789.com: could not connect to host
zqhong.com: could not connect to host
zqjs.tk: could not connect to host
zqwqz.com: could not connect to host
+zrhdwz.cn: could not connect to host
zrkr.de: could not connect to host
zrn.in: did not receive HSTS header
ztan.tk: could not connect to host
ztcaoll222.cn: could not connect to host
ztytian.com: could not connect to host
zuan-in.com: could not connect to host
+zuan-in.net: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
+zubora.co: could not connect to host
zubro.net: could not connect to host
zuckerfloh.de: did not receive HSTS header
zudomc.me: could not connect to host
zuehlcke.de: could not connect to host
zukix.com: could not connect to host
zulu7.com: did not receive HSTS header
+zunda.cafe: did not receive HSTS header
zunftmarke.de: did not receive HSTS header
+zurickrelogios.com.br: did not receive HSTS header
zutsu-raku.com: did not receive HSTS header
zuviel.space: could not connect to host
zvejonys.lt: did not receive HSTS header
@@ -17219,6 +18607,8 @@ zvz.im: could not connect to host
zwembadheeten.nl: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
zx1168.com: could not connect to host
zx2268.com: could not connect to host
+zxavier.com: did not receive HSTS header
+zxc.science: [Exception... "Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsISiteSecurityService.processHeader]" nsresult: "0x80004005 (NS_ERROR_FAILURE)" location: "JS frame :: /home/trava90/REPO/UXP/security/manager/tools/getHSTSPreloadList.js :: processStsHeader :: line 131" data: no]
zxity.co.uk: could not connect to host
zxity.ltd: could not connect to host
zxity.uk: could not connect to host
@@ -17228,6 +18618,7 @@ zync.ca: did not receive HSTS header
zypgr.com: could not connect to host
zypr.pw: could not connect to host
zyso.org: could not connect to host
+zz295.com: did not receive HSTS header
zzb510.com: could not connect to host
zzb6688.com: could not connect to host
zzb8899.com: could not connect to host
diff --git a/security/manager/ssl/nsSTSPreloadList.inc b/security/manager/ssl/nsSTSPreloadList.inc
index 8dab9a1ee..66c7922fa 100644
--- a/security/manager/ssl/nsSTSPreloadList.inc
+++ b/security/manager/ssl/nsSTSPreloadList.inc
@@ -8,7 +8,7 @@
/*****************************************************************************/
#include <stdint.h>
-const PRTime gPreloadListExpirationTime = INT64_C(1547572131850000);
+const PRTime gPreloadListExpirationTime = INT64_C(1551523663318000);
class nsSTSPreload
{
@@ -18,22 +18,8 @@ class nsSTSPreload
};
static const nsSTSPreload kSTSPreloadList[] = {
- { "00100010.net", true },
{ "0010100.net", true },
- { "00120012.net", true },
- { "00130013.net", true },
- { "00140014.net", true },
- { "00150015.net", true },
- { "00160016.net", true },
- { "00180018.net", true },
- { "00190019.net", true },
- { "00330033.net", true },
- { "00440044.net", true },
- { "00550055.net", true },
- { "00660066.net", true },
- { "00770077.net", true },
{ "0086286.com", true },
- { "00990099.net", true },
{ "00dani.me", true },
{ "00f.net", true },
{ "0100dev.com", false },
@@ -42,7 +28,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "01110000011100110111001001100111.com", true },
{ "01electronica.com.ar", true },
{ "01seguridad.com.ar", true },
- { "01smh.com", true },
{ "022367.com", true },
{ "022379.com", true },
{ "022391.com", true },
@@ -60,19 +45,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "02607.com", true },
{ "026122.com", true },
{ "02638.net", true },
- { "02smh.com", true },
- { "0311buy.cn", true },
{ "03170317.com", true },
{ "0391315.com", true },
{ "046569.com", true },
- { "04sun.com", true },
{ "050.ca", true },
{ "050media.nl", true },
{ "0511315.net", true },
+ { "0513c.com", true },
{ "0573wk.com", true },
{ "06091994.xyz", true },
{ "06se.com", true },
- { "07733.win", true },
{ "0788yh.com", true },
{ "0792112.com", true },
{ "0809yh.com", true },
@@ -101,9 +83,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "083912.com", true },
{ "083957.com", true },
{ "083960.com", true },
- { "083962.com", true },
{ "083965.com", true },
{ "083967.com", true },
+ { "08detaxe.fr", true },
{ "09115.com", true },
{ "0916app.com", true },
{ "09892.net", true },
@@ -125,10 +107,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "0x0.li", true },
{ "0x00ff00ff.com", true },
{ "0x17.de", true },
+ { "0x378.net", true },
+ { "0x48.pw", true },
{ "0x52.net", true },
{ "0x7d.com", true },
{ "0x7fffffff.net", true },
{ "0x90.io", true },
+ { "0xaa55.me", true },
{ "0xabe.io", true },
{ "0xacab.org", true },
{ "0xda.de", true },
@@ -142,6 +127,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "100-downloads.com", true },
{ "10000v.ru", true },
{ "1000minds.com", true },
+ { "1000serien.com", true },
{ "1001carats.fr", true },
{ "1001kartini.com", true },
{ "1001kerstpakketten.com", false },
@@ -173,19 +159,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "10hz.de", true },
{ "10og.de", true },
{ "10ppm.com", true },
- { "10seos.com", true },
- { "1100.so", true },
- { "110110110.net", true },
- { "112112112.net", true },
+ { "10xiuxiu.com", true },
{ "112app.nl", true },
{ "112hz.com", true },
- { "113113113.net", true },
{ "114514ss.com", true },
{ "1177107.com", true },
- { "118118118.net", true },
{ "11dzon.com", true },
{ "11loc.de", true },
- { "11scc.com", true },
{ "11thstreetcoffee.com", true },
{ "11urss.com", true },
{ "1212873467.rsc.cdn77.org", true },
@@ -206,8 +186,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "123plons.nl", true },
{ "123roulement.be", true },
{ "123roulement.com", true },
- { "123test.com", true },
- { "123test.nl", true },
{ "123writings.com", true },
{ "124133.com", true },
{ "124633.com", true },
@@ -216,6 +194,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "1288fc.com", true },
{ "12photos.eu", true },
{ "12thmanrising.com", true },
+ { "12train.com", true },
{ "12vpn.net", true },
{ "130.ua", true },
{ "132kv.ch", true },
@@ -234,16 +213,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "1464424382.rsc.cdn77.org", true },
{ "146533.com", true },
{ "146733.com", true },
- { "1481481.com", true },
- { "1481481.net", true },
- { "1481482.com", true },
- { "1481482.net", true },
- { "1481483.com", true },
- { "1481483.net", true },
- { "1481485.com", true },
- { "1481485.net", true },
- { "1481486.com", true },
- { "1481486.net", true },
{ "149433.com", true },
{ "149733.com", true },
{ "14it.de", true },
@@ -257,39 +226,37 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "156433.com", true },
{ "1590284872.rsc.cdn77.org", true },
{ "1600esplanade.com", true },
+ { "160887.com", true },
{ "1644091933.rsc.cdn77.org", true },
{ "1661237.com", true },
- { "166166.com", true },
- { "168bo9.com", true },
- { "168bo9.net", true },
{ "16book.org", true },
- { "1750studios.com", false },
+ { "1768calc.com.au", true },
{ "1811559.com", true },
{ "1844329061.rsc.cdn77.org", true },
{ "1876996.com", true },
- { "188da.com", true },
{ "188dv.com", true },
- { "189dv.com", true },
+ { "1895media.com", true },
{ "189fc.com", true },
{ "18celebration.com", true },
{ "18celebration.org", true },
{ "18f.gov", true },
{ "18f.gsa.gov", false },
+ { "1911trust.com", true },
{ "192168ll.repair", true },
{ "192433.com", true },
{ "1972969867.rsc.cdn77.org", true },
{ "1981612088.rsc.cdn77.org", true },
+ { "19area.cn", true },
{ "19hundert84.de", true },
{ "1a-diamantscheiben.de", true },
{ "1a-vermessung.at", true },
{ "1a-werkstattgeraete.de", true },
{ "1ab-machinery.com", true },
- { "1aim.com", true },
- { "1b1.pl", true },
{ "1c-power.ru", true },
{ "1cover.co.nz", true },
{ "1cover.com.au", true },
{ "1e9.nl", true },
+ { "1europlan.nl", true },
{ "1f123.net", true },
{ "1fach-digital.de", true },
{ "1hourproofreading.com", true },
@@ -300,10 +267,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "1kmi.co", true },
{ "1ll.uk", true },
{ "1lord1faith.com", true },
- { "1m.duckdns.org", true },
{ "1montre.fr", true },
{ "1morebounce.co.uk", true },
{ "1nfr.com", false },
+ { "1oaklasvegas.com", true },
{ "1of16.de", true },
{ "1on1on1.de", true },
{ "1on1on1.tv", true },
@@ -312,12 +279,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "1password.ca", true },
{ "1password.com", true },
{ "1password.eu", true },
+ { "1pw.ca", true },
{ "1px.tv", true },
- { "1q2w.nl", true },
{ "1r.is", true },
{ "1rs.nl", true },
- { "1salland.nl", true },
{ "1scope.com", true },
+ { "1se.co", true },
{ "1se2or3.com", true },
{ "1st-bounce.co.uk", true },
{ "1st-community.de", true },
@@ -331,32 +298,35 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "1wl.uk", true },
{ "2.wtf", true },
{ "200.network", true },
- { "200fcw.com", true },
- { "2048-spiel.de", true },
+ { "2012.ovh", true },
{ "20at.com", true },
{ "20denier.com", true },
- { "21.co.uk", true },
{ "215dy.net", true },
{ "21sthammersmith.org.uk", true },
+ { "21stnc.us", true },
{ "21x9.org", true },
{ "2222yh.com", true },
{ "22digital.agency", true },
- { "22scc.com", true },
{ "230beats.com", true },
+ { "23333.link", true },
+ { "2333666.xyz", true },
{ "2333blog.com", true },
{ "233abc.com", true },
{ "233blog.com", true },
{ "233boy.com", true },
{ "233bwg.com", true },
{ "233hugo.com", true },
+ { "233ss.net", true },
{ "233vps.com", true },
{ "24-7.jp", true },
+ { "245meadowvistaway.com", true },
{ "246060.ru", true },
- { "247exchange.com", true },
{ "247healthshop.com", true },
{ "247medplan.com", true },
+ { "2495dentalimplants.com", true },
{ "24dian30.com", true },
{ "24hour-locksmithsanantonio.com", true },
+ { "24hourcyclist.co.uk", true },
{ "24hourlocksmithbaltimore.com", true },
{ "24hourlocksmithdallastx.com", true },
{ "24hourlocksmithdetroit.com", true },
@@ -368,30 +338,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "24timeravis.dk", true },
{ "24zpravy.cz", true },
{ "256pages.com", false },
- { "258da.com", true },
{ "25reinyan25.net", true },
{ "2600edinburgh.org", true },
{ "2600hq.com", true },
+ { "260887.com", true },
{ "263.info", true },
+ { "2718282.net", true },
{ "28-industries.com", true },
{ "281180.de", true },
{ "2858958.com", true },
- { "286.com", true },
- { "288da.com", true },
{ "28peaks.com", true },
{ "28spots.net", true },
{ "291167.xyz", true },
{ "2912.nl", true },
{ "2948.ca", true },
{ "297computers.com", true },
- { "298da.com", true },
{ "2991236.com", true },
{ "2au.ru", true },
{ "2bas.nl", true },
{ "2bcompany.ch", true },
{ "2bis10.de", true },
{ "2bougie.com", true },
- { "2bouncy.com", true },
{ "2c-b.com", true },
{ "2c-d.com", true },
{ "2c-e.com", true },
@@ -399,10 +366,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "2c-t-7.com", true },
{ "2c-t-8.com", true },
{ "2cash.ru", true },
+ { "2chan.eu", true },
+ { "2chan.jp", true },
{ "2cv-fahrer.de", true },
{ "2fm.ie", true },
{ "2fraud.pro", true },
{ "2g1s.net", true },
+ { "2gen.com", true },
{ "2h-nagoya.org", true },
{ "2heartsbookings.co.uk", true },
{ "2hypeenterprises.com", true },
@@ -411,6 +381,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "2li.ch", true },
{ "2manydots.nl", true },
{ "2mb.solutions", true },
+ { "2mir.com", true },
{ "2nains.ch", true },
{ "2nerds1bit.com", true },
{ "2nics.net", true },
@@ -469,7 +440,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "33445333.com", true },
{ "33445444.com", true },
{ "33836.com", true },
- { "33scc.com", true },
{ "340422.com", true },
{ "340622.com", true },
{ "340922.com", true },
@@ -518,23 +488,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "354922.com", true },
{ "354933.com", true },
{ "356433.com", true },
+ { "357maelai.co", true },
{ "360live.fr", true },
{ "360rail.nl", true },
- { "360woodworking.com", true },
+ { "360vrs.com", true },
{ "364553.com", true },
{ "365365.com", true },
+ { "365beautyworld.com", true },
{ "365daysreview.com", true },
{ "365healthworld.com", true },
{ "365propertybuyer.co.uk", true },
{ "365skulls.com", true },
{ "370422.com", true },
{ "371422.com", true },
+ { "371cloud.com", true },
{ "373422.com", true },
{ "374933.com", true },
{ "375422.com", true },
{ "379700.com", true },
{ "380422.com", true },
- { "388da.com", true },
{ "390422.com", true },
{ "392422.com", true },
{ "393335.ml", true },
@@ -560,6 +532,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "3circlefunding.ch", true },
{ "3countiescastlehire.co.uk", true },
{ "3cs.ch", true },
+ { "3d-fotoservice.de", true },
{ "3deeplearner.com", true },
{ "3djuegos.com", true },
{ "3dmedium.de", true },
@@ -599,16 +572,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "3tribes.co.uk", true },
{ "3typen.tv", true },
{ "3v4l.org", true },
- { "3weekdietworks.com", true },
{ "4-1-where.com", true },
{ "4-it.de", true },
- { "4000milestare.com", true },
+ { "4000milestare.com", false },
{ "403.ch", true },
{ "404notfound.com.br", true },
{ "4096b.com", true },
{ "4096bit.de", false },
{ "40acts.org.uk", true },
{ "41-where.com", true },
+ { "411416.com", true },
{ "41199.com", true },
{ "411film.com", true },
{ "411movie.com", true },
@@ -616,15 +589,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "41studio.com", true },
{ "41where.com", true },
{ "420java.com", true },
- { "439050.com", true },
+ { "440887.com", true },
{ "441jj.com", false },
+ { "442887.com", true },
+ { "443887.com", true },
{ "4444yh.com", true },
- { "448da.com", true },
- { "44scc.com", true },
+ { "444887.com", true },
+ { "445887.com", true },
{ "451.ooo", true },
{ "4553s.com", true },
{ "4553vip.com", true },
+ { "4706666.com", true },
+ { "4716666.com", true },
+ { "4726666.com", true },
{ "4736666.com", true },
+ { "4756666.com", true },
+ { "4786666.com", true },
{ "47essays.com", true },
{ "491mhz.net", true },
{ "49889.com", true },
@@ -653,7 +633,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "4vf.de", true },
{ "4x.fi", true },
{ "4x4.lk", true },
+ { "4x4tt.com", true },
{ "4xlabs.co", true },
+ { "50.gd", true },
{ "50.pe", true },
{ "500a500.com", true },
{ "500b500.com", true },
@@ -661,7 +643,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "500d500.com", true },
{ "500e500.com", true },
{ "500f500.com", true },
- { "500fcw.com", true },
{ "500g500.com", true },
{ "500h500.com", true },
{ "500i500.com", true },
@@ -690,7 +671,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "506pay.com", true },
{ "508088.com", true },
{ "50lakeshore.com", true },
- { "50ma.xyz", true },
{ "50north.de", true },
{ "50plusnet.nl", true },
{ "514122.com", true },
@@ -702,9 +682,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "51877.net", true },
{ "519422.com", true },
{ "51acg.eu.org", true },
+ { "51aifuli.com", true },
{ "51tiaojiu.com", true },
- { "5214889.com", true },
- { "5214889.net", true },
{ "524022.com", true },
{ "524622.com", true },
{ "524922.com", true },
@@ -713,17 +692,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "52kb365.com", true },
{ "52ncp.net", true },
{ "52sykb.com", true },
- { "5310899.com", true },
- { "5310899.net", true },
{ "531422.com", true },
{ "534122.com", true },
{ "534622.com", true },
{ "534922.com", true },
- { "5364.com", true },
{ "536422.com", true },
{ "5364b.com", true },
{ "5364c.com", true },
{ "5364d.com", true },
+ { "5364jc.com", true },
{ "53ningen.com", true },
{ "540922.com", true },
{ "541022.com", true },
@@ -736,9 +713,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "5518k3.com", true },
{ "5533445.com", true },
{ "5555yh.com", true },
+ { "55639.com", true },
{ "55797.com", true },
{ "558da.com", true },
- { "55scc.com", true },
{ "576422.com", true },
{ "579422.com", true },
{ "57he.com", true },
@@ -754,20 +731,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "594622.com", true },
{ "595422.com", true },
{ "596422.com", true },
- { "598598598.net", true },
{ "5986fc.com", true },
{ "5997891.com", true },
{ "5apps.com", true },
{ "5c1fd0f31022cbc40af9f785847baaf9.space", true },
{ "5crowd.com", true },
{ "5dm.tv", true },
+ { "5dwin.com", true },
+ { "5dwin.net", true },
{ "5francs.com", true },
{ "5gb.space", true },
{ "5kraceforals.com", true },
{ "5percentperweek.com", true },
{ "5starbouncycastlehire.co.uk", true },
{ "5thchichesterscouts.org.uk", true },
- { "5w5.la", true },
{ "5y.fi", true },
{ "602422.com", true },
{ "604122.com", true },
@@ -830,9 +807,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "646722.com", true },
{ "649022.com", true },
{ "649622.com", true },
- { "64970.com", true },
{ "649722.com", true },
{ "649822.com", true },
+ { "64bitservers.net", false },
{ "651422.com", true },
{ "652422.com", true },
{ "6541166.com", true },
@@ -850,9 +827,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "6548877.com", true },
{ "656088.com", true },
{ "659422.com", true },
+ { "65d88.com", true },
{ "66136.com", true },
{ "6616fc.com", true },
- { "66205.net", true },
{ "6633445.com", true },
{ "6652566.com", true },
{ "6660111.ru", true },
@@ -860,7 +837,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "666668722.com", true },
{ "6666yh.com", true },
{ "666omg.com", true },
- { "668da.com", true },
+ { "6677.us", true },
+ { "66b.com", true },
{ "66bwf.com", true },
{ "670422.com", true },
{ "671422.com", true },
@@ -869,6 +847,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "676422.com", true },
{ "679422.com", true },
{ "680422.com", true },
+ { "68277.me", true },
{ "686848.com", true },
{ "690422.com", true },
{ "691422.com", true },
@@ -882,13 +861,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "6997896.com", true },
{ "69butterfly.com", true },
{ "69fps.gg", true },
+ { "69wasted.net", true },
{ "6lo.zgora.pl", true },
{ "6pm.com", true },
{ "6t-montjoye.org", true },
- { "6w6.la", true },
{ "700.az", true },
{ "704233.com", true },
{ "704533.com", true },
+ { "7045h.com", true },
{ "704633.com", true },
{ "712433.com", true },
{ "713433.com", true },
@@ -909,23 +889,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "736433.com", true },
{ "738433.com", true },
{ "739433.com", true },
+ { "73info.com", false },
{ "740833.com", true },
{ "741833.com", true },
{ "742833.com", true },
{ "743833.com", true },
{ "74th.jp", true },
{ "755k3.com", true },
- { "7570.com", true },
{ "762.ch", true },
- { "772244.net", true },
{ "7733445.com", true },
{ "7777yh.com", true },
{ "777coin.com", true },
- { "778da.com", true },
{ "783lab.com", true },
{ "787k3.com", true },
{ "7885765.com", true },
- { "788da.com", true },
{ "7891553.com", true },
{ "7891997.com", true },
{ "7careconnect.com", true },
@@ -940,7 +917,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "7thcircledesigns.com", true },
{ "7trade8.com", true },
{ "7x24servis.com", true },
- { "80036.com", true },
{ "804322.com", true },
{ "8086.cf", true },
{ "809422.com", true },
@@ -949,6 +925,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "8189196.com", true },
{ "818bwf.com", true },
{ "818da.com", true },
+ { "8349822.com", true },
{ "8522.com", true },
{ "8522club.com", true },
{ "8522hk.com", true },
@@ -972,33 +949,95 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "8818k3.com", true },
{ "8833445.com", true },
{ "88522am.com", true },
+ { "887.ag", true },
+ { "8876007.com", true },
+ { "8876008.com", true },
+ { "8876009.com", true },
+ { "8876138.com", true },
+ { "8876205.com", true },
+ { "8876278.com", true },
+ { "8876289.com", true },
+ { "8876290.com", true },
+ { "8876353.com", true },
+ { "8876389.com", true },
+ { "8876520.com", true },
+ { "8876578.com", true },
+ { "8876598.com", true },
+ { "8876655.com", true },
+ { "8876660.com", true },
+ { "8876687.com", true },
+ { "8876764.com", true },
+ { "8876770.com", true },
+ { "8876775.com", true },
+ { "8876776.com", true },
+ { "8876779.com", true },
+ { "8876808.com", true },
+ { "8876818.com", true },
+ { "8876822.com", true },
+ { "8876832.com", true },
+ { "8876835.com", true },
+ { "8876838.com", true },
+ { "8876858.com", true },
+ { "8876859.com", true },
+ { "8876866.com", true },
+ { "8876878.com", true },
+ { "8876879.com", true },
+ { "8876881.com", true },
+ { "8876882.com", true },
+ { "8876883.com", true },
+ { "8876898.com", true },
+ { "8876900.com", true },
+ { "8876955.com", true },
+ { "8876979.com", true },
+ { "8876987.com", true },
+ { "8876989.com", true },
+ { "8876991.com", true },
+ { "8876992.com", true },
+ { "8876996.com", true },
+ { "8880013.com", true },
+ { "8880021.com", true },
+ { "8880023.com", true },
+ { "8880025.com", true },
+ { "8880057.com", true },
+ { "8880059.com", true },
+ { "8880067.com", true },
+ { "8880083.com", true },
+ { "8880100.com", true },
+ { "8884553.com", true },
+ { "8886737.com", true },
+ { "8886739.com", true },
+ { "8886793.com", true },
+ { "8886806.com", true },
+ { "8886860.com", true },
{ "888888722.com", true },
{ "88889822.com", true },
- { "8888esb.com", true },
{ "8888yh.com", true },
+ { "8889457.com", true },
+ { "8889458.com", true },
+ { "8889466.com", true },
+ { "8889563.com", true },
+ { "8889709.com", true },
+ { "8889729.com", true },
+ { "8889792.com", true },
+ { "8889807.com", true },
+ { "8889809.com", true },
+ { "8889819.com", true },
+ { "8889870.com", true },
+ { "8889881.com", true },
+ { "8889890.com", true },
+ { "8889893.com", true },
+ { "8889903.com", true },
+ { "8889910.com", true },
{ "888bwf.com", true },
- { "888msc.vip", true },
+ { "888funcity.com", true },
+ { "888funcity.net", true },
{ "88bwf.com", true },
- { "8901178.com", true },
- { "8901178.net", true },
- { "8910899.com", true },
- { "8910899.net", true },
- { "8917168.com", true },
- { "8917168.net", true },
- { "8917818.com", true },
- { "8917818.net", true },
- { "8951889.com", true },
- { "8951889.net", true },
- { "8992088.com", true },
- { "8992088.net", true },
{ "8ack.de", true },
{ "8ackprotect.com", true },
{ "8da188.com", true },
- { "8da2017.com", true },
{ "8da222.com", true },
{ "8da88.com", true },
{ "8da999.com", true },
- { "8dabet.com", true },
{ "8hrs.net", true },
{ "8maerz.at", true },
{ "8pecxstudios.com", true },
@@ -1008,15 +1047,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "8tech.com.hk", true },
{ "8thportsmouth.org.uk", true },
{ "8tuffbeers.com", true },
- { "8ung.online", true },
- { "8xx.bet", true },
- { "8xx.io", true },
- { "8xx888.com", true },
{ "8xxbet.net", true },
{ "9-11commission.gov", true },
{ "903422.com", true },
{ "905422.com", true },
{ "90r.jp", true },
+ { "910kj.com", true },
{ "9118.com", true },
{ "911commission.gov", true },
{ "912422.com", true },
@@ -1045,11 +1081,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "949122.com", true },
{ "949622.com", true },
{ "949722.com", true },
- { "9500years.com", true },
{ "9679693.com", true },
{ "9681909.com", true },
- { "9696178.com", true },
- { "9696178.net", true },
{ "972422.com", true },
{ "9788876.com", true },
{ "97bros.com", true },
@@ -1073,7 +1106,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "99999822.com", true },
{ "999998722.com", true },
{ "99rst.org", true },
- { "9bingo.net", true },
{ "9farm.com", true },
{ "9fvip.net", true },
{ "9jajuice.com", true },
@@ -1082,39 +1114,39 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "9tolife.be", true },
{ "9uelle.jp", true },
{ "9vx.org", true },
- { "9won.kr", true },
{ "9y.at", true },
{ "9yw.me", true },
{ "a-1basements.com", true },
{ "a-1indianawaterproofing.com", true },
{ "a-allard.be", true },
{ "a-classinflatables.co.uk", true },
+ { "a-invest.de", true },
{ "a-little-linux-box.at", true },
{ "a-msystems.com", true },
{ "a-oben.org", true },
{ "a-starbouncycastles.co.uk", true },
{ "a-wife.net", true },
+ { "a-ztransmission.com", true },
{ "a0print.nl", true },
{ "a1bouncycastlehire.com", true },
{ "a1jumpandbounce.co.uk", true },
{ "a1moldsolutions.com", true },
{ "a1scuba.com", true },
{ "a1scubastore.com", true },
- { "a2a.net", true },
{ "a2nutrition.com.au", true },
{ "a3.pm", true },
{ "a4sound.com", true },
{ "a632079.me", true },
{ "a7m2.me", true },
+ { "a8q.org", true },
{ "aa-tour.ru", true },
{ "aa1718.net", true },
- { "aa43d.cn", true },
- { "aa6688.net", true },
{ "aaapl.com", true },
{ "aabanet.com.br", true },
{ "aaben-bank.dk", true },
{ "aabenbank.dk", true },
{ "aacfree.com", true },
+ { "aaex.cloud", true },
{ "aagetransport.no", true },
{ "aalalbayt.com", true },
{ "aalalbayt.net", true },
@@ -1127,11 +1159,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aanmpc.com", true },
{ "aaomidi.com", true },
{ "aapas.org.ar", true },
- { "aardvarksolutions.co.za", true },
{ "aarkue.eu", true },
- { "aaron.cm", false },
+ { "aaron.cm", true },
{ "aaron.xin", true },
+ { "aaronburt.co.uk", true },
{ "aaronhorler.com", true },
+ { "aaronhorler.com.au", true },
{ "aaronkimmig.de", true },
{ "aaronroyle.com", true },
{ "aaronsilber.me", true },
@@ -1139,15 +1172,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aati.be", true },
{ "aati.info", true },
{ "aavienna.com", true },
- { "ab-bauservice-berlin.de", true },
{ "abaapplianceservice.com", true },
{ "abaaustin.com", true },
- { "abacus-events.co.uk", true },
{ "abacusbouncycastle.co.uk", true },
{ "abacustech.co.jp", true },
- { "abacustech.net", true },
- { "abacustech.org", true },
{ "abandonedmines.gov", true },
+ { "abateroad66.it", true },
{ "abbadabbabouncycastles.co.uk", true },
{ "abbas.ch", true },
{ "abborsjo.fi", true },
@@ -1159,6 +1189,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "abc.li", true },
{ "abcbouncycastlessurrey.co.uk", true },
{ "abcbouncyfactory.co.uk", true },
+ { "abcdef.be", true },
{ "abcheck.se", true },
{ "abckam.com", true },
{ "abcpartyhire.com", true },
@@ -1172,7 +1203,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "abeestrada.com", false },
{ "abeilles-idapi.fr", true },
{ "abenteuer-ahnenforschung.de", true },
- { "aberdeenalmeras.com", true },
{ "aberdeencastles.co.uk", true },
{ "aberdeenjudo.co.uk", true },
{ "abeus.com", true },
@@ -1185,9 +1215,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "abilitycaresoftware.com", true },
{ "abilitynet.org.uk", true },
{ "abilityone.gov", true },
+ { "abilma.com", true },
{ "abilymp06.net", true },
{ "abimelec.com", true },
{ "abinyah.com", true },
+ { "abitidalavoro.roma.it", true },
{ "abitur97ag.de", true },
{ "abiturma.de", true },
{ "ablak-nyilaszaro.info", true },
@@ -1234,14 +1266,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aboutyou.ch", true },
{ "aboutyou.de", true },
{ "aboutyou.nl", true },
+ { "aboveaverageplumbing.com", true },
{ "abox-kb.com", true },
{ "abpis.hr", true },
+ { "abracadabra.co.jp", false },
{ "abrakidabra.com.br", true },
- { "abraxan.pro", true },
{ "abrilect.com", true },
{ "abristolgeek.co.uk", true },
{ "abseits.org", true },
{ "absolem.cc", true },
+ { "absoluteautobody.com", true },
{ "absolutedouble.co.uk", true },
{ "absolutehaitian.com", true },
{ "absolutehosting.co.za", true },
@@ -1261,13 +1295,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "abvlbasketviganello.ch", true },
{ "abyssproject.net", true },
{ "ac-admin.pl", true },
- { "ac-epmservices.com", true },
{ "ac-town.com", true },
{ "ac0g.dyndns.org", true },
{ "aca-creative.co.uk", true },
{ "academicexperts.us", true },
{ "academichealthscience.net", true },
{ "academie-de-police.ch", true },
+ { "academkin.com", true },
{ "academytv.com.au", true },
{ "acaeum.com", true },
{ "acampar.com.br", true },
@@ -1276,7 +1310,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "acareer.in", true },
{ "acat.io", true },
{ "acbrussels-used.be", true },
- { "accadoro.it", true },
{ "accelaway.com", true },
{ "acceleratenetworks.com", true },
{ "accelerateyourworld.org", true },
@@ -1300,8 +1333,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "accpodcast.com", true },
{ "accredit.ly", true },
{ "accudraftpaintbooths.com", true },
+ { "accurateautobodywa.com", true },
{ "accuritconsulting.com", true },
{ "accuritpresence.com", true },
+ { "accutint.com", true },
{ "ace.media", true },
{ "ace.one", true },
{ "acealters.com", true },
@@ -1329,6 +1364,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "acheter-ethylotest.fr", true },
{ "achromatisch.de", true },
{ "achterblog.de", true },
+ { "achterstieg.dedyn.io", true },
{ "achtzehn.eu", true },
{ "achtzehnterachter.de", true },
{ "achtzig20.de", true },
@@ -1339,6 +1375,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aciksite.com", true },
{ "ackermann.ch", true },
{ "ackis.duckdns.org", false },
+ { "acklandstainless.com.au", true },
+ { "acl.gov", true },
{ "aclu.org", false },
{ "acluva.org", false },
{ "acme.beer", true },
@@ -1357,10 +1395,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "acoustics.network", true },
{ "acoustics.tech", true },
{ "acoustique-tardy.com", true },
+ { "acpcoils.com", true },
{ "acperu.ch", true },
{ "acquisition.gov", true },
{ "acquistareviagragenericoitalia.net", true },
- { "acraft.org", true },
{ "acrealux.lu", true },
{ "acrepairgeorgetown.com", true },
{ "acrepairhutto.com", true },
@@ -1377,13 +1415,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "acs-chantal.com", true },
{ "acsbbs.org", true },
{ "acsc.gov.au", true },
+ { "acscbasket.com", true },
{ "acsemb.org", true },
- { "acsihostingsolutions.com", true },
{ "acsports.ca", true },
{ "actc.org.uk", true },
{ "actc81.fr", true },
{ "actgruppe.de", true },
{ "actiefgeld.nl", true },
+ { "actioncleaningnd.com", true },
{ "actionlabs.net", true },
{ "actionmadagascar.ch", true },
{ "actionsack.com", true },
@@ -1393,6 +1432,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "active-tluszcz.pl", true },
{ "active.hu", false },
{ "activecare-monitor.com", true },
+ { "activeexcavator.com", true },
{ "activehire.co.uk", true },
{ "activeleisure.ie", true },
{ "activeworld.net", false },
@@ -1404,7 +1444,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "actors-cafe.net", true },
{ "actorsroom.com", true },
{ "actserv.co.ke", true },
+ { "actualadmins.com", true },
+ { "actualidadecommerce.com", true },
+ { "actualidadgadget.com", true },
{ "actualidadiphone.com", true },
+ { "actualidadkd.com", true },
{ "actualidadmotor.com", true },
{ "acuica.co.uk", false },
{ "acul.me", true },
@@ -1412,8 +1456,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "acus.gov", true },
{ "acwcerts.co.uk", true },
{ "acwi.gov", true },
+ { "acy.com", true },
{ "acyfxasia.com", true },
- { "acyume.com", true },
{ "ad-notam.asia", true },
{ "ad-notam.ch", true },
{ "ad-notam.co.uk", true },
@@ -1423,7 +1467,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ad-notam.it", true },
{ "ad-notam.pt", true },
{ "ad-notam.us", true },
- { "ad13.in", true },
{ "ada.gov", true },
{ "adalis.org", true },
{ "adam-ant.co.uk", true },
@@ -1443,6 +1486,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "adams.dk", true },
{ "adamsbouncycastles.co.uk", true },
{ "adamstas.com", true },
+ { "adamwallington.co.uk", true },
{ "adamwilcox.org", true },
{ "adamyuan.xyz", true },
{ "adapt-elektronik.com", true },
@@ -1450,6 +1494,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "adaptablesecurity.org", true },
{ "adapti.de", true },
{ "adaptivemechanics.edu.au", true },
+ { "adarshthapa.in", true },
{ "adawolfa.cz", true },
{ "adayinthelifeof.nl", true },
{ "adblockextreme.com", true },
@@ -1458,7 +1503,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "addeekt.com", true },
{ "adderall.space", true },
{ "addicional.com", true },
+ { "addictionresource.com", true },
{ "addictively.com", true },
+ { "addiko.net", true },
{ "addisoncrump.info", true },
{ "addnine.com", true },
{ "addon.watch", true },
@@ -1471,6 +1518,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "adentalsolution.com", true },
{ "adept.org.pl", true },
{ "adesa.co.uk", true },
+ { "adevel.eu", true },
{ "adex.network", true },
{ "adf-safetytools.com", true },
{ "adftrasporti.it", true },
@@ -1499,15 +1547,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "administratorserwera.pl", true },
{ "adminlinux.pl", true },
{ "admino.cz", true },
+ { "admins.tech", true },
{ "adminwiki.fr", true },
+ { "admirable.one", true },
{ "admody.com", true },
{ "admongo.gov", true },
+ { "adnanoktar.com", true },
{ "adnanotoyedekparca.com", true },
{ "adnot.am", true },
{ "adnseguros.es", true },
{ "adonnante.com", true },
{ "adoptionlink.co.uk", true },
- { "adora-illustrations.fr", true },
{ "adorade.ro", true },
{ "adorai.tk", true },
{ "adorecricket.com", true },
@@ -1538,11 +1588,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "adurra.com", true },
{ "aduvi.de", true },
{ "advance.hr", true },
+ { "advanced-fleet-services.com", true },
{ "advanced-scribes.com", true },
{ "advanced.info", true },
+ { "advanceddieselspokane.com", true },
{ "advancedoneroofing.com", true },
{ "advancedprotectionkey.com", true },
{ "advancedprotectionsecuritykey.com", true },
+ { "advancedsurgicalconsultantsllc.com", true },
{ "advancedweb.hu", true },
{ "advanceworx.com", true },
{ "advancis.net", true },
@@ -1552,6 +1605,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "advelty.cz", true },
{ "advenacs.com.au", true },
{ "advenapay.com", true },
+ { "adventaholdings.com", true },
{ "advento.bg", true },
{ "adventure-inn.com", true },
{ "adventureally.com", true },
@@ -1562,12 +1616,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "adventurenow.nl", true },
{ "adventures.de", true },
{ "adventureswithlillie.ca", true },
+ { "advertis.biz", true },
{ "advicepro.org.uk", true },
{ "advocate-europe.eu", true },
{ "advocaten-avocats.be", true },
{ "advocatenalkmaar.org", true },
{ "advocator.ca", true },
{ "advocoeurdehaan.nl", true },
+ { "advogatech.com.br", true },
{ "advokat-romanov.com", true },
{ "advtran.com", true },
{ "adware.pl", true },
@@ -1593,11 +1649,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ae-construction.co.uk", true },
{ "aebian.org", true },
{ "aecexpert.fr", true },
+ { "aefcleaning.com", true },
{ "aegee-utrecht.nl", true },
{ "aegisalarm.co.uk", true },
{ "aegisalarm.com", true },
{ "aegisalarms.co.uk", true },
{ "aegisalarms.com", true },
+ { "aegisinsight.com", true },
{ "aegrel.ee", true },
{ "aehe.us", true },
{ "aei.co.uk", true },
@@ -1613,7 +1671,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aero-pioneer.com", true },
{ "aerobasegroup.com", true },
{ "aerobotz.com", true },
+ { "aeronautix.com", true },
+ { "aeropole.de", true },
+ { "aeropole.eu", true },
{ "aerosimexperience.com", true },
+ { "aerospace-schools.com", true },
+ { "aerotechcoatings.com", true },
{ "aertel.ie", true },
{ "aessencia.com.br", true },
{ "aestheticdr.org", true },
@@ -1621,7 +1684,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aestore.by", true },
{ "aeternus.tech", true },
{ "aetherc0r3.eu", true },
- { "aethonan.pro", true },
{ "aetoscg.com", true },
{ "aetoscg.com.au", true },
{ "aextron.com", true },
@@ -1642,9 +1704,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "affinitysync.com", true },
{ "affissioni.roma.it", true },
{ "affittacamere.roma.it", true },
+ { "affittialmare.it", true },
+ { "affloc.com", true },
{ "affordableazdivorce.com", true },
{ "affordableblindsexpress.com", true },
{ "affordableenergyadvocates.com", true },
+ { "affordableenvironmental.net", true },
{ "affordablehealthquotesforyou.com", true },
{ "affordablekilimanjaro.com", true },
{ "affordablemudjacking.com", true },
@@ -1677,6 +1742,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "afterskool.eu", true },
{ "afuh.de", true },
{ "afva.net", true },
+ { "afwd.international", true },
{ "afzco.asia", true },
{ "ag-websolutions.de", true },
{ "ag8-game.com", true },
@@ -1684,8 +1750,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "agamsecurity.ch", true },
{ "agatajanik.de", true },
{ "agate.pw", true },
+ { "age.hk", true },
{ "agechecker.net", true },
{ "ageg.ca", true },
+ { "agemfis.com", true },
{ "agenceklic.com", true },
{ "agencewebstreet.com", true },
{ "agenciadeempregosdourados.com.br", true },
@@ -1702,11 +1770,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "agglo-sion.ch", true },
{ "aggr.pw", true },
{ "agia.ad", true },
- { "agiairini.cz", true },
{ "agiapelagia.com", true },
{ "agic-geneve.ch", true },
+ { "agic.io", true },
{ "agilebits.com", true },
- { "agilebits.net", false },
{ "agilecraft.com", true },
{ "agileui.com", true },
{ "agiley.se", true },
@@ -1738,7 +1805,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "agouralighting.com", true },
{ "agouraoutdoorlighting.com", true },
{ "agr.asia", true },
- { "agracan.com", true },
{ "agrajag.nl", true },
{ "agrarking.de", true },
{ "agrarshop4u.de", true },
@@ -1746,23 +1812,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "agreor.com", true },
{ "agrichamber.com.ua", true },
{ "agricolo.ch", true },
+ { "agriculture-schools.com", true },
{ "agridir.site", true },
{ "agrilinks.org", true },
{ "agrios.de", true },
{ "agro-forestry.net", true },
{ "agroline.by", true },
- { "agroxxi.ru", true },
+ { "agroxxi.ru", false },
{ "agsb.ch", true },
{ "agscinemas.com", true },
{ "agscinemasapp.com", true },
{ "agung-furniture.com", true },
{ "agwa.name", true },
+ { "agy.cl", true },
+ { "ahawkesrealtors.com", true },
{ "ahd.com", false },
{ "aheng.me", true },
{ "ahero4all.org", true },
- { "ahiru3.com", true },
- { "ahkubiak.ovh", true },
{ "ahlaejaba.com", true },
+ { "ahlz.sk", true },
{ "ahmad.works", true },
{ "ahmadly.com", true },
{ "ahmedabadflowermall.com", true },
@@ -1778,12 +1846,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ai-english.jp", true },
{ "ai-soft.co.jp", true },
{ "ai.gov", true },
+ { "ai.je", true },
{ "aia.de", true },
{ "aibenzi.com", true },
+ { "aibiying.com", true },
{ "aicial.co.uk", true },
{ "aidanapple.com", true },
+ { "aidanmitchell.co.uk", true },
+ { "aidanmitchell.uk", true },
{ "aidanmontare.net", true },
- { "aide-valais.ch", true },
{ "aiden.link", true },
{ "aidhan.net", true },
{ "aids.gov", true },
@@ -1801,29 +1872,36 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aikenpromotions.com", true },
{ "aiki.de", true },
{ "aiki.do", true },
+ { "aiki.tk", true },
{ "aikido-club-limburg.de", true },
{ "aikido-kiel.de", true },
{ "aikido-linz.at", true },
{ "aikido-wels.at", true },
+ { "ailitonia.com", true },
+ { "ailitonia.xyz", true },
{ "aimax.com", true },
{ "aimeeandalec.com", true },
{ "aimgroup.co.tz", true },
+ { "aimi-salon.com", true },
{ "aimotive.com", true },
{ "aimstoreglobal.com", true },
{ "aintevenmad.ch", true },
+ { "ainutrition.co.uk", true },
+ { "ainvest.de", true },
{ "aiois.com", true },
{ "aipbarcelona.com", true },
{ "air-craftglass.com", true },
{ "air-shots.ch", true },
{ "air-we-go.co.uk", true },
- { "airbly.com", true },
{ "airbnb.ae", true },
{ "airbnb.at", true },
{ "airbnb.be", true },
+ { "airbnb.biz", true },
{ "airbnb.ca", true },
{ "airbnb.cat", true },
{ "airbnb.ch", true },
{ "airbnb.cl", true },
+ { "airbnb.cn", true },
{ "airbnb.co.cr", true },
{ "airbnb.co.id", true },
{ "airbnb.co.il", true },
@@ -1838,6 +1916,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "airbnb.com.bo", true },
{ "airbnb.com.br", true },
{ "airbnb.com.bz", true },
+ { "airbnb.com.cn", true },
{ "airbnb.com.co", true },
{ "airbnb.com.ec", true },
{ "airbnb.com.gt", true },
@@ -1880,10 +1959,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "airbnb.pt", true },
{ "airbnb.ru", true },
{ "airbnb.se", true },
+ { "airbnb.tools", true },
+ { "airbnbchina.cn", true },
{ "airbnbopen.com", true },
{ "airborne-inflatables.co.uk", true },
+ { "airbossofamerica.com", true },
{ "airclass.com", true },
{ "aircomms.com", true },
+ { "airconssandton.co.za", true },
{ "airductclean.com", false },
{ "airductcleaning-fresno.com", true },
{ "airductcleaninggrandprairie.com", true },
@@ -1893,26 +1976,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "airetvie.com", true },
{ "airhart.me", true },
{ "airhelp.com", true },
- { "airhorn.de", true },
{ "airi-tabei.com", true },
- { "airicy.com", true },
{ "airikai.com", true },
- { "airlinesettlement.com", true },
+ { "airlibre-parachutisme.com", true },
{ "airmail.cc", true },
{ "airmaxinflatables.com", true },
{ "airnow.gov", true },
{ "airpbx.com", true },
{ "airplay-inflatable-hire.co.uk", true },
{ "airplayradio.nl", true },
- { "airportlimototoronto.com", true },
{ "airpurifierproductsonline.com", true },
{ "airrestoration.ch", true },
+ { "airsnore.com", true },
{ "airsoft.ch", true },
+ { "airswap.io", true },
{ "airtimerewards.co.uk", true },
+ { "airtoolaccessoryo.com", true },
{ "airvpn.org", true },
{ "airvuz.com", true },
+ { "airwaystorage.net", true },
+ { "airweb.top", true },
{ "airwegobouncycastles.co.uk", true },
{ "airwolfthemes.com", true },
+ { "airwrenchei.com", true },
{ "ais.fashion", true },
{ "aisance-co.com", true },
{ "aisi316l.net", true },
@@ -1926,7 +2012,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aivd.lol", true },
{ "aiwdirect.com", true },
{ "aixvox.com", false },
- { "aixxe.net", true },
{ "aizxxs.com", true },
{ "aizxxs.net", true },
{ "ajapaik.ee", true },
@@ -1948,6 +2033,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aka.ms", true },
{ "akachanikuji.com", true },
{ "akademeia.moe", true },
+ { "akademie-frankfurt.de", true },
{ "akalashnikov.ru", true },
{ "akamon.ac.jp", true },
{ "akaoma.com", true },
@@ -1957,15 +2043,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "akelius.de", false },
{ "akfoundationindia.com", true },
{ "akhealthconnection.com", true },
+ { "akhomesforyou.com", true },
{ "akihito.com", true },
{ "akijo.de", true },
{ "akilli-devre.com", true },
{ "akita-boutique.com", true },
{ "akiym.com", true },
+ { "akj.io", true },
{ "akkbouncycastles.co.uk", true },
- { "akkeylab.com", true },
{ "akostecki.de", true },
{ "akovana.com", true },
+ { "akoww.de", false },
{ "akoya.fi", true },
{ "akplates.org", true },
{ "akpwebdesign.com", true },
@@ -1978,6 +2066,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aksehir.bel.tr", true },
{ "akselinurmio.fi", false },
{ "akshi.in", true },
+ { "aktin.cz", true },
+ { "aktin.sk", true },
{ "aktiv-naturheilmittel.at", true },
{ "aktiv-naturheilmittel.ch", true },
{ "aktiv-naturheilmittel.de", true },
@@ -1989,6 +2079,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "akustik.tech", true },
{ "akutun.cl", true },
{ "akvorrat.at", true },
+ { "al3366.tech", true },
{ "al3xpro.com", true },
{ "alab.space", true },
{ "alabamadebtrelief.org", true },
@@ -2022,27 +2113,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "albbounce.co.uk", true },
{ "albersdruck.de", true },
{ "albertathome.org", true },
- { "albertbogdanowicz.pl", true },
+ { "albertcuyp-markt.amsterdam", true },
{ "albertinum-goettingen.de", true },
+ { "albinma.com", true },
{ "albion2.org", true },
{ "alboweb.nl", true },
+ { "albrocar.com", true },
{ "albuic.tk", true },
{ "alca31.com", true },
{ "alchimic.ch", true },
{ "alcnutrition.com", true },
{ "alco-united.com", true },
{ "alcoholapi.com", true },
+ { "alcolecapital.com", true },
{ "aldiabcs.com", true },
{ "aldien.com.br", true },
{ "aldo-vandini.de", true },
{ "aldomedia.com", true },
{ "aldorr.net", false },
{ "aldous-huxley.com", true },
- { "aldred.cloud", true },
{ "alecpap.com", true },
{ "alecpapierniak.com", true },
{ "alecrust.com", true },
- { "aledg.cl", true },
{ "alek.in", true },
{ "aleksejjocic.tk", true },
{ "aleksib.fi", true },
@@ -2053,8 +2145,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alerts.sg", true },
{ "alertwire.com", true },
{ "alesia-formation.fr", true },
- { "alessandroonline.com.br", true },
- { "alessandroz.ddns.net", true },
{ "aletm.it", true },
{ "alex-ross.co.uk", true },
{ "alex97000.de", true },
@@ -2068,19 +2158,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alexandrastorm.com", true },
{ "alexandrastylist.com", true },
{ "alexandre-blond.fr", true },
+ { "alexandros.io", true },
{ "alexbaker.org", true },
{ "alexberts.ch", true },
{ "alexbresnahan.com", true },
{ "alexcoman.com", true },
{ "alexdaniel.org", true },
{ "alexdaulby.com", true },
+ { "alexei.su", false },
{ "alexey-shamara.ru", true },
{ "alexeykopytko.com", true },
{ "alexgaynor.net", true },
- { "alexgebhard.com", true },
{ "alexhd.de", true },
{ "alexio.ml", true },
{ "alexisabarca.com", true },
+ { "alexiskoustoulidis.com", true },
{ "alexkott.com", true },
{ "alexlouden.com", true },
{ "alexmerkel.com", true },
@@ -2106,6 +2198,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alfa-tech.su", true },
{ "alfaperfumes.com.br", true },
{ "alftrain.com", true },
+ { "algeriepart.com", true },
{ "alghanimcatering.com", true },
{ "algoaware.eu", true },
{ "algoentremanos.com", true },
@@ -2117,26 +2210,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aliantsoft.pl", true },
{ "aliaswp.com", true },
{ "alibangash.com", true },
+ { "alibiloungelv.com", true },
{ "alibip.de", true },
{ "alice-noutore.com", true },
{ "alice.tw", true },
{ "alicemaywebdesign.com.au", true },
{ "alicestudio.it", true },
{ "alicetone.net", true },
+ { "alieke.design", true },
{ "alienation.biz", true },
{ "alienflight.com", true },
{ "alienslab.net", true },
{ "alienstat.com", true },
+ { "alignrs.com", true },
{ "aliim.gdn", true },
{ "alijammusic.com", true },
+ { "alikulov.me", true },
{ "alinasmusicstudio.com", true },
- { "alinode.com", true },
+ { "alinbu.net", true },
{ "aliorange.com", true },
+ { "alis-test.tk", true },
{ "alisonisrealestate.com", true },
{ "alisonlitchfield.com", true },
- { "alistairholland.me", true },
{ "alistairstowing.com", true },
- { "alisync.com", true },
{ "alix-board.de", true },
{ "alize-theatre.ch", true },
{ "aljaspod.com", true },
@@ -2144,6 +2240,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aljaspod.net", true },
{ "aljweb.com", true },
{ "all-connect.net", false },
+ { "all-fashion-schools.com", true },
{ "all-markup-news.com", true },
{ "all4hardware4u.de", true },
{ "allaboutfunuk.com", true },
@@ -2160,6 +2257,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "allbounceandplay.co.uk", true },
{ "allbouncesurrey.co.uk", true },
{ "allbrandbrand.com", true },
+ { "allbursaries.co.za", true },
{ "allbusiness.com", true },
{ "allcapa.org", true },
{ "allcarecorrectionalpharmacy.com", true },
@@ -2171,6 +2269,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alldigitalsolutions.com", true },
{ "alle.bg", true },
{ "allemobieleproviders.nl", true },
+ { "allemoz.com", true },
+ { "allemoz.fr", true },
{ "allenosgood.com", true },
{ "allenscaravans.co.uk", true },
{ "allensun.org", true },
@@ -2178,6 +2278,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alleskomtgoed.org", true },
{ "allesrocknroll.de", true },
{ "allforyou.at", true },
+ { "allfreelancers.su", false },
{ "allgaragefloors.com", true },
{ "allgreenturf.com.au", true },
{ "alliance-psychiatry.com", true },
@@ -2189,6 +2290,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "allincoin.shop", true },
{ "allinone-ranking150.com", true },
{ "allis.studio", true },
+ { "allius.de", true },
{ "alljamin.com", true },
{ "allmebel.ru", true },
{ "allmend-ru.de", true },
@@ -2202,6 +2304,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alloydevil.nl", true },
{ "allplayer.tk", true },
{ "allpointsblog.com", true },
+ { "allpointsheating.com", true },
{ "allproptonline.com", true },
{ "allroundpvp.net", true },
{ "allsaints.church", true },
@@ -2209,29 +2312,31 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "allshousedesigns.com", true },
{ "allstakesupply.com.au", true },
{ "allstarautokiaparts.com", true },
+ { "allstarcashforcars.com", true },
{ "allstarpartyinflatables.co.uk", true },
{ "allstarquilts.com", true },
{ "allsync.com", true },
{ "allsync.nl", true },
{ "allteach.co.uk", true },
+ { "allterrainfence.com", true },
{ "allthecryptonews.com", true },
{ "allthethings.co.nz", true },
{ "allthings.me", true },
{ "allthingssquared.com", true },
{ "allthingswild.co.uk", true },
- { "alltubedownload.net", true },
{ "allurescarves.com", true },
{ "alluvion.studio", true },
+ { "allvips.ru", true },
+ { "allweatherlandscaping.net", true },
{ "almaatlantica.com", true },
{ "almavios.com", true },
+ { "almorafestival.com", true },
+ { "almut-zielonka.de", true },
{ "aloesoluciones.com.ar", true },
{ "alohapartyevents.co.uk", true },
{ "alonetone.com", true },
- { "alorenzi.eu", true },
- { "alp.net.cn", true },
{ "alp.od.ua", true },
{ "alpca.org", true },
- { "alpe-d-or.dyn-o-saur.com", true },
{ "alpencam.com", true },
{ "alpencams.com", true },
{ "alpengreis.ch", true },
@@ -2256,7 +2361,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alphaman.ooo", true },
{ "alphapengu.in", true },
{ "alpharotary.com", true },
- { "alphasall.com", true },
+ { "alphasall.com", false },
{ "alphassl.de", true },
{ "alphatrash.de", true },
{ "alphavote-avex.com", true },
@@ -2265,6 +2370,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alphie.me", true },
{ "alphipneux.fr", true },
{ "alpinechaletrental.com", true },
+ { "alpinehighlandrealty.com", true },
{ "alpineplanet.com", true },
{ "alpinepubliclibrary.org", true },
{ "alpinestarmassage.com", true },
@@ -2275,6 +2381,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alrait.com", true },
{ "alroniks.com", true },
{ "als-japan.com", true },
+ { "alstertouch.com", true },
+ { "alstertouch.de", true },
{ "alstroemeria.org", true },
{ "alt-three.com", true },
{ "alt.org", true },
@@ -2309,12 +2417,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alvcs.com", true },
{ "alviano.com", true },
{ "alvicom.hu", true },
- { "alvis-audio.com", true },
{ "alvosec.com", true },
{ "alwaysdry.com.au", true },
{ "alwayslookingyourbest.com", true },
{ "alwaysmine.fi", true },
{ "alwaysonssl.com", true },
+ { "alxpresentes.com.br", true },
{ "alyoung.com", true },
{ "alza.at", true },
{ "alza.co.uk", true },
@@ -2325,12 +2433,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "alzashop.com", true },
{ "alzonaprinting.com", true },
{ "am-dd.com", true },
+ { "am-executive-consulting.com", true },
{ "am2s.fr", true },
{ "am3.se", true },
{ "ama.ne.jp", true },
{ "amadvice.com", true },
{ "amaforro.com", true },
{ "amagdic.com", true },
+ { "amagical.net", false },
{ "amaiz.com", true },
{ "amalfi5stars.com", true },
{ "amalficoastchauffeur.com", true },
@@ -2338,15 +2448,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "amalfipositanoboatrental.com", true },
{ "amalfirock.it", true },
{ "amalfitabula.it", true },
+ { "amandadamsphotography.com", true },
{ "amandasage.ca", true },
{ "amani-kinderdorf.de", true },
{ "amaresq.com", true },
+ { "amartinz.at", true },
{ "amateurchef.co.uk", true },
{ "amateurradionotes.com", true },
+ { "amateurvoicetalent.com", true },
{ "amati.solutions", true },
{ "amato.tk", true },
{ "amatsuka.com", true },
{ "amauf.de", true },
+ { "amautorepairwa.com", true },
{ "amazili-communication.com", true },
{ "amazing-castles.co.uk", true },
{ "amazinginflatables.co.uk", true },
@@ -2377,9 +2491,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "amend-friseur-schwabing.de", true },
{ "america.gov", true },
{ "americafamilylawcenter.org", true },
+ { "american-school-search.com", true },
{ "american.dating", true },
{ "americandetour.com", true },
{ "americanfoundationbr.com", true },
+ { "americanindiannursing.com", true },
{ "americanmediainstitute.com", true },
{ "americasbasementcontractor.com", true },
{ "americkykongres.cz", true },
@@ -2388,7 +2504,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "amerika-forum.de", true },
{ "amerimarkdirect.com", true },
{ "amerimex.cc", true },
- { "ameschristian.net", true },
{ "amesgen.de", true },
{ "amesvacuumrepair.com", true },
{ "amethystdevelopment.co.uk", true },
@@ -2397,8 +2512,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "amg-microwave.com", true },
{ "amh-entertainments.co.uk", true },
{ "ami-de-bastanes.fr", true },
+ { "amica-travel.com", true },
{ "amicalecanyon.ch", true },
{ "amiciidogrescue.org.uk", true },
+ { "amicimar.it", true },
{ "amielucha.com", true },
{ "amifoundation.net", true },
{ "amikootours.com", true },
@@ -2409,7 +2526,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "amirautos.com", true },
{ "amirmahdy.com", true },
{ "amisderodin.fr", true },
- { "amisharingstuff.com", true },
{ "amitabhsirkiclasses.org.in", true },
{ "amitpatra.com", true },
{ "amiu.org", true },
@@ -2442,6 +2558,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "amyrussellhair.com", true },
{ "amyyeung.com", true },
{ "amzn.rocks", true },
+ { "anabolic.co", true },
{ "anacreon.de", true },
{ "anadiyogacentre.com", true },
{ "anaethelion.fr", true },
@@ -2465,11 +2582,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "anarchyrp.life", true },
{ "anassiriphotography.com", false },
{ "anastasia-shamara.ru", true },
+ { "anaveragehuman.eu.org", true },
{ "ancestramil.fr", true },
{ "anchev.net", true },
+ { "anchorit.gov", true },
{ "anchovy.nz", false },
{ "ancientcraft.eu", true },
- { "ancientnorth.com", true },
{ "ancientnorth.nl", true },
{ "ancolies-andre.com", true },
{ "anconaswine.com", true },
@@ -2486,6 +2604,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "andisadhdspot.com", true },
{ "andiscyber.space", true },
{ "anditi.com", true },
+ { "andoms.fi", true },
{ "andre-ballensiefen.de", true },
{ "andre-lategan.com", true },
{ "andre-otto.com", true },
@@ -2503,22 +2622,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "andreashecht-blog.de", true },
{ "andreaskrasa.com", true },
{ "andreaslicht.nl", true },
+ { "andreasmuelhaupt.de", true },
{ "andreasolsson.se", true },
- { "andreasr.com", true },
{ "andree.cloud", true },
{ "andrefaber.nl", true },
{ "andrehansen.de", true },
{ "andrei-nakov.org", true },
- { "andrejbenz.com", true },
+ { "andrejstefanovski.com", true },
+ { "andrelauzier.com", true },
{ "andreoliveira.io", true },
- { "andrepicard.de", true },
- { "andrespaz.com", true },
{ "andreundnina.de", true },
+ { "andrew.fi", true },
{ "andrew.london", true },
{ "andrewbdesign.com", true },
{ "andrewdaws.io", true },
{ "andrewensley.com", true },
- { "andrewhowden.com", true },
{ "andrewimeson.com", true },
{ "andrewin.ru", true },
{ "andrewmichaud.com", true },
@@ -2547,11 +2665,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "androticsdirect.com", true },
{ "andruvision.cz", true },
{ "andsat.org", true },
- { "andschwa.com", false },
+ { "andschwa.com", true },
{ "andso.cn", true },
+ { "anduril.de", true },
+ { "anduril.eu", true },
{ "andybrett.com", true },
{ "andyc.cc", true },
- { "andycloud.dynu.net", true },
{ "andycrockett.io", true },
{ "andymoore.info", true },
{ "andys-place.co.uk", true },
@@ -2564,6 +2683,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "anedot.xyz", true },
{ "aneebahmed.com", true },
{ "anegabawa.com", true },
+ { "aneslix.com", true },
{ "anetaben.nl", true },
{ "anextraordinaryday.net", true },
{ "ange-de-bonheur444.com", true },
@@ -2572,11 +2692,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "angelesydemonios.es", true },
{ "angelicare.co.uk", true },
{ "angelinahair.com", true },
+ { "angeljmadrid.com", true },
{ "angeloryndon.com", true },
{ "angelremigene.com", true },
{ "angelsgirl.eu.org", true },
{ "anginf.de", true },
- { "anglertanke.de", true },
{ "anglesgirl.eu.org", true },
{ "anglesya.win", true },
{ "anglictina-sojcak.cz", true },
@@ -2587,25 +2707,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "angristan.fr", true },
{ "angristan.xyz", true },
{ "angry.im", true },
- { "angrylab.com", true },
{ "angrysnarl.com", true },
- { "angryteeth.net", false },
- { "anguiao.com", true },
+ { "angryteeth.net", true },
{ "angularjs.org", false },
{ "angusmak.com", true },
{ "anhaffen.lu", true },
{ "ani-man.de", true },
- { "aniaimichal.eu", true },
{ "aniforprez.net", true },
- { "anim.ee", true },
{ "animacurse.moe", true },
{ "animaemundi.be", true },
{ "animal-liberation.com", true },
+ { "animal-nature-human.com", true },
{ "animal-rights.com", true },
{ "animalistic.io", true },
{ "animaltesting.fr", true },
{ "animan.ca", true },
- { "animatelluris.nl", true },
{ "animationsmusicales.ch", true },
{ "anime-culture.com", true },
{ "anime-rg.com", true },
@@ -2628,6 +2744,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "anjoola.com", true },
{ "ankarakart.com.tr", true },
{ "ankaraprofesyonelwebtasarim.com", true },
+ { "ankaraseo.name.tr", true },
{ "ankarauzmanlarnakliyat.com", true },
{ "ankarayilmaznakliyat.com", true },
{ "ankarayucelnakliyat.com", true },
@@ -2636,51 +2753,49 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ankiuser.net", true },
{ "ankiweb.net", true },
{ "ankwanoma.com", true },
- { "ankya9.com", true },
{ "anleitung-deutsch-lernen.de", true },
{ "anleitung-zum-flechten.de", true },
{ "anleitung-zum-haekeln.de", true },
{ "anleitung-zum-schreiben.de", true },
{ "anleitung-zum-schweissen.de", true },
{ "anleitung-zum-toepfern.de", true },
- { "anlp.top", true },
{ "anna.info", true },
{ "annaenemma.nl", true },
{ "annafiore.com.br", true },
- { "annangela.moe", true },
{ "annarokina.com", true },
{ "annasvapor.se", true },
{ "annawagner.pl", true },
{ "annedaniels.co.uk", true },
{ "annejan.com", true },
{ "anneliesonline.nl", true },
+ { "annema.biz", true },
{ "annemakeslovelycandles.co.uk", true },
{ "annetta.com", true },
{ "annettewindlin.ch", true },
{ "annevankesteren.nl", true },
- { "annicascakes.nl", true },
{ "anniversary-cruise.com", true },
{ "annmariewaltsphotography.com", true },
{ "annonasoftware.com", true },
{ "annotate.software", true },
{ "annoyingasfuk.com", true },
- { "annrusnak.com", true },
{ "annuaire-jcb.com", true },
{ "annuaire-photographe.fr", false },
+ { "annunciationbvmchurch.org", true },
{ "anohana.org", true },
{ "anojan.com", true },
{ "anon-next.de", true },
{ "anoncom.net", true },
- { "anoneko.com", false },
+ { "anoncrypto.org", true },
+ { "anoneko.com", true },
{ "anongoth.pl", true },
{ "anons.fr", true },
- { "anonukradio.org", true },
{ "anonym-surfen.de", true },
{ "anonyme-spieler.at", true },
{ "anorak.tech", true },
{ "another.ch", true },
{ "anotherchef.com", true },
{ "anotherfatgeek.net", true },
+ { "anothervps.com", true },
{ "anowicki.pl", false },
{ "anoxinon.de", false },
{ "ans-delft.nl", true },
@@ -2701,19 +2816,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "antani.cloud", true },
{ "antarcti.co", true },
{ "antarees.net", true },
+ { "antaresmedia.com.py", true },
{ "antarespc.com", true },
{ "antcas.com", true },
{ "antennista.catania.it", true },
{ "antennista.milano.it", true },
{ "antennista.pavia.it", true },
{ "antennista.roma.it", true },
+ { "antennista.tv", true },
{ "antennisti.milano.it", true },
{ "antennisti.roma.it", true },
{ "anteprima.info", true },
{ "anthedesign.fr", true },
{ "anthisis.tv", true },
{ "anthony.codes", true },
- { "anthonyaires.com", true },
{ "anthonycarbonaro.com", true },
{ "anthonyfontanez.com", true },
{ "anthonygaidot.fr", true },
@@ -2736,7 +2852,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "antispeciesism.com", true },
{ "antispeciesist.com", true },
{ "antivirusprotection.reviews", true },
- { "antocom.com", true },
{ "antoga.eu", true },
{ "antoinedeschenes.com", true },
{ "antoinemary.com", true },
@@ -2745,13 +2860,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "antonin.one", true },
{ "antonio-gartenbau.de", true },
{ "antonjuulnaber.dk", true },
- { "antons.io", true },
{ "antota.lt", true },
{ "antragsgruen.de", true },
{ "antroposofica.com.br", true },
+ { "antvklik.com", true },
{ "antyblokada.pl", true },
{ "anulowano.pl", true },
- { "anvartay.com", true },
+ { "anvartay.com", false },
+ { "anwalt.us", true },
{ "anwaltsindex.com", true },
{ "anxietyspace.com", true },
{ "anxiolytics.com", true },
@@ -2771,6 +2887,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aoadatacommunity.us", true },
{ "aoaprograms.net", true },
{ "aofusa.net", true },
+ { "aoil.gr", true },
{ "aoku3d.com", true },
{ "aopedeure.nl", true },
{ "aopsy.de", true },
@@ -2783,6 +2900,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "apac-tech.com", false },
{ "apache-portal.com", true },
{ "apachehaus.de", false },
+ { "apachelounge.com", true },
{ "apadvantage.com", true },
{ "aparaatti.org", true },
{ "apartmanicg.me", true },
@@ -2820,7 +2938,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "apiary.supplies", true },
{ "apiary.supply", true },
{ "apila.care", true },
- { "apila.us", true },
{ "apiled.io", true },
{ "apination.com", true },
{ "apio.systems", true },
@@ -2829,18 +2946,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "apisyouwonthate.com", true },
{ "apk.li", true },
{ "apk4fun.com", true },
+ { "apkmod.id", true },
{ "aplikaceproandroid.cz", true },
{ "aplpackaging.co.uk", true },
{ "aplu.fr", true },
{ "aplus-usa.net", true },
- { "apmpproject.org", true },
+ { "apluswaterservices.com", true },
{ "apn-dz.org", true },
{ "apn-einstellungen.de", true },
{ "apo-deutschland.biz", true },
{ "apobot.de", true },
{ "apogeephoto.com", true },
{ "apoil.org", true },
- { "apollyon.work", true },
{ "apoly.de", true },
{ "aponkral.net", true },
{ "aporia.io", true },
@@ -2863,9 +2980,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "appartementhaus-badria.de", true },
{ "appartementmarsum.nl", true },
{ "appchive.net", true },
- { "appcoins.io", true },
{ "appearance-plm.de", true },
{ "appel-aide.ch", true },
+ { "appelaprojets.fr", true },
{ "appelboomdefilm.nl", true },
{ "appengine.google.com", true },
{ "apperio.com", true },
@@ -2873,6 +2990,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "appgeek.com.br", true },
{ "appharbor.com", true },
{ "appify.org", true },
+ { "appinn.com", true },
{ "applelife.ru", true },
{ "applemon.com", true },
{ "appleoosa.com", true },
@@ -2889,7 +3007,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "apponline.com", true },
{ "apprank.in", true },
{ "apprenticeship.gov", true },
- { "apprenticeships.gov", true },
{ "approbo.com", true },
{ "approvedtreecare.com", true },
{ "apps.co", true },
@@ -2900,7 +3017,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "appscloudplus.com", true },
{ "appseccalifornia.org", false },
{ "appshuttle.com", true },
- { "appson.co.uk", false },
{ "appt.ch", true },
{ "apptomics.com", true },
{ "appuals.com", true },
@@ -2912,9 +3028,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "apretatuercas.es", true },
{ "aprikaner.de", true },
{ "aprogend.com.br", true },
- { "aproposcomputing.com", true },
{ "aprovpn.com", true },
- { "aprr.org", true },
{ "aprsdroid.org", true },
{ "aprz.de", true },
{ "apsa.paris", true },
@@ -2923,7 +3037,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "apu-board.de", true },
{ "apv-ollon.ch", true },
{ "aqdun.com", true },
- { "aqilacademy.com.au", true },
{ "aqsiq.net", true },
{ "aqua-fitness-nacht.de", true },
{ "aqua-fotowelt.de", true },
@@ -2950,11 +3063,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arab.dating", true },
{ "arabicxz.com", true },
{ "arachina.com", true },
+ { "arackiralama.name.tr", true },
{ "arados.de", true },
{ "arai21.net", true },
{ "araleeniken.com", true },
{ "aramado.com", true },
{ "aramido.de", true },
+ { "aranchhomes.com", true },
{ "aranycsillag.net", true },
{ "araraexpress.com.br", true },
{ "araratour.com", true },
@@ -2962,7 +3077,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "araseifudousan.com", true },
{ "arawaza.biz", true },
{ "arawaza.com", false },
- { "arawaza.info", true },
{ "araxis.com", true },
{ "arbeitsch.eu", true },
{ "arbeitskreis-asyl-eningen.de", true },
@@ -2970,14 +3084,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arbejdsdag.dk", true },
{ "arbitrarion.com", true },
{ "arbitrary.ch", true },
- { "arboleda-hurtado.com", true },
{ "arboworks.com", true },
+ { "arbu.eu", false },
{ "arcaik.net", true },
{ "arcbouncycastles.co.uk", true },
{ "arcenergy.co.uk", true },
{ "archimedicx.com", true },
{ "archined.nl", true },
{ "architectryan.com", true },
+ { "architecture-colleges.com", true },
+ { "architectureandgovernance.com", true },
{ "archivero.es", true },
{ "archivesdelavieordinaire.ch", true },
{ "archlinux.de", true },
@@ -2989,6 +3105,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arcueil-cachan.fr", false },
{ "arcusnova.de", true },
{ "arda-audio.pt", true },
+ { "ardor.noip.me", true },
{ "ardtrade.ru", true },
{ "area4pro.com", true },
{ "area536.com", true },
@@ -3003,21 +3120,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arenlor.com", true },
{ "arenlor.info", true },
{ "arenns.com", true },
- { "areqgaming.com", true },
{ "ares-trading.de", true },
+ { "arethsu.se", true },
{ "arfad.ch", true },
{ "arg.zone", true },
- { "argama-nature.com", true },
+ { "argama-nature.com", false },
{ "arganaderm.ch", true },
{ "argb.de", true },
{ "argekultur.at", true },
+ { "argonium.com.au", true },
{ "argot.com", true },
{ "argovpay.com", true },
- { "ariaartgallery.com", true },
{ "ariadermspa.com", true },
{ "arian.io", true },
+ { "ariana.wtf", true },
{ "arias.re", true },
{ "ariba.info", true },
+ { "ariege-pyrenees.net", true },
{ "arieswdd.com", true },
{ "arigato-java.download", true },
{ "arijitdg.net", true },
@@ -3027,12 +3146,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arinde.ee", true },
{ "arise19.com", true },
{ "arisevendor.net", true },
- { "aristilabs.com", true },
+ { "aristocrates.co", true },
{ "aritec-la.com", true },
{ "arivo.com.br", true },
- { "arizer.com", true },
{ "arizonaautomobileclub.com", true },
{ "arizonabondedtitle.com", true },
+ { "arizonahomeownerinsurance.com", true },
{ "arjandejong.eu", true },
{ "arjanvaartjes.net", true },
{ "arjunasdaughter.pub", true },
@@ -3045,19 +3164,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arlatools.com", true },
{ "arlen.tv", true },
{ "arlenarmageddon.com", true },
- { "arlet.click", true },
{ "arletalibrary.com", true },
- { "arm-host.com", true },
+ { "arlingtonelectric.com", true },
{ "armadaquadrat.com", true },
{ "armandsdiscount.com", true },
{ "armanozak.com", true },
{ "armansfinejewellery.com", true },
{ "armansfinejewellery.com.au", true },
{ "armarinhovirtual.com.br", true },
+ { "armazemgourmetbrasil.com.br", true },
{ "armbrust.me", true },
{ "armedpoet.com", true },
- { "armeni-jewellery.gr", true },
+ { "armeo.top", true },
{ "armil.it", true },
+ { "armin-cme.de", true },
+ { "armin-cpe.de", true },
{ "arminc.tk", true },
{ "arminpech.de", true },
{ "armleads.com", true },
@@ -3067,7 +3188,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "armyprodej.cz", true },
{ "arnaudb.net", true },
{ "arnaudfeld.de", true },
- { "arnaudminable.net", true },
{ "arne.codes", true },
{ "arnevankauter.com", true },
{ "arniescastles.co.uk", true },
@@ -3085,6 +3205,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arogov.com", true },
{ "arokha.com", true },
{ "aromacos.ch", true },
+ { "aromatlas.com", true },
{ "aron.host", true },
{ "aroonchande.com", true },
{ "aros.pl", true },
@@ -3106,14 +3227,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arrowheadflats.com", true },
{ "arrowwebprojects.nl", true },
{ "arschkrebs.org", true },
+ { "arsplus.ru", true },
+ { "arswb.men", true },
{ "art-auction.jp", true },
{ "art-et-culture.ch", true },
- { "artansoft.com", true },
{ "artboja.com", true },
{ "artdeco-photo.com", true },
+ { "arte-soft.co", true },
+ { "artea.ga", true },
+ { "arteaga.co.uk", true },
+ { "arteaga.eu", true },
+ { "arteaga.me", true },
+ { "arteaga.tech", true },
+ { "arteaga.uk", true },
+ { "arteaga.xyz", true },
{ "artecat.ch", true },
{ "artedellavetrina.it", true },
{ "artedona.com", true },
+ { "arteequipamientos.com.uy", true },
{ "artefakt.es", true },
{ "artefeita.com.br", true },
{ "arteinstudio.it", true },
@@ -3122,11 +3253,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "arterienundvenen.ch", true },
{ "arteshow.ch", true },
{ "artetrama.com", false },
+ { "artfabrics.com", true },
{ "artforum.sk", true },
{ "artfullyelegant.com", true },
{ "arthermitage.org", true },
+ { "arthur.cn", true },
{ "arthurlaw.ca", true },
{ "artificial.army", true },
+ { "artificialgrassandlandscaping.com", true },
{ "artik.cloud", true },
{ "artimpact.ch", true },
{ "artioml.net", true },
@@ -3188,18 +3322,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ascension.run", true },
{ "ascensori.biz", true },
{ "ascgathering.com", true },
- { "ascii.moe", true },
+ { "asciitable.tips", true },
{ "asciiwwdc.com", true },
{ "asd.gov.au", true },
{ "asdyx.de", true },
{ "asec01.net", true },
+ { "asegem.es", true },
{ "aseith.com", true },
{ "aseko.gr", true },
{ "asenno.com", true },
{ "aserver.co", true },
{ "asexualitat.cat", true },
{ "asgapps.co.za", true },
- { "asge-handel.de", true },
{ "ashd1.goip.de", true },
{ "ashd2.goip.de", true },
{ "ashd3.goip.de", true },
@@ -3213,14 +3347,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "asiaheavens.com", true },
{ "asialeonding.at", true },
{ "asian-industry.eu", true },
+ { "asianodor.com", true },
{ "asianshops.net", true },
{ "asianspa.co.uk", true },
{ "asiba.com.au", true },
+ { "asiinc-tex.com", true },
{ "asile-colis.fr", true },
{ "asinetasima.com", true },
- { "asisee.photography", true },
+ { "ask.fi", true },
{ "ask1.org", true },
{ "askcaisse.com", true },
+ { "askcascade.com", true },
{ "askizzy.org.au", true },
{ "askkaren.gov", true },
{ "askme24.de", true },
@@ -3231,16 +3368,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aslinfinity.com", true },
{ "asmbsurvey.com", true },
{ "asmdz.com", true },
- { "asmm.cc", true },
{ "asmood.net", true },
{ "asoul.tw", true },
- { "aspargesgaarden.no", true },
{ "aspatrimoine.com", true },
{ "aspcl.ch", true },
{ "aspectcontext.com", true },
{ "asperti.com", true },
+ { "aspformacion.com", true },
{ "asphyxia.su", true },
{ "aspiescentral.com", true },
+ { "aspiradorasbaratas.net", true },
{ "aspirateur-anti-pollution.fr", true },
{ "aspires.co.jp", true },
{ "aspisdata.com", true },
@@ -3259,7 +3396,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "assetvault.co.za", true },
{ "assguidesporrentruy.ch", true },
{ "assign-it.co.uk", true },
- { "assistance-personnes-agees.ch", true },
{ "assistel.com", true },
{ "assistenzaferrodastiro.org", true },
{ "assistenzafrigorifero.org", true },
@@ -3277,7 +3413,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "astenotarili.online", true },
{ "astenretail.com", true },
{ "astral-imperium.uk", true },
- { "astral.org.pl", true },
{ "astrology42.com", true },
{ "astroscopy.ch", true },
{ "astrovandalistas.cc", true },
@@ -3294,8 +3429,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "asystent-dzierzawy.pl", true },
{ "at-one.ca", true },
{ "at.search.yahoo.com", false },
+ { "ataber.pw", true },
{ "atac.no", true },
{ "atacadocervejeiro.com.br", true },
+ { "atacadodesandalias.com.br", true },
{ "ataton.ch", true },
{ "atc.io", true },
{ "atchleyjazz.com", true },
@@ -3315,6 +3452,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "atelierdeloulou.fr", true },
{ "atelierdesflammesnoires.fr", true },
{ "atelierfantazie.sk", true },
+ { "atelierhsn.com", true },
{ "atelierhupsakee.nl", true },
{ "ateliernaruby.cz", true },
{ "ateliers-veronese-nantes.fr", true },
@@ -3322,17 +3460,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "atelierssud.swiss", true },
{ "atencionbimbo.com", false },
{ "atendimentodelta.com.br", true },
- { "atg.soy", true },
{ "atgoetschel.ch", true },
{ "atgroup.gr", true },
{ "atgseed.co.uk", true },
{ "atgseed.uk", true },
+ { "ath0.org", false },
{ "atheist-refugees.com", true },
{ "athena-bartholdi.com", true },
{ "athena-garage.co.uk", true },
{ "athenadynamics.com", true },
{ "athenaneuro.com", true },
- { "atheoryofchange.com", true },
{ "atherosense.ga", true },
{ "athlin.de", true },
{ "atigerseye.com", true },
@@ -3343,17 +3480,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "atisoft.net", true },
{ "atisoft.net.tr", true },
{ "atisoft.web.tr", true },
+ { "atisystem.com", true },
{ "atitude.com", true },
{ "ativapsicologia.com.br", true },
{ "atl-paas.net", true },
{ "atlantahairsurgeon.com", true },
{ "atlantareroof.com", true },
- { "atlantaspringroll.com", true },
- { "atlantichomes.com.au", true },
+ { "atlanticpediatricortho.com", true },
{ "atlantis-kh.noip.me", true },
{ "atlantischild.hu", true },
{ "atlantishq.de", true },
{ "atlantiswaterproofing.com", true },
+ { "atlasbrown.com", true },
{ "atlaschiropractic.org", true },
{ "atlascultural.com", true },
{ "atlasdev.nl", true },
@@ -3370,6 +3508,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "atom86.net", true },
{ "atombase.org", true },
{ "atomic-bounce.com", true },
+ { "atomic.red", true },
{ "atomicbounce.co.uk", true },
{ "atomism.com", true },
{ "atorcidabrasileira.com.br", true },
@@ -3385,7 +3524,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "attendantdesign.com", true },
{ "attendu.cz", true },
{ "attention.horse", true },
- { "attilagyorffy.com", true },
{ "attilavandervelde.nl", true },
{ "attinderdhillon.com", true },
{ "attitudes-bureaux.fr", true },
@@ -3395,6 +3533,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "atulhost.com", true },
{ "atviras.lt", false },
{ "atvirtual.at", true },
+ { "atvsafety.gov", true },
+ { "atwar-mod.com", true },
{ "atwonline.org", true },
{ "atxchirocoverage.com", true },
{ "atyourprice.net", true },
@@ -3404,7 +3544,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "atypicom.pt", true },
{ "atzenchefin.de", true },
{ "au-be.net", true },
- { "au2pb.net", true },
{ "au2pb.org", true },
{ "aubergegilly.ch", true },
{ "aubg.org", true },
@@ -3415,12 +3554,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "aucubin.de", true },
{ "audialbuquerqueparts.com", true },
{ "audiblox.co.za", true },
+ { "audiense.com", false },
{ "audio-detector.com", true },
+ { "audiobookboo.com", true },
{ "audiobookstudio.com", true },
{ "audiolibri.org", true },
{ "audiolot.com", true },
+ { "audion.cc", true },
{ "audion.hr", true },
- { "audioonly.stream", true },
{ "audiophile.ch", true },
{ "audiophix.com", true },
{ "audiorecording.me", true },
@@ -3433,11 +3574,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "auditos.com", true },
{ "audits.io", true },
{ "auditsquare.com", true },
+ { "audreyjudson.com", true },
{ "auerbach-verlag.de", true },
{ "auf-feindgebiet.de", true },
{ "augen-seite.de", true },
{ "augiero.it", true },
- { "augmentable.de", true },
+ { "augmentable.de", false },
{ "augmented-portal.com", true },
{ "august-don.site", true },
{ "august.black", true },
@@ -3452,7 +3594,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "auntie-eileens.com.au", true },
{ "aupasdecourses.com", true },
{ "auplidespages.fr", true },
- { "aur.rocks", true },
+ { "aurelieburn.fr", true },
{ "aurelienaltarriba.fr", true },
{ "aureus.pw", true },
{ "auri.ga", true },
@@ -3466,6 +3608,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "auroz.video", true },
{ "aus-ryugaku.info", true },
{ "ausmwoid.de", true },
+ { "auspicacious.org", true },
{ "aussiefunadvisor.com", true },
{ "aussiegreenmarks.com.au", true },
{ "aussieservicedown.com", true },
@@ -3488,12 +3631,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "auszeit-walsrode.de", true },
{ "auszeit.bio", true },
{ "auth.adult", true },
+ { "authenticwoodcraft.com", true },
{ "authinfo-bestellen.de", true },
{ "authinity.com", true },
+ { "authland.com", false },
{ "author24.biz", true },
+ { "author24.info", true },
{ "authoritysolutions.com", true },
{ "autimatisering.nl", true },
{ "auto-anleitung.de", true },
+ { "auto-dealership-news.com", true },
{ "auto-motor-i-sport.pl", true },
{ "auto-plus.tn", true },
{ "auto-spurgo.com", true },
@@ -3517,6 +3664,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "autodidacticstudios.org", true },
{ "autoentrepreneurinfo.com", true },
{ "autoepc.ro", true },
+ { "autohaus-snater.de", true },
{ "autoinsurancehavasu.com", true },
{ "autokovrik-diskont.ru", true },
{ "autoledky.sk", true },
@@ -3528,13 +3676,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "automotivegroup-usedcars.be", true },
{ "automotivemechanic.org", true },
{ "automoto-tom.net", true },
+ { "autonewssite.com", true },
{ "autoosijek.com", true },
{ "autopapo.com.br", true },
{ "autoparts.im", true },
{ "autoparts.sh", true },
{ "autoparts.wf", true },
- { "autoprice.info", true },
+ { "autoprice.info", false },
{ "autoprogconsortium.ga", true },
+ { "autoproshouston.com", true },
{ "autorando.com", true },
{ "autoschadeschreuder.nl", true },
{ "autoscuola.roma.it", true },
@@ -3548,14 +3698,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "autospurgo.milano.it", true },
{ "autostodulky.cz", true },
{ "autotechschool.com", true },
+ { "autoteplo.org", true },
{ "autoterminus-used.be", true },
+ { "autoto.hr", true },
{ "autoverzekeringafsluiten.com", true },
{ "autowerkstatt-puchheim.de", true },
- { "autozane.com", true },
{ "autres-talents.fr", true },
+ { "autshir.com", true },
{ "auux.com", true },
{ "auvernet.org", true },
{ "aux-arts-de-la-table.com", true },
+ { "auxiliame.com", true },
+ { "auxille.com", true },
{ "auxquatrevents.ch", true },
{ "av-yummy.com", true },
{ "av0ndale.de", true },
@@ -3568,9 +3722,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "avalon-island.ru", true },
{ "avalon-rpg.com", true },
{ "avalon-studios.de", true },
+ { "avalyuan.com", true },
{ "avanet.ch", true },
{ "avanet.com", true },
{ "avanovum.de", true },
+ { "avantitualatin.com", true },
{ "avarty.com", true },
{ "avarty.net", true },
{ "avatardiffusion.com", true },
@@ -3589,7 +3745,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "averen.co.uk", true },
{ "avernis.de", true },
{ "avexon.com", true },
- { "avi9526.pp.ua", true },
{ "avia-krasnoyarsk.ru", true },
{ "avia-ufa.ru", true },
{ "aviapoisk.kz", true },
@@ -3602,6 +3757,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "avietech.com", true },
{ "aviv.nyc", true },
{ "avlhostel.com", true },
+ { "avnet.ws", true },
{ "avocadooo.stream", true },
{ "avocatbeziau.com", true },
{ "avocode.com", true },
@@ -3610,25 +3766,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "avpres.net", true },
{ "avptp.org", true },
{ "avqueen.cn", true },
+ { "avs-building-services.co.uk", true },
{ "avsox.com", true },
{ "avticket.ru", false },
{ "avtoforex.ru", true },
{ "avtogara-isperih.com", true },
{ "avtovokzaly.ru", true },
{ "avv.li", true },
+ { "avvaterra.ch", true },
{ "avvcorda.com", true },
{ "avvocato.bologna.it", true },
+ { "awardplatform.com", true },
+ { "awardsplatform.com", true },
{ "awaremi-tai.com", true },
{ "awaresec.com", true },
{ "awaresec.no", true },
{ "awaro.net", true },
{ "awbouncycastlehire.com", true },
+ { "awecademy.org", true },
{ "awen.me", true },
{ "awesomebouncycastles.co.uk", true },
{ "awesomesit.es", true },
- { "awin.la", true },
{ "awk.tw", true },
{ "awksolutions.com", true },
+ { "awningcanopyus.com", true },
{ "awningsaboveus.com", true },
{ "awningsatlantaga.com", true },
{ "awomaninherprime.com", true },
@@ -3639,6 +3800,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "axchap.ir", true },
{ "axelname.ru", true },
{ "axelteichmann.net", true },
+ { "axiatancell.com", true },
{ "axiomer.com", true },
{ "axiomer.es", true },
{ "axiomer.eu", true },
@@ -3647,18 +3809,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "axiomer.org", true },
{ "axis-stralis.co.uk", true },
{ "axisfleetmanagement.co.uk", true },
- { "axka.com", false },
+ { "axolotlfarm.org", false },
{ "axonholdingse.eu", true },
{ "axrec.de", true },
- { "axtudo.com", false },
+ { "ay-net.jp", true },
+ { "ayahya.me", true },
{ "ayanomimi.com", true },
{ "aycomba.de", true },
{ "ayesh.me", true },
- { "ayesh.win", true },
{ "aykutcevik.com", true },
{ "aylak.com", true },
{ "aylesburycastlehire.co.uk", true },
{ "aymerick.fr", true },
+ { "aymericlagier.com", true },
{ "ayothemes.com", true },
{ "ayrohq.com", true },
{ "ayrshirebouncycastlehire.co.uk", true },
@@ -3666,7 +3829,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ayurveda-mantry.com", true },
{ "az-moga.bg", true },
{ "az.search.yahoo.com", false },
- { "azabani.com", true },
{ "azadliq.info", true },
{ "azazy.net", false },
{ "azgfd.com", true },
@@ -3678,6 +3840,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "azort.com", true },
{ "azrazalea.net", true },
{ "azso.pro", true },
+ { "azsupport.com", true },
{ "aztraslochi.it", true },
{ "aztrix.me", true },
{ "azu-l.com", true },
@@ -3698,123 +3861,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "b-root-force.de", true },
{ "b-services.net", true },
{ "b-ticket.ch", true },
- { "b0618.com", true },
- { "b0618.net", true },
- { "b0868.com", true },
- { "b0868.net", true },
{ "b0k.org", true },
{ "b0rk.com", true },
- { "b1.work", true },
- { "b1758.com", true },
- { "b1758.net", true },
- { "b1768.com", true },
- { "b1768.net", true },
- { "b1788.com", true },
- { "b1788.net", true },
+ { "b1788.net", false },
{ "b1c1l1.com", true },
{ "b1rd.tk", true },
- { "b2486.com", true },
- { "b2486.net", true },
+ { "b2and.com", false },
{ "b2bmuzikbank.com", true },
{ "b303.me", true },
{ "b4bouncycastles.co.uk", true },
{ "b4ckbone.de", true },
{ "b4r7.de", true },
{ "b4z.eu", true },
- { "b5189.com", true },
- { "b5189.net", true },
- { "b5289.com", true },
- { "b5289.net", true },
- { "b5989.com", true },
- { "b5989.net", true },
{ "b64.club", true },
{ "b72.com", true },
{ "b72.net", true },
- { "b8591.com", true },
- { "b8591.net", true },
- { "b8979.com", true },
- { "b8979.net", true },
- { "b9018.com", true },
- { "b9018.net", true },
- { "b9108.com", true },
- { "b9108.net", true },
- { "b9110.com", true },
- { "b9110.net", true },
- { "b9112.com", true },
- { "b9112.net", true },
- { "b911gt.com", true },
- { "b911gt.net", true },
- { "b9168.com", true },
- { "b91688.com", true },
- { "b91688.info", true },
- { "b91688.net", true },
- { "b91688.org", true },
- { "b9175.com", true },
- { "b9175.net", true },
- { "b9258.com", true },
- { "b9258.net", true },
- { "b9318.com", true },
- { "b9318.net", true },
- { "b9418.com", true },
- { "b9418.net", true },
- { "b9428.com", true },
- { "b9428.net", true },
- { "b9453.com", true },
- { "b9453.net", true },
- { "b9468.com", true },
- { "b9468.net", true },
- { "b9488.com", true },
- { "b9488.net", true },
- { "b9498.com", true },
- { "b9498.net", true },
- { "b9518.com", true },
- { "b9518.info", true },
- { "b9518.net", true },
- { "b9518.org", true },
- { "b9528.com", true },
- { "b9528.net", true },
- { "b9538.com", true },
- { "b9538.net", true },
- { "b9586.net", true },
- { "b9588.net", true },
- { "b95888.net", true },
- { "b9589.net", true },
- { "b9598.com", true },
- { "b9598.net", true },
- { "b9658.com", true },
- { "b9658.net", true },
- { "b9758.com", true },
- { "b9758.net", true },
- { "b9818.com", true },
- { "b9818.net", true },
- { "b9858.com", true },
- { "b9858.net", true },
- { "b9880.com", true },
- { "b9883.net", true },
- { "b9884.net", true },
- { "b9885.net", true },
- { "b9886.net", true },
- { "b9887.net", true },
- { "b9888.net", true },
- { "b9889.net", true },
- { "b9920.com", true },
- { "b9948.com", true },
- { "b9948.net", true },
- { "b9960.com", true },
- { "b99886.com", true },
- { "b9best.cc", true },
- { "b9best.net", true },
- { "b9king.cc", true },
- { "b9king.com", true },
- { "b9king.net", true },
- { "b9winner.cc", true },
- { "b9winner.net", true },
{ "baalsworld.de", true },
{ "baas-becking.biology.utah.edu", true },
+ { "baazee.de", true },
{ "babacasino.net", true },
{ "babai.ru", true },
{ "babarkata.com", true },
{ "babeleo.com", true },
+ { "baby-bath-tub.com", true },
{ "baby-digne.com", true },
{ "baby-fotografie-muenchen.de", true },
{ "babybauch-shooting-muenchen.de", true },
@@ -3825,16 +3894,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "babyphototime.com", true },
{ "babypibu.com", true },
{ "babyshoprimini.com", true },
- { "bacgrouppublishing.com", true },
{ "bachata.info", true },
{ "baches-piscines.com", true },
{ "baciu.ch", true },
- { "backeby.eu", true },
{ "backmountaingas.com", true },
{ "backpacker.dating", true },
{ "backschues.com", true },
{ "backschues.de", true },
{ "backschues.net", true },
+ { "backseatbandits.com", true },
{ "backsideverbier.ch", true },
{ "backterris.com", true },
{ "backtest.org", true },
@@ -3870,17 +3938,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "baer.im", false },
{ "baer.one", false },
{ "baer.space", true },
- { "baffinlee.com", true },
{ "bag.bg", true },
{ "bageez.us", true },
+ { "bagelcraft.net", true },
{ "bagelsbakery.com", false },
{ "bageluncle.com", true },
{ "baggy.me.uk", true },
{ "bagheera.me.uk", true },
+ { "baglu.com", true },
{ "bagsofbounce.co.uk", true },
{ "bagspecialist.nl", true },
{ "bagstage.de", true },
{ "bah.im", false },
+ { "bahaiprayers.io", true },
{ "bahnbonus-praemienwelt.de", true },
{ "bahnenimbild.de", true },
{ "bahnenimbild.eu", true },
@@ -3893,6 +3963,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bailakomigo.com.br", true },
{ "baildonbouncycastles.co.uk", true },
{ "baileebee.com", true },
+ { "bailonga.com", true },
{ "baitulongbaycruises.com", true },
{ "baiyangliu.com", true },
{ "bajajfinserv.in", true },
@@ -3918,6 +3989,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "balcaonet.com.br", true },
{ "balconnr.com", true },
{ "balconsverdun.com", true },
+ { "baldur.cc", true },
{ "balia.de", true },
{ "balicekzdravi.cz", true },
{ "balidesignshop.com.br", true },
@@ -3926,18 +3998,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "balist.es", true },
{ "balivillassanur.com", true },
{ "balkonien.org", true },
+ { "ball-bizarr.de", true },
{ "ball.holdings", true },
{ "ball3d.es", true },
+ { "ballarin.cc", true },
{ "ballejaune.com", true },
+ { "balletcenterofhouston.com", true },
{ "ballinarsl.com.au", true },
- { "ballitolocksmith.com", true },
{ "ballmerpeak.org", true },
{ "ballonsportclub-erlangen.de", true },
{ "ballotapi.com", true },
{ "ballothero.com", true },
{ "ballparkbuns.com", false },
{ "ballroom.info", true },
+ { "balmofgilead.org.uk", true },
{ "balslev.io", true },
+ { "balticer.de", true },
{ "balticnetworks.com", true },
{ "bamahammer.com", true },
{ "bambooforest.nl", true },
@@ -3945,6 +4021,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bambumania.com.br", true },
{ "bamily.rocks", true },
{ "bananavapes.com", true },
+ { "banburybid.com", true },
{ "bancacrs.it", true },
{ "bancaolhares.com.br", true },
{ "bancobai.ao", true },
@@ -3956,25 +4033,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bandiga.it", true },
{ "bandito.re", true },
{ "banes.ch", true },
+ { "bangdream.ga", true },
{ "bangkok-dark-night.com", true },
{ "bangkok.dating", true },
{ "bangkokcity.de", true },
{ "bangorfederal.com", false },
+ { "bangridho.com", true },
{ "bangumi.co", true },
{ "banham.co.uk", true },
{ "banham.com", true },
+ { "banjostringiz.com", true },
{ "bank.simple.com", false },
{ "bankbranchlocator.com", true },
{ "bankcardoffer.com", true },
{ "bankee.us", true },
{ "bankerbuch.de", true },
+ { "bankersonline.com", true },
{ "banketbesteld.nl", true },
{ "bankfreeoffers.com", true },
{ "bankgradesecurity.com", true },
{ "bankin.com", true },
{ "bankinter.pt", true },
{ "bankio.se", true },
- { "banknet.gov", true },
{ "bankofdenton.com", true },
{ "banksiaparkcottages.com.au", true },
{ "bankstownapartments.com.au", true },
@@ -3982,19 +4062,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "banned-bitches.tk", true },
{ "bannermarquees.ie", true },
{ "bannsecurity.com", true },
- { "banoviny.sk", true },
{ "banquevanbreda.be", true },
{ "banter.city", true },
- { "bao-in.com", true },
- { "bao-in.net", true },
{ "baobeiglass.com", true },
- { "baodan666.com", true },
{ "baofengtech.com", true },
{ "baopublishing.it", true },
{ "baptistedeleris.fr", true },
{ "bar-harcourt.com", true },
{ "barabrume.fr", true },
{ "barans2239.com", true },
+ { "barbarabowersrealty.com", true },
{ "barbarafabbri.com", true },
{ "barbarafeldman.com", true },
{ "barbarians.com", false },
@@ -4002,6 +4079,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "barbate.fr", true },
{ "barbershop-harmony.org", true },
{ "barbershop-lasvillas.com", true },
+ { "barbiere.it", true },
{ "barbu.family", true },
{ "barburas.com", true },
{ "barcamp.koeln", true },
@@ -4015,7 +4093,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "baresquare.com", true },
{ "barf-alarm.de", true },
{ "baripedia.org", true },
- { "baris-sagdic.com", true },
{ "bariseau-mottrie.be", true },
{ "bariskaragoz.nl", true },
{ "baristador.com", true },
@@ -4027,6 +4104,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "barracuda.com.tr", true },
{ "barrera.io", true },
{ "barriofut.com", true },
+ { "barrydenicola.com", true },
{ "barryswebdesign.co.uk", true },
{ "bars.kh.ua", true },
{ "barsashop.com.br", true },
@@ -4039,6 +4117,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bartelt.name", true },
{ "barter4crypto.com", true },
{ "barthonia-showroom.de", true },
+ { "bartkramer.nl", true },
{ "bartlamboo.nl", true },
{ "bartolomebellido.com", true },
{ "bartula.de", true },
@@ -4056,7 +4135,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "basementdoctor.com", true },
{ "basementdoctornorthwest.com", true },
{ "basementfinishingohio.com", true },
- { "basercap.co.ke", true },
+ { "basementwaterproofingdesmoines.com", true },
+ { "baserverz.ga", true },
{ "bashing-battlecats.com", true },
{ "bashstreetband.co.uk", true },
{ "basicapparel.de", true },
@@ -4071,19 +4151,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bassresource.com", true },
{ "bassrider.eu", true },
{ "bassys.com.co", true },
- { "bastadigital.com", true },
{ "bastelzauberwelt.de", true },
{ "bastianstalder.ch", true },
{ "bastiv.com", true },
- { "bastivmobile.com", true },
{ "bastolino.de", true },
{ "basw.eu", true },
{ "baswetter.photography", true },
{ "basyspro.net", true },
- { "bat909.com", true },
- { "bat909.net", true },
- { "bat9vip.com", true },
- { "bat9vip.net", true },
{ "batcave.tech", true },
{ "batch.com", true },
{ "bati-alu.fr", true },
@@ -4098,12 +4172,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "battle-game.com", true },
{ "battleboxx.com", false },
{ "battleofthegridiron.com", true },
- { "batvip9.net", true },
- { "bauen-mit-ziegel.de", true },
{ "bauer.network", true },
+ { "bauernmarkt-fernitz.at", true },
{ "baugeldspezi.de", true },
{ "baugemeinschaftbernstein.de", true },
{ "baumannfabrice.com", true },
+ { "baur.de", true },
{ "bausep.de", true },
{ "baustils.com", true },
{ "bauthier-occasions.be", true },
@@ -4141,23 +4215,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bbcastles.com", true },
{ "bbgeschenke.ch", true },
{ "bbimarketing.com", true },
+ { "bbinsure.com", true },
{ "bbka.org.uk", true },
{ "bbkaforum.co.uk", true },
{ "bbkworldwide.jp", true },
+ { "bbld.de", true },
{ "bblove.me", true },
{ "bblsa.ch", true },
{ "bbnbb.de", true },
{ "bbnx.net", true },
- { "bbswin9.cc", true },
- { "bbswin9.com", true },
{ "bbuio.com", false },
{ "bbw.dating", true },
{ "bbwcs.co.uk", true },
- { "bbxin9.com", true },
- { "bbxin9.net", true },
{ "bc-bd.org", false },
{ "bc-diffusion.com", true },
- { "bc-personal.ch", true },
{ "bcansw.com.au", true },
{ "bcbulle.ch", true },
{ "bcdonadio.com", true },
@@ -4169,17 +4240,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bck-koethen.de", true },
{ "bck-lelystad.nl", true },
{ "bck.me", true },
+ { "bckaccompressoroz.com", true },
{ "bclogandtimberbuilders.com", true },
{ "bclrk.us", true },
{ "bcmainland.ca", true },
{ "bcmguide.com", true },
{ "bcmhire.co.uk", true },
- { "bcnet.com.hk", true },
{ "bcpc-ccgpfcheminots.com", true },
{ "bcrook.com", true },
{ "bcs.adv.br", true },
{ "bcswampcabins.com", true },
{ "bcvps.com", true },
+ { "bcyw56.live", true },
{ "bd2positivo.com", true },
{ "bda-boulevarddesairs.com", true },
{ "bdbxml.net", true },
@@ -4189,23 +4261,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bdvg.org", true },
{ "be-a-password.ninja", true },
{ "be-ka-tec.de", true },
+ { "be-real.life", false },
{ "be-up-developpement.com", true },
{ "be-webdesign.com", true },
{ "be.search.yahoo.com", false },
{ "be2cloud.de", true },
- { "be9418.com", true },
- { "be9418.info", true },
- { "be9418.net", true },
- { "be9418.org", true },
- { "be9458.com", true },
- { "be9458.info", true },
- { "be9458.net", true },
- { "be9458.org", true },
- { "be958.com", true },
- { "be958.info", true },
- { "be958.net", true },
- { "be958.org", true },
{ "beacham.online", true },
+ { "beachcitycastles.com", true },
{ "beachfutbolclub.com", true },
{ "beacinsight.com", true },
{ "beadare.com", true },
@@ -4214,12 +4276,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bealpha.pl", true },
{ "beamer-discount.de", true },
{ "beamstat.com", true },
+ { "beanbagaa.com", true },
+ { "beanilla.com", true },
{ "beanjuice.me", true },
{ "beans-one.com", false },
{ "bearcosports.com.br", true },
{ "bearded.sexy", true },
{ "beardic.cn", true },
{ "bearingworks.com", true },
+ { "beastiejob.com", true },
{ "beastowner.li", true },
{ "beatfeld.de", true },
{ "beatnikbreaks.com", true },
@@ -4232,6 +4297,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "beautyby.tv", true },
{ "beautyevent.fr", true },
{ "beautykat.ru", true },
+ { "beaverdamautos.com", true },
{ "bebef.de", true },
{ "bebefofuxo.com.br", true },
{ "bebes.uno", true },
@@ -4242,7 +4308,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "beckenhamcastles.co.uk", true },
{ "beckerantiques.com", true },
{ "beckon.com", true },
- { "becoast.fr", true },
{ "becs.ch", true },
{ "becydog.cz", true },
{ "bedamedia.com", true },
@@ -4253,13 +4318,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bedfordnissanparts.com", true },
{ "bedlingtonterrier.com.br", true },
{ "bednar.co", true },
+ { "bedrijfsfotoreportages.nl", true },
{ "bedrijfsportaal.nl", true },
{ "bedrocklinux.org", true },
{ "bedste10.dk", true },
{ "bee-creative.nl", true },
{ "bee-line.org.uk", true },
{ "bee.clothing", true },
- { "bee.supply", true },
{ "bee.tools", true },
{ "beechwoodmetalworks.com", true },
{ "beehive.govt.nz", true },
@@ -4278,8 +4343,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "beekeeping.clothing", true },
{ "beekeeping.tools", true },
{ "beeksnetwork.nl", true },
+ { "beelen.fr", true },
{ "beelit.com", true },
{ "beeming.net", true },
+ { "beer9.com", true },
{ "beercandle.com", true },
{ "beergazetteer.com", true },
{ "beerians.com", true },
@@ -4301,6 +4368,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "beezkneezcastles.co.uk", true },
{ "beeznest.com", true },
{ "befoodsafe.gov", true },
+ { "beforeyoueatoc.com", true },
{ "beframed.ch", true },
{ "befundonline.de", true },
{ "begabungsfoerderung.info", true },
@@ -4308,7 +4376,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "beginatzero.com", true },
{ "beginner.nl", true },
{ "beginwp.top", true },
- { "begoodny.co.il", true },
{ "behamepresrdce.sk", true },
{ "behamzdarma.cz", true },
{ "behindthethrills.com", true },
@@ -4323,6 +4390,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "beimchristoph.de", true },
{ "beinad.com", true },
{ "beinad.ru", true },
+ { "bejarano.io", true },
{ "belacapa.com.br", true },
{ "belanglos.de", true },
{ "belani.eu", true },
@@ -4335,7 +4403,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "belarto.pl", true },
{ "belastingdienst-in-beeld.nl", false },
{ "belastingmiddeling.nl", true },
- { "belcompany.nl", false },
+ { "belavis.com", true },
{ "belegit.org", true },
{ "belfastbounce.co.uk", true },
{ "belfastlocks.com", true },
@@ -4350,11 +4418,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bell.id.au", true },
{ "bella.network", true },
{ "bellamodeling.com", true },
+ { "bellinghamdetailandglass.com", true },
{ "belloy.ch", true },
{ "belloy.net", true },
{ "bellthrogh.com", true },
{ "bellthrough.com", true },
{ "belly-button-piercings.com", true },
+ { "bellyandbrain.amsterdam", true },
{ "belmontgoessolar.org", true },
{ "belouga.org", true },
{ "belt.black", true },
@@ -4362,6 +4432,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "belvoirbouncycastles.co.uk", true },
{ "bely-mishka.by", true },
{ "belyvly.com", true },
+ { "bemcorp.de", true },
{ "bemindly.com", true },
{ "bemsoft.pl", true },
{ "ben-energy.com", false },
@@ -4375,10 +4446,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "benburwell.com", true },
{ "benc.io", true },
{ "benceskorka.com", true },
- { "benchcast.com", true },
{ "benchling.com", true },
{ "benchmarkmonument.com", true },
- { "bencorby.com", true },
+ { "benchstoolo.com", true },
{ "bendemaree.com", true },
{ "bendigoland.com.au", true },
{ "bendingtheending.com", true },
@@ -4439,6 +4509,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "berasavocate.com", true },
{ "berdaguermontes.eu", false },
{ "bergenhave.nl", true },
+ { "berger-chiro.com", true },
{ "bergevoet-fa.nl", true },
{ "bergfreunde.de", true },
{ "bergfreunde.dk", true },
@@ -4450,6 +4521,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bergfreunde.no", true },
{ "bergfreunde.se", true },
{ "berglust-pur.de", true },
+ { "bergmanbeachproperties.com", true },
{ "bergmann-fotografin-berlin.de", true },
{ "bergmann-fotografin-dortmund.de", true },
{ "bergmann-fotografin-duesseldorf.de", true },
@@ -4459,9 +4531,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bergmann-fotografin-koeln.de", true },
{ "bergmann-fotografin-muenchen.de", true },
{ "bergmann-fotografin-stuttgart.de", true },
+ { "bergstoneware.com", true },
{ "berichtsheft-vorlage.de", true },
{ "berikod.ru", true },
- { "berliancom.com", false },
+ { "berinhard.pl", true },
+ { "berliancom.com", true },
{ "berlin-flirt.de", true },
{ "berlin.dating", true },
{ "bermeitinger.eu", true },
@@ -4469,6 +4543,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bernadetteanderes.ch", true },
{ "bernardcontainers.be", true },
{ "bernarddickens.com", true },
+ { "bernardez-photo.com", true },
{ "bernardfischer.fr", true },
{ "bernardgo.com", true },
{ "bernat.ch", true },
@@ -4476,16 +4551,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bernd-leitner-fotodesign.com", true },
{ "bernd-leitner-fotodesign.de", true },
{ "bernd-leitner.de", true },
+ { "berndklaus.at", true },
{ "bernhard-seidenspinner.de", true },
{ "bernhardkau.de", true },
{ "bernhardluginbuehl.ch", true },
{ "bernhardluginbuehl.com", true },
{ "berodes.be", true },
- { "berr.yt", true },
{ "berra.se", true },
{ "berruezoabogados.com", true },
{ "berrus.com", true },
- { "berry.cat", true },
{ "berrypay.com", true },
{ "bersierservices.ch", true },
{ "bersotavocats.fr", true },
@@ -4501,15 +4575,33 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "besb.io", true },
{ "besb66.com", true },
{ "beschriftung-metz.de", true },
+ { "bescover.com", true },
{ "beserberg.tk", true },
{ "besole.ch", true },
{ "bespaarenergie.click", true },
- { "bespaarnu.click", true },
{ "bespokestraps.com", true },
{ "besser-beissen.de", true },
{ "bessettenotaire.com", true },
+ { "best-accounting-schools.com", true },
+ { "best-art-colleges.com", true },
+ { "best-baptist-colleges.com", true },
+ { "best-beauty-schools.com", true },
+ { "best-business-colleges.com", true },
+ { "best-catholic-colleges.com", true },
+ { "best-community-colleges.com", true },
+ { "best-culinary-colleges.com", true },
+ { "best-education-schools.com", true },
+ { "best-engineering-colleges.com", true },
{ "best-essay-service.com", true },
- { "best-of-bounce.co.uk", true },
+ { "best-graduate-programs.com", true },
+ { "best-hvac-schools.com", true },
+ { "best-lutheran-colleges.com", true },
+ { "best-management-schools.com", true },
+ { "best-marketing-schools.com", true },
+ { "best-music-colleges.com", true },
+ { "best-nursing-colleges.com", true },
+ { "best-pharmacy-schools.com", true },
+ { "best-trucking-schools.com", true },
{ "best-wallpaper.net", true },
{ "best10websitebuilders.com", true },
{ "best2pay.net", true },
@@ -4518,52 +4610,46 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bestattungshaus-kammerer.de", true },
{ "bestautoinsurance.com", true },
{ "bestbatteriesonline.com", true },
+ { "bestbefore.com", true },
{ "bestbrakes.com", true },
{ "bestbridal.top", true },
{ "bestbyte.com.br", true },
{ "bestcellular.com", false },
{ "bestdating.today", true },
+ { "bestelectricnd.com", true },
{ "bestemailmarketingsoftware.org", true },
- { "bestesb.com", true },
- { "bestesb.net", true },
{ "bestessaycheap.com", true },
{ "bestessayhelp.com", true },
{ "bestfriendsequality.org", true },
{ "bestgiftever.ca", true },
{ "bestgifts4you.com", true },
- { "bestiahosting.com", true },
{ "bestinductioncooktop.us", true },
+ { "bestinshowing.com", true },
{ "bestinver.es", true },
{ "bestjumptrampolines.be", true },
+ { "bestkenmoredentists.com", true },
{ "bestlashesandbrows.com", true },
{ "bestlashesandbrows.hu", true },
{ "bestmotherfucking.website", true },
{ "bestoffert.club", true },
{ "bestoliveoils.com", true },
- { "bestparking.xyz", true },
+ { "bestpal.eu", true },
{ "bestpartyhire.com", true },
{ "bestperfumebrands.com", true },
- { "bestpig.fr", true },
+ { "bestplumbing.com", true },
{ "bestschools.io", true },
+ { "bestschools.top", true },
{ "bestseries.tv", true },
{ "bestshoesmix.com", true },
{ "bestwebsite.gallery", true },
- { "bet-99.cc", true },
- { "bet-99.com", true },
- { "bet-99.net", true },
- { "bet168wy.com", true },
- { "bet168wy.net", true },
- { "bet909.com", true },
- { "bet9bet9.net", true },
{ "betacavi.com", true },
{ "betacloud.io", true },
+ { "betaclouds.net", true },
{ "betalenviainternet.nl", true },
{ "betaprofiles.com", true },
{ "betaworx.de", true },
{ "betaworx.eu", true },
{ "betecnet.de", true },
- { "betgo9.cc", true },
- { "bethanyduke.com", true },
{ "bethpage.net", true },
{ "betobaccofree.gov", true },
{ "betonbit.com", true },
@@ -4571,6 +4657,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "betrallyarabia.com", true },
{ "bets.gg", true },
{ "betseybuckheit.com", true },
+ { "betsharpangles.com", true },
{ "betsyshilling.com", true },
{ "bett1.de", true },
{ "better-bounce.co.uk", true },
@@ -4588,18 +4675,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "betterscience.org", true },
{ "bettertechinterviews.com", true },
{ "bettertest.it", true },
+ { "bettertime.de", true },
+ { "bettertime.jetzt", true },
{ "betterweb.fr", true },
{ "betterworldinternational.org", true },
- { "bettflaschen.ch", true },
{ "bettingbusiness.ru", true },
{ "bettingsider.dk", true },
{ "bettolinokitchen.com", true },
{ "bettrlifeapp.com", true },
{ "betulashop.ch", true },
{ "betwalker.com", true },
- { "between.be", true },
- { "betwin9.com", true },
- { "betwin9.net", true },
{ "beulen.email", true },
{ "beulen.link", true },
{ "beulen.pro", true },
@@ -4651,19 +4736,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bglsingles.de", true },
{ "bgp.space", true },
{ "bgr34.cz", true },
+ { "bgs-game.com", true },
{ "bgtgames.com", true },
{ "bgtoyou.com", true },
+ { "bguidinger.com", true },
{ "bgwfans.com", true },
{ "bh-oberland.de", true },
{ "bh.sb", true },
{ "bharath-g.in", true },
{ "bhodisoft.com", true },
{ "bhost.net", true },
- { "bhosted.nl", true },
{ "bhtelecom.ba", true },
{ "bhuntr.com", true },
{ "bi.search.yahoo.com", false },
{ "biaggeo.com", true },
+ { "biancolievito.it", true },
{ "biano-ai.com", true },
{ "biasmath.es", true },
{ "biathloncup.ru", true },
@@ -4675,6 +4762,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "biblioblog.fr", true },
{ "bibliomarkt.ch", true },
{ "biblionaut.net", true },
+ { "biblioporn.com", true },
{ "bibliotekarien.se", true },
{ "biboumail.fr", true },
{ "bibuch.com", true },
@@ -4682,12 +4770,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bicha.net", true },
{ "bicranial.io", true },
{ "bicycle-events.com", true },
+ { "bicycleframeiz.com", true },
{ "biddl.com", true },
{ "biddle.co", true },
{ "bidu.com.br", true },
{ "bie.edu", false },
{ "biegal.ski", true },
{ "biegner-technik.de", true },
+ { "biehl.tech", true },
{ "biehlsoft.info", true },
{ "bielefailed.de", true },
{ "bien-etre-sante.info", true },
@@ -4700,8 +4790,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bieser.ch", true },
{ "biester.pro", true },
{ "bieumau.net", true },
+ { "biewen.me", true },
{ "bifrost.cz", true },
- { "biftin.net", true },
{ "big-andy.co.uk", true },
{ "big-bounce.co.uk", true },
{ "big-fluglaerm-hamburg.de", true },
@@ -4714,16 +4804,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bigclassaction.com", true },
{ "bigdinosaur.org", true },
{ "biggreenexchange.com", true },
+ { "bigideasnetwork.com", true },
{ "bigio.com.br", true },
- { "bigjohn.ru", true },
{ "biglou.com", false },
{ "bignumworks.com", true },
+ { "bigorbitgallery.org", true },
+ { "bigserp.com", true },
{ "bigsisterchannel.com", true },
+ { "bigskylifestylerealestate.com", true },
{ "bigskymontanalandforsale.com", true },
{ "bigwiseguide.com", true },
{ "bihub.io", true },
{ "biilo.com", true },
- { "bijouxbrasil.com.br", true },
{ "bijouxcherie.com", true },
{ "bijuteriicualint.ro", true },
{ "bike-discount.de", true },
@@ -4749,6 +4841,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "biletyplus.by", true },
{ "biletyplus.ua", true },
{ "bilgo.com", true },
+ { "bilibili.link", true },
{ "bilibili.red", true },
{ "bilimoe.com", true },
{ "bilke.org", true },
@@ -4762,9 +4855,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "billionaire365.com", true },
{ "billionairemailinglist.com", false },
{ "billionkiaparts.com", true },
+ { "billkochman.com", true },
{ "billogram.com", true },
{ "billpro.com", false },
- { "billpro.com.au", true },
{ "billrhodesbakery.com", true },
{ "billy.pictures", true },
{ "billyoh.com", true },
@@ -4793,8 +4886,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "binaryevolved.com", true },
{ "binaryrebel.net", true },
{ "binarystud.io", true },
- { "binbin9.com", true },
- { "binbin9.net", true },
{ "binding-problem.com", true },
{ "binfind.com", true },
{ "bing.com", true },
@@ -4803,6 +4894,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "binkanhada.biz", true },
{ "binkconsulting.be", true },
{ "binnenmeer.de", true },
+ { "binsp.net", true },
{ "binti.com", true },
{ "bintooshoots.com", true },
{ "bio-disinfestazione.it", true },
@@ -4828,7 +4920,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "biohappiness.com", true },
{ "bioharmony.ca", true },
{ "biointelligence-explosion.com", true },
+ { "bioknowme.com", true },
{ "bioligo.ch", true },
+ { "biologis.ch", true },
+ { "biology-colleges.com", true },
{ "biomasscore.com", true },
{ "biometrics.es", true },
{ "biomodra.cz", true },
@@ -4862,21 +4957,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "birthright.website", true },
{ "birzan.org", true },
{ "bisa-sis.net", true },
+ { "bischoff-mathey.family", true },
{ "biscoint.io", true },
+ { "biscuitcute.com.br", true },
{ "biser-borisov.eu", true },
{ "bismarck-tb.de", true },
{ "biso.ga", true },
{ "bison.co", true },
+ { "bisq.community", true },
{ "bissalama.org", true },
{ "bisschopssteeg.nl", true },
{ "bistrocean.com", true },
+ { "bistrodeminas.com", true },
{ "bistrotdelagare.fr", true },
{ "bit-cloud.de", true },
{ "bit-rapid.com", true },
{ "bit-sentinel.com", true },
{ "bit-service-aalter.be", true },
{ "bit.biz.tr", true },
- { "bit.voyage", true },
{ "bit8.com", true },
{ "bitaccelerate.com", true },
{ "bitbank.cc", true },
@@ -4888,7 +4986,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bitburner.de", true },
{ "bitcalt.eu.org", true },
{ "bitcalt.ga", true },
- { "bitclubfun.com", true },
+ { "bitchigo.com", true },
{ "bitcoin-india.net", true },
{ "bitcoin-india.org", true },
{ "bitcoin.asia", true },
@@ -4909,21 +5007,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bitcoinwalletscript.tk", true },
{ "bitcoinx.gr", true },
{ "bitcoinx.ro", true },
- { "bitenose.com", true },
+ { "bitcork.io", true },
{ "bitex.la", true },
{ "bitfasching.de", false },
{ "bitfehler.net", true },
{ "bitfence.io", true },
{ "bitfinder.nl", true },
+ { "bitfolio.org", true },
{ "bitfuse.net", true },
{ "bitgo.com", true },
{ "bitgrapes.com", true },
- { "bithap.com", true },
{ "bithir.co.uk", true },
{ "bititrain.com", true },
- { "bitk.co", true },
- { "bitk.co.uk", true },
- { "bitk.eu", true },
{ "bitlish.com", true },
{ "bitlo.com", true },
{ "bitlo.com.tr", true },
@@ -4932,20 +5027,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bitmainwarranty.com", true },
{ "bitmarket.net", true },
{ "bitmarket.pl", true },
+ { "bitmask.me", true },
{ "bitmessage.ch", true },
{ "bitmidi.com", true },
{ "bitminter.com", true },
{ "bitmoe.com", true },
- { "bitmon.net", true },
- { "bitok.com", true },
+ { "bitpoll.de", true },
+ { "bitpoll.org", true },
{ "bitpumpe.net", true },
{ "bitref.com", true },
{ "bitrush.nl", true },
{ "bitsafe.com.my", true },
{ "bitsburg.ru", true },
- { "bitshaker.net", true },
{ "bitskins.co", true },
{ "bitskrieg.net", true },
+ { "bitsoffreedom.nl", true },
+ { "bitstep.ca", true },
{ "bitstorm.nl", true },
{ "bitstorm.org", true },
{ "bitsum.com", true },
@@ -4961,6 +5058,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "biyori.moe", true },
{ "biyou-homme.com", true },
{ "biz4x.com", true },
+ { "bizbudding.com", true },
+ { "bizcash.co.za", true },
{ "bizeau.ch", true },
{ "bizniskatalog.mk", true },
{ "biznpro.ru", true },
@@ -4970,15 +5069,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "biztouch.work", true },
{ "bizzi.tv", true },
{ "bjarnerest.de", true },
- { "bjl5689.com", true },
- { "bjl5689.net", true },
+ { "bjmgeek.science", true },
+ { "bjmun.cn", true },
{ "bjornhelmersson.se", true },
{ "bjornjohansen.no", true },
{ "bjs.gov", true },
{ "bjsbouncycastles.com", true },
{ "bkentertainments.co.uk", true },
- { "bkhayes.com", true },
{ "bkhpilates.co.uk", true },
+ { "bkkposn.com", true },
{ "bklaindia.com", true },
{ "bkositspartytime.co.uk", true },
{ "bl00.se", true },
@@ -4989,6 +5088,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bl4ckb0x.info", true },
{ "bl4ckb0x.net", true },
{ "bl4ckb0x.org", true },
+ { "blaauwgeers.pro", true },
+ { "blaauwgeers.travel", true },
{ "blabber.im", true },
{ "blablacar.co.uk", true },
{ "blablacar.com", true },
@@ -5032,19 +5133,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "blackgate.org", true },
{ "blackhat.dk", true },
{ "blackhelicopters.net", true },
+ { "blackhell.xyz", true },
{ "blackhillsinfosec.com", true },
{ "blackilli.de", true },
{ "blackislegroup.com", true },
+ { "blackjackballroomcasino.info", true },
{ "blackkeg.ca", true },
{ "blackl.net", true },
+ { "blacklightparty.be", true },
{ "blackmonday.gr", true },
{ "blacknetwork.eu", true },
{ "blacknova.io", true },
{ "blackonion.com", true },
{ "blackpapermoon.de", true },
{ "blackphoenix.de", true },
+ { "blackpi.dedyn.io", true },
{ "blackroadphotography.de", true },
- { "blackscytheconsulting.com", true },
+ { "blackroot.eu", true },
{ "blackseals.net", true },
{ "blackyau.cc", true },
{ "blackys-chamber.de", true },
@@ -5070,8 +5175,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bleche-onlineshop.at", true },
{ "bleche-onlineshop.de", true },
{ "blechinger.io", true },
- { "blechpirat.name", true },
{ "blechschmidt.saarland", true },
+ { "blenderinsider.com", true },
{ "blenderrecipereviews.com", true },
{ "blending.kr", true },
{ "blendle.com", true },
@@ -5079,22 +5184,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "blendr.com", true },
{ "blenheimears.com", true },
{ "blenneros.net", false },
- { "blessedearth.com.au", true },
{ "blessedguy.com", true },
{ "blessedguy.net", false },
{ "blewebprojects.com", true },
{ "blichmann.eu", true },
{ "blidz.com", true },
{ "blieque.co.uk", true },
- { "bliesekow.net", true },
- { "bliker.ga", true },
{ "blikk.no", true },
+ { "blikund.swedbank.se", true },
+ { "blinder.com.co", true },
{ "blindpigandtheacorn.com", true },
{ "blinds-unlimited.com", true },
- { "bling9.com", true },
- { "bling999.cc", true },
- { "bling999.com", true },
- { "bling999.net", true },
{ "blingsparkleshine.com", true },
{ "blink-security.com", true },
{ "blinking.link", true },
@@ -5152,8 +5252,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "blogom.at", true },
{ "blogpentrusuflet.ro", true },
{ "blogreen.org", true },
+ { "blogsdna.com", true },
{ "blogthedayaway.com", true },
{ "blogtroterzy.pl", true },
+ { "blok56.nl", true },
+ { "blokmy.com", true },
{ "blood4pets.tk", true },
{ "bloodsports.org", true },
{ "bloom-avenue.com", true },
@@ -5176,27 +5279,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bluecon.ninja", true },
{ "bluecrazii.nl", true },
{ "blued.moe", true },
+ { "bluedata.ltd", true },
{ "bluedeck.org", true },
{ "blueflare.org", true },
{ "bluefrag.com", true },
{ "bluefuzz.nl", true },
- { "bluehelixmusic.com", true },
{ "blueimp.net", true },
+ { "bluekrypt.com", true },
{ "blueliquiddesigns.com.au", true },
{ "bluemeda.web.id", true },
{ "bluemosh.com", true },
{ "bluemtnrentalmanagement.ca", true },
{ "bluenote9.com", true },
{ "blueoakart.com", true },
- { "bluepearl.tk", true },
{ "blueperil.de", true },
{ "bluepoint.one", true },
{ "bluepostbox.de", true },
- { "blueprintloans.co.uk", true },
{ "bluerootsmarketing.com", true },
{ "blues-and-pictures.com", true },
{ "blueskycoverage.com", true },
{ "bluestardiabetes.com", true },
+ { "bluesunhotels.com", true },
{ "bluetexservice.com", true },
{ "bluewavewebdesign.com", true },
{ "bluex.im", true },
@@ -5209,16 +5312,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bluimedia.com", true },
{ "blumenfeldart.com", true },
{ "blumiges-fischbachtal.de", false },
- { "blundell.wedding", true },
{ "bluntandsnakes.com", true },
{ "blupig.net", true },
{ "bluproducts.com.es", true },
{ "blurringexistence.net", true },
+ { "blusens.com", true },
{ "blusmurf.net", true },
{ "blyat.science", true },
{ "blyth.me.uk", true },
{ "blzrk.com", true },
+ { "bm-immo.ch", true },
{ "bmhglobal.com.au", true },
+ { "bminton.is-a-geek.net", true },
{ "bmone.net", true },
{ "bmriv.com", true },
{ "bmros.com.ar", true },
@@ -5232,16 +5337,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bnjscastles.co.uk", true },
{ "bnstree.com", true },
{ "bnty.net", true },
- { "bo1689.com", true },
- { "bo1689.net", true },
- { "bo9club.cc", true },
- { "bo9club.com", true },
- { "bo9club.net", true },
- { "bo9fun.com", true },
- { "bo9fun.net", true },
- { "bo9game.com", true },
- { "bo9game.net", true },
- { "bo9king.net", true },
+ { "bnzblowermotors.com", true },
{ "boardgamegeeks.de", true },
{ "boards.ie", true },
{ "boat-engines.eu", true },
@@ -5250,6 +5346,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "boattrader.com.au", true },
{ "bobaly.es", true },
{ "bobancoamigo.com", true },
+ { "bobaobei.net", true },
{ "bobazar.com", true },
{ "bobcopeland.com", true },
{ "bobiji.com", false },
@@ -5274,8 +5371,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bodymusclejournal.com", true },
{ "bodypainter.pl", true },
{ "bodypainting.waw.pl", true },
+ { "bodyshopnews.net", true },
{ "bodyweb.com.br", true },
{ "bodyworkbymichael.com", true },
+ { "bodyworksautorebuild.com", true },
{ "boeddhashop.nl", true },
{ "boekenlegger.nl", true },
{ "boem.gov", true },
@@ -5285,6 +5384,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bogner.sh", true },
{ "bogobeats.com", true },
{ "bogosity.se", true },
+ { "bohan.co", true },
{ "bohramt.de", true },
{ "boimmobilier.ch", true },
{ "boincstats.com", true },
@@ -5313,6 +5413,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bonami.pl", true },
{ "bonami.ro", true },
{ "bonami.sk", true },
+ { "bonawehouse.co.uk", true },
{ "bonbonmania.com", true },
{ "bondank.com", true },
{ "bondarenko.dn.ua", true },
@@ -5321,8 +5422,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bondoer.fr", true },
{ "bondskampeerder.nl", true },
{ "bonebunny.de", true },
- { "bonesserver.com", true },
{ "bonfi.net", true },
+ { "bongo.cat", true },
{ "bonibuty.com", true },
{ "bonifacius.be", true },
{ "bonita.com.br", true },
@@ -5330,6 +5431,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bonnant-associes.ch", true },
{ "bonnant-partners.ch", true },
{ "bonnebouffe.fr", true },
+ { "bonniecoloring.com", true },
+ { "bonniedraw.com", true },
{ "bonnieradvocaten.nl", true },
{ "bonnsustainabilityportal.de", true },
{ "bonnyprints.at", true },
@@ -5340,8 +5443,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bonqoeur.ca", true },
{ "bonrecipe.com", true },
{ "bonsaimedia.nl", true },
- { "boodaah.com", true },
+ { "bonsi.net", true },
+ { "bonux.co", true },
+ { "boodaah.com", false },
{ "boodmo.com", true },
+ { "boogaerdtmakelaars.nl", true },
{ "boogiebouncecastles.co.uk", true },
{ "book-in-hotel.com", true },
{ "booker.ly", true },
@@ -5353,12 +5459,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bookingworldspeakers.com", true },
{ "bookluk.com", true },
{ "bookmein.in", true },
- { "bookourdjs.com", true },
{ "booksearch.jp", true },
- { "bookshopofindia.com", true },
{ "booksinthefridge.at", true },
{ "booktracker-org.appspot.com", true },
{ "bool.be", true },
+ { "boombv.com", true },
{ "boomersurf.com", true },
{ "boomshelf.com", true },
{ "boomshelf.org", true },
@@ -5377,7 +5482,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "boosinflatablegames.co.uk", true },
{ "boost.fyi", true },
{ "boost.ink", true },
- { "boostgame.win", true },
{ "booter.pw", true },
{ "bootjp.me", false },
{ "bopiweb.com", true },
@@ -5394,6 +5498,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "borja.io", true },
{ "born2bounce.co.uk", true },
{ "bornandgrazed.com", true },
+ { "borneodictionary.com", true },
{ "bornfiber.dk", true },
{ "bornhack.dk", true },
{ "borowski.pw", true },
@@ -5405,12 +5510,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "boskeopolis-stories.com", true },
{ "boss.az", true },
{ "bostadsportal.se", true },
+ { "bostonadvisors.com", true },
{ "bosufitness.cz", true },
{ "bosun.io", true },
- { "bosworthdental.co.uk", true },
{ "bot-manager.pl", true },
+ { "botezdepoveste.ro", true },
{ "botguard.net", true },
- { "botserver.de", true },
+ { "bothellwaygarage.net", true },
+ { "botsindiscord.me", true },
{ "botstack.host", true },
{ "bottaerisposta.net", true },
{ "bottineauneighborhood.org", true },
@@ -5423,6 +5530,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "boudah.pl", true },
{ "bougeret.fr", true },
{ "boukoubengo.com", true },
+ { "boulderswap.com", true },
{ "boulzicourt.fr", true },
{ "bounce-a-mania.co.uk", true },
{ "bounce-a-roo.co.uk", true },
@@ -5446,7 +5554,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bouncearoundsheffield.co.uk", true },
{ "bounceawaycastles.com", true },
{ "bouncebackcastles.co.uk", true },
- { "bouncebeyondcastles.co.uk", true },
{ "bouncebookings.com.au", true },
{ "bouncecrazy.ie", true },
{ "bouncejumpboston.co.uk", true },
@@ -5466,7 +5573,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bouncetasticuk.co.uk", true },
{ "bouncetheparty.co.uk", true },
{ "bounceunlimited.co.uk", true },
- { "bouncewithbovells.com", false },
{ "bouncewrightcastles.co.uk", true },
{ "bouncincastles.co.uk", true },
{ "bouncing-bugs.co.uk", true },
@@ -5488,6 +5594,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bouncycastlehire-norwich.com", true },
{ "bouncycastlehire-sheffield.co.uk", true },
{ "bouncycastlehire.co.uk", true },
+ { "bouncycastlehireauckland.co.nz", true },
{ "bouncycastlehirebarnstaple.co.uk", true },
{ "bouncycastlehirebexley.co.uk", true },
{ "bouncycastlehirechelmsford.org.uk", true },
@@ -5503,7 +5610,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bouncycastleman.co.uk", true },
{ "bouncycastlemangloucestershire.co.uk", true },
{ "bouncycastleparade.com", true },
- { "bouncycastles.me", true },
{ "bouncycastlesgalway.com", true },
{ "bouncycastleshire.co.uk", true },
{ "bouncycastleshireleeds.co.uk", true },
@@ -5524,7 +5630,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bouncykings.co.uk", true },
{ "bouncykingsnortheast.co.uk", true },
{ "bouncymacs.co.uk", true },
- { "bouncymadness.com", true },
{ "bouncyrainbows.co.uk", true },
{ "bouncytime.co.uk", true },
{ "bouncytown.co.uk", true },
@@ -5542,6 +5647,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "boutiquefutebol.com.br", true },
{ "boutiqueguenaelleverdin.com", true },
{ "bouw.live", true },
+ { "bouzouada.com", true },
{ "bouzouks.net", true },
{ "bovenwebdesign.nl", true },
{ "bowdens.me", true },
@@ -5558,7 +5664,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bownty.it", true },
{ "bownty.nl", true },
{ "bowntycdn.net", true },
- { "boxmoe.cn", true },
{ "boxpeg.com", true },
{ "boxpirates.to", true },
{ "boxvergelijker.nl", true },
@@ -5586,6 +5691,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brackets-salad.com", true },
{ "bracoitaliano.com.br", true },
{ "bradbrockmeyer.com", true },
+ { "bradfergusonrealestate.com", true },
{ "bradfordhottubhire.co.uk", true },
{ "bradfordmascots.co.uk", true },
{ "bradkovach.com", true },
@@ -5609,6 +5715,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brainsik.net", true },
{ "brainster.co", true },
{ "brainvoyagermusic.com", true },
+ { "brainwav.es", true },
{ "brainwork.space", true },
{ "brakemanpro.com", true },
{ "brakpanplumber24-7.co.za", true },
@@ -5628,10 +5735,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brandbuilderwebsites.com", true },
{ "brandcodeconsulting.com", true },
{ "brandcodestyle.com", true },
- { "brando753.xyz", true },
+ { "brandingclic.com", true },
{ "brandongomez.me", true },
{ "brandonhubbard.com", true },
- { "brandonlui.ml", true },
{ "brandonwalker.me", true },
{ "brandrocket.dk", true },
{ "brandstead.com", true },
@@ -5646,6 +5752,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brashear.me", true },
{ "brasilbombas.com.br", true },
{ "brasildxn.com.br", true },
+ { "brasileiro.ca", true },
{ "brasserie-mino.fr", true },
{ "brasspipedreams.org", true },
{ "bratislava-airport-taxi.com", true },
@@ -5674,6 +5781,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "breadofgod.org", true },
{ "breakingtech.it", true },
{ "breakpoint.at", true },
+ { "breaky.de", true },
{ "breathedreamgo.com", true },
{ "breathingblanket.com", true },
{ "brecht.ch", true },
@@ -5704,8 +5812,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bretcarmichael.com", true },
{ "brettabel.com", true },
{ "brettelliff.com", true },
+ { "brettlawyer.com", true },
{ "brettw.xyz", true },
- { "bretz-hufer.de", true },
{ "bretzner.fr", true },
{ "brevboxar.se", true },
{ "brewsouth.com", true },
@@ -5732,14 +5840,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brid.gy", false },
{ "bridalshoes.com", true },
{ "brideandgroomdirect.ie", true },
+ { "bridgedirectoutreach.com", true },
{ "bridgeglobalmarketing.com", true },
+ { "bridgehomeloans.com", true },
{ "bridgement.com", true },
{ "bridgevest.com", true },
{ "bridgingdirectory.com", true },
+ { "bridltaceng.com", true },
{ "brie.tech", true },
{ "briefassistant.com", true },
{ "briefhansa.de", true },
{ "briefvorlagen-papierformat.de", true },
+ { "briffoud.fr", true },
+ { "briggsleroux.com", true },
{ "brighouse-leisure.co.uk", true },
{ "brightday.bz", true },
{ "brightendofleasecleaning.com.au", true },
@@ -5748,6 +5861,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brightonbouncycastles.net", true },
{ "brightonchilli.org.uk", true },
{ "brightonzhang.com", true },
+ { "brightworkcreative.com", true },
{ "brigidaarie.com", true },
{ "brilliantbouncyfun.co.uk", true },
{ "brilliantproductions.co.nz", true },
@@ -5760,7 +5874,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "britelocate.com", true },
{ "britishbeef.com", true },
{ "britishbookmakers.co.uk", true },
- { "britishchronicles.com", true },
{ "britishgroupsg.com", true },
{ "britishpearl.com", true },
{ "britishsciencefestival.org", true },
@@ -5778,6 +5891,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brockmeyer.net", true },
{ "brockmeyer.org", true },
{ "brodowski.cc", true },
+ { "brody.digital", true },
+ { "brody.ninja", true },
{ "broersma.com", true },
{ "broeselei.at", true },
{ "brokenhands.io", true },
@@ -5789,7 +5904,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bronwynlewis.com", true },
{ "broodbesteld.nl", true },
{ "brooke-fan.com", true },
- { "brookehatton.com", true },
+ { "brookehatton.com", false },
{ "brooklynrealestateblog.com", true },
{ "brookworth.com", true },
{ "brossmanit.com", true },
@@ -5800,8 +5915,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brown-devost.com", true },
{ "brownfieldstsc.org", true },
{ "brownihc.com", true },
+ { "browntowncountryclub.com", true },
{ "browsemycity.com", true },
+ { "browserleaks.com", true },
{ "brring.com", true },
+ { "brrr.fr", true },
{ "bru6.de", true },
{ "brucekovner.com", true },
{ "brucemartin.net", true },
@@ -5813,16 +5931,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "brunn.email", true },
{ "brunner.ninja", true },
{ "brunohenc.from.hr", true },
+ { "brunoproduit.ch", true },
+ { "brunoramos.com", true },
+ { "brunoramos.org", true },
+ { "brunosouza.org", true },
{ "brush.ninja", true },
{ "bruun.co", true },
- { "bryankaplan.com", true },
{ "bryanquigley.com", true },
{ "bryansmith.net", true },
{ "bryansmith.tech", true },
+ { "brycecanyon.net", true },
+ { "brycecanyonnationalpark.com", true },
+ { "bryggebladet.dk", true },
{ "brzy-svoji.cz", true },
{ "bs-network.net", true },
{ "bs-security.com", true },
- { "bs.sb", true },
{ "bs.to", true },
{ "bs12v.ru", true },
{ "bsa157.org", true },
@@ -5830,7 +5953,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bsc-rietz.at", true },
{ "bscc.support", true },
{ "bsd-box.net", true },
- { "bsd.com.ro", true },
{ "bsdes.net", true },
{ "bsdfreak.dk", true },
{ "bsdlab.com", true },
@@ -5838,7 +5960,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bsdunix.xyz", true },
{ "bsee.gov", true },
{ "bserved.de", true },
- { "bsg-aok-muenchen.de", true },
{ "bsg.ro", true },
{ "bsgamanet.ro", true },
{ "bsidesf.com", true },
@@ -5846,6 +5967,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bsidessf.com", true },
{ "bsimerch.com", true },
{ "bslim-e-boutique.com", true },
+ { "bso-buitengewoon.nl", true },
{ "bsociabl.com", true },
{ "bsp-southpool.com", true },
{ "bsquared.org", true },
@@ -5862,16 +5984,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "btcpop.co", true },
{ "btcycle.org", true },
{ "btio.pw", true },
+ { "btku.org", true },
{ "btmstore.com.br", true },
{ "btnissanparts.com", true },
{ "btorrent.xyz", true },
- { "btrb.ml", true },
{ "btsapem.com", true },
- { "btserv.de", true },
{ "btsoft.eu", true },
{ "btsow.com", true },
{ "bttc.co.uk", true },
- { "btth.live", true },
{ "btth.pl", true },
{ "btth.tv", true },
{ "btth.xyz", true },
@@ -5879,15 +5999,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bturboo.com", true },
{ "bubblegumblog.com", true },
{ "bubblespetspa.com", true },
+ { "bubblin.io", true },
{ "bubblinghottubs.co.uk", true },
{ "bubblybouncers.co.uk", true },
{ "bubhub.io", true },
+ { "bucek.cz", true },
{ "buch-angucken.de", true },
{ "buchhandlungkilgus.de", true },
{ "buchwegweiser.com", true },
- { "buck.com", true },
+ { "buckelewrealtygroup.com", true },
{ "buckypaper.com", true },
- { "budaev-shop.ru", true },
{ "buddhismus.net", true },
{ "buddie5.com", true },
{ "buddlycrafts.com", true },
@@ -5905,12 +6026,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "budolfs.de", true },
{ "buehnenbande.ch", false },
{ "bueltge.de", true },
+ { "buena-vista.cz", true },
{ "buena.me", true },
- { "buergerdialog.net", true },
- { "buergerhaushalt.com", true },
{ "bueroplus.de", true },
{ "bueroschwarz.design", true },
{ "bueroshop24.de", true },
+ { "buettgens.net", true },
{ "buffaloautomation.com", true },
{ "buffaloturf.com.au", true },
{ "buffetbouc.com", true },
@@ -5925,6 +6046,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "build.chromium.org", true },
{ "buildbox.io", true },
{ "buildbytes.com", true },
+ { "buildhoscaletraingi.com", true },
{ "building-cost-estimators.com", true },
{ "buildingclouds.de", true },
{ "buildingcostestimators.co.uk", true },
@@ -5934,6 +6056,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "buildplease.com", true },
{ "buildrightbuildingservicesltd.co.uk", true },
{ "buileo.com", true },
+ { "builtory.my", true },
{ "builtvisible.com", true },
{ "builtwith.com", true },
{ "bukkenfan.jp", true },
@@ -5945,20 +6068,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bulkcandystore.com", true },
{ "bulkingtime.com", true },
{ "bulktrade.de", true },
+ { "bulktshirtsjohannesburg.co.za", true },
{ "bulkwholesalesweets.co.uk", true },
{ "bull.id.au", true },
{ "bulldog-hosting.de", true },
{ "bulledair-savons.ch", true },
{ "bullettags.com", true },
- { "bullpay.com", true },
{ "bullshitmail.nl", true },
{ "bullterrier.nu", true },
{ "bulwarkhost.com", true },
{ "bunbun.be", false },
{ "bund-von-theramore.de", true },
{ "bundespolizei-forum.de", true },
+ { "bungee.pw", true },
+ { "bungee.systems", true },
+ { "bungeetaco.com", true },
{ "bunkyo-life.com", true },
{ "bunny-rabbits.com", true },
+ { "bunnycarenotes.com", true },
{ "bunnyvishal.com", true },
{ "bunzy.ca", true },
{ "bupropion.com", true },
@@ -5966,12 +6093,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "buradangonder.com", true },
{ "burcevo.info", true },
{ "burfordbedandbreakfast.co.uk", true },
+ { "burg-hohnstein.com", true },
{ "burgernet.nl", true },
{ "burgers.io", true },
{ "burghardt.pl", true },
{ "buri.be", false },
{ "burialinsurancenetwork.com", true },
- { "buricloud.fr", true },
{ "burke.services", true },
{ "burlapsac.ca", true },
{ "burncorp.org", true },
@@ -5991,20 +6118,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "burtrum.me", true },
{ "burtrum.name", true },
{ "burtrum.org", true },
+ { "burzcast.ro", true },
{ "burzmali.com", true },
- { "busanhs.bid", true },
+ { "burzmedia.com", true },
+ { "burzstudios.com", true },
{ "busanhs.win", true },
- { "buserror.cn", true },
{ "bushbaby.com", true },
- { "bushcraftfriends.com", true },
{ "busindre.com", true },
{ "business-garden.com", true },
{ "business.facebook.com", false },
+ { "businessadviceperth.com.au", true },
{ "businesscentermarin.ch", true },
{ "businessesdirectory.eu", true },
{ "businessfactors.de", true },
{ "businessimmigration-eu.com", true },
{ "businessimmigration-eu.ru", true },
+ { "businessloanconnection.org", false },
+ { "businessmadeeasypodcast.com", true },
+ { "businessmarketingblog.org", true },
{ "businessplanexperts.ca", true },
{ "businessradar.com.au", true },
{ "businesswebadmin.com", true },
@@ -6014,38 +6145,33 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bustimes.org", true },
{ "bustup-tips.com", true },
{ "busuttil.org.uk", true },
+ { "busyon.cloud", true },
{ "butarque.es", true },
{ "buthowdoyoubuygroceries.com", true },
{ "butikvip.ru", true },
- { "buttercupstraining.co.uk", true },
+ { "butteramotors.com", true },
{ "buttonline.ch", true },
{ "buttonrun.com", true },
{ "buturyu.net", true },
{ "buurtgenotencollectief.nl", true },
{ "buurtpreventiefraneker.nl", true },
{ "buxum-communication.ch", true },
- { "buy-thing.com", true },
- { "buyaccessible.gov", true },
- { "buycarpet.shop", true },
+ { "buy-out.jp", true },
{ "buycbd.store", true },
- { "buycook.shop", true },
{ "buydissertations.com", true },
+ { "buyebook.xyz", true },
{ "buyerdocs.com", true },
- { "buyessayscheap.com", true },
- { "buyhealth.shop", true },
{ "buyinginvestmentproperty.com", true },
- { "buyjewel.shop", true },
{ "buymindhack.com", true },
{ "buypapercheap.net", true },
- { "buyplussize.shop", true },
- { "buyprofessional.shop", true },
{ "buyritefairview.com", true },
+ { "buysellinvestproperties.com", true },
{ "buyseo.store", true },
{ "buytermpaper.com", true },
{ "buytheway.co.za", true },
- { "buywine.shop", true },
{ "buzz.tools", true },
{ "buzzconf.io", true },
+ { "buzzcontent.com", true },
{ "buzzprint.it", true },
{ "bvalle.com", true },
{ "bvgg.eu", true },
@@ -6061,34 +6187,37 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bwf77.com", true },
{ "bwf99.com", true },
{ "bwfc.nl", true },
- { "bwh1.net", true },
+ { "bwh1.net", false },
{ "bwilkinson.co.uk", true },
{ "bwl-earth.club", true },
{ "bws16.de", true },
+ { "bwserhoscaletrainaz.com", true },
+ { "bwwb.nu", true },
{ "bx-n.de", true },
+ { "bxdev.me", true },
{ "bxp40.at", true },
{ "by.cx", true },
{ "byange.pro", true },
{ "byatte.com", true },
- { "bydisk.com", false },
{ "byeskille.no", true },
{ "bygningsregistrering.dk", true },
- { "byiu.info", true },
+ { "byhe.me", true },
+ { "byiu.info", false },
+ { "byken.cn", true },
{ "bymark.co", true },
{ "bymike.co", true },
{ "bynder.com", true },
+ { "bynet.cz", true },
{ "bynumlaw.net", true },
{ "bypass.sh", true },
+ { "byr.moe", true },
{ "byrko.cz", true },
{ "byrko.sk", true },
- { "byronkg.us", true },
- { "byronprivaterehab.com.au", true },
- { "byronr.com", true },
{ "byrtz.de", true },
- { "bysb.net", false },
+ { "bytanchan.com", true },
{ "byte-time.com", true },
{ "byte128.com", true },
- { "bytearts.net", true },
+ { "bytearts.net", false },
{ "bytebucket.org", true },
{ "bytecode.no", true },
{ "bytejail.com", true },
@@ -6099,7 +6228,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bytemix.cloud", true },
{ "byteowls.com", false },
{ "bytepark.de", true },
- { "bytepen.com", true },
{ "bytes.co", true },
{ "bytes.fyi", true },
{ "bytesatwork.de", true },
@@ -6110,8 +6238,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "bytesystems.com", true },
{ "bythen.cn", true },
{ "bythisverse.com", true },
+ { "bytrain.net", true },
{ "byvshie.com", true },
- { "bywin9.com", true },
+ { "bzhub.bid", true },
{ "bziaks.xyz", true },
{ "bzsparks.com", true },
{ "bztech.com.br", true },
@@ -6126,17 +6255,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "c-webdesign.net", true },
{ "c-world.co.uk", true },
{ "c.cc", true },
- { "c0rn3j.com", true },
{ "c0rporation.com", true },
{ "c2design.it", true },
{ "c2o-library.net", true },
- { "c3hv.cn", true },
{ "c3vo.de", true },
{ "c3w.at", true },
{ "c3wien.at", true },
{ "c4539.com", true },
{ "c4k3.net", true },
- { "c5h8no4na.net", true },
{ "c7dn.com", true },
{ "ca-key.de", true },
{ "ca-terminal-multiservices.fr", true },
@@ -6151,6 +6277,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cabforum.org", true },
{ "cabineritten.nl", true },
{ "cabinet-bedin.com", true },
+ { "cabinetfurnituree.com", true },
{ "cablehighspeed.net", true },
{ "cablemod.com", true },
{ "cablesandkits.com", true },
@@ -6160,11 +6287,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cacaolalina.com", true },
{ "cacaumidade.com.br", true },
{ "caceis.bank", true },
+ { "cachacacha.com", true },
{ "cachetagalong.com", true },
- { "cachethome.com", true },
{ "cachetur.no", true },
{ "cackette.com", true },
{ "cad-noerdlingen.de", true },
+ { "cadacoon.com", true },
+ { "cadafamilia.de", true },
{ "cadams.io", true },
{ "cadetsge.ch", true },
{ "cadmail.nl", true },
@@ -6177,6 +6306,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cadsys.net", true },
{ "cadusilva.com", true },
{ "caesarkabalan.com", true },
+ { "cafe-service.ru", false },
{ "cafedupont.be", true },
{ "cafedupont.co.uk", true },
{ "cafedupont.de", true },
@@ -6184,16 +6314,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cafeimsueden.de", true },
{ "cafelandia.net", true },
{ "cafeobscura.nl", true },
+ { "caferagazzi.de", true },
{ "cafericoy.com", true },
{ "caffeinatedcode.com", true },
{ "cagalogluyayinevi.com", true },
{ "cainhosting.com", false },
{ "caitcs.com", true },
{ "caiwenjian.xyz", true },
- { "caizx.com", false },
{ "caja-pdf.es", true },
{ "cajio.ru", true },
{ "cajunuk.co.uk", true },
+ { "cake-time.co.uk", true },
{ "cakestart.net", true },
{ "caketoindia.com", true },
{ "cakingandbaking.com", true },
@@ -6213,18 +6344,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "calcedge.com", true },
{ "calcinacci.com", true },
{ "calcoolator.pl", true },
+ { "calculadoraconversor.com", true },
{ "calcularis.ch", true },
{ "calculateaspectratio.com", true },
{ "calculator-imt.com", true },
{ "calculator.tf", true },
{ "calcworkshop.com", true },
- { "caldecotevillagehall.co.uk", true },
+ { "caldaro.de", true },
+ { "caldoletto.com", true },
{ "caleb.cx", true },
{ "caleb.host", true },
+ { "calebennett.com", true },
{ "calebthompson.io", true },
+ { "calehoo.com", true },
+ { "calendar.cf", true },
{ "calendarr.com", true },
{ "calendarsnow.com", true },
{ "calendly.com", true },
+ { "calenfil.com", true },
{ "caletka.cz", true },
{ "calgoty.com", true },
{ "calibreapp.com", true },
@@ -6236,6 +6373,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "callawayracing.se", false },
{ "callear.org", true },
{ "callhub.io", true },
+ { "callidus-vulpes.de", true },
{ "calltoar.ms", true },
{ "calltothepen.com", true },
{ "callumsilcock.com", true },
@@ -6247,6 +6385,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "calvin.my", true },
{ "calvinallen.net", false },
{ "calyxengineers.com", true },
+ { "calyxinstitute.org", false },
{ "camaradivisas.com", true },
{ "camaras.uno", true },
{ "camarilloelectric.com", true },
@@ -6255,14 +6394,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "camarillolandscapelighting.com", true },
{ "camarillolighting.com", true },
{ "camarillooutdoorlighting.com", true },
+ { "camashop.de", true },
{ "camastowncar.com", true },
{ "cambier.org", true },
{ "cambiowatch.ch", true },
{ "cambodian.dating", true },
{ "cambridge-security.com", true },
{ "cambridgebouncers.co.uk", true },
+ { "cambridgesecuritygroup.org", true },
{ "camcapital.com", true },
{ "camconn.cc", true },
+ { "camda.online", true },
{ "camdesign.pl", true },
{ "camelservers.com", true },
{ "cameo-membership.uk", true },
@@ -6271,18 +6413,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "camilomodzz.net", true },
{ "camjobs.net", true },
{ "camolist.com", true },
+ { "camomile.desi", true },
{ "camp-pleinsoleil.ch", true },
{ "camp.co.uk", true },
{ "campaign-ad.com", true },
{ "campaign.gov.uk", true },
{ "campaignagent.com.au", true },
+ { "campaignhelpdesk.org", true },
{ "campaignwiki.org", true },
{ "campamentos.info", true },
{ "campbellapplianceheatingandair.com", true },
{ "campbrainybunch.com", true },
{ "campcambodia.org", true },
{ "campcanada.org", true },
- { "campeoesdofutebol.com.br", true },
{ "campeonatoalemao.com.br", true },
{ "camperdays.de", true },
{ "camperlist.com", true },
@@ -6298,11 +6441,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "campus-discounts.com", true },
{ "campus-finance.com", true },
{ "campusdrugprevention.gov", true },
- { "campusportalng.com", true },
{ "campuswire.com", true },
{ "campvana.com", true },
{ "campwabashi.org", true },
{ "camshowstorage.com", true },
+ { "camshowverse.com", true },
{ "camsky.de", false },
{ "canada-tourisme.ch", true },
{ "canadabread.com", false },
@@ -6313,19 +6456,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "canadianchristianity.com", false },
{ "canadianoutdoorequipment.com", true },
{ "canadiantouristboard.com", true },
+ { "canal-onanismo.org", true },
{ "canalsidehouse.be", true },
{ "canalsidehouse.com", true },
{ "canarymod.net", true },
{ "cancerdata.nhs.uk", true },
{ "candaceplayforth.com", true },
+ { "candeo-books.nl", true },
{ "candex.com", true },
+ { "candguchocolat.com", true },
{ "candicecity.com", true },
{ "candidasa.com", true },
+ { "candidaturedunprix.com", true },
{ "candlcastles.co.uk", true },
{ "cando.eu", true },
- { "candylion.rocks", true },
{ "candyout.com", true },
- { "canerkorkmaz.com", true },
{ "cangelloplasticsurgery.com", true },
{ "cangku.in", true },
{ "cangku.moe", false },
@@ -6337,6 +6482,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "canlidoviz.com", true },
{ "canmipai.com", true },
{ "cannabis-marijuana.com", true },
+ { "cannabismd.com", true },
{ "cannacards.ca", true },
{ "cannahealth.com", true },
{ "cannoli.london", true },
@@ -6347,7 +6493,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "canterberry.cc", true },
{ "canterburybouncycastlehire.co.uk", true },
{ "cantrack.com", true },
- { "canva-dev.com", true },
{ "canva.com", true },
{ "canx.org", true },
{ "canyoupwn.me", true },
@@ -6355,7 +6500,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cao.la", true },
{ "caodesantohumberto.com.br", true },
{ "caoshan60.com", true },
- { "capacent.is", true },
{ "capachitos.cl", true },
{ "capacityproject.org", true },
{ "capekeen.com", true },
@@ -6366,14 +6510,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "capital-match.com", true },
{ "capitalcap.com", true },
{ "capitalcollections.org.uk", true },
+ { "capitalfps.com", true },
{ "capitalibre.com", true },
{ "capitalism.party", true },
+ { "capitalmediaventures.co.uk", true },
{ "capitalp.jp", true },
{ "capitalquadatv.org.nz", true },
- { "capitaltg.com", true },
{ "capitolpathways.org", true },
{ "caplinbouncycastles.co.uk", true },
- { "capogna.com", false },
{ "capper.de", true },
{ "capriccio.to", true },
{ "caprichosdevicky.com", true },
@@ -6382,15 +6526,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "capstansecurity.co.uk", true },
{ "capstansecurity.com", true },
{ "capstoneinsights.com", true },
+ { "capsulesubs.fr", true },
{ "captain-dandelion.com", true },
+ { "captainark.net", true },
{ "captainsinn.com", true },
- { "captalize.com", true },
+ { "captivationtheory.com", true },
{ "capturapp.com", false },
{ "capture-app.com", true },
{ "captured-symphonies.com", true },
{ "capuchinox.com", true },
{ "caputo.com", true },
{ "caputodesign.com", true },
+ { "car-shop.top", true },
{ "car.info", true },
{ "car24.de", true },
{ "car24portal.de", true },
@@ -6409,6 +6556,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "carbonmonoxidelawyer.net", true },
{ "carbono.uy", true },
{ "carbontv.com", true },
+ { "carburetorcycleoi.com", true },
{ "carck.co.uk", true },
{ "carck.uk", true },
{ "cardboard.cx", true },
@@ -6440,10 +6588,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "caremad.io", true },
{ "carepassport.com", true },
{ "caretta.co.uk", true },
- { "carey.li", false },
{ "careyshop.cn", true },
{ "carezone.com", false },
+ { "carfinancehelp.com", true },
+ { "carfraemill.co.uk", true },
{ "cargobay.net", true },
+ { "cargomaps.com", true },
{ "cargorestraintsystems.com.au", true },
{ "carhunters.cz", true },
{ "caribbean.dating", true },
@@ -6456,11 +6606,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cariocacooking.com", true },
{ "carisenda.com", true },
{ "carkeysanantonio.com", true },
- { "carlandfaith.com", true },
{ "carlife-at.jp", true },
{ "carlili.fr", true },
{ "carlingfordapartments.com.au", true },
{ "carlmjohnson.net", true },
+ { "carlo.mx", false },
{ "carlobiagi.de", true },
{ "carlocksmith--dallas.com", true },
{ "carlocksmithbaltimore.com", true },
@@ -6476,15 +6626,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "carnet-du-voyageur.com", true },
{ "carnildo.com", true },
{ "caroes.be", true },
+ { "caroffer.ch", true },
+ { "carol-lambert.com", true },
{ "carolcappelletti.com", true },
{ "carolcestas.com", true },
{ "caroli.com", true },
- { "caroli.info", true },
{ "caroli.name", true },
{ "caroli.net", true },
{ "carolina.cz", true },
{ "carolinaclimatecontrolsc.com", true },
{ "carolynjoyce.com.au", true },
+ { "carpetandhardwoodflooringpros.com", true },
{ "carpetcleaningtomball.com", true },
{ "carrando.com", true },
{ "carre-lutz.com", true },
@@ -6492,10 +6644,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "carriedin.com", true },
{ "carrierplatform.com", true },
{ "carringtonrealtygroup.com", true },
+ { "carroattrezzimilanodaluiso.it", true },
{ "carroceriascarluis.com", true },
{ "carrollservicecompany.com", true },
+ { "carrosserie-dubois.com", true },
{ "carseatchecks.ca", true },
{ "carson-aviation-adventures.com", true },
+ { "carson-matthews.co.uk", true },
{ "carsoug.com", true },
{ "carspneu.cz", true },
{ "cartadeviajes.cl", true },
@@ -6524,13 +6679,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cartouche-deal.fr", true },
{ "cartouche24.eu", true },
{ "cartucce24.it", true },
+ { "cartwrightrealestate.com", true },
+ { "carun.us", true },
{ "carusorealestate.com", true },
{ "caryefurd.com", true },
+ { "casa-app.de", true },
{ "casa-due-pur.com", true },
{ "casa-due-pur.de", true },
{ "casa-due.com", true },
+ { "casa-lunch-break.de", true },
+ { "casa-lunchbreak.de", true },
{ "casa-mea-inteligenta.ro", true },
{ "casa-su.casa", true },
+ { "casaanastasia.ro", true },
{ "casabouquet.com", true },
{ "casacameo.com", false },
{ "casacochecurro.com", true },
@@ -6538,6 +6699,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "casadoarbitro.com.br", true },
{ "casadowifi.com.br", true },
{ "casalindamex.com", true },
+ { "casalunchbreak.de", true },
{ "casamariposaspi.com", true },
{ "casapalla.com.br", true },
{ "casasuara.com", true },
@@ -6548,8 +6710,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cascadesjobcorpscca.com", true },
{ "cascavelle.fr", true },
{ "cascavelle.nl", true },
+ { "case-vacanza-salento.com", true },
+ { "casecoverkeygi.com", true },
{ "casecurity.org", true },
{ "caseplus-daem.de", true },
+ { "caseycapitalpartners.com", true },
{ "cash-4x4.com", true },
{ "cashati.com", true },
{ "cashbook.co.tz", true },
@@ -6559,22 +6724,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cashlogic.ch", true },
{ "cashmaxtexas.com", true },
{ "cashplk.com", true },
- { "casino-cash-flow.su", true },
- { "casino-cashflow.ru", true },
+ { "casino-online.info", true },
{ "casino-trio.com", true },
{ "casinobonuscodes.online", true },
- { "casinocashflow.ru", true },
- { "casinolegal.pt", true },
{ "casinomucho.com", true },
+ { "casinomucho.org", true },
+ { "casinomucho.se", true },
{ "casinoonlinesicuri.com", true },
+ { "casinovergleich.com", true },
{ "casio-caisses-enregistreuses.fr", true },
{ "casjay.cloud", true },
- { "casjay.com", true },
{ "casjay.info", true },
{ "casjay.us", true },
{ "casjaygames.com", true },
{ "caspar.ai", true },
{ "casperpanel.com", true },
+ { "caspicards.com", true },
{ "cassimo.com", true },
{ "castbulletassoc.org", false },
{ "casteloinformatica.com.br", true },
@@ -6582,7 +6747,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "castle-engine.io", true },
{ "castlecapers.com.au", true },
{ "castlecms.io", true },
- { "castlejackpot.com", true },
{ "castleking.net", true },
{ "castlekingdomstockport.co.uk", true },
{ "castlekingkent.co.uk", true },
@@ -6602,6 +6766,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "catalogobiblioteca.com", true },
{ "catalogoreina.com", true },
{ "catalogosvirtualesonline.com", true },
+ { "catalyconv.com", true },
{ "catalystapp.co", true },
{ "catbold.space", true },
{ "catbull.com", true },
@@ -6613,6 +6778,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "catchief.com", true },
{ "catdecor.ru", true },
{ "catenacondos.com", true },
+ { "catering-xanadu.cz", true },
{ "catfooddispensersreviews.com", true },
{ "catgirl.science", true },
{ "catharinesomerville.com", true },
@@ -6622,6 +6788,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "catherinescastles.co.uk", true },
{ "catholics.dating", true },
{ "cathosa.nl", true },
+ { "cathosting.org", true },
{ "cathy.guru", true },
{ "cathy.website", true },
{ "cathyfitzpatrick.com", true },
@@ -6649,7 +6816,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cavzodiaco.com.br", true },
{ "caxalt.com", true },
{ "caylercapital.com", true },
+ { "cayounglab.co.jp", true },
{ "cazaviajes.es", true },
+ { "cazes.info", true },
{ "cb-crochet.com", true },
{ "cbbank.com", true },
{ "cbc-hire.co.uk", true },
@@ -6659,6 +6828,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cbdev.de", true },
{ "cbdmarket.space", true },
{ "cbecrft.net", true },
+ { "cbhq.net", true },
{ "cbin168.com", true },
{ "cbintermountainrealty.com", true },
{ "cbk-connect.com", true },
@@ -6671,12 +6841,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cccwien.at", true },
{ "ccgn.co", true },
{ "ccgx.de", true },
- { "ccja.ro", false },
{ "ccoooss.com", true },
{ "ccprwebsite.org", true },
{ "ccsource.org", true },
{ "ccss-cces.com", true },
{ "ccsys.com", true },
+ { "cctvcanada.net", true },
{ "cctvview.info", true },
{ "ccu.io", true },
{ "ccu.plus", true },
@@ -6695,25 +6865,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cdbtech.com", true },
{ "cdburnerxp.se", true },
{ "cdda.ch", true },
- { "cdeck.net", true },
{ "cdepot.eu", true },
{ "cdkeykopen.com", true },
- { "cdlcenter.com", true },
{ "cdn.ampproject.org", true },
{ "cdn6.de", true },
{ "cdncompanies.com", true },
{ "cdnjs.com", true },
{ "cdns.cloud", true },
+ { "cdnsys.net", true },
{ "cdom.de", true },
{ "cdsdigital.de", true },
{ "cdshining.com", true },
{ "cdu-wilgersdorf.de", true },
{ "cduckett.net", true },
{ "ce-pimkie.fr", true },
+ { "ceagriproducts.com", true },
{ "cebz.org", true },
+ { "cecame.ch", true },
{ "ceciliacolombara.com", true },
{ "cecipu.gob.cl", true },
{ "ced-services.nl", true },
+ { "cedarcitydining.com", true },
{ "cedarslodge.com", true },
{ "cedriccassimo.ch", true },
{ "cedriccassimo.com", true },
@@ -6727,23 +6899,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "celectro-pro.com", true },
{ "celiendev.ch", true },
{ "celine-patisserie.fr", true },
+ { "cellebrite.com", true },
{ "celltek-server.de", false },
+ { "celltesequ.com", true },
{ "celluliteorangeskin.com", true },
{ "celluliteremovaldiet.com", true },
{ "celtadigital.com", true },
{ "celti.ie.eu.org", true },
{ "celti.name", true },
{ "celuliteonline.com", true },
- { "cem.pw", true },
{ "cementscience.com", true },
{ "cemeteriat.com", true },
{ "ceml.ch", true },
{ "cenatorium.pl", true },
+ { "cennelley.com", true },
+ { "cennelly.com", true },
{ "censurfridns.dk", true },
{ "censurfridns.nu", true },
{ "censys.io", true },
+ { "centa-am.com", true },
{ "centaur.de", true },
{ "centennialradon.com", true },
+ { "centennialseptic.com", true },
{ "centerpereezd.ru", false },
{ "centerpoint.ovh", true },
{ "centillien.com", false },
@@ -6761,8 +6938,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "centredaccueil.fr", true },
{ "centreoeil.ch", true },
{ "centrobill.com", true },
+ { "centrodoinstalador.com.br", true },
{ "centrojovencuenca.es", true },
{ "centromasterin.com", true },
+ { "centroperugia.gr", true },
{ "centrosocialferrel.pt", true },
{ "centrumhodinek.cz", true },
{ "centruvechisv.ro", true },
@@ -6775,9 +6954,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "centurioninfosec.hk", true },
{ "centurioninfosec.sg", true },
{ "centurionunderground.com", true },
- { "century-group.com", true },
{ "ceopedia.org", true },
- { "ceoptique.com", true },
{ "ceramixcoating.nl", true },
{ "ceramiya.com", true },
{ "cerastar.com", true },
@@ -6791,8 +6968,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cermak.photos", true },
{ "cernakova.eu", true },
{ "cerpus-course.com", true },
- { "cerstve-korenie.sk", true },
- { "cerstvekorenie.sk", true },
{ "cert.govt.nz", true },
{ "cert.or.id", true },
{ "certcenter.ch", true },
@@ -6812,6 +6987,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "certnazionale.it", true },
{ "certspotter.com", true },
{ "certspotter.org", true },
+ { "cervejista.com", true },
{ "cesantias.co", true },
{ "cesboard.com", true },
{ "cesdb.com", true },
@@ -6822,8 +6998,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cesobaly.cz", true },
{ "cestasedelicias.com.br", true },
{ "cestunmetier.ch", true },
- { "ceta.one", true },
{ "cetamol.com", true },
+ { "cetangarana.com", true },
{ "ceu.edu", false },
{ "cevo.com.hr", true },
{ "ceyizlikelisleri.com", true },
@@ -6842,6 +7018,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cfttt.com", true },
{ "cfurl.cf", true },
{ "cfxdesign.com", true },
+ { "cg-goerlitz.de", true },
{ "cg-systems.hu", true },
{ "cg.al", true },
{ "cg.search.yahoo.com", false },
@@ -6865,15 +7042,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chad.ch", true },
{ "chadstoneapartments.com.au", true },
{ "chadtaljaardt.com", true },
+ { "chaffeyconstruction.com", true },
{ "chaifeng.com", true },
{ "chainedunion.info", true },
{ "chaip.org", true },
- { "chairinstitute.com", true },
{ "chaisystems.net", true },
{ "chaletdemontagne.org", true },
{ "chaletmanager.com", true },
{ "chaletpierrot.ch", true },
{ "chaleur.com", true },
+ { "chalker.io", true },
{ "challengeblog.org", true },
{ "challstrom.com", true },
{ "chamathellawala.com", true },
@@ -6885,9 +7063,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "championcastles.ie", true },
{ "champions.co", true },
{ "championsofpowerfulliving.com", true },
+ { "championweb.co.nz", true },
{ "championweb.com.au", true },
+ { "championweb.nz", true },
{ "champonthis.de", true },
{ "champserver.net", false },
+ { "chancekorte.com", true },
{ "chanddriving.co.uk", true },
{ "chandr1000.ga", true },
{ "chang-feng.info", true },
@@ -6896,18 +7077,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "changesfor.life", true },
{ "changethislater.com", true },
{ "channeladam.com", true },
+ { "chanoyu-gakkai.jp", true },
+ { "chanshiyu.com", false },
{ "chantalguggenbuhl.ch", true },
{ "chanz.com", true },
{ "chaos-games.org", true },
{ "chaos-inc.de", true },
{ "chaos.run", true },
- { "chaoscastles.co.uk", true },
{ "chaoschemnitz.de", true },
{ "chaosdorf.de", true },
{ "chaosfield.at", true },
{ "chaoslab.org", true },
{ "chaosriftgames.com", true },
- { "chaoswars.ddns.net", true },
{ "chaotichive.com", true },
{ "chaoticlaw.com", true },
{ "chapelaria.tf", true },
@@ -6922,16 +7103,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "charisma.ai", true },
{ "charissadescande.com", true },
{ "charitylog.co.uk", true },
- { "charl.eu", true },
{ "charlenevondell.com", true },
{ "charles-darwin.com", true },
{ "charlesbwise.com", true },
- { "charlesjay.com", true },
{ "charlesmilette.net", true },
{ "charlespitonltd.com", true },
{ "charlesrogers.co.uk", true },
{ "charlesstover.com", true },
- { "charlestonfacialplastic.com", true },
{ "charliedillon.com", true },
{ "charliegarrod.com", true },
{ "charliehr.com", true },
@@ -6940,6 +7118,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "charlotte-touati.ch", true },
{ "charlotteomnes.com", true },
{ "charlottesvillegolfcommunities.com", true },
+ { "charlottesvillehorsefarms.com", true },
{ "charlotteswimmingpoolbuilder.com", true },
{ "charmander.me", true },
{ "charmanterelefant.at", true },
@@ -6952,8 +7131,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chartkick.com", true },
{ "chartpen.com", true },
{ "chartsy.de", true },
+ { "chartwellestate.com", true },
{ "charuru.moe", true },
- { "chasafilli.ch", true },
+ { "chascrazycreations.com", true },
{ "chaseandzoey.de", true },
{ "chasetrails.co.uk", true },
{ "chat-libera.org", true },
@@ -6962,7 +7142,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chat2.cf", true },
{ "chat40.net", true },
{ "chatbelgie.eu", true },
- { "chatbot.one", true },
{ "chatbotclic.com", true },
{ "chatbotclick.com", true },
{ "chatbots.systems", true },
@@ -6970,6 +7149,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chateau-de-lisle.fr", true },
{ "chateaudestrainchamps.com", true },
{ "chatfacile.org", true },
+ { "chatgrape.com", true },
+ { "chatint.com", true },
{ "chatitaly.org", true },
{ "chatme.im", false },
{ "chatnederland.eu", true },
@@ -6979,6 +7160,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chattanoogaface.com", true },
{ "chattergalerie.eu", true },
{ "chattergallery.com", true },
+ { "chattersworld.nl", true },
{ "chatu.io", true },
{ "chatu.me", true },
{ "chatucomputers.com", true },
@@ -6986,6 +7168,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chatxtutti.com", true },
{ "chatzimanolis.com", true },
{ "chatzimanolis.gr", true },
+ { "chauffage-budget.fr", true },
{ "chaurocks.com", true },
{ "chaussenot.net", true },
{ "chavetaro.com", true },
@@ -6996,9 +7179,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chbk.co", true },
{ "chbs.me", true },
{ "chch.it", true },
+ { "chci-web.cz", true },
{ "chcoc.gov", true },
{ "chcsct.com", true },
{ "chd-expert.fr", true },
+ { "cheap-colleges.com", true },
{ "cheapalarmparts.com.au", true },
{ "cheapcaribbean.com", true },
{ "cheapessay.net", true },
@@ -7018,18 +7203,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cheapticket.in", true },
{ "check.torproject.org", false },
{ "checkecert.nl", true },
- { "checkmateshoes.com", false },
{ "checkmyessay.com", true },
{ "checkmyessays.com", true },
{ "checkmyip.com", true },
{ "checkmypsoriasis.com", true },
{ "checkout.google.com", true },
{ "checkpoint-tshirt.com", true },
+ { "checkras.tk", true },
{ "checkrente.nl", true },
{ "checkspf.net", true },
{ "checktype.com", true },
{ "checkui.com", true },
- { "checkyourmath.com", true },
{ "checkyourprivilege.org", true },
{ "checkyourreps.org", true },
{ "checos.co.uk", true },
@@ -7043,6 +7227,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cheez.systems", true },
{ "cheezflix.uk", true },
{ "chefwear.com", true },
+ { "chehalemgroup.com", true },
{ "cheladmin.ru", true },
{ "chelema.xyz", true },
{ "cheltenhambounce.co.uk", true },
@@ -7050,11 +7235,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cheltik.ru", true },
{ "chemicalcrux.org", true },
{ "chemiphys.com", true },
+ { "chemistry-schools.com", true },
{ "chenapartment.com", true },
+ { "chengxindong.com", true },
{ "chenkun.pro", true },
{ "chenky.com", true },
{ "chenna.me", true },
{ "chennien.com", true },
+ { "chenpei.org", true },
{ "chenqinghua.com", true },
{ "chentianyi.cn", true },
{ "chenzhekl.me", true },
@@ -7067,12 +7255,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cherryonit.com", true },
{ "cherrywoodtech.com", true },
{ "chertseybouncycastles.co.uk", true },
+ { "chesapeakebaychristmas.com", true },
+ { "chess.com", true },
+ { "chessboardao.com", true },
+ { "chesscoders.com", true },
{ "chesspoint.ch", true },
+ { "chesterlestreetasc.co.uk", true },
{ "chestnut.cf", true },
{ "chevy37.com", true },
{ "chevymotor-occasions.be", true },
{ "chewey.de", true },
{ "chewey.org", true },
+ { "chewingucand.com", true },
{ "chez-janine.de", true },
{ "chez-oim.org", true },
{ "chez.moe", true },
@@ -7084,7 +7278,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chiboard.co", true },
{ "chibr.eu", true },
{ "chic-leather.com", true },
- { "chicagolug.org", true },
{ "chicagostudentactivists.org", true },
{ "chicisimo.com", true },
{ "chicolawfirm.com", true },
@@ -7098,6 +7291,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "childrenandmedia.org.au", true },
{ "childrenfirstalways.org", true },
{ "childreninadversity.gov", true },
+ { "childrens-room.com", true },
{ "childrensentertainmentleicester.co.uk", true },
{ "childrenspartiesrus.com", true },
{ "childstats.gov", true },
@@ -7113,11 +7307,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chimeratool.com", true },
{ "chimerity.com", true },
{ "chimpanzee.net", true },
- { "chinacdn.org", true },
{ "chinahighlights.ru", true },
{ "chinaspaceflight.com", true },
{ "chinatrademarkoffice.com", true },
- { "chinawhale.com", true },
{ "ching.tv", true },
{ "chint.ai", true },
{ "chinwag.im", true },
@@ -7144,36 +7336,39 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chl.la", true },
{ "chloescastles.co.uk", true },
{ "chlth.com", true },
+ { "chmielarz.it", true },
{ "chmsoft.com.ua", true },
{ "chmsoft.ru", true },
{ "chmurakotori.ml", true },
{ "choc-o-lush.co.uk", true },
+ { "chocgu.com", true },
{ "chocodecor.com.br", true },
{ "chocolah.com.au", false },
{ "chocolate13tilias.com.br", true },
{ "chocolatesandhealth.com", true },
{ "chocolatier-tristan.ch", true },
- { "chocotough.nl", false },
+ { "chocotough.nl", true },
+ { "choiceautoloan.com", true },
{ "choisirmonerp.com", true },
{ "chokladfantasi.net", true },
- { "chollima.pro", true },
{ "chon.io", true },
+ { "chonghe.org", true },
{ "chook.as", true },
{ "choootto.club", true },
{ "choosemypc.net", true },
{ "chorkley.co.uk", true },
{ "chorkley.com", true },
{ "chorkley.uk", true },
- { "chorleiterverband.de", true },
{ "chorpinkpoemps.de", true },
{ "chosenplaintext.org", true },
{ "chourishi-shigoto.com", true },
{ "chovancova.sk", true },
{ "chowii.com", true },
{ "choyri.com", true },
- { "chr0me.sh", true },
{ "chris-edwards.net", true },
{ "chrisaitch.com", true },
+ { "chrisb.me", true },
+ { "chrisb.xyz", true },
{ "chrisbryant.me.uk", true },
{ "chrisburnell.com", true },
{ "chriscarey.com", true },
@@ -7188,16 +7383,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chrismorgan.info", true },
{ "chrismurrayfilm.com", true },
{ "chrisnekarda.com", true },
+ { "chrisplankhomes.com", true },
{ "chrispstreet.com", true },
+ { "chrissx.ga", true },
{ "christadelphiananswers.org", true },
{ "christadelphians.eu", true },
{ "christchurchbouncycastles.co.uk", true },
{ "christec.net", true },
{ "christensenplace.us", true },
- { "christerwaren.fi", true },
{ "christiaanconover.com", true },
{ "christian-gredig.de", true },
{ "christian-host.com", true },
+ { "christian-krug.website", true },
{ "christian-liebel.com", true },
{ "christian-stadelmann.de", true },
{ "christianbargon.de", false },
@@ -7225,14 +7422,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "christopherburg.com", true },
{ "christopherkennelly.com", true },
{ "christopherl.com", true },
- { "christopherpritchard.co.uk", true },
+ { "christopherstocks.online", true },
{ "christophertruncer.com", true },
- { "christophkreileder.com", true },
{ "christophsackl.de", true },
{ "christthekingparish.net", true },
{ "christtheredeemer.us", true },
+ { "christwaycounseling.com", true },
{ "chriswald.com", true },
{ "chriswarrick.com", true },
+ { "chriswbarry.com", true },
{ "chriswells.io", true },
{ "chromcraft-revington.com", true },
{ "chrome-devtools-frontend.appspot.com", true },
@@ -7242,33 +7440,37 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "chromebooksforwork.com", true },
{ "chromiumbugs.appspot.com", true },
{ "chromiumcodereview.appspot.com", false },
- { "chronic101.xyz", true },
{ "chroniclesofgeorge.com", true },
{ "chronology.no", true },
{ "chronoshop.cz", true },
{ "chrpaul.de", true },
{ "chrstn.eu", true },
{ "chrysanthos.net", true },
+ { "chshouyu.com", true },
{ "chsterz.de", true },
{ "chuchote-moi.fr", true },
{ "chuck.ovh", true },
+ { "chuill.com", true },
{ "chun.pro", true },
{ "chunche.net", true },
{ "chunk.science", true },
{ "chupadelfrasco.com", true },
{ "chuppa.com.au", true },
{ "churchlinkpro.com", true },
+ { "churchofsaintrocco.org", true },
+ { "churchofscb.org", true },
{ "churchthemes.com", true },
{ "churningtracker.com", true },
- { "chxdf.net", true },
{ "chyen.cc", true },
{ "chytraauta.cz", true },
{ "chziyue.com", true },
{ "ci-fo.org", true },
{ "ci5.me", true },
{ "ciancode.com", true },
+ { "ciania.pl", true },
{ "cianmawhinney.me", true },
{ "ciansc.com", true },
+ { "ciaracode.com", true },
{ "ciat.no", false },
{ "cidbot.com", true },
{ "cidersus.com.ec", true },
@@ -7276,6 +7478,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cielbleu.org", true },
{ "cielly.com", true },
{ "cifop-numerique.fr", true },
+ { "ciftlikesintisi.com", true },
{ "cig-dem.com", true },
{ "cigar-cartel.com", true },
{ "ciiex.co", true },
@@ -7285,7 +7488,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cimfax.com", true },
{ "cinafilm.com", true },
{ "cinay.pw", true },
- { "cindey.io", true },
{ "cindydudley.com", true },
{ "cine-music.de", true },
{ "cine.to", true },
@@ -7294,7 +7496,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cinemasetfree.com", true },
{ "cinemysticism.com", true },
{ "cineplex.my", true },
- { "ciner.is", true },
{ "cinnabon.com", true },
{ "cinq-elements.com", true },
{ "cinq-elements.fr", true },
@@ -7305,11 +7506,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cio-ciso-interchange.org", true },
{ "cio-cisointerchange.org", true },
{ "cio.go.jp", true },
- { "cio.gov", true },
+ { "cio.gov", false },
{ "cioscloud.com", true },
{ "cip.md", true },
{ "cipartyhire.co.uk", true },
{ "cipher.team", true },
+ { "cipherboy.com", true },
{ "ciphersuite.info", true },
{ "ciphrex.com", true },
{ "cipri.com", true },
@@ -7324,7 +7526,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "circu.ml", true },
{ "circulatedigital.com", true },
{ "circule.cc", true },
- { "cirfi.com", true },
{ "ciri.com.co", true },
{ "cirope.com", true },
{ "cirrus0.de", true },
@@ -7333,20 +7534,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cirurgicagervasio.com.br", true },
{ "cirurgicalucena.com.br", true },
{ "cirurgicasalutar.com.br", true },
- { "ciscodude.net", true },
+ { "ciscodude.net", false },
{ "cisoaid.com", true },
{ "cisofy.com", true },
{ "cispeo.org", true },
{ "ciss.ltd", true },
{ "cisum-cycling.com", true },
{ "cisy.me", true },
- { "citationgurus.com", true },
{ "citcuit.in", true },
{ "cities.cl", true },
{ "citimarinestore.com", true },
- { "citizen-cam.de", true },
{ "citizensbankal.com", true },
{ "citizenscience.gov", false },
+ { "citizenslasvegas.com", true },
{ "citizensleague.org", true },
{ "citizenspact.eu", true },
{ "citizing.org", true },
@@ -7365,6 +7565,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cityworksonline.com", true },
{ "ciubotaru.tk", true },
{ "ciurcasdan.eu", true },
+ { "civicforum.pl", true },
{ "civilg20.org", true },
{ "civillines.nl", true },
{ "civiltoday.com", true },
@@ -7392,8 +7593,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ckostecki.de", true },
{ "cktennis.com", true },
{ "cl.search.yahoo.com", false },
+ { "cl0ud.space", true },
{ "clacetandil.com.ar", true },
- { "clad.cf", true },
{ "claimconnect.com", true },
{ "claimconnect.us", true },
{ "claimjeidee.be", true },
@@ -7408,6 +7609,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "clarkeaward.com", true },
{ "clarkwinkelmann.com", true },
{ "clase3.tk", true },
+ { "clash-movies.de", true },
{ "clash.lol", true },
{ "class.com.au", true },
{ "classdojo.com", true },
@@ -7419,6 +7621,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "classroomconductor.com", true },
{ "classroomcountdown.co.nz", true },
{ "classteaching.com.au", true },
+ { "classyvaper.de", true },
{ "claude-leveille.com", true },
{ "claude.tech", true },
{ "claudia-urio.com", true },
@@ -7427,17 +7630,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "clawe.de", true },
{ "clawhammer.dk", true },
{ "clayandcottonkirkwood.com", true },
+ { "clayprints.com", true },
{ "claytonstowing.com.au", true },
{ "clazzrooms.com", true },
{ "cldfile.com", true },
{ "cldly.com", true },
{ "cleanapproachnw.com", true },
- { "cleanbeautymarket.com.au", true },
{ "cleanbrowsing.org", true },
{ "cleancode.club", true },
{ "cleandetroit.org", true },
{ "cleandogsnederland.nl", true },
+ { "cleanfiles.us", true },
{ "cleanhouse2000.us", true },
+ { "cleaningbyrosie.com", true },
{ "cleaningservicejulai.com", true },
{ "cleansewellness.com", true },
{ "clearance365.co.uk", true },
@@ -7450,9 +7655,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "clearvoice.com", true },
{ "clemenscompanies.com", true },
{ "clement-beaufils.fr", true },
- { "clementfevrier.fr", true },
{ "cles-asso.fr", true },
{ "cles.jp", true },
+ { "clevergod.net", true },
{ "clevertarget.ru", true },
{ "clevisto.com", true },
{ "cleysense.com", true },
@@ -7461,6 +7666,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "clicecompre.com.br", true },
{ "clicheshishalounge.co.uk", true },
{ "click-licht.de", true },
+ { "click4web.com", true },
{ "clickclock.cc", true },
{ "clickenergy.com.au", true },
{ "clickphish.com", true },
@@ -7470,7 +7676,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "clientboss.com", true },
{ "clientsecure.me", true },
{ "clifflu.net", true },
- { "climaencusco.com", true },
{ "climaprecio.es", true },
{ "climateinteractive.org", true },
{ "climatestew.com", true },
@@ -7482,20 +7687,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "clinicaltrials.gov", true },
{ "clinicasmedicas.com.br", true },
{ "clinicminds.com", true },
+ { "cliniquecomplementaire.com", true },
{ "cliniquevethuy.be", true },
{ "clintonlibrary.gov", true },
{ "clintonplasticsurgery.com", true },
- { "clip.ovh", false },
{ "clipclip.com", true },
+ { "clippings.com", true },
{ "clive.io", true },
{ "clmde.de", true },
{ "clnc.to", true },
- { "clnet.com.au", true },
{ "clnnet.ch", true },
{ "cloaked.ch", true },
{ "clochix.net", true },
{ "clockcaster.com", true },
{ "clockworksms.com", true },
+ { "clod-hacking.com", true },
{ "clojurescript.ru", true },
{ "cloppenburg-autmobil.com", true },
{ "cloppenburg-automobil.com", true },
@@ -7511,7 +7717,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cloud.google.com", true },
{ "cloud.gov", true },
{ "cloud42.ch", false },
- { "cloud58.org", true },
{ "cloud9bouncycastlehire.com", true },
{ "cloudapps.digital", true },
{ "cloudbolin.es", true },
@@ -7519,11 +7724,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cloudbrothers.info", true },
{ "cloudcactuar.com", false },
{ "cloudcaprice.net", true },
+ { "cloudchart.site", true },
{ "cloudcite.net", true },
{ "cloudcloudcloud.cloud", true },
{ "cloudconsulting.net.za", true },
{ "cloudconsulting.org.za", true },
{ "cloudconsulting.web.za", true },
+ { "cloudcrux.net", true },
{ "cloudey.net", true },
{ "cloudfiles.at", true },
{ "cloudflare-dns.com", true },
@@ -7539,7 +7746,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cloudns.net", true },
{ "cloudoptimizedsmb.com", true },
{ "cloudoptimus.com", true },
- { "cloudpengu.in", true },
{ "cloudpipes.com", true },
{ "cloudse.co.uk", true },
{ "cloudsecurityalliance.org", true },
@@ -7564,6 +7770,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "clsoft.ch", true },
{ "clu-in.org", true },
{ "club-adulti.ro", true },
+ { "club-climate.com", true },
{ "club-corsicana.de", true },
{ "club-creole.com", true },
{ "club-duomo.com", true },
@@ -7582,26 +7789,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "clubfamily.de", true },
{ "clubgalaxy.futbol", true },
{ "clubiconkenosha.com", true },
+ { "clubmate.rocks", true },
{ "clubmini.jp", true },
{ "clubnoetig-ink2g.de", true },
{ "clubon.space", true },
{ "clubscannan.ie", true },
{ "clueful.ca", true },
+ { "clush.pw", true },
{ "cluster.biz.tr", true },
{ "clusteranalyse.net", true },
{ "clusterfuck.nz", true },
{ "clustermaze.net", true },
{ "clweb.ch", true },
{ "cm.center", true },
- { "cm3.pw", true },
{ "cmacacias.ch", true },
{ "cmadeangelis.it", true },
{ "cmahy.be", true },
{ "cmcressy.ch", true },
{ "cmdline.org", true },
+ { "cme-colleg.de", true },
{ "cmf.qc.ca", true },
{ "cmfaccounting.com", true },
{ "cmftech.com", true },
+ { "cmgacheatcontrol.com", true },
{ "cmillrehab.com", true },
{ "cmlachapelle.ch", true },
{ "cmlancy.ch", true },
@@ -7611,6 +7821,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cmngroup.com", true },
{ "cmngroupe.com", true },
{ "cmplainpalais.ch", true },
+ { "cms-weble.jp", true },
{ "cmskeyholding.co.uk", true },
{ "cmskeyholding.com", true },
{ "cmusical.es", true },
@@ -7620,13 +7831,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cna-aiic.ca", true },
{ "cna5.cc", true },
{ "cnam-idf.fr", true },
- { "cnam.net", true },
{ "cnatraining.network", true },
{ "cnbs.ch", true },
{ "cnc-lehrgang.de", true },
{ "cncado.net", true },
{ "cncbazar365.com", true },
- { "cncmachinemetal.com", true },
{ "cncrans.ch", true },
{ "cnet-hosting.com", true },
{ "cni-certing.it", true },
@@ -7635,6 +7844,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "co-factor.ro", true },
{ "co-founder-stuttgart.de", true },
{ "co.search.yahoo.com", false },
+ { "co2eco.cn", true },
{ "co50.com", true },
{ "coa.one", true },
{ "coachezmoi.ch", true },
@@ -7642,14 +7852,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "coaching-impulse.ch", true },
{ "coalitionministries.org", true },
{ "coalpointcottage.com", true },
+ { "coam.co", true },
{ "coastline.net.au", true },
{ "coathangastrangla.com", true },
{ "coathangastrangler.com", true },
+ { "coathangerstrangla.com", true },
+ { "coathangerstrangler.com", true },
{ "coatl-industries.com", true },
{ "cobalt.io", true },
{ "cobaltgp.com", true },
{ "cobracastles.co.uk", true },
- { "cocaine-import.agency", true },
{ "cocaine.ninja", true },
{ "cocalc.com", true },
{ "cocareonline.com", true },
@@ -7670,6 +7882,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "coda.moe", true },
{ "coda.today", true },
{ "coda.world", true },
+ { "codabix.com", true },
+ { "codabix.de", true },
{ "code-golf.io", true },
{ "code-poets.co.uk", true },
{ "code-well.com", true },
@@ -7680,16 +7894,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "code67.com", true },
{ "codeandpeace.com", true },
{ "codebrahma.com", false },
+ { "codecommunity.io", true },
+ { "codedelarouteenligne.fr", true },
{ "codedump.net", true },
{ "codeeclipse.com", true },
{ "codeferm.com", true },
{ "codefordus.de", true },
{ "codefordus.nrw", true },
+ { "codein.ca", true },
{ "codeine.co.uk", true },
{ "codeit.guru", true },
{ "codeit.us", true },
{ "codejots.com", true },
- { "codejunkie.de", false },
+ { "codemill.se", true },
{ "codemonster.eu", true },
{ "codenode.io", true },
{ "codeofthenorth.com", true },
@@ -7701,7 +7918,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "codereview.appspot.com", false },
{ "codereview.chromium.org", false },
{ "coderme.com", true },
- { "codersbistro.com", true },
+ { "coderware.co.uk", true },
{ "codes.pk", true },
{ "codesplain.in", true },
{ "codesport.io", true },
@@ -7714,6 +7931,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "codeux.com", false },
{ "codeux.info", false },
{ "codeux.net", false },
+ { "codevat.com", true },
{ "codeventure.de", true },
{ "codeversetech.com", true },
{ "codewild.de", true },
@@ -7723,6 +7941,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "codific.eu", true },
{ "codigo-bonus-bet.es", true },
{ "codigodelbonusbet365.com", true },
+ { "codigosddd.com.br", true },
+ { "codimaker.com", true },
{ "coding-minds.com", true },
{ "coding.lv", true },
{ "coding.net", true },
@@ -7734,14 +7954,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "codyevanscomputer.com", true },
{ "codymoniz.com", true },
{ "codyqx4.com", true },
+ { "codyscafesb.com", true },
+ { "coens.me.uk", true },
{ "coentropic.com", true },
+ { "cofbev.com", true },
{ "coffee-mamenoki.jp", true },
{ "coffeeandteabrothers.com", true },
+ { "coffeetime.fun", true },
{ "coffeetocode.me", true },
{ "cogala.eu", true },
{ "cogent.cc", true },
{ "cogilog.com", true },
{ "cogitoltd.com", true },
+ { "cognicom-gaming.com", true },
{ "cognitip.com", true },
{ "cognitivecomputingconsortium.com", true },
{ "cognitohq.com", true },
@@ -7761,14 +7986,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "coincoin.eu.org", true },
{ "coincolors.co", true },
{ "coindatabase.net", true },
+ { "coindeal.com", true },
{ "coinf.it", true },
{ "coinflux.com", true },
{ "coingate.com", true },
- { "coinjar-sandbox.com", true },
{ "coinlist.co", false },
{ "coinloan.io", true },
{ "coinmewallet.com", true },
{ "coinpit.io", true },
+ { "coinroom.com", true },
{ "coins2001.ru", true },
{ "coinx.pro", true },
{ "coisasdemulher.org", true },
@@ -7783,12 +8009,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "coldfff.com", false },
{ "coldhak.ca", true },
{ "coldstreamcreekfarm.com", true },
+ { "colectivointerconductual.com", true },
{ "colegiocierp.com.br", true },
{ "colemak.com", true },
{ "colengo.com", true },
{ "colf.online", true },
{ "colibris.xyz", true },
- { "colincampbell.me", true },
{ "colinchartier.com", true },
{ "colincogle.name", true },
{ "colinsnaith.co.uk", true },
@@ -7816,8 +8042,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "collectorsystems.com", true },
{ "collegeconnexxions.com.au", true },
{ "collegenavigator.gov", true },
- { "collegepaperworld.com", true },
{ "collegeprospectsofcentralindiana.com", true },
+ { "collegestationhomes.com", true },
{ "collinel-hossari.com", true },
{ "collinelhossari.com", true },
{ "collinklippel.com", true },
@@ -7835,14 +8061,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "coloristcafe.com", true },
{ "colorsbycarin.com", true },
{ "colossal-events.co.uk", true },
+ { "colotimes.com", true },
{ "colourfulcastles.co.uk", true },
{ "colpacpackaging.com", true },
{ "colson-occasions.be", true },
+ { "coltellisurvival.com", true },
{ "coltonrb.com", true },
{ "columbuswines.com", true },
{ "colyakootees.com", true },
{ "com-in.de", true },
- { "com-news.io", true },
{ "comalia.com", true },
{ "comandofilmes.club", true },
{ "comarkinstruments.net", true },
@@ -7859,11 +8086,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "comercialtpv.com", true },
{ "comerford.net", true },
{ "comestoarra.com", true },
- { "cometbot.cf", true },
{ "cometcache.com", true },
- { "comevius.com", true },
- { "comevius.org", true },
- { "comevius.xyz", true },
+ { "cometonovascotia.ca", true },
{ "comff.net", true },
{ "comfintouch.com", true },
{ "comflores.com.br", true },
@@ -7880,30 +8104,36 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "commechezvous.ch", true },
{ "commerce.gov", true },
{ "commercial-academy.fr", true },
+ { "commeunamour.com", true },
{ "commissionagenda.com", true },
{ "commitsandrebases.com", true },
{ "common.io", true },
{ "commoncode.com.au", true },
{ "commoncode.io", true },
{ "commoncore4kids.com", true },
+ { "commonspace.la", true },
{ "communityblog.fedoraproject.org", true },
{ "communitycodeofconduct.com", true },
{ "communityflow.info", true },
+ { "communitymanagertorrejon.com", true },
{ "communote.net", true },
{ "como-se-escribe.com", true },
+ { "comocurarlagastritis24.online", true },
{ "comocurarlagastritistratamientonatural.com", true },
{ "comodesinflamarlashemorroides.org", true },
- { "comodo.nl", true },
{ "comodormirmasrapido.com", true },
{ "comodosslstore.com", true },
{ "comoeliminarlaspapulasperladasenelglande.com", true },
{ "comogene.com", true },
{ "comohacerelamoraunhombrenet.com", true },
{ "comohacerpara.com", true },
+ { "comoimportar.net", true },
{ "comopuededejardefumar.net", true },
{ "comoquitarlacaspa24.com", true },
{ "comoquitarlasestriasrapidamente.com", true },
{ "comosatisfaceraunhombreenlacamaydejarloloco.com", true },
+ { "comosecarabarriga.net", true },
+ { "comoseduzir.net", true },
{ "comosefazisto.com.br", true },
{ "comp2go.com.au", true },
{ "compactchess.cc", true },
@@ -7929,6 +8159,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "completesecurityessex.com", true },
{ "completionist.me", true },
{ "complexart.ro", true },
+ { "complexorganizations.com", true },
{ "complexsystems.fail", true },
{ "compliance-management.ch", true },
{ "compliance-systeme.de", true },
@@ -7937,29 +8168,32 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "compostatebien.com.ar", true },
{ "compoundingrxusa.com", true },
{ "compraneta.com", false },
+ { "comprasoffie.com.br", true },
{ "compreautomacao.com.br", true },
{ "compree.com", true },
- { "compros.me", true },
{ "compservice.in.ua", true },
{ "comptu.com", true },
{ "compubench.com", true },
{ "compucastell.ch", true },
{ "compucorner.mx", true },
+ { "compunetwor.com", true },
{ "compuplast.cz", true },
- { "compusolve.nl", true },
{ "computehealth.com", true },
{ "computer-acquisti.com", true },
+ { "computer-science-schools.com", true },
{ "computeracademy.co.za", true },
{ "computerassistance.co.uk", true },
{ "computerbas.nl", true },
{ "computerbase.de", true },
+ { "computercamaccgi.com", true },
{ "computercraft.net", true },
+ { "computeremergency.com.au", false },
{ "computerhilfe-feucht.de", true },
{ "computernetwerkwestland.nl", true },
{ "computerslotopschool.nl", true },
{ "computersystems.guru", false },
- { "comssa.org.au", true },
{ "comunidadmontepinar.es", true },
+ { "comvos.de", true },
{ "comw.cc", true },
{ "conalcorp.com", true },
{ "conatus.ai", true },
@@ -7970,6 +8204,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "concertsenboite.fr", true },
{ "concertsto.com", true },
{ "conciliumnotaire.ca", true },
+ { "conclinica.com.br", true },
{ "concordsoftwareleasing.com", true },
{ "concretelevelingsystems.com", true },
{ "concreterepairatlanta.com", true },
@@ -7995,14 +8230,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "conejovalleyoutdoorlighting.com", true },
{ "conexiontransporte.com", true },
{ "confiancefoundation.org", true },
- { "confidential.network", true },
{ "config.schokokeks.org", false },
{ "confiwall.de", true },
{ "conformax.com.br", true },
{ "conformist.jp", true },
{ "confucio.cl", true },
- { "confuddledpenguin.com", true },
- { "cong5.net", true },
{ "congineer.com", true },
{ "congobunkering.com", true },
{ "conju.cat", true },
@@ -8026,18 +8258,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "connext.de", true },
{ "connictro.de", true },
{ "connorhatch.com", true },
+ { "connorsmith.co", true },
{ "connyduck.at", true },
{ "conociendosalama.com", true },
{ "conocimientosdigitales.com", true },
{ "conory.com", true },
- { "conpins.nl", true },
+ { "conpath.net", true },
{ "conrad-kostecki.de", true },
{ "conradkostecki.de", true },
+ { "conradsautotransmissionrepair.com", true },
{ "conrail.blue", true },
{ "consagracionamariasantisima.org", true },
{ "consciousbrand.co", true },
{ "consciouschoices.net", true },
{ "consec-systems.de", true },
+ { "consegnafioridomicilio.net", true },
{ "consejosdenutricion.com", true },
{ "consensoprivacy.it", true },
{ "conservados.com.br", true },
@@ -8050,18 +8285,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "console.rest", true },
{ "consommateuraverti.com", true },
{ "consonare.de", true },
+ { "conspiracyservers.com", true },
{ "constancechen.me", true },
{ "constant-rough.de", true },
{ "constares.de", true },
+ { "constitution.website", true },
{ "constructexpres.ro", true },
{ "constructieve.nl", true },
+ { "construction-colleges.com", true },
{ "construction-student.co.uk", true },
{ "constructionjobs.com", true },
+ { "constructive.men", true },
{ "consul.io", true },
{ "consulenza.pro", true },
{ "consultation.biz.tr", true },
{ "consultimator.com", true },
{ "consultimedia.de", true },
+ { "consultoriadeseguranca.com.br", true },
{ "consultoriosodontologicos.com.br", true },
{ "consultpetkov.com", true },
{ "consulvation.com", true },
@@ -8094,7 +8334,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "controle.net", true },
{ "controleer-maar-een-ander.nl", true },
{ "controltickets.com.br", true },
- { "contxt-agentur.de", true },
{ "conv2pdf.com", true },
{ "convergence.fi", true },
{ "convergencela.com", true },
@@ -8103,7 +8342,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "conversionsciences.com", true },
{ "convert.im", true },
{ "convert.zone", true },
- { "converter.ml", true },
{ "converticacommerce.com", false },
{ "convexset.org", true },
{ "convocatoriafundacionpepsicomexico.org", false },
@@ -8116,6 +8354,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cookiecrook.com", true },
{ "cookielab.io", true },
{ "cookiesoft.de", true },
+ { "cooking-sun.com", true },
{ "cookingcrusade.com", true },
{ "cookinglife.nl", false },
{ "cookingreporter.com", true },
@@ -8125,24 +8364,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cookwithmanali.com", true },
{ "cool-parties.co.uk", true },
{ "cool-wallpapers.jp", true },
+ { "cool.haus", true },
{ "cool110.tk", true },
{ "cool110.xyz", true },
{ "coolattractions.co.uk", true },
{ "coolbitx.com", true },
+ { "coolcamping.com", true },
{ "cooldan.com", true },
{ "coole-fete.de", true },
+ { "coolerssr.space", true },
{ "coolgifs.de", true },
{ "coolprylar.se", true },
- { "coolrc.me", true },
{ "coolviewthermostat.com", true },
{ "coolwallet.io", true },
{ "coonawarrawines.com.au", true },
{ "coopens.com", true },
+ { "cooperativehandmade.com", true },
+ { "cooperativehandmade.pe", true },
{ "coor.fun", true },
{ "coore.jp", true },
{ "coorpacademy.com", true },
{ "copdfoundation.org", true },
{ "copinstant.com", true },
+ { "copperandtileroofing.com", true },
{ "copperhead.co", true },
{ "copperheados.com", true },
{ "coppermein.co.za", true },
@@ -8154,10 +8398,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "copycaught.org", true },
{ "copycaught.xyz", true },
{ "copycrafter.net", true },
+ { "copydz.com", true },
{ "copypoison.com", true },
{ "copyright-watch.org", true },
{ "coquibus.net", true },
{ "corbi.net.au", true },
+ { "cordejong.nl", true },
{ "cordep.biz", true },
{ "corder.tech", true },
{ "cordeydesign.ch", true },
@@ -8195,7 +8441,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "corona-academy.com", true },
{ "corona-renderer.cloud", true },
{ "corona-renderer.com", true },
- { "coropiacere.org", true },
{ "corourbano.es", true },
{ "corpfin.net", true },
{ "corpio.nl", true },
@@ -8210,6 +8455,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "corpulantcoffee.com", true },
{ "corpulent.coffee", true },
{ "corpulentcoffee.com", true },
+ { "corpuschristisouthriver.org", true },
{ "corpusslayer.com", true },
{ "corrbee.com", true },
{ "correctiv.org", true },
@@ -8222,9 +8468,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cortexx.nl", true },
{ "cortis-consulting.ch", true },
{ "cortisolsupplement.com", true },
+ { "corvax.kiev.ua", true },
{ "corvus.eu.org", true },
{ "coryadum.com", true },
{ "corytyburski.com", true },
+ { "corzntin.fr", false },
{ "cosasque.com", true },
{ "cosciamoos.com", true },
{ "cosirex.com", true },
@@ -8235,32 +8483,39 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cosmeticasimple.com", true },
{ "cosmeticos-naturales.com", true },
{ "cosmeticosdelivery.com.br", true },
- { "cosmic-os.org", true },
{ "cosmicnavigator.com", true },
{ "cosmodacollection.com", true },
{ "cosmofunnel.com", true },
{ "cosmundi.de", true },
{ "cosni.co", true },
+ { "cosplayer.com", true },
{ "cospol.ch", true },
{ "costa-rica-reisen.ch", true },
{ "costa-rica-reisen.de", true },
{ "costablanca.villas", true },
{ "costablancavoorjou.com", true },
{ "costcofinance.com", true },
+ { "costcoinsider.com", true },
+ { "costellofc.co.uk", true },
{ "costinstefan.eu", true },
{ "costreportdata.com", false },
{ "costulessdirect.com", true },
{ "coteries.com", true },
+ { "cotoacc.com", true },
{ "cotonmusic.ch", true },
{ "cotta.dk", true },
{ "cotwe-ge.ch", true },
{ "cougar.dating", true },
+ { "counsellingtime.co.uk", true },
+ { "counsellingtime.com", true },
{ "counstellor.com", true },
{ "counter-team.ch", true },
{ "counterglobal.com", true },
- { "countermail.com", true },
+ { "counterhack.nl", true },
+ { "countermail.com", false },
{ "countermats.net", true },
{ "countersolutions.co.uk", true },
+ { "countetime.com", true },
{ "countingto.one", true },
{ "countryattire.com", true },
{ "countrybrewer.com.au", true },
@@ -8270,14 +8525,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "countyjailinmatesearch.com", true },
{ "coupe-bordure.com", true },
{ "couponcodesme.com", true },
+ { "cour4g3.me", true },
{ "couragefound.org", true },
- { "coursables.com", true },
{ "coursera.org", true },
{ "courtlistener.com", true },
{ "couscous.recipes", true },
- { "cousincouples.com", true },
+ { "cousincouples.com", false },
{ "coussinsky.net", true },
{ "couvreur-hinault.fr", true },
+ { "covaci.pro", true },
{ "covbounce.co.uk", true },
{ "covenantmatrix.com", true },
{ "covenantoftheriver.org", true },
@@ -8290,17 +8546,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cowbird.org", true },
{ "cowboyim.com", true },
{ "coweo.cz", true },
+ { "cowo.group", true },
+ { "coxcapitalmanagement.com", true },
{ "coxxs.me", true },
{ "coxxs.moe", true },
- { "cozitop.com.br", true },
{ "cozo.me", true },
{ "cozyeggdesigns.com", true },
{ "cp-st-martin.be", true },
{ "cpahunt.com", false },
+ { "cpasperdu.com", true },
{ "cpbapremiocaduceo.com.ar", true },
{ "cpcheats.co", true },
{ "cpd-education.co.uk", true },
+ { "cpe-colleg.de", true },
{ "cphpvb.net", true },
+ { "cplala.com", true },
{ "cplus.me", true },
{ "cplusplus.se", true },
{ "cppan.org", true },
@@ -8308,12 +8568,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cpqcol.gov.co", true },
{ "cprheartcenter.com", true },
{ "cprnearme.com", true },
+ { "cpsc.gov", true },
+ { "cptoon.com", true },
{ "cpu.biz.tr", true },
{ "cpvmatch.eu", true },
{ "cpy.pt", true },
{ "cqn.ch", true },
{ "cr.search.yahoo.com", false },
{ "cr0nus.net", true },
+ { "cr9499.com", true },
+ { "cra-bank.com", true },
+ { "cra-search.net", true },
+ { "craazzyman21.at", true },
{ "crackcat.de", true },
{ "cracker.in.th", true },
{ "crackle.io", true },
@@ -8327,18 +8593,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "craftinginredlipstick.com", true },
{ "craftist.de", true },
{ "craftsmandruggets.com", true },
+ { "craftsmany.net", true },
{ "craftyguy.net", true },
{ "craftyphotons.net", true },
{ "crag.com.tw", true },
{ "craigary.net", true },
{ "craigbates.co.uk", true },
{ "craigfrancis.co.uk", true },
+ { "craigleclaireteam.com", true },
{ "craigrouse.com", true },
{ "craigwfox.com", true },
{ "cralarm.de", true },
{ "crandall.io", true },
{ "cranforddental.com", true },
+ { "cranshafengin.com", true },
{ "crapouill.es", true },
+ { "cratss.co.uk", true },
{ "crawcial.de", true },
{ "crawford.cloud", true },
{ "crawfordcountytcc.org", true },
@@ -8346,6 +8616,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "crawler.ninja", true },
{ "crawleybouncycastles.co.uk", true },
{ "crawlspaceandbasementsolutions.com", true },
+ { "crazy-bulks.com", true },
+ { "crazy-cat.net", true },
{ "crazy-coders.com", true },
{ "crazycastles.ie", true },
{ "crazydomains.ae", true },
@@ -8357,7 +8629,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "crazynoisybizarre.town", true },
{ "crazypaul.com", true },
{ "crbug.com", true },
- { "crc-online.nl", true },
+ { "crc-bank.com", true },
+ { "crc-search.com", true },
{ "crdmendoza.net", true },
{ "crea-etc.net", true },
{ "crea-shops.ch", true },
@@ -8375,7 +8648,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "creative-wave.fr", true },
{ "creativebites.de", true },
{ "creativecaptiv.es", true },
- { "creativecommons.cl", true },
{ "creativecommons.gr", true },
{ "creativecommons.org", true },
{ "creativeconceptsvernon.com", true },
@@ -8391,9 +8663,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "creativeweb.biz", true },
{ "creativewolf.net", true },
{ "creativlabor.ch", true },
- { "creatixx-network.de", true },
+ { "creatixx-network.de", false },
{ "creators-design.com", true },
- { "creators.co", true },
{ "creators.direct", true },
{ "creatujoya.com", true },
{ "credential.eu", true },
@@ -8403,6 +8674,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "creditkarma.com", true },
{ "creditos-rapidos.com", true },
{ "creditproautos.com", false },
+ { "creditscoretalk.com", true },
+ { "creditta.com", true },
+ { "credittoken.io", true },
{ "creeks-coworking.com", true },
{ "creep.im", true },
{ "creepycraft.nl", true },
@@ -8430,31 +8704,33 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "criena.com", true },
{ "criena.net", true },
{ "crimefreeliving.com", true },
+ { "crimesolutions.gov", true },
{ "crimevictims.gov", true },
{ "criminal-attorney.ru", true },
{ "criminal.enterprises", true },
{ "crimson.no", true },
{ "crinesdanzantes.be", true },
- { "crip-usk.ba", true },
+ { "criptolog.com", true },
{ "crisisactual.com", true },
{ "crisisnextdoor.gov", true },
{ "crisp.chat", true },
{ "crisp.email", true },
{ "crisp.help", true },
{ "crisp.im", true },
+ { "crisp.watch", true },
{ "crispinusphotography.com", true },
{ "cristarta.com", true },
{ "cristau.org", true },
{ "cristiandeluxe.com", false },
+ { "critcola.com", true },
{ "critical.today", false },
{ "criticalsurveys.co.uk", true },
{ "crizin.io", true },
{ "crm.onlime.ch", false },
- { "croceverdevb.it", true },
+ { "crm114d.com", true },
{ "crochetnerd.com", true },
{ "croisedanslemetro.com", true },
{ "croixblanche-haguenau.fr", true },
- { "cromefire.myds.me", true },
{ "cronberg.ch", true },
{ "croncron.io", true },
{ "cronix.cc", true },
@@ -8478,7 +8754,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "crossorig.in", true },
{ "crossoverit.com", true },
{ "crosssellguide.com", true },
- { "crow.tw", true },
+ { "crowd.supply", true },
{ "crowdbox.net", true },
{ "crowdcloud.be", true },
{ "crowdliminal.com", true },
@@ -8492,51 +8768,51 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "croydonapartments.com.au", true },
{ "croydonbouncycastles.co.uk", true },
{ "crrev.com", true },
+ { "crsmsodry.cz", true },
{ "crstat.ru", true },
+ { "crt.cloud", true },
{ "crt.sh", true },
- { "crt2014-2024review.gov", true },
{ "crumbcontrol.com", true },
{ "crunchrapps.com", true },
{ "crunchy.rocks", true },
{ "crustytoothpaste.net", true },
{ "crute.me", true },
- { "cruzadobalcazarabogados.com", true },
{ "crvv.me", true },
{ "cry.nu", false },
{ "cryoit.com", true },
{ "cryothanasia.com", true },
{ "cryp.no", true },
{ "crypt.is-by.us", true },
+ { "cryptagio.com", true },
{ "cryptearth.de", true },
{ "crypted.chat", true },
{ "crypteianetworks.com", true },
{ "crypticshell.co.uk", true },
- { "crypto-navi.org", true },
+ { "crypto-armory.com", true },
{ "crypto.cat", false },
{ "crypto.graphics", true },
{ "crypto.is", false },
- { "crypto.tube", true },
{ "cryptobin.co", true },
{ "cryptocon.org", true },
- { "cryptodyno.ninja", true },
{ "cryptoegg.ca", true },
{ "cryptofan.org", true },
{ "cryptofrog.co", true },
{ "cryptography.ch", true },
{ "cryptography.io", true },
{ "cryptoguidemap.com", true },
+ { "cryptojacks.io", true },
{ "cryptojourney.com", true },
{ "cryptolinc.com", true },
{ "cryptology.ch", true },
{ "cryptolosophy.io", true },
{ "cryptolosophy.org", true },
+ { "cryptomaniaks.com", true },
{ "cryptonom.org", true },
{ "cryptonym.com", true },
{ "cryptoparty.at", true },
{ "cryptoparty.tv", true },
{ "cryptopartyutah.org", true },
{ "cryptophobia.nl", true },
- { "cryptopro.shop", true },
{ "cryptorival.com", true },
{ "cryptoseb.pw", true },
{ "cryptoshot.pw", true },
@@ -8548,6 +8824,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "crystalchandelierservices.com", true },
{ "crystalgrid.net", true },
{ "crystallizedcouture.com", true },
+ { "crystaloscillat.com", true },
{ "crystone.me", true },
{ "cryz.ru", true },
{ "cs2016.ch", true },
@@ -8560,41 +8837,40 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "csd-sevnica.si", true },
{ "csfcloud.com", true },
{ "csfd.cz", true },
- { "csfloors.co.uk", true },
{ "csfm.com", true },
- { "csgo.su", true },
+ { "csgo.su", false },
{ "csgoswap.com", true },
- { "csgotwister.com", true },
{ "csharpmarc.net", true },
- { "cshopify.com", true },
{ "csi.lk", true },
{ "csinterstargeneve.ch", true },
- { "cskdoc.com", true },
{ "cskentertainment.co.uk", true },
{ "csmainframe.com", true },
{ "csokolade.hu", true },
+ { "csovek-idomok.hu", true },
{ "csp.ch", true },
{ "cspeti.hu", true },
{ "cspvalidator.org", true },
{ "csrichter.com", true },
- { "csru.net", true },
{ "css.direct", false },
{ "css.net", true },
{ "cssai.eu", true },
{ "cssaunion.com", true },
{ "cstb.ch", true },
{ "cstp-marketing.com", true },
+ { "cstrong.nl", true },
{ "csu.st", true },
{ "csuw.net", true },
{ "csvalpha.nl", true },
{ "ct.search.yahoo.com", false },
{ "ctc-transportation.com", true },
+ { "ctcom-peru.com", true },
{ "ctcue.com", true },
{ "ctf.link", true },
- { "cthomas.work", true },
{ "cthulhuden.com", true },
{ "ctj.im", true },
+ { "ctkwwri.org", true },
{ "ctl.email", true },
+ { "ctliu.com", true },
{ "ctnguyen.de", true },
{ "ctnguyen.net", true },
{ "ctns.de", true },
@@ -8607,6 +8883,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cub-bouncingcastles.co.uk", true },
{ "cube-cloud.com", true },
{ "cube.de", true },
+ { "cubebot.io", true },
+ { "cubebuilders.net", true },
{ "cubecart-demo.co.uk", true },
{ "cubecart-hosting.co.uk", true },
{ "cubecraft.net", true },
@@ -8616,6 +8894,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cubia3.com", true },
{ "cubia4.com", true },
{ "cubile.xyz", true },
+ { "cubing.net", true },
{ "cublick.com", true },
{ "cubos.io", false },
{ "cubostecnologia.com", false },
@@ -8634,9 +8913,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cultivo.bio", true },
{ "cultofd50.org", true },
{ "cultofperf.org.uk", true },
+ { "culture-school.top", true },
{ "culturedcode.com", true },
{ "culturerain.com", true },
- { "cultureroll.com", true },
{ "culturesouthwest.org.uk", true },
{ "cumberlandrivertales.com", true },
{ "cumplegenial.com", true },
@@ -8653,7 +8932,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "curareldolordeespalda.com", true },
{ "curatedgeek.com", true },
{ "curbside.com", true },
- { "curia.fi", true },
{ "curieux.digital", true },
{ "curio-shiki.com", true },
{ "curiosity-driven.org", true },
@@ -8664,15 +8942,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "currentlyusa.com", true },
{ "currentobserver.com", true },
{ "currynissanmaparts.com", true },
+ { "cursos-trabajadores.net", true },
{ "cursos.com", true },
{ "cursosforex.com", true },
- { "cursosgratuitos.com.br", true },
{ "cursosingles.com", true },
{ "cursossena.co", true },
{ "cursuri-de-actorie.ro", true },
{ "curtacircuitos.com.br", false },
{ "curtis-smith.me.uk", true },
{ "curtis-smith.uk", true },
+ { "curtislaw-pllc.com", true },
{ "curtislinville.net", true },
{ "curtissmith.me.uk", true },
{ "curtissmith.uk", true },
@@ -8683,7 +8962,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "curveprotect.org", true },
{ "curvesandwords.com", true },
{ "curvissa.co.uk", true },
- { "custerweb.com", true },
{ "custodyxchange.com", true },
{ "custombikes.cl", true },
{ "customdissertation.com", true },
@@ -8692,7 +8970,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "customfitmarketing.com", true },
{ "customgear.com.au", true },
{ "customizeyoursink.com", true },
- { "customshort.link", true },
{ "customwritingservice.com", true },
{ "customwritten.com", true },
{ "cutephil.com", true },
@@ -8710,8 +8987,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cuvva.org", true },
{ "cuvva.uk", true },
{ "cuvva.us", true },
- { "cuxpool.club", true },
+ { "cuxpool.net", true },
{ "cvc.digital", true },
+ { "cvchomes.com", true },
{ "cvcoders.com", true },
{ "cve-le-carrousel.ch", true },
{ "cviip.ca", true },
@@ -8721,16 +8999,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cvlibrary.co.uk", true },
{ "cvmu.jp", true },
{ "cvninja.pl", true },
- { "cvps.top", true },
{ "cvr.dk", true },
{ "cvursache.com", true },
- { "cvv.cn", true },
{ "cw.center", true },
{ "cwagner.me", true },
{ "cwarren.org", true },
{ "cwbrtrust.ca", true },
{ "cwgaming.co.uk", true },
- { "cwinfo.fi", true },
{ "cwmart.in", true },
{ "cwrau.com", true },
{ "cwrau.de", true },
@@ -8741,8 +9016,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cwrau.rocks", true },
{ "cwrau.tech", true },
{ "cwrcoding.com", true },
+ { "cxadd.com", true },
{ "cy.ax", true },
- { "cyber.cafe", true },
+ { "cyanghost.com", true },
{ "cyber.je", true },
{ "cyberatlantis.com", true },
{ "cybercareers.gov", true },
@@ -8750,17 +9026,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cybercrew.cc", true },
{ "cybercrime-forschung.de", true },
{ "cybercrime.gov", true },
- { "cybercymru.co.uk", true },
+ { "cyberdos.de", false },
{ "cyberduck.io", true },
{ "cyberexplained.info", true },
- { "cyberfrancais.ro", true },
{ "cybergrx.com", true },
{ "cyberguerrilla.info", true },
{ "cyberguerrilla.org", true },
{ "cyberianhusky.com", true },
{ "cyberkov.com", true },
+ { "cyberlab.kiev.ua", false },
{ "cyberlightapp.com", true },
{ "cybermeldpunt.nl", true },
+ { "cyberogism.com", true },
{ "cyberoptic.de", true },
{ "cyberphaze.com", true },
{ "cyberpioneer.net", false },
@@ -8771,17 +9048,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cyberscan.io", true },
{ "cybersecurity.nz", true },
{ "cybersecurity.run", true },
- { "cybersecuritychallenge.be", true },
+ { "cybersecuritychallenge.be", false },
{ "cybersecurityketen.nl", true },
{ "cyberseguranca.com.br", true },
{ "cybersins.com", true },
- { "cybersmart.co.uk", true },
{ "cybersmartdefence.com", true },
{ "cyberspace.community", true },
{ "cyberspect.com", true },
{ "cyberspect.io", true },
{ "cyberstatus.de", true },
- { "cybertorsk.org", true },
{ "cybertu.be", true },
{ "cyberwars.dk", true },
{ "cyberwire.nl", true },
@@ -8794,6 +9069,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cybrary.it", true },
{ "cyclebeads.com", true },
{ "cycleluxembourg.lu", true },
+ { "cyclinggoodso.com", true },
{ "cyclisjumper.gallery", true },
{ "cyclop-editorial.fr", true },
{ "cydetec.com", true },
@@ -8804,6 +9080,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cygnius.net", true },
{ "cyhour.com", true },
{ "cykelbanor.se", true },
+ { "cylindehea.com", true },
{ "cylindricity.com", true },
{ "cyon.ch", true },
{ "cypad.cn", true },
@@ -8816,7 +9093,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cyph.me", true },
{ "cyph.video", true },
{ "cyph.ws", true },
- { "cypherpunk.at", true },
{ "cypherpunk.observer", true },
{ "cypresslegacy.com", true },
{ "cyprus-company-service.com", true },
@@ -8826,12 +9102,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "cytech.com.tr", true },
{ "cytegic-update-packages.com", true },
{ "cyumus.com", true },
- { "cyyzaid.cn", true },
+ { "cyyzaid.cn", false },
{ "czakey.net", true },
{ "czbix.com", true },
{ "czbtm.com", true },
{ "czc.cz", true },
{ "czechamlp.com", true },
+ { "czechcrystals.co.uk", true },
{ "czechvirus.cz", true },
{ "czerno.com", true },
{ "czfa.pl", true },
@@ -8847,22 +9124,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "d-training.de", true },
{ "d.nf", true },
{ "d.nr", true },
- { "d.rip", true },
{ "d00d.de", true },
{ "d0g.cc", true },
{ "d0m41n.name", true },
{ "d0xq.com", true },
+ { "d2ph.com", true },
{ "d2s.uk", true },
{ "d3lab.net", true },
- { "d3njjcbhbojbot.cloudfront.net", true },
{ "d3xt3r01.tk", true },
{ "d3xx3r.de", true },
{ "d42.no", true },
+ { "d4wson.com", true },
{ "d4x.de", true },
{ "d66.nl", true },
+ { "d6c5yfulmsbv6.cloudfront.net", true },
{ "d8.io", true },
+ { "d88688.com", true },
+ { "d88871.com", true },
+ { "d88988.com", true },
{ "da-ist-kunst.de", true },
- { "da.hn", true },
{ "da42foripad.com", true },
{ "daallexx.eu", true },
{ "dabasstacija.lv", true },
@@ -8883,7 +9163,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "daduke.org", true },
{ "daemen.org", true },
{ "daemon.xin", true },
- { "daemonslayer.net", true },
{ "daemwool.ch", true },
{ "daevel.fr", true },
{ "dafnik.me", true },
@@ -8893,15 +9172,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dag-konsult.com", true },
{ "dagensannonser.se", true },
{ "dagmar2018.cz", true },
- { "dahlberg.cologne", true },
{ "dai.top", true },
{ "daigakujuken-plus.com", true },
{ "daikoz.com", true },
{ "dailybits.be", true },
{ "dailyblogged.com", true },
+ { "dailyemailinboxing.com", true },
{ "dailyenglishchallenge.com", true },
{ "dailyhealthguard.com", true },
{ "dailykos.com", true },
+ { "dailyrover.com", true },
{ "dailyxenang.com", true },
{ "daintymeal.com", true },
{ "dairyshrine.org", true },
@@ -8909,8 +9189,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "daisidaniels.co.uk", true },
{ "daisy-peanut.com", true },
{ "daisypeanut.com", true },
- { "daiwai.de", false },
{ "daiweihu.com", true },
+ { "daiyuu.jp", true },
+ { "dajiadu.net", true },
{ "dak.org", true },
{ "daknob.net", true },
{ "daktarisys.com", true },
@@ -8918,7 +9199,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dalaran.city", true },
{ "dalb.in", true },
{ "dale-electric.com", true },
- { "dalek.co.nz", true },
{ "dalepresencia.com", true },
{ "dalfsennet.nl", true },
{ "dalingk.com", true },
@@ -8930,9 +9210,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "damasexpress.com", true },
{ "damedrogy.cz", true },
{ "damghaem.ir", true },
- { "damicris.ro", true },
{ "damienoreilly.org", true },
- { "damienpontifex.com", true },
+ { "damienpontifex.com", false },
{ "daminiphysio.ca", true },
{ "damip.net", true },
{ "dammekens.be", true },
@@ -8947,16 +9226,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "danandrum.com", true },
{ "danarozmarin.com", true },
{ "danbaldwinart.com", true },
+ { "danburycampervans.co.uk", true },
+ { "dance-colleges.com", true },
{ "danchen.org", true },
{ "dancingcubs.co.uk", true },
{ "dancingshiva.at", true },
{ "dandenongroadapartments.com.au", true },
{ "daneandthepain.com", true },
+ { "dangr.zone", true },
{ "danhalliday.com", true },
{ "danholloway.online", true },
{ "daniel-baumann.ch", true },
{ "daniel-cholewa.de", true },
- { "daniel-du.com", true },
{ "daniel-kulbe.de", true },
{ "daniel-milnes.uk", true },
{ "daniel-ruf.de", true },
@@ -8966,17 +9247,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "danielehniss.de", true },
{ "danielepestilli.com", true },
{ "danielgorr.de", true },
+ { "danielheal.net", true },
{ "danielhinterlechner.eu", true },
{ "danielhochleitner.de", true },
{ "danieljamesscott.org", true },
{ "danieljireh.com", true },
+ { "danieljstevens.com", true },
{ "danielkoster.nl", true },
{ "daniellockyer.com", true },
+ { "danielmarquard.com", false },
{ "danielmartin.de", true },
{ "danielmoch.com", true },
{ "danielmorell.com", true },
{ "danielmostertman.com", true },
{ "danielmostertman.nl", true },
+ { "danielnaaman.com", true },
+ { "danielnaaman.net", true },
+ { "danielnaaman.org", true },
{ "danielpeukert.cz", true },
{ "danielran.com", true },
{ "danielrozenberg.com", true },
@@ -8986,6 +9273,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "danielstach.cz", true },
{ "danielsteiner.net", true },
{ "danielstiner.me", true },
+ { "danielthompson.info", true },
{ "danieltollot.de", true },
{ "danielvoogsgerd.nl", true },
{ "danielwildhaber.ch", true },
@@ -8994,7 +9282,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "danjesensky.com", true },
{ "dank.ninja", true },
{ "dankim.de", false },
- { "dankredues.com", true },
{ "danla.nl", true },
{ "danmaby.com", true },
{ "danmarksbedstefredagsbar.dk", true },
@@ -9002,6 +9289,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "danmassarano.com", true },
{ "danminkevitch.com", true },
{ "danna-salary.com", true },
+ { "dannhanks.com", true },
{ "danny-tittel.de", true },
{ "danny.fm", true },
{ "dannycairns.com", true },
@@ -9012,6 +9300,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "danoz.net", true },
{ "danpiel.net", true },
{ "dansa.com.co", true },
+ { "dansage.co", true },
{ "danscomp.com", true },
{ "dansdiscounttools.com", true },
{ "danselibre.net", true },
@@ -9028,11 +9317,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "danwin1210.me", true },
{ "danwolff.se", true },
{ "danyabanya.com", true },
+ { "danzac.com", true },
{ "dao.spb.su", true },
{ "daoro.net", true },
{ "daphne.informatik.uni-freiburg.de", true },
- { "dapim.co.il", true },
{ "daplie.com", true },
+ { "dapps.earth", true },
{ "dappworld.com", true },
{ "daracokorilo.com", true },
{ "daravk.ch", true },
@@ -9047,10 +9337,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "darinkotter.com", true },
{ "darioackermann.ch", true },
{ "darioclip.com", true },
+ { "dariosirangelo.me", true },
{ "darioturchetti.me", true },
{ "darisni.me", true },
{ "dark-infection.de", true },
{ "dark-vision.cz", true },
+ { "dark.ninja", true },
{ "darkag.ovh", true },
{ "darkcores.net", true },
{ "darkengine.io", true },
@@ -9058,7 +9350,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "darkerlystormy.com", true },
{ "darkerstormy.com", true },
{ "darkeststar.org", true },
- { "darkfire.ch", true },
+ { "darklaunch.com", true },
{ "darknessflickers.com", true },
{ "darknetlive.com", true },
{ "darknight.blog", true },
@@ -9071,19 +9363,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "darkspacelab.com", true },
{ "darktime.ru", true },
{ "darkwater.info", true },
+ { "darkwebnews.com", true },
{ "darkx.me", true },
{ "darom.jp", true },
{ "darookee.net", false },
{ "daropia.org", true },
{ "darshnam.com", true },
- { "dart-tanke.com", true },
- { "dart-tanke.de", true },
{ "dartcode.org", true },
{ "dartetdemetiers.fr", true },
{ "darth-sonic.de", true },
{ "dartsdon.jp", true },
{ "dartshopmn.nl", true },
{ "darwinkel.net", true },
+ { "darwinsearch.org", true },
{ "daryl.moe", true },
{ "darylcumbo.net", true },
{ "das-forum24.de", true },
@@ -9099,9 +9391,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dasteichwerk.at", true },
{ "dasug.de", true },
{ "data-detox.de", true },
+ { "data-wing.ga", true },
{ "data.gov", true },
{ "data.govt.nz", true },
{ "data.world", true },
+ { "data3w.nl", true },
{ "databionix.com", true },
{ "databutlr.com", true },
{ "databutlr.net", true },
@@ -9112,13 +9406,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "datadyne.technology", true },
{ "dataformers.at", true },
{ "datagrail.io", true },
+ { "dataguidance.com", true },
{ "dataharvest.at", true },
{ "datahoarder.xyz", true },
+ { "datajobs.ai", true },
{ "datakick.org", true },
{ "datalife.gr", true },
{ "datalysis.ch", true },
{ "dataprotectionadvisors.com", true },
- { "datapun.ch", true },
{ "datapure.net", true },
{ "dataregister.info", true },
{ "datascience.cafe", true },
@@ -9167,10 +9462,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "davetempleton.com", true },
{ "davevelopment.net", true },
{ "davewardle.com", true },
- { "davewut.ca", true },
{ "david-corry.com", true },
{ "david-jeffery.co.uk", true },
- { "david-mallett.com", true },
{ "david-pearce.com", true },
{ "david-reess.de", true },
{ "david-schiffmann.de", true },
@@ -9181,12 +9474,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "davidbuckell.com", true },
{ "davidcrx.net", true },
{ "daviddever.net", true },
+ { "davidfetveit.com", true },
{ "davidforward.com", true },
{ "davidforward.net", true },
{ "davidfrancoeur.com", true },
{ "davidgouveia.net", true },
{ "davidgow.net", true },
{ "davidhanle.com", true },
+ { "davidkennardphotography.com", true },
{ "davidking.xyz", true },
{ "davidlamprea.com", true },
{ "davidlane.io", true },
@@ -9205,9 +9500,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "davidtiffany.com", true },
{ "davidundetiwan.com", true },
{ "davie3.com", true },
+ { "davisdieselandautorepair.com", true },
{ "davisroi.com", true },
{ "davo-usedcars.be", true },
{ "davy-server.com", true },
+ { "davypropper.com", true },
+ { "daw.nz", true },
{ "dawena.de", true },
{ "dawgs.ga", true },
{ "dawnbringer.eu", true },
@@ -9223,12 +9521,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "daymprove.life", true },
{ "dayofdays.be", true },
{ "daysoftheyear.com", true },
- { "db-sanity.com", true },
{ "db-works.nl", true },
{ "dbapress.org", true },
{ "dbaron.org", true },
{ "dbas.cz", true },
- { "dbcom.ru", true },
{ "dbdc.us", true },
{ "dbentertainment.co.uk", true },
{ "dbgamestudio.com", true },
@@ -9246,10 +9542,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dc-elektro.de", true },
{ "dc-elektro.eu", true },
{ "dc-occasies.be", true },
- { "dc-solution.de", true },
+ { "dc-solution.de", false },
{ "dc1.com.br", true },
{ "dc562.org", true },
{ "dc585.info", true },
+ { "dcain.me", true },
{ "dcards.in.th", true },
{ "dcautomacao.com.br", true },
{ "dcbouncycastles.co.uk", true },
@@ -9259,7 +9556,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dchatelain.ch", true },
{ "dchest.org", true },
{ "dckd.nl", true },
- { "dcl.re", true },
{ "dclaisse.fr", true },
{ "dcmt.co", true },
{ "dcpower.eu", true },
@@ -9272,6 +9568,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ddhosted.com", true },
{ "ddns-test.de", true },
{ "ddnsweb.com", true },
+ { "ddoser.cn", true },
+ { "ddosolitary.org", true },
{ "ddproxy.cf", true },
{ "ddracepro.net", true },
{ "dds.mil", true },
@@ -9287,7 +9585,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "deadinsi.de", true },
{ "deaf.dating", true },
{ "deaf.eu.org", true },
- { "deai-life.biz", true },
{ "dealapp.nl", true },
{ "dealbanana.at", true },
{ "dealbanana.be", true },
@@ -9315,6 +9612,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dearfcc.net", true },
{ "dearfcc.org", true },
{ "dearktiel.nl", true },
+ { "dearly.com", true },
{ "dearnevalleybouncycastles.co.uk", true },
{ "deathofspring.com", true },
{ "deathy.ro", true },
@@ -9336,7 +9634,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "debuis.nl", true },
{ "decaffeinated.io", true },
{ "decalquai.ch", true },
+ { "decay24.de", true },
{ "dechat.nl", true },
+ { "decher.de", true },
{ "decidetreatment.org", true },
{ "decis.fr", true },
{ "decisivetactics.com", true },
@@ -9346,6 +9646,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "decodeanddestroy.com", true },
{ "decoder.link", true },
{ "decompiled.de", true },
+ { "decoora.com", true },
{ "decor-d.com", true },
{ "decoratingadvice.co.uk", true },
{ "decoratore.roma.it", true },
@@ -9360,6 +9661,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dedelta.net", true },
{ "dedg3.com", true },
{ "dedge.org", true },
+ { "dedicatedtowomenobgyn.com", true },
{ "dedimax.de", true },
{ "dedmorozrzn.ru", false },
{ "deduijventil.nl", true },
@@ -9367,20 +9669,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dee.su", true },
{ "deechtebakkers.nl", true },
{ "deegeeinflatables.co.uk", true },
+ { "deejayevents.ro", true },
{ "deelmijnreis.nl", true },
+ { "deep-chess.com", true },
+ { "deep.club", true },
{ "deepaero.com", true },
+ { "deeparamaraj.com", true },
{ "deepbluecrafting.co.uk", true },
+ { "deepblueemail.com", true },
{ "deepcode.io", true },
{ "deeperxh.com", true },
{ "deephill.com", true },
+ { "deepinsight.io", true },
{ "deeployr.io", true },
{ "deepserve.info", true },
- { "deepsouthsounds.com", true },
{ "deepspace.dedyn.io", true },
{ "deepwealth.institute", true },
{ "deepz.pt", true },
{ "deepzz.com", true },
- { "deezeno.com", true },
{ "def-pos.ru", true },
{ "defcon.org", true },
{ "defcongroups.org", true },
@@ -9407,6 +9713,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "degeberg.com", true },
{ "degeberg.dk", true },
{ "degen-elektrotechnik.de", true },
+ { "degestamptepot.nl", true },
{ "degosoft.nl", true },
{ "degoulet.net", true },
{ "degracetechnologie.com", true },
@@ -9427,6 +9734,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dekeurslagers.nl", true },
{ "dekka.cz", true },
{ "dekkercreativedesign.nl", true },
+ { "dekko.io", true },
{ "dekoh-shouyu.com", true },
{ "dekonix.ru", true },
{ "dekulk.nl", true },
@@ -9434,7 +9742,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "delbecqvo.be", true },
{ "delbrouck.ch", true },
{ "deleidscheflesch.nl", true },
- { "delf.co.jp", true },
{ "delfic.org", true },
{ "delfino.cr", true },
{ "delhionlinegifts.com", true },
@@ -9447,6 +9754,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "delid.cz", true },
{ "delitto.top", true },
{ "delivery.co.at", true },
+ { "deliveryiquique.cl", true },
{ "dellipaoli.com", true },
{ "delogo.nl", true },
{ "delorenzi.dk", true },
@@ -9463,12 +9771,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "deltasigmachi.org", true },
{ "deltasmart.ch", true },
{ "deltava.org", true },
+ { "demarle.ch", true },
{ "dementiapraecox.de", true },
{ "demeyere-usedcars.be", true },
{ "demfloro.ru", true },
{ "demijn.nl", true },
{ "demilletech.net", true },
+ { "demiranda.com", true },
{ "demmer.one", true },
+ { "demo.swedbank.se", true },
+ { "demo9.ovh", true },
{ "democracychronicles.com", true },
{ "democracyineurope.eu", true },
{ "democraziaineuropa.eu", true },
@@ -9504,8 +9816,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dennogumi.org", true },
{ "denous.nl", true },
{ "dent.uy", true },
+ { "dental-colleges.com", true },
{ "dentallaborgeraeteservice.de", true },
- { "dentfix.ro", true },
+ { "dentfix.ro", false },
{ "dentistesdarveauetrioux.com", true },
{ "dentistglasgow.com", true },
{ "dentrassi.de", true },
@@ -9513,15 +9826,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "denwauranailab.com", true },
{ "deontology.com", true },
{ "depaddestoeltjes.be", true },
+ { "depannage-traceur.fr", true },
{ "deparis.me", true },
{ "depechemode-live.com", true },
- { "depedtayo.com", true },
{ "depicus.com", true },
{ "depone.net", true },
{ "depot-leipzig.de", true },
{ "depotsquarekerrville.com", true },
{ "depotter-usedcars.be", true },
{ "deprecate.de", true },
+ { "depth-co.jp", true },
{ "depthe.gr", true },
{ "der-bank-blog.de", true },
{ "der-fliesenzauberer.de", true },
@@ -9529,6 +9843,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "der-lan.de", true },
{ "der-rudi.eu", true },
{ "der-stein-fluesterer.de", true },
+ { "der-windows-papst.de", true },
{ "derattizzazione.name", true },
{ "derattizzazioni.biz", true },
{ "derattizzazioni.milano.it", true },
@@ -9548,6 +9863,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "derkuki.de", true },
{ "derma-expert.eu", true },
{ "dermapuur.nl", true },
+ { "dermato.floripa.br", true },
{ "dermatologie-morges.ch", true },
{ "dermediq.nl", true },
{ "dermot.org.uk", true },
@@ -9566,7 +9882,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "desec.io", true },
{ "desertsounds.org", true },
{ "desgenst.ch", true },
- { "design-fu.com", false },
{ "design-in-bad.eu", true },
{ "design-tooning.de", true },
{ "designdevs.eu", true },
@@ -9583,6 +9898,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "desila.jp", true },
{ "deskdesign.nl", true },
{ "deskeen.fr", true },
+ { "desktopd.eu.org", true },
{ "desktopfx.net", false },
{ "deskture.com", true },
{ "deskvip.com", true },
@@ -9598,12 +9914,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "destileria.net.br", true },
{ "destinationsofnewyorkstate.com", true },
{ "destinattorneyjohngreene.com", true },
- { "destinopiriapolis.com", true },
+ { "destinoytarot.com", true },
+ { "destinyofthephoenix.me", false },
{ "desu.ne.jp", true },
+ { "desuchan.eu", true },
+ { "desuchan.org", true },
{ "desuperheroes.co", true },
{ "det-te.ch", true },
{ "detalika.ru", true },
- { "detalyedesigngroup.com", true },
+ { "detecmon.com", true },
{ "detectify.com", false },
{ "detectivedesk.com.au", true },
{ "detekenmuze.nl", true },
@@ -9618,6 +9937,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "detski.center", true },
{ "detskysad.com", true },
{ "detuinmuze.nl", true },
+ { "detuprovincia.cl", true },
{ "detype.nl", true },
{ "deuchnord.fr", true },
{ "deude.de", true },
@@ -9633,41 +9953,42 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "deutschland-dsl.de", true },
{ "deuxmetrescubes.fr", true },
{ "dev-brandywineglobal.com", true },
+ { "dev-gutools.co.uk", true },
{ "dev-pulse-mtn.pantheonsite.io", true },
{ "dev-tek.de", true },
{ "devagency.fr", true },
{ "devalps.eu", true },
{ "devb.nl", true },
- { "devcast.io", true },
+ { "devcast.io", false },
{ "devcf.com", true },
{ "devct.cz", false },
{ "devcu.com", true },
{ "devcu.net", true },
- { "devdoodle.net", true },
{ "devel.cz", true },
{ "develerik.com", false },
+ { "develop.cool", true },
{ "developer.android.com", true },
{ "developer.mydigipass.com", false },
{ "developerdan.com", true },
{ "developerfair.com", true },
+ { "developermail.io", false },
{ "developers.facebook.com", false },
{ "developfx.com", true },
{ "developmentaid.org", true },
{ "developmentsites.melbourne", true },
{ "develops.co.il", true },
+ { "developyourelement.com", true },
{ "develux.com", true },
{ "develux.net", true },
- { "devenney.io", true },
{ "devh.net", true },
{ "deviant.email", true },
{ "devillers-occasions.be", true },
{ "devilshakerz.com", true },
- { "deviltracks.net", true },
{ "deviltraxxx.de", true },
{ "devinfo.net", false },
{ "devirc.net", true },
+ { "deviser.wang", true },
{ "devisnow.fr", true },
- { "devjack.de", true },
{ "devkid.net", true },
{ "devkit.cc", false },
{ "devklog.net", true },
@@ -9679,24 +10000,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "devonsawatzky.ca", true },
{ "devopers.com.br", true },
{ "devops-survey.com", true },
- { "devpgsv.com", true },
{ "devpsy.info", true },
{ "devragu.com", true },
{ "devrandom.net", true },
{ "devsjournal.com", true },
{ "devsrvr.ru", true },
{ "devstaff.gr", true },
- { "devyn.ca", true },
+ { "devstroke.io", true },
+ { "devtty.org", true },
+ { "devyn.ca", false },
{ "devzero.io", true },
{ "dewaard.de", true },
{ "dewalch.net", true },
{ "dewapress.com", true },
- { "dewebwerf.nl", true },
{ "dewinter.com", true },
{ "dex.top", true },
{ "dexalo.de", true },
{ "dexigner.com", true },
{ "deyute.com", true },
+ { "dez-online.de", true },
{ "dezeregio.nl", true },
{ "dezet-ev.de", true },
{ "dezintranet.com", true },
@@ -9704,8 +10026,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dezshop24.de", true },
{ "df1paw.de", true },
{ "dfctaiwan.org", true },
- { "dfekt.no", true },
- { "dfektlan.no", true },
{ "dfl.mn", true },
{ "dflcares.com", true },
{ "dfmn.berlin", true },
@@ -9730,19 +10050,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dhinflatables.co.uk", true },
{ "dhlinux.org", true },
{ "dhome.at", true },
- { "dhuy.net", true },
+ { "dhuy.net", false },
{ "di2pra.com", true },
{ "di2pra.fr", true },
+ { "dia-de.com", true },
{ "diablovalleytech.com", true },
{ "diadorafitness.es", true },
{ "diadorafitness.it", true },
- { "diagnocentro.cl", true },
{ "diagnostix.org", true },
{ "diagrammingoutloud.co.uk", true },
{ "dialapicnic.co.za", true },
- { "dialectic-og.com", true },
+ { "dialoegue.com", true },
{ "diamante.ro", true },
{ "diamantovaburza.cz", true },
+ { "diamond-hairstyle.dk", true },
{ "diamondsleepsolutions.com", true },
{ "diamondyze.nl", true },
{ "diamorphine.com", true },
@@ -9760,21 +10081,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "diccionariodedudas.com", true },
{ "dice.tokyo", true },
{ "dicelab.co.uk", true },
+ { "dicesites.com", true },
{ "dicionario.org", true },
{ "dicionariodegirias.com.br", true },
{ "dicionariodelatim.com.br", true },
{ "dicionariodenomesproprios.com.br", true },
{ "dicionariodesimbolos.com.br", true },
{ "dicionarioetimologico.com.br", true },
+ { "dicionariofinanceiro.com", true },
+ { "dicionariopopular.com", true },
{ "dickieslife.com", true },
{ "dickpics.ru", true },
+ { "dicksakowicz.com", true },
{ "dicoding.com", true },
{ "dictionaryofnumbers.com", true },
{ "dictzone.com", true },
{ "didacte.com", true },
{ "didche.net", true },
{ "diddens.de", true },
+ { "dideeducacion.com", true },
+ { "didefamilia.com", true },
+ { "didesalud.com", true },
{ "didierghez.com", true },
+ { "didigotoffer.com", true },
{ "didikhari.web.id", true },
{ "die-bergfuehrer.de", true },
{ "die-blahuts.de", true },
@@ -9784,7 +10113,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "die-seide.de", true },
{ "die-sinlosen.de", true },
{ "die-speisekammer-reutlingen.de", true },
- { "diedrich.co", false },
+ { "diedrich.co", true },
{ "diedrich.me", true },
{ "dieecpd.org", true },
{ "diegelernten.de", true },
@@ -9799,7 +10128,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dienchaninstitute.com", true },
{ "dienstplan.cc", true },
{ "dienstplan.one", true },
- { "dierabenmutti.de", true },
{ "dierenartsdeconinck.be", true },
{ "dieselanimals.lt", true },
{ "dieselgalleri.com", true },
@@ -9835,6 +10163,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "digilicious.com", true },
{ "digimagical.com", true },
{ "digimedia.cd", false },
+ { "digimomedia.co.uk", true },
{ "digioccumss.ddns.net", true },
{ "digipitch.com", true },
{ "digired.ro", true },
@@ -9851,8 +10180,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "digitalcash.cf", true },
{ "digitalcitizen.life", true },
{ "digitalcitizen.ro", true },
+ { "digitalcraftmarketing.co.uk", true },
{ "digitalcreationclass.com", true },
- { "digitalcuko.com", true },
{ "digitaldashboard.gov", true },
{ "digitaldatacenter.net", true },
{ "digitaldeli.com", true },
@@ -9873,7 +10202,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "digitalhabit.at", true },
{ "digitalhabitat.io", true },
{ "digitalliteracy.gov", true },
- { "digitalmaniac.co.uk", true },
{ "digitalmarketingindallas.com", true },
{ "digitalrights.center", true },
{ "digitalrights.fund", true },
@@ -9882,14 +10210,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "digitaltcertifikat.dk", true },
{ "digitaltechnologies.ltd.uk", true },
{ "digitkon.com", true },
+ { "digitreads.com", true },
{ "digminecraft.com", true },
{ "digwp.com", true },
{ "dihesan.com", true },
+ { "dijitaller.com", true },
{ "dijkmanmuziek.nl", false },
{ "dijkmanvandoorn.nl", false },
{ "diju.ch", true },
{ "dildoexperten.se", true },
- { "dilichen.fr", true },
{ "diligo.ch", true },
{ "dillewijnzwapak.nl", true },
{ "dillonkorman.com", true },
@@ -9898,10 +10227,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dimdom.com.br", true },
{ "dime-staging.com", true },
{ "dime.io", true },
- { "dimensionen.de", true },
{ "dimeponline.com.br", true },
{ "dimeshop.nl", true },
{ "dimez.ru", true },
+ { "dimiskovska.de", true },
+ { "dimitrihomes.com", true },
{ "dimmersagourahills.com", true },
{ "dimmerscalabasas.com", true },
{ "dimmersdosvientos.com", true },
@@ -9912,22 +10242,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dimmersthousandoaks.com", true },
{ "dimmerswestlakevillage.com", true },
{ "dimonb.com", true },
- { "dimseklubben.dk", true },
{ "din-hkd.jp", true },
{ "dineachook.com.au", true },
{ "dinepont.fr", true },
+ { "dinerroboticurology.com", true },
{ "dingcc.me", true },
+ { "dinge.xyz", true },
{ "dingsbums.shop", true },
+ { "dinheirolucrar.com", true },
{ "dinkommunikasjon.no", true },
{ "dinmtb.dk", true },
{ "dinocarrozzeria.com", true },
{ "dinotopia.org.uk", true },
+ { "dinstec.cl", true },
{ "dintrafic.net", true },
{ "diodeled.com", true },
{ "diodo.me", true },
{ "dionysos-ios.gr", true },
{ "diozoid.com", true },
{ "dipalma.me", true },
+ { "dipdaq.com", true },
{ "dipling.de", true },
{ "diplona.de", true },
{ "dipulse.it", true },
@@ -9935,10 +10269,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dir2epub.org", true },
{ "dirba.io", true },
{ "direct-sel.com", true },
+ { "direct2uk.com", false },
+ { "direct365.es", true },
{ "directebanking.com", true },
{ "directelectricalltd.co.uk", true },
{ "directlinkfunding.co.uk", true },
- { "directme.ga", true },
{ "directnews.be", true },
{ "directorioz.com", true },
{ "directreal.sk", true },
@@ -9946,23 +10281,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "direktvermarktung-schmitzberger.at", true },
{ "dirips.com", true },
{ "dirk-scheele.de", true },
+ { "dirk-weise.de", false },
{ "dirkdoering.de", true },
{ "dirkjonker.nl", true },
{ "dirko.net", true },
{ "dirtcraft.ca", true },
{ "dirtygeek.ovh", true },
+ { "dirtyincest.com", true },
+ { "dirtyprettyartwear.com", true },
{ "disability.gov", true },
{ "disabled.dating", true },
{ "disanteimpianti.com", true },
{ "disavow.tools", true },
{ "disc.uz", true },
+ { "discarica.bologna.it", true },
{ "discarica.it", true },
{ "discarica.roma.it", true },
{ "discha.net", true },
{ "dischempharmacie.com", true },
{ "disciples.io", true },
{ "disciplina.io", true },
- { "discipul.nl", true },
{ "discofitta.com", true },
{ "disconformity.net", true },
{ "discord.gg", true },
@@ -9972,6 +10310,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "discordia.me", true },
{ "discotek.club", true },
{ "discount24.de", true },
+ { "discountlumberspokane.com", true },
{ "discountplush.com", true },
{ "discover-shaken.com", true },
{ "discoverthreejs.com", true },
@@ -9980,6 +10319,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "discoveryottawa.ca", true },
{ "discoveryrom.org", true },
{ "discreet-condooms.nl", true },
+ { "discrypt.ca", true },
{ "dise-online.de", true },
{ "disinclined.org", true },
{ "disinfesta.it", true },
@@ -9988,6 +10328,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "disinfestazione.brescia.it", true },
{ "disinfestazione.torino.it", true },
{ "disinfestazione.venezia.it", true },
+ { "disinfestazione.verona.it", true },
{ "disinfestazione24.it", true },
{ "disinfestazioneblatte.it", true },
{ "disinfestazionecimici.roma.it", true },
@@ -9997,6 +10338,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "disinfestazioni.catania.it", true },
{ "disinfestazioni.firenze.it", true },
{ "disinfestazioni.genova.it", true },
+ { "disinfestazioni.gorizia.it", true },
{ "disinfestazioni.info", true },
{ "disinfestazioni.milano.it", true },
{ "disinfestazioni.net", true },
@@ -10010,6 +10352,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "disinfestazionivespe.milano.it", true },
{ "disinfestazionizanzare.milano.it", true },
{ "disinisharing.com", true },
+ { "disk.do", true },
{ "diskbit.com", true },
{ "diskbit.nl", true },
{ "dismail.de", true },
@@ -10027,6 +10370,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "distillery.com", true },
{ "distinguishedprisoner.com", true },
{ "distribuidoracristal.com.br", true },
+ { "distribuidoraplus.com", true },
{ "distribuidorveterinario.es", true },
{ "distro.re", true },
{ "ditelbat.com", true },
@@ -10035,10 +10379,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "diva.nl", true },
{ "divari.nl", true },
{ "divcoder.com", true },
+ { "dive-japan.com", true },
{ "divedowntown.com", true },
{ "divegearexpress.com", true },
{ "diveidc.com", true },
- { "divenwa.com", true },
{ "diveplan.org", true },
{ "divergenz.org", true },
{ "diversityflags.com", true },
@@ -10048,9 +10392,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "divi-experte.de", true },
{ "divinasaiamodas.com.br", true },
{ "divinegames.studio", true },
+ { "divinemercyparishvld.com", true },
+ { "divinemercyparishvlds.com", true },
{ "diving.photo", true },
+ { "divorcelawyersformen.com", true },
{ "divorciosmurcia.com", true },
- { "divvymonkey.com", true },
{ "diwei.vip", true },
{ "dixi.fi", true },
{ "dixibox.com", true },
@@ -10070,18 +10416,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "djangoproject.com", true },
{ "djangosnippets.org", true },
{ "djbbouncycastles.co.uk", true },
+ { "djboekingskantoor.nl", true },
{ "djc.me", true },
{ "djcursuszwolle.nl", true },
{ "djdavid98.hu", true },
- { "djieno.com", true },
+ { "djeung.org", true },
{ "djipanov.com", true },
{ "djleon.net", true },
{ "djlive.pl", true },
{ "djlnetworks.co.uk", true },
{ "djsbouncycastlehire.com", true },
- { "djsk.nl", true },
{ "djt-vom-chausseehaus.de", true },
{ "djursland-psykologen.dk", true },
+ { "djvintagevinyl.nl", true },
{ "djwaynepryke.com", true },
{ "djz4music.com", false },
{ "dk-kromeriz.cz", true },
@@ -10089,7 +10436,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dk.search.yahoo.com", false },
{ "dkcomputers.com.au", true },
{ "dkds.us", true },
- { "dkn.go.id", false },
+ { "dko-steiermark.ml", true },
{ "dkstage.com", true },
{ "dl.google.com", true },
{ "dlabouncycastlehire.co.uk", true },
@@ -10100,7 +10447,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dlitz.net", true },
{ "dll4free.com", true },
{ "dlld.com", true },
- { "dlouwrink.nl", true },
{ "dlrsp.org", true },
{ "dlscomputers.com.au", true },
{ "dlui.xyz", true },
@@ -10118,10 +10464,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dmd.lv", true },
{ "dmdd.org.uk", true },
{ "dmeevalumate.com", true },
+ { "dmess.ru", true },
{ "dmi.es", true },
{ "dmitry.sh", true },
{ "dmmkenya.co.ke", true },
{ "dmmultionderhoud.nl", true },
+ { "dmparish.com", true },
{ "dmschilderwerken.nl", true },
{ "dmx.xyz", true },
{ "dmxledlights.com", true },
@@ -10132,14 +10480,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dnacloud.pl", true },
{ "dnakids.co.uk", true },
{ "dnc.org.nz", true },
+ { "dndesign.be", true },
{ "dndtools.net", true },
{ "dne.lu", true },
- { "dnfc.rocks", true },
{ "dnlr.tech", true },
{ "dnmlab.it", true },
{ "dnplegal.com", true },
{ "dns-control.eu", true },
- { "dns-manager.info", true },
{ "dns-swiss.ch", true },
{ "dns.google.com", true },
{ "dns8.online", true },
@@ -10149,6 +10496,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dnscurve.io", true },
{ "dnshallinta.fi", true },
{ "dnsinfo.ml", true },
+ { "dnsipv6.srv.br", true },
{ "dnsman.se", true },
{ "dnspod.ml", true },
{ "dnstwister.report", true },
@@ -10173,12 +10521,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dochimera.com", true },
{ "dochitaceahlau.ro", true },
{ "dockerbook.com", false },
- { "dockerm.com", true },
{ "dockerup.net", true },
{ "docline.gov", true },
{ "docloh.de", true },
{ "docloudu.info", true },
- { "docplexus.org", true },
{ "docs.google.com", false },
{ "docs.python.org", true },
{ "docs.re", true },
@@ -10195,17 +10541,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "docucopies.com", true },
{ "documaniatv.com", true },
{ "docupet.com", true },
+ { "docusearch.com", true },
{ "dodopri.com", true },
{ "doenjoylife.com", true },
{ "does.one", true },
{ "doesburg-comp.nl", true },
{ "dofuspvp.com", true },
+ { "dofux.org", true },
{ "dog-blum.com", true },
{ "dogadayiz.net", true },
{ "dogan.ch", false },
{ "dogcontrol.ca", true },
+ { "doge.me", true },
+ { "doge.town", true },
{ "dogear.ch", true },
- { "dogfi.sh", true },
{ "dogft.com", true },
{ "doggedbyirs.com", true },
{ "doggroomingcourse.com", true },
@@ -10216,17 +10565,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dogworld.com.br", true },
{ "dohanews.co", true },
{ "doihavetoputonpants.com", true },
+ { "doitauto.de", true },
+ { "dojozendebourges.fr", true },
+ { "dokan-e.com", false },
{ "dokelio-idf.fr", true },
+ { "doki.space", true },
{ "dokipy.no", true },
- { "dokspot.cf", true },
- { "dokspot.ga", true },
{ "doku-gilde.de", true },
{ "dokuboard.com", true },
{ "dokuraum.de", true },
{ "dolci-delizie.de", true },
+ { "doleta.gov", true },
{ "doli.se", true },
{ "dolice.net", true },
+ { "dolinathome.com", true },
{ "dollemore.com", true },
+ { "dollhousetoyo.com", true },
{ "dolorism.com", true },
{ "dolphin-it.de", true },
{ "dom-medicina.ru", true },
@@ -10246,12 +10600,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "domains.motorcycles", true },
{ "domains.yachts", true },
{ "domainsilk.com", true },
+ { "domainspeicher.one", true },
{ "domainstaff.com", true },
{ "domainwatch.me", true },
+ { "domakidis.com", true },
{ "domaxpoker.com", true },
{ "domeconseil.fr", true },
{ "domein-direct.nl", true },
{ "domenic.me", true },
+ { "domenicam.com", true },
{ "domesticcleaners.co.uk", true },
{ "domhaase.me", true },
{ "domian.cz", true },
@@ -10276,6 +10633,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "domster.com", true },
{ "domus-global.com", true },
{ "domus-global.cz", true },
+ { "domwkwiatach.pl", true },
{ "domyassignments.com", true },
{ "domycasestudy.com", true },
{ "domycoursework.com", true },
@@ -10289,16 +10647,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "domyresearchpaper.com", true },
{ "domyreview.net", true },
{ "domyspeech.com", true },
- { "domytermpaper.com", true },
{ "domythesis.net", true },
{ "domyzitrka.cz", true },
{ "donabeneko.jp", true },
+ { "donaldm.co.uk", true },
{ "donateaday.net", true },
{ "donfelino.tk", false },
{ "dongxuwang.com", true },
{ "donkennedyandsons.com", true },
{ "donkeytrekkingkefalonia.com", true },
- { "donlydental.ca", true },
{ "donmaldeamores.com", true },
{ "donna-bellini-business-fotografie-muenchen.de", true },
{ "donna-bellini-fotografie-berlin.de", true },
@@ -10312,14 +10669,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "donna-bellini-fotografie-wien.de", true },
{ "donna-bellini-hochzeitsfotograf-frankfurt.de", true },
{ "donna-bellini-hochzeitsfotograf-muenchen.de", true },
+ { "donnaandscottmcelweerealestate.com", true },
{ "donnacha.blog", true },
{ "donnachie.net", true },
+ { "donner-reuschel.de", true },
{ "donnons.org", false },
- { "donnoval.ru", true },
+ { "donnoval.ru", false },
+ { "donotcall.gov", true },
{ "donotlink.it", true },
{ "donovand.info", true },
+ { "donovankraag.nl", true },
{ "donpomodoro.com.co", true },
- { "donsbach-edv.de", true },
{ "dont.re", true },
{ "dont.watch", true },
{ "dontbubble.me", true },
@@ -10337,9 +10697,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "doomus.me", true },
{ "doooooops.com", true },
{ "doop.im", true },
- { "doopdidoop.com", true },
{ "doordecor.bg", true },
{ "doorflow.com", true },
+ { "doorhandlese.com", true },
+ { "doorshingekit.com", true },
{ "dopesoft.de", true },
{ "dopfer-fenstertechnik.de", true },
{ "doppenpost.nl", true },
@@ -10362,6 +10723,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dormiu.com", true },
{ "dormiu.com.br", true },
{ "dornhecker.me", true },
+ { "dorpshuis-dwarsgracht.nl", true },
{ "dorpshuiskesteren.nl", true },
{ "dorquelle.com", true },
{ "dorsetentertainments.co.uk", true },
@@ -10373,7 +10735,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "doska.ru", true },
{ "dosomeworks.biz", true },
{ "dossplumbing.co.za", true },
+ { "dostalsecurity.com", true },
{ "dostlar.fr", true },
+ { "dostrece.net", true },
{ "dosvientoselectric.com", true },
{ "dosvientoselectrical.com", true },
{ "dosvientoselectrician.com", true },
@@ -10383,7 +10747,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dosvientosoutdoorlighting.com", true },
{ "doswap.com", true },
{ "dosyauzantisi.com", true },
- { "dot.ro", true },
{ "dot42.no", true },
{ "dota2huds.com", true },
{ "dotacni-parazit.cz", true },
@@ -10404,20 +10767,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dotphoto.com", true },
{ "dotplex.com", true },
{ "dotplex.de", true },
+ { "dotrox.net", true },
{ "dotshule.ug", true },
{ "dotsiam.co.th", true },
{ "dotsiam.com", true },
{ "dotsiam.in.th", true },
{ "douai.me", true },
+ { "doubleaste.com", true },
{ "doubleavineyards.com", true },
{ "doublefun.net", true },
{ "doublestat.me", true },
{ "doubleup.com.au", true },
{ "doucheba.gs", false },
{ "dougley.com", true },
+ { "dougsautobody.com", true },
{ "doujinshi.info", true },
{ "dounats.com", true },
{ "douzer.de", true },
+ { "douzer.industries", true },
{ "dovenzorgmalawi.nl", true },
{ "dovro.de", true },
{ "dowell.media", true },
@@ -10433,7 +10800,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "downtimerobot.com", true },
{ "downtimerobot.nl", true },
{ "downtownvernon.com", true },
- { "doxcelerate.com", false },
{ "doyoucheck.com", false },
{ "doyouedc.com", true },
{ "doyoutax.com", true },
@@ -10447,12 +10813,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dpg.no", true },
{ "dpi-design.de", true },
{ "dpisecuretests.com", true },
+ { "dpm-ident.de", true },
{ "dprb.biz", true },
{ "dprd-wonogirikab.go.id", false },
- { "dpsg-roden.de", false },
+ { "dpsg-roden.de", true },
{ "dpwsweeps.co.uk", true },
{ "dr-becarelli-philippe.chirurgiens-dentistes.fr", true },
{ "dr-bodendorf.de", true },
+ { "dr-it.co.uk", true },
{ "dr-jakob-zahnaerzte.de", true },
{ "dr-klotz.info", true },
{ "dr-krebs.net", true },
@@ -10465,7 +10833,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dr-stoetter.de", true },
{ "dr-www.de", true },
{ "drabadir.com", true },
- { "drabben.be", true },
{ "drabim.org", true },
{ "drach.xyz", true },
{ "drachenleder.de", true },
@@ -10475,7 +10842,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "draftguru.com.au", true },
{ "drafton.com", true },
{ "drageeparadise.fr", true },
- { "dragfiles.com", true },
{ "draghetti.it", true },
{ "draghive.asia", true },
{ "draghive.ca", true },
@@ -10484,10 +10850,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "draghive.com", true },
{ "draghive.org", true },
{ "draghive.tv", true },
- { "dragon-aspect.com", true },
{ "dragon-chem.eu", true },
{ "dragon-hearts.co.uk", true },
{ "dragoncave.me", true },
+ { "dragonclean.gr", true },
{ "dragonfly.co.uk", true },
{ "dragonheartsrpg.com", true },
{ "dragonkin.net", true },
@@ -10500,22 +10866,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dragonsunited.info", true },
{ "dragonsunited.net", true },
{ "dragonsunited.org", true },
+ { "dragonwolfpackaquaria.com", true },
{ "dragonwork.me", true },
{ "drahcro.uk", true },
+ { "drakecommercial.com", true },
{ "drakeluce.com", true },
{ "drakenson.de", true },
{ "dramaticpeople.com", true },
{ "dramyalderman.com", true },
{ "dranderle.com", true },
{ "dranek.com", true },
+ { "dras.hu", true },
{ "draugr.de", true },
{ "draw.uy", true },
{ "drawesome.uy", true },
{ "drawingcode.net", true },
+ { "drawtwo.gg", true },
{ "drawxp.com", true },
- { "drbarnabus.com", true },
{ "drbethanybarnes.com", true },
{ "drbriones.com", true },
+ { "drcarolynquist.com", true },
{ "drchrislivingston.com", true },
{ "drchristinehatfield.ca", true },
{ "drchristophepanthier.com", true },
@@ -10524,21 +10894,37 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "drdipilla.com", true },
{ "dreamcreator108.com", true },
{ "dreamday-with-dreamcar.de", true },
+ { "dreamdivers.com", true },
{ "dreamersgiftshopec.com", true },
{ "dreamhack.com", true },
{ "dreamhostremixer.com", true },
{ "dreamithost.com.au", true },
+ { "dreamkitchenbath.com", true },
{ "dreamlandmagic.com", true },
{ "dreamlinehost.com", false },
+ { "dreamlordpress.it", true },
{ "dreamlux.cz", true },
{ "dreamlux.sk", true },
+ { "dreammaker-nw.com", true },
+ { "dreammakerremodelil.com", true },
+ { "dreammakerutah.com", true },
+ { "dreamof.net", true },
{ "dreamonkey.com", true },
{ "dreamrae.net", true },
{ "dreamtechie.com", true },
+ { "dreatho.com", true },
{ "drei01.com", true },
{ "drei01.de", true },
{ "dreid.org", true },
{ "dreiweiden.de", true },
+ { "dresden-kaffee-24.de", true },
+ { "dresden-kaffeeroesterei.de", true },
+ { "dresdener-mandelstollen.de", true },
+ { "dresdens-pfefferkuchenprinzessin.de", true },
+ { "dresdner-christstollen-von-reimann.de", true },
+ { "dresdner-kaffeeroesterei.de", true },
+ { "dresdner-mandelstollen.de", true },
+ { "dresdner-stollen.shop", true },
{ "dress-cons.com", true },
{ "dressify.co", true },
{ "dressify.in", true },
@@ -10565,20 +10951,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "drillion.net", true },
{ "drillshackresort.com", true },
{ "drinkcontrolapp.com", true },
+ { "drinkgas-jihlava.cz", true },
{ "drinkplanet.eu", true },
- { "dripdoctors.com", true },
{ "drive.google.com", false },
{ "driven2shine.eu", true },
{ "drivenes.net", true },
{ "driver.ru", true },
{ "driver61.com", true },
- { "drivercopilot.com", true },
{ "driverless.id", true },
{ "driverprofiler.co.uk", true },
{ "driverscollection.com", true },
{ "drivinghorror.com", true },
{ "drivingtestpro.com", true },
+ { "drivinhors.com", true },
{ "drivya.com", true },
+ { "drizz.com.br", false },
{ "drjacquesmalan.com", true },
{ "drjenafernandez.com", true },
{ "drjoe.ca", true },
@@ -10588,6 +10975,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "drkmtrx.xyz", true },
{ "drlandis.com", true },
{ "drlangsdon.com", true },
+ { "drlinkcheck.com", true },
+ { "drlutfi.com", true },
{ "drmayakato.com", true },
{ "drmcdaniel.com", true },
{ "drms.us", true },
@@ -10607,6 +10996,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "droni.cz", true },
{ "dronnet.com", false },
{ "dronografia.es", true },
+ { "dronova-art.ru", true },
{ "drop.com", true },
{ "dropbox.com", true },
{ "dropboxer.net", true },
@@ -10614,18 +11004,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dropq.nl", true },
{ "dropscloud.spdns.de", true },
{ "dropshare.cloud", true },
+ { "dropshell.net", true },
{ "droso.dk", true },
{ "drown.photography", true },
+ { "drpetervoigt.ddns.net", true },
{ "drpetervoigt.de", true },
{ "drpico.com.au", true },
{ "drrodina.com", true },
{ "drrr.chat", true },
{ "drrr.wiki", true },
{ "drsajjadian.com", true },
+ { "drsamuelkoo.com", true },
+ { "drschlarb.eu", true },
{ "drschruefer.de", true },
{ "drsturgeonfreitas.com", true },
{ "drtimmarch.com", true },
- { "drubn.de", false },
{ "druckerei-huesgen.de", true },
{ "drugs.com", true },
{ "drumbe.at", true },
@@ -10639,6 +11032,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "druwe.net", true },
{ "druznek.me", true },
{ "drvr.xyz", true },
+ { "drwang.group", true },
{ "drweissbrot.net", true },
{ "drwxr.org", true },
{ "dryan.com", false },
@@ -10648,20 +11042,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "drydrydry.com", true },
{ "dryerventcleaningarlington.com", true },
{ "dryerventcleaningcarrollton.com", true },
+ { "drywallresponse.gov", true },
{ "ds67.de", true },
{ "dsancomics.com", true },
{ "dsanraffleshangbai.xyz", true },
{ "dsayce.com", true },
{ "dsbrowser.com", true },
+ { "dschwarzachtaler.de", true },
{ "dsdalismerkezi.com", true },
{ "dsebastien.net", true },
{ "dsektionen.se", false },
{ "dsgarms.com", true },
{ "dsgholsters.com", true },
+ { "dsgnet.hu", true },
{ "dsgvo.name", true },
{ "dshield.org", true },
{ "dsm5.com", true },
{ "dsmjs.com", true },
+ { "dsne.com.mx", true },
{ "dso-imaging.co.uk", true },
{ "dso-izlake.si", true },
{ "dsol.hu", true },
@@ -10669,6 +11067,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dssale.com", true },
{ "dstamou.de", true },
{ "dsteiner.at", true },
+ { "dstvinstallalberton.co.za", true },
+ { "dstvinstallrandburg.co.za", true },
{ "dt27.org", true },
{ "dtbouncycastles.co.uk", true },
{ "dtdsh.com", true },
@@ -10681,14 +11081,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dtnx.eu", true },
{ "dtnx.net", true },
{ "dtnx.org", true },
+ { "dtoweb.be", true },
+ { "dtp-mstdn.jp", false },
+ { "dtpak.cz", true },
{ "dtuaarsfest.dk", true },
{ "dtx.sk", true },
- { "dualias.xyz", false },
+ { "dualascent.com", true },
{ "dub.cz", true },
{ "dubai-company.ae", true },
{ "dubaieveningsafari.com", true },
- { "dubaosheng.com", true },
{ "dubbingkursus.dk", true },
+ { "dubious-website.com", true },
{ "dublin-traceroute.net", true },
{ "dubrovnik-dental.clinic", true },
{ "dubrovskiy.net", true },
@@ -10701,6 +11104,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "duckbase.com", true },
{ "duckduck.horse", true },
{ "duckduckstart.com", true },
+ { "duckeight.win", true },
{ "duckinc.net", true },
{ "duct.me", true },
{ "due-diligence-security.com", true },
@@ -10715,6 +11119,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dugnet.org", false },
{ "dugunedavet.com", true },
{ "duh.se", true },
+ { "dui805.com", true },
{ "duijf.info", true },
{ "duijfathome.nl", true },
{ "duitang.com", true },
@@ -10755,13 +11160,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dupree.pe", true },
{ "durand.tf", true },
{ "durbanlocksmiths.co.za", true },
- { "durchblick-shop.de", true },
{ "durdle.com", true },
{ "dureuil.info", true },
- { "durexwinkel.nl", true },
{ "durfteparticiperen.nl", true },
{ "duria.de", true },
{ "duriaux-dentiste.ch", true },
+ { "duroterm.ro", true },
{ "durys.be", true },
{ "dusmomente.com", true },
{ "dusnan.com", true },
@@ -10776,11 +11180,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dutchrank.nl", true },
{ "dutchwanderers.nl", true },
{ "dutchweballiance.nl", true },
+ { "duval.paris", true },
{ "dv189.com", true },
{ "dvbris.co.uk", true },
{ "dvbris.com", true },
+ { "dvdinmotion.com", true },
{ "dvdland.com.au", true },
{ "dvhosting.be", true },
+ { "dvipadmin.com", true },
{ "dvnatura.ch", true },
{ "dvorupotocnych.sk", true },
{ "dvwc.org", true },
@@ -10804,12 +11211,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dybuster.es", true },
{ "dybuster.it", true },
{ "dybuster.se", true },
- { "dycoa.com", true },
{ "dyeager.org", true },
{ "dyktig.as", true },
{ "dyktig.no", true },
{ "dylanboudro.com", true },
- { "dylancl.cf", true },
{ "dylankatz.com", true },
{ "dylanknoll.ca", true },
{ "dylanspcrepairs.com", true },
@@ -10834,6 +11239,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dynamics-365.no", true },
{ "dynamics365.no", true },
{ "dynamicsnetwork.net", true },
+ { "dynamicsretailnotes.com", true },
{ "dynamictostatic.com", true },
{ "dynamicyou.co.uk", true },
{ "dynamo.city", true },
@@ -10853,11 +11259,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "dzet.de", true },
{ "dziary.com", true },
{ "dziurdzia.pl", true },
- { "dzndk.com", true },
- { "dzndk.net", true },
- { "dzndk.org", true },
+ { "dzivniekubriviba.lv", true },
{ "dznn.nl", true },
{ "dzomo.org", true },
+ { "dzsibi.com", true },
{ "dzsula.hu", true },
{ "dzyabchenko.com", true },
{ "dzyszla.pl", true },
@@ -10875,12 +11280,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "e-lambre.com", true },
{ "e-learningbs.com", true },
{ "e-lifetechnology.com", true },
- { "e-planetelec.fr", false },
{ "e-ptn.com", true },
+ { "e-speak24.pl", true },
{ "e-standardstore.org", true },
{ "e-surveillant.nl", true },
{ "e-sw.co.jp", true },
{ "e-teacher.pl", true },
+ { "e-teachers.me", true },
{ "e-tech-solution.com", true },
{ "e-tech-solution.net", true },
{ "e-techsolution.com", true },
@@ -10894,20 +11300,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "e-worksmedia.com", true },
{ "e.mail.ru", true },
{ "e11even.nl", false },
- { "e1488.com", true },
{ "e15r.co", true },
{ "e2feed.com", true },
{ "e30.ee", true },
{ "e4metech.com", true },
- { "e52888.com", true },
- { "e52888.net", true },
- { "e53888.com", true },
- { "e53888.net", true },
- { "e59888.com", true },
- { "e59888.net", true },
{ "e5tv.hu", true },
{ "e64.com", true },
{ "e6e.io", true },
+ { "e6ex.com", true },
{ "e7d.io", true },
{ "e7fun.net", true },
{ "e965.ru", true },
@@ -10927,6 +11327,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ealev.de", true },
{ "eapestudioweb.com", true },
{ "earl.org.uk", true },
+ { "earlydocs.com", true },
{ "earlyyearshub.com", true },
{ "earmarks.gov", true },
{ "earn.com", true },
@@ -10937,14 +11338,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "easez.net", true },
{ "eashwar.com", true },
{ "eason-yang.com", true },
- { "east-line.su", true },
{ "eastarm.net", true },
{ "eastblue.org", true },
- { "eastcoastbubbleandbounce.co.uk", true },
{ "easterncapebirding.co.za", true },
{ "eastlothianbouncycastles.co.uk", true },
{ "eastmanbusinessinstitute.com", true },
+ { "eastnorschool.co.uk", true },
+ { "eastplan.co.kr", true },
{ "eastsidecottages.co.uk", true },
+ { "eastsideroofingcontractor.com", true },
{ "easy-rpg.org", false },
{ "easy2bathe.co.uk", true },
{ "easyadsnbanners.tk", false },
@@ -10954,24 +11356,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "easycosmetic.ch", true },
{ "easycup.com", false },
{ "easydumpsterrental.com", false },
+ { "easyeigo.com", true },
{ "easyfiles.ch", true },
{ "easyhaul.com", true },
- { "easykraamzorg.nl", false },
+ { "easymun.com", true },
{ "easyocm.hu", true },
{ "easyoutdoor.nl", true },
{ "easypay.bg", true },
{ "easyproperty.com", true },
- { "easypv.ch", true },
{ "easyqr.codes", true },
{ "easyroad.fr", true },
- { "easyschools.org", true },
{ "easyslide.be", true },
{ "easyssl.com.cn", true },
{ "easystore.co", true },
{ "easytechguides.com", true },
{ "easytechsecurity.com", true },
{ "easyweenies.com", true },
- { "eat-mine.ml", true },
{ "eat-sleep-code.com", true },
{ "eatery.co.il", true },
{ "eatmebudapest.hu", true },
@@ -10979,6 +11379,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eatry.io", true },
{ "eatsleeprepeat.net", true },
{ "eatson.com", true },
+ { "eatz-and-treatz.com", true },
{ "eatz.com", true },
{ "eaucube.com", true },
{ "eauxdespleiades.ch", true },
@@ -10994,9 +11395,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ebayinc.com", true },
{ "ebaymotorssucks.com", true },
{ "ebene-bpo.com", true },
+ { "ebenezersbarnandgrill.com", true },
+ { "ebenvloedaanleggen.nl", true },
{ "ebermannstadt.de", false },
{ "eberwe.in", true },
{ "ebest.co.jp", true },
+ { "ebiebievidence.com", true },
{ "ebiografia.com", true },
{ "ebisi.be", true },
{ "ebizarts.com", true },
@@ -11006,6 +11410,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ebooklaunchers.com", true },
{ "eboutic.ch", true },
{ "eboyer.com", true },
+ { "ebpglobal.com", false },
{ "ebrnd.de", true },
{ "ec-baran.de", true },
{ "ec-current.com", true },
@@ -11020,13 +11425,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ecclesia-koeln.de", true },
{ "ecco-verde.com", false },
{ "ecdn.cz", true },
- { "ecelembrou.ovh", true },
{ "ecfnorte.com.br", true },
{ "echatta.net", true },
{ "echatta.org", true },
{ "echidna-rocktools.eu", true },
{ "echo-security.co", true },
{ "echoanalytics.com", true },
+ { "echobridgepartners.com", true },
{ "echodio.com", true },
{ "echofoxtrot.co", true },
{ "echopaper.com", true },
@@ -11036,15 +11441,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "echoteam.gq", true },
{ "echoteen.com", true },
{ "echoworld.ch", true },
+ { "echtes-hutzelbrot.de", true },
+ { "echtgeld-casinos.de", true },
{ "ecir.pro", true },
{ "ecir.ru", true },
{ "ecirtam.net", true },
{ "eckel.co", true },
+ { "eclanet.ca", true },
{ "eclipse.ws", true },
+ { "ecliptic.cc", true },
{ "ecnetworker.com", true },
{ "eco-derattizzazione.it", true },
{ "eco-wiki.com", true },
{ "eco-work.it", true },
+ { "eco2u.ru", true },
{ "ecobee.com", false },
{ "ecobergerie.fr", true },
{ "ecobin.nl", true },
@@ -11073,6 +11483,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "economias.pt", true },
{ "economic-sanctions.com", true },
{ "economicinclusion.gov", true },
+ { "economics-colleges.com", true },
{ "economiefidu.ch", true },
{ "economies.ch", true },
{ "econsumer.gov", true },
@@ -11091,6 +11502,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ecpannualmeeting.com", true },
{ "ecrandouble.ch", true },
{ "ecupcafe.com", false },
+ { "ecuteam.com", true },
{ "ecxforum.com", true },
{ "ed.gs", true },
{ "ed4becky.net", true },
@@ -11103,6 +11515,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eddyn.net", true },
{ "edeca.net", true },
{ "edehsa.com", true },
+ { "eden.co.uk", true },
+ { "edenming.info", true },
{ "edesseglabor.hu", true },
{ "edfinancial.com", true },
{ "edge-cloud.net", true },
@@ -11111,8 +11525,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "edgevelder.com", true },
{ "edhesive.com", true },
{ "edholm.pub", true },
+ { "edi-gate.com", true },
+ { "edi-gate.de", true },
{ "edibarcode.com", true },
{ "edicct.com", true },
+ { "edilane.com", true },
+ { "edilane.de", true },
{ "edinburghsportsandoutdoorlearning.com", true },
{ "edincmovie.com", true },
{ "ediscomp.sk", true },
@@ -11132,7 +11550,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "edoss.co.za", true },
{ "edp-collaborative.com", true },
{ "edplan.io", true },
- { "edpubs.gov", false },
{ "edragneainpuscarie.ro", true },
{ "edsby.com", true },
{ "edservicing.com", true },
@@ -11151,6 +11568,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eductf.org", true },
{ "eduid.se", false },
{ "edumundo.nl", true },
+ { "edupool.in", true },
{ "eduroam.no", true },
{ "eduroam.uy", true },
{ "edusanjal.com", true },
@@ -11160,6 +11578,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "edv-bv.de", true },
{ "edv-kohls.de", true },
{ "edv-lehrgang.de", true },
+ { "edv-schmittner.de", true },
+ { "edvgarbe.de", true },
{ "edvmesstec.de", true },
{ "edwar.do", true },
{ "edwards.me.uk", true },
@@ -11177,7 +11597,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eelsden.net", true },
{ "eelzak.nl", true },
{ "eemcevn.com", true },
- { "eengoedenotaris.nl", true },
{ "eentweevijf.be", true },
{ "eer.io", true },
{ "eerlijktransport.nl", true },
@@ -11185,7 +11604,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eery.de", true },
{ "eesti.xyz", true },
{ "eewna.org", true },
- { "ef-georgia.org", true },
{ "ef.gy", true },
{ "efa-football.com", true },
{ "efaas.nl", true },
@@ -11200,24 +11618,39 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "effectivecoffee.com", true },
{ "effero.net", true },
{ "effex.ru", true },
+ { "effinfun.com", true },
{ "effizienta.ch", true },
{ "efflam.net", true },
+ { "effortlesshr.com", true },
{ "efg-darmstadt.de", false },
+ { "efinity.io", true },
+ { "efipsactiva.com", true },
+ { "eflorashop.be", true },
+ { "eflorashop.ch", true },
+ { "eflorashop.co.uk", true },
+ { "eflorashop.com", true },
+ { "eflorashop.de", true },
+ { "eflorashop.es", true },
+ { "eflorashop.fr", true },
+ { "eflorashop.it", true },
+ { "eflorashop.mx", true },
+ { "eflorashop.net", true },
+ { "eflorashop.us", true },
{ "efmcredentialing.org", true },
{ "eft.boutique", true },
+ { "eftelingcraft.net", true },
{ "egablo.black", true },
{ "egamespw.com", true },
{ "egami.ch", true },
{ "eganassociates.com.au", true },
+ { "egarden.it", true },
{ "egb.at", false },
- { "egbert.net", true },
{ "egeozcan.com", true },
{ "egg-ortho.ch", true },
{ "eggblast.com", true },
{ "eggert.org", false },
{ "eggplant.today", true },
{ "egiftcards.be", true },
- { "eglek.com", true },
{ "egles.eu", true },
{ "ego4u.com", true },
{ "ego4u.de", true },
@@ -11242,11 +11675,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ehmtheblueline.com", true },
{ "ehne.de", true },
{ "ehomusicgear.com", true },
+ { "ehrenburg.info", true },
{ "ehub.cz", true },
{ "ehub.hu", true },
{ "ehub.pl", true },
{ "ehub.sk", true },
{ "eichel.eu", true },
+ { "eichler.work", true },
{ "eichornenterprises.com", true },
{ "eickemeyer.nl", true },
{ "eickhof.co", true },
@@ -11259,6 +11694,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eighty-aid.com", true },
{ "eigpropertyauctions.co.uk", true },
{ "eihaikyo.com", true },
+ { "eika.as", true },
{ "eilhan.com", true },
{ "eimacs.com", true },
{ "einaros.is", true },
@@ -11270,6 +11706,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "einsatzstellenverwaltung.de", true },
{ "einser.com", true },
{ "einsteinathome.org", true },
+ { "einsteincapital.ca", true },
{ "eintageinzug.de", true },
{ "eintragsservice24.de", true },
{ "eipione.com", true },
@@ -11286,7 +11723,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ejkmuseum.nl", true },
{ "ejknet.nl", true },
{ "ejkwebdesign.nl", true },
- { "ejuicelab.co.uk", true },
{ "ek-networks.de", true },
{ "ekaigotenshoku.com", true },
{ "ekati.ru", true },
@@ -11300,6 +11736,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ekokontakt.cz", true },
{ "ekonbenefits.com", true },
{ "ekostecki.de", true },
+ { "ekostrateg.com", true },
{ "ekpyroticfrood.net", true },
{ "ekrana.info", true },
{ "eksisozluk.com", true },
@@ -11314,19 +11751,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "elainerock.com", true },
{ "elaon.de", true },
{ "elars.de", true },
- { "elarvee.xyz", true },
{ "elb500ttl.nl", true },
{ "elbetech.net", true },
- { "elbohlyart.com", true },
+ { "elblogdegoyo.mx", true },
{ "elcambiador.es", true },
{ "elcontadorsac.com", true },
{ "eldapoint.co.uk", true },
+ { "eldenelesat.com", true },
+ { "eldercare.gov", true },
+ { "elderjustice.gov", true },
+ { "elderoost.com", true },
{ "eldertons.co.uk", true },
{ "eldevo.com", true },
- { "eldietista.es", true },
{ "eldinhadzic.com", true },
- { "eldisagjapi.com", true },
- { "eldisagjapi.de", true },
{ "eldrid.ge", true },
{ "eldritchfiction.net", true },
{ "eleaut.com.br", true },
@@ -11338,12 +11775,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "electric-vault.co.uk", true },
{ "electricagoura.com", true },
{ "electricagourahills.com", true },
+ { "electrical-schools.com", true },
{ "electricalagoura.com", true },
{ "electricalagourahills.com", true },
{ "electricalcalabasas.com", true },
{ "electricalcamarillo.com", true },
{ "electricalconejovalley.com", true },
{ "electricaldosvientos.com", true },
+ { "electricalfencingbedfordview.co.za", true },
{ "electricalfencingedenvale.co.za", true },
{ "electricalhiddenhills.com", true },
{ "electricallakesherwood.com", true },
@@ -11375,6 +11814,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "electricianconejovalley.com", true },
{ "electriciandosvientos.com", true },
{ "electricianhiddenhills.com", true },
+ { "electriciankemptonpark24-7.co.za", true },
{ "electricianlakesherwood.com", true },
{ "electricianlalucia.co.za", true },
{ "electricianmalibu.com", true },
@@ -11395,6 +11835,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "electricsimivalley.com", true },
{ "electricthousandoaks.com", true },
{ "electricwestlakevillage.com", true },
+ { "electroinkoophardenberg.nl", true },
{ "electronic-ignition-system.com", true },
{ "electronicafacil.net", true },
{ "electronicfasteners.com", true },
@@ -11435,23 +11876,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "elemental.software", true },
{ "elementalsoftware.net", true },
{ "elementalsoftware.org", true },
+ { "elementarty.com", true },
{ "elementarywave.com", true },
{ "elements.guide", true },
{ "elementshop.co.uk", true },
+ { "elena-baykova.ru", false },
{ "elenatranslations.nl", true },
{ "elephants.net", true },
{ "elephpant.cz", true },
{ "elepover.com", true },
{ "elerizoentintado.es", true },
- { "elestanteliterario.com", true },
+ { "eletor.com", true },
+ { "eletor.pl", true },
{ "elettricista-roma.org", true },
{ "eleusis-zur-verschwiegenheit.de", true },
{ "elevator.ee", true },
{ "elevatoraptitudetest.com", true },
+ { "elexel.ru", true },
{ "elexprimidor.com", true },
{ "elfe.de", true },
{ "elfnon.com", true },
{ "elfring.eu", true },
+ { "elfussports.com", true },
{ "elgalponazo.com.ar", true },
{ "elglobo.com.mx", false },
{ "elgosblanc.com", false },
@@ -11461,15 +11907,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "elhossari.com", true },
{ "elia.cloud", true },
{ "elian-art.de", true },
+ { "elias-nicolas.com", true },
+ { "eliaskordelakos.com", true },
{ "elibom.com", true },
{ "elie.net", true },
+ { "elielaloum.com", true },
{ "elifesciences.org", true },
+ { "eligibilis.com", true },
{ "eligible.com", true },
{ "eligibleapi.com", true },
{ "eligrey.com", true },
{ "elijahgrey.com", true },
{ "eliminercellulite.com", true },
- { "eline168.com", true },
{ "elinevanhaaften.nl", true },
{ "elinvention.ovh", true },
{ "eliolita.com", true },
@@ -11483,6 +11932,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "elitebouncingfun.com", true },
{ "elitegameservers.net", true },
{ "elitehosting.de", false },
+ { "elitenutritionoficial.com", true },
{ "elixi.re", true },
{ "elixir.bzh", true },
{ "eliyah.co.il", true },
@@ -11507,6 +11957,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "elmermx.ch", true },
{ "elnan.do", true },
{ "elnoorandelmohanad.com", true },
+ { "elo-forum.org", true },
{ "elodieclerc.ch", true },
{ "elohellp.com", false },
{ "elonaspitze.de", true },
@@ -11514,6 +11965,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "elosrah.com", true },
{ "elosuite.com", true },
{ "eloxt.com", true },
+ { "elpado.de", true },
+ { "elpo.net", true },
{ "elpoderdelespiritu.org", true },
{ "elrinconderovica.com", true },
{ "elsagradocoran.org", true },
@@ -11527,10 +11980,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "elternbeiratswahl.online", true },
{ "elternforum-birmensdorf.ch", true },
{ "elternverein-utzenstorf.ch", true },
+ { "eltip.click", true },
+ { "eltlaw.com", true },
{ "elucron.com", true },
- { "eluft.de", true },
{ "eluhome.de", true },
{ "eluvio.com", true },
+ { "elvcino.com", true },
{ "elvidence.com.au", true },
{ "elviraszabo.com", true },
{ "elvispresley.net", true },
@@ -11553,9 +12008,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "emailtools.io", true },
{ "emaily.eu", true },
{ "emanuel.photography", true },
+ { "emanuela-gabriela.co.uk", true },
{ "emanuelduss.ch", true },
{ "emanueleanastasio.com", true },
{ "emanuelemazzotta.com", true },
+ { "emarketingmatters.com", true },
{ "embassycargo.eu", true },
{ "emberlife.com", true },
{ "embox.net", true },
@@ -11563,10 +12020,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "embroideryexpress.co.uk", true },
{ "emby.cloud", true },
{ "emcspotlight.com", true },
+ { "emdrupholm.dk", true },
{ "emecew.com", true },
{ "emeliefalk.se", true },
{ "ememsei.com", true },
- { "emeraldcbdshop.com", true },
+ { "emeraldcbdshop.com", false },
{ "emeraldcityswagger.com", true },
{ "emeraldcoastrideshare.com", true },
{ "emeraldonion.org", true },
@@ -11576,20 +12034,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "emi-air-comprime.com", true },
{ "emi.im", true },
{ "emielraaijmakers.nl", true },
+ { "emil-dein-baecker.com", true },
+ { "emil-dein-baecker.de", true },
+ { "emil-reimann.com", true },
{ "emil.click", true },
{ "emilecourriel.com", true },
{ "emiliendevos.be", true },
{ "emilong.com", true },
+ { "emilreimann.de", true },
+ { "emils-1910.de", true },
+ { "emils-chemnitz.de", true },
+ { "emils1910.de", true },
{ "emilstahl.dk", true },
{ "emilvarga.com", true },
{ "emily.moe", true },
+ { "emilyjohnson.ga", true },
{ "emirabiz.com", false },
{ "emirichardson.com", true },
+ { "emisia.com", true },
{ "emivauthey.com", true },
{ "emkanrecords.com", true },
{ "emkei.cz", true },
{ "emkrivoy.com", true },
- { "emma-o.com", true },
{ "emma.ca", true },
{ "emmababy420.com", true },
{ "emmagraystore.com", true },
@@ -11622,7 +12088,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "emresaglam.com", true },
{ "emtradingacademy.com", true },
{ "emultiagent.pl", true },
- { "emupedia.net", true },
{ "emvoice.net", true },
{ "emvoiceapp.com", true },
{ "emw3.com", true },
@@ -11648,6 +12113,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "encrypt.org.uk", true },
{ "encryptallthethings.net", true },
{ "encrypted.google.com", true },
+ { "encryptmy.site", true },
+ { "encryptmycard.com", true },
+ { "encryptmysite.net", true },
{ "encuentraprecios.es", true },
{ "encycarpedia.com", true },
{ "ende-x.com", true },
@@ -11674,7 +12142,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "energy.eu", true },
{ "energyatlas.com", true },
{ "energyaupair.se", true },
+ { "energycodes.gov", true },
{ "energydrinkblog.de", true },
+ { "energyefficientservices.com", true },
{ "energyelephant.com", true },
{ "energyled.com.br", true },
{ "energystar.gov", true },
@@ -11685,12 +12155,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "enfu.se", true },
{ "engarde.net", true },
{ "engaugetools.com", true },
+ { "engelke-optik.de", true },
{ "engelundlicht.ch", true },
{ "engelwerbung.com", true },
{ "engg.ca", true },
{ "engie-laadpalen.nl", true },
{ "engiedev.net", true },
- { "engineowning.com", true },
{ "enginepit.com", true },
{ "enginsight.com", true },
{ "enginx.net", true },
@@ -11709,8 +12179,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "enjinwallet.io", true },
{ "enjinx.io", true },
{ "enjoy-drive.com", true },
+ { "enjoy-israel.ru", true },
{ "enjoyphoneblog.it", true },
- { "enlazaresbueno.cl", true },
+ { "enlight.no", true },
{ "enlighten10x.ga", true },
{ "enlightenedhr.com", true },
{ "enlightenment.org", true },
@@ -11734,8 +12205,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ensembling.com", true },
{ "ensley.tech", true },
{ "ensons.de", true },
- { "ensured.com", true },
- { "ensured.nl", true },
{ "ensurtec.com", true },
{ "ent-london.com", true },
{ "entabe.jp", true },
@@ -11744,6 +12213,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "entercenter.ru", true },
{ "enterprisey.enterprises", true },
{ "entersoftsecurity.com", true },
+ { "entersynapse.com", false },
{ "entheogens.com", true },
{ "enthusiaformazione.com", true },
{ "entradaweb.cl", true },
@@ -11758,31 +12228,34 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "envant.co.uk", true },
{ "enveloppenopmaat.nl", true },
{ "envescent.com", true },
+ { "enviatufoto.com", true },
{ "enviaya.com.mx", true },
+ { "environmental-colleges.com", true },
{ "environmentkirklees.org", true },
{ "enviroprobasements.com", true },
{ "envirotech.com.au", true },
{ "envoie.moi", true },
- { "envoutement-desenvoutement.com", true },
{ "envoyez.moi", true },
- { "envygeeks.io", true },
{ "eocservices.co.uk", true },
{ "eoitek.com", true },
- { "eonet.cc", true },
{ "eonhive.com", true },
{ "eoonglobalresources.jp", true },
{ "eopugetsound.org", false },
+ { "eosol.de", true },
+ { "eosol.net", true },
+ { "eosol.services", true },
{ "epa.com.es", true },
{ "epassafe.com", true },
- { "epave.paris", true },
{ "epay.bg", true },
{ "epdeveloperchallenge.com", true },
{ "ephesusbreeze.com", true },
+ { "epi-lichtblick.de", true },
{ "epi.one", true },
{ "epic-vistas.com", true },
{ "epic-vistas.de", true },
{ "epicbouncycastlehirenorwich.co.uk", true },
{ "epicbouncycastles.co.uk", true },
+ { "epicdowney.com", true },
{ "epicenter.work", true },
{ "epicenter.works", true },
{ "epicentre.works", true },
@@ -11833,13 +12306,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "epreskripce.cz", true },
{ "epsilon.dk", true },
{ "epsorting.cz", true },
+ { "epspolymer.com", true },
{ "epublibre.org", true },
- { "epulsar.ru", true },
- { "epvin.com", true },
{ "epyonsuniverse.net", true },
{ "eq-serve.com", true },
{ "equalcloud.com", true },
- { "equallyy.com", true },
{ "equeim.ru", true },
{ "equidam.com", true },
{ "equinecoaching.ca", true },
@@ -11848,7 +12319,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "equipandoloja.net.br", true },
{ "equipedefrance.tv", true },
{ "equipeferramentas.com.br", true },
- { "equipsupply.com", true },
{ "equk.co.uk", true },
{ "er-mgmt.com", true },
{ "er.tl", true },
@@ -11856,10 +12326,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "erasmusplusrooms.com", true },
{ "erate.fi", true },
{ "erath.fr", true },
- { "erawanarifnugroho.com", false },
{ "erdethamburgeronsdag.no", true },
{ "ereader.uno", true },
- { "erecciontotalal100.com", true },
{ "erectiepillenwinkel.nl", true },
{ "erethon.com", true },
{ "erf-neuilly.com", true },
@@ -11882,6 +12350,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "erico.jp", true },
{ "ericoc.com", true },
{ "erics.site", true },
+ { "ericschwartzlive.com", true },
{ "ericvaughn-flam.com", true },
{ "ericwie.se", true },
{ "eridanus.uk", true },
@@ -11899,22 +12368,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "erisrenee.com", true },
{ "erixschueler.de", true },
{ "erkaelderbarenaaben.dk", true },
- { "ernaehrungsberatung-rapperswil.ch", true },
{ "ernest.ly", true },
+ { "ero.ink", true },
{ "eroma.com.au", true },
{ "eron.info", true },
{ "eroskines.com", true },
- { "eroticforce.com", true },
{ "erp-band.ru", true },
{ "erp.band", true },
{ "erpax.com", true },
{ "erpband.ru", true },
{ "erpcargo.com", false },
{ "erperium.com", true },
- { "erpiv.com", true },
{ "errietta.me", true },
{ "errlytics.com", true },
- { "error418.nl", true },
+ { "error418.nl", false },
{ "ers35.com", true },
{ "ersa-shop.com", true },
{ "ershiwo.com", true },
@@ -11922,6 +12389,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ersinerce.com", true },
{ "erstehilfeprodukte.at", true },
{ "eru.im", false },
+ { "eru.me", true },
{ "eru.moe", true },
{ "erudicia.com", true },
{ "erudicia.de", true },
@@ -11932,7 +12400,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "erudicia.se", true },
{ "erudicia.uk", true },
{ "erudikum.cz", true },
- { "ervaarjapan.nl", true },
+ { "erverydown.ml", true },
{ "erwanlepape.com", true },
{ "erwin.saarland", true },
{ "erwinpaal.nl", true },
@@ -11941,94 +12409,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "erythroxylum-coca.com", true },
{ "es-geenen.de", true },
{ "es.search.yahoo.com", false },
- { "es888.net", true },
- { "es999.net", true },
- { "es9999.net", true },
{ "esaborit.ddns.net", true },
{ "esagente.com", true },
{ "esailinggear.com", true },
{ "esalesdata.com", true },
{ "esamievalori.com", true },
{ "esample.info", true },
- { "esb-in.net", true },
- { "esb-top.com", true },
- { "esb-top.net", true },
- { "esb116.com", true },
- { "esb168168.com", true },
- { "esb168168.info", true },
- { "esb168168.net", true },
- { "esb168168.org", true },
- { "esb1688.biz", true },
- { "esb1688.com", true },
- { "esb1688.info", true },
- { "esb1688.net", true },
- { "esb1688.org", true },
- { "esb1711.com", true },
- { "esb1711.net", true },
- { "esb1788.com", true },
- { "esb1788.info", true },
- { "esb1788.net", true },
- { "esb1788.org", true },
- { "esb2013.com", true },
- { "esb2013.net", true },
- { "esb2099.com", true },
- { "esb2099.net", true },
- { "esb258.net", true },
- { "esb325.com", true },
- { "esb325.net", true },
- { "esb333.net", true },
- { "esb336.com", true },
- { "esb369.com", true },
- { "esb433.com", true },
- { "esb518.com", true },
- { "esb553.com", true },
- { "esb555.biz", true },
- { "esb555.cc", true },
- { "esb5889.com", true },
- { "esb5889.net", true },
- { "esb6.net", true },
- { "esb677.net", true },
- { "esb775.net", true },
- { "esb777.biz", true },
- { "esb777.me", true },
- { "esb777.org", true },
- { "esb886.com", true },
- { "esb888.net", true },
- { "esb8886.com", true },
- { "esb9527.com", true },
- { "esb9588.com", true },
- { "esb9588.net", true },
- { "esb9588.org", true },
- { "esb999.org", true },
- { "esba11.com", true },
- { "esba11.in", true },
- { "esball-in.com", true },
- { "esball-in.net", true },
- { "esball.bz", true },
- { "esball.cc", true },
- { "esball.me", true },
- { "esball.mx", true },
- { "esball.online", true },
- { "esball.org", true },
- { "esball.tv", true },
- { "esball.win", true },
- { "esball.ws", true },
- { "esball518.com", true },
- { "esball518.info", true },
- { "esball518.net", true },
- { "esball518.org", true },
- { "esballs.com", true },
- { "esbbon.com", true },
- { "esbbon.net", true },
- { "esbfun.com", true },
- { "esbfun.net", true },
- { "esbgood.com", true },
- { "esbin.net", true },
- { "esbjon.com", true },
- { "esbjon.net", true },
- { "esbm4.net", true },
- { "esbm5.net", true },
+ { "esb9588.info", false },
{ "esc.chat", true },
+ { "esc.gov", true },
{ "escael.org", true },
{ "escape2rooms.fr", true },
{ "escapeplaza.de", true },
@@ -12036,8 +12425,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "escargotbistro.com", true },
{ "escavador.com", true },
{ "esclear.de", true },
+ { "escolibri.com", true },
{ "escontact.ch", true },
- { "escort-byuro.net", true },
{ "escortmantra.com", true },
{ "escritoriodearte.com", false },
{ "escuelabiblica.com", true },
@@ -12047,17 +12436,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "esdiscuss.org", true },
{ "eservices-greece.com", true },
{ "esg-abi2001.de", true },
+ { "esgen.org", true },
{ "esgr.in", true },
- { "eshobe.com", true },
{ "eshop-prices.com", true },
+ { "eshspotatoes.com", true },
{ "esibun.net", true },
{ "esigmbh.de", true },
{ "esipublications.com", true },
{ "esite.ch", true },
{ "eskdale.net", true },
{ "eskriett.com", true },
- { "esmoney.cc", true },
- { "esmoney.me", true },
+ { "eslint.org", true },
{ "esoa.net", true },
{ "esoko.eu", true },
{ "esolcourses.com", true },
@@ -12072,6 +12461,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "espacetheosophie.fr", true },
{ "espacio-cultural.com", true },
{ "espanol.search.yahoo.com", false },
+ { "espanolseguros.com", true },
{ "espanova.com", true },
{ "espci.fr", true },
{ "especificosba.com.ar", true },
@@ -12079,13 +12469,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "espgg.org", true },
{ "esphigmenou.gr", true },
{ "espigol.org", true },
+ { "esport-battlefield.com", true },
{ "esports-network.de", true },
+ { "espower.com.sg", true },
{ "espritrait.com", true },
{ "esquirou-trieves.fr", true },
{ "esquisse.fr", true },
{ "esrhd.com", true },
{ "esrinfo.com", true },
{ "ess-cert.ru", true },
+ { "essayace.co.uk", true },
{ "essayads.com", true },
{ "essaybrand.com", true },
{ "essaychecker.com", true },
@@ -12094,8 +12487,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "essaynews.com", true },
{ "essaypro.net", true },
{ "essaytalk.com", true },
- { "essaywebsite.com", true },
{ "essaywriting.biz", true },
+ { "essca.fr", true },
{ "essenalablog.de", true },
{ "essencesdeprana.org", true },
{ "essex.cc", true },
@@ -12115,6 +12508,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "esterilizacion-perros.es", true },
{ "esteticanorte.com.br", true },
{ "estetista.net", true },
+ { "estherlew.is", true },
{ "esthesoleil.jp", true },
{ "estilopack-loja.com.br", true },
{ "estoic.net", true },
@@ -12123,6 +12517,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "estudiarparaser.com", true },
{ "estudiserradal.com", true },
{ "esurety.net", true },
+ { "esuretynew.azurewebsites.net", true },
{ "esw00.com", true },
{ "esw06.com", true },
{ "esw07.com", true },
@@ -12130,6 +12525,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "esw09.com", true },
{ "eswap.cz", true },
{ "et-inf.de", true },
+ { "eta.cz", true },
{ "etaes.eu", true },
{ "etajerka-spb.ru", true },
{ "etalent.net", true },
@@ -12156,7 +12552,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ethack.org", true },
{ "ethaligan.fr", true },
{ "ethan.pm", true },
- { "ethandelany.me", true },
{ "ethercalc.com", true },
{ "ethercalc.org", true },
{ "etherderbies.com", true },
@@ -12172,7 +12567,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ethosinfo.com", true },
{ "etienne.cc", true },
{ "etikus-hacker.hu", true },
- { "etincelle.ml", true },
{ "etiquetaunica.com.br", true },
{ "etoile-usedcars.com", true },
{ "etre-soi.ch", true },
@@ -12193,8 +12587,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eu-gamers.com", true },
{ "eu-stellenangebot.de", true },
{ "euaggelion.blog.br", true },
+ { "euanbarrett.com", true },
{ "euchre.us", true },
- { "eucl3d.com", true },
{ "eugenechae.com", true },
{ "eugenekay.com", true },
{ "eugenetech.org", true },
@@ -12222,27 +12616,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eurekaarchitecture.com", true },
{ "euro-servers.de", true },
{ "euroalter.com", true },
+ { "eurocars2000.es", true },
{ "eurocenterobuda.hu", true },
{ "eurocomcompany.cz", true },
+ { "euroconthr.ro", true },
+ { "eurodentaire.com", true },
+ { "euroflora.com", true },
+ { "euroflora.mobi", true },
{ "eurofrank.eu", true },
{ "eurolocarno.es", true },
- { "europapier.at", true },
- { "europapier.ba", true },
- { "europapier.bg", true },
- { "europapier.com", true },
- { "europapier.cz", true },
- { "europapier.hr", true },
{ "europapier.hu", true },
{ "europapier.net", true },
- { "europapier.rs", true },
- { "europapier.si", true },
{ "europapier.sk", true },
- { "europapier.ua", true },
{ "europarts-sd.com", true },
{ "europastudien.de", true },
{ "european-agency.org", true },
{ "europeancupinline.eu", true },
- { "europeanpreppers.com", true },
{ "europeantimberconnectors.ca", true },
{ "europeantransportmanagement.com", true },
{ "europeanwineresource.com", true },
@@ -12251,22 +12640,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "europop.com", true },
{ "eurora.de", true },
{ "euroscot.de", true },
- { "euroservice.com.gr", true },
{ "euroshop.or.at", true },
+ { "euroskano.nl", true },
{ "eurotime.ua", true },
{ "eurotramp.com", true },
{ "eurotravelstar.eu", true },
{ "eurousa.us", true },
{ "eurovision.ie", true },
+ { "euteamo.cn", true },
{ "eutotal.com", true },
{ "eutram.com", true },
- { "euvo.tk", false },
{ "euwid.de", true },
{ "ev-zertifikate.de", true },
{ "eva-select.com", true },
{ "eva.cz", true },
{ "evaartinger.de", true },
- { "evades.io", true },
{ "evadifranco.com", true },
{ "evafojtova.cz", true },
{ "evailoil.ee", true },
@@ -12278,11 +12666,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "evanfiddes.com", true },
{ "evangelicalmagazine.com", true },
{ "evangelosm.com", true },
- { "evanreev.es", true },
- { "evantage.org", true },
{ "evantageglobal.com", true },
{ "evanwang0.com", true },
{ "evapp.org", true },
+ { "evasioncreole.com", true },
{ "evasovova.cz", true },
{ "evavolfova.cz", true },
{ "eve.ac", true },
@@ -12290,22 +12677,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "evelienzorgt.nl", true },
{ "evelyndayman.com", true },
{ "evemarketer.com", true },
- { "evemodx.com", true },
{ "evenementenhoekvanholland.nl", true },
{ "evenstargames.com", true },
{ "event4fun.no", true },
{ "eventaro.com", true },
+ { "eventide.space", true },
{ "eventive.org", true },
{ "eventnexus.co.uk", true },
{ "eventosenmendoza.com.ar", true },
{ "events-hire.co.uk", true },
{ "eventtech.com", false },
+ { "evenwallet.com", true },
+ { "eveonline.com", true },
{ "ever.sale", true },
{ "everain.me", true },
+ { "everettsautorepair.com", true },
{ "everfine.com.tw", true },
{ "evergladesrestoration.gov", true },
+ { "evergreenmichigan.com", true },
{ "everhome.de", true },
- { "everitoken.io", true },
{ "everling.lu", true },
{ "everlong.org", true },
{ "evermarkstudios.com", true },
@@ -12315,6 +12705,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "everybodyhertz.co.uk", true },
{ "everyday.eu.org", true },
{ "everydaygary.com", true },
+ { "everydaywot.com", true },
{ "everydaywp.com", true },
{ "everyex.com", true },
{ "everyfad.com", true },
@@ -12322,6 +12713,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "everything-everywhere.com", true },
{ "everythingaccess.com", true },
{ "everythingstech.com", true },
+ { "everythinq.com", true },
{ "everytrycounts.gov", false },
{ "everywhere.cloud", true },
{ "eveshamglass.co.uk", true },
@@ -12333,14 +12725,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "evidencija.ba", true },
{ "evidentiasoftware.com", true },
{ "evilarmy.com", true },
- { "evilbeasts.ru", true },
{ "evilbunnyfufu.com", true },
{ "evilcult.me", true },
{ "evilized.de", true },
{ "evilmartians.com", true },
- { "evilness.nl", true },
{ "evilsite.cf", true },
- { "evilvolcanolairs.com", true },
{ "evion.nl", true },
{ "evlear.com", true },
{ "evoco.vc", true },
@@ -12352,6 +12741,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "evolutioninflatables.co.uk", true },
{ "evolutionlending.co.uk", true },
{ "evolutionpets.com", true },
+ { "evolutionsmedicalspa.com", true },
{ "evolvetechnologies.co.uk", true },
{ "evolvingthoughts.net", true },
{ "evonews.com", true },
@@ -12364,6 +12754,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "evrica.me", true },
{ "evromandie.ch", true },
{ "evstatus.com", true },
+ { "evtasima.name.tr", true },
{ "evtripping.com", true },
{ "ewaipiotr.pl", true },
{ "ewanm89.co.uk", true },
@@ -12375,7 +12766,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ewout.io", true },
{ "ewsfeed.com", true },
{ "ewtl.es", true },
- { "ewuchuan.com", true },
{ "ewus.de", true },
{ "ewycena.pl", true },
{ "ex-deli.jp", true },
@@ -12383,26 +12773,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "exactphilosophy.net", true },
{ "exagoni.com.au", true },
{ "exagoni.com.my", true },
+ { "examedge.com", true },
{ "example.sc", true },
{ "example.wf", true },
+ { "examplesu.com", true },
{ "examsmate.in", true },
{ "exaplac.com", true },
{ "exarpy.com", true },
{ "exatmiseis.net", false },
{ "exceed.global", true },
{ "exceedagency.com", true },
+ { "excelhot.com", true },
{ "exceltechdubai.com", true },
{ "exceltechoman.com", true },
{ "exceltobarcode.com", true },
{ "excentos.com", true },
{ "exceptionalservers.com", true },
+ { "excessamerica.com", true },
{ "excesssecurity.com", true },
{ "exchaser.com", true },
{ "exclusivebouncycastles.co.uk", true },
+ { "exclusivecarcare.co.uk", true },
{ "exclusivedesignz.com", true },
{ "exdamo.de", false },
{ "exe-boss.tech", true },
- { "exebouncycastles.co.uk", true },
{ "execution.biz.tr", true },
{ "exehack.net", true },
{ "exeintel.com", true },
@@ -12420,29 +12814,34 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "exit9wineandliquor.com", true },
{ "exitooutdoor.com", true },
{ "exmoe.com", true },
- { "exocen.com", true },
+ { "exnovin.co", true },
{ "exon.io", true },
{ "exoplatform.com", true },
+ { "exordiumconcepts.com", true },
{ "exoscale.ch", true },
{ "exoscale.com", true },
{ "exoten-spezialist.de", true },
{ "exousiakaidunamis.pw", true },
{ "exp.de", true },
- { "expancio.com", true },
{ "expandeco.com", true },
{ "expatmortgage.uk", true },
+ { "expe.voyage", true },
+ { "expeditiegrensland.nl", true },
{ "experienceoutdoors.org.uk", true },
{ "experienceoz.com.au", true },
+ { "experise.fr", true },
{ "expert-korovin.ru", true },
{ "expert.cz", true },
{ "experteasy.com.au", true },
+ { "expertofficefitouts.com.au", true },
{ "expertohomestaging.com", true },
- { "experts-en-gestion.fr", true },
{ "expertsverts.com", true },
+ { "expertvagabond.com", true },
{ "expertviolinteacher.com", true },
{ "expiscor.solutions", true },
{ "explodie.org", true },
{ "exploflex.com.br", true },
+ { "exploit.cz", true },
{ "exploit.party", true },
{ "exploit.ph", true },
{ "exploited.cz", true },
@@ -12462,10 +12861,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "expoort.fr", true },
{ "expoort.it", true },
{ "expopodium.com", true },
- { "exporo.de", true },
{ "exporta.cz", true },
{ "express-shina.ru", true },
{ "express-vpn.com", true },
+ { "expressemotion.net", true },
{ "expresshosting.org", true },
{ "expressmarket.ru", true },
{ "expresstinte.de", true },
@@ -12476,7 +12875,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "exside.com", true },
{ "exsora.com", true },
{ "extasic.com", true },
- { "extendwings.com", true },
{ "extensia.it", true },
{ "extensibility.biz.tr", true },
{ "extensiblewebmanifesto.org", true },
@@ -12501,6 +12899,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "exteriorlightingwestlakevillage.com", true },
{ "extintormadrid.com", true },
{ "extradesktops.com", false },
+ { "extradivers-worldwide.com", true },
{ "extranetpuc.com.br", true },
{ "extrapagetab.com", true },
{ "extreme-gaming.de", true },
@@ -12508,10 +12907,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "extreme-players.com", true },
{ "extreme-players.de", true },
{ "extreme.co.th", true },
- { "extremeservicesandrestoration.com", true },
{ "exultcosmetics.co.uk", true },
+ { "exxo.tk", true },
{ "exyplis.com", true },
- { "eyasc.nl", true },
{ "eydesignguidelines.com", true },
{ "eyeandfire.com", true },
{ "eyecandy.gr", true },
@@ -12522,7 +12920,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "eyes-berg.ch", true },
{ "eyes-berg.com", true },
{ "eyesandearsrescue.org", true },
- { "eyesonly.cc", true },
{ "eynio.com", true },
{ "eyona.com", true },
{ "eyps.net", true },
@@ -12552,14 +12949,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "f2h.io", true },
{ "f3nws.com", true },
{ "f43.me", true },
+ { "f5.hk", true },
{ "f5nu.com", true },
{ "f5w.de", true },
{ "fa-works.com", true },
+ { "fabbro-roma.org", true },
{ "faber.org.ru", true },
{ "fabian-fingerle.de", true },
+ { "fabian-klose.com", true },
+ { "fabian-klose.de", true },
+ { "fabian-klose.net", true },
{ "fabian-koeppen.de", true },
{ "fabianackle.ch", true },
- { "fabianasantiago.com", true },
{ "fabianbeiner.com", false },
{ "fabianbeiner.de", false },
{ "fabianfranke.de", true },
@@ -12572,7 +12973,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fabrica360.com", true },
{ "fabriceleroux.com", true },
{ "fabriziocavaliere.it", true },
- { "fabriziorocca.com", true },
{ "fabriziorocca.it", true },
{ "fabse.net", true },
{ "facanabota.com", true },
@@ -12582,13 +12982,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "face-mania.com", true },
{ "facealacrise.fr", true },
{ "facebook-atom.appspot.com", true },
- { "facebook.com", false },
+ { "facebook.com", true },
{ "facebydrh.com", true },
{ "facebylouise.co.uk", true },
{ "facekungfu.com", true },
{ "facepainting.gr", true },
- { "facepunch.org", true },
{ "facerepo.com", true },
+ { "faceresources.org", true },
{ "fach-journalist.de", true },
{ "fachschaftslisten.at", true },
{ "fachschaftslisten.org", true },
@@ -12606,6 +13006,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fackovec.sk", true },
{ "factbytefactbox.com", true },
{ "factcool.com", true },
+ { "factor.cc", false },
{ "factureenlinea.com", true },
{ "factuur.pro", true },
{ "factuursturen.be", true },
@@ -12619,12 +13020,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "faelix.net", true },
{ "faerb.it", true },
{ "faerie-art.com", true },
+ { "fahnamporn.com", true },
{ "fahnen-fanwelt.de", true },
{ "fahrenwal.de", true },
{ "fahrenwalde.de", true },
{ "fahrschule-laux.de", true },
{ "fahrwerk.io", true },
{ "fai.gov", true },
+ { "faidanoi.it", true },
+ { "faidatefacile.it", true },
{ "fail.coach", true },
{ "fail4free.de", true },
{ "failover.de", true },
@@ -12633,12 +13037,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fairbill.com", true },
{ "fairedeseconomies.info", true },
{ "fairgolfteams.com", true },
+ { "fairleighcrafty.com", true },
{ "fairmarketing.com", true },
{ "fairplay.im", true },
{ "fairssl.dk", true },
{ "fairssl.se", true },
{ "fairviewmotel-simcoe.com", true },
{ "fairydust.space", true },
+ { "faithcentercogop.net", true },
{ "faithgrowth.com", true },
{ "faithindemocracy.eu", false },
{ "faithleaks.org", true },
@@ -12646,18 +13052,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "faithwatch.org", true },
{ "faixaazul.com", true },
{ "fakeapple.nl", true },
+ { "fakerli.com", true },
{ "fakti.bg", true },
+ { "faktotum.tech", true },
{ "fakturi.com", true },
{ "fakturoid.cz", true },
{ "falaeapp.org", true },
+ { "falaland.com", true },
{ "falaowang.com", true },
{ "falbros.com", true },
+ { "falcona.io", true },
{ "falconfrag.com", true },
{ "falconvintners.com", true },
{ "falcoz.co", true },
{ "faldoria.de", true },
{ "falegname-roma.it", true },
{ "falkhusemann.de", true },
+ { "falldennismarketing.com", true },
{ "fallenangeldrinks.co.uk", true },
{ "fallenangeldrinks.com", true },
{ "fallenangelspirits.co.uk", true },
@@ -12665,12 +13076,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fallenmystic.com", true },
{ "fallenspirits.co.uk", true },
{ "falling.se", true },
+ { "fallofthecitadel.com", true },
{ "falsum.net", true },
{ "fam-kreibich.de", true },
{ "fam-stemmer.de", false },
{ "fam-weyer.de", true },
{ "famcloud.de", true },
- { "famdouma.nl", true },
{ "fameng.nl", true },
{ "fameus.fr", true },
{ "famfi.co", true },
@@ -12683,9 +13094,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "familie-remke.de", true },
{ "familiegrottendieck.de", true },
{ "familieholme.de", true },
+ { "familiekiekjes.nl", true },
{ "familjenfrodlund.se", true },
{ "familjenm.se", true },
- { "familletouret.fr", true },
{ "familylawhotline.org", true },
{ "familyparties.co.uk", true },
{ "familyreal.ru", true },
@@ -12696,27 +13107,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "famoushostels.com", true },
{ "famvangelder.nl", true },
{ "famvsomeren.nl", true },
+ { "fan.gov", true },
{ "fanactu.com", true },
{ "fanatical.com", true },
{ "fanboi.ch", true },
+ { "fancy-bridge.com", true },
{ "fancy.org.uk", true },
+ { "fancygaming.dk", true },
{ "fander.it", true },
{ "fandler.cz", true },
{ "fandomservices.com", true },
{ "fanfareunion.ch", true },
{ "fangs.ink", true },
- { "fanhouwan.com", true },
{ "fanjoe.be", true },
{ "fansided.com", true },
{ "fantasiapainter.com", true },
+ { "fantasiatravel.hr", true },
{ "fantasticcleaners.com.au", true },
{ "fantastichandymanmelbourne.com.au", true },
{ "fantastici.de", true },
{ "fantasticservices.com", true },
+ { "fantasticservicesgroup.com.au", true },
{ "fantasycastles.co.uk", true },
{ "fantasyescortsbirmingham.co.uk", true },
{ "fantasypartyhire.com.au", true },
- { "fantasyprojections.com", true },
{ "fantasyspectrum.com", true },
{ "fantopia.club", true },
{ "fanvoice.com", true },
@@ -12725,7 +13139,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fanyue123.tk", true },
{ "fanz.pro", true },
{ "fanzlive.com", true },
- { "fap.no", true },
{ "faq.ie", true },
{ "fara.gov", true },
{ "faradji.nu", true },
@@ -12752,11 +13165,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "farrelf.blog", true },
{ "farsil.eu", true },
{ "fart.wtf", true },
+ { "farthing.xyz", true },
{ "farwat.ru", true },
{ "faschingmd.com", true },
+ { "fascia.fit", true },
{ "fashion-stoff.de", true },
{ "fashion24.de", true },
{ "fashion4ever.pl", true },
+ { "fashionhijabers.com", true },
{ "fashionunited.be", true },
{ "fashionunited.cl", true },
{ "fashionunited.com", true },
@@ -12783,7 +13199,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fastblit.com", true },
{ "fastcash.com.br", true },
{ "fastcommerce.org", true },
- { "fastconfirm.com", true },
{ "fastcp.top", true },
{ "fastest-hosting.co.uk", true },
{ "fastforwardsociety.nl", true },
@@ -12794,19 +13209,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fastpresence.com", true },
{ "fastrevision.com", true },
{ "fastvistorias.com.br", true },
- { "fastwebsites.com.br", true },
{ "faszienrollen-info.de", false },
{ "fateandirony.com", true },
{ "fatecdevday.com.br", true },
{ "fatedata.com", true },
+ { "fateitalia.it", true },
{ "fathers4equalrights.org", true },
{ "fatidique.com", true },
{ "fatimamoldes.com.br", true },
{ "fatmixx.com", true },
{ "fatowltees.com", true },
- { "fattorino.it", true },
{ "faucetbox.com", false },
- { "faui2k17.de", true },
+ { "faui2k17.de", false },
+ { "faultlines.org", true },
{ "faulty.equipment", true },
{ "fauvettes.be", true },
{ "favirei.com", true },
@@ -12817,6 +13232,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fb.me", true },
{ "fbcdn.net", true },
{ "fbcopy.com", true },
+ { "fbi.gov", true },
{ "fbigame.com", true },
{ "fbiic.gov", true },
{ "fbijobs.gov", true },
@@ -12827,25 +13243,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fc.media", true },
{ "fca-tools.com", true },
{ "fcburk.de", true },
+ { "fccarbon.com", true },
+ { "fcdn.nl", true },
{ "fcforum.net", true },
{ "fcingolstadt.de", true },
- { "fcitasc.com", true },
{ "fckd.net", true },
{ "fcosinus.com", true },
{ "fcprovadia.com", true },
{ "fcsic.gov", true },
+ { "fdalawboston.com", true },
+ { "fdaregs.com", true },
{ "fdevs.ch", true },
{ "fdicig.gov", true },
{ "fdicoig.gov", true },
{ "fdlibre.eu", true },
- { "fdm.ro", true },
{ "fdms.gov", true },
{ "fdn.one", true },
{ "fdos.me", true },
{ "fdp-brig-glis.ch", true },
+ { "fdresearch.ca", true },
{ "fdsys.gov", false },
{ "feac.us", true },
{ "feaden.me", true },
+ { "feandc.com", true },
{ "fearby.com", true },
{ "fearghus.org", true },
{ "fearsomegaming.com", true },
@@ -12880,13 +13300,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "feedkovacs.hu", true },
{ "feedough.com", true },
{ "feedthefuture.gov", true },
+ { "feeeei.com", true },
{ "feel-events.com", true },
{ "feel.aero", true },
+ { "feelgood-workouts.de", true },
{ "feelmom.com", true },
{ "feeltennis.net", true },
{ "feen.us", true },
{ "feepod.com", true },
- { "feeriedesign-event.com", true },
{ "feetpa.ws", true },
{ "feezmodo.com", false },
{ "fefelovalex.ru", true },
@@ -12909,16 +13330,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "feld.saarland", true },
{ "feldhousen.com", true },
{ "felett.es", true },
- { "felgitscher.xyz", true },
+ { "feli.games", true },
{ "felicifia.org", true },
{ "felinepc.com", true },
{ "felisslovakia.sk", true },
{ "felistirnavia.sk", true },
+ { "felixaufreisen.de", true },
{ "felixbarta.de", true },
{ "felixcrux.com", true },
{ "felixgenicio.com", true },
{ "felixkauer.de", true },
- { "felixqu.com", true },
{ "felixsanz.com", true },
{ "felixseele.de", true },
{ "felsing.net", true },
@@ -12930,8 +13351,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "femradio.es", true },
{ "femtomind.com", true },
{ "fence-stlouis.com", true },
- { "feng-in.com", true },
- { "feng-in.net", true },
+ { "feng-hhcm.com", true },
{ "feng.si", true },
{ "fengyi.tel", true },
{ "fenster-bank.at", true },
@@ -12945,6 +13365,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ferienhaeuser-krummin.de", true },
{ "ferienhaus-polchow-ruegen.de", false },
{ "ferienhausprovence.ch", true },
+ { "ferienwohnung-hafeninsel-stralsund.de", true },
{ "feriespotter.dk", true },
{ "ferm-rotterdam.nl", true },
{ "fermabel.com.br", true },
@@ -12965,11 +13386,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "festaprylar.se", true },
{ "festival-tipps.com", true },
{ "festivaljapon.com", true },
+ { "festx.co.za", true },
{ "fettlaus.de", true },
{ "feudalisten.de", true },
{ "feuerhuhn.de", true },
{ "feuerloescher-arten.de", true },
{ "feuerloescher-test.de", true },
+ { "feuerwehr-coesfeld.de", true },
+ { "feuerwehr-gebirge.de", true },
{ "feuerwehr-heiligenberg.de", true },
{ "feuerwehr-illmensee.de", true },
{ "feuerwehr-mehring.de", true },
@@ -12978,6 +13402,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "feuerwehr-vechta.de", true },
{ "feuerwerksmanufaktur.de", true },
{ "feuetgloire.com", true },
+ { "fewo-hafeninsel-stralsund.de", true },
{ "fewo-thueringer-wald.de", true },
{ "fexco.com", true },
{ "feybiblia.com", true },
@@ -13015,6 +13440,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fiasgo.dk", true },
{ "fiasgo.i.ng", true },
{ "fibo-forex.org", true },
+ { "fibra.click", true },
{ "fibretv.co.nz", true },
{ "fibretv.tv", true },
{ "fichier-pdf.fr", true },
@@ -13030,7 +13456,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fiduciaire-ratio.ch", true },
{ "fieldwork-paysage.com", true },
{ "fierlafijn.net", true },
+ { "fierscleaning.nl", true },
{ "fiery.me", true },
+ { "fifautstore.com", true },
{ "fifei.de", true },
{ "fifichachnil.paris", true },
{ "fifieldtech.com", true },
@@ -13050,13 +13478,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fijnefeestdageneneengelukkignieuwjaar.nl", true },
{ "fijnewoensdag.nl", true },
{ "fiken.no", true },
+ { "fikst.com", true },
{ "fil.fi", true },
- { "filamentia.nl", true },
{ "filanthropystar.org", true },
{ "file-cloud.eu", true },
{ "file-pdf.it", true },
{ "filecopa.com", true },
{ "files.from-me.org", true },
+ { "fileservicios.com.ar", true },
{ "filestar.io", true },
{ "filestartest.io", true },
{ "filetransfer.one", true },
@@ -13069,11 +13498,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "filiio.com", true },
{ "filingsmadeeasy.com", true },
{ "filiosoft.cloud", true },
- { "filip-prochazka.com", true },
+ { "filip-prochazka.com", false },
{ "filippo.io", true },
{ "filipsebesta.com", true },
{ "filleritemsindia.com", true },
{ "fillo.sk", true },
+ { "film-colleges.com", true },
{ "film-storyboards.com", true },
{ "film-storyboards.fr", true },
{ "film-tutorial.com", true },
@@ -13087,13 +13517,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "filmsphoto.com", true },
{ "filoo.de", true },
{ "filstop.com", true },
- { "filterflasche-kaufen.de", true },
{ "filterlists.com", true },
{ "filtr.me", true },
{ "fimsquad.com", true },
{ "final-expense-quotes.com", true },
+ { "finalprice.net", true },
{ "finalrewind.org", true },
{ "finalx.nl", true },
+ { "finance-colleges.com", true },
{ "financejobs.ch", true },
{ "financepark.ch", true },
{ "financniexperti.sk", true },
@@ -13104,7 +13535,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "finchnest.co.uk", true },
{ "find-job-in.com", true },
{ "find-mba.com", true },
- { "find-your-happy-place.de", true },
{ "findapinball.com", true },
{ "findcarspecs.com", true },
{ "findhoustonseniorcare.com", true },
@@ -13123,23 +13553,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "finefriends.nl", true },
{ "finelovedolls.com", true },
{ "finenet.com.tw", true },
+ { "finevegashomes.com", true },
{ "finfev.de", true },
{ "finflix.net", true },
{ "finform.ch", true },
{ "fini-de-jouer.ch", true },
{ "finisron.in", true },
{ "finkelstein.fr", true },
- { "finkenberger.org", false },
+ { "finkmartin.com", true },
+ { "finn.io", true },
{ "finnclass.cz", true },
{ "finnwea.com", true },
{ "finpt.com", false },
{ "finsprings.org", true },
+ { "fintandunleavy.com", false },
{ "fintechnics.com", false },
{ "fintry.ca", true },
{ "finvantage.com", true },
+ { "finwe.info", true },
{ "fionamcbride.com", true },
+ { "fioristionline.it", true },
+ { "fioristionline.net", true },
{ "fioulmarket.fr", true },
{ "fir3net.com", true },
+ { "fire-schools.com", true },
{ "firebaseio.com", true },
{ "firebirdrangecookers.com", true },
{ "firebounty.com", true },
@@ -13153,8 +13590,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "firefly-iii.org", true },
{ "firegoby.jp", true },
{ "firegore.com", true },
+ { "firekoi.com", true },
{ "fireleadership.gov", true },
{ "firemudfm.com", true },
+ { "firenza.org", true },
{ "fireplex.co.uk", true },
{ "fireportal.cz", true },
{ "fireportal.sk", true },
@@ -13167,7 +13606,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fireworksshowvr.com", true },
{ "firma-cerny.cz", true },
{ "firma-offshore.com", true },
- { "firmale.com", true },
+ { "firmament.space", true },
{ "firmapi.com", true },
{ "firmen-assekuranz.de", true },
{ "firmenwerbung-vermarktung.de", true },
@@ -13188,6 +13627,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "firstq.xyz", true },
{ "fischer-its.com", false },
{ "fischer-kundendienst.de", true },
+ { "fischers.cc", true },
{ "fischers.it", true },
{ "fischers.srv.br", true },
{ "fise.cz", true },
@@ -13199,6 +13639,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fishermansbendcorporation.com.au", true },
{ "fishermansbendtownhouses.com.au", true },
{ "fishexport.eu", true },
+ { "fishfinders.info", true },
{ "fishgen.no", true },
{ "fishserver.net", true },
{ "fishtacos.blog", true },
@@ -13208,13 +13649,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fit-mit-system.eu", true },
{ "fit365.jp", true },
{ "fitchannel.com", true },
- { "fitfitup.com", true },
{ "fitinclass.com", true },
{ "fitkram.cz", true },
{ "fitmeat.at", true },
{ "fitness-challenge.co.uk", true },
{ "fitness.gov", true },
- { "fitseven.ru", true },
{ "fittelo.cz", true },
{ "fitzsim.org", true },
{ "fiuxy.bz", true },
@@ -13223,13 +13662,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fiveboosts.xyz", true },
{ "fivethirtyeight.com", true },
{ "fixatom.com", true },
+ { "fixed.supply", true },
{ "fixel.express", true },
{ "fixforce.nl", true },
{ "fixhotsauce.com", true },
{ "fixingscrews.co.uk", true },
{ "fixitfelix.us", true },
+ { "fixlasvegas.com", true },
{ "fixmyalarmpanel.co.uk", true },
{ "fixmycomputerdude.com", true },
+ { "fiziktedavi.name.tr", true },
+ { "fizyoterapi.name.tr", true },
{ "fizz.buzz", false },
{ "fj.search.yahoo.com", false },
{ "fj.simple.com", false },
@@ -13239,7 +13682,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fkcdn.de", true },
{ "fkfev.de", true },
{ "fktpm.ru", true },
- { "flacandmp3.ml", true },
{ "flaemig42.de", false },
{ "flagburningworld.com", true },
{ "flagfox.net", true },
@@ -13255,25 +13697,31 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "flanga.org", true },
{ "flangaapis.com", true },
{ "flapoverspeed.com", true },
+ { "flare.cloud", true },
{ "flashback.org", true },
{ "flashbeing.com", true },
{ "flashcomp.cz", true },
{ "flashgot.net", true },
{ "flat.io", true },
- { "flatlandchurch.com", true },
{ "flatmail.pl", true },
{ "flatmatehub.com.au", true },
{ "flatpackmates.co.uk", true },
{ "flauschig.net", true },
+ { "flavinus.fr", true },
{ "flaviu.co.uk", true },
{ "flavo.io", true },
{ "flavr.be", true },
{ "flawlesscowboy.xyz", true },
- { "fleamarketgoods.com", true },
{ "fleep.io", true },
+ { "fleetcor.at", true },
{ "fleetcor.ch", true },
{ "fleetcor.cz", true },
+ { "fleetcor.de", true },
+ { "fleetcor.fr", true },
+ { "fleetcor.hu", true },
{ "fleetcor.lu", true },
+ { "fleetcor.nl", true },
+ { "fleetcor.pl", true },
{ "fleetcor.sk", true },
{ "fleetcorcards.be", true },
{ "fleetsmith.com", true },
@@ -13298,6 +13746,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "flextribly.xyz", true },
{ "fliacuello.com.ar", true },
{ "flickcritter.com", true },
+ { "fliesen-waldschmidt.de", true },
{ "flight.school", true },
{ "flightdeckfriend.com", true },
{ "flightmedx.com", true },
@@ -13310,16 +13759,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fliino.net", true },
{ "fliino.org", true },
{ "fliio.com", true },
- { "flikmsg.co", true },
{ "flinch.io", true },
{ "flip.kim", true },
{ "flipneus.net", true },
{ "fliptable.org", true },
{ "flirt-norden.de", true },
- { "flirtee.net", true },
+ { "flirtee.net", false },
{ "flirtfaces.de", true },
{ "flirtos.de", true },
- { "flirtycourts.com", true },
+ { "flixhaven.net", true },
{ "flixports.com", true },
{ "flmortgagebank.com", true },
{ "floatationlocations.com", true },
@@ -13330,18 +13778,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "floffi.media", true },
{ "floj.tech", true },
{ "flokinet.is", true },
+ { "floless.co.uk", true },
{ "flomeyer.de", true },
{ "floobits.com", true },
- { "flood.io", true },
+ { "floogulinc.com", true },
{ "floorballpoint.cz", true },
{ "flooringnightmares.com", true },
{ "flooringsourcetx.com", true },
{ "floors4lessbay.com", true },
{ "floort.net", false },
+ { "floraclick.net", true },
+ { "floraexpress.it", true },
{ "florence.uk.net", true },
{ "florenceapp.co.uk", true },
{ "florent-tatard.fr", true },
{ "florentynadawn.co.uk", true },
+ { "floresvilleedc.org", true },
{ "florian-bachelet.fr", true },
{ "florian-thie.de", true },
{ "florian2833z.de", true },
@@ -13352,7 +13804,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "floriantanner.ch", true },
{ "floridafabrication.net", true },
{ "floridafieros.org", true },
+ { "floridagulfbeachrealty.com", true },
{ "floridahomesinvest.com", true },
+ { "floridasexhealth.com", true },
{ "florinlungu.it", true },
{ "florismoo.nl", true },
{ "florismouwen.com", false },
@@ -13380,6 +13834,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "flunschi.goip.de", true },
{ "fluoxetine.net", true },
{ "flurp.de", false },
+ { "flushlife.com", true },
{ "fluteandpianoteaching.co.uk", true },
{ "flux.by", true },
{ "flux.healthcare", true },
@@ -13403,25 +13858,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "flynn.io", true },
{ "flyserver.co.il", true },
{ "flyshe.co.uk", true },
- { "flyspace.ml", true },
{ "flyssh.net", true },
{ "flyswoop.com", true },
{ "flyt.online", true },
{ "flytoadventures.com", true },
{ "fm-cdn.de", true },
{ "fm.ie", true },
- { "fmapplication.com", true },
{ "fmarchal.fr", true },
{ "fmc.gov", true },
{ "fmdance.cl", true },
- { "fmi.gov", true },
{ "fminsight.net", true },
{ "fmodoux.biz", true },
- { "fmovies.fyi", true },
{ "fmussatmd.com", true },
{ "fnanen.net", true },
{ "fnb-griffinonline.com", true },
{ "fnbnokomis.com", true },
+ { "fnh-expert.net", true },
{ "fnkr.net", true },
{ "fnof.ch", true },
{ "fnordserver.eu", true },
@@ -13438,6 +13890,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fogway.net", true },
{ "foia.gov", true },
{ "foiaonline.gov", true },
+ { "foixet.com", true },
{ "fojing.com", true },
{ "fojt.cz", true },
{ "fojtova.cz", true },
@@ -13445,11 +13898,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fokan.be", true },
{ "fokep.no", true },
{ "fokkusu.fi", true },
- { "fol.tf", true },
{ "folio.no", true },
{ "foljeton.dk", true },
{ "folk.as", true },
- { "folkfests.org", true },
{ "follandviolins.com", true },
{ "followback.net", true },
{ "follower98.ir", true },
@@ -13493,6 +13944,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "foot.fr", true },
{ "footagecrate.com", true },
{ "footballforum.de", true },
+ { "footloose.co.uk", true },
{ "for.care", true },
{ "foray-jero.me", true },
{ "forbusiness.ca", true },
@@ -13508,6 +13960,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "foreclosureattorneyhouston.com", true },
{ "forefrontcloud.com", true },
{ "foregroundweb.com", true },
+ { "foreign-language-colleges.com", true },
{ "forellenpark.com", true },
{ "forento.be", true },
{ "foresdon.jp", true },
@@ -13524,11 +13977,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "forextimes.ru", false },
{ "forfunssake.co.uk", true },
{ "forge-goerger.eu", true },
- { "forglemmigej.net", true },
{ "forgotten-legends.org", true },
+ { "form3w.nl", true },
{ "forman.store", true },
{ "formapi.io", true },
- { "formasdemaquillarse.com", true },
{ "format-paysage.ch", true },
{ "formation-assureur.com", true },
{ "formation-mac.ch", true },
@@ -13536,8 +13988,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "formbetter.com", true },
{ "formini.dz", true },
{ "formkiq.com", true },
+ { "formsbyair.com", true },
{ "formula-ot.ru", true },
{ "formulacionquimica.com", true },
+ { "formulastudent.de", true },
{ "fornoreason.net.au", true },
{ "fornwall.net", true },
{ "foro.io", false },
@@ -13546,7 +14000,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "forologikidilosi.com.gr", true },
{ "forourselves.com", true },
{ "forpc.us", true },
- { "forquilhinhanoticias.com.br", true },
{ "forrestheller.com", true },
{ "forro.info", true },
{ "forsakringsarkivet.se", true },
@@ -13557,11 +14010,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fort.eu", true },
{ "forteggz.nl", true },
{ "fortesanshop.it", true },
+ { "fortknox.cz", true },
{ "fortnine.ca", true },
{ "fortnitemagic.ga", true },
{ "fortran.io", true },
{ "fortress.no", true },
{ "fortress.sk", true },
+ { "fortuna-apotheke-lahnstein.de", true },
{ "forty-two.nl", true },
{ "forty8creates.com", true },
{ "fortytwo.cloud", true },
@@ -13580,7 +14035,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "foscamcanada.com", true },
{ "fosdem.org", true },
{ "foshanshequ.com", false },
- { "fossguard.com", true },
+ { "fossforward.com", true },
{ "fossilfreeyale.org", true },
{ "fotella.com", true },
{ "fotikpro.ru", true },
@@ -13592,6 +14047,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "foto-robitsch.at", true },
{ "foto-roma.ru", true },
{ "foto.by", true },
+ { "fotoboxvysocina.cz", true },
+ { "fotofaerie.net", true },
{ "fotoflits.net", true },
{ "fotografechristha.nl", true },
{ "fotografiadellalucerossa.com", true },
@@ -13599,6 +14056,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fotokomorkomania.pl", true },
{ "fotoleitner.com", true },
{ "fotoleitner.de", true },
+ { "fotonjan.com", true },
{ "fotostudio-leitner.com", true },
{ "fotostudio-leitner.de", true },
{ "fotostudio-schweiz.ch", true },
@@ -13606,6 +14064,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fotowolfy.com", true },
{ "fougner.co", true },
{ "found.website", true },
+ { "foundationrepairnebraska.com", true },
{ "foundationspecialisteast.com", true },
{ "foundationspecialistmi.com", true },
{ "foundchurch.co.uk", true },
@@ -13627,12 +14086,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "foxontheinter.net", true },
{ "foxphotography.ch", true },
{ "foxquill.com", true },
- { "foxterrier.com.br", true },
- { "fpc.gov", true },
+ { "foxstreetcomms.co.za", true },
+ { "fpaci.org", true },
+ { "fpc.gov", false },
{ "fpersona.com", true },
{ "fpgradosuperior.com", true },
+ { "fpsclasico.de", true },
+ { "fpt-technojapan.com", true },
+ { "fptravelling.com", false },
{ "fpvr.org", true },
{ "fpy.cz", true },
+ { "fq.mk", true },
{ "fr.search.yahoo.com", false },
{ "fr33tux.org", true },
{ "frack.nl", true },
@@ -13647,17 +14111,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "framezdakkapellen.nl", true },
{ "fran.cr", true },
{ "francescopalazzo.com", true },
+ { "francescopandolfibalbi.it", true },
+ { "francescoservida.ch", true },
{ "francetraceur.fr", true },
{ "franchini.email", true },
{ "franchini.engineer", true },
{ "francis.tokyo", true },
{ "francisli.net", false },
- { "franckgirard.net", true },
{ "franckyz.com", true },
{ "francois-gaillard.fr", true },
{ "francois-occasions.be", true },
+ { "francoisbelangerboisclair.com", true },
{ "francoiscarrier.com", true },
{ "francoise-paviot.com", true },
+ { "francoisharvey.ca", true },
{ "francoislepage.com", true },
{ "francoz.me", true },
{ "frandor.co.uk", true },
@@ -13672,13 +14139,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "frankierprofi.de", true },
{ "frankierstar.de", true },
{ "frankinteriordesign.co.uk", true },
+ { "frankl.in", true },
{ "frankmorrow.com", true },
{ "frankopol-sklep.pl", true },
{ "franksiler.com", true },
{ "frankslaughterinsurance.com", true },
{ "frankwei.xyz", true },
{ "frankyan.com", true },
- { "fransallen.com", true },
{ "frantic1048.com", true },
{ "frantorregrosa.me", true },
{ "franz-vatter.de", true },
@@ -13687,6 +14154,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "franzknoll.de", true },
{ "frappant.cc", true },
{ "fraselab.ru", true },
+ { "frasesconemocion.com", true },
{ "frasesdodia.com", true },
{ "frasesparaface.com.br", true },
{ "frasesytarjetas.com", true },
@@ -13701,10 +14169,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fraurichter.net", true },
{ "fraye.net", true },
{ "frbracch.it", true },
+ { "frccsgo.tk", true },
{ "frdl.ch", true },
{ "freaksites.dk", true },
{ "freaksports.com.au", true },
- { "freakyawesome.com", true },
{ "frebi.org", true },
{ "frebib.co.uk", true },
{ "frebib.com", true },
@@ -13718,6 +14186,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "frederikvig.com", true },
{ "fredloya.com", true },
{ "fredriksslekt.se", true },
+ { "fredtec.ru", true },
{ "fredvoyage.fr", true },
{ "free-your-pc.com", true },
{ "free.ac.cn", true },
@@ -13726,9 +14195,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "freebarrettbrown.org", true },
{ "freebcard.com", true },
{ "freebetoffers.co.uk", true },
+ { "freebies.id", true },
{ "freebookmakersbetsandbonuses.com.au", true },
{ "freeboson.org", true },
+ { "freecam2cam.site", true },
{ "freecloud.at", true },
+ { "freecookies.nl", true },
{ "freedev.cz", true },
{ "freedom.nl", true },
{ "freedom.press", true },
@@ -13759,7 +14231,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "freelifer.jp", true },
{ "freelo.cz", true },
{ "freelysurf.cf", true },
+ { "freemanlogistics.com", true },
{ "freemans.com", true },
+ { "freemomhugs.org", true },
{ "freemyipod.org", true },
{ "freend.me", false },
{ "freenetproject.org", true },
@@ -13776,6 +14250,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "freesoft-board.to", true },
{ "freesoftlab.com", true },
{ "freesolitaire.win", true },
+ { "freessl.tech", true },
+ { "freesslcertificate.me", true },
{ "freethetv.ie", true },
{ "freetsa.org", true },
{ "freevps.us", false },
@@ -13798,6 +14274,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "freitasul.com.br", true },
{ "freitasul.io", true },
{ "freiwurst.net", true },
+ { "freizeitbad-riff.de", true },
{ "freizeitplaza.de", true },
{ "frejasdal.dk", true },
{ "frenchcreekcog.org", true },
@@ -13813,40 +14290,41 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "freshislandfish.com", true },
{ "freshkiss.com.au", true },
{ "freshmaza.com", true },
- { "freshmaza.io", true },
{ "freshmaza.net", true },
{ "fretscha.com", true },
- { "frettboard.com", true },
{ "frettirnar.is", true },
{ "fretworksec.com", true },
+ { "freundinnen-ausflug.de", true },
+ { "freundinnen-kurzurlaub.de", true },
+ { "freundinnen-urlaub.de", true },
{ "friarsonbase.com", true },
{ "fribourgviking.net", true },
{ "frickelmeister.de", true },
{ "fridayfoucoud.ma", true },
{ "fridolinka.cz", true },
+ { "friedenauer-herbstfest.de", true },
{ "friederes.lu", true },
{ "friederloch.de", true },
{ "friedrich-foto-art.de", true },
{ "friedsamphotography.com", true },
{ "friendlysiberia.com", true },
+ { "friendowment.us", true },
{ "friends-of-naz.com", true },
{ "friends-socialgroup.org", true },
{ "friends24.cz", true },
- { "friendship-quotes.co.uk", true },
{ "friendshipismagicsquad.com", true },
+ { "friendsofgfwpc.org", true },
{ "frieslandrail.nl", true },
{ "friet.org", true },
{ "frietbesteld.nl", true },
{ "friezy.ru", true },
{ "frigi.ch", true },
{ "frigolit.net", true },
- { "friller.com.au", true },
{ "frillip.com", true },
{ "frinkiac.com", true },
{ "frino.de", true },
{ "frippz.se", true },
{ "friss.com", true },
- { "fritteli.ch", true },
{ "fritzrepair.com", true },
{ "frizo.com", true },
{ "frly.de", true },
@@ -13858,20 +14336,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "frogsonamission.de", true },
{ "froh.co.jp", true },
{ "frolova.org", true },
+ { "from-the-net.com", true },
{ "fromscratch.rocks", true },
{ "fromthesoutherncross.com", true },
{ "fronteers.nl", false },
{ "frontier-ad.co.jp", true },
+ { "frontier.bet", true },
{ "frontierdiscount.com", true },
{ "frontiers.nl", true },
+ { "frontlinemessenger.com", true },
{ "fropky.com", true },
{ "frostprotection.co.uk", true },
{ "frostwarning.com", true },
{ "frostysummers.com", true },
{ "frothy.coffee", true },
{ "froufe.com", true },
+ { "frownonline.co.uk", true },
{ "frozen-geek.net", true },
{ "frozen-solid.net", true },
+ { "frozenjam.com", true },
+ { "frpg.gov", true },
{ "frprn.es", true },
{ "frtn.com", true },
{ "frtr.gov", true },
@@ -13879,7 +14363,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fruchthof24.de", true },
{ "fruchtikus.net", true },
{ "fruend-hausgeraeteshop.de", true },
- { "frugal-millennial.com", true },
{ "frugalfamilyhome.com", true },
{ "frugalmechanic.com", true },
{ "frugro.be", true },
@@ -13889,6 +14372,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "frydrychit.cz", true },
{ "fs-community.nl", true },
{ "fs-fitness.eu", true },
+ { "fs-g.org", true },
{ "fs-maistadt.de", true },
{ "fs257.com", true },
{ "fsapubs.gov", false },
@@ -13902,7 +14386,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fscott.de", true },
{ "fsdress.com", true },
{ "fsfxpackages.com", true },
+ { "fsg.one", true },
{ "fsj4u.ch", true },
+ { "fsk.fo", true },
{ "fsky.info", true },
{ "fsm2016.org", true },
{ "fsps.ch", true },
@@ -13918,8 +14404,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ftptest.net", true },
{ "ftrsecure.com", true },
{ "ftv.re", true },
- { "fu-li88.com", true },
- { "fu-li88.net", true },
+ { "fu639.top", true },
+ { "fu898.top", true },
{ "fuantaishenhaimuli.net", true },
{ "fuck-your-false-positive.de", true },
{ "fuckav.ru", true },
@@ -13933,16 +14419,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fuckup.dk", true },
{ "fuckyoupaypal.me", true },
{ "fuechschen.org", true },
+ { "fuelfirebrand.com", true },
{ "fuelingyourdreams.com", true },
{ "fuerstenfelder-immobilien.de", true },
{ "fugamo.de", true },
{ "fuglede.dk", true },
{ "fuite.ch", true },
{ "fuites.ch", true },
- { "fujiorganics.com", false },
{ "fujiwaraqol.com", true },
{ "fukakukeiba.com", true },
{ "fukikaeru.com", true },
+ { "fukuiedu.com", true },
{ "fukushimacoffee.com", true },
{ "fulfilmentcrowd.com", true },
{ "fulgenzis.com", true },
@@ -13957,6 +14444,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fullhost.com", true },
{ "fullhub.ru", true },
{ "fullmatch.net", true },
+ { "fullstack.love", true },
{ "fullstacknotes.com", true },
{ "fumblers.ca", true },
{ "fumerolles.ch", true },
@@ -13965,6 +14453,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fun-tasia.co.uk", true },
{ "fun4kidzbouncycastles.co.uk", true },
{ "fun4ubouncycastles.co.uk", true },
+ { "fun888city.com", true },
+ { "fun888city.net", true },
{ "funadvisor.ca", true },
{ "funadvisorfrance.com", true },
{ "funandbounce.com", true },
@@ -13995,6 +14485,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "funniestclip.com", true },
{ "funnybikini.com", true },
{ "funoverip.net", true },
+ { "funsochi.ru", true },
{ "funspins.com", true },
{ "funtasticinflatablesdurham.co.uk", true },
{ "funtime-inflatables.co.uk", true },
@@ -14003,9 +14494,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "funtimeentertainment.co.uk", true },
{ "funtimesbouncycastles.co.uk", true },
{ "fur.red", true },
+ { "furaje-iasi.com", true },
{ "furcity.me", true },
{ "furgo.love", true },
{ "furigana.info", true },
+ { "furikake.xyz", true },
{ "furkancaliskan.com", true },
{ "furkot.com", true },
{ "furkot.de", true },
@@ -14016,22 +14509,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "furlan.co", true },
{ "furlog.it", true },
{ "furnfurs.com", true },
- { "furnitureconcept.co.uk", true },
{ "furry.cat", true },
{ "furry.dk", true },
- { "furry.zone", false },
{ "furrybot.me", true },
{ "furrytech.network", true },
{ "furrytf.club", true },
{ "furryyiff.site", true },
- { "fursuitbutts.com", true },
- { "furtherfood.com", true },
+ { "fusa-miyamoto.jp", true },
{ "fuselight.nl", true },
{ "fuseos.net", true },
{ "fushee.com", true },
{ "fusiongaming.de", true },
{ "fussball-xxl.de", true },
- { "futa.agency", true },
+ { "futaba-works.com", true },
{ "futagro.com", true },
{ "futbomb.com", true },
{ "futcre.com", true },
@@ -14069,7 +14559,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fxopen.ru", true },
{ "fxp.co.il", true },
{ "fxseo.com.au", true },
- { "fxtalk.cn", true },
{ "fxthai.com", true },
{ "fxtrade-lab.com", true },
{ "fxweb.co", true },
@@ -14089,11 +14578,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "fysiotherapienieuwveen.nl", true },
{ "fysiovdberg.nl", true },
{ "fysuite.com", true },
+ { "fytcart.com", true },
{ "fzbrweb.cz", true },
{ "fzhyzamt.com", true },
{ "fzx750.ru", true },
+ { "g-ds.de", true },
{ "g-m-w.eu", true },
- { "g-marketing.ro", true },
{ "g-o.pl", true },
{ "g-p-design.com", true },
{ "g-rom.net", true },
@@ -14110,6 +14600,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "g3rv4.com", true },
{ "g4w.co", true },
{ "g5.gov", true },
+ { "g6666g.tk", true },
{ "g8energysolutions.co.uk", true },
{ "gaaz.fr", true },
{ "gabe565.com", true },
@@ -14127,8 +14618,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gadgethacks.com", true },
{ "gadse.games", true },
{ "gae123.com", true },
+ { "gaengler.com", true },
{ "gaest.com", true },
{ "gaestehaus-monika.com", true },
+ { "gaetanosonline.com", true },
{ "gaff-rig.co.uk", true },
{ "gaflooring.com", true },
{ "gafunds.com", true },
@@ -14137,6 +14630,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gagne.tk", true },
{ "gagnerplusdargent.info", true },
{ "gagniard.org", true },
+ { "gagor.pl", true },
{ "gagygnole.ch", true },
{ "gaiavanderzeyp.com", true },
{ "gaichon.com", true },
@@ -14149,7 +14643,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gakkainavi-epsilon.jp", true },
{ "gakkainavi-epsilon.net", true },
{ "gakkainavi.jp", true },
- { "gakkainavi.net", true },
{ "gakkainavi4.jp", true },
{ "gakkainavi4.net", true },
{ "gaku-architect.com", true },
@@ -14161,11 +14654,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "galanight.cz", true },
{ "galecia.com", true },
{ "galeria42.com", true },
- { "galerieautodirect.com", true },
+ { "galeriart.xyz", true },
{ "galeries.photo", true },
- { "galgopersa.com.br", true },
{ "galilahiskye.com", true },
{ "galileanhome.org", true },
+ { "galilel.cloud", true },
{ "galinas-blog.de", true },
{ "galinos.gr", true },
{ "galle.cz", true },
@@ -14185,39 +14678,47 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gambitnash.co.uk", true },
{ "gambitnash.com", true },
{ "gambitprint.com", true },
+ { "gamblerhealing.com", true },
{ "gamblersgaming.eu", true },
{ "game4less.com", true },
{ "game7.de", true },
+ { "game88city.net", true },
+ { "gameanalytics.com", true },
{ "gameblabla.nl", true },
{ "gamebrott.com", true },
{ "gamecard-shop.nl", true },
+ { "gamechefpummarola.eu", true },
{ "gameclue.jp", true },
{ "gamecollector.be", true },
{ "gameconservation.org.uk", true },
{ "gamedevelopers.pl", true },
{ "gamegix.com", true },
{ "gameguardian.net", true },
+ { "gameindustry.de", true },
{ "gameisbest.jp", true },
{ "gamekaitori.jp", true },
{ "gamekeepers.cz", true },
+ { "gamemodding.com", true },
{ "gamenerd.net", true },
- { "gameofpwnz.com", true },
{ "gamepad.com.br", true },
{ "gameplaysforkids.com", true },
{ "gamepreorders.com", true },
{ "gamequest.info", true },
+ { "gamer-portal.com", true },
{ "gamercredo.com", true },
{ "gamereader.de", true },
{ "gamerezo.com", true },
{ "gamerzdot.com", true },
{ "games4theworld.org", true },
{ "gamesaviour.com", true },
+ { "gamesdepartment.co.uk", true },
{ "gameserver-sponsor.me", true },
{ "gameshowchallenge.ie", true },
{ "gamesided.com", true },
{ "gamesplanet.com", true },
{ "gamesputnik.ru", true },
{ "gamestats.gg", true },
+ { "gametilt.com", true },
{ "gametube.website", true },
{ "gamilab.com", true },
{ "gamilab.no", true },
@@ -14226,7 +14727,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gamingrealms.net", true },
{ "gamingwithcromulent.com", true },
{ "gamingzoneservers.com", true },
+ { "gamisalya.com", true },
{ "gamishou.fr", true },
+ { "gamismodelbaru.com", true },
+ { "gamismu.com", true },
{ "gamivo.com", true },
{ "gamoloco.com", true },
{ "gan.wtf", true },
@@ -14240,14 +14744,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gandgliquors.com", true },
{ "ganggalbichler.at", true },
{ "gansleit.com", false },
- { "ganyouxuan.com", true },
{ "ganztagplus.de", true },
+ { "gao.ci", true },
{ "gao.rocks", true },
+ { "gaojianli.me", true },
{ "gaojianli.tk", true },
{ "gaos.org", true },
{ "gapdirect.com", true },
{ "gapfa.org", true },
- { "gaptek.id", false },
+ { "gaphag.ddns.net", true },
+ { "gar-nich.net", false },
{ "garage-leone.com", true },
{ "garage-meynard.com", true },
{ "garageenginuity.com", true },
@@ -14255,26 +14761,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "garagemhermetica.org", true },
{ "garagevanhulle-used.be", true },
{ "garanteasy.com", true },
- { "garbage-juice.com", true },
{ "garbomuffin.com", true },
{ "garciagerman.com", true },
{ "garda-see.mobi", true },
{ "gardedenfantspourtous.fr", true },
- { "garden-life.org", true },
{ "gardengameshireuk.com", true },
{ "garderobche.eu", true },
{ "gardikagigih.com", true },
{ "gardinte.com", true },
{ "garedtech.com", false },
{ "garethbowker.com", true },
- { "garethkirk.com", true },
- { "garethkirkreviews.com", true },
{ "garethrhugh.es", true },
{ "garforthgolfclub.co.uk", true },
{ "gargazon.net", true },
{ "garron.net", true },
{ "garrowmediallc.com", true },
- { "gartenhauszentrum.de", true },
+ { "gartenbaur.de", true },
{ "gartenplanung-brendes.de", true },
{ "garycarmell.com", true },
{ "garycwaite.com", true },
@@ -14290,6 +14792,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gastrotiger.de", true },
{ "gate2home.com", true },
{ "gateaucreation.fr", true },
+ { "gatekiller.co.uk", true },
{ "gatewaybridal.com", true },
{ "gatewaybronco.com", true },
{ "gauche.com", true },
@@ -14297,16 +14800,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gaudere.co.jp", true },
{ "gaurl.ga", true },
{ "gaussianwaves.com", true },
- { "gautham.it", true },
{ "gauthier.dk", true },
{ "gautvedt.no", true },
{ "gavins.stream", true },
{ "gavinsblog.com", true },
{ "gawrimanecuta.com", true },
- { "gay-jays.com", true },
- { "gaya-sa.org", true },
{ "gayforgenji.com", true },
- { "gayjays.com", true },
{ "gaymerconnect.net", true },
{ "gaymerx.com", true },
{ "gaymerx.net", true },
@@ -14314,7 +14813,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gaysexpositions.guide", true },
{ "gayukai.net", true },
{ "gazachallenge.org", true },
- { "gazee.net", true },
{ "gazellegames.net", false },
{ "gazete.org", true },
{ "gazette.govt.nz", true },
@@ -14326,7 +14824,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gc.gy", true },
{ "gc.ru.net", true },
{ "gcbit.dk", true },
- { "gccm-events.com", true },
{ "gcfadvisors.com", true },
{ "gcgeeks.com.au", true },
{ "gcguild.net", true },
@@ -14343,12 +14840,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gdpr-pohotovost.cz", true },
{ "gdv.me", true },
{ "gdz-spishy.com", true },
+ { "gdz.tv", true },
{ "ge3k.net", false },
{ "gear4you.shop", true },
{ "gearallnews.com", true },
{ "gearev.net", true },
{ "gearfinder.nl", true },
- { "gearseo.com.br", true },
{ "gearset.com", true },
{ "geass.xyz", true },
{ "geba-online.de", true },
@@ -14360,7 +14857,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gecem.org", true },
{ "gechr.io", true },
{ "geckler-ee.de", true },
- { "gedankenworks.com", true },
{ "geder.at", true },
{ "gedlingcastlehire.co.uk", true },
{ "gee.is", true },
@@ -14377,13 +14873,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "geekles.net", true },
{ "geeknik.com", true },
{ "geekpad.com", true },
- { "geeks.one", true },
{ "geekshirts.cz", true },
{ "geektopia.es", true },
{ "geekwhack.org", true },
{ "geekwithabudget.com", true },
{ "geekwu.org", true },
- { "geekystudios.us", true },
{ "geekz.sk", true },
{ "geekzone.co.nz", true },
{ "geekzone.fr", true },
@@ -14422,8 +14916,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gemgroups.in", true },
{ "gemini.com", true },
{ "gemquery.com", true },
+ { "genbright.com", true },
{ "genchev.io", true },
- { "gencmedya.com", false },
+ { "gencmedya.com", true },
{ "genderidentiteit.nl", true },
{ "gendrin.com", true },
{ "gendundrupa.ch", true },
@@ -14460,7 +14955,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "geniuszone.biz", true },
{ "genocidediary.org", true },
{ "genodeftest.de", true },
- { "genome.gov", true },
+ { "genome.gov", false },
{ "genomequestlive.com", true },
{ "genosse-einhorn.de", true },
{ "genoveve.de", true },
@@ -14470,7 +14965,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gensicke.de", true },
{ "genslerapps.com", true },
{ "genslerwisp.com", true },
- { "gensokyo.chat", true },
{ "gensokyo.re", true },
{ "gensonline.eu", true },
{ "gentianes.ch", true },
@@ -14479,12 +14973,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "genusshotel-riegersburg.at", true },
{ "genuxtsg.com", true },
{ "genxnotes.com", true },
- { "geocommunicator.gov", true },
{ "geocompass.at", true },
{ "geofox.org", true },
+ { "geography-schools.com", true },
{ "geoip.fedoraproject.org", true },
{ "geoip.stg.fedoraproject.org", true },
{ "geojs.io", true },
+ { "geology-schools.com", true },
{ "geometra.roma.it", true },
{ "geomex.be", true },
{ "geoponika.gr", true },
@@ -14497,11 +14992,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "georgehalachev.com", true },
{ "georgemaschke.com", true },
{ "georgemaschke.net", true },
+ { "georgepancescu.ro", true },
{ "georgescarryout.com", true },
{ "georgewbushlibrary.gov", true },
{ "georgiaautoglass.net", true },
{ "georgiaglassrepair.com", true },
{ "georgiastuartyoga.co.uk", true },
+ { "georgiaurologist.com", true },
{ "georgioskontaxis.com", true },
{ "georgioskontaxis.net", true },
{ "georgioskontaxis.org", true },
@@ -14517,14 +15014,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gerald-zojer.com", true },
{ "geraldsonrealty.com", true },
{ "gerardinden.nl", true },
+ { "gerardobsd.com", true },
{ "gerardozamudio.mx", true },
{ "gerbyte.co.uk", true },
{ "gerbyte.com", true },
{ "gerbyte.uk", true },
{ "germancraft.net", true },
{ "germandarknes.net", true },
- { "germansoldiers.net", true },
{ "germanssky.de", true },
+ { "germanticz.de", true },
{ "gernert-server.de", true },
{ "gero.io", true },
{ "gerritcodereview.com", true },
@@ -14534,14 +15032,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "geschmacksache.online", true },
{ "geschwinder.net", true },
{ "gesica.cloud", true },
- { "gestorehotel.com", true },
+ { "gessettirotti.it", true },
{ "gestormensajeria.com", true },
{ "gesundheitmassage.com", true },
{ "gesundheitswelt24.de", true },
{ "get-erp.ru", true },
{ "get-it-live.com", true },
{ "get-it-live.de", true },
- { "get-link.info", true },
{ "get-on.bid", true },
{ "get-refer.com", true },
{ "get.how", true },
@@ -14559,6 +15056,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "getdash.io", true },
{ "getdeveloper.de", true },
{ "geteckeld.nl", true },
+ { "geteduroam.no", true },
+ { "getenv.io", true },
{ "geterp.ru", true },
{ "getfedora.org", true },
{ "getfirstalert.com", true },
@@ -14582,7 +15081,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "getoutofdebt.org", true },
{ "getpagespeed.com", true },
{ "getpanelapp.com", true },
- { "getpei.com", true },
+ { "getpei.com", false },
{ "getprivacy.de", true },
{ "getprivacy.net", true },
{ "getpublii.com", true },
@@ -14595,20 +15094,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "getsilknow.com", true },
{ "getsmartaboutdrugs.gov", true },
{ "getsport.mobi", true },
- { "getsubs.net", true },
{ "getswadeshi.com", true },
{ "getteamninja.com", true },
{ "getthefriendsyouwant.com", true },
{ "getticker.com", true },
+ { "gettopquality.com", true },
{ "getts.ro", true },
{ "getupandbounce.co.uk", true },
{ "getvdownloader.com", true },
- { "getweloop.io", true },
{ "getwemap.com", true },
{ "getwisdom.io", true },
{ "getyeflask.com", true },
- { "getyou.onl", true },
+ { "getyou.onl", false },
{ "getyourlifestraight.com", true },
+ { "gevelreinigingtiel.nl", true },
{ "geyduschek.be", true },
{ "gf-franken.de", true },
{ "gf5fcalc.com", true },
@@ -14621,13 +15120,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gfestival.fo", true },
{ "gfk-kunststoff-luebben.de", true },
{ "gflame.de", true },
- { "gflclan.ru", true },
{ "gfms.ru", true },
{ "gfnetfun.cf", true },
{ "gforce.ninja", true },
{ "gfoss.eu", true },
{ "gfournier.ca", true },
- { "gfwno.win", true },
{ "gfxbench.com", true },
{ "ggdcpt.com", true },
{ "gginin.today", true },
@@ -14641,20 +15138,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ggx.us", true },
{ "gha.st", true },
{ "ghaglund.se", true },
- { "ghid-pitesti.ro", true },
{ "ghini.com", true },
{ "ghislainphu.fr", true },
{ "ghostblog.info", false },
{ "ghostcir.com", true },
{ "ghou.me", true },
- { "ghowell.io", true },
{ "ghrelinblocker.info", true },
{ "ghrelinblocker.org", true },
+ { "ghuntley.com", false },
{ "giac.org", true },
{ "giacomodrago.com", true },
{ "giacomodrago.it", true },
{ "giacomopelagatti.it", true },
{ "giaithich.net", true },
+ { "giakki.eu", false },
+ { "giannademartini.com", true },
{ "gianproperties.com", true },
{ "giant-panda.com", true },
{ "giant-tortoise.com", true },
@@ -14666,7 +15164,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gibraltar.at", true },
{ "gichigamigames.com", true },
{ "gicl.dk", true },
- { "giduv.com", true },
{ "giebel.it", true },
{ "gierds.de", true },
{ "giethoorn.com", true },
@@ -14677,6 +15174,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "giftmaniabrilhos.com.br", true },
{ "gifts.best", true },
{ "gifts365.co.uk", true },
+ { "gifudodo.com", true },
{ "gig-raiffeisen.de", true },
{ "giga.nl", true },
{ "gigabitz.pw", true },
@@ -14704,9 +15202,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gillmanandsoame.co.uk", true },
{ "gillyscastles.co.uk", true },
{ "gilmoreid.com.au", true },
+ { "gilmourluna.com", true },
{ "gilnet.be", true },
+ { "gilpinmanagement.com", true },
+ { "gilpinrealty.com", true },
{ "gimme.money", true },
{ "gina-architektur.design", true },
+ { "ginabaum.com", true },
{ "gingersutton.com", true },
{ "ginionusedcars.be", true },
{ "ginja.co.th", true },
@@ -14728,16 +15230,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "girlan.net", true },
{ "girlsforum.com", true },
{ "girlsgenerationgoods.com", true },
- { "girlsgonesporty.com", true },
- { "girlsnet.work", true },
{ "girlz.jp", true },
- { "girsa.org", true },
{ "girvas.ru", true },
{ "gisch.tk", true },
{ "gisgov.be", true },
{ "gisher.news", true },
{ "gisher.org", true },
{ "gisher.video", true },
+ { "gishiko.net", true },
{ "gistr.io", true },
{ "git.ac.cn", true },
{ "git.market", true },
@@ -14754,6 +15254,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gittigidiyor.com", true },
{ "gittr.ch", true },
{ "giuem.com", true },
+ { "giulianosdeli.com", true },
{ "giunchi.net", true },
{ "giuseppemacario.men", true },
{ "givastar.com", true },
@@ -14768,7 +15269,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "givip.eu", true },
{ "gixtools.com", true },
{ "gixtools.net", true },
- { "gizmo.ovh", true },
{ "gj-bochum.de", true },
{ "gjcampbell.co.uk", true },
{ "gjengset.com", true },
@@ -14777,7 +15277,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gkoenig-innenausbau.de", true },
{ "gkralik.eu", true },
{ "gl.search.yahoo.com", false },
+ { "gla-hyperloop.com", true },
{ "glaciernursery.com", true },
+ { "gladiatorboost.com", true },
{ "gladwellentertainments.co.uk", true },
{ "glahcks.com", true },
{ "glamguru.co.il", true },
@@ -14789,8 +15291,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "glasfaser-im-hanseviertel.de", true },
{ "glasgestaltung.biz", true },
{ "glasner.photo", true },
+ { "glaspe.com", true },
{ "glass.google.com", true },
{ "glasschmuck-millefiori.de", true },
+ { "glassexpertswa.com", true },
+ { "glassrainbowtrust.org.je", true },
{ "glasweld.com", true },
{ "glavsudexpertiza.ru", true },
{ "glazedmag.fr", true },
@@ -14798,7 +15303,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gleanview.com", true },
{ "gleich-aluminium-shop.de", true },
{ "glenberviegolfclub.com", true },
- { "glencambria.com", true },
{ "glencarbide.com", true },
{ "glendarraghbouncycastles.co.uk", true },
{ "glenhuntlyapartments.com.au", true },
@@ -14818,25 +15322,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "globalcanineregistry.com", true },
{ "globalchokepoints.org", true },
{ "globalcomix.com", true },
- { "globalgivingtime.com", true },
{ "globalgovernancewatch.org", true },
{ "globalhealth.gov", true },
{ "globalhorses.de", true },
{ "globalipaction.ch", true },
{ "globalisierung-fakten.de", true },
+ { "globalitac.com", true },
{ "globalityinvestment.com", true },
- { "globalmoneyapp.com", true },
{ "globalonetechnology.com", true },
{ "globalprojetores.com.br", true },
{ "globalresearchcouncil.org", true },
- { "globalresistancecorporation.com", true },
{ "globalventil.com", true },
{ "globcoin.io", true },
- { "globeinform.com", true },
{ "globelink-group.com", true },
{ "glocalworks.jp", true },
{ "glofox.com", true },
{ "gloneta.com", false },
+ { "gloning.name", true },
{ "glosiko.com", true },
{ "glossopnorthendafc.co.uk", true },
{ "glotech.co.uk", true },
@@ -14851,11 +15353,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "glutenfreelife.co.nz", true },
{ "glutenfreevr.com", true },
{ "glykofridis.nl", true },
+ { "glyph.ws", true },
{ "glyxins.com", true },
{ "gm-net.jp", true },
{ "gm.search.yahoo.com", false },
+ { "gmacedo.com", true },
{ "gmail.com", false },
- { "gmantra.org", true },
{ "gmbh-kiekin.de", true },
{ "gmc.uy", true },
{ "gmccar.it", true },
@@ -14865,7 +15368,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gmod.de", true },
{ "gmpark.dk", true },
{ "gmpartsdb.com", true },
- { "gmplab.com", true },
+ { "gmslparking.co.uk", true },
{ "gmta.nl", true },
{ "gmtplus.co.za", true },
{ "gmw-hannover.de", true },
@@ -14891,10 +15394,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gnucashtoqif.us", true },
{ "gnunet.org", true },
{ "gnuplus.me", true },
- { "gnwp.eu", true },
- { "gnylf.com", true },
{ "go-dutch.eu", true },
{ "go-embedded.de", true },
+ { "go-propiedades.cl", true },
{ "go-wild.co.uk", true },
{ "go-zh.org", true },
{ "go.microsoft.com", true },
@@ -14905,7 +15407,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "goalongtravels.com", true },
{ "goanalyse.co.uk", true },
{ "goand.run", true },
- { "goatbot.xyz", true },
+ { "goarmy.eu", true },
{ "goatcloud.com", true },
{ "gobarrelroll.com", true },
{ "gobouncy.co.uk", true },
@@ -14915,13 +15417,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gochu.se", true },
{ "gocleanerslondon.co.uk", true },
{ "god-clan.hu", true },
- { "god-esb.com", true },
{ "godaxen.tv", true },
- { "godbo9.cc", true },
- { "godbo9.com", true },
- { "godbo9.net", true },
{ "godclan.hu", true },
- { "godesb.com", true },
{ "godesigner.ru", true },
{ "godofnea.com", true },
{ "godrive.ga", true },
@@ -14929,12 +15426,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "godsofhell.com", true },
{ "godsofhell.de", true },
{ "goededoelkerstkaarten.nl", true },
+ { "goedkopelaptopshardenberg.nl", true },
{ "goedkopeonesies.nl", true },
{ "goedverzekerd.net", true },
{ "goemail.me", true },
{ "goerlitz-zgorzelec.org", true },
{ "goerres2014.de", true },
- { "goesta-hallenbau.de", true },
{ "goetemp.de", true },
{ "goetic.space", true },
{ "goettinger-biergarten.de", true },
@@ -14956,7 +15453,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "goingreen.com.au", true },
{ "gokhankesici.com", true },
{ "gokmenguresci.com", true },
+ { "golang.org", true },
{ "golang.zone", true },
+ { "golangnews.com", true },
{ "gold24.ru", true },
{ "goldcoastasian.com", true },
{ "goldcoasthypnotherapyhypnosis.com.au", true },
@@ -14968,6 +15467,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "goldenhillsoftware.com", true },
{ "goldenhost.ca", true },
{ "goldenplate.com.sg", true },
+ { "goldenruleemail.com", true },
{ "goldfmromania.ro", true },
{ "goldmark.com.au", true },
{ "goldpreisfinder.at", true },
@@ -14985,9 +15485,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gomasy.jp", true },
{ "gomelchat.com", true },
{ "gomena.io", true },
- { "gomiblog.com", true },
{ "gommista.roma.it", true },
{ "gondawa.com", true },
+ { "gondelvaartdwarsgracht.nl", true },
+ { "gongjianwei.com", true },
{ "gongjuhao.com", true },
{ "gonx.dk", true },
{ "goo.gl", true },
@@ -14996,9 +15497,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gooday.life", true },
{ "gooddomainna.me", true },
{ "goodenough.nz", false },
- { "goodfeels.net", true },
{ "goodhealthtv.com", true },
+ { "goodquote.gq", true },
{ "goodshepherdmv.com", true },
+ { "goodth.ink", true },
{ "google", true },
{ "google-analytics.com", true },
{ "googleandroid.cz", true },
@@ -15009,14 +15511,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "goonersworld.co.uk", true },
{ "goooo.info", true },
{ "gootlijsten.nl", true },
+ { "goover.de", true },
+ { "goow.in", true },
{ "goozp.com", true },
{ "gopher.tk", true },
{ "goproallaccess.com", true },
+ { "goproinspectiongroup.com", true },
{ "goquiq.com", true },
- { "gordonobrecht.com", true },
{ "gordonscouts.com.au", true },
- { "gorf.chat", true },
- { "gorf.club", true },
+ { "gorealya.com", true },
{ "gorgias.me", true },
{ "gorky.media", true },
{ "gorn.ch", true },
@@ -15032,8 +15535,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gostaffer.com", true },
{ "gostest.org", false },
{ "gosu.pro", true },
- { "gosuland.org", true },
- { "got-tty.de", true },
{ "goteborgsklassikern.se", true },
{ "gotech.com.eg", false },
{ "gothamlimo.com", true },
@@ -15065,8 +15566,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "govype.com", true },
{ "gowe.wang", false },
{ "gowildrodeo.co.uk", true },
- { "gowin9.com", true },
- { "gowin9.net", true },
{ "gowithflo.de", true },
{ "gozenhost.com", true },
{ "gpcsolutions.fr", true },
@@ -15076,13 +15575,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gplans.us", true },
{ "gpm.ltd", true },
{ "gprs.uk.com", true },
- { "gps.com.br", true },
{ "gpsarena.ro", true },
{ "gpscamera.nl", true },
{ "gpsfix.cz", true },
{ "gpsolarpanels.com", true },
{ "gpsvideocanada.com", true },
+ { "gpureport.cz", true },
{ "gpws.ovh", true },
+ { "gqmstore.com.br", true },
{ "gr.search.yahoo.com", false },
{ "gra2.com", true },
{ "graasp.net", false },
@@ -15106,13 +15606,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "grafcaps.com", true },
{ "graffen.dk", true },
{ "grafmag.pl", true },
- { "grafmurr.de", true },
{ "grafoteka.pl", true },
{ "graft.community", true },
{ "graft.observer", true },
{ "grahamcarruthers.co.za", true },
{ "grahamcluley.com", true },
- { "grahamofthewheels.com", true },
{ "grailians.com", true },
{ "grailify.com", true },
{ "graingert.co.uk", true },
@@ -15120,7 +15618,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gram.tips", true },
{ "gramati.com.br", true },
{ "grammysgrid.com", true },
- { "grana.com", true },
{ "granary-demo.appspot.com", false },
{ "grancellconsulting.com", true },
{ "grandcafecineac.nl", true },
@@ -15131,13 +15628,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "grandcapital.ru", true },
{ "grandcastles.co.uk", true },
{ "grandchene.ch", true },
+ { "grande.coffee", true },
{ "grandefratellonews.com", true },
{ "grandeto.com", true },
{ "grandjunctionbrewing.com", true },
{ "grandmusiccentral.com.au", true },
{ "grandpadusercontent.com", true },
{ "granfort.es", false },
- { "granian.pro", true },
{ "granishe.com", true },
{ "graniteind.com", true },
{ "grannys-stats.com", true },
@@ -15145,14 +15642,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "grantcooper.com", true },
{ "grantdb.ca", true },
{ "grantmorrison.net", true },
+ { "grantplatform.com", true },
+ { "grantsplatform.com", true },
{ "granular.ag", true },
{ "graonatural.com.br", true },
{ "grapee.jp", true },
{ "grapeintentions.com", true },
{ "graphcommons.com", true },
{ "graphene.software", true },
+ { "graphic-schools.com", true },
{ "graphic-shot.com", true },
- { "graphified.nl", false },
{ "graphire.io", true },
{ "grapholio.net", true },
{ "grasmark.com", true },
@@ -15169,6 +15668,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "graumeier.de", true },
{ "gravilink.com", true },
{ "gravitascreative.net", true },
+ { "gravitation.pro", false },
{ "gravitechthai.com", true },
{ "gravity-inc.net", true },
{ "gravityformspdfextended.com", true },
@@ -15176,6 +15676,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "grawe-blog.at", true },
{ "grayclub.co.il", true },
{ "grayhatter.com", true },
+ { "grayiron.io", true },
{ "graymalk.in", true },
{ "grayowlworks.com", true },
{ "grayscale.co", true },
@@ -15185,15 +15686,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "greatagain.gov", true },
{ "greatestwebsiteonearth.com", true },
{ "greatfire.org", true },
+ { "greathairtransplants.com", true },
{ "greatislandarts.ca", true },
{ "greatlakeside.de", true },
- { "greatlengthshairextensionssalon.com", true },
{ "greatlifeinsurancegroup.com", true },
+ { "greatskillchecks.com", true },
{ "greboid.co.uk", true },
{ "greboid.com", true },
{ "greditsoft.com", true },
{ "greedbutt.com", true },
{ "greek.dating", true },
+ { "greeklish.gr", true },
{ "green-attitude.be", true },
{ "green-care.nl", true },
{ "green-light.cf", true },
@@ -15225,15 +15728,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "greensdictofslang.com", true },
{ "greensquare.tk", true },
{ "greenteamtwente.nl", true },
+ { "greenwaylog.net", true },
{ "greenwithdecor.com", true },
{ "greer.ru", true },
{ "greg.red", true },
{ "gregbrimble.com", true },
{ "greger.me", true },
- { "gregmarziomedia.co.za", true },
+ { "gregmarziomedia-dev.com", true },
{ "gregmarziomedia.com", true },
{ "gregmilton.com", true },
- { "gregmilton.org", true },
{ "gregmote.com", true },
{ "grego.pt", true },
{ "gregoirow.be", true },
@@ -15243,6 +15746,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gregorykelleher.com", true },
{ "gregoryrealestategroup.com", true },
{ "gregorywiest.com", true },
+ { "greice.de", true },
{ "grenadiercorps-kaarst.de", true },
{ "grenadiere-kaarst.de", true },
{ "grenadierkorps-kaarst.de", true },
@@ -15257,7 +15761,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "grexx.nl", true },
{ "grey.house", true },
{ "greybeards.ca", true },
- { "greyhash.se", true },
{ "greymattertechs.com", true },
{ "greysky.me", true },
{ "greyskymedia.com", true },
@@ -15284,7 +15787,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "grimneko.de", true },
{ "grimstveit.no", true },
{ "grinnellplans.com", true },
- { "gripnijmegen.rip", true },
{ "grippe-impftermin.de", true },
{ "gritte.ch", true },
{ "grizzlys.com", true },
@@ -15314,6 +15816,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "grothoff.org", true },
{ "grottenthaler.eu", true },
{ "grouchysysadmin.com", true },
+ { "groundlevelup.com", true },
{ "group4layers.net", true },
{ "groupe-neurologique-nord.lu", true },
{ "groupebaillargeon.com", true },
@@ -15324,6 +15827,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "growingallthings.co.uk", true },
{ "growit.events", true },
{ "growy.ch", true },
+ { "grozter.se", true },
{ "grrmmll.com", true },
{ "grsecurity.net", true },
{ "gruble.de", true },
@@ -15341,20 +15845,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "grupomakben.com", true },
{ "grupomedlegal.com", true },
{ "grupoparco.com", true },
- { "gruppoipl.it", true },
+ { "grupoproabienesraices.com.mx", true },
{ "gruselgrotte.com", true },
{ "grusenmeyer.be", true },
+ { "grusig-geil.ch", true },
{ "gruwa.net", true },
- { "grytics.com", true },
{ "gs93.de", true },
{ "gsaj114.net", true },
{ "gscloud.xyz", true },
- { "gsdb.net", true },
{ "gsi-network.com", true },
{ "gsimagebank.co.uk", true },
{ "gslink.me", true },
{ "gsmbrick.com", true },
{ "gsmsecurity.net", true },
+ { "gsoc.se", true },
{ "gsrc.io", true },
{ "gst.name", true },
{ "gst.priv.at", true },
@@ -15376,10 +15880,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gtravers-basketmaker.co.uk", true },
{ "gts-dp.de", true },
{ "gtts.space", true },
+ { "gtxbbs.com", true },
{ "guajars.cl", true },
{ "guanyembadalona.org", true },
{ "guanzhong.ca", true },
{ "guardian360.nl", true },
+ { "guardianportal.us", true },
{ "guardianproject.info", true },
{ "guardiansoftheearth.org", true },
{ "gubagoo.com", true },
@@ -15388,6 +15894,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "guegan.de", true },
{ "guelo.ch", true },
{ "guenthereder.at", true },
+ { "guenthernoack.de", true },
{ "guerard.info", true },
{ "guerrilla.technology", true },
{ "guesthouse-namaste.com", true },
@@ -15417,7 +15924,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "guildbase.de", true },
{ "guildofmusicsupervisors.co.uk", true },
{ "guillaume-briand.fr", true },
- { "guillaumecote.me", true },
+ { "guillaumeperrin.io", true },
{ "guillemaud.me", true },
{ "guim.co.uk", true },
{ "guineapigmustach.es", true },
@@ -15426,19 +15933,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gulchuk.com", true },
{ "gulenbase.no", true },
{ "gulfstream.ru", true },
+ { "gulshankumar.net", true },
{ "gumballs.com", true },
{ "gume4you.com", true },
{ "gumi.ca", true },
{ "gunauc.net", true },
{ "gunceloyunhileleri.com", true },
+ { "gunsofshadowvalley.com", true },
{ "gunwatch.co.uk", true },
{ "gunworld.com.au", true },
{ "guochang.xyz", true },
+ { "guodong.net", true },
{ "guoke.com", true },
{ "guoliang.me", true },
{ "guozeyu.com", true },
{ "gupfen.ch", true },
- { "guphi.net", false },
{ "gurkan.in", true },
{ "gurmel.ru", true },
{ "gurpusmaximus.com", true },
@@ -15449,13 +15958,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gus.host", true },
{ "gustaff.de", true },
{ "gustiaux.com", true },
+ { "gustom.io", true },
{ "gut8er.com.de", true },
{ "gutschein-spezialist.de", true },
{ "gutscheingeiz.de", true },
{ "gutuia.blue", true },
{ "guus-thijssen.nl", true },
{ "guusvandewal.nl", true },
- { "guvernalternativa.ro", true },
{ "guyeskens.be", true },
{ "gv-neumann.de", true },
{ "gv-salto.nl", true },
@@ -15471,6 +15980,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gw2efficiency.com", true },
{ "gw2treasures.com", true },
{ "gw2zone.net", true },
+ { "gwa-verwaltung.de", true },
{ "gwerder.net", true },
{ "gwhois.org", true },
{ "gwrtech.com", true },
@@ -15478,9 +15988,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gxmyqy.net", true },
{ "gyara.moe", true },
{ "gyas.nl", true },
- { "gycis.me", false },
{ "gymagine.ch", true },
{ "gymhero.me", true },
+ { "gymjp.com", true },
{ "gymkirchenfeld.ch", true },
{ "gymnaserenens.ch", true },
{ "gymnasium-farmsen.de", true },
@@ -15497,9 +16007,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "gz-benz.com", true },
{ "gz-bmw.com", true },
{ "gza.jp", true },
- { "gzitech.com", true },
- { "gzitech.net", true },
- { "gzitech.org", true },
{ "gzom.ru", true },
{ "h-jo.net", true },
{ "h-suppo.com", true },
@@ -15520,12 +16027,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "h404bi.com", true },
{ "ha-kunamatata.de", true },
{ "ha3.eu", true },
+ { "ha6.ru", true },
{ "haarlemsesaxofoonschool.nl", true },
{ "haavard.me", true },
{ "haazen.xyz", true },
{ "habarisoft.com", true },
{ "habbig.cc", true },
{ "habbos.es", true },
+ { "haberer.me", true },
{ "habitat-domotique.fr", true },
{ "habr.com", true },
{ "habtium.com", true },
@@ -15538,26 +16047,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hacertest.com", true },
{ "hacettepeteknokent.com.tr", true },
{ "hachre.de", false },
- { "hack.cz", true },
+ { "hack.club", true },
{ "hackademix.net", true },
{ "hackanders.com", true },
- { "hackattack.com", true },
{ "hackbarth.guru", true },
{ "hackbeil.name", true },
{ "hackcraft.net", true },
{ "hackdown.me", true },
{ "hackenkunjeleren.nl", true },
{ "hackenturet.dk", true },
+ { "hacker.club", true },
{ "hacker.holiday", true },
{ "hacker.im", true },
{ "hacker.one", true },
{ "hacker.parts", true },
{ "hacker1.com", true },
{ "hacker101.com", true },
- { "hackerchai.com", true },
- { "hackerco.com", true },
{ "hackereyes.com", true },
{ "hackergateway.com", true },
+ { "hackerlite.xyz", true },
{ "hackernet.se", true },
{ "hackerone-ext-content.com", true },
{ "hackerone-user-content.com", true },
@@ -15577,6 +16085,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hackreone.com", true },
{ "hacksecu.re", true },
{ "hacksoc.co.uk", true },
+ { "hackthissite.org", true },
{ "hacktic.info", true },
{ "hacktivis.me", true },
{ "hackworx.com", false },
@@ -15587,11 +16096,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hadret.sh", true },
{ "hadrons.org", true },
{ "haefligermedia.ch", true },
+ { "haemka.de", true },
{ "haens.li", true },
{ "haerwu.biz", true },
{ "haferman.net", true },
{ "haferman.org", true },
{ "hafniatimes.com", true },
+ { "haggeluring.su", true },
{ "hagiati.gr", true },
{ "hagueaustralia.com.au", true },
{ "haha-raku.com", true },
@@ -15607,7 +16118,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hairraisingphotobooths.co.uk", true },
{ "hairtonic-lab.com", true },
{ "haixihui.cn", true },
- { "haizum.pro", true },
{ "hajekdavid.cz", true },
{ "hajekj.net", true },
{ "hajnzic.at", true },
@@ -15617,7 +16127,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hakase.pw", true },
{ "hakatabijin-mind.com", true },
{ "hake.me", true },
- { "hal-9th.space", true },
+ { "hakkasangroup.com", true },
+ { "hakkasannightclub.com", true },
{ "halacs.hu", true },
{ "halbich.design", true },
{ "haleo.net", true },
@@ -15634,8 +16145,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "halliday.work", true },
{ "halligladen.de", true },
{ "hallmarkestates.ca", true },
+ { "halloweenthings.website", true },
{ "hallucinogen.com", true },
{ "hallucinogens.org", true },
+ { "hallumlaw.com", true },
{ "halo.fr", true },
{ "halongbaybackpackertour.com", true },
{ "haloobaloo.com", true },
@@ -15655,6 +16168,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hampshiretechservices.co.uk", true },
{ "hamsters-uk.org", true },
{ "hana.ondemand.com", true },
+ { "hanakaraku.com", true },
{ "hanashi.eu", true },
{ "hanbing.it", true },
{ "handbrake.fr", true },
@@ -15698,30 +16212,31 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hansvaneijsden.nl", true },
{ "hantse.com", true },
{ "hanu.la", true },
- { "hanxv.pw", true },
{ "hanyibo.com", true },
{ "hanzubon.jp", true },
{ "hao-zhang.com", true },
{ "haogoodair.ca", true },
+ { "haoqi.men", true },
+ { "haorenka.cc", true },
{ "haotown.cn", true },
{ "haozhang.org", true },
+ { "haozhexie.com", true },
{ "haozi.me", true },
{ "hapheemraadssingel.nl", true },
- { "hapijs.cn", true },
- { "hapissl.com", true },
- { "hapivm.com", true },
{ "happndin.com", true },
{ "happy-baby.info", true },
{ "happy-end-shukatsu.com", true },
{ "happyagain.de", true },
{ "happyagain.se", true },
{ "happyandrelaxeddogs.eu", true },
+ { "happybirthdaywisher.com", true },
{ "happybounce.co.uk", true },
{ "happycarb.de", true },
{ "happycoder.net", false },
{ "happydietplan.com", true },
{ "happydoq.ch", true },
{ "happygadget.me", true },
+ { "happyhealthylifestyle.com", true },
{ "happykidscastles.co.uk", true },
{ "happylifestyle.com", true },
{ "happyschnapper.com", true },
@@ -15734,6 +16249,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "harald-pfeiffer.de", true },
{ "harapecorita.com", true },
{ "harbor-light.net", true },
+ { "hardeman.nu", true },
{ "hardenize.com", true },
{ "hardergayporn.com", true },
{ "hardertimes.com", true },
@@ -15746,11 +16262,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hardrain980.com", true },
{ "hardtfrieden.de", true },
{ "hardwareschotte.de", true },
+ { "harekaze.info", true },
{ "haribilalic.com", true },
{ "harilova.fr", true },
{ "harion.fr", true },
{ "harisht.me", false },
- { "haritsa.co.id", true },
{ "harjitbhogal.com", true },
{ "harlor.de", true },
{ "harmfarm.nl", true },
@@ -15764,7 +16280,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "harringtonca.com", true },
{ "harrisonswebsites.com", true },
{ "harrisonvillenaz.org", true },
- { "harry-baker.com", true },
{ "harrymclaren.co.uk", true },
{ "harryphoto.fr", true },
{ "harrysgardengamehire.co.uk", true },
@@ -15780,6 +16295,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "harvestapp.com", true },
{ "harvester.fr", true },
{ "harveyauzorst.com", true },
+ { "harveysautoservice.net", true },
{ "has-no-email-set.de", false },
{ "has.work", true },
{ "hasdf.de", true },
@@ -15789,10 +16305,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hash.army", true },
{ "hash.works", true },
{ "hashcat.net", true },
+ { "hashemian.com", true },
{ "hashes.org", true },
{ "hashi.dk", true },
{ "hashiconf.eu", true },
{ "hashicorp.com", true },
+ { "hashimah.ca", true },
{ "hashimoto-jimusho.com", true },
{ "hashinteractive.com", true },
{ "hashish.net", true },
@@ -15821,6 +16339,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hautaka.com", true },
{ "hautarztzentrum.ch", true },
{ "hauteslatitudes.com", true },
+ { "havarijna-sluzba-bratislava.sk", true },
{ "havasuhomepage.com", true },
{ "havasuinsurance.com", true },
{ "havasutacohacienda.com", true },
@@ -15830,15 +16349,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "haveibeenpwned.com", true },
{ "havellab.de", true },
{ "havelland-obstler.de", true },
+ { "havencyber.com", true },
{ "havenstrategies.com", true },
+ { "havernbenefits.com", true },
{ "haverstack.com", true },
{ "havetherelationshipyouwant.com", true },
{ "hawaar.com", true },
{ "hawaya.com", true },
{ "hawkeyeinsight.com", true },
{ "hawkinsonkiaparts.com", true },
- { "hawkofgeorgia.com", true },
+ { "hawkon.dk", true },
{ "hawksguild.com", true },
+ { "hawksracing.de", true },
{ "haxdroid.com", true },
{ "haxo.nl", false },
{ "hayai.space", true },
@@ -15848,13 +16370,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "haydentomas.com", true },
{ "hayfordoleary.com", true },
{ "haynes-davis.com", true },
- { "hayvid.com", true },
+ { "hayvid.com", false },
+ { "hayzepvp.us", true },
{ "haz.cat", true },
{ "haze.productions", true },
{ "hazeover.com", true },
{ "hazloconlapix.com", true },
{ "hazukilab.com", true },
{ "hb8522.com", true },
+ { "hbcu-colleges.com", true },
{ "hbkonsult.com", true },
{ "hboeck.de", true },
{ "hbpowell.com", true },
@@ -15872,10 +16396,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hdfgroup.org", true },
{ "hdguru.com", true },
{ "hdhoang.space", true },
+ { "hdkandsons.com", true },
{ "hdnastudio.com", true },
+ { "hdritalyphotos.com", true },
{ "hdrsource.com", true },
{ "hdrtranscon.com", true },
{ "hds-lan.de", true },
+ { "hdv.paris", true },
{ "heaaart.com", true },
{ "head.org", true },
{ "head.ru", true },
@@ -15884,13 +16411,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "headshopinternational.com", true },
{ "headshotharp.de", true },
{ "healey.io", true },
+ { "health-and-beauty-news.net", true },
{ "health-booster.com", true },
- { "health-match.com.au", true },
{ "health-plan-news.com", true },
{ "health.gov", true },
{ "health.graphics", true },
+ { "healthand-beautynews.net", true },
+ { "healthandskinbeauty.com", true },
{ "healthcare.gov", false },
{ "healthcultureexpo.com", true },
+ { "healthdata.gov", true },
{ "healtheals.com", true },
{ "healtheffectsofasbestos.com", true },
{ "healthery.com", true },
@@ -15900,8 +16430,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "healthiercompany.com", true },
{ "healthiergenerations.co.uk", true },
{ "healthit.gov", true },
- { "healthlabs.com", true },
- { "healthmatchapp.com", true },
{ "healthplansamerica.org", true },
{ "healthstar-dev.io", true },
{ "healthstar.io", true },
@@ -15914,13 +16442,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "heap.zone", true },
{ "heapkeeper.org", true },
{ "hearmeraw.uk", true },
- { "heart.taxi", true },
+ { "heart.taxi", false },
{ "heartbeat24.de", true },
{ "heartgames.pl", true },
{ "heartlandbiomed.com", true },
{ "heartmdinstitute.com", true },
{ "heartsintrueharmony.com", true },
- { "heartsucker.com", false },
{ "hearttruth.gov", true },
{ "heartview.com.br", true },
{ "heartwoodart.com", true },
@@ -15932,10 +16459,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hearty.ooo", true },
{ "hearty.org.tw", true },
{ "hearty.taipei", true },
+ { "hearty.tw", true },
{ "hearty.us", true },
{ "heartycraft.com", true },
{ "heatershop.co.uk", true },
{ "heatingandairconditioningdallastx.com", true },
+ { "heatingpartswarehouse.co.uk", true },
{ "heavensattic.co.uk", true },
{ "heavensinferno.net", true },
{ "heavyequipments.org", true },
@@ -15943,7 +16472,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hebikhiv.nl", true },
{ "hebingying.cn", true },
{ "hebocon.nl", true },
- { "hebriff.com", true },
{ "hec-espace-entreprise.ch", true },
{ "hec.global", true },
{ "hechamano.es", true },
@@ -15958,8 +16486,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hedweb.co.uk", true },
{ "hedweb.net", true },
{ "hedweb.org", true },
- { "heeler.blue", true },
- { "heeler.red", true },
{ "heello.es", true },
{ "hefengautoparts.com", true },
{ "heftkaufen.de", true },
@@ -15974,6 +16500,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "heidisheroes.org", true },
{ "heijblok.com", true },
{ "heijdel.nl", true },
+ { "heikegastmann.com", true },
{ "heikorichter.name", true },
{ "heiland.io", true },
{ "heiliger-gral.info", true },
@@ -15995,13 +16522,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hejahanif.se", true },
{ "hejianpeng.cn", true },
{ "heka.ai", true },
- { "helber-it-services.de", true },
{ "helden-spielen.de", true },
{ "heldenhalde.de", true },
{ "heldundsexgott.de", true },
{ "heleendebruyne.be", true },
{ "helenaknowledge.com", true },
{ "helencrump.co.uk", true },
+ { "helenekurtz.com", true },
{ "helenelefauconnier.com", true },
{ "helenkellersimulator.org", true },
{ "helfordriversc.co.uk", true },
@@ -16012,10 +16539,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "heliosnet.com", true },
{ "heliosvoting.org", true },
{ "helix.am", true },
+ { "hellenicagora.co.uk", true },
{ "hellersgas.com", true },
{ "helles-koepfchen.de", true },
{ "helloacm.com", true },
- { "hellomouse.net", true },
+ { "hellobrian.me", true },
+ { "helloworldhost.com", false },
{ "hellsgamers.pw", true },
{ "hellsh.com", true },
{ "helmut-a-binser.de", true },
@@ -16026,7 +16555,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "helpscoutdocs.com", true },
{ "helpstarloja.com.br", true },
{ "helsinki.dating", true },
- { "helup.com", true },
{ "helvella.de", true },
{ "hematoonkologia.pl", true },
{ "hemdal.se", true },
@@ -16034,13 +16562,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hen.ne.ke", true },
{ "henchman.io", true },
{ "hendersonvalleyautomotive.co.nz", true },
+ { "hendric.us", false },
{ "hendrik.li", true },
{ "hendrinortier.nl", true },
{ "hendyisaac.com", true },
{ "hengelsportdeal.com", true },
{ "hengstumone.com", true },
{ "henkboelman.com", true },
- { "henkbrink.com", true },
+ { "henke-home.eu", true },
{ "henker.net", true },
{ "henkverlinde.com", false },
{ "henley-computer-repairs.co.uk", true },
@@ -16054,9 +16583,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "henriksen.is", true },
{ "henrikwelk.de", true },
{ "henrilammers.nl", true },
- { "henrock.net", true },
{ "henry.gg", true },
{ "henryphan.com", false },
+ { "henrysautodetail.com", true },
{ "hentaiworld.cc", true },
{ "hentschke-bau.de", true },
{ "hentschke-betonfertigteilwerk.de", true },
@@ -16064,8 +16593,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "henzenhoning.nl", true },
{ "heppler.net", true },
{ "heptafrogs.de", true },
+ { "her25.com", false },
+ { "heracles-hotel.eu", true },
{ "herbal-id.com", true },
- { "herbandpat.org", true },
{ "herberichfamily.com", true },
{ "herbert.io", true },
{ "herbhuang.com", true },
@@ -16086,6 +16616,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "herminghaus24.de", true },
{ "herndl.org", true },
{ "herni-kupony.cz", true },
+ { "hernn.com", true },
{ "herocentral.de", true },
{ "herofil.es", true },
{ "herohirehq.co.uk", true },
@@ -16093,20 +16624,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "heroicpixel.com", true },
{ "heroku.com", true },
{ "herpes-no.com", true },
- { "herr-webdesign.de", true },
{ "herranzramia.com", false },
{ "herrderzeit.de", true },
{ "herrenmuehle-wein.de", true },
+ { "herringboneeats.com", true },
{ "herringsresidence.be", true },
{ "herrkaschke.com", true },
{ "herrsmith.com", true },
{ "herrtxbias.net", false },
- { "herrtxbias.org", true },
{ "hertsbouncycastles.com", true },
{ "hertz.bj", true },
{ "herzbotschaft.de", true },
{ "herzfuersoziales.at", true },
{ "herzig.cc", true },
+ { "herzogglass.com", true },
{ "hesaplama.net", true },
{ "hessen-liebe.de", true },
{ "hestervanderheijden.nl", true },
@@ -16117,8 +16648,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hesyifei.com", true },
{ "hetene.nl", true },
{ "hethely.ch", true },
+ { "hetluisterbos.be", true },
+ { "hetzflix.stream", true },
{ "heute-kaufen.de", true },
{ "heute.training", true },
+ { "heverhagen.rocks", true },
{ "hevertonfreitas.com.br", true },
{ "hex.bz", true },
{ "hex.nl", true },
@@ -16131,13 +16665,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hexicurity.com", true },
{ "hexid.me", true },
{ "hexieshe.com", true },
+ { "hexo.io", false },
{ "hexony.com", true },
{ "hexr.org", true },
+ { "hexstream.net", true },
{ "hexstream.xyz", true },
{ "hexstreamsoft.com", true },
{ "hexxagon.com", true },
- { "heyfringe.com", true },
- { "heyjournal.com", true },
{ "heywood.cloud", true },
{ "hf-tekst.nl", true },
{ "hf51.nl", true },
@@ -16145,7 +16679,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hg.gg", true },
{ "hg.python.org", true },
{ "hgbet.com", true },
- { "hgvnet.de", true },
{ "hgw168.com", true },
{ "hh-medic.com", true },
{ "hh-wolke.dedyn.io", true },
@@ -16160,7 +16693,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hicl.org", true },
{ "hicoria.com", true },
{ "hidbo.de", true },
- { "hiddendepth.ie", false },
+ { "hiddendepth.ie", true },
{ "hiddenhillselectric.com", true },
{ "hiddenhillselectrical.com", true },
{ "hiddenhillselectrician.com", true },
@@ -16176,7 +16709,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "higgsboson.tk", true },
{ "higgstools.org", true },
{ "higherpress.org", true },
- { "highland-webcams.com", true },
{ "highlatitudestravel.com", true },
{ "highlegshop.com", true },
{ "highlevelwoodlands.com", true },
@@ -16188,11 +16720,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hightechbasementsystems.com", true },
{ "highwaytohoell.de", true },
{ "higilopocht.li", true },
+ { "hijackpost.com", true },
+ { "hijoan.com", true },
{ "hikarukujo.com", true },
{ "hike.pics", true },
+ { "hikerone.com", true },
+ { "hikinggearlab.com", true },
{ "hikingguy.com", true },
{ "hilahdih.cz", true },
- { "hilaolu.studio", true },
{ "hilaryhutler.com", true },
{ "hilchenba.ch", true },
{ "hilde.link", true },
@@ -16211,11 +16746,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "himekomi.com", true },
{ "himens.com", true },
{ "hin10.com", true },
+ { "hinata-hidetoshi.com", true },
{ "hintergrundbewegung.de", true },
{ "hinterhofbu.de", true },
{ "hinterposemuckel.de", true },
- { "hiojbk.com", true },
- { "hipi.jp", true },
+ { "hiparish.org", true },
+ { "hiphop.ren", true },
{ "hippies.com.br", true },
{ "hippo.ge", true },
{ "hippomovers.com", true },
@@ -16231,7 +16767,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hiratake.xyz", true },
{ "hire-a-coder.de", true },
{ "hireabouncycastle.net", true },
- { "hirefitness.co.uk", true },
{ "hiresteve.ca", true },
{ "hirevo.eu", true },
{ "hirezzportal.com", true },
@@ -16239,16 +16774,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hiromuogawa.com", true },
{ "hirotaka.org", true },
{ "hirte-digital.de", true },
+ { "hirtzfr.eu", true },
{ "hirzaconsult.ro", true },
{ "hisbrucker.net", true },
{ "hisgifts.com.au", true },
{ "hisingensck.se", true },
- { "hisnet.de", true },
{ "hispanic.dating", true },
{ "histocamp.de", true },
{ "histoire-cite.ch", true },
{ "historia-arte.com", true },
- { "history.google.com", true },
+ { "history-schools.com", true },
+ { "history.google.com", false },
{ "hitandhealth.nl", true },
{ "hiteco.com", true },
{ "hititgunesi-tr.com", true },
@@ -16259,7 +16795,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hitokoto-mania.com", true },
{ "hitokoto.cn", true },
{ "hitomecha.com", true },
- { "hitrek.ml", true },
{ "hitter-lauzon.com", true },
{ "hitter.family", true },
{ "hitterfamily.com", true },
@@ -16267,14 +16802,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hiv.com.tw", true },
{ "hiv.gov", true },
{ "hivatalinfo.hu", true },
- { "hiverlune.net", true },
+ { "hiverlune.net", false },
{ "hiwiki.tk", true },
{ "hiyacar.co.uk", true },
{ "hiyobi.me", true },
+ { "hiyuki2578.net", true },
{ "hizzacked.xxx", true },
{ "hj-mosaiques.be", true },
{ "hj.rs", true },
- { "hj2999.com", true },
{ "hj3455.com", true },
{ "hjartasmarta.se", true },
{ "hjkbm.cn", true },
@@ -16300,7 +16835,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hlidacnajemneho.cz", true },
{ "hlin.cloud", true },
{ "hlinformatics.nl", true },
- { "hloe0xff.ru", true },
{ "hlsmandarincentre.com", true },
{ "hlucas.de", true },
{ "hm773.net", true },
@@ -16311,6 +16845,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hmsseahawk.com", true },
{ "hn.search.yahoo.com", false },
{ "hnfertilizermachine.com", true },
+ { "hnn.net.br", true },
{ "hnonline.sk", true },
{ "hnwebi.com", true },
{ "hnyp.hu", true },
@@ -16326,7 +16861,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hochzeitsfotograf-deinfoto.ch", true },
{ "hochzeitsgezwitscher.de", true },
{ "hochzeitsplanerin-hamburg.de", true },
- { "hockey.academy", true },
{ "hockeyapp.ch", true },
{ "hockeymotion.ch", true },
{ "hodgephotography.com", true },
@@ -16337,6 +16871,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hoepli.it", true },
{ "hoeren.club", true },
{ "hoesnelwasik.nl", true },
+ { "hoeveiligismijn.nl", true },
{ "hoevenstein.nl", true },
{ "hoewler.ch", true },
{ "hoezzi.nl", true },
@@ -16344,18 +16879,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hofapp.de", true },
{ "hofauer.de", true },
{ "hoflerlawfirm.com", true },
- { "hogepad.com", true },
{ "hogl.dk", true },
{ "hogrebe.de", true },
{ "hogwarts.io", true },
{ "hohenleimbach.de", true },
{ "hohm.in", true },
+ { "hoikuen-now.top", true },
{ "hoiquanadida.com", true },
+ { "hoish.in", true },
{ "hoken-wakaru.jp", true },
{ "hokieprivacy.org", true },
{ "hokify.at", true },
{ "hokify.ch", true },
{ "hokify.de", true },
+ { "hokioisecurity.com", true },
{ "holad.de", true },
{ "holboxwhalesharktours.com", true },
{ "holebedeljek.hu", true },
@@ -16378,9 +16915,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "holoxplor.space", true },
{ "holstphoto.com", true },
{ "holvonix.com", true },
+ { "holycrossphl.org", true },
+ { "holycrossverobeach.org", true },
{ "holydragoon.jp", true },
+ { "holyfamilyphilly.org", true },
+ { "holyfamilyrussell.org", true },
+ { "holyghost-church.org", true },
{ "holygrail.games", true },
{ "holyhiphopdatabase.com", true },
+ { "holymartyrschurch.org", true },
+ { "holyspiritpalmyra.com", true },
+ { "holyspiritweb.org", true },
{ "holytransaction.com", true },
{ "holywhite.com", true },
{ "holz.nu", true },
@@ -16392,29 +16937,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "holzundgarten.de", true },
{ "holzvergaser-forum.de", true },
{ "homatism.com", true },
+ { "home-handymen.co.uk", true },
{ "home-insurance-quotes.tk", true },
{ "homeautomated.com", true },
{ "homebasedsalons.com.au", true },
{ "homebodyalberta.com", true },
{ "homecareassociatespa.com", true },
- { "homecarpetcleaning.co.uk", true },
{ "homecheck.gr", true },
{ "homefacialpro.com", false },
{ "homegardeningforum.com", true },
{ "homegardenresort.nl", true },
{ "homegreenmark.com", true },
{ "homehuntertoronto.com", true },
- { "homehunting.pt", true },
{ "homeimagician.com.au", true },
{ "homem-viril.com", true },
{ "homeodynamics.com", true },
- { "homeoesp.org", true },
{ "homeofjones.net", true },
{ "homeogenium.com", true },
{ "homeownersinsurancenevada.com", true },
{ "homeownersinsurancenv.com", true },
{ "homepage.shiga.jp", true },
+ { "homeporn.stream", true },
{ "homeprivate.de", true },
+ { "homes-in-norcal.com", true },
+ { "homes-in-stockton.com", true },
{ "homeseller.com", true },
{ "homeserver-kp.de", true },
{ "homesteadandprepper.com", true },
@@ -16429,6 +16975,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "honda-centrum.cz", true },
{ "hondart.cz", true },
{ "hondenoppasfraneker.nl", true },
+ { "honey.beer", true },
{ "honey.is", true },
{ "honeybadger.io", false },
{ "honeycome.net", true },
@@ -16437,17 +16984,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "honeypot.net", true },
{ "hong.io", true },
{ "hongoi.com", true },
+ { "honkion.net", true },
{ "honovere.de", true },
{ "hontoir.eu", true },
{ "hoodtrader.com", true },
{ "hoofddorp-centraal.nl", true },
{ "hookany.com", true },
+ { "hoooc.com", true },
{ "hooowl.com", true },
{ "hoop.la", true },
{ "hoopertechnicalsolutions.com", true },
{ "hooplessinseattle.com", true },
+ { "hooray.beer", true },
{ "hoorr.com", true },
{ "hootworld.net", false },
+ { "hoovism.com", true },
{ "hoowhen.cn", true },
{ "hopconseils.ch", true },
{ "hopconseils.com", true },
@@ -16455,24 +17006,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hopesanddreams.org.uk", true },
{ "hopla.sg", true },
{ "hoplongtech.com", true },
+ { "hoponmedia.de", true },
{ "hopps.me", true },
{ "hoppyx.com", true },
{ "hor.website", true },
{ "horaceli.com", true },
- { "horackova.info", true },
+ { "horairetrain.fr", true },
{ "hord.ca", true },
{ "horecaapparatuurkobezuijen.nl", true },
{ "horecatiger.eu", true },
+ { "horeco.com", true },
{ "horeizai.net", true },
{ "horizonhomes-samui.com", true },
{ "horizonlawncare.tk", true },
+ { "horizonresourcesinc.com", true },
{ "horizonshypnosis.ca", true },
- { "horkel.cf", true },
{ "horn.co", true },
{ "hornertranslations.com", true },
{ "hornyforhanzo.com", true },
{ "horodance.dk", true },
{ "horrell.ca", true },
+ { "horror-forum.de", true },
{ "horrormovies.gr", true },
{ "horsehunter.co.uk", true },
{ "horstmanshof.eu", true },
@@ -16486,6 +17040,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hosiet.me", true },
{ "hosmussynergie.nl", false },
{ "hospitalhomelottery.org", true },
+ { "hospitality-colleges.com", true },
{ "hostadvice.com", true },
{ "hostarea51.com", true },
{ "hostcoz.com", true },
@@ -16494,6 +17049,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hostedtalkgadget.google.com", true },
{ "hostfission.com", true },
{ "hostgigz.com", true },
+ { "hostico.ro", true },
+ { "hostinecpodlipou.cz", true },
{ "hosting-swiss.ch", true },
{ "hostingactive.it", true },
{ "hostinghelp.guru", true },
@@ -16504,15 +17061,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hostingsolutions.cz", true },
{ "hostix.de", true },
{ "hostma.ma", true },
+ { "hostmark.pl", true },
{ "hostme.co.il", false },
{ "hostmijnpagina.nl", true },
{ "hostmodern.com.au", true },
{ "hosts.cf", true },
{ "hostserv.org", true },
+ { "hostworkz.com", true },
{ "hotcandlestick.com", true },
{ "hotchillibox.com", true },
{ "hotcoin.io", true },
{ "hotdoc.com.au", true },
+ { "hotel-kontorhaus-stralsund.de", true },
+ { "hotel-kontorhaus.de", true },
{ "hotel-kronjuwel.de", true },
{ "hotel-le-vaisseau.ch", true },
{ "hotel-pension-sonnalp.eu", true },
@@ -16525,6 +17086,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hotello.io", true },
{ "hotelmap.com", true },
{ "hotelpostaorvieto.it", true },
+ { "hotelromacuernavaca.com.mx", true },
{ "hotels-insolites.com", true },
{ "hotels3d.com", true },
{ "hotels4teams.com", true },
@@ -16533,8 +17095,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hotelsinncoventry.com", true },
{ "hotelvalena.com", true },
{ "hotelvillaluisa.de", true },
- { "hotesb.com", true },
- { "hotesb.net", true },
{ "hothbricks.com", true },
{ "hotnewhiphop.com", true },
{ "hoto.us", true },
@@ -16563,6 +17123,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "houstoncreditlaw.com", true },
{ "houstontxlocksmiths.com", true },
{ "houtinee.com", true },
+ { "hoverboardbarato.com", true },
{ "how2play.pl", true },
{ "howa-n.net", true },
{ "howardtyson.com", true },
@@ -16581,7 +17142,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "howtogeek.com", true },
{ "howtogeekpro.com", true },
{ "howtogosolar.org", true },
- { "howtoinstall.co", true },
{ "howtolaser.com", true },
{ "howtoteachviolin.com", true },
{ "howtotech.de", true },
@@ -16589,6 +17149,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hp-work.net", true },
{ "hp42.de", true },
{ "hpac-portal.com", true },
+ { "hpage.com", true },
{ "hpbn.co", true },
{ "hpepub.com", true },
{ "hpisavageforum.com", true },
@@ -16609,13 +17170,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hreflang.info", true },
{ "hrjfeedstock.com", true },
{ "hrjfeedstock.org", true },
- { "hrk.io", true },
- { "hrobert.hu", true },
{ "hroling.nl", true },
{ "hroschyk.cz", true },
+ { "hrsa.gov", true },
{ "hrstapps-dev.com", true },
{ "hrtech.shop", true },
- { "hru.gov", true },
{ "hryniewski.net", true },
{ "hryx.net", true },
{ "hs-arbeitsschutz.de", true },
@@ -16651,6 +17210,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "htmlyse.com", true },
{ "htmue.net", true },
{ "htmue.org", true },
+ { "htp2.top", true },
{ "htsure.ma", true },
{ "http2.eu", true },
{ "http2.pro", true },
@@ -16658,6 +17218,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "https.dk", true },
{ "https.jetzt", true },
{ "https4all.org", true },
+ { "httpsalarm.com", true },
{ "httpsecured.net", true },
{ "httpsecurityreport.com", true },
{ "httpsiseasy.com", true },
@@ -16673,26 +17234,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hu8588.com", true },
{ "hu8777.com", true },
{ "hu8bet.com", true },
- { "hua-in.com", true },
- { "hua-in.net", true },
- { "hua-li88.com", true },
- { "hua-li88.net", true },
+ { "hu8hu8.com", true },
{ "huagati.com", true },
{ "huahinpropertylisting.com", true },
+ { "huang-haitao.com", true },
{ "huangjiaint.com", true },
{ "huangjingjing.com", true },
{ "huangliangbo.com", true },
- { "huangting.me", true },
{ "huangzenghao.cn", false },
{ "huaqian.art", true },
{ "huashan.co.uk", true },
- { "huaxueba.com", true },
{ "hub.org.ua", true },
{ "hub385.com", true },
{ "hubapi.com", true },
{ "huber-informatik.de", true },
{ "hubok.net", true },
- { "hubrecht.at", true },
{ "hubspot.com", true },
{ "huchet.me", true },
{ "hudebnibazarmixer.cz", true },
@@ -16708,6 +17264,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hugi.is", true },
{ "hugizrecords.com", true },
{ "huglen.info", true },
+ { "hugo.pro", true },
{ "hugo6.com", true },
{ "hugofs.com", true },
{ "hugolynx.fr", true },
@@ -16715,9 +17272,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "huguesditciles.com", true },
{ "huh.gdn", true },
{ "huh.today", true },
- { "hui-in.com", true },
- { "hui-in.net", true },
- { "huihui.moe", true },
+ { "huininga.com", true },
+ { "huininga.nl", true },
+ { "huininga.org", true },
{ "huirongis.me", true },
{ "huisartsenpraktijkheemraadssingel.nl", true },
{ "huisartsenpraktijksonmezer.nl", true },
@@ -16732,14 +17289,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hulldevs.net", true },
{ "hulpbijmarketing.nl", true },
{ "hulpmiddelenshop.nl", true },
- { "hulsoft.co.uk", true },
{ "human-clone.com", true },
{ "humanenrich.com", true },
{ "humanesources.com", true },
{ "humanity.com", true },
- { "humankode.com", true },
{ "humans.io", true },
{ "humanzee.com", true },
+ { "humass.nl", true },
{ "humblebee.at", true },
{ "humblebee.be", true },
{ "humblebee.bg", true },
@@ -16750,7 +17306,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "humblebee.com.ph", true },
{ "humblebee.cz", true },
{ "humblebee.dk", true },
- { "humblebee.es", true },
{ "humblebee.eu", true },
{ "humblebee.foundation", true },
{ "humblebee.fr", true },
@@ -16766,10 +17321,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "humblebeeshop.ca", true },
{ "humblebeeshop.com.au", true },
{ "humbledot.com", true },
+ { "humboldthomeguide.com", true },
{ "humboldtmfg.com", true },
{ "humeur.de", true },
+ { "humexe.com", true },
{ "hummy.tv", true },
{ "humorcaliente.com", true },
+ { "humorce.com", false },
{ "humpchies.com", true },
{ "humpen.se", true },
{ "humppakone.com", true },
@@ -16778,13 +17336,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hundesport-psvhalle.de", true },
{ "hundeverwaltung.de", true },
{ "hundter.com", true },
+ { "hunger.im", true },
+ { "huniverse.co", true },
{ "hunstoncanoeclub.co.uk", true },
- { "hunter-read.com", true },
{ "hunter.io", true },
+ { "hunterjohnson.io", true },
{ "hunterkehoe.com", true },
{ "huntexpired.com", true },
{ "huntingdonbouncers.co.uk", true },
{ "huntingdonlifesciences.com", true },
+ { "huntshomeinspections.com", false },
{ "huntsmansecurity.com", true },
{ "huntsvillecottage.ca", true },
{ "huonit.com.au", true },
@@ -16794,7 +17355,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "huren.nl", true },
{ "hurleyhomestead.com", true },
{ "huroji.com", true },
- { "hurricanelabs.com", false },
{ "hurtigtinternet.dk", true },
{ "husakbau.at", true },
{ "hushfile.it", true },
@@ -16823,11 +17383,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hwaddress.com", true },
{ "hwag-pb.de", true },
{ "hwinfo.com", true },
+ { "hwlibre.com", true },
{ "hx53.de", true },
{ "hxp.io", true },
{ "hxsf.me", true },
{ "hxying.com", true },
- { "hybridragon.net", true },
{ "hybridworx.com", true },
{ "hybridworx.de", true },
{ "hybridworx.eu", true },
@@ -16841,7 +17401,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hydrazin.pw", true },
{ "hydro17.com", true },
{ "hydroagro.pl", true },
- { "hydrocloud.net", true },
{ "hydrographicsocietybenelux.eu", true },
{ "hydroturbine.info", true },
{ "hydrozone.fr", true },
@@ -16849,12 +17408,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hygo.com", true },
{ "hyk.me", true },
{ "hylemorphica.org", true },
- { "hylians.com", true },
{ "hynek.me", true },
{ "hyparia.fr", true },
{ "hype.ru", true },
{ "hypemgmt.com", true },
{ "hyper-text.org", true },
+ { "hyperactive.am", true },
{ "hyperalgesia.com", true },
{ "hyperautomotive.com.au", true },
{ "hyperbolic-mayonnaise-interceptor.ovh", true },
@@ -16870,9 +17429,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "hypotheekbond.nl", true },
{ "hypothes.is", true },
{ "hypothyroidmom.com", true },
+ { "hysh.jp", true },
+ { "hytzongxuan.com", true },
{ "hyundai.no", true },
{ "hyvanilmankampaamo.fi", true },
{ "hyvinvointineuvoja.fi", true },
+ { "hztgzz.com", true },
{ "i-aloks.ru", true },
{ "i-geld.de", true },
{ "i-hakul.net", true },
@@ -16887,15 +17449,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "i-telligence.de", true },
{ "i-verbi.it", true },
{ "i00.eu", true },
- { "i1314.gdn", false },
+ { "i1314.gdn", true },
{ "i1place.com", true },
{ "i2b.ro", true },
{ "i5y.co.uk", true },
{ "i5y.org", true },
{ "i66.me", true },
{ "i879.com", true },
+ { "i95.me", false },
{ "ia.net", true },
{ "iaco.li", true },
+ { "iacono.com.br", false },
{ "iactu.info", true },
{ "iaeste.no", true },
{ "iaeste.or.jp", true },
@@ -16904,20 +17468,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "iainsimms.co.uk", true },
{ "iainsimms.com", true },
{ "iainsimms.me", true },
+ { "iaitouzi.com", true },
{ "ialis.me", true },
{ "iam.lc", true },
{ "iam.soy", true },
+ { "iamanewme.com", true },
{ "iambozboz.co.uk", true },
{ "iamcarrico.com", true },
+ { "iamcryptoki.com", true },
{ "iamhansen.xyz", true },
+ { "iaminashittymood.today", true },
{ "iamjoshellis.com", true },
{ "iamlbk.com", true },
+ { "iamlizu.com", true },
{ "iamlzh.com", true },
{ "iamtheib.me", true },
{ "iamtonyarthur.com", true },
{ "iamusingtheinter.net", true },
{ "iamwoodbeard.com", true },
- { "ian.sh", true },
+ { "ianbrault.com", true },
{ "iandouglasscott.com", true },
{ "iane-ccs.com", true },
{ "ianix.com", true },
@@ -16937,23 +17506,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ibaq.nl", true },
{ "ibauruapan.com.mx", true },
{ "ibcmed.org", true },
+ { "ibe.de", true },
{ "iberiaversicherungen.com", true },
{ "ibericaderedes.es", true },
{ "ibexcore.com", true },
{ "ibigawamizueco.com", true },
{ "ibin.co", true },
{ "ibiz.mk", true },
+ { "iblackfriday.ro", true },
{ "ibodyiq.com", true },
- { "ibpegasus.tk", true },
{ "ibrainmedicine.org", true },
{ "ibrom.eu", true },
{ "ibstyle.tk", true },
{ "ibuki.run", true },
{ "ibwc.gov", true },
+ { "ibykos.com", true },
{ "ic-lighting.com.au", true },
{ "ic-spares.com", true },
{ "ic3.gov", true },
- { "icabanken.se", true },
{ "icafecash.com", true },
{ "icake.life", true },
{ "icanhasht.ml", true },
@@ -16966,24 +17536,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "icebook.co.uk", true },
{ "icecars.net", true },
{ "icedream.tech", true },
- { "ich-hab-die-schnauze-voll-von-der-suche-nach-ner-kurzen-domain.de", true },
+ { "icetiger.eu", true },
{ "ich-tanke.de", true },
{ "ichasco.com", true },
{ "ichbinein.org", true },
{ "ichbinkeinreh.de", true },
{ "ichmachdas.net", true },
- { "ickerseashop.com", true },
{ "iclart.com", true },
{ "iclinic.ua", true },
{ "icloudlogin.com", true },
{ "icmhd.ch", true },
{ "icmp2018.org", true },
- { "icnsoft.me", true },
- { "icnsoft.org", true },
+ { "icmshoptrend.com", true },
{ "icobench.com", true },
{ "icodeconnect.com", true },
{ "icoh.it", true },
{ "iconomi.net", true },
+ { "icowhitepapers.co", true },
{ "icpc.pp.ua", true },
{ "icq-project.net", true },
{ "icsadviseurs.nl", true },
@@ -17022,9 +17591,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "idealtruss.com.tw", true },
{ "idealwhite.space", true },
{ "ideaman924.com", true },
+ { "ideasenfoto.com", true },
{ "ideashop.com", true },
{ "ideaweb.de", true },
- { "ideiasefinancas.com.br", true },
{ "idenamaislami.com", true },
{ "idensys.nl", false },
{ "ident-clinic.be", true },
@@ -17032,8 +17601,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "identifyme.net", true },
{ "identigraf.center", true },
{ "identity.plus", true },
+ { "identitytheft.gov", true },
{ "idered.net", true },
{ "idesignstudio.de", true },
+ { "idesoft.cloud", true },
+ { "idesoftinnovacion.com", true },
+ { "idesoftinnovacion.es", true },
{ "idexxpublicationportal.com", true },
{ "idfy.io", true },
{ "idgard.de", false },
@@ -17065,6 +17638,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "idubaj.cz", true },
{ "idunno.org", true },
{ "idvl.de", true },
+ { "idxforza.com", true },
{ "ie.search.yahoo.com", false },
{ "iea-annex61.org", true },
{ "ieedes.com", true },
@@ -17079,19 +17653,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ies-italia.it", true },
{ "ieval.ro", true },
{ "iewar.com", true },
- { "ifamily.top", false },
{ "ifangpei.cn", true },
{ "ifangpei.com.cn", true },
{ "ifcfg.jp", true },
{ "ifelse.io", true },
{ "ifengge.cn", true },
{ "ifengge.me", true },
+ { "ifgcdn.com", true },
{ "ifibe.com", true },
{ "ifightsurveillance.com", true },
{ "ifightsurveillance.net", true },
{ "ifightsurveillance.org", true },
{ "ifixe.ch", true },
{ "iflare.de", true },
+ { "ifly.pw", true },
{ "ifort.fr", true },
{ "ifosep.fr", true },
{ "ifoss.me", true },
@@ -17104,7 +17679,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ifxd.bid", true },
{ "ifyou.live", true },
{ "ig.com", true },
- { "igaryhe.io", true },
+ { "iga-semi.jp", true },
{ "igcc.jp", true },
{ "igeh-immo.at", true },
{ "igglabs.com", true },
@@ -17125,6 +17700,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ignet.gov", true },
{ "ignitedlocal.com", true },
{ "ignitedmindz.in", true },
+ { "ignitelocal.com", true },
+ { "igorrealestate.com", true },
{ "igorw.org", true },
{ "igotoffer.com", false },
{ "igrivi.com", true },
@@ -17134,31 +17711,36 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ihacklabs.com", true },
{ "ihakkitekin.com", true },
{ "ihatethissh.it", true },
- { "ihc.im", true },
+ { "ihcr.top", true },
{ "ihkk.net", true },
{ "ihls.stream", true },
{ "ihls.world", true },
- { "ihls.xyz", true },
{ "ihoey.com", true },
{ "ihollaback.org", true },
{ "ihopeit.works", true },
{ "ihostup.net", true },
+ { "ihotel.io", false },
{ "ihrhost.com", true },
{ "ihtdenisjaccard.com", true },
{ "ii74.com", true },
{ "iiit.pl", true },
+ { "iimarckus.org", true },
{ "iiong.com", false },
{ "iirii.com", true },
{ "iix.se", true },
{ "ijm.io", true },
{ "ijohan.nl", true },
+ { "ijr.com", true },
{ "ijsblokjesvormen.nl", true },
{ "ijsclubtilburg.nl", true },
+ { "ijsclubwanneperveen.nl", true },
{ "ijunohana.jp", true },
+ { "ik-life.com", true },
{ "ikachalife.com", true },
{ "ikarate.ru", true },
{ "ikarr.com", true },
{ "ikeacareers.co.uk", true },
+ { "ikebuku.ro", true },
{ "ikebukuro-shame.com", true },
{ "ikedaquotes.org", true },
{ "ikespta.com", true },
@@ -17172,9 +17754,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ikkev.de", true },
{ "ikkoku.de", true },
{ "iklive.org", false },
+ { "ikraenglish.com", true },
{ "ikulist.me", true },
+ { "ikumi.us", true },
+ { "ikuuuu.com", true },
{ "ikvts.de", true },
{ "ikwilthepiratebay.org", true },
+ { "ikxkx.com", true },
{ "ila.fi", true },
{ "ilamparas.at", true },
{ "ilamparas.co.uk", true },
@@ -17188,7 +17774,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ile-sapporo.jp", true },
{ "ileci.de", true },
{ "ilektronika-farmakeia-online.gr", true },
- { "ilemonrain.com", false },
{ "ilhan.name", true },
{ "ilhansubasi.com", true },
{ "iliastsi.net", true },
@@ -17209,24 +17794,32 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "illumed.net", true },
{ "illuminationis.com", true },
{ "illusionephemere.com", true },
+ { "illusionunlimited.com", true },
{ "illustrate.biz", true },
{ "illuxat.com", true },
{ "ilmataat.ee", true },
{ "ilmiobusinessonline.it", true },
{ "ilmiogiardiniere.it", true },
{ "ilmuk.org", false },
+ { "ilonewolfs.com", true },
{ "ilookz.nl", true },
{ "ilove.fish", true },
+ { "ilovequiz.ru", true },
+ { "ilovethiscampsite.com", true },
{ "ilrg.com", true },
- { "iltisim.ch", true },
+ { "iltec-prom.ru", true },
+ { "iltec.ru", true },
{ "ilweb.es", true },
{ "ilya.pp.ua", true },
{ "im-c-shop.com", true },
+ { "im-haus-sonnenschein.de", true },
{ "im2net.com", true },
+ { "im4h.de", true },
+ { "im4h.eu", true },
+ { "im4h.info", true },
{ "im66.net", true },
{ "ima-tourcoing.fr", true },
{ "imacs.org", true },
- { "imadalin.ro", true },
{ "image-drive.de", true },
{ "imagebin.ca", true },
{ "imagefu.com", true },
@@ -17243,6 +17836,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "imanageproducts.co.uk", true },
{ "imanageproducts.uk", true },
{ "imanesdeviaje.com", true },
+ { "imanolbarba.net", true },
+ { "imap2imap.de", true },
{ "imaple.org", true },
{ "imarkethost.co.uk", true },
{ "imask.ml", true },
@@ -17250,6 +17845,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "imbianchino.roma.it", true },
{ "imbushuo.net", true },
{ "imcsx.co", true },
+ { "imdemos.com", true },
+ { "ime.moe", true },
{ "imedes.de", true },
{ "imediafly.com", true },
{ "imedikament.de", true },
@@ -17259,6 +17856,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "imex-dtp.com", true },
{ "imforza.com", true },
{ "img.mg", true },
+ { "img.ovh", true },
{ "imga.ch", true },
{ "imgaa.com", true },
{ "imgbb.com", true },
@@ -17280,10 +17878,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "immatix.xyz", true },
{ "immersion-pictures.com", true },
{ "immersivewebportal.com", true },
+ { "immigrantdad.com", true },
{ "immigrationdirect.com.au", true },
{ "immo-agentur.com", true },
{ "immo-passion.net", true },
- { "immobiliarecapitani.com", true },
{ "immobilien-badlippspringe.de", true },
{ "immobilien-in-istanbul.de", true },
{ "immobilien-zirm.de", true },
@@ -17302,14 +17900,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "impakho.com", true },
{ "impas.se", true },
{ "impelup.com", true },
+ { "impendulo.org", true },
{ "imperdin.com", true },
{ "imperial-legrand.com", true },
{ "imperialmiami.com", true },
+ { "imperiodigital.online", true },
{ "imperiumglass.com.au", true },
{ "imperiumnova.info", true },
{ "impex.com.bd", true },
{ "impiantistica.org", true },
{ "implantologie-dr-loeck.de", true },
+ { "implicitdenial.com", true },
{ "imponet.com.ar", true },
{ "import-shopping.de", true },
{ "importsagt.com", true },
@@ -17322,6 +17923,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "impresa-pulizie.it", true },
{ "impresadipulizie.roma.it", true },
{ "impresaedile.roma.it", true },
+ { "impressivebison.eu", true },
{ "imprimante-3d-store.fr", true },
{ "improfestival.ee", true },
{ "improklinikken.dk", true },
@@ -17369,7 +17971,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "includesubdomains2.preloaded.test", true },
{ "incoherent.ch", true },
{ "incommon.io", true },
- { "incompliance.de", true },
{ "inconcerts.de", true },
{ "incontrixsingle.net", true },
{ "incowrimo.org", true },
@@ -17383,25 +17984,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "independent-operators.com", true },
{ "index-games.com", true },
{ "index-mp3.com", true },
- { "indexyz.me", true },
{ "indiaflowermall.com", true },
{ "indian-elephant.com", true },
- { "indianaantlersupply.com", true },
{ "indianaberry.com", true },
+ { "indianafoundationpros.com", true },
{ "indianapolislocksmithinc.com", true },
{ "indiatrademarkwatch.com", true },
- { "indiawise.co.uk", true },
{ "indiayogastudio.net", true },
{ "indicateurs-flash.fr", true },
+ { "indie.dog", true },
+ { "indiecongdr.it", true },
{ "indiegame.space", true },
{ "indievelopment.nl", true },
{ "indigitalagency.com", true },
{ "indigoinflatables.com", true },
{ "indigosakura.com", true },
- { "indiraactive.com", true },
+ { "indigotreeservice.com", true },
{ "inditip.com", true },
- { "indochina.io", true },
- { "indogerman.de", true },
+ { "indogerman.de", false },
{ "indogermanstartup.com", true },
{ "indogermantrade.de", true },
{ "indoorcomfortteam.com", true },
@@ -17413,6 +18013,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "indust.me", true },
{ "industriafranchini.com", true },
{ "industrialstarter.com", true },
+ { "industriemeister.io", true },
{ "indybay.org", true },
{ "ineardisplay.com", true },
{ "inebula.it", true },
@@ -17421,7 +18022,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "inesfinc.es", true },
{ "inesta.nl", true },
{ "inet.se", true },
- { "inetpub.cn", true },
{ "inetserver.eu", true },
{ "inetsoftware.de", true },
{ "inevitavelbrasil.com.br", true },
@@ -17432,6 +18032,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "inficom.org", true },
{ "infinite.hosting", true },
{ "infinitegroup.info", true },
+ { "infinitescript.com", true },
{ "infinitiofallentownparts.com", true },
{ "infinitiofaugustaparts.com", true },
{ "infinitioflynnwoodparts.com", true },
@@ -17454,6 +18055,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "infmed.com", true },
{ "info-beamer.com", true },
{ "info-d-74.com", true },
+ { "info-screen-usercontent.me", true },
{ "info-screen.me", true },
{ "info-screw.com", true },
{ "infobae.com", true },
@@ -17461,6 +18063,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "infocommsociety.com", true },
{ "infocon.org", true },
{ "infocusvr.net", true },
+ { "infogram.com", true },
{ "infogrfx.com", true },
{ "infomegastore.com", true },
{ "infomir.eu", true },
@@ -17484,32 +18087,46 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "infosimmo.com", true },
{ "infotainworld.com", true },
{ "infotolium.com", true },
+ { "infotrac.net", true },
{ "infotune.nl", true },
{ "infovision-france.com", true },
{ "infoweb.ee", true },
{ "infr.red", true },
{ "infra.land", true },
{ "infra.press", true },
+ { "infrabeep.com", true },
+ { "infraclass.com", true },
+ { "infradrop.com", true },
{ "infrafire.com", true },
{ "infraflip.com", true },
{ "infraflux.com", true },
{ "infrafusion.com", true },
+ { "infralist.com", true },
+ { "inframetro.com", true },
+ { "inframint.com", true },
{ "infranium.com", true },
{ "infranium.eu", true },
{ "infranium.info", true },
{ "infranium.net", true },
{ "infranium.org", true },
{ "infranotes.com", true },
+ { "infranox.com", true },
{ "infrapass.com", true },
+ { "infrapilot.com", true },
+ { "infraping.com", true },
{ "infrapirtis.lt", true },
+ { "infrarank.com", true },
{ "infrarank.net", true },
+ { "infraspin.com", true },
+ { "infratank.com", true },
{ "infrathink.com", true },
+ { "infravideo.com", true },
+ { "infrazine.com", true },
{ "infuzeit.com.au", true },
{ "ing-buero-junk.de", true },
- { "ing89.cc", true },
- { "ing89.com", true },
{ "ingatlanjogaszok.hu", true },
{ "ingatlanneked.hu", true },
+ { "ingatlanrobot.hu", true },
{ "ingber.com", true },
{ "inge-r.nl", true },
{ "ingeeibach.de", true },
@@ -17537,6 +18154,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ink.horse", true },
{ "inkable.com.au", true },
{ "inkeliz.com", true },
+ { "inkhor.se", true },
{ "inkontriamoci.com", true },
{ "inksay.com", true },
{ "inkspire.co.uk", true },
@@ -17547,7 +18165,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "inmaps.xyz", true },
{ "inmatefinancial.com", true },
{ "inmateintake.com", true },
- { "inme.ga", true },
{ "inmobillium.fr", true },
{ "inmoodforsex.com", true },
{ "inmusrv.de", true },
@@ -17555,12 +18172,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "innerlightcrystals.co.uk", true },
{ "innermostparts.org", true },
{ "innersafe.com", true },
+ { "inno.ch", true },
{ "innocenceseekers.net", true },
{ "innohb.com", true },
{ "innolabfribourg.ch", true },
{ "innoloop.com", true },
{ "innophate-security.com", true },
{ "innot.net", true },
+ { "innotas.com", true },
{ "innoteil.com", true },
{ "innovaptor.at", true },
{ "innovaptor.com", true },
@@ -17571,8 +18190,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "innsalzachsingles.de", true },
{ "innwan.com", true },
{ "inoa8.com", true },
+ { "inobun.jp", true },
{ "inovat.ma", true },
- { "inovatec.com", true },
{ "inovatecsystems.com", true },
{ "inpas.co.uk", true },
{ "inquant.de", true },
@@ -17586,8 +18205,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "insertcoins.net", true },
{ "insgesamt.net", true },
{ "inshapenutrition.com.br", true },
+ { "insho.fashion", true },
{ "inside19.com", true },
{ "insideaudit.com", true },
+ { "insidebedroom.com", true },
+ { "insidesolutions.nl", true },
{ "insidethefirewall.tk", true },
{ "insightera.co.th", true },
{ "insighti.com", true },
@@ -17600,8 +18222,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "insolent.ch", true },
{ "insolved.com", true },
{ "insping.com", true },
- { "inspirationalquotesuk.co.uk", true },
+ { "inspired-creations.co.za", true },
{ "inspired-lua.org", true },
+ { "inspiredrealtyinc.com", true },
{ "insrt.uk", true },
{ "insside.net", true },
{ "instafind.nl", true },
@@ -17615,8 +18238,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "instant.io", true },
{ "instantkhabar.com", true },
{ "instava.cz", true },
- { "instawi.com", true },
{ "instead.com.au", true },
+ { "insteagle.com", true },
{ "instela.com", true },
{ "instelikes.com.br", true },
{ "instics.com", true },
@@ -17624,13 +18247,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "instinctiveads.com", true },
{ "institut-confucius-montpellier.org", true },
{ "institutmaupertuis.hopto.org", true },
- { "institutolancaster.com", false },
+ { "institutolancaster.com", true },
{ "instrumart.ru", false },
{ "insult.es", true },
{ "insurance321.com", true },
{ "insureon.com", true },
{ "insurgentsmustdie.com", true },
- { "int-ma.in", true },
{ "intae.it", true },
{ "intafe.co.jp", true },
{ "intal.info", true },
@@ -17643,44 +18265,44 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "integratedintegrations.xyz", true },
{ "integratedmedicalonline.com", true },
{ "integrateur-web-paris.com", true },
- { "integraxor.com.tw", true },
{ "integrity.gov", true },
{ "integrityokc.com", true },
{ "integrityoklahoma.com", true },
{ "integrogroup.com", true },
{ "integromat.com", true },
{ "intelhost.cl", true },
+ { "intelhost.com", true },
{ "intelhost.com.ar", true },
{ "intelhost.com.br", true },
{ "intelhost.com.co", true },
{ "intelhost.com.mx", true },
{ "intelhost.com.pe", true },
- { "intelhost.net", true },
{ "intellar.com", true },
{ "intellectdynamics.com", true },
- { "intelliance.eu", true },
{ "intelligence-explosion.com", true },
{ "intellinetixvibration.com", true },
{ "intelly.nl", true },
{ "intelly365.nl", true },
{ "intencje.pl", true },
+ { "intensifyrsvp.com.au", true },
{ "inter-corporate.com", true },
{ "inter-culinarium.com", true },
{ "interaffairs.com", true },
{ "interaktiva.fi", true },
{ "interasistmen.se", true },
{ "interchangedesign.com", true },
- { "interchanges.io", true },
{ "intercom.com", true },
{ "intercom.io", true },
{ "interessiert-uns.net", true },
{ "interfesse.net", true },
- { "interfloraservices.co.uk", true },
{ "interflores.com.br", true },
+ { "interfug.de", true },
{ "intergozd.si", true },
{ "interiery-waters.cz", true },
{ "interimages.fr", true },
+ { "interior-design-colleges.com", true },
{ "interiordesignsconcept.com", true },
+ { "interiorprofesional.com.ar", true },
{ "interisaudit.com", true },
{ "interlingvo.biz", true },
{ "intermax.nl", true },
@@ -17691,10 +18313,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "international-books.org", true },
{ "international-nash-day.com", true },
{ "internationalfashionjobs.com", true },
+ { "internationalschool.it", true },
{ "internationalschoolnewyork.com", true },
{ "internationaltalento.it", true },
{ "internect.co.za", true },
{ "internet-aukcion.info", true },
+ { "internet-pornografie.de", false },
{ "internet-software.eu", true },
{ "internetaanbieders.eu", true },
{ "internetbank.swedbank.se", true },
@@ -17705,7 +18329,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "internethealthreport.org", true },
{ "internethering.de", true },
{ "internetinhetbuitengebied.nl", true },
- { "internetmarkets.net", true },
{ "internetmuseum.se", true },
{ "internetofdon.gs", true },
{ "internetoffensive.fail", true },
@@ -17714,9 +18337,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "internetpro.me", true },
{ "internetstaff.com", true },
{ "internetzentrale.net", true },
+ { "interpol.gov", true },
{ "interracial.dating", true },
{ "interseller.io", true },
{ "interserved.com", false },
+ { "interstateautomotiveinc.com", true },
{ "intertime.services", true },
{ "interview-suite.com", true },
{ "interways.de", true },
@@ -17739,6 +18364,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "intraobes.com", true },
{ "intrasoft.com.au", true },
{ "intraxia.com", true },
+ { "introverted.ninja", true },
{ "intune.life", true },
{ "intvonline.com", true },
{ "intxt.net", true },
@@ -17748,6 +18374,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "inuyasha-petition.tk", true },
{ "invadelabs.com", true },
{ "invasion.com", true },
+ { "invasivespeciesinfo.gov", true },
{ "invasmani.com", true },
{ "inventaire.ch", true },
{ "inventionsteps.com.au", true },
@@ -17759,6 +18386,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "inventtheworld.com.au", true },
{ "inventum.cloud", true },
{ "inverselink-user-content.com", true },
+ { "inversioneseconomicas.com", true },
{ "investarholding.nl", true },
{ "investigatore.it", true },
{ "investigazionimoretti.it", true },
@@ -17769,6 +18397,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "investorforms.com", true },
{ "investosure.com", true },
{ "investpay.ru", true },
+ { "invinoaustria.com", true },
+ { "invinoaustria.cz", true },
{ "invioinc.com", true },
{ "inviosolutions.com", true },
{ "invisible-college.com", true },
@@ -17793,6 +18423,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "iocheck.com", false },
{ "iochen.com", true },
{ "iocurrents.com", true },
+ { "iodev.nl", true },
{ "iodine.com", true },
{ "ioerror.us", true },
{ "iofort.com", true },
@@ -17802,16 +18433,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "iompost.com", true },
{ "iomstamps.com", true },
{ "ionlabs.kr", true },
- { "ionote.me", true },
{ "ionovia.de", true },
{ "ionx.co.uk", true },
{ "ioover.net", true },
{ "iosartstudios.gr", true },
{ "iosjailbreakiphone.com", true },
{ "ioslo.net", true },
+ { "iosmods.com", true },
{ "iosnoops.com", true },
{ "iossifovlab.com", true },
{ "iostream.by", true },
+ { "iowaent.com", true },
{ "iowaschoolofbeauty.com", true },
{ "ip-blacklist.net", true },
{ "ip-hahn.de", true },
@@ -17831,6 +18463,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ipcareers.net", true },
{ "ipconsulting.se", true },
{ "ipemcomodoro.com.ar", true },
+ { "ipfire.org", true },
{ "ipfirebox.de", true },
{ "ipfs.ink", true },
{ "ipfs.io", true },
@@ -17840,6 +18473,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "iphonote.com", true },
{ "ipintel.io", true },
{ "iplabs.de", true },
+ { "iplantom.com", true },
{ "iplayradio.net", false },
{ "ipleak.net", true },
{ "ipledgeonline.org", false },
@@ -17847,7 +18481,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ipo-times.jp", true },
{ "ipokabu.net", true },
{ "ipomue.com", false },
+ { "ipop.gr", true },
{ "iposm.net", true },
+ { "ipplans.com", true },
{ "ipresent.com", true },
{ "iprim.ru", true },
{ "iproducemusic.com", true },
@@ -17860,6 +18496,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ipty.de", true },
{ "ipura.ch", true },
{ "ipv4.cf", true },
+ { "ipv4.co.il", true },
{ "ipv4.gr", true },
{ "ipv6-adresse.dk", true },
{ "ipv6-handbuch.de", true },
@@ -17873,16 +18510,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ir1s.com", true },
{ "iramellor.com", true },
{ "iran-geo.com", true },
- { "irandp.net", true },
{ "iranian.lgbt", true },
{ "iranianholiday.com", true },
{ "iranjeunesse.com", true },
- { "irasandi.com", true },
{ "irayo.net", true },
{ "irc-results.com", true },
{ "ircmett.de", true },
{ "irdvb.com", true },
- { "iready.ro", true },
{ "ireef.tv", true },
{ "iren.ch", true },
{ "irenekauer.com", true },
@@ -17894,7 +18528,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "iridiumflare.de", true },
{ "iriomote.com", true },
{ "iris-design.info", true },
- { "iris-insa.com", false },
+ { "iris-insa.com", true },
{ "irish.dating", true },
{ "irisjieun.com", true },
{ "irland-firma.com", true },
@@ -17904,7 +18538,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "iroise.ch", true },
{ "ironbelly.pro", true },
{ "ironcarnival.com", true },
- { "irondaleirregulars.com", true },
{ "ironfistdesign.com", true },
{ "ironhide.de", true },
{ "ironpeak.be", true },
@@ -17915,7 +18548,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "is-going-to-rickroll.me", true },
{ "is-sw.net", true },
{ "isaacdgoodman.com", true },
- { "isaackabel.cf", true },
{ "isaackabel.ga", true },
{ "isaackabel.gq", true },
{ "isaackabel.ml", true },
@@ -17935,14 +18567,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "isamiok.com", true },
{ "isara.com", true },
{ "isarklinikum.de", true },
- { "isastylish.com", true },
+ { "isavings.com", true },
{ "isayoga.de", true },
{ "isbc-telecom.ru", true },
{ "isbengrumpy.com", true },
+ { "isc2chapter-cny.org", true },
{ "iscert.org", true },
+ { "isdecolaop.nl", true },
{ "isdn.jp", true },
{ "isecrets.se", true },
{ "iservicio.mx", true },
+ { "iseulde.com", true },
{ "isfff.com", true },
{ "isgp-studies.com", true },
{ "ishamf.com", true },
@@ -17953,7 +18588,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "isil.fi", true },
{ "isimonbrown.co.uk", true },
{ "isincheck.com", true },
- { "isinolsun.com", false },
+ { "isinolsun.com", true },
{ "isistomie.com", true },
{ "isitchristmas.com", true },
{ "isitcoffeetime.com", true },
@@ -17961,16 +18596,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "isitpatchtuesday.com", true },
{ "isitrest.info", true },
{ "isitup.org", true },
+ { "iskanderbroere.nl", true },
{ "iskaron.de", true },
{ "iskaz.rs", true },
- { "iskkk.com", true },
- { "iskkk.net", true },
{ "iskogen.nu", true },
+ { "islam.si", true },
{ "islamonline.net", true },
{ "islandhosting.com", true },
+ { "islavolcan.cl", true },
{ "isletech.net", true },
{ "isliada.org", true },
- { "islief.com", true },
{ "islykaithecutest.cf", true },
{ "islykaithecutest.ml", true },
{ "ismailkarsli.com", true },
@@ -17980,6 +18615,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ismywebsitepenalized.com", true },
{ "isn.cz", true },
{ "iso27032.com", true },
+ { "isocom.eu", true },
{ "isognattori.com", true },
{ "isolta.com", true },
{ "isolta.de", true },
@@ -17996,15 +18632,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ispitrade.com", true },
{ "ispo.com.tw", true },
{ "ispsoft.pro", true },
- { "ispweb.es", true },
{ "isqrl.de", true },
{ "israelbiblicalstudies.com", true },
{ "israelbizreg.com", true },
- { "israkurort.com", true },
{ "isreedyinthe.uk", true },
{ "isreedyinthe.us", true },
{ "isreedyintheuk.com", true },
- { "issa.org.pl", true },
+ { "issa.org.pl", false },
{ "issasfrissa.se", true },
{ "issforum.org", true },
{ "issio.net", true },
@@ -18012,6 +18646,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "issues.email", true },
{ "issuesofconcern.in", true },
{ "ist-intim.de", true },
+ { "istanbul.systems", true },
{ "istdieweltschonuntergegangen.de", true },
{ "isteinbaby.de", true },
{ "istheapplestoredown.com", true },
@@ -18024,6 +18659,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "istheservicedowncanada.com", true },
{ "isthisus.org", true },
{ "isthnew.com", true },
+ { "istogether.com", true },
{ "istore.lt", true },
{ "istorrent.is", true },
{ "istrazivac-istine.com", true },
@@ -18032,11 +18668,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "isuzupartscenter.com", true },
{ "isvbscriptdead.com", true },
{ "isvsecwatch.org", true },
- { "isyu.xyz", true },
{ "isz-berlin.de", true },
{ "isz.no", true },
{ "it-academy.sk", true },
- { "it-enthusiasts.tech", true },
{ "it-faul.de", true },
{ "it-fernau.com", true },
{ "it-jobbank.dk", true },
@@ -18050,6 +18684,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "it-service24.com", true },
{ "it-shamans.de", true },
{ "it-shamans.eu", true },
+ { "it-sysoft.com", true },
{ "it-ti.me", true },
{ "it-world.eu", true },
{ "it.search.yahoo.com", false },
@@ -18067,14 +18702,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "italserrande.it", true },
{ "italyinspires.com", true },
{ "itamservices.nl", true },
+ { "itap.gov", true },
{ "itb-online.co.uk", true },
{ "itblog.pp.ua", true },
{ "itchy.nl", true },
{ "itcko.sk", true },
{ "itdashboard.gov", true },
- { "itds-consulting.com", true },
- { "itds-consulting.cz", true },
- { "itds-consulting.eu", true },
{ "itecor.net", true },
{ "iteecafe.hu", true },
{ "iteha.de", true },
@@ -18089,13 +18722,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "itfix.cz", true },
{ "itforge.nl", true },
{ "itgirls.rs", true },
- { "ithakama.com", true },
{ "ithakama.cz", true },
{ "ithenrik.com", true },
{ "ithinc.net", true },
{ "itikon.com", true },
{ "itis.gov", true },
{ "itis4u.ch", true },
+ { "itjob.ma", true },
{ "itkaufmann.at", true },
{ "itlitera.com", true },
{ "itludens.com", true },
@@ -18104,7 +18737,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "itneeds.tech", true },
{ "itnota.com", true },
{ "itochan.jp", true },
- { "itogoyomi.com", true },
{ "itooky.com", true },
{ "itpro.ua", true },
{ "itraveille.fr", true },
@@ -18132,17 +18764,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "itsok.de", true },
{ "itspartytimeonline.co.uk", true },
{ "itspartytimesweetinflations.com", true },
+ { "itspecialista.eu", true },
{ "itspersonaltraining.nl", true },
{ "itsryan.com", true },
{ "itsstefan.eu", true },
{ "itstatic.tech", true },
+ { "itsuitsyou.co.za", true },
+ { "itsundef.in", true },
+ { "itsv.at", true },
{ "itswincer.com", true },
{ "itzap.com.au", true },
+ { "iurisnow.com", true },
{ "iusedtosmoke.com", true },
{ "iuyos.com", true },
{ "ivact.co.jp", true },
{ "ivanbenito.com", true },
+ { "ivanboi.com", true },
{ "ivancacic.com", false },
+ { "ivanilla.org", true },
{ "ivanmeade.com", true },
{ "ivaoru.org", true },
{ "ivfausland.de", true },
@@ -18162,21 +18801,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ivor.is", true },
{ "ivorvanhese.com", true },
{ "ivorvanhese.nl", true },
+ { "ivoryonsunset.com", true },
{ "ivpn.net", true },
{ "ivre.rocks", true },
+ { "ivsign.net", true },
{ "ivusn.cz", true },
{ "ivvl.ru", true },
- { "ivxv.ee", true },
+ { "ivy-league-colleges.com", true },
{ "iwader.co.uk", true },
{ "iwalton.com", true },
+ { "iwantexchange.com", true },
{ "iwanttoliveinabunker.com", true },
{ "iwch.tk", true },
+ { "iwebolutions.com", true },
{ "iwell.de", true },
{ "iwizerunek.pl", true },
{ "iworos.com", true },
{ "iww.me", true },
+ { "iwyc.cn", true },
{ "ixds.org", true },
- { "ixio.cz", true },
{ "ixnext.de", true },
{ "ixquick-proxy.com", true },
{ "ixquick.co.uk", true },
@@ -18187,6 +18830,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ixquick.info", true },
{ "ixquick.nl", true },
{ "iyassu.com", true },
+ { "iyinolaashafa.com", true },
{ "iyouewo.com", true },
{ "iyuanbao.net", true },
{ "iz8mbw.net", true },
@@ -18207,12 +18851,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "j0s.at", true },
{ "j0s.eu", true },
{ "j15h.nu", true },
- { "j2ee.cz", true },
{ "j2h.de", true },
{ "j3e.de", true },
{ "ja-dyck.de", true },
{ "ja-gps.com.au", true },
- { "ja-publications.agency", false },
{ "ja.md", true },
{ "jaakkohannikainen.fi", true },
{ "jaalits.com", true },
@@ -18227,24 +18869,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jabberzac.org", true },
{ "jaberg-rutschi.ch", true },
{ "jabergrutschi.ch", true },
- { "jability.ovh", true },
{ "jabjab.de", true },
{ "jaccblog.com", true },
{ "jacekowski.org", true },
{ "jacik.cz", true },
+ { "jack2celebrities.com", true },
+ { "jackassofalltrades.org", true },
{ "jackdawphoto.co.uk", true },
{ "jackdelik.de", true },
{ "jackf.me", true },
{ "jackhoodtransportation.com", true },
{ "jackingramnissanparts.com", true },
{ "jackpothappy.com", true },
- { "jackrusselterrier.com.br", true },
{ "jackson-quon.com", true },
{ "jackson.jp", true },
+ { "jacksonhu.com", true },
{ "jacksonvillestation.com", true },
+ { "jacksorrell.com", true },
{ "jacksutton.info", true },
{ "jackyliao123.tk", true },
- { "jackyyf.com", false },
{ "jaco.by", true },
{ "jacobamunch.com", true },
{ "jacobdevans.com", true },
@@ -18253,9 +18896,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jacobian.org", true },
{ "jacobjangles.com", true },
{ "jacobphono.com", true },
+ { "jacobsenarquitetura.com", true },
{ "jacuzziprozone.com", true },
- { "jadara.info", true },
{ "jadchaar.me", true },
+ { "jadopado.com", true },
{ "jaegerlacke.de", true },
{ "jaetech.org", true },
{ "jagbouncycastles.co.uk", true },
@@ -18293,6 +18937,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jakubarbet.eu", true },
{ "jakubboucek.cz", true },
{ "jakubklimek.com", true },
+ { "jakubtopic.cz", true },
{ "jakubvrba.cz", true },
{ "jala.co.jp", true },
{ "jaleo.cn", true },
@@ -18300,15 +18945,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jamaat.hk", true },
{ "jamacha.org", true },
{ "jamalfi.bio", true },
- { "jamberry.com.mx", true },
{ "jamberrynails.co.uk", true },
{ "james-bell.co.uk", true },
{ "james-digital.com", true },
{ "james-loewen.com", true },
{ "jamesachambers.com", true },
{ "jamesaimonetti.com", true },
+ { "jamesandpame.la", true },
{ "jamesbillingham.com", true },
{ "jameschorlton.co.uk", true },
+ { "jamesclark.com", true },
{ "jamesdorf.com", true },
{ "jamesgreenfield.com", true },
{ "jameshemmings.co.uk", true },
@@ -18327,7 +18973,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jamessmith.me.uk", true },
{ "jamesturnerstickley.com", true },
{ "jamhost.org", true },
- { "jamie-read-photography.com", true },
{ "jamie.ie", true },
{ "jamielarter.ca", true },
{ "jamielinux.com", true },
@@ -18354,6 +18999,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jandev.de", true },
{ "janduchene.ch", true },
{ "janehamelgardendesign.co.uk", true },
+ { "janelauhomes.com", true },
{ "janhuelsmann.com", true },
{ "jani.media", true },
{ "janiat.com", true },
@@ -18361,7 +19007,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "janikrabe.com", true },
{ "janjoris.nl", true },
{ "jankoepsel.com", true },
- { "janmg.com", true },
{ "jann.is", true },
{ "jannekekaasjager.nl", true },
{ "jannisfink.de", true },
@@ -18372,17 +19017,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "janssenwigman.nl", true },
{ "janterpstra.eu", true },
{ "jantinaboelens.nl", true },
+ { "janvari.com", true },
+ { "janvaribalint.com", true },
{ "jaot.info", true },
+ { "japanesemotorsports.net", true },
{ "japaniac.de", false },
+ { "japanphilosophy.com", false },
{ "japanwatches.xyz", true },
{ "japon-japan.com", true },
{ "jardin-exotique-rennes.fr", true },
{ "jardinderline.ch", true },
+ { "jardineriaon.com", true },
{ "jardiniersduminotaure.fr", true },
{ "jaredeberle.org", false },
{ "jaredfernandez.com", true },
- { "jaredfraser.com", true },
- { "jarivisual.com", true },
{ "jarniashop.se", true },
{ "jaroku.com", true },
{ "jarondl.net", true },
@@ -18390,27 +19038,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jarrettgraham.com", true },
{ "jarroba.com", true },
{ "jas-team.net", true },
+ { "jashvaidya.com", true },
{ "jasl.works", true },
{ "jasmijnwagenaar.nl", true },
{ "jasminefields.net", true },
{ "jason.re", true },
{ "jasonamorrow.com", true },
{ "jasongerber.ch", true },
- { "jasonian-photo.com", false },
{ "jasonmili.online", true },
{ "jasonsansone.com", true },
{ "jasper.link", true },
{ "jasperhammink.com", true },
{ "jasperhuttenmedia.com", true },
{ "jasperpatterson.me", true },
- { "jastrow.me", true },
{ "jaszbereny-vechta.eu", true },
{ "javalestari.com", true },
{ "javamilk.com", true },
{ "javierburgos.net", true },
+ { "javierlorente.es", true },
+ { "javik.net", true },
+ { "jaxfstk.com", true },
{ "jaxxnet.co.uk", true },
{ "jaxxnet.org", true },
{ "jaycouture.com", true },
+ { "jayf.de", true },
{ "jayfreestone.com", true },
{ "jaymecd.rocks", true },
{ "jayrl.com", true },
@@ -18418,11 +19069,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jaytx.com", true },
{ "jayxon.com", true },
{ "jayxu.com", true },
+ { "jazminguaramato.com", true },
{ "jazz-alliance.com", true },
{ "jazz-alliance.org", true },
{ "jazzanet.com", true },
{ "jazzfeet.co.uk", true },
{ "jazzncheese.com", true },
+ { "jazzy-feet.com", true },
{ "jazzy.id.au", true },
{ "jazzy.pro", true },
{ "jazzysumi.com", true },
@@ -18437,6 +19090,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jcb.com", true },
{ "jcbgolfandcountryclub.com", true },
{ "jci.cc", true },
+ { "jclynne.com", true },
{ "jcra.net", true },
{ "jctf.team", true },
{ "jcwodan.nl", true },
@@ -18475,9 +19129,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jedidiah.eu", false },
{ "jedipedia.net", true },
{ "jediweb.com.au", true },
+ { "jedmud.com", true },
{ "jedwarddurrett.com", true },
{ "jeec.ist", true },
- { "jeepmafia.com", true },
+ { "jeepeg.com", true },
{ "jeffanderson.me", true },
{ "jeffcasavant.com", false },
{ "jeffcloninger.net", true },
@@ -18487,15 +19142,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jeffhaferman.com", true },
{ "jeffmcneill.com", true },
{ "jeffreyhaferman.com", true },
+ { "jeffrhinelander.com", true },
{ "jeffri.me", true },
{ "jeffsanders.com", true },
{ "jefftickle.com", true },
{ "jeffwebb.com", true },
{ "jefrydco.id", true },
+ { "jehovahsays.net", true },
{ "jej.cz", true },
{ "jej.sk", true },
{ "jekhar.com", true },
+ { "jekkt.com", false },
{ "jelena-adeli.com", true },
+ { "jelenkovic.rs", true },
{ "jelewa.de", true },
{ "jell.ie", true },
{ "jelle.pro", true },
@@ -18506,7 +19165,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jelleluteijn.nl", true },
{ "jelleluteijn.pro", true },
{ "jelleraaijmakers.nl", true },
- { "jelleschneiders.com", true },
+ { "jelleschneiders.com", false },
{ "jelly.cz", true },
{ "jellybeanbooks.com.au", true },
{ "jelmer.co.uk", true },
@@ -18517,7 +19176,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jembatankarir.com", true },
{ "jemefaisdesamis.com", true },
{ "jena.space", true },
- { "jennedebleser.com", false },
+ { "jennierobinson.com", true },
{ "jenniferengerwingaantrouwen.nl", true },
{ "jennifermason.eu", true },
{ "jennifersauer.nl", true },
@@ -18535,11 +19194,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jeremycantu.com", true },
{ "jeremycrews.com", true },
{ "jeremynally.com", true },
+ { "jeremyness.com", true },
{ "jeremypaul.me", true },
{ "jeremytcd.com", true },
{ "jericamacmillan.com", true },
+ { "jering.tech", true },
{ "jeroendeneef.com", true },
- { "jeroenensanne.wedding", true },
{ "jeroensangers.com", true },
{ "jerret.de", true },
{ "jerryweb.org", true },
@@ -18561,10 +19221,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jet-stream.fr", true },
{ "jetapi.org", true },
{ "jetbbs.com", true },
- { "jetbrains.pw", true },
- { "jetflex.de", true },
{ "jetkittens.co.uk", true },
- { "jetmirshatri.com", true },
{ "jetsetboyz.net", true },
{ "jetsieswerda.nl", true },
{ "jettlarue.com", true },
@@ -18575,11 +19232,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jexler.net", true },
{ "jf-fotos.de", true },
{ "jfbst.net", true },
- { "jfr.im", true },
+ { "jfmhero.me", true },
{ "jfreitag.de", true },
{ "jfsa.jp", true },
{ "jgid.de", true },
{ "jgke.fi", true },
+ { "jglover.com", true },
+ { "jgoldgroup.com", true },
+ { "jgregory.co.uk", true },
{ "jgwb.de", true },
{ "jgwb.eu", true },
{ "jhalderm.com", true },
@@ -18589,7 +19249,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jhill.de", true },
{ "jhollandtranslations.com", true },
{ "jhuang.me", true },
- { "jhw-profiles.de", true },
{ "jhwestover.com", true },
{ "jiacl.com", true },
{ "jiahao.codes", true },
@@ -18601,23 +19260,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jicaivvip.com", true },
{ "jichi.io", true },
{ "jichi000.win", true },
+ { "jikegu.com", true },
{ "jimbiproducts.com", true },
{ "jimbraaten.com", true },
{ "jimbutlerkiaparts.com", true },
{ "jimdorf.com", true },
{ "jimfranke.com", true },
{ "jimfranke.nl", true },
+ { "jimizhou.xyz", true },
{ "jimmycai.com", false },
{ "jimmycn.com", false },
{ "jimmyroura.ch", true },
{ "jimshaver.net", true },
{ "jimslop.nl", true },
- { "jinancy.fr", true },
{ "jinanshen.com", true },
{ "jinbo123.com", false },
{ "jinbowiki.org", true },
- { "jing-in.com", true },
- { "jing-in.net", true },
{ "jing.su", true },
{ "jingjo.com.au", true },
{ "jinja.ai", true },
@@ -18631,9 +19289,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jirosworld.com", true },
{ "jisai.net.cn", true },
{ "jisha.site", true },
- { "jiveiaktivno.bg", true },
{ "jixun.moe", true },
{ "jiyusu.com", true },
+ { "jjhampton.com", true },
{ "jjj.blog", true },
{ "jjjconnection.com", true },
{ "jjlvk.nl", true },
@@ -18645,7 +19303,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jk-entertainment.biz", true },
{ "jkchocolate.com", true },
{ "jkest.cc", true },
- { "jkinteriorspa.com", true },
{ "jkirsche.com", true },
{ "jkrippen.com", true },
{ "jkyuan.tk", true },
@@ -18653,11 +19310,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jl-dns.nl", true },
{ "jl-exchange.nl", true },
{ "jl-mail.nl", true },
- { "jldp.org", true },
{ "jlink.nl", true },
{ "jlkhosting.com", true },
{ "jloh.codes", true },
{ "jlot.org", true },
+ { "jlpn.eu", true },
+ { "jlpn.nl", true },
{ "jlponsetto.com", true },
{ "jlr-luxembourg.com", true },
{ "jltctech.com", true },
@@ -18670,22 +19328,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jmcataffo.com", true },
{ "jmce.eu", true },
{ "jmcleaning.services", true },
+ { "jmdekker.it", true },
{ "jmedved.com", true },
{ "jmentertainment.co.uk", true },
{ "jmfjltd.com", true },
{ "jmk.hu", true },
+ { "jmorahan.net", true },
{ "jmoreau.ddns.net", true },
- { "jmotion.co.uk", true },
{ "jmpb.hu", true },
- { "jmpmotorsport.co.uk", true },
{ "jmsolodesigns.com", true },
+ { "jmssg.jp", true },
{ "jmvdigital.com", true },
{ "jnjdj.com", true },
{ "jnm-art.com", true },
+ { "jnordell.com", true },
{ "joa-ebert.com", true },
{ "joacimeldre.com", true },
+ { "joanofarcmtcarmel.org", true },
{ "joaoaugusto.net", true },
{ "joaosampaio.com.br", true },
+ { "job-ofertas.info", true },
+ { "job-offer.de", true },
{ "job.biz.tr", true },
{ "jobatus.com.br", true },
{ "jobatus.es", true },
@@ -18698,7 +19361,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jobcorpsy2y.com", true },
{ "jobify.in", true },
{ "jobindex.dk", true },
- { "joblab.com.ua", true },
{ "joblife.co.za", true },
{ "jobmi.com", true },
{ "jobmiplayground.com", true },
@@ -18709,6 +19371,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jobs4sales.ch", true },
{ "jobseekeritalia.it", true },
{ "jobsisbrown.com", true },
+ { "jobsnet.eu", true },
{ "jobsuchmaschine.ch", true },
{ "jobwinner.ch", true },
{ "jobzninja.com", true },
@@ -18719,12 +19382,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jodlajodla.si", true },
{ "joduska.me", true },
{ "jodyboucher.com", false },
+ { "jodyshop.com", true },
{ "joe262.com", true },
{ "joedavison.me", true },
{ "joedinardo.com", true },
{ "joedoyle.us", true },
{ "joefixit.co", true },
- { "joefixit.co.uk", true },
{ "joehenry.co.uk", true },
{ "joejohnson.name", true },
{ "joel.coffee", true },
@@ -18740,6 +19403,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "joelmunch.com", true },
{ "joelnichols.uk", true },
{ "joemotherfuckingjohnson.com", true },
+ { "joepitt.co.uk", false },
+ { "joergschneider.com", true },
{ "joerosca.com", true },
{ "joerss.at", true },
{ "joeskup.com", true },
@@ -18751,11 +19416,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "joeygitalian.com", true },
{ "joeyhoer.com", true },
{ "joeysmith.com", true },
+ { "joeyvanvenrooij.nl", true },
{ "joeyvilaro.com", true },
{ "jogi-server.de", true },
{ "jogorama.com.br", false },
{ "jogwitz.de", true },
- { "johand.io", true },
{ "johanli.com", true },
{ "johannes-bauer.com", true },
{ "johannes-zinke.de", true },
@@ -18765,9 +19430,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "johannesen.tv", true },
{ "johanneskonrad.de", true },
{ "johannespichler.com", false },
+ { "johanpeeters.com", true },
{ "johansf.tech", true },
{ "johego.org", true },
+ { "johnaltamura.com", true },
{ "johnbeil.com", true },
+ { "johnberan.com", true },
{ "johnblackbourn.com", true },
{ "johnbpodcast.com", true },
{ "johncook.ltd.uk", true },
@@ -18781,20 +19449,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "johnmcintosh.pro", true },
{ "johnmh.me", true },
{ "johnmichel.org", true },
- { "johnmorganpartnership.co.uk", true },
{ "johnno.be", true },
{ "johnnybet.com", true },
{ "johnnybsecure.com", true },
{ "johnroach.io", true },
{ "johnroberts.me", true },
{ "johnrockefeller.net", true },
+ { "johnsanchez.io", true },
{ "johnsegovia.com", true },
- { "johnsiu.com", true },
{ "johnsonho.net", true },
{ "johnvanhese.nl", true },
{ "johnyytb.be", true },
{ "joi-dhl.ch", true },
+ { "joinhonor.com", true },
{ "jointotem.com", true },
+ { "joinus-outfits.nl", true },
{ "jojosplaycentreandcafeteria.co.uk", true },
{ "jokedalderup.nl", true },
{ "joker.menu", true },
@@ -18811,22 +19480,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jomofojo.com", true },
{ "jonahperez.com", true },
{ "jonandnoraswedding.com", true },
- { "jonarcher.info", true },
{ "jonas-thelemann.de", true },
{ "jonas-wenk.de", false },
{ "jonaskjodt.com", true },
+ { "jonasled.de", true },
{ "jonaswitmer.ch", true },
{ "jonathan-apps.com", true },
{ "jonathancarter.org", true },
{ "jonathandupree.com", true },
{ "jonathanha.as", true },
{ "jonathanj.nl", true },
+ { "jonathanlara.com", true },
{ "jonathanschle.de", true },
+ { "jonathanselea.se", true },
{ "jonblankenship.com", true },
{ "jondarby.com", true },
{ "jondevin.com", true },
{ "jondowdle.com", true },
- { "jonesopolis.xyz", true },
{ "jonespayne.com", false },
{ "jonferwerda.net", true },
{ "jong030.nl", true },
@@ -18841,12 +19511,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jonnybarnes.uk", true },
{ "jonnystoten.com", true },
{ "jonoalderson.com", true },
+ { "jonola.com", true },
{ "jonpads.com", true },
{ "jonpavelich.com", true },
+ { "jons.org", true },
{ "jonscaife.com", true },
{ "jooksms.com", true },
{ "jooksuratas.ee", true },
{ "joomlant.org", true },
+ { "joompress.biz", true },
{ "joonatoona.me", true },
{ "joostrijneveld.nl", true },
{ "joostvanderlaan.nl", true },
@@ -18854,18 +19527,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "joran.org", true },
{ "jorcus.com", true },
{ "jordan-jungk.de", true },
+ { "jordandevelopment.com", true },
{ "jordanhamilton.me", true },
{ "jordankmportal.com", true },
{ "jordans.co.uk", true },
+ { "jordanscorporatelaw.com", true },
{ "jordanstrustcompany.com", true },
{ "jordhy.com", true },
{ "jorgerosales.org", true },
{ "jorisdalderup.nl", true },
{ "jornalalerta.com.br", true },
+ { "josc.com.au", true },
{ "joscares.com", true },
{ "jose-alexand.re", true },
{ "jose-lesson.com", true },
- { "jose.eti.br", true },
{ "joseetesser.nl", true },
{ "josef-lotz.de", true },
{ "josefjanosec.com", true },
@@ -18875,15 +19550,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "josemikkola.fi", true },
{ "josepbel.com", true },
{ "josephbleroy.com", true },
+ { "josephre.es", false },
{ "josephsniderman.com", true },
{ "josephsniderman.net", true },
{ "josephsniderman.org", true },
{ "josephv.website", true },
- { "josericaurte.com", true },
{ "joshgilson.com", true },
{ "joshgrancell.com", true },
{ "joshharkema.com", true },
{ "joshharmon.me", true },
+ { "joshics.in", true },
{ "joshlovephotography.co.uk", true },
{ "joshpanter.com", true },
{ "joshrickert.com", true },
@@ -18894,17 +19570,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "joshua.bio", true },
{ "joshuadmiller.info", true },
{ "joshuajohnson.ca", true },
+ { "joshuameunier.com", true },
{ "joshuarogers.net", true },
{ "josoansi.de", true },
- { "jouetspetitechanson.com", true },
+ { "journalism-schools.com", true },
{ "journeyfriday.rocks", true },
{ "journeytomastery.net", true },
+ { "jouwpaardenbak.nl", true },
{ "jovani.com", false },
{ "jovic.hamburg", true },
{ "joyceseamone.com", true },
{ "joyful.house", true },
{ "joyfulexpressions.gallery", true },
+ { "joynadvisors.com", true },
{ "joyofcookingandbaking.com", true },
+ { "joysinventingblog.com", true },
{ "jpdeharenne.be", true },
{ "jpeg.io", true },
{ "jphandjob.com", true },
@@ -18912,6 +19592,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jpmelos.com", true },
{ "jpmelos.com.br", true },
{ "jpod.cc", true },
+ { "jpralves.net", true },
{ "jps-selection.co.uk", true },
{ "jps-selection.com", true },
{ "jps-selection.eu", true },
@@ -18930,6 +19611,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jross.me", true },
{ "jrtapsell.co.uk", true },
{ "jrxpress.com", true },
+ { "js-web.eu", true },
+ { "js3311.com", true },
+ { "js8855.com", true },
{ "js93029.com", true },
{ "jschoi.org", true },
{ "jschumacher.info", true },
@@ -18940,15 +19624,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jsevilleja.org", true },
{ "jskier.com", false },
{ "jskoelliken.ch", true },
- { "jslidong.top", true },
{ "jsmetallerie.fr", true },
{ "jsnfwlr.com", true },
{ "jsnfwlr.io", true },
+ { "jsonsinc.com", true },
{ "jsteward.moe", true },
{ "jstore.ch", true },
- { "jsuse.xyz", true },
- { "jsxc.ch", true },
{ "jtcat.com", true },
+ { "jtcjewelry.com", true },
+ { "jtconsultancy.sg", true },
{ "jthackery.com", false },
{ "jtl-software.com", true },
{ "jtmar.me", true },
@@ -18959,21 +19643,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "juan23.edu.uy", true },
{ "juanfrancisco.tech", true },
{ "juanmaguitar.com", true },
+ { "juanmazzetti.com", true },
{ "juanxt.ddns.net", true },
+ { "jubee.nl", true },
{ "jubileum.online", true },
+ { "jubileumfotograaf.nl", true },
{ "jucca-nautica.si", true },
{ "juch.cc", true },
{ "juchit.at", true },
{ "jucktehkeinen.de", true },
{ "judc-ge.ch", true },
+ { "judge2020.com", true },
{ "judge2020.me", true },
{ "judoprodeti.cz", true },
{ "judosaintdenis.fr", true },
+ { "judytka.cz", true },
{ "juef.space", true },
{ "juegosycodigos.es", true },
{ "juegosycodigos.mx", true },
{ "juelda.com", true },
{ "juergen-elbert.de", true },
+ { "juergen-roehrig.de", true },
{ "juergenhecht.de", true },
{ "juergenklieber.de", true },
{ "juergenspecht.com", true },
@@ -18984,8 +19674,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jugh.de", true },
{ "juhakoho.com", true },
{ "juice.codes", true },
- { "juk.life", true },
+ { "juk.life", false },
+ { "juku-info.top", true },
+ { "juku-wing.jp", true },
{ "jule-spil.dk", true },
+ { "julenlanda.com", false },
{ "julian-uphoff.de", true },
{ "julian-weigle.de", true },
{ "juliangonggrijp.com", true },
@@ -19008,6 +19701,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "juliemaurel.fr", true },
{ "julienc.io", true },
{ "julienpaterne.com", true },
+ { "julienschmidt.com", true },
{ "julientartarin.com", true },
{ "julius-zoellner.de", true },
{ "jullensgroningen.com", true },
@@ -19032,6 +19726,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jumpingjacksbouncycastles.co.uk", true },
{ "jumpinjaes.co.uk", true },
{ "jumpinmonkeys.co.uk", true },
+ { "jumpintogreenerpastures.com", true },
{ "jumpnplay.co.uk", true },
{ "junespina.com", true },
{ "junethack.net", true },
@@ -19050,6 +19745,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "junjun-web.net", true },
{ "junkdrome.org", true },
{ "juno.co.uk", true },
+ { "junoaroma.com", true },
{ "junodownload.com", true },
{ "jura-reiseschutz.de", true },
{ "jurassicbarkharrogate.co.uk", true },
@@ -19058,11 +19754,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jurijbuga.de", true },
{ "jurisprudent.by", true },
{ "juristas.com.br", true },
- { "jurriaan.ninja", true },
{ "just-a-clanpage.de", true },
{ "just-vet-and-drive.fr", true },
{ "justanothercompany.name", true },
- { "justbelieverecovery.com", true },
{ "justbelieverecoverypa.com", true },
{ "justbookexcursions.com", true },
{ "justbookhotels.com", true },
@@ -19075,12 +19769,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "justice.gov", true },
{ "justice4assange.com", true },
{ "justin-tech.com", false },
- { "justinellingwood.com", true },
{ "justinharrison.ca", true },
{ "justinho.com", true },
{ "justinstandring.com", true },
{ "justmensgloves.com", true },
{ "justpaste.it", true },
+ { "justsmart.io", true },
+ { "justsome.info", true },
+ { "justtalk.site", true },
{ "justthinktwice.gov", true },
{ "justupdate.me", true },
{ "justyy.com", true },
@@ -19091,12 +19787,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "juusujanar.eu", true },
{ "juvenex.co", true },
{ "juwelierstoopman.nl", true },
- { "juzgalo.com", true },
{ "jva-wuerzburg.de", true },
{ "jvandenbroeck.com", true },
{ "jvanerp.nl", true },
{ "jvbouncycastlehire.co.uk", true },
{ "jvega.me", true },
+ { "jvlandscapingservices.com", true },
{ "jvphotoboothhire.co.uk", true },
{ "jvsticker.com", true },
{ "jvwdev.nl", true },
@@ -19107,8 +19803,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jwjwjw.com", true },
{ "jwmmarketing.com", true },
{ "jwnotifier.org", true },
+ { "jwod.gov", true },
{ "jwschuepfheim.ch", true },
{ "jwsoft.nl", true },
+ { "jxltom.com", true },
+ { "jxm.in", true },
{ "jydemarked.dk", true },
{ "jyggen.com", true },
{ "jym.fit", true },
@@ -19116,12 +19815,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "jzachpearson.com", true },
{ "jzbk.org", true },
{ "jzcapital.co", true },
+ { "k-bone.com", true },
{ "k-homes.net", true },
{ "k-netz.de", true },
{ "k-pan.com", true },
{ "k-plant.com", true },
{ "k-pture.com", true },
{ "k-scr.me", true },
+ { "k-system.de", true },
{ "k-tube.com", true },
{ "k258059.net", true },
{ "k2mts.org", true },
@@ -19142,11 +19843,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kaatha-kamrater.se", true },
{ "kab-s.de", true },
{ "kabaca.design", true },
+ { "kabarlinux.id", true },
{ "kabashop.com.br", true },
+ { "kabat-fans.cz", true },
{ "kabeltv.co.nz", true },
{ "kabeuchi.com", true },
{ "kaboom.pw", true },
{ "kabu-abc.com", true },
+ { "kabulpress.org", true },
+ { "kabus.org", true },
+ { "kacgal.com", true },
{ "kachelfm.nl", true },
{ "kachlikova2.cz", true },
{ "kack.website", true },
@@ -19158,13 +19864,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kagitreklam.com", true },
{ "kaheim.de", true },
{ "kai-ratzeburg.de", true },
- { "kaibol.com", true },
{ "kaigojj.com", true },
{ "kaikei7.com", true },
{ "kaileymslusser.com", true },
{ "kairion.de", false },
{ "kaisakura.net", true },
- { "kaisev.net", true },
+ { "kaisev.net", false },
{ "kaitol.click", true },
{ "kaiusaltd.com", true },
{ "kaivac-emea.com", true },
@@ -19174,11 +19879,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kaka.farm", true },
{ "kakao-karten.de", true },
{ "kakaravaara.fi", true },
+ { "kakie-gobocha.jp", true },
{ "kakie-kolesa.ru", true },
{ "kakolightingmuseum.or.jp", true },
- { "kakoo-media.nl", true },
- { "kakoo.nl", true },
- { "kakoomedia.nl", true },
{ "kakuto.me", true },
{ "kalakarclub.com", true },
{ "kalamos-psychiatrie.be", true },
@@ -19189,6 +19892,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kaliaa.fi", true },
{ "kalian.cz", true },
{ "kalifornien-tourismus.de", true },
+ { "kalkulacka-havarijni.cz", true },
{ "kall.is", true },
{ "kallies-net.de", true },
{ "kalmar.com", true },
@@ -19199,11 +19903,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kalterersee.ch", true },
{ "kalwestelectric.com", true },
{ "kam-serwis.pl", true },
- { "kamatajisyaku.tokyo.jp", true },
{ "kamikaichimaru.com", false },
+ { "kamikatse.net", true },
{ "kaminbau-laub.de", true },
{ "kamixa.se", true },
{ "kamppailusali.fi", true },
+ { "kamranmirhazar.com", true },
{ "kamui.co.uk", true },
{ "kan3.de", true },
{ "kana-mono.biz", true },
@@ -19216,7 +19921,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kandianshang.com", true },
{ "kanecastles.com", true },
{ "kanehusky.com", true },
- { "kaneisdi.com", true },
{ "kanetix.ca", true },
{ "kangaroo-bouncycastle.co.uk", true },
{ "kangarooislandholidayaccommodation.com.au", true },
@@ -19239,33 +19943,34 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kantanmt.com", true },
{ "kantorkita.net", true },
{ "kantorosobisty.pl", true },
+ { "kanuvu.de", true },
{ "kany.me", false },
- { "kanzakiranko.jp", true },
{ "kanzashi.com", true },
+ { "kanzlei-gaengler.de", true },
{ "kanzlei-myca.de", true },
{ "kanzlei-oehler.com", true },
{ "kanzlei-sixt.de", true },
- { "kanzshop.com", true },
+ { "kaotik4266.com", true },
{ "kap-genial.de", true },
{ "kapgy-moto.com", true },
- { "kapiorr.duckdns.org", true },
{ "kappenstein.org", true },
{ "kapseli.net", true },
{ "kaptadata.com", true },
{ "kaptamedia.com", true },
{ "kara-fabian.com", true },
+ { "kara-fabian.de", true },
{ "karabas.com", true },
{ "karabijnhaken.nl", false },
{ "karachi.dating", true },
- { "karaface.com", true },
{ "karalane.com", true },
+ { "karamomo.net", true },
{ "karanjthakkar.com", true },
- { "karanlyons.com", true },
{ "karasik.by", true },
{ "karateka.org", true },
{ "karateka.ru", true },
{ "kardize24.pl", true },
{ "karenledger.ca", true },
+ { "karewan.ovh", true },
{ "kargl.net", true },
{ "karguine.in", true },
{ "karhm.com", true },
@@ -19274,12 +19979,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "karlbowden.com", true },
{ "karlic.net", true },
{ "karlin.run", true },
- { "karlis-kavacis.id.lv", true },
{ "karlloch.de", true },
{ "karlsmithmn.org", true },
{ "karlzotter.com", true },
{ "karmaassurance.ca", true },
{ "karmabaker.com", true },
+ { "karmaflux.com", true },
{ "karmainsurance.ca", true },
{ "karmaplatform.com", true },
{ "karmaspa.se", true },
@@ -19309,7 +20014,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kastorsky.ru", true },
{ "kat.marketing", true },
{ "katagena.com", true },
+ { "katalogbajugamismu.com", true },
{ "katalogbutikker.dk", true },
+ { "katata-kango.ac.jp", true },
{ "katcleaning.com.au", true },
{ "katedra.de", true },
{ "kateduggan.net", true },
@@ -19319,7 +20026,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "katex.org", true },
{ "kathardt.de", true },
{ "kathegiraldo.com", true },
- { "kathrinbaumannphotography.com", true },
{ "kati-raumplaner.de", true },
{ "katiechai.xyz", true },
{ "katieskandy.co.uk", true },
@@ -19334,20 +20040,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "katyusha.net", false },
{ "katzenbrunnen-test.de", true },
{ "katzensklave.me", true },
- { "katzspeech.com", true },
+ { "katzrkool.xyz", true },
{ "kau-boys.com", true },
{ "kau-boys.de", true },
{ "kaufberatung.community", true },
+ { "kaufmanbankruptcylaw.com", true },
{ "kaverti.com", true },
- { "kavik.no", true },
{ "kavovary-kava.cz", true },
{ "kawaii.io", true },
+ { "kawaiii.link", true },
{ "kaweus.de", true },
{ "kay.la", true },
{ "kayakabovegroundswimmingpools.com", true },
- { "kayleen.net", true },
{ "kayscs.com", true },
- { "kaysis.gov.tr", true },
{ "kazakov.lt", true },
{ "kazand.lt", true },
{ "kazandaemon.ru", true },
@@ -19358,7 +20063,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kazumi.ooo", true },
{ "kazumi.ro", true },
{ "kazy111.info", true },
- { "kb3.net", true },
+ { "kb88.com", true },
{ "kba-online.de", true },
{ "kbb-ev.de", true },
{ "kbbouncycastlehire.co.uk", true },
@@ -19371,10 +20076,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kc1hbk.com", true },
{ "kc3.moe", true },
{ "kc5mpk.com", true },
+ { "kcc.sh", true },
{ "kcliner.com", true },
{ "kcmicapital.com", true },
{ "kcolford.com", false },
- { "kcptun.com", true },
{ "kcshipping.co.uk", true },
{ "kcsordparticipation.org", true },
{ "kd.net.nz", true },
@@ -19394,6 +20099,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kecht.at", true },
{ "kedarastudios.com", true },
{ "kedibizworx.com", true },
+ { "kediri.win", true },
{ "keditor.biz", true },
{ "kedv.es", true },
{ "keeleysam.com", true },
@@ -19406,11 +20112,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "keepersecurity.com", true },
{ "keepingtheplot.co.uk", true },
{ "keepiteasy.eu", true },
+ { "keevitaja.com", true },
{ "keeweb.info", true },
{ "keezin.ga", true },
{ "keganthorrez.com", true },
{ "kehlenbach.net", true },
- { "keifel.de", true },
{ "kein-design.de", true },
{ "keinanung.nl", true },
{ "keinefilterblase.de", true },
@@ -19449,7 +20155,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kengilmour.com", true },
{ "kenguntokku.jp", true },
{ "kenia-vakantie.nl", true },
- { "kennedy.ie", true },
{ "kenners.org", true },
{ "kennethaasan.no", true },
{ "kennethferguson.com", true },
@@ -19462,6 +20167,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kenrogers.co", false },
{ "kens.pics", true },
{ "kensbouncycastles.co.uk", true },
+ { "kenscustomfloors.com", true },
+ { "kensparkesphotography.com", true },
{ "kentec.net", true },
{ "kenterlis.gr", true },
{ "kenvix.com", true },
@@ -19470,7 +20177,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "keops-spine.fr", true },
{ "keops-spine.us", true },
{ "kepkonyvtar.hu", true },
- { "keralit.nl", true },
{ "kerebro.com", true },
{ "kerem.xyz", true },
{ "kerforhome.com", true },
@@ -19478,6 +20184,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kernel-error.de", true },
{ "kernelpanics.nl", true },
{ "kerrfrequencycombs.org", true },
+ { "kerrnel.com", true },
{ "kersbergen.nl", true },
{ "kersmexico.com", true },
{ "kerstkaart.nl", true },
@@ -19486,30 +20193,34 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kerzyte.net", true },
{ "kescher.site", true },
{ "kessawear.com", true },
- { "kessel-runners.com", true },
{ "kesslerwine.com", true },
{ "kesteren.org", true },
{ "ketamine.co.uk", true },
+ { "ketaminecareclinic.com", true },
{ "ketosecology.co.uk", true },
{ "ketotadka.com", true },
{ "kettlebellkrusher.com", true },
{ "kettner.com", true },
{ "ketty-voyance.com", true },
{ "keutel.net", true },
+ { "kevin-darmor.eu", true },
{ "kevinapease.com", true },
{ "kevinbowers.me", true },
{ "kevinbusse.de", true },
{ "kevincox.ca", false },
+ { "kevingsky.com", true },
+ { "kevinheslinphoto.com", true },
{ "kevinhill.nl", true },
{ "kevinhq.com", true },
{ "kevinkla.es", true },
{ "kevinlocke.name", true },
{ "kevinmeijer.nl", true },
{ "kevinmorssink.nl", true },
- { "kevinpirnie.com", true },
+ { "kevinpirnie.com", false },
{ "kevinrandles.com", true },
{ "kevinratcliff.com", true },
{ "kevyn.lu", true },
+ { "kexueboy.com", true },
{ "keybase.io", true },
{ "keybored.co", true },
{ "keybored.me", true },
@@ -19523,9 +20234,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "keyihao.cn", true },
{ "keyinfo.io", true },
{ "keylaserinstitute.com", true },
+ { "keylength.com", true },
{ "keymach.com", true },
{ "keypersonins.com", true },
{ "keys.fedoraproject.org", true },
+ { "keys.jp", true },
{ "keystoneok.com", false },
{ "keysupport.org", true },
{ "keywebdesign.nl", true },
@@ -19547,16 +20260,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "khetzal.info", true },
{ "khipu.com", true },
{ "khlee.net", true },
- { "khmb.ru", true },
+ { "khmb.ru", false },
{ "khoury-dulla.ch", true },
{ "khs1994.com", true },
+ { "khslaw.com", true },
{ "khudothiswanpark.vn", true },
+ { "khushiandjoel.com", true },
{ "kiadoapartman.hu", true },
{ "kiahoriane.com", true },
{ "kiano.net", true },
{ "kiapartscenter.net", true },
{ "kiapartsdepartment.com", true },
- { "kiapps.ovh", true },
{ "kibea.net", true },
{ "kibibit.net", true },
{ "kibriscicek.net", true },
@@ -19581,12 +20295,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kidsneversleep.com", true },
{ "kidsplay-plymouth.co.uk", true },
{ "kidsplaybouncycastles.co.uk", true },
- { "kidswallstickers.com.au", true },
{ "kidtoyshop.ru", true },
{ "kidzpartiesllp.co.uk", true },
{ "kidzsmile.co.uk", true },
{ "kiebel.de", true },
{ "kiedys.net", true },
+ { "kiehls.pt", true },
{ "kiekin.org", true },
{ "kiekko.pro", true },
{ "kiel-kind.de", true },
@@ -19599,11 +20313,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kiki-voice.jp", true },
{ "kikimilyatacado.com.br", true },
{ "kiku.pw", true },
- { "kilerd.me", true },
{ "kilianvalkhof.com", true },
{ "killaraapartments.com.au", true },
{ "killdeer.com", true },
+ { "killedbynlp.com", true },
{ "killerit.in", true },
+ { "killerkink.net", true },
{ "killerrobots.com", true },
{ "killymoonbouncycastles.com", true },
{ "kilobyte22.de", true },
@@ -19617,6 +20332,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kimmel.com", true },
{ "kimmel.in", true },
{ "kimo.se", true },
+ { "kimono-rental-one.com", true },
{ "kimotodental.com", true },
{ "kimsufi-jordi.tk", true },
{ "kimtran.kim", true },
@@ -19628,12 +20344,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kinderkleding.news", true },
{ "kindertagespflege-rasselbande-halle.de", true },
{ "kinderzahn-bogenhausen.de", true },
+ { "kindfotografie.nl", true },
{ "kindleworth.com", true },
{ "kindlezs.com", true },
{ "kine-duthil.fr", true },
{ "kinepolis-studio.be", true },
{ "kinetiq.com", true },
- { "king-henris-castles.co.uk", true },
{ "king-of-the-castles.com", true },
{ "kingant.net", true },
{ "kinganywhere.eu", true },
@@ -19645,16 +20361,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kingofthecastlesentertainments.co.uk", true },
{ "kingofthecastlesouthwales.co.uk", true },
{ "kingofthecastlesrhyl.co.uk", true },
+ { "kingsfoot.com", true },
+ { "kingsgateseptic.com", true },
+ { "kingsley.cc", true },
{ "kingstclinic.com", true },
{ "kingtecservices.com", true },
{ "kini24.ru", true },
+ { "kinkcafe.net", true },
{ "kinkenonline.com", true },
- { "kinnettmemorial.org", true },
{ "kinnikinnick.com", true },
{ "kinniyaonlus.com", true },
{ "kinocheck.de", true },
{ "kinohled.cz", true },
- { "kinomoto.me", true },
{ "kinomoto.ovh", false },
{ "kinos.nl", true },
{ "kinozal-tv.appspot.com", true },
@@ -19669,15 +20387,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kippenbart.gq", true },
{ "kipriakipita.gr", true },
{ "kiragameforum.net", true },
+ { "kirainmoe.com", true },
{ "kiraku.co", true },
{ "kirbear.com", true },
{ "kirche-dortmund-ost.de", true },
- { "kirchen-im-web.de", true },
+ { "kirchen-im-web.de", false },
{ "kirchengemeinde-markt-erlbach.de", true },
- { "kircp.com", true },
{ "kirei.se", true },
{ "kirig.ph", true },
{ "kirikira.moe", true },
+ { "kirill.ws", true },
{ "kirillaristov.com", true },
{ "kirillpokrovsky.de", true },
{ "kirinas.com", true },
@@ -19691,16 +20410,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kirschbaum.me", true },
{ "kirslis.com", true },
{ "kirstenbos.ca", true },
+ { "kirstin-peters.de", true },
{ "kirwandigital.com", true },
{ "kis-toitoidixi.de", true },
{ "kisallatorvos.hu", true },
+ { "kisalt.im", true },
{ "kisiselveri.com", true },
+ { "kiskeedeesailing.com", true },
{ "kisma.de", true },
- { "kissesb.com", true },
- { "kissesb.net", true },
{ "kissflow.com", true },
{ "kissgyms.com", true },
{ "kissmycreative.com", true },
+ { "kissoft.ro", true },
{ "kisstube.tv", true },
{ "kitabnamabayi.com", true },
{ "kitbag.com.au", true },
@@ -19711,7 +20432,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kitchenalley.com", true },
{ "kitchenpunx.com", false },
{ "kiteadventure.nl", true },
- { "kitegarage.eu", true },
{ "kiteschooledam.nl", true },
{ "kiteschoolijmuiden.nl", true },
{ "kiteschoolkatwijk.nl", true },
@@ -19728,6 +20448,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kiwi.digital", true },
{ "kiwi.wiki", true },
{ "kiwico.com", true },
+ { "kix.moe", false },
{ "kiyotatsu.com", true },
{ "kj-prince.com", true },
{ "kj1396.net", true },
@@ -19737,19 +20458,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kjarrval.is", true },
{ "kjchernov.info", true },
{ "kjellner.com", true },
- { "kjellvn.net", true },
+ { "kjelltitulaer.com", true },
{ "kjg-ummeln.de", true },
{ "kk-neudorf-duissern.de", false },
{ "kkaefer.com", true },
{ "kki.org", true },
{ "kkovacs.eu", true },
{ "kkr-bridal.net", true },
+ { "kkren.me", true },
{ "kks-karlstadt.de", true },
{ "kksg.com", true },
{ "kkws.co", true },
{ "kkyy.me", true },
{ "kkzxak47.com", true },
{ "kl-diaetist.dk", true },
+ { "klaasmeijerbodems.nl", true },
{ "klaim.us", true },
{ "klamathrestoration.gov", true },
{ "klanggut.at", true },
@@ -19757,7 +20480,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "klarika.com", true },
{ "klarmobil-empfehlen.de", true },
{ "klasfauseweh.de", true },
- { "klausbrinch.dk", true },
+ { "klausbrinch.dk", false },
{ "klausen.dk", true },
{ "klaver.it", true },
{ "klaw.xyz", true },
@@ -19767,15 +20490,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kleaning.by", true },
{ "klebeband.eu", true },
{ "klebetape.de", true },
+ { "kleding.website", true },
{ "kledingrekken.nl", false },
{ "kleim.fr", true },
{ "kleinblogje.nl", false },
{ "kleine-dingen.nl", true },
+ { "kleine-strandburg-heringsdorf.de", true },
+ { "kleine-strandburg.com", true },
{ "kleine-strolche-lich.de", true },
{ "kleineanfragen.de", true },
+ { "kleinestrandburg-heringsdorf.de", true },
+ { "kleinestrandburg-usedom.de", true },
+ { "kleineviecherei.de", true },
{ "kleinfein.co", true },
{ "kleinreich.de", true },
{ "kleinsys.com", true },
+ { "kleintransporte.net", true },
{ "kleteckova.cz", true },
{ "klicke-gemeinsames.de", true },
{ "klickstdu.com", true },
@@ -19786,6 +20516,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "klimapartner.net", true },
{ "klimchuk.by", true },
{ "klimchuk.com", true },
+ { "klingenundmesser.com", true },
{ "klinikac.co.id", false },
{ "klinkenberg.ws", true },
{ "klinkerstreet.com.ua", false },
@@ -19794,19 +20525,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "klocker-ausserlechner.com", true },
{ "klocksnack.se", true },
{ "kloia.com", true },
+ { "klose.family", true },
{ "klosko.net", true },
{ "klotz-labs.com", true },
{ "kloudboy.com", true },
{ "kls-agency.com.ua", false },
{ "klseet.com", true },
{ "klssn.com", true },
+ { "klubxanadu.cz", true },
{ "kluck.me", true },
{ "klugemedia.de", true },
{ "klustekeningen.nl", true },
{ "klustermedia.com", true },
{ "klva.cz", true },
{ "klzwzhi.com", true },
- { "km-net.pl", true },
{ "kmashworth.co.uk", true },
{ "kmkz.jp", true },
{ "kmsci.com.ph", true },
@@ -19826,6 +20558,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kngk-group.ru", true },
{ "kngk-transavto.ru", true },
{ "kngk.org", true },
+ { "kngkng.com", true },
{ "kniga.market", false },
{ "knight-industries.org", true },
{ "knightsblog.de", true },
@@ -19834,7 +20567,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "knip.ch", true },
{ "knispel-online.de", true },
{ "knitfarious.com", true },
- { "kniwweler.com", true },
{ "knmv.nl", true },
{ "knockendarroch.co.uk", true },
{ "knop.info", true },
@@ -19843,7 +20575,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "knowledgeforce.com", true },
{ "knowlevillagecc.co.uk", true },
{ "knthost.com", true },
- { "knuckles.tk", true },
{ "knurps.de", true },
{ "knutur.is", true },
{ "knygos.lt", true },
@@ -19855,9 +20586,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kobezda.net", true },
{ "kobofarm.com", true },
{ "koboldcraft.ch", true },
+ { "kobolya.hu", true },
{ "kocherev.org", true },
{ "kochereva.com", true },
{ "kochhar.net", true },
+ { "kochinke.com", true },
+ { "kochinke.us", true },
{ "kockanakocko.si", true },
{ "kodak-ism.com", true },
{ "kodden.com.br", true },
@@ -19872,8 +20606,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "koelnmafia.de", true },
{ "koenen-bau.de", true },
{ "koenigsbrunner-tafel.de", true },
- { "koenleemans.nl", false },
- { "koerper-wie-seele.de", false },
+ { "koenleemans.nl", true },
+ { "koenrouwhorst.nl", true },
+ { "koenzk.nl", true },
{ "koerperkult.ch", true },
{ "koertner-muth.com", true },
{ "koertner-muth.de", true },
@@ -19881,6 +20616,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "koetjesenkanker.nl", true },
{ "kofler.info", true },
{ "kogak.ninja", true },
+ { "kogax.com", true },
{ "kogcoder.com", true },
{ "kogi.fr", true },
{ "kogro.de", true },
@@ -19889,7 +20625,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kohlchan.net", true },
{ "kohlistkool.tk", true },
{ "koho.fi", true },
+ { "kohoutsautomotive.com", true },
{ "kohsandra.com", true },
+ { "kohu.nz", true },
{ "koi-lexikon.de", true },
{ "koi-sama.net", true },
{ "koicenter-thuine.de", true },
@@ -19898,12 +20636,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "koka-shop.de", true },
{ "kokensupport.com", true },
{ "koketteriet.se", true },
- { "kokoiroworks.com", true },
+ { "kokobaba.com", true },
{ "kokona.ch", true },
{ "kokumoto.com", true },
{ "kolania.com", true },
{ "kolania.de", true },
{ "kolania.net", true },
+ { "kolaykaydet.com", true },
{ "kolbeinsson.se", true },
{ "kolcsey.eu", true },
{ "koldanews.com", true },
@@ -19928,7 +20667,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "komidoc.com", true },
{ "komiksbaza.pl", true },
{ "kominfo.go.id", true },
- { "kominfo.net", true },
+ { "kominfo.net", false },
{ "kominki-sauny.pl", true },
{ "komintek.ru", true },
{ "komischkeszeug.de", true },
@@ -19937,14 +20676,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "komoju.com", true },
{ "komok.co.uk", true },
{ "kompetenzkurs.de", true },
+ { "komplet.sk", true },
{ "kon-sil.de", true },
{ "kondi.net", true },
{ "kondou-butsudan.com", true },
{ "kongar.org", true },
{ "koniecfica.sk", true },
{ "konijntjes.nl", true },
- { "konings.it", true },
+ { "koningskwartiertje.nl", true },
{ "koninkrijk.net", true },
+ { "konkai.store", true },
{ "konklone.com", true },
{ "konoe.studio", true },
{ "konosuke.jp", true },
@@ -19954,12 +20695,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "konst.se", true },
{ "kontaxis.org", true },
{ "kontorhaus-schlachte.de", true },
+ { "kontorhaus-stralsund.de", true },
+ { "kontrolapovinnosti.cz", true },
{ "konventa.net", true },
{ "konyalian.com", true },
{ "konzertheld.de", true },
{ "koodaklife.com", true },
{ "koodimasin.ee", true },
{ "koodimasin.eu", true },
+ { "kooer.org", true },
{ "kooli.ee", true },
{ "koolikatsed.ee", true },
{ "koolitee.ee", true },
@@ -19969,16 +20713,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "koozal.de", true },
{ "kopfkrieg.org", true },
{ "kopfundseele.de", true },
- { "kopio.jp", true },
{ "kopjethee.nl", true },
+ { "koplancpa.com", true },
{ "koptev.ru", true },
{ "kopteva.ru", true },
{ "korben.info", true },
{ "korea.dating", true },
{ "koreaboo.com", true },
{ "koretech.nl", true },
+ { "korinar.com", true },
+ { "kornrunner.net", true },
{ "korobi.io", true },
{ "korobkovsky.ru", true },
+ { "koroknaimedical.hu", true },
{ "korono.de", true },
{ "korosiprogram.hu", true },
{ "korp.fr", true },
@@ -19991,7 +20738,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kosho.org", true },
{ "kosonaudioteca.com", true },
{ "kost-magazin.de", true },
- { "kostal.com", true },
+ { "kostal.com", false },
{ "kostecki.com", true },
{ "kostecki.org", true },
{ "kostecki.tel", true },
@@ -20006,10 +20753,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kotois.com", true },
{ "kotonoha.cafe", true },
{ "kotori.love", true },
- { "kottur.is", true },
{ "kouki-food.com", true },
+ { "koumakan.cc", true },
{ "koumuwin.com", true },
{ "koushinjo.org", true },
+ { "kouten-jp.com", true },
{ "kov.space", true },
{ "koval.io", true },
{ "kovaldo.ru", true },
@@ -20032,19 +20780,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kplnet.net", true },
{ "kpmgpublications.ie", true },
{ "kpop.re", true },
+ { "kpopsource.com", false },
{ "kpumuk.info", true },
{ "kpx1.de", true },
{ "kr.search.yahoo.com", false },
{ "kr0n.dk", true },
{ "krachtinverbinding.nl", true },
- { "kradalby.no", true },
{ "kraft.blog", true },
{ "kraft.im", true },
{ "kraftfleisch.de", true },
{ "kraftzeiten.de", true },
{ "krag.be", true },
{ "kraga.sk", true },
- { "kraiwan.com", true },
{ "kraiwon.com", true },
{ "kraken.io", true },
{ "kraken.site", true },
@@ -20053,20 +20800,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kralovstvimap.cz", true },
{ "kram.nz", true },
{ "krambeutel.de", true },
- { "krampus-fischamend.at", true },
{ "kramsj.uk", true },
{ "krang.org.uk", true },
{ "krankenpflege-haushaltshilfe.de", true },
{ "krasnodar-avia.ru", true },
{ "krasovsky.me", true },
- { "krausoft.hu", true },
{ "krautomat.com", true },
- { "kraynik.com", true },
{ "krazyboi.com", true },
{ "krazykastles.co.uk", true },
{ "krazykoolkastles.com", true },
{ "krazyphotobooths.co.uk", true },
- { "krc.link", true },
{ "kreationnext.com", true },
{ "kreativelabs.ch", true },
{ "kreativstrecke.de", true },
@@ -20083,11 +20826,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "krfuli.com", true },
{ "kriechel.de", true },
{ "krinetzki.de", true },
+ { "kringloopwinkelsteenwijk.nl", true },
{ "kriptosec.com", true },
{ "kris.click", true },
{ "krise-chance.ch", true },
+ { "krisftp.fr", true },
{ "krishnenduayur.org", true },
{ "krishofer.com", true },
+ { "krishouse.fr", true },
{ "krislamoureux.com", true },
{ "krismurray.co.uk", true },
{ "krisstarkey.co.uk", true },
@@ -20100,12 +20846,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "krizevci.info", true },
{ "krmeni.cz", false },
{ "krokedil.se", true },
- { "krokodent.de", true },
{ "kromamoveis.com.br", true },
- { "kromonos.net", true },
{ "kronaw.it", true },
{ "krony.de", true },
- { "kronych.cz", true },
{ "kroon.email", true },
{ "kropkait.pl", true },
{ "kroy.io", true },
@@ -20115,14 +20858,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kruin.net", true },
{ "kruisselbrink.com", true },
{ "kruk.co", true },
+ { "krukhmer.com", true },
{ "krumberconsulting.com", true },
{ "krupa.net.pl", false },
+ { "krusesec.com", true },
{ "krutka.cz", true },
{ "kruu.de", true },
{ "kruzhki-s-kartinkami.ru", true },
{ "kry.no", true },
{ "kry.se", true },
{ "kryglik.com", true },
+ { "krypmonet.com", true },
{ "krypsys.com", true },
{ "krypt.com", true },
{ "kryptera.se", true },
@@ -20135,6 +20881,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kselenia.ee", true },
{ "ksero.center", true },
{ "ksero.wroclaw.pl", true },
+ { "ksham.net", true },
{ "kshlm.in", true },
{ "kspg.tv", true },
{ "kssk.de", true },
@@ -20144,8 +20891,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kthnxbai.xyz", true },
{ "kts-thueringen.de", true },
{ "ktsee.eu.org", true },
+ { "ktsofas.gr", true },
{ "ktw.lv", true },
{ "ku-7.club", true },
+ { "ku.io", false },
+ { "kuaitiyu.org", true },
{ "kualiti.net", true },
{ "kualo.co.uk", true },
{ "kualo.com", true },
@@ -20157,17 +20907,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kubik-rubik.de", false },
{ "kubkprf.ru", true },
{ "kublis.ch", true },
+ { "kuchen-am-stiel.de", true },
+ { "kuchenfeelisa.de", true },
{ "kuchentraum.eu", true },
{ "kucnibudzet.com", true },
{ "kucukayvaz.com", true },
{ "kudo.co.id", true },
- { "kueche-co.de", true },
+ { "kueche-co.de", false },
{ "kuechenprofi-group.de", false },
{ "kuehndel.org", true },
{ "kuehnel-bs.de", true },
{ "kuehnel-online.eu", true },
+ { "kuehnel.org", false },
{ "kuemmerlin.eu", true },
{ "kuemmling.eu", true },
+ { "kugelblitz.co", true },
{ "kuhn-elektrotechnik.de", true },
{ "kuhne-electronic.de", true },
{ "kuhnelautorepair.com", true },
@@ -20182,8 +20936,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kumachan.biz", true },
{ "kumalog.com", true },
{ "kumasanda.jp", true },
- { "kundo.se", true },
+ { "kumilasvegas.com", true },
{ "kungerkueken.de", true },
+ { "kunra.de", true },
{ "kunstdrucke-textildruck.de", true },
{ "kunstfehler.at", true },
{ "kunstundunrat.de", true },
@@ -20203,7 +20958,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kuroinu.jp", true },
{ "kurona.ga", true },
{ "kuronekogaro.com", true },
- { "kurrende.nrw", true },
+ { "kurrende.nrw", false },
{ "kurrietv.nl", true },
{ "kurschies.de", true },
{ "kurserne.dk", true },
@@ -20221,9 +20976,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kutukupret.com", true },
{ "kutus.ee", true },
{ "kuzbass-pwl.ru", true },
- { "kuzdrowiu24.pl", true },
{ "kvadratnimeter.si", true },
{ "kvalita-1a.cz", true },
+ { "kvalitetsaktiepodden.se", true },
{ "kvalitnitesneni.cz", true },
{ "kvantel.no", true },
{ "kvcc.com.au", true },
@@ -20238,13 +20993,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kwcolville.com", true },
{ "kwedo.com", true },
{ "kwench.com", true },
- { "kwiknews.com", true },
- { "kwmr.me", true },
{ "kwok.cc", true },
+ { "kwoll.de", true },
{ "kwyxz.org", true },
{ "kx197.com", true },
{ "kxah35.com", true },
+ { "kxline.com", true },
{ "kxnrl.com", false },
+ { "kxway.com", true },
{ "kybi.sk", true },
{ "kydara.com", true },
{ "kyledrake.net", true },
@@ -20252,9 +21008,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kylejohnson.io", true },
{ "kylelaker.com", true },
{ "kylescastles.co.uk", true },
- { "kyliehunt.com", true },
{ "kylinj.com", false },
- { "kynaston.org.uk", true },
{ "kynastonwedding.co.uk", true },
{ "kyobostory-events.com", true },
{ "kyoko.org", true },
@@ -20262,6 +21016,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "kyoto-mic.com", true },
{ "kyoto-sake.net", true },
{ "kyoto-tomikawa.jp", true },
+ { "kyoto-tomoshibi.jp", true },
{ "kyprexxo.com", true },
{ "kyras-castles.co.uk", true },
{ "kyunyuki.com", true },
@@ -20277,22 +21032,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "l4n-clan.de", true },
{ "l7plumbing.com.au", true },
{ "l7world.com", true },
- { "l9.fr", true },
+ { "l9.fr", false },
{ "la-baldosa.fr", true },
- { "la-cave-a-nodo.fr", false },
+ { "la-cave-a-nodo.fr", true },
{ "la-compagnie-des-elfes.fr", true },
+ { "la-fenice-neheim.de", true },
{ "la-ganiere.com", true },
{ "la-kaz-a-velo.fr", true },
{ "la-maison.ch", true },
{ "la-maison.eu", true },
{ "la-petite-entreprise.com", true },
{ "la-tourmaline.ch", true },
+ { "laac.io", true },
{ "laassari.me", false },
{ "laatikko.io", true },
{ "laatjeniethackmaken.nl", true },
- { "laballoons.com", true },
{ "labande-annonce.fr", true },
+ { "labcenter.com", true },
{ "labcoat.jp", true },
+ { "labms.com.au", true },
{ "labobooks.com", true },
{ "laboitebio-logique.ca", true },
{ "labortogether.com", true },
@@ -20314,6 +21072,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lachlan-harris.com", true },
{ "lachlan.com", true },
{ "lachosetypo.com", true },
+ { "lachyoga-schwieberdingen.de", true },
{ "lacicloud.net", true },
{ "lacigf.org", true },
{ "laclaque.ch", true },
@@ -20332,12 +21091,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ladraiglaan.com", true },
{ "lady-2.jp", true },
{ "ladyanna.de", true },
- { "ladybugjam.com", true },
+ { "ladyofhopeparish.org", true },
{ "laeso.es", true },
{ "laextra.mx", true },
{ "lafayette-rushford.com", true },
{ "lafcheta.info", true },
- { "lafeemam.fr", true },
{ "lafema.de", true },
{ "lafillepolyvalente.ca", true },
{ "lafillepolyvalente.com", true },
@@ -20350,17 +21108,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lagazzettadigitale.it", true },
{ "lagerauftrag.info", true },
{ "lagit.in", true },
- { "laglab.org", false },
- { "lagodny.eu", true },
{ "lagout.org", true },
{ "lagriffeduservice.fr", true },
{ "laguiadelvaron.com", true },
{ "laguinguette.fr", true },
+ { "lagunacoastrealestate.com", true },
{ "lahipotesisgaia.com", true },
{ "lahnau-akustik.de", true },
{ "lahora.com.ec", true },
{ "lai.is", true },
{ "lain.at", true },
+ { "lain.li", true },
{ "laindonleisure.co.uk", true },
{ "lak-berlin.de", true },
{ "lakarwebb.se", true },
@@ -20381,6 +21139,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lakesherwoodlighting.com", true },
{ "lakesherwoodoutdoorlighting.com", true },
{ "lakeshowlife.com", true },
+ { "lakewoodcityglass.com", true },
{ "lakhesis.net", true },
{ "lakonia.com.br", true },
{ "lalalab.com", true },
@@ -20402,6 +21161,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lamiaposta.email", false },
{ "lamikvah.org", true },
{ "laminine.info", true },
+ { "lamontre.ru", true },
{ "lamp.re", false },
{ "lamp24.se", true },
{ "lampade.it", true },
@@ -20415,7 +21175,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lampenwelt.de", true },
{ "lampposthomeschool.com", true },
{ "lampy.pl", true },
+ { "lamunyon.com", true },
{ "lan.biz.tr", true },
+ { "lana.swedbank.se", true },
{ "lanahallen.com", true },
{ "lanbroa.eu", true },
{ "lancashirecca.org.uk", true },
@@ -20454,6 +21216,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lanetix.com", true },
{ "lanforalla.se", true },
{ "lang-php.com", true },
+ { "langatang.com", true },
{ "langbein.org", true },
{ "langguth.io", true },
{ "langkahteduh.com", true },
@@ -20464,6 +21227,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "languageterminal.com", true },
{ "langworth.com", true },
{ "langzijn.nl", true },
+ { "lanhhuyet510.tk", true },
{ "lanna.io", true },
{ "lannainnovation.com", true },
{ "lanodan.eu", true },
@@ -20484,7 +21248,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "laozhu.me", true },
{ "laparoscopia.com.mx", true },
{ "lapassiondutrading.com", true },
- { "laperfumista.es", true },
+ { "lapicena.eu", true },
{ "lapidge.net", true },
{ "lapix.com.co", true },
{ "laplacesicherheit.de", true },
@@ -20508,7 +21272,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "largeviewer.com", true },
{ "lariposte.org", true },
{ "lariscus.eu", true },
- { "larky.top", true },
{ "larondinedisinfestazione.com", true },
{ "larptreff.de", true },
{ "larraz.es", true },
@@ -20522,8 +21285,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "larsklene.nl", true },
{ "larsklint.com", true },
{ "laruga.co.uk", true },
+ { "larvatoken.org", true },
{ "lasalle.wa.edu.au", true },
{ "lasarmas.com", true },
+ { "lasavonnerieducroisic.fr", true },
{ "lascana.co.uk", true },
{ "lasereyess.net", true },
{ "laserfuchs.de", true },
@@ -20534,6 +21299,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lask.in", true },
{ "laskas.pl", true },
{ "laspequenassemillas.com", true },
+ { "lasrecetascocina.com", true },
{ "lasrecetasdeguada.com", true },
{ "lasse-it.dk", true },
{ "lasseleegaard.com", true },
@@ -20543,7 +21309,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lassesworld.com", true },
{ "lassesworld.se", true },
{ "lastchancetraveler.com", true },
- { "lastharo.com", true },
{ "lastpass.com", false },
{ "lastrada-minden.de", true },
{ "lastweekinaws.com", true },
@@ -20573,7 +21338,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lattyware.com", true },
{ "laubacher.io", true },
{ "lauchundei.at", true },
- { "lauensteiner.de", true },
+ { "lauensteiner.de", false },
{ "laufers.pl", true },
{ "laufpix.de", true },
{ "lauftreff-himmelgeist.de", true },
@@ -20581,6 +21346,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "laukstein.com", true },
{ "launayflorian.net", true },
{ "launchkey.com", false },
+ { "launchmylifend.com", true },
{ "launchpad-app2.com", true },
{ "launchpadder2.com", true },
{ "lauraandwill.wedding", false },
@@ -20599,6 +21365,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "laut.digital", true },
{ "lauxzahnheilkunde.de", true },
{ "lauzon-hitter.com", true },
+ { "lavabit.no", true },
{ "lavalite.de", true },
{ "lavamob.com", true },
{ "lavanderia.roma.it", true },
@@ -20609,28 +21376,34 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lavinya.net", true },
{ "lavishlooksstudio.com.au", true },
{ "lavita.de", true },
+ { "lavitaura.com", true },
{ "lavitrine-une-collection.be", true },
{ "lavoieducoeur.be", true },
{ "lavoiepharmd.com", true },
{ "lavolte.net", true },
- { "lavval.com", false },
+ { "law-colleges.com", true },
{ "law-peters.de", true },
{ "law.co.il", true },
+ { "law22.com", true },
+ { "lawbirduk.com", true },
{ "lawformt.com", true },
{ "lawn-seeds.com", true },
{ "lawnuk.com", true },
{ "lawrenceberg.nl", true },
{ "lawrencemurgatroyd.com", true },
+ { "lawrencewhiteside.com", true },
+ { "lawyerdigital.co.bw", true },
{ "lawyerkf.com", true },
- { "layfully.me", true },
- { "laylo.io", true },
- { "laylo.nl", true },
+ { "laylo.io", false },
+ { "laylo.nl", false },
{ "layoutsatzunddruck.de", true },
+ { "lazistance.com", true },
{ "lazowik.pl", true },
{ "lazurit.com", true },
{ "lazyboston.com", true },
{ "lazyclock.com", true },
{ "lazyframe.com", true },
+ { "lazyhelp.com", true },
{ "lazytux.org", true },
{ "lb-toner.de", true },
{ "lbayer.com", true },
@@ -20638,6 +21411,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lbgconsultores.com", true },
{ "lbihrhelpdesk.com", true },
{ "lbls.me", true },
+ { "lbmblaasmuziek.nl", true },
{ "lbphacker.pw", true },
{ "lbs-logics.com", true },
{ "lbsi-nordwest.de", true },
@@ -20646,6 +21420,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lc-promiss.de", true },
{ "lca-pv.de", true },
{ "lca.gov", true },
+ { "lcacommons.gov", true },
{ "lcars-sv.info", true },
{ "lcbizsolutions.com", true },
{ "lce-events.com", true },
@@ -20678,19 +21453,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "leadbox.cz", true },
{ "leaderoftheresistance.com", false },
{ "leaderoftheresistance.net", false },
- { "leadinfo.com", true },
{ "leadingsalons.com", true },
{ "leadquest.nl", true },
- { "leadstart.org", true },
{ "leafandseed.co.uk", true },
- { "leafans.tk", true },
+ { "leafans.tk", false },
{ "leafinote.com", true },
{ "leakforums.net", true },
{ "leamsigc.com", true },
{ "leandre.cn", true },
+ { "leankit.com", true },
{ "leanplando.com", true },
{ "leap-it.be", true },
{ "leapandjump.co.uk", true },
+ { "learn-smart.uk", true },
+ { "learndev.info", true },
{ "learnflakes.net", true },
{ "learnforestry.com", true },
{ "learning-id.com", true },
@@ -20699,10 +21475,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "learningman.top", true },
{ "learnpianogreece.com", true },
{ "learnplayground.com", true },
- { "learntale.com", true },
{ "learntube.cz", true },
{ "leaseit24.com", true },
{ "leaseit24.de", true },
+ { "leaseplan.com", true },
{ "leasit.at", true },
{ "leasit.de", true },
{ "leastsignificantbit.de", true },
@@ -20724,6 +21500,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "leclaire.com.br", true },
{ "lecoinchocolat.com", true },
{ "lectricecorrectrice.com", true },
+ { "led-jihlava.cz", true },
{ "led.xyz", true },
{ "ledecologie.com.br", true },
{ "ledeguisement.com", true },
@@ -20732,13 +21509,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ledscontato.com.br", true },
{ "ledzom.ru", false },
{ "lee-fuller.co.uk", true },
- { "leebiblestudycenter.co.uk", true },
- { "leebiblestudycenter.com", true },
+ { "leeaaronsrealestate.com", true },
{ "leebiblestudycentre.co.uk", true },
- { "leebiblestudycentre.com", true },
{ "leech360.com", false },
{ "leeclemens.net", false },
{ "leedev.org", true },
+ { "leekspin.ml", true },
{ "leelaylay.com", true },
{ "leere.me", true },
{ "leerliga.de", true },
@@ -20760,9 +21536,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "legacy.bank", true },
{ "legadental.com", true },
{ "legaillart.fr", true },
+ { "legaldesk.com", true },
{ "legalinmotion.es", true },
{ "legalrobot.com", true },
- { "legatofmrc.fr", true },
{ "legendarycamera.com", true },
{ "legendesdechine.ch", true },
{ "legendofkrystal.com", true },
@@ -20782,28 +21558,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lehti-tarjous.net", true },
{ "leibniz-remscheid.de", false },
{ "leideninternationalreview.com", true },
- { "leigh.life", true },
{ "leilautourdumon.de", true },
{ "leilonorte.com", true },
- { "leiming.co", true },
{ "leinfelder.in", true },
{ "leipzig.photo", true },
{ "leipziger-triathlon.de", true },
{ "leisure-blog.com", true },
{ "leisure-supplies-show.co.uk", true },
- { "leition.com", true },
- { "leitionusercontent.com", true },
{ "leiyun.me", true },
{ "lejardindesmesanges.fr", true },
+ { "lektier.cf", true },
{ "lel.ovh", true },
{ "lelambiental.com.br", true },
{ "lemarcheelagrandeguerra.it", true },
+ { "lemazol.fr", true },
{ "lemni.top", true },
{ "lemoine.at", true },
{ "lemondenumerique.com", true },
{ "lemondrops.xyz", true },
{ "lemonop.com", true },
{ "lemonparty.co", true },
+ { "lemonrockbiketours.com", true },
{ "lemonthy.ca", true },
{ "lemonthy.com", true },
{ "lemouillour.fr", true },
@@ -20818,6 +21593,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lenidh.de", true },
{ "leninalbertop.com.ve", true },
{ "lennyobez.be", true },
+ { "lenou.nl", true },
{ "lenr-forum.com", true },
{ "lensdoctor.com", true },
{ "lenspirations.com", true },
@@ -20827,7 +21603,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lenyip.me", true },
{ "lenyip.works", true },
{ "leoandpeto.com", true },
- { "leochedibracchio.com", true },
{ "leodaniels.com", true },
{ "leodraxler.at", true },
{ "leola.cz", true },
@@ -20840,8 +21615,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "leonax.net", true },
{ "leonbuitendam.nl", true },
{ "leondenard.com", true },
- { "leonhooijer.nl", false },
{ "leonklingele.de", true },
+ { "leontiekoetter.de", true },
+ { "leopoldina.net", true },
{ "leowkahman.com", true },
{ "lep.gov", true },
{ "lepenetapeti.com", true },
@@ -20868,10 +21644,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lesarts.com", true },
{ "lesberger.ch", true },
{ "lesconteursavis.org", true },
+ { "lescourtiersbordelais.com", true },
{ "leseditionsbraquage.com", true },
{ "lesfilmsavivre.com", true },
{ "lesgoodnews.fr", true },
- { "lesharris.com", true },
{ "leshervelines.com", true },
{ "lesjardinsdemathieu.net", true },
{ "lesmamy.ch", true },
@@ -20888,7 +21664,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lesterrassesdusoleil.ch", true },
{ "lesyndicat.info", true },
{ "letemps.ch", true },
+ { "leto12.xyz", true },
{ "letraba.com", true },
+ { "letranif.net", true },
{ "lets-bounce.com", true },
{ "lets-go-acoustic.de", true },
{ "lets-ktai.jp", true },
@@ -20902,12 +21680,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "letsgowhilewereyoung.com", true },
{ "letskick.ru", true },
{ "letspartyrugby.co.uk", true },
+ { "letssackcancer.org", true },
{ "letstalkcounseling.com", true },
{ "letterbox-online.de", true },
{ "letterdance.de", true },
{ "letteringinstitute.com", true },
{ "lettersblogatory.com", true },
{ "lettori.club", true },
+ { "letzchange.org", true },
{ "leuenhagen.com", true },
{ "leulu.com", true },
{ "leumi-how-to.co.il", true },
@@ -20916,18 +21696,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "levanscatering.com", true },
{ "levelaccordingly.com", true },
{ "levelcheat.com", true },
+ { "leveluplv.com", true },
{ "leveluprails.com", true },
{ "levendwater.org", true },
{ "levensbron.nl", true },
{ "leverj.io", true },
{ "levermann.eu", true },
{ "leviaan.nl", true },
+ { "levineteamestates.com", true },
{ "levinus.de", true },
{ "leviscop.com", true },
{ "leviscop.de", true },
{ "lew.im", true },
{ "lewdawson.com", true },
{ "lewis.li", true },
+ { "lewiscollard.com", true },
{ "lewisdatasecurity.com", true },
{ "lewislaw.com", true },
{ "lewisllewellyn.me", true },
@@ -20943,25 +21726,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lfgss.com", true },
{ "lfklzw.com", true },
{ "lfrconseil.com", true },
+ { "lfullerdesign.com", true },
+ { "lg-waps.go.jp", true },
+ { "lg-waps.jp", true },
+ { "lg0.site", true },
+ { "lgbt-colleges.com", true },
{ "lgbt.io", true },
{ "lgbt.ventures", true },
- { "lgbtventures.com", true },
{ "lghfinancialstrategy.ch", true },
{ "lgpecasoriginais.com.br", true },
{ "lhajn.cz", true },
{ "lhakustik.se", true },
{ "lhalbert.xyz", true },
+ { "lhamaths.online", true },
{ "lhconsult.tk", false },
+ { "lhgavarain.com", true },
{ "lhost.su", true },
{ "li-ke.co.jp", true },
{ "li.search.yahoo.com", false },
{ "liam-w.io", true },
{ "liamelliott.me", true },
{ "liamlin.me", true },
- { "lian-in.com", true },
- { "lian-in.net", true },
- { "liang-li88.com", true },
- { "liang-li88.net", true },
{ "lianye1.cc", true },
{ "lianye2.cc", true },
{ "lianye3.cc", true },
@@ -20983,13 +21768,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "librarytools.com", false },
{ "librazy.org", true },
{ "libre-service.de", true },
+ { "libre.cr", true },
{ "libre.university", true },
{ "libreboot.org", true },
{ "librebox.de", true },
{ "libreduca.com", true },
{ "librelamp.com", true },
{ "libremail.nl", true },
- { "librends.org", true },
{ "libreoffice-from-collabora.com", true },
{ "libreofficefromcollabora.com", true },
{ "librervac.org", true },
@@ -21017,36 +21802,43 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lidogr.com", true },
{ "lidong.me", true },
{ "lidow.eu", true },
+ { "lidtkemotors.com", true },
{ "liduan.net", false },
+ { "lie.as", true },
{ "liebel.org", true },
{ "lieberwirth.biz", true },
{ "lieblingsholz.de", true },
+ { "liemen.net", true },
+ { "lierrmm.space", true },
{ "lieuu.com", true },
{ "lifanov.com", true },
{ "life-emotions.pt", true },
{ "lifeartstudios.net", true },
{ "lifebetweenlives.com.au", true },
+ { "lifeboxhealthcare.co.uk", true },
{ "lifecism.com", true },
+ { "lifeenrichmentnc.com", true },
{ "lifegrip.com.au", true },
{ "lifeinhex.com", true },
{ "lifeinsurancepro.org", true },
{ "lifekiss.ru", true },
+ { "lifelenz.com", true },
{ "lifematenutrition.com", true },
{ "lifemstyle.com", true },
{ "lifeqa.net", true },
- { "lifequotes-uk.co.uk", true },
+ { "lifereset.it", true },
{ "lifesafety.com.br", true },
{ "lifestyle7788.com", true },
{ "lifestylefinancial.ca", true },
+ { "lifetree.network", true },
{ "lifi.digital", true },
{ "lifi.is", true },
{ "liftie.info", true },
+ { "ligadosgames.com", true },
{ "light-up.xyz", true },
{ "light.mail.ru", true },
{ "lightbox.co", true },
- { "lightdark.xyz", true },
{ "lightdream.tech", true },
- { "lighthouseinstruments.com", true },
{ "lighting-centres.co.uk", true },
{ "lightingagoura.com", true },
{ "lightingagourahills.com", true },
@@ -21073,14 +21865,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lightsheep.no", true },
{ "lightspeed.com", false },
{ "lightspeedta.co", true },
- { "lighttp.com", true },
{ "lightupcollective.co.uk", true },
{ "lignite.com", true },
{ "lignoma.com", true },
{ "ligonier.com", true },
{ "lihaul.dnsalias.net", true },
- { "lijero.co", true },
{ "lijncoaching.nl", true },
+ { "lijstje.be", true },
+ { "lijstje.nl", true },
{ "likc.me", true },
{ "likeablehub.com", true },
{ "likeabox.de", true },
@@ -21088,15 +21880,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "likegeeks.com", true },
{ "likehifi.de", true },
{ "likemovies.de", true },
- { "likenewhearing.com.au", true },
{ "likere.com", true },
{ "likesforinsta.com", true },
{ "likui.me", true },
{ "lilaccakeboutique.com", true },
+ { "liliang13.com", true },
+ { "liljohnsanitary.net", true },
{ "lillepuu.com", true },
{ "lily-bearing.com", true },
{ "lily-inn.com", true },
{ "lilyfarmfreshskincare.com", true },
+ { "lilylasvegas.com", true },
{ "lilysbouncycastles.com", true },
{ "lim-light.com", true },
{ "limap.ch", true },
@@ -21106,7 +21900,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "limeburst.net", true },
{ "limelabs.de", true },
{ "limelabs.io", true },
- { "limeres.com", true },
{ "limereslaw.com", true },
{ "limitededitioncomputers.com", true },
{ "limitededitionsolutions.com", true },
@@ -21116,17 +21909,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "limousineservicezurich.com", true },
{ "limpid.nl", true },
{ "limules.ch", true },
- { "limunana.com", true },
{ "lin.fi", true },
{ "linan.blog", true },
{ "lincdavis.com", true },
+ { "linchpin-it.com", true },
{ "lincnaarzorg.nl", true },
{ "lincolnfinewines.com", true },
+ { "lincolnpedsgroup.com", true },
{ "lincolnsfh.com", true },
{ "lincolnwayflorist.com", true },
{ "lindalap.fi", true },
+ { "lindaolsson.com", true },
{ "lindemann.space", true },
- { "linden.me", true },
{ "lindeskar.se", true },
{ "lindholmen.club", true },
{ "lindnerhof-taktik.de", true },
@@ -21142,7 +21936,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "linearaudio.net", true },
{ "linearaudio.nl", true },
{ "linearmap.com", true },
- { "linernotekids.com", true },
{ "linfamilygc.com", true },
{ "lingerie.com.br", true },
{ "lingeriesilhouette.com", true },
@@ -21172,6 +21965,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "linost.com", true },
{ "linqhost.nl", true },
{ "linss.com", true },
+ { "lintellift.com", true },
{ "lintmx.com", true },
{ "linusdrop.tips", true },
{ "linux-audit.com", true },
@@ -21184,6 +21978,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "linux.im", true },
{ "linux.pizza", true },
{ "linux3.org", true },
+ { "linuxadictos.com", true },
{ "linuxbabe.com", true },
{ "linuxbierwanderung.com", true },
{ "linuxchick.se", true },
@@ -21191,7 +21986,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "linuxdays.cz", true },
{ "linuxforum.ch", true },
{ "linuxhostsupport.com", true },
- { "linuxincluded.com", true },
{ "linuxiuvat.de", true },
{ "linuxlounge.net", true },
{ "linuxos.org", true },
@@ -21207,6 +22001,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lipartydepot.com", true },
{ "lipex.com", true },
{ "lipoabaltimore.org", true },
+ { "lipthink.com", true },
{ "liqd.net", true },
{ "liquid.cz", true },
{ "liquidhost.co", true },
@@ -21219,8 +22014,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lirnberger.com", true },
{ "lisamccorrie.com", true },
{ "lisamortimore.com", true },
+ { "lisanzauomo.com", true },
{ "lisburnhottubnbounce.co.uk", true },
- { "lisieuxarquitetura.com.br", true },
{ "liskgdt.net", true },
{ "lisky.ru", true },
{ "lislan.org.uk", true },
@@ -21237,6 +22032,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "litebits.com", true },
{ "litemind.com", true },
{ "literarymachin.es", true },
+ { "literature-schools.com", true },
{ "litfin.name", true },
{ "lithan.com", true },
{ "lithesalar.se", true },
@@ -21306,34 +22102,46 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "livetoride.co.za", true },
{ "livetube.tv", true },
{ "livi.co", true },
+ { "livi.co.uk", true },
+ { "livi.fr", true },
{ "living-space.co.nz", true },
{ "living24.de", true },
{ "livingforreal.com", true },
+ { "livinginhimalone.com", true },
+ { "livingkingsinc.net", true },
{ "livinglocalnashville.com", true },
{ "livingworduk.org", true },
{ "livnev.me", true },
{ "livnev.xyz", true },
{ "livolett.de", true },
{ "livrariacoad.com.br", true },
+ { "livres-et-stickers.com", true },
{ "livroseuniformes.com.br", true },
{ "lixtick.com", true },
+ { "liyang.pro", false },
{ "liyin.date", true },
- { "liyinjia.com", true },
+ { "liyunbin.com", true },
{ "liz.ee", true },
{ "lizardsystems.com", true },
{ "lizhi.io", true },
{ "lizhi123.net", true },
+ { "lizmooredestinationweddings.com", true },
{ "lizzaran.io", true },
{ "ljason.cn", true },
+ { "ljc.ro", true },
{ "ljs.io", true },
{ "lk-hardware.cz", true },
{ "lknw.de", true },
{ "lkp111138.me", true },
{ "llamacuba.com", true },
+ { "llemoz.com", true },
{ "ller.xyz", true },
{ "llm-guide.com", true },
+ { "llnl.gov", true },
{ "lloyd-day.me", true },
+ { "llslb.com", false },
{ "lm-pumpen.de", false },
+ { "lmcm.io", true },
{ "lmddgtfy.net", true },
{ "lmerza.com", true },
{ "lmintlcx.com", true },
@@ -21341,17 +22149,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lmsptfy.com", true },
{ "lmtls.me", true },
{ "lmtm.eu", true },
+ { "lng-17.org", true },
{ "lnhequipmentltd.com", true },
{ "lntu.org", true },
{ "lnx.li", true },
{ "lnyltx.cn", true },
{ "load-ev.de", true },
{ "loadlow.me", true },
- { "loadtraining.com", true },
{ "loadwallet.com", true },
{ "loanaway.ca", true },
{ "loancompare.co.za", true },
{ "loandolphin.com.au", true },
+ { "loanreadycredit.com", true },
{ "loanstreet.nl", true },
{ "lob-staging.com", true },
{ "lob.com", true },
@@ -21362,7 +22171,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "local360.net", true },
{ "localbandz.com", true },
{ "localbitcoins.com", true },
- { "localblitz.com", true },
{ "localblock.co.za", true },
{ "localbouncycastle.com", true },
{ "localdecor.com.br", true },
@@ -21382,22 +22190,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "locationvoitureportugal.com", true },
{ "locatorplus.gov", true },
{ "locauxrama.fr", true },
- { "locker.email", false },
+ { "locker.email", true },
{ "locker.plus", true },
{ "lockify.com", true },
{ "lockpick.nl", true },
{ "lockpicks.se", true },
{ "lockr.io", true },
- { "locksmith-durbannorth.co.za", true },
{ "locksmith-sanantonio-tx.com", true },
{ "locksmithbalchsprings.com", true },
{ "locksmithballito.com", true },
{ "locksmithbluff.co.za", true },
- { "locksmithdearborn.com", true },
{ "locksmithedmonds.com", true },
{ "locksmithgarland-tx.com", true },
{ "locksmithgrapevinetx.com", true },
- { "locksmithhillcrest.co.za", true },
{ "locksmithindurban.co.za", true },
{ "locksmithlivoniami.com", true },
{ "locksmithmadisonheights.com", true },
@@ -21406,7 +22211,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "locksmithopen.com", true },
{ "locksmithsammamishwa.com", true },
{ "locksmithsanantoniotexas.com", true },
- { "locksmithsbluff.com", true },
{ "locksmithscottsdaleaz.com", true },
{ "locksmithseattleco.com", true },
{ "locksmithservice-houston.com", true },
@@ -21414,8 +22218,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "locksmithspringtx.com", true },
{ "locksmithswestville.com", true },
{ "locksmiththewoodlands.com", true },
+ { "locomocosec.com", true },
{ "locomore.com", true },
{ "locomotionds.com", true },
+ { "locomotive.net.br", true },
{ "locurimunca.co", true },
{ "lodash.com", false },
{ "loddeke.eu", true },
@@ -21434,13 +22240,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "logentries.com", false },
{ "logexplorer.net", true },
{ "logfile.at", true },
- { "logfro.de", true },
- { "logicchen.com", true },
+ { "logfile.ch", true },
{ "logiciel-entreprise-seurann.fr", true },
{ "logicio.ch", false },
{ "logicio.de", false },
{ "logicio.net", false },
{ "logicne-hise.si", true },
+ { "logicoma.com", true },
{ "logicz.top", true },
{ "login.corp.google.com", true },
{ "login.gov", false },
@@ -21480,7 +22286,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lojamagicalx.com", true },
{ "lojamascate.com.br", true },
{ "lojamoleco.com.br", true },
- { "lojamulticapmais.com.br", true },
{ "lojaprimemed.com.br", true },
{ "lojaprojetoagua.com.br", true },
{ "lojasceletro.com.br", true },
@@ -21496,13 +22301,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "loket.nl", true },
{ "lolcorp.pl", true },
{ "lolcow.farm", true },
- { "lolhax.org", true },
{ "loli.net", true },
{ "loli.pet", true },
{ "loli.ski", true },
{ "loli.tube", true },
{ "loli.world", true },
{ "lolibrary.org", true },
+ { "lolic.xyz", true },
+ { "lolico.moe", true },
{ "lolicon.eu", true },
{ "lolkot.ru", true },
{ "lolnames.gg", true },
@@ -21513,19 +22319,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lommyfleet.com", true },
{ "lon-so.com", true },
{ "lona.io", true },
+ { "lonal.com", true },
{ "london-transfers.com", true },
{ "london.dating", true },
{ "londongallery.net", true },
{ "londongynaecologist.co", true },
+ { "londonkan.jp", true },
{ "londonkeyholdingcompany.co.uk", true },
{ "lonelytweets.com", true },
{ "lonesomecosmonaut.com", true },
+ { "lonestarlandandcommercial.com", true },
{ "long-journey.com", true },
+ { "long139.com", true },
+ { "long18.cc", true },
+ { "long688.com", true },
{ "longhaircareforum.com", true },
{ "longhorn-imports.com", true },
{ "longhorn.id.au", true },
- { "longma.pw", true },
{ "longstride.net", true },
+ { "longtermcare.gov", true },
{ "lonniec.com", true },
{ "lonniemason.net", true },
{ "look.co.il", true },
@@ -21543,6 +22355,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lookastic.ru", true },
{ "lookatmysco.re", true },
{ "lookbetweenthelines.com", true },
+ { "looker.wang", true },
{ "lookup-dns.net", true },
{ "lookyman.net", true },
{ "lookzook.com", true },
@@ -21550,25 +22363,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "loony.info", true },
{ "loopower.com", true },
{ "loopstart.org", true },
+ { "looseleafsecurity.com", true },
{ "loothole.com", true },
{ "loovto.net", true },
+ { "loposchokk.com", true },
{ "loqu8.com", true },
+ { "lord.sh", true },
{ "lordofthebrick.com", true },
{ "lore.azurewebsites.net", true },
{ "lorenadumitrascu.ro", true },
{ "loreofthenorth.com", true },
{ "loreofthenorth.nl", true },
+ { "loricozengeller.com", true },
{ "lorientlejour.com", true },
{ "loritaboegl.de", true },
{ "lormansas.com", true },
{ "losangelestown.com", true },
{ "losless.fr", true },
- { "losrascadoresparagatos.com", true },
{ "losreyesdeldescanso.com.ar", true },
{ "lost.host", true },
{ "lost.report", true },
{ "lostkeys.co.uk", true },
{ "lostserver.com", true },
+ { "loteamentomontereiitu.com.br", true },
{ "lothlorien.ca", false },
{ "lotl.ru", true },
{ "lotn.mobi", true },
@@ -21583,24 +22400,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "louange-reconvilier.ch", true },
{ "loucanfixit.com", true },
{ "louerunhacker.fr", true },
+ { "louisemisellinteriors.co.uk", true },
{ "louisvillecarguys.com", true },
- { "louisvillevmug.info", true },
{ "loune.net", true },
{ "loungecafe.net", true },
{ "loungecafe.org", true },
+ { "loungepapillon.com", true },
{ "love4taylor.me", true },
{ "loveandadoreboutique.com", true },
{ "lovebigisland.com", true },
- { "lovebo9.com", true },
- { "lovebo9.net", true },
{ "lovecrystal.co.uk", true },
{ "loveislandgames.com", true },
{ "loveisourweapon.com", true },
- { "lovelens.ch", false },
{ "lovelens.li", false },
{ "lovelivewiki.com", true },
{ "lovelovenavi.jp", true },
{ "lovelytimes.net", true },
+ { "lovemanagementaccounts.co.uk", true },
{ "lovemomiji.com", true },
{ "lovenwishes.com", true },
{ "loveph.one", true },
@@ -21609,12 +22425,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lovesmagical.com", true },
{ "lovesupremefestival.com", true },
{ "lovetravel360.com", true },
+ { "lovevape.co", true },
{ "loveyounastya.com", true },
{ "loveysa.ch", true },
{ "lovg.ren", true },
{ "lovingearth.co", true },
+ { "lovingthermo.com", true },
{ "lovizaim.ru", true },
{ "low-diets.com", true },
+ { "lowcarblab.com", true },
{ "lowcostwire.com.au", true },
{ "lowerpricefinder.com", true },
{ "lowmagnitude.com", true },
@@ -21622,12 +22441,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lowsidetna.com", true },
{ "lowson.ca", true },
{ "loxal.net", true },
- { "loxal.org", true },
+ { "loyaleco.it", true },
{ "loyaltyondemand.club", true },
{ "loyaltyondemand.eu", true },
{ "lp-support.nl", true },
+ { "lpcom.de", true },
+ { "lprcommunity.co.za", true },
{ "lpt-nebreziny.eu", true },
+ { "lqs.me", true },
{ "lra-cloud.de", true },
+ { "lrdo.net", true },
{ "lrssystems.com", true },
{ "ls-alarm.de", true },
{ "lsal.me", true },
@@ -21636,8 +22459,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lshiy.com", true },
{ "lsmpx.com", true },
{ "lsquo.com", true },
- { "lsws.de", true },
- { "lsys.ac", true },
+ { "lsscreens.de", true },
{ "lt.search.yahoo.com", false },
{ "ltaake.com", true },
{ "ltecode.com", true },
@@ -21651,6 +22473,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lublin.toys", true },
{ "lubomirkazakov.com", true },
{ "luc-oberson.ch", true },
+ { "luca-steeb.com", true },
{ "luca.swiss", true },
{ "lucacastelnuovo.nl", false },
{ "lucafontana.net", true },
@@ -21669,17 +22492,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lucidframeworks.com", true },
{ "lucidlight.de", true },
{ "lucidoccult.com", true },
+ { "lucie.jp", true },
{ "lucielavickova.com", true },
{ "luckycastles.co.uk", true },
{ "luckyfrog.hk", true },
{ "luckyxf.com", true },
{ "lucy.science", true },
{ "lucyparsonslabs.com", true },
+ { "lucysan.net", true },
{ "lucz.co", true },
{ "ludek.biz", true },
{ "ludikovsky.name", true },
{ "ludogue.net", true },
{ "ludovic-muller.fr", true },
+ { "ludum.pl", true },
{ "ludwig.im", true },
{ "ludwiggrill.de", true },
{ "ludwigjohnson.se", true },
@@ -21693,7 +22519,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "luftbild-siegerland.de", true },
{ "luftreiniger.biz", true },
{ "lufu.io", true },
- { "luganskservers.net", true },
{ "lugbb.org", true },
{ "luginbuehl.be", true },
{ "luginbuehl.eu", true },
@@ -21761,17 +22586,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "luoh.me", true },
{ "luohua.im", true },
{ "luongvu.com", true },
+ { "luowu.cc", true },
{ "lupecode.com", true },
{ "lupinencyclopedia.com", true },
{ "lupinenorthamerica.com", true },
{ "luqsus.pl", true },
+ { "lusitom.com", true },
{ "luso-livros.net", true },
- { "lusteniny.cz", false },
+ { "lusteniny.cz", true },
{ "lustige-zitate.com", true },
{ "lustin.fr", true },
{ "lustrum.ch", true },
{ "lusynth.com", true },
- { "luteijn.biz", true },
{ "luteijn.cloud", true },
{ "luteijn.email", true },
{ "luteijn.pro", true },
@@ -21782,9 +22608,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "luuppi.fi", true },
{ "luvare.com", true },
{ "luvbridal.com.au", true },
+ { "luvplay.co.uk", true },
{ "luxcraft.eng.br", true },
- { "luxescreenprotector.nl", true },
- { "luxofit.de", true },
+ { "luxescreenprotector.nl", false },
{ "luxsci.com", true },
{ "luxurynsight.net", true },
{ "luxurytimepieces.net", true },
@@ -21795,6 +22621,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "luxwatch.com", true },
{ "luyckx.net", true },
{ "luzat.com", true },
+ { "luzfaltex.com", true },
{ "lv.search.yahoo.com", false },
{ "lv0.it", true },
{ "lv5.top", true },
@@ -21805,6 +22632,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lwl12.com", true },
{ "lxd.cc", true },
{ "lxd.pm", true },
+ { "lyam.fr", true },
{ "lycee-saintjoseph-mesnieres.fr", true },
{ "lychankiet.name.vn", false },
{ "lydudlejning.net", true },
@@ -21817,6 +22645,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lyngvaer.no", true },
{ "lynnlaytonnissanparts.com", true },
{ "lynnmosher.com", true },
+ { "lynsec.com", true },
{ "lynthium.com", true },
{ "lynx.nl", true },
{ "lynxbroker.de", true },
@@ -21829,16 +22658,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "lys.ch", true },
{ "lyst.co.uk", true },
{ "lyukaacom.ru", true },
- { "lyuly.com", true },
{ "lyx.dk", true },
{ "lzh.one", true },
{ "lzwc.nl", true },
+ { "lzzr.me", true },
{ "m-22.com", true },
{ "m-chemical.com.hk", true },
- { "m-edmondson.co.uk", true },
{ "m-gh.info", true },
+ { "m-idea.jp", true },
{ "m-kleinert.de", true },
{ "m-mail.fr", true },
+ { "m-monitor.pl", true },
{ "m-orthodontic.com", true },
{ "m-ses.fr", true },
{ "m.facebook.com", true },
@@ -21848,6 +22678,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "m132.eu", true },
{ "m134.eu", true },
{ "m2epro.com", true },
+ { "m2il.co", true },
{ "m2os.com", true },
{ "m4rcus.de", true },
{ "ma-eir.nl", true },
@@ -21872,9 +22703,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mach-politik.ch", true },
{ "macha.cloud", true },
{ "machbach.com", true },
- { "machbach.net", true },
{ "machetewp.com", true },
{ "machikka.com", false },
+ { "machinetransport.com", true },
{ "macht-elektro.de", true },
{ "machtweb.de", true },
{ "machu-picchu.nl", true },
@@ -21897,10 +22728,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "macosxfilerecovery.com", true },
{ "macoun.de", true },
{ "macros.co.jp", true },
+ { "macstore.pe", true },
{ "mactools.com.co", true },
{ "mad.ninja", true },
{ "madae.nl", true },
{ "madars.org", false },
+ { "madbicicletas.com", true },
{ "madbin.com", true },
{ "madbouncycastles.co.uk", true },
{ "maddi.biz", true },
@@ -21909,6 +22742,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "maddistonpsa.co.uk", true },
{ "maddreefer.com", true },
{ "made-in-earth.co.jp", true },
+ { "made-to-usb.com", true },
{ "madebydusk.com", true },
{ "madebyshore.com", true },
{ "madeinchezmoi.net", true },
@@ -21917,13 +22751,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "madeloc.com", true },
{ "mademoiselledemargaux.com", true },
{ "mader.jp", true },
- { "maderasbrown.com", true },
{ "madin.ru", true },
{ "madirc.net", true },
+ { "madisonent-facialplasticsurgery.com", true },
+ { "madisonsquarerealestate.com", true },
{ "madmar.ee", true },
{ "madoka.nu", true },
+ { "madokami.pw", true },
{ "madreacqua.org", true },
+ { "madrecha.com", true },
{ "madridartcollection.com", true },
+ { "madscientistwebdesign.com", true },
{ "madtec.de", true },
{ "madusecurity.com", true },
{ "mae-berlinistanbul.com", true },
@@ -21933,7 +22771,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "maelstrom-fury.eu", true },
{ "maelstrom.ninja", true },
{ "maeplasticsurgery.com", true },
- { "maerzpa.de", true },
{ "maestrano.com", true },
{ "maff.co.uk", true },
{ "maff.scot", false },
@@ -21958,10 +22795,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "magewell.nl", true },
{ "maggie.com", true },
{ "magi-cake.com", true },
+ { "magi.systems", true },
{ "magic-cards.info", true },
{ "magical-secrets.com", true },
{ "magical.rocks", true },
{ "magicalcircuslv.com", true },
+ { "magicalshuttle.fr", true },
{ "magicbroccoli.de", true },
{ "magiccards.info", true },
{ "magicdaysomagh.co.uk", true },
@@ -21970,9 +22809,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "magicspaceninjapirates.de", true },
{ "magictable.com", true },
{ "magicvodi.at", true },
- { "magieamour.com", true },
{ "magilio.com", true },
{ "magnacarebroker.com", true },
+ { "magnate.co", true },
{ "magnatronic.com.br", true },
{ "magneticattraction.com.au", true },
{ "magnetpass.uk", true },
@@ -21983,7 +22822,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "magnoliastrong.com", true },
{ "magnunbaterias.com.br", true },
{ "magonote-nk.com", true },
- { "magosmedellin.com", true },
+ { "magravsitalia.com", true },
{ "magu.kz", true },
{ "maguire.email", true },
{ "magwin.co.uk", true },
@@ -22007,6 +22846,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mail.google.com", true },
{ "mail.storage", true },
{ "mail.yahoo.com", false },
+ { "mail180.com", true },
{ "mail4you.in", true },
{ "mailbox.mg", true },
{ "mailbox.org", true },
@@ -22017,6 +22857,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mailjet.tech", true },
{ "maillady-susume.com", true },
{ "mailmag.net", false },
+ { "mailnara.co.kr", true },
{ "mailto.space", true },
{ "mailum.org", false },
{ "mainechiro.com", true },
@@ -22033,23 +22874,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "maispa.com", true },
{ "maisretorno.com", true },
{ "maisvitaminas.com.br", true },
+ { "maitheme.com", true },
{ "maitrechaton.fr", true },
{ "maitrise-orthopedique.com", true },
{ "majahoidja.ee", true },
{ "majaweb.cz", true },
{ "majemedia.com", true },
{ "majesnix.org", true },
+ { "majesticcolorado.com", true },
{ "majid.info", true },
{ "majkassab.com", true },
{ "majkassab.net", true },
{ "majkassab.org", true },
{ "majkl.me", true },
+ { "majkl.xyz", true },
+ { "majkl578.cz", true },
{ "majkyto.cz", true },
+ { "majlovesreg.one", true },
{ "majolka.com", true },
+ { "majorpaintingco.com", true },
{ "makaleci.com", true },
{ "makalu.me", true },
{ "make-your-own-song.com", true },
{ "makeaboldmove.com", true },
+ { "makechanges.com.au", true },
{ "makedin.net", true },
{ "makem-bounce.co.uk", true },
{ "makenaiyo-fx.com", true },
@@ -22058,8 +22906,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "maketheneighborsjealous.com", true },
{ "makeurbiz.com", true },
{ "maki-chan.de", true },
- { "makino.games", true },
+ { "makinen.ru", true },
{ "makkusu.photo", true },
+ { "makkyon.com", true },
{ "makowitz.cz", true },
{ "maktoob.search.yahoo.com", false },
{ "maku.edu.tr", true },
@@ -22078,6 +22927,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "malibu-electric.com", true },
{ "malibuelectrical.com", true },
{ "malibuexteriorlighting.com", true },
+ { "malik.holdings", true },
{ "malik.id", true },
{ "malikussa.id", true },
{ "malikussaid.com", true },
@@ -22098,24 +22948,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "malscan.com", true },
{ "malscan.org", true },
{ "malta-firma.com", true },
+ { "malte-kiefer.de", true },
{ "malufs.com.br", true },
{ "malware.watch", true },
{ "malwareinvestigator.gov", true },
{ "malwarekillers.com", true },
{ "malwaretips.com", false },
- { "malwareverse.us", true },
{ "maly.cz", true },
{ "malyshata.com", true },
{ "malysvet.net", true },
{ "mamaasia.info", true },
{ "mamadea.be", true },
+ { "mamadoma.com.ua", true },
{ "mamafit.club", true },
{ "mamamoet.ru", true },
{ "mamanecesitaungintonic.com", true },
+ { "mambas.cn", true },
{ "mamiecouscous.com", true },
{ "mammals.net", true },
{ "mammaw.com", true },
+ { "mammeitalianeavienna.com", true },
{ "mammooc.org", true },
+ { "mammothlakesmls.net", true },
{ "mamospienas.lt", true },
{ "mamot.fr", false },
{ "mamuko.nl", true },
@@ -22136,8 +22990,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "manatees.net", true },
{ "manavgabhawala.com", true },
{ "manawill.jp", true },
- { "mandanudes.ae", true },
{ "mandcbouncycastlehire.co.uk", true },
+ { "mandynamic.gr", true },
{ "maneggio.milano.it", true },
{ "manesht.ir", true },
{ "manfredgruber.net", true },
@@ -22145,8 +22999,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "manfredimatteo.com", true },
{ "manfredschafer.ch", true },
{ "mangahigh.com", true },
- { "mangapoi.com", true },
- { "mangaristica.com", true },
+ { "mangaristica.com", false },
+ { "mangnhuapvc.com.vn", true },
+ { "mangotwoke.co.uk", true },
{ "manhattanchoralensemble.org", true },
{ "manhole.club", true },
{ "manhuagui.com", true },
@@ -22165,15 +23020,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "manns-solutions.co.uk", true },
{ "mannschafft.ch", true },
{ "manoirdecontres.com", true },
- { "manonamission.de", true },
+ { "manonandre-avocat.fr", true },
{ "manoro.de", true },
{ "manowarus.com", true },
{ "mansdell.net", true },
{ "mansfeld.pl", true },
{ "manski.net", true },
{ "mantabiofuel.com", true },
+ { "manti.by", true },
{ "mantor.org", false },
{ "mantra.pictures", true },
+ { "manualidadeson.com", true },
{ "manuall.co.uk", true },
{ "manuall.de", true },
{ "manuall.fr", true },
@@ -22201,6 +23058,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "manylots.ru", true },
{ "manyue.org", true },
{ "maoi.re", true },
+ { "maomao.blog", true },
{ "maomihz.com", true },
{ "maone.net", true },
{ "maorseo.com", true },
@@ -22213,21 +23071,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mapblender.com", true },
{ "mapeo.io", true },
{ "maplanetebeaute.fr", true },
+ { "maplehome.tk", true },
{ "mapletime.com", true },
{ "maps.net", true },
- { "mapservices.nl", true },
{ "mapstack.org", true },
{ "maquettage.com", true },
{ "maquinariaspesadas.org", true },
+ { "maquininhamercadopoint.com.br", true },
{ "mar-eco.no", true },
{ "marabumadrid.com", false },
{ "marakovits.net", true },
{ "marble.com", true },
{ "marbogardenlidkoping.se", true },
{ "marc-hammer.de", true },
- { "marcaudefroy.com", true },
+ { "marc-schlagenhauf.de", true },
{ "marcbeije.com", true },
{ "marcberndtgen.de", true },
+ { "marcceleiro.cat", true },
{ "marcceleiro.com", true },
{ "marceau.ovh", true },
{ "marcel-preuss.de", true },
@@ -22242,30 +23102,37 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "marcelpreuss.de", true },
{ "marcelsiegert.com", true },
{ "marcelwaldvogel.ch", true },
+ { "marcelwiedemeier.com", true },
{ "marcelwolf.coach", true },
{ "marcgoertz.de", true },
{ "marche-contre-monsanto.ch", true },
+ { "marchhappy.tech", false },
{ "marchukov.com", true },
{ "marchwj.pl", true },
{ "marciaimportados.com.br", true },
+ { "marcianoandtopazio.com", true },
{ "marclay.co.uk", true },
{ "marco-goltz.de", true },
{ "marco-hegenberg.net", true },
{ "marco-polo-reisen.com", true },
{ "marcocasoni.com", true },
- { "marcohager.de", true },
{ "marcoherten.com", true },
+ { "marcoklomp.nl", true },
+ { "marcoslater.com", true },
{ "marcuskoh.com", true },
{ "marcusstafford.com", true },
{ "marechal-company.com", true },
+ { "marek.pro", true },
{ "marek.su", true },
{ "marelijah.org", true },
{ "margagriesser.de", true },
- { "margan.ch", true },
{ "margecommunication.com", true },
{ "margo-co.ch", true },
{ "margo.ml", true },
{ "margotlondon.co.uk", true },
+ { "margots.biz", true },
+ { "margots.life", true },
+ { "margots.tech", true },
{ "marguerite-maison.fr", true },
{ "mariacorzo.com", true },
{ "mariage-photo.ch", true },
@@ -22289,6 +23156,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "marinazarza.es", true },
{ "marinbusinesscenter.ch", true },
{ "marine.gov", true },
+ { "marinecadastre.gov", true },
{ "marinekaplama.com", true },
{ "marinela.com.mx", false },
{ "marinelausa.com", false },
@@ -22298,10 +23166,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mariogeckler.de", true },
{ "mariposah.ch", true },
{ "marisamorby.com", false },
+ { "mariskavankasbergen.nl", true },
{ "maritim.go.id", false },
{ "mariushubatschek.de", true },
{ "mariviolin.com", true },
- { "marix.ro", true },
{ "marjeta-gurtner.ch", true },
{ "marjoleindens.be", true },
{ "marjoriecarvalho.com.br", true },
@@ -22322,14 +23190,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "marketing-2.de", true },
{ "marketing.limited", true },
{ "marketing91.com", true },
+ { "marketingbrandingnews.com", true },
+ { "marketingbrandingnews.net", true },
{ "marketingco.nl", true },
{ "marketingconverts.com", true },
{ "marketingforfood.com", true },
{ "marketinggenerators.nl", true },
+ { "marketingtrendnews.com", true },
{ "marketingvirtuales.com", true },
{ "marketizare.ro", true },
{ "marketlinks.org", true },
{ "marketnsight.com", true },
+ { "markfordelegate.com", true },
{ "markhaehnel.de", true },
{ "markhenrick.site", true },
{ "markholden.guru", true },
@@ -22348,6 +23220,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "markridgwellcom.appspot.com", true },
{ "markscastles.co.uk", true },
{ "marksm.it", true },
+ { "marksmanhomes.com", true },
{ "marksmit.co", true },
{ "marksouthall.com", true },
{ "markspres.org", true },
@@ -22356,17 +23229,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "marktcontact.com", true },
{ "marktissink.nl", true },
{ "markup-ua.com", true },
+ { "markus-blog.de", true },
{ "markus-dev.com", true },
+ { "markus-keppeler.de", true },
{ "markus-musiker.de", true },
{ "markus-ullmann.de", true },
{ "markus.design", true },
{ "markusehrlicher.de", true },
{ "markusgran.de", true },
+ { "markuskeppeler.de", true },
{ "markuskeppeler.no-ip.biz", true },
{ "marl.fr", true },
{ "marloncommunications.com", true },
{ "marlonlosurdopictures.com", true },
- { "marlonschultz.de", true },
{ "marlosoft.net", true },
{ "marmista.roma.it", true },
{ "marmolesromero.com", true },
@@ -22382,19 +23257,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "marriage-shrine.jp", true },
{ "marrickvilleapartments.com.au", true },
{ "marsanvet.com", true },
- { "marsble.com", true },
{ "marseillekiteclub.com", true },
{ "marshallscastles.com", true },
{ "marshallwilson.com", true },
{ "marshmallow.co", true },
{ "marshmallow.com", true },
{ "marshyplay.live", true },
+ { "marsikelektro.cz", true },
{ "martasibaja.com", true },
{ "martelange.ovh", true },
{ "marten-buer.de", true },
{ "martensmxservice.nl", true },
{ "martensson.io", true },
{ "marti201.ga", true },
+ { "martialarts-wels.at", true },
{ "martide.com", true },
{ "martiestrimsalon.nl", true },
{ "martijn.site", true },
@@ -22426,24 +23302,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "marycliffpress.com", true },
{ "maryeclark.com", true },
{ "maryeileen90.party", true },
+ { "maryjaneroach.com", true },
{ "maryjruggles.com", true },
{ "marykatrinaphotography.com", true },
{ "marylandbasementandcrawlspacewaterproofing.com", true },
+ { "marzio.co.za", true },
{ "masarik.sh", true },
{ "masatotaniguchi.jp", true },
+ { "masautonomo.com", true },
{ "masayahost.com", true },
{ "mascosolutions.com", true },
{ "masdillah.com", true },
+ { "maservant.net", true },
{ "mashandco.it", true },
{ "mashandco.tv", true },
{ "masiniunelte.store.ro", true },
{ "masiul.is", true },
- { "maskice.hr", true },
{ "maskim.fr", true },
{ "maslife365.com", true },
{ "maslin.io", true },
{ "masrur.org", true },
{ "massaboutique.com", true },
+ { "massage-colleges.com", true },
{ "massage-vitalite.fr", true },
{ "massage4u.net", true },
{ "massagecupping.com", true },
@@ -22456,11 +23336,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "massive.tk", true },
{ "massoni.pl", true },
{ "massotherapeutique.com", true },
+ { "massvow.com", true },
{ "masta.ch", true },
{ "mastah.fr", true },
{ "mastd.me", false },
{ "mastellone.us", true },
- { "mastepinnelaand.nl", true },
{ "master-net.org", true },
{ "mastercardpac.com", true },
{ "masterdemolitioninc.com", true },
@@ -22477,9 +23357,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mastodon.at", true },
{ "mastodon.host", true },
{ "mastodon.rocks", true },
+ { "mastodon.top", true },
{ "mat.tt", true },
{ "matanz.de", true },
{ "matatabimix.com", true },
+ { "matatall.com", true },
+ { "matbad.de", true },
{ "match.audio", true },
{ "matcha-iga.jp", true },
{ "matchatea24.com", true },
@@ -22488,16 +23371,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "matdogs.com", true },
{ "matejgroma.com", true },
{ "matel.org", true },
+ { "materassi.roma.it", true },
{ "materiaischiquinho.com.br", true },
{ "material-ui.com", true },
{ "material-world-fuyouhin.com", true },
{ "materialism.com", true },
{ "materialyinzynierskie.pl", true },
+ { "maternalsafety.org", true },
{ "maternum.com", true },
{ "mateuszchyla.pl", true },
+ { "math-colleges.com", true },
{ "math.hamburg", true },
{ "mathalexservice.info", true },
- { "mathematik.rocks", true },
+ { "mathematik.rocks", false },
{ "mathematris.com", true },
{ "mathembedded.com", true },
{ "matheo-schefczyk.de", true },
@@ -22508,6 +23394,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mathiasgarbe.de", true },
{ "mathiaswagner.org", true },
{ "mathieuguimond.com", true },
+ { "mathieui.net", true },
+ { "mathis.com.tr", true },
{ "maths.network", true },
{ "mathsource.ga", true },
{ "mathspace.co", true },
@@ -22523,6 +23411,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "matome-surume.com", true },
{ "matomeathena.com", true },
{ "matoutepetiteboutique.com", true },
+ { "matratzentester.com", true },
{ "matridiana.com", true },
{ "matrimoni.uk", true },
{ "matriterie-sdv.ro", true },
@@ -22531,6 +23420,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "matrixreq.com", true },
{ "matsu-semi.com", true },
{ "matsu-walk.com", true },
+ { "matt-brooks.com", true },
{ "matt-royal.gr", true },
{ "matt.re", true },
{ "mattandyana.com", true },
@@ -22542,6 +23432,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mattcoles.io", true },
{ "mattconstruction.com", true },
{ "mattcorp.com", true },
+ { "mattdbarton.com", true },
{ "matteomarescotti.it", true },
{ "mattferderer.com", true },
{ "mattfin.ch", true },
@@ -22550,6 +23441,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "matthewchapman.co.uk", true },
{ "matthewfells.com", true },
{ "matthewgallagher.co.uk", true },
+ { "matthewgrow.com", true },
{ "matthewj.ca", true },
{ "matthewkenny.co.uk", true },
{ "matthewohare.com", true },
@@ -22570,11 +23462,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mattmccutchen.net", true },
{ "mattmcshane.com", true },
{ "mattonline.me", true },
- { "mattwservices.co.uk", true },
{ "matviet.vn", true },
{ "matway.com", true },
{ "matway.net", true },
- { "matze.co", true },
{ "mauerwerkstag.info", true },
{ "mauldincookfence.com", true },
{ "mauran.me", true },
@@ -22596,17 +23486,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mawidabp.com", true },
{ "mawidaca.com", true },
{ "mawo.olkusz.pl", true },
- { "max-mad.com", true },
{ "max-moeglich.de", true },
{ "max-went.pl", true },
{ "max.gov", true },
+ { "maxb.fm", true },
+ { "maxbachmann.de", true },
{ "maxbeenen.de", true },
{ "maxbruckner.de", true },
{ "maxbruckner.org", true },
{ "maxchan.info", true },
{ "maxdev72.freeboxos.fr", true },
- { "maxfox.me", true },
+ { "maxh.me.uk", true },
{ "maxhamon.ovh", true },
+ { "maxhoechtl.at", true },
{ "maximdeboiserie.be", true },
{ "maximdens.be", true },
{ "maximeferon.fr", true },
@@ -22615,13 +23507,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "maximiliankaul.de", true },
{ "maximiliankrieg.de", true },
{ "maxims-travel.com", true },
+ { "maxinesbydennees.com", true },
{ "maxipcalls.com", true },
{ "maxisito.it", true },
{ "maxkaul.de", true },
+ { "maxmatthe.ws", true },
{ "maxmilton.com", true },
+ { "maxmind.com", true },
{ "maxp.info", true },
{ "maxpl0it.com", true },
- { "maxr1998.de", true },
{ "maxrandolph.com", true },
{ "maxtruxa.com", true },
{ "maxundlara.at", true },
@@ -22630,11 +23524,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "maxwellflynn.com", true },
{ "maxwellmoore.co.uk", true },
{ "may24.tw", true },
- { "maya-ro.com", true },
{ "mayaimplant.com", true },
{ "mayavi.co.in", true },
- { "maydex.info", true },
{ "mayerbrownllz.com", true },
+ { "mayoimobiliare.ro", true },
{ "mayomarquees.com", true },
{ "mayopartyhire.com", true },
{ "maypolevilla.co.uk", true },
@@ -22642,8 +23535,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mazda-mps.de", true },
{ "mazda-thermote.com", true },
{ "mazda626.net", true },
+ { "maze.design", false },
{ "maze.fr", true },
+ { "mazenjobs.com", true },
{ "mazternet.ru", true },
+ { "mazurlabs.tk", true },
{ "mazzotta.me", true },
{ "mb-is.info", true },
{ "mb300sd.com", true },
@@ -22651,6 +23547,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mbaasy.com", true },
{ "mbaestlein.de", true },
{ "mbainflatables.co.uk", true },
+ { "mbanq.com", true },
{ "mbardot.com", true },
{ "mbasic.facebook.com", false },
{ "mbcars.be", true },
@@ -22659,24 +23556,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mbilker.us", true },
{ "mbinf.de", false },
{ "mbits.solutions", true },
+ { "mbk.net.pl", true },
{ "mblankhorst.nl", true },
{ "mble.mg", true },
+ { "mbmcatering.com", true },
{ "mbp.banking.co.at", false },
{ "mbr-net.de", true },
{ "mbrooks.info", true },
{ "mbs-journey.com", true },
{ "mbsec.net", true },
+ { "mburaks.com", true },
{ "mburns.duckdns.org", true },
{ "mbweir.com", true },
{ "mbwis.net", true },
{ "mc-jobs.net", true },
+ { "mc-ruempel-firmen-und-haushaltsaufloesungen.de", true },
{ "mc-venture.net", false },
{ "mc4free.cc", true },
{ "mcatnnlo.org", true },
+ { "mccarty.io", false },
{ "mccoolesredlioninn.com", true },
{ "mccordsvillelocksmith.com", true },
{ "mccrackon.com", true },
- { "mcculloughjchris.com", true },
+ { "mcculloughjchris.com", false },
{ "mcdermottautomotive.com", true },
{ "mcdona1d.me", true },
{ "mcdonalds.be", true },
@@ -22697,14 +23599,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mchristopher.com", true },
{ "mcinterface.de", true },
{ "mcivor.me", true },
- { "mckenry.net", true },
+ { "mckenry.net", false },
{ "mckernan.in", true },
{ "mckinley.school", true },
{ "mcl.de", false },
- { "mcl.gg", true },
{ "mclinflatables.co.uk", true },
{ "mclmotors.co.uk", true },
- { "mclyr.com", true },
{ "mcmillansedationdentistry.com", false },
{ "mcmillanskiclub.com.au", true },
{ "mcneill.io", true },
@@ -22717,6 +23617,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mcprocdn.com", true },
{ "mcrn.jp", true },
{ "mcsinflatables.co.uk", true },
+ { "mcsports.es", true },
{ "mcsrvstat.us", true },
{ "mctools.org", true },
{ "mcuuid.net", true },
@@ -22742,6 +23643,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mdoering.de", true },
{ "mdosch.de", true },
{ "mdpraha.cz", true },
+ { "mdrthmcs.io", true },
{ "mds-paris.com", true },
{ "mdsave.com", true },
{ "mdx.no", true },
@@ -22750,8 +23652,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mdzservers.com", true },
{ "me-center.com", true },
{ "me-groups.com", true },
+ { "me-soft.nl", true },
{ "me.net.nz", true },
{ "meadowfen.farm", true },
+ { "meadowfenfarm.com", true },
{ "mealgoo.com", true },
{ "meamod.com", false },
{ "meany.xyz", true },
@@ -22761,18 +23665,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mebaneattorney.com", true },
{ "mebanesteakhouse.com", true },
{ "mecanicoautomotriz.org", true },
+ { "mecaniquemondor.com", true },
+ { "mechanics-schools.com", true },
{ "mechanus.io", true },
{ "mechmk1.me", true },
+ { "med-colleges.com", true },
{ "med-otzyv.ru", true },
{ "med360.at", true },
{ "medba.se", true },
{ "medcir.com.br", true },
{ "medcrowd.com", true },
- { "meddatix.com", true },
{ "meddelare.com", true },
{ "meddigital.com", false },
{ "mede-handover.azurewebsites.net", true },
{ "medeinos.lt", true },
+ { "medellinapartamentos.com", true },
{ "medexpress.co.uk", true },
{ "medhy.fr", true },
{ "medi-link.co.il", true },
@@ -22781,6 +23688,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "media-instance.ru", true },
{ "media-library.co.uk", true },
{ "media-pi.com", true },
+ { "media-service.fr", true },
{ "media-serwis.com", true },
{ "mediaarea.net", true },
{ "mediabackoffice.co.jp", true },
@@ -22794,13 +23702,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mediagenic.ch", true },
{ "mediagold.it", true },
{ "mediagrand.net", true },
+ { "mediahaus.de", true },
{ "mediajurnal.com", true },
{ "medialab.nrw", true },
{ "mediamarkt.pl", true },
{ "mediapart.fr", true },
{ "mediarithmics.com", true },
+ { "mediarithmics.io", true },
{ "mediarocks.de", true },
{ "mediaselection.eu", true },
+ { "mediathekview.de", true },
{ "mediationculturelleclp.ch", true },
{ "mediatorzy.waw.pl", true },
{ "mediaukkies.nl", true },
@@ -22810,6 +23721,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mediawiki.org", true },
{ "mediawin.pl", true },
{ "medic-world.com", true },
+ { "medical-assistant-colleges.com", true },
{ "medicalabroad.org", true },
{ "medicalcountermeasures.gov", true },
{ "medicare-providers.net", true },
@@ -22822,6 +23734,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "medienweite.de", true },
{ "medifi.com", true },
{ "medigap-quote.net", true },
+ { "medik8.com.cy", true },
{ "medikuma.com", true },
{ "medino.com", true },
{ "medinside.ch", true },
@@ -22829,6 +23742,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "medinsider.ch", true },
{ "medinsider.li", true },
{ "medireport.fr", true },
+ { "meditadvisors.com", true },
{ "meditel.nl", true },
{ "medium.com", true },
{ "medja.net", true },
@@ -22836,11 +23750,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "medmarkt24.com", true },
{ "medo64.com", true },
{ "medovea.ru", true },
+ { "medpeer.co.jp", true },
{ "medpeer.jp", true },
{ "medpics.com", true },
{ "medschat.com", true },
{ "medtalents.ch", true },
- { "medtankers.management", true },
{ "medtehnika.ua", true },
{ "medusa.wtf", true },
{ "meduza.io", true },
@@ -22854,7 +23768,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "meehle.com", true },
{ "meeko.cc", true },
{ "meereskunst.de", true },
+ { "meerman.nl", true },
+ { "meermantechnischburo.nl", true },
{ "meerutcake.com", true },
+ { "meesteresmisty.nl", true },
{ "meet.google.com", true },
{ "meetawesomepeople.net", true },
{ "meetbot.fedoraproject.org", true },
@@ -22863,7 +23780,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "meetingmanage.nl", true },
{ "meetingmanager.ovh", true },
{ "meetings2.com", true },
+ { "meetmibaby.co.uk", true },
{ "meetmygoods.com", true },
+ { "meetscompany.jp", true },
{ "meeusen-usedcars.be", true },
{ "meeztertom.nl", true },
{ "meg-a-bounce.co.uk", true },
@@ -22876,10 +23795,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "megabounce.co.uk", true },
{ "megabounceni.co.uk", true },
{ "megabouncingcastles.com", true },
- { "megaflix.nl", true },
{ "megaflowers.ru", true },
{ "megagifs.de", true },
{ "megainflatables.co.uk", true },
+ { "megakoncert90.cz", true },
{ "megamarkey.de", true },
{ "megamisja.pl", true },
{ "meganandmarc.us", true },
@@ -22892,6 +23811,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "megasslstore.com", true },
{ "megauction.tk", true },
{ "megaxchange.com", true },
+ { "mego.cloud", true },
{ "megumico.net", true },
{ "megustariasaber.com", true },
{ "megztosidejos.lt", true },
@@ -22931,6 +23851,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mekesh.net", true },
{ "mekesh.ru", true },
{ "meklon.net", true },
+ { "mekongeye.com", true },
{ "melaniebernhardt.com", true },
{ "melaniegruber.de", true },
{ "melbourne.dating", true },
@@ -22962,7 +23883,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "members-only-shopping.com", true },
{ "members.nearlyfreespeech.net", false },
{ "membershipservices.org.uk", true },
- { "membersonline.org", true },
{ "meme-photostudio.com.tw", true },
{ "meme.fi", true },
{ "meme.institute", true },
@@ -22970,7 +23890,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "memepasmal.net", true },
{ "memesbee.com", true },
{ "memfrob.org", true },
- { "memind.net", true },
{ "memiux.com", true },
{ "memo-linux.com", true },
{ "memo.ee", true },
@@ -22983,9 +23902,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mendipbouncycastles.co.uk", true },
{ "mendozagenevieve.com", true },
{ "mendy.jp", true },
- { "menhera.org", true },
{ "menielias.com", true },
{ "menkyo-blog.com", true },
+ { "mennace.com", true },
{ "menntagatt.is", true },
{ "menole.com", true },
{ "menole.de", true },
@@ -22996,21 +23915,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mensagensdeconforto.com.br", true },
{ "mensagensperfeitas.com.br", true },
{ "mensch-peter.me", true },
- { "mentalhealth.gov", true },
{ "mentalhealthmn.org", true },
{ "mentaltraining-fuer-musiker.ch", true },
- { "mentesemprendedoras.net", true },
{ "mentiq.az", true },
{ "mentorithm.com", true },
{ "mentz.info", true },
- { "menu.fyi", true },
{ "menudieta.com", true },
- { "menuel.me", true },
{ "menuonlineordering.com", true },
{ "menzel-motors.com", true },
{ "menzietti.it", true },
{ "mephedrone.org", true },
+ { "meps.net", true },
{ "mer.gd", true },
+ { "merakilp.com", true },
{ "meransuedtirol.com", true },
{ "meraseo.com", true },
{ "mercadobitcoin.com.br", true },
@@ -23024,24 +23941,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "merchant-automotive.com", true },
{ "mercier-auto.com", true },
{ "mercier-cars.co.uk", true },
+ { "mercredifiction.io", true },
{ "mercury.photo", true },
{ "mercuryamericas.com", false },
{ "meremeti-online.gr", true },
+ { "meremobil.dk", true },
{ "merenbach.com", true },
{ "merenita.com", true },
- { "merenita.eu", true },
{ "merenita.net", true },
{ "merenita.nl", true },
{ "meric-graphisme.info", true },
+ { "meridianfresno.com", true },
{ "meridianmetals.com", true },
{ "meridianstore.com.br", true },
{ "merkel.me", true },
{ "merlet.eu", true },
{ "merlinsoap.com", true },
- { "merloat.club", true },
{ "merloat.com", true },
{ "merojob.com", true },
{ "meronberry.jp", true },
+ { "mers.one", true },
{ "merson.org", true },
{ "merson.tv", true },
{ "mertarauh.com", true },
@@ -23056,12 +23975,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mesh.gov", true },
{ "meshok.info", true },
{ "mesicka.com", true },
- { "messagescelestes-archives.ca", true },
+ { "mesomeds.com", true },
{ "messagevortex.com", true },
{ "messagevortex.net", true },
{ "messdorferfeld.de", true },
{ "messenger.co.tz", true },
- { "messenger.com", false },
+ { "messenger.com", true },
{ "messengerwebbrands.com", true },
{ "messer24.ch", true },
{ "messymom.com", true },
@@ -23069,20 +23988,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mestr.es", true },
{ "mesvt.com", true },
{ "meta-db.com", true },
+ { "meta-word.com", true },
{ "meta.sc", true },
{ "metachris.com", true },
{ "metacoda.com", true },
{ "metacode.biz", true },
{ "metadatawiki.com", true },
+ { "metaether.net", true },
{ "metafurquest.net", true },
+ { "metallosajding.ru", true },
{ "metalu.ch", true },
- { "metanic.org", true },
{ "metanic.services", true },
{ "metanodo.com", true },
{ "metapeen.nl", true },
- { "metaregistrar.com", true },
{ "metasquare.com.au", true },
{ "metasquare.nyc", true },
+ { "metasysteminfo.com", true },
+ { "metaurl.io", true },
+ { "metaword.com", true },
+ { "metaword.net", true },
+ { "metaword.org", true },
{ "meteenonline.nl", true },
{ "meteo-parc.com", true },
{ "meteo-r.ovh", true },
@@ -23094,17 +24019,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "meteobox.mx", true },
{ "meteobox.pl", true },
{ "meteobox.sk", true },
+ { "meteocat.net", true },
{ "meteorapp.space", true },
+ { "meteorologiaenred.com", true },
{ "meteosherbrooke.com", true },
{ "meteosmit.it", true },
{ "meterhost.com", true },
{ "methamphetamine.co.uk", true },
{ "methylone.com", true },
+ { "metric.ai", true },
+ { "metricmutt.com", true },
{ "metro-lawn-care.com", true },
{ "metro-web.net", true },
{ "metroairvirtual.com", true },
{ "metrobriefs.com", true },
{ "metrolush.com", true },
+ { "metron-eging.com", true },
+ { "metron-networks.com", true },
{ "metronaut.de", true },
{ "metropop.ch", true },
{ "metsasta.com", true },
@@ -23117,6 +24048,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mevs.cz", true },
{ "mexican.dating", true },
{ "mexicanjokes.net", true },
+ { "mexico.sh", true },
{ "mexicom.org", true },
{ "meyash.co", true },
{ "mf-fischer.de", true },
@@ -23127,9 +24059,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mflodin.se", true },
{ "mfxbe.de", true },
{ "mfz.mk", true },
- { "mgcraft.net", true },
{ "mgi.gov", true },
{ "mgknet.com", true },
+ { "mglink.be", true },
{ "mgrossklaus.de", true },
{ "mgrt.net", true },
{ "mgsisk.com", true },
@@ -23147,19 +24079,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mi-beratung.de", true },
{ "mi-so-ji.com", true },
{ "mi80.com", true },
+ { "miadennees.com", true },
{ "miagexport.com", true },
{ "miaonagemi.com", true },
{ "miaoubox.com", true },
{ "miaowo.org", true },
{ "miasarafina.de", true },
+ { "miavierra.org", true },
{ "mibuiin.com", true },
{ "micado-software.com", true },
- { "micaiahparker.com", true },
{ "micalodeal.ch", true },
- { "micasamgmt.com", false },
{ "micbase.com", true },
{ "michadenheijer.com", true },
{ "michael-schefczyk.de", true },
+ { "michael-schilling.de", true },
{ "michael-steinhauer.eu", true },
{ "michael.band", true },
{ "michaelasawyer.com", true },
@@ -23170,16 +24103,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "michaelhrehor.com", true },
{ "michaeliscorp.com", true },
{ "michaelismold.com", true },
- { "michaelizquierdo.com", true },
{ "michaeljdennis.com", true },
{ "michaelkuchta.me", true },
{ "michaelleibundgut.com", true },
{ "michaelpelletterie.it", true },
{ "michaelpfrommer.de", true },
{ "michaelpfrommer.pub", true },
- { "michaels-homepage-service.de", true },
{ "michaelschmidt.ch", true },
{ "michaelschubert.com", true },
+ { "michaelschule-rheine.de", true },
+ { "michaelslatkine.com", true },
{ "michaelsnoeren.nl", true },
{ "michaelsweater.com", true },
{ "michaeltaboada.me", true },
@@ -23194,7 +24127,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "michalwiglasz.cz", true },
{ "michaonline.de", true },
{ "michel-wein.de", true },
- { "michele.ml", true },
{ "michellavat.com", true },
{ "michiganstateuniversityonline.com", true },
{ "michiganunionoptout.com", true },
@@ -23205,6 +24137,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "miconcinemas.com", true },
{ "micr.io", true },
{ "micr0lab.org", true },
+ { "microbiote-insectes-vecteurs.group", true },
{ "microco.sm", true },
{ "microcomploja.com.br", true },
{ "microdots.de", true },
@@ -23213,12 +24146,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "micromata.de", true },
{ "microsoftaffiliates.azurewebsites.net", true },
{ "microvb.com", true },
+ { "microzubr.com", true },
{ "midair.io", true },
{ "midasjewellery.com.au", true },
{ "midgawash.com", true },
+ { "midislandrealty.com", true },
{ "midkam.ca", true },
{ "midlandgate.de", true },
{ "midlandleisuresales.co.uk", true },
+ { "midlandroofingri.com", true },
{ "midlandsfundays.co.uk", true },
{ "midlandsphotobooths.co.uk", true },
{ "midlgx.com", true },
@@ -23226,9 +24162,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "midnightmango.co.uk", true },
{ "midnightmango.de", true },
{ "midnightmechanism.com", true },
+ { "midrandplumber24-7.co.za", true },
{ "midstatebasement.com", true },
{ "midtowndentistry.com", true },
{ "midwestbloggers.org", true },
+ { "midwestplus.com", true },
{ "midweststructuralrepair.com", true },
{ "miegl.com", true },
{ "miembarcacion.com", true },
@@ -23239,6 +24177,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "miffy.me", true },
{ "mig5.net", true },
{ "miggy.org", true },
+ { "mightysighty.com", true },
{ "miguel.pw", true },
{ "migueldemoura.com", true },
{ "migueldominguez.ch", true },
@@ -23257,7 +24196,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mijnkinderkleding.com", true },
{ "mijnpartijhandel.nl", true },
{ "mijnreisoverzicht.nl", true },
- { "mijnsite.ovh", true },
{ "mijnstembureau.nl", true },
{ "mijntransacties.nl", true },
{ "mika.moe", true },
@@ -23276,7 +24214,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mikeguy.co.uk", true },
{ "mikehamburg.com", true },
{ "mikehilldesign.co.uk", true },
- { "mikerichards.photography", false },
+ { "mikekreuzer.com", true },
+ { "mikerichards.photography", true },
{ "miketabor.com", true },
{ "miketheuer.com", true },
{ "mikevesch.com", true },
@@ -23288,17 +24227,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mikhlevich.ru", true },
{ "miki-boras.de", true },
{ "miki.it", true },
+ { "mikkelladegaard.dk", true },
+ { "mikkelscheike.com", true },
+ { "mikkelvej.dk", true },
{ "mikkonen.bio", true },
{ "miklcct.com", true },
{ "miknight.com", true },
{ "mikonmaa.fi", true },
- { "mikori.sk", true },
{ "mikropixel.de", true },
{ "mikroskeem.eu", true },
{ "miku.cloud", true },
{ "miku.party", true },
{ "mikumaycry.com", true },
{ "mikumiku.stream", true },
+ { "mikupic.com", true },
{ "mikywow.eu", true },
{ "mil-spec.ch", true },
{ "mil0.com", true },
@@ -23337,9 +24279,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "millions19.com", true },
{ "millions20.com", true },
{ "millions22.com", true },
- { "millions25.com", true },
- { "millions26.com", true },
- { "millions27.com", true },
{ "millions28.com", true },
{ "millions29.com", true },
{ "millions31.com", true },
@@ -23408,9 +24347,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mindleaking.org", true },
{ "mindoktor.se", false },
{ "mindorbs.com", true },
+ { "mindox.com.br", true },
{ "mindstretchers.co.uk", true },
- { "mindwork.space", true },
{ "mine-craftlife.com", true },
+ { "mine-pixl.de", true },
{ "mine260309.me", true },
{ "minebier.dk", true },
{ "minecraft-forum.eu", true },
@@ -23418,6 +24358,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "minecraftforum.de", true },
{ "minecraftforum.ovh", true },
{ "minecraftjson.com", true },
+ { "minecraftstal.com", true },
{ "minehattan.de", true },
{ "minehub.de", true },
{ "minei.me", true },
@@ -23431,10 +24372,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "minetracker.dk", true },
{ "minez-nightswatch.com", false },
{ "minf3-games.de", true },
+ { "minfin.gov.ua", true },
{ "mingky.net", true },
{ "mingming.info", true },
{ "mingram.net", true },
+ { "mingtreerealty.com", true },
{ "mingwah.ch", true },
+ { "minh.at", false },
{ "mini2.fi", true },
{ "minigames.com", true },
{ "miniglueck.net", true },
@@ -23448,23 +24392,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "minimayhemsoftplay.co.uk", true },
{ "minimbah.com.au", true },
{ "minimvc.com", true },
+ { "miningtronics.com", true },
{ "ministeriumfuerinternet.de", true },
{ "minitruckin.net", true },
{ "minitrucktalk.com", true },
+ { "miniwallaby.com", true },
{ "minkymoon.jp", true },
{ "minnesotakinkyyouth.org", true },
{ "minnesotamathcorps.org", true },
{ "minnesotareadingcorps.org", true },
{ "minnit.chat", true },
{ "minobar.com", true },
- { "minorshadows.net", true },
{ "minpingvin.dk", true },
{ "minschuns.ch", true },
{ "mintclass.com", true },
{ "mintosherbs.com", true },
{ "mintrak2.com", true },
{ "mintse.com", true },
+ { "minu.link", true },
{ "minube.co.cr", true },
+ { "minutashop.ru", true },
{ "minux.info", true },
{ "mipapo.de", true },
{ "mipla.ch", true },
@@ -23482,7 +24429,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mirepublic.co.nz", true },
{ "mireservaonline.es", true },
{ "mirfire.com", true },
- { "mirjamderijk.nl", true },
+ { "mirjamderijk.nl", false },
{ "mirkofranz.de", true },
{ "mironet.cz", true },
{ "mirrorbot.ga", true },
@@ -23505,6 +24452,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "missdream.org", true },
{ "misseguf.dk", true },
{ "missevent.pl", true },
+ { "missguidedus.com", true },
{ "mission-orange.de", true },
{ "missionsgemeinde.de", true },
{ "missip.nl", true },
@@ -23518,6 +24466,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "missyou.link", true },
{ "mistacms.com", true },
{ "mister-cooks.fr", true },
+ { "mister-matthew.de", true },
{ "mistreaded.com", true },
{ "mistybox.com", true },
{ "misupport.dk", true },
@@ -23531,6 +24480,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mitarbeitermotivation-anleitungen.de", true },
{ "mitchellhandymanservices.co.uk", true },
{ "mitchelmore.ca", true },
+ { "mitdip-mit-group-ch.azurewebsites.net", true },
{ "miticobikes.com", true },
{ "mitigationcommission.gov", true },
{ "mitnetz-gas.de", true },
@@ -23540,14 +24490,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mitrecaasd.org", true },
{ "mitremai.org", true },
{ "mitrostudios.com", true },
- { "mitsign.com", true },
{ "mitsu-szene.de", true },
{ "mitsukabose.com", true },
{ "mittagonggardencentre.com.au", true },
{ "mittagonghomestead.com.au", true },
{ "mittelunsachlich.de", true },
{ "mittenofficesystems.com", true },
- { "mityinc.com", true },
{ "mitylite.com", true },
{ "mitzpettel.com", true },
{ "miui-germany.de", true },
@@ -23574,13 +24522,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mizipack.com", true },
{ "mizque.ch", true },
{ "mizu.coffee", true },
+ { "mizucoffee.net", true },
{ "mizuho-trade.net", true },
{ "mizumax.me", true },
{ "mj420.com", true },
{ "mjacobson.net", true },
{ "mjanja.ch", true },
{ "mjasm.org", true },
- { "mjec.net", true },
{ "mjlaurindo.pt", true },
{ "mjmedia.co.za", true },
{ "mjmnagy.info", true },
@@ -23598,8 +24546,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mkes.com", true },
{ "mkg-chirurgie-bruchsal.de", true },
{ "mkg-scherer.de", true },
+ { "mkg-wiebelskirchen.de", true },
{ "mkhsoft.eu", true },
{ "mkimage.com", true },
+ { "mkjl.ml", true },
{ "mkk.de", true },
{ "mkkkrc.ru", true },
{ "mklpedia.de", true },
@@ -23611,11 +24561,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mkset.ru", true },
{ "mktdigital.info", true },
{ "mktemp.org", true },
- { "mkuznets.com", true },
{ "mlarte.com", true },
{ "mlcnfriends.com", true },
{ "mlemay.com", true },
{ "mlfaw.com", true },
+ { "mlii.net", true },
{ "mlmjam.com", true },
{ "mlp.ee", true },
{ "mlpvector.club", true },
@@ -23626,10 +24576,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mm404.com", true },
{ "mma-acareporting.com", true },
{ "mmalisz.com", true },
- { "mmaps.ddns.net", true },
+ { "mmaps.org", true },
{ "mmbb.org", true },
{ "mmin.us", false },
{ "mmmarco.com", true },
+ { "mmmaximaliselmeny.hu", true },
{ "mmogah.com", true },
{ "mmonit.com", true },
{ "mms.is", true },
@@ -23638,7 +24589,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mna7e.com", true },
{ "mncloud.de", true },
{ "mncr.nl", true },
- { "mnd.sc", true },
{ "mne.moe", true },
{ "mnedc.org", true },
{ "mnemonic.ninja", true },
@@ -23654,9 +24604,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mo-journal.com", true },
{ "mo.nl", true },
{ "mo2021.de", true },
- { "mo3.club", true },
{ "moa.moe", true },
- { "mobag.ru", true },
{ "mobal.com", true },
{ "mobi4.tk", true },
{ "mobidea.com", true },
@@ -23695,7 +24643,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mochanstore.com", true },
{ "mockerel.com", true },
{ "mococo.co.uk", true },
- { "mocurio.com", false },
{ "modaexecutiva.com.br", true },
{ "modafinil.com", true },
{ "modafinil.wiki", true },
@@ -23706,19 +24653,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "modding-welt.com", true },
{ "mode-hautnah.de", true },
{ "mode-individuell.de", true },
+ { "modehaus-marionk.de", true },
{ "modelcase.co.jp", false },
{ "modelclub-draveil.eu", true },
{ "modelcube.com", true },
- { "modeldimension.com", true },
{ "modelisme-rc.net", true },
{ "modelisme-voiture-rc.fr", true },
{ "modelservis.cz", true },
{ "modemaille.com", true },
+ { "modemchild.net", true },
{ "modeportaal.nl", true },
{ "moderatoren.org", true },
{ "moderatorenpool.org", true },
+ { "modern-family.tv", true },
{ "modernapprenticeships.org", true },
{ "moderncoinmart.com", true },
+ { "moderncommercialrealestate.com", true },
{ "modifiedmind.com", true },
{ "modistry.com", true },
{ "modistryusercontent.com", true },
@@ -23728,10 +24678,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "modul21.eu", true },
{ "module.market", true },
{ "modulex-gmbh.de", true },
+ { "moe-max.jp", true },
+ { "moeali.com", true },
{ "moechel.com", true },
+ { "moeclue.com", true },
{ "moefactory.com", true },
+ { "moehrke.cc", true },
+ { "moekes.amsterdam", true },
{ "moeking.me", true },
+ { "moeli.org", true },
{ "moellers.systems", true },
+ { "moeqing.net", true },
{ "moetrack.com", true },
{ "moeyoo.net", true },
{ "mofohome.dyndns.org", true },
@@ -23750,6 +24707,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mojefedora.cz", true },
{ "mojilitygroup.com", true },
{ "mojizuri.com", true },
+ { "mojnet.eu", true },
+ { "mojnet.net", true },
+ { "mojoco.co.za", true },
{ "mojzis.com", true },
{ "mojzis.cz", true },
{ "mojzisova.com", true },
@@ -23765,6 +24725,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "molun.net", false },
{ "molunerfinn.com", true },
{ "molwick.com", true },
+ { "momentsofimpact.info", true },
{ "momentum.photos", true },
{ "momentumdash.com", true },
{ "momirfarooq.com", true },
@@ -23775,8 +24736,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "momut.org", true },
{ "momy-genealogie.info", true },
{ "mon-partage.fr", true },
- { "mon-trafic.com", true },
+ { "mon-trafic.com", false },
{ "mon22.ch", true },
+ { "mona-antenna.com", true },
{ "mona-dress.com", true },
{ "monachatdeco.com", true },
{ "monaco-automaten.de", true },
@@ -23787,6 +24749,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "monbudget.org", true },
{ "moncoach.ch", true },
{ "mondedesnovels.com", true },
+ { "mondedie.fr", true },
{ "mondial-movers.nl", true },
{ "mondo-it.ch", true },
{ "moneni.com", true },
@@ -23799,9 +24762,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "moneypark.ch", true },
{ "moneytoday.se", true },
{ "mongolieenfrance.fr", true },
- { "monique.io", true },
{ "moniquedekermadec.com", true },
{ "moniquemunhoz.com.br", true },
+ { "monitman.com", true },
{ "monitoring.kalisz.pl", true },
{ "monitzer.com", true },
{ "monix.io", true },
@@ -23814,10 +24777,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "monlabs.com", true },
{ "monloyer.quebec", true },
{ "monnyonle.hu", true },
- { "mono.cafe", true },
+ { "mono.cafe", false },
{ "mono0x.net", true },
{ "monobank.no", true },
{ "monodukuri.com", true },
+ { "monokoo.com", true },
{ "monolithapps.com", true },
{ "monolithindustries.com", true },
{ "monolithinteractive.com", true },
@@ -23831,9 +24795,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "monpetitmobile.com", true },
{ "monsieursavon.ch", true },
{ "monstermashentertainments.co.uk", true },
- { "montage-kaika.de", true },
+ { "monsterx.cn", true },
+ { "montage-kaika.de", false },
{ "montagne-tendance.ch", true },
{ "montanasky.tv", true },
+ { "montanteaesthetics.com", true },
{ "montanwerk.de", true },
{ "montarfotoaki.com", true },
{ "montas.io", true },
@@ -23841,6 +24807,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "montemanik.com", true },
{ "montenero.pl", true },
{ "montessori.edu.vn", true },
+ { "montgomerysoccer.net", true },
{ "montopolis.com", true },
{ "montpreveyres.ch", true },
{ "montredeal.fr", true },
@@ -23857,16 +24824,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "moolah.rocks", true },
{ "moon.fish", true },
{ "moonagic.com", true },
- { "moonagic.io", true },
{ "moonbench.xyz", true },
{ "moonbot.io", true },
+ { "moonchart.co.uk", true },
{ "moondrop.org", true },
{ "moonkin.eu", true },
{ "moonlightcapital.ml", true },
{ "moonmelo.com", true },
{ "moonraptor.co.uk", true },
{ "moonraptor.com", true },
- { "moonrhythm.info", false },
{ "moonshyne.org", true },
{ "moontaj.com", true },
{ "moonvpn.org", true },
@@ -23881,6 +24847,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mooselook.de", true },
{ "moosmann-moehrle.de", true },
{ "moot-info.co.za", true },
+ { "moovablestorage.com", true },
{ "moparcraft.net", true },
{ "moparisthebest.com", true },
{ "moparisthebest.net", true },
@@ -23893,6 +24860,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "morbatex.com", true },
{ "morbiceramicindustry.com", true },
{ "morbitzer.de", true },
+ { "morbius.cz", true },
{ "morbotron.com", true },
{ "morchino.ch", true },
{ "morchstore.com", true },
@@ -23900,16 +24868,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "more-terrain.de", true },
{ "moreal.co", true },
{ "moreapp.co.uk", true },
- { "morenci.ch", true },
+ { "moreniche.com", true },
{ "morepablo.com", true },
{ "morepay.cn", true },
- { "morepopcorn.co.nz", true },
- { "morespacestorage.com.au", true },
{ "moresw.com", true },
{ "morethanautodealers.com", true },
{ "morethancode.be", true },
{ "morethandigital.info", true },
{ "morganino.it", true },
+ { "morgansjewelerspv.com", true },
{ "morgansleisure.co.uk", true },
{ "morgner.com", true },
{ "morhys.com", true },
@@ -23917,18 +24884,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "moritztremmel.de", true },
{ "moriz.de", true },
{ "moriz.net", true },
+ { "mormon-colleges.com", true },
{ "mormonleaks.io", true },
{ "morningcurve.com", true },
{ "morningstar.moe", true },
{ "morphy2k.io", true },
{ "morrisby.com", true },
- { "morrodafumacanoticias.com.br", true },
{ "morteruelo.net", true },
{ "mortgagecalculator.biz", true },
{ "mortgagecentersmo.com", true },
{ "mortis.eu", true },
{ "mosaic-design.ru", true },
{ "mosaicadvisors.com", true },
+ { "mosaicmarble.com", true },
{ "moscatalogue.net", true },
{ "moscow.dating", true },
{ "mosfet.cz", true },
@@ -23940,7 +24908,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mosscade.com", true },
{ "mosshi.be", true },
{ "mosstier.com", true },
- { "mostlyinfinite.com", true },
+ { "mostholynameofjesus.org", true },
+ { "mostlikelyto.fail", true },
{ "mostlyoverhead.com", true },
{ "motd.ch", true },
{ "motd.today", true },
@@ -23957,6 +24926,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "motifstudio.com.ua", true },
{ "motionless.nl", true },
{ "motiweb.fr", true },
+ { "motocollection.pl", true },
{ "motohell.com", true },
{ "motojato.com.br", true },
{ "motonauticaibiza.com", true },
@@ -23970,8 +24940,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "motovated.co.nz", true },
{ "motowilliams.com", true },
{ "motstats.co.uk", true },
- { "mottomortgage.com", true },
- { "moucloud.cn", true },
{ "moulinaparoles.ca", true },
{ "mountain-rock.ru", true },
{ "mountainactivitysection.org.uk", true },
@@ -23981,8 +24949,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mountfarmer.de", true },
{ "mousemessages.com", true },
{ "moutiezhaller.com", true },
+ { "movacare.de", true },
{ "move.mil", true },
- { "moveek.com", true },
{ "moveisfit.com.br", true },
{ "movember.com", false },
{ "movewellnesslab.com", true },
@@ -23997,6 +24965,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "movienized.de", true },
{ "moviepilot.com", true },
{ "moviesetc.net", true },
+ { "moviespur.info", false },
{ "moviko.nz", true },
{ "movil.uno", true },
{ "moviltronix.com", true },
@@ -24009,7 +24978,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "moyer.pub", true },
{ "moylen.eu", true },
{ "moyoo.net", true },
- { "moysovet.info", true },
{ "mozartgroup.hu", true },
{ "mozektevidi.net", true },
{ "mozilla.cz", true },
@@ -24018,7 +24986,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mpac.ca", false },
{ "mpc-hc.org", true },
{ "mpcompliance.com", true },
- { "mpe.org", true },
{ "mpetroff.net", true },
{ "mpg-universal.com", true },
{ "mpg.ovh", true },
@@ -24026,20 +24993,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mphoto.at", true },
{ "mpintaamalabanna.it", true },
{ "mpkrachtig.nl", true },
+ { "mpkshop.com.br", true },
{ "mplanetphl.fr", true },
{ "mplant.io", true },
{ "mplicka.cz", true },
{ "mplusm.eu", true },
{ "mpnpokertour.com", true },
{ "mpodraza.pl", true },
- { "mpreserver.com", false },
{ "mprsco.eu", true },
{ "mpsgarage.com.au", true },
{ "mpsoundcraft.com", true },
- { "mpu-giessen.com", true },
{ "mpu-vorbereitung.com", true },
{ "mpy.ovh", true },
- { "mqas.net", true },
{ "mr-anderson.org", true },
{ "mr-designer-oman.com", true },
{ "mr-labo.jp", true },
@@ -24067,16 +25032,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mrketolocksmith.com", true },
{ "mrknee.gr", true },
{ "mrkrabat.de", true },
- { "mrmoregame.de", true },
{ "mrnh.de", true },
{ "mrning.com", true },
{ "mrprintables.com", true },
{ "mrs-labo.jp", true },
+ { "mrs-shop.com", true },
{ "mrsbairds.com", false },
{ "mrserge.lv", true },
{ "mrsk.me", true },
{ "mrstat.co.uk", true },
{ "mrtunnel.club", true },
+ { "mruczek.ga", true },
{ "mrv.li", true },
{ "mrx.one", true },
{ "mrxn.net", true },
@@ -24094,6 +25060,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mshemailmarketer.com.au", true },
{ "msi-zlin.cz", true },
{ "msiegmund.com", true },
+ { "msize48.ch", true },
{ "msmails.de", true },
{ "msno.no", true },
{ "msnr.net", true },
@@ -24101,6 +25068,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mspsocial.net", true },
{ "msquadrat.de", true },
{ "msroot.de", true },
+ { "mssora.com", true },
{ "mssys.de", true },
{ "mstdn.blue", true },
{ "mstdn.club", true },
@@ -24111,12 +25079,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "msv-limpezas.pt", true },
{ "msx.org", true },
{ "mszavodumiru.cz", true },
+ { "mt-bank.jp", true },
{ "mt.search.yahoo.com", false },
{ "mt2414.com", true },
+ { "mta.org.ua", true },
{ "mtane0412.com", true },
{ "mtasa.com", true },
{ "mtasa.hu", true },
- { "mtb.wtf", true },
+ { "mtauburnassociates.com", true },
{ "mtcq.jp", true },
{ "mtd.org", true },
{ "mtg-tutor.de", true },
@@ -24133,7 +25103,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mtltransport.com", true },
{ "mtnz.co.za", true },
{ "mtouch.facebook.com", false },
- { "mtr.md", true },
{ "mtrip.com", true },
{ "mtrock.ru", true },
{ "mts-energia.eu", true },
@@ -24153,6 +25122,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mudgezero.one", true },
{ "muehlemann.net", true },
{ "muel.io", true },
+ { "muelhau.pt", true },
{ "muell-weg.de", true },
{ "muellapp.com", true },
{ "muenchberger.com", true },
@@ -24162,7 +25132,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "muh.io", true },
{ "mui.jp", true },
{ "muitadica.com", true },
+ { "mujerfutura.com", true },
{ "muk-kobetsu.com", true },
+ { "mukilteodentalarts.com", true },
+ { "mukilteoeuropeanautorepair.com", true },
{ "mulaccosmetics.com", true },
{ "mulaisehat.com", true },
{ "mulej.net", true },
@@ -24176,7 +25149,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "multigeist.de", true },
{ "multikalender.de", false },
{ "multimail.work", true },
- { "multimatte.com", true },
+ { "multimatte.com", false },
{ "multimed.krakow.pl", true },
{ "multimedia-pool.com", true },
{ "multiplayernow.com", true },
@@ -24189,10 +25162,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "multitek.no", true },
{ "multitenantlaravel.com", true },
{ "multitheftauto.com", true },
- { "multizone.games", true },
{ "multrier.fr", true },
{ "mum.ceo", true },
- { "muma.ml", true },
{ "mumakil.fi", true },
{ "mumbaionlinegifts.com", true },
{ "muminkoykiran.com", true },
@@ -24200,6 +25171,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "munch.me", true },
{ "munchcorp.com", true },
{ "mundoarabe.com.br", true },
+ { "mundodapoesia.com", true },
{ "mundodasmensagens.com", true },
{ "mundokinderland.com.br", true },
{ "mundolarraz.es", true },
@@ -24208,9 +25180,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mundtec.com.br", true },
{ "munduch.cz", true },
{ "munduch.eu", true },
+ { "munirajiwa.com", true },
{ "munki.org", true },
{ "munkibuilds.org", true },
- { "munwr.com", false },
+ { "munwr.com", true },
{ "muqu.co", true },
{ "mur-vegetal-interieur.fr", true },
{ "murakami-sah.com", true },
@@ -24222,11 +25195,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "murof.com.br", true },
{ "murray.xyz", true },
{ "murraya.cn", true },
+ { "murzik.space", true },
+ { "musa.gallery", true },
{ "musaccostore.com", true },
{ "muscle-tg.com", true },
{ "musclecarresearch.com", true },
{ "muscolinomusic.com", true },
- { "musearchengine.com", true },
{ "musehelix.com", true },
{ "muses-success.info", true },
{ "musettishop.com", true },
@@ -24237,13 +25211,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "music-project.eu", true },
{ "music-world.pl", true },
{ "music.amazon.com", true },
- { "musicaconleali.it", true },
{ "musicalive.nl", true },
{ "musicall.com", true },
{ "musicalschwarzenburg.ch", true },
{ "musicapara.net", true },
+ { "musicasbr.com.br", true },
{ "musicchris.de", true },
- { "musiccitycats.com", true },
+ { "musicdemons.com", true },
{ "musicgamegalaxy.de", true },
{ "musician.dating", true },
{ "musickhouseleveling.com", true },
@@ -24254,6 +25228,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "musik-mentaltraining.ch", true },
{ "musikverein-elten.de", true },
{ "musikzentrale.net", true },
+ { "musique2nuit.com", true },
{ "musketonhaken.nl", false },
{ "muslim.singles", true },
{ "musmann.io", true },
@@ -24276,16 +25251,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "muthai.in.th", true },
{ "mutuals.cool", true },
{ "mutuelle.fr", true },
+ { "muurlingoogzorg.nl", true },
{ "muusika.fun", true },
{ "muusikoiden.net", true },
{ "muwatenraqamy.org", true },
+ { "muz2u.ru", true },
{ "muzeumkomiksu.eu", true },
- { "muzhijy.com", true },
+ { "muzhijy.com", false },
{ "muzikantine.nl", true },
+ { "mv-schnuppertage.de", true },
{ "mv-wohnen.de", true },
{ "mvandek.nl", true },
{ "mvbits.com", true },
{ "mvisioncorp.com", true },
+ { "mvistatic.com", true },
{ "mvno.io", true },
{ "mvp-stars.com", true },
{ "mw.search.yahoo.com", false },
@@ -24296,7 +25275,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mwe.st", true },
{ "mwezi-foundation.org", true },
{ "mwezi.org", true },
- { "mwohlfarth.de", true },
{ "mwtdev.se", true },
{ "mww.moe", true },
{ "mx-quad.fr", true },
@@ -24315,6 +25293,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "my-dns.co.il", true },
{ "my-ebook.es", true },
{ "my-floor.com", true },
+ { "my-gode.fr", true },
{ "my-host.ovh", true },
{ "my-hps.de", true },
{ "my-ip.work", true },
@@ -24332,7 +25311,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "my4thtelco.sg", true },
{ "myabcm.com", true },
{ "myaccount.google.com", false },
- { "myactivity.google.com", true },
+ { "myactivity.google.com", false },
{ "myadpost.com", true },
{ "myadself.com", true },
{ "myaggic.com", true },
@@ -24355,7 +25334,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mybodylife.com", true },
{ "mybon.at", false },
{ "mybonfire.com", true },
+ { "mybreastcancerjourney.com", true },
{ "mybusiness.wien", true },
+ { "mycamda.com", true },
{ "mycard.moe", true },
{ "mycareersfuture.sg", true },
{ "mycieokien.info", false },
@@ -24368,7 +25349,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mycofairtrade.com", true },
{ "mycompanion.cz", true },
{ "myconan.net", true },
+ { "myconnect.cn", true },
{ "myconsulting.ch", true },
+ { "mycookrecetas.com", true },
{ "mycr.eu", true },
{ "mycreativeartsconsulting.com", true },
{ "mycreativenook.com", true },
@@ -24376,7 +25359,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mycrypnet.io", true },
{ "mycrypto.com", true },
{ "mycrystalgrove.com", true },
- { "mycuco.it", true },
{ "mycustomwriting.com", true },
{ "mydarkstar.net", true },
{ "mydaywebapp.com", true },
@@ -24387,6 +25369,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mydigitalweek.com", true },
{ "mydjsongbook.com", true },
{ "mydna.bio", true },
+ { "mydoc.fr", true },
{ "mydocserve.com", true },
{ "mydomaindesk.com", true },
{ "mydreamlifelab.com", true },
@@ -24397,14 +25380,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "myeberspaecher.com", true },
{ "myeffect.today", true },
{ "myeisenbahn.de", true },
- { "myeml.net", true },
- { "myessaygeek.com", true },
+ { "myeml.net", false },
{ "myetherwallet.com", true },
{ "myf.cloud", true },
{ "myfantasysportstalk.com", true },
{ "myfedloan.org", true },
{ "myfirenet.com", true },
- { "myfishpalace.at", true },
{ "myfloridadeferredcomp.com", true },
{ "myforfaitmobile.com", true },
{ "myfreemp3.click", true },
@@ -24412,11 +25393,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "myg21.com", true },
{ "mygadgetguardian.lookout.com", false },
{ "mygallery.homelinux.net", true },
+ { "mygate.at", false },
+ { "mygedit.com", true },
{ "mygeneral.org", true },
{ "mygeotrip.com", true },
+ { "mygest.me", true },
{ "mygigabitnation.com", true },
{ "mygignation.com", true },
{ "mygirlfriendshouse.com", true },
+ { "mygnmr.com", true },
{ "mygoldennetwork.com", true },
{ "mygreatjobs.de", true },
{ "mygreatlakes.org", true },
@@ -24428,23 +25413,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "myhealthreviews.com", true },
{ "myhollywoodnews.com", true },
{ "myhome-24.pl", true },
+ { "myhostname.net", true },
+ { "myhuthwaite.com", true },
{ "myimds.com", true },
{ "myimmitracker.com", true },
{ "myinvite.nl", true },
+ { "myipaddr.de", true },
{ "myipv4.de", true },
{ "myjudo.net", true },
{ "myjumparoo.co.uk", true },
- { "myjumpsuit.de", true },
{ "myki.co", true },
{ "mykontool.de", true },
+ { "mykumedir.com", true },
{ "mylatestnews.org", true },
{ "mylawyer.be", true },
{ "myleanfactory.de", true },
+ { "mylife360mag.com", true },
{ "mylifeabundant.com", true },
{ "mylittlechat.ru", true },
{ "myliveupdates.com", true },
{ "mylookout.com", false },
{ "mylstrom.com", true },
+ { "mylucknursinghome.com", true },
+ { "mymadina.com", true },
{ "mymall.co.jp", true },
{ "mymarketingcourses.com", true },
{ "mymb.pm", true },
@@ -24461,12 +25452,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mymx.lu", true },
{ "myna.go.jp", true },
{ "mynameistavis.com", true },
+ { "mynetworkingbuddy.com", true },
{ "mynext.events", true },
{ "mynextmove.org", true },
{ "mynn.io", true },
{ "mynook.info", false },
{ "mynortherngarden.com", true },
{ "myonline.hu", true },
+ { "myonlinevehicleinsurance.com", true },
{ "myoptumhealthcomplexmedical.com", true },
{ "myoptumhealthparentsteps.com", true },
{ "myotopie.de", true },
@@ -24483,17 +25476,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "myowndisk.net", true },
{ "myownwebinar.com", true },
{ "mypaperdone.com", true },
+ { "mypartybynoelia.es", true },
{ "mypayoffloan.com", true },
{ "mypcqq.cc", true },
{ "myperfecthome.ca", true },
- { "myperfumecollection.com", true },
{ "myperks.in", true },
{ "myphotoshopbrushes.com", true },
{ "mypillcard.com", true },
+ { "mypizza-bremen.de", true },
{ "myplaceonline.com", true },
+ { "mypnu.net", true },
{ "mypress.mx", true },
{ "myprintcard.de", true },
{ "myproblog.com", true },
+ { "myprotime.eu", true },
{ "myproxy.eu.org", true },
{ "mypup.nl", true },
{ "myrandomtips.com", true },
@@ -24501,6 +25497,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "myraytech.net", false },
{ "myrealestatemate.com.au", true },
{ "myrealestateschool.com", true },
+ { "myred.net", true },
{ "myref.net", true },
{ "myrekber.co.id", true },
{ "myrent.quebec", true },
@@ -24560,7 +25557,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "myrotvorets.center", true },
{ "myrotvorets.news", true },
{ "myrp.co", true },
- { "mysber.ru", true },
{ "myschoolphoto.org", true },
{ "myseatime.com", true },
{ "mysecretcase.com", false },
@@ -24568,6 +25564,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "myself5.de", true },
{ "myseo.ga", true },
{ "myserv.one", true },
+ { "myservicearl.com", true },
{ "myseu.cn", true },
{ "mysexydate24.com", true },
{ "myshirtsize.com", true },
@@ -24578,7 +25575,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mysockfactory.ch", true },
{ "mysockfactory.com", true },
{ "mysocrat.com", true },
- { "mysongbird.xyz", true },
{ "mysoundtalks.com", false },
{ "myspicer.com", true },
{ "mysqldump-secure.org", true },
@@ -24587,7 +25583,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mysteriouscode.io", true },
{ "mysterydata.com", true },
{ "mysterymind.ch", true },
- { "mysterysear.ch", true },
{ "mystic-welten.de", true },
{ "mystickphysick.com", true },
{ "mysticplumes.com", true },
@@ -24596,6 +25591,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "mysupboard.de", true },
{ "myswissmailaddress.com", true },
{ "mytfg.de", true },
+ { "mythemeshop.com", false },
{ "mythengay.ch", true },
{ "mythicdelirium.com", true },
{ "myting.net", true },
@@ -24623,25 +25619,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "myvoipnews.com", true },
{ "myvpl.com", true },
{ "mywari.com", true },
- { "mywebinar.io", true },
{ "mywebmanager.co.uk", true },
{ "mywebpanel.eu", true },
- { "mywebpanel.nl", true },
+ { "myweddingaway.co.uk", true },
{ "myweddingreceptionideas.com", true },
{ "myworkinfo.com", false },
{ "myworth.com.au", true },
{ "myxnr.com", true },
{ "myyubikey.net", true },
{ "myyubikey.org", true },
+ { "myzhili.com", true },
{ "myzina.cz", false },
{ "mz-mz.net", true },
{ "mza.com", true },
{ "mzh.io", true },
- { "mziulu.me", false },
+ { "mzmtech.com", true },
{ "mznet.de", true },
+ { "mzstatic.cc", true },
{ "mzzj.de", true },
{ "n-a.date", true },
{ "n-design.de", true },
+ { "n-linear.org", true },
{ "n-m.lu", true },
{ "n-man.info", true },
{ "n-pix.com", false },
@@ -24654,11 +25652,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "n4v.eu", true },
{ "n5118.com", true },
{ "n6a.net", true },
- { "n7.education", true },
+ { "n8ch.net", true },
{ "n8mgt.com", true },
{ "n8nvi.com", true },
{ "n8solutions.net", true },
{ "na-school.nl", true },
+ { "naahgluck.de", true },
{ "naam.me", true },
{ "nabaleka.com", true },
{ "nabankco.com", true },
@@ -24669,16 +25668,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nachsenden.info", true },
{ "nachtmuziek.info", true },
{ "nacin.com", true },
+ { "nacktetatsachen.at", false },
{ "nacktwanderfreunde.de", true },
{ "nacyklo.cz", true },
{ "nadaquenosepas.com", true },
{ "nadejeproninu.cz", true },
{ "nadelholzkulturen.de", true },
+ { "naders.com", true },
{ "nadine-chaudier.net", true },
{ "nadsandgams.com", true },
{ "nadyaolcer.fr", true },
{ "nafod.net", true },
{ "naga-semi.com", true },
+ { "nagajanroshiya.info", true },
{ "nagashi.ma", false },
{ "nagaya.biz", true },
{ "nagb.gov", true },
@@ -24692,6 +25694,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nailattitude.ch", true },
{ "nailchiodo.com", true },
{ "nailsalon-aztplus.com", true },
+ { "nailtodayminneapolis.com", true },
{ "nairobibusinessreview.com", true },
{ "naive.network", true },
{ "najany.de", true },
@@ -24699,13 +25702,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "najany.fr", true },
{ "najany.nl", true },
{ "najany.se", true },
+ { "nakada4610.com", true },
{ "nakalabo.jp", true },
{ "nakama.tv", true },
{ "nakandya.com", true },
+ { "nakanishi-paint.com", true },
{ "nakayama.systems", true },
{ "nakedalarmclock.me", true },
{ "nakedtruthbeauty.com", true },
{ "nakene.com", true },
+ { "nakliyat.name.tr", true },
{ "nakliyatsirketi.biz.tr", true },
{ "nako.no", true },
{ "nalepky-na-zed.cz", true },
@@ -24722,7 +25728,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "namethissymbol.com", true },
{ "nametiles.co", true },
{ "nami.bo", true },
- { "nami.exchange", true },
{ "nami.trade", true },
{ "naminam.de", true },
{ "namrs.net", true },
@@ -24730,11 +25735,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "namu.live", true },
{ "namu.moe", true },
{ "namu.wiki", true },
- { "namuwikiusercontent.com", true },
{ "nanarose.ch", true },
{ "nanch.com", true },
+ { "nancytelford.com", true },
{ "nandex.org", true },
+ { "nange.cn", true },
{ "nange.co", true },
+ { "nani.io", true },
{ "nankiseamansclub.com", true },
{ "nannytax.ca", true },
{ "nanofy.org", true },
@@ -24764,6 +25771,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "narenderchopra.com", true },
{ "narfation.org", true },
{ "nargileh.nl", true },
+ { "naric.com", true },
{ "narindal.ch", true },
{ "narmos.ch", true },
{ "naro.se", true },
@@ -24781,15 +25789,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nashdistribution.com", true },
{ "nashikmatka.com", true },
{ "nashira.cz", true },
+ { "nashvillebasements.com", true },
{ "nashvillelidsurgery.com", true },
{ "nashzhou.me", true },
- { "nasme.tk", true },
{ "nasrsolar.com", true },
{ "nassi.me", true },
{ "nastoletni.pl", true },
{ "nataldigital.com", true },
{ "nataliedawnhanson.com", true },
- { "natanaelys.com", false },
{ "natation-nsh.com", false },
{ "natatorium.org", true },
{ "natchmatch.com", true },
@@ -24805,13 +25812,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nathankonopinski.com", true },
{ "nathansmetana.com", true },
{ "nathumarket.com.br", true },
- { "nationalbank.gov", true },
- { "nationalbanknet.gov", true },
{ "nationalcentereg.org", true },
{ "nationalcprfoundation.com", true },
{ "nationalcrimecheck.com.au", true },
{ "nationalhomequotes.com", true },
{ "nationalmap.gov", true },
+ { "nationalpassportservice.info", true },
{ "nationalpriorities.org", true },
{ "nationaltaxprep.com", true },
{ "nationaltrails.ru", true },
@@ -24820,7 +25826,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nativitynj.org", true },
{ "nativs.ch", true },
{ "natlec.com", true },
- { "natropie.pl", true },
{ "natsumihoshino.com", true },
{ "natur.com", true },
{ "natura-sense.com", true },
@@ -24841,7 +25846,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "naturtint.co.uk", true },
{ "natusvita.com.br", true },
{ "naude.co", true },
- { "naughty.audio", true },
{ "nausicaahotel.it", true },
{ "nautiljon.com", true },
{ "nautsch.de", true },
@@ -24850,7 +25854,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "navenlle.com", true },
{ "navienna.com", true },
{ "navient.com", true },
- { "navitime.me", true },
+ { "navigate-it-services.de", false },
{ "navstevnik.sk", true },
{ "navycs.com", true },
{ "nawir.de", true },
@@ -24899,8 +25903,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nba2konline.com", true },
{ "nba2konlinex.com", true },
{ "nba2kx.com", true },
- { "nba669.com", true },
- { "nba686.com", true },
{ "nbad.al", true },
{ "nbadancers.com", true },
{ "nbade.com", true },
@@ -24921,6 +25923,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nbgrooves.de", true },
{ "nbhorsetraining.com", true },
{ "nbib.gov", true },
+ { "nbnnetwork.com", true },
{ "nbp.com.pk", true },
{ "nbrain.de", true },
{ "nbrii.com", true },
@@ -24929,6 +25932,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nc-beautypro.fr", true },
{ "nc-formation.fr", true },
{ "nc-network.io", true },
+ { "nc99.co", true },
{ "ncamarquee.co.uk", true },
{ "ncands.net", true },
{ "ncaq.net", true },
@@ -24941,19 +25945,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nchangfong.com", true },
{ "nchponline.org", true },
{ "ncic.gg", true },
- { "nclvle.co.uk", true },
{ "ncm-malerbetrieb.de", true },
{ "ncsc.gov.uk", true },
{ "ncsccs.com", true },
{ "ncstep.org", true },
+ { "nctx.co.uk", true },
{ "ndarville.com", true },
+ { "ndatc.com", true },
{ "ndbt.com", true },
{ "ndcpolipak.com", true },
{ "ndeoffshore.com", true },
{ "nder.be", true },
{ "ndfa.net", true },
+ { "ndfirefighter.com", true },
{ "ndhlink.com", true },
{ "ndpbrn-research.org", true },
+ { "ndphp.org", true },
+ { "ndpigskin.com", true },
{ "nds-helicopter.de", true },
{ "ndy.sex", true },
{ "ne-on.org", true },
@@ -24963,12 +25971,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "neartothesky.com", true },
{ "neatous.cz", true },
{ "neatous.net", true },
+ { "neatzy.co.uk", true },
{ "neave.tv", true },
{ "neba.io", true },
{ "nebelhauch.de", true },
{ "nebelheim.de", true },
{ "nebenbeiblog.ch", true },
{ "nebra.io", true },
+ { "nebracy.com", true },
{ "nebul.at", true },
{ "nebulae.co", true },
{ "nebuluxcapital.com", true },
@@ -24984,23 +25994,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nedys.top", true },
{ "neecist.org", true },
{ "needemand.com", true },
- { "needle.net.nz", true },
- { "needle.nz", true },
{ "needstyle.ru", true },
{ "neeerd.org", true },
{ "neel.ch", true },
{ "neemzy.org", true },
- { "neet-investor.biz", true },
{ "nefertitis.cz", true },
{ "neffat.si", true },
{ "neflabs.com", true },
+ { "nefro-cme.de", true },
{ "neftis.es", true },
{ "neg9.org", false },
{ "negai.moe", true },
- { "negativecurvature.net", true },
{ "neglecteddiseases.gov", true },
+ { "neheim-huesten.de", true },
{ "nehoupat.cz", true },
{ "nehrp.gov", true },
+ { "neighborhoodelectricwa.com", true },
{ "neil-barrett.com", true },
{ "neil-barrett.uk", true },
{ "neildaniels.com", true },
@@ -25011,6 +26020,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "neilwynne.com", true },
{ "neio.uk", true },
{ "nejenpneu.cz", true },
+ { "nejkasy.cz", true },
{ "nejlevnejsi-parapety.cz", true },
{ "neko-nyan-nuko.com", true },
{ "neko-nyan.org", true },
@@ -25033,17 +26043,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nemo.run", true },
{ "nemopan.com", true },
{ "nemopret.dk", true },
+ { "nemplex.win", false },
{ "nems.no", true },
{ "nemumu.com", true },
{ "nemunai.re", true },
+ { "nenkin-kikin.jp", true },
+ { "neno.io", true },
{ "neo2shyalien.eu", false },
{ "neobits.nl", true },
{ "neocities.org", true },
{ "neoclick.io", true },
- { "neocoding.com", true },
- { "neodrive.ch", true },
{ "neoedresources.org", true },
{ "neoeliteconsulting.com", true },
+ { "neohu.com", true },
{ "neojo.org", true },
{ "neokobe.city", true },
{ "neolaudia.es", true },
@@ -25057,14 +26069,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "neosdesignstudio.co.uk", true },
{ "neostralis.com", true },
{ "neotist.com", true },
+ { "neotiv.com", true },
{ "neowa.tk", true },
+ { "neowin.net", true },
{ "neowlan.net", true },
{ "neoxcrf.com", true },
{ "neoz.com.br", true },
{ "nepageeks.com", true },
{ "nepal-evolution.org", true },
{ "nephelion.org", true },
- { "nephos.xyz", true },
+ { "nephy.jp", true },
{ "nepovolenainternetovahazardnihra.cz", true },
{ "nepremicninar.com", true },
{ "nepremicnine.click", true },
@@ -25082,10 +26096,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nerpa-club.ru", true },
{ "nerull7.info", true },
{ "nerven.se", false },
- { "nesbase.com", true },
{ "nesolabs.com", true },
{ "nesolabs.de", true },
- { "nestedquotes.ca", true },
{ "nesterov.pw", true },
{ "nestor.nu", true },
{ "neswec.org.uk", true },
@@ -25096,7 +26108,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "net4visions.de", true },
{ "netamia.com", true },
{ "netapps.de", true },
- { "netba.net", true },
{ "netbank.com.au", true },
{ "netbears.com", true },
{ "netbears.ro", true },
@@ -25105,6 +26116,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "netbox.org", true },
{ "netbrewventures.com", true },
{ "netbulls.io", true },
+ { "netbuzz.ru", true },
{ "netconnect.at", true },
{ "netcoolusers.org", true },
{ "netdex.co", true },
@@ -25125,7 +26137,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nethunter.top", true },
{ "netki.com", true },
{ "netlentes.com.br", true },
- { "netlocal.ru", true },
{ "netmagicas.com.br", true },
{ "netmeister.org", true },
{ "netnik.de", true },
@@ -25153,7 +26164,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nettools.link", true },
{ "nettopower.dk", true },
{ "nettoyage.email", true },
- { "nettplusultra-rhone.fr", true },
{ "nettx.co.uk", true },
{ "netulo.com", true },
{ "netvizura.co.uk", true },
@@ -25172,13 +26182,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "networkingphoenix.com", true },
{ "networkmidlands.co.uk", true },
{ "networkmidlands.uk", true },
- { "networkmon.net", true },
{ "networkposting.com", true },
{ "networth.at", true },
{ "netz-yokohama.co.jp", true },
{ "netzfabrik.com", true },
{ "netzfrauen.org", true },
{ "netzwerkwerk.de", true },
+ { "neuber.uno", true },
{ "neuflizeobc.net", true },
{ "neuhaus-city.de", true },
{ "neurabyte.com", true },
@@ -25191,10 +26201,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "neurolab.no", true },
{ "neuronasdigitales.com", true },
{ "neuropharmacology.com", true },
+ { "neurostimtms.com", true },
{ "neurotransmitter.net", true },
{ "neurozentrum-zentralschweiz.ch", true },
{ "neutralox.com", false },
{ "neuwal.com", true },
+ { "neva.li", true },
{ "never.pet", true },
{ "nevergreen.io", true },
{ "nevermore.fi", true },
@@ -25225,7 +26237,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "newcityinfo.ch", true },
{ "newcitystudio.ch", true },
{ "newcloudwhodis.com", true },
- { "newcreamforface.com", true },
{ "newday.host", true },
{ "newearth.press", true },
{ "newfangledscoop.com", true },
@@ -25241,16 +26252,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "newmarketbouncycastlehire.co.uk", true },
{ "newmed.com.br", true },
{ "newmediaone.net", true },
+ { "newmelalife.com", true },
{ "newmovements.net", true },
{ "newmusicjackson.org", true },
{ "newodesign.com", true },
- { "newpathintegratedtherapy.com", true },
{ "newposts.ru", true },
{ "newreleases.io", true },
{ "news47ell.com", true },
{ "newsa2.com", true },
- { "newsaboutgames.de", true },
- { "newserumforskin.com", true },
{ "newsmotor.info", true },
{ "newspsychology.com", true },
{ "newstone-tech.com", true },
@@ -25263,6 +26272,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nexicafiles.com", true },
{ "nexril.net", true },
{ "next-web.ad.jp", true },
+ { "next176.sk", true },
{ "next24.io", true },
{ "nextads.ch", true },
{ "nextbranders.com", true },
@@ -25271,7 +26281,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nextcloud.co.za", true },
{ "nextcloud.com", true },
{ "nextcloud.nerdpol.ovh", true },
- { "nextend.net", true },
+ { "nextclouddarwinkel.nl", true },
{ "nextevolution.co.uk", true },
{ "nextgen.sk", true },
{ "nextgencel.com", true },
@@ -25279,7 +26289,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nextgreatmess.com", true },
{ "nexthop.jp", true },
{ "nextiot.de", true },
- { "nextlevel-it.co.uk", true },
{ "nextmbta.com", true },
{ "nextme.se", true },
{ "nextnely.com", true },
@@ -25287,6 +26296,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nextrobotics.de", true },
{ "nextstep-labs.gr", true },
{ "nexttv.co.il", true },
+ { "nextwab.com", true },
{ "nexus-exit.de", true },
{ "nexus-vienna.at", true },
{ "nexusconnectinternational.eu", true },
@@ -25302,10 +26312,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nfl.dedyn.io", true },
{ "nfl.duckdns.org", true },
{ "nflmocks.com", true },
- { "nfls.io", true },
{ "nflsic.org", true },
+ { "nfpors.gov", true },
{ "nframe.io", true },
- { "nfrost.me", true },
{ "nfsec.pl", true },
{ "nfz.moe", true },
{ "ng-musique.com", true },
@@ -25315,11 +26324,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nginxconfig.com", true },
{ "nginxconfig.io", true },
{ "ngndn.jp", true },
+ { "ngt.gr", true },
{ "ngvf.de", true },
{ "ngx.hk", true },
{ "ngxpkg.com", true },
{ "nhccnews.org", true },
{ "nhchalton.com", true },
+ { "nhdsilentheroes.org", true },
{ "nhgteam.hu", true },
{ "nhimf.org", true },
{ "nhome.ba", true },
@@ -25342,15 +26353,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nice.ch", true },
{ "niceguyit.biz", true },
{ "nicesco.re", true },
+ { "nicesleepo.com", true },
{ "nicestudio.co.il", false },
{ "nichijou.com", true },
+ { "nicholasperkins.io", true },
{ "nicholaspruss.com", true },
{ "nicholasquigley.com", true },
{ "nicholaswilliams.net", true },
{ "nichteinschalten.de", false },
{ "nichthelfer.de", true },
{ "nicic.gov", true },
- { "nickcleans.co.uk", true },
{ "nickcraver.com", true },
{ "nickdekruijk.nl", true },
{ "nickguyver.com", true },
@@ -25365,17 +26377,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nickscomputers.nl", true },
{ "nickserve.com", true },
{ "nickstories.de", true },
- { "nicktheitguy.com", true },
{ "niclasreich.de", true },
{ "nicn.me", true },
{ "nico.st", true },
{ "nicocourts.com", true },
{ "nicoknibbe.nl", true },
+ { "nicoladixonrealestate.com", true },
{ "nicolajanedesigns.co.uk", true },
{ "nicolas-dumermuth.com", true },
{ "nicolas-hoffmann.net", true },
{ "nicolas-hoizey.com", true },
{ "nicolas-simond.com", true },
+ { "nicolasfriedli.ch", true },
{ "nicolashess.de", true },
{ "nicolasiung.me", true },
{ "nicolaszambetti.ch", true },
@@ -25383,11 +26396,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nicolemathew.com", true },
{ "niconico.ooo", true },
{ "niconode.com", false },
- { "nicoobook.com", true },
{ "nicsezcheckfbi.gov", true },
{ "nicul.in", true },
{ "nidro.de", true },
{ "nidsuber.ch", true },
+ { "niduxcomercial.com", true },
{ "niederohmig.de", true },
{ "niehage.name", true },
{ "nielshoogenhout.be", true },
@@ -25410,6 +26423,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "niesstar.com", true },
{ "nietzsche.com", true },
{ "nieuwslagmaat.nl", true },
+ { "nifc.gov", true },
{ "niftiestsoftware.com", true },
{ "nigelwakefield.com", true },
{ "nigensha.co.jp", true },
@@ -25426,22 +26440,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nightmoose.org", true },
{ "nightsi.de", true },
{ "nightstand.io", true },
- { "nightwinds.tk", true },
{ "nigt.cf", true },
{ "nihon-no-sake.net", true },
+ { "nihtek.in", true },
+ { "nii2.org", true },
+ { "nij.gov", true },
{ "nijiero-ch.com", false },
{ "nijikata.com", true },
{ "nijm.nl", true },
{ "nikandcara.com", true },
{ "nikao-tech.com", true },
{ "nikavandenbos.nl", true },
- { "niki.ai", true },
{ "nikimix.com", true },
{ "nikkasystems.com", true },
{ "nikkila.me", true },
{ "nikklassen.ca", true },
{ "niklas.pw", true },
- { "niklasanderson.com", true },
{ "niklasbabel.com", true },
{ "nikolasgrottendieck.com", true },
{ "nikomo.fi", false },
@@ -25465,6 +26479,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ninepints.co", true },
{ "ninesix.cc", true },
{ "ninespec.com", true },
+ { "ninetailed.ninja", true },
{ "ninetaillabs.com", true },
{ "ninetaillabs.xyz", true },
{ "ninfora.com", true },
@@ -25473,13 +26488,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ninjaworld.co.uk", true },
{ "ninjio.com", true },
{ "ninov.de", true },
- { "ninreiei.jp", true },
{ "nintendoforum.no", true },
{ "ninth.cat", true },
{ "ninthfloor.org", true },
- { "ninverse.com", true },
{ "nipax.cz", true },
{ "nipe-systems.de", true },
+ { "nipit.biz", true },
{ "nippon-oku.com", true },
{ "niqex.com", true },
{ "nirjonmela.com", true },
@@ -25497,10 +26511,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nitrous-networks.com", true },
{ "nitschinger.at", true },
{ "niu.moe", true },
- { "niva.synology.me", true },
{ "nivi.ca", true },
{ "nix.black", true },
{ "nixonlibrary.gov", true },
+ { "nixtest.net", true },
{ "nixx-gel.cz", true },
{ "niyawe.de", true },
{ "niyazpoyilan.com", false },
@@ -25509,7 +26523,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "njguardtraining.com", true },
{ "njilc.com", true },
{ "njpjanssen.nl", true },
+ { "njujb.com", true },
{ "nkapliev.org", true },
+ { "nkforum.pl", true },
{ "nkinka.de", true },
{ "nkp-media.de", true },
{ "nl-ix.net", true },
@@ -25521,13 +26537,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nllboard.co.uk", true },
{ "nlleisure.co.uk", true },
{ "nlm.gov", true },
- { "nlrb.gov", true },
{ "nlt.by", false },
{ "nmd.so", true },
+ { "nmmlp.org", true },
{ "nmnd.de", true },
{ "nmontag.com", true },
+ { "nmsinusdoc.com", true },
{ "nn.cz", true },
- { "nn78.com", true },
{ "nna774.net", true },
{ "nnqc.nl", true },
{ "no-ip.cz", true },
@@ -25551,10 +26567,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nodariweb.com.ar", true },
{ "nodecraft.com", true },
{ "nodejs.de", true },
- { "nodelab-it.de", true },
{ "nodelia.com", true },
{ "nodesec.cc", true },
- { "nodesturut.cl", true },
{ "nodevops.com", true },
{ "noeatnosleep.me", true },
{ "noedidacticos.com", true },
@@ -25566,6 +26580,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "noez.de", true },
{ "nofrillsdns.com", true },
{ "nogerondier.eu", true },
+ { "nogetime.com", true },
{ "noglobalwarrants.org", true },
{ "nohats.ca", true },
{ "nohkan.fr", true },
@@ -25591,19 +26606,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nomesbiblicos.com", true },
{ "nomial.co.uk", true },
{ "nomifensine.com", true },
- { "nomoondev.azurewebsites.net", true },
{ "nomsy.net", true },
{ "nonabytes.xyz", true },
{ "noname-ev.de", true },
{ "nonametheme.com", true },
{ "noncombatant.org", true },
{ "noob-box.net", true },
+ { "noobow.me", true },
{ "noobswhatelse.net", true },
- { "noobunbox.net", true },
{ "noodles.net.nz", true },
{ "noodplan.co.za", true },
{ "noodweer.be", true },
{ "noofficewalls.com", true },
+ { "noomist.com", true },
{ "noon-entertainments.com", true },
{ "noop.ch", true },
{ "noordsee.de", true },
@@ -25614,6 +26629,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "noovell.com", true },
{ "nopaste.xyz", true },
{ "nopaynocure.com", true },
+ { "norad.sytes.net", true },
{ "norbertschneider-music.com", true },
{ "nord-sud.be", true },
{ "nordakademie.de", true },
@@ -25622,6 +26638,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nordlichter-brv.de", true },
{ "nordmoregatebilklubb.com", true },
{ "nordnetz-hamburg.de", true },
+ { "nordseeblicke.de", true },
{ "nordwal.de", true },
{ "nordwaldzendo.de", true },
{ "noreply.mx", true },
@@ -25634,14 +26651,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "normanbauer.com", true },
{ "normandgascon.com", true },
{ "normankranich.de", true },
- { "norrkemi.se", true },
+ { "noroshi-burger.com", true },
{ "norrliden.de", true },
{ "norsewars.com", true },
{ "norskpensjon.no", true },
{ "northatlantalaw.net", true },
+ { "northbridgecre.com", true },
{ "northbrisbaneapartments.com.au", true },
{ "northconsulting.fr", true },
{ "northcountykiaparts.com", true },
+ { "northcreekresort.com", true },
+ { "northdakotahealthnetwork.com", true },
{ "northdevonbouncycastles.co.uk", true },
{ "northeastcdc.org", true },
{ "northeastrodeo.co.uk", true },
@@ -25652,10 +26672,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "northernselfstorage.co.za", true },
{ "northfieldyarn.com", true },
{ "northokanaganbookkeeping.com", true },
+ { "northpointoutdoors.com", true },
{ "northpole.dance", true },
{ "northridgeelectrical.com", true },
{ "northumbriagames.co.uk", true },
- { "northwest-events.co.uk", true },
{ "norys-escape.de", true },
{ "nos-medias.fr", true },
{ "nos-oignons.net", true },
@@ -25676,7 +26696,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "notacooldomain.com", true },
{ "notadd.com", true },
{ "notallmine.net", true },
- { "notalone.gov", true },
{ "notar-glagowski.com", true },
{ "notar-glagowski.de", true },
{ "notar-peikert.com", true },
@@ -25698,7 +26717,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "notoriousdev.com", true },
{ "notrecourrier.net", true },
{ "notsafefor.work", true },
- { "nottres.com", true },
{ "noudjalink.nl", true },
{ "noustique.com", true },
{ "nova-dess.ch", true },
@@ -25709,6 +26727,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "novacoast.com", false },
{ "novadermis.es", true },
{ "novafreixo.pt", true },
+ { "novascan.net", true },
{ "novawave.ca", true },
{ "nove.city", true },
{ "noveciti.com", true },
@@ -25724,16 +26743,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "novgorod-avia.ru", true },
{ "novilaw.com", true },
{ "novilidery.com", true },
+ { "novinivo.com", true },
{ "novodiegomaia.com.br", true },
{ "novojet.cl", true },
{ "novoresume.com", false },
{ "novosibavia.ru", true },
{ "novurania.com", true },
+ { "nowall.online", true },
{ "nowhere.dk", true },
{ "nowitzki.me", true },
{ "nowlas.org", true },
{ "nowloading.co", true },
- { "noxi.ga", true },
{ "noxlogic.nl", true },
{ "noydeen.com", true },
{ "noyocenter.org", true },
@@ -25742,9 +26762,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "npath.de", true },
{ "npcrcss.org", true },
{ "npmcdn.com", true },
+ { "npregion.org", true },
{ "npsas.org", true },
{ "npw.net", true },
- { "nqesh.com", true },
{ "nqeshreviewer.com", true },
{ "nrd.li", true },
{ "nrdstd.io", true },
@@ -25762,12 +26782,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nscnet.jp", true },
{ "nsfw-story.com", true },
{ "nshipster.cn", true },
+ { "nshipster.co.kr", true },
{ "nshipster.com", true },
+ { "nshipster.es", true },
{ "nsm.ee", true },
{ "nsm.stat.no", true },
{ "nso.ie", true },
+ { "nsofficeinteriors.com", true },
{ "nsp.ua", true },
{ "nst-maroc.com", true },
+ { "nstatic.xyz", true },
{ "nstd.net", true },
{ "nstremsdoerfer.ovh", true },
{ "nstrust.co.uk", true },
@@ -25777,12 +26801,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nte.email", true },
{ "nth.sh", true },
{ "nti.de", true },
+ { "ntia.gov", true },
{ "ntotten.com", true },
{ "ntppool.org", false },
+ { "ntsb.gov", true },
{ "ntwt.us", true },
{ "ntx360grad-fallakte.de", true },
{ "ntzwrk.org", true },
- { "nu-pogodi.net", true },
{ "nu3.com", true },
{ "nu3.dk", true },
{ "nu3.fi", true },
@@ -25794,6 +26819,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nuamooreaindonesia.com", true },
{ "nubella.com.au", true },
{ "nubu.at", true },
+ { "nuclea.id", true },
{ "nuclea.site", true },
{ "nuclearcat.com", true },
{ "nucleuscore.org", true },
@@ -25823,8 +26849,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "numerossanos.com.ar", true },
{ "numis.tech", true },
{ "numismed-seniorcare.de", true },
- { "numm.fr", true },
{ "numwave.nl", true },
+ { "nunesgh.com", true },
{ "nunnenmacher.net", true },
{ "nunnun.jp", true },
{ "nunomoura.com", true },
@@ -25844,11 +26870,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nutleyef.org", true },
{ "nutonic-sports.com", true },
{ "nutpanda.com", true },
+ { "nutra-creations.com", true },
{ "nutri-spec.me", true },
{ "nutricaovegana.com", true },
{ "nutriciametabolics-shop.de", true },
+ { "nutridieta.com", true },
{ "nutripedia.gr", true },
{ "nutrishop.com", true },
+ { "nutrition.gov", true },
{ "nutrivisa.com.br", true },
{ "nuvechtdal.nl", true },
{ "nuvini.com", true },
@@ -25858,10 +26887,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nvl-game.tokyo", true },
{ "nvq.nl", true },
{ "nvr.bz", true },
+ { "nvtc.gov", true },
+ { "nwapproval.com", true },
+ { "nwautorebuild.com", true },
{ "nwbc.gov", true },
{ "nwerc.party", true },
{ "nwgh.org", false },
+ { "nwimports.com", true },
{ "nwk1.com", true },
+ { "nwperformanceandoffroad.com", true },
{ "nwr-waffenbuch.de", true },
{ "nwra.com", true },
{ "nwuss.okinawa", true },
@@ -25882,6 +26916,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "nycoyote.org", true },
{ "nydig.com", true },
{ "nydnxs.com", true },
+ { "nyghtus.net", true },
{ "nyhaoyuan.net", true },
{ "nyiad.edu", true },
{ "nyip.co.uk", true },
@@ -25907,12 +26942,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "o2careers.co.uk", true },
{ "o3.wf", true },
{ "o3wallet.com", true },
+ { "o5.cx", true },
{ "o6asan.com", true },
{ "o8b.club", true },
{ "oaic.gov.au", true },
{ "oakandresin.co", true },
{ "oakesfam.net", true },
- { "oakington.info", true },
+ { "oakington.info", false },
{ "oaklands.co.za", true },
{ "oakparkelectrical.com", true },
{ "oakparkexteriorlighting.com", true },
@@ -25922,38 +26958,48 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oakslighting.co.uk", true },
{ "oanalista.com.br", true },
{ "oasisdabeleza.com.br", true },
+ { "oasisim.net", false },
{ "oatmealdome.me", true },
{ "oauth-dropins.appspot.com", false },
{ "obamalibrary.gov", true },
{ "obamawhitehouse.gov", true },
{ "oberhofdrinks.com", true },
{ "obermeiers.eu", true },
+ { "oberoi.de", true },
{ "obesidadlavega.com", true },
{ "obfuscate.xyz", true },
{ "obg-global.com", true },
+ { "obgalslancaster.com", true },
{ "obitech.de", true },
{ "object.earth", true },
{ "objectif-terre.ch", true },
{ "objekt-textil.ch", true },
{ "oblast45.ru", false },
+ { "obligacjekk.pl", true },
{ "oboeta.com", true },
{ "obono.at", true },
+ { "obrienswine.ie", true },
{ "obs.group", true },
{ "obscur.us", true },
+ { "observer.name", true },
{ "obsessharness.com", true },
{ "obsidianirc.net", true },
{ "obsproject.com", true },
{ "obtima.org", true },
{ "obud.cz", true },
+ { "obxlistings.com", true },
{ "obyvateleceska.cz", true },
{ "oc-sa.ch", true },
+ { "ocalaflwomenshealth.com", true },
{ "ocarupo.com", true },
{ "occenterprises.org", true },
{ "occentus.net", true },
{ "occmon.net", true },
{ "occonnections.org", true },
+ { "occupational-therapy-colleges.com", true },
{ "ocd2016.com", true },
{ "ocdadmin.com", true },
+ { "oceancity4sales.com", true },
{ "oceandns.eu", true },
{ "oceandns.net", true },
{ "oceandns.nl", true },
@@ -25971,15 +27017,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ockendenhemming.co.uk", true },
{ "oclausen.com", true },
{ "ocloudhost.com", true },
+ { "ocni-ambulance-most.cz", true },
{ "ocolere.ch", true },
{ "ocotg.com", true },
{ "ocrn.nl", true },
{ "ocsigroup.fr", true },
{ "ocsr.nl", true },
+ { "octagongroup.co", true },
+ { "octal.es", true },
{ "octarineparrot.com", true },
{ "octav.name", false },
+ { "octocaptcha.com", true },
+ { "octofox.de", true },
{ "octohedralpvp.tk", true },
- { "octohost.net", true },
{ "octolopagon.games", true },
{ "octopus-agents.com", true },
{ "octosniff.net", true },
@@ -25995,12 +27045,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oddsandevensbookkeeping.ca", true },
{ "oddtime.net", true },
{ "odensc.me", true },
+ { "odhosc.ca", true },
{ "odifi.com", true },
{ "odinseye.net", true },
{ "odoo.co.th", true },
{ "odpikedoslike.com", true },
{ "odtu.lu", true },
{ "oducs.org", true },
+ { "odvps.com", true },
{ "odysseyofthemind.eu", true },
{ "odzyskaniedomeny.pl", true },
{ "oec-music.com", true },
@@ -26015,8 +27067,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oemspace.nl", true },
{ "oemwolf.com", true },
{ "oenings.eu", true },
+ { "of2m.fr", true },
{ "ofcampuslausanne.ch", true },
{ "ofda.gov", true },
+ { "ofertasadsl.com", true },
{ "offandonagain.org", true },
{ "offbyinfinity.com", true },
{ "offenekommune.de", true },
@@ -26032,13 +27086,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "officefundays.co.uk", true },
{ "officeinteriors.co.nz", true },
{ "officemovepro.com", true },
- { "officeprint.co.th", true },
{ "officiants.wedding", false },
{ "officium.tech", true },
{ "offroadeq.com", true },
+ { "offroadhoverboard.net", true },
{ "offshoot.rentals", true },
{ "offshore.digital", true },
+ { "offshoremarineparts.com", false },
+ { "ofggolf.com", true },
{ "oflow.me", true },
+ { "ofsetas.lt", true },
{ "oftamedic.com", true },
{ "oftn.org", true },
{ "oge.ch", true },
@@ -26053,7 +27110,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ohchouette.com", true },
{ "ohd.dk", true },
{ "oheila.com", true },
- { "ohhdeertrade.com", true },
{ "ohhere.xyz", true },
{ "ohiohealthfortune100.com", true },
{ "ohling.org", true },
@@ -26075,6 +27131,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ojdip.net", true },
{ "ojomovies.com", true },
{ "ojp.gov", true },
+ { "okaidi.es", true },
+ { "okaidi.fr", true },
{ "okakuro.org", true },
{ "okanaganrailtrail.ca", true },
{ "okay.cf", true },
@@ -26097,12 +27155,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oktoberfeststore.nl", true },
{ "oktomus.com", true },
{ "okukan.com.au", true },
+ { "okurapictures.com", true },
{ "okusiassociates.com", true },
{ "olanderflorist.com", true },
{ "olasouris.com", true },
+ { "olastrafford.org", true },
{ "olback.net", true },
{ "olbat.net", true },
{ "olcayanar.com", true },
+ { "olcbrookhaven.org", true },
{ "oldbrookinflatables.co.uk", true },
{ "oldbrookmarqueehire.co.uk", true },
{ "oldchaphome.nl", true },
@@ -26114,6 +27175,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oldoakflorist.com", true },
{ "oldprop.com", true },
{ "oldroutetwo.com", true },
+ { "oldsticker.com", true },
+ { "oldstmary.com", true },
{ "oldtimerreifen-moeller.de", true },
{ "olegon.ru", true },
{ "olegs.be", true },
@@ -26121,10 +27184,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oleodecopayba.com.br", true },
{ "oles-hundehaus.de", true },
{ "olfnewcastle.com", true },
+ { "olfsecane.org", true },
{ "olgiati.org", true },
{ "olgui.net", true },
{ "olgun.eu", true },
- { "olifant.fr", true },
+ { "olhcparish.net", true },
{ "olightstore.com", true },
{ "olightstore.ro", true },
{ "oliode.tk", true },
@@ -26138,11 +27202,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "olivernaraki.com", true },
{ "oliverniebuhr.de", true },
{ "oliverspringer.eu", true },
+ { "oliverst.com", true },
{ "olivierberardphotographe.com", true },
{ "olivierlemoal.fr", true },
{ "olivierpieters.be", true },
{ "oliviervaillancourt.com", true },
- { "olivlabs.com", true },
{ "olizeite.ch", true },
{ "ollie.io", true },
{ "ollieowlsblog.com", true },
@@ -26154,17 +27218,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ollning.com", true },
{ "olltechjob.com", true },
{ "olmari.fi", true },
+ { "olmc-nutley.org", true },
+ { "olmcnewark.com", true },
{ "olmsted.io", true },
+ { "olphseaside.org", true },
+ { "olqoa.org", true },
{ "olygazoo.com", true },
{ "olymp-arts.world", true },
+ { "olympeakgaming.tv", true },
{ "olympiads.ca", true },
{ "olympic-research.com", true },
{ "om1.com", true },
{ "omanko.porn", true },
- { "omarsuniagamusic.ga", true },
+ { "omar.yt", true },
+ { "omarh.net", true },
{ "omdesign.cz", true },
{ "omegahosting.net", true },
{ "omegathermoproducts.nl", true },
+ { "omeopatiadinamica.it", true },
{ "omertabeyond.com", true },
{ "omertabeyond.net", true },
{ "ometepeislandinfo.com", true },
@@ -26172,9 +27243,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "omfg.exposed", true },
{ "omgbouncycastlehire.co.uk", true },
{ "omi-news.fr", true },
- { "omifind.com", true },
+ { "omicron3069.com", true },
{ "omitech.co.uk", true },
{ "omlmetal.co.jp", true },
+ { "omniaclubs.com", true },
{ "omniasig.ro", true },
{ "omniasl.com", true },
{ "omniatv.com", true },
@@ -26194,7 +27266,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "on-tech.co.uk", true },
{ "ona.io", true },
{ "onaboat.se", true },
- { "onahonavi.com", true },
{ "onarto.com", true },
{ "onazikgu.com", true },
{ "onbuzzer.com", false },
@@ -26205,6 +27276,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ond-inc.jp", true },
{ "ondcp.gov", true },
{ "onderwijstransparant.nl", true },
+ { "ondevamosjantar.com", true },
{ "ondrej.org", true },
{ "ondrejhoralek.cz", true },
{ "one---line.com", true },
@@ -26224,6 +27296,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "onedrive.live.com", false },
{ "onee3.org", true },
{ "onefour.ga", false },
+ { "onegoodthingbyjillee.com", true },
{ "oneheartbali.church", true },
{ "oneidentity.me", true },
{ "oneiroi.co.uk", true },
@@ -26232,6 +27305,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "onemoonmedia.de", true },
{ "oneononeonone.de", true },
{ "oneononeonone.tv", true },
+ { "onepercentrentals.com", true },
{ "onepointsafeband.ca", true },
{ "onepointsafeband.com", true },
{ "onepointzero.com", true },
@@ -26242,6 +27316,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "onestopcastles.co.uk", true },
{ "onetcenter.org", true },
{ "onetcodeconnector.org", true },
+ { "onetech.it", true },
{ "onetime.info", true },
{ "onetonline.org", true },
{ "oneway.ga", true },
@@ -26249,12 +27324,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oneweb.hu", true },
{ "onfarma.it", true },
{ "ongea.io", true },
+ { "ongiaenegogoa.com", true },
{ "onhistory.co.uk", true },
{ "onhub1.com", true },
{ "oni.nl", true },
{ "onice.ch", true },
{ "onionbot.me", true },
- { "onionplay.live", true },
{ "onionplay.org", true },
{ "onionscan.org", true },
{ "oniria.ch", true },
@@ -26272,15 +27347,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "online-pr.at", true },
{ "online-results.dk", true },
{ "online-stopwatch.com", true },
+ { "online-textil.com", true },
+ { "online-textil.cz", true },
+ { "online-textil.sk", true },
{ "online.marketing", true },
{ "online.net.gr", true },
+ { "online.swedbank.se", true },
{ "online24.pt", true },
{ "onlinebizdirect.com", false },
{ "onlinecasino.vlaanderen", true },
{ "onlinecensorship.org", true },
{ "onlinecollegeessay.com", true },
- { "onlinecorners.com", true },
{ "onlinefashion.it", true },
+ { "onlinehashfollow.com", true },
{ "onlineinfographic.com", true },
{ "onlinekmc.com", true },
{ "onlinelegalmarketing.com", true },
@@ -26289,18 +27368,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "onlinemarketingtraining.co.uk", true },
{ "onlinepokerspelen.be", true },
{ "onlineporno.xyz", true },
+ { "onlineprofecional.com", true },
{ "onlinerollout.de", true },
{ "onlinestoreninjas.com", true },
+ { "onlinetextil.cz", true },
{ "onlineth.com", false },
{ "onlineweblearning.com", true },
{ "onlinexl.nl", true },
- { "onlyesb.com", true },
- { "onlyesb.net", true },
{ "onlylebanon.net", true },
{ "onmaps.de", true },
{ "onmarketbookbuilds.com", true },
{ "onnaguse.com", true },
- { "onnext.cc", true },
{ "onoranze-funebri.biz", true },
{ "onpay.io", true },
{ "onpermit.net", true },
@@ -26313,14 +27391,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ontdekhetzelf.nu", true },
{ "onthebriteside.com", true },
{ "ontras.com", false },
+ { "ontsc.com", true },
+ { "ontservice.com", true },
{ "ontsnappingskamer.nl", true },
{ "onurer.net", true },
{ "onvey.io", true },
+ { "onviga.de", true },
{ "onvirt.de", true },
{ "onvori.com", true },
{ "onvori.de", true },
{ "onvousment.fr", true },
{ "onysix.net", true },
+ { "onyxcts.com", true },
{ "onyxfireinc.com", true },
{ "onyxgen.duckdns.org", true },
{ "onyxmoon.me", true },
@@ -26330,7 +27412,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ooharttemplates.com", true },
{ "ooonja.de", true },
{ "oopsis.com", true },
- { "oosoo.org", true },
{ "ooyo.be", true },
{ "op11.co.uk", false },
{ "opadaily.com", true },
@@ -26342,6 +27423,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "open-bs.com", true },
{ "open-bs.ru", true },
{ "open-desk.org", true },
+ { "open-domotics.info", true },
{ "open-freax.fr", true },
{ "open-gaming.net", true },
{ "open-infrastructure.net", true },
@@ -26350,10 +27432,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "open-sauce-recipes.co.uk", true },
{ "open-source.gr", true },
{ "open.gl", true },
- { "openacademies.com", true },
{ "openacte.ch", true },
- { "openas.org", true },
- { "openbankproject.com", true },
{ "openbayes.com", true },
{ "openbeecloud.com", true },
{ "openblox.org", true },
@@ -26372,15 +27451,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "opengovpartnership.de", true },
{ "openings.ninja", true },
{ "openintelligence.uk", true },
- { "openiocdb.com", true },
{ "openitforum.pl", true },
{ "openkim.org", true },
{ "openkvk.nl", true },
- { "openmirrors.cf", true },
+ { "openmirrors.ml", true },
{ "openmtbmap.org", true },
{ "opennippon.com", true },
{ "opennippon.ru", true },
{ "openpictures.ch", true },
+ { "openpresentes.com.br", true },
{ "openquery.com.au", true },
{ "openrainbow.com", true },
{ "openrainbow.net", true },
@@ -26388,6 +27467,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "openrealestate.co", true },
{ "openresty.com", true },
{ "openreview.net", true },
+ { "openroademail.com", true },
{ "openruhr.de", true },
{ "openscreen.lu", true },
{ "opensource-cms.nl", true },
@@ -26407,9 +27487,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "openwifi.gr", true },
{ "openwireless.org", true },
{ "operationforever.com", true },
+ { "opexterminating.com", true },
{ "opfin.com", true },
{ "ophis-phosphoros.com", true },
{ "opiates.ca", true },
+ { "opic.gov", true },
{ "opin.me", true },
{ "opioids.co.uk", true },
{ "opioids.gov", true },
@@ -26417,8 +27499,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oplop.appspot.com", true },
{ "opoleo.com", false },
{ "oportho.com.br", true },
- { "oportunidadesemfoco.com.br", true },
- { "opp.ag", true },
+ { "oposiciones.com.es", true },
+ { "oposicionescorreos.com.es", true },
+ { "oposicionesertzaintza.com.es", true },
+ { "oppada.com", true },
{ "oppaiti.me", true },
{ "oppejoud.ee", true },
{ "opportunis.me", true },
@@ -26455,6 +27539,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "optmos.at", true },
{ "optometryscotland.org.uk", true },
{ "optoutday.de", true },
+ { "opure.ml", true },
{ "opure.ru", true },
{ "opus-codium.fr", true },
{ "oraculum.cz", true },
@@ -26464,6 +27549,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "orangefinanse.com.pl", true },
{ "orangejetpack.com", true },
{ "orangenbaum.at", true },
+ { "orangenuts.in", true },
+ { "orangetravel.eu", true },
{ "orangutan-appeal.org.uk", true },
{ "orbu.net", true },
{ "orca.pet", true },
@@ -26477,7 +27564,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ordernow.at", true },
{ "orderswift.com", true },
{ "ordr.mobi", true },
- { "oreka.online", true },
{ "oreshinya.xyz", true },
{ "oreskylaw.com", true },
{ "oreto.de", true },
@@ -26497,6 +27583,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "orientravelmacas.com", true },
{ "origami.to", true },
{ "origamika.com", true },
+ { "original-christstollen.com", true },
+ { "original-christstollen.de", true },
{ "originalniknihy.cz", true },
{ "origincoffee.com", true },
{ "origincoffee.nz", true },
@@ -26506,6 +27594,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "orimex-mebel.ru", true },
{ "orion-universe.com", true },
{ "orioncokolada.cz", true },
+ { "oriondynamic.be", true },
{ "orionfinancialservices.com", true },
{ "oriongames.eu", true },
{ "orkestar-krizevci.hr", true },
@@ -26514,15 +27603,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "orleika.io", true },
{ "orlives.de", false },
{ "ormer.nl", true },
+ { "orocojuco.com", true },
{ "orovillelaw.com", true },
{ "orro.ro", true },
{ "orrs.de", true },
- { "ortho-graz.at", true },
+ { "orthocop.cz", true },
{ "orthodontiste-geneve-docteur-rioux.com", true },
{ "orthograph.ch", true },
{ "orthotictransfers.com", true },
{ "ortlepp.eu", true },
- { "ortodonciaian.com", true },
+ { "orum.in", true },
{ "orwell1984.today", true },
{ "oryva.com", true },
{ "orz.uno", true },
@@ -26542,19 +27632,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oscarvk.ch", true },
{ "oscloud.com", true },
{ "osepideasthatwork.org", true },
- { "osereso.tn", true },
{ "oses.mobi", true },
{ "oshayr.com", true },
{ "oshell.me", true },
+ { "oshershalom.com", true },
{ "oshrc.gov", true },
{ "osielnava.com", true },
+ { "osirisrp.online", true },
{ "oskrba.net", true },
{ "oskuro.net", true },
{ "osla.org", true },
{ "oslinux.net", true },
{ "osm.is", true },
{ "osmanlitorunu.com", true },
- { "osmestres.com", true },
{ "osmosis.org", true },
{ "osmre.gov", true },
{ "osnova.cz", true },
@@ -26562,9 +27652,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "osomjournal.org", true },
{ "ospf.sk", true },
{ "ospree.me", true },
- { "osquery.io", true },
{ "ostan-collections.net", true },
{ "osterkraenzchen.de", true },
+ { "osti.gov", true },
{ "ostimwebyazilim.com", true },
{ "ostr.io", true },
{ "osusume-houhou.com", true },
@@ -26575,17 +27665,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "osx86spain.com", true },
{ "osxentwicklerforum.de", true },
{ "oszri.hu", true },
- { "otako.pl", true },
{ "otakubox.de", true },
{ "otakurepublic.com", true },
{ "otakurumi.de", true },
- { "otakuyun.com", true },
- { "otchecker.com", true },
{ "otellio.com", true },
{ "otellio.de", true },
{ "otellio.it", true },
+ { "other98.com", true },
{ "oticasaopaulo.com.br", true },
{ "oticasvisao.net.br", true },
+ { "otoblok.com", true },
+ { "otokiralama.name.tr", true },
{ "otorrino.pt", true },
{ "otoy.com", true },
{ "otoya.space", true },
@@ -26594,7 +27684,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "otrm.de", true },
{ "otsfreestyle.jp", true },
{ "otsu.beer", true },
- { "ottoproject.io", true },
{ "ottoversand.at", true },
{ "otus-magnum.com", true },
{ "otvaracie-hodiny.sk", true },
@@ -26612,9 +27701,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ourcloud.at", true },
{ "ourdocuments.gov", true },
{ "ourevents.net", true },
+ { "ourladymountcarmel.net", true },
+ { "ourladyofcalvary.org", true },
+ { "ourladyoftheassumptionchurch.org", true },
+ { "ourladyqueenofmartyrs.org", true },
+ { "ourls.win", true },
+ { "ourmaster.org", true },
{ "ouruglyfood.com", true },
{ "ourwedding.xyz", true },
{ "ourworldindata.org", true },
+ { "out-of-scope.de", true },
{ "outdoorfurniture.ie", true },
{ "outdoorimagingportal.com", true },
{ "outdoorlightingagoura.com", true },
@@ -26641,6 +27737,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "outpostinfo.com", true },
{ "outsideconnections.com", true },
{ "outsiders.paris", true },
+ { "ovabag.com", true },
{ "ovelhaostra.com", true },
{ "overalglas.nl", true },
{ "overamsteluitgevers.nl", true },
@@ -26651,20 +27748,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "oversight.garden", true },
{ "oversight.gov", true },
{ "overstap.deals", true },
- { "overstappen.nl", true },
{ "overstemmen.nl", true },
{ "overstockpromote.com", true },
{ "overthecloud.it", true },
{ "overthinkingit.com", true },
{ "overtrolls.de", true },
- { "overture.london", true },
{ "overwall.org", true },
{ "overzicht.pro", true },
{ "overzicht.ws", true },
+ { "oveweddings.com", true },
{ "ovirt.org", true },
{ "ovix.co", true },
{ "ovnrain.com", true },
- { "ovpn.to", true },
{ "ovvy.net", false },
{ "owapi.net", true },
{ "owennelson.co.uk", true },
@@ -26677,15 +27772,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "owlishmedia.com", true },
{ "own3d.ch", true },
{ "ownc.at", true },
+ { "owncloud.ch", true },
{ "ownmay.com", true },
{ "oxborrow.ca", true },
{ "oxelie.com", true },
{ "oxo.cloud", true },
- { "oxygaming.com", true },
{ "oxygin.net", true },
{ "oxytocin.org", true },
+ { "oxzeth3sboard.com", true },
+ { "oyashirosama.tokyo", true },
{ "oyosoft.fr", true },
{ "oyosoft.net", true },
+ { "oysterworldwide.com", true },
{ "ozark.be", true },
{ "oznamovacipovinnost.cz", true },
{ "ozonitron.com", true },
@@ -26699,9 +27797,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "p-s-b.com", true },
{ "p-t.io", true },
{ "p.ki", true },
- { "p1984.nl", false },
{ "p1cn.com", true },
{ "p1ratrulezzz.me", true },
+ { "p22.co", true },
{ "p4chivtac.com", true },
{ "p5118.com", true },
{ "p5r.uk", true },
@@ -26709,12 +27807,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pa.search.yahoo.com", false },
{ "paarberatung-hn.de", true },
{ "paardenhulp.nl", true },
+ { "paardensportbak.nl", true },
{ "paas-inf.net", true },
{ "paass.net", true },
{ "paazmaya.fi", true },
+ { "pablofain.com", true },
{ "pabuzo.vn", true },
+ { "pacaom.com", true },
{ "pacatlantic.com", true },
{ "pacco.com.br", true },
+ { "paccolat.name", true },
{ "pace.car", true },
{ "paceda.nl", true },
{ "pacelink.de", true },
@@ -26732,16 +27834,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "packagingproject.management", true },
{ "packagist.org", false },
{ "packaware.com", true },
- { "packer.io", true },
{ "packetdigital.com", true },
{ "packetlinux.com", true },
- { "packshot-creator.com", true },
{ "pact2017.nl", true },
{ "pactf.com", true },
{ "padam-group.com", true },
{ "padberx-marketing-consultants.de", true },
{ "paddy.rocks", true },
{ "padianda.com", true },
+ { "padkit.org", true },
+ { "padovani.de", true },
{ "padpilot.co", true },
{ "padrepio.in", true },
{ "padron.com.es", true },
@@ -26759,13 +27861,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pageantsnews.com", false },
{ "pagedesignhub.com", true },
{ "pagedesignpro.com", true },
- { "pagedesignshop.com", true },
{ "pagedesignweb.com", true },
{ "pagefulloflies.io", true },
{ "pageperform.com", true },
{ "pagewizz.com", true },
{ "pagiamtzis.com", true },
{ "pagina.com.mx", true },
+ { "paginaweb4u.com", true },
{ "pagure.io", true },
{ "pahae.de", true },
{ "pahealthbilling.com", true },
@@ -26792,17 +27894,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "paktolos.net", true },
{ "palabr.as", true },
{ "palapadev.com", true },
+ { "palariviera.com", true },
+ { "palary.work", true },
{ "palatin.at", true },
{ "palava.tv", true },
{ "palavatv.com", true },
{ "palazzo.link", true },
{ "palazzo.work", true },
- { "paleolowcarb.de", true },
{ "paleotraining.com", true },
+ { "palestra.roma.it", true },
{ "palladium46.com", true },
{ "pallas.in", true },
{ "palletflow.com", true },
{ "palli.ch", true },
+ { "palmaprop.com", true },
{ "palmavile.us", true },
{ "palmaville.com", true },
{ "palmen-apotheke.de", true },
@@ -26817,26 +27922,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "panascais.eu", true },
{ "panascais.host", true },
{ "panascais.me", true },
- { "panascais.net", true },
{ "panascais.pw", true },
{ "panascais.site", true },
{ "panascais.tech", true },
{ "panascais.us", true },
+ { "panasproducciones.com", true },
{ "panaxis.biz", true },
{ "panaxis.ch", true },
{ "panaxis.li", true },
{ "panda-community.com", true },
{ "panda.tf", true },
{ "pandemicflu.gov", true },
+ { "pandkonijn.nl", true },
{ "pandoraflora.com", true },
{ "pandymic.com", true },
{ "paneldewelopera.pl", true },
{ "paneu.de", true },
+ { "panhandlemenshealth.com", true },
{ "panic.tk", true },
{ "panier-legumes.bio", true },
+ { "paniyanovska.ua", true },
{ "panj.ws", true },
- { "panjee.fr", true },
- { "panlex.org", true },
+ { "panjiva.com", true },
{ "panmetro.com", true },
{ "panoma.de", true },
{ "panomizer.de", true },
@@ -26856,6 +27963,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pantou.org", false },
{ "pants-off.xyz", true },
{ "panzer72.ru", true },
+ { "panzerscreen.dk", true },
{ "pap.la", false },
{ "papa-webzeit.de", true },
{ "papadopoulos.me", true },
@@ -26872,13 +27980,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "paperlesssolutionsltd.com.ng", true },
{ "papertracker.net", true },
{ "paperturn.com", true },
- { "paperwallets.io", true },
{ "paperwritinghelp.net", true },
{ "papiermakerijdehoop.nl", true },
{ "papiermeteenverhaal.nl", true },
{ "papierniczy.eu", true },
{ "papillon-events.be", true },
- { "papotage.net", true },
+ { "papion.it", true },
{ "paprikas.fr", true },
{ "paraborsa.net", true },
{ "parachute70.com", true },
@@ -26887,6 +27994,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "paradigi.com.br", true },
{ "paradise-engineer.com", true },
{ "paradise-engineering.com", true },
+ { "paradise-travel.net", true },
{ "paradiselost.com", true },
{ "paradoxdesigns.org", true },
{ "paragonie.com", false },
@@ -26895,11 +28003,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "paranoidcrypto.com", true },
{ "paranoidmode.com", true },
{ "paranoidpenguin.net", true },
- { "paranoxer.hu", true },
{ "parasitologyclub.org", true },
{ "paratlan.hu", true },
{ "paratxt.org", true },
- { "parcelbroker.co.uk", true },
+ { "parcelbroker.co.uk", false },
{ "parchcraftaustralia.com", true },
{ "parckwart.de", true },
{ "parcon.it", true },
@@ -26909,6 +28016,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "parentsintouch.co.uk", true },
{ "pariga.co.uk", true },
{ "paris-store.com", true },
+ { "parisackerman.com", true },
{ "parisbloom.com", true },
{ "parisderriere.fr", true },
{ "parisescortgirls.com", true },
@@ -26939,17 +28047,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "parsemail.org", true },
{ "parser.nu", true },
{ "parsonsfamilyhomes.com", true },
- { "partage.ovh", true },
{ "partecipa.tn.it", true },
- { "parthkolekar.me", true },
- { "participatorybudgeting.de", true },
- { "participatorybudgeting.info", true },
{ "partijhandel.website", true },
{ "partijtjevoordevrijheid.nl", false },
{ "partiono.com", true },
- { "partiwatch.com", true },
{ "partner.sh", true },
- { "partnerbeam.com", true },
{ "partnermobil.de", true },
{ "partnersfcu.org", true },
{ "partou.de", true },
@@ -27007,6 +28109,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "passionatefoodie.co.uk", true },
{ "passionatehorsemanship.com", true },
{ "passionatelife.com.au", true },
+ { "passionbyd.com", true },
{ "passionebenessere.com", true },
{ "passionpictures.eu", true },
{ "passions-art.com", true },
@@ -27021,6 +28124,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "passrhce.com", true },
{ "passrhcsa.com", true },
{ "passthepopcorn.me", true },
+ { "passumpsicbank.com", true },
{ "passvanille-reservation.fr", true },
{ "passvau.lt", true },
{ "passwd.one", true },
@@ -27035,11 +28139,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "passwordsecurity.info", true },
{ "passworks.io", true },
{ "passy.pw", true },
+ { "pasta-factory.co.il", true },
{ "pastaenprosecco.nl", true },
{ "paste.fedoraproject.org", true },
{ "paste.gg", true },
{ "paste.to", true },
{ "pastebin.co.za", true },
+ { "pastebin.tw", true },
{ "pasteblin.com", true },
{ "pasternok.org", true },
{ "pasticcerialorenzetti.com", true },
@@ -27063,18 +28169,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "patika-biztositas.hu", true },
{ "patikabiztositas.hu", true },
{ "patouille-et-gribouille.fr", true },
- { "patric-lenhart.de", true },
{ "patrick-othmer.de", true },
{ "patrick-robrecht.de", true },
+ { "patrick21.ch", true },
{ "patrickaudley.ca", true },
{ "patrickaudley.com", true },
{ "patrickbrosi.de", true },
+ { "patrickhoefler.net", true },
{ "patricklynch.xyz", true },
{ "patrickmcnamara.xyz", true },
{ "patrikgarten.de", true },
{ "patriksima.cz", true },
{ "patriksimek.cz", true },
{ "patriotstationatchalfont.com", true },
+ { "patrocinio.com.br", true },
{ "patrz.eu", true },
{ "patsch-photography.de", true },
{ "patsyforyou.ch", true },
@@ -27085,16 +28193,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "paudley.ca", true },
{ "paudley.com", true },
{ "paudley.org", true },
+ { "paul-barton.co.uk", true },
{ "paul-bronski.de", true },
{ "paul.reviews", true },
{ "pauladamsmith.com", true },
{ "paulbakaus.com", true },
{ "paulbdelaat.nl", true },
{ "paulbramhall.uk", true },
+ { "paulchen.at", true },
{ "paulcooper.me.uk", true },
{ "pauldev.co", true },
{ "paulerhof.com", true },
{ "paulewen.ca", true },
+ { "paulgerberrealtors.com", true },
{ "paulinewesterman.nl", true },
{ "paulmeier.com", false },
{ "paulomonteiro.pt", true },
@@ -27106,6 +28217,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "paulrotter.de", true },
{ "paulschreiber.com", true },
{ "paulscustomauto.com", true },
+ { "paulsnar.lv", true },
{ "paulswartz.net", true },
{ "paulus-foto.pl", true },
{ "paulward.net", true },
@@ -27118,6 +28230,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pavando.com", true },
{ "pavelfojt.cz", true },
{ "pavelrebrov.com", true },
+ { "pavelstriz.cz", true },
{ "pavio.org", true },
{ "paw.cloud", true },
{ "paw.pt", true },
@@ -27134,6 +28247,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "payboy.biz", true },
{ "payboy.rocks", true },
{ "paybro.eu", true },
+ { "payexpresse.com", true },
{ "payfazz.com", true },
{ "paylike.io", true },
{ "payloc.io", true },
@@ -27146,11 +28260,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "paymill.de", true },
{ "paynet.com.co", true },
{ "payoff.com", true },
- { "paypal.com", false },
+ { "paypal.com", true },
{ "paypaq.com", true },
{ "paypro.nl", false },
{ "payroll.xero.com", false },
{ "payrollhive.com", true },
+ { "paysbuy.net", true },
{ "paysera.com", true },
{ "payslipview.com", true },
{ "payssaintgilles.fr", true },
@@ -27159,7 +28274,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "paytonmoledor.com", true },
{ "payupay.ru", true },
{ "payzang.com", true },
- { "payzwin.com", true },
{ "pb.ax", false },
{ "pback.se", true },
{ "pbosquet.com", true },
@@ -27172,7 +28286,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pbz.im", true },
{ "pc-rescue.me", false },
{ "pc-servis-brno.com", true },
- { "pcbricole.fr", true },
{ "pccentral.nl", true },
{ "pcdocjim.com", true },
{ "pcel.com", true },
@@ -27189,11 +28302,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pclaeuft.de", true },
{ "pclob.gov", true },
{ "pcloud.com", true },
- { "pcmedia.co.nz", true },
{ "pcmkrembangan.or.id", true },
{ "pcmr.info", true },
{ "pcnotdienst-oldenburg-rastede.de", true },
+ { "pcreparatiehardenberg.nl", true },
{ "pcrypt.org", true },
+ { "pcs2.gr", true },
{ "pcsetting.com", true },
{ "pctonic.net", true },
{ "pctrouble.net", true },
@@ -27224,14 +28338,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pearlsenroses.nl", true },
{ "pearlsonly.com", true },
{ "peaudorange.net", true },
+ { "pebbleparents.com", true },
{ "pebbles.net.in", true },
{ "pecker-johnson.com", true },
{ "peda.net", true },
+ { "peddock.com", true },
{ "peddy.dyndns.org", true },
{ "pedicurean.nl", true },
{ "pedicureduiven.nl", true },
{ "pedidamanosevilla.com", true },
{ "pedikura-vitu.cz", true },
+ { "pedimanie.cz", true },
{ "pedimoda.com.br", true },
{ "pedro.com.es", true },
{ "pedrosaurus.com", true },
@@ -27239,7 +28356,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pedroventura.com", false },
{ "peeekaaabooo.com", true },
{ "peekier.com", true },
- { "peen.ch", true },
{ "peep.gq", true },
{ "peercraft.at", true },
{ "peercraft.be", true },
@@ -27265,6 +28381,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "peerigon.com", true },
{ "peername.com", true },
{ "peernode.net", true },
+ { "peertube.social", true },
{ "peervpn.net", true },
{ "peerweb.com", true },
{ "peetah.com", true },
@@ -27275,6 +28392,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "peifi.de", false },
{ "peippo.at", true },
{ "peka.pw", true },
+ { "pekarstvivetvrzi.cz", true },
{ "pekkapleppanen.fi", true },
{ "pekoe.se", true },
{ "pelanucto.cz", true },
@@ -27304,6 +28422,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pensador.com", true },
{ "pensador.info", true },
{ "pensioenfonds-ey.nl", true },
+ { "pension-am-alten-waschhaus.de", true },
{ "pensionpilot.ca", true },
{ "penslabyrinth.com", true },
{ "pentandra.com", true },
@@ -27315,13 +28434,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pentoo.ch", true },
{ "penz.media", true },
{ "peoplelikemeapp.com", true },
- { "peoplesbankal.com", true },
{ "peoplesdecade.org", true },
{ "peoplesguardian.org", true },
+ { "pepeelektro.sk", true },
{ "pepemodelismo.com.br", true },
+ { "peperstraat.online", true },
{ "peplog.nl", true },
{ "pepwaterproofing.com", true },
- { "pequenosfavoritos.com.br", true },
+ { "pequenosfavoritos.com.br", false },
{ "per-olsson.se", true },
{ "pera.gs", true },
{ "perala.me", true },
@@ -27329,6 +28449,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "percolate.com", true },
{ "percraft.com", true },
{ "percy.io", true },
+ { "percyflix.com", true },
{ "perd.re", true },
{ "perecraft.com", true },
{ "perezdecastro.org", true },
@@ -27340,7 +28461,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "perfektesgewicht.com", true },
{ "perfektesgewicht.de", true },
{ "performancehealth.com", true },
- { "performancesantafe.org", true },
+ { "performing-art-schools.com", true },
{ "perfumeaz.com", true },
{ "perfumes.com.br", true },
{ "periscope.tv", true },
@@ -27379,14 +28500,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pescco.com.br", true },
{ "pestici.de", true },
{ "pestkill.info", true },
- { "pesyun.cn", true },
{ "pet-hotel-mura.net", true },
+ { "pet-life.top", true },
+ { "pet-tekk.co.uk", true },
{ "petabits.de", true },
{ "petalkr.com", true },
{ "petbooking.it", true },
{ "petcarvers.com", true },
{ "petdesign.pet", true },
- { "peteboc.com", true },
{ "petech.ro", true },
{ "petelew.is", true },
{ "peter.org.ua", true },
@@ -27400,8 +28521,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "peterhuetz.com", true },
{ "peterjin.org", true },
{ "peterjohnson.io", true },
- { "peterkshultz.com", false },
{ "peterlew.is", true },
+ { "petermaar.com", true },
{ "petersontoscano.com", true },
{ "petervanleeuwentweewielers.nl", true },
{ "petfa.ga", true },
@@ -27418,10 +28539,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "petpost.co.nz", false },
{ "petpower.eu", true },
{ "petr.as", true },
+ { "petrachuk.ru", true },
{ "petrasestakova.cz", true },
+ { "petravdbos.nl", true },
{ "petresort.pt", true },
+ { "petroleum-schools.com", true },
{ "petroscand.eu", true },
{ "petrostathis.com", true },
+ { "petrotranz.com", true },
{ "petrpikora.com", true },
{ "petrucciresidential.com", true },
{ "petruzz.net", true },
@@ -27440,6 +28565,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pfarre-kremsmuenster.at", true },
{ "pfcafeen.dk", true },
{ "pfd-nz.com", false },
+ { "pfefferkuchen-shop.de", true },
+ { "pfefferkuchenprinzessin-dresden.de", true },
{ "pferdekauf.de", true },
{ "pfeuffer-elektro.de", true },
{ "pfft.net", true },
@@ -27471,6 +28598,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pharmaabsoluta.com.br", true },
{ "pharmaboard.de", true },
{ "pharmaboard.org", true },
+ { "pharmacie-fr.org", true },
{ "pharmacieplusfm.ch", true },
{ "pharmafoto.ch", true },
{ "pharmaphoto.ch", true },
@@ -27481,7 +28609,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pharmica.uk", true },
{ "pharside.dyndns.org", true },
{ "pharynks.com", true },
- { "pharynx.nl", true },
{ "phasersec.com", false },
{ "phasme-2016.com", true },
{ "phattea.tk", true },
@@ -27489,8 +28616,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "phcimages.com", true },
{ "phcorner.net", true },
{ "phdhub.it", true },
+ { "phelx.de", true },
{ "phenixairsoft.com", true },
{ "phget.com", true },
+ { "phhtc.ir", true },
{ "phi-works.com", true },
{ "phialo.de", true },
{ "phil-dirt.com", true },
@@ -27500,6 +28629,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "phileas-psychiatrie.be", true },
{ "philia-sa.com", true },
{ "philipdb.com", true },
+ { "philipdb.nl", true },
{ "philipkohn.com", true },
{ "philipp-trulson.de", true },
{ "philipp-winkler.de", true },
@@ -27512,22 +28642,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "philippheenen.de", true },
{ "philippinedroneassociation.org", true },
{ "philippkeschl.at", true },
+ { "philipssupportforum.com", true },
+ { "philipzhan.tk", true },
{ "phillipgoldfarb.com", true },
{ "phillyinjurylawyer.com", true },
{ "philna.sh", true },
{ "philosoftware.com.br", true },
{ "philosopherswool.com", true },
+ { "philosophy-colleges.com", true },
+ { "philosophy.moe", true },
{ "philosophyguides.org", true },
{ "philphonic.de", true },
+ { "philslab.cloud", true },
{ "philslab.ninja", true },
{ "philsown.de", true },
{ "philsturgeon.uk", true },
{ "philux.ch", true },
{ "phishing-studie.org", true },
+ { "phishing.rs", false },
{ "phishingusertraining.com", true },
{ "phligence.com", true },
{ "phocean.net", true },
- { "phoenixlogan.com", true },
+ { "phoenics.de", true },
+ { "phoenixurbanspaces.com", true },
{ "phone-service-center.de", true },
{ "phonix-company.fr", true },
{ "phormance.com", true },
@@ -27539,7 +28676,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "photistic.org", true },
{ "photo-livesearch.com", true },
{ "photo-paysage.com", true },
- { "photo.org.il", true },
{ "photoancestry.com", true },
{ "photoartelle.com", true },
{ "photodeal.fr", true },
@@ -27549,6 +28685,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "photolium.net", true },
{ "photomodelcasting.com", true },
{ "photon.sh", true },
+ { "photosquare.com.tw", true },
{ "phototravel.uk", true },
{ "phototrio.com", true },
{ "phoxmeh.com", true },
@@ -27557,18 +28694,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "phparcade.com", true },
{ "phpartners.org", true },
{ "phpbbchinese.com", true },
- { "phpdistribution.com", true },
{ "phpdorset.co.uk", true },
+ { "phpinfo.in.th", true },
{ "phpkari.cz", true },
{ "phpliteadmin.org", true },
{ "phpmyadmin.net", true },
+ { "phpower.com", true },
{ "phpprime.com", true },
{ "phpsecure.info", true },
{ "phpunit.de", true },
{ "phra.gs", true },
{ "phrive.space", true },
{ "phryanjr.com", false },
- { "phryneas.de", true },
{ "phuket-idc.com", true },
{ "phuket-idc.de", true },
{ "phunehehe.net", true },
@@ -27579,6 +28716,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "physicalist.com", true },
{ "physicaltherapist.com", false },
{ "physicpezeshki.com", true },
+ { "physics-schools.com", true },
{ "physiotherapie-seiwald.de", true },
{ "physiovesenaz.ch", true },
{ "pi-box.ml", true },
@@ -27593,7 +28731,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pianyigou.com", true },
{ "piatabrasil.com.br", true },
{ "piboubes.me", true },
- { "pic.gov", true },
+ { "pic.gov", false },
{ "pic.sr", true },
{ "pic2map.com", true },
{ "picchietti.io", true },
@@ -27601,7 +28739,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "piccolo-parties.co.uk", true },
{ "pickabrain.fr", true },
{ "pickelhaubes.com", true },
- { "pickersurvey.org", true },
{ "pickme.nl", false },
{ "pickmysoap.gr", true },
{ "pickormix.co.uk", true },
@@ -27623,14 +28760,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pidjipi.com", true },
{ "pie-express.xxx", true },
{ "pieces-or.com", true },
- { "pieinsurance.com", true },
{ "piekacz.eu.org", true },
{ "piekacz.net", true },
{ "piekacz.tel", true },
{ "pieland.eu", true },
{ "pieldenaranja.com", true },
{ "piem.org", true },
- { "pieperhome.de", true },
{ "pieq.eu", true },
{ "pieq.eu.org", true },
{ "pier28.com", true },
@@ -27640,11 +28775,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pierre-schmitz.com", true },
{ "pierrefv.com", true },
{ "pierreprinetti.com", true },
+ { "pierrickdeniel.fr", true },
{ "pietechsf.com", true },
{ "pieterbos.nl", true },
{ "pieterhordijk.com", true },
{ "pietermaene.be", false },
{ "pietz.uk", true },
+ { "pigritia.de", true },
{ "pigs.pictures", true },
{ "pijuice.com", true },
{ "pik.bzh", true },
@@ -27657,6 +28794,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pileofgarbage.net", true },
{ "piliszek.net", true },
{ "pill.id", true },
+ { "pilot-colleges.com", true },
{ "pilot.co", true },
{ "pilotgrowth.com", true },
{ "pilsoncontracting.com", true },
@@ -27670,22 +28808,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pindanutjes.be", false },
{ "pinemountainnursery.com.au", true },
{ "pinemountbaptistchurch.org", true },
+ { "pinetopazrealestate.com", true },
{ "pingworks.com", true },
{ "pingworks.de", true },
{ "pingworks.eu", true },
{ "pingworks.net", true },
{ "pinhadigital.com", true },
- { "pinigseu.xyz", true },
{ "pinimg.com", true },
{ "pinkapple.com", true },
{ "pinkbike.com", true },
{ "pinkbikecycle.com", true },
- { "pinkcasino.co.uk", true },
{ "pinkerton.io", true },
{ "pinkladyapples.co.uk", true },
{ "pinklecfest.org", true },
{ "pinklittlenotebook.com", true },
{ "pinkwalk.co.nz", true },
+ { "pinkyf.com", false },
{ "pinkylam.me", true },
{ "pinnacleallergy.net", true },
{ "pinnaclelife.co.nz", true },
@@ -27696,7 +28834,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pinoyonlinetv.com", true },
{ "pinoytech.ph", true },
{ "pinpayments.com", true },
- { "pinpointengineer.co.uk", true },
{ "pinscher.com.br", true },
{ "pinskupakki.fi", true },
{ "pinterest.at", true },
@@ -27707,12 +28844,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pinterest.ie", true },
{ "pinterest.info", true },
{ "pinterest.jp", true },
+ { "pintosbeeremovals.co.za", true },
{ "pintosplumbing.co.za", true },
{ "pioneer-car.eu", true },
{ "pioneer-rus.ru", true },
{ "pipenny.net", true },
{ "pipocao.com", true },
- { "piranil.com", true },
{ "pirate.trade", true },
{ "piratebayproxy.tf", true },
{ "piraten-basel.ch", true },
@@ -27731,6 +28868,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pirateproxy.tf", true },
{ "pirateproxy.tv", true },
{ "pirates-comic.com", true },
+ { "pirates.click", true },
{ "piratesforums.co", true },
{ "pircher.co.uk", true },
{ "pires.ovh", true },
@@ -27744,17 +28882,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pissblau.com", true },
{ "pissflaps.co.uk", true },
{ "pisupp.ly", true },
+ { "pitbullsecuritysolutions.ca", true },
{ "pitchpinecapital.com", true },
{ "pitchupp.com", true },
{ "pitfire.io", true },
{ "pitot-rs.org", true },
{ "pittmantraffic.co.uk", true },
+ { "pivniraj.com", true },
{ "pivotaltracker.com", true },
{ "pivotanimation.org", true },
{ "piwko.co", true },
{ "pix5.de", true },
{ "pixabay.com", true },
{ "pixe2019.org", true },
+ { "pixel-kraft.de", true },
{ "pixel.facebook.com", false },
{ "pixel.google.com", true },
{ "pixelbash.de", true },
@@ -27762,7 +28903,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pixelfou.com", true },
{ "pixelminers.net", true },
{ "pixelpirat.ch", true },
- { "pixelrain.info", true },
{ "pixelsquared.us", true },
{ "pixelurbia.com", true },
{ "pixelution.at", true },
@@ -27770,8 +28910,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pixiv.cat", true },
{ "pixiv.moe", true },
{ "pixivimg.me", true },
+ { "pixlfox.com", true },
{ "pixloc.fr", true },
- { "pizala.de", true },
{ "pizza-show.fr", true },
{ "pizzabesteld.nl", true },
{ "pizzabottle.com", false },
@@ -27785,7 +28925,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pizzeria-mehrhoog.de", true },
{ "pizzeriaamadeus.hr", true },
{ "pizzeriacolore.com", true },
- { "pj009.com", true },
{ "pj539999.com", true },
{ "pjentertainments.co.uk", true },
{ "pjili.com", true },
@@ -27799,7 +28938,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pkisolutions.com", true },
{ "pkov.cz", true },
{ "pkphotobooths.co.uk", true },
- { "pksps.com", true },
{ "pl-cours.ch", true },
{ "pl.search.yahoo.com", false },
{ "placasonline.com.br", true },
@@ -27815,10 +28953,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "plainbulktshirts.co.za", true },
{ "plainjs.com", true },
{ "plainmark.com", true },
- { "plaintech.net.au", true },
- { "plaintray.com", true },
{ "plaisirdumouvement.com", true },
{ "plan-immobilier.fr", true },
+ { "plan-it-events.de", true },
{ "planboardapp.com", true },
{ "planecon.nz", true },
{ "planer.me", true },
@@ -27828,7 +28965,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "planetanim.fr", true },
{ "planetasuboficial.com.br", true },
{ "planetau2.com", true },
- { "planetbeauty.com", true },
{ "planetbreath.ch", true },
{ "planete-cocoon.com", false },
{ "planete-lira.fr", true },
@@ -27838,7 +28974,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "planetofthegames.tv", true },
{ "planetromeofoundation.org", true },
{ "planetsoftware.com.au", true },
- { "planformation.com", true },
{ "planify.io", true },
{ "planitz.com", true },
{ "planitz.net", true },
@@ -27859,16 +28994,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "plantrustler.com", true },
{ "planujemywesele.pl", true },
{ "planup.fr", true },
+ { "planview.com", true },
{ "plaque-funeraire.fr", true },
{ "plassmann.ws", true },
- { "plasti-pac.ch", true },
{ "plasticsurgeryartist.com", true },
{ "plasticsurgerynola.com", true },
- { "plasticsurgeryservices.com", true },
+ { "plastiflex.it", true },
{ "plastovelehatko.cz", true },
+ { "plateformecandidature.com", true },
{ "platformadmin.com", true },
{ "platinumexpress.com.ar", true },
- { "platinumpeek.com", true },
{ "platomania.nl", true },
{ "platschi.net", true },
{ "platten-nach-mass.de", true },
@@ -27880,12 +29015,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "play.google.com", true },
{ "playanka.com", true },
{ "playawaycastles.co.uk", true },
+ { "playcollect.net", true },
{ "playdaysparties.co.uk", true },
{ "playerdb.co", true },
{ "playerscout.net", true },
{ "playform.cloud", true },
{ "playhappywheelsunblocked.com", true },
- { "playkinder.com", true },
{ "playnation.io", true },
{ "playocean.net", true },
{ "playpirates.com", true },
@@ -27900,7 +29035,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "plazasummerlin.com", true },
{ "pld-entertainment.co.uk", true },
{ "pldx.org", true },
- { "pleaseuseansnisupportedbrowser.ml", true },
+ { "pleasure-science.com", true },
{ "plegro.com", true },
{ "pleiades.com.tr", true },
{ "pleier-it.de", false },
@@ -27937,13 +29072,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "plumber-in-sandton.co.za", true },
{ "plumbermountedgecombe.co.za", true },
{ "plumberumhlangarocks.co.za", true },
+ { "plumbingandheatingspecialistnw.com", true },
{ "plumbingbenoni.co.za", true },
{ "plumbingcentral.com.au", true },
{ "plumbingglenvista.co.za", true },
{ "plumlocosoft.com", true },
{ "plumnet.ch", true },
{ "plumpie.net", false },
- { "plumplat.com", true },
{ "plur.com.au", true },
{ "plural.cafe", true },
{ "plurr.me", true },
@@ -27956,12 +29091,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "plusstreamfeed.appspot.com", true },
{ "plustech.id", true },
{ "pluta.net", true },
+ { "pluth.org", true },
{ "plutiedev.com", true },
{ "pluto.life", true },
{ "plutokorea.com", true },
{ "plutopia.ch", true },
{ "plymouthbouncycastles.co.uk", true },
- { "plymouthglassgallery.com", true },
{ "plzdontpwn.me", true },
{ "plzenskybarcamp.cz", true },
{ "plzh4x.me", true },
@@ -27978,7 +29113,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pmarques.info", true },
{ "pmartin.tech", true },
{ "pmbc.org", true },
- { "pmbtf.com", true },
{ "pmconference.ch", true },
{ "pmf.gov", true },
{ "pmg-offshore-company.com", true },
@@ -28008,6 +29142,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pnut.io", false },
{ "po.net", true },
{ "poba.fr", true },
+ { "poc88.com", true },
{ "pocakking.tk", true },
{ "pocatellonissanparts.com", true },
{ "pochaneko.com", true },
@@ -28020,14 +29155,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "podroof.com.au", true },
{ "podshrink.de", true },
{ "poe.digital", true },
- { "poed.com.au", true },
{ "poed.net.au", true },
{ "poedgirl.com", true },
- { "poeg.cz", true },
{ "poezja.com.pl", true },
{ "poezjagala.pl", true },
{ "poffenhouse.ddns.net", true },
{ "pogera.com", true },
+ { "pogetback.pl", true },
{ "pogrebisky.net", true },
{ "pohlednice-tap.cz", true },
{ "pohlmann.io", true },
@@ -28053,11 +29187,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pokl.cz", true },
{ "polaire.org", true },
{ "polanda.com", true },
- { "polar-baer.com", true },
{ "polar.uk.com", true },
{ "pole-emotion.ch", true },
{ "poleacademie.com", true },
{ "poles4pilots.com", true },
+ { "police-schools.com", true },
{ "policedriver.com", true },
{ "policereferencecheck.com", true },
{ "policesromandesrecrutement.ch", true },
@@ -28080,6 +29214,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "polit.im", true },
{ "politic.org.ua", true },
{ "politicachubut.com.ar", true },
+ { "political-science-schools.com", true },
{ "politiezoneriho.be", true },
{ "politik-bei-uns.de", true },
{ "polizeiwallis.ch", true },
@@ -28100,6 +29235,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "polyfill.io", true },
{ "polyfluoroltd.com", false },
{ "polygamer.net", true },
+ { "polygraphi.ae", true },
{ "polymake.org", true },
{ "polymathematician.com", true },
{ "polymorph.rs", true },
@@ -28129,13 +29265,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ponydesignclub.nl", true },
{ "ponyfoo.com", true },
{ "ponzi.life", true },
+ { "poodleassassin.com", true },
{ "poodlefan.net", true },
{ "pookl.com", true },
+ { "poolsafely.gov", true },
+ { "poolsafety.gov", true },
{ "poolspondsandwaterscapes.com", true },
+ { "pooltools.net", true },
{ "poolvilla-margarita.net", false },
{ "poon.io", true },
{ "poopjournal.rocks", true },
{ "poopr.ru", true },
+ { "poorclarepa.org", true },
{ "pop-corn.ro", true },
{ "pop3.jp", true },
{ "popcat.ru", true },
@@ -28153,13 +29294,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "porg.es", true },
{ "pork.org.uk", true },
{ "porkel.de", true },
+ { "pornbay.eu", true },
+ { "porncandi.com", true },
+ { "porndragon.net", true },
{ "pornfacefinder.com", false },
+ { "pornflare.net", true },
+ { "porngay.co", true },
{ "pornhubhd.biz", true },
{ "porniwi.com", true },
{ "pornloupe.com", true },
+ { "pornmega.net", true },
{ "pornofilmovi.us", true },
{ "pornomens.be", true },
+ { "pornovk.xxx", true },
+ { "pornshop.biz", true },
{ "pornspider.to", true },
+ { "pornstop.net", true },
+ { "pornsuper.net", true },
+ { "porny.xyz", true },
{ "porpcr.com", true },
{ "pors-sw.cz", true },
{ "port.im", true },
@@ -28174,7 +29326,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "portalcarriers.com", true },
{ "portalcentric.net", true },
{ "portalkla.com.br", true },
- { "portalzine.de", true },
{ "portamiinpista.it", true },
{ "porte.roma.it", true },
{ "portercup.com", true },
@@ -28183,7 +29334,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "portofala.pt", true },
{ "portofrotterdam.com", false },
{ "portosonline.pl", true },
- { "portraitsystem.biz", true },
{ "portsdebalears.gob.es", true },
{ "portsmouthbouncycastles.co.uk", true },
{ "portsmoutheic.com", true },
@@ -28195,6 +29345,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "posalji.me", true },
{ "posaunenchor-senden.de", true },
{ "posbank.co.uk", true },
+ { "poschtiliste.ch", true },
{ "poseidonwaterproofing.com", true },
{ "poshcastles.co.uk", true },
{ "poshlashes.se", true },
@@ -28209,7 +29360,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "post-darwinism.com", true },
{ "post.com.ar", true },
{ "post.io", true },
- { "post.we.bs", true },
{ "postal.dk", true },
{ "postal3.es", true },
{ "postblue.info", true },
@@ -28227,34 +29377,35 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "postn.eu", true },
{ "postpot.co.kr", true },
{ "posttigo.com", true },
+ { "postura-corretta.it", true },
{ "posyperfume.com", true },
{ "potatiz.com", true },
{ "potatofrom.space", true },
{ "potatopro.com", true },
- { "potatron.tech", true },
{ "potature.rimini.it", true },
{ "potature.roma.it", true },
{ "potentialproject.com", false },
{ "potenzprobleme-info.net", true },
+ { "poterepersonale.it", true },
{ "pothe.com", true },
{ "pothe.de", true },
{ "potolok.am", true },
{ "potrillionaires.com", true },
{ "potterscraftcider.com", true },
{ "pottersheartministry.org", true },
- { "pottreid.com", true },
+ { "pottreid.com", false },
{ "pottshome.co.uk", true },
{ "potworowski.de", true },
{ "potzwonen.nl", true },
{ "poudlard.fr", true },
{ "poundwholesale.co.uk", true },
- { "pour-la-culture-aulnay.fr", true },
{ "pourlesenfants.info", true },
{ "pouwels-oss.nl", true },
{ "povareschka.ru", true },
{ "povesham.tk", true },
{ "pow-s.com", true },
{ "pow.jp", true },
+ { "powdersnow.top", true },
{ "powelljones.co.uk", true },
{ "power-coonies.de", true },
{ "power-fit.org", true },
@@ -28262,11 +29413,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "power-meter.cc", true },
{ "power-tools24.com", true },
{ "powerball.shop", true },
+ { "powerblanket.com", true },
{ "powercloud.technology", true },
- { "powerdent.net.br", true },
{ "poweredbyiris.nl", true },
{ "poweredbypurdy.com", true },
{ "powerfortunes.com", true },
+ { "powerinboxperformance.com", true },
{ "powermatic7.com", true },
{ "powermeter.at", true },
{ "powermint.de", true },
@@ -28278,11 +29430,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "powersergthisisthetunnelfuckyouscott.com", true },
{ "powersergthisisthewebsitefuckyouscott.com", true },
{ "powersergusercontent.com", true },
- { "powertothebuilder.com", true },
{ "powerwellness-korecki.de", true },
{ "pozemedicale.org", true },
{ "pozlife.net", true },
- { "pozzitiv.ro", true },
{ "pp-server.com", true },
{ "pp3345.net", true },
{ "ppcrestaurants.com", true },
@@ -28292,8 +29442,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ppmathis.ch", true },
{ "ppmathis.com", true },
{ "ppmoon.com", true },
- { "ppoou.co.uk", true },
- { "ppoozl.com", true },
{ "ppro.com", true },
{ "pptavmdata.org", true },
{ "ppy.la", true },
@@ -28302,9 +29450,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pr1sm.com", true },
{ "pr2studio.com", true },
{ "prac.to", true },
+ { "pracevjihlave.cz", true },
{ "pracowniatkanin.com", true },
+ { "practicallabs.com", true },
+ { "practicalprogrammer.tech", true },
{ "practiceflow.nl", true },
{ "practicepanther.com", true },
+ { "practisforms.com", true },
{ "practo.com", true },
{ "prado.it", true },
{ "praeparation-keppner.de", true },
@@ -28313,17 +29465,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "praguepsychology.com", true },
{ "praguepsychology.cz", true },
{ "pragueswim.cz", true },
+ { "praha-9.eu", true },
{ "prajwalkoirala.com", true },
{ "prakhar.uk", true },
{ "prakharprasad.com", true },
{ "praktijkdevecht.nl", true },
{ "praktijkpassepartout.nl", true },
+ { "prashchar.uk", true },
+ { "pratopronto.org", true },
+ { "pratorotoli.it", true },
{ "praxino.de", true },
{ "praxis-dingeldey.de", true },
{ "praxis-familienglueck.de", true },
{ "praxis-odermath.de", true },
{ "prayerrequest.com", true },
- { "prazeresdavida.com.br", true },
{ "prazynka.pl", true },
{ "prc-newmedia.com", true },
{ "prc.gov", true },
@@ -28333,6 +29488,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "preciseassemblies.com", true },
{ "precision.st", true },
{ "precisiondigital-llc.com", true },
+ { "precisionmachineservice.com", true },
+ { "precisionventures.com", true },
{ "precode.eu", true },
{ "predoiu.ro", true },
{ "prefix.eu", true },
@@ -28341,6 +29498,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "preigu.de", true },
{ "preis-alarm.info", true },
{ "preis-alarm.org", true },
+ { "preissler.co.uk", true },
+ { "preload.link", true },
{ "preloaded-hsts.badssl.com", true },
{ "prelogica.com.br", true },
{ "preludes.org", true },
@@ -28362,22 +29521,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "premiumcredit.am", true },
{ "premiumweb.co.id", true },
{ "premiumwebdesign.it", true },
+ { "premtech.nl", true },
{ "prenatalgeboortekaartjes.nl", true },
{ "prepaid-cards.xyz", true },
{ "prepaid-voip.nl", true },
{ "prepaidkredietkaart.be", true },
{ "prepare-job-hunting.com", true },
- { "preposted.com", true },
{ "presbee.com", true },
{ "presbvm.org", true },
+ { "presbyterian-colleges.com", true },
{ "prescotonline.co.uk", true },
{ "present-m.com", true },
- { "presentesdegrife.com.br", true },
+ { "presentationmedia.com", true },
+ { "preserveourhillcountry.org", true },
{ "president.bg", true },
+ { "presidio.gov", true },
{ "prespanok.sk", true },
{ "pressakey.com", true },
- { "pressakey.de", true },
{ "presscenter.jp", true },
+ { "presscuozzo.com", true },
{ "pressertech.com", true },
{ "presses.ch", true },
{ "presskr.com", true },
@@ -28385,7 +29547,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pressrush.com", true },
{ "pressup.it", true },
{ "pressureradio.com", true },
- { "prestburyscouts.org.uk", true },
{ "prestige-car-location.ch", true },
{ "prestige-portal.com", true },
{ "prestigebouncycastles.co.uk", true },
@@ -28397,29 +29558,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pretix.eu", true },
{ "pretrialservices.gov", true },
{ "pretty.hu", true },
+ { "prettygirlcheats.com", true },
{ "prettynode.com", true },
- { "pretwolk.nl", true },
{ "pretzelx.com", true },
{ "prevenir.ch", true },
{ "preview-it-now.com", true },
{ "priceremoval.net", true },
{ "pricesniffer.co", true },
{ "prideindomination.com", true },
+ { "pridetechdesign.com", false },
{ "prielwurmjaeger.de", true },
+ { "prihatno.my.id", true },
{ "primaconsulting.net", true },
{ "primalbase.com", true },
{ "primalinea.pro", true },
{ "primates.com", true },
- { "primewho.org", true },
+ { "primeequityproperties.com", true },
{ "primoloyalty.com", true },
{ "primorus.lt", true },
{ "primotilesandbathrooms.co.uk", false },
- { "princeagency.com", true },
+ { "princeofwhales.com", true },
+ { "princesparktouch.com", true },
{ "princessefoulard.com", true },
- { "principalsexam.com", true },
{ "principalship.net", true },
{ "principalstest.com", true },
- { "principalstest.ph", true },
{ "principalstest.review", true },
{ "principaltoolbox.com", true },
{ "principia-journal.de", true },
@@ -28430,6 +29592,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "printeknologies.com", true },
{ "printerleasing.be", true },
{ "printf.de", true },
+ { "printfn.com", false },
+ { "printler.com", true },
{ "printmet.com", true },
{ "printus.de", true },
{ "prior-it.be", true },
@@ -28459,6 +29623,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "prismacloud.xyz", true },
{ "pristal.eu", true },
{ "pristinegreenlandscaping.com", true },
+ { "pritalk.com", true },
{ "priv.im", true },
{ "privacy-week-vienna.at", true },
{ "privacy-week.at", true },
@@ -28483,14 +29648,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "privatepokertour.com", true },
{ "privatepropertymallorca.com", true },
{ "privatestatic.com", false },
+ { "privatevoid.net", true },
{ "privatewolke.com", true },
{ "privatfrei.de", true },
{ "privatpatient-krankenhaus.de", true },
- { "privcloud.cc", true },
{ "privea.fr", true },
{ "privelust.nl", true },
{ "priverify.com", true },
- { "privu.me", true },
{ "privy-staging.com", true },
{ "privy.com", true },
{ "prjktruby.com", false },
@@ -28498,15 +29662,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "prknje.com", true },
{ "prlved.co.uk", true },
{ "prnav.com", true },
+ { "pro-ben.sk", true },
{ "pro-bike.ro", true },
- { "pro-esb.com", true },
- { "pro-esb.net", true },
{ "pro-link.eu", true },
{ "pro-mile.pl", true },
{ "pro-netz.de", false },
{ "pro-taucher.com", true },
{ "pro-taucher.de", true },
{ "pro-wiert.pl", true },
+ { "proactivestructuresolutions.com", true },
{ "proadvanced.com", true },
{ "proautorepairs.com.au", true },
{ "probase.ph", true },
@@ -28514,14 +29678,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "probiv.biz", true },
{ "probiv.cc", true },
{ "procarservices.com", true },
- { "procens.us", true },
{ "procensus.com", true },
{ "procert.ch", true },
{ "processesinmotion.com", true },
{ "procharter.com", true },
{ "procinorte.net", true },
{ "proclib.org", true },
- { "procrastinatingengineer.co.uk", true },
+ { "proclubs.news", true },
{ "procrastinationland.com", true },
{ "procreditbank-kos.com", true },
{ "procreditbank.com.al", true },
@@ -28539,7 +29702,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "productboard.com", true },
{ "productdesignsoftware.com.au", true },
{ "production.vn", true },
- { "productived.net", false },
{ "productlondon.com", true },
{ "productoinnovador.com", true },
{ "productpeo.pl", true },
@@ -28547,12 +29709,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "produkttest-online.com", true },
{ "prodware.fr", true },
{ "prodware.nl", true },
+ { "proeflokaalbakker.nl", true },
{ "proefteksten.nl", false },
{ "proeftuinveenweiden.nl", true },
{ "proemployeeprotection.com", true },
{ "proemployeeprotection.net", true },
- { "proesb.com", true },
- { "proesb.net", true },
{ "prof.ch", true },
{ "profection.biz", true },
{ "professional.cleaning", true },
@@ -28569,12 +29730,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "progenda.be", true },
{ "progenitor.space", true },
{ "progeon.nl", true },
+ { "progettograjau.com", true },
{ "progg.no", true },
{ "proggersession.com", true },
{ "proggersession.de", true },
{ "progiscad.com", true },
{ "programistka.com", true },
{ "programlama.tk", true },
+ { "programmaticmagic.com", true },
{ "programsareproofs.com", true },
{ "programsupport300procent.com", true },
{ "progreso.pl", true },
@@ -28583,6 +29746,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "progressive.work", true },
{ "progressiveplanning.com", true },
{ "progressnet.nl", true },
+ { "progresswww.nl", true },
{ "prohrcloud.com", true },
{ "proimpact.it", true },
{ "project.supply", true },
@@ -28590,20 +29754,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "projectarmy.net", false },
{ "projectblackbook.us", true },
{ "projectborealisgitlab.site", false },
- { "projectcastle.tech", true },
{ "projectforge.org", true },
{ "projectlinuseasttn.org", true },
{ "projectnom.com", true },
+ { "projectsafechildhood.gov", true },
{ "projectsecretidentity.com", true },
{ "projectsecretidentity.org", true },
{ "projectunity.io", true },
{ "projektarbeit-projektplanung.de", true },
{ "projest.ch", true },
{ "projet-fly.ch", true },
- { "prok.pw", true },
{ "prolan.pw", true },
{ "prolearningcentre.com", true },
{ "prolinos.de", true },
+ { "promedyczny.pl", true },
{ "prometheanfire.net", true },
{ "prometheanfire.org", true },
{ "promisesaplus.com", true },
@@ -28612,6 +29776,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "promo-brille.de", true },
{ "promo-computers.nl", true },
{ "promo-matelas.com", true },
+ { "promods.cn", true },
{ "promods.net", true },
{ "promohulp.nl", true },
{ "promolover.com", true },
@@ -28630,9 +29795,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "proos.nl", true },
{ "proovn.com", true },
{ "propagandablog.de", true },
- { "propagandism.org", true },
{ "propagationtools.com", true },
{ "propepper.net", true },
+ { "properchels.com", true },
+ { "propermatches.com", true },
{ "properticons.com", true },
{ "property-catalogue.eu", true },
{ "propertygroup.pl", true },
@@ -28682,6 +29848,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "provence-appartements.com", true },
{ "providencecmc.com", true },
{ "providerlijst.com", true },
+ { "providerlijst.ml", true },
{ "providerlijst.nl", true },
{ "provision-isr.nl", true },
{ "provitec.com", true },
@@ -28698,7 +29865,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "proxybay.one", true },
{ "proxybay.tv", true },
{ "proxyportal.eu", true },
- { "proxyportal.net", true },
+ { "proyectafengshui.com", true },
+ { "proyecto13.com", true },
{ "prpferrara.it", true },
{ "prplz.io", true },
{ "prpr.cloud", true },
@@ -28714,7 +29882,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "prynhawn.net", true },
{ "prynhawn.org", true },
{ "pryspry.com", true },
- { "prytkov.com", true },
{ "przemas.pl", true },
{ "ps-provider.co.jp", true },
{ "ps-sale.ru", true },
@@ -28735,12 +29902,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "psdsuc.com", true },
{ "pself.net", true },
{ "pseta.ru", true },
+ { "psg-calw.de", true },
{ "psg.bg", true },
{ "pshostpk.com", true },
{ "psici.eu", true },
{ "psicoexpansao.com.br", true },
{ "psicologajanainapresotto.com.br", true },
{ "psicologasandrabernal.es", true },
+ { "psicologo-especialista-barcelona.com", true },
+ { "psicologo-infantil-barcelona.com", true },
{ "psicologoforensemadrid.com", true },
{ "psm.org.ph", true },
{ "psochecker.com", true },
@@ -28750,7 +29920,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pssgcsim.org", true },
{ "pst.moe", true },
{ "pste.pw", true },
- { "pstrozniak.com", true },
{ "psu.je", true },
{ "psw-consulting.de", true },
{ "psw-group.de", true },
@@ -28768,8 +29937,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "psychoco.net", false },
{ "psychologie-hofner.at", true },
{ "psychotherapie-kp.de", true },
+ { "psycolleges.com", true },
{ "psydix.org", true },
{ "psylab.cc", true },
+ { "psylab.re", true },
+ { "psylab.vip", true },
{ "psytrance-pro.com", true },
{ "pt-d.ru", true },
{ "pt-server.de", true },
@@ -28784,13 +29956,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pthsec.com", true },
{ "ptm.ro", false },
{ "ptmarquees.ie", true },
+ { "ptr.kr", true },
{ "ptrbrs.nl", true },
{ "ptrl.ws", true },
{ "ptron.org", true },
{ "pty.gg", true },
{ "puac.de", true },
+ { "pubclub.com", true },
{ "pubean.com", true },
- { "pubi.me", true },
{ "publanda.nl", true },
{ "public-g.de", true },
{ "public-projects.com", true },
@@ -28800,7 +29973,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "publicinquiry.eu", true },
{ "publicintegrity.org", true },
{ "publicintelligence.net", true },
- { "publick.net", true },
{ "publicrea.com", true },
{ "publicsuffix.org", true },
{ "publiq.space", true },
@@ -28819,6 +29991,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pulizieuffici.milano.it", true },
{ "pulpproject.org", true },
{ "pulser.stream", true },
+ { "pulsnitzer-lebkuchen-shop.de", true },
+ { "pulsnitzer-lebkuchen.shop", true },
+ { "pulsnitzer-pfefferkuchen-shop.de", true },
+ { "pulsnitzer-pfefferkuchen.shop", true },
{ "pumperszene.com", true },
{ "puneflowermall.com", true },
{ "punikonta.de", true },
@@ -28837,9 +30013,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "purefkh.xyz", true },
{ "pureitsolutionsllp.com", true },
{ "purelunch.co.uk", true },
+ { "pureluxemedical.com", true },
{ "purevapeofficial.com", true },
- { "purikore.com", true },
- { "purplebooth.co.uk", true },
+ { "purplebooth.co.uk", false },
{ "purplebricks.co.uk", true },
{ "purplebricks.com", true },
{ "purplebricks.com.au", true },
@@ -28854,6 +30030,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "purplestar.com", true },
{ "purplestar.mobi", true },
{ "purplewindows.net", true },
+ { "purplez.pw", true },
{ "purrfect-box.co.uk", true },
{ "purrfectboudoir.com", true },
{ "purrfectmembersclub.com", true },
@@ -28861,6 +30038,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "puryearlaw.com", true },
{ "pusatinkubatorbayi.com", true },
{ "pushers.com.mx", true },
+ { "pushoflove.com", true },
{ "pushphp.com", true },
{ "pushrax.com", true },
{ "pusichatka.ddns.net", true },
@@ -28885,6 +30063,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pvmotorco.com", true },
{ "pvpcraft.ca", true },
{ "pvpctutorials.de", true },
+ { "pvphs98.com", true },
{ "pvtschlag.com", true },
{ "pwaresume.com", true },
{ "pwdsafe.com", true },
@@ -28892,6 +30071,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pwnedpass.tk", true },
{ "pwnies.dk", true },
{ "pwolk.com", true },
+ { "pxl-mailtracker.com", true },
{ "pxl.cl", true },
{ "pxx.io", true },
{ "py-amf.org", true },
@@ -28906,6 +30086,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "pypi.org", true },
{ "pypi.python.org", true },
{ "pyramidsofchi.com", true },
+ { "pyrenees.io", true },
{ "pyrios.pro", true },
{ "pyrotechnologie.de", true },
{ "pysays.net", true },
@@ -28920,6 +30101,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "q-inn.nl", true },
{ "q-technologies.com.au", true },
{ "q123123.com", true },
+ { "q1q2q3.tk", true },
{ "q5118.com", true },
{ "qa-brandywineglobal.com", true },
{ "qa-team.xyz", true },
@@ -28930,10 +30112,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "qadmium.com", true },
{ "qambarraza.com", true },
{ "qapital.com", true },
- { "qaz.cloud", true },
- { "qbeing.info", true },
+ { "qaq.sh", true },
{ "qbiju.com.br", true },
- { "qbus.pl", true },
+ { "qbiltrade.com", true },
{ "qc.immo", true },
{ "qc.search.yahoo.com", false },
{ "qccareerschool.com", true },
@@ -28946,6 +30127,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "qcstudentcenter.com", true },
{ "qcstyleacademy.com", true },
{ "qctravelschool.com", true },
+ { "qdabogados.com", true },
{ "qdon.space", false },
{ "qedcon.org", false },
{ "qelectrotech.org", true },
@@ -28956,22 +30138,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "qgustavor.tk", true },
{ "qhse-professionals.nl", true },
{ "qianalysis.com", true },
+ { "qianmo.com", true },
{ "qianqiao.me", true },
{ "qiaohong.org", true },
{ "qicomidadeverdade.com.br", true },
{ "qifu.me", true },
{ "qiliang.wang", true },
{ "qingcao.org", true },
- { "qingpat.com", true },
{ "qingpei.me", true },
{ "qionouu.cn", true },
{ "qis.fr", true },
{ "qitarabutrans.com", true },
+ { "qiu521119.host", true },
{ "qiuri.org", true },
{ "qivonline.pt", true },
{ "qiwi.be", true },
{ "qixi.biz", true },
- { "qkka.org", true },
{ "qkmortgage.com", true },
{ "qldconservation.org.au", true },
{ "qldformulaford.org", true },
@@ -28988,8 +30170,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "qponverzum.hu", true },
{ "qq-navi.com", true },
{ "qq52o.me", true },
- { "qqrss.com", true },
- { "qqvips.com", true },
{ "qr-city.org", true },
{ "qr.cl", true },
{ "qrbird.com", true },
@@ -28999,10 +30179,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "qscloud.de", true },
{ "qtacairsoft.com", true },
{ "qtl.me", true },
+ { "qtmsheep.com", true },
{ "qtn.net", true },
- { "qto.com", true },
- { "qto.net", true },
- { "qto.org", true },
{ "qtpass.org", true },
{ "qtpower.co.uk", true },
{ "qtpower.net", true },
@@ -29010,7 +30188,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "qtvr.com", true },
{ "qtxh.net", true },
{ "quackerswaterproofing.com", true },
- { "quaedam.org", true },
{ "quaggan.co", true },
{ "quai10.org", false },
{ "qualite-ecole-et-formation.ch", true },
@@ -29032,13 +30209,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "quantumfurball.net", true },
{ "quantumpair.net", true },
{ "quantumwebs.co", true },
- { "quanyin.eu.org", true },
{ "quareal.ru", true },
{ "quarkdose.de", true },
{ "quarterfull.com", true },
{ "quartix.com", true },
{ "quartzclinical.com", true },
- { "quarus.net", true },
{ "quasarelectronics.co.uk", true },
{ "quasiproxy.com", true },
{ "quasseldroid.info", true },
@@ -29066,10 +30241,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "quentinchevre.ch", true },
{ "queo.com.co", true },
{ "quera.ir", true },
- { "querkommentar.de", true },
{ "query-massage.com", true },
{ "question.com", true },
- { "questionable.host", true },
+ { "questoj.cn", true },
{ "questsocial.it", true },
{ "quevisiongrafica.com", true },
{ "quic.stream", true },
@@ -29079,7 +30253,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "quietapple.org", true },
{ "quikchange.net", true },
{ "quikpay.com.au", true },
- { "quikrmovies.to", true },
{ "quilmo.com", true },
{ "quimatic.com.br", true },
{ "quinnlabs.com", true },
@@ -29091,10 +30264,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "quitarlasmanchasde.com", true },
{ "quitimes.com", true },
{ "quizogames.com", true },
+ { "quizstore.net", true },
{ "qul.link", true },
{ "quli.nl", false },
{ "qunzi.la", true },
{ "quocdesign.ch", true },
+ { "quote.gq", true },
{ "quoteidiot.com", true },
{ "quotev.com", true },
{ "quovadisaustria.com", true },
@@ -29109,7 +30284,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "qwant.fr", true },
{ "qwdqwd.de", true },
{ "qwe7002.com", true },
- { "qweepi.de", false },
{ "qwertee.com", true },
{ "qwerty.work", true },
{ "qwikdash.com", true },
@@ -29122,6 +30296,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "r-t-b.fr", true },
{ "r0t.co", true },
{ "r0uzic.net", true },
+ { "r1a.eu", true },
{ "r1ch.net", true },
{ "r2d2pc.com", true },
{ "r33.space", true },
@@ -29148,6 +30323,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rabynska.eu", true },
{ "raccoltarifiuti.com", true },
{ "racdek.com", true },
+ { "racdek.net", true },
{ "racdek.nl", true },
{ "racermaster.xyz", true },
{ "racesport.nl", false },
@@ -29167,29 +30343,32 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "racoo.net", true },
{ "racozo.com", true },
{ "racunovodstvo-prina.si", true },
+ { "rada-group.eu", true },
{ "radar.sx", true },
{ "radaravia.ru", true },
{ "radarnext.com", true },
{ "radartatska.se", true },
{ "radartek.com", true },
{ "radcube.hu", true },
+ { "radegundisfest.de", true },
{ "radfieldhomecare.co.uk", true },
{ "radfieldhomecarefranchising.co.uk", true },
- { "radical.org", true },
{ "radicaloptimism.org", true },
{ "radicalsub.com.br", true },
{ "radins.com", true },
{ "radio-pulsar.eu", true },
{ "radio-utopie.de", true },
{ "radio1.ie", true },
- { "radioafibra.com.br", true },
- { "radiocomsaocarlos.com.br", true },
+ { "radioactivenetwork.xyz", true },
+ { "radiofmimagen.net", true },
+ { "radioheteroglossia.com", true },
{ "radioilusion.es", true },
{ "radiom.fr", true },
{ "radiomodem.dk", true },
{ "radiomontebianco.it", true },
{ "radionicabg.com", true },
{ "radiopolarniki.spb.ru", true },
+ { "radior9.it", true },
{ "radiormi.com", true },
{ "radiorsvp.com", false },
{ "radiosendungen.com", true },
@@ -29202,9 +30381,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "radyn.com", true },
{ "raeu.me", true },
{ "raeven.nl", true },
+ { "raevinnd.com", true },
{ "rafaelmagalhaesweb.com", true },
{ "rafey.xyz", true },
{ "raffaellaosti.com", true },
+ { "rafleatherdesign.com", true },
{ "raft.pub", true },
{ "rafting-japan.com", true },
{ "ragasto.nl", true },
@@ -29213,13 +30394,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rage4.com", true },
{ "raghavdua.in", true },
{ "rahulpnath.com", true },
+ { "raidensnakesden.co.uk", true },
+ { "raidensnakesden.com", true },
+ { "raidensnakesden.net", true },
{ "raidstone.net", true },
{ "rail-o-rama.nl", true },
{ "rail24.nl", true },
{ "rail360.nl", true },
{ "railbird.nl", true },
{ "railgun.ac", true },
- { "railjob.cn", true },
+ { "railgun.com.cn", true },
{ "railorama.nl", true },
{ "railpassie.nl", true },
{ "railtoo.com", true },
@@ -29228,25 +30412,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "railvideo.nl", true },
{ "railwaytech.net", true },
{ "raimixmotoparts.com.br", true },
+ { "rain.bz", true },
{ "rainbowbay.org", true },
{ "rainbowinflatables.co.uk", true },
{ "rainbowstore.com.au", true },
{ "rainbowstore.com.ua", true },
+ { "rainel.at", true },
{ "rainforest.engineering", true },
+ { "rainiv.com", true },
{ "rainpaper.com", true },
{ "rainstormsinjuly.co", true },
{ "rainville.me", true },
{ "rainway.io", true },
- { "raipet.no-ip.biz", true },
{ "raiseyourflag.com", true },
{ "raissarobles.com", true },
{ "raito.win", true },
{ "rajivshah.co.uk", true },
+ { "rajkapoordas.com", true },
{ "rajyogarishikesh.com", true },
{ "rak-business-service.com", true },
+ { "raku.bzh", true },
{ "rakugokai.net", true },
{ "ralf-huebscher.de", true },
- { "ralfs-zusizone.de", true },
{ "ralimtek.com", false },
{ "rally-base.com", true },
{ "rally-base.cz", true },
@@ -29262,7 +30449,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ram-it.nl", true },
{ "ram.nl", true },
{ "rambo.codes", true },
- { "ramitmittal.com", true },
{ "rammstein-portugal.com", true },
{ "ramrecha.com", true },
{ "ramsor-gaming.de", true },
@@ -29274,6 +30460,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "randomdysfunctions.com", true },
{ "randomkoalafacts.com", true },
{ "randomprecision.co.uk", true },
+ { "randomquotesapp.com", true },
{ "ranfurlychambers.co.nz", true },
{ "rangde.org", true },
{ "rangercollege.edu", true },
@@ -29294,6 +30481,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "raphaeladdile.com", true },
{ "raphaelcasazza.ch", true },
{ "raphaelmoura.ddns.net", true },
+ { "raphaelschmid.eu", true },
{ "raphrfg.com", true },
{ "rapidapp.io", true },
{ "rapidhubs.com", true },
@@ -29310,11 +30498,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rascalscastles.co.uk", true },
{ "rascalscastlesdoncaster.co.uk", true },
{ "rasebo.ro", true },
- { "raspberryultradrops.com", true },
{ "raspii.tech", true },
+ { "raspitec.ddns.net", true },
{ "rasty.cz", true },
{ "ratd.net", true },
{ "ratebridge.com", true },
+ { "ratelsec.com", true },
{ "rathbonesonline.com", true },
{ "rathgeb.org", true },
{ "ratinq.co", true },
@@ -29322,12 +30511,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rationalism.com", true },
{ "rationalops.com", true },
{ "rattenkot.io", true },
+ { "rauchenwald.net", true },
{ "raulrivero.es", true },
{ "rault.io", true },
{ "raum4224.de", true },
{ "rauros.net", true },
{ "rautelow.de", true },
- { "rautermods.net", true },
{ "ravchat.com", true },
{ "raven.dog", true },
{ "ravenger.net", true },
@@ -29336,6 +30525,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ravindran.me", true },
{ "raviparekh.co.uk", true },
{ "ravis.org", true },
+ { "ravkr.duckdns.org", true },
{ "rawdutch.nl", true },
{ "rawinfosec.com", true },
{ "rawsec.net", true },
@@ -29343,6 +30533,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "raxion.tk", true },
{ "ray-home.de", true },
{ "ray-works.de", true },
+ { "rayan-it.ir", true },
+ { "rayanitco.com", true },
{ "rayiris.com", true },
{ "raykitchenware.com", true },
{ "raymcbride.com", true },
@@ -29350,14 +30542,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "raymii.org", true },
{ "raymondelooff.nl", true },
{ "raystark.com", true },
- { "raywin168.com", true },
- { "raywin168.net", true },
- { "raywin88.net", true },
{ "rayworks.de", true },
- { "razberry.kr", true },
{ "razeen.me", true },
{ "razeencheng.com", true },
{ "raziskovalec-resnice.com", true },
+ { "razvanburz.net", true },
{ "rbcservicehub-uat.azurewebsites.net", true },
{ "rbensch.com", true },
{ "rbflote.lv", true },
@@ -29365,8 +30554,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rbmafrica.co.za", true },
{ "rbnet.xyz", true },
{ "rbran.com", true },
- { "rbtvshitstorm.is", true },
{ "rburchell.com", true },
+ { "rbx-talk.xyz", true },
{ "rc-offi.net", true },
{ "rc-rp.com", true },
{ "rc-shop.ch", true },
@@ -29380,21 +30569,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rcmpsplib.com", true },
{ "rcmurphy.com", true },
{ "rcnitrotalk.com", true },
- { "rcorporation.be", true },
{ "rcraigmurphy.com", true },
- { "rcraigmurphy.net", true },
{ "rcsolutions.nl", true },
{ "rct.sk", true },
+ { "rct.uk", true },
{ "rctalk.com", true },
{ "rdfproject.it", true },
+ { "rdh.asia", true },
{ "rdjb2b.com", true },
{ "rdl.at", false },
+ { "rdmc.fr", true },
{ "rdmrotterdam.nl", true },
{ "rdmtaxservice.com", true },
{ "rdns.cc", true },
- { "rdplumbingsolutions.com.au", true },
{ "rdv-prefecture.com", true },
{ "rdwh.tech", true },
+ { "rdxsattamatka.mobi", true },
{ "re-curi.com", true },
{ "re-engines.com", true },
{ "reachhead.com", true },
@@ -29407,11 +30597,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "read.sc", true },
{ "reades.co.uk", true },
{ "readheadcopywriting.com", true },
- { "readify.com.au", true },
{ "readingandmath.org", true },
{ "readingrats.de", true },
+ { "readmusiccoleman.com", true },
{ "readonly.de", true },
{ "readouble.com", false },
+ { "reads.wang", true },
{ "readysell.net", true },
{ "readytobattle.net", true },
{ "readytongue.com", true },
@@ -29420,10 +30611,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "real-digital.co.uk", true },
{ "real-it.nl", true },
{ "realcapoeira.ru", true },
+ { "realestate-in-uruguay.com", true },
+ { "realestatecentralcoast.info", true },
+ { "realestatemarketingblog.org", true },
{ "realestateonehowell.com", true },
{ "realestateradioshow.com", true },
{ "realfreedom.city", true },
- { "realgarant-shop.de", false },
{ "realhorsegirls.net", true },
{ "realhypnosistraining.com.au", true },
{ "realitea.co.uk", true },
@@ -29431,12 +30624,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "reality0ne.com", false },
{ "realitycrazy.com", true },
{ "reallifeforums.com", true },
+ { "realloc.me", true },
{ "really-simple-plugins.com", true },
{ "really-simple-ssl.com", true },
{ "really.ai", true },
{ "reallytrusted.com", true },
{ "realme.govt.nz", true },
{ "realmofespionage.xyz", true },
+ { "realoteam.ddns.net", true },
+ { "realpropertyprofile.gov", true },
+ { "realtygroup-virginia.com", true },
+ { "realtyink.net", true },
{ "realum.com", true },
{ "realum.de", true },
{ "realum.eu", true },
@@ -29459,6 +30657,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "reboxetine.com", true },
{ "reboxonline.com", true },
{ "rebtoor.com", true },
+ { "recalls.gov", true },
{ "recantoshop.com", true },
{ "recantoshop.com.br", true },
{ "recapp.ch", true },
@@ -29466,8 +30665,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "receiliart.com", true },
{ "receptionpoint.com", true },
{ "receptionsbook.com", true },
- { "recepty.eu", false },
- { "recetasdecocinaideal.com", true },
+ { "recepty.eu", true },
+ { "recetin.com", true },
{ "rechenknaecht.de", true },
{ "rechtsanwaeltin-vollmer.de", true },
{ "rechtsanwalt-koeppen-feucht.de", true },
@@ -29484,10 +30683,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "recolic.net", true },
{ "recommended.reviews", true },
{ "recon-networks.com", true },
+ { "reconexion.life", true },
{ "recordeuropa.com", false },
{ "recoveringspirit.com", true },
{ "recoveryonline.org", true },
- { "recreation.gov", true },
{ "recreoviral.com", true },
{ "recruitmade.jp", true },
{ "rectecforum.com", true },
@@ -29525,11 +30724,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "reddingsbrigade-zwolle.nl", true },
{ "reddit2kindle.com", true },
{ "reddraggone9.com", true },
+ { "reddyai.com", true },
{ "rede-reim.de", true },
{ "rede-t.com", true },
{ "redelectrical.co.uk", true },
{ "redessantaluzia.com.br", true },
{ "redfox-infosec.de", true },
+ { "redfoxmarketiing.com", true },
{ "redgatesoftware.co.uk", true },
{ "redgoose.ca", true },
{ "redhandedsecurity.com.au", true },
@@ -29541,18 +30742,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "redirect.stg.fedoraproject.org", true },
{ "rediske.me", true },
{ "redit.com", true },
+ { "rediverge.com", true },
{ "redivis.com", true },
+ { "redleslie.com", true },
{ "redletter.link", true },
{ "redlinelap.com", true },
{ "redlink.de", true },
+ { "redmind.se", true },
{ "redmore.me", true },
{ "redneragenturen.org", true },
{ "rednsx.org", true },
+ { "redpact.com", true },
{ "redprice.by", true },
{ "redshield.co", true },
{ "redshiftlabs.com.au", true },
{ "redshoeswalking.net", true },
{ "redsicom.com", true },
+ { "redsquarelasvegas.com", true },
+ { "redsquirrelcampsite.co.uk", true },
{ "redstoner.com", true },
{ "redteam-pentesting.de", true },
{ "redwaterhost.com", true },
@@ -29562,14 +30769,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "redzurl.com", false },
{ "reed-sensor.com", true },
{ "reedloden.com", true },
+ { "reedyforkfarm.com", true },
{ "reegle.com", true },
- { "reepay.com", true },
{ "rees-carter.net", true },
{ "reesmichael1.com", true },
{ "reevaappliances.co.uk", true },
- { "reevoo.com", true },
- { "reevu.net", true },
{ "reezer.org", true },
+ { "refactor.zone", false },
{ "referdell.com", true },
{ "refficience.com", true },
{ "refill-roboter.de", true },
@@ -29581,8 +30787,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "reflexive-engineering.com", true },
{ "reflexive.xyz", true },
{ "refood-cascaiscpr.eu", true },
+ { "reforesttheplanet.com", true },
{ "refresh-media.nl", true },
{ "refreshliving.us", true },
+ { "refu.net", true },
{ "refuelcollective.com", true },
{ "refuelcreative.com.au", true },
{ "refundo.cz", true },
@@ -29594,6 +30802,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "reganclassics.com", true },
{ "reganparty.com", true },
{ "regar42.fr", false },
+ { "regeneo.cz", true },
{ "regenerapoint.it", true },
{ "regenerescence.com", true },
{ "regily.com", true },
@@ -29601,8 +30810,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "regime-maigrir-vite.com", true },
{ "regimebonheur.com", true },
{ "regimecellulite.com", true },
+ { "reginfo.gov", true },
{ "regiobeveland.nl", true },
{ "regionalbasementandcrawlspacerepair.com", true },
+ { "regionalgrowth.com", true },
{ "regiosalland.nl", true },
{ "regiovertrieb.de", false },
{ "regis.tech", true },
@@ -29614,7 +30825,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "registrarplus.net", true },
{ "registrarplus.nl", true },
{ "registryplus.net", true },
- { "registryplus.nl", true },
{ "regmyr.se", true },
{ "regnix.net", true },
{ "regnr.info", true },
@@ -29631,12 +30841,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rehabthailand.com", true },
{ "rehabthailand.org", true },
{ "reher.pro", true },
- { "rei.codes", true },
+ { "rei.ki", true },
{ "reichardt-home.goip.de", true },
{ "reichel-steinmetz.de", true },
{ "reichelt-cloud.de", true },
{ "reichl-online.net", true },
{ "reidasbombas.com", true },
+ { "reifr.net", true },
{ "reiki-france.fr", true },
{ "reilly.io", true },
{ "reimaginebelonging.de", true },
@@ -29649,9 +30860,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "reinencaressa.be", true },
{ "reinfer.io", true },
{ "reinhard.codes", true },
+ { "reinhardtsgermanautorepair.com", true },
{ "reinhardtsgrimma.de", true },
{ "reinierjonker.nl", true },
- { "reinoldus.ddns.net", true },
{ "reinout.nu", true },
{ "reinouthoornweg.nl", true },
{ "reinventetoi.com", false },
@@ -29665,6 +30876,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rejsehuskelisten.dk", true },
{ "rekisuta.com", true },
{ "reklamjog.hu", true },
+ { "rekonstrukcestatu.cz", true },
{ "rekorsanat.com.tr", true },
{ "rekyou.com", false },
{ "relates.link", true },
@@ -29676,25 +30888,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "release-monitoring.org", true },
{ "releasetimes.io", true },
{ "reliancebank.bank", true },
- { "reliant3sixty.com", true },
{ "relocatefeds.gov", false },
{ "relojeriajoyeria.com", true },
{ "relojes-online.com", true },
{ "relojesseiko.es", true },
+ { "relsak.cz", true },
{ "relvan.com", true },
{ "rem0te.net", true },
{ "remaimodern.org", true },
{ "remambo.jp", true },
+ { "remarketable.org", true },
{ "remax.at", true },
{ "remedi.tokyo", true },
{ "remedionaturales.com", true },
{ "remedioparaherpes.com", true },
{ "remedios-caserospara.com", true },
{ "remedioscaserosparalacistitis.com", true },
- { "remedioskaseros.com", false },
{ "remejeanne.com", true },
{ "rememberthemilk.com", false },
{ "remi-saurel.com", true },
+ { "remiafon.com", true },
{ "remilner.co.uk", true },
{ "remini.cz", true },
{ "remirampin.com", true },
@@ -29706,6 +30919,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "remoteutilities.com", true },
{ "removalcellulite.com", true },
{ "removedrepo.com", true },
+ { "remptmotors.com", true },
+ { "remrol.ru", true },
{ "remszeitung.de", true },
{ "renaissanceplasticsurgery.net", true },
{ "renascentia.asia", true },
@@ -29715,14 +30930,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rene-schwarz.com", true },
{ "rene-stolp.de", true },
{ "renearends.nl", true },
- { "reneclemens.nl", true },
{ "renedekoeijer.com", true },
{ "renedekoeijer.nl", true },
+ { "renee.today", true },
{ "reneleu.ch", true },
{ "renem.net", false },
{ "renemayrhofer.com", true },
{ "reneschmidt.de", true },
{ "renewablefreedom.org", true },
+ { "renewablemaine.org", true },
{ "renewed.technology", true },
{ "renewgsa.com", true },
{ "renewmedispa.com", true },
@@ -29735,7 +30951,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "renrenche.com", false },
{ "rens.nu", true },
{ "renscreations.com", true },
- { "rent-a-c.io", true },
{ "rent-a-coder.de", true },
{ "rentacaramerica.com", true },
{ "rentasweb.gob.ar", true },
@@ -29743,11 +30958,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rentinsingapore.com.sg", true },
{ "rentourhomeinprovence.com", true },
{ "renuo.ch", true },
+ { "renxinge.cn", false },
+ { "reo.gov", true },
{ "reorz.com", true },
{ "reox.at", false },
{ "repaik.com", true },
{ "repair.by", true },
{ "repaper.org", true },
+ { "reparo.pe", true },
{ "repaxan.com", true },
{ "repkord.com", true },
{ "replicaswiss.nl", true },
@@ -29770,11 +30988,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "repugnantconclusion.com", true },
{ "reputationweaver.com", true },
{ "reqrut.net", true },
- { "request-trent.com", true },
{ "requestr.co.uk", true },
{ "res-kc.com", true },
{ "resama.eu", true },
- { "resc.la", true },
{ "rescms-secure.com", true },
{ "research-panel.jp", true },
{ "research.facebook.com", false },
@@ -29787,8 +31003,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "residence-simoncelli.com", true },
{ "residentiallocksmithsanantoniotx.com", true },
{ "resine.roma.it", true },
+ { "resinflooringcompany.com", true },
{ "resist.ca", true },
{ "resistav.com", true },
+ { "resnickandnash.com", true },
{ "resolvefa.co.uk", true },
{ "resolvefa.com", true },
{ "resolving.com", true },
@@ -29798,15 +31016,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "resourceconnect.com", true },
{ "resourceguruapp.com", true },
{ "resources.flowfinity.com", true },
+ { "resourcesmanagementcorp.com", true },
{ "respectmyprivacy.eu", true },
{ "respectmyprivacy.net", true },
{ "respectmyprivacy.nl", true },
+ { "respecttheflame.com", true },
{ "respon.jp", true },
{ "responer.com", true },
+ { "responsepartner.com", true },
{ "responsibledisclosure.nl", false },
{ "responsive-shop.com", true },
{ "responsivepaper.com", true },
{ "respostas.com.br", true },
+ { "ressl.ch", true },
{ "restaurant-oregano.de", true },
{ "restaurant-rosengarten.at", true },
{ "restaurantguru.com", true },
@@ -29827,10 +31049,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "retetenoi.net", true },
{ "retetop95.it", true },
{ "reticon.de", true },
- { "retireyourpassword.org", true },
{ "retmig.dk", true },
{ "reto.ch", true },
{ "reto.com", true },
+ { "reto.io", true },
{ "retokromer.ch", true },
{ "retractableawningssydney.com.au", true },
{ "retro.rocks", true },
@@ -29864,14 +31086,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "reverseloansolutions.com", true },
{ "reversesouthafrica.com", true },
{ "review.jp", true },
+ { "reviewmed-215418.appspot.com", true },
{ "reviewninja.net", true },
{ "reviews.anime.my", false },
- { "reviewspedia.org", true },
{ "revirt.global", true },
{ "revision.co.zw", true },
{ "revisionnotes.xyz", true },
{ "revisit.date", true },
{ "revivalinhisword.com", true },
+ { "revivalprayerfellowship.com", true },
{ "revivingtheredeemed.org", true },
{ "revlect.com", true },
{ "revolt.tv", true },
@@ -29888,12 +31111,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rezosup.net", true },
{ "rezosup.org", true },
{ "rezultant.ru", true },
- { "rfeif.org", true },
{ "rftoon.com", true },
+ { "rfxanalyst.com", true },
{ "rga.sh", true },
{ "rgavmf.ru", true },
{ "rgbinnovation.com", true },
{ "rgcomportement.fr", true },
+ { "rgraph.net", true },
{ "rhaegal.me", true },
{ "rhd-instruments.com", true },
{ "rhd-instruments.de", true },
@@ -29903,7 +31127,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rheinturm.nrw", true },
{ "rheocube.com", true },
{ "rhese.net", true },
- { "rhetthenckel.com", true },
+ { "rhetorical.ml", true },
{ "rheuma-online.de", true },
{ "rhevelo.com", true },
{ "rhinelander.ca", true },
@@ -29912,6 +31136,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rhodenmanorcattery.co.uk", true },
{ "rhodri.io", true },
{ "rhowell.io", true },
+ { "rhumblineadvisers.com", true },
{ "rhymeswithmogul.com", true },
{ "rhymix.org", true },
{ "rhynl.io", true },
@@ -29921,17 +31146,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ribs.com", true },
{ "ricardo.nu", true },
{ "ricardobalk.nl", true },
+ { "ricardopq.com", true },
{ "ricaud.me", true },
{ "riccardopiccioni.it", true },
{ "riccy.org", true },
{ "riceadvice.info", true },
{ "richadams.me", true },
{ "richardbloomfield.blog", true },
- { "richardcrosby.co.uk", true },
+ { "richardfeinbergdds.com", true },
{ "richardharpur.com", true },
{ "richardhering.de", true },
{ "richardjgreen.net", true },
{ "richardlangworth.com", true },
+ { "richardlevinmd.com", true },
{ "richardlugten.nl", true },
{ "richardramos.me", true },
{ "richardrblocker.net", true },
@@ -29940,6 +31167,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "richardson.pictures", true },
{ "richardson.software", true },
{ "richardson.systems", true },
+ { "richardstonerealestate.com", true },
{ "richardwarrender.com", true },
{ "richie.fi", true },
{ "richonrails.com", true },
@@ -29947,9 +31175,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ricki-z.com", true },
{ "rickrongen.nl", true },
{ "rickscastles.co.uk", true },
+ { "ricksfamilycarpetcleaning.com", true },
{ "rickvanderzwet.nl", true },
{ "rickweijers.nl", true },
- { "ricky.capital", false },
{ "rickyromero.com", true },
{ "rico.ovh", true },
{ "ricobaldegger.ch", true },
@@ -29958,6 +31186,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ricozienke.de", true },
{ "riddims.co", true },
{ "ride-up.com", true },
+ { "rideways.com", true },
{ "rideyourdamn.bike", true },
{ "ridgelandchurch.org", true },
{ "ridingboutique.de", true },
@@ -29981,7 +31210,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rigolitch.fr", true },
{ "rigsalesaustralia.com", true },
{ "rijk-catering.nl", false },
- { "rijsinkunst.nl", true },
+ { "rijsinkunst.nl", false },
{ "rik.onl", true },
{ "riku.pw", true },
{ "rile5.com", true },
@@ -29997,11 +31226,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ringjewellery.co.uk", true },
{ "rinvex.com", true },
{ "rio-weimar.de", true },
+ { "rioshop.com.br", true },
{ "rioxmarketing.com", true },
{ "rip-sport.cz", true },
{ "ripaton.fr", true },
+ { "ripcorddesign.com", true },
+ { "ripcordsandbox.com", true },
{ "ripmixmake.org", true },
- { "ripple.com", true },
{ "riqy86.nl", true },
{ "ris.fi", true },
{ "risada.nl", true },
@@ -30019,7 +31250,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ristorantefattoamano.it", true },
{ "ristoviitanen.fi", true },
{ "ristrutturazioneappartamento.roma.it", true },
- { "rit.space", true },
+ { "rit.space", false },
{ "rittau.biz", true },
{ "rittau.org", true },
{ "ritzlux.com.tw", true },
@@ -30027,8 +31258,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rivalsa.cn", true },
{ "rivastation.de", true },
{ "riverbanktearooms.co.uk", true },
+ { "riverbed.com", false },
+ { "riverbendroofingnd.com", true },
{ "riverford.co.uk", true },
{ "rivermist.com.au", true },
+ { "riverridgecc.com", true },
{ "riversidebaptistchurch.net", true },
{ "riversideradio.nl", true },
{ "riverviewcourtapts.com", true },
@@ -30037,6 +31271,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rivoflor.it", true },
{ "rivus.net", true },
{ "rivy.org", true },
+ { "riwick.com", false },
{ "rix.ninja", true },
{ "rixter.com", true },
{ "rixzz.ovh", true },
@@ -30045,7 +31280,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rizospastis.gr", true },
{ "rj-onderneemt.nl", true },
{ "rkfp.cz", true },
- { "rkkhok.hu", true },
+ { "rkkhok.hu", false },
{ "rkmns.edu.in", true },
{ "rlalique.com", true },
{ "rld.org", true },
@@ -30064,7 +31299,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rmpsolution.de", true },
{ "rmrig.org", true },
{ "rms.sexy", true },
- { "rmstudio.tw", true },
{ "rmsupply.nl", true },
{ "rn29.me", true },
{ "rnag.ie", true },
@@ -30081,6 +31315,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rob006.net", true },
{ "robandjanine.com", true },
{ "robbertt.com", false },
+ { "robbiecrash.me", true },
{ "robdavidson.network", true },
{ "robert-flynn.de", true },
{ "robertattfield.com", true },
@@ -30098,9 +31333,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "robertopazeller.ch", true },
{ "robertreiser.photography", true },
{ "robertrijnders.nl", true },
- { "robertsmits.be", true },
+ { "robertsmits.be", false },
{ "robhorstmanshof.nl", true },
{ "robicue.com", true },
+ { "robigalia.org", false },
{ "robin-novotny.com", true },
{ "robin.co.kr", true },
{ "robin.info", true },
@@ -30113,6 +31349,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "robinsonyu.com", true },
{ "robinwill.de", true },
{ "robinwinslow.uk", true },
+ { "robjager-fotografie.nl", true },
{ "robocop.no", true },
{ "robodeidentidad.gov", true },
{ "robohash.org", true },
@@ -30122,6 +31359,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "robotattack.org", true },
{ "roboth.am", true },
{ "robotham.org", true },
+ { "robotics.plus", true },
{ "robotkvarnen.se", true },
{ "robototes.com", true },
{ "robots-ju.ch", true },
@@ -30133,15 +31371,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "robu.in", true },
{ "robud.info", true },
{ "robustac.com", true },
- { "roc.net.au", true },
{ "rochakhand-knitcraft.com.np", true },
+ { "rocis.gov", true },
{ "rocka.me", true },
{ "rockagogo.com", true },
{ "rockbankland.com.au", true },
{ "rockcanyonbank.com", true },
{ "rockcellar.ch", true },
{ "rockenfuerlachenhelfen.de", true },
- { "rocket-wars.de", true },
{ "rocketevents.com.au", true },
{ "rocketgnomes.com", true },
{ "rocketr.net", true },
@@ -30164,25 +31401,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rodevlaggen.nl", true },
{ "rodichi.net", true },
{ "rodinnebyvanie.eu", true },
- { "rodinneodpoledne2018.cz", true },
{ "rodolfo.gs", true },
+ { "rodomonte.org", true },
{ "rodrigocarvalho.blog.br", true },
- { "rody-design.com", true },
{ "rodzina-kupiec.eu.org", true },
{ "roeckx.be", true },
{ "roeitijd.nl", false },
{ "roeldevries.me", true },
{ "roeleveld.nl", true },
{ "roelhollander.eu", true },
+ { "roeljoyas.com", true },
{ "roelof.io", true },
{ "roelsworld.eu", true },
{ "roemhild.de", true },
{ "roerstaafjes.nl", true },
- { "roffe.nu", true },
{ "rofl.com.ua", true },
{ "roflcopter.fr", true },
{ "rogagym.com", true },
- { "roger101.com", true },
{ "rogerhub.com", true },
{ "rogerriendeau.ca", true },
{ "rogersaam.ch", true },
@@ -30198,16 +31433,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "roguetechhub.org", true },
{ "rohedaten.de", true },
{ "rohitagr.com", true },
- { "rohlik.cz", true },
{ "rointe.online", true },
- { "roiscroll.com", true },
{ "roka9.de", true },
+ { "rokass.nl", true },
{ "rokki.ch", true },
{ "rokort.dk", true },
{ "roksolana.be", true },
{ "rokudenashi.de", true },
{ "roland.io", true },
{ "rolandinsh.com", true },
+ { "rolandlips.com", true },
+ { "rolandlips.nl", true },
+ { "rolandreed.cn", true },
{ "rolandszabo.com", true },
{ "roleplayhome.com", true },
{ "roligprylar.se", true },
@@ -30219,6 +31456,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rolodato.com", true },
{ "roma-servizi.it", true },
{ "romab.com", true },
+ { "romain-arias.fr", true },
+ { "romainmuller.xyz", false },
{ "roman-pavlik.cz", true },
{ "romancloud.com", true },
{ "romande-entretien.ch", true },
@@ -30226,18 +31465,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "romano.guru", true },
{ "romanpavlodar.kz", true },
{ "romantelychko.com", true },
- { "romantic-quotes.co.uk", true },
{ "romantica-hotel.de", true },
{ "romanticfirstdance.com", true },
{ "romanticschemer.com", true },
{ "romanticsexshopguatemala.com", true },
{ "romanticvillas.com.au", false },
{ "romapa.com", true },
+ { "romar-bos.nl", true },
{ "romarin.es", true },
{ "romaservicegroup.it", true },
{ "romatrip.it", true },
{ "rome.dating", true },
{ "rommelwood.de", true },
+ { "romun.net", true },
+ { "romy.tw", true },
{ "ronanrbr.com", true },
{ "rondommen.nl", true },
{ "rondouin.fr", true },
@@ -30252,9 +31493,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "roodfruit.studio", true },
{ "roodhealth.co.uk", true },
{ "roof.ai", false },
+ { "roofingandconstructionllc.com", true },
{ "roofingomaha.com", true },
{ "roofsandbasements.com", true },
{ "rook-playz.net", true },
+ { "rookvrij.nl", true },
{ "room-composite.com", true },
{ "room208.org", true },
{ "room2d.com", true },
@@ -30265,7 +31508,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rooneytours.nl", true },
{ "roopakv.com", true },
{ "roosabels.nl", false },
- { "roosterpgplus.nl", true },
{ "root-space.eu", true },
{ "root.bg", true },
{ "root.cz", true },
@@ -30283,10 +31525,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rootsbar.fr", true },
{ "rootscope.co.uk", false },
{ "rootspersona.com", true },
- { "rootswitch.com", true },
+ { "rootswitch.com", false },
{ "rootusers.com", true },
{ "ropd.info", true },
- { "roromendut.online", true },
{ "roryneville.com", true },
{ "rosabellas.co.uk", true },
{ "rosalindturner.co.uk", true },
@@ -30303,10 +31544,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rosenkeller.org", true },
{ "roseofyork.com", true },
{ "roseofyorkbooking.com", true },
+ { "roseon.net", true },
{ "roseparkhouse.com", true },
{ "rosesciences.com", true },
- { "rosetiger.life", true },
- { "rosevillefacialplasticsurgery.com", true },
+ { "roshhashanahfun.com", true },
+ { "roshiya.co.in", true },
{ "roslynpad.net", true },
{ "rosnertexte.at", true },
{ "rosset.me", true },
@@ -30321,6 +31563,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rothe.io", true },
{ "rothkranz.net", true },
{ "rothnater.ch", true },
+ { "rothwellgornthomes.com", true },
{ "rotkreuzshop.de", true },
{ "rotol.me", true },
{ "rottweil-hilft.de", true },
@@ -30334,6 +31577,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "roundcube.mayfirst.org", false },
{ "roundrock-locksmith.com", true },
{ "roundtablekzn.co.za", true },
+ { "roundtoprealestate.com", true },
{ "roussos.cc", true },
{ "rout0r.org", true },
{ "route-wird-berechnet.de", true },
@@ -30405,6 +31649,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "royalbluewa3.cc", true },
{ "royalcitytaxi.ca", true },
{ "royalcitytaxi.com", true },
+ { "royalfoxrealtor.com", true },
{ "royalmarinesassociation.org.uk", true },
{ "royalnissanparts.com", true },
{ "royalpalacenogent.fr", true },
@@ -30413,6 +31658,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "royalyule.com", true },
{ "royceandsteph.com", true },
{ "roycewilliams.net", true },
+ { "rozalynne-dawn.ga", true },
{ "rozhodce.cz", true },
{ "rpadovani.com", true },
{ "rpauto.ru", true },
@@ -30429,8 +31675,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rraesthetics.com", true },
{ "rrdesignsuisse.com", true },
{ "rrg-partner.ch", true },
- { "rring.me", true },
- { "rritv.com", true },
{ "rro.rs", true },
{ "rrudnik.com", true },
{ "rrwolfe.com", true },
@@ -30443,8 +31687,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rsl.gd", true },
{ "rsm-intern.de", true },
{ "rsm-liga.de", true },
- { "rsmith.io", true },
- { "rsmmail.com", true },
+ { "rsmith.io", false },
{ "rsp-blogs.de", true },
{ "rsridentassist.com", true },
{ "rss.sh", false },
@@ -30460,6 +31703,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rte2fm.ie", true },
{ "rteaertel.ie", true },
{ "rtechservices.io", true },
+ { "rteguide.ie", true },
{ "rteinternational.ie", true },
{ "rtejr.ie", true },
{ "rtek.se", false },
@@ -30467,10 +31711,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rteone.ie", true },
{ "rteplayer.com", true },
{ "rtesport.eu", true },
+ { "rteworld.com", true },
{ "rtfpessoa.xyz", true },
+ { "rths.tk", true },
+ { "rthsoftware.cn", true },
+ { "rthsoftware.net", true },
{ "rtrappman.com", true },
{ "rtrinflatables.co.uk", true },
{ "rtsr.ch", true },
+ { "rttss.com", true },
{ "rttvvip.com", true },
{ "rtwcourse.com", true },
{ "rtzoeller.com", true },
@@ -30499,8 +31748,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ruconsole.com", true },
{ "rud.is", true },
{ "rudd-o.com", true },
- { "rudelune.fr", true },
- { "ruderverein-gelsenkirchen.de", true },
{ "rudewiki.com", true },
{ "rudhaulidirectory.com", true },
{ "rudloff.pro", true },
@@ -30527,10 +31774,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ruiming.me", true },
{ "ruin.one", true },
{ "ruiruigeblog.com", true },
+ { "ruitersportbak.nl", true },
{ "ruk.ca", true },
{ "rulu.co", true },
{ "rulu.tv", true },
{ "rulutv.com", true },
+ { "rumartinez.es", true },
+ { "rumlager.de", true },
{ "rummage4property.co.uk", true },
{ "rumplesinflatables.co.uk", true },
{ "rumtaste.com", true },
@@ -30538,15 +31788,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "run-it-direct.co.uk", true },
{ "runagain.ch", true },
{ "runebet.com", true },
- { "runefake.com", true },
{ "runementors.com", false },
{ "runklesecurity.com", true },
{ "runnergrapher.com", true },
{ "runreport.fr", true },
{ "runschrauger.com", true },
+ { "runtimepanic.xyz", true },
{ "runvs.io", true },
- { "ruobiyi.com", true },
{ "ruobr.ru", true },
+ { "rupeevest.com", true },
{ "ruquay.com", true },
{ "ruralink.com.ar", true },
{ "ruralsuppliesdirect.co.uk", true },
@@ -30571,12 +31821,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rust.mn", true },
{ "rustable.com", true },
{ "rustikalwallis.ch", true },
- { "rustralasia.net", true },
{ "rustyrambles.com", true },
- { "rusxakep.com", true },
{ "rutgerschimmel.nl", true },
{ "ruthmontenegro.com", true },
{ "rutiger.com", true },
+ { "rutika.ru", true },
{ "ruudkoot.nl", true },
{ "ruwhof.net", true },
{ "ruya.com", true },
@@ -30596,9 +31845,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "rws-vertriebsportal.de", true },
{ "rwx.ovh", true },
{ "rx-contact.com", false },
- { "rxbn.de", true },
{ "rxbusiness.com", true },
{ "rxcheck.com", true },
+ { "rxgroup.io", true },
{ "rxight.com", true },
{ "ryan-design.com", true },
{ "ryan-gehring.com", true },
@@ -30608,13 +31857,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ryanhowell.io", false },
{ "ryankearney.com", false },
{ "ryanmcdonough.co.uk", false },
- { "ryanroberts.co.uk", true },
{ "ryansmithphotography.com", true },
{ "ryazan-region.ru", true },
{ "rychlikoderi.cz", true },
{ "rydermais.tk", true },
{ "rynekpierwotny.pl", true },
{ "ryois.me", true },
+ { "rys.pw", true },
+ { "ryssl.com", true },
{ "ryu22e.org", true },
{ "ryuu.es", true },
{ "ryyule.com", true },
@@ -30642,7 +31892,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "s404.de", true },
{ "s44.eu", true },
{ "s4db.net", true },
- { "s4media.org", true },
{ "s4q.me", true },
{ "s4tips.com", true },
{ "s4ur0n.com", true },
@@ -30656,6 +31905,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "saabpartsdistribution.com", true },
{ "saamhorigheidsfonds.nl", false },
{ "saastopankki.fi", true },
+ { "saba-piserver.info", true },
{ "sabahattin-gucukoglu.com", true },
{ "sabe.cz", true },
{ "sabine-forschbach.de", true },
@@ -30665,6 +31915,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sabtunes.com", true },
{ "sacaentradas.com", true },
{ "saccani.net", true },
+ { "sacharidovejednotky.eu", true },
{ "sackers.com", true },
{ "sackmesser.ch", true },
{ "saclier.at", true },
@@ -30674,19 +31925,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sadbox.es", true },
{ "sadbox.org", true },
{ "sadbox.xyz", true },
+ { "sadeghian.us", true },
{ "sadev.co.za", true },
+ { "sadhana.cz", true },
{ "sadhawkict.org", true },
{ "sadmansh.com", true },
{ "sadou.kyoto.jp", true },
- { "sadsu.com", true },
- { "saenforcement.agency", true },
+ { "saechsischer-christstollen.shop", true },
{ "saengsook.com", true },
{ "saengsuk.com", true },
{ "safar.sk", true },
{ "safaritenten.nl", true },
{ "safcstore.com", true },
{ "safe.moe", true },
- { "safe.space", true },
{ "safebaseflorida.com", true },
{ "safebasements.com", true },
{ "safebasementsnorthdakota.com", true },
@@ -30705,17 +31956,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "safeocs.gov", true },
{ "safer-networking.org", true },
{ "saferpost.com", true },
+ { "saferproduct.gov", true },
+ { "saferproducts.gov", true },
+ { "safersurfing.eu", false },
{ "safescan.com", true },
{ "safestore.io", true },
{ "safetext.me", true },
{ "safetycloud.me", true },
{ "safetynames.com", true },
+ { "safetyrisk.net", true },
+ { "safetyworkkits.co.nz", true },
{ "safeui.com", true },
- { "safezone.cc", false },
{ "safire.ac.za", true },
{ "sagargandecha.com.au", true },
{ "sagedocumentmanager.com", true },
- { "sagemontchurch.org", true },
{ "sagerus.com", true },
{ "saggiocc.com", true },
{ "sagracefarms.com", true },
@@ -30728,6 +31982,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sahkotyot.eu", true },
{ "said.id", true },
{ "said.my.id", true },
+ { "saidtezel.com", true },
{ "saier.me", true },
{ "saifoundation.in", true },
{ "saigonflowers.com", true },
@@ -30742,10 +31997,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "saimoe.org", true },
{ "sainetworks.net", true },
{ "saint-bernard-gouesch.fr", true },
+ { "saint-cyril.com", true },
{ "saintaardvarkthecarpeted.com", true },
{ "saintanthonyscorner.com", true },
{ "sainth.de", true },
- { "saintmichelqud.com", true },
+ { "sainthelena-centersquare.net", true },
+ { "sainthelenas.org", true },
+ { "saintisidorecyo.com", true },
+ { "saintjamestheapostle.org", true },
+ { "saintjohn-bocaraton.com", true },
+ { "saintjosephschurch.net", true },
+ { "saintmarkchurch.net", true },
+ { "saintmaryna.com", true },
+ { "saintpatrick-norristown.net", true },
+ { "saintpeterchurch.net", true },
+ { "saintphilipneri.org", true },
+ { "saintpius.net", true },
+ { "saintpolycarp.org", true },
{ "saintsrobotics.com", true },
{ "saipariwar.com", true },
{ "saiputra.com", true },
@@ -30756,13 +32024,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sakaki.anime.my", false },
{ "sakostacloud.de", true },
{ "sakura-paris.org", true },
+ { "sakura.zone", true },
+ { "sakuracdn.com", true },
{ "sakuracommunity.com", true },
{ "sakuraflores.com.br", true },
{ "sakuraplay.com", true },
{ "salamon-it.de", false },
+ { "salandalairconditioning.com", true },
{ "salde.net", true },
{ "sale4ru.ru", true },
- { "saleaks.org", true },
{ "saleduck.at", true },
{ "saleduck.ch", true },
{ "saleduck.co.id", true },
@@ -30779,7 +32049,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "salesflare.com", true },
{ "salesmachine.io", true },
{ "salexy.kz", true },
- { "salixcode.com", true },
+ { "salidaswap.com", true },
{ "salland1.nl", true },
{ "salle-quali.fr", true },
{ "sallydowns.name", true },
@@ -30795,21 +32065,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "salonasymetria.com", true },
{ "salonasymetria.pl", true },
{ "salonsantebienetre.ch", true },
+ { "salrosadohimalaia.com", true },
{ "salsa-straubing.de", true },
{ "saltbythesea.com", true },
{ "saltercane.com", false },
{ "saltireconservation.com", true },
- { "saltro.nl", true },
{ "saltstack.cz", true },
{ "salud.top", false },
+ { "saludmas.site", true },
{ "saludsexualmasculina.org", true },
{ "saludsis.mil.co", true },
+ { "saludyvida.site", true },
{ "salutethefish.com", true },
{ "salutethegrains.com", true },
{ "salutethepig.com", true },
{ "salva.re", true },
{ "salvagedfurnitureparlour.com", true },
{ "sam-football.fr", true },
+ { "samanacafe.com", true },
{ "samanthasgeckos.com", true },
{ "samanthasicecream.com", true },
{ "samappleton.com", true },
@@ -30823,25 +32096,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "samdev.io", true },
{ "samegoal.com", true },
{ "samegoal.org", true },
- { "samel.de", true },
{ "samenwerkingsportaal.nl", true },
+ { "samenwerkingsportaal.tk", true },
{ "sameworks.com", true },
{ "samgrayson.me", true },
{ "samhuri.net", true },
{ "samifar.in", true },
- { "samin.tk", true },
{ "samip.fi", true },
{ "samitechnic.com", true },
{ "samizdat.cz", true },
{ "samkelleher.com", true },
{ "saml-gateway.org", true },
+ { "samlaw.co.nz", true },
{ "samlivogarv.dk", true },
{ "sammamish--locksmith.com", true },
- { "sammenlignakasser.dk", true },
{ "sammyservers.com", true },
{ "sammyservers.net", true },
+ { "sammyslimos.com", true },
{ "samnya.cn", true },
{ "samrobertson.co.uk", true },
+ { "samshouseofspaghetti.net", true },
{ "samsungmobile.it", true },
{ "samsungphonegenerator.xyz", true },
{ "samtalen.nl", true },
@@ -30850,6 +32124,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "samuellaulhau.fr", true },
{ "samui-samui.de", false },
{ "samuirehabcenter.com", true },
+ { "samvanderkris.xyz", true },
{ "samwilberforce.com", true },
{ "samwrigley.co.uk", true },
{ "samwu.tw", true },
@@ -30859,18 +32134,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sana-store.com", true },
{ "sana-store.cz", true },
{ "sana-store.sk", true },
- { "sanalbayrak.com", true },
{ "sanantoniolocksmithinc.com", true },
{ "sanantoniolocksmithtx.com", true },
{ "sanasport.cz", true },
{ "sanasport.sk", true },
{ "sanatorii-sverdlovskoy-oblasti.ru", true },
- { "sanatorionosti.com.ar", true },
- { "sanchez.adv.br", true },
+ { "sanbornteam.com", true },
+ { "sand-islets.de", true },
{ "sandalj.com", true },
{ "sandbagexpress.com", true },
{ "sandbox.mydigipass.com", false },
+ { "sandboxfp.com", true },
{ "sandburner.net", true },
+ { "sander.sh", true },
{ "sanderdorigo.nl", true },
{ "sanderkoenders.eu", true },
{ "sanderkoenders.nl", true },
@@ -30887,11 +32163,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sandraindenfotografie.nl", true },
{ "sandrocorapi.com", true },
{ "sandrolittke.de", true },
+ { "sandrproperty.com", true },
{ "sandtears.com", true },
{ "sandtonescorts.com", true },
{ "sandtonplumber24-7.co.za", true },
{ "sandtonvipcompanions.com", true },
{ "sandyrobsonhypnotherapy.co.uk", true },
+ { "sanemind.de", true },
{ "sanemind.eu", true },
{ "sanepsychologen.nl", true },
{ "sanex.ca", false },
@@ -30904,7 +32182,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sanitairwinkel.com", true },
{ "sanitairwinkel.nl", true },
{ "sanitrak.cz", true },
- { "sanmuding.com", true },
{ "sannesfotklinikk.no", true },
{ "sanpham-balea.org", true },
{ "sanskritiyoga.com", true },
@@ -30915,13 +32192,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "santenatureetcie.com", true },
{ "santensautomatics.be", true },
{ "santevie.ch", true },
+ { "santiagogarza.co", true },
{ "santojuken.co.jp", true },
{ "santoshpandit.com", true },
{ "sanvitolocapobus.com", true },
- { "saol.eu", true },
{ "saoneth.pl", true },
{ "saorsat.com", true },
{ "saorsat.ie", true },
+ { "saorsat.tv", true },
{ "saorview.com", true },
{ "saorviewconnect.ie", true },
{ "saorviewconnected.ie", true },
@@ -30932,11 +32210,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sapindus.pl", true },
{ "sapk.fr", true },
{ "saposute-s.jp", true },
- { "sapphireblue.me", true },
{ "sapphirepearl.com.sg", true },
{ "sapprendre.ch", true },
{ "saprima.de", true },
{ "sarahbeckettharpist.com", true },
+ { "sarahboydrealty.com", true },
+ { "sarahcorliss.com", true },
{ "sarahlicity.co.uk", true },
{ "sarahlicity.me.uk", true },
{ "sarahplusdrei.de", true },
@@ -30944,14 +32223,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sarahwikeley.co.uk", true },
{ "saraleebread.com", false },
{ "sarariman.com", true },
+ { "sarasotaroboticurology.com", true },
{ "sarasturdivant.com", true },
{ "sardegnatirocini.it", true },
{ "sarink.eu", true },
+ { "sarkisianbuilders.com", true },
+ { "saro.me", true },
{ "saronno5stelle.it", true },
{ "sarpsb.org", true },
{ "sarumtechnologies.com", true },
{ "sas-snowboarding.sk", true },
- { "sasanika.org", true },
{ "sascha.io", true },
{ "sascha.is", true },
{ "saschaeggenberger.ch", true },
@@ -30962,9 +32243,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sasioglu.co.uk", true },
{ "saskpension.com", true },
{ "sasrobotics.xyz", true },
+ { "sassandbelle.co.uk", true },
+ { "sassandbelle.com", true },
+ { "sassandbelletrade.co.uk", true },
+ { "sassandbelletrade.com", true },
{ "sastd.com", true },
{ "sasyabapi.com", true },
{ "sat4all.com", true },
+ { "sat7a-riyadh.com", false },
{ "satai.dk", true },
{ "satal.in", true },
{ "satellites.hopto.me", true },
@@ -30976,12 +32262,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sattamatkachart.in", true },
{ "sattamatkamobi.mobi", true },
{ "saturn.pl", true },
+ { "saturngames.co.uk", true },
{ "satyanarayana.xyz", true },
{ "saudavel.com.vc", true },
{ "saudeealimentos.com", true },
{ "saudeintimadamulher.com.br", true },
{ "saudenoclique.com.br", true },
+ { "saudiarabiaevisa.co.uk", true },
{ "sauer-systems.net", true },
+ { "sauerbrey.eu", true },
{ "sauerland-schnittgruen.de", true },
{ "saulchristie.com", true },
{ "saumon-de-france.com", true },
@@ -30991,20 +32280,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "saunahats.eu", true },
{ "saunas.fr", true },
{ "saunatime.jp", true },
- { "saurel.me", true },
{ "sauvagebridge.nl", true },
+ { "savaari.com", true },
{ "savageorgiev.com", true },
- { "savannahtasteexperience.com", true },
{ "save-me-aachen.de", true },
{ "save-me-koeln.de", true },
{ "savecrypto.org", true },
{ "savenet.org", true },
+ { "saveoney.ca", true },
{ "saveora.com", true },
{ "savetheinternet.eu", true },
{ "saveya.com", true },
{ "savic.com", true },
{ "saviezvousque.net", true },
{ "savingrecipe.com", true },
+ { "savingsoftheyear.com", true },
{ "savingsomegreen.com", true },
{ "savingsstoreonline.ca", true },
{ "savisasolutions.co.za", true },
@@ -31020,9 +32310,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sayrodigital.com", true },
{ "sayura.net", true },
{ "saz.sh", true },
+ { "sazavafest.cz", true },
{ "sazuz.cz", true },
{ "sb-group.dk", true },
{ "sb-mnn.com", true },
+ { "sb-sd.org", true },
{ "sb-tuning.ru", true },
{ "sb.im", true },
{ "sb.sb", true },
@@ -31042,6 +32334,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sbrownbourne.com", true },
{ "sbsavings.bank", true },
{ "sbsbaits.com", true },
+ { "sbscyber.com", true },
{ "sbsnursery.co.uk", true },
{ "sbsrv.ml", true },
{ "sbssoft.ru", true },
@@ -31050,9 +32343,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "scaarus.com", true },
{ "scaffalature.roma.it", true },
{ "scaffoldhireeastrand.co.za", true },
- { "scaffoldhirefourways.co.za", true },
- { "scaffoldhirerandburg.co.za", true },
- { "scaffoldhiresandton.co.za", true },
{ "scalacollege.nl", true },
{ "scalaire.com", true },
{ "scalaire.fr", true },
@@ -31070,13 +32360,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "scarafaggio.it", true },
{ "scatsbouncingcastles.ie", true },
{ "scbdh.org", true },
+ { "sceenfox.de", true },
{ "scelec.com.au", true },
{ "scenastu.pl", true },
{ "scene.mx", true },
{ "scenester.tv", true },
{ "scenicbyways.info", true },
+ { "scentofwine.com", true },
{ "scepticism.com", true },
- { "sceptique.eu", true },
{ "schadevergoedingen.eu", true },
{ "schaefer-reifen.de", true },
{ "schamlosharmlos.de", true },
@@ -31099,17 +32390,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "schenkes.de", true },
{ "scherfke.de", true },
{ "scheuchenstuel.at", true },
+ { "schgroup.com", true },
{ "schier.info", true },
{ "schil.li", true },
{ "schildbach.de", true },
{ "schillers-friedberg.de", true },
{ "schimmel-test.info", true },
{ "schippendale.de", true },
- { "schippers-it.nl", true },
{ "schizoids.net", true },
{ "schlachter.ca", true },
{ "schlaf.guru", true },
{ "schlafguru.com", true },
+ { "schlagenhauf.info", true },
+ { "schlagma.de", false },
+ { "schlarb.eu", true },
{ "schlarp.com", true },
{ "schlechtewitze.com", true },
{ "schlick.wedding", true },
@@ -31118,36 +32412,45 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "schlueter-software.de", true },
{ "schmaeh-coaching.ch", true },
{ "schmelle.me", true },
- { "schmelzle.io", true },
{ "schmetterlingsapp.at", true },
{ "schmid.tv", true },
{ "schmidthomes.com", true },
{ "schmidtplasticsurgery.com", true },
{ "schmitt-max.com", true },
- { "schmitz.link", true },
+ { "schnapke.name", true },
{ "schneeketten-ratgeber.de", true },
{ "schnegg.name", true },
+ { "schneidr.de", true },
{ "schneids.me", true },
{ "schnellno.de", true },
{ "schnellsuche.de", true },
{ "schnouki.net", true },
+ { "schnuckenhof-wesseloh.de", true },
{ "schnyder-werbung.ch", true },
{ "schoeck-elektro.de", true },
+ { "schoeller.click", true },
{ "schoepski.de", true },
{ "schoknecht.net", true },
{ "schoknecht.one", true },
+ { "schoko-ferien.de", true },
+ { "schokoferien.de", true },
{ "schokofoto.de", true },
{ "schokokeks.org", true },
+ { "scholar.group", true },
+ { "scholar.site", true },
{ "scholarly.com.ph", true },
{ "scholarly.ph", true },
{ "scholarnet.cn", true },
+ { "scholarshipplatform.com", true },
+ { "scholarshipsplatform.com", true },
+ { "scholarstyle.com", true },
{ "scholierenvervoerzeeland.nl", true },
{ "schollbox.de", false },
+ { "scholledev.com", true },
{ "scholz-kallies.de", true },
{ "schont.org", true },
{ "school-b.us", true },
{ "school-register.co.za", true },
- { "school.in.th", true },
{ "schoolbus.at", true },
{ "schoolcafe.com", true },
{ "schoolotzyv.ru", true },
@@ -31163,6 +32466,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "schrauger.org", true },
{ "schrauger.run", true },
{ "schraugerrun.com", true },
+ { "schreck-thomas.de", false },
{ "schreibers.ca", true },
{ "schreinerei-jahreis.de", true },
{ "schrenkinzl.at", true },
@@ -31176,6 +32480,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "schtiehve.duckdns.org", true },
{ "schubergphilis.com", true },
{ "schubertgmbh-ingelheim.de", true },
+ { "schuelerzeitung-ideenlos.de", true },
{ "schuhbeck.tk", true },
{ "schuhbedarf.de", true },
{ "schuhwerkstatt.at", true },
@@ -31198,14 +32503,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "schwabenhaus-ka.de", true },
{ "schwalliers.com", true },
{ "schwanke.in", true },
+ { "schwarz-gelbe-fuechse.de", true },
{ "schwarzegar.de", true },
{ "schwarzer.it", true },
{ "schwarzes-muenchen.de", true },
{ "schwarzhenri.ch", true },
{ "schwarztrade.cz", true },
{ "schwarzwald-flirt.de", true },
- { "schwedenhaus.ag", true },
- { "schwerkraftlabor.de", true },
{ "schwinabart.com", true },
{ "schwinger.me", true },
{ "schwinnbike.ru", true },
@@ -31228,19 +32532,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "scilifebiosciences.com", true },
{ "scimage.com", true },
{ "scintilla.nl", true },
+ { "scintillating.stream", true },
{ "scis.com.ua", true },
{ "scistarter.com", true },
{ "scitopia.me", true },
{ "scitopia.net", true },
{ "sckc.stream", false },
{ "sclns.co", true },
+ { "scohetal.de", true },
{ "scontogiusto.com", true },
{ "scoolcode.com", true },
{ "scoop6.co.uk", true },
{ "scootaloo.co.uk", true },
{ "scooterservis.com", true },
{ "scootfleet.com", true },
- { "scorobudem.ru", true },
+ { "scorerealtygroup.com", true },
{ "scorocode.ru", true },
{ "scorp13.com", true },
{ "scottgalvin.com", true },
@@ -31249,8 +32555,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "scotthelme.co.uk", true },
{ "scottishcu.org", true },
{ "scottishseniorsgolf.com", true },
+ { "scottlanderkingman.com", true },
{ "scottseditaacting.com", true },
{ "scottstorey.co.uk", true },
+ { "scotttopperproductions.com", true },
{ "scottynordstrom.org", false },
{ "scoutingridderkerk.nl", true },
{ "scoutingtungelroy.nl", true },
@@ -31259,6 +32567,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "scp500.com", true },
{ "scpartyentertainment.co.uk", true },
{ "scpslgame.com", true },
+ { "scra.gov", true },
{ "scrabble-solver.com", true },
{ "scrambox.com", true },
{ "scramget.com", true },
@@ -31269,7 +32578,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "scratchandscuffs.uk", true },
{ "scrayos.net", true },
{ "scredible.com", false },
+ { "screefox.de", true },
+ { "screen-fox.de", true },
{ "screen64.tk", true },
+ { "screenfax.de", true },
+ { "screenfox.eu", true },
+ { "screenfox.info", true },
+ { "screenfox.net", true },
{ "screenlight.tv", true },
{ "screenmachine.com", true },
{ "screenparadigm.com", true },
@@ -31281,6 +32596,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "scrisulfacebine.ro", true },
{ "scrod.me", true },
{ "scroll.in", true },
+ { "scrtch.fr", true },
{ "scrumbleship.com", true },
{ "scrumpus.com", true },
{ "scrumstack.co.uk", true },
@@ -31306,9 +32622,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sdayman.com", true },
{ "sdcardrecovery.de", true },
{ "sdg-tracker.org", true },
+ { "sdgllc.com", true },
{ "sdho.org", true },
{ "sdns.fr", true },
- { "sdocast.com", true },
{ "sdsi.us", true },
{ "sdsk.one", true },
{ "sdsmanagement.me", true },
@@ -31316,11 +32632,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sdvigpress.org", true },
{ "sdvx.net", true },
{ "sdxcentral.com", true },
+ { "se-booster.com", true },
{ "se-theories.org", true },
{ "se.com", true },
{ "se.search.yahoo.com", false },
{ "seac.me", true },
{ "seacam-store.com", true },
+ { "seachef.it", true },
+ { "seadus.ee", true },
{ "seafood.co.nz", true },
{ "seaholmwines.com", true },
{ "sealaw.com", true },
@@ -31353,6 +32672,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "searchcandy.uk", true },
{ "searchdatalogy.com", true },
{ "seareytraining.com", true },
+ { "searsucker.com", true },
{ "searx.ru", true },
{ "searx.xyz", true },
{ "seasidestudios.co.uk", true },
@@ -31375,12 +32695,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sebastiaanwijnimport.nl", true },
{ "sebastian-janich.de", true },
{ "sebastian-kraus.me", true },
- { "sebastian-lutsch.de", true },
+ { "sebastian-tobie.de", true },
{ "sebastian.expert", true },
{ "sebastianblade.com", true },
{ "sebastianboegl.de", true },
{ "sebastiaperis.com", true },
{ "sebasveeke.nl", true },
+ { "sebepoznani.eu", true },
+ { "sebi.cf", true },
{ "sebi.org", true },
{ "seby.io", true },
{ "sec-mails.de", true },
@@ -31392,11 +32714,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sec.red", true },
{ "sec3ure.co.uk", true },
{ "sec455.com", true },
- { "sec4share.me", true },
{ "sec530.com", true },
{ "sec555.com", true },
{ "secbone.com", true },
- { "secboom.com", true },
{ "seccom.ch", true },
{ "secctexasgiving.org", false },
{ "secgui.de", true },
@@ -31408,7 +32728,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "secondchancejobsforfelons.com", true },
{ "seconfig.sytes.net", true },
{ "secpatrol.de", true },
+ { "secpoc.online", true },
{ "secretar.is", true },
+ { "secretary-schools.com", true },
{ "secretsanta.fr", true },
{ "secretsdujeu.com", true },
{ "secretserveronline.com", true },
@@ -31452,16 +32774,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "security-brokers.com", true },
{ "security.gives", true },
{ "security.google.com", true },
- { "security.love", true },
{ "security201.co.uk", true },
{ "security201.com", true },
{ "securitycamerasaustin.net", true },
{ "securitycamerascincinnati.com", true },
{ "securitycamerasjohnsoncity.com", true },
+ { "securityescrownews.com", true },
{ "securityfest.com", true },
+ { "securitygladiators.com", true },
{ "securityheaders.com", true },
{ "securityheaders.io", true },
{ "securityheaders.nl", true },
+ { "securityindicators.com", true },
{ "securityinet.com", false },
{ "securitykey.co", true },
{ "securitypluspro.com", true },
@@ -31474,11 +32798,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "securitytrails.com", true },
{ "securitywithnick.com", true },
{ "securitywithoutborders.org", true },
+ { "securityzap.com", true },
{ "securocloud.com", true },
{ "secutrans.com", true },
{ "secuvera.de", false },
{ "secwall.me", true },
- { "secwise.nl", true },
{ "secyourity.se", true },
{ "sedeusquiser.net", true },
{ "sedlakovalegal.com", true },
@@ -31487,24 +32811,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "see.wtf", true },
{ "seeclop.ch", true },
{ "seedandleisure.co.uk", true },
+ { "seedisclaimers.com", true },
{ "seednode.co", true },
{ "seedsofangelica.net", true },
{ "seekers.ch", true },
{ "seeks.ru", true },
{ "seekthe.net", true },
{ "seemeagain.com", true },
- { "seemeasaperson.com", true },
{ "seesuite.com", true },
{ "seewhatididhere.com", true },
{ "seeworkdone.com", true },
{ "seezeitlodge-bostalsee.de", true },
{ "sefru.de", true },
{ "seg-leipzig.org", true },
+ { "seg-sys.com", true },
{ "segaretro.org", true },
{ "segitz.de", true },
{ "segmetic.com", true },
{ "segulink.com", true },
+ { "seguridadconsumidor.gov", true },
+ { "seguros-de-salud-y-vida.com", true },
{ "segurosbalboa.com.ec", false },
+ { "segurosdecarroshialeah.org", true },
+ { "segurosdevidamiami.org", true },
{ "segurosocial.gov", true },
{ "seguroviagem.srv.br", false },
{ "sehnenweh.org", true },
@@ -31512,7 +32841,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "seibu-kikaku.co.jp", true },
{ "seifried.org", true },
{ "seikatu-navi.com", true },
- { "seiko-dojo.com", true },
{ "seinfeldquote.com", true },
{ "seirei.ne.jp", true },
{ "seiryokuzai-ch.com", true },
@@ -31545,6 +32873,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "selfoutlet.com", true },
{ "selkiemckatrick.com", true },
{ "sellajoch.com", true },
+ { "selldurango.com", true },
{ "sellguard.pl", true },
{ "sellme.biz", true },
{ "sellmoretires.com", true },
@@ -31562,7 +32891,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "semianalog.com", true },
{ "seminariruum.ee", true },
{ "semiocast.com", true },
+ { "semirben.de", true },
{ "semiread.com", true },
+ { "semjonov.de", true },
{ "semmlers.com", true },
{ "semox.de", true },
{ "semps-2fa.de", true },
@@ -31581,20 +32912,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sendinvoice.nl", true },
{ "sendonce.io", true },
{ "sendthisfile.com", true },
+ { "sendtrix.nl", true },
{ "sendway.com", true },
{ "sendya.me", true },
+ { "senego.com", true },
{ "senekalstorageman.co.za", true },
{ "sengokulife.com", true },
{ "seniorem.eu", true },
+ { "seniorhomepurchaseprogram.com", true },
{ "seniors.singles", true },
{ "senmendai-reform.com", true },
+ { "senmonsyoku.top", true },
{ "sennase.net", true },
+ { "senobio.com", true },
{ "senorcontento.com", true },
+ { "sens2lavie.com", true },
{ "sensebridge.com", true },
{ "sensebridge.net", true },
{ "sensepixel.com", true },
{ "senshudo.tv", true },
{ "sensoft-int.net", true },
+ { "sensound.ml", true },
{ "sentandsecure.com", true },
{ "sentic.info", true },
{ "sentidosdelatierra.org", true },
@@ -31604,6 +32942,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sentry.nu", true },
{ "senzaparole.de", true },
{ "seo-analyse.com", true },
+ { "seo-dr-it.com", true },
{ "seo-linz.at", true },
{ "seo-nerd.de", true },
{ "seo-portal.de", true },
@@ -31611,6 +32950,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "seo.london", true },
{ "seo.tl", true },
{ "seoagentur2go.de", true },
+ { "seoankara.name.tr", true },
{ "seobutler.com", true },
{ "seocomposer.com", true },
{ "seoenmexico.com.mx", true },
@@ -31620,16 +32960,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "seohouston.com", true },
{ "seoinc.com", true },
{ "seoium.com", true },
- { "seokay.com", true },
+ { "seojames.com", true },
{ "seolib.org", true },
{ "seomarketing.bg", true },
{ "seon.me", true },
{ "seoprovider.nl", true },
{ "seoquake.com", true },
+ { "seosec.xyz", false },
{ "seosof.com", true },
{ "seostepbysteplab.com", true },
{ "seoul.dating", true },
{ "seouniversity.org", true },
+ { "seovision.se", true },
{ "sepalandseed.com", true },
{ "seppelec.com", true },
{ "seproco.com", true },
@@ -31651,14 +32993,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "serf.io", true },
{ "sergeemond.ca", true },
{ "sergefonville.nl", true },
+ { "sergeyreznikov.com", true },
{ "sergije-stanic.me", true },
{ "sergiosantoro.it", true },
{ "sergiozygmunt.com", true },
- { "sergivb01.me", true },
{ "sergos.de", true },
{ "serialexperiments.co.uk", true },
{ "serienstream.to", true },
{ "serigraphs.co.uk", true },
+ { "serinamusic.com", true },
{ "seriousclimbing.com", true },
{ "seriouss.am", true },
{ "sernate.com", true },
@@ -31683,6 +33026,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "serverd.de", true },
{ "serverexpose.com", true },
{ "serverfrog.de", true },
+ { "serverlog.net", true },
{ "serveroffline.net", false },
{ "serverpedia.de", true },
{ "servers4all.co.uk", true },
@@ -31694,9 +33038,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "servethecity-karlsruhe.de", true },
{ "servettorna.com", true },
{ "servgate.jp", true },
- { "service-wueste-vodafone.tk", true },
+ { "service.gov.uk", true },
{ "servicebeaute.fr", true },
{ "serviceboss.de", true },
+ { "servicemembers.gov", true },
{ "servida.ch", true },
{ "serviettenhaus.de", true },
{ "servingbaby.com", true },
@@ -31721,12 +33066,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "settberg.de", true },
{ "setterirlandes.com.br", true },
{ "settleapp.co", true },
+ { "setuid.io", true },
{ "setuid0.kr", true },
{ "setyoursite.nl", true },
{ "seva.fashion", true },
{ "seven-purple.com", true },
{ "sevencooks.com", true },
- { "sevenet.pl", true },
{ "sevenhillsapartments.com.au", true },
{ "sevenicealimentos.com.br", true },
{ "sevenmatches.com", true },
@@ -31734,16 +33079,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "severine-trousselard.com", true },
{ "severntrentinsuranceportal.com", true },
{ "sevinci.ch", true },
- { "sevsey.ru", true },
- { "sevsopr.ru", true },
+ { "sewa.nu", true },
{ "sewafineseam.com", true },
{ "sewinginsight.com", true },
{ "sewoo.co.uk", true },
{ "sexaki.com", true },
{ "sexdocka.nu", true },
+ { "sexflare.net", true },
{ "sexmobil.de", true },
{ "sexservice.io", true },
- { "sexshopnet.com.br", true },
{ "sexy-store.nl", true },
{ "seyfarth.de", true },
{ "seyr.me", true },
@@ -31751,6 +33095,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sfaparish.org", true },
{ "sfcomercio.com.br", true },
{ "sfdev.ovh", true },
+ { "sfg-net.com", true },
+ { "sfg-net.eu", true },
+ { "sfg-net.net", true },
+ { "sfg-net.org", true },
{ "sfg-nordholz.de", true },
{ "sfile.eu", true },
{ "sfirat-haomer.com", true },
@@ -31763,21 +33111,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sgb.co", true },
{ "sgcaccounts.co.uk", true },
{ "sgi.org", true },
+ { "sgitc.de", true },
{ "sglibellen.de", true },
{ "sgroup-hitoduma.com", true },
{ "sgroup-rec.com", true },
{ "sgs-systems.de", true },
{ "sgs.camera", true },
+ { "sgs.systems", true },
{ "sgsp.nl", true },
{ "sgtcodfish.com", true },
- { "sgtsnookums.net", true },
{ "sgtt.ch", true },
{ "sgutranscripts.org", true },
{ "sh-heppelmann.de", true },
- { "sh-network.de", false },
+ { "sh-network.de", true },
{ "sh0rt.zone", true },
{ "sh0shin.org", true },
- { "shaaaaaaaaaaaaa.com", true },
+ { "shaadithailand.com", true },
{ "shad.waw.pl", true },
{ "shadesofgrayadr.com", true },
{ "shadesofgraylaw.com", true },
@@ -31794,6 +33143,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shadowsocks.com.hk", true },
{ "shadowsocks.fr", false },
{ "shadowsocks.la", true },
+ { "shadowsocks.se", true },
{ "shadowsocks.to", true },
{ "shadowstack.de", true },
{ "shadwe.com", true },
@@ -31804,6 +33154,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shaicoleman.com", true },
{ "shainessim.com", true },
{ "shakan.ch", true },
+ { "shaken-kyoto.jp", true },
{ "shaken110.com", true },
{ "shakepeers.org", false },
{ "shakerwebdesign.net", true },
@@ -31815,11 +33166,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shalyapin.by", true },
{ "shamara.info", true },
{ "shamariki.ru", true },
+ { "shamka.ru", true },
+ { "shampoo63.ru", true },
{ "shan.io", false },
{ "shan.si", true },
{ "shanahanstrategy.com", true },
{ "shanetully.com", true },
{ "shanewadleigh.com", true },
+ { "shangzhen.site", true },
{ "shankangke.com", true },
{ "shannoneichorn.com", true },
{ "shansing.cn", true },
@@ -31831,7 +33185,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "share.works", true },
{ "sharealo.org", true },
{ "sharedhost.de", true },
- { "shareeri.com", true },
+ { "sharejoy.cn", false },
{ "sharekey.com", false },
{ "sharelovenotsecrets.com", true },
{ "shareoffice.ch", true },
@@ -31839,17 +33193,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sharescope.co.uk", false },
{ "shareselecttools.com", true },
{ "sharevari.com", true },
- { "shariahlawcenter.com", true },
- { "shariahlawcenter.org", true },
- { "sharialawcenter.com", true },
- { "sharialawcenter.org", true },
{ "sharisharpe.com", true },
+ { "shark.cat", true },
{ "shark5060.net", true },
{ "sharkie.org.za", true },
- { "sharperedge.pw", true },
- { "sharperedgecomputers.com", true },
{ "sharu.me", true },
- { "shasso.com", true },
+ { "sharvey.ca", true },
+ { "shattered-souls.de", true },
{ "shaun.net", true },
{ "shaunandamyswedding.com", true },
{ "shaunc.com", true },
@@ -31861,12 +33211,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shawnhogan.com", true },
{ "shawnow.com", true },
{ "shawnwilkerson.com", true },
- { "shazbots.org", true },
{ "shazzlemd.com", true },
{ "shazzlepro.com", true },
{ "sheaf.site", true },
{ "shearcomfort.com", true },
{ "sheaspire.com.tw", true },
+ { "shee.org", true },
{ "sheehyinfinitioftysonsparts.com", true },
{ "sheekdeveloper.com", true },
{ "sheekmedia.com", true },
@@ -31880,22 +33230,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shelleystoybox.com", true },
{ "shellfire.de", true },
{ "shellgame.io", true },
+ { "shellj.me", true },
{ "shelljuggler.com", false },
{ "shellot.com", true },
{ "shellshock.eu", true },
{ "shellvatore.us", true },
{ "shemsconseils.ma", true },
- { "shena.co.uk", true },
+ { "shengbao.org", true },
{ "shenghaiautoparts.com", true },
{ "shenghaiautoparts.net", true },
{ "shengrenyu.com", true },
+ { "shens.ai", true },
{ "shenyuqi.com", false },
{ "sherbers.de", true },
{ "sherrikehoetherapy.com", true },
+ { "sherut.net", true },
{ "shft.cl", true },
+ { "shg-pornographieabhaengigkeit.de", false },
{ "shgroup.xyz", true },
{ "shgt.jp", true },
{ "shh-listen.com", true },
+ { "shh.sh", true },
{ "shiawasedo.co.jp", true },
{ "shichibukai.net", true },
{ "shico.org", true },
@@ -31907,30 +33262,37 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shiftdevices.com", true },
{ "shiftj.is", true },
{ "shiftleft.org", true },
- { "shiftpsych.com", true },
+ { "shiftpsych.com", false },
{ "shiga1.jp", true },
{ "shihadwiki.com", true },
{ "shijing.me", true },
{ "shikimori.one", true },
{ "shikimori.org", true },
{ "shimi.net", true },
+ { "shimo.im", true },
{ "shin-inc.jp", true },
{ "shinghoi.com", true },
+ { "shinglereplacementlv.com", true },
{ "shining.gifts", true },
+ { "shinko-osaka.jp", true },
{ "shinnyosangha.org", false },
{ "shinonome-lab.eu.org", true },
+ { "shinsyo.com", true },
{ "shintoism.com", true },
{ "shinyuu.net", true },
{ "shipard.com", true },
{ "shipard.cz", true },
{ "shipcloud.io", true },
+ { "shippercenter.info", true },
{ "shiqi.ca", true },
+ { "shiqi.one", true },
{ "shiqisifu.cc", true },
{ "shirakaba-cc.com", true },
{ "shirao.jp", true },
{ "shirt2go.shop", true },
{ "shirtsdelivered.com", true },
{ "shirtsofholland.com", true },
+ { "shiseki.top", true },
{ "shishamania.de", true },
{ "shishkabobnc.com", true },
{ "shishkin.us", true },
@@ -31943,6 +33305,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shitsta.in", true },
{ "shivamber.com", true },
{ "shivammaheshwari.com", true },
+ { "shivammathur.com", true },
{ "shivatattvayoga.com", true },
{ "shlmail.info", true },
{ "shmibbles.me", true },
@@ -31951,7 +33314,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shock.ee", true },
{ "shockercityservices.com", true },
{ "shodan.io", true },
- { "shoemuse.com", true },
{ "shoeracks.uk", true },
{ "shoestringeventing.co.uk", true },
{ "shokola.com", true },
@@ -31959,7 +33321,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shooter.dog", true },
{ "shop-hellsheadbangers.com", true },
{ "shop-s.net", true },
- { "shop.fr", true },
+ { "shop4d.com", true },
{ "shopadvies.nl", true },
{ "shopalike.cz", true },
{ "shopalike.dk", true },
@@ -31981,16 +33343,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shopcoupons.ph", true },
{ "shopcoupons.sg", true },
{ "shophisway.com", true },
+ { "shopific.co", true },
{ "shopify.com", true },
{ "shopifycloud.com", true },
{ "shopkini.com", true },
{ "shoplandia.co", true },
+ { "shopperexperts.com", true },
{ "shoppia.se", true },
{ "shopping24.de", true },
{ "shoppr.dk", true },
{ "shopregional.com.br", true },
{ "shopsouthafrican.com", true },
{ "shopstart.dk", true },
+ { "shopstasy.com", true },
{ "shoptec.sk", true },
{ "shorebreaksecurity.com", true },
{ "shorehamfort.co.uk", true },
@@ -32001,7 +33366,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shorten.ninja", true },
{ "shoshin-aikido.de", true },
{ "shoshin.technology", true },
- { "shota.vip", true },
{ "shotbow.net", true },
{ "shotonwhat.com", true },
{ "shouldihookupwithmybarista.com", true },
@@ -32009,6 +33373,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shovonhasan.com", true },
{ "show-saratov.ru", false },
{ "showbits.net", true },
+ { "showersnet.com", true },
{ "showf.om", true },
{ "showmax.com", true },
{ "showmethemoney.ru", true },
@@ -32016,13 +33381,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "showroom.cam", true },
{ "showroom.co.uk", true },
{ "showroom.uk", true },
+ { "showroom113.ru", true },
{ "showsonar.com", true },
+ { "shoxmusic.net", false },
{ "shredriteservices.com", true },
{ "shreyansh26.me", true },
{ "shrike.me", false },
{ "shrinidhiclinic.in", true },
{ "shrinkhub.com", true },
{ "shrub.ca", true },
+ { "shrug.ml", true },
+ { "shtaketniki.kz", true },
+ { "shtaketniki.ru", true },
{ "shteiman.net", true },
{ "shu-fu.net", true },
{ "shuffleradio.nl", true },
@@ -32030,16 +33400,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "shukatsu-support.jp", true },
{ "shulan.moe", true },
{ "shuletime.ml", true },
+ { "shulker.store", true },
{ "shura.eu.org", true },
{ "shuro.de", true },
{ "shuset.dk", true },
{ "shushu.media", true },
{ "shutter-shower.com", true },
- { "shuvo.rocks", true },
{ "shuvodeep.de", true },
{ "shux.pro", true },
{ "shwrm.ch", true },
- { "shybynature.com", true },
{ "shyuka.me", true },
{ "si-benelux.nl", true },
{ "si.to", true },
@@ -32048,6 +33417,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "siamsnus.com", true },
{ "sianbryn.co.uk", true },
{ "sianjhon.com", true },
+ { "siberas.de", true },
{ "sibfk.org", true },
{ "sibiutourguide.com", true },
{ "sibrenvasse.nl", true },
@@ -32066,19 +33436,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sidepodcast.com", true },
{ "sidepodcastdaily.com", true },
{ "sidepodcastextra.com", true },
- { "sideropolisnoticias.com.br", true },
{ "sideshowbarker.net", true },
{ "sidium.de", true },
{ "sidnicio.us", true },
{ "sidonge.com", true },
{ "sidongkim.com", true },
+ { "sidpod.ru", true },
+ { "siduga.com", true },
{ "siegemund-frankfurt.de", true },
{ "siel.nl", true },
{ "sielsystems.nl", true },
{ "sientemendoza.com.ar", true },
{ "sierpinska.co", true },
{ "sierpinska.eu", true },
- { "sieulog.com", true },
{ "siewert-kau.de", true },
{ "sift-tool.org", true },
{ "sig6.org", true },
@@ -32103,11 +33473,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "significantbanter.com", true },
{ "signing-milter.org", true },
{ "signix.net", true },
- { "signsdance.uk", true },
{ "signtul.com", false },
{ "sigsrv.net", true },
{ "sigterm.no", true },
{ "sigterm.sh", true },
+ { "sigurnost.online", true },
{ "siirtutkusu.com", true },
{ "sikayetvar.com", false },
{ "sikevux.se", true },
@@ -32118,9 +33488,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "silaslova-ekb.ru", true },
{ "silent-clean.de", true },
{ "silent-yachts.com", true },
+ { "silent.live", false },
{ "silentkernel.fr", false },
{ "silentundo.org", true },
{ "silerfamily.net", true },
+ { "silica-project.com", true },
+ { "silica-project.jp", true },
{ "silicon-north.com", true },
{ "silicon-vision.com", true },
{ "siliconchip.me", true },
@@ -32133,11 +33506,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "siloportem.net", true },
{ "silsha.me", true },
{ "silv.me", true },
- { "silvacor-ziegel.de", true },
{ "silver-heart.co.uk", true },
- { "silverartcollector.com", true },
{ "silverbowflyshop.com", true },
{ "silverdragonart.com", true },
+ { "silverfirsdental.com", true },
{ "silvergoldbull.be", true },
{ "silvergoldbull.bg", true },
{ "silvergoldbull.bj", true },
@@ -32185,7 +33557,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "silvergoldbull.ky", true },
{ "silvergoldbull.li", true },
{ "silvergoldbull.lk", true },
- { "silvergoldbull.lt", true },
{ "silvergoldbull.lv", true },
{ "silvergoldbull.ma", true },
{ "silvergoldbull.mk", true },
@@ -32213,14 +33584,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "silverkingalaska.com", true },
{ "silverlinkz.net", true },
{ "silverseen.com", true },
+ { "silverswanrecruitment.com", true },
{ "silverwind.io", true },
{ "silvine.xyz", true },
{ "silvobeat.blog", true },
{ "silvobeat.com", true },
{ "sim-karten.net", true },
{ "sim-minaoshi.jp", true },
- { "sim-sim.appspot.com", true },
- { "sim4seed.org", true },
{ "simam.de", true },
{ "simark.ca", true },
{ "simbeton.nl", true },
@@ -32233,6 +33603,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "simivalleylandscapelighting.com", true },
{ "simivalleylighting.com", true },
{ "simivalleyoutdoorlighting.com", true },
+ { "simkova-reality.cz", true },
{ "simlau.net", true },
{ "simmis.fr", true },
{ "simoesgoulart.com.br", true },
@@ -32248,7 +33619,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "simonfischer.info", true },
{ "simonhirscher.de", true },
{ "simonkjellberg.com", true },
- { "simonkjellberg.se", true },
{ "simonlyabonnement.nl", true },
{ "simonmaddox.com", true },
{ "simonpaarlberg.com", true },
@@ -32266,7 +33636,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "simpip.com", true },
{ "simple.com", false },
{ "simpleclassiclife.com", true },
+ { "simplecmsdemo.com", true },
+ { "simplecoding.click", true },
{ "simplecontacts.com", true },
+ { "simplecrypt.io", true },
{ "simplednscrypt.org", true },
{ "simpleindianrecipes.com", true },
{ "simpleinout.com", true },
@@ -32291,6 +33664,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "simplyrara.com", true },
{ "simplyregister.net", true },
{ "simplystudio.com", true },
+ { "simplytiles.com", true },
{ "simpte.com", true },
{ "simpul.nl", true },
{ "simrail.nl", true },
@@ -32305,6 +33679,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sinde.ru", true },
{ "sinefili.com", true },
{ "sinergy.ch", true },
+ { "sinfonietta-meridiana.de", true },
{ "sinfulforums.net", true },
{ "singaporemint.com", true },
{ "singapurfirma.com", true },
@@ -32327,6 +33702,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sinterama.biz", true },
{ "sinuelovirtual.com.br", true },
{ "sioeckes.hu", true },
+ { "sion.info", true },
+ { "sipc.org", true },
+ { "sipstix.co.za", true },
{ "siratalmustaqim.com", true },
{ "siraweb.org", true },
{ "sirbouncealotcastles.co.uk", true },
@@ -32340,6 +33718,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sirtaptap.com", true },
{ "sirtuins.com", true },
{ "sirvoy.com", true },
+ { "sisiengineers.gq", true },
{ "sisseastumine.ee", true },
{ "sistel.es", true },
{ "sistem-maklumat.com", true },
@@ -32353,6 +33732,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sit-brn.ru", true },
{ "sit.ec", true },
{ "sitc.sk", true },
+ { "site-helper.com", true },
{ "sitebuilderreport.com", true },
{ "sitedrive.fi", true },
{ "sitefactory.com.br", true },
@@ -32375,20 +33755,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sj-leisure.com", true },
{ "sjaakgilsingfashion.nl", true },
{ "sjatsh.com", true },
+ { "sjbwoodstock.org", true },
{ "sjd.is", true },
+ { "sjdaws.com", true },
{ "sjis.me", true },
{ "sjleisure.co.uk", true },
{ "sjoorm.com", true },
- { "sjsmith.id.au", true },
{ "sjv4u.ch", true },
- { "sjzebs.com", true },
- { "sjzget.com", true },
- { "sjzybs.com", true },
{ "sk-net.cz", true },
{ "skala.io", true },
{ "skalarwelle.eu", true },
{ "skanvordoff.ru", true },
{ "skanword.info", true },
+ { "skarox.ee", true },
{ "skatclub-beratzhausen.de", true },
{ "skatesins.ch", true },
{ "skatingchina.com", true },
@@ -32405,6 +33784,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "skhire.co.uk", true },
{ "skhoop.cz", true },
{ "skia.org", false },
+ { "skiblandford.com", true },
{ "skid.church", true },
{ "skiddle.com", true },
{ "skifairview.com", true },
@@ -32417,9 +33797,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "skillmoe.at", true },
{ "skillout.org", true },
{ "skills2serve.org", true },
- { "skills2services.com", true },
{ "skillseo.com", true },
- { "skimming.net", true },
{ "skin-cosmetic.eu", true },
{ "skincare-note.com", true },
{ "skincases.co", true },
@@ -32434,6 +33812,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "skippy.dog", true },
{ "skischule-wildewiese.de", true },
{ "skizzen-zeichnungen.de", true },
+ { "skk.moe", true },
{ "skks.cz", true },
{ "sklepsamsung.pl", true },
{ "sklepwielobranzowymd.com", true },
@@ -32448,6 +33827,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "skommettiamo.it", true },
{ "skontakt.cz", true },
{ "skontorp-enterprise.no", true },
+ { "skoolergraph.azurewebsites.net", true },
{ "skortekaas.nl", false },
{ "skory.us", true },
{ "skou.dk", false },
@@ -32484,12 +33864,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "skynetz.tk", true },
{ "skype.com", true },
{ "skyquid.co.uk", true },
- { "skyris.co", true },
{ "skys-entertainment.com", true },
+ { "skyscapecanopies.com", true },
{ "skysuite.nl", true },
{ "skyynet.de", true },
{ "skyzimba.com.br", true },
{ "sl-bildermacher.de", true },
+ { "sl-informatique.ovh", true },
{ "sl0.us", true },
{ "sl899.com", true },
{ "sl998.com", true },
@@ -32503,6 +33884,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "slane.cn", false },
{ "slangbellor.com", true },
{ "slanterns.net", true },
+ { "slapen17.nl", true },
{ "slash32.co.uk", true },
{ "slashcrypto.org", true },
{ "slate.to", true },
@@ -32526,12 +33908,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "slevermann.de", true },
{ "slevomat.cz", true },
{ "slicklines.co.uk", true },
- { "slicss.com", true },
{ "slidebatch.com", true },
{ "slides.zone", true },
{ "slik.ai", true },
{ "slim-slender.com", true },
- { "slimk1nd.nl", true },
{ "slimspots.com", true },
{ "slingo-sta.com", true },
{ "slingooriginals.com", true },
@@ -32539,12 +33919,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "slink.hr", true },
{ "slip-gaming.tk", true },
{ "slneighbors.org", true },
+ { "slo-net.net", true },
{ "slo-tech.com", true },
{ "sloancom.com", true },
{ "slonep.net", true },
{ "slopeedge.com", true },
{ "slotarazzi.com", true },
- { "slotboss.co.uk", true },
{ "slotcar.com", false },
{ "slotfara.com", true },
{ "slotfara.net", true },
@@ -32556,12 +33936,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "slpower.com", true },
{ "slrd-isperih.com", true },
{ "sluciaconstruccion.com", true },
+ { "sluimann.de", true },
{ "sluitkampzeist.nl", false },
{ "slusham.com", true },
{ "slvh.fr", true },
{ "slwilde.ca", true },
{ "slxh.eu", true },
{ "slxh.nl", true },
+ { "slysend.com", true },
{ "sm-supplements.gr", true },
{ "sm.ms", true },
{ "sm2016.ch", true },
@@ -32571,9 +33953,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "smadav.ml", true },
{ "smakassen.no", true },
{ "smallcloudsolutions.co.za", true },
+ { "smalldata.tech", true },
{ "smalldogbreeds.net", true },
{ "smalle-voet.de", true },
{ "smallhadroncollider.com", true },
+ { "smallpath.me", true },
{ "smalltalkconsulting.com", true },
{ "smaltimento-rifiuti.org", true },
{ "smaltimento.caserta.it", true },
@@ -32612,19 +33996,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "smartmarketingcoaching.com", true },
{ "smartmeal.ru", true },
{ "smartmessages.net", true },
+ { "smartmompicks.com", true },
{ "smartmomsmartideas.com", true },
{ "smartpass.government.ae", true },
{ "smartphonechecker.co.uk", true },
{ "smartpolicingplatform.com", true },
- { "smartrade.tech", true },
{ "smartrecruit.ro", true },
{ "smartservices.nl", true },
{ "smartshiftme.com", true },
{ "smartship.co.jp", true },
{ "smartsparrow.com", true },
+ { "smartthursday.hu", true },
{ "smartvideo.io", true },
{ "smartviewing.com", true },
- { "smartwank.com", true },
{ "smartwelve.com", true },
{ "smartwoodczech.cz", true },
{ "smartwurk.nl", false },
@@ -32636,6 +34020,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sme-gmbh.net", true },
{ "smeetsengraas.com", true },
{ "smeso.it", true },
+ { "smiatek.name", true },
{ "smileandpay.com", true },
{ "smiledirectsales.com", true },
{ "smilessoftplay.co.uk", true },
@@ -32648,13 +34033,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "smit.com.ua", true },
{ "smit.ee", true },
{ "smith.co", true },
+ { "smith.is", true },
{ "smithandcanova.co.uk", false },
- { "smithchow.com", true },
+ { "smithchung.eu", true },
{ "smithfieldbaptist.org", true },
{ "smkw.com", false },
{ "smm.im", true },
{ "smmlaba.io", true },
{ "smokeandmirrors.agency", true },
+ { "smokefree.gov", true },
+ { "smokeus.dk", true },
{ "smol.cat", true },
{ "smoo.st", true },
{ "smoothcomp.com", true },
@@ -32674,31 +34062,38 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "smsben.net", true },
{ "smsbrana.cz", true },
{ "smsg-dev.ch", true },
+ { "smsinger.com", true },
{ "smsk.email", true },
{ "smsk.io", true },
{ "smskeywords.co.uk", true },
{ "smskmail.com", true },
{ "smsprivacy.org", true },
- { "smtpdev.com", true },
+ { "smspujcka24.eu", true },
+ { "smtp.in.th", true },
{ "smuncensored.com", true },
{ "smutba.se", true },
{ "smutek.net", true },
+ { "smvcm.com", true },
{ "smx.net.br", true },
{ "snackbesteld.nl", true },
{ "snafu.cz", true },
{ "snakafya.com", true },
{ "snap.com", true },
{ "snapappointments.com", true },
+ { "snapappts.com", true },
{ "snapchat.com", true },
{ "snapfinance.com", true },
{ "snapserv.ch", true },
{ "snapserv.net", true },
{ "snaptier.co", true },
+ { "snaptools.io", true },
+ { "snargol.com", true },
{ "snatch.com.ua", true },
- { "snazel.co.uk", true },
+ { "snazel.co.uk", false },
{ "snazzie.nl", true },
{ "sncdn.com", true },
{ "sndbouncycastles.co.uk", true },
+ { "sneak.berlin", true },
{ "sneakpod.de", true },
{ "sneakynote.com", true },
{ "sneakypaw.com", true },
@@ -32707,7 +34102,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sneedit.com", true },
{ "sneedit.de", true },
{ "sneeuwhoogtes.eu", true },
- { "sneezry.com", true },
+ { "snegozaderzhatel.ru", true },
{ "snel4u.nl", true },
{ "snelbv.nl", true },
{ "snelshops.nl", true },
@@ -32718,6 +34113,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sngallery.co.uk", true },
{ "sngeo.com", true },
{ "sniderman.eu.org", true },
+ { "sniderman.org", true },
{ "sniderman.us", true },
{ "sniep.net", true },
{ "snight.co", true },
@@ -32725,13 +34121,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "snip.run", true },
{ "snippet.wiki", true },
{ "snl.no", true },
+ { "sno-kingroofing-gutters.com", true },
{ "snoerendevelopment.nl", true },
+ { "snohomishsepticservice.com", true },
{ "snopyta.com", true },
{ "snoringhq.com", true },
{ "snote.io", true },
{ "snoupon.com", true },
{ "snow-online.com", true },
- { "snow-online.de", true },
{ "snow.dog", true },
{ "snowalerts.eu", true },
{ "snowalerts.nl", true },
@@ -32742,9 +34139,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "snowhaze.com", true },
{ "snowpak.com", true },
{ "snowpaws.de", true },
+ { "snowplane.net", true },
+ { "snowraven.de", true },
{ "snowy.land", true },
{ "snowyluma.me", true },
+ { "snperformance.gr", true },
+ { "snrat.com", true },
{ "snrub.co", true },
+ { "sntravel.co.uk", true },
{ "snuff.porn", true },
{ "snughealth.org.uk", true },
{ "sny.no", true },
@@ -32760,6 +34162,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sobie.ch", true },
{ "sobieray.dyndns.org", true },
{ "sobotkama.eu", true },
+ { "sobreporcentagem.com", true },
{ "soc.net", true },
{ "soccorso-stradale.org", true },
{ "sochi-sochno.ru", true },
@@ -32767,6 +34170,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "social-events.net", false },
{ "social-media-strategies.it", true },
{ "social-media-strategy.org.uk", true },
+ { "social-work-colleges.com", true },
{ "socialhams.net", true },
{ "socializam.com", true },
{ "socialmarketingday.nl", true },
@@ -32783,12 +34187,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "societyhilldance.com", true },
{ "socioambiental.org", true },
{ "sociobiology.com", true },
+ { "sociology-schools.com", true },
{ "sociopathy.org", true },
{ "sockeye.io", true },
{ "sockscap64.com", true },
{ "socoastal.com", true },
{ "sodadigital.com.au", true },
{ "sodafilm.de", true },
+ { "sodamakerclub.com", true },
{ "sodexam.pro", true },
{ "sodi.nl", true },
{ "sodiao.cc", true },
@@ -32797,7 +34203,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sofabedshop.de", true },
{ "sofiavanmoorsel.com", true },
{ "sofort.com", true },
+ { "sofortimplantate-muenchen.de", true },
{ "sofortueberweisung.de", true },
+ { "soft41.ru", true },
{ "softandbouncy.co.uk", true },
{ "softanka.com", true },
{ "softart.club", true },
@@ -32805,6 +34213,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "softbebe.com", true },
{ "softcreatr.de", false },
{ "softonic.com", true },
+ { "softonic.com.br", true },
+ { "softonic.jp", true },
+ { "softonic.pl", true },
{ "softplay4hire.co.uk", true },
{ "softprayog.in", true },
{ "softrobot.se", true },
@@ -32813,6 +34224,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "software.rocks", true },
{ "softwarebetrieb.de", true },
{ "softwaredesign.foundation", false },
+ { "softwarehardenberg.nl", true },
{ "softwarevoortherapeuten.nl", true },
{ "softwaylancing.com", true },
{ "softwerk-edv.de", true },
@@ -32859,6 +34271,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "solicafe.at", true },
{ "solidarita-kosovo.net", true },
{ "solidshield.com", true },
+ { "solidtuesday.com", true },
{ "solihullcarnival.co.uk", true },
{ "solihullinflatables.com", true },
{ "solihulllionsclub.org.uk", true },
@@ -32873,12 +34286,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "solos.im", true },
{ "solsocog.de", true },
{ "soluphant.de", true },
- { "solus-project.com", true },
{ "solutionhoisthire.com.au", true },
{ "solvation.de", true },
{ "solve-it.se", true },
{ "solved.tips", true },
{ "solvemethod.com", true },
+ { "solvewebmedia.com", true },
{ "solvingproblems.com.au", true },
{ "solvops.com", true },
{ "somaini.li", true },
@@ -32888,10 +34301,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "somanao.com", true },
{ "somcase.com.br", true },
{ "somecrazy.com", true },
- { "somepills.com", true },
{ "somersetscr.nhs.uk", true },
{ "somersetwellbeing.nhs.uk", true },
+ { "somethingsketchy.net", true },
{ "sommefeldt.com", true },
+ { "somoshuemul.cl", true },
{ "somosnoticia.com.br", true },
{ "sompani.com", true },
{ "somuchbetterwithage.com", true },
@@ -32903,6 +34317,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sondergaard.de", true },
{ "sondersobk.dk", true },
{ "songluck.com", true },
+ { "songshuzuoxi.com", true },
{ "songsmp3.co", true },
{ "songsmp3.com", true },
{ "songsmp3.info", true },
@@ -32924,13 +34339,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "soohealthy.nl", true },
{ "soomee.be", true },
{ "soomee1.be", true },
+ { "soontm.de", true },
+ { "soontm.net", true },
{ "soopure.nl", true },
+ { "sooscreekdental.com", true },
+ { "soothemobilemassage.com.au", true },
{ "soph.jp", true },
{ "soph.us", true },
{ "sopher.io", true },
{ "sophiaandmatt.co.uk", true },
{ "sophiakligys.com", true },
{ "sophieandtrey.com", true },
+ { "sopo.me", true },
{ "soprabalao.com.br", true },
{ "soquee.net", true },
{ "sor.so", true },
@@ -32939,20 +34359,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sorakumo.jp", true },
{ "sorcix.com", true },
{ "sorellecollection.com.au", true },
+ { "soren.xyz", true },
{ "sorenstudios.com", true },
{ "sorever.online", true },
+ { "sorincocorada.ro", true },
{ "sorrowfulunfounded.com", true },
{ "sortesim.com.br", true },
{ "soruly.com", true },
{ "sorz.org", true },
+ { "sos-elettricista.it", true },
{ "sos-falegname.it", true },
{ "sos-idraulico.it", true },
+ { "sos-muratore.it", true },
+ { "sosesh.shop", true },
{ "sosoftplay.co.uk", true },
{ "sospromotions.com.au", true },
{ "sostacancun.com", true },
{ "sosteam.jp", true },
{ "sosteric.si", true },
{ "sotadb.info", true },
+ { "sotai.tk", true },
{ "sotar.us", true },
{ "sotavasara.net", true },
{ "sotoasobi.net", true },
@@ -32960,6 +34386,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sou-co.jp", true },
{ "soubriquet.org", true },
{ "soufastnet.com.br", true },
+ { "sougi-review.top", true },
{ "souki.cz", true },
{ "soukodou.jp", true },
{ "soul-source.co.uk", true },
@@ -32972,16 +34399,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "soumikghosh.com", true },
{ "soumya92.me", true },
{ "soundabout.nl", true },
- { "soundedj.com.br", true },
{ "soundeo.com", true },
{ "soundeo.net", true },
{ "soundhunter.xyz", false },
{ "soundonsound.com", true },
{ "soundprotectionllc.com", true },
{ "soundscrate.com", true },
+ { "soundtruckandautorepair.com", true },
+ { "soupcafe.org", true },
+ { "souqtajmeel.com", true },
{ "sour.is", true },
{ "souravsaha.com", true },
{ "sourcebox.be", false },
+ { "sourcecode.tw", true },
{ "sourcely.net", true },
{ "sourceway.de", true },
{ "souris.ch", true },
@@ -32989,11 +34419,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "southambouncycastle.co.uk", true },
{ "southamerican.dating", true },
{ "southbankregister.com.au", true },
- { "southernlights.xyz", true },
+ { "southcountyplumbing.com", true },
+ { "southdakotahealthnetwork.com", true },
+ { "southeastvalleyurology.com", true },
+ { "southernlights.gq", true },
{ "southernmost.us", true },
{ "southernstructuralsolutions.com", true },
+ { "southernsurgicalga.com", true },
{ "southernutahinfluencers.com", true },
+ { "southflanewsletter.com", true },
{ "southlakenissanparts.com", true },
+ { "southlandurology.com", true },
{ "southmelbourne.apartments", true },
{ "southmorangtownhouses.com.au", true },
{ "southside-crew.com", true },
@@ -33007,6 +34443,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sovendus.de", true },
{ "sowlutions.com", true },
{ "soybase.org", true },
+ { "soydemac.com", true },
+ { "soz6.com", true },
{ "sozai-good.com", true },
{ "sozialy.com", true },
{ "sozon.ca", true },
@@ -33027,6 +34465,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "spaconnection.com", true },
{ "spahireleeds.co.uk", true },
{ "spaid.xyz", false },
+ { "spakhmer.com", true },
{ "spakurort.eu", true },
{ "spaldingwall.com", true },
{ "spamdrain.com", true },
@@ -33041,13 +34480,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sparkforautism.org", true },
{ "sparklatvia.lv", true },
{ "sparklebastard.com", true },
- { "sparkresearch.net", true },
- { "sparkreviewcenter.com", true },
{ "sparkz.no", true },
{ "sparprofi.at", true },
{ "sparta-en.org", true },
{ "sparta-solutions.de", true },
{ "spartaconsulting.fi", true },
+ { "spartacuslife.com", true },
{ "spartaermelo.nl", true },
{ "spasicilia.it", true },
{ "spatzenwerkstatt.de", true },
@@ -33063,9 +34501,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "speak-polish.com", true },
{ "spearfishingmx.com", true },
{ "speargames.net", true },
+ { "specdrones.us", true },
+ { "specialproperties.com", true },
{ "specialtyalloys.ca", true },
{ "speciesism.com", true },
+ { "spectroom.space", true },
{ "spectrosoftware.de", true },
+ { "spectrum.gov", true },
+ { "spediscifiori.com", true },
+ { "spedition-transport-umzug.de", true },
{ "speech-balloon.com", true },
{ "speechdrop.net", true },
{ "speechmate.com", true },
@@ -33087,14 +34531,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "spellcheckci.com", true },
{ "spellchecker.net", true },
{ "spenglerei-shop.de", true },
+ { "sperandii.it", true },
{ "sperec.fr", true },
- { "sperohub.com", true },
{ "sperrstun.de", true },
{ "spesys-services.fr", true },
{ "spha.info", true },
+ { "sphere-realty.com", true },
{ "sphereblur.com", true },
{ "spherenix.org", true },
{ "sphido.org", true },
+ { "spicejungle.com", true },
{ "spicydog.org", true },
{ "spicymatch.com", true },
{ "spidermail.tk", true },
@@ -33108,15 +34554,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "spiet.nl", true },
{ "spiff.eu", true },
{ "spiga.ch", true },
- { "spillersfamily.net", true },
- { "spillmaker.no", false },
+ { "spikelands.com", true },
{ "spilogkoder.dk", true },
{ "spinalien.net", false },
{ "spinalo.se", true },
{ "spindle.com.ph", true },
{ "spindrift.com", true },
{ "spingenie.com", true },
- { "spinor.im", true },
{ "spins.fedoraproject.org", true },
{ "spinspin.wtf", true },
{ "spiralschneiderkaufen.de", true },
@@ -33137,10 +34581,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "splintermail.com", true },
{ "splitdna.com", true },
{ "splitreflection.com", true },
+ { "splnk.net", true },
{ "splopp.com", true },
{ "splunk.net", true },
- { "splunk.zone", true },
+ { "spnitalianfestival.com", true },
{ "spodelime.com", true },
+ { "spokaneexteriors.com", true },
+ { "spokanepolebuildings.com", true },
{ "spoketwist.com", true },
{ "spoluck.ca", true },
{ "spolwind.de", true },
@@ -33159,7 +34606,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sport-in-sundern.de", true },
{ "sport-potreby.cz", true },
{ "sport-potreby.sk", true },
- { "sport-socken.net", true },
+ { "sportabee.com", true },
{ "sportakrobatik.at", true },
{ "sportbetuwe.nl", true },
{ "sporter.com", true },
@@ -33167,6 +34614,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sportovnidum.cz", true },
{ "sportparks.com", true },
{ "sportparks.org", true },
+ { "sports-colleges.com", true },
{ "sports.dating", true },
{ "sportsjaw.com", true },
{ "sportsmansblog.com", true },
@@ -33175,6 +34623,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sporttown.it", true },
{ "sportugalia.ru", true },
{ "sportvereine.online", true },
+ { "sportwette.eu", true },
+ { "sportwetten-anbieter.de", true },
{ "sportxt.ru", true },
{ "spot-lumiere-led.com", true },
{ "spotrebitelskecentrum.sk", true },
@@ -33182,22 +34632,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "spotteredu.com", true },
{ "spotupload.com", true },
{ "sppin.fr", true },
- { "spr.id.au", true },
{ "sprachfreudehoch3.de", true },
+ { "sprax2013.de", true },
{ "sprayforce.com", true },
{ "spreadsheetgear.com", true },
{ "spreadsheets.google.com", true },
{ "spreadthenews.eu", true },
{ "spree.co.za", true },
{ "spreed.me", true },
- { "spricknet.de", true },
{ "springerundpartner.de", true },
{ "springfieldbricks.com", true },
+ { "springhillmaine.com", true },
{ "springreizen.nl", true },
{ "sprinklermanohio.com", true },
{ "spritmonitor.de", true },
{ "spritsail.io", true },
{ "sproktz.com", true },
+ { "spron.in", true },
{ "sprucecreekclubs.com", true },
{ "sprucecreekgcc.com", true },
{ "sps-lehrgang.de", true },
@@ -33214,19 +34665,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "spydersec.com", true },
{ "spyprofit.ru", true },
{ "spyroszarzonis.com", true },
- { "sqkaccountancy.co.uk", true },
{ "sql-und-xml.de", true },
{ "sqlapius.net", true },
{ "sqlfeatures.com", true },
{ "sqr-training.com", true },
{ "sqroot.eu", true },
- { "squadlinx.com", true },
{ "square-gaming.org", true },
{ "square-src.de", false },
{ "square.com", false },
- { "squareonebgc.com.ph", true },
+ { "squarelab.it", true },
{ "squareup.com", false },
{ "squawk.cc", true },
+ { "squeakql.online", true },
{ "squeezemetrics.com", true },
{ "squido.ch", true },
{ "squidparty.com", true },
@@ -33236,6 +34686,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "srandom.com", true },
{ "sranje.rocks", true },
{ "srbija-nekretnine.org", true },
+ { "src-el-main.com", true },
{ "src.fedoraproject.org", true },
{ "srchub.org", true },
{ "srife.net", true },
@@ -33262,8 +34713,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ssbgportal.net", true },
{ "ssbkk.ru", true },
{ "ssbrm.ch", true },
- { "ssc8689.com", true },
- { "ssc8689.net", true },
{ "sscd.no", true },
{ "ssdax.com", false },
{ "ssdservers.co.uk", true },
@@ -33290,6 +34739,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sslmate.com", true },
{ "sslok.com", true },
{ "sslping.com", true },
+ { "sslpoint.com", true },
{ "ssls.cz", true },
{ "sslsurvey.de", true },
{ "ssmato.me", true },
@@ -33298,12 +34748,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sso.to", false },
{ "ssready.io", true },
{ "ssready.org", true },
+ { "ssrfq.com", true },
+ { "sssppp.gq", true },
{ "sstaging.com", true },
- { "sstewartgallus.com", true },
{ "ssuc.net", true },
{ "ssuiteoffice.com", true },
{ "ssuitesoft.com", true },
{ "st-antonius-kuenzell.de", true },
+ { "st-bede.org", true },
{ "st-innovationcup.com", true },
{ "st-kilian-markt-erlbach.de", true },
{ "st-news.de", true },
@@ -33313,19 +34765,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "staatsschutz.at", true },
{ "staatsschutzgesetz.at", true },
{ "stablelib.com", true },
+ { "stacklasvegas.com", true },
{ "stackpath.com", true },
{ "stackptr.com", true },
- { "stacktile.io", false },
{ "stackunderflow.com", true },
{ "staddlestonesbowness.co.uk", true },
{ "stadm.com", true },
{ "stadsbygd.info", true },
{ "stadt-apotheke-muensingen.de", true },
+ { "stadtbauwerk.at", false },
+ { "stadtbuecherei-bad-wurzach.de", true },
{ "stadterneuerung-hwb.de", true },
{ "stadtpapa.de", true },
{ "stadtplan-ilmenau.de", true },
{ "staer.ro", true },
{ "staff.direct", true },
+ { "staffexcellence.com", true },
{ "staffordlabour.org.uk", true },
{ "stage.wepay.com", false },
{ "stage4.ch", true },
@@ -33337,12 +34792,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stair.ch", true },
{ "stairfallgames.com", true },
{ "stairlin.com", true },
- { "stakestrategy.com", true },
{ "staklim-malang.info", true },
{ "stako.jp", true },
{ "staktrace.com", true },
+ { "stalder.work", true },
{ "staljedevledder.nl", true },
{ "stalker-shop.com", true },
+ { "stalkerteam.pl", true },
{ "stalkr.net", true },
{ "stameystreet.com", true },
{ "stamkassa.nl", true },
@@ -33355,14 +34811,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "standard.co.uk", true },
{ "standardequipment.com", true },
{ "standards.gov", true },
+ { "stannri.org", true },
{ "stanron.com", true },
+ { "stantabler.com", true },
+ { "stanthony-hightstown.net", true },
+ { "stanthony-yonkers.org", true },
{ "stanthonymaryclaret.org", true },
- { "star-citizen.wiki", true },
+ { "staparishgm.org", true },
{ "star-clean.it", true },
{ "starcoachservices.ca", true },
{ "starcomproj.com", true },
{ "stardanceacademy.net", true },
- { "stardust-entertainments.co.uk", true },
{ "stareplanymiast.pl", true },
{ "starflix.uk", true },
{ "starfm.de", true },
@@ -33374,13 +34833,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "starka.st", true },
{ "starkbim.com", true },
{ "starking.net.cn", true },
- { "starlightentertainmentdevon.co.uk", true },
{ "starlim.co.in", true },
{ "starlim.org", true },
{ "starmtech.fr", true },
{ "starpeak.org", true },
{ "starphotoboothsni.co.uk", true },
{ "starsam80.net", true },
+ { "starsguru.com", true },
{ "starsing.bid", true },
{ "starskim.cn", true },
{ "starstreak.net", false },
@@ -33392,12 +34851,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "startpage.info", true },
{ "startrek.in", true },
{ "startsamenvitaal.nu", true },
+ { "starttls-everywhere.org", true },
{ "starttraffic.com", true },
{ "starttraffic.uk", true },
{ "startupgenius.org", true },
- { "starwatches.eu", true },
{ "starwins.co.uk", true },
- { "stassi.ch", true },
{ "stastka.ch", true },
{ "stat.ink", true },
{ "statebuildinggroup.com", true },
@@ -33405,6 +34863,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "static-myfxee-808795.c.cdn77.org", true },
{ "static-myfxoau-808795.c.cdn77.org", true },
{ "static-myfxouk-808795.c.cdn77.org", true },
+ { "static.today", true },
{ "static.wepay.com", false },
{ "staticline.de", true },
{ "stationa.ch", true },
@@ -33425,7 +34884,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stayschemingco.com", true },
{ "stb-schefczyk.de", true },
{ "stb-strzyzewski.de", true },
+ { "stb.gov", true },
+ { "stbartholomewmanchester.org", true },
{ "stbennett.org", true },
+ { "stbl.org", true },
+ { "stbridgeteastfalls.org", true },
+ { "stcatharine-stmargaret.org", true },
+ { "stceciliakearny.org", true },
{ "stclementmatawan.org", true },
{ "stclementreligioused.org", true },
{ "stcplasticsurgery.com", true },
@@ -33449,11 +34914,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "steborio.pw", true },
{ "steckel.cc", true },
{ "stedb.eu", true },
+ { "stedbg.net", true },
{ "steef389.eu", true },
{ "steel-roses.de", true },
{ "steelephys.com.au", true },
{ "steelmounta.in", true },
{ "steemit.com", true },
+ { "steemyy.com", true },
{ "steerty.com", true },
{ "stefan-bayer.eu", true },
{ "stefan-rothe.ch", true },
@@ -33464,7 +34931,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stefanorossi.it", true },
{ "stefanovski.io", true },
{ "stefanvanburen.xyz", true },
+ { "stefanvd.net", true },
+ { "stefany.eu", true },
{ "steffentreeservice.com", true },
+ { "stefpastoor.nl", true },
{ "stegmaier-immobilien.de", true },
{ "steidlewirt.de", true },
{ "steigerlegal.ch", true },
@@ -33472,10 +34942,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "steinbergmedia.de", true },
{ "steinibox.de", true },
{ "steklein.de", true },
+ { "stekosouthamerica.com", true },
+ { "stelfox.net", true },
{ "stella-artis-ensemble.at", true },
{ "stellarguard.me", true },
{ "stellarium-gornergrat.ch", true },
- { "stellarvale.net", true },
{ "stellen.ch", true },
{ "stelleninserate.de", true },
{ "stellenticket.de", true },
@@ -33521,10 +34992,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sternen-sitzberg.ch", true },
{ "sternenbund.info", true },
{ "sternplastic.com", true },
+ { "sternsinus.com", true },
{ "stetspa.it", true },
{ "steuer-voss.de", true },
{ "steuerberater-essen-steele.com", true },
{ "steuerkanzlei-edel.de", true },
+ { "steuerkanzlei-und-wirtschaftsberater-manke.de", true },
{ "steuern-recht-wirtschaft.de", true },
{ "steuerseminare-graf.de", true },
{ "steuertipps-sonderausgaben.de", true },
@@ -33533,16 +35006,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stevecostar.com", true },
{ "stevedesmond.ca", true },
{ "stevedoggett.com", true },
+ { "stevegellerhomes.com", true },
{ "stevegrav.es", true },
{ "stevemonteyne.be", true },
{ "steven-bennett.com", true },
{ "steven-klix.de", true },
{ "stevenbolgartersnakes.com", true },
{ "stevenhumphrey.uk", true },
- { "stevenkwan.me", true },
{ "stevenroddis.com", true },
{ "stevens.se", false },
{ "steventress.com", true },
+ { "steventruesdell.com", true },
{ "stevenwooding.com", true },
{ "stevenz.net", true },
{ "stevenz.science", true },
@@ -33552,7 +35026,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stewpolley.com", true },
{ "steyaert.be", false },
{ "stforex.com", false },
+ { "stfrancisnaugatuck.org", true },
{ "stfw.info", true },
+ { "stgabrielstowepa.org", true },
+ { "stgeorgegolfing.com", true },
+ { "stgm.org", true },
+ { "sthenryrc.org", true },
{ "stian.net", true },
{ "stichtingliab.nl", true },
{ "stichtingscholierenvervoerzeeland.nl", true },
@@ -33564,15 +35043,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stickertuningfetzt.de", false },
{ "stickies.io", true },
{ "stickmanventures.com", true },
- { "stickswag.eu", true },
- { "stiffordacademy.org.uk", true },
+ { "stickstueb.de", true },
+ { "stiens.de", true },
{ "stift-kremsmuenster.at", true },
{ "stiftemaskinen.no", true },
{ "stigharder.com", true },
+ { "stigviewer.com", true },
{ "stijncrevits.be", true },
{ "stijnodink.nl", true },
{ "stikic.me", true },
- { "stikkie.me", true },
{ "stilartmoebel.de", true },
{ "stilecop.com", true },
{ "stillnessproject.com", true },
@@ -33585,13 +35064,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stintup.com", true },
{ "stipsan.me", true },
{ "stirblaut.de", true },
+ { "stirling.co", true },
{ "stirlingpoon.com", true },
+ { "stisaac.org", true },
+ { "stisidores.org", true },
{ "stitchfiddle.com", true },
{ "stitchinprogress.com", true },
{ "stivesbouncycastlehire.co.uk", true },
{ "stjohnin.com", true },
{ "stjohnsottsville.org", true },
{ "stjoseph-stcatherine.org", true },
+ { "stjosephspringcity.com", true },
{ "stjscatholicchurch.org", true },
{ "stjustin.org", true },
{ "stln.ml", true },
@@ -33600,41 +35083,46 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stlukesbrandon.org", true },
{ "stm-net.de", true },
{ "stma.is", true },
+ { "stmariagoretti.net", true },
+ { "stmarkseagirt.com", true },
{ "stmarthachurch.com", true },
{ "stmaryextra.uk", true },
+ { "stmattsparish.com", true },
{ "stmichaellvt.com", true },
+ { "stmichaelunion.org", true },
{ "stmkza.net", true },
{ "stmlearning.com", true },
{ "stmsolutions.pl", true },
{ "stneotsbouncycastlehire.co.uk", true },
- { "stnl.de", false },
{ "stockpile.com", true },
{ "stockrow.com", true },
+ { "stockstuck.com", true },
+ { "stocktout.info", true },
{ "stocktrader.com", true },
{ "stodieck.com", true },
{ "stoebermehl.at", true },
{ "stoffelnet.de", true },
{ "stogiesandmash.com", true },
{ "stokvistrading.nl", true },
+ { "stolbart.com", true },
{ "stolin.info", true },
{ "stolina.de", false },
{ "stolkpotplanten.nl", true },
- { "stolkschepen.nl", true },
+ { "stollen-wurm.de", true },
+ { "stollenwurm.de", true },
{ "stolpi.is", true },
{ "stomt.com", true },
{ "stoneagehealth.com.au", true },
{ "stonedworms.de", true },
{ "stonefusion.org.uk", true },
{ "stonehammerhead.org", true },
- { "stonewuu.com", true },
+ { "stonehurstcap.com", true },
{ "stony.com", true },
{ "stonystratford.org", true },
- { "stopbreakupnow.org", true },
{ "stopbullying.gov", true },
{ "stopfraud.gov", false },
{ "stopthethyroidmadness.com", true },
{ "storageideas.uk", true },
- { "storageshedsnc.com", true },
{ "stordbatlag.no", true },
{ "storedsafe.com", true },
{ "storeit.co.uk", true },
@@ -33647,17 +35135,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "storycollective.nl", true },
{ "storyland.ie", true },
{ "storysift.news", true },
- { "storytea.top", true },
{ "storytell.com", true },
{ "storytime.hu", true },
{ "stouter.nl", true },
{ "stoxford.com", true },
+ { "stpatrickbayshore.org", true },
{ "straatderzotten.nl", true },
{ "strafensau.de", true },
{ "strahlende-augen.info", true },
{ "straightedgebarbers.ca", true },
{ "strajnar.si", true },
{ "straka.name", true },
+ { "strandschnuppern.de", true },
{ "strangelane.com", true },
{ "strangemusicinc.com", true },
{ "strangemusicinc.net", true },
@@ -33683,7 +35172,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "streamchan.org", true },
{ "streamelements.com", true },
{ "streamkit.gg", true },
- { "streamlineautogroup.com", true },
{ "streampleasure.xyz", true },
{ "street-medics.fr", true },
{ "street-smart-home.de", true },
@@ -33699,7 +35187,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stremio.com", true },
{ "strengthroots.com", true },
{ "stretchmyan.us", true },
- { "stretchpc.com", true },
{ "striata.com", true },
{ "striatadev.com", true },
{ "stricted.net", true },
@@ -33712,8 +35199,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "striped.horse", true },
{ "strivephysmed.com", false },
{ "strm.hu", true },
- { "strming.com", true },
{ "strobeltobias.de", true },
+ { "strobeto.de", true },
{ "strobotti.com", true },
{ "stroeerdigital.de", true },
{ "stroginohelp.ru", true },
@@ -33727,6 +35214,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "strongsalpinesucculents.com", true },
{ "strongtowerpc.com", true },
{ "stroomacties.nl", true },
+ { "strosemausoleum.com", true },
{ "stroseoflima.com", true },
{ "strozik.de", true },
{ "structurally.net", true },
@@ -33735,7 +35223,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "strutta.me", true },
{ "strydom.me.uk", true },
{ "stsolarenerji.com", true },
+ { "ststanstrans.org", true },
{ "stt.wiki", true },
+ { "stthomasbrigantine.org", true },
{ "stuartbell.co.uk", true },
{ "stuarteggerton.com", true },
{ "stuartmorris.id.au", true },
@@ -33743,8 +35233,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stuartmorris.name", true },
{ "stuartmorris.tel", true },
{ "stuarts.xyz", false },
+ { "stuckateur-bruno.de", true },
{ "stuco.co", true },
- { "stucorweb.com", true },
{ "stucydee.nl", true },
{ "studenckiemetody.pl", true },
{ "student-eshop.cz", true },
@@ -33761,15 +35251,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "studienportal.eu", true },
{ "studienservice.de", true },
{ "studiereader.nl", true },
+ { "studio-637.com", true },
{ "studio-architetto.com", true },
{ "studio-fotografico.ru", true },
{ "studio-webdigi.com", true },
+ { "studio44.fit", true },
+ { "studioadevents.com", true },
+ { "studioavvocato24.it", true },
{ "studiobergaminloja.com.br", true },
{ "studiodentisticosanmarco.it", true },
{ "studiodewit.nl", true },
{ "studiogavioli.com", true },
{ "studiograou.com", true },
{ "studiohelder.fr", false },
+ { "studiohomebase.amsterdam", true },
{ "studiokicca.com", true },
{ "studiolegalepaternostro.it", true },
{ "studiomarcella.com", true },
@@ -33786,15 +35281,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "studipro-formation.fr", true },
{ "studipro-marketing.fr", true },
{ "studium.cz", true },
- { "studlan.no", true },
{ "studyin.jp", true },
{ "studyspy.ac.nz", true },
{ "studytactics.com", true },
- { "studytale.com", true },
{ "stuermer.me", true },
{ "stuetzredli.ch", true },
{ "stuffi.fr", true },
{ "stuffie.org", true },
+ { "stuffiwouldbuy.com", true },
{ "stuka-art.de", true },
{ "stulda.cz", false },
{ "stumeta.de", true },
@@ -33802,7 +35296,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "stumf.si", true },
{ "stuntmen.xyz", true },
{ "stupendous.net", false },
- { "sturbi.de", true },
{ "stutelage.com", true },
{ "stuttgart-gablenberg.de", true },
{ "stuudium.cloud", true },
@@ -33821,6 +35314,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "styles.pm", true },
{ "stylett.ru", true },
{ "stylewish.me", true },
+ { "styloeart.com", true },
{ "stypr.com", true },
{ "su1ph3r.io", true },
{ "suaudeau.fr", true },
@@ -33830,7 +35324,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "subastasdecarros.net", true },
{ "subculture.live", true },
{ "subdev.org", true },
- { "sublevel.net", false },
{ "sublimebits.com", true },
{ "sublocale.com", true },
{ "submedia.tv", true },
@@ -33846,6 +35339,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "subversive-tech.com", true },
{ "succ.in", true },
{ "succesprojekter.dk", true },
+ { "successdeliv.com", true },
{ "suchmaschinen-werkstatt.de", true },
{ "suckmyan.us", false },
{ "sucretown.net", true },
@@ -33853,6 +35347,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sudaraka.org", true },
{ "sudmotor-occasions.be", true },
{ "sudo-i.net", true },
+ { "sudo.li", true },
{ "sudo.org.au", true },
{ "sudo.ws", true },
{ "sudokian.io", true },
@@ -33886,6 +35381,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "suksit.com", true },
{ "sulek.eu", true },
{ "sulian.me", false },
+ { "sullenholland.nl", true },
{ "suluvir.com", true },
{ "sumguy.com", true },
{ "summa-prefis.com", true },
@@ -33895,7 +35391,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sumoatm.com", false },
{ "sumthing.com", true },
{ "sun-leo.co.jp", true },
- { "sun-wellness-online.com.vn", true },
{ "sunboxstore.jp", true },
{ "sunbritetv.com", true },
{ "sunchasercats.com", true },
@@ -33913,7 +35408,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sunlit.cloud", true },
{ "sunn.ie", true },
{ "sunoikisis.org", true },
- { "sunset.im", true },
{ "sunsetwx.com", true },
{ "sunshinesf.org", true },
{ "sunsmartresorts.com", true },
@@ -33923,11 +35417,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "suool.net", true },
{ "supa.sexy", true },
{ "supastuds.com", true },
+ { "supcoronado.com", true },
+ { "supedi.com", true },
+ { "supedi.de", true },
+ { "supedio.com", true },
{ "superaficionados.com", true },
{ "superbart.nl", true },
{ "superbdistribute.com", true },
{ "superbouncebouncycastles.com", true },
- { "superbowlkneel.com", true },
{ "supercalorias.com", true },
{ "supercastlesadelaide.com.au", true },
{ "supercastlesbrisbane.com.au", true },
@@ -33937,17 +35434,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "supercentenarian.com", true },
{ "supercinebattle.fr", true },
{ "superdaddy.club", true },
+ { "superdroni.com", true },
{ "supereight.net", true },
{ "superguide.com.au", true },
{ "superhappiness.com", true },
{ "superhome.com.au", true },
{ "supermae.pt", true },
- { "supermarx.nl", true },
{ "supermercadosdia.com.ar", true },
{ "supermercato24.it", true },
{ "supermil.ch", true },
{ "supern0va.net", true },
{ "supernaut.info", true },
+ { "supersec.es", true },
{ "supersole.net", true },
{ "supersonnig-festival.de", true },
{ "supersonnigfestival.de", true },
@@ -33961,6 +35459,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "supervisionassist.com", true },
{ "superway.es", true },
{ "supeuro.com", true },
+ { "supioka.com", true },
+ { "supmil.net", true },
{ "supplementler.com", true },
{ "supplies24.at", true },
{ "supplies24.es", true },
@@ -33969,15 +35469,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "supportdesk.nu", true },
{ "supportericking.org", true },
{ "supportme123.com", true },
+ { "supra.tf", true },
+ { "supracube.com", true },
{ "suprem.biz", true },
{ "suprem.ch", true },
+ { "supremestandards.com", true },
{ "supriville.com.br", true },
{ "sur-v.com", true },
{ "surao.cz", true },
{ "surasak.io", true },
{ "surasak.net", true },
{ "surasak.org", true },
- { "surdam.casa", false },
+ { "surdam.casa", true },
{ "sure-it.de", true },
{ "surefit-oms.com", true },
{ "suretone.co.za", true },
@@ -33992,10 +35495,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "suruifu.com", true },
{ "suruifu.tk", true },
{ "survature.com", true },
+ { "surveillance104.com", true },
{ "surveyhealthcare.com", true },
- { "surveyinstrumentsales.com", true },
{ "surveymill.co.uk", true },
- { "survivalistplanet.com", true },
{ "survivalmonkey.com", true },
{ "susanbpilates.co", true },
{ "susanbpilates.com", true },
@@ -34004,12 +35506,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "susanvelez.com", true },
{ "susc.org.uk", true },
{ "sush.us", true },
- { "sushi.roma.it", true },
{ "sushibesteld.nl", true },
{ "sushikatze.de", true },
- { "susoccm.org", true },
{ "susosudon.com", true },
{ "suspension-shop.com", true },
+ { "sussexheart.com", true },
{ "sustainabilityknowledgegroup.com", true },
{ "sustainoss.org", true },
{ "sustsol.com", true },
@@ -34022,11 +35523,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "suzi3d.com", true },
{ "suziekovner.com", true },
{ "suzukimarinepress.com", true },
- { "sv-1966-medenbach.de", true },
+ { "sv-1966-medenbach.de", false },
+ { "sv-bachum-bergheim.de", true },
{ "sv-turm-hohenlimburg.de", true },
{ "sv.search.yahoo.com", false },
{ "svager.cz", true },
{ "svallee.fr", false },
+ { "svanstrom.com", true },
+ { "svanstrom.org", true },
{ "svantner.sk", true },
{ "svarnyjunak.cz", true },
{ "svc-sitec.com", true },
@@ -34036,28 +35540,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "svdb.co", false },
{ "svdreamcatcher.com", true },
{ "sveinerik.org", true },
- { "svenbacia.me", true },
{ "svendubbeld.nl", true },
{ "sveneckelmann.de", true },
{ "svenjaundchristian.de", true },
+ { "svenluijten.com", false },
{ "svenmuller.nl", true },
{ "svennd.be", true },
{ "svetandroida.cz", true },
{ "svetlilo.com", true },
- { "svetzitrka.cz", false },
{ "svht.nl", true },
{ "svijet-medija.hr", true },
{ "svinformatica.es", true },
{ "svm-basketball.de", true },
{ "svm-it.eu", true },
{ "svobodnyblog.cz", true },
+ { "svorcikova.cz", true },
{ "sw-servers.net", true },
{ "sw33tp34.com", true },
- { "swagsocial.net", true },
{ "swankism.com", true },
{ "swansdoor.org", true },
{ "swap.gg", true },
{ "swapadoodle.com", true },
+ { "swaptaxdata.com", true },
{ "swarfarm.com", true },
{ "swarlys-server.de", true },
{ "swat4stats.com", true },
@@ -34069,16 +35573,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "swedishhost.se", true },
{ "sweep-me.net", true },
{ "sweepay.ch", true },
+ { "sweet-as.co.uk", true },
{ "sweet-orr.com", true },
{ "sweetair.com", true },
{ "sweetbridge.com", true },
{ "sweetgood.de", true },
+ { "sweethomesnohomishrenovations.com", true },
{ "sweets-mimatsu.com", true },
{ "swehack.org", true },
{ "sweharris.org", true },
{ "swerve-media-testbed-03.co.uk", true },
{ "swetrust.com", true },
{ "swfmax.com", true },
+ { "swiftcashforcars.com.au", true },
{ "swiftpcbassembly.com", true },
{ "swiftqueue.com", true },
{ "swilly.org", true },
@@ -34093,20 +35600,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "swiss-apartments.com", true },
{ "swiss-connection.net", true },
{ "swiss-cyber-experts.ch", true },
+ { "swiss-vanilla.ch", true },
+ { "swiss-vanilla.com", true },
{ "swisscannabis.club", true },
{ "swissdojo.ch", true },
{ "swisselement365.com", true },
{ "swissfreshaircan.ch", true },
- { "swissfreshaircan.com", true },
{ "swissid.ch", true },
{ "swisslinux.org", true },
{ "swisstechassociation.ch", true },
{ "swisstechtalks.ch", true },
+ { "swissvanilla.ch", true },
+ { "swissvanilla.com", true },
{ "switch-trader.com", true },
{ "switch.moe", true },
{ "switcheo.exchange", true },
{ "switcheo.rocks", true },
{ "switzerland-family-office.com", true },
+ { "swivells.com", true },
+ { "swn-nec.de", true },
{ "swordfeng.xyz", true },
{ "swqa.hu", true },
{ "swuosa.org", false },
@@ -34115,10 +35627,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "swyn.net", true },
{ "sx8.ovh", true },
{ "sxistolithos.gr", true },
+ { "sy-anduril.de", true },
{ "sy24.ru", true },
{ "syajvo.if.ua", true },
{ "syamutodon.xyz", true },
- { "syamuwatching.xyz", true },
{ "sycamorememphis.org", true },
{ "sydney-sehen.com", true },
{ "sydney.dating", true },
@@ -34156,12 +35668,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "sympraxisconsulting.com", true },
{ "symptome-erklaert.de", true },
{ "synabi.com", true },
+ { "synack.uk", true },
{ "synaptickz.me", true },
{ "synatra.co", true },
{ "sync-it.no", true },
{ "synccentre.com", true },
{ "syncflare.com", true },
+ { "synchrocube.com", true },
{ "synchrolarity.com", true },
+ { "synchronicity.cz", true },
{ "synchronyse.com", true },
{ "syncrise.co.jp", true },
{ "syneart.com", true },
@@ -34178,6 +35693,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "syntheticurinereview.com", true },
{ "synthetik.com", true },
{ "syoier.com", true },
+ { "syplasticsurgery.com", true },
{ "syriatalk.biz", true },
{ "syriatalk.org", true },
{ "sysadm.guru", true },
@@ -34189,7 +35705,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "syslogic.io", true },
{ "sysmike.de", true },
{ "systea.fr", true },
- { "system-m.de", true },
+ { "system-m.de", false },
{ "system.cf", true },
{ "system.is", true },
{ "system12.pl", true },
@@ -34206,17 +35722,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "systemonthego.com", true },
{ "systemreboot.net", true },
{ "systemspace.link", true },
+ { "systemups.com", true },
{ "systemweb.no", true },
{ "systemzeit.info", true },
{ "systoolbox.net", true },
{ "sysystems.cz", true },
{ "syt3.net", true },
- { "sytk.me", true },
{ "syukatsu-net.jp", true },
+ { "syunpay.cn", true },
{ "syy.im", true },
{ "syzygy-tables.info", true },
+ { "sz-ideenlos.de", true },
+ { "sz-lessgym-kamenz.de", true },
{ "szafkirtv.pl", true },
- { "szagun.net", true },
{ "szaloneigly.com", true },
{ "szamitogepdepo.com", true },
{ "szaydon.me", false },
@@ -34252,9 +35770,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "taartbesteld.nl", true },
{ "tabarnak.ga", true },
{ "tabernadovinho.com.br", true },
- { "tabhui.com", true },
{ "tabi-news.com", true },
{ "tabi-runrun.com", true },
+ { "tabino.top", true },
{ "tabithawebb.co.uk", true },
{ "tabledusud.be", true },
{ "tabledusud.nl", true },
@@ -34278,31 +35796,38 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tadu.de", true },
{ "tagabrand.co.uk", true },
{ "tagdocumentary.com", true },
- { "tagesmutter-zwitscherlinge.de", true },
{ "taggedpdf.com", false },
{ "taglioepiega.com", true },
{ "taglioepiega.eu", true },
{ "taglioepiega.it", true },
{ "tagpay.com", true },
+ { "tagungsraum-usedom.de", true },
+ { "tagungsraum-zinnowitz.de", true },
{ "tahavu.com", true },
{ "tahosa.co", true },
{ "tahosalodge.org", true },
- { "tai-in.com", true },
- { "tai-in.net", true },
+ { "tailpuff.net", false },
{ "tails.boum.org", true },
{ "taimane.com", true },
{ "taiphanmem.net", true },
{ "taishon.nagoya", true },
{ "taitmacleod.com", true },
{ "taiwan.dating", true },
+ { "taiwania.capital", true },
+ { "taiwania.vc", true },
+ { "taiwaniacapital.com", true },
+ { "taiwaniacapital.com.tw", true },
+ { "taiwaniacapital.tw", true },
{ "taiwantour.info", true },
{ "taiyouko-hatuden.net", true },
{ "taizegroep.nl", true },
{ "tajper.pl", true },
{ "take1give1.com", false },
+ { "takedownthissite.com", true },
{ "takeitoffline.co.uk", true },
+ { "takemoto-ped.com", true },
{ "taken.pl", true },
- { "takeshifujimoto.com", true },
+ { "takeshifujimoto.com", false },
{ "takk.pl", true },
{ "takkaaaaa.com", true },
{ "takuhai12.com", true },
@@ -34334,18 +35859,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "talroo.com", true },
{ "talun.de", true },
{ "tam-moon.com", true },
+ { "tam-safe.com", true },
+ { "tamaraboutique.com", true },
{ "tamashimx.net", true },
{ "tamasszabo.net", true },
{ "tambre.ee", true },
{ "tamchunho.com", true },
{ "tamindir.com", true },
+ { "tamirson.com", true },
{ "tammy.pro", true },
{ "tampabaybusinesslistings.com", true },
{ "tamposign.fr", true },
- { "tamriel-rebuilt.org", true },
+ { "tanacio.com", true },
{ "tanak3n.xyz", false },
{ "tancredi.nl", true },
{ "tandem-trade.ru", false },
+ { "tandemexhibits.com", true },
{ "tandempartnerships.com", true },
{ "tandilmap.com.ar", true },
{ "tandk.com.vn", true },
@@ -34366,6 +35895,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tannerryan.ca", true },
{ "tannerwilliamson.com", true },
{ "tannerwj.com", true },
+ { "tansuya.jp", true },
{ "tantalos.nl", true },
{ "tantei100.net", true },
{ "tanto259.name", false },
@@ -34375,8 +35905,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tanzhijun.com", true },
{ "tanzo.io", true },
{ "taoburee.com", true },
+ { "taotuba.org", true },
{ "taoways.com", true },
- { "tapakgram.com", true },
{ "taplamvan.net", true },
{ "taplemon.at", true },
{ "taplemon.com", true },
@@ -34385,7 +35915,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "taquilla.com", true },
{ "taqun.club", true },
{ "tar-mag.com", true },
- { "tarantul.org.ua", true },
+ { "taranis.re", true },
{ "tarasecurity.co.uk", true },
{ "tarasecurity.com", true },
{ "tarasevich.by", true },
@@ -34424,7 +35954,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tatsidou.gr", true },
{ "tattoo.dating", true },
{ "tattvaayoga.com", true },
+ { "tatuantes.com", true },
{ "taunhanh.us", true },
+ { "tauschen.info", true },
{ "tavolaquadrada.com.br", true },
{ "tavsys.net", true },
{ "tax-guard.com", true },
@@ -34433,6 +35965,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "taxi-24std.de", false },
{ "taxi-chamonix.fr", true },
{ "taxi-collectif.ch", true },
+ { "taxi-jihlava.cz", true },
{ "taxi-puck.pl", true },
{ "taxi-waregem.be", true },
{ "taxicollectif.ch", true },
@@ -34441,8 +35974,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "taxisafmatosinhos.pt", true },
{ "taxiscollectifs.ch", true },
{ "taxlab.co.nz", true },
+ { "taxpackagesupport.com", true },
{ "taxsquirrel.com", true },
{ "taylorpearson.me", false },
+ { "taylorreaume.com", true },
{ "taylors-castles.co.uk", true },
{ "taylorstauss.com", true },
{ "taysonvodao.fr", true },
@@ -34464,6 +35999,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tcb-b.org", true },
{ "tccmb.com", true },
{ "tcdw.net", true },
+ { "tcdww.cn", true },
{ "tcf.org", true },
{ "tcgforum.pl", true },
{ "tcgrepublic.com", true },
@@ -34475,8 +36011,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tchoukball.ch", true },
{ "tcmwellnessclinic.com", true },
{ "tcnapplications.com", true },
- { "tcptun.com", true },
{ "tcpweb.net", true },
+ { "tcspartner.net", true },
{ "tcvvip.com", true },
{ "tcwis.com", true },
{ "tdchrom.com", true },
@@ -34514,15 +36050,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "teamliquidpro.com", true },
{ "teammateworld.com", true },
{ "teammathics.com", true },
- { "teamnetsol.com", true },
{ "teamninjaapp.com", true },
{ "teamnissannorthparts.com", true },
{ "teamnorthgermany.de", true },
{ "teampaddymurphy.ch", true },
{ "teampaddymurphy.ie", true },
+ { "teamsimplythebest.com", true },
{ "teamtouring.net", true },
{ "teamtrack.uk", true },
+ { "teamtravel.co", true },
{ "teamup.com", true },
+ { "teamup.rocks", true },
{ "teamupturn.com", true },
{ "teamupturn.org", true },
{ "teamusec.de", true },
@@ -34536,7 +36074,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tecart-system.de", true },
{ "tecartcrm.de", true },
{ "tech-blogger.net", true },
- { "tech-clips.com", true },
+ { "tech-clips.com", false },
{ "tech-director.ru", true },
{ "tech-essential.com", true },
{ "tech-rat.com", true },
@@ -34548,7 +36086,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "techademy.nl", true },
{ "techamigo.in", true },
{ "techarea.fr", true },
+ { "techaulogy.com", true },
{ "techbelife.com", true },
+ { "techbrawl.org", true },
{ "techbrown.com", true },
{ "techcentric.com", false },
{ "techcracky.com", true },
@@ -34559,6 +36099,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "techdroid.eu", true },
{ "techendeavors.com", true },
{ "techformator.pl", true },
+ { "techglover.com", true },
{ "techhappy.ca", true },
{ "techinet.pl", true },
{ "techinsurance.com", true },
@@ -34570,8 +36111,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "technicabv.nl", true },
{ "technicalbrothers.cf", true },
{ "technicallyeasy.net", true },
- { "technicalsystemsprocessing.com", true },
- { "techniclab.net", true },
{ "technifocal.com", true },
{ "technik-boeckmann.de", true },
{ "technikblase.fm", true },
@@ -34579,15 +36118,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "technoinfogroup.it", true },
{ "technologie-innovation.fr", true },
{ "technologyand.me", true },
+ { "technologyhound.org", true },
{ "technologysi.com", true },
{ "technoparcepsilon.fr", true },
{ "technoscoots.com", true },
{ "technosorcery.net", true },
- { "technoswag.ca", true },
+ { "technospeakco.com", true },
{ "technotonic.co.uk", true },
{ "techold.ru", true },
{ "techorbiter.com", true },
{ "techosmarcelo.com.ar", true },
+ { "techpilipinas.com", true },
{ "techpit.us", true },
{ "techpivot.net", true },
{ "techpoint.org", true },
@@ -34599,7 +36140,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "techtalks.no", true },
{ "techtrader.ai", true },
{ "techtrader.io", true },
- { "techtuts.info", true },
{ "techvalue.gr", true },
{ "techview.link", true },
{ "techviewforum.com", true },
@@ -34616,9 +36156,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tecnodritte.it", true },
{ "tecnogazzetta.it", true },
{ "tecnologiasurbanas.com", true },
- { "tecnologino.com", true },
{ "tecon.co.at", true },
{ "tecyt.com", true },
+ { "ted.do", true },
{ "tedb.us", true },
{ "teddy.ch", true },
{ "teddybradford.com", true },
@@ -34633,7 +36173,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "teencounseling.com", true },
{ "teengirl.pub", true },
{ "teensexgo.com", true },
- { "teesypeesy.com", true },
{ "teeworlds-friends.de", true },
{ "tefek.cz", true },
{ "tege-elektronik.hu", true },
@@ -34651,10 +36190,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "teknolit.com", true },
{ "tekstschrijvers.net", true },
{ "tektuts.com", true },
+ { "tekuteku.jp", true },
{ "telamon.eu", true },
{ "telamon.fr", true },
{ "tele-alarme.ch", true },
- { "tele-assistance.ch", true },
{ "tele-online.com", true },
{ "telealarme.ch", true },
{ "telealarmevalais.ch", true },
@@ -34697,17 +36236,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tempdomain.ml", true },
{ "template-parks.com", true },
{ "templateinvaders.com", true },
+ { "templates-office.com", true },
{ "templum.com.br", true },
{ "tenable.com.au", true },
+ { "tenberg.com", true },
{ "tenbos.ch", true },
{ "tendance-et-accessoires.com", true },
{ "tendermaster.com.ua", true },
+ { "tenderplan.ru", true },
{ "tenderstem.co.uk", true },
{ "tendomag.com", true },
{ "tenenz.com", true },
{ "tenisservis.eu", true },
{ "tenkofx.com", true },
- { "tenma.pro", true },
{ "tennisadmin.com", true },
{ "tennismindgame.com", true },
{ "tenno.tools", true },
@@ -34715,14 +36256,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tenpolab.com", true },
{ "tenseapp.pl", true },
{ "tenshoku-hanashi.com", true },
- { "tent.io", true },
{ "tenta.com", true },
{ "tentabrowser.com", true },
{ "tentations-voyages.com", false },
{ "tenthousandcoffees.com", true },
- { "tenthpin.com", true },
+ { "tenthpin.com", false },
{ "tenyx.de", true },
{ "tenzer.dk", true },
+ { "teoleonie.com", true },
{ "tepid.org", true },
{ "tepitus.de", true },
{ "teplofom.ru", true },
@@ -34738,6 +36279,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "terlindung.com", true },
{ "termax.me", true },
{ "terminalvelocity.co.nz", true },
+ { "termino.eu", true },
{ "terminsrakning.se", true },
{ "termitemounds.org", true },
{ "termitinitus.org", true },
@@ -34759,6 +36301,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "terrastaffinggroup.com", false },
{ "terraweb.net", true },
{ "terresmagiques.com", true },
+ { "terrorbilly.com", true },
{ "terrty.net", true },
{ "terryjohnsononline.com", true },
{ "tes.com", true },
@@ -34774,20 +36317,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "test-textbooks.com", true },
{ "test.de", true },
{ "test.support", true },
- { "testadren.com", true },
{ "testeveonline.com", true },
{ "testgeomed.ro", true },
- { "testi.info", true },
{ "testomato.com", true },
- { "testosterone-complex.com", true },
{ "testosteronedetective.com", true },
- { "testpornsite.com", true },
{ "testsuite.org", true },
{ "testuje.net", true },
{ "tetedelacourse.ch", true },
{ "teto.nu", true },
{ "tetraetc.com", true },
{ "tetraktus.org", true },
+ { "tetrarch.co", true },
{ "tetsugakunomichi.jp", true },
{ "tetsumaki.net", true },
{ "teufel.dk", true },
@@ -34797,6 +36337,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "teva-li.com", true },
{ "tewarilab.co.uk", true },
{ "tewkesburybouncycastles.co.uk", true },
+ { "texashomesandland.com", true },
{ "texasllcpros.com", true },
{ "texaspaintingandgutters.com", true },
{ "texasparkinglotstriping.com", true },
@@ -34819,10 +36360,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "texy.info", true },
{ "teysens.com", true },
{ "teyssedre.ca", true },
- { "tezcam.tk", true },
- { "tf-network.de", true },
{ "tf2b.com", true },
{ "tf2calculator.com", true },
+ { "tf7879.com", true },
{ "tfg-bouncycastles.com", true },
{ "tfle.xyz", true },
{ "tflite.com", true },
@@ -34850,6 +36390,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thaiforest.ch", true },
{ "thaihomecooking.com", true },
{ "thailandpharmacy.net", true },
+ { "thailandpropertylisting.com", true },
{ "thairehabassociation.com", true },
{ "thajskyraj.com", true },
{ "thala.fr", true },
@@ -34869,35 +36410,46 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thca.ca", true },
{ "thcpbees.co.uk", true },
{ "the-bermanns.com", true },
+ { "the-big-bang-theory.com", true },
{ "the-body-shop.hu", false },
{ "the-fermenter.com", true },
{ "the-gdn.net", true },
{ "the-hemingway-code.de", true },
{ "the-mystery.org", true },
{ "the-nash-education-program.com", true },
+ { "the-pack.nl", true },
{ "the-pcca.org", true },
{ "the-webmaster.com", true },
{ "the-woods.org.uk", true },
{ "the-zenti.de", true },
{ "the2f.de", true },
{ "the3musketeers.biz", true },
+ { "the8rules.co.uk", true },
{ "theactuary.ninja", true },
{ "theadelaideshow.com.au", true },
{ "theadultswiki.com", true },
+ { "theafleo.gq", true },
{ "thealexandertechnique.co.uk", true },
+ { "theallmanteam.com", true },
{ "theankhlife.com", true },
{ "theanticellulitediet.com", true },
{ "theaps.net", true },
+ { "thearcheryguide.com", true },
{ "theastrocoach.com", true },
+ { "theatre-schools.com", true },
{ "thebakers.com.br", true },
{ "thebakery2go.de", true },
{ "thebannerstore.com", true },
+ { "thebarbdemariateam.com", true },
+ { "thebarneystyle.com", true },
{ "thebarrens.nu", true },
{ "thebasebk.org", true },
{ "thebcm.co.uk", true },
+ { "thebeachessportsphysio.com", true },
{ "thebeginningviolinist.com", true },
{ "thebest.ch", true },
{ "thebestfun.co.uk", true },
+ { "thebestofthesprings.com", true },
{ "thebestpersonin.ml", true },
{ "thebestsavingsplan.com", true },
{ "thebigbitch.nl", true },
@@ -34911,6 +36463,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "theblackknightsings.com", true },
{ "theblondeabroad.com", true },
{ "theblueroofcottage.ca", true },
+ { "theboatmancapital.com", true },
{ "thebodyprinciple.com", true },
{ "thebonerking.com", true },
{ "thebouncedepartment.co.uk", true },
@@ -34936,40 +36489,39 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "theconcordbridge.azurewebsites.net", true },
{ "thecondobuyers.com", true },
{ "thecookiejar.me", true },
- { "thecozycastle.com", true },
{ "thecrazytravel.com", true },
+ { "thecrescentchildcarecenter.com", true },
{ "thecrew-exchange.com", true },
{ "thecrochetcottage.net", true },
{ "thecuppacakery.co.uk", true },
{ "thecuriouscat.net", true },
- { "thecuriousdev.com", true },
{ "thecurvyfashionista.com", true },
{ "thecustomdroid.com", true },
{ "thedark1337.com", true },
- { "thedebug.life", true },
{ "thederminstitute.com", true },
+ { "thedhs.com", true },
{ "thediamondcenter.com", true },
{ "thediaryofadam.com", true },
{ "thedisc.nl", true },
{ "thediscovine.com", true },
{ "thedocumentrefinery.com", true },
- { "thedom.site", true },
{ "thedreamtravelgroup.co.uk", true },
{ "thedronechart.com", true },
{ "thedutchmarketers.com", true },
{ "theebookkeepers.co.za", true },
{ "theeducationchannel.info", true },
{ "theeducationdirectory.org", true },
+ { "theeffingyogablog.com", true },
{ "theeighthbit.com", true },
{ "theel0ja.ovh", true },
{ "theemasphere.com", true },
{ "thefairieswantmedead.com", true },
{ "thefanimatrix.net", true },
{ "thefashionpolos.com", true },
- { "thefasterweb.com", true },
{ "thefbstalker.com", true },
{ "thefengshuioffice.com", true },
{ "theferrarista.com", true },
+ { "thefilmcolor.com", true },
{ "thefilmphotography.com", true },
{ "theflowerbasketonline.com", true },
{ "theflowershopdeddington.com", true },
@@ -34981,8 +36533,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thefuckingtide.com", true },
{ "thefunfirm.co.uk", true },
{ "thefurnitureco.uk", true },
+ { "thefurniturefamily.com", true },
{ "thegarrowcompany.com", true },
+ { "thegatheringocala.com", true },
{ "thegeekdiary.com", true },
+ { "thegerwingroup.com", true },
{ "thegioinano.com", true },
{ "thegospelforgeeks.org", true },
{ "thegrape.ro", true },
@@ -34994,16 +36549,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thegrs.com", true },
{ "theguitarcompany.nl", true },
{ "thegvoffice.net", true },
- { "thegym.org", true },
{ "thehackerblog.com", true },
{ "thehairrepublic.net", true },
{ "thehairstandard.com", true },
+ { "thehamiltoncoblog.com", true },
{ "thehaxbys.co.uk", true },
{ "thehiddenbay.fi", true },
{ "thehiddenbay.info", true },
{ "thehiddenbay.ws", true },
{ "thehivedesign.org", true },
+ { "thehobincompany.com", true },
{ "thehomeicreate.com", true },
+ { "thehonorguard.org", true },
{ "thehookup.be", true },
{ "thehoryzon.com", true },
{ "thehotfix.net", true },
@@ -35013,11 +36570,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "theidiotboard.com", true },
{ "theillustrationstudio.com.au", true },
{ "theimagefile.com", true },
+ { "theimaginationagency.com", true },
{ "theinflatables-ni.co.uk", true },
{ "theinflatablesne.co.uk", true },
{ "theinitium.com", true },
{ "theintercept.com", true },
{ "theinternationalgeekconspiracy.eu", true },
+ { "theitsage.com", false },
+ { "thej0lt.com", true },
{ "thejacksoninstitute.com.au", true },
{ "thekev.in", true },
{ "thekeytobusiness.co.uk", true },
@@ -35030,19 +36590,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thelatedcult.com", true },
{ "thelearningenterprise.co.uk", true },
{ "thelegionshirley.co.uk", true },
+ { "thelifeofmala.com", true },
{ "thelinuxtree.net", true },
{ "thelittlecraft.com", true },
+ { "thelittlejewel.com", true },
{ "thelocals.ru", true },
{ "thelonelyones.co.uk", true },
{ "thelonious.nl", true },
{ "themacoaching.nl", true },
{ "themallards.info", true },
{ "themarshallproject.org", true },
- { "themaster.site", true },
{ "themecraft.studio", true },
{ "themefoxx.com", true },
+ { "themeridianway.com", true },
{ "themetacity.com", true },
+ { "themiddle.co", true },
{ "themigraineinstitute.com", true },
+ { "themilanlife.com", true },
{ "themillerslive.com", true },
{ "themimitoof.fr", true },
{ "themist.cz", true },
@@ -35059,11 +36623,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "theneatgadgets.com", true },
{ "thenexwork.com", true },
{ "thenib.com", true },
+ { "theninenine.com", true },
{ "thenocman.com", true },
{ "thenovaclinic.com", true },
{ "thenowheremen.com", true },
{ "theo.me", true },
{ "theobromos.fr", true },
+ { "theoc.co", true },
{ "theocharis.org", true },
{ "theodorahome.co", true },
{ "theodorahome.com.br", true },
@@ -35076,6 +36642,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "theory.org", true },
{ "theoscure.eu", true },
{ "theoutline.com", true },
+ { "thepaffy.de", true },
{ "thepartner.co.uk", true },
{ "thepartydoctors.co.uk", true },
{ "thepasteb.in", true },
@@ -35084,6 +36651,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thepaymentscompany.com", true },
{ "thepb.in", true },
{ "thepeninsulaires.com", true },
+ { "thepharm.co.nz", true },
{ "thephonecaseplace.com", true },
{ "thephp.cc", true },
{ "thepiabo.ovh", true },
@@ -35105,6 +36673,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "theralino.de", true },
{ "theramo.re", true },
{ "therandombits.com", true },
+ { "therapiemi.ch", true },
{ "therapynotes.com", true },
{ "therapyportal.com", true },
{ "therapysxm.com", true },
@@ -35115,10 +36684,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "therepublicofliverpool.com", true },
{ "theresa-mayer.eu", true },
{ "therevenge.me", true },
- { "therewill.be", true },
+ { "therhetorical.ml", true },
{ "thermalbad-therme.de", true },
{ "thermity.com", true },
{ "thermolamina.nl", true },
+ { "thermorecetas.com", true },
{ "theroks.com", true },
{ "theropes.nyc", true },
{ "theroyalmarinescharity.org.uk", true },
@@ -35132,6 +36702,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thesaurus.net", true },
{ "theschool.jp", true },
{ "thescientists.nl", true },
+ { "thesearchenginepros.com", true },
{ "thesecondsposts.com", false },
{ "theseed.io", true },
{ "theseedbox.xyz", true },
@@ -35157,22 +36728,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thesocialmediacentral.com", true },
{ "thesplashlab.com", true },
{ "thesslstore.com", true },
+ { "thestandingroomrestaurant.com", true },
{ "thestationatwillowgrove.com", true },
- { "thesteins.org", false },
{ "thestoneage.de", true },
{ "thestory.ie", true },
{ "thestoryshack.com", true },
- { "thestral.pro", true },
- { "thestralbot.com", true },
{ "thestrategyagency.com.au", true },
{ "thestudyla.com", true },
{ "thestyle.city", true },
+ { "thestyleforme.com", true },
{ "thesuppercircle.com", true },
{ "theswissbay.ch", true },
{ "thetechnical.me", true },
{ "thetenscrolls.com", true },
{ "thethreepercent.marketing", true },
+ { "thetinylife.com", true },
{ "thetomharling.com", true },
+ { "thetotalemaildelivery.com", true },
{ "thetree.ro", true },
{ "thetrendspotter.net", true },
{ "thetuxkeeper.de", false },
@@ -35180,7 +36752,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "theunitedstates.io", true },
{ "thevacweb.com", true },
{ "thevalentineconstitution.com", true },
+ { "thevalueofarchitecture.com", true },
+ { "thevenueofhollywood.com", true },
+ { "theverybusyoffice.co.uk", true },
{ "thevgg.com", false },
+ { "thevisasofoz.com", true },
+ { "thevoya.ga", true },
{ "thewagesroom.co.uk", true },
{ "thewarrencenter.org", true },
{ "thewaxhouse.academy", true },
@@ -35191,7 +36768,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thewebsitemarketingagency.com", true },
{ "thewhitehat.club", true },
{ "thewhitneypaige.com", true },
- { "thewindow.com", true },
{ "thewinstonatlyndhurst.com", true },
{ "thewoodkid.com.au", true },
{ "thewoolroom.com.au", true },
@@ -35199,15 +36775,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "theworldexchange.net", true },
{ "theworldexchange.org", true },
{ "theworldsend.eu", true },
+ { "thewp.pro", true },
{ "thexfactorgames.com", true },
{ "thexme.de", true },
- { "theyachtteam.com", true },
{ "theyakshack.co.uk", true },
{ "theyarnhookup.com", false },
{ "theyear199x.org", true },
{ "theyearinpictures.co.uk", true },
{ "theyosh.nl", true },
- { "thezero.org", true },
{ "thezillersathenshotel.com", true },
{ "thiagohersan.com", true },
{ "thibaultwalle.com", true },
@@ -35217,14 +36792,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thierrybasset.ch", true },
{ "thierryhayoz.ch", true },
{ "thijsalders.nl", false },
- { "thijsbekke.nl", true },
{ "thijsslop.nl", true },
{ "thijsvanderveen.net", true },
{ "thinegen.de", true },
- { "thingies.site", true },
{ "thingsimplied.com", true },
{ "thingsof.org", true },
{ "think-asia.org", true },
+ { "think-pink.info", true },
{ "think-positive-watches.de", true },
{ "thinkforwardmedia.com", true },
{ "thinkheaddesign.com", true },
@@ -35237,15 +36811,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thinkrealty.com", true },
{ "thinktux.net", true },
{ "thirdbearsolutions.com", true },
- { "thirdworld.moe", true },
{ "thiry-automobiles.net", true },
+ { "this-server-will-be-the-death-of-me.com", true },
{ "thisbrownman.com", true },
{ "thiscloudiscrap.com", false },
{ "thiscode.works", true },
{ "thisdot.site", true },
{ "thisfreelife.gov", true },
{ "thisisgrey.com", true },
- { "thisishugo.com", true },
+ { "thisishugo.com", false },
{ "thisisthefinalact.com", true },
{ "thisistheserver.com", true },
{ "thisiswhywemom.com", true },
@@ -35261,6 +36835,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thomas-fahle.de", true },
{ "thomas-prior.com", true },
{ "thomas-sammut.com", true },
+ { "thomas-schmittner.de", true },
{ "thomas-suchon.fr", true },
{ "thomas.love", false },
{ "thomasbeckers.be", true },
@@ -35274,14 +36849,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thomasmcfly.com", true },
{ "thomasmeester.nl", false },
{ "thomasmerritt.de", true },
- { "thomassen.sh", false },
+ { "thomassen.sh", true },
{ "thomasstevensmusic.com", true },
{ "thomastimepieces.com.au", true },
{ "thomasverhelst.be", true },
{ "thomasvochten.com", true },
{ "thomaswoo.com", true },
+ { "thompsonfamily.cloud", true },
{ "thomsonscleaning.co.uk", true },
- { "thomspooren.nl", true },
{ "thomwiggers.nl", true },
{ "thor.edu", true },
{ "thor.re", true },
@@ -35291,7 +36866,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thorstenschaefer.name", true },
{ "thosci.com", true },
{ "thotpublicidad.com", true },
- { "thoughtlessleaders.online", true },
+ { "thoughtsynth.com", true },
+ { "thoughtsynth.net", true },
+ { "thoughtsynth.org", true },
{ "thouni.de", true },
{ "thousandgreens.com", true },
{ "thousandoakselectrical.com", true },
@@ -35310,8 +36887,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "threelions.ch", true },
{ "threema.ch", true },
{ "threexxx.ch", true },
+ { "threit.de", true },
{ "threv.net", true },
{ "thriftdiving.com", true },
+ { "thrillernyc.com", true },
{ "thrivesummit.com", true },
{ "thriveta.com", true },
{ "thriveweb.com.au", true },
@@ -35329,7 +36908,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "thunraz.com", true },
{ "thusoy.com", true },
{ "thuthuatios.com", true },
- { "thuviensoft.com", true },
+ { "thuybich.com", true },
{ "thw-bernburg.de", true },
{ "thxandbye.de", true },
{ "thycotic.ru", true },
@@ -35343,11 +36922,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tianeptine.com", true },
{ "tianshili.me", true },
{ "tiantangbt.com", true },
+ { "tibicinagarricola.com", true },
{ "tibipg.com", true },
{ "tibovanheule.space", true },
{ "ticfleet.com", true },
{ "tichieru.pw", true },
{ "ticketassist.nl", true },
+ { "ticketdriver.com", true },
{ "ticketluck.com", true },
{ "ticketmates.com.au", true },
{ "ticketmaze.com", true },
@@ -35364,14 +36945,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tickit.ca", true },
{ "tid.jp", true },
{ "tidycustoms.net", true },
+ { "tiekoetter.com", true },
{ "tielectric.ch", true },
{ "tiendavertigo.com", true },
{ "tiens-ib.cz", true },
{ "tier-1-entrepreneur.com", true },
{ "tierarztpraxis-bogenhausen.de", true },
{ "tierarztpraxis-weinert.de", true },
- { "tiernanx.com", true },
- { "tierraprohibida.net", true },
{ "ties.com", true },
{ "tiew.pl", true },
{ "tifan.net", true },
@@ -35381,7 +36961,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tigernode.com", true },
{ "tigernode.net", true },
{ "tiggeriffic.com", true },
- { "tiggi.pw", true },
{ "tiglitub.com", true },
{ "tiihosen.fi", true },
{ "tiim.technology", true },
@@ -35389,9 +36968,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tijo.ch", true },
{ "tik.edu.ee", true },
{ "tik.help", true },
+ { "tiki-god.co.uk", true },
{ "tildes.net", true },
{ "tildesnyder.com", true },
- { "tiledailyshop.com", true },
{ "tilesbay.com", true },
{ "tiliaze.be", true },
{ "tiliaze.biz", true },
@@ -35403,6 +36982,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tillberg.us", true },
{ "tilleysbouncycastles.co.uk", true },
{ "tillseasyscore.com", true },
+ { "tilman.ninja", true },
{ "tilosp.de", true },
{ "tilta.com", true },
{ "tiltedwindmillcrafts.com", true },
@@ -35423,16 +37003,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "timebox.tk", true },
{ "timeglass.de", true },
{ "timeless-photostudio.com", true },
- { "timetab.org", true },
+ { "timelessskincare.co.uk", true },
{ "timetech.io", true },
{ "timetotrade.com", true },
{ "timewasters.nl", true },
+ { "timewk.cn", true },
{ "timfiedler.net", true },
{ "timhieuthuoc.com", true },
+ { "timi-matik.hu", true },
{ "timing.com.br", true },
{ "timjk.de", true },
{ "timmersgems.com", true },
- { "timmy.im", true },
{ "timmyrs.de", true },
{ "timnash.co.uk", true },
{ "timonengelke.de", true },
@@ -35442,8 +37023,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "timsayedmd.com", true },
{ "timtaubert.de", true },
{ "timtelfer.com", true },
+ { "timtj.ca", true },
{ "timvivian.ca", true },
+ { "timweb.ca", true },
{ "timysewyn.be", true },
+ { "tina-zander.de", true },
{ "tina.media", true },
{ "tinastahlschmidt.de", true },
{ "tindallriley.co.uk", true },
@@ -35451,7 +37035,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tinfoilsecurity.com", false },
{ "tinfoleak.com", true },
{ "tinker.career", true },
+ { "tinkerbeast.com", true },
{ "tinkerboard.org", true },
+ { "tinkererstrunk.co.za", true },
{ "tinkertry.com", true },
{ "tinlc.org", true },
{ "tinte24.de", true },
@@ -35469,7 +37055,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tinytownsoftplay.co.uk", true },
{ "tinyvpn.net", true },
{ "tinyvpn.org", true },
- { "tioat.net", true },
{ "tipaki.gr", true },
{ "tipbox.is", true },
{ "tipe.io", true },
@@ -35477,6 +37062,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tipoftheday.tips", true },
{ "tippytoad.com", true },
{ "tipsacademicos.com", true },
+ { "tipstersweb.com", true },
+ { "tipulnagish.co.il", true },
{ "tir-mauperthuis.fr", true },
{ "tir-pistolet-chexbres.ch", true },
{ "tiratuki.games", true },
@@ -35493,6 +37080,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tit-dns.de", true },
{ "tit-mail.de", true },
{ "tit.systems", true },
+ { "titanandco.com", true },
{ "titandirect.co.uk", true },
{ "titanous.com", true },
{ "titansized.com", true },
@@ -35517,14 +37105,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tkat.ch", true },
{ "tkeycoin.com", true },
{ "tkgpm.com", true },
- { "tkirch.de", true },
- { "tkjg.fi", true },
{ "tkn.me", true },
- { "tkn.tokyo", true },
{ "tkusano.jp", true },
{ "tkw01536.de", false },
{ "tl.gg", true },
+ { "tlach.cz", true },
{ "tlca.org", true },
+ { "tlcnet.info", true },
{ "tlehseasyads.com", true },
{ "tleng.de", true },
{ "tlo.xyz", true },
@@ -35536,21 +37123,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tlsrobot.se", true },
{ "tlthings.net", true },
{ "tlumaczenie.com", true },
+ { "tlyphed.net", true },
{ "tlys.de", true },
{ "tmakiguchi.org", true },
+ { "tmas.dk", true },
{ "tmberg.cf", true },
{ "tmberg.ga", true },
{ "tmberg.gq", true },
{ "tmberg.ml", true },
{ "tmberg.tk", true },
{ "tmc.com.mt", true },
- { "tmconnects.com", true },
{ "tmcpromotions.co.uk", true },
{ "tmcreationweb.com", true },
{ "tmdb.biz", true },
- { "tmdc.ddns.net", true },
{ "tmf.ru", true },
- { "tmhr.moe", true },
{ "tmi-products.eu", true },
{ "tmi-produkter.se", true },
{ "tmi.news", true },
@@ -35571,9 +37157,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "toad.ga", true },
{ "toast.al", false },
{ "tob-rulez.de", true },
- { "tobacco.gov", true },
{ "tobaccolocker.com", true },
- { "tobedo.net", true },
{ "tober-cpag.de", true },
{ "tobi-mayer.de", true },
{ "tobias-bauer.de", true },
@@ -35597,6 +37181,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tobiemilford.com", true },
{ "tobis-rundfluege.de", true },
{ "tobischo.de", true },
+ { "tobisworld.ch", true },
{ "tobyalden.com", true },
{ "tobyx.com", true },
{ "tobyx.de", true },
@@ -35612,7 +37197,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "todapolitica.com", true },
{ "todaymeow.com", true },
{ "toddfry.com", true },
- { "todocracy.com", true },
+ { "todoereaders.com", true },
{ "todoescine.com", true },
{ "todoist.com", true },
{ "todon.fr", true },
@@ -35628,6 +37213,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "togech.jp", true },
{ "togetter.com", true },
{ "toheb.de", false },
+ { "tohochofu-sportspark.com", true },
{ "tohokinemakan.tk", true },
{ "tokaido-kun.jp", true },
{ "tokaido.com", true },
@@ -35644,6 +37230,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tokke.dk", true },
{ "tokkee.org", true },
{ "tokky.eu", true },
+ { "tokototech.com", true },
{ "tokugai.com", true },
{ "tokumei.co", true },
{ "tokyo-onkyo.jp", true },
@@ -35653,6 +37240,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tokyomakino.com", true },
{ "tokyovipper.com", true },
{ "tolboe.com", true },
+ { "toldositajuba.com", true },
{ "toleressea.fr", true },
{ "toles-sur-mesure.fr", true },
{ "tolle-wolke.de", true },
@@ -35660,9 +37248,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tollfreeproxy.com", true },
{ "tom-geiger.de", true },
{ "tom-kunze.de", true },
- { "tom-maxwell.com", true },
+ { "tom-kurka.cz", true },
{ "tom.horse", true },
{ "tom.je", true },
+ { "tom94.net", true },
{ "tomabrafix.de", true },
{ "tomahawk.ca", true },
{ "tomandmara.com", true },
@@ -35676,19 +37265,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tomatenaufdenaugen.de", true },
{ "tomatis-nantes.com", true },
{ "tomaw.net", true },
+ { "tomaz.eu", true },
{ "tombaker.me", true },
- { "tomberek.info", true },
{ "tombrossman.com", true },
{ "tomd.ai", true },
- { "tomdudfield.com", true },
{ "tomend.es", true },
- { "tomershemesh.me", true },
{ "tomfisher.eu", true },
{ "tomharling.uk", true },
{ "tomharris.tech", true },
{ "tomi.cc", true },
{ "tomica.me", true },
- { "tomiler.com", true },
+ { "tomik.cloud", true },
{ "tomjans.nl", true },
{ "tomjn.com", true },
{ "tomjonsson.se", true },
@@ -35701,13 +37288,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tomnatt.com", true },
{ "tomo.gr", false },
{ "tomosm.net", true },
- { "tomoyaf.com", true },
{ "tomravinmd.com", true },
{ "tomrei.com", true },
{ "tomrichards.net", true },
{ "toms.ovh", true },
{ "tomschlick.com", true },
{ "tomsdevsn.me", true },
+ { "tomsherakmshope.org", true },
{ "tomspdblog.com", true },
{ "tomssl.com", true },
{ "tomticket.com", true },
@@ -35741,12 +37328,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tonermonster.de", true },
{ "tonex.de", true },
{ "tonex.nl", true },
- { "tonguetechnology.com", true },
+ { "tonigallagherinteriors.com", true },
{ "tonkayagran.com", true },
{ "tonkayagran.ru", true },
{ "tonkinson.com", true },
{ "tonkinwilsonvillenissanparts.com", true },
{ "tonnycat.com", true },
+ { "tono.us", true },
{ "tonsit.com", true },
{ "tonsit.org", true },
{ "tontonnews.net", true },
@@ -35766,17 +37354,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "toolsense.io", true },
{ "toom.io", true },
{ "toomy.ddns.net", true },
+ { "toomy.pri.ee", true },
{ "toonpool.com", true },
{ "toonsburgh.com", true },
+ { "toontown.team", true },
+ { "toontownrewritten.com", true },
{ "toool.nl", true },
+ { "toool.nyc", true },
{ "toool.org", true },
{ "tooolroc.org", false },
{ "toot.center", true },
{ "toothdoc.ca", true },
{ "tooti.biz", true },
- { "top-esb.com", true },
{ "top-obaly.cz", true },
{ "top-opakowania.pl", true },
+ { "top-solar-info.de", true },
{ "top5hosting.co.uk", true },
{ "topaxi.ch", true },
{ "topaxi.codes", true },
@@ -35788,32 +37380,32 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "topclassfun.ie", true },
{ "topdesk.net", true },
{ "topdogsinflatables.co.uk", true },
+ { "topdroneusa.com", true },
{ "topekafoundationpros.com", true },
{ "topeng-emas.com", true },
- { "topesb.com", true },
{ "topeyelashenhancerserumreviews.com", true },
{ "topfivepercent.co.uk", true },
+ { "topgshop.ru", true },
{ "topicdesk.com", true },
{ "topicit.net", true },
{ "topirishcasinos.com", true },
{ "topjobs.ch", true },
- { "topkek.ml", true },
{ "toplist.cz", true },
{ "toplist.eu", true },
{ "topnotepad.com", true },
{ "topodin.com", true },
{ "toponlinecasinosites.co.uk", true },
- { "toppointrea.com", true },
+ { "toppercan.es", true },
{ "topprice.ua", true },
{ "topsailtechnologies.com", true },
{ "topservercccam.tv", true },
{ "topshelfcommercial.com", true },
+ { "topshoptools.com", true },
{ "topsteaks-daun.de", true },
{ "toptec.net.br", true },
{ "toptexture.com", true },
{ "toptheto.com", true },
{ "topvertimai.lt", true },
- { "topwin.la", true },
{ "topwindowcleaners.co.uk", true },
{ "topworktops.co.uk", true },
{ "toracon.org", true },
@@ -35828,6 +37420,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "torn1.se", true },
{ "torngalaxy.com", true },
{ "torogroups.com", true },
+ { "torontoaccesscontrol.com", true },
{ "torontocorporatelimo.services", true },
{ "torontostarts.com", true },
{ "torproject.org", false },
@@ -35853,13 +37446,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "toshen.com", true },
{ "toshkov.com", true },
{ "toskana-appartement.de", true },
+ { "tosolini.info", true },
{ "tosostav.cz", true },
{ "tosteberg.se", true },
{ "tostu.de", true },
+ { "tot-radio.com", true },
+ { "totaku.ru", true },
{ "totalbeauty.co.uk", true },
{ "totalbike.com.br", true },
{ "totalcarcheck.co.uk", true },
{ "totalchecklist.com", true },
+ { "totaldragonshop.com.br", true },
+ { "totalemaildelivery.com", true },
{ "totalforcegym.com", true },
{ "totalhomecareinc.com", true },
{ "totallylegitimatehosting.ru", true },
@@ -35876,12 +37474,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "touch.mail.ru", true },
{ "touchoflife.in", true },
{ "touchscreentills.com", true },
- { "touchtable.nl", true },
+ { "touchstone.io", true },
{ "touchweb.fr", true },
{ "touchwoodtrees.com.au", true },
- { "tougetu.com", true },
{ "touhou.ac.cn", true },
- { "touhou.cc", true },
{ "touhou.fm", true },
{ "touhouwiki.net", true },
{ "toujours-actif.com", true },
@@ -35892,6 +37488,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tourispo.com", true },
{ "tournamentmgr.com", true },
{ "tournevis.ch", true },
+ { "toursthatmatter.com", true },
{ "tourtransferitaly.it", true },
{ "tourtrektrip.com", true },
{ "tous-travaux.ch", true },
@@ -35919,6 +37516,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "townandcountryus.com", true },
{ "townhousedevelopments.com.au", true },
{ "townhouseregister.com.au", true },
+ { "townofbridgewater.ca", true },
{ "towsonroofers.com", true },
{ "towywebdesigns.uk", true },
{ "tox21.gov", false },
@@ -35929,6 +37527,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tp-kabushiki.com", true },
{ "tp-kyouyufudousan.com", true },
{ "tp-law.jp", true },
+ { "tpansino.com", true },
{ "tpbproxy.co", true },
{ "tpci.biz", true },
{ "tpidg.us", true },
@@ -35942,7 +37541,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "traas.org", true },
{ "trabajarenremoto.com", true },
{ "trabbel.org", true },
- { "tracalada.cl", true },
{ "trace.guru", true },
{ "trace.moe", true },
{ "traceheatinguk.co.uk", true },
@@ -35952,6 +37550,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "traceroute.link", true },
{ "traceroute.network", true },
{ "traces.ml", true },
+ { "tracetracker.no", true },
{ "tracfinancialservices.com", true },
{ "tracinsurance.com", true },
{ "trackchair.com", true },
@@ -35962,16 +37561,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "trackrecordpro.co.uk", true },
{ "tracksa.com.ar", true },
{ "trackyourlogs.com", true },
+ { "tractorfan.nl", true },
{ "tractorpumps.com", true },
{ "trad-n-vo.com", true },
{ "trade-arcade.com", true },
+ { "trade.gov", true },
{ "trade.gov.uk", true },
{ "trade247.exchange", true },
{ "tradeacademy.in", true },
- { "tradedesk.co.za", true },
{ "tradeinvent.co.uk", true },
{ "trademan.ky", true },
{ "traderjoe-cloud.de", true },
+ { "tradernet.ru", true },
+ { "tradeshowfreightservices.com", true },
{ "tradik.com", true },
{ "tradinews.com", true },
{ "tradinews.fr", true },
@@ -35994,8 +37596,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "trailforks.com", true },
{ "trainex.org", true },
{ "trainhornforums.com", true },
+ { "trainhorns.us", true },
{ "trainiac.com.au", true },
- { "trainings-handschuhe-test.de", true },
{ "trainline.at", true },
{ "trainline.cn", true },
{ "trainline.com.br", true },
@@ -36016,6 +37618,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "trainplaza.net", true },
{ "trainplaza.nl", true },
{ "trainsgoodplanesbad.com", true },
+ { "traintimes.be", true },
+ { "traintimes.ch", true },
+ { "traintimes.dk", true },
+ { "traintimes.fi", true },
+ { "traintimes.ie", true },
+ { "traintimes.it", true },
+ { "traintimes.lu", true },
+ { "traintimes.nl", true },
+ { "traintimes.se", true },
{ "traista.ru", true },
{ "traiteurpapillonevents.be", true },
{ "trajano.net", true },
@@ -36028,16 +37639,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tranceheal.com", true },
{ "tranceheal.de", true },
{ "tranceheal.me", true },
- { "trancendances.fr", true },
{ "trangcongnghe.com", true },
{ "trangell.com", true },
{ "tranglenull.xyz", true },
{ "tranhsondau.net", false },
+ { "tranquillity.se", true },
{ "transacid.de", true },
{ "transappealrights.com", true },
{ "transcend.org", true },
{ "transcontrol.com.ua", true },
- { "transcricentro.pt", true },
{ "transfer.pw", true },
{ "transferio.nl", true },
{ "transfers.do", true },
@@ -36065,16 +37675,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "translate.fedoraproject.org", true },
{ "translate.googleapis.com", true },
{ "translate.stg.fedoraproject.org", true },
- { "translateblender.ru", true },
{ "translatoruk.co.uk", true },
{ "transmarttouring.com", true },
{ "transmisjeonline.pl", true },
+ { "transmitit.pl", true },
+ { "transmute.review", true },
{ "transnexus.com", true },
{ "transoil.co.uk", true },
+ { "transpak-cn.com", true },
{ "transparentcorp.com", true },
{ "transport.eu", true },
{ "transporta.it", true },
{ "transporterlock.com", true },
+ { "transumption.com", true },
{ "transverify.com", true },
{ "trappednerve.org", true },
{ "trashnothing.com", true },
@@ -36082,25 +37695,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "traslocare.roma.it", true },
{ "traslochi-trasporti-facchinaggio.it", true },
{ "trasloco.milano.it", true },
+ { "tratamentoparacelulite.net", true },
{ "trattamenti.biz", true },
{ "trattamento-cotto.it", true },
{ "trauer-beileid.de", true },
{ "traut.cloud", true },
{ "travador.com", true },
{ "travaux-toiture-idf.fr", true },
- { "travel-dealz.de", true },
{ "travel-to-nature.ch", true },
{ "travel.co.za", true },
{ "travel365.it", true },
{ "travelarmenia.org", true },
{ "traveleets.com", true },
+ { "travelemy.com", true },
+ { "travelholicworld.com", true },
{ "travelinsurance.co.nz", true },
{ "travellers.dating", true },
{ "travellovers.fr", true },
- { "travelmyth.ie", true },
+ { "travellsell.com", true },
{ "travelogue.jp", true },
{ "travelphoto.cc", true },
- { "travelpricecheck.com", true },
{ "travelrefund.com", true },
{ "travelshack.com", true },
{ "travi.org", true },
@@ -36112,13 +37726,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "trbanka.com", true },
{ "trea98.org", true },
{ "treaslockbox.gov", true },
- { "treatment.org", true },
{ "tree0.xyz", true },
{ "treebaglia.xyz", true },
{ "treehouseresort.nl", true },
{ "trees.chat", true },
{ "treeschat.com", true },
{ "treetopsecurity.com", true },
+ { "treeworkbyjtec.com", true },
{ "trefcon.cz", true },
{ "trefpuntdemeent.nl", true },
{ "treinaweb.com.br", false },
@@ -36129,21 +37743,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "trekfriend.com", true },
{ "trekkinglife.de", true },
{ "tremlor.com", true },
- { "tremolosoftware.com", true },
{ "trendkraft.de", true },
+ { "trendreportdeals.com", true },
+ { "trendsettersre.com", true },
{ "trendus.no", true },
{ "trendykids.cz", true },
{ "trenta.io", true },
- { "trentmaydew.com", true },
{ "tresor.it", true },
{ "tresorit.com", true },
{ "tresorsecurity.com", true },
{ "tretail.net", true },
{ "tretkowski.de", true },
+ { "treussart.com", true },
{ "trevsanders.co.uk", true },
{ "trezy.me", true },
{ "trezy.net", true },
{ "trhastane.com", true },
+ { "triage.ai", true },
{ "triage.clinic", true },
{ "triage.com", true },
{ "triage.md", true },
@@ -36151,20 +37767,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "trialandsuccess.nl", true },
{ "trialcentralnet.com", true },
{ "trianglecastles.co.uk", true },
- { "trianon.xyz", true },
{ "tribac.de", true },
{ "tribaldos.com", true },
+ { "tribaljusticeandsafety.gov", true },
{ "tribe.rs", true },
{ "tribetrails.com", true },
{ "tribly.de", true },
{ "tribut.de", true },
- { "tributh.cf", true },
{ "tributh.ga", true },
{ "tributh.gq", true },
{ "tributh.ml", true },
- { "tributh.net", true },
{ "tributh.tk", true },
{ "tricefy4.com", true },
+ { "triciaree.com", true },
{ "trident-online.de", true },
{ "tridentflood.com", true },
{ "trietment.com", true },
@@ -36174,6 +37789,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "trilliumvacationrentals.ca", true },
{ "triluxds.com", true },
{ "trim-a-slab.com", true },
+ { "trim21.cn", true },
{ "trimage.org", true },
{ "trimarchimanuele.it", true },
{ "trinary.ca", true },
@@ -36192,24 +37808,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tripseats.com", true },
{ "tripsinc.com", true },
{ "trisect.eu", true },
+ { "trish-mcevoy.ru", true },
+ { "triticeaetoolbox.org", true },
+ { "trix360.com", true },
{ "trixexpressweb.nl", true },
{ "triz.co.uk", true },
{ "trkpuls.tk", true },
{ "trockendock.ch", true },
{ "troedel-trolle.de", true },
{ "troedelhannes.at", true },
+ { "troi.de", true },
{ "troianet.com.br", true },
- { "troisdorf-gestalten.de", true },
{ "trollingeffects.org", true },
{ "trollmoa.se", true },
{ "trollope-apollo.com", true },
{ "trommelwirbel.com", true },
{ "tronatic-studio.com", true },
- { "trondelan.no", true },
{ "troomcafe.com", true },
{ "troopaid.info", true },
{ "trophee-discount.com", true },
- { "tropicalserver.com", true },
+ { "tropicalserver.com", false },
{ "trotec.com", true },
{ "trotina.cz", true },
{ "trouble-free-employees.com", true },
@@ -36227,19 +37845,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "trucchibellezza.it", true },
{ "truckersmp.com", true },
{ "truckerswereld.nl", false },
+ { "truckgpsreviews.com", true },
{ "truckstop-magazin.de", false },
{ "true-itk.de", true },
+ { "trueassignmenthelp.co.uk", true },
{ "trueblueessentials.com", true },
+ { "trueduality.net", true },
{ "trueinstincts.ca", true },
{ "truekey.com", true },
{ "truentumvet.it", true },
{ "trueproxy.net", true },
{ "truerizm.ru", true },
- { "truessl.shop", true },
{ "truestaradvisors.com", true },
{ "truesteamachievements.com", true },
{ "truestor.com", true },
{ "trueteaching.com", true },
+ { "truetraveller.com", true },
{ "truetrophies.com", true },
{ "trufflemonkey.co.uk", true },
{ "truhlarstvi-fise.cz", true },
@@ -36253,11 +37874,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "truserve.org", true },
{ "trusitio.com", true },
{ "trustcase.com", true },
+ { "trustedbody.com", true },
{ "trustednetworks.nl", true },
+ { "trustednewssites.com", true },
{ "trustfield.ch", true },
{ "trustserv.de", true },
{ "truthmessages.pw", true },
{ "truvisory.com", true },
+ { "truyenfull.vn", true },
{ "trw-reseller.com", true },
{ "try2admin.pw", true },
{ "trybabyschoice.com", true },
@@ -36270,6 +37894,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tryndraze.com", true },
{ "trynta.com", true },
{ "trynta.net", true },
+ { "trypineapple.com", true },
{ "tryretool.com", true },
{ "tryupdates.com", true },
{ "trywesayyes.com", true },
@@ -36283,7 +37908,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tsai.com.de", true },
{ "tsatestprep.com", true },
{ "tschuermans.be", true },
- { "tscqmalawi.info", true },
{ "tsedryk.ca", true },
{ "tsgkc1.com", true },
{ "tsicons.com", true },
@@ -36301,9 +37925,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tsundere.moe", true },
{ "tsung.co", true },
{ "tsurai.work", true },
+ { "tsurimap.com", true },
+ { "tsutsumi-kogyo.jp", true },
{ "tsuyuzakihiroyuki.com", true },
{ "tsv-1894.de", true },
- { "tt.dog", true },
+ { "ttb.gov", true },
{ "ttbonline.gov", true },
{ "ttc-birkenfeld.de", true },
{ "ttcaarberg.ch", true },
@@ -36312,9 +37938,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ttclub.fr", true },
{ "ttdsevaonline.com", true },
{ "ttll.de", true },
- { "ttrade.ga", true },
{ "ttsoft.pl", true },
- { "ttspttsp.com", true },
{ "ttsweb.org", true },
{ "ttt.tt", true },
{ "ttuwiki.ee", true },
@@ -36338,12 +37962,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tucuxi.org", true },
{ "tudiennhakhoa.com", true },
{ "tudorproject.org", true },
+ { "tuev-hessen.de", true },
{ "tufashionista.com", true },
{ "tuffclassified.com", true },
{ "tuffsruffs.se", true },
+ { "tuimprenta.com.ar", true },
{ "tuincentersnaet.be", true },
{ "tuingereedschappen.net", false },
{ "tuitle.com", true },
+ { "tuja.hu", true },
+ { "tulumplayarealestate.com", true },
{ "tumagiri.net", true },
{ "tumblenfun.com", true },
{ "tumedico.es", true },
@@ -36374,11 +38002,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "turl.pl", true },
{ "turnaroundforum.de", true },
{ "turncircles.com", true },
+ { "turnierplanung.com", true },
{ "turnoffthelights.com", true },
{ "turnonsocial.com", true },
{ "turpinpesage.fr", true },
{ "tursiae.org", true },
- { "turtle.ai", true },
+ { "turtle.ai", false },
{ "turtleduckstudios.com", true },
{ "turtlepwr.com", true },
{ "turunculevye.com", true },
@@ -36389,12 +38018,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tuto-craft.com", true },
{ "tutoragency.org", true },
{ "tutorat-tect.org", true },
+ { "tutoref.com", true },
+ { "tutorialehtml.com", true },
{ "tutorialinux.com", true },
{ "tutorme.com", true },
{ "tuts4you.com", true },
{ "tuttimundi.org", true },
{ "tuttoandroid.net", true },
{ "tuvangoicuoc.com", true },
+ { "tuversionplus.com", true },
{ "tuwaner.com", true },
{ "tuxcloud.net", true },
{ "tuxflow.de", false },
@@ -36419,38 +38051,35 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tvleaks.se", true },
{ "tvlplus.net", true },
{ "tvqc.com", true },
- { "tvs-virtual.cz", true },
{ "tvseries.info", true },
{ "tvsheerenhoek.nl", true },
{ "tvzr.com", true },
{ "tw.search.yahoo.com", false },
{ "twaka.com", true },
+ { "twalter.de", true },
{ "twb.berlin", true },
{ "twd2.me", true },
{ "twd2.net", false },
{ "tweakers.com.au", true },
{ "tweakers.net", true },
- { "tweakersbadge.nl", true },
{ "tweaktown.com", true },
+ { "tweedehandslaptophardenberg.nl", true },
{ "tweetfinity.com", true },
{ "tweetfinityapp.com", true },
{ "twenty71.com", true },
{ "twentymilliseconds.com", true },
- { "twilightcookies.ca", true },
{ "twilleys.com", true },
{ "twincitynissantxparts.com", true },
{ "twinkseason.com", true },
{ "twinztech.com", true },
{ "twisata.com", true },
- { "twistapp.com", true },
- { "twistdevelopment.co.uk", true },
+ { "twisted-brains.org", true },
{ "twistedwave.com", true },
{ "twisto.cz", true },
{ "twisto.pl", true },
{ "twistopay.com", true },
{ "twit-guide.com", true },
{ "twitchplaysleaderboard.info", true },
- { "twittelzie.nl", true },
{ "twitter.com", false },
{ "twitteroauth.com", true },
{ "twizzkidzinflatables.co.uk", true },
@@ -36459,9 +38088,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "twodadsgames.com", true },
{ "twofactorauth.org", true },
{ "twohuo.com", true },
- { "twojfaktum.pl", true },
{ "twopif.net", true },
{ "tworaz.net", true },
+ { "twtimmy.com", true },
{ "twun.io", true },
{ "twuni.org", true },
{ "txcap.org", true },
@@ -36470,6 +38099,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "txlrs.org", true },
{ "txm.pl", true },
{ "txtecho.com", true },
+ { "txurologist.com", true },
{ "tyche.io", true },
{ "tycho.org", true },
{ "tycom.cz", true },
@@ -36485,9 +38115,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "typcn.com", true },
{ "typeblog.net", true },
{ "typecodes.com", true },
- { "typehub.net", true },
{ "typeof.pw", true },
- { "typeonejoe.com", true },
{ "typeria.net", true },
{ "typewolf.com", true },
{ "typewritten.net", true },
@@ -36501,11 +38129,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "tyroremotes.nl", true },
{ "tyroremotes.no", true },
{ "tysox.de", true },
+ { "tysye.ca", true },
+ { "tyuo-keibi.co.jp", true },
{ "tzermias.gr", true },
{ "tzifas.com", true },
{ "u-martfoods.com", true },
{ "u-metals.com", true },
- { "u-tokyo.club", true },
{ "u.nu", true },
{ "u0010.com", true },
{ "u0020.com", true },
@@ -36525,6 +38154,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "u5b.de", false },
{ "u5r.nl", true },
{ "ua.search.yahoo.com", false },
+ { "uaci.edu.mx", true },
{ "uae-company-service.com", true },
{ "uangteman.com", true },
{ "uasmi.com", true },
@@ -36532,11 +38162,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "uatgootax.ru", false },
{ "ub3rk1tten.com", false },
{ "ubanquity.com", true },
+ { "ubcani.com", true },
{ "uberbkk.com", true },
{ "uberboxen.net", true },
{ "uberestimator.com", true },
{ "ubermail.me", true },
- { "ubertt.org", true },
{ "uberwald.de", true },
{ "uberwald.ws", true },
{ "ubezpieczeniepsa.com", true },
@@ -36547,12 +38177,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ublaboo.org", true },
{ "uborcare.com", true },
{ "ubun.net", true },
+ { "ubuntu18.com", true },
{ "ucac.nz", false },
{ "ucangiller.com", true },
{ "ucch.be", true },
{ "ucfirst.nl", true },
{ "uchargeapp.com", true },
- { "uclf.de", true },
{ "uclip.club", true },
{ "ucppe.org", true },
{ "ucrdatatool.gov", true },
@@ -36564,17 +38194,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "udp.sh", false },
{ "udruga-point.hr", true },
{ "udvoukocek.eu", true },
+ { "ueba1085.jp", true },
{ "ueberdosis.io", true },
{ "ueberwachungspaket.at", true },
{ "uedaviolin.com", true },
{ "uel-thompson-okanagan.ca", true },
{ "ueni.com", true },
- { "uevan.com", true },
{ "uex.im", true },
{ "ufanisi.mx", true },
{ "ufindme.at", true },
{ "ufplanets.com", true },
- { "uggedal.com", true },
+ { "ugb-verlag.de", true },
{ "ugx-mods.com", true },
{ "uhappy1.com", true },
{ "uhappy11.com", true },
@@ -36628,12 +38258,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "uhappy99.com", true },
{ "uhasseltodin.be", true },
{ "uhc.gg", true },
+ { "uhlhosting.ch", true },
{ "uhrenlux.de", true },
{ "uhssl.com", true },
{ "uhurl.net", true },
{ "ui8.net", true },
{ "uiberlay.cz", true },
- { "uicchy.com", true },
{ "uiop.link", true },
{ "uiterwijk.org", true },
{ "uitgeverij-deviant.nl", true },
@@ -36655,6 +38285,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ukrigging.net", true },
{ "ukrnet.co.uk", true },
{ "uktw.co.uk", true },
+ { "ukulelejim.com", true },
{ "ukunlocks.com", true },
{ "ukwct.org.uk", true },
{ "ulabox.com", true },
@@ -36662,6 +38293,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ulen.me", true },
{ "ulgc.cz", true },
{ "uli-eckhardt.de", true },
+ { "ulitroyo.com", true },
{ "ullah.se", true },
{ "ulmer-schneesport.de", true },
{ "ulovdomov.cz", true },
@@ -36677,12 +38309,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ultratechlp.com", true },
{ "ultrautoparts.com.au", true },
{ "umanityracing.com", true },
+ { "umbrellaye.online", true },
{ "umbricht.li", true },
{ "umenlisam.com", true },
{ "umisonoda.com", true },
{ "umkmjogja.com", true },
{ "umsapi.com", true },
- { "umsolugar.com.br", true },
{ "umwandeln-online.de", true },
{ "un-framed.co.za", true },
{ "un-zero-un.fr", true },
@@ -36716,7 +38348,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "unccelearn.org", true },
{ "uncensoreddns.dk", true },
{ "uncensoreddns.org", true },
- { "undecidable.de", true },
+ { "undeadbrains.de", true },
{ "undeductive.media", true },
{ "undef.in", true },
{ "underbridgeleisure.co.uk", true },
@@ -36725,7 +38357,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "underlined.fr", true },
{ "undernet.uy", false },
{ "underskatten.tk", true },
- { "underwearoffer.com", true },
{ "undo.co.il", true },
{ "undone.me", true },
{ "unece-deta.eu", true },
@@ -36734,15 +38365,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "unerosesurlalune.fr", true },
{ "unexpected.nu", true },
{ "unfettered.net", false },
- { "unfuddle.cn", true },
{ "unga.dk", true },
{ "ungeek.eu", true },
- { "ungeek.fr", true },
+ { "ungeek.fr", false },
{ "ungegamere.dk", true },
{ "unghie.com", true },
{ "unicef-karten.at", true },
{ "unicef.pl", true },
- { "unicefcards.at", true },
{ "unicefcards.cz", true },
{ "unicefcards.gr", true },
{ "unicefcards.it", true },
@@ -36767,13 +38396,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "uniekglas.nl", true },
{ "unifei.edu.br", true },
{ "uniform-agri.com", true },
- { "uniformebateriasheliar.com.br", true },
{ "unikoingold.com", true },
{ "unila.edu.br", true },
{ "unimbalr.com", true },
- { "uninet.cf", true },
- { "uniojeda.ml", true },
+ { "unioils.la", true },
{ "unionplat.ru", true },
+ { "unionstreetskateboards.com", true },
{ "uniontestprep.com", true },
{ "unipig.de", true },
{ "uniprimebr.com.br", false },
@@ -36803,8 +38431,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "universalcarremote.com", true },
{ "universalpaymentgateway.com", true },
{ "universeinform.com", true },
+ { "universidadvg.edu.mx", true },
{ "univitale.fr", true },
- { "unix.se", true },
{ "unixadm.org", true },
{ "unixapp.ml", true },
{ "unixattic.com", true },
@@ -36817,23 +38445,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "unli.xyz", true },
{ "unlocken.nl", true },
{ "unlocktalent.gov", true },
- { "unlogis.ch", true },
{ "unmarkdocs.co", true },
{ "unmonito.red", true },
{ "unn-edu.info", true },
{ "uno-pizza.ru", true },
- { "uno.fi", true },
{ "unobrindes.com.br", true },
{ "unoccupyabq.org", true },
{ "unp.me", true },
{ "unpkg.com", true },
+ { "unpossible.xyz", true },
{ "unpr.dk", true },
{ "unquote.li", true },
{ "unrealircd.org", true },
{ "unrelated.net.au", true },
- { "unripple.com", true },
- { "unruh.fr", true },
- { "uns.vn", true },
{ "unsacsurledos.com", true },
{ "unsee.cc", true },
{ "unseen.is", true },
@@ -36847,19 +38471,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "unterfrankenclan.de", true },
{ "unterhaltungsbox.com", true },
{ "unternehmer-radio.de", true },
- { "unterschicht.tv", true },
+ { "unternehmerrat-hagen.de", true },
{ "untethereddog.com", true },
{ "unun.fi", true },
{ "unusualhatclub.com", true },
- { "unveiledgnosis.com", true },
{ "unworthy.ml", true },
{ "unx.dk", true },
{ "unxicdellum.cat", true },
- { "upay.ru", true },
+ { "upandrunningtutorials.com", true },
{ "upbad.com", true },
{ "upbeatrobot.com", true },
{ "upbeatrobot.eu", true },
{ "upd.jp", true },
+ { "updatehub.io", true },
{ "upgamerengine.com", true },
{ "upgamerengine.com.br", true },
{ "upgamerengine.net", true },
@@ -36887,9 +38511,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "uptimed.com", true },
{ "uptimenotguaranteed.com", true },
{ "uptodateinteriors.com", true },
- { "uptogood.org", true },
{ "uptoon.jp", true },
{ "uptownlocators.com", true },
+ { "uptownvintagecafe.com", true },
{ "uptrends.com", true },
{ "uptrends.de", true },
{ "uptrex.co.uk", true },
@@ -36917,10 +38541,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "urbanmelbourne.info", true },
{ "urbannewsservice.com", true },
{ "urbansparrow.in", true },
- { "urbansurvival.com", true },
{ "urbanwaters.gov", false },
{ "urbanwildlifealliance.org", false },
{ "urbexdk.nl", true },
+ { "urbizoroofing.com", true },
{ "urcentral.com", true },
{ "urcentral.net", true },
{ "urcentral.nl", true },
@@ -36945,6 +38569,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "urspringer.de", true },
{ "ursuslibris.hu", true },
{ "urth.org", true },
+ { "uruguay-experience.com", true },
{ "urukproject.org", true },
{ "usa-greencard.eu", true },
{ "usaa.com", false },
@@ -36960,18 +38585,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "usajobs.gov", true },
{ "usakitchensandflooring.com", true },
{ "usalearning.gov", true },
+ { "usaseanconnect.gov", true },
{ "usastaffing.gov", true },
{ "usbcraft.com", true },
{ "usbevents.co.uk", true },
{ "usbr.gov", true },
{ "uscloud.nl", true },
{ "usd.de", true },
+ { "usdoj.gov", true },
{ "usds.gov", true },
{ "use.be", true },
{ "usebean.com", true },
- { "usedoor.jp", true },
{ "usedu.us", true },
{ "user-re.com", true },
+ { "userra.gov", true },
{ "usetypo3.com", true },
{ "useyourloaf.com", true },
{ "usgande.com", true },
@@ -36981,19 +38608,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "usipvd.ch", true },
{ "usitcolours.bg", true },
{ "uskaria.com", true },
- { "usleep.net", true },
{ "usmint.gov", true },
{ "usninosnikrcni.eu", true },
{ "usnti.com", true },
+ { "usphs.gov", true },
{ "uspsoig.gov", true },
{ "ussm.gov", false },
+ { "ussst.org", true },
{ "ussuka.com", true },
{ "ust.space", true },
{ "ustensiles-cuisine.boutique", true },
{ "ustr.gov", false },
+ { "ustugov.kiev.ua", true },
+ { "ustugova.kiev.ua", true },
+ { "usu.org.ua", true },
{ "usualbeings.com", true },
{ "usuan.net", true },
- { "usuluddin.ga", true },
{ "usweme.info", true },
{ "uswitch.com", true },
{ "ut-addicted.com", true },
@@ -37001,13 +38631,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "utahlocal.net", true },
{ "utahtravelcenter.com", true },
{ "utazas-nyaralas.info", true },
+ { "utazine.com", true },
{ "utcast-mate.com", true },
{ "utdsgda.com", true },
- { "uteam.it", true },
{ "utepils.de", true },
{ "utgifter.no", true },
{ "utilia.tools", true },
- { "utilio.nl", true },
{ "utilitarian.com", true },
{ "utilitarian.net", true },
{ "utilitarian.org", true },
@@ -37017,12 +38646,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "utilitronium.com", true },
{ "utilityapi.com", true },
{ "utilityreport.eu", true },
- { "utitreatment.com", true },
{ "utonia.ch", true },
{ "utopicestudios.com", true },
{ "utox.io", true },
{ "utterberry.io", true },
- { "utube.tw", true },
{ "utugnn.ru", true },
{ "utw.me", true },
{ "utwente.io", true },
@@ -37032,7 +38659,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "uv.uy", true },
{ "uvenuse.cz", true },
{ "uvocorp.com", true },
- { "uvolejniku.cz", true },
{ "uw1008.com", true },
{ "uw2333.com", true },
{ "uwac.co.uk", false },
@@ -37047,7 +38673,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "uz.search.yahoo.com", false },
{ "uzaymedya.com.tr", true },
{ "uziregister.nl", true },
+ { "uzpirksana.lv", true },
{ "uzsvm.cz", true },
+ { "uzzamari.com", true },
{ "v-d-p.net", true },
{ "v-spin.cz", true },
{ "v-tek.fi", true },
@@ -37059,8 +38687,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "v2ray6.com", true },
{ "v2ray66.com", true },
{ "v2ray666.com", true },
+ { "v4s.ro", true },
{ "va-reitartikel.com", true },
- { "va.gov", false },
{ "vacationsbyvip.com", true },
{ "vaccines.gov", true },
{ "vacuumpump.co.id", true },
@@ -37081,11 +38709,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vakantienet.nl", true },
{ "vakuutuskanava.fi", true },
{ "valasi.eu", true },
- { "valbonne-consulting.com", true },
{ "valcano-krd.ru", true },
{ "valcano.ru", true },
+ { "valcansell.com", true },
{ "valcardiesel.com", true },
+ { "valek.net", true },
{ "valenciadevops.me", true },
+ { "valentin-dederer.de", true },
{ "valentin-ochs.de", true },
{ "valentin-sundermann.de", true },
{ "valentin.ml", true },
@@ -37106,6 +38736,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "valis.sx", true },
{ "valkohattu.fi", true },
{ "valkor.pro", true },
+ { "valkova.net", true },
{ "vallei-veluwe.nl", true },
{ "valleyautofair.com", true },
{ "valleyautoloan.com", true },
@@ -37118,13 +38749,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "valorem-tax.ch", true },
{ "valoremtax.ch", true },
{ "valoremtax.com", true },
+ { "valorizofficial.com", true },
{ "valshamar.is", true },
{ "valsk.is", false },
{ "valskis.lt", true },
{ "valtlai.fi", true },
{ "valtoaho.com", true },
{ "valudo.st", true },
+ { "valuechain.me", true },
{ "valueng.com", true },
+ { "valueofblog.com", true },
{ "valueseed.net", true },
{ "vampyrium.net", false },
{ "van11y.net", true },
@@ -37146,8 +38780,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vandorenscholars.org", true },
{ "vandyhacks.org", true },
{ "vaneigenkweek.be", true },
+ { "vanessarivas.com", true },
+ { "vaneurology.com", true },
{ "vangoghcoaching.nl", true },
- { "vanhaos.com", true },
{ "vanhoudt-usedcars.be", true },
{ "vanhoutte.be", false },
{ "vanhove.biz", true },
@@ -37157,7 +38792,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vanohaker.ru", true },
{ "vanouwerkerk.net", true },
{ "vantagepointpreneed.com", true },
- { "vantaio.com", true },
{ "vante.me", true },
{ "vantien.com", true },
{ "vantru.is", true },
@@ -37171,6 +38805,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vapesense.co.uk", true },
{ "vapesupplies.com.au", true },
{ "vaphone.co", true },
+ { "vapingdaily.com", true },
{ "vapor.cloud", false },
{ "vapordepot.jp", true },
{ "varalwamp.com", true },
@@ -37182,12 +38817,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "variable.agency", false },
{ "variag-group.ru", true },
{ "variag-montazh.ru", true },
+ { "variando.fi", true },
{ "varicoseveinssolution.com", true },
{ "varimedoma.com", true },
+ { "variomedia.de", true },
+ { "varonahairrestoration.com", true },
+ { "varshasookt.com", true },
{ "varshathacker.com", true },
{ "varunagw.com", true },
{ "varunpriolkar.com", true },
{ "varvy.com", true },
+ { "varyrentacar.com", true },
{ "varztupasaulis.com", true },
{ "varztupasaulis.eu", true },
{ "varztupasaulis.lt", true },
@@ -37199,6 +38839,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vasileruscior.ro", true },
{ "vasilikieleftheriou.com", true },
{ "vaskulitis-info.de", true },
+ { "vasp.group", true },
{ "vasports.com.au", true },
{ "vasyharan.com", true },
{ "vat-eu.com", true },
@@ -37206,20 +38847,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vatelecom.dk", true },
{ "vati.pw", true },
{ "vats.im", true },
- { "vatsalyagoel.com", true },
- { "vatsim-uk.co.uk", true },
- { "vatsim.uk", true },
{ "vattulainen.fi", true },
+ { "vauceri.hr", true },
{ "vaud-fleurs.ch", true },
{ "vaughanrisher.com", true },
{ "vault21.net", true },
{ "vault81.de", true },
- { "vaultproject.io", true },
+ { "vaultproject.io", false },
{ "vaur.fr", true },
{ "vavel.com", true },
{ "vawebsite.co", true },
{ "vawlt.io", true },
- { "vawltstorage.com", true },
+ { "vawomenshealth.com", true },
{ "vaygren.com", true },
{ "vazue.com", true },
{ "vb-oa.co.uk", true },
@@ -37233,7 +38872,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vc.gg", true },
{ "vcam.org", true },
{ "vccmurah.net", true },
- { "vcelin-na-doliku.cz", true },
{ "vcf.gov", true },
{ "vcientertainment.com", false },
{ "vcmi.download", true },
@@ -37244,7 +38882,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vcti.cloud", true },
{ "vd42.net", true },
{ "vda.li", true },
- { "vdanker.net", true },
{ "vdbongard.com", true },
{ "vdcomp.cz", false },
{ "vdemuzere.be", true },
@@ -37263,18 +38900,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vectorwish.com", true },
{ "vedma-praktik.com", true },
{ "veg-leiden.nl", true },
+ { "vegalengd.com", true },
{ "vegalitarian.org", true },
- { "vegane-proteine.com", true },
{ "veganforum.org", true },
{ "veganism.co.uk", true },
{ "veganism.com", true },
{ "veganmasterrace.com", true },
+ { "vegasluxuryestates.com", true },
{ "vegepa.com", true },
{ "vegetariantokyo.net", true },
{ "veggie-treff.de", true },
{ "vegguide.org", true },
{ "veii.de", true },
{ "veil-framework.com", true },
+ { "veincenterbrintonlake.com", true },
{ "veit.zone", true },
{ "veke.fi", true },
{ "velen.io", true },
@@ -37287,6 +38926,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vendreacheter.be", true },
{ "vendreacheter.net", true },
{ "vendserve.eu", true },
+ { "veneerssandiego.com", true },
{ "venenum.org", true },
{ "venev.name", true },
{ "venje.pro", true },
@@ -37303,6 +38943,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "venturum.eu", true },
{ "venturum.net", true },
{ "ventzke.com", true },
+ { "venuedriver.com", true },
{ "ver.ma", true },
{ "vera.bg", true },
{ "veramagazine.jp", true },
@@ -37313,6 +38954,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "verdict.gg", true },
{ "verduccies.com", true },
{ "verein-kiekin.de", true },
+ { "verein-zur-pflege-der-geselligkeit.de", true },
{ "vereinlandwege.de", true },
{ "vereinscheck.de", true },
{ "verfassungsklage.at", true },
@@ -37320,21 +38962,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vergeaccessories.com", true },
{ "vergelijksimonly.nl", true },
{ "vergessen.cn", true },
- { "verhovs.ky", true },
+ { "verhovs.ky", false },
{ "verifalia.com", true },
{ "verifiedjoseph.com", true },
{ "verifiny.com", true },
{ "verifyos.com", true },
{ "verifyyourip.com", true },
+ { "veriny.tf", true },
+ { "verios.com.br", true },
{ "veritafineviolins.com", true },
{ "veritas-data.de", true },
+ { "veritasinvestmentwealth.com", true },
{ "verizonconnect.com", false },
{ "verizonguidelines.com", true },
{ "verliebt-in-bw.de", true },
{ "verliebt-in-niedersachsen.de", true },
{ "vermeerdealers.com", true },
{ "vermiliontaxiservice.com", true },
- { "vermogeninkaart.nl", true },
{ "vermuetje.nl", true },
{ "vernaeve-usedcars.be", true },
{ "vernonatvclub.ca", true },
@@ -37362,29 +39006,35 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vertner.net", true },
{ "vertrieb-strategie.de", true },
{ "verustracking.com", true },
+ { "verwandlung.org", true },
{ "verwayen.com", true },
{ "very-kids.fr", true },
{ "veryapt.com", true },
{ "veryimportantusers.com", true },
{ "verymelon.de", true },
{ "verymetal.nl", true },
+ { "verzekeringencambier.be", true },
{ "verzekeringsacties.nl", true },
{ "verzick.com", true },
{ "vescudero.net", true },
{ "veslosada.com", true },
{ "vespacascadia.com", true },
+ { "vestd.com", true },
{ "vestingbar.nl", true },
{ "vestum.ru", true },
{ "vetbits.com", true },
{ "veterinarian-hospital.com", true },
{ "veterinario.roma.it", true },
{ "veterinarioaltea.com", true },
+ { "veterinary-colleges.com", true },
+ { "veteriner.name.tr", true },
{ "vetforum.co", true },
{ "vetinte.eu", true },
{ "vetofish.com", true },
{ "vets.gov", true },
{ "veverusak.cz", true },
{ "vfdworld.com", true },
+ { "vfmc.vic.gov.au", true },
{ "vfn-nrw.de", true },
{ "vgchat.us", true },
{ "vgerak.com", true },
@@ -37392,8 +39042,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vgorcum.com", true },
{ "vgropp.de", true },
{ "vh.net", true },
+ { "vhrca.com", true },
{ "vhummel.nl", true },
{ "vi.photo", true },
+ { "via-shire-krug.ru", true },
{ "viacdn.org", true },
{ "viafinance.cz", false },
{ "viaggio-in-cina.it", true },
@@ -37403,14 +39055,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "viaje-a-china.com", true },
{ "vialorran.com", true },
{ "viaprinto.de", true },
- { "viato.fr", true },
+ { "viasinc.com", false },
{ "vibrant-america.com", true },
+ { "vibrato1-kutikomi.com", true },
{ "vicentee.com", true },
{ "vichiya.com", true },
- { "vician.cz", false },
+ { "vician.cz", true },
+ { "vicianovi.cz", true },
{ "vicicode.com", true },
{ "viciousflora.com", true },
{ "vicjuwelen-annelore.be", true },
+ { "vickshomes.com", true },
{ "victora.com", true },
{ "victorcanera.com", true },
{ "victordiaz.me", true },
@@ -37440,12 +39095,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vida-it.com", true },
{ "vida.es", true },
{ "vidadu.com", true },
+ { "vidarity.com", true },
{ "vidbooster.com", true },
{ "vidcloud.xyz", true },
{ "vide-greniers.org", false },
+ { "videobola.win", true },
{ "videogamesartwork.com", true },
{ "videokaufmann.at", true },
- { "videoload.co", true },
{ "videomail.io", true },
{ "videosdiversosdatv.com", true },
{ "videoseyredin.net", true },
@@ -37465,19 +39121,20 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vierpfeile.de", true },
{ "vierpluseins.wtf", true },
{ "vietnamese.dating", true },
+ { "vietnamguide.co.kr", true },
{ "vietnamhost.vn", false },
{ "vietnamluxurytravelagency.com", true },
+ { "vietnamphotoblog.com", true },
{ "vietnamwomenveterans.org", true },
- { "vieux.pro", true },
{ "viewbook.com", true },
{ "viewey.com", true },
+ { "viewing.nyc", true },
{ "viewmyrecords.com", true },
{ "viga.me", true },
{ "vigenebio.com", true },
{ "vigilantnow.com", true },
{ "vigliano.ovh", true },
{ "vignoblesdeletat.ch", true },
- { "vigo-krankenversicherung.de", true },
{ "vigo-tarife.de", true },
{ "vigour.us", true },
{ "vigoxatelier.tech", true },
@@ -37497,9 +39154,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vikings.net", true },
{ "vikodek.com", true },
{ "viktorprevaric.eu", true },
- { "vilabiamodas.com.br", true },
+ { "vila-eden.cz", true },
{ "viljatori.fi", true },
- { "villa-anna-cilento.de", true },
+ { "villa-eden.cz", true },
{ "villa-gockel.de", true },
{ "villa-romantica-zillertal.at", true },
{ "villafiore.com.br", true },
@@ -37516,22 +39173,29 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "villenvinkit.com", true },
{ "villerez.fr", true },
{ "villesalonen.fi", true },
+ { "villu.ga", true },
{ "viltsu.net", true },
{ "vima.ch", true },
{ "vimeo.com", true },
{ "vinagro.sk", true },
+ { "vinahost.vn", true },
{ "vinarstvimodryhrozen.cz", true },
{ "vincentcox.com", false },
+ { "vincentoshana.com", true },
{ "vincentpancol.com", true },
+ { "vincentswordpress.nl", true },
{ "vincible.space", true },
{ "vinciconps4.it", true },
{ "vincitraining.com", true },
- { "vineright.com", true },
{ "vinetech.co.nz", true },
- { "vinihk.com", true },
+ { "vingt.me", true },
{ "vinilosdecorativos.net", true },
+ { "vinistas.com", true },
{ "vinner.com.au", true },
{ "vinnie.gq", true },
+ { "vinnyandchristina.com", true },
+ { "vinnyvidivici.com", true },
+ { "vinokurov.tk", true },
{ "vinolli.de", true },
{ "vinovum.net", true },
{ "vinsation.com", true },
@@ -37544,20 +39208,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vintagesouthernpicks.com", true },
{ "vintagetrailerbuyers.com", true },
{ "vintazh.net", true },
- { "vinticom.ch", false },
+ { "vinticom.ch", true },
{ "vintock.com", true },
{ "vinyculture.com", true },
{ "vinzite.com", true },
{ "violin4fun.nl", true },
{ "vionicbeach.com", true },
{ "vionicshoes.com", true },
- { "vip-9649.com", true },
{ "vip4553.com", true },
{ "vip8522.com", true },
- { "vip9649.com", true },
- { "vipesball.cc", true },
- { "vipesball.info", true },
- { "vipesball.me", true },
{ "vipi.es", true },
{ "viptamin.eu", true },
{ "viptamol.com", true },
@@ -37565,7 +39224,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vir-tec.eu", true },
{ "vir2.me", true },
{ "viral32111.com", true },
- { "viralboombox.xyz", true },
{ "viralpop.it", true },
{ "viralsouls.in", true },
{ "viralsv.com", true },
@@ -37573,20 +39231,24 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "virial.de", true },
{ "viridis-milites.cz", true },
{ "virtit.fr", true },
+ { "virtual.hk", true },
{ "virtualcloud.ddns.net", true },
{ "virtualcommodities.org", true },
{ "virtuality4d.com", true },
{ "virtuallifestyle.nl", true },
{ "virtualmt2.pl", true },
{ "virtualsanity.com", true },
+ { "virtualspeech.com", true },
{ "virtualvaults.com", true },
{ "virtubox.net", true },
+ { "virtusaero.com", true },
{ "virvum.ch", true },
{ "visaexpert.co.za", true },
{ "visalist.io", true },
{ "visalogy.com", true },
{ "visaop.com", true },
{ "visapourailleurs.fr", true },
+ { "visasofoz.com", true },
{ "visaya.com.co", true },
{ "viscoelastico.com.br", true },
{ "viscopic.com", true },
@@ -37596,7 +39258,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "visikom.de", true },
{ "visioflux-premium.com", true },
{ "visionarymedia.nl", true },
- { "visiondigitalsog.com", true },
{ "visiondirectionaldrilling.com", true },
{ "visionexpress.com", true },
{ "visionexpress.ie", true },
@@ -37604,6 +39265,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "visionless.me", false },
{ "visionnissancanandaiguaparts.com", true },
{ "visit-montenegro.com", true },
+ { "visitbeulah.com", true },
{ "visitcambridgeshirefens.org", true },
{ "visitkangaroovalley.com.au", true },
{ "visitmaine.com", true },
@@ -37615,6 +39277,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vistec-support.de", true },
{ "vistodeturista.com.br", true },
{ "visual-cockpit.com", true },
+ { "visual-concept.net", true },
{ "visualdrone.co", true },
{ "visualgrafix.com.mx", true },
{ "visualideas.org", true },
@@ -37622,7 +39285,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "visualmasters.nl", true },
{ "visudira.com", true },
{ "vitahook.pw", true },
- { "vital-tel.co.uk", true },
{ "vitalamin.com", true },
{ "vitalamin.de", true },
{ "vitalismaatjes.nl", true },
@@ -37633,6 +39295,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vitamineproteine.com", true },
{ "vitaminler.com", true },
{ "vitastic.nl", true },
+ { "vitavie.nl", true },
{ "viteoscrm.ch", true },
{ "vitkausk.as", true },
{ "vitkutny.cz", true },
@@ -37646,8 +39309,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vivaldi.club", true },
{ "vivaldi.com", true },
{ "vivamusic.es", true },
+ { "vivanosports.com.br", false },
{ "vivatv.com.tw", true },
{ "vivendi.de", true },
+ { "viveport.com", true },
{ "vivianmaier.cn", true },
{ "vivid-academy.com", true },
{ "vividinflatables.co.uk", true },
@@ -37656,6 +39321,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vivirenelmundo.com", true },
{ "vivo.sx", true },
{ "vivoitaliankitchen.com", true },
+ { "vivy.com", true },
{ "vixrapedia.org", true },
{ "viyf.org", true },
{ "vizards.cc", true },
@@ -37672,10 +39338,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vk4wip.org.au", true },
{ "vkb-remont.ru", true },
{ "vkennke.org", true },
+ { "vkino.com", false },
{ "vkirichenko.name", true },
{ "vkox.com", true },
{ "vksportphoto.com", true },
{ "vladislavstoyanov.com", true },
+ { "vlakem.net", true },
+ { "vlakjebak.nl", true },
{ "vlastimilburian.cz", true },
{ "vleesbesteld.nl", true },
{ "vleij.com", true },
@@ -37686,7 +39355,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vloeck.de", true },
{ "vlora.city", true },
{ "vlovgr.se", true },
- { "vlsk.eu", true },
{ "vlsm.se", true },
{ "vlvvl.com", true },
{ "vm-0.com", true },
@@ -37711,33 +39379,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vocalik.com", true },
{ "vocaloid.my", true },
{ "vocalviews.com", true },
+ { "vochuys.nl", true },
{ "vocus.aero", true },
{ "vocustest.aero", true },
{ "vodb.me", true },
{ "vodb.org", true },
- { "vodpay.com", true },
- { "vodpay.net", true },
- { "vodpay.org", true },
+ { "vodicak.info", true },
{ "vogelbus.ch", true },
{ "vogler.name", true },
{ "vogue.cz", true },
{ "voice-of-design.com", true },
{ "voicu.ch", true },
{ "void-zero.com", true },
- { "voidark.com", true },
{ "voidcore.org", true },
{ "voidma.in", true },
{ "voidpay.com", true },
- { "voidpay.net", true },
- { "voidpay.org", true },
{ "voidptr.eu", true },
- { "voids.org", true },
{ "voidx.top", true },
{ "voidzehn.com", true },
+ { "voipdigit.nl", true },
{ "voipsun.com", true },
{ "vojtechpavelka.cz", true },
- { "vokalsystem.com", true },
- { "vokativy.cz", false },
+ { "vokativy.cz", true },
{ "vokeapp.com", true },
{ "vokurka.net", true },
{ "volcanconcretos.com", true },
@@ -37752,34 +39415,36 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "volker-gropp.de", true },
{ "volkergropp.de", true },
{ "volkerwesselstransfer.nl", false },
- { "volkerwesselswave.nl", false },
{ "volksvorschlagpmar.ch", true },
{ "vollans.id.au", true },
+ { "vollmondstollen.de", true },
{ "voloevents.com", true },
{ "volta.io", true },
+ { "voltahurt.pl", true },
{ "volto.io", true },
{ "volunteeringmatters.org.uk", true },
{ "vomitb.in", true },
- { "von-lien-aluprofile.de", true },
- { "von-lien-dachrinnen.de", true },
- { "von-lien-lichtplatten.de", true },
- { "von-lien-profilbleche.de", true },
{ "vonauw.com", true },
{ "vonborstelboerner.de", true },
{ "vonniehudson.com", true },
{ "vonski.pl", true },
{ "voodoochile.at", true },
{ "vop.li", true },
- { "vorkbaard.nl", true },
{ "vorlage-musterbriefe.de", true },
{ "vorlage-mustervertrag.de", true },
{ "vorlagen-geburtstagsgruesse.de", true },
{ "vorlicek.de", true },
{ "vorlif.org", true },
+ { "vorm2.com", true },
{ "vorodevops.com", true },
{ "vorte.ga", true },
{ "vos-fleurs.ch", true },
{ "vos-fleurs.com", true },
+ { "vos-systems.com", true },
+ { "vos-systems.es", true },
+ { "vos-systems.eu", true },
+ { "vos-systems.net", true },
+ { "vos-systems.org", true },
{ "vosgym.jp", true },
{ "vosky.fr", true },
{ "vosn.de", true },
@@ -37793,6 +39458,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "votockova.cz", true },
{ "votoot.com", true },
{ "votre-site-internet.ch", true },
+ { "vouchinsurance.sg", true },
{ "vovladikavkaze.ru", true },
{ "voxfilmeonline.net", true },
{ "voxml.com", true },
@@ -37800,6 +39466,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "voya.ga", true },
{ "voyage-martinique.fr", true },
{ "voyageforum.com", true },
+ { "voyageofyume.com", true },
{ "voyagesaufildespages.be", true },
{ "voyageschine.com", true },
{ "voyagesdetective.fr", true },
@@ -37808,13 +39475,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vpn.ht", true },
{ "vpnpro.com", true },
{ "vpnservice.nl", true },
+ { "vpntech.net", true },
{ "vpsboard.com", true },
{ "vpsdream.dk", true },
- { "vpsou.com", false },
+ { "vpsou.com", true },
{ "vpsport.ch", true },
+ { "vpsvz.cloud", true },
{ "vpsvz.net", true },
{ "vrandopulo.ru", true },
- { "vranjske.co.rs", true },
{ "vrcholovka.cz", true },
{ "vreaulafacultate.ro", true },
{ "vreeman.com", true },
@@ -37826,11 +39494,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vrjetpackgame.com", true },
{ "vroedvrouwella.be", true },
{ "vrsystem.com.br", true },
- { "vrtouring.org", true },
+ { "vrtak-cz.net", true },
{ "vscale.io", true },
+ { "vsd.sk", true },
{ "vsean.net", true },
+ { "vseomedia.com", true },
{ "vserver-preis-vergleich.de", true },
{ "vsesrazu-raiffeisen.ru", true },
+ { "vsestiralnie.com", true },
{ "vsestoki.com", true },
{ "vsl-defi.ch", true },
{ "vssnederland.nl", true },
@@ -37839,13 +39510,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vsx.ch", true },
{ "vtaxi.se", true },
{ "vtipe-vylez.cz", true },
- { "vtuber-schedule.info", true },
+ { "vtuber.art", true },
{ "vuakhuyenmai.vn", true },
{ "vubey.yt", true },
{ "vuilelakens.be", true },
{ "vuljespaarpot.nl", true },
{ "vullriede-multimedia.de", true },
- { "vulndetect.org", false },
+ { "vulndetect.com", true },
+ { "vulndetect.org", true },
{ "vulnerability.ch", true },
{ "vulners.com", true },
{ "vulns.sexy", true },
@@ -37864,7 +39536,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vvoip.org.uk", true },
{ "vvw-8522.com", true },
{ "vvzero.com", true },
- { "vw-touranclub.cz", true },
{ "vwbusje.com", true },
{ "vwfsrentacar.co.uk", true },
{ "vwhcare.com", true },
@@ -37875,7 +39546,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vx.hn", true },
{ "vxst.org", true },
{ "vxstream-sandbox.com", true },
- { "vxz.me", true },
{ "vybeministry.org", true },
{ "vyber-odhadce.cz", true },
{ "vyberodhadce.cz", true },
@@ -37883,7 +39553,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "vysko.cz", true },
{ "vyskocil.eu", true },
{ "vyvygen.org", true },
- { "vyzner.cz", true },
{ "vzce.cn", true },
{ "vzis.org", true },
{ "vztekloun.cz", true },
@@ -37892,29 +39561,31 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "w-w-auto.de", true },
{ "w.wiki", true },
{ "w1221.com", true },
+ { "w1n73r.de", true },
{ "w2n.me", true },
{ "w3ctag.org", true },
{ "w3n.org", true },
+ { "w4.no", true },
{ "w4eg.de", true },
{ "w4nvu.org", true },
{ "w50.co.uk", true },
{ "w5gfe.org", true },
{ "w7k.de", true },
{ "w84.it", true },
+ { "w889889.com", true },
+ { "w889889.net", true },
{ "w95.pw", true },
{ "wa-stromerzeuger.de", true },
{ "wa.me", true },
{ "waaw.tv", true },
{ "wabatam.com", true },
- { "wachter.biz", true },
+ { "wacky-science.com", true },
{ "wacky.one", true },
{ "wadidi.com", true },
{ "wadsworth.gallery", true },
- { "wadvisor.com", true },
{ "waelisch.de", true },
{ "waf.ninja", true },
{ "waf.sexy", true },
- { "wafa4hw.com", true },
{ "wafelland.be", true },
{ "waffle.at", false },
{ "wafuton.com", true },
@@ -37924,28 +39595,22 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wahlman.org", true },
{ "wahrnehmungswelt.de", true },
{ "wahrnehmungswelten.de", true },
- { "wai-in.net", true },
{ "waidfrau.de", true },
{ "waidu.de", true },
{ "waifu-technologies.com", true },
{ "waifu-technologies.moe", true },
{ "waigel.org", true },
{ "waikatowebdesigners.com", true },
- { "wail.net", true },
{ "wains.be", false },
{ "wait.jp", true },
{ "waiterwheels.com", true },
{ "waits.io", true },
{ "wajtc.com", true },
{ "wak.io", true },
- { "waka-mono.com", true },
- { "waka168.com", true },
- { "waka168.net", true },
- { "waka88.com", true },
- { "waka88.net", true },
{ "wakamiyasumiyosi.com", true },
{ "wakandasun.com", true },
{ "wakatime.com", true },
+ { "wakhanyeza.org", true },
{ "wakiminblog.com", true },
{ "wala-floor.de", true },
{ "waldvogel.family", true },
@@ -37990,7 +39655,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wanda.ch", true },
{ "wandelreizen.eu", true },
{ "wander.al", true },
- { "wandercue.com", true },
{ "wandervoll.ch", true },
{ "wanderzoom.co", true },
{ "wandystan.eu", true },
@@ -38007,6 +39671,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wangql.net", true },
{ "wangqr.tk", true },
{ "wangtanzhang.com", true },
+ { "wangwill.me", true },
{ "wangyubao.cn", true },
{ "wangyue.blog", true },
{ "wangzuan168.cc", true },
@@ -38019,15 +39684,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wanybug.tk", true },
{ "wanzenbug.xyz", true },
{ "waonui.io", true },
+ { "wapa.gov", true },
+ { "wapking.co", true },
+ { "wapoolandspa.com", true },
{ "wardow.com", true },
{ "warebouncycastles.co.uk", true },
{ "warekit.io", true },
{ "warenits.at", false },
- { "warenmedia.com", true },
{ "wargameexclusive.com", true },
{ "warhaggis.com", true },
- { "warmestwishes.ca", true },
{ "warmservers.com", true },
+ { "waroengkoe-shop.com", true },
{ "warofelements.de", true },
{ "warp-radio.com", true },
{ "warp-radio.net", true },
@@ -38041,6 +39708,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "waschpark-hantschel.de", true },
{ "wasema.com", true },
{ "wasfestes.de", true },
+ { "wasfuereintheater.com", true },
+ { "washandfun.com", true },
{ "washingtonregisteredagent.io", true },
{ "washingtonviews.com", true },
{ "wasi-net.de", true },
@@ -38051,6 +39720,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wassibauer.com", true },
{ "wastrel.ch", true },
{ "watch-wiki.org", true },
+ { "watchcom.org.za", true },
{ "watchface.watch", true },
{ "watchfreeonline.co.uk", true },
{ "watchinventory.com", true },
@@ -38079,21 +39749,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "waukeect.com", true },
{ "wave-ola.es", true },
{ "wave.is", true },
+ { "wave.red", true },
+ { "wavengine.com", true },
{ "waverlysecuritycameras.com", true },
{ "wavesboardshop.com", true },
- { "wavesoftime.com", true },
{ "waveum.com", true },
{ "wawak.pl", true },
{ "waxdramatic.com", true },
{ "waycraze.com", true },
{ "wayfair.de", true },
+ { "wayfairertravel.com", true },
{ "wayohoo.com", true },
{ "wayohoo.net", true },
{ "waytt.cf", true },
{ "waze.com", true },
{ "wb256.com", true },
+ { "wba.or.at", true },
{ "wbci.us", false },
{ "wbg-vs.de", true },
+ { "wblautomotive.com", true },
{ "wblinks.com", true },
{ "wbt-solutions.ch", true },
{ "wbt-solutions.net", true },
@@ -38105,6 +39779,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wby.tw", true },
{ "wcbook.ru", false },
{ "wcn.life", false },
+ { "wcrca.org", true },
+ { "wcsi.com", true },
{ "wcwcg.net", true },
{ "wd627.com", true },
{ "wd976.com", true },
@@ -38113,29 +39789,26 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wdic.org", true },
{ "wdodelta.nl", true },
{ "wdol.gov", true },
- { "wdrl.info", true },
{ "wdt.cz", false },
{ "we-bb.com", true },
{ "we-run-linux.de", true },
{ "we-use-linux.de", true },
{ "weacceptbitcoin.gr", true },
- { "wealthcentral.com.au", true },
{ "wealthprojector.com", true },
{ "wealthprojector.com.au", true },
{ "wealthreport.com.au", true },
{ "wearandcare.net", true },
{ "weare1inspirit.com", true },
{ "wearebfi.co.uk", true },
- { "wearedisneyland.com", true },
{ "wearegenki.com", true },
{ "wearehackerone.com", true },
{ "wearepapermill.co", true },
{ "wearesouthafricans.com", true },
{ "wearvr.com", true },
{ "weaspireusa.com", true },
- { "weather-and-climate.com", false },
+ { "weather-schools.com", true },
+ { "weather.gov", true },
{ "weathermyway.rocks", true },
- { "web-adminy.co.uk", true },
{ "web-art.cz", true },
{ "web-design.co.il", true },
{ "web-dl.cc", true },
@@ -38147,6 +39820,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "web-redacteuren.nl", true },
{ "web-siena.it", true },
{ "web-smart.com", true },
+ { "web-thinker.ru", true },
{ "web-wave.jp", true },
{ "web.bzh", true },
{ "web.cc", false },
@@ -38167,10 +39841,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "webandmore.de", false },
{ "webappky.cz", true },
{ "webartex.ru", true },
- { "webauthority.co.uk", true },
{ "webbiz.co.uk", true },
{ "webbson.net", false },
{ "webcamtoy.com", true },
+ { "webcasinos.com", true },
{ "webcatchers.nl", true },
{ "webcatechism.com", false },
{ "webclimbers.ch", true },
@@ -38189,6 +39863,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "webdesignplayground.io", true },
{ "webdesignsandiego.com", true },
{ "webdevops.io", true },
+ { "webdevxp.com", true },
{ "webdl.org", true },
{ "webdollarvpn.io", true },
{ "webduck.nl", false },
@@ -38206,7 +39881,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "webfilings.appspot.com", true },
{ "webfixers.nl", true },
{ "webfox.com.br", true },
- { "webgap.io", true },
+ { "webgap.io", false },
{ "webgarten.ch", true },
{ "webgears.com", true },
{ "webharvest.gov", true },
@@ -38214,6 +39889,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "webhostingshop.ca", true },
{ "webhostingzzp.nl", false },
{ "webhostplan.info", true },
+ { "webies.ro", true },
{ "webinnovation.ie", true },
{ "webjobposting.com", true },
{ "webkef.com", true },
@@ -38222,7 +39898,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "weblate.org", true },
{ "webliberty.ru", true },
{ "webline.ch", true },
- { "weblogic.pl", true },
{ "weblogzwolle.nl", true },
{ "webmail.gigahost.dk", false },
{ "webmail.info", false },
@@ -38236,6 +39911,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "webministeriet.net", true },
{ "webmotelli.fi", true },
{ "webnames.ca", true },
+ { "webnetforce.net", true },
{ "webnexty.com", true },
{ "webogram.org", false },
{ "webpinoytambayan.net", true },
@@ -38267,9 +39943,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "websitesdallas.com", true },
{ "websiteservice.pro", true },
{ "webslake.com", true },
+ { "websmartmedia.co.uk", true },
{ "websouthdesign.com", true },
{ "webspiral.jp", true },
{ "webspire.tech", true },
+ { "webstart.nl", true },
{ "webstijlen.nl", true },
{ "webstore.be", false },
{ "webstu.be", true },
@@ -38282,26 +39960,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "webtorrent.io", true },
{ "webtrh.cz", true },
{ "webtropia.com", false },
+ { "webutils.io", true },
{ "webvisum.de", true },
{ "webwednesday.nl", true },
{ "webwinkelexploitatie.nl", true },
{ "webwinkelwestland.nl", true },
{ "webwit.nl", true },
- { "webwolf.co.za", true },
{ "webworkshop.ltd", true },
+ { "webxr.today", true },
{ "webyazilimankara.com", true },
+ { "webz.one", true },
{ "wechatify.com", true },
{ "wecleanbins.com", true },
{ "wecobble.com", true },
- { "weddingalbumsdesign.com", true },
{ "weddingfantasy.ru", true },
{ "weddingofficiantwilmington.com", true },
{ "weddingsbynoon.co.uk", true },
{ "weddywood.ru", false },
+ { "wedg.uk", true },
{ "wedos.com", true },
{ "weeblr.com", true },
{ "weeblrpress.com", true },
- { "weedcircles.com", true },
{ "weedlife.com", true },
{ "weednews.co", true },
{ "weedupdate.com", true },
@@ -38317,15 +39996,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "weekvandemediawijsheid.nl", true },
{ "weemake.fr", true },
{ "weemakers.fr", true },
+ { "weems.fr", true },
{ "weepycat.com", true },
{ "weerda.fr", true },
+ { "weerstationgiethoorn.nl", true },
{ "weerstatistieken.nl", true },
{ "wefinanceinc.com", true },
{ "wefitboilers.com", true },
{ "weforgood.org.tw", true },
{ "wegethitched.co.uk", true },
{ "weggeweest.nl", true },
- { "wegner.no", true },
+ { "wegonnagetsued.org", true },
{ "wegotcookies.com", true },
{ "wegrzynek.org", true },
{ "wegvielfalt.de", true },
@@ -38339,12 +40020,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "weiling.clinic", true },
{ "weils.net", true },
{ "weiltoast.de", true },
- { "weimaraner.com.br", true },
{ "weiming.ddns.net", true },
{ "weimz.com", true },
{ "wein.cc", true },
{ "wein.co.kr", true },
{ "weinbergerlawgroup.com", true },
+ { "weingaertner-it.de", true },
{ "weinundsein.com", true },
{ "weirdesigns.com", true },
{ "weisse-liste.de", true },
@@ -38359,6 +40040,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "welcome26.ch", true },
{ "welcomehelp.de", true },
{ "welcomescuba.com", true },
+ { "welcometoscottsdalehomes.com", true },
{ "weld.io", true },
{ "weldwp.com", true },
{ "wella-download-center.de", true },
@@ -38368,6 +40050,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wellensteyn.ru", true },
{ "weller.pm", true },
{ "wellist.com", true },
+ { "wellness-bonbon.de", true },
{ "wellness-gutschein.de", true },
{ "wellnesscheck.net", true },
{ "wellsolveit.com", false },
@@ -38377,18 +40060,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "welteneroberer.de", true },
{ "weltengilde.de", true },
{ "weltenhueter.de", true },
+ { "weltmeister.de", true },
{ "weltverschwoerung.de", true },
{ "welzijnkoggenland.nl", true },
+ { "wem.hr", true },
{ "wemakebookkeepingeasy.com", true },
{ "wemakemenus.com", true },
{ "wemakeonlinereviews.com", true },
{ "wemovemountains.co.uk", true },
- { "wen-in.com", true },
- { "wen-in.net", true },
- { "wenchieh.com", true },
+ { "wendigo.pl", true },
{ "wendlberger.net", true },
{ "wendu.me", true },
- { "wener.me", false },
{ "wenger-shop.ch", true },
{ "wenjs.me", true },
{ "wensing-und-koenig.de", true },
@@ -38405,7 +40087,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "werbefotografie-leitner.de", true },
{ "werbewelt-tv.de", true },
{ "werbezentrum-stiebler.de", true },
- { "werbik.at", true },
+ { "werbik.at", false },
{ "werehub.org", true },
{ "wereldkoffie.eu", true },
{ "wereoutthere.nl", true },
@@ -38421,10 +40103,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "werkgroeppaleisparkhetloo.nl", true },
{ "werkinc.de", true },
{ "werkkrew.xyz", true },
+ { "werkslimreisslim.nl", true },
{ "werkstattkinder.de", true },
{ "werktor.com", true },
{ "werktor.net", true },
- { "werkz.io", true },
{ "wermeester.com", true },
{ "werner-ema.de", true },
{ "werpo.com.ar", true },
@@ -38452,9 +40134,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "westcode.de", true },
{ "westcountrystalking.com", true },
{ "westendwifi.net", true },
+ { "westernfrontierins.com", true },
+ { "westernpadermatologist.com", true },
{ "westeros.hu", true },
- { "westhighlandwhiteterrier.com.br", true },
{ "westhillselectrical.com", true },
+ { "westlaketire.pt", true },
{ "westlakevillageelectric.com", true },
{ "westlakevillageelectrical.com", true },
{ "westlakevillageelectrician.com", true },
@@ -38468,10 +40152,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "westmeadapartments.com.au", true },
{ "westmidlandsbouncycastlehire.co.uk", true },
{ "westmidlandsinflatables.co.uk", true },
+ { "westside-pediatrics.com", true },
{ "westsuburbanbank.com", true },
{ "westwood.no", true },
+ { "wesupportthebadge.org", true },
+ { "weswitch4u.com", true },
{ "wetofu.top", true },
+ { "wetrepublic.com", true },
+ { "wettanbieter-vergleich.de", true },
+ { "wette.de", true },
+ { "wetten.eu", true },
{ "wevenues.com", true },
+ { "wewin88.com", true },
+ { "wewin88.net", true },
{ "wewitro.de", true },
{ "wewitro.net", true },
{ "wexfordbouncycastles.ie", true },
@@ -38491,9 +40184,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wf-trial-hrd.appspot.com", true },
{ "wfh.ovh", true },
{ "wfh.se", true },
+ { "wfl.ro", true },
{ "wft-portfolio.nl", true },
{ "wg-steubenstrasse.de", true },
{ "wg3k.us", false },
+ { "wgcp.com", true },
{ "wgom.org", true },
{ "wgplatform.co.uk", true },
{ "wgraphics.ru", true },
@@ -38501,7 +40196,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wh-guide.de", true },
{ "what-wood.servehttp.com", true },
{ "whatagreatwebsite.net", true },
- { "whatanime.ga", true },
{ "whatarepatentsfor.com", true },
{ "whatclinic.co.uk", true },
{ "whatclinic.com", true },
@@ -38518,10 +40212,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "whatsahoy.com", true },
{ "whatsapp.com", true },
{ "whatsmychaincert.com", true },
- { "whatsupdeco.com", true },
{ "whatsupgold.com.tw", true },
{ "whatsupoutdoor.com", true },
{ "whatthingsweigh.com", true },
+ { "whattominingrigrentals.com", true },
{ "whatusb.com", true },
{ "whatwebcando.today", true },
{ "whatwg.org", true },
@@ -38535,6 +40229,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "when.fm", false },
{ "where2trip.com", true },
{ "whereiszakir.com", true },
+ { "wheresbuzz.com.au", true },
{ "whexit.nl", true },
{ "whey-protein.ch", true },
{ "whiletrue.run", true },
@@ -38542,11 +40237,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "whing.org", true },
{ "whipnic.com", true },
{ "whirlpool-luboss.de", true },
+ { "whirlpool.net.au", true },
{ "whisky-circle.info", true },
+ { "whiskygentle.men", true },
{ "whiskynerd.ca", true },
{ "whisp.ly", false },
{ "whispeer.de", true },
- { "whisperinghoperanch.org", true },
{ "whisperlab.org", true },
{ "whistleb.com", true },
{ "whistleblower.gov", true },
@@ -38559,16 +40255,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "whitealps.fr", true },
{ "whitealps.net", true },
{ "whitebear.cloud", true },
+ { "whitebirdclinic.org", true },
{ "whitefm.ch", true },
{ "whitehathackers.com.br", true },
+ { "whitehats.nl", true },
{ "whitehouse.gov", true },
+ { "whitehouseconferenceonaging.gov", true },
{ "whitehousedrugpolicy.gov", true },
- { "whiteink.com", false },
+ { "whiteink.com", true },
{ "whitejaguars.com", true },
+ { "whiteknightsafelockinc.com", true },
{ "whitelabelcashback.nl", true },
{ "whitelabeltickets.com", false },
{ "whitepharmacy.co.uk", true },
- { "whiteready.it", true },
{ "whiterose.goip.de", true },
{ "whiteshadowimperium.com", true },
{ "whitewebhosting.co.za", true },
@@ -38584,16 +40283,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "whnpa.org", true },
{ "who-calledme.com", true },
{ "who.pm", true },
+ { "whoami.io", true },
{ "whoasome.com", true },
{ "whocalld.com", true },
{ "whocalled.us", true },
{ "whocybered.me", true },
{ "whoimg.com", true },
- { "whoiscuter.ml", true },
- { "whoiscutest.ml", true },
+ { "whoisdhh.com", true },
{ "whoisthenightking.com", true },
{ "whoiswp.com", true },
- { "wholelotofbounce.co.uk", false },
{ "wholesalecbd.com", true },
{ "wholesomeharvestbread.com", false },
{ "whonix.org", true },
@@ -38608,6 +40306,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "whynohttps.com", true },
{ "whyopencomputing.ch", true },
{ "whyopencomputing.com", true },
+ { "whysoslow.co.uk", true },
{ "whytls.com", true },
{ "whyworldhot.com", true },
{ "whyz1722.tk", true },
@@ -38652,6 +40351,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wikibulz.com", true },
{ "wikidata.org", true },
{ "wikidsystems.com", false },
+ { "wikihow.com", true },
+ { "wikihow.com.tr", true },
+ { "wikihow.cz", true },
+ { "wikihow.fitness", true },
+ { "wikihow.it", true },
+ { "wikihow.jp", true },
+ { "wikihow.life", true },
+ { "wikihow.mom", true },
+ { "wikihow.pet", true },
+ { "wikihow.tech", true },
+ { "wikihow.vn", true },
{ "wikileaks.com", true },
{ "wikileaks.org", true },
{ "wikimedia.org", true },
@@ -38659,6 +40369,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wikimilk.org", true },
{ "wikinews.org", true },
{ "wikipedia.org", true },
+ { "wikipiedi.it", true },
{ "wikiquote.org", true },
{ "wikisource.org", true },
{ "wikiversity.org", true },
@@ -38668,7 +40379,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wiktoriaslife.com", true },
{ "wilane.org", true },
{ "wilcodeboer.me", true },
- { "wild-turtles.com", false },
+ { "wild-turtles.com", true },
{ "wildboaratvparts.com", true },
{ "wilddogdesign.co.uk", true },
{ "wildewood.ca", true },
@@ -38690,12 +40401,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "willfarrell.ca", true },
{ "willi-graf-gymnasium.de", true },
{ "willi-graf-os.de", true },
- { "william.gg", true },
{ "williamboulton.co.uk", true },
{ "williamfeely.info", true },
{ "williamjohngauthier.net", true },
+ { "williamle.com", true },
{ "williamscomposer.com", true },
- { "williamsflintlocks.com", true },
+ { "williamsonshore.com", true },
{ "williamsportmortgages.com", true },
{ "williamsroom.com", true },
{ "williamtm.com", true },
@@ -38724,19 +40435,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wind.moe", true },
{ "winddan.nz", true },
{ "windelnkaufen24.de", true },
+ { "windforme.com", true },
{ "windowcleaningexperts.net", true },
- { "windows10insider.com", true },
{ "windowslatest.com", true },
{ "windowsnerd.com", true },
+ { "windowsnoticias.com", true },
{ "windowwellcovers.com", true },
- { "windrunner.se", true },
{ "windscribe.com", true },
{ "windsock-app.com", true },
{ "windsorite.ca", true },
{ "windsorspi.com", true },
{ "windycitydubfest.com", true },
{ "wine-tapa.com", true },
- { "wineonthewall.com", true },
+ { "wineparis.com", true },
{ "winepress.org", true },
{ "winghill.com", true },
{ "wingify.com", true },
@@ -38745,26 +40456,28 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "winkelcentrumputten.nl", true },
{ "winmodels.org", true },
{ "winmodels.ru", true },
+ { "winningattitudeawards.org", true },
{ "winphonemetro.com", true },
- { "winportal.cz", false },
{ "winsome.world", true },
{ "wint.global", true },
+ { "winter-auszeit.de", true },
{ "winter-elektro.de", true },
{ "winter.engineering", false },
{ "winterbergwebcams.com", true },
{ "wintercam.nl", true },
{ "winterfeldt.de", true },
+ { "winterhavenobgyn.com", true },
{ "winterhillbank.com", true },
{ "wintermeyer-consulting.de", true },
{ "wintermeyer.de", true },
{ "winterschoen.nl", true },
{ "wintodoor.com", true },
+ { "winwares.com", true },
{ "winwitharval.co.uk", true },
{ "wipswiss.ch", true },
{ "wir-bewegen.sh", true },
{ "wircon-int.net", true },
{ "wire.com", true },
- { "wireframesoftware.com", true },
{ "wireheading.com", true },
{ "wireshark.org", true },
{ "wiretime.de", true },
@@ -38778,7 +40491,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wisal.org", true },
{ "wischu.com", true },
{ "wisedog.eu", true },
- { "wiseflat.com", true },
+ { "wishlist.net", true },
{ "wispapp.com", false },
{ "wisper.net.au", true },
{ "wispsuperfoods.com", true },
@@ -38790,13 +40503,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "withextraveg.net", true },
{ "withgoogle.com", true },
{ "withinsecurity.com", true },
- { "withlocals.com", true },
{ "withyoutube.com", true },
{ "witneywaterpolo.org.uk", true },
{ "witt-international.co.uk", true },
{ "witte.cloud", true },
+ { "wittu.fi", true },
{ "witway.nl", false },
{ "wivoc.nl", true },
+ { "wixguide.co", true },
{ "wiz.at", true },
{ "wiz.biz", true },
{ "wiz.farm", true },
@@ -38836,10 +40550,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wo-ist-elvira.net", true },
{ "wo2forum.nl", true },
{ "woah.how", true },
- { "woaiuhd.com", true },
{ "wobble.ninja", true },
{ "wobblywotnotz.co.uk", true },
- { "wochennummern.de", true },
{ "wodboss.com", true },
{ "wodinaz.com", true },
{ "wodka-division.de", true },
@@ -38852,7 +40564,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wohlpa.de", true },
{ "wohnbegleitung.ch", true },
{ "wohnsitz-ausland.com", true },
- { "woi.vision", true },
{ "wokinghammotorhomes.com", true },
{ "wolfachtal-alpaka.de", true },
{ "wolfarth.info", true },
@@ -38862,12 +40573,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wolfgang-kerschbaumer.at", true },
{ "wolfgang-kerschbaumer.com", true },
{ "wolfgang-kerschbaumer.net", true },
- { "wolfgang-kloke.de", true },
{ "wolfgang-ziegler.com", true },
{ "wolfie.ovh", false },
{ "wolfsden.cz", true },
+ { "wolfvideoproductions.com", true },
{ "wolfwings.us", true },
{ "wolfy1339.com", false },
+ { "wolke7.wtf", true },
{ "wolkoopjes.nl", true },
{ "wollgredel.de", true },
{ "wollongongbaptist.hopto.org", true },
@@ -38881,9 +40593,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "women-only.net", true },
{ "womensalespros.com", true },
{ "womenshairlossproject.com", true },
+ { "womensmedassoc.com", true },
{ "wonabo.com", true },
{ "wonder.com.mx", false },
{ "wonderbill.com", true },
+ { "wonderbits.net", true },
{ "wonderfuleducation.eu", true },
{ "wonderfuleducation.nl", true },
{ "wondergorilla.com", true },
@@ -38894,14 +40608,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wood-crafted.co.uk", true },
{ "wood-crafted.uk", true },
{ "woodbury.io", true },
- { "woodcoin.org", true },
+ { "woodcoin.org", false },
{ "woodev.us", true },
+ { "woodinvillesepticservice.net", true },
{ "woodlandhillselectrical.com", true },
{ "woodlandsmetro.church", false },
{ "woodlandsvale.uk", true },
{ "woodlandwindows.com", true },
{ "woodomat.com", true },
{ "woodsidepottery.ca", true },
+ { "woodstocksupply.com", true },
{ "woof.gq", true },
{ "woofsbakery.com", true },
{ "woohooyeah.nl", true },
@@ -38910,6 +40626,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wooplagaming.com", true },
{ "wootware.co.za", true },
{ "wopr.network", true },
+ { "worca.de", true },
+ { "worcade.com", true },
{ "worcade.net", true },
{ "worcesterbouncycastlehire.co.uk", true },
{ "worcesterbouncycastles.co.uk", true },
@@ -38926,6 +40644,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "workcelerator.com", true },
{ "workcheck.bz", true },
{ "workcloud.jp", true },
+ { "worker.gov", true },
+ { "workeria-personal.de", true },
{ "workforce.co.tz", true },
{ "workgrouptech.org", true },
{ "workingclassmedia.com", true },
@@ -38957,8 +40677,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "worldofvnc.net", true },
{ "worldofwobble.co.uk", true },
{ "worldpeacetechnology.com", true },
- { "worldrecipes.eu", true },
{ "worldsgreatestazuredemo.com", true },
+ { "worldsinperil.it", true },
{ "worldsoccerclips.com", true },
{ "worldstone777.com", true },
{ "worldtalk.de", true },
@@ -38966,8 +40686,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wormdisk.net", true },
{ "worst.horse", false },
{ "wort-suchen.de", true },
- { "woshiluo.site", true },
{ "wot-tudasbazis.hu", true },
+ { "woti.dedyn.io", true },
{ "wotra-register.com", true },
{ "woudenberg.nl", true },
{ "woudenbergsedrukkerij.nl", true },
@@ -38980,7 +40700,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wow-screenshots.net", true },
{ "wowaffixes.info", true },
{ "wowbouncycastles.co.uk", true },
- { "wowhelp.it", true },
{ "wowi-ffo.de", true },
{ "wowjs.co.uk", true },
{ "wowjs.org", true },
@@ -38994,9 +40713,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wp-site1.com", true },
{ "wp-site2.com", true },
{ "wp-tao.com", true },
+ { "wp-webagentur.de", true },
{ "wpac.de", true },
{ "wpandup.org", true },
{ "wpcanban.com", true },
+ { "wpccu-cdn.org", true },
{ "wpccu.org", true },
{ "wpcdn.bid", true },
{ "wpcharged.nz", true },
@@ -39011,6 +40732,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wpinter.com", true },
{ "wpldn.uk", true },
{ "wpletter.de", false },
+ { "wplistings.pro", true },
{ "wpmeetup-berlin.de", true },
{ "wpmu-tutorials.de", true },
{ "wpoptimalizace.cz", true },
@@ -39042,13 +40764,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "writeandedit-for-you.com", true },
{ "writecustomessay.com", true },
{ "writeenglishright.com", true },
- { "writemyessay.info", true },
{ "writemyessay.today", true },
{ "writemyessays.com", true },
{ "writemypaperhub.com", true },
{ "writeoff.me", true },
{ "writepride.com", true },
- { "writepro.net", true },
+ { "writer24.ru", true },
{ "writereditor.com", true },
{ "writing-job-online.com", true },
{ "writingcities.net", true },
@@ -39066,13 +40787,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wsadek.ovh", true },
{ "wsb.pl", true },
{ "wscales.com", false },
- { "wscbiolo.id", true },
{ "wsdcapital.com", true },
{ "wselektro.de", true },
{ "wsgvet.com", true },
{ "wsl.sh", true },
+ { "wsldp.com", true },
{ "wsspalluto.de", true },
- { "wssv.ch", true },
{ "wstudio.ch", true },
{ "wstx.com", true },
{ "wsv-grafenau.de", true },
@@ -39081,11 +40801,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wtf.ninja", true },
{ "wtfismyip.com", true },
{ "wtfnope.org", true },
+ { "wtfsec.org", true },
{ "wth.in", true },
{ "wtp.co.jp", true },
{ "wtpdive.jp", true },
{ "wtpmj.com", true },
+ { "wtup.net", true },
{ "wtw.io", true },
+ { "wucke13.de", true },
{ "wuerfel.wf", true },
{ "wuerfelmail.de", true },
{ "wufu.org", false },
@@ -39104,10 +40827,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wunderlist.com", true },
{ "wundernas.ch", true },
{ "wundi.net", true },
- { "wuppertal-2018.de", true },
- { "wuppertaler-kurrende.com", true },
- { "wuppertaler-kurrende.de", true },
+ { "wunschzettel.de", true },
+ { "wuppertal-2018.de", false },
+ { "wuppertaler-kurrende.com", false },
+ { "wuppertaler-kurrende.de", false },
{ "wutianyi.com", true },
+ { "wuwuwu.me", true },
{ "wuxiaobai.win", true },
{ "wuxiaohen.com", true },
{ "wuyue.photo", true },
@@ -39121,18 +40846,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wweforums.net", true },
{ "wweichen.com.cn", true },
{ "wwgc2011.se", true },
- { "wwjd.dynu.net", true },
- { "wwv-8522.com", true },
{ "wwv-8722.com", true },
{ "www-33445.com", true },
{ "www-49889.com", true },
- { "www-7570.com", true },
+ { "www-68277.com", true },
{ "www-80036.com", true },
{ "www-8522.am", true },
{ "www-8522.com", true },
{ "www-86499.com", true },
{ "www-8722.com", true },
- { "www-9649.com", true },
{ "www-9822.com", true },
{ "www-pj009.com", true },
{ "www.aclu.org", false },
@@ -39144,7 +40866,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "www.braintreepayments.com", false },
{ "www.calyxinstitute.org", false },
{ "www.capitainetrain.com", false },
- { "www.captaintrain.com", false },
{ "www.cloudflare.com", true },
{ "www.cnet.com", true },
{ "www.dropbox.com", true },
@@ -39156,6 +40877,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "www.facebook.com", false },
{ "www.fastmail.com", true },
{ "www.ft.com", true },
+ { "www.gamesdepartment.co.uk", true },
{ "www.getcloak.com", false },
{ "www.gmail.com", false },
{ "www.googlemail.com", false },
@@ -39192,25 +40914,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "www.twitter.com", false },
{ "www.united.com", true },
{ "www.usaa.com", false },
+ { "www.viasinc.com", false },
{ "www.vino75.com", false },
{ "www.wepay.com", false },
{ "www.wordpress.com", false },
{ "www.zdnet.com", true },
- { "www68277.com", true },
- { "wwww.is", true },
- { "wwww.me.uk", true },
{ "wxcafe.net", true },
{ "wxdisco.com", true },
{ "wxforums.com", true },
{ "wxh.jp", true },
+ { "wxkxsw.com", true },
+ { "wxlog.cn", true },
{ "wxster.com", true },
- { "wxzm.sx", true },
- { "wyam.io", true },
+ { "wyam.io", false },
{ "wybar.uk", true },
- { "wycrow.com", true },
+ { "wycrow.com", false },
{ "wyday.com", true },
{ "wygibanki.pl", true },
{ "wygodnie.pl", true },
+ { "wyhpartnership.co.uk", true },
{ "wynterhill.co.uk", true },
{ "wyo.cam", true },
{ "wypemagazine.se", true },
@@ -39218,9 +40940,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "wyrihaximus.net", true },
{ "wyrimaps.net", true },
{ "wyssmuller.ch", true },
+ { "wyydsb.cn", true },
+ { "wyydsb.com", true },
+ { "wyydsb.xin", true },
+ { "wyysoft.tk", true },
{ "wzfetish.com.br", true },
{ "wzfou.com", true },
+ { "wzrd.in", true },
{ "wzyboy.org", true },
+ { "x-iweb.ru", true },
{ "x-lan.be", true },
{ "x-one.co.jp", true },
{ "x.io", true },
@@ -39231,14 +40959,19 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "x2d2.de", true },
{ "x378.ch", true },
{ "x509.io", true },
+ { "x64architecture.com", true },
{ "x7plus.com", true },
{ "xa.search.yahoo.com", false },
{ "xa1.uk", true },
+ { "xanadu-auto.cz", true },
+ { "xanadu-catering.cz", true },
+ { "xanadu-golf.cz", true },
{ "xanadu-taxi.cz", true },
+ { "xanadu-trans.cz", true },
{ "xanax.pro", false },
{ "xants.de", true },
{ "xatr0z.org", false },
- { "xavier.is", true },
+ { "xawen.net", false },
{ "xbb.hk", true },
{ "xbb.li", true },
{ "xbertschy.com", true },
@@ -39278,17 +41011,18 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xerhost.de", true },
{ "xerkus.pro", true },
{ "xerownia.eu", true },
+ { "xeryus.nl", true },
{ "xetown.com", true },
{ "xf-liam.com", true },
{ "xfce.space", true },
+ { "xfcy.me", true },
{ "xfd3.de", true },
{ "xferion.com", true },
{ "xfix.pw", true },
- { "xfrag-networks.com", true },
{ "xgame.com.tr", true },
{ "xgclan.com", true },
{ "xgn.es", true },
- { "xhadius.de", true },
+ { "xgzepto.cn", true },
{ "xhily.com", true },
{ "xhmikosr.io", true },
{ "xho.me", true },
@@ -39296,6 +41030,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xiamenshipbuilding.com", true },
{ "xiamuzi.com", true },
{ "xiangblog.com", true },
+ { "xiangfajia.cn", true },
{ "xiangweiqing.co.uk", true },
{ "xiangwenquan.me", true },
{ "xianjianruishiyouyiyuan.com", true },
@@ -39307,25 +41042,23 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xiaolong.link", true },
{ "xiaomao.tk", true },
{ "xiaomi.eu", true },
+ { "xiaomionline24.pl", true },
{ "xiaoniaoyou.com", true },
{ "xiaoyu.net", true },
{ "xiaoyy.org", true },
- { "xiazhanjian.com", true },
- { "xice.cf", true },
{ "xichtsbuch.de", true },
{ "xicreative.net", true },
{ "xiecongan.org", true },
{ "xif.at", true },
{ "xight.org", true },
{ "xilef.org", true },
+ { "xiliant.com", false },
{ "xilkoi.net", true },
{ "xilou.org", true },
{ "ximbo.net", true },
- { "xin-in.com", true },
- { "xin-in.net", true },
- { "xing-in.net", true },
{ "xinj.com", true },
{ "xinnixdeuren-shop.be", true },
+ { "xinplay.net", true },
{ "xinuspeed.com", true },
{ "xinuspeedtest.com", true },
{ "xinuurl.com", true },
@@ -39342,29 +41075,36 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xkviz.net", true },
{ "xlan.be", true },
{ "xlange.com", true },
+ { "xliang.co", true },
+ { "xlui.me", true },
{ "xluxes.jp", true },
{ "xmedius.ca", true },
{ "xmedius.com", false },
{ "xmedius.eu", true },
{ "xmenrevolution.com", true },
+ { "xmine128.tk", true },
{ "xmlbeam.org", true },
{ "xmlogin288.com", true },
+ { "xmodule.org", true },
{ "xmpp.dk", true },
{ "xmppwocky.net", true },
{ "xmr.to", true },
{ "xmtpro.com", true },
{ "xmusic.live", true },
- { "xmv.cz", false },
+ { "xmv.cz", true },
{ "xn----7sbfl2alf8a.xn--p1ai", true },
{ "xn----8hcdn2ankm1bfq.com", true },
+ { "xn----8sbjfacqfqshbh7afyeg.xn--80asehdb", true },
{ "xn----zmcaltpp1mdh16i.com", true },
{ "xn--0iv967ab7w.xn--rhqv96g", true },
{ "xn--0kq33cz5c8wmwrqqw1d.com", true },
+ { "xn--158h.ml", true },
{ "xn--24-6kch4bfqee.xn--p1ai", true },
{ "xn--24-glcia8dc.xn--p1ai", true },
{ "xn--48jwg508p.net", true },
{ "xn--4kro7fswi.xn--6qq986b3xl", true },
{ "xn--4pv80kkz8auzf.jp", true },
+ { "xn--57h.ml", true },
{ "xn--5dbkjqb0d.com", true },
{ "xn--5dbkjqb0d.net", true },
{ "xn--6o8h.cf", true },
@@ -39377,6 +41117,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xn--80aejljbfwxn.xn--p1ai", true },
{ "xn--80anogxed.xn--p1ai", true },
{ "xn--80azelb.xn--p1ai", true },
+ { "xn--8bi.gq", true },
{ "xn--8dry00a7se89ay98epsgxxq.com", true },
{ "xn--90accgba6bldkcbb7a.xn--p1acf", true },
{ "xn--allgu-biker-o8a.de", true },
@@ -39397,7 +41138,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xn--detrkl13b9sbv53j.com", true },
{ "xn--detrkl13b9sbv53j.org", true },
{ "xn--die-zahnrzte-ncb.de", true },
- { "xn--dk8haaa.ws", true },
{ "xn--dmonenjger-q5ag.net", true },
{ "xn--dragni-g1a.de", true },
{ "xn--dtursfest-72a.dk", true },
@@ -39405,8 +41145,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xn--ecki0cd0bu9a4nsjb.com", true },
{ "xn--eckle6c0exa0b0modc7054g7h8ajw6f.com", true },
{ "xn--ehq13kgw4e.ml", true },
+ { "xn--ehqw04eq6e.jp", true },
{ "xn--elsignificadodesoar-c4b.com", true },
{ "xn--erklderbarenben-slbh.dk", true },
+ { "xn--et8h.cf", true },
{ "xn--f9jh4f4b4993b66s.tokyo", true },
{ "xn--familie-pppinghaus-l3b.de", true },
{ "xn--feuerlscher-arten-4zb.de", true },
@@ -39415,11 +41157,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xn--fp8h58f.ws", true },
{ "xn--frankierknig-djb.de", true },
{ "xn--fs5ak3f.com", true },
- { "xn--gfrrli-yxa.ch", true },
+ { "xn--gfrr-7qa.li", true },
+ { "xn--heilendehnde-ocb.de", true },
{ "xn--hgbk4a00a.com", true },
{ "xn--hllrigl-90a.at", true },
{ "xn--i2ru8q2qg.com", true },
{ "xn--imker-in-nrnberg-szb.de", true },
+ { "xn--irr.xn--fiqs8s", true },
+ { "xn--is8h6d.gq", true },
{ "xn--j4h.cf", true },
{ "xn--jbs-tna.de", true },
{ "xn--jda.tk", true },
@@ -39454,6 +41199,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xn--nrrdetval-v2ab.se", true },
{ "xn--o38h.tk", true },
{ "xn--obt757c.com", true },
+ { "xn--oiqt18e8e2a.eu.org", true },
{ "xn--p8j9a0d9c9a.xn--q9jyb4c", true },
{ "xn--pbt947am3ab71g.com", true },
{ "xn--pe-bka.ee", true },
@@ -39472,10 +41218,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xn--rtter-kva.eu", true },
{ "xn--ruanmller-u9a.com", true },
{ "xn--s-1gaa.fi", true },
+ { "xn--schlerzeitung-ideenlos-ulc.de", true },
+ { "xn--schsischer-christstollen-qbc.shop", true },
{ "xn--seelenwchter-mcb.eu", true },
{ "xn--spenijmazania-yhc.pl", true },
+ { "xn--sz8h.ml", true },
{ "xn--t-oha.lv", true },
{ "xn--t8j4aa4nkg1h9bwcvud.com", true },
+ { "xn--t8j4aa4nyhxa7duezbl49aqg5546e264d.net", true },
{ "xn--t8j4aa4nzg3a5euoxcwee.xyz", true },
{ "xn--tigreray-i1a.org", true },
{ "xn--trdler-xxa.xyz", true },
@@ -39485,6 +41235,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xn--v-wfa35g.ro", true },
{ "xn--v6q426ishax2a.xyz", true },
{ "xn--woistdermlleimer-rzb.de", true },
+ { "xn--wq9h.ml", true },
{ "xn--y-5ga.com", true },
{ "xn--y8j148r.xn--q9jyb4c", true },
{ "xn--y8ja6lb.xn--q9jyb4c", true },
@@ -39500,7 +41251,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xninja.xyz", true },
{ "xnode.org", true },
{ "xntrik.wtf", true },
- { "xnu.kr", true },
{ "xo.tc", true },
{ "xo7.ovh", true },
{ "xolphin.nl", true },
@@ -39509,8 +41259,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xone.cz", true },
{ "xonn.de", true },
{ "xoonth.net", true },
+ { "xp-ochrona.pl", true },
{ "xp2.de", true },
- { "xpbytes.com", true },
{ "xpd.se", true },
{ "xperidia.com", true },
{ "xpletus.nl", true },
@@ -39519,12 +41269,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xposedornot.com", true },
{ "xps2pdf.co.uk", true },
{ "xps2pdf.info", true },
+ { "xq55.com", false },
{ "xqk7.com", true },
{ "xr.cx", true },
+ { "xr1s.me", true },
{ "xrg.cz", true },
- { "xrippedhd.com", true },
{ "xrockx.de", true },
- { "xroot.org", false },
{ "xrwracing-france.com", true },
{ "xs2a.no", true },
{ "xs74.com", true },
@@ -39535,7 +41285,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xserownia.pl", true },
{ "xsmobile.de", true },
{ "xss.ht", true },
+ { "xss.name", true },
{ "xss.sk", true },
+ { "xsteam.eu", true },
+ { "xsuper.net", true },
{ "xsz.jp", true },
{ "xtarget.ru", true },
{ "xtips.us", true },
@@ -39543,25 +41296,25 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xtom.chat", true },
{ "xtom.com", true },
{ "xtom.com.hk", true },
- { "xtom.io", true },
{ "xtom.wiki", true },
{ "xtrainsights.com", true },
{ "xtremebouncepartyhire.com.au", true },
- { "xtremegaming.it", true },
+ { "xtrememidlife.nl", true },
{ "xtronics.com", true },
{ "xts.bike", true },
{ "xts3636.net", true },
{ "xtu2.com", true },
{ "xuab.net", true },
- { "xuan-li88.com", true },
- { "xuan-li88.net", true },
{ "xuanmeishe.net", true },
{ "xubo666.com", true },
{ "xuc.me", true },
{ "xuedianshang.com", true },
+ { "xuehao.net.cn", true },
+ { "xuehuang666.cn", true },
{ "xuming.studio", true },
{ "xunn.io", true },
{ "xuntier.ch", true },
+ { "xuyh0120.win", true },
{ "xviimusic.com", true },
{ "xvt-blog.tk", true },
{ "xwalck.se", true },
@@ -39570,28 +41323,27 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "xxffo.com", true },
{ "xxiz.com", true },
{ "xxxlbox.com", true },
+ { "xxxsuper.net", true },
{ "xyenon.bid", true },
{ "xyfun.net", false },
{ "xyngular-health.com", true },
{ "xywing.com", true },
- { "xyyp.mn", true },
{ "xyzulu.hosting", true },
{ "xza.fr", true },
{ "xzclip.cn", true },
{ "xzy.es", true },
- { "xzy.one", true },
{ "y11n.net", true },
{ "yabuisha.jp", true },
{ "yachigoya.com", true },
{ "yacineboumaza.fr", true },
{ "yacobo.com", true },
{ "yado-furu.com", true },
- { "yafull.com", true },
{ "yafuoku.ru", true },
{ "yagihiro.tech", true },
{ "yahan.tv", true },
{ "yaharu.ru", true },
{ "yahvehyireh.com", true },
+ { "yak-soap.co", true },
{ "yak.is", true },
{ "yakmade.com", true },
{ "yakmoo.se", true },
@@ -39609,20 +41361,21 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yanbao.xyz", true },
{ "yandere.moe", true },
{ "yangjingwen.cn", true },
+ { "yangmaodang.org", true },
{ "yanngraf.ch", true },
{ "yanngraf.com", true },
{ "yannic.world", true },
{ "yannick.cloud", true },
{ "yannik-buerkle.de", true },
{ "yannikbloscheck.com", true },
- { "yannikhenke.de", true },
{ "yanovich.net", true },
{ "yanqiyu.info", true },
+ { "yans.io", true },
{ "yantrasthal.com", true },
- { "yao-in.com", true },
- { "yao-in.net", true },
+ { "yanuwa.com", true },
{ "yapbreak.fr", true },
{ "yarcom.ru", false },
+ { "yarogneva.ru", true },
{ "yarravilletownhouses.com.au", true },
{ "yaru.one", true },
{ "yassine-ayari.com", true },
@@ -39634,6 +41387,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yawen.me", true },
{ "yawnbox.com", true },
{ "yaxim.org", true },
+ { "yayart.club", true },
{ "yazaral.com", true },
{ "ybin.me", true },
{ "ybresson.com", true },
@@ -39649,10 +41403,13 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ycnrg.org", true },
{ "yd.io", true },
{ "yeapdata.com", true },
+ { "yecl.net", true },
{ "yeesker.com", true },
{ "yell.ml", true },
+ { "yellotalk.co", true },
{ "yellowfly.co.uk", true },
{ "yellowpages.ee", true },
+ { "yellowtaillasvegas.com", true },
{ "yellowtree.co.za", true },
{ "yelon.hu", true },
{ "yelp.at", true },
@@ -39689,10 +41446,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yelp.se", true },
{ "yemektarifleri.com", true },
{ "yenibilgi.net", true },
+ { "yenpape.com", true },
{ "yep-pro.ch", true },
{ "yephy.com", true },
{ "yeshu.org", true },
{ "yesiammaisey.me", true },
+ { "yeskx.com", true },
{ "yeswecan.co.bw", true },
{ "yeswehack.com", true },
{ "yetanalytics.io", true },
@@ -39708,14 +41467,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yhaupenthal.org", true },
{ "yhb.io", true },
{ "yhe.me", true },
+ { "yhenke.de", true },
{ "yhfou.com", true },
{ "yhndnzj.com", true },
{ "yhong.me", true },
- { "yhwj.top", false },
+ { "yhrd.org", true },
{ "yicknam.my", true },
- { "yiffy.tips", false },
- { "yiffy.zone", false },
- { "yigujin.cn", true },
{ "yiheng.moe", true },
{ "yii2.cc", true },
{ "yikeyong.com", true },
@@ -39727,7 +41484,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yiyuanzhong.com", true },
{ "yiyueread.com", true },
{ "yiz96.com", true },
+ { "yjsoft.me", true },
{ "yjsw.sh.cn", true },
+ { "ykhut.com", true },
{ "yksityisyydensuoja.fi", true },
{ "ylde.de", true },
{ "ylinternal.com", true },
@@ -39737,6 +41496,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "ymtsonline.org", true },
{ "ynnovasport.be", true },
{ "ynxfh.cn", true },
+ { "yoa.st", true },
{ "yoast.com", true },
{ "yobai-grouprec.jp", true },
{ "yobai28.com", true },
@@ -39755,6 +41515,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yogananda-roma.org", true },
{ "yogaschoolrishikesh.com", true },
{ "yoibyoin.info", true },
+ { "yoimise.net", true },
{ "yoitoko.city", true },
{ "yoitsu.moe", true },
{ "yokohama-legaloffice.jp", true },
@@ -39764,7 +41525,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yolops.net", true },
{ "yombo.net", true },
{ "yon.co.il", true },
- { "yongbin.org", true },
{ "yoonas.com", true },
{ "yooooex.com", true },
{ "yoppoy.com", true },
@@ -39779,6 +41539,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yosheenetwork.fr", true },
{ "yoshibaworks.com", true },
{ "yoshitsugu.net", true },
+ { "yosida-dental.com", true },
{ "yospos.org", true },
{ "yoticonnections.com", true },
{ "yotilab.com", true },
@@ -39788,6 +41549,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "you2you.fr", true },
{ "youareme.ca", true },
{ "youc.ir", true },
+ { "youcanfuckoff.xyz", true },
{ "youcanmakeit.at", true },
{ "youcruit.com", true },
{ "youdungoofd.com", true },
@@ -39804,24 +41566,30 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "youlovehers.com", true },
{ "youms.de", true },
{ "young-sheldon.com", true },
+ { "youngauthentic.cf", true },
{ "youngdogs.org", true },
{ "youngfree.cn", true },
{ "youngpeopleunited.co.uk", true },
{ "youngsook.com", true },
{ "youngsook.org", true },
{ "youpark.no", true },
+ { "youpickfarms.org", true },
{ "your-erotic-stories.com", true },
{ "your-out.com", true },
+ { "your-waterserver.com", true },
{ "youracnepro.com", true },
+ { "youran.me", true },
{ "yourbonus.click", true },
{ "yourciso.com", true },
{ "yourcomputer.expert", true },
{ "yourcopywriter.it", true },
{ "yourforex.org", true },
{ "yourfriendlytech.com", true },
+ { "yourfuntrivia.com", true },
{ "yourfuturestrategy.com.au", true },
+ { "yourgadget.ro", true },
{ "yourgames.tv", true },
- { "yourhair.net", true },
+ { "yourlanguages.de", true },
{ "yourmemorykeeper.co.uk", true },
{ "yourneighborhub.com", true },
{ "yourskin.nl", true },
@@ -39831,6 +41599,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yout.com", true },
{ "youth.gov", true },
{ "youthovation.org", true },
+ { "youthrules.gov", true },
{ "youtous.me", true },
{ "youtsuu-raku.com", true },
{ "youtube.com", true },
@@ -39840,21 +41609,16 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yoyoost.duckdns.org", true },
{ "ypart.eu", true },
{ "ypid.de", true },
- { "ypiresia.fr", false },
{ "yplanapp.com", true },
{ "yqjf68.com", true },
{ "yr166166.com", true },
{ "yrjanheikki.com", true },
{ "ys-shop.biz", true },
{ "ysicorp.com", true },
+ { "yslbeauty.com", true },
{ "yspeo.biz", true },
{ "ysun.xyz", true },
- { "ysx.me.uk", true },
- { "ytb.zone", true },
- { "ytbmp3.com", true },
- { "ytbmp4.com", true },
{ "ytec.ca", true },
- { "ytpak.com", true },
{ "ytpak.pk", true },
{ "ytreza.fr", true },
{ "ytuquelees.net", true },
@@ -39922,10 +41686,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yue.la", true },
{ "yue2.net", true },
{ "yugasun.com", true },
- { "yukari.cafe", true },
+ { "yuisyo.ml", true },
+ { "yukari.cafe", false },
{ "yukari.cloud", true },
{ "yuki-nagato.com", true },
- { "yuki-portfolio.com", true },
{ "yuki.xyz", true },
{ "yukonconnector.com", true },
{ "yukonlip.com", true },
@@ -39945,8 +41709,9 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yutakato.net", true },
{ "yutang.vn", true },
{ "yutangyun.com", true },
- { "yutaron.tokyo", true },
{ "yutuo.net", true },
+ { "yuuki0xff.jp", true },
+ { "yuuta.moe", true },
{ "yuwei.org", true },
{ "yuweiyang.xyz", true },
{ "yuxuan.org", true },
@@ -39960,12 +41725,10 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yvonnehaeusser.de", true },
{ "yvonnethomet.ch", true },
{ "yvonnewilhelmi.com", true },
+ { "yxs.me", true },
{ "yxt521.com", true },
- { "yya.bid", true },
{ "yya.me", true },
- { "yya.men", true },
{ "yyc.city", true },
- { "yyrss.com", false },
{ "yyyy.xyz", true },
{ "yzal.io", true },
{ "yzcloud.me", true },
@@ -39973,7 +41736,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "yzimroni.net", true },
{ "z-konzept-nutrition.ru", true },
{ "z-latko.info", true },
- { "z-to-a.com", true },
{ "z-vector.com", true },
{ "z.ai", true },
{ "z1h.de", true },
@@ -39981,7 +41743,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "z99944x.xyz", true },
{ "za.search.yahoo.com", false },
{ "zaagbaak.nl", true },
- { "zaalleatherwear.nl", false },
+ { "zabavno.mk", true },
{ "zabbix.tips", true },
{ "zabszk.net", true },
{ "zabukovnik.net", true },
@@ -39999,10 +41761,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zachaysan.com", true },
{ "zachborboa.com", true },
{ "zachgibbens.org", true },
- { "zachpeters.org", true },
{ "zachschneider.ca", true },
{ "zaclys.com", false },
- { "zadroweb.com", true },
{ "zafirus.name", true },
{ "zaghyr.org", true },
{ "zahe.me", true },
@@ -40017,10 +41777,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zakladam.cz", true },
{ "zakmccrac.de", true },
{ "zakojifarm.jp", true },
+ { "zakoncontrol.com", false },
{ "zakr.es", true },
{ "zakspartiesandevents.com", true },
{ "zalamea.ph", true },
{ "zaloghaz.ro", true },
+ { "zalohovaniburian.cz", true },
{ "zalvus.com", true },
{ "zamalektoday.com", true },
{ "zamocosmeticos.com.br", true },
@@ -40034,10 +41796,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zapmaster14.com", true },
{ "zappbuildapps.com", false },
{ "zappos.com", true },
- { "zaptan.info", false },
- { "zaptan.net", false },
- { "zaptan.org", false },
- { "zaptan.us", false },
{ "zarabiaj.com", true },
{ "zaratan.fr", true },
{ "zargescases.co.uk", true },
@@ -40047,10 +41805,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zatsepin.by", true },
{ "zaufanatrzeciastrona.pl", true },
{ "zavec.com.ec", true },
+ { "zavedu.org", true },
{ "zavetaji.lv", true },
{ "zawo-electric.de", true },
{ "zayna.eu", true },
- { "zberger.com", true },
+ { "zbanks.cn", true },
{ "zbetcheck.in", true },
{ "zbrane-doplnky.cz", true },
{ "zbut.bg", true },
@@ -40068,8 +41827,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zdrave-konzultace.cz", true },
{ "zdravekonzultace.cz", true },
{ "zdravesteny.cz", true },
+ { "zdravystul.cz", true },
{ "zdrojak.cz", true },
+ { "zdymak.by", true },
{ "ze3kr.com", true },
+ { "zeal-and.jp", true },
{ "zeal-interior.com", true },
{ "zealworks.jp", true },
{ "zebbra.ro", true },
@@ -40079,9 +41841,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zeebrieshoekvanholland.nl", true },
{ "zeel.com", true },
{ "zeelynk.com", true },
+ { "zeestraten.nl", true },
{ "zeetoppers.nl", true },
{ "zeeuw.nl", true },
{ "zeguigui.com", true },
+ { "zeibekiko-souvlaki.gr", true },
{ "zeilenmethans.nl", true },
{ "zeilles.nu", true },
{ "zeitoununiversity.org", true },
@@ -40096,9 +41860,8 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zena.cx", false },
{ "zenchain.com", true },
{ "zenevents.ro", true },
- { "zenfusion.fr", true },
{ "zengdong.ren", true },
- { "zenics.co.uk", true },
+ { "zenghx.tk", false },
{ "zenithmedia.ca", true },
{ "zenk-security.com", true },
{ "zenlogic.com", true },
@@ -40106,9 +41869,11 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zennzimie.be", true },
{ "zennzimie.com", true },
{ "zenofa.co.id", true },
+ { "zenram.com", true },
{ "zentask.io", true },
{ "zenti.cloud", true },
{ "zenvideocloud.com", true },
+ { "zeparadox.com", true },
{ "zephyrbk.com", true },
{ "zephyrbookkeeping.com", true },
{ "zephyretcoraline.com", true },
@@ -40118,6 +41883,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zerg.uk", true },
{ "zerobounce.net", true },
{ "zerofy.de", true },
+ { "zerolab.org", true },
{ "zeronet.io", true },
{ "zeropoint.bg", true },
{ "zeropush.com", true },
@@ -40153,6 +41919,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zhangyuhao.com", true },
{ "zhangzifan.com", false },
{ "zhaofeng.li", true },
+ { "zhaopage.com", true },
{ "zhaoxixiangban.cc", true },
{ "zhcexo.com", true },
{ "zhen-chen.com", true },
@@ -40162,14 +41929,15 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zhiku8.com", true },
{ "zhima.io", true },
{ "zhitanska.com", true },
- { "zhiwei.me", true },
{ "zhl123.com", true },
+ { "zhome.info", true },
{ "zhongzicili.ws", true },
- { "zhoushuo.me", true },
+ { "zhoushuo.me", false },
{ "zhoutiancai.cn", true },
{ "zhovner.com", true },
{ "zhthings.com", true },
{ "zhuihoude.com", true },
+ { "zi.is", true },
{ "ziegler-family.com", true },
{ "ziegler-heizung-frankfurt.de", true },
{ "zielonakarta.com", true },
@@ -40194,6 +41962,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zingjerijk.nl", true },
{ "zings.eu", true },
{ "zinniamay.com", true },
+ { "zinnowitzer-ferienwohnung.de", true },
{ "zinoui.com", true },
{ "ziondrive.com.br", true },
{ "zionnationalpark.net", true },
@@ -40202,22 +41971,34 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zipkey.de", true },
{ "ziptie.com", true },
{ "zircode.com", true },
+ { "zirka24.net", true },
{ "ziroh.be", true },
{ "zirtek.ie", true },
{ "zirtual.com", true },
- { "zitrone44.de", true },
{ "zitseng.com", true },
{ "zittingskalender.be", true },
{ "zivava.ge", true },
{ "zivmergers.com", true },
- { "zivyruzenec.cz", false },
+ { "zivver.be", true },
+ { "zivver.com", true },
+ { "zivver.de", true },
+ { "zivver.eu", true },
+ { "zivver.info", true },
+ { "zivver.nl", true },
+ { "zivver.uk", true },
+ { "zivyruzenec.cz", true },
{ "zixiao.wang", true },
+ { "zjuqsc.com", true },
{ "zjv.me", true },
+ { "zjyifa.cn", true },
+ { "zk.com.co", true },
{ "zk.gd", true },
{ "zk9.nl", true },
{ "zkontrolujsiauto.cz", true },
{ "zkrypt.cc", true },
{ "zkzone.net", true },
+ { "zl0iu.com", true },
+ { "zl8862.com", true },
{ "zlatakus.cz", true },
{ "zlatosnadno.cz", true },
{ "zlaty-tyden.cz", true },
@@ -40249,10 +42030,12 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zofrex.com", true },
{ "zohar.wang", true },
{ "zoigl.club", true },
+ { "zoisfinefood.com", true },
{ "zojadravai.com", true },
{ "zoki.art", true },
{ "zollihood.ch", true },
{ "zolokar.xyz", true },
+ { "zom.bi", true },
{ "zomerschoen.nl", true },
{ "zonadigital.co", true },
{ "zone-produkte.de", false },
@@ -40276,6 +42059,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zoomseoservices.com", false },
{ "zooom.azurewebsites.net", true },
{ "zooom2.azurewebsites.net", true },
+ { "zoop.ml", true },
{ "zooparadies.eu", true },
{ "zooplankton.no", true },
{ "zootime.net", true },
@@ -40286,15 +42070,17 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zor.com", true },
{ "zorasvobodova.cz", true },
{ "zorgclustertool.nl", true },
- { "zorig.ch", true },
+ { "zorig.ch", false },
{ "zorium.org", true },
{ "zorntt.fr", true },
{ "zotero.org", true },
{ "zouk.info", true },
+ { "zouyaoji.top", true },
{ "zozo.com", true },
{ "zozzle.co.uk", true },
+ { "zp25.ninja", true },
+ { "zqstudio.top", true },
{ "zravypapir.cz", true },
- { "zrhdwz.cn", true },
{ "zrniecka-pre-sny.sk", true },
{ "zrnieckapresny.sk", true },
{ "zrt.io", true },
@@ -40308,9 +42094,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zsrbcs.com", true },
{ "zten.org", true },
{ "ztjuh.tk", true },
- { "zuan-in.net", true },
{ "zubel.it", false },
- { "zubora.co", true },
{ "zuefle.net", true },
{ "zug-anwalt.de", true },
{ "zug.fr", true },
@@ -40332,7 +42116,6 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zuppy.pm", true },
{ "zuralski.net", true },
{ "zurgl.com", false },
- { "zurickrelogios.com.br", true },
{ "zurret.de", true },
{ "zusjesvandenbos.nl", true },
{ "zuzumba.es", true },
@@ -40348,13 +42131,14 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zwy.ch", true },
{ "zx6rninja.de", true },
{ "zx7r.de", true },
- { "zxavier.com", true },
{ "zxe.com.br", true },
{ "zxtcode.com", true },
{ "zy.md", true },
{ "zybbo.com", true },
{ "zyciedlazwierzat.pl", true },
{ "zyciedogorynogami.pl", true },
+ { "zydronium.com", true },
+ { "zydronium.nl", true },
{ "zyger.co.za", true },
{ "zylai.com", true },
{ "zymmm.com", true },
@@ -40363,6 +42147,7 @@ static const nsSTSPreload kSTSPreloadList[] = {
{ "zyrillezuno.com", true },
{ "zyul.ddns.net", true },
{ "zyzardx.com", true },
+ { "zyzsdy.com", true },
{ "zzekj.net", true },
{ "zzpd.nl", false },
{ "zzsec.org", true },
diff --git a/security/manager/ssl/tests/gtest/DeserializeCertTest.cpp b/security/manager/ssl/tests/gtest/DeserializeCertTest.cpp
index 868ecdba6..c3a4115b3 100644
--- a/security/manager/ssl/tests/gtest/DeserializeCertTest.cpp
+++ b/security/manager/ssl/tests/gtest/DeserializeCertTest.cpp
@@ -94,3 +94,89 @@ TEST(psm_DeserializeCert, gecko46)
ASSERT_EQ(NS_OK, rv);
ASSERT_TRUE(cert);
}
+
+TEST(psm_DeserializeCert, gecko64)
+{
+ // This is a serialization of nsITransportSecurityInfo from gecko 64.
+ // Deserialization should fail.
+ nsCString base64Serialization(
+ "FnhllAKWRHGAlo+ESXykKAAAAAAAAAAAwAAAAAAAAEaphjojH6pBabDSgSnsfLHeAAgAAgAAAAAAAAAAAAAAAAAAAAEAMQFmCjIm"
+ "kVxP+7sgiYWmMt8FvcOXmlQiTNWFiWlrbpbqgwAAAAAAAAX2MIIF8jCCBNqgAwIBAgIQDmTF+8I2reFLFyrrQceMsDANBgkqhkiG"
+ "9w0BAQsFADBwMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMS8w"
+ "LQYDVQQDEyZEaWdpQ2VydCBTSEEyIEhpZ2ggQXNzdXJhbmNlIFNlcnZlciBDQTAeFw0xNTExMDMwMDAwMDBaFw0xODExMjgxMjAw"
+ "MDBaMIGlMQswCQYDVQQGEwJVUzETMBEGA1UECBMKQ2FsaWZvcm5pYTEUMBIGA1UEBxMLTG9zIEFuZ2VsZXMxPDA6BgNVBAoTM0lu"
+ "dGVybmV0IENvcnBvcmF0aW9uIGZvciBBc3NpZ25lZCBOYW1lcyBhbmQgTnVtYmVyczETMBEGA1UECxMKVGVjaG5vbG9neTEYMBYG"
+ "A1UEAxMPd3d3LmV4YW1wbGUub3JnMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAs0CWL2FjPiXBl61lRfvvE0KzLJmG"
+ "9LWAC3bcBjgsH6NiVVo2dt6uXfzi5bTm7F3K7srfUBYkLO78mraM9qizrHoIeyofrV/n+pZZJauQsPjCPxMEJnRoD8Z4KpWKX0Ly"
+ "Du1SputoI4nlQ/htEhtiQnuoBfNZxF7WxcxGwEsZuS1KcXIkHl5VRJOreKFHTaXcB1qcZ/QRaBIv0yhxvK1yBTwWddT4cli6GfHc"
+ "Ce3xGMaSL328Fgs3jYrvG29PueB6VJi/tbbPu6qTfwp/H1brqdjh29U52Bhb0fJkM9DWxCP/Cattcc7az8EXnCO+LK8vkhw/kAiJ"
+ "WPKx4RBvgy73nwIDAQABo4ICUDCCAkwwHwYDVR0jBBgwFoAUUWj/kK8CB3U8zNllZGKiErhZcjswHQYDVR0OBBYEFKZPYB4fLdHn"
+ "8SOgKpUW5Oia6m5IMIGBBgNVHREEejB4gg93d3cuZXhhbXBsZS5vcmeCC2V4YW1wbGUuY29tggtleGFtcGxlLmVkdYILZXhhbXBs"
+ "ZS5uZXSCC2V4YW1wbGUub3Jngg93d3cuZXhhbXBsZS5jb22CD3d3dy5leGFtcGxlLmVkdYIPd3d3LmV4YW1wbGUubmV0MA4GA1Ud"
+ "DwEB/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYBBQUHAwIwdQYDVR0fBG4wbDA0oDKgMIYuaHR0cDovL2NybDMuZGln"
+ "aWNlcnQuY29tL3NoYTItaGEtc2VydmVyLWc0LmNybDA0oDKgMIYuaHR0cDovL2NybDQuZGlnaWNlcnQuY29tL3NoYTItaGEtc2Vy"
+ "dmVyLWc0LmNybDBMBgNVHSAERTBDMDcGCWCGSAGG/WwBATAqMCgGCCsGAQUFBwIBFhxodHRwczovL3d3dy5kaWdpY2VydC5jb20v"
+ "Q1BTMAgGBmeBDAECAjCBgwYIKwYBBQUHAQEEdzB1MCQGCCsGAQUFBzABhhhodHRwOi8vb2NzcC5kaWdpY2VydC5jb20wTQYIKwYB"
+ "BQUHMAKGQWh0dHA6Ly9jYWNlcnRzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydFNIQTJIaWdoQXNzdXJhbmNlU2VydmVyQ0EuY3J0MAwG"
+ "A1UdEwEB/wQCMAAwDQYJKoZIhvcNAQELBQADggEBAISomhGn2L0LJn5SJHuyVZ3qMIlRCIdvqe0Q6ls+C8ctRwRO3UU3x8q8OH+2"
+ "ahxlQmpzdC5al4XQzJLiLjiJ2Q1p+hub8MFiMmVPPZjb2tZm2ipWVuMRM+zgpRVM6nVJ9F3vFfUSHOb4/JsEIUvPY+d8/Krc+kPQ"
+ "wLvyieqRbcuFjmqfyPmUv1U9QoI4TQikpw7TZU0zYZANP4C/gj4Ry48/znmUaRvy2kvIl7gRQ21qJTK5suoiYoYNo3J9T+pXPGU7"
+ "Lydz/HwW+w0DpArtAaukI8aNX4ohFUKSwDSiIIWIWJiJGbEeIO0TIFwEVWTOnbNl/faPXpk5IRXicapqiILALwADAAAAAAEBAAAA"
+ "AAAABFAyNTYAAAAQUlNBLVBLQ1MxLVNIQTUxMgGVn7FlZRdIf6ub2JE75TGXrnTNpc0vRz+W9fC3//YsaAAAAANmCjImkVxP+7sg"
+ "iYWmMt8FvcOXmlQiTNWFiWlrbpbqgwAAAAAAAAX2MIIF8jCCBNqgAwIBAgIQDmTF+8I2reFLFyrrQceMsDANBgkqhkiG9w0BAQsF"
+ "ADBwMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMS8wLQYDVQQD"
+ "EyZEaWdpQ2VydCBTSEEyIEhpZ2ggQXNzdXJhbmNlIFNlcnZlciBDQTAeFw0xNTExMDMwMDAwMDBaFw0xODExMjgxMjAwMDBaMIGl"
+ "MQswCQYDVQQGEwJVUzETMBEGA1UECBMKQ2FsaWZvcm5pYTEUMBIGA1UEBxMLTG9zIEFuZ2VsZXMxPDA6BgNVBAoTM0ludGVybmV0"
+ "IENvcnBvcmF0aW9uIGZvciBBc3NpZ25lZCBOYW1lcyBhbmQgTnVtYmVyczETMBEGA1UECxMKVGVjaG5vbG9neTEYMBYGA1UEAxMP"
+ "d3d3LmV4YW1wbGUub3JnMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAs0CWL2FjPiXBl61lRfvvE0KzLJmG9LWAC3bc"
+ "BjgsH6NiVVo2dt6uXfzi5bTm7F3K7srfUBYkLO78mraM9qizrHoIeyofrV/n+pZZJauQsPjCPxMEJnRoD8Z4KpWKX0LyDu1Sputo"
+ "I4nlQ/htEhtiQnuoBfNZxF7WxcxGwEsZuS1KcXIkHl5VRJOreKFHTaXcB1qcZ/QRaBIv0yhxvK1yBTwWddT4cli6GfHcCe3xGMaS"
+ "L328Fgs3jYrvG29PueB6VJi/tbbPu6qTfwp/H1brqdjh29U52Bhb0fJkM9DWxCP/Cattcc7az8EXnCO+LK8vkhw/kAiJWPKx4RBv"
+ "gy73nwIDAQABo4ICUDCCAkwwHwYDVR0jBBgwFoAUUWj/kK8CB3U8zNllZGKiErhZcjswHQYDVR0OBBYEFKZPYB4fLdHn8SOgKpUW"
+ "5Oia6m5IMIGBBgNVHREEejB4gg93d3cuZXhhbXBsZS5vcmeCC2V4YW1wbGUuY29tggtleGFtcGxlLmVkdYILZXhhbXBsZS5uZXSC"
+ "C2V4YW1wbGUub3Jngg93d3cuZXhhbXBsZS5jb22CD3d3dy5leGFtcGxlLmVkdYIPd3d3LmV4YW1wbGUubmV0MA4GA1UdDwEB/wQE"
+ "AwIFoDAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYBBQUHAwIwdQYDVR0fBG4wbDA0oDKgMIYuaHR0cDovL2NybDMuZGlnaWNlcnQu"
+ "Y29tL3NoYTItaGEtc2VydmVyLWc0LmNybDA0oDKgMIYuaHR0cDovL2NybDQuZGlnaWNlcnQuY29tL3NoYTItaGEtc2VydmVyLWc0"
+ "LmNybDBMBgNVHSAERTBDMDcGCWCGSAGG/WwBATAqMCgGCCsGAQUFBwIBFhxodHRwczovL3d3dy5kaWdpY2VydC5jb20vQ1BTMAgG"
+ "BmeBDAECAjCBgwYIKwYBBQUHAQEEdzB1MCQGCCsGAQUFBzABhhhodHRwOi8vb2NzcC5kaWdpY2VydC5jb20wTQYIKwYBBQUHMAKG"
+ "QWh0dHA6Ly9jYWNlcnRzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydFNIQTJIaWdoQXNzdXJhbmNlU2VydmVyQ0EuY3J0MAwGA1UdEwEB"
+ "/wQCMAAwDQYJKoZIhvcNAQELBQADggEBAISomhGn2L0LJn5SJHuyVZ3qMIlRCIdvqe0Q6ls+C8ctRwRO3UU3x8q8OH+2ahxlQmpz"
+ "dC5al4XQzJLiLjiJ2Q1p+hub8MFiMmVPPZjb2tZm2ipWVuMRM+zgpRVM6nVJ9F3vFfUSHOb4/JsEIUvPY+d8/Krc+kPQwLvyieqR"
+ "bcuFjmqfyPmUv1U9QoI4TQikpw7TZU0zYZANP4C/gj4Ry48/znmUaRvy2kvIl7gRQ21qJTK5suoiYoYNo3J9T+pXPGU7Lydz/HwW"
+ "+w0DpArtAaukI8aNX4ohFUKSwDSiIIWIWJiJGbEeIO0TIFwEVWTOnbNl/faPXpk5IRXicapqiIJmCjImkVxP+7sgiYWmMt8FvcOX"
+ "mlQiTNWFiWlrbpbqgwAAAAAAAAS1MIIEsTCCA5mgAwIBAgIQBOHnpNxc8vNtwCtCuF0VnzANBgkqhkiG9w0BAQsFADBsMQswCQYD"
+ "VQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMSswKQYDVQQDEyJEaWdpQ2Vy"
+ "dCBIaWdoIEFzc3VyYW5jZSBFViBSb290IENBMB4XDTEzMTAyMjEyMDAwMFoXDTI4MTAyMjEyMDAwMFowcDELMAkGA1UEBhMCVVMx"
+ "FTATBgNVBAoTDERpZ2lDZXJ0IEluYzEZMBcGA1UECxMQd3d3LmRpZ2ljZXJ0LmNvbTEvMC0GA1UEAxMmRGlnaUNlcnQgU0hBMiBI"
+ "aWdoIEFzc3VyYW5jZSBTZXJ2ZXIgQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC24C/CJAbIbQRf1+8KZAayfSIm"
+ "ZRauQkCbztyfn3YHPsMwVYcZuU+UDlqUH1VWtMICKq/QmO4LQNfE0DtyyBSe75CxEamu0si4QzrZCwvV1ZX1QK/IHe1NnF9Xt4ZQ"
+ "aJn1itrSxwUfqJfJ3KSxgoQtxq2lnMcZgqaFD15EWCo3j/018QsIJzJa9buLnqS9UdAn4t07QjOjBSjEuyjMmqwrIw14xnvmXnG3"
+ "Sj4I+4G3FhahnSMSTeXXkgisdaScus0Xsh5ENWV/UyU50RwKmmMbGZJ0aAo3wsJSSMs5WqK24V3B3aAguCGikyZvFEohQcftbZvy"
+ "SC/zA/WiaJJTL17jAgMBAAGjggFJMIIBRTASBgNVHRMBAf8ECDAGAQH/AgEAMA4GA1UdDwEB/wQEAwIBhjAdBgNVHSUEFjAUBggr"
+ "BgEFBQcDAQYIKwYBBQUHAwIwNAYIKwYBBQUHAQEEKDAmMCQGCCsGAQUFBzABhhhodHRwOi8vb2NzcC5kaWdpY2VydC5jb20wSwYD"
+ "VR0fBEQwQjBAoD6gPIY6aHR0cDovL2NybDQuZGlnaWNlcnQuY29tL0RpZ2lDZXJ0SGlnaEFzc3VyYW5jZUVWUm9vdENBLmNybDA9"
+ "BgNVHSAENjA0MDIGBFUdIAAwKjAoBggrBgEFBQcCARYcaHR0cHM6Ly93d3cuZGlnaWNlcnQuY29tL0NQUzAdBgNVHQ4EFgQUUWj/"
+ "kK8CB3U8zNllZGKiErhZcjswHwYDVR0jBBgwFoAUsT7DaQP4v0cB1JgmGggC72NkK8MwDQYJKoZIhvcNAQELBQADggEBABiKlYkD"
+ "5m3fXPwdaOpKj4PWUS+Na0QWnqxj9dJubISZi6qBcYRb7TROsLd5kinMLYBq8I4g4Xmk/gNHE+r1hspZcX30BJZr01lYPf7TMSVc"
+ "GDiEo+afgv2MW5gxTs14nhr9hctJqvIni5ly/D6q1UEL2tU2ob8cbkdJf17ZSHwD2f2LSaCYJkJA69aSEaRkCldUxPUd1gJea6zu"
+ "xICaEnL6VpPX/78whQYwvwt/Tv9XBZ0k7YXDK/umdaisLRbvfXknsuvCnQsH6qqF0wGjIChBWUMo0oHjqvbsezt3tkBigAVBRQHv"
+ "FwY+3sAzm2fTYS5yh+Rp/BIAV0AecPUeybRmCjImkVxP+7sgiYWmMt8FvcOXmlQiTNWFiWlrbpbqgwAAAAAAAAPJMIIDxTCCAq2g"
+ "AwIBAgIQAqxcJmoLQJuPC3nyrkYldzANBgkqhkiG9w0BAQUFADBsMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5j"
+ "MRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMSswKQYDVQQDEyJEaWdpQ2VydCBIaWdoIEFzc3VyYW5jZSBFViBSb290IENBMB4X"
+ "DTA2MTExMDAwMDAwMFoXDTMxMTExMDAwMDAwMFowbDELMAkGA1UEBhMCVVMxFTATBgNVBAoTDERpZ2lDZXJ0IEluYzEZMBcGA1UE"
+ "CxMQd3d3LmRpZ2ljZXJ0LmNvbTErMCkGA1UEAxMiRGlnaUNlcnQgSGlnaCBBc3N1cmFuY2UgRVYgUm9vdCBDQTCCASIwDQYJKoZI"
+ "hvcNAQEBBQADggEPADCCAQoCggEBAMbM5XPm+9S75S0tMqbf5YE/yc0lSbZxKsPVlDRnogocsF9ppkCxxLeyj9CYpKlBWTrT3JTW"
+ "PNt0OKRKzE0lgvdKpVMSOO7zSW1xkX5jtqumX8OkhPhPYlG++MXs2ziS4wblCJEMxChBVfvLWokVfnHoNb9Ncgk9vjo4UFt3MRuN"
+ "s8ckRZqnrG0AFFoEt7oT61EKmEFBIk5lYYeBQVCmeVyJ3hlKV9Uu5l0cUyx+mM0aBhakaHPQNAQTXKFx01p8VdteZOE3hzBWBOUR"
+ "tCmAEvF5OYiiAhF8J2a3iLd48soKqDirCmTCv2ZdlYTBoSUeh10aUAsgEsxBu24LUTi4S8sCAwEAAaNjMGEwDgYDVR0PAQH/BAQD"
+ "AgGGMA8GA1UdEwEB/wQFMAMBAf8wHQYDVR0OBBYEFLE+w2kD+L9HAdSYJhoIAu9jZCvDMB8GA1UdIwQYMBaAFLE+w2kD+L9HAdSY"
+ "JhoIAu9jZCvDMA0GCSqGSIb3DQEBBQUAA4IBAQAcGgaX3NecnzyIZgYIVyHbIUf4KmeqvxgydkAQV8GK83rZEWWONfqe/EW1ntlM"
+ "MUu4kehDLI6zeM7b41N5cdblIZQB2lWHmiRk9opmzN6cN82oNLFpmyPInngiK3BD41VHMWEZ71jFhS9OMPagMRYjyOfiZRYzy78a"
+ "G6A9+MpeizGLYAiJLQwGXFK3xPkKmNEVX58Svnw2Yzi9RKR/5CYrCsSXaQ3pjOLAEFe4yHYSkVXySGnYvCoCWw9E1CAx2/S6cCZd"
+ "kGCevEsXCS+0yx5DaMkHJ8HSXPfqIbloEpw8nL+e/IBcm2PN7EeqJSdnoDfzAIJ9VNep+OkuE6N36B9KAA==");
+
+ nsCOMPtr<nsISupports> transportSecurityInfo;
+ nsresult rv = NS_DeserializeObject(base64Serialization, getter_AddRefs(transportSecurityInfo));
+ ASSERT_EQ(NS_ERROR_FACTORY_NOT_REGISTERED, rv);
+ ASSERT_FALSE(transportSecurityInfo);
+}
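
The gecko64 test above hinges on the serialized stream naming an implementation that this codebase does not register, so NS_DeserializeObject fails with NS_ERROR_FACTORY_NOT_REGISTERED before any certificate bytes are examined. The Node.js sketch below is a quick way to peek at such a blob outside of Gecko: it base64-decodes the opening characters of the test constant and hex-dumps them. Treating those 16 bytes as the class ID of the originating implementation, and the file name inspect-blob.js, are assumptions made here for illustration; the patch itself does not document the stream layout.

```js
// inspect-blob.js -- illustrative sketch only, not part of this patch.
// Decodes the opening characters of the gecko64 serialization used above and
// prints them as hex. Treating these 16 bytes as the class ID of the
// originating implementation is an assumption made for illustration.
const leadingBase64 = "FnhllAKWRHGAlo+ESXykKA"; // opening characters of the test blob
const bytes = Buffer.from(leadingBase64, "base64"); // decodes to 16 bytes

const hex = [...bytes].map((b) => b.toString(16).padStart(2, "0")).join(" ");
console.log("presumed class ID:", hex);
// When that ID maps to no registered factory in this build,
// NS_DeserializeObject fails with NS_ERROR_FACTORY_NOT_REGISTERED,
// which is exactly what the new ASSERT_EQ in the test checks.
```
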
diff --git a/security/manager/ssl/tests/unit/test_cert_chains.js b/security/manager/ssl/tests/unit/test_cert_chains.js
index 8abcb4e65..dd1fc9369 100644
--- a/security/manager/ssl/tests/unit/test_cert_chains.js
+++ b/security/manager/ssl/tests/unit/test_cert_chains.js
@@ -31,9 +31,30 @@ function test_cert_equals() {
" should return false");
}
+function test_bad_cert_list_serialization() {
+ // Normally the serialization of an nsIX509CertList consists of some header
+ // junk (IIDs and whatnot), 4 bytes representing how many nsIX509Cert follow,
+ // and then the serialization of each nsIX509Cert. This serialization consists
+ // of the header junk for an nsIX509CertList with 1 "nsIX509Cert", but then
+ // instead of an nsIX509Cert, the subsequent bytes represent the serialization
+ // of another nsIX509CertList (with 0 nsIX509Cert). This test ensures that
+ // nsIX509CertList safely handles this unexpected input when deserializing.
+ const badCertListSerialization =
+ "lZ+xZWUXSH+rm9iRO+UxlwAAAAAAAAAAwAAAAAAAAEYAAAABlZ+xZWUXSH+rm9iRO+UxlwAAAAAA" +
+ "AAAAwAAAAAAAAEYAAAAA";
+ let serHelper = Cc["@mozilla.org/network/serialization-helper;1"]
+ .getService(Ci.nsISerializationHelper);
+ throws(() => serHelper.deserializeObject(badCertListSerialization),
+ /NS_ERROR_UNEXPECTED/,
+ "deserializing a bogus nsIX509CertList should throw NS_ERROR_UNEXPECTED");
+}
+
function test_cert_list_serialization() {
let certList = build_cert_chain(['default-ee', 'expired-ee']);
+ throws(() => certList.addCert(null), /NS_ERROR_ILLEGAL_VALUE/,
+ "trying to add a null cert to an nsIX509CertList should throw");
+
// Serialize the cert list to a string
let serHelper = Cc["@mozilla.org/network/serialization-helper;1"]
.getService(Ci.nsISerializationHelper);
@@ -78,6 +99,11 @@ function run_test() {
// Test serialization of nsIX509CertList
add_test(function() {
+ test_bad_cert_list_serialization();
+ run_next_test();
+ });
+
+ add_test(function() {
test_cert_list_serialization();
run_next_test();
});
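
The comment in test_bad_cert_list_serialization describes the expected layout informally: header bytes identifying the list implementation, four bytes giving the number of certificates, then each serialized nsIX509Cert. A rough way to see that the malformed constant really is "a list nested inside a list" is to decode it and look for a repeat of its own leading bytes, as in the Node.js sketch below. The script name, its command-line interface, and the 16-byte header width are assumptions made for illustration, not something the patch specifies.

```js
// find-repeated-header.js -- illustrative sketch only, not part of this patch.
// Usage: node find-repeated-header.js <base64-blob>
// Pass the badCertListSerialization constant from the test above; the script
// decodes it and reports every offset at which its own leading 16 bytes recur.
// Per the test comment, the malformed blob embeds a second (empty) list header
// where a serialized nsIX509Cert was expected, so the header should show up twice.
const base64Blob = process.argv[2];
if (!base64Blob) {
  console.error("usage: node find-repeated-header.js <base64-blob>");
  process.exit(1);
}
const bytes = Buffer.from(base64Blob, "base64");
const header = bytes.subarray(0, 16); // 16 bytes is an assumed header width

const offsets = [];
for (let i = 0; i + header.length <= bytes.length; i++) {
  if (bytes.subarray(i, i + header.length).equals(header)) offsets.push(i);
}
console.log("decoded length:", bytes.length);
console.log("header repeats at offsets:", offsets);
// Gecko's deserializer is expected to reject the malformed input with
// NS_ERROR_UNEXPECTED, which is what the new throws() assertion verifies.
```

Running it with the badCertListSerialization constant as its single argument should report the header at offset 0 and once more part-way through the blob, matching the nesting the test comment describes.
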
diff --git a/testing/profiles/prefs_general.js b/testing/profiles/prefs_general.js
index 515828d29..bf1534c12 100644
--- a/testing/profiles/prefs_general.js
+++ b/testing/profiles/prefs_general.js
@@ -284,12 +284,6 @@ user_pref("apz.content_response_timeout", 60000);
// Make sure SSL Error reports don't hit the network
user_pref("security.ssl.errorReporting.url", "https://example.com/browser/browser/base/content/test/general/ssl_error_reports.sjs?succeed");
-// Make sure Translation won't hit the network.
-user_pref("browser.translation.bing.authURL", "http://%(server)s/browser/browser/components/translation/test/bing.sjs");
-user_pref("browser.translation.bing.translateArrayURL", "http://%(server)s/browser/browser/components/translation/test/bing.sjs");
-user_pref("browser.translation.yandex.translateURLOverride", "http://%(server)s/browser/browser/components/translation/test/yandex.sjs");
-user_pref("browser.translation.engine", "bing");
-
// Make sure we don't try to load snippets from the network.
user_pref("browser.aboutHomeSnippets.updateUrl", "nonexistent://test");
diff --git a/third_party/aom/.clang-format b/third_party/aom/.clang-format
new file mode 100644
index 000000000..e76a526e4
--- /dev/null
+++ b/third_party/aom/.clang-format
@@ -0,0 +1,109 @@
+---
+Language: Cpp
+# BasedOnStyle: Google
+# Generated with clang-format 5.0.0
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+ AfterClass: false
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: false
+ AfterNamespace: false
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+ SplitEmptyFunction: true
+ SplitEmptyRecord: true
+ SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 80
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+ - foreach
+ - Q_FOREACH
+ - BOOST_FOREACH
+IncludeCategories:
+ - Regex: '^<.*\.h>'
+ Priority: 1
+ - Regex: '^<.*'
+ Priority: 2
+ - Regex: '.*'
+ Priority: 3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentWidth: 2
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Right
+ReflowComments: true
+SortIncludes: false
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Auto
+TabWidth: 8
+UseTab: Never
+...
+
diff --git a/third_party/aom/.cmake-format.py b/third_party/aom/.cmake-format.py
new file mode 100644
index 000000000..aa7354c2a
--- /dev/null
+++ b/third_party/aom/.cmake-format.py
@@ -0,0 +1,48 @@
+# Generated with cmake-format 0.3.6
+# How wide to allow formatted cmake files
+line_width = 80
+
+# How many spaces to tab for indent
+tab_size = 2
+
+# If arglists are longer than this, break them always. This introduces some
+# interesting effects with complicated 'if' statements. However, we want file
+# lists to look reasonable. Try to strike a balance.
+max_subargs_per_line = 10
+
+# If true, separate flow control names from their parentheses with a space
+separate_ctrl_name_with_space = False
+
+# If true, separate function names from parentheses with a space
+separate_fn_name_with_space = False
+
+# If a statement is wrapped to more than one line, then dangle the closing
+# parenthesis on its own line
+dangle_parens = False
+
+# What character to use for bulleted lists
+bullet_char = u'*'
+
+# What character to use as punctuation after numerals in an enumerated list
+enum_char = u'.'
+
+# What style line endings to use in the output.
+line_ending = u'unix'
+
+# Format command names consistently as 'lower' or 'upper' case
+command_case = u'lower'
+
+# Specify structure for custom cmake functions
+additional_commands = {
+ "foo": {
+ "flags": [
+ "BAR",
+ "BAZ"
+ ],
+ "kwargs": {
+ "HEADERS": "*",
+ "DEPENDS": "*",
+ "SOURCES": "*"
+ }
+ }
+}
diff --git a/third_party/aom/.mailmap b/third_party/aom/.mailmap
new file mode 100644
index 000000000..bbe4525b1
--- /dev/null
+++ b/third_party/aom/.mailmap
@@ -0,0 +1,34 @@
+Adrian Grange <agrange@google.com>
+Aℓex Converse <aconverse@google.com>
+Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com>
+Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
+Hangyu Kuang <hkuang@google.com>
+Hui Su <huisu@google.com>
+Jacky Chen <jackychen@google.com>
+Jim Bankoski <jimbankoski@google.com>
+Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
+Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
+Pascal Massimino <pascal.massimino@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
+Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com>
+Sami Pietilä <samipietila@google.com>
+Sarah Parker <sarahparker@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
+Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
+Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
+Tom Finegan <tomfinegan@google.com>
+Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org>
+Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
+Yaowu Xu <yaowu@google.com> <yaowu@yaowu-macbookpro.roam.corp.google.com>
diff --git a/third_party/aom/AUTHORS b/third_party/aom/AUTHORS
new file mode 100644
index 000000000..95c3c8bf2
--- /dev/null
+++ b/third_party/aom/AUTHORS
@@ -0,0 +1,144 @@
+# This file is automatically generated from the git commit history
+# by tools/gen_authors.sh.
+
+Aaron Watry <awatry@gmail.com>
+Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
+Adrian Grange <agrange@google.com>
+Aℓex Converse <aconverse@google.com>
+Ahmad Sharif <asharif@google.com>
+Alexander Voronov <avoronov@graphics.cs.msu.ru>
+Alexis Ballier <aballier@gentoo.org>
+Alok Ahuja <waveletcoeff@gmail.com>
+Alpha Lam <hclam@google.com>
+A.Mahfoodh <ab.mahfoodh@gmail.com>
+Ami Fischman <fischman@chromium.org>
+Andoni Morales Alastruey <ylatuya@gmail.com>
+Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
+Angie Chiang <angiebird@google.com>
+Aron Rosenberg <arosenberg@logitech.com>
+Attila Nagy <attilanagy@google.com>
+Brion Vibber <bvibber@wikimedia.org>
+changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
+chm <chm@rock-chips.com>
+Christian Duvivier <cduvivier@google.com>
+Daniel Kang <ddkang@google.com>
+Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
+Dmitry Kovalev <dkovalev@google.com>
+Dragan Mrdjan <dmrdjan@mips.com>
+Ed Baker <edward.baker@intel.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
+Fabio Pedretti <fabio.ped@libero.it>
+Frank Galligan <fgalligan@google.com>
+Fredrik Söderquist <fs@opera.com>
+Fritz Koenig <frkoenig@google.com>
+Gaute Strokkenes <gaute.strokkenes@broadcom.com>
+Geza Lore <gezalore@gmail.com>
+Ghislain MARY <ghislainmary2@gmail.com>
+Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
+Guillaume Martres <gmartres@google.com>
+Guillermo Ballester Valor <gbvalor@gmail.com>
+Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
+Henrik Lundin <hlundin@google.com>
+Hui Su <huisu@google.com>
+Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+Jacky Chen <jackychen@google.com>
+James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
+James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
+Jan Kratochvil <jan.kratochvil@redhat.com>
+Janne Salonen <jsalonen@google.com>
+Jean-Marc Valin <jmvalin@jmvalin.ca>
+Jeff Faust <jfaust@google.com>
+Jeff Muizelaar <jmuizelaar@mozilla.com>
+Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
+Jian Zhou <zhoujian@google.com>
+Jim Bankoski <jimbankoski@google.com>
+Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
+Johann Koenig <johannkoenig@chromium.org>
+Johann Koenig <johannkoenig@google.com>
+John Koleszar <jkoleszar@google.com>
+Johnny Klonaris <google@jawknee.com>
+John Stark <jhnstrk@gmail.com>
+Joshua Bleecher Snyder <josh@treelinelabs.com>
+Joshua Litt <joshualitt@google.com>
+Julia Robson <juliamrobson@gmail.com>
+Justin Clift <justin@salasaga.org>
+Justin Lebar <justin.lebar@gmail.com>
+KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
+Lou Quillio <louquillio@google.com>
+Luca Barbato <lu_zero@gentoo.org>
+Makoto Kato <makoto.kt@gmail.com>
+Mans Rullgard <mans@mansr.com>
+Marco Paniconi <marpan@google.com>
+Mark Mentovai <mark@chromium.org>
+Martin Ettl <ettl.martin78@googlemail.com>
+Martin Storsjo <martin@martin.st>
+Matthew Heaney <matthewjheaney@chromium.org>
+Michael Kohler <michaelkohler@live.com>
+Mike Frysinger <vapier@chromium.org>
+Mike Hommey <mhommey@mozilla.com>
+Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
+Morton Jonuschat <yabawock@gmail.com>
+Nathan E. Egge <negge@dgql.org>
+Nico Weber <thakis@chromium.org>
+Parag Salasakar <img.mips1@gmail.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+Patrik Westin <patrik.westin@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Pavol Rusnak <stick@gk2.sk>
+Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@argondesign.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
+Philip Jägenstedt <philipj@opera.com>
+Priit Laes <plaes@plaes.org>
+Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
+Rafaël Carré <funman@videolan.org>
+Ralph Giles <giles@xiph.org>
+Rob Bradford <rob@linux.intel.com>
+Ronald S. Bultje <rsbultje@gmail.com>
+Rui Ueyama <ruiu@google.com>
+Sami Pietilä <samipietila@google.com>
+Sasi Inguva <isasi@google.com>
+Scott Graham <scottmg@chromium.org>
+Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Kolomenkin <kolomenkin@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
+Shimon Doodkin <helpmepro1@gmail.com>
+Shunyao Li <shunyaoli@google.com>
+Stefan Holmer <holmer@google.com>
+Steinar Midtskogen <stemidts@cisco.com>
+Suman Sunkara <sunkaras@google.com>
+Taekhyun Kim <takim@nvidia.com>
+Takanori MATSUURA <t.matsuu@gmail.com>
+Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
+Tero Rintaluoma <teror@google.com>
+Thijs Vermeir <thijsvermeir@gmail.com>
+Thomas Daede <tdaede@mozilla.com>
+Thomas Davies <thdavies@cisco.com>
+Thomas <thdavies@cisco.com>
+Tim Kopp <tkopp@google.com>
+Timothy B. Terriberry <tterribe@xiph.org>
+Tom Finegan <tomfinegan@google.com>
+Tristan Matthews <le.businessman@gmail.com>
+Tristan Matthews <tmatth@videolan.org>
+Vignesh Venkatasubramanian <vigneshv@google.com>
+Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
+Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
diff --git a/third_party/aom/CHANGELOG b/third_party/aom/CHANGELOG
new file mode 100644
index 000000000..d84aa0249
--- /dev/null
+++ b/third_party/aom/CHANGELOG
@@ -0,0 +1,5 @@
+2018-06-28 v1.0.0
+ AOMedia Codec Workgroup Approved version 1.0
+
+2016-04-07 v0.1.0 "AOMedia Codec 1"
+ This release is the first Alliance for Open Media codec.
diff --git a/third_party/aom/CMakeLists.txt b/third_party/aom/CMakeLists.txt
new file mode 100644
index 000000000..a58e54f40
--- /dev/null
+++ b/third_party/aom/CMakeLists.txt
@@ -0,0 +1,758 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+if(NOT EMSCRIPTEN)
+ if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE "Release"
+ CACHE "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" STRING
+ FORCE)
+ endif()
+endif()
+
+project(AOM C CXX)
+
+set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
+set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/include"
+ CACHE PATH "Installation path of includes")
+set(LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib"
+ CACHE PATH "Installation path of libraries")
+
+if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}")
+ message(FATAL_ERROR
+ "Building from within the aom source tree is not supported.\n"
+ "Hint: Run these commands\n" "$ rm -rf CMakeCache.txt CMakeFiles\n"
+ "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n"
+ "And re-run CMake from the aom_build directory.")
+endif()
+
+include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
+include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake")
+include("${AOM_ROOT}/aom_mem/aom_mem.cmake")
+include("${AOM_ROOT}/aom_ports/aom_ports.cmake")
+include("${AOM_ROOT}/aom_scale/aom_scale.cmake")
+include("${AOM_ROOT}/aom_util/aom_util.cmake")
+include("${AOM_ROOT}/av1/av1.cmake")
+include("${AOM_ROOT}/test/test.cmake")
+include("${AOM_ROOT}/build/cmake/sanitizers.cmake")
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+list(APPEND AOM_RTCD_SOURCES
+ "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+ "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+ "${AOM_CONFIG_DIR}/config/av1_rtcd.h"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+ "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
+ "${AOM_ROOT}/av1/common/av1_rtcd.c"
+ "${AOM_ROOT}/build/cmake/rtcd.pl")
+
+list(APPEND AOM_LIBWEBM_SOURCES
+ "${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc"
+ "${AOM_ROOT}/third_party/libwebm/common/hdr_util.h"
+ "${AOM_ROOT}/third_party/libwebm/common/webmids.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxertypes.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.h"
+ "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.cc"
+ "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.h")
+
+list(APPEND AOM_LIBYUV_SOURCES
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/basic_types.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_argb.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_from.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/cpu_id.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/planar_functions.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/rotate.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h"
+ "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h"
+ "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_any.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_common.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_gcc.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_mips.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_neon.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_neon64.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/row_win.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_any.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_common.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_gcc.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc"
+ "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc")
+
+list(APPEND AOM_SOURCES
+ "${AOM_CONFIG_DIR}/config/aom_config.c"
+ "${AOM_CONFIG_DIR}/config/aom_config.h"
+ "${AOM_ROOT}/aom/aom.h"
+ "${AOM_ROOT}/aom/aom_codec.h"
+ "${AOM_ROOT}/aom/aom_decoder.h"
+ "${AOM_ROOT}/aom/aom_encoder.h"
+ "${AOM_ROOT}/aom/aom_frame_buffer.h"
+ "${AOM_ROOT}/aom/aom_image.h"
+ "${AOM_ROOT}/aom/aom_integer.h"
+ "${AOM_ROOT}/aom/aomcx.h"
+ "${AOM_ROOT}/aom/aomdx.h"
+ "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
+ "${AOM_ROOT}/aom/src/aom_codec.c"
+ "${AOM_ROOT}/aom/src/aom_decoder.c"
+ "${AOM_ROOT}/aom/src/aom_encoder.c"
+ "${AOM_ROOT}/aom/src/aom_image.c"
+ "${AOM_ROOT}/aom/src/aom_integer.c")
+
+list(APPEND AOM_COMMON_APP_UTIL_SOURCES
+ "${AOM_ROOT}/common/args.c"
+ "${AOM_ROOT}/common/args.h"
+ "${AOM_ROOT}/common/av1_config.c"
+ "${AOM_ROOT}/common/av1_config.h"
+ "${AOM_ROOT}/common/md5_utils.c"
+ "${AOM_ROOT}/common/md5_utils.h"
+ "${AOM_ROOT}/common/tools_common.c"
+ "${AOM_ROOT}/common/tools_common.h"
+ "${AOM_ROOT}/common/video_common.h"
+ "${AOM_ROOT}/common/rawenc.c"
+ "${AOM_ROOT}/common/rawenc.h"
+ "${AOM_ROOT}/common/y4menc.c"
+ "${AOM_ROOT}/common/y4menc.h")
+
+list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/ivfdec.c"
+ "${AOM_ROOT}/common/ivfdec.h" "${AOM_ROOT}/common/obudec.c"
+ "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c"
+ "${AOM_ROOT}/common/video_reader.h")
+
+list(APPEND AOM_ENCODER_APP_UTIL_SOURCES
+ "${AOM_ROOT}/common/ivfenc.c"
+ "${AOM_ROOT}/common/ivfenc.h"
+ "${AOM_ROOT}/common/video_writer.c"
+ "${AOM_ROOT}/common/video_writer.h"
+ "${AOM_ROOT}/common/warnings.c"
+ "${AOM_ROOT}/common/warnings.h"
+ "${AOM_ROOT}/common/y4minput.c"
+ "${AOM_ROOT}/common/y4minput.h"
+ "${AOM_ROOT}/examples/encoder_util.h"
+ "${AOM_ROOT}/examples/encoder_util.c")
+
+list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c"
+ "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c"
+ "${AOM_ROOT}/stats/rate_hist.h")
+
+list(APPEND AOM_PKG_CONFIG_SOURCES "${AOM_CONFIG_DIR}/aom.pc")
+
+list(APPEND AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/config/aom_version.h")
+
+list(APPEND AOM_WEBM_DECODER_SOURCES "${AOM_ROOT}/common/webmdec.cc"
+ "${AOM_ROOT}/common/webmdec.h")
+
+list(APPEND AOM_WEBM_ENCODER_SOURCES "${AOM_ROOT}/common/webmenc.cc"
+ "${AOM_ROOT}/common/webmenc.h")
+
+include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR} ${AOM_ROOT}/apps
+ ${AOM_ROOT}/common ${AOM_ROOT}/examples ${AOM_ROOT}/stats)
+
+# Targets
+add_library(aom_version ${AOM_VERSION_SOURCES})
+add_dummy_source_file_to_target(aom_version c)
+add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/config/aom_version.h"
+ COMMAND ${CMAKE_COMMAND} ARGS
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+ "${AOM_ROOT}/build/cmake/version.cmake"
+ COMMENT "Writing aom_version.h" VERBATIM)
+
+add_custom_target(aom_version_check
+ COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+ "${AOM_ROOT}/build/cmake/version.cmake"
+ COMMENT "Updating version info if necessary." VERBATIM)
+add_dependencies(aom_version aom_version_check)
+
+if(NOT MSVC)
+ add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES})
+ add_dummy_source_file_to_target(aom_pc c)
+ add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/aom.pc"
+ COMMAND ${CMAKE_COMMAND} ARGS
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+ -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+ -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
+ -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P
+ "${AOM_ROOT}/build/cmake/pkg_config.cmake"
+ COMMENT "Writing aom.pc" VERBATIM)
+ add_dependencies(aom_pc aom_version)
+endif()
+
+# TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd
+# source.
+add_rtcd_build_step("${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
+ "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "aom_dsp_rtcd")
+add_rtcd_build_step("${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
+ "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "aom_scale_rtcd")
+add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl"
+ "${AOM_CONFIG_DIR}/config/av1_rtcd.h"
+ "${AOM_ROOT}/av1/common/av1_rtcd.c" "av1_rtcd")
+
+add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES})
+add_dependencies(aom_rtcd aom_version)
+
+if(ENABLE_EXAMPLES)
+ add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_encoder_stats)
+endif()
+add_library(aom ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>)
+
+if(NOT MSVC AND NOT APPLE)
+ target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m)
+endif()
+
+# List of object and static library targets.
+set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_encoder_stats aom_mem
+ aom_scale aom)
+
+# Setup dependencies.
+setup_aom_dsp_targets()
+setup_aom_mem_targets()
+setup_aom_ports_targets()
+setup_aom_util_targets()
+setup_aom_scale_targets()
+setup_av1_targets()
+
+# Make all library targets depend on aom_rtcd to make sure it builds first.
+foreach(aom_lib ${AOM_LIB_TARGETS})
+ if(NOT "${aom_lib}" STREQUAL "aom_rtcd")
+ add_dependencies(${aom_lib} aom_rtcd)
+ endif()
+endforeach()
+
+# Generate C/C++ stub files containing the function usage_exit(). Users of the
+# aom_common_app_util library must define this function. This is a convenience
+# to allow omission of the function from applications that might want to use
+# other pieces of the util support without defining usage_exit().
+file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c" "void usage_exit(void) {}")
+file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.cc"
+ "extern \"C\" void usage_exit(void) {}")
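+
+# Note (illustrative sketch, not required by the build): an application that
+# links aom_common_app_util and wants real exit behavior can provide its own
+# definition in one of its source files instead of compiling a generated stub,
+# e.g.:
+#   #include <stdlib.h>
+#   void usage_exit(void) { exit(EXIT_FAILURE); }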
+
+#
+# Application and application support targets.
+#
+if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
+ add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES})
+ if(CONFIG_AV1_DECODER)
+ add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES})
+ # obudec depends on internal headers that require *rtcd.h
+ add_dependencies(aom_decoder_app_util aom_rtcd)
+ endif()
+ if(CONFIG_AV1_ENCODER)
+ add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES})
+ endif()
+endif()
+
+if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
+ add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
+ $<TARGET_OBJECTS:aom_common_app_util>)
+ list(APPEND AOM_APP_TARGETS resize_util)
+endif()
+
+if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
+ add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ add_executable(decode_to_md5 "${AOM_ROOT}/examples/decode_to_md5.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ add_executable(decode_with_drops "${AOM_ROOT}/examples/decode_with_drops.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ add_executable(simple_decoder "${AOM_ROOT}/examples/simple_decoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ add_executable(scalable_decoder "${AOM_ROOT}/examples/scalable_decoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+
+ if(CONFIG_ANALYZER)
+ add_executable(analyzer "${AOM_ROOT}/examples/analyzer.cc"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES})
+ list(APPEND AOM_APP_TARGETS analyzer)
+ list(APPEND AOM_DECODER_EXAMPLE_TARGETS analyzer)
+ endif()
+
+ if(CONFIG_INSPECTION)
+ add_executable(inspect "${AOM_ROOT}/examples/inspect.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ list(APPEND AOM_DECODER_EXAMPLE_TARGETS inspect)
+
+ if(EMSCRIPTEN)
+ add_preproc_definition(_POSIX_SOURCE)
+ append_link_flag_to_target("inspect" "-s TOTAL_MEMORY=402653184")
+ append_link_flag_to_target("inspect" "-s MODULARIZE=1")
+ append_link_flag_to_target(
+ "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'")
+ append_link_flag_to_target("inspect"
+ "-s EXPORT_NAME=\"\'DecoderModule\'\"")
+ append_link_flag_to_target("inspect" "--memory-init-file 0")
+
+ if("${CMAKE_BUILD_TYPE}" STREQUAL "")
+
+ # Default to -O3 when no build type is specified.
+ append_compiler_flag("-O3")
+ endif()
+
+ em_link_post_js(inspect "${AOM_ROOT}/tools/inspect-post.js")
+ endif()
+ endif()
+
+ # Maintain a list of decoder example targets.
+ list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5
+ decode_with_drops scalable_decoder simple_decoder)
+
+ # Add decoder examples to the app targets list.
+ list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS})
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ if(ENABLE_EXAMPLES)
+ add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>
+ $<TARGET_OBJECTS:aom_encoder_stats>)
+ add_executable(lossless_encoder "${AOM_ROOT}/examples/lossless_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(set_maps "${AOM_ROOT}/examples/set_maps.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(simple_encoder "${AOM_ROOT}/examples/simple_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(twopass_encoder "${AOM_ROOT}/examples/twopass_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+ # Maintain a list of encoder example targets.
+ list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
+ set_maps simple_encoder scalable_encoder twopass_encoder)
+ endif()
+
+ if(ENABLE_TOOLS)
+ if(CONFIG_ENTROPY_STATS AND NOT BUILD_SHARED_LIBS)
+
+ # TODO(tomfinegan): Sort out why a simple link command with
+ # aom_entropy_optimizer.c won't work on macos, but dragging in all the
+ # helper machinery allows the link to succeed.
+ add_executable(aom_entropy_optimizer "${AOM_GEN_SRC_DIR}/usage_exit.c"
+ "${AOM_ROOT}/tools/aom_entropy_optimizer.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+ # Maintain a list of encoder tool targets.
+ list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer)
+ endif()
+ endif()
+
+ # Add encoder examples and tools to the targets list.
+ list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS}
+ ${AOM_ENCODER_TOOL_TARGETS})
+endif()
+
+if(ENABLE_EXAMPLES)
+
+ # Maintain a separate variable listing only the examples to facilitate
+ # installation of example programs into an examples sub directory of
+ # $AOM_DIST_DIR/bin when building the dist target.
+ list(APPEND AOM_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}
+ ${AOM_ENCODER_EXAMPLE_TARGETS})
+endif()
+
+if(ENABLE_TOOLS)
+ if(CONFIG_AV1_DECODER)
+ require_cxx_flag_nomsvc("-std=c++11" NO)
+ add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.cc"
+ "${AOM_ROOT}/tools/dump_obu.cc"
+ "${AOM_ROOT}/tools/obu_parser.cc"
+ "${AOM_ROOT}/tools/obu_parser.h"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+
+ list(APPEND AOM_TOOL_TARGETS dump_obu)
+ list(APPEND AOM_APP_TARGETS dump_obu)
+
+ if(NOT MSVC)
+ target_compile_options(dump_obu PUBLIC -std=c++11)
+ endif()
+
+    # Maintain a separate variable listing only the tools to facilitate
+    # installation of tool programs into a tools sub directory of
+ # $AOM_DIST_DIR/bin when building the dist target.
+ list(APPEND AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS}
+ ${AOM_ENCODER_TOOL_TARGETS})
+ endif()
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+ add_executable(aom_cx_set_ref "${AOM_ROOT}/examples/aom_cx_set_ref.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS aom_cx_set_ref)
+ list(APPEND AOM_APP_TARGETS aom_cx_set_ref)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER)
+ add_executable(lightfield_encoder "${AOM_ROOT}/examples/lightfield_encoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS lightfield_encoder)
+ list(APPEND AOM_APP_TARGETS lightfield_encoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER)
+ add_executable(lightfield_tile_list_decoder
+ "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS lightfield_tile_list_decoder)
+ list(APPEND AOM_APP_TARGETS lightfield_tile_list_decoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER)
+ add_executable(lightfield_decoder "${AOM_ROOT}/examples/lightfield_decoder.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS lightfield_decoder)
+ list(APPEND AOM_APP_TARGETS lightfield_decoder)
+endif()
+
+if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER AND CONFIG_AV1_DECODER)
+ add_executable(lightfield_bitstream_parsing
+ "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c"
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:aom_encoder_app_util>
+ $<TARGET_OBJECTS:aom_decoder_app_util>)
+ list(APPEND AOM_EXAMPLE_TARGETS lightfield_bitstream_parsing)
+ list(APPEND AOM_APP_TARGETS lightfield_bitstream_parsing)
+endif()
+
+foreach(aom_app ${AOM_APP_TARGETS})
+ target_link_libraries(${aom_app} ${AOM_LIB_LINK_TYPE} aom)
+endforeach()
+
+if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
+ if(CONFIG_LIBYUV)
+ add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES})
+ if(NOT MSVC)
+ target_compile_options(yuv PRIVATE -Wno-unused-parameter)
+ endif()
+ include_directories("${AOM_ROOT}/third_party/libyuv/include")
+
+ # Add to existing targets.
+ foreach(aom_app ${AOM_APP_TARGETS})
+ target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:yuv>)
+ set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
+ endforeach()
+ endif()
+
+ if(CONFIG_WEBM_IO)
+ require_cxx_flag_nomsvc("-std=c++11" NO)
+
+ add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES})
+ include_directories("${AOM_ROOT}/third_party/libwebm")
+ target_compile_definitions(webm PRIVATE __STDC_CONSTANT_MACROS)
+ target_compile_definitions(webm PRIVATE __STDC_LIMIT_MACROS)
+
+ if(NOT MSVC)
+ target_compile_options(webm PRIVATE -Wno-shadow)
+ target_compile_options(webm PUBLIC -std=c++11)
+ endif()
+
+ # Add to existing targets.
+ if(CONFIG_AV1_DECODER)
+ target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES})
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES})
+ endif()
+
+ foreach(aom_app ${AOM_APP_TARGETS})
+ target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:webm>)
+ set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX)
+ endforeach()
+ endif()
+endif()
+
+if(ENABLE_TESTS)
+
+ # Create test_libaom target and the targets it depends on.
+ setup_aom_test_targets()
+endif()
+
+if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD)
+ find_package(Threads)
+ target_link_libraries(aom ${AOM_LIB_LINK_TYPE} Threads::Threads)
+endif()
+
+if(XCODE)
+
+ # TODO(tomfinegan): Make sure target has no C++ files before doing this as
+ # it's not necessary in that case.
+ if(CONFIG_LIBYUV OR CONFIG_WEBM_IO)
+
+    # The Xcode generator does not obey LINKER_LANGUAGE. Because of this issue,
+ # what looks like a C++ file needs to be in any target that Xcode will link
+ # when the target contains a C++ dependency. Without this Xcode will try to
+ # link with the C linker, which always ends badly when a dependency actually
+ # includes C++.
+
+ # Note: LINKER_LANGUAGE is explicitly set to C++ for all targets touched
+    # here; it really is the Xcode generator's fault, or just a deficiency in
+ # Xcode itself.
+ foreach(aom_app ${AOM_APP_TARGETS})
+ add_dummy_source_file_to_target("${aom_app}" "cc")
+ endforeach()
+ endif()
+endif()
+
+if(ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$")
+
+ # For historical purposes place the example binaries in the example directory.
+ file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/examples")
+
+ foreach(target ${AOM_EXAMPLE_TARGETS})
+ if(NOT "${target}" MATCHES "aomdec\|aomenc")
+ set_target_properties(${target}
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+ "${AOM_CONFIG_DIR}/examples")
+ endif()
+ endforeach()
+
+ if(ENABLE_TOOLS AND AOM_TOOL_TARGETS)
+
+ # The same expectation is true for tool targets.
+ file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/tools")
+ set_target_properties(${AOM_TOOL_TARGETS}
+ PROPERTIES RUNTIME_OUTPUT_DIRECTORY
+ "${AOM_CONFIG_DIR}/tools")
+ endif()
+endif()
+
+if(BUILD_SHARED_LIBS)
+ include("${AOM_ROOT}/build/cmake/exports.cmake")
+ setup_exports_target()
+ set_target_properties(aom PROPERTIES SOVERSION 0)
+endif()
+
+# Handle user supplied compile and link flags last to ensure they're obeyed.
+set_user_flags()
+
+# Aomedia documentation rule.
+if(ENABLE_DOCS)
+ include(FindDoxygen)
+ if(DOXYGEN_FOUND)
+ include("${AOM_ROOT}/docs.cmake")
+ setup_documentation_targets()
+ else()
+ message("--- Cannot find doxygen, ENABLE_DOCS turned off.")
+ set(ENABLE_DOCS OFF)
+ endif()
+endif()
+
+if(NOT (MSVC OR XCODE))
+
+ # Aomedia install rule.
+ list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom.h"
+ "${AOM_ROOT}/aom/aom_codec.h"
+ "${AOM_ROOT}/aom/aom_frame_buffer.h"
+ "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h"
+ "${AOM_ROOT}/aom/aom.h")
+
+ if(CONFIG_AV1_DECODER)
+ if(ENABLE_EXAMPLES)
+ list(APPEND AOM_INSTALL_BINS aomdec)
+ endif()
+
+ list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom_decoder.h"
+ "${AOM_ROOT}/aom/aomdx.h")
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ if(ENABLE_EXAMPLES)
+ list(APPEND AOM_INSTALL_BINS aomenc)
+ endif()
+
+ list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h"
+ "${AOM_ROOT}/aom/aom_encoder.h")
+ endif()
+
+ set(AOM_INSTALL_LIBS aom)
+
+ install(FILES ${AOM_INSTALL_INCS} DESTINATION "${INCLUDE_INSTALL_DIR}/aom")
+ install(FILES "${AOM_CONFIG_DIR}/aom.pc" DESTINATION
+ "${LIB_INSTALL_DIR}/pkgconfig")
+ install(TARGETS ${AOM_INSTALL_LIBS} DESTINATION "${LIB_INSTALL_DIR}")
+
+ if(ENABLE_EXAMPLES)
+ install(TARGETS ${AOM_INSTALL_BINS} DESTINATION
+ "${CMAKE_INSTALL_PREFIX}/bin")
+ endif()
+endif()
+
+# Aomedia dist rule.
+if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
+ list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomdec>)
+endif()
+if(CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES)
+ list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomenc>)
+endif()
+
+if(ENABLE_EXAMPLES)
+ foreach(example ${AOM_EXAMPLE_TARGETS})
+ list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>)
+ endforeach()
+endif()
+
+if(ENABLE_TOOLS)
+ foreach(tool ${AOM_TOOL_TARGETS})
+ list(APPEND AOM_DIST_TOOLS $<TARGET_FILE:${tool}>)
+ endforeach()
+endif()
+
+if(NOT AOM_DIST_DIR)
+ set(AOM_DIST_DIR "${AOM_CONFIG_DIR}/dist")
+endif()
+
+add_custom_target(dist
+ COMMAND ${CMAKE_COMMAND} -DAOM_ROOT=${AOM_ROOT}
+ -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_DIST_DIR=${AOM_DIST_DIR}
+ -DAOM_DIST_APPS="${AOM_DIST_APPS}"
+ -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}"
+ -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}"
+ -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}"
+ -DAOM_DIST_LIBS=$<TARGET_FILE:aom>
+ -DENABLE_DOCS=${ENABLE_DOCS} -P
+ "${AOM_ROOT}/build/cmake/dist.cmake"
+ DEPENDS ${AOM_INSTALL_BINS} ${AOM_INSTALL_LIBS}
+ ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS}
+ ${AOM_TOOL_TARGETS})
+
+if(ENABLE_DOCS)
+ add_dependencies(dist docs)
+endif()
+
+# Collect all variables containing libaom source files.
+get_cmake_property(all_cmake_vars VARIABLES)
+foreach(var ${all_cmake_vars})
+ if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" AND NOT "${var}" MATCHES
+ "_APP_\|DOXYGEN\|LIBWEBM\|LIBYUV\|_PKG_\|TEST")
+ list(APPEND aom_source_vars ${var})
+ endif()
+endforeach()
+
+# Libaom_srcs.txt generation.
+set(libaom_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_srcs.txt")
+file(WRITE "${libaom_srcs_txt_file}" "# This file is generated. DO NOT EDIT.\n")
+
+# Static source file list first.
+foreach(aom_source_var ${aom_source_vars})
+ foreach(file ${${aom_source_var}})
+ if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
+ string(REPLACE "${AOM_ROOT}/" "" file "${file}")
+ file(APPEND "${libaom_srcs_txt_file}" "${file}\n")
+ endif()
+ endforeach()
+endforeach()
+
+file(APPEND
+ "${libaom_srcs_txt_file}"
+ "# Files below this line are generated by the libaom build system.\n")
+foreach(aom_source_var ${aom_source_vars})
+ foreach(file ${${aom_source_var}})
+ if("${file}" MATCHES "${AOM_CONFIG_DIR}")
+ string(REPLACE "${AOM_CONFIG_DIR}/" "" file "${file}")
+ file(APPEND "${libaom_srcs_txt_file}" "${file}\n")
+ endif()
+ endforeach()
+endforeach()
+
+# Libaom_srcs.gni generation.
+set(libaom_srcs_gni_file "${AOM_CONFIG_DIR}/libaom_srcs.gni")
+file(WRITE "${libaom_srcs_gni_file}" "# This file is generated. DO NOT EDIT.\n")
+
+foreach(aom_source_var ${aom_source_vars})
+ if("${${aom_source_var}}" MATCHES "${AOM_ROOT}")
+ string(TOLOWER ${aom_source_var} aom_source_var_lowercase)
+ file(APPEND "${libaom_srcs_gni_file}" "\n${aom_source_var_lowercase} = [\n")
+ endif()
+
+ foreach(file ${${aom_source_var}})
+ if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
+ string(REPLACE "${AOM_ROOT}" "//third_party/libaom/source/libaom" file
+ "${file}")
+ file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n")
+ endif()
+ endforeach()
+
+ if("${${aom_source_var}}" MATCHES "${AOM_ROOT}")
+ file(APPEND "${libaom_srcs_gni_file}" "]\n")
+ endif()
+endforeach()
+
+file(APPEND
+ "${libaom_srcs_gni_file}"
+ "\n# Files below this line are generated by the libaom build system.\n")
+
+foreach(aom_source_var ${aom_source_vars})
+ if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}")
+ string(TOLOWER ${aom_source_var} aom_source_var_lowercase)
+ file(APPEND "${libaom_srcs_gni_file}"
+ "\n${aom_source_var_lowercase}_gen = [\n")
+ endif()
+ foreach(file ${${aom_source_var}})
+ if(NOT "${file}" MATCHES "${AOM_ROOT}")
+ string(REPLACE "${AOM_CONFIG_DIR}" "//third_party/libaom/source/libaom"
+ file "${file}")
+ file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n")
+ endif()
+ endforeach()
+
+ if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}")
+ file(APPEND "${libaom_srcs_gni_file}" "]\n")
+ endif()
+endforeach()
diff --git a/third_party/aom/LICENSE b/third_party/aom/LICENSE
new file mode 100644
index 000000000..fc340c376
--- /dev/null
+++ b/third_party/aom/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2016, Alliance for Open Media. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/third_party/aom/PATENTS b/third_party/aom/PATENTS
new file mode 100644
index 000000000..97842e02f
--- /dev/null
+++ b/third_party/aom/PATENTS
@@ -0,0 +1,108 @@
+Alliance for Open Media Patent License 1.0
+
+1. License Terms.
+
+1.1. Patent License. Subject to the terms and conditions of this License, each
+ Licensor, on behalf of itself and successors in interest and assigns,
+ grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive,
+ no-charge, royalty-free, irrevocable (except as expressly stated in this
+ License) patent license to its Necessary Claims to make, use, sell, offer
+ for sale, import or distribute any Implementation.
+
+1.2. Conditions.
+
+1.2.1. Availability. As a condition to the grant of rights to Licensee to make,
+ sell, offer for sale, import or distribute an Implementation under
+ Section 1.1, Licensee must make its Necessary Claims available under
+ this License, and must reproduce this License with any Implementation
+ as follows:
+
+ a. For distribution in source code, by including this License in the
+ root directory of the source code with its Implementation.
+
+ b. For distribution in any other form (including binary, object form,
+ and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist,
+ GDSII, etc.)), by including this License in the documentation, legal
+ notices, and/or other written materials provided with the
+ Implementation.
+
+1.2.2. Additional Conditions. This license is directly from Licensor to
+ Licensee. Licensee acknowledges as a condition of benefiting from it
+ that no rights from Licensor are received from suppliers, distributors,
+ or otherwise in connection with this License.
+
+1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents
+ initiates patent litigation or files, maintains, or voluntarily
+ participates in a lawsuit against another entity or any person asserting
+ that any Implementation infringes Necessary Claims, any patent licenses
+ granted under this License directly to the Licensee are immediately
+ terminated as of the date of the initiation of action unless 1) that suit
+ was in response to a corresponding suit regarding an Implementation first
+ brought against an initiating entity, or 2) that suit was brought to
+ enforce the terms of this License (including intervention in a third-party
+ action by a Licensee).
+
+1.4. Disclaimers. The Reference Implementation and Specification are provided
+ "AS IS" and without warranty. The entire risk as to implementing or
+ otherwise using the Reference Implementation or Specification is assumed
+ by the implementer and user. Licensor expressly disclaims any warranties
+ (express, implied, or otherwise), including implied warranties of
+ merchantability, non-infringement, fitness for a particular purpose, or
+ title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO
+ ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL,
+ INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF
+ ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH
+ OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR
+     NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+2. Definitions.
+
+2.1. Affiliate. "Affiliate" means an entity that directly or indirectly
+ Controls, is Controlled by, or is under common Control of that party.
+
+2.2. Control. "Control" means direct or indirect control of more than 50% of
+ the voting power to elect directors of that corporation, or for any other
+ entity, the power to direct management of such entity.
+
+2.3. Decoder. "Decoder" means any decoder that conforms fully with all
+ non-optional portions of the Specification.
+
+2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can
+ be decoded by a Decoder only to the extent it produces such a bitstream.
+
+2.5. Final Deliverable. "Final Deliverable" means the final version of a
+ deliverable approved by the Alliance for Open Media as a Final
+ Deliverable.
+
+2.6. Implementation. "Implementation" means any implementation, including the
+ Reference Implementation, that is an Encoder and/or a Decoder. An
+ Implementation also includes components of an Implementation only to the
+ extent they are used as part of an Implementation.
+
+2.7. License. "License" means this license.
+
+2.8. Licensee. "Licensee" means any person or entity who exercises patent
+ rights granted under this License.
+
+2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers
+ for sale, imports or distributes any Implementation, or (ii) a person
+ or entity that has a licensing obligation to the Implementation as a
+ result of its membership and/or participation in the Alliance for Open
+ Media working group that developed the Specification.
+
+2.10. Necessary Claims. "Necessary Claims" means all claims of patents or
+ patent applications, (a) that currently or at any time in the future,
+ are owned or controlled by the Licensor, and (b) (i) would be an
+ Essential Claim as defined by the W3C Policy as of February 5, 2004
+ (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential)
+ as if the Specification was a W3C Recommendation; or (ii) are infringed
+ by the Reference Implementation.
+
+2.11. Reference Implementation. "Reference Implementation" means an Encoder
+ and/or Decoder released by the Alliance for Open Media as a Final
+ Deliverable.
+
+2.12. Specification. "Specification" means the specification designated by
+ the Alliance for Open Media as a Final Deliverable for which this
+ License was issued.
+
diff --git a/third_party/aom/README.md b/third_party/aom/README.md
new file mode 100644
index 000000000..cab3f9993
--- /dev/null
+++ b/third_party/aom/README.md
@@ -0,0 +1,625 @@
+# AV1 Codec Library
+
+## Contents
+1. [Building the lib and applications](#building-the-library-and-applications)
+ - [Prerequisites](#prerequisites)
+ - [Get the code](#get-the-code)
+ - [Basics](#basic-build)
+ - [Configuration options](#configuration-options)
+ - [Dylib builds](#dylib-builds)
+ - [Debugging](#debugging)
+ - [Cross compiling](#cross-compiling)
+ - [Sanitizer support](#sanitizers)
+ - [MSVC builds](#microsoft-visual-studio-builds)
+ - [Xcode builds](#xcode-builds)
+ - [Emscripten builds](#emscripten-builds)
+ - [Extra Build Flags](#extra-build-flags)
+2. [Testing the library](#testing-the-av1-codec)
+ - [Basics](#testing-basics)
+ - [Unit tests](#1_unit-tests)
+ - [Example tests](#2_example-tests)
+ - [Encoder tests](#3_encoder-tests)
+ - [IDE hosted tests](#ide-hosted-tests)
+ - [Downloading test data](#downloading-the-test-data)
+ - [Adding a new test data file](#adding-a-new-test-data-file)
+ - [Additional test data](#additional-test-data)
+ - [Sharded testing](#sharded-testing)
+ - [Running tests directly](#1_running-test_libaom-directly)
+ - [Running tests via CMake](#2_running-the-tests-via-the-cmake-build)
+3. [Coding style](#coding-style)
+4. [Submitting patches](#submitting-patches)
+ - [Login cookie](#login-cookie)
+ - [Contributor agreement](#contributor-agreement)
+ - [Testing your code](#testing-your-code)
+ - [Commit message hook](#commit-message-hook)
+ - [Upload your change](#upload-your-change)
+ - [Incorporating Reviewer Comments](#incorporating-reviewer-comments)
+ - [Submitting your change](#submitting-your-change)
+ - [Viewing change status](#viewing-the-status-of-uploaded-changes)
+5. [Support](#support)
+6. [Bug reports](#bug-reports)
+
+## Building the library and applications
+
+### Prerequisites
+
+ 1. [CMake](https://cmake.org) version 3.5 or higher.
+ 2. [Git](https://git-scm.com/).
+ 3. [Perl](https://www.perl.org/).
+ 4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a
+ recent version of [nasm](http://www.nasm.us/).
+ 5. Building the documentation requires [doxygen](http://doxygen.org).
+ 6. Building the unit tests requires [Python](https://www.python.org/).
+ 7. Emscripten builds require the portable
+ [EMSDK](https://kripken.github.io/emscripten-site/index.html).
+
+### Get the code
+
+The AV1 library source code is stored in the Alliance for Open Media Git
+repository:
+
+~~~
+ $ git clone https://aomedia.googlesource.com/aom
+ # By default, the above command stores the source in the aom directory:
+ $ cd aom
+~~~
+
+### Basic build
+
+CMake replaces the configure step typical of many projects. Running CMake will
+produce configuration and build files for the currently selected CMake
+generator. For most systems the default generator is Unix Makefiles. The basic
+form of a makefile build is the following:
+
+~~~
+ $ cmake path/to/aom
+ $ make
+~~~
+
+The above will generate a makefile build that produces the AV1 library and
+applications for the current host system after the make step completes
+successfully. The compiler chosen varies by host platform, but a general rule
+applies: On systems where cc and c++ are present in $PATH at the time CMake is
+run, the generated build will use cc and c++ by default.
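+
+If a different compiler is desired, it can be selected with the standard CMake
+variables at generation time. A minimal sketch, assuming clang and clang++ are
+installed and present in $PATH:
+
+~~~
+  $ cmake path/to/aom -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
+  $ make
+~~~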
+
+### Configuration options
+
+The AV1 codec library has a great many configuration options. These come in two
+varieties:
+
+ 1. Build system configuration options. These have the form `ENABLE_FEATURE`.
+ 2. AV1 codec configuration options. These have the form `CONFIG_FEATURE`.
+
+Both types of options are set at the time CMake is run. The following example
+enables ccache and disables the AV1 encoder:
+
+~~~
+ $ cmake path/to/aom -DENABLE_CCACHE=1 -DCONFIG_AV1_ENCODER=0
+ $ make
+~~~
+
+The available configuration options are too numerous to list here. Build system
+configuration options can be found at the top of the CMakeLists.txt file found
+in the root of the AV1 repository, and AV1 codec configuration options can
+currently be found in the file `build/cmake/aom_config_defaults.cmake`.
+
+### Dylib builds
+
+A dylib (shared object) build of the AV1 codec library can be enabled via the
+CMake built in variable `BUILD_SHARED_LIBS`:
+
+~~~
+ $ cmake path/to/aom -DBUILD_SHARED_LIBS=1
+ $ make
+~~~
+
+This is currently only supported on non-Windows targets.
+
+### Debugging
+
+Depending on the generator used there are multiple ways of going about
+debugging AV1 components. For single configuration generators like the Unix
+Makefiles generator, setting `CMAKE_BUILD_TYPE` to Debug is sufficient:
+
+~~~
+ $ cmake path/to/aom -DCMAKE_BUILD_TYPE=Debug
+~~~
+
+For Xcode, because the configuration controls for Xcode builds are buried two
+configuration windows deep and must be set individually for each subproject
+within the Xcode IDE, `CMAKE_CONFIGURATION_TYPES` should be set to Debug:
+
+~~~
+ $ cmake path/to/aom -G Xcode -DCMAKE_CONFIGURATION_TYPES=Debug
+~~~
+
+For Visual Studio the in-IDE configuration controls should be used. Simply set
+the IDE project configuration to Debug to allow for stepping through the code.
+
+In addition to the above it can sometimes be useful to debug only C and C++
+code. To disable all assembly code and intrinsics set `AOM_TARGET_CPU` to
+generic at generation time:
+
+~~~
+ $ cmake path/to/aom -DAOM_TARGET_CPU=generic
+~~~
+
+### Cross compiling
+
+For the purposes of building the AV1 codec and applications and relative to the
+scope of this guide, all builds for architectures differing from the native host
+architecture will be considered cross compiles. The AV1 CMake build handles
+cross compiling via the use of toolchain files included in the AV1 repository.
+The toolchain files available at the time of this writing are:
+
+ - arm64-ios.cmake
+ - arm64-linux-gcc.cmake
+ - arm64-mingw-gcc.cmake
+ - armv7-ios.cmake
+ - armv7-linux-gcc.cmake
+ - armv7-mingw-gcc.cmake
+ - armv7s-ios.cmake
+ - mips32-linux-gcc.cmake
+ - mips64-linux-gcc.cmake
+ - x86-ios-simulator.cmake
+ - x86-linux.cmake
+ - x86-macos.cmake
+ - x86-mingw-gcc.cmake
+ - x86\_64-ios-simulator.cmake
+ - x86\_64-mingw-gcc.cmake
+
+The following example demonstrates use of the x86-macos.cmake toolchain file on
+a x86\_64 MacOS host:
+
+~~~
+ $ cmake path/to/aom \
+ -DCMAKE_TOOLCHAIN_FILE=path/to/aom/build/cmake/toolchains/x86-macos.cmake
+ $ make
+~~~
+
+To build for an unlisted target, creating a new toolchain file is the best
+solution. The existing toolchain files can be used as a starting point for a new
+toolchain file since each one exposes the basic requirements for toolchain files
+as used in the AV1 codec build.
+
+As a temporary work around an unoptimized AV1 configuration that builds only C
+and C++ sources can be produced using the following commands:
+
+~~~
+ $ cmake path/to/aom -DAOM_TARGET_CPU=generic
+ $ make
+~~~
+
+In addition to the above it's important to note that the toolchain files
+suffixed with gcc behave differently than the others. These toolchain files
+attempt to obey the $CROSS environment variable.
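+
+A minimal sketch, assuming a GNU aarch64 cross toolchain whose tools use the
+aarch64-linux-gnu- prefix is already installed on the build host:
+
+~~~
+  $ export CROSS=aarch64-linux-gnu-
+  $ cmake path/to/aom \
+    -DCMAKE_TOOLCHAIN_FILE=path/to/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
+  $ make
+~~~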
+
+### Sanitizers
+
+Sanitizer integration is built-in to the CMake build system. To enable a
+sanitizer, add `-DSANITIZE=<type>` to the CMake command line. For example, to
+enable address sanitizer:
+
+~~~
+ $ cmake path/to/aom -DSANITIZE=address
+ $ make
+~~~
+
+Sanitizers available vary by platform, target, and compiler. Consult your
+compiler documentation to determine which, if any, are available.
+
+### Microsoft Visual Studio builds
+
+Building the AV1 codec library in Microsoft Visual Studio is supported. The
+following example demonstrates generating projects and a solution for the
+Microsoft IDE:
+
+~~~
+ # This does not require a bash shell; command.exe is fine.
+ $ cmake path/to/aom -G "Visual Studio 15 2017"
+~~~
+
+### Xcode builds
+
+Building the AV1 codec library in Xcode is supported. The following example
+demonstrates generating an Xcode project:
+
+~~~
+ $ cmake path/to/aom -G Xcode
+~~~
+
+### Emscripten builds
+
+Building the AV1 codec library with Emscripten is supported. Typically this is
+used to hook into the AOMAnalyzer GUI application. These instructions focus on
+using the inspector with AOMAnalyzer, but all tools can be built with
+Emscripten.
+
+It is assumed here that you have already downloaded and installed the EMSDK,
+installed and activated at least one toolchain, and set up your environment
+appropriately using the emsdk\_env script.
+
+1. Download [AOMAnalyzer](https://people.xiph.org/~mbebenita/analyzer/).
+
+2. Configure the build:
+
+~~~
+ $ cmake path/to/aom \
+ -DENABLE_CCACHE=1 \
+ -DAOM_TARGET_CPU=generic \
+ -DENABLE_DOCS=0 \
+ -DENABLE_TESTS=0 \
+ -DCONFIG_ACCOUNTING=1 \
+ -DCONFIG_INSPECTION=1 \
+ -DCONFIG_MULTITHREAD=0 \
+ -DCONFIG_RUNTIME_CPU_DETECT=0 \
+ -DCONFIG_WEBM_IO=0 \
+ -DCMAKE_TOOLCHAIN_FILE=path/to/emsdk-portable/.../Emscripten.cmake
+~~~
+
+3. Build it: run make if that's your generator of choice:
+
+~~~
+ $ make inspect
+~~~
+
+4. Run the analyzer:
+
+~~~
+ # inspect.js is in the examples sub directory of the directory in which you
+ # executed cmake.
+ $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file
+~~~
+
+### Extra build flags
+
+Three variables allow for passing of additional flags to the build system.
+
+- AOM\_EXTRA\_C\_FLAGS
+- AOM\_EXTRA\_CXX\_FLAGS
+- AOM\_EXTRA\_EXE\_LINKER\_FLAGS
+
+The build system attempts to ensure the flags passed through the above variables
+are passed to tools last in order to allow for override of default behavior.
+These flags can be used, for example, to enable asserts in a release build:
+
+~~~
+ $ cmake path/to/aom \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DAOM_EXTRA_C_FLAGS=-UNDEBUG \
+ -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG
+~~~
+
+## Testing the AV1 codec
+
+### Testing basics
+
+There are several methods of testing the AV1 codec. All of these methods require
+the presence of the AV1 source code and a working build of the AV1 library and
+applications.
+
+#### 1. Unit tests:
+
+The unit tests can be run at build time:
+
+~~~
+ # Before running the make command the LIBAOM_TEST_DATA_PATH environment
+ # variable should be set to avoid downloading the test files to the
+ # cmake build configuration directory.
+ $ cmake path/to/aom
+ # Note: The AV1 CMake build creates many test targets. Running make
+ # with multiple jobs will speed up the test run significantly.
+ $ make runtests
+~~~
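+
+For example, to point the tests at an existing local copy of the test data
+(the path shown is hypothetical):
+
+~~~
+  $ export LIBAOM_TEST_DATA_PATH=/path/to/aom-test-data
+~~~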
+
+#### 2. Example tests:
+
+The example tests require a bash shell and can be run in the following manner:
+
+~~~
+  # See the note about LIBAOM_TEST_DATA_PATH above.
+ $ cmake path/to/aom
+ $ make
+ # It's best to build the testdata target using many make jobs.
+ # Running it like this will verify and download (if necessary)
+ # one at a time, which takes a while.
+ $ make testdata
+ $ path/to/aom/test/examples.sh --bin-path examples
+~~~
+
+#### 3. Encoder tests:
+
+When making a change to the encoder run encoder tests to confirm that your
+change has a positive or negligible impact on encode quality. When running these
+tests the build configuration should be changed to enable internal encoder
+statistics:
+
+~~~
+ $ cmake path/to/aom -DCONFIG_INTERNAL_STATS=1
+ $ make
+~~~
+
+The repository contains scripts intended to make running these tests as simple
+as possible. The following example demonstrates creating a set of baseline clips
+for comparison to results produced after making your change to libaom:
+
+~~~
+ # This will encode all Y4M files in the current directory using the
+ # settings specified to create the encoder baseline statistical data:
+ $ cd path/to/test/inputs
+ # This command line assumes that run_encodes.sh, its helper script
+ # best_encode.sh, and the aomenc you intend to test are all within a
+ # directory in your PATH.
+ $ run_encodes.sh 200 500 50 baseline
+~~~
+
+After making your change and creating the baseline clips, you'll need to run
+encodes that include your change(s) to confirm that things are working as
+intended:
+
+~~~
+ # This will encode all Y4M files in the current directory using the
+ # settings specified to create the statistical data for your change:
+ $ cd path/to/test/inputs
+ # This command line assumes that run_encodes.sh, its helper script
+ # best_encode.sh, and the aomenc you intend to test are all within a
+ # directory in your PATH.
+ $ run_encodes.sh 200 500 50 mytweak
+~~~
+
+After creating both data sets you can use `test/visual_metrics.py` to generate a
+report that can be viewed in a web browser:
+
+~~~
+ $ visual_metrics.py metrics_template.html "*stt" baseline mytweak \
+ > mytweak.html
+~~~
+
+You can view the report by opening mytweak.html in a web browser.
+
+
+### IDE hosted tests
+
+By default the generated project files created by CMake will not include the
+runtests and testdata rules when generating for IDEs like Microsoft Visual
+Studio and Xcode. This is done to avoid intolerably long build cycles in the
+IDEs, whose behavior is to build all targets when the build project option is
+selected in MSVS and Xcode. To enable the test rules in IDEs the
+`ENABLE_IDE_TEST_HOSTING` variable must be enabled at CMake generation time:
+
+~~~
+ # This example uses Xcode. To get a list of the generators
+ # available, run cmake with the -G argument missing its
+ # value.
+ $ cmake path/to/aom -DENABLE_IDE_TEST_HOSTING=1 -G Xcode
+~~~
+
+### Downloading the test data
+
+The fastest and easiest way to obtain the test data is to use CMake to generate
+a build using the Unix Makefiles generator, and then to build only the testdata
+rule:
+
+~~~
+ $ cmake path/to/aom -G "Unix Makefiles"
+ # 28 is used because there are 28 test files as of this writing.
+ $ make -j28 testdata
+~~~
+
+The above make command will only download and verify the test data.
+
+### Adding a new test data file
+
+First, add the new test data file to the `aom-test-data` bucket of the
+`aomedia-testing` project on Google Cloud Platform. You may need to ask someone
+with the necessary access permissions to do this for you.
+
+NOTE: When a new test data file is added to the `aom-test-data` bucket, its
+"Public access" is initially "Not public". We need to change its
+"Public access" to "Public" by using the following
+[`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) command:
+~~~
+ $ gsutil acl ch -g all:R gs://aom-test-data/test-data-file-name
+~~~
+This command grants the `AllUsers` group READ access to the file named
+"test-data-file-name" in the `aom-test-data` bucket.
+
+Once the new test data file has been added to `aom-test-data`, create a CL to
+add the name of the new test data file to `test/test_data_util.cmake` and add
+the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1
+checksum of a file can be calculated by running the `sha1sum` command on the
+file.)
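+
+For example, the checksum of a hypothetical new clip named `new-test-clip.y4m`
+can be computed with:
+
+~~~
+  $ sha1sum new-test-clip.y4m
+~~~
+
+and copied into `test/test-data.sha1`, matching the format of the existing
+entries.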
+
+### Additional test data
+
+The test data mentioned above is strictly intended for unit testing.
+
+Additional input data for testing the encoder can be obtained from:
+https://media.xiph.org/video/derf/
+
+### Sharded testing
+
+The AV1 codec library unit tests are built upon gtest which supports sharding of
+test jobs. Sharded test runs can be achieved in a couple of ways.
+
+#### 1. Running test\_libaom directly:
+
+~~~
+ # Set the environment variable GTEST_TOTAL_SHARDS to control the number of
+ # shards.
+ $ export GTEST_TOTAL_SHARDS=10
+ # (GTEST shard indexing is 0 based).
+ $ seq 0 $(( $GTEST_TOTAL_SHARDS - 1 )) \
+ | xargs -n 1 -P 0 -I{} env GTEST_SHARD_INDEX={} ./test_libaom
+~~~
+
+To create a test shard for each CPU core available on the current system, set
+`GTEST_TOTAL_SHARDS` to the number of CPU cores on your system; the shard
+indexes then run from 0 to that value minus one.
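+
+On a Linux host, for example, the value can be derived automatically (nproc
+reports the number of available cores):
+
+~~~
+  $ export GTEST_TOTAL_SHARDS=$(nproc)
+~~~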
+
+#### 2. Running the tests via the CMake build:
+
+~~~
+ # For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See
+ # the IDE hosted tests section above for more information. If the IDE
+ # supports building targets concurrently tests will be sharded by default.
+
+ # For make and ninja builds the -j parameter controls the number of shards
+ # at test run time. This example will run the tests using 10 shards via
+ # make.
+ $ make -j10 runtests
+~~~
+
+The maximum number of test targets that can run concurrently is determined by
+the number of CPUs on the system where the build is configured as detected by
+CMake. A system with 24 cores can run 24 test shards using a value of 24 with
+the `-j` parameter. When CMake is unable to detect the number of cores 10 shards
+is the default maximum value.
+
+## Coding style
+
+We are using the Google C Coding Style defined by the
+[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+The coding style used by this project is enforced with clang-format using the
+configuration contained in the
+[.clang-format](https://chromium.googlesource.com/webm/aom/+/master/.clang-format)
+file in the root of the repository.
+
+You can download clang-format using your system's package manager, or directly
+from [llvm.org](http://llvm.org/releases/download.html). You can also view the
+[documentation](https://clang.llvm.org/docs/ClangFormat.html) on llvm.org.
+Output from clang-format varies by clang-format version; for best results your
+version should match the one used on Jenkins. You can find the clang-format
+version by reading the comment in the `.clang-format` file linked above.
+
+Before pushing changes for review you can format your code with:
+
+~~~
+ # Apply clang-format to modified .c, .h and .cc files
+ $ clang-format -i --style=file \
+ $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc')
+~~~
+
+Check the .clang-format file for the version used to generate it if there is any
+difference between your local formatting and the review system.
+
+Some Git installations have clang-format integration. Here are some examples:
+
+~~~
+ # Apply clang-format to all staged changes:
+ $ git clang-format
+
+ # Clang format all staged and unstaged changes:
+ $ git clang-format -f
+
+ # Clang format all staged and unstaged changes interactively:
+ $ git clang-format -f -p
+~~~
+
+## Submitting patches
+
+We manage the submission of patches using the
+[Gerrit](https://www.gerritcodereview.com/) code review tool. This tool
+implements a workflow on top of the Git version control system to ensure that
+all changes get peer reviewed and tested prior to their distribution.
+
+### Login cookie
+
+Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and login with
+your account (Gmail credentials, for example). Next, follow the
+`Generate Password` link at the top of the page. You’ll be given
+instructions for creating a cookie to use with our Git repos.
+
+### Contributor agreement
+
+You will be required to execute a
+[contributor agreement](http://aomedia.org/license) to ensure that the AOMedia
+Project has the right to distribute your changes.
+
+### Testing your code
+
+The testing basics are covered in the [testing section](#testing-the-av1-codec)
+above.
+
+In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run
+through Jenkins instances upon upload to Gerrit.
+
+### Commit message hook
+
+Gerrit requires that each submission include a unique Change-Id. You can assign
+one manually using `git commit --amend`, but it’s easier to automate it with
+the `commit-msg` hook provided by Gerrit.
+
+Copy the `commit-msg` script to the `.git/hooks` directory of your local repo.
+Here's an example:
+
+~~~
+ $ curl -Lo aom/.git/hooks/commit-msg https://chromium-review.googlesource.com/tools/hooks/commit-msg
+
+ # Next, ensure that the downloaded commit-msg script is executable:
+ $ chmod u+x aom/.git/hooks/commit-msg
+~~~
+
+See the Gerrit
+[documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html)
+for more information.
+
+### Upload your change
+
+The command line to upload your patch looks like this:
+
+~~~
+ $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/master
+~~~
+
+### Incorporating reviewer comments
+
+If you previously uploaded a change to Gerrit and the Approver has asked for
+changes, follow these steps:
+
+1. Edit the files to make the changes the reviewer has requested.
+2. Recommit your edits using the `--amend` flag, for example:
+
+~~~
+ $ git commit -a --amend
+~~~
+
+3. Use the same git push command as above to upload to Gerrit again for another
+ review cycle.
+
+In general, you should not rebase your changes when doing updates in response to
+review. Doing so can make it harder to follow the evolution of your change in
+the diff view.
+
+### Submitting your change
+
+Once your change has been Approved and Verified, you can “submit” it through the
+Gerrit UI. This will usually automatically rebase your change onto the branch
+specified.
+
+Sometimes this can’t be done automatically. If you run into this problem, you
+must rebase your changes manually:
+
+~~~
+ $ git fetch
+ $ git rebase origin/branchname
+~~~
+
+If there are any conflicts, resolve them as you normally would with Git. When
+you’re done, reupload your change.
+
+### Viewing the status of uploaded changes
+
+To check the status of a change that you uploaded, open
+[Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My >
+Changes.
+
+## Support
+
+This library is an open source project supported by its community. Please
+email aomediacodec@jointdevelopment.kavi.com for help.
+
+## Bug reports
+
+Bug reports can be filed in the Alliance for Open Media
+[issue tracker](https://bugs.chromium.org/p/aomedia/issues/list).
diff --git a/third_party/aom/aom/aom.h b/third_party/aom/aom/aom.h
new file mode 100644
index 000000000..b1cc1ecce
--- /dev/null
+++ b/third_party/aom/aom/aom.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup aom AOM
+ * \ingroup codecs
+ * AOM is aom's newest video compression algorithm that uses motion
+ * compensated prediction, Discrete Cosine Transform (DCT) coding of the
+ * prediction error signal and context dependent entropy coding techniques
+ * based on arithmetic principles. It features:
+ * - YUV 4:2:0 image format
+ * - Macro-block based coding (16x16 luma plus two 8x8 chroma)
+ * - 1/4 (1/8) pixel accuracy motion compensated prediction
+ * - 4x4 DCT transform
+ * - 128 level linear quantizer
+ * - In loop deblocking filter
+ * - Context-based entropy coding
+ *
+ * @{
+ */
+/*!\file
+ * \brief Provides controls common to both the AOM encoder and decoder.
+ */
+#ifndef AOM_AOM_AOM_H_
+#define AOM_AOM_AOM_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Control functions
+ *
+ * This set of macros defines the control functions of the AOM interface.
+ */
+enum aom_com_control_id {
+ /*!\brief pass in an external frame into decoder to be used as reference frame
+ */
+ AOM_SET_POSTPROC = 3, /**< set the decoder's post processing settings */
+ AOM_SET_DBG_COLOR_REF_FRAME =
+ 4, /**< set the reference frames to color for each macroblock */
+ AOM_SET_DBG_COLOR_MB_MODES = 5, /**< set which macro block modes to color */
+ AOM_SET_DBG_COLOR_B_MODES = 6, /**< set which blocks modes to color */
+ AOM_SET_DBG_DISPLAY_MV = 7, /**< set which motion vector modes to draw */
+
+ /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)
+ * for its control ids. These should be migrated to something like the
+ * AOM_DECODER_CTRL_ID_START range next time we're ready to break the ABI.
+ */
+ AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame */
+ AV1_SET_REFERENCE = 129, /**< write a frame into a reference buffer */
+ AV1_COPY_REFERENCE =
+ 130, /**< get a copy of reference frame from the decoder */
+ AOM_COMMON_CTRL_ID_MAX,
+
+ AV1_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */
+ AV1_COPY_NEW_FRAME_IMAGE =
+ 193, /**< copy the new frame to an external buffer */
+
+ AOM_DECODER_CTRL_ID_START = 256
+};
+
+/*!\brief post process flags
+ *
+ * This set of macros defines the AOM decoder post-processing flags.
+ */
+enum aom_postproc_level {
+ AOM_NOFILTERING = 0,
+ AOM_DEBLOCK = 1 << 0,
+ AOM_DEMACROBLOCK = 1 << 1,
+ AOM_ADDNOISE = 1 << 2,
+ AOM_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */
+ AOM_DEBUG_TXT_MBLK_MODES =
+ 1 << 4, /**< print macro block modes over each macro block */
+ AOM_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */
+ AOM_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */
+ AOM_MFQE = 1 << 10
+};
+
+/*!\brief post process flags
+ *
+ * This defines a structure that describes the post-processing settings. For
+ * the best objective measure (using the PSNR metric) set post_proc_flag
+ * to AOM_DEBLOCK and deblocking_level to 1.
+ */
+
+typedef struct aom_postproc_cfg {
+ /*!\brief the types of post processing to be done, should be combination of
+ * "aom_postproc_level" */
+ int post_proc_flag;
+ int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
+ int noise_level; /**< the strength of additive noise, valid range [0, 16] */
+} aom_postproc_cfg_t;
+
+/*!\brief AV1 specific reference frame data struct
+ *
+ * Define the data struct to access av1 reference frames.
+ */
+typedef struct av1_ref_frame {
+ int idx; /**< frame index to get (input) */
+ int use_external_ref; /**< Directly use external ref buffer(decoder only) */
+ aom_image_t img; /**< img structure to populate (output) */
+} av1_ref_frame_t;
+
+/*!\cond */
+/*!\brief aom decoder control function parameter type
+ *
+ * Defines the data type that each AOM decoder control function requires.
+ */
+AOM_CTRL_USE_TYPE(AOM_SET_POSTPROC, aom_postproc_cfg_t *)
+#define AOM_CTRL_AOM_SET_POSTPROC
+AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_REF_FRAME, int)
+#define AOM_CTRL_AOM_SET_DBG_COLOR_REF_FRAME
+AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_MB_MODES, int)
+#define AOM_CTRL_AOM_SET_DBG_COLOR_MB_MODES
+AOM_CTRL_USE_TYPE(AOM_SET_DBG_COLOR_B_MODES, int)
+#define AOM_CTRL_AOM_SET_DBG_COLOR_B_MODES
+AOM_CTRL_USE_TYPE(AOM_SET_DBG_DISPLAY_MV, int)
+#define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
+AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_GET_REFERENCE
+AOM_CTRL_USE_TYPE(AV1_SET_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_SET_REFERENCE
+AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *)
+#define AOM_CTRL_AV1_COPY_REFERENCE
+AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
+#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
+AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *)
+#define AOM_CTRL_AV1_COPY_NEW_FRAME_IMAGE
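+
+/* Example (editorial sketch, not part of the upstream header): enabling the
+ * deblocking post-processor on a decoder context `ctx` that is assumed to
+ * have been initialized with AOM_CODEC_USE_POSTPROC:
+ * <pre>
+ *     aom_postproc_cfg_t pp = { AOM_DEBLOCK, 1, 0 };
+ *     aom_codec_control(&ctx, AOM_SET_POSTPROC, &pp);
+ * </pre>
+ */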
+
+/*!\endcond */
+/*! @} - end defgroup aom */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOM_H_
diff --git a/third_party/aom/aom/aom_codec.h b/third_party/aom/aom/aom_codec.h
new file mode 100644
index 000000000..fc0df5b9e
--- /dev/null
+++ b/third_party/aom/aom/aom_codec.h
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup codec Common Algorithm Interface
+ * This abstraction allows applications to easily support multiple video
+ * formats with minimal code duplication. This section describes the interface
+ * common to all codecs (both encoders and decoders).
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the codec algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video codec algorithm.
+ *
+ * An application instantiates a specific codec instance by using
+ * aom_codec_init() and a pointer to the algorithm's interface structure:
+ * <pre>
+ * my_app.c:
+ * extern aom_codec_iface_t my_codec;
+ * {
+ * aom_codec_ctx_t algo;
+ * res = aom_codec_init(&algo, &my_codec);
+ * }
+ * </pre>
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the aom_codec_* family.
+ */
+#ifndef AOM_AOM_AOM_CODEC_H_
+#define AOM_AOM_AOM_CODEC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+
+/*!\brief Decorator indicating a function is deprecated */
+#ifndef AOM_DEPRECATED
+#if defined(__GNUC__) && __GNUC__
+#define AOM_DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+#define AOM_DEPRECATED
+#else
+#define AOM_DEPRECATED
+#endif
+#endif /* AOM_DEPRECATED */
+
+#ifndef AOM_DECLSPEC_DEPRECATED
+#if defined(__GNUC__) && __GNUC__
+#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */
+#elif defined(_MSC_VER)
+/*!\brief \copydoc #AOM_DEPRECATED */
+#define AOM_DECLSPEC_DEPRECATED __declspec(deprecated)
+#else
+#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */
+#endif
+#endif /* AOM_DECLSPEC_DEPRECATED */
+
+/*!\brief Decorator indicating a function is potentially unused */
+#ifdef AOM_UNUSED
+#elif defined(__GNUC__) || defined(__clang__)
+#define AOM_UNUSED __attribute__((unused))
+#else
+#define AOM_UNUSED
+#endif
+
+/*!\brief Decorator indicating that given struct/union/enum is packed */
+#ifndef ATTRIBUTE_PACKED
+#if defined(__GNUC__) && __GNUC__
+#define ATTRIBUTE_PACKED __attribute__((packed))
+#elif defined(_MSC_VER)
+#define ATTRIBUTE_PACKED
+#else
+#define ATTRIBUTE_PACKED
+#endif
+#endif /* ATTRIBUTE_PACKED */
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_CODEC_ABI_VERSION (3 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+
+/*!\brief Algorithm return codes */
+typedef enum {
+ /*!\brief Operation completed without error */
+ AOM_CODEC_OK,
+
+ /*!\brief Unspecified error */
+ AOM_CODEC_ERROR,
+
+ /*!\brief Memory operation failed */
+ AOM_CODEC_MEM_ERROR,
+
+ /*!\brief ABI version mismatch */
+ AOM_CODEC_ABI_MISMATCH,
+
+ /*!\brief Algorithm does not have required capability */
+ AOM_CODEC_INCAPABLE,
+
+ /*!\brief The given bitstream is not supported.
+ *
+ * The bitstream was unable to be parsed at the highest level. The decoder
+ * is unable to proceed. This error \ref SHOULD be treated as fatal to the
+ * stream. */
+ AOM_CODEC_UNSUP_BITSTREAM,
+
+ /*!\brief Encoded bitstream uses an unsupported feature
+ *
+ * The decoder does not implement a feature required by the encoder. This
+ * return code should only be used for features that prevent future
+ * pictures from being properly decoded. This error \ref MAY be treated as
+ * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
+ */
+ AOM_CODEC_UNSUP_FEATURE,
+
+ /*!\brief The coded data for this stream is corrupt or incomplete
+ *
+ * There was a problem decoding the current frame. This return code
+ * should only be used for failures that prevent future pictures from
+ * being properly decoded. This error \ref MAY be treated as fatal to the
+ * stream or \ref MAY be treated as fatal to the current GOP. If decoding
+ * is continued for the current GOP, artifacts may be present.
+ */
+ AOM_CODEC_CORRUPT_FRAME,
+
+ /*!\brief An application-supplied parameter is not valid.
+ *
+ */
+ AOM_CODEC_INVALID_PARAM,
+
+ /*!\brief An iterator reached the end of list.
+ *
+ */
+ AOM_CODEC_LIST_END
+
+} aom_codec_err_t;
+
+/*! \brief Codec capabilities bitfield
+ *
+ * Each codec advertises the capabilities it supports as part of its
+ * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
+ * or functionality, and are not required to be supported.
+ *
+ * The available flags are specified by AOM_CODEC_CAP_* defines.
+ */
+typedef long aom_codec_caps_t;
+#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
+#define AOM_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */
+
+/*! \brief Initialization-time Feature Enabling
+ *
+ * Certain codec features must be known at initialization time, to allow for
+ * proper memory allocation.
+ *
+ * The available flags are specified by AOM_CODEC_USE_* defines.
+ */
+typedef long aom_codec_flags_t;
+
+/*!\brief Codec interface structure.
+ *
+ * Contains function pointers and other data private to the codec
+ * implementation. This structure is opaque to the application.
+ */
+typedef const struct aom_codec_iface aom_codec_iface_t;
+
+/*!\brief Codec private data structure.
+ *
+ * Contains data private to the codec implementation. This structure is opaque
+ * to the application.
+ */
+typedef struct aom_codec_priv aom_codec_priv_t;
+
+/*!\brief Iterator
+ *
+ * Opaque storage used for iterating over lists.
+ */
+typedef const void *aom_codec_iter_t;
+
+/*!\brief Codec context structure
+ *
+ * All codecs \ref MUST support this context structure fully. In general,
+ * this data should be considered private to the codec algorithm, and
+ * not be manipulated or examined by the calling application. Applications
+ * may reference the 'name' member to get a printable description of the
+ * algorithm.
+ */
+typedef struct aom_codec_ctx {
+ const char *name; /**< Printable interface name */
+ aom_codec_iface_t *iface; /**< Interface pointers */
+ aom_codec_err_t err; /**< Last returned error */
+ const char *err_detail; /**< Detailed info, if available */
+ aom_codec_flags_t init_flags; /**< Flags passed at init time */
+ union {
+ /**< Decoder Configuration Pointer */
+ const struct aom_codec_dec_cfg *dec;
+ /**< Encoder Configuration Pointer */
+ const struct aom_codec_enc_cfg *enc;
+ const void *raw;
+ } config; /**< Configuration pointer aliasing union */
+ aom_codec_priv_t *priv; /**< Algorithm private storage */
+} aom_codec_ctx_t;
+
+/*!\brief Bit depth for codec
+ *
+ * This enumeration determines the bit depth of the codec.
+ */
+typedef enum aom_bit_depth {
+ AOM_BITS_8 = 8, /**< 8 bits */
+ AOM_BITS_10 = 10, /**< 10 bits */
+ AOM_BITS_12 = 12, /**< 12 bits */
+} aom_bit_depth_t;
+
+/*!\brief Superblock size selection.
+ *
+ * Defines the superblock size used for encoding. The superblock size can
+ * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically
+ * selected by the encoder for each frame.
+ */
+typedef enum aom_superblock_size {
+ AOM_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */
+ AOM_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */
+ AOM_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */
+} aom_superblock_size_t;
+
+/*
+ * Library Version Number Interface
+ *
+ * For example, see the following sample return values:
+ * aom_codec_version() (1<<16 | 2<<8 | 3)
+ * aom_codec_version_str() "v1.2.3-rc1-16-gec6a1ba"
+ * aom_codec_version_extra_str() "rc1-16-gec6a1ba"
+ */
+
+/*!\brief Return the version information (as an integer)
+ *
+ * Returns a packed encoding of the library version number. This will only
+ * include the major.minor.patch component of the version number. Note that
+ * this encoded value should be accessed through the macros provided, as the
+ * encoding may change in the future.
+ *
+ */
+int aom_codec_version(void);
+
+/*!\brief Return the version major number */
+#define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff)
+
+/*!\brief Return the version minor number */
+#define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff)
+
+/*!\brief Return the version patch number */
+#define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff)
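+
+/* Example (editorial sketch, not part of the upstream header): the packed
+ * value returned by aom_codec_version() can be unpacked with the macros
+ * above:
+ * <pre>
+ *     printf("libaom %d.%d.%d\n", aom_codec_version_major(),
+ *            aom_codec_version_minor(), aom_codec_version_patch());
+ * </pre>
+ */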
+
+/*!\brief Return the version information (as a string)
+ *
+ * Returns a printable string containing the full library version number. This
+ * may contain additional text following the three digit version number, to
+ * indicate release candidates, prerelease versions, etc.
+ *
+ */
+const char *aom_codec_version_str(void);
+
+/*!\brief Return the version information (as a string)
+ *
+ * Returns a printable "extra string". This is the component of the string
+ * returned
+ * by aom_codec_version_str() following the three digit version number.
+ *
+ */
+const char *aom_codec_version_extra_str(void);
+
+/*!\brief Return the build configuration
+ *
+ * Returns a printable string containing an encoded version of the build
+ * configuration. This may be useful to aom support.
+ *
+ */
+const char *aom_codec_build_config(void);
+
+/*!\brief Return the name for a given interface
+ *
+ * Returns a human readable string for name of the given codec interface.
+ *
+ * \param[in] iface Interface pointer
+ *
+ */
+const char *aom_codec_iface_name(aom_codec_iface_t *iface);
+
+/*!\brief Convert error number to printable string
+ *
+ * Returns a human readable string for the last error returned by the
+ * algorithm. The returned error will be one line and will not contain
+ * any newline characters.
+ *
+ *
+ * \param[in] err Error number.
+ *
+ */
+const char *aom_codec_err_to_string(aom_codec_err_t err);
+
+/*!\brief Retrieve error synopsis for codec context
+ *
+ * Returns a human readable string for the last error returned by the
+ * algorithm. The returned error will be one line and will not contain
+ * any newline characters.
+ *
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ *
+ */
+const char *aom_codec_error(aom_codec_ctx_t *ctx);
+
+/*!\brief Retrieve detailed error information for codec context
+ *
+ * Returns a human readable string providing detailed information about
+ * the last error.
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ *
+ * \retval NULL
+ * No detailed information is available.
+ */
+const char *aom_codec_error_detail(aom_codec_ctx_t *ctx);
+
+/* REQUIRED FUNCTIONS
+ *
+ * The following functions are required to be implemented for all codecs.
+ * They represent the base case functionality expected of all codecs.
+ */
+
+/*!\brief Destroy a codec instance
+ *
+ * Destroys a codec context, freeing any associated memory buffers.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ *
+ * \retval #AOM_CODEC_OK
+ * The codec algorithm initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory allocation failed.
+ */
+aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx);
+
+/*!\brief Get the capabilities of an algorithm.
+ *
+ * Retrieves the capabilities bitfield from the algorithm's interface.
+ *
+ * \param[in] iface Pointer to the algorithm interface
+ *
+ */
+aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface);
+
+/*!\brief Control algorithm
+ *
+ * This function is used to exchange algorithm specific data with the codec
+ * instance. This can be used to implement features specific to a particular
+ * algorithm.
+ *
+ * This wrapper function dispatches the request to the helper function
+ * associated with the given ctrl_id. It tries to call this function
+ * transparently, but will return #AOM_CODEC_ERROR if the request could not
+ * be dispatched.
+ *
+ * Note that this function should not be used directly. Call the
+ * #aom_codec_control wrapper macro instead.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] ctrl_id Algorithm specific control identifier
+ *
+ * \retval #AOM_CODEC_OK
+ * The control request was processed.
+ * \retval #AOM_CODEC_ERROR
+ * The control request was not processed.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * The data was not valid.
+ */
+aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...);
+#if defined(AOM_DISABLE_CTRL_TYPECHECKS) && AOM_DISABLE_CTRL_TYPECHECKS
+#define aom_codec_control(ctx, id, data) aom_codec_control_(ctx, id, data)
+#define AOM_CTRL_USE_TYPE(id, typ)
+#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ)
+#define AOM_CTRL_VOID(id, typ)
+
+#else
+/*!\brief aom_codec_control wrapper macro
+ *
+ * This macro allows for type safe conversions across the variadic parameter
+ * to aom_codec_control_().
+ *
+ * \internal
+ * It works by dispatching the call to the control function through a wrapper
+ * function named with the id parameter.
+ */
+#define aom_codec_control(ctx, id, data) \
+ aom_codec_control_##id(ctx, id, data) /**<\hideinitializer*/
+
+/*!\brief aom_codec_control type definition macro
+ *
+ * This macro allows for type safe conversions across the variadic parameter
+ * to aom_codec_control_(). It defines the type of the argument for a given
+ * control identifier.
+ *
+ * \internal
+ * It defines a static function with
+ * the correctly typed arguments as a wrapper to the type-unsafe internal
+ * function.
+ */
+#define AOM_CTRL_USE_TYPE(id, typ) \
+ static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int, typ) \
+ AOM_UNUSED; \
+ \
+ static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx, \
+ int ctrl_id, typ data) { \
+ return aom_codec_control_(ctx, ctrl_id, data); \
+ } /**<\hideinitializer*/
+
+/*!\brief aom_codec_control deprecated type definition macro
+ *
+ * Like #AOM_CTRL_USE_TYPE, but indicates that the specified control is
+ * deprecated and should not be used. Consult the documentation for your
+ * codec for more information.
+ *
+ * \internal
+ * It defines a static function with the correctly typed arguments as a
+ * wrapper to the type-unsafe internal function.
+ */
+#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ) \
+ AOM_DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
+ aom_codec_ctx_t *, int, typ) AOM_DEPRECATED AOM_UNUSED; \
+ \
+ AOM_DECLSPEC_DEPRECATED static aom_codec_err_t aom_codec_control_##id( \
+ aom_codec_ctx_t *ctx, int ctrl_id, typ data) { \
+ return aom_codec_control_(ctx, ctrl_id, data); \
+ } /**<\hideinitializer*/
+
+/*!\brief aom_codec_control void type definition macro
+ *
+ * This macro allows for type safe conversions across the variadic parameter
+ * to aom_codec_control_(). It indicates that a given control identifier takes
+ * no argument.
+ *
+ * \internal
+ * It defines a static function without a data argument as a wrapper to the
+ * type-unsafe internal function.
+ */
+#define AOM_CTRL_VOID(id) \
+ static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *, int) \
+ AOM_UNUSED; \
+ \
+ static aom_codec_err_t aom_codec_control_##id(aom_codec_ctx_t *ctx, \
+ int ctrl_id) { \
+ return aom_codec_control_(ctx, ctrl_id); \
+ } /**<\hideinitializer*/
+
+#endif
+
+/*!\brief OBU types. */
+typedef enum ATTRIBUTE_PACKED {
+ OBU_SEQUENCE_HEADER = 1,
+ OBU_TEMPORAL_DELIMITER = 2,
+ OBU_FRAME_HEADER = 3,
+ OBU_TILE_GROUP = 4,
+ OBU_METADATA = 5,
+ OBU_FRAME = 6,
+ OBU_REDUNDANT_FRAME_HEADER = 7,
+ OBU_TILE_LIST = 8,
+ OBU_PADDING = 15,
+} OBU_TYPE;
+
+/*!\brief OBU metadata types. */
+typedef enum {
+ OBU_METADATA_TYPE_AOM_RESERVED_0 = 0,
+ OBU_METADATA_TYPE_HDR_CLL = 1,
+ OBU_METADATA_TYPE_HDR_MDCV = 2,
+ OBU_METADATA_TYPE_SCALABILITY = 3,
+ OBU_METADATA_TYPE_ITUT_T35 = 4,
+ OBU_METADATA_TYPE_TIMECODE = 5,
+} OBU_METADATA_TYPE;
+
+/*!\brief Returns string representation of OBU_TYPE.
+ *
+ * \param[in] type The OBU_TYPE to convert to string.
+ */
+const char *aom_obu_type_to_string(OBU_TYPE type);
+
+/*!\brief Config Options
+ *
+ * This type allows options defined for control via a config file to be
+ * enumerated and adjusted at runtime.
+ */
+typedef struct cfg_options {
+ /*!\brief Reflects if ext_partition should be enabled
+ *
+ * If this value is non-zero it enables the feature.
+ */
+ unsigned int ext_partition;
+} cfg_options_t;
+
+/*!@} - end defgroup codec*/
+#ifdef __cplusplus
+}
+#endif
+#endif // AOM_AOM_AOM_CODEC_H_
diff --git a/third_party/aom/aom/aom_decoder.h b/third_party/aom/aom/aom_decoder.h
new file mode 100644
index 000000000..06c2dc5f7
--- /dev/null
+++ b/third_party/aom/aom/aom_decoder.h
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_DECODER_H_
+#define AOM_AOM_AOM_DECODER_H_
+
+/*!\defgroup decoder Decoder Algorithm Interface
+ * \ingroup codec
+ * This abstraction allows applications using this decoder to easily support
+ * multiple video formats with minimal code duplication. This section describes
+ * the interface common to all decoders.
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the decoder algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video decoder algorithm.
+ *
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_codec.h"
+#include "aom/aom_frame_buffer.h"
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_DECODER_ABI_VERSION \
+ (3 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
+
+/*! \brief Decoder capabilities bitfield
+ *
+ * Each decoder advertises the capabilities it supports as part of its
+ * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces
+ * or functionality, and are not required to be supported by a decoder.
+ *
+ * The available flags are specified by AOM_CODEC_CAP_* defines.
+ */
+#define AOM_CODEC_CAP_PUT_SLICE 0x10000 /**< Will issue put_slice callbacks */
+#define AOM_CODEC_CAP_PUT_FRAME 0x20000 /**< Will issue put_frame callbacks */
+#define AOM_CODEC_CAP_POSTPROC 0x40000 /**< Can postprocess decoded frame */
+/*!\brief Can receive encoded frames one fragment at a time */
+#define AOM_CODEC_CAP_INPUT_FRAGMENTS 0x100000
+
+/*! \brief Initialization-time Feature Enabling
+ *
+ * Certain codec features must be known at initialization time, to allow for
+ * proper memory allocation.
+ *
+ * The available flags are specified by AOM_CODEC_USE_* defines.
+ */
+/*!\brief Can support frame-based multi-threading */
+#define AOM_CODEC_CAP_FRAME_THREADING 0x200000
+/*!\brief Can support external frame buffers */
+#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000
+
+#define AOM_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
+/*!\brief The input frame should be passed to the decoder one fragment at a
+ * time */
+#define AOM_CODEC_USE_INPUT_FRAGMENTS 0x40000
+
+/*!\brief Stream properties
+ *
+ * This structure is used to query or set properties of the decoded
+ * stream.
+ */
+typedef struct aom_codec_stream_info {
+ unsigned int w; /**< Width (or 0 for unknown/default) */
+ unsigned int h; /**< Height (or 0 for unknown/default) */
+ unsigned int is_kf; /**< Current frame is a keyframe */
+ unsigned int number_spatial_layers; /**< Number of spatial layers */
+ unsigned int number_temporal_layers; /**< Number of temporal layers */
+ unsigned int is_annexb; /**< Is Bitstream in Annex-B format */
+} aom_codec_stream_info_t;
+
+/* REQUIRED FUNCTIONS
+ *
+ * The following functions are required to be implemented for all decoders.
+ * They represent the base case functionality expected of all decoders.
+ */
+
+/*!\brief Initialization Configurations
+ *
+ * This structure is used to pass init time configuration options to the
+ * decoder.
+ */
+typedef struct aom_codec_dec_cfg {
+ unsigned int threads; /**< Maximum number of threads to use, default 1 */
+ unsigned int w; /**< Width */
+ unsigned int h; /**< Height */
+ unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */
+ cfg_options_t cfg; /**< Options defined per config attributes */
+} aom_codec_dec_cfg_t; /**< alias for struct aom_codec_dec_cfg */
+
+/*!\brief Initialize a decoder instance
+ *
+ * Initializes a decoder context using the given interface. Applications
+ * should call the aom_codec_dec_init convenience macro instead of this
+ * function directly, to ensure that the ABI version number parameter
+ * is properly initialized.
+ *
+ * If the library was configured with --disable-multithread, this call
+ * is not thread safe and should be guarded with a lock if being used
+ * in a multithreaded context.
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[in] cfg Configuration to use, if known. May be NULL.
+ * \param[in] flags Bitfield of AOM_CODEC_USE_* flags
+ * \param[in] ver ABI version number. Must be set to
+ * AOM_DECODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
+ * The decoder algorithm initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory allocation failed.
+ */
+aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
+ aom_codec_iface_t *iface,
+ const aom_codec_dec_cfg_t *cfg,
+ aom_codec_flags_t flags, int ver);
+
+/*!\brief Convenience macro for aom_codec_dec_init_ver()
+ *
+ * Ensures the ABI version parameter is properly set.
+ */
+#define aom_codec_dec_init(ctx, iface, cfg, flags) \
+ aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION)
+
+/*!\brief Parse stream info from a buffer
+ *
+ * Performs high level parsing of the bitstream. Construction of a decoder
+ * context is not necessary. Can be used to determine if the bitstream is
+ * of the proper format, and to extract information from the stream.
+ *
+ * \param[in] iface Pointer to the algorithm interface
+ * \param[in] data Pointer to a block of data to parse
+ * \param[in] data_sz Size of the data buffer
+ * \param[in,out] si Pointer to stream info to update. The is_annexb
+ * member \ref MUST be properly initialized. This
+ * function sets the rest of the members.
+ *
+ * \retval #AOM_CODEC_OK
+ * Bitstream is parsable and stream information updated.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * One of the arguments is invalid, for example a NULL pointer.
+ * \retval #AOM_CODEC_UNSUP_BITSTREAM
+ * The decoder didn't recognize the coded data, or the
+ * buffer was too short.
+ */
+aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
+ const uint8_t *data, size_t data_sz,
+ aom_codec_stream_info_t *si);
+
+/*!\brief Return information about the current stream.
+ *
+ * Returns information about the stream that has been parsed during decoding.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] si Pointer to stream info to update.
+ *
+ * \retval #AOM_CODEC_OK
+ * Bitstream is parsable and stream information updated.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * One of the arguments is invalid, for example a NULL pointer.
+ * \retval #AOM_CODEC_UNSUP_BITSTREAM
+ * The decoder couldn't parse the submitted data.
+ */
+aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
+ aom_codec_stream_info_t *si);
+
+/*!\brief Decode data
+ *
+ * Processes a buffer of coded data. If the processing results in a new
+ * decoded frame becoming available, PUT_SLICE and PUT_FRAME events may be
+ * generated, as appropriate. Encoded data \ref MUST be passed in DTS (decode
+ * time stamp) order. Frames produced will always be in PTS (presentation
+ * time stamp) order.
+ * If the decoder is configured with AOM_CODEC_USE_INPUT_FRAGMENTS enabled,
+ * data and data_sz can contain a fragment of the encoded frame. Fragment
+ * \#n must contain at least partition \#n, but can also contain subsequent
+ * partitions (\#n+1 - \#n+i), and if so, fragments \#n+1, .., \#n+i must
+ * be empty. When no more data is available, this function should be called
+ * with NULL as data and 0 as data_sz. The memory passed to this function
+ * must be available until the frame has been decoded.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] data Pointer to this block of new coded data. If
+ * NULL, a AOM_CODEC_CB_PUT_FRAME event is posted
+ * for the previously decoded frame.
+ * \param[in] data_sz Size of the coded data, in bytes.
+ * \param[in] user_priv Application specific data to associate with
+ * this frame.
+ *
+ * \return Returns #AOM_CODEC_OK if the coded data was processed completely
+ * and future pictures can be decoded without error. Otherwise,
+ * see the descriptions of the other error codes in ::aom_codec_err_t
+ * for recoverability capabilities.
+ */
+aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
+ size_t data_sz, void *user_priv);
+
+/*!\brief Decoded frames iterator
+ *
+ * Iterates over a list of the frames available for display. The iterator
+ * storage should be initialized to NULL to start the iteration. Iteration is
+ * complete when this function returns NULL.
+ *
+ * The list of available frames becomes valid upon completion of the
+ * aom_codec_decode call, and remains valid until the next call to
+ * aom_codec_decode.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] iter Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an image, if one is ready for display. Frames
+ * produced will always be in PTS (presentation time stamp) order.
+ */
+aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter);
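+
+/* Example (editorial sketch, not part of the upstream header): a minimal
+ * decode loop built from the functions above. The interface getter
+ * aom_codec_av1_dx() is assumed to come from aom/aomdx.h, and (data, data_sz)
+ * is assumed to hold one complete temporal unit of coded data.
+ * <pre>
+ *     aom_codec_ctx_t ctx;
+ *     if (aom_codec_dec_init(&ctx, aom_codec_av1_dx(), NULL, 0)) return;
+ *     if (aom_codec_decode(&ctx, data, data_sz, NULL) == AOM_CODEC_OK) {
+ *       aom_codec_iter_t iter = NULL;
+ *       aom_image_t *img;
+ *       while ((img = aom_codec_get_frame(&ctx, &iter)) != NULL) {
+ *         // consume img; it remains valid until the next aom_codec_decode
+ *       }
+ *     }
+ *     aom_codec_destroy(&ctx);
+ * </pre>
+ */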
+
+/*!\defgroup cap_put_frame Frame-Based Decoding Functions
+ *
+ * The following functions are required to be implemented for all decoders
+ * that advertise the AOM_CODEC_CAP_PUT_FRAME capability. Calling these
+ * functions for codecs that don't advertise this capability will result in an
+ * error code being returned, usually AOM_CODEC_ERROR.
+ * @{
+ */
+
+/*!\brief put frame callback prototype
+ *
+ * This callback is invoked by the decoder to notify the application of
+ * the availability of decoded image data.
+ */
+typedef void (*aom_codec_put_frame_cb_fn_t)(void *user_priv,
+ const aom_image_t *img);
+
+/*!\brief Register for notification of frame completion.
+ *
+ * Registers a given function to be called when a decoded frame is
+ * available.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] cb Pointer to the callback function
+ * \param[in] user_priv User's private data
+ *
+ * \retval #AOM_CODEC_OK
+ * Callback successfully registered.
+ * \retval #AOM_CODEC_ERROR
+ * Decoder context not initialized, or algorithm not capable of
+ * posting slice completion.
+ */
+aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
+ aom_codec_put_frame_cb_fn_t cb,
+ void *user_priv);
+
+/*!@} - end defgroup cap_put_frame */
+
+/*!\defgroup cap_put_slice Slice-Based Decoding Functions
+ *
+ * The following functions are required to be implemented for all decoders
+ * that advertise the AOM_CODEC_CAP_PUT_SLICE capability. Calling these
+ * functions for codecs that don't advertise this capability will result in an
+ * error code being returned, usually AOM_CODEC_ERROR.
+ * @{
+ */
+
+/*!\brief put slice callback prototype
+ *
+ * This callback is invoked by the decoder to notify the application of
+ * the availability of partially decoded image data.
+ */
+typedef void (*aom_codec_put_slice_cb_fn_t)(void *user_priv,
+ const aom_image_t *img,
+ const aom_image_rect_t *valid,
+ const aom_image_rect_t *update);
+
+/*!\brief Register for notification of slice completion.
+ *
+ * Registers a given function to be called when a decoded slice is
+ * available.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] cb Pointer to the callback function
+ * \param[in] user_priv User's private data
+ *
+ * \retval #AOM_CODEC_OK
+ * Callback successfully registered.
+ * \retval #AOM_CODEC_ERROR
+ * Decoder context not initialized, or algorithm not capable of
+ * posting slice completion.
+ */
+aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
+ aom_codec_put_slice_cb_fn_t cb,
+ void *user_priv);
+
+/*!@} - end defgroup cap_put_slice*/
+
+/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
+ *
+ * The following section is required to be implemented for all decoders
+ * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
+ * Calling this function for codecs that don't advertise this capability
+ * will result in an error code being returned, usually AOM_CODEC_ERROR.
+ *
+ * \note
+ * Currently this only works with AV1.
+ * @{
+ */
+
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libaom needs a frame buffer
+ * to decode the current frame and a function to be called when libaom does
+ * not internally reference the frame buffer. This set function must
+ * be called before the first call to decode or libaom will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] cb_get Pointer to the get callback function
+ * \param[in] cb_release Pointer to the release callback function
+ * \param[in] cb_priv Callback's private data
+ *
+ * \retval #AOM_CODEC_OK
+ * External frame buffers will be used by libaom.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * One or more of the callbacks were NULL.
+ * \retval #AOM_CODEC_ERROR
+ * Decoder context not initialized, or algorithm not capable of
+ * using external frame buffers.
+ *
+ * \note
+ * When decoding AV1, the application may be required to pass in at least
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame buffers.
+ */
+aom_codec_err_t aom_codec_set_frame_buffer_functions(
+ aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+/*!@} - end defgroup cap_external_frame_buffer */
+
+/*!@} - end defgroup decoder*/
+#ifdef __cplusplus
+}
+#endif
+#endif // AOM_AOM_AOM_DECODER_H_
diff --git a/third_party/aom/aom/aom_encoder.h b/third_party/aom/aom/aom_encoder.h
new file mode 100644
index 000000000..0894ca9e3
--- /dev/null
+++ b/third_party/aom/aom/aom_encoder.h
@@ -0,0 +1,981 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_ENCODER_H_
+#define AOM_AOM_AOM_ENCODER_H_
+
+/*!\defgroup encoder Encoder Algorithm Interface
+ * \ingroup codec
+ * This abstraction allows applications using this encoder to easily support
+ * multiple video formats with minimal code duplication. This section describes
+ * the interface common to all encoders.
+ * @{
+ */
+
+/*!\file
+ * \brief Describes the encoder algorithm interface to applications.
+ *
+ * This file describes the interface between an application and a
+ * video encoder algorithm.
+ *
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_codec.h"
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_ENCODER_ABI_VERSION \
+ (5 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
+
+/*! \brief Encoder capabilities bitfield
+ *
+ * Each encoder advertises the capabilities it supports as part of its
+ * ::aom_codec_iface_t interface structure. Capabilities are extra
+ * interfaces or functionality, and are not required to be supported
+ * by an encoder.
+ *
+ * The available flags are specified by AOM_CODEC_CAP_* defines.
+ */
+#define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */
+
+/*! Can support input images with a bit depth greater than 8.
+ */
+#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000
+
+/*! \brief Initialization-time Feature Enabling
+ *
+ * Certain codec features must be known at initialization time, to allow
+ * for proper memory allocation.
+ *
+ * The available flags are specified by AOM_CODEC_USE_* defines.
+ */
+#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
+#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
+
+/*!\brief Generic fixed size buffer structure
+ *
+ * This structure is able to hold a reference to any fixed size buffer.
+ */
+typedef struct aom_fixed_buf {
+ void *buf; /**< Pointer to the data */
+ size_t sz; /**< Length of the buffer, in chars */
+} aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */
+
+/*!\brief Time Stamp Type
+ *
+ * An integer, which when multiplied by the stream's time base, provides
+ * the absolute time of a sample.
+ */
+typedef int64_t aom_codec_pts_t;
+
+/*!\brief Compressed Frame Flags
+ *
+ * This type represents a bitfield containing information about a compressed
+ * frame that may be useful to an application. The most significant 16 bits
+ * can be used by an algorithm to provide additional detail, for example to
+ * support frame types that are codec specific (MPEG-1 D-frames for example)
+ */
+typedef uint32_t aom_codec_frame_flags_t;
+#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+/*!\brief frame can be dropped without affecting the stream (no future frame
+ * depends on this one) */
+#define AOM_FRAME_IS_DROPPABLE 0x2
+/*!\brief frame should be decoded but will not be shown */
+#define AOM_FRAME_IS_INVISIBLE 0x4
+/*!\brief this is a fragment of the encoded frame */
+#define AOM_FRAME_IS_FRAGMENT 0x8
+
+/*!\brief Error Resilient flags
+ *
+ * These flags define which error resilient features to enable in the
+ * encoder. The flags are specified through the
+ * aom_codec_enc_cfg::g_error_resilient variable.
+ */
+typedef uint32_t aom_codec_er_flags_t;
+/*!\brief Improve resiliency against losses of whole frames */
+#define AOM_ERROR_RESILIENT_DEFAULT 0x1
+
+/*!\brief Encoder output packet variants
+ *
+ * This enumeration lists the different kinds of data packets that can be
+ * returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY
+ * extend this list to provide additional functionality.
+ */
+enum aom_codec_cx_pkt_kind {
+ AOM_CODEC_CX_FRAME_PKT, /**< Compressed video frame */
+ AOM_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */
+ AOM_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */
+ AOM_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */
+ AOM_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */
+};
+
+/*!\brief Encoder output packet
+ *
+ * This structure contains the different kinds of output data the encoder
+ * may produce while compressing a frame.
+ */
+typedef struct aom_codec_cx_pkt {
+ enum aom_codec_cx_pkt_kind kind; /**< packet variant */
+ union {
+ struct {
+ void *buf; /**< compressed data buffer */
+ size_t sz; /**< length of compressed data */
+ /*!\brief time stamp to show frame (in timebase units) */
+ aom_codec_pts_t pts;
+ /*!\brief duration to show frame (in timebase units) */
+ unsigned long duration;
+ aom_codec_frame_flags_t flags; /**< flags for this frame */
+ /*!\brief the partition id defines the decoding order of the partitions.
+ * Only applicable when "output partition" mode is enabled. First
+ * partition has id 0.*/
+ int partition_id;
+ /*!\brief size of the visible frame in this packet */
+ size_t vis_frame_size;
+ } frame; /**< data for compressed frame packet */
+ aom_fixed_buf_t twopass_stats; /**< data for two-pass packet */
+ aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
+ struct aom_psnr_pkt {
+ unsigned int samples[4]; /**< Number of samples, total/y/u/v */
+ uint64_t sse[4]; /**< sum squared error, total/y/u/v */
+ double psnr[4]; /**< PSNR, total/y/u/v */
+ } psnr; /**< data for PSNR packet */
+ aom_fixed_buf_t raw; /**< data for arbitrary packets */
+
+ /* This packet size is fixed to allow codecs to extend this
+ * interface without having to manage storage for raw packets,
+ * i.e., if it's smaller than 128 bytes, you can store in the
+ * packet list directly.
+ */
+ char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */
+ } data; /**< packet data */
+} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */
+
+/*!\brief Rational Number
+ *
+ * This structure holds a fractional value.
+ */
+typedef struct aom_rational {
+ int num; /**< fraction numerator */
+ int den; /**< fraction denominator */
+} aom_rational_t; /**< alias for struct aom_rational */
+
+/*!\brief Multi-pass Encoding Pass */
+enum aom_enc_pass {
+ AOM_RC_ONE_PASS, /**< Single pass mode */
+ AOM_RC_FIRST_PASS, /**< First pass of multi-pass mode */
+ AOM_RC_LAST_PASS /**< Final pass of multi-pass mode */
+};
+
+/*!\brief Rate control mode */
+enum aom_rc_mode {
+ AOM_VBR, /**< Variable Bit Rate (VBR) mode */
+ AOM_CBR, /**< Constant Bit Rate (CBR) mode */
+ AOM_CQ, /**< Constrained Quality (CQ) mode */
+ AOM_Q, /**< Constant Quality (Q) mode */
+};
+
+/*!\brief Keyframe placement mode.
+ *
+ * This enumeration determines whether keyframes are placed automatically by
+ * the encoder or whether this behavior is disabled. Older releases of this
+ * SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled.
+ * This name is confusing for this behavior, so the new symbols to be used
+ * are AOM_KF_AUTO and AOM_KF_DISABLED.
+ */
+enum aom_kf_mode {
+ AOM_KF_FIXED, /**< deprecated, implies AOM_KF_DISABLED */
+ AOM_KF_AUTO, /**< Encoder determines optimal placement automatically */
+ AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
+};
+
+/*!\brief Encoded Frame Flags
+ *
+ * This type indicates a bitfield to be passed to aom_codec_encode(), defining
+ * per-frame boolean values. By convention, bits common to all codecs will be
+ * named AOM_EFLAG_*, and bits specific to an algorithm will be named
+ * /algo/_eflag_*. The lower order 16 bits are reserved for common use.
+ */
+typedef long aom_enc_frame_flags_t;
+#define AOM_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */
+
+/*!\brief Encoder configuration structure
+ *
+ * This structure contains the encoder settings that have common representations
+ * across all codecs. This doesn't imply that all codecs support all features,
+ * however.
+ */
+typedef struct aom_codec_enc_cfg {
+ /*
+ * generic settings (g)
+ */
+
+ /*!\brief Algorithm specific "usage" value
+ *
+ * Algorithms may define multiple values for usage, which may convey the
+ * intent of how the application intends to use the stream. If this value
+ * is non-zero, consult the documentation for the codec to determine its
+ * meaning.
+ */
+ unsigned int g_usage;
+
+ /*!\brief Maximum number of threads to use
+ *
+ * For multi-threaded implementations, use no more than this number of
+ * threads. The codec may use fewer threads than allowed. The value
+ * 0 is equivalent to the value 1.
+ */
+ unsigned int g_threads;
+
+ /*!\brief Bitstream profile to use
+ *
+ * Some codecs support a notion of multiple bitstream profiles. Typically
+ * this maps to a set of features that are turned on or off. Often the
+ * profile to use is determined by the features of the intended decoder.
+ * Consult the documentation for the codec to determine the valid values
+ * for this parameter, or set to zero for a sane default.
+ */
+ unsigned int g_profile; /**< profile of bitstream to use */
+
+ /*!\brief Width of the frame
+ *
+ * This value identifies the presentation resolution of the frame,
+ * in pixels. Note that the frames passed as input to the encoder must
+ * have this resolution. Frames will be presented by the decoder in this
+ * resolution, independent of any spatial resampling the encoder may do.
+ */
+ unsigned int g_w;
+
+ /*!\brief Height of the frame
+ *
+ * This value identifies the presentation resolution of the frame,
+ * in pixels. Note that the frames passed as input to the encoder must
+ * have this resolution. Frames will be presented by the decoder in this
+ * resolution, independent of any spatial resampling the encoder may do.
+ */
+ unsigned int g_h;
+
+ /*!\brief Max number of frames to encode
+ *
+ */
+ unsigned int g_limit;
+
+ /*!\brief Forced maximum width of the frame
+ *
+ * If this value is non-zero then it is used to force the maximum frame
+ * width written in write_sequence_header().
+ */
+ unsigned int g_forced_max_frame_width;
+
+ /*!\brief Forced maximum height of the frame
+ *
+ * If this value is non-zero then it is used to force the maximum frame
+ * height written in write_sequence_header().
+ */
+ unsigned int g_forced_max_frame_height;
+
+ /*!\brief Bit-depth of the codec
+ *
+ * This value identifies the bit_depth of the codec,
+ * Only certain bit-depths are supported as identified in the
+ * aom_bit_depth_t enum.
+ */
+ aom_bit_depth_t g_bit_depth;
+
+ /*!\brief Bit-depth of the input frames
+ *
+ * This value identifies the bit_depth of the input frames in bits.
+ * Note that the frames passed as input to the encoder must have
+ * this bit-depth.
+ */
+ unsigned int g_input_bit_depth;
+
+ /*!\brief Stream timebase units
+ *
+ * Indicates the smallest interval of time, in seconds, used by the stream.
+ * For fixed frame rate material, or variable frame rate material where
+ * frames are timed at a multiple of a given clock (ex: video capture),
+ * the \ref RECOMMENDED method is to set the timebase to the reciprocal
+ * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). This allows the
+ * pts to correspond to the frame number, which can be handy. For
+ * re-encoding video from containers with absolute time timestamps, the
+ * \ref RECOMMENDED method is to set the timebase to that of the parent
+ * container or multimedia framework (ex: 1/1000 for ms, as in FLV).
+ */
+ struct aom_rational g_timebase;
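+
+ /* Example (editorial sketch, not part of the upstream header), assuming a
+ * configuration variable `cfg` of type aom_codec_enc_cfg_t: for 29.970 Hz
+ * NTSC material the RECOMMENDED reciprocal timebase is
+ * <pre>
+ *     cfg.g_timebase.num = 1001;
+ *     cfg.g_timebase.den = 30000;
+ * </pre>
+ * so that pts values correspond to frame numbers. */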
+
+ /*!\brief Enable error resilient modes.
+ *
+ * The error resilient bitfield indicates to the encoder which features
+ * it should enable to take measures for streaming over lossy or noisy
+ * links.
+ */
+ aom_codec_er_flags_t g_error_resilient;
+
+ /*!\brief Multi-pass Encoding Mode
+ *
+ * This value should be set to the current phase for multi-pass encoding.
+ * For single pass, set to #AOM_RC_ONE_PASS.
+ */
+ enum aom_enc_pass g_pass;
+
+ /*!\brief Allow lagged encoding
+ *
+ * If set, this value allows the encoder to consume a number of input
+ * frames before producing output frames. This allows the encoder to
+ * base decisions for the current frame on future frames. This does
+ * increase the latency of the encoding pipeline, so it is not appropriate
+ * in all situations (ex: realtime encoding).
+ *
+ * Note that this is a maximum value -- the encoder may produce frames
+ * sooner than the given limit. Set this value to 0 to disable this
+ * feature.
+ */
+ unsigned int g_lag_in_frames;
+
+ /*
+ * rate control settings (rc)
+ */
+
+ /*!\brief Temporal resampling configuration, if supported by the codec.
+ *
+ * Temporal resampling allows the codec to "drop" frames as a strategy to
+ * meet its target data rate. This can cause temporal discontinuities in
+ * the encoded video, which may appear as stuttering during playback. This
+ * trade-off is often acceptable, but for many applications is not. It can
+ * be disabled in these cases.
+ *
+ * Note that not all codecs support this feature. All aom AVx codecs do.
+ * For other codecs, consult the documentation for that algorithm.
+ *
+ * This threshold is described as a percentage of the target data buffer.
+ * When the data buffer falls below this percentage of fullness, a
+ * dropped frame is indicated. Set the threshold to zero (0) to disable
+ * this feature.
+ */
+ unsigned int rc_dropframe_thresh;
+
+ /*!\brief Mode for spatial resampling, if supported by the codec.
+ *
+ * Spatial resampling allows the codec to compress a lower resolution
+ * version of the frame, which is then upscaled by the decoder to the
+ * correct presentation resolution. This increases visual quality at
+ * low data rates, at the expense of CPU time on the encoder/decoder.
+ */
+ unsigned int rc_resize_mode;
+
+ /*!\brief Frame resize denominator.
+ *
+ * The denominator for resize to use, assuming 8 as the numerator.
+ *
+ * Valid denominators are 8 - 16 for now.
+ */
+ unsigned int rc_resize_denominator;
+
+ /*!\brief Keyframe resize denominator.
+ *
+ * The denominator for resize to use, assuming 8 as the numerator.
+ *
+ * Valid denominators are 8 - 16 for now.
+ */
+ unsigned int rc_resize_kf_denominator;
+
+ /*!\brief Frame super-resolution scaling mode.
+ *
+ * Similar to spatial resampling, frame super-resolution integrates
+ * upscaling after the encode/decode process. Taking control of upscaling and
+ * using restoration filters should allow it to outperform normal resizing.
+ *
+ * Mode 0 is SUPERRES_NONE, mode 1 is SUPERRES_FIXED, mode 2 is
+ * SUPERRES_RANDOM and mode 3 is SUPERRES_QTHRESH.
+ */
+ unsigned int rc_superres_mode;
+
+ /*!\brief Frame super-resolution denominator.
+ *
+ * The denominator for superres to use. If fixed it will only change if the
+ * cumulative scale change over resizing and superres is greater than 1/2;
+ * this forces superres to reduce scaling.
+ *
+ * Valid denominators are 8 to 16.
+ *
+ * Used only by SUPERRES_FIXED.
+ */
+ unsigned int rc_superres_denominator;
+
+ /*!\brief Keyframe super-resolution denominator.
+ *
+ * The denominator for superres to use. If fixed it will only change if the
+ * cumulative scale change over resizing and superres is greater than 1/2;
+ * this forces superres to reduce scaling.
+ *
+ * Valid denominators are 8 - 16 for now.
+ */
+ unsigned int rc_superres_kf_denominator;
+
+ /*!\brief Frame super-resolution q threshold.
+ *
+ * The q level threshold after which superres is used.
+ * Valid values are 1 to 63.
+ *
+ * Used only by SUPERRES_QTHRESH
+ */
+ unsigned int rc_superres_qthresh;
+
+ /*!\brief Keyframe super-resolution q threshold.
+ *
+ * The q level threshold after which superres is used for key frames.
+ * Valid values are 1 to 63.
+ *
+ * Used only by SUPERRES_QTHRESH
+ */
+ unsigned int rc_superres_kf_qthresh;
+
+ /*!\brief Rate control algorithm to use.
+ *
+ * Indicates whether the end usage of this stream is to be streamed over
+ * a bandwidth constrained link, indicating that Constant Bit Rate (CBR)
+ * mode should be used, or whether it will be played back on a high
+ * bandwidth link, as from a local disk, where higher variations in
+ * bitrate are acceptable.
+ */
+ enum aom_rc_mode rc_end_usage;
+
+ /*!\brief Two-pass stats buffer.
+ *
+ * A buffer containing all of the stats packets produced in the first
+ * pass, concatenated.
+ */
+ aom_fixed_buf_t rc_twopass_stats_in;
+
+ /*!\brief first pass mb stats buffer.
+ *
+ * A buffer containing all of the first pass mb stats packets produced
+ * in the first pass, concatenated.
+ */
+ aom_fixed_buf_t rc_firstpass_mb_stats_in;
+
+ /*!\brief Target data rate
+ *
+ * Target bandwidth to use for this stream, in kilobits per second.
+ */
+ unsigned int rc_target_bitrate;
+
+ /*
+ * quantizer settings
+ */
+
+ /*!\brief Minimum (Best Quality) Quantizer
+ *
+ * The quantizer is the most direct control over the quality of the
+ * encoded image. The range of valid values for the quantizer is codec
+ * specific. Consult the documentation for the codec to determine the
+ * values to use. To determine the range programmatically, call
+ * aom_codec_enc_config_default() with a usage value of 0.
+ */
+ unsigned int rc_min_quantizer;
+
+ /*!\brief Maximum (Worst Quality) Quantizer
+ *
+ * The quantizer is the most direct control over the quality of the
+ * encoded image. The range of valid values for the quantizer is codec
+ * specific. Consult the documentation for the codec to determine the
+ * values to use. To determine the range programmatically, call
+ * aom_codec_enc_config_default() with a usage value of 0.
+ */
+ unsigned int rc_max_quantizer;
+
+ /*
+ * bitrate tolerance
+ */
+
+ /*!\brief Rate control adaptation undershoot control
+ *
+ * This value, expressed as a percentage of the target bitrate,
+ * controls the maximum allowed adaptation speed of the codec.
+ * This factor controls the maximum amount of bits that can
+ * be subtracted from the target bitrate in order to compensate
+ * for prior overshoot.
+ *
+ * Valid values in the range 0-1000.
+ */
+ unsigned int rc_undershoot_pct;
+
+ /*!\brief Rate control adaptation overshoot control
+ *
+ * This value, expressed as a percentage of the target bitrate,
+ * controls the maximum allowed adaptation speed of the codec.
+ * This factor controls the maximum amount of bits that can
+ * be added to the target bitrate in order to compensate for
+ * prior undershoot.
+ *
+ * Valid values in the range 0-1000.
+ */
+ unsigned int rc_overshoot_pct;
+
+ /*
+ * decoder buffer model parameters
+ */
+
+ /*!\brief Decoder Buffer Size
+ *
+ * This value indicates the amount of data that may be buffered by the
+ * decoding application. Note that this value is expressed in units of
+ * time (milliseconds). For example, a value of 5000 indicates that the
+ * client will buffer (at least) 5000ms worth of encoded data. Use the
+ * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if
+ * necessary.
+ */
+ unsigned int rc_buf_sz;
+
+ /*!\brief Decoder Buffer Initial Size
+ *
+ * This value indicates the amount of data that will be buffered by the
+ * decoding application prior to beginning playback. This value is
+ * expressed in units of time (milliseconds). Use the target bitrate
+ * (#rc_target_bitrate) to convert to bits/bytes, if necessary.
+ */
+ unsigned int rc_buf_initial_sz;
+
+ /*!\brief Decoder Buffer Optimal Size
+ *
+ * This value indicates the amount of data that the encoder should try
+ * to maintain in the decoder's buffer. This value is expressed in units
+ * of time (milliseconds). Use the target bitrate (#rc_target_bitrate)
+ * to convert to bits/bytes, if necessary.
+ */
+ unsigned int rc_buf_optimal_sz;
+
+ /*
+ * 2 pass rate control parameters
+ */
+
+ /*!\brief Two-pass mode CBR/VBR bias
+ *
+ * Bias, expressed on a scale of 0 to 100, for determining target size
+ * for the current frame. The value 0 indicates the optimal CBR mode
+ * value should be used. The value 100 indicates the optimal VBR mode
+ * value should be used. Values in between indicate which way the
+ * encoder should "lean."
+ */
+ unsigned int rc_2pass_vbr_bias_pct;
+
+ /*!\brief Two-pass mode per-GOP minimum bitrate
+ *
+ * This value, expressed as a percentage of the target bitrate, indicates
+ * the minimum bitrate to be used for a single GOP (aka "section")
+ */
+ unsigned int rc_2pass_vbr_minsection_pct;
+
+ /*!\brief Two-pass mode per-GOP maximum bitrate
+ *
+ * This value, expressed as a percentage of the target bitrate, indicates
+ * the maximum bitrate to be used for a single GOP (aka "section")
+ */
+ unsigned int rc_2pass_vbr_maxsection_pct;
+
+ /*
+ * keyframing settings (kf)
+ */
+
+ /*!\brief Option to enable forward reference key frame
+ *
+ */
+ int fwd_kf_enabled;
+
+ /*!\brief Keyframe placement mode
+ *
+ * This value indicates whether the encoder should place keyframes at a
+ * fixed interval, or determine the optimal placement automatically
+ * (as governed by the #kf_min_dist and #kf_max_dist parameters)
+ */
+ enum aom_kf_mode kf_mode;
+
+ /*!\brief Keyframe minimum interval
+ *
+ * This value, expressed as a number of frames, prevents the encoder from
+ * placing a keyframe nearer than kf_min_dist to the previous keyframe. At
+ * least kf_min_dist non-keyframes will be coded before the next keyframe.
+ * Set kf_min_dist equal to kf_max_dist for a fixed interval.
+ */
+ unsigned int kf_min_dist;
+
+ /*!\brief Keyframe maximum interval
+ *
+ * This value, expressed as a number of frames, forces the encoder to code
+ * a keyframe if one has not been coded in the last kf_max_dist frames.
+ * A value of 0 implies all frames will be keyframes. Set kf_min_dist
+ * equal to kf_max_dist for a fixed interval.
+ */
+ unsigned int kf_max_dist;
+
+ /*!\brief sframe interval
+ *
+ * This value, expressed as a number of frames, forces the encoder to code
+ * an S-Frame every sframe_dist frames.
+ */
+ unsigned int sframe_dist;
+
+ /*!\brief sframe insertion mode
+ *
+ * This value must be set to 1 or 2, and tells the encoder how to insert
+ * S-Frames. It will only have an effect if sframe_dist != 0.
+ *
+ * If altref is enabled:
+ * - if sframe_mode == 1, the considered frame will be made into an
+ * S-Frame only if it is an altref frame
+ * - if sframe_mode == 2, the next altref frame will be made into an
+ * S-Frame.
+ *
+ * Otherwise: the considered frame will be made into an S-Frame.
+ */
+ unsigned int sframe_mode;
+
+ /*!\brief Tile coding mode
+ *
+ * This value indicates the tile coding mode.
+ * A value of 0 implies normal (non-large-scale) tile coding. A value of 1
+ * implies large-scale tile coding.
+ */
+ unsigned int large_scale_tile;
+
+ /*!\brief Monochrome mode
+ *
+ * If this is nonzero, the encoder will generate a monochrome stream
+ * with no chroma planes.
+ */
+ unsigned int monochrome;
+
+ /*!\brief full_still_picture_hdr
+ *
+ * If this is nonzero, the encoder will generate a full header even for
+ * still-picture encoding. If zero, a reduced header is used for still
+ * pictures. This flag has no effect when a regular video with more than
+ * a single frame is encoded.
+ */
+ unsigned int full_still_picture_hdr;
+
+ /*!\brief Bitstream syntax mode
+ *
+ * This value indicates the bitstream syntax mode.
+ * A value of 0 indicates the bitstream is saved as a Section 5 bitstream. A
+ * value of 1 indicates the bitstream is saved in Annex B format.
+ */
+ unsigned int save_as_annexb;
+
+ /*!\brief Number of explicit tile widths specified
+ *
+ * This value indicates the number of tile widths specified.
+ * A value of 0 implies no tile widths are specified.
+ * Tile widths are given in the array tile_widths[].
+ */
+ int tile_width_count;
+
+ /*!\brief Number of explicit tile heights specified
+ *
+ * This value indicates the number of tile heights specified.
+ * A value of 0 implies no tile heights are specified.
+ * Tile heights are given in the array tile_heights[].
+ */
+ int tile_height_count;
+
+/*!\brief Maximum number of tile widths in tile widths array
+ *
+ * This define gives the maximum number of elements in the tile_widths array.
+ */
+#define MAX_TILE_WIDTHS 64 // maximum tile width array length
+
+ /*!\brief Array of specified tile widths
+ *
+ * This array specifies tile widths (and may be empty).
+ * The number of widths specified is given by tile_width_count.
+ */
+ int tile_widths[MAX_TILE_WIDTHS];
+
+/*!\brief Maximum number of tile heights in tile heights array.
+ *
+ * This define gives the maximum number of elements in the tile_heights array.
+ */
+#define MAX_TILE_HEIGHTS 64 // maximum tile height array length
+
+ /*!\brief Array of specified tile heights
+ *
+ * This array specifies tile heights (and may be empty).
+ * The number of heights specified is given by tile_height_count.
+ */
+ int tile_heights[MAX_TILE_HEIGHTS];
+
+ /*!\brief Options defined per config file
+ *
+ */
+ cfg_options_t cfg;
+} aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */
+
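A minimal sketch (illustrative only, not from the aom sources above) of how an application might populate the rate-control and keyframe fields documented in aom_codec_enc_cfg_t. It assumes the AOM_VBR and AOM_KF_AUTO enumerators and the AOM_CODEC_OK return code defined earlier in this header set, plus aom_codec_enc_config_default() declared further down in this file:

    #include "aom/aom_encoder.h"

    static void example_configure(aom_codec_iface_t *iface,
                                  aom_codec_enc_cfg_t *cfg) {
      /* Start from the codec's defaults, then specialize. */
      if (aom_codec_enc_config_default(iface, cfg, 0) != AOM_CODEC_OK) return;
      cfg->g_lag_in_frames = 19;     /* allow lagged (lookahead) encoding */
      cfg->rc_end_usage = AOM_VBR;   /* local playback: variable bitrate */
      cfg->rc_target_bitrate = 1000; /* kilobits per second */
      cfg->rc_min_quantizer = 10;    /* best-quality bound */
      cfg->rc_max_quantizer = 53;    /* worst-quality bound */
      cfg->kf_mode = AOM_KF_AUTO;    /* automatic keyframe placement */
      cfg->kf_max_dist = 240;        /* at most 240 frames between keyframes */
    }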
+/*!\brief Initialize an encoder instance
+ *
+ * Initializes an encoder context using the given interface. Applications
+ * should call the aom_codec_enc_init convenience macro instead of this
+ * function directly, to ensure that the ABI version number parameter
+ * is properly initialized.
+ *
+ * If the library was configured with --disable-multithread, this call
+ * is not thread safe and should be guarded with a lock if being used
+ * in a multithreaded context.
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[in] cfg Configuration to use, if known.
+ * \param[in] flags Bitfield of AOM_CODEC_USE_* flags
+ * \param[in] ver ABI version number. Must be set to
+ * AOM_ENCODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
+ * The encoder algorithm has been initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory allocation failed.
+ */
+aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
+ aom_codec_iface_t *iface,
+ const aom_codec_enc_cfg_t *cfg,
+ aom_codec_flags_t flags, int ver);
+
+/*!\brief Convenience macro for aom_codec_enc_init_ver()
+ *
+ * Ensures the ABI version parameter is properly set.
+ */
+#define aom_codec_enc_init(ctx, iface, cfg, flags) \
+ aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION)
+
+/*!\brief Initialize multi-encoder instance
+ *
+ * Initializes a multi-encoder context using the given interface.
+ * Applications should call the aom_codec_enc_init_multi convenience macro
+ * instead of this function directly, to ensure that the ABI version number
+ * parameter is properly initialized.
+ *
+ * \param[in] ctx Pointer to this instance's context.
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[in] cfg Configuration to use, if known.
+ * \param[in] num_enc Total number of encoders.
+ * \param[in] flags Bitfield of AOM_CODEC_USE_* flags
+ * \param[in] dsf Pointer to down-sampling factors.
+ * \param[in] ver ABI version number. Must be set to
+ * AOM_ENCODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
+ * The encoder algorithm has been initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory allocation failed.
+ */
+aom_codec_err_t aom_codec_enc_init_multi_ver(
+ aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
+ int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver);
+
+/*!\brief Convenience macro for aom_codec_enc_init_multi_ver()
+ *
+ * Ensures the ABI version parameter is properly set.
+ */
+#define aom_codec_enc_init_multi(ctx, iface, cfg, num_enc, flags, dsf) \
+ aom_codec_enc_init_multi_ver(ctx, iface, cfg, num_enc, flags, dsf, \
+ AOM_ENCODER_ABI_VERSION)
+
+/*!\brief Get a default configuration
+ *
+ * Initializes an encoder configuration structure with default values. Supports
+ * the notion of "usages" so that an algorithm may offer different default
+ * settings depending on the user's intended goal. This function \ref SHOULD
+ * be called by all applications to initialize the configuration structure
+ * before specializing the configuration with application specific values.
+ *
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[out] cfg Configuration buffer to populate.
+ * \param[in] reserved Must be set to 0.
+ *
+ * \retval #AOM_CODEC_OK
+ * The configuration was populated.
+ * \retval #AOM_CODEC_INCAPABLE
+ * Interface is not an encoder interface.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * A parameter was NULL, or the usage value was not recognized.
+ */
+aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
+ aom_codec_enc_cfg_t *cfg,
+ unsigned int reserved);
+
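A minimal sketch of the setup order the comments above prescribe: obtain defaults with aom_codec_enc_config_default(), specialize the configuration, then initialize with the aom_codec_enc_init() convenience macro. It assumes the g_w/g_h fields documented earlier in this header and the aom_codec_av1_cx() interface getter declared in aomcx.h later in this diff:

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h"

    static int example_open_encoder(aom_codec_ctx_t *ctx, unsigned int width,
                                    unsigned int height) {
      aom_codec_iface_t *iface = aom_codec_av1_cx();
      aom_codec_enc_cfg_t cfg;

      if (aom_codec_enc_config_default(iface, &cfg, 0) != AOM_CODEC_OK) return -1;
      cfg.g_w = width;             /* frame width, in pixels */
      cfg.g_h = height;            /* frame height, in pixels */
      cfg.rc_target_bitrate = 500; /* kilobits per second */

      /* The convenience macro supplies AOM_ENCODER_ABI_VERSION. */
      if (aom_codec_enc_init(ctx, iface, &cfg, 0) != AOM_CODEC_OK) return -1;
      return 0;
    }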
+/*!\brief Set or change configuration
+ *
+ * Reconfigures an encoder instance according to the given configuration.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] cfg Configuration buffer to use
+ *
+ * \retval #AOM_CODEC_OK
+ * The configuration was applied.
+ * \retval #AOM_CODEC_INCAPABLE
+ * Interface is not an encoder interface.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * A parameter was NULL, or the usage value was not recognized.
+ */
+aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
+ const aom_codec_enc_cfg_t *cfg);
+
+/*!\brief Get global stream headers
+ *
+ * Retrieves a stream level global header packet, if supported by the codec.
+ * Calls to this function should be deferred until all configuration information
+ * has been passed to libaom. Otherwise the global header data may be
+ * invalidated by additional configuration changes.
+ *
+ * The AV1 implementation of this function returns an OBU. The OBU returned is
+ * in Low Overhead Bitstream Format. Specifically, the obu_has_size_field bit is
+ * set, and the buffer contains the obu_size field for the returned OBU.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ *
+ * \retval NULL
+ * Encoder does not support global header, or an error occurred while
+ * generating the global header.
+ *
+ * \retval Non-NULL
+ * Pointer to buffer containing global header packet. The caller owns the
+ * memory associated with this buffer, and must free the 'buf' member of the
+ * aom_fixed_buf_t as well as the aom_fixed_buf_t pointer. Memory returned
+ * must be freed via call to free().
+ */
+aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
+
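A short sketch of the ownership rule described above, assuming the buf and sz members of aom_fixed_buf_t defined earlier in this header:

    #include <stdlib.h>
    #include "aom/aom_encoder.h"

    static void example_emit_global_headers(aom_codec_ctx_t *ctx) {
      aom_fixed_buf_t *hdr = aom_codec_get_global_headers(ctx);
      if (hdr == NULL) return; /* unsupported, or an error occurred */
      /* ... hand hdr->buf (hdr->sz bytes) to the muxer here ... */
      free(hdr->buf); /* the caller owns both the data and the struct */
      free(hdr);
    }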
+/*!\brief Encode a frame
+ *
+ * Encodes a video frame at the given "presentation time." The presentation
+ * time stamp (PTS) \ref MUST be strictly increasing.
+ *
+ * When the last frame has been passed to the encoder, this function should
+ * continue to be called, with the img parameter set to NULL. This will
+ * signal the end-of-stream condition to the encoder and allow it to encode
+ * any held buffers. Encoding is complete when aom_codec_encode() is called
+ * and aom_codec_get_cx_data() returns no data.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] img Image data to encode, NULL to flush.
+ * \param[in] pts Presentation time stamp, in timebase units.
+ * \param[in] duration Duration to show frame, in timebase units.
+ * \param[in] flags Flags to use for encoding this frame.
+ *
+ * \retval #AOM_CODEC_OK
+ * The frame was successfully submitted for encoding.
+ * \retval #AOM_CODEC_INCAPABLE
+ * Interface is not an encoder interface.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * A parameter was NULL, the image format is unsupported, etc.
+ */
+aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
+ aom_codec_pts_t pts, unsigned long duration,
+ aom_enc_frame_flags_t flags);
+
+/*!\brief Set compressed data output buffer
+ *
+ * Sets the buffer that the codec should output the compressed data
+ * into. This call effectively sets the buffer pointer returned in the
+ * next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be
+ * appended into this buffer. The buffer is preserved across frames,
+ * so applications must periodically call this function after flushing
+ * the accumulated compressed data to disk or to the network to reset
+ * the pointer to the buffer's head.
+ *
+ * `pad_before` bytes will be skipped before writing the compressed
+ * data, and `pad_after` bytes will be appended to the packet. The size
+ * of the packet will be the sum of the size of the actual compressed
+ * data, pad_before, and pad_after. The padding bytes will be preserved
+ * (not overwritten).
+ *
+ * Note that calling this function does not guarantee that the returned
+ * compressed data will be placed into the specified buffer. In the
+ * event that the encoded data will not fit into the buffer provided,
+ * the returned packet \ref MAY point to an internal buffer, as it would
+ * if this call were never used. In this event, the output packet will
+ * NOT have any padding, and the application must free space and copy it
+ * to the proper place. This is of particular note in configurations
+ * that may output multiple packets for a single encoded frame (e.g., lagged
+ * encoding) or if the application does not reset the buffer periodically.
+ *
+ * Applications may restore the default behavior of the codec providing
+ * the compressed data buffer by calling this function with a NULL
+ * buffer.
+ *
+ * Applications \ref MUSTNOT call this function during iteration of
+ * aom_codec_get_cx_data().
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] buf Buffer to store compressed data into
+ * \param[in] pad_before Bytes to skip before writing compressed data
+ * \param[in] pad_after Bytes to skip after writing compressed data
+ *
+ * \retval #AOM_CODEC_OK
+ * The buffer was set successfully.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * A parameter was NULL, the image format is unsupported, etc.
+ */
+aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
+ const aom_fixed_buf_t *buf,
+ unsigned int pad_before,
+ unsigned int pad_after);
+
+/*!\brief Encoded data iterator
+ *
+ * Iterates over a list of data packets to be passed from the encoder to the
+ * application. The different kinds of packets available are enumerated in
+ * #aom_codec_cx_pkt_kind.
+ *
+ * #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's
+ * muxer. Multiple compressed frames may be in the list.
+ * #AOM_CODEC_STATS_PKT packets should be appended to a global buffer.
+ *
+ * The application \ref MUST silently ignore any packet kinds that it does
+ * not recognize or support.
+ *
+ * The data buffers returned from this function are only guaranteed to be
+ * valid until the application makes another call to any aom_codec_* function.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] iter Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an output data packet (compressed frame data,
+ * two-pass statistics, etc.) or NULL to signal end-of-list.
+ *
+ */
+const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
+ aom_codec_iter_t *iter);
+
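A sketch of the encode-and-drain pattern described above, assuming the aom_codec_cx_pkt_t layout (kind, data.frame.buf, data.frame.sz) and the AOM_CODEC_CX_FRAME_PKT kind defined earlier in this header. Passing img == NULL flushes the encoder; encoding is finished once a flush call yields no packets:

    #include <stdio.h>
    #include "aom/aom_encoder.h"

    /* Returns 1 if packets were produced, 0 if none, -1 on error. */
    static int example_encode_step(aom_codec_ctx_t *ctx, const aom_image_t *img,
                                   aom_codec_pts_t pts, FILE *out) {
      aom_codec_iter_t iter = NULL;
      const aom_codec_cx_pkt_t *pkt;
      int got_data = 0;

      if (aom_codec_encode(ctx, img, pts, 1, 0) != AOM_CODEC_OK) return -1;

      while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
        got_data = 1;
        if (pkt->kind == AOM_CODEC_CX_FRAME_PKT)
          fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out);
        /* Other packet kinds (e.g. two-pass stats) are ignored in this sketch. */
      }
      return got_data;
    }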
+/*!\brief Get Preview Frame
+ *
+ * Returns an image that can be used as a preview. Shows the image as it would
+ * exist at the decompressor. The application \ref MUST NOT write into this
+ * image buffer.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ *
+ * \return Returns a pointer to a preview image, or NULL if no image is
+ * available.
+ *
+ */
+const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx);
+
+/*!@} - end defgroup encoder*/
+#ifdef __cplusplus
+}
+#endif
+#endif // AOM_AOM_AOM_ENCODER_H_
diff --git a/third_party/aom/aom/aom_frame_buffer.h b/third_party/aom/aom/aom_frame_buffer.h
new file mode 100644
index 000000000..fba4322f8
--- /dev/null
+++ b/third_party/aom/aom/aom_frame_buffer.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_AOM_FRAME_BUFFER_H_
+#define AOM_AOM_AOM_FRAME_BUFFER_H_
+
+/*!\file
+ * \brief Describes the decoder external frame buffer interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_integer.h"
+
+/*!\brief The maximum number of work buffers used by libaom.
+ * Support maximum 4 threads to decode video in parallel.
+ * Each thread will use one work buffer.
+ * TODO(hkuang): Add support to set number of worker threads dynamically.
+ */
+#define AOM_MAXIMUM_WORK_BUFFERS 8
+
+/*!\brief The maximum number of reference buffers that an AV1 encoder may use.
+ */
+#define AOM_MAXIMUM_REF_BUFFERS 8
+
+/*!\brief External frame buffer
+ *
+ * This structure holds allocated frame buffers used by the decoder.
+ */
+typedef struct aom_codec_frame_buffer {
+ uint8_t *data; /**< Pointer to the data buffer */
+ size_t size; /**< Size of data in bytes */
+ void *priv; /**< Frame's private data */
+} aom_codec_frame_buffer_t;
+
+/*!\brief get frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder to retrieve data for the frame
+ * buffer in order for the decode call to complete. The callback must
+ * allocate at least min_size in bytes and assign it to fb->data. The callback
+ * must zero out all the data allocated. Then the callback must set fb->size
+ * to the allocated size. The application does not need to align the allocated
+ * data. The callback is triggered when the decoder needs a frame buffer to
+ * decode a compressed image into. This function may be called more than once
+ * for every call to aom_codec_decode. The application may set fb->priv to
+ * some data which will be passed back in the ximage and the release function
+ * call. |fb| is guaranteed to not be NULL. On success the callback must
+ * return 0. On any failure the callback must return a value less than 0.
+ *
+ * \param[in] priv Callback's private data
+ * \param[in] min_size Minimum size in bytes needed by the buffer
+ * \param[in,out] fb Pointer to aom_codec_frame_buffer_t
+ */
+typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb);
+
+/*!\brief release frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder when the frame buffer is not
+ * referenced by any other buffers. |fb| is guaranteed to not be NULL. On
+ * success the callback must return 0. On any failure the callback must
+ * return a value less than 0.
+ *
+ * \param[in] priv Callback's private data
+ * \param[in] fb Pointer to aom_codec_frame_buffer_t
+ */
+typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv,
+ aom_codec_frame_buffer_t *fb);
+
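A minimal callback pair satisfying the contract above: hand back zeroed storage of at least min_size bytes, record the size in fb->size, and return 0 on success. Such callbacks are typically registered with the decoder through the frame-buffer registration call in aom_decoder.h (not shown in this section):

    #include <stdlib.h>
    #include "aom/aom_frame_buffer.h"

    static int example_get_frame_buffer(void *priv, size_t min_size,
                                        aom_codec_frame_buffer_t *fb) {
      (void)priv;
      fb->data = (uint8_t *)calloc(min_size, 1); /* calloc zeroes the memory */
      if (fb->data == NULL) return -1;
      fb->size = min_size;
      fb->priv = NULL;
      return 0;
    }

    static int example_release_frame_buffer(void *priv,
                                            aom_codec_frame_buffer_t *fb) {
      (void)priv;
      free(fb->data);
      fb->data = NULL;
      fb->size = 0;
      return 0;
    }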
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOM_FRAME_BUFFER_H_
diff --git a/third_party/aom/aom/aom_image.h b/third_party/aom/aom/aom_image.h
new file mode 100644
index 000000000..a960127f1
--- /dev/null
+++ b/third_party/aom/aom/aom_image.h
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the aom image descriptor and associated operations
+ *
+ */
+#ifndef AOM_AOM_AOM_IMAGE_H_
+#define AOM_AOM_AOM_IMAGE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_integer.h"
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_IMAGE_ABI_VERSION (4) /**<\hideinitializer*/
+
+#define AOM_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */
+#define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */
+#define AOM_IMG_FMT_HAS_ALPHA 0x400 /**< Image has an alpha channel. */
+#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. */
+
+/*!\brief List of supported image formats */
+typedef enum aom_img_fmt {
+ AOM_IMG_FMT_NONE,
+ AOM_IMG_FMT_YV12 =
+ AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
+ AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2,
+ AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP |
+ 3, /**< planar 4:2:0 format with aom color space */
+ AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4,
+ AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,
+ AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,
+ AOM_IMG_FMT_444A = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_HAS_ALPHA | 6,
+ AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
+ AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
+ AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH,
+} aom_img_fmt_t; /**< alias for enum aom_img_fmt */
+
+/*!\brief List of supported color primaries */
+typedef enum aom_color_primaries {
+ AOM_CICP_CP_RESERVED_0 = 0, /**< For future use */
+ AOM_CICP_CP_BT_709 = 1, /**< BT.709 */
+ AOM_CICP_CP_UNSPECIFIED = 2, /**< Unspecified */
+ AOM_CICP_CP_RESERVED_3 = 3, /**< For future use */
+ AOM_CICP_CP_BT_470_M = 4, /**< BT.470 System M (historical) */
+ AOM_CICP_CP_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */
+ AOM_CICP_CP_BT_601 = 6, /**< BT.601 */
+ AOM_CICP_CP_SMPTE_240 = 7, /**< SMPTE 240 */
+ AOM_CICP_CP_GENERIC_FILM =
+ 8, /**< Generic film (color filters using illuminant C) */
+ AOM_CICP_CP_BT_2020 = 9, /**< BT.2020, BT.2100 */
+ AOM_CICP_CP_XYZ = 10, /**< SMPTE 428 (CIE 1931 XYZ) */
+ AOM_CICP_CP_SMPTE_431 = 11, /**< SMPTE RP 431-2 */
+ AOM_CICP_CP_SMPTE_432 = 12, /**< SMPTE EG 432-1 */
+ AOM_CICP_CP_RESERVED_13 = 13, /**< For future use (values 13 - 21) */
+ AOM_CICP_CP_EBU_3213 = 22, /**< EBU Tech. 3213-E */
+ AOM_CICP_CP_RESERVED_23 = 23 /**< For future use (values 23 - 255) */
+} aom_color_primaries_t; /**< alias for enum aom_color_primaries */
+
+/*!\brief List of supported transfer functions */
+typedef enum aom_transfer_characteristics {
+ AOM_CICP_TC_RESERVED_0 = 0, /**< For future use */
+ AOM_CICP_TC_BT_709 = 1, /**< BT.709 */
+ AOM_CICP_TC_UNSPECIFIED = 2, /**< Unspecified */
+ AOM_CICP_TC_RESERVED_3 = 3, /**< For future use */
+ AOM_CICP_TC_BT_470_M = 4, /**< BT.470 System M (historical) */
+ AOM_CICP_TC_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */
+ AOM_CICP_TC_BT_601 = 6, /**< BT.601 */
+ AOM_CICP_TC_SMPTE_240 = 7, /**< SMPTE 240 M */
+ AOM_CICP_TC_LINEAR = 8, /**< Linear */
+ AOM_CICP_TC_LOG_100 = 9, /**< Logarithmic (100 : 1 range) */
+ AOM_CICP_TC_LOG_100_SQRT10 =
+ 10, /**< Logarithmic (100 * Sqrt(10) : 1 range) */
+ AOM_CICP_TC_IEC_61966 = 11, /**< IEC 61966-2-4 */
+ AOM_CICP_TC_BT_1361 = 12, /**< BT.1361 */
+ AOM_CICP_TC_SRGB = 13, /**< sRGB or sYCC*/
+ AOM_CICP_TC_BT_2020_10_BIT = 14, /**< BT.2020 10-bit systems */
+ AOM_CICP_TC_BT_2020_12_BIT = 15, /**< BT.2020 12-bit systems */
+ AOM_CICP_TC_SMPTE_2084 = 16, /**< SMPTE ST 2084, ITU BT.2100 PQ */
+ AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */
+ AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */
+ AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */
+} aom_transfer_characteristics_t; /**< alias for enum aom_transfer_characteristics */
+
+/*!\brief List of supported matrix coefficients */
+typedef enum aom_matrix_coefficients {
+ AOM_CICP_MC_IDENTITY = 0, /**< Identity matrix */
+ AOM_CICP_MC_BT_709 = 1, /**< BT.709 */
+ AOM_CICP_MC_UNSPECIFIED = 2, /**< Unspecified */
+ AOM_CICP_MC_RESERVED_3 = 3, /**< For future use */
+ AOM_CICP_MC_FCC = 4, /**< US FCC 73.628 */
+ AOM_CICP_MC_BT_470_B_G = 5, /**< BT.470 System B, G (historical) */
+ AOM_CICP_MC_BT_601 = 6, /**< BT.601 */
+ AOM_CICP_MC_SMPTE_240 = 7, /**< SMPTE 240 M */
+ AOM_CICP_MC_SMPTE_YCGCO = 8, /**< YCgCo */
+ AOM_CICP_MC_BT_2020_NCL =
+ 9, /**< BT.2020 non-constant luminance, BT.2100 YCbCr */
+ AOM_CICP_MC_BT_2020_CL = 10, /**< BT.2020 constant luminance */
+ AOM_CICP_MC_SMPTE_2085 = 11, /**< SMPTE ST 2085 YDzDx */
+ AOM_CICP_MC_CHROMAT_NCL =
+ 12, /**< Chromaticity-derived non-constant luminance */
+ AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */
+ AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */
+ AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */
+} aom_matrix_coefficients_t;
+
+/*!\brief List of supported color range */
+typedef enum aom_color_range {
+ AOM_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
+ AOM_CR_FULL_RANGE = 1 /**< YUV/RGB [0..255] */
+} aom_color_range_t; /**< alias for enum aom_color_range */
+
+/*!\brief List of chroma sample positions */
+typedef enum aom_chroma_sample_position {
+ AOM_CSP_UNKNOWN = 0, /**< Unknown */
+ AOM_CSP_VERTICAL = 1, /**< Horizontally co-located with luma(0, 0) sample,
+ between two vertical samples */
+ AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */
+ AOM_CSP_RESERVED = 3 /**< Reserved value */
+} aom_chroma_sample_position_t; /**< alias for enum aom_chroma_sample_position */
+
+/**\brief Image Descriptor */
+typedef struct aom_image {
+ aom_img_fmt_t fmt; /**< Image Format */
+ aom_color_primaries_t cp; /**< CICP Color Primaries */
+ aom_transfer_characteristics_t tc; /**< CICP Transfer Characteristics */
+ aom_matrix_coefficients_t mc; /**< CICP Matrix Coefficients */
+ int monochrome; /**< Whether image is monochrome */
+ aom_chroma_sample_position_t csp; /**< chroma sample position */
+ aom_color_range_t range; /**< Color Range */
+
+ /* Image storage dimensions */
+ unsigned int w; /**< Stored image width */
+ unsigned int h; /**< Stored image height */
+ unsigned int bit_depth; /**< Stored image bit-depth */
+
+ /* Image display dimensions */
+ unsigned int d_w; /**< Displayed image width */
+ unsigned int d_h; /**< Displayed image height */
+
+ /* Image intended rendering dimensions */
+ unsigned int r_w; /**< Intended rendering image width */
+ unsigned int r_h; /**< Intended rendering image height */
+
+ /* Chroma subsampling info */
+ unsigned int x_chroma_shift; /**< subsampling order, X */
+ unsigned int y_chroma_shift; /**< subsampling order, Y */
+
+/* Image data pointers. */
+#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */
+#define AOM_PLANE_Y 0 /**< Y (Luminance) plane */
+#define AOM_PLANE_U 1 /**< U (Chroma) plane */
+#define AOM_PLANE_V 2 /**< V (Chroma) plane */
+#define AOM_PLANE_ALPHA 3 /**< A (Transparency) plane */
+ unsigned char *planes[4]; /**< pointer to the top left pixel for each plane */
+ int stride[4]; /**< stride between rows for each plane */
+ size_t sz; /**< data size */
+
+ int bps; /**< bits per sample (for packed formats) */
+
+ int temporal_id; /**< Temporal layer Id of image */
+ int spatial_id; /**< Spatial layer Id of image */
+
+ /*!\brief The following member may be set by the application to associate
+ * data with this image.
+ */
+ void *user_priv;
+
+ /* The following members should be treated as private. */
+ unsigned char *img_data; /**< private */
+ int img_data_owner; /**< private */
+ int self_allocd; /**< private */
+
+ void *fb_priv; /**< Frame buffer data associated with the image. */
+} aom_image_t; /**< alias for struct aom_image */
+
+/**\brief Representation of a rectangle on a surface */
+typedef struct aom_image_rect {
+ unsigned int x; /**< leftmost column */
+ unsigned int y; /**< topmost row */
+ unsigned int w; /**< width */
+ unsigned int h; /**< height */
+} aom_image_rect_t; /**< alias for struct aom_image_rect */
+
+/*!\brief Open a descriptor, allocating storage for the underlying image
+ *
+ * Returns a descriptor for storing an image of the given format. The
+ * storage for the descriptor is allocated on the heap.
+ *
+ * \param[in] img Pointer to storage for descriptor. If this parameter
+ * is NULL, the storage for the descriptor will be
+ * allocated on the heap.
+ * \param[in] fmt Format for the image
+ * \param[in] d_w Width of the image
+ * \param[in] d_h Height of the image
+ * \param[in] align Alignment, in bytes, of the image buffer and
+ * each row in the image(stride).
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ * parameter is non-null, the value of the img parameter will be
+ * returned.
+ */
+aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align);
+
+/*!\brief Open a descriptor, using existing storage for the underlying image
+ *
+ * Returns a descriptor for storing an image of the given format. The
+ * storage for descriptor has been allocated elsewhere, and a descriptor is
+ * desired to "wrap" that storage.
+ *
+ * \param[in] img Pointer to storage for descriptor. If this parameter
+ * is NULL, the storage for the descriptor will be
+ * allocated on the heap.
+ * \param[in] fmt Format for the image
+ * \param[in] d_w Width of the image
+ * \param[in] d_h Height of the image
+ * \param[in] align Alignment, in bytes, of each row in the image.
+ * \param[in] img_data Storage to use for the image
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ * parameter is non-null, the value of the img parameter will be
+ * returned.
+ */
+aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
+ unsigned int d_h, unsigned int align,
+ unsigned char *img_data);
+
+/*!\brief Open a descriptor, allocating storage for the underlying image with a
+ * border
+ *
+ * Returns a descriptor for storing an image of the given format and its
+ * borders. The storage for the descriptor is allocated on the heap.
+ *
+ * \param[in] img Pointer to storage for descriptor. If this parameter
+ * is NULL, the storage for the descriptor will be
+ * allocated on the heap.
+ * \param[in] fmt Format for the image
+ * \param[in] d_w Width of the image
+ * \param[in] d_h Height of the image
+ * \param[in] align Alignment, in bytes, of the image buffer and
+ * each row in the image(stride).
+ * \param[in] size_align Alignment, in bytes, of the image width and height.
+ * \param[in] border A border that is padded on four sides of the image.
+ *
+ * \return Returns a pointer to the initialized image descriptor. If the img
+ * parameter is non-null, the value of the img parameter will be
+ * returned.
+ */
+aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align,
+ unsigned int size_align,
+ unsigned int border);
+
+/*!\brief Set the rectangle identifying the displayed portion of the image
+ *
+ * Updates the displayed rectangle (aka viewport) on the image surface to
+ * match the specified coordinates and size.
+ *
+ * \param[in] img Image descriptor
+ * \param[in] x leftmost column
+ * \param[in] y topmost row
+ * \param[in] w width
+ * \param[in] h height
+ * \param[in] border A border that is padded on four sides of the image.
+ *
+ * \return 0 if the requested rectangle is valid, nonzero otherwise.
+ */
+int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
+ unsigned int w, unsigned int h, unsigned int border);
+
+/*!\brief Flip the image vertically (top for bottom)
+ *
+ * Adjusts the image descriptor's pointers and strides to make the image
+ * be referenced upside-down.
+ *
+ * \param[in] img Image descriptor
+ */
+void aom_img_flip(aom_image_t *img);
+
+/*!\brief Close an image descriptor
+ *
+ * Frees all allocated storage associated with an image descriptor.
+ *
+ * \param[in] img Image descriptor
+ */
+void aom_img_free(aom_image_t *img);
+
+/*!\brief Get the width of a plane
+ *
+ * Get the width of a plane of an image
+ *
+ * \param[in] img Image descriptor
+ * \param[in] plane Plane index
+ */
+int aom_img_plane_width(const aom_image_t *img, int plane);
+
+/*!\brief Get the height of a plane
+ *
+ * Get the height of a plane of an image
+ *
+ * \param[in] img Image descriptor
+ * \param[in] plane Plane index
+ */
+int aom_img_plane_height(const aom_image_t *img, int plane);
+
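A sketch (illustrative only) tying together the descriptor fields (planes[], stride[]) and the helpers declared above: allocate an 8-bit 4:2:0 image, clear each plane row by row, and leave release to aom_img_free():

    #include <string.h>
    #include "aom/aom_image.h"

    static aom_image_t *example_make_blank_image(unsigned int w, unsigned int h) {
      aom_image_t *img = aom_img_alloc(NULL, AOM_IMG_FMT_I420, w, h, 32);
      if (img == NULL) return NULL;
      for (int plane = AOM_PLANE_Y; plane <= AOM_PLANE_V; ++plane) {
        const int plane_w = aom_img_plane_width(img, plane);
        const int plane_h = aom_img_plane_height(img, plane);
        unsigned char *row = img->planes[plane];
        for (int y = 0; y < plane_h; ++y) {
          /* Y = 0 (black), U = V = 128 (neutral chroma). */
          memset(row, plane == AOM_PLANE_Y ? 0 : 128, (size_t)plane_w);
          row += img->stride[plane];
        }
      }
      return img; /* the caller releases it with aom_img_free() */
    }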
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOM_IMAGE_H_
diff --git a/third_party/aom/aom/aom_integer.h b/third_party/aom/aom/aom_integer.h
new file mode 100644
index 000000000..90263bd4f
--- /dev/null
+++ b/third_party/aom/aom/aom_integer.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_INTEGER_H_
+#define AOM_AOM_AOM_INTEGER_H_
+
+/* get ptrdiff_t, size_t, wchar_t, NULL */
+#include <stddef.h>
+
+#if defined(_MSC_VER)
+#define AOM_FORCE_INLINE __forceinline
+#define AOM_INLINE __inline
+#else
+#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline))
+// TODO(jbb): Allow a way to force inline off for older compilers.
+#define AOM_INLINE inline
+#endif
+
+#if defined(AOM_EMULATE_INTTYPES)
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef signed int int32_t;
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+#ifndef _UINTPTR_T_DEFINED
+typedef size_t uintptr_t;
+#endif
+
+#else
+
+/* Most platforms have the C99 standard integer types. */
+
+#if defined(__cplusplus)
+#if !defined(__STDC_FORMAT_MACROS)
+#define __STDC_FORMAT_MACROS
+#endif
+#if !defined(__STDC_LIMIT_MACROS)
+#define __STDC_LIMIT_MACROS
+#endif
+#endif // __cplusplus
+
+#include <stdint.h>
+
+#endif
+
+/* VS2010 defines stdint.h, but not inttypes.h */
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define PRId64 "I64d"
+#else
+#include <inttypes.h>
+#endif
+
+#if !defined(INT8_MAX)
+#define INT8_MAX 127
+#endif
+
+#if !defined(INT32_MAX)
+#define INT32_MAX 2147483647
+#endif
+
+#if !defined(INT32_MIN)
+#define INT32_MIN (-2147483647 - 1)
+#endif
+
+#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0]))
+
+#if defined(__cplusplus)
+extern "C" {
+#endif // __cplusplus
+
+// Returns size of uint64_t when encoded using LEB128.
+size_t aom_uleb_size_in_bytes(uint64_t value);
+
+// Returns 0 on success, -1 on decode failure.
+// On success, 'value' stores the decoded LEB128 value and 'length' stores
+// the number of bytes decoded.
+int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value,
+ size_t *length);
+
+// Encodes LEB128 integer. Returns 0 when successful, and -1 upon failure.
+int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value,
+ size_t *coded_size);
+
+// Encodes LEB128 integer to size specified. Returns 0 when successful, and -1
+// upon failure.
+// Note: This will write exactly pad_to_size bytes; if the value cannot be
+// encoded in this many bytes, then this will fail.
+int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
+ size_t pad_to_size, uint8_t *coded_value,
+ size_t *coded_size);
+
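A round-trip sketch for the LEB128 helpers declared above:

    #include <assert.h>
    #include "aom/aom_integer.h"

    static void example_leb128_roundtrip(uint64_t value) {
      uint8_t buffer[16];
      size_t coded_size = 0;
      uint64_t decoded = 0;
      size_t length = 0;

      assert(aom_uleb_size_in_bytes(value) <= sizeof(buffer));
      if (aom_uleb_encode(value, sizeof(buffer), buffer, &coded_size) != 0) return;
      if (aom_uleb_decode(buffer, coded_size, &decoded, &length) == 0) {
        assert(decoded == value);     /* the same value comes back ... */
        assert(length == coded_size); /* ... from the same number of bytes */
      }
    }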
+#if defined(__cplusplus)
+} // extern "C"
+#endif // __cplusplus
+
+#endif // AOM_AOM_AOM_INTEGER_H_
diff --git a/third_party/aom/aom/aomcx.h b/third_party/aom/aom/aomcx.h
new file mode 100644
index 000000000..013ddf57e
--- /dev/null
+++ b/third_party/aom/aom/aomcx.h
@@ -0,0 +1,1198 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOMCX_H_
+#define AOM_AOM_AOMCX_H_
+
+/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
+ * \ingroup aom
+ *
+ * @{
+ */
+#include "aom/aom.h"
+#include "aom/aom_encoder.h"
+
+/*!\file
+ * \brief Provides definitions for using AOM or AV1 encoder algorithm within the
+ * aom Codec Interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\name Algorithm interface for AV1
+ *
+ * This interface provides the capability to encode raw AV1 streams.
+ * @{
+ */
+extern aom_codec_iface_t aom_codec_av1_cx_algo;
+extern aom_codec_iface_t *aom_codec_av1_cx(void);
+/*!@} - end algorithm interface member group*/
+
+/*
+ * Algorithm Flags
+ */
+
+/*!\brief Don't reference the last frame
+ *
+ * When this flag is set, the encoder will not use the last frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST (1 << 16)
+/*!\brief Don't reference the last2 frame
+ *
+ * When this flag is set, the encoder will not use the last2 frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last2 frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST2 (1 << 17)
+/*!\brief Don't reference the last3 frame
+ *
+ * When this flag is set, the encoder will not use the last3 frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * last3 frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_LAST3 (1 << 18)
+/*!\brief Don't reference the golden frame
+ *
+ * When this flag is set, the encoder will not use the golden frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * golden frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_GF (1 << 19)
+
+/*!\brief Don't reference the alternate reference frame
+ *
+ * When this flag is set, the encoder will not use the alt ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * alt ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_ARF (1 << 20)
+/*!\brief Don't reference the bwd reference frame
+ *
+ * When this flag is set, the encoder will not use the bwd ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * bwd ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_BWD (1 << 21)
+/*!\brief Don't reference the alt2 reference frame
+ *
+ * When this flag is set, the encoder will not use the alt2 ref frame as a
+ * predictor. When not set, the encoder will choose whether to use the
+ * alt2 ref frame or not automatically.
+ */
+#define AOM_EFLAG_NO_REF_ARF2 (1 << 22)
+
+/*!\brief Don't update the last frame
+ *
+ * When this flag is set, the encoder will not update the last frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_LAST (1 << 23)
+
+/*!\brief Don't update the golden frame
+ *
+ * When this flag is set, the encoder will not update the golden frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_GF (1 << 24)
+
+/*!\brief Don't update the alternate reference frame
+ *
+ * When this flag is set, the encoder will not update the alt ref frame with
+ * the contents of the current frame.
+ */
+#define AOM_EFLAG_NO_UPD_ARF (1 << 25)
+/*!\brief Disable entropy update
+ *
+ * When this flag is set, the encoder will not update its internal entropy
+ * model based on the entropy of this frame.
+ */
+#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 26)
+/*!\brief Disable ref frame mvs
+ *
+ * When this flag is set, the encoder will not allow frames to
+ * be encoded using mfmv.
+ */
+#define AOM_EFLAG_NO_REF_FRAME_MVS (1 << 27)
+/*!\brief Enable error resilient frame
+ *
+ * When this flag is set, the encoder will code frames as error
+ * resilient.
+ */
+#define AOM_EFLAG_ERROR_RESILIENT (1 << 28)
+/*!\brief Enable s frame mode
+ *
+ * When this flag is set, the encoder will code frames as an
+ * s frame.
+ */
+#define AOM_EFLAG_SET_S_FRAME (1 << 29)
+/*!\brief Force primary_ref_frame to PRIMARY_REF_NONE
+ *
+ * When this flag is set, the encoder will set a frame's primary_ref_frame
+ * to PRIMARY_REF_NONE
+ */
+#define AOM_EFLAG_SET_PRIMARY_REF_NONE (1 << 30)
+
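A sketch of how the per-frame AOM_EFLAG_* values above are combined and passed through the flags argument of aom_codec_encode() (declared in aom_encoder.h earlier in this diff); aom_enc_frame_flags_t is assumed to be the flag bitfield type from that header:

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h"

    /* Encode one frame without predicting from the golden or alt-ref buffers. */
    static aom_codec_err_t example_encode_no_gf_arf(aom_codec_ctx_t *ctx,
                                                    const aom_image_t *img,
                                                    aom_codec_pts_t pts) {
      const aom_enc_frame_flags_t flags =
          AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF;
      return aom_codec_encode(ctx, img, pts, 1, flags);
    }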
+/*!\brief AVx encoder control functions
+ *
+ * This set of macros define the control functions available for AVx
+ * encoder interface.
+ *
+ * \sa #aom_codec_control
+ */
+enum aome_enc_control_id {
+ /*!\brief Codec control function to set which reference frame encoder can use.
+ */
+ AOME_USE_REFERENCE = 7,
+
+ /*!\brief Codec control function to pass an ROI map to encoder.
+ */
+ AOME_SET_ROI_MAP = 8,
+
+ /*!\brief Codec control function to pass an Active map to encoder.
+ */
+ AOME_SET_ACTIVEMAP,
+
+ /*!\brief Codec control function to set encoder scaling mode.
+ */
+ AOME_SET_SCALEMODE = 11,
+
+ /*!\brief Codec control function to set encoder spatial layer id.
+ */
+ AOME_SET_SPATIAL_LAYER_ID = 12,
+
+ /*!\brief Codec control function to set encoder internal speed settings.
+ *
+ * Changes in this value influence, among other things, the encoder's selection
+ * of motion estimation methods. Values greater than 0 will increase encoder
+ * speed at the expense of quality.
+ *
+ * \note Valid range: 0..8
+ */
+ AOME_SET_CPUUSED = 13,
+
+ /*!\brief Codec control function to enable automatic set and use of alt-ref frames.
+ */
+ AOME_SET_ENABLEAUTOALTREF,
+
+ /*!\brief Codec control function to set sharpness.
+ */
+ AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2,
+
+ /*!\brief Codec control function to set the threshold for MBs treated as static.
+ */
+ AOME_SET_STATIC_THRESHOLD,
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder.
+ *
+ * Return value uses internal quantizer scale defined by the codec.
+ */
+ AOME_GET_LAST_QUANTIZER = AOME_SET_STATIC_THRESHOLD + 2,
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder.
+ *
+ * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+ * parameters.
+ */
+ AOME_GET_LAST_QUANTIZER_64,
+
+ /*!\brief Codec control function to set the maximum number of frames used to create an arf.
+ */
+ AOME_SET_ARNR_MAXFRAMES,
+
+ /*!\brief Codec control function to set the filter strength for the arf.
+ */
+ AOME_SET_ARNR_STRENGTH,
+
+ /*!\brief Codec control function to set visual tuning.
+ */
+ AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2,
+
+ /*!\brief Codec control function to set constrained quality level.
+ *
+ * \attention For this value to be used aom_codec_enc_cfg_t::g_usage must be
+ * set to #AOM_CQ.
+ * \note Valid range: 0..63
+ */
+ AOME_SET_CQ_LEVEL,
+
+ /*!\brief Codec control function to set Max data rate for Intra frames.
+ *
+ * This value controls additional clamping on the maximum size of a
+ * keyframe. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * unlimited, or no additional clamping beyond the codec's built-in
+ * algorithm.
+ *
+ * For example, to allocate no more than 4.5 frames worth of bitrate
+ * to a keyframe, set this to 450.
+ */
+ AOME_SET_MAX_INTRA_BITRATE_PCT,
+
+ /*!\brief Codec control function to set number of spatial layers.
+ */
+ AOME_SET_NUMBER_SPATIAL_LAYERS,
+
+ /*!\brief Codec control function to set max data rate for Inter frames.
+ *
+ * This value controls additional clamping on the maximum size of an
+ * inter frame. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * unlimited, or no additional clamping beyond the codec's built-in
+ * algorithm.
+ *
+ * For example, to allow no more than 4.5 frames worth of bitrate
+ * to an inter frame, set this to 450.
+ */
+ AV1E_SET_MAX_INTER_BITRATE_PCT = AOME_SET_MAX_INTRA_BITRATE_PCT + 2,
+
+ /*!\brief Boost percentage for Golden Frame in CBR mode.
+ *
+ * This value controls the amount of boost given to Golden Frame in
+ * CBR mode. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * the feature is off, i.e., no golden frame boost in CBR mode and
+ * average bitrate target is used.
+ *
+ * For example, to allow 100% more bits, i.e, 2X, in a golden frame
+ * than average frame, set this to 100.
+ */
+ AV1E_SET_GF_CBR_BOOST_PCT,
+
+ /*!\brief Codec control function to set lossless encoding mode.
+ *
+ * AV1 can operate in lossless encoding mode, in which the bitstream
+ * produced will be able to decode and reconstruct a perfect copy of
+ * input source. This control function provides a means to switch the encoder
+ * into lossless coding mode (1) or normal coding mode (0), which may be lossy.
+ * 0 = lossy coding mode
+ * 1 = lossless coding mode
+ *
+ * By default, the encoder operates in normal coding mode (which may be lossy).
+ */
+ AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2,
+
+ /*!\brief Codec control function to enable row-based multi-threading of the
+ * encoder. A value equal to 1 indicates that row-based multi-threading is
+ * enabled.
+ */
+ AV1E_SET_ROW_MT,
+
+ /*!\brief Codec control function to set number of tile columns.
+ *
+ * In encoding and decoding, AV1 allows an input image frame to be partitioned
+ * into separate vertical tile columns, which can be encoded or decoded
+ * independently. This enables easy implementation of parallel encoding and
+ * decoding. The parameter for this control describes the number of tile
+ * columns (in log2 units), which has a valid range of [0, 6]:
+ * 0 = 1 tile column
+ * 1 = 2 tile columns
+ * 2 = 4 tile columns
+ * .....
+ * n = 2**n tile columns
+ * The requested tile columns will be capped by the encoder based on image
+ * size limitations (the minimum width of a tile column is 256 pixels, the
+ * maximum is 4096).
+ *
+ * By default, the value is 0, i.e. a single tile column for the entire image.
+ */
+ AV1E_SET_TILE_COLUMNS,
+
+ /*!\brief Codec control function to set number of tile rows.
+ *
+ * In encoding and decoding, AV1 allows an input image frame to be partitioned
+ * into separate horizontal tile rows, which can be encoded or decoded
+ * independently. The parameter for this control describes the number of tile
+ * rows (in log2 units), which has a valid range of [0, 6]:
+ * 0 = 1 tile row
+ * 1 = 2 tile rows
+ * 2 = 4 tile rows
+ * .....
+ * n = 2**n tile rows
+ *
+ * By default, the value is 0, i.e. a single tile row for the entire image.
+ */
+ AV1E_SET_TILE_ROWS,
+
+ /*!\brief Codec control function to enable frame parallel decoding feature.
+ *
+ * AV1 has a bitstream feature to reduce decoding dependency between frames
+ * by turning off backward update of probability context used in encoding
+ * and decoding. This allows staged parallel processing of more than one
+ * video frames in the decoder. This control function provides a means to
+ * turn this feature on or off for bitstreams produced by the encoder.
+ *
+ * By default, this feature is off.
+ */
+ AV1E_SET_FRAME_PARALLEL_DECODING,
+
+ /*!\brief Codec control function to enable error_resilient_mode
+ *
+ * AV1 has a bitstream feature to guarantee parseability of a frame
+ * by turning on the error_resilient_decoding mode, even though the
+ * reference buffers are unreliable or not received.
+ *
+ * By default, this feature is off.
+ */
+ AV1E_SET_ERROR_RESILIENT_MODE,
+
+ /*!\brief Codec control function to enable s_frame_mode
+ *
+ * AV1 has a bitstream feature to designate certain frames as S-frames,
+ * from where we can switch to a different stream,
+ * even though the reference buffers may not be exactly identical.
+ *
+ * By default, this feature is off.
+ */
+ AV1E_SET_S_FRAME_MODE,
+
+ /*!\brief Codec control function to set adaptive quantization mode.
+ *
+ * AV1 has a segment-based feature that allows the encoder to adaptively
+ * change the quantization parameter for each segment within a frame to
+ * improve the subjective quality. This control makes the encoder operate in
+ * one of several supported AQ modes.
+ *
+ * By default, the encoder operates with AQ_Mode 0 (adaptive quantization
+ * off).
+ */
+ AV1E_SET_AQ_MODE,
+
+ /*!\brief Codec control function to enable/disable periodic Q boost.
+ *
+ * One AV1 encoder speed feature is to enable quality boost by lowering
+ * frame level Q periodically. This control function provides a means to
+ * turn on/off this feature.
+ * 0 = off
+ * 1 = on
+ *
+ * By default, the encoder is allowed to use this feature for appropriate
+ * encoding modes.
+ */
+ AV1E_SET_FRAME_PERIODIC_BOOST,
+
+ /*!\brief Codec control function to set noise sensitivity.
+ *
+ * 0: off, 1: On(YOnly)
+ */
+ AV1E_SET_NOISE_SENSITIVITY,
+
+ /*!\brief Codec control function to set content type.
+ * \note Valid parameter range:
+ * AOM_CONTENT_DEFAULT = Regular video content (Default)
+ * AOM_CONTENT_SCREEN = Screen capture content
+ */
+ AV1E_SET_TUNE_CONTENT,
+
+ /*!\brief Codec control function to set CDF update mode.
+ *
+ * 0: no update 1: update on every frame
+ * 2: selectively update
+ */
+ AV1E_SET_CDF_UPDATE_MODE,
+
+ /*!\brief Codec control function to set color space info.
+ * \note Valid ranges: 0..23, default is "Unspecified".
+ * 0 = For future use
+ * 1 = BT.709
+ * 2 = Unspecified
+ * 3 = For future use
+ * 4 = BT.470 System M (historical)
+ * 5 = BT.470 System B, G (historical)
+ * 6 = BT.601
+ * 7 = SMPTE 240
+ * 8 = Generic film (color filters using illuminant C)
+ * 9 = BT.2020, BT.2100
+ * 10 = SMPTE 428 (CIE 1931 XYZ)
+ * 11 = SMPTE RP 431-2
+ * 12 = SMPTE EG 432-1
+ * 13 = For future use (values 13 - 21)
+ * 22 = EBU Tech. 3213-E
+ * 23 = For future use
+ *
+ */
+ AV1E_SET_COLOR_PRIMARIES,
+
+ /*!\brief Codec control function to set transfer function info.
+ * \note Valid ranges: 0..19, default is "Unspecified".
+ * 0 = For future use
+ * 1 = BT.709
+ * 2 = Unspecified
+ * 3 = For future use
+ * 4 = BT.470 System M (historical)
+ * 5 = BT.470 System B, G (historical)
+ * 6 = BT.601
+ * 7 = SMPTE 240 M
+ * 8 = Linear
+ * 9 = Logarithmic (100 : 1 range)
+ * 10 = Logarithmic (100 * Sqrt(10) : 1 range)
+ * 11 = IEC 61966-2-4
+ * 12 = BT.1361
+ * 13 = sRGB or sYCC
+ * 14 = BT.2020 10-bit systems
+ * 15 = BT.2020 12-bit systems
+ * 16 = SMPTE ST 2084, ITU BT.2100 PQ
+ * 17 = SMPTE ST 428
+ * 18 = BT.2100 HLG, ARIB STD-B67
+ * 19 = For future use
+ *
+ */
+ AV1E_SET_TRANSFER_CHARACTERISTICS,
+
+ /*!\brief Codec control function to set matrix coefficients info.
+ * \note Valid ranges: 0..15, default is "Unspecified".
+ * 0 = Identity matrix
+ * 1 = BT.709
+ * 2 = Unspecified
+ * 3 = For future use
+ * 4 = US FCC 73.628
+ * 5 = BT.470 System B, G (historical)
+ * 6 = BT.601
+ * 7 = SMPTE 240 M
+ * 8 = YCgCo
+ * 9 = BT.2020 non-constant luminance, BT.2100 YCbCr
+ * 10 = BT.2020 constant luminance
+ * 11 = SMPTE ST 2085 YDzDx
+ * 12 = Chromaticity-derived non-constant luminance
+ * 13 = Chromaticity-derived constant luminance
+ * 14 = BT.2100 ICtCp
+ * 15 = For future use
+ *
+ */
+ AV1E_SET_MATRIX_COEFFICIENTS,
+
+ /*!\brief Codec control function to set chroma 4:2:0 sample position info.
+ * \note Valid ranges: 0..3, default is "UNKNOWN".
+ * 0 = UNKNOWN,
+ * 1 = VERTICAL
+ * 2 = COLOCATED
+ * 3 = RESERVED
+ */
+ AV1E_SET_CHROMA_SAMPLE_POSITION,
+
+ /*!\brief Codec control function to set minimum interval between GF/ARF frames
+ *
+ * By default the value is set as 4.
+ */
+ AV1E_SET_MIN_GF_INTERVAL,
+
+ /*!\brief Codec control function to set maximum interval between GF/ARF frames
+ *
+ * By default the value is set as 16.
+ */
+ AV1E_SET_MAX_GF_INTERVAL,
+
+ /*!\brief Codec control function to get an Active map back from the encoder.
+ */
+ AV1E_GET_ACTIVEMAP,
+
+ /*!\brief Codec control function to set color range bit.
+ * \note Valid ranges: 0..1, default is 0
+ * 0 = Limited range (16..235 or HBD equivalent)
+ * 1 = Full range (0..255 or HBD equivalent)
+ */
+ AV1E_SET_COLOR_RANGE,
+
+ /*!\brief Codec control function to set intended rendering image size.
+ *
+ * By default, this is identical to the image size in pixels.
+ */
+ AV1E_SET_RENDER_SIZE,
+
+ /*!\brief Codec control function to set target level.
+ *
+ * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
+ * 11: target for level 1.1; ... 62: target for level 6.2
+ */
+ AV1E_SET_TARGET_LEVEL,
+
+ /*!\brief Codec control function to get bitstream level.
+ */
+ AV1E_GET_LEVEL,
+
+ /*!\brief Codec control function to set intended superblock size.
+ *
+ * By default, the superblock size is determined separately for each
+ * frame by the encoder.
+ *
+ * Experiment: EXT_PARTITION
+ */
+ AV1E_SET_SUPERBLOCK_SIZE,
+
+ /*!\brief Codec control function to enable automatic setting and use of
+ * bwd-pred frames.
+ *
+ */
+ AOME_SET_ENABLEAUTOBWDREF,
+
+ /*!\brief Codec control function to encode with CDEF.
+ *
+ * CDEF is the constrained directional enhancement filter, an in-loop
+ * filter aimed at removing coding artifacts.
+ * 0 = do not apply CDEF
+ * 1 = apply CDEF
+ *
+ * By default, the encoder applies CDEF.
+ *
+ * Experiment: AOM_CDEF
+ */
+ AV1E_SET_ENABLE_CDEF,
+
+ /*!\brief Codec control function to encode with Loop Restoration Filter.
+ *
+ * 0 = do not apply Restoration Filter
+ * 1 = apply Restoration Filter
+ *
+ * By default, the encoder applies Restoration Filter.
+ *
+ */
+ AV1E_SET_ENABLE_RESTORATION,
+
+ /*!\brief Codec control function to encode without trellis quantization.
+ *
+ * 0 = apply trellis quantization
+ * 1 = do not apply trellis quantization
+ *
+ * By default, the encoder applies trellis optimization on quantized
+ * coefficients.
+ *
+ */
+ AV1E_SET_DISABLE_TRELLIS_QUANT,
+
+ /*!\brief Codec control function to encode with quantisation matrices.
+ *
+ * AOM can operate with default quantisation matrices dependent on
+ * quantisation level and block type.
+ * 0 = do not use quantisation matrices
+ * 1 = use quantisation matrices
+ *
+ * By default, the encoder operates without quantisation matrices.
+ *
+ * Experiment: AOM_QM
+ */
+
+ AV1E_SET_ENABLE_QM,
+
+ /*!\brief Codec control function to set the min quant matrix flatness.
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the minimum level of flatness from which the matrices
+ * are determined.
+ *
+ * By default, the encoder sets this minimum at half the available
+ * range.
+ *
+ * Experiment: AOM_QM
+ */
+ AV1E_SET_QM_MIN,
+
+ /*!\brief Codec control function to set the max quant matrix flatness.
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the maximum level of flatness possible.
+ *
+ * By default, the encoder sets this maximum at the top of the
+ * available range.
+ *
+ * Experiment: AOM_QM
+ */
+ AV1E_SET_QM_MAX,
+
+ /*!\brief Codec control function to set the quant matrix flatness for luma (Y).
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the flatness for luma (Y).
+ *
+ * By default, the encoder sets this minimum at half the available
+ * range.
+ *
+ * Experiment: AOM_QM
+ */
+ AV1E_SET_QM_Y,
+
+ /*!\brief Codec control function to set the quant matrix flatness for chroma (U).
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the flatness for chroma (U).
+ *
+ * By default, the encoder sets this minimum at half the available
+ * range.
+ *
+ * Experiment: AOM_QM
+ */
+ AV1E_SET_QM_U,
+
+ /*!\brief Codec control function to set the quant matrix flatness for chroma (V).
+ *
+ * AOM can operate with different ranges of quantisation matrices.
+ * As quantisation levels increase, the matrices get flatter. This
+ * control sets the flatness for chroma (V).
+ *
+ * By default, the encoder sets this minimum at half the available
+ * range.
+ *
+ * Experiment: AOM_QM
+ */
+ AV1E_SET_QM_V,
+
+ /*!\brief Codec control function to encode with dist_8x8.
+ *
+ * The dist_8x8 is enabled automatically for model tuning parameters that
+ * require measuring distortion at the 8x8 level. This control also allows
+ * measuring distortion at the 8x8 level for other tuning options
+ * (e.g., PSNR), for testing purposes.
+ * 0 = do not use dist_8x8
+ * 1 = use dist_8x8
+ *
+ * By default, the encoder does not use dist_8x8.
+ *
+ * Experiment: DIST_8X8
+ */
+ AV1E_SET_ENABLE_DIST_8X8,
+
+ /*!\brief Codec control function to set a maximum number of tile groups.
+ *
+ * This will set the maximum number of tile groups. This will be
+ * overridden if an MTU size is set. The default value is 1.
+ *
+ * Experiment: TILE_GROUPS
+ */
+ AV1E_SET_NUM_TG,
+
+ /*!\brief Codec control function to set an MTU size for a tile group.
+ *
+ * This will set the maximum number of bytes in a tile group. This can be
+ * exceeded only if a single tile is larger than this amount.
+ *
+ * By default, the value is 0, in which case a fixed number of tile groups
+ * is used.
+ *
+ * Experiment: TILE_GROUPS
+ */
+ AV1E_SET_MTU,
+
+ /*!\brief Codec control function to set dependent_horz_tiles.
+ *
+ * In encoding and decoding, AV1 allows enabling dependent horizontal tiles.
+ * The parameter for this control describes the value of this flag,
+ * which has a valid range [0, 1]:
+ * 0 = disable dependent horizontal tiles
+ * 1 = enable dependent horizontal tiles
+ *
+ * By default, the value is 0, i.e. dependent horizontal tiles are disabled.
+ */
+ AV1E_SET_TILE_DEPENDENT_ROWS,
+
+ /*!\brief Codec control function to set the number of symbols in an ANS data
+ * window.
+ *
+ * The number of ANS symbols (both boolean and non-boolean alphabets) in an
+ * ANS data window is set to 1 << value.
+ *
+ * \note Valid range: [8, 23]
+ *
+ * Experiment: ANS
+ */
+ AV1E_SET_ANS_WINDOW_SIZE_LOG2,
+
+ /*!\brief Codec control function to turn the dual filter on / off.
+ *
+ * This will enable or disable the dual filter. The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_DF,
+
+ /*!\brief Codec control function to turn on / off frame order hint for a
+ * few tools:
+ *
+ * joint compound mode
+ * motion field motion vector
+ * ref frame sign bias
+ *
+ * The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_ORDER_HINT,
+
+ /*!\brief Codec control function to turn on / off joint compound mode
+ * at sequence level.
+ *
+ * This will enable or disable joint compound mode. The default value is 1.
+ * If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced to 0.
+ *
+ */
+ AV1E_SET_ENABLE_JNT_COMP,
+
+ /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage
+ * at sequence level.
+ *
+ * This will enable or disable usage of MFMV. The default value is 1.
+ * If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced to 0.
+ *
+ */
+ AV1E_SET_ENABLE_REF_FRAME_MVS,
+
+ /*!\brief Codec control function to set temporal mv prediction
+ * enabling/disabling at frame level.
+ *
+ * This will enable or disable temporal mv prediction. The default value is 1.
+ * If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is forced to 0.
+ *
+ */
+ AV1E_SET_ALLOW_REF_FRAME_MVS,
+
+ /*!\brief Codec control function to turn on / off warped motion usage
+ * at sequence level.
+ *
+ * This will enable or disable usage of warped motion. The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_WARPED_MOTION,
+
+ /*!\brief Codec control function to turn on / off warped motion usage
+ * at frame level.
+ *
+ * This will enable or disable usage of warped motion. The default value is 1.
+ * If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is forced to 0.
+ *
+ */
+ AV1E_SET_ALLOW_WARPED_MOTION,
+
+ /*!\brief Codec control function to turn on / off frame superresolution.
+ *
+ * This will enable or disable frame superresolution. The default value is 1.
+ * If AV1E_SET_ENABLE_SUPERRES is 0, then this flag is forced to 0.
+ */
+ AV1E_SET_ENABLE_SUPERRES,
+
+ /*!\brief Codec control function to set loop_filter_across_tiles_v_enabled
+ * and loop_filter_across_tiles_h_enabled.
+ * In encoding and decoding, AV1 allows disabling the loop filter across tile
+ * boundaries. The parameter for this control describes the value of this flag,
+ * which has a valid range [0, 1]:
+ * 0 = disable loop filter across tile boundary
+ * 1 = enable loop filter across tile boundary
+ *
+ * By default, the value is 1, i.e. enable loop filter across tile boundary.
+ *
+ * Experiment: LOOPFILTERING_ACROSS_TILES_EXT
+ */
+ AV1E_SET_TILE_LOOPFILTER_V,
+ AV1E_SET_TILE_LOOPFILTER_H,
+
+ /*!\brief Codec control function to set loop_filter_across_tiles_enabled.
+ *
+ * In encoding and decoding, AV1 allows disabling the loop filter across tile
+ * boundaries. The parameter for this control describes the value of this flag,
+ * which has a valid range [0, 1]:
+ * 0 = disable loop filter across tile boundary
+ * 1 = enable loop filter across tile boundary
+ *
+ * By default, the value is 1, i.e. enable loop filter across tile boundary.
+ *
+ * Experiment: LOOPFILTERING_ACROSS_TILES
+ */
+ AV1E_SET_TILE_LOOPFILTER,
+
+ /*!\brief Codec control function to set the delta q mode
+ *
+ * AV1 has a segment-based feature that allows the encoder to adaptively change
+ * the quantization parameter for each segment within a frame to improve the
+ * subjective quality. The delta q mode is added on top of the segment-based
+ * feature, and allows control of per-64x64 q and lf deltas. This control makes
+ * the encoder operate in one of the several supported DELTA_Q modes.
+ *
+ * By default, the encoder operates with delta q mode 0 (deltaq signaling off).
+ */
+ AV1E_SET_DELTAQ_MODE,
+
+ /*!\brief Codec control function to set the single tile decoding mode to 0 or
+ * 1.
+ *
+ * 0 means that the single tile decoding is off, and 1 means that the single
+ * tile decoding is on.
+ *
+ * Experiment: EXT_TILE
+ */
+ AV1E_SET_SINGLE_TILE_DECODING,
+
+ /*!\brief Codec control function to enable the extreme motion vector unit test
+ * in AV1. Please note that this is only used in motion vector unit test.
+ *
+ * 0 : off, 1 : MAX_EXTREME_MV, 2 : MIN_EXTREME_MV
+ */
+ AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST,
+
+ /*!\brief Codec control function to signal picture timing info in the
+ * bitstream.
+ * \note Valid ranges: 0..1, default is "UNKNOWN".
+ * 0 = UNKNOWN, 1 = EQUAL
+ */
+ AV1E_SET_TIMING_INFO_TYPE,
+
+ /*!\brief Codec control function to add film grain parameters (one of several
+ * preset types) info in the bitstream.
+ * \note Valid range: 0..16, default is "0".
+ * 0 = UNKNOWN, 1..16 = different test vectors for grain
+ */
+ AV1E_SET_FILM_GRAIN_TEST_VECTOR,
+
+ /*!\brief Codec control function to set the path to the film grain parameters
+ */
+ AV1E_SET_FILM_GRAIN_TABLE,
+
+ /*!\brief Sets the noise level */
+ AV1E_SET_DENOISE_NOISE_LEVEL,
+
+ /*!\brief Sets the denoiser's block size */
+ AV1E_SET_DENOISE_BLOCK_SIZE,
+
+ /*!\brief Sets the chroma subsampling x value */
+ AV1E_SET_CHROMA_SUBSAMPLING_X,
+
+ /*!\brief Sets the chroma subsampling y value */
+ AV1E_SET_CHROMA_SUBSAMPLING_Y,
+};
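As a usage sketch only (not part of the patch above): controls from this enum are issued through the public aom_codec_control() wrapper on an encoder context. The context name, initialization and chosen values below are illustrative assumptions, not taken from this diff.

    #include "aom/aomcx.h"

    /* 'enc' is assumed to have been initialized elsewhere, e.g. with
     * aom_codec_enc_init(&enc, aom_codec_av1_cx(), &cfg, 0). */
    static void apply_example_controls(aom_codec_ctx_t *enc) {
      aom_codec_control(enc, AV1E_SET_COLOR_RANGE, 1);                  /* full range */
      aom_codec_control(enc, AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN); /* screen capture */
      aom_codec_control(enc, AOME_SET_CQ_LEVEL, 30);                    /* illustrative CQ level */
    }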
+
+/*!\brief aom 1-D scaling mode
+ *
+ * This set of constants defines 1-D aom scaling modes
+ */
+typedef enum aom_scaling_mode_1d {
+ AOME_NORMAL = 0,
+ AOME_FOURFIVE = 1,
+ AOME_THREEFIVE = 2,
+ AOME_ONETWO = 3
+} AOM_SCALING_MODE;
+
+/*!\brief Max number of segments
+ *
+ * This is the limit of number of segments allowed within a frame.
+ *
+ * Currently same as "MAX_SEGMENTS" in AV1, the maximum that AV1 supports.
+ *
+ */
+#define AOM_MAX_SEGMENTS 8
+
+/*!\brief aom region of interest map
+ *
+ * This defines the data structure for the region of interest map
+ *
+ * TODO(yaowu): create a unit test for ROI map related APIs
+ *
+ */
+typedef struct aom_roi_map {
+ /*! An id between 0 and 7 for each 8x8 region within a frame. */
+ unsigned char *roi_map;
+ unsigned int rows; /**< Number of rows. */
+ unsigned int cols; /**< Number of columns. */
+ int delta_q[AOM_MAX_SEGMENTS]; /**< Quantizer deltas. */
+ int delta_lf[AOM_MAX_SEGMENTS]; /**< Loop filter deltas. */
+ /*! Static breakout threshold for each segment. */
+ unsigned int static_threshold[AOM_MAX_SEGMENTS];
+} aom_roi_map_t;
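A minimal sketch of how this structure is meant to be filled and handed to the encoder via AOME_SET_ROI_MAP (declared further down in this header). The frame size, deltas and context name are assumptions, and whether the AV1 encoder acts on the map is not established by this header alone.

    #include <stdlib.h>
    #include <string.h>
    #include "aom/aomcx.h"

    /* Mark every 8x8 block of a 640x480 frame as segment 0 and give segment 1
     * a hypothetical quality boost.  'enc' is an initialized encoder context. */
    static aom_codec_err_t set_example_roi(aom_codec_ctx_t *enc) {
      aom_roi_map_t roi;
      memset(&roi, 0, sizeof(roi));
      roi.cols = 640 / 8;
      roi.rows = 480 / 8;
      roi.roi_map = (unsigned char *)calloc(roi.rows * roi.cols, 1);
      if (!roi.roi_map) return AOM_CODEC_MEM_ERROR;
      roi.delta_q[1] = -10; /* illustrative quantizer delta for segment 1 */
      aom_codec_err_t res = aom_codec_control(enc, AOME_SET_ROI_MAP, &roi);
      free(roi.roi_map);
      return res;
    }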
+
+/*!\brief aom active region map
+ *
+ * This defines the data structure for the active region map
+ *
+ */
+
+typedef struct aom_active_map {
+ /*!\brief Specify on (1) or off (0) for each 16x16 region within a frame */
+ unsigned char *active_map;
+ unsigned int rows; /**< number of rows */
+ unsigned int cols; /**< number of cols */
+} aom_active_map_t;
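A sketch of driving this structure through AOME_SET_ACTIVEMAP. The frame dimensions and the "skip the bottom half" policy are illustrative assumptions.

    #include <stdlib.h>
    #include "aom/aomcx.h"

    /* Mark the top half of a 640x480 frame active and the bottom half inactive.
     * 'enc' is assumed to be an initialized encoder context. */
    static aom_codec_err_t set_example_active_map(aom_codec_ctx_t *enc) {
      aom_active_map_t map;
      map.cols = (640 + 15) / 16;
      map.rows = (480 + 15) / 16;
      map.active_map = (unsigned char *)malloc(map.rows * map.cols);
      if (!map.active_map) return AOM_CODEC_MEM_ERROR;
      for (unsigned int r = 0; r < map.rows; ++r)
        for (unsigned int c = 0; c < map.cols; ++c)
          map.active_map[r * map.cols + c] = (r < map.rows / 2) ? 1 : 0;
      aom_codec_err_t res = aom_codec_control(enc, AOME_SET_ACTIVEMAP, &map);
      free(map.active_map);
      return res;
    }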
+
+/*!\brief aom image scaling mode
+ *
+ * This defines the data structure for image scaling mode
+ *
+ */
+typedef struct aom_scaling_mode {
+ AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */
+ AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */
+} aom_scaling_mode_t;
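For completeness, a tiny sketch of passing this structure through AOME_SET_SCALEMODE; the chosen ratios and the encoder context are assumptions.

    #include "aom/aomcx.h"

    /* Request 4/5 horizontal and 3/5 vertical internal scaling on an
     * already-initialized encoder context 'enc'. */
    static aom_codec_err_t set_example_scaling(aom_codec_ctx_t *enc) {
      aom_scaling_mode_t scaling = { AOME_FOURFIVE, AOME_THREEFIVE };
      return aom_codec_control(enc, AOME_SET_SCALEMODE, &scaling);
    }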
+
+/*!\brief AV1 encoder content type */
+typedef enum {
+ AOM_CONTENT_DEFAULT,
+ AOM_CONTENT_SCREEN,
+ AOM_CONTENT_INVALID
+} aom_tune_content;
+
+/*!\brief AV1 encoder timing info type signaling */
+typedef enum {
+ AOM_TIMING_UNSPECIFIED,
+ AOM_TIMING_EQUAL,
+ AOM_TIMING_DEC_MODEL
+} aom_timing_info_type_t;
+
+/*!\brief Model tuning parameters
+ *
+ * Changes the encoder to tune for certain types of input material.
+ *
+ */
+typedef enum {
+ AOM_TUNE_PSNR,
+ AOM_TUNE_SSIM,
+ AOM_TUNE_CDEF_DIST,
+ AOM_TUNE_DAALA_DIST
+} aom_tune_metric;
+
+/*!\cond */
+/*!\brief Encoder control function parameter type
+ *
+ * Defines the data types that AOME/AV1E control functions take. Note that
+ * additional common controls are defined in aom.h
+ *
+ */
+
+AOM_CTRL_USE_TYPE(AOME_USE_REFERENCE, int)
+#define AOM_CTRL_AOME_USE_REFERENCE
+AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
+#define AOM_CTRL_AOME_SET_ROI_MAP
+AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
+#define AOM_CTRL_AOME_SET_ACTIVEMAP
+AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
+#define AOM_CTRL_AOME_SET_SCALEMODE
+
+AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, int)
+#define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID
+
+AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
+#define AOM_CTRL_AOME_SET_CPUUSED
+AOM_CTRL_USE_TYPE(AOME_SET_DEVSF, int)
+#define AOM_CTRL_AOME_SET_DEVSF
+AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
+#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
+
+AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
+#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
+
+AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
+#define AOM_CTRL_AOME_SET_SHARPNESS
+AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int)
+#define AOM_CTRL_AOME_SET_STATIC_THRESHOLD
+
+AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int)
+#define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES
+AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int)
+#define AOM_CTRL_AOME_SET_ARNR_STRENGTH
+AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
+#define AOM_CTRL_AOME_SET_TUNING
+AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
+#define AOM_CTRL_AOME_SET_CQ_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, int)
+#define AOM_CTRL_AV1E_SET_ROW_MT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
+#define AOM_CTRL_AV1E_SET_TILE_COLUMNS
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
+#define AOM_CTRL_AV1E_SET_TILE_ROWS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_DEPENDENT_ROWS, int)
+#define AOM_CTRL_AV1E_SET_TILE_DEPENDENT_ROWS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_LOOPFILTER_V, int)
+#define AOM_CTRL_AV1E_SET_TILE_LOOPFILTER_V
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_LOOPFILTER_H, int)
+#define AOM_CTRL_AV1E_SET_TILE_LOOPFILTER_H
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_LOOPFILTER, int)
+#define AOM_CTRL_AV1E_SET_TILE_LOOPFILTER
+
+AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *)
+#define AOM_CTRL_AOME_GET_LAST_QUANTIZER
+AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *)
+#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64
+
+AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT
+AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT
+
+AOM_CTRL_USE_TYPE(AOME_SET_NUMBER_SPATIAL_LAYERS, int)
+#define AOME_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int)
+#define AOM_CTRL_AV1E_SET_LOSSLESS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_CDEF
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int)
+#define AOM_CTRL_AV1E_SET_DISABLE_TRELLIS_QUANT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_QM
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_8X8, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIST_8X8
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_MIN
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_MAX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_Y, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_Y
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_U, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_U
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QM_V, unsigned int)
+#define AOM_CTRL_AV1E_SET_QM_V
+
+AOM_CTRL_USE_TYPE(AV1E_SET_NUM_TG, unsigned int)
+#define AOM_CTRL_AV1E_SET_NUM_TG
+AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int)
+#define AOM_CTRL_AV1E_SET_MTU
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, aom_timing_info_type_t)
+#define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DF, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DF
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ORDER_HINT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_JNT_COMP, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_JNT_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_REF_FRAME_MVS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, unsigned int)
+#define AOM_CTRL_AV1E_SET_ALLOW_REF_FRAME_MVS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_WARPED_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, unsigned int)
+#define AOM_CTRL_AV1E_SET_ALLOW_WARPED_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
+#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_S_FRAME_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_AQ_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DELTAQ_MODE, unsigned int)
+#define AOM_CTRL_AV1E_SET_DELTAQ_MODE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int)
+#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST
+
+AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int)
+#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */
+#define AOM_CTRL_AV1E_SET_TUNE_CONTENT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_PRIMARIES, int)
+#define AOM_CTRL_AV1E_SET_COLOR_PRIMARIES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_CHARACTERISTICS, int)
+#define AOM_CTRL_AV1E_SET_TRANSFER_CHARACTERISTICS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MATRIX_COEFFICIENTS, int)
+#define AOM_CTRL_AV1E_SET_MATRIX_COEFFICIENTS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SAMPLE_POSITION, int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SAMPLE_POSITION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int)
+#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int)
+#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL
+
+AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *)
+#define AOM_CTRL_AV1E_GET_ACTIVEMAP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
+#define AOM_CTRL_AV1E_SET_COLOR_RANGE
+
+#define AOM_CTRL_AV1E_SET_RENDER_SIZE
+AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
+#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
+#define AOM_CTRL_AV1E_SET_TARGET_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
+#define AOM_CTRL_AV1E_GET_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ANS_WINDOW_SIZE_LOG2, unsigned int)
+#define AOM_CTRL_AV1E_SET_ANS_WINDOW_SIZE_LOG2
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
+#define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING
+
+AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
+#define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, unsigned int)
+#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR
+
+AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *)
+#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, int)
+#define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
+
+#ifdef CONFIG_DENOISE
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int);
+#define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int);
+#define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE
+#endif
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_X
+
+AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int)
+#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_Y
+
+/*!\endcond */
+/*! @} - end defgroup aom_encoder */
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOMCX_H_
diff --git a/third_party/aom/aom/aomdx.h b/third_party/aom/aom/aomdx.h
new file mode 100644
index 000000000..765856a1b
--- /dev/null
+++ b/third_party/aom/aom/aomdx.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder
+ * \ingroup aom
+ *
+ * @{
+ */
+/*!\file
+ * \brief Provides definitions for using AOM or AV1 within the aom Decoder
+ * interface.
+ */
+#ifndef AOM_AOM_AOMDX_H_
+#define AOM_AOM_AOMDX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Include controls common to both the encoder and decoder */
+#include "aom/aom.h"
+
+/*!\name Algorithm interface for AV1
+ *
+ * This interface provides the capability to decode AV1 streams.
+ * @{
+ */
+extern aom_codec_iface_t aom_codec_av1_dx_algo;
+extern aom_codec_iface_t *aom_codec_av1_dx(void);
+/*!@} - end algorithm interface member group*/
+
+/** Data structure that stores bit accounting for debug
+ */
+typedef struct Accounting Accounting;
+
+#ifndef AOM_INSPECTION_H_
+/** Callback that inspects decoder frame data.
+ */
+typedef void (*aom_inspect_cb)(void *decoder, void *ctx);
+#endif
+
+/*!\brief Structure to hold inspection callback and context.
+ *
+ * Defines a structure to hold the inspection callback function and calling
+ * context.
+ */
+typedef struct aom_inspect_init {
+ /*! Inspection callback. */
+ aom_inspect_cb inspect_cb;
+
+ /*! Inspection context. */
+ void *inspect_ctx;
+} aom_inspect_init;
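A hedged sketch of wiring this structure to a decoder through AV1_SET_INSPECTION_CALLBACK (documented further down in this header). Per that control's description, it is only effective in builds with inspection support and otherwise returns AOM_CODEC_INCAPABLE; the callback body and names are illustrative.

    #include <stdio.h>
    #include "aom/aomdx.h"

    /* Called once per decoded frame in inspection-enabled builds. */
    static void my_inspect(void *decoder, void *ctx) {
      (void)decoder;
      fprintf((FILE *)ctx, "frame decoded\n");
    }

    static void register_inspection(aom_codec_ctx_t *dec) {
      static aom_inspect_init init;
      init.inspect_cb = my_inspect;
      init.inspect_ctx = (void *)stderr;
      aom_codec_control(dec, AV1_SET_INSPECTION_CALLBACK, &init);
    }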
+
+/*!\brief Structure to hold a tile's start address and size in the bitstream.
+ *
+ * Defines a structure to hold a tile's start address and size in the bitstream.
+ */
+typedef struct aom_tile_data {
+ /*! Tile data size. */
+ size_t coded_tile_data_size;
+ /*! Tile's start address. */
+ const void *coded_tile_data;
+ /*! Extra size information. */
+ size_t extra_size;
+} aom_tile_data;
+
+/*!\brief Structure to hold the external reference frame pointer.
+ *
+ * Define a structure to hold the external reference frame pointer.
+ */
+typedef struct av1_ext_ref_frame {
+ /*! Start pointer of external references. */
+ aom_image_t *img;
+ /*! Number of available external references. */
+ int num;
+} av1_ext_ref_frame_t;
+
+/*!\enum aom_dec_control_id
+ * \brief AOM decoder control functions
+ *
+ * This set of macros defines the control functions available for the AOM
+ * decoder interface.
+ *
+ * \sa #aom_codec_control
+ */
+enum aom_dec_control_id {
+ /** control function to get info on which reference frames were updated
+ * by the last decode
+ */
+ AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START,
+
+ /** check if the indicated frame is corrupted */
+ AOMD_GET_FRAME_CORRUPTED,
+
+ /** control function to get info on which reference frames were used
+ * by the last decode
+ */
+ AOMD_GET_LAST_REF_USED,
+
+ /** control function to get the dimensions that the current frame is decoded
+ * at. This may be different to the intended display size for the frame as
+ * specified in the wrapper or frame header (see AV1D_GET_DISPLAY_SIZE). */
+ AV1D_GET_FRAME_SIZE,
+
+ /** control function to get the current frame's intended display dimensions
+ * (as specified in the wrapper or frame header). This may be different to
+ * the decoded dimensions of this frame (see AV1D_GET_FRAME_SIZE). */
+ AV1D_GET_DISPLAY_SIZE,
+
+ /** control function to get the bit depth of the stream. */
+ AV1D_GET_BIT_DEPTH,
+
+ /** control function to get the image format of the stream. */
+ AV1D_GET_IMG_FORMAT,
+
+ /** control function to get the size of the tile. */
+ AV1D_GET_TILE_SIZE,
+
+ /** control function to set the byte alignment of the planes in the reference
+ * buffers. Valid values are power of 2, from 32 to 1024. A value of 0 sets
+ * legacy alignment. I.e. Y plane is aligned to 32 bytes, U plane directly
+ * follows Y plane, and V plane directly follows U plane. Default value is 0.
+ */
+ AV1_SET_BYTE_ALIGNMENT,
+
+ /** control function to invert the decoding order from right to left. The
+ * function is used in a test to confirm the decoding independence of tile
+ * columns. The function may be used in applications where this order
+ * of decoding is desired.
+ *
+ * TODO(yaowu): Rework the unit test that uses this control, and in a future
+ * release, this test-only control shall be removed.
+ */
+ AV1_INVERT_TILE_DECODE_ORDER,
+
+ /** control function to set the skip loop filter flag. Valid values are
+ * integers. The decoder will skip the loop filter when its value is set to
+ * nonzero. If the loop filter is skipped the decoder may accumulate decode
+ * artifacts. The default value is 0.
+ */
+ AV1_SET_SKIP_LOOP_FILTER,
+
+ /** control function to retrieve a pointer to the Accounting struct. When
+ * compiled without --enable-accounting, this returns AOM_CODEC_INCAPABLE.
+ * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
+ * The caller should ensure that AOM_CODEC_OK is returned before attempting
+ * to dereference the Accounting pointer.
+ */
+ AV1_GET_ACCOUNTING,
+
+ /** control function to get last decoded frame quantizer. Returned value uses
+ * internal quantizer scale defined by the codec.
+ */
+ AOMD_GET_LAST_QUANTIZER,
+
+ /** control function to set the range of tile decoding. A value that is
+ * greater than or equal to zero indicates that only the specified row/column
+ * is decoded. A value of -1 indicates that the whole row/column is decoded.
+ * As a special case, when both values are -1 the whole frame is
+ * decoded.
+ */
+ AV1_SET_DECODE_TILE_ROW,
+ AV1_SET_DECODE_TILE_COL,
+ /** control function to set the tile coding mode. A value that is equal to
+ * zero indicates the tiles are coded in normal tile mode. A value that is
+ * 1 indicates the tiles are coded in large-scale tile mode.
+ */
+ AV1_SET_TILE_MODE,
+ /** control function to get the frame header information of an encoded frame
+ * in the bitstream. This provides a way to access a frame's header data.
+ */
+ AV1D_GET_FRAME_HEADER_INFO,
+ /** control function to get the start address and size of a tile in the coded
+ * bitstream. This provides a way to access a specific tile's bitstream data.
+ */
+ AV1D_GET_TILE_DATA,
+ /** control function to set the external references' pointers in the decoder.
+ * This is used while decoding the tile list OBU in large-scale tile coding
+ * mode.
+ */
+ AV1D_SET_EXT_REF_PTR,
+ /** control function to enable the ext-tile software debug and testing code in
+ * the decoder.
+ */
+ AV1D_EXT_TILE_DEBUG,
+
+ /** control function to enable the row based multi-threading of decoding. A
+ * value that is equal to 1 indicates that row based multi-threading is
+ * enabled.
+ */
+ AV1D_SET_ROW_MT,
+
+ /** control function to indicate whether bitstream is in Annex-B format. */
+ AV1D_SET_IS_ANNEXB,
+
+ /** control function to indicate which operating point to use. A scalable
+ * stream may define multiple operating points, each of which defines a
+ * set of temporal and spatial layers to be processed. The operating point
+ * index may take a value between 0 and operating_points_cnt_minus_1 (which
+ * is at most 31).
+ */
+ AV1D_SET_OPERATING_POINT,
+
+ /** control function to indicate whether to output one frame per temporal
+ * unit (the default), or one frame per spatial layer.
+ * In a scalable stream, each temporal unit corresponds to a single "frame"
+ * of video, and within a temporal unit there may be multiple spatial layers
+ * with different versions of that frame.
+ * For video playback, only the highest-quality version (within the
+ * selected operating point) is needed, but for some use cases it is useful
+ * to have access to multiple versions of a frame when they are available.
+ */
+ AV1D_SET_OUTPUT_ALL_LAYERS,
+
+ /** control function to set an aom_inspect_cb callback that is invoked each
+ * time a frame is decoded. When compiled without --enable-inspection, this
+ * returns AOM_CODEC_INCAPABLE.
+ */
+ AV1_SET_INSPECTION_CALLBACK,
+
+ /** control function to set the skip film grain flag. Valid values are
+ * integers. The decoder will skip the film grain when its value is set to
+ * nonzero. The default value is 0.
+ */
+ AV1D_SET_SKIP_FILM_GRAIN,
+
+ AOM_DECODER_CTRL_ID_MAX,
+};
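A usage sketch only (not part of the diff): querying a few of the decoder controls above after a successful aom_codec_decode() call. The decoder context is assumed to be initialized with aom_codec_av1_dx(), and the two-element frame_size convention for AV1D_GET_FRAME_SIZE is an assumption made for illustration.

    #include <stdio.h>
    #include "aom/aomdx.h"

    static void report_decoder_state(aom_codec_ctx_t *dec) {
      int corrupted = 0;
      unsigned int bit_depth = 0;
      int frame_size[2] = { 0, 0 };
      if (aom_codec_control(dec, AOMD_GET_FRAME_CORRUPTED, &corrupted) == AOM_CODEC_OK)
        printf("corrupted: %d\n", corrupted);
      if (aom_codec_control(dec, AV1D_GET_BIT_DEPTH, &bit_depth) == AOM_CODEC_OK)
        printf("bit depth: %u\n", bit_depth);
      if (aom_codec_control(dec, AV1D_GET_FRAME_SIZE, frame_size) == AOM_CODEC_OK)
        printf("decoded size: %dx%d\n", frame_size[0], frame_size[1]);
    }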
+
+/*!\cond */
+/*!\brief AOM decoder control function parameter type
+ *
+ * Defines the data types that AOMD control functions take. Note that
+ * additional common controls are defined in aom.h
+ *
+ */
+
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *)
+#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES
+AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *)
+#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *)
+#define AOM_CTRL_AOMD_GET_LAST_REF_USED
+AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *)
+#define AOM_CTRL_AOMD_GET_LAST_QUANTIZER
+AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *)
+#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE
+AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *)
+#define AOM_CTRL_AV1D_GET_BIT_DEPTH
+AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *)
+#define AOM_CTRL_AV1D_GET_IMG_FORMAT
+AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *)
+#define AOM_CTRL_AV1D_GET_TILE_SIZE
+AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *)
+#define AOM_CTRL_AV1D_GET_FRAME_SIZE
+AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
+#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
+AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
+#define AOM_CTRL_AV1_GET_ACCOUNTING
+AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
+#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
+AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
+#define AOM_CTRL_AV1_SET_DECODE_TILE_COL
+AOM_CTRL_USE_TYPE(AV1_SET_TILE_MODE, unsigned int)
+#define AOM_CTRL_AV1_SET_TILE_MODE
+AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_HEADER_INFO, aom_tile_data *)
+#define AOM_CTRL_AV1D_GET_FRAME_HEADER_INFO
+AOM_CTRL_USE_TYPE(AV1D_GET_TILE_DATA, aom_tile_data *)
+#define AOM_CTRL_AV1D_GET_TILE_DATA
+AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *)
+#define AOM_CTRL_AV1D_SET_EXT_REF_PTR
+AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int)
+#define AOM_CTRL_AV1D_EXT_TILE_DEBUG
+AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int)
+#define AOM_CTRL_AV1D_SET_ROW_MT
+AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int)
+#define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN
+AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int)
+#define AOM_CTRL_AV1D_SET_IS_ANNEXB
+AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int)
+#define AOM_CTRL_AV1D_SET_OPERATING_POINT
+AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int)
+#define AOM_CTRL_AV1D_SET_OUTPUT_ALL_LAYERS
+AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *)
+#define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK
+/*!\endcond */
+/*! @} - end defgroup aom_decoder */
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_AOMDX_H_
diff --git a/third_party/aom/aom/exports_com b/third_party/aom/aom/exports_com
new file mode 100644
index 000000000..2798bd51a
--- /dev/null
+++ b/third_party/aom/aom/exports_com
@@ -0,0 +1,32 @@
+text aom_codec_build_config
+text aom_codec_control_
+text aom_codec_destroy
+text aom_codec_err_to_string
+text aom_codec_error
+text aom_codec_error_detail
+text aom_codec_get_caps
+text aom_codec_iface_name
+text aom_codec_version
+text aom_codec_version_extra_str
+text aom_codec_version_str
+text aom_img_alloc
+text aom_img_alloc_with_border
+text aom_img_flip
+text aom_img_free
+text aom_img_plane_height
+text aom_img_plane_width
+text aom_img_set_rect
+text aom_img_wrap
+text aom_malloc
+text aom_rb_bytes_read
+text aom_rb_read_bit
+text aom_rb_read_literal
+text aom_rb_read_uvlc
+text aom_uleb_decode
+text aom_uleb_encode
+text aom_uleb_encode_fixed_size
+text aom_uleb_size_in_bytes
+text aom_wb_bytes_written
+text aom_wb_write_bit
+text aom_wb_write_literal
+text aom_wb_write_unsigned_literal
diff --git a/third_party/aom/aom/exports_dec b/third_party/aom/aom/exports_dec
new file mode 100644
index 000000000..d7d1c4f7d
--- /dev/null
+++ b/third_party/aom/aom/exports_dec
@@ -0,0 +1,10 @@
+text aom_codec_dec_init_ver
+text aom_codec_decode
+text aom_codec_get_frame
+text aom_codec_get_stream_info
+text aom_codec_peek_stream_info
+text aom_codec_register_put_frame_cb
+text aom_codec_register_put_slice_cb
+text aom_codec_set_frame_buffer_functions
+text aom_obu_type_to_string
+text aom_read_obu_header
diff --git a/third_party/aom/aom/exports_enc b/third_party/aom/aom/exports_enc
new file mode 100644
index 000000000..918d742f0
--- /dev/null
+++ b/third_party/aom/aom/exports_enc
@@ -0,0 +1,18 @@
+text aom_codec_enc_config_default
+text aom_codec_enc_config_set
+text aom_codec_enc_init_multi_ver
+text aom_codec_enc_init_ver
+text aom_codec_encode
+text aom_codec_get_cx_data
+text aom_codec_get_global_headers
+text aom_codec_get_preview_frame
+text aom_codec_set_cx_data_buf
+text aom_film_grain_table_append
+text aom_film_grain_table_free
+text aom_film_grain_table_write
+text aom_flat_block_finder_init
+text aom_flat_block_finder_run
+text aom_noise_model_init
+text aom_noise_model_get_grain_parameters
+text aom_noise_model_save_latest
+text aom_noise_model_update
diff --git a/third_party/aom/aom/exports_test b/third_party/aom/aom/exports_test
new file mode 100644
index 000000000..01b864bae
--- /dev/null
+++ b/third_party/aom/aom/exports_test
@@ -0,0 +1,2 @@
+text aom_dsp_rtcd
+text aom_scale_rtcd
diff --git a/third_party/aom/aom/internal/aom_codec_internal.h b/third_party/aom/aom/internal/aom_codec_internal.h
new file mode 100644
index 000000000..21c0dc69c
--- /dev/null
+++ b/third_party/aom/aom/internal/aom_codec_internal.h
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the decoder algorithm interface for algorithm
+ * implementations.
+ *
+ * This file defines the private structures and data types that are only
+ * relevant to implementing an algorithm, as opposed to using it.
+ *
+ * To create a decoder algorithm class, an interface structure is put
+ * into the global namespace:
+ * <pre>
+ * my_codec.c:
+ * aom_codec_iface_t my_codec = {
+ * "My Codec v1.0",
+ * AOM_CODEC_ALG_ABI_VERSION,
+ * ...
+ * };
+ * </pre>
+ *
+ * An application instantiates a specific decoder instance by using
+ * aom_codec_init() and a pointer to the algorithm's interface structure:
+ * <pre>
+ * my_app.c:
+ * extern aom_codec_iface_t my_codec;
+ * {
+ * aom_codec_ctx_t algo;
+ * res = aom_codec_init(&algo, &my_codec);
+ * }
+ * </pre>
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the aom_codec_* family.
+ */
+#ifndef AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#define AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#include "../aom_decoder.h"
+#include "../aom_encoder.h"
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define AOM_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
+
+typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
+typedef struct aom_codec_priv_enc_mr_cfg aom_codec_priv_enc_mr_cfg_t;
+
+/*!\brief init function pointer prototype
+ *
+ * Performs algorithm-specific initialization of the decoder context. This
+ * function is called by the generic aom_codec_init() wrapper function, so
+ * plugins implementing this interface may trust the input parameters to be
+ * properly initialized.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \retval #AOM_CODEC_OK
+ * The input stream was recognized and decoder initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory operation failed.
+ */
+typedef aom_codec_err_t (*aom_codec_init_fn_t)(
+ aom_codec_ctx_t *ctx, aom_codec_priv_enc_mr_cfg_t *data);
+
+/*!\brief destroy function pointer prototype
+ *
+ * Performs algorithm-specific destruction of the decoder context. This
+ * function is called by the generic aom_codec_destroy() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \retval #AOM_CODEC_OK
+ * The input stream was recognized and decoder initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ * Memory operation failed.
+ */
+typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx);
+
+/*!\brief parse stream info function pointer prototype
+ *
+ * Performs high level parsing of the bitstream. This function is called by the
+ * generic aom_codec_peek_stream_info() wrapper function, so plugins
+ * implementing this interface may trust the input parameters to be properly
+ * initialized.
+ *
+ * \param[in] data Pointer to a block of data to parse
+ * \param[in] data_sz Size of the data buffer
+ * \param[in,out] si Pointer to stream info to update. The is_annexb
+ * member \ref MUST be properly initialized. This
+ * function sets the rest of the members.
+ *
+ * \retval #AOM_CODEC_OK
+ * Bitstream is parsable and stream information updated
+ */
+typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
+ size_t data_sz,
+ aom_codec_stream_info_t *si);
+
+/*!\brief Return information about the current stream.
+ *
+ * Returns information about the stream that has been parsed during decoding.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] si Pointer to stream info to update
+ *
+ * \retval #AOM_CODEC_OK
+ * Bitstream is parsable and stream information updated
+ */
+typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx,
+ aom_codec_stream_info_t *si);
+
+/*!\brief control function pointer prototype
+ *
+ * This function is used to exchange algorithm specific data with the decoder
+ * instance. This can be used to implement features specific to a particular
+ * algorithm.
+ *
+ * This function is called by the generic aom_codec_control() wrapper
+ * function, so plugins implementing this interface may trust the input
+ * parameters to be properly initialized. However, this interface does not
+ * provide type safety for the exchanged data or assign meanings to the
+ * control codes. Those details should be specified in the algorithm's
+ * header file. In particular, the ctrl_id parameter is guaranteed to exist
+ * in the algorithm's control mapping table, and the data parameter may be NULL.
+ *
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] ctrl_id Algorithm specific control identifier
+ * \param[in,out] data Data to exchange with algorithm instance.
+ *
+ * \retval #AOM_CODEC_OK
+ * The internal state data was deserialized.
+ */
+typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx,
+ va_list ap);
+
+/*!\brief control function pointer mapping
+ *
+ * This structure stores the mapping between control identifiers and
+ * implementing functions. Each algorithm provides a list of these
+ * mappings. This list is searched by the aom_codec_control() wrapper
+ * function to determine which function to invoke. The special
+ * value {0, NULL} is used to indicate end-of-list, and must be
+ * present. The special value {0, <non-null>} can be used as a catch-all
+ * mapping. This implies that ctrl_id values chosen by the algorithm
+ * \ref MUST be non-zero.
+ */
+typedef const struct aom_codec_ctrl_fn_map {
+ int ctrl_id;
+ aom_codec_control_fn_t fn;
+} aom_codec_ctrl_fn_map_t;
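A sketch of the table an algorithm implementation might export under this convention. The handler is a do-nothing placeholder and the control ID is borrowed from aomcx.h purely for illustration; only the non-zero ctrl_id rule and the { 0, NULL } sentinel come from the comment above.

    /* Within a codec implementation file that includes this header and aomcx.h. */
    static aom_codec_err_t ctrl_example_noop(aom_codec_alg_priv_t *ctx, va_list args) {
      (void)ctx;
      (void)args;
      return AOM_CODEC_OK;
    }

    static aom_codec_ctrl_fn_map_t example_ctrl_maps[] = {
      { AOME_SET_CQ_LEVEL, ctrl_example_noop },
      { 0, NULL } /* end-of-list sentinel (required) */
    };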
+
+/*!\brief decode data function pointer prototype
+ *
+ * Processes a buffer of coded data. If the processing results in a new
+ * decoded frame becoming available, #AOM_CODEC_CB_PUT_SLICE and
+ * #AOM_CODEC_CB_PUT_FRAME events are generated as appropriate. This
+ * function is called by the generic aom_codec_decode() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] data Pointer to this block of new coded data. If
+ * NULL, a #AOM_CODEC_CB_PUT_FRAME event is posted
+ * for the previously decoded frame.
+ * \param[in] data_sz Size of the coded data, in bytes.
+ *
+ * \return Returns #AOM_CODEC_OK if the coded data was processed completely
+ * and future pictures can be decoded without error. Otherwise,
+ * see the descriptions of the other error codes in ::aom_codec_err_t
+ * for recoverability capabilities.
+ */
+typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx,
+ const uint8_t *data,
+ size_t data_sz,
+ void *user_priv);
+
+/*!\brief Decoded frames iterator
+ *
+ * Iterates over a list of the frames available for display. The iterator
+ * storage should be initialized to NULL to start the iteration. Iteration is
+ * complete when this function returns NULL.
+ *
+ * The list of available frames becomes valid upon completion of the
+ * aom_codec_decode call, and remains valid until the next call to
+ * aom_codec_decode.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in,out] iter Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an image, if one is ready for display. Frames
+ * produced will always be in PTS (presentation time stamp) order.
+ */
+typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx,
+ aom_codec_iter_t *iter);
+
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libaom needs a frame buffer
+ * to decode the current frame and a function to be called when libaom does
+ * not internally reference the frame buffer. This set function must
+ * be called before the first call to decode or libaom will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] cb_get Pointer to the get callback function
+ * \param[in] cb_release Pointer to the release callback function
+ * \param[in] cb_priv Callback's private data
+ *
+ * \retval #AOM_CODEC_OK
+ * External frame buffers will be used by libaom.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ * One or more of the callbacks were NULL.
+ * \retval #AOM_CODEC_ERROR
+ * Decoder context not initialized, or algorithm not capable of
+ * using external frame buffers.
+ *
+ * \note
+ * When decoding AV1, the application may be required to pass in at least
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame
+ * buffers.
+ */
+typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)(
+ aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
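A hedged sketch of the callback pair an application would register through the public aom_codec_set_frame_buffer_functions() wrapper. The callback signatures are assumed from aom/aom_frame_buffer.h; a real application would typically pool and reuse buffers rather than malloc/free per frame.

    #include <stdlib.h>
    #include "aom/aom_decoder.h"
    #include "aom/aom_frame_buffer.h"

    /* Minimal heap-backed frame buffer callbacks (illustrative). */
    static int example_get_fb(void *priv, size_t min_size, aom_codec_frame_buffer_t *fb) {
      (void)priv;
      fb->data = (uint8_t *)calloc(1, min_size);
      if (!fb->data) return -1;
      fb->size = min_size;
      fb->priv = NULL;
      return 0;
    }

    static int example_release_fb(void *priv, aom_codec_frame_buffer_t *fb) {
      (void)priv;
      free(fb->data);
      fb->data = NULL;
      fb->size = 0;
      return 0;
    }

    /* Registered once, before the first aom_codec_decode() call:
     *   aom_codec_set_frame_buffer_functions(&dec, example_get_fb,
     *                                        example_release_fb, NULL);
     */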
+
+typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx,
+ const aom_image_t *img,
+ aom_codec_pts_t pts,
+ unsigned long duration,
+ aom_enc_frame_flags_t flags);
+typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)(
+ aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter);
+
+typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)(
+ aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg);
+typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)(
+ aom_codec_alg_priv_t *ctx);
+
+typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
+ aom_codec_alg_priv_t *ctx);
+
+typedef aom_codec_err_t (*aom_codec_enc_mr_get_mem_loc_fn_t)(
+ const aom_codec_enc_cfg_t *cfg, void **mem_loc);
+
+/*!\brief usage configuration mapping
+ *
+ * This structure stores the mapping between usage identifiers and
+ * configuration structures. Each algorithm provides a list of these
+ * mappings. This list is searched by the aom_codec_enc_config_default()
+ * wrapper function to determine which config to return. The special value
+ * {-1, {0}} is used to indicate end-of-list, and must be present. At least
+ * one mapping must be present, in addition to the end-of-list.
+ *
+ */
+typedef const struct aom_codec_enc_cfg_map {
+ int usage;
+ aom_codec_enc_cfg_t cfg;
+} aom_codec_enc_cfg_map_t;
+
+/*!\brief Decoder algorithm interface
+ *
+ * All decoders \ref MUST expose a variable of this type.
+ */
+struct aom_codec_iface {
+ const char *name; /**< Identification String */
+ int abi_version; /**< Implemented ABI version */
+ aom_codec_caps_t caps; /**< Decoder capabilities */
+ aom_codec_init_fn_t init; /**< \copydoc ::aom_codec_init_fn_t */
+ aom_codec_destroy_fn_t destroy; /**< \copydoc ::aom_codec_destroy_fn_t */
+ aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */
+ struct aom_codec_dec_iface {
+ aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */
+ aom_codec_get_si_fn_t get_si; /**< \copydoc ::aom_codec_get_si_fn_t */
+ aom_codec_decode_fn_t decode; /**< \copydoc ::aom_codec_decode_fn_t */
+ aom_codec_get_frame_fn_t
+ get_frame; /**< \copydoc ::aom_codec_get_frame_fn_t */
+ aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
+ } dec;
+ struct aom_codec_enc_iface {
+ int cfg_map_count;
+ aom_codec_enc_cfg_map_t
+ *cfg_maps; /**< \copydoc ::aom_codec_enc_cfg_map_t */
+ aom_codec_encode_fn_t encode; /**< \copydoc ::aom_codec_encode_fn_t */
+ aom_codec_get_cx_data_fn_t
+ get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
+ aom_codec_enc_config_set_fn_t
+ cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */
+ aom_codec_get_global_headers_fn_t
+ get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
+ aom_codec_get_preview_frame_fn_t
+ get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
+ aom_codec_enc_mr_get_mem_loc_fn_t
+ mr_get_mem_loc; /**< \copydoc ::aom_codec_enc_mr_get_mem_loc_fn_t */
+ } enc;
+};
+
+/*!\brief Callback function pointer / user data pair storage */
+typedef struct aom_codec_priv_cb_pair {
+ union {
+ aom_codec_put_frame_cb_fn_t put_frame;
+ aom_codec_put_slice_cb_fn_t put_slice;
+ } u;
+ void *user_priv;
+} aom_codec_priv_cb_pair_t;
+
+/*!\brief Instance private storage
+ *
+ * This structure is allocated by the algorithm's init function. It can be
+ * extended in one of two ways. First, a second, algorithm specific structure
+ * can be allocated and the priv member pointed to it. Alternatively, this
+ * structure can be made the first member of the algorithm specific structure,
+ * and the pointer cast to the proper type.
+ */
+struct aom_codec_priv {
+ const char *err_detail;
+ aom_codec_flags_t init_flags;
+ struct {
+ aom_codec_priv_cb_pair_t put_frame_cb;
+ aom_codec_priv_cb_pair_t put_slice_cb;
+ } dec;
+ struct {
+ aom_fixed_buf_t cx_data_dst_buf;
+ unsigned int cx_data_pad_before;
+ unsigned int cx_data_pad_after;
+ aom_codec_cx_pkt_t cx_data_pkt;
+ unsigned int total_encoders;
+ } enc;
+};
+
+/*
+ * Multi-resolution encoding internal configuration
+ */
+struct aom_codec_priv_enc_mr_cfg {
+ unsigned int mr_total_resolutions;
+ unsigned int mr_encoder_id;
+ struct aom_rational mr_down_sampling_factor;
+ void *mr_low_res_mode_info;
+};
+
+#undef AOM_CTRL_USE_TYPE
+#define AOM_CTRL_USE_TYPE(id, typ) \
+ static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
+
+#undef AOM_CTRL_USE_TYPE_DEPRECATED
+#define AOM_CTRL_USE_TYPE_DEPRECATED(id, typ) \
+ static AOM_INLINE typ id##__value(va_list args) { return va_arg(args, typ); }
+
+#define CAST(id, arg) id##__value(arg)
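An illustrative control handler showing how the two macros above fit together: AOM_CTRL_USE_TYPE generates a typed id##__value() reader, and CAST pulls the argument off the va_list forwarded by aom_codec_control_(). This assumes the control's AOM_CTRL_USE_TYPE declaration (here AOME_SET_CQ_LEVEL from aomcx.h) was expanded with this internal definition in effect, as codec interface files do; the handler body and range check are placeholders.

    static aom_codec_err_t ctrl_set_cq_level(aom_codec_alg_priv_t *ctx, va_list args) {
      const unsigned int cq_level = CAST(AOME_SET_CQ_LEVEL, args);
      (void)ctx;
      /* Illustrative validation only; a real handler updates encoder state. */
      return cq_level <= 63 ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
    }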
+
+/* CODEC_INTERFACE convenience macro
+ *
+ * By convention, each codec interface is a struct with extern linkage, where
+ * the symbol is suffixed with _algo. A getter function is also defined to
+ * return a pointer to the struct, since in some cases it's easier to work
+ * with text symbols than data symbols (see issue #169). This function has
+ * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
+ * macro is provided to define this getter function automatically.
+ */
+#define CODEC_INTERFACE(id) \
+ aom_codec_iface_t *id(void) { return &id##_algo; } \
+ aom_codec_iface_t id##_algo
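A sketch of the pattern the comment above describes: a prior extern declaration of the _algo symbol (normally in the codec's public header) followed by the CODEC_INTERFACE definition. The codec name, strings and single capability are placeholders; the members omitted from the initializer are zero-initialized and a usable interface must fill them in.

    extern aom_codec_iface_t my_codec_dx_algo; /* normally in the codec's public header */

    CODEC_INTERFACE(my_codec_dx) = {
      "Example Decoder v0.0",          /* name */
      AOM_CODEC_INTERNAL_ABI_VERSION,  /* abi_version */
      AOM_CODEC_CAP_DECODER,           /* caps */
      /* init, destroy, ctrl_maps, dec and enc left zero-initialized here */
    };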
+
+/* Internal Utility Functions
+ *
+ * The following functions are intended to be used inside algorithms as
+ * utilities for manipulating aom_codec_* data structures.
+ */
+struct aom_codec_pkt_list {
+ unsigned int cnt;
+ unsigned int max;
+ struct aom_codec_cx_pkt pkts[1];
+};
+
+#define aom_codec_pkt_list_decl(n) \
+ union { \
+ struct aom_codec_pkt_list head; \
+ struct { \
+ struct aom_codec_pkt_list head; \
+ struct aom_codec_cx_pkt pkts[n]; \
+ } alloc; \
+ }
+
+#define aom_codec_pkt_list_init(m) \
+ (m)->alloc.head.cnt = 0, \
+ (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0])
+
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *,
+ const struct aom_codec_cx_pkt *);
+
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+ struct aom_codec_pkt_list *list, aom_codec_iter_t *iter);
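A sketch of how an encoder implementation might use these helpers: embed a fixed-capacity packet list in its private state, initialize it, and drain it with the iterator accessor. The capacity of 64 and the surrounding struct are arbitrary illustrations; the calls themselves are the macros and prototypes declared just above.

    /* Within a codec implementation file that includes this header. */
    struct example_enc_state {
      aom_codec_pkt_list_decl(64) pkt_list;
    };

    static void example_init(struct example_enc_state *s) {
      aom_codec_pkt_list_init(&s->pkt_list);
    }

    static void example_drain(struct example_enc_state *s) {
      aom_codec_iter_t iter = NULL;
      const aom_codec_cx_pkt_t *pkt;
      while ((pkt = aom_codec_pkt_list_get(&s->pkt_list.head, &iter)) != NULL) {
        (void)pkt; /* forward the packet to the caller here */
      }
    }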
+
+#include <stdio.h>
+#include <setjmp.h>
+
+struct aom_internal_error_info {
+ aom_codec_err_t error_code;
+ int has_detail;
+ char detail[80];
+ int setjmp; // Boolean: whether 'jmp' is valid.
+ jmp_buf jmp;
+};
+
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
+void aom_internal_error(struct aom_internal_error_info *info,
+ aom_codec_err_t error, const char *fmt,
+ ...) CLANG_ANALYZER_NORETURN;
+
+void aom_merge_corrupted_flag(int *corrupted, int value);
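A sketch of the setjmp/longjmp convention implied by the 'setjmp' and 'jmp' members above and by the aom_internal_error() implementation in aom_codec.c (which longjmps when the flag is armed). The wrapping function and the commented failure path are illustrative, not taken from this patch.

    static aom_codec_err_t example_guarded_call(struct aom_internal_error_info *error) {
      if (setjmp(error->jmp)) {
        /* Re-entered via longjmp from aom_internal_error(). */
        error->setjmp = 0;
        return error->error_code;
      }
      error->setjmp = 1;
      /* ... work that may call
       *     aom_internal_error(error, AOM_CODEC_MEM_ERROR, "allocation failed");
       * on failure ... */
      error->setjmp = 0;
      return AOM_CODEC_OK;
    }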
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
diff --git a/third_party/aom/aom/src/aom_codec.c b/third_party/aom/aom/src/aom_codec.c
new file mode 100644
index 000000000..733bffb25
--- /dev/null
+++ b/third_party/aom/aom/src/aom_codec.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <stdarg.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+int aom_codec_version(void) { return VERSION_PACKED; }
+
+const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; }
+
+const char *aom_codec_version_extra_str(void) { return VERSION_EXTRA; }
+
+const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
+ return iface ? iface->name : "<invalid interface>";
+}
+
+const char *aom_codec_err_to_string(aom_codec_err_t err) {
+ switch (err) {
+ case AOM_CODEC_OK: return "Success";
+ case AOM_CODEC_ERROR: return "Unspecified internal error";
+ case AOM_CODEC_MEM_ERROR: return "Memory allocation error";
+ case AOM_CODEC_ABI_MISMATCH: return "ABI version mismatch";
+ case AOM_CODEC_INCAPABLE:
+ return "Codec does not implement requested capability";
+ case AOM_CODEC_UNSUP_BITSTREAM:
+ return "Bitstream not supported by this decoder";
+ case AOM_CODEC_UNSUP_FEATURE:
+ return "Bitstream required feature not supported by this decoder";
+ case AOM_CODEC_CORRUPT_FRAME: return "Corrupt frame detected";
+ case AOM_CODEC_INVALID_PARAM: return "Invalid parameter";
+ case AOM_CODEC_LIST_END: return "End of iterated list";
+ }
+
+ return "Unrecognized error code";
+}
+
+const char *aom_codec_error(aom_codec_ctx_t *ctx) {
+ return (ctx) ? aom_codec_err_to_string(ctx->err)
+ : aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
+}
+
+const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
+ if (ctx && ctx->err)
+ return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;
+
+ return NULL;
+}
+
+aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
+ aom_codec_err_t res;
+
+ if (!ctx)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (!ctx->iface || !ctx->priv)
+ res = AOM_CODEC_ERROR;
+ else {
+ ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
+
+ ctx->iface = NULL;
+ ctx->name = NULL;
+ ctx->priv = NULL;
+ res = AOM_CODEC_OK;
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
+ return (iface) ? iface->caps : 0;
+}
+
+aom_codec_err_t aom_codec_control_(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
+ aom_codec_err_t res;
+
+ if (!ctx || !ctrl_id)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
+ res = AOM_CODEC_ERROR;
+ else {
+ aom_codec_ctrl_fn_map_t *entry;
+
+ res = AOM_CODEC_ERROR;
+
+ for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
+ if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
+ va_list ap;
+
+ va_start(ap, ctrl_id);
+ res = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
+ va_end(ap);
+ break;
+ }
+ }
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+void aom_internal_error(struct aom_internal_error_info *info,
+ aom_codec_err_t error, const char *fmt, ...) {
+ va_list ap;
+
+ info->error_code = error;
+ info->has_detail = 0;
+
+ if (fmt) {
+ size_t sz = sizeof(info->detail);
+
+ info->has_detail = 1;
+ va_start(ap, fmt);
+ vsnprintf(info->detail, sz - 1, fmt, ap);
+ va_end(ap);
+ info->detail[sz - 1] = '\0';
+ }
+
+ if (info->setjmp) longjmp(info->jmp, info->error_code);
+}
+
+void aom_merge_corrupted_flag(int *corrupted, int value) {
+ *corrupted |= value;
+}
+
+const char *aom_obu_type_to_string(OBU_TYPE type) {
+ switch (type) {
+ case OBU_SEQUENCE_HEADER: return "OBU_SEQUENCE_HEADER";
+ case OBU_TEMPORAL_DELIMITER: return "OBU_TEMPORAL_DELIMITER";
+ case OBU_FRAME_HEADER: return "OBU_FRAME_HEADER";
+ case OBU_REDUNDANT_FRAME_HEADER: return "OBU_REDUNDANT_FRAME_HEADER";
+ case OBU_FRAME: return "OBU_FRAME";
+ case OBU_TILE_GROUP: return "OBU_TILE_GROUP";
+ case OBU_METADATA: return "OBU_METADATA";
+ case OBU_TILE_LIST: return "OBU_TILE_LIST";
+ case OBU_PADDING: return "OBU_PADDING";
+ default: break;
+ }
+ return "<Invalid OBU Type>";
+}
diff --git a/third_party/aom/aom/src/aom_decoder.c b/third_party/aom/aom/src/aom_decoder.c
new file mode 100644
index 000000000..8c9111faf
--- /dev/null
+++ b/third_party/aom/aom/src/aom_decoder.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <string.h>
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
+ return (aom_codec_alg_priv_t *)ctx->priv;
+}
+
+aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
+ aom_codec_iface_t *iface,
+ const aom_codec_dec_cfg_t *cfg,
+ aom_codec_flags_t flags, int ver) {
+ aom_codec_err_t res;
+
+ if (ver != AOM_DECODER_ABI_VERSION)
+ res = AOM_CODEC_ABI_MISMATCH;
+ else if (!ctx || !iface)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
+ res = AOM_CODEC_ABI_MISMATCH;
+ else if ((flags & AOM_CODEC_USE_POSTPROC) &&
+ !(iface->caps & AOM_CODEC_CAP_POSTPROC))
+ res = AOM_CODEC_INCAPABLE;
+ else if ((flags & AOM_CODEC_USE_INPUT_FRAGMENTS) &&
+ !(iface->caps & AOM_CODEC_CAP_INPUT_FRAGMENTS))
+ res = AOM_CODEC_INCAPABLE;
+ else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
+ res = AOM_CODEC_INCAPABLE;
+ else {
+ memset(ctx, 0, sizeof(*ctx));
+ ctx->iface = iface;
+ ctx->name = iface->name;
+ ctx->priv = NULL;
+ ctx->init_flags = flags;
+ ctx->config.dec = cfg;
+
+ res = ctx->iface->init(ctx, NULL);
+ if (res) {
+ ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+ aom_codec_destroy(ctx);
+ }
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
+ const uint8_t *data, size_t data_sz,
+ aom_codec_stream_info_t *si) {
+ aom_codec_err_t res;
+
+ if (!iface || !data || !data_sz || !si) {
+ res = AOM_CODEC_INVALID_PARAM;
+ } else {
+ /* Set default/unknown values */
+ si->w = 0;
+ si->h = 0;
+
+ res = iface->dec.peek_si(data, data_sz, si);
+ }
+
+ return res;
+}
+
+aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
+ aom_codec_stream_info_t *si) {
+ aom_codec_err_t res;
+
+ if (!ctx || !si) {
+ res = AOM_CODEC_INVALID_PARAM;
+ } else if (!ctx->iface || !ctx->priv) {
+ res = AOM_CODEC_ERROR;
+ } else {
+ /* Set default/unknown values */
+ si->w = 0;
+ si->h = 0;
+
+ res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
+ size_t data_sz, void *user_priv) {
+ aom_codec_err_t res;
+
+ if (!ctx)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (!ctx->iface || !ctx->priv)
+ res = AOM_CODEC_ERROR;
+ else {
+ res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
+ aom_image_t *img;
+
+ if (!ctx || !iter || !ctx->iface || !ctx->priv)
+ img = NULL;
+ else
+ img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
+
+ return img;
+}
+
+aom_codec_err_t aom_codec_register_put_frame_cb(aom_codec_ctx_t *ctx,
+ aom_codec_put_frame_cb_fn_t cb,
+ void *user_priv) {
+ aom_codec_err_t res;
+
+ if (!ctx || !cb)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (!ctx->iface || !ctx->priv ||
+ !(ctx->iface->caps & AOM_CODEC_CAP_PUT_FRAME))
+ res = AOM_CODEC_ERROR;
+ else {
+ ctx->priv->dec.put_frame_cb.u.put_frame = cb;
+ ctx->priv->dec.put_frame_cb.user_priv = user_priv;
+ res = AOM_CODEC_OK;
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_register_put_slice_cb(aom_codec_ctx_t *ctx,
+ aom_codec_put_slice_cb_fn_t cb,
+ void *user_priv) {
+ aom_codec_err_t res;
+
+ if (!ctx || !cb)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (!ctx->iface || !ctx->priv ||
+ !(ctx->iface->caps & AOM_CODEC_CAP_PUT_SLICE))
+ res = AOM_CODEC_ERROR;
+ else {
+ ctx->priv->dec.put_slice_cb.u.put_slice = cb;
+ ctx->priv->dec.put_slice_cb.user_priv = user_priv;
+ res = AOM_CODEC_OK;
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_set_frame_buffer_functions(
+ aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+ aom_codec_err_t res;
+
+ if (!ctx || !cb_get || !cb_release) {
+ res = AOM_CODEC_INVALID_PARAM;
+ } else if (!ctx->iface || !ctx->priv ||
+ !(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+ res = AOM_CODEC_ERROR;
+ } else {
+ res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
+ cb_priv);
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
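
These decoder entry points are normally driven as an init / decode / get-frame
loop. A minimal sketch, assuming the AV1 decoder interface aom_codec_av1_dx()
from aom/aomdx.h, the aom_codec_dec_init() convenience macro, and a buffer
that already holds one complete temporal unit:

    #include <stdio.h>
    #include "aom/aom_decoder.h"
    #include "aom/aomdx.h" /* aom_codec_av1_dx() */

    static int decode_unit(const uint8_t *data, size_t data_sz) {
      aom_codec_ctx_t ctx;
      if (aom_codec_dec_init(&ctx, aom_codec_av1_dx(), NULL, 0) != AOM_CODEC_OK)
        return -1;
      if (aom_codec_decode(&ctx, data, data_sz, NULL) != AOM_CODEC_OK) {
        fprintf(stderr, "decode failed: %s\n", aom_codec_error(&ctx));
        aom_codec_destroy(&ctx);
        return -1;
      }
      /* Drain all frames produced by this temporal unit. */
      aom_codec_iter_t iter = NULL;
      const aom_image_t *img;
      while ((img = aom_codec_get_frame(&ctx, &iter)) != NULL)
        printf("decoded frame %ux%u\n", img->d_w, img->d_h);
      return aom_codec_destroy(&ctx) == AOM_CODEC_OK ? 0 : -1;
    }
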
diff --git a/third_party/aom/aom/src/aom_encoder.c b/third_party/aom/aom/src/aom_encoder.c
new file mode 100644
index 000000000..523f40bbe
--- /dev/null
+++ b/third_party/aom/aom/src/aom_encoder.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap encoder algorithms.
+ *
+ */
+#include "config/aom_config.h"
+
+#if HAVE_FEXCEPT
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <fenv.h>
+#endif
+
+#include <limits.h>
+#include <string.h>
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
+ return (aom_codec_alg_priv_t *)ctx->priv;
+}
+
+aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
+ aom_codec_iface_t *iface,
+ const aom_codec_enc_cfg_t *cfg,
+ aom_codec_flags_t flags, int ver) {
+ aom_codec_err_t res;
+
+ if (ver != AOM_ENCODER_ABI_VERSION)
+ res = AOM_CODEC_ABI_MISMATCH;
+ else if (!ctx || !iface || !cfg)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
+ res = AOM_CODEC_ABI_MISMATCH;
+ else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
+ res = AOM_CODEC_INCAPABLE;
+ else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
+ res = AOM_CODEC_INCAPABLE;
+ else {
+ ctx->iface = iface;
+ ctx->name = iface->name;
+ ctx->priv = NULL;
+ ctx->init_flags = flags;
+ ctx->config.enc = cfg;
+ res = ctx->iface->init(ctx, NULL);
+
+ if (res) {
+ ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+ aom_codec_destroy(ctx);
+ }
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_enc_init_multi_ver(
+ aom_codec_ctx_t *ctx, aom_codec_iface_t *iface, aom_codec_enc_cfg_t *cfg,
+ int num_enc, aom_codec_flags_t flags, aom_rational_t *dsf, int ver) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+
+ if (ver != AOM_ENCODER_ABI_VERSION)
+ res = AOM_CODEC_ABI_MISMATCH;
+ else if (!ctx || !iface || !cfg || (num_enc > 16 || num_enc < 1))
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
+ res = AOM_CODEC_ABI_MISMATCH;
+ else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
+ res = AOM_CODEC_INCAPABLE;
+ else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
+ res = AOM_CODEC_INCAPABLE;
+ else {
+ int i;
+ void *mem_loc = NULL;
+
+ if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) {
+ for (i = 0; i < num_enc; i++) {
+ aom_codec_priv_enc_mr_cfg_t mr_cfg;
+
+ /* Validate down-sampling factor. */
+ if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 ||
+ dsf->den > dsf->num) {
+ res = AOM_CODEC_INVALID_PARAM;
+ break;
+ }
+
+ mr_cfg.mr_low_res_mode_info = mem_loc;
+ mr_cfg.mr_total_resolutions = num_enc;
+ mr_cfg.mr_encoder_id = num_enc - 1 - i;
+ mr_cfg.mr_down_sampling_factor.num = dsf->num;
+ mr_cfg.mr_down_sampling_factor.den = dsf->den;
+
+        /* Force Key-frame synchronization. Namely, encoders at higher
+         * resolutions always use the same frame_type chosen by the
+ * lowest-resolution encoder.
+ */
+ if (mr_cfg.mr_encoder_id) cfg->kf_mode = AOM_KF_DISABLED;
+
+ ctx->iface = iface;
+ ctx->name = iface->name;
+ ctx->priv = NULL;
+ ctx->init_flags = flags;
+ ctx->config.enc = cfg;
+ res = ctx->iface->init(ctx, &mr_cfg);
+
+ if (res) {
+ const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+ /* Destroy current ctx */
+ ctx->err_detail = error_detail;
+ aom_codec_destroy(ctx);
+
+ /* Destroy already allocated high-level ctx */
+ while (i) {
+ ctx--;
+ ctx->err_detail = error_detail;
+ aom_codec_destroy(ctx);
+ i--;
+ }
+ }
+
+ if (res) break;
+
+ ctx++;
+ cfg++;
+ dsf++;
+ }
+ ctx--;
+ }
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface,
+ aom_codec_enc_cfg_t *cfg,
+ unsigned int usage) {
+ aom_codec_err_t res;
+ aom_codec_enc_cfg_map_t *map;
+ int i;
+
+ if (!iface || !cfg || usage > INT_MAX)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (!(iface->caps & AOM_CODEC_CAP_ENCODER))
+ res = AOM_CODEC_INCAPABLE;
+ else {
+ res = AOM_CODEC_INVALID_PARAM;
+
+ for (i = 0; i < iface->enc.cfg_map_count; ++i) {
+ map = iface->enc.cfg_maps + i;
+ if (map->usage == (int)usage) {
+ *cfg = map->cfg;
+ cfg->g_usage = usage;
+ res = AOM_CODEC_OK;
+ break;
+ }
+ }
+ }
+
+ /* default values */
+ if (cfg) {
+ cfg->cfg.ext_partition = 1;
+ }
+
+ return res;
+}
+
+#if ARCH_X86 || ARCH_X86_64
+/* On X86, disable the x87 unit's internal 80 bit precision for better
+ * consistency with the SSE unit's 64 bit precision.
+ */
+#include "aom_ports/x86.h"
+#define FLOATING_POINT_SET_PRECISION \
+ unsigned short x87_orig_mode = x87_set_double_precision();
+#define FLOATING_POINT_RESTORE_PRECISION x87_set_control_word(x87_orig_mode);
+#else
+#define FLOATING_POINT_SET_PRECISION
+#define FLOATING_POINT_RESTORE_PRECISION
+#endif // ARCH_X86 || ARCH_X86_64
+
+#if HAVE_FEXCEPT && CONFIG_DEBUG
+#define FLOATING_POINT_SET_EXCEPTIONS \
+ const int float_excepts = feenableexcept(FE_DIVBYZERO);
+#define FLOATING_POINT_RESTORE_EXCEPTIONS feenableexcept(float_excepts);
+#else
+#define FLOATING_POINT_SET_EXCEPTIONS
+#define FLOATING_POINT_RESTORE_EXCEPTIONS
+#endif // HAVE_FEXCEPT && CONFIG_DEBUG
+
+/* clang-format off */
+#define FLOATING_POINT_INIT \
+ do { \
+ FLOATING_POINT_SET_PRECISION \
+ FLOATING_POINT_SET_EXCEPTIONS
+
+#define FLOATING_POINT_RESTORE \
+ FLOATING_POINT_RESTORE_EXCEPTIONS \
+ FLOATING_POINT_RESTORE_PRECISION \
+ } while (0);
+/* clang-format on */
+
+aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img,
+ aom_codec_pts_t pts, unsigned long duration,
+ aom_enc_frame_flags_t flags) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+
+ if (!ctx || (img && !duration))
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (!ctx->iface || !ctx->priv)
+ res = AOM_CODEC_ERROR;
+ else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
+ res = AOM_CODEC_INCAPABLE;
+ else {
+ unsigned int num_enc = ctx->priv->enc.total_encoders;
+
+ /* Execute in a normalized floating point environment, if the platform
+ * requires it.
+ */
+ FLOATING_POINT_INIT
+
+ if (num_enc == 1)
+ res =
+ ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags);
+ else {
+ /* Multi-resolution encoding:
+       * Encode the resolution levels in reverse order. For example,
+ * if mr_total_resolutions = 3, first encode level 2,
+ * then encode level 1, and finally encode level 0.
+ */
+ int i;
+
+ ctx += num_enc - 1;
+ if (img) img += num_enc - 1;
+
+ for (i = num_enc - 1; i >= 0; i--) {
+ if ((res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration,
+ flags)))
+ break;
+
+ ctx--;
+ if (img) img--;
+ }
+ ctx++;
+ }
+
+ FLOATING_POINT_RESTORE
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
+
+const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx,
+ aom_codec_iter_t *iter) {
+ const aom_codec_cx_pkt_t *pkt = NULL;
+
+ if (ctx) {
+ if (!iter)
+ ctx->err = AOM_CODEC_INVALID_PARAM;
+ else if (!ctx->iface || !ctx->priv)
+ ctx->err = AOM_CODEC_ERROR;
+ else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
+ ctx->err = AOM_CODEC_INCAPABLE;
+ else
+ pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter);
+ }
+
+ if (pkt && pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ // If the application has specified a destination area for the
+ // compressed data, and the codec has not placed the data there,
+ // and it fits, copy it.
+ aom_codec_priv_t *const priv = ctx->priv;
+ char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf;
+
+ if (dst_buf && pkt->data.raw.buf != dst_buf &&
+ pkt->data.raw.sz + priv->enc.cx_data_pad_before +
+ priv->enc.cx_data_pad_after <=
+ priv->enc.cx_data_dst_buf.sz) {
+ aom_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt;
+
+ memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf,
+ pkt->data.raw.sz);
+ *modified_pkt = *pkt;
+ modified_pkt->data.raw.buf = dst_buf;
+ modified_pkt->data.raw.sz +=
+ priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after;
+ pkt = modified_pkt;
+ }
+
+ if (dst_buf == pkt->data.raw.buf) {
+ priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz;
+ priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz;
+ }
+ }
+
+ return pkt;
+}
+
+aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx,
+ const aom_fixed_buf_t *buf,
+ unsigned int pad_before,
+ unsigned int pad_after) {
+ if (!ctx || !ctx->priv) return AOM_CODEC_INVALID_PARAM;
+
+ if (buf) {
+ ctx->priv->enc.cx_data_dst_buf = *buf;
+ ctx->priv->enc.cx_data_pad_before = pad_before;
+ ctx->priv->enc.cx_data_pad_after = pad_after;
+ } else {
+ ctx->priv->enc.cx_data_dst_buf.buf = NULL;
+ ctx->priv->enc.cx_data_dst_buf.sz = 0;
+ ctx->priv->enc.cx_data_pad_before = 0;
+ ctx->priv->enc.cx_data_pad_after = 0;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) {
+ aom_image_t *img = NULL;
+
+ if (ctx) {
+ if (!ctx->iface || !ctx->priv)
+ ctx->err = AOM_CODEC_ERROR;
+ else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
+ ctx->err = AOM_CODEC_INCAPABLE;
+ else if (!ctx->iface->enc.get_preview)
+ ctx->err = AOM_CODEC_INCAPABLE;
+ else
+ img = ctx->iface->enc.get_preview(get_alg_priv(ctx));
+ }
+
+ return img;
+}
+
+aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) {
+ aom_fixed_buf_t *buf = NULL;
+
+ if (ctx) {
+ if (!ctx->iface || !ctx->priv)
+ ctx->err = AOM_CODEC_ERROR;
+ else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
+ ctx->err = AOM_CODEC_INCAPABLE;
+ else if (!ctx->iface->enc.get_glob_hdrs)
+ ctx->err = AOM_CODEC_INCAPABLE;
+ else
+ buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx));
+ }
+
+ return buf;
+}
+
+aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
+ const aom_codec_enc_cfg_t *cfg) {
+ aom_codec_err_t res;
+
+ if (!ctx || !ctx->iface || !ctx->priv || !cfg)
+ res = AOM_CODEC_INVALID_PARAM;
+ else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER))
+ res = AOM_CODEC_INCAPABLE;
+ else
+ res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg);
+
+ return SAVE_STATUS(ctx, res);
+}
+
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list,
+ const struct aom_codec_cx_pkt *pkt) {
+ if (list->cnt < list->max) {
+ list->pkts[list->cnt++] = *pkt;
+ return 0;
+ }
+
+ return 1;
+}
+
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+ struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) {
+ const aom_codec_cx_pkt_t *pkt;
+
+ if (!(*iter)) {
+ *iter = list->pkts;
+ }
+
+ pkt = (const aom_codec_cx_pkt_t *)*iter;
+
+ if ((size_t)(pkt - list->pkts) < list->cnt)
+ *iter = pkt + 1;
+ else
+ pkt = NULL;
+
+ return pkt;
+}
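
The encoder side follows the same shape: take a default configuration,
initialize, feed images, then drain compressed packets. A hedged sketch,
assuming the AV1 encoder interface aom_codec_av1_cx() from aom/aomcx.h and
usage value 0 for aom_codec_enc_config_default(); a real caller would also
flush by repeating aom_codec_encode() with a NULL image until no packets
remain:

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h" /* aom_codec_av1_cx() */

    static int encode_one_frame(const aom_image_t *img) {
      aom_codec_iface_t *iface = aom_codec_av1_cx();
      aom_codec_enc_cfg_t cfg;
      aom_codec_ctx_t ctx;

      if (aom_codec_enc_config_default(iface, &cfg, 0) != AOM_CODEC_OK)
        return -1;
      cfg.g_w = img->d_w;
      cfg.g_h = img->d_h;
      if (aom_codec_enc_init(&ctx, iface, &cfg, 0) != AOM_CODEC_OK) return -1;

      /* pts = 0; duration must be non-zero when img is non-NULL. */
      if (aom_codec_encode(&ctx, img, 0, 1, 0) == AOM_CODEC_OK) {
        aom_codec_iter_t iter = NULL;
        const aom_codec_cx_pkt_t *pkt;
        while ((pkt = aom_codec_get_cx_data(&ctx, &iter)) != NULL) {
          if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
            /* pkt->data.frame.buf / pkt->data.frame.sz hold the bitstream. */
          }
        }
      }
      return aom_codec_destroy(&ctx) == AOM_CODEC_OK ? 0 : -1;
    }
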
diff --git a/third_party/aom/aom/src/aom_image.c b/third_party/aom/aom/src/aom_image.c
new file mode 100644
index 000000000..437f0241e
--- /dev/null
+++ b/third_party/aom/aom/src/aom_image.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+static INLINE unsigned int align_image_dimension(unsigned int d,
+ unsigned int subsampling,
+ unsigned int size_align) {
+ unsigned int align;
+
+ align = (1 << subsampling) - 1;
+ align = (size_align - 1 > align) ? (size_align - 1) : align;
+ return ((d + align) & ~align);
+}
+
+static aom_image_t *img_alloc_helper(
+ aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h,
+ unsigned int buf_align, unsigned int stride_align, unsigned int size_align,
+ unsigned char *img_data, unsigned int border) {
+ unsigned int h, w, s, xcs, ycs, bps;
+ unsigned int stride_in_bytes;
+
+ /* Treat align==0 like align==1 */
+ if (!buf_align) buf_align = 1;
+
+ /* Validate alignment (must be power of 2) */
+ if (buf_align & (buf_align - 1)) goto fail;
+
+ /* Treat align==0 like align==1 */
+ if (!stride_align) stride_align = 1;
+
+ /* Validate alignment (must be power of 2) */
+ if (stride_align & (stride_align - 1)) goto fail;
+
+ /* Treat align==0 like align==1 */
+ if (!size_align) size_align = 1;
+
+ /* Validate alignment (must be power of 2) */
+ if (size_align & (size_align - 1)) goto fail;
+
+ /* Get sample size for this format */
+ switch (fmt) {
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_YV12:
+ case AOM_IMG_FMT_AOMI420:
+ case AOM_IMG_FMT_AOMYV12: bps = 12; break;
+ case AOM_IMG_FMT_I422:
+ case AOM_IMG_FMT_I444: bps = 24; break;
+ case AOM_IMG_FMT_I42016: bps = 24; break;
+ case AOM_IMG_FMT_I42216:
+ case AOM_IMG_FMT_I44416: bps = 48; break;
+ default: bps = 16; break;
+ }
+
+ /* Get chroma shift values for this format */
+ switch (fmt) {
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_YV12:
+ case AOM_IMG_FMT_AOMI420:
+ case AOM_IMG_FMT_AOMYV12:
+ case AOM_IMG_FMT_I422:
+ case AOM_IMG_FMT_I42016:
+ case AOM_IMG_FMT_I42216: xcs = 1; break;
+ default: xcs = 0; break;
+ }
+
+ switch (fmt) {
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_YV12:
+ case AOM_IMG_FMT_AOMI420:
+ case AOM_IMG_FMT_AOMYV12:
+ case AOM_IMG_FMT_I42016: ycs = 1; break;
+ default: ycs = 0; break;
+ }
+
+ /* Calculate storage sizes given the chroma subsampling */
+ w = align_image_dimension(d_w, xcs, size_align);
+ h = align_image_dimension(d_h, ycs, size_align);
+
+ s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / 8;
+ s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1);
+ stride_in_bytes = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? s * 2 : s;
+
+ /* Allocate the new image */
+ if (!img) {
+ img = (aom_image_t *)calloc(1, sizeof(aom_image_t));
+
+ if (!img) goto fail;
+
+ img->self_allocd = 1;
+ } else {
+ memset(img, 0, sizeof(aom_image_t));
+ }
+
+ img->img_data = img_data;
+
+ if (!img_data) {
+ const uint64_t alloc_size =
+ (fmt & AOM_IMG_FMT_PLANAR)
+ ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / 8
+ : (uint64_t)(h + 2 * border) * stride_in_bytes;
+
+ if (alloc_size != (size_t)alloc_size) goto fail;
+
+ img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size);
+ img->img_data_owner = 1;
+ }
+
+ if (!img->img_data) goto fail;
+
+ img->fmt = fmt;
+ img->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+ // aligned width and aligned height
+ img->w = w;
+ img->h = h;
+ img->x_chroma_shift = xcs;
+ img->y_chroma_shift = ycs;
+ img->bps = bps;
+
+ /* Calculate strides */
+ img->stride[AOM_PLANE_Y] = img->stride[AOM_PLANE_ALPHA] = stride_in_bytes;
+ img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs;
+
+ /* Default viewport to entire image */
+ if (!aom_img_set_rect(img, 0, 0, d_w, d_h, border)) return img;
+
+fail:
+ aom_img_free(img);
+ return NULL;
+}
+
+aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align) {
+ return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, NULL, 0);
+}
+
+aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w,
+ unsigned int d_h, unsigned int stride_align,
+ unsigned char *img_data) {
+ /* By setting buf_align = 1, we don't change buffer alignment in this
+ * function. */
+ return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, img_data, 0);
+}
+
+aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int align,
+ unsigned int size_align,
+ unsigned int border) {
+ return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, NULL,
+ border);
+}
+
+int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
+ unsigned int w, unsigned int h, unsigned int border) {
+ unsigned char *data;
+
+ if (x + w <= img->w && y + h <= img->h) {
+ img->d_w = w;
+ img->d_h = h;
+
+ x += border;
+ y += border;
+
+ /* Calculate plane pointers */
+ if (!(img->fmt & AOM_IMG_FMT_PLANAR)) {
+ img->planes[AOM_PLANE_PACKED] =
+ img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED];
+ } else {
+ const int bytes_per_sample =
+ (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+ data = img->img_data;
+
+ if (img->fmt & AOM_IMG_FMT_HAS_ALPHA) {
+ img->planes[AOM_PLANE_ALPHA] =
+ data + x * bytes_per_sample + y * img->stride[AOM_PLANE_ALPHA];
+ data += (img->h + 2 * border) * img->stride[AOM_PLANE_ALPHA];
+ }
+
+ img->planes[AOM_PLANE_Y] =
+ data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
+ data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y];
+
+ unsigned int uv_border_h = border >> img->y_chroma_shift;
+ unsigned int uv_x = x >> img->x_chroma_shift;
+ unsigned int uv_y = y >> img->y_chroma_shift;
+ if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) {
+ img->planes[AOM_PLANE_U] =
+ data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
+ data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
+ img->stride[AOM_PLANE_U];
+ img->planes[AOM_PLANE_V] =
+ data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V];
+ } else {
+ img->planes[AOM_PLANE_V] =
+ data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V];
+ data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) *
+ img->stride[AOM_PLANE_V];
+ img->planes[AOM_PLANE_U] =
+ data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U];
+ }
+ }
+ return 0;
+ }
+ return -1;
+}
+
+void aom_img_flip(aom_image_t *img) {
+  /* Note: In the pointer adjustment calculation, we want the
+ * rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99
+ * standard indicates that if the adjustment parameter is unsigned, the
+ * stride parameter will be promoted to unsigned, causing errors when
+ * the lhs is a larger type than the rhs.
+ */
+ img->planes[AOM_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[AOM_PLANE_Y];
+ img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y];
+
+ img->planes[AOM_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) *
+ img->stride[AOM_PLANE_U];
+ img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U];
+
+ img->planes[AOM_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) *
+ img->stride[AOM_PLANE_V];
+ img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V];
+
+ img->planes[AOM_PLANE_ALPHA] +=
+ (signed)(img->d_h - 1) * img->stride[AOM_PLANE_ALPHA];
+ img->stride[AOM_PLANE_ALPHA] = -img->stride[AOM_PLANE_ALPHA];
+}
+
+void aom_img_free(aom_image_t *img) {
+ if (img) {
+ if (img->img_data && img->img_data_owner) aom_free(img->img_data);
+
+ if (img->self_allocd) free(img);
+ }
+}
+
+int aom_img_plane_width(const aom_image_t *img, int plane) {
+ if (plane > 0 && img->x_chroma_shift > 0)
+ return (img->d_w + 1) >> img->x_chroma_shift;
+ else
+ return img->d_w;
+}
+
+int aom_img_plane_height(const aom_image_t *img, int plane) {
+ if (plane > 0 && img->y_chroma_shift > 0)
+ return (img->d_h + 1) >> img->y_chroma_shift;
+ else
+ return img->d_h;
+}
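
aom_img_alloc() together with the plane helpers above is enough to reason
about buffer geometry. A small sketch with arbitrarily chosen dimensions and
alignment:

    #include <stdio.h>
    #include "aom/aom_image.h"

    static void show_plane_sizes(void) {
      /* 640x360 I420, 32-byte buffer and stride alignment (example values). */
      aom_image_t *img = aom_img_alloc(NULL, AOM_IMG_FMT_I420, 640, 360, 32);
      if (!img) return;
      printf("Y: %dx%d  U/V: %dx%d\n",
             aom_img_plane_width(img, AOM_PLANE_Y),
             aom_img_plane_height(img, AOM_PLANE_Y),
             aom_img_plane_width(img, AOM_PLANE_U),   /* (640 + 1) >> 1 = 320 */
             aom_img_plane_height(img, AOM_PLANE_U)); /* (360 + 1) >> 1 = 180 */
      /* Pixel data lives at img->planes[p] with row pitch img->stride[p]. */
      aom_img_free(img);
    }
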
diff --git a/third_party/aom/aom/src/aom_integer.c b/third_party/aom/aom/src/aom_integer.c
new file mode 100644
index 000000000..7edfd0de8
--- /dev/null
+++ b/third_party/aom/aom/src/aom_integer.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+static const size_t kMaximumLeb128Size = 8;
+static const uint8_t kLeb128ByteMask = 0x7f; // Binary: 01111111
+
+// Disallow values larger than 32-bits to ensure consistent behavior on 32 and
+// 64 bit targets: value is typically used to determine buffer allocation size
+// when decoded.
+static const uint64_t kMaximumLeb128Value = UINT32_MAX;
+
+size_t aom_uleb_size_in_bytes(uint64_t value) {
+ size_t size = 0;
+ do {
+ ++size;
+ } while ((value >>= 7) != 0);
+ return size;
+}
+
+int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value,
+ size_t *length) {
+ if (buffer && value) {
+ *value = 0;
+ for (size_t i = 0; i < kMaximumLeb128Size && i < available; ++i) {
+ const uint8_t decoded_byte = *(buffer + i) & kLeb128ByteMask;
+ *value |= ((uint64_t)decoded_byte) << (i * 7);
+ if ((*(buffer + i) >> 7) == 0) {
+ if (length) {
+ *length = i + 1;
+ }
+
+ // Fail on values larger than 32-bits to ensure consistent behavior on
+ // 32 and 64 bit targets: value is typically used to determine buffer
+ // allocation size.
+ if (*value > UINT32_MAX) return -1;
+
+ return 0;
+ }
+ }
+ }
+
+ // If we get here, either the buffer/value pointers were invalid,
+ // or we ran over the available space
+ return -1;
+}
+
+int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value,
+ size_t *coded_size) {
+ const size_t leb_size = aom_uleb_size_in_bytes(value);
+ if (value > kMaximumLeb128Value || leb_size > kMaximumLeb128Size ||
+ leb_size > available || !coded_value || !coded_size) {
+ return -1;
+ }
+
+ for (size_t i = 0; i < leb_size; ++i) {
+ uint8_t byte = value & 0x7f;
+ value >>= 7;
+
+ if (value != 0) byte |= 0x80; // Signal that more bytes follow.
+
+ *(coded_value + i) = byte;
+ }
+
+ *coded_size = leb_size;
+ return 0;
+}
+
+int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
+ size_t pad_to_size, uint8_t *coded_value,
+ size_t *coded_size) {
+ if (value > kMaximumLeb128Value || !coded_value || !coded_size ||
+ available < pad_to_size || pad_to_size > kMaximumLeb128Size) {
+ return -1;
+ }
+ const uint64_t limit = 1ULL << (7 * pad_to_size);
+ if (value >= limit) {
+ // Can't encode 'value' within 'pad_to_size' bytes
+ return -1;
+ }
+
+ for (size_t i = 0; i < pad_to_size; ++i) {
+ uint8_t byte = value & 0x7f;
+ value >>= 7;
+
+ if (i < pad_to_size - 1) byte |= 0x80; // Signal that more bytes follow.
+
+ *(coded_value + i) = byte;
+ }
+
+ assert(value == 0);
+
+ *coded_size = pad_to_size;
+ return 0;
+}
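
The LEB128 helpers above round-trip as expected. A small sketch using 300 as
an arbitrary test value; its two-byte encoding (low 7 bits first, continuation
bit set on every byte but the last) can be checked by hand:

    #include <assert.h>
    #include "aom/aom_integer.h"

    static void uleb_roundtrip(void) {
      uint8_t buf[8]; /* kMaximumLeb128Size bytes is always enough */
      size_t coded_size = 0, decoded_length = 0;
      uint64_t decoded = 0;

      /* 300 = 0x12C -> 0xAC 0x02. */
      assert(aom_uleb_size_in_bytes(300) == 2);
      assert(aom_uleb_encode(300, sizeof(buf), buf, &coded_size) == 0);
      assert(coded_size == 2 && buf[0] == 0xAC && buf[1] == 0x02);
      assert(aom_uleb_decode(buf, coded_size, &decoded, &decoded_length) == 0);
      assert(decoded == 300 && decoded_length == coded_size);
    }
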
diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c
new file mode 100644
index 000000000..bfb3e7e00
--- /dev/null
+++ b/third_party/aom/aom_dsp/add_noise.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
+ char whiteclamp[16], char bothclamp[16],
+ unsigned int width, unsigned int height, int pitch) {
+ unsigned int i, j;
+
+ for (i = 0; i < height; ++i) {
+ uint8_t *pos = start + i * pitch;
+ char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT
+
+ for (j = 0; j < width; ++j) {
+ int v = pos[j];
+
+ v = clamp(v - blackclamp[0], 0, 255);
+ v = clamp(v + bothclamp[0], 0, 255);
+ v = clamp(v - whiteclamp[0], 0, 255);
+
+ pos[j] = v + ref[j];
+ }
+ }
+}
+
+static double gaussian(double sigma, double mu, double x) {
+ return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+ (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+int aom_setup_noise(double sigma, int size, char *noise) {
+ char char_dist[256];
+ int next = 0, i, j;
+
+  // Set up a 256-entry lookup table that matches a Gaussian distribution.
+ for (i = -32; i < 32; ++i) {
+ const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+ if (a_i) {
+ for (j = 0; j < a_i; ++j) {
+ char_dist[next + j] = (char)i;
+ }
+ next = next + j;
+ }
+ }
+
+  // Rounding error - might mean we have fewer than 256 entries.
+ for (; next < 256; ++next) {
+ char_dist[next] = 0;
+ }
+
+ for (i = 0; i < size; ++i) {
+ noise[i] = char_dist[rand() & 0xff]; // NOLINT
+ }
+
+  // Returns the highest non-zero value used in the distribution.
+ return -char_dist[0];
+}
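
A short sketch of how aom_setup_noise() might be driven; the sigma and buffer
size are arbitrary example values, and the prototype is repeated locally
instead of being pulled from a header:

    int aom_setup_noise(double sigma, int size, char *noise); /* defined above */

    static void build_noise_table(void) {
      char noise[3072]; /* size chosen arbitrarily for the example */
      const int max_noise = aom_setup_noise(4.0, (int)sizeof(noise), noise);
      /* max_noise bounds the black/white/both clamp tables a caller would
       * fill in before handing the table to aom_plane_add_noise_c(). */
      (void)max_noise;
    }
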
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
new file mode 100644
index 000000000..4791826da
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_convolve.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+ return sum;
+}
+
+static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
+ const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+ return sum;
+}
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ const int sum = horz_scalar_product(src_x, x_filter);
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (int x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ const int sum = vert_scalar_product(src_y, src_stride, y_filter);
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ // TODO(agrange) Modify to make independent of table alignment.
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+ return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
+ w, h);
+}
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
+ w, h);
+}
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x,
+ int filter_x_stride, const int16_t *filter_y,
+ int filter_y_stride, int w, int h) {
+ int r;
+
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+ ptrdiff_t a_stride,
+ const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+ return sum;
+}
+
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+ const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+ return sum;
+}
+
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ const int sum = highbd_horz_scalar_product(src_x, x_filter);
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (int x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+ (void)filter_y;
+ (void)y_step_q4;
+
+ highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ (void)filter_x;
+ (void)x_step_q4;
+
+ highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h, int bd) {
+ int r;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w * sizeof(uint16_t));
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
new file mode 100644
index 000000000..11ff73756
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -0,0 +1,356 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_DSP_AOM_DSP_CMAKE_)
+ return()
+endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_
+set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1)
+
+list(APPEND AOM_DSP_COMMON_SOURCES
+ "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+ "${AOM_ROOT}/aom_dsp/aom_filter.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+ "${AOM_ROOT}/aom_dsp/blend.h"
+ "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+ "${AOM_ROOT}/aom_dsp/entcode.c"
+ "${AOM_ROOT}/aom_dsp/entcode.h"
+ "${AOM_ROOT}/aom_dsp/fft.c"
+ "${AOM_ROOT}/aom_dsp/fft_common.h"
+ "${AOM_ROOT}/aom_dsp/intrapred.c"
+ "${AOM_ROOT}/aom_dsp/intrapred_common.h"
+ "${AOM_ROOT}/aom_dsp/loopfilter.c"
+ "${AOM_ROOT}/aom_dsp/prob.h"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/subtract.c"
+ "${AOM_ROOT}/aom_dsp/txfm_common.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
+ "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2
+ "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
+ "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
+ "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
+ "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_MSA
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
+ "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/macros_msa.h")
+
+if(CONFIG_AV1_DECODER)
+ list(APPEND AOM_DSP_DECODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
+ "${AOM_ROOT}/aom_dsp/bitreader.h"
+ "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
+ "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
+ "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h"
+ "${AOM_ROOT}/aom_dsp/grain_synthesis.c"
+ "${AOM_ROOT}/aom_dsp/grain_synthesis.h")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ list(APPEND AOM_DSP_ENCODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter.h"
+ "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
+ "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
+ "${AOM_ROOT}/aom_dsp/entenc.c"
+ "${AOM_ROOT}/aom_dsp/entenc.h"
+ "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+ "${AOM_ROOT}/aom_dsp/grain_table.c"
+ "${AOM_ROOT}/aom_dsp/grain_table.h"
+ "${AOM_ROOT}/aom_dsp/noise_model.c"
+ "${AOM_ROOT}/aom_dsp/noise_model.h"
+ "${AOM_ROOT}/aom_dsp/noise_util.c"
+ "${AOM_ROOT}/aom_dsp/noise_util.h"
+ "${AOM_ROOT}/aom_dsp/psnr.c"
+ "${AOM_ROOT}/aom_dsp/psnr.h"
+ "${AOM_ROOT}/aom_dsp/quantize.c"
+ "${AOM_ROOT}/aom_dsp/quantize.h"
+ "${AOM_ROOT}/aom_dsp/sad.c"
+ "${AOM_ROOT}/aom_dsp/sse.c"
+ "${AOM_ROOT}/aom_dsp/sad_av1.c"
+ "${AOM_ROOT}/aom_dsp/sum_squares.c"
+ "${AOM_ROOT}/aom_dsp/variance.c"
+ "${AOM_ROOT}/aom_dsp/variance.h")
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
+ "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+ "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
+
+ list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64
+ "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
+ "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
+ "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
+
+ if(CONFIG_INTERNAL_STATS)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
+ "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c"
+ "${AOM_ROOT}/aom_dsp/ssim.h")
+ endif()
+endif()
+
+# Creates aom_dsp build targets. Must not be called until after the libaom
+# target has been created.
+function(setup_aom_dsp_targets)
+ add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_dsp_common)
+ create_dummy_source_file("aom_av1" "c" "dummy_source_file")
+ add_library(aom_dsp OBJECT "${dummy_source_file}")
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_common>)
+ list(APPEND AOM_LIB_TARGETS aom_dsp)
+
+ # Not all generators support libraries consisting only of object files. Add a
+ # dummy source file to the aom_dsp target.
+ add_dummy_source_file_to_target("aom_dsp" "c")
+
+ if(CONFIG_AV1_DECODER)
+ add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_dsp_decoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_dsp_encoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
+ endif()
+
+ if(HAVE_SSE2)
+ add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSE2" "aom")
+
+ if(CONFIG_AV1_ENCODER)
+ add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSE2" "aom")
+ endif()
+ endif()
+
+ if(HAVE_SSSE3)
+ add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom")
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSSE3" "aom")
+
+ if(CONFIG_AV1_ENCODER)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
+ ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
+ endif()
+ add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom")
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom")
+ endif()
+ endif()
+
+ if(HAVE_SSE4_1)
+ add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom")
+ endif()
+ endif()
+
+ if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ if(CONFIG_AV1_ENCODER)
+ add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64"
+ "aom")
+ endif()
+ endif()
+
+ if(HAVE_AVX2)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_AVX2" "aom")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_AVX2" "aom")
+ endif()
+ endif()
+
+ if(HAVE_NEON)
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON"
+ "aom")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_NEON" "aom")
+ endif()
+ endif()
+
+ if(HAVE_DSPR2)
+ add_intrinsics_object_library("" "dspr2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_DSPR2" "aom")
+ endif()
+
+ if(HAVE_MSA)
+ add_intrinsics_object_library("" "msa" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_MSA" "aom")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("" "msa" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_MSA" "aom")
+ endif()
+ endif()
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
new file mode 100644
index 000000000..a185b23c8
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_common.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_
+#define AOM_AOM_DSP_AOM_DSP_COMMON_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MAX_SB_SIZE
+#define MAX_SB_SIZE 128
+#endif // ndef MAX_SB_SIZE
+
+#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
+#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
+
+#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
+
+#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
+
+/* Left shifting a negative value became undefined behavior in C99 (downgraded
+ from merely implementation-defined in C89). This should still compile to the
+   correct thing on any two's-complement machine, while avoiding ubsan warnings. */
+#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift)))
+
+// These can be used to give a hint about branch outcomes.
+// This can have an effect, even if your target processor has a
+// good branch predictor, as these hints can affect basic block
+// ordering by the compiler.
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+typedef uint8_t qm_val_t;
+#define AOM_QM_BITS 5
+
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+
+static INLINE uint8_t clip_pixel(int val) {
+ return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+ switch (bd) {
+ case 8:
+ default: return (uint16_t)clamp(val, 0, 255);
+ case 10: return (uint16_t)clamp(val, 0, 1023);
+ case 12: return (uint16_t)clamp(val, 0, 4095);
+ }
+}
+
+// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
+// or max(0, value) and might be faster in some cases.
+// Care should be taken since right shifting a negative value of a signed
+// type is implementation-defined by the C standard and may differ between
+// compilers.
+static INLINE unsigned int negative_to_zero(int value) {
+ return value & ~(value >> (sizeof(value) * 8 - 1));
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_
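
The clamping helpers above are easy to sanity-check. A small sketch with
arbitrarily chosen inputs (it builds inside the aom source tree, where
config/aom_config.h is available):

    #include <assert.h>
    #include "aom_dsp/aom_dsp_common.h"

    static void check_clamp_helpers(void) {
      assert(clip_pixel(300) == 255 && clip_pixel(-5) == 0);
      assert(clip_pixel_highbd(5000, 10) == 1023); /* 10-bit range is 0..1023 */
      assert(clamp(7, 0, 4) == 4 && fclamp(-1.5, 0.0, 1.0) == 0.0);
      /* Branchless max(0, value), relying on an arithmetic right shift of the
       * sign bit as noted in the comment above. */
      assert(negative_to_zero(-7) == 0 && negative_to_zero(42) == 42u);
    }
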
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
new file mode 100644
index 000000000..1514bd64e
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
new file mode 100755
index 000000000..8e8a480fe
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -0,0 +1,1575 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+sub aom_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+
+EOF
+}
+forward_decls qw/aom_dsp_forward_decls/;
+
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+}
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+}
+
+@block_widths = (4, 8, 16, 32, 64, 128);
+
+@block_sizes = ();
+foreach $w (@block_widths) {
+ foreach $h (@block_widths) {
+    push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w);
+ }
+}
+push @block_sizes, [4, 16];
+push @block_sizes, [16, 4];
+push @block_sizes, [8, 32];
+push @block_sizes, [32, 8];
+push @block_sizes, [16, 64];
+push @block_sizes, [64, 16];
+
+@tx_dims = (2, 4, 8, 16, 32, 64);
+@tx_sizes = ();
+foreach $w (@tx_dims) {
+ push @tx_sizes, [$w, $w];
+ foreach $h (@tx_dims) {
+ push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));
+ push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
+ }
+}
+
+@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
+
+#
+# Intra prediction
+#
+
+foreach (@tx_sizes) {
+ ($w, $h) = @$_;
+ foreach $pred_name (@pred_names) {
+ add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
+ "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+ add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+ "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ }
+}
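
Each generated predictor fills a w x h block from the row above and the column to the left. The sketch below mirrors the shape of the aom_dc_predictor_4x4 prototype registered above with a straightforward reference DC average; it is an illustrative sketch, not the library implementation.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Reference-style DC predictor for a 4x4 block: every output pixel is the
   rounded average of the 4 above and 4 left neighbour samples. */
static void dc_predictor_4x4(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  int sum = 0;
  for (int i = 0; i < 4; ++i) sum += above[i] + left[i];
  const uint8_t dc = (uint8_t)((sum + 4) >> 3); /* round(sum / 8) */
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) dst[r * stride + c] = dc;
}

int main(void) {
  const uint8_t above[4] = {10, 12, 14, 16}, left[4] = {20, 22, 24, 26};
  uint8_t dst[4 * 4];
  dc_predictor_4x4(dst, 4, above, left);
  printf("DC value: %d\n", dst[0]); /* (144 + 4) >> 3 = 18 */
  return 0;
}
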
+
+specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_top_predictor_4x8 sse2/;
+specialize qw/aom_dc_top_predictor_4x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x4 sse2/;
+specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_8x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x4 sse2/;
+specialize qw/aom_dc_top_predictor_16x8 sse2/;
+specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_16x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x64 sse2/;
+specialize qw/aom_dc_top_predictor_32x8 sse2/;
+specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_left_predictor_4x8 sse2/;
+specialize qw/aom_dc_left_predictor_4x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x4 sse2/;
+specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_8x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x4 sse2/;
+specialize qw/aom_dc_left_predictor_16x8 sse2/;
+specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_16x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x64 sse2/;
+specialize qw/aom_dc_left_predictor_32x8 sse2/;
+specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_128_predictor_4x8 sse2/;
+specialize qw/aom_dc_128_predictor_4x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x4 sse2/;
+specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_8x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x4 sse2/;
+specialize qw/aom_dc_128_predictor_16x8 sse2/;
+specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_16x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x64 sse2/;
+specialize qw/aom_dc_128_predictor_32x8 sse2/;
+specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
+specialize qw/aom_v_predictor_4x4 neon msa sse2/;
+specialize qw/aom_v_predictor_4x8 sse2/;
+specialize qw/aom_v_predictor_4x16 sse2/;
+specialize qw/aom_v_predictor_8x4 sse2/;
+specialize qw/aom_v_predictor_8x8 neon msa sse2/;
+specialize qw/aom_v_predictor_8x16 sse2/;
+specialize qw/aom_v_predictor_8x32 sse2/;
+specialize qw/aom_v_predictor_16x4 sse2/;
+specialize qw/aom_v_predictor_16x8 sse2/;
+specialize qw/aom_v_predictor_16x16 neon msa sse2/;
+specialize qw/aom_v_predictor_16x32 sse2/;
+specialize qw/aom_v_predictor_16x64 sse2/;
+specialize qw/aom_v_predictor_32x8 sse2/;
+specialize qw/aom_v_predictor_32x16 sse2 avx2/;
+specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 sse2 avx2/;
+specialize qw/aom_h_predictor_4x8 sse2/;
+specialize qw/aom_h_predictor_4x16 sse2/;
+specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
+specialize qw/aom_h_predictor_8x4 sse2/;
+specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
+specialize qw/aom_h_predictor_8x16 sse2/;
+specialize qw/aom_h_predictor_8x32 sse2/;
+specialize qw/aom_h_predictor_16x4 sse2/;
+specialize qw/aom_h_predictor_16x8 sse2/;
+specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
+specialize qw/aom_h_predictor_16x32 sse2/;
+specialize qw/aom_h_predictor_16x64 sse2/;
+specialize qw/aom_h_predictor_32x8 sse2/;
+specialize qw/aom_h_predictor_32x16 sse2/;
+specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_h_predictor_32x64 sse2/;
+specialize qw/aom_h_predictor_64x64 sse2/;
+specialize qw/aom_h_predictor_64x32 sse2/;
+specialize qw/aom_h_predictor_64x16 sse2/;
+specialize qw/aom_paeth_predictor_4x4 ssse3/;
+specialize qw/aom_paeth_predictor_4x8 ssse3/;
+specialize qw/aom_paeth_predictor_4x16 ssse3/;
+specialize qw/aom_paeth_predictor_8x4 ssse3/;
+specialize qw/aom_paeth_predictor_8x8 ssse3/;
+specialize qw/aom_paeth_predictor_8x16 ssse3/;
+specialize qw/aom_paeth_predictor_8x32 ssse3/;
+specialize qw/aom_paeth_predictor_16x4 ssse3/;
+specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_32x8 ssse3/;
+specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_16x8 ssse3/;
+specialize qw/aom_paeth_predictor_16x16 ssse3/;
+specialize qw/aom_paeth_predictor_16x32 ssse3/;
+specialize qw/aom_paeth_predictor_32x16 ssse3/;
+specialize qw/aom_paeth_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_predictor_4x16 ssse3/;
+specialize qw/aom_smooth_predictor_8x4 ssse3/;
+specialize qw/aom_smooth_predictor_8x8 ssse3/;
+specialize qw/aom_smooth_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_predictor_16x4 ssse3/;
+specialize qw/aom_smooth_predictor_16x8 ssse3/;
+specialize qw/aom_smooth_predictor_16x16 ssse3/;
+specialize qw/aom_smooth_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_predictor_32x8 ssse3/;
+specialize qw/aom_smooth_predictor_32x16 ssse3/;
+specialize qw/aom_smooth_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_predictor_64x16 ssse3/;
+
+specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
+
+specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
+specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
+
+# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+# by multiply and shift.
+specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
+specialize qw/aom_dc_predictor_4x8 sse2/;
+specialize qw/aom_dc_predictor_4x16 sse2/;
+specialize qw/aom_dc_predictor_8x4 sse2/;
+specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
+specialize qw/aom_dc_predictor_8x16 sse2/;
+specialize qw/aom_dc_predictor_8x32 sse2/;
+specialize qw/aom_dc_predictor_16x4 sse2/;
+specialize qw/aom_dc_predictor_16x8 sse2/;
+specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
+specialize qw/aom_dc_predictor_16x32 sse2/;
+specialize qw/aom_dc_predictor_16x64 sse2/;
+specialize qw/aom_dc_predictor_32x8 sse2/;
+specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
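
Regarding the TODO above: for rectangular blocks the DC average divides by w + h, which is not a power of two, so the idea is to replace the division with a fixed-point multiply and shift. A hedged sketch of that standard trick for one concrete divisor (12, as for an 8x4 block); the magic constants here are illustrative assumptions, not the values the library ends up using.

#include <assert.h>
#include <stdint.h>

/* Rounded DC average for an 8x4 block: (sum + 6) / 12, with the division
   replaced by a multiply and shift.  10923 = ceil(2^17 / 12); the identity
   (x * 10923) >> 17 == x / 12 holds for all x < 32768, which covers the
   maximum possible sum (12 * 255 + 6). */
static uint8_t dc_avg_8x4(const uint8_t *above, const uint8_t *left) {
  uint32_t sum = 6; /* rounding offset: (w + h) / 2 */
  for (int i = 0; i < 8; ++i) sum += above[i];
  for (int i = 0; i < 4; ++i) sum += left[i];
  return (uint8_t)((sum * 10923) >> 17); /* == sum / 12 */
}

int main(void) {
  const uint8_t above[8] = {255, 255, 255, 255, 255, 255, 255, 255};
  const uint8_t left[4] = {255, 255, 255, 255};
  assert(dc_avg_8x4(above, left) == 255);
  return 0;
}
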
+
+ specialize qw/aom_highbd_v_predictor_4x4 sse2/;
+ specialize qw/aom_highbd_v_predictor_4x8 sse2/;
+ specialize qw/aom_highbd_v_predictor_8x4 sse2/;
+ specialize qw/aom_highbd_v_predictor_8x8 sse2/;
+ specialize qw/aom_highbd_v_predictor_8x16 sse2/;
+ specialize qw/aom_highbd_v_predictor_16x8 sse2/;
+ specialize qw/aom_highbd_v_predictor_16x16 sse2/;
+ specialize qw/aom_highbd_v_predictor_16x32 sse2/;
+ specialize qw/aom_highbd_v_predictor_32x16 sse2/;
+ specialize qw/aom_highbd_v_predictor_32x32 sse2/;
+
+ # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+ # by multiply and shift.
+ specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_4x8 sse2/;
+ specialize qw/aom_highbd_dc_predictor_8x4 sse2/;
+ specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x16 sse2/;
+ specialize qw/aom_highbd_dc_predictor_16x8 sse2/;
+ specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x32 sse2/;
+ specialize qw/aom_highbd_dc_predictor_32x16 sse2/;
+ specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_h_predictor_4x4 sse2/;
+ specialize qw/aom_highbd_h_predictor_4x8 sse2/;
+ specialize qw/aom_highbd_h_predictor_8x4 sse2/;
+ specialize qw/aom_highbd_h_predictor_8x8 sse2/;
+ specialize qw/aom_highbd_h_predictor_8x16 sse2/;
+ specialize qw/aom_highbd_h_predictor_16x8 sse2/;
+ specialize qw/aom_highbd_h_predictor_16x16 sse2/;
+ specialize qw/aom_highbd_h_predictor_16x32 sse2/;
+ specialize qw/aom_highbd_h_predictor_32x16 sse2/;
+ specialize qw/aom_highbd_h_predictor_32x32 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/;
+ specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
+ specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
+ specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
+
+#
+# Sub Pixel Filters
+#
+add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+
+specialize qw/aom_convolve_copy sse2 /;
+specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
+
+add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve_copy sse2 avx2/;
+
+add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
+
+add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
+
+#
+# Loopfilter
+#
+add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_14 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_14_dual sse2/;
+
+add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_6 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_8 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_8_dual sse2/;
+
+add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_4 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_4_dual sse2/;
+
+add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_14 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_14_dual sse2/;
+
+add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_6 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_6_dual sse2/;
+
+add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_8 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_8_dual sse2/;
+
+add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_4 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_4_dual sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_14 sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_8 sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_6 sse2/;
+
+add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_6_dual sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_vertical_4 sse2/;
+
+add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
+
+add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
+
+# Helper functions.
+add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
+specialize "av1_round_shift_array", qw/sse4_1 neon/;
+
+#
+# Encoder functions.
+#
+
+#
+# Forward transform
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
+
+ # High bit depth
+ add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct8x8 sse2/;
+
+ # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
+ add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
+
+ add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft4x4_float sse2/;
+
+ add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft32x32_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
+
+ add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft4x4_float sse2/;
+
+ add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft32x32_float avx2 sse2/;
+} # CONFIG_AV1_ENCODER
+
+#
+# Quantization
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+
+ add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+
+ add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+} # CONFIG_AV1_ENCODER
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b sse2 avx2/;
+
+ add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b_32x32 sse2/;
+
+ add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+
+} # CONFIG_AV1_ENCODER
+
+#
+# Alpha blending with mask
+#
+add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params";
+specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
+add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd";
+add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
+add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
+specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
+specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
+
+add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd";
+add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
+specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
+specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ #
+ # Block subtraction
+ #
+ add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+ specialize qw/aom_subtract_block neon msa sse2 avx2/;
+
+ add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/aom_highbd_subtract_block sse2/;
+
+ add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
+ specialize qw/aom_sse sse4_1 avx2/;
+
+ add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
+ specialize qw/aom_highbd_sse sse4_1 avx2/;
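
aom_subtract_block and aom_sse are straightforward reductions. A hedged scalar reference for the sum of squared error between two 8-bit planes, matching the shape of the aom_sse prototype above (illustrative, not the library code):

#include <stdint.h>
#include <stdio.h>

/* Scalar SSE reference: sum over the block of (a - b)^2. */
static int64_t sse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                       int b_stride, int width, int height) {
  int64_t sse = 0;
  for (int y = 0; y < height; ++y)
    for (int x = 0; x < width; ++x) {
      const int d = a[y * a_stride + x] - b[y * b_stride + x];
      sse += (int64_t)d * d;
    }
  return sse;
}

int main(void) {
  const uint8_t a[4] = {10, 20, 30, 40}, b[4] = {12, 18, 33, 40};
  printf("%lld\n", (long long)sse_ref(a, 4, b, 4, 4, 1)); /* 4 + 4 + 9 + 0 = 17 */
  return 0;
}
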
+
+ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ #
+ # Sum of Squares
+ #
+ add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
+ specialize qw/aom_sum_squares_2d_i16 sse2 avx2/;
+
+ add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
+ specialize qw/aom_sum_squares_i16 sse2/;
+
+ }
+
+
+ #
+ # Single block SAD / Single block Avg SAD
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
+ }
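
The SAD (sum of absolute differences) prototypes registered above all share the same scalar definition; a hedged reference sketch, parameterized on block size rather than hard-coded per shape:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Scalar reference for an aom_sad16x16-style kernel (illustrative only). */
static unsigned int sad_ref(const uint8_t *src, int src_stride,
                            const uint8_t *ref, int ref_stride,
                            int w, int h) {
  unsigned int sad = 0;
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x)
      sad += (unsigned int)abs(src[y * src_stride + x] - ref[y * ref_stride + x]);
  return sad;
}

int main(void) {
  uint8_t src[16 * 16], ref[16 * 16];
  for (int i = 0; i < 16 * 16; ++i) {
    src[i] = (uint8_t)(i & 127);
    ref[i] = (uint8_t)(src[i] + 2);
  }
  printf("%u\n", sad_ref(src, 16, ref, 16, 16, 16)); /* 256 pixels * |diff| 2 = 512 */
  return 0;
}
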
+
+ specialize qw/aom_sad128x128 avx2 sse2/;
+ specialize qw/aom_sad128x64 avx2 sse2/;
+ specialize qw/aom_sad64x128 avx2 sse2/;
+ specialize qw/aom_sad64x64 avx2 neon msa sse2/;
+ specialize qw/aom_sad64x32 avx2 msa sse2/;
+ specialize qw/aom_sad32x64 avx2 msa sse2/;
+ specialize qw/aom_sad32x32 avx2 neon msa sse2/;
+ specialize qw/aom_sad32x16 avx2 msa sse2/;
+ specialize qw/aom_sad16x32 msa sse2/;
+ specialize qw/aom_sad16x16 neon msa sse2/;
+ specialize qw/aom_sad16x8 neon msa sse2/;
+ specialize qw/aom_sad8x16 neon msa sse2/;
+ specialize qw/aom_sad8x8 neon msa sse2/;
+ specialize qw/aom_sad8x4 msa sse2/;
+ specialize qw/aom_sad4x8 msa sse2/;
+ specialize qw/aom_sad4x4 neon msa sse2/;
+
+ specialize qw/aom_sad128x128_avg avx2 sse2/;
+ specialize qw/aom_sad128x64_avg avx2 sse2/;
+ specialize qw/aom_sad64x128_avg avx2 sse2/;
+ specialize qw/aom_sad64x64_avg avx2 msa sse2/;
+ specialize qw/aom_sad64x32_avg avx2 msa sse2/;
+ specialize qw/aom_sad32x64_avg avx2 msa sse2/;
+ specialize qw/aom_sad32x32_avg avx2 msa sse2/;
+ specialize qw/aom_sad32x16_avg avx2 msa sse2/;
+ specialize qw/aom_sad16x32_avg msa sse2/;
+ specialize qw/aom_sad16x16_avg msa sse2/;
+ specialize qw/aom_sad16x8_avg msa sse2/;
+ specialize qw/aom_sad8x16_avg msa sse2/;
+ specialize qw/aom_sad8x8_avg msa sse2/;
+ specialize qw/aom_sad8x4_avg msa sse2/;
+ specialize qw/aom_sad4x8_avg msa sse2/;
+ specialize qw/aom_sad4x4_avg msa sse2/;
+
+ specialize qw/aom_sad4x16 sse2/;
+ specialize qw/aom_sad16x4 sse2/;
+ specialize qw/aom_sad8x32 sse2/;
+ specialize qw/aom_sad32x8 sse2/;
+ specialize qw/aom_sad16x64 sse2/;
+ specialize qw/aom_sad64x16 sse2/;
+
+ specialize qw/aom_sad4x16_avg sse2/;
+ specialize qw/aom_sad16x4_avg sse2/;
+ specialize qw/aom_sad8x32_avg sse2/;
+ specialize qw/aom_sad32x8_avg sse2/;
+ specialize qw/aom_sad16x64_avg sse2/;
+ specialize qw/aom_sad64x16_avg sse2/;
+
+ specialize qw/aom_jnt_sad128x128_avg ssse3/;
+ specialize qw/aom_jnt_sad128x64_avg ssse3/;
+ specialize qw/aom_jnt_sad64x128_avg ssse3/;
+ specialize qw/aom_jnt_sad64x64_avg ssse3/;
+ specialize qw/aom_jnt_sad64x32_avg ssse3/;
+ specialize qw/aom_jnt_sad32x64_avg ssse3/;
+ specialize qw/aom_jnt_sad32x32_avg ssse3/;
+ specialize qw/aom_jnt_sad32x16_avg ssse3/;
+ specialize qw/aom_jnt_sad16x32_avg ssse3/;
+ specialize qw/aom_jnt_sad16x16_avg ssse3/;
+ specialize qw/aom_jnt_sad16x8_avg ssse3/;
+ specialize qw/aom_jnt_sad8x16_avg ssse3/;
+ specialize qw/aom_jnt_sad8x8_avg ssse3/;
+ specialize qw/aom_jnt_sad8x4_avg ssse3/;
+ specialize qw/aom_jnt_sad4x8_avg ssse3/;
+ specialize qw/aom_jnt_sad4x4_avg ssse3/;
+
+ specialize qw/aom_jnt_sad4x16_avg ssse3/;
+ specialize qw/aom_jnt_sad16x4_avg ssse3/;
+ specialize qw/aom_jnt_sad8x32_avg ssse3/;
+ specialize qw/aom_jnt_sad32x8_avg ssse3/;
+ specialize qw/aom_jnt_sad16x64_avg ssse3/;
+ specialize qw/aom_jnt_sad64x16_avg ssse3/;
+
+ add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+ add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+ add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+ add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+ add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+ add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
+
+ specialize qw/aom_sad4xh sse2/;
+ specialize qw/aom_sad8xh sse2/;
+ specialize qw/aom_sad16xh sse2/;
+ specialize qw/aom_sad32xh sse2/;
+ specialize qw/aom_sad64xh sse2/;
+ specialize qw/aom_sad128xh sse2/;
+
+
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ if ($w != 128 && $h != 128 && $w != 4) {
+ specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
+ specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
+ }
+ add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
+ }
+ specialize qw/aom_highbd_sad128x128 avx2/;
+ specialize qw/aom_highbd_sad128x64 avx2/;
+ specialize qw/aom_highbd_sad64x128 avx2/;
+ specialize qw/aom_highbd_sad64x64 avx2 sse2/;
+ specialize qw/aom_highbd_sad64x32 avx2 sse2/;
+ specialize qw/aom_highbd_sad32x64 avx2 sse2/;
+ specialize qw/aom_highbd_sad32x32 avx2 sse2/;
+ specialize qw/aom_highbd_sad32x16 avx2 sse2/;
+ specialize qw/aom_highbd_sad16x32 avx2 sse2/;
+ specialize qw/aom_highbd_sad16x16 avx2 sse2/;
+ specialize qw/aom_highbd_sad16x8 avx2 sse2/;
+ specialize qw/aom_highbd_sad8x4 sse2/;
+
+ specialize qw/aom_highbd_sad128x128_avg avx2/;
+ specialize qw/aom_highbd_sad128x64_avg avx2/;
+ specialize qw/aom_highbd_sad64x128_avg avx2/;
+ specialize qw/aom_highbd_sad64x64_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad64x32_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad32x64_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad32x32_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad32x16_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad16x32_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad16x16_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad16x8_avg avx2 sse2/;
+ specialize qw/aom_highbd_sad8x4_avg sse2/;
+
+ specialize qw/aom_highbd_sad16x4 sse2/;
+ specialize qw/aom_highbd_sad8x32 sse2/;
+ specialize qw/aom_highbd_sad32x8 sse2/;
+ specialize qw/aom_highbd_sad16x64 sse2/;
+ specialize qw/aom_highbd_sad64x16 sse2/;
+
+ specialize qw/aom_highbd_sad16x4_avg sse2/;
+ specialize qw/aom_highbd_sad8x32_avg sse2/;
+ specialize qw/aom_highbd_sad32x8_avg sse2/;
+ specialize qw/aom_highbd_sad16x64_avg sse2/;
+ specialize qw/aom_highbd_sad64x16_avg sse2/;
+
+ #
+ # Masked SAD
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
+ specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/;
+ }
+
+
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
+ specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
+ }
+
+
+ #
+ # OBMC SAD
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+ if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
+ specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+ }
+ }
+
+
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+ if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
+ specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
+ }
+ }
+
+
+ #
+ # Multi-block SAD, comparing a reference to N independent blocks
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ }
+
+ specialize qw/aom_sad128x128x4d avx2 sse2/;
+ specialize qw/aom_sad128x64x4d avx2 sse2/;
+ specialize qw/aom_sad64x128x4d avx2 sse2/;
+ specialize qw/aom_sad64x64x4d avx2 neon msa sse2/;
+ specialize qw/aom_sad64x32x4d avx2 msa sse2/;
+ specialize qw/aom_sad32x64x4d avx2 msa sse2/;
+ specialize qw/aom_sad32x32x4d avx2 neon msa sse2/;
+ specialize qw/aom_sad32x16x4d msa sse2/;
+ specialize qw/aom_sad16x32x4d msa sse2/;
+ specialize qw/aom_sad16x16x4d neon msa sse2/;
+ specialize qw/aom_sad16x8x4d msa sse2/;
+ specialize qw/aom_sad8x16x4d msa sse2/;
+ specialize qw/aom_sad8x8x4d msa sse2/;
+ specialize qw/aom_sad8x4x4d msa sse2/;
+ specialize qw/aom_sad4x8x4d msa sse2/;
+ specialize qw/aom_sad4x4x4d msa sse2/;
+
+ specialize qw/aom_sad4x16x4d sse2/;
+ specialize qw/aom_sad16x4x4d sse2/;
+ specialize qw/aom_sad8x32x4d sse2/;
+ specialize qw/aom_sad32x8x4d sse2/;
+ specialize qw/aom_sad16x64x4d sse2/;
+ specialize qw/aom_sad64x16x4d sse2/;
+
+ #
+ # Multi-block SAD, comparing a reference to N independent blocks
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ if ($w != 128 && $h != 128) {
+ specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
+ }
+ }
+ specialize qw/aom_highbd_sad128x128x4d avx2/;
+ specialize qw/aom_highbd_sad128x64x4d avx2/;
+ specialize qw/aom_highbd_sad64x128x4d avx2/;
+ specialize qw/aom_highbd_sad64x64x4d sse2 avx2/;
+ specialize qw/aom_highbd_sad64x32x4d sse2 avx2/;
+ specialize qw/aom_highbd_sad32x64x4d sse2 avx2/;
+ specialize qw/aom_highbd_sad32x32x4d sse2 avx2/;
+ specialize qw/aom_highbd_sad32x16x4d sse2 avx2/;
+ specialize qw/aom_highbd_sad16x32x4d sse2 avx2/;
+ specialize qw/aom_highbd_sad16x16x4d sse2 avx2/;
+ specialize qw/aom_highbd_sad16x8x4d sse2 avx2/;
+ specialize qw/aom_highbd_sad8x16x4d sse2/;
+ specialize qw/aom_highbd_sad8x8x4d sse2/;
+ specialize qw/aom_highbd_sad8x4x4d sse2/;
+ specialize qw/aom_highbd_sad4x8x4d sse2/;
+ specialize qw/aom_highbd_sad4x4x4d sse2/;
+
+ specialize qw/aom_highbd_sad4x16x4d sse2/;
+ specialize qw/aom_highbd_sad16x4x4d sse2/;
+ specialize qw/aom_highbd_sad8x32x4d sse2/;
+ specialize qw/aom_highbd_sad32x8x4d sse2/;
+ specialize qw/aom_highbd_sad16x64x4d sse2/;
+ specialize qw/aom_highbd_sad64x16x4d sse2/;
+
+ #
+ # Structured Similarity (SSIM)
+ #
+ if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
+
+ add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
+
+ add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+
+ }
+} # CONFIG_AV1_ENCODER
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+
+ #
+ # Specialty Variance
+ #
+ add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ specialize qw/aom_get16x16var neon msa/;
+ specialize qw/aom_get8x8var neon msa/;
+
+
+ add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+ specialize qw/aom_mse16x16 sse2 avx2 neon msa/;
+ specialize qw/aom_mse16x8 sse2 msa/;
+ specialize qw/aom_mse8x16 sse2 msa/;
+ specialize qw/aom_mse8x8 sse2 msa/;
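
The getNxNvar helpers return the two accumulators from which variance is formed: the sum of differences and the sum of squared differences, with variance = sse - sum*sum / (w*h). A hedged scalar sketch following the aom_get8x8var prototype above (illustrative, not the library code):

#include <stdint.h>
#include <stdio.h>

/* Accumulate sum and sum-of-squares of the per-pixel differences. */
static void get8x8var_ref(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  *sse = 0;
  *sum = 0;
  for (int y = 0; y < 8; ++y)
    for (int x = 0; x < 8; ++x) {
      const int d = src[y * src_stride + x] - ref[y * ref_stride + x];
      *sum += d;
      *sse += (unsigned int)(d * d);
    }
}

int main(void) {
  uint8_t src[8 * 8], ref[8 * 8];
  for (int i = 0; i < 64; ++i) { src[i] = 100; ref[i] = (uint8_t)(100 + (i & 1)); }
  unsigned int sse; int sum;
  get8x8var_ref(src, 8, ref, 8, &sse, &sum);
  /* 32 pixels differ by -1: sse = 32, sum = -32, variance = 32 - 32*32/64 = 16 */
  printf("sse=%u sum=%d var=%u\n", sse, sum, sse - (unsigned int)(sum * sum / 64));
  return 0;
}
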
+
+ foreach $bd (8, 10, 12) {
+ add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+ specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
+ specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
+ }
+
+
+  #
+  # Upsampled prediction / compound averaging
+  #
+ add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
+ specialize qw/aom_upsampled_pred sse2/;
+
+ add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search";
+ specialize qw/aom_comp_avg_upsampled_pred sse2/;
+
+ add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
+ specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
+
+ add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int subpel_search";
+ specialize qw/aom_comp_mask_upsampled_pred sse2/;
+
+
+ add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+ specialize qw/aom_highbd_upsampled_pred sse2/;
+
+ add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+ specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+
+ add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
+ specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
+
+
+  #
+  # Macroblock sum-of-squares / 4x4 SSE helpers
+  #
+ add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
+ add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
+
+ specialize qw/aom_get_mb_ss sse2 msa/;
+ specialize qw/aom_get4x4sse_cs neon msa/;
+
+ #
+ # Variance / Subpixel Variance / Subpixel Avg Variance
+ #
+ add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
+ }
+ specialize qw/aom_variance128x128 sse2 avx2 /;
+ specialize qw/aom_variance128x64 sse2 avx2 /;
+ specialize qw/aom_variance64x128 sse2 avx2 /;
+ specialize qw/aom_variance64x64 sse2 avx2 neon msa/;
+ specialize qw/aom_variance64x32 sse2 avx2 neon msa/;
+ specialize qw/aom_variance32x64 sse2 avx2 neon msa/;
+ specialize qw/aom_variance32x32 sse2 avx2 neon msa/;
+ specialize qw/aom_variance32x16 sse2 avx2 msa/;
+ specialize qw/aom_variance16x32 sse2 avx2 msa/;
+ specialize qw/aom_variance16x16 sse2 avx2 neon msa/;
+ specialize qw/aom_variance16x8 sse2 avx2 neon msa/;
+ specialize qw/aom_variance8x16 sse2 neon msa/;
+ specialize qw/aom_variance8x8 sse2 neon msa/;
+ specialize qw/aom_variance8x4 sse2 msa/;
+ specialize qw/aom_variance4x8 sse2 msa/;
+ specialize qw/aom_variance4x4 sse2 msa/;
+
+ specialize qw/aom_sub_pixel_variance128x128 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance128x64 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x128 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x32 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x16 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/;
+
+ specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+
+ specialize qw/aom_variance4x16 sse2/;
+ specialize qw/aom_variance16x4 sse2 avx2/;
+ specialize qw/aom_variance8x32 sse2/;
+ specialize qw/aom_variance32x8 sse2 avx2/;
+ specialize qw/aom_variance16x64 sse2 avx2/;
+ specialize qw/aom_variance64x16 sse2 avx2/;
+ specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
+
+ specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance16x8 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance8x16 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance8x8 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance8x4 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance4x8 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance4x4 ssse3/;
+
+ specialize qw/aom_jnt_sub_pixel_avg_variance4x16 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance16x4 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance8x32 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance32x8 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/;
+
+ specialize qw/aom_jnt_sub_pixel_avg_variance128x128 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance128x64 ssse3/;
+ specialize qw/aom_jnt_sub_pixel_avg_variance64x128 ssse3/;
+
+
+ foreach $bd (8, 10, 12) {
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+ specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
+ }
+ # TODO(david.barker): When ext-partition-types is enabled, we currently
+ # don't have vectorized 4x16 highbd variance functions
+ if ($w == 4 && $h == 4) {
+ specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
+ }
+ if ($w != 128 && $h != 128 && $w != 4) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
+ }
+ if ($w == 4 && $h == 4) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
+ }
+
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
+ }
+ }
+
+ #
+ # Masked Variance / Masked Subpixel Variance
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
+ specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+ }
+
+
+ foreach $bd ("_8_", "_10_", "_12_") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+ }
+ }
+
+
+ #
+ # OBMC Variance / OBMC Subpixel Variance
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
+ specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
+ }
+
+
+ foreach $bd ("_", "_10_", "_12_") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+ }
+ }
+
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+
+ add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+
+ #
+ # Comp Avg
+ #
+ add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+
+ add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+ specialize qw/aom_jnt_comp_avg_pred ssse3/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance128x128 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance128x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance64x128 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance64x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance64x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance32x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance32x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance32x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance16x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance16x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance8x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_variance8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance128x128 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance128x64 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance64x128 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance64x64 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance64x32 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance32x64 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance32x32 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance32x16 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance16x32 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance16x16 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance16x8 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance8x16 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_variance8x8 sse2 avx2/;
+
+ add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance128x128 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance128x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance64x128 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance64x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance64x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance32x64 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance32x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance32x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance16x32 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance16x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance8x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_variance8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+ add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_mse16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_8_mse8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_mse16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_10_mse8x8 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_mse16x16 sse2/;
+
+ add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/aom_highbd_12_mse8x8 sse2/;
+
+ add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+
+ add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
+ specialize qw/aom_highbd_jnt_comp_avg_pred sse2/;
+
+ #
+ # Subpixel Variance
+ #
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+
+
+ add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+ specialize qw/aom_comp_mask_pred ssse3 avx2/;
+
+ add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+ specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+
+} # CONFIG_AV1_ENCODER
+
+1;
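
Note: the add_proto/specialize pairs in the file above declare libaom's variance-family entry points and list the SIMD flavours (sse2, ssse3, sse4_1, avx2, msa) that the run-time CPU dispatcher generated from this file may bind to each of them. As a hedged scalar reference for what an aom_variance{W}x{H} prototype computes (a sketch written for this note, not a copy of aom_dsp/variance.c): the function writes the sum of squared differences through *sse and returns that SSE minus the squared mean of the differences.

  /* Hedged scalar reference for the aom_variance{W}x{H} prototypes above.
   * Writes the sum of squared differences through *sse and returns the
   * block variance (SSE minus the squared mean); variance_wxh is a name
   * made up for this note. */
  #include <stdint.h>

  static unsigned int variance_wxh(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
    int64_t sum = 0;
    uint64_t sq = 0;
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; ++j) {
        const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
        sum += d;
        sq += (uint64_t)(d * d);
      }
    }
    *sse = (unsigned int)sq;
    return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
  }

As their xoffset/yoffset and second_pred parameters suggest, the sub_pixel and avg variants declared above apply the same measure after bilinear interpolation of the reference and/or averaging with a second prediction.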
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
new file mode 100644
index 000000000..00686ac38
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_filter.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_FILTER_H_
+#define AOM_AOM_DSP_AOM_FILTER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+#define SCALE_SUBPEL_BITS 10
+#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
+#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
+#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
+#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
+
+#define RS_SUBPEL_BITS 6
+#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)
+#define RS_SCALE_SUBPEL_BITS 14
+#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
+#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
+#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#define BIL_SUBPEL_BITS 3
+#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
+
+// 2 tap bilinear filters
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_FILTER_H_
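
Note: bilinear_filters_2t above holds one weight pair per 3-bit subpel position (BIL_SUBPEL_SHIFTS == 8), and each pair sums to 1 << FILTER_BITS == 128. A minimal sketch of how such a 2-tap kernel is applied to a pair of neighbouring pixels (assumed usage written for this note, not an aom routine):

  /* Interpolate between neighbouring pixels a and b at a 3-bit subpel
   * position using the 2-tap weights from the table above. */
  #include <stdint.h>

  static uint8_t bilinear_2t_sample(uint8_t a, uint8_t b, int offset /* 0..7 */) {
    static const uint8_t filters[8][2] = {
      { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
      { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
    };
    const int sum = filters[offset][0] * a + filters[offset][1] * b;
    return (uint8_t)((sum + 64) >> 7);  /* round, then drop FILTER_BITS */
  }

At offset 0 the output is the left pixel unchanged; at offset 4 it is the rounded average of the two.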
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
new file mode 100644
index 000000000..ab950ca55
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_simd.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_H_
+#define AOM_AOM_DSP_AOM_SIMD_H_
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_simd_inline.h"
+
+#define SIMD_CHECK 1 // Sanity checks in C equivalents
+
+#if HAVE_NEON
+#include "simd/v256_intrinsics_arm.h"
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#include "simd/v256_intrinsics_x86.h"
+#else
+#include "simd/v256_intrinsics.h"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_SIMD_H_
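
Note: the #if chain above picks the widest usable v256 backend: NEON on ARM, the x86 intrinsics except for 32-bit MSVC builds (which, per the comment, cannot pass vector types in structs as arguments), and the portable C implementation otherwise. A tiny probe mirroring that selection, using only the macros visible above and assuming the same build configuration headers are on the include path:

  /* Illustrative compile-time probe of the v256 backend selection. */
  #include <stdio.h>
  #include "config/aom_config.h"

  int main(void) {
  #if HAVE_NEON
    puts("v256 backend: ARM NEON (simd/v256_intrinsics_arm.h)");
  #elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
    puts("v256 backend: x86 (simd/v256_intrinsics_x86.h)");
  #else
    puts("v256 backend: portable C fallback (simd/v256_intrinsics.h)");
  #endif
    return 0;
  }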
diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h
new file mode 100644
index 000000000..eb333f6f6
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_simd_inline.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+
+#include "aom/aom_integer.h"
+
+#ifndef SIMD_INLINE
+#define SIMD_INLINE static AOM_FORCE_INLINE
+#endif
+
+#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
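
Note: the NEON kernel added in the next file (blend_a64_mask_neon.c) vectorizes the A64 mask blend of two 16-bit convolve buffers into 8-bit output. A scalar sketch of the per-pixel arithmetic as read from blend8x1/blend_8x4 below; MAX_ALPHA and ROUND_BITS mirror AOM_BLEND_A64_MAX_ALPHA and AOM_BLEND_A64_ROUND_BITS from aom_dsp/blend.h, and the function name is made up for this note:

  /* Hedged scalar reading of the NEON blend, not a drop-in aom function. */
  #include <stdint.h>

  #define MAX_ALPHA 64   /* mirrors AOM_BLEND_A64_MAX_ALPHA */
  #define ROUND_BITS 6   /* mirrors AOM_BLEND_A64_ROUND_BITS */

  static uint8_t blend_a64_d16_pixel(int m, int s0, int s1,
                                     int round_offset, int round_bits) {
    /* Weighted average of the two 16-bit convolve outputs (blend8x1). */
    int v = (m * s0 + (MAX_ALPHA - m) * s1) >> ROUND_BITS;
    /* Remove the convolve offset with saturation at zero, then round back
     * to 8-bit range -- the vqsubq/vshlq/vqmovun steps in blend_8x4. */
    v -= round_offset - (1 << (round_bits - 1));
    if (v < 0) v = 0;
    v >>= round_bits;
    return (uint8_t)(v > 255 ? 255 : v);
  }

Here round_offset and round_bits correspond to the values derived from conv_params at the top of aom_lowbd_blend_a64_d16_mask_neon() below.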
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
new file mode 100644
index 000000000..e7f08a5fd
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
+ const int16x8_t v_maxval, int16x8_t *res) {
+ int32x4_t im_res_low, im_res_high;
+ const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+
+ im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
+ im_res_low =
+ vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
+
+ im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
+ im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
+ vget_high_s16(src_1));
+
+ *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
+ vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
+}
+
+static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
+ const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+ const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
+ int16x8_t mask3, const int16x8_t v_maxval,
+ const uint16x8_t vec_round_offset,
+ const int16x8_t vec_round_bits) {
+ int16x8_t src0_0, src0_1, src0_2, src0_3;
+ int16x8_t src1_0, src1_1, src1_2, src1_3;
+ int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
+
+ load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
+ &src0_3);
+ load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
+ &src1_3);
+
+ blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
+ blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
+ blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
+ blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
+
+ uint16x8_t im_res1_0 =
+ vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
+ uint16x8_t im_res1_1 =
+ vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
+ uint16x8_t im_res1_2 =
+ vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
+ uint16x8_t im_res1_3 =
+ vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
+
+ im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
+ im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
+ im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
+ im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
+
+ vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
+ vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
+ vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
+ vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
+}
+
+static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
+ const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+ const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
+ int16x4_t mask3, const int16x8_t v_maxval,
+ const uint16x8_t vec_round_offset,
+ const int16x8_t vec_round_bits) {
+ int16x8_t src0_0, src0_1;
+ int16x8_t src1_0, src1_1;
+ uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
+ tu3 = vdupq_n_u64(0);
+ int16x8_t mask0_1, mask2_3;
+ int16x8_t res0, res1;
+
+ load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
+ load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
+
+ src0_0 = vreinterpretq_s16_u64(tu0);
+ src0_1 = vreinterpretq_s16_u64(tu1);
+
+ src1_0 = vreinterpretq_s16_u64(tu2);
+ src1_1 = vreinterpretq_s16_u64(tu3);
+
+ mask0_1 = vcombine_s16(mask0, mask1);
+ mask2_3 = vcombine_s16(mask2, mask3);
+
+ blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
+ blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
+
+ uint16x8_t im_res_0 =
+ vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
+ uint16x8_t im_res_1 =
+ vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
+
+ src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
+ src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
+
+ uint8x8_t res_0 = vqmovun_s16(src0_0);
+ uint8x8_t res_1 = vqmovun_s16(src0_1);
+
+ vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
+ 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
+ 1);
+ vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
+ 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
+ 1);
+}
+
+void aom_lowbd_blend_a64_d16_mask_neon(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ int i = 0;
+ const int bd = 8;
+ int w_tmp = w;
+ const uint8_t *mask_tmp = mask;
+ const CONV_BUF_TYPE *src0_tmp = src0;
+ const CONV_BUF_TYPE *src1_tmp = src1;
+ uint8_t *dst_tmp = dst;
+
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ uint8x8_t s0, s1, s2, s3;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+ tu3 = vdup_n_u32(0);
+ uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t mask0, mask1, mask2, mask3;
+ int16x8_t mask4, mask5, mask6, mask7;
+ int32x4_t m0_32, m1_32, m2_32, m3_32;
+ int32x4_t m4_32, m5_32, m6_32, m7_32;
+ uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
+ uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
+ int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
+ const uint16x4_t vec_zero = vdup_n_u16(0);
+ const uint16_t offset = round_offset - (1 << (round_bits - 1));
+ const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
+ const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+ const uint16x8_t vec_offset = vdupq_n_u16(offset);
+
+ if (subw == 0 && subh == 0) {
+ if (w_tmp > 7) {
+ do {
+ w_tmp = w;
+ do {
+ load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
+
+ mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
+ mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
+ mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
+ mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
+
+ blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+ src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+ vec_offset, vec_round_bits);
+
+ w_tmp -= 8;
+ mask_tmp += 8;
+ dst_tmp += 8;
+ src0_tmp += 8;
+ src1_tmp += 8;
+ } while (w_tmp > 7);
+ i += 4;
+ mask_tmp += (4 * mask_stride) - w;
+ dst_tmp += (4 * dst_stride) - w;
+ src0_tmp += (4 * src0_stride) - w;
+ src1_tmp += (4 * src1_stride) - w;
+ } while (i < h);
+ } else {
+ do {
+ load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);
+
+ mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+
+ mask0_low = vget_low_s16(mask0);
+ mask1_low = vget_high_s16(mask0);
+ mask2_low = vget_low_s16(mask1);
+ mask3_low = vget_high_s16(mask1);
+
+ blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+ src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+ v_maxval, vec_offset, vec_round_bits);
+
+ i += 4;
+ mask_tmp += (4 * mask_stride);
+ dst_tmp += (4 * dst_stride);
+ src0_tmp += (4 * src0_stride);
+ src1_tmp += (4 * src1_stride);
+ } while (i < h);
+ }
+ } else if (subw == 1 && subh == 1) {
+ if (w_tmp > 7) {
+ do {
+ w_tmp = w;
+ do {
+ load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+
+ mask0 =
+ vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
+ mask1 =
+ vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
+ mask2 =
+ vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
+ mask3 =
+ vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
+
+ mask4 = vreinterpretq_s16_u16(
+ vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
+ mask5 = vreinterpretq_s16_u16(
+ vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
+ mask6 = vreinterpretq_s16_u16(
+ vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
+ mask7 = vreinterpretq_s16_u16(
+ vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
+
+ m0_32 = vpaddlq_s16(mask0);
+ m1_32 = vpaddlq_s16(mask1);
+ m2_32 = vpaddlq_s16(mask2);
+ m3_32 = vpaddlq_s16(mask3);
+
+ m4_32 = vpaddlq_s16(mask4);
+ m5_32 = vpaddlq_s16(mask5);
+ m6_32 = vpaddlq_s16(mask6);
+ m7_32 = vpaddlq_s16(mask7);
+
+ mask0 =
+ vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
+ mask1 =
+ vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
+ mask2 =
+ vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
+ mask3 =
+ vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
+
+ blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+ src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+ vec_offset, vec_round_bits);
+
+ w_tmp -= 8;
+ mask_tmp += 16;
+ dst_tmp += 8;
+ src0_tmp += 8;
+ src1_tmp += 8;
+ } while (w_tmp > 7);
+ i += 4;
+ mask_tmp += (8 * mask_stride) - (2 * w);
+ dst_tmp += (4 * dst_stride) - w;
+ src0_tmp += (4 * src0_stride) - w;
+ src1_tmp += (4 * src1_stride) - w;
+ } while (i < h);
+ } else {
+ do {
+ load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+ &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+
+ mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
+ mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
+ mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
+ mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+
+ m0_32 = vpaddlq_s16(mask0);
+ m1_32 = vpaddlq_s16(mask1);
+ m2_32 = vpaddlq_s16(mask2);
+ m3_32 = vpaddlq_s16(mask3);
+
+ mask0_low = vqrshrn_n_s32(m0_32, 2);
+ mask1_low = vqrshrn_n_s32(m1_32, 2);
+ mask2_low = vqrshrn_n_s32(m2_32, 2);
+ mask3_low = vqrshrn_n_s32(m3_32, 2);
+
+ blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+ src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+ v_maxval, vec_offset, vec_round_bits);
+
+ i += 4;
+ mask_tmp += (8 * mask_stride);
+ dst_tmp += (4 * dst_stride);
+ src0_tmp += (4 * src0_stride);
+ src1_tmp += (4 * src1_stride);
+ } while (i < h);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w_tmp > 7) {
+ do {
+ w_tmp = w;
+ do {
+ load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
+
+ mask0 = vreinterpretq_s16_u16(vcombine_u16(
+ vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
+ mask1 = vreinterpretq_s16_u16(vcombine_u16(
+ vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
+ mask2 = vreinterpretq_s16_u16(vcombine_u16(
+ vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
+ mask3 = vreinterpretq_s16_u16(vcombine_u16(
+ vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
+
+ mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+ mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+ mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
+ mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+
+ blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+ src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+ vec_offset, vec_round_bits);
+ w_tmp -= 8;
+ mask_tmp += 16;
+ dst_tmp += 8;
+ src0_tmp += 8;
+ src1_tmp += 8;
+ } while (w_tmp > 7);
+ i += 4;
+ mask_tmp += (4 * mask_stride) - (2 * w);
+ dst_tmp += (4 * dst_stride) - w;
+ src0_tmp += (4 * src0_stride) - w;
+ src1_tmp += (4 * src1_stride) - w;
+ } while (i < h);
+ } else {
+ do {
+ load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+ &mask3_l);
+
+ mask0 =
+ vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
+ mask1 =
+ vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
+ mask2 =
+ vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
+ mask3 =
+ vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
+
+ mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
+ mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
+ mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
+ mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
+
+ blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+ src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+ v_maxval, vec_offset, vec_round_bits);
+
+ i += 4;
+ mask_tmp += (4 * mask_stride);
+ dst_tmp += (4 * dst_stride);
+ src0_tmp += (4 * src0_stride);
+ src1_tmp += (4 * src1_stride);
+ } while (i < h);
+ }
+ } else {
+ if (w_tmp > 7) {
+ do {
+ w_tmp = w;
+ do {
+ load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
+ &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
+
+ mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
+ mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
+ mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
+ mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
+
+ mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+ mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+ mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
+ mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
+
+ blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+ src1_stride, mask0, mask1, mask2, mask3, v_maxval,
+ vec_offset, vec_round_bits);
+
+ w_tmp -= 8;
+ mask_tmp += 8;
+ dst_tmp += 8;
+ src0_tmp += 8;
+ src1_tmp += 8;
+ } while (w_tmp > 7);
+ i += 4;
+ mask_tmp += (8 * mask_stride) - w;
+ dst_tmp += (4 * dst_stride) - w;
+ src0_tmp += (4 * src0_stride) - w;
+ src1_tmp += (4 * src1_stride) - w;
+ } while (i < h);
+ } else {
+ do {
+ load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
+ load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
+ &tu3);
+
+ s0 = vreinterpret_u8_u32(tu0);
+ s1 = vreinterpret_u8_u32(tu1);
+ s2 = vreinterpret_u8_u32(tu2);
+ s3 = vreinterpret_u8_u32(tu3);
+
+ mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
+ mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
+
+ mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+ mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+
+ mask0_low = vget_low_s16(mask0);
+ mask1_low = vget_high_s16(mask0);
+ mask2_low = vget_low_s16(mask1);
+ mask3_low = vget_high_s16(mask1);
+
+ blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+ src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+ v_maxval, vec_offset, vec_round_bits);
+
+ i += 4;
+ mask_tmp += (8 * mask_stride);
+ dst_tmp += (4 * dst_stride);
+ src0_tmp += (4 * src0_stride);
+ src1_tmp += (4 * src1_stride);
+ } while (i < h);
+ }
+ }
+}
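Note: the subw/subh branches above differ only in how they collapse the (possibly 2x-subsampled) alpha mask to one weight per output pixel before handing it to blend_8x4/blend_4x4; the rounding matches the vqrshrn_n shifts used in each branch. A minimal scalar sketch of that collapsing step (the helper name is ours, not part of the patch):

#include <stdint.h>

static uint8_t collapse_mask_scalar(const uint8_t *mask, int mask_stride,
                                    int subw, int subh) {
  int sum = mask[0];
  if (subw) sum += mask[1];                        // horizontal pair
  if (subh) sum += mask[mask_stride];              // vertical pair
  if (subw && subh) sum += mask[mask_stride + 1];  // full 2x2 block
  const int shift = subw + subh;                   // 0, 1 or 2
  return (uint8_t)((sum + ((1 << shift) >> 1)) >> shift);  // rounding shift
}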
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 000000000..e4300c992
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/txfm_common.h"
+
+void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+ int i;
+ // stage 1
+ int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ for (i = 0; i < 2; ++i) {
+ int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+ const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+ const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+ const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+ const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+ const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+ const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+ const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+ const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // fdct4(step, step);
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ v_x0 = vsubq_s16(v_s6, v_s5);
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);
+ const int16x8_t cd = vcombine_s16(c, d);
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+ // transpose 8x8
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+ const int32x4x2_t r13_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ input_0 = r01_s16.val[0];
+ input_1 = r01_s16.val[1];
+ input_2 = r23_s16.val[0];
+ input_3 = r23_s16.val[1];
+ input_4 = r45_s16.val[0];
+ input_5 = r45_s16.val[1];
+ input_6 = r67_s16.val[0];
+ input_7 = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ } // for
+ {
+ // from aom_dct_sse2.c
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+ input_0 = vhsubq_s16(input_0, sign_in0);
+ input_1 = vhsubq_s16(input_1, sign_in1);
+ input_2 = vhsubq_s16(input_2, sign_in2);
+ input_3 = vhsubq_s16(input_3, sign_in3);
+ input_4 = vhsubq_s16(input_4, sign_in4);
+ input_5 = vhsubq_s16(input_5, sign_in5);
+ input_6 = vhsubq_s16(input_6, sign_in6);
+ input_7 = vhsubq_s16(input_7, sign_in7);
+ // store results
+ vst1q_s16(&final_output[0 * 8], input_0);
+ vst1q_s16(&final_output[1 * 8], input_1);
+ vst1q_s16(&final_output[2 * 8], input_2);
+ vst1q_s16(&final_output[3 * 8], input_3);
+ vst1q_s16(&final_output[4 * 8], input_4);
+ vst1q_s16(&final_output[5 * 8], input_5);
+ vst1q_s16(&final_output[6 * 8], input_6);
+ vst1q_s16(&final_output[7 * 8], input_7);
+ }
+}
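Note: the post-condition block above halves every coefficient with input = vhsubq_s16(input, input >> 15), relying on the identity quoted in its comment. A standalone scalar check of that identity, assuming the usual arithmetic right shift for negative values (this test program is ours, not part of the patch):

#include <assert.h>
#include <stdint.h>

static int16_t halve_like_neon(int16_t n) {
  const int16_t sign = (int16_t)(n >> 15);  // 0 for n >= 0, -1 for n < 0
  return (int16_t)((n - sign) >> 1);        // what vhsubq_s16(n, sign) computes
}

int main(void) {
  for (int32_t n = INT16_MIN; n <= INT16_MAX; ++n) {
    assert(halve_like_neon((int16_t)n) == (int16_t)(n / 2));  // truncating /2
  }
  return 0;
}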
+
+void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+ {
+ const int32x4_t a = vpaddlq_s16(sum);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+ output[1] = 0;
+ }
+}
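Note: aom_fdct8x8_1_neon only produces the DC term; it sums the whole 8x8 block and stores the low 16 bits of that sum in output[0]. A scalar sketch of the same computation (ours, for illustration; the NEON path accumulates the column sums in 16-bit lanes first, so the two agree whenever those partial sums do not wrap):

#include <stdint.h>

static void fdct8x8_1_scalar(const int16_t *input, int16_t *output, int stride) {
  int32_t sum = 0;
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c) sum += input[r * stride + c];
  output[0] = (int16_t)sum;  // DC term only
  output[1] = 0;
}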
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
new file mode 100644
index 000000000..c85b1e910
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x8_t A = vld1_u8(above); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ sum_top = vcombine_u16(p1, p1);
+ }
+
+ if (do_left) {
+ const uint8x8_t L = vld1_u8(left); // left border
+ const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ sum_left = vcombine_u16(p1, p1);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 3);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 2);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 2);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
+ }
+ }
+}
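Note: dc_4x4 averages whichever borders are enabled and broadcasts the result; the vrshrn_n_u16 shifts (by 3 with both borders, by 2 with one) are rounding divisions by the number of border samples. A scalar sketch of the same helper (ours, for reference only):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void dc_4x4_scalar(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
  int sum = 0, count = 0, dc = 0x80;
  if (do_above) { for (int i = 0; i < 4; ++i) sum += above[i]; count += 4; }
  if (do_left)  { for (int i = 0; i < 4; ++i) sum += left[i];  count += 4; }
  if (count) dc = (sum + (count >> 1)) / count;  // rounding average
  for (int r = 0; r < 4; ++r) memset(dst + r * stride, dc, 4);
}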
+
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_4x4(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ dc_4x4(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ dc_4x4(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_4x4(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x8_t A = vld1_u8(above); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_top = vcombine_u16(p2, p2);
+ }
+
+ if (do_left) {
+ const uint8x8_t L = vld1_u8(left); // left border
+ const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_left = vcombine_u16(p2, p2);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 4);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 3);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 3);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
+ }
+ }
+}
+
+void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_8x8(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ dc_8x8(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ dc_8x8(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_8x8(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x16_t A = vld1q_u8(above); // top row
+ const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_top = vcombine_u16(p3, p3);
+ }
+
+ if (do_left) {
+    const uint8x16_t L = vld1q_u8(left);  // left border
+ const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_left = vcombine_u16(p3, p3);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 5);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 4);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 4);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ }
+ }
+}
+
+void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_16x16(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ dc_16x16(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ dc_16x16(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_16x16(dst, stride, NULL, NULL, 0, 0);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+// 'do_above' and 'do_left' facilitate branch removal when inlined.
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int do_above, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_above) {
+ const uint8x16_t A0 = vld1q_u8(above); // top row
+ const uint8x16_t A1 = vld1q_u8(above + 16);
+ const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
+ const uint16x8_t p1 = vpaddlq_u8(A1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ const uint16x4_t p4 = vpadd_u16(p3, p3);
+ const uint16x4_t p5 = vpadd_u16(p4, p4);
+ sum_top = vcombine_u16(p5, p5);
+ }
+
+ if (do_left) {
+    const uint8x16_t L0 = vld1q_u8(left);  // left border
+ const uint8x16_t L1 = vld1q_u8(left + 16);
+ const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
+ const uint16x8_t p1 = vpaddlq_u8(L1);
+ const uint16x8_t p2 = vaddq_u16(p0, p1);
+ const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
+ const uint16x4_t p4 = vpadd_u16(p3, p3);
+ const uint16x4_t p5 = vpadd_u16(p4, p4);
+ sum_left = vcombine_u16(p5, p5);
+ }
+
+ if (do_above && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 6);
+ } else if (do_above) {
+ dc0 = vrshrn_n_u16(sum_top, 5);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 5);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 32; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ vst1q_u8(dst + i * stride + 16, dc);
+ }
+ }
+}
+
+void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_32x32(dst, stride, above, left, 1, 1);
+}
+
+void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ dc_32x32(dst, stride, NULL, left, 0, 1);
+}
+
+void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ dc_32x32(dst, stride, above, NULL, 1, 0);
+}
+
+void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ dc_32x32(dst, stride, NULL, NULL, 0, 0);
+}
+
+// -----------------------------------------------------------------------------
+
+void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
+ const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+ const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint32x2_t zero = vdup_n_u32(0);
+ const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
+ const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
+ const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
+ const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+ const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+ const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+ const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+ const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+ const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
+ vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
+}
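Note: per output sample, the two averaging steps above (vhadd_u8 then vrhadd_u8) smooth three consecutive values taken from the reconstructed neighbour sequence L K J I X A B C D. A scalar sketch of just that arithmetic (ours; the lane shuffling and row-end handling are omitted):

#include <stdint.h>

static uint8_t d135_smooth(uint8_t outer0, uint8_t mid, uint8_t outer1) {
  const uint8_t half = (uint8_t)((outer0 + outer1) >> 1);  // vhadd_u8 truncates
  return (uint8_t)((half + mid + 1) >> 1);                 // vrhadd_u8 rounds
}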
+
+void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint32x2_t d0u32 = vdup_n_u32(0);
+ (void)left;
+
+ d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += stride)
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+}
+
+void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ (void)left;
+
+ d0u8 = vld1_u8(above);
+ for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
+}
+
+void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
+}
+
+void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ q1u8 = vld1q_u8(above + 16);
+ for (i = 0; i < 32; i++, dst += stride) {
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ }
+}
+
+void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d1u32 = vdup_n_u32(0);
+ (void)above;
+
+ d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+}
+
+void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint64x1_t d1u64 = vdup_n_u64(0);
+ (void)above;
+
+ d1u64 = vld1_u64((const uint64_t *)left);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+ vst1_u8(dst, d0u8);
+ dst += stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+ vst1_u8(dst, d0u8);
+}
+
+void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ dst += stride;
+ }
+}
+
+void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ for (k = 0; k < 2; k++, left += 16) {
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += stride;
+ }
+ }
+}
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ const uint16_t *above,
+ const uint16_t *left) {
+ assert(bw >= 4);
+ assert(IS_POWER_OF_TWO(bw));
+ int expected_dc, sum = 0;
+ const int count = bw * 2;
+ uint32x4_t sum_q = vdupq_n_u32(0);
+ uint32x2_t sum_d;
+ uint16_t *dst_1;
+ if (bw >= 8) {
+ for (int i = 0; i < bw; i += 8) {
+ sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
+ sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
+ above += 8;
+ left += 8;
+ }
+ sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+ sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+ expected_dc = (sum + (count >> 1)) / count;
+ const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
+ for (int r = 0; r < bw; r++) {
+ dst_1 = dst;
+ for (int i = 0; i < bw; i += 8) {
+ vst1q_u16(dst_1, dc);
+ dst_1 += 8;
+ }
+ dst += stride;
+ }
+ } else { // 4x4
+ sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
+ sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
+ sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
+ expected_dc = (sum + (count >> 1)) / count;
+ const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
+ for (int r = 0; r < bw; r++) {
+ vst1_u16(dst, dc);
+ dst += stride;
+ }
+ }
+}
+
+#define intra_pred_highbd_sized_neon(type, width) \
+ void aom_highbd_##type##_predictor_##width##x##width##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_##type##_predictor(dst, stride, width, above, left); \
+ }
+
+#define intra_pred_square(type) \
+ intra_pred_highbd_sized_neon(type, 4); \
+ intra_pred_highbd_sized_neon(type, 8); \
+ intra_pred_highbd_sized_neon(type, 16); \
+ intra_pred_highbd_sized_neon(type, 32); \
+ intra_pred_highbd_sized_neon(type, 64);
+
+intra_pred_square(dc);
+#undef intra_pred_square
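Note: intra_pred_square(dc) expands intra_pred_highbd_sized_neon once per listed width; written out, the 4x4 instance is the following thin wrapper (this is what the preprocessor already emits above, not an extra definition to add):

void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)bd;
  highbd_dc_predictor(dst, stride, 4, above, left);
}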
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
new file mode 100644
index 000000000..bdc67626d
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,928 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0, const uint8_t blimit,
+ const uint8_t limit) {
+ // Calculate mask values for four samples
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ uint8x8_t mask_8x8, temp_8x8;
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+ mask_8x8 = vabd_u8(p3q3, p2q2);
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1));
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
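Note: lpf_mask packs the p-side and q-side of four columns into one vector, so the per-byte maxima plus the reversed AND implement the usual filter-mask test per column. A scalar form of that decision (ours, for reference; the NEON code evaluates four columns at once):

#include <stdint.h>
#include <stdlib.h>

static int lpf_mask_scalar(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                           uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                           uint8_t blimit, uint8_t limit) {
  int m = abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
          abs(p1 - p0) <= limit && abs(q3 - q2) <= limit &&
          abs(q2 - q1) <= limit && abs(q1 - q0) <= limit;
  m &= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
  return m;  // nonzero: the edge is smooth enough to filter this column
}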
+
+static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+ const uint8_t blimit, const uint8_t limit) {
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ uint8x8_t mask_8x8, temp_8x8;
+
+ mask_8x8 = vabd_u8(p1q1, p0q0);
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
+ uint8x8_t p1q1, uint8x8_t p0q0) {
+ const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
+ uint8x8_t flat_8x8, temp_8x8;
+
+ flat_8x8 = vabd_u8(p1q1, p0q0);
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0));
+ flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+ flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+ return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0) {
+ const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
+ uint8x8_t flat_8x8, temp_8x8;
+
+ flat_8x8 = vabd_u8(p1q1, p0q0);
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+ flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+ flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+ return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0, const uint8_t blimit,
+ const uint8_t limit) {
+ // Calculate mask3 values for four samples
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ uint8x8_t mask_8x8, temp_8x8;
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+ mask_8x8 = vabd_u8(p2q2, p1q1);
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
+static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
+ uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+ uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
+ out_f14_pq5;
+ uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
+ uint8x8_t q0p0, q1p1, q2p2;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+ flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ hev_8x8 = vmvn_s8(hev_8x8);
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ // reverse p and q
+ q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+ {
+ // filter 8
+ uint16x8_t out_pq0, out_pq1, out_pq2;
+ out = vaddl_u8(*p3q3, *p2q2);
+ out = vaddw_u8(out, *p1q1);
+ out = vaddw_u8(out, *p0q0);
+
+ out = vaddw_u8(out, q0p0);
+ out_pq1 = vaddw_u8(out, *p3q3);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+ out_pq2 = vaddw_u8(out_pq2, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p1q1);
+ out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+ out_pq0 = vaddw_u8(out, *p0q0);
+ out_pq0 = vaddw_u8(out_pq0, q1p1);
+ out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+ out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+ out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+ }
+ {
+ // filter 14
+ uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
+ uint16x8_t p6q6_2, p6q6_temp, qp_sum;
+ uint8x8_t qp_rev;
+
+ out = vaddw_u8(out, *p4q4);
+ out = vaddw_u8(out, *p5q5);
+ out = vaddw_u8(out, *p6q6);
+
+ out_pq5 = vaddw_u8(out, *p4q4);
+ out_pq4 = vaddw_u8(out_pq5, *p3q3);
+ out_pq3 = vaddw_u8(out_pq4, *p2q2);
+
+ out_pq5 = vaddw_u8(out_pq5, *p5q5);
+ out_pq4 = vaddw_u8(out_pq4, *p5q5);
+
+ out_pq0 = vaddw_u8(out, *p1q1);
+ out_pq1 = vaddw_u8(out_pq0, *p2q2);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+
+ out_pq0 = vaddw_u8(out_pq0, *p0q0);
+ out_pq1 = vaddw_u8(out_pq1, *p0q0);
+
+ out_pq1 = vaddw_u8(out_pq1, *p6q6);
+ p6q6_2 = vaddl_u8(*p6q6, *p6q6);
+ out_pq2 = vaddq_u16(out_pq2, p6q6_2);
+ p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
+ out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
+ p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
+ out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
+ p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
+ out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
+
+ out_pq4 = vaddw_u8(out_pq4, q1p1);
+
+ qp_sum = vaddl_u8(q2p2, q1p1);
+ out_pq3 = vaddq_u16(out_pq3, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq2 = vaddq_u16(out_pq2, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq1 = vaddq_u16(out_pq1, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq0 = vaddq_u16(out_pq0, qp_sum);
+
+ out_pq0 = vaddw_u8(out_pq0, q0p0);
+
+ out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
+ out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
+ out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
+ out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
+ out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
+ out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
+ }
+ {
+ uint8x8_t filter4_cond, filter8_cond, filter14_cond;
+ filter8_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter8_cond);
+ filter14_cond = vand_u8(filter8_cond, flat2_8x8);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter8 outputs
+ *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+
+ // filter14 outputs
+ *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
+ *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
+ *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
+ *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
+ }
+}
+
+static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+ uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ hev_8x8 = vmvn_s8(hev_8x8);
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ {
+ // filter 8
+ uint16x8_t out_pq0, out_pq1, out_pq2;
+ uint8x8_t q0p0, q1p1, q2p2;
+
+ out = vaddl_u8(*p3q3, *p2q2);
+ out = vaddw_u8(out, *p1q1);
+ out = vaddw_u8(out, *p0q0);
+
+ // reverse p and q
+ q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+
+ out = vaddw_u8(out, q0p0);
+ out_pq1 = vaddw_u8(out, *p3q3);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+ out_pq2 = vaddw_u8(out_pq2, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p1q1);
+ out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+ out_pq0 = vaddw_u8(out, *p0q0);
+ out_pq0 = vaddw_u8(out_pq0, q1p1);
+ out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+ out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+ out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+ }
+ {
+ uint8x8_t filter4_cond, filter8_cond;
+ filter8_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter8_cond);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter8 outputs
+ *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+ }
+}
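Note: on the p side the filter-8 sums built above reduce to three 3-bit rounded taps (the q side is symmetric, via the reversed q0p0/q1p1/q2p2 vectors). A scalar sketch of those outputs (ours, for reference):

#include <stdint.h>

#define ROUND3(x) (((x) + 4) >> 3)  // matches vrshrn_n_u16(x, 3)

static void filter8_p_scalar(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                             uint8_t q0, uint8_t q1, uint8_t q2,
                             uint8_t *op0, uint8_t *op1, uint8_t *op2) {
  *op2 = (uint8_t)ROUND3(3 * p3 + 2 * p2 + p1 + p0 + q0);
  *op1 = (uint8_t)ROUND3(2 * p3 + p2 + 2 * p1 + p0 + q0 + q1);
  *op0 = (uint8_t)ROUND3(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2);
}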
+
+static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
+ const uint8_t blimit, const uint8_t limit,
+ const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f6_pq0, out_f6_pq1;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ {
+ // filter 6
+ uint16x8_t out_pq0, out_pq1;
+ uint8x8_t pq_rev;
+
+ out = vaddl_u8(*p0q0, *p1q1);
+ out = vaddq_u16(out, out);
+ out = vaddw_u8(out, *p2q2);
+
+ pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ out = vaddw_u8(out, pq_rev);
+
+ out_pq0 = vaddw_u8(out, pq_rev);
+ pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ out_pq0 = vaddw_u8(out_pq0, pq_rev);
+
+ out_pq1 = vaddw_u8(out, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p2q2);
+
+ out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
+ }
+ {
+ uint8x8_t filter4_cond, filter6_cond;
+ filter6_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter6_cond);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter6 outputs
+ *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
+ }
+}
+
+static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ // Calculate filter mask
+ mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+}
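Note: lpf_4_neon is the narrow filter on its own; the same filter-4 arithmetic is embedded in lpf_14_neon, lpf_8_neon and lpf_6_neon above. A per-column scalar sketch that mirrors the saturating int8 steps (ours; ps*/qs* are the pixels with 0x80 subtracted, mask comes from lpf_mask2, and hev means |p1 - p0| > thresh or |q1 - q0| > thresh):

#include <stdint.h>

static int8_t clamp_s8(int v) {
  return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static void filter4_scalar(int8_t *ps1, int8_t *ps0, int8_t *qs0, int8_t *qs1,
                           int mask, int hev) {
  int8_t f = (int8_t)(clamp_s8(*ps1 - *qs1) & (hev ? -1 : 0));   // outer tap, hev only
  const int8_t d = clamp_s8(*qs0 - *ps0);                        // vqsub_s8
  f = (int8_t)(clamp_s8(f + 3 * d) & (mask ? -1 : 0));           // inner taps
  const int8_t f1 = (int8_t)(clamp_s8(f + 4) >> 3);
  const int8_t f2 = (int8_t)(clamp_s8(f + 3) >> 3);
  *qs0 = clamp_s8(*qs0 - f1);
  *ps0 = clamp_s8(*ps0 + f2);
  const int8_t f3 = (int8_t)(((f1 + 1) >> 1) & (hev ? 0 : -1));  // vrshr_n + vbic
  *qs1 = clamp_s8(*qs1 - f3);
  *ps1 = clamp_s8(*ps1 + f3);
}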
+
+void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x16_t row0, row1, row2, row3;
+ uint8x8_t pxp3, p6p2, p5p1, p4p0;
+ uint8x8_t q0q4, q1q5, q2q6, q3qy;
+ uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3;
+ uint32x2_t pq_rev;
+ uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
+
+ // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3);
+
+ pxp3 = vget_low_u8(row0);
+ p6p2 = vget_low_u8(row1);
+ p5p1 = vget_low_u8(row2);
+ p4p0 = vget_low_u8(row3);
+ transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+ q0q4 = vget_high_u8(row0);
+ q1q5 = vget_high_u8(row1);
+ q2q6 = vget_high_u8(row2);
+ q3qy = vget_high_u8(row3);
+ transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
+ pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5));
+ p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4));
+ p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6));
+ p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+ p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+ p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+ p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+ p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+ p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+ p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+
+ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+ *thresh);
+
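+ // Re-pack the filtered pN/qN vectors and transpose back to row order
+ // before storing.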
+ pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3));
+ p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1));
+ p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0));
+ p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2));
+
+ pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]);
+ p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]);
+ p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]);
+ p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]);
+
+ q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+ q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+ q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+ q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+ transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+ pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
+ p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+ p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+ p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+ transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+ row0 = vcombine_u8(pxp3, q0q4);
+ row1 = vcombine_u8(p6p2, q1q5);
+ row2 = vcombine_u8(p5p1, q2q6);
+ row3 = vcombine_u8(p4p0, q3qy);
+
+ store_u8_8x16(src - 8, stride, row0, row1, row2, row3);
+}
+
+void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t p3q0, p2q1, p1q2, p0q3;
+ uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
+ // row0: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row1: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row2: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row3: p3 p2 p1 p0 | q0 q1 q2 q3
+ load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
+
+ transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
+ p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+
+ lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+ transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+ store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
+}
+
+void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t pxq0, p2q1, p1q2, p0qy;
+ uint8x8_t p0q0, p1q1, p2q2, pxqy;
+
+ // row0: px p2 p1 p0 | q0 q1 q2 qy
+ // row1: px p2 p1 p0 | q0 q1 q2 qy
+ // row2: px p2 p1 p0 | q0 q1 q2 qy
+ // row3: px p2 p1 p0 | q0 q1 q2 qy
+ load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
+
+ transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+
+ lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+ transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+ store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
+}
+
+void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
+ uint32x2_t pq_rev;
+ uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), q0q1, p0q0, p1q1;
+
+ // row0: p1 p0 | q0 q1
+ // row1: p1 p0 | q0 q1
+ // row2: p1 p0 | q0 q1
+ // row3: p1 p0 | q0 q1
+ load_u8_4x1(src - 2, &p1p0, 0);
+ load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1);
+ load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0);
+ load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1);
+
+ transpose_u8_4x4(&p1p0, &q0q1);
+
+ p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
+
+ pq_rev = vrev64_u32(p1q0_p0q1.val[1]);
+ p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
+
+ p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
+ p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
+
+ p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
+ q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
+
+ transpose_u8_4x4(&p1p0, &q0q1);
+
+ store_u8_4x1(src - 2, p1p0, 0);
+ store_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
+ store_u8_4x1((src - 2) + 2 * stride, p1p0, 1);
+ store_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
+}
+
+void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6);
+
+ load_u8_4x1(src - 7 * stride, &p6q6, 0);
+ load_u8_4x1(src - 6 * stride, &p5q5, 0);
+ load_u8_4x1(src - 5 * stride, &p4q4, 0);
+ load_u8_4x1(src - 4 * stride, &p3q3, 0);
+ load_u8_4x1(src - 3 * stride, &p2q2, 0);
+ load_u8_4x1(src - 2 * stride, &p1q1, 0);
+ load_u8_4x1(src - 1 * stride, &p0q0, 0);
+ load_u8_4x1(src + 0 * stride, &p0q0, 1);
+ load_u8_4x1(src + 1 * stride, &p1q1, 1);
+ load_u8_4x1(src + 2 * stride, &p2q2, 1);
+ load_u8_4x1(src + 3 * stride, &p3q3, 1);
+ load_u8_4x1(src + 4 * stride, &p4q4, 1);
+ load_u8_4x1(src + 5 * stride, &p5q5, 1);
+ load_u8_4x1(src + 6 * stride, &p6q6, 1);
+
+ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+ *thresh);
+
+ store_u8_4x1(src - 6 * stride, p5q5, 0);
+ store_u8_4x1(src - 5 * stride, p4q4, 0);
+ store_u8_4x1(src - 4 * stride, p3q3, 0);
+ store_u8_4x1(src - 3 * stride, p2q2, 0);
+ store_u8_4x1(src - 2 * stride, p1q1, 0);
+ store_u8_4x1(src - 1 * stride, p0q0, 0);
+ store_u8_4x1(src + 0 * stride, p0q0, 1);
+ store_u8_4x1(src + 1 * stride, p1q1, 1);
+ store_u8_4x1(src + 2 * stride, p2q2, 1);
+ store_u8_4x1(src + 3 * stride, p3q3, 1);
+ store_u8_4x1(src + 4 * stride, p4q4, 1);
+ store_u8_4x1(src + 5 * stride, p5q5, 1);
+}
+
+void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
+ p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride)));
+ p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+ p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+ vreinterpret_u32_u8(p0q0), 1));
+ p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+ vreinterpret_u32_u8(p1q1), 1));
+ p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+ vreinterpret_u32_u8(p2q2), 1));
+ p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride),
+ vreinterpret_u32_u8(p3q3), 1));
+
+ lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0);
+ vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+ vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+ vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+ vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+ vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+ vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
+ vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1);
+}
+
+void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1, p2q2;
+
+ p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+ p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+ vreinterpret_u32_u8(p0q0), 1));
+ p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+ vreinterpret_u32_u8(p1q1), 1));
+ p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+ vreinterpret_u32_u8(p2q2), 1));
+
+ lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+ vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+ vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+ vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+ vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+ vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
+}
+
+void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1);
+
+ load_u8_4x1(src - 2 * stride, &p1q1, 0);
+ load_u8_4x1(src - 1 * stride, &p0q0, 0);
+ load_u8_4x1(src + 0 * stride, &p0q0, 1);
+ load_u8_4x1(src + 1 * stride, &p1q1, 1);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ store_u8_4x1(src - 2 * stride, p1q1, 0);
+ store_u8_4x1(src - 1 * stride, p0q0, 0);
+ store_u8_4x1(src + 0 * stride, p0q0, 1);
+ store_u8_4x1(src + 1 * stride, p1q1, 1);
+}
diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c
new file mode 100644
index 000000000..606950ab2
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad4d_neon.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
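+// Reduce two uint16x8 partial SAD accumulators to a single 32-bit total.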
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8x16_t vec_src_32,
+ const uint8x16_t vec_src_48, const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+ const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+ vget_low_u8(vec_ref_32));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+ vget_high_u8(vec_ref_32));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+ vget_low_u8(vec_ref_48));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+ vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16, const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+}
+
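+// Compute the SADs of one 64x64 source block against four reference blocks
+// in a single pass over the source.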
+void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 64; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+ const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 32; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+ sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
+ &vec_sum_ref0_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
+ &vec_sum_ref1_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
+ &vec_sum_ref2_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
+ &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t vec_src = vld1q_u8(src);
+ const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+ const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+ const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+ const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+ vec_sum_ref0_lo =
+ vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
+ vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref0));
+ vec_sum_ref1_lo =
+ vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
+ vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref1));
+ vec_sum_ref2_lo =
+ vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
+ vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref2));
+ vec_sum_ref3_lo =
+ vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
+ vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref3));
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
new file mode 100644
index 000000000..a39de91d6
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_neon.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 15; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x2_t d1;
+ uint64x1_t d3;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 3; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ d1 = vpaddl_u16(vget_low_u16(q12));
+ d3 = vpaddl_u32(d1);
+
+ return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
+ unsigned char *ref_ptr, int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+ for (i = 0; i < 7; i++) {
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
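+// Sum all eight 16-bit lanes of the accumulator into a single 32-bit value.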
+static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
+ const uint32x4_t a = vpaddlq_u16(vec_16x8);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+ uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+ for (i = 0; i < 64; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+ const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+ const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
+ vget_low_u8(vec_ref_32));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
+ vget_high_u8(vec_ref_32));
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
+ vget_low_u8(vec_ref_48));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
+ vget_high_u8(vec_ref_48));
+ }
+ return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
+}
+
+unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+ uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+
+ for (i = 0; i < 32; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ }
+ return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+}
+
+unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum_lo = vdupq_n_u16(0);
+ uint16x8_t vec_accum_hi = vdupq_n_u16(0);
+
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t vec_src = vld1q_u8(src);
+ const uint8x16_t vec_ref = vld1q_u8(ref);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum_lo =
+ vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
+ vec_accum_hi =
+ vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
+ }
+ return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
+}
+
+unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ int i;
+ uint16x8_t vec_accum = vdupq_n_u16(0);
+
+ for (i = 0; i < 8; ++i) {
+ const uint8x8_t vec_src = vld1_u8(src);
+ const uint8x8_t vec_ref = vld1_u8(ref);
+ src += src_stride;
+ ref += ref_stride;
+ vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
+ }
+ return horizontal_add_16x8(vec_accum);
+}
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 000000000..cf618eee7
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/variance.h"
+
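+// Apply a 2-tap bilinear filter along one dimension of an 8-pixel-wide block,
+// rounding by FILTER_BITS, and write the intermediate rows to output_ptr.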
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vmov_n_u8(filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
+ const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(&output_ptr[0], out);
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vmov_n_u8(filter[0]);
+ const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 16) {
+ const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
+ const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
+ const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+ const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+ const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+ const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+ const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+ const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+ vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
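+// Sub-pixel variance: bilinearly filter horizontally into fdata3, then
+// vertically into temp2, and run the full-pel variance kernel on the result.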
+unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
+
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
+ bilinear_filters_2t[xoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
+ bilinear_filters_2t[yoffset]);
+ return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
+ bilinear_filters_2t[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
+ bilinear_filters_2t[yoffset]);
+ return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
+ bilinear_filters_2t[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
+ bilinear_filters_2t[yoffset]);
+ return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
+ int src_stride, int xoffset,
+ int yoffset, const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
+ bilinear_filters_2t[xoffset]);
+ var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
+ bilinear_filters_2t[yoffset]);
+ return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+}
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
new file mode 100644
index 000000000..28f5ace8e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subtract_neon.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ int r, c;
+
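+ // Pick a path by block width: 32 bytes per iteration for blocks wider than
+ // 16 columns, one 16- or 8-byte load per row for 16- and 8-wide blocks, and
+ // a scalar loop for narrower blocks.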
+ if (cols > 16) {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; c += 32) {
+ const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t v_diff_lo_00 =
+ vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
+ const uint16x8_t v_diff_hi_00 =
+ vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
+ const uint16x8_t v_diff_lo_16 =
+ vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
+ const uint16x8_t v_diff_hi_16 =
+ vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+ }
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 8) {
+ for (r = 0; r < rows; ++r) {
+ const uint8x16_t v_src = vld1q_u8(&src[0]);
+ const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+ const uint16x8_t v_diff_lo =
+ vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
+ const uint16x8_t v_diff_hi =
+ vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 4) {
+ for (r = 0; r < rows; ++r) {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint8x8_t v_pred = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
new file mode 100644
index 000000000..74385a601
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+// w * h must be less than 2048 or local variable v_sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse,
+ int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo =
+ vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
+ v_sse_hi =
+ vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, unsigned int *sse, int *sum) {
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+}
+
+void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, unsigned int *sse, int *sum) {
+ variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
+}
+
+unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
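+ // variance = sse - sum^2 / 64 for an 8x8 block.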
+ return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+ return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8);
+}
+
+unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
+}
+
+unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+ variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
+ 32, 32, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
+}
+
+unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
+}
+
+unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
+ b_stride, 64, 16, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+ return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
+}
+
+unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride, unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 4; i++) {
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+ q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+ q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride, unsigned int *sse) {
+ int i;
+ uint8x8_t d0u8, d2u8, d4u8, d6u8;
+ int16x4_t d22s16, d23s16, d24s16, d25s16;
+ uint32x2_t d0u32, d10u32;
+ int64x1_t d0s64, d1s64;
+ uint16x8_t q11u16, q12u16;
+ int32x4_t q8s32, q9s32, q10s32;
+ int64x2_t q0s64, q1s64, q5s64;
+
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) {
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ __builtin_prefetch(src_ptr);
+
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ __builtin_prefetch(ref_ptr);
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d2u8, d6u8);
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+ q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+ q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+ }
+
+ q10s32 = vaddq_s32(q10s32, q9s32);
+ q0s64 = vpaddlq_s32(q8s32);
+ q1s64 = vpaddlq_s32(q10s32);
+
+ d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+ d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+ d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+ d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+ return vget_lane_u32(d0u32, 0);
+}
+
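+// Accumulate the sum of squared differences for a 16x16 block; *sse and the
+// return value hold the same total (no mean removal).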
+unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
+ const unsigned char *ref_ptr, int recon_stride,
+ unsigned int *sse) {
+ int i;
+ int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+ int64x1_t d0s64;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ int32x4_t q7s32, q8s32, q9s32, q10s32;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int64x2_t q1s64;
+
+ q7s32 = vdupq_n_s32(0);
+ q8s32 = vdupq_n_s32(0);
+ q9s32 = vdupq_n_s32(0);
+ q10s32 = vdupq_n_s32(0);
+
+ for (i = 0; i < 8; i++) { // mse16x16_neon_loop
+ q0u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q1u8 = vld1q_u8(src_ptr);
+ src_ptr += source_stride;
+ q2u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ q3u8 = vld1q_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+ q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+ q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+ q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+ q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+ q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+ q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+ q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+ d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+ d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+ q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+ q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+ }
+
+ q7s32 = vaddq_s32(q7s32, q8s32);
+ q9s32 = vaddq_s32(q9s32, q10s32);
+ q10s32 = vaddq_s32(q7s32, q9s32);
+
+ q1s64 = vpaddlq_s32(q10s32);
+ d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+ return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
+
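+// Sum of squared differences of a 4x4 block; no mean is removed.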
+unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride) {
+ int16x4_t d22s16, d24s16, d26s16, d28s16;
+ int64x1_t d0s64;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ int32x4_t q7s32, q8s32, q9s32, q10s32;
+ uint16x8_t q11u16, q12u16, q13u16, q14u16;
+ int64x2_t q1s64;
+
+ d0u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d4u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d1u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d5u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d2u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d6u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+ d3u8 = vld1_u8(src_ptr);
+ src_ptr += source_stride;
+ d7u8 = vld1_u8(ref_ptr);
+ ref_ptr += recon_stride;
+
+ q11u16 = vsubl_u8(d0u8, d4u8);
+ q12u16 = vsubl_u8(d1u8, d5u8);
+ q13u16 = vsubl_u8(d2u8, d6u8);
+ q14u16 = vsubl_u8(d3u8, d7u8);
+
+ d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+ d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+ d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+ d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+ q7s32 = vmull_s16(d22s16, d22s16);
+ q8s32 = vmull_s16(d24s16, d24s16);
+ q9s32 = vmull_s16(d26s16, d26s16);
+ q10s32 = vmull_s16(d28s16, d28s16);
+
+ q7s32 = vaddq_s32(q7s32, q8s32);
+ q9s32 = vaddq_s32(q9s32, q10s32);
+ q9s32 = vaddq_s32(q7s32, q9s32);
+
+ q1s64 = vpaddlq_s32(q9s32);
+ d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+ return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
new file mode 100644
index 000000000..01088010a
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/binary_codes_reader.h"
+
+#include "av1/common/common.h"
+
+// Inverse recenters a non-negative literal v around a reference r
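+// (even codes map above r, odd codes below; values beyond 2r are unchanged)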
+static uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
+ if (v > (r << 1))
+ return v;
+ else if ((v & 1) == 0)
+ return (v >> 1) + r;
+ else
+ return r - ((v + 1) >> 1);
+}
+
+// Inverse recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
+ if ((r << 1) <= n) {
+ return inv_recenter_nonneg(r, v);
+ } else {
+ return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
+ }
+}
+
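+// Decodes a value in [0, n-1] that was coded quasi-uniformly: the first
+// m = 2^l - n values use l - 1 bits, the remaining values use l bits.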
+uint16_t aom_read_primitive_quniform_(aom_reader *r,
+ uint16_t n ACCT_STR_PARAM) {
+ if (n <= 1) return 0;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
+ return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
+}
+
+static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
+ uint16_t n) {
+ if (n <= 1) return 0;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ const int v = aom_rb_read_literal(rb, l - 1);
+ return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
+}
+
+// Decodes a finite subexponential code for a symbol v in [0, n-1] with
+// parameter k.
+uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
+ uint16_t k ACCT_STR_PARAM) {
+ int i = 0;
+ int mk = 0;
+
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+
+ if (n <= mk + 3 * a) {
+ return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
+ }
+
+ if (!aom_read_bit(r, ACCT_STR_NAME)) {
+ return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
+ }
+
+ i = i + 1;
+ mk += a;
+ }
+
+ assert(0);
+ return 0;
+}
+
+static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
+ uint16_t n, uint16_t k) {
+ int i = 0;
+ int mk = 0;
+
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+
+ if (n <= mk + 3 * a) {
+ return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
+ }
+
+ if (!aom_rb_read_bit(rb)) {
+ return aom_rb_read_literal(rb, b) + mk;
+ }
+
+ i = i + 1;
+ mk += a;
+ }
+
+ assert(0);
+ return 0;
+}
+
+uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
+ uint16_t ref ACCT_STR_PARAM) {
+ return inv_recenter_finite_nonneg(
+ n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME));
+}
+
+static uint16_t aom_rb_read_primitive_refsubexpfin(
+ struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
+ return inv_recenter_finite_nonneg(n, ref,
+ aom_rb_read_primitive_subexpfin(rb, n, k));
+}
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+ struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
+ ref += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
new file mode 100644
index 000000000..364a67469
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_
+#define AOM_AOM_DSP_BINARY_CODES_READER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+
+#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
+ aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
+ aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
+ aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
+
+uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);
+uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
+ uint16_t k ACCT_STR_PARAM);
+uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
+ uint16_t ref ACCT_STR_PARAM);
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+ struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
new file mode 100644
index 000000000..ee7a9f567
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/binary_codes_writer.h"
+
+#include "av1/common/common.h"
+
+// Recenters a non-negative literal v around a reference r
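+// (values at distance d above r map to 2d, below r to 2d - 1; values beyond
+// 2r are left unchanged)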
+static uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
+ if (v > (r << 1))
+ return v;
+ else if (v >= r)
+ return ((v - r) << 1);
+ else
+ return ((r - v) << 1) - 1;
+}
+
+// Recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
+ if ((r << 1) <= n) {
+ return recenter_nonneg(r, v);
+ } else {
+ return recenter_nonneg(n - 1 - r, n - 1 - v);
+ }
+}
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+ unsigned int abs_bits) {
+ if (v == 0) {
+ aom_write_bit(w, 0);
+ } else {
+ const int x = abs(v);
+ const int s = v < 0;
+ aom_write_bit(w, 1);
+ aom_write_bit(w, s);
+ aom_write_literal(w, x - 1, abs_bits);
+ }
+}
+
+int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
+ return (v == 0 ? 1 : abs_bits + 2);
+}
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
+ if (n <= 1) return;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_bit(w, (v - m) & 1);
+ }
+}
+
+static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t v) {
+ if (n <= 1) return;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ if (v < m) {
+ aom_wb_write_literal(wb, v, l - 1);
+ } else {
+ aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
+ aom_wb_write_bit(wb, (v - m) & 1);
+ }
+}
+
+int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
+ if (n <= 1) return 0;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ return v < m ? l - 1 : l;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t v) {
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ aom_write_primitive_quniform(w, n - mk, v - mk);
+ break;
+ } else {
+ int t = (v >= mk + a);
+ aom_write_bit(w, t);
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ aom_write_literal(w, v - mk, b);
+ break;
+ }
+ }
+ }
+}
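+// Worked example (illustration only): with n = 16 and k = 1, v = 1 is coded
+// as a 0 prefix bit plus a 1-bit literal (2 bits total), while v = 5 takes
+// two 1 prefix bits before falling back to
+// aom_write_primitive_quniform(w, 12, 1), for 5 bits in total.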
+
+static void aom_wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k,
+ uint16_t v) {
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ aom_wb_write_primitive_quniform(wb, n - mk, v - mk);
+ break;
+ } else {
+ int t = (v >= mk + a);
+ aom_wb_write_bit(wb, t);
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ aom_wb_write_literal(wb, v - mk, b);
+ break;
+ }
+ }
+ }
+}
+
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
+ int count = 0;
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ count += aom_count_primitive_quniform(n - mk, v - mk);
+ break;
+ } else {
+ int t = (v >= mk + a);
+ count++;
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ count += b;
+ break;
+ }
+ }
+ }
+ return count;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+// Recenters symbol around r first and then uses a finite subexponential code.
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t ref, uint16_t v) {
+ aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+static void aom_wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k,
+ uint16_t ref, uint16_t v) {
+ aom_wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+ uint16_t k, int16_t ref,
+ int16_t v) {
+ ref += n - 1;
+ v += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
+}
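+// Worked example (illustration only): for n = 4 the signed range [-3, 3] is
+// shifted onto [0, 6] with scaled_n = 7; e.g. v = -2, ref = 1 become
+// v = 1, ref = 4 before aom_write_primitive_refsubexpfin() is called.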
+
+void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k,
+ int16_t ref, int16_t v) {
+ ref += n - 1;
+ v += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ aom_wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v);
+}
+
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+ uint16_t v) {
+ return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+ int16_t v) {
+ ref += n - 1;
+ v += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
new file mode 100644
index 000000000..c360e0e29
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/bitwriter_buffer.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+ unsigned int mag_bits);
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t ref, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with
+// parameter k based on a reference ref also in [-(n-1), n-1].
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+ uint16_t k, int16_t ref,
+ int16_t v);
+
+void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k,
+ int16_t ref, int16_t v);
+
+// Functions that count bits for the above primitives
+int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
+int aom_count_primitive_quniform(uint16_t n, uint16_t v);
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+ uint16_t v);
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+ int16_t v);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
new file mode 100644
index 000000000..7c0efcc78
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_H_
+#define AOM_AOM_DSP_BITREADER_H_
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomdx.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/daalaboolreader.h"
+#include "aom_dsp/prob.h"
+#include "av1/common/odintrin.h"
+
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#define ACCT_STR_NAME acct_str
+#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
+#define ACCT_STR_ARG(s) , s
+#else
+#define ACCT_STR_PARAM
+#define ACCT_STR_ARG(s)
+#endif
+
+#define aom_read(r, prob, ACCT_STR_NAME) \
+ aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_bit(r, ACCT_STR_NAME) \
+ aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
+ aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_literal(r, bits, ACCT_STR_NAME) \
+ aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
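+// Illustrative macro expansion (assuming a caller-chosen accounting string
+// "partition"): with CONFIG_ACCOUNTING enabled,
+// aom_read_bit(r, "partition") expands to aom_read_bit_(r, "partition");
+// without it the string argument is dropped and the call becomes
+// aom_read_bit_(r).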
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct daala_reader aom_reader;
+
+static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
+ size_t size) {
+ return aom_daala_reader_init(r, buffer, (int)size);
+}
+
+static INLINE const uint8_t *aom_reader_find_begin(aom_reader *r) {
+ return aom_daala_reader_find_begin(r);
+}
+
+static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
+ return aom_daala_reader_find_end(r);
+}
+
+static INLINE int aom_reader_has_error(aom_reader *r) {
+ return aom_daala_reader_has_error(r);
+}
+
+// Returns true if the bit reader has tried to decode more data from the buffer
+// than was actually provided.
+static INLINE int aom_reader_has_overflowed(const aom_reader *r) {
+ return aom_daala_reader_has_overflowed(r);
+}
+
+// Returns the position in the bit reader in bits.
+static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
+ return aom_daala_reader_tell(r);
+}
+
+// Returns the position in the bit reader in 1/8th bits.
+static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
+ return aom_daala_reader_tell_frac(r);
+}
+
+#if CONFIG_ACCOUNTING
+static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
+ if (r->accounting != NULL) {
+ uint32_t tell_frac;
+ tell_frac = aom_reader_tell_frac(r);
+ aom_accounting_record(r->accounting, ACCT_STR_NAME,
+ tell_frac - r->accounting->last_tell_frac);
+ r->accounting->last_tell_frac = tell_frac;
+ }
+}
+
+static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) {
+ if (r->accounting != NULL) {
+ r->accounting->syms.num_multi_syms += !is_binary;
+ r->accounting->syms.num_binary_syms += !!is_binary;
+ }
+}
+#endif
+
+static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
+ int ret;
+ ret = aom_daala_read(r, prob);
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+ aom_update_symb_counts(r, 1);
+#endif
+ return ret;
+}
+
+static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
+ int ret;
+ ret = aom_read(r, 128, NULL); // aom_prob_half
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+ return ret;
+}
+
+static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
+ int literal = 0, bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+ return literal;
+}
+
+static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
+ int nsymbs ACCT_STR_PARAM) {
+ int ret;
+ ret = daala_read_symbol(r, cdf, nsymbs);
+
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+ aom_update_symb_counts(r, (nsymbs == 2));
+#endif
+ return ret;
+}
+
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
+ int nsymbs ACCT_STR_PARAM) {
+ int ret;
+ ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
+ if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
+ return ret;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITREADER_H_
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
new file mode 100644
index 000000000..b53211784
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
+ return (rb->bit_offset + 7) >> 3;
+}
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
+ const uint32_t off = rb->bit_offset;
+ const uint32_t p = off >> 3;
+ const int q = 7 - (int)(off & 0x7);
+ if (rb->bit_buffer + p < rb->bit_buffer_end) {
+ const int bit = (rb->bit_buffer[p] >> q) & 1;
+ rb->bit_offset = off + 1;
+ return bit;
+ } else {
+ if (rb->error_handler) rb->error_handler(rb->error_handler_data);
+ return 0;
+ }
+}
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+ assert(bits <= 31);
+ int value = 0, bit;
+ for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
+ return value;
+}
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
+ int bits) {
+ assert(bits <= 32);
+ uint32_t value = 0;
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--)
+ value |= (uint32_t)aom_rb_read_bit(rb) << bit;
+ return value;
+}
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+ const int nbits = sizeof(unsigned) * 8 - bits - 1;
+ const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
+ return ((int)value) >> nbits;
+}
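+// Worked example (illustration only): with bits = 4 this reads a 5-bit
+// two's-complement value, so the bit pattern 11110 (30) is sign-extended
+// to -2.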
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
+ int leading_zeros = 0;
+ while (!aom_rb_read_bit(rb)) ++leading_zeros;
+ // Maximum 32 bits.
+ if (leading_zeros >= 32) return UINT32_MAX;
+ const uint32_t base = (1u << leading_zeros) - 1;
+ const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
+ return base + value;
+}
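+// Worked example (illustration only): the uvlc bitstrings "1", "010", "011"
+// and "00100" decode to 0, 1, 2 and 3 respectively (leading_zeros zero bits,
+// a marker 1 bit, then leading_zeros value bits added to
+// (1 << leading_zeros) - 1).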
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
new file mode 100644
index 000000000..725ca1ea2
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_AOM_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*aom_rb_error_handler)(void *data);
+
+struct aom_read_bit_buffer {
+ const uint8_t *bit_buffer;
+ const uint8_t *bit_buffer_end;
+ uint32_t bit_offset;
+
+ void *error_handler_data;
+ aom_rb_error_handler error_handler;
+};
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
new file mode 100644
index 000000000..b5ecc2382
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITWRITER_H_
+#define AOM_AOM_DSP_BITWRITER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/daalaboolwriter.h"
+#include "aom_dsp/prob.h"
+
+#if CONFIG_RD_DEBUG
+#include "av1/common/blockd.h"
+#include "av1/encoder/cost.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct daala_writer aom_writer;
+
+typedef struct TOKEN_STATS {
+ int cost;
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
+#endif
+} TOKEN_STATS;
+
+static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
+#if CONFIG_RD_DEBUG
+ int r, c;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ token_stats->txb_coeff_cost_map[r][c] = 0;
+ }
+ }
+#endif
+ token_stats->cost = 0;
+}
+
+static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
+ aom_daala_start_encode(bc, buffer);
+}
+
+static INLINE int aom_stop_encode(aom_writer *bc) {
+ return aom_daala_stop_encode(bc);
+}
+
+static INLINE void aom_write(aom_writer *br, int bit, int probability) {
+ aom_daala_write(br, bit, probability);
+}
+
+static INLINE void aom_write_bit(aom_writer *w, int bit) {
+ aom_write(w, bit, 128); // aom_prob_half
+}
+
+static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
+}
+
+static INLINE void aom_write_cdf(aom_writer *w, int symb,
+ const aom_cdf_prob *cdf, int nsymbs) {
+ daala_write_symbol(w, symb, cdf, nsymbs);
+}
+
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+ int nsymbs) {
+ aom_write_cdf(w, symb, cdf, nsymbs);
+ if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITWRITER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
new file mode 100644
index 000000000..596246deb
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitwriter_buffer.h"
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
+ return (wb->bit_offset % CHAR_BIT == 0);
+}
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
+ return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
+ const int off = (int)wb->bit_offset;
+ const int p = off / CHAR_BIT;
+ const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+ if (q == CHAR_BIT - 1) {
+ // Zero next char and write bit
+ wb->bit_buffer[p] = bit << q;
+ } else {
+ wb->bit_buffer[p] &= ~(1 << q);
+ wb->bit_buffer[p] |= bit << q;
+ }
+ wb->bit_offset = off + 1;
+}
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
+ // Do not zero bytes but overwrite existing values
+ const int off = (int)wb->bit_offset;
+ const int p = off / CHAR_BIT;
+ const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+ wb->bit_buffer[p] &= ~(1 << q);
+ wb->bit_buffer[p] |= bit << q;
+ wb->bit_offset = off + 1;
+}
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+ assert(bits <= 31);
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+ uint32_t data, int bits) {
+ assert(bits <= 32);
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits) {
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--)
+ aom_wb_overwrite_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits) {
+ aom_wb_write_literal(wb, data, bits + 1);
+}
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
+ int64_t shift_val = ++v;
+ int leading_zeroes = 1;
+
+ assert(shift_val > 0);
+
+ while (shift_val >>= 1) leading_zeroes += 2;
+
+ aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
+ aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
+}
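+// Minimal round-trip sketch (illustration only; assumes
+// aom_dsp/bitreader_buffer.h is also available):
+//   uint8_t buf[8] = { 0 };
+//   struct aom_write_bit_buffer wb = { buf, 0 };
+//   aom_wb_write_uvlc(&wb, 3);  // emits the bits 00100
+//   struct aom_read_bit_buffer rb = { buf, buf + sizeof(buf), 0, NULL, NULL };
+//   assert(aom_rb_read_uvlc(&rb) == 3);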
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
new file mode 100644
index 000000000..d0311284f
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_
+#define AOM_AOM_DSP_BITWRITER_BUFFER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_write_bit_buffer {
+ uint8_t *bit_buffer;
+ uint32_t bit_offset;
+};
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb);
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit);
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
+
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+ uint32_t data, int bits);
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits);
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits);
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
new file mode 100644
index 000000000..fd87dc181
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BLEND_H_
+#define AOM_AOM_DSP_BLEND_H_
+
+#include "aom_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the aom_blend_* functions in aom_dsp_rtcd.h
+
+// Alpha blending with alpha values from the range [0, 64], where 64
+// means use the first input and 0 means use the second input.
+
+#define AOM_BLEND_A64_ROUND_BITS 6
+#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64
+
+#define AOM_BLEND_A64(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
+ AOM_BLEND_A64_ROUND_BITS)
+
+// Alpha blending with alpha values from the range [0, 256], where 256
+// means use the first input and 0 means use the second input.
+#define AOM_BLEND_A256_ROUND_BITS 8
+#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
+
+#define AOM_BLEND_A256(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
+ AOM_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging.
+#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
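+// Worked example (illustration only): AOM_BLEND_A64(16, 200, 40) weights the
+// first input by 16/64 and the second by 48/64, giving
+// ROUND_POWER_OF_TWO(16 * 200 + 48 * 40, 6) = 80, and
+// AOM_BLEND_AVG(200, 40) = 120.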
+
+#define DIFF_FACTOR_LOG2 4
+#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
+
+#endif // AOM_AOM_DSP_BLEND_H_
diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c
new file mode 100644
index 000000000..0554b43d1
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_hmask.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(
+ mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+ }
+ }
+}
+
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(
+ mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c
new file mode 100644
index 000000000..992cc5c0c
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_mask.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d32 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
+
+void aom_lowbd_blend_a64_d16_mask_c(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ int i, j;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = mask[i * mask_stride + j];
+ res = ((m * (int32_t)src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) *
+ (int32_t)src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ dst[i * dst_stride + j] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = ROUND_POWER_OF_TWO(
+ mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ res = ((m * (int32_t)src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) *
+ (int32_t)src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ dst[i * dst_stride + j] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ res = ((m * (int32_t)src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) *
+ (int32_t)src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ dst[i * dst_stride + j] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) *
+ (int32_t)src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ dst[i * dst_stride + j] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+ }
+ }
+ }
+}
+
+void aom_highbd_blend_a64_d16_mask_c(
+ uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ // excerpt from clip_pixel_highbd()
+ // set saturation_value to (1 << bd) - 1
+ unsigned int saturation_value;
+ switch (bd) {
+ case 8:
+ default: saturation_value = 255; break;
+ case 10: saturation_value = 1023; break;
+ case 12: saturation_value = 4095; break;
+ }
+
+ if (subw == 0 && subh == 0) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = mask[j];
+ res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+ dst[j] = AOMMIN(v, saturation_value);
+ }
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = ROUND_POWER_OF_TWO(
+ mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] +
+ mask[mask_stride + 2 * j + 1],
+ 2);
+ res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+ AOM_BLEND_A64_ROUND_BITS;
+ res -= round_offset;
+ unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+ dst[j] = AOMMIN(v, saturation_value);
+ }
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]);
+ res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+ AOM_BLEND_A64_ROUND_BITS;
+ res -= round_offset;
+ unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+ dst[j] = AOMMIN(v, saturation_value);
+ }
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]);
+ res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+ AOM_BLEND_A64_ROUND_BITS;
+ res -= round_offset;
+ unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+ dst[j] = AOMMIN(v, saturation_value);
+ }
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ }
+}
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = ROUND_POWER_OF_TWO(
+ mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+
+void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h, int subw, int subh, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = ROUND_POWER_OF_TWO(
+ mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c
new file mode 100644
index 000000000..4f222e17f
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_vmask.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c
new file mode 100644
index 000000000..f7703dffc
--- /dev/null
+++ b/third_party/aom/aom_dsp/buf_ans.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+
+#include "aom_dsp/buf_ans.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/internal/aom_codec_internal.h"
+
+void aom_buf_ans_alloc(struct BufAnsCoder *c,
+ struct aom_internal_error_info *error) {
+ c->error = error;
+ assert(c->size > 1);
+ AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
+ // Initialize to overfull to trigger the assert in write.
+ c->offset = c->size + 1;
+}
+
+void aom_buf_ans_free(struct BufAnsCoder *c) {
+ aom_free(c->buf);
+ c->buf = NULL;
+ c->size = 0;
+}
+
+#if !ANS_MAX_SYMBOLS
+void aom_buf_ans_grow(struct BufAnsCoder *c) {
+ struct buffered_ans_symbol *new_buf = NULL;
+ int new_size = c->size * 2;
+ AOM_CHECK_MEM_ERROR(c->error, new_buf,
+ aom_malloc(new_size * sizeof(*new_buf)));
+ memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
+ aom_free(c->buf);
+ c->buf = new_buf;
+ c->size = new_size;
+}
+#endif
+
+void aom_buf_ans_flush(struct BufAnsCoder *const c) {
+ int offset;
+#if ANS_MAX_SYMBOLS
+ if (c->offset == 0) return;
+#endif
+ assert(c->offset > 0);
+ offset = c->offset - 1;
+ // Code the first symbol such that it brings the state to the smallest normal
+ // state from an initial state that would have been a subnormal/refill state.
+ if (c->buf[offset].method == ANS_METHOD_RANS) {
+ c->ans.state += c->buf[offset].val_start;
+ } else {
+ c->ans.state += c->buf[offset].val_start ? c->buf[offset].prob : 0;
+ }
+ for (offset = offset - 1; offset >= 0; --offset) {
+ if (c->buf[offset].method == ANS_METHOD_RANS) {
+ rans_write(&c->ans, c->buf[offset].val_start, c->buf[offset].prob);
+ } else {
+ rabs_write(&c->ans, (uint8_t)c->buf[offset].val_start,
+ (AnsP8)c->buf[offset].prob);
+ }
+ }
+ c->offset = 0;
+ c->output_bytes += ans_write_end(&c->ans);
+}
diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h
new file mode 100644
index 000000000..985fcdf9e
--- /dev/null
+++ b/third_party/aom/aom_dsp/buf_ans.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BUF_ANS_H_
+#define AOM_AOM_DSP_BUF_ANS_H_
+// Buffered forward ANS writer.
+// Symbols are written to the writer in forward (decode) order and serialized
+// backwards due to ANS's stack-like behavior.
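+// Typical call sequence suggested by the functions below (illustration only):
+// aom_buf_ans_alloc() once, buf_ans_write_init() per partition,
+// buf_rabs_write()/buf_rans_write() per symbol in decode order,
+// aom_buf_ans_flush() to serialize the buffered symbols backwards, and
+// buf_ans_write_end() to obtain the number of output bytes.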
+
+#include <assert.h>
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/ans.h"
+#include "aom_dsp/answriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define ANS_METHOD_RABS 0
+#define ANS_METHOD_RANS 1
+
+struct buffered_ans_symbol {
+ unsigned int method : 1; // one of ANS_METHOD_RABS or ANS_METHOD_RANS
+ // TODO(aconverse): Should be possible to write this in terms of start for ABS
+ unsigned int val_start : RANS_PROB_BITS; // Boolean value for ABS
+ // start in symbol cycle for Rans
+ unsigned int prob : RANS_PROB_BITS; // Probability of this symbol
+};
+
+struct BufAnsCoder {
+ struct aom_internal_error_info *error;
+ struct buffered_ans_symbol *buf;
+ struct AnsCoder ans;
+ int size;
+ int offset;
+ int output_bytes;
+#if ANS_MAX_SYMBOLS
+ int window_size;
+#endif
+ int pos; // Dummy variable to store the output buffer after closing
+ uint8_t allow_update_cdf;
+};
+
+// Allocate a buffered ANS coder to store size symbols.
+// When ANS_MAX_SYMBOLS is turned on, the size is the fixed size of each ANS
+// partition.
+// When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the
+// buffer will grow on demand
+void aom_buf_ans_alloc(struct BufAnsCoder *c,
+ struct aom_internal_error_info *error);
+
+void aom_buf_ans_free(struct BufAnsCoder *c);
+
+#if !ANS_MAX_SYMBOLS
+void aom_buf_ans_grow(struct BufAnsCoder *c);
+#endif
+
+void aom_buf_ans_flush(struct BufAnsCoder *const c);
+
+static INLINE void buf_ans_write_init(struct BufAnsCoder *const c,
+ uint8_t *const output_buffer) {
+ c->offset = 0;
+ c->output_bytes = 0;
+ ans_write_init(&c->ans, output_buffer);
+}
+
+static INLINE void buf_rabs_write(struct BufAnsCoder *const c, uint8_t val,
+ AnsP8 prob) {
+ assert(c->offset <= c->size);
+#if !ANS_MAX_SYMBOLS
+ if (c->offset == c->size) {
+ aom_buf_ans_grow(c);
+ }
+#endif
+ c->buf[c->offset].method = ANS_METHOD_RABS;
+ c->buf[c->offset].val_start = val;
+ c->buf[c->offset].prob = prob;
+ ++c->offset;
+#if ANS_MAX_SYMBOLS
+ if (c->offset == c->size) aom_buf_ans_flush(c);
+#endif
+}
+
+// Buffer one symbol for encoding using rANS.
+// cum_prob: The cumulative probability before this symbol (the offset of
+// the symbol in the symbol cycle)
+// prob: The probability of this symbol (l_s from the paper)
+// RANS_PRECISION takes the place of m from the paper.
+static INLINE void buf_rans_write(struct BufAnsCoder *const c,
+ aom_cdf_prob cum_prob, aom_cdf_prob prob) {
+ assert(c->offset <= c->size);
+#if !ANS_MAX_SYMBOLS
+ if (c->offset == c->size) {
+ aom_buf_ans_grow(c);
+ }
+#endif
+ c->buf[c->offset].method = ANS_METHOD_RANS;
+ c->buf[c->offset].val_start = cum_prob;
+ c->buf[c->offset].prob = prob;
+ ++c->offset;
+#if ANS_MAX_SYMBOLS
+ if (c->offset == c->size) aom_buf_ans_flush(c);
+#endif
+}
+
+static INLINE void buf_rabs_write_bit(struct BufAnsCoder *c, int bit) {
+ buf_rabs_write(c, bit, 128);
+}
+
+static INLINE void buf_rabs_write_literal(struct BufAnsCoder *c, int literal,
+ int bits) {
+ int bit;
+
+ assert(bits < 31);
+ for (bit = bits - 1; bit >= 0; bit--)
+ buf_rabs_write_bit(c, 1 & (literal >> bit));
+}
+
+static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) {
+ assert(c->offset == 0);
+ return c->output_bytes;
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AOM_DSP_BUF_ANS_H_
diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c
new file mode 100644
index 000000000..6c2259f23
--- /dev/null
+++ b/third_party/aom/aom_dsp/daalaboolreader.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/daalaboolreader.h"
+
+int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
+ if (size && !buffer) {
+ return 1;
+ }
+ r->buffer_end = buffer + size;
+ r->buffer = buffer;
+ od_ec_dec_init(&r->ec, buffer, size);
+#if CONFIG_ACCOUNTING
+ r->accounting = NULL;
+#endif
+ return 0;
+}
+
+const uint8_t *aom_daala_reader_find_begin(daala_reader *r) {
+ return r->buffer;
+}
+
+const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
+ return r->buffer_end;
+}
+
+uint32_t aom_daala_reader_tell(const daala_reader *r) {
+ return od_ec_dec_tell(&r->ec);
+}
+
+uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
+ return od_ec_dec_tell_frac(&r->ec);
+}
+
+int aom_daala_reader_has_overflowed(const daala_reader *r) {
+ const uint32_t tell_bits = aom_daala_reader_tell(r);
+ const uint32_t tell_bytes = (tell_bits + 7) >> 3;
+ return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
+}
diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h
new file mode 100644
index 000000000..ba78f916d
--- /dev/null
+++ b/third_party/aom/aom_dsp/daalaboolreader.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_DAALABOOLREADER_H_
+#define AOM_AOM_DSP_DAALABOOLREADER_H_
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+#if CONFIG_BITSTREAM_DEBUG
+#include <stdio.h>
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct daala_reader {
+ const uint8_t *buffer;
+ const uint8_t *buffer_end;
+ od_ec_dec ec;
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
+ uint8_t allow_update_cdf;
+};
+
+typedef struct daala_reader daala_reader;
+
+int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
+const uint8_t *aom_daala_reader_find_begin(daala_reader *r);
+const uint8_t *aom_daala_reader_find_end(daala_reader *r);
+uint32_t aom_daala_reader_tell(const daala_reader *r);
+uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
+// Returns true if the reader has tried to decode more data from the buffer
+// than was actually provided.
+int aom_daala_reader_has_overflowed(const daala_reader *r);
+
+static INLINE int aom_daala_read(daala_reader *r, int prob) {
+ int bit;
+ int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
+#if CONFIG_BITSTREAM_DEBUG
+/*{
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ if (frame_idx == 0 && queue_r == 0) {
+ fprintf(stderr, "\n *** bitstream queue at frame_idx_r %d queue_r %d\n",
+ frame_idx, queue_r);
+ }
+}*/
+#endif
+
+ bit = od_ec_decode_bool_q15(&r->ec, p);
+
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ int i;
+ int ref_bit, ref_nsymbs;
+ aom_cdf_prob ref_cdf[16];
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
+ if (ref_nsymbs != 2) {
+ fprintf(stderr,
+ "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
+ "%d queue_r %d\n",
+ frame_idx, 2, ref_nsymbs, queue_r);
+ assert(0);
+ }
+ if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
+ (ref_cdf[1] != 32767)) {
+ fprintf(stderr,
+ "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
+ frame_idx, p, 32767, ref_cdf[0]);
+ for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+ fprintf(stderr, "} queue_r %d\n", queue_r);
+ assert(0);
+ }
+ if (bit != ref_bit) {
+ fprintf(stderr,
+ "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
+ "queue_r %d\n",
+ frame_idx, bit, ref_bit, queue_r);
+ assert(0);
+ }
+ }
+#endif
+
+ return bit;
+}
+
+static INLINE int aom_daala_reader_has_error(daala_reader *r) {
+ return r->ec.error;
+}
+
+static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
+ int nsymbs) {
+ int symb;
+ assert(cdf != NULL);
+ symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
+
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ int i;
+ int cdf_error = 0;
+ int ref_symb, ref_nsymbs;
+ aom_cdf_prob ref_cdf[16];
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
+ if (nsymbs != ref_nsymbs) {
+ fprintf(stderr,
+ "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
+ "queue_r %d\n",
+ frame_idx, nsymbs, ref_nsymbs, queue_r);
+ cdf_error = 0;
+ assert(0);
+ } else {
+ for (i = 0; i < nsymbs; ++i)
+ if (cdf[i] != ref_cdf[i]) cdf_error = 1;
+ }
+ if (cdf_error) {
+ fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
+ cdf[0]);
+ for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
+ fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
+ for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+ fprintf(stderr, "} queue_r %d\n", queue_r);
+ assert(0);
+ }
+ if (symb != ref_symb) {
+ fprintf(
+ stderr,
+ "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
+ frame_idx, symb, ref_symb, queue_r);
+ assert(0);
+ }
+ }
+#endif
+
+ return symb;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_DAALABOOLREADER_H_
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c
new file mode 100644
index 000000000..b24ffbf3f
--- /dev/null
+++ b/third_party/aom/aom_dsp/daalaboolwriter.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/daalaboolwriter.h"
+
+void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
+ br->buffer = source;
+ br->pos = 0;
+ od_ec_enc_init(&br->ec, 62025);
+}
+
+int aom_daala_stop_encode(daala_writer *br) {
+ int nb_bits;
+ uint32_t daala_bytes;
+ unsigned char *daala_data;
+ daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
+ nb_bits = od_ec_enc_tell(&br->ec);
+ memcpy(br->buffer, daala_data, daala_bytes);
+ br->pos = daala_bytes;
+ od_ec_enc_clear(&br->ec);
+ return nb_bits;
+}
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h
new file mode 100644
index 000000000..3848877ce
--- /dev/null
+++ b/third_party/aom/aom_dsp/daalaboolwriter.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_DAALABOOLWRITER_H_
+#define AOM_AOM_DSP_DAALABOOLWRITER_H_
+
+#include <stdio.h>
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct daala_writer {
+ unsigned int pos;
+ uint8_t *buffer;
+ od_ec_enc ec;
+ uint8_t allow_update_cdf;
+};
+
+typedef struct daala_writer daala_writer;
+
+void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
+int aom_daala_stop_encode(daala_writer *w);
+
+static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
+ int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
+#if CONFIG_BITSTREAM_DEBUG
+ aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
+ /*int queue_r = 0;
+ int frame_idx_r = 0;
+ int queue_w = bitstream_queue_get_write();
+ int frame_idx_w = bitstream_queue_get_frame_write();
+ if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+ fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+ frame_idx_w, queue_w);
+ }*/
+ bitstream_queue_push(bit, cdf, 2);
+#endif
+
+ od_ec_encode_bool_q15(&w->ec, bit, p);
+}
+
+static INLINE void daala_write_symbol(daala_writer *w, int symb,
+ const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_BITSTREAM_DEBUG
+ /*int queue_r = 0;
+ int frame_idx_r = 0;
+ int queue_w = bitstream_queue_get_write();
+ int frame_idx_w = bitstream_queue_get_frame_write();
+ if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+ fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+ frame_idx_w, queue_w);
+ }*/
+ bitstream_queue_push(symb, cdf, nsymbs);
+#endif
+
+ od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_DAALABOOLWRITER_H_
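aom_daala_write() maps the 8-bit probability argument (128 meaning both bit values are equally likely) onto the Q15 scale used by the od_ec layer: for prob = 128, (0x7FFFFF - (prob << 15) + prob) >> 8 evaluates to 16384, roughly half of 32768. A minimal usage sketch of the writer, with a buffer that is assumed large enough for the coded data (illustrative, not part of the patch):

#include "aom_dsp/daalaboolwriter.h"

static int write_three_bits_sketch(uint8_t *buf /* assumed large enough */) {
  daala_writer w;
  aom_daala_start_encode(&w, buf);
  aom_daala_write(&w, 1, 128); /* equiprobable bit */
  aom_daala_write(&w, 0, 128);
  aom_daala_write(&w, 1, 200); /* biased bit */
  return aom_daala_stop_encode(&w); /* number of bits actually coded */
}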
diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c
new file mode 100644
index 000000000..aad96c6fc
--- /dev/null
+++ b/third_party/aom/aom_dsp/entcode.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/entcode.h"
+
+/*Given the current total integer number of bits used and the current value of
+ rng, computes the fraction number of bits used to OD_BITRES precision.
+ This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
+ nbits_total: The number of whole bits currently used, i.e., the value
+ returned by od_ec_enc_tell() or od_ec_dec_tell().
+ rng: The current value of rng from either the encoder or decoder state.
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
+ uint32_t nbits;
+ int l;
+ int i;
+ /*To handle the non-integral number of bits still left in the encoder/decoder
+ state, we compute the worst-case number of bits of val that must be
+ encoded to ensure that the value is inside the range for any possible
+ subsequent bits.
+ The computation here is independent of val itself (the decoder does not
+ even track that value), even though the real number of bits used after
+ od_ec_enc_done() may be 1 smaller if rng is a power of two and the
+ corresponding trailing bits of val are all zeros.
+ If we did try to track that special case, then coding a value with a
+ probability of 1/(1 << n) might sometimes appear to use more than n bits.
+ This may help explain the surprising result that a newly initialized
+ encoder or decoder claims to have used 1 bit.*/
+ nbits = nbits_total << OD_BITRES;
+ l = 0;
+ for (i = OD_BITRES; i-- > 0;) {
+ int b;
+ rng = rng * rng >> 15;
+ b = (int)(rng >> 16);
+ l = l << 1 | b;
+ rng >>= b;
+ }
+ return nbits - l;
+}
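Since OD_BITRES is 3, od_ec_tell_frac() reports usage in 1/8-bit units: the whole-bit count is scaled by 8 and then reduced by a correction l in [0, 7] derived from rng. A standalone sketch that mirrors the loop above purely for illustration (the input values are made up):

#include <stdint.h>
#include <stdio.h>

/* Mirrors od_ec_tell_frac() from above, for illustration only. */
static uint32_t tell_frac_sketch(uint32_t nbits_total, uint32_t rng) {
  uint32_t nbits = nbits_total << 3; /* OD_BITRES == 3 */
  int l = 0;
  for (int i = 3; i-- > 0;) {
    int b;
    rng = rng * rng >> 15;
    b = (int)(rng >> 16);
    l = l << 1 | b;
    rng >>= b;
  }
  return nbits - l;
}

int main(void) {
  /* A freshly renormalized state (rng == 0x8000) has no fractional part. */
  printf("%u\n", tell_frac_sketch(10, 0x8000)); /* prints 80 */
  return 0;
}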
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
new file mode 100644
index 000000000..7ba2b1c39
--- /dev/null
+++ b/third_party/aom/aom_dsp/entcode.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTCODE_H_
+#define AOM_AOM_DSP_ENTCODE_H_
+
+#include <limits.h>
+#include <stddef.h>
+#include "av1/common/odintrin.h"
+#include "aom_dsp/prob.h"
+
+#define EC_PROB_SHIFT 6
+#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16
+
+/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
+ on a larger type, you can speed up the decoder by using it here.*/
+typedef uint32_t od_ec_window;
+
+#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+ 3 => 1/8th bits.*/
+#define OD_BITRES (3)
+
+#define OD_ICDF AOM_ICDF
+
+/*See entcode.c for further documentation.*/
+
+OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
+ uint32_t rng);
+
+#endif // AOM_AOM_DSP_ENTCODE_H_
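EC_PROB_SHIFT means only the top 9 bits of a Q15 probability feed the range split, and EC_MIN_PROB reserves a small minimum range for every symbol. A worked example of the split used by the bool coders in entdec.c/entenc.c below (the r and f values are chosen for the example, not taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const unsigned r = 0x8000; /* current range */
  const unsigned f = 16384;  /* Q15 probability of the "1" branch */
  /* Same arithmetic as od_ec_encode_bool_q15()/od_ec_decode_bool_q15(). */
  unsigned v = ((r >> 8) * (uint32_t)(f >> 6 /* EC_PROB_SHIFT */) >> (7 - 6));
  v += 4; /* EC_MIN_PROB */
  printf("%u\n", v); /* prints 16388: just over half of the range */
  return 0;
}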
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
new file mode 100644
index 000000000..d1764c47b
--- /dev/null
+++ b/third_party/aom/aom_dsp/entdec.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
+
+/*A range decoder.
+ This is an entropy decoder based upon \cite{Mar79}, which is itself a
+ rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
+ It is very similar to arithmetic encoding, except that encoding is done with
+ digits in any base, instead of with bits, and so it is faster when using
+  larger bases (i.e., a byte).
+ The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
+ is the base, longer than the theoretical optimum, but to my knowledge there
+ is no published justification for this claim.
+ This only seems true when using near-infinite precision arithmetic so that
+ the process is carried out with no rounding errors.
+
+ An excellent description of implementation details is available at
+ http://www.arturocampos.com/ac_range.html
+ A recent work \cite{MNW98} which proposes several changes to arithmetic
+ encoding for efficiency actually re-discovers many of the principles
+ behind range encoding, and presents a good theoretical analysis of them.
+
+ End of stream is handled by writing out the smallest number of bits that
+ ensures that the stream will be correctly decoded regardless of the value of
+ any subsequent bits.
+ od_ec_dec_tell() can be used to determine how many bits were needed to decode
+ all the symbols thus far; other data can be packed in the remaining bits of
+ the input buffer.
+ @PHDTHESIS{Pas76,
+ author="Richard Clark Pasco",
+ title="Source coding algorithms for fast data compression",
+ school="Dept. of Electrical Engineering, Stanford University",
+ address="Stanford, CA",
+ month=May,
+ year=1976,
+ URL="http://www.richpasco.org/scaffdc.pdf"
+ }
+ @INPROCEEDINGS{Mar79,
+ author="Martin, G.N.N.",
+ title="Range encoding: an algorithm for removing redundancy from a digitised
+ message",
+ booktitle="Video & Data Recording Conference",
+ year=1979,
+ address="Southampton",
+ month=Jul,
+ URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+ }
+ @ARTICLE{MNW98,
+ author="Alistair Moffat and Radford Neal and Ian H. Witten",
+ title="Arithmetic Coding Revisited",
+ journal="{ACM} Transactions on Information Systems",
+ year=1998,
+ volume=16,
+ number=3,
+ pages="256--294",
+ month=Jul,
+ URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+ }*/
+
+/*This is meant to be a large, positive constant that can still be efficiently
+ loaded as an immediate (on platforms like ARM, for example).
+ Even relatively modest values like 100 would work fine.*/
+#define OD_EC_LOTS_OF_BITS (0x4000)
+
+/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill
+ call.*/
+static void od_ec_dec_refill(od_ec_dec *dec) {
+ int s;
+ od_ec_window dif;
+ int16_t cnt;
+ const unsigned char *bptr;
+ const unsigned char *end;
+ dif = dec->dif;
+ cnt = dec->cnt;
+ bptr = dec->bptr;
+ end = dec->end;
+ s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
+ for (; s >= 0 && bptr < end; s -= 8, bptr++) {
+ assert(s <= OD_EC_WINDOW_SIZE - 8);
+ dif ^= (od_ec_window)bptr[0] << s;
+ cnt += 8;
+ }
+ if (bptr >= end) {
+ dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
+ cnt = OD_EC_LOTS_OF_BITS;
+ }
+ dec->dif = dif;
+ dec->cnt = cnt;
+ dec->bptr = bptr;
+}
+
+/*Takes updated dif and range values, renormalizes them so that
+ 32768 <= rng < 65536 (reading more bytes from the stream into dif if
+ necessary), and stores them back in the decoder context.
+ dif: The new value of dif.
+ rng: The new value of the range.
+ ret: The value to return.
+ Return: ret.
+ This allows the compiler to jump to this function via a tail-call.*/
+static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
+ int ret) {
+ int d;
+ assert(rng <= 65535U);
+ // The number of leading zeros in the 16-bit binary representation of rng.
+ d = 16 - OD_ILOG_NZ(rng);
+ dec->cnt -= d;
+ /*This is equivalent to shifting in 1's instead of 0's.*/
+ dec->dif = ((dif + 1) << d) - 1;
+ dec->rng = rng << d;
+ if (dec->cnt < 0) od_ec_dec_refill(dec);
+ return ret;
+}
+
+/*Initializes the decoder.
+ buf: The input buffer to use.
+ Return: 0 on success, or a negative value on error.*/
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
+ uint32_t storage) {
+ dec->buf = buf;
+ dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
+ dec->end = buf + storage;
+ dec->bptr = buf;
+ dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
+ dec->rng = 0x8000;
+ dec->cnt = -15;
+ dec->error = 0;
+ od_ec_dec_refill(dec);
+}
+
+/*Decode a single binary value.
+ f: The probability that the bit is one, scaled by 32768.
+ Return: The value decoded (0 or 1).*/
+int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
+ od_ec_window dif;
+ od_ec_window vw;
+ unsigned r;
+ unsigned r_new;
+ unsigned v;
+ int ret;
+ assert(0 < f);
+ assert(f < 32768U);
+ dif = dec->dif;
+ r = dec->rng;
+ assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+ assert(32768U <= r);
+ v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
+ v += EC_MIN_PROB;
+ vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+ ret = 1;
+ r_new = v;
+ if (dif >= vw) {
+ r_new = r - v;
+ dif -= vw;
+ ret = 0;
+ }
+ return od_ec_dec_normalize(dec, dif, r_new, ret);
+}
+
+/*Decodes a symbol given an inverse cumulative distribution function (CDF)
+ table in Q15.
+ icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range
+ [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]).
+ The values must be monotonically non-increasing, and icdf[nsyms - 1]
+ must be 0.
+ nsyms: The number of symbols in the alphabet.
+ This should be at most 16.
+ Return: The decoded symbol s.*/
+int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
+ od_ec_window dif;
+ unsigned r;
+ unsigned c;
+ unsigned u;
+ unsigned v;
+ int ret;
+ (void)nsyms;
+ dif = dec->dif;
+ r = dec->rng;
+ const int N = nsyms - 1;
+
+ assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+ assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+ assert(32768U <= r);
+ assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
+ c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
+ v = r;
+ ret = -1;
+ do {
+ u = v;
+ v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
+ (7 - EC_PROB_SHIFT - CDF_SHIFT));
+ v += EC_MIN_PROB * (N - ret);
+ } while (c < v);
+ assert(v < u);
+ assert(u <= r);
+ r = u - v;
+ dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+ return od_ec_dec_normalize(dec, dif, r, ret);
+}
+
+/*Returns the number of bits "used" by the decoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Return: The number of bits.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+int od_ec_dec_tell(const od_ec_dec *dec) {
+ return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs);
+}
+
+/*Returns the number of bits "used" by the decoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
+ return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng);
+}
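A minimal decode-side sketch of the API above, assuming buf and size describe data produced by the matching od_ec encoder (entenc.c later in this patch); the probability values are illustrative:

#include "aom_dsp/entdec.h"

static void decode_sketch(const unsigned char *buf, uint32_t size) {
  od_ec_dec dec;
  od_ec_dec_init(&dec, buf, size);
  /* 16384 is the Q15 probability of an (almost) equiprobable binary symbol. */
  int b0 = od_ec_decode_bool_q15(&dec, 16384);
  int b1 = od_ec_decode_bool_q15(&dec, 16384);
  int used = od_ec_dec_tell(&dec); /* whole bits consumed so far */
  (void)b0;
  (void)b1;
  (void)used;
}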
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
new file mode 100644
index 000000000..283bf1831
--- /dev/null
+++ b/third_party/aom/aom_dsp/entdec.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTDEC_H_
+#define AOM_AOM_DSP_ENTDEC_H_
+#include <limits.h>
+#include "aom_dsp/entcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct od_ec_dec od_ec_dec;
+
+#if defined(OD_ACCOUNTING) && OD_ACCOUNTING
+#define OD_ACC_STR , char *acc_str
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
+#else
+#define OD_ACC_STR
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
+#endif
+
+/*The entropy decoder context.*/
+struct od_ec_dec {
+ /*The start of the current input buffer.*/
+ const unsigned char *buf;
+ /*An offset used to keep track of tell after reaching the end of the stream.
+ This is constant throughout most of the decoding process, but becomes
+ important once we hit the end of the buffer and stop incrementing pointers
+ (and instead pretend cnt has lots of bits).*/
+ int32_t tell_offs;
+ /*The end of the current input buffer.*/
+ const unsigned char *end;
+ /*The read pointer for the entropy-coded bits.*/
+ const unsigned char *bptr;
+ /*The difference between the high end of the current range, (low + rng), and
+ the coded value, minus 1.
+ This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the
+ decoder only uses the top 16 bits of the window to decode the next symbol.
+ As we shift up during renormalization, if we don't have enough bits left in
+ the window to fill the top 16, we'll read in more bits of the coded
+ value.*/
+ od_ec_window dif;
+ /*The number of values in the current range.*/
+ uint16_t rng;
+ /*The number of bits of data in the current value.*/
+ int16_t cnt;
+ /*Nonzero if an error occurred.*/
+ int error;
+};
+
+/*See entdec.c for further documentation.*/
+
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
+ const uint16_t *cdf, int nsyms)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
+ OD_ARG_NONNULL(1);
+
+OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
+ OD_ARG_NONNULL(1);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_ENTDEC_H_
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
new file mode 100644
index 000000000..a61da263c
--- /dev/null
+++ b/third_party/aom/aom_dsp/entenc.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if OD_MEASURE_EC_OVERHEAD
+#if !defined(M_LOG2E)
+#define M_LOG2E (1.4426950408889634073599246810019)
+#endif
+#define OD_LOG2(x) (M_LOG2E * log(x))
+#endif // OD_MEASURE_EC_OVERHEAD
+
+/*A range encoder.
+ See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
+
+ @INPROCEEDINGS{Mar79,
+ author="Martin, G.N.N.",
+ title="Range encoding: an algorithm for removing redundancy from a digitised
+ message",
+ booktitle="Video \& Data Recording Conference",
+ year=1979,
+ address="Southampton",
+ month=Jul,
+ URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+ }
+ @ARTICLE{MNW98,
+ author="Alistair Moffat and Radford Neal and Ian H. Witten",
+ title="Arithmetic Coding Revisited",
+ journal="{ACM} Transactions on Information Systems",
+ year=1998,
+ volume=16,
+ number=3,
+ pages="256--294",
+ month=Jul,
+ URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+ }*/
+
+/*Takes updated low and range values, renormalizes them so that
+ 32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
+ necessary), and stores them back in the encoder context.
+ low: The new value of low.
+ rng: The new value of the range.*/
+static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
+ unsigned rng) {
+ int d;
+ int c;
+ int s;
+ c = enc->cnt;
+ assert(rng <= 65535U);
+ // The number of leading zeros in the 16-bit binary representation of rng.
+ d = 16 - OD_ILOG_NZ(rng);
+ s = c + d;
+ /*TODO: Right now we flush every time we have at least one byte available.
+ Instead we should use an od_ec_window and flush right before we're about to
+ shift bits off the end of the window.
+ For a 32-bit window this is about the same amount of work, but for a 64-bit
+ window it should be a fair win.*/
+ if (s >= 0) {
+ uint16_t *buf;
+ uint32_t storage;
+ uint32_t offs;
+ unsigned m;
+ buf = enc->precarry_buf;
+ storage = enc->precarry_storage;
+ offs = enc->offs;
+ if (offs + 2 > storage) {
+ storage = 2 * storage + 2;
+ buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
+ if (buf == NULL) {
+ enc->error = -1;
+ enc->offs = 0;
+ return;
+ }
+ enc->precarry_buf = buf;
+ enc->precarry_storage = storage;
+ }
+ c += 16;
+ m = (1 << c) - 1;
+ if (s >= 8) {
+ assert(offs < storage);
+ buf[offs++] = (uint16_t)(low >> c);
+ low &= m;
+ c -= 8;
+ m >>= 8;
+ }
+ assert(offs < storage);
+ buf[offs++] = (uint16_t)(low >> c);
+ s = c + d - 24;
+ low &= m;
+ enc->offs = offs;
+ }
+ enc->low = low << d;
+ enc->rng = rng << d;
+ enc->cnt = s;
+}
+
+/*Initializes the encoder.
+ size: The initial size of the buffer, in bytes.*/
+void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
+ od_ec_enc_reset(enc);
+ enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size);
+ enc->storage = size;
+ if (size > 0 && enc->buf == NULL) {
+ enc->storage = 0;
+ enc->error = -1;
+ }
+ enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size);
+ enc->precarry_storage = size;
+ if (size > 0 && enc->precarry_buf == NULL) {
+ enc->precarry_storage = 0;
+ enc->error = -1;
+ }
+}
+
+/*Reinitializes the encoder.*/
+void od_ec_enc_reset(od_ec_enc *enc) {
+ enc->offs = 0;
+ enc->low = 0;
+ enc->rng = 0x8000;
+ /*This is initialized to -9 so that it crosses zero after we've accumulated
+ one byte + one carry bit.*/
+ enc->cnt = -9;
+ enc->error = 0;
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy = 0;
+ enc->nb_symbols = 0;
+#endif
+}
+
+/*Frees the buffers used by the encoder.*/
+void od_ec_enc_clear(od_ec_enc *enc) {
+ free(enc->precarry_buf);
+ free(enc->buf);
+}
+
+/*Encodes a symbol given its frequency in Q15.
+ fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come
+ before the
+ one to be encoded.
+ fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
+ including
+ the one to be encoded.*/
+static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
+ int nsyms) {
+ od_ec_window l;
+ unsigned r;
+ unsigned u;
+ unsigned v;
+ l = enc->low;
+ r = enc->rng;
+ assert(32768U <= r);
+ assert(fh <= fl);
+ assert(fl <= 32768U);
+ assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
+ const int N = nsyms - 1;
+ if (fl < CDF_PROB_TOP) {
+ u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >>
+ (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+ EC_MIN_PROB * (N - (s - 1));
+ v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
+ (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+ EC_MIN_PROB * (N - (s + 0));
+ l += r - u;
+ r = u - v;
+ } else {
+ r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
+ (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
+ EC_MIN_PROB * (N - (s + 0));
+ }
+ od_ec_enc_normalize(enc, l, r);
+#if OD_MEASURE_EC_OVERHEAD
+  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP);
+ enc->nb_symbols++;
+#endif
+}
+
+/*Encode a single binary value.
+ val: The value to encode (0 or 1).
+ f: The probability that the val is one, scaled by 32768.*/
+void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
+ od_ec_window l;
+ unsigned r;
+ unsigned v;
+ assert(0 < f);
+ assert(f < 32768U);
+ l = enc->low;
+ r = enc->rng;
+ assert(32768U <= r);
+ v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
+ v += EC_MIN_PROB;
+ if (val) l += r - v;
+ r = val ? v : r - v;
+ od_ec_enc_normalize(enc, l, r);
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.);
+ enc->nb_symbols++;
+#endif
+}
+
+/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
+ s: The index of the symbol to encode.
+ icdf: 32768 minus the CDF, such that symbol s falls in the range
+ [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
+ The values must be monotonically decreasing, and icdf[nsyms - 1] must
+ be 0.
+ nsyms: The number of symbols in the alphabet.
+ This should be at most 16.*/
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf,
+ int nsyms) {
+ (void)nsyms;
+ assert(s >= 0);
+ assert(s < nsyms);
+ assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+ od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms);
+}
+
+/*Overwrites a few bits at the very start of an existing stream, after they
+ have already been encoded.
+ This makes it possible to have a few flags up front, where it is easy for
+ decoders to access them without parsing the whole stream, even if their
+ values are not determined until late in the encoding process, without having
+ to buffer all the intermediate symbols in the encoder.
+ In order for this to work, at least nbits bits must have already been encoded
+ using probabilities that are an exact power of two.
+ The encoder can verify the number of encoded bits is sufficient, but cannot
+ check this latter condition.
+ val: The bits to encode (in the least nbits significant bits).
+ They will be decoded in order from most-significant to least.
+ nbits: The number of bits to overwrite.
+ This must be no more than 8.*/
+void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
+ int shift;
+ unsigned mask;
+ assert(nbits >= 0);
+ assert(nbits <= 8);
+ assert(val < 1U << nbits);
+ shift = 8 - nbits;
+ mask = ((1U << nbits) - 1) << shift;
+ if (enc->offs > 0) {
+ /*The first byte has been finalized.*/
+ enc->precarry_buf[0] =
+ (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift);
+ } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
+ /*The first byte has yet to be output.*/
+ enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) |
+ (od_ec_window)val << (16 + enc->cnt + shift);
+ } else {
+ /*The encoder hasn't even encoded _nbits of data yet.*/
+ enc->error = -1;
+ }
+}
+
+#if OD_MEASURE_EC_OVERHEAD
+#include <stdio.h>
+#endif
+
+/*Indicates that there are no more symbols to encode.
+ All remaining output bytes are flushed to the output buffer.
+ od_ec_enc_reset() should be called before using the encoder again.
+ bytes: Returns the size of the encoded data in the returned buffer.
+ Return: A pointer to the start of the final buffer, or NULL if there was an
+ encoding error.*/
+unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
+ unsigned char *out;
+ uint32_t storage;
+ uint16_t *buf;
+ uint32_t offs;
+ od_ec_window m;
+ od_ec_window e;
+ od_ec_window l;
+ int c;
+ int s;
+ if (enc->error) return NULL;
+#if OD_MEASURE_EC_OVERHEAD
+ {
+ uint32_t tell;
+ /* Don't count the 1 bit we lose to raw bits as overhead. */
+ tell = od_ec_enc_tell(enc) - 1;
+ fprintf(stderr, "overhead: %f%%\n",
+ 100 * (tell - enc->entropy) / enc->entropy);
+ fprintf(stderr, "efficiency: %f bits/symbol\n",
+ (double)tell / enc->nb_symbols);
+ }
+#endif
+ /*We output the minimum number of bits that ensures that the symbols encoded
+ thus far will be decoded correctly regardless of the bits that follow.*/
+ l = enc->low;
+ c = enc->cnt;
+ s = 10;
+ m = 0x3FFF;
+ e = ((l + m) & ~m) | (m + 1);
+ s += c;
+ offs = enc->offs;
+ buf = enc->precarry_buf;
+ if (s > 0) {
+ unsigned n;
+ storage = enc->precarry_storage;
+ if (offs + ((s + 7) >> 3) > storage) {
+ storage = storage * 2 + ((s + 7) >> 3);
+ buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
+ if (buf == NULL) {
+ enc->error = -1;
+ return NULL;
+ }
+ enc->precarry_buf = buf;
+ enc->precarry_storage = storage;
+ }
+ n = (1 << (c + 16)) - 1;
+ do {
+ assert(offs < storage);
+ buf[offs++] = (uint16_t)(e >> (c + 16));
+ e &= n;
+ s -= 8;
+ c -= 8;
+ n >>= 8;
+ } while (s > 0);
+ }
+ /*Make sure there's enough room for the entropy-coded bits.*/
+ out = enc->buf;
+ storage = enc->storage;
+ c = OD_MAXI((s + 7) >> 3, 0);
+ if (offs + c > storage) {
+ storage = offs + c;
+ out = (unsigned char *)realloc(out, sizeof(*out) * storage);
+ if (out == NULL) {
+ enc->error = -1;
+ return NULL;
+ }
+ enc->buf = out;
+ enc->storage = storage;
+ }
+ *nbytes = offs;
+ /*Perform carry propagation.*/
+ assert(offs <= storage);
+ out = out + storage - offs;
+ c = 0;
+ while (offs > 0) {
+ offs--;
+ c = buf[offs] + c;
+ out[offs] = (unsigned char)c;
+ c >>= 8;
+ }
+ /*Note: Unless there's an allocation error, if you keep encoding into the
+ current buffer and call this function again later, everything will work
+ just fine (you won't get a new packet out, but you will get a single
+ buffer with the new data appended to the old).
+ However, this function is O(N) where N is the amount of data coded so far,
+ so calling it more than once for a given packet is a bad idea.*/
+ return out;
+}
+
+/*Returns the number of bits "used" by the encoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Warning: The value returned by this function can decrease compared to an
+ earlier call, even after encoding more data, if there is an encoding error
+ (i.e., a failure to allocate enough space for the output buffer).
+ Return: The number of bits.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+int od_ec_enc_tell(const od_ec_enc *enc) {
+ /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
+ bit, which we reserve for terminating the stream.*/
+ return (enc->cnt + 10) + enc->offs * 8;
+}
+
+/*Returns the number of bits "used" by the encoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Warning: The value returned by this function can decrease compared to an
+ earlier call, even after encoding more data, if there is an encoding error
+ (i.e., a failure to allocate enough space for the output buffer).
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) {
+ return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng);
+}
+
+/*Saves an entropy coder checkpoint to dst.
+ This allows an encoder to reverse a series of entropy coder
+ decisions if it decides that the information would have been
+ better coded some other way.*/
+void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) {
+ OD_COPY(dst, src, 1);
+}
+
+/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint.
+ This can only be used to restore from checkpoints earlier in the target
+ state's history: you can not switch backwards and forwards or otherwise
+ switch to a state which isn't a casual ancestor of the current state.
+ Restore is also incompatible with patching the initial bits, as the
+ changes will remain in the restored version.*/
+void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
+ unsigned char *buf;
+ uint32_t storage;
+ uint16_t *precarry_buf;
+ uint32_t precarry_storage;
+ assert(dst->storage >= src->storage);
+ assert(dst->precarry_storage >= src->precarry_storage);
+ buf = dst->buf;
+ storage = dst->storage;
+ precarry_buf = dst->precarry_buf;
+ precarry_storage = dst->precarry_storage;
+ OD_COPY(dst, src, 1);
+ dst->buf = buf;
+ dst->storage = storage;
+ dst->precarry_buf = precarry_buf;
+ dst->precarry_storage = precarry_storage;
+}
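od_ec_enc_checkpoint()/od_ec_enc_rollback() are intended to let an encoder code a trial decision, measure its cost, and then discard it. A minimal sketch, with an arbitrary buffer size and symbols (not part of the patch):

#include <stdint.h>
#include "aom_dsp/entenc.h"

static void trial_coding_sketch(void) {
  od_ec_enc enc, snapshot;
  uint32_t nbytes, trial_cost;
  unsigned char *data;
  od_ec_enc_init(&enc, 1024);
  od_ec_enc_checkpoint(&snapshot, &enc); /* snapshot shares enc's buffers */
  od_ec_encode_bool_q15(&enc, 1, 16384); /* trial symbol */
  trial_cost = od_ec_enc_tell_frac(&enc); /* cost so far, in 1/8 bits */
  od_ec_enc_rollback(&enc, &snapshot); /* discard the trial symbol */
  od_ec_encode_bool_q15(&enc, 0, 16384); /* the decision actually kept */
  data = od_ec_enc_done(&enc, &nbytes);
  /* `data` points into enc's storage; copy it out before od_ec_enc_clear().
     Do not call od_ec_enc_clear() on `snapshot`, which owns no buffers. */
  (void)trial_cost;
  (void)data;
  od_ec_enc_clear(&enc);
}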
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
new file mode 100644
index 000000000..3551d4250
--- /dev/null
+++ b/third_party/aom/aom_dsp/entenc.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTENC_H_
+#define AOM_AOM_DSP_ENTENC_H_
+#include <stddef.h>
+#include "aom_dsp/entcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct od_ec_enc od_ec_enc;
+
+#define OD_MEASURE_EC_OVERHEAD (0)
+
+/*The entropy encoder context.*/
+struct od_ec_enc {
+ /*Buffered output.
+ This contains only the raw bits until the final call to od_ec_enc_done(),
+ where all the arithmetic-coded data gets prepended to it.*/
+ unsigned char *buf;
+ /*The size of the buffer.*/
+ uint32_t storage;
+ /*A buffer for output bytes with their associated carry flags.*/
+ uint16_t *precarry_buf;
+ /*The size of the pre-carry buffer.*/
+ uint32_t precarry_storage;
+ /*The offset at which the next entropy-coded byte will be written.*/
+ uint32_t offs;
+ /*The low end of the current range.*/
+ od_ec_window low;
+ /*The number of values in the current range.*/
+ uint16_t rng;
+ /*The number of bits of data in the current value.*/
+ int16_t cnt;
+ /*Nonzero if an error occurred.*/
+ int error;
+#if OD_MEASURE_EC_OVERHEAD
+ double entropy;
+ int nb_symbols;
+#endif
+};
+
+/*See entenc.c for further documentation.*/
+
+void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1);
+void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1);
+void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1);
+
+void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15)
+ OD_ARG_NONNULL(1);
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
+
+void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb)
+ OD_ARG_NONNULL(1);
+
+void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc,
+ uint32_t *nbytes)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc)
+ OD_ARG_NONNULL(1);
+
+void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src);
+void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_ENTENC_H_
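Putting entenc.h and entdec.h together, a compact round trip looks like the sketch below (bit values and probabilities are arbitrary; error handling is kept minimal):

#include <assert.h>
#include <stdint.h>
#include "aom_dsp/entdec.h"
#include "aom_dsp/entenc.h"

static void round_trip_sketch(void) {
  const int bits[3] = { 1, 0, 1 };
  od_ec_enc enc;
  od_ec_dec dec;
  uint32_t nbytes;
  unsigned char *data;
  int i;
  od_ec_enc_init(&enc, 64);
  for (i = 0; i < 3; i++) od_ec_encode_bool_q15(&enc, bits[i], 16384);
  data = od_ec_enc_done(&enc, &nbytes);
  if (data != NULL) {
    od_ec_dec_init(&dec, data, nbytes);
    /* Decoding with the same probabilities reproduces the encoded bits. */
    for (i = 0; i < 3; i++)
      assert(od_ec_decode_bool_q15(&dec, 16384) == bits[i]);
  }
  od_ec_enc_clear(&enc); /* frees the storage `data` points into */
}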
diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c
new file mode 100644
index 000000000..3804519b3
--- /dev/null
+++ b/third_party/aom/aom_dsp/fastssim.c
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * This code was originally written by: Nathan E. Egge, at the Daala
+ * project.
+ */
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/ssim.h"
+#include "aom_ports/system_state.h"
+
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
+#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
+#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
+#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
+#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
+
+struct fs_level {
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ int w;
+ int h;
+};
+
+struct fs_ctx {
+ fs_level *level;
+ int nlevels;
+ unsigned *col_buf;
+};
+
+static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+ unsigned char *data;
+ size_t data_size;
+ int lw;
+ int lh;
+ int l;
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ data_size =
+ _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ im_size = lw * (size_t)lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size += im_size;
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ data_size += level_size;
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ data = (unsigned char *)malloc(data_size);
+ _ctx->level = (fs_level *)data;
+ _ctx->nlevels = _nlevels;
+ data += _nlevels * sizeof(*_ctx->level);
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ _ctx->level[l].w = lw;
+ _ctx->level[l].h = lh;
+ im_size = lw * (size_t)lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ _ctx->level[l].im1 = (uint32_t *)data;
+ _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+ data += level_size;
+ _ctx->level[l].ssim = (double *)data;
+ data += im_size * sizeof(*_ctx->level[l].ssim);
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ _ctx->col_buf = (unsigned *)data;
+}
+
+static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
+
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+ const uint32_t *src1;
+ const uint32_t *src2;
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w2;
+ int h2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ dst1 = _ctx->level[_l].im1;
+ dst2 = _ctx->level[_l].im2;
+ w2 = _ctx->level[_l - 1].w;
+ h2 = _ctx->level[_l - 1].h;
+ src1 = _ctx->level[_l - 1].im1;
+ src2 = _ctx->level[_l - 1].im2;
+ for (j = 0; j < h; j++) {
+ int j0offs;
+ int j1offs;
+ j0offs = 2 * j * w2;
+ j1offs = FS_MINI(2 * j + 1, h2) * w2;
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, w2);
+ dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
+ src1[j1offs + i0] + src1[j1offs + i1];
+ dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
+ src2[j1offs + i0] + src2[j1offs + i1];
+ }
+ }
+}
+
+static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
+ int _s1ystride, const uint8_t *_src2,
+ int _s2ystride, int _w, int _h, uint32_t shift,
+ int buf_is_hbd) {
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[0].w;
+ h = _ctx->level[0].h;
+ dst1 = _ctx->level[0].im1;
+ dst2 = _ctx->level[0].im2;
+ for (j = 0; j < h; j++) {
+ int j0;
+ int j1;
+ j0 = 2 * j;
+ j1 = FS_MINI(j0 + 1, _h);
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, _w);
+ if (!buf_is_hbd) {
+ dst1[j * w + i] =
+ _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
+ _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
+ dst2[j * w + i] =
+ _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
+ _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
+ } else {
+ uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
+ uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
+ dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
+ (src1s[j0 * _s1ystride + i1] >> shift) +
+ (src1s[j1 * _s1ystride + i0] >> shift) +
+ (src1s[j1 * _s1ystride + i1] >> shift);
+ dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
+ (src2s[j0 * _s2ystride + i1] >> shift) +
+ (src2s[j1 * _s2ystride + i0] >> shift) +
+ (src2s[j1 * _s2ystride + i1] >> shift);
+ }
+ }
+ }
+}
+
+static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
+ unsigned *col_sums_x;
+ unsigned *col_sums_y;
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ double c1;
+ int w;
+ int h;
+ int j0offs;
+ int j1offs;
+ int i;
+ int j;
+ double ssim_c1 = SSIM_C1;
+
+ if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
+ if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
+
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ col_sums_x = _ctx->col_buf;
+ col_sums_y = col_sums_x + w;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
+ for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
+ for (j = 1; j < 4; j++) {
+ j1offs = FS_MINI(j, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ ssim = _ctx->level[_l].ssim;
+ c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
+ for (j = 0; j < h; j++) {
+ unsigned mux;
+ unsigned muy;
+ int i0;
+ int i1;
+ mux = 5 * col_sums_x[0];
+ muy = 5 * col_sums_y[0];
+ for (i = 1; i < 4; i++) {
+ i1 = FS_MINI(i, w - 1);
+ mux += col_sums_x[i1];
+ muy += col_sums_y[i1];
+ }
+ for (i = 0; i < w; i++) {
+ ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
+ (mux * (double)mux + muy * (double)muy + c1);
+ if (i + 1 < w) {
+ i0 = FS_MAXI(0, i - 4);
+ i1 = FS_MINI(i + 4, w - 1);
+ mux += col_sums_x[i1] - col_sums_x[i0];
+        muy += col_sums_y[i1] - col_sums_y[i0];
+ }
+ }
+ if (j + 1 < h) {
+ j0offs = FS_MAXI(0, j - 4) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
+ j1offs = FS_MINI(j + 4, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ }
+}
+
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] = gx * (double)gx; \
+ col_sums_gy2[(_col)] = gy * (double)gy; \
+ col_sums_gxgy[(_col)] = gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] += gx * (double)gx; \
+ col_sums_gy2[(_col)] += gy * (double)gy; \
+ col_sums_gxgy[(_col)] += gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] -= gx * (double)gx; \
+ col_sums_gy2[(_col)] -= gy * (double)gy; \
+ col_sums_gxgy[(_col)] -= gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_COPY(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+ } while (0)
+
+#define FS_COL_HALVE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+ } while (0)
+
+#define FS_COL_DOUBLE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+ } while (0)
+
+static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
+ uint32_t *im1;
+ uint32_t *im2;
+ unsigned *gx_buf;
+ unsigned *gy_buf;
+ double *ssim;
+ double col_sums_gx2[8];
+ double col_sums_gy2[8];
+ double col_sums_gxgy[8];
+ double c2;
+ int stride;
+ int w;
+ int h;
+ int i;
+ int j;
+ double ssim_c2 = SSIM_C2;
+ if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
+ if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
+
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ ssim = _ctx->level[_l].ssim;
+ gx_buf = _ctx->col_buf;
+ stride = w + 8;
+ gy_buf = gx_buf + 8 * stride;
+ memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
+ c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104;
+ for (j = 0; j < h + 4; j++) {
+ if (j < h - 1) {
+ for (i = 0; i < w - 1; i++) {
+ unsigned g1;
+ unsigned g2;
+ unsigned gx;
+ unsigned gy;
+ g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]);
+ g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]);
+ gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]);
+ g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]);
+ gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ gx_buf[(j & 7) * stride + i + 4] = gx;
+ gy_buf[(j & 7) * stride + i + 4] = gy;
+ }
+ } else {
+ memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+ memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+ }
+ if (j >= 4) {
+ int k;
+ col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+ col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+ col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+ col_sums_gxgy[0] = 0;
+ for (i = 4; i < 8; i++) {
+ FS_COL_SET(i, -1, 0);
+ FS_COL_ADD(i, 0, 0);
+ for (k = 1; k < 8 - i; k++) {
+ FS_COL_DOUBLE(i, i);
+ FS_COL_ADD(i, -k - 1, 0);
+ FS_COL_ADD(i, k, 0);
+ }
+ }
+ for (i = 0; i < w; i++) {
+ double mugx2;
+ double mugy2;
+ double mugxgy;
+ mugx2 = col_sums_gx2[0];
+ for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
+ mugy2 = col_sums_gy2[0];
+ for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
+ mugxgy = col_sums_gxgy[0];
+ for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
+ ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+ if (i + 1 < w) {
+ FS_COL_SET(0, -1, 1);
+ FS_COL_ADD(0, 0, 1);
+ FS_COL_SUB(2, -3, 2);
+ FS_COL_SUB(2, 2, 2);
+ FS_COL_HALVE(1, 2);
+ FS_COL_SUB(3, -4, 3);
+ FS_COL_SUB(3, 3, 3);
+ FS_COL_HALVE(2, 3);
+ FS_COL_COPY(3, 4);
+ FS_COL_DOUBLE(4, 5);
+ FS_COL_ADD(4, -4, 5);
+ FS_COL_ADD(4, 3, 5);
+ FS_COL_DOUBLE(5, 6);
+ FS_COL_ADD(5, -3, 6);
+ FS_COL_ADD(5, 2, 6);
+ FS_COL_DOUBLE(6, 7);
+ FS_COL_ADD(6, -2, 7);
+ FS_COL_ADD(6, 1, 7);
+ FS_COL_SET(7, -1, 8);
+ FS_COL_ADD(7, 0, 8);
+ }
+ }
+ }
+ }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+  Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale and renormalize the rest to sum to 1.*/
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {
+ 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
+};
+
+static double fs_average(fs_ctx *_ctx, int _l) {
+ double *ssim;
+ double ret;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ ssim = _ctx->level[_l].ssim;
+ ret = 0;
+ for (j = 0; j < h; j++)
+ for (i = 0; i < w; i++) ret += ssim[j * w + i];
+ return pow(ret / (w * h), FS_WEIGHTS[_l]);
+}
+
+static double convert_ssim_db(double _ssim, double _weight) {
+ assert(_weight >= _ssim);
+ if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
+ return 10 * (log10(_weight) - log10(_weight - _ssim));
+}
+
+static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
+ int _dystride, int _w, int _h, uint32_t _bd,
+ uint32_t _shift, int buf_is_hbd) {
+ fs_ctx ctx;
+ double ret;
+ int l;
+ ret = 1;
+ fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+ fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
+ buf_is_hbd);
+ for (l = 0; l < FS_NLEVELS - 1; l++) {
+ fs_calc_structure(&ctx, l, _bd);
+ ret *= fs_average(&ctx, l);
+ fs_downsample_level(&ctx, l + 1);
+ }
+ fs_calc_structure(&ctx, l, _bd);
+ fs_apply_luminance(&ctx, l, _bd);
+ ret *= fs_average(&ctx, l);
+ fs_ctx_clear(&ctx);
+ return ret;
+}
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd) {
+ double ssimv;
+ uint32_t bd_shift = 0;
+ aom_clear_system_state();
+ assert(bd >= in_bd);
+ assert(source->flags == dest->flags);
+ int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
+ bd_shift = bd - in_bd;
+
+ *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height, in_bd, bd_shift, buf_is_hbd);
+ *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
+ *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
+ ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+ return convert_ssim_db(ssimv, 1.0);
+}
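The per-plane scores are combined as 0.8*Y + 0.1*(U+V) and converted to decibels with a weight of 1.0, i.e. a combined SSIM of s maps to -10*log10(1 - s). A standalone sketch of that last step (illustration only; the MAX_SSIM_DB clamp is omitted):

#include <math.h>
#include <stdio.h>

/* Mirrors convert_ssim_db(ssim, 1.0) from above. */
static double ssim_to_db(double ssim) { return -10.0 * log10(1.0 - ssim); }

int main(void) {
  printf("%.1f\n", ssim_to_db(0.99)); /* prints 20.0 */
  return 0;
}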
diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c
new file mode 100644
index 000000000..0ba71cfb3
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void simple_transpose(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y++) {
+ for (int x = 0; x < n; x++) {
+ B[y * n + x] = A[x * n + y];
+ }
+ }
+}
+
+// The 1d transform is real to complex and packs the complex results in
+// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
+// components, followed by the n/2 - 1 imaginary components). After the
+// transform is done on the rows, the first n/2 + 1 columns are real, and
+// the remaining are the imaginary components. After the transform on the
+// columns, the region of [0, n/2]x[0, n/2] contains the real part of
+// fft of the real columns. The real part of the 2d fft also includes the
+// imaginary part of transformed imaginary columns. This function assembles
+// the correct outputs while putting the real and imaginary components
+// next to each other.
+static INLINE void unpack_2d_output(const float *col_fft, float *output,
+ int n) {
+ for (int y = 0; y <= n / 2; ++y) {
+ const int y2 = y + n / 2;
+ const int y_extra = y2 > n / 2 && y2 < n;
+
+ for (int x = 0; x <= n / 2; ++x) {
+ const int x2 = x + n / 2;
+ const int x_extra = x2 > n / 2 && x2 < n;
+ output[2 * (y * n + x)] =
+ col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ if (y_extra) {
+ output[2 * ((n - y) * n + x)] =
+ col_fft[y * n + x] +
+ (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * ((n - y) * n + x) + 1] =
+ -(y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ }
+ }
+ }
+}
+
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size) {
+ for (int x = 0; x < n; x += vec_size) {
+ tform(input + x, output + x, n);
+ }
+ transpose(output, temp, n);
+
+ for (int x = 0; x < n; x += vec_size) {
+ tform(temp + x, output + x, n);
+ }
+ transpose(output, temp, n);
+
+ unpack(temp, output, n);
+}
+
+static INLINE void store_float(float *output, float input) { *output = input; }
+static INLINE float add_float(float a, float b) { return a + b; }
+static INLINE float sub_float(float a, float b) { return a - b; }
+static INLINE float mul_float(float a, float b) { return a * b; }
+
+GEN_FFT_2(void, float, float, float, *, store_float);
+GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float);
+GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+
+void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size) {
+ // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
+ // and get real outputs.
+ for (int y = 0; y <= n / 2; ++y) {
+ output[y * n] = input[2 * y * n];
+ output[y * n + 1] = input[2 * (y * n + n / 2)];
+ }
+ for (int y = n / 2 + 1; y < n; ++y) {
+ output[y * n] = input[2 * (y - n / 2) * n + 1];
+ output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
+ }
+
+ for (int i = 0; i < 2; i += vec_size) {
+ ifft_multi(output + i, temp + i, n);
+ }
+
+ // For the other columns, since we don't have a full ifft for complex inputs
+ // we have to split them into the real and imaginary counterparts.
+ // Pack the real component, then the imaginary components.
+ for (int y = 0; y < n; ++y) {
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + 1)] = input[2 * (y * n + x)];
+ }
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
+ }
+ }
+ for (int y = 2; y < vec_size; y++) {
+ fft_single(output + y, temp + y, n);
+ }
+ // This is the part that can be sped up with SIMD
+ for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
+ fft_multi(output + y, temp + y, n);
+ }
+
+ // Put the 0 and n/2 th results in the correct place.
+ for (int x = 0; x < n; ++x) {
+ output[x] = temp[x * n];
+ output[(n / 2) * n + x] = temp[x * n + 1];
+ }
+ // This rearranges and transposes.
+ for (int y = 1; y < n / 2; ++y) {
+ // Fill in the real columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + y * n] =
+ temp[(y + 1) + x * n] +
+ ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + y * n] = temp[(y + 1) + (n - x) * n] -
+ temp[(y + n / 2) + ((n - x) + n / 2) * n];
+ }
+ // Fill in the imag columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + (y + n / 2) * n] =
+ temp[(y + n / 2) + x * n] -
+ ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+ temp[(y + n / 2) + (n - x) * n];
+ }
+ }
+ for (int y = 0; y < n; y += vec_size) {
+ ifft_multi(output + y, temp + y, n);
+ }
+ transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float);
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float);
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+ aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+ aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+ aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
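For the 2-d wrappers above, input and temp hold n*n floats while output must hold 2*n*n floats, because unpack_2d_output() stores each bin as a (real, imaginary) pair; only columns 0..n/2 are written, the rest being implied by conjugate symmetry. A minimal call sketch with arbitrary input values (the prototype is repeated here because the real declaration is assumed to come from the generated aom_dsp_rtcd.h header):

/* Prototype of the function defined in fft.c above. */
void aom_fft4x4_float_c(const float *input, float *temp, float *output);

static void fft4x4_sketch(void) {
  float input[16];  /* n * n real inputs */
  float temp[16];   /* n * n scratch */
  float output[32]; /* 2 * n * n: (real, imaginary) pairs */
  int i;
  for (i = 0; i < 16; i++) input[i] = (float)i;
  aom_fft4x4_float_c(input, temp, output);
  /* output[2 * (y * 4 + x)] and output[2 * (y * 4 + x) + 1] hold the real and
     imaginary parts of bin (y, x) for 0 <= x <= n/2, per unpack_2d_output(). */
}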
diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h
new file mode 100644
index 000000000..5137331ae
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft_common.h
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FFT_COMMON_H_
+#define AOM_AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) use a stride of 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the fft
+ * can be performed on multiple columns at a time. In such cases the data block
+ * for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in] input Input buffer. See above for size restrictions.
+ * \param[out] output Output buffer. See above for size restrictions.
+ * \param[in] stride The spacing, in number of elements, between rows (for
+ *                   vectorized transforms) or between consecutive elements
+ *                   (a stride of 1 for a single row)
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+ int stride);
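+
+/* An illustrative sketch (this helper is hypothetical, not part of the
+ * header): unpacking the conjugate-symmetric layout produced by a forward 1d
+ * transform of one real row of length n into explicit (real, imaginary) pairs.
+ *
+ *   void unpack_row_example(const float *packed, float *re, float *im, int n) {
+ *     re[0] = packed[0];          im[0] = 0.0f;      // bin 0 is purely real
+ *     re[n / 2] = packed[n / 2];  im[n / 2] = 0.0f;  // bin n/2 is purely real
+ *     for (int j = 1; j < n / 2; ++j) {
+ *       re[j] = packed[j];
+ *       im[j] = packed[n / 2 + j];
+ *     }
+ *   }
+ */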
+
+// Declare some of the forward non-vectorized transforms which are used in some
+// of the vectorized implementations
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+ int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
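+
+/* Judging by how aom_ifft_2d_gen (defined earlier in this patch) indexes its
+ * input, the unpacked 2 x n x n buffer holds interleaved (real, imaginary)
+ * pairs, so the complex value at row y, column x can be read as:
+ *
+ *   float re = output[2 * (y * n + x)];
+ *   float im = output[2 * (y * n + x) + 1];
+ */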
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
+ *
+ * \param[in] input Input buffer to run the transform on (size n x n)
+ * \param[out] temp Working buffer for computing the transform (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square input buffer
+ * \param[in] tform Forward transform function
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] unpack Unpack function used to massage the outputs into the
+ *                   correct form
+ * \param[in] vec_size Vector size (the transform is done vec_size units at
+ * a time)
+ */
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size);
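+
+/* A minimal usage sketch for the scalar path (vec_size = 1). The transpose
+ * and unpack helpers named here are hypothetical stand-ins with the function
+ * pointer signatures declared above; the library's own helpers may differ.
+ *
+ *   void my_fft4x4(const float *input, float *temp, float *output) {
+ *     aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, my_transpose,
+ *                    my_unpack, 1);
+ *   }
+ */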
+
+/*!\brief Perform a 2d inverse fft with the given helper functions
+ *
+ * \param[in] input Input buffer to run the transform on (size 2 x n x n)
+ * \param[out] temp Working buffer for computations (size 2 x n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square output buffer
+ * \param[in] fft_single Forward transform function (non vectorized)
+ * \param[in] fft_multi Forward transform function (vectorized)
+ * \param[in] ifft_multi Inverse transform function (vectorized)
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] vec_size Vector size (the transform is done vec_size
+ * units at a time)
+ */
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size);
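+
+/* For reference, the scalar wrappers earlier in this patch (e.g.
+ * aom_ifft4x4_float_c) pass the same non-vectorized forward transform for both
+ * the single-column and multi-column slots when vec_size is 1, together with a
+ * local transpose helper:
+ *
+ *   aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float,
+ *                   aom_fft1d_4_float, aom_ifft1d_4_float, simple_transpose,
+ *                   1);
+ */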
+#ifdef __cplusplus
+}
+#endif
+
+// The macros below define 1D fft/ifft for different data types and for
+// different simd vector intrinsic types.
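+//
+// For example, the scalar float inverse transforms earlier in this patch are
+// generated with plain loads/stores and arithmetic helpers:
+//
+//   GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+//              sub_float);
+//
+// SIMD back-ends instead substitute a vector type for T_VEC and intrinsics for
+// load, store, constant, add, sub and mul.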
+
+#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w0 = add(i0, i2); \
+ const T_VEC w1 = sub(i0, i2); \
+ const T_VEC w2 = add(i1, i3); \
+ const T_VEC w3 = sub(i1, i3); \
+ store(output + 0 * stride, add(w0, w2)); \
+ store(output + 1 * stride, w1); \
+ store(output + 2 * stride, sub(w0, w2)); \
+ store(output + 3 * stride, sub(kWeight0, w3)); \
+ }
+
+#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
+ ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w0 = add(i0, i4); \
+ const T_VEC w1 = sub(i0, i4); \
+ const T_VEC w2 = add(i2, i6); \
+ const T_VEC w3 = sub(i2, i6); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i1, i5); \
+ const T_VEC w8 = sub(i1, i5); \
+ const T_VEC w9 = add(i3, i7); \
+ const T_VEC w10 = sub(i3, i7); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ store(output + 0 * stride, add(w4, w11)); \
+ store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 2 * stride, w5); \
+ store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 4 * stride, sub(w4, w11)); \
+ store(output + 5 * stride, \
+ sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \
+ store(output + 6 * stride, sub(kWeight0, w12)); \
+ store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
+ }
+
+#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w0 = add(i0, i8); \
+ const T_VEC w1 = sub(i0, i8); \
+ const T_VEC w2 = add(i4, i12); \
+ const T_VEC w3 = sub(i4, i12); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i2, i10); \
+ const T_VEC w8 = sub(i2, i10); \
+ const T_VEC w9 = add(i6, i14); \
+ const T_VEC w10 = sub(i6, i14); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i1, i9); \
+ const T_VEC w20 = sub(i1, i9); \
+ const T_VEC w21 = add(i5, i13); \
+ const T_VEC w22 = sub(i5, i13); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i3, i11); \
+ const T_VEC w27 = sub(i3, i11); \
+ const T_VEC w28 = add(i7, i15); \
+ const T_VEC w29 = sub(i7, i15); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ store(output + 0 * stride, add(w14, w33)); \
+ store(output + 1 * stride, \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
+ store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 3 * stride, \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
+ store(output + 4 * stride, w15); \
+ store(output + 5 * stride, \
+ add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \
+ mul(kWeight3, w37[1])))); \
+ store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 7 * stride, \
+ add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \
+ mul(kWeight4, w35[1])))); \
+ store(output + 8 * stride, sub(w14, w33)); \
+ store(output + 9 * stride, \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
+ store(output + 10 * stride, \
+ sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \
+ store(output + 11 * stride, \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
+ store(output + 12 * stride, sub(kWeight0, w34)); \
+ store(output + 13 * stride, \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \
+ store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
+ store(output + 15 * stride, \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \
+ }
+
+#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w0 = add(i0, i16); \
+ const T_VEC w1 = sub(i0, i16); \
+ const T_VEC w2 = add(i8, i24); \
+ const T_VEC w3 = sub(i8, i24); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i4, i20); \
+ const T_VEC w8 = sub(i4, i20); \
+ const T_VEC w9 = add(i12, i28); \
+ const T_VEC w10 = sub(i12, i28); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i2, i18); \
+ const T_VEC w20 = sub(i2, i18); \
+ const T_VEC w21 = add(i10, i26); \
+ const T_VEC w22 = sub(i10, i26); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i6, i22); \
+ const T_VEC w27 = sub(i6, i22); \
+ const T_VEC w28 = add(i14, i30); \
+ const T_VEC w29 = sub(i14, i30); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w38 = add(w14, w33); \
+ const T_VEC w39 = sub(w14, w33); \
+ const T_VEC w40[2] = { \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \
+ }; \
+ const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(sub(kWeight0, w12), \
+ mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w42[2] = { \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \
+ }; \
+ const T_VEC w44[2] = { \
+ add(w18[0], \
+ sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \
+ }; \
+ const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(w12, mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w46[2] = { \
+ add(w16[0], \
+ sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \
+ }; \
+ const T_VEC w47 = add(i1, i17); \
+ const T_VEC w48 = sub(i1, i17); \
+ const T_VEC w49 = add(i9, i25); \
+ const T_VEC w50 = sub(i9, i25); \
+ const T_VEC w51 = add(w47, w49); \
+ const T_VEC w52 = sub(w47, w49); \
+ const T_VEC w54 = add(i5, i21); \
+ const T_VEC w55 = sub(i5, i21); \
+ const T_VEC w56 = add(i13, i29); \
+ const T_VEC w57 = sub(i13, i29); \
+ const T_VEC w58 = add(w54, w56); \
+ const T_VEC w59 = sub(w54, w56); \
+ const T_VEC w61 = add(w51, w58); \
+ const T_VEC w62 = sub(w51, w58); \
+ const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(sub(kWeight0, w50), \
+ mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(w50, mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w66 = add(i3, i19); \
+ const T_VEC w67 = sub(i3, i19); \
+ const T_VEC w68 = add(i11, i27); \
+ const T_VEC w69 = sub(i11, i27); \
+ const T_VEC w70 = add(w66, w68); \
+ const T_VEC w71 = sub(w66, w68); \
+ const T_VEC w73 = add(i7, i23); \
+ const T_VEC w74 = sub(i7, i23); \
+ const T_VEC w75 = add(i15, i31); \
+ const T_VEC w76 = sub(i15, i31); \
+ const T_VEC w77 = add(w73, w75); \
+ const T_VEC w78 = sub(w73, w75); \
+ const T_VEC w80 = add(w70, w77); \
+ const T_VEC w81 = sub(w70, w77); \
+ const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(sub(kWeight0, w69), \
+ mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(w69, mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w85 = add(w61, w80); \
+ const T_VEC w86 = sub(w61, w80); \
+ const T_VEC w87[2] = { \
+ add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \
+ add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \
+ }; \
+ const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(sub(kWeight0, w59), \
+ mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w89[2] = { \
+ add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \
+ add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \
+ }; \
+ const T_VEC w91[2] = { \
+ add(w65[0], \
+ sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
+ sub(sub(kWeight0, w65[1]), \
+ sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \
+ }; \
+ const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(w59, mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w93[2] = { \
+ add(w63[0], \
+ sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
+ sub(sub(kWeight0, w63[1]), \
+ sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \
+ }; \
+ store(output + 0 * stride, add(w38, w85)); \
+ store(output + 1 * stride, \
+ add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \
+ store(output + 2 * stride, \
+ add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \
+ store(output + 3 * stride, \
+ add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \
+ store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 5 * stride, \
+ add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \
+ store(output + 6 * stride, \
+ add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \
+ store(output + 7 * stride, \
+ add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \
+ store(output + 8 * stride, w39); \
+ store(output + 9 * stride, \
+ add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \
+ mul(kWeight5, w93[1])))); \
+ store(output + 10 * stride, \
+ add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \
+ mul(kWeight3, w92[1])))); \
+ store(output + 11 * stride, \
+ add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \
+ mul(kWeight7, w91[1])))); \
+ store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 13 * stride, \
+ add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \
+ mul(kWeight8, w89[1])))); \
+ store(output + 14 * stride, \
+ add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \
+ mul(kWeight4, w88[1])))); \
+ store(output + 15 * stride, \
+ add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \
+ mul(kWeight6, w87[1])))); \
+ store(output + 16 * stride, sub(w38, w85)); \
+ store(output + 17 * stride, \
+ add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \
+ store(output + 18 * stride, \
+ add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \
+ store(output + 19 * stride, \
+ add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \
+ store(output + 20 * stride, \
+ sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \
+ store(output + 21 * stride, \
+ add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \
+ store(output + 22 * stride, \
+ add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \
+ store(output + 23 * stride, \
+ add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \
+ store(output + 24 * stride, sub(kWeight0, w86)); \
+ store(output + 25 * stride, \
+ sub(sub(kWeight0, w46[1]), \
+ sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \
+ store(output + 26 * stride, \
+ sub(sub(kWeight0, w45[1]), \
+ sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \
+ store(output + 27 * stride, \
+ sub(sub(kWeight0, w44[1]), \
+ sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \
+ store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
+ store(output + 29 * stride, \
+ sub(sub(kWeight0, w42[1]), \
+ sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \
+ store(output + 30 * stride, \
+ sub(sub(kWeight0, w41[1]), \
+ sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \
+ store(output + 31 * stride, \
+ sub(sub(kWeight0, w40[1]), \
+ sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \
+ }
+
+#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w2 = add(i0, i2); \
+ const T_VEC w3 = sub(i0, i2); \
+ const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \
+ const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \
+ store(output + 0 * stride, add(w2, w4[0])); \
+ store(output + 1 * stride, add(w3, w5[1])); \
+ store(output + 2 * stride, sub(w2, w4[0])); \
+ store(output + 3 * stride, sub(w3, w5[1])); \
+ }
+
+#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w6 = add(i0, i4); \
+ const T_VEC w7 = sub(i0, i4); \
+ const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \
+ const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \
+ const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \
+ const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \
+ const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \
+ const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \
+ const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \
+ const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \
+ const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \
+ const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \
+ const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \
+ const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \
+ const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \
+ store(output + 0 * stride, add(w10[0], w18[0])); \
+ store(output + 1 * stride, \
+ add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \
+ store(output + 2 * stride, add(w11[0], w19[1])); \
+ store(output + 3 * stride, \
+ sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ store(output + 4 * stride, sub(w10[0], w18[0])); \
+ store(output + 5 * stride, \
+ add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \
+ mul(kWeight2, w20[1])))); \
+ store(output + 6 * stride, sub(w11[0], w19[1])); \
+ store(output + 7 * stride, \
+ add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ }
+
+#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w14 = add(i0, i8); \
+ const T_VEC w15 = sub(i0, i8); \
+ const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \
+ const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \
+ const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \
+ const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
+ const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
+ const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
+ const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
+ const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
+ const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
+ const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \
+ const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
+ const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
+ const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
+ const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \
+ const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
+ const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
+ const T_VEC w33[2] = { add(w20[0], \
+ sub(sub(kWeight0, mul(kWeight2, w28[0])), \
+ mul(kWeight2, w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
+ const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
+ const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
+ const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \
+ const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
+ const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
+ const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
+ const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
+ const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
+ const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
+ const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \
+ const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
+ const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
+ const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
+ const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \
+ const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
+ const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
+ const T_VEC w57[2] = { add(w44[0], \
+ sub(sub(kWeight0, mul(kWeight2, w52[0])), \
+ mul(kWeight2, w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
+ const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
+ const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
+ const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ store(output + 0 * stride, add(w30[0], w54[0])); \
+ store(output + 1 * stride, \
+ add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \
+ store(output + 2 * stride, \
+ add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \
+ store(output + 3 * stride, \
+ add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \
+ store(output + 4 * stride, add(w31[0], w55[1])); \
+ store(output + 5 * stride, \
+ sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 6 * stride, \
+ sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 7 * stride, \
+ sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ store(output + 8 * stride, sub(w30[0], w54[0])); \
+ store(output + 9 * stride, \
+ add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
+ mul(kWeight4, w56[1])))); \
+ store(output + 10 * stride, \
+ add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
+ mul(kWeight2, w58[1])))); \
+ store(output + 11 * stride, \
+ add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
+ mul(kWeight3, w60[1])))); \
+ store(output + 12 * stride, sub(w31[0], w55[1])); \
+ store(output + 13 * stride, \
+ add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 14 * stride, \
+ add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 15 * stride, \
+ add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ }
+#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w30 = add(i0, i16); \
+ const T_VEC w31 = sub(i0, i16); \
+ const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \
+ const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \
+ const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \
+ const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \
+ const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \
+ const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \
+ const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \
+ const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \
+ const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \
+ const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \
+ const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \
+ const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \
+ const T_VEC w49[2] = { add(w36[0], \
+ sub(sub(kWeight0, mul(kWeight2, w44[0])), \
+ mul(kWeight2, w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \
+ const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \
+ const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \
+ const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \
+ const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \
+ const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \
+ const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \
+ const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \
+ const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \
+ const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \
+ const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \
+ const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \
+ const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \
+ const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \
+ const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \
+ const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \
+ const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \
+ const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \
+ const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \
+ const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \
+ const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \
+ const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \
+ const T_VEC w73[2] = { add(w60[0], \
+ sub(sub(kWeight0, mul(kWeight2, w68[0])), \
+ mul(kWeight2, w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \
+ const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \
+ const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \
+ const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \
+ const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \
+ const T_VEC w80[2] = { \
+ add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \
+ }; \
+ const T_VEC w81[2] = { \
+ add(w48[0], \
+ sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \
+ }; \
+ const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \
+ const T_VEC w83[2] = { add(w50[0], \
+ sub(sub(kWeight0, mul(kWeight2, w74[0])), \
+ mul(kWeight2, w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \
+ const T_VEC w84[2] = { \
+ add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \
+ }; \
+ const T_VEC w85[2] = { \
+ add(w52[0], \
+ sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \
+ }; \
+ const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \
+ const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \
+ const T_VEC w88[2] = { \
+ sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], \
+ sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w89[2] = { \
+ add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w92[2] = { \
+ sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], \
+ sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w93[2] = { \
+ add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \
+ const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \
+ const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \
+ const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \
+ const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \
+ const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \
+ const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \
+ const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \
+ const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \
+ const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \
+ const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \
+ const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \
+ const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \
+ const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \
+ const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \
+ const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \
+ const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \
+ const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \
+ const T_VEC w112[2] = { \
+ add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \
+ }; \
+ const T_VEC w113[2] = { \
+ add(w100[0], \
+ sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \
+ }; \
+ const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \
+ const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \
+ const T_VEC w116[2] = { \
+ sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w117[2] = { \
+ add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \
+ const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \
+ const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \
+ const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \
+ const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \
+ const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \
+ const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \
+ const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \
+ const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \
+ const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \
+ const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \
+ const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \
+ const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \
+ const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \
+ const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \
+ const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \
+ const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \
+ const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \
+ const T_VEC w136[2] = { \
+ add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \
+ }; \
+ const T_VEC w137[2] = { \
+ add(w124[0], \
+ sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \
+ }; \
+ const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \
+ const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \
+ const T_VEC w140[2] = { \
+ sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w141[2] = { \
+ add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \
+ const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \
+ const T_VEC w144[2] = { \
+ add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \
+ }; \
+ const T_VEC w145[2] = { \
+ add(w112[0], \
+ sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \
+ }; \
+ const T_VEC w146[2] = { \
+ add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \
+ }; \
+ const T_VEC w147[2] = { \
+ add(w114[0], \
+ sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \
+ }; \
+ const T_VEC w148[2] = { \
+ add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \
+ }; \
+ const T_VEC w149[2] = { \
+ add(w116[0], \
+ sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \
+ }; \
+ const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \
+ const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \
+ const T_VEC w152[2] = { \
+ sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], \
+ sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w153[2] = { \
+ add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w154[2] = { \
+ sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w155[2] = { \
+ add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w156[2] = { \
+ sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], \
+ sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \
+ }; \
+ const T_VEC w157[2] = { \
+ add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \
+ }; \
+ store(output + 0 * stride, add(w78[0], w142[0])); \
+ store(output + 1 * stride, \
+ add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \
+ store(output + 2 * stride, \
+ add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \
+ store(output + 3 * stride, \
+ add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \
+ store(output + 4 * stride, \
+ add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \
+ store(output + 5 * stride, \
+ add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \
+ store(output + 6 * stride, \
+ add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \
+ store(output + 7 * stride, \
+ add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \
+ store(output + 8 * stride, add(w79[0], w143[1])); \
+ store(output + 9 * stride, \
+ sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 10 * stride, \
+ sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 11 * stride, \
+ sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 12 * stride, \
+ sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 13 * stride, \
+ sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 14 * stride, \
+ sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 15 * stride, \
+ sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ store(output + 16 * stride, sub(w78[0], w142[0])); \
+ store(output + 17 * stride, \
+ add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \
+ mul(kWeight6, w144[1])))); \
+ store(output + 18 * stride, \
+ add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \
+ mul(kWeight4, w146[1])))); \
+ store(output + 19 * stride, \
+ add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \
+ mul(kWeight8, w148[1])))); \
+ store(output + 20 * stride, \
+ add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \
+ mul(kWeight2, w150[1])))); \
+ store(output + 21 * stride, \
+ add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \
+ mul(kWeight7, w152[1])))); \
+ store(output + 22 * stride, \
+ add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \
+ mul(kWeight3, w154[1])))); \
+ store(output + 23 * stride, \
+ add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \
+ mul(kWeight5, w156[1])))); \
+ store(output + 24 * stride, sub(w79[0], w143[1])); \
+ store(output + 25 * stride, \
+ add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 26 * stride, \
+ add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 27 * stride, \
+ add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 28 * stride, \
+ add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 29 * stride, \
+ add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 30 * stride, \
+ add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 31 * stride, \
+ add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ }
+
+#endif // AOM_AOM_DSP_FFT_COMMON_H_
diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c
new file mode 100644
index 000000000..e50f951c1
--- /dev/null
+++ b/third_party/aom/aom_dsp/fwd_txfm.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "aom_dsp/txfm_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+ int i, j;
+ tran_low_t intermediate[64];
+ int pass;
+ tran_low_t *output = intermediate;
+ const tran_low_t *in = NULL;
+
+ // Two passes: the first pass transforms the columns of the input block and
+ // the second pass transforms the rows of the intermediate result.
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ if (pass == 0) {
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+ ++input;
+ } else {
+ s0 = in[0 * 8] + in[7 * 8];
+ s1 = in[1 * 8] + in[6 * 8];
+ s2 = in[2 * 8] + in[5 * 8];
+ s3 = in[3 * 8] + in[4 * 8];
+ s4 = in[3 * 8] - in[4 * 8];
+ s5 = in[2 * 8] - in[5 * 8];
+ s6 = in[1 * 8] - in[6 * 8];
+ s7 = in[0 * 8] - in[7 * 8];
+ ++in;
+ }
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0] = (tran_low_t)fdct_round_shift(t0);
+ output[2] = (tran_low_t)fdct_round_shift(t2);
+ output[4] = (tran_low_t)fdct_round_shift(t1);
+ output[6] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1] = (tran_low_t)fdct_round_shift(t0);
+ output[3] = (tran_low_t)fdct_round_shift(t2);
+ output[5] = (tran_low_t)fdct_round_shift(t1);
+ output[7] = (tran_low_t)fdct_round_shift(t3);
+ output += 8;
+ }
+ in = intermediate;
+ output = final_output;
+ }
+
+ // Scale down the final output: divide every coefficient by 2.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+ }
+}
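+
+/* Illustrative call (the buffers here are hypothetical): transform one 8x8
+ * block of residuals stored row-major with a row pitch of 8 samples.
+ *
+ *   int16_t residual[8 * 8];   // input block
+ *   tran_low_t coeffs[8 * 8];  // output transform coefficients
+ *   aom_fdct8x8_c(residual, coeffs, 8);
+ */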
+
+void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ aom_fdct8x8_c(input, final_output, stride);
+}
diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c
new file mode 100644
index 000000000..b96e1c319
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_synthesis.c
@@ -0,0 +1,1409 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_mem/aom_mem.h"
+
+// Samples with a Gaussian distribution in the range of [-2048, 2047] (12 bits),
+// with zero mean and a standard deviation of about 512.
+// The values should be divided by 4 for the 10-bit range and by 16 for the
+// 8-bit range.
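+// For example, a sample used for 8-bit content would be taken as
+// gaussian_sequence[k] / 16, giving values roughly in [-128, 127].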
+static const int gaussian_sequence[2048] = {
+ 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
+ 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
+ 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
+ -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
+ 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
+ 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
+ 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
+ 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
+ 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
+ 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
+ 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
+ -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
+ 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
+ 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
+ -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
+ -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
+ -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
+ -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
+ 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
+ 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
+ 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
+ -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
+ -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
+ -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
+ 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
+ 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
+ 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
+ -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
+ 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
+ -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
+ 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
+ -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
+ 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
+ -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
+ -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
+ -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
+ -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
+ -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
+ 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
+ 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
+ -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
+ -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
+ 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
+ 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
+ -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
+ 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
+ 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
+ -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
+ 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
+ -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
+ 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
+ -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
+ -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
+ 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
+ -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
+ -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
+ 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
+ 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
+ -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
+ 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
+ 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
+ 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
+ -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
+ -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
+ -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
+ 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
+ -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
+ -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
+ -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
+ -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
+ -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
+ 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
+ -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
+ -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
+ 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
+ -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
+ -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
+ -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
+ 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
+ -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
+ 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
+ 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
+ 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
+ -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
+ -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
+ 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
+ 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
+ -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
+ -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
+ -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
+ -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
+ 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
+ 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
+ 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
+ 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
+ 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
+ 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
+ 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
+ -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
+ 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
+ -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
+ -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
+ -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
+ 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
+ -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
+ -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
+ 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
+ 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
+ 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
+ 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
+ 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
+ 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
+ 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
+ -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
+ -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
+ -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
+ 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
+ -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
+ -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
+ 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
+ -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
+ 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
+ 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
+ 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
+ -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
+ 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
+ -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
+ 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
+ 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
+ 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
+ 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
+ -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
+ -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
+ 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
+ -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
+ 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
+ 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
+ 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
+ -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
+ -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
+ 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
+ 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
+ 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
+ -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
+ -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
+ 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
+ -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
+ -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
+ -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
+ 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
+ -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
+ 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
+ -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
+ 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
+ -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
+ 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
+ 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
+ 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
+ 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
+ -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
+ -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
+ -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
+ -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
+ 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
+ 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
+ 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
+ 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
+ -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
+ 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
+ -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
+ 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
+ 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
+ -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
+ -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
+ -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
+ -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
+ 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
+ -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
+ -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
+ -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
+ -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
+ 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
+ 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
+ -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
+ -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
+ 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
+ 428, -484
+};
+
+static const int gauss_bits = 11;
+
+static int luma_subblock_size_y = 32;
+static int luma_subblock_size_x = 32;
+
+static int chroma_subblock_size_y = 16;
+static int chroma_subblock_size_x = 16;
+
+static const int min_luma_legal_range = 16;
+static const int max_luma_legal_range = 235;
+
+static const int min_chroma_legal_range = 16;
+static const int max_chroma_legal_range = 240;
+
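+// Piecewise-linear scaling look-up tables (one per plane), built from the
+// scaling points in the frame's film grain parameters.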
+static int scaling_lut_y[256];
+static int scaling_lut_cb[256];
+static int scaling_lut_cr[256];
+
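+// Grain sample range for the current bit depth: grain values are centered on
+// zero and clipped to [grain_min, grain_max] (set in av1_add_film_grain_run).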
+static int grain_center;
+static int grain_min;
+static int grain_max;
+
+static uint16_t random_register = 0; // random number generator register
+
+static void init_arrays(const aom_film_grain_t *params, int luma_stride,
+ int chroma_stride, int ***pred_pos_luma_p,
+ int ***pred_pos_chroma_p, int **luma_grain_block,
+ int **cb_grain_block, int **cr_grain_block,
+ int **y_line_buf, int **cb_line_buf, int **cr_line_buf,
+ int **y_col_buf, int **cb_col_buf, int **cr_col_buf,
+ int luma_grain_samples, int chroma_grain_samples,
+ int chroma_subsamp_y, int chroma_subsamp_x) {
+ memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256);
+ memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256);
+ memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256);
+
+ int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+ int num_pos_chroma = num_pos_luma;
+ if (params->num_y_points > 0) ++num_pos_chroma;
+
+ int **pred_pos_luma;
+ int **pred_pos_chroma;
+
+ pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma);
+
+ for (int row = 0; row < num_pos_luma; row++) {
+ pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3);
+ }
+
+ pred_pos_chroma =
+ (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma);
+
+ for (int row = 0; row < num_pos_chroma; row++) {
+ pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3);
+ }
+
+ int pos_ar_index = 0;
+
+ for (int row = -params->ar_coeff_lag; row < 0; row++) {
+ for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1;
+ col++) {
+ pred_pos_luma[pos_ar_index][0] = row;
+ pred_pos_luma[pos_ar_index][1] = col;
+ pred_pos_luma[pos_ar_index][2] = 0;
+
+ pred_pos_chroma[pos_ar_index][0] = row;
+ pred_pos_chroma[pos_ar_index][1] = col;
+ pred_pos_chroma[pos_ar_index][2] = 0;
+ ++pos_ar_index;
+ }
+ }
+
+ for (int col = -params->ar_coeff_lag; col < 0; col++) {
+ pred_pos_luma[pos_ar_index][0] = 0;
+ pred_pos_luma[pos_ar_index][1] = col;
+ pred_pos_luma[pos_ar_index][2] = 0;
+
+ pred_pos_chroma[pos_ar_index][0] = 0;
+ pred_pos_chroma[pos_ar_index][1] = col;
+ pred_pos_chroma[pos_ar_index][2] = 0;
+
+ ++pos_ar_index;
+ }
+
+ if (params->num_y_points > 0) {
+ pred_pos_chroma[pos_ar_index][0] = 0;
+ pred_pos_chroma[pos_ar_index][1] = 0;
+ pred_pos_chroma[pos_ar_index][2] = 1;
+ }
+
+ *pred_pos_luma_p = pred_pos_luma;
+ *pred_pos_chroma_p = pred_pos_chroma;
+
+ *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2);
+ *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride *
+ (2 >> chroma_subsamp_y));
+ *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride *
+ (2 >> chroma_subsamp_y));
+
+ *y_col_buf =
+ (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2);
+ *cb_col_buf =
+ (int *)aom_malloc(sizeof(**cb_col_buf) *
+ (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
+ (2 >> chroma_subsamp_x));
+ *cr_col_buf =
+ (int *)aom_malloc(sizeof(**cr_col_buf) *
+ (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
+ (2 >> chroma_subsamp_x));
+
+ *luma_grain_block =
+ (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples);
+ *cb_grain_block =
+ (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples);
+ *cr_grain_block =
+ (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
+}
+
+static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma,
+ int ***pred_pos_chroma, int **luma_grain_block,
+ int **cb_grain_block, int **cr_grain_block,
+ int **y_line_buf, int **cb_line_buf,
+ int **cr_line_buf, int **y_col_buf, int **cb_col_buf,
+ int **cr_col_buf) {
+ int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+ int num_pos_chroma = num_pos_luma;
+ if (params->num_y_points > 0) ++num_pos_chroma;
+
+ for (int row = 0; row < num_pos_luma; row++) {
+ aom_free((*pred_pos_luma)[row]);
+ }
+ aom_free(*pred_pos_luma);
+
+ for (int row = 0; row < num_pos_chroma; row++) {
+ aom_free((*pred_pos_chroma)[row]);
+ }
+ aom_free((*pred_pos_chroma));
+
+ aom_free(*y_line_buf);
+
+ aom_free(*cb_line_buf);
+
+ aom_free(*cr_line_buf);
+
+ aom_free(*y_col_buf);
+
+ aom_free(*cb_col_buf);
+
+ aom_free(*cr_col_buf);
+
+ aom_free(*luma_grain_block);
+
+ aom_free(*cb_grain_block);
+
+ aom_free(*cr_grain_block);
+}
+
+// get a number between 0 and 2^bits - 1
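+// by advancing a 16-bit LFSR (feedback taps at bits 0, 1, 3 and 12) and
+// returning its top 'bits' bits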
+static INLINE int get_random_number(int bits) {
+ uint16_t bit;
+ bit = ((random_register >> 0) ^ (random_register >> 1) ^
+ (random_register >> 3) ^ (random_register >> 12)) &
+ 1;
+ random_register = (random_register >> 1) | (bit << 15);
+ return (random_register >> (16 - bits)) & ((1 << bits) - 1);
+}
+
+static void init_random_generator(int luma_line, uint16_t seed) {
+  // this part of the register is the same for the whole picture
+
+ uint16_t msb = (seed >> 8) & 255;
+ uint16_t lsb = seed & 255;
+
+ random_register = (msb << 8) + lsb;
+
+  // this part changes for each row of luma subblocks (every 32 luma lines)
+ int luma_num = luma_line >> 5;
+
+ random_register ^= ((luma_num * 37 + 178) & 255) << 8;
+ random_register ^= ((luma_num * 173 + 105) & 255);
+}
+
+// Return 0 for success, -1 for failure
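+// Fills the luma grain template with scaled Gaussian noise and then applies
+// the auto-regressive filter defined by params->ar_coeffs_y inside the padded
+// area; the template is simply zeroed when there are no luma scaling points.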
+static int generate_luma_grain_block(
+ const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
+ int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
+ int left_pad, int top_pad, int right_pad, int bottom_pad) {
+ if (params->num_y_points == 0) {
+ memset(luma_grain_block, 0,
+ sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
+ return 0;
+ }
+
+ int bit_depth = params->bit_depth;
+ int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
+
+ int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+ int rounding_offset = (1 << (params->ar_coeff_shift - 1));
+
+ for (int i = 0; i < luma_block_size_y; i++)
+ for (int j = 0; j < luma_block_size_x; j++)
+ luma_grain_block[i * luma_grain_stride + j] =
+ (gaussian_sequence[get_random_number(gauss_bits)] +
+ ((1 << gauss_sec_shift) >> 1)) >>
+ gauss_sec_shift;
+
+ for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++)
+ for (int j = left_pad; j < luma_block_size_x - right_pad; j++) {
+ int wsum = 0;
+ for (int pos = 0; pos < num_pos_luma; pos++) {
+ wsum = wsum + params->ar_coeffs_y[pos] *
+ luma_grain_block[(i + pred_pos_luma[pos][0]) *
+ luma_grain_stride +
+ j + pred_pos_luma[pos][1]];
+ }
+ luma_grain_block[i * luma_grain_stride + j] =
+ clamp(luma_grain_block[i * luma_grain_stride + j] +
+ ((wsum + rounding_offset) >> params->ar_coeff_shift),
+ grain_min, grain_max);
+ }
+ return 0;
+}
+
+// Return 0 for success, -1 for failure
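+// Chroma counterpart of generate_luma_grain_block(). When luma scaling points
+// are present, the AR filter additionally uses the co-located
+// (subsampling-averaged) luma grain sample as a predictor.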
+static int generate_chroma_grain_blocks(
+ const aom_film_grain_t *params,
+ // int** pred_pos_luma,
+ int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block,
+ int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y,
+ int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad,
+ int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) {
+ int bit_depth = params->bit_depth;
+ int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
+
+ int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
+ if (params->num_y_points > 0) ++num_pos_chroma;
+ int rounding_offset = (1 << (params->ar_coeff_shift - 1));
+ int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride;
+
+ if (params->num_cb_points || params->chroma_scaling_from_luma) {
+ init_random_generator(7 << 5, params->random_seed);
+
+ for (int i = 0; i < chroma_block_size_y; i++)
+ for (int j = 0; j < chroma_block_size_x; j++)
+ cb_grain_block[i * chroma_grain_stride + j] =
+ (gaussian_sequence[get_random_number(gauss_bits)] +
+ ((1 << gauss_sec_shift) >> 1)) >>
+ gauss_sec_shift;
+ } else {
+ memset(cb_grain_block, 0,
+ sizeof(*cb_grain_block) * chroma_grain_block_size);
+ }
+
+ if (params->num_cr_points || params->chroma_scaling_from_luma) {
+ init_random_generator(11 << 5, params->random_seed);
+
+ for (int i = 0; i < chroma_block_size_y; i++)
+ for (int j = 0; j < chroma_block_size_x; j++)
+ cr_grain_block[i * chroma_grain_stride + j] =
+ (gaussian_sequence[get_random_number(gauss_bits)] +
+ ((1 << gauss_sec_shift) >> 1)) >>
+ gauss_sec_shift;
+ } else {
+ memset(cr_grain_block, 0,
+ sizeof(*cr_grain_block) * chroma_grain_block_size);
+ }
+
+ for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++)
+ for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) {
+ int wsum_cb = 0;
+ int wsum_cr = 0;
+ for (int pos = 0; pos < num_pos_chroma; pos++) {
+ if (pred_pos_chroma[pos][2] == 0) {
+ wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] *
+ cb_grain_block[(i + pred_pos_chroma[pos][0]) *
+ chroma_grain_stride +
+ j + pred_pos_chroma[pos][1]];
+ wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] *
+ cr_grain_block[(i + pred_pos_chroma[pos][0]) *
+ chroma_grain_stride +
+ j + pred_pos_chroma[pos][1]];
+ } else if (pred_pos_chroma[pos][2] == 1) {
+ int av_luma = 0;
+ int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad;
+ int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad;
+
+ for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1;
+ k++)
+ for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1;
+ l++)
+ av_luma += luma_grain_block[k * luma_grain_stride + l];
+
+ av_luma =
+ (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >>
+ (chroma_subsamp_y + chroma_subsamp_x);
+
+ wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
+ wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
+ } else {
+ fprintf(
+ stderr,
+ "Grain synthesis: prediction between two chroma components is "
+ "not supported!");
+ return -1;
+ }
+ }
+ if (params->num_cb_points || params->chroma_scaling_from_luma)
+ cb_grain_block[i * chroma_grain_stride + j] =
+ clamp(cb_grain_block[i * chroma_grain_stride + j] +
+ ((wsum_cb + rounding_offset) >> params->ar_coeff_shift),
+ grain_min, grain_max);
+ if (params->num_cr_points || params->chroma_scaling_from_luma)
+ cr_grain_block[i * chroma_grain_stride + j] =
+ clamp(cr_grain_block[i * chroma_grain_stride + j] +
+ ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
+ grain_min, grain_max);
+ }
+ return 0;
+}
+
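+// Builds a 256-entry piecewise-linear LUT from the given scaling points:
+// values before the first point and after the last point are held constant,
+// and values in between are interpolated with a 16.16 fixed-point slope.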
+static void init_scaling_function(const int scaling_points[][2], int num_points,
+ int scaling_lut[]) {
+ if (num_points == 0) return;
+
+ for (int i = 0; i < scaling_points[0][0]; i++)
+ scaling_lut[i] = scaling_points[0][1];
+
+ for (int point = 0; point < num_points - 1; point++) {
+ int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
+ int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
+
+ int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+
+ for (int x = 0; x < delta_x; x++) {
+ scaling_lut[scaling_points[point][0] + x] =
+ scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
+ }
+ }
+
+ for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
+ scaling_lut[i] = scaling_points[num_points - 1][1];
+}
+
+// Extracts a sample from a scaling LUT (and interpolates between neighboring
+// entries for 10- and 12-bit video)
+static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
+ int x = index >> (bit_depth - 8);
+
+ if (!(bit_depth - 8) || x == 255)
+ return scaling_lut[x];
+ else
+ return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) *
+ (index & ((1 << (bit_depth - 8)) - 1)) +
+ (1 << (bit_depth - 9))) >>
+ (bit_depth - 8));
+}
+
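+// Applies scaled grain to one 8-bit subblock. For chroma, the scaling index
+// is derived from the co-located luma value (averaged horizontally when
+// subsampled) combined with the cb/cr multipliers and offsets.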
+static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma,
+ uint8_t *cb, uint8_t *cr, int luma_stride,
+ int chroma_stride, int *luma_grain,
+ int *cb_grain, int *cr_grain,
+ int luma_grain_stride, int chroma_grain_stride,
+ int half_luma_height, int half_luma_width,
+ int bit_depth, int chroma_subsamp_y,
+ int chroma_subsamp_x, int mc_identity) {
+ int cb_mult = params->cb_mult - 128; // fixed scale
+ int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale
+ int cb_offset = params->cb_offset - 256;
+
+ int cr_mult = params->cr_mult - 128; // fixed scale
+ int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale
+ int cr_offset = params->cr_offset - 256;
+
+ int rounding_offset = (1 << (params->scaling_shift - 1));
+
+ int apply_y = params->num_y_points > 0 ? 1 : 0;
+ int apply_cb =
+ (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
+ int apply_cr =
+ (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
+
+ if (params->chroma_scaling_from_luma) {
+ cb_mult = 0; // fixed scale
+ cb_luma_mult = 64; // fixed scale
+ cb_offset = 0;
+
+ cr_mult = 0; // fixed scale
+ cr_luma_mult = 64; // fixed scale
+ cr_offset = 0;
+ }
+
+ int min_luma, max_luma, min_chroma, max_chroma;
+
+ if (params->clip_to_restricted_range) {
+ min_luma = min_luma_legal_range;
+ max_luma = max_luma_legal_range;
+
+ if (mc_identity) {
+ min_chroma = min_luma_legal_range;
+ max_chroma = max_luma_legal_range;
+ } else {
+ min_chroma = min_chroma_legal_range;
+ max_chroma = max_chroma_legal_range;
+ }
+ } else {
+ min_luma = min_chroma = 0;
+ max_luma = max_chroma = 255;
+ }
+
+ for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
+ for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
+ int average_luma = 0;
+ if (chroma_subsamp_x) {
+ average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
+ (j << chroma_subsamp_x)] +
+ luma[(i << chroma_subsamp_y) * luma_stride +
+ (j << chroma_subsamp_x) + 1] +
+ 1) >>
+ 1;
+ } else {
+ average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
+ }
+
+ if (apply_cb) {
+ cb[i * chroma_stride + j] = clamp(
+ cb[i * chroma_stride + j] +
+ ((scale_LUT(scaling_lut_cb,
+ clamp(((average_luma * cb_luma_mult +
+ cb_mult * cb[i * chroma_stride + j]) >>
+ 6) +
+ cb_offset,
+ 0, (256 << (bit_depth - 8)) - 1),
+ 8) *
+ cb_grain[i * chroma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_chroma, max_chroma);
+ }
+
+ if (apply_cr) {
+ cr[i * chroma_stride + j] = clamp(
+ cr[i * chroma_stride + j] +
+ ((scale_LUT(scaling_lut_cr,
+ clamp(((average_luma * cr_luma_mult +
+ cr_mult * cr[i * chroma_stride + j]) >>
+ 6) +
+ cr_offset,
+ 0, (256 << (bit_depth - 8)) - 1),
+ 8) *
+ cr_grain[i * chroma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_chroma, max_chroma);
+ }
+ }
+ }
+
+ if (apply_y) {
+ for (int i = 0; i < (half_luma_height << 1); i++) {
+ for (int j = 0; j < (half_luma_width << 1); j++) {
+ luma[i * luma_stride + j] =
+ clamp(luma[i * luma_stride + j] +
+ ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) *
+ luma_grain[i * luma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_luma, max_luma);
+ }
+ }
+ }
+}
+
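+// High-bit-depth variant of add_noise_to_block() operating on 16-bit samples;
+// scaling indices, offsets and clipping ranges are adjusted for bit_depth.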
+static void add_noise_to_block_hbd(
+ const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
+ int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain,
+ int *cr_grain, int luma_grain_stride, int chroma_grain_stride,
+ int half_luma_height, int half_luma_width, int bit_depth,
+ int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) {
+ int cb_mult = params->cb_mult - 128; // fixed scale
+ int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale
+ // offset value depends on the bit depth
+ int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth);
+
+ int cr_mult = params->cr_mult - 128; // fixed scale
+ int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale
+ // offset value depends on the bit depth
+ int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth);
+
+ int rounding_offset = (1 << (params->scaling_shift - 1));
+
+ int apply_y = params->num_y_points > 0 ? 1 : 0;
+ int apply_cb =
+ (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
+ : 0;
+ int apply_cr =
+ (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
+ : 0;
+
+ if (params->chroma_scaling_from_luma) {
+ cb_mult = 0; // fixed scale
+ cb_luma_mult = 64; // fixed scale
+ cb_offset = 0;
+
+ cr_mult = 0; // fixed scale
+ cr_luma_mult = 64; // fixed scale
+ cr_offset = 0;
+ }
+
+ int min_luma, max_luma, min_chroma, max_chroma;
+
+ if (params->clip_to_restricted_range) {
+ min_luma = min_luma_legal_range << (bit_depth - 8);
+ max_luma = max_luma_legal_range << (bit_depth - 8);
+
+ if (mc_identity) {
+ min_chroma = min_luma_legal_range << (bit_depth - 8);
+ max_chroma = max_luma_legal_range << (bit_depth - 8);
+ } else {
+ min_chroma = min_chroma_legal_range << (bit_depth - 8);
+ max_chroma = max_chroma_legal_range << (bit_depth - 8);
+ }
+ } else {
+ min_luma = min_chroma = 0;
+ max_luma = max_chroma = (256 << (bit_depth - 8)) - 1;
+ }
+
+ for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
+ for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
+ int average_luma = 0;
+ if (chroma_subsamp_x) {
+ average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
+ (j << chroma_subsamp_x)] +
+ luma[(i << chroma_subsamp_y) * luma_stride +
+ (j << chroma_subsamp_x) + 1] +
+ 1) >>
+ 1;
+ } else {
+ average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
+ }
+
+ if (apply_cb) {
+ cb[i * chroma_stride + j] = clamp(
+ cb[i * chroma_stride + j] +
+ ((scale_LUT(scaling_lut_cb,
+ clamp(((average_luma * cb_luma_mult +
+ cb_mult * cb[i * chroma_stride + j]) >>
+ 6) +
+ cb_offset,
+ 0, (256 << (bit_depth - 8)) - 1),
+ bit_depth) *
+ cb_grain[i * chroma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_chroma, max_chroma);
+ }
+ if (apply_cr) {
+ cr[i * chroma_stride + j] = clamp(
+ cr[i * chroma_stride + j] +
+ ((scale_LUT(scaling_lut_cr,
+ clamp(((average_luma * cr_luma_mult +
+ cr_mult * cr[i * chroma_stride + j]) >>
+ 6) +
+ cr_offset,
+ 0, (256 << (bit_depth - 8)) - 1),
+ bit_depth) *
+ cr_grain[i * chroma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_chroma, max_chroma);
+ }
+ }
+ }
+
+ if (apply_y) {
+ for (int i = 0; i < (half_luma_height << 1); i++) {
+ for (int j = 0; j < (half_luma_width << 1); j++) {
+ luma[i * luma_stride + j] =
+ clamp(luma[i * luma_stride + j] +
+ ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j],
+ bit_depth) *
+ luma_grain[i * luma_grain_stride + j] +
+ rounding_offset) >>
+ params->scaling_shift),
+ min_luma, max_luma);
+ }
+ }
+ }
+}
+
+static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int width, int height,
+ int use_high_bit_depth) {
+ int hbd_coeff = use_high_bit_depth ? 2 : 1;
+ while (height) {
+ memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff);
+ src += src_stride;
+ dst += dst_stride;
+ --height;
+ }
+ return;
+}
+
+static void copy_area(int *src, int src_stride, int *dst, int dst_stride,
+ int width, int height) {
+ while (height) {
+ memcpy(dst, src, width * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ --height;
+ }
+ return;
+}
+
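+// Replicates the last column and/or row so that the plane dimensions used for
+// grain application become even; dst is reinterpreted as uint16_t when
+// use_high_bit_depth is set.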
+static void extend_even(uint8_t *dst, int dst_stride, int width, int height,
+ int use_high_bit_depth) {
+ if ((width & 1) == 0 && (height & 1) == 0) return;
+ if (use_high_bit_depth) {
+ uint16_t *dst16 = (uint16_t *)dst;
+ int dst16_stride = dst_stride / 2;
+ if (width & 1) {
+ for (int i = 0; i < height; ++i)
+ dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1];
+ }
+ width = (width + 1) & (~1);
+ if (height & 1) {
+ memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride],
+ sizeof(*dst16) * width);
+ }
+ } else {
+ if (width & 1) {
+ for (int i = 0; i < height; ++i)
+ dst[i * dst_stride + width] = dst[i * dst_stride + width - 1];
+ }
+ width = (width + 1) & (~1);
+ if (height & 1) {
+ memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride],
+ sizeof(*dst) * width);
+ }
+ }
+}
+
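+// Blends grain across a vertical subblock boundary with fixed weights
+// ((23,22)/32 for a 1-sample overlap; (27,17)/32 and (17,27)/32 for a
+// 2-sample overlap), clamping the result to the grain range.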
+static void ver_boundary_overlap(int *left_block, int left_stride,
+ int *right_block, int right_stride,
+ int *dst_block, int dst_stride, int width,
+ int height) {
+ if (width == 1) {
+ while (height) {
+ *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5,
+ grain_min, grain_max);
+ left_block += left_stride;
+ right_block += right_stride;
+ dst_block += dst_stride;
+ --height;
+ }
+ return;
+ } else if (width == 2) {
+ while (height) {
+ dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5,
+ grain_min, grain_max);
+ dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5,
+ grain_min, grain_max);
+ left_block += left_stride;
+ right_block += right_stride;
+ dst_block += dst_stride;
+ --height;
+ }
+ return;
+ }
+}
+
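+// Horizontal counterpart of ver_boundary_overlap(), blending across a
+// horizontal subblock boundary with the same fixed weights.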
+static void hor_boundary_overlap(int *top_block, int top_stride,
+ int *bottom_block, int bottom_stride,
+ int *dst_block, int dst_stride, int width,
+ int height) {
+ if (height == 1) {
+ while (width) {
+ *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5,
+ grain_min, grain_max);
+ ++top_block;
+ ++bottom_block;
+ ++dst_block;
+ --width;
+ }
+ return;
+ } else if (height == 2) {
+ while (width) {
+ dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5,
+ grain_min, grain_max);
+ dst_block[dst_stride] = clamp((17 * top_block[top_stride] +
+ 27 * bottom_block[bottom_stride] + 16) >>
+ 5,
+ grain_min, grain_max);
+ ++top_block;
+ ++bottom_block;
+ ++dst_block;
+ --width;
+ }
+ return;
+ }
+}
+
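+// Copies src into dst (extending odd dimensions to even) and then applies
+// film grain to dst in place via av1_add_film_grain_run().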
+int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
+ aom_image_t *dst) {
+ uint8_t *luma, *cb, *cr;
+ int height, width, luma_stride, chroma_stride;
+ int use_high_bit_depth = 0;
+ int chroma_subsamp_x = 0;
+ int chroma_subsamp_y = 0;
+ int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 1 : 0;
+
+ switch (src->fmt) {
+ case AOM_IMG_FMT_AOMI420:
+ case AOM_IMG_FMT_I420:
+ use_high_bit_depth = 0;
+ chroma_subsamp_x = 1;
+ chroma_subsamp_y = 1;
+ break;
+ case AOM_IMG_FMT_I42016:
+ use_high_bit_depth = 1;
+ chroma_subsamp_x = 1;
+ chroma_subsamp_y = 1;
+ break;
+ // case AOM_IMG_FMT_444A:
+ case AOM_IMG_FMT_I444:
+ use_high_bit_depth = 0;
+ chroma_subsamp_x = 0;
+ chroma_subsamp_y = 0;
+ break;
+ case AOM_IMG_FMT_I44416:
+ use_high_bit_depth = 1;
+ chroma_subsamp_x = 0;
+ chroma_subsamp_y = 0;
+ break;
+ case AOM_IMG_FMT_I422:
+ use_high_bit_depth = 0;
+ chroma_subsamp_x = 1;
+ chroma_subsamp_y = 0;
+ break;
+ case AOM_IMG_FMT_I42216:
+ use_high_bit_depth = 1;
+ chroma_subsamp_x = 1;
+ chroma_subsamp_y = 0;
+ break;
+ default: // unknown input format
+ fprintf(stderr, "Film grain error: input format is not supported!");
+ return -1;
+ }
+
+ assert(params->bit_depth == src->bit_depth);
+
+ dst->fmt = src->fmt;
+ dst->bit_depth = src->bit_depth;
+
+ dst->r_w = src->r_w;
+ dst->r_h = src->r_h;
+ dst->d_w = src->d_w;
+ dst->d_h = src->d_h;
+
+ dst->cp = src->cp;
+ dst->tc = src->tc;
+ dst->mc = src->mc;
+
+ dst->monochrome = src->monochrome;
+ dst->csp = src->csp;
+ dst->range = src->range;
+
+ dst->x_chroma_shift = src->x_chroma_shift;
+ dst->y_chroma_shift = src->y_chroma_shift;
+
+ dst->temporal_id = src->temporal_id;
+ dst->spatial_id = src->spatial_id;
+
+ width = src->d_w % 2 ? src->d_w + 1 : src->d_w;
+ height = src->d_h % 2 ? src->d_h + 1 : src->d_h;
+
+ copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
+ dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
+ src->d_h, use_high_bit_depth);
+ // Note that dst is already assumed to be aligned to even.
+ extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
+ src->d_h, use_high_bit_depth);
+
+ if (!src->monochrome) {
+ copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
+ dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
+ width >> chroma_subsamp_x, height >> chroma_subsamp_y,
+ use_high_bit_depth);
+
+ copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V],
+ dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V],
+ width >> chroma_subsamp_x, height >> chroma_subsamp_y,
+ use_high_bit_depth);
+ }
+
+ luma = dst->planes[AOM_PLANE_Y];
+ cb = dst->planes[AOM_PLANE_U];
+ cr = dst->planes[AOM_PLANE_V];
+
+ // luma and chroma strides in samples
+ luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
+ chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
+
+ return av1_add_film_grain_run(
+ params, luma, cb, cr, height, width, luma_stride, chroma_stride,
+ use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+}
+
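+// Core grain application: builds per-frame luma and chroma grain templates,
+// then walks the frame in 32x32-luma subblocks, copying grain from
+// pseudo-random offsets within the templates and, when overlap_flag is set,
+// blending across subblock boundaries through the line/column buffers.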
+int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
+ uint8_t *cb, uint8_t *cr, int height, int width,
+ int luma_stride, int chroma_stride,
+ int use_high_bit_depth, int chroma_subsamp_y,
+ int chroma_subsamp_x, int mc_identity) {
+ int **pred_pos_luma;
+ int **pred_pos_chroma;
+ int *luma_grain_block;
+ int *cb_grain_block;
+ int *cr_grain_block;
+
+ int *y_line_buf;
+ int *cb_line_buf;
+ int *cr_line_buf;
+
+ int *y_col_buf;
+ int *cb_col_buf;
+ int *cr_col_buf;
+
+ random_register = params->random_seed;
+
+ int left_pad = 3;
+  int right_pad = 3;  // padding to account for the AR coefficient lag
+ int top_pad = 3;
+ int bottom_pad = 0;
+
+ int ar_padding = 3; // maximum lag used for stabilization of AR coefficients
+
+ luma_subblock_size_y = 32;
+ luma_subblock_size_x = 32;
+
+ chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y;
+ chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x;
+
+  // Initial padding is only needed for generation of the
+  // film grain templates (to stabilize the AR process).
+  // Only a 64x64 luma and 32x32 chroma part of a template
+  // is used later for adding grain; the padding can be discarded.
+
+ int luma_block_size_y =
+ top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad;
+ int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 +
+ 2 * ar_padding + right_pad;
+
+ int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
+ chroma_subblock_size_y * 2 + bottom_pad;
+ int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
+ chroma_subblock_size_x * 2 +
+ (2 >> chroma_subsamp_x) * ar_padding + right_pad;
+
+ int luma_grain_stride = luma_block_size_x;
+ int chroma_grain_stride = chroma_block_size_x;
+
+ int overlap = params->overlap_flag;
+ int bit_depth = params->bit_depth;
+
+ grain_center = 128 << (bit_depth - 8);
+ grain_min = 0 - grain_center;
+ grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
+
+ init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
+ &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
+ &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
+ &y_col_buf, &cb_col_buf, &cr_col_buf,
+ luma_block_size_y * luma_block_size_x,
+ chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
+ chroma_subsamp_x);
+
+ if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
+ luma_block_size_y, luma_block_size_x,
+ luma_grain_stride, left_pad, top_pad, right_pad,
+ bottom_pad))
+ return -1;
+
+ if (generate_chroma_grain_blocks(
+ params,
+ // pred_pos_luma,
+ pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block,
+ luma_grain_stride, chroma_block_size_y, chroma_block_size_x,
+ chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad,
+ chroma_subsamp_y, chroma_subsamp_x))
+ return -1;
+
+ init_scaling_function(params->scaling_points_y, params->num_y_points,
+ scaling_lut_y);
+
+ if (params->chroma_scaling_from_luma) {
+ memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
+ memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
+ } else {
+ init_scaling_function(params->scaling_points_cb, params->num_cb_points,
+ scaling_lut_cb);
+ init_scaling_function(params->scaling_points_cr, params->num_cr_points,
+ scaling_lut_cr);
+ }
+ for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) {
+ init_random_generator(y * 2, params->random_seed);
+
+ for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) {
+ int offset_y = get_random_number(8);
+ int offset_x = (offset_y >> 4) & 15;
+ offset_y &= 15;
+
+ int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1);
+ int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1);
+
+ int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
+ offset_y * (2 >> chroma_subsamp_y);
+ int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
+ offset_x * (2 >> chroma_subsamp_x);
+
+ if (overlap && x) {
+ ver_boundary_overlap(
+ y_col_buf, 2,
+ luma_grain_block + luma_offset_y * luma_grain_stride +
+ luma_offset_x,
+ luma_grain_stride, y_col_buf, 2, 2,
+ AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
+
+ ver_boundary_overlap(
+ cb_col_buf, 2 >> chroma_subsamp_x,
+ cb_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x,
+ chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_x,
+ AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+ (height - (y << 1)) >> chroma_subsamp_y));
+
+ ver_boundary_overlap(
+ cr_col_buf, 2 >> chroma_subsamp_x,
+ cr_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x,
+ chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_x,
+ AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+ (height - (y << 1)) >> chroma_subsamp_y));
+
+ int i = y ? 1 : 0;
+
+ if (use_high_bit_depth) {
+ add_noise_to_block_hbd(
+ params,
+ (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1),
+ (uint16_t *)cb +
+ ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << (1 - chroma_subsamp_x)),
+ (uint16_t *)cr +
+ ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << (1 - chroma_subsamp_x)),
+ luma_stride, chroma_stride, y_col_buf + i * 4,
+ cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+ cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+ 2, (2 - chroma_subsamp_x),
+ AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
+ bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ } else {
+ add_noise_to_block(
+ params, luma + ((y + i) << 1) * luma_stride + (x << 1),
+ cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << (1 - chroma_subsamp_x)),
+ cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << (1 - chroma_subsamp_x)),
+ luma_stride, chroma_stride, y_col_buf + i * 4,
+ cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+ cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
+ 2, (2 - chroma_subsamp_x),
+ AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
+ bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ }
+ }
+
+ if (overlap && y) {
+ if (x) {
+ hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2,
+ y_line_buf + (x << 1), luma_stride, 2, 2);
+
+ hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x),
+ chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+ cb_line_buf + x * (2 >> chroma_subsamp_x),
+ chroma_stride, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_y);
+
+ hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x),
+ chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+ cr_line_buf + x * (2 >> chroma_subsamp_x),
+ chroma_stride, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_y);
+ }
+
+ hor_boundary_overlap(
+ y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+ luma_grain_block + luma_offset_y * luma_grain_stride +
+ luma_offset_x + (x ? 2 : 0),
+ luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+ AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1),
+ width - ((x ? x + 1 : 0) << 1)),
+ 2);
+
+ hor_boundary_overlap(
+ cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ cb_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_grain_stride,
+ cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ AOMMIN(chroma_subblock_size_x -
+ ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+ (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+ 2 >> chroma_subsamp_y);
+
+ hor_boundary_overlap(
+ cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ cr_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_grain_stride,
+ cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ AOMMIN(chroma_subblock_size_x -
+ ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+ (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+ 2 >> chroma_subsamp_y);
+
+ if (use_high_bit_depth) {
+ add_noise_to_block_hbd(
+ params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
+ (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << ((1 - chroma_subsamp_x))),
+ (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << ((1 - chroma_subsamp_x))),
+ luma_stride, chroma_stride, y_line_buf + (x << 1),
+ cb_line_buf + (x << (1 - chroma_subsamp_x)),
+ cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+ chroma_stride, 1,
+ AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+ chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ } else {
+ add_noise_to_block(
+ params, luma + (y << 1) * luma_stride + (x << 1),
+ cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << ((1 - chroma_subsamp_x))),
+ cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+ (x << ((1 - chroma_subsamp_x))),
+ luma_stride, chroma_stride, y_line_buf + (x << 1),
+ cb_line_buf + (x << (1 - chroma_subsamp_x)),
+ cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+ chroma_stride, 1,
+ AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+ chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ }
+ }
+
+ int i = overlap && y ? 1 : 0;
+ int j = overlap && x ? 1 : 0;
+
+ if (use_high_bit_depth) {
+ add_noise_to_block_hbd(
+ params,
+ (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+ (uint16_t *)cb +
+ ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ ((x + j) << (1 - chroma_subsamp_x)),
+ (uint16_t *)cr +
+ ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ ((x + j) << (1 - chroma_subsamp_x)),
+ luma_stride, chroma_stride,
+ luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+ luma_offset_x + (j << 1),
+ cb_grain_block +
+ (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+ chroma_grain_stride +
+ chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+ cr_grain_block +
+ (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+ chroma_grain_stride +
+ chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+ luma_grain_stride, chroma_grain_stride,
+ AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+ AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+ chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ } else {
+ add_noise_to_block(
+ params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+ cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ ((x + j) << (1 - chroma_subsamp_x)),
+ cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+ ((x + j) << (1 - chroma_subsamp_x)),
+ luma_stride, chroma_stride,
+ luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+ luma_offset_x + (j << 1),
+ cb_grain_block +
+ (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+ chroma_grain_stride +
+ chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+ cr_grain_block +
+ (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+ chroma_grain_stride +
+ chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+ luma_grain_stride, chroma_grain_stride,
+ AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+ AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+ chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+ }
+
+ if (overlap) {
+ if (x) {
+          // Copy overlapped column buffer to line buffer
+ copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
+ y_line_buf + (x << 1), luma_stride, 2, 2);
+
+ copy_area(
+ cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
+ 2 >> chroma_subsamp_x,
+ cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
+ 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
+
+ copy_area(
+ cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
+ 2 >> chroma_subsamp_x,
+ cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
+ 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
+ }
+
+ // Copy grain to the line buffer for overlap with a bottom block
+ copy_area(
+ luma_grain_block +
+ (luma_offset_y + luma_subblock_size_y) * luma_grain_stride +
+ luma_offset_x + ((x ? 2 : 0)),
+ luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
+ AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2);
+
+ copy_area(cb_grain_block +
+ (chroma_offset_y + chroma_subblock_size_y) *
+ chroma_grain_stride +
+ chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
+ chroma_grain_stride,
+ cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ AOMMIN(chroma_subblock_size_x,
+ ((width - (x << 1)) >> chroma_subsamp_x)) -
+ (x ? 2 >> chroma_subsamp_x : 0),
+ 2 >> chroma_subsamp_y);
+
+ copy_area(cr_grain_block +
+ (chroma_offset_y + chroma_subblock_size_y) *
+ chroma_grain_stride +
+ chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
+ chroma_grain_stride,
+ cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
+ chroma_stride,
+ AOMMIN(chroma_subblock_size_x,
+ ((width - (x << 1)) >> chroma_subsamp_x)) -
+ (x ? 2 >> chroma_subsamp_x : 0),
+ 2 >> chroma_subsamp_y);
+
+ // Copy grain to the column buffer for overlap with the next block to
+ // the right
+
+ copy_area(luma_grain_block + luma_offset_y * luma_grain_stride +
+ luma_offset_x + luma_subblock_size_x,
+ luma_grain_stride, y_col_buf, 2, 2,
+ AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
+
+ copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x + chroma_subblock_size_x,
+ chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_x,
+ AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+ (height - (y << 1)) >> chroma_subsamp_y));
+
+ copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride +
+ chroma_offset_x + chroma_subblock_size_x,
+ chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
+ 2 >> chroma_subsamp_x,
+ AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
+ (height - (y << 1)) >> chroma_subsamp_y));
+ }
+ }
+ }
+
+ dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block,
+ &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf,
+ &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf);
+ return 0;
+}
diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h
new file mode 100644
index 000000000..7aee6f6f4
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_synthesis.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom/aom_image.h"
+
+/*!\brief Structure containing film grain synthesis parameters for a frame
+ *
+ * This structure contains input parameters for film grain synthesis
+ */
+typedef struct {
+ int apply_grain;
+
+ int update_parameters;
+
+ // 8 bit values
+ int scaling_points_y[14][2];
+ int num_y_points; // value: 0..14
+
+ // 8 bit values
+ int scaling_points_cb[10][2];
+ int num_cb_points; // value: 0..10
+
+ // 8 bit values
+ int scaling_points_cr[10][2];
+ int num_cr_points; // value: 0..10
+
+ int scaling_shift; // values : 8..11
+
+ int ar_coeff_lag; // values: 0..3
+
+ // 8 bit values
+ int ar_coeffs_y[24];
+ int ar_coeffs_cb[25];
+ int ar_coeffs_cr[25];
+
+ // Shift value: AR coeffs range
+ // 6: [-2, 2)
+ // 7: [-1, 1)
+ // 8: [-0.5, 0.5)
+ // 9: [-0.25, 0.25)
+ int ar_coeff_shift; // values : 6..9
+
+ int cb_mult; // 8 bits
+ int cb_luma_mult; // 8 bits
+ int cb_offset; // 9 bits
+
+ int cr_mult; // 8 bits
+ int cr_luma_mult; // 8 bits
+ int cr_offset; // 9 bits
+
+ int overlap_flag;
+
+ int clip_to_restricted_range;
+
+ unsigned int bit_depth; // video bit depth
+
+ int chroma_scaling_from_luma;
+
+ int grain_scale_shift;
+
+ uint16_t random_seed;
+} aom_film_grain_t;
+
+/*!\brief Add film grain
+ *
+ * Add film grain to the given luma and chroma planes in place
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in] grain_params Grain parameters
+ * \param[in,out] luma          luma plane
+ * \param[in,out] cb            cb plane
+ * \param[in,out] cr            cr plane
+ * \param[in] height luma plane height
+ * \param[in] width luma plane width
+ * \param[in] luma_stride luma plane stride
+ * \param[in] chroma_stride chroma plane stride
+ */
+int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
+ uint8_t *cb, uint8_t *cr, int height, int width,
+ int luma_stride, int chroma_stride,
+ int use_high_bit_depth, int chroma_subsamp_y,
+ int chroma_subsamp_x, int mc_identity);
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in] grain_params Grain parameters
+ * \param[in] src Source image
+ * \param[out] dst Resulting image with grain
+ */
+int av1_add_film_grain(const aom_film_grain_t *grain_params,
+ const aom_image_t *src, aom_image_t *dst);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c
new file mode 100644
index 000000000..0d6a73f55
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This file has the implementation details of the grain table.
+ *
+ * The file format is an ASCII representation for readability and
+ * editability. Array parameters are separated from the non-array
+ * parameters and prefixed with a few characters so that they are easy
+ * to locate within a parameter set. Each entry is prefixed with "E",
+ * and the other parameters are only specified if "update-parms" is
+ * non-zero.
+ *
+ * filmgrn1
+ * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
+ * p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
+ * sY <num_y_points> <point_0_x> <point_0_y> ...
+ * sCb <num_cb_points> <point_0_x> <point_0_y> ...
+ * sCr <num_cr_points> <point_0_x> <point_0_y> ...
+ * cY <ar_coeff_y_0> ....
+ * cCb <ar_coeff_cb_0> ....
+ * cCr <ar_coeff_cr_0> ....
+ * E <start-time> ...
+ */
+#include <string.h>
+#include <stdio.h>
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/grain_table.h"
+#include "aom_mem/aom_mem.h"
+
+static const char kFileMagic[8] = "filmgrn1";
+
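+// Reads one grain table entry (and, if update_parameters is set, its full
+// parameter block) from 'file', reporting failures through 'error_info'.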
+static void grain_table_entry_read(FILE *file,
+ struct aom_internal_error_info *error_info,
+ aom_film_grain_table_entry_t *entry) {
+ aom_film_grain_t *pars = &entry->params;
+ int num_read =
+ fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time,
+ &entry->end_time, &pars->apply_grain, &pars->random_seed,
+ &pars->update_parameters);
+ if (num_read == 0 && feof(file)) return;
+ if (num_read != 5) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read entry header. Read %d != 5", num_read);
+ return;
+ }
+ if (pars->update_parameters) {
+ num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
+ &pars->ar_coeff_lag, &pars->ar_coeff_shift,
+ &pars->grain_scale_shift, &pars->scaling_shift,
+ &pars->chroma_scaling_from_luma, &pars->overlap_flag,
+ &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
+ &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
+ if (num_read != 12) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read entry params. Read %d != 12",
+ num_read);
+ return;
+ }
+ if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read num y points");
+ return;
+ }
+ for (int i = 0; i < pars->num_y_points; ++i) {
+ if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0],
+ &pars->scaling_points_y[i][1])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read y scaling points");
+ return;
+ }
+ }
+ if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read num cb points");
+ return;
+ }
+ for (int i = 0; i < pars->num_cb_points; ++i) {
+ if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0],
+ &pars->scaling_points_cb[i][1])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read cb scaling points");
+ return;
+ }
+ }
+ if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read num cr points");
+ return;
+ }
+ for (int i = 0; i < pars->num_cr_points; ++i) {
+ if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0],
+ &pars->scaling_points_cr[i][1])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read cr scaling points");
+ return;
+ }
+ }
+
+ fscanf(file, "\n\tcY");
+ const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ for (int i = 0; i < n; ++i) {
+ if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read Y coeffs");
+ return;
+ }
+ }
+ fscanf(file, "\n\tcCb");
+ for (int i = 0; i <= n; ++i) {
+ if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read Cb coeffs");
+ return;
+ }
+ }
+ fscanf(file, "\n\tcCr");
+ for (int i = 0; i <= n; ++i) {
+ if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read Cr coeffs");
+ return;
+ }
+ }
+ fscanf(file, "\n");
+ }
+}
+
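+// Writes one grain table entry in the text format described at the top of
+// this file.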
+void grain_table_entry_write(FILE *file, aom_film_grain_table_entry_t *entry) {
+ const aom_film_grain_t *pars = &entry->params;
+ fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time,
+ entry->end_time, pars->apply_grain, pars->random_seed,
+ pars->update_parameters);
+ if (pars->update_parameters) {
+ fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n",
+ pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift,
+ pars->scaling_shift, pars->chroma_scaling_from_luma,
+ pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult,
+ pars->cb_offset, pars->cr_mult, pars->cr_luma_mult,
+ pars->cr_offset);
+ fprintf(file, "\tsY %d ", pars->num_y_points);
+ for (int i = 0; i < pars->num_y_points; ++i) {
+ fprintf(file, " %d %d", pars->scaling_points_y[i][0],
+ pars->scaling_points_y[i][1]);
+ }
+ fprintf(file, "\n\tsCb %d", pars->num_cb_points);
+ for (int i = 0; i < pars->num_cb_points; ++i) {
+ fprintf(file, " %d %d", pars->scaling_points_cb[i][0],
+ pars->scaling_points_cb[i][1]);
+ }
+ fprintf(file, "\n\tsCr %d", pars->num_cr_points);
+ for (int i = 0; i < pars->num_cr_points; ++i) {
+ fprintf(file, " %d %d", pars->scaling_points_cr[i][0],
+ pars->scaling_points_cr[i][1]);
+ }
+ fprintf(file, "\n\tcY");
+ const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ for (int i = 0; i < n; ++i) {
+ fprintf(file, " %d", pars->ar_coeffs_y[i]);
+ }
+ fprintf(file, "\n\tcCb");
+ for (int i = 0; i <= n; ++i) {
+ fprintf(file, " %d", pars->ar_coeffs_cb[i]);
+ }
+ fprintf(file, "\n\tcCr");
+ for (int i = 0; i <= n; ++i) {
+ fprintf(file, " %d", pars->ar_coeffs_cr[i]);
+ }
+ fprintf(file, "\n");
+ }
+}
+
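+// Appends a new entry covering [time_stamp, end_time) unless the parameters
+// are identical to those of the current tail entry, in which case the tail's
+// time range is simply extended to cover the new interval.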
+void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
+ int64_t end_time,
+ const aom_film_grain_t *grain) {
+ if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
+ aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
+ memset(new_tail, 0, sizeof(*new_tail));
+ if (t->tail) t->tail->next = new_tail;
+ if (!t->head) t->head = new_tail;
+ t->tail = new_tail;
+
+ new_tail->start_time = time_stamp;
+ new_tail->end_time = end_time;
+ new_tail->params = *grain;
+ } else {
+ t->tail->end_time = AOMMAX(t->tail->end_time, end_time);
+ t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp);
+ }
+}
+
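+// Finds the entry whose time range contains time_stamp, copies its parameters
+// into 'grain' (when non-NULL) and, if 'erase' is set, removes
+// [time_stamp, end_time) from the table by trimming or splitting the matching
+// entry, recursing into later entries when the erased range spans several of
+// them. Returns 1 if a matching entry was found, 0 otherwise.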
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+ int64_t end_time, int erase,
+ aom_film_grain_t *grain) {
+ aom_film_grain_table_entry_t *entry = t->head;
+ aom_film_grain_table_entry_t *prev_entry = 0;
+ int16_t random_seed = grain ? grain->random_seed : 0;
+ if (grain) memset(grain, 0, sizeof(*grain));
+
+ while (entry) {
+ aom_film_grain_table_entry_t *next = entry->next;
+ if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
+ if (grain) {
+ *grain = entry->params;
+ if (time_stamp != 0) grain->random_seed = random_seed;
+ }
+ if (!erase) return 1;
+
+ const int64_t entry_end_time = entry->end_time;
+ if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
+ if (t->tail == entry) t->tail = prev_entry;
+ if (prev_entry) {
+ prev_entry->next = entry->next;
+ } else {
+ t->head = entry->next;
+ }
+ aom_free(entry);
+ } else if (time_stamp <= entry->start_time &&
+ end_time < entry->end_time) {
+ entry->start_time = end_time;
+ } else if (time_stamp > entry->start_time &&
+ end_time >= entry->end_time) {
+ entry->end_time = time_stamp;
+ } else {
+ aom_film_grain_table_entry_t *new_entry =
+ aom_malloc(sizeof(*new_entry));
+ new_entry->next = entry->next;
+ new_entry->start_time = end_time;
+ new_entry->end_time = entry->end_time;
+ new_entry->params = entry->params;
+ entry->next = new_entry;
+ entry->end_time = time_stamp;
+ if (t->tail == entry) t->tail = new_entry;
+ }
+      // If segments aren't aligned, delete from the beginning of subsequent
+      // segments.
+ if (end_time > entry_end_time) {
+ aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0);
+ }
+ return 1;
+ }
+ prev_entry = entry;
+ entry = next;
+ }
+ return 0;
+}
+
+aom_codec_err_t aom_film_grain_table_read(
+ aom_film_grain_table_t *t, const char *filename,
+ struct aom_internal_error_info *error_info) {
+ FILE *file = fopen(filename, "rb");
+ if (!file) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
+ filename);
+ return error_info->error_code;
+ }
+ error_info->error_code = AOM_CODEC_OK;
+
+ // Read in one extra character as there should be white space after
+ // the header.
+ char magic[9];
+ if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read (or invalid) file magic");
+ fclose(file);
+ return error_info->error_code;
+ }
+
+ aom_film_grain_table_entry_t *prev_entry = 0;
+ while (!feof(file)) {
+ aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
+ memset(entry, 0, sizeof(*entry));
+ grain_table_entry_read(file, error_info, entry);
+ entry->next = 0;
+
+ if (prev_entry) prev_entry->next = entry;
+ if (!t->head) t->head = entry;
+ t->tail = entry;
+ prev_entry = entry;
+
+ if (error_info->error_code != AOM_CODEC_OK) break;
+ }
+
+ fclose(file);
+ return error_info->error_code;
+}
+
+aom_codec_err_t aom_film_grain_table_write(
+ const aom_film_grain_table_t *t, const char *filename,
+ struct aom_internal_error_info *error_info) {
+ error_info->error_code = AOM_CODEC_OK;
+
+ FILE *file = fopen(filename, "wb");
+ if (!file) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s",
+ filename);
+ return error_info->error_code;
+ }
+
+ if (!fwrite(kFileMagic, 8, 1, file)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to write file magic");
+ fclose(file);
+ return error_info->error_code;
+ }
+
+ fprintf(file, "\n");
+ aom_film_grain_table_entry_t *entry = t->head;
+ while (entry) {
+ grain_table_entry_write(file, entry);
+ entry = entry->next;
+ }
+ fclose(file);
+ return error_info->error_code;
+}
+
+void aom_film_grain_table_free(aom_film_grain_table_t *t) {
+ aom_film_grain_table_entry_t *entry = t->head;
+ while (entry) {
+ aom_film_grain_table_entry_t *next = entry->next;
+ aom_free(entry);
+ entry = next;
+ }
+ memset(t, 0, sizeof(*t));
+}
diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h
new file mode 100644
index 000000000..a8ac50730
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief A table mapping from time to corresponding film grain parameters.
+ *
+ * In order to apply grain synthesis in the decoder, the film grain parameters
+ * need to be signalled by the encoder. The film grain parameters are time
+ * varying, and for two-pass encoding (and denoiser implementation
+ * flexibility) it is common to denoise the video and do parameter estimation
+ * before encoding the denoised video.
+ *
+ * The film grain table is used to provide this flexibility and is used as a
+ * parameter that is passed to the encoder.
+ *
+ * Further, if regraining is to be done in, say, a single-pass mode, or in
+ * two-pass mode within the encoder (before frames are added to the lookahead
+ * buffer), this data structure can be used to keep track of the grain
+ * parameters estimated on the fly, which are then extracted from the table
+ * before the encoded frame is written.
+ */
+#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
+#define AOM_AOM_DSP_GRAIN_TABLE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/grain_synthesis.h"
+#include "aom/internal/aom_codec_internal.h"
+
+typedef struct aom_film_grain_table_entry_t {
+ aom_film_grain_t params;
+ int64_t start_time;
+ int64_t end_time;
+ struct aom_film_grain_table_entry_t *next;
+} aom_film_grain_table_entry_t;
+
+typedef struct {
+ aom_film_grain_table_entry_t *head;
+ aom_film_grain_table_entry_t *tail;
+} aom_film_grain_table_t;
+
+/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
+ * parameters
+ *
+ * \param[in,out] table The grain table
+ * \param[in] time_stamp The start time stamp
+ * \param[in] end_time The end time stamp
+ * \param[in] grain The grain parameters
+ */
+void aom_film_grain_table_append(aom_film_grain_table_t *table,
+ int64_t time_stamp, int64_t end_time,
+ const aom_film_grain_t *grain);
+
+/*!\brief Look-up (and optionally erase) the grain parameters for the given time
+ *
+ * \param[in] table The grain table
+ * \param[in] time_stamp The start time stamp
+ * \param[in] end_time The end time stamp
+ * \param[in] erase Whether the time segment can be deleted
+ * \param[out] grain The output grain parameters
+ */
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+ int64_t end_time, int erase,
+ aom_film_grain_t *grain);
+
+/*!\brief Reads the grain table from a file.
+ *
+ * \param[out] table The grain table
+ * \param[in] filename The file to read from
+ * \param[in] error_info Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_read(
+ aom_film_grain_table_t *table, const char *filename,
+ struct aom_internal_error_info *error_info);
+
+/*!\brief Writes the grain table to a file.
+ *
+ * \param[in] table The grain table
+ * \param[in] filename The file to write to
+ * \param[in] error_info Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_write(
+ const aom_film_grain_table_t *t, const char *filename,
+ struct aom_internal_error_info *error_info);
+
+void aom_film_grain_table_free(aom_film_grain_table_t *t);
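+
+/* A minimal usage sketch (illustrative only; error handling is omitted and
+ * 'grain_params', 'error_info' and the file name are assumed to be provided
+ * by the caller):
+ *
+ *   aom_film_grain_table_t table = { 0 };
+ *   aom_film_grain_table_append(&table, 0, 1000, &grain_params);
+ *   aom_film_grain_table_write(&table, "grain.tbl", &error_info);
+ *   aom_film_grain_table_free(&table);
+ */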
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_GRAIN_TABLE_H_
diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c
new file mode 100644
index 000000000..c6aa6b207
--- /dev/null
+++ b/third_party/aom/aom_dsp/intrapred.c
@@ -0,0 +1,792 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/intrapred_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)left;
+
+ for (r = 0; r < bh; r++) {
+ memcpy(dst, above, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)above;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, left[r], bw);
+ dst += stride;
+ }
+}
+
+static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
+
+static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
+ uint16_t top_left) {
+ const int base = top + left - top_left;
+ const int p_left = abs_diff(base, left);
+ const int p_top = abs_diff(base, top);
+ const int p_top_left = abs_diff(base, top_left);
+
+  // Return whichever of left, top and top_left is nearest to base.
+ return (p_left <= p_top && p_left <= p_top_left)
+ ? left
+ : (p_top <= p_top_left) ? top : top_left;
+}
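+
+// Worked example (illustrative): left = 8, top = 16, top_left = 10 gives
+// base = 14; |base - top| = 2 is the smallest of the three distances, so the
+// predictor returns top (16).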
+
+static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ int r, c;
+ const uint8_t ytop_left = above[-1];
+
+ for (r = 0; r < bh; r++) {
+ for (c = 0; c < bw; c++)
+ dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+// Some basic checks on weights for smooth predictor.
+#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
+ pred_scale) \
+ assert(weights_w[0] < weights_scale); \
+ assert(weights_h[0] < weights_scale); \
+ assert(weights_scale - weights_w[bw - 1] < weights_scale); \
+ assert(weights_scale - weights_h[bh - 1] < weights_scale); \
+ assert(pred_scale < 31) // ensures no overflow when calculating predictor.
+
+#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
+
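+// SMOOTH predictor: each output pixel is a weighted blend of four references:
+// the pixel directly above, the bottom-left pixel (standing in for the row
+// below the block), the pixel directly to the left and the top-right pixel
+// (standing in for the column to the right), with quadratic weights taken
+// from sm_weight_arrays and the result rounded by divide_round().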
+static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+ const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+ // scale = 2 * 2^sm_weight_log2_scale
+ const int log2_scale = 1 + sm_weight_log2_scale;
+ const uint16_t scale = (1 << sm_weight_log2_scale);
+ sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
+ log2_scale + sizeof(*dst));
+ int r;
+ for (r = 0; r < bh; ++r) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
+ const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
+ sm_weights_w[c], scale - sm_weights_w[c] };
+ uint32_t this_pred = 0;
+ int i;
+ assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
+ for (i = 0; i < 4; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t *const sm_weights = sm_weight_arrays + bh;
+ // scale = 2^sm_weight_log2_scale
+ const int log2_scale = sm_weight_log2_scale;
+ const uint16_t scale = (1 << sm_weight_log2_scale);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { above[c], below_pred };
+ const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[r]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights = sm_weight_arrays + bw;
+ // scale = 2^sm_weight_log2_scale
+ const int log2_scale = sm_weight_log2_scale;
+ const uint16_t scale = (1 << sm_weight_log2_scale);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { left[r], right_pred };
+ const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[c]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, 128, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+
+ for (i = 0; i < bh; i++) sum += left[i];
+ expected_dc = (sum + (bh >> 1)) / bh;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+
+ for (i = 0; i < bw; i++) sum += above[i];
+ expected_dc = (sum + (bw >> 1)) / bw;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ const int count = bw + bh;
+
+ for (i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier, int shift2) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> shift2;
+}
+
+ // The constants (multiplier and shifts) for a given block size are obtained
+ // as follows:
+ // - Let sum_w_h = block width + block height.
+ // - Shift 'sum_w_h' right until we reach an odd number. Let the number of
+ // shifts for that block size be called 'shift1' (see the parameter in
+ // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2
+ // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
+ // block].
+ // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
+ // using the "Algorithm 1" in:
+ // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+ // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+ // shift will be 16, regardless of the block size.
+
+ // Note: For low bitdepth, assembly code may be optimized by using smaller
+ // constants for smaller block sizes, where the range of the 'sum' is
+ // restricted to fewer bits.
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
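+
+// Worked example (illustrative): for a 4x8 block, bw + bh = 12 = 3 << 2, so
+// shift1 = 2 and the divisor left after the shift is 3. 0x5556 is the 16-bit
+// reciprocal of 3 (ceil(2^16 / 3)), so
+//   expected_dc = (((sum + 6) >> 2) * 0x5556) >> 16
+// approximates (sum + 6) / 12 for the value range reachable here. 0x3334
+// plays the same role for the divisor 5 (1:4 blocks).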
+
+#define DC_SHIFT2 16
+
+static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left, int shift1,
+ int multiplier) {
+ int sum = 0;
+
+ for (int i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (int i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
+ assert(expected_dc < (1 << 8));
+
+ for (int r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+#undef DC_SHIFT2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
+}
+
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bh; r++) {
+ memcpy(dst, above, bw * sizeof(uint16_t));
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)bd;
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, left[r], bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ const uint16_t ytop_left = above[-1];
+ (void)bd;
+
+ for (r = 0; r < bh; r++) {
+ for (c = 0; c < bw; c++)
+ dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+ const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+ // scale = 2 * 2^sm_weight_log2_scale
+ const int log2_scale = 1 + sm_weight_log2_scale;
+ const uint16_t scale = (1 << sm_weight_log2_scale);
+ sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
+ log2_scale + sizeof(*dst));
+ int r;
+ for (r = 0; r < bh; ++r) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
+ const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
+ sm_weights_w[c], scale - sm_weights_w[c] };
+ uint32_t this_pred = 0;
+ int i;
+ assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
+ for (i = 0; i < 4; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t *const sm_weights = sm_weight_arrays + bh;
+ // scale = 2^sm_weight_log2_scale
+ const int log2_scale = sm_weight_log2_scale;
+ const uint16_t scale = (1 << sm_weight_log2_scale);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { above[c], below_pred };
+ const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[r]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights = sm_weight_arrays + bw;
+ // scale = 2^sm_weight_log2_scale
+ const int log2_scale = sm_weight_log2_scale;
+ const uint16_t scale = (1 << sm_weight_log2_scale);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { left[r], right_pred };
+ const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[c]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, 128 << (bd - 8), bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < bh; i++) sum += left[i];
+ expected_dc = (sum + (bh >> 1)) / bh;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < bw; i++) sum += above[i];
+ expected_dc = (sum + (bw >> 1)) / bw;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ const int count = bw + bh;
+ (void)bd;
+
+ for (i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
+// assume 2nd shift of 17 bits instead of 16.
+// Note: Strictly speaking, 2nd shift needs to be 17 only when:
+// - bit depth == 12, and
+// - bw + bh is divisible by 5 (as opposed to divisible by 3).
+// All other cases can use half the multipliers with a shift of 16 instead.
+// This special optimization can be used when writing assembly code.
+#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
+// Note: This constant is odd, but a smaller even constant (0x199a) with the
+// appropriate shift should work for neon in 8/10-bit.
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
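+
+// Illustrative check: 0xAAAB = ceil(2^17 / 3) and 0x6667 = ceil(2^17 / 5), so
+// with the 17-bit second shift below the rectangular high-bitdepth DC average
+// is computed the same way as in the 8-bit case above.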
+
+#define HIGHBD_DC_SHIFT2 17
+
+static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd,
+ int shift1, uint32_t multiplier) {
+ int sum = 0;
+ (void)bd;
+
+ for (int i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (int i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
+ assert(expected_dc < (1 << bd));
+
+ for (int r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+#undef HIGHBD_DC_SHIFT2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+#undef HIGHBD_DC_MULTIPLIER_1X2
+#undef HIGHBD_DC_MULTIPLIER_1X4
+
+// This serves as a wrapper function, so that all the prediction functions
+// can be unified and accessed as a pointer array. Note that the boundary
+// above and left are not necessarily used all the time.
+#define intra_pred_sized(type, width, height) \
+ void aom_##type##_predictor_##width##x##height##_c( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ type##_predictor(dst, stride, width, height, above, left); \
+ }
+
+#define intra_pred_highbd_sized(type, width, height) \
+ void aom_highbd_##type##_predictor_##width##x##height##_c( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
+ }
+
+/* clang-format off */
+#define intra_pred_rectangular(type) \
+ intra_pred_sized(type, 4, 8) \
+ intra_pred_sized(type, 8, 4) \
+ intra_pred_sized(type, 8, 16) \
+ intra_pred_sized(type, 16, 8) \
+ intra_pred_sized(type, 16, 32) \
+ intra_pred_sized(type, 32, 16) \
+ intra_pred_sized(type, 32, 64) \
+ intra_pred_sized(type, 64, 32) \
+ intra_pred_sized(type, 4, 16) \
+ intra_pred_sized(type, 16, 4) \
+ intra_pred_sized(type, 8, 32) \
+ intra_pred_sized(type, 32, 8) \
+ intra_pred_sized(type, 16, 64) \
+ intra_pred_sized(type, 64, 16) \
+ intra_pred_highbd_sized(type, 4, 8) \
+ intra_pred_highbd_sized(type, 8, 4) \
+ intra_pred_highbd_sized(type, 8, 16) \
+ intra_pred_highbd_sized(type, 16, 8) \
+ intra_pred_highbd_sized(type, 16, 32) \
+ intra_pred_highbd_sized(type, 32, 16) \
+ intra_pred_highbd_sized(type, 32, 64) \
+ intra_pred_highbd_sized(type, 64, 32) \
+ intra_pred_highbd_sized(type, 4, 16) \
+ intra_pred_highbd_sized(type, 16, 4) \
+ intra_pred_highbd_sized(type, 8, 32) \
+ intra_pred_highbd_sized(type, 32, 8) \
+ intra_pred_highbd_sized(type, 16, 64) \
+ intra_pred_highbd_sized(type, 64, 16)
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8, 8) \
+ intra_pred_sized(type, 16, 16) \
+ intra_pred_sized(type, 32, 32) \
+ intra_pred_sized(type, 64, 64) \
+ intra_pred_highbd_sized(type, 4, 4) \
+ intra_pred_highbd_sized(type, 8, 8) \
+ intra_pred_highbd_sized(type, 16, 16) \
+ intra_pred_highbd_sized(type, 32, 32) \
+ intra_pred_highbd_sized(type, 64, 64) \
+ intra_pred_rectangular(type)
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 4, 4) \
+ intra_pred_above_4x4(type)
+#define intra_pred_square(type) \
+ intra_pred_sized(type, 4, 4) \
+ intra_pred_sized(type, 8, 8) \
+ intra_pred_sized(type, 16, 16) \
+ intra_pred_sized(type, 32, 32) \
+ intra_pred_sized(type, 64, 64) \
+ intra_pred_highbd_sized(type, 4, 4) \
+ intra_pred_highbd_sized(type, 8, 8) \
+ intra_pred_highbd_sized(type, 16, 16) \
+ intra_pred_highbd_sized(type, 32, 32) \
+ intra_pred_highbd_sized(type, 64, 64)
+
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+intra_pred_allsizes(smooth)
+intra_pred_allsizes(smooth_v)
+intra_pred_allsizes(smooth_h)
+intra_pred_allsizes(paeth)
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_square(dc)
+/* clang-format on */
+#undef intra_pred_allsizes
diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h
new file mode 100644
index 000000000..3ec62a86e
--- /dev/null
+++ b/third_party/aom/aom_dsp/intrapred_common.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_
+#define AOM_AOM_DSP_INTRAPRED_COMMON_H_
+
+#include "config/aom_config.h"
+
+// Weights are quadratic from '1' to '1 / block_size', scaled by
+// 2^sm_weight_log2_scale.
+static const int sm_weight_log2_scale = 8;
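+
+// For example, the bs = 4 row of sm_weight_arrays below is { 255, 149, 85,
+// 64 }: the first entry is roughly 1 * 256 and the last is 256 / 4 = 64, with
+// a quadratic taper in between.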
+
+// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
+#define MAX_BLOCK_DIM 64
+
+/* clang-format off */
+static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
+ // Unused, because we always offset by bs, which is at least 2.
+ 0, 0,
+ // bs = 2
+ 255, 128,
+ // bs = 4
+ 255, 149, 85, 64,
+ // bs = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // bs = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // bs = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // bs = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
+ 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
+ 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
+};
+/* clang-format on */
+
+#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_
diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c
new file mode 100644
index 000000000..a3f261824
--- /dev/null
+++ b/third_party/aom/aom_dsp/loopfilter.c
@@ -0,0 +1,925 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {
+ return (int8_t)clamp(t, -128, 127);
+}
+
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+ switch (bd) {
+ case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+ case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+ case 8:
+ default: return (int16_t)clamp(t, -128, 128 - 1);
+ }
+}
+
+// should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+ uint8_t p0, uint8_t q0, uint8_t q1) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
+ uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p3 - p2) > limit) * -1;
+ mask |= (abs(p2 - p1) > limit) * -1;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(q2 - q1) > limit) * -1;
+ mask |= (abs(q3 - q2) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+ uint8_t p2, uint8_t p1, uint8_t p0,
+ uint8_t q0, uint8_t q1, uint8_t q2) {
+ int8_t mask = 0;
+ mask |= (abs(p2 - p1) > limit) * -1;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(q2 - q1) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
+ uint8_t p0, uint8_t q0, uint8_t q1,
+ uint8_t q2) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > thresh) * -1;
+ mask |= (abs(q1 - q0) > thresh) * -1;
+ mask |= (abs(p2 - p0) > thresh) * -1;
+ mask |= (abs(q2 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
+ uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
+ uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > thresh) * -1;
+ mask |= (abs(q1 - q0) > thresh) * -1;
+ mask |= (abs(p2 - p0) > thresh) * -1;
+ mask |= (abs(q2 - q0) > thresh) * -1;
+ mask |= (abs(p3 - p0) > thresh) * -1;
+ mask |= (abs(q3 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+// is there high edge variance internal edge: 11111111 yes, 00000000 no
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+ uint8_t q0, uint8_t q1) {
+ int8_t hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+ int8_t filter1, filter2;
+
+ const int8_t ps1 = (int8_t)*op1 ^ 0x80;
+ const int8_t ps0 = (int8_t)*op0 ^ 0x80;
+ const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
+ const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
+ const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+
+ // add outer taps if we have high edge variance
+ int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
+
+ // inner taps
+ filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
+
+ // save bottom 3 bits so that we round one side +4 and the other +3
+ // if it equals 4 we'll set to adjust by -1 to account for the fact
+ // we'd round 3 the other way
+ filter1 = signed_char_clamp(filter + 4) >> 3;
+ filter2 = signed_char_clamp(filter + 3) >> 3;
+
+ *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
+ *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
+
+ // outer tap adjustments
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
+ *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+}
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < count; ++i) {
+ const uint8_t p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p];
+ const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+ filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < count; ++i) {
+ const uint8_t p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1];
+ const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+ filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
+ s += pitch;
+ }
+}
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
+ uint8_t *op2, uint8_t *op1, uint8_t *op0,
+ uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
+ if (flat && mask) {
+ const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
+
+ // 5-tap filter [1, 2, 2, 2, 1]
+ *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
+ *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
+ } else {
+ filter4(mask, thresh, op1, op0, oq0, oq1);
+ }
+}
+
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
+ uint8_t *op3, uint8_t *op2, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
+ uint8_t *oq2, uint8_t *oq3) {
+ if (flat && mask) {
+ const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ filter4(mask, thresh, op1, op0, oq0, oq1);
+ }
+}
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < count; ++i) {
+ const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
+
+ const int8_t mask =
+ filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
+ const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
+ filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+ s + 2 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < count; ++i) {
+ const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, s + 3 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ for (i = 0; i < count; ++i) {
+ const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2];
+ const int8_t mask =
+ filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
+ const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
+ filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+ s += pitch;
+ }
+}
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ for (i = 0; i < count; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
+ s + 3);
+ s += pitch;
+ }
+}
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint8_t *op6, uint8_t *op5,
+ uint8_t *op4, uint8_t *op3, uint8_t *op2,
+ uint8_t *op1, uint8_t *op0, uint8_t *oq0,
+ uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
+ uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
+ if (flat2 && flat && mask) {
+ const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
+ p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
+ q5 = *oq5, q6 = *oq6;
+
+ // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
+ *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
+ 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
+ 4);
+ } else {
+ filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+ }
+}
+
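+// Applies the widest (13-tap) horizontal filter across 4 * count columns,
+// falling back per column to the narrower filters via the flat/flat2 masks.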
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count) {
+ int i;
+ int step = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < step * count; ++i) {
+ const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p],
+ p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
+ q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
+
+ filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
+ s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+ s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+}
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
+ mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
+}
+
+static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3],
+ p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
+ q5 = s[5], q6 = s[6];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
+
+ filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
+ s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
+ s += p;
+ }
+}
+
+void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
+}
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
+ mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
+}
+
+// Should we apply any filter at all: 11111111 yes, 00000000 no ?
+static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
+ uint16_t p1, uint16_t p0, uint16_t q0,
+ uint16_t q1, int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+// Should we apply any filter at all: 11111111 yes, 00000000 no ?
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
+ uint16_t p3, uint16_t p2, uint16_t p1,
+ uint16_t p0, uint16_t q0, uint16_t q1,
+ uint16_t q2, uint16_t q3, int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p3 - p2) > limit16) * -1;
+ mask |= (abs(p2 - p1) > limit16) * -1;
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(q2 - q1) > limit16) * -1;
+ mask |= (abs(q3 - q2) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+ uint16_t p2, uint16_t p1,
+ uint16_t p0, uint16_t q0,
+ uint16_t q1, uint16_t q2,
+ int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p2 - p1) > limit16) * -1;
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(q2 - q1) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
+ uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1,
+ uint16_t q2, int bd) {
+ int8_t mask = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p1 - p0) > thresh16) * -1;
+ mask |= (abs(q1 - q0) > thresh16) * -1;
+ mask |= (abs(p2 - p0) > thresh16) * -1;
+ mask |= (abs(q2 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
+ uint16_t p1, uint16_t p0, uint16_t q0,
+ uint16_t q1, uint16_t q2, uint16_t q3,
+ int bd) {
+ int8_t mask = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p1 - p0) > thresh16) * -1;
+ mask |= (abs(q1 - q0) > thresh16) * -1;
+ mask |= (abs(p2 - p0) > thresh16) * -1;
+ mask |= (abs(q2 - q0) > thresh16) * -1;
+ mask |= (abs(p3 - p0) > thresh16) * -1;
+ mask |= (abs(q3 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+// Is there high edge variance internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no ?
+static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1, int bd) {
+ int16_t hev = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ hev |= (abs(p1 - p0) > thresh16) * -1;
+ hev |= (abs(q1 - q0) > thresh16) * -1;
+ return hev;
+}
+
+static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ int bd) {
+ int16_t filter1, filter2;
+  // Subtracting 0x80 << shift below is the high-bitdepth equivalent of the
+  // ^0x80 used in the 8-bit filter4(): it re-centres the values on zero
+  // (e.g. -128 to +127 instead of 0 to 255 at 8 bits).
+ int shift = bd - 8;
+ const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+ const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+ const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+ const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+ const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+ // Add outer taps if we have high edge variance.
+ int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+ // Inner taps.
+ filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+ // Save bottom 3 bits so that we round one side +4 and the other +3
+ // if it equals 4 we'll set to adjust by -1 to account for the fact
+ // we'd round 3 the other way.
+ filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+ filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
+
+ *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+ *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+ // Outer tap adjustments.
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+ *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ int i;
+ int count = 4;
+
+  // C reference loop filter: filter one pixel position per loop iteration.
+ for (i = 0; i < count; ++i) {
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const int8_t mask =
+ highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+ highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_4_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+  // C reference loop filter: filter one pixel position per loop iteration.
+ for (i = 0; i < count; ++i) {
+ const uint16_t p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1];
+ const int8_t mask =
+ highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+ highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+ s += pitch;
+ }
+}
+
+void aom_highbd_lpf_vertical_4_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
+ uint16_t *op2, uint16_t *op1, uint16_t *op0,
+ uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
+ int bd) {
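+  // Use the 5-tap smoothing filter only when the edge passes the filter mask
+  // and the surrounding pixels are flat; otherwise fall back to the sharper
+  // 4-tap filter.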
+ if (flat && mask) {
+ const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
+
+ // 5-tap filter [1, 2, 2, 2, 1]
+ *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
+ *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
+ } else {
+ highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+ }
+}
+
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
+ uint16_t *op3, uint16_t *op2, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ uint16_t *oq2, uint16_t *oq3, int bd) {
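+  // Use the 7-tap smoothing filter only when the edge passes the filter mask
+  // and the surrounding pixels are flat; otherwise fall back to the 4-tap
+  // filter.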
+ if (flat && mask) {
+ const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+ }
+}
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+  // C reference loop filter: filter one pixel position per loop iteration.
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
+ s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+  // C reference loop filter: filter one pixel position per loop iteration.
+ for (i = 0; i < count; ++i) {
+ const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
+
+ const int8_t mask =
+ highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
+ const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
+ highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_6_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2];
+ const int8_t mask =
+ highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
+ const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
+ highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
+ bd);
+ s += pitch;
+ }
+}
+
+void aom_highbd_lpf_vertical_6_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
+ s + 2, s + 3, bd);
+ s += pitch;
+ }
+}
+
+void aom_highbd_lpf_vertical_8_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint16_t *op6, uint16_t *op5,
+ uint16_t *op4, uint16_t *op3, uint16_t *op2,
+ uint16_t *op1, uint16_t *op0, uint16_t *oq0,
+ uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
+ uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
+ int bd) {
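+  // The 13-tap filter applies only when both the inner (p3..q3) and the
+  // extended (flat2) neighborhoods are flat; otherwise fall back to the
+  // 8-pixel filter.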
+ if (flat2 && flat && mask) {
+ const uint16_t p6 = *op6;
+ const uint16_t p5 = *op5;
+ const uint16_t p4 = *op4;
+ const uint16_t p3 = *op3;
+ const uint16_t p2 = *op2;
+ const uint16_t p1 = *op1;
+ const uint16_t p0 = *op0;
+ const uint16_t q0 = *oq0;
+ const uint16_t q1 = *oq1;
+ const uint16_t q2 = *oq2;
+ const uint16_t q3 = *oq3;
+ const uint16_t q4 = *oq4;
+ const uint16_t q5 = *oq5;
+ const uint16_t q6 = *oq6;
+
+ // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
+ *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
+ 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
+ 4);
+ } else {
+ highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ bd);
+ }
+}
+
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count,
+ int bd) {
+ int i;
+ int step = 4;
+
+  // C reference loop filter: filter one pixel position per loop iteration.
+ for (i = 0; i < step * count; ++i) {
+ const uint16_t p3 = s[-4 * p];
+ const uint16_t p2 = s[-3 * p];
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const uint16_t q2 = s[2 * p];
+ const uint16_t q3 = s[3 * p];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+
+ const int8_t flat2 =
+ highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
+ s[5 * p], s[6 * p], bd);
+
+ highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
+ s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+ s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+}
+
+void aom_highbd_lpf_horizontal_14_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
+ highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count,
+ int bd) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4];
+ const uint16_t p2 = s[-3];
+ const uint16_t p1 = s[-2];
+ const uint16_t p0 = s[-1];
+ const uint16_t q0 = s[0];
+ const uint16_t q1 = s[1];
+ const uint16_t q2 = s[2];
+ const uint16_t q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 =
+ highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
+
+ highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
+ s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
+ s + 6, bd);
+ s += p;
+ }
+}
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
+}
+
+void aom_highbd_lpf_vertical_14_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
+ highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ 4, bd);
+}
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c
new file mode 100644
index 000000000..96d04cff0
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "aom_dsp/mips/macros_msa.h"
+
+void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+ char blackclamp[16], char whiteclamp[16],
+ char bothclamp[16], uint32_t width,
+ uint32_t height, int32_t pitch) {
+ uint32_t i, j;
+
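+  // Process two rows per iteration: clamp the pixels against the black and
+  // white clamp values, then add a randomly offset slice of the noise
+  // buffer, 16 pixels at a time.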
+ for (i = 0; i < height / 2; ++i) {
+ uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+ int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
+ uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+ int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
+ for (j = width / 16; j--;) {
+ v16i8 temp00_s, temp01_s;
+ v16u8 temp00, temp01, black_clamp, white_clamp;
+ v16u8 pos0, ref0, pos1, ref1;
+ v16i8 const127 = __msa_ldi_b(127);
+
+ pos0 = LD_UB(pos0_ptr);
+ ref0 = LD_UB(ref0_ptr);
+ pos1 = LD_UB(pos1_ptr);
+ ref1 = LD_UB(ref1_ptr);
+ black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+ white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+ temp00 = (pos0 < black_clamp);
+ pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+ temp01 = (pos1 < black_clamp);
+ pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+ XORI_B2_128_UB(pos0, pos1);
+ temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+ temp00 = (v16u8)(temp00_s < pos0);
+ pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
+ temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+ temp01 = (temp01_s < pos1);
+ pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+ XORI_B2_128_UB(pos0, pos1);
+ pos0 += ref0;
+ ST_UB(pos0, pos0_ptr);
+ pos1 += ref1;
+ ST_UB(pos1, pos1_ptr);
+ pos0_ptr += 16;
+ pos1_ptr += 16;
+ ref0_ptr += 16;
+ ref1_ptr += 16;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
new file mode 100644
index 000000000..363fad308
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v8i16 filt, out0, out1;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
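+  // Step back 3 pixels so the 8-tap window spans src[-3]..src[4] for each
+  // output pixel.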
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
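+  // mask0..mask3 are byte-shuffle patterns, each offset by two bytes, used
+  // to gather the source pairs matching the tap pairs held in filt0..filt3.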
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+ SRARI_H2_SH(out0, out1, FILTER_BITS);
+ SAT_SH2_SH(out0, out1, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[16]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1, out2,
+ out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (4 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ tmp0 = PCKEV_XORI128_UB(out0, out1);
+ tmp1 = PCKEV_XORI128_UB(out2, out3);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_SB2(src, src_stride, src0, src2);
+ LD_SB2(src + 8, src_stride, src1, src3);
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ src += (2 * src_stride);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+ dst += dst_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ int32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3, out;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LD_UB(&mc_filt_mask_arr[0]);
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 16);
+
+ src0 = LD_SB(src + 32);
+ src2 = LD_SB(src + 48);
+ src3 = LD_SB(src + 56);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+
+ XORI_B4_128_SB(src0, src1, src2, src3);
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+ SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+ SAT_SH4_SH(out0, out1, out2, out3, 7);
+ out = PCKEV_XORI128_UB(out0, out1);
+ ST_UB(out, dst + 32);
+ out = PCKEV_XORI128_UB(out2, out3);
+ ST_UB(out, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 filt0, vec0, vec1, res0, res1;
+ v8u16 vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
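+  // Bilinear (2-tap) horizontal filter: each output pixel is a rounded
+  // two-tap weighted sum of horizontally adjacent source pixels.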
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
+ SRARI_H2_UH(vec2, vec3, FILTER_BITS);
+ PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 vec0, vec1, vec2, vec3, filt0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16i8 res0, res1, res2, res3;
+ v8u16 vec4, vec5, vec6, vec7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
+ vec6, vec7);
+ SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
+ PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
+ res3);
+ ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+}
+
+static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
+ ST8x4_UB(src0, src1, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask, out0, out1;
+ v8u16 vec0, vec1, vec2, vec3, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
+ ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+ }
+}
+
+static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ loop_cnt = (height >> 2) - 1;
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out2, out3, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ dst += dst_stride;
+ PCKEV_ST_SB(out6, out7, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src3 = LD_SB(src + 24);
+ src1 = __msa_sldi_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LD_SB(src);
+ src6 = LD_SB(src + 16);
+ src7 = LD_SB(src + 24);
+ src5 = __msa_sldi_b(src6, src4, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ PCKEV_ST_SB(out2, out3, dst + 16);
+ dst += dst_stride;
+ PCKEV_ST_SB(out4, out5, dst);
+ PCKEV_ST_SB(out6, out7, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
+
+ mask = LD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LD_SB(src);
+ src2 = LD_SB(src + 16);
+ src4 = LD_SB(src + 32);
+ src6 = LD_SB(src + 48);
+ src7 = LD_SB(src + 56);
+ SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
+ src += src_stride;
+
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_ST_SB(out0, out1, dst);
+ PCKEV_ST_SB(out2, out3, dst + 16);
+ PCKEV_ST_SB(out4, out5, dst + 32);
+ PCKEV_ST_SB(out6, out7, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_hor[8];
+
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
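+  // If the first two taps are zero, the kernel is treated as the bilinear
+  // (2-tap) filter whose nonzero coefficients sit at taps 3 and 4 (hence
+  // &filt_hor[3]); otherwise the full 8-tap paths are used.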
+ if (((const int32_t *)filter_x)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ default:
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ default:
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
new file mode 100644
index 000000000..aa962b41f
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+ v16i8 src10998, filt0, filt1, filt2, filt3;
+ v16u8 out;
+ v8i16 filt, out10, out32;
+
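+  // Start 3 rows above the first output row so the 8-tap vertical window is
+  // fully available.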
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+ src4332, src6554);
+ XORI_B3_128_SB(src2110, src4332, src6554);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+ XORI_B2_128_SB(src8776, src10998);
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+ filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+ filt1, filt2, filt3);
+ SRARI_H2_SH(out10, out32, FILTER_BITS);
+ SAT_SH2_SH(out10, out32, 7);
+ out = PCKEV_XORI128_UB(out10, out32);
+ ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+ v16u8 tmp0, tmp1;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
+ tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+ LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
+ tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height,
+ int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
+ v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+ src -= (3 * src_stride);
+
+ filt = LD_SH(filter);
+ SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
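+  // Process the block in 16-pixel-wide column strips; the 32- and 64-wide
+  // wrappers below call this helper with width 32 and 64.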
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+ ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+ src54_r, src21_r);
+ ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+ ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
+ src54_l, src21_l);
+ ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
+ XORI_B4_128_SB(src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+ ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+ src87_r, src98_r, src109_r);
+ ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
+ src87_l, src98_l, src109_l);
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+ filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+ filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+ filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+ filt1, filt2, filt3);
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
+ filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
+ filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
+ filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
+ filt1, filt2, filt3);
+ SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+ SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
+ SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
+ SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
+ PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
+ out3_r, tmp0, tmp1, tmp2, tmp3);
+ XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
+ ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+ 32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
+ 64);
+}
+
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4;
+ v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+ v16u8 filt0;
+ v8i16 filt;
+ v8u16 tmp0, tmp1;
+
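+  // Bilinear (2-tap) vertical filter: each output pixel is a rounded
+  // two-tap weighted sum of two vertically adjacent source pixels.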
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 filt;
+
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ src8 = LD_SB(src);
+ src += src_stride;
+
+ ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
+ src76_r, src87_r);
+ ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
+ src76_r, src2110, src4332, src6554, src8776);
+ DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
+ ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+}
+
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
+ ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v16i8 out0, out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
+ vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ ST8x4_UB(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ dst += dst_stride;
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LD_UB(src);
+ src5 = LD_UB(src + 16);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+
+ LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
+ src += (4 * src_stride);
+
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
+
+ ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
+ ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 16);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
+ ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
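+/* Vertical 2-tap filter for 64-pixel-wide blocks: four 16-wide columns, two rows per iteration. */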
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 filt;
+
+ /* rearranging filter_y */
+ filt = LD_SH(filter);
+ filt0 = (v16u8)__msa_splati_h(filt, 0);
+
+ LD_UB4(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LD_UB2(src, src_stride, src1, src2);
+ LD_UB2(src + 16, src_stride, src4, src5);
+ LD_UB2(src + 32, src_stride, src7, src8);
+ LD_UB2(src + 48, src_stride, src10, src11);
+ src += (2 * src_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
+
+ ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
+ ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 16);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
+
+ ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
+ ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ PCKEV_ST_SB(tmp0, tmp1, dst + 32);
+
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
+
+ ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
+ ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
+ SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
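+/* Vertical convolution entry point: the 2-tap fast paths are used when the leading filter taps are zero, the 8-tap paths otherwise; unsupported widths fall back to the C implementation. */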
+void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
new file mode 100644
index 000000000..f7f116f4d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/mips/macros_msa.h"
+
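+/* Copy an 8-pixel-wide block with 64-bit stores; heights that are multiples of 12, 8, 4 or 2 each take a dedicated loop. */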
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+ SD(out0, dst);
+ dst += dst_stride;
+ SD(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
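+/* Copy blocks whose width is a multiple of 16, eight rows at a time for each 16-byte column. */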
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ src_tmp += (8 * src_stride);
+
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
+ dst_stride);
+ dst_tmp += (8 * dst_stride);
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
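+/* Copy a 16-pixel-wide block; multiples of 12 are unrolled here, multiples of 8 go through the generic helper. */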
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
+ dst += (8 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
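+/* Copy a 32-pixel-wide block, two 16-byte vectors per row. */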
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
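+/* Copy a 64-pixel-wide block via the generic 16-multiple helper. */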
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
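+/* Plain copy "convolution": the filter arguments are ignored; the width selects an MSA fast path, with memcpy as the fallback. */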
+void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ uint32_t cnt, tmp;
+      /* copy one 32-bit word per row */
+ for (cnt = h; cnt--;) {
+ tmp = LW(src);
+ SW(tmp, dst);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 8: {
+ copy_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ copy_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ uint32_t cnt;
+ for (cnt = h; cnt--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
new file mode 100644
index 000000000..852415c20
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
+#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
+
+#include "aom_dsp/mips/macros_msa.h"
+#include "aom_dsp/aom_filter.h"
+
+extern const uint8_t mc_filt_mask_arr[16 * 3];
+
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
+ filt3) \
+ ({ \
+ v8i16 tmp_dpadd_0, tmp_dpadd_1; \
+ \
+ tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
+ tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \
+ tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
+ tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \
+ tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \
+ \
+ tmp_dpadd_0; \
+ })
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, mask3, filt0, filt1, filt2, filt3, \
+ out0, out1) \
+ { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
+ DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
+ DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
+ DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
+ ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
+ }
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
+ mask2, mask3, filt0, filt1, filt2, filt3, \
+ out0, out1, out2, out3) \
+ { \
+ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
+ \
+ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
+ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
+ res4_m, res5_m, res6_m, res7_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
+ res0_m, res1_m, res2_m, res3_m); \
+ VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
+ VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
+ DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
+ res4_m, res5_m, res6_m, res7_m); \
+ ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
+ res7_m, out0, out1, out2, out3); \
+ }
+
+#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c
new file mode 100644
index 000000000..00ab75dc3
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/common_dspr2.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *aom_ff_cropTbl;
+
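+/* Build the pixel clamp table: entries below 0 clamp to 0, 0..255 map to themselves, entries above 255 clamp to 255; aom_ff_cropTbl points at the start of the identity range. */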
+void aom_dsputil_static_init(void) {
+ int i;
+
+ for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++) {
+ aom_ff_cropTbl_a[i] = 0;
+ aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+ }
+
+ aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
+}
+
+#endif
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h
new file mode 100644
index 000000000..c42188d62
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/common_dspr2.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+extern uint8_t *aom_ff_cropTbl;  // Defined in "aom_dsp/mips/common_dspr2.c"
+
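+/* prefetch data for load */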
+static INLINE void prefetch_load(const unsigned char *src) {
+ __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+ __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+
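+/* prefetch data for load, streamed */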
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+ __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+ __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
new file mode 100644
index 000000000..08bf1ab30
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
@@ -0,0 +1,1031 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
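+/* Bilinear (2-tap) horizontal filter, 4 pixels wide, writing the output transposed (each input row becomes an output column). */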
+static void convolve_bi_horiz_4_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_stride] "r"(dst_stride));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_8_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint8_t *dst_ptr;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint8_t *odd_dst;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ dst_ptr = dst;
+ odd_dst = (dst_ptr + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "extp %[p3], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[Temp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[Temp1], %[p3](%[cm]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[Temp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp1], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[tp3], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[tp3], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p2], 0(%[odd_dst]) \n\t"
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
+
+ "sb %[p1], 0(%[odd_dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
+ [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ /* Next row... */
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+static void convolve_bi_horiz_16_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+static void convolve_bi_horiz_64_transposed_dspr2(
+ const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
+ int32_t c, y;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ uint32_t dst_pitch_2 = (dst_stride << 1);
+ uint8_t *odd_dst;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+
+ src = src_ptr;
+ dst = dst_ptr;
+
+ odd_dst = (dst + dst_stride);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 4(%[src]) "
+ "\n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 1 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 2 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "ulw %[qload1], 8(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 3 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload1] "
+ "\n\t"
+ "ulw %[qload2], 12(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] "
+ "\n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 4 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 1 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ " \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] "
+ "\n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 5 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 2 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter45] "
+ "\n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* even 6 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 3 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p1], %[filter45] "
+ "\n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* even 7 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 4 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 20(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter45] "
+ "\n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* even 8 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 5 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 1 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* even 8 */
+ "sb %[st3], 0(%[dst]) "
+ "\n\t" /* even 6 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) "
+ "\n\t"
+ "ulw %[qload2], 5(%[src]) "
+ "\n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 2 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbl %[p2], %[qload1] "
+ "\n\t"
+ "preceu.ph.qbr %[p3], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p4], %[qload2] "
+ "\n\t"
+ "sb %[st1], 0(%[dst]) "
+ "\n\t" /* even 7 */
+ "addu %[dst], %[dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload2], 9(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] "
+ "\n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 3 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "preceu.ph.qbr %[p1], %[qload2] "
+ "\n\t"
+ "preceu.ph.qbl %[p5], %[qload2] "
+ "\n\t"
+ "sb %[st2], 0(%[dst]) "
+ "\n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] "
+ "\n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 4 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "preceu.ph.qbr %[p2], %[qload1] "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 1 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] "
+ "\n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 5 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbl %[p3], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 2 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter45] "
+ "\n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 "
+ "\n\t" /* odd 6 */
+ "mthi $zero, $ac2 "
+ "\n\t"
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 3 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] "
+ "\n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 "
+ "\n\t" /* odd 7 */
+ "mthi $zero, $ac3 "
+ "\n\t"
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 4 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "ulw %[qload1], 21(%[src]) "
+ "\n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter45] "
+ "\n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 "
+ "\n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 "
+ "\n\t" /* odd 8 */
+ "mthi $zero, $ac1 "
+ "\n\t"
+ "preceu.ph.qbr %[p5], %[qload1] "
+ "\n\t"
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 5 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] "
+ "\n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 "
+ "\n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] "
+ "\n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 "
+ "\n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) "
+ "\n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) "
+ "\n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) "
+ "\n\t" /* odd 8 */
+
+ "sb %[st2], 0(%[odd_dst]) "
+ "\n\t" /* odd 6 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st3], 0(%[odd_dst]) "
+ "\n\t" /* odd 7 */
+ "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
+ "\n\t"
+
+ "sb %[st1], 0(%[odd_dst]) "
+ "\n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
+ [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
+ [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
+
+ src += 16;
+ dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
+ odd_dst = (dst + dst_stride);
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
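+/* Plain C fallback for the transposed bilinear horizontal filter, used for widths without a DSPr2 fast path. */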
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ sum += src[x] * filter[3];
+ sum += src[x + 1] * filter[4];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
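+/* 2-tap horizontal convolution with transposed output: programs the accumulator extract position, then dispatches on block width. */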
+void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h) {
+ uint32_t pos = 38;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 16:
+ case 32:
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h, (w / 16));
+ break;
+ case 64:
+ prefetch_load(src + 32);
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ default:
+ convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
+ h);
+ break;
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
new file mode 100644
index 000000000..097da73ca
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
@@ -0,0 +1,681 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
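+/* Bilinear (2-tap) horizontal filter, 4 pixels wide (row-major output). */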
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp2](%[cm]) \n\t"
+ "lbux %[p2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[p1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[p2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
+ [Temp4] "=&r"(Temp4)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2, tp3;
+ uint32_t p1, p2, p3, p4;
+ uint32_t st0, st1;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tp3], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tp3] \n\t"
+ "preceu.ph.qbl %[p4], %[tp3] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[p1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
+ int32_t src_stride, uint8_t *dst_ptr,
+ int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
+ [dst] "r"(dst), [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ uint32_t pos = 38;
+
+ assert(x_step_q4 == 16);
+
+ prefetch_load((const uint8_t *)filter_x);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ default:
+ aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
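For orientation, the DSPr2 routines above implement a 2-tap ("bilinear") horizontal filter: the two centre taps at filter_x[3] and filter_x[4] are packed into filter45, each accumulator is preloaded with the rounding constant 64, and extp/lbux perform the shift-and-round plus the clamp through the aom_ff_cropTbl lookup. A minimal scalar sketch of the same per-pixel arithmetic, assuming the usual FILTER_BITS == 7 rounding (helper names here are illustrative, not part of the patch):

#include <stdint.h>

/* Illustrative scalar equivalent of the 2-tap horizontal path above. */
static void convolve2_horiz_ref(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const int16_t *filter_x, int w, int h) {
  const int t0 = filter_x[3], t1 = filter_x[4]; /* the two non-zero taps */
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int sum = src[x] * t0 + src[x + 1] * t1 + 64; /* +64: rounding bias */
      const int v = sum >> 7;                             /* FILTER_BITS == 7 */
      dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* clamp, as aom_ff_cropTbl does */
    }
    src += src_stride;
    dst += dst_stride;
  }
}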
diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
new file mode 100644
index 000000000..40abfd89e
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2;
+ uint32_t p1, p2;
+ uint32_t scratch1;
+ uint32_t store1, store2;
+ int32_t Temp1, Temp2;
+ const int16_t *filter = &filter_y[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
+
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
+ [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
+ [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ uint32_t pos = 38;
+
+ assert(y_step_q4 == 16);
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
+ h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+}
+#endif
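The vertical variant follows the same pattern, except that its two source pixels sit one row apart rather than one column apart. A scalar sketch under the same FILTER_BITS == 7 assumption (names illustrative):

#include <stdint.h>

/* Illustrative scalar equivalent of the 2-tap vertical path above. */
static void convolve2_vert_ref(const uint8_t *src, int src_stride,
                               uint8_t *dst, int dst_stride,
                               const int16_t *filter_y, int w, int h) {
  const int t0 = filter_y[3], t1 = filter_y[4]; /* packed as filter45 above */
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int sum = src[x] * t0 + src[x + src_stride] * t1 + 64;
      const int v = sum >> 7;
      dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src += src_stride;
    dst += dst_stride;
  }
}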
diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
new file mode 100644
index 000000000..af54b4264
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ (void)filter_x;
+ (void)filter_x_stride;
+ (void)filter_y;
+ (void)filter_y_stride;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4: {
+ uint32_t tp1;
+
+ /* 1 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], (%[src]) \n\t"
+ "sw %[tp1], (%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ case 8: {
+ uint32_t tp1, tp2;
+
+ /* 2 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ case 16: {
+ uint32_t tp1, tp2, tp3, tp4;
+
+ /* 4 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ case 32: {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ /* 8 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+ [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ case 64: {
+ uint32_t tp1, tp2, tp3, tp4;
+ uint32_t tp5, tp6, tp7, tp8;
+
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ /* 16 word storage */
+ for (y = h; y--;) {
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_load(src + src_stride + 64);
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+ "ulw %[tp3], 8(%[src]) \n\t"
+ "ulw %[tp4], 12(%[src]) \n\t"
+ "ulw %[tp5], 16(%[src]) \n\t"
+ "ulw %[tp6], 20(%[src]) \n\t"
+ "ulw %[tp7], 24(%[src]) \n\t"
+ "ulw %[tp8], 28(%[src]) \n\t"
+
+ "sw %[tp1], 0(%[dst]) \n\t" /* store */
+ "sw %[tp2], 4(%[dst]) \n\t" /* store */
+ "sw %[tp3], 8(%[dst]) \n\t" /* store */
+ "sw %[tp4], 12(%[dst]) \n\t" /* store */
+ "sw %[tp5], 16(%[dst]) \n\t" /* store */
+ "sw %[tp6], 20(%[dst]) \n\t" /* store */
+ "sw %[tp7], 24(%[dst]) \n\t" /* store */
+ "sw %[tp8], 28(%[dst]) \n\t" /* store */
+
+ "ulw %[tp1], 32(%[src]) \n\t"
+ "ulw %[tp2], 36(%[src]) \n\t"
+ "ulw %[tp3], 40(%[src]) \n\t"
+ "ulw %[tp4], 44(%[src]) \n\t"
+ "ulw %[tp5], 48(%[src]) \n\t"
+ "ulw %[tp6], 52(%[src]) \n\t"
+ "ulw %[tp7], 56(%[src]) \n\t"
+ "ulw %[tp8], 60(%[src]) \n\t"
+
+ "sw %[tp1], 32(%[dst]) \n\t" /* store */
+ "sw %[tp2], 36(%[dst]) \n\t" /* store */
+ "sw %[tp3], 40(%[dst]) \n\t" /* store */
+ "sw %[tp4], 44(%[dst]) \n\t" /* store */
+ "sw %[tp5], 48(%[dst]) \n\t" /* store */
+ "sw %[tp6], 52(%[dst]) \n\t" /* store */
+ "sw %[tp7], 56(%[dst]) \n\t" /* store */
+ "sw %[tp8], 60(%[dst]) \n\t" /* store */
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
+ [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
+ [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
+ : [src] "r"(src), [dst] "r"(dst));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } break;
+ default:
+ for (y = h; y--;) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = src[x];
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+}
+#endif
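aom_convolve_copy_dspr2 above is a width-specialised row copy: the filter arguments are ignored and each row is moved with unrolled ulw/sw word pairs plus prefetching. Functionally it reduces to the following loop (illustrative sketch, not part of the patch):

#include <stdint.h>
#include <string.h>

/* Illustrative scalar equivalent of the copy kernel above. */
static void convolve_copy_ref(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride, int w, int h) {
  for (int y = 0; y < h; ++y) {
    memcpy(dst, src, (size_t)w); /* the DSP path unrolls this into word loads/stores */
    src += src_stride;
    dst += dst_stride;
  }
}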
diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
new file mode 100644
index 000000000..f9c6879ab
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
@@ -0,0 +1,879 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4;
+ uint32_t n1, n2, n3, n4;
+ uint32_t tn1, tn2;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[n1], %[tp2] \n\t"
+ "preceu.ph.qbl %[n2], %[tp2] \n\t"
+ "preceu.ph.qbr %[n3], %[tn2] \n\t"
+ "preceu.ph.qbl %[n4], %[tn2] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp3](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[n1], %[tn1] \n\t"
+ "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
+ "extp %[Temp4], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[tn1], %[Temp2](%[cm]) \n\t"
+ "lbux %[n2], %[Temp4](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst]) \n\t"
+ "sb %[tn1], 1(%[dst]) \n\t"
+ "sb %[tp2], 2(%[dst]) \n\t"
+ "sb %[n2], 3(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
+ [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
+ [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2, p3, p4, n1;
+ uint32_t tn1, tn2, tn3;
+ uint32_t st0, st1;
+
+ vector1b = ((const int32_t *)filter_x0)[0];
+ vector2b = ((const int32_t *)filter_x0)[1];
+ vector3b = ((const int32_t *)filter_x0)[2];
+ vector4b = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "preceu.ph.qbr %[p3], %[tp2] \n\t"
+ "preceu.ph.qbl %[p4], %[tp2] \n\t"
+ "ulw %[tn2], 8(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "preceu.ph.qbr %[p1], %[tn2] \n\t"
+ "preceu.ph.qbl %[n1], %[tn2] \n\t"
+ "ulw %[tn1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ /* even 3. pixel */
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[tn1] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ /* even 4. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sb %[st0], 0(%[dst]) \n\t"
+ "lbux %[st1], %[Temp3](%[cm]) \n\t"
+
+ "balign %[tn3], %[tn1], 3 \n\t"
+ "balign %[tn1], %[tn2], 3 \n\t"
+ "balign %[tn2], %[tp2], 3 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp3], $ac2, 31 \n\t"
+
+ "lbux %[st0], %[Temp1](%[cm]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sb %[st1], 2(%[dst]) \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "preceu.ph.qbr %[p3], %[tn2] \n\t"
+ "preceu.ph.qbl %[p4], %[tn2] \n\t"
+ "sb %[st0], 4(%[dst]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[tn1] \n\t"
+ "preceu.ph.qbl %[n1], %[tn1] \n\t"
+ "lbux %[st0], %[Temp3](%[cm]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
+ "extp %[Temp3], $ac1, 31 \n\t"
+
+ /* odd 3. pixel */
+ "lbux %[st1], %[Temp2](%[cm]) \n\t"
+ "preceu.ph.qbr %[p2], %[tn3] \n\t"
+ "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ /* odd 4. pixel */
+ "sb %[st1], 1(%[dst]) \n\t"
+ "sb %[st0], 6(%[dst]) \n\t"
+ "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
+ "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p4], %[Temp3](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+ "lbux %[n1], %[Temp1](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[p4], 3(%[dst]) \n\t"
+ "sb %[p2], 5(%[dst]) \n\t"
+ "sb %[n1], 7(%[dst]) \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
+ [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
+ [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h,
+ int32_t count) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_store(dst_ptr + dst_stride);
+
+ for (c = 0; c < count; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
+ uint8_t *dst_ptr, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y, c;
+ const uint8_t *src;
+ uint8_t *dst;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector_64 = 64;
+ int32_t filter12, filter34, filter56, filter78;
+ int32_t Temp1, Temp2, Temp3;
+ uint32_t qload1, qload2, qload3;
+ uint32_t p1, p2, p3, p4, p5;
+ uint32_t st1, st2, st3;
+
+ filter12 = ((const int32_t *)filter_x0)[0];
+ filter34 = ((const int32_t *)filter_x0)[1];
+ filter56 = ((const int32_t *)filter_x0)[2];
+ filter78 = ((const int32_t *)filter_x0)[3];
+
+ for (y = h; y--;) {
+ src = src_ptr;
+ dst = dst_ptr;
+
+ /* prefetch data to cache memory */
+ prefetch_load(src_ptr + src_stride);
+ prefetch_load(src_ptr + src_stride + 32);
+ prefetch_load(src_ptr + src_stride + 64);
+ prefetch_store(dst_ptr + dst_stride);
+ prefetch_store(dst_ptr + dst_stride + 32);
+
+ for (c = 0; c < 4; c++) {
+ __asm__ __volatile__(
+ "ulw %[qload1], 0(%[src]) \n\t"
+ "ulw %[qload2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "ulw %[qload3], 8(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
+
+ /* even 2. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "ulw %[qload1], 12(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
+ "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
+
+ /* even 3. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
+ "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
+
+ /* even 4. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
+ "ulw %[qload2], 16(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
+ "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
+
+ /* even 5. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
+ "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
+
+ /* even 6. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
+ "ulw %[qload3], 20(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
+
+ /* even 7. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
+
+ /* even 8. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
+ "mthi $zero, $ac3 \n\t"
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
+ "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
+
+ /* ODD pixels */
+ "ulw %[qload1], 1(%[src]) \n\t"
+ "ulw %[qload2], 5(%[src]) \n\t"
+
+ /* odd 1. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p1], %[qload1] \n\t"
+ "preceu.ph.qbl %[p2], %[qload1] \n\t"
+ "preceu.ph.qbr %[p3], %[qload2] \n\t"
+ "preceu.ph.qbl %[p4], %[qload2] \n\t"
+ "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
+ "ulw %[qload3], 9(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
+ "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
+
+ /* odd 2. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p1], %[qload3] \n\t"
+ "preceu.ph.qbl %[p5], %[qload3] \n\t"
+ "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
+ "ulw %[qload1], 13(%[src]) \n\t"
+ "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
+ "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
+
+ /* odd 3. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p2], %[qload1] \n\t"
+ "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
+ "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
+ "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
+
+ /* odd 4. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbl %[p3], %[qload1] \n\t"
+ "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
+ "ulw %[qload2], 17(%[src]) \n\t"
+ "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
+ "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
+
+ /* odd 5. pixel */
+ "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
+ "mthi $zero, $ac2 \n\t"
+ "preceu.ph.qbr %[p4], %[qload2] \n\t"
+ "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
+ "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
+ "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
+
+ /* odd 6. pixel */
+ "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbl %[p1], %[qload2] \n\t"
+ "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
+ "ulw %[qload3], 21(%[src]) \n\t"
+ "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
+ "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
+ "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
+
+ /* odd 7. pixel */
+ "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
+ "mthi $zero, $ac1 \n\t"
+ "preceu.ph.qbr %[p5], %[qload3] \n\t"
+ "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
+ "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
+ "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
+ "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
+
+ /* odd 8. pixel */
+ "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
+ "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
+ "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
+
+ "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
+ "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
+ "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
+
+ "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
+ "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
+ "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
+
+ : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
+ [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
+ [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
+ [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
+ : [filter12] "r"(filter12), [filter34] "r"(filter34),
+ [filter56] "r"(filter56), [filter78] "r"(filter78),
+ [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
+ [src] "r"(src));
+
+ src += 16;
+ dst += 16;
+ }
+
+ /* Next row... */
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ assert(x_step_q4 == 16);
+ assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+ if (((const int32_t *)filter_x)[0] == 0) {
+ aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+ prefetch_load((const uint8_t *)filter_x);
+ src -= 3;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 8:
+ convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ case 16:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 1);
+ break;
+ case 32:
+ convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h, 2);
+ break;
+ case 64:
+ prefetch_load(src + 64);
+ prefetch_store(dst + 32);
+
+ convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
+ (int32_t)dst_stride, filter_x, (int32_t)h);
+ break;
+ default:
+ aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+#endif
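The 8-tap horizontal path above packs the taps pairwise into filter12/filter34/filter56/filter78, shifts src back by three pixels, and dispatches to the 2-tap kernel when the first pair of taps is zero. Its per-pixel arithmetic is the standard 8-tap convolution, sketched here in scalar form under the same FILTER_BITS == 7 assumption (names illustrative):

#include <stdint.h>

/* Illustrative scalar equivalent of the 8-tap horizontal path above. */
static void convolve8_horiz_ref(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const int16_t *filter_x, int w, int h) {
  src -= 3; /* same adjustment the DSP wrapper applies before filtering */
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int sum = 64; /* rounding bias, matches vector4a / vector_64 */
      for (int k = 0; k < 8; ++k) sum += src[x + k] * filter_x[k];
      const int v = sum >> 7;
      dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    src += src_stride;
    dst += dst_stride;
  }
}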
diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
new file mode 100644
index 000000000..201e66427
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t w,
+ int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+
+ for (x = 0; x < w; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_y, int32_t h) {
+ int32_t x, y;
+ const uint8_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint32_t vector4a = 64;
+ uint32_t load1, load2, load3, load4;
+ uint32_t p1, p2;
+ uint32_t n1, n2;
+ uint32_t scratch1, scratch2;
+ uint32_t store1, store2;
+ int32_t vector1b, vector2b, vector3b, vector4b;
+ int32_t Temp1, Temp2;
+
+ vector1b = ((const int32_t *)filter_y)[0];
+ vector2b = ((const int32_t *)filter_y)[1];
+ vector3b = ((const int32_t *)filter_y)[2];
+ vector4b = ((const int32_t *)filter_y)[3];
+
+ src -= 3 * src_stride;
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_store(dst + dst_stride);
+ prefetch_store(dst + dst_stride + 32);
+
+ for (x = 0; x < 64; x += 4) {
+ src_ptr = src + x;
+ dst_ptr = dst + x;
+
+ __asm__ __volatile__(
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "mtlo %[vector4a], $ac0 \n\t"
+ "mtlo %[vector4a], $ac1 \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
+
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load1], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load2], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load3], 0(%[src_ptr]) \n\t"
+ "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
+ "ulw %[load4], 0(%[src_ptr]) \n\t"
+
+ "preceu.ph.qbr %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbr %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbr %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbr %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac0, 31 \n\t"
+ "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ "preceu.ph.qbl %[scratch1], %[load1] \n\t"
+ "preceu.ph.qbl %[p1], %[load2] \n\t"
+ "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
+ "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
+ "preceu.ph.qbl %[scratch2], %[load3] \n\t"
+ "preceu.ph.qbl %[p2], %[load4] \n\t"
+ "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
+ "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
+ "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
+ "extp %[Temp1], $ac2, 31 \n\t"
+
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+ "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
+ "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
+ "extp %[Temp2], $ac3, 31 \n\t"
+
+ "sb %[store1], 0(%[dst_ptr]) \n\t"
+ "sb %[store2], 1(%[dst_ptr]) \n\t"
+
+ "lbux %[store1], %[Temp1](%[cm]) \n\t"
+ "lbux %[store2], %[Temp2](%[cm]) \n\t"
+
+ "sb %[store1], 2(%[dst_ptr]) \n\t"
+ "sb %[store2], 3(%[dst_ptr]) \n\t"
+
+ : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
+ [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
+ [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
+ [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
+ [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
+ : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
+ [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
+ [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
+ [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
+ }
+
+ /* Next row... */
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ if (((const int32_t *)filter_y)[0] == 0) {
+ aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ } else {
+ uint32_t pos = 38;
+
+    /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ prefetch_store(dst);
+
+ switch (w) {
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
+ break;
+ case 64:
+ prefetch_store(dst + 32);
+ convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
+
+#endif
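
One non-obvious detail shared by both dispatch functions above is the ((const int32_t *)filter)[0] == 0 test: the eight int16 taps are read two at a time as a 32-bit word, so a zero first word means the two outer taps are both zero and the cheaper 2-tap aom_convolve2_*_dspr2 path can be taken. A portable, hedged sketch of the same check (the helper name is illustrative only):

#include <stdint.h>
#include <string.h>

/* Sketch: true when taps 0 and 1 of an 8-tap filter are both zero. */
static int first_tap_pair_is_zero(const int16_t *filter) {
  int32_t pair;
  memcpy(&pair, filter, sizeof(pair)); /* avoids the type-punning cast used above */
  return pair == 0;
}
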
diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
new file mode 100644
index 000000000..e5d48a884
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
+
+void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h);
+
+void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h);
+
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
new file mode 100644
index 000000000..7c221ae89
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+
+ (void)above;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+ "lb %[tmp9], 8(%[left]) \n\t"
+ "lb %[tmp10], 9(%[left]) \n\t"
+ "lb %[tmp11], 10(%[left]) \n\t"
+ "lb %[tmp12], 11(%[left]) \n\t"
+ "lb %[tmp13], 12(%[left]) \n\t"
+ "lb %[tmp14], 13(%[left]) \n\t"
+ "lb %[tmp15], 14(%[left]) \n\t"
+ "lb %[tmp16], 15(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+ "replv.qb %[tmp9], %[tmp9] \n\t"
+ "replv.qb %[tmp10], %[tmp10] \n\t"
+ "replv.qb %[tmp11], %[tmp11] \n\t"
+ "replv.qb %[tmp12], %[tmp12] \n\t"
+ "replv.qb %[tmp13], %[tmp13] \n\t"
+ "replv.qb %[tmp14], %[tmp14] \n\t"
+ "replv.qb %[tmp15], %[tmp15] \n\t"
+ "replv.qb %[tmp16], %[tmp16] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "sw %[tmp1], 8(%[dst]) \n\t"
+ "sw %[tmp1], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "sw %[tmp2], 8(%[dst]) \n\t"
+ "sw %[tmp2], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "sw %[tmp3], 8(%[dst]) \n\t"
+ "sw %[tmp3], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "sw %[tmp4], 8(%[dst]) \n\t"
+ "sw %[tmp4], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "sw %[tmp5], 8(%[dst]) \n\t"
+ "sw %[tmp5], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "sw %[tmp6], 8(%[dst]) \n\t"
+ "sw %[tmp6], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "sw %[tmp7], 8(%[dst]) \n\t"
+ "sw %[tmp7], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+ "sw %[tmp8], 8(%[dst]) \n\t"
+ "sw %[tmp8], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp9], (%[dst]) \n\t"
+ "sw %[tmp9], 4(%[dst]) \n\t"
+ "sw %[tmp9], 8(%[dst]) \n\t"
+ "sw %[tmp9], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp10], (%[dst]) \n\t"
+ "sw %[tmp10], 4(%[dst]) \n\t"
+ "sw %[tmp10], 8(%[dst]) \n\t"
+ "sw %[tmp10], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp11], (%[dst]) \n\t"
+ "sw %[tmp11], 4(%[dst]) \n\t"
+ "sw %[tmp11], 8(%[dst]) \n\t"
+ "sw %[tmp11], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp12], (%[dst]) \n\t"
+ "sw %[tmp12], 4(%[dst]) \n\t"
+ "sw %[tmp12], 8(%[dst]) \n\t"
+ "sw %[tmp12], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp13], (%[dst]) \n\t"
+ "sw %[tmp13], 4(%[dst]) \n\t"
+ "sw %[tmp13], 8(%[dst]) \n\t"
+ "sw %[tmp13], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp14], (%[dst]) \n\t"
+ "sw %[tmp14], 4(%[dst]) \n\t"
+ "sw %[tmp14], 8(%[dst]) \n\t"
+ "sw %[tmp14], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp15], (%[dst]) \n\t"
+ "sw %[tmp15], 4(%[dst]) \n\t"
+ "sw %[tmp15], 8(%[dst]) \n\t"
+ "sw %[tmp15], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp16], (%[dst]) \n\t"
+ "sw %[tmp16], 4(%[dst]) \n\t"
+ "sw %[tmp16], 8(%[dst]) \n\t"
+ "sw %[tmp16], 12(%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+ [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
+ [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
+ [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
+ [tmp16] "=&r"(tmp16)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, left2;
+
+ __asm__ __volatile__(
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "lw %[above1], 8(%[above]) \n\t"
+ "lw %[above2], 12(%[above]) \n\t"
+ "lw %[left1], 8(%[left]) \n\t"
+ "lw %[left2], 12(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addiu %[average], %[average], 16 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 5 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
+ [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
+ [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
+ [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+#endif // #if HAVE_DSPR2
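
The 16x16 DC predictor above is a long unrolled store loop, but the arithmetic is simply the rounded average of the 16 above and 16 left neighbours, replicated across the block. A minimal C sketch of that computation (illustrative only, not part of the patch):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void dc_predictor_16x16_ref(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  int sum = 16; /* rounding term: half of the 32 contributing samples */
  for (int i = 0; i < 16; ++i) sum += above[i] + left[i];
  const uint8_t dc = (uint8_t)(sum >> 5); /* matches the final srl ..., 5 above */
  for (int r = 0; r < 16; ++r, dst += stride) memset(dst, dc, 16);
}
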
diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
new file mode 100644
index 000000000..0a21979c7
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4;
+ (void)above;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "sw %[tmp1], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+
+ __asm__ __volatile__(
+ "lw %[above_c], (%[above]) \n\t"
+ "lw %[left_c], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l], %[above_c] \n\t"
+ "preceu.ph.qbr %[above_r], %[above_c] \n\t"
+ "preceu.ph.qbl %[left_l], %[left_c] \n\t"
+ "preceu.ph.qbr %[left_r], %[left_c] \n\t"
+
+ "addu.ph %[average], %[above_r], %[above_l] \n\t"
+ "addu.ph %[average], %[average], %[left_l] \n\t"
+ "addu.ph %[average], %[average], %[left_r] \n\t"
+ "addiu %[average], %[average], 4 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 3 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+
+ : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
+ [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
+ [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
+ [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
new file mode 100644
index 000000000..d42a77c80
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ (void)above;
+
+ __asm__ __volatile__(
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+
+ : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
+ [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
+ [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
+ : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
+}
+
+void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
+
+ __asm__ __volatile__(
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "preceu.ph.qbl %[above_l2], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r2], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l2], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r2], %[left2] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l2] \n\t"
+ "addu.ph %[average], %[average], %[above_r2] \n\t"
+ "addu.ph %[average], %[average], %[left_l2] \n\t"
+ "addu.ph %[average], %[average], %[left_r2] \n\t"
+
+ "addiu %[average], %[average], 8 \n\t"
+
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 4 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
+ [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
+ [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
+ [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
+ [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
+ [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
+ [average] "=&r"(average), [tmp] "=&r"(tmp),
+ [expected_dc] "=&r"(expected_dc)
+ : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
+ [stride] "r"(stride));
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c
new file mode 100644
index 000000000..9f25cc1ca
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c
@@ -0,0 +1,550 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/macros_msa.h"
+
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
+ { \
+ out0 = __msa_subs_u_h(out0, in0); \
+ out1 = __msa_subs_u_h(out1, in1); \
+ }
+
+static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t src_data;
+
+ src_data = LW(src);
+
+ SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
+}
+
+static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint32_t src_data1, src_data2;
+
+ src_data1 = LW(src);
+ src_data2 = LW(src + 4);
+
+ for (row = 8; row--;) {
+ SW(src_data1, dst);
+ SW(src_data2, (dst + 4));
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 src0;
+
+ src0 = LD_UB(src);
+
+ for (row = 16; row--;) {
+ ST_UB(src0, dst);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 src1, src2;
+
+ src1 = LD_UB(src);
+ src2 = LD_UB(src + 16);
+
+ for (row = 32; row--;) {
+ ST_UB2(src1, src2, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t out0, out1, out2, out3;
+
+ out0 = src[0] * 0x01010101;
+ out1 = src[1] * 0x01010101;
+ out2 = src[2] * 0x01010101;
+ out3 = src[3] * 0x01010101;
+
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+
+ out0 = src[0] * 0x0101010101010101ull;
+ out1 = src[1] * 0x0101010101010101ull;
+ out2 = src[2] * 0x0101010101010101ull;
+ out3 = src[3] * 0x0101010101010101ull;
+ out4 = src[4] * 0x0101010101010101ull;
+ out5 = src[5] * 0x0101010101010101ull;
+ out6 = src[6] * 0x0101010101010101ull;
+ out7 = src[7] * 0x0101010101010101ull;
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+
+static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint8_t inp0, inp1, inp2, inp3;
+ v16u8 src0, src1, src2, src3;
+
+ for (row = 4; row--;) {
+ inp0 = src[0];
+ inp1 = src[1];
+ inp2 = src[2];
+ inp3 = src[3];
+ src += 4;
+
+ src0 = (v16u8)__msa_fill_b(inp0);
+ src1 = (v16u8)__msa_fill_b(inp1);
+ src2 = (v16u8)__msa_fill_b(inp2);
+ src3 = (v16u8)__msa_fill_b(inp3);
+
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ uint8_t inp0, inp1, inp2, inp3;
+ v16u8 src0, src1, src2, src3;
+
+ for (row = 8; row--;) {
+ inp0 = src[0];
+ inp1 = src[1];
+ inp2 = src[2];
+ inp3 = src[3];
+ src += 4;
+
+ src0 = (v16u8)__msa_fill_b(inp0);
+ src1 = (v16u8)__msa_fill_b(inp1);
+ src2 = (v16u8)__msa_fill_b(inp2);
+ src3 = (v16u8)__msa_fill_b(inp3);
+
+ ST_UB2(src0, src0, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src1, src1, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src2, src2, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src3, src3, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t val0, val1;
+ v16i8 store, src = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LW(src_top);
+ val1 = LW(src_left);
+ INSERT_W2_SB(val0, val1, src);
+ sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t val0;
+ v16i8 store, data = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+
+ val0 = LW(src);
+ data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
+ sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
+ uint32_t out;
+ const v16i8 store = __msa_ldi_b(128);
+
+ out = __msa_copy_u_w((v4i32)store, 0);
+
+ SW4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t val0, val1;
+ v16i8 store;
+ v16u8 src = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LD(src_top);
+ val1 = LD(src_left);
+ INSERT_D2_UB(val0, val1, src);
+ sum_h = __msa_hadd_u_h(src, src);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint64_t val0;
+ v16i8 store;
+ v16u8 data = { 0 };
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ val0 = LD(src);
+ data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
+ sum_h = __msa_hadd_u_h(data, data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
+ store = __msa_splati_b((v16i8)sum_w, 0);
+ val0 = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(val0, val0, val0, val0, dst, dst_stride);
+}
+
+static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
+ uint64_t out;
+ const v16i8 store = __msa_ldi_b(128);
+
+ out = __msa_copy_u_d((v2i64)store, 0);
+
+ SD4(out, out, out, out, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ v16u8 top, left, out;
+ v8u16 sum_h, sum_top, sum_left;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ top = LD_UB(src_top);
+ left = LD_UB(src_left);
+ HADD_UB2_UH(top, left, sum_top, sum_left);
+ sum_h = sum_top + sum_left;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ v16u8 data, out;
+ v8u16 sum_h;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ data = LD_UB(src);
+ sum_h = __msa_hadd_u_h(data, data);
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
+ const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+ dst += (8 * dst_stride);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
+}
+
+static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
+ const uint8_t *src_left, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 top0, top1, left0, left1, out;
+ v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ LD_UB2(src_top, 16, top0, top1);
+ LD_UB2(src_left, 16, left0, left1);
+ HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
+ HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
+ sum_h = sum_top0 + sum_top1;
+ sum_h += sum_left0 + sum_left1;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride) {
+ uint32_t row;
+ v16u8 data0, data1, out;
+ v8u16 sum_h, sum_data0, sum_data1;
+ v4u32 sum_w;
+ v2u64 sum_d;
+
+ LD_UB2(src, 16, data0, data1);
+ HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
+ sum_h = sum_data0 + sum_data1;
+ sum_w = __msa_hadd_u_w(sum_h, sum_h);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
+ sum_d = __msa_hadd_u_d(sum_w, sum_w);
+ sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
+ out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
+ uint32_t row;
+ const v16u8 out = (v16u8)__msa_ldi_b(128);
+
+ for (row = 16; row--;) {
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ ST_UB2(out, out, dst, 16);
+ dst += dst_stride;
+ }
+}
+
+void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_4x4_msa(above, dst, y_stride);
+}
+
+void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_8x8_msa(above, dst, y_stride);
+}
+
+void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_16x16_msa(above, dst, y_stride);
+}
+
+void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_vert_32x32_msa(above, dst, y_stride);
+}
+
+void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_4x4_msa(left, dst, y_stride);
+}
+
+void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_8x8_msa(left, dst, y_stride);
+}
+
+void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_16x16_msa(left, dst, y_stride);
+}
+
+void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_horiz_32x32_msa(left, dst, y_stride);
+}
+
+void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_4x4_msa(above, left, dst, y_stride);
+}
+
+void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_8x8_msa(above, left, dst, y_stride);
+}
+
+void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_16x16_msa(above, left, dst, y_stride);
+}
+
+void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ intra_predict_dc_32x32_msa(above, left, dst, y_stride);
+}
+
+void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
+}
+
+void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
+}
+
+void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
+}
+
+void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+
+ intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
+}
+
+void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
+}
+
+void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
+}
+
+void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
+}
+
+void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+
+ intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
+}
+
+void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_4x4_msa(dst, y_stride);
+}
+
+void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_8x8_msa(dst, y_stride);
+}
+
+void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_16x16_msa(dst, y_stride);
+}
+
+void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+
+ intra_predict_128dc_32x32_msa(dst, y_stride);
+}
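
The MSA horizontal predictors broadcast a single left pixel across a word with a multiply (src[i] * 0x01010101), the same effect the DSPr2 files achieve with replv.qb. A hedged scalar sketch of the 4x4 case (function name is hypothetical):

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Sketch of intra_predict_horiz_4x4: replicate each left pixel across its row. */
static void h_predictor_4x4_ref(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *left) {
  for (int r = 0; r < 4; ++r, dst += stride) {
    const uint32_t row = (uint32_t)left[r] * 0x01010101u; /* byte in all 4 lanes */
    memcpy(dst, &row, sizeof(row));
  }
}
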
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
new file mode 100644
index 000000000..38a10e9b2
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
@@ -0,0 +1,1488 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/mips/loopfilter_msa.h"
+
+int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
+
+void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
+ v16u8 flat, flat2, filter8;
+ v16i8 zero = { 0 };
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+ v8i16 l_out, r_out;
+
+ flat = LD_UB(filter48 + 96);
+
+ LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+ AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ src -= 3 * pitch;
+ ST_UB4(p2, p1, p0, q0, src, pitch);
+ src += (4 * pitch);
+ ST_UB2(q1, q2, src, pitch);
+ } else {
+ src -= 7 * pitch;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+ p5_l_in, p4_l_in);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+ p1_l_in, p0_l_in);
+ q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST_UB(p6, src);
+ src += pitch;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST_UB(p5, src);
+ src += pitch;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
+
+ q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST_UB(p4, src);
+ src += pitch;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST_UB(p3, src);
+ src += pitch;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += pitch;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST_UB(q3, src);
+ src += pitch;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST_UB(q4, src);
+ src += pitch;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST_UB(q5, src);
+ src += pitch;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST_UB(q6, src);
+ }
+}
+
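+/* Two-pass 16-wide horizontal filter: the 4/8-tap pass writes its results
+   into the filter48 scratch buffer and returns nonzero to signal an early
+   exit; only when it returns 0 is the 15-tap (flat2) pass applied. */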
+static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr,
+ int32_t count) {
+ DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
+ uint8_t early_exit = 0;
+
+ (void)count;
+
+ early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
+ limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ aom_hz_lpf_t16_16w(src, pitch, filter48);
+ }
+}
+
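+/* count == 1 filters a single 8-pixel edge with 64-bit scalar stores and
+   progressively wider filters (filter4 -> filter8 -> 15-tap), each gated by
+   its flat mask; any other count is forwarded to the 16-wide dual routine. */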
+static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr, int32_t count) {
+ if (1 == count) {
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ uint64_t dword0, dword1;
+ v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 p0_filter16, p1_filter16;
+ v8i16 p2_filter8, p1_filter8, p0_filter8;
+ v8i16 q0_filter8, q1_filter8, q2_filter8;
+ v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+ v16i8 zero = { 0 };
+ v8u16 tmp0, tmp1, tmp2;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+ q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
+ } else {
+ /* convert 8 bit input data into 16 bit */
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+ zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+ q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
+ q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
+ PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+ /* load 16 vector elements */
+ LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
+ LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
+
+ AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+ SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
+ SD(q1_d, src + pitch);
+ SD(q2_d, src + 2 * pitch);
+ } else {
+ /* LSB(right) 8 pixel operation */
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
+ zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
+ q7_r);
+
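+ /* 15-tap wide filter as a sliding weighted sum: seed the accumulator with
+    7 * p7 + 2 * p6 + p5 + ... + p0 + q0, then slide the window one pixel per
+    output (e.g. + q1 + p5 - p6 - p7 for the p5 result) before a rounding
+    shift right by 4 (__msa_srari_h). */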
+ tmp0 = p7_r << 3;
+ tmp0 -= p7_r;
+ tmp0 += p6_r;
+ tmp0 += q0_r;
+
+ src -= 7 * pitch;
+
+ /* calculation of p6 and p5 */
+ tmp1 = p6_r + p5_r + p4_r + p3_r;
+ tmp1 += (p2_r + p1_r + p0_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp0 = p5_r - p6_r + q1_r - p7_r;
+ tmp1 += tmp0;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p4 and p3 */
+ tmp0 = p4_r - p5_r + q2_r - p7_r;
+ tmp2 = p3_r - p4_r + q3_r - p7_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p2 and p1 */
+ tmp0 = p2_r - p3_r + q4_r - p7_r;
+ tmp2 = p1_r - p2_r + q5_r - p7_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of p0 and q0 */
+ tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
+ tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q1 and q2 */
+ tmp0 = q7_r - q0_r + q1_r - p6_r;
+ tmp2 = q7_r - q1_r + q2_r - p5_r;
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q3 and q4 */
+ tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
+ tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ src += pitch;
+
+ /* calculation of q5 and q6 */
+ tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
+ tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
+ tmp1 += tmp0;
+ p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ tmp1 += tmp2;
+ p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
+ PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
+ p1_filter16);
+ p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
+ p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
+ dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
+ dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
+ SD(dword0, src);
+ src += pitch;
+ SD(dword1, src);
+ }
+ }
+ } else {
+ mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
+ count);
+ }
+}
+
+void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
+}
+
+void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
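+/* Transpose helpers for the vertical filters: pixel columns are staged into
+   a 16-byte-stride scratch buffer so the row-oriented filter kernels can be
+   reused, then transposed back into the frame. */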
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+ uint8_t *output, int32_t out_pitch) {
+ v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+ v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+ LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
+ p1_org, p0_org);
+ /* 8x8 transpose */
+ TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+ p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+ /* 8x8 transpose */
+ ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+ tmp0, tmp1, tmp2, tmp3);
+ ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+ ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+ ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+ ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+ SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+ ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+ output += (8 * out_pitch);
+ ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+ uint8_t *output, int32_t out_pitch) {
+ v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+ LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+ TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+ q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+ ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
+ int32_t out_pitch) {
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+ v4i32 tmp2, tmp3;
+
+ LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+ input += (8 * in_pitch);
+ LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
+
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p7, p6,
+ p5, p4, p3, p2, p1, p0);
+
+ /* transpose 16x8 matrix into 8x16 */
+ /* total 8 intermediate registers and 32 instructions */
+ q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
+ q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
+ q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
+ q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
+ q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
+ q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
+ q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
+ q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
+
+ ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+ tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
+ tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
+
+ ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+ tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
+ tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
+
+ ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+ q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
+ tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
+ q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
+ q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
+ tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
+ q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+ q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
+
+ ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+ output += (8 * out_pitch);
+ ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
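+/* Vertical-edge 4/8-tap pass: operates on the transposed scratch (stride 16),
+   saves the filter8 results and flat mask into filter48, and returns 1 after
+   storing the filter4 output directly to src_org when flat is all zero. */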
+int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
+ uint8_t *src_org, int32_t pitch_org,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v16i8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3;
+
+ /* load vector elements */
+ LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ /* convert 16 bit output data into 8 bit */
+ p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
+ p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
+ p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
+ q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
+ q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
+ q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
+
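+/* Vertical-edge 15-tap pass (8 wide): same sliding-sum scheme as the
+   horizontal version, but it reads and writes the transposed scratch with a
+   stride of 16 and stores 8 bytes per row (ST8x1_UB). */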
+int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+ uint8_t *filter48) {
+ v16i8 zero = { 0 };
+ v16u8 filter8, flat, flat2;
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 tmp0_r, tmp1_r;
+ v8i16 r_out;
+
+ flat = LD_UB(filter48 + 6 * 16);
+
+ LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+ AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ v8i16 vec0, vec1, vec2, vec3, vec4;
+
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+ src_org -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+
+ return 1;
+ } else {
+ src -= 7 * 16;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST8x1_UB(p6, src);
+ src += 16;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST8x1_UB(p5, src);
+ src += 16;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST8x1_UB(p4, src);
+ src += 16;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST8x1_UB(p3, src);
+ src += 16;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST8x1_UB(filter8, src);
+ src += 16;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST8x1_UB(q3, src);
+ src += 16;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST8x1_UB(q4, src);
+ src += 16;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST8x1_UB(q5, src);
+ src += 16;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST8x1_UB(q6, src);
+
+ return 0;
+ }
+}
+
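+/* Vertical 16 filter: transpose a 16x8 tile into the scratch buffer, run the
+   4/8-tap and 15-tap passes there, and transpose the result back only when
+   neither pass took its early exit. */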
+void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint8_t early_exit = 0;
+ DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
+ uint8_t *filter48 = &transposed_input[16 * 16];
+
+ transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
+
+ early_exit =
+ aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
+ &filter48[0]);
+
+ if (0 == early_exit) {
+ transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
+ }
+ }
+}
+
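+/* 16-wide variants of the vertical passes: identical structure to the 8-wide
+   versions above, but both the right (_r) and left (_l) unpacked halves of
+   the 16x16 transposed tile are filtered. */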
+int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
+ uint8_t *src_org, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16i8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+
+ /* load vector elements */
+ LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+ src_org -= 2;
+ ST4x8_UB(vec2, vec3, src_org, pitch);
+ src_org += 8 * pitch;
+ ST4x8_UB(vec4, vec5, src_org, pitch);
+
+ return 1;
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
+ filter48 += (4 * 16);
+ ST_UB2(q1_out, q2_out, filter48, 16);
+ filter48 += (2 * 16);
+ ST_UB(flat, filter48);
+
+ return 0;
+ }
+}
+
+int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
+ uint8_t *filter48) {
+ v16u8 flat, flat2, filter8;
+ v16i8 zero = { 0 };
+ v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
+ v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
+ v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+ v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+ v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
+ v8i16 l_out, r_out;
+
+ flat = LD_UB(filter48 + 6 * 16);
+
+ LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
+ LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
+
+ AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+ if (__msa_test_bz_v(flat2)) {
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+ LD_UB4(filter48, 16, p2, p1, p0, q0);
+ LD_UB2(filter48 + 4 * 16, 16, q1, q2);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+ ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+ src_org -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+ src_org += (4 * pitch);
+ ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
+ ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+
+ return 1;
+ } else {
+ src -= 7 * 16;
+
+ ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
+ p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
+ p2_r_in, p1_r_in, p0_r_in);
+ q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
+
+ tmp0_r = p7_r_in << 3;
+ tmp0_r -= p7_r_in;
+ tmp0_r += p6_r_in;
+ tmp0_r += q0_r_in;
+ tmp1_r = p6_r_in + p5_r_in;
+ tmp1_r += p4_r_in;
+ tmp1_r += p3_r_in;
+ tmp1_r += p2_r_in;
+ tmp1_r += p1_r_in;
+ tmp1_r += p0_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+
+ ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
+ p5_l_in, p4_l_in);
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
+ p1_l_in, p0_l_in);
+ q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
+
+ tmp0_l = p7_l_in << 3;
+ tmp0_l -= p7_l_in;
+ tmp0_l += p6_l_in;
+ tmp0_l += q0_l_in;
+ tmp1_l = p6_l_in + p5_l_in;
+ tmp1_l += p4_l_in;
+ tmp1_l += p3_l_in;
+ tmp1_l += p2_l_in;
+ tmp1_l += p1_l_in;
+ tmp1_l += p0_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
+ ST_UB(p6, src);
+ src += 16;
+
+ /* p5 */
+ q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
+ tmp0_r = p5_r_in - p6_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
+ tmp0_l = p5_l_in - p6_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
+ ST_UB(p5, src);
+ src += 16;
+
+ /* p4 */
+ q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
+ tmp0_r = p4_r_in - p5_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
+ tmp0_l = p4_l_in - p5_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
+ ST_UB(p4, src);
+ src += 16;
+
+ /* p3 */
+ q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
+ tmp0_r = p3_r_in - p4_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
+ tmp0_l = p3_l_in - p4_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
+ ST_UB(p3, src);
+ src += 16;
+
+ /* p2 */
+ q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
+ filter8 = LD_UB(filter48);
+ tmp0_r = p2_r_in - p3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
+ tmp0_l = p2_l_in - p3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* p1 */
+ q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
+ filter8 = LD_UB(filter48 + 16);
+ tmp0_r = p1_r_in - p2_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
+ tmp0_l = p1_l_in - p2_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* p0 */
+ q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
+ filter8 = LD_UB(filter48 + 32);
+ tmp0_r = p0_r_in - p1_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
+ tmp0_l = p0_l_in - p1_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q0 */
+ q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
+ filter8 = LD_UB(filter48 + 48);
+ tmp0_r = q7_r_in - p0_r_in;
+ tmp0_r += q0_r_in;
+ tmp0_r -= p7_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
+ tmp0_l = q7_l_in - p0_l_in;
+ tmp0_l += q0_l_in;
+ tmp0_l -= p7_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q1 */
+ filter8 = LD_UB(filter48 + 64);
+ tmp0_r = q7_r_in - q0_r_in;
+ tmp0_r += q1_r_in;
+ tmp0_r -= p6_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q0_l_in;
+ tmp0_l += q1_l_in;
+ tmp0_l -= p6_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q2 */
+ filter8 = LD_UB(filter48 + 80);
+ tmp0_r = q7_r_in - q1_r_in;
+ tmp0_r += q2_r_in;
+ tmp0_r -= p5_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q1_l_in;
+ tmp0_l += q2_l_in;
+ tmp0_l -= p5_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
+ ST_UB(filter8, src);
+ src += 16;
+
+ /* q3 */
+ tmp0_r = q7_r_in - q2_r_in;
+ tmp0_r += q3_r_in;
+ tmp0_r -= p4_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q2_l_in;
+ tmp0_l += q3_l_in;
+ tmp0_l -= p4_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
+ ST_UB(q3, src);
+ src += 16;
+
+ /* q4 */
+ tmp0_r = q7_r_in - q3_r_in;
+ tmp0_r += q4_r_in;
+ tmp0_r -= p3_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q3_l_in;
+ tmp0_l += q4_l_in;
+ tmp0_l -= p3_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
+ ST_UB(q4, src);
+ src += 16;
+
+ /* q5 */
+ tmp0_r = q7_r_in - q4_r_in;
+ tmp0_r += q5_r_in;
+ tmp0_r -= p2_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q4_l_in;
+ tmp0_l += q5_l_in;
+ tmp0_l -= p2_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
+ ST_UB(q5, src);
+ src += 16;
+
+ /* q6 */
+ tmp0_r = q7_r_in - q5_r_in;
+ tmp0_r += q6_r_in;
+ tmp0_r -= p1_r_in;
+ tmp1_r += tmp0_r;
+ r_out = __msa_srari_h((v8i16)tmp1_r, 4);
+ tmp0_l = q7_l_in - q5_l_in;
+ tmp0_l += q6_l_in;
+ tmp0_l -= p1_l_in;
+ tmp1_l += tmp0_l;
+ l_out = __msa_srari_h((v8i16)tmp1_l, 4);
+ r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
+ q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
+ ST_UB(q6, src);
+
+ return 0;
+ }
+}
+
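+/* Dual vertical 16 filter: full 16x16 transpose into the scratch buffer,
+   both filter passes, and a transpose back unless an early exit was taken. */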
+void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint8_t early_exit = 0;
+ DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
+ uint8_t *filter48 = &transposed_input[16 * 16];
+
+ transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+ early_exit =
+ aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+ pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+ if (0 == early_exit) {
+ early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
+ &filter48[0]);
+
+ if (0 == early_exit) {
+ transpose_16x16(transposed_input, 16, (src - 8), pitch);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
new file mode 100644
index 000000000..dc0a97764
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/loopfilter_msa.h"
+
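+/* 4-tap horizontal filter over 8 pixels: build the mask/hev vectors from
+   p3..q3, apply the 4-tap filter, and write the four updated rows back with
+   64-bit scalar stores. */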
+void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint64_t p1_d, p0_d, q0_d, q1_d;
+ v16u8 mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+}
+
+void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+ thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+ thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+ b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+ b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+ b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+ limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+ limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+ limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+ ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
+}
+
+void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 mask, hev, flat, limit, thresh, b_limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v8i16 vec0, vec1, vec2, vec3;
+
+ LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+ src -= 2;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ src += 4 * pitch;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+}
+
+void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0_ptr,
+ const uint8_t *limit0_ptr,
+ const uint8_t *thresh0_ptr,
+ const uint8_t *b_limit1_ptr,
+ const uint8_t *limit1_ptr,
+ const uint8_t *thresh1_ptr) {
+ v16u8 mask, hev, flat;
+ v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+ LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
+ row14, row15);
+
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+ row9, row10, row11, row12, row13, row14, row15, p3, p2,
+ p1, p0, q0, q1, q2, q3);
+
+ thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
+ thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
+ thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
+
+ b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
+ b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
+ b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
+
+ limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
+ limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
+ limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+ mask, flat);
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+ ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+
+ src -= 2;
+
+ ST4x8_UB(tmp2, tmp3, src, pitch);
+ src += (8 * pitch);
+ ST4x8_UB(tmp4, tmp5, src, pitch);
+}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
new file mode 100644
index 000000000..dc203e79c
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/loopfilter_msa.h"
+
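+/* 8-tap horizontal filter over 8 pixels: the filter4 results are stored when
+   the flat mask is zero; otherwise the filter8 output is blended in with
+   __msa_bmnz_v wherever flat is set. */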
+void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ v16u8 mask, hev, flat, thresh, b_limit, limit;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
+ v16i8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
+ p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
+ q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
+ PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
+
+ p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
+ p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
+ p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
+ q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
+ q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
+ q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
+
+ src -= 3 * pitch;
+
+ SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
+ src += (4 * pitch);
+ SD(q1_d, src);
+ src += pitch;
+ SD(q2_d, src);
+ }
+}
+
+void aom_lpf_horizontal_8_dual_msa(
+ uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+ v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+
+ /* load vector elements */
+ LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh0);
+ tmp = (v16u8)__msa_fill_b(*thresh1);
+ thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);
+
+ b_limit = (v16u8)__msa_fill_b(*b_limit0);
+ tmp = (v16u8)__msa_fill_b(*b_limit1);
+ b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
+
+ limit = (v16u8)__msa_fill_b(*limit0);
+ tmp = (v16u8)__msa_fill_b(*limit1);
+ limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+ AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ src -= 3 * pitch;
+
+ ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+ src += (4 * pitch);
+ ST_UB2(q1_out, q2_out, src, pitch);
+ src += (2 * pitch);
+ }
+}
+
+void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit_ptr,
+ const uint8_t *limit_ptr,
+ const uint8_t *thresh_ptr) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p1_out, p0_out, q0_out, q1_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v16u8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4;
+
+ /* load vector elements */
+ LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+ TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+ b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+ limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+ if (__msa_test_bz_v(flat)) {
+ /* Store 4 pixels p1 to q1 */
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+ src -= 2;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ src += 4 * pitch;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+ p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ /* Store 6 pixels p2 to q2 */
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+ src -= 3;
+ ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec4, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec4, 4, src + 4, pitch);
+ }
+}
+
+void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+ const uint8_t *b_limit0, const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *b_limit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ uint8_t *temp_src;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 p1_out, p0_out, q0_out, q1_out;
+ v16u8 flat, mask, hev, thresh, b_limit, limit;
+ v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+ v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+ v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+ v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+ v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+ v16u8 zero = { 0 };
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+ temp_src = src - 4;
+
+ LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+ temp_src += (8 * pitch);
+ LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+ /* transpose 16x8 matrix into 8x16 */
+ TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+ row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+ q3);
+
+ thresh = (v16u8)__msa_fill_b(*thresh0);
+ vec0 = (v8i16)__msa_fill_b(*thresh1);
+ thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
+
+ b_limit = (v16u8)__msa_fill_b(*b_limit0);
+ vec0 = (v8i16)__msa_fill_b(*b_limit1);
+ b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
+
+ limit = (v16u8)__msa_fill_b(*limit0);
+ vec0 = (v8i16)__msa_fill_b(*limit1);
+ limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
+
+ /* mask and hev */
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+ mask, flat);
+ /* flat4 */
+ AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+ /* filter4 */
+ AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+ if (__msa_test_bz_v(flat)) {
+ ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+ ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+ src -= 2;
+ ST4x8_UB(vec2, vec3, src, pitch);
+ src += 8 * pitch;
+ ST4x8_UB(vec4, vec5, src, pitch);
+ } else {
+ ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
+ q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+ p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+ ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+ ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+
+ /* filter8 */
+ AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+ p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+ /* convert 16 bit output data into 8 bit */
+ PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+ p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+ p0_filt8_r, q0_filt8_r);
+ PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+ q2_filt8_r);
+
+ /* store pixel values */
+ p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+ p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+ p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+ q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+ q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+ q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+ ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec3, vec4);
+ ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
+ ILVRL_H2_SH(vec1, vec0, vec6, vec7);
+ ILVRL_B2_SH(q2, q1, vec2, vec5);
+
+ src -= 3;
+ ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec2, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec2, 4, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec5, 0, src + 4, pitch);
+ src += (4 * pitch);
+ ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
+ ST2x4_UB(vec5, 4, src + 4, pitch);
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
new file mode 100644
index 000000000..8c41278be
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
+  /* loop filter designed to work using chars so that we can make maximum use
+     of 8-bit SIMD instructions. */
+ for (i = 0; i < 2; i++) {
+ sm1 = s - (pitch << 2);
+ s0 = sm1 + pitch;
+ s1 = s0 + pitch;
+ s2 = s - pitch;
+ s3 = s;
+ s4 = s + pitch;
+ s5 = s4 + pitch;
+ s6 = s5 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p1], (%[s1]) \n\t"
+ "lw %[p2], (%[s2]) \n\t"
+ "lw %[p3], (%[s3]) \n\t"
+ "lw %[p4], (%[s4]) \n\t"
+
+ : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ mask will be zero and filtering is not needed */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ __asm__ __volatile__(
+ "lw %[pm1], (%[sm1]) \n\t"
+ "lw %[p0], (%[s0]) \n\t"
+ "lw %[p5], (%[s5]) \n\t"
+ "lw %[p6], (%[s6]) \n\t"
+
+ : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
+ : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
+
+ filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+ p6, thresh_vec, &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ __asm__ __volatile__(
+ "sw %[p1], (%[s1]) \n\t"
+ "sw %[p2], (%[s2]) \n\t"
+ "sw %[p3], (%[s3]) \n\t"
+ "sw %[p4], (%[s4]) \n\t"
+
+ :
+ : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
+ [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+ [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+ p6, thresh_vec, &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        /* write the processed 4x4 neighborhood back byte by byte;
+         * the output is not transposed with vector ops because
+         * the memory isn't aligned
+         */
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+
+ : [p1] "+r"(p1)
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s2] "r"(s2));
+
+ __asm__ __volatile__(
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+ :);
+
+ __asm__ __volatile__(
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+
+ :
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [s1] "r"(s1));
+ }
+ }
+ }
+}
+
+void aom_lpf_horizontal_4_dual_dspr2(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_horizontal_8_dual_dspr2(
+ uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
+ const uint8_t *limit1, const uint8_t *thresh1) {
+ aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
+ aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
new file mode 100644
index 000000000..28f0dc35a
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
@@ -0,0 +1,736 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* inputs & outputs are quad-byte vectors */
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
+ uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
+ int32_t aom_filter_l, aom_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
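+  /* N128 toggles the sign bit of each byte (unsigned <-> signed pixels);
+     t1, t2 and t3 pack the rounding constants 3, 4 and 1 into the high byte
+     of each halfword, and HWM masks those high bytes */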
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__(
+ /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* aom_filter &= hev; */
+ "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
+ "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
+
+ /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+
+ /* aom_filter &= mask; */
+ "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
+ "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
+
+ : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__(
+      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
+ "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
+
+      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
+ "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+ [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
+
+ __asm__ __volatile__(
+ /* (aom_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* aom_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__(
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *ps0 = vps0 ^ N128;
+ *ps1 = vps1 ^ N128;
+ *qs0 = vqs0 ^ N128;
+ *qs1 = vqs1 ^ N128;
+}
+
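+/* same as filter_dspr2(), but takes the pixels by value and returns the
+ * filter4 results in separate _f0 outputs so they can later be blended
+ * with the flat-filter results */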
+static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
+ uint32_t ps0, uint32_t qs0, uint32_t qs1,
+ uint32_t *p1_f0, uint32_t *p0_f0,
+ uint32_t *q0_f0, uint32_t *q1_f0) {
+ int32_t aom_filter_l, aom_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (ps0) ^ N128;
+ vps1 = (ps1) ^ N128;
+ vqs0 = (qs0) ^ N128;
+ vqs1 = (qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__(
+ /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* aom_filter &= hev; */
+ "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
+ "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
+
+ /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+
+ /* aom_filter &= mask; */
+ "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
+ "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
+
+ : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
+ [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+ [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+ : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+ [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+ [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+ [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+ [HWM] "r"(HWM));
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__(
+      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
+ "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
+
+      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
+ "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+ [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+ [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+ [vqs0_r] "+r"(vqs0_r)
+ : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+ [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
+
+ __asm__ __volatile__(
+ /* (aom_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* aom_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
+ [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
+ [vqs1_r] "+r"(vqs1_r)
+ : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__(
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
+ [vqs0_r] "+r"(vqs0_r)
+ :);
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *p0_f0 = vps0 ^ N128;
+ *p1_f0 = vps1 ^ N128;
+ *q0_f0 = vqs0 ^ N128;
+ *q1_f0 = vqs1 ^ N128;
+}
+
+static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
+ uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__(
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
+ [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
+ [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
+
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+}
+
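+/* mbfilter_dspr2() variant that takes the pixels by value and writes the
+ * 7-tap results to separate _f1 outputs for later blending */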
+static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3, uint32_t *op2_f1,
+ uint32_t *op1_f1, uint32_t *op0_f1,
+ uint32_t *oq0_f1, uint32_t *oq1_f1,
+ uint32_t *oq2_f1) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__(
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
+ [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
+ [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
+
+ *op2_f1 = res_op2;
+ *op1_f1 = res_op1;
+ *op0_f1 = res_op0;
+ *oq0_f1 = res_oq0;
+ *oq1_f1 = res_oq1;
+ *oq2_f1 = res_oq2;
+}
+
+static INLINE void wide_mbfilter_dspr2(
+ uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
+ uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
+ uint32_t *oq7) {
+ const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+ uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
+ uint32_t tmp;
+ uint32_t add_p6toq6;
+ uint32_t u32Eight = 0x00080008;
+
+ __asm__ __volatile__(
+      /* sum of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6,
+         which is reused by most of the computations below */
+ "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
+
+ : [add_p6toq6] "=&r"(add_p6toq6)
+ : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
+ [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
+ [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
+ [u32Eight] "r"(u32Eight));
+
+ __asm__ __volatile__(
+ /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
+ p3 + p2 + p1 + p0 + q0, 4) */
+ "shll.ph %[tmp], %[p7], 3 \n\t"
+ "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
+ "shrl.ph %[res_op6], %[res_op6], 4 \n\t"
+
+ /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
+ p2 + p1 + p0 + q0 + q1, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
+ "shrl.ph %[res_op5], %[res_op5], 4 \n\t"
+
+ /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
+ p1 + p0 + q0 + q1 + q2, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
+ "shrl.ph %[res_op4], %[res_op4], 4 \n\t"
+
+ /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
+ p1 + p0 + q0 + q1 + q2 + q3, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
+ "shrl.ph %[res_op3], %[res_op3], 4 \n\t"
+
+ /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
+ p0 + q0 + q1 + q2 + q3 + q4, 4) */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 4 \n\t"
+
+ /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+ p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 4 \n\t"
+
+ /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
+ "addu.ph %[res_op0], %[p7], %[p0] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
+
+ : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
+ [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
+ [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
+ [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
+ [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
+ [add_p6toq6] "r"(add_p6toq6));
+
+ *op6 = res_op6;
+ *op5 = res_op5;
+ *op4 = res_op4;
+ *op3 = res_op3;
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+
+ __asm__ __volatile__(
+ /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+ q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
+ "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t"
+
+ /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+ q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq1], %[tmp], %[q1] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t"
+
+ /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+ q3 + q4 + q5 + q6 + q7 * 3, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t"
+
+ /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
+ q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq3], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t"
+ "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t"
+
+ /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
+ q4 * 2 + q5 + q6 + q7 * 5, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq4], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t"
+ "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t"
+
+ /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
+ q5 * 2 + q6 + q7 * 6, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq5], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t"
+ "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t"
+
+ /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
+ q4 + q5 + q6 * 2 + q7 * 7, 4) */
+ "shll.ph %[tmp], %[q7], 3 \n\t"
+ "subu.ph %[res_oq6], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
+ "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
+
+ : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
+ [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
+ [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
+ [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
+ : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
+ [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
+ [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
+ [add_p6toq6] "r"(add_p6toq6));
+
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+ *oq3 = res_oq3;
+ *oq4 = res_oq4;
+ *oq5 = res_oq5;
+ *oq6 = res_oq6;
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
new file mode 100644
index 000000000..62295d69d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
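+/* STORE_F0/F1/F2 scatter the filtered output registers back to the four rows
+ * s1..s4 one byte at a time (4, 6 and 14 pixels per row, respectively) */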
+#define STORE_F0() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s4]) \n\t" \
+ "sb %[q0_f0], 0(%[s4]) \n\t" \
+ "sb %[p0_f0], -1(%[s4]) \n\t" \
+ "sb %[p1_f0], -2(%[s4]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s3]) \n\t" \
+ "sb %[q0_f0], 0(%[s3]) \n\t" \
+ "sb %[p0_f0], -1(%[s3]) \n\t" \
+ "sb %[p1_f0], -2(%[s3]) \n\t" \
+ \
+ : [p1_f0] "+r"(p1_f0) \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \
+ [p0_f0] "r"(p0_f0)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s2]) \n\t" \
+ "sb %[q0_f0], 0(%[s2]) \n\t" \
+ "sb %[p0_f0], -1(%[s2]) \n\t" \
+ "sb %[p1_f0], -2(%[s2]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
+ [p1_f0] "+r"(p1_f0) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q1_f0], 1(%[s1]) \n\t" \
+ "sb %[q0_f0], 0(%[s1]) \n\t" \
+ "sb %[p0_f0], -1(%[s1]) \n\t" \
+ "sb %[p1_f0], -2(%[s1]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
+ [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \
+ }
+
+#define STORE_F1() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ \
+ : \
+ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
+ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ \
+ : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \
+ [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ \
+ : \
+ : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
+ [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ \
+ : \
+ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
+ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ \
+ : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \
+ [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ \
+ : \
+ : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
+ [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
+ }
+
+#define STORE_F2() \
+ { \
+ __asm__ __volatile__( \
+ "sb %[q6_r], 6(%[s4]) \n\t" \
+ "sb %[q5_r], 5(%[s4]) \n\t" \
+ "sb %[q4_r], 4(%[s4]) \n\t" \
+ "sb %[q3_r], 3(%[s4]) \n\t" \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ "sb %[p3_r], -4(%[s4]) \n\t" \
+ "sb %[p4_r], -5(%[s4]) \n\t" \
+ "sb %[p5_r], -6(%[s4]) \n\t" \
+ "sb %[p6_r], -7(%[s4]) \n\t" \
+ \
+ : \
+ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
+ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
+ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
+ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
+ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q6_r], %[q6_r], 16 \n\t" \
+ "srl %[q5_r], %[q5_r], 16 \n\t" \
+ "srl %[q4_r], %[q4_r], 16 \n\t" \
+ "srl %[q3_r], %[q3_r], 16 \n\t" \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ "srl %[p3_r], %[p3_r], 16 \n\t" \
+ "srl %[p4_r], %[p4_r], 16 \n\t" \
+ "srl %[p5_r], %[p5_r], 16 \n\t" \
+ "srl %[p6_r], %[p6_r], 16 \n\t" \
+ \
+ : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
+ [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
+ [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
+ [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
+ [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_r], 6(%[s3]) \n\t" \
+ "sb %[q5_r], 5(%[s3]) \n\t" \
+ "sb %[q4_r], 4(%[s3]) \n\t" \
+ "sb %[q3_r], 3(%[s3]) \n\t" \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ "sb %[p3_r], -4(%[s3]) \n\t" \
+ "sb %[p4_r], -5(%[s3]) \n\t" \
+ "sb %[p5_r], -6(%[s3]) \n\t" \
+ "sb %[p6_r], -7(%[s3]) \n\t" \
+ \
+ : \
+ : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
+ [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
+ [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
+ [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
+ [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_l], 6(%[s2]) \n\t" \
+ "sb %[q5_l], 5(%[s2]) \n\t" \
+ "sb %[q4_l], 4(%[s2]) \n\t" \
+ "sb %[q3_l], 3(%[s2]) \n\t" \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ "sb %[p3_l], -4(%[s2]) \n\t" \
+ "sb %[p4_l], -5(%[s2]) \n\t" \
+ "sb %[p5_l], -6(%[s2]) \n\t" \
+ "sb %[p6_l], -7(%[s2]) \n\t" \
+ \
+ : \
+ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
+ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
+ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
+ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
+ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \
+ \
+ __asm__ __volatile__( \
+ "srl %[q6_l], %[q6_l], 16 \n\t" \
+ "srl %[q5_l], %[q5_l], 16 \n\t" \
+ "srl %[q4_l], %[q4_l], 16 \n\t" \
+ "srl %[q3_l], %[q3_l], 16 \n\t" \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ "srl %[p3_l], %[p3_l], 16 \n\t" \
+ "srl %[p4_l], %[p4_l], 16 \n\t" \
+ "srl %[p5_l], %[p5_l], 16 \n\t" \
+ "srl %[p6_l], %[p6_l], 16 \n\t" \
+ \
+ : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
+ [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
+ [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
+ [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
+ [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \
+ :); \
+ \
+ __asm__ __volatile__( \
+ "sb %[q6_l], 6(%[s1]) \n\t" \
+ "sb %[q5_l], 5(%[s1]) \n\t" \
+ "sb %[q4_l], 4(%[s1]) \n\t" \
+ "sb %[q3_l], 3(%[s1]) \n\t" \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ "sb %[p3_l], -4(%[s1]) \n\t" \
+ "sb %[p4_l], -5(%[s1]) \n\t" \
+ "sb %[p5_l], -6(%[s1]) \n\t" \
+ "sb %[p6_l], -7(%[s1]) \n\t" \
+ \
+ : \
+ : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
+ [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
+ [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
+ [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
+ [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \
+ }
+
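+/* the PACK macros zero-extend the upper/lower byte pairs of each quad-byte
+ * register into halfword pairs (preceu.ph.qbl/qbr); the COMBINE macros pack
+ * them back into quad-bytes with precr.qb.ph */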
+#define PACK_LEFT_0TO3() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
+ "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
+ "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
+ "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
+ "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
+ "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
+ "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
+ "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
+ \
+ : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
+ [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
+ [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \
+ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
+ }
+
+#define PACK_LEFT_4TO7() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
+ "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
+ "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
+ "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
+ "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
+ "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
+ "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
+ "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
+ \
+ : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
+ [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
+ [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
+ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
+ }
+
+#define PACK_RIGHT_0TO3() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
+ "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
+ "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
+ "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
+ "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
+ "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
+ "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
+ "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
+ \
+ : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
+ [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
+ [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \
+ : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
+ }
+
+#define PACK_RIGHT_4TO7() \
+ { \
+ __asm__ __volatile__( \
+ "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
+ "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
+ "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
+ "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
+ "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
+ "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
+ "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
+ "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
+ \
+ : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
+ [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
+ [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \
+ : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
+ [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
+ }
+
+#define COMBINE_LEFT_RIGHT_0TO2() \
+ { \
+ __asm__ __volatile__( \
+ "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
+ "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
+ "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
+ "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
+ "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
+ "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
+ \
+ : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
+ [q1] "=&r"(q1), [q2] "=&r"(q2) \
+ : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \
+ [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \
+ [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \
+ [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \
+ }
+
+#define COMBINE_LEFT_RIGHT_3TO6() \
+ { \
+ __asm__ __volatile__( \
+ "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
+ "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
+ "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
+ "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
+ "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
+ "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
+ "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
+ "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
+ \
+ : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
+ [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \
+ [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \
+ [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \
+ [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \
+ [q6_r] "r"(q6_r)); \
+ }
+
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
new file mode 100644
index 000000000..a0f57f386
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* processing 4 pixels at the same time
+ * compute hev and mask in the same function */
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+ uint32_t p1, uint32_t p0, uint32_t p3,
+ uint32_t p2, uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t thresh, uint32_t *hev,
+ uint32_t *mask) {
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
+
+ __asm__ __volatile__(
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh));
+
+ __asm__ __volatile__(
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
+
+ *hev = hev1;
+ *mask = s2;
+}
+
+static INLINE void filter_hev_mask_flatmask4_dspr2(
+ uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
+ uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
+ uint32_t c, r, r3, r_k, r_flat;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t hev1;
+ uint32_t flat1;
+
+ __asm__ __volatile__(
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ * flat |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ * flat |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+      /* possible pipeline stall here: the wrdsp below depends on the sll result */
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
+ [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
+ : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+ [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+ [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
+
+ __asm__ __volatile__(
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
+ [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
+ : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
+ [ones] "r"(ones), [flimit] "r"(flimit));
+
+ *hev = hev1;
+ *mask = s2;
+ *flat = flat1;
+}
+
+static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t q4, uint32_t *flat2) {
+ uint32_t c, r, r_k, r_flat;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t flat1, flat3;
+
+ __asm__ __volatile__(
+ /* flat |= (abs(p4 - p0) > thresh) */
+ "subu_s.qb %[c], %[p4], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* flat |= (abs(q4 - q0) > thresh) */
+ "subu_s.qb %[c], %[q4], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+ "wrdsp %[r] \n\t"
+ "pick.qb %[flat3], $0, %[ones] \n\t"
+
+ /* flat |= (abs(p1 - p0) > thresh) */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* flat |= (abs(q1 - q0) > thresh) */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+ /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
+ "and %[flat1], %[flat3], %[flat1] \n\t"
+
+ : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
+ [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
+ : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
+ [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
+ [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
+
+ *flat2 = flat1;
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
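
As a reading aid for the packed qb arithmetic above: a minimal scalar sketch of what one byte lane of filter_hev_mask_flatmask4_dspr2() computes, assuming a hypothetical helper name and ignoring the saturating subu_s.qb/addu_s.qb arithmetic the DSPr2 code uses, could look like this. Each flag comes back as 0xFF or 0x00 per column, matching the pick.qb results.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar model of one byte lane of
 * filter_hev_mask_flatmask4_dspr2(); the real code evaluates four
 * columns of these comparisons in parallel. */
static void hev_mask_flat_scalar(uint8_t limit, uint8_t blimit, uint8_t thresh,
                                 uint8_t p3, uint8_t p2, uint8_t p1,
                                 uint8_t p0, uint8_t q0, uint8_t q1,
                                 uint8_t q2, uint8_t q3, uint8_t *hev,
                                 uint8_t *mask, uint8_t *flat) {
  int m = abs(p3 - p2) > limit;
  m |= abs(p2 - p1) > limit;
  m |= abs(p1 - p0) > limit;
  m |= abs(q1 - q0) > limit;
  m |= abs(q2 - q1) > limit;
  m |= abs(q3 - q2) > limit;
  m |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  *mask = m ? 0x00 : 0xFF; /* 0xFF: this column may be filtered */

  int h = abs(p1 - p0) > thresh;
  h |= abs(q1 - q0) > thresh;
  *hev = h ? 0xFF : 0x00; /* 0xFF: high edge variance */

  int f = abs(p1 - p0) > 1;
  f |= abs(q1 - q0) > 1;
  f |= abs(p2 - p0) > 1;
  f |= abs(q2 - q0) > 1;
  f |= abs(p3 - p0) > 1;
  f |= abs(q3 - q0) > 1;
  *flat = f ? 0x00 : 0xFF; /* 0xFF: flat segment, the wide filter is allowed */
}

Because all four columns are evaluated at once, the callers below can test mask, flat and hev one byte lane at a time when deciding what to store.
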
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
new file mode 100644
index 000000000..b67ccfe9d
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint32_t mask;
+ uint32_t hev, flat;
+ uint8_t i;
+ uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
+ for (i = 0; i < 2; i++) {
+ sp3 = s - (pitch << 2);
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
+ : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__(
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1));
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__(
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
+ [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
+ [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if ((flat != 0) && (mask != 0)) {
+ /* filtering */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
+ [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
+ [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
+ [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat != 0) && (mask != 0)) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
+ [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
+ [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
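
The byte-wise sb stores in aom_lpf_horizontal_8_dspr2() and aom_lpf_vertical_8_dspr2() are easier to follow as a per-column decision: when mask and flat disagree across the four packed columns, each column independently takes either the mbfilter output or the 4-tap output, or is left alone. A rough scalar sketch (hypothetical helper; the real code also writes p2/q2 for the flat case and keeps the packed results shifted into position with srl) might be:

#include <stdint.h>

/* For byte lane 'col' (0..3) of the packed mask/flat words, decide which
 * filter output the store cascade above writes for that column. */
typedef enum { STORE_NONE, STORE_F0, STORE_MBFILTER } store_choice;

static store_choice column_store(uint32_t mask, uint32_t flat, int col) {
  const uint32_t lane = 0xFFu << (8 * col);
  if (mask & flat & lane) return STORE_MBFILTER; /* p2..q2 from mbfilter_dspr2 */
  if (mask & lane) return STORE_F0;              /* p1..q1 from filter1_dspr2  */
  return STORE_NONE;                             /* column left untouched      */
}
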
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
new file mode 100644
index 000000000..34733e42e
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int count) {
+ uint32_t mask;
+ uint32_t hev, flat, flat2;
+ uint8_t i;
+ uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
+ uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ /* prefetch data for store */
+ prefetch_store(s);
+
+ for (i = 0; i < (2 * count); i++) {
+ sp7 = s - (pitch << 3);
+ sp6 = sp7 + pitch;
+ sp5 = sp6 + pitch;
+ sp4 = sp5 + pitch;
+ sp3 = sp4 + pitch;
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+ sq4 = sq3 + pitch;
+ sq5 = sq4 + pitch;
+ sq6 = sq5 + pitch;
+ sq7 = sq6 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p7], (%[sp7]) \n\t"
+ "lw %[p6], (%[sp6]) \n\t"
+ "lw %[p5], (%[sp5]) \n\t"
+ "lw %[p4], (%[sp4]) \n\t"
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
+ : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
+
+ __asm__ __volatile__(
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+ "lw %[q4], (%[sq4]) \n\t"
+ "lw %[q5], (%[sq5]) \n\t"
+ "lw %[q6], (%[sq6]) \n\t"
+ "lw %[q7], (%[sq7]) \n\t"
+
+ : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
+ [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
+ : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
+ [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__(
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1));
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+ (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+ COMBINE_LEFT_RIGHT_3TO6()
+
+ __asm__ __volatile__(
+ "sw %[p6], (%[sp6]) \n\t"
+ "sw %[p5], (%[sp5]) \n\t"
+ "sw %[p4], (%[sp4]) \n\t"
+ "sw %[p3], (%[sp3]) \n\t"
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+
+ :
+ : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
+ [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
+ [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sw %[q6], (%[sq6]) \n\t"
+ "sw %[q5], (%[sq5]) \n\t"
+ "sw %[q4], (%[sq4]) \n\t"
+ "sw %[q3], (%[sq3]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+
+ :
+ : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
+ [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
+ [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
+ [sq1] "r"(sq1), [sq0] "r"(sq0));
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__(
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
+ [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
+ [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
+ [sq1] "r"(sq1), [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 + f2 */
+ /* f0 function */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* f1 function */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
+ &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
+ &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ /* f2 function */
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p6_r], (%[sp6]) \n\t"
+ "sb %[p5_r], (%[sp5]) \n\t"
+ "sb %[p4_r], (%[sp4]) \n\t"
+ "sb %[p3_r], (%[sp3]) \n\t"
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
+ [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+ "sb %[q3_r], (%[sq3]) \n\t"
+ "sb %[q4_r], (%[sq4]) \n\t"
+ "sb %[q5_r], (%[sq5]) \n\t"
+ "sb %[q6_r], (%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], (%[sp2]) \n\t"
+ "sb %[p1_r_f1], (%[sp1]) \n\t"
+ "sb %[p0_r_f1], (%[sp0]) \n\t"
+ "sb %[q0_r_f1], (%[sq0]) \n\t"
+ "sb %[q1_r_f1], (%[sq1]) \n\t"
+ "sb %[q2_r_f1], (%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
+ [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
+ [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
+ [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
+ [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
+ [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p6_r], +1(%[sp6]) \n\t"
+ "sb %[p5_r], +1(%[sp5]) \n\t"
+ "sb %[p4_r], +1(%[sp4]) \n\t"
+ "sb %[p3_r], +1(%[sp3]) \n\t"
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+ "sb %[q3_r], +1(%[sq3]) \n\t"
+ "sb %[q4_r], +1(%[sq4]) \n\t"
+ "sb %[q5_r], +1(%[sq5]) \n\t"
+ "sb %[q6_r], +1(%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], +1(%[sp2]) \n\t"
+ "sb %[p1_r_f1], +1(%[sp1]) \n\t"
+ "sb %[p0_r_f1], +1(%[sp0]) \n\t"
+ "sb %[q0_r_f1], +1(%[sq0]) \n\t"
+ "sb %[q1_r_f1], +1(%[sq1]) \n\t"
+ "sb %[q2_r_f1], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], +2(%[sp6]) \n\t"
+ "sb %[p5_l], +2(%[sp5]) \n\t"
+ "sb %[p4_l], +2(%[sp4]) \n\t"
+ "sb %[p3_l], +2(%[sp3]) \n\t"
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+ "sb %[q3_l], +2(%[sq3]) \n\t"
+ "sb %[q4_l], +2(%[sq4]) \n\t"
+ "sb %[q5_l], +2(%[sq5]) \n\t"
+ "sb %[q6_l], +2(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
+ [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], +2(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +2(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +2(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +2(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +2(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
+ [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
+ [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
+ [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
+ [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
+ [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], +3(%[sp6]) \n\t"
+ "sb %[p5_l], +3(%[sp5]) \n\t"
+ "sb %[p4_l], +3(%[sp4]) \n\t"
+ "sb %[p3_l], +3(%[sp3]) \n\t"
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
+ [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+ "sb %[q3_l], +3(%[sq3]) \n\t"
+ "sb %[q4_l], +3(%[sq4]) \n\t"
+ "sb %[q5_l], +3(%[sq5]) \n\t"
+ "sb %[q6_l], +3(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
+ [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], +3(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +3(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +3(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +3(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +3(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
+ [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
+ [sq2] "r"(sq2));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
+ [sq0] "r"(sq0), [sq1] "r"(sq1));
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
+}
+
+void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
+}
+#endif // #if HAVE_DSPR2
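
With flat2 added, the store cascade in mb_lpf_horizontal_edge() reduces, per column, to a three-way filter choice. A hedged scalar sketch, with hypothetical names and without the byte shifting the packed code performs, could be:

#include <stdint.h>

/* Per-column filter selection implied by the mask/flat/flat2 tests in
 * mb_lpf_horizontal_edge(). */
typedef enum { LF_NONE, LF_F0, LF_F1, LF_F2 } lf_level;

static lf_level pick_level(uint32_t mask, uint32_t flat, uint32_t flat2,
                           int col) {
  const uint32_t lane = 0xFFu << (8 * col);
  if (mask & flat & flat2 & lane) return LF_F2; /* wide filter, uses p7..q7 */
  if (mask & flat & lane) return LF_F1;         /* mbfilter, uses p3..q3    */
  if (mask & lane) return LF_F0;                /* 4-tap filter1            */
  return LF_NONE;                               /* pixels left unchanged    */
}
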
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
new file mode 100644
index 000000000..3d3f1ec97
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
@@ -0,0 +1,758 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat, flat2;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__(
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+ [limit_vec] "=r"(limit_vec)
+ : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+ prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__(
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[p4], -8(%[s1]) \n\t"
+ "lw %[p5], -8(%[s2]) \n\t"
+ "lw %[p6], -8(%[s3]) \n\t"
+ "lw %[p7], -8(%[s4]) \n\t"
+
+ : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
+ [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+ "lw %[q7], +4(%[s1]) \n\t"
+ "lw %[q6], +4(%[s2]) \n\t"
+ "lw %[q5], +4(%[s3]) \n\t"
+ "lw %[q4], +4(%[s4]) \n\t"
+
+ : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
+ [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
+ : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
+ [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
+ [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose p7, p6, p5, p4
+ original (when loaded from memory)
+ register -8 -7 -6 -5
+ p4 p4_0 p4_1 p4_2 p4_3
+ p5 p5_0 p5_1 p5_2 p5_3
+ p6 p6_0 p6_1 p6_2 p6_3
+ p7 p7_0 p7_1 p7_2 p7_3
+
+ after transpose
+ register
+ p4 p7_3 p6_3 p5_3 p4_3
+ p5 p7_2 p6_2 p5_2 p4_2
+ p6 p7_1 p6_1 p5_1 p4_1
+ p7 p7_0 p6_0 p5_0 p4_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
+ "precr.qb.ph %[prim4], %[p6], %[p7] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p4], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p6], %[p7], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p7], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
+ [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ /* transpose q4, q5, q6, q7
+ original (when loaded from memory)
+ register +5 +6 +7 +8
+ q7 q7_0 q7_1 q7_2 q7_3
+ q6 q6_0 q6_1 q6_2 q6_3
+ q5 q5_0 q5_1 q5_2 q5_3
+ q4 q4_0 q4_1 q4_2 q4_3
+
+ after transpose
+ register
+ q7 q4_3 q5_3 q6_3 q7_3
+ q6 q4_2 q5_2 q6_2 q7_2
+ q5 q4_1 q5_1 q6_1 q7_1
+ q4 q4_0 q5_0 q6_0 q7_0
+ */
+ __asm__ __volatile__(
+ "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
+ "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
+ "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
+ "precr.qb.ph %[prim4], %[q5], %[q4] \n\t"
+
+ "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q7], %[q6], %[sec3] \n\t"
+ "precrq.ph.w %[q5], %[q4], %[sec4] \n\t"
+ "append %[q6], %[sec3], 16 \n\t"
+ "append %[q4], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+ [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
+ [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+ :);
+
+ filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
+ p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
+
+ flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+ } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+ (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ STORE_F2()
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
+ [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
+ [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
+ [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
+ [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1+f2 */
+ filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ PACK_LEFT_0TO3()
+ mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
+ &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ PACK_RIGHT_0TO3()
+ mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
+ &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ PACK_LEFT_4TO7()
+ wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
+ &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
+ &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
+ &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
+ &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p6_r], -7(%[s4]) \n\t"
+ "sb %[p5_r], -6(%[s4]) \n\t"
+ "sb %[p4_r], -5(%[s4]) \n\t"
+ "sb %[p3_r], -4(%[s4]) \n\t"
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [s4] "r"(s4));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+ "sb %[q3_r], +3(%[s4]) \n\t"
+ "sb %[q4_r], +4(%[s4]) \n\t"
+ "sb %[q5_r], +5(%[s4]) \n\t"
+ "sb %[q6_r], +6(%[s4]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [s4] "r"(s4));
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], -3(%[s4]) \n\t"
+ "sb %[p1_r_f1], -2(%[s4]) \n\t"
+ "sb %[p0_r_f1], -1(%[s4]) \n\t"
+ "sb %[q0_r_f1], (%[s4]) \n\t"
+ "sb %[q1_r_f1], +1(%[s4]) \n\t"
+ "sb %[q2_r_f1], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s4] "r"(s4));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
+ [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
+ [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
+ [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
+ [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
+ [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
+ [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p6_r], -7(%[s3]) \n\t"
+ "sb %[p5_r], -6(%[s3]) \n\t"
+ "sb %[p4_r], -5(%[s3]) \n\t"
+ "sb %[p3_r], -4(%[s3]) \n\t"
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+
+ :
+ : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
+ [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
+ [p0_r] "r"(p0_r), [s3] "r"(s3));
+
+ __asm__ __volatile__(
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+ "sb %[q3_r], +3(%[s3]) \n\t"
+ "sb %[q4_r], +4(%[s3]) \n\t"
+ "sb %[q5_r], +5(%[s3]) \n\t"
+ "sb %[q6_r], +6(%[s3]) \n\t"
+
+ :
+ : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
+ [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
+ [q6_r] "r"(q6_r), [s3] "r"(s3));
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p2_r_f1], -3(%[s3]) \n\t"
+ "sb %[p1_r_f1], -2(%[s3]) \n\t"
+ "sb %[p0_r_f1], -1(%[s3]) \n\t"
+ "sb %[q0_r_f1], (%[s3]) \n\t"
+ "sb %[q1_r_f1], +1(%[s3]) \n\t"
+ "sb %[q2_r_f1], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
+ [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
+ [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s3] "r"(s3));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], -7(%[s2]) \n\t"
+ "sb %[p5_l], -6(%[s2]) \n\t"
+ "sb %[p4_l], -5(%[s2]) \n\t"
+ "sb %[p3_l], -4(%[s2]) \n\t"
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [s2] "r"(s2));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+ "sb %[q3_l], +3(%[s2]) \n\t"
+ "sb %[q4_l], +4(%[s2]) \n\t"
+ "sb %[q5_l], +5(%[s2]) \n\t"
+ "sb %[q6_l], +6(%[s2]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [s2] "r"(s2));
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], -3(%[s2]) \n\t"
+ "sb %[p1_l_f1], -2(%[s2]) \n\t"
+ "sb %[p0_l_f1], -1(%[s2]) \n\t"
+ "sb %[q0_l_f1], (%[s2]) \n\t"
+ "sb %[q1_l_f1], +1(%[s2]) \n\t"
+ "sb %[q2_l_f1], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s2] "r"(s2));
+ }
+
+ __asm__ __volatile__(
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
+ [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
+ [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
+ [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
+ [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
+ :);
+
+ __asm__ __volatile__(
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
+ [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
+ [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
+ [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
+ [q1_f0] "+r"(q1_f0)
+ :);
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p6_l], -7(%[s1]) \n\t"
+ "sb %[p5_l], -6(%[s1]) \n\t"
+ "sb %[p4_l], -5(%[s1]) \n\t"
+ "sb %[p3_l], -4(%[s1]) \n\t"
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+
+ :
+ : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
+ [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
+ [p0_l] "r"(p0_l), [s1] "r"(s1));
+
+ __asm__ __volatile__(
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], 1(%[s1]) \n\t"
+ "sb %[q2_l], 2(%[s1]) \n\t"
+ "sb %[q3_l], 3(%[s1]) \n\t"
+ "sb %[q4_l], 4(%[s1]) \n\t"
+ "sb %[q5_l], 5(%[s1]) \n\t"
+ "sb %[q6_l], 6(%[s1]) \n\t"
+
+ :
+ : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
+ [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
+ [q6_l] "r"(q6_l), [s1] "r"(s1));
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p2_l_f1], -3(%[s1]) \n\t"
+ "sb %[p1_l_f1], -2(%[s1]) \n\t"
+ "sb %[p0_l_f1], -1(%[s1]) \n\t"
+ "sb %[q0_l_f1], (%[s1]) \n\t"
+ "sb %[q1_l_f1], +1(%[s1]) \n\t"
+ "sb %[q2_l_f1], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
+ [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
+ [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__(
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
+ [q1_f0] "r"(q1_f0), [s1] "r"(s1));
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h
new file mode 100644
index 000000000..54b0bb4bd
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/loopfilter_msa.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
+
+#include "aom_dsp/mips/macros_msa.h"
+
+#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
+ p1_out, p0_out, q0_out, q1_out) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
+ v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
+ v8i16 q0_sub_p0_r, filt_r, cnst3h; \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt = filt & (v16i8)hev_in; \
+ q0_sub_p0 = q0_m - p0_m; \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ \
+ cnst3h = __msa_ldi_h(3); \
+ q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
+ filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
+ filt_r += q0_sub_p0_r; \
+ filt_r = __msa_sat_s_h(filt_r, 7); \
+ \
+ /* combine left and right part */ \
+ filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
+ \
+ filt = filt & (v16i8)mask_in; \
+ cnst4b = __msa_ldi_b(4); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 >>= 3; \
+ \
+ cnst3b = __msa_ldi_b(3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 >>= 3; \
+ \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
+ \
+ filt = __msa_srari_b(filt1, 1); \
+ hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+ }
+
+#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
+ p1_out, p0_out, q0_out, q1_out) \
+ { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
+ v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
+ v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
+ \
+ p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
+ p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
+ q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
+ q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
+ \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q0_sub_p0 = q0_m - p0_m; \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ \
+ cnst3h = __msa_ldi_h(3); \
+ q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
+ filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
+ filt_r += q0_sub_p0_r; \
+ filt_r = __msa_sat_s_h(filt_r, 7); \
+ \
+ q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
+ q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
+ filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
+ filt_l += q0_sub_p0_l; \
+ filt_l = __msa_sat_s_h(filt_l, 7); \
+ \
+ filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
+ filt = filt & (v16i8)mask_in; \
+ \
+ cnst4b = __msa_ldi_b(4); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 >>= 3; \
+ \
+ cnst3b = __msa_ldi_b(3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 >>= 3; \
+ \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
+ \
+ filt = __msa_srari_b(filt1, 1); \
+ hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
+ filt = filt & (v16i8)hev_in; \
+ \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
+ }
+
+#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
+ { \
+ v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
+ v16u8 zero_in = { 0 }; \
+ \
+ tmp_flat4 = __msa_ori_b(zero_in, 1); \
+ p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
+ q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
+ p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
+ q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
+ \
+ p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
+ flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
+ p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
+ flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
+ \
+ flat_out = (tmp_flat4 < (v16u8)flat_out); \
+ flat_out = __msa_xori_b(flat_out, 0xff); \
+ flat_out = flat_out & (mask); \
+ }
+
+#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
+ q6_in, q7_in, flat_in, flat2_out) \
+ { \
+ v16u8 tmp_flat5, zero_in = { 0 }; \
+ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
+ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
+ \
+ tmp_flat5 = __msa_ori_b(zero_in, 1); \
+ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
+ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
+ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
+ q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
+ p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
+ q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
+ p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
+ q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
+ \
+ p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
+ p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
+ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
+ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
+ \
+ flat2_out = (tmp_flat5 < (v16u8)flat2_out); \
+ flat2_out = __msa_xori_b(flat2_out, 0xff); \
+ flat2_out = flat2_out & flat_in; \
+ }
+
+#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+ q1_filt8_out, q2_filt8_out) \
+ { \
+ v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
+ \
+ tmp_filt8_2 = p2_in + p1_in + p0_in; \
+ tmp_filt8_0 = p3_in << 1; \
+ \
+ tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \
+ tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \
+ p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \
+ p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = q2_in + q1_in + q0_in; \
+ tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \
+ tmp_filt8_0 = tmp_filt8_2 + (p0_in); \
+ tmp_filt8_0 = tmp_filt8_0 + (p3_in); \
+ p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \
+ \
+ tmp_filt8_0 = q2_in + q3_in; \
+ tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \
+ tmp_filt8_1 = q3_in + q3_in; \
+ tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \
+ q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_0 = tmp_filt8_2 + q3_in; \
+ tmp_filt8_1 = tmp_filt8_0 + q0_in; \
+ q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ \
+ tmp_filt8_1 = tmp_filt8_0 - p2_in; \
+ tmp_filt8_0 = q1_in + q3_in; \
+ tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \
+ q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
+ }
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
+ flat_out) \
+ { \
+ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
+ p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
+ p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
+ q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
+ q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
+ q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
+ p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
+ p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
+ \
+ /* calculation of hev */ \
+ flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = thresh_in < (v16u8)flat_out; \
+ \
+ /* calculation of mask */ \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m >>= 1; \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
+ \
+ mask_out = b_limit_in < p0_asub_q0_m; \
+ mask_out = __msa_max_u_b(flat_out, mask_out); \
+ p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
+ \
+ mask_out = limit_in < (v16u8)mask_out; \
+ mask_out = __msa_xori_b(mask_out, 0xff); \
+ }
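+
+/* Illustrative usage sketch (hypothetical helper, not from the AOM sources):
+   LPF_MASK_HEV derives, for 16 pixels at once, the high-edge-variance mask,
+   the overall filter mask and the first flatness mask from the eight pixels
+   straddling the edge.  Guarded out so nothing is added to the header. */
+#if 0
+static void lpf_masks_example(v16u8 p3, v16u8 p2, v16u8 p1, v16u8 p0, v16u8 q0,
+                              v16u8 q1, v16u8 q2, v16u8 q3, v16u8 limit,
+                              v16u8 b_limit, v16u8 thresh, v16u8 *hev_ptr,
+                              v16u8 *mask_ptr, v16u8 *flat_ptr) {
+  v16u8 hev, mask, flat;
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  *hev_ptr = hev;
+  *mask_ptr = mask;
+  *flat_ptr = flat;
+}
+#endif
+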
+#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
new file mode 100644
index 000000000..9bfc27147
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/macros_msa.h
@@ -0,0 +1,2058 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_
+#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+
+#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+
+#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
+#if (__mips_isa_rev >= 6)
+#define LH(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ \
+ val0_m = LW(psrc_m); \
+ val1_m = LW(psrc_m + 4); \
+ \
+ val_m = (uint64_t)(val1_m); \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
+ \
+ val_m; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SW(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SD(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint64_t val_m = (val); \
+ \
+ __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+#else // !(__mips_isa_rev >= 6)
+#define LH(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint16_t val_m; \
+ \
+ __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+
+#if (__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \
+ \
+ : [val_m] "=r"(val_m) \
+ : [psrc_m] "m"(*psrc_m)); \
+ \
+ val_m; \
+ })
+#else // !(__mips == 64)
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m_combined = 0; \
+ \
+ val0_m = LW(psrc_m1); \
+ val1_m = LW(psrc_m1 + 4); \
+ \
+ val_m_combined = (uint64_t)(val1_m); \
+ val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
+ val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \
+ \
+ val_m_combined; \
+ })
+#endif // (__mips == 64)
+
+#define SH(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint16_t val_m = (val); \
+ \
+ __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SW(val, pdst) \
+ { \
+ uint8_t *pdst_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \
+ \
+ : [pdst_m] "=m"(*pdst_m) \
+ : [val_m] "r"(val_m)); \
+ }
+
+#define SD(val, pdst) \
+ { \
+ uint8_t *pdst_m1 = (uint8_t *)(pdst); \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ SW(val0_m, pdst_m1); \
+ SW(val1_m, pdst_m1 + 4); \
+ }
+#endif // (__mips_isa_rev >= 6)
+
+/* Description : Load 4 words with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1, out2, out3
+ Details : Load word in 'out0' from (psrc)
+ Load word in 'out1' from (psrc + stride)
+ Load word in 'out2' from (psrc + 2 * stride)
+ Load word in 'out3' from (psrc + 3 * stride)
+*/
+#define LW4(psrc, stride, out0, out1, out2, out3) \
+ { \
+ out0 = LW((psrc)); \
+ out1 = LW((psrc) + stride); \
+ out2 = LW((psrc) + 2 * stride); \
+ out3 = LW((psrc) + 3 * stride); \
+ }
+
+/* Description : Load double words with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Details : Load double word in 'out0' from (psrc)
+ Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) \
+ { \
+ out0 = LD((psrc)); \
+ out1 = LD((psrc) + stride); \
+ }
+#define LD4(psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD2((psrc), stride, out0, out1); \
+ LD2((psrc) + 2 * stride, stride, out2, out3); \
+ }
+
+/* Description : Store 4 words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store word from 'in0' to (pdst)
+ Store word from 'in1' to (pdst + stride)
+ Store word from 'in2' to (pdst + 2 * stride)
+ Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride) \
+ { \
+ SW(in0, (pdst)) \
+ SW(in1, (pdst) + stride); \
+ SW(in2, (pdst) + 2 * stride); \
+ SW(in3, (pdst) + 3 * stride); \
+ }
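+
+/* Illustrative usage sketch (hypothetical helper, not from the AOM sources):
+   LW4/SW4 pair naturally for copying a 4x4 byte block one 32-bit row at a
+   time.  Guarded out so it is never compiled. */
+#if 0
+static void copy_4x4_example(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride) {
+  uint32_t row0, row1, row2, row3;
+
+  /* one unaligned 32-bit load per source row */
+  LW4(src, src_stride, row0, row1, row2, row3);
+  /* one unaligned 32-bit store per destination row */
+  SW4(row0, row1, row2, row3, dst, dst_stride);
+}
+#endif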
+
+/* Description : Store 4 double words with stride
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Store double word from 'in0' to (pdst)
+ Store double word from 'in1' to (pdst + stride)
+ Store double word from 'in2' to (pdst + 2 * stride)
+ Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) \
+ { \
+ SD(in0, (pdst)) \
+ SD(in1, (pdst) + stride); \
+ SD(in2, (pdst) + 2 * stride); \
+ SD(in3, (pdst) + 3 * stride); \
+ }
+
+/* Description : Load vectors with 16 byte elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Load 16 byte elements in 'out0' from (psrc)
+ Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_B(RTYPE, (psrc)); \
+ out1 = LD_B(RTYPE, (psrc) + stride); \
+ }
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
+ { \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
+ }
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_B2(RTYPE, (psrc), stride, out0, out1); \
+ LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+
+#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
+ { \
+ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
+ }
+#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+
+#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
+ { \
+ LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
+ LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
+ }
+#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+
+#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7) \
+ { \
+ LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+ }
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
+/* Description : Load vectors with 8 halfword elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Details : Load 8 halfword elements in 'out0' from (psrc)
+ Load 8 halfword elements in 'out1' from (psrc + stride)
+*/
+#define LD_H2(RTYPE, psrc, stride, out0, out1) \
+ { \
+ out0 = LD_H(RTYPE, (psrc)); \
+ out1 = LD_H(RTYPE, (psrc) + (stride)); \
+ }
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+ { \
+ LD_H2(RTYPE, (psrc), stride, out0, out1); \
+ LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+ }
+#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+
+#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7) \
+ { \
+ LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
+ LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
+ }
+#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
+
+#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7, out8, out9, out10, out11, out12, out13, out14, out15) \
+ { \
+ LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \
+ out7); \
+ LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
+ out13, out14, out15); \
+ }
+#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+
+/* Description : Load 4x4 block of signed halfword elements from 1D source
+ data into 4 vectors (Each vector with 4 signed halfwords)
+ Arguments : Input - psrc
+ Outputs - out0, out1, out2, out3
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3) \
+ { \
+ out0 = LD_SH(psrc); \
+ out2 = LD_SH(psrc + 8); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ }
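+
+/* Illustrative usage sketch (hypothetical helper, not from the AOM sources):
+   LD4x4_SH reads 16 contiguous int16_t values (a row-major 4x4 block) and
+   leaves one row in the low four halfwords of each output vector.  Guarded
+   out so it is never compiled. */
+#if 0
+static int32_t ld4x4_sh_example(const int16_t *coeff) {
+  v8i16 in0, in1, in2, in3;
+
+  LD4x4_SH(coeff, in0, in1, in2, in3);
+  /* e.g. the DC coefficient is element 0 of the first row */
+  return __msa_copy_s_h(in0, 0);
+}
+#endif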
+
+/* Description : Load 2 vectors of signed word elements with stride
+ Arguments : Inputs - psrc, stride
+ Outputs - out0, out1
+ Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) \
+ { \
+ out0 = LD_SW((psrc)); \
+ out1 = LD_SW((psrc) + stride); \
+ }
+
+/* Description : Store vectors of 16 byte elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 16 byte elements from 'in0' to (pdst)
+ Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_B(RTYPE, in0, (pdst)); \
+ ST_B(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_B2(RTYPE, in0, in1, (pdst), stride); \
+ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+ { \
+ ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
+ ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+ }
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
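+
+/* Illustrative usage sketch (hypothetical helper, not from the AOM sources):
+   the LD_UB8/ST_UB8 pair moves eight 16-byte rows at a time, e.g. copying a
+   16x8 pixel block.  Guarded out so it is never compiled. */
+#if 0
+static void copy_16x8_example(const uint8_t *src, int32_t src_stride,
+                              uint8_t *dst, int32_t dst_stride) {
+  v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
+
+  /* one full 16-byte vector per row */
+  LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
+  ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
+}
+#endif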
+
+/* Description : Store vectors of 8 halfword elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 8 halfword elements from 'in0' to (pdst)
+ Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+ { \
+ ST_H(RTYPE, in0, (pdst)); \
+ ST_H(RTYPE, in1, (pdst) + stride); \
+ }
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
+ { \
+ ST_H2(RTYPE, in0, in1, (pdst), stride); \
+ ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+ }
+#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
+
+#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
+ { \
+ ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
+ ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
+ }
+#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Store 4 word elements from 'in0' to (pdst)
+ Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) \
+ { \
+ ST_SW(in0, (pdst)); \
+ ST_SW(in1, (pdst) + stride); \
+ }
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+ Arguments : Inputs - in, stidx, pdst, stride
+ Details : Index 'stidx' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst)
+ Index 'stidx+1' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + stride)
+ Index 'stidx+2' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 2 * stride)
+ Index 'stidx+3' halfword element from 'in' vector is copied to
+ the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) \
+ { \
+ uint16_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
+ out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
+ out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
+ out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
+ \
+ SH(out0_m, pblk_2x4_m); \
+ SH(out1_m, pblk_2x4_m + stride); \
+ SH(out2_m, pblk_2x4_m + 2 * stride); \
+ SH(out3_m, pblk_2x4_m + 3 * stride); \
+ }
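+
+/* Illustrative usage sketch (hypothetical helper, not from the AOM sources):
+   ST2x4_UB scatters four halfword (2-byte) elements of a vector across four
+   rows, e.g. writing a 2-wide, 4-tall pixel block.  Guarded out so it is
+   never compiled. */
+#if 0
+static void st2x4_example(v16u8 pix, uint8_t *dst, int32_t stride) {
+  /* halfword elements 0..3 of 'pix' go to rows 0..3 of the 2x4 block */
+  ST2x4_UB(pix, 0, dst, stride);
+}
+#endif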
+
+/* Description : Store 4x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 word element from 'in' vector is copied to the GP
+ register and stored to (pdst)
+ Index 1 word element from 'in' vector is copied to the GP
+ register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) \
+ { \
+ uint32_t out0_m, out1_m; \
+ uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in, 0); \
+ out1_m = __msa_copy_u_w((v4i32)in, 1); \
+ \
+ SW(out0_m, pblk_4x2_m); \
+ SW(out1_m, pblk_4x2_m + stride); \
+ }
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : 'Idx0' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst)
+ 'Idx1' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + stride)
+ 'Idx2' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ 'Idx3' word element from input vector 'in0' is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
+ { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
+ out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
+ out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
+ out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
+ \
+ SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
+ }
+#define ST4x8_UB(in0, in1, pdst, stride) \
+ { \
+ uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
+ \
+ ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
+ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
+ }
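+
+/* Illustrative usage sketch (hypothetical helper, not from the AOM sources):
+   ST4x8_UB writes a 4-wide, 8-tall block: words 0..3 of 'in0' become rows
+   0..3 and words 0..3 of 'in1' become rows 4..7.  Guarded out so it is never
+   compiled. */
+#if 0
+static void st4x8_example(v16u8 top_rows, v16u8 bottom_rows, uint8_t *dst,
+                          int32_t stride) {
+  ST4x8_UB(top_rows, bottom_rows, dst, stride);
+}
+#endif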
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) \
+ { \
+ uint64_t out0_m; \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ SD(out0_m, pdst); \
+ }
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+ Arguments : Inputs - in, pdst, stride
+ Details : Index 0 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in' vector is copied to the
+ GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) \
+ { \
+ uint64_t out0_m, out1_m; \
+ uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in, 1); \
+ \
+ SD(out0_m, pblk_8x2_m); \
+ SD(out1_m, pblk_8x2_m + stride); \
+ }
+
+/* Description : Store 8x4 byte block to destination memory from input
+ vectors
+ Arguments : Inputs - in0, in1, pdst, stride
+ Details : Index 0 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst)
+ Index 1 double word element from 'in0' vector is copied to the
+ GP register and stored to (pdst + stride)
+ Index 0 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 2 * stride)
+ Index 1 double word element from 'in1' vector is copied to the
+ GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) \
+ { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)in0, 0); \
+ out1_m = __msa_copy_u_d((v2i64)in0, 1); \
+ out2_m = __msa_copy_u_d((v2i64)in1, 0); \
+ out3_m = __msa_copy_u_d((v2i64)in1, 1); \
+ \
+ SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+ }
+
+/* Description : Average with rounding (in0 + in1 + 1) / 2.
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from 'in0' vector is added to the
+ corresponding unsigned byte element from 'in1' vector. Then the
+ average with rounding is calculated and written to 'out0'
+*/
+#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
+ out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
+ }
+#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
+
+#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
+ }
+#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
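+
+/* Illustrative usage sketch (hypothetical helper, not from the AOM sources):
+   a typical use of the rounding average is blending two predictors, two
+   16-pixel rows at a time.  Guarded out so it is never compiled. */
+#if 0
+static void avg_pred_2rows_example(const uint8_t *pred, const uint8_t *ref,
+                                   uint8_t *dst, int32_t stride) {
+  v16u8 p0, p1, r0, r1, avg0, avg1;
+
+  LD_UB2(pred, stride, p0, p1);
+  LD_UB2(ref, stride, r0, r1);
+  /* per byte lane: avg = (a + b + 1) >> 1 */
+  AVER_UB2_UB(p0, r0, p1, r1, avg0, avg1);
+  ST_UB2(avg0, avg1, dst, stride);
+}
+#endif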
+
+/* Description : Immediate number of elements to slide with zero
+ Arguments : Inputs - in0, in1, slide_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'zero_m' vector are slid into 'in0' by
+ value specified in the 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
+ }
+#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
+
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
+ slide_val) \
+ { \
+ SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
+ SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
+ }
+#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
+
+/* Description : Immediate number of elements to slide
+ Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
+ value specified in the 'slide_val'
+*/
+#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ { \
+ out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
+ out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
+ }
+#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
+ out2, slide_val) \
+ { \
+ SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
+ out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
+ }
+#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+ Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Byte elements from 'in0' & 'in1' are copied selectively to
+ 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+ }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
+ out3) \
+ { \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
+ VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
+ }
+#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
+#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Unsigned byte elements from 'mult0' are multiplied with
+ unsigned byte elements from 'cnst0' producing a result
+ twice the size of input i.e. unsigned halfword.
+ The multiplication results of adjacent odd-even element pairs
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
+ }
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
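+
+/* Illustrative usage sketch (hypothetical names, not from the AOM sources):
+   DOTP_UB2_UH implements a 2-tap weighted sum, where each output halfword is
+   px[2i] * wt[2i] + px[2i + 1] * wt[2i + 1].  Guarded out so it is never
+   compiled. */
+#if 0
+static v8u16 two_tap_example(v16u8 px0, v16u8 px1, v16u8 wt) {
+  v8u16 sum0, sum1;
+
+  /* sum0 from px0, sum1 from px1, both against the same weight vector */
+  DOTP_UB2_UH(px0, px1, wt, wt, sum0, sum1);
+  return sum0 + sum1;
+}
+#endif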
+
+/* Description : Dot product of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+ The multiplication results of adjacent odd-even element pairs
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
+ }
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+ The multiplication results of adjacent odd-even element pairs
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
+ }
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed word elements from 'mult0' are multiplied with
+ signed word elements from 'cnst0' producing a result
+ twice the size of input i.e. signed double word.
+ The multiplication results of adjacent odd-even element pairs
+ are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
+ }
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed byte elements from 'mult0' are multiplied with
+ signed byte elements from 'cnst0' producing a result
+ twice the size of input i.e. signed halfword.
+ The multiplication results of adjacent odd-even element pairs
+ are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
+ }
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
+ cnst3, out0, out1, out2, out3) \
+ { \
+ DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
+ DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
+ }
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+ Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'mult0' are multiplied with
+ signed halfword elements from 'cnst0' producing a result
+ twice the size of input i.e. signed word.
+ The multiplication results of adjacent odd-even element pairs
+ are added to the 'out0' vector
+*/
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
+ }
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product & addition of double word vector elements
+ Arguments : Inputs - mult0, mult1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each signed word element from 'mult0' is multiplied with itself
+ producing an intermediate result twice the size of input
+ i.e. signed double word
+ The multiplication results of adjacent odd-even element pairs
+ are added to the 'out0' vector
+*/
+#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
+ out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
+ }
+#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
+
+/* Description : Minimum of unsigned halfword elements from each input vector
+ and 'min_vec', written back in place
+ Arguments : Inputs - in0, in1, min_vec
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : The minimum of each unsigned halfword element from 'in0' and the
+ corresponding element of 'min_vec' is written back to 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) \
+ { \
+ in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
+ in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
+ }
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
+ { \
+ MIN_UH2(RTYPE, in0, in1, min_vec); \
+ MIN_UH2(RTYPE, in2, in3, min_vec); \
+ }
+#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+ between 0 & 255
+ Arguments : Input - in
+ Output - out_m
+ Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in) \
+ ({ \
+ v8i16 max_m = __msa_ldi_h(255); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h((v8i16)in, 0); \
+ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+ out_m; \
+ })
+#define CLIP_SH2_0_255(in0, in1) \
+ { \
+ in0 = CLIP_SH_0_255(in0); \
+ in1 = CLIP_SH_0_255(in1); \
+ }
+#define CLIP_SH4_0_255(in0, in1, in2, in3) \
+ { \
+ CLIP_SH2_0_255(in0, in1); \
+ CLIP_SH2_0_255(in2, in3); \
+ }
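A quick scalar reference for the clip macros above (illustrative sketch; the helper name is hypothetical):

#include <stdint.h>

/* Scalar reference for CLIP_SH_0_255: clamp a signed halfword into the
   unsigned byte range used for pixel reconstruction. */
static int16_t clip_sh_0_255_ref(int16_t v) {
  if (v < 0) return 0;
  if (v > 255) return 255;
  return v;
}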
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+ Arguments : Input - in (signed word vector)
+ Output - sum_m (i32 sum)
+ Return Type - signed word (GP)
+ Details : 4 signed word elements of 'in' vector are added together and
+ the resulting integer sum is returned
+*/
+#define HADD_SW_S32(in) \
+ ({ \
+ v2i64 res0_m, res1_m; \
+ int32_t sum_m; \
+ \
+ res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
+ res1_m = __msa_splati_d(res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
+ sum_m; \
+ })
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+ Arguments : Inputs - in (unsigned halfword vector)
+ Outputs - sum_m (u32 sum)
+ Return Type - unsigned word
+ Details : 8 unsigned halfword elements of input vector are added
+ together and the resulting integer sum is returned
+*/
+#define HADD_UH_U32(in) \
+ ({ \
+ v4u32 res_m; \
+ v2u64 res0_m, res1_m; \
+ uint32_t sum_m; \
+ \
+ res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
+ res0_m = __msa_hadd_u_d(res_m, res_m); \
+ res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
+ res0_m = res0_m + res1_m; \
+ sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
+ sum_m; \
+ })
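A scalar sketch of the reduction performed by HADD_UH_U32 (illustrative only; the helper name is made up):

#include <stdint.h>

/* Scalar reference: reduce the 8 unsigned 16-bit lanes of a vector to a
   single 32-bit sum, mirroring the hadd_u_w -> hadd_u_d -> copy_u_w chain. */
static uint32_t hadd_uh_u32_ref(const uint16_t lanes[8]) {
  uint32_t sum = 0;
  for (int i = 0; i < 8; ++i) sum += lanes[i];
  return sum;
}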
+
+/* Description : Horizontal addition of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each unsigned odd byte element from 'in0' is added to
+ even unsigned byte element from 'in0' (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
+ }
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ HADD_UB2(RTYPE, in0, in1, out0, out1); \
+ HADD_UB2(RTYPE, in2, in3, out2, out3); \
+ }
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each even unsigned byte element of 'in0' is subtracted from
+ the adjacent odd unsigned byte element (pairwise) and the
+ halfword result is written to 'out0'
+*/
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
+ }
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+
+/* Description : SAD (Sum of Absolute Difference)
+ Arguments : Inputs - in0, in1, ref0, ref1
+ Outputs - sad_m (halfword vector)
+ Return Type - unsigned halfword
+ Details : Absolute differences of the byte elements of 'in0' vs 'ref0'
+ and of 'in1' vs 'ref1' are computed, and the even-odd pairs of
+ each difference vector are accumulated into 8 halfword sums.
+*/
+#define SAD_UB2_UH(in0, in1, ref0, ref1) \
+ ({ \
+ v16u8 diff0_m, diff1_m; \
+ v8u16 sad_m = { 0 }; \
+ \
+ diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \
+ diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \
+ \
+ sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
+ sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
+ \
+ sad_m; \
+ })
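For reference, a plain-C sketch of what SAD_UB2_UH accumulates (illustrative only; the helper name is hypothetical):

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for SAD_UB2_UH: absolute byte differences of two 16-byte
   rows against their references, accumulated pairwise into 8 unsigned
   halfword bins (bin i collects bytes 2*i and 2*i+1). */
static void sad_ub2_uh_ref(const uint8_t in0[16], const uint8_t in1[16],
                           const uint8_t ref0[16], const uint8_t ref1[16],
                           uint16_t sad[8]) {
  for (int i = 0; i < 8; ++i) {
    sad[i] = (uint16_t)(abs(in0[2 * i] - ref0[2 * i]) +
                        abs(in0[2 * i + 1] - ref0[2 * i + 1]) +
                        abs(in1[2 * i] - ref1[2 * i]) +
                        abs(in1[2 * i + 1] - ref1[2 * i + 1]));
  }
}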
+
+/* Description : Horizontal subtraction of signed halfword vector elements
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Each even signed halfword element of 'in0' is subtracted from
+ the adjacent odd signed halfword element (pairwise) and the
+ word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
+ }
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
+/* Description : Set element n of input vector to GPR value
+ Arguments : Inputs - in0, in1, in2, in3
+ Output - out
+ Return Type - as per RTYPE
+ Details : Set element 0 in vector 'out' to value specified in 'in0'
+*/
+#define INSERT_W2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ }
+#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
+
+#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
+ { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
+ }
+#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
+#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
+
+#define INSERT_D2(RTYPE, in0, in1, out) \
+ { \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
+ }
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+ }
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
+ }
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
+ }
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
+ out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
+ }
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of byte elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
+ }
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
+#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Left half of word elements of 'in0' and 'in1' are interleaved
+ and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
+ }
+#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements of 'in0' and 'in1' are interleaved
+ and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
+ }
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
+ in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \
+ out5, out6, out7) \
+ { \
+ ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3); \
+ ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \
+ out6, out7); \
+ }
+#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
+
+/* Description : Interleave right half of halfword elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of halfword elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
+ }
+#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of double word elements of 'in0' and 'in1' are
+ interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
+ out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
+ }
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
+ }
+#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+ Arguments : Inputs - in0, in1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Right half of byte elements from 'in0' and 'in1' are
+ interleaved and written to 'out0', while the left halves are
+ interleaved and written to 'out1'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+ }
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+ }
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
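A scalar reference for the byte-interleave pair ILVRL_B2, assuming little-endian MSA lane numbering where element 0 is the right-most/lowest lane (illustrative only; the helper name is made up):

#include <stdint.h>

/* Scalar reference for ILVRL_B2: the right (low) halves of the two byte
   vectors are interleaved into out0 and the left (high) halves into out1. */
static void ilvrl_b2_ref(const int8_t in0[16], const int8_t in1[16],
                         int8_t out0[16], int8_t out1[16]) {
  for (int i = 0; i < 8; ++i) {
    out0[2 * i] = in1[i];      /* right halves interleaved */
    out0[2 * i + 1] = in0[i];
    out1[2 * i] = in1[i + 8];  /* left halves interleaved */
    out1[2 * i + 1] = in0[i + 8];
  }
}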
+
+/* Description : Saturate the halfword element values to the max
+ unsigned value of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range.
+ The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+ }
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_UH2(RTYPE, in0, in1, sat_val); \
+ SAT_UH2(RTYPE, in2, in3, sat_val); \
+ }
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the signed
+ value range of (sat_val + 1) bits
+ The element data width remains unchanged
+ Arguments : Inputs - in0, in1, sat_val
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is saturated to the
+ value generated with (sat_val + 1) bit range
+ The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) \
+ { \
+ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+ in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+ }
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
+ { \
+ SAT_SH2(RTYPE, in0, in1, sat_val); \
+ SAT_SH2(RTYPE, in2, in3, sat_val); \
+ }
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+ elements in output vector
+ Arguments : Inputs - in, idx0, idx1
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : 'idx0' element value from 'in' vector is replicated to all
+ elements in 'out0' vector
+ Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
+ out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
+ }
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
+ { \
+ SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
+ SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
+ }
+#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
+#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even byte elements of 'in0' are copied to the left half of
+ 'out0' & even byte elements of 'in1' are copied to the right
+ half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
+ }
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even halfword elements of 'in0' are copied to the left half of
+ 'out0' & even halfword elements of 'in1' are copied to the
+ right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
+ }
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Even double word elements of 'in0' are copied to the left half
+ of 'out0' & even double word elements of 'in1' are copied to
+ the right half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
+ out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
+ }
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
+
+/* Description : Each byte element is logically xor'ed with immediate 128
+ Arguments : Inputs - in0, in1
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each unsigned byte element from input vector 'in0' is
+ logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) \
+ { \
+ in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
+ in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
+ }
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
+ }
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
+ { \
+ XORI_B2_128(RTYPE, in0, in1); \
+ XORI_B2_128(RTYPE, in2, in3); \
+ }
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
+ { \
+ XORI_B4_128(RTYPE, in0, in1, in2, in3); \
+ XORI_B3_128(RTYPE, in4, in5, in6); \
+ }
+#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
+
+/* Description : Average of signed halfword elements -> (a + b) / 2
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each signed halfword element from 'in0' is added to each
+ signed halfword element of 'in1' with full precision resulting
+ in one extra bit in the result. The result is then divided by
+ 2 and written to 'out0'
+*/
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
+ out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
+ out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
+ }
+#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Addition of signed halfword elements and signed saturation
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Return Type - as per RTYPE
+ Details : Signed halfword elements from 'in0' are added to signed
+ halfword elements of 'in1'. The result is then signed saturated
+ between halfword data type range
+*/
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
+ }
+#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
+
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3) \
+ { \
+ ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
+ }
+#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is left shifted by 'shift' and
+ the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = in0 << shift; \
+ in1 = in1 << shift; \
+ in2 = in2 << shift; \
+ in3 = in3 << shift; \
+ }
+
+/* Description : Arithmetic shift right all elements of vector
+ (generic for all data types)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - in place operation
+ Return Type - as per input vector RTYPE
+ Details : Each element of vector 'in0' is right shifted by 'shift' and
+ the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift) \
+ { \
+ in0 = in0 >> shift; \
+ in1 = in1 >> shift; \
+ in2 = in2 >> shift; \
+ in3 = in3 >> shift; \
+ }
+
+/* Description : Shift right arithmetic rounded words
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the number of bits in the corresponding element in the vector
+ 'shift'. The last discarded bit is added to shifted value for
+ rounding and the result is written in-place.
+ 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
+ in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
+ }
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRAR_W2(RTYPE, in0, in1, shift); \
+ SRAR_W2(RTYPE, in2, in3, shift); \
+ }
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+ Arguments : Inputs - in0, in1, shift
+ Outputs - in place operation
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is shifted right arithmetically by
+ the value in 'shift'. The last discarded bit is added to the
+ shifted value for rounding and the result is written in-place.
+ 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
+ in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
+ }
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_H2(RTYPE, in0, in1, shift); \
+ SRARI_H2(RTYPE, in2, in3, shift); \
+ }
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift) \
+ { \
+ in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
+ in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
+ }
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
+ { \
+ SRARI_W2(RTYPE, in0, in1, shift); \
+ SRARI_W2(RTYPE, in2, in3, shift); \
+ }
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
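A scalar sketch of the rounded arithmetic right shift behind the SRARI_* macros, assuming the compiler implements >> on signed values as an arithmetic shift (as GCC/Clang do); the helper name is hypothetical:

#include <stdint.h>

/* Scalar reference: folding the last discarded bit back in is equivalent to
   adding half of the shift step before shifting. */
static int32_t srari_ref(int32_t v, int shift) {
  if (shift == 0) return v;
  return (v + (1 << (shift - 1))) >> shift;
}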
+
+/* Description : Logical shift right all elements of vector (immediate)
+ Arguments : Inputs - in0, in1, in2, in3, shift
+ Outputs - out0, out1, out2, out3
+ Return Type - as per RTYPE
+ Details : Each element of vector 'in0' is logically right shifted by
+ 'shift' and the result is written to 'out0'. 'shift' is an
+ immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
+ { \
+ out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
+ out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
+ out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
+ out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
+ }
+#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element from 'in0' is multiplied with elements from 'in1'
+ and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+ }
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+ }
+
+/* Description : Addition of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in0' is added to 'in1' and result is written
+ to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+ }
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ ADD2(in0, in1, in2, in3, out0, out1); \
+ ADD2(in4, in5, in6, in7, out2, out3); \
+ }
+
+/* Description : Subtraction of 2 pairs of vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1
+ Details : Each element in 'in1' is subtracted from 'in0' and result is
+ written to 'out0'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) \
+ { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ }
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
+ { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ out2 = in4 - in5; \
+ out3 = in6 - in7; \
+ }
+
+/* Description : Sign extend halfword elements from right half of the vector
+ Arguments : Input - in (halfword vector)
+ Output - out (sign extended word vector)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved with the same vector 'in' to generate
+ 4 word elements keeping sign intact
+*/
+#define UNPCK_R_SH_SW(in, out) \
+ { \
+ v8i16 sign_m; \
+ \
+ sign_m = __msa_clti_s_h((v8i16)in, 0); \
+ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
+ }
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+ Arguments : Input - in (unsigned byte vector)
+ Outputs - out0, out1 (unsigned halfword vectors)
+ Return Type - signed halfword
+ Details : Zero extended right half of vector is returned in 'out0'
+ Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) \
+ { \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVRL_B2_SH(zero_m, in, out0, out1); \
+ }
+
+/* Description : Sign extend halfword elements from input vector and return
+ the result in pair of vectors
+ Arguments : Input - in (halfword vector)
+ Outputs - out0, out1 (sign extended word vectors)
+ Return Type - signed word
+ Details : Sign bit of halfword elements from input vector 'in' is
+ extracted and interleaved right with the same vector 'in' to
+ generate 4 signed word elements in 'out0',
+ then interleaved left with the same vector 'in' to
+ generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1) \
+ { \
+ v8i16 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_h((v8i16)in, 0); \
+ ILVRL_H2_SW(tmp_m, in, out0, out1); \
+ }
+
+/* Description : Butterfly of 4 input vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ out0 = in0 + in3; \
+ out1 = in1 + in2; \
+ \
+ out2 = in1 - in2; \
+ out3 = in0 - in3; \
+ }
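In scalar form, BUTTERFLY_4 is the sum/difference pairing typical of the first stage of small DCT-style transforms (illustrative sketch; the helper name is made up):

/* Scalar view of BUTTERFLY_4: outer and inner elements are paired into
   sums and differences. */
static void butterfly4_ref(const int in[4], int out[4]) {
  out[0] = in[0] + in[3];
  out[1] = in[1] + in[2];
  out[2] = in[1] - in[2];
  out[3] = in[0] - in[3];
}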
+
+/* Description : Butterfly of 8 input vectors
+ Arguments : Inputs - in0 ... in7
+ Outputs - out0 .. out7
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ out0 = in0 + in7; \
+ out1 = in1 + in6; \
+ out2 = in2 + in5; \
+ out3 = in3 + in4; \
+ \
+ out4 = in3 - in4; \
+ out5 = in2 - in5; \
+ out6 = in1 - in6; \
+ out7 = in0 - in7; \
+ }
+
+/* Description : Butterfly of 16 input vectors
+ Arguments : Inputs - in0 ... in15
+ Outputs - out0 .. out15
+ Details : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
+ in11, in12, in13, in14, in15, out0, out1, out2, out3, \
+ out4, out5, out6, out7, out8, out9, out10, out11, out12, \
+ out13, out14, out15) \
+ { \
+ out0 = in0 + in15; \
+ out1 = in1 + in14; \
+ out2 = in2 + in13; \
+ out3 = in3 + in12; \
+ out4 = in4 + in11; \
+ out5 = in5 + in10; \
+ out6 = in6 + in9; \
+ out7 = in7 + in8; \
+ \
+ out8 = in7 - in8; \
+ out9 = in6 - in9; \
+ out10 = in5 - in10; \
+ out11 = in4 - in11; \
+ out12 = in3 - in12; \
+ out13 = in2 - in13; \
+ out14 = in1 - in14; \
+ out15 = in0 - in15; \
+ }
+
+/* Description : Transpose input 8x8 byte block
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
+ tmp3_m); \
+ ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
+ ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
+ ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
+ ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
+ SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
+ SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
+ }
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ in8, in9, in10, in11, in12, in13, in14, in15
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
+ ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
+ ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
+ ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
+ \
+ tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
+ tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
+ tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
+ tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
+ out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
+ tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
+ out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
+ tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
+ \
+ ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
+ out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ \
+ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \
+ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \
+ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
+ }
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 s0_m, s1_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
+ ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
+ }
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - signed halfword
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
+ v8i16 zero_m = { 0 }; \
+ \
+ ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
+ tmp3_n); \
+ ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
+ ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
+ \
+ out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
+ out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
+ \
+ out4 = zero_m; \
+ out5 = zero_m; \
+ out6 = zero_m; \
+ out7 = zero_m; \
+ }
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
+ ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+ ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+ }
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
+ Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
+ ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
+ ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
+ ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
+ ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
+ PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
+ tmp7_m, out0, out2, out4, out6); \
+ out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+ }
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+ Arguments : Inputs - in0, in1, in2, in3
+ Outputs - out0, out1, out2, out3
+ Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
+ ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
+ \
+ out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
+ out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
+ out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
+ out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
+ }
+
+/* Description : Add block 4x4
+ Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ Details : Least significant 4 bytes from each input vector are added to
+ the destination bytes, clipped between 0-255 and stored.
+*/
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
+ { \
+ uint32_t src0_m, src1_m, src2_m, src3_m; \
+ v8i16 inp0_m, inp1_m, res0_m, res1_m; \
+ v16i8 dst0_m = { 0 }; \
+ v16i8 dst1_m = { 0 }; \
+ v16i8 zero_m = { 0 }; \
+ \
+ ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
+ LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
+ INSERT_W2_SB(src0_m, src1_m, dst0_m); \
+ INSERT_W2_SB(src2_m, src3_m, dst1_m); \
+ ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
+ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
+ CLIP_SH2_0_255(res0_m, res1_m); \
+ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
+ ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
+ }
+
+/* Description : Pack even elements of input vectors & xor with 128
+ Arguments : Inputs - in0, in1
+ Output - out_m
+ Return Type - unsigned byte
+ Details : Signed byte even elements from 'in0' and 'in1' are packed
+ together in one vector and the resulting vector is xor'ed with
+ 128 to shift the range from signed to unsigned byte
+*/
+#define PCKEV_XORI128_UB(in0, in1) \
+ ({ \
+ v16u8 out_m; \
+ \
+ out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
+ out_m; \
+ })
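A scalar reference for the XOR-with-128 step used above (illustrative only; the helper name is hypothetical):

#include <stdint.h>

/* Scalar reference: flipping the sign bit maps a signed byte in
   [-128, 127] onto an unsigned byte in [0, 255]. */
static uint8_t xori128_ref(int8_t s) {
  return (uint8_t)((uint8_t)s ^ 0x80);
}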
+
+/* Description : Convert inputs to unsigned bytes, interleave, average & store
+ them as an 8x4 unsigned byte block
+ Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+ pdst, stride
+*/
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
+ pdst, stride) \
+ { \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ tmp0_m = PCKEV_XORI128_UB(in0, in1); \
+ tmp1_m = PCKEV_XORI128_UB(in2, in3); \
+ ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
+ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
+ ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
+ }
+
+/* Description : Pack even byte elements and store byte vector in destination
+ memory
+ Arguments : Inputs - in0, in1, pdst
+*/
+#define PCKEV_ST_SB(in0, in1, pdst) \
+ { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
+ ST_SB(tmp_m, (pdst)); \
+ }
+
+/* Description : Horizontal 2 tap filter kernel code
+ Arguments : Inputs - in0, in1, mask, coeff, shift
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
+ ({ \
+ v16i8 tmp0_m; \
+ v8u16 tmp1_m; \
+ \
+ tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
+ tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
+ tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
+ \
+ tmp1_m; \
+ })
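A scalar sketch of the 2-tap horizontal filter step, under the assumption that the shuffle mask pairs each pixel with its right-hand neighbour and that the coefficients are unsigned bytes, as in the bilinear filters; the helper name is made up:

#include <stdint.h>

/* Scalar reference (shift assumed > 0): weighted sum of two adjacent
   pixels followed by a rounded right shift. */
static uint16_t horiz_2tap_ref(const uint8_t *p, uint8_t c0, uint8_t c1,
                               int shift) {
  const uint32_t acc = (uint32_t)p[0] * c0 + (uint32_t)p[1] * c1;
  return (uint16_t)((acc + (1u << (shift - 1))) >> shift);
}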
+#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c
new file mode 100644
index 000000000..58cdd80d9
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/sad_msa.c
@@ -0,0 +1,800 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/macros_msa.h"
+
+#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
+ { \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
+ out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
+ }
+#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
+
+static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad += __msa_hadd_u_h(diff, diff);
+ }
+
+ return HADD_UH_U32(sad);
+}
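For reference, the vectorised routine above is equivalent to the following plain-C SAD over a 4-pixel-wide block (illustrative only; the helper name is hypothetical and this is not meant to replace the optimized code):

#include <stdint.h>
#include <stdlib.h>

/* Plain-C equivalent of sad_4width_msa: total absolute difference over a
   4-wide block of the given height. */
static uint32_t sad_4width_ref(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; ++y) {
    for (int32_t x = 0; x < 4; ++x) sad += abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}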
+
+static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ LD_UB2(ref, ref_stride, ref0, ref1);
+ ref += (2 * ref_stride);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ LD_UB2(ref, ref_stride, ref0, ref1);
+ ref += (2 * ref_stride);
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_UB2(ref, 16, ref0, ref1);
+ ref += ref_stride;
+ sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t sad = 0;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+
+ for (ht_cnt = (height >> 1); ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad = HADD_UH_U32(sad0);
+ sad += HADD_UH_U32(sad1);
+
+ return sad;
+}
+
+static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ src_ptr += (4 * src_stride);
+
+ LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref0_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad0 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref1_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad1 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref2_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad2 += __msa_hadd_u_h(diff, diff);
+
+ LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ref3_ptr += (4 * ref_stride);
+
+ diff = __msa_asub_u_b(src, ref);
+ sad3 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt;
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref0_ptr += (4 * ref_stride);
+ LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
+ ref1_ptr += (4 * ref_stride);
+ LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
+ ref2_ptr += (4 * ref_stride);
+ LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
+ ref3_ptr += (4 * ref_stride);
+
+ PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
+ sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ int32_t ht_cnt;
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ v16u8 src, ref0, ref1, ref2, ref3, diff;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = (height >> 1); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref0 = LD_UB(ref0_ptr);
+ ref0_ptr += ref_stride;
+ ref1 = LD_UB(ref1_ptr);
+ ref1_ptr += ref_stride;
+ ref2 = LD_UB(ref2_ptr);
+ ref2_ptr += ref_stride;
+ ref3 = LD_UB(ref3_ptr);
+ ref3_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref1);
+ sad1 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref2);
+ sad2 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref3);
+ sad3 += __msa_hadd_u_h(diff, diff);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref0 = LD_UB(ref0_ptr);
+ ref0_ptr += ref_stride;
+ ref1 = LD_UB(ref1_ptr);
+ ref1_ptr += ref_stride;
+ ref2 = LD_UB(ref2_ptr);
+ ref2_ptr += ref_stride;
+ ref3 = LD_UB(ref3_ptr);
+ ref3_ptr += ref_stride;
+
+ diff = __msa_asub_u_b(src, ref0);
+ sad0 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref1);
+ sad1 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref2);
+ sad2 += __msa_hadd_u_h(diff, diff);
+ diff = __msa_asub_u_b(src, ref3);
+ sad3 += __msa_hadd_u_h(diff, diff);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v8u16 sad2 = { 0 };
+ v8u16 sad3 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB2(src, 16, src0, src1);
+ src += src_stride;
+
+ LD_UB2(ref0_ptr, 16, ref0, ref1);
+ ref0_ptr += ref_stride;
+ sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref1_ptr, 16, ref0, ref1);
+ ref1_ptr += ref_stride;
+ sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref2_ptr, 16, ref0, ref1);
+ ref2_ptr += ref_stride;
+ sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+ LD_UB2(ref3_ptr, 16, ref0, ref1);
+ ref3_ptr += ref_stride;
+ sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0);
+ sad_array[1] = HADD_UH_U32(sad1);
+ sad_array[2] = HADD_UH_U32(sad2);
+ sad_array[3] = HADD_UH_U32(sad3);
+}
+
+static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *const aref_ptr[],
+ int32_t ref_stride, int32_t height,
+ uint32_t *sad_array) {
+ const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 sad0_0 = { 0 };
+ v8u16 sad0_1 = { 0 };
+ v8u16 sad1_0 = { 0 };
+ v8u16 sad1_1 = { 0 };
+ v8u16 sad2_0 = { 0 };
+ v8u16 sad2_1 = { 0 };
+ v8u16 sad3_0 = { 0 };
+ v8u16 sad3_1 = { 0 };
+
+ ref0_ptr = aref_ptr[0];
+ ref1_ptr = aref_ptr[1];
+ ref2_ptr = aref_ptr[2];
+ ref3_ptr = aref_ptr[3];
+
+ for (ht_cnt = height; ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+
+ LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
+ ref0_ptr += ref_stride;
+ sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
+ ref1_ptr += ref_stride;
+ sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
+ ref2_ptr += ref_stride;
+ sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+ LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
+ ref3_ptr += ref_stride;
+ sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+ sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+ }
+
+ sad_array[0] = HADD_UH_U32(sad0_0);
+ sad_array[0] += HADD_UH_U32(sad0_1);
+ sad_array[1] = HADD_UH_U32(sad1_0);
+ sad_array[1] += HADD_UH_U32(sad1_1);
+ sad_array[2] = HADD_UH_U32(sad2_0);
+ sad_array[2] += HADD_UH_U32(sad2_1);
+ sad_array[3] = HADD_UH_U32(sad3_0);
+ sad_array[3] += HADD_UH_U32(sad3_1);
+}
+
+static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v16u8 diff, pred, comp;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ comp = __msa_aver_u_b(pred, ref);
+ diff = __msa_asub_u_b(src, comp);
+ sad += __msa_hadd_u_h(diff, diff);
+ }
+
+ return HADD_UH_U32(sad);
+}
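A plain-C sketch of the averaged-SAD variant above (illustrative only; the helper name is hypothetical). Note that __msa_aver_u_b is a rounded average and that sec_pred is laid out 4 bytes per row, matching the 16-byte loads in the vector code:

#include <stdint.h>
#include <stdlib.h>

/* Plain-C equivalent of avgsad_4width_msa: each reference pixel is first
   averaged (with rounding) against the second predictor, then the SAD is
   taken against that compound prediction. */
static uint32_t avgsad_4width_ref(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; ++y) {
    for (int32_t x = 0; x < 4; ++x) {
      const uint8_t comp = (uint8_t)((ref[x] + sec_pred[x] + 1) >> 1);
      sad += abs(src[x] - comp);
    }
    src += src_stride;
    ref += ref_stride;
    sec_pred += 4;
  }
  return sad;
}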
+
+static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 diff0, diff1, pred0, pred1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
+ sad += SAD_UB2_UH(src0, src1, diff0, diff1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 3); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * 16);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
+ ref += (4 * ref_stride);
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * 16);
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 comp0, comp1;
+ v8u16 sad = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
+ LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
+ ref += (4 * ref_stride);
+
+ LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
+ LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
+ sec_pred += (4 * 32);
+
+ AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
+ sad += SAD_UB2_UH(src0, src1, comp0, comp1);
+ AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
+ sad += SAD_UB2_UH(src2, src3, comp0, comp1);
+ AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
+ sad += SAD_UB2_UH(src4, src5, comp0, comp1);
+ AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
+ sad += SAD_UB2_UH(src6, src7, comp0, comp1);
+ }
+
+ return HADD_UH_U32(sad);
+}
+
+static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ int32_t height, const uint8_t *sec_pred) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 comp0, comp1, comp2, comp3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8u16 sad0 = { 0 };
+ v8u16 sad1 = { 0 };
+ v4u32 sad;
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+
+ LD_UB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref += ref_stride;
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
+ comp1, comp2, comp3);
+ sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
+ sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
+ }
+
+ sad = __msa_hadd_u_w(sad0, sad0);
+ sad += __msa_hadd_u_w(sad1, sad1);
+
+ return HADD_SW_S32(sad);
+}
+
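+/* The macros below expand into the exported aom_sadWxH_msa,
+ * aom_sadWxHx4d_msa and aom_sadWxH_avg_msa functions, each forwarding to the
+ * matching width-specialised helper above. */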
+#define AOM_SAD_4xHEIGHT_MSA(height) \
+ uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_8xHEIGHT_MSA(height) \
+ uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_16xHEIGHT_MSA(height) \
+ uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_32xHEIGHT_MSA(height) \
+ uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_64xHEIGHT_MSA(height) \
+ uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride) { \
+ return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
+ }
+
+#define AOM_SAD_4xHEIGHTx4D_MSA(height) \
+ void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_8xHEIGHTx4D_MSA(height) \
+ void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_16xHEIGHTx4D_MSA(height) \
+ void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_32xHEIGHTx4D_MSA(height) \
+ void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_SAD_64xHEIGHTx4D_MSA(height) \
+ void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *const refs[], \
+ int32_t ref_stride, uint32_t *sads) { \
+ sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
+ }
+
+#define AOM_AVGSAD_4xHEIGHT_MSA(height) \
+ uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ const uint8_t *second_pred) { \
+ return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define AOM_AVGSAD_8xHEIGHT_MSA(height) \
+ uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
+ const uint8_t *ref, int32_t ref_stride, \
+ const uint8_t *second_pred) { \
+ return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define AOM_AVGSAD_16xHEIGHT_MSA(height) \
+ uint32_t aom_sad16x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define AOM_AVGSAD_32xHEIGHT_MSA(height) \
+ uint32_t aom_sad32x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+#define AOM_AVGSAD_64xHEIGHT_MSA(height) \
+ uint32_t aom_sad64x##height##_avg_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, const uint8_t *second_pred) { \
+ return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
+ second_pred); \
+ }
+
+/* clang-format off */
+// 64x64
+AOM_SAD_64xHEIGHT_MSA(64)
+AOM_SAD_64xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_64xHEIGHT_MSA(64)
+
+// 64x32
+AOM_SAD_64xHEIGHT_MSA(32)
+AOM_SAD_64xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_64xHEIGHT_MSA(32)
+
+// 32x64
+AOM_SAD_32xHEIGHT_MSA(64)
+AOM_SAD_32xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_32xHEIGHT_MSA(64)
+
+// 32x32
+AOM_SAD_32xHEIGHT_MSA(32)
+AOM_SAD_32xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_32xHEIGHT_MSA(32)
+
+// 32x16
+AOM_SAD_32xHEIGHT_MSA(16)
+AOM_SAD_32xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_32xHEIGHT_MSA(16)
+
+// 16x32
+AOM_SAD_16xHEIGHT_MSA(32)
+AOM_SAD_16xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_16xHEIGHT_MSA(32)
+
+// 16x16
+AOM_SAD_16xHEIGHT_MSA(16)
+AOM_SAD_16xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_16xHEIGHT_MSA(16)
+
+// 16x8
+AOM_SAD_16xHEIGHT_MSA(8)
+AOM_SAD_16xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_16xHEIGHT_MSA(8)
+
+// 8x16
+AOM_SAD_8xHEIGHT_MSA(16)
+AOM_SAD_8xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_8xHEIGHT_MSA(16)
+
+// 8x8
+AOM_SAD_8xHEIGHT_MSA(8)
+AOM_SAD_8xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_8xHEIGHT_MSA(8)
+
+// 8x4
+AOM_SAD_8xHEIGHT_MSA(4)
+AOM_SAD_8xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_8xHEIGHT_MSA(4)
+
+// 4x8
+AOM_SAD_4xHEIGHT_MSA(8)
+AOM_SAD_4xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_4xHEIGHT_MSA(8)
+
+// 4x4
+AOM_SAD_4xHEIGHT_MSA(4)
+AOM_SAD_4xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_4xHEIGHT_MSA(4)
+/* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
new file mode 100644
index 000000000..810b6efaa
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
@@ -0,0 +1,1792 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/mips/macros_msa.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/variance.h"
+
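+/* Accumulates the sum of squared src/ref differences into 'var' and the sum
+ * of the signed differences into 'sub'. */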
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+ }
+
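+/* variance = sse - sum_of_differences^2 / (width * height); the division is
+ * expressed as a right shift by log2(width * height), and the large-block
+ * form widens the square to 64 bits. */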
+#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ sse - (((int64_t)diff * diff) >> shift)
+
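+/* Returns the SSE between the reference block and the source block averaged
+ * with 'sec_pred'; the sum of differences is written to *diff. */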
+static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t height,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 pred, src = { 0 };
+ v16u8 ref = { 0 };
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t height,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src, ref, pred;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ src = __msa_aver_u_b(src, pred);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1, pred0, pred1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1, pred0, pred1;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride,
+ const uint8_t *sec_pred, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v8i16 avg2 = { 0 };
+ v8i16 avg3 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 32; ht_cnt--;) {
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+ LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
+ sec_pred += 64;
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ vec += __msa_hadd_s_w(avg2, avg2);
+ vec += __msa_hadd_s_w(avg3, avg3);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
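+/* Sub-pixel SSE/diff helpers: both taps of the 2-tap (bilinear) filter are
+ * loaded as one halfword and splatted into 'filt0', applied with DOTP and
+ * rounded by FILTER_BITS before comparing against 'dst'.  The _h variants
+ * below filter horizontally only. */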
+static uint32_t sub_pixel_sse_diff_4width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 filt0, ref = { 0 };
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+ src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+ CALC_MSE_AVG_B(src0, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 filt0, out, ref0, ref1, ref2, ref3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v16u8 dst0, dst1, dst2, dst3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ dst += (4 * dst_stride);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
+ src2, src3);
+ CALC_MSE_AVG_B(src0, dst0, var, avg);
+ CALC_MSE_AVG_B(src1, dst1, var, avg);
+ CALC_MSE_AVG_B(src2, dst2, var, avg);
+ CALC_MSE_AVG_B(src3, dst3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
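+/* Vertical-only 2-tap variants: adjacent rows are interleaved and filtered
+ * with 'filt0' before the SSE/diff accumulation. */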
+static uint32_t sub_pixel_sse_diff_4width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4, out;
+ v16u8 src10_r, src32_r, src21_r, src43_r;
+ v16u8 ref = { 0 };
+ v16u8 src2110, src4332;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+ v8u16 tmp0, tmp1;
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1, out2, out3;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ src0 = src4;
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+ filter, height, &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
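+/* Combined variants: rows are first filtered horizontally with 'filt_hz' and
+ * the intermediate results are then filtered vertically with 'filt_vt'. */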
+static uint32_t sub_pixel_sse_diff_4width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out, ref = { 0 };
+ v16u8 filt_vt, filt_hz, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_8width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 filt_vt, filt_hz, vec0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
+ v8u16 tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ LD_UB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ CALC_MSE_AVG_B(src2, ref2, var, avg);
+ CALC_MSE_AVG_B(src3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height,
+ &diff0[loop_cnt]);
+ src += 16;
+ dst += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
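+/* "_avg" sub-pixel helpers: identical to the ones above except that the
+ * filtered block is averaged with 'sec_pred' before the comparison. */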
+static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 out, pred, filt0, ref = { 0 };
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+ out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 out, pred, filt0;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16i8 src0, src1, src2, src3;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 vec0, vec1, vec2, vec3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
+ vec2, vec3);
+ SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
+ PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
+ src2, src3);
+ out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref0, var, avg);
+ out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
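+/* In the 16-wide kernels shared by the 32- and 64-wide wrappers, 'width' is
+ * the row stride of 'sec_pred'; the wrappers advance src, dst and sec_pred
+ * by 16 columns per pass. */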
+static uint32_t subpel_avg_ssediff_16w_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v16u8 dst0, dst1, dst2, dst3;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16u8 pred0, pred1, pred2, pred3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_SB4(src, src_stride, src0, src2, src4, src6);
+ LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+ dst += (4 * dst_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
+ VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+ VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+ VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
+ out2, out3);
+ DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
+ out6, out7);
+ SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
+ SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+ PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
+ tmp2, tmp3);
+ AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
+ tmp2, tmp3);
+
+ CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+ CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+ CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+ CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 src10_r, src32_r, src21_r, src43_r;
+ v16u8 out, pred, ref = { 0 };
+ v16u8 src2110, src4332, filt0;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+ v8u16 tmp0, tmp1;
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
+ src32_r, src43_r);
+ ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+ DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, filt0;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
+ vec3);
+ DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
+ tmp2, tmp3);
+ SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+ AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 out0, out1, out2, out3, filt0;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter);
+ filt0 = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
+ ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
+ DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
+ ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
+ DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+ src0 = src4;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
+ out2, out3);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter, height, diff, 16);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+ int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse +=
+ subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
+ filter, height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 out, pred, ref = { 0 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ pred = LD_UB(sec_pred);
+ sec_pred += 16;
+ LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
+ hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
+ hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ out = __msa_aver_u_b(out, pred);
+ CALC_MSE_AVG_B(out, ref, var, avg);
+ src0 = src4;
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 pred0, pred1, out0, out1;
+ v16u8 filt_hz, filt_vt, vec0;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ src0 = LD_UB(src);
+ src += src_stride;
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+ LD_UB2(sec_pred, 16, pred0, pred1);
+ sec_pred += 32;
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vt);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vt);
+ hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vt);
+
+ SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+ PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
+ AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
+ int16_t filtval;
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3;
+ v16u8 pred0, pred1, pred2, pred3;
+ v16u8 out0, out1, out2, out3;
+ v16u8 filt_hz, filt_vt, vec0, vec1;
+ v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+ v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ filtval = LH(filter_horiz);
+ filt_hz = (v16u8)__msa_fill_h(filtval);
+ filtval = LH(filter_vert);
+ filt_vt = (v16u8)__msa_fill_h(filtval);
+
+ LD_UB2(src, 8, src0, src1);
+ src += src_stride;
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LD_UB4(src, src_stride, src0, src2, src4, src6);
+ LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+ LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
+ sec_pred += (4 * width);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
+ hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
+ hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
+ ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+ DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
+ SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+ out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+
+ LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+ dst += (4 * dst_stride);
+
+ AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
+ out2, out3);
+
+ CALC_MSE_AVG_B(out0, ref0, var, avg);
+ CALC_MSE_AVG_B(out1, ref1, var, avg);
+ CALC_MSE_AVG_B(out2, ref2, var, avg);
+ CALC_MSE_AVG_B(out3, ref3, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, diff, 16);
+}
+
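+// 32- and 64-wide blocks reuse the 16-wide kernel column by column; the
+// partial sums of differences are added up afterwards, while the SSE partials
+// are simply accumulated.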
+static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[2];
+
+ for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+ sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, &diff0[loop_cnt], 32);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1];
+
+ return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
+ const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+ int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+ const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+ uint32_t loop_cnt, sse = 0;
+ int32_t diff0[4];
+
+ for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+ sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
+ sec_pred, filter_horiz, filter_vert,
+ height, &diff0[loop_cnt], 64);
+ src += 16;
+ dst += 16;
+ sec_pred += 16;
+ }
+
+ *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+ return sse;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
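+// Generates aom_sub_pixel_variance<wd>x<ht>_msa(): picks the horizontal-only,
+// vertical-only or 2-D bilinear path based on the subpel offsets, and falls
+// back to plain variance when both offsets are zero.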
+#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \
+ const uint8_t *src, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \
+ uint32_t *sse) { \
+ int32_t diff; \
+ uint32_t var; \
+ const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
+ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
+ src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
+ } \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
+ src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
+ \
+ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ } else { \
+ var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
+ sse); \
+ } \
+ } \
+ \
+ return var; \
+ }
+
+/* clang-format off */
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)
+
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)
+
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)
+
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)
+
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
+/* clang-format on */
+
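+// Same dispatch as above, but the filtered prediction is first averaged with
+// the compound prediction in sec_pred.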
+#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, ht, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
+
+/* clang-format off */
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)
+
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)
+
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)
+
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
+/* clang-format on */
+
+uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
+ int32_t src_stride,
+ int32_t xoffset, int32_t yoffset,
+ const uint8_t *ref_ptr,
+ int32_t ref_stride, uint32_t *sse,
+ const uint8_t *sec_pred) {
+ int32_t diff;
+ const uint8_t *h_filter = bilinear_filters_2t[xoffset];
+ const uint8_t *v_filter = bilinear_filters_2t[yoffset];
+
+ if (yoffset) {
+ if (xoffset) {
+ *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
+ v_filter, 64, &diff);
+ } else {
+ *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
+ ref_stride, sec_pred,
+ v_filter, 64, &diff);
+ }
+ } else {
+ if (xoffset) {
+ *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
+ ref_stride, sec_pred,
+ h_filter, 64, &diff);
+ } else {
+ *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
+ sec_pred, &diff);
+ }
+ }
+
+ return VARIANCE_32Wx64H(*sse, diff);
+}
+
+#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
+ uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \
+ const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
+ int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
+ uint32_t *sse, const uint8_t *sec_pred) { \
+ int32_t diff; \
+ const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
+ const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
+ \
+ if (yoffset) { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
+ v_filter, ht, &diff); \
+ } else { \
+ *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+ &diff); \
+ } \
+ } else { \
+ if (xoffset) { \
+ *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
+ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+ &diff); \
+ } else { \
+ *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
+ ref_stride, sec_pred, &diff); \
+ } \
+ } \
+ \
+ return VARIANCE_64Wx##ht##H(*sse, diff); \
+ }
+
+/* clang-format off */
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
+/* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c
new file mode 100644
index 000000000..bfed773ac
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/subtract_msa.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/macros_msa.h"
+
+static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t pred0, pred1, pred2, pred3;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
+ INSERT_W4_SB(src0, src1, src2, src3, src);
+ INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
+}
+
+static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *pred_ptr, int32_t pred_stride,
+ int16_t *diff_ptr, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ uint64_t src0, src1, pred0, pred1;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ LD2(src_ptr, src_stride, src0, src1);
+ src_ptr += (2 * src_stride);
+ LD2(pred_ptr, pred_stride, pred0, pred1);
+ pred_ptr += (2 * pred_stride);
+
+ INSERT_D2_SB(src0, src1, src);
+ INSERT_D2_SB(pred0, pred1, pred);
+ ILVRL_B2_UB(src, pred, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff_ptr, diff_stride);
+ diff_ptr += (2 * diff_stride);
+ }
+}
+
+static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ int8_t count;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (count = 2; count--;) {
+ LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
+ pred7);
+ pred += (8 * pred_stride);
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 8; loop_cnt--;) {
+ LD_SB2(src, 16, src0, src1);
+ src += src_stride;
+ LD_SB2(src, 16, src2, src3);
+ src += src_stride;
+ LD_SB2(src, 16, src4, src5);
+ src += src_stride;
+ LD_SB2(src, 16, src6, src7);
+ src += src_stride;
+
+ LD_SB2(pred, 16, pred0, pred1);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred2, pred3);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred4, pred5);
+ pred += pred_stride;
+ LD_SB2(pred, 16, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ diff += diff_stride;
+ }
+}
+
+static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *pred, int32_t pred_stride,
+ int16_t *diff, int32_t diff_stride) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ for (loop_cnt = 32; loop_cnt--;) {
+ LD_SB4(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LD_SB4(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+
+ LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
+ pred += pred_stride;
+ LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
+ pred += pred_stride;
+
+ ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+
+ ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff, 8);
+ ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 16, 8);
+ ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 32, 8);
+ ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+ HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+ ST_SH2(diff0, diff1, diff + 48, 8);
+ diff += diff_stride;
+ }
+}
+
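+// diff = src - pred. Square block sizes from 4x4 to 64x64 go through the MSA
+// kernels above; every other shape falls back to the C reference.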
+void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ if (rows == cols) {
+ switch (rows) {
+ case 4:
+ sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 8:
+ sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 16:
+ sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 32:
+ sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ case 64:
+ sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+ diff_stride);
+ break;
+ default:
+ aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+ } else {
+ aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ }
+}
diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c
new file mode 100644
index 000000000..065c09ac5
--- /dev/null
+++ b/third_party/aom/aom_dsp/mips/variance_msa.c
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/macros_msa.h"
+
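+// Interleaves src/ref bytes, takes the per-pixel signed differences and
+// accumulates their squares into 'var' via a widening dot product.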
+#define CALC_MSE_B(src, ref, var) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ }
+
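+// Same as CALC_MSE_B, but additionally keeps a running sum of the signed
+// differences in 'sub' for the mean term of the variance.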
+#define CALC_MSE_AVG_B(src, ref, var, sub) \
+ { \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
+ HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
+ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
+ \
+ sub += res_l0_m + res_l1_m; \
+ }
+
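+// variance = SSE - sum^2 / N, with N = 1 << shift = width * height. The LARGE
+// variant computes the squared sum in 64 bits to avoid overflow.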
+#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+ sse - (((int64_t)diff * diff) >> shift)
+
+static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ int32_t ht_cnt;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src, ref;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src, ref, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height, int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8i16 avg = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg);
+ CALC_MSE_AVG_B(src1, ref1, var, avg);
+ }
+
+ vec = __msa_hadd_s_w(avg, avg);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 16; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src2, ref2, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src3, ref3, var, avg1);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t *diff) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8i16 avg0 = { 0 };
+ v8i16 avg1 = { 0 };
+ v8i16 avg2 = { 0 };
+ v8i16 avg3 = { 0 };
+ v4i32 vec, var = { 0 };
+
+ for (ht_cnt = 32; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_AVG_B(src0, ref0, var, avg0);
+ CALC_MSE_AVG_B(src1, ref1, var, avg1);
+ CALC_MSE_AVG_B(src2, ref2, var, avg2);
+ CALC_MSE_AVG_B(src3, ref3, var, avg3);
+ }
+
+ vec = __msa_hadd_s_w(avg0, avg0);
+ vec += __msa_hadd_s_w(avg1, avg1);
+ vec += __msa_hadd_s_w(avg2, avg2);
+ vec += __msa_hadd_s_w(avg3, avg3);
+ *diff = HADD_SW_S32(vec);
+
+ return HADD_SW_S32(var);
+}
+
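+// Sum of squares of a 16x16 block of int16 residuals, accumulated in 64-bit
+// lanes to avoid overflow.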
+static uint32_t get_mb_ss_msa(const int16_t *src) {
+ uint32_t sum, cnt;
+ v8i16 src0, src1, src2, src3;
+ v4i32 src0_l, src1_l, src2_l, src3_l;
+ v4i32 src0_r, src1_r, src2_r, src3_r;
+ v2i64 sq_src_l = { 0 };
+ v2i64 sq_src_r = { 0 };
+
+ for (cnt = 8; cnt--;) {
+ LD_SH4(src, 8, src0, src1, src2, src3);
+ src += 4 * 8;
+
+ UNPCK_SH_SW(src0, src0_l, src0_r);
+ UNPCK_SH_SW(src1, src1_l, src1_r);
+ UNPCK_SH_SW(src2, src2_l, src2_r);
+ UNPCK_SH_SW(src3, src3_l, src3_r);
+
+ DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
+ DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
+ }
+
+ sq_src_l += __msa_splati_d(sq_src_l, 1);
+ sq_src_r += __msa_splati_d(sq_src_r, 1);
+
+ sum = __msa_copy_s_d(sq_src_l, 0);
+ sum += __msa_copy_s_d(sq_src_r, 0);
+
+ return sum;
+}
+
+static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16u8 src = { 0 };
+ v16u8 ref = { 0 };
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ CALC_MSE_B(src, ref, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
+ src_ptr += (4 * src_stride);
+ LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ ref_ptr += (4 * ref_stride);
+
+ PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
+ ref0, ref1);
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src, ref;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+
+ src = LD_UB(src_ptr);
+ src_ptr += src_stride;
+ ref = LD_UB(ref_ptr);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src, ref, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, ref0, ref1;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = (height >> 2); ht_cnt--;) {
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+
+ LD_UB2(src_ptr, 16, src0, src1);
+ src_ptr += src_stride;
+ LD_UB2(ref_ptr, 16, ref0, ref1);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src1, ref1, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride,
+ int32_t height) {
+ int32_t ht_cnt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 ref0, ref1, ref2, ref3;
+ v4i32 var = { 0 };
+
+ for (ht_cnt = height >> 1; ht_cnt--;) {
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src2, ref2, var);
+ CALC_MSE_B(src1, ref1, var);
+ CALC_MSE_B(src3, ref3, var);
+
+ LD_UB4(src_ptr, 16, src0, src1, src2, src3);
+ src_ptr += src_stride;
+ LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
+ ref_ptr += ref_stride;
+ CALC_MSE_B(src0, ref0, var);
+ CALC_MSE_B(src2, ref2, var);
+ CALC_MSE_B(src1, ref1, var);
+ CALC_MSE_B(src3, ref3, var);
+ }
+
+ return HADD_SW_S32(var);
+}
+
+uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
+ const uint8_t *ref_ptr, int32_t ref_stride) {
+ uint32_t err = 0;
+ uint32_t src0, src1, src2, src3;
+ uint32_t ref0, ref1, ref2, ref3;
+ v16i8 src = { 0 };
+ v16i8 ref = { 0 };
+ v16u8 src_vec0, src_vec1;
+ v8i16 diff0, diff1;
+ v4i32 err0 = { 0 };
+ v4i32 err1 = { 0 };
+
+ LW4(src_ptr, src_stride, src0, src1, src2, src3);
+ LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+ INSERT_W4_SB(src0, src1, src2, src3, src);
+ INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
+ ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
+ HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
+ DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
+ err = HADD_SW_S32(err0);
+ err += HADD_SW_S32(err1);
+
+ return err;
+}
+
+#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
+#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
+#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
+
+#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
+#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
+#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \
+ uint32_t aom_variance##wd##x##ht##_msa( \
+ const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
+ int32_t ref_stride, uint32_t *sse) { \
+ int32_t diff; \
+ \
+ *sse = \
+ sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
+ \
+ return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
+ }
+
+/* clang-format off */
+AOM_VARIANCE_WDXHT_MSA(4, 4)
+AOM_VARIANCE_WDXHT_MSA(4, 8)
+
+AOM_VARIANCE_WDXHT_MSA(8, 4)
+AOM_VARIANCE_WDXHT_MSA(8, 8)
+AOM_VARIANCE_WDXHT_MSA(8, 16)
+
+AOM_VARIANCE_WDXHT_MSA(16, 8)
+AOM_VARIANCE_WDXHT_MSA(16, 16)
+AOM_VARIANCE_WDXHT_MSA(16, 32)
+
+AOM_VARIANCE_WDXHT_MSA(32, 16)
+AOM_VARIANCE_WDXHT_MSA(32, 32)
+/* clang-format on */
+
+uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_32Wx64H(*sse, diff);
+}
+
+uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_64Wx32H(*sse, diff);
+}
+
+uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ int32_t diff;
+
+ *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);
+
+ return VARIANCE_64Wx64H(*sse, diff);
+}
+
+uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
+ *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
+
+ return *sse;
+}
+
+uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
+
+ return *sse;
+}
+
+uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
+
+ return *sse;
+}
+
+uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride,
+ uint32_t *sse) {
+ *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
+
+ return *sse;
+}
+
+void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
+}
+
+void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
+ const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+ int32_t *sum) {
+ *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
+}
+
+uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
new file mode 100644
index 000000000..2faee8506
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.c
@@ -0,0 +1,1648 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "av1/encoder/mathutils.h"
+
+#define kLowPolyNumParams 3
+
+static const int kMaxLag = 4;
+
+// Defines a function that can be used to obtain the mean of a block for the
+// provided data type (uint8_t or uint16_t).
+#define GET_BLOCK_MEAN(INT_TYPE, suffix) \
+ static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
+ int stride, int x_o, int y_o, \
+ int block_size) { \
+ const int max_h = AOMMIN(h - y_o, block_size); \
+ const int max_w = AOMMIN(w - x_o, block_size); \
+ double block_mean = 0; \
+ for (int y = 0; y < max_h; ++y) { \
+ for (int x = 0; x < max_w; ++x) { \
+ block_mean += data[(y_o + y) * stride + x_o + x]; \
+ } \
+ } \
+ return block_mean / (max_w * max_h); \
+ }
+
+GET_BLOCK_MEAN(uint8_t, lowbd);
+GET_BLOCK_MEAN(uint16_t, highbd);
+
+static INLINE double get_block_mean(const uint8_t *data, int w, int h,
+ int stride, int x_o, int y_o,
+ int block_size, int use_highbd) {
+ if (use_highbd)
+ return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o,
+ block_size);
+ return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
+}
+
+// Defines a function that can be used to obtain the variance of a block
+// for the provided data type (uint8_t or uint16_t).
+#define GET_NOISE_VAR(INT_TYPE, suffix) \
+ static double get_noise_var_##suffix( \
+ const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
+ int h, int x_o, int y_o, int block_size_x, int block_size_y) { \
+ const int max_h = AOMMIN(h - y_o, block_size_y); \
+ const int max_w = AOMMIN(w - x_o, block_size_x); \
+ double noise_var = 0; \
+ double noise_mean = 0; \
+ for (int y = 0; y < max_h; ++y) { \
+ for (int x = 0; x < max_w; ++x) { \
+ double noise = (double)data[(y_o + y) * stride + x_o + x] - \
+ denoised[(y_o + y) * stride + x_o + x]; \
+ noise_mean += noise; \
+ noise_var += noise * noise; \
+ } \
+ } \
+ noise_mean /= (max_w * max_h); \
+ return noise_var / (max_w * max_h) - noise_mean * noise_mean; \
+ }
+
+GET_NOISE_VAR(uint8_t, lowbd);
+GET_NOISE_VAR(uint16_t, highbd);
+
+static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
+ int w, int h, int stride, int x_o, int y_o,
+ int block_size_x, int block_size_y,
+ int use_highbd) {
+ if (use_highbd)
+ return get_noise_var_highbd((const uint16_t *)data,
+ (const uint16_t *)denoised, w, h, stride, x_o,
+ y_o, block_size_x, block_size_y);
+ return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
+ block_size_x, block_size_y);
+}
+
+static void equation_system_clear(aom_equation_system_t *eqns) {
+ const int n = eqns->n;
+ memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
+ memset(eqns->x, 0, sizeof(*eqns->x) * n);
+ memset(eqns->b, 0, sizeof(*eqns->b) * n);
+}
+
+static void equation_system_copy(aom_equation_system_t *dst,
+ const aom_equation_system_t *src) {
+ const int n = dst->n;
+ memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
+ memcpy(dst->x, src->x, sizeof(*dst->x) * n);
+ memcpy(dst->b, src->b, sizeof(*dst->b) * n);
+}
+
+static int equation_system_init(aom_equation_system_t *eqns, int n) {
+ eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
+ eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
+ eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
+ eqns->n = n;
+ if (!eqns->A || !eqns->b || !eqns->x) {
+ fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
+ aom_free(eqns->A);
+ aom_free(eqns->b);
+ aom_free(eqns->x);
+ memset(eqns, 0, sizeof(*eqns));
+ return 0;
+ }
+ equation_system_clear(eqns);
+ return 1;
+}
+
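+// Solves A * x = b with linsolve() on scratch copies so the accumulated A and
+// b in 'eqns' are left intact for further updates.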
+static int equation_system_solve(aom_equation_system_t *eqns) {
+ const int n = eqns->n;
+ double *b = (double *)aom_malloc(sizeof(*b) * n);
+ double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+ int ret = 0;
+ if (A == NULL || b == NULL) {
+ fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n);
+ aom_free(b);
+ aom_free(A);
+ return 0;
+ }
+ memcpy(A, eqns->A, sizeof(*eqns->A) * n * n);
+ memcpy(b, eqns->b, sizeof(*eqns->b) * n);
+ ret = linsolve(n, A, eqns->n, b, eqns->x);
+ aom_free(b);
+ aom_free(A);
+
+ if (ret == 0) {
+ return 0;
+ }
+ return 1;
+}
+
+static void equation_system_add(aom_equation_system_t *dest,
+ aom_equation_system_t *src) {
+ const int n = dest->n;
+ int i, j;
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n; ++j) {
+ dest->A[i * n + j] += src->A[i * n + j];
+ }
+ dest->b[i] += src->b[i];
+ }
+}
+
+static void equation_system_free(aom_equation_system_t *eqns) {
+ if (!eqns) return;
+ aom_free(eqns->A);
+ aom_free(eqns->b);
+ aom_free(eqns->x);
+ memset(eqns, 0, sizeof(*eqns));
+}
+
+static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
+ equation_system_clear(&solver->eqns);
+ solver->num_equations = 0;
+ solver->total = 0;
+}
+
+static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
+ aom_noise_strength_solver_t *src) {
+ equation_system_add(&dest->eqns, &src->eqns);
+ dest->num_equations += src->num_equations;
+ dest->total += src->total;
+}
+
+// Return the number of coefficients required for the given parameters
+static int num_coeffs(const aom_noise_model_params_t params) {
+ const int n = 2 * params.lag + 1;
+ switch (params.shape) {
+ case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1);
+ case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2;
+ }
+ return 0;
+}
+
+static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
+ const int kNumBins = 20;
+ if (!equation_system_init(&state->eqns, n)) {
+    fprintf(stderr, "Failed to initialize noise state with size %d\n", n);
+ return 0;
+ }
+ state->ar_gain = 1.0;
+ state->num_observations = 0;
+ return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
+ bit_depth);
+}
+
+static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
+ const double kTolerance = 1e-6;
+ const int last = eqns->n - 1;
+ // Set all of the AR coefficients to zero, but try to solve for correlation
+ // with the luma channel
+ memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n);
+ if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) {
+ eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last];
+ }
+}
+
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
+ if (!lut) return 0;
+ lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
+ if (!lut->points) return 0;
+ lut->num_points = num_points;
+ memset(lut->points, 0, sizeof(*lut->points) * num_points);
+ return 1;
+}
+
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) {
+ if (!lut) return;
+ aom_free(lut->points);
+ memset(lut, 0, sizeof(*lut));
+}
+
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+ double x) {
+ int i = 0;
+ // Constant extrapolation for x < x_0.
+ if (x < lut->points[0][0]) return lut->points[0][1];
+ for (i = 0; i < lut->num_points - 1; ++i) {
+ if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) {
+ const double a =
+ (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]);
+ return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a);
+ }
+ }
+ // Constant extrapolation for x > x_{n-1}
+ return lut->points[lut->num_points - 1][1];
+}
+
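+// Maps an intensity value to a fractional bin index in [0, num_bins - 1];
+// callers interpolate between the two nearest bins.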
+static double noise_strength_solver_get_bin_index(
+ const aom_noise_strength_solver_t *solver, double value) {
+ const double val =
+ fclamp(value, solver->min_intensity, solver->max_intensity);
+ const double range = solver->max_intensity - solver->min_intensity;
+ return (solver->num_bins - 1) * (val - solver->min_intensity) / range;
+}
+
+static double noise_strength_solver_get_value(
+ const aom_noise_strength_solver_t *solver, double x) {
+ const double bin = noise_strength_solver_get_bin_index(solver, x);
+ const int bin_i0 = (int)floor(bin);
+ const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
+ const double a = bin - bin_i0;
+ return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1];
+}
+
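+// Adds one (block mean, noise std) observation to the least-squares system,
+// split between the two neighbouring intensity bins with weights (1 - a)
+// and a.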
+void aom_noise_strength_solver_add_measurement(
+ aom_noise_strength_solver_t *solver, double block_mean, double noise_std) {
+ const double bin = noise_strength_solver_get_bin_index(solver, block_mean);
+ const int bin_i0 = (int)floor(bin);
+ const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
+ const double a = bin - bin_i0;
+ const int n = solver->num_bins;
+ solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a);
+ solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a);
+ solver->eqns.A[bin_i1 * n + bin_i1] += a * a;
+ solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a);
+ solver->eqns.b[bin_i0] += (1.0 - a) * noise_std;
+ solver->eqns.b[bin_i1] += a * noise_std;
+ solver->total += noise_std;
+ solver->num_equations++;
+}
+
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) {
+ // Add regularization proportional to the number of constraints
+ const int n = solver->num_bins;
+ const double kAlpha = 2.0 * (double)(solver->num_equations) / n;
+ int result = 0;
+ double mean = 0;
+
+ // Do this in a non-destructive manner so it is not confusing to the caller
+ double *old_A = solver->eqns.A;
+ double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+ if (!A) {
+ fprintf(stderr, "Unable to allocate copy of A\n");
+ return 0;
+ }
+ memcpy(A, old_A, sizeof(*A) * n * n);
+
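+  // Second-difference (smoothness) penalty: each bin is coupled to its
+  // neighbours so the fitted strength curve varies gradually.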
+ for (int i = 0; i < n; ++i) {
+ const int i_lo = AOMMAX(0, i - 1);
+ const int i_hi = AOMMIN(n - 1, i + 1);
+ A[i * n + i_lo] -= kAlpha;
+ A[i * n + i] += 2 * kAlpha;
+ A[i * n + i_hi] -= kAlpha;
+ }
+
+ // Small regularization to give average noise strength
+ mean = solver->total / solver->num_equations;
+ for (int i = 0; i < n; ++i) {
+ A[i * n + i] += 1.0 / 8192.;
+ solver->eqns.b[i] += mean / 8192.;
+ }
+ solver->eqns.A = A;
+ result = equation_system_solve(&solver->eqns);
+ solver->eqns.A = old_A;
+
+ aom_free(A);
+ return result;
+}
+
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+ int num_bins, int bit_depth) {
+ if (!solver) return 0;
+ memset(solver, 0, sizeof(*solver));
+ solver->num_bins = num_bins;
+ solver->min_intensity = 0;
+ solver->max_intensity = (1 << bit_depth) - 1;
+ solver->total = 0;
+ solver->num_equations = 0;
+ return equation_system_init(&solver->eqns, num_bins);
+}
+
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
+ if (!solver) return;
+ equation_system_free(&solver->eqns);
+}
+
+double aom_noise_strength_solver_get_center(
+ const aom_noise_strength_solver_t *solver, int i) {
+ const double range = solver->max_intensity - solver->min_intensity;
+ const int n = solver->num_bins;
+ return ((double)i) / (n - 1) * range + solver->min_intensity;
+}
+
+// Computes the residual if a point were to be removed from the lut. This is
+// calculated as the area between the output of the solver and the line segment
+// that would be formed between [x_{i - 1}, x_{i + 1}).
+static void update_piecewise_linear_residual(
+ const aom_noise_strength_solver_t *solver,
+ const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
+ const double dx = 255. / solver->num_bins;
+ for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) {
+ const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index(
+ solver, lut->points[i - 1][0])));
+ const int upper = AOMMIN(solver->num_bins - 1,
+ (int)ceil(noise_strength_solver_get_bin_index(
+ solver, lut->points[i + 1][0])));
+ double r = 0;
+ for (int j = lower; j <= upper; ++j) {
+ const double x = aom_noise_strength_solver_get_center(solver, j);
+ if (x < lut->points[i - 1][0]) continue;
+ if (x >= lut->points[i + 1][0]) continue;
+ const double y = solver->eqns.x[j];
+ const double a = (x - lut->points[i - 1][0]) /
+ (lut->points[i + 1][0] - lut->points[i - 1][0]);
+ const double estimate_y =
+ lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a;
+ r += fabs(y - estimate_y);
+ }
+ residual[i] = r * dx;
+ }
+}
+
+int aom_noise_strength_solver_fit_piecewise(
+ const aom_noise_strength_solver_t *solver, int max_output_points,
+ aom_noise_strength_lut_t *lut) {
+  // The tolerance is normalized to give consistent results between
+ // different bit-depths.
+ const double kTolerance = solver->max_intensity * 0.00625 / 255.0;
+ if (!aom_noise_strength_lut_init(lut, solver->num_bins)) {
+ fprintf(stderr, "Failed to init lut\n");
+ return 0;
+ }
+ for (int i = 0; i < solver->num_bins; ++i) {
+ lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i);
+ lut->points[i][1] = solver->eqns.x[i];
+ }
+ if (max_output_points < 0) {
+ max_output_points = solver->num_bins;
+ }
+
+ double *residual = aom_malloc(solver->num_bins * sizeof(*residual));
+ memset(residual, 0, sizeof(*residual) * solver->num_bins);
+
+ update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins);
+
+ // Greedily remove points if there are too many or if it doesn't hurt local
+ // approximation (never remove the end points)
+ while (lut->num_points > 2) {
+ int min_index = 1;
+ for (int j = 1; j < lut->num_points - 1; ++j) {
+ if (residual[j] < residual[min_index]) {
+ min_index = j;
+ }
+ }
+ const double dx =
+ lut->points[min_index + 1][0] - lut->points[min_index - 1][0];
+ const double avg_residual = residual[min_index] / dx;
+ if (lut->num_points <= max_output_points && avg_residual > kTolerance) {
+ break;
+ }
+
+ const int num_remaining = lut->num_points - min_index - 1;
+ memmove(lut->points + min_index, lut->points + min_index + 1,
+ sizeof(lut->points[0]) * num_remaining);
+ lut->num_points--;
+
+ update_piecewise_linear_residual(solver, lut, residual, min_index - 1,
+ min_index + 1);
+ }
+ aom_free(residual);
+ return 1;
+}
+
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+ int block_size, int bit_depth, int use_highbd) {
+ const int n = block_size * block_size;
+ aom_equation_system_t eqns;
+ double *AtA_inv = 0;
+ double *A = 0;
+ int x = 0, y = 0, i = 0, j = 0;
+ if (!equation_system_init(&eqns, kLowPolyNumParams)) {
+ fprintf(stderr, "Failed to init equation system for block_size=%d\n",
+ block_size);
+ return 0;
+ }
+
+ AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams *
+ sizeof(*AtA_inv));
+ A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A));
+ if (AtA_inv == NULL || A == NULL) {
+ fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n",
+ block_size);
+ aom_free(AtA_inv);
+ aom_free(A);
+ equation_system_free(&eqns);
+ return 0;
+ }
+
+ block_finder->A = A;
+ block_finder->AtA_inv = AtA_inv;
+ block_finder->block_size = block_size;
+ block_finder->normalization = (1 << bit_depth) - 1;
+ block_finder->use_highbd = use_highbd;
+
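+  // Each row of A holds the normalized coordinates [y, x, 1] of one pixel, so
+  // AtA_inv later yields the least-squares plane fit for a block.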
+ for (y = 0; y < block_size; ++y) {
+ const double yd = ((double)y - block_size / 2.) / (block_size / 2.);
+ for (x = 0; x < block_size; ++x) {
+ const double xd = ((double)x - block_size / 2.) / (block_size / 2.);
+ const double coords[3] = { yd, xd, 1 };
+ const int row = y * block_size + x;
+ A[kLowPolyNumParams * row + 0] = yd;
+ A[kLowPolyNumParams * row + 1] = xd;
+ A[kLowPolyNumParams * row + 2] = 1;
+
+ for (i = 0; i < kLowPolyNumParams; ++i) {
+ for (j = 0; j < kLowPolyNumParams; ++j) {
+ eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j];
+ }
+ }
+ }
+ }
+
+ // Lazy inverse using existing equation solver.
+ for (i = 0; i < kLowPolyNumParams; ++i) {
+ memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
+ eqns.b[i] = 1;
+ equation_system_solve(&eqns);
+
+ for (j = 0; j < kLowPolyNumParams; ++j) {
+ AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
+ }
+ }
+ equation_system_free(&eqns);
+ return 1;
+}
+
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
+ if (!block_finder) return;
+ aom_free(block_finder->A);
+ aom_free(block_finder->AtA_inv);
+ memset(block_finder, 0, sizeof(*block_finder));
+}
+
+void aom_flat_block_finder_extract_block(
+ const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+ int w, int h, int stride, int offsx, int offsy, double *plane,
+ double *block) {
+ const int block_size = block_finder->block_size;
+ const int n = block_size * block_size;
+ const double *A = block_finder->A;
+ const double *AtA_inv = block_finder->AtA_inv;
+ double plane_coords[kLowPolyNumParams];
+ double AtA_inv_b[kLowPolyNumParams];
+ int xi, yi, i;
+
+ if (block_finder->use_highbd) {
+ const uint16_t *const data16 = (const uint16_t *const)data;
+ for (yi = 0; yi < block_size; ++yi) {
+ const int y = clamp(offsy + yi, 0, h - 1);
+ for (xi = 0; xi < block_size; ++xi) {
+ const int x = clamp(offsx + xi, 0, w - 1);
+ block[yi * block_size + xi] =
+ ((double)data16[y * stride + x]) / block_finder->normalization;
+ }
+ }
+ } else {
+ for (yi = 0; yi < block_size; ++yi) {
+ const int y = clamp(offsy + yi, 0, h - 1);
+ for (xi = 0; xi < block_size; ++xi) {
+ const int x = clamp(offsx + xi, 0, w - 1);
+ block[yi * block_size + xi] =
+ ((double)data[y * stride + x]) / block_finder->normalization;
+ }
+ }
+ }
+ multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
+ multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams,
+ kLowPolyNumParams, 1);
+ multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1);
+
+ for (i = 0; i < n; ++i) {
+ block[i] -= plane[i];
+ }
+}
+
+typedef struct {
+ int index;
+ float score;
+} index_and_score_t;
+
+static int compare_scores(const void *a, const void *b) {
+ const float diff =
+ ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score;
+ if (diff < 0)
+ return -1;
+ else if (diff > 0)
+ return 1;
+ return 0;
+}
+
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+ const uint8_t *const data, int w, int h,
+ int stride, uint8_t *flat_blocks) {
+ // The gradient-based features used in this code are based on:
+ // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
+  // correlation for improved video denoising," 2012 19th IEEE ICIP.
+ // The thresholds are more lenient to allow for correct grain modeling
+  // in extreme cases.
+ const int block_size = block_finder->block_size;
+ const int n = block_size * block_size;
+ const double kTraceThreshold = 0.15 / (32 * 32);
+ const double kRatioThreshold = 1.25;
+ const double kNormThreshold = 0.08 / (32 * 32);
+ const double kVarThreshold = 0.005 / (double)n;
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ int num_flat = 0;
+ int bx = 0, by = 0;
+ double *plane = (double *)aom_malloc(n * sizeof(*plane));
+ double *block = (double *)aom_malloc(n * sizeof(*block));
+ index_and_score_t *scores = (index_and_score_t *)aom_malloc(
+ num_blocks_w * num_blocks_h * sizeof(*scores));
+ if (plane == NULL || block == NULL || scores == NULL) {
+ fprintf(stderr, "Failed to allocate memory for block of size %d\n", n);
+ aom_free(plane);
+ aom_free(block);
+ aom_free(scores);
+ return -1;
+ }
+
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "score = [");
+#endif
+ for (by = 0; by < num_blocks_h; ++by) {
+ for (bx = 0; bx < num_blocks_w; ++bx) {
+ // Compute gradient covariance matrix.
+ double Gxx = 0, Gxy = 0, Gyy = 0;
+ double var = 0;
+ double mean = 0;
+ int xi, yi;
+ aom_flat_block_finder_extract_block(block_finder, data, w, h, stride,
+ bx * block_size, by * block_size,
+ plane, block);
+
+ for (yi = 1; yi < block_size - 1; ++yi) {
+ for (xi = 1; xi < block_size - 1; ++xi) {
+ const double gx = (block[yi * block_size + xi + 1] -
+ block[yi * block_size + xi - 1]) /
+ 2;
+ const double gy = (block[yi * block_size + xi + block_size] -
+ block[yi * block_size + xi - block_size]) /
+ 2;
+ Gxx += gx * gx;
+ Gxy += gx * gy;
+ Gyy += gy * gy;
+
+ mean += block[yi * block_size + xi];
+ var += block[yi * block_size + xi] * block[yi * block_size + xi];
+ }
+ }
+ mean /= (block_size - 2) * (block_size - 2);
+
+ // Normalize gradients by block_size.
+ Gxx /= ((block_size - 2) * (block_size - 2));
+ Gxy /= ((block_size - 2) * (block_size - 2));
+ Gyy /= ((block_size - 2) * (block_size - 2));
+ var = var / ((block_size - 2) * (block_size - 2)) - mean * mean;
+
+ {
+ const double trace = Gxx + Gyy;
+ const double det = Gxx * Gyy - Gxy * Gxy;
+ const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.;
+ const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.;
+ const double norm = e1; // Spectral norm
+ const double ratio = (e1 / AOMMAX(e2, 1e-6));
+ const int is_flat = (trace < kTraceThreshold) &&
+ (ratio < kRatioThreshold) &&
+ (norm < kNormThreshold) && (var > kVarThreshold);
+ // The following weights are used to combine the above features to give
+ // a sigmoid score for flatness. If the input was normalized to [0,100]
+ // the magnitude of these values would be close to 1 (e.g., weights
+ // corresponding to variance would be a factor of 10000x smaller).
+ // The weights are given in the following order:
+ // [{var}, {ratio}, {trace}, {norm}, offset]
+ // with one of the most discriminative being simply the variance.
+ const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 };
+ const float score =
+ (float)(1.0 / (1 + exp(-(weights[0] * var + weights[1] * ratio +
+ weights[2] * trace + weights[3] * norm +
+ weights[4]))));
+ flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0;
+ scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0;
+ scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
+ is_flat);
+#endif
+ num_flat += is_flat;
+ }
+ }
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "\n");
+#endif
+ }
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "];\n");
+#endif
+ // Find the top-scored blocks (most likely to be flat) and set the flat
+ // blocks to be the union of the thresholded results and the top 10th
+ // percentile of the scored results.
+ qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
+ const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
+ const float score_threshold = scores[top_nth_percentile].score;
+ for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
+ if (scores[i].score >= score_threshold) {
+ num_flat += flat_blocks[scores[i].index] == 0;
+ flat_blocks[scores[i].index] |= 1;
+ }
+ }
+ aom_free(block);
+ aom_free(plane);
+ aom_free(scores);
+ return num_flat;
+}
+
+int aom_noise_model_init(aom_noise_model_t *model,
+ const aom_noise_model_params_t params) {
+ const int n = num_coeffs(params);
+ const int lag = params.lag;
+ const int bit_depth = params.bit_depth;
+ int x = 0, y = 0, i = 0, c = 0;
+
+ memset(model, 0, sizeof(*model));
+ if (params.lag < 1) {
+ fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
+ return 0;
+ }
+ if (params.lag > kMaxLag) {
+ fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
+ kMaxLag);
+ return 0;
+ }
+
+ memcpy(&model->params, &params, sizeof(params));
+ for (c = 0; c < 3; ++c) {
+ if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) {
+ fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+ aom_noise_model_free(model);
+ return 0;
+ }
+ if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) {
+ fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+ aom_noise_model_free(model);
+ return 0;
+ }
+ }
+ model->n = n;
+ model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
+
+ for (y = -lag; y <= 0; ++y) {
+ const int max_x = y == 0 ? -1 : lag;
+ for (x = -lag; x <= max_x; ++x) {
+ switch (params.shape) {
+ case AOM_NOISE_SHAPE_DIAMOND:
+ if (abs(x) <= y + lag) {
+ model->coords[i][0] = x;
+ model->coords[i][1] = y;
+ ++i;
+ }
+ break;
+ case AOM_NOISE_SHAPE_SQUARE:
+ model->coords[i][0] = x;
+ model->coords[i][1] = y;
+ ++i;
+ break;
+ default:
+ fprintf(stderr, "Invalid shape\n");
+ aom_noise_model_free(model);
+ return 0;
+ }
+ }
+ }
+ assert(i == n);
+ return 1;
+}
+
+void aom_noise_model_free(aom_noise_model_t *model) {
+ int c = 0;
+ if (!model) return;
+
+ aom_free(model->coords);
+ for (c = 0; c < 3; ++c) {
+ equation_system_free(&model->latest_state[c].eqns);
+ equation_system_free(&model->combined_state[c].eqns);
+
+ equation_system_free(&model->latest_state[c].strength_solver.eqns);
+ equation_system_free(&model->combined_state[c].strength_solver.eqns);
+ }
+ memset(model, 0, sizeof(*model));
+}
+
+// Extracts the neighborhood defined by coords around point (x, y) from
+// the difference between the data and denoised images. Also extracts the
+// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma).
+#define EXTRACT_AR_ROW(INT_TYPE, suffix) \
+ static double extract_ar_row_##suffix( \
+ int(*coords)[2], int num_coords, const INT_TYPE *const data, \
+ const INT_TYPE *const denoised, int stride, int sub_log2[2], \
+ const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \
+ int alt_stride, int x, int y, double *buffer) { \
+ for (int i = 0; i < num_coords; ++i) { \
+ const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \
+ buffer[i] = \
+ (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \
+ } \
+ const double val = \
+ (double)data[y * stride + x] - denoised[y * stride + x]; \
+ \
+ if (alt_data && alt_denoised) { \
+ double avg_data = 0, avg_denoised = 0; \
+ int num_samples = 0; \
+ for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \
+ const int y_up = (y << sub_log2[1]) + dy_i; \
+ for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \
+ const int x_up = (x << sub_log2[0]) + dx_i; \
+ avg_data += alt_data[y_up * alt_stride + x_up]; \
+ avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \
+ num_samples++; \
+ } \
+ } \
+ buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \
+ } \
+ return val; \
+ }
+
+EXTRACT_AR_ROW(uint8_t, lowbd);
+EXTRACT_AR_ROW(uint16_t, highbd);
+
+static int add_block_observations(
+ aom_noise_model_t *noise_model, int c, const uint8_t *const data,
+ const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2],
+ const uint8_t *const alt_data, const uint8_t *const alt_denoised,
+ int alt_stride, const uint8_t *const flat_blocks, int block_size,
+ int num_blocks_w, int num_blocks_h) {
+ const int lag = noise_model->params.lag;
+ const int num_coords = noise_model->n;
+ const double normalization = (1 << noise_model->params.bit_depth) - 1;
+ double *A = noise_model->latest_state[c].eqns.A;
+ double *b = noise_model->latest_state[c].eqns.b;
+ double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1));
+ const int n = noise_model->latest_state[c].eqns.n;
+
+ if (!buffer) {
+ fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1);
+ return 0;
+ }
+ for (int by = 0; by < num_blocks_h; ++by) {
+ const int y_o = by * (block_size >> sub_log2[1]);
+ for (int bx = 0; bx < num_blocks_w; ++bx) {
+ const int x_o = bx * (block_size >> sub_log2[0]);
+ if (!flat_blocks[by * num_blocks_w + bx]) {
+ continue;
+ }
+ int y_start =
+ (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag;
+ int x_start =
+ (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag;
+ int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
+ block_size >> sub_log2[1]);
+ int x_end = AOMMIN(
+ (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag,
+ (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1])
+ ? (block_size >> sub_log2[0])
+ : ((block_size >> sub_log2[0]) - lag));
+ for (int y = y_start; y < y_end; ++y) {
+ for (int x = x_start; x < x_end; ++x) {
+ const double val =
+ noise_model->params.use_highbd
+ ? extract_ar_row_highbd(noise_model->coords, num_coords,
+ (const uint16_t *const)data,
+ (const uint16_t *const)denoised,
+ stride, sub_log2,
+ (const uint16_t *const)alt_data,
+ (const uint16_t *const)alt_denoised,
+ alt_stride, x + x_o, y + y_o, buffer)
+ : extract_ar_row_lowbd(noise_model->coords, num_coords, data,
+ denoised, stride, sub_log2, alt_data,
+ alt_denoised, alt_stride, x + x_o,
+ y + y_o, buffer);
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ A[i * n + j] +=
+ (buffer[i] * buffer[j]) / (normalization * normalization);
+ }
+ b[i] += (buffer[i] * val) / (normalization * normalization);
+ }
+ noise_model->latest_state[c].num_observations++;
+ }
+ }
+ }
+ }
+ aom_free(buffer);
+ return 1;
+}
+
+static void add_noise_std_observations(
+ aom_noise_model_t *noise_model, int c, const double *coeffs,
+ const uint8_t *const data, const uint8_t *const denoised, int w, int h,
+ int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride,
+ const uint8_t *const flat_blocks, int block_size, int num_blocks_w,
+ int num_blocks_h) {
+ const int num_coords = noise_model->n;
+ aom_noise_strength_solver_t *noise_strength_solver =
+ &noise_model->latest_state[c].strength_solver;
+
+ const aom_noise_strength_solver_t *noise_strength_luma =
+ &noise_model->latest_state[0].strength_solver;
+ const double luma_gain = noise_model->latest_state[0].ar_gain;
+ const double noise_gain = noise_model->latest_state[c].ar_gain;
+ for (int by = 0; by < num_blocks_h; ++by) {
+ const int y_o = by * (block_size >> sub_log2[1]);
+ for (int bx = 0; bx < num_blocks_w; ++bx) {
+ const int x_o = bx * (block_size >> sub_log2[0]);
+ if (!flat_blocks[by * num_blocks_w + bx]) {
+ continue;
+ }
+ const int num_samples_h =
+ AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
+ block_size >> sub_log2[1]);
+ const int num_samples_w =
+ AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]),
+ (block_size >> sub_log2[0]));
+ // Make sure that we have a reasonable number of samples to consider
+ // the block.
+ if (num_samples_w * num_samples_h > block_size) {
+ const double block_mean = get_block_mean(
+ alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride,
+ x_o << sub_log2[0], y_o << sub_log2[1], block_size,
+ noise_model->params.use_highbd);
+ const double noise_var = get_noise_var(
+ data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o,
+ y_o, block_size >> sub_log2[0], block_size >> sub_log2[1],
+ noise_model->params.use_highbd);
+ // We want to remove the part of the noise that came from being
+ // correlated with luma. Note that the noise solver for luma must
+ // have already been run.
+ const double luma_strength =
+ c > 0 ? luma_gain * noise_strength_solver_get_value(
+ noise_strength_luma, block_mean)
+ : 0;
+ const double corr = c > 0 ? coeffs[num_coords] : 0;
+ // Chroma noise:
+ // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2)
+ // The uncorrelated component:
+ // uncorr_var = noise_var - (corr * luma_strength)^2
+ // But don't allow fully correlated noise (hence the max), since the
+ // synthesis cannot model it.
+ const double uncorr_std = sqrt(
+ AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2)));
+ // After we've removed correlation with luma, undo the gain that will
+ // come from running the IIR filter.
+ const double adjusted_strength = uncorr_std / noise_gain;
+ aom_noise_strength_solver_add_measurement(
+ noise_strength_solver, block_mean, adjusted_strength);
+ }
+ }
+ }
+}
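+
+// Illustrative example (not part of the library): with noise_var = 4.0,
+// corr = 0.5 and luma_strength = 2.0 above, uncorr_var = 4.0 - (0.5 * 2.0)^2
+// = 3.0, so uncorr_std = sqrt(max(4.0 / 16, 3.0)) ~= 1.73, which is then
+// divided by the channel's AR gain before being added to the strength solver.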
+
+// Return true if the noise estimate appears to be different from the combined
+// (multi-frame) estimate. The difference is measured by checking whether the
+// AR coefficients have diverged (using a threshold on normalized cross
+// correlation), or whether the noise strength has changed.
+static int is_noise_model_different(aom_noise_model_t *const noise_model) {
+ // These thresholds are somewhat arbitrary and will likely need further
+ // tuning (or should be exposed as parameters). The threshold on noise
+ // strength is a weighted difference between the noise strength histograms.
+ const double kCoeffThreshold = 0.9;
+ const double kStrengthThreshold =
+ 0.005 * (1 << (noise_model->params.bit_depth - 8));
+ for (int c = 0; c < 1; ++c) {
+ const double corr =
+ aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x,
+ noise_model->combined_state[c].eqns.x,
+ noise_model->combined_state[c].eqns.n);
+ if (corr < kCoeffThreshold) return 1;
+
+ const double dx =
+ 1.0 / noise_model->latest_state[c].strength_solver.num_bins;
+
+ const aom_equation_system_t *latest_eqns =
+ &noise_model->latest_state[c].strength_solver.eqns;
+ const aom_equation_system_t *combined_eqns =
+ &noise_model->combined_state[c].strength_solver.eqns;
+ double diff = 0;
+ double total_weight = 0;
+ for (int j = 0; j < latest_eqns->n; ++j) {
+ double weight = 0;
+ for (int i = 0; i < latest_eqns->n; ++i) {
+ weight += latest_eqns->A[i * latest_eqns->n + j];
+ }
+ weight = sqrt(weight);
+ diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]);
+ total_weight += weight;
+ }
+ if (diff * dx / total_weight > kStrengthThreshold) return 1;
+ }
+ return 0;
+}
+
+static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
+ const int ret = equation_system_solve(&state->eqns);
+ state->ar_gain = 1.0;
+ if (!ret) return ret;
+
+ // Update the AR gain from the equation system as it will be used to fit
+ // the noise strength as a function of intensity. In the Yule-Walker
+ // equations, the diagonal should be the variance of the correlated noise.
+ // In the case of the least squares estimate, there will be some variability
+ // in the diagonal. So use the mean of the diagonal as the estimate of
+ // overall variance (this works for least squares or Yule-Walker formulation).
+ double var = 0;
+ const int n = state->eqns.n;
+ for (int i = 0; i < (state->eqns.n - is_chroma); ++i) {
+ var += state->eqns.A[i * n + i] / state->num_observations;
+ }
+ var /= (n - is_chroma);
+
+ // Keep track of E(Y^2) = <b, x> + E(X^2).
+ // In the case that we are using chroma and have an estimate of correlation
+ // with luma, we adjust that estimate slightly to remove the correlated bits
+ // by subtracting out the last column of A scaled by our correlation estimate
+ // from b, i.e. E(y^2) = <b - A(:, end) * x(end), x>.
+ double sum_covar = 0;
+ for (int i = 0; i < state->eqns.n - is_chroma; ++i) {
+ double bi = state->eqns.b[i];
+ if (is_chroma) {
+ bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1];
+ }
+ sum_covar += (bi * state->eqns.x[i]) / state->num_observations;
+ }
+ // Now, get an estimate of the variance of uncorrelated noise signal and use
+ // it to determine the gain of the AR filter.
+ const double noise_var = AOMMAX(var - sum_covar, 1e-6);
+ state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6)));
+ return ret;
+}
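+
+// Illustrative example (not part of the library): if the mean diagonal
+// variance is var = 4.0 and the explained covariance is sum_covar = 3.0, then
+// noise_var = 1.0 and ar_gain = sqrt(4.0 / 1.0) = 2, i.e. the AR filter
+// roughly doubles the standard deviation of its driving (innovation) noise.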
+
+aom_noise_status_t aom_noise_model_update(
+ aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+ const uint8_t *const denoised[3], int w, int h, int stride[3],
+ int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) {
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ int y_model_different = 0;
+ int num_blocks = 0;
+ int i = 0, channel = 0;
+
+ if (block_size <= 1) {
+ fprintf(stderr, "block_size = %d must be > 1\n", block_size);
+ return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+ }
+
+ if (block_size < noise_model->params.lag * 2 + 1) {
+ fprintf(stderr, "block_size = %d must be >= %d\n", block_size,
+ noise_model->params.lag * 2 + 1);
+ return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+ }
+
+ // Clear the latest equation system
+ for (i = 0; i < 3; ++i) {
+ equation_system_clear(&noise_model->latest_state[i].eqns);
+ noise_model->latest_state[i].num_observations = 0;
+ noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver);
+ }
+
+ // Check that we have enough flat blocks
+ for (i = 0; i < num_blocks_h * num_blocks_w; ++i) {
+ if (flat_blocks[i]) {
+ num_blocks++;
+ }
+ }
+
+ if (num_blocks <= 1) {
+ fprintf(stderr, "Not enough flat blocks to update noise estimate\n");
+ return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS;
+ }
+
+ for (channel = 0; channel < 3; ++channel) {
+ int no_subsampling[2] = { 0, 0 };
+ const uint8_t *alt_data = channel > 0 ? data[0] : 0;
+ const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0;
+ int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling;
+ const int is_chroma = channel != 0;
+ if (!data[channel] || !denoised[channel]) break;
+ if (!add_block_observations(noise_model, channel, data[channel],
+ denoised[channel], w, h, stride[channel], sub,
+ alt_data, alt_denoised, stride[0], flat_blocks,
+ block_size, num_blocks_w, num_blocks_h)) {
+ fprintf(stderr, "Adding block observation failed\n");
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+
+ if (!ar_equation_system_solve(&noise_model->latest_state[channel],
+ is_chroma)) {
+ if (is_chroma) {
+ set_chroma_coefficient_fallback_soln(
+ &noise_model->latest_state[channel].eqns);
+ } else {
+ fprintf(stderr, "Solving latest noise equation system failed %d!\n",
+ channel);
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+ }
+
+ add_noise_std_observations(
+ noise_model, channel, noise_model->latest_state[channel].eqns.x,
+ data[channel], denoised[channel], w, h, stride[channel], sub, alt_data,
+ stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h);
+
+ if (!aom_noise_strength_solver_solve(
+ &noise_model->latest_state[channel].strength_solver)) {
+ fprintf(stderr, "Solving latest noise strength failed!\n");
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+
+ // Check whether the latest luma noise estimate differs from the combined
+ // (multi-frame) estimate.
+ if (channel == 0 &&
+ noise_model->combined_state[channel].strength_solver.num_equations >
+ 0 &&
+ is_noise_model_different(noise_model)) {
+ y_model_different = 1;
+ }
+
+ // Don't update the combined stats if the y model is different.
+ if (y_model_different) continue;
+
+ noise_model->combined_state[channel].num_observations +=
+ noise_model->latest_state[channel].num_observations;
+ equation_system_add(&noise_model->combined_state[channel].eqns,
+ &noise_model->latest_state[channel].eqns);
+ if (!ar_equation_system_solve(&noise_model->combined_state[channel],
+ is_chroma)) {
+ if (is_chroma) {
+ set_chroma_coefficient_fallback_soln(
+ &noise_model->combined_state[channel].eqns);
+ } else {
+ fprintf(stderr, "Solving combined noise equation system failed %d!\n",
+ channel);
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+ }
+
+ noise_strength_solver_add(
+ &noise_model->combined_state[channel].strength_solver,
+ &noise_model->latest_state[channel].strength_solver);
+
+ if (!aom_noise_strength_solver_solve(
+ &noise_model->combined_state[channel].strength_solver)) {
+ fprintf(stderr, "Solving combined noise strength failed!\n");
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+ }
+
+ return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE
+ : AOM_NOISE_STATUS_OK;
+}
+
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model) {
+ for (int c = 0; c < 3; c++) {
+ equation_system_copy(&noise_model->combined_state[c].eqns,
+ &noise_model->latest_state[c].eqns);
+ equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns,
+ &noise_model->latest_state[c].strength_solver.eqns);
+ noise_model->combined_state[c].strength_solver.num_equations =
+ noise_model->latest_state[c].strength_solver.num_equations;
+ noise_model->combined_state[c].num_observations =
+ noise_model->latest_state[c].num_observations;
+ noise_model->combined_state[c].ar_gain =
+ noise_model->latest_state[c].ar_gain;
+ }
+}
+
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+ aom_film_grain_t *film_grain) {
+ if (noise_model->params.lag > 3) {
+ fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag);
+ return 0;
+ }
+ uint16_t random_seed = film_grain->random_seed;
+ memset(film_grain, 0, sizeof(*film_grain));
+ film_grain->random_seed = random_seed;
+
+ film_grain->apply_grain = 1;
+ film_grain->update_parameters = 1;
+
+ film_grain->ar_coeff_lag = noise_model->params.lag;
+
+ // Convert the scaling functions to 8 bit values
+ aom_noise_strength_lut_t scaling_points[3];
+ aom_noise_strength_solver_fit_piecewise(
+ &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0);
+ aom_noise_strength_solver_fit_piecewise(
+ &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1);
+ aom_noise_strength_solver_fit_piecewise(
+ &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2);
+
+ // Both the domain and the range of the scaling functions in the film_grain
+ // are normalized to 8-bit (e.g., they are implicitly scaled during grain
+ // synthesis).
+ const double strength_divisor = 1 << (noise_model->params.bit_depth - 8);
+ double max_scaling_value = 1e-4;
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < scaling_points[c].num_points; ++i) {
+ scaling_points[c].points[i][0] =
+ AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor);
+ scaling_points[c].points[i][1] =
+ AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor);
+ max_scaling_value =
+ AOMMAX(scaling_points[c].points[i][1], max_scaling_value);
+ }
+ }
+
+ // Scaling_shift values are in the range [8,11]
+ const int max_scaling_value_log2 =
+ clamp((int)floor(log2(max_scaling_value) + 1), 2, 5);
+ film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2);
+
+ const double scale_factor = 1 << (8 - max_scaling_value_log2);
+ film_grain->num_y_points = scaling_points[0].num_points;
+ film_grain->num_cb_points = scaling_points[1].num_points;
+ film_grain->num_cr_points = scaling_points[2].num_points;
+
+ int(*film_grain_scaling[3])[2] = {
+ film_grain->scaling_points_y,
+ film_grain->scaling_points_cb,
+ film_grain->scaling_points_cr,
+ };
+ for (int c = 0; c < 3; c++) {
+ for (int i = 0; i < scaling_points[c].num_points; ++i) {
+ film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5);
+ film_grain_scaling[c][i][1] = clamp(
+ (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255);
+ }
+ }
+ aom_noise_strength_lut_free(scaling_points + 0);
+ aom_noise_strength_lut_free(scaling_points + 1);
+ aom_noise_strength_lut_free(scaling_points + 2);
+
+ // Convert the ar_coeffs into 8-bit values
+ const int n_coeff = noise_model->combined_state[0].eqns.n;
+ double max_coeff = 1e-4, min_coeff = -1e-4;
+ double y_corr[2] = { 0, 0 };
+ double avg_luma_strength = 0;
+ for (int c = 0; c < 3; c++) {
+ aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
+ for (int i = 0; i < n_coeff; ++i) {
+ max_coeff = AOMMAX(max_coeff, eqns->x[i]);
+ min_coeff = AOMMIN(min_coeff, eqns->x[i]);
+ }
+ // Since the correlation between luma/chroma was computed in an already
+ // scaled space, we adjust it in the un-scaled space.
+ aom_noise_strength_solver_t *solver =
+ &noise_model->combined_state[c].strength_solver;
+ // Compute a weighted average of the strength for the channel.
+ double average_strength = 0, total_weight = 0;
+ for (int i = 0; i < solver->eqns.n; ++i) {
+ double w = 0;
+ for (int j = 0; j < solver->eqns.n; ++j) {
+ w += solver->eqns.A[i * solver->eqns.n + j];
+ }
+ w = sqrt(w);
+ average_strength += solver->eqns.x[i] * w;
+ total_weight += w;
+ }
+ if (total_weight == 0)
+ average_strength = 1;
+ else
+ average_strength /= total_weight;
+ if (c == 0) {
+ avg_luma_strength = average_strength;
+ } else {
+ y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength;
+ max_coeff = AOMMAX(max_coeff, y_corr[c - 1]);
+ min_coeff = AOMMIN(min_coeff, y_corr[c - 1]);
+ }
+ }
+ // Shift value: AR coeffs range (values 6-9)
+ // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25)
+ film_grain->ar_coeff_shift =
+ clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))),
+ 6, 9);
+ double scale_ar_coeff = 1 << film_grain->ar_coeff_shift;
+ int *ar_coeffs[3] = {
+ film_grain->ar_coeffs_y,
+ film_grain->ar_coeffs_cb,
+ film_grain->ar_coeffs_cr,
+ };
+ for (int c = 0; c < 3; ++c) {
+ aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
+ for (int i = 0; i < n_coeff; ++i) {
+ ar_coeffs[c][i] =
+ clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127);
+ }
+ if (c > 0) {
+ ar_coeffs[c][n_coeff] =
+ clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127);
+ }
+ }
+
+ // At the moment, the noise modeling code assumes that the chroma scaling
+ // functions are a function of luma.
+ film_grain->cb_mult = 128; // 8 bits
+ film_grain->cb_luma_mult = 192; // 8 bits
+ film_grain->cb_offset = 256; // 9 bits
+
+ film_grain->cr_mult = 128; // 8 bits
+ film_grain->cr_luma_mult = 192; // 8 bits
+ film_grain->cr_offset = 256; // 9 bits
+
+ film_grain->chroma_scaling_from_luma = 0;
+ film_grain->grain_scale_shift = 0;
+ film_grain->overlap_flag = 1;
+ return 1;
+}
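+
+// Illustrative example (not part of the library): with max_coeff = 0.6 and
+// min_coeff = -0.4, ar_coeff_shift = clamp(7 - max(1 + floor(log2(0.6)),
+// ceil(log2(0.4))), 6, 9) = 7, so scale_ar_coeff = 128 and each coefficient
+// is stored as round(128 * coeff) clamped to [-128, 127].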
+
+static void pointwise_multiply(const float *a, float *b, int n) {
+ for (int i = 0; i < n; ++i) {
+ b[i] *= a[i];
+ }
+}
+
+static float *get_half_cos_window(int block_size) {
+ float *window_function =
+ (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
+ for (int y = 0; y < block_size; ++y) {
+ const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
+ for (int x = 0; x < block_size; ++x) {
+ const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
+ window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
+ }
+ }
+ return window_function;
+}
+
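+// Dithers and quantizes the floating point result back to the integer range
+// using Floyd-Steinberg error diffusion (weights 7/16, 3/16, 5/16, 1/16 for
+// the right, bottom-left, bottom and bottom-right neighbors).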
+#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \
+ static void dither_and_quantize_##suffix( \
+ float *result, int result_stride, INT_TYPE *denoised, int w, int h, \
+ int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \
+ float block_normalization) { \
+ for (int y = 0; y < (h >> chroma_sub_h); ++y) { \
+ for (int x = 0; x < (w >> chroma_sub_w); ++x) { \
+ const int result_idx = \
+ (y + (block_size >> chroma_sub_h)) * result_stride + x + \
+ (block_size >> chroma_sub_w); \
+ INT_TYPE new_val = (INT_TYPE)AOMMIN( \
+ AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \
+ block_normalization); \
+ const float err = \
+ -(((float)new_val) / block_normalization - result[result_idx]); \
+ denoised[y * stride + x] = new_val; \
+ if (x + 1 < (w >> chroma_sub_w)) { \
+ result[result_idx + 1] += err * 7.0f / 16.0f; \
+ } \
+ if (y + 1 < (h >> chroma_sub_h)) { \
+ if (x > 0) { \
+ result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \
+ } \
+ result[result_idx + result_stride] += err * 5.0f / 16.0f; \
+ if (x + 1 < (w >> chroma_sub_w)) { \
+ result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \
+ } \
+ } \
+ } \
+ } \
+ }
+
+DITHER_AND_QUANTIZE(uint8_t, lowbd);
+DITHER_AND_QUANTIZE(uint16_t, highbd);
+
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd) {
+ float *plane = NULL, *block = NULL, *window_full = NULL,
+ *window_chroma = NULL;
+ double *block_d = NULL, *plane_d = NULL;
+ struct aom_noise_tx_t *tx_full = NULL;
+ struct aom_noise_tx_t *tx_chroma = NULL;
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ const int result_stride = (num_blocks_w + 2) * block_size;
+ const int result_height = (num_blocks_h + 2) * block_size;
+ float *result = NULL;
+ int init_success = 1;
+ aom_flat_block_finder_t block_finder_full;
+ aom_flat_block_finder_t block_finder_chroma;
+ const float kBlockNormalization = (float)((1 << bit_depth) - 1);
+ if (chroma_sub[0] != chroma_sub[1]) {
+ fprintf(stderr,
+ "aom_wiener_denoise_2d doesn't handle different chroma "
+ "subsampling");
+ return 0;
+ }
+ init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
+ bit_depth, use_highbd);
+ result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride *
+ sizeof(*result));
+ plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
+ block =
+ (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
+ block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
+ plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
+ window_full = get_half_cos_window(block_size);
+ tx_full = aom_noise_tx_malloc(block_size);
+
+ if (chroma_sub[0] != 0) {
+ init_success &= aom_flat_block_finder_init(&block_finder_chroma,
+ block_size >> chroma_sub[0],
+ bit_depth, use_highbd);
+ window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
+ tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
+ } else {
+ window_chroma = window_full;
+ tx_chroma = tx_full;
+ }
+
+ init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
+ (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
+ (window_full != NULL) && (window_chroma != NULL) &&
+ (result != NULL);
+ for (int c = init_success ? 0 : 3; c < 3; ++c) {
+ float *window_function = c == 0 ? window_full : window_chroma;
+ aom_flat_block_finder_t *block_finder = &block_finder_full;
+ const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
+ const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
+ struct aom_noise_tx_t *tx =
+ (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
+ if (!data[c] || !denoised[c]) continue;
+ if (c > 0 && chroma_sub[0] != 0) {
+ block_finder = &block_finder_chroma;
+ }
+ memset(result, 0, sizeof(*result) * result_stride * result_height);
+ // Do overlapped block processing (half overlapped). The block rows can
+ // easily be done in parallel
+ for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
+ offsy += (block_size >> chroma_sub_h) / 2) {
+ for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
+ offsx += (block_size >> chroma_sub_w) / 2) {
+ // Pad the boundary when processing each block-set.
+ for (int by = -1; by < num_blocks_h; ++by) {
+ for (int bx = -1; bx < num_blocks_w; ++bx) {
+ const int pixels_per_block =
+ (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
+ aom_flat_block_finder_extract_block(
+ block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
+ stride[c], bx * (block_size >> chroma_sub_w) + offsx,
+ by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
+ for (int j = 0; j < pixels_per_block; ++j) {
+ block[j] = (float)block_d[j];
+ plane[j] = (float)plane_d[j];
+ }
+ pointwise_multiply(window_function, block, pixels_per_block);
+ aom_noise_tx_forward(tx, block);
+ aom_noise_tx_filter(tx, noise_psd[c]);
+ aom_noise_tx_inverse(tx, block);
+
+ // Apply window function to the plane approximation (we will apply
+ // it to the sum of plane + block when composing the results).
+ pointwise_multiply(window_function, plane, pixels_per_block);
+
+ for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
+ const int y_result =
+ y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
+ for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
+ const int x_result =
+ x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
+ result[y_result * result_stride + x_result] +=
+ (block[y * (block_size >> chroma_sub_w) + x] +
+ plane[y * (block_size >> chroma_sub_w) + x]) *
+ window_function[y * (block_size >> chroma_sub_w) + x];
+ }
+ }
+ }
+ }
+ }
+ }
+ if (use_highbd) {
+ dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+ w, h, stride[c], chroma_sub_w, chroma_sub_h,
+ block_size, kBlockNormalization);
+ } else {
+ dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
+ stride[c], chroma_sub_w, chroma_sub_h,
+ block_size, kBlockNormalization);
+ }
+ }
+ aom_free(result);
+ aom_free(plane);
+ aom_free(block);
+ aom_free(plane_d);
+ aom_free(block_d);
+ aom_free(window_full);
+
+ aom_noise_tx_free(tx_full);
+
+ aom_flat_block_finder_free(&block_finder_full);
+ if (chroma_sub[0] != 0) {
+ aom_flat_block_finder_free(&block_finder_chroma);
+ aom_free(window_chroma);
+ aom_noise_tx_free(tx_chroma);
+ }
+ return init_success;
+}
+
+struct aom_denoise_and_model_t {
+ int block_size;
+ int bit_depth;
+ float noise_level;
+
+ // Size of current denoised buffer and flat_block buffer
+ int width;
+ int height;
+ int y_stride;
+ int uv_stride;
+ int num_blocks_w;
+ int num_blocks_h;
+
+ // Buffers for image and noise_psd allocated on the fly
+ float *noise_psd[3];
+ uint8_t *denoised[3];
+ uint8_t *flat_blocks;
+
+ aom_flat_block_finder_t flat_block_finder;
+ aom_noise_model_t noise_model;
+};
+
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+ int block_size,
+ float noise_level) {
+ struct aom_denoise_and_model_t *ctx =
+ (struct aom_denoise_and_model_t *)aom_malloc(
+ sizeof(struct aom_denoise_and_model_t));
+ if (!ctx) {
+ fprintf(stderr, "Unable to allocate denoise_and_model struct\n");
+ return NULL;
+ }
+ memset(ctx, 0, sizeof(*ctx));
+
+ ctx->block_size = block_size;
+ ctx->noise_level = noise_level;
+ ctx->bit_depth = bit_depth;
+
+ ctx->noise_psd[0] =
+ aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size);
+ ctx->noise_psd[1] =
+ aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size);
+ ctx->noise_psd[2] =
+ aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size);
+ if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) {
+ fprintf(stderr, "Unable to allocate noise PSD buffers\n");
+ aom_denoise_and_model_free(ctx);
+ return NULL;
+ }
+ return ctx;
+}
+
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
+ aom_free(ctx->flat_blocks);
+ for (int i = 0; i < 3; ++i) {
+ aom_free(ctx->denoised[i]);
+ aom_free(ctx->noise_psd[i]);
+ }
+ aom_noise_model_free(&ctx->noise_model);
+ aom_flat_block_finder_free(&ctx->flat_block_finder);
+ aom_free(ctx);
+}
+
+static int denoise_and_model_realloc_if_necessary(
+ struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
+ if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
+ ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
+ return 1;
+ const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+ const int block_size = ctx->block_size;
+
+ ctx->width = sd->y_width;
+ ctx->height = sd->y_height;
+ ctx->y_stride = sd->y_stride;
+ ctx->uv_stride = sd->uv_stride;
+
+ for (int i = 0; i < 3; ++i) {
+ aom_free(ctx->denoised[i]);
+ ctx->denoised[i] = NULL;
+ }
+ aom_free(ctx->flat_blocks);
+ ctx->flat_blocks = NULL;
+
+ ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
+ ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+ ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+ if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
+ fprintf(stderr, "Unable to allocate denoise buffers\n");
+ return 0;
+ }
+ ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
+ ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
+ ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
+
+ aom_flat_block_finder_free(&ctx->flat_block_finder);
+ if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
+ ctx->bit_depth, use_highbd)) {
+ fprintf(stderr, "Unable to init flat block finder\n");
+ return 0;
+ }
+
+ const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+ ctx->bit_depth, use_highbd };
+ aom_noise_model_free(&ctx->noise_model);
+ if (!aom_noise_model_init(&ctx->noise_model, params)) {
+ fprintf(stderr, "Unable to init noise model\n");
+ return 0;
+ }
+
+ // Simply use a flat PSD (although we could instead use the flat blocks to
+ // estimate an actual noise PSD).
+ const float y_noise_level =
+ aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
+ const float uv_noise_level = aom_noise_psd_get_default_value(
+ ctx->block_size >> sd->subsampling_x, ctx->noise_level);
+ for (int i = 0; i < block_size * block_size; ++i) {
+ ctx->noise_psd[0][i] = y_noise_level;
+ ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
+ }
+ return 1;
+}
+
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+ YV12_BUFFER_CONFIG *sd,
+ aom_film_grain_t *film_grain) {
+ const int block_size = ctx->block_size;
+ const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+ uint8_t *raw_data[3] = {
+ use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
+ use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
+ use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
+ };
+ const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
+ int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
+ int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
+
+ if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
+ fprintf(stderr, "Unable to realloc buffers\n");
+ return 0;
+ }
+
+ aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
+ sd->y_height, strides[0], ctx->flat_blocks);
+
+ if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
+ strides, chroma_sub_log2, ctx->noise_psd,
+ block_size, ctx->bit_depth, use_highbd)) {
+ fprintf(stderr, "Unable to denoise image\n");
+ return 0;
+ }
+
+ const aom_noise_status_t status = aom_noise_model_update(
+ &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
+ sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
+ block_size);
+ int have_noise_estimate = 0;
+ if (status == AOM_NOISE_STATUS_OK) {
+ have_noise_estimate = 1;
+ } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+ aom_noise_model_save_latest(&ctx->noise_model);
+ have_noise_estimate = 1;
+ } else {
+ // Unable to update noise model; proceed if we have a previous estimate.
+ have_noise_estimate =
+ (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0);
+ }
+
+ film_grain->apply_grain = 0;
+ if (have_noise_estimate) {
+ if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
+ fprintf(stderr, "Unable to get grain parameters.\n");
+ return 0;
+ }
+ if (!film_grain->random_seed) {
+ film_grain->random_seed = 7391;
+ }
+ memcpy(raw_data[0], ctx->denoised[0],
+ (strides[0] * sd->y_height) << use_highbd);
+ memcpy(raw_data[1], ctx->denoised[1],
+ (strides[1] * sd->uv_height) << use_highbd);
+ memcpy(raw_data[2], ctx->denoised[2],
+ (strides[2] * sd->uv_height) << use_highbd);
+ }
+ return 1;
+}
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
new file mode 100644
index 000000000..049d5be15
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.h
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_MODEL_H_
+#define AOM_AOM_DSP_NOISE_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#include <stdint.h>
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_scale/yv12config.h"
+
+/*!\brief Wrapper of data required to represent linear system of eqns and soln.
+ */
+typedef struct {
+ double *A;
+ double *b;
+ double *x;
+ int n;
+} aom_equation_system_t;
+
+/*!\brief Representation of a piecewise linear curve
+ *
+ * Holds num_points (x, y) pairs that define the curve.
+ */
+typedef struct {
+ double (*points)[2];
+ int num_points;
+} aom_noise_strength_lut_t;
+
+/*!\brief Init the noise strength lut with the given number of points*/
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
+
+/*!\brief Frees the noise strength lut. */
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
+
+/*!\brief Evaluate the lut at the point x.
+ *
+ * \param[in] lut The lut data.
+ * \param[in] x The coordinate to evaluate the lut.
+ */
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+ double x);
+
+/*!\brief Helper struct to model noise strength as a function of intensity.
+ *
+ * Internally, this structure holds a representation of a linear system
+ * of equations that models noise strength (standard deviation) as a
+ * function of intensity. The mapping is initially stored using a
+ * piecewise representation with evenly spaced bins that cover the entire
+ * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
+ * constraint of the form:
+ * y_{i} (1 - a) + y_{i+1} a = y
+ * where y_{i} is the value of bin i, x_{i} <= x <= x_{i+1}, and
+ * a = (x - x_{i}) / (x_{i+1} - x_{i}). The equation system holds the
+ * corresponding normal equations.
+ *
+ * As there may be missing data, the solution is regularized to get a
+ * complete set of values for the bins. A reduced representation after
+ * solving can be obtained by getting the corresponding noise_strength_lut_t.
+ */
+typedef struct {
+ aom_equation_system_t eqns;
+ double min_intensity;
+ double max_intensity;
+ int num_bins;
+ int num_equations;
+ double total;
+} aom_noise_strength_solver_t;
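+
+// Illustrative example (not part of the library, bin positions are
+// hypothetical): with adjacent bin centers at x_{i} = 0 and x_{i+1} = 64, an
+// observation (x = 16, y = 3.0) yields a = 0.25 and adds the constraint
+// 0.75 * y_{i} + 0.25 * y_{i+1} = 3.0 to the normal equations.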
+
+/*!\brief Initializes the noise solver with the given number of bins.
+ *
+ * Returns 0 if initialization fails.
+ *
+ * \param[in] solver The noise solver to be initialized.
+ * \param[in] num_bins Number of bins to use in the internal representation.
+ * \param[in] bit_depth The bit depth used to derive {min,max}_intensity.
+ */
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+ int num_bins, int bit_depth);
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
+
+/*!\brief Gets the x coordinate of bin i.
+ *
+ * \param[in] i The bin whose coordinate to query.
+ */
+double aom_noise_strength_solver_get_center(
+ const aom_noise_strength_solver_t *solver, int i);
+
+/*!\brief Add an observation of the block mean intensity to its noise strength.
+ *
+ * \param[in]  block_mean The average block intensity.
+ * \param[in] noise_std The observed noise strength.
+ */
+void aom_noise_strength_solver_add_measurement(
+ aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
+
+/*!\brief Solves the current set of equations for the noise strength. */
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
+
+/*!\brief Fits a reduced piecewise linear lut to the internal solution
+ *
+ * \param[in] max_num_points The maximum number of output points
+ * \param[out] lut The output piecewise linear lut.
+ */
+int aom_noise_strength_solver_fit_piecewise(
+ const aom_noise_strength_solver_t *solver, int max_num_points,
+ aom_noise_strength_lut_t *lut);
+
+/*!\brief Helper for holding precomputed data for finding flat blocks.
+ *
+ * Internally a block is modeled with a low-order polynomial model. A
+ * planar model would be a bunch of equations like:
+ * <[y_i x_i 1], [a_1, a_2, a_3]> = b_i
+ * for each point in the block. The system matrix A with row i as [y_i x_i 1]
+ * is maintained as is the inverse, inv(A'*A), so that the plane parameters
+ * can be fit for each block.
+ */
+typedef struct {
+ double *AtA_inv;
+ double *A;
+ int num_params; // The number of parameters used for internal low-order model
+ int block_size; // The block size the finder was initialized with
+ double normalization;  // Normalization divisor ((1 << bit_depth) - 1)
+ int use_highbd; // Whether input data should be interpreted as uint16
+} aom_flat_block_finder_t;
+
+/*!\brief Init the block_finder with the given block size, bit_depth */
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+ int block_size, int bit_depth, int use_highbd);
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder);
+
+/*!\brief Helper to extract a block and low order "planar" model. */
+void aom_flat_block_finder_extract_block(
+ const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+ int w, int h, int stride, int offsx, int offsy, double *plane,
+ double *block);
+
+/*!\brief Runs the flat block finder on the input data.
+ *
+ * Finds flat blocks in the input image data and fills the flat_blocks map;
+ * an entry is non-zero when the corresponding block is determined to be
+ * flat, and a higher value indicates greater confidence in the decision.
+ * Returns the number of blocks detected as flat, or -1 on allocation failure.
+ */
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+ const uint8_t *const data, int w, int h,
+ int stride, uint8_t *flat_blocks);
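+
+// Minimal usage sketch (illustrative only; buffer names are hypothetical and
+// error handling is omitted):
+//   aom_flat_block_finder_t bf;
+//   aom_flat_block_finder_init(&bf, 32, /*bit_depth=*/8, /*use_highbd=*/0);
+//   aom_flat_block_finder_run(&bf, frame_data, w, h, stride, flat_blocks);
+//   aom_flat_block_finder_free(&bf);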
+
+// The noise shape indicates the allowed coefficients in the AR model.
+typedef enum {
+ AOM_NOISE_SHAPE_DIAMOND = 0,
+ AOM_NOISE_SHAPE_SQUARE = 1
+} aom_noise_shape;
+
+// The parameters of the noise model include the shape type, lag, the
+// bit depth of the input images provided, and whether the input images
+// will be using uint16 (or uint8) representation.
+typedef struct {
+ aom_noise_shape shape;
+ int lag;
+ int bit_depth;
+ int use_highbd;
+} aom_noise_model_params_t;
+
+/*!\brief State of a noise model estimate for a single channel.
+ *
+ * This contains a system of equations that can be used to solve
+ * for the auto-regressive coefficients as well as a noise strength
+ * solver that can be used to model noise strength as a function of
+ * intensity.
+ */
+typedef struct {
+ aom_equation_system_t eqns;
+ aom_noise_strength_solver_t strength_solver;
+ int num_observations; // The number of observations in the eqn system
+ double ar_gain; // The gain of the current AR filter
+} aom_noise_state_t;
+
+/*!\brief Complete model of noise for a planar video
+ *
+ * This includes a noise model for the latest frame and an aggregated
+ * estimate over all previous frames that had similar parameters.
+ */
+typedef struct {
+ aom_noise_model_params_t params;
+ aom_noise_state_t combined_state[3]; // Combined state per channel
+ aom_noise_state_t latest_state[3]; // Latest state per channel
+ int (*coords)[2]; // Offsets (x,y) of the coefficient samples
+ int n; // Number of parameters (size of coords)
+ int bit_depth;
+} aom_noise_model_t;
+
+/*!\brief Result of a noise model update. */
+typedef enum {
+ AOM_NOISE_STATUS_OK = 0,
+ AOM_NOISE_STATUS_INVALID_ARGUMENT,
+ AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+ AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
+ AOM_NOISE_STATUS_INTERNAL_ERROR,
+} aom_noise_status_t;
+
+/*!\brief Initializes a noise model with the given parameters.
+ *
+ * Returns 0 on failure.
+ */
+int aom_noise_model_init(aom_noise_model_t *model,
+ const aom_noise_model_params_t params);
+void aom_noise_model_free(aom_noise_model_t *model);
+
+/*!\brief Updates the noise model with a new frame observation.
+ *
+ * Updates the noise model with measurements from the given input frame and a
+ * denoised variant of it. Noise is sampled from flat blocks using the flat
+ * block map.
+ *
+ * Returns a noise_status indicating whether the update was successful. If the
+ * update was successful, the combined_state is updated with measurements from
+ * the provided frame. If the status is OK or DIFFERENT_NOISE_TYPE, the latest
+ * noise state will be updated with measurements from the provided frame.
+ *
+ * \param[in,out] noise_model The noise model to be updated
+ * \param[in] data Raw frame data
+ * \param[in] denoised Denoised frame data.
+ * \param[in] w Frame width
+ * \param[in] h Frame height
+ * \param[in] strides Stride of the planes
+ * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in] flat_blocks A map to blocks that have been determined flat
+ * \param[in] block_size The size of blocks.
+ */
+aom_noise_status_t aom_noise_model_update(
+ aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+ const uint8_t *const denoised[3], int w, int h, int strides[3],
+ int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
+
+/*!\brief Save the "latest" estimate into the "combined" estimate.
+ *
+ * This is meant to be called when the noise modeling detected a change
+ * in parameters (or for example, if a user wanted to reset estimation at
+ * a shot boundary).
+ */
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
+
+/*!\brief Converts the noise_model parameters to the corresponding
+ * grain_parameters.
+ *
+ * The noise structs in this file are suitable for estimation (e.g., using
+ * floats), but the grain parameters in the bitstream are quantized. This
+ * function does the conversion by selecting the correct quantization levels.
+ */
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+ aom_film_grain_t *film_grain);
+
+/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
+ *
+ * \param[in] data Raw frame data
+ * \param[out] denoised Denoised frame data
+ * \param[in] w Frame width
+ * \param[in] h Frame height
+ * \param[in] stride Stride of the planes
+ * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in] noise_psd The power spectral density of the noise
+ * \param[in] block_size The size of blocks
+ * \param[in] bit_depth Bit depth of the image
+ * \param[in] use_highbd If true, uint8 pointers are interpreted as
+ * uint16 and stride is measured in uint16.
+ * This must be true when bit_depth >= 10.
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub_log2[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd);
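+
+// Typical calling sequence (illustrative; mirrors aom_denoise_and_model_run):
+//   aom_flat_block_finder_run(...)             -> flat_blocks map
+//   aom_wiener_denoise_2d(...)                 -> denoised planes
+//   aom_noise_model_update(...)                -> AR coefficients + strength
+//   aom_noise_model_get_grain_parameters(...)  -> aom_film_grain_t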
+
+struct aom_denoise_and_model_t;
+
+/*!\brief Denoise the buffer and model the residual noise.
+ *
+ * This is meant to be called sequentially on input frames. The input buffer
+ * is denoised and the residual noise is modelled. The current noise estimate
+ * is populated in the output grain parameters. The grain.apply_grain
+ * parameter will be true when the input buffer was successfully denoised
+ * and grain was modelled. Returns true on success and false on error.
+ *
+ * \param[in] ctx Struct allocated with aom_denoise_and_model_alloc
+ * that holds some buffers for denoising and the current
+ * noise estimate.
+ * \param[in,out] buf   The raw input buffer to be denoised.
+ * \param[out] grain Output film grain parameters
+ */
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+ YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain);
+
+/*!\brief Allocates a context that can be used for denoising and noise modeling.
+ *
+ * \param[in] bit_depth Bit depth of buffers this will be run on.
+ * \param[in] block_size Block size for noise modeling and flat block
+ * estimation
+ * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for
+ * higher levels of noise)
+ */
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+ int block_size,
+ float noise_level);
+
+/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
+ */
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AOM_DSP_NOISE_MODEL_H_
diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c
new file mode 100644
index 000000000..87e8e9fec
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+ return (factor * factor / 10000) * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
+struct aom_noise_tx_t {
+ float *tx_block;
+ float *temp;
+ int block_size;
+ void (*fft)(const float *, float *, float *);
+ void (*ifft)(const float *, float *, float *);
+};
+
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
+ struct aom_noise_tx_t *noise_tx =
+ (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
+ if (!noise_tx) return NULL;
+ memset(noise_tx, 0, sizeof(*noise_tx));
+ switch (block_size) {
+ case 2:
+ noise_tx->fft = aom_fft2x2_float;
+ noise_tx->ifft = aom_ifft2x2_float;
+ break;
+ case 4:
+ noise_tx->fft = aom_fft4x4_float;
+ noise_tx->ifft = aom_ifft4x4_float;
+ break;
+ case 8:
+ noise_tx->fft = aom_fft8x8_float;
+ noise_tx->ifft = aom_ifft8x8_float;
+ break;
+ case 16:
+ noise_tx->fft = aom_fft16x16_float;
+ noise_tx->ifft = aom_ifft16x16_float;
+ break;
+ case 32:
+ noise_tx->fft = aom_fft32x32_float;
+ noise_tx->ifft = aom_ifft32x32_float;
+ break;
+ default:
+ aom_free(noise_tx);
+ fprintf(stderr, "Unsupported block size %d\n", block_size);
+ return NULL;
+ }
+ noise_tx->block_size = block_size;
+ noise_tx->tx_block = (float *)aom_memalign(
+ 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+ noise_tx->temp = (float *)aom_memalign(
+ 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size);
+ if (!noise_tx->tx_block || !noise_tx->temp) {
+ aom_noise_tx_free(noise_tx);
+ return NULL;
+ }
+ // Clear the buffers up front. Some outputs of the forward transform are
+ // real only (the imaginary component will never be touched)
+ memset(noise_tx->tx_block, 0,
+ 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+ memset(noise_tx->temp, 0,
+ 2 * sizeof(*noise_tx->temp) * block_size * block_size);
+ return noise_tx;
+}
+
+void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
+ noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block);
+}
+
+void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
+ const int block_size = noise_tx->block_size;
+ const float kBeta = 1.1f;
+ const float kEps = 1e-6f;
+ for (int y = 0; y < block_size; ++y) {
+ for (int x = 0; x < block_size; ++x) {
+ int i = y * block_size + x;
+ float *c = noise_tx->tx_block + 2 * i;
+ const float p = c[0] * c[0] + c[1] * c[1];
+ if (p > kBeta * psd[i] && p > 1e-6) {
+ noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
+ noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
+ } else {
+ noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta;
+ noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta;
+ }
+ }
+ }
+}
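+
+// Illustrative example (not part of the library): with psd[i] = 1.0 and
+// coefficient power p = 4.0 (> kBeta * psd[i]), the complex coefficient is
+// scaled by (4.0 - 1.0) / 4.0 = 0.75; when p <= kBeta * psd[i] it is instead
+// attenuated to (kBeta - 1) / kBeta ~= 0.09 of its value.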
+
+void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
+ const int n = noise_tx->block_size * noise_tx->block_size;
+ noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
+ for (int i = 0; i < n; ++i) {
+ data[i] /= n;
+ }
+}
+
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
+ float *psd) {
+ const int block_size = noise_tx->block_size;
+ for (int yb = 0; yb < block_size; ++yb) {
+ for (int xb = 0; xb <= block_size / 2; ++xb) {
+ float *c = noise_tx->tx_block + 2 * (yb * block_size + xb);
+ psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1];
+ }
+ }
+}
+
+void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
+ if (!noise_tx) return;
+ aom_free(noise_tx->tx_block);
+ aom_free(noise_tx->temp);
+ aom_free(noise_tx);
+}
+
+double aom_normalized_cross_correlation(const double *a, const double *b,
+ int n) {
+ double c = 0;
+ double a_len = 0;
+ double b_len = 0;
+ for (int i = 0; i < n; ++i) {
+ a_len += a[i] * a[i];
+ b_len += b[i] * b[i];
+ c += a[i] * b[i];
+ }
+ return c / (sqrt(a_len) * sqrt(b_len));
+}
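
Note (illustration, not part of the patch): the normalization makes the result scale-invariant, so proportional vectors correlate to exactly 1:

#include <assert.h>
#include <math.h>
#include "aom_dsp/noise_util.h"

static void check_correlation(void) {
  const double a[4] = { 1.0, -2.0, 3.0, -4.0 };
  const double b[4] = { 2.0, -4.0, 6.0, -8.0 };  /* a scaled by 2 */
  assert(fabs(aom_normalized_cross_correlation(a, b, 4) - 1.0) < 1e-12);
}
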
+
+int aom_noise_data_validate(const double *data, int w, int h) {
+ const double kVarianceThreshold = 2;
+ const double kMeanThreshold = 2;
+
+ int x = 0, y = 0;
+ int ret_value = 1;
+ double var = 0, mean = 0;
+ double *mean_x, *mean_y, *var_x, *var_y;
+
+ // Check that noise variance is not increasing in x or y
+ // and that the data is zero mean.
+ mean_x = (double *)aom_malloc(sizeof(*mean_x) * w);
+ var_x = (double *)aom_malloc(sizeof(*var_x) * w);
+ mean_y = (double *)aom_malloc(sizeof(*mean_y) * h);
+ var_y = (double *)aom_malloc(sizeof(*var_y) * h);
+
+ memset(mean_x, 0, sizeof(*mean_x) * w);
+ memset(var_x, 0, sizeof(*var_x) * w);
+ memset(mean_y, 0, sizeof(*mean_y) * h);
+ memset(var_y, 0, sizeof(*var_y) * h);
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ const double d = data[y * w + x];
+ var_x[x] += d * d;
+ var_y[y] += d * d;
+ mean_x[x] += d;
+ mean_y[y] += d;
+ var += d * d;
+ mean += d;
+ }
+ }
+ mean /= (w * h);
+ var = var / (w * h) - mean * mean;
+
+ for (y = 0; y < h; ++y) {
+ mean_y[y] /= w;  // each row accumulates w samples
+ var_y[y] = var_y[y] / w - mean_y[y] * mean_y[y];
+ if (fabs(var_y[y] - var) >= kVarianceThreshold) {
+ fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
+ ret_value = 0;
+ break;
+ }
+ if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
+ fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
+ ret_value = 0;
+ break;
+ }
+ }
+
+ for (x = 0; x < w; ++x) {
+ mean_x[x] /= h;  // each column accumulates h samples
+ var_x[x] = var_x[x] / h - mean_x[x] * mean_x[x];
+ if (fabs(var_x[x] - var) >= kVarianceThreshold) {
+ fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
+ ret_value = 0;
+ break;
+ }
+ if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
+ fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
+ ret_value = 0;
+ break;
+ }
+ }
+
+ aom_free(mean_x);
+ aom_free(mean_y);
+ aom_free(var_x);
+ aom_free(var_y);
+
+ return ret_value;
+}
diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h
new file mode 100644
index 000000000..2284a171a
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_UTIL_H_
+#define AOM_AOM_DSP_NOISE_UTIL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and holds the transformed data internally
+// (the user should not manipulate the transformed data directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t useful for denoising the given
+// block_size. The resulting aom_noise_tx_t should be freed with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the given data and holds the result in the aom_noise_tx's
+// internal buffer. For compatibility with existing SIMD implementations,
+// "data" must be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+ const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+ float *psd);
+
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// while a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
+
+// Computes normalized cross correlation of two vectors a and b of length n.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+ int n);
+
+// Validates the correlated noise in the data buffer of size (w, h).
+int aom_noise_data_validate(const double *data, int w, int h);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // AOM_AOM_DSP_NOISE_UTIL_H_
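
Note (sketch, not part of the patch): a rough picture of how this API is meant to be strung together, assuming the caller already has a 32-byte-aligned block of float samples and accepts a flat default PSD; the numeric choices here are illustrative only.

#include "aom_dsp/noise_util.h"
#include "aom_mem/aom_mem.h"

/* Denoise one 32x32 block in place. Returns 0 on allocation failure. */
static int denoise_block_sketch(float *block /* 32-byte aligned, 32*32 */) {
  const int bs = 32;
  struct aom_noise_tx_t *tx = aom_noise_tx_malloc(bs);
  float *psd = (float *)aom_malloc(sizeof(*psd) * bs * bs);
  if (!tx || !psd) {
    aom_noise_tx_free(tx);
    aom_free(psd);
    return 0;
  }
  const float flat = aom_noise_psd_get_default_value(bs, 2.5f);
  for (int i = 0; i < bs * bs; ++i) psd[i] = flat;
  aom_noise_tx_forward(tx, block);  /* block -> frequency domain */
  aom_noise_tx_filter(tx, psd);     /* shrink noise-dominated bins */
  aom_noise_tx_inverse(tx, block);  /* back to the spatial domain */
  aom_noise_tx_free(tx);
  aom_free(psd);
  return 1;
}
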
diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h
new file mode 100644
index 000000000..f3d87f264
--- /dev/null
+++ b/third_party/aom/aom_dsp/postproc.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_POSTPROC_H_
+#define AOM_AOM_DSP_POSTPROC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Fills a noise buffer with gaussian noise strength determined by sigma.
+int aom_setup_noise(double sigma, int size, char *noise);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_POSTPROC_H_
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
new file mode 100644
index 000000000..d003a986e
--- /dev/null
+++ b/third_party/aom/aom_dsp/prob.h
@@ -0,0 +1,671 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PROB_H_
+#define AOM_AOM_DSP_PROB_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/entcode.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(negge): Rename this aom_prob once we remove vpxbool.
+typedef uint16_t aom_cdf_prob;
+
+#define CDF_SIZE(x) ((x) + 1)
+#define CDF_PROB_BITS 15
+#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
+#define CDF_INIT_TOP 32768
+#define CDF_SHIFT (15 - CDF_PROB_BITS)
+/* The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
+   probability (an "inverse" CDF).
+   This macro converts from one representation to the other (and is its own
+   inverse). */
+#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
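
Note (illustration, not part of the patch): with CDF_PROB_BITS == 15 the shift is 0 and CDF_PROB_TOP == 32768, so a 3-symbol distribution with cumulative probabilities 8192 and 24576 is stored inverted, followed by the adaptation counter:

#include <assert.h>
#include "aom_dsp/prob.h"

static void icdf_illustration(void) {
  assert(AOM_ICDF(AOM_ICDF(8192)) == 8192);  /* its own inverse */
  const aom_cdf_prob cdf[CDF_SIZE(3)] = { AOM_CDF3(8192, 24576) };
  assert(cdf[0] == 32768 - 8192);      /* AOM_ICDF(8192)  */
  assert(cdf[1] == 32768 - 24576);     /* AOM_ICDF(24576) */
  assert(cdf[2] == 0 && cdf[3] == 0);  /* AOM_ICDF(CDF_PROB_TOP), counter */
}
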
+
+#if CDF_SHIFT == 0
+
+#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF4(a0, a1, a2) \
+ AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF5(a0, a1, a2, a3) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF6(a0, a1, a2, a3, a4) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
+ a14) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+
+#else
+#define AOM_CDF2(a0) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \
+ ((CDF_INIT_TOP - 2) >> 1)) / \
+ ((CDF_INIT_TOP - 2)) + \
+ 1) \
+ , AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF3(a0, a1) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \
+ ((CDF_INIT_TOP - 3) >> 1)) / \
+ ((CDF_INIT_TOP - 3)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \
+ ((CDF_INIT_TOP - 3) >> 1)) / \
+ ((CDF_INIT_TOP - 3)) + \
+ 2), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF4(a0, a1, a2) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
+ ((CDF_INIT_TOP - 4) >> 1)) / \
+ ((CDF_INIT_TOP - 4)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
+ ((CDF_INIT_TOP - 4) >> 1)) / \
+ ((CDF_INIT_TOP - 4)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
+ ((CDF_INIT_TOP - 4) >> 1)) / \
+ ((CDF_INIT_TOP - 4)) + \
+ 3), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF5(a0, a1, a2, a3) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+ ((CDF_INIT_TOP - 5) >> 1)) / \
+ ((CDF_INIT_TOP - 5)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+ ((CDF_INIT_TOP - 5) >> 1)) / \
+ ((CDF_INIT_TOP - 5)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+ ((CDF_INIT_TOP - 5) >> 1)) / \
+ ((CDF_INIT_TOP - 5)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
+ ((CDF_INIT_TOP - 5) >> 1)) / \
+ ((CDF_INIT_TOP - 5)) + \
+ 4), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF6(a0, a1, a2, a3, a4) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+ ((CDF_INIT_TOP - 6) >> 1)) / \
+ ((CDF_INIT_TOP - 6)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+ ((CDF_INIT_TOP - 6) >> 1)) / \
+ ((CDF_INIT_TOP - 6)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+ ((CDF_INIT_TOP - 6) >> 1)) / \
+ ((CDF_INIT_TOP - 6)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+ ((CDF_INIT_TOP - 6) >> 1)) / \
+ ((CDF_INIT_TOP - 6)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
+ ((CDF_INIT_TOP - 6) >> 1)) / \
+ ((CDF_INIT_TOP - 6)) + \
+ 5), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+ ((CDF_INIT_TOP - 7) >> 1)) / \
+ ((CDF_INIT_TOP - 7)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+ ((CDF_INIT_TOP - 7) >> 1)) / \
+ ((CDF_INIT_TOP - 7)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+ ((CDF_INIT_TOP - 7) >> 1)) / \
+ ((CDF_INIT_TOP - 7)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+ ((CDF_INIT_TOP - 7) >> 1)) / \
+ ((CDF_INIT_TOP - 7)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+ ((CDF_INIT_TOP - 7) >> 1)) / \
+ ((CDF_INIT_TOP - 7)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
+ ((CDF_INIT_TOP - 7) >> 1)) / \
+ ((CDF_INIT_TOP - 7)) + \
+ 6), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+ ((CDF_INIT_TOP - 8) >> 1)) / \
+ ((CDF_INIT_TOP - 8)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+ ((CDF_INIT_TOP - 8) >> 1)) / \
+ ((CDF_INIT_TOP - 8)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+ ((CDF_INIT_TOP - 8) >> 1)) / \
+ ((CDF_INIT_TOP - 8)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+ ((CDF_INIT_TOP - 8) >> 1)) / \
+ ((CDF_INIT_TOP - 8)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+ ((CDF_INIT_TOP - 8) >> 1)) / \
+ ((CDF_INIT_TOP - 8)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+ ((CDF_INIT_TOP - 8) >> 1)) / \
+ ((CDF_INIT_TOP - 8)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
+ ((CDF_INIT_TOP - 8) >> 1)) / \
+ ((CDF_INIT_TOP - 8)) + \
+ 7), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+ ((CDF_INIT_TOP - 9) >> 1)) / \
+ ((CDF_INIT_TOP - 9)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+ ((CDF_INIT_TOP - 9) >> 1)) / \
+ ((CDF_INIT_TOP - 9)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+ ((CDF_INIT_TOP - 9) >> 1)) / \
+ ((CDF_INIT_TOP - 9)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+ ((CDF_INIT_TOP - 9) >> 1)) / \
+ ((CDF_INIT_TOP - 9)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+ ((CDF_INIT_TOP - 9) >> 1)) / \
+ ((CDF_INIT_TOP - 9)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+ ((CDF_INIT_TOP - 9) >> 1)) / \
+ ((CDF_INIT_TOP - 9)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+ ((CDF_INIT_TOP - 9) >> 1)) / \
+ ((CDF_INIT_TOP - 9)) + \
+ 7), \
+ AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
+ ((CDF_INIT_TOP - 9) >> 1)) / \
+ ((CDF_INIT_TOP - 9)) + \
+ 8), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 7), \
+ AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 8), \
+ AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
+ ((CDF_INIT_TOP - 10) >> 1)) / \
+ ((CDF_INIT_TOP - 10)) + \
+ 9), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 7), \
+ AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 8), \
+ AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 9), \
+ AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
+ ((CDF_INIT_TOP - 11) >> 1)) / \
+ ((CDF_INIT_TOP - 11)) + \
+ 10), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 7), \
+ AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 8), \
+ AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 9), \
+ AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 10), \
+ AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
+ ((CDF_INIT_TOP - 12) >> 1)) / \
+ ((CDF_INIT_TOP - 12)) + \
+ 11), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 7), \
+ AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 8), \
+ AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 9), \
+ AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 10), \
+ AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 11), \
+ AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
+ ((CDF_INIT_TOP - 13) >> 1)) / \
+ ((CDF_INIT_TOP - 13)) + \
+ 12), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 7), \
+ AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 8), \
+ AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 9), \
+ AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 10), \
+ AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 11), \
+ AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 12), \
+ AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
+ ((CDF_INIT_TOP - 14) >> 1)) / \
+ ((CDF_INIT_TOP - 14)) + \
+ 13), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 7), \
+ AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 8), \
+ AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 9), \
+ AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 10), \
+ AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 11), \
+ AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 12), \
+ AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 13), \
+ AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
+ ((CDF_INIT_TOP - 15) >> 1)) / \
+ ((CDF_INIT_TOP - 15)) + \
+ 14), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
+ a14) \
+ AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 1) \
+ , \
+ AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 2), \
+ AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 3), \
+ AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 4), \
+ AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 5), \
+ AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 6), \
+ AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 7), \
+ AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 8), \
+ AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 9), \
+ AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 10), \
+ AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 11), \
+ AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 12), \
+ AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 13), \
+ AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 14), \
+ AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
+ ((CDF_INIT_TOP - 16) >> 1)) / \
+ ((CDF_INIT_TOP - 16)) + \
+ 15), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+
+#endif
+
+static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
+ assert(den != 0);
+ {
+ const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
+ // (p > 255) ? 255 : (p < 1) ? 1 : p;
+ const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
+ return (uint8_t)clipped_prob;
+ }
+}
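
Note (illustration, not part of the patch): the expression above clips the rounded probability to [1, 255] without a branch; (255 - p) >> 23 becomes all ones when p > 255, and (p == 0) bumps a zero up to 1 (this relies on arithmetic right shift of a negative int, as the library code already does). A brute-force check against a naive clamp:

#include <assert.h>
#include <stdint.h>

static uint8_t clip_prob_reference(int p) {
  if (p > 255) return 255;
  if (p < 1) return 1;
  return (uint8_t)p;
}

static void check_branchless_clip(void) {
  for (int p = 0; p <= 300; ++p) {
    const int clipped = p | ((255 - p) >> 23) | (p == 0);
    assert((uint8_t)clipped == clip_prob_reference(p));
  }
}
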
+
+static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
+ int rate;
+ int i, tmp;
+
+ static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2 };
+ assert(nsymbs < 17);
+ rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) +
+ nsymbs2speed[nsymbs]; // + get_msb(nsymbs);
+ tmp = AOM_ICDF(0);
+
+ // Single loop (faster)
+ for (i = 0; i < nsymbs - 1; ++i) {
+ tmp = (i == val) ? 0 : tmp;
+ if (tmp < cdf[i]) {
+ cdf[i] -= ((cdf[i] - tmp) >> rate);
+ } else {
+ cdf[i] += ((tmp - cdf[i]) >> rate);
+ }
+ }
+ cdf[nsymbs] += (cdf[nsymbs] < 32);
+}
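
Note (worked example, not part of the patch): each update moves the inverse-CDF entries toward the coded symbol by 1/2^rate of the remaining distance, and the counter in cdf[nsymbs], which controls the rate, saturates at 32. For a fresh binary CDF initialized to probability 1/2 the counter is 0, so rate = 3 + nsymbs2speed[2] = 4:

#include <assert.h>
#include "aom_dsp/prob.h"

static void update_cdf_example(void) {
  aom_cdf_prob cdf[CDF_SIZE(2)] = { AOM_CDF2(16384) };  /* {16384, 0, 0} */
  update_cdf(cdf, 0, 2);           /* code symbol 0 once */
  assert(cdf[0] == 16384 - 1024);  /* moved by (16384 - 0) >> 4 toward 0 */
  assert(cdf[2] == 1);             /* adaptation counter incremented */
}
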
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_PROB_H_
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
new file mode 100644
index 000000000..50f376a4a
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnr.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_scale/yv12config.h"
+
+double aom_sse_to_psnr(double samples, double peak, double sse) {
+ if (sse > 0.0) {
+ const double psnr = 10.0 * log10(samples * peak * peak / sse);
+ return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+ } else {
+ return MAX_PSNR;
+ }
+}
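
Note (worked example, not part of the patch): PSNR here is 10 * log10(samples * peak^2 / sse), capped at MAX_PSNR. An 8-bit plane whose total SSE equals its sample count (an MSE of 1) therefore scores 10 * log10(255^2), about 48.13 dB:

#include <assert.h>
#include <math.h>
#include "aom_dsp/psnr.h"

static void psnr_example(void) {
  const double samples = 1920.0 * 1080.0;
  const double psnr = aom_sse_to_psnr(samples, 255.0, samples);
  assert(fabs(psnr - 10.0 * log10(255.0 * 255.0)) < 1e-9);
}
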
+
+/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
+ * and highbd_8_variance(). It should not.
+ */
+static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, unsigned int *sse,
+ int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h, uint64_t *sse, int64_t *sum) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t tsum = 0;
+ uint64_t tsse = 0;
+ for (int i = 0; i < h; ++i) {
+ int32_t lsum = 0;
+ for (int j = 0; j < w; ++j) {
+ const int diff = a[j] - b[j];
+ lsum += diff;
+ tsse += (uint32_t)(diff * diff);
+ }
+ tsum += lsum;
+ a += a_stride;
+ b += b_stride;
+ }
+ *sum = tsum;
+ *sse = tsse;
+}
+
+static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h, unsigned int *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
+}
+
+static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ const int dw = width % 16;
+ const int dh = height % 16;
+ int64_t total_sse = 0;
+ unsigned int sse = 0;
+ int sum = 0;
+ int x, y;
+
+ if (dw > 0) {
+ encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
+ height, &sse, &sum);
+ total_sse += sse;
+ }
+
+ if (dh > 0) {
+ encoder_variance(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh,
+ &sse, &sum);
+ total_sse += sse;
+ }
+
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ aom_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+
+ pa += 16;
+ pb += 16;
+ }
+
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+
+ return total_sse;
+}
+
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height, unsigned int input_shift) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t total_sse = 0;
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ int64_t diff;
+ diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+ total_sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+ return total_sse;
+}
+
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int64_t total_sse = 0;
+ int x, y;
+ const int dw = width % 16;
+ const int dh = height % 16;
+ unsigned int sse = 0;
+ int sum = 0;
+ if (dw > 0) {
+ encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height, &sse, &sum);
+ total_sse += sse;
+ }
+ if (dh > 0) {
+ encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh, &sse, &sum);
+ total_sse += sse;
+ }
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
+ total_sse += sse;
+ pa += 16;
+ pb += 16;
+ }
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+ return total_sse;
+}
+
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ b->y_buffer + vstart * b->y_stride + hstart, b->y_stride,
+ width, height);
+}
+
+int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+
+ return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+
+int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
+ b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
+ width, height);
+}
+
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+
+ return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
+ b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
+ width, height);
+}
+
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+
+ return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(
+ a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height);
+}
+
+int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+
+int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride,
+ b->u_buffer + vstart * b->uv_stride + hstart,
+ b->uv_stride, width, height);
+}
+
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride,
+ b->v_buffer + vstart * b->uv_stride + hstart,
+ b->uv_stride, width, height);
+}
+
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
+ if (highbd) {
+ switch (plane) {
+ case 0: return aom_highbd_get_y_sse(a, b);
+ case 1: return aom_highbd_get_u_sse(a, b);
+ case 2: return aom_highbd_get_v_sse(a, b);
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+ }
+ switch (plane) {
+ case 0: return aom_get_y_sse(a, b);
+ case 1: return aom_get_u_sse(a, b);
+ case 2: return aom_get_v_sse(a, b);
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+}
+
+void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ uint32_t bit_depth, uint32_t in_bit_depth) {
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const unsigned int input_shift = bit_depth - in_bit_depth;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ uint64_t sse;
+ if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (input_shift) {
+ sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i],
+ b_strides[i], w, h, input_shift);
+ } else {
+ sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
+ b_strides[i], w, h);
+ }
+ } else {
+ sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w,
+ h);
+ }
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] =
+ aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+}
+
+void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr) {
+ static const double peak = 255.0;
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ const uint64_t sse =
+ get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h);
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] =
+ aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+}
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
new file mode 100644
index 000000000..58e4e71ee
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnr.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PSNR_H_
+#define AOM_AOM_DSP_PSNR_H_
+
+#include "aom_scale/yv12config.h"
+
+#define MAX_PSNR 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ double psnr[4]; // total/y/u/v
+ uint64_t sse[4]; // total/y/u/v
+ uint32_t samples[4]; // total/y/u/v
+} PSNR_STATS;
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in] samples Number of samples
+ * \param[in] peak Max sample value
+ * \param[in] sse Sum of squared errors
+ */
+double aom_sse_to_psnr(double samples, double peak, double sse);
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int plane, int highbd);
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ unsigned int bit_depth, unsigned int in_bit_depth);
+void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr);
+
+double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *phvs_y,
+ double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AOM_DSP_PSNR_H_
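
Note (sketch, not part of the patch): typical use of this header is to fill a PSNR_STATS from two same-sized frames. The YV12 buffers are assumed to be allocated and populated elsewhere; index 0 of each array is the combined value, with Y/U/V at indices 1..3.

#include <stdio.h>
#include "aom_dsp/psnr.h"

static void report_psnr(const YV12_BUFFER_CONFIG *src,
                        const YV12_BUFFER_CONFIG *recon) {
  PSNR_STATS stats;
  aom_calc_psnr(src, recon, &stats);  /* 8-bit path; use aom_calc_highbd_psnr
                                         for high bit depth buffers */
  printf("PSNR total %.2f dB (Y %.2f, U %.2f, V %.2f)\n", stats.psnr[0],
         stats.psnr[1], stats.psnr[2], stats.psnr[3]);
}
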
diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c
new file mode 100644
index 000000000..30fe21d9c
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnrhvs.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * This code was originally written by: Gregory Maxwell, at the Daala
+ * project.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/ssim.h"
+#include "aom_ports/system_state.h"
+
+static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+ int xstride) {
+ int i, j;
+ (void)xstride;
+ aom_fdct8x8(x, y, ystride);
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+
+static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+ int xstride) {
+ int i, j;
+ (void)xstride;
+ aom_highbd_fdct8x8(x, y, ystride);
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG-based matrix from the paper; this
+ * one gives a slightly higher MOS agreement. */
+static const double csf_y[8][8] = {
+ { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
+ 0.678296995242, 0.466224900598, 0.3265091542 },
+ { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
+ 0.868920337363, 0.61280991668, 0.436405793551 },
+ { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
+ 0.670882927016, 0.501731932449, 0.372504254596 },
+ { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554,
+ 0.48309405692, 0.380429446972, 0.295774038565 },
+ { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676,
+ 0.352889268808, 0.283006984131, 0.226951348204 },
+ { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
+ 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 },
+ { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
+ 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 },
+ { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
+ 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 }
+};
+static const double csf_cb420[8][8] = {
+ { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+ 0.898018824055, 0.74725392039, 0.615105596242 },
+ { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+ 1.17428548929, 0.996404342439, 0.830890433625 },
+ { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+ 0.960060382087, 0.849823426169, 0.731221236837 },
+ { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
+ 0.751437590932, 0.685398513368, 0.608694761374 },
+ { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
+ 0.605503172737, 0.55002013668, 0.495804539034 },
+ { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
+ 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 },
+ { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
+ 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 },
+ { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
+ 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 }
+};
+static const double csf_cr420[8][8] = {
+ { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+ 0.867069376285, 0.721500455585, 0.593906509971 },
+ { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+ 1.13381474809, 0.962064122248, 0.802254508198 },
+ { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
+ 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 },
+ { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+ 0.725539939514, 0.661776842059, 0.587716619023 },
+ { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
+ 0.584635025748, 0.531064164893, 0.478717061273 },
+ { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
+ 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 },
+ { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
+ 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 },
+ { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
+ 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
+};
+
+static double convert_score_db(double _score, double _weight, int bit_depth) {
+ int16_t pix_max = 255;
+ assert(_score * _weight >= 0.0);
+ if (bit_depth == 10)
+ pix_max = 1023;
+ else if (bit_depth == 12)
+ pix_max = 4095;
+
+ if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
+ return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
+}
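
Note (worked example, not part of the patch): for the 8-bit case with weight 1.0 the conversion is 10 * (log10(255^2) - log10(score)), so a masked-error score of 1.0 maps to roughly 48.13 dB, and scores below 255^2 * 1e-10 saturate to MAX_PSNR. A hypothetical re-derivation of the same formula (convert_score_db itself is static to this file):

#include <assert.h>
#include <math.h>

static double score_to_db_8bit(double score) {
  const double pix_max = 255.0;
  if (score < pix_max * pix_max * 1e-10) return 100.0;  /* MAX_PSNR */
  return 10.0 * (log10(pix_max * pix_max) - log10(score));
}

static void convert_score_example(void) {
  assert(fabs(score_to_db_8bit(1.0) - 48.1308) < 1e-3);
  assert(score_to_db_8bit(1e-9) == 100.0);
}
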
+
+static double calc_psnrhvs(const unsigned char *src, int _systride,
+ const unsigned char *dst, int _dystride, double _par,
+ int _w, int _h, int _step, const double _csf[8][8],
+ uint32_t _shift, int buf_is_hbd) {
+ double ret;
+ const uint8_t *_src8 = src;
+ const uint8_t *_dst8 = dst;
+ const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
+ DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
+ DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
+ DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
+ DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
+ double mask[8][8];
+ int pixels;
+ int x;
+ int y;
+ (void)_par;
+ ret = pixels = 0;
+ /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+ their masking table as "we have used the quantization table for the
+ color component Y of JPEG [6] that has been also obtained on the
+ basis of CSF. Note that the values in quantization table JPEG have
+ been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+ was also constructed from the JPEG matrices. I cannot find any obvious
+ normalization scheme that produces their table, but if I multiply their
+ CSF by 0.38857 and square the result I get their masking table.
+ I have no idea where this constant comes from, but deviating from it
+ too greatly hurts MOS agreement.
+
+ [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+ Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+ of DCT basis functions", CD-ROM Proceedings of the Third
+ International Workshop on Video Processing and Quality Metrics for Consumer
+ Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
+ for (x = 0; x < 8; x++)
+ for (y = 0; y < 8; y++)
+ mask[x][y] =
+ (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003);
+ for (y = 0; y < _h - 7; y += _step) {
+ for (x = 0; x < _w - 7; x += _step) {
+ int i;
+ int j;
+ double s_means[4];
+ double d_means[4];
+ double s_vars[4];
+ double d_vars[4];
+ double s_gmean = 0;
+ double d_gmean = 0;
+ double s_gvar = 0;
+ double d_gvar = 0;
+ double s_mask = 0;
+ double d_mask = 0;
+ for (i = 0; i < 4; i++)
+ s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ if (!buf_is_hbd) {
+ dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
+ dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
+ } else {
+ dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
+ dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
+ }
+ s_gmean += dct_s[i * 8 + j];
+ d_gmean += dct_d[i * 8 + j];
+ s_means[sub] += dct_s[i * 8 + j];
+ d_means[sub] += dct_d[i * 8 + j];
+ }
+ }
+ s_gmean /= 64.f;
+ d_gmean /= 64.f;
+ for (i = 0; i < 4; i++) s_means[i] /= 16.f;
+ for (i = 0; i < 4; i++) d_means[i] /= 16.f;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
+ d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
+ s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) *
+ (dct_s[i * 8 + j] - s_means[sub]);
+ d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) *
+ (dct_d[i * 8 + j] - d_means[sub]);
+ }
+ }
+ s_gvar *= 1 / 63.f * 64;
+ d_gvar *= 1 / 63.f * 64;
+ for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16;
+ for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16;
+ if (s_gvar > 0)
+ s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
+ if (d_gvar > 0)
+ d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
+ if (!buf_is_hbd) {
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ } else {
+ hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ }
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
+ s_mask = sqrt(s_mask * s_gvar) / 32.f;
+ d_mask = sqrt(d_mask * d_gvar) / 32.f;
+ if (d_mask > s_mask) s_mask = d_mask;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ double err;
+ err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
+ if (i != 0 || j != 0)
+ err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+ ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+ pixels++;
+ }
+ }
+ }
+ }
+ if (pixels <= 0) return 0;
+ ret /= pixels;
+ return ret;
+}
+
+double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
+ double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs,
+ uint32_t bd, uint32_t in_bd) {
+ double psnrhvs;
+ const double par = 1.0;
+ const int step = 7;
+ uint32_t bd_shift = 0;
+ aom_clear_system_state();
+ assert(bd == 8 || bd == 10 || bd == 12);
+ assert(bd >= in_bd);
+ assert(src->flags == dst->flags);
+ const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
+
+ bd_shift = bd - in_bd;
+
+ *y_psnrhvs = calc_psnrhvs(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par,
+ src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd);
+ *u_psnrhvs =
+ calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ par, src->uv_crop_width, src->uv_crop_height, step,
+ csf_cb420, bd_shift, buf_is_hbd);
+ *v_psnrhvs =
+ calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ par, src->uv_crop_width, src->uv_crop_height, step,
+ csf_cr420, bd_shift, buf_is_hbd);
+ psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+ return convert_score_db(psnrhvs, 1.0, in_bd);
+}
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
new file mode 100644
index 000000000..62dbd86a9
--- /dev/null
+++ b/third_party/aom/aom_dsp/quantize.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+
+void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+ non_zero_count--;
+ else
+ break;
+ }
+
+  // Quantization pass: all coefficients with index >= non_zero_count are
+  // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32;
+
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ int64_t tmp =
+ clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp *= wt;
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS)); // quantization
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+ if (tmp32) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_b_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ int i, eob = -1;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int dequant;
+ int idx_arr[4096];
+ (void)iscan;
+ int idx = 0;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+ coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+ idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = idx_arr[i];
+ }
+ *eob_ptr = eob + 1;
+}
+
+/* These functions should only be called when quantisation matrices
+ are not used. */
+void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
+}
+
+void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
+}
+
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 0);
+}
+
+void aom_highbd_quantize_b_32x32_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 1);
+}
+
+void aom_highbd_quantize_b_64x64_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 2);
+}
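
For orientation: the three aom_quantize_b_*_c wrappers above differ only in the log_scale they pass to quantize_b_helper_c (0, 1 for 32x32, 2 for 64x64), and log_scale in turn shrinks the zbin/round thresholds through ROUND_POWER_OF_TWO. The sketch below mirrors that macro's effect on a sample zbin value; it is illustrative only, the real macro lives in aom_dsp/aom_dsp_common.h.

#include <stdio.h>

/* Mirrors ROUND_POWER_OF_TWO(value, n): round-to-nearest right shift. */
static int round_power_of_two(int value, int n) {
  return (value + ((1 << n) >> 1)) >> n;
}

int main(void) {
  const int zbin = 21;
  for (int log_scale = 0; log_scale <= 2; ++log_scale)
    printf("log_scale=%d -> scaled zbin=%d\n", log_scale,
           round_power_of_two(zbin, log_scale));
  return 0;
}
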
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
new file mode 100644
index 000000000..c55ab234e
--- /dev/null
+++ b/third_party/aom/aom_dsp/quantize.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_QUANTIZE_H_
+#define AOM_AOM_DSP_QUANTIZE_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void highbd_quantize_b_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_QUANTIZE_H_
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
new file mode 100644
index 000000000..1e24df4a5
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+/* Sum the difference between every corresponding element of the buffers. */
+static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+#define sadMxh(m) \
+ unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, int width, \
+ int height) { \
+ return sad(a, a_stride, b, b_stride, width, height); \
+ }
+
+#define sadMxN(m, n) \
+ unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint8_t comp_pred[m * n]; \
+ aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return sad(src, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int aom_jnt_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint8_t comp_pred[m * n]; \
+ aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
+ return sad(src, src_stride, comp_pred, m, m, n); \
+ }
+
+// Calculate sad against 4 reference locations and store each in sad_array
+#define sadMxNx4D(m, n) \
+ void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sad_array[i] = \
+ aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+ }
+
+/* clang-format off */
+// 128x128
+sadMxN(128, 128)
+sadMxNx4D(128, 128)
+
+// 128x64
+sadMxN(128, 64)
+sadMxNx4D(128, 64)
+
+// 64x128
+sadMxN(64, 128)
+sadMxNx4D(64, 128)
+
+// 64x64
+sadMxN(64, 64)
+sadMxNx4D(64, 64)
+
+// 64x32
+sadMxN(64, 32)
+sadMxNx4D(64, 32)
+
+// 32x64
+sadMxN(32, 64)
+sadMxNx4D(32, 64)
+
+// 32x32
+sadMxN(32, 32)
+sadMxNx4D(32, 32)
+
+// 32x16
+sadMxN(32, 16)
+sadMxNx4D(32, 16)
+
+// 16x32
+sadMxN(16, 32)
+sadMxNx4D(16, 32)
+
+// 16x16
+sadMxN(16, 16)
+sadMxNx4D(16, 16)
+
+// 16x8
+sadMxN(16, 8)
+sadMxNx4D(16, 8)
+
+// 8x16
+sadMxN(8, 16)
+sadMxNx4D(8, 16)
+
+// 8x8
+sadMxN(8, 8)
+sadMxNx4D(8, 8)
+
+// 8x4
+sadMxN(8, 4)
+sadMxNx4D(8, 4)
+
+// 4x8
+sadMxN(4, 8)
+sadMxNx4D(4, 8)
+
+// 4x4
+sadMxN(4, 4)
+sadMxNx4D(4, 4)
+
+sadMxh(128);
+sadMxh(64);
+sadMxh(32);
+sadMxh(16);
+sadMxh(8);
+sadMxh(4);
+
+sadMxN(4, 16)
+sadMxNx4D(4, 16)
+sadMxN(16, 4)
+sadMxNx4D(16, 4)
+sadMxN(8, 32)
+sadMxNx4D(8, 32)
+sadMxN(32, 8)
+sadMxNx4D(32, 8)
+sadMxN(16, 64)
+sadMxNx4D(16, 64)
+sadMxN(64, 16)
+sadMxNx4D(64, 16)
+
+ /* clang-format on */
+
+ static INLINE
+ unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
+ const uint16_t *b, int b_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+#define highbd_sadMxN(m, n) \
+ unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int aom_highbd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint16_t comp_pred[m * n]; \
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, m, n, \
+ ref, ref_stride); \
+ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t comp_pred[m * n]; \
+ aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, \
+ m, n, ref, ref_stride, jcp_param); \
+ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
+ }
+
+#define highbd_sadMxNx4D(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ }
+
+/* clang-format off */
+// 128x128
+highbd_sadMxN(128, 128)
+highbd_sadMxNx4D(128, 128)
+
+// 128x64
+highbd_sadMxN(128, 64)
+highbd_sadMxNx4D(128, 64)
+
+// 64x128
+highbd_sadMxN(64, 128)
+highbd_sadMxNx4D(64, 128)
+
+// 64x64
+highbd_sadMxN(64, 64)
+highbd_sadMxNx4D(64, 64)
+
+// 64x32
+highbd_sadMxN(64, 32)
+highbd_sadMxNx4D(64, 32)
+
+// 32x64
+highbd_sadMxN(32, 64)
+highbd_sadMxNx4D(32, 64)
+
+// 32x32
+highbd_sadMxN(32, 32)
+highbd_sadMxNx4D(32, 32)
+
+// 32x16
+highbd_sadMxN(32, 16)
+highbd_sadMxNx4D(32, 16)
+
+// 16x32
+highbd_sadMxN(16, 32)
+highbd_sadMxNx4D(16, 32)
+
+// 16x16
+highbd_sadMxN(16, 16)
+highbd_sadMxNx4D(16, 16)
+
+// 16x8
+highbd_sadMxN(16, 8)
+highbd_sadMxNx4D(16, 8)
+
+// 8x16
+highbd_sadMxN(8, 16)
+highbd_sadMxNx4D(8, 16)
+
+// 8x8
+highbd_sadMxN(8, 8)
+highbd_sadMxNx4D(8, 8)
+
+// 8x4
+highbd_sadMxN(8, 4)
+highbd_sadMxNx4D(8, 4)
+
+// 4x8
+highbd_sadMxN(4, 8)
+highbd_sadMxNx4D(4, 8)
+
+// 4x4
+highbd_sadMxN(4, 4)
+highbd_sadMxNx4D(4, 4)
+
+highbd_sadMxN(4, 16)
+highbd_sadMxNx4D(4, 16)
+highbd_sadMxN(16, 4)
+highbd_sadMxNx4D(16, 4)
+highbd_sadMxN(8, 32)
+highbd_sadMxNx4D(8, 32)
+highbd_sadMxN(32, 8)
+highbd_sadMxNx4D(32, 8)
+highbd_sadMxN(16, 64)
+highbd_sadMxNx4D(16, 64)
+highbd_sadMxN(64, 16)
+highbd_sadMxNx4D(64, 16)
+ /* clang-format on */
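
The reference sad() above is a plain sum of absolute differences over a width x height window with independent source/reference strides; the sadMxN/sadMxNx4D macros only instantiate it per block size. A self-contained sketch with a small made-up 4x4 block:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same loop structure as the reference sad() in the patch. */
static unsigned int sad_c(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) sad += abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
  }
  return sad;
}

int main(void) {
  const uint8_t src[4 * 4] = { 10, 20, 30, 40, 10, 20, 30, 40,
                               10, 20, 30, 40, 10, 20, 30, 40 };
  const uint8_t ref[4 * 4] = { 12, 18, 30, 44, 10, 20, 28, 40,
                               10, 22, 30, 40, 14, 20, 30, 36 };
  printf("sad4x4 = %u\n", sad_c(src, 4, ref, 4, 4, 4)); /* prints 20 */
  return 0;
}
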
diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c
new file mode 100644
index 000000000..c176001d6
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad_av1.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int width,
+ int height) {
+ int y, x;
+ unsigned int sad = 0;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+ sad += abs(pred - src[x]);
+ }
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ sad = (sad + 31) >> 6;
+ return sad;
+}
+
+#define MASKSADMxN(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+ msk_stride, m, n); \
+ else \
+ return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
+ msk_stride, m, n); \
+ }
+
+/* clang-format off */
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+MASKSADMxN(4, 16)
+MASKSADMxN(16, 4)
+MASKSADMxN(8, 32)
+MASKSADMxN(32, 8)
+MASKSADMxN(16, 64)
+MASKSADMxN(64, 16)
+
+ /* clang-format on */
+
+ static INLINE
+ unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride, int width,
+ int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+ sad += abs(pred - src[x]);
+ }
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ sad = (sad + 31) >> 6;
+
+ return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_c( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \
+ second_pred8, m, msk, msk_stride, m, n); \
+ else \
+ return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+HIGHBD_MASKSADMXN(128, 128)
+HIGHBD_MASKSADMXN(128, 64)
+HIGHBD_MASKSADMXN(64, 128)
+HIGHBD_MASKSADMXN(64, 64)
+HIGHBD_MASKSADMXN(64, 32)
+HIGHBD_MASKSADMXN(32, 64)
+HIGHBD_MASKSADMXN(32, 32)
+HIGHBD_MASKSADMXN(32, 16)
+HIGHBD_MASKSADMXN(16, 32)
+HIGHBD_MASKSADMXN(16, 16)
+HIGHBD_MASKSADMXN(16, 8)
+HIGHBD_MASKSADMXN(8, 16)
+HIGHBD_MASKSADMXN(8, 8)
+HIGHBD_MASKSADMXN(8, 4)
+HIGHBD_MASKSADMXN(4, 8)
+HIGHBD_MASKSADMXN(4, 4)
+HIGHBD_MASKSADMXN(4, 16)
+HIGHBD_MASKSADMXN(16, 4)
+HIGHBD_MASKSADMXN(8, 32)
+HIGHBD_MASKSADMXN(32, 8)
+HIGHBD_MASKSADMXN(16, 64)
+HIGHBD_MASKSADMXN(64, 16)
+
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (pre-multiplied by 4096 to keep precision)
+// mask: 2d weights (scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+ pre += pre_stride;
+ wsrc += width;
+ mask += width;
+ }
+
+ return sad;
+}
+
+#define OBMCSADMxN(m, n) \
+ unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+ }
+
+/* clang-format off */
+OBMCSADMxN(128, 128)
+OBMCSADMxN(128, 64)
+OBMCSADMxN(64, 128)
+OBMCSADMxN(64, 64)
+OBMCSADMxN(64, 32)
+OBMCSADMxN(32, 64)
+OBMCSADMxN(32, 32)
+OBMCSADMxN(32, 16)
+OBMCSADMxN(16, 32)
+OBMCSADMxN(16, 16)
+OBMCSADMxN(16, 8)
+OBMCSADMxN(8, 16)
+OBMCSADMxN(8, 8)
+OBMCSADMxN(8, 4)
+OBMCSADMxN(4, 8)
+OBMCSADMxN(4, 4)
+OBMCSADMxN(4, 16)
+OBMCSADMxN(16, 4)
+OBMCSADMxN(8, 32)
+OBMCSADMxN(32, 8)
+OBMCSADMxN(16, 64)
+OBMCSADMxN(64, 16)
+ /* clang-format on */
+
+ static INLINE
+ unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+ pre += pre_stride;
+ wsrc += width;
+ mask += width;
+ }
+
+ return sad;
+}
+
+#define HIGHBD_OBMCSADMXN(m, n) \
+ unsigned int aom_highbd_obmc_sad##m##x##n##_c( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+ }
+
+/* clang-format off */
+HIGHBD_OBMCSADMXN(128, 128)
+HIGHBD_OBMCSADMXN(128, 64)
+HIGHBD_OBMCSADMXN(64, 128)
+HIGHBD_OBMCSADMXN(64, 64)
+HIGHBD_OBMCSADMXN(64, 32)
+HIGHBD_OBMCSADMXN(32, 64)
+HIGHBD_OBMCSADMXN(32, 32)
+HIGHBD_OBMCSADMXN(32, 16)
+HIGHBD_OBMCSADMXN(16, 32)
+HIGHBD_OBMCSADMXN(16, 16)
+HIGHBD_OBMCSADMXN(16, 8)
+HIGHBD_OBMCSADMXN(8, 16)
+HIGHBD_OBMCSADMXN(8, 8)
+HIGHBD_OBMCSADMXN(8, 4)
+HIGHBD_OBMCSADMXN(4, 8)
+HIGHBD_OBMCSADMXN(4, 4)
+HIGHBD_OBMCSADMXN(4, 16)
+HIGHBD_OBMCSADMXN(16, 4)
+HIGHBD_OBMCSADMXN(8, 32)
+HIGHBD_OBMCSADMXN(32, 8)
+HIGHBD_OBMCSADMXN(16, 64)
+HIGHBD_OBMCSADMXN(64, 16)
+/* clang-format on */
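
Worth noting about masked_sad()/highbd_masked_sad() above: the mask holds 6-bit weights, each predictor pair is blended with AOM_BLEND_A64 before the absolute difference is taken, and the accumulated SAD is rounded back by (sad + 31) >> 6. A minimal sketch of that blend, assuming the usual 0..64 weight range and +32 rounding from aom_dsp/blend.h:

#include <stdio.h>

/* Mirrors AOM_BLEND_A64(m, a, b): weighted average with 6-bit weights. */
static int blend_a64(int m, int a, int b) {
  return (m * a + (64 - m) * b + 32) >> 6;
}

int main(void) {
  /* Fully weighted towards a, fully towards b, and an even 32/32 split. */
  printf("%d %d %d\n", blend_a64(64, 200, 100), blend_a64(0, 200, 100),
         blend_a64(32, 200, 100)); /* prints 200 100 150 */
  return 0;
}
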
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
new file mode 100644
index 000000000..01dbb8fd2
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v128 v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
+SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
+SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
+SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
+ return c_v128_from_64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
+ return c_v128_from_v64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return c_v128_from_32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+ return c_v128_load_unaligned(p);
+}
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return c_v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ c_v128_store_unaligned(p, a);
+}
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ c_v128_store_aligned(p, a);
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+ return c_v128_align(a, b, c);
+}
+
+SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
+
+typedef uint32_t sad128_internal;
+SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ return c_v128_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+ return c_v128_sad_u8_sum(s);
+}
+typedef uint32_t ssd128_internal;
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ return c_v128_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
+ return c_v128_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+ return c_v128_dotp_su8(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ return c_v128_dotp_s16(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ return c_v128_dotp_s32(a, b);
+}
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); }
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
+
+SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
+SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); }
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); }
+SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
+SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); }
+SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); }
+SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
+SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
+SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
+SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
+SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
+SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
+SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); }
+SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
+SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return c_v128_mullo_s16(a, b);
+}
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return c_v128_mulhi_s16(a, b);
+}
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+ return c_v128_mullo_s32(a, b);
+}
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
+
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); }
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+ return c_v128_blend_8(a, b, c);
+}
+
+SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+ return c_v128_rdavg_u16(a, b);
+}
+SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
+SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
+SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
+SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
+SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
+SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
+SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); }
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); }
+
+SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
+SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
+SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); }
+SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); }
+SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); }
+SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); }
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); }
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); }
+SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); }
+SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); }
+SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); }
+SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
+ return c_v128_unziplo_8(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
+ return c_v128_unziphi_8(a, b);
+}
+SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
+ return c_v128_unziplo_16(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
+ return c_v128_unziphi_16(a, b);
+}
+SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
+ return c_v128_unziplo_32(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
+ return c_v128_unziphi_32(a, b);
+}
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); }
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return c_v128_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return c_v128_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); }
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return c_v128_unpacklo_s8_s16(a);
+}
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return c_v128_unpackhi_s8_s16(a);
+}
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return c_v128_pack_s32_s16(a, b);
+}
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+ return c_v128_pack_s32_u16(a, b);
+}
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return c_v128_pack_s16_u8(a, b);
+}
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return c_v128_pack_s16_s8(a, b);
+}
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); }
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); }
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return c_v128_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return c_v128_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return c_v128_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return c_v128_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) {
+ return c_v128_shuffle_8(a, pattern);
+}
+
+SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); }
+SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); }
+SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); }
+SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
+ return c_v128_cmpgt_s16(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
+ return c_v128_cmplt_s16(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+ return c_v128_cmpgt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+ return c_v128_cmplt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); }
+
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return c_v128_shl_8(a, c);
+}
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return c_v128_shr_u8(a, c);
+}
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ return c_v128_shr_s8(a, c);
+}
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return c_v128_shl_16(a, c);
+}
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return c_v128_shr_u16(a, c);
+}
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return c_v128_shr_s16(a, c);
+}
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return c_v128_shl_32(a, c);
+}
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return c_v128_shr_u32(a, c);
+}
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return c_v128_shr_s32(a, c);
+}
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+ return c_v128_shl_64(a, c);
+}
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+ return c_v128_shr_u64(a, c);
+}
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+ return c_v128_shr_s64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
+ return c_v128_shr_n_byte(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
+ return c_v128_shl_n_byte(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) {
+ return c_v128_shl_n_8(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) {
+ return c_v128_shl_n_16(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
+ return c_v128_shl_n_32(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) {
+ return c_v128_shl_n_64(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
+ return c_v128_shr_n_u8(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) {
+ return c_v128_shr_n_u16(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
+ return c_v128_shr_n_u32(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) {
+ return c_v128_shr_n_u64(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
+ return c_v128_shr_n_s8(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) {
+ return c_v128_shr_n_s16(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
+ return c_v128_shr_n_s32(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) {
+ return c_v128_shr_n_s64(a, n);
+}
+
+typedef uint32_t sad128_internal_u16;
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
+ return c_v128_sad_u16_init();
+}
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+ v128 b) {
+ return c_v128_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+ return c_v128_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
+ return c_v128_ssd_s16_init();
+}
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+ v128 b) {
+ return c_v128_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+ return c_v128_ssd_s16_sum(s);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
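
The header above is the plain-C fallback of the v128 abstraction: every v128_* call simply forwards to its c_v128_* counterpart, so code written against this API compiles unchanged whether the generic header or an architecture-specific one (such as the NEON header in the next file) is picked up. A hypothetical usage sketch, not part of the patch and only buildable inside the aom tree:

#include <stdint.h>

#include "aom_dsp/simd/v128_intrinsics.h"

/* SAD over a single 16-byte block using only the portable v128 API. */
static uint32_t block16_sad(const uint8_t *a, const uint8_t *b) {
  sad128_internal s = v128_sad_u8_init();
  s = v128_sad_u8(s, v128_load_unaligned(a), v128_load_unaligned(b));
  return v128_sad_u8_sum(s);
}
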
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
new file mode 100644
index 000000000..3c669d579
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
@@ -0,0 +1,958 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
+
+typedef int64x2_t v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+ return v64_low_u32(vget_low_s64(a));
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+ return vcombine_s64((int64x1_t)b, (int64x1_t)a);
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+ return v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
+ vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
+ vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+ return c ? vreinterpretq_s64_s8(
+ vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
+ : b;
+#else
+ return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
+ v64_align(v128_high_v64(b), v128_low_v64(b), c))
+ : v128_from_v64(
+ v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
+ v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
+#endif
+}
+
+SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
+
+SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }
+
+SIMD_INLINE v128 v128_dup_8(uint8_t x) {
+ return vreinterpretq_s64_u8(vdupq_n_u8(x));
+}
+
+SIMD_INLINE v128 v128_dup_16(uint16_t x) {
+ return vreinterpretq_s64_u16(vdupq_n_u16(x));
+}
+
+SIMD_INLINE v128 v128_dup_32(uint32_t x) {
+ return vreinterpretq_s64_u32(vdupq_n_u32(x));
+}
+
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+ return vreinterpretq_s64_u64(vdupq_n_u64(x));
+}
+
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+ int16x8_t t1 = vmulq_s16(
+ vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
+ vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
+ int16x8_t t2 = vmulq_s16(
+ vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
+ vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
+#if defined(__aarch64__)
+ return vaddlvq_s16(t1) + vaddlvq_s16(t2);
+#else
+ int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
+ return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+#endif
+}
+
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
+ v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
+}
+
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ int64x2_t t = vpaddlq_s32(
+ vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+ return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+}
+
+SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
+#if defined(__aarch64__)
+ return vaddlvq_u8(vreinterpretq_u8_s64(x));
+#else
+ uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
+ return vget_lane_s32(
+ vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
+}
+
+SIMD_INLINE v128 v128_padd_s16(v128 a) {
+ return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
+}
+
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+ return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
+}
+
+typedef struct {
+ sad64_internal hi, lo;
+} sad128_internal;
+
+SIMD_INLINE sad128_internal v128_sad_u8_init() {
+ sad128_internal s;
+ s.hi = s.lo = vdupq_n_u16(0);
+ return s;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v128_sad_u8_sum().
+ The result for more than 32 v128_sad_u8() calls is undefined. */
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ sad128_internal r;
+ r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
+ r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
+ return r;
+}
+
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+#if defined(__aarch64__)
+ return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
+#else
+ uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
+ return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
+#endif
+}
+
+typedef struct {
+ ssd64_internal hi, lo;
+} ssd128_internal;
+
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
+ ssd128_internal s;
+ s.hi = s.lo = v64_ssd_u8_init();
+ return s;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ ssd128_internal r;
+ r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
+ r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
+ return r;
+}
+
+SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
+ return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
+}
+
+SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }
+
+SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }
+
+SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }
+
+SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }
+
+SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_s8(
+ vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
+ return vreinterpretq_s64_u32(
+ vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
+ return vreinterpretq_s64_u64(
+ vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_s8(
+ vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
+ return vreinterpretq_s64_s32(
+ vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }
+
+SIMD_INLINE v128 v128_abs_s16(v128 x) {
+ return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
+}
+
+SIMD_INLINE v128 v128_abs_s8(v128 x) {
+ return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
+}
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
+ return vreinterpretq_s64_s32(
+ vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
+}
+
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return vreinterpretq_s64_s16(
+ vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
+}
+
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_s16(vuzp2q_s16(
+ vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
+ vreinterpret_s16_s64(vget_low_s64(b)))),
+ vreinterpretq_s16_s32(
+ vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
+#else
+ return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
+ v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
+#endif
+}
+
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+ return vreinterpretq_s64_s32(
+ vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+}
+
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
+#if defined(__aarch64__)
+ int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
+ vreinterpret_s16_s64(vget_low_s64(b)));
+ int32x4_t t2 =
+ vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
+ return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
+#else
+ return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
+ v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
+#endif
+}
+
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+#if defined(__aarch64__)
+ int16x8_t t1 = vmulq_s16(
+ vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
+ vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
+ int16x8_t t2 = vmulq_s16(
+ vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
+ vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
+ return vreinterpretq_s64_s16(
+ vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
+#else
+ return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
+ v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
+#endif
+}
+
+SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_s8(
+ vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
+ a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
+#if defined(__aarch64__)
+ uint8x16_t m =
+ vandq_u8(vreinterpretq_u8_s64(a),
+ vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
+ return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
+#else
+ uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
+ vandq_u8(vreinterpretq_u8_s64(a),
+ vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
+ return v64_low_u32(
+ v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
+#endif
+}
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+ c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
+ return v128_or(v128_and(b, c), v128_andn(a, c));
+}
+
+SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_s8(
+ vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_s16(
+ vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
+ return vreinterpretq_s64_s32(
+ vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
+ return vreinterpretq_s64_s32(
+ vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u8(
+ vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
+ uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
+ return vreinterpretq_s64_u8(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u8(
+ vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
+ uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
+ return vreinterpretq_s64_u8(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
+ uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
+}
+
+SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u16(
+ vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
+ int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
+ return vreinterpretq_s64_s16(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u16(
+ vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
+ int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
+ return vreinterpretq_s64_s16(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
+ uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
+ return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
+}
+
+SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u32(
+ vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
+ int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
+ return vreinterpretq_s64_s32(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u32(
+ vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
+ int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
+ return vreinterpretq_s64_s32(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
+ uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
+ return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
+}
+
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
+ return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
+}
+
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
+ return v128_from_v64(vget_high_s64((int64x2_t)a),
+ vget_high_s64((int64x2_t)b));
+}
+
+SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u8(
+ vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
+ uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
+ return vreinterpretq_s64_u8(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u8(
+ vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
+ uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
+ return vreinterpretq_s64_u8(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u16(
+ vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
+ uint16x8x2_t r =
+ vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
+ return vreinterpretq_s64_u16(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u16(
+ vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
+ uint16x8x2_t r =
+ vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
+ return vreinterpretq_s64_u16(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u32(
+ vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
+ uint32x4x2_t r =
+ vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
+ return vreinterpretq_s64_u32(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u32(
+ vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
+ uint32x4x2_t r =
+ vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
+ return vreinterpretq_s64_u32(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
+ return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
+}
+
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
+ return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a)));
+}
+
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))));
+}
+
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return v128_from_v64(
+ vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
+ vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
+}
+
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+ return v128_from_v64(
+ vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
+ vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
+}
+
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return v128_from_v64(
+ vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
+ vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
+}
+
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return v128_from_v64(
+ vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
+ vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
+}
+
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
+ return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
+}
+
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
+ return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
+}
+
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return vreinterpretq_s64_u32(
+ vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return vreinterpretq_s64_s32(
+ vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return vreinterpretq_s64_u32(
+ vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
+}
+
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return vreinterpretq_s64_s32(
+ vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
+}
+
+SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
+#if defined(__aarch64__)
+ return vreinterpretq_s64_u8(
+ vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
+#else
+ uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
+ vget_high_u8(vreinterpretq_u8_s64(x)) } };
+ return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
+ p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
+ (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
+ p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
+#endif
+}
+
+SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
+ return vreinterpretq_s64_u8(
+ vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
+ return vreinterpretq_s64_u16(
+ vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
+ return vreinterpretq_s64_u32(
+ vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
+ return vreinterpretq_s64_u32(
+ vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
+ return vreinterpretq_s64_u32(
+ vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return (c > 7) ? v128_zero()
+ : vreinterpretq_s64_u8(
+ vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return (c > 7) ? v128_zero()
+ : vreinterpretq_s64_u8(
+ vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ return (c > 7) ? v128_ones()
+ : vreinterpretq_s64_s8(
+ vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
+}
+
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return (c > 15) ? v128_zero()
+ : vreinterpretq_s64_u16(
+ vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return (c > 15) ? v128_zero()
+ : vreinterpretq_s64_u16(
+ vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return (c > 15) ? v128_ones()
+ : vreinterpretq_s64_s16(
+ vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c)));
+}
+
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return (c > 31) ? v128_zero()
+ : vreinterpretq_s64_u32(
+ vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return (c > 31) ? v128_zero()
+ : vreinterpretq_s64_u32(
+ vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return (c > 31) ? v128_ones()
+ : vreinterpretq_s64_s32(
+ vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
+}
+
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+ return (c > 63) ? v128_zero()
+ : vreinterpretq_s64_u64(
+ vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+ return (c > 63) ? v128_zero()
+ : vreinterpretq_s64_u64(
+ vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+ return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c));
+}
+
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+
+SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
+ return n < 8
+ ? v128_from_64(
+ (uint64_t)vorr_u64(
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ n * 8),
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ (8 - n) * 8)),
+ (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
+ n * 8))
+ : (n == 8 ? v128_from_64(
+ (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
+ : v128_from_64((uint64_t)vshl_n_u64(
+ vreinterpret_u64_s64(vget_low_s64(a)),
+ (n - 8) * 8),
+ 0));
+}
+
+SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
+ return n < 8
+ ? v128_from_64(
+ (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ n * 8),
+ (uint64_t)vorr_u64(
+ vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
+ vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
+ (8 - n) * 8)))
+ : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
+ vget_high_s64(a)))
+ : v128_from_64(
+ 0, (uint64_t)vshr_n_u64(
+ vreinterpret_u64_s64(vget_high_s64(a)),
+ (n - 8) * 8)));
+}
+
+SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
+ return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
+ return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
+ return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
+ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
+ return vshrq_n_s64(a, c);
+}
+
+#else
+
+SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
+ if (n < 8)
+ return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
+ v64_shr_n_byte(v128_low_v64(a), 8 - n)),
+ v64_shl_n_byte(v128_low_v64(a), n));
+ else
+ return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
+}
+
+SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
+ if (n < 8)
+ return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
+ v64_or(v64_shr_n_byte(v128_low_v64(a), n),
+ v64_shl_n_byte(v128_high_v64(a), 8 - n)));
+ else
+ return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
+}
+
+SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
+ return v128_shl_8(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
+ return v128_shr_u8(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
+ return v128_shr_s8(a, c);
+}
+
+SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
+ return v128_shl_16(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
+ return v128_shr_u16(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
+ return v128_shr_s16(a, c);
+}
+
+SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
+ return v128_shl_32(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
+ return v128_shr_u32(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
+ return v128_shr_s32(a, c);
+}
+
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
+ return v128_shl_64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
+ return v128_shr_u64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
+ return v128_shr_s64(a, c);
+}
+
+#endif
+
+typedef uint32x4_t sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+ v128 b) {
+ return vaddq_u32(
+ s, vpaddlq_u16(vsubq_u16(
+ vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
+ vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+ uint64x2_t t = vpaddlq_u32(s);
+ return (uint32_t)(uint64_t)vget_high_u64(t) +
+ (uint32_t)(uint64_t)vget_low_u64(t);
+}
+
+typedef v128 ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+ v128 b) {
+ v128 d = v128_sub_16(a, b);
+ d = v128_madd_s16(d, d);
+ return v128_add_64(
+ s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+ return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
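+
+/* Illustrative usage sketch (an assumption, not part of the upstream header):
+ * the accumulators above follow an init -> accumulate -> sum pattern, e.g.
+ *
+ *   sad128_internal_u16 acc = v128_sad_u16_init();
+ *   for (int i = 0; i < n; i += 8)
+ *     acc = v128_sad_u16(acc, v128_load_aligned(src + i),
+ *                        v128_load_aligned(ref + i));
+ *   uint32_t sad = v128_sad_u16_sum(acc);
+ *
+ * where src and ref are hypothetical 16-byte aligned uint16_t buffers of n
+ * elements, n being a multiple of 8.
+ */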
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
new file mode 100644
index 000000000..bbe9a9d28
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
@@ -0,0 +1,888 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+typedef union {
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+ uint64_t u64[2];
+ int8_t s8[16];
+ int16_t s16[8];
+ int32_t s32[4];
+ int64_t s64[2];
+ c_v64 v64[2];
+} c_v128;
+
+SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
+
+SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
+
+SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
+ c_v128 t;
+ t.u64[1] = hi;
+ t.u64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
+ c_v128 t;
+ t.v64[1] = hi;
+ t.v64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
+ uint32_t d) {
+ c_v128 t;
+ t.u32[3] = a;
+ t.u32[2] = b;
+ t.u32[1] = c;
+ t.u32[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
+ c_v128 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 16; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
+ abort();
+ }
+ return c_v128_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 16; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
+ abort();
+ }
+ c_v128_store_unaligned(p, a);
+}
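+
+/* Usage note (illustrative sketch, not upstream documentation): the *_aligned
+ * variants require 16-byte aligned pointers and abort under SIMD_CHECK
+ * otherwise, so callers typically use aligned storage, e.g.
+ *
+ *   uint8_t buf[16] __attribute__((aligned(16)));  // hypothetical buffer
+ *   c_v128_store_aligned(buf, c_v128_zero());
+ *   c_v128 x = c_v128_load_aligned(buf);
+ */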
+
+SIMD_INLINE c_v128 c_v128_zero() {
+ c_v128 t;
+ t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
+ c_v128 t;
+ t.u64[1] = t.u64[0] = x;
+ return t;
+}
+
+SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
+ return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
+ c_v64_dotp_su8(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
+ return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
+ c_v64_dotp_s16(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
+ // 32 bit products, 64 bit sum
+ return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
+ (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
+ (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
+ (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
+}
+
+SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
+ return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
+}
+
+typedef uint32_t c_sad128_internal;
+
+SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ v128_sad_u8_sum().
+ The result for more than 32 v128_sad_u8() calls is undefined. */
+SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
+ c_v128 b) {
+ int c;
+ for (c = 0; c < 16; c++)
+ s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
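+
+/* Illustrative sketch (an assumption drawn from the comment above, not
+ * upstream documentation): long runs can flush the accumulator into a wider
+ * total before the 32-call limit is reached, e.g.
+ *
+ *   uint32_t total = 0;
+ *   c_sad128_internal s = c_v128_sad_u8_init();
+ *   for (int i = 0; i < n; i++) {
+ *     s = c_v128_sad_u8(s, c_v128_load_unaligned(a + 16 * i),
+ *                       c_v128_load_unaligned(b + 16 * i));
+ *     if ((i & 31) == 31) {
+ *       total += c_v128_sad_u8_sum(s);
+ *       s = c_v128_sad_u8_init();
+ *     }
+ *   }
+ *   total += c_v128_sad_u8_sum(s);
+ *
+ * with a and b hypothetical uint8_t pointers covering 16 * n bytes.
+ */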
+
+typedef uint32_t c_ssd128_internal;
+
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
+ c_v128 b) {
+ int c;
+ for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }
+
+SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
+ c_v64_or(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
+ c_v64_xor(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
+ c_v64_and(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
+ c_v64_andn(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
+ c_v64_add_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
+ c_v64_add_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
+ c_v64_sadd_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
+ c_v64_sadd_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
+ c_v64_sadd_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
+ c_v64_add_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
+ // Two's complement overflow (silences sanitizers)
+ return c_v128_from_64(
+ a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
+ : a.v64[1].u64 + b.v64[1].u64,
+ a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
+ : a.v64[0].u64 + b.v64[0].u64);
+}
+
+SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
+ c_v128 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
+ c_v128 t;
+ t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
+ t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
+ t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
+ t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
+ t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
+ t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
+ t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
+ t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
+ c_v64_sub_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
+ c_v64_ssub_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
+ c_v64_ssub_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
+ c_v64_sub_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
+ c_v64_ssub_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
+ c_v64_ssub_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
+ c_v64_sub_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
+ // Two's complement underflow (silences sanitizers)
+ return c_v128_from_64(
+ a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
+ : a.v64[1].u64 - b.v64[1].u64,
+ a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
+ : a.v64[0].u64 - b.v64[0].u64);
+}
+
+SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
+ c_v64 lo_bits = c_v64_mullo_s16(a, b);
+ c_v64 hi_bits = c_v64_mulhi_s16(a, b);
+ return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
+ c_v64_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
+ c_v64_mullo_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
+ c_v64_mulhi_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
+ c_v64_mullo_s32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
+ c_v64_madd_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
+ c_v64_madd_us8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
+ c_v64_avg_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
+ c_v64_rdavg_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
+ c_v64_rdavg_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
+ c_v64_avg_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
+ c_v64_min_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
+ c_v64_max_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
+ c_v64_min_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
+ return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+ ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+ ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+ ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+ ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+ ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+ ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
+ c_v128 t;
+ for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
+ c_v64_max_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
+ c_v64_min_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
+ c_v64_max_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
+ c_v64_ziplo_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
+ c_v64_ziplo_8(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
+ c_v64_ziplo_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
+ c_v64_ziplo_16(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
+ c_v64_ziplo_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
+ c_v64_ziplo_32(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(a.v64[1], b.v64[1]);
+}
+
+SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
+ c_v128 t;
+ if (mode) {
+ t.u8[15] = b.u8[15];
+ t.u8[14] = b.u8[13];
+ t.u8[13] = b.u8[11];
+ t.u8[12] = b.u8[9];
+ t.u8[11] = b.u8[7];
+ t.u8[10] = b.u8[5];
+ t.u8[9] = b.u8[3];
+ t.u8[8] = b.u8[1];
+ t.u8[7] = a.u8[15];
+ t.u8[6] = a.u8[13];
+ t.u8[5] = a.u8[11];
+ t.u8[4] = a.u8[9];
+ t.u8[3] = a.u8[7];
+ t.u8[2] = a.u8[5];
+ t.u8[1] = a.u8[3];
+ t.u8[0] = a.u8[1];
+ } else {
+ t.u8[15] = a.u8[14];
+ t.u8[14] = a.u8[12];
+ t.u8[13] = a.u8[10];
+ t.u8[12] = a.u8[8];
+ t.u8[11] = a.u8[6];
+ t.u8[10] = a.u8[4];
+ t.u8[9] = a.u8[2];
+ t.u8[8] = a.u8[0];
+ t.u8[7] = b.u8[14];
+ t.u8[6] = b.u8[12];
+ t.u8[5] = b.u8[10];
+ t.u8[4] = b.u8[8];
+ t.u8[3] = b.u8[6];
+ t.u8[2] = b.u8[4];
+ t.u8[1] = b.u8[2];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
+ : _c_v128_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
+ : _c_v128_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
+ c_v128 t;
+ if (mode) {
+ t.u16[7] = b.u16[7];
+ t.u16[6] = b.u16[5];
+ t.u16[5] = b.u16[3];
+ t.u16[4] = b.u16[1];
+ t.u16[3] = a.u16[7];
+ t.u16[2] = a.u16[5];
+ t.u16[1] = a.u16[3];
+ t.u16[0] = a.u16[1];
+ } else {
+ t.u16[7] = a.u16[6];
+ t.u16[6] = a.u16[4];
+ t.u16[5] = a.u16[2];
+ t.u16[4] = a.u16[0];
+ t.u16[3] = b.u16[6];
+ t.u16[2] = b.u16[4];
+ t.u16[1] = b.u16[2];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
+ : _c_v128_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
+ : _c_v128_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
+ c_v128 t;
+ if (mode) {
+ t.u32[3] = b.u32[3];
+ t.u32[2] = b.u32[1];
+ t.u32[1] = a.u32[3];
+ t.u32[0] = a.u32[1];
+ } else {
+ t.u32[3] = a.u32[2];
+ t.u32[2] = a.u32[0];
+ t.u32[1] = b.u32[2];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
+ : _c_v128_unzip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
+ : _c_v128_unzip_32(b, a, 1);
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
+ c_v64_unpacklo_u8_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
+ c_v64_unpacklo_u8_s16(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
+ c_v64_unpacklo_s8_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
+ c_v64_unpacklo_s8_s16(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
+ c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
+ c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
+ c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
+ c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
+ c_v64_unpacklo_u16_s32(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
+ c_v64_unpacklo_s16_s32(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
+ c_v64_unpacklo_u16_s32(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
+ c_v64_unpacklo_s16_s32(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 16; c++)
+ t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
+ : pattern.u8[c] & 15];
+
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
+ c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
+ c_v64_cmplt_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
+ c_v64_cmpeq_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
+ c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
+ c_v64_cmplt_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
+ c_v64_cmpeq_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
+ if (n < 8)
+ return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
+ c_v64_shr_n_byte(a.v64[0], 8 - n)),
+ c_v64_shl_n_byte(a.v64[0], n));
+ else
+ return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
+ if (n < 8)
+ return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
+ c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
+ c_v64_shl_n_byte(a.v64[1], 8 - n)));
+ else
+ return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
+}
+
+SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
+ if (SIMD_CHECK && c > 15) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
+ : b;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
+ c_v64_shr_u16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
+ c_v64_shr_s16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
+ c_v64_shr_u32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
+ c_v64_shr_s32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
+ a.v64[1].u64 <<= c;
+ a.v64[0].u64 <<= c;
+ return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
+ a.v64[1].u64 >>= c;
+ a.v64[0].u64 >>= c;
+ return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
+ a.v64[1].s64 >>= c;
+ a.v64[0].s64 >>= c;
+ return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
+ return c_v128_shl_8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
+ return c_v128_shl_16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
+ return c_v128_shl_32(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
+ return c_v128_shl_64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
+ return c_v128_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
+ return c_v128_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
+ return c_v128_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
+ return c_v128_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
+ return c_v128_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
+ return c_v128_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
+ return c_v128_shr_s32(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
+ return c_v128_shr_s64(a, n);
+}
+
+typedef uint32_t c_sad128_internal_u16;
+
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
+ c_v128 a, c_v128 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd128_internal_s16;
+
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
+ c_v128 a, c_v128 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+ (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+ return s;
+}
+
+SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
new file mode 100644
index 000000000..6c7241ff4
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+
+#include <stdint.h>
+#include "aom_dsp/simd/v64_intrinsics_x86.h"
+
+typedef __m128i v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) {
+ return _mm_unpacklo_epi64(a, v64_zero());
+}
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
+ return _mm_unpacklo_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+ return v128_from_v64(v64_from_64(a), v64_from_64(b));
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return _mm_set_epi32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return _mm_load_si128((__m128i *)p);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+#if defined(__SSSE3__)
+ return (__m128i)_mm_lddqu_si128((__m128i *)p);
+#else
+ return _mm_loadu_si128((__m128i *)p);
+#endif
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ _mm_store_si128((__m128i *)p, a);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ _mm_storeu_si128((__m128i *)p, a);
+}
+
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#if defined(__SSSE3__)
+SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
+ return c ? _mm_alignr_epi8(a, b, c) : b;
+}
+#else
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#else
+#if defined(__SSSE3__)
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
+#else
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#endif
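+
+/* Usage note (illustrative): given the paths above, the byte count passed to
+ * v128_align() is expected to be a compile-time constant in 0..15, e.g.
+ *
+ *   v128 r = v128_align(a, b, 4);  // 16 bytes starting at byte 4 of the
+ *                                  // 256-bit pair with b in the low half
+ */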
+
+SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }
+
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
+
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
+
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
+
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+ // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
+ return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x);
+}
+
+SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
+
+SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
+
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
+
+SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
+
+SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
+
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
+
+SIMD_INLINE v128 v128_padd_s16(v128 a) {
+ return _mm_madd_epi16(a, _mm_set1_epi16(1));
+}
+
+SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }
+
+SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }
+
+SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }
+
+SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
+
+SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
+
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
+
+SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
+
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
+
+SIMD_INLINE v128 v128_abs_s16(v128 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi16(a);
+#else
+ return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
+#endif
+}
+
+SIMD_INLINE v128 v128_abs_s8(v128 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
+ return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
+#endif
+}
+
+SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
+ return _mm_unpacklo_epi8(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
+ return _mm_unpackhi_epi8(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
+ return _mm_unpacklo_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
+ return _mm_unpackhi_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
+ return _mm_unpacklo_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
+ return _mm_unpackhi_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
+ return _mm_unpacklo_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
+ return _mm_unpackhi_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
+
+SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
+
+SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
+
+SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
+ return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
+}
+
+SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
+#if defined(__SSSE3__)
+#ifdef __x86_64__
+ v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
+#else
+ v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
+#endif
+ return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
+ _mm_shuffle_epi8(a, order));
+#else
+ return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
+ return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
+}
+
+SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
+#if defined(__SSSE3__)
+#ifdef __x86_64__
+ v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
+#else
+ v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
+#endif
+ return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
+ _mm_shuffle_epi8(a, order));
+#else
+ return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
+ return _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
+}
+
+SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
+ return _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
+}
+
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return _mm_unpackhi_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return _mm_packs_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_packus_epi32(b, a);
+#else
+ return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
+ v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
+#endif
+}
+
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return _mm_packus_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return _mm_packs_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return _mm_unpackhi_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(x, pattern);
+#else
+ v128 output;
+ unsigned char *input = (unsigned char *)&x;
+ unsigned char *index = (unsigned char *)&pattern;
+ char *selected = (char *)&output;
+ int counter;
+
+ for (counter = 0; counter < 16; counter++) {
+ selected[counter] = input[index[counter] & 15];
+ }
+
+ return output;
+#endif
+}
+
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+ v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
+ v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
+ v128 t = v128_add_32(t1, t2);
+ t = v128_add_32(t, _mm_srli_si128(t, 8));
+ t = v128_add_32(t, _mm_srli_si128(t, 4));
+ return (int32_t)v128_low_u32(t);
+}
+
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ v128 r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
+ _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
+ return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
+ v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
+ return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
+}
+
+typedef v128 sad128_internal;
+
+SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ v128_sad_u8_sum().
+ The result for more than 32 v128_sad_u8() calls is undefined. */
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+ return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
+}
+
+typedef int32_t ssd128_internal;
+
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ v128 z = _mm_setzero_si128();
+ v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
+ v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
+ v128 rl = _mm_madd_epi16(l, l);
+ v128 rh = _mm_madd_epi16(h, h);
+ v128 r = _mm_add_epi32(rl, rh);
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
+ return s + _mm_cvtsi128_si32(r);
+}
+
+SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
+
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
+
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
+
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
+ v64 lo_bits = v64_mullo_s16(a, b);
+ v64 hi_bits = v64_mulhi_s16(a, b);
+ return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
+ v64_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return _mm_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return _mm_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
+ _mm_shuffle_epi32(
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
+#endif
+}
+
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ v128 r = v128_mullo_s32(a, b);
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+}
+
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
+
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+#if defined(__SSSE3__)
+ return _mm_maddubs_epi16(a, b);
+#else
+ return _mm_packs_epi32(
+ _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
+ _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
+#endif
+}
+
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+ return v128_madd_us8(a, _mm_set1_epi8(1));
+}
+
+SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
+
+SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
+ return _mm_sub_epi8(_mm_avg_epu8(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
+}
+
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+ return _mm_sub_epi16(_mm_avg_epu16(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
+}
+
+SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
+
+SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
+
+SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }
+
+SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi8(a, b);
+#else
+ v128 mask = _mm_cmplt_epi8(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+#if defined(__SSE4_1__)
+ return _mm_blendv_epi8(a, b, c);
+#else
+ c = _mm_cmplt_epi8(c, v128_zero());
+ return v128_or(v128_and(b, c), v128_andn(a, c));
+#endif
+}
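+
+/* Illustrative note (assumption): the mask selects per byte on its sign bit,
+ * so for example
+ *
+ *   v128 m = v128_cmpgt_s8(b, a);    // all-ones bytes where b > a
+ *   v128 r = v128_blend_8(a, b, m);  // equivalent to v128_max_s8(a, b)
+ */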
+
+SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi8(a, b);
+#else
+ v128 mask = _mm_cmplt_epi8(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
+
+SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
+
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi32(a, b);
+#else
+ v128 mask = _mm_cmplt_epi32(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi32(a, b);
+#else
+ v128 mask = _mm_cmplt_epi32(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
+ return _mm_cmpgt_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
+ return _mm_cmplt_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+ return _mm_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+ return _mm_cmplt_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
+
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+ _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ __m128i x = _mm_cvtsi32_si128(c + 8);
+ return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
+ _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
+}
+
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+ return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+ return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+ // There is no packed 64-bit arithmetic right shift (_mm_sra_epi64) before
+ // AVX-512, so emulate it with scalar shifts on the two halves.
+ return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
+ (int64_t)v64_u64(v128_low_v64(a)) >> c);
+ // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
+#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
+#define v128_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v128_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v128_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
+ _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
+#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
+#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
+#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
+#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
+#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
+#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
+#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
+#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
+#define v128_shr_n_s64(a, c) \
+  v128_shr_s64(a, c) // _mm_srai_epi64 also needs AVX-512VL
+
+typedef v128 sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+ v128 b) {
+#if defined(__SSE4_1__)
+ v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
+#else
+ v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
+ v128_xor(b, v128_dup_16(32768)));
+ t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
+ v128_or(v128_and(a, t), v128_andn(b, t)));
+#endif
+ return v128_add_32(
+ s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+ return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
+ v128_low_u32(v128_shr_n_byte(s, 8)) +
+ v128_low_u32(v128_shr_n_byte(s, 12));
+}
+
+typedef v128 ssd128_internal_s16;
+
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+ v128 b) {
+ v128 d = v128_sub_16(a, b);
+ d = v128_madd_s16(d, d);
+ return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
+ _mm_unpacklo_epi32(d, v128_zero())));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+ return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
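
The two-step accumulator pattern declared above (an _init() call, repeated
accumulation calls, then a _sum() finalisation) and the immediate-only
v128_*_n_* shift macros are easiest to see from the caller's side. The sketch
below is illustrative only and not part of the patch; the buffer names and
block count are hypothetical, and it assumes the generic dispatch header
shipped in this same directory.

#include <stdint.h>
#include "aom_dsp/simd/v128_intrinsics.h"

/* Illustrative sketch (not part of the patch): sum of absolute differences
   over n_blocks groups of eight uint16_t lanes.  The accumulator is opaque
   until it is finalised with v128_sad_u16_sum(). */
static uint32_t example_sad_u16(const uint16_t *src, const uint16_t *ref,
                                int n_blocks) {
  sad128_internal_u16 acc = v128_sad_u16_init();
  for (int i = 0; i < n_blocks; i++)
    acc = v128_sad_u16(acc, v128_load_aligned(src + i * 8),
                       v128_load_aligned(ref + i * 8));
  return v128_sad_u16_sum(acc);
}

/* The _n_ shift forms are macros because the underlying instructions take an
   immediate: the count must be a compile-time constant.  The plain forms
   accept a runtime count. */
static v128 example_shift_u16(v128 a, unsigned int runtime_c) {
  v128 constant_shift = v128_shr_n_u16(a, 1); /* literal count required */
  return v128_add_16(constant_shift, v128_shr_u16(a, runtime_c));
}
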
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
new file mode 100644
index 000000000..cb99d35b7
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v256 v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
+SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ return c_v256_from_v128(hi, lo);
+}
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return c_v256_from_64(a, b, c, d);
+}
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return c_v256_from_v64(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return c_v256_load_unaligned(p);
+}
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return c_v256_load_aligned(p);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ c_v256_store_unaligned(p, a);
+}
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ c_v256_store_aligned(p, a);
+}
+
+SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) {
+ return c_v256_align(a, b, c);
+}
+
+SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
+
+typedef uint32_t sad256_internal;
+SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ return c_v256_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ return c_v256_sad_u8_sum(s);
+}
+typedef uint32_t ssd256_internal;
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ return c_v256_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ return c_v256_ssd_u8_sum(s);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+ return c_v256_dotp_su8(a, b);
+}
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return c_v256_dotp_s16(a, b);
+}
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ return c_v256_dotp_s32(a, b);
+}
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); }
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); }
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); }
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); }
+SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); }
+SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); }
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return c_v256_mullo_s16(a, b);
+}
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return c_v256_mulhi_s16(a, b);
+}
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return c_v256_mullo_s32(a, b);
+}
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); }
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+ return c_v256_blend_8(a, b, c);
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+ return c_v256_rdavg_u16(a, b);
+}
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); }
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); }
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return c_v256_ziplo_128(a, b);
+}
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return c_v256_ziphi_128(a, b);
+}
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return c_v256_unziplo_8(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return c_v256_unziphi_8(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return c_v256_unziplo_16(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return c_v256_unziphi_16(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return c_v256_unziplo_32(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return c_v256_unziphi_32(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+ return c_v256_unziplo_64(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+ return c_v256_unziphi_64(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return c_v256_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return c_v256_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return c_v256_unpacklo_s8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return c_v256_unpackhi_s8_s16(a);
+}
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return c_v256_pack_s32_s16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+ return c_v256_pack_s32_u16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return c_v256_pack_s16_u8(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return c_v256_pack_s16_s8(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return c_v256_unpack_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return c_v256_unpack_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return c_v256_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return c_v256_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return c_v256_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return c_v256_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ return c_v256_shuffle_8(a, pattern);
+}
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+ return c_v256_wideshuffle_8(a, b, pattern);
+}
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return c_v256_pshuffle_8(a, pattern);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return c_v256_cmpgt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return c_v256_cmplt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); }
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+ return c_v256_cmpgt_s32(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+ return c_v256_cmplt_s32(a, b);
+}
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return c_v256_shl_8(a, c);
+}
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return c_v256_shr_u8(a, c);
+}
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ return c_v256_shr_s8(a, c);
+}
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return c_v256_shl_16(a, c);
+}
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return c_v256_shr_u16(a, c);
+}
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return c_v256_shr_s16(a, c);
+}
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return c_v256_shl_32(a, c);
+}
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return c_v256_shr_u32(a, c);
+}
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return c_v256_shr_s32(a, c);
+}
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+ return c_v256_shl_64(a, c);
+}
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+ return c_v256_shr_u64(a, c);
+}
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+ return c_v256_shr_s64(a, c);
+}
+
+SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) {
+ return c_v256_shr_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) {
+ return c_v256_shl_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) {
+ return c_v256_shl_n_8(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) {
+ return c_v256_shl_n_16(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
+ return c_v256_shl_n_32(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) {
+ return c_v256_shl_n_64(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
+ return c_v256_shr_n_u8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) {
+ return c_v256_shr_n_u16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
+ return c_v256_shr_n_u32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) {
+ return c_v256_shr_n_u64(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
+ return c_v256_shr_n_s8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) {
+ return c_v256_shr_n_s16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
+ return c_v256_shr_n_s32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) {
+ return c_v256_shr_n_s64(a, n);
+}
+
+SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) {
+ return c_v256_shr_n_word(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) {
+ return c_v256_shl_n_word(a, n);
+}
+
+typedef uint32_t sad256_internal_u16;
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+ return c_v256_sad_u16_init();
+}
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+ v256 b) {
+ return c_v256_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+ return c_v256_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd256_internal_s16;
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+ return c_v256_ssd_s16_init();
+}
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+ v256 b) {
+ return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+ return c_v256_ssd_s16_sum(s);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
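
Since this header only forwards every v256_* call to the plain-C c_v256_*
implementation, caller code written against the v256 API is the same whether
or not a vectorised backend gets selected at build time. A minimal,
hypothetical caller of the 8-bit SAD accumulator might look like the sketch
below; the buffer names, strides and block height are assumptions, not taken
from the patch, and the row count stays within the documented limit of 16
accumulation calls per accumulator.

#include <stdint.h>
#include "aom_dsp/simd/v256_intrinsics.h"

/* Illustrative sketch (not part of the patch): SAD of a 32x16 pixel block
   using the generic 256-bit API.  One v256 load covers a full 32-byte row. */
static uint32_t example_block_sad_u8(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride) {
  sad256_internal acc = v256_sad_u8_init();
  for (int row = 0; row < 16; row++)
    acc = v256_sad_u8(acc, v256_load_unaligned(src + row * src_stride),
                      v256_load_unaligned(ref + row * ref_stride));
  return v256_sad_u8_sum(acc);
}
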
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
new file mode 100644
index 000000000..bd86ea172
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
+
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
new file mode 100644
index 000000000..a1c08e95a
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+
+typedef union {
+ uint8_t u8[32];
+ uint16_t u16[16];
+ uint32_t u32[8];
+ uint64_t u64[4];
+ int8_t s8[32];
+ int16_t s16[16];
+ int32_t s32[8];
+ int64_t s64[4];
+ c_v64 v64[4];
+ c_v128 v128[2];
+} c_v256;
+
+SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
+
+SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
+
+SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
+
+SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
+
+SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
+ c_v256 t;
+ t.v128[1] = hi;
+ t.v128[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
+ uint64_t d) {
+ c_v256 t;
+ t.u64[3] = a;
+ t.u64[2] = b;
+ t.u64[1] = c;
+ t.u64[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
+ c_v256 t;
+ t.u64[3] = a.u64;
+ t.u64[2] = b.u64;
+ t.u64[1] = c.u64;
+ t.u64[0] = d.u64;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
+ c_v256 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 32; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 31) {
+ fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
+ abort();
+ }
+ return c_v256_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 32; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 31) {
+ fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
+ abort();
+ }
+ c_v256_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v256 c_v256_zero() {
+ c_v256 t;
+ t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
+ c_v256 t;
+ t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
+ return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
+ return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
+ c_v128_dotp_su8(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
+ return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
+ c_v128_dotp_s16(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+ return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
+ c_v128_dotp_s32(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
+ return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
+}
+
+typedef uint32_t c_sad256_internal;
+
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+ c_v256 b) {
+ int c;
+ for (c = 0; c < 32; c++)
+ s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
+
+typedef uint32_t c_ssd256_internal;
+
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
+ c_v256 b) {
+ int c;
+ for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
+
+SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
+ c_v128_or(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
+ c_v128_xor(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
+ c_v128_and(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
+ c_v128_andn(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
+ c_v128_add_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
+ c_v128_add_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
+ c_v128_sadd_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
+ c_v128_sadd_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
+ c_v128_sadd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
+ c_v128_add_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
+ c_v128_add_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
+ c_v128_sub_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
+ c_v256 t;
+ for (int i = 0; i < 16; i++)
+ t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
+ c_v256 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
+ t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
+ t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
+ t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
+ c_v128_sub_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
+ c_v128_ssub_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
+ c_v128_ssub_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
+ c_v128_sub_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
+ c_v128_ssub_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
+ c_v128_ssub_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
+ c_v128_sub_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
+ c_v128 lo_bits = c_v128_mullo_s16(a, b);
+ c_v128 hi_bits = c_v128_mulhi_s16(a, b);
+ return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
+ c_v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
+ c_v128_mullo_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
+ c_v128_mulhi_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
+ c_v128_mullo_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
+ c_v128_madd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
+ c_v128_madd_us8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
+ c_v128_avg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
+ c_v128_rdavg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
+ c_v128_rdavg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
+ c_v128_avg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
+ c_v128_min_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
+ c_v128_max_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
+ c_v128_min_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
+ return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
+ ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
+ ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
+ ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
+ ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
+ ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
+ ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
+ ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
+ ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+ ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+ ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+ ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+ ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+ ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+ ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
+ c_v256 t;
+ for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
+ c_v128_max_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
+ c_v128_min_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
+ c_v128_max_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
+ c_v128_min_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
+ c_v128_max_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
+ c_v128_ziplo_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
+ c_v128_ziplo_8(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
+ c_v128_ziplo_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
+ c_v128_ziplo_16(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
+ c_v128_ziplo_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
+ c_v128_ziplo_32(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
+ c_v128_ziplo_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
+ c_v128_ziplo_64(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[1], b.v128[1]);
+}
+
+SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ int i;
+ if (mode) {
+ for (i = 0; i < 16; i++) {
+ t.u8[i] = a.u8[i * 2 + 1];
+ t.u8[i + 16] = b.u8[i * 2 + 1];
+ }
+ } else {
+ for (i = 0; i < 16; i++) {
+ t.u8[i] = b.u8[i * 2];
+ t.u8[i + 16] = a.u8[i * 2];
+ }
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
+ : _c_v256_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
+ : _c_v256_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ int i;
+ if (mode) {
+ for (i = 0; i < 8; i++) {
+ t.u16[i] = a.u16[i * 2 + 1];
+ t.u16[i + 8] = b.u16[i * 2 + 1];
+ }
+ } else {
+ for (i = 0; i < 8; i++) {
+ t.u16[i] = b.u16[i * 2];
+ t.u16[i + 8] = a.u16[i * 2];
+ }
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
+ : _c_v256_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
+ : _c_v256_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ if (mode) {
+ t.u32[7] = b.u32[7];
+ t.u32[6] = b.u32[5];
+ t.u32[5] = b.u32[3];
+ t.u32[4] = b.u32[1];
+ t.u32[3] = a.u32[7];
+ t.u32[2] = a.u32[5];
+ t.u32[1] = a.u32[3];
+ t.u32[0] = a.u32[1];
+ } else {
+ t.u32[7] = a.u32[6];
+ t.u32[6] = a.u32[4];
+ t.u32[5] = a.u32[2];
+ t.u32[4] = a.u32[0];
+ t.u32[3] = b.u32[6];
+ t.u32[2] = b.u32[4];
+ t.u32[1] = b.u32[2];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
+ : _c_v256_unzip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
+ : _c_v256_unzip_32(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ if (mode) {
+ t.u64[3] = b.u64[3];
+ t.u64[2] = b.u64[1];
+ t.u64[1] = a.u64[3];
+ t.u64[0] = a.u64[1];
+ } else {
+ t.u64[3] = a.u64[2];
+ t.u64[2] = a.u64[0];
+ t.u64[1] = b.u64[2];
+ t.u64[0] = b.u64[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
+ : _c_v256_unzip_64(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
+ : _c_v256_unzip_64(b, a, 1);
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
+ c_v128_unpacklo_u8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
+ c_v128_unpacklo_u8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
+ c_v128_unpacklo_s8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
+ c_v128_unpacklo_s8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
+ c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
+ c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
+ c_v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
+ c_v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
+ c_v128_unpacklo_u16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
+ c_v128_unpacklo_s16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
+ c_v128_unpacklo_u16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
+ c_v128_unpacklo_s16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
+ c_v256 t;
+ int c;
+ for (c = 0; c < 32; c++)
+ t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+ : pattern.u8[c] & 31];
+
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
+ c_v256 t;
+ int c;
+ for (c = 0; c < 32; c++)
+ t.u8[c] = (pattern.u8[c] < 32
+ ? b.u8
+ : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+ : pattern.u8[c] & 31];
+ return t;
+}
+
+// Pairwise / dual-lane shuffle: shuffle two 128-bit lanes.
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
+ return c_v256_from_v128(
+ c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
+ c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
+ if (n < 16)
+ return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
+ c_v128_shr_n_byte(a.v128[0], 16 - n)),
+ c_v128_shl_n_byte(a.v128[0], n));
+ else if (n > 16)
+ return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
+ c_v128_zero());
+ else
+ return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
+ if (n < 16)
+ return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
+ c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
+ c_v128_shl_n_byte(a.v128[1], 16 - n)));
+ else if (n > 16)
+ return c_v256_from_v128(c_v128_zero(),
+ c_v128_shr_n_byte(a.v128[1], n - 16));
+ else
+ return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
+}
+
+SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) {
+ if (SIMD_CHECK && c > 31) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
+ : b;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
+ c_v128_shl_8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
+ c_v128_shr_u8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
+ c_v128_shr_s8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
+ c_v128_shl_16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
+ c_v128_shr_u16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
+ c_v128_shr_s16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
+ c_v128_shl_32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
+ c_v128_shr_u32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
+ c_v128_shr_s32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
+ c_v256 t;
+ if (SIMD_CHECK && n > 63) {
+ fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+ abort();
+ }
+ t.s64[3] = a.s64[3] >> n;
+ t.s64[2] = a.s64[2] >> n;
+ t.s64[1] = a.s64[1] >> n;
+ t.s64[0] = a.s64[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
+ c_v256 t;
+ if (SIMD_CHECK && n > 63) {
+ fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+ abort();
+ }
+ t.u64[3] = a.u64[3] >> n;
+ t.u64[2] = a.u64[2] >> n;
+ t.u64[1] = a.u64[1] >> n;
+ t.u64[0] = a.u64[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
+ c_v256 t;
+ if (SIMD_CHECK && n > 63) {
+ fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+ abort();
+ }
+ t.u64[3] = a.u64[3] << n;
+ t.u64[2] = a.u64[2] << n;
+ t.u64[1] = a.u64[1] << n;
+ t.u64[0] = a.u64[0] << n;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
+ return c_v256_shl_8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
+ return c_v256_shl_16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
+ return c_v256_shl_32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
+ return c_v256_shl_64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
+ return c_v256_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
+ return c_v256_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
+ return c_v256_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
+ return c_v256_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
+ return c_v256_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
+ return c_v256_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
+ return c_v256_shr_s32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
+ return c_v256_shr_s64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
+ return c_v256_shr_n_byte(a, 2 * n);
+}
+SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
+ return c_v256_shl_n_byte(a, 2 * n);
+}
+
+typedef uint32_t c_sad256_internal_u16;
+
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u16_sum(). */
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
+ c_v256 a, c_v256 b) {
+ int c;
+ for (c = 0; c < 16; c++)
+ s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd256_internal_s16;
+
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
+ c_v256 a, c_v256 b) {
+ int c;
+ for (c = 0; c < 16; c++)
+ s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+ (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+ return s;
+}
+
+SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
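
Most of the c_v256 helpers above stitch a 256-bit result out of two 128-bit
(or four 64-bit) halves; c_v256_shl_n_byte, for example, ORs the bytes that
spill out of the low half into the high half. The sketch below is only a
sanity check of that behaviour, written for a little-endian build
(CONFIG_BIG_ENDIAN == 0); it is not part of the patch.

#include <assert.h>
#include <stdint.h>
#include "aom_dsp/simd/v256_intrinsics_c.h"

/* Illustrative check (not part of the patch): on a little-endian build,
   c_v256_shl_n_byte(a, n) moves byte i of the 32-byte vector to byte i + n
   and zero-fills the bottom n bytes, even when the shift crosses the
   boundary between the two 128-bit halves. */
static void example_check_shl_n_byte(void) {
  uint8_t in[32], out[32];
  for (int i = 0; i < 32; i++) in[i] = (uint8_t)i;
  c_v256_store_unaligned(out, c_v256_shl_n_byte(c_v256_load_unaligned(in), 3));
  for (int i = 0; i < 3; i++) assert(out[i] == 0);
  for (int i = 3; i < 32; i++) assert(out[i] == (uint8_t)(i - 3));
}
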
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
new file mode 100644
index 000000000..d5b7905ef
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
+
+#if HAVE_NEON
+#include "aom_dsp/simd/v128_intrinsics_arm.h"
+#elif HAVE_SSE2
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
+#else
+#include "aom_dsp/simd/v128_intrinsics.h"
+#endif
+
+#if HAVE_NEON
+typedef int64x2x2_t v256;
+#else
+typedef struct {
+ v128 val[2];
+} v256;
+#endif
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }
+
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }
+
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }
+
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ v256 t;
+ t.val[1] = hi;
+ t.val[0] = lo;
+ return t;
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
+ v128_load_unaligned(p));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
+ v128_load_aligned(p));
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ v128_store_unaligned(p, a.val[0]);
+ v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ v128_store_aligned(p, a.val[0]);
+ v128_store_aligned((uint8_t *)p + 16, a.val[1]);
+}
+
+SIMD_INLINE v256 v256_zero() {
+ return v256_from_v128(v128_zero(), v128_zero());
+}
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) {
+ v128 t = v128_dup_8(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) {
+ v128 t = v128_dup_16(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) {
+ v128 t = v128_dup_32(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+ v128 t = v128_dup_64(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+ return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
+}
+
+typedef struct {
+ sad128_internal val[2];
+} sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init() {
+ sad256_internal t;
+ t.val[1] = v128_sad_u8_init();
+ t.val[0] = v128_sad_u8_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ sad256_internal t;
+ t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
+ t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
+}
+
+typedef struct {
+ ssd128_internal val[2];
+} ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+ ssd256_internal t;
+ t.val[1] = v128_ssd_u8_init();
+ t.val[0] = v128_ssd_u8_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ ssd256_internal t;
+ t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
+ t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) {
+ return v256_from_v128(v128_or(a.val[1], b.val[1]),
+ v128_or(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
+ return v256_from_v128(v128_xor(a.val[1], b.val[1]),
+ v128_xor(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) {
+ return v256_from_v128(v128_and(a.val[1], b.val[1]),
+ v128_and(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
+ return v256_from_v128(v128_andn(a.val[1], b.val[1]),
+ v128_andn(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
+ return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
+ v128_add_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
+ return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
+ v128_add_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
+ v128_sadd_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
+ v128_sadd_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
+ v128_sadd_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
+ return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
+ v128_add_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
+ return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
+ v128_add_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+ return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
+ v128_sub_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
+ v128_ssub_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
+ v128_ssub_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
+ v128_sub_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
+ v128_ssub_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
+ v128_ssub_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
+ v128_sub_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
+ v128_sub_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) {
+ return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_abs_s8(v256 a) {
+ return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
+ v128 lo_bits = v128_mullo_s16(a, b);
+ v128 hi_bits = v128_mulhi_s16(a, b);
+ return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+ v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
+ v128_mullo_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
+ v128_mulhi_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
+ v128_mullo_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
+ v128_madd_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
+ v128_madd_us8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
+ v128_avg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
+ v128_rdavg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
+ v128_rdavg_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
+ v128_avg_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
+ v128_min_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
+ v128_max_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
+ v128_min_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+ return (v128_movemask_8(v256_high_v128(a)) << 16) |
+ v128_movemask_8(v256_low_v128(a));
+}
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+ return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
+ v128_blend_8(a.val[0], b.val[0], c.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
+ v128_max_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
+ v128_min_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
+ v128_max_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
+ v128_min_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
+ v128_max_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
+ v128_ziplo_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
+ v128_ziplo_8(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
+ v128_ziplo_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
+ v128_ziplo_16(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
+ v128_ziplo_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
+ v128_ziplo_32(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
+ v128_ziplo_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
+ v128_ziplo_64(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return v256_from_v128(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return v256_from_v128(a.val[1], b.val[1]);
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
+ v128_unziplo_8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
+ v128_unziphi_8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
+ v128_unziplo_16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
+ v128_unziphi_16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
+ v128_unziplo_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
+ v128_unziphi_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+#if HAVE_SSE2
+ return v256_from_v128(
+ _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+ _mm_castsi128_pd(a.val[1]), 0)),
+ _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+ _mm_castsi128_pd(b.val[1]), 0)));
+#else
+ return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
+ v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
+#endif
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+#if HAVE_SSE2
+ return v256_from_v128(
+ _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+ _mm_castsi128_pd(a.val[1]), 3)),
+ _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+ _mm_castsi128_pd(b.val[1]), 3)));
+#else
+ return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
+ v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
+#endif
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
+ v128_unpacklo_u8_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
+ v128_unpacklo_u8_s16(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
+ v128_unpacklo_s8_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
+ v128_unpacklo_s8_s16(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
+ v128_pack_s32_s16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
+ v128_pack_s32_u16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
+ v128_pack_s16_u8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
+ v128_pack_s16_s8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
+ v128_unpacklo_u16_s32(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
+ v128_unpacklo_s16_s32(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
+ v128_unpacklo_u16_s32(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
+ v128_unpacklo_s16_s32(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
+ v128_cmpgt_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
+ v128_cmplt_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
+ v128_cmpeq_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
+ v128_cmpgt_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
+ v128_cmplt_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
+ v128_cmpeq_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
+ v128_cmpgt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
+ v128_cmplt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
+ v128_cmpeq_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
+#if HAVE_NEON
+#if defined(__aarch64__)
+ uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]),
+ vreinterpretq_u8_s64(x.val[1]) } };
+ return v256_from_v128(
+ vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
+ vreinterpretq_s64_u8(
+ vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
+#else
+ uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
+ vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
+ vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
+ vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
+ return v256_from_64(
+ (uint64_t)vreinterpret_s64_u8(
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
+ (uint64_t)vreinterpret_s64_u8(
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
+ (uint64_t)vreinterpret_s64_u8(
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
+ (uint64_t)vreinterpret_s64_u8(
+ vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+#endif
+#else
+ v128 c16 = v128_dup_8(16);
+ v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
+ v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
+ return v256_from_v128(
+ v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
+ v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
+ v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
+ v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
+#endif
+}
+
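+/* As implemented below, v256_wideshuffle_8(x, y, pattern) is a 64-byte table
+   lookup: each byte of 'pattern' is an index 0..63 into the concatenation of
+   the inputs, with y supplying bytes 0..31 and x supplying bytes 32..63. */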
+SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
+#if HAVE_NEON
+#if defined(__aarch64__)
+ uint8x16x4_t p = { {
+ vreinterpretq_u8_s64(y.val[0]),
+ vreinterpretq_u8_s64(y.val[1]),
+ vreinterpretq_u8_s64(x.val[0]),
+ vreinterpretq_u8_s64(x.val[1]),
+ } };
+ return v256_from_v128(
+ vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
+ vreinterpretq_s64_u8(
+ vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
+#else
+ v256 c32 = v256_dup_8(32);
+ v256 p32 = v256_sub_8(pattern, c32);
+ uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
+ vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
+ vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
+ vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
+ uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])),
+ vget_high_u8(vreinterpretq_u8_s64(y.val[0])),
+ vget_low_u8(vreinterpretq_u8_s64(y.val[1])),
+ vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } };
+ v256 r1 =
+ v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+ p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))),
+ (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+ p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))),
+ (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+ p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))),
+ (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+ p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])))));
+ v256 r2 =
+ v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+ q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
+ (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+ q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
+ (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+ q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
+ (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+ q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+ return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32));
+#endif
+#else
+ v128 c16 = v128_dup_8(16);
+ v128 c32 = v128_dup_8(32);
+ v128 c48 = v128_dup_8(48);
+ v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
+ v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
+ v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
+ v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
+ v256 r1 = v256_from_v128(
+ v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
+ v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
+ maskhi48),
+ v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
+ v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
+ masklo48));
+ v256 r2 = v256_from_v128(
+ v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
+ v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
+ v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
+ v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
+ return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
+#endif
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return v256_from_v128(
+ v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
+ v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \
+ v128_shr_n_byte(a.val[0], 16 - (n))), \
+ v128_shl_n_byte(a.val[0], (n))) \
+ : v256_from_v128( \
+ (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
+ v128_zero()))
+
+#define v256_shr_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \
+ v128_or(v128_shr_n_byte(a.val[0], n), \
+ v128_shl_n_byte(a.val[1], 16 - (n)))) \
+ : v256_from_v128( \
+ v128_zero(), \
+ (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
+
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
+
+#define v256_shl_n_8(a, n) \
+ v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
+#define v256_shl_n_16(a, n) \
+ v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
+#define v256_shl_n_32(a, n) \
+ v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
+#define v256_shl_n_64(a, n) \
+ v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
+#define v256_shr_n_u8(a, n) \
+ v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
+#define v256_shr_n_u16(a, n) \
+ v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
+#define v256_shr_n_u32(a, n) \
+ v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
+#define v256_shr_n_u64(a, n) \
+ v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
+#define v256_shr_n_s8(a, n) \
+ v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
+#define v256_shr_n_s16(a, n) \
+ v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
+#define v256_shr_n_s32(a, n) \
+ v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
+#define v256_shr_n_s64(a, n) \
+ v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))
+
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
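+
+/* Illustrative use of the immediate forms (a sketch; 'buf' and 'bits' are
+   hypothetical):
+     v256 x = v256_load_aligned(buf);  // constant counts: use the _n_ macros
+     v256 y = v256_shr_n_byte(x, 4);
+     v256 z = v256_shr_u16(x, bits);   // variable counts: use the functions
+*/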
+
+typedef struct {
+ sad128_internal_u16 val[2];
+} sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+ sad256_internal_u16 t;
+ t.val[1] = v128_sad_u16_init();
+ t.val[0] = v128_sad_u16_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u16_sum().
+ The result for more than 16 v256_sad_u16() calls is undefined. */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+ v256 b) {
+ sad256_internal_u16 t;
+ t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
+ t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+ return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
+}
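+
+/* Typical accumulation pattern (a sketch; 'src', 'ref' and 'n' are
+   hypothetical, with 16 uint16_t elements per row and n <= 16 as noted above):
+     sad256_internal_u16 acc = v256_sad_u16_init();
+     for (int i = 0; i < n; i++)
+       acc = v256_sad_u16(acc, v256_load_unaligned(src + 16 * i),
+                          v256_load_unaligned(ref + 16 * i));
+     uint32_t sad = v256_sad_u16_sum(acc);
+*/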
+
+typedef struct {
+ ssd128_internal_s16 val[2];
+} ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+ ssd256_internal_s16 t;
+ t.val[1] = v128_ssd_s16_init();
+ t.val[0] = v128_ssd_s16_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+ v256 b) {
+ ssd256_internal_s16 t;
+ t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
+ t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
+ return t;
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+ return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
new file mode 100644
index 000000000..44594bc41
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+
+#if !defined(__AVX2__)
+
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
+
+#else
+
+// The __m256i type seems to cause problems for g++'s name mangling prior to
+// version 5, but adding -fabi-version=0 fixes this.
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
+ defined(__AVX2__) && defined(__cplusplus)
+#pragma GCC optimize "-fabi-version=0"
+#endif
+
+#include <immintrin.h>
+
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
+
+typedef __m256i v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) {
+ return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
+}
+
+SIMD_INLINE v64 v256_low_v64(v256 a) {
+ return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
+}
+
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) {
+ return _mm256_extracti128_si256(a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
+ // gcc seems to be missing _mm256_set_m128i()
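+  // On toolchains that do provide it, this is equivalent to
+  // _mm256_set_m128i(a, b).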
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return _mm256_load_si256((const __m256i *)p);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return _mm256_loadu_si256((const __m256i *)p);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ _mm256_store_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ _mm256_storeu_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
+
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return _mm256_adds_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+ return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
+}
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return _mm256_subs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+ return _mm256_subs_epu16(a, b);
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
+
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
+
+// AVX2 doesn't have direct intrinsics to zip/unzip the 8-, 16- and 32-bit
+// lanes of the lower or upper half of a 256-bit vector, because the
+// unpack/pack intrinsics operate on the 256-bit input as two independent
+// 128-bit vectors.
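+// The _mm256_permute4x64_epi64(v, _MM_SHUFFLE(3, 1, 2, 0)) calls below first
+// gather quadwords 0 and 2 into the low 128-bit lane (and 1 and 3 into the
+// high lane), so the per-lane unpack then yields a contiguous zip of the low
+// or high half.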
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return _mm256_unpacklo_epi8(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return _mm256_unpackhi_epi8(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return _mm256_unpacklo_epi16(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return _mm256_unpackhi_epi16(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return _mm256_unpacklo_epi32(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return _mm256_unpackhi_epi32(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return _mm256_unpacklo_epi64(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return _mm256_unpackhi_epi64(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+ _mm256_castsi256_ps(a),
+ _MM_SHUFFLE(3, 1, 3, 1))),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+ _mm256_castsi256_ps(a),
+ _MM_SHUFFLE(2, 0, 2, 0))),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
+ _mm256_castsi256_pd(a), 15)),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_castpd_si256(
+ _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return _mm256_unpacklo_epi8(
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_setzero_si256());
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return _mm256_unpackhi_epi8(
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_setzero_si256());
+}
+
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return _mm256_srai_epi16(
+ _mm256_unpacklo_epi8(
+ a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+ 8);
+}
+
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return _mm256_srai_epi16(
+ _mm256_unpackhi_epi8(
+ a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+ 8);
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return _mm256_unpacklo_epi16(
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_setzero_si256());
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return _mm256_srai_epi32(
+ _mm256_unpacklo_epi16(
+ a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+ 16);
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return _mm256_unpackhi_epi16(
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_setzero_si256());
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return _mm256_srai_epi32(
+ _mm256_unpackhi_epi16(
+ a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+ 16);
+}
+
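+// v256_shuffle_8(): each byte of 'pattern' is an index 0..31 into the 32
+// bytes of 'a'.  Because _mm256_shuffle_epi8() only indexes within each
+// 128-bit lane, both halves of 'a' are broadcast to both lanes, shuffled,
+// and the two results blended on (index < 16).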
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ return _mm256_blendv_epi8(
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
+ _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+}
+
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+ v256 c32 = v256_dup_8(32);
+ v256 p32 = v256_sub_8(pattern, c32);
+ v256 r1 = _mm256_blendv_epi8(
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
+ _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
+ v256 r2 = _mm256_blendv_epi8(
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
+ _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+ return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return _mm256_shuffle_epi8(a, pattern);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+ v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
+ v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
+ t1 = _mm256_add_epi32(t1, t2);
+ v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
+ _mm256_extracti128_si256(t1, 1));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+ return (int32_t)v128_low_u32(t);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ v256 r = _mm256_madd_epi16(a, b);
+#if defined(__x86_64__)
+ v128 t;
+ r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+ _mm256_cvtepi32_epi64(v256_low_v128(r)));
+ t = v256_low_v128(_mm256_add_epi64(
+ r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+ return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+ v128 l = v256_low_v128(r);
+ v128 h = v256_high_v128(r);
+ return (int64_t)_mm_cvtsi128_si32(l) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+ (int64_t)_mm_cvtsi128_si32(h) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ v256 r = _mm256_mullo_epi32(a, b);
+#if defined(__x86_64__)
+ v128 t;
+ r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+ _mm256_cvtepi32_epi64(v256_low_v128(r)));
+ t = v256_low_v128(_mm256_add_epi64(
+ r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+ return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+ v128 l = v256_low_v128(r);
+ v128 h = v256_high_v128(r);
+ return (int64_t)_mm_cvtsi128_si32(l) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+ (int64_t)_mm_cvtsi128_si32(h) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
+ v128 lo = v256_low_v128(t);
+ v128 hi = v256_high_v128(t);
+ lo = v128_add_32(lo, hi);
+ return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo));
+}
+
+typedef v256 sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init() {
+ return _mm256_setzero_si256();
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 32 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+ return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+typedef v256 ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+ return _mm256_setzero_si256();
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
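+/* Note: the sll/srl-by-32 pair below zero-extends each lane's 32-bit partial
+   sum to 64 bits before it is added into the 64-bit accumulator. */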
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
+ _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
+ v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
+ _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
+ v256 rl = _mm256_madd_epi16(l, l);
+ v256 rh = _mm256_madd_epi16(h, h);
+ v128 c = _mm_cvtsi32_si128(32);
+ rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
+ rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
+ rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
+ rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
+ return _mm256_add_epi64(
+ s,
+ _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
+}
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+ return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }
+
+SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
+ v128 lo_bits = v128_mullo_s16(a, b);
+ v128 hi_bits = v128_mulhi_s16(a, b);
+ return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+ v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return _mm256_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return _mm256_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return _mm256_mullo_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return _mm256_madd_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return _mm256_maddubs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return _mm256_sub_epi8(
+ _mm256_avg_epu8(a, b),
+ _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
+}
+
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+ return _mm256_sub_epi16(
+ _mm256_avg_epu16(a, b),
+ _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); }
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+ return _mm256_blendv_epi8(a, b, c);
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return _mm256_cmpgt_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return _mm256_cmpgt_epi8(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return _mm256_cmpeq_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return _mm256_cmpgt_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return _mm256_cmpgt_epi16(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return _mm256_cmpeq_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+ return _mm256_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+ return _mm256_cmpgt_epi32(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+ return _mm256_cmpeq_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
+ _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
+ _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ __m128i x = _mm_cvtsi32_si128(c + 8);
+ return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
+ _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+ return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+ return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+#if defined(__AVX512F__)
+ return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
+#else
+ return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
+ v128_shr_s64(v256_low_v128(a), c));
+#endif
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+// _mm256_slli_si256 works on 128 bit lanes and can't be used
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128( \
+ v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
+ v128_shl_n_byte(v256_low_v128(a), n)) \
+ : _mm256_inserti128_si256( \
+ _mm256_setzero_si256(), \
+ v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
+
+// _mm256_srli_si256 works on 128 bit lanes and can't be used
+#define v256_shr_n_byte(a, n) \
+ ((n) < 16 \
+ ? _mm256_alignr_epi8( \
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
+ : _mm256_inserti128_si256( \
+ _mm256_setzero_si256(), \
+ v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))
+
+// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
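+// (All three macros above emulate byte shifts across the full 256-bit
+// register; the native _mm256_slli_si256/_mm256_srli_si256/_mm256_alignr_epi8
+// forms shift each 128-bit lane independently, which is why they cannot be
+// used directly.)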
+
+#define v256_shl_n_8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
+ _mm256_slli_epi16(a, c))
+#define v256_shr_n_u8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
+#define v256_shr_n_s8(a, c) \
+ _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
+ _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
+#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
+#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
+#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
+#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
+#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
+#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
+#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
+#define v256_shr_n_s64(a, c) \
+  v256_shr_s64((a), (c))  // _mm256_srai_epi64 needs AVX-512, so reuse v256_shr_s64()
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+typedef v256 sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_sad_u16_sum(). */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+ v256 b) {
+#if defined(__SSE4_1__)
+ v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
+#else
+ v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
+ v256_xor(b, v256_dup_16(32768)));
+ t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
+ v256_or(v256_and(a, t), v256_andn(b, t)));
+#endif
+ return v256_add_32(
+ s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+ v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
+ return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
+ v128_low_u32(v128_shr_n_byte(t, 8)) +
+ v128_low_u32(v128_shr_n_byte(t, 12));
+}
+
+typedef v256 ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+ v256 b) {
+ v256 d = v256_sub_16(a, b);
+ d = v256_madd_s16(d, d);
+ return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
+ _mm256_unpacklo_epi32(d, v256_zero())));
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+ v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
+ return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
+}
+
+#endif
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
new file mode 100644
index 000000000..afc55428d
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+/* Fallback to plain, unoptimised C. */
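+/* Every wrapper below simply forwards to the corresponding c_v64_* reference
+   routine from v64_intrinsics_c.h. */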
+
+typedef c_v64 v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
+SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
+SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return c_v64_from_32(x, y);
+}
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return c_v64_from_16(a, b, c, d);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return c_u32_load_unaligned(p);
+}
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return c_u32_load_aligned(p);
+}
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+ c_u32_store_unaligned(p, a);
+}
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ c_u32_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return c_v64_load_unaligned(p);
+}
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return c_v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+ c_v64_store_unaligned(p, a);
+}
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+ c_v64_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+ return c_v64_align(a, b, c);
+}
+
+SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
+
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
+SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
+SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
+
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); }
+SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); }
+SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); }
+SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); }
+SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
+SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); }
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); }
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); }
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); }
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); }
+SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
+ return c_v64_pack_s32_s16(a, b);
+}
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+ return c_v64_pack_s32_u16(a, b);
+}
+SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
+ return c_v64_pack_s16_u8(a, b);
+}
+SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
+ return c_v64_pack_s16_s8(a, b);
+}
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
+ return c_v64_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
+ return c_v64_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
+ return c_v64_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
+ return c_v64_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) {
+ return c_v64_shuffle_8(a, pattern);
+}
+
+typedef uint32_t sad64_internal;
+SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
+SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+ return c_v64_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+ return c_v64_sad_u8_sum(s);
+}
+typedef uint32_t ssd64_internal;
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+ return c_v64_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+ return c_v64_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
+SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); }
+SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); }
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); }
+
+SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); }
+SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); }
+SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); }
+SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); }
+
+SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); }
+SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); }
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); }
+SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); }
+SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
+
+SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); }
+SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
+SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
+SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
+SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); }
+SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); }
+SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); }
+SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); }
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); }
+SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); }
+SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); }
+SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); }
+SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); }
+
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); }
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); }
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); }
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); }
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) {
+ return c_v64_shr_u16(a, n);
+}
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) {
+ return c_v64_shr_s16(a, n);
+}
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); }
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) {
+ return c_v64_shr_u32(a, n);
+}
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) {
+ return c_v64_shr_s32(a, n);
+}
+SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) {
+ return c_v64_shr_n_byte(a, n);
+}
+SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) {
+ return c_v64_shl_n_byte(a, n);
+}
+SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
+ return c_v64_shl_n_8(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
+ return c_v64_shr_n_u8(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
+ return c_v64_shr_n_s8(a, c);
+}
+SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
+ return c_v64_shl_n_16(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
+ return c_v64_shr_n_u16(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
+ return c_v64_shr_n_s16(a, c);
+}
+SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
+ return c_v64_shl_n_32(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
+ return c_v64_shr_n_u32(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
+ return c_v64_shr_n_s32(a, c);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
new file mode 100644
index 000000000..8f39ad6e8
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
@@ -0,0 +1,680 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"
+
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
+#endif
+
+typedef int64x1_t v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) {
+ return vget_lane_u32(vreinterpret_u32_s64(a), 0);
+}
+
+SIMD_INLINE uint32_t v64_high_u32(v64 a) {
+ return vget_lane_u32(vreinterpret_u32_s64(a), 1);
+}
+
+SIMD_INLINE int32_t v64_low_s32(v64 a) {
+ return vget_lane_s32(vreinterpret_s32_s64(a), 0);
+}
+
+SIMD_INLINE int32_t v64_high_s32(v64 a) {
+ return vget_lane_s32(vreinterpret_s32_s64(a), 1);
+}
+
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
+ d);
+}
+
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return vcreate_s64((uint64_t)x << 32 | y);
+}
+
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
+
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
+
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
+}
+
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+#if defined(__clang__)
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+ 0);
+#elif defined(__CC_ARM)
+  *((__packed uint32_t *)p) = a;
+#elif defined(__GNUC__)
+ *((__attribute((packed)) uint32_t *)p) = a;
+#else
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+ 0);
+#endif
+}
+
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
+ vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
+ vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
+}
+
+// The following function requires an immediate.
+// Some compilers will check this when optimising, others won't.
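+// (vext_s8 needs a compile-time constant, hence the shift/or fallback for
+// builds where that cannot be relied on.)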
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+ return c ? vreinterpret_s64_s8(
+ vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
+ : b;
+#else
+ return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
+ : b;
+#endif
+}
+
+SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }
+
+SIMD_INLINE v64 v64_dup_8(uint8_t x) {
+ return vreinterpret_s64_u8(vdup_n_u8(x));
+}
+
+SIMD_INLINE v64 v64_dup_16(uint16_t x) {
+ return vreinterpret_s64_u16(vdup_n_u16(x));
+}
+
+SIMD_INLINE v64 v64_dup_32(uint32_t x) {
+ return vreinterpret_s64_u32(vdup_n_u32(x));
+}
+
+SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
+ int16x8_t t =
+ vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
+ vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
+#if defined(__aarch64__)
+ return vaddlvq_s16(t);
+#else
+ int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
+ return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
+#endif
+}
+
+SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vaddlvq_s32(
+ vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+#else
+ int64x2_t r =
+ vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+ return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
+#endif
+}
+
+SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
+#if defined(__aarch64__)
+ return vaddlv_u8(vreinterpret_u8_s64(x));
+#else
+ return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+#endif
+}
+
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
+ return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
+}
+
+typedef uint16x8_t sad64_internal;
+
+SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
+
+// Implementation dependent return value. Result must be finalised with
+// v64_sad_u8_sum().
+SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+ return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
+}
+
+SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+#if defined(__aarch64__)
+ return vaddlvq_u16(s);
+#else
+ uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
+ return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
+#endif
+}
+
+typedef uint32x4_t ssd64_internal;
+
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
+
+// Implementation dependent return value. Result must be finalised with
+// v64_ssd_u8_sum().
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+ uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
+ return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t)));
+}
+
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
+#if defined(__aarch64__)
+ return vaddvq_u32(s);
+#else
+ uint64x2_t t = vpaddlq_u32(s);
+ return vget_lane_u32(
+ vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
+}
+
+SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
+
+SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }
+
+SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }
+
+SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }
+
+SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(
+ vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
+ return vreinterpret_s64_u32(
+ vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(
+ vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
+ return vreinterpret_s64_s32(
+ vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
+}
+
+SIMD_INLINE v64 v64_abs_s16(v64 x) {
+ return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
+}
+
+SIMD_INLINE v64 v64_abs_s8(v64 x) {
+ return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
+}
+
+SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+ int16x8_t t = vreinterpretq_s16_s32(
+ vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+ return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
+#else
+ return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
+ vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
+#endif
+}
+
+SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
+ return vreinterpret_s64_s32(
+ vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
+}
+
+SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
+ int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
+ return vreinterpret_s64_s32(
+ vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
+ vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
+}
+
+SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
+ int16x8_t t =
+ vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))),
+ vmovl_s8(vreinterpret_s8_s64(y)));
+ return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t)));
+}
+
+SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(
+ vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(
+ vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(
+ vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u8(
+ vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
+ uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpret_s64_u8(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u8(
+ vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
+ uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpret_s64_u8(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u16(
+ vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
+ int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
+ return vreinterpret_s64_s16(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u16(
+ vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
+ int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
+ return vreinterpret_s64_s16(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u32(
+ vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
+ int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
+ return vreinterpret_s64_s32(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u32(
+ vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
+ int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
+ return vreinterpret_s64_s32(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
+ return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
+}
+
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
+ return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
+}
+
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
+ return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
+}
+
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
+ return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
+}
+
+SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
+ return vreinterpret_s64_s16(vqmovn_s32(
+ vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
+}
+
+SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(vqmovun_s32(
+ vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
+}
+
+SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
+ vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
+}
+
+SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
+ return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
+ vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
+}
+
+SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u8(
+ vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
+ uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpret_s64_u8(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u8(
+ vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
+ uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
+ return vreinterpret_s64_u8(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u16(
+ vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
+ uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
+ return vreinterpret_s64_u16(r.val[0]);
+#endif
+}
+
+SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+ return vreinterpret_s64_u16(
+ vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
+ uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
+ return vreinterpret_s64_u16(r.val[1]);
+#endif
+}
+
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
+ return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
+}
+
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
+ return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
+}
+
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
+ return vreinterpret_s64_s32(
+ vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
+}
+
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
+ return vreinterpret_s64_u32(
+ vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
+}
+
+SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
+ return vreinterpret_s64_u8(
+ vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
+}
+
+SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
+ return vreinterpret_s64_u8(
+ vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
+ return vreinterpret_s64_u16(
+ vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+}
+
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
+ return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
+}
+
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
+ return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c)));
+}
+
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
+ return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c)));
+}
+
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
+ return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
+}
+
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
+ return vreinterpret_s64_u16(
+ vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
+}
+
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
+ return vreinterpret_s64_s16(
+ vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
+}
+
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
+ return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
+}
+
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
+ return vreinterpret_s64_u32(
+ vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
+}
+
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
+ return vreinterpret_s64_s32(
+ vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
+}
+
+// The following functions require an immediate.
+// Some compilers will check this during optimisation; others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+
+SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
+ return vshl_n_s64(a, c * 8);
+}
+
+SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
+ return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
+}
+
+SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
+ return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
+ return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
+ return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
+ return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
+ return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
+ return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
+ return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
+ return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
+}
+
+SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
+ return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
+}
+
+#else
+
+SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
+ return v64_from_64(v64_u64(a) << c * 8);
+}
+
+SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
+ return v64_from_64(v64_u64(a) >> c * 8);
+}
+
+SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); }
+
+SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); }
+
+SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); }
+
+SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); }
+
+SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
+ return v64_shr_u16(a, c);
+}
+
+SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
+ return v64_shr_s16(a, c);
+}
+
+SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); }
+
+SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
+ return v64_shr_u32(a, c);
+}
+
+SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
+ return v64_shr_s32(a, c);
+}
+
+#endif
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
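The SAD/SSD helpers above follow an init/accumulate/finalise pattern: v64_sad_u8_init() creates an empty accumulator, v64_sad_u8() folds in one pair of 8-byte rows, and v64_sad_u8_sum() reduces the accumulator to a scalar. A minimal usage sketch, assuming the header is reached through the usual aom_dsp/simd/v64_intrinsics.h wrapper (which supplies SIMD_INLINE and selects a backend); the helper name block_sad_8xh is made up:

    #include <stdint.h>
    #include "aom_dsp/simd/v64_intrinsics.h"  // assumed wrapper that selects this backend

    // Sum of absolute differences over an 8-pixel-wide block of h rows.
    // The C and x86 variants note the accumulator is only defined for up to
    // 32 v64_sad_u8() calls, so keep h <= 32.
    static uint32_t block_sad_8xh(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride, int h) {
      sad64_internal acc = v64_sad_u8_init();
      for (int i = 0; i < h; i++)
        acc = v64_sad_u8(acc, v64_load_unaligned(src + i * src_stride),
                         v64_load_unaligned(ref + i * ref_stride));
      return v64_sad_u8_sum(acc);
    }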
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
new file mode 100644
index 000000000..028d68c4f
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
@@ -0,0 +1,968 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
+
+/* Note: This implements the intrinsics in plain, unoptimised C.
+ Intended for reference, porting or debugging. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+typedef union {
+ uint8_t u8[8];
+ uint16_t u16[4];
+ uint32_t u32[2];
+ uint64_t u64;
+ int8_t s8[8];
+ int16_t s16[4];
+ int32_t s32[2];
+ int64_t s64;
+} c_v64;
+
+SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
+ return a.u32[!!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
+ return a.u32[!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
+ return a.s32[!!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
+ return a.s32[!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
+ c_v64 t;
+ t.u32[!CONFIG_BIG_ENDIAN] = x;
+ t.u32[!!CONFIG_BIG_ENDIAN] = y;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
+ c_v64 t;
+ t.u64 = x;
+ return t;
+}
+
+SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }
+
+SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
+ uint16_t d) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ t.u16[0] = a;
+ t.u16[1] = b;
+ t.u16[2] = c;
+ t.u16[3] = d;
+ } else {
+ t.u16[3] = a;
+ t.u16[2] = b;
+ t.u16[1] = c;
+ t.u16[0] = d;
+ }
+ return t;
+}
+
+SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
+ uint32_t t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 4; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 4; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 3) {
+ fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
+ abort();
+ }
+ return c_u32_load_unaligned(p);
+}
+
+SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
+ if (SIMD_CHECK && (uintptr_t)p & 3) {
+ fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
+ abort();
+ }
+ c_u32_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
+ c_v64 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 8; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 7) {
+ fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
+ abort();
+ }
+ return c_v64_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
+ uint8_t *q = (uint8_t *)p;
+ uint8_t *r = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 8; c++) q[c] = r[c];
+}
+
+SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 7) {
+ fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
+ abort();
+ }
+ c_v64_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v64 c_v64_zero() {
+ c_v64 t;
+ t.u64 = 0;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
+ c_v64 t;
+ t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
+ t.u8[7] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
+ c_v64 t;
+ t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
+ c_v64 t;
+ t.u32[0] = t.u32[1] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++)
+ t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255
+ ? 255
+ : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0
+ ? 0
+ : (int16_t)a.u8[c] + (int16_t)b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++)
+ t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127
+ ? 127
+ : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128
+ ? -128
+ : (int16_t)a.s8[c] + (int16_t)b.s8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
+ ? 32767
+ : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
+ ? -32768
+ : (int32_t)a.s16[c] + (int32_t)b.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
+ t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) {
+ int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
+ t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
+ ? -32768
+ : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
+ ? 32767
+ : (int32_t)a.s16[c] - (int32_t)b.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.u16[c] =
+ (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
+ t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u8[7] = a.u8[7];
+ t.u8[6] = b.u8[7];
+ t.u8[5] = a.u8[6];
+ t.u8[4] = b.u8[6];
+ t.u8[3] = a.u8[5];
+ t.u8[2] = b.u8[5];
+ t.u8[1] = a.u8[4];
+ t.u8[0] = b.u8[4];
+ } else {
+ t.u8[7] = a.u8[3];
+ t.u8[6] = b.u8[3];
+ t.u8[5] = a.u8[2];
+ t.u8[4] = b.u8[2];
+ t.u8[3] = a.u8[1];
+ t.u8[2] = b.u8[1];
+ t.u8[1] = a.u8[0];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u16[3] = a.u16[3];
+ t.u16[2] = b.u16[3];
+ t.u16[1] = a.u16[2];
+ t.u16[0] = b.u16[2];
+ } else {
+ t.u16[3] = a.u16[1];
+ t.u16[2] = b.u16[1];
+ t.u16[1] = a.u16[0];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u32[1] = a.u32[1];
+ t.u32[0] = b.u32[1];
+ } else {
+ t.u32[1] = a.u32[0];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u8[7] = b.u8[7];
+ t.u8[6] = b.u8[5];
+ t.u8[5] = b.u8[3];
+ t.u8[4] = b.u8[1];
+ t.u8[3] = a.u8[7];
+ t.u8[2] = a.u8[5];
+ t.u8[1] = a.u8[3];
+ t.u8[0] = a.u8[1];
+ } else {
+ t.u8[7] = a.u8[6];
+ t.u8[6] = a.u8[4];
+ t.u8[5] = a.u8[2];
+ t.u8[4] = a.u8[0];
+ t.u8[3] = b.u8[6];
+ t.u8[2] = b.u8[4];
+ t.u8[1] = b.u8[2];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u16[3] = b.u16[3];
+ t.u16[2] = b.u16[1];
+ t.u16[1] = a.u16[3];
+ t.u16[0] = a.u16[1];
+ } else {
+ t.u16[3] = a.u16[2];
+ t.u16[2] = a.u16[0];
+ t.u16[1] = b.u16[2];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
+ : _c_v64_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
+ : _c_v64_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.u8[3 + endian];
+ t.s16[2] = (int16_t)a.u8[2 + endian];
+ t.s16[1] = (int16_t)a.u8[1 + endian];
+ t.s16[0] = (int16_t)a.u8[0 + endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.u8[7 - endian];
+ t.s16[2] = (int16_t)a.u8[6 - endian];
+ t.s16[1] = (int16_t)a.u8[5 - endian];
+ t.s16[0] = (int16_t)a.u8[4 - endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.s8[3 + endian];
+ t.s16[2] = (int16_t)a.s8[2 + endian];
+ t.s16[1] = (int16_t)a.s8[1 + endian];
+ t.s16[0] = (int16_t)a.s8[0 + endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.s8[7 - endian];
+ t.s16[2] = (int16_t)a.s8[6 - endian];
+ t.s16[1] = (int16_t)a.s8[5 - endian];
+ t.s16[0] = (int16_t)a.s8[4 - endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
+ t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
+ t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
+ t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1];
+ t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0];
+ t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1];
+ t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
+ t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
+ t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
+ t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
+ t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
+ t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
+ t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
+ t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
+ t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
+ t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
+ t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
+ t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
+ t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
+ t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
+ t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) {
+ if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
+ fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
+ pattern.u8[c], c);
+ abort();
+ }
+ t.u8[c] =
+ a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
+ }
+ return t;
+}
+
+SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
+ return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
+ a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
+ a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
+}
+
+SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
+ return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
+ (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
+}
+
+SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
+ return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
+ a.u8[0];
+}
+
+SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
+ return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
+}
+
+typedef uint32_t c_sad64_internal;
+
+/* Implementation dependent return value. Result must be finalised with
+ v64_sad_u8_sum().
+ The result for more than 32 v64_sad_u8() calls is undefined. */
+SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }
+
+SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
+ c_v64 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
+
+typedef uint32_t c_ssd64_internal;
+
+/* Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }
+
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
+ c_v64 b) {
+ int c;
+ for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
+
+SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 | b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 ^ b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 & b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 & ~b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
+ t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
+ t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int32_t u;
+ u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
+ t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
+ t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
+ t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
+ t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
+ abort();
+ }
+ t.u32[1] = a.u32[1] << n;
+ t.u32[0] = a.u32[0] << n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
+ abort();
+ }
+ t.u32[1] = a.u32[1] >> n;
+ t.u32[0] = a.u32[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
+ abort();
+ }
+ t.s32[1] = a.s32[1] >> n;
+ t.s32[0] = a.s32[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
+ c_v64 t;
+ t.u64 = x.u64 >> i * 8;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
+ c_v64 t;
+ t.u64 = x.u64 << i * 8;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
+ if (SIMD_CHECK && c > 7) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
+ return c_v64_shl_8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
+ return c_v64_shr_u8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
+ return c_v64_shr_s8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
+ return c_v64_shl_16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
+ return c_v64_shr_u16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
+ return c_v64_shr_s16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
+ return c_v64_shl_32(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
+ return c_v64_shr_u32(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
+ return c_v64_shr_s32(a, c);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
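As the note at the top of this file says, the c_v64 variant is plain C meant for reference, porting and debugging, so its per-lane behaviour is easy to check by hand. A small sketch of the lane semantics, assuming a little-endian build (CONFIG_BIG_ENDIAN == 0); the expected values follow directly from c_v64_from_16() and c_v64_sadd_s16() above:

    c_v64 a = c_v64_from_16(4, 3, 2, 1);  // u16 lanes, low to high: 1, 2, 3, 4
    c_v64 b = c_v64_dup_16(0x7fff);       // every lane holds INT16_MAX
    c_v64 r = c_v64_sadd_s16(a, b);       // saturating add: every lane clamps to 32767
    // r.s16[0] == r.s16[1] == r.s16[2] == r.s16[3] == 32767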
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
new file mode 100644
index 000000000..5f9a57b37
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+#if defined(__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+typedef __m128i v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+SIMD_INLINE uint32_t v64_high_u32(v64 a) {
+ return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
+}
+
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }
+
+SIMD_INLINE int32_t v64_high_s32(v64 a) {
+ return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
+}
+
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return _mm_packs_epi32(
+ _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
+ _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return _mm_set_epi32(0, 0, x, y);
+}
+
+SIMD_INLINE v64 v64_from_64(uint64_t x) {
+#ifdef __x86_64__
+ return _mm_cvtsi64_si128(x);
+#else
+ return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
+#endif
+}
+
+SIMD_INLINE uint64_t v64_u64(v64 x) {
+ return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
+}
+
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return _mm_loadl_epi64((__m128i *)p);
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return _mm_loadl_epi64((__m128i *)p);
+}
+
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+ _mm_storel_epi64((__m128i *)p, a);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+ _mm_storel_epi64((__m128i *)p, a);
+}
+
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#define v64_align(a, b, c) \
+ ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
+#else
+#define v64_align(a, b, c) \
+ ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
+ : (b))
+#endif
+
+SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }
+
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
+
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
+
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
+
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
+
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
+
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }
+
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
+
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
+
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }
+
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }
+
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }
+
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }
+
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }
+
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }
+
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }
+
+SIMD_INLINE v64 v64_abs_s16(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi16(a);
+#else
+ return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
+#endif
+}
+
+SIMD_INLINE v64 v64_abs_s8(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
+ return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
+#endif
+}
+
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packs_epi32(t, t);
+}
+
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packus_epi32(t, t);
+#else
+ int32_t ah = v64_high_u32(a);
+ int32_t al = v64_low_u32(a);
+ int32_t bh = v64_high_u32(b);
+ int32_t bl = v64_low_u32(b);
+ return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah,
+ al > 65535 ? 65535 : al < 0 ? 0 : al,
+ bh > 65535 ? 65535 : bh < 0 ? 0 : bh,
+ bl > 65535 ? 65535 : bl < 0 ? 0 : bl);
+#endif
+}
+
+SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packus_epi16(t, t);
+}
+
+SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packs_epi16(t, t);
+}
+
+SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0f0d0b0907050301LL));
+#else
+ return _mm_packus_epi16(
+ _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
+ _mm_setzero_si128());
+#endif
+}
+
+SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0e0c0a0806040200LL));
+#else
+ return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
+#endif
+}
+
+SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0f0e0b0a07060302LL));
+#else
+ return _mm_packs_epi32(
+ _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
+ _mm_setzero_si128());
+#endif
+}
+
+SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0d0c090805040100LL));
+#else
+ return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
+#endif
+}
+
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
+ return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
+}
+
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
+ return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
+}
+
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
+}
+
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
+ return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
+}
+
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
+ return _mm_srli_si128(
+ _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
+}
+
+SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(x, pattern);
+#else
+ v64 output;
+ unsigned char *input = (unsigned char *)&x;
+ unsigned char *index = (unsigned char *)&pattern;
+ char *selected = (char *)&output;
+ int counter;
+
+ for (counter = 0; counter < 8; counter++) {
+ selected[counter] = input[index[counter]];
+ }
+
+ return output;
+#endif
+}
+
+SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
+ __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
+ _mm_unpacklo_epi8(b, _mm_setzero_si128()));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+ return (int32_t)v64_low_u32(t);
+}
+
+SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
+ __m128i r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ __m128i x = _mm_cvtepi32_epi64(r);
+ return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(r);
+#endif
+}
+
+SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
+ return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
+}
+
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
+ return v64_dotp_s16(a, v64_dup_16(1));
+}
+
+typedef v64 sad64_internal;
+
+SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ v64_sad_u8_sum().
+ The result for more than 32 v64_sad_u8() calls is undefined. */
+SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
+
+typedef v64 ssd64_internal;
+
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+ v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
+ v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
+ v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
+ return _mm_add_epi64(
+ s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
+}
+
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
+
+SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }
+
+SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }
+
+SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }
+
+SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }
+
+SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }
+
+SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }
+
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_mul_epu32(a, b),
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
+#endif
+}
+
+SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }
+
+SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_maddubs_epi16(a, b);
+#else
+ __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
+ return _mm_packs_epi32(t, t);
+#endif
+}
+
+SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }
+
+SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
+ return _mm_sub_epi8(_mm_avg_epu8(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
+}
+
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
+ return _mm_sub_epi16(_mm_avg_epu16(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
+}
+
+SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
+
+SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
+
+SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }
+
+SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi8(a, b);
+#else
+ v64 mask = _mm_cmplt_epi8(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi8(a, b);
+#else
+ v64 mask = _mm_cmplt_epi8(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }
+
+SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
+
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8(0xff >> c),
+ _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
+ return _mm_packs_epi16(
+ _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
+}
+
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
+#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
+#define v64_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v64_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
+#define v64_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
+#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
+#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
+#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
+#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
+#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
+#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
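The constant-shift forms at the end are #defines rather than SIMD_INLINE functions because, as the comment above notes, the underlying _mm_slli_epi16/_mm_srli_epi16/_mm_srai_epi16 family needs a compile-time immediate for the shift count. A short usage sketch (variable names are made up):

    v64 x  = v64_dup_16(0x0102);              // every 16-bit lane holds 0x0102
    v64 hi = v64_shr_n_u16(x, 8);             // expands to _mm_srli_epi16(x, 8): each lane -> 0x0001
    v64 lo = v64_and(x, v64_dup_16(0x00ff));  // mask the low byte: each lane -> 0x0002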
diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c
new file mode 100644
index 000000000..249394807
--- /dev/null
+++ b/third_party/aom/aom_dsp/sse.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* Sum the squared difference between every corresponding element of the
+   buffers. */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y, x;
+ int64_t sse = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const int32_t diff = abs(a[x] - b[x]);
+ sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sse;
+}
+
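+// For high bit-depth input, a8 and b8 actually point at 16-bit samples;
+// CONVERT_TO_SHORTPTR recovers the uint16_t view of the buffers.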
+int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int y, x;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]);
+ sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sse;
+}
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
new file mode 100644
index 000000000..681770ba9
--- /dev/null
+++ b/third_party/aom/aom_dsp/ssim.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/ssim.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 16; i++, s += sp, r += rp) {
+ for (j = 0; j < 16; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2)
+static const int64_t cc1_10 = 428658;    // (64^2*(.01*1023)^2)
+static const int64_t cc2_10 = 3857925;   // (64^2*(.03*1023)^2)
+static const int64_t cc1_12 = 6868593;   // (64^2*(.01*4095)^2)
+static const int64_t cc2_12 = 61817334;  // (64^2*(.03*4095)^2)
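+// e.g. cc1 = round(64^2 * (.01*255)^2) = round(4096 * 6.5025) = 26634.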
+
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+ uint32_t sum_sq_r, uint32_t sum_sxr, int count,
+ uint32_t bd) {
+ int64_t ssim_n, ssim_d;
+ int64_t c1, c2;
+ if (bd == 8) {
+ // scale the constants by number of pixels
+ c1 = (cc1 * count * count) >> 12;
+ c2 = (cc2 * count * count) >> 12;
+ } else if (bd == 10) {
+ c1 = (cc1_10 * count * count) >> 12;
+ c2 = (cc2_10 * count * count) >> 12;
+ } else if (bd == 12) {
+ c1 = (cc1_12 * count * count) >> 12;
+ c2 = (cc2_12 * count * count) >> 12;
+ } else {
+ c1 = c2 = 0;
+ assert(0);
+ }
+
+ ssim_n = (2 * sum_s * sum_r + c1) *
+ ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+
+ ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+ ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+ (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+
+ return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
+}
+
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t bd, uint32_t shift) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+ sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+
+// We use an 8x8 moving window whose starting locations lie on the 4x4 pixel
+// grid. This arrangement allows the windows to overlap block boundaries and
+// thus penalize blocking artifacts.
+static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width,
+ int height) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // Sample points start at each 4x4 location.
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width,
+ int height, uint32_t bd, uint32_t shift) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+  // Sample points start at each 4x4 location.
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+ CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+ shift);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight) {
+ double abc[3];
+ for (int i = 0; i < 3; ++i) {
+ const int is_uv = i > 0;
+ abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i],
+ source->strides[is_uv], dest->strides[is_uv],
+ source->crop_widths[is_uv], source->crop_heights[is_uv]);
+ }
+
+ *weight = 1;
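+  // Weight the luma SSIM by 0.8 and each chroma plane by 0.1.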
+ return abc[0] * .8 + .1 * (abc[1] + abc[2]);
+}
+
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Reworking the math:
+//
+// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the constants.
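+//
+// With n = 64 (an 8x8 window) n*n == 2^12, so (cc1 * n * n) >> 12 is exactly
+// cc1, which already includes the 64^2 factor in its definition above.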
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+//   (2*mean(x)*mean(y) + c1) / (mean(x)^2 + mean(y)^2 + c1)
+//
+// This luminance factor is very sensitive on the dark side of the luminance
+// range and almost completely insensitive on the bright side. Compare the two
+// pairs (1,3) and (250,252): the term gives 2*1*3/(1^2+3^2) = .60 for the
+// first but 2*250*252/(250^2+252^2) ~= .99997 for the second.
+//
+// As a result, this tweaked version of the calculation instead takes the
+// luminance as a fraction off from the peak possible value:
+//
+//   255 * 255 - ((sum_s - sum_r) / count) * ((sum_s - sum_r) / count)
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+ const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so the
+  // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, Ssimv *sv) {
+ aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+ &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
+}
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency) {
+ double dssim_total = 0;
+ double ssim_total = 0;
+ double ssim2_total = 0;
+ double inconsistency_total = 0;
+ int i, j;
+ int c = 0;
+ double norm;
+ double old_ssim_total = 0;
+ aom_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4, ++c) {
+ Ssimv sv = { 0, 0, 0, 0, 0, 0 };
+ double ssim;
+ double ssim2;
+ double dssim;
+ uint32_t var_new;
+ uint32_t var_old;
+ uint32_t mean_new;
+ uint32_t mean_old;
+ double ssim_new;
+ double ssim_old;
+
+ // Not sure there's a great way to handle the edge pixels
+ // in ssim when using a window. Seems biased against edge pixels
+ // however you handle this. This uses only samples that are
+ // fully in the frame.
+ if (j + 8 <= width && i + 8 <= height) {
+ ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+ }
+
+ ssim = ssimv_similarity(&sv, 64);
+ ssim2 = ssimv_similarity2(&sv, 64);
+
+ sv.ssim = ssim2;
+
+      // dssim is calculated for use as an actual error metric and
+ // is scaled up to the same range as sum square error.
+ // Since we are subsampling every 16th point maybe this should be
+ // *16 ?
+ dssim = 255 * 255 * (1 - ssim2) / 2;
+
+ // Here I introduce a new error metric: consistency-weighted
+ // SSIM-inconsistency. This metric isolates frames where the
+ // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+ // sharper or blurrier than the others. Higher values indicate a
+ // temporally inconsistent SSIM. There are two ideas at work:
+ //
+ // 1) 'SSIM-inconsistency': the total inconsistency value
+ // reflects how much SSIM values are changing between this
+ // source / reference frame pair and the previous pair.
+ //
+ // 2) 'consistency-weighted': weights de-emphasize areas in the
+ // frame where the scene content has changed. Changes in scene
+ // content are detected via changes in local variance and local
+ // mean.
+ //
+ // Thus the overall measure reflects how inconsistent the SSIM
+ // values are, over consistent regions of the frame.
+ //
+ // The metric has three terms:
+ //
+      // term 1 -> uses change in scene variance to weight the error score
+ // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term 2 -> uses change in local scene luminance to weight error
+ // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+      // term 3 -> measures inconsistency in ssim scores between frames
+      //          1 - (2 * ssim(Fi)*ssim(Fi-1) / (ssim(Fi)^2 + ssim(Fi-1)^2)).
+ //
+ // This term compares the ssim score for the same location in 2
+ // subsequent frames.
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+ var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+ mean_new = sv.sum_s;
+ mean_old = sv2[c].sum_s;
+ ssim_new = sv.ssim;
+ ssim_old = sv2[c].ssim;
+
+ if (do_inconsistency) {
+ // We do the metric once for every 4x4 block in the image. Since
+        // we are scaling the error to SSE for use in a psnr calculation,
+        // 1.0 = 4x4x255x255, the worst error we can possibly have.
+ static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide-by-zero
+        // issues; beyond that they only act as a rough weighting between the
+        // terms. No testing of what the right values should be has been done.
+ static const double c1 = 1, c2 = 1, c3 = 1;
+
+        // This measures how consistent the variance is across two consecutive
+ // source frames. 1.0 means they have exactly the same variance.
+ const double variance_term =
+ (2.0 * var_old * var_new + c1) /
+ (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+ // consecutive frames. 1.0 means they have exactly the same mean.
+ const double mean_term =
+ (2.0 * mean_old * mean_new + c2) /
+ (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+ // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+ double ssim_term =
+ pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
+
+ double this_inconsistency;
+
+ // Floating point math sometimes makes this > 1 by a tiny bit.
+ // We want the metric to scale between 0 and 1.0 so we can convert
+        // it to an SNR-scaled value.
+ if (ssim_term > 1) ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // so we can scale it, like psnr, to something like sum square error.
+        // The reason for the variance and mean terms is the assumption that
+        // if there are big changes in the source we should penalize
+        // inconsistency in ssim scores a bit less, as it will be less
+        // visible to the user.
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+ this_inconsistency *= kScaling;
+ inconsistency_total += this_inconsistency;
+ }
+ sv2[c] = sv;
+ ssim_total += ssim;
+ ssim2_total += ssim2;
+ dssim_total += dssim;
+
+ old_ssim_total += ssim_old;
+ }
+ old_ssim_total += 0;
+ }
+
+ norm = 1. / (width / 4) / (height / 4);
+ ssim_total *= norm;
+ ssim2_total *= norm;
+ m->ssim2 = ssim2_total;
+ m->ssim = ssim_total;
+ if (old_ssim_total == 0) inconsistency_total = 0;
+
+ m->ssimc = inconsistency_total;
+
+ m->dssim = dssim_total;
+ return inconsistency_total;
+}
+
+double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd) {
+ assert(bd >= in_bd);
+ const uint32_t shift = bd - in_bd;
+
+ double abc[3];
+ for (int i = 0; i < 3; ++i) {
+ const int is_uv = i > 0;
+ abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+ source->strides[is_uv], dest->strides[is_uv],
+ source->crop_widths[is_uv],
+ source->crop_heights[is_uv], in_bd, shift);
+ }
+
+ *weight = 1;
+ return abc[0] * .8 + .1 * (abc[1] + abc[2]);
+}
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
new file mode 100644
index 000000000..55038f4c2
--- /dev/null
+++ b/third_party/aom/aom_dsp/ssim.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SSIM_H_
+#define AOM_AOM_DSP_SSIM_H_
+
+#define MAX_SSIM_DB 100.0;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+ // source sum ( over 8x8 region )
+ uint32_t sum_s;
+
+ // reference sum (over 8x8 region )
+ uint32_t sum_r;
+
+ // source sum squared ( over 8x8 region )
+ uint32_t sum_sq_s;
+
+ // reference sum squared (over 8x8 region )
+ uint32_t sum_sq_r;
+
+ // sum of source times reference (over 8x8 region)
+ uint32_t sum_sxr;
+
+ // calculated ssim score between source and reference
+ double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+ // ssim consistency error metric ( see code for explanation )
+ double ssimc;
+
+ // standard ssim
+ double ssim;
+
+ // revised ssim ( see code for explanation)
+ double ssim2;
+
+ // ssim restated as an error metric like sse
+ double dssim;
+
+ // dssim converted to decibels
+ double dssimd;
+
+ // ssimc converted to decibels
+ double ssimcd;
+} Metrics;
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency);
+
+double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight);
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd);
+
+double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_SSIM_H_
diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c
new file mode 100644
index 000000000..2f6da96e5
--- /dev/null
+++ b/third_party/aom/aom_dsp/subtract.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_subtract_block_c(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ int r, c;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+
+void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride, int bd) {
+ int r, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ (void)bd;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c
new file mode 100644
index 000000000..44ec41f2e
--- /dev/null
+++ b/third_party/aom/aom_dsp/sum_squares.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
+ int height) {
+ int r, c;
+ uint64_t ss = 0;
+
+ for (r = 0; r < height; r++) {
+ for (c = 0; c < width; c++) {
+ const int16_t v = src[c];
+ ss += v * v;
+ }
+ src += src_stride;
+ }
+
+ return ss;
+}
+
+uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
+ uint64_t ss = 0;
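+  // Note: the do/while below assumes n > 0; the loop body runs at least once.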
+ do {
+ const int16_t v = *src++;
+ ss += v * v;
+ } while (--n);
+
+ return ss;
+}
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
new file mode 100644
index 000000000..f98242840
--- /dev/null
+++ b/third_party/aom/aom_dsp/txfm_common.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_TXFM_COMMON_H_
+#define AOM_AOM_DSP_TXFM_COMMON_H_
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
+// Constants and Macros used by all idct/dct functions
+#define DCT_CONST_BITS 14
+#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
+
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
+
+typedef struct txfm_param {
+ // for both forward and inverse transforms
+ TX_TYPE tx_type;
+ TX_SIZE tx_size;
+ int lossless;
+ int bd;
+ // are the pixel buffers octets or shorts? This should collapse to
+ // bd==8 implies !is_hbd, but that's not certain right now.
+ int is_hbd;
+ TxSetType tx_set_type;
+ // for inverse transforms only
+ int eob;
+} TxfmParam;
+
+// Constants:
+// for (int i = 1; i< 32; ++i)
+// printf("static const int cospi_%d_64 = %.0f;\n", i,
+// round(16384 * cos(i*M_PI/64)));
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
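+// e.g. cospi_16_64 = round(16384 * cos(16 * M_PI / 64)) = round(11585.2)
+//                  = 11585.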
+static const tran_high_t cospi_1_64 = 16364;
+static const tran_high_t cospi_2_64 = 16305;
+static const tran_high_t cospi_3_64 = 16207;
+static const tran_high_t cospi_4_64 = 16069;
+static const tran_high_t cospi_5_64 = 15893;
+static const tran_high_t cospi_6_64 = 15679;
+static const tran_high_t cospi_7_64 = 15426;
+static const tran_high_t cospi_8_64 = 15137;
+static const tran_high_t cospi_9_64 = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
+
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
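+// e.g. sinpi_3_9 = round(16384 * sqrt(2) * sin(3 * M_PI / 9) * 2 / 3) = 13377.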
+
+// 16384 * sqrt(2) and 16384 / sqrt(2), respectively
+static const tran_high_t Sqrt2 = 23170;
+static const tran_high_t InvSqrt2 = 11585;
+
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return rv;
+}
+
+#endif // AOM_AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
new file mode 100644
index 000000000..23b715309
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.c
@@ -0,0 +1,1579 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/variance.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+
+uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride) {
+ int distortion = 0;
+ int r, c;
+
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < 4; ++c) {
+ int diff = a[c] - b[c];
+ distortion += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ return distortion;
+}
+
+uint32_t aom_get_mb_ss_c(const int16_t *a) {
+ unsigned int i, sum = 0;
+
+ for (i = 0; i < 256; ++i) {
+ sum += a[i] * a[i];
+ }
+
+ return sum;
+}
+
+static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse, int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
+ uint32_t sse;
+ int sum;
+ variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
+ return sse;
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or vertical direction to produce the filtered output block. Used
+// to implement the first pass of a 2-D separable filter.
+//
+// Produces int16_t output to retain precision for the next pass. Two filter
+// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
+// It defines the offset required to move from one input to the next.
+void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either the
+// horizontal or vertical direction to produce the filtered output block. Used
+// to implement the second pass of a 2-D separable filter.
+//
+// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
+void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
+#define VAR(W, H) \
+ uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
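+// VAR(W, H) returns sse - sum^2 / (W*H), i.e. W*H times the variance of the
+// per-pixel difference between the two blocks.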
+
+#define SUBPIX_VAR(W, H) \
+ uint32_t aom_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+ }
+
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+ \
+ return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+ } \
+ uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+ }
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ */
+#define GET_VAR(W, H) \
+ void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ int *sum) { \
+ variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+ }
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed-in
+ * variable.
+ */
+#define MSE(W, H) \
+ uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+/* All three forms of the variance are available in the same sizes. */
+#define VARIANCES(W, H) \
+ VAR(W, H) \
+ SUBPIX_VAR(W, H) \
+ SUBPIX_AVG_VAR(W, H)
+
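+// For example, VARIANCES(16, 16) defines aom_variance16x16_c,
+// aom_sub_pixel_variance16x16_c, aom_sub_pixel_avg_variance16x16_c and
+// aom_jnt_sub_pixel_avg_variance16x16_c.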
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+VARIANCES(4, 2)
+VARIANCES(2, 4)
+VARIANCES(2, 2)
+VARIANCES(4, 16)
+VARIANCES(16, 4)
+VARIANCES(8, 32)
+VARIANCES(32, 8)
+VARIANCES(16, 64)
+VARIANCES(64, 16)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
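+// Rounded average of the two predictors:
+//   comp_pred[j] = (pred[j] + ref[j] + 1) >> 1.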
+void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+// Get pred block from up-sampled reference.
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in the
+ // build_inter_predictors() function, with some small tweaks.
+
+ // Some assumptions.
+ const int plane = 0;
+
+ // Get pre-requisites.
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ssx = pd->subsampling_x;
+ const int ssy = pd->subsampling_y;
+ assert(ssx == 0 && ssy == 0);
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+
+ // Calculate subpel_x/y and x/y_step.
+ const int row_start = 0; // Because ss_y is 0.
+ const int col_start = 0; // Because ss_x is 0.
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+ int orig_pos_y = pre_y << SUBPEL_BITS;
+ orig_pos_y += mv->row * (1 << (1 - ssy));
+ int orig_pos_x = pre_x << SUBPEL_BITS;
+ orig_pos_x += mv->col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ const uint8_t *const pre =
+ pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+
+ const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+ pos_x & SCALE_SUBPEL_MASK,
+ pos_y & SCALE_SUBPEL_MASK };
+
+ // Get warp types.
+ const WarpedMotionParams *const wm =
+ &xd->global_motion[mi->ref_frame[ref_num]];
+ const int is_global = is_global_mv_block(mi, wm->wmtype);
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global;
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ // Get convolve parameters.
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ const InterpFilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ // Get the inter predictor.
+ const int build_for_obmc = 0;
+ av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
+ &subpel_params, sf, width, height, &conv_params,
+ filters, &warp_types, mi_x >> pd->subsampling_x,
+ mi_y >> pd->subsampling_y, plane, ref_num, mi,
+ build_for_obmc, xd, cm->allow_warped_motion);
+
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter =
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ for (int i = 0; i < height; i++) {
+ memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+ comp_pred += width;
+ ref += ref_stride;
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+ -1, width, height);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+ 16, width, height);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+ width, intermediate_height);
+ aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+ MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+ width, height);
+ }
+}
+
+void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ int i, j;
+
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
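+// Distance-weighted average of the two predictors, rounded by
+// DIST_PRECISION_BITS; the fwd/bck offsets are presumably chosen to sum to
+// 1 << DIST_PRECISION_BITS.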
+void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride,
+ const JNT_COMP_PARAMS *jcp_param) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint8_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+void aom_jnt_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
+
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint8_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t tsum = 0;
+ uint64_t tsse = 0;
+ for (int i = 0; i < h; ++i) {
+ int32_t lsum = 0;
+ for (int j = 0; j < w; ++j) {
+ const int diff = a[j] - b[j];
+ lsum += diff;
+ tsse += (uint32_t)(diff * diff);
+ }
+ tsum += lsum;
+ a += a_stride;
+ b += b_stride;
+ }
+ *sum = tsum;
+ *sse = tsse;
+}
+
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int w, int h) {
+ uint64_t sse;
+ int64_t sum;
+ highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
+ return sse;
+}
+
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)sse_long;
+ *sum = (int)sum_long;
+}
+
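+// The 10- and 12-bit variants below normalize the accumulated values back to
+// an 8-bit scale: sse is scaled down (with rounding) by 2*shift bits and sum
+// by shift bits, where shift = 2 for 10-bit input and 4 for 12-bit input.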
+static void highbd_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static void highbd_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
+
+#define HIGHBD_VAR(W, H) \
+ uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_GET_VAR(S) \
+ void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ } \
+ \
+ void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ } \
+ \
+ void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+ }
+
+#define HIGHBD_MSE(W, H) \
+ uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+void aom_highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++src_ptr;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ }
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
+ \
+ return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
+ \
+ return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
+ \
+ return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ }
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+HIGHBD_VARIANCES(4, 2)
+HIGHBD_VARIANCES(2, 4)
+HIGHBD_VARIANCES(2, 2)
+HIGHBD_VARIANCES(4, 16)
+HIGHBD_VARIANCES(16, 4)
+HIGHBD_VARIANCES(8, 32)
+HIGHBD_VARIANCES(32, 8)
+HIGHBD_VARIANCES(16, 64)
+HIGHBD_VARIANCES(64, 16)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
+ const struct AV1Common *const cm, int mi_row,
+ int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+      // Note: This is mostly a copy from the >=8X8 case in the
+ // build_inter_predictors() function, with some small tweaks.
+ // Some assumptions.
+ const int plane = 0;
+
+ // Get pre-requisites.
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ssx = pd->subsampling_x;
+ const int ssy = pd->subsampling_y;
+ assert(ssx == 0 && ssy == 0);
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+
+ // Calculate subpel_x/y and x/y_step.
+ const int row_start = 0; // Because ss_y is 0.
+ const int col_start = 0; // Because ss_x is 0.
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+ int orig_pos_y = pre_y << SUBPEL_BITS;
+ orig_pos_y += mv->row * (1 << (1 - ssy));
+ int orig_pos_x = pre_x << SUBPEL_BITS;
+ orig_pos_x += mv->col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ const uint8_t *const pre =
+ pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+
+ const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+ pos_x & SCALE_SUBPEL_MASK,
+ pos_y & SCALE_SUBPEL_MASK };
+
+ // Get warp types.
+ const WarpedMotionParams *const wm =
+ &xd->global_motion[mi->ref_frame[ref_num]];
+ const int is_global = is_global_mv_block(mi, wm->wmtype);
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global;
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ // Get convolve parameters.
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ const InterpFilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ // Get the inter predictor.
+ const int build_for_obmc = 0;
+ av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
+ &subpel_params, sf, width, height, &conv_params,
+ filters, &warp_types, mi_x >> pd->subsampling_x,
+ mi_y >> pd->subsampling_y, plane, ref_num, mi,
+ build_for_obmc, xd, cm->allow_warped_motion);
+
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter =
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (int i = 0; i < height; i++) {
+ memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+ comp_pred += width;
+ ref += ref_stride;
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+ NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, CONVERT_TO_BYTEPTR(temp),
+ MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert(
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
+ }
+}
+
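+// Builds the upsampled prediction into comp_pred8 first, then averages it
+// with pred8 in place (rounded).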
+void aom_highbd_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ int i, j;
+
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
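+// Distance-weighted compound average:
+//   comp = ROUND_POWER_OF_TWO(pred * bck_offset + ref * fwd_offset,
+//                             DIST_PRECISION_BITS)
+// with the forward/backward weights taken from jcp_param.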
+void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride,
+ const JNT_COMP_PARAMS *jcp_param) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint16_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+void aom_highbd_jnt_comp_avg_upsampled_pred_c(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint16_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ }
+}
+
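+// Per-pixel 64-level alpha blend of the reference and the prediction:
+// AOM_BLEND_A64(m, a, b) weights a by m and b by (64 - m); invert_mask swaps
+// which of ref/pred receives the mask weight.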
+void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask) {
+ int i, j;
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
+ }
+ comp_pred += width;
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ }
+}
+
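+// As above, but the reference is first upsampled to the requested sub-pel
+// position (reusing comp_pred as scratch) before the masked blend.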
+void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, int subpel_x_q3,
+ int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask,
+ int subpel_search) {
+ if (subpel_x_q3 | subpel_y_q3) {
+ aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ ref = comp_pred;
+ ref_stride = width;
+ }
+ aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
+}
+
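+// Masked sub-pixel variance: a two-pass bilinear filter produces the sub-pel
+// block, which is mask-blended with second_pred and then compared against ref
+// using the plain variance kernel of the same size.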
+#define MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
+ invert_mask); \
+ return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \
+ }
+
+MASK_SUBPIX_VAR(4, 4)
+MASK_SUBPIX_VAR(4, 8)
+MASK_SUBPIX_VAR(8, 4)
+MASK_SUBPIX_VAR(8, 8)
+MASK_SUBPIX_VAR(8, 16)
+MASK_SUBPIX_VAR(16, 8)
+MASK_SUBPIX_VAR(16, 16)
+MASK_SUBPIX_VAR(16, 32)
+MASK_SUBPIX_VAR(32, 16)
+MASK_SUBPIX_VAR(32, 32)
+MASK_SUBPIX_VAR(32, 64)
+MASK_SUBPIX_VAR(64, 32)
+MASK_SUBPIX_VAR(64, 64)
+MASK_SUBPIX_VAR(64, 128)
+MASK_SUBPIX_VAR(128, 64)
+MASK_SUBPIX_VAR(128, 128)
+MASK_SUBPIX_VAR(4, 16)
+MASK_SUBPIX_VAR(16, 4)
+MASK_SUBPIX_VAR(8, 32)
+MASK_SUBPIX_VAR(32, 8)
+MASK_SUBPIX_VAR(16, 64)
+MASK_SUBPIX_VAR(64, 16)
+
+void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ if (!invert_mask)
+ comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
+ else
+ comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ mask += mask_stride;
+ }
+}
+
+void aom_highbd_comp_mask_upsampled_pred(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
+ mask, mask_stride, invert_mask);
+}
+
+#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ }
+
+HIGHBD_MASK_SUBPIX_VAR(4, 4)
+HIGHBD_MASK_SUBPIX_VAR(4, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 16)
+HIGHBD_MASK_SUBPIX_VAR(32, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 32)
+HIGHBD_MASK_SUBPIX_VAR(64, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 128)
+HIGHBD_MASK_SUBPIX_VAR(128, 64)
+HIGHBD_MASK_SUBPIX_VAR(128, 128)
+HIGHBD_MASK_SUBPIX_VAR(4, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 16)
+
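+// OBMC (overlapped block motion compensation) variance: wsrc holds the
+// pre-weighted source samples and mask the per-pixel weights, so the residual
+// is wsrc[j] - pre[j] * mask[j], rounded down by 12 bits (the fixed-point
+// precision carried by the weights).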
+static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ pre += pre_stride;
+ wsrc += w;
+ mask += w;
+ }
+}
+
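+// Variance from the accumulated sums: var = sse - sum^2 / (W * H).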
+#define OBMC_VAR(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define OBMC_SUBPIX_VAR(W, H) \
+ unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
+ W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \
+ }
+
+OBMC_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 4)
+
+OBMC_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 8)
+
+OBMC_VAR(8, 4)
+OBMC_SUBPIX_VAR(8, 4)
+
+OBMC_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 8)
+
+OBMC_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 16)
+
+OBMC_VAR(16, 8)
+OBMC_SUBPIX_VAR(16, 8)
+
+OBMC_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 16)
+
+OBMC_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 32)
+
+OBMC_VAR(32, 16)
+OBMC_SUBPIX_VAR(32, 16)
+
+OBMC_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 32)
+
+OBMC_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 64)
+
+OBMC_VAR(64, 32)
+OBMC_SUBPIX_VAR(64, 32)
+
+OBMC_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 64)
+
+OBMC_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 128)
+
+OBMC_VAR(128, 64)
+OBMC_SUBPIX_VAR(128, 64)
+
+OBMC_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 128)
+
+OBMC_VAR(4, 16)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_VAR(16, 4)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_VAR(8, 32)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_VAR(32, 8)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_VAR(16, 64)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_VAR(64, 16)
+OBMC_SUBPIX_VAR(64, 16)
+
+static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int i, j;
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ pre += pre_stride;
+ wsrc += w;
+ mask += w;
+ }
+}
+
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_OBMC_VAR(W, H) \
+ unsigned int aom_highbd_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ wsrc, mask, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ }
+
+HIGHBD_OBMC_VAR(4, 4)
+HIGHBD_OBMC_SUBPIX_VAR(4, 4)
+
+HIGHBD_OBMC_VAR(4, 8)
+HIGHBD_OBMC_SUBPIX_VAR(4, 8)
+
+HIGHBD_OBMC_VAR(8, 4)
+HIGHBD_OBMC_SUBPIX_VAR(8, 4)
+
+HIGHBD_OBMC_VAR(8, 8)
+HIGHBD_OBMC_SUBPIX_VAR(8, 8)
+
+HIGHBD_OBMC_VAR(8, 16)
+HIGHBD_OBMC_SUBPIX_VAR(8, 16)
+
+HIGHBD_OBMC_VAR(16, 8)
+HIGHBD_OBMC_SUBPIX_VAR(16, 8)
+
+HIGHBD_OBMC_VAR(16, 16)
+HIGHBD_OBMC_SUBPIX_VAR(16, 16)
+
+HIGHBD_OBMC_VAR(16, 32)
+HIGHBD_OBMC_SUBPIX_VAR(16, 32)
+
+HIGHBD_OBMC_VAR(32, 16)
+HIGHBD_OBMC_SUBPIX_VAR(32, 16)
+
+HIGHBD_OBMC_VAR(32, 32)
+HIGHBD_OBMC_SUBPIX_VAR(32, 32)
+
+HIGHBD_OBMC_VAR(32, 64)
+HIGHBD_OBMC_SUBPIX_VAR(32, 64)
+
+HIGHBD_OBMC_VAR(64, 32)
+HIGHBD_OBMC_SUBPIX_VAR(64, 32)
+
+HIGHBD_OBMC_VAR(64, 64)
+HIGHBD_OBMC_SUBPIX_VAR(64, 64)
+
+HIGHBD_OBMC_VAR(64, 128)
+HIGHBD_OBMC_SUBPIX_VAR(64, 128)
+
+HIGHBD_OBMC_VAR(128, 64)
+HIGHBD_OBMC_SUBPIX_VAR(128, 64)
+
+HIGHBD_OBMC_VAR(128, 128)
+HIGHBD_OBMC_SUBPIX_VAR(128, 128)
+
+HIGHBD_OBMC_VAR(4, 16)
+HIGHBD_OBMC_SUBPIX_VAR(4, 16)
+HIGHBD_OBMC_VAR(16, 4)
+HIGHBD_OBMC_SUBPIX_VAR(16, 4)
+HIGHBD_OBMC_VAR(8, 32)
+HIGHBD_OBMC_SUBPIX_VAR(8, 32)
+HIGHBD_OBMC_VAR(32, 8)
+HIGHBD_OBMC_SUBPIX_VAR(32, 8)
+HIGHBD_OBMC_VAR(16, 64)
+HIGHBD_OBMC_SUBPIX_VAR(16, 64)
+HIGHBD_OBMC_VAR(64, 16)
+HIGHBD_OBMC_SUBPIX_VAR(64, 16)
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
new file mode 100644
index 000000000..362da29d3
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_VARIANCE_H_
+#define AOM_AOM_DSP_VARIANCE_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
+
+typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride);
+
+typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *second_pred);
+
+typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
+ int b_stride, int n);
+
+typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *const b_array[],
+ int b_stride, unsigned int *sad_array);
+
+typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*aom_subp_avg_variance_fn_t)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, unsigned int *sse, const uint8_t *second_pred);
+
+typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *second_pred,
+ const JNT_COMP_PARAMS *jcp_param);
+
+typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, unsigned int *sse, const uint8_t *second_pred,
+ const JNT_COMP_PARAMS *jcp_param);
+
+typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
+typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+
+void aom_highbd_comp_mask_upsampled_pred(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int bd, int subpel_search);
+
+typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
+ const int32_t *wsrc,
+ const int32_t *msk);
+typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
+ int pred_stride,
+ const int32_t *wsrc,
+ const int32_t *msk,
+ unsigned int *sse);
+typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
+ const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
+ const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
+
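+// Bundles the SAD/variance kernels for a single block size so callers (for
+// example, motion search) can be parameterized by block size; a hypothetical
+// use would be fn_ptr->vf(src, src_stride, ref, ref_stride, &sse).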
+typedef struct aom_variance_vtable {
+ aom_sad_fn_t sdf;
+ aom_sad_avg_fn_t sdaf;
+ aom_variance_fn_t vf;
+ aom_subpixvariance_fn_t svf;
+ aom_subp_avg_variance_fn_t svaf;
+ aom_sad_multi_d_fn_t sdx4df;
+ aom_masked_sad_fn_t msdf;
+ aom_masked_subpixvariance_fn_t msvf;
+ aom_obmc_sad_fn_t osdf;
+ aom_obmc_variance_fn_t ovf;
+ aom_obmc_subpixvariance_fn_t osvf;
+ aom_jnt_sad_avg_fn_t jsdaf;
+ aom_jnt_subp_avg_variance_fn_t jsvaf;
+} aom_variance_fn_ptr_t;
+
+void aom_highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h);
+
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int w, int h);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_VARIANCE_H_
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
new file mode 100644
index 000000000..5f5bf5f14
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+
+#if HAVE_SSE2
+filter8_1dfunction aom_filter_block1d16_v8_sse2;
+filter8_1dfunction aom_filter_block1d16_h8_sse2;
+filter8_1dfunction aom_filter_block1d8_v8_sse2;
+filter8_1dfunction aom_filter_block1d8_h8_sse2;
+filter8_1dfunction aom_filter_block1d4_v8_sse2;
+filter8_1dfunction aom_filter_block1d4_h8_sse2;
+
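+// No dedicated 4-tap SSE2 kernels here; alias them to the 8-tap versions
+// (presumably the outer taps of a 4-tap filter are zero, so the 8-tap kernel
+// produces the same result).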
+#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2
+#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
+#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
+#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
+#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
+#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
+
+filter8_1dfunction aom_filter_block1d16_v2_sse2;
+filter8_1dfunction aom_filter_block1d16_h2_sse2;
+filter8_1dfunction aom_filter_block1d8_v2_sse2;
+filter8_1dfunction aom_filter_block1d8_h2_sse2;
+filter8_1dfunction aom_filter_block1d4_v2_sse2;
+filter8_1dfunction aom_filter_block1d4_h2_sse2;
+
+// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+
+#if ARCH_X86_64
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
+
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
+
+// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
+
+#endif // ARCH_X86_64
+#endif // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
new file mode 100644
index 000000000..7283c32b8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
@@ -0,0 +1,297 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
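+; convolve_fn <copy|avg>[, highbd]: width-dispatched block copy / average.
+; The optional second parameter selects the high-bitdepth variant, which
+; doubles the width, pointers and strides because samples are 16-bit, and
+; uses pavgw instead of pavgb for the averaging case.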
+%macro convolve_fn 1-2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
+%ifidn %2, highbd
+%define pavg pavgw
+cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ fx, fxs, fy, fys, w, h, bd
+%else
+%define pavg pavgb
+cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+ dst, dst_stride, \
+ fx, fxs, fy, fys, w, h
+%endif
+ mov r4d, dword wm
+%ifidn %2, highbd
+ shl r4d, 1
+ shl srcq, 1
+ shl src_strideq, 1
+ shl dstq, 1
+ shl dst_strideq, 1
+%else
+ cmp r4d, 4
+ je .w4
+%endif
+ cmp r4d, 8
+ je .w8
+ cmp r4d, 16
+ je .w16
+ cmp r4d, 32
+ je .w32
+
+ cmp r4d, 64
+ je .w64
+%ifidn %2, highbd
+ cmp r4d, 128
+ je .w128
+
+.w256:
+ mov r4d, dword hm
+.loop256:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ movu m0, [srcq+128]
+ movu m1, [srcq+128+16]
+ movu m2, [srcq+128+32]
+ movu m3, [srcq+128+48]
+%ifidn %1, avg
+ pavg m0, [dstq+128]
+ pavg m1, [dstq+128+16]
+ pavg m2, [dstq+128+32]
+ pavg m3, [dstq+128+48]
+%endif
+ mova [dstq+128 ], m0
+ mova [dstq+128+16], m1
+ mova [dstq+128+32], m2
+ mova [dstq+128+48], m3
+ movu m0, [srcq+128+64]
+ movu m1, [srcq+128+80]
+ movu m2, [srcq+128+96]
+ movu m3, [srcq+128+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+128+64]
+ pavg m1, [dstq+128+80]
+ pavg m2, [dstq+128+96]
+ pavg m3, [dstq+128+112]
+%endif
+ mova [dstq+128+64], m0
+ mova [dstq+128+80], m1
+ mova [dstq+128+96], m2
+ mova [dstq+128+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop256
+ RET
+%endif
+
+.w128:
+ mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop128
+ RET
+
+.w64:
+ mov r4d, dword hm
+.loop64:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop64
+ RET
+
+.w32:
+ mov r4d, dword hm
+.loop32:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq +16]
+ pavg m2, [dstq+dst_strideq]
+ pavg m3, [dstq+dst_strideq+16]
+%endif
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
+ jnz .loop32
+ RET
+
+.w16:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop16:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+dst_strideq]
+ pavg m2, [dstq+dst_strideq*2]
+ pavg m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop16
+ RET
+
+.w8:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop8:
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movh m4, [dstq]
+ movh m5, [dstq+dst_strideq]
+ movh m6, [dstq+dst_strideq*2]
+ movh m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop8
+ RET
+
+%ifnidn %2, highbd
+.w4:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop4:
+ movd m0, [srcq]
+ movd m1, [srcq+src_strideq]
+ movd m2, [srcq+src_strideq*2]
+ movd m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ movd m4, [dstq]
+ movd m5, [dstq+dst_strideq]
+ movd m6, [dstq+dst_strideq*2]
+ movd m7, [dstq+r6q]
+ pavg m0, m4
+ pavg m1, m5
+ pavg m2, m6
+ pavg m3, m7
+%endif
+ movd [dstq ], m0
+ movd [dstq+dst_strideq ], m1
+ movd [dstq+dst_strideq*2], m2
+ movd [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop4
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+convolve_fn copy
+convolve_fn avg
+convolve_fn copy, highbd
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
new file mode 100644
index 000000000..b6f040791
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -0,0 +1,613 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
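+;HIGH_GET_FILTERS_4 packs the 8 filter taps into paired constants (k0k6, k2k5,
+;k3k4, k1k7), loads the rounding constant 64 (matching the >> 7 shift in
+;HIGH_APPLY_FILTER_4), and derives the clamping range [0, (1 << bps) - 1]
+;from arg(6).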
+%macro HIGH_GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+ punpcklwd xmm1, xmm7
+
+ movdqa k0k6, xmm0
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+ movdqa k1k7, xmm1
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+    punpcklwd   xmm0,  xmm6            ;two rows in one register
+ punpcklwd xmm1, xmm7
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+
+ pmaddwd xmm0, k0k6 ;multiply the filter factors
+ pmaddwd xmm1, k1k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm3, k3k4
+
+ paddd xmm0, xmm1 ;sum
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+
+ paddd xmm0, krd ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+%endm
+
+%macro HIGH_GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm6, xmm7
+ punpckhwd xmm2, xmm5
+ punpckhwd xmm3, xmm4
+
+ movdqa k0k1, xmm0 ;store filter factors on stack
+ movdqa k6k7, xmm6
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1
+ movdqu xmm0, [rsi + %1] ;0
+ movdqu xmm1, [rsi + rax + %1] ;1
+ movdqu xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movdqu xmm7, [rsi + rdx * 2 + %1] ;7
+ movdqu xmm2, [rsi + rax + %1] ;2
+ movdqu xmm3, [rsi + rax * 2 + %1] ;3
+ movdqu xmm4, [rsi + rdx + %1] ;4
+ movdqu xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+ movdqu temp, xmm4
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm1, xmm6
+ punpcklwd xmm6, xmm7
+ punpckhwd xmm1, xmm7
+ movdqa xmm7, xmm2
+ punpcklwd xmm2, xmm5
+ punpckhwd xmm7, xmm5
+
+ movdqu xmm5, temp
+ movdqu temp, xmm4
+ movdqa xmm4, xmm3
+ punpcklwd xmm3, xmm5
+ punpckhwd xmm4, xmm5
+ movdqu xmm5, temp
+
+ pmaddwd xmm0, k0k1
+ pmaddwd xmm5, k0k1
+ pmaddwd xmm6, k6k7
+ pmaddwd xmm1, k6k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm7, k2k5
+ pmaddwd xmm3, k3k4
+ pmaddwd xmm4, k3k4
+
+ paddd xmm0, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+ paddd xmm5, xmm1
+ paddd xmm5, xmm7
+ paddd xmm5, xmm4
+
+ paddd xmm0, krd ;rounding
+ paddd xmm5, krd
+ psrad xmm0, 7 ;shift
+ psrad xmm5, 7
+ packssdw xmm0, xmm5 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi + %2]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void aom_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 0, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 0, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 000000000..7b3fe6419
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,338 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklwd xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm5, rdx
+ movq xmm2, rcx
+ pshufd xmm5, xmm5, 0b
+ movdqa xmm1, xmm5
+ psllw xmm5, xmm2
+ psubw xmm5, xmm1 ;max value (for clamping)
+ pxor xmm2, xmm2 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+    punpcklwd   xmm0, xmm1               ;two rows in one register
+ pmaddwd xmm0, xmm4 ;multiply the filter factors
+
+ paddd xmm0, xmm3 ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, xmm5
+ pmaxsw xmm0, xmm2
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+
+ movq [rdi], xmm0
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%if ARCH_X86_64
+%macro HIGH_GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm6, [rdx] ;load filters
+
+ pshuflw xmm7, xmm6, 11111111b ;k3
+ pshufhw xmm6, xmm6, 0b ;k4
+ psrldq xmm6, 8
+ punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm8, rdx
+ movq xmm5, rcx
+ pshufd xmm8, xmm8, 0b
+ movdqa xmm1, xmm8
+ psllw xmm8, xmm5
+ psubw xmm8, xmm1 ;max value (for clamping)
+ pxor xmm5, xmm5 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+ movdqa xmm6, xmm0
+ punpckhwd xmm6, xmm1
+ punpcklwd xmm0, xmm1
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+
+ paddd xmm6, xmm4 ;rounding
+ paddd xmm0, xmm4 ;rounding
+ psrad xmm6, 7 ;shift
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+ movdqa xmm9, xmm0
+ movdqa xmm6, xmm2
+ punpckhwd xmm9, xmm1
+ punpckhwd xmm6, xmm3
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ pmaddwd xmm9, xmm7
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+ pmaddwd xmm2, xmm7
+
+ paddd xmm9, xmm4 ;rounding
+ paddd xmm6, xmm4
+ paddd xmm0, xmm4
+ paddd xmm2, xmm4
+
+ psrad xmm9, 7 ;shift
+ psrad xmm6, 7
+ psrad xmm0, 7
+ psrad xmm2, 7
+
+ packssdw xmm0, xmm9 ;pack back to word
+ packssdw xmm2, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, xmm8
+ pmaxsw xmm0, xmm5
+ pminsw xmm2, xmm8
+ pmaxsw xmm2, xmm5
+
+%if %1
+ movdqu xmm1, [rdi]
+ movdqu xmm3, [rdi + 16]
+ pavgw xmm0, xmm1
+ pavgw xmm2, xmm3
+%endif
+ movdqu [rdi], xmm0 ;store the result
+ movdqu [rdi + 16], xmm2 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+%endif
+
+SECTION .text
+
+global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
+
+global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+%if ARCH_X86_64
+global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
+sym(aom_highbd_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
new file mode 100644
index 000000000..94b5da171
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,1441 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_ports/mem.h"
+
+#if defined(__clang__)
+#if (__clang_major__ > 0 && __clang_major__ < 3) || \
+ (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang <= 3.3
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc <= 4.6
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
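+// MM256_BROADCASTSI128_SI256(x) copies the 128-bit value x into both lanes of
+// a 256-bit register; the pointer-cast fallback above only works around old
+// clang/gcc releases that expose the broadcast under its legacy _mm_ spelling.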
+
+static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
+ *((uint32_t *)(output_ptr + stride)) =
+ _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+ _mm_storel_epi64((__m128i *)(output_ptr + stride),
+ _mm256_extractf128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+ _mm_store_si128((__m128i *)(output_ptr + stride),
+ _mm256_extractf128_si256(*a, 1));
+}
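+// Each xx_loadu2_* helper above packs two source rows into one 256-bit
+// register (row N in the low 128-bit lane, row N+1 in the high lane) and the
+// matching store helpers write the two lanes back one stride apart, so the
+// filters below produce two output rows per loop iteration.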
+
+static void aom_filter_block1d4_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
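+ // the 7-bit taps (they sum to 128) are pre-halved, presumably so the largest
+ // tap still fits a signed byte in the pack below; the rounding shift at the
+ // end of the loop is therefore 6 bits with a +32 bias instead of the full 7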
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+ filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, process the remaining single row (4 bytes) here
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
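+// Kernel naming follows aom_filter_block1d<width>_<h|v><taps>_avx2: the
+// function above filters a 4-pixel-wide block horizontally with a 4-tap
+// filter built from the middle taps filter[2]..filter[5] (packed bytes 2..5,
+// selected by the 0x5040302 shuffle).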
+
+static void aom_filter_block1d4_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg;
+ __m256i firstFilters, secondFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2;
+ __m256i srcReg32b1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 32 bits
+ firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+ // duplicate only the second 32 bits
+ secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ // filter the source buffer
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, process the remaining single row (4 bytes) here
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, process the remaining single row (8 bytes) here
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, process the remaining single row (8 bytes) here
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d16_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, process the remaining single row (16 bytes) here
+ if (i > 0) {
+ __m256i srcReg1, srcReg12;
+ __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+
+ srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
+ srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
+
+ // filter the source buffer
+ srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+ srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+ srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+ srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt1_1));
+ }
+}
+
+static void aom_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // converting the 16 bit (short) to 8 bit (byte) and have the same data
+ // in both lanes of 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(
+ srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+ // if output_height is odd, process the remaining single row (16 bytes) here
+ if (i > 0) {
+ __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // reading the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ // filter the source buffer
+ srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 =
+ _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i resReg23_34_lo, resReg45_56_lo;
+ __m256i resReglo, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
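+ // srcReg23_34_lo now interleaves row 2 with row 3 in the low lane and row 3
+ // with row 4 in the high lane, the byte pairing _mm256_maddubs_epi16 expects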
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
+static void aom_filter_block1d8_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load 16 bytes 7 times in stride of src_pitch
+ srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+
+ // have each consecutive loads on the same 256 register
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
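+ // the first seven rows are resident at this point; each iteration below
+ // loads only the next two rows (src_pitch * 7 and * 8 from the advancing
+ // src_ptr) and the register rotation at the bottom of the loop slides the
+ // window down by two rows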
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // shift by 6 bit each 16 bit
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
+ // load the last 16 bytes
+ srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
+
+static void aom_filter_block1d16_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+ __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+ __m256i resReglo, resReghi, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+ srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+ resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ resReghi = _mm256_srai_epi16(resReghi, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReghi);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg23_34_hi = srcReg45_56_hi;
+ srcReg4x = srcReg6x;
+ }
+}
+
+static void aom_filter_block1d16_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load 16 bytes 7 times in stride of src_pitch
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+ // have each consecutive loads on the same 256 register
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+ // save
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+ srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+ srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // shift by 6 bit each 16 bit
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b1 = srcReg32b3;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b3 = srcReg32b5;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b5 = srcReg32b7;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ // load the last 16 bytes
+ srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt7 =
+ _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+ srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt7 =
+ _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+ srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
+
+static void aom_filter_block1d4_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i srcReg2345_3456_lo;
+ __m256i resReglo, resReg;
+ __m256i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // converting the 16 bit (short) to 8 bit (byte) and have the
+ // same data in both lanes of 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // have consecutive loads on the same 256 register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+
+ resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contains the first
+ // convolve result and the second lane contains the second convolve
+ // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+    // keep these registers for the next iteration
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
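+
+// An illustrative scalar sketch of the 4-tap vertical path above. The
+// function name and the exact row alignment are assumptions for illustration
+// only; the middle four taps of the 8-tap filter are halved up front (as the
+// SIMD code does with an arithmetic shift), so adding 32 and shifting right
+// by 6 rounds to nearest on the halved scale.
+static void filter_block1d4_v4_scalar_sketch(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  for (uint32_t r = 0; r < output_height; ++r) {
+    for (int c = 0; c < 4; ++c) {
+      int sum = 32;  // rounding bias for the 6-bit shift below
+      for (int k = 0; k < 4; ++k)
+        sum += (filter[k + 2] >> 1) * src_ptr[(r + k + 2) * src_pitch + c];
+      const int v = sum >> 6;
+      output_ptr[r * out_pitch + c] =
+          (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+  }
+}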
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
+#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
+#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
+#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
+#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
+#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
+#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
+// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+#endif // HAVE_AVX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 000000000..325a21b76
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
+// filters only for the 4_h8 convolution
+DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+// filters for 8_h8 and 16_h8
+DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
+
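+// Each 1-D kernel below filters a block that is output_height rows tall and
+// 4, 8 or 16 pixels wide (per its name), reading 8 signed 16-bit taps from
+// filter (an AV1 subpel filter whose taps sum to 128, hence the +64 bias and
+// 7-bit shift used for rounding).
+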
+void aom_filter_block1d4_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i firstFilters, secondFilters, shuffle1, shuffle2;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, srcReg, minReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
+  // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter into the first lane
+ firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bits in the filter into the first lane
+ secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the second lane
+ // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+ firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the fourth 16 bits in the filter into the second lane
+ // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+ secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+ // loading the local filters
+ shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
+ shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // extract the higher half of the lane
+ srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+ srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+
+ minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+ // add and saturate all the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit value down to 8 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+ src_ptr += src_pixels_per_line;
+
+ // save only 4 bytes
+ *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d8_h8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+ __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+ __m128i addFilterReg64, filtersReg, minReg;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
+  // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 128 bit register
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 128 bit register
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+ filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+ filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+ filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+ for (i = 0; i < output_height; i++) {
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+ // filter the source buffer
+ srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
+ srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+ srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+ // add and saturate all the results together
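+    // (the min/max pair fixes the order of the saturating adds so that an
+    // intermediate 16-bit overflow cannot change the result; compare the
+    // add-order note in aom_subpixel_8t_ssse3.asm)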
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit value down to 8 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr += src_pixels_per_line;
+
+ // save only 8 bytes
+ _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+ output_ptr += output_pitch;
+ }
+}
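+
+// An illustrative scalar sketch of the two horizontal kernels above (the
+// helper name and the width parameter are hypothetical). Each output pixel
+// is an 8-tap convolution over the source bytes at offsets -3..+4, rounded
+// with a +64 bias and a 7-bit shift, then clamped to 8 bits.
+static void filter_block1d_h8_scalar_sketch(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
+    int width) {
+  for (uint32_t r = 0; r < output_height; ++r) {
+    for (int c = 0; c < width; ++c) {
+      int sum = 64;  // rounding bias for the 7-bit shift below
+      for (int k = 0; k < 8; ++k) sum += filter[k] * src_ptr[c - 3 + k];
+      const int v = sum >> 7;
+      output_ptr[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_pitch;
+  }
+}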
+
+void aom_filter_block1d8_v8_intrin_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i addFilterReg64, filtersReg, minReg;
+ __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
+ unsigned int i;
+
+ // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+ addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) coefficients to 8-bit (byte) and keep the
+  // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the first 16 bits in the filter
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+ // duplicate only the second 16 bits in the filter
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits in the filter
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits in the filter
+ forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+ // load the first 7 rows of 8 bytes
+ srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+ for (i = 0; i < output_height; i++) {
+ // load the last 8 bytes
+ srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the result together
+ srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+ // merge the result together
+ srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+ srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+ srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+ // add and saturate the results together
+ minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+ srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // pack each 16-bit value down to 8 bits
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+ src_ptr += src_pitch;
+
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
+    // save only the 8-byte convolve result
+ _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+ output_ptr += out_pitch;
+ }
+}
+
+filter8_1dfunction aom_filter_block1d16_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+
+#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
+#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
+#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
+#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
+#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3
+#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3
+
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+
+// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
new file mode 100644
index 000000000..c88fc9ffb
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -0,0 +1,615 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
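+    ;0x0400040 = 64 in each 16-bit lane, the rounding bias for the psraw by 7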
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void aom_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d4_v8_sse2) PRIVATE
+sym(aom_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d8_v8_sse2) PRIVATE
+sym(aom_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d16_v8_sse2) PRIVATE
+sym(aom_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d4_h8_sse2) PRIVATE
+sym(aom_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d8_h8_sse2) PRIVATE
+sym(aom_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(aom_filter_block1d16_h8_sse2) PRIVATE
+sym(aom_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
new file mode 100644
index 000000000..3ca7921b6
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -0,0 +1,870 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
+even_byte_mask: times 8 dw 0x00ff
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffav1) must be followed to avoid overflow.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0
+ ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
+ ; pmaddubsw has a higher latency on some platforms, this might be eased by
+ ; interleaving the instructions.
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ packsswb m4, m4
+ ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+ ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1
+ pshuflw m1, m4, 01010101b ;k2_k3
+ pshuflw m2, m4, 10101010b ;k4_k5
+ pshuflw m3, m4, 11111111b ;k6_k7
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ mova k0k1, m0
+ mova k2k3, m1
+ mova k4k5, m2
+ mova k6k7, m3
+%if ARCH_X86_64
+ %define krd m12
+ %define tmp0 [rsp + 16*4]
+ %define tmp1 [rsp + 16*5]
+ mova krd, [GLOBAL(pw_64)]
+%else
+ %define krd [rsp + 16*4]
+%if CONFIG_PIC=0
+ mova m6, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m6, m6 ;all ones
+ psrlw m6, 15
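+    ;each word is now 1; the psllw below turns it into 64 (pw_64)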
+ psllw m6, 6 ;aka pw_64
+%endif
+ mova krd, m6
+%endif
+%endm
+
+;-------------------------------------------------------------------------------
+%if ARCH_X86_64
+ %define LOCAL_VARS_SIZE_H4 0
+%else
+ %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
+%macro SUBPIX_HFILTER4 1
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ packsswb m4, m4
+%if ARCH_X86_64
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
+ mova krd, [GLOBAL(pw_64)]
+ pshuflw k0k1k4k5, m4, 0b ;k0_k1
+ pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+ pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
+ pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
+%else
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
+ pshuflw m6, m4, 0b ;k0_k1
+ pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
+ pshuflw m7, m4, 01010101b ;k2_k3
+ pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+ mova m1, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m1, m1 ;all ones
+ psrlw m1, 15
+ psllw m1, 6 ;aka pw_64
+%endif
+ mova k0k1k4k5, m6
+ mova k2k3k6k7, m7
+ mova krd, m1
+%endif
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m4, [srcq - 3]
+ movu m5, [srcq + sstrideq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ punpckhbw m3, m5, m5
+ punpcklbw m5, m5
+ palignr m0, m1, m4, 1
+ pmaddubsw m0, k0k1k4k5
+ palignr m1, m4, 5
+ pmaddubsw m1, k2k3k6k7
+ palignr m2, m3, m5, 1
+ pmaddubsw m2, k0k1k4k5
+ palignr m3, m5, 5
+ pmaddubsw m3, k2k3k6k7
+ punpckhqdq m4, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m5, m1, m3
+ punpcklqdq m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ movd m5, [dstq + dstrideq]
+%endif
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+ punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+ psrldq m1, m0, 4
+
+%ifidn %1, h8_avg
+ pavgb m0, m4
+ pavgb m1, m5
+%endif
+ movd [dstq], m0
+ movd [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m4, [srcq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ palignr m0, m1, m4, 1
+ palignr m1, m4, 5
+ pmaddubsw m0, k0k1k4k5
+ pmaddubsw m1, k2k3k6k7
+ psrldq m2, m0, 8
+ psrldq m3, m1, 8
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ pavgb m0, m4
+%endif
+ movd [dstq], m0
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER8 1
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m0, [srcq - 3]
+ movu m4, [srcq + sstrideq - 3]
+ punpckhbw m1, m0, m0
+ punpcklbw m0, m0
+ palignr m5, m1, m0, 13
+ pmaddubsw m5, k6k7
+ palignr m2, m1, m0, 5
+ palignr m3, m1, m0, 9
+ palignr m1, m0, 1
+ pmaddubsw m1, k0k1
+ punpckhbw m6, m4, m4
+ punpcklbw m4, m4
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+
+ palignr m7, m6, m4, 13
+ palignr m0, m6, m4, 5
+ pmaddubsw m7, k6k7
+ paddsw m1, m3
+ paddsw m2, m5
+ paddsw m1, m2
+%ifidn %1, h8_avg
+ movh m2, [dstq]
+ movhps m2, [dstq + dstrideq]
+%endif
+ palignr m5, m6, m4, 9
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
+ pmaddubsw m6, k0k1
+ paddsw m1, krd
+ pmaddubsw m5, k4k5
+ psraw m1, 7
+ paddsw m0, m7
+ paddsw m6, m5
+ paddsw m6, m0
+ paddsw m6, krd
+ psraw m6, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+ punpcklbw m4, m3
+ punpcklbw m5, m3
+ paddsw m1, m4
+ paddsw m6, m5
+%endif
+ packuswb m1, m6
+%ifidn %1, h8_avg
+ pavgb m1, m2
+%endif
+ movh [dstq], m1
+ movhps [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m0, [srcq - 3]
+ punpckhbw m3, m0, m0
+ punpcklbw m0, m0
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 5
+ palignr m4, m3, m0, 13
+ palignr m3, m0, 9
+ pmaddubsw m1, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+ pmaddubsw m4, k6k7
+ paddsw m1, m3
+ paddsw m4, m2
+ paddsw m1, m4
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, h8_add_src
+ pxor m6, m6
+ movu m5, [srcq]
+ punpcklbw m5, m6
+ paddsw m1, m5
+%endif
+ packuswb m1, m1
+%ifidn %1, h8_avg
+ movh m0, [dstq]
+ pavgb m1, m0
+%endif
+ movh [dstq], m1
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER16 1
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+.loop:
+ prefetcht0 [srcq + 2 * sstrideq -3]
+
+ movu m0, [srcq - 3]
+ movu m4, [srcq - 2]
+ pmaddubsw m0, k0k1
+ pmaddubsw m4, k0k1
+ movu m1, [srcq - 1]
+ movu m5, [srcq + 0]
+ pmaddubsw m1, k2k3
+ pmaddubsw m5, k2k3
+ movu m2, [srcq + 1]
+ movu m6, [srcq + 2]
+ pmaddubsw m2, k4k5
+ pmaddubsw m6, k4k5
+ movu m3, [srcq + 3]
+ movu m7, [srcq + 4]
+ pmaddubsw m3, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ paddsw m4, m5
+ paddsw m0, krd
+ paddsw m4, krd
+ psraw m0, 7
+ psraw m4, 7
+%ifidn %1, h8_add_src
+%if ARCH_X86=1 && CONFIG_PIC=1
+ pcmpeqb m2, m2 ;all ones
+ psrlw m2, 8 ;even_byte_mask
+%else
+ mova m2, [GLOBAL(even_byte_mask)]
+%endif
+ movu m5, [srcq]
+ mova m7, m5
+ pand m5, m2
+ psrlw m7, 8
+ paddsw m0, m5
+ paddsw m4, m7
+%endif
+ packuswb m0, m0
+ packuswb m4, m4
+ punpcklbw m0, m4
+%ifidn %1, h8_avg
+ pavgb m0, [dstq]
+%endif
+ lea srcq, [srcq + sstrideq]
+ mova [dstq], m0
+ lea dstq, [dstq + dstrideq]
+ dec heightd
+ jnz .loop
+ REP_RET
+%endm
+
+INIT_XMM ssse3
+SUBPIX_HFILTER16 h8
+SUBPIX_HFILTER8 h8
+SUBPIX_HFILTER4 h8
+
+;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect cpu type and choose the code with better performance.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
+
+%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+ %define NUM_GENERAL_REG_USED 9
+%else
+ %define NUM_GENERAL_REG_USED 6
+%endif
+
+%macro SUBPIX_VFILTER 2
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%ifidn %2, 8
+ %define movx movh
+%else
+ %define movx movd
+%endif
+
+ dec heightd
+
+%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ ;Do two rows at once
+ movx m0, [srcq ] ;A
+ movx m1, [src1q ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4
+ paddsw m2, m6
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+
+ paddsw m3, m7
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, v8_add_src
+ movu m4, [src1q]
+ punpcklbw m4, m6
+ paddsw m1, m4
+%endif
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
+
+%ifidn %1, v8_avg
+ movx m2, [dstq]
+ pavgb m0, m2
+%endif
+ movx [dstq], m0
+ add dstq, dst_stride
+%ifidn %1, v8_avg
+ movx m3, [dstq]
+ pavgb m1, m3
+%endif
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%else
+ ; ARCH_X86_64
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m2, [srcq] ;C
+ movx m3, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m4, [srcq] ;E
+ movx m5, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m6, [srcq] ;G
+ punpcklbw m0, m1 ;A B
+ punpcklbw m1, m2 ;A B next iter
+ punpcklbw m2, m3 ;C D
+ punpcklbw m3, m4 ;C D next iter
+ punpcklbw m4, m5 ;E F
+ punpcklbw m5, m6 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ movx m7, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m14, [srcq] ;H next iter
+ punpcklbw m6, m7 ;G H
+ punpcklbw m7, m14 ;G H next iter
+ pmaddubsw m8, m0, k0k1
+ pmaddubsw m9, m1, k0k1
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m10, m2, k2k3
+ pmaddubsw m11, m3, k2k3
+ mova m2, m4
+ mova m3, m5
+ pmaddubsw m4, k4k5
+ pmaddubsw m5, k4k5
+ paddsw m8, m4
+ paddsw m9, m5
+ mova m4, m6
+ mova m5, m7
+ pmaddubsw m6, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m10, m6
+ paddsw m11, m7
+ paddsw m8, m10
+ paddsw m9, m11
+ mova m6, m14
+ paddsw m8, krd
+ paddsw m9, krd
+ psraw m8, 7
+ psraw m9, 7
+%ifidn %2, 4
+ packuswb m8, m8
+ packuswb m9, m9
+%else
+ packuswb m8, m9
+%endif
+
+%ifidn %1, v8_avg
+ movx m7, [dstq]
+%ifidn %2, 4
+ movx m10, [dstq + dstrideq]
+ pavgb m9, m10
+%else
+ movhpd m7, [dstq + dstrideq]
+%endif
+ pavgb m8, m7
+%endif
+ movx [dstq], m8
+%ifidn %2, 4
+ movx [dstq + dstrideq], m9
+%else
+ movhpd [dstq + dstrideq], m8
+%endif
+
+ lea dstq, [dstq + dstrideq * 2 ]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m7, [srcq + sstrideq] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ pmaddubsw m6, k6k7
+ paddsw m0, m4
+ paddsw m2, m6
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%endif ; ARCH_X86_64
+
+.done:
+ REP_RET
+
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER16 1
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ lea src1q, [srcq + sstrideq]
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ movh m0, [srcq ] ;A
+ movh m1, [src1q ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
+
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
+ paddsw m7, m2
+ paddsw m3, m1
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
+ mova m5, m4
+ punpcklbw m4, m6
+ punpckhbw m5, m6
+ paddsw m0, m4
+ paddsw m3, m5
+%endif
+ packuswb m0, m3
+
+ add srcq, sstrideq
+ add src1q, sstrideq
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ REP_RET
+
+%else
+ ; ARCH_X86_64
+ dec heightd
+
+ movu m1, [srcq ] ;A
+ movu m3, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m0, m1, m3 ;A B
+ punpckhbw m1, m3 ;A B
+ movu m5, [srcq] ;C
+ punpcklbw m2, m3, m5 ;A B next iter
+ punpckhbw m3, m5 ;A B next iter
+ mova tmp0, m2 ;store to stack
+ mova tmp1, m3 ;store to stack
+ movu m7, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m4, m5, m7 ;C D
+ punpckhbw m5, m7 ;C D
+ movu m9, [srcq] ;E
+ punpcklbw m6, m7, m9 ;C D next iter
+ punpckhbw m7, m9 ;C D next iter
+ movu m11, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m8, m9, m11 ;E F
+ punpckhbw m9, m11 ;E F
+ movu m2, [srcq] ;G
+ punpcklbw m10, m11, m2 ;E F next iter
+ punpckhbw m11, m2 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ pmaddubsw m13, m0, k0k1
+ mova m0, m4
+ pmaddubsw m14, m8, k4k5
+ pmaddubsw m15, m4, k2k3
+ mova m4, m8
+ paddsw m13, m14
+ movu m3, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m14, m2, m3 ;G H
+ mova m8, m14
+ pmaddubsw m14, k6k7
+ paddsw m15, m14
+ paddsw m13, m15
+ paddsw m13, krd
+ psraw m13, 7
+
+ pmaddubsw m14, m1, k0k1
+ pmaddubsw m1, m9, k4k5
+ pmaddubsw m15, m5, k2k3
+ paddsw m14, m1
+ mova m1, m5
+ mova m5, m9
+ punpckhbw m2, m3 ;G H
+ mova m9, m2
+ pmaddubsw m2, k6k7
+ paddsw m15, m2
+ paddsw m14, m15
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m13, m14
+%ifidn %1, v8_avg
+ pavgb m13, [dstq]
+%endif
+ mova [dstq], m13
+
+ ; next iter
+ pmaddubsw m15, tmp0, k0k1
+ pmaddubsw m14, m10, k4k5
+ pmaddubsw m13, m6, k2k3
+ paddsw m15, m14
+ mova tmp0, m6
+ mova m6, m10
+ movu m2, [srcq] ;G next iter
+ punpcklbw m14, m3, m2 ;G H next iter
+ mova m10, m14
+ pmaddubsw m14, k6k7
+ paddsw m13, m14
+ paddsw m15, m13
+ paddsw m15, krd
+ psraw m15, 7
+
+ pmaddubsw m14, tmp1, k0k1
+ mova tmp1, m7
+ pmaddubsw m13, m7, k2k3
+ mova m7, m11
+ pmaddubsw m11, k4k5
+ paddsw m14, m11
+ punpckhbw m3, m2 ;G H next iter
+ mova m11, m3
+ pmaddubsw m3, k6k7
+ paddsw m13, m3
+ paddsw m14, m13
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m15, m14
+%ifidn %1, v8_avg
+ pavgb m15, [dstq + dstrideq]
+%endif
+ mova [dstq + dstrideq], m15
+ lea dstq, [dstq + dstrideq * 2]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m3, [srcq + sstrideq] ;H
+ punpcklbw m6, m2, m3 ;G H
+ punpckhbw m2, m3 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m1, k0k1
+ pmaddubsw m4, k2k3
+ pmaddubsw m5, k2k3
+ pmaddubsw m8, k4k5
+ pmaddubsw m9, k4k5
+ pmaddubsw m6, k6k7
+ pmaddubsw m2, k6k7
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m4, m6
+ paddsw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m0, krd
+ paddsw m1, krd
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+
+.done:
+ REP_RET
+
+%endif ; ARCH_X86_64
+
+%endm
+
+INIT_XMM ssse3
+SUBPIX_VFILTER16 v8
+SUBPIX_VFILTER v8, 8
+SUBPIX_VFILTER v8, 4
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
new file mode 100644
index 000000000..d0b4b2839
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklqdq xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ pxor xmm2, xmm2
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpcklbw xmm0, xmm2 ;unpack to word
+ pmullw xmm0, xmm4 ;multiply the filter factors
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+
+ paddsw xmm0, xmm3 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+
+ pshuflw xmm6, xmm7, 11111111b ;k3
+ pshufhw xmm7, xmm7, 0b ;k4
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ pxor xmm5, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ pmullw xmm2, xmm6
+ pmullw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm3
+
+ paddsw xmm0, xmm4 ;rounding
+ paddsw xmm2, xmm4
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+global sym(aom_filter_block1d4_v2_sse2) PRIVATE
+sym(aom_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_v2_sse2) PRIVATE
+sym(aom_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_v2_sse2) PRIVATE
+sym(aom_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_h2_sse2) PRIVATE
+sym(aom_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_h2_sse2) PRIVATE
+sym(aom_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_h2_sse2) PRIVATE
+sym(aom_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
new file mode 100644
index 000000000..59edc49a9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,267 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
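+    ;0x0100 = 256 in each word: pmulhrsw by 256 computes (x + 64) >> 7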
+
+ movdqa xmm3, [rdx] ;load filters
+ psrldq xmm3, 6
+ packsswb xmm3, xmm3
+ pshuflw xmm3, xmm3, 0b ;k3_k4
+
+ movd xmm2, ecx ;rounding_shift
+ pshufd xmm2, xmm2, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm3
+
+ pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm7, [rdx] ;load filters
+ psrldq xmm7, 6
+ packsswb xmm7, xmm7
+ pshuflw xmm7, xmm7, 0b ;k3_k4
+ punpcklwd xmm7, xmm7
+
+ movd xmm6, ecx ;rounding_shift
+ pshufd xmm6, xmm6, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack back to byte
+
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm2, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ pmulhrsw xmm2, xmm6
+ packuswb xmm0, xmm2 ;pack back to byte
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
+sym(aom_filter_block1d4_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_v2_ssse3) PRIVATE
+sym(aom_filter_block1d8_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_v2_ssse3) PRIVATE
+sym(aom_filter_block1d16_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
+sym(aom_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d8_h2_ssse3) PRIVATE
+sym(aom_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
+sym(aom_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 000000000..4f5e3f8c1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, 0, w, h, 0, 0);
+}
+
+void aom_highbd_blend_a64_hmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
+ src1_8, src1_stride, mask, 0, w, h, 0, 0,
+ bd);
+}
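
As the comment above notes, the hmask wrappers simply reuse the 2-D mask kernels with a mask stride of 0, so every row of the block is weighted by the same 1-D mask. A rough scalar sketch of the blend they ultimately perform (illustrative only; AOM_BLEND_A64_MAX_ALPHA is 64 and AOM_BLEND_A64_ROUND_BITS is 6):

/* Scalar reference for the horizontal-mask a64 blend (lowbd case). */
static void blend_a64_hmask_scalar(unsigned char *dst, int dst_stride,
                                   const unsigned char *src0, int src0_stride,
                                   const unsigned char *src1, int src1_stride,
                                   const unsigned char *mask, int w, int h) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int m = mask[j];  /* same row of weights for every i */
      dst[i * dst_stride + j] =
          (unsigned char)((m * src0[i * src0_stride + j] +
                           (64 - m) * src1[i * src1_stride + j] + 32) >> 6);
    }
  }
}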
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
new file mode 100644
index 000000000..67fb4d32b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -0,0 +1,900 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+#include <immintrin.h> // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w16_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+ int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ __m256i res = _mm256_packus_epi16(res0, res0);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+}
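
The d16 kernels blend 16-bit convolve intermediates (CONV_BUF_TYPE) rather than pixels: madd produces m*src0 + (64-m)*src1 in 32 bits, the compound-prediction offset is removed, and the result is shifted and saturated back to 8 bits. A scalar sketch of the per-pixel arithmetic of blend_a64_d16_mask_w16_avx2 (illustrative only; round_offset and shift are the values prepared by aom_lowbd_blend_a64_d16_mask_avx2 further down, and the intermediate packs/packus saturation is folded into the final clamp):

/* Scalar model of one output pixel of the d16 blend. */
static unsigned char blend_d16_pixel(int s0, int s1, int m,
                                     int round_offset, int shift) {
  int res = (m * s0 + (64 - m) * s1 - round_offset) >> shift;
  if (res < 0) res = 0;      /* packs_epi32 / packus_epi16 saturation */
  if (res > 255) res = 255;
  return (unsigned char)res;
}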
+
+static INLINE void blend_a64_d16_mask_w32_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+ const __m256i *v_maxval, int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s0_1 = yy_loadu_256(src0 + 16);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ const __m256i s1_1 = yy_loadu_256(src1 + 16);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+ _mm256_unpacklo_epi16(*m1, max_minus_m1));
+ __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+ _mm256_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+ __m256i res = _mm256_packus_epi16(res0, res1);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm256_storeu_si256((__m256i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m = xx_loadu_128(mask);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m);
+
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m = yy_loadu_256(mask + j);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
+ const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m256i m_i00 = yy_loadu_256(mask);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
+
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
+ const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
+
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+ const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
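
When both dimensions are subsampled (subw == 1, subh == 1) the mask is stored at twice the block resolution, so each output weight is the rounded average of a 2x2 group: adds_epu8 sums the two rows, maddubs against a vector of ones sums the horizontal pairs, and the final "+2, >>2" rounds. A scalar sketch of that collapse (illustrative only; the helper name is ours):

/* One downsampled weight for the subw1/subh1 kernels above. */
static int collapse_mask_2x2(const unsigned char *mask, int mask_stride,
                             int row, int col) {
  const int m00 = mask[2 * row * mask_stride + 2 * col];
  const int m01 = mask[2 * row * mask_stride + 2 * col + 1];
  const int m10 = mask[(2 * row + 1) * mask_stride + 2 * col];
  const int m11 = mask[(2 * row + 1) * mask_stride + 2 * col + 1];
  return (m00 + m01 + m10 + m11 + 2) >> 2;  /* rounded 2x2 average */
}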
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+ const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
+
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + j);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
+
+ const __m256i m_ac =
+ _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
+ const __m256i m1 =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
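
With only one dimension subsampled (subw1/subh0 or subw0/subh1) two mask values collapse into one weight. Horizontally that is maddubs against ones followed by avg_epu16 with zero; vertically it is adds_epu8 of the two rows followed by avg_epu8 with zero. Either way the result is the rounding average sketched below (illustrative only):

/* Half-rate mask collapse used by the subw1/subh0 and subw0/subh1 kernels. */
static int collapse_mask_pair(int m_a, int m_b) {
  return (m_a + m_b + 1) >> 1;  /* matches avg_epu8 / avg_epu16 rounding */
}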
+
+void aom_lowbd_blend_a64_d16_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+ const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ }
+}
+
+static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
+ const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
+ const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
+ const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
+
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
+ const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
+ return v_res;
+}
+
+static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = yy_loadu_256(src0);
+ const __m256i v_s1_b = yy_loadu_256(src1);
+
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m256i v_p1_w =
+ _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
+ const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+}
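
blend_16_u8_avx2 and blend_32_u8_avx2 interleave the bytes of src0 with src1 and the weights m0 with m1 so that a single maddubs yields m0*s0 + m1*s1 per pixel in 16 bits, after which yy_roundn_epu16 applies the rounded 6-bit shift. Per pixel this reduces to the following (illustrative only; the helper name is ours):

/* Scalar equivalent of one blended byte, with m1 = 64 - m0. */
static unsigned char blend_u8_pixel(unsigned char s0, unsigned char s1,
                                    int m0) {
  const int m1 = 64 - m0;  /* AOM_BLEND_A64_MAX_ALPHA - m0 */
  return (unsigned char)((m0 * s0 + m1 * s1 + 32) >> 6);
}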
+
+static INLINE void blend_a64_mask_sx_sy_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ const __m256i v_ral_b = yy_loadu_256(mask);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+
+ const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+ AOM_BLEND_A64_ROUND_BITS);
+
+ xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
+ const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rvsbh_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+}
+
+static INLINE void blend_a64_mask_sx_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
+ do {
+ const __m256i v_rl_b = yy_loadu_256(mask);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
+
+ const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
+ const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+ AOM_BLEND_A64_ROUND_BITS);
+
+ xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
+ const __m256i v_ah_b =
+ _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
+
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+}
+
+static INLINE void blend_a64_mask_sy_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ra_b = yy_loadu_256(mask + c);
+ const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
+ const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+}
+
+static INLINE void blend_a64_mask_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_m0_b = yy_loadu_256(mask + c);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ do {
+ const __m128i v_m0_b = xx_loadu_128(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ default:
+ blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+}
+
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subx, int suby) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subx, suby);
+ } else {
+ if (subx & suby) {
+ blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (subx) {
+ blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (suby) {
+ blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else {
+ blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+ }
+}
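
The dispatcher above falls back to aom_blend_a64_mask_c whenever w or h is not a multiple of 4 (the "(h | w) & 3" test) and otherwise picks the kernel matching the mask subsampling flags. A hypothetical direct call, shown only to make the parameter order concrete (in libaom this entry point is normally reached through the aom_blend_a64_mask RTCD dispatch; the example function, buffer names, and quoted prototype location are ours):

#include <stdint.h>

/* Prototype as generated in config/aom_dsp_rtcd.h. */
void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
                             const uint8_t *src0, uint32_t src0_stride,
                             const uint8_t *src1, uint32_t src1_stride,
                             const uint8_t *mask, uint32_t mask_stride, int w,
                             int h, int subx, int suby);

/* Blend two 16x16 8-bit predictors with a full-resolution 64-alpha mask. */
static void example_blend_16x16(uint8_t *dst, const uint8_t *pred0,
                                const uint8_t *pred1, const uint8_t *mask64) {
  aom_blend_a64_mask_avx2(dst, 16, pred0, 16, pred1, 16, mask64, 16,
                          /*w=*/16, /*h=*/16, /*subx=*/0, /*suby=*/0);
}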
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
new file mode 100644
index 000000000..9d6b4c2f7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -0,0 +1,1109 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h) {
+ (void)w;
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h) {
+ (void)w;
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_m0_b = xx_loadu_128(mask + c);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ra_b = xx_loadu_128(mask + c);
+ const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
+ const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
+ const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+ const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+ const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+ const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+ const __m128i v_rvsbl_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
+ const __m128i v_rvsbh_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
+ const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
+ const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subx, int suby) {
+ typedef void (*blend_fn)(
+ uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h);
+
+ // Dimensions are: width_index X subx X suby
+ static const blend_fn blend[3][2][2] = {
+ { // w % 16 == 0
+ { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
+ { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
+ { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
+ { // w == 8
+ { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
+ { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subx, suby);
+ } else {
+ blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
+ src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+}
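
The "(w >> 2) & 3" expression selects the first index of the dispatch table above: with w a power of two of at least 4, it yields 1 for w == 4, 2 for w == 8, and 0 for every larger width (all of which are multiples of 16), matching the row order given in the table's comment. A one-line sketch (illustrative only):

/* Width index used by aom_blend_a64_mask_sse4_1's dispatch table. */
static int blend_width_index(int w) { return (w >> 2) & 3; }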
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b10);
+}
+
+static void blend_a64_mask_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_m0_b = xx_loadl_64(mask + c);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
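
The bn_* kernels in this part of the file handle high bit-depth blocks through the blend_4_b10 / blend_8_b10 / blend_4_b12 / blend_8_b12 helpers pulled in from blend_sse4.h. The weighting is the same 64-alpha blend as the 8-bit path, only applied to 16-bit samples; the b10/b12 split exists so the SIMD code can keep intermediate products within 16 bits for 10-bit input while using wider arithmetic for 12-bit input. A scalar sketch of one blended sample (illustrative only; the helper name is ours):

#include <stdint.h>

/* Scalar form of the high-bitdepth a64 blend. */
static uint16_t blend_a64_highbd_pixel(uint16_t s0, uint16_t s1, int m) {
  return (uint16_t)((m * s0 + (64 - m) * s1 + 32) >> 6);
}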
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
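+    // Vertical 2:1 sub-sampling: take the rounded byte average of the two
+    // mask rows, then widen the result to 16-bit alpha values.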
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadl_64(mask + c);
+ const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
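+    // 2x2 sub-sampling: add the two mask rows bytewise (values are at most
+    // AOM_BLEND_A64_MAX_ALPHA, so the sum cannot overflow a byte), gather the
+    // even and odd columns as 16-bit words, and reduce the four-sample sum to
+    // a rounded average with a rounding shift by 2.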
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
+ 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ uint32_t mask_stride, int w, int h,
+ int subx, int suby, int bd) {
+ typedef void (*blend_fn)(
+ uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h);
+
+ // Dimensions are: bd_index X width_index X subx X suby
+ static const blend_fn blend[2][2][2][2] = {
+ { // bd == 8 or 10
+ { // w % 8 == 0
+ { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
+ { blend_a64_mask_b10_sx_w8n_sse4_1,
+ blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
+ { blend_a64_mask_b10_sx_w4_sse4_1,
+ blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
+ { // bd == 12
+ { // w % 8 == 0
+ { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
+ { blend_a64_mask_b12_sx_w8n_sse4_1,
+ blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
+ { blend_a64_mask_b12_sx_w4_sse4_1,
+ blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, mask_stride, w, h, subx,
+ suby, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
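+    // bd == 8 and bd == 10 share the b10 kernels (their products still fit
+    // in 16 bits); (w >> 2) & 1 is non-zero only for w == 4, since w is a
+    // power of two >= 4 here, so it selects the 4-wide kernels.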
+ blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, w, h);
+ }
+}
+
+static INLINE void blend_a64_d16_mask_w16_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
+ const __m128i *v_maxval, int shift) {
+ const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
+ const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
+ const __m128i s0_0 = xx_loadu_128(src0);
+ const __m128i s0_1 = xx_loadu_128(src0 + 8);
+ const __m128i s1_0 = xx_loadu_128(src1);
+ const __m128i s1_1 = xx_loadu_128(src1 + 8);
+ __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
+ _mm_unpacklo_epi16(*m0, max_minus_m0));
+ __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
+ _mm_unpackhi_epi16(*m0, max_minus_m0));
+ __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
+ _mm_unpacklo_epi16(*m1, max_minus_m1));
+ __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
+ _mm_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
+ const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
+ const __m128i res = _mm_packus_epi16(res0, res1);
+
+ _mm_storeu_si128((__m128i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m = xx_loadu_128(mask + j);
+ const __m128i m0 = _mm_cvtepu8_epi16(m);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
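+      // Average each 2x2 mask block: saturating row add, pairwise horizontal
+      // add against a vector of ones, then a rounding shift by 2.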
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
+ const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
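+      // Horizontal-only sub-sampling: sum adjacent mask bytes via maddubs,
+      // then average against zero, i.e. a rounding shift right by 1.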
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
+ const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
+ const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
+ const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+void aom_lowbd_blend_a64_d16_mask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
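+  // The CONV_BUF_TYPE sources carry the compound-prediction offset;
+  // round_offset folds that offset together with the rounding bias so the
+  // kernels can remove both with a single subtract-and-shift before packing
+  // to 8 bits.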
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 000000000..064910232
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0,
+ uint32_t src0_stride,
+ const uint8_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
+ const __m128i v_resh_w =
+ blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h);
+
+ // Dimension: width_index
+ static const blend_fn blend[9] = {
+ blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
+ aom_blend_a64_vmask_c, // w == 1
+ aom_blend_a64_vmask_c, // w == 2
+ NULL, // INVALID
+ blend_a64_vmask_w4_sse4_1, // w == 4
+ NULL, // INVALID
+ NULL, // INVALID
+ NULL, // INVALID
+ blend_a64_vmask_w8_sse4_1, // w == 8
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
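+  // w is a power of two, so w & 0xf is 0 for any multiple of 16 and otherwise
+  // equal to w itself (1, 2, 4 or 8), indexing the table above directly.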
+ blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
+ h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, blend_4_b10);
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h, blend_8_b10);
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h, blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_highbd_blend_a64_vmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h);
+
+ // Dimensions are: bd_index X width_index
+ static const blend_fn blend[2][2] = {
+ {
+ // bd == 8 or 10
+ blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b10_w4_sse4_1, // w == 4
+ },
+ {
+ // bd == 12
+ blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b12_w4_sse4_1, // w == 4
+ }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
new file mode 100644
index 000000000..c071fdcfc
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w4_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+ const __m128i s0 = xx_loadl_64(src0);
+ const __m128i s1 = xx_loadl_64(src1);
+ const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
+ const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
+ const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
+ const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
+ const __m128i res_d = _mm_srai_epi32(res_c, shift);
+ const __m128i res_e = _mm_packs_epi32(res_d, res_d);
+ const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+ xx_storel_32(dst, res);
+}
+
+static INLINE void blend_a64_d16_mask_w8_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+ const __m128i s0 = xx_loadu_128(src0);
+ const __m128i s1 = xx_loadu_128(src1);
+ __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
+ _mm_unpacklo_epi16(*m, max_minus_m));
+ __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
+ _mm_unpackhi_epi16(*m, max_minus_m));
+ res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
+ res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
+ const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
+ const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+ _mm_storel_epi64((__m128i *)(dst), res);
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m0 = xx_loadl_32(mask);
+ const __m128i m = _mm_cvtepu8_epi16(m0);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m0 = xx_loadl_64(mask);
+ const __m128i m = _mm_cvtepu8_epi16(m0);
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadu_128(mask);
+ const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+ const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadu_128(mask);
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+ const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
new file mode 100644
index 000000000..8d9b32510
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+static const uint8_t g_blend_a64_mask_shuffle[32] = {
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
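+// All of the kernels below compute the same per-pixel blend; roughly, in
+// scalar form:
+//   dst[i] = ROUND_POWER_OF_TWO(v_m0 * src0[i] + v_m1 * src1[i],
+//                               AOM_BLEND_A64_ROUND_BITS)
+// where the callers pass v_m1 = AOM_BLEND_A64_MAX_ALPHA - v_m0.
+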
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+ return v_res;
+}
+
+static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+ return v_res;
+}
+
+static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadu_128(src0);
+ const __m128i v_s1_b = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+}
+
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w);
+
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ // Interleave
+ const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+ // Scale
+ const __m128i v_ssum_d =
+ _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ // Interleave
+ const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+ const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+ const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+ // Scale
+ const __m128i v_ssuml_d =
+ _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
+ const __m128i v_ssumh_d =
+ _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
new file mode 100644
index 000000000..96fe4ebb6
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+// Note: in and out may point to the same buffer (in-place transpose is fine).
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+  // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+  // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+  // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
new file mode 100644
index 000000000..3e19682cd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height, const int16_t *filter);
+
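+// FUN_CONV_1D selects between the 4-tap, 8-tap and 2-tap (bilinear) kernel
+// variants based on which filter taps are non-zero, and falls back to the C
+// implementation for any remaining width narrower than 4 pixels.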
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ (void)filter_x; \
+ (void)x_step_q4; \
+ (void)filter_y; \
+ (void)y_step_q4; \
+ assert((-128 <= filter[3]) && (filter[3] <= 127)); \
+ assert(step_q4 == 16); \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ if (w) { \
+ aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+ }
+
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter, int bd);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_highbd_convolve8_##name##_##opt( \
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ aom_highbd_convolve8_##name##_c( \
+ CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \
+ dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } \
+ }
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_H_
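FUN_CONV_1D and HIGH_FUN_CONV_1D above expand into width-tiling dispatchers: a zero pattern of filter[0]|filter[1]|filter[6]|filter[7] together with a non-zero filter[2]|filter[5] selects the 4-tap kernels, any non-zero tap among filter[0..2] selects the 8-tap kernels, everything else uses the 2-tap kernels, and the row is then processed in 16/8/4-pixel columns with any leftover width handed to the C reference function. A rough scalar sketch of that control flow (block_filter_fn and convolve_dispatch are illustrative names, not part of this diff):

#include <stddef.h>
#include <stdint.h>

/* Hypothetical per-width kernel type, standing in for aom_filter_block1d{16,8,4}_*. */
typedef void (*block_filter_fn)(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                uint32_t height, const int16_t *filter);

/* Sketch of the dispatch done by FUN_CONV_1D: tile the row into 16/8/4-wide
 * columns with the chosen SIMD kernel and leave any remainder to a C fallback. */
static void convolve_dispatch(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride, int w, int h,
                              const int16_t *filter, block_filter_fn k16,
                              block_filter_fn k8, block_filter_fn k4,
                              void (*fallback_c)(const uint8_t *, ptrdiff_t,
                                                 uint8_t *, ptrdiff_t, int, int,
                                                 const int16_t *)) {
  static const int widths[3] = { 16, 8, 4 };
  block_filter_fn kernels[3];
  kernels[0] = k16;
  kernels[1] = k8;
  kernels[2] = k4;
  for (int i = 0; i < 3; ++i) {
    while (w >= widths[i]) {
      kernels[i](src, src_stride, dst, dst_stride, (uint32_t)h, filter);
      src += widths[i];
      dst += widths[i];
      w -= widths[i];
    }
  }
  /* Leftover columns narrower than 4 go through the C reference path. */
  if (w) fallback_c(src, src_stride, dst, dst_stride, w, h, filter);
}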
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
new file mode 100644
index 000000000..30253f65c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+
+// Shuffle masks for the horizontal filters: gather overlapping source byte pairs.
+DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
+ 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+ 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+ 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
+
+static INLINE void prepare_coeffs_lowbd(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+ // Right-shift all filter coefficients by 1 to reduce the number of bits
+ // required. This extra right shift is compensated for at the end when the
+ // result is rounded.
+ // Since all filter coefficients are even, this change does not affect the
+ // final result.
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16(0xffff)));
+
+ const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
+}
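The halving in prepare_coeffs_lowbd leans on the assertion above: every tap of the selected subpel kernel is even, so dividing each tap by two is exact and can be paid back by shifting one bit less when the result is rounded. A minimal scalar check of that identity (the tap values and round_bits below are illustrative, not taken from the real filter tables):

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* An even-coefficient 8-tap kernel (illustrative values only). */
  const int16_t taps[8] = { -2, 6, -12, 124, 18, -8, 4, -2 };
  const uint8_t px[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  const int round_bits = 7; /* stand-in for the rounding shift */

  int32_t full = 0, halved = 0;
  for (int i = 0; i < 8; ++i) {
    full += taps[i] * px[i];
    halved += (taps[i] >> 1) * px[i]; /* what the halved-coefficient path accumulates */
  }
  /* Rounding with one bit less of shift compensates exactly for the halved taps. */
  assert(((full + (1 << (round_bits - 1))) >> round_bits) ==
         ((halved + (1 << (round_bits - 2))) >> (round_bits - 1)));
  return 0;
}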
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m256i convolve_lowbd(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+ const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
+ _mm256_add_epi16(res_23, res_67));
+
+ return res;
+}
+
+static INLINE __m256i convolve(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+ const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+ const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+
+ const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+ _mm256_add_epi32(res_2, res_3));
+
+ return res;
+}
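convolve and convolve_lowbd are the innermost dot products: each madd-style step multiplies a pair of adjacent taps against a pair of adjacent samples, and the four pair sums are then added. The scalar counterpart for one output sample looks roughly like this:

#include <stdint.h>

/* One 8-tap filtered sample, the scalar counterpart of convolve():
 * four pairwise products are formed and then summed, matching the
 * madd-then-add structure of the SIMD version. */
static int32_t convolve_8tap_scalar(const int16_t *src, const int16_t *coeffs) {
  int32_t pair_sum[4];
  for (int p = 0; p < 4; ++p) {
    pair_sum[p] =
        src[2 * p] * coeffs[2 * p] + src[2 * p + 1] * coeffs[2 * p + 1];
  }
  return (pair_sum[0] + pair_sum[1]) + (pair_sum[2] + pair_sum[3]);
}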
+
+static INLINE __m256i convolve_lowbd_x(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {
+ __m256i s[4];
+
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+ s[2] = _mm256_shuffle_epi8(data, filt[2]);
+ s[3] = _mm256_shuffle_epi8(data, filt[3]);
+
+ return convolve_lowbd(s, coeffs);
+}
+
+static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
+ const __m256i *const res,
+ const int do_average) {
+ __m256i d;
+ if (do_average) {
+ d = _mm256_load_si256((__m256i *)dst);
+ d = _mm256_add_epi32(d, *res);
+ d = _mm256_srai_epi32(d, 1);
+ } else {
+ d = *res;
+ }
+ _mm256_store_si256((__m256i *)dst, d);
+}
+
+static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
+ const __m256i *const res_unsigned,
+ const __m256i *const wt,
+ const int use_jnt_comp_avg) {
+ __m256i res;
+ if (use_jnt_comp_avg) {
+ const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
+ const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+ const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
+ const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
+
+ const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+ res = _mm256_packs_epi32(res_lo, res_hi);
+ } else {
+ const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
+ res = _mm256_srai_epi16(wt_res, 1);
+ }
+ return res;
+}
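comp_avg covers the two compound-prediction blends: with use_jnt_comp_avg the two predictions get unequal distance-based weights and the product is scaled back by DIST_PRECISION_BITS, otherwise they are averaged with equal weight. A scalar sketch of both paths (the stand-in precision constant is illustrative; the real weights come from the jnt_comp weighting tables):

#include <stdint.h>

#define DIST_PRECISION_BITS_SKETCH 4 /* stand-in for DIST_PRECISION_BITS */

/* Scalar counterpart of comp_avg(): blend two per-pixel predictions. */
static int32_t comp_avg_scalar(int32_t ref, int32_t cur, int w0, int w1,
                               int use_jnt_comp_avg) {
  if (use_jnt_comp_avg) {
    /* Weighted blend; the weight pair is assumed to sum to
     * 1 << DIST_PRECISION_BITS_SKETCH. */
    return (ref * w0 + cur * w1) >> DIST_PRECISION_BITS_SKETCH;
  }
  /* Equal-weight average. */
  return (ref + cur) >> 1;
}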
+
+static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
+ const __m256i *const offset_const,
+ const __m256i *const round_const,
+ const int round_shift) {
+ const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
+ const __m256i res_round = _mm256_srai_epi16(
+ _mm256_add_epi16(res_signed, *round_const), round_shift);
+ return res_round;
+}
+
+static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
+ const __m256i *const res_unsigned,
+ const __m256i *const wt0,
+ const __m256i *const wt1,
+ const int use_jnt_comp_avg) {
+ __m256i res;
+ if (use_jnt_comp_avg) {
+ const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
+ const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
+ const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
+ res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
+ } else {
+ const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
+ res = _mm256_srai_epi32(wt_res, 1);
+ }
+ return res;
+}
+
+static INLINE __m256i highbd_convolve_rounding(
+ const __m256i *const res_unsigned, const __m256i *const offset_const,
+ const __m256i *const round_const, const int round_shift) {
+ const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
+ const __m256i res_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_signed, *round_const), round_shift);
+
+ return res_round;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
new file mode 100644
index 000000000..707bd2d78
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+
+// Note:
+// This header file must be included after any x86 intrinsics header files.
+
+static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
+ const int do_average) {
+ __m128i d;
+ if (do_average) {
+ d = _mm_load_si128((__m128i *)dst);
+ d = _mm_add_epi32(d, *res);
+ d = _mm_srai_epi32(d, 1);
+ } else {
+ d = *res;
+ }
+ _mm_store_si128((__m128i *)dst, d);
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
new file mode 100644
index 000000000..445d04b10
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+
+// Note:
+// This header file must be included after any x86 intrinsics header files.
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m128i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+ const __m128i *const coeffs) {
+ const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
+ const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
+ const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
+ const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
+
+ const __m128i res =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
+
+ return res;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
+ const __m128i *const res_unsigned,
+ const __m128i *const wt,
+ const int use_jnt_comp_avg) {
+ __m128i res;
+ if (use_jnt_comp_avg) {
+ const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
+ const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+ const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
+ const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
+
+ const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+ res = _mm_packs_epi32(res_lo, res_hi);
+ } else {
+ const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
+ res = _mm_srai_epi16(wt_res, 1);
+ }
+ return res;
+}
+
+static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
+ const __m128i *const offset_const,
+ const __m128i *const round_const,
+ const int round_shift) {
+ const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
+ const __m128i res_round =
+ _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
+ return res_round;
+}
+
+static INLINE __m128i highbd_convolve_rounding_sse2(
+ const __m128i *const res_unsigned, const __m128i *const offset_const,
+ const __m128i *const round_const, const int round_shift) {
+ const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
+ const __m128i res_round =
+ _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
+
+ return res_round;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
new file mode 100644
index 000000000..6b8388d84
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+
+// Note:
+// This header file must be included after any x86 intrinsics header files.
+
+static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
+ const __m128i *const res,
+ const __m128i *const wt0,
+ const __m128i *const wt1,
+ const int do_average) {
+ __m128i d;
+ if (do_average) {
+ d = _mm_load_si128((__m128i *)dst);
+ d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1));
+ d = _mm_srai_epi32(d, DIST_PRECISION_BITS);
+ } else {
+ d = *res;
+ }
+ _mm_store_si128((__m128i *)dst, d);
+}
+
+static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
+ const __m128i *const res_unsigned,
+ const __m128i *const wt0,
+ const __m128i *const wt1,
+ const int use_jnt_comp_avg) {
+ __m128i res;
+ if (use_jnt_comp_avg) {
+ const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
+ const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
+
+ const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res);
+ res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS);
+ } else {
+ const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned);
+ res = _mm_srai_epi32(wt_res, 1);
+ }
+ return res;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 000000000..54da02253
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+ int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps);
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+ aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+ aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+ aom_transpose_float_sse2, 8);
+}
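aom_fft_2d_gen and aom_ifft_2d_gen (presumably provided by aom_dsp/fft_common.h, which is not part of this hunk) compose the generated 1-d kernels into 2-d transforms, most likely in the usual row-column fashion: transform one dimension, transpose, transform the other. A rough scalar outline of that composition, with a hypothetical 1-d callback type (fft1d_fn and fft_2d_rowcol are illustrative, not the real fft_common.h interface):

#include <string.h>

typedef void (*fft1d_fn)(const float *in, float *out); /* hypothetical 1-d kernel type */

/* Row-column composition of a 2-d transform from a 1-d kernel (sketch only). */
static void fft_2d_rowcol(const float *input, float *temp, float *output, int n,
                          fft1d_fn fft1d) {
  /* 1-d transform of every row. */
  for (int r = 0; r < n; ++r) fft1d(input + r * n, temp + r * n);
  /* Transpose so columns become rows. */
  for (int r = 0; r < n; ++r)
    for (int c = 0; c < n; ++c) output[c * n + r] = temp[r * n + c];
  memcpy(temp, output, sizeof(float) * (size_t)n * (size_t)n);
  /* 1-d transform of every (former) column. */
  for (int r = 0; r < n; ++r) fft1d(temp + r * n, output + r * n);
}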
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 000000000..12bdc3e18
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+ const int ldb) {
+ __m128 row1 = _mm_load_ps(&A[0 * lda]);
+ __m128 row2 = _mm_load_ps(&A[1 * lda]);
+ __m128 row3 = _mm_load_ps(&A[2 * lda]);
+ __m128 row4 = _mm_load_ps(&A[3 * lda]);
+ _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+ _mm_store_ps(&B[0 * ldb], row1);
+ _mm_store_ps(&B[1 * ldb], row2);
+ _mm_store_ps(&B[2 * ldb], row3);
+ _mm_store_ps(&B[3 * ldb], row4);
+}
+
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y += 4) {
+ for (int x = 0; x < n; x += 4) {
+ transpose4x4(A + y * n + x, B + x * n + y, n, n);
+ }
+ }
+}
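aom_transpose_float_sse2 walks the matrix in 4x4 tiles so each tile can be transposed entirely in registers with _MM_TRANSPOSE4_PS; the operation it implements is simply B = A^T. Plain-C reference for comparison (n is assumed to be a multiple of 4, as in the SIMD path):

/* Scalar reference for the tiled transpose above: B = A^T for an n x n matrix. */
static void transpose_float_c(const float *A, float *B, int n) {
  for (int y = 0; y < n; ++y)
    for (int x = 0; x < n; ++x) B[x * n + y] = A[y * n + x];
}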
+
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+ const int n2 = n / 2;
+ output[0] = packed[0];
+ output[1] = 0;
+ output[2 * (n2 * n)] = packed[n2 * n];
+ output[2 * (n2 * n) + 1] = 0;
+
+ output[2 * n2] = packed[n2];
+ output[2 * n2 + 1] = 0;
+ output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+ output[2 * (n2 * n + n2) + 1] = 0;
+
+ for (int c = 1; c < n2; ++c) {
+ output[2 * (0 * n + c)] = packed[c];
+ output[2 * (0 * n + c) + 1] = packed[c + n2];
+ output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+ output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+ }
+ for (int r = 1; r < n2; ++r) {
+ output[2 * (r * n + 0)] = packed[r * n];
+ output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+ output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+ output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+ for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+ output[2 * (r * n + c)] =
+ packed[r * n + c] - packed[(r + n2) * n + c + n2];
+ output[2 * (r * n + c) + 1] =
+ packed[(r + n2) * n + c] + packed[r * n + c + n2];
+ }
+
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+ real1 = _mm_sub_ps(real1, real2);
+ imag1 = _mm_add_ps(imag1, imag2);
+ _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+ }
+
+ int r2 = r + n2;
+ int r3 = n - r2;
+ output[2 * (r2 * n + 0)] = packed[r3 * n];
+ output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+ output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+ output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+ for (int c = 1; c < AOMMIN(4, n2); ++c) {
+ output[2 * (r2 * n + c)] =
+ packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+ output[2 * (r2 * n + c) + 1] =
+ -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+ }
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+ real1 = _mm_add_ps(real1, real2);
+ imag1 = _mm_sub_ps(imag2, imag1);
+ _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r2 * n + c + 2),
+ _mm_unpackhi_ps(real1, imag1));
+ }
+ }
+}
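The unpacking above rebuilds the full complex 2-d spectrum from the packed output of a real-input FFT. The redundancy being exploited is the conjugate symmetry of the DFT of real data, X[(n-r) % n][(n-c) % n] = conj(X[r][c]), which is why roughly half of the coefficients are enough. A small scalar illustration of filling the mirrored half from that identity (it uses a plain interleaved re/im layout, not the packed layout consumed by the SIMD code):

/* Given rows 0..n/2 of an n x n spectrum of real input (interleaved re, im),
 * fill rows n/2+1..n-1 using conjugate symmetry:
 *   X[n-r][n-c] = conj(X[r][c])   (indices taken mod n). */
static void fill_conjugate_half(float *spec /* 2*n*n floats */, int n) {
  for (int r = n / 2 + 1; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      const int sr = (n - r) % n, sc = (n - c) % n;
      spec[2 * (r * n + c) + 0] = spec[2 * (sr * n + sc) + 0];
      spec[2 * (r * n + c) + 1] = -spec[2 * (sr * n + sc) + 1];
    }
  }
}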
+
+// Generate definitions for 1d transforms using float and __m128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and __m128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
+
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+ aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+ aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+ aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+ aom_transpose_float_sse2, 4);
+}
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 000000000..1e3d13ec8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+ // In one case all of the constants are the same; in every other case a
+ // pair of constants is repeated four times, which is done by constructing
+ // the 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants, which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants, which gets us into 32 bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants, which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // Division of a 16-bit signed number by two using shifts:
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
+
+#undef ADD_EPI16
+#undef SUB_EPI16
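The post-conditioning step above divides each 16-bit result by two while rounding toward zero; subtracting the sign bit first turns the floor behaviour of an arithmetic shift into C-style truncation, i.e. n / 2 == (n - (n >> 15)) >> 1 for every int16_t n. A tiny exhaustive check of that identity:

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Verify the branchless divide-by-two used in the post-condition step:
   * subtracting the sign bit converts the arithmetic shift's floor rounding
   * into truncation toward zero. */
  for (int32_t v = INT16_MIN; v <= INT16_MAX; ++v) {
    const int16_t n = (int16_t)v;
    const int16_t halved = (int16_t)((n - (n >> 15)) >> 1);
    assert(halved == n / 2);
  }
  return 0;
}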
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 000000000..2d8f8f71e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+
+void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i u0, u1, sum;
+
+ u0 = _mm_add_epi16(in0, in1);
+ u1 = _mm_add_epi16(in2, in3);
+
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ sum = _mm_add_epi16(u0, u1);
+
+ in0 = _mm_add_epi16(in0, in1);
+ in2 = _mm_add_epi16(in2, in3);
+ sum = _mm_add_epi16(sum, in0);
+
+ u0 = _mm_setzero_si128();
+ sum = _mm_add_epi16(sum, in2);
+
+ in0 = _mm_unpacklo_epi16(u0, sum);
+ in1 = _mm_unpackhi_epi16(u0, sum);
+ in0 = _mm_srai_epi32(in0, 16);
+ in1 = _mm_srai_epi32(in1, 16);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_unpacklo_epi32(sum, u0);
+ in1 = _mm_unpackhi_epi32(sum, u0);
+
+ sum = _mm_add_epi32(in0, in1);
+ in0 = _mm_srli_si128(sum, 8);
+
+ in1 = _mm_add_epi32(sum, in0);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
+}
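aom_fdct8x8_1_sse2 computes only the DC coefficient: the unpack/shift sequence folds the eight 16-bit row sums into 32-bit lanes and reduces them, so output[0] ends up as the sum of all 64 input samples. Scalar equivalent of the reduction:

#include <stdint.h>

/* Scalar counterpart of the reduction above: the DC term is the sum of all
 * 64 input samples. */
static int32_t dc_sum_8x8(const int16_t *input, int stride) {
  int32_t sum = 0;
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c) sum += input[r * stride + c];
  return sum;
}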
+
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT8x8_2D aom_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef FDCT8x8_2D
+
+#undef DCT_HIGH_BIT_DEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT8x8_2D
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 000000000..260d8dd58
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
+
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
+
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1);
+ return _mm_movemask_epi8(cmp0);
+}
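check_epi16_overflow_* flags lanes that sit at the int16_t extremes; in the high-bit-depth pass the adds and subtracts saturate, so a lane stuck at 32767 or -32768 means the 16-bit pipeline may have overflowed and the caller drops back to the C high-bit-depth transform. Scalar equivalent of the x2 variant:

#include <stdint.h>

/* Scalar version of check_epi16_overflow_x2(): report whether any element
 * sits at the int16_t limits, i.e. may have saturated. */
static int check_epi16_overflow_scalar(const int16_t *a, const int16_t *b,
                                       int n) {
  int hit = 0;
  for (int i = 0; i < n; ++i) {
    hit |= (a[i] == INT16_MAX) | (a[i] == INT16_MIN);
    hit |= (b[i] == INT16_MAX) | (b[i] == INT16_MIN);
  }
  return hit; /* non-zero means "possible overflow, use the C fallback" */
}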
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x12(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ }
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1;
+}
+
+static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ } else {
+ _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+ }
+}
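store_output handles both possible widths of tran_low_t: when the coefficient type is 32 bits wide the eight 16-bit results are sign-extended across two stores, otherwise they are written out directly. Scalar equivalent (tran_low_t_sketch stands in for the real tran_low_t):

#include <stdint.h>

typedef int32_t tran_low_t_sketch; /* stand-in for tran_low_t */

/* Scalar counterpart of store_output(): widen eight int16_t results to the
 * coefficient type, preserving the sign. */
static void store_output_scalar(const int16_t *res, tran_low_t_sketch *dst) {
  for (int i = 0; i < 8; ++i) dst[i] = (tran_low_t_sketch)res[i]; /* sign-extends */
}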
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 000000000..c1fb259a1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,379 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192: times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
+pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 11585, 11585
+TRANSFORM_COEFFS 15137, 6270
+TRANSFORM_COEFFS 16069, 3196
+TRANSFORM_COEFFS 9102, 13623
+
+%macro STORE_OUTPUT 2 ; index, result
+ ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+ ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ pxor m11, m11
+ pcmpgtw m11, m%2
+ movdqa m12, m%2
+ punpcklwd m%2, m11
+ punpckhwd m12, m11
+ mova [outputq + 4*%1 + 0], m%2
+ mova [outputq + 4*%1 + 16], m12
+%endmacro
+
+SECTION .text
+
+%if ARCH_X86_64
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+ mova m8, [GLOBAL(pd_8192)]
+ mova m12, [GLOBAL(pw_11585x2)]
+
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ ; left shift by 2 to increase forward transformation precision
+ psllw m0, 2
+ psllw m1, 2
+ psllw m2, 2
+ psllw m3, 2
+ psllw m4, 2
+ psllw m5, 2
+ psllw m6, 2
+ psllw m7, 2
+
+ ; column transform
+ ; stage 1
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ paddw m9, m1, m6
+ psubw m1, m6
+
+ paddw m7, m2, m5
+ psubw m2, m5
+
+ paddw m6, m3, m4
+ psubw m3, m4
+
+ ; stage 2
+ paddw m5, m9, m7
+ psubw m9, m7
+
+ paddw m4, m10, m6
+ psubw m10, m6
+
+ paddw m7, m1, m2
+ psubw m1, m2
+
+ ; stage 3
+ paddw m6, m4, m5
+ psubw m4, m5
+
+ pmulhrsw m1, m12
+ pmulhrsw m7, m12
+
+ ; sin(pi / 8), cos(pi / 8)
+ punpcklwd m2, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+ pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+ pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+ paddd m5, m8
+ paddd m2, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m2, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m5, m9
+ packssdw m2, m10
+
+ pmulhrsw m6, m12
+ pmulhrsw m4, m12
+
+ paddw m9, m3, m1
+ psubw m3, m1
+
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ ; stage 4
+ ; sin(pi / 16), cos(pi / 16)
+ punpcklwd m1, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+ pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+ pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m1, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m7, 14
+ psrad m1, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m7, m9
+ packssdw m1, m10
+
+ ; sin(3 * pi / 16), cos(3 * pi / 16)
+ punpcklwd m11, m0, m3
+ punpckhwd m0, m3
+ pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+ pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+ pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+ paddd m9, m8
+ paddd m11, m8
+ paddd m3, m8
+ paddd m0, m8
+ psrad m9, 14
+ psrad m11, 14
+ psrad m3, 14
+ psrad m0, 14
+ packssdw m9, m3
+ packssdw m11, m0
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m6, m7
+ punpcklwd m3, m5, m11
+ punpckhwd m6, m7
+ punpckhwd m5, m11
+ punpcklwd m7, m4, m9
+ punpcklwd m10, m2, m1
+ punpckhwd m4, m9
+ punpckhwd m2, m1
+
+ ; stage 2
+ punpckldq m9, m0, m3
+ punpckldq m1, m6, m5
+ punpckhdq m0, m3
+ punpckhdq m6, m5
+ punpckldq m3, m7, m10
+ punpckldq m5, m4, m2
+ punpckhdq m7, m10
+ punpckhdq m4, m2
+
+ ; stage 3
+ punpcklqdq m10, m9, m3
+ punpckhqdq m9, m3
+ punpcklqdq m2, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m7, m6, m4
+ punpckhqdq m6, m4
+
+ ; row transform
+ ; stage 1
+ paddw m5, m10, m6
+ psubw m10, m6
+
+ paddw m4, m9, m7
+ psubw m9, m7
+
+ paddw m6, m2, m1
+ psubw m2, m1
+
+ paddw m7, m0, m3
+ psubw m0, m3
+
+ ; stage 2
+ paddw m1, m5, m7
+ psubw m5, m7
+
+ paddw m3, m4, m6
+ psubw m4, m6
+
+ paddw m7, m9, m2
+ psubw m9, m2
+
+ ; stage 3
+ punpcklwd m6, m1, m3
+ punpckhwd m1, m3
+ pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+ pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+ pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+ pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+ paddd m2, m8
+ paddd m6, m8
+ paddd m3, m8
+ paddd m1, m8
+ psrad m2, 14
+ psrad m6, 14
+ psrad m3, 14
+ psrad m1, 14
+ packssdw m2, m3
+ packssdw m6, m1
+
+ pmulhrsw m7, m12
+ pmulhrsw m9, m12
+
+ punpcklwd m3, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+ pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+ pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+ paddd m1, m8
+ paddd m3, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m1, 14
+ psrad m3, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m1, m4
+ packssdw m3, m5
+
+ paddw m4, m0, m9
+ psubw m0, m9
+
+ paddw m5, m10, m7
+ psubw m10, m7
+
+ ; stage 4
+ punpcklwd m9, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+ pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+ pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m9, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m7, 14
+ psrad m9, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m7, m4
+ packssdw m9, m5
+
+ punpcklwd m4, m10, m0
+ punpckhwd m10, m0
+ pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+ pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+ pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+ paddd m5, m8
+ paddd m4, m8
+ paddd m0, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m4, 14
+ psrad m0, 14
+ psrad m10, 14
+ packssdw m5, m0
+ packssdw m4, m10
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m2, m7
+ punpcklwd m10, m1, m4
+ punpckhwd m2, m7
+ punpckhwd m1, m4
+ punpcklwd m7, m6, m5
+ punpcklwd m4, m3, m9
+ punpckhwd m6, m5
+ punpckhwd m3, m9
+
+ ; stage 2
+ punpckldq m5, m0, m10
+ punpckldq m9, m2, m1
+ punpckhdq m0, m10
+ punpckhdq m2, m1
+ punpckldq m10, m7, m4
+ punpckldq m1, m6, m3
+ punpckhdq m7, m4
+ punpckhdq m6, m3
+
+ ; stage 3
+ punpcklqdq m4, m5, m10
+ punpckhqdq m5, m10
+ punpcklqdq m3, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m10, m9, m1
+ punpckhqdq m9, m1
+ punpcklqdq m7, m2, m6
+ punpckhqdq m2, m6
+
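+  ; divide the final coefficients by two, rounding toward zero:
+  ; subtracting the sign mask (psraw by 15) adds one to negative values
+  ; before the arithmetic shift right by one
+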
+ psraw m1, m4, 15
+ psraw m6, m5, 15
+ psraw m8, m3, 15
+ psraw m11, m0, 15
+
+ psubw m4, m1
+ psubw m5, m6
+ psubw m3, m8
+ psubw m0, m11
+
+ psraw m4, 1
+ psraw m5, 1
+ psraw m3, 1
+ psraw m0, 1
+
+ psraw m1, m10, 15
+ psraw m6, m9, 15
+ psraw m8, m7, 15
+ psraw m11, m2, 15
+
+ psubw m10, m1
+ psubw m9, m6
+ psubw m7, m8
+ psubw m2, m11
+
+ psraw m10, 1
+ psraw m9, 1
+ psraw m7, 1
+ psraw m2, 1
+
+ STORE_OUTPUT 0, 4
+ STORE_OUTPUT 8, 5
+ STORE_OUTPUT 16, 3
+ STORE_OUTPUT 24, 0
+ STORE_OUTPUT 32, 10
+ STORE_OUTPUT 40, 9
+ STORE_OUTPUT 48, 7
+ STORE_OUTPUT 56, 2
+
+ RET
+%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 000000000..099fcf7fc
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,998 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <string.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// -----------------------------------------------------------------------------
+// Copy and average
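+//
+// The copy kernel below dispatches on width (64/32/16/8/4 columns) and moves
+// raw uint16_t samples with unaligned 256-/128-bit loads and stores; the
+// filter arguments are ignored and no rounding is applied.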
+
+void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int width, int h, int bd) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+ (void)bd;
+
+ assert(width % 4 == 0);
+ if (width > 32) { // width = 64
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ _mm256_storeu_si256((__m256i *)(dst + 32), p2);
+ _mm256_storeu_si256((__m256i *)(dst + 48), p3);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 16) { // width = 32
+ do {
+ const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ _mm256_storeu_si256((__m256i *)(dst + 16), p1);
+ dst += dst_stride;
+ h--;
+ } while (h > 0);
+ } else if (width > 8) { // width = 16
+ __m256i p0, p1;
+ do {
+ p0 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+ p1 = _mm256_loadu_si256((const __m256i *)src);
+ src += src_stride;
+
+ _mm256_storeu_si256((__m256i *)dst, p0);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else if (width > 4) { // width = 8
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadu_si128((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storeu_si128((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storeu_si128((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else { // width = 4
+ __m128i p0, p1;
+ do {
+ p0 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+ p1 = _mm_loadl_epi64((const __m128i *)src);
+ src += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, p1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h > 0);
+ }
+}
+
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ __m256i s[8], coeffs_y[4];
+
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
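+      // Rows are loaded as overlapping 256-bit pairs (s01 = rows 0|1, s12 =
+      // rows 1|2, ...) and interleaved so the low lane carries output row i
+      // and the high lane row i + 1; convolve(s) covers columns 0-3 and
+      // convolve(s + 4) columns 4-7, giving two output rows per iteration.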
+ __m256i src6;
+ __m256i s01 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi16(s01, s12);
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ const __m256i s67 = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+ const __m256i s78 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ (void)subpel_y_q4;
+ (void)filter_params_y;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[4], coeffs_x[4];
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
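+      // (byte offsets 0/4/8/12 align the row with filter-tap pairs (0,1),
+      // (2,3), (4,5), (6,7) for the even output pixels; the odd outputs
+      // below shift one more 16-bit pixel, i.e. offsets 2/6/10/14)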
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ } else {
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ }
+ }
+ }
+}
+
+#define CONV8_ROUNDING_BITS (7)
+
+// -----------------------------------------------------------------------------
+// Horizontal and vertical filtering
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15,
+ 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
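+
+// These tables drive the shuffles in pack_pixels(): the signal_pattern_*
+// byte masks gather overlapping 16-bit pixel pairs within each 128-bit lane
+// (so _mm256_madd_epi16 can multiply them against duplicated filter-tap
+// pairs), and signal_index is a dword permute that shifts the window onto
+// the later taps of the 8-tap filter.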
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+ const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+ const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
+
+ p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
+ p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
+ p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
+ p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
+}
+
+// Note:
+// Shared by the 8x2 and 16x1 block paths
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *x /*x[8]*/) {
+ __m256i pp[8];
+ pack_pixels(s0, pp);
+ pack_pixels(s1, &pp[4]);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+ x[4] = x[2];
+ x[5] = x[3];
+ x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
+ x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i pp[8];
+ __m256i s0;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ pack_pixels(&s0, pp);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
+ __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+// Note:
+// Shared by horizontal and vertical filtering
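+// pack_filters() broadcasts the eight 16-bit taps as four pair-duplicated
+// registers (taps 0-1, 2-3, 4-5, 6-7) to match the pixel pairs produced by
+// the pack_*_pixels() helpers, so each madd accumulates two taps at once.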
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p0 = _mm256_set1_epi32(0x03020100);
+ const __m256i p1 = _mm256_set1_epi32(0x07060504);
+ const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
+ const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
+ f[0] = _mm256_shuffle_epi8(hh, p0);
+ f[1] = _mm256_shuffle_epi8(hh, p1);
+ f[2] = _mm256_shuffle_epi8(hh, p2);
+ f[3] = _mm256_shuffle_epi8(hh, p3);
+}
+
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
+ const __m256i *fil /*fil[4]*/,
+ __m256i *y) {
+ __m256i a, a0, a1;
+
+ a0 = _mm256_madd_epi16(fil[0], sig[0]);
+ a1 = _mm256_madd_epi16(fil[3], sig[3]);
+ a = _mm256_add_epi32(a0, a1);
+
+ a0 = _mm256_madd_epi16(fil[1], sig[1]);
+ a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
+ {
+ const __m256i min = _mm256_min_epi32(a0, a1);
+ a = _mm256_add_epi32(a, min);
+ }
+ {
+ const __m256i max = _mm256_max_epi32(a0, a1);
+ a = _mm256_add_epi32(a, max);
+ }
+ {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ a = _mm256_add_epi32(a, rounding);
+ *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+ }
+}
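+
+// filter_8x1_pixels() accumulates the four pair-wise madd products, adds a
+// rounding term and shifts by CONV8_ROUNDING_BITS, leaving sign-extended
+// 32-bit results that the store helpers below pack and clamp to bit depth.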
+
+static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y);
+ const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static void aom_highbd_filter_block1d8_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void aom_highbd_filter_block1d16_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap horizontal filtering
+
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p = _mm256_set1_epi32(0x09080706);
+ f[0] = _mm256_shuffle_epi8(hh, p);
+}
+
+// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(); the only
+// difference is whether s0/s1 hold the first and second rows (8x2) or the
+// first 16 samples and the same row shifted by 8 samples (16x1).
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *sig) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
+ __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+ __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
+ __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ r1 = _mm256_shuffle_epi8(r1, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
+ sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
+ const ptrdiff_t pitch, __m256i *sig) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
+ r0 = _mm256_permutevar8x32_epi32(r0, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
+}
+
+// Shared by the 8x2 and 16x1 2-tap horizontal paths and filter_16x2_2t_pixels()
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ x1 = _mm256_add_epi32(x1, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
+
+static void aom_highbd_filter_block1d8_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void aom_highbd_filter_block1d16_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// Vertical Filtering
+
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+ __m256i s1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+ __m256i s2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+ __m256i s3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+ __m256i s4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+ __m256i s5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+ __m256i s6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+ s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+ s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+ s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+ s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+ s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+ sig[0] = _mm256_unpacklo_epi16(s0, s1);
+ sig[4] = _mm256_unpackhi_epi16(s0, s1);
+ sig[1] = _mm256_unpacklo_epi16(s2, s3);
+ sig[5] = _mm256_unpackhi_epi16(s2, s3);
+ sig[2] = _mm256_unpacklo_epi16(s4, s5);
+ sig[6] = _mm256_unpackhi_epi16(s4, s5);
+ sig[8] = s6;
+}
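+
+// pack_8x9_init() loads rows 0-6 once and interleaves adjacent row pairs
+// (low halves in sig[0..2], high halves in sig[4..6], the raw last row in
+// sig[8]); pack_8x9_pixels() below appends rows 7 and 8 for each pair of
+// output rows, and update_pixels() slides the window down by two rows.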
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ __m256i s0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+ // base + 8th row
+ __m256i s1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+ __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
+ __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ sig[3] = _mm256_unpacklo_epi16(s2, s3);
+ sig[7] = _mm256_unpackhi_epi16(s2, s3);
+ sig[8] = s1;
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_8x1_pixels(sig, f, y0);
+ filter_8x1_pixels(&sig[4], f, y1);
+}
+
+static INLINE void update_pixels(__m256i *sig) {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ sig[i] = sig[i + 1];
+ sig[i + 4] = sig[i + 5];
+ }
+}
+
+static void aom_highbd_filter_block1d8_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i u0, u1, u2, u3;
+  // load rows 0-6
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+ const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+ const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+ const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+ u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
+ u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
+
+ u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
+ u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
+
+ sig[0] = _mm256_unpacklo_epi16(u0, u2);
+ sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[8] = _mm256_unpacklo_epi16(u1, u3);
+ sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
+ u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
+ u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
+
+ sig[1] = _mm256_unpacklo_epi16(u0, u2);
+ sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[9] = _mm256_unpacklo_epi16(u1, u3);
+ sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
+ u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
+ u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
+
+ sig[2] = _mm256_unpacklo_epi16(u0, u2);
+ sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[10] = _mm256_unpacklo_epi16(u1, u3);
+ sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s6;
+}
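+
+// The 16-wide vertical path keeps two interleaved windows, sig[0..7] for the
+// low 8 columns and sig[8..15] for the high 8, with the most recent raw row
+// in sig[16]; pack_16x9_pixels() refreshes both windows from rows 7 and 8,
+// and update_16x9_pixels() slides them down by two rows.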
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+ // base + 8th row
+ const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+ __m256i u0, u1, u2, u3;
+ u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
+ u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
+ u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
+
+ sig[3] = _mm256_unpacklo_epi16(u0, u2);
+ sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[11] = _mm256_unpacklo_epi16(u1, u3);
+ sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s8;
+}
+
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ __m256i res[4];
+ int i;
+ for (i = 0; i < 4; ++i) {
+ filter_8x1_pixels(&sig[i << 2], f, &res[i]);
+ }
+
+ {
+ const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+ const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+ *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
+ *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
+ }
+}
+
+static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ _mm256_storeu_si256((__m256i *)dst, p);
+ p = _mm256_min_epi16(*y1, *mask);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static void update_16x9_pixels(__m256i *sig) {
+ update_pixels(&sig[0]);
+ update_pixels(&sig[8]);
+}
+
+static void aom_highbd_filter_block1d16_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+ sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // load the next row
+ const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static void aom_highbd_filter_block1d16_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i p = _mm_set1_epi32(0x09080706);
+ f[0] = _mm_shuffle_epi8(h, p);
+}
+
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+ sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+ __m128i *sig) {
+ // load the next row
+ const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+ sig[0] = _mm_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+ __m128i *y0, __m128i *y1) {
+ const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m128i x0 = _mm_madd_epi16(sig[0], *f);
+ __m128i x1 = _mm_madd_epi16(sig[1], *f);
+ x0 = _mm_add_epi32(x0, rounding);
+ x1 = _mm_add_epi32(x1, rounding);
+ *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ res = _mm_min_epi16(res, *mask);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void aom_highbd_filter_block1d8_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
+#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
+#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
+#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
+
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+#undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 000000000..e7b33d1c4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ __m128i s[16], coeffs_y[4];
+
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
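+      // Two interleaved tap windows are kept: s[0..7] pairs rows
+      // (0,1), (2,3), ... for the even output row and s[8..15] pairs rows
+      // (1,2), (3,4), ... for the odd one, so each iteration emits two rows.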
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((uint32_t *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ (void)subpel_y_q4;
+ (void)filter_params_y;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m128i s[4], coeffs_x[4];
+
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ res = _mm_min_epi16(res, clip_pixel);
+ res = _mm_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+ } else {
+ *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
new file mode 100644
index 000000000..5a55736c4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -0,0 +1,984 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// H_PRED
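+//
+// Horizontal prediction fills each output row with the corresponding left[]
+// sample; the SSE2 versions broadcast that sample with pshuflw/pshufhw and
+// then replicate the 64-bit half across the row as needed.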
+
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
+ dst += stride << 2;
+ left += 4;
+ aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
+}
+
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+}
+
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+}
+
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
+ dst += stride << 3;
+ left += 8;
+ aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
+}
+
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)*dst, val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_16_unpacklo(&dst, stride, &row0);
+ h_store_16_unpacklo(&dst, stride, &row1);
+ h_store_16_unpacklo(&dst, stride, &row2);
+ h_store_16_unpacklo(&dst, stride, &row3);
+ h_store_16_unpackhi(&dst, stride, &row4);
+ h_store_16_unpackhi(&dst, stride, &row5);
+ h_store_16_unpackhi(&dst, stride, &row6);
+ h_store_16_unpackhi(&dst, stride, &row7);
+}
+
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ h_predictor_16x8(dst, stride, left);
+}
+
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ h_predictor_16x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ h_predictor_16x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_32_unpacklo(&dst, stride, &row0);
+ h_store_32_unpacklo(&dst, stride, &row1);
+ h_store_32_unpacklo(&dst, stride, &row2);
+ h_store_32_unpacklo(&dst, stride, &row3);
+ h_store_32_unpackhi(&dst, stride, &row4);
+ h_store_32_unpackhi(&dst, stride, &row5);
+ h_store_32_unpackhi(&dst, stride, &row6);
+ h_store_32_unpackhi(&dst, stride, &row7);
+}
+
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ h_predictor_32x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ h_predictor_32x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP, DC_LEFT, DC_128
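+//
+// Each DC_LEFT/DC_TOP variant below averages its n reference samples with
+// round-to-nearest, dc = (sum + n/2) >> log2(n); for example, DC_LEFT 4x4
+// with left = {10, 11, 12, 14} gives dc = (47 + 2) >> 2 = 12. DC_128 skips
+// the references and fills the block with the mid value 1 << (bd - 1).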
+
+// 4x4
+
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+ const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
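+
+// dc_sum_4() reduces the four 16-bit samples with two shuffle-and-add steps
+// (pair sums, then the final pair); only the low word of the returned vector
+// holds the total, which is all the dc_store_* helpers consume.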
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 4x8
+
+static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+// Shared with DC 8xh
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+ const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
+ const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sum = dc_sum_8(left);
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_4x8(dst, stride, &dc);
+}
+
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x8(dst, stride, &dc);
+}
+
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x8(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 8xh
+
+static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+ int height, const uint16_t *above) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ dc_store_8xh(dst, stride, height, &dc);
+}
+
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 4, above);
+}
+
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 8, above);
+}
+
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 16, above);
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 4, &dc);
+}
+
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 8, &dc);
+}
+
+// Shared with DC 16xh
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+ const __m128i sum_lo = dc_sum_8(ref);
+ const __m128i sum_hi = dc_sum_8(ref + 8);
+ return _mm_add_epi16(sum_lo, sum_hi);
+}
+
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 16, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+ int height, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ dc_store_8xh(dst, stride, height, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 4, bd);
+}
+
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 8, bd);
+}
+
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 16, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 16xh
+
+static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 8, &dc);
+}
+
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 16, &dc);
+}
+
+// Shared with DC 32xh
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sum_a = dc_sum_16(ref);
+ const __m128i sum_b = dc_sum_16(ref + 16);
+  // A 12-bit bd can overflow a 16-bit lane, so widen the partial sums to
+  // 32 bits before adding the final total
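+  // (e.g. 32 samples at the 12-bit maximum of 4095 sum to 131040 > 65535)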
+ return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
+ _mm_unpacklo_epi16(sum_b, zero));
+}
+
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 32, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 8, &dc);
+}
+
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 32, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 8, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 16, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 32, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 32xh
+
+static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 16), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 24), dc_dup);
+ }
+}
+
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_32xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_32xh(dst, stride, 32, &dc);
+}
+
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32xh(dst, stride, 16, &dc_dup);
+}
+
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32xh(dst, stride, 32, &dc);
+}
+
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32xh(dst, stride, 32, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_storel_epi64((__m128i *)dst, above_u16);
+ _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
+ dst += stride << 2;
+ }
+}
+
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
+ _mm_store_si128((__m128i *)dst, above_u16);
+ _mm_store_si128((__m128i *)(dst + stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
+}
+
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, above_u16);
+ _mm_store_si128((__m128i *)(dst + stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
+ dst += stride << 2;
+ }
+}
+
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ }
+}
+
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ int i;
+ for (i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ }
+}
+
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const __m128i sum_above = dc_sum_4(above);
+ const __m128i sum_left = dc_sum_8(left);
+ const __m128i sum = _mm_add_epi16(sum_above, sum_left);
+ uint32_t sum32 = _mm_cvtsi128_si32(sum);
+ sum32 >>= 16;
+ sum32 += 6;
+ sum32 /= 12;
+ const __m128i row = _mm_set1_epi16((uint16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_storel_epi64((__m128i *)dst, row);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const __m128i sum_left = dc_sum_4(left);
+ const __m128i sum_above = dc_sum_8(above);
+ const __m128i sum = _mm_add_epi16(sum_above, sum_left);
+ uint32_t sum32 = _mm_cvtsi128_si32(sum);
+ sum32 >>= 16;
+ sum32 += 6;
+ sum32 /= 12;
+ const __m128i row = _mm_set1_epi16((uint16_t)sum32);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+}
+
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_16(left);
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = _mm_cvtsi128_si32(sum);
+ sum32 += 12;
+ sum32 /= 24;
+ const __m128i row = _mm_set1_epi16((uint16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = _mm_cvtsi128_si32(sum);
+ sum32 += 12;
+ sum32 /= 24;
+ const __m128i row = _mm_set1_epi16((uint16_t)sum32);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_32(left);
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = _mm_cvtsi128_si32(sum);
+ sum32 += 24;
+ sum32 /= 48;
+ const __m128i row = _mm_set1_epi16((uint16_t)sum32);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_16(left);
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = _mm_cvtsi128_si32(sum);
+ sum32 += 24;
+ sum32 /= 48;
+ const __m128i row = _mm_set1_epi16((uint16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ }
+}
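All of the rectangular DC predictors above follow the same rounding rule: sum the w samples above and the h samples to the left, add (w + h) / 2, then divide by w + h; that is where the "+ 6, / 12" pair for 4x8/8x4, "+ 12, / 24" for 8x16/16x8 and "+ 24, / 48" for 16x32/32x16 come from. A minimal scalar sketch of that computation (the function name and w/h parameters here are illustrative, not taken from the patch):

    #include <stddef.h>
    #include <stdint.h>

    // Scalar sketch of the rectangular highbd DC predictors above; the name
    // highbd_dc_predictor_wxh_c is hypothetical.
    static void highbd_dc_predictor_wxh_c(uint16_t *dst, ptrdiff_t stride,
                                          int w, int h, const uint16_t *above,
                                          const uint16_t *left) {
      uint32_t sum = 0;
      uint16_t dc;
      int i, r, c;
      for (i = 0; i < w; ++i) sum += above[i];
      for (i = 0; i < h; ++i) sum += left[i];
      // Round to nearest: e.g. w = 4, h = 8 gives (sum + 6) / 12, matching the
      // SSE2 kernels above; 16x32 and 32x16 give (sum + 24) / 48.
      dc = (uint16_t)((sum + (w + h) / 2) / (w + h));
      for (r = 0; r < h; ++r, dst += stride) {
        for (c = 0; c < w; ++c) dst[c] = dc;
      }
    }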
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
new file mode 100644
index 000000000..91b3d126c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
@@ -0,0 +1,259 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
+
+SECTION .text
+INIT_XMM sse2
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ paddw m0, m2
+ pshuflw m1, m0, 0xe
+ paddw m0, m1
+ pshuflw m1, m0, 0x1
+ paddw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, one
+ mov oned, 0x00010001
+ lea stride3q, [strideq*3]
+ movd m3, oned
+ pshufd m3, m3, 0x0
+ paddw m0, m2
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ paddw m0, [GLOBAL(pw_8)]
+ psrlw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m3, [aboveq+16]
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_16)]
+ psrad m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ paddw m0, m2
+ paddw m3, m4
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ mova m5, [leftq+32]
+ mova m6, [leftq+48]
+ paddw m2, m4
+ paddw m5, m6
+ paddw m0, m3
+ paddw m2, m5
+ pxor m1, m1
+ paddw m0, m2
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_32)]
+ psrad m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16 ], m0
+ mova [dstq +32 ], m0
+ mova [dstq +48 ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16 ], m0
+ mova [dstq+strideq*2+32 ], m0
+ mova [dstq+strideq*2+48 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4+16 ], m0
+ mova [dstq+strideq*4+32 ], m0
+ mova [dstq+strideq*4+48 ], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m0
+ mova [dstq+stride3q*2 +32], m0
+ mova [dstq+stride3q*2 +48], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m1
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ mova m2, [aboveq+32]
+ mova m3, [aboveq+48]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq +32], m2
+ mova [dstq +48], m3
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*2 +32], m2
+ mova [dstq+strideq*2 +48], m3
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+strideq*4 +32], m2
+ mova [dstq+strideq*4 +48], m3
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m1
+ mova [dstq+stride3q*2 +32], m2
+ mova [dstq+stride3q*2 +48], m3
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
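The cglobal blocks above cover only the square DC and V predictors (4x4, 8x8, 16x16 and 32x32). They read at most dst, stride, above and left; the trailing bd argument of the high-bitdepth predictor signature is never touched, which is safe because DC averaging and vertical copying cannot produce values outside the input range. Assuming x86inc's usual aom_ symbol prefixing, each kernel is reached through the same prototype as the intrinsics versions earlier in this patch, e.g.:

    // Assumed C-side declaration for the 16x16 DC kernel above; the symbol name
    // follows the aom_*_sse2 convention used elsewhere in this patch, and bd is
    // accepted but ignored by the assembly.
    void aom_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd);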
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
new file mode 100644
index 000000000..c954da94e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/common_avx2.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+#include "aom/aom_integer.h"
+
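+// Note: these AVX2 entry points are currently thin wrappers that forward to
+// the SSE2 implementations (defined in highbd_loopfilter_sse2.c, added later
+// in this patch).
+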
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 000000000..097e0778f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1697 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+ __m128i *pixel) {
+ *pixel = _mm_min_epi16(*pixel, *max);
+ *pixel = _mm_max_epi16(*pixel, *min);
+}
+
+static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
+static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
+ const uint8_t *t, int bd, __m128i *blt,
+ __m128i *lt, __m128i *thr, __m128i *t80_out) {
+ const int shift = bd - 8;
+ const __m128i zero = _mm_setzero_si128();
+
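+  // blimit/limit/thresh are stored as 8-bit values; shift them up so that the
+  // comparisons below operate at the working bit depth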
+ __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
+ *blt = _mm_slli_epi16(x, shift);
+
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
+ *lt = _mm_slli_epi16(x, shift);
+
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
+ *thr = _mm_slli_epi16(x, shift);
+
+ *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+static INLINE void get_limit_dual(
+ const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
+ const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
+ int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
+ __m128i *t80_out) {
+ const int shift = bd - 8;
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i x0 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
+ __m128i x1 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *blt_out = _mm_slli_epi16(x0, shift);
+
+ x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
+ x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *lt_out = _mm_slli_epi16(x0, shift);
+
+ x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
+ x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *thr_out = _mm_slli_epi16(x0, shift);
+
+ *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
+ __m128i *p, __m128i *q) {
+ int i;
+ for (i = 0; i < size; i++) {
+ p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
+ q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
+ }
+}
+
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+ const __m128i *l, const __m128i *bl,
+ __m128i *mask) {
+ __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
+ __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16(0xFFFF);
+
+ __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+ int i;
+ for (i = 1; i < 4; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
+ max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
+ }
+ max = _mm_subs_epu16(max, *l);
+ *mask = _mm_cmpeq_epi16(max, zero); // return ~mask
+}
+
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+ __m128i *p1p0, __m128i *q1q0,
+ __m128i *abs_p1p0, __m128i *l,
+ __m128i *bl, __m128i *t,
+ __m128i *hev, __m128i *mask) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16(0xFFFF);
+ __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+ __m128i max, max01, h;
+
+ *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
+ *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
+
+ abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+ abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2
+
+ max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
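+  // (lanes where the blimit test fails are forced to limit + 1 here, so the
+  // final subs_epu16(max, limit) stays non-zero and the mask ends up zero)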
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+ *abs_p1p0 = abs_diff16(pq[0], pq[1]);
+ abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
+ max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
+ // mask |= (abs(*p1 - *p0) > limit) * -1;
+ // mask |= (abs(*q1 - *q0) > limit) * -1;
+ h = _mm_subs_epu16(max01, *t);
+
+ *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+  // duplicate into both 64-bit halves for later use with the merged p/q
+  // variables
+ *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+ max = _mm_max_epi16(max, max01);
+ int i;
+ for (i = 2; i < x; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+ max = _mm_subs_epu16(max, *l);
+ *mask = _mm_cmpeq_epi16(max, zero); // ~mask
+}
+
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+ int start, int end, __m128i *flat) {
+ int i;
+ __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+ abs_diff16(pq[start + 1], pq[0]));
+
+ for (i = start + 2; i < end; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+ __m128i ft;
+ ft = _mm_subs_epu16(max, *th);
+
+ const __m128i zero = _mm_setzero_si128();
+ *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+ const __m128i *q, int start, int end,
+ __m128i *flat) {
+ int i;
+ __m128i max =
+ _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
+
+ for (i = start + 1; i < end; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
+ max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
+ }
+
+ __m128i ft;
+ ft = _mm_subs_epu16(max, *th);
+
+ const __m128i zero = _mm_setzero_si128();
+ *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+ __m128i *flat2, int bd) {
+ // check the distance 1,2,3 against 0
+ __m128i th = _mm_set1_epi16(1);
+ th = _mm_slli_epi16(th, bd - 8);
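+  // (i.e. a flat threshold of 1 in 8-bit terms, scaled to the working bit
+  // depth)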
+ flat_mask_internal(&th, pq, 1, 4, flat);
+ flat_mask_internal(&th, pq, 4, 7, flat2);
+}
+
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+ const __m128i *q, __m128i *flat,
+ __m128i *flat2, int bd) {
+ // check the distance 1,2,3 against 0
+ __m128i th = _mm_set1_epi16(1);
+ th = _mm_slli_epi16(th, bd - 8);
+ flat_mask_internal_dual(&th, p, q, 1, 4, flat);
+ flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
+}
+
+static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0, __m128i *t80,
+ int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+ const __m128i pmin = _mm_subs_epi16(zero, *t80);
+
+ const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
+ __m128i ps1ps0_work, qs1qs0_work, work;
+ __m128i filt, filter2filter1, filter2filt, filter1filt;
+
+ ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
+ qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
+
+ work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
+ pixel_clamp(&pmin, &pmax, &work);
+ filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+
+ filt = _mm_subs_epi16(filt, work);
+ filt = _mm_subs_epi16(filt, work);
+ filt = _mm_subs_epi16(filt, work);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ pixel_clamp(&pmin, &pmax, &filt);
+ filt = _mm_and_si128(filt, *mask);
+ filt = _mm_unpacklo_epi64(filt, filt);
+
+ filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
+ pixel_clamp(&pmin, &pmax, &filter2filter1);
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
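+  // the low 64 bits now hold filter1 = (filt + 4) >> 3 (subtracted from q0),
+  // the high 64 bits hold filter2 = (filt + 3) >> 3 (added to p0)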
+
+ filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filt, one);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(*hev, filt);
+
+ filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
+ filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
+
+ qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
+ ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
+
+ pixel_clamp(&pmin, &pmax, &qs1qs0_work);
+ pixel_clamp(&pmin, &pmax, &ps1ps0_work);
+
+ *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
+ *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
+}
+
+static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
+ __m128i *qs, const __m128i *mask,
+ const __m128i *th, int bd,
+ __m128i *t80) {
+ __m128i ps0 = _mm_subs_epi16(p[0], *t80);
+ __m128i ps1 = _mm_subs_epi16(p[1], *t80);
+ __m128i qs0 = _mm_subs_epi16(q[0], *t80);
+ __m128i qs1 = _mm_subs_epi16(q[1], *t80);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i pmin = _mm_subs_epi16(zero, *t80);
+ __m128i filter = _mm_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filter);
+
+ // hev_filter
+ __m128i hev;
+ const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
+ const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
+ __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ h = _mm_subs_epu16(h, *th);
+ const __m128i ffff = _mm_cmpeq_epi16(h, h);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+
+ filter = _mm_and_si128(filter, hev);
+
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ pixel_clamp(&pmin, &pmax, &filter);
+ filter = _mm_and_si128(filter, *mask);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t4 = _mm_set1_epi16(4);
+ __m128i filter1 = _mm_adds_epi16(filter, t4);
+ __m128i filter2 = _mm_adds_epi16(filter, t3);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ pixel_clamp(&pmin, &pmax, &filter2);
+ filter1 = _mm_srai_epi16(filter1, 3);
+ filter2 = _mm_srai_epi16(filter2, 3);
+ qs0 = _mm_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &qs0);
+ ps0 = _mm_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &ps0);
+ qs[0] = _mm_adds_epi16(qs0, *t80);
+ ps[0] = _mm_adds_epi16(ps0, *t80);
+ filter = _mm_adds_epi16(filter1, one);
+ filter = _mm_srai_epi16(filter, 1);
+ filter = _mm_andnot_si128(hev, filter);
+ qs1 = _mm_subs_epi16(qs1, filter);
+ pixel_clamp(&pmin, &pmax, &qs1);
+ ps1 = _mm_adds_epi16(ps1, filter);
+ pixel_clamp(&pmin, &pmax, &ps1);
+ qs[1] = _mm_adds_epi16(qs1, *t80);
+ ps[1] = _mm_adds_epi16(ps1, *t80);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
+ __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
+ const unsigned char *lt, const unsigned char *thr, int bd) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit, limit, thresh;
+ __m128i t80;
+ get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+
+ for (i = 0; i < 7; i++) {
+ pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
+ }
+ __m128i mask, hevhev;
+ __m128i p1p0, q1q0, abs_p1p0;
+
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hevhev, &mask);
+
+ __m128i ps0ps1, qs0qs1;
+ // filter4
+ highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
+
+ __m128i flat, flat2;
+ highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
+
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_and_si128(flat2, flat);
+
+  // duplicate into both 64-bit halves for later use with the merged p/q
+  // variables
+ flat = _mm_unpacklo_epi64(flat, flat);
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+ // flat and wide flat calculations
+
+  // If flat == 0 then flat2 is zero as well, so none of the calculations below
+  // are needed. With SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
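+  // (the movemask test below is true when at least one 16-bit lane of flat is
+  // non-zero)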
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3], flat_pq[3];
+ __m128i flat2_p[6], flat2_q[6];
+ __m128i flat2_pq[6];
+ __m128i sum_p6, sum_p3;
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
+ __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
+
+ sum_p6 = _mm_add_epi16(pq[6], pq[6]);
+ sum_p3 = _mm_add_epi16(pq[3], pq[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+ work0_1 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+
+ work0 = _mm_add_epi16(sum_p3, pq[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
+ work0 = _mm_add_epi16(sum_p3, pq[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, pq[3]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, pq[2]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, pq[1]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ } // flat2
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // highbd_filter8
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+ for (i = 0; i < 3; i++) {
+ pq[i] = _mm_andnot_si128(flat, pq[i]);
+ flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ pq[i] = _mm_andnot_si128(flat2, pq[i]);
+ flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ }
+ }
+ } else {
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+ }
+}
+
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
+ const uint8_t *blt, const uint8_t *lt,
+ const uint8_t *thr, int bd) {
+ __m128i p[7], q[7], pq[7];
+ int i;
+
+ for (i = 0; i < 7; i++) {
+ p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
+ q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
+ }
+
+ highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
+
+ for (i = 0; i < 6; i++) {
+ _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
+ _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+ __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+ const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+ const uint8_t *thr1, int bd) {
+ __m128i blimit, limit, thresh, t80;
+ const __m128i zero = _mm_setzero_si128();
+
+ get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+ &t80);
+ __m128i mask;
+ highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
+ __m128i flat, flat2;
+ highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
+
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_and_si128(flat2, flat);
+ __m128i ps[2], qs[2];
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
+ // flat and wide flat calculations
+
+  // If flat == 0 then flat2 is zero as well, so none of the calculations below
+  // are needed. With SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+ __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
+ __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
+ sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
+ __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
+ sum_q = _mm_add_epi16(sum_q, sum_lq);
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+ flat_p[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
+ flat_q[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
+ __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
+ __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
+ __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
+ __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, p[5]);
+ __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
+ sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+ flat_p[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
+ flat_q[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+ sum_q3 = _mm_add_epi16(sum_q3, q[3]);
+ flat_p[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
+ flat_q[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
+
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+ _mm_add_epi16(p[1], q[0]))),
+ 4);
+ flat2_q[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+ _mm_add_epi16(p[0], q[1]))),
+ 4);
+
+ flat2_p[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+ 4);
+ flat2_q[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, p[4]);
+ flat2_p[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+ 4);
+ flat2_q[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, p[3]);
+ flat2_p[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+ 4);
+ flat2_q[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, p[2]);
+ flat2_p[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+ 4);
+ flat2_q[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, p[1]);
+ flat2_p[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+ 4);
+ flat2_q[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+ 4);
+ }
+ // highbd_filter8
+ int i;
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ p[2] = _mm_andnot_si128(flat, p[2]);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p[2] = _mm_and_si128(flat, flat_p[2]);
+ // when (flat && mask)
+ p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
+ q[2] = _mm_andnot_si128(flat, q[2]);
+ flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+ q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
+
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ // highbd_filter16
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm_andnot_si128(flat2, p[i]);
+ flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
+ q[i] = _mm_andnot_si128(flat2, q[i]);
+ flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+ q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ }
+ }
+ } else {
+ p[0] = ps[0];
+ q[0] = qs[0];
+ p[1] = ps[1];
+ q[1] = qs[1];
+ }
+}
+
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p[7], q[7];
+ int i;
+ load_highbd_pixel(s, 7, pitch, p, q);
+
+ highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
+ _limit1, _thresh1, bd);
+
+ for (i = 0; i < 6; i++) {
+ _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+ _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
+ __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+ __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
+ const uint8_t *_limit, const uint8_t *_thresh, int bd) {
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i pq[3];
+ __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
+ __m128i flat_p1p0, flat_q0q1;
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i t80;
+ const __m128i one = _mm_set1_epi16(0x1);
+
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
+ // flat_mask
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
+ flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+  // duplicate into both 64-bit halves for later use with the merged p/q
+  // variables
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_p1p0 = _mm_srli_epi16(workp_b, 3);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
+ pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
+ pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(*q2, *q2);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_q0q1 = _mm_srli_epi16(workp_a, 3);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
+ __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+ __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
+ const unsigned char *_thresh0, const unsigned char *_blimit1,
+ const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit0, limit0, thresh0;
+ __m128i t80;
+ __m128i mask, flat, work;
+ __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
+ __m128i op1, op0, oq0, oq1;
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i one = _mm_set1_epi16(0x1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ abs_p2p1 = abs_diff16(*p2, *p1);
+ abs_p1p0 = abs_diff16(*p1, *p0);
+ abs_q1q0 = abs_diff16(*q1, *q0);
+ abs_q2q1 = abs_diff16(*q2, *q1);
+
+ abs_p0q0 = abs_diff16(*p0, *q0);
+ abs_p1q1 = abs_diff16(*p1, *q1);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+ mask = _mm_max_epi16(abs_q2q1, mask);
+ work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_max_epi16(mask, abs_p2p1);
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
+ // flat_mask
+ flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
+ flat = _mm_max_epi16(flat, work);
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+ // op1
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+ _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+ *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+ workp_shft0 = _mm_add_epi16(
+ workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+ op1 = _mm_srli_epi16(workp_shft0, 3);
+
+ // op0
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
+ workp_a =
+ _mm_add_epi16(workp_a,
+ workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+ op0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+ *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
+ workp_b = _mm_add_epi16(*q1, *q2);
+ workp_shft0 = _mm_add_epi16(
+ workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ oq0 = _mm_srli_epi16(workp_shft0, 3);
+
+ // oq1
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+ *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_b = _mm_add_epi16(*q2, *q2);
+ workp_shft1 = _mm_add_epi16(
+ workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+ oq1 = _mm_srli_epi16(workp_shft1, 3);
+
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
+}
+
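+// Horizontal 6-wide filter for a single 4-pixel segment. The internal helper
+// returns p1/p0 packed into the high/low 64-bit halves of p1p0_out, and
+// likewise q1/q0 in q1q0_out.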
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
+
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+ highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
+ _blimit, _limit, _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2;
+
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+ highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+ _limit0, _thresh0, _blimit1, _limit1,
+ _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
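+// 8-wide (filter8) path for a single 4-pixel segment. The p/q values are
+// handled as packed 64-bit pairs; p1p0_out/q1q0_out receive the filtered
+// p1/p0 and q1/q0, while *p2 and *q2 are only rewritten (blended by the flat
+// mask) when some lane of flat is set.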
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i pq[4];
+ __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+ __m128i work_a, opq2, flat_p1p0, flat_q0q1;
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+ pq[3] = _mm_unpacklo_epi64(*p3, *q3);
+
+ __m128i abs_p1p0;
+
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i t80;
+ const __m128i one = _mm_set1_epi16(0x1);
+
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
+ // flat_mask4
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
+ flat = _mm_max_epi16(abs_p1p0, flat);
+ flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
+    // four is added before the shift to implement the rounding in
+    // ROUND_POWER_OF_TWO.
+
+ // o*p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+ workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ workp_c = _mm_add_epi16(workp_a, workp_c);
+
+ // o*p1
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+ workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+ // o*p0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+ workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+ workp_a = _mm_add_epi16(workp_a, workp_b);
+ opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+
+ work_a = _mm_andnot_si128(flat, pq[2]);
+ *p2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ *q2 = _mm_srli_si128(*p2, 8);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
+ const unsigned char *_limit0, const unsigned char *_thresh0,
+ const unsigned char *_blimit1, const unsigned char *_limit1,
+ const unsigned char *_thresh1, int bd) {
+ __m128i blimit0, limit0, thresh0;
+ __m128i t80;
+ __m128i mask, flat;
+ __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
+ __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i one = _mm_set1_epi16(0x1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ abs_p0q0 = abs_diff16(*p0, *q0);
+ abs_p1q1 = abs_diff16(*p1, *q1);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+ work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
+ work1 =
+      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // reused below for the flat mask
+ work0 = _mm_max_epi16(work0, work1);
+ work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
+ work2 = _mm_max_epi16(work2, work0);
+ mask = _mm_max_epi16(work2, mask);
+
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
+ flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
+ flat = _mm_max_epi16(work1, flat);
+ work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
+ flat = _mm_max_epi16(work0, flat);
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+ // filter8 need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b;
+ // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+ // o*p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // o*p1
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+ op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // o*p0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+ op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+ oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+ oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+ oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+
+ work_a = _mm_andnot_si128(flat, *q2);
+ *q2 = _mm_and_si128(flat, oq2);
+ *q2 = _mm_or_si128(work_a, *q2);
+
+ work_a = _mm_andnot_si128(flat, *p2);
+ *p2 = _mm_and_si128(flat, op2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
+}
+
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+
+ highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
+ &p1p0, _blimit, _limit, _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+
+ highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
+ _blimit0, _limit0, _thresh0, _blimit1,
+ _limit1, _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+}
+
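+// 4-wide path: only the filter/hev masks and the narrow filter are applied;
+// there is no flat (wide filter) branch.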
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
+ __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev;
+ __m128i p1p0, q1q0;
+ __m128i pq[2];
+
+ __m128i abs_p1p0;
+
+ __m128i t80;
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+
+ highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
+ __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i blimit0, limit0, thresh0;
+ __m128i mask, flat;
+ __m128i p[2], q[2];
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i abs_p0q0 = abs_diff16(*q0, *p0);
+ __m128i abs_p1q1 = abs_diff16(*q1, *p1);
+
+ __m128i abs_p1p0 = abs_diff16(*p1, *p0);
+ __m128i abs_q1q0 = abs_diff16(*q1, *q0);
+
+ const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+ const __m128i one = _mm_set1_epi16(1);
+
+ __m128i t80;
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+ mask = _mm_max_epi16(flat, mask);
+
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+}
+
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p1p0, q1q0;
+ __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+ highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
+ _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+void aom_highbd_lpf_horizontal_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ __m128i ps[2], qs[2];
+
+ highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
+ _thresh0, _blimit1, _limit1, _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
+}
+
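+// The vertical variants transpose the edge pixels into the row layout used by
+// the horizontal filters, run the shared internal filter, and transpose the
+// result back before storing.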
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i x0, x1, x2, x3, d0, d1, d2, d3;
+ __m128i p1p0, q1q0;
+ __m128i p1, q1;
+
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+
+ highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
+
+ highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
+ thresh, bd);
+
+ p1 = _mm_srli_si128(p1p0, 8);
+ q1 = _mm_srli_si128(q1q0, 8);
+
+ // transpose from 8x4 to 4x8
+ highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i ps[2], qs[2];
+
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
+
+ highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+ &d2, &d3);
+
+ highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+
+ highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
+ &d3, &d4, &d5, &d6, &d7);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+ _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+ _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+ _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+ _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
+
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x3, x2, x1, x0, p0, q0;
+ __m128i p1p0, q1q0;
+
+ x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+ x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+
+ highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
+ limit, thresh, bd);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_6_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i p0, q0, p1, q1, p2, q2;
+
+ x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+ x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
+ x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
+
+ highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
+ &p0, &q0, &q1, &q2, &d6, &d7);
+
+ highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+ _limit0, _thresh0, _blimit1, _limit1,
+ _thresh1, bd);
+
+ highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+ _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+ _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+ _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+ _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
+
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i p2, p1, p0, p3, q0;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
+ p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
+ p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
+
+ highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ // Loop filtering
+ highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
+ &p1p0, blimit, limit, thresh, bd);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
+ &d1, &d2, &d3);
+
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+ x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+ x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+ x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+ x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+ x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+ x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+ x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+
+ highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+ &d2, &d3, &d4, &d5, &d6, &d7);
+
+ highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
+ blimit0, limit0, thresh0, blimit1, limit1,
+ thresh1, bd);
+
+ highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
+ &x2, &x3, &x4, &x5, &x6, &x7);
+
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
+ _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
+ _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
+ _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
+ _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
+}
+
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ __m128i q[7], p[7], pq[7];
+ __m128i p6, p5, p4, p3;
+ __m128i p6_2, p5_2, p4_2, p3_2;
+ __m128i d0, d1, d2, d3;
+ __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
+
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+
+ highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
+ &p[3], &p[2], &p[1], &p[0]);
+
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+
+ highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
+ &q[3], &q[4], &q[5], &q[6], &d7_2);
+
+ highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
+
+ highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
+ &pq[1], &pq[0], &d0, &d1, &d2, &d3);
+
+ q[0] = _mm_srli_si128(pq[0], 8);
+ q[1] = _mm_srli_si128(pq[1], 8);
+ q[2] = _mm_srli_si128(pq[2], 8);
+ q[3] = _mm_srli_si128(pq[3], 8);
+ q[4] = _mm_srli_si128(pq[4], 8);
+ q[5] = _mm_srli_si128(pq[5], 8);
+
+ highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
+ &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
+}
+
+void aom_highbd_lpf_vertical_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i q[7], p[7];
+ __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+ __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+ __m128i d0, d7;
+ __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
+
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+
+ highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+ &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+ p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+ p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+ q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+ highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+ &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+ &q[6], &d7);
+
+ highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+
+ highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+
+ highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 000000000..b9689202a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i sign = _mm_srai_epi16(*p, 15);
+ const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+ const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
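+// After the first group of 8 coefficients has been processed, duplicate the
+// AC half of each quantization parameter into both 128-bit lanes so the
+// remaining groups use the AC values only.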
+static INLINE void update_qp(__m256i *qp) {
+ int i;
+ for (i = 0; i < 5; ++i) {
+ qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+ }
+}
+
+static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr, __m256i *qp) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+ const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+ const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ init_one_qp(&zbin, &qp[0]);
+ init_one_qp(&round, &qp[1]);
+ init_one_qp(&quant, &qp[2]);
+ init_one_qp(&dequant, &qp[3]);
+ init_one_qp(&quant_shift, &qp[4]);
+}
+
+// Note:
+// Multiplies each of the eight signed 32-bit lanes of *x by the corresponding
+// lane of *y, right-shifts every 64-bit product by 16 and stores the low
+// 32 bits of each result in *p.
+static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y,
+ __m256i *p) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ *p = _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i abs = _mm256_abs_epi32(*c);
+ const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]);
+ __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]);
+ flag2 = _mm256_or_si256(flag1, flag2);
+ const int32_t nzflag = _mm256_movemask_epi8(flag2);
+
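+  // nzflag is non-zero iff at least one |coeff| >= zbin; otherwise the whole
+  // group of 8 coefficients quantizes to zero.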
+ if (LIKELY(nzflag)) {
+ __m256i q = _mm256_add_epi32(abs, qp[1]);
+ __m256i tmp;
+ mm256_mul_shift_epi32(&q, &qp[2], &tmp);
+ q = _mm256_add_epi32(tmp, q);
+
+ mm256_mul_shift_epi32(&q, &qp[4], &q);
+ __m256i dq = _mm256_mullo_epi32(q, qp[3]);
+
+ q = _mm256_sign_epi32(q, *c);
+ dq = _mm256_sign_epi32(dq, *c);
+ q = _mm256_and_si256(q, flag2);
+ dq = _mm256_and_si256(dq, flag2);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+ const __m128i zr = _mm_setzero_si128();
+ const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+ const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+ const __m256i iscan =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
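+    // For lanes with a non-zero dequantized coefficient, record iscan + 1 and
+    // keep the running maximum as the end-of-block candidate.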
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+ const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+ __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+ cur_eob = _mm256_and_si256(cur_eob, nz);
+ *eob = _mm256_max_epi32(cur_eob, *eob);
+ } else {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ }
+}
+
+void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ const unsigned int step = 8;
+
+ __m256i qp[5], coeff;
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
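+  // Reduce the per-lane eob candidates to a single scalar maximum.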
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 000000000..58e5f98e5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
+
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
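+    // _mm_movemask_epi8 yields 4 bits per 32-bit lane; bit 4 * j is set when
+    // |coeff| >= zbin for lane j, so only those coefficients are quantized.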
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ *eob_ptr = eob_i + 1;
+}
+
+void aom_highbd_quantize_b_32x32_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
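+  // For 32x32 blocks the zbin and round values are halved, the final shift is
+  // 15 instead of 16, and the dequantized value is divided by 2.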
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob + 1;
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 000000000..e0d22522d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,296 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
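+; Absolute differences are formed as psubusw(a, b) | psubusw(b, a); pmaddwd
+; against m1 (0x0001 in every word) then sums adjacent pairs of 16-bit
+; differences into 32-bit accumulators.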
+%macro HIGH_PROCESS_4x2x4 5-6 0
+ movh m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m4, [ref1q+%5*2]
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m4, m2
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; where NxN is any of the block sizes instantiated at the end of this file
+%macro HIGH_SADNXN4D 2
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+
+; set m1
+ push srcq
+ mov srcd, 0x00010001
+ movd m1, srcd
+ pshufd m1, m1, 0x0
+ pop srcq
+
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+ ; N.B. HIGH_PROCESS outputs dwords (32 bits)
+  ; so in high bit depth even the smallest width (4) needs 128 bits, i.e. an XMM register
+ movhlps m0, m4
+ movhlps m1, m5
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0
+ paddd m6, m1
+ punpcklqdq m4, m6
+ movifnidn r4, r4mp
+ movu [r4], m4
+ RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
+HIGH_SADNXN4D 4, 16
+HIGH_SADNXN4D 16, 4
+HIGH_SADNXN4D 8, 32
+HIGH_SADNXN4D 32, 8
+HIGH_SADNXN4D 16, 64
+HIGH_SADNXN4D 64, 16
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 000000000..3398d8a2a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,374 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
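+; HIGH_SAD_FN width, height, register_count, avg
+; Shared prologue for the SAD kernels below: declares the function, sign
+; extends the strides and converts the byte pointers passed in to 16-bit
+; pixel pointers.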
+%macro HIGH_SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+ HIGH_SAD_FN 64, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ ; first half of each row
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
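+  ; reduce the four 32-bit partial sums in m0 to a single total in eax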
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
+HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
+
+; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+ HIGH_SAD_FN 32, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 8 ; highbd_sad32x8_sse2
+HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2
+
+; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+ HIGH_SAD_FN 16, %1, 5, %2
+ mov n_rowsd, %1/2
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+HIGH_SAD16XN 4 ; highbd_sad16x4_sse2
+HIGH_SAD16XN 4, 1 ; highbd_sad16x4_avg_sse2
+HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
+HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
+
+; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+ HIGH_SAD_FN 8, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m2
+ psubusw m2, [srcq+src_strideq*2]
+ por m2, m5
+ mova m5, [srcq+src_strideq*4]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*4]
+ por m3, m5
+ mova m5, [srcq+src_stride3q*2]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_stride3q*2]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
+HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
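
SSE2 has no packed absolute-difference instruction for 16-bit lanes, which is why every row above goes through the psubusw/por pair: for unsigned words, |a - b| equals (a saturating-minus b) OR (b saturating-minus a), because one of the two subtractions always saturates to zero. A small standalone check of that identity (plain C, illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Unsigned saturating subtract, as psubusw does per 16-bit lane. */
static uint16_t subus16(uint16_t a, uint16_t b) {
  return (uint16_t)(a > b ? a - b : 0);
}

int main(void) {
  /* |a - b| == (a -sat b) | (b -sat a): one OR operand is always zero. */
  for (uint32_t a = 0; a < 0x10000; a += 257) {
    for (uint32_t b = 0; b < 0x10000; b += 263) {
      uint16_t expect = (uint16_t)abs((int)a - (int)b);
      assert((subus16((uint16_t)a, (uint16_t)b) |
              subus16((uint16_t)b, (uint16_t)a)) == expect);
    }
  }
  return 0;
}
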
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 000000000..61f5b8e86
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1036 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+SECTION .text
+
+; int aom_highbd_sub_pixel_varianceNxh(const uint16_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint16_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of squared
+; errors (SSE) in the given pointer.
+
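The contract above, restated as a scalar sketch. It assumes only what is visible in this file: two-tap filters whose weight pairs come from bilin_filter_m_sse2 and sum to 16, pw_8 rounding followed by a 4-bit shift, and the zero/half offset special cases (a plain copy and pavgw) being the (16,0) and (8,8) pairs of the same scheme. Sample range is assumed to be at most 12 bits, as in the _8/_10/_12 wrappers later in this patch; names are illustrative.

#include <stdint.h>

/* Two-tap filter with weights summing to 16, rounded like the
 * pw_8 / psrlw-by-4 sequence below.  (16,0) degenerates to a copy and
 * (8,8) to (a + b + 1) >> 1, i.e. pavgw, which is why the asm can special
 * case the zero and half offsets. */
static int tap2(int a, int b, int fa, int fb) {
  return (fa * a + fb * b + 8) >> 4;
}

/* Scalar model of the kernel's contract: filter src horizontally with
 * (fxa, fxb) and vertically with (fya, fyb), then accumulate the signed
 * sum (SE, returned) and the squared sum (SSE, stored) of the difference
 * against dst.  Assumes samples of at most 12 bits so diff * diff cannot
 * overflow. */
static int highbd_subpel_variance_model(const uint16_t *src, int src_stride,
                                        int fxa, int fxb, int fya, int fyb,
                                        const uint16_t *dst, int dst_stride,
                                        int width, int height,
                                        unsigned int *sse) {
  int sum = 0;
  unsigned int sq = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const uint16_t *s = src + r * src_stride + c;
      const int top = tap2(s[0], s[1], fxa, fxb);
      const int bot = tap2(s[src_stride], s[src_stride + 1], fxa, fxb);
      const int diff = tap2(top, bot, fya, fyb) - dst[r * dst_stride + c];
      sum += diff;
      sq += (unsigned int)(diff * diff);
    }
  }
  *sse = sq;
  return sum;
}

The avg variants additionally pavgw the filtered prediction with the second predictor before the comparison, as the %2 == 1 branches below show.
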
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
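
One non-obvious step in SUM_SSE is the pcmpgtw/punpcklwd pair: SSE2 has no word-to-dword sign-extension instruction, so the mask produced by comparing 0 > v (all ones exactly when v is negative) serves as the high half of the widened value. A standalone check of that trick (plain C, illustrative):

#include <assert.h>
#include <stdint.h>

/* Widen a signed 16-bit lane to 32 bits the way SUM_SSE does: the high
 * half is the all-ones/all-zeros mask produced by pcmpgtw (0 > v). */
static int32_t widen_like_pcmpgtw(int16_t v) {
  uint16_t low = (uint16_t)v;
  uint16_t high = (0 > v) ? 0xFFFF : 0x0000; /* pcmpgtw mask */
  return (int32_t)((uint32_t)low | ((uint32_t)high << 16)); /* punpcklwd */
}

int main(void) {
  for (int32_t v = -32768; v <= 32767; ++v) {
    assert(widen_like_pcmpgtw((int16_t)v) == v);
  }
  return 0;
}
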
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+ ; and outputting to a dword.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd eax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+ add srcq, src_stridemp
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%if ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define block_height heightd
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar block_height, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; load the filter - this is the same as in the 8-bit depth version
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is no unused register, so we reuse the src_stride
+; register; src_stride has to be reloaded from the stack later when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq]
+ movu m5, [srcq+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
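
Only 8- and 16-wide kernels are generated here. Wider blocks are handled by the C wrappers in highbd_variance_sse2.c later in this patch, which run the 16-wide kernel once per 16-column strip and add up the partial SE/SSE values (see the FN macro there). A sketch of that composition, assuming the DECL prototype from that file; the wrapper name and the loop form are illustrative:

#include <stddef.h>
#include <stdint.h>

/* Prototype of the 16-wide kernel above, as declared by the DECL macro in
 * highbd_variance_sse2.c (the trailing NULLs are PIC placeholders). */
int aom_highbd_sub_pixel_variance16xh_sse2(const uint16_t *src,
                                           ptrdiff_t src_stride, int x_offset,
                                           int y_offset, const uint16_t *dst,
                                           ptrdiff_t dst_stride, int height,
                                           unsigned int *sse, void *unused0,
                                           void *unused);

/* Illustrative wrapper for a width that is a multiple of 16: each 16-wide
 * strip is measured independently and the partial SE/SSE are accumulated,
 * mirroring what the FN macro does for w > wf. */
static int subpel_variance_wide_sketch(const uint16_t *src,
                                       ptrdiff_t src_stride, int x_offset,
                                       int y_offset, const uint16_t *dst,
                                       ptrdiff_t dst_stride, int width,
                                       int height, unsigned int *sse) {
  int se = 0;
  *sse = 0;
  for (int col = 0; col < width; col += 16) {
    unsigned int sse_part;
    se += aom_highbd_sub_pixel_variance16xh_sse2(
        src + col, src_stride, x_offset, y_offset, dst + col, dst_stride,
        height, &sse_part, NULL, NULL);
    *sse += sse_part;
  }
  return se;
}
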
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 000000000..18eb03d12
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred,
+ ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+ store_diff = (int64_t *)(diff + 4 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x4);
+ store_diff = (int64_t *)(diff + 5 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x5);
+ store_diff = (int64_t *)(diff + 6 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x6);
+ store_diff = (int64_t *)(diff + 7 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x7);
+}
+
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+}
+
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+ _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
+ _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
+ _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
+ _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
+}
+
+#define STACK_V(h, fun) \
+ do { \
+ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+ fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \
+ pred + pred_stride * h, pred_stride); \
+ } while (0)
+
+#define STACK_H(w, fun) \
+ do { \
+ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+ fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \
+ } while (0)
+
+#define SUBTRACT_FUN(size) \
+ static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \
+ const uint16_t *src, ptrdiff_t src_stride, \
+ const uint16_t *pred, ptrdiff_t pred_stride)
+
+SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); }
+SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); }
+SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); }
+SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); }
+SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); }
+SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
+SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
+SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
+SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
+SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
+SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
+SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
+SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
+SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
+SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
+SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
+SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
+SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
+
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+ if (rows == 4) {
+ if (cols == 4) return subtract_4x4;
+ if (cols == 8) return subtract_8x4;
+ if (cols == 16) return subtract_16x4;
+ }
+ if (rows == 8) {
+ if (cols == 4) return subtract_4x8;
+ if (cols == 8) return subtract_8x8;
+ if (cols == 16) return subtract_16x8;
+ if (cols == 32) return subtract_32x8;
+ }
+ if (rows == 16) {
+ if (cols == 4) return subtract_4x16;
+ if (cols == 8) return subtract_8x16;
+ if (cols == 16) return subtract_16x16;
+ if (cols == 32) return subtract_32x16;
+ if (cols == 64) return subtract_64x16;
+ }
+ if (rows == 32) {
+ if (cols == 8) return subtract_8x32;
+ if (cols == 16) return subtract_16x32;
+ if (cols == 32) return subtract_32x32;
+ if (cols == 64) return subtract_64x32;
+ }
+ if (rows == 64) {
+ if (cols == 16) return subtract_16x64;
+ if (cols == 32) return subtract_32x64;
+ if (cols == 64) return subtract_64x64;
+ if (cols == 128) return subtract_128x64;
+ }
+ if (rows == 128) {
+ if (cols == 64) return subtract_64x128;
+ if (cols == 128) return subtract_128x128;
+ }
+ assert(0);
+ return NULL;
+}
+
+void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ SubtractWxHFuncType func;
+ (void)bd;
+
+ func = getSubtractFunc(rows, cols);
+ func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
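
Everything in this file reduces to the following scalar reference: a per-sample 16-bit difference of src and pred written to the diff plane, with STACK_V/STACK_H merely tiling the hand-written 4x4 to 8x8 kernels up to the larger block sizes. A minimal sketch with an illustrative name:

#include <stddef.h>
#include <stdint.h>

/* Scalar equivalent of the SSE2 kernels in this file: per-sample
 * difference of two 16-bit blocks, written to a 16-bit diff plane. */
static void highbd_subtract_block_ref(int rows, int cols, int16_t *diff,
                                      ptrdiff_t diff_stride,
                                      const uint16_t *src,
                                      ptrdiff_t src_stride,
                                      const uint16_t *pred,
                                      ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      diff[r * diff_stride + c] =
          (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
    }
  }
}
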
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 000000000..9b1b4c9de
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ for (int i = 0; i < 8; i += 2) {
+ const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
+ const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
+ __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+ __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+ v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
+ v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride * 2;
+ ref += ref_stride * 2;
+ }
+ __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
+ __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
+ __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
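
The kernel above (and the 16x16 one that follows) produce the same two quantities as this scalar reference: the signed sum of src - ref and the sum of its squares over an n x n tile of 16-bit samples. Illustrative sketch:

#include <stdint.h>

/* Scalar model of aom_highbd_calcNxNvar_*: accumulate the signed sum of
 * (src - ref) and the sum of its squares over an n x n tile. */
static void highbd_calc_var_ref(const uint16_t *src, int src_stride,
                                const uint16_t *ref, int ref_stride, int n,
                                uint32_t *sse, int *sum) {
  int s = 0;
  uint32_t sq = 0;
  for (int r = 0; r < n; ++r) {
    for (int c = 0; c < n; ++c) {
      int d = (int)src[r * src_stride + c] - (int)ref[r * ref_stride + c];
      s += d;
      sq += (uint32_t)(d * d);
    }
  }
  *sum = s;
  *sse = sq;
}
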
+
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ for (int i = 0; i < 16; ++i) {
+ const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride;
+ ref += ref_stride;
+ }
+ __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
+static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_avx2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+VAR_FN(16, 4, 16, 6);
+VAR_FN(8, 32, 8, 8);
+VAR_FN(32, 8, 8, 8);
+VAR_FN(16, 64, 16, 10);
+VAR_FN(64, 16, 16, 10);
+
+#undef VAR_FN
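
VAR_FN combines the accumulated values into the usual estimator: variance = SSE - sum^2 / (w*h), with the division done as a shift because w*h is a power of two, and the result clamped at zero. A small worked check of that arithmetic with illustrative numbers:

#include <assert.h>
#include <stdint.h>

/* The arithmetic done by the VAR_FN macro body above, isolated. */
static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  int64_t var = (int64_t)sse - (((int64_t)sum * sum) >> shift);
  return var >= 0 ? (uint32_t)var : 0;
}

int main(void) {
  /* Toy 16x16 example (shift 8, since 16*16 == 1 << 8): a block whose
   * differences are all equal to 2 has sse = 256*4 = 1024 and sum = 512,
   * so the variance is 1024 - 512*512/256 = 0. */
  assert(variance_from_sums(1024, 512, 8) == 0);
  /* One outlier of +16 on an otherwise identical block: sse = 256,
   * sum = 16, variance = 256 - 256/256 = 255. */
  assert(variance_from_sums(256, 16, 8) == 255);
  return 0;
}
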
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 000000000..0d954e178
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,318 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;unsigned int aom_highbd_calc16x16var_sse2
+;(
+; uint16_t * src_ptr,
+; int source_stride,
+; uint16_t * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(aom_highbd_calc16x16var_sse2) PRIVATE
+sym(aom_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int aom_highbd_calc8x8var_sse2
+;(
+; uint16_t * src_ptr,
+; int source_stride,
+; uint16_t * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(aom_highbd_calc8x8var_sse2) PRIVATE
+sym(aom_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8
+
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 000000000..47b052abc
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,868 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
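
The _10 and _12 helpers above normalize the accumulated values back to 8-bit scale before the variance formula runs: 10-bit samples are 4x larger, so sums scale by 4 and squared sums by 16 (shifts of 2 and 4); 12-bit samples are 16x larger, so the factors are 16 and 256 (shifts of 4 and 8). A small numeric illustration; ROUND_POWER_OF_TWO is modelled here as a round-half-up shift:

#include <assert.h>
#include <stdint.h>

/* Round-half-up shift, modelling aom's ROUND_POWER_OF_TWO. */
static int64_t rpot(int64_t v, int n) { return (v + (1LL << (n - 1))) >> n; }

int main(void) {
  /* A difference of d at 8-bit depth appears as 4*d at 10-bit depth, so a
   * 10-bit sum is 4x and a 10-bit SSE is 16x the 8-bit equivalents; the
   * shifts of 2 and 4 undo exactly that. */
  int64_t d8_sum = 100, d8_sse = 2500;
  int64_t d10_sum = 4 * d8_sum, d10_sse = 16 * d8_sse;
  assert(rpot(d10_sum, 2) == d8_sum);
  assert(rpot(d10_sse, 4) == d8_sse);
  /* 12-bit samples are 16x, so sums scale by 16 and SSE by 256. */
  int64_t d12_sum = 16 * d8_sum, d12_sse = 256 * d8_sse;
  assert(rpot(d12_sum, 4) == d8_sum);
  assert(rpot(d12_sse, 8) == d8_sse);
  return 0;
}
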
+
+#define HIGH_GET_VAR(S) \
+ void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ } \
+ \
+ void aom_highbd_10_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ } \
+ \
+ void aom_highbd_12_get##S##x##S##var_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+ sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ }
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_8_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_8_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ } \
+ \
+ uint32_t aom_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+VAR_FN(16, 4, 16, 6);
+VAR_FN(8, 32, 8, 8);
+VAR_FN(32, 8, 8, 8);
+VAR_FN(16, 64, 16, 10);
+VAR_FN(64, 16, 16, 10);
+
+#undef VAR_FN
+
+unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+// These declarations are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused);
+#define DECLS(opt) \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+
+#undef DECLS
+#undef DECL
+
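+// FN builds the sub-pixel variance functions on top of the `wf`-wide assembly
+// kernels: blocks wider than wf are processed as additional column strips at
+// +16, +32 and +48 pixels. The 12-bit variant also walks the block in bands of
+// at most 16 rows and accumulates sse in a 64-bit variable, keeping each
+// call's 32-bit sse within range before the final rescaling.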
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ int64_t var; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \
+ NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)); \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)); \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)); \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)); \
+ FN(64, 16, 16, 6, 4, opt, (int64_t))
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused);
+#define DECLS(opt) \
+ DECL(16, opt) \
+ DECL(8, opt)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ int64_t var; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \
+ w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)); \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)); \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)); \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)); \
+ FN(64, 16, 16, 6, 4, opt, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+ const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred8, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref8, int ref_stride, int bd,
+ int subpel_search) {
+  // xd is expected to be NULL only in tests.
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+      // Note: This is mostly a copy of the >=8X8 case in the
+      // build_inter_predictors() function, with some small tweaks. It assumes
+      // the luma plane (plane 0) with no subsampling (ssx == ssy == 0).
+ const int plane = 0;
+
+      // Get prerequisites.
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ssx = pd->subsampling_x;
+ const int ssy = pd->subsampling_y;
+ assert(ssx == 0 && ssy == 0);
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+
+ // Calculate subpel_x/y and x/y_step.
+ const int row_start = 0; // Because ss_y is 0.
+ const int col_start = 0; // Because ss_x is 0.
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+ int orig_pos_y = pre_y << SUBPEL_BITS;
+ orig_pos_y += mv->row * (1 << (1 - ssy));
+ int orig_pos_x = pre_x << SUBPEL_BITS;
+ orig_pos_x += mv->col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ const uint8_t *const pre =
+ pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+ const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+ pos_x & SCALE_SUBPEL_MASK,
+ pos_y & SCALE_SUBPEL_MASK };
+
+ // Get warp types.
+ const WarpedMotionParams *const wm =
+ &xd->global_motion[mi->ref_frame[ref_num]];
+ const int is_global = is_global_mv_block(mi, wm->wmtype);
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global;
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ // Get convolve parameters.
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ const InterpFilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ // Get the inter predictor.
+ const int build_for_obmc = 0;
+ av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
+ &subpel_params, sf, width, height, &conv_params,
+ filters, &warp_types, mi_x >> pd->subsampling_x,
+ mi_y >> pd->subsampling_y, plane, ref_num, mi,
+ build_for_obmc, xd, cm->allow_warped_motion);
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter =
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+
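+  // Dispatch on the sub-pixel offsets: a plain copy when both offsets are
+  // zero, a single horizontal or vertical filtering pass when only one offset
+  // is non-zero, and otherwise a separable two-pass filter through the `temp`
+  // buffer (horizontal first, then vertical).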
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ if (width >= 8) {
+ int i;
+ assert(!(width & 7));
+ /*Read 8 pixels one row at a time.*/
+ for (i = 0; i < height; i++) {
+ int j;
+ for (j = 0; j < width; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+ _mm_storeu_si128((__m128i *)comp_pred, s0);
+ comp_pred += 8;
+ ref += 8;
+ }
+ ref += ref_stride - width;
+ }
+ } else {
+ int i;
+ assert(!(width & 3));
+ /*Read 4 pixels two rows at a time.*/
+ for (i = 0; i < height; i += 2) {
+ __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
+ __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+ __m128i t0 = _mm_unpacklo_epi64(s0, s1);
+ _mm_storeu_si128((__m128i *)comp_pred, t0);
+ comp_pred += 8;
+ ref += 2 * ref_stride;
+ }
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+ NULL, -1, width, height, bd);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+ kernel, 16, width, height, bd);
+ } else {
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
+ ref_stride, CONVERT_TO_BYTEPTR(temp),
+ MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+ intermediate_height, bd);
+ aom_highbd_convolve8_vert(
+ CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+ MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+ bd);
+ }
+}
+
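+// Builds the up-sampled prediction and then averages it with `pred` in place,
+// 8 pixels per iteration, using _mm_avg_epu16 (i.e. (a + b + 1) >> 1).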
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, int subpel_search) {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+  /*The total number of pixels must be a multiple of 8; even the smallest
+    block (4x4) has 16.*/
+ assert(!(width * height & 7));
+ int n = width * height >> 3;
+ for (int i = 0; i < n; i++) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
+ comp_pred16 += 8;
+ pred += 8;
+ }
+}
+
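+// Computes the distance-weighted average (p0 * w0 + p1 * w1 + r) >>
+// DIST_PRECISION_BITS in each 16-bit lane, using saturating 16-bit adds.
+// The assert below documents the precision assumption that keeps this
+// arithmetic within 16 bits for the supported bit depths.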
+static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w0,
+ const __m128i *w1,
+ const __m128i *r,
+ void *const result) {
+ assert(DIST_PRECISION_BITS <= 4);
+ __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+ __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+ __m128i sum = _mm_adds_epu16(mult0, mult1);
+ __m128i round = _mm_adds_epu16(sum, *r);
+ __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, shift);
+}
+
+void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
+ const uint8_t *pred8, int width,
+ int height, const uint8_t *ref8,
+ int ref_stride,
+ const JNT_COMP_PARAMS *jcp_param) {
+ int i;
+ const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+ const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+ const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+ const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+ const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r =
+ _mm_set_epi16(round, round, round, round, round, round, round, round);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ if (width >= 8) {
+ // Read 8 pixels one row at a time
+ assert(!(width & 7));
+ for (i = 0; i < height; ++i) {
+ int j;
+ for (j = 0; j < width; j += 8) {
+ __m128i p0 = xx_loadu_128(ref);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+ comp_pred += 8;
+ pred += 8;
+ ref += 8;
+ }
+ ref += ref_stride - width;
+ }
+ } else {
+ // Read 4 pixels two rows at a time
+ assert(!(width & 3));
+ for (i = 0; i < height; i += 2) {
+ __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+ __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+ comp_pred += 8;
+ pred += 8;
+ ref += 2 * ref_stride;
+ }
+ }
+}
+
+void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int subpel_search) {
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ int n;
+ int i;
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+ height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+ bd, subpel_search);
+ assert(!(width * height & 7));
+ n = width * height >> 3;
+
+ const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+ const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+ const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+ const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+ const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r =
+ _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+ uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (i = 0; i < n; i++) {
+ __m128i p0 = xx_loadu_128(comp_pred16);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
+
+ comp_pred16 += 8;
+ pred += 8;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 000000000..df5449a9d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
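+// Computes sum and sse for a 4x4 high-bitdepth block: the four rows of a and
+// b are loaded as 16-bit pixels, differenced, and reduced with _mm_madd_epi16
+// (against a vector of ones for the sum, against itself for the sum of
+// squares), followed by horizontal adds.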
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ uint64_t *sse, int64_t *sum) {
+ __m128i u0, u1, u2, u3;
+ __m128i s0, s1, s2, s3;
+ __m128i t0, t1, x0, y0;
+ __m128i a0, a1, a2, a3;
+ __m128i b0, b1, b2, b3;
+ __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+ a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+ a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+ a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
+
+ b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+ b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+ b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+ b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
+
+ u0 = _mm_unpacklo_epi16(a0, a1);
+ u1 = _mm_unpacklo_epi16(a2, a3);
+ u2 = _mm_unpacklo_epi16(b0, b1);
+ u3 = _mm_unpacklo_epi16(b2, b3);
+
+ s0 = _mm_sub_epi16(u0, u2);
+ s1 = _mm_sub_epi16(u1, u3);
+
+ t0 = _mm_madd_epi16(s0, k_one_epi16);
+ t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ y0 = _mm_hadd_epi32(s3, s3);
+
+ t0 = _mm_madd_epi16(s0, s0);
+ t1 = _mm_madd_epi16(s1, s1);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ x0 = _mm_hadd_epi32(s3, s3);
+
+ *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+ *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)local_sse;
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+ sum = ROUND_POWER_OF_TWO(sum, 2);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+ sum = ROUND_POWER_OF_TWO(sum, 4);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return diff >= 0 ? (uint32_t)diff : 0;
+}
+
+// Sub-pixel
+uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+// Sub-pixel average
+
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
new file mode 100644
index 000000000..1e67d392e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
@@ -0,0 +1,811 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
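+// The DC sums below use _mm256_sad_epu8 against zero to add 8 bytes per
+// 64-bit lane in one instruction; the remaining permute/unpack/add steps
+// fold those partial sums into a single value in the low lane.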
+static INLINE __m256i dc_sum_64(const uint8_t *ref) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i y0 = _mm256_sad_epu8(x0, zero);
+ __m256i y1 = _mm256_sad_epu8(x1, zero);
+ y0 = _mm256_add_epi64(y0, y1);
+ __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
+ y0 = _mm256_add_epi64(u0, y0);
+ u0 = _mm256_unpackhi_epi64(y0, y0);
+ return _mm256_add_epi16(y0, u0);
+}
+
+static INLINE __m256i dc_sum_32(const uint8_t *ref) {
+ const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i y = _mm256_sad_epu8(x, zero);
+ __m256i u = _mm256_permute2x128_si256(y, y, 1);
+ y = _mm256_add_epi64(u, y);
+ u = _mm256_unpackhi_epi64(y, y);
+ return _mm256_add_epi16(y, u);
+}
+
+static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r);
+ dst += stride;
+ }
+}
+
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+ int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r0);
+ _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
+ dst += stride;
+ }
+}
+
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r);
+ _mm256_storeu_si256((__m256i *)(dst + 32), *r);
+ dst += stride;
+ }
+}
+
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_32(above);
+ __m256i sum_left = dc_sum_32(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum_left = _mm256_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm256_srai_epi16(sum_left, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum_left, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(left);
+ (void)above;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+// There are 32 rows in total. This function handles lines 0,1,2,3 and
+// 16,17,18,19; the next call handles lines 4,5,6,7 and 20,21,22,23, so four
+// calls cover all 32 rows.
+static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m256i t[4];
+ __m256i m = _mm256_setzero_si256();
+ const __m256i inc = _mm256_set1_epi8(4);
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ t[i] = _mm256_shuffle_epi8(*row, m);
+ __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
+ __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
+ _mm256_storeu_si256((__m256i *)dst, r0);
+ _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
+ dst += stride;
+ m = _mm256_add_epi8(m, inc);
+ }
+}
+
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
+
+ __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
+
+ __m256i v = _mm256_unpacklo_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ v = _mm256_unpackhi_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ u = _mm256_unpackhi_epi8(left_col, left_col);
+
+ v = _mm256_unpacklo_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ v = _mm256_unpackhi_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// Rectangle
+
+// TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
+// Use a header file, intrapred_common_x86.h
+static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
+ __m128i x = _mm_load_si128((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ x = _mm_sad_epu8(x, zero);
+ const __m128i high = _mm_unpackhi_epi64(x, x);
+ return _mm_add_epi16(x, high);
+}
+
+static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
+ __m128i x0 = _mm_load_si128((__m128i const *)ref);
+ __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+ const __m128i zero = _mm_setzero_si128();
+ x0 = _mm_sad_epu8(x0, zero);
+ x1 = _mm_sad_epu8(x1, zero);
+ x0 = _mm_add_epi16(x0, x1);
+ const __m128i high = _mm_unpackhi_epi64(x0, x0);
+ return _mm_add_epi16(x0, high);
+}
+
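+// For the rectangular DC predictors the left and top sums cover a different
+// number of samples, so the average is computed as
+// (sum_above + sum_left + (w + h) / 2) / (w + h); e.g. for 32x16 the code
+// adds 24 and divides by 48.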
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i top_sum = dc_sum_32_sse2(above);
+ __m128i left_sum = dc_sum_16_sse2(left);
+ left_sum = _mm_add_epi16(top_sum, left_sum);
+ uint32_t sum = _mm_cvtsi128_si32(left_sum);
+ sum += 24;
+ sum /= 48;
+ const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_32(above);
+ __m256i sum_left = dc_sum_64(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 48;
+ sum /= 96;
+ const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = dc_sum_64(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 64;
+ sum /= 128;
+ const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = dc_sum_32(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 48;
+ sum /= 96;
+ const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 40;
+ sum /= 80;
+ const __m256i row = _mm256_set1_epi8((uint8_t)sum);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i sum = dc_sum_16_sse2(left);
+ (void)above;
+
+ const __m128i eight = _mm_set1_epi16(8);
+ sum = _mm_add_epi16(sum, eight);
+ sum = _mm_srai_epi16(sum, 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i r = _mm_shuffle_epi8(sum, zero);
+ const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(left);
+ (void)above;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(left);
+ (void)above;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(left);
+ (void)above;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i sum = dc_sum_16_sse2(left);
+ (void)above;
+
+ const __m128i eight = _mm_set1_epi16(8);
+ sum = _mm_add_epi16(sum, eight);
+ sum = _mm_srai_epi16(sum, 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i r = _mm_shuffle_epi8(sum, zero);
+ const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 64, dst, stride);
+}
+
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 32, dst, stride);
+}
+
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// PAETH_PRED
+
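+// The Paeth predictor: base = left + top - topleft, and the prediction is
+// whichever of left, top or topleft is closest to base, with ties resolved
+// in that order. paeth_pred() below evaluates this for 16 pixels at a time
+// using compare masks instead of branches.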
+// Return 16 16-bit pixels in one row (__m256i)
+static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
+ const __m256i *topleft) {
+ const __m256i base =
+ _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
+
+ __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
+ __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
+ __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
+
+ __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
+ mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
+ __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
+
+ pl = _mm256_andnot_si256(mask1, *left);
+
+ ptl = _mm256_and_si256(mask2, *topleft);
+ pt = _mm256_andnot_si256(mask2, *top);
+ pt = _mm256_or_si256(pt, ptl);
+ pt = _mm256_and_si256(mask1, pt);
+
+ return _mm256_or_si256(pt, pl);
+}
+
+// Return 16 8-bit pixels in one row (__m128i)
+static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
+ const __m256i *topleft) {
+ const __m256i p0 = paeth_pred(left, top, topleft);
+ const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i p = _mm256_packus_epi16(p0, p1);
+ return _mm256_castsi256_si128(p);
+}
+
+static INLINE __m256i get_top_vector(const uint8_t *above) {
+ const __m128i x = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t0 = _mm_unpacklo_epi8(x, zero);
+ const __m128i t1 = _mm_unpackhi_epi8(x, zero);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
+}
+
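+// The per-row left pixel is broadcast with _mm256_shuffle_epi8: `rep` starts
+// at 0x8000 in every 16-bit lane, so the low byte selects left[i] and the
+// high byte (bit 7 set) yields zero, producing left[i] zero-extended to 16
+// bits; incrementing rep by one moves on to the next left pixel.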
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i x = _mm_loadl_epi64((const __m128i *)left);
+ const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+ const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+static INLINE __m256i get_left_vector(const uint8_t *left) {
+ const __m128i x = _mm_load_si128((const __m128i *)left);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+}
+
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i l = get_left_vector(left);
+ const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m256i l = get_left_vector(left);
+ const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+
+ l = get_left_vector(left + 16);
+ rep = _mm256_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ for (int j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+// Return 32 8-bit pixels in one row (__m256i)
+static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
+ const __m256i *top1,
+ const __m256i *topleft) {
+ __m256i p0 = paeth_pred(left, top0, topleft);
+ __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i x0 = _mm256_packus_epi16(p0, p1);
+
+ p0 = paeth_pred(left, top1, topleft);
+ p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i x1 = _mm256_packus_epi16(p0, p1);
+
+ return _mm256_permute2x128_si256(x0, x1, 0x20);
+}
+
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i l = get_left_vector(left);
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
+
+ _mm256_storeu_si256((__m256i *)dst, r);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m256i l = get_left_vector(left);
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+
+ l = get_left_vector(left + 16);
+ rep = _mm256_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 2; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ const __m256i l = get_left_vector(left);
+ __m256i rep = _mm256_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
new file mode 100644
index 000000000..5b2452c8e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
@@ -0,0 +1,1430 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; i += 2) {
+ *(uint32_t *)dst = dc;
+ dst += stride;
+ *(uint32_t *)dst = dc;
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_storel_epi64((__m128i *)dst, *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ _mm_store_si128((__m128i *)(dst + 16), *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ _mm_store_si128((__m128i *)(dst + 16), *row);
+ _mm_store_si128((__m128i *)(dst + 32), *row);
+ _mm_store_si128((__m128i *)(dst + 48), *row);
+ dst += stride;
+ }
+}
+
+static INLINE __m128i dc_sum_4(const uint8_t *ref) {
+ __m128i x = _mm_loadl_epi64((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ x = _mm_unpacklo_epi8(x, zero);
+ return _mm_sad_epu8(x, zero);
+}
+
+static INLINE __m128i dc_sum_8(const uint8_t *ref) {
+ __m128i x = _mm_loadl_epi64((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(x, zero);
+}
+
+static INLINE __m128i dc_sum_16(const uint8_t *ref) {
+ __m128i x = _mm_load_si128((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ x = _mm_sad_epu8(x, zero);
+ const __m128i high = _mm_unpackhi_epi64(x, x);
+ return _mm_add_epi16(x, high);
+}
+
+static INLINE __m128i dc_sum_32(const uint8_t *ref) {
+ __m128i x0 = _mm_load_si128((__m128i const *)ref);
+ __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+ const __m128i zero = _mm_setzero_si128();
+ x0 = _mm_sad_epu8(x0, zero);
+ x1 = _mm_sad_epu8(x1, zero);
+ x0 = _mm_add_epi16(x0, x1);
+ const __m128i high = _mm_unpackhi_epi64(x0, x0);
+ return _mm_add_epi16(x0, high);
+}
+
+static INLINE __m128i dc_sum_64(const uint8_t *ref) {
+ __m128i x0 = _mm_load_si128((__m128i const *)ref);
+ __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+ __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
+ __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
+ const __m128i zero = _mm_setzero_si128();
+ x0 = _mm_sad_epu8(x0, zero);
+ x1 = _mm_sad_epu8(x1, zero);
+ x2 = _mm_sad_epu8(x2, zero);
+ x3 = _mm_sad_epu8(x3, zero);
+ x0 = _mm_add_epi16(x0, x1);
+ x2 = _mm_add_epi16(x2, x3);
+ x0 = _mm_add_epi16(x0, x2);
+ const __m128i high = _mm_unpackhi_epi64(x0, x0);
+ return _mm_add_epi16(x0, high);
+}
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
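+// Divide by a small non-power-of-two count with a reciprocal multiply:
+// 0x5556 / 2^16 ~= 1/3 and 0x3334 / 2^16 ~= 1/5. Rectangular blocks first
+// shift by log2 of the shorter edge, e.g. a 4x8 block divides its 12-sample
+// sum (plus the rounding term 6 added by the caller) by 4 via shift1 = 2 and
+// then by 3 via DC_MULTIPLIER_1X2.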
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> DC_SHIFT2;
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_4(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 6;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
+
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ const uint32_t pred = _mm_cvtsi128_si32(row);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_16(left);
+ __m128i sum_above = dc_sum_4(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 10;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ const uint32_t pred = _mm_cvtsi128_si32(row);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_4(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 6;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
+
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_16(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 12;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_32(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 20;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_4(left);
+ __m128i sum_above = dc_sum_16(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 10;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_16(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 12;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_32(left);
+ __m128i sum_above = dc_sum_16(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 24;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_64(left);
+ __m128i sum_above = dc_sum_16(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 40;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i sum_left = dc_sum_8(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 20;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i sum_left = dc_sum_16(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 24;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i sum_left = dc_sum_64(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 48;
+ sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_64(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 64;
+ sum /= 128;
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_32(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 48;
+ sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_16(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);
+ sum += 40;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
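+// Each DC_TOP variant averages only the 'above' row: add half the row width
+// for rounding, arithmetic-shift right by log2(width), then broadcast the
+// resulting byte across the block.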
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_4(above);
+ const __m128i two = _mm_set1_epi16((int16_t)2);
+ sum_above = _mm_add_epi16(sum_above, two);
+ sum_above = _mm_srai_epi16(sum_above, 2);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ sum_above = _mm_packus_epi16(sum_above, sum_above);
+
+ const uint32_t pred = _mm_cvtsi128_si32(sum_above);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_4(above);
+ const __m128i two = _mm_set1_epi16((int16_t)2);
+ sum_above = _mm_add_epi16(sum_above, two);
+ sum_above = _mm_srai_epi16(sum_above, 2);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ sum_above = _mm_packus_epi16(sum_above, sum_above);
+
+ const uint32_t pred = _mm_cvtsi128_si32(sum_above);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16((uint16_t)4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16((uint16_t)4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16((uint16_t)4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
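+// DC_LEFT mirrors DC_TOP but averages the 'left' column instead, so the
+// rounding term and shift come from the block height rather than its width.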
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16((uint16_t)4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+ const uint32_t pred = _mm_cvtsi128_si32(sum_left);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16(left);
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+ const uint32_t pred = _mm_cvtsi128_si32(sum_left);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_4(left);
+ const __m128i two = _mm_set1_epi16((uint16_t)2);
+ sum_left = _mm_add_epi16(sum_left, two);
+ sum_left = _mm_srai_epi16(sum_left, 2);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16(left);
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32(left);
+ const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_4(left);
+ const __m128i two = _mm_set1_epi16((uint16_t)2);
+ sum_left = _mm_add_epi16(sum_left, two);
+ sum_left = _mm_srai_epi16(sum_left, 2);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16((uint16_t)4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32(left);
+ const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16((uint16_t)4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16(left);
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32(left);
+ const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16(left);
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
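+// DC_128 ignores both edges and fills the block with the 8-bit mid value 128
+// (0x80), hence the 0x80808080 pattern used for the 4-wide cases.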
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const uint32_t pred = 0x80808080;
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const uint32_t pred = 0x80808080;
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
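+// V_PRED replicates the 'above' row into every row of the block; the
+// dc_store_*xh helpers are reused as row-broadcast stores for the narrower
+// sizes.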
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint32_t pred = *(uint32_t *)above;
+ (void)left;
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint32_t pred = *(uint32_t *)above;
+ (void)left;
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int height) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 8);
+}
+
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 16);
+}
+
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 64);
+}
+
+static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int height) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+ const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ _mm_store_si128((__m128i *)(dst + 32), row2);
+ _mm_store_si128((__m128i *)(dst + 48), row3);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 64);
+}
+
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 32);
+}
+
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 16);
+}
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
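+// H_PRED broadcasts each 'left' pixel across its own row: the byte is first
+// duplicated with unpacklo_epi8 and then spread with the pshuflw/pshufd-style
+// shuffles below so whole rows can be written per store.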
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+ left_col = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+ left_col = _mm_unpackhi_epi64(left_col, left_col);
+ row0 = _mm_shufflelo_epi16(left_col, 0);
+ row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+ left_col = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int count) {
+ (void)above;
+ for (int i = 0; i < count; ++i) {
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+ left += 16;
+ }
+}
+
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ h_predictor_8x16xc(dst, stride, above, left, 1);
+}
+
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ h_predictor_8x16xc(dst, stride, above, left, 2);
+}
+
+static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < h; ++i) {
+ _mm_store_si128((__m128i *)dst, row[i]);
+ dst += stride;
+ }
+}
+
+static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
+ const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
+ const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
+ const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
+ const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
+
+ row[0] = _mm_unpacklo_epi64(u0, u0);
+ row[1] = _mm_unpacklo_epi64(u1, u1);
+ row[2] = _mm_unpacklo_epi64(u2, u2);
+ row[3] = _mm_unpacklo_epi64(u3, u3);
+}
+
+static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
+ const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
+ const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
+ const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
+ const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
+
+ row[0] = _mm_unpackhi_epi64(u0, u0);
+ row[1] = _mm_unpackhi_epi64(u1, u1);
+ row[2] = _mm_unpackhi_epi64(u2, u2);
+ row[3] = _mm_unpackhi_epi64(u3, u3);
+}
+
+// Process 16x8, first 4 rows
+// Use first 8 bytes of left register: xxxxxxxx33221100
+static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_low_4pixels(left, row);
+ h_pred_store_16xh(row, 4, dst, stride);
+}
+
+// Process 16x8, second 4 rows
+// Use second 8 bytes of left register: 77665544xxxxxxxx
+static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_high_4pixels(left, row);
+ h_pred_store_16xh(row, 4, dst, stride);
+}
+
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p, dst, stride);
+}
+
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p, dst, stride);
+}
+
+static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int count) {
+ int i = 0;
+ do {
+ const __m128i left_col = _mm_load_si128((const __m128i *)left);
+ const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
+ dst += stride << 2;
+
+ const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
+ dst += stride << 2;
+
+ left += 16;
+ i++;
+ } while (i < count);
+}
+
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_16xh(dst, stride, left, 2);
+}
+
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_16xh(dst, stride, left, 4);
+}
+
+static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < h; ++i) {
+ _mm_store_si128((__m128i *)dst, row[i]);
+ _mm_store_si128((__m128i *)(dst + 16), row[i]);
+ dst += stride;
+ }
+}
+
+// Process 32x8, first 4 rows
+// Use first 8 bytes of left register: xxxxxxxx33221100
+static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_low_4pixels(left, row);
+ h_pred_store_32xh(row, 4, dst, stride);
+}
+
+// Process 32x8, second 4 rows
+// Use second 8 bytes of left register: 77665544xxxxxxxx
+static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_high_4pixels(left, row);
+ h_pred_store_32xh(row, 4, dst, stride);
+}
+
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i left_col, left_col_8p;
+ (void)above;
+
+ left_col = _mm_load_si128((const __m128i *)left);
+
+ left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i left_col, left_col_8p;
+ (void)above;
+
+ left_col = _mm_load_si128((const __m128i *)left);
+
+ left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+ dst += stride << 2;
+
+ left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
+static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int height) {
+ int i = height >> 2;
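+  // Four left pixels are widened byte -> 4 copies by the two unpacklo steps;
+  // _mm_shuffle_epi32 then broadcasts a single pixel across the 16-byte row.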
+ do {
+ __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+ const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r0);
+ _mm_store_si128((__m128i *)(dst + stride), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+ const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+ const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+ _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+ left += 4;
+ dst += stride * 4;
+ } while (--i);
+}
+
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_32xh(dst, stride, left, 64);
+}
+
+static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int height) {
+ int i = height >> 2;
+ do {
+ __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+ const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r0);
+ _mm_store_si128((__m128i *)(dst + 32), r0);
+ _mm_store_si128((__m128i *)(dst + 48), r0);
+ _mm_store_si128((__m128i *)(dst + stride), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 32), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 48), r1);
+ const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+ const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+ _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
+ left += 4;
+ dst += stride * 4;
+ } while (--i);
+}
+
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 64);
+}
+
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 32);
+}
+
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 16);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
new file mode 100644
index 000000000..9aece27be
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
@@ -0,0 +1,625 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pb_1: times 16 db 1
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
+
+SECTION .text
+
+; ------------------------------------------
+; input: x, y, z, result
+;
+; trick from Pascal
+; (x+2y+z+2)>>2 can be calculated as:
+; result = avg(x,z)
+; result -= xor(x,z) & 1
+; result = avg(result,y)
+; ------------------------------------------
+%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
+ pavgb %4, %1, %3
+ pxor %3, %1
+ pand %3, [GLOBAL(pb_1)]
+ psubb %4, %3
+ pavgb %4, %2
+%endmacro
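+; Quick numeric check of the identity above: x=2, y=1, z=5 gives the exact
+; value (2 + 2*1 + 5 + 2) >> 2 = 2, while the macro computes pavgb(2,5) = 4,
+; minus (2 xor 5) & 1 = 1 -> 3, then pavgb(3,1) = 2, the same result without
+; leaving 8-bit range.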
+
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movd m2, [leftq]
+ movd m0, [aboveq]
+ pxor m1, m1
+ punpckldq m0, m2
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_8)]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_16)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ psadbw m3, m1
+ psadbw m4, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_32)]
+ psraw m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
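+; V prediction: the row of above pixels is stored unchanged into every row of
+; the block.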
+INIT_XMM sse2
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movd m0, [aboveq]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
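+; H prediction: every row is filled with its left pixel. punpcklbw replicates
+; the left bytes and pshuflw/pshufd then broadcast one pixel per row.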
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+ movifnidn leftq, leftmp
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0
+ pshufd m1, m0, 0x1
+ movd [dstq ], m0
+ movd [dstq+strideq], m1
+ pshufd m2, m0, 0x2
+ lea dstq, [dstq+strideq*2]
+ pshufd m3, m0, 0x3
+ movd [dstq ], m2
+ movd [dstq+strideq], m3
+ RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -2
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [leftq ]
+ punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
+.loop:
+ pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
+ pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
+ movq [dstq ], m1
+ movq [dstq+strideq], m2
+ pshuflw m1, m0, 0xaa
+ pshuflw m2, m0, 0xff
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+ inc lineq
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -4
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+strideq ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -8
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16 ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2 ], m1
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
new file mode 100644
index 000000000..807ed1770
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -0,0 +1,1692 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// PAETH_PRED
+
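+// Paeth selection: with base = left + top - topleft, the prediction is
+// whichever of left, top and topleft is closest to base (ties favour left,
+// then top). The vector code below evaluates this branchlessly with masks.
+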
+// Return 8 16-bit pixels in one row
+static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
+ const __m128i *topleft) {
+ const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
+
+ __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
+ __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
+ __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
+
+ __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
+ mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
+ __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
+
+ pl = _mm_andnot_si128(mask1, *left);
+
+ ptl = _mm_and_si128(mask2, *topleft);
+ pt = _mm_andnot_si128(mask2, *top);
+ pt = _mm_or_si128(pt, ptl);
+ pt = _mm_and_si128(mask1, pt);
+
+ return _mm_or_si128(pl, pt);
+}
+
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
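+  // Each 16-bit lane of rep holds 0x80 in the high byte and the row index in
+  // the low byte, so _mm_shuffle_epi8 copies left byte i into the low byte
+  // and zeroes the high byte (MSB-set control), i.e. it broadcasts left[i]
+  // zero-extended to 16 bits; adding 'one' steps to the next row's pixel.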
+
+ int i;
+ for (i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int j = 0; j < 2; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16(0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+// Return 16 8-bit pixels in one row
+static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
+ const __m128i *top1,
+ const __m128i *topleft) {
+ const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
+ const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
+ return _mm_packus_epi16(p0, p1);
+}
+
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+
+ l = _mm_load_si128((const __m128i *)(left + 16));
+ rep = _mm_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16(0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ __m128i l16;
+
+ for (int i = 0; i < 8; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+
+ rep = _mm_set1_epi16(0x8000);
+ l = _mm_load_si128((const __m128i *)(left + 16));
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 2; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i;
+ const __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i rep = _mm_set1_epi16(0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
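+// The smooth predictor blends a vertical and a horizontal interpolation:
+//   pred(r, c) = (w_h[r] * above[c] + (scale - w_h[r]) * bottom_left +
+//                 w_w[c] * left[r]  + (scale - w_w[c]) * top_right + scale)
+//                >> (sm_weight_log2_scale + 1)
+// with scale = 1 << sm_weight_log2_scale.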
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+ if (height == 4)
+ pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ else if (height == 8)
+ pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
+ else
+ pixels[1] = _mm_loadu_si128(((const __m128i *)left));
+
+ pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
+
+ const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+ const __m128i zero = _mm_setzero_si128();
+ d = _mm_unpacklo_epi8(d, zero);
+ pixels[0] = _mm_unpacklo_epi16(d, bp);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
+ __m128i *weight_h, __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+ weight_h[0] = _mm_unpacklo_epi8(t, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+ if (height == 8) {
+ const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+ weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ } else if (height == 16) {
+ const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+ weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ }
+}
+
+static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
+ const __m128i *ww, int h, uint8_t *dst,
+ ptrdiff_t stride, int second_half) {
+ const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ const __m128i gat = _mm_set1_epi32(0xc080400);
+ __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
+ __m128i d = _mm_set1_epi16(0x100);
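+  // rep walks the left pixels (starting at byte 8 when second_half is set),
+  // d walks the 16-bit weight lanes (control 0x100 picks word 0; adding 0x202
+  // per row advances to the next word), and gat gathers the low byte of each
+  // 32-bit sum into the four output pixels.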
+
+ for (int i = 0; i < h; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
+
+ __m128i b = _mm_shuffle_epi8(pixel[1], rep);
+ b = _mm_unpacklo_epi16(b, pixel[2]);
+ __m128i sum = _mm_madd_epi16(b, ww[0]);
+
+ sum = _mm_add_epi32(s, sum);
+ sum = _mm_add_epi32(sum, round);
+ sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
+
+ sum = _mm_shuffle_epi8(sum, gat);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+}
+
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 4, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(sm_weight_arrays, 4, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 8, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(sm_weight_arrays, 8, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 16, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(sm_weight_arrays, 16, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: copy of [0], height == 32 only
+// pixels[5]: copy of [1], height == 32 only
+// pixels[6]: left vector + 16, height == 32 only
+// pixels[7]: copy of [3] (right_pred), height == 32 only
+static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+ __m128i d = _mm_loadl_epi64((const __m128i *)above);
+ d = _mm_unpacklo_epi8(d, zero);
+ pixels[0] = _mm_unpacklo_epi16(d, bp);
+ pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+ pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
+
+ if (height == 4) {
+ pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ } else if (height == 8) {
+ pixels[2] = _mm_loadl_epi64((const __m128i *)left);
+ } else if (height == 16) {
+ pixels[2] = _mm_load_si128((const __m128i *)left);
+ } else {
+ pixels[2] = _mm_load_si128((const __m128i *)left);
+ pixels[4] = pixels[0];
+ pixels[5] = pixels[1];
+ pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
+ pixels[7] = pixels[3];
+ }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
+ __m128i *weight_h, __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ const int we_offset = height < 8 ? 4 : 8;
+ __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
+ weight_h[0] = _mm_unpacklo_epi8(we, zero);
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+
+ if (height == 4) {
+ we = _mm_srli_si128(we, 4);
+ __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+ __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+ weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+ weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+ } else {
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ }
+
+ if (height == 16) {
+ we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+ weight_h[0] = _mm_unpacklo_epi8(we, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(we, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ } else if (height == 32) {
+ const __m128i weight_lo =
+ _mm_loadu_si128((const __m128i *)&weight_array[32]);
+ weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ const __m128i weight_hi =
+ _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+ weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+ weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+ }
+}
+
+static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
+ const __m128i *ww, int h, uint8_t *dst,
+ ptrdiff_t stride, int second_half) {
+ const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+
+ __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
+ __m128i d = _mm_set1_epi16(0x100);
+
+ int i;
+ for (i = 0; i < h; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
+ __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
+
+ __m128i b = _mm_shuffle_epi8(pixels[2], rep);
+ b = _mm_unpacklo_epi16(b, pixels[3]);
+ __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+ __m128i sum1 = _mm_madd_epi16(b, ww[1]);
+
+ s0 = _mm_add_epi32(s0, sum0);
+ s0 = _mm_add_epi32(s0, round);
+ s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
+
+ s1 = _mm_add_epi32(s1, sum1);
+ s1 = _mm_add_epi32(s1, round);
+ s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
+
+ sum0 = _mm_packus_epi16(s0, s1);
+ sum0 = _mm_shuffle_epi8(sum0, gat);
+ _mm_storel_epi64((__m128i *)dst, sum0);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+}
+
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 4, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 4, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 8, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 8, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 16, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 16, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[8];
+ load_pixel_w8(above, left, 32, pixels);
+
+ __m128i wh[8], ww[2];
+ load_weight_w8(sm_weight_arrays, 32, wh, ww);
+
+ smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, uint32_t bw,
+ uint32_t bh) {
+ const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+ const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
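+  // sm_weight_arrays appears to be laid out by block dimension, so offsetting
+  // by bw (resp. bh) selects the weight table for this block's width (resp.
+  // height), matching the fixed 4/8/16/32 offsets used by the loaders above.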
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value =
+ _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i top_right =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+ const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
+
+ for (uint32_t y = 0; y < bh; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+ const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+ __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ const __m128i wl_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+ pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
+ pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
+
+ for (uint32_t x = 0; x < bw; x += 8) {
+ const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+ const __m128i weights_x =
+ _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+ const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
+ const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
+
+ __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+ __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+ const __m128i scale_m_weights_x =
+ _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
+ const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
+ const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
+ const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
+
+ pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
+ pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
+
+ pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
+ pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
+
+ pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
+ pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
+
+ __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ pred = _mm_shuffle_epi8(pred, gat);
+ _mm_storel_epi64((__m128i *)(dst + x), pred);
+ }
+ dst += stride;
+ }
+}
+
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
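+// SMOOTH_V keeps only the vertical term:
+//   pred(r, c) = (w_h[r] * above[c] + (scale - w_h[r]) * bottom_left +
+//                 scale / 2) >> sm_weight_log2_scale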
+
+// pixels[0]: above and below_pred interleave vector
+static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
+ const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+ d = _mm_unpacklo_epi8(d, zero);
+ pixels[0] = _mm_unpacklo_epi16(d, bp);
+}
+
+// weights[0]: weights_h vector
+// weights[1]: scale - weights_h vector
+static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
+ __m128i *weights) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+ if (height == 4) {
+ const __m128i weight =
+ _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
+ weights[0] = _mm_unpacklo_epi8(weight, zero);
+ weights[1] = _mm_sub_epi16(d, weights[0]);
+ } else if (height == 8) {
+ const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
+ weights[0] = _mm_unpacklo_epi8(weight, zero);
+ weights[1] = _mm_sub_epi16(d, weights[0]);
+ } else {
+ const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+ weights[0] = _mm_unpacklo_epi8(weight, zero);
+ weights[1] = _mm_sub_epi16(d, weights[0]);
+ weights[2] = _mm_unpackhi_epi8(weight, zero);
+ weights[3] = _mm_sub_epi16(d, weights[2]);
+ }
+}
+
+static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
+ const __m128i *weight, int h, uint8_t *dst,
+ ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+ const __m128i inc = _mm_set1_epi16(0x202);
+ const __m128i gat = _mm_set1_epi32(0xc080400);
+ __m128i d = _mm_set1_epi16(0x100);
+
+ for (int i = 0; i < h; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
+ sum = _mm_add_epi32(sum, pred_round);
+ sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+ sum = _mm_shuffle_epi8(sum, gat);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+ dst += stride;
+ d = _mm_add_epi16(d, inc);
+ }
+}
+
+void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels;
+ load_pixel_v_w4(above, left, 4, &pixels);
+
+ __m128i weights[2];
+ load_weight_v_w4(sm_weight_arrays, 4, weights);
+
+ smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels;
+ load_pixel_v_w4(above, left, 8, &pixels);
+
+ __m128i weights[2];
+ load_weight_v_w4(sm_weight_arrays, 8, weights);
+
+ smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels;
+ load_pixel_v_w4(above, left, 16, &pixels);
+
+ __m128i weights[4];
+ load_weight_v_w4(sm_weight_arrays, 16, weights);
+
+ smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
+ dst += stride << 3;
+ smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
+ d = _mm_unpacklo_epi8(d, zero);
+ pixels[0] = _mm_unpacklo_epi16(d, bp);
+ pixels[1] = _mm_unpackhi_epi16(d, bp);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
+ __m128i *weight_h) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+
+ if (height < 16) {
+ const int offset = height < 8 ? 4 : 8;
+ const __m128i weight =
+ _mm_loadu_si128((const __m128i *)&weight_array[offset]);
+ weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ } else if (height == 16) {
+ const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
+ weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ } else {
+ const __m128i weight_lo =
+ _mm_loadu_si128((const __m128i *)&weight_array[32]);
+ weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ const __m128i weight_hi =
+ _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
+ weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+ weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+ }
+}
+
+static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
+ int h, uint8_t *dst, ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+ const __m128i inc = _mm_set1_epi16(0x202);
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+ __m128i d = _mm_set1_epi16(0x100);
+
+ for (int i = 0; i < h; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
+ __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
+
+ s0 = _mm_add_epi32(s0, pred_round);
+ s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
+
+ s1 = _mm_add_epi32(s1, pred_round);
+ s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
+
+ __m128i sum01 = _mm_packus_epi16(s0, s1);
+ sum01 = _mm_shuffle_epi8(sum01, gat);
+ _mm_storel_epi64((__m128i *)dst, sum01);
+ dst += stride;
+
+ d = _mm_add_epi16(d, inc);
+ }
+}
+
+void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_v_w8(above, left, 4, pixels);
+
+ __m128i wh[2];
+ load_weight_v_w8(sm_weight_arrays, 4, wh);
+
+ smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_v_w8(above, left, 8, pixels);
+
+ __m128i wh[2];
+ load_weight_v_w8(sm_weight_arrays, 8, wh);
+
+ smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_v_w8(above, left, 16, pixels);
+
+ __m128i wh[4];
+ load_weight_v_w8(sm_weight_arrays, 16, wh);
+
+ smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
+ dst += stride << 3;
+ smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_v_w8(above, left, 32, pixels);
+
+ __m128i wh[8];
+ load_weight_v_w8(sm_weight_arrays, 32, wh);
+
+ smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
+ dst += stride << 3;
+ smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
+ dst += stride << 3;
+ smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
+ dst += stride << 3;
+ smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
+}
+
+static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, uint32_t bw,
+ uint32_t bh) {
+ const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value =
+ _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i bottom_left =
+ _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+ const __m128i round =
+ _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
+
+ for (uint32_t y = 0; y < bh; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
+ const __m128i scale_m_weights_y =
+ _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
+ const __m128i wl_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
+
+ for (uint32_t x = 0; x < bw; x += 8) {
+ const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
+ // 8 -> 16
+ const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
+ const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
+ const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
+ // top_x * weights_y + scale_m_weights_y * bottom_left
+ __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
+ __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
+
+ pred_lo = _mm_add_epi32(pred_lo, round);
+ pred_hi = _mm_add_epi32(pred_hi, round);
+ pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+ pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+ __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ pred = _mm_shuffle_epi8(pred, gat);
+ _mm_storel_epi64((__m128i *)(dst + x), pred);
+ }
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ if (height == 4)
+ pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ else if (height == 8)
+ pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
+ else
+ pixels[0] = _mm_loadu_si128(((const __m128i *)left));
+ pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
+}
+
+// weights[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
+ __m128i *weights) {
+ (void)height;
+ const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
+ weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
+}
+
+static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
+ const __m128i *weight, int h, uint8_t *dst,
+ ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i gat = _mm_set1_epi32(0xc080400);
+ __m128i rep = _mm_set1_epi16(0x8000);
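+  // Each 16-bit lane of rep is a pshufb control: the low byte selects left[i]
+  // from pixel[0] and the 0x80 high byte zeroes the upper half, so the shuffle
+  // broadcasts left[i] as a 16-bit value. Adding one per row advances to
+  // left[i + 1].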
+
+ for (int i = 0; i < h; ++i) {
+ __m128i b = _mm_shuffle_epi8(pixel[0], rep);
+ b = _mm_unpacklo_epi16(b, pixel[1]);
+ __m128i sum = _mm_madd_epi16(b, weight[0]);
+
+ sum = _mm_add_epi32(sum, pred_round);
+ sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
+
+ sum = _mm_shuffle_epi8(sum, gat);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_h_w4(above, left, 4, pixels);
+
+ __m128i weights;
+ load_weight_h_w4(sm_weight_arrays, 4, &weights);
+
+ smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
+}
+
+void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_h_w4(above, left, 8, pixels);
+
+ __m128i weights;
+ load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+ smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_h_w4(above, left, 16, pixels);
+
+ __m128i weights;
+ load_weight_h_w4(sm_weight_arrays, 8, &weights);
+
+ smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+ dst += stride << 3;
+
+ pixels[0] = _mm_srli_si128(pixels[0], 8);
+ smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
+}
+
+// pixels[0]: left vector
+// pixels[1]: right_pred vector
+// pixels[2]: left vector + 16
+// pixels[3]: right_pred vector
+static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
+
+ if (height == 4) {
+ pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
+ } else if (height == 8) {
+ pixels[0] = _mm_loadl_epi64((const __m128i *)left);
+ } else if (height == 16) {
+ pixels[0] = _mm_load_si128((const __m128i *)left);
+ } else {
+ pixels[0] = _mm_load_si128((const __m128i *)left);
+ pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
+ pixels[3] = pixels[1];
+ }
+}
+
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
+ __m128i *weight_w) {
+ (void)height;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
+ const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+ const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+ weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+ weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+}
+
+static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
+ int h, uint8_t *dst, ptrdiff_t stride,
+ int second_half) {
+ const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+ __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
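+  // For the second half of a tall block the pshufb control starts at 0x8008,
+  // so the shuffle reads left[8 + i] instead of left[i].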
+
+ for (int i = 0; i < h; ++i) {
+ __m128i b = _mm_shuffle_epi8(pixels[0], rep);
+ b = _mm_unpacklo_epi16(b, pixels[1]);
+ __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+ __m128i sum1 = _mm_madd_epi16(b, ww[1]);
+
+ sum0 = _mm_add_epi32(sum0, pred_round);
+ sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
+
+ sum1 = _mm_add_epi32(sum1, pred_round);
+ sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
+
+ sum0 = _mm_packus_epi16(sum0, sum1);
+ sum0 = _mm_shuffle_epi8(sum0, gat);
+ _mm_storel_epi64((__m128i *)dst, sum0);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_h_w8(above, left, 4, pixels);
+
+ __m128i ww[2];
+ load_weight_h_w8(sm_weight_arrays, 4, ww);
+
+ smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_h_w8(above, left, 8, pixels);
+
+ __m128i ww[2];
+ load_weight_h_w8(sm_weight_arrays, 8, ww);
+
+ smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[2];
+ load_pixel_h_w8(above, left, 16, pixels);
+
+ __m128i ww[2];
+ load_weight_h_w8(sm_weight_arrays, 16, ww);
+
+ smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
+}
+
+void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_h_w8(above, left, 32, pixels);
+
+ __m128i ww[2];
+ load_weight_h_w8(sm_weight_arrays, 32, ww);
+
+ smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
+ dst += stride << 3;
+ smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, uint32_t bw,
+ uint32_t bh) {
+ const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value =
+ _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
+ const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+ const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
+
+ for (uint32_t y = 0; y < bh; ++y) {
+ const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
+ const __m128i tr_ly =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
+
+ for (uint32_t x = 0; x < bw; x += 8) {
+ const __m128i weights_x =
+ _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
+ const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
+ const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
+ const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
+ const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
+ __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
+ __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
+
+ pred_lo = _mm_add_epi32(pred_lo, pred_round);
+ pred_hi = _mm_add_epi32(pred_hi, pred_round);
+
+ pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
+ pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
+
+ __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ pred = _mm_shuffle_epi8(pred, gat);
+ _mm_storel_epi64((__m128i *)(dst + x), pred);
+ }
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
+}
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 000000000..0bc841a7a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,107 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+ ; a c d b to a b c d
+ SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+ ; input:
+ ; m0 a
+ ; m1 b
+ ; m2 c
+ ; m3 d
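+  ; 1-D inverse Walsh-Hadamard butterfly: a += c, d -= b,
+  ; e = (a - d) >> 1 (computed in 32 bits to avoid overflow),
+  ; b = e - b, c = e - c, a -= b, d += c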
+ paddw m0, m2
+ psubw m3, m1
+
+ ; wide subtract
+ punpcklwd m4, m0
+ punpcklwd m5, m3
+ psrad m4, 16
+ psrad m5, 16
+ psubd m4, m5
+ psrad m4, 1
+ packssdw m4, m4 ; e
+
+ psubw m5, m4, m1 ; b
+ psubw m4, m2 ; c
+ psubw m0, m5
+ paddw m3, m4
+ ; m0 a
+ SWAP 1, 5 ; m1 b
+ SWAP 2, 4 ; m2 c
+ ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+ mova m3, m0
+ punpcklwd m0, m1
+ punpckhwd m3, m1
+ mova m2, m0
+ punpcklwd m0, m3
+ punpckhwd m2, m3
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
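+  ; add two 4-pixel rows of reconstructed residuals to the destination pixels
+  ; and store the result with unsigned saturation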
+ movd m%3, [outputq]
+ movd m%4, [outputq + strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%1, m%3
+ paddw m%2, m%4
+ packuswb m%1, m%5
+ packuswb m%2, m%5
+ movd [outputq], m%1
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+ mova m0, [inputq + 0]
+ packssdw m0, [inputq + 16]
+ mova m1, [inputq + 32]
+ packssdw m1, [inputq + 48]
+ psraw m0, 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE
+ REORDER_INPUTS
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ REORDER_INPUTS
+ TRANSFORM_COLS
+
+ pxor m4, m4
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4
+ lea outputq, [outputq + 2 * strideq]
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4
+
+ RET
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
new file mode 100644
index 000000000..c3c88245a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int i;
+ assert(width == 4);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; i += 4) {
+ __m128i x0 = xx_loadl_32(a + 0 * a_stride);
+ __m128i x1 = xx_loadl_32(a + 1 * a_stride);
+ __m128i x2 = xx_loadl_32(a + 2 * a_stride);
+ __m128i x3 = xx_loadl_32(a + 3 * a_stride);
+ __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
+ __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
+
+ __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
+
+ x0 = xx_loadl_32(b + 0 * b_stride);
+ x1 = xx_loadl_32(b + 1 * b_stride);
+ x2 = xx_loadl_32(b + 2 * b_stride);
+ x3 = xx_loadl_32(b + 3 * b_stride);
+ x_lo = _mm_unpacklo_epi32(x0, x1);
+ x_hi = _mm_unpacklo_epi32(x2, x3);
+
+ __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
+
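+    // _mm_sad_epu8 sums absolute differences over each 8-byte half, leaving
+    // two 16-bit partial sums in the low bits of the two 64-bit lanes.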
+ __m128i sad4x4 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad4x4);
+
+ a += 4 * a_stride;
+ b += 4 * b_stride;
+ }
+
+ // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
+ const unsigned int res =
+ _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+ return res;
+}
+
+unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int i;
+ assert(width == 8);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; i += 2) {
+ __m128i x0 = xx_loadl_64(a + 0 * a_stride);
+ __m128i x1 = xx_loadl_64(a + 1 * a_stride);
+
+ __m128i x = _mm_unpacklo_epi64(x0, x1);
+
+ x0 = xx_loadl_64(b + 0 * b_stride);
+ x1 = xx_loadl_64(b + 1 * b_stride);
+
+ __m128i y = _mm_unpacklo_epi64(x0, x1);
+
+ __m128i sad8x2 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad8x2);
+
+ a += 2 * a_stride;
+ b += 2 * b_stride;
+ }
+
+ const unsigned int res =
+ _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+ return res;
+}
+
+unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int i;
+ assert(width == 16);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ __m128i x = xx_loadu_128(a);
+ __m128i y = xx_loadu_128(b);
+
+ __m128i sad16x1 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad16x1);
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+ return res;
+}
+
+unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int i, j;
+ assert(width == 32);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 2; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+ __m128i sad32_half = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad32_half);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+ return res;
+}
+
+unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int i, j;
+ assert(width == 64);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 4; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+ __m128i sad64_quarter = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad64_quarter);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+ return res;
+}
+
+unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int i, j;
+ assert(width == 128);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 8; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+ __m128i sad64_quarter = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad64_quarter);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
+
+ return res;
+}
+
+#define jnt_sadMxN_sse2(m, n) \
+ unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint8_t comp_pred[m * n]; \
+ aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
+ return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \
+ }
+
+#define jnt_sadMxN_avx2(m, n) \
+ unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint8_t comp_pred[m * n]; \
+ aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
+ return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \
+ }
+
+/* clang-format off */
+jnt_sadMxN_sse2(128, 128)
+jnt_sadMxN_sse2(128, 64)
+jnt_sadMxN_sse2(64, 128)
+jnt_sadMxN_sse2(64, 64)
+jnt_sadMxN_sse2(64, 32)
+jnt_sadMxN_sse2(32, 64)
+jnt_sadMxN_sse2(32, 32)
+jnt_sadMxN_sse2(32, 16)
+jnt_sadMxN_sse2(16, 32)
+jnt_sadMxN_sse2(16, 16)
+jnt_sadMxN_sse2(16, 8)
+jnt_sadMxN_sse2(8, 16)
+jnt_sadMxN_sse2(8, 8)
+jnt_sadMxN_sse2(8, 4)
+jnt_sadMxN_sse2(4, 8)
+jnt_sadMxN_sse2(4, 4)
+jnt_sadMxN_sse2(4, 16)
+jnt_sadMxN_sse2(16, 4)
+jnt_sadMxN_sse2(8, 32)
+jnt_sadMxN_sse2(32, 8)
+jnt_sadMxN_sse2(16, 64)
+jnt_sadMxN_sse2(64, 16)
+ /* clang-format on */
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
new file mode 100644
index 000000000..f9a41a210
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
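+// Computes the distance-weighted average of two prediction blocks, 16 pixels
+// at a time: maddubs on interleaved (p0, p1) bytes and (w0, w1) weights gives
+// w0 * p0 + w1 * p1 per 16-bit lane, which is rounded, shifted down by
+// DIST_PRECISION_BITS and re-packed to bytes.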
+static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w, const __m128i *r,
+ void *const result) {
+ __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+ __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+ __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+ __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+ __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+ __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+ __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+ __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const JNT_COMP_PARAMS *jcp_param) {
+ int i;
+ const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+ const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+ const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+ w1, w0, w1, w0);
+ const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r =
+ _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+ if (width >= 16) {
+    // Read 16 pixels, one row at a time
+ assert(!(width & 15));
+ for (i = 0; i < height; ++i) {
+ int j;
+ for (j = 0; j < width; j += 16) {
+ __m128i p0 = xx_loadu_128(ref);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 16;
+ }
+ ref += ref_stride - width;
+ }
+ } else if (width >= 8) {
+    // Read 8 pixels, two rows at a time
+ assert(!(width & 7));
+ assert(!(width & 1));
+ for (i = 0; i < height; i += 2) {
+ __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+ __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 2 * ref_stride;
+ }
+ } else {
+    // Read 4 pixels, four rows at a time
+ assert(!(width & 3));
+ assert(!(height & 3));
+ for (i = 0; i < height; i += 4) {
+ const uint8_t *row0 = ref + 0 * ref_stride;
+ const uint8_t *row1 = ref + 1 * ref_stride;
+ const uint8_t *row2 = ref + 2 * ref_stride;
+ const uint8_t *row3 = ref + 3 * ref_stride;
+
+ __m128i p0 =
+ _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
+ row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
+ row3[0], row3[1], row3[2], row3[3]);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 4 * ref_stride;
+ }
+ }
+}
+
+void aom_jnt_comp_avg_upsampled_pred_ssse3(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
+ int n;
+ int i;
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  /* The total number of pixels must be a multiple of 16 (e.g., 4x4). */
+ assert(!(width * height & 15));
+ n = width * height >> 4;
+
+ const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+ const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+ const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+ w1, w0, w1, w0);
+ const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r =
+ _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+ for (i = 0; i < n; i++) {
+ __m128i p0 = xx_loadu_128(comp_pred);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ }
+}
+
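+// Each jnt sub-pixel avg variance kernel bilinearly filters the source block
+// in two passes, blends the result with second_pred using the jnt weights,
+// and then measures the variance against the reference block b.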
+#define JNT_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \
+ jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+ }
+
+JNT_SUBPIX_AVG_VAR(128, 128)
+JNT_SUBPIX_AVG_VAR(128, 64)
+JNT_SUBPIX_AVG_VAR(64, 128)
+JNT_SUBPIX_AVG_VAR(64, 64)
+JNT_SUBPIX_AVG_VAR(64, 32)
+JNT_SUBPIX_AVG_VAR(32, 64)
+JNT_SUBPIX_AVG_VAR(32, 32)
+JNT_SUBPIX_AVG_VAR(32, 16)
+JNT_SUBPIX_AVG_VAR(16, 32)
+JNT_SUBPIX_AVG_VAR(16, 16)
+JNT_SUBPIX_AVG_VAR(16, 8)
+JNT_SUBPIX_AVG_VAR(8, 16)
+JNT_SUBPIX_AVG_VAR(8, 8)
+JNT_SUBPIX_AVG_VAR(8, 4)
+JNT_SUBPIX_AVG_VAR(4, 8)
+JNT_SUBPIX_AVG_VAR(4, 4)
+JNT_SUBPIX_AVG_VAR(4, 16)
+JNT_SUBPIX_AVG_VAR(16, 4)
+JNT_SUBPIX_AVG_VAR(8, 32)
+JNT_SUBPIX_AVG_VAR(32, 8)
+JNT_SUBPIX_AVG_VAR(16, 64)
+JNT_SUBPIX_AVG_VAR(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
new file mode 100644
index 000000000..9d88b5e49
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -0,0 +1,2385 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
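+// Per-byte |a - b| computed with two saturating unsigned subtractions.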
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ *d0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+ *d1 = _mm_srli_si128(*d0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4,
+ __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, ww0, ww1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+ *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1,
+ 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1,
+ 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1,
+ 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0,
+ __m128i *d1, __m128i *d2,
+ __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, w2, w3, w4, w5;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d1 = _mm_srli_si128(*d0, 8);
+ *d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0d1,
+ __m128i *d2d3, __m128i *d4d5,
+ __m128i *d6d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d2d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w6 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ w7 = _mm_unpackhi_epi16(
+ w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+ *d4d5 = _mm_unpacklo_epi32(
+ w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ *d6d7 = _mm_unpackhi_epi32(
+ w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose16x8_8x16_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+ __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+ __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ w1 = _mm_unpacklo_epi8(*x2, *x3);
+ w2 = _mm_unpacklo_epi8(*x4, *x5);
+ w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+ w8 = _mm_unpacklo_epi8(*x8, *x9);
+ w9 = _mm_unpacklo_epi8(*x10, *x11);
+ w10 = _mm_unpacklo_epi8(*x12, *x13);
+ w11 = _mm_unpacklo_epi8(*x14, *x15);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ *d0 = _mm_unpacklo_epi64(w6, w14);
+ *d1 = _mm_unpackhi_epi64(w6, w14);
+ *d2 = _mm_unpacklo_epi64(w7, w15);
+ *d3 = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ *d4 = _mm_unpacklo_epi64(w6, w14);
+ *d5 = _mm_unpackhi_epi64(w6, w14);
+ *d6 = _mm_unpacklo_epi64(w7, w15);
+ *d7 = _mm_unpackhi_epi64(w7, w15);
+}
+
+// This function treats its input as two parallel 8x4 matrices, transposes each
+// of them independently while flipping the second matrix horizontally. Used
+// for the inverse transpose of the 14-tap filter pq pairs.
+static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *pq0, __m128i *pq1,
+ __m128i *pq2, __m128i *pq3) {
+ __m128i w10, w11, w12, w13;
+ __m128i w0, w1, w2, w3, w4, w5;
+ __m128i d0, d1, d2, d3;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w10 = _mm_unpacklo_epi8(
+ *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
+ w11 = _mm_unpacklo_epi8(
+ *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
+ w12 = _mm_unpacklo_epi8(
+ *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
+ w13 = _mm_unpacklo_epi8(
+ *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
+
+ w4 = _mm_unpackhi_epi16(
+ w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpackhi_epi16(
+ w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ *pq0 = _mm_unpacklo_epi64(d0, d1); // pq
+ *pq1 = _mm_unpackhi_epi64(d0, d1); // pq
+ *pq2 = _mm_unpacklo_epi64(d2, d3); // pq
+ *pq3 = _mm_unpackhi_epi64(d2, d3); // pq
+}
+
+static INLINE void transpose8x16_16x8_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+ __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+ __m128i *d12d13, __m128i *d14d15) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ w1 = _mm_unpacklo_epi8(*x2, *x3);
+ w2 = _mm_unpacklo_epi8(*x4, *x5);
+ w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+ w8 = _mm_unpackhi_epi8(*x0, *x1);
+ w9 = _mm_unpackhi_epi8(*x2, *x3);
+ w10 = _mm_unpackhi_epi8(*x4, *x5);
+ w11 = _mm_unpackhi_epi8(*x6, *x7);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ *d0d1 = _mm_unpacklo_epi64(w6, w14);
+ *d2d3 = _mm_unpackhi_epi64(w6, w14);
+ *d4d5 = _mm_unpacklo_epi64(w7, w15);
+ *d6d7 = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ *d8d9 = _mm_unpacklo_epi64(w6, w14);
+ *d10d11 = _mm_unpackhi_epi64(w6, w14);
+ *d12d13 = _mm_unpacklo_epi64(w7, w15);
+ *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
+// This function treats its input as two parallel 8x4 matrices, transposes each
+// of them to 4x8 independently while flipping the second matrix horizontally.
+// Used to create the 14-tap filter pq pairs.
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *q0p0,
+ __m128i *q1p1, __m128i *q2p2,
+ __m128i *q3p3, __m128i *q4p4,
+ __m128i *q5p5, __m128i *q6p6,
+ __m128i *q7p7) {
+ __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi8(
+ *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+ w3 = _mm_unpackhi_epi8(
+ *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ww2 = _mm_unpacklo_epi16(
+ w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
+ ww3 = _mm_unpackhi_epi16(
+ w2,
+ w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
+
+ *q7p7 = _mm_unpacklo_epi32(
+ ww0,
+ _mm_srli_si128(
+ ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww0, 4),
+      ww3);  // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(
+ ww0,
+ _mm_slli_si128(
+          ww3, 4));  // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww0, 12),
+ ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(
+ ww1,
+ _mm_srli_si128(
+ ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww1, 4),
+ ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(
+ ww1,
+ _mm_slli_si128(
+ ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww1, 12),
+ ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+}
+
+static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0, __m128i *ps1ps0) {
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i t3t4 =
+ _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+ filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
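+  /* Duplicating each byte into both halves of a 16-bit lane and shifting */
+  /* right by 11 (8 + 3) emulates a signed >> 3 on bytes, which SSE2 lacks. */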
+ filter2filter1 =
+      _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // widen to 16 bit
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+  filter = _mm_unpacklo_epi8(filter, filter);  // widen to 16 bit
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
+ hev1 = _mm_srli_si128(filter2filter1, 8);
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0) {
+ const __m128i t3t4 =
+ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+ filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi64(filter, filter);
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
+ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter = _mm_srai_epi16(filter, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter);
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+
+ hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
+ filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
+
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, flat, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+  /* (abs(q1 - q0), abs(p1 - p0)) */
+ flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ hev = _mm_unpacklo_epi8(flat, zero);
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi32(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
+
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+  /* (abs(q1 - q0), abs(p1 - p0)) */
+ __m128i flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ hev = _mm_unpacklo_epi8(flat, zero);
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+
+ /* const int8_t mask = filter_mask2(*limit, *blimit, */
+ /* p1, p0, q0, q1); */
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi64(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
+
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+ const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+
+ __m128i qs1qs0, ps1ps0;
+ __m128i p1, p0, q0, q1;
+
+ p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+ p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+ q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p));
+ q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
+
+ xx_storel_32(s - 1 * p, ps1ps0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
+ xx_storel_32(s + 0 * p, qs1qs0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
+}
+
+void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+ const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh) {
+ __m128i p1p0, q1q0;
+ __m128i p1, p0, q0, q1;
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+
+ __m128i x0, x1, x2, x3;
+ __m128i d0, d1, d2, d3;
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+
+ transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
+
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
+
+ // Transpose 8x4 to 4x8
+ p1 = _mm_srli_si128(p1p0, 4);
+ q1 = _mm_srli_si128(q1q0, 4);
+
+ transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+ xx_storel_32(s + 0 * p - 2, d0);
+ xx_storel_32(s + 1 * p - 2, d1);
+ xx_storel_32(s + 2 * p - 2, d2);
+ xx_storel_32(s + 3 * p - 2, d3);
+}
+
+static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
+ xx_storel_32(s - (num + 1) * p, x);
+ xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
+
+ p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
+ q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ __m128i fe, ff, work;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
+ // loopfilter done
+
+ __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+  // If flat == 0 then flat2 is zero as well and none of the calculations
+  // below are needed. SSE4.1 equivalent: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p6, sum_q6;
+ __m128i sum_p3, sum_q3, res_p, res_q;
+
+ p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
+ pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
+ _mm_add_epi16(p1_16, q0_16))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
+ _mm_add_epi16(p0_16, q1_16))),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(p6_16, p6_16);
+ sum_q6 = _mm_add_epi16(q6_16, q6_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ // work with flat2
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat = _mm_unpacklo_epi64(flat, flat);
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
+}
+
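+/* Editor's note: the following stand-alone helper is an illustrative scalar
+ * sketch, not part of the upstream aom change.  It spells out the "flat"
+ * (samples 1..3) and "flat2" (samples 4..6) decisions that the SSE2 code
+ * above evaluates with byte-wise maxima, a saturating subtract of 1 and a
+ * compare against zero.  The names absdiff_u8_sketch and lpf_flat_sketch are
+ * invented for clarity. */
+static unsigned char absdiff_u8_sketch(unsigned char a, unsigned char b) {
+  return (unsigned char)(a > b ? a - b : b - a);
+}
+
+static int lpf_flat_sketch(const unsigned char *pp, const unsigned char *qq,
+                           int first, int last) {
+  // pp[i]/qq[i] are the i-th pixels on the two sides of the edge; the
+  // segment is flat when every sample in [first, last] stays within 1 of
+  // the edge pixels pp[0]/qq[0].  flat uses first = 1, last = 3; flat2 uses
+  // first = 4, last = 6 and is only evaluated where flat is already set.
+  int i;
+  unsigned char m = 0;
+  for (i = first; i <= last; i++) {
+    const unsigned char dp = absdiff_u8_sketch(pp[i], pp[0]);
+    const unsigned char dq = absdiff_u8_sketch(qq[i], qq[0]);
+    if (dp > m) m = dp;
+    if (dq > m) m = dq;
+  }
+  return m <= 1;
+}
+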
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i flat2_pq[6], flat_pq[3];
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
+
+ p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ __m128i fe, ff, work;
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(fe, fe);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the subsequent "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_srli_si128(qs0ps0, 8);
+ // loopfilter done
+
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+  // If flat == 0 then flat2 is zero as well and we don't need any of the
+  // calculations below.
+  // With SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pq_16[7];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p6;
+ __m128i sum_p3;
+
+ pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
+ pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
+ pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
+ pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
+ pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
+ pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
+ pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
+ q0_16 = _mm_srli_si128(pq_16[0], 8);
+ q1_16 = _mm_srli_si128(pq_16[1], 8);
+ q2_16 = _mm_srli_si128(pq_16[2], 8);
+ q3_16 = _mm_srli_si128(pq_16[3], 8);
+ q4_16 = _mm_srli_si128(pq_16[4], 8);
+ q5_16 = _mm_srli_si128(pq_16[5], 8);
+
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
+ __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
+
+ sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
+ sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q5_16);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
+ work0_1 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q2_16);
+
+ work0 = _mm_add_epi16(sum_p3, pq_16[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+ flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
+ flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q1_16);
+ sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
+ work0 = _mm_add_epi16(sum_p3, pq_16[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+ flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ flat2 = _mm_unpacklo_epi32(flat2, flat2);
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+ flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
+ flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
+
+ sum_p = _mm_sub_epi16(sum_p, q4_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+ flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q3_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+ flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q2_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+ flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q1_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
+}
+
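+/* Editor's note: an illustrative scalar form, not part of the upstream aom
+ * change, of the widest ("flat2") 13-tap average for the two pixels next to
+ * the edge, as far as it can be read off the pixelFilter_p/pixelFilter_q
+ * running sums built above.  The helper name and the RND4_SKETCH macro are
+ * invented. */
+#define RND4_SKETCH(x) (((x) + 8) >> 4)
+static void wide_filter_edge_sketch(const int p[7], const int q[7], int *op0,
+                                    int *oq0) {
+  // p[0..6] = p0..p6, q[0..6] = q0..q6 (13 taps, coefficient sum 16)
+  *op0 = RND4_SKETCH(p[6] + p[5] + p[4] + p[3] + p[2] + 2 * p[1] + 2 * p[0] +
+                     2 * q[0] + q[1] + q[2] + q[3] + q[4] + q[5]);
+  *oq0 = RND4_SKETCH(q[6] + q[5] + q[4] + q[3] + q[2] + 2 * q[1] + 2 * q[0] +
+                     2 * p[0] + p[1] + p[2] + p[3] + p[4] + p[5]);
+}
+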
+void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
+ q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
+ q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
+ q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
+
+ q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
+ _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
+
+ q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
+
+ q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
+ _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
+
+ lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+ &limit, &thresh);
+
+ store_buffer_horz_8(q0p0, p, 0, s);
+ store_buffer_horz_8(q1p1, p, 1, s);
+ store_buffer_horz_8(q2p2, p, 2, s);
+ store_buffer_horz_8(q3p3, p, 3, s);
+ store_buffer_horz_8(q4p4, p, 4, s);
+ store_buffer_horz_8(q5p5, p, 5, s);
+}
+
+static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+    // Since SSE2 has no unsigned byte comparison, the idea is to detect at
+    // least one case where X > limit, which means the corresponding mask bit
+    // is set.
+    // To achieve that we take the global maximum of all the abs(x - y) inputs
+    // and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2. If that maximum is > limit
+    // the mask is set, otherwise it is not.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the subsequent "merged variables" usage
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = abs_diff(q2p2, q1p1);
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+    // replicate for the subsequent "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+  // 5-tap filter
+  // needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+
+ // op1
+ workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
+ _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+ p2_16); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+ 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_shft1 = _mm_srli_epi16(workp_a, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
+ p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_add_epi16(q1_16, q2_16);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
+ p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+ 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
+}
+
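+/* Editor's note: an illustrative scalar sketch, not part of the upstream aom
+ * change, of the 5-tap averages that the code above accumulates in the
+ * workp_a/workp_b running sums (the per-step comments give the same totals).
+ * The helper name and the RND3_SKETCH macro are invented. */
+#define RND3_SKETCH(x) (((x) + 4) >> 3)
+static void filter6_taps_sketch(int p2, int p1, int p0, int q0, int q1, int q2,
+                                int *op1, int *op0, int *oq0, int *oq1) {
+  *op1 = RND3_SKETCH(p2 * 3 + p1 * 2 + p0 * 2 + q0);
+  *op0 = RND3_SKETCH(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1);
+  *oq0 = RND3_SKETCH(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2);
+  *oq1 = RND3_SKETCH(p0 + q0 * 2 + q1 * 2 + q2 * 3);
+}
+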
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
+ *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+    // Since SSE2 has no unsigned byte comparison, the idea is to detect at
+    // least one case where X > limit, which means the corresponding mask bit
+    // is set.
+    // To achieve that we take the global maximum of all the abs(x - y) inputs
+    // and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2. If that maximum is > limit
+    // the mask is set, otherwise it is not.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the subsequent "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = abs_diff(q2p2, q1p1);
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+    // replicate for the subsequent "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+  // 5-tap filter
+  // needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+ pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_srli_si128(pq0_16, 8);
+ q2_16 = _mm_srli_si128(pq2_16, 8);
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_b = _mm_srli_epi16(workp_b, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
+ pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
+ pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_a = _mm_srli_epi16(workp_a, 3);
+
+ flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
+}
+
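+/* Editor's note: an illustrative scalar sketch, not part of the upstream aom
+ * change, of the filter mask whose vectorized form appears in the two
+ * lpf_internal_6 variants above; it restates the "mask |= ..." reference
+ * comments in plain C.  Names are invented; ABS_SKETCH avoids pulling in
+ * abs() from <stdlib.h>. */
+#define ABS_SKETCH(x) ((x) < 0 ? -(x) : (x))
+static int filter6_mask_sketch(int limit, int blimit, int p2, int p1, int p0,
+                               int q0, int q1, int q2) {
+  int mask = 0;
+  mask |= (ABS_SKETCH(p1 - p0) > limit) * -1;
+  mask |= (ABS_SKETCH(q1 - q0) > limit) * -1;
+  mask |= (ABS_SKETCH(p2 - p1) > limit) * -1;
+  mask |= (ABS_SKETCH(q2 - q1) > limit) * -1;
+  mask |= (ABS_SKETCH(p0 - q0) * 2 + ABS_SKETCH(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;  // all-ones where the filter may run, zero where it must not
+}
+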
+void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i p1p0, q1q0;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+ p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+ p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+ q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+ q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+ q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+
+ lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ xx_storel_32(s - 1 * p, p1p0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
+ xx_storel_32(s + 0 * p, q1q0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+}
+
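+/* Editor's note: an illustrative helper, not part of the upstream aom change,
+ * making the pixel addressing of the horizontal wrappers above explicit: with
+ * stride p, pk sits k + 1 rows above the edge row s and qk sits k rows below
+ * it.  The function names are invented. */
+static unsigned char *lpf_pk_sketch(unsigned char *s, int stride, int k) {
+  return s - (k + 1) * stride;  // p0 = s - stride, p1 = s - 2 * stride, ...
+}
+static unsigned char *lpf_qk_sketch(unsigned char *s, int stride, int k) {
+  return s + k * stride;  // q0 = s, q1 = s + stride, ...
+}
+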
+void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i p1p0, q1q0;
+
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+ lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
+
+ q3p3 = _mm_unpacklo_epi32(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ // filter_mask and hev_mask
+
+  // Since SSE2 has no unsigned byte comparison, the idea is to detect at
+  // least one case where X > limit, which means the corresponding mask bit
+  // is set.
+  // To achieve that we take the global maximum of all the abs(x - y) inputs
+  // and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2. If that maximum is > limit
+  // the mask is set, otherwise it is not.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+  // replicate for the subsequent "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+  // replicate for the subsequent "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+  // filter8: needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_add_epi16(workp_a, workp_b);
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+
+ opq2 = _mm_packus_epi16(workp_c, workp_c);
+
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 4);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
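+/* Editor's note: an illustrative scalar sketch, not part of the upstream aom
+ * change, of the 7-tap averages that lpf_internal_8_sse2 above builds with
+ * the workp_a/workp_b running sums; the helper name and the RND3B_SKETCH
+ * macro are invented. */
+#define RND3B_SKETCH(x) (((x) + 4) >> 3)
+static void filter8_taps_sketch(int p3, int p2, int p1, int p0, int q0, int q1,
+                                int q2, int q3, int *op2, int *op1, int *op0,
+                                int *oq0, int *oq1, int *oq2) {
+  *op2 = RND3B_SKETCH(p3 * 3 + p2 * 2 + p1 + p0 + q0);
+  *op1 = RND3B_SKETCH(p3 * 2 + p2 + p1 * 2 + p0 + q0 + q1);
+  *op0 = RND3B_SKETCH(p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2);
+  *oq0 = RND3B_SKETCH(p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3);
+  *oq1 = RND3B_SKETCH(p1 + p0 + q0 + q1 * 2 + q2 + q3 * 2);
+  *oq2 = RND3B_SKETCH(p0 + q0 + q1 + q2 * 2 + q3 * 3);
+}
+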
+static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
+
+ q3p3 = _mm_unpacklo_epi64(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ {
+ // filter_mask and hev_mask
+
+    // Since SSE2 has no unsigned byte comparison, the idea is to detect at
+    // least one case where X > limit, which means the corresponding mask bit
+    // is set.
+    // To achieve that we take the global maximum of all the abs(x - y) inputs
+    // and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2. If that maximum is > limit
+    // the mask is set, otherwise it is not.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate for the subsequent "merged variables" usage
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+    // replicate for the subsequent "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+  // filter8: needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
+
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 8);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
+ p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
+ p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
+ p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
+ q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
+ q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
+ q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
+ q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
+
+ lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ xx_storel_32(s - 1 * p, p1p0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
+ xx_storel_32(s + 0 * p, q1q0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+ xx_storel_32(s - 3 * p, p2);
+ xx_storel_32(s + 2 * p, q2);
+}
+
+void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ __m128i thresh =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+
+ q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 4 * p)));
+ q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+ q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+ _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+
+ q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 5 * p)));
+
+ q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 6 * p)));
+
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
+ _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+ _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
+ _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+ _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
+ _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+ _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
+}
+
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+ q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+ lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+}
+
+void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i p1, p0, q0, q1;
+ __m128i qs1qs0, ps1ps0;
+
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+
+ __m128i l = _mm_unpacklo_epi64(blimit, limit);
+
+ __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
+
+ __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
+
+ __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
+
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
+}
+
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i p0, q0, q1, p1;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i qs1qs0, ps1ps0;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+
+ __m128i l = _mm_unpacklo_epi64(blimit, limit);
+
+ __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
+
+ __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
+
+ __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
+
+ x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
+
+ transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
+ &q1);
+
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+
+ p1 = _mm_srli_si128(ps1ps0, 8);
+ q1 = _mm_srli_si128(qs1qs0, 8);
+
+ transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
+ &d5, &d6, &d7);
+
+ xx_storel_32((s - 2 + 0 * p), d0);
+ xx_storel_32((s - 2 + 1 * p), d1);
+ xx_storel_32((s - 2 + 2 * p), d2);
+ xx_storel_32((s - 2 + 3 * p), d3);
+ xx_storel_32((s - 2 + 4 * p), d4);
+ xx_storel_32((s - 2 + 5 * p), d5);
+ xx_storel_32((s - 2 + 6 * p), d6);
+ xx_storel_32((s - 2 + 7 * p), d7);
+}
+
+void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x2, x1, x0, x3;
+ __m128i p0, q0;
+ __m128i p1p0, q1q0;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+ x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+
+ transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+
+ lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
+
+ transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+ xx_storel_32(s + 0 * p - 2, d0);
+ xx_storel_32(s + 1 * p - 2, d1);
+ xx_storel_32(s + 2 * p - 2, d2);
+ xx_storel_32(s + 3 * p - 2, d3);
+}
+
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i p0, q0;
+ __m128i p1p0, q1q0;
+ __m128i d0d1, d2d3, d4d5, d6d7;
+
+ x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
+
+ transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+ &d6d7);
+
+ d1 = _mm_srli_si128(d0d1, 8);
+ d3 = _mm_srli_si128(d2d3, 8);
+ d5 = _mm_srli_si128(d4d5, 8);
+ d7 = _mm_srli_si128(d6d7, 8);
+
+ lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ xx_storel_32((s - 2 + 0 * p), d0);
+ xx_storel_32((s - 2 + 1 * p), d1);
+ xx_storel_32((s - 2 + 2 * p), d2);
+ xx_storel_32((s - 2 + 3 * p), d3);
+ xx_storel_32((s - 2 + 4 * p), d4);
+ xx_storel_32((s - 2 + 5 * p), d5);
+ xx_storel_32((s - 2 + 6 * p), d6);
+ xx_storel_32((s - 2 + 7 * p), d7);
+}
+
+void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+ __m128i p0, q0;
+ __m128i x2, x1, x0, x3;
+ __m128i q1q0, p1p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
+ x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
+
+ transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+ // Loop filtering
+ lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
+
+ transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
+ &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
+}
+
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d1, d3, d5, d7;
+ __m128i q1q0, p1p0;
+ __m128i p1, q1;
+ __m128i d0d1, d2d3, d4d5, d6d7;
+
+ x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
+
+ transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+ &d6d7);
+
+ d1 = _mm_srli_si128(d0d1, 8);
+ d3 = _mm_srli_si128(d2d3, 8);
+ d5 = _mm_srli_si128(d4d5, 8);
+ d7 = _mm_srli_si128(d6d7, 8);
+
+ lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
+ &q1q0, &p1p0, &blimit, &limit, &thresh);
+
+ p1 = _mm_srli_si128(p1p0, 8);
+ q1 = _mm_srli_si128(q1q0, 8);
+
+ transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
+ &d2d3, &d4d5, &d6d7);
+
+ _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
+ _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
+ _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
+ _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
+ _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
+}
+
+void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i x6, x5, x4, x3;
+ __m128i pq0, pq1, pq2, pq3;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+
+ transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
+ &q5p5, &q6p6, &q7p7);
+
+ lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+ &limit, &thresh);
+
+ transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+ &q0p0, &pq0, &pq1, &pq2, &pq3);
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
+}
+
+void aom_lpf_vertical_14_dual_sse2(
+ unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i x7, x6, x5, x4, x3, x2, x1, x0;
+ __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
+ __m128i q0, q1, q2, q3, q7;
+ __m128i p0p1, p2p3, p4p5, p6p7;
+
+ __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ __m128i thresh =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+
+ x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+ x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
+ x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
+
+ transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
+ &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
+
+ q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
+ q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
+ q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
+ q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
+ q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
+ q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
+ q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
+ q7 = _mm_srli_si128(d14d15, 8);
+
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
+
+ x0 = _mm_srli_si128(q0p0, 8);
+ x1 = _mm_srli_si128(q1p1, 8);
+ x2 = _mm_srli_si128(q2p2, 8);
+ x3 = _mm_srli_si128(q3p3, 8);
+ x4 = _mm_srli_si128(q4p4, 8);
+ x5 = _mm_srli_si128(q5p5, 8);
+ x6 = _mm_srli_si128(q6p6, 8);
+
+ transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+ &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
+ &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
+ _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
+ _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
+ _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
+}
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
new file mode 100644
index 000000000..8970fe7dd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5) {
+ __m128i w0, w1, w2, w3, w4, w5, ww0;
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+ w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51
+ *d1 = _mm_unpackhi_epi64(ww0,
+ _mm_slli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ *d2 = _mm_unpacklo_epi64(ww0,
+ _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx
+
+ w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx
+ w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx
+ w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx
+
+ *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53
+
+ ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35
+ *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55
+ *d5 = _mm_unpackhi_epi64(ww0,
+ _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i w0, w1, ww0, ww1;
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+
+ *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx
+ *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx
+ *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx
+ *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, ww2, ww3;
+ __m128i zero = _mm_setzero_si128();
+
+ w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17
+ w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37
+
+ ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+
+ *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx
+ *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx
+ *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx
+ *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx
+}
+
+// Note: the input (x) and output (d) pointers must not alias, because the
+// input values are not preserved.
+static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // output
+ // 00 10 20 30 xx xx xx xx
+ // 01 11 21 31 xx xx xx xx
+ // 02 12 22 32 xx xx xx xx
+ // 03 13 23 33 xx xx xx xx
+ // 04 14 24 34 xx xx xx xx
+ // 05 15 25 35 xx xx xx xx
+ // 06 16 26 36 xx xx xx xx
+ // 07 17 27 37 xx xx xx xx
+ highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
+ highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
+}
+
+static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ __m128i w0, w1, w2, w3, ww0, ww1;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+ w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53
+ w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
+
+ *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
+ *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
+
+ *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
+ *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, w2, w3, ww0, ww1;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17
+ w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57
+ w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
+
+ *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
+ *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+ ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
+
+ *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
+ *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
+}
+
+// Note: the input (x) and output (d) pointers must not alias, because the
+// input values are not preserved.
+static INLINE void highbd_transpose8x8_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
+ highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
+}
+
+// Note: the input (x) and output (d) arrays must not alias, because the
+// input values are not preserved.
+static INLINE void highbd_transpose8x16_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+ d5, d6, d7);
+ highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+ x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+ d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+}
+
+#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
new file mode 100644
index 000000000..584b5e7e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+
+static INLINE unsigned int masked_sad32xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
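+ // The mask m and its complement (mask_max - m) sum to 64
+ // (1 << AOM_BLEND_A64_ROUND_BITS), so each maddubs product below needs a
+ // rounded right shift by AOM_BLEND_A64_ROUND_BITS; multiplying by
+ // 'round_scale' with _mm256_mulhrs_epi16 performs exactly that shift.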
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 32) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have four 32-bit partial SADs in lanes 0, 2, 4 and 6
+ // of 'res'.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return (sad + 31) >> 6;
+}
+
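+// Concatenate two unaligned 128-bit loads into a single 256-bit register,
+// with 'lo' in the low half and 'hi' in the high half.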
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+ __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
+ __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
+ __m256i a = _mm256_castsi128_si256(a0);
+ return _mm256_inserti128_si256(a, a1, 1);
+}
+
+static INLINE unsigned int masked_sad16xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+ // Calculate 32 predicted pixels (two rows of 16).
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+ // At this point, we have four 32-bit partial SADs in lanes 0, 2, 4 and 6
+ // of 'res'.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return (sad + 31) >> 6;
+}
+
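+// Dispatch on block width: 4- and 8-wide blocks fall back to the SSSE3
+// kernels, 16-wide blocks use the two-row AVX2 kernel, and anything wider
+// uses the 32-pixels-per-iteration kernel. When 'invert_mask' is set, the
+// roles of 'ref' and 'second_pred' are swapped so the mask weights apply to
+// the other operand.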
+static INLINE unsigned int aom_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+
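+// Instantiate the exported aom_masked_sad<m>x<n>_avx2() entry points; each
+// one simply forwards to the shared dispatcher above.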
+#define MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, invert_mask, m, n); \
+ }
+
+MASKSADMXN_AVX2(4, 4)
+MASKSADMXN_AVX2(4, 8)
+MASKSADMXN_AVX2(8, 4)
+MASKSADMXN_AVX2(8, 8)
+MASKSADMXN_AVX2(8, 16)
+MASKSADMXN_AVX2(16, 8)
+MASKSADMXN_AVX2(16, 16)
+MASKSADMXN_AVX2(16, 32)
+MASKSADMXN_AVX2(32, 16)
+MASKSADMXN_AVX2(32, 32)
+MASKSADMXN_AVX2(32, 64)
+MASKSADMXN_AVX2(64, 32)
+MASKSADMXN_AVX2(64, 64)
+MASKSADMXN_AVX2(64, 128)
+MASKSADMXN_AVX2(128, 64)
+MASKSADMXN_AVX2(128, 128)
+MASKSADMXN_AVX2(4, 16)
+MASKSADMXN_AVX2(16, 4)
+MASKSADMXN_AVX2(8, 32)
+MASKSADMXN_AVX2(32, 8)
+MASKSADMXN_AVX2(16, 64)
+MASKSADMXN_AVX2(64, 16)
+
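+// High bit depth, 8-wide masked SAD: two rows are processed per iteration so
+// that each 256-bit register holds sixteen 16-bit pixels.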
+static INLINE unsigned int highbd_masked_sad8xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ // Zero-extend mask to 16 bits
+ const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(m_ptr)),
+ _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // a 16-element SAD. We do this by accumulating eight 32-bit partial
+ // SADs and reducing them at the end.
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+ // At this point, we have eight 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return (sad + 31) >> 6;
+}
+
+static INLINE unsigned int highbd_masked_sad16xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ // Zero-extend mask to 16 bits
+ const __m256i m =
+ _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // a 16-element SAD. We do this by accumulating eight 32-bit partial
+ // SADs and reducing them at the end.
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have eight 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return (sad + 31) >> 6;
+}
+
+static INLINE unsigned int aom_highbd_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+
+#define HIGHBD_MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
+ second_pred8, msk, msk_stride, \
+ invert_mask, m, n); \
+ }
+
+HIGHBD_MASKSADMXN_AVX2(4, 4);
+HIGHBD_MASKSADMXN_AVX2(4, 8);
+HIGHBD_MASKSADMXN_AVX2(8, 4);
+HIGHBD_MASKSADMXN_AVX2(8, 8);
+HIGHBD_MASKSADMXN_AVX2(8, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 8);
+HIGHBD_MASKSADMXN_AVX2(16, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 16);
+HIGHBD_MASKSADMXN_AVX2(32, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 32);
+HIGHBD_MASKSADMXN_AVX2(64, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 128);
+HIGHBD_MASKSADMXN_AVX2(128, 64);
+HIGHBD_MASKSADMXN_AVX2(128, 128);
+HIGHBD_MASKSADMXN_AVX2(4, 16);
+HIGHBD_MASKSADMXN_AVX2(16, 4);
+HIGHBD_MASKSADMXN_AVX2(8, 32);
+HIGHBD_MASKSADMXN_AVX2(32, 8);
+HIGHBD_MASKSADMXN_AVX2(16, 64);
+HIGHBD_MASKSADMXN_AVX2(64, 16);
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 000000000..493f9bd8f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
+
+// For width a multiple of 16
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height);
+
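+// Wrappers for the exported SSSE3 entry points: widths of 16 and above use
+// the generic masked_sad_ssse3 kernel, while 8- and 4-wide blocks use the
+// dedicated helpers; 'invert_mask' swaps 'ref' and 'second_pred'.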
+#define MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ m, msk, msk_stride, m, n); \
+ else \
+ return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+#define MASKSAD8XN_SSSE3(n) \
+ unsigned int aom_masked_sad8x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \
+ second_pred, 8, msk, msk_stride, n); \
+ else \
+ return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \
+ ref_stride, msk, msk_stride, n); \
+ }
+
+#define MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_masked_sad4x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \
+ second_pred, 4, msk, msk_stride, n); \
+ else \
+ return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \
+ ref_stride, msk, msk_stride, n); \
+ }
+
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int x, y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m128i data_l = _mm_unpacklo_epi8(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi8(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
+ int32_t sad =
+ _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
+ return (sad + 31) >> 6;
+}
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
+ const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
+ const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
+ const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
+ const __m128i m =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
+ _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
+ const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
+ const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ int32_t sad =
+ _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
+ return (sad + 31) >> 6;
+}
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y += 2) {
+ // Load two rows at a time; this seems to be a bit faster
+ // than loading four rows at a time in this case.
+ const __m128i src = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
+ _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
+ const __m128i a =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
+ _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
+ _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
+ const __m128i m =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
+ _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ const __m128i data = _mm_unpacklo_epi8(a, b);
+ const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
+ pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ // At this point, the SAD is stored in lane 0 of 'res'
+ int32_t sad = _mm_cvtsi128_si32(res);
+ return (sad + 31) >> 6;
+}
+
+// For width a multiple of 8
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \
+ second_pred8, m, msk, msk_stride, m, n); \
+ else \
+ return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \
+ ref_stride, second_pred8, 4, msk, \
+ msk_stride, n); \
+ else \
+ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
+ ref8, ref_stride, msk, msk_stride, \
+ n); \
+ }
+
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+HIGHBD_MASKSAD4XN_SSSE3(16)
+HIGHBD_MASKSADMXN_SSSE3(16, 4)
+HIGHBD_MASKSADMXN_SSSE3(8, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 8)
+HIGHBD_MASKSADMXN_SSSE3(16, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 16)
+
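+// High bit depth kernel for widths that are a multiple of 8: pixels are
+// 16 bits wide, so the blend uses _mm_madd_epi16 with interleaved
+// (m, 64 - m) weights plus an explicit rounded shift, and the SAD is
+// accumulated with _mm_madd_epi16 against a vector of ones.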
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int x, y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 8) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ // Zero-extend mask to 16 bits
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
+ // and accumulating them at the end
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
+ res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have four 32-bit partial SADs stored in 'res'.
+ res = _mm_hadd_epi32(res, res);
+ res = _mm_hadd_epi32(res, res);
+ int sad = _mm_cvtsi128_si32(res);
+ return (sad + 31) >> 6;
+}
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
+ _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
+ _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
+ // Zero-extend mask to 16 bits
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
+ _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
+ _mm_setzero_si128());
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
+ res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ res = _mm_hadd_epi32(res, res);
+ res = _mm_hadd_epi32(res, res);
+ int sad = _mm_cvtsi128_si32(res);
+ return (sad + 31) >> 6;
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
new file mode 100644
index 000000000..cffbd9672
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 000000000..d7dbefd7d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,1064 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+// For width a multiple of 16
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int w, int h);
+
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h);
+
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h);
+
+// For width a multiple of 16
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width,
+ int height, unsigned int *sse, int *sum_);
+
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_);
+
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_);
+
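+// Each variance wrapper first builds a bilinearly filtered copy of 'src' in
+// a (H + 1) * W temporary buffer, blends it with 'second_pred' under the
+// mask, and returns the variance of that prediction against 'ref',
+// i.e. sse - sum^2 / (W * H).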
+#define MASK_SUBPIX_VAR_SSSE3(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * W]; \
+ \
+ bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, sse, &sum); \
+ else \
+ masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define MASK_SUBPIX_VAR8XH_SSSE3(H) \
+ unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * 8]; \
+ \
+ bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
+ H, sse, &sum); \
+ else \
+ masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
+ H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \
+ }
+
+#define MASK_SUBPIX_VAR4XH_SSSE3(H) \
+ unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * 4]; \
+ \
+ bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
+ H, sse, &sum); \
+ else \
+ masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
+ H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
+ }
+
+MASK_SUBPIX_VAR_SSSE3(128, 128)
+MASK_SUBPIX_VAR_SSSE3(128, 64)
+MASK_SUBPIX_VAR_SSSE3(64, 128)
+MASK_SUBPIX_VAR_SSSE3(64, 64)
+MASK_SUBPIX_VAR_SSSE3(64, 32)
+MASK_SUBPIX_VAR_SSSE3(32, 64)
+MASK_SUBPIX_VAR_SSSE3(32, 32)
+MASK_SUBPIX_VAR_SSSE3(32, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 32)
+MASK_SUBPIX_VAR_SSSE3(16, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 8)
+MASK_SUBPIX_VAR8XH_SSSE3(16)
+MASK_SUBPIX_VAR8XH_SSSE3(8)
+MASK_SUBPIX_VAR8XH_SSSE3(4)
+MASK_SUBPIX_VAR4XH_SSSE3(8)
+MASK_SUBPIX_VAR4XH_SSSE3(4)
+MASK_SUBPIX_VAR4XH_SSSE3(16)
+MASK_SUBPIX_VAR_SSSE3(16, 4)
+MASK_SUBPIX_VAR8XH_SSSE3(32)
+MASK_SUBPIX_VAR_SSSE3(32, 8)
+MASK_SUBPIX_VAR_SSSE3(64, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 64)
+
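+// Apply a 2-tap bilinear filter to 16 pixels: interleave the two inputs,
+// multiply-accumulate against the packed filter taps, round by FILTER_BITS
+// and pack the result back down to 8 bits.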
+static INLINE __m128i filter_block(const __m128i a, const __m128i b,
+ const __m128i filter) {
+ __m128i v0 = _mm_unpacklo_epi8(a, b);
+ v0 = _mm_maddubs_epi16(v0, filter);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpackhi_epi8(a, b);
+ v1 = _mm_maddubs_epi16(v1, filter);
+ v1 = xx_roundn_epu16(v1, FILTER_BITS);
+
+ return _mm_packus_epi16(v0, v1);
+}
+
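+// Two-pass separable bilinear interpolation for widths that are a multiple
+// of 16: the horizontal pass writes (h + 1) intermediate rows into 'dst',
+// then the vertical pass filters them in place. Offsets of 0 and 4
+// (half-pel) are special-cased as a plain copy and _mm_avg_epu8.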
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int w, int h) {
+ int i, j;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ _mm_storeu_si128((__m128i *)&b[j], x);
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
+ __m128i z = _mm_alignr_epi8(y, x, 1);
+ _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
+ const __m128i z = _mm_alignr_epi8(y, x, 1);
+ const __m128i res = filter_block(x, z, hfilter_vec);
+ _mm_storeu_si128((__m128i *)&b[j], res);
+ }
+
+ src += src_stride;
+ b += w;
+ }
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
+ }
+ dst += w;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ const __m128i res = filter_block(x, y, vfilter_vec);
+ _mm_storeu_si128((__m128i *)&dst[j], res);
+ }
+
+ dst += w;
+ }
+ }
+}
+
+static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0,
+ const __m128i a1, const __m128i b1,
+ const __m128i filter) {
+ __m128i v0 = _mm_unpacklo_epi8(a0, b0);
+ v0 = _mm_maddubs_epi16(v0, filter);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpacklo_epi8(a1, b1);
+ v1 = _mm_maddubs_epi16(v1, filter);
+ v1 = xx_roundn_epu16(v1, FILTER_BITS);
+
+ return _mm_packus_epi16(v0, v1);
+}
+
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ _mm_storel_epi64((__m128i *)b, x);
+ src += src_stride;
+ b += 8;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 1);
+ _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
+ src += src_stride;
+ b += 8;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 1);
+ const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 2;
+ b += 16;
+ }
+ // Handle i = h separately
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+
+ __m128i v0 = _mm_unpacklo_epi8(x0, z0);
+ v0 = _mm_maddubs_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
+ dst += 8;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
+ const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
+ const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 16;
+ }
+ }
+}
+
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = xx_loadl_32((__m128i *)src);
+ xx_storel_32((__m128i *)b, x);
+ src += src_stride;
+ b += 4;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 1);
+ xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z));
+ src += src_stride;
+ b += 4;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h; i += 4) {
+ const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+ const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 1);
+ const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
+ const __m128i z2 = _mm_srli_si128(x2, 1);
+ const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
+ const __m128i z3 = _mm_srli_si128(x3, 1);
+
+ const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
+ const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
+ const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
+ const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
+ const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 4;
+ b += 16;
+ }
+ // Handle i = h separately
+ const __m128i x = _mm_loadl_epi64((__m128i *)src);
+ const __m128i z = _mm_srli_si128(x, 1);
+
+ __m128i v0 = _mm_unpacklo_epi8(x, z);
+ v0 = _mm_maddubs_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = xx_loadl_32((__m128i *)dst);
+ __m128i y = xx_loadl_32((__m128i *)&dst[4]);
+ xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y));
+ dst += 4;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; i += 4) {
+ const __m128i a = xx_loadl_32((__m128i *)dst);
+ const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
+ const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
+ const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
+ const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
+
+ const __m128i a0 = _mm_unpacklo_epi32(a, b);
+ const __m128i b0 = _mm_unpacklo_epi32(b, c);
+ const __m128i a1 = _mm_unpacklo_epi32(c, d);
+ const __m128i b1 = _mm_unpacklo_epi32(d, e);
+ const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 16;
+ }
+ }
+}
+
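+// Blend 16 pixels from 'a' and 'b' using mask 'm', then accumulate the
+// differences against 'src' into running 32-bit partial sums and sums of
+// squares.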
+static INLINE void accumulate_block(const __m128i src, const __m128i a,
+ const __m128i b, const __m128i m,
+ __m128i *sum, __m128i *sum_sq) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m128i data_l = _mm_unpacklo_epi8(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi8(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi8(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi8(src, zero);
+ const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
+ const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ *sum =
+ _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
+ *sum_sq =
+ _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
+ _mm_madd_epi16(diff_r, diff_r)));
+}
+
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width,
+ int height, unsigned int *sse, int *sum_) {
+ int x, y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+ accumulate_block(src, a, b, m, &sum, &sum_sq);
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_) {
+ int y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 2) {
+ __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
+ _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
+ accumulate_block(src, a, b, m, &sum, &sum_sq);
+
+ src_ptr += src_stride * 2;
+ a_ptr += 16;
+ b_ptr += 16;
+ m_ptr += m_stride * 2;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_) {
+ int y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 4) {
+ // Load four rows at a time
+ __m128i src =
+ _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
+ *(uint32_t *)&src_ptr[src_stride * 2],
+ *(uint32_t *)&src_ptr[src_stride * 3]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m = _mm_setr_epi32(
+ *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
+ *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
+ accumulate_block(src, a, b, m, &sum, &sum_sq);
+
+ src_ptr += src_stride * 4;
+ a_ptr += 16;
+ b_ptr += 16;
+ m_ptr += m_stride * 4;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+// For width a multiple of 8
+static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int w, int h);
+
+static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int h);
+
+// For width a multiple of 8
+static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr, int a_stride,
+ const uint16_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, uint64_t *sse,
+ int *sum_);
+
+static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr,
+ const uint16_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int *sse, int *sum_);
+
+#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)sse64; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
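+
+// Editor's note (worked formula, not part of the upstream patch): the 10- and
+// 12-bit wrappers above rescale the raw accumulators back to an 8-bit scale
+// before forming the variance, conceptually (with bd the bit depth):
+//   sse_8 = ROUND_POWER_OF_TWO(sse64, 2 * (bd - 8));
+//   sum_8 = ROUND_POWER_OF_TWO(sum, bd - 8);
+//   var   = sse_8 - sum_8 * sum_8 / (W * H);   // clamped at 0
+// e.g. bd == 12 gives shifts of 8 and 4, matching the macro bodies.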
+
+#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)sse_; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
+
+static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
+ const __m128i filter) {
+ __m128i v0 = _mm_unpacklo_epi16(a, b);
+ v0 = _mm_madd_epi16(v0, filter);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpackhi_epi16(a, b);
+ v1 = _mm_madd_epi16(v1, filter);
+ v1 = xx_roundn_epu32(v1, FILTER_BITS);
+
+ return _mm_packs_epi32(v0, v1);
+}
+
+static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int w, int h) {
+ int i, j;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ _mm_storeu_si128((__m128i *)&b[j], x);
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else if (xoffset == 4) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
+ __m128i z = _mm_alignr_epi8(y, x, 2);
+ _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else {
+ uint16_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
+ const __m128i z = _mm_alignr_epi8(y, x, 2);
+ const __m128i res = highbd_filter_block(x, z, hfilter_vec);
+ _mm_storeu_si128((__m128i *)&b[j], res);
+ }
+
+ src += src_stride;
+ b += w;
+ }
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
+ }
+ dst += w;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ const __m128i res = highbd_filter_block(x, y, vfilter_vec);
+ _mm_storeu_si128((__m128i *)&dst[j], res);
+ }
+
+ dst += w;
+ }
+ }
+}
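+
+// Editor's note (not part of the upstream patch): per output sample the
+// two-tap filter above computes, with f = bilinear_filters_2t[offset] and
+// FILTER_BITS == 7 (so f[0] + f[1] == 128):
+//   out[i] = (f[0] * in[i] + f[1] * in[i + step] + 64) >> 7;
+// offset == 0 degenerates to a plain copy and offset == 4 to a simple
+// average, which is why those cases skip the multiply entirely.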
+
+static INLINE __m128i highbd_filter_block_2rows(const __m128i a0,
+ const __m128i b0,
+ const __m128i a1,
+ const __m128i b1,
+ const __m128i filter) {
+ __m128i v0 = _mm_unpacklo_epi16(a0, b0);
+ v0 = _mm_madd_epi16(v0, filter);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpacklo_epi16(a1, b1);
+ v1 = _mm_madd_epi16(v1, filter);
+ v1 = xx_roundn_epu32(v1, FILTER_BITS);
+
+ return _mm_packs_epi32(v0, v1);
+}
+
+static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ _mm_storel_epi64((__m128i *)b, x);
+ src += src_stride;
+ b += 4;
+ }
+ } else if (xoffset == 4) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 2);
+ _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
+ src += src_stride;
+ b += 4;
+ }
+ } else {
+ uint16_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 2);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 2);
+ const __m128i res =
+ highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 2;
+ b += 8;
+ }
+ // Process i = h separately
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 2);
+
+ __m128i v0 = _mm_unpacklo_epi16(x, z);
+ v0 = _mm_madd_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
+ dst += 4;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
+ const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
+ const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 8;
+ }
+ }
+}
+
+static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr, int a_stride,
+ const uint16_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, uint64_t *sse,
+ int *sum_) {
+ int x, y;
+ // Note on bit widths:
+ // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
+ // so this can be kept as four 32-bit values.
+ // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
+ // so this must be stored as two 64-bit values.
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 8) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ // Calculate 8 predicted pixels.
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi16(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi16(src, zero);
+ __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
+ __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
+ // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
+ // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
+ // So we can re-pack into 16-bit fields and use _mm_madd_epi16
+ // to calculate the squares and partially sum them.
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
+ const __m128i prod = _mm_madd_epi16(tmp, tmp);
+ // Then we want to sign-extend to 64 bits and accumulate
+ const __m128i sign = _mm_srai_epi32(prod, 31);
+ const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
+ const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
+ sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, zero);
+ sum = _mm_hadd_epi32(sum, zero);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
+ _mm_storel_epi64((__m128i *)sse, sum_sq);
+}
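+
+// Editor's note (not part of the upstream patch): the unpack-with-sign step
+// above is the usual SSE2 idiom for widening signed 32-bit lanes to 64 bits
+// before a 64-bit add:
+//   sign = _mm_srai_epi32(prod, 31);        // 0 or -1 per lane
+//   lo64 = _mm_unpacklo_epi32(prod, sign);  // lanes 0,1 as int64
+//   hi64 = _mm_unpackhi_epi32(prod, sign);  // lanes 2,3 as int64
+// Here 'prod' is a sum of two squares and so never negative, but the
+// sign-extended form keeps the idiom generally correct.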
+
+static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr,
+ const uint16_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int *sse, int *sum_) {
+ int y;
+  // Note: For this function, h <= 16 (h == 16 arises only from the 4:1
+  // partition, i.e. the 4x16 block size).
+ // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
+ // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
+ // So we can safely pack sum_sq into 32-bit fields, which is slightly more
+ // convenient.
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 2) {
+ __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
+ _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
+ zero);
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi16(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi16(src, zero);
+ __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
+ __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
+ const __m128i prod = _mm_madd_epi16(tmp, tmp);
+ sum_sq = _mm_add_epi32(sum_sq, prod);
+
+ src_ptr += src_stride * 2;
+ a_ptr += 8;
+ b_ptr += 8;
+ m_ptr += m_stride * 2;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, zero);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ assert(height % 2 == 0);
+ int i = 0;
+ if (width == 8) {
+ comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+ mask, mask_stride);
+ } else if (width == 16) {
+ do {
+ comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+ comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+ mask + mask_stride, comp_pred + width);
+ comp_pred += (width << 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ i += 2;
+ } while (i < height);
+ } else { // width == 32
+ assert(width == 32);
+ do {
+ comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+ comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
+ comp_pred += (width);
+ src0 += (stride0);
+ src1 += (stride1);
+ mask += (mask_stride);
+ i += 1;
+ } while (i < height);
+ }
+}
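+
+// Editor's note (scalar model, not part of the upstream libaom patch): the
+// SSSE3 kernels above all implement the same per-pixel blend; a reference
+// sketch, assuming mask values in [0, 64] and comp_pred/pred strides equal to
+// 'width':
+static INLINE void comp_mask_pred_scalar_sketch(
+    uint8_t *comp_pred, const uint8_t *pred, int width, int height,
+    const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride,
+    int invert_mask) {
+  int x, y;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      const int m = invert_mask ? 64 - mask[x] : mask[x];
+      comp_pred[x] = (uint8_t)((m * ref[x] + (64 - m) * pred[x] + 32) >> 6);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+    mask += mask_stride;
+  }
+}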
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 000000000..4faa098ac
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+ const uint8_t *src1,
+ const uint8_t *mask, uint8_t *dst) {
+ const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i round_offset =
+ _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
+ const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
+ const __m128i aA = _mm_load_si128((const __m128i *)(mask));
+
+ const __m128i maA = _mm_sub_epi8(alpha_max, aA);
+
+ const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
+ const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
+ const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
+ const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
+
+ const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
+ const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
+
+ const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
+ const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
+ _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
+}
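+
+// Editor's note (not part of the upstream patch): the _mm_mulhrs_epi16 step is
+// a rounding shift in disguise. With round_offset == 1 << (15 - 6) == 512,
+//   mulhrs(x, 512) = (2 * x * 512 + (1 << 15)) >> 16 = (x + 32) >> 6,
+// i.e. ROUND_POWER_OF_TWO(x, AOM_BLEND_A64_ROUND_BITS) for the non-negative
+// 16-bit blend sums produced by _mm_maddubs_epi16 above.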
+
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ const uint8_t *mask,
+ int mask_stride) {
+ int i = 0;
+ const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i round_offset =
+ _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ // odd line A
+ const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+ const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+ const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+ // even line B
+ const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+ const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+ const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+ _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+
+ const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+ const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+ const __m128i ma = _mm_sub_epi8(alpha_max, a);
+ const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+ const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+ const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+ const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+ const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+ const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+ const __m128i round = _mm_packus_epi16(roundA, roundB);
+ // comp_pred's stride == width == 8
+ _mm_store_si128((__m128i *)(comp_pred), round);
+ comp_pred += (8 << 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ i += 2;
+ } while (i < height);
+}
+
+#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 000000000..6c821673e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+ return _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+ const int byte_stride) {
+ return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
+ *(const int32_t *)((int8_t *)src + 1 * byte_stride),
+ *(const int32_t *)((int8_t *)src + 2 * byte_stride),
+ *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+ const int byte_stride) {
+ __m128i dst;
+ dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+ dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
+ return dst;
+}
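+
+// Editor's note (usage sketch, not part of the upstream patch): these helpers
+// gather several short rows into one register so that narrow blocks can still
+// be processed 16 bytes at a time, e.g. for an 8-bit 4-wide block:
+//   const __m128i four_rows = load_8bit_4x4_to_1_reg_sse2(src, src_stride);
+// 'byte_stride' is a stride in bytes, which for 8-bit data is simply the
+// pixel stride.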
+
+#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 000000000..5181e444c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int h) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+ const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
+ const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
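+
+// Editor's note (scalar model, not part of the upstream patch): per pixel the
+// OBMC variance kernels compute
+//   diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[i] - mask[i] * pre[i], 12);
+//   sum += diff;  sse += diff * diff;
+// where 'wsrc' is the pre-weighted source and 'mask' the per-pixel OBMC
+// weight, both scaled so that a full weight is 1 << 12 (hence the 12-bit
+// rounding shift).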
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
new file mode 100644
index 000000000..48486c6c4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
+ v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+ return _mm_cvtsi128_si64(v_q);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_q);
+ return tmp;
+ }
+#endif
+}
+
+static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
+ const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+ const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+ const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+ return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+ const __m128i v_tmp_d =
+ _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
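+
+// Editor's note (not part of the upstream patch): adding the sign term makes
+// the arithmetic shift round ties away from zero, matching
+// ROUND_POWER_OF_TWO_SIGNED. For bits == 12:
+//   v =  2048: ( 2048 + 2048 + 0) >> 12 =  1
+//   v = -2048: (-2048 + 2048 - 1) >> 12 = -1
+// whereas a plain bias-and-shift would map -2048 to 0.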
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
new file mode 100644
index 000000000..2aa2a0555
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+ do {
+ const __m128i v_p_b_0 = xx_loadl_32(pre);
+ const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
+ const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+ pre += pre_stride << 1;
+ } while (n < 8 * (height >> 1));
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
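+
+// Editor's note (not part of the upstream patch): the bias-add followed by the
+// logical shift above is ROUND_POWER_OF_TWO(|wsrc - pre * mask|, 12) applied
+// to already non-negative values, i.e. each pixel contributes
+//   (|wsrc[i] - mask[i] * pre[i]| + 2048) >> 12
+// to the SAD, mirroring the xx_roundn_epu32() path in the SSE4.1 version.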
+
+static INLINE unsigned int obmc_sad_w8n_avx2(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_b = xx_loadl_64(pre + n);
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if ((n & (width - 1)) == 0) pre += pre_step;
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define OBMCSADWXH(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { \
+ return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \
+ } else { \
+ return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ do {
+ const __m128i v_p_w_0 = xx_loadl_64(pre);
+ const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
+ const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+
+ pre += pre_stride << 1;
+ } while (n < 8 * (height >> 1));
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define HBD_OBMCSADWXH(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { \
+ return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \
+ } else { \
+ return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
new file mode 100644
index 000000000..0338a8c77
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_b = xx_loadl_32(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define OBMCSADWXH(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { \
+ return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
+ } else { \
+ return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define HBD_OBMCSADWXH(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { \
+ return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \
+ } else { \
+ return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
new file mode 100644
index 000000000..bfec0e8a8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m128i v_d;
+ const uint8_t *pre_temp;
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
+ const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+
+ const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_tmp_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
+ const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
+ const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
+
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 8;
+ n += 8;
+ width -= 8;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+ v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ *sum = _mm_cvtsi128_si32(v_d);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
+}
+
+static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m256i v_d;
+ __m128i res0;
+ const uint8_t *pre_temp;
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+
+ assert(w >= 16);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
+ const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_m1_d =
+ _mm256_loadu_si256((__m256i const *)(mask + n + 8));
+ const __m256i v_w1_d =
+ _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+ const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
+
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+ const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
+
+ const __m256i v_tmp0_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
+ const __m256i v_tmp1_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
+
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
+ const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
+
+ const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 16;
+ n += 16;
+ width -= 16;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+
+ v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm256_hadd_epi32(v_d, v_d);
+ res0 = _mm256_castsi256_si128(v_d);
+ res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
+ *sum = _mm_cvtsi128_si32(res0);
+ *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
+}
+
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else if (W == 8) { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } else { \
+ obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
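+
+// Editor's note (usage sketch, not part of the upstream patch): each
+// OBMCVARWXH(W, H) expansion follows the usual variance contract, e.g.
+//   unsigned int sse;
+//   const unsigned int var =
+//       aom_obmc_variance32x32_avx2(pre, pre_stride, wsrc, mask, &sse);
+// returning sse - sum * sum / (W * H) with sum and sse taken over the rounded
+// OBMC residuals.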
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 000000000..72eda0e57
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
+
+#include "config/aom_dsp_rtcd.h"
+
+#define OBMC_SUBPIX_VAR(W, H) \
+ uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \
+ }
+
+OBMC_SUBPIX_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 64)
+OBMC_SUBPIX_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 32)
+OBMC_SUBPIX_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 16)
+OBMC_SUBPIX_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 8)
+OBMC_SUBPIX_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 4)
+OBMC_SUBPIX_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_SUBPIX_VAR(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void hbd_obmc_variance_w4(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+static INLINE void hbd_obmc_variance_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
+ const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum += xx_hsum_epi32_si64(v_sum_d);
+ *sse += xx_hsum_epi32_si64(v_sse_d);
+}
+
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ }
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else if (w < 128 || h < 128) {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ } else {
+ assert(w == 128 && h == 128);
+
+ do {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+ 64);
+ pre8 += 64 * pre_stride;
+ wsrc += 64 * w;
+ mask += 64 * w;
+ h -= 64;
+ } while (h > 0);
+ }
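+  // Normalize to an 8-bit scale: 10-bit residuals are 4x larger, so the sum
+  // drops 2 bits and the sum of squares drops 4 bits.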
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ int max_pel_allowed_per_ovf = 512;
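+  // The SIMD kernel accumulates in 32-bit lanes; with 12-bit input, each call
+  // is limited to 512 pixels and the partial sums are gathered in 64-bit
+  // scalars between calls to avoid overflow.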
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else if (w * h <= max_pel_allowed_per_ovf) {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ } else {
+ int h_per_ovf = max_pel_allowed_per_ovf / w;
+
+ assert(max_pel_allowed_per_ovf % w == 0);
+ do {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+ h_per_ovf);
+ pre8 += h_per_ovf * pre_stride;
+ wsrc += h_per_ovf * w;
+ mask += h_per_ovf * w;
+ h -= h_per_ovf;
+ } while (h > 0);
+ }
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HBD_OBMCVARWXH(W, H) \
+ unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+HBD_OBMCVARWXH(128, 128)
+HBD_OBMCVARWXH(128, 64)
+HBD_OBMCVARWXH(64, 128)
+HBD_OBMCVARWXH(64, 64)
+HBD_OBMCVARWXH(64, 32)
+HBD_OBMCVARWXH(32, 64)
+HBD_OBMCVARWXH(32, 32)
+HBD_OBMCVARWXH(32, 16)
+HBD_OBMCVARWXH(16, 32)
+HBD_OBMCVARWXH(16, 16)
+HBD_OBMCVARWXH(16, 8)
+HBD_OBMCVARWXH(8, 16)
+HBD_OBMCVARWXH(8, 8)
+HBD_OBMCVARWXH(8, 4)
+HBD_OBMCVARWXH(4, 8)
+HBD_OBMCVARWXH(4, 4)
+HBD_OBMCVARWXH(4, 16)
+HBD_OBMCVARWXH(16, 4)
+HBD_OBMCVARWXH(8, 32)
+HBD_OBMCVARWXH(32, 8)
+HBD_OBMCVARWXH(16, 64)
+HBD_OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
new file mode 100644
index 000000000..216a0bd8f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
@@ -0,0 +1,435 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+
+ vzeroupper
+
+%ifnidn %1, b_32x32
+
+  ; Special case for ncoeff == 16, as it is frequent and lets us skip
+  ; setting up a loop.
+ cmp ncoeffmp, 16
+ jne .generic
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Special case of ncoeff == 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.single:
+
+ movifnidn coeffq, coeffmp
+ movifnidn zbinq, zbinmp
+ mova m0, [zbinq] ; m0 = zbin
+
+ ; Get DC and first 15 AC coeffs - in this special case, that is all.
+ ; coeff stored as 32bit numbers but we process them as 16 bit numbers
+ mova m9, [coeffq]
+ packssdw m9, [coeffq+16] ; m9 = c[i]
+ mova m10, [coeffq+32]
+ packssdw m10, [coeffq+48] ; m10 = c[i]
+
+ mov r0, eobmp ; Output pointer
+ mov r1, qcoeffmp ; Output pointer
+ mov r2, dqcoeffmp ; Output pointer
+
+ pxor m5, m5 ; m5 = dedicated zero
+
+ pcmpeqw m4, m4 ; All word lanes -1
+ paddw m0, m4 ; m0 = zbin - 1
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+ ; Check if all coeffs are less than zbin. If yes, we just write zeros
+ ; to the outputs and we are done.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .single_nonzero
+
+ mova [r1 ], ymm5
+ mova [r1+32], ymm5
+ mova [r2 ], ymm5
+ mova [r2+32], ymm5
+ mov [r0], word 0
+
+ vzeroupper
+ RET
+
+.single_nonzero:
+
+ ; Actual quantization of size 16 block - setup pointers, rounders, etc.
+ movifnidn r3, roundmp
+ movifnidn r4, quantmp
+ mov r6, dequantmp
+ mov r5, shiftmp
+ mova m1, [r3] ; m1 = round
+ mova m2, [r4] ; m2 = quant
+ mova m3, [r6] ; m3 = dequant
+ mova m4, [r5] ; m4 = shift
+
+ mov r3, iscanmp
+
+ DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+ ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [qcoeffq ], m11
+ mova [qcoeffq+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+32], m11
+ mova [qcoeffq+48], m6
+
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+
+  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [dqcoeffq ], m11
+ mova [dqcoeffq+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+32], m11
+ mova [dqcoeffq+48], m6
+
+ mova m6, [iscanq] ; m6 = scan[i]
+ mova m11, [iscanq+16] ; m11 = scan[i]
+
+ pcmpeqw m8, m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m13, m5 ; m13 = c[i] == 0
+ psubw m6, m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m8, m6 ; m8 = max(eob)
+ pandn m13, m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m8, m13
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [eobq], ax
+
+ vzeroupper
+ RET
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Generic case of ncoeff != 16
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.generic:
+
+%endif ; %ifnidn %1, b_32x32
+
+DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
+ qcoeff, dqcoeff, dequant, eob, scan, iscan
+
+ ; Actual quantization loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
+ mova m0, [zbinq] ; m0 = zbin
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+ mova m3, [dequantq] ; m3 = dequant
+ pcmpeqw m4, m4 ; All lanes -1
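+  ; For b_32x32, zbin and round are halved (rounding up) while the quant
+  ; shift is doubled and the dequantized value is later halved, i.e. the
+  ; 32x32 quantizer works at half precision.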
+%ifidn %1, b_32x32
+ psubw m0, m4
+ psubw m1, m4
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ paddw m0, m4 ; m0 = m0 + 1
+
+ mov r2, shiftmp
+ mov r3, qcoeffmp
+ mova m4, [r2] ; m4 = shift
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, b_32x32
+ psllw m4, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
+
+
+ lea coeffq, [ coeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+
+ lea iscanq, [ iscanq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+  ; coeff is stored as 32bit numbers but we need 16bit numbers
+ mova m9, [coeffq+ncoeffq*4+ 0]
+ packssdw m9, [coeffq+ncoeffq*4+16]
+ mova m10, [coeffq+ncoeffq*4+32]
+ packssdw m10, [coeffq+ncoeffq*4+48]
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+ ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .first_nonzero
+
+ mova [qcoeffq+ncoeffq*4 ], ymm5
+ mova [qcoeffq+ncoeffq*4+32], ymm5
+ mova [dqcoeffq+ncoeffq*4 ], ymm5
+ mova [dqcoeffq+ncoeffq*4+32], ymm5
+ add ncoeffq, mmsize
+
+ punpckhqdq m1, m1
+ punpckhqdq m2, m2
+ punpckhqdq m3, m3
+ punpckhqdq m4, m4
+ pxor m8, m8
+
+ jmp .ac_only_loop
+
+.first_nonzero:
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m8
+ punpckhwd m6, m8, m6
+ pmovsxwd m11, m8
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i]
+ mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+
+.ac_only_loop:
+
+ ; pack coeff from 32bit to 16bit array
+ mova m9, [coeffq+ncoeffq*4+ 0]
+ packssdw m9, [coeffq+ncoeffq*4+16]
+ mova m10, [coeffq+ncoeffq*4+32]
+ packssdw m10, [coeffq+ncoeffq*4+48]
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip this iteration
+  ; and just write out zeros as the result.
+ por m14, m7, m12
+ ptest m14, m14
+ jnz .rest_nonzero
+
+ mova [qcoeffq+ncoeffq*4+ 0], ymm5
+ mova [qcoeffq+ncoeffq*4+32], ymm5
+ mova [dqcoeffq+ncoeffq*4+ 0], ymm5
+ mova [dqcoeffq+ncoeffq*4+32], ymm5
+
+ add ncoeffq, mmsize
+ jnz .ac_only_loop
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [r2], ax
+ vzeroupper
+ RET
+
+.rest_nonzero:
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pcmpgtw m6, m5, m14
+ punpckhwd m6, m14, m6
+ pmovsxwd m11, m14
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ pcmpgtw m6, m5, m14
+ punpckhwd m6, m14, m6
+ pmovsxwd m11, m14
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pcmpgtw m6, m5, m13
+ punpckhwd m6, m13, m6
+ pmovsxwd m11, m13
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jnz .ac_only_loop
+
+ ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ movq rax, m8
+ mov [r2], ax
+ vzeroupper
+ RET
+%endmacro
+
+INIT_XMM avx
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
new file mode 100644
index 000000000..d3de6e24d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+ assert(sizeof(tran_low_t) == 4);
+
+ return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+ (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+ (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+ (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
+}
+
+static INLINE void store_coefficients(__m128i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ assert(sizeof(tran_low_t) == 4);
+
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+}
+
+void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan_ptr;
+
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
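+  // (x ^ sign) - sign with sign = x >> 15 negates the negative lanes, so
+  // qcoeff0/1 now hold abs(coeff0/1); the same helper re-applies the saved
+  // sign masks below.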
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 000000000..39d4ca674
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,272 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
+ mova m0, [zbinq] ; m0 = zbin
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, b_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m0, m5
+ paddw m1, m5
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [dequantq] ; m3 = dequant
+ mov r2, shiftmp
+ psubw m0, [GLOBAL(pw_1)]
+ mova m4, [r2] ; m4 = shift
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, b_32x32
+ psllw m4, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
+ lea coeffq, [ coeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+ lea iscanq, [ iscanq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+  ; coeff is stored as 32bit numbers but we need 16bit numbers
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ punpckhqdq m4, m4
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ ; pack coeff from 32bit to 16bit array
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+ or r6, r2
+ jz .skip_iter
+%endif
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pxor m11, m11
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5
+
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+%ifidn %1, b_32x32
+ jmp .accumulate_eob
+.skip_iter:
+ mova [qcoeffq+ncoeffq*4+ 0], m5
+ mova [qcoeffq+ncoeffq*4+16], m5
+ mova [qcoeffq+ncoeffq*4+32], m5
+ mova [qcoeffq+ncoeffq*4+48], m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m5
+ mova [dqcoeffq+ncoeffq*4+16], m5
+ mova [dqcoeffq+ncoeffq*4+32], m5
+ mova [dqcoeffq+ncoeffq*4+48], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
new file mode 100644
index 000000000..4eed7dd29
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom/aom_integer.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+ const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr, __m128i *dequant,
+ const int16_t *shift_ptr, __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// With ssse3 and later abs() and sign() are preferred.
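+// With sign = _mm_srai_epi16(a, 15) (0 in non-negative lanes, -1 in negative
+// ones), (a ^ sign) - sign negates exactly the negative lanes, so the same
+// helper both computes abs() and, applied again with the saved mask, restores
+// the original sign.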
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi16(a, sign);
+}
+
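+// Fixed-point quantization: with t = saturate16(coeff + round), the result is
+// qcoeff = (((t * quant) >> 16) + t) * shift >> 16. pmulhi supplies the high
+// halves of the 16x16-bit products, so no widening is required.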
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+ const __m128i quant, const __m128i shift) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+ return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
+// to zbin to add 1 to the index in 'scan'.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+ const __m128i zbin_mask0,
+ const __m128i zbin_mask1,
+ const int16_t *scan_ptr, const int index,
+ const __m128i zero) {
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+ __m128i eob0, eob1;
+ // Add one to convert from indices to counts
+ scan0 = _mm_sub_epi16(scan0, zbin_mask0);
+ scan1 = _mm_sub_epi16(scan1, zbin_mask1);
+ eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+ return _mm_max_epi16(eob0, eob1);
+}
+
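+// Horizontal max across the eight 16-bit eob lanes: three shuffle/max rounds
+// fold the vector in half, then the low half in quarters, leaving the overall
+// maximum in the low words, from which one is extracted.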
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
new file mode 100644
index 000000000..f662b62b1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ __m256i sum_mlow, sum_mhigh;
+ int i;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+ sum_ref0 = _mm256_set1_epi16(0);
+ sum_ref1 = _mm256_set1_epi16(0);
+ sum_ref2 = _mm256_set1_epi16(0);
+ sum_ref3 = _mm256_set1_epi16(0);
+ for (i = 0; i < 32; i++) {
+ // load src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)src);
+ ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
+ ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
+ ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
+ ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
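+  // Reduce: combine the four 256-bit accumulators into one 128-bit vector
+  // holding { SAD(ref0), SAD(ref1), SAD(ref2), SAD(ref3) } and store it to
+  // res[].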
+ {
+ __m128i sum;
+    // in each 64-bit lane of sum_ref-i the running SAD sits in the low 4
+    // bytes; the high 4 bytes are zero.
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes
+ sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+ sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+ // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+ sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+ sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+ // merge every 64 bit from each sum_ref-i
+ sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+ sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+ // add the low 64 bit to the high 64 bit
+ sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+ // add the low 128 bit to the high 128 bit
+ sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+ _mm256_extractf128_si256(sum_mlow, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+ }
+ _mm256_zeroupper();
+}
+
+void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
+ __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
+ __m256i ref3_reg, ref3next_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ __m256i sum_mlow, sum_mhigh;
+ int i;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+ sum_ref0 = _mm256_set1_epi16(0);
+ sum_ref1 = _mm256_set1_epi16(0);
+ sum_ref2 = _mm256_set1_epi16(0);
+ sum_ref3 = _mm256_set1_epi16(0);
+ for (i = 0; i < 64; i++) {
+ // load 64 bytes from src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)src);
+ srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
+ ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
+ ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
+ ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
+ ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
+ ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
+ ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
+ ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
+ ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+ ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
+ ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
+ ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
+ ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+ {
+ __m128i sum;
+
+    // in each 64-bit lane of sum_ref-i the running SAD sits in the low 4
+    // bytes; the high 4 bytes are zero.
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes
+ sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+ sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+ // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+ sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+ sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+ // merge every 64 bit from each sum_ref-i
+ sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+ sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+ // add the low 64 bit to the high 64 bit
+ sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+ // add the low 128 bit to the high 128 bit
+ sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+ _mm256_extractf128_si256(sum_mlow, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+ }
+ _mm256_zeroupper();
+}
+
+void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ const uint8_t *rf[4];
+ uint32_t sum0[4];
+ uint32_t sum1[4];
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
+ src += src_stride << 5;
+ rf[0] += ref_stride << 5;
+ rf[1] += ref_stride << 5;
+ rf[2] += ref_stride << 5;
+ rf[3] += ref_stride << 5;
+ aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+ res[2] = sum0[2] + sum1[2];
+ res[3] = sum0[3] + sum1[3];
+}
+
+void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ const uint8_t *rf[4];
+ uint32_t sum0[4];
+ uint32_t sum1[4];
+ unsigned int half_width = 32;
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
+ src += half_width;
+ rf[0] += half_width;
+ rf[1] += half_width;
+ rf[2] += half_width;
+ rf[3] += half_width;
+ aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+ res[2] = sum0[2] + sum1[2];
+ res[3] = sum0[3] + sum1[3];
+}
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 000000000..55a856985
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,257 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_4x2x4 5-6 0
+ movd m0, [srcq +%2]
+%if %1 == 1
+ movd m6, [ref1q+%3]
+ movd m4, [ref2q+%3]
+ movd m7, [ref3q+%3]
+ movd m5, [ref4q+%3]
+ movd m1, [srcq +%4]
+ movd m2, [ref1q+%5]
+ punpckldq m0, m1
+ punpckldq m6, m2
+ movd m1, [ref2q+%5]
+ movd m2, [ref3q+%5]
+ movd m3, [ref4q+%5]
+ punpckldq m4, m1
+ punpckldq m7, m2
+ punpckldq m5, m3
+ movlhps m0, m0
+ movlhps m6, m4
+ movlhps m7, m5
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movd m1, [ref1q+%3]
+ movd m5, [ref1q+%5]
+ movd m2, [ref2q+%3]
+ movd m4, [ref2q+%5]
+ punpckldq m1, m5
+ punpckldq m2, m4
+ movd m3, [ref3q+%3]
+ movd m5, [ref3q+%5]
+ punpckldq m3, m5
+ movd m4, [ref4q+%3]
+ movd m5, [ref4q+%5]
+ punpckldq m4, m5
+ movd m5, [srcq +%4]
+ punpckldq m0, m5
+ movlhps m0, m0
+ movlhps m1, m2
+ movlhps m3, m4
+ psadbw m1, m0
+ psadbw m3, m0
+ paddd m6, m1
+ paddd m7, m3
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_8x2x4 5-6 0
+ movh m0, [srcq +%2]
+%if %1 == 1
+ movh m4, [ref1q+%3]
+ movh m5, [ref2q+%3]
+ movh m6, [ref3q+%3]
+ movh m7, [ref4q+%3]
+ movhps m0, [srcq +%4]
+ movhps m4, [ref1q+%5]
+ movhps m5, [ref2q+%5]
+ movhps m6, [ref3q+%5]
+ movhps m7, [ref4q+%5]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movh m1, [ref1q+%3]
+ movh m2, [ref2q+%3]
+ movh m3, [ref3q+%3]
+ movhps m0, [srcq +%4]
+ movhps m1, [ref1q+%5]
+ movhps m2, [ref2q+%5]
+ movhps m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movh m1, [ref4q+%3]
+ movhps m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+%endmacro
+
+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_16x2x4 5-6 0
+ ; 1st 16 px
+ mova m0, [srcq +%2]
+%if %1 == 1
+ movu m4, [ref1q+%3]
+ movu m5, [ref2q+%3]
+ movu m6, [ref3q+%3]
+ movu m7, [ref4q+%3]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movu m1, [ref1q+%3]
+ movu m2, [ref2q+%3]
+ movu m3, [ref3q+%3]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%3]
+ paddd m5, m2
+ paddd m6, m3
+ psadbw m1, m0
+ paddd m7, m1
+%endif
+
+ ; 2nd 16 px
+ mova m0, [srcq +%4]
+ movu m1, [ref1q+%5]
+ movu m2, [ref2q+%5]
+ movu m3, [ref3q+%5]
+ psadbw m1, m0
+ psadbw m2, m0
+ psadbw m3, m0
+ paddd m4, m1
+ movu m1, [ref4q+%5]
+ paddd m5, m2
+ paddd m6, m3
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*2]
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endif
+ psadbw m1, m0
+ paddd m7, m1
+%endmacro
+
+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_32x2x4 5-6 0
+ PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
+ PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6
+%endmacro
+
+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_64x2x4 5-6 0
+ PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
+ PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
+%endmacro
+
+; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_128x2x4 5-6 0
+ PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
+ PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6
+%endmacro
+
+; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; where NxN covers the block sizes instantiated below, from 128x128 down to
+; 4x4, including the rectangular sizes
+%macro SADNXN4D 2
+%if UNIX64
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+ PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+ PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+
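+  ; Combine the per-reference SAD accumulators into res[4]. For widths > 4,
+  ; m4..m7 each hold two qword partial sums for a single reference, which are
+  ; interleaved and added; for width 4 the four results are already paired in
+  ; m6 (ref1/ref2) and m7 (ref3/ref4).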
+%if %1 > 4
+ pslldq m5, 4
+ pslldq m7, 4
+ por m4, m5
+ por m6, m7
+ mova m5, m4
+ mova m7, m6
+ punpcklqdq m4, m6
+ punpckhqdq m5, m7
+ movifnidn r4, r4mp
+ paddd m4, m5
+ movu [r4], m4
+ RET
+%else
+ movifnidn r4, r4mp
+ pshufd m6, m6, 0x08
+ pshufd m7, m7, 0x08
+ movq [r4+0], m6
+ movq [r4+8], m7
+ RET
+%endif
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 128, 128
+SADNXN4D 128, 64
+SADNXN4D 64, 128
+SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
+SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
+SADNXN4D 16, 16
+SADNXN4D 16, 8
+SADNXN4D 8, 16
+SADNXN4D 8, 8
+SADNXN4D 8, 4
+SADNXN4D 4, 8
+SADNXN4D 4, 4
+SADNXN4D 4, 16
+SADNXN4D 16, 4
+SADNXN4D 8, 32
+SADNXN4D 32, 8
+SADNXN4D 16, 64
+SADNXN4D 64, 16
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
new file mode 100644
index 000000000..a50dba64a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_avx2.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+#define FSAD64_H(h) \
+ unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
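+// For 32-wide blocks the loop below handles two rows per iteration (one
+// 32-byte load from each row), so it runs h / 2 times with doubled strides.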
+#define FSAD32_H(h) \
+ unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSAD64 \
+ FSAD64_H(64); \
+ FSAD64_H(32);
+
+#define FSAD32 \
+ FSAD32_H(64); \
+ FSAD32_H(32); \
+ FSAD32_H(16);
+
+/* clang-format off */
+FSAD64
+FSAD32
+/* clang-format on */
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
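+// The _avg variants below compute the SAD against the average of ref and the
+// compound prediction: each ref row is first averaged with the matching
+// second_pred row using _mm256_avg_epu8, then fed through the same SAD and
+// reduction sequence as the plain kernels above.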
+#define FSADAVG64_H(h) \
+ unsigned int aom_sad64x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSADAVG32_H(h) \
+ unsigned int aom_sad32x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i, res; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSADAVG64 \
+ FSADAVG64_H(64); \
+ FSADAVG64_H(32);
+
+#define FSADAVG32 \
+ FSADAVG32_H(64); \
+ FSADAVG32_H(32); \
+ FSADAVG32_H(16);
+
+/* clang-format off */
+FSADAVG64
+FSADAVG32
+/* clang-format on */
+
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
new file mode 100644
index 000000000..b506d4663
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
@@ -0,0 +1,1038 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_ports/mem.h"
+
+// SAD
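+// Horizontally reduces the eight 32-bit lanes of *v to their scalar sum.
+// Scalar reference: sum = lane[0] + lane[1] + ... + lane[7].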
+static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
+  // input: eight 32-bit partial sums
+ __m128i lo128, hi128;
+ __m256i u = _mm256_srli_si256(*v, 8);
+ u = _mm256_add_epi32(u, *v);
+
+  // reduce to four 32-bit partial sums
+ hi128 = _mm256_extracti128_si256(u, 1);
+ lo128 = _mm256_castsi256_si128(u);
+ lo128 = _mm_add_epi32(hi128, lo128);
+
+  // reduce to two 32-bit partial sums
+ hi128 = _mm_srli_si128(lo128, 4);
+ lo128 = _mm_add_epi32(lo128, hi128);
+
+ return (unsigned int)_mm_cvtsi128_si32(lo128);
+}
+
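+// High bit-depth kernels: the uint8_t* src/ref arguments carry uint16_t pixel
+// data, recovered via CONVERT_TO_SHORTPTR(). Differences are taken as 16-bit
+// values and accumulated across four rows in 16 bits (at most 4 * 4095 for
+// 12-bit input, so no overflow), then widened to 32 bits for the final
+// reduction.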
+unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+
+ // first 4 rows
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ __m256i u0 = _mm256_sub_epi16(s0, r0);
+ __m256i u1 = _mm256_sub_epi16(s1, r1);
+ __m256i u2 = _mm256_sub_epi16(s2, r2);
+ __m256i u3 = _mm256_sub_epi16(s3, r3);
+ __m256i zero = _mm256_setzero_si256();
+ __m256i sum0, sum1;
+
+ u0 = _mm256_abs_epi16(u0);
+ u1 = _mm256_abs_epi16(u1);
+ u2 = _mm256_abs_epi16(u2);
+ u3 = _mm256_abs_epi16(u3);
+
+ sum0 = _mm256_add_epi16(u0, u1);
+ sum0 = _mm256_add_epi16(sum0, u2);
+ sum0 = _mm256_add_epi16(sum0, u3);
+
+ // second 4 rows
+ src_ptr += src_stride << 2;
+ ref_ptr += ref_stride << 2;
+ s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ u0 = _mm256_sub_epi16(s0, r0);
+ u1 = _mm256_sub_epi16(s1, r1);
+ u2 = _mm256_sub_epi16(s2, r2);
+ u3 = _mm256_sub_epi16(s3, r3);
+
+ u0 = _mm256_abs_epi16(u0);
+ u1 = _mm256_abs_epi16(u1);
+ u2 = _mm256_abs_epi16(u2);
+ u3 = _mm256_abs_epi16(u3);
+
+ sum1 = _mm256_add_epi16(u0, u1);
+ sum1 = _mm256_add_epi16(sum1, u2);
+ sum1 = _mm256_add_epi16(sum1, u3);
+
+ // find out the SAD
+ s0 = _mm256_unpacklo_epi16(sum0, zero);
+ s1 = _mm256_unpackhi_epi16(sum0, zero);
+ r0 = _mm256_unpacklo_epi16(sum1, zero);
+ r1 = _mm256_unpackhi_epi16(sum1, zero);
+ s0 = _mm256_add_epi32(s0, s1);
+ r0 = _mm256_add_epi32(r0, r1);
+ sum0 = _mm256_add_epi32(s0, r0);
+  // sum0 now holds eight 32-bit partial sums
+
+ return (unsigned int)get_sad_from_mm256_epi32(&sum0);
+}
+
+unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3;
+ __m256i sum0;
+ __m256i sum = _mm256_setzero_si256();
+ const __m256i zero = _mm256_setzero_si256();
+ int row = 0;
+
+ // Loop for every 4 rows
+ while (row < 16) {
+ s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ u0 = _mm256_sub_epi16(s0, r0);
+ u1 = _mm256_sub_epi16(s1, r1);
+ u2 = _mm256_sub_epi16(s2, r2);
+ u3 = _mm256_sub_epi16(s3, r3);
+
+ u0 = _mm256_abs_epi16(u0);
+ u1 = _mm256_abs_epi16(u1);
+ u2 = _mm256_abs_epi16(u2);
+ u3 = _mm256_abs_epi16(u3);
+
+ sum0 = _mm256_add_epi16(u0, u1);
+ sum0 = _mm256_add_epi16(sum0, u2);
+ sum0 = _mm256_add_epi16(sum0, u3);
+
+ s0 = _mm256_unpacklo_epi16(sum0, zero);
+ s1 = _mm256_unpackhi_epi16(sum0, zero);
+ sum = _mm256_add_epi32(sum, s0);
+ sum = _mm256_add_epi32(sum, s1);
+    // sum now holds eight 32-bit partial sums
+
+ row += 4;
+ src_ptr += src_stride << 2;
+ ref_ptr += ref_stride << 2;
+ }
+ return get_sad_from_mm256_epi32(&sum);
+}
+
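+// Accumulates the SAD of a 32-wide, 4-row region into *sad_acc, two rows per
+// iteration. When sec_ptr is non-NULL, each ref row is first averaged with
+// the corresponding 32-pixel row of the compound prediction.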
+static void sad32x4(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s0, s1, s2, s3, r0, r1, r2, r3;
+ const __m256i zero = _mm256_setzero_si256();
+ int row_sections = 0;
+
+ while (row_sections < 2) {
+ s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+
+ r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
+
+ if (sec_ptr) {
+ r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r1 = _mm256_avg_epu16(
+ r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r2 = _mm256_avg_epu16(
+ r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r3 = _mm256_avg_epu16(
+ r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ }
+ s0 = _mm256_sub_epi16(s0, r0);
+ s1 = _mm256_sub_epi16(s1, r1);
+ s2 = _mm256_sub_epi16(s2, r2);
+ s3 = _mm256_sub_epi16(s3, r3);
+
+ s0 = _mm256_abs_epi16(s0);
+ s1 = _mm256_abs_epi16(s1);
+ s2 = _mm256_abs_epi16(s2);
+ s3 = _mm256_abs_epi16(s3);
+
+ s0 = _mm256_add_epi16(s0, s1);
+ s0 = _mm256_add_epi16(s0, s2);
+ s0 = _mm256_add_epi16(s0, s3);
+
+ r0 = _mm256_unpacklo_epi16(s0, zero);
+ r1 = _mm256_unpackhi_epi16(s0, zero);
+
+ r0 = _mm256_add_epi32(r0, r1);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r0);
+
+ row_sections += 1;
+ src_ptr += src_stride << 1;
+ ref_ptr += ref_stride << 1;
+ if (sec_ptr) sec_ptr += 32 << 1;
+ }
+}
+
+unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ const int left_shift = 2;
+ int row_section = 0;
+
+ while (row_section < 4) {
+ sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 5;
+ ref += ref_stride << 5;
+ sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
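+// Accumulates the SAD of two 64-pixel rows into *sad_acc, optionally
+// averaging ref with the compound prediction rows in sec_ptr first.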
+static void sad64x2(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[8], r[8];
+ const __m256i zero = _mm256_setzero_si256();
+
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+ s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+ s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32));
+ s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+ r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
+ r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32));
+ r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48));
+
+ if (sec_ptr) {
+ r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ r[4] = _mm256_avg_epu16(
+ r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
+ r[5] = _mm256_avg_epu16(
+ r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
+ r[6] = _mm256_avg_epu16(
+ r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
+ r[7] = _mm256_avg_epu16(
+ r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
+ }
+
+ s[0] = _mm256_sub_epi16(s[0], r[0]);
+ s[1] = _mm256_sub_epi16(s[1], r[1]);
+ s[2] = _mm256_sub_epi16(s[2], r[2]);
+ s[3] = _mm256_sub_epi16(s[3], r[3]);
+ s[4] = _mm256_sub_epi16(s[4], r[4]);
+ s[5] = _mm256_sub_epi16(s[5], r[5]);
+ s[6] = _mm256_sub_epi16(s[6], r[6]);
+ s[7] = _mm256_sub_epi16(s[7], r[7]);
+
+ s[0] = _mm256_abs_epi16(s[0]);
+ s[1] = _mm256_abs_epi16(s[1]);
+ s[2] = _mm256_abs_epi16(s[2]);
+ s[3] = _mm256_abs_epi16(s[3]);
+ s[4] = _mm256_abs_epi16(s[4]);
+ s[5] = _mm256_abs_epi16(s[5]);
+ s[6] = _mm256_abs_epi16(s[6]);
+ s[7] = _mm256_abs_epi16(s[7]);
+
+ s[0] = _mm256_add_epi16(s[0], s[1]);
+ s[0] = _mm256_add_epi16(s[0], s[2]);
+ s[0] = _mm256_add_epi16(s[0], s[3]);
+
+ s[4] = _mm256_add_epi16(s[4], s[5]);
+ s[4] = _mm256_add_epi16(s[4], s[6]);
+ s[4] = _mm256_add_epi16(s[4], s[7]);
+
+ r[0] = _mm256_unpacklo_epi16(s[0], zero);
+ r[1] = _mm256_unpackhi_epi16(s[0], zero);
+ r[2] = _mm256_unpacklo_epi16(s[4], zero);
+ r[3] = _mm256_unpackhi_epi16(s[4], zero);
+
+ r[0] = _mm256_add_epi32(r[0], r[1]);
+ r[0] = _mm256_add_epi32(r[0], r[2]);
+ r[0] = _mm256_add_epi32(r[0], r[3]);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ const int left_shift = 1;
+ int row_section = 0;
+
+ while (row_section < 16) {
+ sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 5;
+ ref += ref_stride << 5;
+ sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
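+// Accumulates the SAD of a single 128-pixel row into *sad_acc, optionally
+// averaging ref with the 128-pixel compound prediction row in sec_ptr first.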
+static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[8], r[8];
+ const __m256i zero = _mm256_setzero_si256();
+
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+ s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64));
+ s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80));
+ s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96));
+ s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+ r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64));
+ r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80));
+ r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96));
+ r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112));
+
+ if (sec_ptr) {
+ r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ r[4] = _mm256_avg_epu16(
+ r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
+ r[5] = _mm256_avg_epu16(
+ r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
+ r[6] = _mm256_avg_epu16(
+ r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
+ r[7] = _mm256_avg_epu16(
+ r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
+ }
+
+ s[0] = _mm256_sub_epi16(s[0], r[0]);
+ s[1] = _mm256_sub_epi16(s[1], r[1]);
+ s[2] = _mm256_sub_epi16(s[2], r[2]);
+ s[3] = _mm256_sub_epi16(s[3], r[3]);
+ s[4] = _mm256_sub_epi16(s[4], r[4]);
+ s[5] = _mm256_sub_epi16(s[5], r[5]);
+ s[6] = _mm256_sub_epi16(s[6], r[6]);
+ s[7] = _mm256_sub_epi16(s[7], r[7]);
+
+ s[0] = _mm256_abs_epi16(s[0]);
+ s[1] = _mm256_abs_epi16(s[1]);
+ s[2] = _mm256_abs_epi16(s[2]);
+ s[3] = _mm256_abs_epi16(s[3]);
+ s[4] = _mm256_abs_epi16(s[4]);
+ s[5] = _mm256_abs_epi16(s[5]);
+ s[6] = _mm256_abs_epi16(s[6]);
+ s[7] = _mm256_abs_epi16(s[7]);
+
+ s[0] = _mm256_add_epi16(s[0], s[1]);
+ s[0] = _mm256_add_epi16(s[0], s[2]);
+ s[0] = _mm256_add_epi16(s[0], s[3]);
+
+ s[4] = _mm256_add_epi16(s[4], s[5]);
+ s[4] = _mm256_add_epi16(s[4], s[6]);
+ s[4] = _mm256_add_epi16(s[4], s[7]);
+
+ r[0] = _mm256_unpacklo_epi16(s[0], zero);
+ r[1] = _mm256_unpackhi_epi16(s[0], zero);
+ r[2] = _mm256_unpacklo_epi16(s[4], zero);
+ r[3] = _mm256_unpackhi_epi16(s[4], zero);
+
+ r[0] = _mm256_add_epi32(r[0], r[1]);
+ r[0] = _mm256_add_epi32(r[0], r[2]);
+ r[0] = _mm256_add_epi32(r[0], r[3]);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ int row = 0;
+ while (row < 64) {
+ sad128x1(srcp, refp, NULL, &sad);
+ srcp += src_stride;
+ refp += ref_stride;
+ row += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 6;
+ ref += ref_stride << 6;
+ sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride) {
+ uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
+ src += src_stride << 6;
+ ref += ref_stride << 6;
+ sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
+ return sum;
+}
+
+// If sec_ptr is NULL, calculate the regular SAD; otherwise, calculate the SAD
+// against the average of ref and the compound prediction in sec_ptr.
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s0, s1, s2, s3, r0, r1, r2, r3;
+ const __m256i zero = _mm256_setzero_si256();
+
+ s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ if (sec_ptr) {
+ r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r1 = _mm256_avg_epu16(r1,
+ _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r2 = _mm256_avg_epu16(r2,
+ _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r3 = _mm256_avg_epu16(r3,
+ _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ }
+
+ s0 = _mm256_sub_epi16(s0, r0);
+ s1 = _mm256_sub_epi16(s1, r1);
+ s2 = _mm256_sub_epi16(s2, r2);
+ s3 = _mm256_sub_epi16(s3, r3);
+
+ s0 = _mm256_abs_epi16(s0);
+ s1 = _mm256_abs_epi16(s1);
+ s2 = _mm256_abs_epi16(s2);
+ s3 = _mm256_abs_epi16(s3);
+
+ s0 = _mm256_add_epi16(s0, s1);
+ s0 = _mm256_add_epi16(s0, s2);
+ s0 = _mm256_add_epi16(s0, s3);
+
+ r0 = _mm256_unpacklo_epi16(s0, zero);
+ r1 = _mm256_unpackhi_epi16(s0, zero);
+
+ r0 = _mm256_add_epi32(r0, r1);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r0);
+}
+
+unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+
+ // Next 4 rows
+ srcp += src_stride << 2;
+ refp += ref_stride << 2;
+ secp += 64;
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 3;
+ uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 4;
+ uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 2;
+ int row_section = 0;
+
+ while (row_section < 4) {
+ sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 32 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 4;
+ uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 32 << left_shift;
+ sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 32 << left_shift;
+ sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 1;
+ int row_section = 0;
+
+ while (row_section < 16) {
+ sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 64 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 64 << left_shift;
+ sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 6;
+ uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 64 << left_shift;
+ sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ int row = 0;
+ while (row < 64) {
+ sad128x1(srcp, refp, secp, &sad);
+ srcp += src_stride;
+ refp += ref_stride;
+ secp += 16 << 3;
+ row += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ unsigned int sum;
+ const int left_shift = 6;
+
+ sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 128 << left_shift;
+ sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+// SAD 4D
+// Combine four __m256i SAD accumulators into uint32_t res[4]
+static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
+ uint32_t *res) {
+ __m256i u0, u1, u2, u3;
+ const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
+ __m128i sad;
+
+  // each input vector holds eight 32-bit partial sums
+ u0 = _mm256_srli_si256(v[0], 4);
+ u1 = _mm256_srli_si256(v[1], 4);
+ u2 = _mm256_srli_si256(v[2], 4);
+ u3 = _mm256_srli_si256(v[3], 4);
+
+ u0 = _mm256_add_epi32(u0, v[0]);
+ u1 = _mm256_add_epi32(u1, v[1]);
+ u2 = _mm256_add_epi32(u2, v[2]);
+ u3 = _mm256_add_epi32(u3, v[3]);
+
+ u0 = _mm256_and_si256(u0, mask);
+ u1 = _mm256_and_si256(u1, mask);
+ u2 = _mm256_and_si256(u2, mask);
+ u3 = _mm256_and_si256(u3, mask);
+  // four 32-bit partial sums per vector, kept in the even 32-bit lanes
+
+ u1 = _mm256_slli_si256(u1, 4);
+ u3 = _mm256_slli_si256(u3, 4);
+
+ u0 = _mm256_or_si256(u0, u1);
+ u2 = _mm256_or_si256(u2, u3);
+  // eight 32-bit partial sums, interleaved from two accumulators
+
+ u1 = _mm256_unpacklo_epi64(u0, u2);
+ u3 = _mm256_unpackhi_epi64(u0, u2);
+
+ u0 = _mm256_add_epi32(u1, u3);
+ sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
+ _mm256_castsi256_si128(u0));
+ _mm_storeu_si128((__m128i *)res, sad);
+}
+
+static void convert_pointers(const uint8_t *const ref8[],
+ const uint16_t *ref[]) {
+ ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
+ ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
+ ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
+ ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
+}
+
+static void init_sad(__m256i *s) {
+ s[0] = _mm256_setzero_si256();
+ s[1] = _mm256_setzero_si256();
+ s[2] = _mm256_setzero_si256();
+ s[3] = _mm256_setzero_si256();
+}
+
+void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ const int shift_for_4_rows = 2;
+ int i;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < 4; ++i) {
+ srcp = keep;
+ sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ srcp += src_stride << shift_for_4_rows;
+ refp[i] += ref_stride << shift_for_4_rows;
+ sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first8rows[4];
+ uint32_t second8rows[4];
+ const uint8_t *ref[4];
+ const int shift_for_8_rows = 3;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows);
+ src += src_stride << shift_for_8_rows;
+ ref[0] += ref_stride << shift_for_8_rows;
+ ref[1] += ref_stride << shift_for_8_rows;
+ ref[2] += ref_stride << shift_for_8_rows;
+ ref[3] += ref_stride << shift_for_8_rows;
+ aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows);
+ sad_array[0] = first8rows[0] + second8rows[0];
+ sad_array[1] = first8rows[1] + second8rows[1];
+ sad_array[2] = first8rows[2] + second8rows[2];
+ sad_array[3] = first8rows[3] + second8rows[3];
+}
+
+void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 4;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ const int shift_for_4_rows = 2;
+ int i;
+ int rows_section;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < 4; ++i) {
+ srcp = keep;
+ rows_section = 0;
+ while (rows_section < 4) {
+ sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ srcp += src_stride << shift_for_4_rows;
+ refp[i] += ref_stride << shift_for_4_rows;
+ rows_section++;
+ }
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 4;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 5;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ const int shift_for_rows = 1;
+ int i;
+ int rows_section;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < 4; ++i) {
+ srcp = keep;
+ rows_section = 0;
+ while (rows_section < 16) {
+ sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+ srcp += src_stride << shift_for_rows;
+ refp[i] += ref_stride << shift_for_rows;
+ rows_section++;
+ }
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 5;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 6;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
+
+void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ int i;
+ int rows_section;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < 4; ++i) {
+ srcp = keep;
+ rows_section = 0;
+ while (rows_section < 64) {
+ sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
+ srcp += src_stride;
+ refp[i] += ref_stride;
+ rows_section++;
+ }
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[],
+ int ref_stride, uint32_t *sad_array) {
+ uint32_t first_half[4];
+ uint32_t second_half[4];
+ const uint8_t *ref[4];
+ const int shift_for_rows = 6;
+
+ ref[0] = ref_array[0];
+ ref[1] = ref_array[1];
+ ref[2] = ref_array[2];
+ ref[3] = ref_array[3];
+
+ aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
+ src += src_stride << shift_for_rows;
+ ref[0] += ref_stride << shift_for_rows;
+ ref[1] += ref_stride << shift_for_rows;
+ ref[2] += ref_stride << shift_for_rows;
+ ref[3] += ref_stride << shift_for_rows;
+ aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
+ sad_array[0] = first_half[0] + second_half[0];
+ sad_array[1] = first_half[1] + second_half[1];
+ sad_array[2] = first_half[2] + second_half[2];
+ sad_array[3] = first_half[3] + second_half[3];
+}
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
new file mode 100644
index 000000000..c6fd62c9e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ __m256i s1, s2, r1, r2;
+ __m256i sum = _mm256_setzero_si256();
+ __m128i sum_i128;
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
+ s2 = _mm256_sad_epu8(
+ r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
+ ref_ptr += ref_stride << 1;
+ src_ptr += src_stride << 1;
+ }
+
+ sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
+ sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
+ _mm256_castsi256_si128(sum));
+ return _mm_cvtsi128_si32(sum_i128);
+}
+
+static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ unsigned int half_width = 32;
+ uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 5;
+ ref_ptr += ref_stride << 5;
+ sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ unsigned int half_width = 64;
+ uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+static void sad64x64x4d(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ __m128i *res) {
+ uint32_t sum[4];
+ aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum);
+ *res = _mm_loadu_si128((const __m128i *)sum);
+}
+
+void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m128i sum0, sum1;
+ const uint8_t *rf[4];
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
+ src += src_stride << 6;
+ rf[0] += ref_stride << 6;
+ rf[1] += ref_stride << 6;
+ rf[2] += ref_stride << 6;
+ rf[3] += ref_stride << 6;
+ sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
+ sum0 = _mm_add_epi32(sum0, sum1);
+ _mm_storeu_si128((__m128i *)res, sum0);
+}
+
+void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ __m128i sum0, sum1;
+ unsigned int half_width = 64;
+ const uint8_t *rf[4];
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
+ src += half_width;
+ rf[0] += half_width;
+ rf[1] += half_width;
+ rf[2] += half_width;
+ rf[3] += half_width;
+ sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
+ sum0 = _mm_add_epi32(sum0, sum1);
+ _mm_storeu_si128((__m128i *)res, sum0);
+}
+
+void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4]) {
+ const uint8_t *rf[4];
+ uint32_t sum0[4];
+ uint32_t sum1[4];
+
+ rf[0] = ref[0];
+ rf[1] = ref[1];
+ rf[2] = ref[2];
+ rf[3] = ref[3];
+ aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0);
+ src += src_stride << 6;
+ rf[0] += ref_stride << 6;
+ rf[1] += ref_stride << 6;
+ rf[2] += ref_stride << 6;
+ rf[3] += ref_stride << 6;
+ aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1);
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+ res[2] = sum0[2] + sum1[2];
+ res[3] = sum0[3] + sum1[3];
+}
+
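+// Averaged SAD over a 64-pixel-wide block of height h. second_pred_stride is
+// the row pitch of the compound prediction buffer: 64 for the 64-wide blocks
+// and 128 when a 128-wide block is processed as two 64-wide halves (see the
+// aom_sad128x* wrappers below).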
+static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int h, const uint8_t *second_pred,
+ const int second_pred_stride) {
+ int i, res;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ ref1_reg = _mm256_avg_epu8(
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
+ ref2_reg = _mm256_avg_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ second_pred += second_pred_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ res = _mm_cvtsi128_si32(sum_sad128);
+
+ return res;
+}
+
+unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 64);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ second_pred += 64 << 6;
+ sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 64);
+ return sum;
+}
+
+unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ unsigned int half_width = 64;
+ uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 128);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ second_pred += half_width;
+ sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 128);
+ return sum;
+}
+
+unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr,
+ ref_stride, second_pred);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ second_pred += 128 << 6;
+ sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,
+ second_pred);
+ return sum;
+}
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
new file mode 100644
index 000000000..3251b7655
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -0,0 +1,353 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
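+; SAD_FN width, height, num_gprs, do_avg
+;   Shared prologue for the SAD kernels below. Declares
+;   aom_sad<width>x<height>_sse2 (or the _avg variant when do_avg == 1),
+;   sign-extends the strides and, when num_gprs == 7, precomputes
+;   src_stride*3 / ref_stride*3 for the kernels that walk four rows per
+;   iteration.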
+%macro SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD128XN 1-2 0
+ SAD_FN 128, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*4]
+ pavgb m2, [second_predq+mmsize*5]
+ pavgb m3, [second_predq+mmsize*6]
+ pavgb m4, [second_predq+mmsize*7]
+ lea second_predq, [second_predq+mmsize*8]
+%endif
+ psadbw m1, [srcq+64]
+ psadbw m2, [srcq+80]
+ psadbw m3, [srcq+96]
+ psadbw m4, [srcq+112]
+
+ add refq, ref_strideq
+ add srcq, src_strideq
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ sub n_rowsd, 1
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD128XN 128 ; sad128x128_sse2
+SAD128XN 128, 1 ; sad128x128_avg_sse2
+SAD128XN 64 ; sad128x64_sse2
+SAD128XN 64, 1 ; sad128x64_avg_sse2
+
+
+; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+ paddd m1, m2
+ paddd m3, m4
+ add refq, ref_strideq
+ paddd m0, m1
+ add srcq, src_strideq
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 128 ; sad64x128_sse2
+SAD64XN 128, 1 ; sad64x128_avg_sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN 16 ; sad64x16_sse2
+SAD64XN 16, 1 ; sad64x16_avg_sse2
+
+; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
+ mov n_rowsd, %1/2
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq]
+ movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+src_strideq]
+ psadbw m4, [srcq+src_strideq+16]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 8 ; sad32x8_sse2
+SAD32XN 8, 1 ; sad32x8_avg_sse2
+
+; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+src_strideq]
+ psadbw m3, [srcq+src_strideq*2]
+ psadbw m4, [srcq+src_stride3q]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
+SAD16XN 4 ; sad16x4_sse2
+SAD16XN 4, 1 ; sad16x4_avg_sse2
+SAD16XN 64 ; sad16x64_sse2
+SAD16XN 64, 1 ; sad16x64_avg_sse2
+
+; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+
+.loop:
+ movh m1, [refq]
+ movhps m1, [refq+ref_strideq]
+ movh m2, [refq+ref_strideq*2]
+ movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
+ movh m3, [srcq]
+ movhps m3, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movhps m4, [srcq+src_stride3q]
+ psadbw m1, m3
+ psadbw m2, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
+SAD8XN 32 ; sad8x32_sse2
+SAD8XN 32, 1 ; sad8x32_avg_sse2
+
+; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+
+.loop:
+ movd m1, [refq]
+ movd m2, [refq+ref_strideq]
+ movd m3, [refq+ref_strideq*2]
+ movd m4, [refq+ref_stride3q]
+ punpckldq m1, m2
+ punpckldq m3, m4
+ movlhps m1, m3
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ lea second_predq, [second_predq+mmsize*1]
+%endif
+ movd m2, [srcq]
+ movd m5, [srcq+src_strideq]
+ movd m4, [srcq+src_strideq*2]
+ movd m3, [srcq+src_stride3q]
+ punpckldq m2, m5
+ punpckldq m4, m3
+ movlhps m2, m4
+ psadbw m1, m2
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN 8 ; sad4x8_sse2
+SAD4XN 4 ; sad4x4_sse2
+SAD4XN 8, 1 ; sad4x8_avg_sse2
+SAD4XN 4, 1 ; sad4x4_avg_sse2
+SAD4XN 16 ; sad4x16_sse2
+SAD4XN 16, 1 ; sad4x16_avg_sse2
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 000000000..305dde5c0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
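+// The kernels below accumulate the sum of squared differences (SSE) between
+// two 8-bit blocks.  A rough scalar equivalent, for reference only:
+//
+//   int64_t sse = 0;
+//   for (int y = 0; y < height; ++y)
+//     for (int x = 0; x < width; ++x) {
+//       const int d = a[y * a_stride + x] - b[y * b_stride + x];
+//       sse += d * d;
+//     }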
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m256i v_a0 = yy_loadu_256(a);
+ const __m256i v_b0 = yy_loadu_256(b);
+ const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));
+ const __m256i v_a01_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));
+ const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
+ const __m256i v_b01_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
+ const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+ const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+ int64_t sum;
+ const __m256i sum0_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));
+ const __m256i sum1_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ __m256i sum = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
+ const __m128i v_a0123 = _mm_unpacklo_epi64(
+ _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3));
+ const __m128i v_b0123 = _mm_unpacklo_epi64(
+ _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3));
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m256i v_a_w =
+ _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+ const __m256i v_b_w =
+ _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ sse_w32_avx2(&sum, a + 64, b + 64);
+ sse_w32_avx2(&sum, a + 96, b + 96);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default: break;
+ }
+
+ return sse;
+}
+
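+// High-bitdepth variant: a8/b8 are converted back to uint16_t buffers via
+// CONVERT_TO_SHORTPTR, the 16-bit samples are subtracted directly, and the
+// squared differences are accumulated with _mm256_madd_epi16 and reduced by
+// summary_all_avx2(), mirroring the 8-bit path above.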
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m256i v_a_w = yy_loadu_256(a);
+ const __m256i v_b_w = yy_loadu_256(b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m256i sum = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
+ const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
+ _mm_unpacklo_epi64(v_a2, v_a3));
+ const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
+ _mm_unpacklo_epi64(v_b2, v_b3));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
+ const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
+ highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4);
+ highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5);
+ highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6);
+ highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default: break;
+ }
+ return sse;
+}
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
new file mode 100644
index 000000000..8b5af8469
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_sse4.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+ int64_t sum;
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+ const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+ const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+ const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+ const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y = 0;
+ int64_t sse = 0;
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+ const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 32:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 64:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 128:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
+ sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
+ sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
+ sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ default: break;
+ }
+
+ return sse;
+}
+
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m128i v_a_w = xx_loadu_128(a);
+ const __m128i v_b_w = xx_loadu_128(b);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+ const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ highbd_sse_w8_sse4_1(&sum, a + 8, b + 8);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 32:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 64:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 128:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14);
+ highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ default: break;
+ }
+ return sse;
+}
diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
new file mode 100644
index 000000000..6d9b5a12f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum the four dwords of register %1 into a single qword (left in the low qword)
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum the eight words of register %1 into a single qword (widen to dwords, then SUM_ACROSS_Q)
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void aom_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
+;
+; TODO: Use parm passing through a structure; we probably don't need the pxors
+; (the calling app will initialize to 0) and could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb.  At this point this is just meant to be a first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
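+;
+; From the five sums over the N-pixel block (N = 16*16 here, 8*8 below), the
+; SSIM code can recover the usual moments, roughly:
+;   mean_s = sum_s/N,  var_s = sum_sq_s/N - mean_s^2,
+;   mean_r = sum_r/N,  var_r = sum_sq_r/N - mean_r^2,
+;   cov_sr = sum_sxr/N - mean_s*mean_r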
+global sym(aom_ssim_parms_16x16_sse2) PRIVATE
+sym(aom_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
+;
+; TODO: Use parm passing through a structure; we probably don't need the pxors
+; (the calling app will initialize to 0) and could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadbw
+; or pavgb.  At this point this is just meant to be a first pass for calculating
+; all the parms needed for 8x8 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(aom_ssim_parms_8x8_sse2) PRIVATE
+sym(aom_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 000000000..45bf6ec3c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1481 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
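+; Bilinear filter taps: entry k (k = 0..7, the 3-bit sub-pel offset) holds the
+; tap pair (16-2k, 2k); filtered values are rounded with pw_8 and shifted
+; right by 4.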
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+
+SECTION .text
+
+; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of differences (SE) and stores the sum of
+; squared errors (SSE) in the given pointer.
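+; The variance itself is typically derived by the caller as
+; SSE - SE*SE / (width * height).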
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
+
+%macro STORE_AND_RET 1
+%if %1 > 4
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; 4xh
+ pshuflw m4, m6, 0xe
+ pshuflw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshuflw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%if ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ sec, sec_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+
+ ;Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define block_height heightd
+
+ ;Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, sec, sec_stride, \
+ height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+%if %1 == 4
+ %define movx movd
+%else
+ %define movx movh
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar block_height, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+%endif
+
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+
+%if %2 == 1 ; avg
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq*2]
+%else ; 4xh
+ movx m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%endif
+ movx m1, [dstq]
+%if %1 > 4
+ movlhps m0, m2
+%else ; 4xh
+ punpckldq m0, m2
+%endif
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m4, [secq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq*2]
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq*2]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonzero:
+ cmp x_offsetd, 4
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m4, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+ movx m2, [srcq+src_strideq+1]
+ punpckldq m4, m2
+%endif
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+ movx m1, [dstq]
+ pavgb m0, m4
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movx m2, [srcq]
+ movx m3, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ movx m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movx m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%endif
+ pavgb m2, m3
+%if %1 > 4
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; 4xh
+ punpckldq m0, m2
+ pshuflw m4, m2, 0xe
+%endif
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movx m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; have a 1-register shortage to be able to store the backup of the bilin
+ ; filtered second line as words as cache for the next line. Packing into
+ ; a byte costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+ paddw m4, m3
+ movx m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonhalf:
+%if ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is no unused register: the src_stride register is reused,
+; so src_stride has to be reloaded from the stack later when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movx m4, [srcq]
+ movx m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m3, [dstq+dst_strideq]
+ movx m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movx m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movx m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+%undef movx
+ STORE_AND_RET %1
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x = 0 or 4 && y = 0 or 4) are
+; identical between the ssse3 and non-ssse3 versions. It may make sense to merge their
+; code in the sense that the ssse3 version would jump to the appropriate
+; location in the sse/2 version, rather than duplicating that code in the
+; binary.
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
new file mode 100644
index 000000000..4389d123d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
+ __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
+ __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+ __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+ __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+ __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+ _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+ _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static INLINE void aom_subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
+ __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
+ __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void aom_subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void aom_subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void aom_subtract_block_128xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
+ subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ case 32:
+ aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ case 64:
+ aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ case 128:
+ aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
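+    // Narrower blocks (4 and 8 columns) fall back to the general SSE2 version.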
+ default:
+ aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
new file mode 100644
index 000000000..1a75a234f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,146 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void aom_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+ cmp colsd, 64
+ je .case_64
+
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ mova m1, [predq+%3]
+ mova m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ mov pred_str, pred_stridemp
+.loop_128:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize
+ loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ sub rowsd, 1
+ jnz .loop_128
+ RET
+
+.case_64:
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ RET
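
The loop16 and loop_h macros above use the classic SSE2 idiom of interleaving bytes with a dedicated zero register (punpcklbw/punpckhbw against m7) to zero-extend them to 16-bit words before psubw. The same step expressed with intrinsics, as an illustration only (the helper name is invented):

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Hypothetical illustration of one 16-pixel step of the assembly above:
 * zero-extend 16 unsigned bytes to words by interleaving with zero,
 * then subtract word-wise. */
static void subtract16_sse2_sketch(int16_t *diff, const uint8_t *src,
                                   const uint8_t *pred) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i s = _mm_loadu_si128((const __m128i *)src);
  const __m128i p = _mm_loadu_si128((const __m128i *)pred);
  const __m128i s_lo = _mm_unpacklo_epi8(s, zero);  /* pixels 0..7  */
  const __m128i s_hi = _mm_unpackhi_epi8(s, zero);  /* pixels 8..15 */
  const __m128i p_lo = _mm_unpacklo_epi8(p, zero);
  const __m128i p_hi = _mm_unpackhi_epi8(p, zero);
  _mm_storeu_si128((__m128i *)(diff + 0), _mm_sub_epi16(s_lo, p_lo));
  _mm_storeu_si128((__m128i *)(diff + 8), _mm_sub_epi16(s_hi, p_hi));
}
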
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 000000000..0af44e3a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+ int width, int height) {
+ uint64_t result;
+ __m256i v_acc_q = _mm256_setzero_si256();
+ const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
+ for (int col = 0; col < height; col += 4) {
+ __m256i v_acc_d = _mm256_setzero_si256();
+ for (int row = 0; row < width; row += 16) {
+ const int16_t *tempsrc = src + row;
+ const __m256i v_val_0_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+ const __m256i v_val_1_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+ const __m256i v_val_2_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+ const __m256i v_val_3_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+ const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+ }
+ v_acc_q =
+ _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+ src += 4 * stride;
+ }
+ __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+ __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+ __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+
+ result_64_2_int = _mm_add_epi64(
+ result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+
+ xx_storel_64(&result, result_64_2_int);
+
+ return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+ int height) {
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+ } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+ } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+ return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
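
The AVX2 routine above squares and pairwise-adds 16-bit values with _mm256_madd_epi16, then promotes the 32-bit partial sums to 64-bit lanes (mask plus shift) every four rows so the accumulator cannot overflow. A scalar sketch of the quantity being computed, assuming only what the file shows (the helper below is hypothetical):

#include <stdint.h>

/* Hypothetical scalar reference for the 2-D sum of squares computed above:
 * the sum over a width x height block of src[r * stride + c]^2. */
static uint64_t sum_squares_2d_i16_scalar(const int16_t *src, int stride,
                                          int width, int height) {
  uint64_t acc = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int32_t v = src[r * stride + c];
      acc += (uint64_t)(v * v);  /* v * v fits in 32 bits for int16_t input */
    }
  }
  return acc;
}
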
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 000000000..22d7739ec
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+ const __m128d ad = _mm_castsi128_pd(a);
+ return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, a);
+ return tmp;
+ }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
+ const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+ const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+ const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+ const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+ const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+ const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+ return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
+ const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+ __m128i v_sum_d =
+ _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+ v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
+ return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height) {
+ int r = 0;
+ __m128i v_acc_q = _mm_setzero_si128();
+ do {
+ const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+ v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+ src += stride << 2;
+ r += 4;
+ } while (r < height);
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+ __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+ _mm_and_si128(v_acc_q, v_zext_mask_q));
+ v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+ return xx_cvtsi128_si64(v_acc_64);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height) {
+ int r = 0;
+
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+ __m128i v_acc_q = _mm_setzero_si128();
+
+ do {
+ __m128i v_acc_d = _mm_setzero_si128();
+ int c = 0;
+ do {
+ const int16_t *b = src + c;
+
+ const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+ const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+ const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+ const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ c += 8;
+ } while (c < width);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+ src += 4 * stride;
+ r += 4;
+ } while (r < height);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+ return xx_cvtsi128_si64(v_acc_q);
+}
+
+uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
+ int height) {
+ // 4 elements per row only requires half an XMM register, so this
+ // must be a special case, but also note that over 75% of all calls
+ // are with size == 4, so it is also the common case.
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+ __m128i v_acc0_q = _mm_setzero_si128();
+ __m128i v_acc1_q = _mm_setzero_si128();
+
+ const int16_t *const end = src + n;
+
+ assert(n % 64 == 0);
+
+ while (src < end) {
+ const __m128i v_val_0_w = xx_load_128(src);
+ const __m128i v_val_1_w = xx_load_128(src + 8);
+ const __m128i v_val_2_w = xx_load_128(src + 16);
+ const __m128i v_val_3_w = xx_load_128(src + 24);
+ const __m128i v_val_4_w = xx_load_128(src + 32);
+ const __m128i v_val_5_w = xx_load_128(src + 40);
+ const __m128i v_val_6_w = xx_load_128(src + 48);
+ const __m128i v_val_7_w = xx_load_128(src + 56);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
+ v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
+
+ src += 64;
+ }
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+ return xx_cvtsi128_si64(v_acc0_q);
+}
+
+uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
+ if (n % 64 == 0) {
+ return aom_sum_squares_i16_64n_sse2(src, n);
+ } else if (n > 64) {
+ int k = n & ~(64 - 1);
+ return aom_sum_squares_i16_64n_sse2(src, k) +
+ aom_sum_squares_i16_c(src + k, n - k);
+ } else {
+ return aom_sum_squares_i16_c(src, n);
+ }
+}
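
A detail worth noting in the SSE2 code above: SSE2 has no unsigned 32-to-64-bit widening add, so each 64-bit accumulator lane is updated by adding the masked low half and the shifted high half of the 32-bit partial sums separately. A small intrinsics sketch of that promotion step (the function name is invented for illustration):

#include <emmintrin.h>
#include <stdint.h>

/* Hypothetical sketch of the 32->64-bit promotion used above: split each
 * 64-bit lane of 32-bit partial sums into its low and high halves and add
 * both into the 64-bit accumulator. */
static __m128i accumulate_u32_into_u64(__m128i acc_q, __m128i partial_d) {
  const __m128i lo32_mask = _mm_set_epi32(0, -1, 0, -1);  /* low 32 bits of each 64-bit lane */
  acc_q = _mm_add_epi64(acc_q, _mm_and_si128(partial_d, lo32_mask));
  acc_q = _mm_add_epi64(acc_q, _mm_srli_epi64(partial_d, 32));
  return acc_q;
}
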
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
new file mode 100644
index 000000000..491e31cc5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+ int width, int height);
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height);
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
new file mode 100644
index 000000000..1e9f1e27b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m128i xx_loadl_32(const void *a) {
+ return _mm_cvtsi32_si128(*(const uint32_t *)a);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+ return _mm_loadl_epi64((const __m128i *)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+ return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i *)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+ *(uint32_t *)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i *)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+ _mm_store_si128((__m128i *)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+ _mm_storeu_si128((__m128i *)a, v);
+}
+
+// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set_epi64x()
+// acting on 32-bit integers.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+ return _mm_set_epi32(0, e1, 0, e0);
+#else
+ return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#endif
+}
+
+// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+ return _mm_set_epi32(0, a, 0, a);
+#else
+ return _mm_set1_epi64x((uint32_t)a);
+#endif
+}
+
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+ return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+ const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+ return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+ const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
+ const __m128i v_tmp_d =
+ _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+#endif // AOM_AOM_DSP_X86_SYNONYMS_H_
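
The xx_roundn_* helpers above implement rounded right shifts in various widths and signednesses. For orientation, scalar equivalents of two of them, assuming the usual add-half-then-shift definition of ROUND_POWER_OF_TWO that the comments reference (both helpers below are hypothetical):

#include <stdint.h>

/* Hypothetical scalar equivalent of xx_roundn_epu32(): unsigned rounded
 * right shift by `bits`. */
static inline uint32_t roundn_u32(uint32_t v, int bits) {
  return (v + ((1u << bits) >> 1)) >> bits;
}

/* Hypothetical scalar equivalent of xx_roundn_epi16(): the sign term makes
 * negative values round with the same magnitude behaviour as positive ones. */
static inline int16_t roundn_s16(int16_t v, int bits) {
  const int32_t bias = (1 << bits) >> 1;
  const int32_t sign = v < 0 ? -1 : 0;
  return (int16_t)((v + bias + sign) >> bits);
}
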
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 000000000..3f69b120e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m256i yy_load_256(const void *a) {
+ return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE __m256i yy_loadu_256(const void *a) {
+ return _mm256_loadu_si256((const __m256i *)a);
+}
+
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+ _mm256_store_si256((__m256i *)a, v);
+}
+
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+ _mm256_storeu_si256((__m256i *)a, v);
+}
+
+// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm256_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+ return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#else
+ return _mm256_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+// therefore define an equivalent function using a different intrinsic.
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+ __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
+ __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
+ return yy_set_m128i(mhi, mlo);
+}
+
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+ const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
+ return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
+}
+#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
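
yy_set_m128i() and yy_loadu2_128() exist to build a 256-bit register from two independent 128-bit halves, which lets row-pair kernels process two rows per AVX2 instruction. A hypothetical usage sketch (the function below is not part of the header):

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical example: pack two unaligned 16-byte rows into one YMM register,
 * with row1 in the upper lane and row0 in the lower lane, matching the
 * ([ hi ], [ lo ]) -> [ hi ][ lo ] layout of yy_set_m128i(). */
static __m256i load_two_rows(const uint8_t *row0, const uint8_t *row1) {
  const __m128i lo = _mm_loadu_si128((const __m128i *)row0);
  const __m128i hi = _mm_loadu_si128((const __m128i *)row1);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
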
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 000000000..d0d1ee684
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // Unpack 16 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // Unpack 32 bit elements resulting in:
+ // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+ const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+ const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+ const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(c0, c0);
+ out[1] = _mm_unpackhi_epi64(c0, c0);
+ out[2] = _mm_unpacklo_epi64(c1, c1);
+ out[3] = _mm_unpackhi_epi64(c1, c1);
+ out[4] = _mm_unpacklo_epi64(c2, c2);
+ out[5] = _mm_unpackhi_epi64(c2, c2);
+ out[6] = _mm_unpacklo_epi64(c3, c3);
+ out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ out[0] = _mm_unpacklo_epi32(a0, a1);
+ out[1] = _mm_srli_si128(out[0], 8);
+ out[2] = _mm_unpackhi_epi32(a0, a1);
+ out[3] = _mm_srli_si128(out[2], 8);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b2: 04 14 24 34 05 15 25 35
+ // b4: 02 12 22 32 03 13 23 33
+ // b6: 06 16 26 36 07 17 27 37
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 XX XX XX XX
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 XX XX XX XX
+ // out[3]: 03 13 23 33 XX XX XX XX
+ // out[4]: 04 14 24 34 XX XX XX XX
+ // out[5]: 05 15 25 35 XX XX XX XX
+ // out[6]: 06 16 26 36 XX XX XX XX
+ // out[7]: 07 17 27 37 XX XX XX XX
+ const __m128i zeros = _mm_setzero_si128();
+ out[0] = _mm_unpacklo_epi64(b0, zeros);
+ out[1] = _mm_unpackhi_epi64(b0, zeros);
+ out[2] = _mm_unpacklo_epi64(b4, zeros);
+ out[3] = _mm_unpackhi_epi64(b4, zeros);
+ out[4] = _mm_unpacklo_epi64(b2, zeros);
+ out[5] = _mm_unpackhi_epi64(b2, zeros);
+ out[6] = _mm_unpacklo_epi64(b6, zeros);
+ out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose in-place
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+ __m128i *const right) {
+ __m128i tbuf[8];
+ transpose_16bit_8x8(left, left);
+ transpose_16bit_8x8(right, tbuf);
+ transpose_16bit_8x8(left + 8, right);
+ transpose_16bit_8x8(right + 8, right + 8);
+
+ left[8] = tbuf[0];
+ left[9] = tbuf[1];
+ left[10] = tbuf[2];
+ left[11] = tbuf[3];
+ left[12] = tbuf[4];
+ left[13] = tbuf[5];
+ left[14] = tbuf[6];
+ left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // in[4]: 04 05 06 07
+ // in[5]: 14 15 16 17
+ // in[6]: 24 25 26 27
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 04 05 06 07
+ // in[2]: 10 11 12 13
+ // in[3]: 14 15 16 17
+ // in[4]: 20 21 22 23
+ // in[5]: 24 25 26 27
+ // in[6]: 30 31 32 33
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
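
All of the transposes above follow the same pattern: interleave at the element width, then at twice that width, and so on, until rows become columns. A scalar reference for the smallest 16-bit case, useful as a mental model or a test oracle (hypothetical helper):

#include <stdint.h>

/* Hypothetical scalar reference for transpose_16bit_4x4(): out[c][r] = in[r][c]. */
static void transpose_16bit_4x4_scalar(const int16_t in[4][4],
                                       int16_t out[4][4]) {
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) out[c][r] = in[r][c];
}
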
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
new file mode 100644
index 000000000..b1611ba87
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
+ int8_t cos_bit);
+
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+ __m256i *in0, __m256i *in1, const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+ __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+ __m256i u0 = _mm256_madd_epi16(t0, w0);
+ __m256i u1 = _mm256_madd_epi16(t1, w0);
+ __m256i v0 = _mm256_madd_epi16(t0, w1);
+ __m256i v1 = _mm256_madd_epi16(t1, w1);
+
+ __m256i a0 = _mm256_add_epi32(u0, _r);
+ __m256i a1 = _mm256_add_epi32(u1, _r);
+ __m256i b0 = _mm256_add_epi32(v0, _r);
+ __m256i b1 = _mm256_add_epi32(v1, _r);
+
+ __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+ __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+ __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+ __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+
+ *in0 = _mm256_packs_epi32(c0, c1);
+ *in1 = _mm256_packs_epi32(d0, d1);
+}
+
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+ const __m256i _in0 = *in0;
+ const __m256i _in1 = *in1;
+ *in0 = _mm256_adds_epi16(_in0, _in1);
+ *in1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+ const __m256i _in0 = *in0;
+ const __m256i _in1 = *in1;
+ *in0 = _mm256_add_epi32(_in0, _in1);
+ *in1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+ __m256i in0, __m256i in1) {
+ const __m256i _in0 = in0;
+ const __m256i _in1 = in1;
+ *out0 = _mm256_adds_epi16(_in0, _in1);
+ *out1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+ __m256i in0, __m256i in1) {
+ const __m256i _in0 = in0;
+ const __m256i _in1 = in1;
+ *out0 = _mm256_add_epi32(_in0, _in1);
+ *out1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+ return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_16bit_to_16bit_avx2(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+ int stride,
+ __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
+ }
+}
+
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+ const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+ const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+ return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+ __m256i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f
+ // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f
+ // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f
+ // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f
+ // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f
+ // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f
+ // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f
+ // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f
+ // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ // ...
+ __m256i a[16];
+ for (int i = 0; i < 16; i += 2) {
+ a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
+ a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
+ }
+ __m256i b[16];
+ for (int i = 0; i < 16; i += 2) {
+ b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
+ b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
+ }
+ __m256i c[16];
+ for (int i = 0; i < 16; i += 2) {
+ c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
+ c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
+ }
+ out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
+ out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
+ out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
+ out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
+
+ out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
+ out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
+ out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
+ out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
+
+ out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
+ out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
+ out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
+ out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
+
+ out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
+ out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
+ out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
+ out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
+}
+
+static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+ if (bit < 0) {
+ bit = -bit;
+ __m256i round = _mm256_set1_epi16(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_adds_epi16(in[i], round);
+ in[i] = _mm256_srai_epi16(in[i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
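
btf_16_w16_avx2() is the 16-lane butterfly at the heart of these transforms: with weights packed by pair_set_w16_epi16(a, b) and pair_set_w16_epi16(c, d), each lane produces a*x + b*y and c*x + d*y, rounded and shifted by cos_bit. A scalar sketch, assuming _r is the usual rounding offset 1 << (cos_bit - 1) (the helper is hypothetical and skips the saturating pack the vector code performs):

#include <stdint.h>

/* Hypothetical scalar sketch of the butterfly vectorized by btf_16_w16_avx2().
 * The vector version saturates via _mm256_packs_epi32; this sketch assumes the
 * results fit in 16 bits. */
static void butterfly_16_scalar(int16_t *x, int16_t *y, int a, int b, int c,
                                int d, int cos_bit) {
  const int32_t r = 1 << (cos_bit - 1);  /* assumed rounding offset (_r) */
  const int32_t u = a * *x + b * *y;
  const int32_t v = c * *x + d * *y;
  *x = (int16_t)((u + r) >> cos_bit);
  *y = (int16_t)((v + r) >> cos_bit);
}
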
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 000000000..ed82eee96
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#define pair_set_epi16(a, b) \
+ _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
+
+// Reverse the 8 16 bit words in __m128i
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+ const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+ const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+ return _mm_shuffle_epi32(b, 0x4e);
+}
+
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
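
mm_reverse_epi16() reverses the eight words by reversing the four words in each 64-bit half (shuffle constant 0x1b) and then swapping the halves (0x4e). A scalar reference of the intended result (hypothetical helper):

#include <stdint.h>

/* Hypothetical scalar reference for mm_reverse_epi16(): reverse eight words. */
static void reverse_epi16_scalar(const int16_t in[8], int16_t out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = in[7 - i];
}
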
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
new file mode 100644
index 000000000..800aef126
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+
+static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
+ return _mm_add_epi16(_mm256_castsi256_si128(val),
+ _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
+ return _mm_add_epi32(_mm256_castsi256_si128(val),
+ _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i adj_sub = _mm256_set1_epi16(0xff01); // (1,-1)
+
+ // unpack into pairs of source and reference values
+ const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+ const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+ // subtract adjacent elements using src*1 + ref*-1
+ const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+ const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+ const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
+
+ // add to the running totals
+ *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+ *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
+}
+
+static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
+
+ // unpack sse and sum registers and add
+ const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+ // perform the final summation and extract the results
+ const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+ *((int *)sse) = _mm_cvtsi128_si32(res);
+ return _mm_extract_epi32(res, 1);
+}
+
+// handle blocks with at most 512 pixels
+static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+ const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
+ const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
+ return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
+}
+
+// handle 1024 pixels (32x32, 16x64, 64x16)
+static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+ const __m128i vsum_64 =
+ _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+ _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+ return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
+}
+
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+ const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+ const __m256i sum_hi =
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+ return _mm256_add_epi32(sum_lo, sum_hi);
+}
+
+// handle 2048 pixels (32x64, 64x32)
+static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ vsum = sum_to_32bit_avx2(vsum);
+ const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
+ return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
+}
+
+static INLINE void variance16_kernel_avx2(
+ const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+ const int ref_stride, __m256i *const sse, __m256i *const sum) {
+ const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+ const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+ const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+ const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+ const uint8_t *const ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i += 2) {
+ variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src, ref, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+ variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+ variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+ variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
+ variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \
+ unsigned int aom_variance##bw##x##bh##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m256i vsse = _mm256_setzero_si256(); \
+ __m256i vsum; \
+ variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
+ const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
+
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
+AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
+AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
+
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
+AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
+
+#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \
+ unsigned int aom_variance##bw##x##bh##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m256i vsse = _mm256_setzero_si256(); \
+ __m256i vsum = _mm256_setzero_si256(); \
+ for (int i = 0; i < (bh / uh); i++) { \
+ __m256i vsum16; \
+ variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \
+ &vsum16); \
+ vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \
+ src += uh * src_stride; \
+ ref += uh * ref_stride; \
+ } \
+ const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \
+ const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_LOOP_AVX2(64, 64, 12, 32); // 64x32 * ( 64/32)
+AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32)
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16)
+AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16)
+
+unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
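
The variance wrappers above all reduce to the same identity: with N = bw * bh = 2^bits, variance = SSE - SUM^2 / N, where SSE and SUM are the sums of squared and plain differences accumulated by the kernels. A scalar reference of that computation (hypothetical helper, for checking the vector paths):

#include <stdint.h>

/* Hypothetical scalar reference for the variance computed above:
 * sse = sum of squared differences, sum = sum of differences,
 * variance = sse - sum^2 / (w * h), with w * h a power of two. */
static unsigned int variance_scalar(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse_acc = 0;
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sum += d;
      sse_acc += (uint64_t)(d * d);
    }
  }
  *sse = (unsigned int)sse_acc;
  return (unsigned int)(sse_acc - (uint64_t)((sum * sum) / (w * h)));
}
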
+
+unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse);
+
+unsigned int aom_sub_pixel_avg_variance32xh_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
+ int height, unsigned int *sseptr);
+
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+ &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
+ }
+
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
+
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, hf, &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
+ }
+
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
+AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
+
+static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
+ const __m256i d =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+ return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
+}
+
+static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
+ const __m256i d =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+ return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
+}
+
+static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
+ const __m256i a,
+ uint8_t *comp_pred) {
+ const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
+ const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
+
+ const __m256i ma = _mm256_sub_epi8(alpha_max, a);
+
+ const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
+ const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
+ const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
+ const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
+
+ const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
+ const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
+ const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
+ const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
+
+ const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
+ _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
+}
+
+void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask) {
+ int i = 0;
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ if (width == 8) {
+ comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+ mask, mask_stride);
+ } else if (width == 16) {
+ do {
+ const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
+ const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
+ const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
+ const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
+ const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ // comp_pred's stride == width == 16
+ comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+ comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+ comp_pred += (16 << 2);
+ i += 4;
+ } while (i < height);
+ } else { // for width == 32
+ do {
+ const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
+ const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
+ const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
+
+ const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
+ const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
+ const __m256i aB =
+ _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
+
+ comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+ comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+ comp_pred += (32 << 1);
+
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ i += 2;
+ } while (i < height);
+ }
+}
+
+static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
+ const __m256i s1,
+ const __m256i a) {
+ const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
+
+ const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
+ const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
+ const __m256i pred_l = _mm256_srai_epi32(
+ _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
+ const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
+ const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
+ const __m256i pred_h = _mm256_srai_epi32(
+ _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
+
+ return comp;
+}
+
+void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m256i zero = _mm256_setzero_si256();
+
+ if (width == 8) {
+ do {
+ const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
+ const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
+
+ const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));
+
+ __m256i m = _mm256_castsi128_si256(m_l);
+ m = _mm256_insertf128_si256(m, m_h, 1);
+ const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
+
+ _mm_storeu_si128((__m128i *)(comp_pred + width),
+ _mm256_extractf128_si256(comp, 1));
+
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ comp_pred += (width << 1);
+ i += 2;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
+ const __m256i m_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+ _mm256_storeu_si256((__m256i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 32) {
+ do {
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
+
+ const __m256i m01_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+ const __m256i m23_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+ const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+
+ _mm256_storeu_si256((__m256i *)comp_pred, comp);
+ _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ }
+}
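
For reference, every variance kernel above reduces to the block identity VAR = SSE - SUM^2 / N, with N = bw * bh a power of two so that the division becomes the ">> bits" step at the end of each macro. A minimal plain-C sketch of the same arithmetic follows; the helper name and signature are illustrative assumptions, not functions taken from the library.

#include <stdint.h>

// Scalar reference for the block variance identity used by the AVX2
// kernels above: accumulate the signed difference sum and the sum of
// squared differences, then subtract SUM^2 / N (N is a power of two).
static uint32_t variance_ref_c(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int bw, int bh, uint32_t *sse_out) {
  int64_t sum = 0;
  uint64_t sse = 0;
  for (int y = 0; y < bh; ++y) {
    for (int x = 0; x < bw; ++x) {
      const int d = src[x] - ref[x];
      sum += d;
      sse += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse_out = (uint32_t)sse;
  // Same as the ">> bits" step in the macros, since bw * bh is 1 << bits.
  return (uint32_t)(sse - (uint64_t)(((int64_t)sum * sum) / (bw * bh)));
}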
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
new file mode 100644
index 000000000..88e27aef3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -0,0 +1,517 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+/* clang-format off */
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+/* clang-format on */
+
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) \
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+ /* load source and destination */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */                                                      \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// final reduction to the scalar sum and sse
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 0 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, src_stride)
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, src_stride)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+      // load the source and a second source starting
+      // one byte later
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height; i++) {
+ src_avg = src_reg;
+ src += src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+        // average the previous average with the current one
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ // save current source average
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source starting
+      // one byte later
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src += src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ MERGE_WITH_SRC(src_avg, src_reg)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8 bits within each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previous packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src_pack = src_reg;
+ dst += dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source starting
+      // one byte later
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ MERGE_NEXT_SRC(src_reg, 1)
+
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8 bits within each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge the previous packed row with the current one
+ MERGE_WITH_SRC(src_pack, src_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ }
+ }
+ CALC_SUM_AND_SSE
+ _mm256_zeroupper();
+ return sum;
+}
+
+unsigned int aom_sub_pixel_avg_variance32xh_avx2(
+ const uint8_t *src, int src_stride, int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
+ int height, unsigned int *sse) {
+ __m256i sec_reg;
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+ __m256i zero_reg;
+ int i, sum;
+ sum_reg = _mm256_set1_epi16(0);
+ sse_reg = _mm256_set1_epi16(0);
+ zero_reg = _mm256_set1_epi16(0);
+
+ // x_offset = 0 and y_offset = 0
+ if (x_offset == 0) {
+ if (y_offset == 0) {
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec += sec_stride;
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (y_offset == 8) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, src_stride)
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec += sec_stride;
+        // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 0 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg;
+
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, src_stride)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec += sec_stride;
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ // x_offset = 8 and y_offset = 0
+ } else if (x_offset == 8) {
+ if (y_offset == 0) {
+ __m256i src_next_reg;
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ sec += sec_stride;
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = 8 and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i src_next_reg, src_avg;
+      // load the source and a second source starting
+      // one byte later
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src += src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+        // average the previous average with the current one
+ src_avg = _mm256_avg_epu8(src_avg, src_reg);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+ sec += sec_stride;
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ // x_offset = 8 and y_offset = bilin interpolation
+ } else {
+ __m256i filter, pw8, src_next_reg, src_avg;
+ y_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source starting
+      // one byte later
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ AVG_NEXT_SRC(src_reg, 1)
+ for (i = 0; i < height; i++) {
+ // save current source average
+ src_avg = src_reg;
+ src += src_stride;
+ LOAD_SRC_DST
+ AVG_NEXT_SRC(src_reg, 1)
+ MERGE_WITH_SRC(src_avg, src_reg)
+ FILTER_SRC(filter)
+ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+ // expand each byte to 2 bytes
+ MERGE_WITH_SRC(src_avg, zero_reg)
+ sec += sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ }
+ // x_offset = bilin interpolation and y_offset = 0
+ } else {
+ if (y_offset == 0) {
+ __m256i filter, pw8, src_next_reg;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ for (i = 0; i < height; i++) {
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+ MERGE_WITH_SRC(src_reg, zero_reg)
+ sec += sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ src += src_stride;
+ dst += dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = 8
+ } else if (y_offset == 8) {
+ __m256i filter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ filter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ pw8 = _mm256_set1_epi16(8);
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+      // pack the 16-bit results back to 8 bits within each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(filter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average the previous packed row with the current one
+ src_pack = _mm256_avg_epu8(src_pack, src_reg);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ sec += sec_stride;
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ src_pack = src_reg;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ // x_offset = bilin interpolation and y_offset = bilin interpolation
+ } else {
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+ x_offset <<= 5;
+ xfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + x_offset));
+ y_offset <<= 5;
+ yfilter = _mm256_load_si256(
+ (__m256i const *)(bilinear_filters_avx2 + y_offset));
+ pw8 = _mm256_set1_epi16(8);
+      // load the source and a second source starting
+      // one byte later
+ src_reg = _mm256_loadu_si256((__m256i const *)(src));
+ MERGE_NEXT_SRC(src_reg, 1)
+
+ FILTER_SRC(xfilter)
+      // pack the 16-bit results back to 8 bits within each 128-bit lane
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ for (i = 0; i < height; i++) {
+ src += src_stride;
+ LOAD_SRC_DST
+ MERGE_NEXT_SRC(src_reg, 1)
+ FILTER_SRC(xfilter)
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge the previous packed row with the current one
+ MERGE_WITH_SRC(src_pack, src_reg)
+ // filter the source
+ FILTER_SRC(yfilter)
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+ MERGE_WITH_SRC(src_pack, zero_reg)
+ src_pack = src_reg;
+ sec += sec_stride;
+ CALC_SUM_SSE_INSIDE_LOOP
+ dst += dst_stride;
+ }
+ }
+ }
+ CALC_SUM_AND_SSE
+ _mm256_zeroupper();
+ return sum;
+}
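
The sub-pixel paths above never need a general convolution: each row of bilinear_filters_avx2 holds a two-tap byte pair that sums to 16, and FILTER_SRC applies it with a +8 rounding term before a shift by 4. A one-tap plain-C sketch under that assumption; the helper name is illustrative.

#include <stdint.h>

// One bilinear tap as applied by FILTER_SRC above: w1 is the second
// weight from the selected table row, the first weight is 16 - w1, and
// the sum is rounded with +8 before the shift by 4.
static uint8_t bilinear_tap_ref(uint8_t p0, uint8_t p1, int w1) {
  return (uint8_t)((p0 * (16 - w1) + p1 * w1 + 8) >> 4);
}

In the functions above, the x and y cases merely choose p1 one byte or one row away from p0 and select the weight row via offset << 5, with the half-pel case handled by a plain average instead.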
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
new file mode 100644
index 000000000..66b0d7d84
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+ // in computation using _mm_maddubs_epi16.
+ // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
+ const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+ const __m128i r = _mm_set1_epi16(round);
+ const uint8_t f0 = filter[0] >> 1;
+ const uint8_t f1 = filter[1] >> 1;
+ const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+ f0, f1, f0, f1, f0, f1);
+ unsigned int i, j;
+ (void)pixel_step;
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ // load source
+ __m128i source_low = xx_loadl_64(a);
+ __m128i source_hi = xx_loadl_64(a + 1);
+
+ // unpack to:
+ // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+ // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+ __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
+
+ // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+ __m128i res = _mm_maddubs_epi16(source, filters);
+
+ // round
+ res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+ xx_storeu_128(b, res);
+
+ a += 8;
+ b += 8;
+ }
+
+ a += src_pixels_per_line - output_width;
+ }
+ } else {
+ const __m128i shuffle_mask =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ for (i = 0; i < output_height; ++i) {
+ // load source, only first 5 values are meaningful:
+ // { a[0], a[1], a[2], a[3], a[4], xxxx }
+ __m128i source = xx_loadl_64(a);
+
+ // shuffle, up to the first 8 are useful
+ // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+ // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+ __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+ res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+ xx_storel_64(b, res);
+
+ a += src_pixels_per_line;
+ b += output_width;
+ }
+ }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ const int16_t round = (1 << FILTER_BITS) >> 1;
+ const __m128i r = _mm_set1_epi32(round);
+ const __m128i filters =
+ _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+ filter[1], filter[0], filter[1]);
+ const __m128i shuffle_mask =
+ _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+ const __m128i mask =
+ _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ // load source as:
+ // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+ __m128i source1 = xx_loadl_64(a);
+ __m128i source2 = xx_loadl_64(a + pixel_step);
+ __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+ // shuffle source to:
+ // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+ // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+ __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+ // round
+ res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to extract the low 8 bits of each 32-bit result
+ res = _mm_shuffle_epi8(res, mask);
+
+ xx_storel_32(b, res);
+
+ a += 4;
+ b += 4;
+ }
+
+ a += src_pixels_per_line - output_width;
+ }
+}
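
The two routines above vectorize a separable bilinear filter: a horizontal pass into a 16-bit intermediate, then a vertical pass back to 8 bits, each rounding to nearest before its shift. A plain-C sketch of the combined operation, assuming FILTER_BITS is 7 and filter pairs that sum to 1 << FILTER_BITS; the names and the temporary-buffer contract are illustrative only.

#include <stdint.h>

#define FILTER_BITS_REF 7  // assumed value of FILTER_BITS

// Two-pass bilinear filtering in scalar form. tmp must hold (h + 1) * w
// 16-bit values; the horizontal pass reads one extra pixel per row and
// produces one extra row so the vertical pass has a neighbour below.
static void var_filter_bil_ref(const uint8_t *a, int a_stride, uint8_t *out,
                               int w, int h, const uint8_t *fx,
                               const uint8_t *fy, uint16_t *tmp) {
  for (int y = 0; y < h + 1; ++y) {
    for (int x = 0; x < w; ++x)
      tmp[y * w + x] =
          (uint16_t)((a[x] * fx[0] + a[x + 1] * fx[1] +
                      (1 << (FILTER_BITS_REF - 1))) >> FILTER_BITS_REF);
    a += a_stride;
  }
  for (int y = 0; y < h; ++y)
    for (int x = 0; x < w; ++x)
      out[y * w + x] =
          (uint8_t)((tmp[y * w + x] * fy[0] + tmp[(y + 1) * w + x] * fy[1] +
                     (1 << (FILTER_BITS_REF - 1))) >> FILTER_BITS_REF);
}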
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
new file mode 100644
index 000000000..3c37e77c0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+
+unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = xx_loadu_128(src);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src += 8;
+ }
+
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ return _mm_cvtsi128_si32(vsum);
+}
+
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+ const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+ return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
+}
+
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
+ return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
+}
+
+// Accumulate the four 32-bit values in val into one 32-bit number
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+ return _mm_cvtsi128_si32(val);
+}
+
+// Accumulate the eight 16-bit values in sum into four 32-bit numbers
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+ const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+ const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+ return _mm_add_epi32(sum_lo, sum_hi);
+}
+
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i diff = _mm_sub_epi16(src, ref);
+ *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+ *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+// The diff sum of 128 pixels still fits in a 16-bit integer
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+}
+
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+ *sum += (int16_t)_mm_extract_epi16(vsum, 1);
+}
+
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_unpacklo_epi16(vsum, vsum);
+ vsum = _mm_srai_epi32(vsum, 16);
+ *sum = add32x4_sse2(vsum);
+}
+
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = sum_to_32bit_sse2(vsum);
+ *sum = add32x4_sse2(vsum);
+}
+
+static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 256); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; i += 2) {
+ const __m128i s = load4x2_sse2(src, src_stride);
+ const __m128i r = load4x2_sse2(ref, ref_stride);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 128); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+ for (int i = 0; i < h; i++) {
+ const __m128i s = load8_8to16_sse2(src);
+ const __m128i r = load8_8to16_sse2(ref);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance16_kernel_sse2(const uint8_t *const src,
+ const uint8_t *const ref,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i s = _mm_loadu_si128((const __m128i *)src);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+ variance_kernel_sse2(src0, ref0, sse, sum);
+ variance_kernel_sse2(src1, ref1, sse, sum);
+}
+
+static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 64); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src, ref, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 32); // May overflow for larger height.
+ // Don't initialize sse here since it's an accumulation.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+ variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 16); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+ variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+ variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
+ variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 8); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ const int offset0 = j << 5;
+ const int offset1 = offset0 + 16;
+ variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
+ variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
+ }
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m128i vsse = _mm_setzero_si128(); \
+ __m128i vsum; \
+ int sum = 0; \
+ variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
+ variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
+AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
+
+AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
+
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
+AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
+AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
+
+AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
+AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
+
+#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m128i vsse = _mm_setzero_si128(); \
+ __m128i vsum = _mm_setzero_si128(); \
+ for (int i = 0; i < (bh / uh); ++i) { \
+ __m128i vsum16; \
+ variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \
+ &vsum16); \
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \
+ src += (src_stride * uh); \
+ ref += (ref_stride * uh); \
+ } \
+ *sse = add32x4_sse2(vsse); \
+ int sum = add32x4_sse2(vsum); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_LOOP_SSE2(32, 64, 11, 32); // 32x32 * ( 64/32 )
+
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
+AOM_VAR_LOOP_SSE2(64, 32, 11, 16); // 64x16 * ( 32/16 )
+AOM_VAR_LOOP_SSE2(64, 64, 12, 16); // 64x16 * ( 64/16 )
+AOM_VAR_LOOP_SSE2(64, 128, 13, 16); // 64x16 * ( 128/16 )
+
+AOM_VAR_LOOP_SSE2(128, 64, 13, 8); // 128x8 * ( 64/8 )
+AOM_VAR_LOOP_SSE2(128, 128, 14, 8); // 128x8 * ( 128/8 )
+
+unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These declarations are for functions defined in subpel_variance.asm
+#define DECL(w, opt) \
+ int aom_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+DECLS(ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+ &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+
+FNS(sse2);
+FNS(ssse3);
+
+#undef FNS
+#undef FN
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+ int aom_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+DECLS(ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, hf, &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+
+FNS(sse2);
+FNS(ssse3);
+
+#undef FNS
+#undef FN
+
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ uint8_t *comp_pred, int width, int height,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *ref, int ref_stride,
+ int subpel_search) {
+ // expect xd == NULL only in tests
+ if (xd != NULL) {
+ const MB_MODE_INFO *mi = xd->mi[0];
+ const int ref_num = 0;
+ const int is_intrabc = is_intrabc_block(mi);
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+      // Note: This is mostly a copy of the >= 8X8 case in the
+      // build_inter_predictors() function, with some small tweaks.
+
+ // Some assumptions.
+ const int plane = 0;
+
+ // Get pre-requisites.
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ssx = pd->subsampling_x;
+ const int ssy = pd->subsampling_y;
+ assert(ssx == 0 && ssy == 0);
+ const struct buf_2d *const dst_buf = &pd->dst;
+ const struct buf_2d *const pre_buf =
+ is_intrabc ? dst_buf : &pd->pre[ref_num];
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+
+ // Calculate subpel_x/y and x/y_step.
+ const int row_start = 0; // Because ss_y is 0.
+ const int col_start = 0; // Because ss_x is 0.
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
+ int orig_pos_y = pre_y << SUBPEL_BITS;
+ orig_pos_y += mv->row * (1 << (1 - ssy));
+ int orig_pos_x = pre_x << SUBPEL_BITS;
+ orig_pos_x += mv->col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ const uint8_t *const pre =
+ pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+
+ const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+ pos_x & SCALE_SUBPEL_MASK,
+ pos_y & SCALE_SUBPEL_MASK };
+
+ // Get warp types.
+ const WarpedMotionParams *const wm =
+ &xd->global_motion[mi->ref_frame[ref_num]];
+ const int is_global = is_global_mv_block(mi, wm->wmtype);
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global;
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ // Get convolve parameters.
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ const InterpFilters filters =
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+ // Get the inter predictor.
+ const int build_for_obmc = 0;
+ av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
+ &subpel_params, sf, width, height, &conv_params,
+ filters, &warp_types, mi_x >> pd->subsampling_x,
+ mi_y >> pd->subsampling_y, plane, ref_num, mi,
+ build_for_obmc, xd, cm->allow_warped_motion);
+
+ return;
+ }
+ }
+
+ const InterpFilterParams *filter =
+ (subpel_search == 1)
+ ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
+ : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+ int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
+
+ if (!subpel_x_q3 && !subpel_y_q3) {
+ if (width >= 16) {
+ int i;
+ assert(!(width & 15));
+ /*Read 16 pixels one row at a time.*/
+ for (i = 0; i < height; i++) {
+ int j;
+ for (j = 0; j < width; j += 16) {
+ xx_storeu_128(comp_pred, xx_loadu_128(ref));
+ comp_pred += 16;
+ ref += 16;
+ }
+ ref += ref_stride - width;
+ }
+ } else if (width >= 8) {
+ int i;
+ assert(!(width & 7));
+ assert(!(height & 1));
+ /*Read 8 pixels two rows at a time.*/
+ for (i = 0; i < height; i += 2) {
+ __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+ xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
+ comp_pred += 16;
+ ref += 2 * ref_stride;
+ }
+ } else {
+ int i;
+ assert(!(width & 3));
+ assert(!(height & 3));
+ /*Read 4 pixels four rows at a time.*/
+      for (i = 0; i < height; i += 4) {
+ const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+ const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+ const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+ const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+ const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+ _mm_unpacklo_epi32(row2, row3));
+ xx_storeu_128(comp_pred, reg);
+ comp_pred += 16;
+ ref += 4 * ref_stride;
+ }
+ }
+ } else if (!subpel_y_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+ width, height);
+ } else if (!subpel_x_q3) {
+ const int16_t *const kernel =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+ width, height);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t,
+ temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+ const int16_t *const kernel_x =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+ const int16_t *const kernel_y =
+ av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+ const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+ uint8_t *temp_start_horiz =
+ (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
+ uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+ int intermediate_height =
+ (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+ assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+ // TODO(Deepa): Remove the memset below when we have
+ // 4 tap simd for sse2 and ssse3.
+ if (subpel_search == 1) {
+ memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
+ memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
+ }
+ aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+ kernel_x, 16, NULL, -1, width, intermediate_height);
+ aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+ kernel_y, 16, width, height);
+ }
+}
+
+void aom_comp_avg_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, int subpel_search) {
+ int n;
+ int i;
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+ /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+ assert(!(width * height & 15));
+ n = width * height >> 4;
+ for (i = 0; i < n; i++) {
+ __m128i s0 = xx_loadu_128(comp_pred);
+ __m128i p0 = xx_loadu_128(pred);
+ xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
+ comp_pred += 16;
+ pred += 16;
+ }
+}
+
+void aom_comp_mask_upsampled_pred_sse2(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+ int subpel_search) {
+ if (subpel_x_q3 | subpel_y_q3) {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+ subpel_x_q3, subpel_y_q3, ref, ref_stride,
+ subpel_search);
+ ref = comp_pred;
+ ref_stride = width;
+ }
+ aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+ mask_stride, invert_mask);
+}
+
+static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
+ const __m128i s1,
+ const __m128i a) {
+ const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
+
+ const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
+ const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
+ const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
+ const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
+ const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
+
+ return comp;
+}
+
+void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (width == 8) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 32) {
+ do {
+ for (int j = 0; j < 2; j++) {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
+ const __m128i s2 =
+ _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
+ const __m128i s3 =
+ _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ }
+}
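Note on the averaging loop in aom_comp_avg_upsampled_pred_sse2 above: _mm_avg_epu8 computes a rounded unsigned average per byte lane, so the vector loop matches the scalar sketch below. The helper name is illustrative only and not part of the patch.

#include <stdint.h>

/* Scalar reference for the SSE2 averaging step: _mm_avg_epu8 yields
 * (a + b + 1) >> 1 for each unsigned byte. */
static void comp_avg_scalar(uint8_t *comp_pred, const uint8_t *pred, int n) {
  for (int i = 0; i < n; ++i)
    comp_pred[i] = (uint8_t)((comp_pred[i] + pred[i] + 1) >> 1);
}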
diff --git a/third_party/aom/aom_mem/aom_mem.c b/third_party/aom/aom_mem/aom_mem.c
new file mode 100644
index 000000000..e603fc5bf
--- /dev/null
+++ b/third_party/aom/aom_mem/aom_mem.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "include/aom_mem_intrnl.h"
+#include "aom/aom_integer.h"
+
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+// Returns 0 in case of overflow of nmemb * size.
+static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) {
+ const uint64_t total_size = nmemb * size;
+ if (nmemb == 0) return 1;
+ if (size > AOM_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
+ if (total_size != (size_t)total_size) return 0;
+ return 1;
+}
+#endif
+
+static size_t GetAlignedMallocSize(size_t size, size_t align) {
+ return size + align - 1 + ADDRESS_STORAGE_SIZE;
+}
+
+static size_t *GetMallocAddressLocation(void *const mem) {
+ return ((size_t *)mem) - 1;
+}
+
+static void SetActualMallocAddress(void *const mem,
+ const void *const malloc_addr) {
+ size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
+ *malloc_addr_location = (size_t)malloc_addr;
+}
+
+static void *GetActualMallocAddress(void *const mem) {
+ const size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
+ return (void *)(*malloc_addr_location);
+}
+
+void *aom_memalign(size_t align, size_t size) {
+ void *x = NULL;
+ const size_t aligned_size = GetAlignedMallocSize(size, align);
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+ if (!check_size_argument_overflow(1, aligned_size)) return NULL;
+#endif
+ void *const addr = malloc(aligned_size);
+ if (addr) {
+ x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
+ SetActualMallocAddress(x, addr);
+ }
+ return x;
+}
+
+void *aom_malloc(size_t size) { return aom_memalign(DEFAULT_ALIGNMENT, size); }
+
+void *aom_calloc(size_t num, size_t size) {
+ const size_t total_size = num * size;
+ void *const x = aom_malloc(total_size);
+ if (x) memset(x, 0, total_size);
+ return x;
+}
+
+void aom_free(void *memblk) {
+ if (memblk) {
+ void *addr = GetActualMallocAddress(memblk);
+ free(addr);
+ }
+}
+
+void *aom_memset16(void *dest, int val, size_t length) {
+ size_t i;
+ uint16_t *dest16 = (uint16_t *)dest;
+ for (i = 0; i < length; i++) *dest16++ = val;
+ return dest;
+}
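A minimal usage sketch for the allocator above (the 32-byte alignment and buffer size are illustrative only): aom_memalign() over-allocates, stashes the raw malloc() address just before the aligned block, and aom_free() retrieves it again.

#include <stdint.h>
#include <string.h>
#include "aom_mem/aom_mem.h"

int fill_plane(int width, int height) {
  uint8_t *buf = (uint8_t *)aom_memalign(32, (size_t)width * height);
  if (!buf) return -1;                  /* allocation failed or overflowed */
  memset(buf, 0, (size_t)width * height);
  aom_free(buf);                        /* frees the original malloc address */
  return 0;
}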
diff --git a/third_party/aom/aom_mem/aom_mem.cmake b/third_party/aom/aom_mem/aom_mem.cmake
new file mode 100644
index 000000000..eaee8440b
--- /dev/null
+++ b/third_party/aom/aom_mem/aom_mem.cmake
@@ -0,0 +1,26 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_MEM_AOM_MEM_CMAKE_)
+ return()
+endif() # AOM_AOM_MEM_AOM_MEM_CMAKE_
+set(AOM_AOM_MEM_AOM_MEM_CMAKE_ 1)
+
+list(APPEND AOM_MEM_SOURCES "${AOM_ROOT}/aom_mem/aom_mem.c"
+ "${AOM_ROOT}/aom_mem/aom_mem.h"
+ "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
+
+# Creates the aom_mem build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_mem_targets)
+ add_library(aom_mem OBJECT ${AOM_MEM_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_mem PARENT_SCOPE)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_mem>)
+endfunction()
diff --git a/third_party/aom/aom_mem/aom_mem.h b/third_party/aom/aom_mem/aom_mem.h
new file mode 100644
index 000000000..4b1fa45f1
--- /dev/null
+++ b/third_party/aom/aom_mem/aom_mem.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_MEM_AOM_MEM_H_
+#define AOM_AOM_MEM_AOM_MEM_H_
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#if defined(__uClinux__)
+#include <lddk.h>
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef AOM_MAX_ALLOCABLE_MEMORY
+#if SIZE_MAX > (1ULL << 32)
+#define AOM_MAX_ALLOCABLE_MEMORY 8589934592 // 8 GB
+#else
+// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
+#define AOM_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
+#endif
+#endif
+
+void *aom_memalign(size_t align, size_t size);
+void *aom_malloc(size_t size);
+void *aom_calloc(size_t num, size_t size);
+void aom_free(void *memblk);
+void *aom_memset16(void *dest, int val, size_t length);
+
+#include <string.h>
+
+#ifdef AOM_MEM_PLTFRM
+#include AOM_MEM_PLTFRM
+#endif
+
+#if CONFIG_DEBUG
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \
+ do { \
+ lval = (expr); \
+ if (!lval) \
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
+ "Failed to allocate " #lval " at %s:%d", __FILE__, \
+ __LINE__); \
+ } while (0)
+#else
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr) \
+ do { \
+ lval = (expr); \
+ if (!lval) \
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
+ "Failed to allocate " #lval); \
+ } while (0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // AOM_AOM_MEM_AOM_MEM_H_
diff --git a/third_party/aom/aom_mem/include/aom_mem_intrnl.h b/third_party/aom/aom_mem/include/aom_mem_intrnl.h
new file mode 100644
index 000000000..cbc30a9bb
--- /dev/null
+++ b/third_party/aom/aom_mem/include/aom_mem_intrnl.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+#define AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+
+#include "config/aom_config.h"
+
+#define ADDRESS_STORAGE_SIZE sizeof(size_t)
+
+#ifndef DEFAULT_ALIGNMENT
+#if defined(VXWORKS)
+/*default addr alignment to use in calls to aom_* functions other than
+ aom_memalign*/
+#define DEFAULT_ALIGNMENT 32
+#else
+#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */
+#endif
+#endif
+
+/*returns an addr aligned to the byte boundary specified by align*/
+#define align_addr(addr, align) \
+ (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1))
+
+#endif // AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
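For illustration only (and assuming the internal header is on the include path of a configured build), align_addr() above rounds a pointer up to the next multiple of a power-of-two alignment:

#include <assert.h>
#include <stddef.h>
#include "aom_mem/include/aom_mem_intrnl.h"

static void align_addr_example(void) {
  void *p = (void *)(size_t)0x1003;
  /* (0x1003 + 15) & ~15 == 0x1010 */
  assert(align_addr(p, 16) == (void *)(size_t)0x1010);
}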
diff --git a/third_party/aom/aom_ports/aom_once.h b/third_party/aom/aom_ports/aom_once.h
new file mode 100644
index 000000000..4d77aac5a
--- /dev/null
+++ b/third_party/aom/aom_ports/aom_once.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_AOM_ONCE_H_
+#define AOM_AOM_PORTS_AOM_ONCE_H_
+
+#include "config/aom_config.h"
+
+/* Implement a function wrapper to guarantee initialization
+ * thread-safety for library singletons.
+ *
+ * NOTE: This function uses static locks, and can only be
+ * used with one common argument per compilation unit. So
+ *
+ * file1.c:
+ * aom_once(foo);
+ * ...
+ * aom_once(foo);
+ *
+ * file2.c:
+ * aom_once(bar);
+ *
+ * will ensure foo() and bar() are each called only once, but in
+ *
+ * file1.c:
+ * aom_once(foo);
+ * aom_once(bar);
+ *
+ * bar() will never be called because the lock is used up
+ * by the call to foo().
+ */
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+#include <stdlib.h>
+/* Declare a per-compilation-unit state variable to track the progress
+ * of calling func() only once. This must be at global scope because
+ * local initializers are not thread-safe in MSVC prior to Visual
+ * Studio 2015.
+ *
+ * As a static, aom_once_state will be zero-initialized at program start.
+ */
+static LONG aom_once_state;
+static void aom_once(void (*func)(void)) {
+ /* Try to advance aom_once_state from its initial value of 0 to 1.
+ * Only one thread can succeed in doing so.
+ */
+ if (InterlockedCompareExchange(&aom_once_state, 1, 0) == 0) {
+ /* We're the winning thread, having set aom_once_state to 1.
+ * Call our function. */
+ func();
+ /* Now advance aom_once_state to 2, unblocking any other threads. */
+ InterlockedIncrement(&aom_once_state);
+ return;
+ }
+
+ /* We weren't the winning thread, but we want to block on
+ * the state variable so we don't return before func()
+ * has finished executing elsewhere.
+ *
+ * Try to advance aom_once_state from 2 to 2, which is only possible
+ * after the winning thread advances it from 1 to 2.
+ */
+ while (InterlockedCompareExchange(&aom_once_state, 2, 2) != 2) {
+ /* State isn't yet 2. Try again.
+ *
+ * We are used for singleton initialization functions,
+ * which should complete quickly. Contention will likewise
+ * be rare, so it's worthwhile to use a simple but cpu-
+ * intensive busy-wait instead of successive backoff,
+ * waiting on a kernel object, or another heavier-weight scheme.
+ *
+ * We can at least yield our timeslice.
+ */
+ Sleep(0);
+ }
+
+ /* We've seen aom_once_state advance to 2, so we know func()
+ * has been called. And we've left aom_once_state as we found it,
+ * so other threads will have the same experience.
+ *
+ * It's safe to return now.
+ */
+ return;
+}
+
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+static void aom_once(void (*func)(void)) {
+ static int done;
+
+ /* If the initialization is complete, return early. */
+ if (done) return;
+
+ /* Causes all other threads in the process to block themselves
+ * and give up their time slice.
+ */
+ DosEnterCritSec();
+
+ if (!done) {
+ func();
+ done = 1;
+ }
+
+ /* Restores normal thread dispatching for the current process. */
+ DosExitCritSec();
+}
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void aom_once(void (*func)(void)) {
+ static pthread_once_t lock = PTHREAD_ONCE_INIT;
+ pthread_once(&lock, func);
+}
+
+#else
+/* Default version that performs no synchronization. */
+
+static void aom_once(void (*func)(void)) {
+ static int done;
+
+ if (!done) {
+ func();
+ done = 1;
+ }
+}
+#endif
+
+#endif // AOM_AOM_PORTS_AOM_ONCE_H_
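A usage sketch matching the header comment above: one initialization routine per translation unit, invoked through aom_once() before first use. The table and function names are placeholders.

#include "aom_ports/aom_once.h"

static int lut[256];

static void init_lut(void) {
  for (int i = 0; i < 256; ++i) lut[i] = i * i;
}

int lut_lookup(int i) {
  aom_once(init_lut);  /* init_lut() runs exactly once, even across threads */
  return lut[i & 255];
}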
diff --git a/third_party/aom/aom_ports/aom_ports.cmake b/third_party/aom/aom_ports/aom_ports.cmake
new file mode 100644
index 000000000..6272fc0e3
--- /dev/null
+++ b/third_party/aom/aom_ports/aom_ports.cmake
@@ -0,0 +1,81 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_PORTS_AOM_PORTS_CMAKE_)
+ return()
+endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_
+set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1)
+
+list(APPEND AOM_PORTS_INCLUDES
+ "${AOM_ROOT}/aom_ports/aom_once.h"
+ "${AOM_ROOT}/aom_ports/aom_timer.h"
+ "${AOM_ROOT}/aom_ports/bitops.h"
+ "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
+ "${AOM_ROOT}/aom_ports/mem.h"
+ "${AOM_ROOT}/aom_ports/mem_ops.h"
+ "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
+ "${AOM_ROOT}/aom_ports/msvc.h"
+ "${AOM_ROOT}/aom_ports/sanitizer.h"
+ "${AOM_ROOT}/aom_ports/system_state.h")
+
+list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/emms.asm")
+
+list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
+
+list(APPEND AOM_PORTS_SOURCES_ARM "${AOM_ROOT}/aom_ports/arm.h"
+ "${AOM_ROOT}/aom_ports/arm_cpudetect.c")
+
+list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
+ "${AOM_ROOT}/aom_ports/ppc_cpudetect.c")
+
+# For arm and x86 targets:
+#
+# * Creates the aom_ports build target, adds the includes in aom_ports to the
+# target, and makes libaom depend on it.
+#
+# Otherwise:
+#
+# * Adds the includes in aom_ports to the libaom target.
+#
+# For all target platforms:
+#
+# * The libaom target must exist before this function is called.
+function(setup_aom_ports_targets)
+ if("${AOM_TARGET_CPU}" MATCHES "^x86")
+ add_asm_library("aom_ports" "AOM_PORTS_ASM_X86" "aom")
+ set(aom_ports_has_symbols 1)
+ elseif("${AOM_TARGET_CPU}" MATCHES "arm")
+ add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_ARM})
+ set(aom_ports_has_symbols 1)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
+ elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
+ add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
+ set(aom_ports_has_symbols 1)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
+ endif()
+
+ if(aom_ports_has_symbols)
+ target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
+
+ if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL
+ "x86_64")
+ target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES_X86})
+ endif()
+
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+ else()
+ target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES})
+
+ if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL
+ "x86_64")
+ target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES_X86})
+ endif()
+ endif()
+endfunction()
diff --git a/third_party/aom/aom_ports/aom_timer.h b/third_party/aom/aom_ports/aom_timer.h
new file mode 100644
index 000000000..9b17b8983
--- /dev/null
+++ b/third_party/aom/aom_ports/aom_timer.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_AOM_TIMER_H_
+#define AOM_AOM_PORTS_AOM_TIMER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#if CONFIG_OS_SUPPORT
+
+#if defined(_WIN32)
+/*
+ * Win32 specific includes
+ */
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#else
+/*
+ * POSIX specific includes
+ */
+#include <sys/time.h>
+
+/* timersub is not provided by msys at this time. */
+#ifndef timersub
+#define timersub(a, b, result) \
+ do { \
+ (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \
+ (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+ if ((result)->tv_usec < 0) { \
+ --(result)->tv_sec; \
+ (result)->tv_usec += 1000000; \
+ } \
+ } while (0)
+#endif
+#endif
+
+struct aom_usec_timer {
+#if defined(_WIN32)
+ LARGE_INTEGER begin, end;
+#else
+ struct timeval begin, end;
+#endif
+};
+
+static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+ QueryPerformanceCounter(&t->begin);
+#else
+ gettimeofday(&t->begin, NULL);
+#endif
+}
+
+static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+ QueryPerformanceCounter(&t->end);
+#else
+ gettimeofday(&t->end, NULL);
+#endif
+}
+
+static INLINE int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+ LARGE_INTEGER freq, diff;
+
+ diff.QuadPart = t->end.QuadPart - t->begin.QuadPart;
+
+ QueryPerformanceFrequency(&freq);
+ return diff.QuadPart * 1000000 / freq.QuadPart;
+#else
+ struct timeval diff;
+
+ timersub(&t->end, &t->begin, &diff);
+ return ((int64_t)diff.tv_sec) * 1000000 + diff.tv_usec;
+#endif
+}
+
+#else /* CONFIG_OS_SUPPORT = 0 */
+
+/* Empty timer functions if CONFIG_OS_SUPPORT = 0 */
+#ifndef timersub
+#define timersub(a, b, result)
+#endif
+
+struct aom_usec_timer {
+ void *dummy;
+};
+
+static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { (void)t; }
+
+static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { (void)t; }
+
+static INLINE int aom_usec_timer_elapsed(struct aom_usec_timer *t) {
+ (void)t;
+ return 0;
+}
+
+#endif /* CONFIG_OS_SUPPORT */
+
+#endif // AOM_AOM_PORTS_AOM_TIMER_H_
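A small sketch of timing a callback with the microsecond timer declared above; the cast covers the plain int return type used when CONFIG_OS_SUPPORT is 0.

#include <stdio.h>
#include "aom_ports/aom_timer.h"

void time_work(void (*work)(void)) {
  struct aom_usec_timer timer;
  aom_usec_timer_start(&timer);
  work();
  aom_usec_timer_mark(&timer);
  printf("elapsed: %lld us\n", (long long)aom_usec_timer_elapsed(&timer));
}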
diff --git a/third_party/aom/aom_ports/arm.h b/third_party/aom/aom_ports/arm.h
new file mode 100644
index 000000000..cb1fb9bec
--- /dev/null
+++ b/third_party/aom/aom_ports/arm.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_ARM_H_
+#define AOM_AOM_PORTS_ARM_H_
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*ARMv5TE "Enhanced DSP" instructions.*/
+#define HAS_EDSP 0x01
+/*ARMv6 "Parallel" or "Media" instructions.*/
+#define HAS_MEDIA 0x02
+/*ARMv7 optional NEON instructions.*/
+#define HAS_NEON 0x04
+
+int aom_arm_cpu_caps(void);
+
+// Earlier gcc compilers have issues with some neon intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 4 && \
+ __GNUC_MINOR__ <= 6
+#define AOM_INCOMPATIBLE_GCC
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_PORTS_ARM_H_
diff --git a/third_party/aom/aom_ports/arm_cpudetect.c b/third_party/aom/aom_ports/arm_cpudetect.c
new file mode 100644
index 000000000..5a75bb348
--- /dev/null
+++ b/third_party/aom/aom_ports/arm_cpudetect.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "aom_ports/arm.h"
+#include "config/aom_config.h"
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+static int arm_cpu_env_flags(int *flags) {
+ char *env;
+ env = getenv("AOM_SIMD_CAPS");
+ if (env && *env) {
+ *flags = (int)strtol(env, NULL, 0);
+ return 0;
+ }
+ *flags = 0;
+ return -1;
+}
+
+static int arm_cpu_env_mask(void) {
+ char *env;
+ env = getenv("AOM_SIMD_CAPS_MASK");
+ return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+int aom_arm_cpu_caps(void) {
+ /* This function should actually be a no-op. There is no way to adjust any of
+ * these because the RTCD tables do not exist: the functions are called
+ * statically */
+ int flags;
+ int mask;
+ if (!arm_cpu_env_flags(&flags)) {
+ return flags;
+ }
+ mask = arm_cpu_env_mask();
+#if HAVE_NEON
+ flags |= HAS_NEON;
+#endif /* HAVE_NEON */
+ return flags & mask;
+}
+
+#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+#define WIN32_LEAN_AND_MEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+
+int aom_arm_cpu_caps(void) {
+ int flags;
+ int mask;
+ if (!arm_cpu_env_flags(&flags)) {
+ return flags;
+ }
+ mask = arm_cpu_env_mask();
+/* MSVC has no inline __asm support for ARM, but it does let you __emit
+ * instructions via their assembled hex code.
+ * All of these instructions should be essentially nops.
+ */
+#if HAVE_NEON
+ if (mask & HAS_NEON) {
+ __try {
+ /*VORR q0,q0,q0*/
+ __emit(0xF2200150);
+ flags |= HAS_NEON;
+ } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+ /*Ignore exception.*/
+ }
+ }
+#endif /* HAVE_NEON */
+ return flags & mask;
+}
+
+#elif defined(__ANDROID__) /* end _MSC_VER */
+#include <cpu-features.h>
+
+int aom_arm_cpu_caps(void) {
+ int flags;
+ int mask;
+ uint64_t features;
+ if (!arm_cpu_env_flags(&flags)) {
+ return flags;
+ }
+ mask = arm_cpu_env_mask();
+ features = android_getCpuFeatures();
+
+#if HAVE_NEON
+ if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
+#endif /* HAVE_NEON */
+ return flags & mask;
+}
+
+#elif defined(__linux__) /* end __ANDROID__ */
+
+#include <stdio.h>
+
+int aom_arm_cpu_caps(void) {
+ FILE *fin;
+ int flags;
+ int mask;
+ if (!arm_cpu_env_flags(&flags)) {
+ return flags;
+ }
+ mask = arm_cpu_env_mask();
+ /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
+ * on Android.
+ * This also means that detection will fail in Scratchbox.
+ */
+ fin = fopen("/proc/cpuinfo", "r");
+ if (fin != NULL) {
+ /* 512 should be enough for anybody (it's even enough for all the flags
+ * that x86 has accumulated... so far).
+ */
+ char buf[512];
+ while (fgets(buf, 511, fin) != NULL) {
+#if HAVE_NEON
+ if (memcmp(buf, "Features", 8) == 0) {
+ char *p;
+ p = strstr(buf, " neon");
+ if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
+ flags |= HAS_NEON;
+ }
+ }
+#endif /* HAVE_NEON */
+ }
+ fclose(fin);
+ }
+ return flags & mask;
+}
+#else /* end __linux__ */
+#error \
+ "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
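The capability bits from aom_arm_cpu_caps() are normally consumed by the RTCD tables, but they can also be queried directly; a trivial sketch (NEON only, since that is the sole flag probed above):

#include <stdio.h>
#include "aom_ports/arm.h"

int main(void) {
  const int caps = aom_arm_cpu_caps();
  printf("NEON available: %s\n", (caps & HAS_NEON) ? "yes" : "no");
  return 0;
}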
diff --git a/third_party/aom/aom_ports/bitops.h b/third_party/aom/aom_ports/bitops.h
new file mode 100644
index 000000000..44df17307
--- /dev/null
+++ b/third_party/aom/aom_ports/bitops.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_BITOPS_H_
+#define AOM_AOM_PORTS_BITOPS_H_
+
+#include <assert.h>
+
+#include "aom_ports/msvc.h"
+#include "config/aom_config.h"
+
+#ifdef _MSC_VER
+#if defined(_M_X64) || defined(_M_IX86)
+#include <intrin.h>
+#define USE_MSC_INTRINSICS
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// get_msb:
+// Returns (int)floor(log2(n)). n must be > 0.
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+ ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_msb(unsigned int n) {
+ assert(n != 0);
+ return 31 ^ __builtin_clz(n);
+}
+#elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanReverse)
+
+static INLINE int get_msb(unsigned int n) {
+ unsigned long first_set_bit;
+ assert(n != 0);
+ _BitScanReverse(&first_set_bit, n);
+ return first_set_bit;
+}
+#undef USE_MSC_INTRINSICS
+#else
+static INLINE int get_msb(unsigned int n) {
+ int log = 0;
+ unsigned int value = n;
+ int i;
+
+ assert(n != 0);
+
+ for (i = 4; i >= 0; --i) {
+ const int shift = (1 << i);
+ const unsigned int x = value >> shift;
+ if (x != 0) {
+ value = x;
+ log += shift;
+ }
+ }
+ return log;
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_PORTS_BITOPS_H_
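A few spot checks for get_msb() as documented above (floor(log2(n)) for n > 0); all three implementations agree on these values.

#include <assert.h>
#include "aom_ports/bitops.h"

static void get_msb_examples(void) {
  assert(get_msb(1) == 0);
  assert(get_msb(2) == 1);
  assert(get_msb(255) == 7);
  assert(get_msb(256) == 8);
  assert(get_msb(0x80000000u) == 31);
}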
diff --git a/third_party/aom/aom_ports/emmintrin_compat.h b/third_party/aom/aom_ports/emmintrin_compat.h
new file mode 100644
index 000000000..85d218a3d
--- /dev/null
+++ b/third_party/aom/aom_ports/emmintrin_compat.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
+#define AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
+
+#if defined(__GNUC__) && __GNUC__ < 4
+/* From emmintrin.h (gcc 4.5.3) */
+/* Casts between various SP, DP, INT vector types. Note that these do no
+ conversion of values, they just change the type. */
+extern __inline __m128
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castpd_ps(__m128d __A) {
+ return (__m128)__A;
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castpd_si128(__m128d __A) {
+ return (__m128i)__A;
+}
+
+extern __inline __m128d
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castps_pd(__m128 __A) {
+ return (__m128d)__A;
+}
+
+extern __inline __m128i
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castps_si128(__m128 __A) {
+ return (__m128i)__A;
+}
+
+extern __inline __m128
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castsi128_ps(__m128i __A) {
+ return (__m128)__A;
+}
+
+extern __inline __m128d
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mm_castsi128_pd(__m128i __A) {
+ return (__m128d)__A;
+}
+#endif
+
+#endif // AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_
diff --git a/third_party/aom/aom_ports/emms.asm b/third_party/aom/aom_ports/emms.asm
new file mode 100644
index 000000000..90776bacb
--- /dev/null
+++ b/third_party/aom/aom_ports/emms.asm
@@ -0,0 +1,41 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+section .text
+global sym(aom_reset_mmx_state) PRIVATE
+sym(aom_reset_mmx_state):
+ emms
+ ret
+
+
+%if LIBAOM_YASM_WIN64
+global sym(aom_winx64_fldcw) PRIVATE
+sym(aom_winx64_fldcw):
+ sub rsp, 8
+ mov [rsp], rcx ; win x64 specific
+ fldcw [rsp]
+ add rsp, 8
+ ret
+
+
+global sym(aom_winx64_fstcw) PRIVATE
+sym(aom_winx64_fstcw):
+ sub rsp, 8
+ fstcw [rsp]
+ mov rax, [rsp]
+ add rsp, 8
+ ret
+%endif
diff --git a/third_party/aom/aom_ports/mem.h b/third_party/aom/aom_ports/mem.h
new file mode 100644
index 000000000..3ffea3cd6
--- /dev/null
+++ b/third_party/aom/aom_ports/mem.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MEM_H_
+#define AOM_AOM_PORTS_MEM_H_
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C)
+#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
+#else
+#warning No alignment directives known for this compiler.
+#define DECLARE_ALIGNED(n, typ, val) typ val
+#endif
+
+/* Indicates that the usage of the specified variable has been audited to ensure
+ * that it's safe to use uninitialized. Silences 'may be used uninitialized'
+ * warnings on gcc.
+ */
+#if defined(__GNUC__) && __GNUC__
+#define UNINITIALIZED_IS_SAFE(x) x = x
+#else
+#define UNINITIALIZED_IS_SAFE(x) x
+#endif
+
+#if HAVE_NEON && defined(_MSC_VER)
+#define __builtin_prefetch(x)
+#endif
+
+/* Shift down with rounding for use when n >= 0, value >= 0 */
+#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))
+
+/* Shift down with rounding for signed integers, for use when n >= 0 */
+#define ROUND_POWER_OF_TWO_SIGNED(value, n) \
+ (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
+ : ROUND_POWER_OF_TWO((value), (n)))
+
+/* Shift down with rounding for use when n >= 0, value >= 0 for (64 bit) */
+#define ROUND_POWER_OF_TWO_64(value, n) \
+ (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n))
+/* Shift down with rounding for signed integers, for use when n >= 0 (64 bit) */
+#define ROUND_POWER_OF_TWO_SIGNED_64(value, n) \
+ (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \
+ : ROUND_POWER_OF_TWO_64((value), (n)))
+
+/* shift right or left depending on sign of n */
+#define RIGHT_SIGNED_SHIFT(value, n) \
+ ((n) < 0 ? ((value) << (-(n))) : ((value) >> (n)))
+
+#define ALIGN_POWER_OF_TWO(value, n) \
+ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
+
+#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y))
+
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
+
+#endif // AOM_AOM_PORTS_MEM_H_
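Worked values for the rounding and alignment macros above (arguments chosen purely for illustration):

#include <assert.h>
#include "aom_ports/mem.h"

static void mem_macro_examples(void) {
  assert(ROUND_POWER_OF_TWO(9, 2) == 2);            /* (9 + 2) >> 2  */
  assert(ROUND_POWER_OF_TWO(10, 2) == 3);           /* (10 + 2) >> 2 */
  assert(ROUND_POWER_OF_TWO_SIGNED(-10, 2) == -3);
  assert(ALIGN_POWER_OF_TWO(33, 5) == 64);          /* next multiple of 32 */
  assert(DIVIDE_AND_ROUND(7, 2) == 4);              /* (7 + 1) / 2   */
}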
diff --git a/third_party/aom/aom_ports/mem_ops.h b/third_party/aom/aom_ports/mem_ops.h
new file mode 100644
index 000000000..2b5bc0f0f
--- /dev/null
+++ b/third_party/aom/aom_ports/mem_ops.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MEM_OPS_H_
+#define AOM_AOM_PORTS_MEM_OPS_H_
+
+/* \file
+ * \brief Provides portable memory access primitives
+ *
+ * This header provides portable primitives for getting and setting
+ * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations
+ * can be performed on unaligned data regardless of hardware support for
+ * unaligned accesses.
+ *
+ * The type used to pass the integral values may be changed by defining
+ * MEM_VALUE_T with the appropriate type. The type given must be an integral
+ * numeric type.
+ *
+ * The actual functions instantiated have the MEM_VALUE_T type name pasted
+ * on to the symbol name. This allows the developer to instantiate these
+ * operations for multiple types within the same translation unit. This is
+ * of somewhat questionable utility, but the capability exists nonetheless.
+ * Users not making use of this functionality should call the functions
+ * without the type name appended, and the preprocessor will take care of
+ * it.
+ *
+ * NOTE: This code is not supported on platforms where char > 1 octet at the moment.
+ */
+
+#ifndef MAU_T
+/* Minimum Access Unit for this target */
+#define MAU_T unsigned char
+#endif
+
+#ifndef MEM_VALUE_T
+#define MEM_VALUE_T int
+#endif
+
+#undef MEM_VALUE_T_SZ_BITS
+#define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3)
+
+#undef mem_ops_wrap_symbol
+#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T)
+#undef mem_ops_wrap_symbol2
+#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ)
+#undef mem_ops_wrap_symbol3
+#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ
+
+/*
+ * Include aligned access routines
+ */
+#define INCLUDED_BY_MEM_OPS_H
+#include "mem_ops_aligned.h"
+#undef INCLUDED_BY_MEM_OPS_H
+
+#undef mem_get_be16
+#define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16)
+static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) {
+ unsigned MEM_VALUE_T val;
+ const MAU_T *mem = (const MAU_T *)vmem;
+
+ val = mem[0] << 8;
+ val |= mem[1];
+ return val;
+}
+
+#undef mem_get_be24
+#define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24)
+static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) {
+ unsigned MEM_VALUE_T val;
+ const MAU_T *mem = (const MAU_T *)vmem;
+
+ val = mem[0] << 16;
+ val |= mem[1] << 8;
+ val |= mem[2];
+ return val;
+}
+
+#undef mem_get_be32
+#define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32)
+static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) {
+ unsigned MEM_VALUE_T val;
+ const MAU_T *mem = (const MAU_T *)vmem;
+
+ val = ((unsigned MEM_VALUE_T)mem[0]) << 24;
+ val |= mem[1] << 16;
+ val |= mem[2] << 8;
+ val |= mem[3];
+ return val;
+}
+
+#undef mem_get_le16
+#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16)
+static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) {
+ unsigned MEM_VALUE_T val;
+ const MAU_T *mem = (const MAU_T *)vmem;
+
+ val = mem[1] << 8;
+ val |= mem[0];
+ return val;
+}
+
+#undef mem_get_le24
+#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24)
+static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) {
+ unsigned MEM_VALUE_T val;
+ const MAU_T *mem = (const MAU_T *)vmem;
+
+ val = mem[2] << 16;
+ val |= mem[1] << 8;
+ val |= mem[0];
+ return val;
+}
+
+#undef mem_get_le32
+#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32)
+static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) {
+ unsigned MEM_VALUE_T val;
+ const MAU_T *mem = (const MAU_T *)vmem;
+
+ val = ((unsigned MEM_VALUE_T)mem[3]) << 24;
+ val |= mem[2] << 16;
+ val |= mem[1] << 8;
+ val |= mem[0];
+ return val;
+}
+
+#define mem_get_s_generic(end, sz) \
+ static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \
+ const MAU_T *mem = (const MAU_T *)vmem; \
+ signed MEM_VALUE_T val = mem_get_##end##sz(mem); \
+ return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz); \
+ }
+
+/* clang-format off */
+#undef mem_get_sbe16
+#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16)
+mem_get_s_generic(be, 16)
+
+#undef mem_get_sbe24
+#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24)
+mem_get_s_generic(be, 24)
+
+#undef mem_get_sbe32
+#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32)
+mem_get_s_generic(be, 32)
+
+#undef mem_get_sle16
+#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16)
+mem_get_s_generic(le, 16)
+
+#undef mem_get_sle24
+#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24)
+mem_get_s_generic(le, 24)
+
+#undef mem_get_sle32
+#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32)
+mem_get_s_generic(le, 32)
+
+#undef mem_put_be16
+#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16)
+static AOM_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) {
+ MAU_T *mem = (MAU_T *)vmem;
+
+ mem[0] = (MAU_T)((val >> 8) & 0xff);
+ mem[1] = (MAU_T)((val >> 0) & 0xff);
+}
+
+#undef mem_put_be24
+#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24)
+static AOM_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) {
+ MAU_T *mem = (MAU_T *)vmem;
+
+ mem[0] = (MAU_T)((val >> 16) & 0xff);
+ mem[1] = (MAU_T)((val >> 8) & 0xff);
+ mem[2] = (MAU_T)((val >> 0) & 0xff);
+}
+
+#undef mem_put_be32
+#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32)
+static AOM_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) {
+ MAU_T *mem = (MAU_T *)vmem;
+
+ mem[0] = (MAU_T)((val >> 24) & 0xff);
+ mem[1] = (MAU_T)((val >> 16) & 0xff);
+ mem[2] = (MAU_T)((val >> 8) & 0xff);
+ mem[3] = (MAU_T)((val >> 0) & 0xff);
+}
+
+#undef mem_put_le16
+#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16)
+static AOM_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) {
+ MAU_T *mem = (MAU_T *)vmem;
+
+ mem[0] = (MAU_T)((val >> 0) & 0xff);
+ mem[1] = (MAU_T)((val >> 8) & 0xff);
+}
+
+#undef mem_put_le24
+#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24)
+static AOM_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) {
+ MAU_T *mem = (MAU_T *)vmem;
+
+ mem[0] = (MAU_T)((val >> 0) & 0xff);
+ mem[1] = (MAU_T)((val >> 8) & 0xff);
+ mem[2] = (MAU_T)((val >> 16) & 0xff);
+}
+
+#undef mem_put_le32
+#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32)
+static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) {
+ MAU_T *mem = (MAU_T *)vmem;
+
+ mem[0] = (MAU_T)((val >> 0) & 0xff);
+ mem[1] = (MAU_T)((val >> 8) & 0xff);
+ mem[2] = (MAU_T)((val >> 16) & 0xff);
+ mem[3] = (MAU_T)((val >> 24) & 0xff);
+}
+/* clang-format on */
+#endif // AOM_AOM_PORTS_MEM_OPS_H_
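Round-trip sketch for the unaligned get/put helpers above, using the default MEM_VALUE_T of int:

#include <assert.h>
#include "aom_ports/mem_ops.h"

static void mem_ops_examples(void) {
  unsigned char buf[4];
  mem_put_be32(buf, 0x01020304);
  assert(buf[0] == 0x01 && buf[3] == 0x04);   /* big-endian byte order */
  assert(mem_get_be32(buf) == 0x01020304);
  mem_put_le16(buf, 0xBEEF);
  assert(buf[0] == 0xEF && buf[1] == 0xBE);   /* little-endian byte order */
  assert(mem_get_le16(buf) == 0xBEEF);
}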
diff --git a/third_party/aom/aom_ports/mem_ops_aligned.h b/third_party/aom/aom_ports/mem_ops_aligned.h
new file mode 100644
index 000000000..37c367531
--- /dev/null
+++ b/third_party/aom/aom_ports/mem_ops_aligned.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
+#define AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
+
+#include "aom/aom_integer.h"
+
+/* \file
+ * \brief Provides portable memory access primitives for operating on aligned
+ * data
+ *
+ * This file is split from mem_ops.h for easier maintenance. See mem_ops.h
+ * for a more detailed description of these primitives.
+ */
+#ifndef INCLUDED_BY_MEM_OPS_H
+#error Include mem_ops.h, not mem_ops_aligned.h directly.
+#endif
+
+/* Architectures that provide instructions for doing this byte swapping
+ * could redefine these macros.
+ */
+#define swap_endian_16(val, raw) \
+ do { \
+ val = (uint16_t)(((raw >> 8) & 0x00ff) | ((raw << 8) & 0xff00)); \
+ } while (0)
+#define swap_endian_32(val, raw) \
+ do { \
+ val = ((raw >> 24) & 0x000000ff) | ((raw >> 8) & 0x0000ff00) | \
+ ((raw << 8) & 0x00ff0000) | ((raw << 24) & 0xff000000); \
+ } while (0)
+#define swap_endian_16_se(val, raw) \
+ do { \
+ swap_endian_16(val, raw); \
+ val = ((val << 16) >> 16); \
+ } while (0)
+#define swap_endian_32_se(val, raw) swap_endian_32(val, raw)
+
+#define mem_get_ne_aligned_generic(end, sz) \
+ static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \
+ const void *vmem) { \
+ const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \
+ return *mem; \
+ }
+
+#define mem_get_sne_aligned_generic(end, sz) \
+ static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \
+ const void *vmem) { \
+ const int##sz##_t *mem = (const int##sz##_t *)vmem; \
+ return *mem; \
+ }
+
+#define mem_get_se_aligned_generic(end, sz) \
+ static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \
+ const void *vmem) { \
+ const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \
+ unsigned MEM_VALUE_T val, raw = *mem; \
+ swap_endian_##sz(val, raw); \
+ return val; \
+ }
+
+#define mem_get_sse_aligned_generic(end, sz) \
+ static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \
+ const void *vmem) { \
+ const int##sz##_t *mem = (const int##sz##_t *)vmem; \
+ unsigned MEM_VALUE_T val, raw = *mem; \
+ swap_endian_##sz##_se(val, raw); \
+ return val; \
+ }
+
+#define mem_put_ne_aligned_generic(end, sz) \
+ static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \
+ MEM_VALUE_T val) { \
+ uint##sz##_t *mem = (uint##sz##_t *)vmem; \
+ *mem = (uint##sz##_t)val; \
+ }
+
+#define mem_put_se_aligned_generic(end, sz) \
+ static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \
+ MEM_VALUE_T val) { \
+ uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \
+ swap_endian_##sz(raw, val); \
+ *mem = (uint##sz##_t)raw; \
+ }
+
+#include "config/aom_config.h"
+
+#if CONFIG_BIG_ENDIAN
+#define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be, sz)
+#define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be, sz)
+#define mem_get_le_aligned_generic(sz) mem_get_se_aligned_generic(le, sz)
+#define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le, sz)
+#define mem_put_be_aligned_generic(sz) mem_put_ne_aligned_generic(be, sz)
+#define mem_put_le_aligned_generic(sz) mem_put_se_aligned_generic(le, sz)
+#else
+#define mem_get_be_aligned_generic(sz) mem_get_se_aligned_generic(be, sz)
+#define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be, sz)
+#define mem_get_le_aligned_generic(sz) mem_get_ne_aligned_generic(le, sz)
+#define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le, sz)
+#define mem_put_be_aligned_generic(sz) mem_put_se_aligned_generic(be, sz)
+#define mem_put_le_aligned_generic(sz) mem_put_ne_aligned_generic(le, sz)
+#endif
+
+/* clang-format off */
+#undef mem_get_be16_aligned
+#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned)
+mem_get_be_aligned_generic(16)
+
+#undef mem_get_be32_aligned
+#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned)
+mem_get_be_aligned_generic(32)
+
+#undef mem_get_le16_aligned
+#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned)
+mem_get_le_aligned_generic(16)
+
+#undef mem_get_le32_aligned
+#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned)
+mem_get_le_aligned_generic(32)
+
+#undef mem_get_sbe16_aligned
+#define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned)
+mem_get_sbe_aligned_generic(16)
+
+#undef mem_get_sbe32_aligned
+#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned)
+mem_get_sbe_aligned_generic(32)
+
+#undef mem_get_sle16_aligned
+#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned)
+mem_get_sle_aligned_generic(16)
+
+#undef mem_get_sle32_aligned
+#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned)
+mem_get_sle_aligned_generic(32)
+
+#undef mem_put_be16_aligned
+#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned)
+mem_put_be_aligned_generic(16)
+
+#undef mem_put_be32_aligned
+#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned)
+mem_put_be_aligned_generic(32)
+
+#undef mem_put_le16_aligned
+#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned)
+mem_put_le_aligned_generic(16)
+
+#undef mem_put_le32_aligned
+#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned)
+mem_put_le_aligned_generic(32)
+
+#undef mem_get_ne_aligned_generic
+#undef mem_get_se_aligned_generic
+#undef mem_get_sne_aligned_generic
+#undef mem_get_sse_aligned_generic
+#undef mem_put_ne_aligned_generic
+#undef mem_put_se_aligned_generic
+#undef swap_endian_16
+#undef swap_endian_32
+#undef swap_endian_16_se
+#undef swap_endian_32_se
+/* clang-format on */
+
+#endif // AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
diff --git a/third_party/aom/aom_ports/msvc.h b/third_party/aom/aom_ports/msvc.h
new file mode 100644
index 000000000..e78e605f2
--- /dev/null
+++ b/third_party/aom/aom_ports/msvc.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MSVC_H_
+#define AOM_AOM_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "config/aom_config.h"
+
+#if _MSC_VER < 1900 // VS2015 provides snprintf
+#define snprintf _snprintf
+#endif // _MSC_VER < 1900
+
+#if _MSC_VER < 1800 // VS2013 provides round
+#include <math.h>
+static INLINE double round(double x) {
+ if (x < 0)
+ return ceil(x - 0.5);
+ else
+ return floor(x + 0.5);
+}
+
+static INLINE float roundf(float x) {
+ if (x < 0)
+ return (float)ceil(x - 0.5f);
+ else
+ return (float)floor(x + 0.5f);
+}
+
+static INLINE long lroundf(float x) {
+ if (x < 0)
+ return (long)(x - 0.5f);
+ else
+ return (long)(x + 0.5f);
+}
+#endif // _MSC_VER < 1800
+
+#if HAVE_AVX
+#include <immintrin.h>
+// Note:
+// The _mm256_insert_epi16 intrinsic is available from VS2017.
+// We define this macro for VS2015 and earlier. The
+// intrinsics used here are documented for VS2015:
+// https://msdn.microsoft.com/en-us/library/hh977022.aspx
+// Input parameters:
+// a: __m256i,
+// d: int16_t,
+// indx: imm8 (0 - 15)
+#if _MSC_VER <= 1900
+#define _mm256_insert_epi16(a, d, indx) \
+ _mm256_insertf128_si256( \
+ a, \
+ _mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \
+ indx >> 3)
+
+static INLINE int _mm256_extract_epi32(__m256i a, const int i) {
+ return a.m256i_i32[i & 7];
+}
+static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) {
+ __m256i c = a;
+ c.m256i_i32[i & 7] = b;
+ return c;
+}
+#endif // _MSC_VER <= 1900
+#endif // HAVE_AVX
+#endif // _MSC_VER
+#endif // AOM_AOM_PORTS_MSVC_H_
diff --git a/third_party/aom/aom_ports/ppc.h b/third_party/aom/aom_ports/ppc.h
new file mode 100644
index 000000000..3159bda68
--- /dev/null
+++ b/third_party/aom/aom_ports/ppc.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_PPC_H_
+#define AOM_AOM_PORTS_PPC_H_
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_VSX 0x01
+
+int ppc_simd_caps(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_PORTS_PPC_H_
diff --git a/third_party/aom/aom_ports/ppc_cpudetect.c b/third_party/aom/aom_ports/ppc_cpudetect.c
new file mode 100644
index 000000000..82b4f58cc
--- /dev/null
+++ b/third_party/aom/aom_ports/ppc_cpudetect.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h> /* getenv(), strtol() */
+#include <asm/cputable.h>
+#include <linux/auxvec.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/ppc.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+static int cpu_env_flags(int *flags) {
+ char *env;
+ env = getenv("AOM_SIMD_CAPS");
+ if (env && *env) {
+ *flags = (int)strtol(env, NULL, 0);
+ return 0;
+ }
+ *flags = 0;
+ return -1;
+}
+
+static int cpu_env_mask(void) {
+ char *env;
+ env = getenv("AOM_SIMD_CAPS_MASK");
+ return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+int ppc_simd_caps(void) {
+ int flags;
+ int mask;
+ int fd;
+ ssize_t count;
+ unsigned int i;
+ uint64_t buf[64];
+
+ // If AOM_SIMD_CAPS is set then allow only those capabilities.
+ if (!cpu_env_flags(&flags)) {
+ return flags;
+ }
+
+ mask = cpu_env_mask();
+
+ fd = open("/proc/self/auxv", O_RDONLY);
+ if (fd < 0) {
+ return 0;
+ }
+
+ while ((count = read(fd, buf, sizeof(buf))) > 0) {
+ for (i = 0; i < (count / sizeof(*buf)); i += 2) {
+ if (buf[i] == AT_HWCAP) {
+#if HAVE_VSX
+ if (buf[i + 1] & PPC_FEATURE_HAS_VSX) {
+ flags |= HAS_VSX;
+ }
+#endif // HAVE_VSX
+ goto out_close;
+ } else if (buf[i] == AT_NULL) {
+ goto out_close;
+ }
+ }
+ }
+out_close:
+ close(fd);
+ return flags & mask;
+}
+#else
+// If there is no RTCD, the function pointers are not used and cannot be
+// changed.
+int ppc_simd_caps(void) { return 0; }
+#endif // CONFIG_RUNTIME_CPU_DETECT
diff --git a/third_party/aom/aom_ports/sanitizer.h b/third_party/aom/aom_ports/sanitizer.h
new file mode 100644
index 000000000..1dd8eb4cf
--- /dev/null
+++ b/third_party/aom/aom_ports/sanitizer.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_SANITIZER_H_
+#define AOM_AOM_PORTS_SANITIZER_H_
+
+// AddressSanitizer support.
+
+// Define AOM_ADDRESS_SANITIZER if AddressSanitizer is used.
+// Clang.
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define AOM_ADDRESS_SANITIZER 1
+#endif
+#endif // defined(__has_feature)
+// GCC.
+#if defined(__SANITIZE_ADDRESS__)
+#define AOM_ADDRESS_SANITIZER 1
+#endif // defined(__SANITIZE_ADDRESS__)
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if defined(AOM_ADDRESS_SANITIZER)
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#endif
+
+#endif // AOM_AOM_PORTS_SANITIZER_H_
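Sketch of manual poisoning with the macros above: mark the unused tail of a buffer as off-limits under ASan and unpoison it before handing it back out; without ASan both macros compile to no-ops.

#include "aom_ports/sanitizer.h"

void guard_tail(unsigned char *buf, int used, int capacity) {
  ASAN_POISON_MEMORY_REGION(buf + used, capacity - used);
  /* ... any access to buf[used..capacity) is now reported by ASan ... */
  ASAN_UNPOISON_MEMORY_REGION(buf + used, capacity - used);
}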
diff --git a/third_party/aom/aom_ports/system_state.h b/third_party/aom/aom_ports/system_state.h
new file mode 100644
index 000000000..6640839d8
--- /dev/null
+++ b/third_party/aom/aom_ports/system_state.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_SYSTEM_STATE_H_
+#define AOM_AOM_PORTS_SYSTEM_STATE_H_
+
+#include "config/aom_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+void aom_reset_mmx_state(void);
+#define aom_clear_system_state() aom_reset_mmx_state()
+#else
+#define aom_clear_system_state()
+#endif // ARCH_X86 || ARCH_X86_64
+#endif // AOM_AOM_PORTS_SYSTEM_STATE_H_
diff --git a/third_party/aom/aom_ports/x86.h b/third_party/aom/aom_ports/x86.h
new file mode 100644
index 000000000..52ee49cb3
--- /dev/null
+++ b/third_party/aom/aom_ports/x86.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_X86_H_
+#define AOM_AOM_PORTS_X86_H_
+#include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h> /* For __cpuidex, __rdtsc */
+#endif
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ AOM_CPU_UNKNOWN = -1,
+ AOM_CPU_AMD,
+ AOM_CPU_AMD_OLD,
+ AOM_CPU_CENTAUR,
+ AOM_CPU_CYRIX,
+ AOM_CPU_INTEL,
+ AOM_CPU_NEXGEN,
+ AOM_CPU_NSC,
+ AOM_CPU_RISE,
+ AOM_CPU_SIS,
+ AOM_CPU_TRANSMETA,
+ AOM_CPU_TRANSMETA_OLD,
+ AOM_CPU_UMC,
+ AOM_CPU_VIA,
+
+ AOM_CPU_LAST
+} aom_cpu_t;
+
+#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx) \
+ __asm__ __volatile__("cpuid \n\t" \
+ : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
+ : "a"(func), "c"(func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx) \
+ __asm__ __volatile__( \
+ "mov %%ebx, %%edi \n\t" \
+ "cpuid \n\t" \
+ "xchg %%edi, %%ebx \n\t" \
+ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+ : "a"(func), "c"(func2));
+#endif
+#elif defined(__SUNPRO_C) || \
+ defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx) \
+ asm volatile( \
+ "xchg %rsi, %rbx \n\t" \
+ "cpuid \n\t" \
+ "movl %ebx, %edi \n\t" \
+ "xchg %rsi, %rbx \n\t" \
+ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+ : "a"(func), "c"(func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx) \
+ asm volatile( \
+ "pushl %ebx \n\t" \
+ "cpuid \n\t" \
+ "movl %ebx, %edi \n\t" \
+ "popl %ebx \n\t" \
+ : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+ : "a"(func), "c"(func2));
+#endif
+#else /* end __SUNPRO__ */
+#if ARCH_X86_64
+#if defined(_MSC_VER) && _MSC_VER > 1500
+#define cpuid(func, func2, a, b, c, d) \
+ do { \
+ int regs[4]; \
+ __cpuidex(regs, func, func2); \
+ a = regs[0]; \
+ b = regs[1]; \
+ c = regs[2]; \
+ d = regs[3]; \
+ } while (0)
+#else
+#define cpuid(func, func2, a, b, c, d) \
+ do { \
+ int regs[4]; \
+ __cpuid(regs, func); \
+ a = regs[0]; \
+ b = regs[1]; \
+ c = regs[2]; \
+ d = regs[3]; \
+ } while (0)
+#endif
+#else
+/* clang-format off */
+#define cpuid(func, func2, a, b, c, d) \
+ __asm mov eax, func \
+ __asm mov ecx, func2 \
+ __asm cpuid \
+ __asm mov a, eax \
+ __asm mov b, ebx \
+ __asm mov c, ecx \
+ __asm mov d, edx
+#endif
+/* clang-format on */
+#endif /* end others */
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+ const uint32_t ecx = 0;
+ uint32_t eax, edx;
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(eax), "=d"(edx)
+ : "c"(ecx));
+ return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \
+ _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+ uint32_t eax_, edx_;
+ __asm {
+ xor ecx, ecx // ecx = 0
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+ mov eax_, eax
+ mov edx_, edx
+ }
+ return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700
+#include <windows.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#define HAS_MMX 0x01
+#define HAS_SSE 0x02
+#define HAS_SSE2 0x04
+#define HAS_SSE3 0x08
+#define HAS_SSSE3 0x10
+#define HAS_SSE4_1 0x20
+#define HAS_AVX 0x40
+#define HAS_AVX2 0x80
+#define HAS_SSE4_2 0x100
+#ifndef BIT
+#define BIT(n) (1 << n)
+#endif
+
+static INLINE int x86_simd_caps(void) {
+ unsigned int flags = 0;
+ unsigned int mask = ~0;
+ unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
+ char *env;
+ (void)reg_ebx;
+
+ /* See if the CPU capabilities are being overridden by the environment */
+ env = getenv("AOM_SIMD_CAPS");
+
+ if (env && *env) return (int)strtol(env, NULL, 0);
+
+ env = getenv("AOM_SIMD_CAPS_MASK");
+
+ if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
+
+ /* Ensure that the CPUID instruction supports extended features */
+ cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
+
+ if (max_cpuid_val < 1) return 0;
+
+ /* Get the standard feature flags */
+ cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+ if (reg_edx & BIT(23)) flags |= HAS_MMX;
+
+ if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */
+
+ if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
+
+ if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
+
+ if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
+
+ if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+
+ if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2;
+
+ // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+ if ((xgetbv() & 0x6) == 0x6) {
+ flags |= HAS_AVX;
+
+ if (max_cpuid_val >= 7) {
+ /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+ cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+ if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+ }
+ }
+ }
+
+ return flags & mask;
+}
+
+// Note:
+// The 32-bit CPU cycle counter is lightweight enough for most function
+// performance measurements. For long-running functions (CPU time greater
+// than a couple of seconds), the 64-bit counter should be used.
+// 32-bit CPU cycle counter
+static INLINE unsigned int x86_readtsc(void) {
+#if defined(__GNUC__) && __GNUC__
+ unsigned int tsc;
+ __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :);
+ return tsc;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+ unsigned int tsc;
+ asm volatile("rdtsc\n\t" : "=a"(tsc) :);
+ return tsc;
+#else
+#if ARCH_X86_64
+ return (unsigned int)__rdtsc();
+#else
+ __asm rdtsc;
+#endif
+#endif
+}
+// 64-bit CPU cycle counter
+static INLINE uint64_t x86_readtsc64(void) {
+#if defined(__GNUC__) && __GNUC__
+ uint32_t hi, lo;
+ __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+ uint_t hi, lo;
+ asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+#else
+#if ARCH_X86_64
+ return (uint64_t)__rdtsc();
+#else
+ __asm rdtsc;
+#endif
+#endif
+}
+
+#if defined(__GNUC__) && __GNUC__
+#define x86_pause_hint() __asm__ __volatile__("pause \n\t")
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+#define x86_pause_hint() asm volatile("pause \n\t")
+#else
+#if ARCH_X86_64
+#define x86_pause_hint() _mm_pause();
+#else
+#define x86_pause_hint() __asm pause
+#endif
+#endif
+
+#if defined(__GNUC__) && __GNUC__
+static void x87_set_control_word(unsigned short mode) {
+ __asm__ __volatile__("fldcw %0" : : "m"(*&mode));
+}
+static unsigned short x87_get_control_word(void) {
+ unsigned short mode;
+ __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :);
+ return mode;
+}
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+static void x87_set_control_word(unsigned short mode) {
+ asm volatile("fldcw %0" : : "m"(*&mode));
+}
+static unsigned short x87_get_control_word(void) {
+ unsigned short mode;
+ asm volatile("fstcw %0\n\t" : "=m"(*&mode) :);
+ return mode;
+}
+#elif ARCH_X86_64
+/* No fldcw intrinsics on Windows x64, punt to external asm */
+extern void aom_winx64_fldcw(unsigned short mode);
+extern unsigned short aom_winx64_fstcw(void);
+#define x87_set_control_word aom_winx64_fldcw
+#define x87_get_control_word aom_winx64_fstcw
+#else
+static void x87_set_control_word(unsigned short mode) {
+ __asm { fldcw mode }
+}
+static unsigned short x87_get_control_word(void) {
+ unsigned short mode;
+ __asm { fstcw mode }
+ return mode;
+}
+#endif
+
+static INLINE unsigned int x87_set_double_precision(void) {
+ unsigned int mode = x87_get_control_word();
+ x87_set_control_word((mode & ~0x300) | 0x200);
+ return mode;
+}
+
+extern void aom_reset_mmx_state(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_PORTS_X86_H_
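Editorial sketch (not part of the patch) of how the helpers above are commonly combined: query x86_simd_caps() once to select a code path, and bracket precision-sensitive x87 work with x87_set_double_precision() and x87_set_control_word().

#include <stdio.h>
#include "aom_ports/x86.h"

void x86_ports_demo(void) {
  const int caps = x86_simd_caps();
  if (caps & HAS_AVX2)
    printf("AVX2 path\n");
  else if (caps & HAS_SSE2)
    printf("SSE2 path\n");
  else
    printf("generic C path\n");

  /* Force double precision for the x87 unit, then restore the old mode. */
  const unsigned int old_mode = x87_set_double_precision();
  /* ... precision-sensitive computation ... */
  x87_set_control_word((unsigned short)old_mode);
}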
diff --git a/third_party/aom/aom_ports/x86_abi_support.asm b/third_party/aom/aom_ports/x86_abi_support.asm
new file mode 100644
index 000000000..0e7c26287
--- /dev/null
+++ b/third_party/aom/aom_ports/x86_abi_support.asm
@@ -0,0 +1,395 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "config/aom_config.asm"
+
+; 32/64 bit compatibility macros
+;
+; In general, we make the source use 64 bit syntax, then twiddle with it using
+; the preprocessor to get the 32 bit syntax on 32 bit platforms.
+;
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
+%endif
+
+%if ABI_IS_32BIT
+%define rax eax
+%define rbx ebx
+%define rcx ecx
+%define rdx edx
+%define rsi esi
+%define rdi edi
+%define rsp esp
+%define rbp ebp
+%define movsxd mov
+%macro movq 2
+ %ifidn %1,eax
+ movd %1,%2
+ %elifidn %2,eax
+ movd %1,%2
+ %elifidn %1,ebx
+ movd %1,%2
+ %elifidn %2,ebx
+ movd %1,%2
+ %elifidn %1,ecx
+ movd %1,%2
+ %elifidn %2,ecx
+ movd %1,%2
+ %elifidn %1,edx
+ movd %1,%2
+ %elifidn %2,edx
+ movd %1,%2
+ %elifidn %1,esi
+ movd %1,%2
+ %elifidn %2,esi
+ movd %1,%2
+ %elifidn %1,edi
+ movd %1,%2
+ %elifidn %2,edi
+ movd %1,%2
+ %elifidn %1,esp
+ movd %1,%2
+ %elifidn %2,esp
+ movd %1,%2
+ %elifidn %1,ebp
+ movd %1,%2
+ %elifidn %2,ebp
+ movd %1,%2
+ %else
+ movq %1,%2
+ %endif
+%endmacro
+%endif
+
+
+; LIBAOM_YASM_WIN64
+; Set LIBAOM_YASM_WIN64 if output is Windows 64bit so the code will work if x64
+; or win64 is defined on the Yasm command line.
+%ifidn __OUTPUT_FORMAT__,win64
+%define LIBAOM_YASM_WIN64 1
+%elifidn __OUTPUT_FORMAT__,x64
+%define LIBAOM_YASM_WIN64 1
+%else
+%define LIBAOM_YASM_WIN64 0
+%endif
+
+; sym()
+; Return the proper symbol name for the target ABI.
+;
+; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols
+; with C linkage be prefixed with an underscore.
+;
+%ifidn __OUTPUT_FORMAT__,elf32
+%define sym(x) x
+%elifidn __OUTPUT_FORMAT__,elf64
+%define sym(x) x
+%elifidn __OUTPUT_FORMAT__,elfx32
+%define sym(x) x
+%elif LIBAOM_YASM_WIN64
+%define sym(x) x
+%else
+%define sym(x) _ %+ x
+%endif
+
+; PRIVATE
+; Macro for the attribute to hide a global symbol for the target ABI.
+; This is only active if CHROMIUM is defined.
+;
+; Chromium doesn't like exported global symbols due to symbol clashing with
+; plugins among other things.
+;
+; Requires Chromium's patched copy of yasm:
+; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761
+; http://www.tortall.net/projects/yasm/ticket/236
+;
+%ifdef CHROMIUM
+ %ifidn __OUTPUT_FORMAT__,elf32
+ %define PRIVATE :hidden
+ %elifidn __OUTPUT_FORMAT__,elf64
+ %define PRIVATE :hidden
+ %elifidn __OUTPUT_FORMAT__,elfx32
+ %define PRIVATE :hidden
+ %elif LIBAOM_YASM_WIN64
+ %define PRIVATE
+ %else
+ %define PRIVATE :private_extern
+ %endif
+%else
+ %define PRIVATE
+%endif
+
+; arg()
+; Return the address specification of the given argument
+;
+%if ABI_IS_32BIT
+ %define arg(x) [ebp+8+4*x]
+%else
+ ; 64 bit ABI passes arguments in registers. This is a workaround to get up
+ ; and running quickly. Relies on SHADOW_ARGS_TO_STACK
+ %if LIBAOM_YASM_WIN64
+ %define arg(x) [rbp+16+8*x]
+ %else
+ %define arg(x) [rbp-8-8*x]
+ %endif
+%endif
+
+; REG_SZ_BYTES, REG_SZ_BITS
+; Size of a register
+%if ABI_IS_32BIT
+%define REG_SZ_BYTES 4
+%define REG_SZ_BITS 32
+%else
+%define REG_SZ_BYTES 8
+%define REG_SZ_BITS 64
+%endif
+
+
+; ALIGN_STACK <alignment> <register>
+; This macro aligns the stack to the given alignment (in bytes). The stack
+; is left such that the previous value of the stack pointer is the first
+; argument on the stack (ie, the inverse of this macro is 'pop rsp.')
+; This macro uses one temporary register, which is not preserved, and thus
+; must be specified as an argument.
+%macro ALIGN_STACK 2
+ mov %2, rsp
+ and rsp, -%1
+ lea rsp, [rsp - (%1 - REG_SZ_BYTES)]
+ push %2
+%endmacro
+
+
+;
+; The Microsoft assembler tries to impose a certain amount of type safety in
+; its register usage. YASM doesn't recognize these directives, so we just
+; %define them away to maintain as much compatibility as possible with the
+; original inline assembler we're porting from.
+;
+%idefine PTR
+%idefine XMMWORD
+%idefine MMWORD
+
+; PIC macros
+;
+%if ABI_IS_32BIT
+ %if CONFIG_PIC=1
+ %ifidn __OUTPUT_FORMAT__,elf32
+ %define WRT_PLT wrt ..plt
+ %macro GET_GOT 1
+ extern _GLOBAL_OFFSET_TABLE_
+ push %1
+ call %%get_got
+ %%sub_offset:
+ jmp %%exitGG
+ %%get_got:
+ mov %1, [esp]
+ add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+ ret
+ %%exitGG:
+ %undef GLOBAL
+ %define GLOBAL(x) x + %1 wrt ..gotoff
+ %undef RESTORE_GOT
+ %define RESTORE_GOT pop %1
+ %endmacro
+ %elifidn __OUTPUT_FORMAT__,macho32
+ %macro GET_GOT 1
+ push %1
+ call %%get_got
+ %%get_got:
+ pop %1
+ %undef GLOBAL
+ %define GLOBAL(x) x + %1 - %%get_got
+ %undef RESTORE_GOT
+ %define RESTORE_GOT pop %1
+ %endmacro
+ %endif
+ %endif
+
+ %ifdef CHROMIUM
+ %ifidn __OUTPUT_FORMAT__,macho32
+ %define HIDDEN_DATA(x) x:private_extern
+ %else
+ %define HIDDEN_DATA(x) x
+ %endif
+ %else
+ %define HIDDEN_DATA(x) x
+ %endif
+%else
+ %macro GET_GOT 1
+ %endmacro
+ %define GLOBAL(x) rel x
+ %ifidn __OUTPUT_FORMAT__,elf64
+ %define WRT_PLT wrt ..plt
+ %define HIDDEN_DATA(x) x:data hidden
+ %elifidn __OUTPUT_FORMAT__,elfx32
+ %define WRT_PLT wrt ..plt
+ %define HIDDEN_DATA(x) x:data hidden
+ %elifidn __OUTPUT_FORMAT__,macho64
+ %ifdef CHROMIUM
+ %define HIDDEN_DATA(x) x:private_extern
+ %else
+ %define HIDDEN_DATA(x) x
+ %endif
+ %else
+ %define HIDDEN_DATA(x) x
+ %endif
+%endif
+%ifnmacro GET_GOT
+ %macro GET_GOT 1
+ %endmacro
+ %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT
+%define RESTORE_GOT
+%endif
+%ifndef WRT_PLT
+%define WRT_PLT
+%endif
+
+%if ABI_IS_32BIT
+ %macro SHADOW_ARGS_TO_STACK 1
+ %endm
+ %define UNSHADOW_ARGS
+%else
+%if LIBAOM_YASM_WIN64
+ %macro SHADOW_ARGS_TO_STACK 1 ; argc
+ %if %1 > 0
+ mov arg(0),rcx
+ %endif
+ %if %1 > 1
+ mov arg(1),rdx
+ %endif
+ %if %1 > 2
+ mov arg(2),r8
+ %endif
+ %if %1 > 3
+ mov arg(3),r9
+ %endif
+ %endm
+%else
+ %macro SHADOW_ARGS_TO_STACK 1 ; argc
+ %if %1 > 0
+ push rdi
+ %endif
+ %if %1 > 1
+ push rsi
+ %endif
+ %if %1 > 2
+ push rdx
+ %endif
+ %if %1 > 3
+ push rcx
+ %endif
+ %if %1 > 4
+ push r8
+ %endif
+ %if %1 > 5
+ push r9
+ %endif
+ %if %1 > 6
+ %assign i %1-6
+ %assign off 16
+ %rep i
+ mov rax,[rbp+off]
+ push rax
+ %assign off off+8
+ %endrep
+ %endif
+ %endm
+%endif
+ %define UNSHADOW_ARGS mov rsp, rbp
+%endif
+
+; Win64 ABI requires that XMM6:XMM15 are callee saved
+; SAVE_XMM n, [u]
+; store registers 6-n on the stack
+; if u is specified, use unaligned movs.
+; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return
+; value. Typically we follow this up with 'push rbp' - re-aligning the stack -
+; but in some cases this is not done and unaligned movs must be used.
+%if LIBAOM_YASM_WIN64
+%macro SAVE_XMM 1-2 a
+ %if %1 < 6
+ %error Only xmm registers 6-15 must be preserved
+ %else
+ %assign last_xmm %1
+ %define movxmm movdq %+ %2
+ %assign xmm_stack_space ((last_xmm - 5) * 16)
+ sub rsp, xmm_stack_space
+ %assign i 6
+ %rep (last_xmm - 5)
+ movxmm [rsp + ((i - 6) * 16)], xmm %+ i
+ %assign i i+1
+ %endrep
+ %endif
+%endmacro
+%macro RESTORE_XMM 0
+ %ifndef last_xmm
+ %error RESTORE_XMM must be paired with SAVE_XMM n
+ %else
+ %assign i last_xmm
+ %rep (last_xmm - 5)
+ movxmm xmm %+ i, [rsp +((i - 6) * 16)]
+ %assign i i-1
+ %endrep
+ add rsp, xmm_stack_space
+ ; there are a couple functions which return from multiple places.
+ ; otherwise, we could uncomment these:
+ ; %undef last_xmm
+ ; %undef xmm_stack_space
+ ; %undef movxmm
+ %endif
+%endmacro
+%else
+%macro SAVE_XMM 1-2
+%endmacro
+%macro RESTORE_XMM 0
+%endmacro
+%endif
+
+; Name of the rodata section
+;
+; .rodata seems to be an elf-ism, as it doesn't work on OSX.
+;
+%ifidn __OUTPUT_FORMAT__,macho64
+%define SECTION_RODATA section .text
+%elifidn __OUTPUT_FORMAT__,macho32
+%macro SECTION_RODATA 0
+section .text
+%endmacro
+%elifidn __OUTPUT_FORMAT__,aout
+%define SECTION_RODATA section .data
+%else
+%define SECTION_RODATA section .rodata
+%endif
+
+
+; Tell GNU ld that we don't require an executable stack.
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elfx32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
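Editorial note (an assumption-laden sketch, not part of the patch): routines assembled with the macros above are plain C-ABI functions once sym() has applied any underscore prefix, so the C side only needs an extern declaration. The routine name below is hypothetical.

/* Hypothetical prototype for a routine defined in yasm as
 * 'sym(aom_example_copy) PRIVATE' and using SHADOW_ARGS_TO_STACK. */
extern void aom_example_copy(unsigned char *dst, const unsigned char *src,
                             int count);

void copy_row(unsigned char *dst, const unsigned char *src, int count) {
  aom_example_copy(dst, src, count); /* arguments follow the native C ABI */
}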
diff --git a/third_party/aom/aom_scale/aom_scale.cmake b/third_party/aom/aom_scale/aom_scale.cmake
new file mode 100644
index 000000000..197dea6bd
--- /dev/null
+++ b/third_party/aom/aom_scale/aom_scale.cmake
@@ -0,0 +1,38 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_SCALE_AOM_SCALE_CMAKE_)
+ return()
+endif() # AOM_AOM_SCALE_AOM_SCALE_CMAKE_
+set(AOM_AOM_SCALE_AOM_SCALE_CMAKE_ 1)
+
+list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/aom_scale.h"
+ "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
+ "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
+ "${AOM_ROOT}/aom_scale/generic/yv12config.c"
+ "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
+ "${AOM_ROOT}/aom_scale/yv12config.h")
+
+list(APPEND AOM_SCALE_INTRIN_DSPR2
+ "${AOM_ROOT}/aom_scale/mips/dspr2/yv12extend_dspr2.c")
+
+# Creates the aom_scale build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_scale_targets)
+ add_library(aom_scale OBJECT ${AOM_SCALE_SOURCES})
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+
+ if(HAVE_DSPR2)
+ add_intrinsics_object_library("" "dspr2" "aom_scale"
+ "AOM_SCALE_INTRIN_DSPR2" "aom")
+ endif()
+
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE)
+endfunction()
diff --git a/third_party/aom/aom_scale/aom_scale.h b/third_party/aom/aom_scale/aom_scale.h
new file mode 100644
index 000000000..11812a145
--- /dev/null
+++ b/third_party/aom/aom_scale/aom_scale.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_SCALE_AOM_SCALE_H_
+#define AOM_AOM_SCALE_AOM_SCALE_H_
+
+#include "aom_scale/yv12config.h"
+
+extern void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+ unsigned char *temp_area, unsigned char temp_height,
+ unsigned int hscale, unsigned int hratio,
+ unsigned int vscale, unsigned int vratio,
+ unsigned int interlaced, const int num_planes);
+
+#endif // AOM_AOM_SCALE_AOM_SCALE_H_
diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.c b/third_party/aom/aom_scale/aom_scale_rtcd.c
new file mode 100644
index 000000000..a04e053b0
--- /dev/null
+++ b/third_party/aom/aom_scale/aom_scale_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+void aom_scale_rtcd() { aom_once(setup_rtcd_internal); }
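Editorial usage sketch (not from the patch): because the call goes through aom_once(), aom_scale_rtcd() can safely be invoked more than once before any aom_scale RTCD entry point is used.

#include "config/aom_scale_rtcd.h"

void init_scale_dispatch(void) {
  aom_scale_rtcd(); /* idempotent; runs setup_rtcd_internal exactly once */
}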
diff --git a/third_party/aom/aom_scale/aom_scale_rtcd.pl b/third_party/aom/aom_scale/aom_scale_rtcd.pl
new file mode 100644
index 000000000..c5990b1bb
--- /dev/null
+++ b/third_party/aom/aom_scale/aom_scale_rtcd.pl
@@ -0,0 +1,52 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+sub aom_scale_forward_decls() {
+print <<EOF
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/aom_scale_forward_decls/;
+
+# Scaler functions
+if (aom_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
+ add_proto qw/void aom_horizontal_line_5_4_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_5_4_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+ add_proto qw/void aom_horizontal_line_5_3_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_5_3_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+ add_proto qw/void aom_horizontal_line_2_1_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_2_1_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+}
+
+add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
+
+add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes";
+
+add_proto qw/void aom_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+
+add_proto qw/void aom_yv12_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc";
+
+add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc";
+
+add_proto qw/void aom_yv12_partial_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend";
+
+add_proto qw/void aom_yv12_partial_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
+
+add_proto qw/void aom_yv12_partial_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int hstart, int hend, int vstart, int vend";
+
+add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
+specialize qw/aom_extend_frame_borders dspr2/;
+
+add_proto qw/void aom_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
+specialize qw/aom_extend_frame_inner_borders dspr2/;
+
+add_proto qw/void aom_extend_frame_borders_y/, "struct yv12_buffer_config *ybf";
+1;
diff --git a/third_party/aom/aom_scale/generic/aom_scale.c b/third_party/aom/aom_scale/generic/aom_scale.c
new file mode 100644
index 000000000..206c42c9f
--- /dev/null
+++ b/third_party/aom/aom_scale/generic/aom_scale.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/****************************************************************************
+ *
+ * Module Title : scale.c
+ *
+ * Description : Image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+ * Header Files
+ ****************************************************************************/
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+typedef struct {
+ int expanded_frame_width;
+ int expanded_frame_height;
+
+ int HScale;
+ int HRatio;
+ int VScale;
+ int VRatio;
+
+ YV12_BUFFER_CONFIG *src_yuv_config;
+ YV12_BUFFER_CONFIG *dst_yuv_config;
+
+} SCALE_VARS;
+
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_2t1_i
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on
+ * in source.
+ * unsigned int source_scale : Scale for source (UNUSED).
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on
+ * in destination.
+ * unsigned int dest_scale : Scale for destination
+ * (UNUSED).
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-to-1 interpolated scaling.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_2t1_i(const unsigned char *source, int source_step,
+ unsigned int source_scale, unsigned int source_length,
+ unsigned char *dest, int dest_step,
+ unsigned int dest_scale, unsigned int dest_length) {
+ const unsigned char *const dest_end = dest + dest_length * dest_step;
+ (void)source_length;
+ (void)source_scale;
+ (void)dest_scale;
+
+ source_step *= 2; // Every other row.
+
+ dest[0] = source[0]; // Special case: 1st pixel.
+ source += source_step;
+ dest += dest_step;
+
+ while (dest < dest_end) {
+ const unsigned int a = 3 * source[-source_step];
+ const unsigned int b = 10 * source[0];
+ const unsigned int c = 3 * source[source_step];
+ *dest = (unsigned char)((8 + a + b + c) >> 4);
+ source += source_step;
+ dest += dest_step;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_2t1_ps
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on
+ * in source.
+ * unsigned int source_scale : Scale for source (UNUSED).
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on
+ * in destination.
+ * unsigned int dest_scale : Scale for destination
+ * (UNUSED).
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-to-1 point subsampled scaling.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_2t1_ps(const unsigned char *source, int source_step,
+ unsigned int source_scale,
+ unsigned int source_length, unsigned char *dest,
+ int dest_step, unsigned int dest_scale,
+ unsigned int dest_length) {
+ const unsigned char *const dest_end = dest + dest_length * dest_step;
+ (void)source_length;
+ (void)source_scale;
+ (void)dest_scale;
+
+ source_step *= 2; // Every other row.
+
+ while (dest < dest_end) {
+ *dest = *source;
+ source += source_step;
+ dest += dest_step;
+ }
+}
+/****************************************************************************
+ *
+ * ROUTINE : scale1d_c
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be scaled.
+ * int source_step : Number of pixels to step on
+ * in source.
+ * unsigned int source_scale : Scale for source.
+ * unsigned int source_length : Length of source (UNUSED).
+ * unsigned char *dest : Pointer to output data array.
+ * int dest_step : Number of pixels to step on
+ * in destination.
+ * unsigned int dest_scale : Scale for destination.
+ * unsigned int dest_length : Length of destination.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs linear interpolation in one dimension.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static void scale1d_c(const unsigned char *source, int source_step,
+ unsigned int source_scale, unsigned int source_length,
+ unsigned char *dest, int dest_step,
+ unsigned int dest_scale, unsigned int dest_length) {
+ const unsigned char *const dest_end = dest + dest_length * dest_step;
+ const unsigned int round_value = dest_scale / 2;
+ unsigned int left_modifier = dest_scale;
+ unsigned int right_modifier = 0;
+ unsigned char left_pixel = source[0];
+ unsigned char right_pixel = source[source_step];
+
+ (void)source_length;
+
+ /* These asserts are needed if there are boundary issues... */
+ /* assert ( dest_scale > source_scale );*/
+ /* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) *
+ * source_scale);*/
+
+ while (dest < dest_end) {
+ *dest = (unsigned char)((left_modifier * left_pixel +
+ right_modifier * right_pixel + round_value) /
+ dest_scale);
+
+ right_modifier += source_scale;
+
+ while (right_modifier > dest_scale) {
+ right_modifier -= dest_scale;
+ source += source_step;
+ left_pixel = source[0];
+ right_pixel = source[source_step];
+ }
+
+ left_modifier = dest_scale - right_modifier;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : Scale2D
+ *
+ * INPUTS : const unsigned char *source : Pointer to data to be
+ * scaled.
+ * int source_pitch : Stride of source image.
+ * unsigned int source_width : Width of input image.
+ * unsigned int source_height : Height of input image.
+ * unsigned char *dest : Pointer to output data
+ * array.
+ * int dest_pitch : Stride of destination
+ * image.
+ * unsigned int dest_width : Width of destination image.
+ * unsigned int dest_height : Height of destination
+ * image.
+ * unsigned char *temp_area : Pointer to temp work area.
+ * unsigned char temp_area_height : Height of temp work area.
+ * unsigned int hscale : Horizontal scale factor
+ * numerator.
+ * unsigned int hratio : Horizontal scale factor
+ * denominator.
+ * unsigned int vscale : Vertical scale factor
+ * numerator.
+ * unsigned int vratio : Vertical scale factor
+ * denominator.
+ * unsigned int interlaced : Interlace flag.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-tap linear interpolation in two dimensions.
+ *
+ * SPECIAL NOTES : Expansion is performed one band at a time to help with
+ * caching.
+ *
+ ****************************************************************************/
+static void Scale2D(
+ /*const*/
+ unsigned char *source, int source_pitch, unsigned int source_width,
+ unsigned int source_height, unsigned char *dest, int dest_pitch,
+ unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area,
+ unsigned char temp_area_height, unsigned int hscale, unsigned int hratio,
+ unsigned int vscale, unsigned int vratio, unsigned int interlaced) {
+ unsigned int i, j, k;
+ unsigned int bands;
+ unsigned int dest_band_height;
+ unsigned int source_band_height;
+
+ typedef void (*Scale1D)(const unsigned char *source, int source_step,
+ unsigned int source_scale, unsigned int source_length,
+ unsigned char *dest, int dest_step,
+ unsigned int dest_scale, unsigned int dest_length);
+
+ Scale1D Scale1Dv = scale1d_c;
+ Scale1D Scale1Dh = scale1d_c;
+
+ void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *,
+ unsigned int) = NULL;
+ void (*vert_band_scale)(unsigned char *, int, unsigned char *, int,
+ unsigned int) = NULL;
+
+ int ratio_scalable = 1;
+ int interpolation = 0;
+
+ unsigned char *source_base;
+ unsigned char *line_src;
+
+ source_base = (unsigned char *)source;
+
+ if (source_pitch < 0) {
+ int offset;
+
+ offset = (source_height - 1);
+ offset *= source_pitch;
+
+ source_base += offset;
+ }
+
+ /* find out the ratio for each direction */
+ switch (hratio * 10 / hscale) {
+ case 8:
+ /* 4-5 Scale in Width direction */
+ horiz_line_scale = aom_horizontal_line_5_4_scale;
+ break;
+ case 6:
+ /* 3-5 Scale in Width direction */
+ horiz_line_scale = aom_horizontal_line_5_3_scale;
+ break;
+ case 5:
+ /* 1-2 Scale in Width direction */
+ horiz_line_scale = aom_horizontal_line_2_1_scale;
+ break;
+ default:
+ /* The ratio is not acceptable now */
+ /* throw("The ratio is not acceptable for now!"); */
+ ratio_scalable = 0;
+ break;
+ }
+
+ switch (vratio * 10 / vscale) {
+ case 8:
+ /* 4-5 Scale in vertical direction */
+ vert_band_scale = aom_vertical_band_5_4_scale;
+ source_band_height = 5;
+ dest_band_height = 4;
+ break;
+ case 6:
+ /* 3-5 Scale in vertical direction */
+ vert_band_scale = aom_vertical_band_5_3_scale;
+ source_band_height = 5;
+ dest_band_height = 3;
+ break;
+ case 5:
+ /* 1-2 Scale in vertical direction */
+
+ if (interlaced) {
+ /* if the content is interlaced, point sampling is used */
+ vert_band_scale = aom_vertical_band_2_1_scale;
+ } else {
+ interpolation = 1;
+        /* if the content is progressive, interpolation is used */
+ vert_band_scale = aom_vertical_band_2_1_scale_i;
+ }
+
+ source_band_height = 2;
+ dest_band_height = 1;
+ break;
+ default:
+ /* The ratio is not acceptable now */
+ /* throw("The ratio is not acceptable for now!"); */
+ ratio_scalable = 0;
+ break;
+ }
+
+ if (ratio_scalable) {
+ if (source_height == dest_height) {
+ /* for each band of the image */
+ for (k = 0; k < dest_height; ++k) {
+ horiz_line_scale(source, source_width, dest, dest_width);
+ source += source_pitch;
+ dest += dest_pitch;
+ }
+
+ return;
+ }
+
+ if (interpolation) {
+ if (source < source_base) source = source_base;
+
+ horiz_line_scale(source, source_width, temp_area, dest_width);
+ }
+
+ for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height;
+ ++k) {
+ /* scale one band horizontally */
+ for (i = 0; i < source_band_height; ++i) {
+ /* Trap case where we could read off the base of the source buffer */
+
+ line_src = source + i * source_pitch;
+
+ if (line_src < source_base) line_src = source_base;
+
+ horiz_line_scale(line_src, source_width,
+ temp_area + (i + 1) * dest_pitch, dest_width);
+ }
+
+ /* Vertical scaling is in place */
+ vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch,
+ dest_width);
+
+ if (interpolation)
+ memcpy(temp_area, temp_area + source_band_height * dest_pitch,
+ dest_width);
+
+ /* Next band... */
+ source += (unsigned long)source_band_height * source_pitch;
+ dest += (unsigned long)dest_band_height * dest_pitch;
+ }
+
+ return;
+ }
+
+ if (hscale == 2 && hratio == 1) Scale1Dh = scale1d_2t1_ps;
+
+ if (vscale == 2 && vratio == 1) {
+ if (interlaced)
+ Scale1Dv = scale1d_2t1_ps;
+ else
+ Scale1Dv = scale1d_2t1_i;
+ }
+
+ if (source_height == dest_height) {
+ /* for each band of the image */
+ for (k = 0; k < dest_height; ++k) {
+ Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio,
+ dest_width);
+ source += source_pitch;
+ dest += dest_pitch;
+ }
+
+ return;
+ }
+
+ if (dest_height > source_height) {
+ dest_band_height = temp_area_height - 1;
+ source_band_height = dest_band_height * source_height / dest_height;
+ } else {
+ source_band_height = temp_area_height - 1;
+ dest_band_height = source_band_height * vratio / vscale;
+ }
+
+ /* first row needs to be done so that we can stay one row ahead for vertical
+ * zoom */
+ Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio,
+ dest_width);
+
+ /* for each band of the image */
+ bands = (dest_height + dest_band_height - 1) / dest_band_height;
+
+ for (k = 0; k < bands; ++k) {
+ /* scale one band horizontally */
+ for (i = 1; i < source_band_height + 1; ++i) {
+ if (k * source_band_height + i < source_height) {
+ Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+ temp_area + i * dest_pitch, 1, hratio, dest_width);
+ } else { /* Duplicate the last row */
+ /* copy temp_area row 0 over from last row in the past */
+ memcpy(temp_area + i * dest_pitch, temp_area + (i - 1) * dest_pitch,
+ dest_pitch);
+ }
+ }
+
+ /* scale one band vertically */
+ for (j = 0; j < dest_width; ++j) {
+ Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+ &dest[j], dest_pitch, vratio, dest_band_height);
+ }
+
+ /* copy temp_area row 0 over from last row in the past */
+ memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+ /* move to the next band */
+ source += source_band_height * source_pitch;
+ dest += dest_band_height * dest_pitch;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE : aom_scale_frame
+ *
+ * INPUTS : YV12_BUFFER_CONFIG *src : Pointer to frame to be
+ * scaled.
+ * YV12_BUFFER_CONFIG *dst : Pointer to buffer to hold
+ * scaled frame.
+ * unsigned char *temp_area : Pointer to temp work area.
+ * unsigned char temp_area_height : Height of temp work area.
+ * unsigned int hscale : Horizontal scale factor
+ * numerator.
+ * unsigned int hratio : Horizontal scale factor
+ * denominator.
+ * unsigned int vscale : Vertical scale factor
+ * numerator.
+ * unsigned int vratio : Vertical scale factor
+ * denominator.
+ * unsigned int interlaced : Interlace flag.
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION : Performs 2-tap linear interpolation in two dimensions.
+ *
+ * SPECIAL NOTES : Expansion is performed one band at a time to help with
+ * caching.
+ *
+ ****************************************************************************/
+void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+ unsigned char *temp_area, unsigned char temp_height,
+ unsigned int hscale, unsigned int hratio,
+ unsigned int vscale, unsigned int vratio,
+ unsigned int interlaced, const int num_planes) {
+ const int dw = (hscale - 1 + src->y_width * hratio) / hscale;
+ const int dh = (vscale - 1 + src->y_height * vratio) / vscale;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int is_uv = plane > 0;
+ const int plane_dw = dw >> is_uv;
+ const int plane_dh = dh >> is_uv;
+
+ Scale2D((unsigned char *)src->buffers[plane], src->strides[is_uv],
+ src->widths[is_uv], src->heights[is_uv],
+ (unsigned char *)dst->buffers[plane], dst->strides[is_uv], plane_dw,
+ plane_dh, temp_area, temp_height, hscale, hratio, vscale, vratio,
+ interlaced);
+
+ if (plane_dw < dst->widths[is_uv])
+ for (int i = 0; i < plane_dh; ++i)
+ memset(dst->buffers[plane] + i * dst->strides[is_uv] + plane_dw - 1,
+ dst->buffers[plane][i * dst->strides[is_uv] + plane_dw - 2],
+ dst->widths[is_uv] - plane_dw + 1);
+
+ if (plane_dh < dst->heights[is_uv])
+ for (int i = plane_dh - 1; i < dst->heights[is_uv]; ++i)
+ memcpy(dst->buffers[plane] + i * dst->strides[is_uv],
+ dst->buffers[plane] + (plane_dh - 2) * dst->strides[is_uv],
+ dst->widths[is_uv] + 1);
+ }
+}
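Editorial sketch (not part of the patch): the inner loop of scale1d_c above is a fixed-point 2-tap linear interpolation, out = (left_mod * left + right_mod * right + dest_scale / 2) / dest_scale with left_mod + right_mod == dest_scale. The standalone helper below restates that formula; it does not call the static function.

#include <assert.h>

/* Restatement of the scale1d_c weighting for a single output pixel. */
static unsigned char lerp_2tap(unsigned char left, unsigned char right,
                               unsigned int right_mod,
                               unsigned int dest_scale) {
  const unsigned int left_mod = dest_scale - right_mod;
  return (unsigned char)(
      (left_mod * left + right_mod * right + dest_scale / 2) / dest_scale);
}

int main(void) {
  /* Halfway between 10 and 30 (dest_scale = 4, right_mod = 2) rounds to 20. */
  assert(lerp_2tap(10, 30, 2, 4) == 20);
  return 0;
}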
diff --git a/third_party/aom/aom_scale/generic/gen_scalers.c b/third_party/aom/aom_scale/generic/gen_scalers.c
new file mode 100644
index 000000000..549e2aa69
--- /dev/null
+++ b/third_party/aom/aom_scale/generic/gen_scalers.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_scale/aom_scale.h"
+#include "aom_mem/aom_mem.h"
+/****************************************************************************
+ * Imports
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Stride of source.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Stride of destination
+ * (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION      : Copies a horizontal line of pixels from source to
+ *                 destination, scaling down from 5 to 4.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width) {
+ const unsigned char *const source_end = source + source_width;
+ (void)dest_width;
+
+ while (source < source_end) {
+ const unsigned int a = source[0];
+ const unsigned int b = source[1];
+ const unsigned int c = source[2];
+ const unsigned int d = source[3];
+ const unsigned int e = source[4];
+
+ dest[0] = (unsigned char)a;
+ dest[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+ dest[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+ dest[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+
+ source += 5;
+ dest += 4;
+ }
+}
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch,
+ unsigned char *dest, int dest_pitch,
+ unsigned int dest_width) {
+ const unsigned char *const dest_end = dest + dest_width;
+ while (dest < dest_end) {
+ const unsigned int a = source[0 * src_pitch];
+ const unsigned int b = source[1 * src_pitch];
+ const unsigned int c = source[2 * src_pitch];
+ const unsigned int d = source[3 * src_pitch];
+ const unsigned int e = source[4 * src_pitch];
+
+ dest[0 * dest_pitch] = (unsigned char)a;
+ dest[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+ dest[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+ dest[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+
+ ++source;
+ ++dest;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE       : aom_horizontal_line_5_3_scale_c
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Stride of source.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Stride of destination
+ * (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION      : Copies a horizontal line of pixels from source to
+ *                 destination, scaling down from 5 to 3.
+ *
+ * SPECIAL NOTES : None.
+ *
+ *
+ ****************************************************************************/
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width) {
+ const unsigned char *const source_end = source + source_width;
+ (void)dest_width;
+ while (source < source_end) {
+ const unsigned int a = source[0];
+ const unsigned int b = source[1];
+ const unsigned int c = source[2];
+ const unsigned int d = source[3];
+ const unsigned int e = source[4];
+
+ dest[0] = (unsigned char)a;
+ dest[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+ dest[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
+
+ source += 5;
+ dest += 3;
+ }
+}
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch,
+ unsigned char *dest, int dest_pitch,
+ unsigned int dest_width) {
+ const unsigned char *const dest_end = dest + dest_width;
+ while (dest < dest_end) {
+ const unsigned int a = source[0 * src_pitch];
+ const unsigned int b = source[1 * src_pitch];
+ const unsigned int c = source[2 * src_pitch];
+ const unsigned int d = source[3 * src_pitch];
+ const unsigned int e = source[4 * src_pitch];
+
+ dest[0 * dest_pitch] = (unsigned char)a;
+ dest[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+ dest[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
+
+ ++source;
+ ++dest;
+ }
+}
+
+/****************************************************************************
+ *
+ * ROUTINE       : aom_horizontal_line_2_1_scale_c
+ *
+ * INPUTS : const unsigned char *source : Pointer to source data.
+ * unsigned int source_width : Stride of source.
+ * unsigned char *dest : Pointer to destination data.
+ * unsigned int dest_width : Stride of destination
+ * (NOT USED).
+ *
+ * OUTPUTS : None.
+ *
+ * RETURNS : void
+ *
+ * FUNCTION      : Copies a horizontal line of pixels from source to
+ *                 destination, scaling down from 2 to 1.
+ *
+ * SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source,
+ unsigned int source_width,
+ unsigned char *dest,
+ unsigned int dest_width) {
+ const unsigned char *const source_end = source + source_width;
+ (void)dest_width;
+ while (source < source_end) {
+ dest[0] = source[0];
+ source += 2;
+ ++dest;
+ }
+}
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch,
+ unsigned char *dest, int dest_pitch,
+ unsigned int dest_width) {
+ (void)dest_pitch;
+ (void)src_pitch;
+ memcpy(dest, source, dest_width);
+}
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch,
+ unsigned char *dest, int dest_pitch,
+ unsigned int dest_width) {
+ const unsigned char *const dest_end = dest + dest_width;
+ (void)dest_pitch;
+ while (dest < dest_end) {
+ const unsigned int a = source[-src_pitch] * 3;
+ const unsigned int b = source[0] * 10;
+ const unsigned int c = source[src_pitch] * 3;
+ dest[0] = (unsigned char)((8 + a + b + c) >> 4);
+ ++source;
+ ++dest;
+ }
+}
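Editorial self-check (not part of the patch): the 5-to-4 scalers above weight each output phase as 256, 192/64, 128/128 and 64/192 in 8-bit fixed point, with +128 added for rounding before the >> 8. For one group of inputs 10, 20, 30, 40, 50 this yields 10, 23, 35, 48.

#include <assert.h>

int main(void) {
  const unsigned int a = 10, b = 20, c = 30, d = 40, e = 50;
  assert((unsigned char)a == 10);
  assert(((b * 192 + c * 64 + 128) >> 8) == 23);
  assert(((c * 128 + d * 128 + 128) >> 8) == 35);
  assert(((d * 64 + e * 192 + 128) >> 8) == 48);
  return 0;
}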
diff --git a/third_party/aom/aom_scale/generic/yv12config.c b/third_party/aom/aom_scale/generic/yv12config.c
new file mode 100644
index 000000000..84705e2d8
--- /dev/null
+++ b/third_party/aom/aom_scale/generic/yv12config.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+
+/****************************************************************************
+ * Exports
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ ****************************************************************************/
+#define yv12_align_addr(addr, align) \
+ (void *)(((size_t)(addr) + ((align)-1)) & (size_t) - (align))
+
+// TODO(jkoleszar): Maybe replace this with struct aom_image
+
+int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+ if (ybf) {
+ if (ybf->buffer_alloc_sz > 0) {
+ aom_free(ybf->buffer_alloc);
+ }
+ if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
+
+ /* buffer_alloc isn't accessed by most functions. Rather y_buffer,
+ u_buffer and v_buffer point to buffer_alloc and are used. Clear out
+ all of this so that a freed pointer isn't inadvertently used */
+ memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+ } else {
+ return -1;
+ }
+
+ return 0;
+}
+
+int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth,
+ int border, int byte_alignment,
+ aom_codec_frame_buffer_t *fb,
+ aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
+#if CONFIG_SIZE_LIMIT
+ if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+#endif
+
+ if (ybf) {
+ const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
+ const int aligned_width = (width + 7) & ~7;
+ const int aligned_height = (height + 7) & ~7;
+ const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+ const uint64_t yplane_size =
+ (aligned_height + 2 * border) * (uint64_t)y_stride + byte_alignment;
+ const int uv_width = aligned_width >> ss_x;
+ const int uv_height = aligned_height >> ss_y;
+ const int uv_stride = y_stride >> ss_x;
+ const int uv_border_w = border >> ss_x;
+ const int uv_border_h = border >> ss_y;
+ const uint64_t uvplane_size =
+ (uv_height + 2 * uv_border_h) * (uint64_t)uv_stride + byte_alignment;
+
+ const uint64_t frame_size =
+ (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
+
+ uint8_t *buf = NULL;
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+ // The size of ybf->buffer_alloc.
+ uint64_t alloc_size = frame_size;
+ // The size of ybf->y_buffer_8bit.
+ if (use_highbitdepth) alloc_size += yplane_size;
+ // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
+ // pool. Bound the total amount of allocated memory as if these REF_FRAMES
+ // frame buffers were allocated in a single allocation.
+ if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return -1;
+#endif
+
+ if (cb != NULL) {
+ const int align_addr_extra_size = 31;
+ const uint64_t external_frame_size = frame_size + align_addr_extra_size;
+
+ assert(fb != NULL);
+
+ if (external_frame_size != (size_t)external_frame_size) return -1;
+
+ // Allocation to hold larger frame, or first allocation.
+ if (cb(cb_priv, (size_t)external_frame_size, fb) < 0) return -1;
+
+ if (fb->data == NULL || fb->size < external_frame_size) return -1;
+
+ ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32);
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+      // This memset is needed to fix uses of uninitialized values reported
+      // by the MSan tests. It causes a performance loss, so only do it for
+      // MSan builds.
+ memset(ybf->buffer_alloc, 0, (size_t)frame_size);
+#endif
+#endif
+ } else if (frame_size > (size_t)ybf->buffer_alloc_sz) {
+ // Allocation to hold larger frame, or first allocation.
+ aom_free(ybf->buffer_alloc);
+ ybf->buffer_alloc = NULL;
+
+ if (frame_size != (size_t)frame_size) return -1;
+
+ ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size);
+ if (!ybf->buffer_alloc) return -1;
+
+ ybf->buffer_alloc_sz = (size_t)frame_size;
+
+      // This memset is needed to fix a Valgrind error from the C loop filter
+      // caused by accessing uninitialized memory in the frame border. It
+      // could be removed if the border were removed entirely.
+ memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
+ }
+
+ /* Only support allocating buffers that have a border that's a multiple
+ * of 32. The border restriction is required to get 16-byte alignment of
+ * the start of the chroma rows without introducing an arbitrary gap
+ * between planes, which would break the semantics of things like
+ * aom_img_set_rect(). */
+ if (border & 0x1f) return -3;
+
+ ybf->y_crop_width = width;
+ ybf->y_crop_height = height;
+ ybf->y_width = aligned_width;
+ ybf->y_height = aligned_height;
+ ybf->y_stride = y_stride;
+
+ ybf->uv_crop_width = (width + ss_x) >> ss_x;
+ ybf->uv_crop_height = (height + ss_y) >> ss_y;
+ ybf->uv_width = uv_width;
+ ybf->uv_height = uv_height;
+ ybf->uv_stride = uv_stride;
+
+ ybf->border = border;
+ ybf->frame_size = (size_t)frame_size;
+ ybf->subsampling_x = ss_x;
+ ybf->subsampling_y = ss_y;
+
+ buf = ybf->buffer_alloc;
+ if (use_highbitdepth) {
+ // Store uint16 addresses when using 16bit framebuffers
+ buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
+ ybf->flags = YV12_FLAG_HIGHBITDEPTH;
+ } else {
+ ybf->flags = 0;
+ }
+
+ ybf->y_buffer = (uint8_t *)yv12_align_addr(
+ buf + (border * y_stride) + border, aom_byte_align);
+ ybf->u_buffer = (uint8_t *)yv12_align_addr(
+ buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
+ aom_byte_align);
+ ybf->v_buffer =
+ (uint8_t *)yv12_align_addr(buf + yplane_size + uvplane_size +
+ (uv_border_h * uv_stride) + uv_border_w,
+ aom_byte_align);
+
+ ybf->use_external_reference_buffers = 0;
+
+ if (use_highbitdepth) {
+ if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
+ ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size);
+ if (!ybf->y_buffer_8bit) return -1;
+ } else {
+ if (ybf->y_buffer_8bit) {
+ aom_free(ybf->y_buffer_8bit);
+ ybf->y_buffer_8bit = NULL;
+ ybf->buf_8bit_valid = 0;
+ }
+ }
+
+ ybf->corrupted = 0; /* assume not corrupted by errors */
+ return 0;
+ }
+ return -2;
+}
+
+int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth, int border,
+ int byte_alignment) {
+ if (ybf) {
+ aom_free_frame_buffer(ybf);
+ return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
+ use_highbitdepth, border, byte_alignment,
+ NULL, NULL, NULL);
+ }
+ return -2;
+}
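Editorial sketch of the allocate/use/free lifecycle of the functions above (not part of the patch; the 32-pixel border and 4:2:0 subsampling are illustrative choices, and the config struct must start zeroed so the initial free inside aom_alloc_frame_buffer sees no stale pointers).

#include <string.h>

#include "aom_scale/yv12config.h"

int make_420_frame(int width, int height) {
  YV12_BUFFER_CONFIG buf;
  memset(&buf, 0, sizeof(buf)); /* required before the first allocation */
  if (aom_alloc_frame_buffer(&buf, width, height, /*ss_x=*/1, /*ss_y=*/1,
                             /*use_highbitdepth=*/0, /*border=*/32,
                             /*byte_alignment=*/0))
    return -1;
  /* ... use buf.y_buffer / buf.u_buffer / buf.v_buffer ... */
  return aom_free_frame_buffer(&buf);
}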
diff --git a/third_party/aom/aom_scale/generic/yv12extend.c b/third_party/aom/aom_scale/generic/yv12extend.c
new file mode 100644
index 000000000..ba183520a
--- /dev/null
+++ b/third_party/aom/aom_scale/generic/yv12extend.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+static void extend_plane(uint8_t *const src, int src_stride, int width,
+ int height, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i;
+ const int linesize = extend_left + extend_right + width;
+
+ /* copy the left and right most columns out */
+ uint8_t *src_ptr1 = src;
+ uint8_t *src_ptr2 = src + width - 1;
+ uint8_t *dst_ptr1 = src - extend_left;
+ uint8_t *dst_ptr2 = src + width;
+
+ for (i = 0; i < height; ++i) {
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ memset(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_stride;
+ src_ptr2 += src_stride;
+ dst_ptr1 += src_stride;
+ dst_ptr2 += src_stride;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ src_ptr1 = src - extend_left;
+ src_ptr2 = src + src_stride * (height - 1) - extend_left;
+ dst_ptr1 = src + src_stride * -extend_top - extend_left;
+ dst_ptr2 = src + src_stride * height - extend_left;
+
+ for (i = 0; i < extend_top; ++i) {
+ memcpy(dst_ptr1, src_ptr1, linesize);
+ dst_ptr1 += src_stride;
+ }
+
+ for (i = 0; i < extend_bottom; ++i) {
+ memcpy(dst_ptr2, src_ptr2, linesize);
+ dst_ptr2 += src_stride;
+ }
+}
+
+static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
+ int height, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i;
+ const int linesize = extend_left + extend_right + width;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+ /* copy the left and right most columns out */
+ uint16_t *src_ptr1 = src;
+ uint16_t *src_ptr2 = src + width - 1;
+ uint16_t *dst_ptr1 = src - extend_left;
+ uint16_t *dst_ptr2 = src + width;
+
+ for (i = 0; i < height; ++i) {
+ aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_stride;
+ src_ptr2 += src_stride;
+ dst_ptr1 += src_stride;
+ dst_ptr2 += src_stride;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ src_ptr1 = src - extend_left;
+ src_ptr2 = src + src_stride * (height - 1) - extend_left;
+ dst_ptr1 = src + src_stride * -extend_top - extend_left;
+ dst_ptr2 = src + src_stride * height - extend_left;
+
+ for (i = 0; i < extend_top; ++i) {
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+ dst_ptr1 += src_stride;
+ }
+
+ for (i = 0; i < extend_bottom; ++i) {
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+ dst_ptr2 += src_stride;
+ }
+}
+
+void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
+ const int num_planes) {
+ assert(ybf->border % 2 == 0);
+ assert(ybf->y_height - ybf->y_crop_height < 16);
+ assert(ybf->y_width - ybf->y_crop_width < 16);
+ assert(ybf->y_height - ybf->y_crop_height >= 0);
+ assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int is_uv = plane > 0;
+ const int plane_border = ybf->border >> is_uv;
+ extend_plane_high(
+ ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv],
+ ybf->crop_heights[is_uv], plane_border, plane_border,
+ plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
+ plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]);
+ }
+ return;
+ }
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int is_uv = plane > 0;
+ const int plane_border = ybf->border >> is_uv;
+ extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
+ ybf->crop_widths[is_uv], ybf->crop_heights[is_uv],
+ plane_border, plane_border,
+ plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv],
+ plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]);
+ }
+}
+
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size,
+ const int num_planes) {
+ const int ss_x = ybf->uv_width < ybf->y_width;
+ const int ss_y = ybf->uv_height < ybf->y_height;
+
+ assert(ybf->y_height - ybf->y_crop_height < 16);
+ assert(ybf->y_width - ybf->y_crop_width < 16);
+ assert(ybf->y_height - ybf->y_crop_height >= 0);
+ assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int is_uv = plane > 0;
+ const int top = ext_size >> (is_uv ? ss_y : 0);
+ const int left = ext_size >> (is_uv ? ss_x : 0);
+ const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
+ const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
+ extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv],
+ ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top,
+ left, bottom, right);
+ }
+ return;
+ }
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const int is_uv = plane > 0;
+ const int top = ext_size >> (is_uv ? ss_y : 0);
+ const int left = ext_size >> (is_uv ? ss_x : 0);
+ const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv];
+ const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv];
+ extend_plane(ybf->buffers[plane], ybf->strides[is_uv],
+ ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left,
+ bottom, right);
+ }
+}
+
+void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) {
+ extend_frame(ybf, ybf->border, num_planes);
+}
+
+void aom_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf,
+ const int num_planes) {
+ const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS)
+ ? AOMINNERBORDERINPIXELS
+ : ybf->border;
+ extend_frame(ybf, inner_bw, num_planes);
+}
+
+void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
+ int ext_size = ybf->border;
+ assert(ybf->y_height - ybf->y_crop_height < 16);
+ assert(ybf->y_width - ybf->y_crop_width < 16);
+ assert(ybf->y_height - ybf->y_crop_height >= 0);
+ assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
+ ybf->y_crop_height, ext_size, ext_size,
+ ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width);
+ return;
+ }
+ extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
+ ybf->y_crop_height, ext_size, ext_size,
+ ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width);
+}
+
+static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ memcpy(dst, src, num * sizeof(uint16_t));
+}
+
+// Copies the source image into the destination image and updates the
+// destination's UMV borders.
+// Note: The frames are assumed to be identical in size.
+void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc,
+ YV12_BUFFER_CONFIG *dst_bc, const int num_planes) {
+#if 0
+ /* These assertions are valid in the codec, but the libaom-tester uses
+ * this code slightly differently.
+ */
+ assert(src_bc->y_width == dst_bc->y_width);
+ assert(src_bc->y_height == dst_bc->y_height);
+#endif
+
+ assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) ==
+ (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH));
+
+ if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint8_t *plane_src = src_bc->buffers[plane];
+ uint8_t *plane_dst = dst_bc->buffers[plane];
+ const int is_uv = plane > 0;
+
+ for (int row = 0; row < src_bc->heights[is_uv]; ++row) {
+ memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]);
+ plane_src += src_bc->strides[is_uv];
+ plane_dst += dst_bc->strides[is_uv];
+ }
+ }
+ aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
+ return;
+ }
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const uint8_t *plane_src = src_bc->buffers[plane];
+ uint8_t *plane_dst = dst_bc->buffers[plane];
+ const int is_uv = plane > 0;
+
+ for (int row = 0; row < src_bc->heights[is_uv]; ++row) {
+ memcpy(plane_dst, plane_src, src_bc->widths[is_uv]);
+ plane_src += src_bc->strides[is_uv];
+ plane_dst += dst_bc->strides[is_uv];
+ }
+ }
+ aom_yv12_extend_frame_borders_c(dst_bc, num_planes);
+}
+
+void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc) {
+ int row;
+ const uint8_t *src = src_ybc->y_buffer;
+ uint8_t *dst = dst_ybc->y_buffer;
+
+ if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (row = 0; row < src_ybc->y_height; ++row) {
+ memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
+ src16 += src_ybc->y_stride;
+ dst16 += dst_ybc->y_stride;
+ }
+ return;
+ }
+
+ for (row = 0; row < src_ybc->y_height; ++row) {
+ memcpy(dst, src, src_ybc->y_width);
+ src += src_ybc->y_stride;
+ dst += dst_ybc->y_stride;
+ }
+}
+
+void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc,
+ YV12_BUFFER_CONFIG *dst_bc) {
+ int row;
+ const uint8_t *src = src_bc->u_buffer;
+ uint8_t *dst = dst_bc->u_buffer;
+
+ if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (row = 0; row < src_bc->uv_height; ++row) {
+ memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t));
+ src16 += src_bc->uv_stride;
+ dst16 += dst_bc->uv_stride;
+ }
+ return;
+ }
+
+ for (row = 0; row < src_bc->uv_height; ++row) {
+ memcpy(dst, src, src_bc->uv_width);
+ src += src_bc->uv_stride;
+ dst += dst_bc->uv_stride;
+ }
+}
+
+void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
+ YV12_BUFFER_CONFIG *dst_bc) {
+ int row;
+ const uint8_t *src = src_bc->v_buffer;
+ uint8_t *dst = dst_bc->v_buffer;
+
+ if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (row = 0; row < src_bc->uv_height; ++row) {
+ memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t));
+ src16 += src_bc->uv_stride;
+ dst16 += dst_bc->uv_stride;
+ }
+ return;
+ }
+
+ for (row = 0; row < src_bc->uv_height; ++row) {
+ memcpy(dst, src, src_bc->uv_width);
+ src += src_bc->uv_stride;
+ dst += dst_bc->uv_stride;
+ }
+}
+
+void aom_yv12_partial_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc, int hstart,
+ int hend, int vstart, int vend) {
+ int row;
+ const uint8_t *src = src_ybc->y_buffer;
+ uint8_t *dst = dst_ybc->y_buffer;
+
+ if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *src16 =
+ CONVERT_TO_SHORTPTR(src + vstart * src_ybc->y_stride + hstart);
+ uint16_t *dst16 =
+ CONVERT_TO_SHORTPTR(dst + vstart * dst_ybc->y_stride + hstart);
+ for (row = vstart; row < vend; ++row) {
+ memcpy(dst16, src16, (hend - hstart) * sizeof(uint16_t));
+ src16 += src_ybc->y_stride;
+ dst16 += dst_ybc->y_stride;
+ }
+ return;
+ }
+ src = (src + vstart * src_ybc->y_stride + hstart);
+ dst = (dst + vstart * dst_ybc->y_stride + hstart);
+
+ for (row = vstart; row < vend; ++row) {
+ memcpy(dst, src, (hend - hstart));
+ src += src_ybc->y_stride;
+ dst += dst_ybc->y_stride;
+ }
+}
+
+void aom_yv12_partial_copy_u_c(const YV12_BUFFER_CONFIG *src_bc,
+ YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend,
+ int vstart, int vend) {
+ int row;
+ const uint8_t *src = src_bc->u_buffer;
+ uint8_t *dst = dst_bc->u_buffer;
+
+ if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *src16 =
+ CONVERT_TO_SHORTPTR(src + vstart * src_bc->uv_stride + hstart);
+ uint16_t *dst16 =
+ CONVERT_TO_SHORTPTR(dst + vstart * dst_bc->uv_stride + hstart);
+ for (row = vstart; row < vend; ++row) {
+ memcpy(dst16, src16, (hend - hstart) * sizeof(uint16_t));
+ src16 += src_bc->uv_stride;
+ dst16 += dst_bc->uv_stride;
+ }
+ return;
+ }
+
+ src = (src + vstart * src_bc->uv_stride + hstart);
+ dst = (dst + vstart * dst_bc->uv_stride + hstart);
+
+ for (row = vstart; row < vend; ++row) {
+ memcpy(dst, src, (hend - hstart));
+ src += src_bc->uv_stride;
+ dst += dst_bc->uv_stride;
+ }
+}
+
+void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
+ YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend,
+ int vstart, int vend) {
+ int row;
+ const uint8_t *src = src_bc->v_buffer;
+ uint8_t *dst = dst_bc->v_buffer;
+
+ if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *src16 =
+ CONVERT_TO_SHORTPTR(src + vstart * src_bc->uv_stride + hstart);
+ uint16_t *dst16 =
+ CONVERT_TO_SHORTPTR(dst + vstart * dst_bc->uv_stride + hstart);
+ for (row = vstart; row < vend; ++row) {
+ memcpy(dst16, src16, (hend - hstart) * sizeof(uint16_t));
+ src16 += src_bc->uv_stride;
+ dst16 += dst_bc->uv_stride;
+ }
+ return;
+ }
+
+ src = (src + vstart * src_bc->uv_stride + hstart);
+ dst = (dst + vstart * dst_bc->uv_stride + hstart);
+
+ for (row = vstart; row < vend; ++row) {
+ memcpy(dst, src, (hend - hstart));
+ src += src_bc->uv_stride;
+ dst += dst_bc->uv_stride;
+ }
+}
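extend_plane() above fills the frame border by replicating the outermost rows and columns of the visible image. The self-contained sketch below reproduces that idea on a 2x2 image with a 1-pixel border; it is purely illustrative and uses none of the libaom types.

#include <stdio.h>
#include <string.h>

int main(void) {
  enum { W = 2, H = 2, B = 1, STRIDE = W + 2 * B };
  unsigned char buf[(H + 2 * B) * STRIDE];
  unsigned char *img = buf + B * STRIDE + B;  /* top-left visible pixel */
  img[0] = 1; img[1] = 2; img[STRIDE] = 3; img[STRIDE + 1] = 4;

  /* left/right columns: replicate the edge pixel of each visible row */
  for (int r = 0; r < H; ++r) {
    memset(img + r * STRIDE - B, img[r * STRIDE], B);
    memset(img + r * STRIDE + W, img[r * STRIDE + W - 1], B);
  }
  /* top/bottom rows: copy the (already extended) first/last rows */
  for (int r = 0; r < B; ++r) {
    memcpy(img - (r + 1) * STRIDE - B, img - B, STRIDE);
    memcpy(img + (H + r) * STRIDE - B, img + (H - 1) * STRIDE - B, STRIDE);
  }
  /* prints 4 rows of 4: every border pixel equals its nearest image pixel */
  for (int r = 0; r < H + 2 * B; ++r) {
    for (int c = 0; c < STRIDE; ++c) printf("%d ", buf[r * STRIDE + c]);
    printf("\n");
  }
  return 0;
}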
diff --git a/third_party/aom/aom_scale/mips/dspr2/yv12extend_dspr2.c b/third_party/aom/aom_scale/mips/dspr2/yv12extend_dspr2.c
new file mode 100644
index 000000000..869e594d7
--- /dev/null
+++ b/third_party/aom/aom_scale/mips/dspr2/yv12extend_dspr2.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_scale/aom_scale.h"
+
+#if HAVE_DSPR2
+static void extend_plane(uint8_t *const src, int src_stride, int width,
+ int height, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, j;
+ uint8_t *left_src, *right_src;
+ uint8_t *left_dst_start, *right_dst_start;
+ uint8_t *left_dst, *right_dst;
+ uint8_t *top_src, *bot_src;
+ uint8_t *top_dst, *bot_dst;
+ uint32_t left_pix;
+ uint32_t right_pix;
+ uint32_t linesize;
+
+  /* copy the left-most and right-most columns out */
+ left_src = src;
+ right_src = src + width - 1;
+ left_dst_start = src - extend_left;
+ right_dst_start = src + width;
+
+ for (i = height; i--;) {
+ left_dst = left_dst_start;
+ right_dst = right_dst_start;
+
+ __asm__ __volatile__(
+ "lb %[left_pix], 0(%[left_src]) \n\t"
+ "lb %[right_pix], 0(%[right_src]) \n\t"
+ "replv.qb %[left_pix], %[left_pix] \n\t"
+ "replv.qb %[right_pix], %[right_pix] \n\t"
+
+ : [left_pix] "=&r"(left_pix), [right_pix] "=&r"(right_pix)
+ : [left_src] "r"(left_src), [right_src] "r"(right_src));
+
+ for (j = extend_left / 4; j--;) {
+ __asm__ __volatile__(
+ "sw %[left_pix], 0(%[left_dst]) \n\t"
+ "sw %[right_pix], 0(%[right_dst]) \n\t"
+
+ :
+ : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix),
+ [right_dst] "r"(right_dst), [right_pix] "r"(right_pix));
+
+ left_dst += 4;
+ right_dst += 4;
+ }
+
+ for (j = extend_left % 4; j--;) {
+ __asm__ __volatile__(
+ "sb %[left_pix], 0(%[left_dst]) \n\t"
+ "sb %[right_pix], 0(%[right_dst]) \n\t"
+
+ :
+ : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix),
+ [right_dst] "r"(right_dst), [right_pix] "r"(right_pix));
+
+ left_dst += 1;
+ right_dst += 1;
+ }
+
+ left_src += src_stride;
+ right_src += src_stride;
+ left_dst_start += src_stride;
+ right_dst_start += src_stride;
+ }
+
+ /* Now copy the top and bottom lines into each line of the respective
+ * borders
+ */
+ top_src = src - extend_left;
+ bot_src = src + src_stride * (height - 1) - extend_left;
+ top_dst = src + src_stride * (-extend_top) - extend_left;
+  bot_dst = src + src_stride * height - extend_left;
+ linesize = extend_left + extend_right + width;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(top_dst, top_src, linesize);
+ top_dst += src_stride;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(bot_dst, bot_src, linesize);
+ bot_dst += src_stride;
+ }
+}
+
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
+ const int c_w = ybf->uv_crop_width;
+ const int c_h = ybf->uv_crop_height;
+ const int ss_x = ybf->uv_width < ybf->y_width;
+ const int ss_y = ybf->uv_height < ybf->y_height;
+ const int c_et = ext_size >> ss_y;
+ const int c_el = ext_size >> ss_x;
+ const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height;
+ const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width;
+
+ assert(ybf->y_height - ybf->y_crop_height < 16);
+ assert(ybf->y_width - ybf->y_crop_width < 16);
+ assert(ybf->y_height - ybf->y_crop_height >= 0);
+ assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+ extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
+ ybf->y_crop_height, ext_size, ext_size,
+ ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width);
+
+ extend_plane(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
+
+ extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
+}
+
+void aom_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+ const int num_planes) {
+ extend_frame(ybf, ybf->border, num_planes);
+}
+
+void aom_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+ const int num_planes) {
+ const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS)
+ ? AOMINNERBORDERINPIXELS
+ : ybf->border;
+ extend_frame(ybf, inner_bw, num_planes);
+}
+#endif
diff --git a/third_party/aom/aom_scale/yv12config.h b/third_party/aom/aom_scale/yv12config.h
new file mode 100644
index 000000000..2fb81acd7
--- /dev/null
+++ b/third_party/aom/aom_scale/yv12config.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_SCALE_YV12CONFIG_H_
+#define AOM_AOM_SCALE_YV12CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+
+#define AOMINNERBORDERINPIXELS 160
+#define AOM_INTERP_EXTEND 4
+
+// TODO(jingning): Use unified inter predictor for encoder and
+// decoder during the development process. Revisit the frame border
+// to improve the decoder performance.
+#if CONFIG_REDUCED_ENCODER_BORDER
+#define AOM_BORDER_IN_PIXELS 160
+#else
+#define AOM_BORDER_IN_PIXELS 288
+#endif // CONFIG_REDUCED_ENCODER_BORDER
+
+typedef struct yv12_buffer_config {
+ union {
+ struct {
+ int y_width;
+ int uv_width;
+ int alpha_width;
+ };
+ int widths[3];
+ };
+ union {
+ struct {
+ int y_height;
+ int uv_height;
+ int alpha_height;
+ };
+ int heights[3];
+ };
+ union {
+ struct {
+ int y_crop_width;
+ int uv_crop_width;
+ };
+ int crop_widths[2];
+ };
+ union {
+ struct {
+ int y_crop_height;
+ int uv_crop_height;
+ };
+ int crop_heights[2];
+ };
+ union {
+ struct {
+ int y_stride;
+ int uv_stride;
+ int alpha_stride;
+ };
+ int strides[3];
+ };
+ union {
+ struct {
+ uint8_t *y_buffer;
+ uint8_t *u_buffer;
+ uint8_t *v_buffer;
+ uint8_t *alpha_buffer;
+ };
+ uint8_t *buffers[4];
+ };
+
+  // Indicates whether y_buffer, u_buffer, and v_buffer point to internally
+  // allocated memory or to external buffers.
+ int use_external_reference_buffers;
+  // Needed to store y_buffer, u_buffer, and v_buffer when the set-reference
+  // control uses an external reference, and to restore those buffer pointers
+  // after the external reference frame is no longer used.
+ uint8_t *store_buf_adr[3];
+
+ // If the frame is stored in a 16-bit buffer, this stores an 8-bit version
+ // for use in global motion detection. It is allocated on-demand.
+ uint8_t *y_buffer_8bit;
+ int buf_8bit_valid;
+
+ uint8_t *buffer_alloc;
+ size_t buffer_alloc_sz;
+ int border;
+ size_t frame_size;
+ int subsampling_x;
+ int subsampling_y;
+ unsigned int bit_depth;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ int monochrome;
+ aom_chroma_sample_position_t chroma_sample_position;
+ aom_color_range_t color_range;
+ int render_width;
+ int render_height;
+
+ int corrupted;
+ int flags;
+} YV12_BUFFER_CONFIG;
+
+#define YV12_FLAG_HIGHBITDEPTH 8
+
+int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth, int border,
+ int byte_alignment);
+
+// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
+// be a power of 2, from 32 to 1024; 0 selects the legacy alignment. If cb is
+// not NULL, libaom uses the frame buffer callbacks to handle memory and will
+// call cb with the minimum size in bytes needed to decode the current frame.
+// If cb is NULL, libaom allocates memory internally to decode the current
+// frame. Returns 0 on success, < 0 on failure. (A usage sketch with an
+// external frame-buffer callback follows this header.)
+int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth,
+ int border, int byte_alignment,
+ aom_codec_frame_buffer_t *fb,
+ aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
+int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_SCALE_YV12CONFIG_H_
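A sketch of the external frame-buffer path described in the aom_realloc_frame_buffer() comment above: a caller-supplied callback backed by calloc(). my_get_fb and demo_realloc are hypothetical names, and the callback/buffer types are assumed to come from aom/aom_frame_buffer.h as included by this header.

#include <stdlib.h>

#include "aom/aom_frame_buffer.h"
#include "aom_scale/yv12config.h"

/* Hypothetical callback: hands libaom a buffer of at least min_size bytes. */
static int my_get_fb(void *priv, size_t min_size, aom_codec_frame_buffer_t *fb) {
  (void)priv;
  fb->data = (uint8_t *)calloc(1, min_size);
  if (!fb->data) return -1;
  fb->size = min_size;
  return 0;
}

static int demo_realloc(YV12_BUFFER_CONFIG *ybf, aom_codec_frame_buffer_t *fb) {
  /* With a non-NULL cb, libaom calls my_get_fb() for memory; with cb == NULL
   * it allocates internally. Returns 0 on success, < 0 on failure. */
  return aom_realloc_frame_buffer(ybf, 1920, 1080, /*ss_x=*/1, /*ss_y=*/1,
                                  /*use_highbitdepth=*/0, AOM_BORDER_IN_PIXELS,
                                  /*byte_alignment=*/32, fb, my_get_fb,
                                  /*cb_priv=*/NULL);
}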
diff --git a/third_party/aom/aom_util/aom_thread.c b/third_party/aom/aom_util/aom_thread.c
new file mode 100644
index 000000000..cae9f5e25
--- /dev/null
+++ b/third_party/aom/aom_util/aom_thread.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Multi-threaded worker
+//
+// Original source:
+// https://chromium.googlesource.com/webm/libwebp
+
+#include <assert.h>
+#include <string.h> // for memset()
+
+#include "aom_mem/aom_mem.h"
+#include "aom_util/aom_thread.h"
+
+#if CONFIG_MULTITHREAD
+
+struct AVxWorkerImpl {
+ pthread_mutex_t mutex_;
+ pthread_cond_t condition_;
+ pthread_t thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(AVxWorker *const worker); // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+ AVxWorker *const worker = (AVxWorker *)ptr;
+ int done = 0;
+ while (!done) {
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ while (worker->status_ == OK) { // wait in idling mode
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+ }
+ if (worker->status_ == WORK) {
+ execute(worker);
+ worker->status_ = OK;
+ } else if (worker->status_ == NOT_OK) { // finish the worker
+ done = 1;
+ }
+ // signal to the main thread that we're done (for sync())
+ pthread_cond_signal(&worker->impl_->condition_);
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ }
+ return THREAD_RETURN(NULL); // Thread is finished
+}
+
+// main thread state control
+static void change_state(AVxWorker *const worker, AVxWorkerStatus new_status) {
+ // No-op when attempting to change state on a thread that didn't come up.
+ // Checking status_ without acquiring the lock first would result in a data
+ // race.
+ if (worker->impl_ == NULL) return;
+
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ if (worker->status_ >= OK) {
+ // wait for the worker to finish
+ while (worker->status_ != OK) {
+ pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+ }
+ // assign new status and release the working thread if needed
+ if (new_status != OK) {
+ worker->status_ = new_status;
+ pthread_cond_signal(&worker->impl_->condition_);
+ }
+ }
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+}
+
+#endif // CONFIG_MULTITHREAD
+
+//------------------------------------------------------------------------------
+
+static void init(AVxWorker *const worker) {
+ memset(worker, 0, sizeof(*worker));
+ worker->status_ = NOT_OK;
+}
+
+static int sync(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ change_state(worker, OK);
+#endif
+ assert(worker->status_ <= OK);
+ return !worker->had_error;
+}
+
+static int reset(AVxWorker *const worker) {
+ int ok = 1;
+ worker->had_error = 0;
+ if (worker->status_ < OK) {
+#if CONFIG_MULTITHREAD
+ worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_));
+ if (worker->impl_ == NULL) {
+ return 0;
+ }
+ if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+ goto Error;
+ }
+ if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ goto Error;
+ }
+ pthread_mutex_lock(&worker->impl_->mutex_);
+ ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker);
+ if (ok) worker->status_ = OK;
+ pthread_mutex_unlock(&worker->impl_->mutex_);
+ if (!ok) {
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ pthread_cond_destroy(&worker->impl_->condition_);
+ Error:
+ aom_free(worker->impl_);
+ worker->impl_ = NULL;
+ return 0;
+ }
+#else
+ worker->status_ = OK;
+#endif
+ } else if (worker->status_ > OK) {
+ ok = sync(worker);
+ }
+ assert(!ok || (worker->status_ == OK));
+ return ok;
+}
+
+static void execute(AVxWorker *const worker) {
+ if (worker->hook != NULL) {
+ worker->had_error |= !worker->hook(worker->data1, worker->data2);
+ }
+}
+
+static void launch(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ change_state(worker, WORK);
+#else
+ execute(worker);
+#endif
+}
+
+static void end(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ if (worker->impl_ != NULL) {
+ change_state(worker, NOT_OK);
+ pthread_join(worker->impl_->thread_, NULL);
+ pthread_mutex_destroy(&worker->impl_->mutex_);
+ pthread_cond_destroy(&worker->impl_->condition_);
+ aom_free(worker->impl_);
+ worker->impl_ = NULL;
+ }
+#else
+ worker->status_ = NOT_OK;
+ assert(worker->impl_ == NULL);
+#endif
+ assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+static AVxWorkerInterface g_worker_interface = { init, reset, sync,
+ launch, execute, end };
+
+int aom_set_worker_interface(const AVxWorkerInterface *const winterface) {
+ if (winterface == NULL || winterface->init == NULL ||
+ winterface->reset == NULL || winterface->sync == NULL ||
+ winterface->launch == NULL || winterface->execute == NULL ||
+ winterface->end == NULL) {
+ return 0;
+ }
+ g_worker_interface = *winterface;
+ return 1;
+}
+
+const AVxWorkerInterface *aom_get_worker_interface(void) {
+ return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
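A lifecycle sketch for the worker implemented above, using the default interface: reset() spawns the thread, launch() hands it a job, sync() waits for the result, and end() joins and tears everything down. square_hook and its data are invented for illustration.

#include <stdio.h>

#include "aom_util/aom_thread.h"

/* Hypothetical hook: returns non-zero on success so had_error stays clear. */
static int square_hook(void *in, void *out) {
  *(int *)out = (*(int *)in) * (*(int *)in);
  return 1;
}

int main(void) {
  const AVxWorkerInterface *iface = aom_get_worker_interface();
  AVxWorker worker;
  int in = 7, out = 0;

  iface->init(&worker);
  if (!iface->reset(&worker)) return 1;  /* thread creation failed */
  worker.hook = square_hook;
  worker.data1 = &in;
  worker.data2 = &out;
  iface->launch(&worker);                /* runs square_hook on the worker */
  if (iface->sync(&worker)) printf("7 squared = %d\n", out);
  iface->end(&worker);
  return 0;
}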
diff --git a/third_party/aom/aom_util/aom_thread.h b/third_party/aom/aom_util/aom_thread.h
new file mode 100644
index 000000000..f14c1ac18
--- /dev/null
+++ b/third_party/aom/aom_util/aom_thread.h
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Multi-threaded worker
+//
+// Original source:
+// https://chromium.googlesource.com/webm/libwebp
+
+#ifndef AOM_AOM_UTIL_AOM_THREAD_H_
+#define AOM_AOM_UTIL_AOM_THREAD_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set the maximum number of decode threads to 8 because of the limit on frame
+// buffers and the shortage of semaphores in the emulation layer on Windows.
+#define MAX_DECODE_THREADS 8
+
+#if CONFIG_MULTITHREAD
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#include <errno.h> // NOLINT
+#include <process.h> // NOLINT
+#include <windows.h> // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
+#define USE_WINDOWS_CONDITION_VARIABLE
+typedef CONDITION_VARIABLE pthread_cond_t;
+#else
+typedef struct {
+ HANDLE waiting_sem_;
+ HANDLE received_sem_;
+ HANDLE signal_event_;
+} pthread_cond_t;
+#endif  // _WIN32_WINNT >= 0x0600
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+#if _WIN32_WINNT >= 0x0501 // Windows XP or greater
+#define WaitForSingleObject(obj, timeout) \
+ WaitForSingleObjectEx(obj, timeout, FALSE /*bAlertable*/)
+#endif
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+ unsigned int(__stdcall *start)(void *),
+ void *arg) {
+ (void)attr;
+#ifdef USE_CREATE_THREAD
+ *thread = CreateThread(NULL, /* lpThreadAttributes */
+ 0, /* dwStackSize */
+                         start, arg, 0, /* dwCreationFlags */
+ NULL); /* lpThreadId */
+#else
+ *thread = (pthread_t)_beginthreadex(NULL, /* void *security */
+ 0, /* unsigned stack_size */
+ start, arg, 0, /* unsigned initflag */
+ NULL); /* unsigned *thrdaddr */
+#endif
+ if (*thread == NULL) return 1;
+ SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+ (void)value_ptr;
+ return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 ||
+ CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void *mutexattr) {
+ (void)mutexattr;
+#if _WIN32_WINNT >= 0x0600 // Windows Vista / Server 2008 or greater
+ InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+#else
+ InitializeCriticalSection(mutex);
+#endif
+ return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ EnterCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ LeaveCriticalSection(mutex);
+ return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ DeleteCriticalSection(mutex);
+ return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ (void)condition;
+#else
+ ok &= (CloseHandle(condition->waiting_sem_) != 0);
+ ok &= (CloseHandle(condition->received_sem_) != 0);
+ ok &= (CloseHandle(condition->signal_event_) != 0);
+#endif
+ return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void *cond_attr) {
+ (void)cond_attr;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ InitializeConditionVariable(condition);
+#else
+ condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+ condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+ condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
+ if (condition->waiting_sem_ == NULL || condition->received_sem_ == NULL ||
+ condition->signal_event_ == NULL) {
+ pthread_cond_destroy(condition);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ WakeConditionVariable(condition);
+#else
+ if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+ // a thread is waiting in pthread_cond_wait: allow it to be notified
+ ok = SetEvent(condition->signal_event_);
+ // wait until the event is consumed so the signaler cannot consume
+ // the event via its own pthread_cond_wait.
+ ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+ WAIT_OBJECT_0);
+ }
+#endif
+ return !ok;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ int ok = 1;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ WakeAllConditionVariable(condition);
+#else
+ while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) {
+ // a thread is waiting in pthread_cond_wait: allow it to be notified
+ ok &= SetEvent(condition->signal_event_);
+ // wait until the event is consumed so the signaler cannot consume
+ // the event via its own pthread_cond_wait.
+ ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) !=
+ WAIT_OBJECT_0);
+ }
+#endif
+ return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok;
+#ifdef USE_WINDOWS_CONDITION_VARIABLE
+ ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+#else
+ // note that there is a consumer available so the signal isn't dropped in
+ // pthread_cond_signal
+ if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
+ // now unlock the mutex so pthread_cond_signal may be issued
+ pthread_mutex_unlock(mutex);
+ ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
+ WAIT_OBJECT_0);
+ ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL);
+ pthread_mutex_lock(mutex);
+#endif
+ return !ok;
+}
+#elif defined(__OS2__)
+#define INCL_DOS
+#include <os2.h> // NOLINT
+
+#include <errno.h> // NOLINT
+#include <stdlib.h> // NOLINT
+#include <sys/builtin.h> // NOLINT
+
+#define pthread_t TID
+#define pthread_mutex_t HMTX
+
+typedef struct {
+ HEV event_sem_;
+ HEV ack_sem_;
+ volatile unsigned wait_count_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#define THREADFN void *
+#define THREAD_RETURN(val) (val)
+
+typedef struct {
+ void *(*start_)(void *);
+ void *arg_;
+} thread_arg;
+
+static void thread_start(void *arg) {
+ thread_arg targ = *(thread_arg *)arg;
+ free(arg);
+
+ targ.start_(targ.arg_);
+}
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+ void *(*start)(void *), void *arg) {
+ int tid;
+ thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
+ if (targ == NULL) return 1;
+
+ (void)attr;
+
+ targ->start_ = start;
+ targ->arg_ = arg;
+ tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
+ if (tid == -1) {
+ free(targ);
+ return 1;
+ }
+
+ *thread = tid;
+ return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+ (void)value_ptr;
+ return DosWaitThread(&thread, DCWW_WAIT) != 0;
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+ void *mutexattr) {
+ (void)mutexattr;
+ return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+ return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+ return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+ return DosReleaseMutexSem(*mutex) != 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+ return DosCloseMutexSem(*mutex) != 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+ int ok = 1;
+ ok &= DosCloseEventSem(condition->event_sem_) == 0;
+ ok &= DosCloseEventSem(condition->ack_sem_) == 0;
+ return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+ void *cond_attr) {
+ int ok = 1;
+ (void)cond_attr;
+
+ ok &=
+ DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
+ ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
+ if (!ok) {
+ pthread_cond_destroy(condition);
+ return 1;
+ }
+ condition->wait_count_ = 0;
+ return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+ int ok = 1;
+
+ if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
+ ok &= DosPostEventSem(condition->event_sem_) == 0;
+ ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
+ }
+
+ return !ok;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+ int ok = 1;
+
+ while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
+ ok &= pthread_cond_signal(condition) == 0;
+
+ return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+ pthread_mutex_t *const mutex) {
+ int ok = 1;
+
+ __atomic_increment(&condition->wait_count_);
+
+ ok &= pthread_mutex_unlock(mutex) == 0;
+
+ ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
+
+ __atomic_decrement(&condition->wait_count_);
+
+ ok &= DosPostEventSem(condition->ack_sem_) == 0;
+
+ pthread_mutex_lock(mutex);
+
+ return !ok;
+}
+#else // _WIN32
+#include <pthread.h> // NOLINT
+#define THREADFN void *
+#define THREAD_RETURN(val) val
+#endif
+
+#endif // CONFIG_MULTITHREAD
+
+// State of the worker thread object
+typedef enum {
+ NOT_OK = 0, // object is unusable
+ OK, // ready to work
+ WORK // busy finishing the current task
+} AVxWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2). Should return true on success and false in
+// case of error.
+typedef int (*AVxWorkerHook)(void *, void *);
+
+// Platform-dependent implementation details for the worker.
+typedef struct AVxWorkerImpl AVxWorkerImpl;
+
+// Synchronization object used to launch job in the worker thread
+typedef struct {
+ AVxWorkerImpl *impl_;
+ AVxWorkerStatus status_;
+ AVxWorkerHook hook; // hook to call
+ void *data1; // first argument passed to 'hook'
+ void *data2; // second argument passed to 'hook'
+ int had_error; // true if a call to 'hook' returned false
+} AVxWorker;
+
+// The interface for all thread-worker related functions. All these functions
+// must be implemented.
+typedef struct {
+ // Must be called first, before any other method.
+ void (*init)(AVxWorker *const worker);
+ // Must be called to initialize the object and spawn the thread. Re-entrant.
+ // Will potentially launch the thread. Returns false in case of error.
+ int (*reset)(AVxWorker *const worker);
+ // Makes sure the previous work is finished. Returns true if worker->had_error
+ // was not set and no error condition was triggered by the working thread.
+ int (*sync)(AVxWorker *const worker);
+  // Triggers the thread to call hook() with data1 and data2 arguments. These
+  // hook/data1/data2 values can be changed at any time before calling this
+  // function, but must not be changed afterward until the next call to sync().
+ void (*launch)(AVxWorker *const worker);
+ // This function is similar to launch() except that it calls the
+ // hook directly instead of using a thread. Convenient to bypass the thread
+ // mechanism while still using the AVxWorker structs. sync() must
+ // still be called afterward (for error reporting).
+ void (*execute)(AVxWorker *const worker);
+ // Kill the thread and terminate the object. To use the object again, one
+ // must call reset() again.
+ void (*end)(AVxWorker *const worker);
+} AVxWorkerInterface;
+
+// Installs a new set of threading functions, overriding the defaults. This
+// should be done before any workers are started, i.e., before any encoding or
+// decoding takes place. The contents of the interface struct are copied, so it
+// is safe to free the corresponding memory after this call. This function is
+// not thread-safe. Returns false (0) if any pointer or method is invalid. (See
+// the sketch after this header.)
+int aom_set_worker_interface(const AVxWorkerInterface *const winterface);
+
+// Retrieve the currently set thread worker interface.
+const AVxWorkerInterface *aom_get_worker_interface(void);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_UTIL_AOM_THREAD_H_
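A sketch of overriding the worker interface, as the comment above describes: a serial interface that runs every hook inline, which can be handy where threads are unavailable. The struct is copied by aom_set_worker_interface(), so it may live on the stack; every name other than the libaom types and functions is hypothetical.

#include <string.h>

#include "aom_util/aom_thread.h"

/* Hypothetical single-threaded implementation of the six required methods. */
static void serial_init(AVxWorker *const w) { memset(w, 0, sizeof(*w)); }
static int serial_reset(AVxWorker *const w) {
  w->had_error = 0;
  w->status_ = OK;
  return 1;
}
static int serial_sync(AVxWorker *const w) { return !w->had_error; }
static void serial_execute(AVxWorker *const w) {
  if (w->hook) w->had_error |= !w->hook(w->data1, w->data2);
}
static void serial_launch(AVxWorker *const w) { serial_execute(w); }
static void serial_end(AVxWorker *const w) { w->status_ = NOT_OK; }

static int install_serial_interface(void) {
  const AVxWorkerInterface serial = { serial_init,    serial_reset,
                                      serial_sync,    serial_launch,
                                      serial_execute, serial_end };
  /* Must run before any encode/decode work starts; returns 0 on bad input. */
  return aom_set_worker_interface(&serial);
}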
diff --git a/third_party/aom/aom_util/aom_util.cmake b/third_party/aom/aom_util/aom_util.cmake
new file mode 100644
index 000000000..d4f3bce74
--- /dev/null
+++ b/third_party/aom/aom_util/aom_util.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_UTIL_AOM_UTIL_CMAKE_)
+ return()
+endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_
+set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1)
+
+list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_thread.c"
+ "${AOM_ROOT}/aom_util/aom_thread.h"
+ "${AOM_ROOT}/aom_util/endian_inl.h"
+ "${AOM_ROOT}/aom_util/debug_util.c"
+ "${AOM_ROOT}/aom_util/debug_util.h")
+
+# Creates the aom_util build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_util_targets)
+ add_library(aom_util OBJECT ${AOM_UTIL_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_util PARENT_SCOPE)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_util>)
+endfunction()
diff --git a/third_party/aom/aom_util/debug_util.c b/third_party/aom/aom_util/debug_util.c
new file mode 100644
index 000000000..468c47ed1
--- /dev/null
+++ b/third_party/aom/aom_util/debug_util.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include "aom_util/debug_util.h"
+
+static int frame_idx_w = 0;
+
+static int frame_idx_r = 0;
+
+void bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; }
+
+int bitstream_queue_get_frame_write(void) { return frame_idx_w; }
+
+void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; }
+
+int bitstream_queue_get_frame_read(void) { return frame_idx_r; }
+
+#if CONFIG_BITSTREAM_DEBUG
+#define QUEUE_MAX_SIZE 2000000
+static int result_queue[QUEUE_MAX_SIZE];
+static int nsymbs_queue[QUEUE_MAX_SIZE];
+static aom_cdf_prob cdf_queue[QUEUE_MAX_SIZE][16];
+
+static int queue_r = 0;
+static int queue_w = 0;
+static int queue_prev_w = -1;
+static int skip_r = 0;
+static int skip_w = 0;
+
+void bitstream_queue_set_skip_write(int skip) { skip_w = skip; }
+
+void bitstream_queue_set_skip_read(int skip) { skip_r = skip; }
+
+void bitstream_queue_record_write(void) { queue_prev_w = queue_w; }
+
+void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; }
+
+int bitstream_queue_get_write(void) { return queue_w; }
+
+int bitstream_queue_get_read(void) { return queue_r; }
+
+void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs) {
+ if (!skip_r) {
+ if (queue_w == queue_r) {
+ printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r);
+ assert(0);
+ }
+ *result = result_queue[queue_r];
+ *nsymbs = nsymbs_queue[queue_r];
+ memcpy(cdf, cdf_queue[queue_r], *nsymbs * sizeof(*cdf));
+ queue_r = (queue_r + 1) % QUEUE_MAX_SIZE;
+ }
+}
+
+void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) {
+ if (!skip_w) {
+ result_queue[queue_w] = result;
+ nsymbs_queue[queue_w] = nsymbs;
+ memcpy(cdf_queue[queue_w], cdf, nsymbs * sizeof(*cdf));
+ queue_w = (queue_w + 1) % QUEUE_MAX_SIZE;
+ if (queue_w == queue_r) {
+ printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r);
+ assert(0);
+ }
+ }
+}
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+static int frame_buf_idx_r = 0;
+static int frame_buf_idx_w = 0;
+static int max_frame_buf_num = 5;
+#define MAX_FRAME_STRIDE 1280
+#define MAX_FRAME_HEIGHT 720
+static uint16_t
+ frame_pre[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only
+static uint16_t
+ frame_tx[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm
+static int frame_stride = MAX_FRAME_STRIDE;
+static int frame_height = MAX_FRAME_HEIGHT;
+static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT;
+void mismatch_move_frame_idx_w() {
+ frame_buf_idx_w = (frame_buf_idx_w + 1) % max_frame_buf_num;
+ if (frame_buf_idx_w == frame_buf_idx_r) {
+ printf("frame_buf overflow\n");
+ assert(0);
+ }
+}
+
+void mismatch_reset_frame(int num_planes) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ memset(frame_pre[frame_buf_idx_w][plane], 0,
+ sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size);
+ memset(frame_tx[frame_buf_idx_w][plane], 0,
+ sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size);
+ }
+}
+
+void mismatch_move_frame_idx_r() {
+ if (frame_buf_idx_w == frame_buf_idx_r) {
+ printf("frame_buf underflow\n");
+ assert(0);
+ }
+ frame_buf_idx_r = (frame_buf_idx_r + 1) % max_frame_buf_num;
+}
+
+void mismatch_record_block_pre(const uint8_t *src, int src_stride,
+ int frame_offset, int plane, int pixel_c,
+ int pixel_r, int blk_w, int blk_h, int highbd) {
+ if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+ printf("frame_buf undersized\n");
+ assert(0);
+ }
+
+ const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+ for (int r = 0; r < blk_h; ++r) {
+ for (int c = 0; c < blk_w; ++c) {
+ frame_pre[frame_buf_idx_w][plane]
+ [(r + pixel_r) * frame_stride + c + pixel_c] =
+ src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+ }
+ }
+#if 0
+ int ref_frame_idx = 3;
+ int ref_frame_offset = 4;
+ int ref_plane = 1;
+ int ref_pixel_c = 162;
+ int ref_pixel_r = 16;
+ if (frame_idx_w == ref_frame_idx && plane == ref_plane &&
+ frame_offset == ref_frame_offset && ref_pixel_c >= pixel_c &&
+ ref_pixel_c < pixel_c + blk_w && ref_pixel_r >= pixel_r &&
+ ref_pixel_r < pixel_r + blk_h) {
+ printf(
+ "\nrecord_block_pre frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w "
+ "%d blk_h %d\n",
+ frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+ }
+#endif
+}
+void mismatch_record_block_tx(const uint8_t *src, int src_stride,
+ int frame_offset, int plane, int pixel_c,
+ int pixel_r, int blk_w, int blk_h, int highbd) {
+ if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+ printf("frame_buf undersized\n");
+ assert(0);
+ }
+
+ const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+ for (int r = 0; r < blk_h; ++r) {
+ for (int c = 0; c < blk_w; ++c) {
+ frame_tx[frame_buf_idx_w][plane]
+ [(r + pixel_r) * frame_stride + c + pixel_c] =
+ src16 ? src16[r * src_stride + c] : src[r * src_stride + c];
+ }
+ }
+#if 0
+ int ref_frame_idx = 3;
+ int ref_frame_offset = 4;
+ int ref_plane = 1;
+ int ref_pixel_c = 162;
+ int ref_pixel_r = 16;
+ if (frame_idx_w == ref_frame_idx && plane == ref_plane && frame_offset == ref_frame_offset &&
+ ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w &&
+ ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) {
+ printf(
+ "\nrecord_block_tx frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w "
+ "%d blk_h %d\n",
+ frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+ }
+#endif
+}
+void mismatch_check_block_pre(const uint8_t *src, int src_stride,
+ int frame_offset, int plane, int pixel_c,
+ int pixel_r, int blk_w, int blk_h, int highbd) {
+ if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+ printf("frame_buf undersized\n");
+ assert(0);
+ }
+
+ const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+ int mismatch = 0;
+ for (int r = 0; r < blk_h; ++r) {
+ for (int c = 0; c < blk_w; ++c) {
+ if (frame_pre[frame_buf_idx_r][plane]
+ [(r + pixel_r) * frame_stride + c + pixel_c] !=
+ (uint16_t)(src16 ? src16[r * src_stride + c]
+ : src[r * src_stride + c])) {
+ mismatch = 1;
+ }
+ }
+ }
+ if (mismatch) {
+ printf(
+ "\ncheck_block_pre failed frame_idx %d frame_offset %d plane %d "
+ "pixel_c %d pixel_r "
+ "%d blk_w %d blk_h %d\n",
+ frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+ printf("enc\n");
+ for (int rr = 0; rr < blk_h; ++rr) {
+ for (int cc = 0; cc < blk_w; ++cc) {
+ printf("%d ", frame_pre[frame_buf_idx_r][plane]
+ [(rr + pixel_r) * frame_stride + cc + pixel_c]);
+ }
+ printf("\n");
+ }
+
+ printf("dec\n");
+ for (int rr = 0; rr < blk_h; ++rr) {
+ for (int cc = 0; cc < blk_w; ++cc) {
+ printf("%d ",
+ src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+ }
+ printf("\n");
+ }
+ assert(0);
+ }
+}
+void mismatch_check_block_tx(const uint8_t *src, int src_stride,
+ int frame_offset, int plane, int pixel_c,
+ int pixel_r, int blk_w, int blk_h, int highbd) {
+ if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) {
+ printf("frame_buf undersized\n");
+ assert(0);
+ }
+
+ const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL;
+ int mismatch = 0;
+ for (int r = 0; r < blk_h; ++r) {
+ for (int c = 0; c < blk_w; ++c) {
+ if (frame_tx[frame_buf_idx_r][plane]
+ [(r + pixel_r) * frame_stride + c + pixel_c] !=
+ (uint16_t)(src16 ? src16[r * src_stride + c]
+ : src[r * src_stride + c])) {
+ mismatch = 1;
+ }
+ }
+ }
+ if (mismatch) {
+ printf(
+ "\ncheck_block_tx failed frame_idx %d frame_offset %d plane %d pixel_c "
+ "%d pixel_r "
+ "%d blk_w %d blk_h %d\n",
+ frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h);
+ printf("enc\n");
+ for (int rr = 0; rr < blk_h; ++rr) {
+ for (int cc = 0; cc < blk_w; ++cc) {
+ printf("%d ", frame_tx[frame_buf_idx_r][plane]
+ [(rr + pixel_r) * frame_stride + cc + pixel_c]);
+ }
+ printf("\n");
+ }
+
+ printf("dec\n");
+ for (int rr = 0; rr < blk_h; ++rr) {
+ for (int cc = 0; cc < blk_w; ++cc) {
+ printf("%d ",
+ src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]);
+ }
+ printf("\n");
+ }
+ assert(0);
+ }
+}
+#endif // CONFIG_MISMATCH_DEBUG
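A sketch of how the mismatch tool above is driven when CONFIG_MISMATCH_DEBUG is enabled: the encoder records a reconstructed block, the decoder checks the same block, and any pixel difference triggers the dump-and-assert path. The block position, size, and buffers are invented for illustration.

#include <stdint.h>

#include "aom_util/debug_util.h"

#if CONFIG_MISMATCH_DEBUG
/* Hypothetical driver pairing one encoder-side record with one decoder-side
 * check for a 16x16 luma block at the top-left corner of the frame. */
static void demo_mismatch_check(const uint8_t *enc_recon,
                                const uint8_t *dec_recon, int stride) {
  mismatch_reset_frame(/*num_planes=*/3);
  mismatch_record_block_tx(enc_recon, stride, /*frame_offset=*/0, /*plane=*/0,
                           /*pixel_c=*/0, /*pixel_r=*/0, 16, 16, /*highbd=*/0);
  mismatch_move_frame_idx_w();
  mismatch_check_block_tx(dec_recon, stride, /*frame_offset=*/0, /*plane=*/0,
                          /*pixel_c=*/0, /*pixel_r=*/0, 16, 16, /*highbd=*/0);
  mismatch_move_frame_idx_r();
}
#endif  // CONFIG_MISMATCH_DEBUG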
diff --git a/third_party/aom/aom_util/debug_util.h b/third_party/aom/aom_util/debug_util.h
new file mode 100644
index 000000000..127a8b468
--- /dev/null
+++ b/third_party/aom/aom_util/debug_util.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_UTIL_DEBUG_UTIL_H_
+#define AOM_AOM_UTIL_DEBUG_UTIL_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void bitstream_queue_set_frame_write(int frame_idx);
+int bitstream_queue_get_frame_write(void);
+void bitstream_queue_set_frame_read(int frame_idx);
+int bitstream_queue_get_frame_read(void);
+
+#if CONFIG_BITSTREAM_DEBUG
+/* This is a debug tool used to detect bitstream errors. On the encoder side,
+ * it pushes each bit and its probability into a queue before the bit is
+ * written into the arithmetic coder. On the decoder side, whenever a bit is
+ * read from the arithmetic coder, it pops the reference bit and probability
+ * from the queue as well. If the two results do not match, the tool reports
+ * an error, pinning down the bitstream error precisely. Combined with gdb's
+ * backtrace, this makes it possible to identify the module that caused the
+ * error. (A usage sketch follows this header.) */
+int bitstream_queue_get_write(void);
+int bitstream_queue_get_read(void);
+void bitstream_queue_record_write(void);
+void bitstream_queue_reset_write(void);
+void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs);
+void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs);
+void bitstream_queue_set_skip_write(int skip);
+void bitstream_queue_set_skip_read(int skip);
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+void mismatch_move_frame_idx_w(void);
+void mismatch_move_frame_idx_r(void);
+void mismatch_reset_frame(int num_planes);
+void mismatch_record_block_pre(const uint8_t *src, int src_stride,
+ int frame_offset, int plane, int pixel_c,
+ int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_record_block_tx(const uint8_t *src, int src_stride,
+ int frame_offset, int plane, int pixel_c,
+ int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_check_block_pre(const uint8_t *src, int src_stride,
+ int frame_offset, int plane, int pixel_c,
+ int pixel_r, int blk_w, int blk_h, int highbd);
+void mismatch_check_block_tx(const uint8_t *src, int src_stride,
+ int frame_offset, int plane, int pixel_c,
+ int pixel_r, int blk_w, int blk_h, int highbd);
+#endif // CONFIG_MISMATCH_DEBUG
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_UTIL_DEBUG_UTIL_H_
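A sketch of the push/pop pairing the comment above describes, assuming CONFIG_BITSTREAM_DEBUG is enabled. In the real codec the push happens inside the entropy encoder and the pop inside the entropy decoder; the symbol and CDF below are made up.

#include "aom_util/debug_util.h"

#if CONFIG_BITSTREAM_DEBUG
static void demo_bitstream_queue(void) {
  /* Encoder side: remember the symbol and the CDF it was coded with. */
  const aom_cdf_prob cdf[4] = { 16384, 24576, 28672, 32768 };  /* made up */
  bitstream_queue_push(/*result=*/2, cdf, /*nsymbs=*/4);

  /* Decoder side: pop the reference values and compare them with what the
   * arithmetic decoder actually produced; a mismatch is a bitstream error. */
  int result, nsymbs;
  aom_cdf_prob ref_cdf[16];
  bitstream_queue_pop(&result, ref_cdf, &nsymbs);
}
#endif  // CONFIG_BITSTREAM_DEBUG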
diff --git a/third_party/aom/aom_util/endian_inl.h b/third_party/aom/aom_util/endian_inl.h
new file mode 100644
index 000000000..f536ec5b8
--- /dev/null
+++ b/third_party/aom/aom_util/endian_inl.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Endian related functions.
+
+#ifndef AOM_AOM_UTIL_ENDIAN_INL_H_
+#define AOM_AOM_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+// handle clang compatibility
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+ (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+ (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
+
+#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
+ defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6)
+#define AOM_USE_MIPS32_R2
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+ return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+ return _byteswap_ushort(x);
+#else
+ // gcc will recognize a 'rorw $8, ...' here:
+ return (x >> 8) | ((x & 0xff) << 8);
+#endif // HAVE_BUILTIN_BSWAP16
+}
+
+static INLINE uint32_t BSwap32(uint32_t x) {
+#if defined(AOM_USE_MIPS32_R2)
+ uint32_t ret;
+ __asm__ volatile(
+ "wsbh %[ret], %[x] \n\t"
+ "rotr %[ret], %[ret], 16 \n\t"
+ : [ret] "=r"(ret)
+ : [x] "r"(x));
+ return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+ return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+ uint32_t swapped_bytes;
+ __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint32_t)_byteswap_ulong(x);
+#else
+ return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif // HAVE_BUILTIN_BSWAP32
+}
+
+static INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+ return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+ uint64_t swapped_bytes;
+ __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+ return swapped_bytes;
+#elif defined(_MSC_VER)
+ return (uint64_t)_byteswap_uint64(x);
+#else // generic code for swapping 64-bit values (suggested by bdb@)
+ x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+ x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+ x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8);
+ return x;
+#endif // HAVE_BUILTIN_BSWAP64
+}
+
+#endif // AOM_AOM_UTIL_ENDIAN_INL_H_
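A small sketch of the helpers above: HToLE32() is an identity on little-endian hosts and a byte swap on big-endian ones, which lets a multi-byte field be written to disk in a fixed little-endian layout. demo_write_le32 is a hypothetical helper.

#include <stdio.h>

#include "aom_util/endian_inl.h"

static void demo_write_le32(FILE *f, uint32_t value) {
  const uint32_t le = HToLE32(value);  /* swapped only on big-endian hosts */
  fwrite(&le, sizeof(le), 1, f);
}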
diff --git a/third_party/aom/apps/aomdec.c b/third_party/aom/apps/aomdec.c
new file mode 100644
index 000000000..ff13b6f50
--- /dev/null
+++ b/third_party/aom/apps/aomdec.c
@@ -0,0 +1,1046 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#if CONFIG_OS_SUPPORT
+#if HAVE_UNISTD_H
+#include <unistd.h> // NOLINT
+#elif !defined(STDOUT_FILENO)
+#define STDOUT_FILENO 1
+#endif
+#endif
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfdec.h"
+#include "common/md5_utils.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+
+#if CONFIG_WEBM_IO
+#include "common/webmdec.h"
+#endif
+
+#include "common/rawenc.h"
+#include "common/y4menc.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+static const char *exec_name;
+
+struct AvxDecInputContext {
+ struct AvxInputContext *aom_input_ctx;
+ struct ObuDecInputContext *obu_ctx;
+ struct WebmInputContext *webm_ctx;
+};
+
+static const arg_def_t help =
+ ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t looparg =
+ ARG_DEF(NULL, "loops", 1, "Number of times to decode the file");
+static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
+static const arg_def_t use_yv12 =
+ ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames");
+static const arg_def_t use_i420 =
+ ARG_DEF(NULL, "i420", 0, "Output raw I420 frames");
+static const arg_def_t flipuvarg =
+ ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output");
+static const arg_def_t rawvideo =
+ ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames");
+static const arg_def_t noblitarg =
+ ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames");
+static const arg_def_t progressarg =
+ ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes");
+static const arg_def_t limitarg =
+ ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t skiparg =
+ ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t postprocarg =
+ ARG_DEF(NULL, "postproc", 0, "Postprocess decoded frames");
+static const arg_def_t summaryarg =
+ ARG_DEF(NULL, "summary", 0, "Show timing summary");
+static const arg_def_t outputfile =
+ ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
+static const arg_def_t threadsarg =
+ ARG_DEF("t", "threads", 1, "Max threads to use");
+static const arg_def_t verbosearg =
+ ARG_DEF("v", "verbose", 0, "Show version string");
+static const arg_def_t scalearg =
+ ARG_DEF("S", "scale", 0, "Scale output frames uniformly");
+static const arg_def_t continuearg =
+ ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error");
+static const arg_def_t fb_arg =
+ ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
+static const arg_def_t md5arg =
+ ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame");
+static const arg_def_t framestatsarg =
+ ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
+static const arg_def_t outbitdeptharg =
+ ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
+static const arg_def_t isannexb =
+ ARG_DEF(NULL, "annexb", 0, "Bitstream is in Annex-B format");
+static const arg_def_t oppointarg = ARG_DEF(
+ NULL, "oppoint", 1, "Select an operating point of a scalable bitstream");
+static const arg_def_t outallarg = ARG_DEF(
+ NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream");
+static const arg_def_t skipfilmgrain =
+ ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application");
+
+static const arg_def_t *all_args[] = {
+ &help, &codecarg, &use_yv12, &use_i420,
+ &flipuvarg, &rawvideo, &noblitarg, &progressarg,
+ &limitarg, &skiparg, &postprocarg, &summaryarg,
+ &outputfile, &threadsarg, &verbosearg, &scalearg,
+ &fb_arg, &md5arg, &framestatsarg, &continuearg,
+ &outbitdeptharg, &isannexb, &oppointarg, &outallarg,
+ &skipfilmgrain, NULL
+};
+
+#if CONFIG_LIBYUV
+static INLINE int libyuv_scale(aom_image_t *src, aom_image_t *dst,
+ FilterModeEnum mode) {
+ if (src->fmt == AOM_IMG_FMT_I42016) {
+ assert(dst->fmt == AOM_IMG_FMT_I42016);
+ return I420Scale_16(
+ (uint16_t *)src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y] / 2,
+ (uint16_t *)src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U] / 2,
+ (uint16_t *)src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V] / 2,
+ src->d_w, src->d_h, (uint16_t *)dst->planes[AOM_PLANE_Y],
+ dst->stride[AOM_PLANE_Y] / 2, (uint16_t *)dst->planes[AOM_PLANE_U],
+ dst->stride[AOM_PLANE_U] / 2, (uint16_t *)dst->planes[AOM_PLANE_V],
+ dst->stride[AOM_PLANE_V] / 2, dst->d_w, dst->d_h, mode);
+ }
+ assert(src->fmt == AOM_IMG_FMT_I420);
+ assert(dst->fmt == AOM_IMG_FMT_I420);
+ return I420Scale(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
+ src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
+ src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], src->d_w,
+ src->d_h, dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y],
+ dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
+ dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], dst->d_w,
+ dst->d_h, mode);
+}
+#endif
+
+void show_help(FILE *fout, int shorthelp) {
+ fprintf(fout, "Usage: %s <options> filename\n\n", exec_name);
+
+ if (shorthelp) {
+ fprintf(fout, "Use --help to see the full list of options.\n");
+ return;
+ }
+
+ fprintf(fout, "Options:\n");
+ arg_show_usage(fout, all_args);
+ fprintf(fout,
+ "\nOutput File Patterns:\n\n"
+ " The -o argument specifies the name of the file(s) to "
+ "write to. If the\n argument does not include any escape "
+ "characters, the output will be\n written to a single file. "
+ "Otherwise, the filename will be calculated by\n expanding "
+ "the following escape characters:\n");
+ fprintf(fout,
+ "\n\t%%w - Frame width"
+ "\n\t%%h - Frame height"
+ "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
+ "\n\n Pattern arguments are only supported in conjunction "
+ "with the --yv12 and\n --i420 options. If the -o option is "
+ "not specified, the output will be\n directed to stdout.\n");
+ fprintf(fout, "\nIncluded decoders:\n\n");
+
+ for (int i = 0; i < get_aom_decoder_count(); ++i) {
+ const AvxInterface *const decoder = get_aom_decoder_by_index(i);
+ fprintf(fout, " %-6s - %s\n", decoder->name,
+ aom_codec_iface_name(decoder->codec_interface()));
+ }
+}
+
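+// Example invocations (illustrative only; file names are hypothetical):
+//   aomdec in.ivf -o out.y4m                 decode to a single Y4M file
+//   aomdec in.ivf --i420 -o frame_%3.i420    one raw I420 file per frame
+//   aomdec in.ivf --md5                      print the MD5 of the decoded output
+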
+void usage_exit(void) {
+ show_help(stderr, 1);
+ exit(EXIT_FAILURE);
+}
+
+static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
+ size_t *buffer_size) {
+ char raw_hdr[RAW_FRAME_HDR_SZ];
+ size_t frame_size = 0;
+
+ if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
+ if (!feof(infile)) warn("Failed to read RAW frame size\n");
+ } else {
+ const size_t kCorruptFrameThreshold = 256 * 1024 * 1024;
+ const size_t kFrameTooSmallThreshold = 256 * 1024;
+ frame_size = mem_get_le32(raw_hdr);
+
+ if (frame_size > kCorruptFrameThreshold) {
+ warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+ frame_size = 0;
+ }
+
+ if (frame_size < kFrameTooSmallThreshold) {
+ warn("Warning: Read invalid frame size (%u) - not a raw file?\n",
+ (unsigned int)frame_size);
+ }
+
+ if (frame_size > *buffer_size) {
+ uint8_t *new_buf = realloc(*buffer, 2 * frame_size);
+ if (new_buf) {
+ *buffer = new_buf;
+ *buffer_size = 2 * frame_size;
+ } else {
+ warn("Failed to allocate compressed data buffer\n");
+ frame_size = 0;
+ }
+ }
+ }
+
+ if (!feof(infile)) {
+ if (fread(*buffer, 1, frame_size, infile) != frame_size) {
+ warn("Failed to read full frame\n");
+ return 1;
+ }
+ *bytes_read = frame_size;
+ }
+
+ return 0;
+}
+
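+// Note (illustrative, not upstream documentation): the "raw" input handled by
+// raw_read_frame() is a sequence of frames, each preceded by a small header of
+// RAW_FRAME_HDR_SZ bytes whose leading four bytes carry the payload size as a
+// little-endian 32-bit integer (read via mem_get_le32 above).
+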
+static int read_frame(struct AvxDecInputContext *input, uint8_t **buf,
+ size_t *bytes_in_buffer, size_t *buffer_size) {
+ switch (input->aom_input_ctx->file_type) {
+#if CONFIG_WEBM_IO
+ case FILE_TYPE_WEBM:
+ return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer,
+ buffer_size);
+#endif
+ case FILE_TYPE_RAW:
+ return raw_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer,
+ buffer_size);
+ case FILE_TYPE_IVF:
+ return ivf_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer,
+ buffer_size, NULL);
+ case FILE_TYPE_OBU:
+ return obudec_read_temporal_unit(input->obu_ctx, buf, bytes_in_buffer,
+ buffer_size);
+ default: return 1;
+ }
+}
+
+static int file_is_raw(struct AvxInputContext *input) {
+ uint8_t buf[32];
+ int is_raw = 0;
+ aom_codec_stream_info_t si;
+ memset(&si, 0, sizeof(si));
+
+ if (fread(buf, 1, 32, input->file) == 32) {
+ int i;
+
+ if (mem_get_le32(buf) < 256 * 1024 * 1024) {
+ for (i = 0; i < get_aom_decoder_count(); ++i) {
+ const AvxInterface *const decoder = get_aom_decoder_by_index(i);
+ if (!aom_codec_peek_stream_info(decoder->codec_interface(), buf + 4,
+ 32 - 4, &si)) {
+ is_raw = 1;
+ input->fourcc = decoder->fourcc;
+ input->width = si.w;
+ input->height = si.h;
+ input->framerate.numerator = 30;
+ input->framerate.denominator = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ rewind(input->file);
+ return is_raw;
+}
+
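+// Note (illustrative): file_is_raw() sniffs a raw stream by skipping the
+// 4-byte frame-size field and handing the following bytes to
+// aom_codec_peek_stream_info(); since raw streams carry no timing
+// information, the frame rate defaults to 30/1.
+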
+static void show_progress(int frame_in, int frame_out, uint64_t dx_time) {
+ fprintf(stderr,
+ "%d decoded frames/%d showed frames in %" PRId64 " us (%.2f fps)\r",
+ frame_in, frame_out, dx_time,
+ (double)frame_out * 1000000.0 / (double)dx_time);
+}
+
+struct ExternalFrameBuffer {
+ uint8_t *data;
+ size_t size;
+ int in_use;
+};
+
+struct ExternalFrameBufferList {
+ int num_external_frame_buffers;
+ struct ExternalFrameBuffer *ext_fb;
+};
+
+// Callback used by libaom to request an external frame buffer. |cb_priv| is
+// the application-private data passed into the set function, |min_size| is
+// the minimum size in bytes needed to decode the next frame, and |fb| is a
+// pointer to the frame buffer to fill in.
+static int get_av1_frame_buffer(void *cb_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ int i;
+ struct ExternalFrameBufferList *const ext_fb_list =
+ (struct ExternalFrameBufferList *)cb_priv;
+ if (ext_fb_list == NULL) return -1;
+
+ // Find a free frame buffer.
+ for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) {
+ if (!ext_fb_list->ext_fb[i].in_use) break;
+ }
+
+ if (i == ext_fb_list->num_external_frame_buffers) return -1;
+
+ if (ext_fb_list->ext_fb[i].size < min_size) {
+ free(ext_fb_list->ext_fb[i].data);
+ ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t));
+ if (!ext_fb_list->ext_fb[i].data) return -1;
+
+ ext_fb_list->ext_fb[i].size = min_size;
+ }
+
+ fb->data = ext_fb_list->ext_fb[i].data;
+ fb->size = ext_fb_list->ext_fb[i].size;
+ ext_fb_list->ext_fb[i].in_use = 1;
+
+ // Set the frame buffer's private data to point at the external frame buffer.
+ fb->priv = &ext_fb_list->ext_fb[i];
+ return 0;
+}
+
+// Callback used by libaom when there are no remaining references to the frame
+// buffer. |cb_priv| is the user-private data passed into the set function and
+// |fb| is a pointer to the frame buffer being released.
+static int release_av1_frame_buffer(void *cb_priv,
+ aom_codec_frame_buffer_t *fb) {
+ struct ExternalFrameBuffer *const ext_fb =
+ (struct ExternalFrameBuffer *)fb->priv;
+ (void)cb_priv;
+ ext_fb->in_use = 0;
+ return 0;
+}
+
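+// Minimal sketch (mirrors the wiring done later in main_loop) of how the two
+// callbacks above are registered with a decoder instance; the function name
+// and parameters here are hypothetical.
+#if 0
+static int example_use_external_frame_buffers(
+    aom_codec_ctx_t *decoder, struct ExternalFrameBufferList *list, int n) {
+  list->num_external_frame_buffers = n;
+  list->ext_fb =
+      (struct ExternalFrameBuffer *)calloc(n, sizeof(*list->ext_fb));
+  if (!list->ext_fb) return -1;
+  // From now on libaom asks get_av1_frame_buffer() for frame memory and
+  // hands buffers back through release_av1_frame_buffer().
+  return aom_codec_set_frame_buffer_functions(
+      decoder, get_av1_frame_buffer, release_av1_frame_buffer, list);
+}
+#endif
+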
+static void generate_filename(const char *pattern, char *out, size_t q_len,
+ unsigned int d_w, unsigned int d_h,
+ unsigned int frame_in) {
+ const char *p = pattern;
+ char *q = out;
+
+ do {
+ char *next_pat = strchr(p, '%');
+
+ if (p == next_pat) {
+ size_t pat_len;
+
+ /* parse the pattern */
+ q[q_len - 1] = '\0';
+ switch (p[1]) {
+ case 'w': snprintf(q, q_len - 1, "%d", d_w); break;
+ case 'h': snprintf(q, q_len - 1, "%d", d_h); break;
+ case '1': snprintf(q, q_len - 1, "%d", frame_in); break;
+ case '2': snprintf(q, q_len - 1, "%02d", frame_in); break;
+ case '3': snprintf(q, q_len - 1, "%03d", frame_in); break;
+ case '4': snprintf(q, q_len - 1, "%04d", frame_in); break;
+ case '5': snprintf(q, q_len - 1, "%05d", frame_in); break;
+ case '6': snprintf(q, q_len - 1, "%06d", frame_in); break;
+ case '7': snprintf(q, q_len - 1, "%07d", frame_in); break;
+ case '8': snprintf(q, q_len - 1, "%08d", frame_in); break;
+ case '9': snprintf(q, q_len - 1, "%09d", frame_in); break;
+ default: die("Unrecognized pattern %%%c\n", p[1]); break;
+ }
+
+ pat_len = strlen(q);
+ if (pat_len >= q_len - 1) die("Output filename too long.\n");
+ q += pat_len;
+ p += 2;
+ q_len -= pat_len;
+ } else {
+ size_t copy_len;
+
+ /* copy the next segment */
+ if (!next_pat)
+ copy_len = strlen(p);
+ else
+ copy_len = next_pat - p;
+
+ if (copy_len >= q_len - 1) die("Output filename too long.\n");
+
+ memcpy(q, p, copy_len);
+ q[copy_len] = '\0';
+ q += copy_len;
+ p += copy_len;
+ q_len -= copy_len;
+ }
+ } while (*p);
+}
+
+static int is_single_file(const char *outfile_pattern) {
+ const char *p = outfile_pattern;
+
+ do {
+ p = strchr(p, '%');
+ if (p && p[1] >= '1' && p[1] <= '9')
+ return 0; // pattern contains sequence number, so it's not unique
+ if (p) p++;
+ } while (p);
+
+ return 1;
+}
+
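+// Example (illustrative): with the pattern "out_%4.yuv" and frame_in == 7,
+// generate_filename() above expands to "out_0007.yuv", and is_single_file()
+// returns 0 because the pattern contains a frame-number escape.
+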
+static void print_md5(unsigned char digest[16], const char *filename) {
+ int i;
+
+ for (i = 0; i < 16; ++i) printf("%02x", digest[i]);
+ printf(" %s\n", filename);
+}
+
+static FILE *open_outfile(const char *name) {
+ if (strcmp("-", name) == 0) {
+ set_binary_mode(stdout);
+ return stdout;
+ } else {
+ FILE *file = fopen(name, "wb");
+ if (!file) fatal("Failed to open output file '%s'", name);
+ return file;
+ }
+}
+
+static int img_shifted_realloc_required(const aom_image_t *img,
+ const aom_image_t *shifted,
+ aom_img_fmt_t required_fmt) {
+ return img->d_w != shifted->d_w || img->d_h != shifted->d_h ||
+ required_fmt != shifted->fmt;
+}
+
+static int main_loop(int argc, const char **argv_) {
+ aom_codec_ctx_t decoder;
+ char *fn = NULL;
+ int i;
+ int ret = EXIT_FAILURE;
+ uint8_t *buf = NULL;
+ size_t bytes_in_buffer = 0, buffer_size = 0;
+ FILE *infile;
+ int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
+ int do_md5 = 0, progress = 0;
+ int stop_after = 0, postproc = 0, summary = 0, quiet = 1;
+ int arg_skip = 0;
+ int keep_going = 0;
+ const AvxInterface *interface = NULL;
+ const AvxInterface *fourcc_interface = NULL;
+ uint64_t dx_time = 0;
+ struct arg arg;
+ char **argv, **argi, **argj;
+
+ int single_file;
+ int use_y4m = 1;
+ int opt_yv12 = 0;
+ int opt_i420 = 0;
+ int opt_raw = 0;
+ aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
+ unsigned int output_bit_depth = 0;
+ unsigned int is_annexb = 0;
+ int frames_corrupted = 0;
+ int dec_flags = 0;
+ int do_scale = 0;
+ int operating_point = 0;
+ int output_all_layers = 0;
+ int skip_film_grain = 0;
+ aom_image_t *scaled_img = NULL;
+ aom_image_t *img_shifted = NULL;
+ int frame_avail, got_data, flush_decoder = 0;
+ int num_external_frame_buffers = 0;
+ struct ExternalFrameBufferList ext_fb_list = { 0, NULL };
+
+ const char *outfile_pattern = NULL;
+ char outfile_name[PATH_MAX] = { 0 };
+ FILE *outfile = NULL;
+
+ FILE *framestats_file = NULL;
+
+ MD5Context md5_ctx;
+ unsigned char md5_digest[16];
+
+ struct AvxDecInputContext input = { NULL, NULL, NULL };
+ struct AvxInputContext aom_input_ctx;
+ memset(&aom_input_ctx, 0, sizeof(aom_input_ctx));
+#if CONFIG_WEBM_IO
+ struct WebmInputContext webm_ctx;
+ memset(&webm_ctx, 0, sizeof(webm_ctx));
+ input.webm_ctx = &webm_ctx;
+#endif
+ struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 };
+
+ obu_ctx.avx_ctx = &aom_input_ctx;
+ input.obu_ctx = &obu_ctx;
+ input.aom_input_ctx = &aom_input_ctx;
+
+ /* Parse command line */
+ exec_name = argv_[0];
+ argv = argv_dup(argc - 1, argv_ + 1);
+
+ for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+ memset(&arg, 0, sizeof(arg));
+ arg.argv_step = 1;
+
+ if (arg_match(&arg, &help, argi)) {
+ show_help(stdout, 0);
+ exit(EXIT_SUCCESS);
+ } else if (arg_match(&arg, &codecarg, argi)) {
+ interface = get_aom_decoder_by_name(arg.val);
+ if (!interface)
+ die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
+ } else if (arg_match(&arg, &looparg, argi)) {
+ // no-op
+ } else if (arg_match(&arg, &outputfile, argi)) {
+ outfile_pattern = arg.val;
+ } else if (arg_match(&arg, &use_yv12, argi)) {
+ use_y4m = 0;
+ flipuv = 1;
+ opt_yv12 = 1;
+ opt_i420 = 0;
+ opt_raw = 0;
+ } else if (arg_match(&arg, &use_i420, argi)) {
+ use_y4m = 0;
+ flipuv = 0;
+ opt_yv12 = 0;
+ opt_i420 = 1;
+ opt_raw = 0;
+ } else if (arg_match(&arg, &rawvideo, argi)) {
+ use_y4m = 0;
+ opt_yv12 = 0;
+ opt_i420 = 0;
+ opt_raw = 1;
+ } else if (arg_match(&arg, &flipuvarg, argi)) {
+ flipuv = 1;
+ } else if (arg_match(&arg, &noblitarg, argi)) {
+ noblit = 1;
+ } else if (arg_match(&arg, &progressarg, argi)) {
+ progress = 1;
+ } else if (arg_match(&arg, &limitarg, argi)) {
+ stop_after = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &skiparg, argi)) {
+ arg_skip = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &postprocarg, argi)) {
+ postproc = 1;
+ } else if (arg_match(&arg, &md5arg, argi)) {
+ do_md5 = 1;
+ } else if (arg_match(&arg, &framestatsarg, argi)) {
+ framestats_file = fopen(arg.val, "w");
+ if (!framestats_file) {
+ die("Error: Could not open --framestats file (%s) for writing.\n",
+ arg.val);
+ }
+ } else if (arg_match(&arg, &summaryarg, argi)) {
+ summary = 1;
+ } else if (arg_match(&arg, &threadsarg, argi)) {
+ cfg.threads = arg_parse_uint(&arg);
+#if !CONFIG_MULTITHREAD
+ if (cfg.threads > 1) {
+ die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = "
+ "0.\n",
+ cfg.threads);
+ }
+#endif
+ } else if (arg_match(&arg, &verbosearg, argi)) {
+ quiet = 0;
+ } else if (arg_match(&arg, &scalearg, argi)) {
+ do_scale = 1;
+ } else if (arg_match(&arg, &fb_arg, argi)) {
+ num_external_frame_buffers = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &continuearg, argi)) {
+ keep_going = 1;
+ } else if (arg_match(&arg, &outbitdeptharg, argi)) {
+ output_bit_depth = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &isannexb, argi)) {
+ is_annexb = 1;
+ input.obu_ctx->is_annexb = 1;
+ } else if (arg_match(&arg, &oppointarg, argi)) {
+ operating_point = arg_parse_int(&arg);
+ } else if (arg_match(&arg, &outallarg, argi)) {
+ output_all_layers = 1;
+ } else if (arg_match(&arg, &skipfilmgrain, argi)) {
+ skip_film_grain = 1;
+ } else {
+ argj++;
+ }
+ }
+
+ /* Check for unrecognized options */
+ for (argi = argv; *argi; argi++)
+ if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+ die("Error: Unrecognized option %s\n", *argi);
+
+ /* Handle non-option arguments */
+ fn = argv[0];
+
+ if (!fn) {
+ free(argv);
+ fprintf(stderr, "No input file specified!\n");
+ usage_exit();
+ }
+ /* Open file */
+ infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
+
+ if (!infile) {
+ fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin");
+ }
+#if CONFIG_OS_SUPPORT
+ /* Make sure we don't dump to the terminal, unless forced to with -o - */
+ if (!outfile_pattern && isatty(STDOUT_FILENO) && !do_md5 && !noblit) {
+ fprintf(stderr,
+ "Not dumping raw video to your terminal. Use '-o -' to "
+ "override.\n");
+ return EXIT_FAILURE;
+ }
+#endif
+ input.aom_input_ctx->filename = fn;
+ input.aom_input_ctx->file = infile;
+ if (file_is_ivf(input.aom_input_ctx))
+ input.aom_input_ctx->file_type = FILE_TYPE_IVF;
+#if CONFIG_WEBM_IO
+ else if (file_is_webm(input.webm_ctx, input.aom_input_ctx))
+ input.aom_input_ctx->file_type = FILE_TYPE_WEBM;
+#endif
+ else if (file_is_obu(&obu_ctx))
+ input.aom_input_ctx->file_type = FILE_TYPE_OBU;
+ else if (file_is_raw(input.aom_input_ctx))
+ input.aom_input_ctx->file_type = FILE_TYPE_RAW;
+ else {
+ fprintf(stderr, "Unrecognized input file type.\n");
+#if !CONFIG_WEBM_IO
+ fprintf(stderr, "aomdec was built without WebM container support.\n");
+#endif
+ return EXIT_FAILURE;
+ }
+
+ outfile_pattern = outfile_pattern ? outfile_pattern : "-";
+ single_file = is_single_file(outfile_pattern);
+
+ if (!noblit && single_file) {
+ generate_filename(outfile_pattern, outfile_name, PATH_MAX,
+ aom_input_ctx.width, aom_input_ctx.height, 0);
+ if (do_md5)
+ MD5Init(&md5_ctx);
+ else
+ outfile = open_outfile(outfile_name);
+ }
+
+ if (use_y4m && !noblit) {
+ if (!single_file) {
+ fprintf(stderr,
+ "YUV4MPEG2 not supported with output patterns,"
+ " try --i420 or --yv12 or --rawvideo.\n");
+ return EXIT_FAILURE;
+ }
+
+#if CONFIG_WEBM_IO
+ if (aom_input_ctx.file_type == FILE_TYPE_WEBM) {
+ if (webm_guess_framerate(input.webm_ctx, input.aom_input_ctx)) {
+ fprintf(stderr,
+ "Failed to guess framerate -- error parsing "
+ "webm file?\n");
+ return EXIT_FAILURE;
+ }
+ }
+#endif
+ }
+
+ fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
+ if (interface && fourcc_interface && interface != fourcc_interface)
+ warn("Header indicates codec: %s\n", fourcc_interface->name);
+ else
+ interface = fourcc_interface;
+
+ if (!interface) interface = get_aom_decoder_by_index(0);
+
+ dec_flags = (postproc ? AOM_CODEC_USE_POSTPROC : 0);
+ if (aom_codec_dec_init(&decoder, interface->codec_interface(), &cfg,
+ dec_flags)) {
+ fprintf(stderr, "Failed to initialize decoder: %s\n",
+ aom_codec_error(&decoder));
+ goto fail2;
+ }
+
+ if (!quiet) fprintf(stderr, "%s\n", decoder.name);
+
+ if (aom_codec_control(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) {
+ fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (aom_codec_control(&decoder, AV1D_SET_OPERATING_POINT, operating_point)) {
+ fprintf(stderr, "Failed to set operating_point: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (aom_codec_control(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS,
+ output_all_layers)) {
+ fprintf(stderr, "Failed to set output_all_layers: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (aom_codec_control(&decoder, AV1D_SET_SKIP_FILM_GRAIN, skip_film_grain)) {
+ fprintf(stderr, "Failed to set skip_film_grain: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+
+ if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
+ while (arg_skip) {
+ if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break;
+ arg_skip--;
+ }
+
+ if (num_external_frame_buffers > 0) {
+ ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
+ ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
+ num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));
+ if (aom_codec_set_frame_buffer_functions(&decoder, get_av1_frame_buffer,
+ release_av1_frame_buffer,
+ &ext_fb_list)) {
+ fprintf(stderr, "Failed to configure external frame buffers: %s\n",
+ aom_codec_error(&decoder));
+ goto fail;
+ }
+ }
+
+ frame_avail = 1;
+ got_data = 0;
+
+ if (framestats_file) fprintf(framestats_file, "bytes,qp\r\n");
+
+ /* Decode file */
+ while (frame_avail || got_data) {
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img;
+ struct aom_usec_timer timer;
+ int corrupted = 0;
+
+ frame_avail = 0;
+ if (!stop_after || frame_in < stop_after) {
+ if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) {
+ frame_avail = 1;
+ frame_in++;
+
+ aom_usec_timer_start(&timer);
+
+ if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) {
+ const char *detail = aom_codec_error_detail(&decoder);
+ warn("Failed to decode frame %d: %s", frame_in,
+ aom_codec_error(&decoder));
+
+ if (detail) warn("Additional information: %s", detail);
+ if (!keep_going) goto fail;
+ }
+
+ if (framestats_file) {
+ int qp;
+ if (aom_codec_control(&decoder, AOMD_GET_LAST_QUANTIZER, &qp)) {
+ warn("Failed AOMD_GET_LAST_QUANTIZER: %s",
+ aom_codec_error(&decoder));
+ if (!keep_going) goto fail;
+ }
+ fprintf(framestats_file, "%d,%d\r\n", (int)bytes_in_buffer, qp);
+ }
+
+ aom_usec_timer_mark(&timer);
+ dx_time += aom_usec_timer_elapsed(&timer);
+ } else {
+ flush_decoder = 1;
+ }
+ } else {
+ flush_decoder = 1;
+ }
+
+ aom_usec_timer_start(&timer);
+
+ if (flush_decoder) {
+ // Flush the decoder in frame parallel decode.
+ if (aom_codec_decode(&decoder, NULL, 0, NULL)) {
+ warn("Failed to flush decoder: %s", aom_codec_error(&decoder));
+ }
+ }
+
+ aom_usec_timer_mark(&timer);
+ dx_time += aom_usec_timer_elapsed(&timer);
+
+ got_data = 0;
+ while ((img = aom_codec_get_frame(&decoder, &iter))) {
+ ++frame_out;
+ got_data = 1;
+
+ if (aom_codec_control(&decoder, AOMD_GET_FRAME_CORRUPTED, &corrupted)) {
+ warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder));
+ if (!keep_going) goto fail;
+ }
+ frames_corrupted += corrupted;
+
+ if (progress) show_progress(frame_in, frame_out, dx_time);
+
+ if (!noblit) {
+ const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
+ const int PLANES_YVU[] = { AOM_PLANE_Y, AOM_PLANE_V, AOM_PLANE_U };
+ const int *planes = flipuv ? PLANES_YVU : PLANES_YUV;
+
+ if (do_scale) {
+ if (frame_out == 1) {
+ // If the output frames are to be scaled to a fixed display size
+ // then use the width and height specified in the container. If
+ // either of these is set to 0, use the display size set in the
+ // first frame header. If that is unavailable, use the raw decoded
+ // size of the first decoded frame.
+ int render_width = aom_input_ctx.width;
+ int render_height = aom_input_ctx.height;
+ if (!render_width || !render_height) {
+ int render_size[2];
+ if (aom_codec_control(&decoder, AV1D_GET_DISPLAY_SIZE,
+ render_size)) {
+                  // As a last resort, use the first decoded frame's size.
+ render_width = img->d_w;
+ render_height = img->d_h;
+ } else {
+ render_width = render_size[0];
+ render_height = render_size[1];
+ }
+ }
+ scaled_img =
+ aom_img_alloc(NULL, img->fmt, render_width, render_height, 16);
+ scaled_img->bit_depth = img->bit_depth;
+ scaled_img->monochrome = img->monochrome;
+ scaled_img->csp = img->csp;
+ }
+
+ if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+#if CONFIG_LIBYUV
+ libyuv_scale(img, scaled_img, kFilterBox);
+ img = scaled_img;
+#else
+ fprintf(
+ stderr,
+ "Failed to scale output frame: %s.\n"
+ "libyuv is required for scaling but is currently disabled.\n"
+ "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n",
+ aom_codec_error(&decoder));
+ goto fail;
+#endif
+ }
+ }
+ // Default to codec bit depth if output bit depth not set
+ if (!output_bit_depth && single_file && !do_md5) {
+ output_bit_depth = img->bit_depth;
+ }
+ // Shift up or down if necessary
+ if (output_bit_depth != 0) {
+ const aom_img_fmt_t shifted_fmt =
+ output_bit_depth == 8 ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH
+ : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH;
+
+ if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) {
+ if (img_shifted &&
+ img_shifted_realloc_required(img, img_shifted, shifted_fmt)) {
+ aom_img_free(img_shifted);
+ img_shifted = NULL;
+ }
+ if (img_shifted) {
+ img_shifted->monochrome = img->monochrome;
+ }
+ if (!img_shifted) {
+ img_shifted =
+ aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
+ img_shifted->bit_depth = output_bit_depth;
+ img_shifted->monochrome = img->monochrome;
+ img_shifted->csp = img->csp;
+ }
+ if (output_bit_depth > img->bit_depth) {
+ aom_img_upshift(img_shifted, img,
+ output_bit_depth - img->bit_depth);
+ } else {
+ aom_img_downshift(img_shifted, img,
+ img->bit_depth - output_bit_depth);
+ }
+ img = img_shifted;
+ }
+ }
+
+ aom_input_ctx.width = img->d_w;
+ aom_input_ctx.height = img->d_h;
+
+ int num_planes = (opt_raw && img->monochrome) ? 1 : 3;
+ if (single_file) {
+ if (use_y4m) {
+ char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
+ size_t len = 0;
+ if (frame_out == 1) {
+ // Y4M file header
+ len = y4m_write_file_header(
+ y4m_buf, sizeof(y4m_buf), aom_input_ctx.width,
+ aom_input_ctx.height, &aom_input_ctx.framerate,
+ img->monochrome, img->csp, img->fmt, img->bit_depth);
+ if (do_md5) {
+ MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
+ } else {
+ fputs(y4m_buf, outfile);
+ }
+ }
+
+ // Y4M frame header
+ len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf));
+ if (do_md5) {
+ MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len);
+ y4m_update_image_md5(img, planes, &md5_ctx);
+ } else {
+ fputs(y4m_buf, outfile);
+ y4m_write_image_file(img, planes, outfile);
+ }
+ } else {
+ if (frame_out == 1) {
+ // Check if --yv12 or --i420 options are consistent with the
+ // bit-stream decoded
+ if (opt_i420) {
+ if (img->fmt != AOM_IMG_FMT_I420 &&
+ img->fmt != AOM_IMG_FMT_I42016) {
+ fprintf(stderr,
+ "Cannot produce i420 output for bit-stream.\n");
+ goto fail;
+ }
+ }
+ if (opt_yv12) {
+ if ((img->fmt != AOM_IMG_FMT_I420 &&
+ img->fmt != AOM_IMG_FMT_YV12) ||
+ img->bit_depth != 8) {
+ fprintf(stderr,
+ "Cannot produce yv12 output for bit-stream.\n");
+ goto fail;
+ }
+ }
+ }
+ if (do_md5) {
+ raw_update_image_md5(img, planes, num_planes, &md5_ctx);
+ } else {
+ raw_write_image_file(img, planes, num_planes, outfile);
+ }
+ }
+ } else {
+ generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w,
+ img->d_h, frame_in);
+ if (do_md5) {
+ MD5Init(&md5_ctx);
+ if (use_y4m) {
+ y4m_update_image_md5(img, planes, &md5_ctx);
+ } else {
+ raw_update_image_md5(img, planes, num_planes, &md5_ctx);
+ }
+ MD5Final(md5_digest, &md5_ctx);
+ print_md5(md5_digest, outfile_name);
+ } else {
+ outfile = open_outfile(outfile_name);
+ if (use_y4m) {
+ y4m_write_image_file(img, planes, outfile);
+ } else {
+ raw_write_image_file(img, planes, num_planes, outfile);
+ }
+ fclose(outfile);
+ }
+ }
+ }
+ }
+ }
+
+ if (summary || progress) {
+ show_progress(frame_in, frame_out, dx_time);
+ fprintf(stderr, "\n");
+ }
+
+ if (frames_corrupted) {
+ fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted);
+ } else {
+ ret = EXIT_SUCCESS;
+ }
+
+fail:
+
+ if (aom_codec_destroy(&decoder)) {
+ fprintf(stderr, "Failed to destroy decoder: %s\n",
+ aom_codec_error(&decoder));
+ }
+
+fail2:
+
+ if (!noblit && single_file) {
+ if (do_md5) {
+ MD5Final(md5_digest, &md5_ctx);
+ print_md5(md5_digest, outfile_name);
+ } else {
+ fclose(outfile);
+ }
+ }
+
+#if CONFIG_WEBM_IO
+ if (input.aom_input_ctx->file_type == FILE_TYPE_WEBM)
+ webm_free(input.webm_ctx);
+#endif
+ if (input.aom_input_ctx->file_type == FILE_TYPE_OBU)
+ obudec_free(input.obu_ctx);
+
+ if (input.aom_input_ctx->file_type != FILE_TYPE_WEBM) free(buf);
+
+ if (scaled_img) aom_img_free(scaled_img);
+ if (img_shifted) aom_img_free(img_shifted);
+
+ for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) {
+ free(ext_fb_list.ext_fb[i].data);
+ }
+ free(ext_fb_list.ext_fb);
+
+ fclose(infile);
+ if (framestats_file) fclose(framestats_file);
+
+ free(argv);
+
+ return ret;
+}
+
+int main(int argc, const char **argv_) {
+ unsigned int loops = 1, i;
+ char **argv, **argi, **argj;
+ struct arg arg;
+ int error = 0;
+
+ argv = argv_dup(argc - 1, argv_ + 1);
+ for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+ memset(&arg, 0, sizeof(arg));
+ arg.argv_step = 1;
+
+ if (arg_match(&arg, &looparg, argi)) {
+ loops = arg_parse_uint(&arg);
+ break;
+ }
+ }
+ free(argv);
+ for (i = 0; !error && i < loops; i++) error = main_loop(argc, argv_);
+ return error;
+}
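+
+// Example (illustrative): the --loops option above simply re-runs main_loop(),
+// which is handy for benchmarking, e.g.:
+//   aomdec --loops=5 --noblit --summary in.ivf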
diff --git a/third_party/aom/apps/aomenc.c b/third_party/aom/apps/aomenc.c
new file mode 100644
index 000000000..2e5d35cfe
--- /dev/null
+++ b/third_party/aom/apps/aomenc.c
@@ -0,0 +1,2391 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "apps/aomenc.h"
+
+#include "config/aom_config.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if CONFIG_AV1_DECODER
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#endif
+
+#include "aom/aom_encoder.h"
+#include "aom/aom_integer.h"
+#include "aom/aomcx.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfenc.h"
+#include "common/tools_common.h"
+#include "common/warnings.h"
+
+#if CONFIG_WEBM_IO
+#include "common/webmenc.h"
+#endif
+
+#include "common/y4minput.h"
+#include "examples/encoder_util.h"
+#include "stats/aomstats.h"
+#include "stats/rate_hist.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+/* Swallow warnings about unused results of fread/fwrite */
+static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
+ return fread(ptr, size, nmemb, stream);
+}
+#define fread wrap_fread
+
+static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
+ FILE *stream) {
+ return fwrite(ptr, size, nmemb, stream);
+}
+#define fwrite wrap_fwrite
+
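+// Note (illustrative): the wrappers above exist because glibc (with
+// _FORTIFY_SOURCE) declares fread/fwrite with warn_unused_result; routing
+// calls through a plain wrapper lets call sites ignore the return value
+// without triggering that warning.
+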
+static const char *exec_name;
+
+static void warn_or_exit_on_errorv(aom_codec_ctx_t *ctx, int fatal,
+ const char *s, va_list ap) {
+ if (ctx->err) {
+ const char *detail = aom_codec_error_detail(ctx);
+
+ vfprintf(stderr, s, ap);
+ fprintf(stderr, ": %s\n", aom_codec_error(ctx));
+
+ if (detail) fprintf(stderr, " %s\n", detail);
+
+ if (fatal) exit(EXIT_FAILURE);
+ }
+}
+
+static void ctx_exit_on_error(aom_codec_ctx_t *ctx, const char *s, ...) {
+ va_list ap;
+
+ va_start(ap, s);
+ warn_or_exit_on_errorv(ctx, 1, s, ap);
+ va_end(ap);
+}
+
+static void warn_or_exit_on_error(aom_codec_ctx_t *ctx, int fatal,
+ const char *s, ...) {
+ va_list ap;
+
+ va_start(ap, s);
+ warn_or_exit_on_errorv(ctx, fatal, s, ap);
+ va_end(ap);
+}
+
+static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) {
+ FILE *f = input_ctx->file;
+ y4m_input *y4m = &input_ctx->y4m;
+ int shortread = 0;
+
+ if (input_ctx->file_type == FILE_TYPE_Y4M) {
+ if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0;
+ } else {
+ shortread = read_yuv_frame(input_ctx, img);
+ }
+
+ return !shortread;
+}
+
+static int file_is_y4m(const char detect[4]) {
+ if (memcmp(detect, "YUV4", 4) == 0) {
+ return 1;
+ }
+ return 0;
+}
+
+static int fourcc_is_ivf(const char detect[4]) {
+ if (memcmp(detect, "DKIF", 4) == 0) {
+ return 1;
+ }
+ return 0;
+}
+
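+// Note (illustrative): these two helpers sniff the input by its first four
+// bytes -- Y4M files start with "YUV4" (from the "YUV4MPEG2" signature) and
+// IVF files start with the "DKIF" fourcc.
+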
+static const arg_def_t help =
+ ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t debugmode =
+ ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)");
+static const arg_def_t outputfile =
+ ARG_DEF("o", "output", 1, "Output filename");
+static const arg_def_t use_yv12 =
+ ARG_DEF(NULL, "yv12", 0, "Input file is YV12 ");
+static const arg_def_t use_i420 =
+ ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)");
+static const arg_def_t use_i422 =
+ ARG_DEF(NULL, "i422", 0, "Input file is I422");
+static const arg_def_t use_i444 =
+ ARG_DEF(NULL, "i444", 0, "Input file is I444");
+static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
+static const arg_def_t passes =
+ ARG_DEF("p", "passes", 1, "Number of passes (1/2)");
+static const arg_def_t pass_arg =
+ ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
+static const arg_def_t fpf_name =
+ ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
+#if CONFIG_FP_MB_STATS
+static const arg_def_t fpmbf_name =
+ ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name");
+#endif
+static const arg_def_t limit =
+ ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames");
+static const arg_def_t skip =
+ ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t good_dl =
+ ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline");
+static const arg_def_t quietarg =
+ ARG_DEF("q", "quiet", 0, "Do not print encode progress");
+static const arg_def_t verbosearg =
+ ARG_DEF("v", "verbose", 0, "Show encoder parameters");
+static const arg_def_t psnrarg =
+ ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line");
+#if CONFIG_FILEOPTIONS
+static const arg_def_t use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use");
+static const arg_def_t ext_partition =
+ ARG_DEF(NULL, "ext-partition", 1, "corresponds to extended partitions");
+#endif
+
+static const struct arg_enum_list test_decode_enum[] = {
+ { "off", TEST_DECODE_OFF },
+ { "fatal", TEST_DECODE_FATAL },
+ { "warn", TEST_DECODE_WARN },
+ { NULL, 0 }
+};
+static const arg_def_t recontest = ARG_DEF_ENUM(
+ NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum);
+static const arg_def_t framerate =
+ ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)");
+static const arg_def_t use_webm =
+ ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)");
+static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF");
+static const arg_def_t use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU");
+static const arg_def_t q_hist_n =
+ ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)");
+static const arg_def_t rate_hist_n =
+ ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)");
+static const arg_def_t disable_warnings =
+ ARG_DEF(NULL, "disable-warnings", 0,
+ "Disable warnings about potentially incorrect encode settings.");
+static const arg_def_t disable_warning_prompt =
+ ARG_DEF("y", "disable-warning-prompt", 0,
+ "Display warnings, but do not prompt user to continue.");
+static const struct arg_enum_list bitdepth_enum[] = {
+ { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 }
+};
+
+static const arg_def_t bitdeptharg = ARG_DEF_ENUM(
+ "b", "bit-depth", 1,
+ "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)",
+ bitdepth_enum);
+static const arg_def_t inbitdeptharg =
+ ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input");
+
+static const arg_def_t input_chroma_subsampling_x = ARG_DEF(
+ NULL, "input-chroma-subsampling-x", 1, "chroma subsampling x value.");
+static const arg_def_t input_chroma_subsampling_y = ARG_DEF(
+ NULL, "input-chroma-subsampling-y", 1, "chroma subsampling y value.");
+
+static const arg_def_t *main_args[] = { &help,
+#if CONFIG_FILEOPTIONS
+ &use_cfg,
+#endif
+ &debugmode,
+ &outputfile,
+ &codecarg,
+ &passes,
+ &pass_arg,
+ &fpf_name,
+ &limit,
+ &skip,
+ &good_dl,
+ &quietarg,
+ &verbosearg,
+ &psnrarg,
+ &use_webm,
+ &use_ivf,
+ &use_obu,
+ &q_hist_n,
+ &rate_hist_n,
+ &disable_warnings,
+ &disable_warning_prompt,
+ &recontest,
+ NULL };
+
+static const arg_def_t usage =
+ ARG_DEF("u", "usage", 1, "Usage profile number to use");
+static const arg_def_t threads =
+ ARG_DEF("t", "threads", 1, "Max number of threads to use");
+static const arg_def_t profile =
+ ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use");
+static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width");
+static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height");
+static const arg_def_t forced_max_frame_width = ARG_DEF(
+ NULL, "forced_max_frame_width", 0, "Maximum frame width value to force");
+static const arg_def_t forced_max_frame_height = ARG_DEF(
+ NULL, "forced_max_frame_height", 0, "Maximum frame height value to force");
+#if CONFIG_WEBM_IO
+static const struct arg_enum_list stereo_mode_enum[] = {
+ { "mono", STEREO_FORMAT_MONO },
+ { "left-right", STEREO_FORMAT_LEFT_RIGHT },
+ { "bottom-top", STEREO_FORMAT_BOTTOM_TOP },
+ { "top-bottom", STEREO_FORMAT_TOP_BOTTOM },
+ { "right-left", STEREO_FORMAT_RIGHT_LEFT },
+ { NULL, 0 }
+};
+static const arg_def_t stereo_mode = ARG_DEF_ENUM(
+ NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum);
+#endif
+static const arg_def_t timebase = ARG_DEF(
+ NULL, "timebase", 1, "Output timestamp precision (fractional seconds)");
+static const arg_def_t global_error_resilient =
+ ARG_DEF(NULL, "global-error-resilient", 1,
+ "Enable global error resiliency features");
+static const arg_def_t lag_in_frames =
+ ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag");
+static const arg_def_t large_scale_tile =
+ ARG_DEF(NULL, "large-scale-tile", 1,
+ "Large scale tile coding (0: off (default), 1: on)");
+static const arg_def_t monochrome =
+ ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)");
+static const arg_def_t full_still_picture_hdr = ARG_DEF(
+ NULL, "full-still-picture-hdr", 0, "Use full header for still picture");
+
+static const arg_def_t *global_args[] = { &use_yv12,
+ &use_i420,
+ &use_i422,
+ &use_i444,
+ &usage,
+ &threads,
+ &profile,
+ &width,
+ &height,
+ &forced_max_frame_width,
+ &forced_max_frame_height,
+#if CONFIG_WEBM_IO
+ &stereo_mode,
+#endif
+ &timebase,
+ &framerate,
+ &global_error_resilient,
+ &bitdeptharg,
+ &lag_in_frames,
+ &large_scale_tile,
+ &monochrome,
+ &full_still_picture_hdr,
+ NULL };
+
+static const arg_def_t dropframe_thresh =
+ ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)");
+static const arg_def_t resize_mode =
+ ARG_DEF(NULL, "resize-mode", 1, "Frame resize mode");
+static const arg_def_t resize_denominator =
+ ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator");
+static const arg_def_t resize_kf_denominator = ARG_DEF(
+ NULL, "resize-kf-denominator", 1, "Frame resize keyframe denominator");
+static const arg_def_t superres_mode =
+ ARG_DEF(NULL, "superres-mode", 1, "Frame super-resolution mode");
+static const arg_def_t superres_denominator = ARG_DEF(
+ NULL, "superres-denominator", 1, "Frame super-resolution denominator");
+static const arg_def_t superres_kf_denominator =
+ ARG_DEF(NULL, "superres-kf-denominator", 1,
+ "Frame super-resolution keyframe denominator");
+static const arg_def_t superres_qthresh = ARG_DEF(
+ NULL, "superres-qthresh", 1, "Frame super-resolution qindex threshold");
+static const arg_def_t superres_kf_qthresh =
+ ARG_DEF(NULL, "superres-kf-qthresh", 1,
+ "Frame super-resolution keyframe qindex threshold");
+static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR },
+ { "cbr", AOM_CBR },
+ { "cq", AOM_CQ },
+ { "q", AOM_Q },
+ { NULL, 0 } };
+static const arg_def_t end_usage =
+ ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum);
+static const arg_def_t target_bitrate =
+ ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)");
+static const arg_def_t min_quantizer =
+ ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer");
+static const arg_def_t max_quantizer =
+ ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer");
+static const arg_def_t undershoot_pct =
+ ARG_DEF(NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)");
+static const arg_def_t overshoot_pct =
+ ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)");
+static const arg_def_t buf_sz =
+ ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)");
+static const arg_def_t buf_initial_sz =
+ ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)");
+static const arg_def_t buf_optimal_sz =
+ ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)");
+static const arg_def_t *rc_args[] = { &dropframe_thresh,
+ &resize_mode,
+ &resize_denominator,
+ &resize_kf_denominator,
+ &superres_mode,
+ &superres_denominator,
+ &superres_kf_denominator,
+ &superres_qthresh,
+ &superres_kf_qthresh,
+ &end_usage,
+ &target_bitrate,
+ &min_quantizer,
+ &max_quantizer,
+ &undershoot_pct,
+ &overshoot_pct,
+ &buf_sz,
+ &buf_initial_sz,
+ &buf_optimal_sz,
+ NULL };
+
+static const arg_def_t bias_pct =
+ ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)");
+static const arg_def_t minsection_pct =
+ ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)");
+static const arg_def_t maxsection_pct =
+ ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)");
+static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct,
+ &maxsection_pct, NULL };
+static const arg_def_t fwd_kf_enabled =
+ ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes");
+static const arg_def_t kf_min_dist =
+ ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)");
+static const arg_def_t kf_max_dist =
+ ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)");
+static const arg_def_t kf_disabled =
+ ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement");
+static const arg_def_t *kf_args[] = { &fwd_kf_enabled, &kf_min_dist,
+ &kf_max_dist, &kf_disabled, NULL };
+static const arg_def_t sframe_dist =
+ ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval (frames)");
+static const arg_def_t sframe_mode =
+ ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)");
+static const arg_def_t save_as_annexb =
+ ARG_DEF(NULL, "annexb", 1, "Save as Annex-B");
+static const arg_def_t noise_sens =
+ ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)");
+static const arg_def_t sharpness =
+ ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
+static const arg_def_t static_thresh =
+ ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold");
+static const arg_def_t auto_altref =
+ ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames");
+static const arg_def_t arnr_maxframes =
+ ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)");
+static const arg_def_t arnr_strength =
+ ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)");
+static const struct arg_enum_list tuning_enum[] = {
+ { "psnr", AOM_TUNE_PSNR },
+ { "ssim", AOM_TUNE_SSIM },
+#ifdef CONFIG_DIST_8X8
+ { "cdef-dist", AOM_TUNE_CDEF_DIST },
+ { "daala-dist", AOM_TUNE_DAALA_DIST },
+#endif
+ { NULL, 0 }
+};
+static const arg_def_t tune_metric =
+ ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with", tuning_enum);
+static const arg_def_t cq_level =
+ ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level");
+static const arg_def_t max_intra_rate_pct =
+ ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)");
+
+#if CONFIG_AV1_ENCODER
+static const arg_def_t cpu_used_av1 =
+ ARG_DEF(NULL, "cpu-used", 1, "CPU Used (0..8)");
+static const arg_def_t rowmtarg =
+ ARG_DEF(NULL, "row-mt", 1,
+ "Enable row based multi-threading (0: off (default), 1: on)");
+static const arg_def_t tile_cols =
+ ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
+static const arg_def_t tile_rows =
+ ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+static const arg_def_t tile_width =
+ ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)");
+static const arg_def_t tile_height =
+ ARG_DEF(NULL, "tile-height", 1, "Tile heights (command separated)");
+static const arg_def_t lossless =
+ ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
+static const arg_def_t enable_cdef =
+ ARG_DEF(NULL, "enable-cdef", 1,
+ "Enable the constrained directional enhancement filter (0: false, "
+ "1: true (default))");
+static const arg_def_t enable_restoration =
+ ARG_DEF(NULL, "enable-restoration", 1,
+ "Enable the loop restoration filter (0: false, "
+ "1: true (default))");
+static const arg_def_t disable_trellis_quant =
+ ARG_DEF(NULL, "disable-trellis-quant", 1,
+ "Disable trellis optimization of quantized coefficients (0: false ("
+ "default) 1: true)");
+static const arg_def_t enable_qm =
+ ARG_DEF(NULL, "enable-qm", 1,
+ "Enable quantisation matrices (0: false (default), 1: true)");
+static const arg_def_t qm_min = ARG_DEF(
+ NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 8");
+static const arg_def_t qm_max = ARG_DEF(
+ NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15");
+#if CONFIG_DIST_8X8
+static const arg_def_t enable_dist_8x8 =
+ ARG_DEF(NULL, "enable-dist-8x8", 1,
+ "Enable dist-8x8 (0: false (default), 1: true)");
+#endif // CONFIG_DIST_8X8
+static const arg_def_t num_tg = ARG_DEF(
+ NULL, "num-tile-groups", 1, "Maximum number of tile groups, default is 1");
+static const arg_def_t mtu_size =
+ ARG_DEF(NULL, "mtu-size", 1,
+ "MTU size for a tile group, default is 0 (no MTU targeting), "
+ "overrides maximum number of tile groups");
+static const struct arg_enum_list timing_info_enum[] = {
+ { "unspecified", AOM_TIMING_UNSPECIFIED },
+ { "constant", AOM_TIMING_EQUAL },
+ { "model", AOM_TIMING_DEC_MODEL },
+ { NULL, 0 }
+};
+static const arg_def_t timing_info =
+ ARG_DEF_ENUM(NULL, "timing-info", 1,
+ "Signal timing info in the bitstream (model unly works for no "
+ "hidden frames, no super-res yet):",
+ timing_info_enum);
+static const arg_def_t film_grain_test =
+ ARG_DEF(NULL, "film-grain-test", 1,
+ "Film grain test vectors (0: none (default), 1: test-1 2: test-2, "
+ "... 16: test-16)");
+static const arg_def_t film_grain_table =
+ ARG_DEF(NULL, "film-grain-table", 1,
+ "Path to file containing film grain parameters");
+#if CONFIG_DENOISE
+static const arg_def_t denoise_noise_level =
+ ARG_DEF(NULL, "denoise-noise-level", 1,
+ "Amount of noise (from 0 = don't denoise, to 50)");
+static const arg_def_t denoise_block_size =
+ ARG_DEF(NULL, "denoise-block-size", 1, "Denoise block size (default = 32)");
+#endif
+static const arg_def_t enable_ref_frame_mvs =
+ ARG_DEF(NULL, "enable-ref-frame-mvs", 1,
+ "Enable temporal mv prediction (default is 1)");
+static const arg_def_t frame_parallel_decoding =
+ ARG_DEF(NULL, "frame-parallel", 1,
+ "Enable frame parallel decodability features "
+ "(0: false (default), 1: true)");
+static const arg_def_t error_resilient_mode =
+ ARG_DEF(NULL, "error-resilient", 1,
+ "Enable error resilient features "
+ "(0: false (default), 1: true)");
+static const arg_def_t aq_mode = ARG_DEF(
+ NULL, "aq-mode", 1,
+ "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, "
+ "3: cyclic refresh)");
+static const arg_def_t deltaq_mode = ARG_DEF(
+ NULL, "deltaq-mode", 1,
+ "Delta qindex mode (0: off (default), 1: deltaq 2: deltaq + deltalf)");
+static const arg_def_t frame_periodic_boost =
+ ARG_DEF(NULL, "frame-boost", 1,
+ "Enable frame periodic boost (0: off (default), 1: on)");
+static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
+ NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
+static const arg_def_t max_inter_rate_pct =
+ ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
+static const arg_def_t min_gf_interval = ARG_DEF(
+ NULL, "min-gf-interval", 1,
+ "min gf/arf frame interval (default 0, indicating in-built behavior)");
+static const arg_def_t max_gf_interval = ARG_DEF(
+ NULL, "max-gf-interval", 1,
+ "max gf/arf frame interval (default 0, indicating in-built behavior)");
+
+static const struct arg_enum_list color_primaries_enum[] = {
+ { "bt709", AOM_CICP_CP_BT_709 },
+ { "unspecified", AOM_CICP_CP_UNSPECIFIED },
+ { "bt601", AOM_CICP_CP_BT_601 },
+ { "bt470m", AOM_CICP_CP_BT_470_M },
+ { "bt470bg", AOM_CICP_CP_BT_470_B_G },
+ { "smpte240", AOM_CICP_CP_SMPTE_240 },
+ { "film", AOM_CICP_CP_GENERIC_FILM },
+ { "bt2020", AOM_CICP_CP_BT_2020 },
+ { "xyz", AOM_CICP_CP_XYZ },
+ { "smpte431", AOM_CICP_CP_SMPTE_431 },
+ { "smpte432", AOM_CICP_CP_SMPTE_432 },
+ { "ebu3213", AOM_CICP_CP_EBU_3213 },
+ { NULL, 0 }
+};
+
+static const arg_def_t input_color_primaries = ARG_DEF_ENUM(
+ NULL, "color-primaries", 1,
+ "Color primaries (CICP) of input content:", color_primaries_enum);
+
+static const struct arg_enum_list transfer_characteristics_enum[] = {
+ { "unspecified", AOM_CICP_CP_UNSPECIFIED },
+ { "bt709", AOM_CICP_TC_BT_709 },
+ { "bt470m", AOM_CICP_TC_BT_470_M },
+ { "bt470bg", AOM_CICP_TC_BT_470_B_G },
+ { "bt601", AOM_CICP_TC_BT_601 },
+ { "smpte240", AOM_CICP_TC_SMPTE_240 },
+ { "lin", AOM_CICP_TC_LINEAR },
+ { "log100", AOM_CICP_TC_LOG_100 },
+ { "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 },
+ { "iec61966", AOM_CICP_TC_IEC_61966 },
+ { "bt1361", AOM_CICP_TC_BT_1361 },
+ { "srgb", AOM_CICP_TC_SRGB },
+ { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT },
+ { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT },
+ { "smpte2084", AOM_CICP_TC_SMPTE_2084 },
+ { "hlg", AOM_CICP_TC_HLG },
+ { "smpte428", AOM_CICP_TC_SMPTE_428 },
+ { NULL, 0 }
+};
+
+static const arg_def_t input_transfer_characteristics =
+ ARG_DEF_ENUM(NULL, "transfer-characteristics", 1,
+ "Transfer characteristics (CICP) of input content:",
+ transfer_characteristics_enum);
+
+static const struct arg_enum_list matrix_coefficients_enum[] = {
+ { "identity", AOM_CICP_MC_IDENTITY },
+ { "bt709", AOM_CICP_MC_BT_709 },
+ { "unspecified", AOM_CICP_MC_UNSPECIFIED },
+ { "fcc73", AOM_CICP_MC_FCC },
+ { "bt470bg", AOM_CICP_MC_BT_470_B_G },
+ { "bt601", AOM_CICP_MC_BT_601 },
+ { "smpte240", AOM_CICP_CP_SMPTE_240 },
+ { "ycgco", AOM_CICP_MC_SMPTE_YCGCO },
+ { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL },
+ { "bt2020cl", AOM_CICP_MC_BT_2020_CL },
+ { "smpte2085", AOM_CICP_MC_SMPTE_2085 },
+ { "chromncl", AOM_CICP_MC_CHROMAT_NCL },
+ { "chromcl", AOM_CICP_MC_CHROMAT_CL },
+ { "ictcp", AOM_CICP_MC_ICTCP },
+ { NULL, 0 }
+};
+
+static const arg_def_t input_matrix_coefficients = ARG_DEF_ENUM(
+ NULL, "matrix-coefficients", 1,
+ "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum);
+
+static const struct arg_enum_list chroma_sample_position_enum[] = {
+ { "unknown", AOM_CSP_UNKNOWN },
+ { "vertical", AOM_CSP_VERTICAL },
+ { "colocated", AOM_CSP_COLOCATED },
+ { NULL, 0 }
+};
+
+static const arg_def_t input_chroma_sample_position =
+ ARG_DEF_ENUM(NULL, "chroma-sample-position", 1,
+ "The chroma sample position when chroma 4:2:0 is signaled:",
+ chroma_sample_position_enum);
+
+static const struct arg_enum_list tune_content_enum[] = {
+ { "default", AOM_CONTENT_DEFAULT },
+ { "screen", AOM_CONTENT_SCREEN },
+ { NULL, 0 }
+};
+
+static const arg_def_t tune_content = ARG_DEF_ENUM(
+ NULL, "tune-content", 1, "Tune content type", tune_content_enum);
+
+static const arg_def_t cdf_update_mode =
+ ARG_DEF(NULL, "cdf-update-mode", 1,
+ "CDF update mode for entropy coding "
+ "(0: no CDF update; 1: update CDF on all frames(default); "
+ "2: selectively update CDF on some frames");
+
+static const struct arg_enum_list superblock_size_enum[] = {
+ { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC },
+ { "64", AOM_SUPERBLOCK_SIZE_64X64 },
+ { "128", AOM_SUPERBLOCK_SIZE_128X128 },
+ { NULL, 0 }
+};
+static const arg_def_t superblock_size = ARG_DEF_ENUM(
+ NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
+
+static const arg_def_t *av1_args[] = { &cpu_used_av1,
+ &auto_altref,
+ &sharpness,
+ &static_thresh,
+ &rowmtarg,
+ &tile_cols,
+ &tile_rows,
+ &arnr_maxframes,
+ &arnr_strength,
+ &tune_metric,
+ &cq_level,
+ &max_intra_rate_pct,
+ &max_inter_rate_pct,
+ &gf_cbr_boost_pct,
+ &lossless,
+ &enable_cdef,
+ &enable_restoration,
+ &disable_trellis_quant,
+ &enable_qm,
+ &qm_min,
+ &qm_max,
+#if CONFIG_DIST_8X8
+ &enable_dist_8x8,
+#endif
+ &frame_parallel_decoding,
+ &error_resilient_mode,
+ &aq_mode,
+ &deltaq_mode,
+ &frame_periodic_boost,
+ &noise_sens,
+ &tune_content,
+ &cdf_update_mode,
+ &input_color_primaries,
+ &input_transfer_characteristics,
+ &input_matrix_coefficients,
+ &input_chroma_sample_position,
+ &min_gf_interval,
+ &max_gf_interval,
+ &superblock_size,
+ &num_tg,
+ &mtu_size,
+ &timing_info,
+ &film_grain_test,
+ &film_grain_table,
+#if CONFIG_DENOISE
+ &denoise_noise_level,
+ &denoise_block_size,
+#endif
+ &enable_ref_frame_mvs,
+ &bitdeptharg,
+ &inbitdeptharg,
+ &input_chroma_subsampling_x,
+ &input_chroma_subsampling_y,
+ &sframe_dist,
+ &sframe_mode,
+ &save_as_annexb,
+ NULL };
+static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
+ AOME_SET_ENABLEAUTOALTREF,
+ AOME_SET_SHARPNESS,
+ AOME_SET_STATIC_THRESHOLD,
+ AV1E_SET_ROW_MT,
+ AV1E_SET_TILE_COLUMNS,
+ AV1E_SET_TILE_ROWS,
+ AOME_SET_ARNR_MAXFRAMES,
+ AOME_SET_ARNR_STRENGTH,
+ AOME_SET_TUNING,
+ AOME_SET_CQ_LEVEL,
+ AOME_SET_MAX_INTRA_BITRATE_PCT,
+ AV1E_SET_MAX_INTER_BITRATE_PCT,
+ AV1E_SET_GF_CBR_BOOST_PCT,
+ AV1E_SET_LOSSLESS,
+ AV1E_SET_ENABLE_CDEF,
+ AV1E_SET_ENABLE_RESTORATION,
+ AV1E_SET_DISABLE_TRELLIS_QUANT,
+ AV1E_SET_ENABLE_QM,
+ AV1E_SET_QM_MIN,
+ AV1E_SET_QM_MAX,
+#if CONFIG_DIST_8X8
+ AV1E_SET_ENABLE_DIST_8X8,
+#endif
+ AV1E_SET_FRAME_PARALLEL_DECODING,
+ AV1E_SET_ERROR_RESILIENT_MODE,
+ AV1E_SET_AQ_MODE,
+ AV1E_SET_DELTAQ_MODE,
+ AV1E_SET_FRAME_PERIODIC_BOOST,
+ AV1E_SET_NOISE_SENSITIVITY,
+ AV1E_SET_TUNE_CONTENT,
+ AV1E_SET_CDF_UPDATE_MODE,
+ AV1E_SET_COLOR_PRIMARIES,
+ AV1E_SET_TRANSFER_CHARACTERISTICS,
+ AV1E_SET_MATRIX_COEFFICIENTS,
+ AV1E_SET_CHROMA_SAMPLE_POSITION,
+ AV1E_SET_MIN_GF_INTERVAL,
+ AV1E_SET_MAX_GF_INTERVAL,
+ AV1E_SET_SUPERBLOCK_SIZE,
+ AV1E_SET_NUM_TG,
+ AV1E_SET_MTU,
+ AV1E_SET_TIMING_INFO_TYPE,
+ AV1E_SET_FILM_GRAIN_TEST_VECTOR,
+ AV1E_SET_FILM_GRAIN_TABLE,
+#if CONFIG_DENOISE
+ AV1E_SET_DENOISE_NOISE_LEVEL,
+ AV1E_SET_DENOISE_BLOCK_SIZE,
+#endif
+ AV1E_SET_ENABLE_REF_FRAME_MVS,
+ AV1E_SET_ENABLE_DF,
+ AV1E_SET_ENABLE_ORDER_HINT,
+ AV1E_SET_ENABLE_JNT_COMP,
+ AV1E_SET_ENABLE_SUPERRES,
+ 0 };
+#endif // CONFIG_AV1_ENCODER
+
+static const arg_def_t *no_args[] = { NULL };
+
+void show_help(FILE *fout, int shorthelp) {
+ fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n",
+ exec_name);
+
+ if (shorthelp) {
+ fprintf(fout, "Use --help to see the full list of options.\n");
+ return;
+ }
+
+ fprintf(fout, "\nOptions:\n");
+ arg_show_usage(fout, main_args);
+ fprintf(fout, "\nEncoder Global Options:\n");
+ arg_show_usage(fout, global_args);
+ fprintf(fout, "\nRate Control Options:\n");
+ arg_show_usage(fout, rc_args);
+ fprintf(fout, "\nTwopass Rate Control Options:\n");
+ arg_show_usage(fout, rc_twopass_args);
+ fprintf(fout, "\nKeyframe Placement Options:\n");
+ arg_show_usage(fout, kf_args);
+#if CONFIG_AV1_ENCODER
+ fprintf(fout, "\nAV1 Specific Options:\n");
+ arg_show_usage(fout, av1_args);
+#endif
+ fprintf(fout,
+ "\nStream timebase (--timebase):\n"
+ " The desired precision of timestamps in the output, expressed\n"
+ " in fractional seconds. Default is 1/1000.\n");
+ fprintf(fout, "\nIncluded encoders:\n\n");
+
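+ /* The encoder registered last is listed as the default; parse_global_config()
+ * selects that same entry when --codec is not given. */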
+ const int num_encoder = get_aom_encoder_count();
+ for (int i = 0; i < num_encoder; ++i) {
+ const AvxInterface *const encoder = get_aom_encoder_by_index(i);
+ const char *defstr = (i == (num_encoder - 1)) ? "(default)" : "";
+ fprintf(fout, " %-6s - %s %s\n", encoder->name,
+ aom_codec_iface_name(encoder->codec_interface()), defstr);
+ }
+ fprintf(fout, "\n ");
+ fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n");
+}
+
+void usage_exit(void) {
+ show_help(stderr, 1);
+ exit(EXIT_FAILURE);
+}
+
+#if CONFIG_AV1_ENCODER
+#define ARG_CTRL_CNT_MAX NELEMENTS(av1_arg_ctrl_map)
+#endif
+
+#if !CONFIG_WEBM_IO
+typedef int stereo_format_t;
+struct WebmOutputContext {
+ int debug;
+};
+#endif
+
+/* Per-stream configuration */
+struct stream_config {
+ struct aom_codec_enc_cfg cfg;
+ const char *out_fn;
+ const char *stats_fn;
+#if CONFIG_FP_MB_STATS
+ const char *fpmb_stats_fn;
+#endif
+ stereo_format_t stereo_fmt;
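+ // Codec control (id, value) pairs collected from the command line; they are
+ // applied with aom_codec_control_() in initialize_encoder().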
+ int arg_ctrls[ARG_CTRL_CNT_MAX][2];
+ int arg_ctrl_cnt;
+ int write_webm;
+ const char *film_grain_filename;
+ int write_ivf;
+ // whether to use 16bit internal buffers
+ int use_16bit_internal;
+};
+
+struct stream_state {
+ int index;
+ struct stream_state *next;
+ struct stream_config config;
+ FILE *file;
+ struct rate_hist *rate_hist;
+ struct WebmOutputContext webm_ctx;
+ uint64_t psnr_sse_total;
+ uint64_t psnr_samples_total;
+ double psnr_totals[4];
+ int psnr_count;
+ int counts[64];
+ aom_codec_ctx_t encoder;
+ unsigned int frames_out;
+ uint64_t cx_time;
+ size_t nbytes;
+ stats_io_t stats;
+#if CONFIG_FP_MB_STATS
+ stats_io_t fpmb_stats;
+#endif
+ struct aom_image *img;
+ aom_codec_ctx_t decoder;
+ int mismatch_seen;
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
+};
+
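+/* Fold a negative denominator into the numerator (e.g. 1/-2 becomes -1/2),
+ * then reject negative rationals and zero denominators. */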
+static void validate_positive_rational(const char *msg,
+ struct aom_rational *rat) {
+ if (rat->den < 0) {
+ rat->num *= -1;
+ rat->den *= -1;
+ }
+
+ if (rat->num < 0) die("Error: %s must be positive\n", msg);
+
+ if (!rat->den) die("Error: %s has zero denominator\n", msg);
+}
+
+static void parse_global_config(struct AvxEncoderConfig *global, int *argc,
+ char ***argv) {
+ char **argi, **argj;
+ struct arg arg;
+ const int num_encoder = get_aom_encoder_count();
+ char **argv_local = (char **)*argv;
+#if CONFIG_FILEOPTIONS
+ int argc_local = *argc;
+#endif
+ if (num_encoder < 1) die("Error: no valid encoder available\n");
+
+ /* Initialize default parameters */
+ memset(global, 0, sizeof(*global));
+ global->codec = get_aom_encoder_by_index(num_encoder - 1);
+ global->passes = 0;
+ global->color_type = I420;
+
+#if CONFIG_FILEOPTIONS
+ const char *cfg = NULL;
+ int cfg_included = 0;
+#endif
+ for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) {
+ arg.argv_step = 1;
+
+#if CONFIG_FILEOPTIONS
+ if (arg_match(&arg, &use_cfg, argi)) {
+ if (cfg_included) continue;
+ cfg = arg.val;
+
+ arg_cfg(&argc_local, &argv_local, cfg);
+
+ *argj = *argi = *argv_local;
+ argj = argi = argv_local;
+ *argv = argv_local;
+ cfg_included = 1;
+ continue;
+ }
+#endif
+ if (arg_match(&arg, &help, argi)) {
+ show_help(stdout, 0);
+ exit(EXIT_SUCCESS);
+ } else if (arg_match(&arg, &codecarg, argi)) {
+ global->codec = get_aom_encoder_by_name(arg.val);
+ if (!global->codec)
+ die("Error: Unrecognized argument (%s) to --codec\n", arg.val);
+ } else if (arg_match(&arg, &passes, argi)) {
+ global->passes = arg_parse_uint(&arg);
+
+ if (global->passes < 1 || global->passes > 2)
+ die("Error: Invalid number of passes (%d)\n", global->passes);
+ } else if (arg_match(&arg, &pass_arg, argi)) {
+ global->pass = arg_parse_uint(&arg);
+
+ if (global->pass < 1 || global->pass > 2)
+ die("Error: Invalid pass selected (%d)\n", global->pass);
+ } else if (arg_match(&arg, &usage, argi))
+ global->usage = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &good_dl, argi))
+ warn("Deprecated --good option! Ignoring\n");
+ else if (arg_match(&arg, &use_yv12, argi))
+ global->color_type = YV12;
+ else if (arg_match(&arg, &use_i420, argi))
+ global->color_type = I420;
+ else if (arg_match(&arg, &use_i422, argi))
+ global->color_type = I422;
+ else if (arg_match(&arg, &use_i444, argi))
+ global->color_type = I444;
+ else if (arg_match(&arg, &quietarg, argi))
+ global->quiet = 1;
+ else if (arg_match(&arg, &verbosearg, argi))
+ global->verbose = 1;
+ else if (arg_match(&arg, &limit, argi))
+ global->limit = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &skip, argi))
+ global->skip_frames = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &psnrarg, argi))
+ global->show_psnr = 1;
+ else if (arg_match(&arg, &recontest, argi))
+ global->test_decode = arg_parse_enum_or_int(&arg);
+ else if (arg_match(&arg, &framerate, argi)) {
+ global->framerate = arg_parse_rational(&arg);
+ validate_positive_rational(arg.name, &global->framerate);
+ global->have_framerate = 1;
+ } else if (arg_match(&arg, &debugmode, argi))
+ global->debug = 1;
+ else if (arg_match(&arg, &q_hist_n, argi))
+ global->show_q_hist_buckets = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &rate_hist_n, argi))
+ global->show_rate_hist_buckets = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &disable_warnings, argi))
+ global->disable_warnings = 1;
+ else if (arg_match(&arg, &disable_warning_prompt, argi))
+ global->disable_warning_prompt = 1;
+ else
+ argj++;
+ }
+
+ if (global->pass) {
+ /* DWIM: Assume the user meant passes=2 if pass=2 is specified */
+ if (global->pass > global->passes) {
+ warn("Assuming --pass=%d implies --passes=%d\n", global->pass,
+ global->pass);
+ global->passes = global->pass;
+ }
+ }
+ /* Validate global config */
+ if (global->passes == 0) {
+#if CONFIG_AV1_ENCODER
+ // Make default AV1 passes = 2 until there is a better quality 1-pass
+ // encoder
+ if (global->codec != NULL && global->codec->name != NULL)
+ global->passes = (strcmp(global->codec->name, "av1") == 0) ? 2 : 1;
+#else
+ global->passes = 1;
+#endif
+ }
+}
+
+static void open_input_file(struct AvxInputContext *input) {
+ /* Parse certain options from the input file, if possible */
+ input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb")
+ : set_binary_mode(stdin);
+
+ if (!input->file) fatal("Failed to open input file");
+
+ if (!fseeko(input->file, 0, SEEK_END)) {
+ /* Input file is seekable. Figure out how long it is, so we can get
+ * progress info.
+ */
+ input->length = ftello(input->file);
+ rewind(input->file);
+ }
+
+ /* Default to 1:1 pixel aspect ratio. */
+ input->pixel_aspect_ratio.numerator = 1;
+ input->pixel_aspect_ratio.denominator = 1;
+
+ /* For RAW input sources, these bytes will be applied to the first frame
+ * in read_frame().
+ */
+ input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file);
+ input->detect.position = 0;
+
+ if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) {
+ if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+ input->only_i420) >= 0) {
+ input->file_type = FILE_TYPE_Y4M;
+ input->width = input->y4m.pic_w;
+ input->height = input->y4m.pic_h;
+ input->pixel_aspect_ratio.numerator = input->y4m.par_n;
+ input->pixel_aspect_ratio.denominator = input->y4m.par_d;
+ input->framerate.numerator = input->y4m.fps_n;
+ input->framerate.denominator = input->y4m.fps_d;
+ input->fmt = input->y4m.aom_fmt;
+ input->bit_depth = input->y4m.bit_depth;
+ } else
+ fatal("Unsupported Y4M stream.");
+ } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) {
+ fatal("IVF is not supported as input.");
+ } else {
+ input->file_type = FILE_TYPE_RAW;
+ }
+}
+
+static void close_input_file(struct AvxInputContext *input) {
+ fclose(input->file);
+ if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m);
+}
+
+static struct stream_state *new_stream(struct AvxEncoderConfig *global,
+ struct stream_state *prev) {
+ struct stream_state *stream;
+
+ stream = calloc(1, sizeof(*stream));
+ if (stream == NULL) {
+ fatal("Failed to allocate new stream.");
+ }
+
+ if (prev) {
+ memcpy(stream, prev, sizeof(*stream));
+ stream->index++;
+ prev->next = stream;
+ } else {
+ aom_codec_err_t res;
+
+ /* Populate encoder configuration */
+ res = aom_codec_enc_config_default(global->codec->codec_interface(),
+ &stream->config.cfg, global->usage);
+ if (res) fatal("Failed to get config: %s\n", aom_codec_err_to_string(res));
+
+ /* Change the default timebase to a high enough value so that the
+ * encoder will always create strictly increasing timestamps.
+ */
+ stream->config.cfg.g_timebase.den = 1000;
+
+ /* Never use the library's default resolution, require it be parsed
+ * from the file or set on the command line.
+ */
+ stream->config.cfg.g_w = 0;
+ stream->config.cfg.g_h = 0;
+
+ /* Initialize remaining stream parameters */
+ stream->config.write_webm = 1;
+ stream->config.write_ivf = 0;
+
+#if CONFIG_WEBM_IO
+ stream->config.stereo_fmt = STEREO_FORMAT_MONO;
+ stream->webm_ctx.last_pts_ns = -1;
+ stream->webm_ctx.writer = NULL;
+ stream->webm_ctx.segment = NULL;
+#endif
+
+ /* Allows removal of the application version from the EBML tags */
+ stream->webm_ctx.debug = global->debug;
+ }
+
+ /* Output files must be specified for each stream */
+ stream->config.out_fn = NULL;
+
+ stream->next = NULL;
+ return stream;
+}
+
+static void set_config_arg_ctrls(struct stream_config *config, int key,
+ const struct arg *arg) {
+ int j;
+ if (key == AV1E_SET_FILM_GRAIN_TABLE) {
+ config->film_grain_filename = arg->val;
+ return;
+ }
+
+ /* Point either to the next free element or the first instance of this
+ * control.
+ */
+ for (j = 0; j < config->arg_ctrl_cnt; j++)
+ if (config->arg_ctrls[j][0] == key) break;
+
+ /* Update/insert */
+ assert(j < (int)ARG_CTRL_CNT_MAX);
+ config->arg_ctrls[j][0] = key;
+ config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
+
+ if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) {
+ warn("auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
+ config->arg_ctrls[j][1] = 1;
+ }
+ if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++;
+}
+
+static int parse_stream_params(struct AvxEncoderConfig *global,
+ struct stream_state *stream, char **argv) {
+ char **argi, **argj;
+ struct arg arg;
+ static const arg_def_t **ctrl_args = no_args;
+ static const int *ctrl_args_map = NULL;
+ struct stream_config *config = &stream->config;
+ int eos_mark_found = 0;
+ int webm_forced = 0;
+
+ // Handle codec specific options
+ if (0) {
+#if CONFIG_AV1_ENCODER
+ } else if (strcmp(global->codec->name, "av1") == 0) {
+ // TODO(jingning): Reuse AV1-specific encoder configuration parameters.
+ // Consider expanding this set for AV1 encoder control.
+ ctrl_args = av1_args;
+ ctrl_args_map = av1_arg_ctrl_map;
+#endif
+ }
+
+ for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+ arg.argv_step = 1;
+
+ /* Once we've found an end-of-stream marker (--) we want to continue
+ * shifting arguments but not consuming them.
+ */
+ if (eos_mark_found) {
+ argj++;
+ continue;
+ } else if (!strcmp(*argj, "--")) {
+ eos_mark_found = 1;
+ continue;
+ }
+
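+ /* Unless --webm, --ivf, or --obu is given explicitly, the container is
+ * inferred from the output file name: ".ivf" selects IVF, ".obu" selects a
+ * raw OBU stream, and anything else defaults to WebM. */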
+ if (arg_match(&arg, &outputfile, argi)) {
+ config->out_fn = arg.val;
+ if (!webm_forced) {
+ const size_t out_fn_len = strlen(config->out_fn);
+ if (out_fn_len >= 4 &&
+ !strcmp(config->out_fn + out_fn_len - 4, ".ivf")) {
+ config->write_webm = 0;
+ config->write_ivf = 1;
+ } else if (out_fn_len >= 4 &&
+ !strcmp(config->out_fn + out_fn_len - 4, ".obu")) {
+ config->write_webm = 0;
+ config->write_ivf = 0;
+ }
+ }
+ } else if (arg_match(&arg, &fpf_name, argi)) {
+ config->stats_fn = arg.val;
+#if CONFIG_FP_MB_STATS
+ } else if (arg_match(&arg, &fpmbf_name, argi)) {
+ config->fpmb_stats_fn = arg.val;
+#endif
+ } else if (arg_match(&arg, &use_webm, argi)) {
+#if CONFIG_WEBM_IO
+ config->write_webm = 1;
+ webm_forced = 1;
+#else
+ die("Error: --webm specified but webm is disabled.");
+#endif
+ } else if (arg_match(&arg, &use_ivf, argi)) {
+ config->write_webm = 0;
+ config->write_ivf = 1;
+ } else if (arg_match(&arg, &use_obu, argi)) {
+ config->write_webm = 0;
+ config->write_ivf = 0;
+ } else if (arg_match(&arg, &threads, argi)) {
+ config->cfg.g_threads = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &profile, argi)) {
+ config->cfg.g_profile = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &width, argi)) {
+ config->cfg.g_w = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &height, argi)) {
+ config->cfg.g_h = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &forced_max_frame_width, argi)) {
+ config->cfg.g_forced_max_frame_width = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &forced_max_frame_height, argi)) {
+ config->cfg.g_forced_max_frame_height = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &bitdeptharg, argi)) {
+ config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg);
+ } else if (arg_match(&arg, &inbitdeptharg, argi)) {
+ config->cfg.g_input_bit_depth = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &input_chroma_subsampling_x, argi)) {
+ stream->chroma_subsampling_x = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &input_chroma_subsampling_y, argi)) {
+ stream->chroma_subsampling_y = arg_parse_uint(&arg);
+#if CONFIG_WEBM_IO
+ } else if (arg_match(&arg, &stereo_mode, argi)) {
+ config->stereo_fmt = arg_parse_enum_or_int(&arg);
+#endif
+ } else if (arg_match(&arg, &timebase, argi)) {
+ config->cfg.g_timebase = arg_parse_rational(&arg);
+ validate_positive_rational(arg.name, &config->cfg.g_timebase);
+ } else if (arg_match(&arg, &global_error_resilient, argi)) {
+ config->cfg.g_error_resilient = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &lag_in_frames, argi)) {
+ config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &large_scale_tile, argi)) {
+ config->cfg.large_scale_tile = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &monochrome, argi)) {
+ config->cfg.monochrome = 1;
+ } else if (arg_match(&arg, &full_still_picture_hdr, argi)) {
+ config->cfg.full_still_picture_hdr = 1;
+ } else if (arg_match(&arg, &dropframe_thresh, argi)) {
+ config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &resize_mode, argi)) {
+ config->cfg.rc_resize_mode = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &resize_denominator, argi)) {
+ config->cfg.rc_resize_denominator = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &resize_kf_denominator, argi)) {
+ config->cfg.rc_resize_kf_denominator = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &superres_mode, argi)) {
+ config->cfg.rc_superres_mode = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &superres_denominator, argi)) {
+ config->cfg.rc_superres_denominator = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &superres_kf_denominator, argi)) {
+ config->cfg.rc_superres_kf_denominator = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &superres_qthresh, argi)) {
+ config->cfg.rc_superres_qthresh = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &superres_kf_qthresh, argi)) {
+ config->cfg.rc_superres_kf_qthresh = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &end_usage, argi)) {
+ config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
+ } else if (arg_match(&arg, &target_bitrate, argi)) {
+ config->cfg.rc_target_bitrate = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &min_quantizer, argi)) {
+ config->cfg.rc_min_quantizer = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &max_quantizer, argi)) {
+ config->cfg.rc_max_quantizer = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &undershoot_pct, argi)) {
+ config->cfg.rc_undershoot_pct = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &overshoot_pct, argi)) {
+ config->cfg.rc_overshoot_pct = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &buf_sz, argi)) {
+ config->cfg.rc_buf_sz = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &buf_initial_sz, argi)) {
+ config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &buf_optimal_sz, argi)) {
+ config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &bias_pct, argi)) {
+ config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
+ if (global->passes < 2)
+ warn("option %s ignored in one-pass mode.\n", arg.name);
+ } else if (arg_match(&arg, &minsection_pct, argi)) {
+ config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg);
+
+ if (global->passes < 2)
+ warn("option %s ignored in one-pass mode.\n", arg.name);
+ } else if (arg_match(&arg, &maxsection_pct, argi)) {
+ config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
+
+ if (global->passes < 2)
+ warn("option %s ignored in one-pass mode.\n", arg.name);
+ } else if (arg_match(&arg, &fwd_kf_enabled, argi)) {
+ config->cfg.fwd_kf_enabled = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &kf_min_dist, argi)) {
+ config->cfg.kf_min_dist = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &kf_max_dist, argi)) {
+ config->cfg.kf_max_dist = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &kf_disabled, argi)) {
+ config->cfg.kf_mode = AOM_KF_DISABLED;
+ } else if (arg_match(&arg, &sframe_dist, argi)) {
+ config->cfg.sframe_dist = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &sframe_mode, argi)) {
+ config->cfg.sframe_mode = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &save_as_annexb, argi)) {
+ config->cfg.save_as_annexb = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &tile_width, argi)) {
+ config->cfg.tile_width_count =
+ arg_parse_list(&arg, config->cfg.tile_widths, MAX_TILE_WIDTHS);
+ } else if (arg_match(&arg, &tile_height, argi)) {
+ config->cfg.tile_height_count =
+ arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS);
+#if CONFIG_FILEOPTIONS
+ } else if (arg_match(&arg, &ext_partition, argi)) {
+ config->cfg.cfg.ext_partition = arg_parse_uint(&arg) > 0;
+#endif
+ } else {
+ int i, match = 0;
+ for (i = 0; ctrl_args[i]; i++) {
+ if (arg_match(&arg, ctrl_args[i], argi)) {
+ match = 1;
+ if (ctrl_args_map) {
+ set_config_arg_ctrls(config, ctrl_args_map[i], &arg);
+ }
+ }
+ }
+ if (!match) argj++;
+ }
+ }
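+ /* Use the 16-bit internal coding path when the codec bit depth exceeds 8,
+ * or when the 8-bit fast path (CONFIG_LOWBITDEPTH) is compiled out. */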
+ config->use_16bit_internal =
+ config->cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH;
+ return eos_mark_found;
+}
+
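+/* Walk the singly linked list of streams; 'iterator' is declared by the
+ * macro and scoped to the loop body. */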
+#define FOREACH_STREAM(iterator, list) \
+ for (struct stream_state *iterator = list; iterator; \
+ iterator = iterator->next)
+
+static void validate_stream_config(const struct stream_state *stream,
+ const struct AvxEncoderConfig *global) {
+ const struct stream_state *streami;
+ (void)global;
+
+ if (!stream->config.cfg.g_w || !stream->config.cfg.g_h)
+ fatal(
+ "Stream %d: Specify stream dimensions with --width (-w) "
+ " and --height (-h)",
+ stream->index);
+
+ // Check that the codec bit depth is at least the input bit depth.
+ if (stream->config.cfg.g_input_bit_depth >
+ (unsigned int)stream->config.cfg.g_bit_depth) {
+ fatal("Stream %d: codec bit depth (%d) less than input bit depth (%d)",
+ stream->index, (int)stream->config.cfg.g_bit_depth,
+ stream->config.cfg.g_input_bit_depth);
+ }
+
+ for (streami = stream; streami; streami = streami->next) {
+ /* All streams require output files */
+ if (!streami->config.out_fn)
+ fatal("Stream %d: Output file is required (specify with -o)",
+ streami->index);
+
+ /* Check for two streams outputting to the same file */
+ if (streami != stream) {
+ const char *a = stream->config.out_fn;
+ const char *b = streami->config.out_fn;
+ if (!strcmp(a, b) && strcmp(a, "/dev/null") && strcmp(a, ":nul"))
+ fatal("Stream %d: duplicate output file (from stream %d)",
+ streami->index, stream->index);
+ }
+
+ /* Check for two streams sharing a stats file. */
+ if (streami != stream) {
+ const char *a = stream->config.stats_fn;
+ const char *b = streami->config.stats_fn;
+ if (a && b && !strcmp(a, b))
+ fatal("Stream %d: duplicate stats file (from stream %d)",
+ streami->index, stream->index);
+ }
+
+#if CONFIG_FP_MB_STATS
+ /* Check for two streams sharing a mb stats file. */
+ if (streami != stream) {
+ const char *a = stream->config.fpmb_stats_fn;
+ const char *b = streami->config.fpmb_stats_fn;
+ if (a && b && !strcmp(a, b))
+ fatal("Stream %d: duplicate mb stats file (from stream %d)",
+ streami->index, stream->index);
+ }
+#endif
+ }
+}
+
+static void set_stream_dimensions(struct stream_state *stream, unsigned int w,
+ unsigned int h) {
+ if (!stream->config.cfg.g_w) {
+ if (!stream->config.cfg.g_h)
+ stream->config.cfg.g_w = w;
+ else
+ stream->config.cfg.g_w = w * stream->config.cfg.g_h / h;
+ }
+ if (!stream->config.cfg.g_h) {
+ stream->config.cfg.g_h = h * stream->config.cfg.g_w / w;
+ }
+}
+
+static const char *file_type_to_string(enum VideoFileType t) {
+ switch (t) {
+ case FILE_TYPE_RAW: return "RAW";
+ case FILE_TYPE_Y4M: return "Y4M";
+ default: return "Other";
+ }
+}
+
+static const char *image_format_to_string(aom_img_fmt_t f) {
+ switch (f) {
+ case AOM_IMG_FMT_I420: return "I420";
+ case AOM_IMG_FMT_I422: return "I422";
+ case AOM_IMG_FMT_I444: return "I444";
+ case AOM_IMG_FMT_YV12: return "YV12";
+ case AOM_IMG_FMT_I42016: return "I42016";
+ case AOM_IMG_FMT_I42216: return "I42216";
+ case AOM_IMG_FMT_I44416: return "I44416";
+ default: return "Other";
+ }
+}
+
+static void show_stream_config(struct stream_state *stream,
+ struct AvxEncoderConfig *global,
+ struct AvxInputContext *input) {
+#define SHOW(field) \
+ fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field)
+
+ if (stream->index == 0) {
+ fprintf(stderr, "Codec: %s\n",
+ aom_codec_iface_name(global->codec->codec_interface()));
+ fprintf(stderr, "Source file: %s File Type: %s Format: %s\n",
+ input->filename, file_type_to_string(input->file_type),
+ image_format_to_string(input->fmt));
+ }
+ if (stream->next || stream->index)
+ fprintf(stderr, "\nStream Index: %d\n", stream->index);
+ fprintf(stderr, "Destination file: %s\n", stream->config.out_fn);
+ fprintf(stderr, "Coding path: %s\n",
+ stream->config.use_16bit_internal ? "HBD" : "LBD");
+ fprintf(stderr, "Encoder parameters:\n");
+
+ SHOW(g_usage);
+ SHOW(g_threads);
+ SHOW(g_profile);
+ SHOW(g_w);
+ SHOW(g_h);
+ SHOW(g_bit_depth);
+ SHOW(g_input_bit_depth);
+ SHOW(g_timebase.num);
+ SHOW(g_timebase.den);
+ SHOW(g_error_resilient);
+ SHOW(g_pass);
+ SHOW(g_lag_in_frames);
+ SHOW(large_scale_tile);
+ SHOW(rc_dropframe_thresh);
+ SHOW(rc_resize_mode);
+ SHOW(rc_resize_denominator);
+ SHOW(rc_resize_kf_denominator);
+ SHOW(rc_superres_mode);
+ SHOW(rc_superres_denominator);
+ SHOW(rc_superres_kf_denominator);
+ SHOW(rc_superres_qthresh);
+ SHOW(rc_superres_kf_qthresh);
+ SHOW(rc_end_usage);
+ SHOW(rc_target_bitrate);
+ SHOW(rc_min_quantizer);
+ SHOW(rc_max_quantizer);
+ SHOW(rc_undershoot_pct);
+ SHOW(rc_overshoot_pct);
+ SHOW(rc_buf_sz);
+ SHOW(rc_buf_initial_sz);
+ SHOW(rc_buf_optimal_sz);
+ SHOW(rc_2pass_vbr_bias_pct);
+ SHOW(rc_2pass_vbr_minsection_pct);
+ SHOW(rc_2pass_vbr_maxsection_pct);
+ SHOW(fwd_kf_enabled);
+ SHOW(kf_mode);
+ SHOW(kf_min_dist);
+ SHOW(kf_max_dist);
+}
+
+static void open_output_file(struct stream_state *stream,
+ struct AvxEncoderConfig *global,
+ const struct AvxRational *pixel_aspect_ratio) {
+ const char *fn = stream->config.out_fn;
+ const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+ if (cfg->g_pass == AOM_RC_FIRST_PASS) return;
+
+ stream->file = strcmp(fn, "-") ? fopen(fn, "wb") : set_binary_mode(stdout);
+
+ if (!stream->file) fatal("Failed to open output file");
+
+ if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR))
+ fatal("WebM output to pipes not supported.");
+
+#if CONFIG_WEBM_IO
+ if (stream->config.write_webm) {
+ stream->webm_ctx.stream = stream->file;
+ write_webm_file_header(&stream->webm_ctx, cfg, stream->config.stereo_fmt,
+ global->codec->fourcc, pixel_aspect_ratio);
+ }
+#else
+ (void)pixel_aspect_ratio;
+#endif
+
+ if (!stream->config.write_webm && stream->config.write_ivf) {
+ ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0);
+ }
+}
+
+static void close_output_file(struct stream_state *stream,
+ unsigned int fourcc) {
+ const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg;
+
+ if (cfg->g_pass == AOM_RC_FIRST_PASS) return;
+
+#if CONFIG_WEBM_IO
+ if (stream->config.write_webm) {
+ write_webm_file_footer(&stream->webm_ctx);
+ }
+#endif
+
+ if (!stream->config.write_webm && stream->config.write_ivf) {
+ if (!fseek(stream->file, 0, SEEK_SET))
+ ivf_write_file_header(stream->file, &stream->config.cfg, fourcc,
+ stream->frames_out);
+ }
+
+ fclose(stream->file);
+}
+
+static void setup_pass(struct stream_state *stream,
+ struct AvxEncoderConfig *global, int pass) {
+ if (stream->config.stats_fn) {
+ if (!stats_open_file(&stream->stats, stream->config.stats_fn, pass))
+ fatal("Failed to open statistics store");
+ } else {
+ if (!stats_open_mem(&stream->stats, pass))
+ fatal("Failed to open statistics store");
+ }
+
+#if CONFIG_FP_MB_STATS
+ if (stream->config.fpmb_stats_fn) {
+ if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn,
+ pass))
+ fatal("Failed to open mb statistics store");
+ } else {
+ if (!stats_open_mem(&stream->fpmb_stats, pass))
+ fatal("Failed to open mb statistics store");
+ }
+#endif
+
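+ /* Map (passes, pass) onto the library's pass setting: single-pass encodes
+ * use AOM_RC_ONE_PASS, while two-pass runs use AOM_RC_FIRST_PASS for pass 0
+ * and AOM_RC_LAST_PASS for pass 1. */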
+ stream->config.cfg.g_pass = global->passes == 2
+ ? pass ? AOM_RC_LAST_PASS : AOM_RC_FIRST_PASS
+ : AOM_RC_ONE_PASS;
+ if (pass) {
+ stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
+#if CONFIG_FP_MB_STATS
+ stream->config.cfg.rc_firstpass_mb_stats_in =
+ stats_get(&stream->fpmb_stats);
+#endif
+ }
+
+ stream->cx_time = 0;
+ stream->nbytes = 0;
+ stream->frames_out = 0;
+}
+
+static void initialize_encoder(struct stream_state *stream,
+ struct AvxEncoderConfig *global) {
+ int i;
+ int flags = 0;
+
+ flags |= global->show_psnr ? AOM_CODEC_USE_PSNR : 0;
+ flags |= stream->config.use_16bit_internal ? AOM_CODEC_USE_HIGHBITDEPTH : 0;
+
+ /* Construct Encoder Context */
+ aom_codec_enc_init(&stream->encoder, global->codec->codec_interface(),
+ &stream->config.cfg, flags);
+ ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder");
+
+ /* Note that we bypass the aom_codec_control wrapper macro because the
+ * control IDs are stored in an array here. Real applications will want
+ * to use the enumerations directly.
+ */
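+ /* For reference, a direct call for a single control would look like
+ * (illustrative values only):
+ * aom_codec_control(&stream->encoder, AOME_SET_CPUUSED, 4);
+ */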
+ for (i = 0; i < stream->config.arg_ctrl_cnt; i++) {
+ int ctrl = stream->config.arg_ctrls[i][0];
+ int value = stream->config.arg_ctrls[i][1];
+ if (aom_codec_control_(&stream->encoder, ctrl, value))
+ fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value);
+
+ ctx_exit_on_error(&stream->encoder, "Failed to control codec");
+ }
+ if (stream->config.film_grain_filename) {
+ aom_codec_control_(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE,
+ stream->config.film_grain_filename);
+ }
+
+#if CONFIG_AV1_DECODER
+ if (global->test_decode != TEST_DECODE_OFF) {
+ const AvxInterface *decoder = get_aom_decoder_by_name(global->codec->name);
+ aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
+ aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0);
+
+ if (strcmp(global->codec->name, "av1") == 0) {
+ aom_codec_control(&stream->decoder, AV1_SET_TILE_MODE,
+ stream->config.cfg.large_scale_tile);
+ ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode");
+
+ aom_codec_control(&stream->decoder, AV1D_SET_IS_ANNEXB,
+ stream->config.cfg.save_as_annexb);
+ ctx_exit_on_error(&stream->decoder, "Failed to set is_annexb");
+
+ aom_codec_control(&stream->decoder, AV1_SET_DECODE_TILE_ROW, -1);
+ ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row");
+
+ aom_codec_control(&stream->decoder, AV1_SET_DECODE_TILE_COL, -1);
+ ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col");
+ }
+ }
+#endif
+}
+
+static void encode_frame(struct stream_state *stream,
+ struct AvxEncoderConfig *global, struct aom_image *img,
+ unsigned int frames_in) {
+ aom_codec_pts_t frame_start, next_frame_start;
+ struct aom_codec_enc_cfg *cfg = &stream->config.cfg;
+ struct aom_usec_timer timer;
+
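+ /* Timestamps are derived from the zero-based frame index n (frames_in - 1):
+ * frame n starts at n / fps seconds, expressed in units of the output
+ * timebase, i.e.
+ * pts = n * framerate.den * timebase.den / (framerate.num * timebase.num). */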
+ frame_start =
+ (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) /
+ cfg->g_timebase.num / global->framerate.num;
+ next_frame_start =
+ (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) /
+ cfg->g_timebase.num / global->framerate.num;
+
+ /* Scale if necessary */
+ if (img) {
+ if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) &&
+ (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+ if (img->fmt != AOM_IMG_FMT_I42016) {
+ fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name);
+ exit(EXIT_FAILURE);
+ }
+#if CONFIG_LIBYUV
+ if (!stream->img) {
+ stream->img =
+ aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16);
+ }
+ I420Scale_16(
+ (uint16_t *)img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y] / 2,
+ (uint16_t *)img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U] / 2,
+ (uint16_t *)img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V] / 2,
+ img->d_w, img->d_h, (uint16_t *)stream->img->planes[AOM_PLANE_Y],
+ stream->img->stride[AOM_PLANE_Y] / 2,
+ (uint16_t *)stream->img->planes[AOM_PLANE_U],
+ stream->img->stride[AOM_PLANE_U] / 2,
+ (uint16_t *)stream->img->planes[AOM_PLANE_V],
+ stream->img->stride[AOM_PLANE_V] / 2, stream->img->d_w,
+ stream->img->d_h, kFilterBox);
+ img = stream->img;
+#else
+ stream->encoder.err = 1;
+ ctx_exit_on_error(&stream->encoder,
+ "Stream %d: Failed to encode frame.\n"
+ "libyuv is required for scaling but is currently "
+ "disabled.\n"
+ "Be sure to specify -DCONFIG_LIBYUV=1 when running "
+ "cmake.\n",
+ stream->index);
+#endif
+ }
+ }
+ if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+ if (img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_YV12) {
+ fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
+ exit(EXIT_FAILURE);
+ }
+#if CONFIG_LIBYUV
+ if (!stream->img)
+ stream->img =
+ aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16);
+ I420Scale(
+ img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y],
+ img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U],
+ img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V], img->d_w, img->d_h,
+ stream->img->planes[AOM_PLANE_Y], stream->img->stride[AOM_PLANE_Y],
+ stream->img->planes[AOM_PLANE_U], stream->img->stride[AOM_PLANE_U],
+ stream->img->planes[AOM_PLANE_V], stream->img->stride[AOM_PLANE_V],
+ stream->img->d_w, stream->img->d_h, kFilterBox);
+ img = stream->img;
+#else
+ stream->encoder.err = 1;
+ ctx_exit_on_error(&stream->encoder,
+ "Stream %d: Failed to encode frame.\n"
+ "Scaling disabled in this configuration. \n"
+ "To enable, configure with --enable-libyuv\n",
+ stream->index);
+#endif
+ }
+
+ aom_usec_timer_start(&timer);
+ aom_codec_encode(&stream->encoder, img, frame_start,
+ (uint32_t)(next_frame_start - frame_start), 0);
+ aom_usec_timer_mark(&timer);
+ stream->cx_time += aom_usec_timer_elapsed(&timer);
+ ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame",
+ stream->index);
+}
+
+static void update_quantizer_histogram(struct stream_state *stream) {
+ if (stream->config.cfg.g_pass != AOM_RC_FIRST_PASS) {
+ int q;
+
+ aom_codec_control(&stream->encoder, AOME_GET_LAST_QUANTIZER_64, &q);
+ ctx_exit_on_error(&stream->encoder, "Failed to read quantizer");
+ stream->counts[q]++;
+ }
+}
+
+static void get_cx_data(struct stream_state *stream,
+ struct AvxEncoderConfig *global, int *got_data) {
+ const aom_codec_cx_pkt_t *pkt;
+ const struct aom_codec_enc_cfg *cfg = &stream->config.cfg;
+ aom_codec_iter_t iter = NULL;
+
+ *got_data = 0;
+ while ((pkt = aom_codec_get_cx_data(&stream->encoder, &iter))) {
+ static size_t fsize = 0;
+ static FileOffset ivf_header_pos = 0;
+
+ switch (pkt->kind) {
+ case AOM_CODEC_CX_FRAME_PKT:
+ if (!(pkt->data.frame.flags & AOM_FRAME_IS_FRAGMENT)) {
+ stream->frames_out++;
+ }
+ if (!global->quiet)
+ fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz);
+
+ update_rate_histogram(stream->rate_hist, cfg, pkt);
+#if CONFIG_WEBM_IO
+ if (stream->config.write_webm) {
+ write_webm_block(&stream->webm_ctx, cfg, pkt);
+ }
+#endif
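+ /* Raw output path: when writing IVF, the frame header is emitted with the
+ * first fragment and its size field is patched in place once the last
+ * fragment of the packet has been received. */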
+ if (!stream->config.write_webm) {
+ if (stream->config.write_ivf) {
+ if (pkt->data.frame.partition_id <= 0) {
+ ivf_header_pos = ftello(stream->file);
+ fsize = pkt->data.frame.sz;
+
+ ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize);
+ } else {
+ fsize += pkt->data.frame.sz;
+
+ if (!(pkt->data.frame.flags & AOM_FRAME_IS_FRAGMENT)) {
+ const FileOffset currpos = ftello(stream->file);
+ fseeko(stream->file, ivf_header_pos, SEEK_SET);
+ ivf_write_frame_size(stream->file, fsize);
+ fseeko(stream->file, currpos, SEEK_SET);
+ }
+ }
+ }
+
+ (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+ stream->file);
+ }
+ stream->nbytes += pkt->data.raw.sz;
+
+ *got_data = 1;
+#if CONFIG_AV1_DECODER
+ if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) {
+ aom_codec_decode(&stream->decoder, pkt->data.frame.buf,
+ pkt->data.frame.sz, NULL);
+ if (stream->decoder.err) {
+ warn_or_exit_on_error(&stream->decoder,
+ global->test_decode == TEST_DECODE_FATAL,
+ "Failed to decode frame %d in stream %d",
+ stream->frames_out + 1, stream->index);
+ stream->mismatch_seen = stream->frames_out + 1;
+ }
+ }
+#endif
+ break;
+ case AOM_CODEC_STATS_PKT:
+ stream->frames_out++;
+ stats_write(&stream->stats, pkt->data.twopass_stats.buf,
+ pkt->data.twopass_stats.sz);
+ stream->nbytes += pkt->data.raw.sz;
+ break;
+#if CONFIG_FP_MB_STATS
+ case AOM_CODEC_FPMB_STATS_PKT:
+ stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf,
+ pkt->data.firstpass_mb_stats.sz);
+ stream->nbytes += pkt->data.raw.sz;
+ break;
+#endif
+ case AOM_CODEC_PSNR_PKT:
+
+ if (global->show_psnr) {
+ int i;
+
+ stream->psnr_sse_total += pkt->data.psnr.sse[0];
+ stream->psnr_samples_total += pkt->data.psnr.samples[0];
+ for (i = 0; i < 4; i++) {
+ if (!global->quiet)
+ fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]);
+ stream->psnr_totals[i] += pkt->data.psnr.psnr[i];
+ }
+ stream->psnr_count++;
+ }
+
+ break;
+ default: break;
+ }
+ }
+}
+
+static void show_psnr(struct stream_state *stream, double peak, int64_t bps) {
+ int i;
+ double ovpsnr;
+
+ if (!stream->psnr_count) return;
+
+ fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
+ ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak,
+ (double)stream->psnr_sse_total);
+ fprintf(stderr, " %.3f", ovpsnr);
+
+ for (i = 0; i < 4; i++) {
+ fprintf(stderr, " %.3f", stream->psnr_totals[i] / stream->psnr_count);
+ }
+ if (bps > 0) {
+ fprintf(stderr, " %7" PRId64 " bps", bps);
+ }
+ fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000);
+ fprintf(stderr, "\n");
+}
+
+static float usec_to_fps(uint64_t usec, unsigned int frames) {
+ return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
+}
+
+static void test_decode(struct stream_state *stream,
+ enum TestDecodeFatality fatal) {
+ aom_image_t enc_img, dec_img;
+
+ if (stream->mismatch_seen) return;
+
+ /* Get the internal reference frame */
+ aom_codec_control(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img);
+ aom_codec_control(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img);
+
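+ /* If only one of the two images is high bit depth, truncate it to 8 bits
+ * so the planes can be compared directly. */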
+ if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
+ (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+ if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_image_t enc_hbd_img;
+ aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+ enc_img.d_w, enc_img.d_h, 16);
+ aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img);
+ enc_img = enc_hbd_img;
+ }
+ if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_image_t dec_hbd_img;
+ aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+ dec_img.d_w, dec_img.d_h, 16);
+ aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img);
+ dec_img = dec_hbd_img;
+ }
+ }
+
+ ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
+ ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
+
+ if (!aom_compare_img(&enc_img, &dec_img)) {
+ int y[4], u[4], v[4];
+ if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
+ } else {
+ aom_find_mismatch(&enc_img, &dec_img, y, u, v);
+ }
+ stream->decoder.err = 1;
+ warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,
+ "Stream %d: Encode/decode mismatch on frame %d at"
+ " Y[%d, %d] {%d/%d},"
+ " U[%d, %d] {%d/%d},"
+ " V[%d, %d] {%d/%d}",
+ stream->index, stream->frames_out, y[0], y[1], y[2],
+ y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]);
+ stream->mismatch_seen = stream->frames_out;
+ }
+
+ aom_img_free(&enc_img);
+ aom_img_free(&dec_img);
+}
+
+static void print_time(const char *label, int64_t etl) {
+ int64_t hours;
+ int64_t mins;
+ int64_t secs;
+
+ if (etl >= 0) {
+ hours = etl / 3600;
+ etl -= hours * 3600;
+ mins = etl / 60;
+ etl -= mins * 60;
+ secs = etl;
+
+ fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label,
+ hours, mins, secs);
+ } else {
+ fprintf(stderr, "[%3s unknown] ", label);
+ }
+}
+
+int main(int argc, const char **argv_) {
+ int pass;
+ aom_image_t raw;
+ aom_image_t raw_shift;
+ int allocated_raw_shift = 0;
+ int use_16bit_internal = 0;
+ int input_shift = 0;
+ int frame_avail, got_data;
+
+ struct AvxInputContext input;
+ struct AvxEncoderConfig global;
+ struct stream_state *streams = NULL;
+ char **argv, **argi;
+ uint64_t cx_time = 0;
+ int stream_cnt = 0;
+ int res = 0;
+ int profile_updated = 0;
+
+ memset(&input, 0, sizeof(input));
+ exec_name = argv_[0];
+
+ /* Setup default input stream settings */
+ input.framerate.numerator = 30;
+ input.framerate.denominator = 1;
+ input.only_i420 = 1;
+ input.bit_depth = 0;
+
+ /* First parse the global configuration values, because we want to apply
+ * other parameters on top of the default configuration provided by the
+ * codec.
+ */
+ argv = argv_dup(argc - 1, argv_ + 1);
+ parse_global_config(&global, &argc, &argv);
+
+#if CONFIG_FILEOPTIONS
+ if (argc < 2) usage_exit();
+#else
+ if (argc < 3) usage_exit();
+#endif
+
+ switch (global.color_type) {
+ case I420: input.fmt = AOM_IMG_FMT_I420; break;
+ case I422: input.fmt = AOM_IMG_FMT_I422; break;
+ case I444: input.fmt = AOM_IMG_FMT_I444; break;
+ case YV12: input.fmt = AOM_IMG_FMT_YV12; break;
+ }
+
+ {
+ /* Now parse each stream's parameters. Using a local scope here
+ * due to the use of 'stream' as loop variable in FOREACH_STREAM
+ * loops
+ */
+ struct stream_state *stream = NULL;
+
+ do {
+ stream = new_stream(&global, stream);
+ stream_cnt++;
+ if (!streams) streams = stream;
+ } while (parse_stream_params(&global, stream, argv));
+ }
+
+ /* Check for unrecognized options */
+ for (argi = argv; *argi; argi++)
+ if (argi[0][0] == '-' && argi[0][1])
+ die("Error: Unrecognized option %s\n", *argi);
+
+ FOREACH_STREAM(stream, streams) {
+ check_encoder_config(global.disable_warning_prompt, &global,
+ &stream->config.cfg);
+ }
+
+ /* Handle non-option arguments */
+ input.filename = argv[0];
+
+ if (!input.filename) {
+ fprintf(stderr, "No input file specified!\n");
+ usage_exit();
+ }
+
+ /* Decide whether chroma subsamplings other than 4:2:0 are supported */
+ if (global.codec->fourcc == AV1_FOURCC) input.only_i420 = 0;
+
+ for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
+ int frames_in = 0, seen_frames = 0;
+ int64_t estimated_time_left = -1;
+ int64_t average_rate = -1;
+ int64_t lagged_count = 0;
+
+ open_input_file(&input);
+
+ /* If the input file doesn't specify its w/h (raw files), try to get
+ * the data from the first stream's configuration.
+ */
+ if (!input.width || !input.height) {
+ FOREACH_STREAM(stream, streams) {
+ if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+ input.width = stream->config.cfg.g_w;
+ input.height = stream->config.cfg.g_h;
+ break;
+ }
+ };
+ }
+
+ /* Update stream configurations from the input file's parameters */
+ if (!input.width || !input.height)
+ fatal(
+ "Specify stream dimensions with --width (-w) "
+ " and --height (-h)");
+
+ /* If input file does not specify bit-depth but input-bit-depth parameter
+ * exists, assume that to be the input bit-depth. However, if the
+ * input-bit-depth parameter does not exist, assume the input bit-depth
+ * to be the same as the codec bit-depth.
+ */
+ if (!input.bit_depth) {
+ FOREACH_STREAM(stream, streams) {
+ if (stream->config.cfg.g_input_bit_depth)
+ input.bit_depth = stream->config.cfg.g_input_bit_depth;
+ else
+ input.bit_depth = stream->config.cfg.g_input_bit_depth =
+ (int)stream->config.cfg.g_bit_depth;
+ }
+ if (input.bit_depth > 8) input.fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+ } else {
+ FOREACH_STREAM(stream, streams) {
+ stream->config.cfg.g_input_bit_depth = input.bit_depth;
+ }
+ }
+
+ FOREACH_STREAM(stream, streams) {
+ if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016) {
+ /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
+ was selected. */
+ switch (stream->config.cfg.g_profile) {
+ case 0:
+ if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+ input.fmt == AOM_IMG_FMT_I44416)) {
+ if (!stream->config.cfg.monochrome) {
+ stream->config.cfg.g_profile = 1;
+ profile_updated = 1;
+ }
+ } else if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+ input.fmt == AOM_IMG_FMT_I42216) {
+ stream->config.cfg.g_profile = 2;
+ profile_updated = 1;
+ }
+ break;
+ case 1:
+ if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+ input.fmt == AOM_IMG_FMT_I42216) {
+ stream->config.cfg.g_profile = 2;
+ profile_updated = 1;
+ } else if (input.bit_depth < 12 &&
+ (input.fmt == AOM_IMG_FMT_I420 ||
+ input.fmt == AOM_IMG_FMT_I42016)) {
+ stream->config.cfg.g_profile = 0;
+ profile_updated = 1;
+ }
+ break;
+ case 2:
+ if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+ input.fmt == AOM_IMG_FMT_I44416)) {
+ stream->config.cfg.g_profile = 1;
+ profile_updated = 1;
+ } else if (input.bit_depth < 12 &&
+ (input.fmt == AOM_IMG_FMT_I420 ||
+ input.fmt == AOM_IMG_FMT_I42016)) {
+ stream->config.cfg.g_profile = 0;
+ profile_updated = 1;
+ } else if (input.bit_depth == 12 &&
+ input.file_type == FILE_TYPE_Y4M) {
+ // Note that here the input file values for chroma subsampling
+ // are used instead of those from the command line.
+ aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X,
+ input.y4m.dst_c_dec_h >> 1);
+ aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y,
+ input.y4m.dst_c_dec_v >> 1);
+ } else if (input.bit_depth == 12 &&
+ input.file_type == FILE_TYPE_RAW) {
+ aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X,
+ stream->chroma_subsampling_x);
+ aom_codec_control(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y,
+ stream->chroma_subsampling_y);
+ }
+ break;
+ default: break;
+ }
+ }
+ if (stream->config.cfg.g_bit_depth > 10) {
+ switch (stream->config.cfg.g_profile) {
+ case 0:
+ case 1:
+ stream->config.cfg.g_profile = 2;
+ profile_updated = 1;
+ break;
+ default: break;
+ }
+ }
+ if (stream->config.cfg.g_bit_depth > 8) {
+ stream->config.use_16bit_internal = 1;
+ }
+ if (profile_updated && !global.quiet) {
+ fprintf(stderr,
+ "Warning: automatically updating to profile %d to "
+ "match input format.\n",
+ stream->config.cfg.g_profile);
+ }
+ /* Set limit */
+ stream->config.cfg.g_limit = global.limit;
+ }
+
+ FOREACH_STREAM(stream, streams) {
+ set_stream_dimensions(stream, input.width, input.height);
+ }
+ FOREACH_STREAM(stream, streams) { validate_stream_config(stream, &global); }
+
+ /* Ensure that --passes and --pass are consistent. If --pass is set and
+ * --passes=2, ensure --fpf was set.
+ */
+ if (global.pass && global.passes == 2) {
+ FOREACH_STREAM(stream, streams) {
+ if (!stream->config.stats_fn)
+ die("Stream %d: Must specify --fpf when --pass=%d"
+ " and --passes=2\n",
+ stream->index, global.pass);
+ }
+ }
+
+#if !CONFIG_WEBM_IO
+ FOREACH_STREAM(stream, streams) {
+ if (stream->config.write_webm) {
+ stream->config.write_webm = 0;
+ stream->config.write_ivf = 0;
+ warn("aomenc compiled w/o WebM support. Writing OBU stream.");
+ }
+ }
+#endif
+
+ /* Use the frame rate from the file only if none was specified
+ * on the command-line.
+ */
+ if (!global.have_framerate) {
+ global.framerate.num = input.framerate.numerator;
+ global.framerate.den = input.framerate.denominator;
+ }
+ FOREACH_STREAM(stream, streams) {
+ stream->config.cfg.g_timebase.den = global.framerate.num;
+ stream->config.cfg.g_timebase.num = global.framerate.den;
+ }
+ /* Show configuration */
+ if (global.verbose && pass == 0) {
+ FOREACH_STREAM(stream, streams) {
+ show_stream_config(stream, &global, &input);
+ }
+ }
+
+ if (pass == (global.pass ? global.pass - 1 : 0)) {
+ if (input.file_type == FILE_TYPE_Y4M)
+ /*The Y4M reader does its own allocation.
+ Just initialize this here to avoid problems if we never read any
+ frames.*/
+ memset(&raw, 0, sizeof(raw));
+ else
+ aom_img_alloc(&raw, input.fmt, input.width, input.height, 32);
+
+ FOREACH_STREAM(stream, streams) {
+ stream->rate_hist =
+ init_rate_histogram(&stream->config.cfg, &global.framerate);
+ }
+ }
+
+ FOREACH_STREAM(stream, streams) { setup_pass(stream, &global, pass); }
+ FOREACH_STREAM(stream, streams) {
+ open_output_file(stream, &global, &input.pixel_aspect_ratio);
+ }
+ FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); }
+ if (strcmp(global.codec->name, "av1") == 0) {
+ // Check to see if at least one stream uses 16 bit internal.
+ // Currently assume that the bit_depths for all streams using
+ // highbitdepth are the same.
+ FOREACH_STREAM(stream, streams) {
+ if (stream->config.use_16bit_internal) {
+ use_16bit_internal = 1;
+ }
+ input_shift = (int)stream->config.cfg.g_bit_depth -
+ stream->config.cfg.g_input_bit_depth;
+ };
+ }
+
+ frame_avail = 1;
+ got_data = 0;
+
+ while (frame_avail || got_data) {
+ struct aom_usec_timer timer;
+
+ if (!global.limit || frames_in < global.limit) {
+ frame_avail = read_frame(&input, &raw);
+
+ if (frame_avail) frames_in++;
+ seen_frames =
+ frames_in > global.skip_frames ? frames_in - global.skip_frames : 0;
+
+ if (!global.quiet) {
+ float fps = usec_to_fps(cx_time, seen_frames);
+ fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes);
+
+ if (stream_cnt == 1)
+ fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in,
+ streams->frames_out, (int64_t)streams->nbytes);
+ else
+ fprintf(stderr, "frame %4d ", frames_in);
+
+ fprintf(stderr, "%7" PRId64 " %s %.2f %s ",
+ cx_time > 9999999 ? cx_time / 1000 : cx_time,
+ cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60,
+ fps >= 1.0 ? "fps" : "fpm");
+ print_time("ETA", estimated_time_left);
+ }
+
+ } else {
+ frame_avail = 0;
+ }
+
+ if (frames_in > global.skip_frames) {
+ aom_image_t *frame_to_encode;
+ if (input_shift || (use_16bit_internal && input.bit_depth == 8)) {
+ assert(use_16bit_internal);
+ // Input bit depth and stream bit depth do not match, so upshift the
+ // frame to the stream bit depth.
+ if (!allocated_raw_shift) {
+ aom_img_alloc(&raw_shift, raw.fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+ input.width, input.height, 32);
+ allocated_raw_shift = 1;
+ }
+ aom_img_upshift(&raw_shift, &raw, input_shift);
+ frame_to_encode = &raw_shift;
+ } else {
+ frame_to_encode = &raw;
+ }
+ aom_usec_timer_start(&timer);
+ if (use_16bit_internal) {
+ assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
+ FOREACH_STREAM(stream, streams) {
+ if (stream->config.use_16bit_internal)
+ encode_frame(stream, &global,
+ frame_avail ? frame_to_encode : NULL, frames_in);
+ else
+ assert(0);
+ };
+ } else {
+ assert((frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH) == 0);
+ FOREACH_STREAM(stream, streams) {
+ encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL,
+ frames_in);
+ }
+ }
+ aom_usec_timer_mark(&timer);
+ cx_time += aom_usec_timer_elapsed(&timer);
+
+ FOREACH_STREAM(stream, streams) { update_quantizer_histogram(stream); }
+
+ got_data = 0;
+ FOREACH_STREAM(stream, streams) {
+ get_cx_data(stream, &global, &got_data);
+ }
+
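+ /* Estimate the remaining time from the progress so far. While the encoder
+ * is still lagging (no packets out yet) only the lag is recorded; after
+ * that the observed rate is smoothed with a 7/8 exponential moving average
+ * and applied to the remaining input. */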
+ if (!got_data && input.length && streams != NULL &&
+ !streams->frames_out) {
+ lagged_count = global.limit ? seen_frames : ftello(input.file);
+ } else if (input.length) {
+ int64_t remaining;
+ int64_t rate;
+
+ if (global.limit) {
+ const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000;
+
+ rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0;
+ remaining = 1000 * (global.limit - global.skip_frames -
+ seen_frames + lagged_count);
+ } else {
+ const int64_t input_pos = ftello(input.file);
+ const int64_t input_pos_lagged = input_pos - lagged_count;
+ const int64_t input_limit = input.length;
+
+ rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0;
+ remaining = input_limit - input_pos + lagged_count;
+ }
+
+ average_rate =
+ (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8;
+ estimated_time_left = average_rate ? remaining / average_rate : -1;
+ }
+
+ if (got_data && global.test_decode != TEST_DECODE_OFF) {
+ FOREACH_STREAM(stream, streams) {
+ test_decode(stream, global.test_decode);
+ }
+ }
+ }
+
+ fflush(stdout);
+ if (!global.quiet) fprintf(stderr, "\033[K");
+ }
+
+ if (stream_cnt > 1) fprintf(stderr, "\n");
+
+ if (!global.quiet) {
+ FOREACH_STREAM(stream, streams) {
+ const int64_t bpf =
+ seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0;
+ const int64_t bps = bpf * global.framerate.num / global.framerate.den;
+ fprintf(stderr,
+ "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64
+ "b/f %7" PRId64
+ "b/s"
+ " %7" PRId64 " %s (%.2f fps)\033[K\n",
+ pass + 1, global.passes, frames_in, stream->frames_out,
+ (int64_t)stream->nbytes, bpf, bps,
+ stream->cx_time > 9999999 ? stream->cx_time / 1000
+ : stream->cx_time,
+ stream->cx_time > 9999999 ? "ms" : "us",
+ usec_to_fps(stream->cx_time, seen_frames));
+ }
+ }
+
+ if (global.show_psnr) {
+ if (global.codec->fourcc == AV1_FOURCC) {
+ FOREACH_STREAM(stream, streams) {
+ int64_t bps = 0;
+ if (stream->psnr_count && seen_frames && global.framerate.den) {
+ bps = (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num /
+ global.framerate.den / seen_frames;
+ }
+ show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1,
+ bps);
+ }
+ } else {
+ FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); }
+ }
+ }
+
+ FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->encoder); }
+
+ if (global.test_decode != TEST_DECODE_OFF) {
+ FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->decoder); }
+ }
+
+ close_input_file(&input);
+
+ if (global.test_decode == TEST_DECODE_FATAL) {
+ FOREACH_STREAM(stream, streams) { res |= stream->mismatch_seen; }
+ }
+ FOREACH_STREAM(stream, streams) {
+ close_output_file(stream, global.codec->fourcc);
+ }
+
+ FOREACH_STREAM(stream, streams) {
+ stats_close(&stream->stats, global.passes - 1);
+ }
+
+#if CONFIG_FP_MB_STATS
+ FOREACH_STREAM(stream, streams) {
+ stats_close(&stream->fpmb_stats, global.passes - 1);
+ }
+#endif
+
+ if (global.pass) break;
+ }
+
+ if (global.show_q_hist_buckets) {
+ FOREACH_STREAM(stream, streams) {
+ show_q_histogram(stream->counts, global.show_q_hist_buckets);
+ }
+ }
+
+ if (global.show_rate_hist_buckets) {
+ FOREACH_STREAM(stream, streams) {
+ show_rate_histogram(stream->rate_hist, &stream->config.cfg,
+ global.show_rate_hist_buckets);
+ }
+ }
+ FOREACH_STREAM(stream, streams) { destroy_rate_histogram(stream->rate_hist); }
+
+#if CONFIG_INTERNAL_STATS
+ /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now,
+ * to match some existing utilities.
+ */
+ if (!(global.pass == 1 && global.passes == 2)) {
+ FOREACH_STREAM(stream, streams) {
+ FILE *f = fopen("opsnr.stt", "a");
+ if (stream->mismatch_seen) {
+ fprintf(f, "First mismatch occurred in frame %d\n",
+ stream->mismatch_seen);
+ } else {
+ fprintf(f, "No mismatch detected in recon buffers\n");
+ }
+ fclose(f);
+ }
+ }
+#endif
+
+ if (allocated_raw_shift) aom_img_free(&raw_shift);
+ aom_img_free(&raw);
+ free(argv);
+ free(streams);
+ return res ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/third_party/aom/apps/aomenc.h b/third_party/aom/apps/aomenc.h
new file mode 100644
index 000000000..7c23df006
--- /dev/null
+++ b/third_party/aom/apps/aomenc.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_APPS_AOMENC_H_
+#define AOM_APPS_AOMENC_H_
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum TestDecodeFatality {
+ TEST_DECODE_OFF,
+ TEST_DECODE_FATAL,
+ TEST_DECODE_WARN,
+};
+
+typedef enum {
+ I420, // 4:2:0 8+ bit-depth
+ I422, // 4:2:2 8+ bit-depth
+ I444, // 4:4:4 8+ bit-depth
+ YV12, // 4:2:0 with uv flipped, only 8-bit depth
+} ColorInputType;
+
+struct AvxInterface;
+
+/* Configuration elements common to all streams. */
+struct AvxEncoderConfig {
+ const struct AvxInterface *codec;
+ int passes;
+ int pass;
+ int usage;
+ ColorInputType color_type;
+ int quiet;
+ int verbose;
+ int limit;
+ int skip_frames;
+ int show_psnr;
+ enum TestDecodeFatality test_decode;
+ int have_framerate;
+ struct aom_rational framerate;
+ int debug;
+ int show_q_hist_buckets;
+ int show_rate_hist_buckets;
+ int disable_warnings;
+ int disable_warning_prompt;
+ int experimental_bitstream;
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_APPS_AOMENC_H_
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
new file mode 100644
index 000000000..3a7cd7ee1
--- /dev/null
+++ b/third_party/aom/av1/av1.cmake
@@ -0,0 +1,469 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AV1_AV1_CMAKE_)
+ return()
+endif() # AOM_AV1_AV1_CMAKE_
+set(AOM_AV1_AV1_CMAKE_ 1)
+
+list(APPEND AOM_AV1_COMMON_SOURCES
+ "${AOM_ROOT}/av1/av1_iface_common.h"
+ "${AOM_ROOT}/av1/common/alloccommon.c"
+ "${AOM_ROOT}/av1/common/alloccommon.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.c"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.h"
+ "${AOM_ROOT}/av1/common/av1_txfm.c"
+ "${AOM_ROOT}/av1/common/av1_txfm.h"
+ "${AOM_ROOT}/av1/common/blockd.c"
+ "${AOM_ROOT}/av1/common/blockd.h"
+ "${AOM_ROOT}/av1/common/cdef.c"
+ "${AOM_ROOT}/av1/common/cdef.h"
+ "${AOM_ROOT}/av1/common/cdef_block.c"
+ "${AOM_ROOT}/av1/common/cdef_block.h"
+ "${AOM_ROOT}/av1/common/cfl.c"
+ "${AOM_ROOT}/av1/common/cfl.h"
+ "${AOM_ROOT}/av1/common/common.h"
+ "${AOM_ROOT}/av1/common/common_data.h"
+ "${AOM_ROOT}/av1/common/convolve.c"
+ "${AOM_ROOT}/av1/common/convolve.h"
+ "${AOM_ROOT}/av1/common/debugmodes.c"
+ "${AOM_ROOT}/av1/common/entropy.c"
+ "${AOM_ROOT}/av1/common/entropy.h"
+ "${AOM_ROOT}/av1/common/entropymode.c"
+ "${AOM_ROOT}/av1/common/entropymode.h"
+ "${AOM_ROOT}/av1/common/entropymv.c"
+ "${AOM_ROOT}/av1/common/entropymv.h"
+ "${AOM_ROOT}/av1/common/enums.h"
+ "${AOM_ROOT}/av1/common/filter.h"
+ "${AOM_ROOT}/av1/common/frame_buffers.c"
+ "${AOM_ROOT}/av1/common/frame_buffers.h"
+ "${AOM_ROOT}/av1/common/idct.c"
+ "${AOM_ROOT}/av1/common/idct.h"
+ "${AOM_ROOT}/av1/common/mv.h"
+ "${AOM_ROOT}/av1/common/mvref_common.c"
+ "${AOM_ROOT}/av1/common/mvref_common.h"
+ "${AOM_ROOT}/av1/common/obu_util.c"
+ "${AOM_ROOT}/av1/common/obu_util.h"
+ "${AOM_ROOT}/av1/common/odintrin.c"
+ "${AOM_ROOT}/av1/common/odintrin.h"
+ "${AOM_ROOT}/av1/common/onyxc_int.h"
+ "${AOM_ROOT}/av1/common/pred_common.c"
+ "${AOM_ROOT}/av1/common/pred_common.h"
+ "${AOM_ROOT}/av1/common/quant_common.c"
+ "${AOM_ROOT}/av1/common/quant_common.h"
+ "${AOM_ROOT}/av1/common/reconinter.c"
+ "${AOM_ROOT}/av1/common/reconinter.h"
+ "${AOM_ROOT}/av1/common/reconintra.c"
+ "${AOM_ROOT}/av1/common/reconintra.h"
+ "${AOM_ROOT}/av1/common/resize.c"
+ "${AOM_ROOT}/av1/common/resize.h"
+ "${AOM_ROOT}/av1/common/restoration.c"
+ "${AOM_ROOT}/av1/common/restoration.h"
+ "${AOM_ROOT}/av1/common/scale.c"
+ "${AOM_ROOT}/av1/common/scale.h"
+ "${AOM_ROOT}/av1/common/scan.c"
+ "${AOM_ROOT}/av1/common/scan.h"
+ "${AOM_ROOT}/av1/common/seg_common.c"
+ "${AOM_ROOT}/av1/common/seg_common.h"
+ "${AOM_ROOT}/av1/common/thread_common.c"
+ "${AOM_ROOT}/av1/common/thread_common.h"
+ "${AOM_ROOT}/av1/common/tile_common.c"
+ "${AOM_ROOT}/av1/common/tile_common.h"
+ "${AOM_ROOT}/av1/common/timing.c"
+ "${AOM_ROOT}/av1/common/timing.h"
+ "${AOM_ROOT}/av1/common/token_cdfs.h"
+ "${AOM_ROOT}/av1/common/txb_common.c"
+ "${AOM_ROOT}/av1/common/txb_common.h"
+ "${AOM_ROOT}/av1/common/warped_motion.c"
+ "${AOM_ROOT}/av1/common/warped_motion.h")
+
+list(APPEND AOM_AV1_DECODER_SOURCES
+ "${AOM_ROOT}/av1/av1_dx_iface.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.h"
+ "${AOM_ROOT}/av1/decoder/decodemv.c"
+ "${AOM_ROOT}/av1/decoder/decodemv.h"
+ "${AOM_ROOT}/av1/decoder/decoder.c"
+ "${AOM_ROOT}/av1/decoder/decoder.h"
+ "${AOM_ROOT}/av1/decoder/decodetxb.c"
+ "${AOM_ROOT}/av1/decoder/decodetxb.h"
+ "${AOM_ROOT}/av1/decoder/detokenize.c"
+ "${AOM_ROOT}/av1/decoder/detokenize.h"
+ "${AOM_ROOT}/av1/decoder/dthread.c"
+ "${AOM_ROOT}/av1/decoder/dthread.h"
+ "${AOM_ROOT}/av1/decoder/obu.h"
+ "${AOM_ROOT}/av1/decoder/obu.c")
+
+list(APPEND AOM_AV1_ENCODER_SOURCES
+ "${AOM_ROOT}/av1/av1_cx_iface.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.h"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+ "${AOM_ROOT}/av1/encoder/aq_variance.c"
+ "${AOM_ROOT}/av1/encoder/aq_variance.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c"
+ "${AOM_ROOT}/av1/encoder/av1_quantize.c"
+ "${AOM_ROOT}/av1/encoder/av1_quantize.h"
+ "${AOM_ROOT}/av1/encoder/bitstream.c"
+ "${AOM_ROOT}/av1/encoder/bitstream.h"
+ "${AOM_ROOT}/av1/encoder/block.h"
+ "${AOM_ROOT}/av1/encoder/context_tree.c"
+ "${AOM_ROOT}/av1/encoder/context_tree.h"
+ "${AOM_ROOT}/av1/encoder/corner_detect.c"
+ "${AOM_ROOT}/av1/encoder/corner_detect.h"
+ "${AOM_ROOT}/av1/encoder/corner_match.c"
+ "${AOM_ROOT}/av1/encoder/corner_match.h"
+ "${AOM_ROOT}/av1/encoder/cost.c"
+ "${AOM_ROOT}/av1/encoder/cost.h"
+ "${AOM_ROOT}/av1/encoder/encodeframe.c"
+ "${AOM_ROOT}/av1/encoder/encodeframe.h"
+ "${AOM_ROOT}/av1/encoder/encodemb.c"
+ "${AOM_ROOT}/av1/encoder/encodemb.h"
+ "${AOM_ROOT}/av1/encoder/encodemv.c"
+ "${AOM_ROOT}/av1/encoder/encodemv.h"
+ "${AOM_ROOT}/av1/encoder/encoder.c"
+ "${AOM_ROOT}/av1/encoder/encoder.h"
+ "${AOM_ROOT}/av1/encoder/encodetxb.c"
+ "${AOM_ROOT}/av1/encoder/encodetxb.h"
+ "${AOM_ROOT}/av1/encoder/ethread.c"
+ "${AOM_ROOT}/av1/encoder/ethread.h"
+ "${AOM_ROOT}/av1/encoder/extend.c"
+ "${AOM_ROOT}/av1/encoder/extend.h"
+ "${AOM_ROOT}/av1/encoder/firstpass.c"
+ "${AOM_ROOT}/av1/encoder/firstpass.h"
+ "${AOM_ROOT}/av1/encoder/global_motion.c"
+ "${AOM_ROOT}/av1/encoder/global_motion.h"
+ "${AOM_ROOT}/av1/encoder/grain_test_vectors.h"
+ "${AOM_ROOT}/av1/encoder/hash.c"
+ "${AOM_ROOT}/av1/encoder/hash.h"
+ "${AOM_ROOT}/av1/encoder/hash_motion.c"
+ "${AOM_ROOT}/av1/encoder/hash_motion.h"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+ "${AOM_ROOT}/av1/encoder/lookahead.c"
+ "${AOM_ROOT}/av1/encoder/lookahead.h"
+ "${AOM_ROOT}/av1/encoder/mbgraph.c"
+ "${AOM_ROOT}/av1/encoder/mbgraph.h"
+ "${AOM_ROOT}/av1/encoder/mcomp.c"
+ "${AOM_ROOT}/av1/encoder/mcomp.h"
+ "${AOM_ROOT}/av1/encoder/ml.c"
+ "${AOM_ROOT}/av1/encoder/ml.h"
+ "${AOM_ROOT}/av1/encoder/palette.c"
+ "${AOM_ROOT}/av1/encoder/palette.h"
+ "${AOM_ROOT}/av1/encoder/pickcdef.c"
+ "${AOM_ROOT}/av1/encoder/picklpf.c"
+ "${AOM_ROOT}/av1/encoder/picklpf.h"
+ "${AOM_ROOT}/av1/encoder/pickrst.c"
+ "${AOM_ROOT}/av1/encoder/pickrst.h"
+ "${AOM_ROOT}/av1/encoder/ransac.c"
+ "${AOM_ROOT}/av1/encoder/ransac.h"
+ "${AOM_ROOT}/av1/encoder/ratectrl.c"
+ "${AOM_ROOT}/av1/encoder/ratectrl.h"
+ "${AOM_ROOT}/av1/encoder/rd.c"
+ "${AOM_ROOT}/av1/encoder/rd.h"
+ "${AOM_ROOT}/av1/encoder/rdopt.c"
+ "${AOM_ROOT}/av1/encoder/rdopt.h"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.c"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
+ "${AOM_ROOT}/av1/encoder/segmentation.c"
+ "${AOM_ROOT}/av1/encoder/segmentation.h"
+ "${AOM_ROOT}/av1/encoder/speed_features.c"
+ "${AOM_ROOT}/av1/encoder/speed_features.h"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+ "${AOM_ROOT}/av1/encoder/tokenize.c"
+ "${AOM_ROOT}/av1/encoder/tokenize.h"
+ "${AOM_ROOT}/av1/encoder/wedge_utils.c"
+ "${AOM_ROOT}/third_party/fastfeat/fast.c"
+ "${AOM_ROOT}/third_party/fastfeat/fast.h"
+ "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
+ "${AOM_ROOT}/third_party/fastfeat/nonmax.c"
+ "${AOM_ROOT}/third_party/vector/vector.c"
+ "${AOM_ROOT}/third_party/vector/vector.h"
+ "${AOM_ROOT}/av1/encoder/dwt.c"
+ "${AOM_ROOT}/av1/encoder/dwt.h")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/av1/common/cdef_block_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/cfl_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/av1/common/cdef_block_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h"
+ "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/av1/common/cdef_block_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_highbd_convolve_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h"
+ "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/av1/common/cdef_block_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h"
+ "${AOM_ROOT}/av1/common/x86/cfl_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
+
+list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
+ "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm"
+ "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h"
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
+
+list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
+ "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
+ "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
+ "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c"
+ "${AOM_ROOT}/av1/common/arm/cfl_neon.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon.h"
+ "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/mem_neon.h"
+ "${AOM_ROOT}/av1/common/arm/transpose_neon.h"
+ "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
+ "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
+ "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
+ "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
+ "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
+ "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
+ "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
+ "${AOM_ROOT}/av1/common/cdef_block_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
+ "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c")
+
+if(CONFIG_ACCOUNTING)
+ list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c"
+ "${AOM_ROOT}/av1/decoder/accounting.h")
+endif()
+
+if(CONFIG_INSPECTION)
+ list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c"
+ "${AOM_ROOT}/av1/decoder/inspection.h")
+endif()
+
+if(CONFIG_INTERNAL_STATS)
+ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c")
+endif()
+
+# Setup AV1 common/decoder/encoder targets. The libaom target must exist before
+# this function is called.
+function(setup_av1_targets)
+ add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_av1_common)
+
+ create_dummy_source_file("aom_av1" "c" "dummy_source_file")
+ add_library(aom_av1 OBJECT "${dummy_source_file}")
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+ list(APPEND AOM_LIB_TARGETS aom_av1)
+
+ # Not all generators support libraries consisting only of object files. Add a
+ # dummy source file to the aom_av1 target.
+ add_dummy_source_file_to_target("aom_av1" "c")
+
+ if(CONFIG_AV1_DECODER)
+ add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+ endif()
+
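+  # Each x86 block below first probes the compiler for the required -m flag and
+  # then hands the matching AOM_AV1_*_INTRIN_* list to
+  # add_intrinsics_object_library() for the "aom" target; the NEON, VSX and MSA
+  # blocks do the same without the flag probe.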
+ if(HAVE_SSE2)
+ require_compiler_flag_nomsvc("-msse2" NO)
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSE2" "aom")
+ if(CONFIG_AV1_DECODER)
+ if(AOM_AV1_DECODER_ASM_SSE2)
+ add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2" "aom")
+ endif()
+
+ if(AOM_AV1_DECODER_INTRIN_SSE2)
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder"
+ "AOM_AV1_DECODER_INTRIN_SSE2" "aom")
+ endif()
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2" "aom")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE2" "aom")
+ endif()
+ endif()
+
+ if(HAVE_SSSE3)
+ require_compiler_flag_nomsvc("-mssse3" NO)
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSSE3" "aom")
+
+ if(CONFIG_AV1_DECODER)
+ if(AOM_AV1_DECODER_INTRIN_SSSE3)
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder"
+ "AOM_AV1_DECODER_INTRIN_SSSE3" "aom")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_SSE4_1)
+ require_compiler_flag_nomsvc("-msse4.1" NO)
+ add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSE4_1" "aom")
+
+ if(CONFIG_AV1_ENCODER)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ add_asm_library("aom_av1_encoder_ssse3"
+ "AOM_AV1_ENCODER_ASM_SSSE3_X86_64" "aom")
+ endif()
+
+ if(AOM_AV1_ENCODER_INTRIN_SSE4_1)
+ add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_1" "aom")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_SSE4_2)
+ require_compiler_flag_nomsvc("-msse4.2" NO)
+ if(CONFIG_AV1_ENCODER)
+ if(AOM_AV1_ENCODER_INTRIN_SSE4_2)
+ add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_AVX2)
+ require_compiler_flag_nomsvc("-mavx2" NO)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_AVX2" "aom")
+
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_AVX2" "aom")
+ endif()
+ endif()
+
+ if(HAVE_NEON)
+ if(AOM_AV1_COMMON_INTRIN_NEON)
+ add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
+ "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_NEON" "aom")
+ endif()
+
+ if(AOM_AV1_ENCODER_INTRIN_NEON)
+ add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_NEON" "aom")
+ endif()
+ endif()
+
+ if(HAVE_VSX)
+ if(AOM_AV1_COMMON_INTRIN_VSX)
+ add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_VSX" "aom")
+ endif()
+ endif()
+
+ if(HAVE_MSA)
+ add_intrinsics_object_library("" "msa" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_MSA" "aom")
+ endif()
+
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
+
+function(setup_av1_test_targets)
+endfunction()
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
new file mode 100644
index 000000000..3295f618a
--- /dev/null
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -0,0 +1,1908 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom_ports/aom_once.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/system_state.h"
+
+#include "aom/aom_encoder.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#include "av1/av1_iface_common.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+#define MAG_SIZE (4)
+#define MAX_NUM_ENHANCEMENT_LAYERS 3
+
+struct av1_extracfg {
+ int cpu_used; // encoder speed preset; validated to the range 0..8 and copied to oxcf->speed
+ unsigned int enable_auto_alt_ref;
+ unsigned int enable_auto_bwd_ref;
+ unsigned int noise_sensitivity;
+ unsigned int sharpness;
+ unsigned int static_thresh;
+ unsigned int row_mt;
+ unsigned int tile_columns; // log2 number of tile columns
+ unsigned int tile_rows; // log2 number of tile rows
+ unsigned int arnr_max_frames;
+ unsigned int arnr_strength;
+ unsigned int min_gf_interval;
+ unsigned int max_gf_interval;
+ aom_tune_metric tuning;
+ unsigned int cq_level; // constrained quality level
+ unsigned int rc_max_intra_bitrate_pct;
+ unsigned int rc_max_inter_bitrate_pct;
+ unsigned int gf_cbr_boost_pct;
+ unsigned int lossless;
+ unsigned int enable_cdef;
+ unsigned int enable_restoration;
+ unsigned int disable_trellis_quant;
+ unsigned int enable_qm;
+ unsigned int qm_y;
+ unsigned int qm_u;
+ unsigned int qm_v;
+ unsigned int qm_min;
+ unsigned int qm_max;
+#if CONFIG_DIST_8X8
+ unsigned int enable_dist_8x8;
+#endif
+ unsigned int num_tg;
+ unsigned int mtu_size;
+
+ aom_timing_info_type_t timing_info_type;
+ unsigned int frame_parallel_decoding_mode;
+ int use_dual_filter;
+ AQ_MODE aq_mode;
+ DELTAQ_MODE deltaq_mode;
+ unsigned int frame_periodic_boost;
+ aom_bit_depth_t bit_depth;
+ aom_tune_content content;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ aom_chroma_sample_position_t chroma_sample_position;
+ int color_range;
+ int render_width;
+ int render_height;
+ aom_superblock_size_t superblock_size;
+ unsigned int single_tile_decoding;
+ int error_resilient_mode;
+ int s_frame_mode;
+
+ int film_grain_test_vector;
+ const char *film_grain_table_filename;
+ unsigned int motion_vector_unit_test;
+ unsigned int cdf_update_mode;
+ int enable_order_hint;
+ int enable_jnt_comp;
+ int enable_ref_frame_mvs; // sequence level
+ int allow_ref_frame_mvs; // frame level
+ int enable_warped_motion; // sequence level
+ int allow_warped_motion; // frame level
+ int enable_superres;
+#if CONFIG_DENOISE
+ float noise_level;
+ int noise_block_size;
+#endif
+
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
+};
+
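+// Note: default_extra_cfg is a positional initializer, so the values below
+// must stay in the same order as the fields of struct av1_extracfg.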
+static struct av1_extracfg default_extra_cfg = {
+ 0, // cpu_used
+ 1, // enable_auto_alt_ref
+ 0, // enable_auto_bwd_ref
+ 0, // noise_sensitivity
+ CONFIG_SHARP_SETTINGS, // sharpness
+ 0, // static_thresh
+ 0, // row_mt
+ 0, // tile_columns
+ 0, // tile_rows
+ 7, // arnr_max_frames
+ 5, // arnr_strength
+ 0, // min_gf_interval; 0 -> default decision
+ 0, // max_gf_interval; 0 -> default decision
+ AOM_TUNE_PSNR, // tuning
+ 10, // cq_level
+ 0, // rc_max_intra_bitrate_pct
+ 0, // rc_max_inter_bitrate_pct
+ 0, // gf_cbr_boost_pct
+ 0, // lossless
+ !CONFIG_SHARP_SETTINGS, // enable_cdef
+ 1, // enable_restoration
+ 0, // disable_trellis_quant
+ 0, // enable_qm
+ DEFAULT_QM_Y, // qm_y
+ DEFAULT_QM_U, // qm_u
+ DEFAULT_QM_V, // qm_v
+ DEFAULT_QM_FIRST, // qm_min
+ DEFAULT_QM_LAST, // qm_max
+#if CONFIG_DIST_8X8
+ 0,
+#endif
+ 1, // max number of tile groups
+ 0, // mtu_size
+ AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream
+ 1, // frame_parallel_decoding_mode
+ 1, // enable dual filter
+ NO_AQ, // aq_mode
+ NO_DELTA_Q, // deltaq_mode
+ 0, // frame_periodic_boost
+ AOM_BITS_8, // Bit depth
+ AOM_CONTENT_DEFAULT, // content
+ AOM_CICP_CP_UNSPECIFIED, // CICP color space
+ AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics
+ AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients
+ AOM_CSP_UNKNOWN, // chroma sample position
+ 0, // color range
+ 0, // render width
+ 0, // render height
+ AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size
+ 1, // single_tile_decoding; only takes effect when large_scale_tile is enabled.
+ 0, // error_resilient_mode off by default.
+ 0, // s_frame_mode off by default.
+ 0, // film_grain_test_vector
+ 0, // film_grain_table_filename
+ 0, // motion_vector_unit_test
+ 1, // CDF update mode
+ 1, // frame order hint
+ 1, // jnt_comp
+ 1, // enable_ref_frame_mvs sequence level
+ 1, // allow ref_frame_mvs frame level
+ 1, // enable_warped_motion at sequence level
+ 1, // allow_warped_motion at frame level
+ 1, // superres
+#if CONFIG_DENOISE
+ 0, // noise_level
+ 32, // noise_block_size
+#endif
+ 0, // chroma_subsampling_x
+ 0, // chroma_subsampling_y
+};
+
+struct aom_codec_alg_priv {
+ aom_codec_priv_t base;
+ aom_codec_enc_cfg_t cfg;
+ struct av1_extracfg extra_cfg;
+ AV1EncoderConfig oxcf;
+ AV1_COMP *cpi;
+ unsigned char *cx_data;
+ size_t cx_data_sz;
+ unsigned char *pending_cx_data;
+ size_t pending_cx_data_sz;
+ int pending_frame_count;
+ size_t pending_frame_sizes[8];
+ aom_image_t preview_img;
+ aom_enc_frame_flags_t next_frame_flags;
+ aom_postproc_cfg_t preview_ppcfg;
+ aom_codec_pkt_list_decl(256) pkt_list;
+ unsigned int fixed_kf_cntr;
+ // BufferPool that holds all reference frames.
+ BufferPool *buffer_pool;
+};
+
+static aom_codec_err_t update_error_state(
+ aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+ const aom_codec_err_t res = error->error_code;
+
+ if (res != AOM_CODEC_OK)
+ ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+
+ return res;
+}
+
+#undef ERROR
+#define ERROR(str) \
+ do { \
+ ctx->base.err_detail = str; \
+ return AOM_CODEC_INVALID_PARAM; \
+ } while (0)
+
+#define RANGE_CHECK(p, memb, lo, hi) \
+ do { \
+ if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \
+ ERROR(#memb " out of range [" #lo ".." #hi "]"); \
+ } while (0)
+
+#define RANGE_CHECK_HI(p, memb, hi) \
+ do { \
+ if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \
+ } while (0)
+
+#define RANGE_CHECK_BOOL(p, memb) \
+ do { \
+ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \
+ } while (0)
+
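+// The ERROR/RANGE_CHECK* macros above record a human-readable message in
+// ctx->base.err_detail and return AOM_CODEC_INVALID_PARAM from the function
+// that invokes them.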
+static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ const struct av1_extracfg *extra_cfg) {
+ RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available
+ RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available
+ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
+ RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+ RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
+
+ RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
+ RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
+ RANGE_CHECK_BOOL(extra_cfg, lossless);
+ RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1);
+ RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1);
+ RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
+ RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+ RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+ RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
+ RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
+ RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
+ RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
+ RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
+ if (extra_cfg->max_gf_interval > 0) {
+ RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval),
+ (MAX_LAG_BUFFERS - 1));
+ }
+
+ RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1);
+ RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK_HI(cfg, rc_superres_mode, SUPERRES_MODES - 1);
+ RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63);
+ RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63);
+ RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2);
+
+ // AV1 does not support a lower bound on the keyframe interval in
+ // automatic keyframe placement mode.
+ if (cfg->kf_mode != AOM_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist &&
+ cfg->kf_min_dist > 0)
+ ERROR(
+ "kf_min_dist not supported in auto mode, use 0 "
+ "or kf_max_dist instead.");
+
+ RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
+ RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
+ RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
+ RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
+ AOM_SUPERBLOCK_SIZE_DYNAMIC);
+ RANGE_CHECK_HI(cfg, large_scale_tile, 1);
+ RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1);
+
+ RANGE_CHECK_HI(extra_cfg, row_mt, 1);
+
+ RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
+ RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
+
+ RANGE_CHECK_HI(cfg, monochrome, 1);
+
+ if (cfg->large_scale_tile && extra_cfg->aq_mode)
+ ERROR(
+ "Adaptive quantization are not supported in large scale tile "
+ "coding.");
+
+ RANGE_CHECK_HI(extra_cfg, sharpness, 7);
+ RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15);
+ RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
+ RANGE_CHECK_HI(extra_cfg, cq_level, 63);
+ RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12);
+ RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
+ RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
+
+ // TODO(yaowu): remove this when ssim tuning is implemented for av1
+ if (extra_cfg->tuning == AOM_TUNE_SSIM)
+ ERROR("Option --tune=ssim is not currently supported in AV1.");
+
+ if (cfg->g_pass == AOM_RC_LAST_PASS) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+ const FIRSTPASS_STATS *stats;
+
+ if (cfg->rc_twopass_stats_in.buf == NULL)
+ ERROR("rc_twopass_stats_in.buf not set.");
+
+ if (cfg->rc_twopass_stats_in.sz % packet_sz)
+ ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+ if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+ ERROR("rc_twopass_stats_in requires at least two packets.");
+
+ stats =
+ (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1;
+
+ if ((int)(stats->count + 0.5) != n_packets - 1)
+ ERROR("rc_twopass_stats_in missing EOS stats packet");
+ }
+
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+ cfg->g_bit_depth > AOM_BITS_10) {
+ ERROR("Codec bit-depth 12 not supported in profile < 2");
+ }
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+ cfg->g_input_bit_depth > 10) {
+ ERROR("Source bit-depth 12 not supported in profile < 2");
+ }
+
+ RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709,
+ AOM_CICP_CP_EBU_3213); // TODO: check this range more precisely so that
+ // reserved CICP values are rejected.
+ RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709,
+ AOM_CICP_TC_HLG);
+ RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY,
+ AOM_CICP_MC_ICTCP);
+ RANGE_CHECK(extra_cfg, color_range, 0, 1);
+
+#if CONFIG_DIST_8X8
+ RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_DAALA_DIST);
+#else
+ RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM);
+#endif
+
+ RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED,
+ AOM_TIMING_DEC_MODEL);
+
+ RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16);
+
+ if (extra_cfg->lossless) {
+ if (extra_cfg->aq_mode != 0)
+ ERROR("Only --aq_mode=0 can be used with --lossless=1.");
+#if CONFIG_DIST_8X8
+ if (extra_cfg->enable_dist_8x8)
+ ERROR("dist-8x8 cannot be used with lossless compression.");
+#endif
+ }
+
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
+
+ return AOM_CODEC_OK;
+}
+
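+// validate_img() rejects raw-image formats that the configured profile does
+// not allow and images whose dimensions differ from the encoder configuration.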
+static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
+ const aom_image_t *img) {
+ switch (img->fmt) {
+ case AOM_IMG_FMT_YV12:
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_I42016: break;
+ case AOM_IMG_FMT_I444:
+ case AOM_IMG_FMT_I44416:
+ if (ctx->cfg.g_profile == (unsigned int)PROFILE_0 &&
+ !ctx->cfg.monochrome) {
+ ERROR("Invalid image format. I444 images not supported in profile.");
+ }
+ break;
+ case AOM_IMG_FMT_I422:
+ case AOM_IMG_FMT_I42216:
+ if (ctx->cfg.g_profile != (unsigned int)PROFILE_2) {
+ ERROR("Invalid image format. I422 images not supported in profile.");
+ }
+ break;
+ default:
+ ERROR(
+ "Invalid image format. Only YV12, I420, I422, I444 images are "
+ "supported.");
+ break;
+ }
+
+ if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h)
+ ERROR("Image size must match encoder init configuration size");
+
+ return AOM_CODEC_OK;
+}
+
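+// get_image_bps() reports bits per pixel for the supported raw formats;
+// encoder_encode() uses it below to size the worst-case compressed buffer.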
+static int get_image_bps(const aom_image_t *img) {
+ switch (img->fmt) {
+ case AOM_IMG_FMT_YV12:
+ case AOM_IMG_FMT_I420: return 12;
+ case AOM_IMG_FMT_I422: return 16;
+ case AOM_IMG_FMT_I444: return 24;
+ case AOM_IMG_FMT_I42016: return 24;
+ case AOM_IMG_FMT_I42216: return 32;
+ case AOM_IMG_FMT_I44416: return 48;
+ default: assert(0 && "Invalid image format"); break;
+ }
+ return 0;
+}
+
+// Set appropriate options to disable frame super-resolution.
+static void disable_superres(AV1EncoderConfig *const oxcf) {
+ oxcf->superres_mode = SUPERRES_NONE;
+ oxcf->superres_scale_denominator = SCALE_NUMERATOR;
+ oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
+ oxcf->superres_qthresh = 255;
+ oxcf->superres_kf_qthresh = 255;
+}
+
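+// set_encoder_config() folds the public aom_codec_enc_cfg_t and the
+// av1_extracfg control values into the encoder's internal AV1EncoderConfig
+// (oxcf).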
+static aom_codec_err_t set_encoder_config(
+ AV1EncoderConfig *oxcf, const aom_codec_enc_cfg_t *cfg,
+ const struct av1_extracfg *extra_cfg) {
+ const int is_vbr = cfg->rc_end_usage == AOM_VBR;
+ oxcf->profile = cfg->g_profile;
+ oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
+ oxcf->max_threads = (int)cfg->g_threads;
+ oxcf->width = cfg->g_w;
+ oxcf->height = cfg->g_h;
+ oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
+ oxcf->forced_max_frame_height = cfg->g_forced_max_frame_height;
+ oxcf->bit_depth = cfg->g_bit_depth;
+ oxcf->input_bit_depth = cfg->g_input_bit_depth;
+ // Derive the initial frame rate from the timebase; values above 180 fps are treated as bogus and reset to 30 below.
+ oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+ if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL ||
+ extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
+ oxcf->timing_info_present = 1;
+ oxcf->timing_info.num_units_in_display_tick = cfg->g_timebase.num;
+ oxcf->timing_info.time_scale = cfg->g_timebase.den;
+ oxcf->timing_info.num_ticks_per_picture = 1;
+ } else {
+ oxcf->timing_info_present = 0;
+ }
+ if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) {
+ oxcf->timing_info.equal_picture_interval = 1;
+ oxcf->decoder_model_info_present_flag = 0;
+ oxcf->display_model_info_present_flag = 1;
+ } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
+ // if( extra_cfg->arnr_strength > 0 )
+ // {
+ // printf("Only --arnr-strength=0 can currently be used with
+ // --timing-info=model."); return AOM_CODEC_INVALID_PARAM;
+ // }
+ // if( extra_cfg->enable_superres)
+ // {
+ // printf("Only --superres-mode=0 can currently be used with
+ // --timing-info=model."); return AOM_CODEC_INVALID_PARAM;
+ // }
+ oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num;
+ oxcf->timing_info.equal_picture_interval = 0;
+ oxcf->decoder_model_info_present_flag = 1;
+ oxcf->buffer_removal_time_present = 1;
+ oxcf->display_model_info_present_flag = 1;
+ }
+ if (oxcf->init_framerate > 180) {
+ oxcf->init_framerate = 30;
+ oxcf->timing_info_present = 0;
+ }
+ oxcf->mode = GOOD;
+ oxcf->cfg = &cfg->cfg;
+
+ switch (cfg->g_pass) {
+ case AOM_RC_ONE_PASS: oxcf->pass = 0; break;
+ case AOM_RC_FIRST_PASS: oxcf->pass = 1; break;
+ case AOM_RC_LAST_PASS: oxcf->pass = 2; break;
+ }
+
+ oxcf->lag_in_frames =
+ cfg->g_pass == AOM_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames;
+ oxcf->rc_mode = cfg->rc_end_usage;
+
+ // Convert target bandwidth from Kbit/s to Bit/s
+ oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+ oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+ oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+ oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
+
+ oxcf->best_allowed_q =
+ extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer);
+ oxcf->worst_allowed_q =
+ extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer);
+ oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
+ oxcf->fixed_q = -1;
+
+ oxcf->enable_cdef = extra_cfg->enable_cdef;
+ oxcf->enable_restoration = extra_cfg->enable_restoration;
+ oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant;
+ oxcf->using_qm = extra_cfg->enable_qm;
+ oxcf->qm_y = extra_cfg->qm_y;
+ oxcf->qm_u = extra_cfg->qm_u;
+ oxcf->qm_v = extra_cfg->qm_v;
+ oxcf->qm_minlevel = extra_cfg->qm_min;
+ oxcf->qm_maxlevel = extra_cfg->qm_max;
+#if CONFIG_DIST_8X8
+ oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8;
+ if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST ||
+ extra_cfg->tuning == AOM_TUNE_DAALA_DIST)
+ oxcf->using_dist_8x8 = 1;
+#endif
+ oxcf->num_tile_groups = extra_cfg->num_tg;
+ // In large-scale tile encoding mode, num_tile_groups is always 1.
+ if (cfg->large_scale_tile) oxcf->num_tile_groups = 1;
+ oxcf->mtu = extra_cfg->mtu_size;
+
+ // FIXME(debargha): Should this be:
+ // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs &
+ // extra_cfg->enable_order_hint ?
+ // Disallow using temporal MVs while large_scale_tile = 1.
+ oxcf->allow_ref_frame_mvs =
+ extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile;
+ oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
+ oxcf->over_shoot_pct = cfg->rc_overshoot_pct;
+
+ oxcf->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode;
+ oxcf->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator;
+ oxcf->resize_kf_scale_denominator = (uint8_t)cfg->rc_resize_kf_denominator;
+ if (oxcf->resize_mode == RESIZE_FIXED &&
+ oxcf->resize_scale_denominator == SCALE_NUMERATOR &&
+ oxcf->resize_kf_scale_denominator == SCALE_NUMERATOR)
+ oxcf->resize_mode = RESIZE_NONE;
+
+ if (extra_cfg->lossless || cfg->large_scale_tile) {
+ disable_superres(oxcf);
+ } else {
+ oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode;
+ oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator;
+ oxcf->superres_kf_scale_denominator =
+ (uint8_t)cfg->rc_superres_kf_denominator;
+ oxcf->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
+ oxcf->superres_kf_qthresh =
+ av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
+ if (oxcf->superres_mode == SUPERRES_FIXED &&
+ oxcf->superres_scale_denominator == SCALE_NUMERATOR &&
+ oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) {
+ disable_superres(oxcf);
+ }
+ if (oxcf->superres_mode == SUPERRES_QTHRESH &&
+ oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) {
+ disable_superres(oxcf);
+ }
+ }
+
+ oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
+ oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+ oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+
+ oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh;
+
+ oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
+ oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
+
+ oxcf->auto_key =
+ cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
+
+ oxcf->key_freq = cfg->kf_max_dist;
+ oxcf->sframe_dist = cfg->sframe_dist;
+ oxcf->sframe_mode = cfg->sframe_mode;
+ oxcf->sframe_enabled = cfg->sframe_dist != 0;
+ oxcf->speed = extra_cfg->cpu_used;
+ oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
+ oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
+ oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
+ oxcf->sharpness = extra_cfg->sharpness;
+
+ oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
+
+#if CONFIG_FP_MB_STATS
+ oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
+#endif
+
+ oxcf->color_primaries = extra_cfg->color_primaries;
+ oxcf->transfer_characteristics = extra_cfg->transfer_characteristics;
+ oxcf->matrix_coefficients = extra_cfg->matrix_coefficients;
+ oxcf->chroma_sample_position = extra_cfg->chroma_sample_position;
+
+ oxcf->color_range = extra_cfg->color_range;
+ oxcf->render_width = extra_cfg->render_width;
+ oxcf->render_height = extra_cfg->render_height;
+ oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
+ // Adjust g_lag_in_frames down if not needed
+ oxcf->lag_in_frames =
+ AOMMIN(MAX_GF_INTERVAL + oxcf->arnr_max_frames / 2, oxcf->lag_in_frames);
+ oxcf->arnr_strength = extra_cfg->arnr_strength;
+ oxcf->min_gf_interval = extra_cfg->min_gf_interval;
+ oxcf->max_gf_interval = extra_cfg->max_gf_interval;
+
+ oxcf->tuning = extra_cfg->tuning;
+ oxcf->content = extra_cfg->content;
+ oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
+ oxcf->superblock_size = extra_cfg->superblock_size;
+ if (cfg->large_scale_tile) {
+ oxcf->film_grain_test_vector = 0;
+ oxcf->film_grain_table_filename = NULL;
+ } else {
+ oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector;
+ oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename;
+ }
+#if CONFIG_DENOISE
+ oxcf->noise_level = extra_cfg->noise_level;
+ oxcf->noise_block_size = extra_cfg->noise_block_size;
+#endif
+ oxcf->large_scale_tile = cfg->large_scale_tile;
+ oxcf->single_tile_decoding =
+ (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
+ if (oxcf->large_scale_tile) {
+ // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or
+ // AOM_SUPERBLOCK_SIZE_128X128 while oxcf->large_scale_tile = 1. If
+ // superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to
+ // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile).
+ if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 &&
+ extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128)
+ oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+ }
+
+ oxcf->row_mt = extra_cfg->row_mt;
+
+ oxcf->tile_columns = extra_cfg->tile_columns;
+ oxcf->tile_rows = extra_cfg->tile_rows;
+
+ oxcf->monochrome = cfg->monochrome;
+ oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr;
+ oxcf->enable_dual_filter = extra_cfg->use_dual_filter;
+ oxcf->enable_order_hint = extra_cfg->enable_order_hint;
+ oxcf->enable_jnt_comp =
+ extra_cfg->enable_jnt_comp & extra_cfg->enable_order_hint;
+ oxcf->enable_ref_frame_mvs =
+ extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
+
+ oxcf->enable_warped_motion = extra_cfg->enable_warped_motion;
+ oxcf->allow_warped_motion =
+ extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion;
+
+ oxcf->enable_superres =
+ (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres;
+ if (!oxcf->enable_superres) {
+ disable_superres(oxcf);
+ }
+
+ oxcf->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
+ oxcf->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
+ for (int i = 0; i < oxcf->tile_width_count; i++) {
+ oxcf->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1);
+ }
+ for (int i = 0; i < oxcf->tile_height_count; i++) {
+ oxcf->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1);
+ }
+ oxcf->error_resilient_mode =
+ cfg->g_error_resilient | extra_cfg->error_resilient_mode;
+ oxcf->s_frame_mode = extra_cfg->s_frame_mode;
+ oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
+ if (cfg->g_pass == AOM_RC_LAST_PASS) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+ oxcf->limit = n_packets - 1;
+ } else {
+ oxcf->limit = cfg->g_limit;
+ }
+
+ if (oxcf->limit == 1) {
+ // Still-picture mode: the display model and timing information are meaningless.
+ oxcf->display_model_info_present_flag = 0;
+ oxcf->timing_info_present = 0;
+ }
+
+ oxcf->aq_mode = extra_cfg->aq_mode;
+ oxcf->deltaq_mode = extra_cfg->deltaq_mode;
+
+ oxcf->save_as_annexb = cfg->save_as_annexb;
+
+ oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
+ oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
+
+#if CONFIG_REDUCED_ENCODER_BORDER
+ if (oxcf->superres_mode != SUPERRES_NONE ||
+ oxcf->resize_mode != RESIZE_NONE) {
+ warn(
+ "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. "
+ "Disabling superres/resize.\n");
+ // return AOM_CODEC_INVALID_PARAM;
+ disable_superres(oxcf);
+ oxcf->resize_mode = RESIZE_NONE;
+ oxcf->resize_scale_denominator = SCALE_NUMERATOR;
+ oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR;
+ }
+#endif // CONFIG_REDUCED_ENCODER_BORDER
+
+ oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
+ oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
+ const aom_codec_enc_cfg_t *cfg) {
+ aom_codec_err_t res;
+ int force_key = 0;
+
+ if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+ if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS)
+ ERROR("Cannot change width or height after initialization");
+ if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+ (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+ (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ force_key = 1;
+ }
+
+ // Prevent increasing lag_in_frames. This check is stricter than it needs
+ // to be -- the limit is not increasing past the first lag_in_frames
+ // value, but we don't track the initial config, only the last successful
+ // config.
+ if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames)
+ ERROR("Cannot increase lag_in_frames");
+
+ res = validate_config(ctx, cfg, &ctx->extra_cfg);
+
+ if (res == AOM_CODEC_OK) {
+ ctx->cfg = *cfg;
+ set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+ // On profile change, request a key frame
+ force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile;
+ av1_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF;
+
+ return res;
+}
+
+static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) {
+ return av1_get_global_headers(ctx->cpi);
+}
+
+static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const arg = va_arg(args, int *);
+ if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+ *arg = av1_get_quantizer(ctx->cpi);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const arg = va_arg(args, int *);
+ if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+ *arg = av1_qindex_to_quantizer(av1_get_quantizer(ctx->cpi));
+ return AOM_CODEC_OK;
+}
+
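+// update_extra_cfg() re-validates the merged configuration and pushes it to
+// the running encoder via av1_change_config(); every ctrl_set_* handler below
+// copies the current extra_cfg, overwrites one field from its va_list
+// argument, and then calls it.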
+static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
+ const struct av1_extracfg *extra_cfg) {
+ const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg);
+ if (res == AOM_CODEC_OK) {
+ ctx->extra_cfg = *extra_cfg;
+ set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+ av1_change_config(ctx->cpi, &ctx->oxcf);
+ }
+ return res;
+}
+
+static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.cpu_used = CAST(AOME_SET_CPUUSED, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_auto_alt_ref = CAST(AOME_SET_ENABLEAUTOALTREF, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.noise_sensitivity = CAST(AV1E_SET_NOISE_SENSITIVITY, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_sharpness(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.sharpness = CAST(AOME_SET_SHARPNESS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.static_thresh = CAST(AOME_SET_STATIC_THRESHOLD, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.row_mt = CAST(AV1E_SET_ROW_MT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tile_rows = CAST(AV1E_SET_TILE_ROWS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.arnr_max_frames = CAST(AOME_SET_ARNR_MAXFRAMES, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_arnr_strength(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.arnr_strength = CAST(AOME_SET_ARNR_STRENGTH, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tuning(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tuning = CAST(AOME_SET_TUNING, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_cq_level(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.cq_level = CAST(AOME_SET_CQ_LEVEL, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.rc_max_intra_bitrate_pct =
+ CAST(AOME_SET_MAX_INTRA_BITRATE_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.rc_max_inter_bitrate_pct =
+ CAST(AOME_SET_MAX_INTER_BITRATE_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.gf_cbr_boost_pct = CAST(AV1E_SET_GF_CBR_BOOST_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_lossless(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.lossless = CAST(AV1E_SET_LOSSLESS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_qm(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.qm_y = CAST(AV1E_SET_QM_Y, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+static aom_codec_err_t ctrl_set_qm_u(aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.qm_u = CAST(AV1E_SET_QM_U, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+static aom_codec_err_t ctrl_set_qm_v(aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.qm_v = CAST(AV1E_SET_QM_V, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+static aom_codec_err_t ctrl_set_qm_min(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.qm_min = CAST(AV1E_SET_QM_MIN, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_qm_max(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#if CONFIG_DIST_8X8
+static aom_codec_err_t ctrl_set_enable_dist_8x8(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_dist_8x8 = CAST(AV1E_SET_ENABLE_DIST_8X8, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+static aom_codec_err_t ctrl_set_num_tg(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.num_tg = CAST(AV1E_SET_NUM_TG, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.mtu_size = CAST(AV1E_SET_MTU, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_df(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.use_dual_filter = CAST(AV1E_SET_ENABLE_DF, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_jnt_comp(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_jnt_comp = CAST(AV1E_SET_ENABLE_JNT_COMP, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_s_frame_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.frame_parallel_decoding_mode =
+ CAST(AV1E_SET_FRAME_PARALLEL_DECODING, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_single_tile_decoding(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.aq_mode = CAST(AV1E_SET_AQ_MODE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_film_grain_test_vector(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.film_grain_test_vector =
+ CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.film_grain_table_filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+#if CONFIG_DENOISE
+static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.noise_level =
+ ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f;
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif
+
+static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.min_gf_interval = CAST(AV1E_SET_MIN_GF_INTERVAL, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.max_gf_interval = CAST(AV1E_SET_MAX_GF_INTERVAL, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.frame_periodic_boost = CAST(AV1E_SET_FRAME_PERIODIC_BOOST, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.motion_vector_unit_test =
+ CAST(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
+ aom_codec_priv_enc_mr_cfg_t *data) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+ (void)data;
+
+ if (ctx->priv == NULL) {
+ aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv));
+ if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+ ctx->priv = (aom_codec_priv_t *)priv;
+ ctx->priv->init_flags = ctx->init_flags;
+ ctx->priv->enc.total_encoders = 1;
+ priv->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+ if (priv->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+#endif
+
+ if (ctx->config.enc) {
+ // Update the reference to the config structure to an internal copy.
+ priv->cfg = *ctx->config.enc;
+ ctx->config.enc = &priv->cfg;
+ }
+
+ priv->extra_cfg = default_extra_cfg;
+ aom_once(av1_initialize_enc);
+
+ res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
+
+ if (res == AOM_CODEC_OK) {
+ set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+ priv->oxcf.use_highbitdepth =
+ (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+ priv->cpi = av1_create_compressor(&priv->oxcf, priv->buffer_pool);
+ if (priv->cpi == NULL)
+ res = AOM_CODEC_MEM_ERROR;
+ else
+ priv->cpi->output_pkt_list = &priv->pkt_list.head;
+ }
+ }
+
+ return res;
+}
+
+static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
+ free(ctx->cx_data);
+ av1_remove_compressor(ctx->cpi);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+ aom_free(ctx->buffer_pool);
+ aom_free(ctx);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
+ unsigned int lib_flags) {
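+  // The raw library frame flags are carried in the upper 16 bits of the
+  // returned flag word; the public AOM_FRAME_* bits set below occupy the
+  // lower half.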
+ aom_codec_frame_flags_t flags = lib_flags << 16;
+
+ if (lib_flags & FRAMEFLAGS_KEY) flags |= AOM_FRAME_IS_KEY;
+
+ if (cpi->droppable) flags |= AOM_FRAME_IS_DROPPABLE;
+
+ return flags;
+}
+
+static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
+ const aom_image_t *img,
+ aom_codec_pts_t pts,
+ unsigned long duration,
+ aom_enc_frame_flags_t enc_flags) {
+ const size_t kMinCompressedSize = 8192;
+ volatile aom_codec_err_t res = AOM_CODEC_OK;
+ AV1_COMP *const cpi = ctx->cpi;
+ const aom_rational_t *const timebase = &ctx->cfg.g_timebase;
+
+ if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+
+ if (img != NULL) {
+ res = validate_img(ctx, img);
+    // TODO(jzern): the checks related to cpi's validity should be treated as
+    // a failure condition; encoder setup is currently done fully in init().
+ if (res == AOM_CODEC_OK) {
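+      // Size the compressed-data buffer generously: the padded frame area
+      // times the format's bits per pixel, with kMinCompressedSize as a
+      // floor for very small frames.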
+ size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+ ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img);
+ if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
+ if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+ ctx->cx_data_sz = data_sz;
+ free(ctx->cx_data);
+ ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+ if (ctx->cx_data == NULL) {
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+ }
+ }
+
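+  // Only GOOD quality mode is used by this encoder; if the current config
+  // says otherwise, switch and push the change into the compressor.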
+ if (ctx->oxcf.mode != GOOD) {
+ ctx->oxcf.mode = GOOD;
+ av1_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ aom_codec_pkt_list_init(&ctx->pkt_list);
+
+ volatile aom_enc_frame_flags_t flags = enc_flags;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cpi->common.error.jmp)) {
+ cpi->common.error.setjmp = 0;
+ res = update_error_state(ctx, &cpi->common.error);
+ aom_clear_system_state();
+ return res;
+ }
+ cpi->common.error.setjmp = 1;
+
+  // Note(yunqing): When applying encoding flags, always start by enabling
+  // everything and then restrict according to the given flags; the previous
+  // frame's flags are overwritten.
+ av1_apply_encoding_flags(cpi, flags);
+
+ // Handle fixed keyframe intervals
+ if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
+ ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+ if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+ flags |= AOM_EFLAG_FORCE_KF;
+ ctx->fixed_kf_cntr = 1;
+ }
+ }
+
+ if (res == AOM_CODEC_OK) {
+ int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
+ int64_t dst_end_time_stamp =
+ timebase_units_to_ticks(timebase, pts + duration);
+
+ // Set up internal flags
+ if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
+
+ if (img != NULL) {
+ YV12_BUFFER_CONFIG sd;
+ res = image2yuvconfig(img, &sd);
+
+      // Store the original flags into the frame buffer. The key frame flag
+      // will be extracted when this frame is actually encoded.
+ if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
+ dst_time_stamp, dst_end_time_stamp)) {
+ res = update_error_state(ctx, &cpi->common.error);
+ }
+ ctx->next_frame_flags = 0;
+ }
+
+ unsigned char *cx_data = ctx->cx_data;
+ size_t cx_data_sz = ctx->cx_data_sz;
+
+ /* Any pending invisible frames? */
+ if (ctx->pending_cx_data) {
+ memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
+ ctx->pending_cx_data = cx_data;
+ cx_data += ctx->pending_cx_data_sz;
+ cx_data_sz -= ctx->pending_cx_data_sz;
+
+      /* TODO: this is a minimal check; the underlying codec doesn't respect
+       * the buffer size anyway.
+       */
+ if (cx_data_sz < ctx->cx_data_sz / 2) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+ "Compressed data buffer too small");
+ }
+ }
+
+ size_t frame_size = 0;
+ unsigned int lib_flags = 0;
+ int is_frame_visible = 0;
+ int index_size = 0;
+ // invisible frames get packed with the next visible frame
+ while (cx_data_sz - index_size >= ctx->cx_data_sz / 2 &&
+ !is_frame_visible &&
+ -1 != av1_get_compressed_data(cpi, &lib_flags, &frame_size, cx_data,
+ &dst_time_stamp, &dst_end_time_stamp,
+ !img, timebase)) {
+ if (cpi->common.seq_params.frame_id_numbers_present_flag) {
+ if (cpi->common.invalid_delta_frame_id_minus_1) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+ "Invalid delta_frame_id_minus_1");
+ }
+ }
+ cpi->seq_params_locked = 1;
+ if (frame_size) {
+ if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
+
+ const int write_temporal_delimiter =
+ !cpi->common.spatial_layer_id && !ctx->pending_frame_count;
+
+ if (write_temporal_delimiter) {
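+          // Prepend a temporal delimiter OBU: shift the frame data already in
+          // the pending buffer to make room for the one-byte TD header plus
+          // its leb128 size field, then write both at the front.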
+ uint32_t obu_header_size = 1;
+ const uint32_t obu_payload_size = 0;
+ const size_t length_field_size =
+ aom_uleb_size_in_bytes(obu_payload_size);
+
+ if (ctx->pending_cx_data) {
+ const size_t move_offset = length_field_size + 1;
+ memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+ frame_size);
+ }
+ const uint32_t obu_header_offset = 0;
+ obu_header_size = write_obu_header(
+ OBU_TEMPORAL_DELIMITER, 0,
+ (uint8_t *)(ctx->pending_cx_data + obu_header_offset));
+
+          // The OBU payload size is coded as an unsigned leb128 integer and
+          // written directly after the OBU header.
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size,
+ ctx->pending_cx_data) != AOM_CODEC_OK) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+ }
+
+ frame_size += obu_header_size + obu_payload_size + length_field_size;
+ }
+
+ if (ctx->oxcf.save_as_annexb) {
+ size_t curr_frame_size = frame_size;
+ if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+ }
+ frame_size = curr_frame_size;
+
+ // B_PRIME (add frame size)
+ const size_t length_field_size = aom_uleb_size_in_bytes(frame_size);
+ if (ctx->pending_cx_data) {
+ const size_t move_offset = length_field_size;
+ memmove(cx_data + move_offset, cx_data, frame_size);
+ }
+ if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+ }
+ frame_size += length_field_size;
+ }
+
+ ctx->pending_frame_sizes[ctx->pending_frame_count++] = frame_size;
+ ctx->pending_cx_data_sz += frame_size;
+
+ cx_data += frame_size;
+ cx_data_sz -= frame_size;
+
+ index_size = MAG_SIZE * (ctx->pending_frame_count - 1) + 2;
+
+ is_frame_visible = cpi->common.show_frame;
+ }
+ }
+ if (is_frame_visible) {
+ // Add the frame packet to the list of returned packets.
+ aom_codec_cx_pkt_t pkt;
+
+ if (ctx->oxcf.save_as_annexb) {
+ // B_PRIME (add TU size)
+ size_t tu_size = ctx->pending_cx_data_sz;
+ const size_t length_field_size = aom_uleb_size_in_bytes(tu_size);
+ if (ctx->pending_cx_data) {
+ const size_t move_offset = length_field_size;
+ memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+ tu_size);
+ }
+ if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+ }
+ ctx->pending_cx_data_sz += length_field_size;
+ }
+
+ pkt.kind = AOM_CODEC_CX_FRAME_PKT;
+
+ pkt.data.frame.buf = ctx->pending_cx_data;
+ pkt.data.frame.sz = ctx->pending_cx_data_sz;
+ pkt.data.frame.partition_id = -1;
+ pkt.data.frame.vis_frame_size = frame_size;
+
+ pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp);
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+ pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units(
+ timebase, dst_end_time_stamp - dst_time_stamp);
+
+ aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+ ctx->pending_cx_data = NULL;
+ ctx->pending_cx_data_sz = 0;
+ ctx->pending_frame_count = 0;
+ }
+ }
+
+ cpi->common.error.setjmp = 0;
+ return res;
+}
+
+static const aom_codec_cx_pkt_t *encoder_get_cxdata(aom_codec_alg_priv_t *ctx,
+ aom_codec_iter_t *iter) {
+ return aom_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
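+
+// Caller-side sketch (an assumption based on the standard public libaom
+// encode API; nothing below is defined in this file). encoder_encode() and
+// encoder_get_cxdata() above are typically driven as:
+//   aom_codec_encode(&codec, img, pts, duration, 0);
+//   aom_codec_iter_t iter = NULL;
+//   const aom_codec_cx_pkt_t *pkt;
+//   while ((pkt = aom_codec_get_cx_data(&codec, &iter)) != NULL) {
+//     if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+//       /* write pkt->data.frame.buf, pkt->data.frame.sz bytes */
+//     }
+//   }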
+
+static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+ if (frame != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ av1_set_reference_enc(ctx->cpi, frame->idx, &sd);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+ if (frame != NULL) {
+ YV12_BUFFER_CONFIG sd;
+
+ image2yuvconfig(&frame->img, &sd);
+ av1_copy_reference_enc(ctx->cpi, frame->idx, &sd);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+ if (frame != NULL) {
+ YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx);
+ if (fb == NULL) return AOM_CODEC_ERROR;
+
+ yuvconfig2image(&frame->img, fb, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+ if (new_img != NULL) {
+ YV12_BUFFER_CONFIG new_frame;
+
+ if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+ yuvconfig2image(new_img, &new_frame, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+ if (new_img != NULL) {
+ YV12_BUFFER_CONFIG new_frame;
+
+ if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+ YV12_BUFFER_CONFIG sd;
+ image2yuvconfig(new_img, &sd);
+ return av1_copy_new_frame_enc(&ctx->cpi->common, &new_frame, &sd);
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_set_previewpp(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+}
+
+static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) {
+ YV12_BUFFER_CONFIG sd;
+
+ if (av1_get_preview_raw_frame(ctx->cpi, &sd) == 0) {
+ yuvconfig2image(&ctx->preview_img, &sd, NULL);
+ return &ctx->preview_img;
+ } else {
+ return NULL;
+ }
+}
+
+static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int reference_flag = va_arg(args, int);
+
+ av1_use_as_reference(ctx->cpi, reference_flag);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_roi_map(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ (void)ctx;
+ (void)args;
+
+ // TODO(yaowu): Need to re-implement and test for AV1.
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_set_active_map(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+
+ if (map) {
+ if (!av1_set_active_map(ctx->cpi, map->active_map, (int)map->rows,
+ (int)map->cols))
+ return AOM_CODEC_OK;
+ else
+ return AOM_CODEC_INVALID_PARAM;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_active_map(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+
+ if (map) {
+ if (!av1_get_active_map(ctx->cpi, map->active_map, (int)map->rows,
+ (int)map->cols))
+ return AOM_CODEC_OK;
+ else
+ return AOM_CODEC_INVALID_PARAM;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);
+
+ if (mode) {
+ const int res =
+ av1_set_internal_size(ctx->cpi, (AOM_SCALING)mode->h_scaling_mode,
+ (AOM_SCALING)mode->v_scaling_mode);
+ return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int spatial_layer_id = va_arg(args, int);
+ if (spatial_layer_id > MAX_NUM_ENHANCEMENT_LAYERS)
+ return AOM_CODEC_INVALID_PARAM;
+ ctx->cpi->common.spatial_layer_id = spatial_layer_id;
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int number_spatial_layers = va_arg(args, int);
+ if (number_spatial_layers > MAX_NUM_ENHANCEMENT_LAYERS)
+ return AOM_CODEC_INVALID_PARAM;
+ ctx->cpi->common.number_spatial_layers = number_spatial_layers;
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.content = CAST(AV1E_SET_TUNE_CONTENT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_cdf_update_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_color_primaries(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_transfer_characteristics(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.transfer_characteristics =
+ CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_matrix_coefficients(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_sample_position(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.chroma_sample_position =
+ CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.color_range = CAST(AV1E_SET_COLOR_RANGE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_render_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ int *const render_size = va_arg(args, int *);
+ extra_cfg.render_width = render_size[0];
+ extra_cfg.render_height = render_size[1];
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
+ { AV1_COPY_REFERENCE, ctrl_copy_reference },
+ { AOME_USE_REFERENCE, ctrl_use_reference },
+
+ // Setters
+ { AV1_SET_REFERENCE, ctrl_set_reference },
+ { AOM_SET_POSTPROC, ctrl_set_previewpp },
+ { AOME_SET_ROI_MAP, ctrl_set_roi_map },
+ { AOME_SET_ACTIVEMAP, ctrl_set_active_map },
+ { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
+ { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
+ { AOME_SET_CPUUSED, ctrl_set_cpuused },
+ { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
+ { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
+ { AOME_SET_SHARPNESS, ctrl_set_sharpness },
+ { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+ { AV1E_SET_ROW_MT, ctrl_set_row_mt },
+ { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
+ { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
+ { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
+ { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
+ { AOME_SET_TUNING, ctrl_set_tuning },
+ { AOME_SET_CQ_LEVEL, ctrl_set_cq_level },
+ { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct },
+ { AOME_SET_NUMBER_SPATIAL_LAYERS, ctrl_set_number_spatial_layers },
+ { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct },
+ { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+ { AV1E_SET_LOSSLESS, ctrl_set_lossless },
+ { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
+ { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+ { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
+ { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
+ { AV1E_SET_QM_Y, ctrl_set_qm_y },
+ { AV1E_SET_QM_U, ctrl_set_qm_u },
+ { AV1E_SET_QM_V, ctrl_set_qm_v },
+ { AV1E_SET_QM_MIN, ctrl_set_qm_min },
+ { AV1E_SET_QM_MAX, ctrl_set_qm_max },
+#if CONFIG_DIST_8X8
+ { AV1E_SET_ENABLE_DIST_8X8, ctrl_set_enable_dist_8x8 },
+#endif
+ { AV1E_SET_NUM_TG, ctrl_set_num_tg },
+ { AV1E_SET_MTU, ctrl_set_mtu },
+ { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type },
+ { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
+ { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode },
+ { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode },
+ { AV1E_SET_ENABLE_DF, ctrl_set_enable_df },
+ { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
+ { AV1E_SET_ENABLE_JNT_COMP, ctrl_set_enable_jnt_comp },
+ { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
+ { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
+ { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion },
+ { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion },
+ { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
+ { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
+ { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
+ { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
+ { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
+ { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode },
+ { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries },
+ { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics },
+ { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients },
+ { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position },
+ { AV1E_SET_COLOR_RANGE, ctrl_set_color_range },
+ { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
+ { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
+ { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+ { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
+ { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
+ { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
+ { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
+ { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
+#if CONFIG_DENOISE
+ { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
+ { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+#endif  // CONFIG_DENOISE
+ { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+
+ // Getters
+ { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
+ { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+ { AV1_GET_REFERENCE, ctrl_get_reference },
+ { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+ { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
+ { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
+ { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
+ { -1, NULL },
+};
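+
+// Usage sketch (an assumption based on the standard libaom control API; not
+// part of this file). Applications reach the handlers mapped above through
+// aom_codec_control(), e.g.:
+//   aom_codec_control(&codec, AOME_SET_CPUUSED, 4);       // -> ctrl_set_cpuused
+//   aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 2);  // -> ctrl_set_tile_columns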
+
+static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
+ { 0,
+ {
+ // NOLINT
+ 0, // g_usage
+ 0, // g_threads
+ 0, // g_profile
+
+ 320, // g_width
+ 240, // g_height
+ 0, // g_limit
+ 0, // g_forced_max_frame_width
+ 0, // g_forced_max_frame_height
+ AOM_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
+ { 1, 30 }, // g_timebase
+
+ 0, // g_error_resilient
+
+ AOM_RC_ONE_PASS, // g_pass
+
+ 19, // g_lag_in_frames
+
+ 0, // rc_dropframe_thresh
+ RESIZE_NONE, // rc_resize_mode
+ SCALE_NUMERATOR, // rc_resize_denominator
+ SCALE_NUMERATOR, // rc_resize_kf_denominator
+
+ 0, // rc_superres_mode
+ SCALE_NUMERATOR, // rc_superres_denominator
+ SCALE_NUMERATOR, // rc_superres_kf_denominator
+ 63, // rc_superres_qthresh
+ 63, // rc_superres_kf_qthresh
+
+ AOM_VBR, // rc_end_usage
+ { NULL, 0 }, // rc_twopass_stats_in
+ { NULL, 0 }, // rc_firstpass_mb_stats_in
+ 256, // rc_target_bandwidth
+ 0, // rc_min_quantizer
+ 63, // rc_max_quantizer
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
+
+ 6000, // rc_max_buffer_size
+ 4000, // rc_buffer_initial_size
+ 5000, // rc_buffer_optimal_size
+
+ 50, // rc_two_pass_vbrbias
+ 0, // rc_two_pass_vbrmin_section
+ 2000, // rc_two_pass_vbrmax_section
+
+ // keyframing settings (kf)
+ 0, // fwd_kf_enabled
+ AOM_KF_AUTO, // g_kfmode
+ 0, // kf_min_dist
+ 9999, // kf_max_dist
+ 0, // sframe_dist
+ 1, // sframe_mode
+ 0, // large_scale_tile
+ 0, // monochrome
+ 0, // full_still_picture_hdr
+ 0, // save_as_annexb
+ 0, // tile_width_count
+ 0, // tile_height_count
+ { 0 }, // tile_widths
+ { 0 }, // tile_heights
+ { 1 }, // config file
+ } },
+};
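+
+// Note: this single usage-0 entry supplies the defaults handed back by
+// aom_codec_enc_config_default() for this interface (per the standard libaom
+// configuration API).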
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(aom_codec_av1_cx) = {
+ "AOMedia Project AV1 Encoder" VERSION_STRING,
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+ AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER |
+ AOM_CODEC_CAP_PSNR, // aom_codec_caps_t
+ encoder_init, // aom_codec_init_fn_t
+ encoder_destroy, // aom_codec_destroy_fn_t
+ encoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {
+ // NOLINT
+ NULL, // aom_codec_peek_si_fn_t
+ NULL, // aom_codec_get_si_fn_t
+ NULL, // aom_codec_decode_fn_t
+ NULL, // aom_codec_get_frame_fn_t
+ NULL // aom_codec_set_fb_fn_t
+ },
+ {
+ // NOLINT
+ 1, // 1 cfg map
+ encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t
+ encoder_encode, // aom_codec_encode_fn_t
+ encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
+ encoder_set_config, // aom_codec_enc_config_set_fn_t
+ encoder_get_global_headers, // aom_codec_get_global_headers_fn_t
+ encoder_get_preview, // aom_codec_get_preview_frame_fn_t
+ NULL // aom_codec_enc_mr_get_mem_loc_fn_t
+ }
+};
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
new file mode 100644
index 000000000..4a6631047
--- /dev/null
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -0,0 +1,1328 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom/aomdx.h"
+#include "aom/aom_decoder.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/enums.h"
+#include "av1/common/obu_util.h"
+
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
+
+#include "av1/av1_iface_common.h"
+
+struct aom_codec_alg_priv {
+ aom_codec_priv_t base;
+ aom_codec_dec_cfg_t cfg;
+ aom_codec_stream_info_t si;
+ int postproc_cfg_set;
+ aom_postproc_cfg_t postproc_cfg;
+ aom_image_t img;
+ int img_avail;
+ int flushed;
+ int invert_tile_order;
+ int last_show_frame; // Index of last output frame.
+ int byte_alignment;
+ int skip_loop_filter;
+ int skip_film_grain;
+ int decode_tile_row;
+ int decode_tile_col;
+ unsigned int tile_mode;
+ unsigned int ext_tile_debug;
+ unsigned int row_mt;
+ EXTERNAL_REFERENCES ext_refs;
+ unsigned int is_annexb;
+ int operating_point;
+ int output_all_layers;
+
+ AVxWorker *frame_workers;
+ int num_frame_workers;
+ int next_submit_worker_id;
+ int last_submit_worker_id;
+ int next_output_worker_id;
+ int available_threads;
+ aom_image_t *image_with_grain[MAX_NUM_SPATIAL_LAYERS];
+ int need_resync; // wait for key/intra-only frame
+ // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+ BufferPool *buffer_pool;
+
+ // External frame buffer info to save for AV1 common.
+ void *ext_priv; // Private data associated with the external frame buffers.
+ aom_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+#if CONFIG_INSPECTION
+ aom_inspect_cb inspect_cb;
+ void *inspect_ctx;
+#endif
+};
+
+static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
+ aom_codec_priv_enc_mr_cfg_t *data) {
+ // This function only allocates space for the aom_codec_alg_priv_t
+ // structure. More memory may be required at the time the stream
+ // information becomes known.
+ (void)data;
+
+ if (!ctx->priv) {
+ aom_codec_alg_priv_t *const priv =
+ (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv));
+ if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+ ctx->priv = (aom_codec_priv_t *)priv;
+ ctx->priv->init_flags = ctx->init_flags;
+ priv->flushed = 0;
+
+ // TODO(tdaede): this should not be exposed to the API
+ priv->cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH;
+ if (ctx->config.dec) {
+ priv->cfg = *ctx->config.dec;
+ ctx->config.dec = &priv->cfg;
+ // default values
+ priv->cfg.cfg.ext_partition = 1;
+ }
+ av1_zero(priv->image_with_grain);
+ // Turn row_mt on by default.
+ priv->row_mt = 1;
+
+ // Turn on normal tile coding mode by default.
+ // 0 is for normal tile coding mode, and 1 is for large scale tile coding
+    // mode (refer to the lightfield example).
+ priv->tile_mode = 0;
+ priv->decode_tile_row = -1;
+ priv->decode_tile_col = -1;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
+ if (ctx->frame_workers != NULL) {
+ int i;
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ aom_get_worker_interface()->end(worker);
+ aom_free(frame_worker_data->pbi->common.tpl_mvs);
+ frame_worker_data->pbi->common.tpl_mvs = NULL;
+ av1_remove_common(&frame_worker_data->pbi->common);
+ av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+ av1_decoder_remove(frame_worker_data->pbi);
+ aom_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+ pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+ aom_free(frame_worker_data);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+ }
+
+ if (ctx->buffer_pool) {
+ av1_free_ref_frame_buffers(ctx->buffer_pool);
+ av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+ }
+
+ aom_free(ctx->frame_workers);
+ aom_free(ctx->buffer_pool);
+ for (int i = 0; i < MAX_NUM_SPATIAL_LAYERS; i++) {
+ if (ctx->image_with_grain[i]) aom_img_free(ctx->image_with_grain[i]);
+ }
+ aom_free(ctx);
+ return AOM_CODEC_OK;
+}
+
+// Parses the operating points (including operating_point_idc, seq_level_idx,
+// and seq_tier) and then sets si->number_spatial_layers and
+// si->number_temporal_layers based on operating_point_idc[0].
+static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
+ int is_reduced_header,
+ aom_codec_stream_info_t *si) {
+ int operating_point_idc0 = 0;
+
+ if (is_reduced_header) {
+ aom_rb_read_literal(rb, LEVEL_BITS); // level
+ } else {
+ const uint8_t operating_points_cnt_minus_1 =
+ aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+ for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
+ int operating_point_idc;
+ operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+ if (i == 0) operating_point_idc0 = operating_point_idc;
+ int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); // level
+ if (seq_level_idx > 7) aom_rb_read_bit(rb); // tier
+ }
+ }
+
+ if (aom_get_num_layers_from_operating_point_idc(
+ operating_point_idc0, &si->number_spatial_layers,
+ &si->number_temporal_layers) != AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
+ size_t data_sz,
+ aom_codec_stream_info_t *si,
+ int *is_intra_only) {
+ int intra_only_flag = 0;
+ int got_sequence_header = 0;
+ int found_keyframe = 0;
+
+ if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM;
+
+ si->w = 0;
+ si->h = 0;
+ si->is_kf = 0; // is_kf indicates whether the current packet contains a RAP
+
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+ size_t payload_size = 0;
+ size_t bytes_read = 0;
+ int reduced_still_picture_hdr = 0;
+ aom_codec_err_t status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+
+ // If the first OBU is a temporal delimiter, skip over it and look at the next
+ // OBU in the bitstream
+ if (obu_header.type == OBU_TEMPORAL_DELIMITER) {
+ // Skip any associated payload (there shouldn't be one, but just in case)
+ if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME;
+ data += bytes_read + payload_size;
+ data_sz -= bytes_read + payload_size;
+
+ status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+ }
+ while (1) {
+ data += bytes_read;
+ data_sz -= bytes_read;
+ if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
+ // Check that the selected OBU is a sequence header
+ if (obu_header.type == OBU_SEQUENCE_HEADER) {
+ // Sanity check on sequence header size
+ if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME;
+ // Read a few values from the sequence header payload
+ struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+
+ av1_read_profile(&rb); // profile
+ const int still_picture = aom_rb_read_bit(&rb);
+ reduced_still_picture_hdr = aom_rb_read_bit(&rb);
+
+ if (!still_picture && reduced_still_picture_hdr) {
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+
+ if (parse_operating_points(&rb, reduced_still_picture_hdr, si) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
+ int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
+ int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1;
+ int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1;
+ si->w = max_frame_width;
+ si->h = max_frame_height;
+ got_sequence_header = 1;
+ } else if (obu_header.type == OBU_FRAME_HEADER ||
+ obu_header.type == OBU_FRAME) {
+ if (got_sequence_header && reduced_still_picture_hdr) {
+ found_keyframe = 1;
+ break;
+ } else {
+ // make sure we have enough bits to get the frame type out
+ if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME;
+ struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+ const int show_existing_frame = aom_rb_read_bit(&rb);
+ if (!show_existing_frame) {
+ const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2);
+ if (frame_type == KEY_FRAME) {
+ found_keyframe = 1;
+ break; // Stop here as no further OBUs will change the outcome.
+ }
+ }
+ }
+ }
+    // skip past any unread OBU payload data
+ data += payload_size;
+ data_sz -= payload_size;
+ if (data_sz == 0) break; // exit if we're out of OBUs
+ status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+ }
+ if (got_sequence_header && found_keyframe) si->is_kf = 1;
+ if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz,
+ aom_codec_stream_info_t *si) {
+ return decoder_peek_si_internal(data, data_sz, si, NULL);
+}
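+
+// Usage sketch (an assumption based on the standard libaom API; not defined
+// here): applications normally reach decoder_peek_si() via
+//   aom_codec_stream_info_t si = { 0 };
+//   aom_codec_peek_stream_info(aom_codec_av1_dx(), data, data_sz, &si);
+// to learn the frame size and keyframe status before decoding.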
+
+static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx,
+ aom_codec_stream_info_t *si) {
+ memcpy(si, &ctx->si, sizeof(*si));
+
+ return AOM_CODEC_OK;
+}
+
+static void set_error_detail(aom_codec_alg_priv_t *ctx,
+ const char *const error) {
+ ctx->base.err_detail = error;
+}
+
+static aom_codec_err_t update_error_state(
+ aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+ if (error->error_code)
+ set_error_detail(ctx, error->has_detail ? error->detail : NULL);
+
+ return error->error_code;
+}
+
+static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
+ int i;
+
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+
+ cm->new_fb_idx = INVALID_IDX;
+ cm->byte_alignment = ctx->byte_alignment;
+ cm->skip_loop_filter = ctx->skip_loop_filter;
+ cm->skip_film_grain = ctx->skip_film_grain;
+
+ if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+ pool->get_fb_cb = ctx->get_ext_fb_cb;
+ pool->release_fb_cb = ctx->release_ext_fb_cb;
+ pool->cb_priv = ctx->ext_priv;
+ } else {
+ pool->get_fb_cb = av1_get_frame_buffer;
+ pool->release_fb_cb = av1_release_frame_buffer;
+
+ if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to initialize internal frame buffers");
+
+ pool->cb_priv = &pool->int_frame_buffers;
+ }
+ }
+}
+
+static void set_default_ppflags(aom_postproc_cfg_t *cfg) {
+ cfg->post_proc_flag = AOM_DEBLOCK | AOM_DEMACROBLOCK;
+ cfg->deblocking_level = 4;
+ cfg->noise_level = 0;
+}
+
+static int frame_worker_hook(void *arg1, void *arg2) {
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
+ const uint8_t *data = frame_worker_data->data;
+ (void)arg2;
+
+ int result = av1_receive_compressed_data(frame_worker_data->pbi,
+ frame_worker_data->data_size, &data);
+ frame_worker_data->data_end = data;
+
+ if (result != 0) {
+ // Check decode result in serial decode.
+ frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+ frame_worker_data->pbi->need_resync = 1;
+ }
+ return !result;
+}
+
+static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
+ int i;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ ctx->last_show_frame = -1;
+ ctx->next_submit_worker_id = 0;
+ ctx->last_submit_worker_id = 0;
+ ctx->next_output_worker_id = 0;
+ ctx->need_resync = 1;
+ ctx->num_frame_workers = 1;
+ if (ctx->num_frame_workers > MAX_DECODE_THREADS)
+ ctx->num_frame_workers = MAX_DECODE_THREADS;
+ ctx->available_threads = ctx->num_frame_workers;
+ ctx->flushed = 0;
+
+ ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+ if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+ return AOM_CODEC_MEM_ERROR;
+ }
+#endif
+
+ ctx->frame_workers = (AVxWorker *)aom_malloc(ctx->num_frame_workers *
+ sizeof(*ctx->frame_workers));
+ if (ctx->frame_workers == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_workers");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *frame_worker_data = NULL;
+ winterface->init(worker);
+ worker->data1 = aom_memalign(32, sizeof(FrameWorkerData));
+ if (worker->data1 == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool);
+ if (frame_worker_data->pbi == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ frame_worker_data->pbi->common.options = &ctx->cfg.cfg;
+ frame_worker_data->pbi->frame_worker_owner = worker;
+ frame_worker_data->worker_id = i;
+ frame_worker_data->scratch_buffer = NULL;
+ frame_worker_data->scratch_buffer_size = 0;
+ frame_worker_data->frame_context_ready = 0;
+ frame_worker_data->received_frame = 0;
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+ return AOM_CODEC_MEM_ERROR;
+ }
+#endif
+ frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth;
+
+    // When decoding in serial mode, the FrameWorker thread may create tile
+    // worker or loopfilter threads.
+ frame_worker_data->pbi->max_threads = ctx->cfg.threads;
+ frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+ frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode;
+ frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
+ frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+ frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+ frame_worker_data->pbi->operating_point = ctx->operating_point;
+ frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
+ frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+ frame_worker_data->pbi->row_mt = ctx->row_mt;
+
+ worker->hook = frame_worker_hook;
+ if (!winterface->reset(worker)) {
+ set_error_detail(ctx, "Frame Worker thread creation failed");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+
+ // If postprocessing was enabled by the application and a
+ // configuration has not been provided, default it.
+ if (!ctx->postproc_cfg_set && (ctx->base.init_flags & AOM_CODEC_USE_POSTPROC))
+ set_default_ppflags(&ctx->postproc_cfg);
+
+ init_buffer_callbacks(ctx);
+
+ return AOM_CODEC_OK;
+}
+
+static INLINE void check_resync(aom_codec_alg_priv_t *const ctx,
+ const AV1Decoder *const pbi) {
+ // Clear resync flag if worker got a key frame or intra only frame.
+ if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
+ (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME))
+ ctx->need_resync = 0;
+}
+
+static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
+ const uint8_t **data, size_t data_sz,
+ void *user_priv) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ // Determine the stream parameters. Note that we rely on peek_si to
+ // validate that we have a buffer that does not wrap around the top
+ // of the heap.
+ if (!ctx->si.h) {
+ int is_intra_only = 0;
+ ctx->si.is_annexb = ctx->is_annexb;
+ const aom_codec_err_t res =
+ decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only);
+ if (res != AOM_CODEC_OK) return res;
+
+ if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR;
+ }
+
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->data = *data;
+ frame_worker_data->data_size = data_sz;
+ frame_worker_data->user_priv = user_priv;
+ frame_worker_data->received_frame = 1;
+
+#if CONFIG_INSPECTION
+ frame_worker_data->pbi->inspect_cb = ctx->inspect_cb;
+ frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx;
+#endif
+
+ frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode;
+ frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+ frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+ frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+ frame_worker_data->pbi->row_mt = ctx->row_mt;
+ frame_worker_data->pbi->ext_refs = ctx->ext_refs;
+
+ frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
+
+ worker->had_error = 0;
+ winterface->execute(worker);
+
+ // Update data pointer after decode.
+ *data = frame_worker_data->data_end;
+
+ if (worker->had_error)
+ return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+
+ check_resync(ctx, frame_worker_data->pbi);
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
+ const uint8_t *data, size_t data_sz,
+ void *user_priv) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+
+ // Release any pending output frames from the previous decoder_decode call.
+ // We need to do this even if the decoder is being flushed or the input
+ // arguments are invalid.
+ if (ctx->frame_workers) {
+ BufferPool *const pool = ctx->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+ lock_buffer_pool(pool);
+ for (int i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ struct AV1Decoder *pbi = frame_worker_data->pbi;
+ for (size_t j = 0; j < pbi->num_output_frames; j++) {
+ decrease_ref_count((int)pbi->output_frame_index[j], frame_bufs, pool);
+ }
+ pbi->num_output_frames = 0;
+ }
+ unlock_buffer_pool(ctx->buffer_pool);
+ }
+
+ /* Sanity checks */
+ /* NULL data ptr allowed if data_sz is 0 too */
+ if (data == NULL && data_sz == 0) {
+ ctx->flushed = 1;
+ return AOM_CODEC_OK;
+ }
+ if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
+
+ // Reset flushed when receiving a valid frame.
+ ctx->flushed = 0;
+
+ // Initialize the decoder workers on the first frame.
+ if (ctx->frame_workers == NULL) {
+ res = init_decoder(ctx);
+ if (res != AOM_CODEC_OK) return res;
+ }
+
+ const uint8_t *data_start = data;
+ const uint8_t *data_end = data + data_sz;
+
+ if (ctx->is_annexb) {
+ // read the size of this temporal unit
+ size_t length_of_size;
+ uint64_t temporal_unit_size;
+ if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size,
+ &length_of_size) != 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ data_start += length_of_size;
+ if (temporal_unit_size > (size_t)(data_end - data_start))
+ return AOM_CODEC_CORRUPT_FRAME;
+ data_end = data_start + temporal_unit_size;
+ }
+
+ // Decode in serial mode.
+ while (data_start < data_end) {
+ uint64_t frame_size;
+ if (ctx->is_annexb) {
+ // read the size of this frame unit
+ size_t length_of_size;
+ if (aom_uleb_decode(data_start, (size_t)(data_end - data_start),
+ &frame_size, &length_of_size) != 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ data_start += length_of_size;
+ if (frame_size > (size_t)(data_end - data_start))
+ return AOM_CODEC_CORRUPT_FRAME;
+ } else {
+ frame_size = (uint64_t)(data_end - data_start);
+ }
+
+ res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv);
+ if (res != AOM_CODEC_OK) return res;
+
+ // Allow extra zero bytes after the frame end
+ while (data_start < data_end) {
+ const uint8_t marker = data_start[0];
+ if (marker) break;
+ ++data_start;
+ }
+ }
+
+ return res;
+}
+
+// If grain_params->apply_grain is false, returns img. Otherwise, adds film
+// grain to img, saves the result in *grain_img_ptr (allocating *grain_img_ptr
+// if necessary), and returns *grain_img_ptr.
+static aom_image_t *add_grain_if_needed(aom_image_t *img,
+ aom_image_t **grain_img_ptr,
+ aom_film_grain_t *grain_params) {
+ if (!grain_params->apply_grain) return img;
+
+ aom_image_t *grain_img_buf = *grain_img_ptr;
+
+ const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
+ const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
+
+ if (grain_img_buf) {
+ const int alloc_w = ALIGN_POWER_OF_TWO(grain_img_buf->d_w, 1);
+ const int alloc_h = ALIGN_POWER_OF_TWO(grain_img_buf->d_h, 1);
+ if (w_even != alloc_w || h_even != alloc_h ||
+ img->fmt != grain_img_buf->fmt) {
+ aom_img_free(grain_img_buf);
+ grain_img_buf = NULL;
+ *grain_img_ptr = NULL;
+ }
+ }
+ if (!grain_img_buf) {
+ grain_img_buf = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16);
+ *grain_img_ptr = grain_img_buf;
+ }
+
+ if (grain_img_buf) {
+ grain_img_buf->user_priv = img->user_priv;
+ if (av1_add_film_grain(grain_params, img, grain_img_buf)) {
+ aom_img_free(grain_img_buf);
+ grain_img_buf = NULL;
+ *grain_img_ptr = NULL;
+ }
+ }
+
+ return grain_img_buf;
+}
+
+static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
+ aom_codec_iter_t *iter) {
+ aom_image_t *img = NULL;
+
+ if (!iter) {
+ return NULL;
+ }
+
+ // To avoid having to allocate any extra storage, treat 'iter' as
+ // simply a pointer to an integer index
+ uintptr_t *index = (uintptr_t *)iter;
+
+ if (ctx->frame_workers != NULL) {
+ do {
+ YV12_BUFFER_CONFIG *sd;
+ // NOTE(david.barker): This code does not support multiple worker threads
+ // yet. We should probably move the iteration over threads into *iter
+ // instead of using ctx->next_output_worker_id.
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ ctx->next_output_worker_id =
+ (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ // Wait for the frame from worker thread.
+ if (winterface->sync(worker)) {
+ // Check if worker has received any frames.
+ if (frame_worker_data->received_frame == 1) {
+ ++ctx->available_threads;
+ frame_worker_data->received_frame = 0;
+ check_resync(ctx, frame_worker_data->pbi);
+ }
+ aom_film_grain_t *grain_params;
+ if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
+ &grain_params) == 0) {
+ AV1Decoder *const pbi = frame_worker_data->pbi;
+ AV1_COMMON *const cm = &pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ ctx->last_show_frame = cm->new_fb_idx;
+ if (ctx->need_resync) return NULL;
+ yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
+
+ if (!pbi->ext_tile_debug && cm->large_scale_tile) {
+ *index += 1; // Advance the iterator to point to the next image
+ img = &ctx->img;
+ img->img_data = pbi->tile_list_output;
+ img->sz = pbi->tile_list_size;
+ return img;
+ }
+
+ const int num_planes = av1_num_planes(cm);
+ if (pbi->ext_tile_debug && cm->single_tile_decoding &&
+ pbi->dec_tile_row >= 0) {
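+            // Crop the output image to the requested tile row: advance the
+            // plane pointers to the tile's first row of samples and shrink
+            // the display height (the column case below is analogous).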
+ const int tile_row = AOMMIN(pbi->dec_tile_row, cm->tile_rows - 1);
+ const int mi_row = tile_row * cm->tile_height;
+ const int ssy = ctx->img.y_chroma_shift;
+ int plane;
+ ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
+ if (num_planes > 1) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] +=
+ mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+ }
+ }
+ ctx->img.d_h =
+ AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE;
+ }
+
+ if (pbi->ext_tile_debug && cm->single_tile_decoding &&
+ pbi->dec_tile_col >= 0) {
+ const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
+ const int mi_col = tile_col * cm->tile_width;
+ const int ssx = ctx->img.x_chroma_shift;
+ const int is_hbd =
+ (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+ int plane;
+ ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
+ if (num_planes > 1) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] +=
+ mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
+ }
+ }
+ ctx->img.d_w =
+ AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE;
+ }
+
+ ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ img = &ctx->img;
+ img->temporal_id = cm->temporal_layer_id;
+ img->spatial_id = cm->spatial_layer_id;
+ if (cm->skip_film_grain) grain_params->apply_grain = 0;
+ aom_image_t *res = add_grain_if_needed(
+ img, &ctx->image_with_grain[*index], grain_params);
+ if (!res) {
+ aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+                             "Grain synthesis failed\n");
+ }
+ *index += 1; // Advance the iterator to point to the next image
+ return res;
+ }
+ } else {
+ // Decoding failed. Release the worker thread.
+ frame_worker_data->received_frame = 0;
+ ++ctx->available_threads;
+ ctx->need_resync = 1;
+ if (ctx->flushed != 1) return NULL;
+ }
+ } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+ }
+ return NULL;
+}
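+
+// Caller-side sketch (an assumption based on the standard libaom decode API;
+// not part of this file). decoder_decode() and decoder_get_frame() above are
+// typically driven as:
+//   aom_codec_decode(&codec, data, data_sz, NULL);
+//   aom_codec_iter_t iter = NULL;
+//   aom_image_t *img;
+//   while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+//     /* consume img->planes[] / img->stride[] */
+//   }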
+
+static aom_codec_err_t decoder_set_fb_fn(
+ aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+ if (cb_get == NULL || cb_release == NULL) {
+ return AOM_CODEC_INVALID_PARAM;
+ } else if (ctx->frame_workers == NULL) {
+    // Changes to the frame buffer functions are only accepted before the
+    // decoder has been initialized (i.e. while no frame workers exist yet).
+ ctx->get_ext_fb_cb = cb_get;
+ ctx->release_ext_fb_cb = cb_release;
+ ctx->ext_priv = cb_priv;
+ return AOM_CODEC_OK;
+ }
+
+ return AOM_CODEC_ERROR;
+}
+
+static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *);
+
+ if (data) {
+ av1_ref_frame_t *const frame = data;
+ YV12_BUFFER_CONFIG sd;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ image2yuvconfig(&frame->img, &sd);
+ return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx,
+ frame->use_external_ref, &sd);
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+ if (frame) {
+ YV12_BUFFER_CONFIG sd;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ image2yuvconfig(&frame->img, &sd);
+ return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd);
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *);
+ if (data) {
+ YV12_BUFFER_CONFIG *fb;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
+ if (fb == NULL) return AOM_CODEC_ERROR;
+ yuvconfig2image(&data->img, fb, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_image_t *new_img = va_arg(args, aom_image_t *);
+ if (new_img) {
+ YV12_BUFFER_CONFIG new_frame;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+ if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
+ yuvconfig2image(new_img, &new_frame, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_image_t *img = va_arg(args, aom_image_t *);
+ if (img) {
+ YV12_BUFFER_CONFIG new_frame;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+ if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
+ YV12_BUFFER_CONFIG sd;
+ image2yuvconfig(img, &sd);
+ return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame,
+ &sd);
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_set_postproc(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+}
+
+static aom_codec_err_t ctrl_set_dbg_options(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+}
+
+static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const update_info = va_arg(args, int *);
+
+ if (update_info) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ *update_info = frame_worker_data->pbi->refresh_frame_flags;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const arg = va_arg(args, int *);
+ if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+ *arg =
+ ((FrameWorkerData *)ctx->frame_workers[0].data1)->pbi->common.base_qindex;
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *corrupted = va_arg(args, int *);
+
+ if (corrupted) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ AV1Decoder *const pbi = frame_worker_data->pbi;
+ RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
+ if (pbi->seen_frame_header && pbi->num_output_frames == 0)
+ return AOM_CODEC_ERROR;
+ if (ctx->last_show_frame >= 0)
+ *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const frame_size = va_arg(args, int *);
+
+ if (frame_size) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ frame_size[0] = cm->width;
+ frame_size[1] = cm->height;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *);
+
+ if (frame_header_info) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1Decoder *pbi = frame_worker_data->pbi;
+ frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size;
+ frame_header_info->coded_tile_data = pbi->obu_size_hdr.data;
+      frame_header_info->extra_size = pbi->frame_header_size;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_tile_data *const tile_data = va_arg(args, aom_tile_data *);
+
+ if (tile_data) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1Decoder *pbi = frame_worker_data->pbi;
+ tile_data->coded_tile_data_size =
+ pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size;
+ tile_data->coded_tile_data =
+ pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ av1_ext_ref_frame_t *const data = va_arg(args, av1_ext_ref_frame_t *);
+
+ if (data) {
+ av1_ext_ref_frame_t *const ext_frames = data;
+ ctx->ext_refs.num = ext_frames->num;
+ for (int i = 0; i < ctx->ext_refs.num; i++) {
+ image2yuvconfig(ext_frames->img++, &ctx->ext_refs.refs[i]);
+ }
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
+static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const render_size = va_arg(args, int *);
+
+ if (render_size) {
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ render_size[0] = cm->render_width;
+ render_size[1] = cm->render_height;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ unsigned int *const bit_depth = va_arg(args, unsigned int *);
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+ if (bit_depth) {
+ if (worker) {
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ *bit_depth = cm->seq_params.bit_depth;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y,
+ int use_highbitdepth) {
+ aom_img_fmt_t fmt = 0;
+
+ if (subsampling_x == 0 && subsampling_y == 0)
+ fmt = AOM_IMG_FMT_I444;
+ else if (subsampling_x == 1 && subsampling_y == 0)
+ fmt = AOM_IMG_FMT_I422;
+ else if (subsampling_x == 1 && subsampling_y == 1)
+ fmt = AOM_IMG_FMT_I420;
+
+ if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+ return fmt;
+}
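+
+// Illustrative reading of the mapping above: subsampling (1, 1) without high
+// bit depth yields AOM_IMG_FMT_I420, while the same subsampling with
+// use_highbitdepth set yields AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH
+// (i.e. AOM_IMG_FMT_I42016).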
+
+static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *);
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+ if (img_fmt) {
+ if (worker) {
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+
+ *img_fmt = get_img_format(cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y,
+ cm->seq_params.use_highbitdepth);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+
+ return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ unsigned int *const tile_size = va_arg(args, unsigned int *);
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+
+ if (tile_size) {
+ if (worker) {
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+ *tile_size =
+ ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE;
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ }
+ return AOM_CODEC_INVALID_PARAM;
+}
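+
+// Illustrative reading of the packed value above (assuming MI_SIZE is 4 luma
+// pixels): a tile of 32x16 MI units is 128x64 pixels, so *tile_size becomes
+// (128 << 16) + 64 = 0x00800040, i.e. tile width in the high 16 bits and
+// tile height in the low 16 bits.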
+
+static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->invert_tile_order = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int legacy_byte_alignment = 0;
+ const int min_byte_alignment = 32;
+ const int max_byte_alignment = 1024;
+ const int byte_alignment = va_arg(args, int);
+
+ if (byte_alignment != legacy_byte_alignment &&
+ (byte_alignment < min_byte_alignment ||
+ byte_alignment > max_byte_alignment ||
+ (byte_alignment & (byte_alignment - 1)) != 0))
+ return AOM_CODEC_INVALID_PARAM;
+
+ ctx->byte_alignment = byte_alignment;
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+ }
+ return AOM_CODEC_OK;
+}
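+
+// Values accepted by the check above: 0 (legacy behaviour) or a power of two
+// in [32, 1024], i.e. 32, 64, 128, 256, 512 or 1024. For example, 48 (not a
+// power of two) and 2048 (above the maximum) are rejected with
+// AOM_CODEC_INVALID_PARAM.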
+
+static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->skip_loop_filter = va_arg(args, int);
+
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->skip_film_grain = va_arg(args, int);
+
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.skip_film_grain = ctx->skip_film_grain;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+#if !CONFIG_ACCOUNTING
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+#else
+ if (ctx->frame_workers) {
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ AV1Decoder *pbi = frame_worker_data->pbi;
+ Accounting **acct = va_arg(args, Accounting **);
+ *acct = &pbi->accounting;
+ return AOM_CODEC_OK;
+ }
+ return AOM_CODEC_ERROR;
+#endif
+}
+static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->decode_tile_row = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->decode_tile_col = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->tile_mode = va_arg(args, unsigned int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->is_annexb = va_arg(args, unsigned int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->operating_point = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->output_all_layers = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+#if !CONFIG_INSPECTION
+ (void)ctx;
+ (void)args;
+ return AOM_CODEC_INCAPABLE;
+#else
+ aom_inspect_init *init = va_arg(args, aom_inspect_init *);
+ ctx->inspect_cb = init->inspect_cb;
+ ctx->inspect_ctx = init->inspect_ctx;
+ return AOM_CODEC_OK;
+#endif
+}
+
+static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->ext_tile_debug = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->row_mt = va_arg(args, unsigned int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
+ { AV1_COPY_REFERENCE, ctrl_copy_reference },
+
+ // Setters
+ { AV1_SET_REFERENCE, ctrl_set_reference },
+ { AOM_SET_POSTPROC, ctrl_set_postproc },
+ { AOM_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options },
+ { AOM_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options },
+ { AOM_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options },
+ { AOM_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options },
+ { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order },
+ { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
+ { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
+ { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row },
+ { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col },
+ { AV1_SET_TILE_MODE, ctrl_set_tile_mode },
+ { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb },
+ { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point },
+ { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers },
+ { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback },
+ { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
+ { AV1D_SET_ROW_MT, ctrl_set_row_mt },
+ { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
+ { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain },
+
+ // Getters
+ { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
+ { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer },
+ { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
+ { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
+ { AV1D_GET_IMG_FORMAT, ctrl_get_img_format },
+ { AV1D_GET_TILE_SIZE, ctrl_get_tile_size },
+ { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size },
+ { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
+ { AV1_GET_ACCOUNTING, ctrl_get_accounting },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+ { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
+ { AV1_GET_REFERENCE, ctrl_get_reference },
+ { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info },
+ { AV1D_GET_TILE_DATA, ctrl_get_tile_data },
+
+ { -1, NULL },
+};
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+CODEC_INTERFACE(aom_codec_av1_dx) = {
+ "AOMedia Project AV1 Decoder" VERSION_STRING,
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+ AOM_CODEC_CAP_DECODER |
+ AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t
+ decoder_init, // aom_codec_init_fn_t
+ decoder_destroy, // aom_codec_destroy_fn_t
+ decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {
+ // NOLINT
+ decoder_peek_si, // aom_codec_peek_si_fn_t
+ decoder_get_si, // aom_codec_get_si_fn_t
+ decoder_decode, // aom_codec_decode_fn_t
+ decoder_get_frame, // aom_codec_get_frame_fn_t
+ decoder_set_fb_fn, // aom_codec_set_fb_fn_t
+ },
+ {
+ // NOLINT
+ 0,
+ NULL, // aom_codec_enc_cfg_map_t
+ NULL, // aom_codec_encode_fn_t
+ NULL, // aom_codec_get_cx_data_fn_t
+ NULL, // aom_codec_enc_config_set_fn_t
+ NULL, // aom_codec_get_global_headers_fn_t
+ NULL, // aom_codec_get_preview_frame_fn_t
+ NULL // aom_codec_enc_mr_get_mem_loc_fn_t
+ }
+};
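+
+// A minimal usage sketch of the interface registered above, via the public
+// libaom decoder API (aom/aom_decoder.h, aom/aomdx.h); frame_data and
+// frame_size are placeholder names and error handling is elided:
+//
+//   aom_codec_ctx_t codec;
+//   if (aom_codec_dec_init(&codec, aom_codec_av1_dx(), NULL, 0)) return;
+//   // Feed one temporal unit of compressed data.
+//   if (!aom_codec_decode(&codec, frame_data, frame_size, NULL)) {
+//     aom_codec_iter_t iter = NULL;
+//     aom_image_t *img;
+//     while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+//       int last_qindex;
+//       // Dispatched to ctrl_get_last_quantizer() via decoder_ctrl_maps.
+//       aom_codec_control(&codec, AOMD_GET_LAST_QUANTIZER, &last_qindex);
+//     }
+//   }
+//   aom_codec_destroy(&codec);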
diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h
new file mode 100644
index 000000000..4a7af580b
--- /dev/null
+++ b/third_party/aom/av1/av1_iface_common.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_AV1_IFACE_COMMON_H_
+#define AOM_AV1_AV1_IFACE_COMMON_H_
+
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
+ void *user_priv) {
+ /* aom_img_wrap() doesn't allow specifying independent strides for
+ * the Y, U, and V planes, nor other alignment adjustments that
+ * might be representable by a YV12_BUFFER_CONFIG, so we just
+ * initialize all the fields.
+ */
+ int bps;
+ if (!yv12->subsampling_y) {
+ if (!yv12->subsampling_x) {
+ img->fmt = AOM_IMG_FMT_I444;
+ bps = 24;
+ } else {
+ img->fmt = AOM_IMG_FMT_I422;
+ bps = 16;
+ }
+ } else {
+ img->fmt = AOM_IMG_FMT_I420;
+ bps = 12;
+ }
+ img->cp = yv12->color_primaries;
+ img->tc = yv12->transfer_characteristics;
+ img->mc = yv12->matrix_coefficients;
+ img->monochrome = yv12->monochrome;
+ img->csp = yv12->chroma_sample_position;
+ img->range = yv12->color_range;
+ img->bit_depth = 8;
+ img->w = yv12->y_width;
+ img->h = yv12->y_height;
+ img->d_w = yv12->y_crop_width;
+ img->d_h = yv12->y_crop_height;
+ img->r_w = yv12->render_width;
+ img->r_h = yv12->render_height;
+ img->x_chroma_shift = yv12->subsampling_x;
+ img->y_chroma_shift = yv12->subsampling_y;
+ img->planes[AOM_PLANE_Y] = yv12->y_buffer;
+ img->planes[AOM_PLANE_U] = yv12->u_buffer;
+ img->planes[AOM_PLANE_V] = yv12->v_buffer;
+ img->planes[AOM_PLANE_ALPHA] = NULL;
+ img->stride[AOM_PLANE_Y] = yv12->y_stride;
+ img->stride[AOM_PLANE_U] = yv12->uv_stride;
+ img->stride[AOM_PLANE_V] = yv12->uv_stride;
+ img->stride[AOM_PLANE_ALPHA] = yv12->y_stride;
+ if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+ // aom_image_t uses byte strides and a pointer to the first byte
+ // of the image.
+ img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
+ img->bit_depth = yv12->bit_depth;
+ img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+ img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+ img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+ img->planes[AOM_PLANE_ALPHA] = NULL;
+ img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
+ img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
+ img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
+ img->stride[AOM_PLANE_ALPHA] = 2 * yv12->y_stride;
+ }
+ img->bps = bps;
+ img->user_priv = user_priv;
+ img->img_data = yv12->buffer_alloc;
+ img->img_data_owner = 0;
+ img->self_allocd = 0;
+}
+
+static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
+ YV12_BUFFER_CONFIG *yv12) {
+ yv12->y_buffer = img->planes[AOM_PLANE_Y];
+ yv12->u_buffer = img->planes[AOM_PLANE_U];
+ yv12->v_buffer = img->planes[AOM_PLANE_V];
+
+ yv12->y_crop_width = img->d_w;
+ yv12->y_crop_height = img->d_h;
+ yv12->render_width = img->r_w;
+ yv12->render_height = img->r_h;
+ yv12->y_width = img->w;
+ yv12->y_height = img->h;
+
+ yv12->uv_width =
+ img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width;
+ yv12->uv_height =
+ img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height;
+ yv12->uv_crop_width = yv12->uv_width;
+ yv12->uv_crop_height = yv12->uv_height;
+
+ yv12->y_stride = img->stride[AOM_PLANE_Y];
+ yv12->uv_stride = img->stride[AOM_PLANE_U];
+ yv12->color_primaries = img->cp;
+ yv12->transfer_characteristics = img->tc;
+ yv12->matrix_coefficients = img->mc;
+ yv12->monochrome = img->monochrome;
+ yv12->chroma_sample_position = img->csp;
+ yv12->color_range = img->range;
+
+ if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ // In aom_image_t
+ // planes point to uint8 address of start of data
+ // stride counts uint8s to reach next row
+ // In YV12_BUFFER_CONFIG
+ // y_buffer, u_buffer, v_buffer point to uint16 address of data
+ // stride and border counts in uint16s
+ // This means that all the address calculations in the main body of code
+ // should work correctly.
+ // However, before we do any pixel operations we need to cast the address
+    // to a uint16 pointer and double its value.
+ yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+ yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+ yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+ yv12->y_stride >>= 1;
+ yv12->uv_stride >>= 1;
+ yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+ } else {
+ yv12->flags = 0;
+ }
+ yv12->border = (yv12->y_stride - img->w) / 2;
+ yv12->subsampling_x = img->x_chroma_shift;
+ yv12->subsampling_y = img->y_chroma_shift;
+ return AOM_CODEC_OK;
+}
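+
+// Worked example of the high-bit-depth convention above: for a 10-bit image
+// with img->stride[AOM_PLANE_Y] == 256 bytes, yv12->y_stride becomes
+// 256 >> 1 == 128 uint16 units, and y_buffer is run through
+// CONVERT_TO_BYTEPTR() so that downstream code recovers the uint16_t sample
+// address with CONVERT_TO_SHORTPTR() before touching pixels.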
+
+#endif // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c
new file mode 100644
index 000000000..1bf81c91d
--- /dev/null
+++ b/third_party/aom/av1/common/alloccommon.c
@@ -0,0 +1,300 @@
+/*
+ *
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/onyxc_int.h"
+
+int av1_get_MBs(int width, int height) {
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+ const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+ const int mi_cols = aligned_width >> MI_SIZE_LOG2;
+ const int mi_rows = aligned_height >> MI_SIZE_LOG2;
+
+ const int mb_cols = (mi_cols + 2) >> 2;
+ const int mb_rows = (mi_rows + 2) >> 2;
+ return mb_rows * mb_cols;
+}
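+
+// Worked example (assuming MI_SIZE_LOG2 == 2, i.e. 4x4 MI units): for a
+// 1920x1080 frame both dimensions are already multiples of 8, so
+// mi_cols = 480, mi_rows = 270, mb_cols = (480 + 2) >> 2 = 120,
+// mb_rows = (270 + 2) >> 2 = 68, giving 120 * 68 = 8160 macroblocks.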
+
+#if LOOP_FILTER_BITMASK
+static int alloc_loop_filter_mask(AV1_COMMON *cm) {
+ aom_free(cm->lf.lfm);
+ cm->lf.lfm = NULL;
+
+ // Each lfm holds bit masks for all the 4x4 blocks in a max
+ // 64x64 (128x128 for ext_partitions) region. The stride
+ // and rows are rounded up / truncated to a multiple of 16
+ // (32 for ext_partition).
+ cm->lf.lfm_stride = (cm->mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
+ cm->lf.lfm_num = ((cm->mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
+ cm->lf.lfm_stride;
+ cm->lf.lfm =
+ (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
+ if (!cm->lf.lfm) return 1;
+
+ unsigned int i;
+ for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]);
+
+ return 0;
+}
+
+static void free_loop_filter_mask(AV1_COMMON *cm) {
+ if (cm->lf.lfm == NULL) return;
+
+ aom_free(cm->lf.lfm);
+ cm->lf.lfm = NULL;
+ cm->lf.lfm_num = 0;
+ cm->lf.lfm_stride = 0;
+}
+#endif
+
+void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) {
+ // Ensure that the decoded width and height are both multiples of
+ // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if
+ // subsampling is used).
+ // This simplifies the implementation of various experiments,
+  // e.g. cdef, which operates on units of 8x8 luma pixels.
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+ const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+
+ cm->mi_cols = aligned_width >> MI_SIZE_LOG2;
+ cm->mi_rows = aligned_height >> MI_SIZE_LOG2;
+ cm->mi_stride = calc_mi_size(cm->mi_cols);
+
+ cm->mb_cols = (cm->mi_cols + 2) >> 2;
+ cm->mb_rows = (cm->mi_rows + 2) >> 2;
+ cm->MBs = cm->mb_rows * cm->mb_cols;
+
+#if LOOP_FILTER_BITMASK
+ alloc_loop_filter_mask(cm);
+#endif
+}
+
+void av1_free_ref_frame_buffers(BufferPool *pool) {
+ int i;
+
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ if (pool->frame_bufs[i].ref_count > 0 &&
+ pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+ pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+ pool->frame_bufs[i].ref_count = 0;
+ }
+ aom_free(pool->frame_bufs[i].mvs);
+ pool->frame_bufs[i].mvs = NULL;
+ aom_free(pool->frame_bufs[i].seg_map);
+ pool->frame_bufs[i].seg_map = NULL;
+ aom_free_frame_buffer(&pool->frame_bufs[i].buf);
+ }
+}
+
+// Assumes cm->rst_info[p].restoration_unit_size is already initialized
+void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
+ const int num_planes = av1_num_planes(cm);
+ for (int p = 0; p < num_planes; ++p)
+ av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
+
+ if (cm->rst_tmpbuf == NULL) {
+ CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
+ (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+ }
+
+ if (cm->rlbs == NULL) {
+ CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
+ }
+
+ // For striped loop restoration, we divide each row of tiles into "stripes",
+ // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
+ // luma pixels to match the output from CDEF. We will need to store 2 *
+ // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
+ // able to quickly answer the question "Where is the <n>'th stripe for tile
+  // row <m>?" To make that efficient, we generate the rst_end_stripe array.
+ int num_stripes = 0;
+ for (int i = 0; i < cm->tile_rows; ++i) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, i);
+ const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start;
+ const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
+ const int tile_stripes = (ext_h + 63) / 64;
+ num_stripes += tile_stripes;
+ cm->rst_end_stripe[i] = num_stripes;
+ }
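+
+  // Worked example for a single tile row spanning a 1080-pixel-tall frame
+  // (assuming RESTORATION_UNIT_OFFSET == 8 and MI_SIZE_LOG2 == 2):
+  // mi_h = 270, so ext_h = 8 + 270 * 4 = 1088 and
+  // tile_stripes = (1088 + 63) / 64 = 17 stripes for that tile row.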
+
+ // Now we need to allocate enough space to store the line buffers for the
+ // stripes
+ const int frame_w = cm->superres_upscaled_width;
+ const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+
+ for (int p = 0; p < num_planes; ++p) {
+ const int is_uv = p > 0;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
+ const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
+ const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
+ << use_highbd;
+ RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
+
+ if (buf_size != boundaries->stripe_boundary_size ||
+ boundaries->stripe_boundary_above == NULL ||
+ boundaries->stripe_boundary_below == NULL) {
+ aom_free(boundaries->stripe_boundary_above);
+ aom_free(boundaries->stripe_boundary_below);
+
+ CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above,
+ (uint8_t *)aom_memalign(32, buf_size));
+ CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below,
+ (uint8_t *)aom_memalign(32, buf_size));
+
+ boundaries->stripe_boundary_size = buf_size;
+ }
+ boundaries->stripe_boundary_stride = stride;
+ }
+}
+
+void av1_free_restoration_buffers(AV1_COMMON *cm) {
+ int p;
+ for (p = 0; p < MAX_MB_PLANE; ++p)
+ av1_free_restoration_struct(&cm->rst_info[p]);
+ aom_free(cm->rst_tmpbuf);
+ cm->rst_tmpbuf = NULL;
+ aom_free(cm->rlbs);
+ cm->rlbs = NULL;
+ for (p = 0; p < MAX_MB_PLANE; ++p) {
+ RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
+ aom_free(boundaries->stripe_boundary_above);
+ aom_free(boundaries->stripe_boundary_below);
+ boundaries->stripe_boundary_above = NULL;
+ boundaries->stripe_boundary_below = NULL;
+ }
+
+ aom_free_frame_buffer(&cm->rst_frame);
+}
+
+void av1_free_above_context_buffers(AV1_COMMON *cm,
+ int num_free_above_contexts) {
+ int i;
+ const int num_planes = cm->num_allocated_above_context_planes;
+
+ for (int tile_row = 0; tile_row < num_free_above_contexts; tile_row++) {
+ for (i = 0; i < num_planes; i++) {
+ aom_free(cm->above_context[i][tile_row]);
+ cm->above_context[i][tile_row] = NULL;
+ }
+ aom_free(cm->above_seg_context[tile_row]);
+ cm->above_seg_context[tile_row] = NULL;
+
+ aom_free(cm->above_txfm_context[tile_row]);
+ cm->above_txfm_context[tile_row] = NULL;
+ }
+ for (i = 0; i < num_planes; i++) {
+ aom_free(cm->above_context[i]);
+ cm->above_context[i] = NULL;
+ }
+ aom_free(cm->above_seg_context);
+ cm->above_seg_context = NULL;
+
+ aom_free(cm->above_txfm_context);
+ cm->above_txfm_context = NULL;
+
+ cm->num_allocated_above_contexts = 0;
+ cm->num_allocated_above_context_mi_col = 0;
+ cm->num_allocated_above_context_planes = 0;
+}
+
+void av1_free_context_buffers(AV1_COMMON *cm) {
+ cm->free_mi(cm);
+
+ av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+
+#if LOOP_FILTER_BITMASK
+ free_loop_filter_mask(cm);
+#endif
+}
+
+int av1_alloc_above_context_buffers(AV1_COMMON *cm,
+ int num_alloc_above_contexts) {
+ const int num_planes = av1_num_planes(cm);
+ int plane_idx;
+ const int aligned_mi_cols =
+ ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+
+ // Allocate above context buffers
+ cm->num_allocated_above_contexts = num_alloc_above_contexts;
+ cm->num_allocated_above_context_mi_col = aligned_mi_cols;
+ cm->num_allocated_above_context_planes = num_planes;
+ for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+ cm->above_context[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
+ num_alloc_above_contexts, sizeof(cm->above_context[0]));
+ if (!cm->above_context[plane_idx]) return 1;
+ }
+
+ cm->above_seg_context = (PARTITION_CONTEXT **)aom_calloc(
+ num_alloc_above_contexts, sizeof(cm->above_seg_context));
+ if (!cm->above_seg_context) return 1;
+
+ cm->above_txfm_context = (TXFM_CONTEXT **)aom_calloc(
+ num_alloc_above_contexts, sizeof(cm->above_txfm_context));
+ if (!cm->above_txfm_context) return 1;
+
+ for (int tile_row = 0; tile_row < num_alloc_above_contexts; tile_row++) {
+ for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+ cm->above_context[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*cm->above_context[0][tile_row]));
+ if (!cm->above_context[plane_idx][tile_row]) return 1;
+ }
+
+ cm->above_seg_context[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*cm->above_seg_context[tile_row]));
+ if (!cm->above_seg_context[tile_row]) return 1;
+
+ cm->above_txfm_context[tile_row] = (TXFM_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*cm->above_txfm_context[tile_row]));
+ if (!cm->above_txfm_context[tile_row]) return 1;
+ }
+
+ return 0;
+}
+
+int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
+ int new_mi_size;
+
+ av1_set_mb_mi(cm, width, height);
+ new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+ if (cm->mi_alloc_size < new_mi_size) {
+ cm->free_mi(cm);
+ if (cm->alloc_mi(cm, new_mi_size)) goto fail;
+ }
+
+ return 0;
+
+fail:
+ // clear the mi_* values to force a realloc on resync
+ av1_set_mb_mi(cm, 0, 0);
+ av1_free_context_buffers(cm);
+ return 1;
+}
+
+void av1_remove_common(AV1_COMMON *cm) {
+ av1_free_context_buffers(cm);
+
+ aom_free(cm->fc);
+ cm->fc = NULL;
+ aom_free(cm->frame_contexts);
+ cm->frame_contexts = NULL;
+}
+
+void av1_init_context_buffers(AV1_COMMON *cm) { cm->setup_mi(cm); }
diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
new file mode 100644
index 000000000..8e5896981
--- /dev/null
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_
+#define AOM_AV1_COMMON_ALLOCCOMMON_H_
+
+#define INVALID_IDX -1 // Invalid buffer index.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct BufferPool;
+
+void av1_remove_common(struct AV1Common *cm);
+
+int av1_alloc_above_context_buffers(struct AV1Common *cm,
+ int num_alloc_above_contexts);
+void av1_free_above_context_buffers(struct AV1Common *cm,
+ int num_free_above_contexts);
+int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height);
+void av1_init_context_buffers(struct AV1Common *cm);
+void av1_free_context_buffers(struct AV1Common *cm);
+
+void av1_free_ref_frame_buffers(struct BufferPool *pool);
+void av1_alloc_restoration_buffers(struct AV1Common *cm);
+void av1_free_restoration_buffers(struct AV1Common *cm);
+
+int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
+void av1_free_state_buffers(struct AV1Common *cm);
+
+void av1_set_mb_mi(struct AV1Common *cm, int width, int height);
+int av1_get_MBs(int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
new file mode 100644
index 000000000..bad411743
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -0,0 +1,3231 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
+ { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
+ { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
+ { av1_idct32_new, NULL, NULL },
+ { av1_idct64_new, NULL, NULL },
+};
+
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+ uint8_t *output, int stride,
+ int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ int16x8_t temp_output;
+ for (int i = 0; i < height; ++i, j += step) {
+ temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
+ temp_output = vaddq_s16(temp_output, in[j]);
+ vst1_u8(output, vqmovun_s16(temp_output));
+ output += stride;
+ }
+}
+
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+ int16x8_t res0,
+ int16x8_t res1) {
+ int16x8_t temp_output[2];
+ uint8x16_t temp_output_8q;
+ temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+ temp_output[0] = vaddq_s16(temp_output[0], res0);
+ temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+ temp_output[1] = vaddq_s16(temp_output[1], res1);
+ temp_output_8q =
+ vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
+ return temp_output_8q;
+}
+
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+ uint8_t *output, int stride,
+ int flipud, int height) {
+ uint8x16_t temp_output_8q;
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ temp_output_8q = vld1q_u8(output + i * stride);
+ temp_output_8q =
+ lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]);
+ vst1q_u8((output + i * stride), temp_output_8q);
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
+ int value) {
+ for (int i = 0; i < size; i++) {
+ a[i] = vdupq_n_s16((int16_t)value);
+ }
+}
+
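+// The btf_16_lane_* helpers below are 8-lane butterfly steps: for a constant
+// pair (ca, cb) taken from lanes of c, they compute, per lane and with
+// rounding,
+//   *t0 = (in0 * ca + in1 * cb) >> INV_COS_BIT
+//   *t1 = (in0 * cb - in1 * ca) >> INV_COS_BIT
+// which is the rotation used throughout the 1D inverse transforms here.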
+static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
+ int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0_l, s0_h, s1_l, s1_h;
+ int16x4_t v0[2], v1[2];
+
+ s0_l = vmull_n_s16(vget_low_s16(in0), coef1);
+ s0_h = vmull_n_s16(vget_high_s16(in0), coef1);
+ s1_l = vmull_n_s16(vget_low_s16(in0), coef2);
+ s1_h = vmull_n_s16(vget_high_s16(in0), coef2);
+
+ v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
+ const int16x8_t in1, const int16x4_t c,
+ int16x8_t *t0, int16x8_t *t1) {
+ int32x4_t s0[2], s1[2];
+ int16x4_t v0[2], v1[2];
+
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
+
+ v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);
+
+ *t0 = vcombine_s16(v0[0], v0[1]);
+ *t1 = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
+ int32x4_t t0[2], t1[2];
+ int16x4_t v0[2], v1[2];
+
+ // Don't add/sub before multiply, which will overflow in iadst8.
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
+ const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
+ const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
+ const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
+
+ t0[0] = vaddq_s32(x0_lo, x1_lo);
+ t0[1] = vaddq_s32(x0_hi, x1_hi);
+ t1[0] = vsubq_s32(x0_lo, x1_lo);
+ t1[1] = vsubq_s32(x0_hi, x1_hi);
+
+ v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT);
+ v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT);
+ v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT);
+ v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT);
+
+ x[0] = vcombine_s16(v0[0], v0[1]);
+ x[1] = vcombine_s16(v1[0], v1[1]);
+}
+
+static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
+ int16_t *const c2,
+ int16_t *const c3) {
+ int16x4_t val = vdup_n_s16((int16_t)0);
+ val = vld1_lane_s16(c0, val, 0);
+ val = vld1_lane_s16(c1, val, 1);
+ val = vld1_lane_s16(c2, val, 2);
+ val = vld1_lane_s16(c3, val, 3);
+ return val;
+}
+
+static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 20), (int16_t *)(cospi + 44));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[8];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // Stage 1
+ x[0] = in[7];
+ x[1] = in[0];
+ x[2] = in[5];
+ x[3] = in[2];
+ x[4] = in[3];
+ x[5] = in[4];
+ x[6] = in[1];
+ x[7] = in[6];
+
+ // Stage 2
+ btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+ btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+ btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s4);
+ x[1] = vqaddq_s16(s1, s5);
+ x[2] = vqaddq_s16(s2, s6);
+ x[3] = vqaddq_s16(s3, s7);
+ x[4] = vqsubq_s16(s0, s4);
+ x[5] = vqsubq_s16(s1, s5);
+ x[6] = vqsubq_s16(s2, s6);
+ x[7] = vqsubq_s16(s3, s7);
+
+ // Stage 4
+ s0 = x[0];
+ s1 = x[1];
+ s2 = x[2];
+ s3 = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);
+
+ // Stage 5
+ x[0] = vqaddq_s16(s0, s2);
+ x[1] = vqaddq_s16(s1, s3);
+ x[2] = vqsubq_s16(s0, s2);
+ x[3] = vqsubq_s16(s1, s3);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+
+ // stage 6
+ btf_16_half_neon(x + 2, c2);
+ btf_16_half_neon(x + 6, c2);
+
+ // Stage 7
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[4]);
+ out[2] = x[6];
+ out[3] = vnegq_s16(x[2]);
+ out[4] = x[3];
+ out[5] = vnegq_s16(x[7]);
+ out[6] = x[5];
+ out[7] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[8];
+ int16x8_t s0, s1, s4, s5;
+
+ // Stage 1
+ x[1] = in[0];
+
+ // Stage 2
+
+ btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);
+
+ // Stage 3
+ x[0] = s0;
+ x[1] = s1;
+ x[4] = s0;
+ x[5] = s1;
+
+ // Stage 4
+ s0 = x[0];
+ s1 = x[1];
+ btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+
+ // Stage 5
+ x[0] = s0;
+ x[1] = s1;
+ x[2] = s0;
+ x[3] = s1;
+ x[4] = s4;
+ x[5] = s5;
+ x[6] = s4;
+ x[7] = s5;
+
+ // stage 6
+ btf_16_half_neon(x + 2, c2);
+ btf_16_half_neon(x + 6, c2);
+
+ // Stage 7
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[4]);
+ out[2] = x[6];
+ out[3] = vnegq_s16(x[2]);
+ out[4] = x[3];
+ out[5] = vnegq_s16(x[7]);
+ out[6] = x[5];
+ out[7] = vnegq_s16(x[1]);
+}
+
+static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+ int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[8], step2[8];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+ btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
+ btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
+
+ // stage 3
+ btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+
+ // stage 4
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);
+
+ // stage 5
+ out[0] = vqaddq_s16(step1[0], step2[7]);
+ out[1] = vqaddq_s16(step1[1], step1[6]);
+ out[2] = vqaddq_s16(step1[2], step1[5]);
+ out[3] = vqaddq_s16(step1[3], step2[4]);
+ out[4] = vqsubq_s16(step1[3], step2[4]);
+ out[5] = vqsubq_s16(step1[2], step1[5]);
+ out[6] = vqsubq_s16(step1[1], step1[6]);
+ out[7] = vqsubq_s16(step1[0], step2[7]);
+}
+
+static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 1
+ // stage 2
+ // stage 3
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
+
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 4
+ // stage 5
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+}
+
+void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
+ assert(!(size % 4));
+ if (!bit) return;
+ const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
+ for (int i = 0; i < size; i++) {
+ arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8);
+ }
+}
+
+static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
+ int16x8_t temp[8];
+ for (int i = 0; i < size; ++i) {
+ temp[i] = input[size - 1 - i];
+ }
+ for (int i = 0; i < size; ++i) {
+ input[i] = temp[i];
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
+ int16x8_t *const a,
+ int out_size) {
+ for (int i = 0; i < 8; ++i) {
+ a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)),
+ vmovn_s32(vld1q_s32(input + 4)));
+ input += out_size;
+ }
+}
+
+static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ output[0] = vmulq_n_s16(input[0], (int16_t)2);
+ output[1] = vmulq_n_s16(input[1], (int16_t)2);
+ output[2] = vmulq_n_s16(input[2], (int16_t)2);
+ output[3] = vmulq_n_s16(input[3], (int16_t)2);
+ output[4] = vmulq_n_s16(input[4], (int16_t)2);
+ output[5] = vmulq_n_s16(input[5], (int16_t)2);
+ output[6] = vmulq_n_s16(input[6], (int16_t)2);
+ output[7] = vmulq_n_s16(input[7], (int16_t)2);
+}
+
+static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
+ int size) {
+ int32x4_t out_low, out_high;
+ int16x4_t low, high;
+
+ for (int z = 0; z < size; ++z) {
+ out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2);
+ out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2);
+
+ low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
+ high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
+
+ output[z] = vcombine_s16(low, high);
+ }
+}
+
+static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ int32x4_t out_low, out_high;
+ int16x4_t low, high;
+ int16_t scale = (int16_t)(2 * NewSqrt2);
+
+ for (int z = 0; z < 16; ++z) {
+ out_low = vmull_n_s16(vget_low_s16(input[z]), scale);
+ out_high = vmull_n_s16(vget_high_s16(input[z]), scale);
+
+ low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
+ high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);
+
+ output[z] = vcombine_s16(low, high);
+ }
+}
+
+static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ (void)cos_bit;
+
+ for (int z = 0; z < 32; ++z) {
+ output[z] = vmulq_n_s16(input[z], (int16_t)4);
+ }
+}
+
+static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 4
+
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 6
+ // stage 7
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+ out[8] = step1;
+ out[9] = step1;
+ out[10] = step1;
+ out[11] = step1;
+ out[12] = step1;
+ out[13] = step1;
+ out[14] = step1;
+ out[15] = step1;
+}
+
+static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[16], step2[16];
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+
+ btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
+ btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
+ btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
+ btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);
+
+ step2[0] = in[0];
+ step2[1] = in[8];
+ step2[2] = in[4];
+ step2[3] = in[12];
+ step2[4] = in[2];
+ step2[5] = in[10];
+ step2[6] = in[6];
+ step2[7] = in[14];
+
+ // stage 3
+
+ btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
+ btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+
+ btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
+ btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
+ &step2[10], &step2[13]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[16], step2[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[2] = in[4];
+ step2[4] = in[2];
+ step2[6] = in[6];
+
+ btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
+ btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
+ btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
+ btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);
+
+ // stage 3
+
+ btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+ btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[8] = vqaddq_s16(step2[8], step2[9]);
+ step1[9] = vqsubq_s16(step2[8], step2[9]);
+ step1[10] = vqsubq_s16(step2[11], step2[10]);
+ step1[11] = vqaddq_s16(step2[11], step2[10]);
+ step1[12] = vqaddq_s16(step2[12], step2[13]);
+ step1[13] = vqsubq_s16(step2[12], step2[13]);
+ step1[14] = vqsubq_s16(step2[15], step2[14]);
+ step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+
+ btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+ btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+ btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
+ &step2[10], &step2[13]);
+
+ step2[4] = vqaddq_s16(step1[4], step1[5]);
+ step2[5] = vqsubq_s16(step1[4], step1[5]);
+ step2[6] = vqsubq_s16(step1[7], step1[6]);
+ step2[7] = vqaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
+ step1[0] = vqaddq_s16(step2[0], step2[3]);
+ step1[1] = vqaddq_s16(step2[1], step2[2]);
+ step1[2] = vqsubq_s16(step2[1], step2[2]);
+ step1[3] = vqsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ step1[7] = step2[7];
+ step1[8] = vqaddq_s16(step2[8], step2[11]);
+ step1[9] = vqaddq_s16(step2[9], step2[10]);
+ step1[10] = vqsubq_s16(step2[9], step2[10]);
+ step1[11] = vqsubq_s16(step2[8], step2[11]);
+ step1[12] = vqsubq_s16(step2[15], step2[12]);
+ step1[13] = vqsubq_s16(step2[14], step2[13]);
+ step1[14] = vqaddq_s16(step2[14], step2[13]);
+ step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+ btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
+ btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[7]);
+ step2[1] = vqaddq_s16(step1[1], step1[6]);
+ step2[2] = vqaddq_s16(step1[2], step1[5]);
+ step2[3] = vqaddq_s16(step1[3], step1[4]);
+ step2[4] = vqsubq_s16(step1[3], step1[4]);
+ step2[5] = vqsubq_s16(step1[2], step1[5]);
+ step2[6] = vqsubq_s16(step1[1], step1[6]);
+ step2[7] = vqsubq_s16(step1[0], step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+ (int16_t *)(cospi + 10), (int16_t *)(cospi + 54));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+ (int16_t *)(cospi + 26), (int16_t *)(cospi + 38));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30),
+ (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14),
+ (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[14];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1
+ x[0] = in[15];
+ x[1] = in[0];
+ x[2] = in[13];
+ x[3] = in[2];
+ x[4] = in[11];
+ x[5] = in[4];
+ x[6] = in[9];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[9] = in[8];
+ x[10] = in[5];
+ x[11] = in[10];
+ x[12] = in[3];
+ x[13] = in[12];
+ x[14] = in[1];
+ x[15] = in[14];
+
+ // Stage 2
+ btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+ btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+ btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+ btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+ btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
+ btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
+ btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+ // Stage 5
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+ // Stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+ // Stage 7
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
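+// 16-point inverse ADST shortcut for the case where only in[0] is nonzero
+// (eob at DC); most butterfly stages reduce to copies.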
+static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[10];
+ int16x8_t s0, s1, s4, s5;
+ int16x8_t s8, s9, s12, s13;
+
+ // Stage 1
+ x[1] = in[0];
+
+ // Stage 2
+ btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+
+ // Stage 3
+ x[0] = s0;
+ x[1] = s1;
+ x[8] = s0;
+ x[9] = s1;
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+
+ // Stage 5
+ x[0] = t[0];
+ x[1] = t[1];
+ x[4] = t[0];
+ x[5] = t[1];
+ x[8] = s8;
+ x[9] = s9;
+ x[12] = s8;
+ x[13] = s9;
+
+ // Stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ t[8] = x[8];
+ t[9] = x[9];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+
+ // Stage 7
+ x[0] = t[0];
+ x[1] = t[1];
+ x[2] = t[0];
+ x[3] = t[1];
+ x[4] = s4;
+ x[5] = s5;
+ x[6] = s4;
+ x[7] = s5;
+ x[8] = t[8];
+ x[9] = t[9];
+ x[10] = t[8];
+ x[11] = t[9];
+ x[12] = s12;
+ x[13] = s13;
+ x[14] = s12;
+ x[15] = s13;
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
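+// 16-point inverse ADST shortcut for eob within the top-left 8x8 corner:
+// only the first eight input coefficients are read.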
+static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16];
+ int16x8_t t[14];
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1
+ x[1] = in[0];
+ x[3] = in[2];
+ x[5] = in[4];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[10] = in[5];
+ x[12] = in[3];
+ x[14] = in[1];
+
+ // Stage 2
+ btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+ btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
+ btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
+ btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);
+
+ btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
+ btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
+ btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
+ btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);
+
+ // Stage 3
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+ btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+ // Stage 5
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+ // Stage 6
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+ // Stage 7
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
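+// Full 32-point inverse DCT over eight columns held in int16x8_t vectors.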
+static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+ (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+ (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
+ (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
+ (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+ const int16x4_t c5 =
+ create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c6 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c7 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2
+
+ btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
+ btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
+ btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
+ btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
+ btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
+ btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);
+
+ step2[0] = in[0];
+ step2[1] = in[16];
+ step2[2] = in[8];
+ step2[3] = in[24];
+ step2[4] = in[4];
+ step2[5] = in[20];
+ step2[6] = in[12];
+ step2[7] = in[28];
+ step2[8] = in[2];
+ step2[9] = in[18];
+ step2[10] = in[10];
+ step2[11] = in[26];
+ step2[12] = in[6];
+ step2[13] = in[22];
+ step2[14] = in[14];
+ step2[15] = in[30];
+
+ // stage 3
+
+ btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
+ btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
+ btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
+ btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[4] = step2[4];
+ step1[5] = step2[5];
+ step1[6] = step2[6];
+ step1[7] = step2[7];
+
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4
+
+ btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
+ btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
+ btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
+ &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
+ &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[1], step1[2]);
+ step2[2] = vqsubq_s16(step1[1], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
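+// 32-point inverse DCT shortcut for a DC-only block: in[0] is scaled by
+// cospi[32] and the rounded result is replicated to all 32 outputs.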
+static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1;
+ int32x4_t t32[2];
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
+ step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+
+ out[0] = step1;
+ out[1] = step1;
+ out[2] = step1;
+ out[3] = step1;
+ out[4] = step1;
+ out[5] = step1;
+ out[6] = step1;
+ out[7] = step1;
+ out[8] = step1;
+ out[9] = step1;
+ out[10] = step1;
+ out[11] = step1;
+ out[12] = step1;
+ out[13] = step1;
+ out[14] = step1;
+ out[15] = step1;
+ out[16] = step1;
+ out[17] = step1;
+ out[18] = step1;
+ out[19] = step1;
+ out[20] = step1;
+ out[21] = step1;
+ out[22] = step1;
+ out[23] = step1;
+ out[24] = step1;
+ out[25] = step1;
+ out[26] = step1;
+ out[27] = step1;
+ out[28] = step1;
+ out[29] = step1;
+ out[30] = step1;
+ out[31] = step1;
+}
+
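+// 32-point inverse DCT shortcut for eob within the top-left 8x8 corner:
+// only the first eight input coefficients are read.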
+static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+ int32x4_t t32[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
+ step2[0] = in[0];
+ step2[4] = in[4];
+ step2[8] = in[2];
+ step2[12] = in[6];
+
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[4] = step2[4];
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[16];
+ step1[18] = step2[19];
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ step1[21] = step2[20];
+ step1[22] = step2[23];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[24];
+ step1[26] = step2[27];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+ step1[29] = step2[28];
+ step1[30] = step2[31];
+ step1[31] = step2[31];
+
+ // stage 4
+
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[8] = step1[8];
+ step2[9] = step1[8];
+ step2[10] = step1[11];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[13] = step1[12];
+ step2[14] = step1[15];
+ step2[15] = step1[15];
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+ &step1[10], &step1[13]);
+
+ step1[4] = step2[4];
+ step1[5] = step2[4];
+ step1[6] = step2[7];
+ step1[7] = step2[7];
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+ &step2[21], &step2[26]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[0];
+ step2[2] = step1[0];
+ step2[3] = step1[0];
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
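+// 32-point inverse DCT shortcut for eob within the top-left 16x16 corner:
+// only the first sixteen input coefficients are read.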
+static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+ int32x4_t t32[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
+ btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
+ btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ step2[0] = in[0];
+ step2[2] = in[8];
+ step2[4] = in[4];
+ step2[6] = in[12];
+ step2[8] = in[2];
+ step2[10] = in[10];
+ step2[12] = in[6];
+ step2[14] = in[14];
+
+ // stage 3
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
+ btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = step2[4];
+ step1[6] = step2[6];
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4
+
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+ &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+ &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[0], step1[2]);
+ step2[2] = vqsubq_s16(step1[0], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+// 1-D transform functions for blocks whose eob is at DC or within the
+// top-left 8x8, 16x16 or 32x32 corner.
+static const transform_1d_neon
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { av1_idct4_new, av1_idct4_new, NULL, NULL },
+ { av1_iadst4_new, av1_iadst4_new, NULL, NULL },
+ { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL },
+ },
+ { { av1_idct8_new, av1_idct8_new, NULL, NULL },
+ { av1_iadst8_new, av1_iadst8_new, NULL, NULL },
+ { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } },
+ {
+ { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL },
+ { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL },
+ { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL },
+ },
+ { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new },
+ { NULL, NULL, NULL, NULL },
+ { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c,
+ av1_iidentity32_c } },
+ { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
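+// NEON 1-D kernels indexed the same way as the table above; entries without
+// a NEON implementation are left NULL.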
+static const transform_neon
+ lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL },
+ { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },
+ { identity8_new_neon, identity8_new_neon, NULL, NULL } },
+ {
+ { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },
+ { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,
+ NULL },
+ { identity16_new_neon, identity16_new_neon, identity16_new_neon,
+ NULL },
+ },
+ { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon,
+ idct32_new_neon },
+ { NULL, NULL, NULL, NULL },
+ { identity32_new_neon, identity32_new_neon, identity32_new_neon,
+ identity32_new_neon } },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
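+// IDTX (identity) inverse transform, generic width x height variant: rows
+// and columns run through the scalar 1-D functions from
+// lowbd_txfm_all_1d_zeros_w8_arr and the clipped result is added to the
+// destination.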
+static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ // row tx
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ // Zero the rows that the row transform did not process.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ // col tx
+ for (int c = 0; c < txfm_size_col; c++) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c];
+
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ }
+}
+
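+// IDTX inverse transform using the NEON kernels: rows are loaded, transposed
+// and transformed in 8-wide batches, then columns are transformed and the
+// result is added to the destination.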
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[32 * 4];
+ int16x8_t b[32 * 4];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+
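+// Generic width x height path paired with get_eobx_eoby_scan_v_identity:
+// same row/column structure as the wxh IDTX variant, plus handling of the
+// ud_flip/lr_flip configuration on the column pass.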
+static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // row tx
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+ // Zero the rows that the row transform did not process.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ // col tx
+ for (int c = 0; c < txfm_size_col; c++) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
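+// NEON path paired with get_eobx_eoby_scan_v_identity: rows are transformed
+// in 8-wide batches (mirrored during the transpose when lr_flip is set),
+// then columns are transformed and added to the destination.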
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2];
+ int16x8_t b[16 * 2];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ flip_buf_ud_neon(&a[k], 8);
+ transpose_s16_8x8q(
+ &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+
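+// Generic width x height path paired with get_eobx_eoby_scan_h_identity;
+// otherwise structured like the v_wxh variant above.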
+static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // row tx
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+ // Zero the rows that the row transform did not process.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ // col tx
+ for (int c = 0; c < txfm_size_col; c++) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
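+// NEON path paired with get_eobx_eoby_scan_h_identity: rows and columns are
+// transformed in 8-wide batches and ud_flip is applied when the result is
+// added to the destination.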
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2];
+ int16x8_t b[16 * 2];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+  lowbd_inv_txfm2d_memset_neon(&a[0], ((txfm_size_col * txfm_size_row) >> 3),
+                               0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
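+// Dedicated 4x4 path: both passes go through the int32-based 1-D transform
+// tables and the eob hint is ignored.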
+static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
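+// General (non-identity) 2-D inverse transform for the larger (wxh) block
+// sizes, using 32-bit intermediates and eob-based row skipping.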
+static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby, ud_flip, lr_flip, row_start;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+ const int bd = 8;
+ int r;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ row_start = (buf_size_nonzero_h_div8 << 3);
+
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) {
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+  // Zero out the rows that were not processed by the row transform.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start));
+
+ for (int c = 0; c < txfm_size_col; c++) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
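+// General (non-identity) 2-D inverse transform: the input is loaded into
+// 16-bit NEON vectors, transposed in 8x8 tiles, row-transformed, transposed
+// back, column-transformed, and added to the destination.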
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[64 * 8];
+ int16x8_t b[64 * 8];
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;
+
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ flip_buf_ud_neon(&a[k], 8);
+ transpose_s16_8x8q(
+ &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
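+// Dispatch the wxh (large block) paths by transform class: full identity
+// (IDTX), vertical identity (H_DCT / H_ADST / H_FLIPADST), horizontal
+// identity (V_DCT / V_ADST / V_FLIPADST), or the general transform.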
+static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_type) {
+ case IDTX:
+ lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ default:
+ lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_universe_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_type) {
+ case IDTX:
+ lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+
+ default:
+ lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
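+// Top-level low-bitdepth 2-D inverse transform + add. Small sizes get
+// dedicated kernels; sizes with a 64-point dimension go through the wxh
+// dispatcher (64-wide inputs are first zero-extended to 64 columns);
+// everything else uses the universe dispatcher.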
+void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ int row;
+ switch (tx_size) {
+ case TX_4X4:
+ lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_4X8:
+ lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_8X4:
+ lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_4X16:
+ lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_16X4:
+ lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+
+ case TX_16X64: {
+ lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
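+    // Only the lowest 32 coefficients are coded in each 64-point dimension;
+    // 64-wide inputs are expanded into a zero-padded 64-column buffer first.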
+ case TX_64X16: {
+ int32_t mod_input[64 * 16];
+ for (row = 0; row < 16; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ case TX_32X64: {
+ lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ case TX_64X32: {
+ int32_t mod_input[64 * 32];
+ for (row = 0; row < 32; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ case TX_64X64: {
+ int32_t mod_input[64 * 64];
+ for (row = 0; row < 32; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ lowbd_inv_txfm2d_add_wxh_universe_neon(mod_input, output, stride, tx_type,
+ tx_size, eob);
+ } break;
+
+ default:
+ lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
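+
+// Lossless blocks fall back to the C implementation; everything else uses
+// the NEON 2-D inverse transform above.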
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (!txfm_param->lossless) {
+ av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ } else {
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
new file mode 100644
index 000000000..9ec658291
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+
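+// 1-D inverse transform operating on 32-bit coefficient arrays.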
+typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
+ const int8_t cos_bit,
+ const int8_t *stage_ptr);
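+// 1-D inverse transform operating on blocks of eight 16-bit coefficients.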
+typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit);
+
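+// Default-scan lookup tables: indexed by the row of the last nonzero
+// coefficient, each 16-bit entry packs the last nonzero column (eobx) in the
+// low byte and the last nonzero row (eoby) in the high byte.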
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (sizes of 64 map to 32).
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
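+// Expands an eob-derived index to the end of its coefficient group
+// (0, 7, 15 or 31).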
+static int eob_fill[32] = {
+ 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ if (eob == 1) {
+ *eobx = 0;
+ *eoby = 0;
+ return;
+ }
+
+ const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+ const int eob_row = (eob - 1) >> tx_w_log2;
+ const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+ *eobx = eobxy & 0xFF;
+ *eoby = eobxy >> 8;
+}
+
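+// For the identity scan orders, split the 1-D eob directly into horizontal
+// (eobx) and vertical (eoby) extents.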
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+ *eobx = eob / (eoby_max + 1);
+ *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
+ *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
+ const int temp_eoby = eob / (eobx_max + 1);
+ assert(temp_eoby < 32);
+ *eoby = eob_fill[temp_eoby];
+}
+
+#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/av1_txfm_neon.c b/third_party/aom/av1/common/arm/av1_txfm_neon.c
new file mode 100644
index 000000000..de3c54724
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_txfm_neon.c
@@ -0,0 +1,28 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+
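+// Round-shift each element by 'bit' (a rounding right shift for positive
+// 'bit'); 'size' must be a multiple of four.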
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit) {
+ assert(!(size % 4));
+ if (!bit) return;
+ const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit));
+ for (int i = 0; i < size; i += 4) {
+ int32x4_t tmp_q_s32 = vld1q_s32(arr);
+ tmp_q_s32 = vrshlq_s32(tmp_q_s32, dup_bits_n_32x4);
+ vst1q_s32(arr, tmp_q_s32);
+ arr += 4;
+ }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
new file mode 100644
index 000000000..7134f183e
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -0,0 +1,134 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "config/aom_dsp_rtcd.h"
+
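+// Blend src0 and src1 with a horizontal 1-D mask (one weight per column):
+// dst = ROUND_POWER_OF_TWO(mask * src0 + (64 - mask) * src1,
+//                          AOM_BLEND_A64_ROUND_BITS).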
+void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 2);
+ assert(w >= 2);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+ uint8x8_t tmp0, tmp1;
+ uint8x16_t res_q;
+ uint16x8_t res, res_low, res_high;
+ uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+ uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
+ const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);
+
+ if (w >= 16) {
+ const uint8x16_t vdup_64_q = vdupq_n_u8((uint8_t)64);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __builtin_prefetch(src0);
+ __builtin_prefetch(src1);
+ const uint8x16_t tmp0_q = vld1q_u8(src0);
+ const uint8x16_t tmp1_q = vld1q_u8(src1);
+ const uint8x16_t m_q = vld1q_u8(mask);
+ const uint8x16_t max_minus_m_q = vsubq_u8(vdup_64_q, m_q);
+ res_low = vmull_u8(vget_low_u8(m_q), vget_low_u8(tmp0_q));
+ res_low =
+ vmlal_u8(res_low, vget_low_u8(max_minus_m_q), vget_low_u8(tmp1_q));
+ res_high = vmull_u8(vget_high_u8(m_q), vget_high_u8(tmp0_q));
+ res_high = vmlal_u8(res_high, vget_high_u8(max_minus_m_q),
+ vget_high_u8(tmp1_q));
+ res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
+ vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
+ vst1q_u8(dst, res_q);
+ src0 += 16;
+ src1 += 16;
+ dst += 16;
+ mask += 16;
+ }
+ src0 += src0_stride - w;
+ src1 += src1_stride - w;
+ dst += dst_stride - w;
+ mask -= w;
+ }
+ } else if (w == 8) {
+ const uint8x8_t m = vld1_u8(mask);
+ const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+ for (int i = 0; i < h; ++i) {
+ __builtin_prefetch(src0);
+ __builtin_prefetch(src1);
+ tmp0 = vld1_u8(src0);
+ tmp1 = vld1_u8(src1);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else if (w == 4) {
+ const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask));
+ const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+ for (int i = 0; i < h; i += 2) {
+ __builtin_prefetch(src0 + 0 * src0_stride);
+ __builtin_prefetch(src0 + 1 * src0_stride);
+ __builtin_prefetch(src1 + 0 * src1_stride);
+ __builtin_prefetch(src1 + 1 * src1_stride);
+ load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
+ tmp0 = vreinterpret_u8_u32(tmp0_32);
+ load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
+ tmp1 = vreinterpret_u8_u32(tmp1_32);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_lane_u32(
+ (uint32_t *)(dst + (0 * dst_stride)),
+ vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
+ vst1_lane_u32(
+ (uint32_t *)(dst + (1 * dst_stride)),
+ vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+ src0 += (2 * src0_stride);
+ src1 += (2 * src1_stride);
+ dst += (2 * dst_stride);
+ }
+ } else if (w == 2) {
+ const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));
+ const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+ for (int i = 0; i < h; i += 2) {
+ __builtin_prefetch(src0 + 0 * src0_stride);
+ __builtin_prefetch(src0 + 1 * src0_stride);
+ __builtin_prefetch(src1 + 0 * src1_stride);
+ __builtin_prefetch(src1 + 1 * src1_stride);
+ load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
+ tmp0 = vreinterpret_u8_u16(tmp0_16);
+ load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
+ tmp1 = vreinterpret_u8_u16(tmp1_16);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_lane_u16(
+ (uint16_t *)(dst + (0 * dst_stride)),
+ vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
+ vst1_lane_u16(
+ (uint16_t *)(dst + (1 * dst_stride)),
+ vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+ src0 += (2 * src0_stride);
+ src1 += (2 * src1_stride);
+ dst += (2 * dst_stride);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
new file mode 100644
index 000000000..194e94c8c
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
@@ -0,0 +1,141 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "config/aom_dsp_rtcd.h"
+
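+// Blend src0 and src1 with a vertical 1-D mask (one weight per row):
+// dst = ROUND_POWER_OF_TWO(mask * src0 + (64 - mask) * src1,
+//                          AOM_BLEND_A64_ROUND_BITS).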
+void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ uint8x8_t tmp0, tmp1;
+ uint8x16_t tmp0_q, tmp1_q, res_q;
+ uint16x8_t res, res_low, res_high;
+ uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+ uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 2);
+ assert(w >= 2);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (w >= 16) {
+ for (int i = 0; i < h; ++i) {
+ const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
+ const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
+ for (int j = 0; j < w; j += 16) {
+ __builtin_prefetch(src0);
+ __builtin_prefetch(src1);
+ tmp0_q = vld1q_u8(src0);
+ tmp1_q = vld1q_u8(src1);
+ res_low = vmull_u8(m, vget_low_u8(tmp0_q));
+ res_low = vmlal_u8(res_low, max_minus_m, vget_low_u8(tmp1_q));
+ res_high = vmull_u8(m, vget_high_u8(tmp0_q));
+ res_high = vmlal_u8(res_high, max_minus_m, vget_high_u8(tmp1_q));
+ res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
+ vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
+ vst1q_u8(dst, res_q);
+ src0 += 16;
+ src1 += 16;
+ dst += 16;
+ }
+ src0 += src0_stride - w;
+ src1 += src1_stride - w;
+ dst += dst_stride - w;
+ }
+ } else if (w == 8) {
+ for (int i = 0; i < h; ++i) {
+ __builtin_prefetch(src0);
+ __builtin_prefetch(src1);
+ const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
+ const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
+ tmp0 = vld1_u8(src0);
+ tmp1 = vld1_u8(src1);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else if (w == 4) {
+ for (int i = 0; i < h; i += 2) {
+ __builtin_prefetch(src0 + 0 * src0_stride);
+ __builtin_prefetch(src0 + 1 * src0_stride);
+ __builtin_prefetch(src1 + 0 * src1_stride);
+ __builtin_prefetch(src1 + 1 * src1_stride);
+ const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[i]);
+ const uint16x4_t m2 = vdup_n_u16((uint16_t)mask[i + 1]);
+ const uint8x8_t m = vmovn_u16(vcombine_u16(m1, m2));
+ const uint16x4_t max_minus_m1 = vdup_n_u16(64 - (uint16_t)mask[i]);
+ const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]);
+ const uint8x8_t max_minus_m =
+ vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2));
+ load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
+ tmp0 = vreinterpret_u8_u32(tmp0_32);
+ load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
+ tmp1 = vreinterpret_u8_u32(tmp1_32);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_lane_u32(
+ (uint32_t *)(dst + (0 * dst_stride)),
+ vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
+ vst1_lane_u32(
+ (uint32_t *)(dst + (1 * dst_stride)),
+ vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+ src0 += (2 * src0_stride);
+ src1 += (2 * src1_stride);
+ dst += (2 * dst_stride);
+ }
+ } else if (w == 2) {
+ for (int i = 0; i < h; i += 2) {
+ __builtin_prefetch(src0 + 0 * src0_stride);
+ __builtin_prefetch(src0 + 1 * src0_stride);
+ __builtin_prefetch(src1 + 0 * src1_stride);
+ __builtin_prefetch(src1 + 1 * src1_stride);
+ const uint8x8_t m1 = vdup_n_u8(mask[i]);
+ const uint8x8_t m2 = vdup_n_u8(mask[i + 1]);
+ const uint16x4x2_t m_trn =
+ vtrn_u16(vreinterpret_u16_u8(m1), vreinterpret_u16_u8(m2));
+ const uint8x8_t m = vreinterpret_u8_u16(m_trn.val[0]);
+ const uint8x8_t max_minus_m1 = vdup_n_u8(64 - mask[i]);
+ const uint8x8_t max_minus_m2 = vdup_n_u8(64 - mask[i + 1]);
+ const uint16x4x2_t max_minus_m_trn = vtrn_u16(
+ vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2));
+ const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]);
+ load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
+ tmp0 = vreinterpret_u8_u16(tmp0_16);
+ load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
+ tmp1 = vreinterpret_u8_u16(tmp1_16);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_lane_u16(
+ (uint16_t *)(dst + (0 * dst_stride)),
+ vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);
+ vst1_lane_u16(
+ (uint16_t *)(dst + (1 * dst_stride)),
+ vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);
+ src0 += (2 * src0_stride);
+ src1 += (2 * src1_stride);
+ dst += (2 * dst_stride);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c
new file mode 100644
index 000000000..39025b5e5
--- /dev/null
+++ b/third_party/aom/av1/common/arm/cfl_neon.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
+ int16x8_t sub) {
+ vst1q_s16(dst + offset,
+ vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub));
+}
+
+static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) {
+ return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset));
+}
+
+// Load half of a vector and duplicate it in the other half.
+static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
+ return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr));
+}
+
+// Store half of a vector.
+static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) {
+ *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0];
+}
+
+// Store half of a vector.
+static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
+ *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0];
+}
+
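+// 4:2:0 subsampling: each 2x2 luma block is summed and shifted left by one,
+// producing the Q3 (x8) values expected by the CfL prediction buffer.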
+static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+ const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride));
+ vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1));
+ } else if (width == 8) {
+ const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+ const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride));
+ vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1));
+ } else if (width == 16) {
+ const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+ const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride));
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1));
+ } else {
+ const uint8x8x4_t top = vld4_u8(input);
+ const uint8x8x4_t bot = vld4_u8(input + input_stride);
+ // equivalent to a vpaddlq_u8 (because vld4q interleaves)
+ const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]);
+ // equivalent to a vpaddlq_u8 (because vld4q interleaves)
+ const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]);
+ // equivalent to a vpaddlq_u8 (because vld4q interleaves)
+ const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]);
+ // equivalent to a vpaddlq_u8 (because vld4q interleaves)
+ const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]);
+ uint16x8x2_t sum;
+ sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
+ sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
+ vst2q_u16(pred_buf_q3, sum);
+ }
+ input += luma_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+ vsth_u16(pred_buf_q3, vshl_n_u16(top, 2));
+ } else if (width == 8) {
+ const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+ vst1_u16(pred_buf_q3, vshl_n_u16(top, 2));
+ } else if (width == 16) {
+ const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2));
+ } else {
+ const uint8x8x4_t top = vld4_u8(input);
+ uint16x8x2_t sum;
+ // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves)
+ sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], top.val[1]), 2);
+ sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2);
+ vst2q_u16(pred_buf_q3, sum);
+ }
+ input += input_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3);
+ vst1_u16(pred_buf_q3, vget_low_u16(top));
+ } else if (width == 8) {
+ const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3);
+ vst1q_u16(pred_buf_q3, top);
+ } else {
+ const uint8x16_t top = vld1q_u8(input);
+ vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3));
+ vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3));
+ if (width == 32) {
+ const uint8x16_t next_top = vld1q_u8(input + 16);
+ vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3));
+ vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3));
+ }
+ }
+ input += input_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+#ifndef __aarch64__
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+ return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+ vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ const uint16x4_t top = vld1_u16(input);
+ const uint16x4_t bot = vld1_u16(input + input_stride);
+ const uint16x4_t sum = vadd_u16(top, bot);
+ const uint16x4_t hsum = vpadd_u16(sum, sum);
+ vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1));
+ } else if (width < 32) {
+ const uint16x8_t top = vld1q_u16(input);
+ const uint16x8_t bot = vld1q_u16(input + input_stride);
+ const uint16x8_t sum = vaddq_u16(top, bot);
+ if (width == 8) {
+ const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum));
+ vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1));
+ } else {
+ const uint16x8_t top_1 = vld1q_u16(input + 8);
+ const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride);
+ const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1);
+ const uint16x8_t hsum = vpaddq_u16(sum, sum_1);
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1));
+ }
+ } else {
+ const uint16x8x4_t top = vld4q_u16(input);
+ const uint16x8x4_t bot = vld4q_u16(input + input_stride);
+ // equivalent to a vpaddq_u16 (because vld4q interleaves)
+ const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]);
+ // equivalent to a vpaddq_u16 (because vld4q interleaves)
+ const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]);
+ // equivalent to a vpaddq_u16 (because vld4q interleaves)
+ const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]);
+ // equivalent to a vpaddq_u16 (because vld4q interleaves)
+ const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]);
+ uint16x8x2_t sum;
+ sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1);
+ sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1);
+ vst2q_u16(pred_buf_q3, sum);
+ }
+ input += luma_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const uint16x4_t top = vld1_u16(input);
+ const uint16x4_t hsum = vpadd_u16(top, top);
+ vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2));
+ } else if (width == 8) {
+ const uint16x4x2_t top = vld2_u16(input);
+ // equivalent to a vpadd_u16 (because vld2 interleaves)
+ const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]);
+ vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2));
+ } else if (width == 16) {
+ const uint16x8x2_t top = vld2q_u16(input);
+ // equivalent to a vpaddq_u16 (because vld2q interleaves)
+ const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]);
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2));
+ } else {
+ const uint16x8x4_t top = vld4q_u16(input);
+ // equivalent to a vpaddq_u16 (because vld4q interleaves)
+ const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]);
+ // equivalent to a vpaddq_u16 (because vld4q interleaves)
+ const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]);
+ uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2),
+ vshlq_n_u16(hsum_1, 2) } };
+ vst2q_u16(pred_buf_q3, result);
+ }
+ input += input_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const uint16x4_t top = vld1_u16(input);
+ vst1_u16(pred_buf_q3, vshl_n_u16(top, 3));
+ } else if (width == 8) {
+ const uint16x8_t top = vld1q_u16(input);
+ vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3));
+ } else if (width == 16) {
+ uint16x8x2_t top = vld2q_u16(input);
+ top.val[0] = vshlq_n_u16(top.val[0], 3);
+ top.val[1] = vshlq_n_u16(top.val[1], 3);
+ vst2q_u16(pred_buf_q3, top);
+ } else {
+ uint16x8x4_t top = vld4q_u16(input);
+ top.val[0] = vshlq_n_u16(top.val[0], 3);
+ top.val[1] = vshlq_n_u16(top.val[1], 3);
+ top.val[2] = vshlq_n_u16(top.val[2], 3);
+ top.val[3] = vshlq_n_u16(top.val[3], 3);
+ vst4q_u16(pred_buf_q3, top);
+ }
+ input += input_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(neon)
+
+static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
+ int width, int height,
+ int round_offset,
+ const int num_pel_log2) {
+ const uint16_t *const end = src + height * CFL_BUF_LINE;
+
+ // Round offset is not needed, because NEON will handle the rounding.
+ (void)round_offset;
+
+ // To optimize the use of the CPU pipeline, we process 4 rows per iteration
+ const int step = 4 * CFL_BUF_LINE;
+
+  // At this stage, the prediction buffer contains scaled reconstructed luma
+  // pixels, which are positive integers and only require 15 bits. By using
+  // unsigned integers for the sum, we can do one addition operation inside
+  // 16 bits (8 lanes) before having to convert to 32 bits (4 lanes).
+ const uint16_t *sum_buf = src;
+ uint32x4_t sum_32x4 = { 0, 0, 0, 0 };
+ do {
+ // For all widths, we load, add and combine the data so it fits in 4 lanes.
+ if (width == 4) {
+ const uint16x4_t a0 =
+ vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE));
+ const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE),
+ vld1_u16(sum_buf + 3 * CFL_BUF_LINE));
+ sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1));
+ } else if (width == 8) {
+ const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE);
+ const uint16x8_t a1 =
+ vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE);
+ sum_32x4 = vpadalq_u16(sum_32x4, a0);
+ sum_32x4 = vpadalq_u16(sum_32x4, a1);
+ } else {
+ const uint16x8_t row0 = vldaddq_u16(sum_buf, 8);
+ const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8);
+ const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8);
+ const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8);
+ sum_32x4 = vpadalq_u16(sum_32x4, row0);
+ sum_32x4 = vpadalq_u16(sum_32x4, row1);
+ sum_32x4 = vpadalq_u16(sum_32x4, row2);
+ sum_32x4 = vpadalq_u16(sum_32x4, row3);
+
+ if (width == 32) {
+ const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8);
+ const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8);
+ const uint16x8_t row2_1 =
+ vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8);
+ const uint16x8_t row3_1 =
+ vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8);
+
+ sum_32x4 = vpadalq_u16(sum_32x4, row0_1);
+ sum_32x4 = vpadalq_u16(sum_32x4, row1_1);
+ sum_32x4 = vpadalq_u16(sum_32x4, row2_1);
+ sum_32x4 = vpadalq_u16(sum_32x4, row3_1);
+ }
+ }
+ sum_buf += step;
+ } while (sum_buf < end);
+
+ // Permute and add in such a way that each lane contains the block sum.
+ // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
+#ifdef __aarch64__
+ sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
+ sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
+#else
+ uint32x4_t flip =
+ vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4));
+ sum_32x4 = vaddq_u32(sum_32x4, flip);
+ sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4));
+#endif
+
+  // Computing the average could be done using scalars, but getting off the
+  // NEON engine introduces latency, so we use vqrshrn.
+ int16x4_t avg_16x4;
+ // Constant propagation makes for some ugly code.
+ switch (num_pel_log2) {
+ case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break;
+ case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break;
+ case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break;
+ case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break;
+ case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break;
+ case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break;
+ case 10:
+ avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10));
+ break;
+ default: assert(0);
+ }
+
+ if (width == 4) {
+ do {
+ vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4));
+ src += CFL_BUF_LINE;
+ dst += CFL_BUF_LINE;
+ } while (src < end);
+ } else {
+ const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4);
+ do {
+ vldsubstq_s16(dst, src, 0, avg_16x8);
+ vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8);
+
+ if (width > 8) {
+ vldsubstq_s16(dst, src, 8, avg_16x8);
+ vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8);
+ }
+ if (width == 32) {
+ vldsubstq_s16(dst, src, 16, avg_16x8);
+ vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 24, avg_16x8);
+ vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8);
+ vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8);
+ }
+ src += step;
+ dst += step;
+ } while (src < end);
+ }
+}
+
+CFL_SUB_AVG_FN(neon)
+
+// Saturating negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+// Notes:
+// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
+// practice, as scaled_luma is the multiplication of two absolute values.
+// * In the Intel equivalent, elements in a are zeroed out when the
+// corresponding elements in b are zero. Because vsign is used twice in a
+// row, with b in the first call becoming a in the second call, there's no
+// impact from not zeroing out.
+static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) {
+ const int16x4_t mask = vshr_n_s16(b, 15);
+ return veor_s16(vadd_s16(a, mask), mask);
+}
+
+// Saturating negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+// Notes:
+// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
+// practice, as scaled_luma is the multiplication of two absolute values.
+// * In the Intel equivalent, elements in a are zeroed out when the
+// corresponding elements in b are zero. Because vsignq is used twice in a
+// row, with b in the first call becoming a in the second call, there's no
+// impact from not zeroing out.
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) {
+ const int16x8_t mask = vshrq_n_s16(b, 15);
+ return veorq_s16(vaddq_s16(a, mask), mask);
+}
+
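+// CfL prediction for four pixels: scale the Q3 AC contribution by |alpha| in
+// Q12 with a rounding doubling multiply-high, restore the sign, and add the
+// DC value.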
+static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
+ int16x4_t alpha_sign, int abs_alpha_q12,
+ int16x4_t dc) {
+ const int16x4_t ac_q3 = vld1_s16(pred_buf_q3);
+ const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3);
+ int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12);
+ return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc);
+}
+
+static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
+ int16x8_t alpha_sign, int abs_alpha_q12,
+ int16x8_t dc) {
+ const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3);
+ const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3);
+ int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12);
+ return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc);
+}
+
+static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
+ int16x8_t alpha_sign, int abs_alpha_q12,
+ int16x8_t dc) {
+ // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2
+ // does not interleave, but is not currently available in the compiler used
+ // by the AOM build system.
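+ // The de-interleave is harmless here because the matching interleaving
+ // store (vst2_u8 / vst2q_u16) in the callers puts the lanes back in order.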
+ const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3);
+ const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
+ const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
+ const int16x8_t scaled_luma_0 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
+ const int16x8_t scaled_luma_1 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12);
+ int16x8x2_t result;
+ result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc);
+ result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc);
+ return result;
+}
+
+static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
+ int16x8_t alpha_sign, int abs_alpha_q12,
+ int16x8_t dc) {
+ // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4
+ // does not interleave, but is not currently available in the compiler used
+ // by the AOM build system.
+ const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3);
+ const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]);
+ const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]);
+ const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]);
+ const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]);
+ const int16x8_t scaled_luma_0 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12);
+ const int16x8_t scaled_luma_1 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12);
+ const int16x8_t scaled_luma_2 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12);
+ const int16x8_t scaled_luma_3 =
+ vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12);
+ int16x8x4_t result;
+ result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc);
+ result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc);
+ result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc);
+ result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc);
+ return result;
+}
+
+static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {
+ const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+ const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+ if (width == 4) {
+ const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
+ const int16x4_t dc = vdup_n_s16(*dst);
+ do {
+ const int16x4_t pred =
+ predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred)));
+ dst += dst_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+ } else {
+ const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
+ const int16x8_t dc = vdupq_n_s16(*dst);
+ do {
+ if (width == 8) {
+ vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign,
+ abs_alpha_q12, dc)));
+ } else if (width == 16) {
+ const int16x8x2_t pred =
+ predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]),
+ vqmovun_s16(pred.val[1]) } };
+ vst2_u8(dst, predun);
+ } else {
+ const int16x8x4_t pred =
+ predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ const uint8x8x4_t predun = {
+ { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]),
+ vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) }
+ };
+ vst4_u8(dst, predun);
+ }
+ dst += dst_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+ }
+}
+
+CFL_PREDICT_FN(neon, lbd)
+
+static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
+ return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0)));
+}
+
+static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) {
+ return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0)));
+}
+
+static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) {
+ uint16x8x2_t result;
+ result.val[0] = vreinterpretq_u16_s16(
+ vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0)));
+ result.val[1] = vreinterpretq_u16_s16(
+ vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0)));
+ return result;
+}
+
+static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) {
+ uint16x8x4_t result;
+ result.val[0] = vreinterpretq_u16_s16(
+ vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0)));
+ result.val[1] = vreinterpretq_u16_s16(
+ vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0)));
+ result.val[2] = vreinterpretq_u16_s16(
+ vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0)));
+ result.val[3] = vreinterpretq_u16_s16(
+ vmaxq_s16(vminq_s16(a.val[3], max), vdupq_n_s16(0)));
+ return result;
+}
+
+static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {
+ const int max = (1 << bd) - 1;
+ const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+ const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE;
+ if (width == 4) {
+ const int16x4_t alpha_sign = vdup_n_s16(alpha_q3);
+ const int16x4_t dc = vdup_n_s16(*dst);
+ const int16x4_t max_16x4 = vdup_n_s16(max);
+ do {
+ const int16x4_t scaled_luma =
+ predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ vst1_u16(dst, clamp_s16(scaled_luma, max_16x4));
+ dst += dst_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+ } else {
+ const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3);
+ const int16x8_t dc = vdupq_n_s16(*dst);
+ const int16x8_t max_16x8 = vdupq_n_s16(max);
+ do {
+ if (width == 8) {
+ const int16x8_t pred =
+ predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ vst1q_u16(dst, clampq_s16(pred, max_16x8));
+ } else if (width == 16) {
+ const int16x8x2_t pred =
+ predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ vst2q_u16(dst, clamp2q_s16(pred, max_16x8));
+ } else {
+ const int16x8x4_t pred =
+ predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc);
+ vst4q_u16(dst, clamp4q_s16(pred, max_16x8));
+ }
+ dst += dst_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+ }
+}
+
+CFL_PREDICT_FN(neon, hbd)
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
new file mode 100644
index 000000000..d0c4f8ff6
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -0,0 +1,1455 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16_t *filter) {
+ int16x4_t sum;
+
+ sum = vmul_n_s16(s0, filter[0]);
+ sum = vmla_n_s16(sum, s1, filter[1]);
+ sum = vmla_n_s16(sum, s2, filter[2]);
+ sum = vmla_n_s16(sum, s5, filter[5]);
+ sum = vmla_n_s16(sum, s6, filter[6]);
+ sum = vmla_n_s16(sum, s7, filter[7]);
+ /* filter[3] can take a max value of 128, so 128 * 255 plus the accumulated
+ * sum can exceed the 16-bit range; fold in the s3 and s4 taps with
+ * saturating adds.
+ */
+ sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
+ sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_horiz_8x8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+ const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
+ int16x8_t sum;
+
+ sum = vmulq_n_s16(s0, filter[0]);
+ sum = vmlaq_n_s16(sum, s1, filter[1]);
+ sum = vmlaq_n_s16(sum, s2, filter[2]);
+ sum = vmlaq_n_s16(sum, s5, filter[5]);
+ sum = vmlaq_n_s16(sum, s6, filter[6]);
+ sum = vmlaq_n_s16(sum, s7, filter[7]);
+ /* filter[3] can take a max value of 128, so 128 * 255 plus the accumulated
+ * sum can exceed the 16-bit range; fold in the s3 and s4 taps with
+ * saturating adds.
+ */
+ sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
+ sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+
+ sum = vqrshlq_s16(sum, shift_round_0);
+ sum = vqrshlq_s16(sum, shift_by_bits);
+
+ return vqmovun_s16(sum);
+}
+
+#if !defined(__aarch64__)
+static INLINE uint8x8_t convolve8_horiz_4x1(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+ const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+ int16x4_t sum;
+
+ sum = vmul_n_s16(s0, filter[0]);
+ sum = vmla_n_s16(sum, s1, filter[1]);
+ sum = vmla_n_s16(sum, s2, filter[2]);
+ sum = vmla_n_s16(sum, s5, filter[5]);
+ sum = vmla_n_s16(sum, s6, filter[6]);
+ sum = vmla_n_s16(sum, s7, filter[7]);
+ /* filter[3] can take a max value of 128, so 128 * 255 plus the accumulated
+ * sum can exceed the 16-bit range; fold in the s3 and s4 taps with
+ * saturating adds.
+ */
+ sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3]));
+ sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4]));
+
+ sum = vqrshl_s16(sum, shift_round_0);
+ sum = vqrshl_s16(sum, shift_by_bits);
+
+ return vqmovun_s16(vcombine_s16(sum, sum));
+}
+#endif // !defined(__aarch64__)
+
+static INLINE uint8x8_t convolve8_vert_8x4(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
+ int16x8_t sum;
+
+ sum = vmulq_n_s16(s0, filter[0]);
+ sum = vmlaq_n_s16(sum, s1, filter[1]);
+ sum = vmlaq_n_s16(sum, s2, filter[2]);
+ sum = vmlaq_n_s16(sum, s5, filter[5]);
+ sum = vmlaq_n_s16(sum, s6, filter[6]);
+ sum = vmlaq_n_s16(sum, s7, filter[7]);
+ /* filter[3] can take a max value of 128, so 128 * 255 plus the accumulated
+ * sum can exceed the 16-bit range; fold in the s3 and s4 taps with
+ * saturating adds.
+ */
+ sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3]));
+ sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4]));
+
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+static INLINE uint16x4_t convolve8_vert_4x4_s32(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+ const int32x4_t round_shift_vec, const int32x4_t offset_const,
+ const int32x4_t sub_const_vec) {
+ int32x4_t sum0;
+ uint16x4_t res;
+ const int32x4_t zero = vdupq_n_s32(0);
+
+ sum0 = vmull_n_s16(s0, y_filter[0]);
+ sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
+ sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
+ sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
+ sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
+ sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
+ sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
+ sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+
+ sum0 = vaddq_s32(sum0, offset_const);
+ sum0 = vqrshlq_s32(sum0, round_shift_vec);
+ sum0 = vsubq_s32(sum0, sub_const_vec);
+ sum0 = vmaxq_s32(sum0, zero);
+
+ res = vmovn_u32(vreinterpretq_u32_s32(sum0));
+
+ return res;
+}
+
+static INLINE uint8x8_t convolve8_vert_8x4_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
+ const int32x4_t round_shift_vec, const int32x4_t offset_const,
+ const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
+ int32x4_t sum0, sum1;
+ uint16x8_t res;
+ const int32x4_t zero = vdupq_n_s32(0);
+
+ sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]);
+
+ sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]);
+
+ sum0 = vaddq_s32(sum0, offset_const);
+ sum1 = vaddq_s32(sum1, offset_const);
+ sum0 = vqrshlq_s32(sum0, round_shift_vec);
+ sum1 = vqrshlq_s32(sum1, round_shift_vec);
+ sum0 = vsubq_s32(sum0, sub_const_vec);
+ sum1 = vsubq_s32(sum1, sub_const_vec);
+ sum0 = vmaxq_s32(sum0, zero);
+ sum1 = vmaxq_s32(sum1, zero);
+ res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)),
+ vqmovn_u32(vreinterpretq_u32_s32(sum1)));
+
+ res = vqrshlq_u16(res, vec_round_bits);
+
+ return vqmovn_u16(res);
+}
+
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ const int8_t bits = FILTER_BITS - conv_params->round_0;
+
+ (void)subpel_y_q4;
+ (void)conv_params;
+ (void)filter_params_y;
+
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3;
+#endif
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0);
+ const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
+
+ src -= horiz_offset;
+#if defined(__aarch64__)
+ if (h == 4) {
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t d01_temp, d23_temp;
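+ // The 4-row case works on transposed data: four rows are loaded and
+ // transposed so each vector holds samples from all four rows at one x
+ // position, the horizontal filter is then applied with plain vector ops,
+ // and the results are transposed back before being stored.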
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7;
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
+
+ d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
+
+ d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
+
+ d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
+
+ d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0);
+ d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
+
+ d01_temp = vqrshlq_s16(d01_temp, shift_by_bits);
+ d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
+
+ d01 = vqmovun_s16(d01_temp);
+ d23 = vqmovun_s16(d23_temp);
+
+ transpose_u8_4x4(&d01, &d23);
+
+ if (w != 2) {
+ vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), // 00 01 02 03
+ vreinterpret_u32_u8(d01), 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), // 10 11 12 13
+ vreinterpret_u32_u8(d23), 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), // 20 21 22 23
+ vreinterpret_u32_u8(d01), 1);
+ vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), // 30 31 32 33
+ vreinterpret_u32_u8(d23), 1);
+ } else {
+ vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride), // 00 01
+ vreinterpret_u16_u8(d01), 0);
+ vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride), // 10 11
+ vreinterpret_u16_u8(d23), 0);
+ vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride), // 20 21
+ vreinterpret_u16_u8(d01), 2);
+ vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride), // 30 31
+ vreinterpret_u16_u8(d23), 2);
+ }
+
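+ // Slide the window of source samples four positions to the right for the
+ // next four output columns.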
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w > 0);
+ } else {
+#endif
+ int width;
+ const uint8_t *s;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t t4, t5, t6, t7;
+#endif
+
+ if (w <= 4) {
+#if defined(__aarch64__)
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ shift_round_0, shift_by_bits);
+ t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ shift_round_0, shift_by_bits);
+ t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ shift_round_0, shift_by_bits);
+ t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ shift_round_0, shift_by_bits);
+
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ if ((w == 4) && (h > 4)) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+ 0); // 10 11 12 13
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
+ 0); // 20 21 22 23
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
+ 0); // 30 31 32 33
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 1); // 40 41 42 43
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+ 1); // 50 51 52 53
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
+ 1); // 60 61 62 63
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
+ 1); // 70 71 72 73
+ dst += dst_stride;
+ } else if ((w == 4) && (h == 2)) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+ 0); // 10 11 12 13
+ dst += dst_stride;
+ } else if ((w == 2) && (h > 4)) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0); // 20 21
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0); // 30 31
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2); // 40 41
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2); // 50 51
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2); // 60 61
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2); // 70 71
+ dst += dst_stride;
+ } else if ((w == 2) && (h == 2)) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11
+ dst += dst_stride;
+ }
+ h -= 8;
+ } while (h > 0);
+#else
+ int16x8_t tt0;
+ int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+ const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
+ const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
+ do {
+ t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ x0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ x4 = vget_high_s16(tt0); // a4 a5 a6 a7
+
+ t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ x7 = vget_low_s16(tt0); // a8 a9 a10 a11
+
+ x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4
+ x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5
+ x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6
+ x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8
+ x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9
+ x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10
+
+ src += src_stride;
+
+ t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
+ shift_round_0_low, shift_by_bits_low);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
+ dst += dst_stride;
+ }
+ h -= 1;
+ } while (h > 0);
+#endif
+ } else {
+ uint8_t *d;
+ int16x8_t s11;
+#if defined(__aarch64__)
+ int16x8_t s12, s13, s14;
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, shift_round_0, shift_by_bits);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ if (h != 2) {
+ store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+ } else {
+ store_row2_u8_8x8(d, dst_stride, t0, t1);
+ }
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+#else
+ do {
+ t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ width = w;
+ s = src + 8;
+ d = dst;
+ __builtin_prefetch(dst);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s11 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ shift_round_0, shift_by_bits);
+ vst1_u8(d, t0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 1;
+ } while (h > 0);
+#endif
+ }
+#if defined(__aarch64__)
+ }
+#endif
+}
+
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+
+ src -= vert_offset * src_stride;
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+ if (w <= 4) {
+ uint8x8_t d01;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+ uint8x8_t d23;
+ int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
+ s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+#if defined(__aarch64__)
+ s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ if ((w == 4) && (h != 2)) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
+ 0); // 20 21 22 23
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
+ 1); // 30 31 32 33
+ dst += dst_stride;
+ } else if ((w == 4) && (h == 2)) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ dst += dst_stride;
+ } else if ((w == 2) && (h != 2)) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0); // 20 21
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2); // 30 31
+ dst += dst_stride;
+ } else if ((w == 2) && (h == 2)) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11
+ dst += dst_stride;
+ }
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+#else
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+
+ d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+ dst += dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
+ dst += dst_stride;
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ h -= 1;
+#endif
+ } while (h > 0);
+ } else {
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t t0;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3;
+ int16x8_t s8, s9, s10;
+#endif
+ do {
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ d = dst;
+ height = h;
+
+ do {
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+#if defined(__aarch64__)
+ s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+ if (h != 2) {
+ vst1_u8(d, t0);
+ d += dst_stride;
+ vst1_u8(d, t1);
+ d += dst_stride;
+ vst1_u8(d, t2);
+ d += dst_stride;
+ vst1_u8(d, t3);
+ d += dst_stride;
+ } else {
+ vst1_u8(d, t0);
+ d += dst_stride;
+ vst1_u8(d, t1);
+ d += dst_stride;
+ }
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ __builtin_prefetch(d);
+ __builtin_prefetch(s);
+
+ t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int im_dst_stride;
+ int width, height;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
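+ // The 2D filter runs in two passes: the horizontal filter writes a 16-bit
+ // intermediate block (im_block, MAX_SB_SIZE stride) covering
+ // h + vertical taps - 1 rows, and the vertical pass below filters that
+ // block down to the final 8-bit output.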
+
+ const int bd = 8;
+ const int im_h = h + filter_params_y->taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ const uint8_t *s;
+ int16_t *dst_ptr;
+
+ dst_ptr = im_block;
+ im_dst_stride = im_stride;
+ height = im_h;
+ width = w;
+
+ const int16_t round_bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+ int16_t x_filter_tmp[8];
+ int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+ // The filter coefficients are all even, so downshift them by 1 to reduce
+ // the intermediate precision requirements.
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+ vst1q_s16(&x_filter_tmp[0], filter_x_coef);
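+ // The lost factor of two is compensated for below: the horizontal offset
+ // (horiz_const) is likewise halved to 1 << (bd + FILTER_BITS - 2) and the
+ // first rounding shift is reduced from round_0 to round_0 - 1.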
+
+ assert(conv_params->round_0 > 0);
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
+
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
+
+ do {
+ s = src_ptr;
+
+#if defined(__aarch64__)
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+ s += 7;
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+ if (w == 4) {
+ vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
+ } else if (w == 2) {
+ vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
+ vreinterpret_u32_s16(d0), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
+ vreinterpret_u32_s16(d1), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
+ vreinterpret_u32_s16(d2), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
+ vreinterpret_u32_s16(d3), 0);
+ }
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * im_dst_stride;
+ height -= 4;
+#else
+ int16x8_t tt0;
+
+ __builtin_prefetch(s);
+
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s0 = vget_low_s16(tt0);
+ s4 = vget_high_s16(tt0);
+
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ if (w == 4) {
+ vst1_s16(dst_ptr, d0);
+ dst_ptr += im_dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+ dst_ptr += im_dst_stride;
+ }
+
+ src_ptr += src_stride;
+ height -= 1;
+#endif
+ } while (height > 0);
+ } else {
+ int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
+ int16x8_t s11, s12, s13, s14;
+#endif
+
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
+
+#if defined(__aarch64__)
+ do {
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
+
+ __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_tmp, horiz_const, shift_round_0);
+
+ transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7);
+
+ store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
+ res6, res7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * im_dst_stride;
+ height -= 8;
+ } while (height > 0);
+#else
+ do {
+ t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t sum = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ vst1q_s16(d_tmp, res0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += im_dst_stride;
+ height -= 1;
+ } while (height > 0);
+#endif
+ }
+
+ // vertical
+ {
+ uint8_t *dst_u8_ptr, *d_u8;
+ int16_t *v_src_ptr, *v_s;
+
+ const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
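+ // offset_const biases the accumulator so the intermediate values stay
+ // non-negative; sub_const_vec removes, after the round_1 shift, both that
+ // bias and the horizontal-stage offset that was folded into im_block.
+ // (A negative result can still occur for extreme filters, hence the clamp
+ // against zero in the convolve helpers.)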
+
+ src_stride = im_stride;
+ v_src_ptr = im_block;
+ dst_u8_ptr = dst;
+
+ height = h;
+ width = w;
+
+ if (width <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t d0;
+ uint16x8_t dd0;
+ uint8x8_t d01;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t d1, d2, d3;
+ uint16x8_t dd1;
+ uint8x8_t d23;
+#endif
+
+ d_u8 = dst_u8_ptr;
+ v_s = v_src_ptr;
+
+ __builtin_prefetch(v_s + 0 * im_stride);
+ __builtin_prefetch(v_s + 1 * im_stride);
+ __builtin_prefetch(v_s + 2 * im_stride);
+ __builtin_prefetch(v_s + 3 * im_stride);
+ __builtin_prefetch(v_s + 4 * im_stride);
+ __builtin_prefetch(v_s + 5 * im_stride);
+ __builtin_prefetch(v_s + 6 * im_stride);
+ __builtin_prefetch(v_s + 7 * im_stride);
+
+ load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ v_s += (7 * im_stride);
+
+ do {
+#if defined(__aarch64__)
+ load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+ v_s += (im_stride << 2);
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+ __builtin_prefetch(d_u8 + 1 * dst_stride);
+ __builtin_prefetch(d_u8 + 2 * dst_stride);
+ __builtin_prefetch(d_u8 + 3 * dst_stride);
+
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
+ dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
+
+ d01 = vqmovn_u16(dd0);
+ d23 = vqmovn_u16(dd1);
+
+ if ((w == 4) && (h != 2)) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+ 0); // 20 21 22 23
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+ 1); // 30 31 32 33
+ d_u8 += dst_stride;
+ } else if ((w == 2) && (h != 2)) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 2); // 10 11
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+ 0); // 20 21
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+ 2); // 30 31
+ d_u8 += dst_stride;
+ } else if ((w == 4) && (h == 2)) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ d_u8 += dst_stride;
+ } else if ((w == 2) && (h == 2)) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 2); // 10 11
+ d_u8 += dst_stride;
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ s7 = vld1_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+ d01 = vqmovn_u16(dd0);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
+ } while (height > 0);
+ } else {
+ // if width is a multiple of 8 & height is a multiple of 4
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t res1, res2, res3;
+#endif
+
+ do {
+ __builtin_prefetch(v_src_ptr + 0 * im_stride);
+ __builtin_prefetch(v_src_ptr + 1 * im_stride);
+ __builtin_prefetch(v_src_ptr + 2 * im_stride);
+ __builtin_prefetch(v_src_ptr + 3 * im_stride);
+ __builtin_prefetch(v_src_ptr + 4 * im_stride);
+ __builtin_prefetch(v_src_ptr + 5 * im_stride);
+ __builtin_prefetch(v_src_ptr + 6 * im_stride);
+ __builtin_prefetch(v_src_ptr + 7 * im_stride);
+
+ v_s = v_src_ptr;
+ load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ v_s += (7 * im_stride);
+
+ d_u8 = dst_u8_ptr;
+ height = h;
+
+ do {
+#if defined(__aarch64__)
+ load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+ v_s += (im_stride << 2);
+
+ __builtin_prefetch(d_u8 + 4 * dst_stride);
+ __builtin_prefetch(d_u8 + 5 * dst_stride);
+ __builtin_prefetch(d_u8 + 6 * dst_stride);
+ __builtin_prefetch(d_u8 + 7 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ if (h != 2) {
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res1);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res2);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res3);
+ d_u8 += dst_stride;
+ } else {
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res1);
+ d_u8 += dst_stride;
+ }
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ s7 = vld1q_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
+ } while (height > 0);
+ v_src_ptr += 8;
+ dst_u8_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ }
+}
+
+void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ const uint8_t *src1;
+ uint8_t *dst1;
+ int y;
+
+ if (!(w & 0x0F)) {
+ for (y = 0; y < h; ++y) {
+ src1 = src;
+ dst1 = dst;
+ for (int x = 0; x < (w >> 4); ++x) {
+ vst1q_u8(dst1, vld1q_u8(src1));
+ src1 += 16;
+ dst1 += 16;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x07)) {
+ for (y = 0; y < h; ++y) {
+ vst1_u8(dst, vld1_u8(src));
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x03)) {
+ for (y = 0; y < h; ++y) {
+ vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x01)) {
+ for (y = 0; y < h; ++y) {
+ vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h
new file mode 100644
index 000000000..f382984f2
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07)
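+// SUBPEL_TAPS rounded up to a multiple of 8: the extra intermediate rows
+// the horizontal pass produces so the vertical filter has enough input.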
+
+static INLINE uint8x8_t wiener_convolve8_vert_4x8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, int16_t *filter_y, const int bd,
+ const int round1_bits) {
+ int16x8_t ss0, ss1, ss2;
+ int32x4_t sum0, sum1;
+ uint16x4_t tmp0, tmp1;
+ uint16x8_t tmp;
+ uint8x8_t res;
+
+ const int32_t round_const = (1 << (bd + round1_bits - 1));
+ const int32x4_t round_bits = vdupq_n_s32(-round1_bits);
+ const int32x4_t zero = vdupq_n_s32(0);
+ const int32x4_t round_vec = vdupq_n_s32(round_const);
+
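+ // The 7-tap Wiener filter is symmetric, so symmetric input pairs are
+ // summed first and only four multiplies per half are needed.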
+ ss0 = vaddq_s16(s0, s6);
+ ss1 = vaddq_s16(s1, s5);
+ ss2 = vaddq_s16(s2, s4);
+
+ sum0 = vmull_n_s16(vget_low_s16(ss0), filter_y[0]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(ss1), filter_y[1]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(ss2), filter_y[2]);
+ sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), filter_y[3]);
+
+ sum1 = vmull_n_s16(vget_high_s16(ss0), filter_y[0]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(ss1), filter_y[1]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(ss2), filter_y[2]);
+ sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), filter_y[3]);
+
+ sum0 = vsubq_s32(sum0, round_vec);
+ sum1 = vsubq_s32(sum1, round_vec);
+
+ /* right shift & rounding */
+ sum0 = vrshlq_s32(sum0, round_bits);
+ sum1 = vrshlq_s32(sum1, round_bits);
+
+ sum0 = vmaxq_s32(sum0, zero);
+ sum1 = vmaxq_s32(sum1, zero);
+
+ /* from int32x4_t to uint8x8_t */
+ tmp0 = vqmovn_u32(vreinterpretq_u32_s32(sum0));
+ tmp1 = vqmovn_u32(vreinterpretq_u32_s32(sum1));
+ tmp = vcombine_u16(tmp0, tmp1);
+ res = vqmovn_u16(tmp);
+
+ return res;
+}
+
+static INLINE uint16x8_t wiener_convolve8_horiz_8x8(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, int16_t *filter_x, const int bd,
+ const int round0_bits) {
+ int16x8_t sum;
+ uint16x8_t res;
+ int32x4_t sum_0, sum_1;
+ int32x4_t s3_0, s3_1;
+ const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
+ const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits));
+
+ /* for the purpose of right shift by { conv_params->round_0 } */
+ const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
+
+ const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
+ const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+
+ sum = vmulq_n_s16(s0, filter_x[0]);
+ sum = vmlaq_n_s16(sum, s1, filter_x[1]);
+ sum = vmlaq_n_s16(sum, s2, filter_x[2]);
+
+ /* sum from 16x8 to 2 32x4 registers */
+ sum_0 = vmovl_s16(vget_low_s16(sum));
+ sum_1 = vmovl_s16(vget_high_s16(sum));
+
+ /* s[3]*128 -- the filter coefficient can be as large as 128, so the
+ * maximum possible value is 128*128*255, which exceeds 16 bits.
+ */
+
+ s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]);
+ s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]);
+ sum_0 = vaddq_s32(sum_0, s3_0);
+ sum_1 = vaddq_s32(sum_1, s3_1);
+
+ /* Add the constant value */
+ sum_0 = vaddq_s32(sum_0, round_vec_0);
+ sum_1 = vaddq_s32(sum_1, round_vec_0);
+
+ /* right shift & rounding & saturating */
+ sum_0 = vqrshlq_s32(sum_0, round_bits);
+ sum_1 = vqrshlq_s32(sum_1, round_bits);
+
+ /* Clipping to max value */
+ sum_0 = vminq_s32(sum_0, round_vec_1);
+ sum_1 = vminq_s32(sum_1, round_vec_1);
+
+ res = vcombine_u16(vqmovun_s32(sum_0), vqmovun_s32(sum_1));
+ return res;
+}
+
+static INLINE uint16x4_t wiener_convolve8_horiz_4x8(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, int16_t *filter_x, const int bd,
+ const int round0_bits) {
+ uint16x4_t res;
+ int32x4_t sum_0, s3_0;
+ int16x4_t sum, temp0, temp1, temp2;
+
+ const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
+ const int32_t round_const_1 = (1 << ((bd) + 1 + FILTER_BITS - round0_bits));
+ const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
+ const int32x4_t zero = vdupq_n_s32(0);
+ const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
+ const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+
+ temp0 = vadd_s16(s0, s6);
+ temp1 = vadd_s16(s1, s5);
+ temp2 = vadd_s16(s2, s4);
+
+ sum = vmul_n_s16(temp0, filter_x[0]);
+ sum = vmla_n_s16(sum, temp1, filter_x[1]);
+ sum = vmla_n_s16(sum, temp2, filter_x[2]);
+ sum_0 = vmovl_s16(sum);
+
+ /* s[3] is handled separately: the filter coefficient can be as large as 128,
+ * so the maximum possible value, 128*128*255, requires 32 bits to hold the
+ * result.
+ */
+ s3_0 = vmull_n_s16(s3, filter_x[3]);
+ sum_0 = vaddq_s32(sum_0, s3_0);
+
+ sum_0 = vaddq_s32(sum_0, round_vec_0);
+ sum_0 = vrshlq_s32(sum_0, round_bits);
+
+ sum_0 = vmaxq_s32(sum_0, zero);
+ sum_0 = vminq_s32(sum_0, round_vec_1);
+ res = vqmovun_s32(sum_0);
+ return res;
+}
+
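+/* 8-tap convolution of 8 pixels kept at 16-bit precision: multiply-accumulate
+ * on top of horiz_const, then apply the saturating rounding shift given by
+ * shift_round_0.
+ */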
+static INLINE int16x8_t
+convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+ const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+ int16x8_t sum;
+ int16x8_t res;
+
+ sum = horiz_const;
+ sum = vmlaq_n_s16(sum, s0, filter[0]);
+ sum = vmlaq_n_s16(sum, s1, filter[1]);
+ sum = vmlaq_n_s16(sum, s2, filter[2]);
+ sum = vmlaq_n_s16(sum, s3, filter[3]);
+ sum = vmlaq_n_s16(sum, s4, filter[4]);
+ sum = vmlaq_n_s16(sum, s5, filter[5]);
+ sum = vmlaq_n_s16(sum, s6, filter[6]);
+ sum = vmlaq_n_s16(sum, s7, filter[7]);
+
+ res = vqrshlq_s16(sum, shift_round_0);
+
+ return res;
+}
+
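+/* 4-pixel variant of the 16-bit 8-tap convolution above. */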
+static INLINE int16x4_t
+convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+ const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+ int16x4_t sum;
+ sum = horiz_const;
+ sum = vmla_n_s16(sum, s0, filter[0]);
+ sum = vmla_n_s16(sum, s1, filter[1]);
+ sum = vmla_n_s16(sum, s2, filter[2]);
+ sum = vmla_n_s16(sum, s3, filter[3]);
+ sum = vmla_n_s16(sum, s4, filter[4]);
+ sum = vmla_n_s16(sum, s5, filter[5]);
+ sum = vmla_n_s16(sum, s6, filter[6]);
+ sum = vmla_n_s16(sum, s7, filter[7]);
+
+ sum = vqrshl_s16(sum, shift_round_0);
+
+ return sum;
+}
+
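+/* 8-tap convolution of 4 pixels accumulated at 32-bit precision, used by the
+ * vertical pass: add offset_const, apply the rounding shift from
+ * round_shift_vec, clamp to zero and narrow to 16 bits.
+ */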
+static INLINE uint16x4_t convolve8_4x4_s32(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+ const int32x4_t round_shift_vec, const int32x4_t offset_const) {
+ int32x4_t sum0;
+ uint16x4_t res;
+ const int32x4_t zero = vdupq_n_s32(0);
+
+ sum0 = vmull_n_s16(s0, y_filter[0]);
+ sum0 = vmlal_n_s16(sum0, s1, y_filter[1]);
+ sum0 = vmlal_n_s16(sum0, s2, y_filter[2]);
+ sum0 = vmlal_n_s16(sum0, s3, y_filter[3]);
+ sum0 = vmlal_n_s16(sum0, s4, y_filter[4]);
+ sum0 = vmlal_n_s16(sum0, s5, y_filter[5]);
+ sum0 = vmlal_n_s16(sum0, s6, y_filter[6]);
+ sum0 = vmlal_n_s16(sum0, s7, y_filter[7]);
+
+ sum0 = vaddq_s32(sum0, offset_const);
+ sum0 = vqrshlq_s32(sum0, round_shift_vec);
+ sum0 = vmaxq_s32(sum0, zero);
+ res = vmovn_u32(vreinterpretq_u32_s32(sum0));
+
+ return res;
+}
+
+#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
new file mode 100644
index 000000000..e5674ef7c
--- /dev/null
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -0,0 +1,1740 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+#if !defined(__aarch64__)
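+/* Single-row averaging helpers, used only on 32-bit Arm where the main loops
+ * below process one row at a time.
+ */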
+static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
+ const uint16_t fwd_offset,
+ const uint16_t bck_offset,
+ const int16x4_t sub_const_vec,
+ const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0) {
+ int16x4_t tmp0;
+ uint16x4_t tmp_u0;
+ uint32x4_t sum0;
+ int32x4_t dst0;
+ int16x8_t tmp4;
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+
+ sum0 = vmull_n_u16(res0, fwd_offset);
+ sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec));
+
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+
+ tmp0 = vqmovn_s32(dst0);
+ tmp4 = vcombine_s16(tmp0, tmp0);
+
+ *t0 = vqmovun_s16(tmp4);
+ } else {
+ const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+ tmp_u0 = vhadd_u16(res0, d0);
+
+ tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+
+ tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+
+ tmp4 = vcombine_s16(tmp0, tmp0);
+
+ *t0 = vqmovun_s16(tmp4);
+ }
+}
+
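+/* As compute_avg_4x1, but for a row of 8 pixels. */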
+static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
+ const uint16_t fwd_offset,
+ const uint16_t bck_offset,
+ const int16x4_t sub_const,
+ const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0) {
+ int16x4_t tmp0, tmp2;
+ int16x8_t f0;
+ uint32x4_t sum0, sum2;
+ int32x4_t dst0, dst2;
+
+ uint16x8_t tmp_u0;
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+ const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+
+ sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+ sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+ sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+ sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+ sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+ dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+ dst2 = vqrshlq_s32(dst2, round_bits_vec);
+
+ tmp0 = vqmovn_s32(dst0);
+ tmp2 = vqmovn_s32(dst2);
+
+ f0 = vcombine_s16(tmp0, tmp2);
+
+ *t0 = vqmovun_s16(f0);
+
+ } else {
+ const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+ const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+
+ tmp_u0 = vhaddq_u16(res0, d0);
+
+ f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+
+ f0 = vqrshlq_s16(f0, round_bits_vec);
+
+ *t0 = vqmovun_s16(f0);
+ }
+}
+#endif // !defined(__aarch64__)
+
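+/* Combine two sets of four 4-pixel rows (res0..res3 and d0..d3). With
+ * use_jnt_comp_avg the inputs are weighted by fwd_offset/bck_offset and
+ * scaled down by DIST_PRECISION_BITS, otherwise they are averaged; the
+ * compound offset is then removed, the result rounded down by round_bits,
+ * clamped to zero and packed to 8 bits in t0/t1.
+ */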
+static INLINE void compute_avg_4x4(
+ uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
+ uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+ const uint16_t fwd_offset, const uint16_t bck_offset,
+ const int16x4_t sub_const_vec, const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+ int16x4_t tmp0, tmp1, tmp2, tmp3;
+ uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
+ uint32x4_t sum0, sum1, sum2, sum3;
+
+ int32x4_t dst0, dst1, dst2, dst3;
+ int16x8_t tmp4, tmp5;
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+ const int32x4_t const_vec = vmovl_s16(sub_const_vec);
+
+ sum0 = vmull_n_u16(res0, fwd_offset);
+ sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+ sum1 = vmull_n_u16(res1, fwd_offset);
+ sum1 = vmlal_n_u16(sum1, d1, bck_offset);
+ sum2 = vmull_n_u16(res2, fwd_offset);
+ sum2 = vmlal_n_u16(sum2, d2, bck_offset);
+ sum3 = vmull_n_u16(res3, fwd_offset);
+ sum3 = vmlal_n_u16(sum3, d3, bck_offset);
+
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+ sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS);
+ sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+ sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS);
+
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), const_vec);
+ dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), const_vec);
+ dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), const_vec);
+ dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), const_vec);
+
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+ dst1 = vqrshlq_s32(dst1, round_bits_vec);
+ dst2 = vqrshlq_s32(dst2, round_bits_vec);
+ dst3 = vqrshlq_s32(dst3, round_bits_vec);
+
+ tmp0 = vqmovn_s32(dst0);
+ tmp1 = vqmovn_s32(dst1);
+ tmp2 = vqmovn_s32(dst2);
+ tmp3 = vqmovn_s32(dst3);
+ tmp4 = vcombine_s16(tmp0, tmp1);
+ tmp5 = vcombine_s16(tmp2, tmp3);
+ tmp4 = vmaxq_s16(tmp4, zero);
+ tmp5 = vmaxq_s16(tmp5, zero);
+
+ *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4));
+ *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5));
+ } else {
+ const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+ tmp_u0 = vhadd_u16(res0, d0);
+ tmp_u1 = vhadd_u16(res1, d1);
+ tmp_u2 = vhadd_u16(res2, d2);
+ tmp_u3 = vhadd_u16(res3, d3);
+
+ tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+ tmp1 = vsub_s16(vreinterpret_s16_u16(tmp_u1), sub_const_vec);
+ tmp2 = vsub_s16(vreinterpret_s16_u16(tmp_u2), sub_const_vec);
+ tmp3 = vsub_s16(vreinterpret_s16_u16(tmp_u3), sub_const_vec);
+
+ tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+ tmp1 = vqrshl_s16(tmp1, round_bits_vec);
+ tmp2 = vqrshl_s16(tmp2, round_bits_vec);
+ tmp3 = vqrshl_s16(tmp3, round_bits_vec);
+
+ tmp4 = vcombine_s16(tmp0, tmp1);
+ tmp5 = vcombine_s16(tmp2, tmp3);
+ tmp4 = vmaxq_s16(tmp4, zero);
+ tmp5 = vmaxq_s16(tmp5, zero);
+
+ *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4));
+ *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5));
+ }
+}
+
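+/* As compute_avg_4x4, but for four rows of 8 pixels, producing t0..t3. */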
+static INLINE void compute_avg_8x4(
+ uint16x8_t res0, uint16x8_t res1, uint16x8_t res2, uint16x8_t res3,
+ uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
+ const uint16_t fwd_offset, const uint16_t bck_offset,
+ const int16x4_t sub_const, const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
+ uint8x8_t *t3) {
+ int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int16x8_t f0, f1, f2, f3;
+ uint32x4_t sum0, sum1, sum2, sum3;
+ uint32x4_t sum4, sum5, sum6, sum7;
+ int32x4_t dst0, dst1, dst2, dst3;
+ int32x4_t dst4, dst5, dst6, dst7;
+ uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+ const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+
+ sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+ sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+ sum1 = vmull_n_u16(vget_low_u16(res1), fwd_offset);
+ sum1 = vmlal_n_u16(sum1, vget_low_u16(d1), bck_offset);
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+ sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS);
+
+ sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+ sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+ sum3 = vmull_n_u16(vget_high_u16(res1), fwd_offset);
+ sum3 = vmlal_n_u16(sum3, vget_high_u16(d1), bck_offset);
+ sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+ sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS);
+
+ sum4 = vmull_n_u16(vget_low_u16(res2), fwd_offset);
+ sum4 = vmlal_n_u16(sum4, vget_low_u16(d2), bck_offset);
+ sum5 = vmull_n_u16(vget_low_u16(res3), fwd_offset);
+ sum5 = vmlal_n_u16(sum5, vget_low_u16(d3), bck_offset);
+ sum4 = vshrq_n_u32(sum4, DIST_PRECISION_BITS);
+ sum5 = vshrq_n_u32(sum5, DIST_PRECISION_BITS);
+
+ sum6 = vmull_n_u16(vget_high_u16(res2), fwd_offset);
+ sum6 = vmlal_n_u16(sum6, vget_high_u16(d2), bck_offset);
+ sum7 = vmull_n_u16(vget_high_u16(res3), fwd_offset);
+ sum7 = vmlal_n_u16(sum7, vget_high_u16(d3), bck_offset);
+ sum6 = vshrq_n_u32(sum6, DIST_PRECISION_BITS);
+ sum7 = vshrq_n_u32(sum7, DIST_PRECISION_BITS);
+
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+ dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), sub_const_vec);
+ dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+ dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), sub_const_vec);
+ dst4 = vsubq_s32(vreinterpretq_s32_u32(sum4), sub_const_vec);
+ dst5 = vsubq_s32(vreinterpretq_s32_u32(sum5), sub_const_vec);
+ dst6 = vsubq_s32(vreinterpretq_s32_u32(sum6), sub_const_vec);
+ dst7 = vsubq_s32(vreinterpretq_s32_u32(sum7), sub_const_vec);
+
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+ dst1 = vqrshlq_s32(dst1, round_bits_vec);
+ dst2 = vqrshlq_s32(dst2, round_bits_vec);
+ dst3 = vqrshlq_s32(dst3, round_bits_vec);
+ dst4 = vqrshlq_s32(dst4, round_bits_vec);
+ dst5 = vqrshlq_s32(dst5, round_bits_vec);
+ dst6 = vqrshlq_s32(dst6, round_bits_vec);
+ dst7 = vqrshlq_s32(dst7, round_bits_vec);
+
+ tmp0 = vqmovn_s32(dst0);
+ tmp1 = vqmovn_s32(dst1);
+ tmp2 = vqmovn_s32(dst2);
+ tmp3 = vqmovn_s32(dst3);
+ tmp4 = vqmovn_s32(dst4);
+ tmp5 = vqmovn_s32(dst5);
+ tmp6 = vqmovn_s32(dst6);
+ tmp7 = vqmovn_s32(dst7);
+
+ f0 = vcombine_s16(tmp0, tmp2);
+ f1 = vcombine_s16(tmp1, tmp3);
+ f2 = vcombine_s16(tmp4, tmp6);
+ f3 = vcombine_s16(tmp5, tmp7);
+
+ f0 = vmaxq_s16(f0, zero);
+ f1 = vmaxq_s16(f1, zero);
+ f2 = vmaxq_s16(f2, zero);
+ f3 = vmaxq_s16(f3, zero);
+
+ *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
+ *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
+ *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
+ *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+
+ } else {
+ const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+ const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+
+ tmp_u0 = vhaddq_u16(res0, d0);
+ tmp_u1 = vhaddq_u16(res1, d1);
+ tmp_u2 = vhaddq_u16(res2, d2);
+ tmp_u3 = vhaddq_u16(res3, d3);
+
+ f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+ f1 = vsubq_s16(vreinterpretq_s16_u16(tmp_u1), sub_const_vec);
+ f2 = vsubq_s16(vreinterpretq_s16_u16(tmp_u2), sub_const_vec);
+ f3 = vsubq_s16(vreinterpretq_s16_u16(tmp_u3), sub_const_vec);
+
+ f0 = vqrshlq_s16(f0, round_bits_vec);
+ f1 = vqrshlq_s16(f1, round_bits_vec);
+ f2 = vqrshlq_s16(f2, round_bits_vec);
+ f3 = vqrshlq_s16(f3, round_bits_vec);
+
+ f0 = vmaxq_s16(f0, zero);
+ f1 = vmaxq_s16(f1, zero);
+ f2 = vmaxq_s16(f2, zero);
+ f3 = vmaxq_s16(f3, zero);
+
+ *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
+ *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
+ *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
+ *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+ }
+}
+
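+/* Horizontal pass of the 2D compound convolution: filters the source block
+ * into the 16-bit intermediate buffer im_block. On AArch64 the block is
+ * processed in transposed 4x4 or 8x8 tiles; on 32-bit Arm it is processed one
+ * row at a time.
+ */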
+static INLINE void jnt_convolve_2d_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
+ const int bd = 8;
+ const uint8_t *s;
+ int16_t *dst_ptr;
+ int dst_stride;
+ int width, height;
+
+ dst_ptr = im_block;
+ dst_stride = im_stride;
+ height = im_h;
+ width = w;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t tt0;
+ uint8x8_t t0;
+
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t tt1, tt2, tt3;
+ uint8x8_t t1, t2, t3;
+#endif
+ do {
+ s = src;
+ __builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s0 = vget_low_s16(tt0);
+ s1 = vget_low_s16(tt1);
+ s2 = vget_low_s16(tt2);
+ s3 = vget_low_s16(tt3);
+ s4 = vget_high_s16(tt0);
+ s5 = vget_high_s16(tt1);
+ s6 = vget_high_s16(tt2);
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ s += 7;
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s7 = vget_low_s16(tt0);
+ s8 = vget_low_s16(tt1);
+ s9 = vget_low_s16(tt2);
+ s10 = vget_low_s16(tt3);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+
+ vst1_s16((dst_ptr + 0 * dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * dst_stride), d3);
+
+ src += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+#else
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ s4 = vget_high_s16(tt0); // a4 a5 a6 a7
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+ t0 = vld1_u8(s); // a8 a9 a10 a11
+
+ // a8 a9 a10 a11
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ vst1_s16(dst_ptr, d0);
+
+ src += src_stride;
+ dst_ptr += dst_stride;
+ height -= 1;
+#endif
+ } while (height > 0);
+ } else {
+ int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint8x8_t t0;
+
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
+ do {
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d_tmp = dst_ptr;
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_tmp, horiz_const, shift_round_0);
+
+ transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7);
+
+ store_s16_8x8(d_tmp, dst_stride, res0, res1, res2, res3, res4, res5,
+ res6, res7);
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ height -= 8;
+#else
+ int16x8_t temp_0;
+ t0 = vld1_u8(src);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src + 8;
+ d_tmp = dst_ptr;
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ temp_0 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter_tmp, horiz_const, shift_round_0);
+ vst1q_s16(d_tmp, res0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst_ptr += dst_stride;
+ height -= 1;
+#endif
+ } while (height > 0);
+ }
+}
+
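+/* Vertical pass of the 2D compound convolution: filters im_block in 4-pixel
+ * columns and, depending on conv_params->do_average, either stores the 16-bit
+ * result to the compound buffer or averages it with the existing contents and
+ * writes 8-bit pixels to dst8.
+ */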
+static INLINE void jnt_convolve_2d_vert_neon(
+ int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
+ ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
+ uint8_t *dst_u8_ptr, *d_u8;
+ CONV_BUF_TYPE *dst_ptr, *dst;
+ int16_t *src_ptr, *s;
+ uint16_t *d;
+
+ const int bd = 8;
+ int height;
+ int dst_stride = conv_params->dst_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+
+ const int16_t round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset);
+ const int16x4_t sub_const_vec = vdup_n_s16(sub_const);
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t res4, d0;
+ uint8x8_t t0;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t res5, res6, res7, d1, d2, d3;
+ uint8x8_t t1;
+#endif
+
+ dst = conv_params->dst;
+ src_ptr = im_block;
+ dst_u8_ptr = dst8;
+ dst_ptr = dst;
+ height = h;
+
+ do {
+ d = dst_ptr;
+ d_u8 = dst_u8_ptr;
+ s = src_ptr;
+ height = h;
+
+ __builtin_prefetch(s + 0 * im_stride);
+ __builtin_prefetch(s + 1 * im_stride);
+ __builtin_prefetch(s + 2 * im_stride);
+ __builtin_prefetch(s + 3 * im_stride);
+ __builtin_prefetch(s + 4 * im_stride);
+ __builtin_prefetch(s + 5 * im_stride);
+ __builtin_prefetch(s + 6 * im_stride);
+ __builtin_prefetch(s + 7 * im_stride);
+
+ load_s16_4x8(s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ s += (7 * im_stride);
+
+ do {
+#if defined(__aarch64__)
+ load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10);
+ s += (im_stride << 2);
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ __builtin_prefetch(d_u8 + 4 * dst8_stride);
+ __builtin_prefetch(d_u8 + 5 * dst8_stride);
+ __builtin_prefetch(d_u8 + 6 * dst8_stride);
+ __builtin_prefetch(d_u8 + 7 * dst8_stride);
+
+ d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+ d1 = convolve8_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const);
+ d2 = convolve8_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const);
+ d3 = convolve8_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+ d += (dst_stride << 2);
+
+ compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset,
+ bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
+ &t0, &t1);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
+ d_u8 += dst8_stride;
+
+ } else {
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+ d += (dst_stride << 2);
+ }
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ s7 = vld1_s16(s);
+ s += (im_stride);
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+
+ d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ res4 = vld1_u16(d);
+ d += (dst_stride);
+
+ compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
+ round_bits, use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+
+ } else {
+ vst1_u16(d, d0);
+ d += (dst_stride);
+ }
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height--;
+#endif
+ } while (height > 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ dst_u8_ptr += 4;
+ w -= 4;
+ } while (w > 0);
+}
+
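+/* Compound 2D convolution: a horizontal pass into im_block followed by a
+ * vertical pass. The horizontal filter coefficients are pre-shifted right by
+ * 1 (they are all even) to reduce the intermediate precision requirements.
+ */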
+void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ assert(!(w % 4));
+ assert(!(h % 4));
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+ const int im_h = h + filter_params_y->taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const int round_0 = conv_params->round_0 - 1;
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+ int16_t x_filter_tmp[8];
+ int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+ // The filter coefficients are all even, so shift them right by 1 to reduce
+ // the intermediate precision requirements.
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+ vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+ jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+ x_filter_tmp, im_h, w, round_0);
+
+ jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params,
+ y_filter, h, w);
+}
+
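+/* Compound copy: no filtering is applied; the source pixels are only shifted
+ * up and offset into the compound domain before being stored to, or averaged
+ * with, the compound buffer.
+ */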
+void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
+ tmp_shift3;
+ uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3;
+ uint16x4_t tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7;
+ const uint8_t *src1, *src2;
+ uint8_t *dst8_1;
+ CONV_BUF_TYPE *dst = conv_params->dst, *dst_1, *dst_2;
+ const int dst_stride = conv_params->dst_stride;
+ int x, y;
+ const int16_t bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int16x4_t sub_const_vec = vdup_n_s16((int16_t)round_offset);
+ const uint16x8_t dup_round_offset16x8 = vdupq_n_u16((uint16_t)round_offset);
+ const int16x4_t dup_bits16x4 = vdup_n_s16(bits);
+ const int16x8_t dup_bits16x8 = vdupq_n_s16(bits);
+
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ if (!(w & 0x07)) {
+ for (y = 0; y < (h >> 2); ++y) {
+ src1 = src;
+ dst8_1 = dst8;
+ dst_1 = dst;
+ for (x = 0; x < (w >> 3); ++x) {
+ src2 = src1;
+ load_u8_8x4(src2, src_stride, &res0_8, &res1_8, &res2_8, &res3_8);
+
+ res_q0 = vaddq_u16(vshlq_u16(vmovl_u8(res0_8), dup_bits16x8),
+ dup_round_offset16x8);
+ res_q1 = vaddq_u16(vshlq_u16(vmovl_u8(res1_8), dup_bits16x8),
+ dup_round_offset16x8);
+ res_q2 = vaddq_u16(vshlq_u16(vmovl_u8(res2_8), dup_bits16x8),
+ dup_round_offset16x8);
+ res_q3 = vaddq_u16(vshlq_u16(vmovl_u8(res3_8), dup_bits16x8),
+ dup_round_offset16x8);
+
+ if (conv_params->do_average) {
+ dst_2 = dst_1;
+ load_u16_8x4(dst_2, dst_stride, &tmp_q0, &tmp_q1, &tmp_q2, &tmp_q3);
+
+ compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1,
+ res_q2, res_q3, conv_params->fwd_offset,
+ conv_params->bck_offset, sub_const_vec, bits,
+ conv_params->use_jnt_comp_avg, &tmp_shift0,
+ &tmp_shift1, &tmp_shift2, &tmp_shift3);
+
+ vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0);
+ vst1_u8(dst8_1 + (1 * dst8_stride), tmp_shift1);
+ vst1_u8(dst8_1 + (2 * dst8_stride), tmp_shift2);
+ vst1_u8(dst8_1 + (3 * dst8_stride), tmp_shift3);
+
+ } else {
+ vst1q_u16(dst_1 + (0 * dst_stride), res_q0);
+ vst1q_u16(dst_1 + (1 * dst_stride), res_q1);
+ vst1q_u16(dst_1 + (2 * dst_stride), res_q2);
+ vst1q_u16(dst_1 + (3 * dst_stride), res_q3);
+ }
+ src1 = src1 + 8;
+ dst_1 = dst_1 + 8;
+ dst8_1 = dst8_1 + 8;
+ }
+ src += src_stride * 4;
+ dst8 += dst8_stride * 4;
+ dst += dst_stride * 4;
+ }
+ } else if (!(w & 0x03)) {
+ for (y = 0; y < (h >> 2); ++y) {
+ src1 = src;
+ dst8_1 = dst8;
+ dst_1 = dst;
+
+ load_u8_8x4(src1, src_stride, &res0_8, &res1_8, &res2_8, &res3_8);
+
+ res4 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res0_8)), dup_bits16x4),
+ vreinterpret_u16_s16(sub_const_vec));
+ res5 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res1_8)), dup_bits16x4),
+ vreinterpret_u16_s16(sub_const_vec));
+ res6 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res2_8)), dup_bits16x4),
+ vreinterpret_u16_s16(sub_const_vec));
+ res7 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res3_8)), dup_bits16x4),
+ vreinterpret_u16_s16(sub_const_vec));
+ if (conv_params->do_average) {
+ load_u16_4x4(dst_1, dst_stride, &tmp4, &tmp5, &tmp6, &tmp7);
+
+ compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7,
+ conv_params->fwd_offset, conv_params->bck_offset,
+ sub_const_vec, bits, conv_params->use_jnt_comp_avg,
+ &tmp_shift0, &tmp_shift1);
+
+ vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0);
+ dst8_1 += dst8_stride;
+ vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 1);
+ dst8_1 += dst8_stride;
+ vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 0);
+ dst8_1 += dst8_stride;
+ vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 1);
+
+ } else {
+ vst1_u16(dst_1, res4);
+ dst_1 += dst_stride;
+ vst1_u16(dst_1, res5);
+ dst_1 += dst_stride;
+ vst1_u16(dst_1, res6);
+ dst_1 += dst_stride;
+ vst1_u16(dst_1, res7);
+ }
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ dst8 += dst8_stride * 4;
+ }
+ }
+}
+
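+/* Compound horizontal-only convolution: each filtered row is offset into the
+ * compound domain and then either stored to the compound buffer or averaged
+ * back to 8-bit pixels in dst8.
+ */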
+void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ assert(!(w % 4));
+ assert(!(h % 4));
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+ const uint8_t *src_ptr = src - horiz_offset;
+
+ int16_t x_filter_tmp[8];
+ int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+ // The filter coefficients are all even, so shift them right by 1 to reduce
+ // the intermediate precision requirements.
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+ vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+ const uint8_t *s;
+ uint8_t *d_u8;
+ uint8_t *dst_u8_ptr;
+ CONV_BUF_TYPE *d, *dst_ptr;
+ int width, height;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
+ s = src_ptr;
+ dst_ptr = dst;
+ dst_u8_ptr = dst8;
+ width = w;
+ height = h;
+
+ if ((w == 4) || (h == 4)) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t tt0;
+ uint16x4_t res4;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t tt1, tt2, tt3;
+ uint16x4_t res5, res6, res7;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0);
+ int16x8_t u0, u1;
+#else
+ int16x4_t temp_0;
+#endif
+ const int16x4_t zero = vdup_n_s16(0);
+ const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
+ const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1);
+ const int16x4_t horiz_const = vdup_n_s16(bits);
+ do {
+ s = src_ptr;
+ d = dst_ptr;
+ d_u8 = dst_u8_ptr;
+ width = w;
+ __builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s0 = vget_low_s16(tt0);
+ s1 = vget_low_s16(tt1);
+ s2 = vget_low_s16(tt2);
+ s3 = vget_low_s16(tt3);
+ s4 = vget_high_s16(tt0);
+ s5 = vget_high_s16(tt1);
+ s6 = vget_high_s16(tt2);
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ s += 7;
+ do {
+ load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+ t0 = vreinterpret_u8_u32(tu0);
+ t1 = vreinterpret_u8_u32(tu1);
+
+ transpose_u8_4x4(&t0, &t1);
+ u0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ u1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+
+ s7 = vget_low_s16(u0);
+ s8 = vget_low_s16(u1);
+ s9 = vget_high_s16(u0);
+ s10 = vget_high_s16(u1);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ zero, shift_round_0);
+ d0 = vrshl_s16(d0, horiz_const);
+ d0 = vadd_s16(d0, round_offset_vec);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ zero, shift_round_0);
+ d1 = vrshl_s16(d1, horiz_const);
+ d1 = vadd_s16(d1, round_offset_vec);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ zero, shift_round_0);
+ d2 = vrshl_s16(d2, horiz_const);
+ d2 = vadd_s16(d2, round_offset_vec);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ zero, shift_round_0);
+ d3 = vrshl_s16(d3, horiz_const);
+ d3 = vadd_s16(d3, round_offset_vec);
+
+ transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+ __builtin_prefetch(d_u8 + 1 * dst8_stride);
+ __builtin_prefetch(d_u8 + 2 * dst8_stride);
+ __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+ load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+
+ compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
+ vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+ vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
+ round_offset_vec, round_bits, use_jnt_comp_avg, &t0,
+ &t1);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ vst1_lane_u32((uint32_t *)(d_u8 + dst8_stride),
+ vreinterpret_u32_u8(t0),
+ 1); // 10 11 12 13
+ vst1_lane_u32((uint32_t *)(d_u8 + 2 * dst8_stride),
+ vreinterpret_u32_u8(t1),
+ 0); // 20 21 22 23
+ vst1_lane_u32((uint32_t *)(d_u8 + 3 * dst8_stride),
+ vreinterpret_u32_u8(t1),
+ 1); // 30 31 32 33
+ } else {
+ store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
+ vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+ vreinterpret_u16_s16(d3));
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+
+ s += 4;
+ width -= 4;
+ d += 4;
+ d_u8 += 4;
+ } while (width > 0);
+ src_ptr += (src_stride << 2);
+ dst_ptr += (dst_stride << 2);
+ dst_u8_ptr += (dst8_stride << 2);
+ height -= 4;
+#else
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ s4 = vget_high_s16(tt0); // a4 a5 a6 a7
+ __builtin_prefetch(d);
+
+ s += 8;
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11
+
+ // a8 a9 a10 a11
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ temp_0 = s7;
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ zero, shift_round_0);
+ d0 = vrshl_s16(d0, horiz_const);
+ d0 = vadd_s16(d0, round_offset_vec);
+ s0 = s4;
+ s4 = temp_0;
+ if (conv_params->do_average) {
+ __builtin_prefetch(d);
+ __builtin_prefetch(d_u8);
+
+ res4 = vld1_u16(d);
+
+ compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+ bck_offset, round_offset_vec, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ } else {
+ vst1_u16(d, vreinterpret_u16_s16(d0));
+ }
+
+ s += 4;
+ width -= 4;
+ d += 4;
+ d_u8 += 4;
+ } while (width > 0);
+ src_ptr += (src_stride);
+ dst_ptr += (dst_stride);
+ dst_u8_ptr += (dst8_stride);
+ height--;
+#endif
+ } while (height > 0);
+ } else {
+ CONV_BUF_TYPE *d_tmp;
+ uint8_t *d_u8_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint16x8_t res8;
+ const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
+ const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1);
+ const int16x8_t horiz_const = vdupq_n_s16(bits);
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ d = dst_ptr = dst;
+ d_u8 = dst_u8_ptr = dst8;
+ do {
+#if defined(__aarch64__)
+ int16x8_t s11, s12, s13, s14;
+ int16x8_t s8, s9, s10;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+ uint16x8_t res9, res10, res11;
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src_ptr + 7;
+ d = dst_ptr;
+ d_u8_tmp = dst_u8_ptr;
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+ do {
+ d_u8 = d_u8_tmp;
+ d_tmp = d;
+
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ zero, shift_round_0);
+
+ res0 = vrshlq_s16(res0, horiz_const);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ zero, shift_round_0);
+ res1 = vrshlq_s16(res1, horiz_const);
+ res1 = vaddq_s16(res1, round_offset128);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ zero, shift_round_0);
+ res2 = vrshlq_s16(res2, horiz_const);
+ res2 = vaddq_s16(res2, round_offset128);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ zero, shift_round_0);
+ res3 = vrshlq_s16(res3, horiz_const);
+ res3 = vaddq_s16(res3, round_offset128);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ zero, shift_round_0);
+ res4 = vrshlq_s16(res4, horiz_const);
+ res4 = vaddq_s16(res4, round_offset128);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_tmp, zero, shift_round_0);
+ res5 = vrshlq_s16(res5, horiz_const);
+ res5 = vaddq_s16(res5, round_offset128);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_tmp, zero, shift_round_0);
+ res6 = vrshlq_s16(res6, horiz_const);
+ res6 = vaddq_s16(res6, round_offset128);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_tmp, zero, shift_round_0);
+ res7 = vrshlq_s16(res7, horiz_const);
+ res7 = vaddq_s16(res7, round_offset128);
+
+ transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7);
+
+ if (conv_params->do_average) {
+ load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+ d_tmp += (dst_stride << 2);
+
+ compute_avg_8x4(
+ res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+ store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+ d_u8 += (dst8_stride << 2);
+
+ load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+ d_tmp += (dst_stride << 2);
+
+ compute_avg_8x4(
+ res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+ store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+ d_u8 += (dst8_stride << 2);
+ } else {
+ store_u16_8x8(
+ d_tmp, dst_stride, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7));
+ d_tmp += (dst_stride << 3);
+ }
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ d_u8_tmp += 8;
+ } while (width > 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ dst_u8_ptr += 8 * dst8_stride;
+ height -= 8;
+#else
+ int16x8_t temp_0;
+ __builtin_prefetch(src_ptr);
+ t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src_ptr + 8;
+ d = dst_ptr;
+ d_u8_tmp = dst_u8_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ d_u8 = d_u8_tmp;
+ d_tmp = d;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ temp_0 = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter_tmp, zero, shift_round_0);
+
+ res0 = vrshlq_s16(res0, horiz_const);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ if (conv_params->do_average) {
+ res8 = vld1q_u16(d_tmp);
+ d_tmp += (dst_stride);
+
+ compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_u8(d_u8, t0);
+ d_u8 += (dst8_stride);
+ } else {
+ vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+ d_tmp += (dst_stride);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ d_u8_tmp += 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ dst_u8_ptr += dst8_stride;
+ height--;
+#endif
+ } while (height > 0);
+ }
+}
+
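+/* Compound vertical-only convolution, structured like av1_jnt_convolve_x_neon
+ * but filtering down columns.
+ */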
+void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ assert(!(w % 4));
+ assert(!(h % 4));
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int shift_value = (conv_params->round_1 - 1 - bits);
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+ const uint8_t *src_ptr = src - (vert_offset * src_stride);
+
+ int16_t y_filter_tmp[8];
+ int16x8_t filter_y_coef = vld1q_s16(y_filter);
+
+ // The filter coefficients are all even, so shift them right by 1 to reduce
+ // the intermediate precision requirements.
+ filter_y_coef = vshrq_n_s16(filter_y_coef, 1);
+ vst1q_s16(&y_filter_tmp[0], filter_y_coef);
+
+ const uint8_t *s;
+ uint8_t *d_u8;
+ uint8_t *dst_u8_ptr;
+ CONV_BUF_TYPE *d, *dst_ptr;
+ int width, height;
+
+ s = src_ptr;
+ dst_ptr = dst;
+ dst_u8_ptr = dst8;
+ width = w;
+ height = h;
+
+ // The multiplication (vertical filter output sum) * (1 << bits) is folded
+ // into the rounding shift below, which requires:
+ assert((conv_params->round_1 - 2) >= bits);
+
+ if ((w == 4) || (h == 4)) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ uint16x4_t res4;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+ tu3 = vdup_n_u32(0);
+ int16x8_t u0, u1, u2, u3;
+ uint8x8_t t0;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ uint16x4_t res5, res6, res7;
+ uint8x8_t t1;
+#endif
+ const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+ const int16x4_t shift_vec = vdup_n_s16(-shift_value);
+ const int16x4_t zero = vdup_n_s16(0);
+
+ do {
+ s = src_ptr;
+ d = dst_ptr;
+ d_u8 = dst_u8_ptr;
+ height = h;
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3);
+
+ u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+ u2 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu2)));
+ u3 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu3)));
+
+ s0 = vget_low_s16(u0);
+ s1 = vget_high_s16(u0);
+ s2 = vget_low_s16(u1);
+ s3 = vget_high_s16(u1);
+ s4 = vget_low_s16(u2);
+ s5 = vget_high_s16(u2);
+ s6 = vget_low_s16(u3);
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ s += (7 * src_stride);
+ do {
+#if defined(__aarch64__)
+ load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+
+ u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+
+ s7 = vget_low_s16(u0);
+ s8 = vget_high_s16(u0);
+ s9 = vget_low_s16(u1);
+ s10 = vget_high_s16(u1);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+ d0 = vadd_s16(d0, round_offset64);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
+ zero, shift_vec);
+ d1 = vadd_s16(d1, round_offset64);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
+ zero, shift_vec);
+ d2 = vadd_s16(d2, round_offset64);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+ zero, shift_vec);
+ d3 = vadd_s16(d3, round_offset64);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+ __builtin_prefetch(d_u8 + 1 * dst8_stride);
+ __builtin_prefetch(d_u8 + 2 * dst8_stride);
+ __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+ load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+ d += (dst_stride << 2);
+
+ compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
+ vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+ vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0,
+ &t1);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
+ d_u8 += dst8_stride;
+ } else {
+ store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
+ vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+ vreinterpret_u16_s16(d3));
+ d += (dst_stride << 2);
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+
+ s += (src_stride << 2);
+ height -= 4;
+#else
+ load_unaligned_u8_4x1(s, src_stride, &tu0);
+ u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ s7 = vget_low_s16(u0);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+
+ d0 = vadd_s16(d0, round_offset64);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d);
+
+ res4 = vld1_u16(d);
+ d += (dst_stride);
+
+ compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+ } else {
+ vst1_u16(d, vreinterpret_u16_s16(d0));
+ d += (dst_stride);
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ s += (src_stride);
+ height--;
+#endif
+ } while (height > 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ dst_u8_ptr += 4;
+ width -= 4;
+ } while (width > 0);
+ } else {
+ CONV_BUF_TYPE *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint16x8_t res8;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
+ const int16x8_t shift_vec = vdupq_n_s16(-shift_value);
+ const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+ const int16x8_t zero = vdupq_n_s16(0);
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+ uint16x8_t res10, res11, res9;
+#endif
+ dst_ptr = dst;
+ dst_u8_ptr = dst8;
+ do {
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ height = h;
+ s = src_ptr + (7 * src_stride);
+ d_tmp = dst_ptr;
+ d_u8 = dst_u8_ptr;
+
+ do {
+#if defined(__aarch64__)
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+ res0 = vaddq_s16(res0, round_offset128);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
+ zero, shift_vec);
+ res1 = vaddq_s16(res1, round_offset128);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
+ zero, shift_vec);
+ res2 = vaddq_s16(res2, round_offset128);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+ zero, shift_vec);
+ res3 = vaddq_s16(res3, round_offset128);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp,
+ zero, shift_vec);
+ res4 = vaddq_s16(res4, round_offset128);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ y_filter_tmp, zero, shift_vec);
+ res5 = vaddq_s16(res5, round_offset128);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ y_filter_tmp, zero, shift_vec);
+ res6 = vaddq_s16(res6, round_offset128);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ y_filter_tmp, zero, shift_vec);
+ res7 = vaddq_s16(res7, round_offset128);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d_tmp + 0 * dst8_stride);
+ __builtin_prefetch(d_tmp + 1 * dst8_stride);
+ __builtin_prefetch(d_tmp + 2 * dst8_stride);
+ __builtin_prefetch(d_tmp + 3 * dst8_stride);
+
+ load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+ d_tmp += (dst_stride << 2);
+
+ compute_avg_8x4(
+ res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+ store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+ d_u8 += (dst8_stride << 2);
+
+ load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+ d_tmp += (dst_stride << 2);
+
+ compute_avg_8x4(
+ res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+ store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+ d_u8 += (dst8_stride << 2);
+ } else {
+ store_u16_8x8(
+ d_tmp, dst_stride, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7));
+ d_tmp += (dst_stride << 3);
+ }
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += (8 * src_stride);
+ height -= 8;
+#else
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ __builtin_prefetch(dst_ptr);
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d_tmp);
+
+ res8 = vld1q_u16(d_tmp);
+ d_tmp += (dst_stride);
+
+ compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_u8(d_u8, t0);
+ d_u8 += (dst8_stride);
+ } else {
+ vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+ d_tmp += dst_stride;
+ }
+
+ s += (src_stride);
+ height--;
+#endif
+ } while (height > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst_u8_ptr += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
new file mode 100644
index 000000000..c4ae2e784
--- /dev/null
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
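+
+// Strided load/store helpers shared by the NEON code in this directory:
+// each load_*/store_* routine moves a group of rows separated by a pitch p,
+// and the load_unaligned_* variants assemble vectors from memcpy'd scalars
+// so that no particular alignment is assumed.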
+
+static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
+ const uint8x8_t s1) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define load_u8_4x1(s, s0, lane) \
+ do { \
+ *(s0) = vreinterpret_u8_u32( \
+ vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
+ } while (0)
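+/* For example, load_u8_4x1(src, &v, 0) loads four bytes from src into the
+   low 32-bit lane of the uint8x8_t v. */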
+
+static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+}
+
+static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+}
+
+static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define store_u8_4x1(s, s0, lane) \
+ do { \
+ vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
+ } while (0)
+
+static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+ const uint8x8_t s1, const uint8x8_t s2,
+ const uint8x8_t s3, const uint8x8_t s4,
+ const uint8x8_t s5, const uint8x8_t s6,
+ const uint8x8_t s7) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+ s += p;
+ vst1_u8(s, s4);
+ s += p;
+ vst1_u8(s, s5);
+ s += p;
+ vst1_u8(s, s6);
+ s += p;
+ vst1_u8(s, s7);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+ const uint8x8_t s1, const uint8x8_t s2,
+ const uint8x8_t s3) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+}
+
+static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
+ const uint8x16_t s1, const uint8x16_t s2,
+ const uint8x16_t s3) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+}
+
+static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3,
+ const uint16x8_t s4, const uint16x8_t s5,
+ const uint16x8_t s6, const uint16x8_t s7) {
+ vst1q_u16(s, s0);
+ s += dst_stride;
+ vst1q_u16(s, s1);
+ s += dst_stride;
+ vst1q_u16(s, s2);
+ s += dst_stride;
+ vst1q_u16(s, s3);
+ s += dst_stride;
+ vst1q_u16(s, s4);
+ s += dst_stride;
+ vst1q_u16(s, s5);
+ s += dst_stride;
+ vst1q_u16(s, s6);
+ s += dst_stride;
+ vst1q_u16(s, s7);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2, const uint16x4_t s3) {
+ vst1_u16(s, s0);
+ s += dst_stride;
+ vst1_u16(s, s1);
+ s += dst_stride;
+ vst1_u16(s, s2);
+ s += dst_stride;
+ vst1_u16(s, s3);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += dst_stride;
+ vst1q_u16(s, s1);
+ s += dst_stride;
+ vst1q_u16(s, s2);
+ s += dst_stride;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
+ const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7) {
+ vst1q_s16(s, s0);
+ s += dst_stride;
+ vst1q_s16(s, s1);
+ s += dst_stride;
+ vst1q_s16(s, s2);
+ s += dst_stride;
+ vst1q_s16(s, s3);
+ s += dst_stride;
+ vst1q_s16(s, s4);
+ s += dst_stride;
+ vst1q_s16(s, s5);
+ s += dst_stride;
+ vst1q_s16(s, s6);
+ s += dst_stride;
+ vst1q_s16(s, s7);
+}
+
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+ const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3) {
+ vst1_s16(s, s0);
+ s += dst_stride;
+ vst1_s16(s, s1);
+ s += dst_stride;
+ vst1_s16(s, s2);
+ s += dst_stride;
+ vst1_s16(s, s3);
+}
+
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+ const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3) {
+ vst1q_s16(s, s0);
+ s += dst_stride;
+ vst1q_s16(s, s1);
+ s += dst_stride;
+ vst1q_s16(s, s2);
+ s += dst_stride;
+ vst1q_s16(s, s3);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+}
+
+static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
+ uint32x2_t *tu0, uint32x2_t *tu1,
+ uint32x2_t *tu2, uint32x2_t *tu3) {
+ uint32_t a;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 0);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu1 = vset_lane_u32(a, *tu1, 0);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu1 = vset_lane_u32(a, *tu1, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu2 = vset_lane_u32(a, *tu2, 0);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu2 = vset_lane_u32(a, *tu2, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu3 = vset_lane_u32(a, *tu3, 0);
+ memcpy(&a, buf, 4);
+ *tu3 = vset_lane_u32(a, *tu3, 1);
+}
+
+static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
+ uint32x2_t *tu0, uint32x2_t *tu1) {
+ uint32_t a;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 0);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu1 = vset_lane_u32(a, *tu1, 0);
+ memcpy(&a, buf, 4);
+ *tu1 = vset_lane_u32(a, *tu1, 1);
+}
+
+static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
+ uint32x2_t *tu0) {
+ uint32_t a;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 0);
+}
+
+static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
+ uint32x2_t *tu0) {
+ uint32_t a;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 0);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ *tu0 = vset_lane_u32(a, *tu0, 1);
+}
+
+static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
+ uint16x4_t *tu0) {
+ uint16_t a;
+
+ memcpy(&a, buf, 2);
+ buf += stride;
+ *tu0 = vset_lane_u16(a, *tu0, 0);
+ memcpy(&a, buf, 2);
+ buf += stride;
+ *tu0 = vset_lane_u16(a, *tu0, 1);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3,
+ uint8x16_t *const s4, uint8x16_t *const s5,
+ uint8x16_t *const s6, uint8x16_t *const s7) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+ s += p;
+ *s4 = vld1q_u8(s);
+ s += p;
+ *s5 = vld1q_u8(s);
+ s += p;
+ *s6 = vld1q_u8(s);
+ s += p;
+ *s7 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+}
+
+static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
+ uint64x2_t *tu0, uint64x2_t *tu1) {
+ uint64_t a;
+
+ memcpy(&a, buf, 8);
+ buf += stride;
+ *tu0 = vsetq_lane_u64(a, *tu0, 0);
+ memcpy(&a, buf, 8);
+ buf += stride;
+ *tu0 = vsetq_lane_u64(a, *tu0, 1);
+ memcpy(&a, buf, 8);
+ buf += stride;
+ *tu1 = vsetq_lane_u64(a, *tu1, 0);
+ memcpy(&a, buf, 8);
+ *tu1 = vsetq_lane_u64(a, *tu1, 1);
+}
+
+static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
+ int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
+ *s1 = vld1q_s32(s);
+ s += p;
+ *s2 = vld1q_s32(s);
+ s += p;
+ *s3 = vld1q_s32(s);
+ s += p;
+ *s4 = vld1q_s32(s);
+}
+
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
+ int32x4_t s2, int32x4_t s3, int32x4_t s4) {
+ vst1q_s32(s, s1);
+ s += p;
+ vst1q_s32(s, s2);
+ s += p;
+ vst1q_s32(s, s3);
+ s += p;
+ vst1q_s32(s, s4);
+}
+
+static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
+ uint32x4_t *s2, uint32x4_t *s3,
+ uint32x4_t *s4) {
+ *s1 = vld1q_u32(s);
+ s += p;
+ *s2 = vld1q_u32(s);
+ s += p;
+ *s3 = vld1q_u32(s);
+ s += p;
+ *s4 = vld1q_u32(s);
+}
+
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
+ uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
+ vst1q_u32(s, s1);
+ s += p;
+ vst1q_u32(s, s2);
+ s += p;
+ vst1q_u32(s, s3);
+ s += p;
+ vst1q_u32(s, s4);
+}
+
+#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/reconinter_neon.c b/third_party/aom/av1/common/arm/reconinter_neon.c
new file mode 100644
index 000000000..44e064195
--- /dev/null
+++ b/third_party/aom/av1/common/arm/reconinter_neon.c
@@ -0,0 +1,86 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
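+// Build a DIFFWTD compound mask from two CONV_BUF_TYPE prediction buffers:
+// the per-pixel absolute difference is rounded down by 'round', scaled by
+// 1/DIFF_FACTOR, offset by 38 and clamped to AOM_BLEND_A64_MAX_ALPHA;
+// DIFFWTD_38_INV stores AOM_BLEND_A64_MAX_ALPHA minus that value.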
+void av1_build_compound_diffwtd_mask_d16_neon(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ assert(h >= 4);
+ assert(w >= 4);
+ assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
+ const int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ uint16x8_t diff_q, tmp0, tmp1;
+ uint8x8_t diff_d, diff_select;
+ const CONV_BUF_TYPE *src0_1, *src1_1;
+ const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));
+ const uint8x8_t dup_38 = vdup_n_u8(38);
+ const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+ if (mask_type == DIFFWTD_38) {
+ diff_select = vdup_n_u8(255);
+ } else {
+ diff_select = vdup_n_u8(0);
+ }
+ if (w >= 8) {
+ for (int i = 0; i < h; ++i) {
+ src0_1 = src0;
+ src1_1 = src1;
+ for (int j = 0; j < w; j += 8) {
+ __builtin_prefetch(src0_1);
+ __builtin_prefetch(src1_1);
+ diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));
+ diff_q = vrshlq_u16(diff_q, dup_round);
+ diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+ diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+ diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+ vst1_u8(mask, diff_d);
+ src0_1 += 8;
+ src1_1 += 8;
+ mask += 8;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+ } else if (w == 4) {
+ for (int i = 0; i < h; i += 2) {
+ src0_1 = src0;
+ src1_1 = src1;
+ __builtin_prefetch(src0_1 + 0 * src0_stride);
+ __builtin_prefetch(src0_1 + 1 * src0_stride);
+ __builtin_prefetch(src1_1 + 0 * src1_stride);
+ __builtin_prefetch(src1_1 + 1 * src1_stride);
+ tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),
+ vld1_u16(src0_1 + (1 * src0_stride)));
+ tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
+ vld1_u16(src1_1 + (1 * src1_stride)));
+ diff_q = vabdq_u16(tmp0, tmp1);
+ diff_q = vrshlq_u16(diff_q, dup_round);
+ diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+ diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+ diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+ vst1_u8(mask, diff_d);
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += w * 2;
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
new file mode 100644
index 000000000..b3a37c4cb
--- /dev/null
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -0,0 +1,1508 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// Constants used for right shift in final_filter calculation.
+#define NB_EVEN 5
+#define NB_ODD 4
+
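+// Common tail of the a/b calculation for the fast (radius 2) path:
+// p = max(n * sum_sq - sum * sum, 0) is scaled by s, clamped to 255 and
+// mapped through x_by_xplus1[] to give the 16-bit A output; the 32-bit B
+// output is formed from (SGRPROJ_SGR - A), the box sum and one_by_x[n - 1].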
+static INLINE void calc_ab_fast_internal_common(
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
+ int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
+ uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
+ uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
+ const int buf_stride) {
+ uint32x4_t q0, q1, q2, q3;
+ uint32x4_t p0, p1, p2, p3;
+ uint16x4_t d0, d1, d2, d3;
+
+ s0 = vmulq_u32(s0, const_n_val);
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+
+ q0 = vmulq_u32(s4, s4);
+ q1 = vmulq_u32(s5, s5);
+ q2 = vmulq_u32(s6, s6);
+ q3 = vmulq_u32(s7, s7);
+
+ p0 = vcleq_u32(q0, s0);
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+
+ q0 = vsubq_u32(s0, q0);
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+
+ p0 = vandq_u32(p0, q0);
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+
+ p0 = vmulq_u32(p0, s_vec);
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+
+ p0 = vminq_u32(p0, const_val);
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+
+ {
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+
+ for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 4; y++) {
+ dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ }
+ }
+ load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
+ }
+ p0 = vsubl_u16(sgrproj_sgr, d0);
+ p1 = vsubl_u16(sgrproj_sgr, d1);
+ p2 = vsubl_u16(sgrproj_sgr, d2);
+ p3 = vsubl_u16(sgrproj_sgr, d3);
+
+ s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);
+ s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
+ s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
+ s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
+
+ s4 = vmulq_u32(s4, p0);
+ s5 = vmulq_u32(s5, p1);
+ s6 = vmulq_u32(s6, p2);
+ s7 = vmulq_u32(s7, p3);
+
+ p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+}
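+
+// Eight-column variant of the same a/b calculation, used by the full
+// (radius 1) path, which visits every row.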
+static INLINE void calc_ab_internal_common(
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0,
+ uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4,
+ uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7,
+ uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val,
+ uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1,
+ uint16_t *dst_A16, int32_t *dst2, const int buf_stride) {
+ uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+ uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7;
+
+ s0 = vmulq_u32(s0, const_n_val);
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+ s4 = vmulq_u32(s4, const_n_val);
+ s5 = vmulq_u32(s5, const_n_val);
+ s6 = vmulq_u32(s6, const_n_val);
+ s7 = vmulq_u32(s7, const_n_val);
+
+ d0 = vget_low_u16(s16_4);
+ d1 = vget_low_u16(s16_5);
+ d2 = vget_low_u16(s16_6);
+ d3 = vget_low_u16(s16_7);
+ d4 = vget_high_u16(s16_4);
+ d5 = vget_high_u16(s16_5);
+ d6 = vget_high_u16(s16_6);
+ d7 = vget_high_u16(s16_7);
+
+ q0 = vmull_u16(d0, d0);
+ q1 = vmull_u16(d1, d1);
+ q2 = vmull_u16(d2, d2);
+ q3 = vmull_u16(d3, d3);
+ q4 = vmull_u16(d4, d4);
+ q5 = vmull_u16(d5, d5);
+ q6 = vmull_u16(d6, d6);
+ q7 = vmull_u16(d7, d7);
+
+ p0 = vcleq_u32(q0, s0);
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+ p4 = vcleq_u32(q4, s4);
+ p5 = vcleq_u32(q5, s5);
+ p6 = vcleq_u32(q6, s6);
+ p7 = vcleq_u32(q7, s7);
+
+ q0 = vsubq_u32(s0, q0);
+ q1 = vsubq_u32(s1, q1);
+  // The input (dgd16) is 16-bit. The first-stage sum-of-pixels output is
+  // 16-bit (tmp16_buf) and the final sum-of-pixels output is 32-bit
+  // (sum_buf). The sum-of-squares output is kept in a 32-bit buffer
+  // (square_sum_buf).
+ q6 = vsubq_u32(s6, q6);
+ q7 = vsubq_u32(s7, q7);
+
+ p0 = vandq_u32(p0, q0);
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+ p4 = vandq_u32(p4, q4);
+ p5 = vandq_u32(p5, q5);
+  // Calculate a and b. The a output is 16-bit (tmp16_buf) and lies in the
+  // range [1, 256] for all bit depths; the b output is kept in a 32-bit
+  // buffer.
+
+ p0 = vmulq_u32(p0, s_vec);
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+ p4 = vmulq_u32(p4, s_vec);
+ p5 = vmulq_u32(p5, s_vec);
+ p6 = vmulq_u32(p6, s_vec);
+ p7 = vmulq_u32(p7, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+ p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS);
+ p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS);
+ p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS);
+ p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS);
+
+ p0 = vminq_u32(p0, const_val);
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+ p4 = vminq_u32(p4, const_val);
+ p5 = vminq_u32(p5, const_val);
+ p6 = vminq_u32(p6, const_val);
+ p7 = vminq_u32(p7, const_val);
+
+ {
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+ store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7);
+
+ for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 8; y++) {
+ dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ }
+ }
+ load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);
+ }
+
+ s16_4 = vsubq_u16(sgrproj_sgr, s16_4);
+ s16_5 = vsubq_u16(sgrproj_sgr, s16_5);
+  // The input (dgd16) is 16-bit. The sum-of-pixels output is 16-bit
+  // (sum_buf); the sum-of-squares output is kept in a 32-bit buffer
+  // (square_sum_buf).
+ s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec);
+ s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec);
+ s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec);
+ s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec);
+ s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec);
+ s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec);
+ s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec);
+ s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec);
+
+ s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4)));
+  // Calculate a and b. The a output is 16-bit (A16) and lies in the range
+  // [1, 256] for all bit depths; the b output is kept in a 32-bit buffer.
+ s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7)));
+ s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4)));
+ s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5)));
+ s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6)));
+ s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7)));
+
+ p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS);
+ p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS);
+ p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0),
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+ store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4),
+ vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6),
+ vreinterpretq_s32_u32(p7));
+}
+
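+// Sliding sums of squares of five consecutive inputs, advancing by two each
+// step: r0 = t1^2 + ... + t5^2, r1 = t3^2 + ... + t7^2,
+// r2 = t5^2 + ... + t9^2, r3 = t7^2 + ... + t11^2.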
+static INLINE void boxsum2_square_sum_calc(
+ int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5,
+ int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10,
+ int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) {
+ int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11;
+ int32x4_t r12, r34, r67, r89, r1011;
+ int32x4_t r345, r6789, r789;
+
+ d1 = vmull_s16(t1, t1);
+ d2 = vmull_s16(t2, t2);
+ d3 = vmull_s16(t3, t3);
+ d4 = vmull_s16(t4, t4);
+ d5 = vmull_s16(t5, t5);
+ d6 = vmull_s16(t6, t6);
+ d7 = vmull_s16(t7, t7);
+ d8 = vmull_s16(t8, t8);
+ d9 = vmull_s16(t9, t9);
+ d10 = vmull_s16(t10, t10);
+ d11 = vmull_s16(t11, t11);
+
+ r12 = vaddq_s32(d1, d2);
+ r34 = vaddq_s32(d3, d4);
+ r67 = vaddq_s32(d6, d7);
+ r89 = vaddq_s32(d8, d9);
+ r1011 = vaddq_s32(d10, d11);
+ r345 = vaddq_s32(r34, d5);
+ r6789 = vaddq_s32(r67, r89);
+ r789 = vsubq_s32(r6789, d6);
+ *r0 = vaddq_s32(r12, r345);
+ *r1 = vaddq_s32(r67, r345);
+ *r2 = vaddq_s32(d5, r6789);
+ *r3 = vaddq_s32(r789, r1011);
+}
+
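+// 5x5 box sums of the source and of the squared source: a vertical 5-tap
+// pass whose outputs are spaced two rows apart, followed by a horizontal
+// 5-tap pass over the transposed intermediate.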
+static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
+ int32_t *dst32, int32_t *dst2, const int dst_stride,
+ const int width, const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *dst1_16_ptr, *src_ptr;
+ int32_t *dst2_ptr;
+ int h, w, count = 0;
+ const int dst_stride_2 = (dst_stride << 1);
+ const int dst_stride_8 = (dst_stride << 3);
+
+ dst1_16_ptr = dst16;
+ dst2_ptr = dst2;
+ src_ptr = src;
+ w = width;
+ {
+ int16x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t t8, t9, t10, t11, t12;
+
+ int16x8_t q12345, q56789, q34567, q7891011;
+ int16x8_t q12, q34, q67, q89, q1011;
+ int16x8_t q345, q6789, q789;
+
+ int32x4_t r12345, r56789, r34567, r7891011;
+
+ do {
+ h = height;
+ dst1_16_ptr = dst16 + (count << 3);
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+
+ dst1_16_ptr += dst_stride_2;
+ dst2_ptr += dst_stride_2;
+ do {
+ load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12);
+
+ q12 = vaddq_s16(t1, t2);
+ q34 = vaddq_s16(t3, t4);
+ q67 = vaddq_s16(t6, t7);
+ q89 = vaddq_s16(t8, t9);
+ q1011 = vaddq_s16(t10, t11);
+ q345 = vaddq_s16(q34, t5);
+ q6789 = vaddq_s16(q67, q89);
+ q789 = vaddq_s16(q89, t7);
+ q12345 = vaddq_s16(q12, q345);
+ q34567 = vaddq_s16(q67, q345);
+ q56789 = vaddq_s16(t5, q6789);
+ q7891011 = vaddq_s16(q789, q1011);
+
+ store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789,
+ q7891011);
+ dst1_16_ptr += dst_stride_8;
+
+ boxsum2_square_sum_calc(
+ vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3),
+ vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6),
+ vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9),
+ vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011);
+
+ boxsum2_square_sum_calc(
+ vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3),
+ vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6),
+ vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9),
+ vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789,
+ r7891011);
+ dst2_ptr += (dst_stride_8);
+ h -= 8;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+ }
+
+ {
+ int16x4_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int32x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int32x4_t q12345, q34567, q23456, q45678;
+ int32x4_t q23, q45, q67;
+ int32x4_t q2345, q4567;
+
+ int32x4_t r12345, r34567, r23456, r45678;
+ int32x4_t r23, r45, r67;
+ int32x4_t r2345, r4567;
+
+ int32_t *src2_ptr, *dst1_32_ptr;
+ int16_t *src1_ptr;
+ count = 0;
+ h = height;
+ do {
+ dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2);
+ dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2);
+ src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ w = width;
+
+ dst1_32_ptr += 2;
+ dst2_ptr += 2;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
+ transpose_s16_4x4d(&s1, &s2, &s3, &s4);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
+ transpose_s32_4x4(&d1, &d2, &d3, &d4);
+ do {
+ src1_ptr += 4;
+ src2_ptr += 4;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
+ transpose_s16_4x4d(&s5, &s6, &s7, &s8);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
+ transpose_s32_4x4(&d5, &d6, &d7, &d8);
+ q23 = vaddl_s16(s2, s3);
+ q45 = vaddl_s16(s4, s5);
+ q67 = vaddl_s16(s6, s7);
+ q2345 = vaddq_s32(q23, q45);
+ q4567 = vaddq_s32(q45, q67);
+ q12345 = vaddq_s32(vmovl_s16(s1), q2345);
+ q23456 = vaddq_s32(q2345, vmovl_s16(s6));
+ q34567 = vaddq_s32(q4567, vmovl_s16(s3));
+ q45678 = vaddq_s32(q4567, vmovl_s16(s8));
+
+ transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678);
+ store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
+ q45678);
+ dst1_32_ptr += 4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+
+ r23 = vaddq_s32(d2, d3);
+ r45 = vaddq_s32(d4, d5);
+ r67 = vaddq_s32(d6, d7);
+ r2345 = vaddq_s32(r23, r45);
+ r4567 = vaddq_s32(r45, r67);
+ r12345 = vaddq_s32(d1, r2345);
+ r23456 = vaddq_s32(r2345, d6);
+ r34567 = vaddq_s32(r4567, d3);
+ r45678 = vaddq_s32(r4567, d8);
+
+ transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678);
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
+ dst2_ptr += 4;
+ d1 = d5;
+ d2 = d6;
+ d3 = d7;
+ d4 = d8;
+ w -= 4;
+ } while (w > 0);
+ h -= 8;
+ count++;
+ } while (h > 0);
+ }
+}
+
+static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
+ uint16_t *B16, int32_t *B,
+ const int buf_stride, const int width,
+ const int height, const int r,
+ const int s, const int ht_inc) {
+ int32_t *src1, *dst2, count = 0;
+ uint16_t *dst_A16, *src2;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);
+
+ uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7;
+
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B16 + (count << 2) * buf_stride;
+ dst2 = B + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3);
+ load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7);
+ load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);
+
+ s16_4 = s16_0;
+ s16_5 = s16_1;
+ s16_6 = s16_2;
+ s16_7 = s16_3;
+
+ calc_ab_internal_common(
+ s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
+ s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);
+
+ w -= 8;
+ dst2 += 8;
+ src1 += 8;
+ src2 += 8;
+ dst_A16 += 8;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);
+ } while (h > 0);
+}
+
+static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
+ uint16_t *B16, int32_t *B,
+ const int buf_stride, const int width,
+ const int height, const int bit_depth,
+ const int r, const int s,
+ const int ht_inc) {
+ int32_t *src1, *dst2, count = 0;
+ uint16_t *dst_A16, *src2;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);
+ const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8));
+ const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
+ const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint16x8_t s16_0, s16_1, s16_2, s16_3;
+ uint16x8_t s16_4, s16_5, s16_6, s16_7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B16 + (count << 2) * buf_stride;
+ dst2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
+ load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7);
+ load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);
+
+ s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);
+ s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
+ s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
+ s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
+ s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec);
+ s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec);
+ s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec);
+ s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec);
+
+ s16_4 = vrshlq_u16(s16_0, bd_min_2_vec);
+ s16_5 = vrshlq_u16(s16_1, bd_min_2_vec);
+ s16_6 = vrshlq_u16(s16_2, bd_min_2_vec);
+ s16_7 = vrshlq_u16(s16_3, bd_min_2_vec);
+
+ calc_ab_internal_common(
+ s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
+ s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);
+
+ w -= 8;
+ dst2 += 8;
+ src1 += 8;
+ src2 += 8;
+ dst_A16 += 8;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);
+ } while (h > 0);
+}
+
+static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
+ int32_t *B, const int buf_stride,
+ const int width, const int height,
+ const int r, const int s,
+ const int ht_inc) {
+ int32_t *src1, *src2, count = 0;
+ uint16_t *dst_A16;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
+ load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);
+
+ s0 = vreinterpretq_u32_s32(sr0);
+ s1 = vreinterpretq_u32_s32(sr1);
+ s2 = vreinterpretq_u32_s32(sr2);
+ s3 = vreinterpretq_u32_s32(sr3);
+ s4 = vreinterpretq_u32_s32(sr4);
+ s5 = vreinterpretq_u32_s32(sr5);
+ s6 = vreinterpretq_u32_s32(sr6);
+ s7 = vreinterpretq_u32_s32(sr7);
+
+ calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
+ sr6, sr7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1,
+ dst_A16, src2, buf_stride);
+
+ w -= 4;
+ src1 += 4;
+ src2 += 4;
+ dst_A16 += 4;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);
+ } while (h > 0);
+}
+
+static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
+ int32_t *B, const int buf_stride,
+ const int width, const int height,
+ const int bit_depth, const int r,
+ const int s, const int ht_inc) {
+ int32_t *src1, *src2, count = 0;
+ uint16_t *dst_A16;
+ const uint32_t n = (2 * r + 1) * (2 * r + 1);
+ const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8));
+ const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
+ const uint32x4_t const_n_val = vdupq_n_u32(n);
+ const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
+ const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(one_by_x[n - 1]);
+ const uint32x4_t const_val = vdupq_n_u32(255);
+
+ int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
+ uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ const uint32x4_t s_vec = vdupq_n_u32(s);
+ int w, h = height;
+
+ do {
+ src1 = A + (count << 2) * buf_stride;
+ src2 = B + (count << 2) * buf_stride;
+ dst_A16 = A16 + (count << 2) * buf_stride;
+ w = width;
+ do {
+ load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
+ load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);
+
+ s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);
+ s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
+ s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
+ s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
+ s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec);
+ s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec);
+ s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec);
+ s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec);
+
+ calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
+ sr6, sr7, const_n_val, s_vec, const_val,
+ one_by_n_minus_1_vec, sgrproj_sgr, src1,
+ dst_A16, src2, buf_stride);
+
+ w -= 4;
+ src1 += 4;
+ src2 += 4;
+ dst_A16 += 4;
+ } while (w > 0);
+ count++;
+ h -= (ht_inc * 4);
+ } while (h > 0);
+}
+
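+// 3x3 box sums of the source and of the squared source: a vertical 3-tap
+// pass followed by a horizontal 3-tap pass over the transposed intermediate.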
+static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
+ int32_t *dst2, const int dst_stride, const int width,
+ const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *src_ptr;
+ int32_t *dst2_ptr;
+ uint16_t *dst1_ptr;
+ int h, w, count = 0;
+
+ w = width;
+ {
+ int16x8_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int16x8_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r345, r456, r567, r78, r678;
+ int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high;
+ int32x4_t r2, r3, r5, r6, r7, r8;
+ int16x8_t q678, q78;
+
+ do {
+ dst1_ptr = dst1 + (count << 3);
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+ h = height;
+
+ load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+ src_ptr += 4 * src_stride;
+
+ q23 = vaddq_s16(s2, s3);
+ q234 = vaddq_s16(q23, s4);
+ q34 = vaddq_s16(s3, s4);
+ dst1_ptr += (dst_stride << 1);
+
+ r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2));
+ r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3));
+ r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_low = vaddq_s32(r23, r4_low);
+ r34_low = vaddq_s32(r3, r4_low);
+
+ r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2));
+ r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3));
+ r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_high = vaddq_s32(r23, r4_high);
+ r34_high = vaddq_s32(r3, r4_high);
+
+ dst2_ptr += (dst_stride << 1);
+
+ do {
+ load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+ src_ptr += 4 * src_stride;
+
+ q345 = vaddq_s16(s5, q34);
+ q56 = vaddq_s16(s5, s6);
+ q456 = vaddq_s16(s4, q56);
+ q567 = vaddq_s16(s7, q56);
+ q78 = vaddq_s16(s7, s8);
+ q678 = vaddq_s16(s6, q78);
+
+ store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += (dst_stride << 2);
+
+ s4 = s8;
+ q34 = q78;
+ q234 = q678;
+
+ r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5));
+ r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6));
+ r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7));
+ r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_low);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_low, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567);
+
+ r4_low = r8;
+ r34_low = r78;
+ r234_low = r678;
+
+ r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5));
+ r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6));
+ r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7));
+ r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_high);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_high, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567);
+ dst2_ptr += (dst_stride << 2);
+
+ r4_high = r8;
+ r34_high = r78;
+ r234_high = r678;
+
+ h -= 4;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+ }
+
+ {
+ int16x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int16x4_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678;
+ int32x4_t r1, r2, r3, r4, r5, r6, r7, r8;
+ int16x4_t q678, q78;
+
+ int32_t *src2_ptr;
+ uint16_t *src1_ptr;
+ count = 0;
+ h = height;
+ w = width;
+ do {
+ dst1_ptr = dst1 + (count << 2) * dst_stride;
+ dst2_ptr = dst2 + (count << 2) * dst_stride;
+ src1_ptr = dst1 + (count << 2) * dst_stride;
+ src2_ptr = dst2 + (count << 2) * dst_stride;
+ w = width;
+
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
+ transpose_s16_4x4d(&d1, &d2, &d3, &d4);
+ load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
+ transpose_s32_4x4(&r1, &r2, &r3, &r4);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q23 = vadd_s16(d2, d3);
+ q234 = vadd_s16(q23, d4);
+ q34 = vadd_s16(d3, d4);
+ dst1_ptr += 2;
+ r23 = vaddq_s32(r2, r3);
+ r234 = vaddq_s32(r23, r4);
+ r34 = vaddq_s32(r3, r4);
+ dst2_ptr += 2;
+
+ do {
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
+ transpose_s16_4x4d(&d5, &d6, &d7, &d8);
+ load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
+ transpose_s32_4x4(&r5, &r6, &r7, &r8);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q345 = vadd_s16(d5, q34);
+ q56 = vadd_s16(d5, d6);
+ q456 = vadd_s16(d4, q56);
+ q567 = vadd_s16(d7, q56);
+ q78 = vadd_s16(d7, d8);
+ q678 = vadd_s16(d6, q78);
+ transpose_s16_4x4d(&q234, &q345, &q456, &q567);
+ store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += 4;
+
+ d4 = d8;
+ q34 = q78;
+ q234 = q678;
+
+ r345 = vaddq_s32(r5, r34);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ transpose_s32_4x4(&r234, &r345, &r456, &r567);
+ store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
+ dst2_ptr += 4;
+
+ r4 = r8;
+ r34 = r78;
+ r234 = r678;
+ w -= 4;
+ } while (w > 0);
+ h -= 4;
+ count++;
+ } while (h > 0);
+ }
+}
+
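+// Weighted 3x3 neighbourhood sums for the final filter: the centre and its
+// horizontal/vertical neighbours get weight 4, the diagonal neighbours get
+// weight 3.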
+static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) {
+ int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl;
+ int32x4_t fours, threes, res;
+
+ xtl = vld1q_s32(buf - buf_stride - 1);
+ xt = vld1q_s32(buf - buf_stride);
+ xtr = vld1q_s32(buf - buf_stride + 1);
+ xl = vld1q_s32(buf - 1);
+ x = vld1q_s32(buf);
+ xr = vld1q_s32(buf + 1);
+ xbl = vld1q_s32(buf + buf_stride - 1);
+ xb = vld1q_s32(buf + buf_stride);
+ xbr = vld1q_s32(buf + buf_stride + 1);
+
+ fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x))));
+ threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl)));
+ res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes);
+ return res;
+}
+
+static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride,
+ int32x4_t *a0, int32x4_t *a1) {
+ uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl;
+ uint16x8_t r0, r1;
+
+ xtl = vld1q_u16(buf - buf_stride - 1);
+ xt = vld1q_u16(buf - buf_stride);
+ xtr = vld1q_u16(buf - buf_stride + 1);
+ xl = vld1q_u16(buf - 1);
+ x = vld1q_u16(buf);
+ xr = vld1q_u16(buf + 1);
+ xbl = vld1q_u16(buf + buf_stride - 1);
+ xb = vld1q_u16(buf + buf_stride);
+ xbr = vld1q_u16(buf + buf_stride + 1);
+
+ xb = vaddq_u16(xb, x);
+ xt = vaddq_u16(xt, xr);
+ xl = vaddq_u16(xl, xb);
+ xl = vaddq_u16(xl, xt);
+
+ r0 = vshlq_n_u16(xl, 2);
+
+ xbl = vaddq_u16(xbl, xbr);
+ xtl = vaddq_u16(xtl, xtr);
+ xtl = vaddq_u16(xtl, xbl);
+
+ r1 = vshlq_n_u16(xtl, 2);
+ r1 = vsubq_u16(r1, xtl);
+
+ *a0 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_low_u16(r0)), vmovl_u16(vget_low_u16(r1))));
+ *a1 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1))));
+}
+
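+// Fast-path cross sums: even rows combine the rows above and below, with
+// weight 5 for the diagonals and 6 for the vertical neighbours; odd rows use
+// only the current row, with weight 5 for left/right and 6 for the centre.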
+static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) {
+ int32x4_t xtr, xt, xtl, xbr, xb, xbl;
+ int32x4_t fives, sixes, fives_plus_sixes;
+
+ xtl = vld1q_s32(buf - buf_stride - 1);
+ xt = vld1q_s32(buf - buf_stride);
+ xtr = vld1q_s32(buf - buf_stride + 1);
+ xbl = vld1q_s32(buf + buf_stride - 1);
+ xb = vld1q_s32(buf + buf_stride);
+ xbr = vld1q_s32(buf + buf_stride + 1);
+
+ fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl)));
+ sixes = vaddq_s32(xt, xb);
+ fives_plus_sixes = vaddq_s32(fives, sixes);
+
+ return vaddq_s32(
+ vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes);
+}
+
+static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride,
+ int32x4_t *a0, int32x4_t *a1) {
+ uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0;
+
+ xtl = vld1q_u16(buf - buf_stride - 1);
+ xt = vld1q_u16(buf - buf_stride);
+ xtr = vld1q_u16(buf - buf_stride + 1);
+ xbl = vld1q_u16(buf + buf_stride - 1);
+ xb = vld1q_u16(buf + buf_stride);
+ xbr = vld1q_u16(buf + buf_stride + 1);
+
+ xbr = vaddq_u16(xbr, xbl);
+ xtr = vaddq_u16(xtr, xtl);
+ xbr = vaddq_u16(xbr, xtr);
+ xtl = vshlq_n_u16(xbr, 2);
+ xbr = vaddq_u16(xtl, xbr);
+
+ xb = vaddq_u16(xb, xt);
+ xb0 = vshlq_n_u16(xb, 1);
+ xb = vshlq_n_u16(xb, 2);
+ xb = vaddq_u16(xb, xb0);
+
+ *a0 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb))));
+ *a1 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb))));
+}
+
+static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) {
+ int32x4_t xl, x, xr;
+ int32x4_t fives, sixes, fives_plus_sixes;
+
+ xl = vld1q_s32(buf - 1);
+ x = vld1q_s32(buf);
+ xr = vld1q_s32(buf + 1);
+ fives = vaddq_s32(xl, xr);
+ sixes = x;
+ fives_plus_sixes = vaddq_s32(fives, sixes);
+
+ return vaddq_s32(
+ vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes);
+}
+
+static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
+ int32x4_t *a1) {
+ uint16x8_t xl, x, xr;
+ uint16x8_t x0;
+
+ xl = vld1q_u16(buf - 1);
+ x = vld1q_u16(buf);
+ xr = vld1q_u16(buf + 1);
+ xl = vaddq_u16(xl, xr);
+ x0 = vshlq_n_u16(xl, 2);
+ xl = vaddq_u16(xl, x0);
+
+ x0 = vshlq_n_u16(x, 1);
+ x = vshlq_n_u16(x, 2);
+ x = vaddq_u16(x, x0);
+
+ *a0 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x))));
+ *a1 = vreinterpretq_s32_u32(
+ vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x))));
+}
+
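+// Final filter for the fast (radius 2) path: each output is
+// (cross_sum(A) * src + cross_sum(B)) rounded and shifted down by
+// SGRPROJ_SGR_BITS + NB_EVEN/NB_ODD - SGRPROJ_RST_BITS, with even and odd
+// rows using their respective cross-sum weights.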
+static void final_filter_fast_internal(uint16_t *A, int32_t *B,
+ const int buf_stride, int16_t *src,
+ const int src_stride, int32_t *dst,
+ const int dst_stride, const int width,
+ const int height) {
+ int16x8_t s0;
+ int32_t *B_tmp, *dst_ptr;
+ uint16_t *A_tmp;
+ int16_t *src_ptr;
+ int32x4_t a_res0, a_res1, b_res0, b_res1;
+ int w, h, count = 0;
+ assert(SGRPROJ_SGR_BITS == 8);
+ assert(SGRPROJ_RST_BITS == 4);
+
+ A_tmp = A;
+ B_tmp = B;
+ src_ptr = src;
+ dst_ptr = dst;
+ h = height;
+ do {
+ A_tmp = (A + count * buf_stride);
+ B_tmp = (B + count * buf_stride);
+ src_ptr = (src + count * src_stride);
+ dst_ptr = (dst + count * dst_stride);
+ w = width;
+ if (!(count & 1)) {
+ do {
+ s0 = vld1q_s16(src_ptr);
+ cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1);
+ a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+ a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+
+ b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride);
+ b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride);
+ a_res0 = vaddq_s32(a_res0, b_res0);
+ a_res1 = vaddq_s32(a_res1, b_res1);
+
+ a_res0 =
+ vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+ a_res1 =
+ vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+
+ vst1q_s32(dst_ptr, a_res0);
+ vst1q_s32(dst_ptr + 4, a_res1);
+
+ A_tmp += 8;
+ B_tmp += 8;
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ } else {
+ do {
+ s0 = vld1q_s16(src_ptr);
+ cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, &a_res1);
+ a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+ a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+
+ b_res0 = cross_sum_fast_odd_row(B_tmp);
+ b_res1 = cross_sum_fast_odd_row(B_tmp + 4);
+ a_res0 = vaddq_s32(a_res0, b_res0);
+ a_res1 = vaddq_s32(a_res1, b_res1);
+
+ a_res0 =
+ vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
+ a_res1 =
+ vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
+
+ vst1q_s32(dst_ptr, a_res0);
+ vst1q_s32(dst_ptr + 4, a_res1);
+
+ A_tmp += 8;
+ B_tmp += 8;
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ count++;
+ h -= 1;
+ } while (h > 0);
+}
+
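+// Final filter for the radius 1 path, using the 4/3-weighted cross sums
+// above for every row.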
+void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
+ int16_t *src, const int src_stride, int32_t *dst,
+ const int dst_stride, const int width,
+ const int height) {
+ int16x8_t s0;
+ int32_t *B_tmp, *dst_ptr;
+ uint16_t *A_tmp;
+ int16_t *src_ptr;
+ int32x4_t a_res0, a_res1, b_res0, b_res1;
+ int w, h, count = 0;
+
+ assert(SGRPROJ_SGR_BITS == 8);
+ assert(SGRPROJ_RST_BITS == 4);
+ h = height;
+
+ do {
+ A_tmp = (A + count * buf_stride);
+ B_tmp = (B + count * buf_stride);
+ src_ptr = (src + count * src_stride);
+ dst_ptr = (dst + count * dst_stride);
+ w = width;
+ do {
+ s0 = vld1q_s16(src_ptr);
+ cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1);
+ a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+ a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+
+ b_res0 = cross_sum_inp_s32(B_tmp, buf_stride);
+ b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride);
+ a_res0 = vaddq_s32(a_res0, b_res0);
+ a_res1 = vaddq_s32(a_res1, b_res1);
+
+ a_res0 =
+ vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+ a_res1 =
+ vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+ vst1q_s32(dst_ptr, a_res0);
+ vst1q_s32(dst_ptr + 4, a_res1);
+
+ A_tmp += 8;
+ B_tmp += 8;
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ count++;
+ h -= 1;
+ } while (h > 0);
+}
+
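+// Radius 2 ("fast") self-guided restoration of one processing unit: box
+// sums, a/b calculation and final filtering over the 16-bit working copy in
+// dgd16.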
+static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
+ int height, int dgd_stride,
+ int32_t *dst, int dst_stride,
+ int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ const int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *square_sum_buf = A_;
+ int32_t *sum_buf = B_;
+ uint16_t *tmp16_buf = A16_;
+
+ assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+ assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+ "Need SGRPROJ_BORDER_* >= r+1");
+
+ assert(radius_idx == 0);
+ assert(r == 2);
+
+ // input(dgd16) is 16bit.
+ // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is
+ // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit
+ // buffer(square_sum_buf).
+ boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+ SGRPROJ_BORDER_HORZ),
+ dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride,
+ width_ext, height_ext);
+
+ square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+ // Calculation of a, b. a output is in 16bit tmp_buf which is in range of
+ // [1, 256] for all bit depths. b output is kept in 32bit buffer.
+
+ if (8 == bit_depth) {
+ calc_ab_fast_internal_lbd(
+ (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+ (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
+ params->s[radius_idx], 2);
+ } else {
+ calc_ab_fast_internal_hbd(
+ (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
+ (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2,
+ bit_depth, r, params->s[radius_idx], 2);
+ }
+ final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16,
+ dgd_stride, dst, dst_stride, width, height);
+}
+
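+// Radius 1 self-guided restoration of one processing unit.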
+static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
+ int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+ uint16_t B16_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *square_sum_buf = A_;
+ uint16_t *sum_buf = B16_;
+ uint16_t *A16 = A16_;
+ int32_t *B = B_;
+
+ assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+ assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+ "Need SGRPROJ_BORDER_* >= r+1");
+
+ assert(radius_idx == 1);
+ assert(r == 1);
+
+ // input(dgd16) is 16bit.
+ // sum of pixels output will be in 16bit(sum_buf).
+ // sum of squares output is kept in 32bit buffer(square_sum_buf).
+ boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+ SGRPROJ_BORDER_HORZ),
+ dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext,
+ height_ext);
+
+ square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+ // Calculation of a, b. a output is in 16bit tmp_buf which is in range of
+ // [1, 256] for all bit depths. b output is kept in 32bit buffer.
+ if (8 == bit_depth) {
+ calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
+ (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+ (B - buf_stride - 1), buf_stride, width + 2,
+ height + 2, r, params->s[radius_idx], 1);
+ } else {
+ calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
+ (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
+ (B - buf_stride - 1), buf_stride, width + 2,
+ height + 2, bit_depth, r, params->s[radius_idx], 1);
+ }
+ final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst,
+ dst_stride, width, height);
+}
+
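+// Widen an 8-bit source block to 16 bits, processing 4 rows x 8 pixels per
+// iteration, with scalar loops handling the remaining right-edge columns and
+// bottom rows.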
+static INLINE void src_convert_u8_to_u16(const uint8_t *src,
+ const int src_stride, uint16_t *dst,
+ const int dst_stride, const int width,
+ const int height) {
+ const uint8_t *src_ptr;
+ uint16_t *dst_ptr;
+ int h, w, count = 0;
+
+ uint8x8_t t1, t2, t3, t4;
+ uint16x8_t s1, s2, s3, s4;
+ h = height;
+ do {
+ src_ptr = src + (count << 2) * src_stride;
+ dst_ptr = dst + (count << 2) * dst_stride;
+ w = width;
+ if (w >= 7) {
+ do {
+ load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+ s1 = vmovl_u8(t1);
+ s2 = vmovl_u8(t2);
+ s3 = vmovl_u8(t3);
+ s4 = vmovl_u8(t4);
+ store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 7);
+ }
+
+ for (int y = 0; y < w; y++) {
+ dst_ptr[y] = src_ptr[y];
+ dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+ dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+ dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+ }
+ count++;
+ h -= 4;
+ } while (h > 3);
+
+ src_ptr = src + (count << 2) * src_stride;
+ dst_ptr = dst + (count << 2) * dst_stride;
+ for (int x = 0; x < h; x++) {
+ for (int y = 0; y < width; y++) {
+ dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
+ }
+ }
+}
+
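+// Copy a high-bitdepth source block 4 rows x 8 pixels at a time, with a
+// scalar tail for the remaining right-edge columns and a memcpy for the
+// remaining bottom rows.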
+static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
+ uint16_t *dst, const int dst_stride,
+ int width, int height) {
+ const uint16_t *src_ptr;
+ uint16_t *dst_ptr;
+ int h, w, count = 0;
+ uint16x8_t s1, s2, s3, s4;
+
+ h = height;
+ do {
+ src_ptr = src + (count << 2) * src_stride;
+ dst_ptr = dst + (count << 2) * dst_stride;
+ w = width;
+ do {
+ load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+ store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 7);
+
+ for (int y = 0; y < w; y++) {
+ dst_ptr[y] = src_ptr[y];
+ dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+ dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+ dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+ }
+ count++;
+ h -= 4;
+ } while (h > 3);
+
+ src_ptr = src + (count << 2) * src_stride;
+ dst_ptr = dst + (count << 2) * dst_stride;
+
+ for (int x = 0; x < h; x++) {
+ memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
+ sizeof(uint16_t) * width);
+ }
+}
+
+int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+ int stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ uint16_t *dgd16 =
+ dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ const int dgd_stride = stride;
+
+ if (highbd) {
+ const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+ src_convert_hbd_copy(
+ dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ } else {
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ }
+
+ if (params->r[0] > 0)
+ restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
+ flt_stride, bit_depth, sgr_params_idx, 0);
+ if (params->r[1] > 0)
+ restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
+ bit_depth, sgr_params_idx, 1);
+ return 0;
+}
+
+void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ uint16_t *dgd16 =
+ dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ const int dgd_stride = stride;
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ if (highbd) {
+ const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+ src_convert_hbd_copy(
+ dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ } else {
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ }
+
+ if (params->r[0] > 0)
+ restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
+ bit_depth, eps, 0);
+ if (params->r[1] > 0)
+ restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
+ bit_depth, eps, 1);
+
+ decode_xq(xqd, xq, params);
+
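+  // Project the filtered planes back onto the source: with u = src <<
+  // SGRPROJ_RST_BITS, compute v = (u << SGRPROJ_PRJ_BITS) +
+  // xq[0] * (flt0 - u) + xq[1] * (flt1 - u), then round-shift v right by
+  // (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS) and clamp to the pixel range.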
+ {
+ int16_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint16_t *dst16_ptr;
+ int16x4_t d0, d4;
+ int16x8_t r0, s0;
+ uint16x8_t r4;
+ int32x4_t u0, u4, v0, v4, f00, f10;
+ uint8x8_t t0;
+ int count = 0, w = width, h = height, rc = 0;
+
+ const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
+ const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
+ const int16x8_t zero = vdupq_n_s16(0);
+ const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+ dst_ptr = dst8;
+ src_ptr = (int16_t *)dgd16;
+ do {
+ w = width;
+ count = 0;
+ dst_ptr = dst8 + rc * dst_stride;
+ dst16_ptr = dst16 + rc * dst_stride;
+ do {
+ s0 = vld1q_s16(src_ptr + count);
+
+ u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
+ u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
+
+ v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
+ v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ f00 = vld1q_s32(flt0 + count);
+ f10 = vld1q_s32(flt0 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq0_vec, f00);
+ v4 = vmlaq_s32(v4, xq0_vec, f10);
+ }
+
+ if (params->r[1] > 0) {
+ f00 = vld1q_s32(flt1 + count);
+ f10 = vld1q_s32(flt1 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq1_vec, f00);
+ v4 = vmlaq_s32(v4, xq1_vec, f10);
+ }
+
+ d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ r0 = vcombine_s16(d0, d4);
+
+ r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+
+ if (highbd) {
+ r4 = vminq_u16(r4, max);
+ vst1q_u16(dst16_ptr, r4);
+ } else {
+ t0 = vqmovn_u16(r4);
+ vst1_u8(dst_ptr, t0);
+ }
+ w -= 8;
+ count += 8;
+ dst_ptr += 8;
+ dst16_ptr += 8;
+ } while (w > 0);
+
+ src_ptr += dgd16_stride;
+ flt1 += width;
+ flt0 += width;
+ rc++;
+ h--;
+ } while (h > 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
new file mode 100644
index 000000000..8a3d9f07f
--- /dev/null
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+ uint8x8_t *a6, uint8x8_t *a7) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+
+ const uint8x16x2_t b0 =
+ vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
+ const uint8x16x2_t b1 =
+ vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint16x4x2_t b0 =
+ vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b0.val[1]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, const uint8x8_t a4,
+ const uint8x8_t a5, const uint8x8_t a6,
+ const uint8x8_t a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 XX XX XX XX
+ // a1: 10 11 12 13 XX XX XX XX
+ // a2: 20 21 22 23 XX XX XX XX
+  // a3: 30 31 32 33 XX XX XX XX
+ // a4: 40 41 42 43 XX XX XX XX
+ // a5: 50 51 52 53 XX XX XX XX
+ // a6: 60 61 62 63 XX XX XX XX
+ // a7: 70 71 72 73 XX XX XX XX
+ // to:
+ // b0.val[0]: 00 01 02 03 40 41 42 43
+ // b1.val[0]: 10 11 12 13 50 51 52 53
+ // b2.val[0]: 20 21 22 23 60 61 62 63
+ // b3.val[0]: 30 31 32 33 70 71 72 73
+
+ const uint32x2x2_t b0 =
+ vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+ const uint32x2x2_t b1 =
+ vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+ const uint32x2x2_t b2 =
+ vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+ const uint32x2x2_t b3 =
+ vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 40 41 60 61
+ // c0.val[1]: 02 03 22 23 42 43 62 63
+ // c1.val[0]: 10 11 30 31 50 51 70 71
+ // c1.val[1]: 12 13 32 33 52 53 72 73
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+ vreinterpret_u16_u32(b2.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+ vreinterpret_u16_u32(b3.val[0]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 01 11 21 31 41 51 61 71
+ // d1.val[0]: 02 12 22 32 42 52 62 72
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+ const uint8x8x2_t d1 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+ *a2 = d1.val[0];
+ *a3 = d1.val[1];
+}
+
+static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
+ uint16x4_t *a2, uint16x4_t *a3,
+ uint16x4_t *a4, uint16x4_t *a5,
+ uint16x4_t *a6, uint16x4_t *a7,
+ uint16x8_t *o0, uint16x8_t *o1,
+ uint16x8_t *o2, uint16x8_t *o3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
+ uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
+ uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
+ uint16x4x2_t b3 = vtrn_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b1.val[0]));
+ uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+ vreinterpret_u32_u16(b1.val[1]));
+ uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
+ vreinterpret_u32_u16(b3.val[0]));
+ uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
+ vreinterpret_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // o0: 00 10 20 30 40 50 60 70
+ // o1: 01 11 21 31 41 51 61 71
+ // o2: 02 12 22 32 42 52 62 72
+ // o3: 03 13 23 33 43 53 63 73
+
+ *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
+ vreinterpret_u16_u32(c2.val[0]));
+ *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
+ vreinterpret_u16_u32(c3.val[0]));
+ *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
+ vreinterpret_u16_u32(c2.val[1]));
+ *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
+ vreinterpret_u16_u32(c3.val[1]));
+}
+
+static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
+ uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5,
+ uint16x8_t *a6, uint16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+ const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+ const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
+ vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
+ *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
+ vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));
+
+ *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
+ vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
+ *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
+ vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));
+
+ *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
+ vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
+ *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
+ vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));
+
+ *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
+ vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
+ *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
+ vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
+}
+
+static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+ const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+ const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
+ const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
+ vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
+ *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
+ vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));
+
+ *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
+ vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
+ *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
+ vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));
+
+ *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
+ vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
+ *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
+ vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));
+
+ *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
+ vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
+ *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
+ vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
+}
+
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+ return b0;
+}
+
+static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
+ const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
+ const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
+ const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ *out = d0.val[0];
+ *(out + 1) = d1.val[0];
+ *(out + 2) = d2.val[0];
+ *(out + 3) = d3.val[0];
+ *(out + 4) = d0.val[1];
+ *(out + 5) = d1.val[1];
+ *(out + 6) = d2.val[1];
+ *(out + 7) = d3.val[1];
+}
+
+static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+ const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ *a0 = vreinterpret_s16_s32(c0.val[0]);
+ *a1 = vreinterpret_s16_s32(c1.val[0]);
+ *a2 = vreinterpret_s16_s32(c0.val[1]);
+ *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+ return b0;
+}
+
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2, int32x4_t *a3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ *a0 = c0.val[0];
+ *a1 = c1.val[0];
+ *a2 = c0.val[1];
+ *a3 = c1.val[1];
+}
+
+#endif // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 000000000..7f02d42a7
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <memory.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+ * Each coefficient is stored in 8 bits instead of 16 bits
+ * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+ This is done in order to avoid overflow: Since the tap with the largest
+ coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ convolve functions.
+
+ Instead, we use the summation order
+ ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the
+ coefficients into the correct order more quickly.
+*/
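+/* Worked example of the summation order above (a sketch, assuming taps
+   f[0..7] applied to pixels p[0..7]): the regular convolve order would be
+   ((f0*p0 + f1*p1) + (f4*p4 + f5*p5)) + ((f2*p2 + f3*p3) + (f6*p6 + f7*p7)),
+   which pairs two of the potentially large taps 2..5 (e.g. f2 with f3),
+   presumably risking overflow of a 16-bit partial sum. The order used here,
+   ((f0*p0 + f2*p2) + (f4*p4 + f6*p6)) + ((f1*p1 + f3*p3) + (f5*p5 + f7*p7)),
+   puts at most one of taps 2..5 in each pair; the 0, 2, 4, 6, 1, 3, 5, 7
+   column order of the table below matches that grouping. */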
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+ filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+
+#else
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
+ // dummy (replicate row index 95)
+ { 0, 0, 4, -3, 0, -1, 127, 1},
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
+
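+// Multiply two 8-pixel/8-coefficient groups in 16 bits, add them
+// elementwise, and pairwise-add the result down to four partial sums.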
+static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
+ uint8x8_t src_1, int16x4_t *res) {
+ int16x8_t coeff_0, coeff_1;
+ int16x8_t pix_0, pix_1;
+
+ coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
+ vreinterpret_s16_s32(x1.val[0]));
+ coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
+ vreinterpret_s16_s32(x1.val[1]));
+
+ pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0));
+ pix_0 = vmulq_s16(coeff_0, pix_0);
+
+ pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1));
+ pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1);
+
+ *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0));
+}
+
+static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
+ uint8x16_t src_3, uint8x16_t src_4,
+ int16x8_t *tmp_dst, int sx, int alpha,
+ int k, const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0,
+ 255, 0, 255, 0, 255, 0, 255, 0 };
+ const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
+ const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
+
+ int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+ int32x2x2_t b0, b1;
+ uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low;
+ int32x4_t tmp_res_low, tmp_res_high;
+ uint16x8_t res;
+ int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd;
+
+ uint8x16_t tmp_0 = vandq_u8(src_1, mask);
+ uint8x16_t tmp_1 = vandq_u8(src_2, mask);
+ uint8x16_t tmp_2 = vandq_u8(src_3, mask);
+ uint8x16_t tmp_3 = vandq_u8(src_4, mask);
+
+ tmp_2 = vextq_u8(tmp_0, tmp_0, 1);
+ tmp_3 = vextq_u8(tmp_1, tmp_1, 1);
+
+ src_1 = vaddq_u8(tmp_0, tmp_2);
+ src_2 = vaddq_u8(tmp_1, tmp_3);
+
+ src_1_low = vget_low_u8(src_1);
+ src_2_low = vget_low_u8(src_2);
+ src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
+ src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
+ src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
+ src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
+
+ // Loading the 8 filter taps
+ f0 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f1 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f2 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f3 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f4 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f5 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f6 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
+ f7 = vmovl_s8(
+ vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
+ vreinterpret_s32_s16(vget_low_s16(f2)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
+ vreinterpret_s32_s16(vget_low_s16(f6)));
+ convolve(b0, b1, src_1_low, src_3_low, &res_0246_even);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
+ vreinterpret_s32_s16(vget_low_s16(f3)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
+ vreinterpret_s32_s16(vget_low_s16(f7)));
+ convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
+ vreinterpret_s32_s16(vget_high_s16(f2)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
+ vreinterpret_s32_s16(vget_high_s16(f6)));
+ convolve(b0, b1, src_2_low, src_4_low, &res_1357_even);
+
+ b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
+ vreinterpret_s32_s16(vget_high_s16(f3)));
+ b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
+ vreinterpret_s32_s16(vget_high_s16(f7)));
+ convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd);
+
+ tmp_res_low = vaddl_s16(res_0246_even, res_1357_even);
+ tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd);
+
+ tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+ tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+ res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
+ res = vqrshlq_u16(res, shift);
+
+ tmp_dst[k + 7] = vreinterpretq_s16_u16(res);
+}
+
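+// Vertical pass for one output row: transpose the 8 rows of
+// horizontal-filter output so that each of the 8 columns can be convolved
+// with its own 8-tap filter (selected by sy + column * gamma), giving eight
+// 32-bit results in *res_low / *res_high.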
+static INLINE void vertical_filter_neon(const int16x8_t *src,
+ int32x4_t *res_low, int32x4_t *res_high,
+ int sy, int gamma) {
+ int16x4_t src_0, src_1, fltr_0, fltr_1;
+ int32x4_t res_0, res_1;
+ int32x2_t res_0_im, res_1_im;
+ int32x4_t res_even, res_odd, im_res_0, im_res_1;
+
+ int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+ int16x8x2_t b0, b1, b2, b3;
+ int32x4x2_t c0, c1, c2, c3;
+ int32x4x2_t d0, d1, d2, d3;
+
+ b0 = vtrnq_s16(src[0], src[1]);
+ b1 = vtrnq_s16(src[2], src[3]);
+ b2 = vtrnq_s16(src[4], src[5]);
+ b3 = vtrnq_s16(src[6], src[7]);
+
+ c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b0.val[1]));
+ c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b2.val[1]));
+ c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ f0 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f1 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f2 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f3 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f4 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f5 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f6 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f7 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2));
+ d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
+ d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3));
+ d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
+
+ // row:0,1 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:0,1,2,3 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:0,1 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:0,1,2,3 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:0,1,2,3 even_col:0,2,4,6
+ im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:4,5 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:4,5,6,7 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:4,5 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:4,5,6,7 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:4,5,6,7 even_col:0,2,4,6
+ im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:0-7 even_col:0,2,4,6
+ res_even = vaddq_s32(im_res_0, im_res_1);
+
+ // row:0,1 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:0,1,2,3 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:0,1 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:0,1,2,3 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:0,1,2,3 odd_col:1,3,5,7
+ im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:4,5 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:4,5,6,7 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:4,5 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:4,5,6,7 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:4,5,6,7 odd_col:1,3,5,7
+ im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:0-7 odd_col:1,3,5,7
+ res_odd = vaddq_s32(im_res_0, im_res_1);
+
+ // reordering as 0 1 2 3 | 4 5 6 7
+ c0 = vtrnq_s32(res_even, res_odd);
+
+ // Final store
+ *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
+ *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
+}
+
+void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ int16x8_t tmp[15];
+ const int bd = 8;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
+ const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
+ const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd));
+
+ int limit = 0;
+ uint8x16_t vec_dup, mask_val;
+ int32x4_t res_lo, res_hi;
+ int16x8_t result_final;
+ uint8x16_t src_1, src_2, src_3, src_4;
+ uint8x16_t indx_vec = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ uint8x16_t cmp_vec;
+
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert);
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16x4_t res_sub_const =
+ vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1))));
+ int k;
+
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ // horizontal
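+      // Four cases: the 16-pixel source window is entirely left of the frame
+      // or entirely right of it (fill tmp with a constant derived from the
+      // edge pixel), partially outside (blend in the nearest edge pixel
+      // before filtering), or fully inside (load and filter directly).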
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int16_t dup_val =
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+
+ tmp[k + 7] = vdupq_n_s16(dup_val);
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz));
+ tmp[k + 7] = vdupq_n_s16(dup_val);
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ src_1 = vld1q_u8(src);
+
+ if (out_of_boundary_left >= 0) {
+ limit = out_of_boundary_left + 1;
+ cmp_vec = vdupq_n_u8(out_of_boundary_left);
+ vec_dup = vdupq_n_u8(*(src + limit));
+ mask_val = vcleq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ if (out_of_boundary_right >= 0) {
+ limit = 15 - (out_of_boundary_right + 1);
+ cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
+ vec_dup = vdupq_n_u8(*(src + limit));
+ mask_val = vcgeq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ src_2 = vextq_u8(src_1, src_1, 1);
+ src_3 = vextq_u8(src_2, src_2, 1);
+ src_4 = vextq_u8(src_3, src_3, 1);
+
+ horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ src_1 = vld1q_u8(src);
+ src_2 = vextq_u8(src_1, src_1, 1);
+ src_3 = vextq_u8(src_2, src_2, 1);
+ src_4 = vextq_u8(src_3, src_3, 1);
+
+ horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ }
+
+ // vertical
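+      // For each output row, run the vertical filter and either accumulate
+      // into the compound destination buffer (with optional weighted
+      // averaging) or round, clamp and store 8-bit pixels directly into pred.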
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ const int16x8_t *v_src = tmp + (k + 4);
+
+ vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
+
+ res_lo = vaddq_s32(res_lo, add_const_vert);
+ res_hi = vaddq_s32(res_hi, add_const_vert);
+
+ if (conv_params->is_compound) {
+ uint16_t *const p =
+ (uint16_t *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+ res_lo = vrshlq_s32(res_lo, shift_vert);
+ if (conv_params->do_average) {
+ uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+ uint16x4_t tmp16_lo = vld1_u16(p);
+ int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
+ int16x4_t tmp16_low;
+ if (conv_params->use_jnt_comp_avg) {
+ res_lo = vmulq_s32(res_lo, bwd);
+ tmp32_lo = vmulq_s32(tmp32_lo, fwd);
+ tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+ tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+ } else {
+ tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+ tmp16_low = vshrn_n_s32(tmp32_lo, 1);
+ }
+ int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
+ res_low = vqrshl_s16(res_low, round_bits_vec);
+ int16x8_t final_res_low = vcombine_s16(res_low, res_low);
+ uint8x8_t res_8_low = vqmovun_s16(final_res_low);
+
+ vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0);
+ } else {
+ uint16x4_t res_u16_low = vqmovun_s32(res_lo);
+ vst1_u16(p, res_u16_low);
+ }
+ if (p_width > 4) {
+ uint16_t *const p4 =
+ (uint16_t *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = vrshlq_s32(res_hi, shift_vert);
+ if (conv_params->do_average) {
+ uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
+
+ uint16x4_t tmp16_hi = vld1_u16(p4);
+ int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
+ int16x4_t tmp16_high;
+ if (conv_params->use_jnt_comp_avg) {
+ res_hi = vmulq_s32(res_hi, bwd);
+ tmp32_hi = vmulq_s32(tmp32_hi, fwd);
+ tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+ tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
+ } else {
+ tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+ tmp16_high = vshrn_n_s32(tmp32_hi, 1);
+ }
+ int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
+ res_high = vqrshl_s16(res_high, round_bits_vec);
+ int16x8_t final_res_high = vcombine_s16(res_high, res_high);
+ uint8x8_t res_8_high = vqmovun_s16(final_res_high);
+
+ vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
+ 0);
+ } else {
+ uint16x4_t res_u16_high = vqmovun_s32(res_hi);
+ vst1_u16(p4, res_u16_high);
+ }
+ }
+ } else {
+ res_lo = vrshlq_s32(res_lo, shift_vert);
+ res_hi = vrshlq_s32(res_hi, shift_vert);
+
+ result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
+ result_final = vsubq_s16(result_final, sub_constant);
+
+ uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+ uint8x8_t val = vqmovun_s16(result_final);
+
+ if (p_width == 4) {
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+ } else {
+ vst1_u8(p, val);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
new file mode 100644
index 000000000..a9bb5bcf0
--- /dev/null
+++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+/* Wiener filter 2D
+   Apply the horizontal filter and store the result in a temporary buffer.
+   When applying the vertical filter, overwrite the original pixel values.
+ */
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const ConvolveParams *conv_params) {
+ uint16_t *d_tmp;
+ uint8_t *d;
+ const uint8_t *src_ptr, *s_tmp;
+ uint16_t *dst_ptr;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ int width, height;
+ const int bd = 8;
+ const int intermediate_height = h + SUBPEL_TAPS - 1;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ int16_t filter_x_tmp[7], filter_y_tmp[7];
+
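+  /* temp holds the 16-bit output of the horizontal pass: intermediate_height
+     rows of width w, stored with a row stride of MAX_SB_SIZE. */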
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w % 8));
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+
+ assert(filter_x[7] == 0);
+ assert(filter_y[7] == 0);
+
+  /* Assumption: the horizontal filtering output will not exceed 15 bits:
+       (bd) + 1 + FILTER_BITS - conv_params->round_0 <= 15
+       16 - conv_params->round_0 <= 15  =>  conv_params->round_0 >= 1
+   */
+ assert((conv_params->round_0) >= 1);
+
+ memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS);
+ memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS);
+
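+  /* Adding (1 << FILTER_BITS) to the centre tap folds the "+ src" term of
+     the Wiener filter into the convolution, as the function name implies. */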
+ filter_x_tmp[3] += (1 << FILTER_BITS);
+ filter_y_tmp[3] += (1 << FILTER_BITS);
+
+ s_tmp = src - center_tap * src_stride - center_tap;
+ dst_ptr = temp;
+ src_ptr = s_tmp;
+ height = intermediate_height;
+
+ /* if height is a multiple of 8 */
+ if (!(h & 7)) {
+ int16x8_t res0, res1, res2, res3;
+ uint16x8_t res4;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+#if defined(__aarch64__)
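+    /* The AArch64 path filters 8 rows at a time using 8x8 transposes; the
+       32-bit path below processes one row per iteration. */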
+ uint16x8_t res5, res6, res7, res8, res9, res10, res11;
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14;
+
+ do {
+ const uint8_t *s;
+
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
+ width = w;
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
+ transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
+ &res11);
+ store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
+ res10, res11);
+
+ t0 = t8;
+ t1 = t9;
+ t2 = t10;
+ t3 = t11;
+ t4 = t12;
+ t5 = t13;
+ t6 = t14;
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * MAX_SB_SIZE;
+ height -= 8;
+ } while (height > 0);
+#else
+ uint8x8_t temp_0;
+
+ do {
+ const uint8_t *s;
+
+ __builtin_prefetch(src_ptr);
+
+ t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+ width = w;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ temp_0 = t0;
+ t0 = t7;
+
+ t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+ res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+ bd, conv_params->round_0);
+
+ vst1q_u16(d_tmp, res4);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += MAX_SB_SIZE;
+ height--;
+ } while (height > 0);
+#endif
+ } else {
+    /* height is a multiple of 4 (but not a multiple of 8) */
+ const uint8_t *s;
+ int16x8_t tt0, tt1, tt2, tt3;
+ uint16x8_t d0;
+ uint8x8_t t0, t1, t2, t3;
+
+#if defined(__aarch64__)
+ uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
+ uint16x8_t d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int16x4_t s11, s12, s13, s14;
+ do {
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+
+ load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
+ transpose_u8_8x4(&t0, &t1, &t2,
+ &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/
+
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ s0 = vget_low_s16(tt0); /*pa0 pb0 pc0 pd0 -- pixel_a0*/
+ s1 = vget_low_s16(tt1); /*pa1 pb1 pc1 pd1 */
+ s2 = vget_low_s16(tt2); /*pa2 pb2 pc2 pd2 */
+ s3 = vget_low_s16(tt3); /*pa3 pb3 pc3 pd3 */
+ s4 = vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */
+ s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */
+ s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
+ width = w;
+
+ do {
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ s7 = vget_low_s16(tt0); /*pa7 pb7 pc7 pd7 */ /*4x8*/
+ s8 = vget_low_s16(tt1); /*pa8 pb8 pc8 pd8 */
+ s9 = vget_low_s16(tt2); /*pa9 pb9 pc9 pd9 */
+ s10 = vget_low_s16(tt3); /*pa10 pb10 pc10 pd10 */
+ s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */
+ s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */
+ s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */
+ s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */
+
+ res0 = wiener_convolve8_horiz_4x8(
+ s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0);
+ res1 = wiener_convolve8_horiz_4x8(
+ s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0);
+ res2 = wiener_convolve8_horiz_4x8(
+ s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0);
+ res3 = wiener_convolve8_horiz_4x8(
+ s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0);
+ res4 =
+ wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10,
+ filter_x_tmp, bd, conv_params->round_0);
+ res5 =
+ wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11,
+ filter_x_tmp, bd, conv_params->round_0);
+ res6 =
+ wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12,
+ filter_x_tmp, bd, conv_params->round_0);
+ res7 =
+ wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13,
+ filter_x_tmp, bd, conv_params->round_0);
+
+ transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7, &d0, &d1, &d2, &d3);
+
+ store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * MAX_SB_SIZE;
+ height -= 4;
+ } while (height > 0);
+#else
+ uint8x8_t temp_0, t4, t5, t6, t7;
+
+ do {
+ __builtin_prefetch(src_ptr);
+
+ t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ __builtin_prefetch(dst_ptr);
+
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+ width = w;
+
+ do {
+ t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ temp_0 = t0;
+ t0 = t7;
+
+ t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+ tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+ tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
+ conv_params->round_0);
+
+ vst1q_u16(d_tmp, d0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+
+ src_ptr += src_stride;
+ dst_ptr += MAX_SB_SIZE;
+ height -= 1;
+ } while (height > 0);
+#endif
+ }
+
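+  /* Vertical pass: filter the 16-bit intermediate rows in temp down the
+     columns and write 8-bit output rows to dst. */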
+ {
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t t1, t2, t3;
+#endif
+ int16_t *src_tmp_ptr, *s;
+ uint8_t *dst_tmp_ptr;
+ height = h;
+ width = w;
+ src_tmp_ptr = (int16_t *)temp;
+ dst_tmp_ptr = dst;
+ src_stride = MAX_SB_SIZE;
+
+ do {
+ s = src_tmp_ptr;
+ s0 = vld1q_s16(s);
+ s += src_stride;
+ s1 = vld1q_s16(s);
+ s += src_stride;
+ s2 = vld1q_s16(s);
+ s += src_stride;
+ s3 = vld1q_s16(s);
+ s += src_stride;
+ s4 = vld1q_s16(s);
+ s += src_stride;
+ s5 = vld1q_s16(s);
+ s += src_stride;
+ s6 = vld1q_s16(s);
+ s += src_stride;
+ d = dst_tmp_ptr;
+ height = h;
+
+#if defined(__aarch64__)
+ do {
+ __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);
+
+ s7 = vld1q_s16(s);
+ s += src_stride;
+ s8 = vld1q_s16(s);
+ s += src_stride;
+ s9 = vld1q_s16(s);
+ s += src_stride;
+ s10 = vld1q_s16(s);
+ s += src_stride;
+
+ t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+ bd, conv_params->round_1);
+ t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
+ bd, conv_params->round_1);
+ t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
+ bd, conv_params->round_1);
+ t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
+ bd, conv_params->round_1);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+ vst1_u8(d, t1);
+ d += dst_stride;
+ vst1_u8(d, t2);
+ d += dst_stride;
+ vst1_u8(d, t3);
+ d += dst_stride;
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+ } while (height > 3);
+
+ if (height != 0) {
+ __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
+
+ do {
+ s7 = vld1q_s16(s);
+ s += src_stride;
+
+ t0 =
+ wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
+ filter_y_tmp, bd, conv_params->round_1);
+ vst1_u8(d, t0);
+ d += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+ } while (height > 0);
+ }
+
+ src_tmp_ptr += 8;
+ dst_tmp_ptr += 8;
+
+ w -= 8;
+ } while (w > 0);
+#else
+ do {
+ __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+
+ s7 = vld1q_s16(s);
+ s += src_stride;
+
+ t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+ bd, conv_params->round_1);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+ } while (height > 0);
+
+ src_tmp_ptr += 8;
+ dst_tmp_ptr += 8;
+
+ w -= 8;
+ } while (w > 0);
+#endif
+ }
+}
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
new file mode 100644
index 000000000..7ef2d6d7f
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -0,0 +1,1846 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+// TODO(angiebird): Make 1-d txfm functions static
+//
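+// The transforms below share the same structure: each stage ping-pongs
+// between the output[] and step[] buffers, half_btf() computes a weighted
+// butterfly of two inputs rounded by cos_bit, and clamp_value() /
+// av1_range_check_buf() keep intermediates within the per-stage bit range
+// given by stage_range[].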
+
+void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 4;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0;
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[2];
+ bf1[2] = input[1];
+ bf1[3] = input[3];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+}
+
+void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 8;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[4];
+ bf1[2] = input[2];
+ bf1[3] = input[6];
+ bf1[4] = input[1];
+ bf1[5] = input[5];
+ bf1[6] = input[3];
+ bf1[7] = input[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+}
+
+void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 16;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0;
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[8];
+ bf1[2] = input[4];
+ bf1[3] = input[12];
+ bf1[4] = input[2];
+ bf1[5] = input[10];
+ bf1[6] = input[6];
+ bf1[7] = input[14];
+ bf1[8] = input[1];
+ bf1[9] = input[9];
+ bf1[10] = input[5];
+ bf1[11] = input[13];
+ bf1[12] = input[3];
+ bf1[13] = input[11];
+ bf1[14] = input[7];
+ bf1[15] = input[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+}
+
+void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 32;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0;
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[16];
+ bf1[2] = input[8];
+ bf1[3] = input[24];
+ bf1[4] = input[4];
+ bf1[5] = input[20];
+ bf1[6] = input[12];
+ bf1[7] = input[28];
+ bf1[8] = input[2];
+ bf1[9] = input[18];
+ bf1[10] = input[10];
+ bf1[11] = input[26];
+ bf1[12] = input[6];
+ bf1[13] = input[22];
+ bf1[14] = input[14];
+ bf1[15] = input[30];
+ bf1[16] = input[1];
+ bf1[17] = input[17];
+ bf1[18] = input[9];
+ bf1[19] = input[25];
+ bf1[20] = input[5];
+ bf1[21] = input[21];
+ bf1[22] = input[13];
+ bf1[23] = input[29];
+ bf1[24] = input[3];
+ bf1[25] = input[19];
+ bf1[26] = input[11];
+ bf1[27] = input[27];
+ bf1[28] = input[7];
+ bf1[29] = input[23];
+ bf1[30] = input[15];
+ bf1[31] = input[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
+ bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
+ bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
+ bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
+ bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
+}
+
+void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ int bit = cos_bit;
+ const int32_t *sinpi = sinpi_arr(bit);
+ int32_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ int32_t x0 = input[0];
+ int32_t x1 = input[1];
+ int32_t x2 = input[2];
+ int32_t x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ assert(sinpi[1] + sinpi[2] == sinpi[4]);
+
+ // stage 1
+ s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
+ s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
+ s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
+ s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
+ s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
+ s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
+ s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);
+
+ // stage 2
+ // NOTICE: (x0 - x2) here may use one extra bit compared to the
+ // opt_range_row/col specified in av1_gen_inv_stage_range()
+ s7 = range_check_value((x0 - x2) + x3, stage_range[2]);
+
+ // stage 3
+ s0 = range_check_value(s0 + s3, stage_range[3] + bit);
+ s1 = range_check_value(s1 - s4, stage_range[3] + bit);
+ s3 = range_check_value(s2, stage_range[3] + bit);
+ s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);
+
+ // stage 4
+ s0 = range_check_value(s0 + s5, stage_range[4] + bit);
+ s1 = range_check_value(s1 - s6, stage_range[4] + bit);
+
+ // stage 5
+ x0 = range_check_value(s0 + s3, stage_range[5] + bit);
+ x1 = range_check_value(s1 + s3, stage_range[5] + bit);
+ x2 = range_check_value(s2, stage_range[5] + bit);
+ x3 = range_check_value(s0 + s1, stage_range[5] + bit);
+
+ // stage 6
+ x3 = range_check_value(x3 - s3, stage_range[6] + bit);
+
+ output[0] = round_shift(x0, bit);
+ output[1] = round_shift(x1, bit);
+ output[2] = round_shift(x2, bit);
+ output[3] = round_shift(x3, bit);
+}
+
+void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 8;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[7];
+ bf1[1] = input[0];
+ bf1[2] = input[5];
+ bf1[3] = input[2];
+ bf1[4] = input[3];
+ bf1[5] = input[4];
+ bf1[6] = input[1];
+ bf1[7] = input[6];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[4];
+ bf1[2] = bf0[6];
+ bf1[3] = -bf0[2];
+ bf1[4] = bf0[3];
+ bf1[5] = -bf0[7];
+ bf1[6] = bf0[5];
+ bf1[7] = -bf0[1];
+}
+
+void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 16;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0;
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[15];
+ bf1[1] = input[0];
+ bf1[2] = input[13];
+ bf1[3] = input[2];
+ bf1[4] = input[11];
+ bf1[5] = input[4];
+ bf1[6] = input[9];
+ bf1[7] = input[6];
+ bf1[8] = input[7];
+ bf1[9] = input[8];
+ bf1[10] = input[5];
+ bf1[11] = input[10];
+ bf1[12] = input[3];
+ bf1[13] = input[12];
+ bf1[14] = input[1];
+ bf1[15] = input[14];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[8];
+ bf1[2] = bf0[12];
+ bf1[3] = -bf0[4];
+ bf1[4] = bf0[6];
+ bf1[5] = -bf0[14];
+ bf1[6] = bf0[10];
+ bf1[7] = -bf0[2];
+ bf1[8] = bf0[3];
+ bf1[9] = -bf0[11];
+ bf1[10] = bf0[15];
+ bf1[11] = -bf0[7];
+ bf1[12] = bf0[5];
+ bf1[13] = -bf0[13];
+ bf1[14] = bf0[9];
+ bf1[15] = -bf0[1];
+}
+
+void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ (void)stage_range;
+ for (int i = 0; i < 4; ++i) {
+ output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits);
+ }
+ assert(stage_range[0] + NewSqrt2Bits <= 32);
+}
+
+void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ (void)stage_range;
+ for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2);
+}
+
+void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ (void)stage_range;
+ for (int i = 0; i < 16; ++i)
+ output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits);
+ assert(stage_range[0] + NewSqrt2Bits <= 32);
+}
+
+void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ (void)stage_range;
+ for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4);
+}
+
+void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 64;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+  // stage 0
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[32];
+ bf1[2] = input[16];
+ bf1[3] = input[48];
+ bf1[4] = input[8];
+ bf1[5] = input[40];
+ bf1[6] = input[24];
+ bf1[7] = input[56];
+ bf1[8] = input[4];
+ bf1[9] = input[36];
+ bf1[10] = input[20];
+ bf1[11] = input[52];
+ bf1[12] = input[12];
+ bf1[13] = input[44];
+ bf1[14] = input[28];
+ bf1[15] = input[60];
+ bf1[16] = input[2];
+ bf1[17] = input[34];
+ bf1[18] = input[18];
+ bf1[19] = input[50];
+ bf1[20] = input[10];
+ bf1[21] = input[42];
+ bf1[22] = input[26];
+ bf1[23] = input[58];
+ bf1[24] = input[6];
+ bf1[25] = input[38];
+ bf1[26] = input[22];
+ bf1[27] = input[54];
+ bf1[28] = input[14];
+ bf1[29] = input[46];
+ bf1[30] = input[30];
+ bf1[31] = input[62];
+ bf1[32] = input[1];
+ bf1[33] = input[33];
+ bf1[34] = input[17];
+ bf1[35] = input[49];
+ bf1[36] = input[9];
+ bf1[37] = input[41];
+ bf1[38] = input[25];
+ bf1[39] = input[57];
+ bf1[40] = input[5];
+ bf1[41] = input[37];
+ bf1[42] = input[21];
+ bf1[43] = input[53];
+ bf1[44] = input[13];
+ bf1[45] = input[45];
+ bf1[46] = input[29];
+ bf1[47] = input[61];
+ bf1[48] = input[3];
+ bf1[49] = input[35];
+ bf1[50] = input[19];
+ bf1[51] = input[51];
+ bf1[52] = input[11];
+ bf1[53] = input[43];
+ bf1[54] = input[27];
+ bf1[55] = input[59];
+ bf1[56] = input[7];
+ bf1[57] = input[39];
+ bf1[58] = input[23];
+ bf1[59] = input[55];
+ bf1[60] = input[15];
+ bf1[61] = input[47];
+ bf1[62] = input[31];
+ bf1[63] = input[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
+ bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
+ bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
+ bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
+ bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
+ bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
+ bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
+ bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
+ bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
+ bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
+ bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
+ bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
+ bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
+ bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
+ bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
+ bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
+ bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
+ bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
+ bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
+ bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
+ bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
+ bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
+ bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
+ bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
+ bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
+ bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
+ bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
+ bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
+ bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
+ bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
+ bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
+ bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
+ bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
+ bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
+ bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
+ bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
+ bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
+ bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
+ bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
+ bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
+ bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
+ bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
+ bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
+ bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
+ bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
+ bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
+ bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
+ bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
+ bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
+ bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
+ bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
+ bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
+ bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
+ bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
+ bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
+ bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
+ bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
+ bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
+ bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
+ bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
+ bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
+ bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
+}
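Each 1-D kernel in this file follows the same pattern: values ping-pong between the output and step buffers, pure add/subtract stages are saturated with clamp_value() to the width given by stage_range[stage], and rotation stages mix two inputs with half_btf(). A minimal sketch of that rotation, assuming the round-shift definition from av1/common/av1_txfm.h (the _sketch name is only for illustration and is not part of the patch):

    #include <stdint.h>

    // Butterfly rotation used by every stage above:
    //   half_btf(w0, in0, w1, in1, bit) = (w0*in0 + w1*in1 + 2^(bit-1)) >> bit
    // where w0/w1 are cospi[] weights, i.e. cosines scaled by 2^bit.
    static inline int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                          int32_t in1, int bit) {
      const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
      return (int32_t)((sum + (1LL << (bit - 1))) >> bit);
    }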
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
new file mode 100644
index 000000000..c31c019aa
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE int32_t clamp_value(int32_t value, int8_t bit) {
+ if (bit <= 0) return value; // Do nothing for invalid clamp bit.
+ const int64_t max_value = (1LL << (bit - 1)) - 1;
+ const int64_t min_value = -(1LL << (bit - 1));
+ return (int32_t)clamp64(value, min_value, max_value);
+}
+
+static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) {
+ for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit);
+}
+
+void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
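A quick usage note for the clamping helpers declared above (values chosen purely for illustration):

    // clamp_value(v, bit) saturates v to the signed `bit`-bit range
    // [-(2^(bit-1)), 2^(bit-1) - 1]; bit <= 0 leaves v untouched.
    // clamp_buf() applies the same saturation element-wise:
    int32_t buf[4] = { 40000, -40000, 123, 0 };
    clamp_buf(buf, 4, 16);  // -> { 32767, -32768, 123, 0 }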
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
new file mode 100644
index 000000000..7d80a0099
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#include "av1/common/av1_inv_txfm1d.h"
+
+// sum of fwd_shift_##
+static const int8_t inv_start_range[TX_SIZES_ALL] = {
+ 5, // 4x4 transform
+ 6, // 8x8 transform
+ 7, // 16x16 transform
+ 7, // 32x32 transform
+ 7, // 64x64 transform
+ 5, // 4x8 transform
+ 5, // 8x4 transform
+ 6, // 8x16 transform
+ 6, // 16x8 transform
+ 6, // 16x32 transform
+ 6, // 32x16 transform
+ 6, // 32x64 transform
+ 6, // 64x32 transform
+ 6, // 4x16 transform
+ 6, // 16x4 transform
+ 7, // 8x32 transform
+ 7, // 32x8 transform
+ 7, // 16x64 transform
+ 7, // 64x16 transform
+};
+
+extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
+
+// Values in both inv_cos_bit_col and inv_cos_bit_row are always 12
+// for each valid row and col combination
+#define INV_COS_BIT 12
+extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
+extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
+
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
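INV_COS_BIT above is the fixed-point precision of the cospi[] weights consumed by the 1-D kernels; cospi_arr(cos_bit) is assumed (per av1/common/av1_txfm.h) to return a 64-entry table of cosines scaled by 2^cos_bit. A rough, purely illustrative offline generator for such a table:

    #include <math.h>
    #include <stdint.h>

    // Entry i approximates cos(i*pi/128) * 2^cos_bit, so for cos_bit == 12
    // table[32] ~= 0.7071 * 4096 = 2896 (the cos(pi/4) weight used above).
    static void gen_cospi_sketch(int32_t table[64], int cos_bit) {
      const double kPi = 3.14159265358979323846;
      for (int i = 0; i < 64; ++i)
        table[i] =
            (int32_t)lround(cos(i * kPi / 128.0) * (double)(1 << cos_bit));
    }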
diff --git a/third_party/aom/av1/common/av1_inv_txfm2d.c b/third_party/aom/av1/common/av1_inv_txfm2d.c
new file mode 100644
index 000000000..4e6944314
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm2d.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_low_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+
+ op[0] = a1;
+ op[1] = b1;
+ op[2] = c1;
+ op[3] = d1;
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+
+ range_check_value(a1, bd + 1);
+ range_check_value(b1, bd + 1);
+ range_check_value(c1, bd + 1);
+ range_check_value(d1, bd + 1);
+
+ dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
+ dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
+ dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
+ dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+
+ ip++;
+ dest++;
+ }
+}
+
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_low_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ (void)bd;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = a1;
+ op[1] = op[2] = op[3] = e1;
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] =
+ highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
+ dest[dest_stride * 1] =
+ highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
+ dest[dest_stride * 2] =
+ highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
+ dest[dest_stride * 3] =
+ highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
+ ip++;
+ dest++;
+ }
+}
+
+static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_idct4_new;
+ case TXFM_TYPE_DCT8: return av1_idct8_new;
+ case TXFM_TYPE_DCT16: return av1_idct16_new;
+ case TXFM_TYPE_DCT32: return av1_idct32_new;
+ case TXFM_TYPE_DCT64: return av1_idct64_new;
+ case TXFM_TYPE_ADST4: return av1_iadst4_new;
+ case TXFM_TYPE_ADST8: return av1_iadst8_new;
+ case TXFM_TYPE_ADST16: return av1_iadst16_new;
+ case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c;
+ case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c;
+ case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c;
+ case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c;
+ default: assert(0); return NULL;
+ }
+}
+
+static const int8_t inv_shift_4x4[2] = { 0, -4 };
+static const int8_t inv_shift_8x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x16[2] = { -2, -4 };
+static const int8_t inv_shift_32x32[2] = { -2, -4 };
+static const int8_t inv_shift_64x64[2] = { -2, -4 };
+static const int8_t inv_shift_4x8[2] = { 0, -4 };
+static const int8_t inv_shift_8x4[2] = { 0, -4 };
+static const int8_t inv_shift_8x16[2] = { -1, -4 };
+static const int8_t inv_shift_16x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x32[2] = { -1, -4 };
+static const int8_t inv_shift_32x16[2] = { -1, -4 };
+static const int8_t inv_shift_32x64[2] = { -1, -4 };
+static const int8_t inv_shift_64x32[2] = { -1, -4 };
+static const int8_t inv_shift_4x16[2] = { -1, -4 };
+static const int8_t inv_shift_16x4[2] = { -1, -4 };
+static const int8_t inv_shift_8x32[2] = { -2, -4 };
+static const int8_t inv_shift_32x8[2] = { -2, -4 };
+static const int8_t inv_shift_16x64[2] = { -2, -4 };
+static const int8_t inv_shift_64x16[2] = { -2, -4 };
+
+const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
+ inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32,
+ inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16,
+ inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64,
+ inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32,
+ inv_shift_32x8, inv_shift_16x64, inv_shift_64x16,
+};
+
+/* clang-format off */
+const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx
+ [MAX_TXWH_IDX] = { // txh_idx
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 },
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 },
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+ { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+ { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
+ };
+
+const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx
+ [MAX_TXWH_IDX] = { // txh_idx
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 },
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 },
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+ { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+ { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
+ };
+/* clang-format on */
+
+const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
+
+void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg) {
+ assert(cfg != NULL);
+ cfg->tx_size = tx_size;
+  av1_zero(cfg->stage_range_col);
+  av1_zero(cfg->stage_range_row);
+  set_flip_cfg(tx_type, cfg);
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ cfg->shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
+ if (cfg->txfm_type_col == TXFM_TYPE_ADST4) {
+ memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
+ }
+ cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
+ if (cfg->txfm_type_row == TXFM_TYPE_ADST4) {
+ memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range));
+ }
+ cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+ cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+}
+
+void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
+ int bd) {
+ const int fwd_shift = inv_start_range[tx_size];
+ const int8_t *shift = cfg->shift;
+ int8_t opt_range_row, opt_range_col;
+ if (bd == 8) {
+ opt_range_row = 16;
+ opt_range_col = 16;
+ } else if (bd == 10) {
+ opt_range_row = 18;
+ opt_range_col = 16;
+ } else {
+ assert(bd == 12);
+ opt_range_row = 20;
+ opt_range_col = 18;
+ }
+ // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
+ for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+ int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1;
+ (void)real_range_row;
+ if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) {
+      // The adst4 may use 1 extra bit on top of opt_range_row at stage 1,
+      // so opt_range_row >= real_range_row will not hold.
+ stage_range_row[i] = opt_range_row;
+ } else {
+ assert(opt_range_row >= real_range_row);
+ stage_range_row[i] = opt_range_row;
+ }
+ }
+ // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
+ for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+ int real_range_col =
+ cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
+ (void)real_range_col;
+ if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) {
+      // The adst4 may use 1 extra bit on top of opt_range_col at stage 1,
+      // so opt_range_col >= real_range_col will not hold.
+ stage_range_col[i] = opt_range_col;
+ } else {
+ assert(opt_range_col >= real_range_col);
+ stage_range_col[i] = opt_range_col;
+ }
+ }
+}
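As a worked example of the bookkeeping above (assuming a 16x16 transform, so inv_start_range[] gives fwd_shift == 7, at bd == 10 with stage_range_row[i] == 0):

    // real_range_row = stage_range_row[i] + fwd_shift + bd + 1
    //                = 0 + 7 + 10 + 1 = 18
    // opt_range_row  = 18 for bd == 10, so the assert holds and the row
    // stages are clamped to an 18-bit signed range.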
+
+static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
+ int stride, TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf, TX_SIZE tx_size,
+ int bd) {
+ // Note when assigning txfm_size_col, we use the txfm_size from the
+ // row configuration and vice versa. This is intentionally done to
+ // accurately perform rectangular transforms. When the transform is
+ // rectangular, the number of columns will be the same as the
+ // txfm_size stored in the row cfg struct. It will make no difference
+ // for square transforms.
+ const int txfm_size_col = tx_size_wide[cfg->tx_size];
+ const int txfm_size_row = tx_size_high[cfg->tx_size];
+ // Take the shift from the larger dimension in the rectangular case.
+ const int8_t *shift = cfg->shift;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+ assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+ av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);
+
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
+
+ // txfm_buf's length is txfm_size_row * txfm_size_col + 2 *
+ // AOMMAX(txfm_size_row, txfm_size_col)
+ // it is used for intermediate data buffering
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
+ int32_t *temp_in = txfm_buf;
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ int c, r;
+
+ // Rows
+ for (r = 0; r < txfm_size_row; ++r) {
+ if (abs(rect_type) == 1) {
+ for (c = 0; c < txfm_size_col; ++c) {
+ temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits);
+ }
+ clamp_buf(temp_in, txfm_size_col, bd + 8);
+ txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
+ } else {
+ for (c = 0; c < txfm_size_col; ++c) {
+ temp_in[c] = input[c];
+ }
+ clamp_buf(temp_in, txfm_size_col, bd + 8);
+ txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ // Columns
+ for (c = 0; c < txfm_size_col; ++c) {
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16));
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output,
+ int stride, int32_t *txfm_buf,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int bd) {
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg);
+ // Forward shift sum uses larger square size, to be consistent with what
+ // av1_gen_inv_stage_range() does for inverse shifts.
+ inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd);
+}
+
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd);
+}
+
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd);
+}
+
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd);
+}
+
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd);
+}
+
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd);
+}
+
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd);
+}
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd);
+}
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd);
+}
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd);
+}
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd);
+}
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // TODO(urvang): Can the same array be reused, instead of using a new array?
+ // Remap 32x32 input into a modified 64x64 by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[64 * 64];
+ for (int row = 0; row < 32; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input));
+ DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64,
+ bd);
+}
+
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // Remap 32x32 input into a modified 64x32 by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[64 * 32];
+ for (int row = 0; row < 32; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32,
+ bd);
+}
+
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // Remap 32x32 input into a modified 32x64 input by:
+ // - Copying over these values in top-left 32x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[32 * 64];
+ memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input));
+ memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input));
+ DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64,
+ bd);
+}
+
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // Remap 16x32 input into a modified 16x64 input by:
+ // - Copying over these values in top-left 16x32 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[16 * 64];
+ memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input));
+ memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input));
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64,
+ bd);
+}
+
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ // Remap 32x16 input into a modified 64x16 by:
+ // - Copying over these values in top-left 32x16 locations.
+ // - Setting the rest of the locations to 0.
+ int32_t mod_input[64 * 16];
+ for (int row = 0; row < 16; ++row) {
+ memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input));
+ memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
+ }
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16,
+ bd);
+}
+
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd);
+}
+
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd);
+}
+
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd);
+}
+
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd);
+}
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
new file mode 100644
index 000000000..537d8dfe9
--- /dev/null
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -0,0 +1,2377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+
+static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
+ { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
+ { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
+ { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
+};
+
+static const int delta_lf_id_lut[MAX_MB_PLANE][2] = {
+ { 0, 1 }, { 2, 2 }, { 3, 3 }
+};
+
+typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
+
+static const int mode_lf_lut[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
+ 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0)
+ 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0)
+};
+
+#if LOOP_FILTER_BITMASK
+// 256-bit masks (64x64 / 4x4) for the left transform size for the Y plane.
+// We use 4 uint64_t values to represent the 256 bits.
+// Each 1 represents a position where we should apply a loop filter
+// across the left border of a 4x4 block boundary.
+//
+// In the case of TX_8X8, reading from the low-order byte first, we end up
+// with a mask that looks like this (-- and | are used for better view):
+//
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// -----------------
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+
+// 256-bit masks (64x64 / 4x4) for the above transform size for the Y plane.
+// We use 4 uint64_t values to represent the 256 bits.
+// Each 1 represents a position where we should apply a loop filter
+// across the top border of a 4x4 block boundary.
+//
+// In the case of TX_8X8, reading from the low-order byte first, we end up
+// with a mask that looks like this:
+//
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// -----------------
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+//
+// A loopfilter should be applied to every other 4x4 row vertically.
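+//
+// Stored as four uint64_t words (see the tables below), the 64X64 / TX_8X8
+// left mask pictured earlier is 0x5555555555555555ULL in every word: bit 0
+// of each 16-bit row is the leftmost 4x4 column. The corresponding above
+// mask is 0x0000ffff0000ffffULL per word, i.e. every other 4x4 row is set.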
+
+const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
+};
+
+const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
+};
+
+const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
+};
+
+const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, 0, 1, 2,
+ 3, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1 };
+
+const FilterMask left_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
+ 0x0055005500550055ULL } }, // block size 32X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
+ 0x5555555555555555ULL } }, // block size 64X64, TX_8X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
+ 0x0005000500050005ULL } }, // block size 16X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
+ 0x0011001100110011ULL } }, // block size 32X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
+ 0x1111111111111111ULL } }, // block size 64X64, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 32X64, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X32
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 32X64, TX_32X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X64
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
+};
+
+const FilterMask above_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
+ 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+ 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
+ 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
+ 0x00000000000000ffULL } }, // block size 32X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
+ 0x000000000000ffffULL } }, // block size 64X64, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
+ 0x000000000000000fULL } }, // block size 16X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
+};
+
+LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
+ int mi_col) {
+ assert(cm->lf.lfm != NULL);
+ const int row = mi_row >> MIN_MIB_SIZE_LOG2; // 64x64
+ const int col = mi_col >> MIN_MIB_SIZE_LOG2;
+ return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
+}
+
+typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh);
+
+typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1);
+
+typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh, int bd);
+
+typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd);
+#endif // LOOP_FILTER_BITMASK
+
+static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
+ int lvl;
+
+ // For each possible value for the loop filter, fill out limits
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
+ // Set loop filter parameters that control sharpness.
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4));
+
+ if (sharpness_lvl > 0) {
+ if (block_inside_limit > (9 - sharpness_lvl))
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1) block_inside_limit = 1;
+
+ memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+ memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
+ }
+}
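+
+// Worked example of the limits above: with sharpness_lvl == 0 and lvl == 32,
+// block_inside_limit is 32 >> 0 == 32, so lim is 32 and mblim is
+// 2 * (32 + 2) + 32 == 100. With sharpness_lvl == 5 and lvl == 32,
+// block_inside_limit is 32 >> 2 == 8, then capped at 9 - 5 == 4, giving
+// lim == 4 and mblim == 2 * (32 + 2) + 4 == 72.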
+
+uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
+ const int dir_idx, int plane,
+ const MB_MODE_INFO *mbmi) {
+ const int segment_id = mbmi->segment_id;
+ if (cm->delta_lf_present_flag) {
+ int delta_lf;
+ if (cm->delta_lf_multi) {
+ const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
+ delta_lf = mbmi->delta_lf[delta_lf_idx];
+ } else {
+ delta_lf = mbmi->delta_lf_from_base;
+ }
+ int base_level;
+ if (plane == 0)
+ base_level = cm->lf.filter_level[dir_idx];
+ else if (plane == 1)
+ base_level = cm->lf.filter_level_u;
+ else
+ base_level = cm->lf.filter_level_v;
+ int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
+ assert(plane >= 0 && plane <= 2);
+ const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
+ if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) {
+ const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id);
+ lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
+ }
+
+ if (cm->lf.mode_ref_delta_enabled) {
+ const int scale = 1 << (lvl_seg >> 5);
+ lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale;
+ if (mbmi->ref_frame[0] > INTRA_FRAME)
+ lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
+ lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
+ }
+ return lvl_seg;
+ } else {
+ return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]]
+ [mode_lf_lut[mbmi->mode]];
+ }
+}
+
+void av1_loop_filter_init(AV1_COMMON *cm) {
+ assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));
+ loop_filter_info_n *lfi = &cm->lf_info;
+ struct loopfilter *lf = &cm->lf;
+ int lvl;
+
+ lf->combine_vert_horz_lf = 1;
+
+ // init limits for given sharpness
+ update_sharpness(lfi, lf->sharpness_level);
+
+ // init hev threshold const vectors
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+ memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
+}
+
+// Update the loop filter for the current frame.
+// This should be called before loop_filter_rows();
+// av1_loop_filter_frame() calls this function directly.
+void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
+ int plane_end) {
+ int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE];
+ int plane;
+ int seg_id;
+ // n_shift is the multiplier for lf_deltas:
+ // the multiplier is 1 when filter_lvl is between 0 and 31,
+ // and 2 when filter_lvl is between 32 and 63
+ loop_filter_info_n *const lfi = &cm->lf_info;
+ struct loopfilter *const lf = &cm->lf;
+ const struct segmentation *const seg = &cm->seg;
+
+ // update sharpness limits
+ update_sharpness(lfi, lf->sharpness_level);
+
+ filt_lvl[0] = cm->lf.filter_level[0];
+ filt_lvl[1] = cm->lf.filter_level_u;
+ filt_lvl[2] = cm->lf.filter_level_v;
+
+ filt_lvl_r[0] = cm->lf.filter_level[1];
+ filt_lvl_r[1] = cm->lf.filter_level_u;
+ filt_lvl_r[2] = cm->lf.filter_level_v;
+
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
+ break;
+ else if (plane == 1 && !filt_lvl[1])
+ continue;
+ else if (plane == 2 && !filt_lvl[2])
+ continue;
+
+ for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+ for (int dir = 0; dir < 2; ++dir) {
+ int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
+ assert(plane >= 0 && plane <= 2);
+ const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
+ if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
+ const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
+ lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
+ }
+
+ if (!lf->mode_ref_delta_enabled) {
+ // we could get rid of this if we assume that deltas are set to
+ // zero when not in use; encoder always uses deltas
+ memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
+ sizeof(lfi->lvl[plane][seg_id][dir]));
+ } else {
+ int ref, mode;
+ const int scale = 1 << (lvl_seg >> 5);
+ const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+ lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] =
+ clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+
+ for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
+ for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+ const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
+ lf->mode_deltas[mode] * scale;
+ lfi->lvl[plane][seg_id][dir][ref][mode] =
+ clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+ }
+ }
+ }
+ }
+ }
+ }
+}
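+
+// Worked example of the delta scaling above (the delta values are
+// hypothetical): with lvl_seg == 36, scale is 1 << (36 >> 5) == 2, so a ref
+// delta of +1 yields 36 + 1 * 2 == 38; with lvl_seg == 20, scale is 1 and
+// the same delta yields 21. Results are clamped to [0, MAX_LOOP_FILTER].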
+
+#if LOOP_FILTER_BITMASK
+// A 64x64 area requires 256 bits to represent each of its 4x4 tx blocks.
+// Every 4 rows is represented by one uint64_t mask. Hence,
+// there are 4 uint64_t values, bitmask[4], to represent the 64x64 area.
+//
+// Given a location (mi_col, mi_row), this function returns the index
+// 0, 1, 2, or 3 to select which bitmask[] to use, and the shift value.
+//
+// For example, mi_row is the row offset in mi units (each 4 pixels), so
+// (mi_row / 4) selects which uint64_t to use.
+// Within that uint64_t, mi_row % 4 is the row offset, and each row holds
+// 16 = 1 << stride_log2 4x4 units.
+// Therefore, shift = (row << stride_log2) + mi_col;
+int get_index_shift(int mi_col, int mi_row, int *index) {
+ // *index = mi_row >> 2;
+ // rows = mi_row % 4;
+ // stride_log2 = 4;
+ // shift = (rows << stride_log2) + mi_col;
+ *index = mi_row >> 2;
+ return ((mi_row & 3) << 4) | mi_col;
+}
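+
+// For example, get_index_shift(5, 6, &index) sets index to 6 >> 2 == 1 and
+// returns ((6 & 3) << 4) | 5 == 37, i.e. bit 37 of bitmask[1] covers the
+// 4x4 unit at (mi_col, mi_row) == (5, 6).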
+
+static void check_mask(const FilterMask *lfm) {
+#ifndef NDEBUG
+ for (int i = 0; i < 4; ++i) {
+ assert(!(lfm[TX_4X4].bits[i] & lfm[TX_8X8].bits[i]));
+ assert(!(lfm[TX_4X4].bits[i] & lfm[TX_16X16].bits[i]));
+ assert(!(lfm[TX_4X4].bits[i] & lfm[TX_32X32].bits[i]));
+ assert(!(lfm[TX_4X4].bits[i] & lfm[TX_64X64].bits[i]));
+ assert(!(lfm[TX_8X8].bits[i] & lfm[TX_16X16].bits[i]));
+ assert(!(lfm[TX_8X8].bits[i] & lfm[TX_32X32].bits[i]));
+ assert(!(lfm[TX_8X8].bits[i] & lfm[TX_64X64].bits[i]));
+ assert(!(lfm[TX_16X16].bits[i] & lfm[TX_32X32].bits[i]));
+ assert(!(lfm[TX_16X16].bits[i] & lfm[TX_64X64].bits[i]));
+ assert(!(lfm[TX_32X32].bits[i] & lfm[TX_64X64].bits[i]));
+ }
+#else
+ (void)lfm;
+#endif
+}
+
+static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) {
+ if (plane == 0) {
+ // Assert if we try to apply 2 different loop filters at the same
+ // position.
+ check_mask(lfm->left_y);
+ check_mask(lfm->above_y);
+ } else if (plane == 1) {
+ check_mask(lfm->left_u);
+ check_mask(lfm->above_u);
+ } else {
+ check_mask(lfm->left_v);
+ check_mask(lfm->above_v);
+ }
+}
+
+static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask,
+ TX_SIZE sqr_tx_size, LoopFilterMask *lfm) {
+ if (dir == VERT_EDGE) {
+ switch (plane) {
+ case 0:
+ for (int i = 0; i < 4; ++i) lfm->left_y[sqr_tx_size].bits[i] |= mask[i];
+ break;
+ case 1:
+ for (int i = 0; i < 4; ++i) lfm->left_u[sqr_tx_size].bits[i] |= mask[i];
+ break;
+ case 2:
+ for (int i = 0; i < 4; ++i) lfm->left_v[sqr_tx_size].bits[i] |= mask[i];
+ break;
+ default: assert(plane <= 2);
+ }
+ } else {
+ switch (plane) {
+ case 0:
+ for (int i = 0; i < 4; ++i)
+ lfm->above_y[sqr_tx_size].bits[i] |= mask[i];
+ break;
+ case 1:
+ for (int i = 0; i < 4; ++i)
+ lfm->above_u[sqr_tx_size].bits[i] |= mask[i];
+ break;
+ case 2:
+ for (int i = 0; i < 4; ++i)
+ lfm->above_v[sqr_tx_size].bits[i] |= mask[i];
+ break;
+ default: assert(plane <= 2);
+ }
+ }
+}
+
+static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row,
+ int mi_col, int ssx, int ssy, EDGE_DIR dir) {
+ if (plane && (ssx || ssy)) {
+ if (ssx && ssy) { // format 420
+ if ((mi_row << MI_SIZE_LOG2) > cm->height ||
+ (mi_col << MI_SIZE_LOG2) > cm->width)
+ return 1;
+ } else if (ssx) { // format 422
+ if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
+ (mi_col << MI_SIZE_LOG2) > cm->width)
+ return 1;
+ }
+ } else {
+ if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
+ (mi_col << MI_SIZE_LOG2) >= cm->width)
+ return 1;
+ }
+
+ int row_or_col;
+ if (plane == 0) {
+ row_or_col = dir == VERT_EDGE ? mi_col : mi_row;
+ } else {
+ // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
+ // So if mi_col == 1, it is actually the frame boundary.
+ if (dir == VERT_EDGE) {
+ row_or_col = ssx ? (mi_col & 0x0FFFFFFE) : mi_col;
+ } else {
+ row_or_col = ssy ? (mi_row & 0x0FFFFFFE) : mi_row;
+ }
+ }
+ return row_or_col == 0;
+}
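+
+// For example, with 4:2:0 subsampling (ssx == ssy == 1) a chroma block stored
+// at mi_col == 1 belongs to the 8x8 luma block starting at mi_col == 0, so
+// (1 & 0x0FFFFFFE) == 0 above and the position is treated as the left frame
+// boundary.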
+
+static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
+ int ssx, int ssy, TX_SIZE tx_size) {
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const int x = (mi_col << (MI_SIZE_LOG2 - ssx));
+ const int y = (mi_row << (MI_SIZE_LOG2 - ssy));
+ // decide whether current vertical/horizontal edge needs loop filtering
+ for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) {
+ // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
+ mi_row |= ssy;
+ mi_col |= ssx;
+
+ MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+ const MB_MODE_INFO *const mbmi = mi[0];
+ const int curr_skip = mbmi->skip && is_inter_block(mbmi);
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
+ const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
+ const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi);
+ const int prediction_masks = dir == VERT_EDGE
+ ? block_size_wide[plane_bsize] - 1
+ : block_size_high[plane_bsize] - 1;
+ const int is_coding_block_border =
+ dir == VERT_EDGE ? !(x & prediction_masks) : !(y & prediction_masks);
+
+ // TODO(chengchen): step can be optimized.
+ const int row_step = mi_size_high[TX_4X4] << ssy;
+ const int col_step = mi_size_wide[TX_4X4] << ssx;
+ const int mi_height =
+ dir == VERT_EDGE ? tx_size_high_unit[tx_size] << ssy : row_step;
+ const int mi_width =
+ dir == VERT_EDGE ? col_step : tx_size_wide_unit[tx_size] << ssx;
+
+ // assign filter levels
+ for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
+ for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
+ // do not filter frame boundary
+ // Note: when a chroma plane's size is half of the luma plane's,
+ // chroma plane mi corresponds to an even position.
+ // If the frame size is not even, we still need to filter this chroma
+ // position. Therefore the boundary condition check needs to be
+ // separated into two cases.
+ if (plane && (ssx || ssy)) {
+ if (ssx && ssy) { // format 420
+ if ((r << MI_SIZE_LOG2) > cm->height ||
+ (c << MI_SIZE_LOG2) > cm->width)
+ continue;
+ } else if (ssx) { // format 422
+ if ((r << MI_SIZE_LOG2) >= cm->height ||
+ (c << MI_SIZE_LOG2) > cm->width)
+ continue;
+ }
+ } else {
+ if ((r << MI_SIZE_LOG2) >= cm->height ||
+ (c << MI_SIZE_LOG2) >= cm->width)
+ continue;
+ }
+
+ const int row = r % MI_SIZE_64X64;
+ const int col = c % MI_SIZE_64X64;
+ if (plane == 0) {
+ if (dir == VERT_EDGE)
+ lfm->lfl_y_ver[row][col] = level;
+ else
+ lfm->lfl_y_hor[row][col] = level;
+ } else if (plane == 1) {
+ lfm->lfl_u[row][col] = level;
+ } else {
+ lfm->lfl_v[row][col] = level;
+ }
+ }
+ }
+
+ for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
+ for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
+ // do not filter frame boundary
+ if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue;
+
+ uint64_t mask[4] = { 0 };
+ const int prev_row = dir == VERT_EDGE ? r : r - (1 << ssy);
+ const int prev_col = dir == VERT_EDGE ? c - (1 << ssx) : c;
+ MB_MODE_INFO **mi_prev =
+ cm->mi_grid_visible + prev_row * cm->mi_stride + prev_col;
+ const MB_MODE_INFO *const mbmi_prev = mi_prev[0];
+ const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev);
+ const uint8_t level_prev =
+ get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev);
+ const int is_edge =
+ (level || level_prev) &&
+ (!curr_skip || !prev_skip || is_coding_block_border);
+
+ if (is_edge) {
+ const TX_SIZE prev_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
+ : mbmi_prev->tx_size;
+ TX_SIZE min_tx_size = (dir == VERT_EDGE)
+ ? AOMMIN(txsize_horz_map[tx_size],
+ txsize_horz_map[prev_tx_size])
+ : AOMMIN(txsize_vert_map[tx_size],
+ txsize_vert_map[prev_tx_size]);
+ min_tx_size = AOMMIN(min_tx_size, TX_16X16);
+ assert(min_tx_size < TX_SIZES);
+ const int row = r % MI_SIZE_64X64;
+ const int col = c % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ assert(index < 4 && index >= 0);
+ mask[index] |= ((uint64_t)1 << shift);
+ // set mask on corresponding bit
+ update_masks(dir, plane, mask, min_tx_size, lfm);
+ }
+ }
+ }
+ }
+}
+
+static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int plane, int ssx, int ssy) {
+ blk_row <<= ssy;
+ blk_col <<= ssx;
+ if (((mi_row + blk_row) << MI_SIZE_LOG2) >= cm->height ||
+ ((mi_col + blk_col) << MI_SIZE_LOG2) >= cm->width)
+ return;
+
+ // U/V plane, tx_size is always the largest size
+ if (plane) {
+ assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32);
+ setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
+ tx_size);
+ return;
+ }
+
+ MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+ const MB_MODE_INFO *const mbmi = mi[0];
+ // For Y plane:
+ // If intra block, tx size is univariant.
+ // If inter block, tx size follows inter_tx_size.
+ TX_SIZE plane_tx_size = tx_size;
+ const int is_inter = is_inter_block(mbmi);
+
+ if (plane == 0) {
+ if (is_inter) {
+ if (mbmi->skip) {
+ // TODO(chengchen): change av1_get_transform_size() to be consistent.
+ // plane_tx_size = get_max_rect_tx_size(plane_bsize);
+ plane_tx_size = mbmi->tx_size;
+ } else {
+ plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+ plane_bsize, blk_row, blk_col)];
+ }
+ } else {
+ MB_MODE_INFO **mi_this = cm->mi_grid_visible +
+ (mi_row + blk_row) * cm->mi_stride + mi_col +
+ blk_col;
+ const MB_MODE_INFO *const mbmi_this = mi_this[0];
+ plane_tx_size = mbmi_this->tx_size;
+ }
+ }
+
+ assert(txsize_to_bsize[plane_tx_size] <= plane_bsize);
+
+ if (plane || plane_tx_size == tx_size) {
+ setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
+ tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+ setup_tx_block_mask(cm, mi_row, mi_col, offsetr, offsetc, plane_bsize,
+ sub_txs, plane, ssx, ssy);
+ }
+ }
+ }
+}
+
+static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+ int plane, int ssx, int ssy) {
+ MB_MODE_INFO **mi =
+ cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx);
+ const MB_MODE_INFO *const mbmi = mi[0];
+
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
+ const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
+
+ const int block_width = mi_size_wide[plane_bsize];
+ const int block_height = mi_size_high[plane_bsize];
+
+ TX_SIZE max_txsize = max_txsize_rect_lookup[plane_bsize];
+ // The decoder is designed so that it can process 64x64 luma pixels at a
+ // time. If this is a chroma plane with subsampling and bsize corresponds to
+ // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That
+ // mustn't be used for the subsampled plane (because it would be bigger than
+ // a 64x64 luma block) so we round down to TX_32X32.
+ if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) {
+ if (max_txsize == TX_16X64)
+ max_txsize = TX_16X32;
+ else if (max_txsize == TX_64X16)
+ max_txsize = TX_32X16;
+ else
+ max_txsize = TX_32X32;
+ }
+
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize];
+ const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+ const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy];
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(block_height, mu_blocks_high);
+
+ // Y: Largest tx_size is 64x64, while superblock size can be 128x128.
+ // Here we ensure that setup_tx_block_mask processes at most a 64x64 block.
+ // U/V: largest tx size is 32x32.
+ for (int idy = 0; idy < block_height; idy += mu_blocks_high) {
+ for (int idx = 0; idx < block_width; idx += mu_blocks_wide) {
+ const int unit_height = AOMMIN(mu_blocks_high + idy, block_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width);
+ for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize,
+ max_txsize, plane, ssx, ssy);
+ }
+ }
+ }
+ }
+}
+
+static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int plane, int ssx, int ssy) {
+ if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
+ (mi_col << MI_SIZE_LOG2) >= cm->width)
+ return;
+
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int quarter_step = mi_size_wide[bsize] / 4;
+ const int allow_sub8x8 = (ssx || ssy) ? bsize > BLOCK_8X8 : 1;
+ const int has_next_row =
+ (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8;
+ const int has_next_col =
+ (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8;
+ int i;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+ break;
+ case PARTITION_HORZ:
+ setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+ if (has_next_row)
+ setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+ break;
+ case PARTITION_VERT:
+ setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+ if (has_next_col)
+ setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+ break;
+ case PARTITION_SPLIT:
+ setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy);
+ if (has_next_col)
+ setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy);
+ if (has_next_row)
+ setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy);
+ if (has_next_col & has_next_row)
+ setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx,
+ ssy);
+ break;
+ case PARTITION_HORZ_A:
+ setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+ if (has_next_col)
+ setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+ if (has_next_row)
+ setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+ break;
+ case PARTITION_HORZ_B:
+ setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+ if (has_next_row)
+ setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+ if (has_next_col & has_next_row)
+ setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
+ break;
+ case PARTITION_VERT_A:
+ setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+ if (has_next_row)
+ setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+ if (has_next_col)
+ setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+ break;
+ case PARTITION_VERT_B:
+ setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+ if (has_next_col)
+ setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+ if (has_next_row)
+ setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break;
+ // chroma plane filter the odd location
+ if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
+
+ setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= cm->mi_cols) break;
+ // chroma plane filter the odd location
+ if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
+
+ setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy);
+ }
+ break;
+ default: assert(0);
+ }
+}
+
+// TODO(chengchen): if lossless, we do not need to set up the mask. But when
+// segments are enabled, each segment can have different lossless settings.
+void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
+ int subsampling_x, int subsampling_y, int row_end,
+ int col_end) {
+ const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2;
+ for (int y = 0; y < num_64x64; ++y) {
+ for (int x = 0; x < num_64x64; ++x) {
+ const int row = mi_row + y * MI_SIZE_64X64;
+ const int col = mi_col + x * MI_SIZE_64X64;
+ if (row >= row_end || col >= col_end) continue;
+ if ((row << MI_SIZE_LOG2) >= cm->height ||
+ (col << MI_SIZE_LOG2) >= cm->width)
+ continue;
+
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
+ if (lfm == NULL) return;
+
+ // init mask to zero
+ if (plane == 0) {
+ av1_zero(lfm->left_y);
+ av1_zero(lfm->above_y);
+ av1_zero(lfm->lfl_y_ver);
+ av1_zero(lfm->lfl_y_hor);
+ } else if (plane == 1) {
+ av1_zero(lfm->left_u);
+ av1_zero(lfm->above_u);
+ av1_zero(lfm->lfl_u);
+ } else {
+ av1_zero(lfm->left_v);
+ av1_zero(lfm->above_v);
+ av1_zero(lfm->lfl_v);
+ }
+ }
+ }
+
+ // set up bitmask for each superblock
+ setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane,
+ subsampling_x, subsampling_y);
+
+ for (int y = 0; y < num_64x64; ++y) {
+ for (int x = 0; x < num_64x64; ++x) {
+ const int row = mi_row + y * MI_SIZE_64X64;
+ const int col = mi_col + x * MI_SIZE_64X64;
+ if (row >= row_end || col >= col_end) continue;
+ if ((row << MI_SIZE_LOG2) >= cm->height ||
+ (col << MI_SIZE_LOG2) >= cm->width)
+ continue;
+
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
+ if (lfm == NULL) return;
+
+ // check if the mask is valid
+ check_loop_filter_masks(lfm, plane);
+
+ {
+ // Let the 16x16 mask also hold 32x32 (Y/U/V) and 64x64 (Y only).
+ // Even if the tx size is greater, we only apply the max-length filter,
+ // which is 16.
+ if (plane == 0) {
+ for (int j = 0; j < 4; ++j) {
+ lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j];
+ lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j];
+ lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j];
+ lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j];
+
+ // set 32x32 and 64x64 to 0
+ lfm->left_y[TX_32X32].bits[j] = 0;
+ lfm->left_y[TX_64X64].bits[j] = 0;
+ lfm->above_y[TX_32X32].bits[j] = 0;
+ lfm->above_y[TX_64X64].bits[j] = 0;
+ }
+ } else if (plane == 1) {
+ for (int j = 0; j < 4; ++j) {
+ lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j];
+ lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j];
+
+ // set 32x32 to 0
+ lfm->left_u[TX_32X32].bits[j] = 0;
+ lfm->above_u[TX_32X32].bits[j] = 0;
+ }
+ } else {
+ for (int j = 0; j < 4; ++j) {
+ lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j];
+ lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j];
+
+ // set 32x32 to 0
+ lfm->left_v[TX_32X32].bits[j] = 0;
+ lfm->above_v[TX_32X32].bits[j] = 0;
+ }
+ }
+ }
+
+ // check if the mask is valid
+ check_loop_filter_masks(lfm, plane);
+ }
+ }
+}
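+
+// Sizing example for av1_setup_bitmask() above, assuming MIN_MIB_SIZE_LOG2
+// is 4 (a 64x64 block spans 16 mi units): a 128x128 superblock has
+// mib_size == 32, so num_64x64 == 32 >> 4 == 2 and the loops walk a 2x2 grid
+// of 64x64 LoopFilterMask units; a 64x64 superblock visits only the single
+// co-located unit.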
+
+static void filter_selectively_vert_row2(
+ int subsampling_factor, uint8_t *s, int pitch, int plane,
+ uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+ uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+ const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
+ uint64_t mask;
+ const int step = 1 << subsampling_factor;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+ mask_8x8_1 | mask_4x4_1;
+ mask; mask >>= step) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
+
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ // the chroma plane filters fewer pixels (introduced in the
+ // deblock_13tap experiment)
+ LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
+
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ if (plane) {
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else {
+ aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ } else if (mask_16x16_0 & 1) {
+ lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ // the chroma plane filters fewer pixels (introduced in the
+ // deblock_13tap experiment)
+ LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
+
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ if (plane) {
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else {
+ aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ } else if (mask_8x8_0 & 1) {
+ lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else if (mask_4x4_0 & 1) {
+ aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+ }
+
+ s += 4;
+ lfl += step;
+ lfl2 += step;
+ mask_16x16_0 >>= step;
+ mask_8x8_0 >>= step;
+ mask_4x4_0 >>= step;
+ mask_16x16_1 >>= step;
+ mask_8x8_1 >>= step;
+ mask_4x4_1 >>= step;
+ }
+}
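+
+// In filter_selectively_vert_row2() above, the *_0 masks describe the current
+// row of 4x4 edges and the *_1 masks the row 4 pixels below (filtered at
+// s + 4 * pitch). Each iteration consumes `step` bits of every mask, advances
+// s by 4 pixels and lfl/lfl2 by step; e.g. with subsampling_factor == 1 the
+// step is 2, so the masks shift two bits at a time.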
+
+static void highbd_filter_selectively_vert_row2(
+ int subsampling_factor, uint16_t *s, int pitch, int plane,
+ uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+ uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+ const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
+ uint64_t mask;
+ const int step = 1 << subsampling_factor;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+ mask_8x8_1 | mask_4x4_1;
+ mask; mask >>= step) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
+
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ // The chroma plane filters fewer pixels (introduced in the deblock_13tap
+ // experiment).
+ HbdLpfFunc highbd_lpf_vertical =
+ plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
+
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ if (plane) {
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ } else if (mask_16x16_0 & 1) {
+ highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+ bd);
+ } else {
+ highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ HbdLpfFunc highbd_lpf_vertical =
+ plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
+
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ if (plane) {
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ } else if (mask_8x8_0 & 1) {
+ highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+ bd);
+ } else {
+ highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_4x4_0 & 1) {
+ aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ }
+ }
+
+ s += 4;
+ lfl += step;
+ lfl2 += step;
+ mask_16x16_0 >>= step;
+ mask_8x8_0 >>= step;
+ mask_4x4_0 >>= step;
+ mask_16x16_1 >>= step;
+ mask_8x8_1 >>= step;
+ mask_4x4_1 >>= step;
+ }
+}
+
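+// Filters the horizontal edges selected by the bitmasks along one row of 4x4
+// units. When two consecutive units need the same filter size (checked with
+// two_block_mask), a single _dual kernel call covers both of them.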
+static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
+ int subsampling, uint64_t mask_16x16,
+ uint64_t mask_8x8, uint64_t mask_4x4,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
+ uint64_t mask;
+ int count;
+ const int step = 1 << subsampling;
+ const unsigned int two_block_mask = subsampling ? 5 : 3;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step);
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ // The chroma plane filters fewer pixels (introduced in the deblock_13tap
+ // experiment).
+ LpfFunc lpf_horizontal =
+ plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
+
+ if ((mask_16x16 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
+ count = 2;
+ } else {
+ lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ } else if (mask_8x8 & 1) {
+ // The chroma plane filters fewer pixels (introduced in the deblock_13tap
+ // experiment).
+ LpfFunc lpf_horizontal =
+ plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
+
+ if ((mask_8x8 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
+ count = 2;
+ } else {
+ lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & two_block_mask) == two_block_mask) {
+ aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ count = 2;
+ } else {
+ aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ }
+ }
+
+ s += 4 * count;
+ lfl += step * count;
+ mask_16x16 >>= step * count;
+ mask_8x8 >>= step * count;
+ mask_4x4 >>= step * count;
+ }
+}
+
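+// High bit-depth counterpart of filter_selectively_horiz(), operating on
+// uint16_t samples.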
+static void highbd_filter_selectively_horiz(
+ uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
+ uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
+ uint8_t *lfl, int bd) {
+ uint64_t mask;
+ int count;
+ const int step = 1 << subsampling;
+ const unsigned int two_block_mask = subsampling ? 5 : 3;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step);
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ HbdLpfFunc highbd_lpf_horizontal =
+ plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
+
+ if ((mask_16x16 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
+ count = 2;
+ } else {
+ highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ }
+ } else if (mask_8x8 & 1) {
+ HbdLpfFunc highbd_lpf_horizontal =
+ plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
+
+ if ((mask_8x8 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
+ count = 2;
+ } else {
+ highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & two_block_mask) == two_block_mask) {
+ aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr, bd);
+ count = 2;
+ } else {
+ aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+ }
+ }
+ }
+
+ s += 4 * count;
+ lfl += step * count;
+ mask_16x16 >>= step * count;
+ mask_8x8 >>= step * count;
+ mask_4x4 >>= step * count;
+ }
+}
+
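+// Scans the plane in 64x64 units and, for every vertical transform edge that
+// needs deblocking, sets the matching bit in the left_{y,u,v} masks. An edge
+// qualifies when either side has a nonzero filter level and at least one side
+// is not skipped or the edge is a coding block boundary; the frame's leftmost
+// edge is never marked.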
+void av1_build_bitmask_vert_info(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane) {
+ const int subsampling_x = plane_ptr->subsampling_x;
+ const int subsampling_y = plane_ptr->subsampling_y;
+ const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
+ const int is_uv = plane > 0;
+ TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+ uint8_t level, prev_level = 1;
+ int skip, prev_skip = 0;
+ int is_coding_block_border;
+
+ for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
+ const int mi_row = r << subsampling_y;
+ const int row = mi_row % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(0, row, &index);
+
+ for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
+ c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
+ const int mi_col = c << subsampling_x;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+ for (int col_in_unit = 0;
+ col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
+ const int x = (c + col_in_unit) << MI_SIZE_LOG2;
+ if (x >= plane_ptr->dst.width) break;
+ const int col = col_in_unit << subsampling_x;
+ const uint64_t mask = ((uint64_t)1 << (shift | col));
+ skip = lfm->skip.bits[index] & mask;
+ is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
+ switch (plane) {
+ case 0: level = lfm->lfl_y_ver[row][col]; break;
+ case 1: level = lfm->lfl_u[row][col]; break;
+ case 2: level = lfm->lfl_v[row][col]; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+ if (is_uv && ts == TX_64X64) continue;
+ if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
+ tx_size = ts;
+ break;
+ }
+ }
+ if ((c + col_in_unit > 0) && (level || prev_level) &&
+ (!prev_skip || !skip || is_coding_block_border)) {
+ const TX_SIZE min_tx_size =
+ AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+ const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64;
+ const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64;
+ const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+ const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+ switch (plane) {
+ case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
+ case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
+ case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ }
+
+ // update prev info
+ prev_level = level;
+ prev_skip = skip;
+ prev_tx_size = tx_size;
+ // advance
+ col_in_unit += tx_size_wide_unit[tx_size];
+ }
+ }
+ }
+}
+
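+// Horizontal counterpart of av1_build_bitmask_vert_info(): qualifying
+// horizontal transform edges set bits in the above_{y,u,v} masks instead,
+// and the frame's topmost edge is never marked.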
+void av1_build_bitmask_horz_info(
+ AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+ int plane) {
+ const int subsampling_x = plane_ptr->subsampling_x;
+ const int subsampling_y = plane_ptr->subsampling_y;
+ const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
+ const int is_uv = plane > 0;
+ TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+ uint8_t level, prev_level = 1;
+ int skip, prev_skip = 0;
+ int is_coding_block_border;
+
+ for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
+ const int mi_col = c << subsampling_x;
+ const int col = mi_col % MI_SIZE_64X64;
+
+ for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
+ r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
+ const int mi_row = r << subsampling_y;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+ for (int r_in_unit = 0;
+ r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
+ const int y = (r + r_in_unit) << MI_SIZE_LOG2;
+ if (y >= plane_ptr->dst.height) break;
+ const int row = r_in_unit << subsampling_y;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ const uint64_t mask = ((uint64_t)1 << shift);
+ skip = lfm->skip.bits[index] & mask;
+ is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
+ switch (plane) {
+ case 0: level = lfm->lfl_y_hor[row][col]; break;
+ case 1: level = lfm->lfl_u[row][col]; break;
+ case 2: level = lfm->lfl_v[row][col]; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+ if (is_uv && ts == TX_64X64) continue;
+ if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
+ tx_size = ts;
+ break;
+ }
+ }
+ if ((r + r_in_unit > 0) && (level || prev_level) &&
+ (!prev_skip || !skip || is_coding_block_border)) {
+ const TX_SIZE min_tx_size =
+ AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+ const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64;
+ const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64;
+ const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+ const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+
+ switch (plane) {
+ case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
+ case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
+ case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
+ default: assert(plane >= 0 && plane <= 2); return;
+ }
+ }
+
+ // update prev info
+ prev_level = level;
+ prev_skip = skip;
+ prev_tx_size = tx_size;
+ // advance
+ r_in_unit += tx_size_high_unit[tx_size];
+ }
+ }
+ }
+}
+
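+// Applies vertical deblocking to one 64x64 unit using the precomputed
+// bitmasks, processing two rows of 4x4 units per iteration.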
+void av1_filter_block_plane_bitmask_vert(
+ AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+ int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ uint8_t *const buf0 = dst->buf;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int row_step = 1 << ssy;
+ const int two_row_step = 2 << ssy;
+ const int row_stride = dst->stride << MI_SIZE_LOG2;
+ const int two_row_stride = row_stride << 1;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ uint8_t *lfl2;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ assert(lfm);
+
+ // 1. Vertical filtering: filter two rows at a time.
+ for (int r = 0;
+ ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+ r += two_row_step) {
+ const int row = r | ssy;
+ const int row_next = row + row_step;
+ const int col = ssx;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ int index_next = 0;
+ const int shift_next = get_index_shift(col, row_next, &index_next);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_ver[row][col];
+ lfl2 = &lfm->lfl_y_ver[row_next][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u[row][col];
+ lfl2 = &lfm->lfl_u[row_next][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v[row][col];
+ lfl2 = &lfm->lfl_v[row_next][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+ uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+ uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+ uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+ uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+ uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_vert_row2(
+ ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+ mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+ &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_vert_row2(
+ ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
+ mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
+ dst->buf += two_row_stride;
+ }
+ // reset buf pointer for horizontal filtering
+ dst->buf = buf0;
+}
+
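+// Applies horizontal deblocking to one 64x64 unit using the precomputed
+// bitmasks, one row of 4x4 units at a time; the first row of the frame is
+// skipped because there is no block above it.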
+void av1_filter_block_plane_bitmask_horz(
+ AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+ int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ uint8_t *const buf0 = dst->buf;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int row_step = 1 << ssy;
+ const int row_stride = dst->stride << MI_SIZE_LOG2;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ assert(lfm);
+ for (int r = 0;
+ ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+ r += row_step) {
+ if (mi_row + r == 0) {
+ dst->buf += row_stride;
+ continue;
+ }
+ const int row = r | ssy;
+ const int col = ssx;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_hor[row][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u[row][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v[row][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+ mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+ mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl);
+ dst->buf += row_stride;
+ }
+ // reset buf pointer for next block
+ dst->buf = buf0;
+}
+
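+// Bitmask-based vertical filtering for one superblock: each contained 64x64
+// unit is filtered two rows of 4x4 units at a time.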
+void av1_filter_block_plane_ver(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane_ptr,
+ int pl, int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ int r, c;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int single_step = 1 << ssy;
+ const int r_step = 2 << ssy;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+ uint8_t *lfl2;
+
+ // filter two rows at a time
+ for (r = 0; r < cm->seq_params.mib_size &&
+ ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
+ r += r_step) {
+ for (c = 0; c < cm->seq_params.mib_size &&
+ ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
+ c += MI_SIZE_64X64) {
+ dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+ assert(lfm);
+ const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+ const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ // current and next row should belong to the same mask_idx and index
+ // next row's shift
+ const int row_next = row + single_step;
+ int index_next = 0;
+ const int shift_next = get_index_shift(col, row_next, &index_next);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->left_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_ver[row][col];
+ lfl2 = &lfm->lfl_y_ver[row_next][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->left_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u[row][col];
+ lfl2 = &lfm->lfl_u[row_next][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->left_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->left_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->left_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v[row][col];
+ lfl2 = &lfm->lfl_v[row_next][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
+ uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
+ uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
+ uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
+ uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
+ uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_vert_row2(
+ ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
+ mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
+ &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
+ mask_16x16_0, mask_8x8_0, mask_4x4_0,
+ mask_16x16_1, mask_8x8_1, mask_4x4_1,
+ &cm->lf_info, lfl, lfl2);
+ dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
+ }
+ dst->buf += 2 * MI_SIZE * dst->stride;
+ }
+}
+
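+// Bitmask-based horizontal filtering for one superblock, one row of 4x4
+// units at a time; the topmost row of the frame is skipped.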
+void av1_filter_block_plane_hor(AV1_COMMON *const cm,
+ struct macroblockd_plane *const plane_ptr,
+ int pl, int mi_row, int mi_col) {
+ struct buf_2d *const dst = &plane_ptr->dst;
+ int r, c;
+ const int ssx = plane_ptr->subsampling_x;
+ const int ssy = plane_ptr->subsampling_y;
+ const int mask_cutoff = 0xffff;
+ const int r_step = 1 << ssy;
+ uint64_t mask_16x16 = 0;
+ uint64_t mask_8x8 = 0;
+ uint64_t mask_4x4 = 0;
+ uint8_t *lfl;
+
+ for (r = 0; r < cm->seq_params.mib_size &&
+ ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
+ r += r_step) {
+ for (c = 0; c < cm->seq_params.mib_size &&
+ ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
+ c += MI_SIZE_64X64) {
+ if (mi_row + r == 0) continue;
+
+ dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+ assert(lfm);
+ const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+ const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ switch (pl) {
+ case 0:
+ mask_16x16 = lfm->above_y[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_y[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_y[TX_4X4].bits[index];
+ lfl = &lfm->lfl_y_hor[row][col];
+ break;
+ case 1:
+ mask_16x16 = lfm->above_u[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_u[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_u[TX_4X4].bits[index];
+ lfl = &lfm->lfl_u[row][col];
+ break;
+ case 2:
+ mask_16x16 = lfm->above_v[TX_16X16].bits[index];
+ mask_8x8 = lfm->above_v[TX_8X8].bits[index];
+ mask_4x4 = lfm->above_v[TX_4X4].bits[index];
+ lfl = &lfm->lfl_v[row][col];
+ break;
+ default: assert(pl >= 0 && pl <= 2); return;
+ }
+ mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
+ mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
+ mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl,
+ (int)cm->seq_params.bit_depth);
+ else
+ filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
+ mask_8x8, mask_4x4, &cm->lf_info, lfl);
+ dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
+ }
+ dst->buf += MI_SIZE * dst->stride;
+ }
+}
+#endif // LOOP_FILTER_BITMASK
+
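+// Returns the transform size relevant for deblocking at (mi_row, mi_col):
+// the luma tx_size (or the per-block inter tx size for non-skip inter
+// blocks), or the maximum chroma transform size for the UV planes, mapped
+// onto the dimension that matters for the given edge direction.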
+static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
+ const MB_MODE_INFO *const mbmi,
+ const EDGE_DIR edge_dir, const int mi_row,
+ const int mi_col, const int plane,
+ const struct macroblockd_plane *plane_ptr) {
+ assert(mbmi != NULL);
+ if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;
+
+ TX_SIZE tx_size =
+ (plane == AOM_PLANE_Y)
+ ? mbmi->tx_size
+ : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x,
+ plane_ptr->subsampling_y);
+ assert(tx_size < TX_SIZES_ALL);
+ if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip) {
+ const BLOCK_SIZE sb_type = mbmi->sb_type;
+ const int blk_row = mi_row & (mi_size_high[sb_type] - 1);
+ const int blk_col = mi_col & (mi_size_wide[sb_type] - 1);
+ const TX_SIZE mb_tx_size =
+ mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)];
+ assert(mb_tx_size < TX_SIZES_ALL);
+ tx_size = mb_tx_size;
+ }
+
+ // For chroma or non-square transforms, the transform size needs to be
+ // converted into the transform size in the filtering direction:
+ // for a vertical edge the filter direction is horizontal, and for a
+ // horizontal edge the filter direction is vertical.
+ tx_size = (VERT_EDGE == edge_dir) ? txsize_horz_map[tx_size]
+ : txsize_vert_map[tx_size];
+ return tx_size;
+}
+
+typedef struct AV1_DEBLOCKING_PARAMETERS {
+ // length of the filter applied to the outer edge
+ uint32_t filter_length;
+ // deblocking limits
+ const uint8_t *lim;
+ const uint8_t *mblim;
+ const uint8_t *hev_thr;
+} AV1_DEBLOCKING_PARAMETERS;
+
+// Returns the TX_SIZE from get_transform_size(), so it is plane- and
+// direction-aware.
+static TX_SIZE set_lpf_parameters(
+ AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step,
+ const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y,
+ const int plane, const struct macroblockd_plane *const plane_ptr) {
+ // reset to initial values
+ params->filter_length = 0;
+
+ // No deblocking is required outside the plane boundaries.
+ const uint32_t width = plane_ptr->dst.width;
+ const uint32_t height = plane_ptr->dst.height;
+ if ((width <= x) || (height <= y)) {
+ // just return the smallest transform unit size
+ return TX_4X4;
+ }
+
+ const uint32_t scale_horz = plane_ptr->subsampling_x;
+ const uint32_t scale_vert = plane_ptr->subsampling_y;
+ // For a sub-8x8 block, the chroma prediction mode is obtained from the
+ // bottom/right mi structure of the co-located 8x8 luma block, so for the
+ // chroma plane mi_row and mi_col should map to the bottom/right mi
+ // structure, i.e., both should be odd numbers.
+ const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
+ const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
+ MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+ const MB_MODE_INFO *mbmi = mi[0];
+ // If the current mbmi is not correctly set up, return an invalid value to
+ // stop filtering. For example, if this tile is not coded, its mbmi is not
+ // set up.
+ if (mbmi == NULL) return TX_INVALID;
+
+ const TX_SIZE ts =
+ get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, plane_ptr);
+
+ {
+ const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y);
+ const uint32_t transform_masks =
+ edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
+ const int32_t tu_edge = (coord & transform_masks) ? (0) : (1);
+
+ if (!tu_edge) return ts;
+
+ // Prepare the outer-edge parameters; the edge is deblocked only if it is a TU edge.
+ {
+ const uint32_t curr_level =
+ get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
+ const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
+ uint32_t level = curr_level;
+ if (coord) {
+ {
+ const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
+ if (mi_prev == NULL) return TX_INVALID;
+ const int pv_row =
+ (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert));
+ const int pv_col =
+ (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
+ const TX_SIZE pv_ts = get_transform_size(
+ xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
+
+ const uint32_t pv_lvl =
+ get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
+
+ const int pv_skip = mi_prev->skip && is_inter_block(mi_prev);
+ const BLOCK_SIZE bsize =
+ get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x,
+ plane_ptr->subsampling_y);
+ const int prediction_masks = edge_dir == VERT_EDGE
+ ? block_size_wide[bsize] - 1
+ : block_size_high[bsize] - 1;
+ const int32_t pu_edge = !(coord & prediction_masks);
+ // If both the current and the previous block are skipped,
+ // deblock the edge only if it is also a prediction unit (PU) edge.
+ if ((curr_level || pv_lvl) &&
+ (!pv_skip || !curr_skipped || pu_edge)) {
+ const TX_SIZE min_ts = AOMMIN(ts, pv_ts);
+ if (TX_4X4 >= min_ts) {
+ params->filter_length = 4;
+ } else if (TX_8X8 == min_ts) {
+ if (plane != 0)
+ params->filter_length = 6;
+ else
+ params->filter_length = 8;
+ } else {
+ params->filter_length = 14;
+ // No wide filtering for chroma plane
+ if (plane != 0) {
+ params->filter_length = 6;
+ }
+ }
+
+ // If the current block's filter level is 0 (e.g. the block is skipped),
+ // fall back to the previous block's level.
+ level = (curr_level) ? (curr_level) : (pv_lvl);
+ }
+ }
+ }
+ // prepare common parameters
+ if (params->filter_length) {
+ const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
+ params->lim = limits->lim;
+ params->mblim = limits->mblim;
+ params->hev_thr = limits->hev_thr;
+ }
+ }
+ }
+
+ return ts;
+}
+
+void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row, const uint32_t mi_col) {
+ const int row_step = MI_SIZE >> MI_SIZE_LOG2;
+ const uint32_t scale_horz = plane_ptr->subsampling_x;
+ const uint32_t scale_vert = plane_ptr->subsampling_y;
+ uint8_t *const dst_ptr = plane_ptr->dst.buf;
+ const int dst_stride = plane_ptr->dst.stride;
+ const int y_range = (MAX_MIB_SIZE >> scale_vert);
+ const int x_range = (MAX_MIB_SIZE >> scale_horz);
+ const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+ const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
+ for (int y = 0; y < y_range; y += row_step) {
+ uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
+ for (int x = 0; x < x_range;) {
+ // The inner loop always filters vertical edges in an MI block. If the MI
+ // size is 8x8, it filters the vertical edge aligned with the 8x8 block.
+ // If a 4x4 transform is used, it then filters the internal edge aligned
+ // with a 4x4 block.
+ const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+ const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+ uint32_t advance_units;
+ TX_SIZE tx_size;
+ AV1_DEBLOCKING_PARAMETERS params;
+ memset(&params, 0, sizeof(params));
+
+ tx_size =
+ set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
+ VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
+ if (tx_size == TX_INVALID) {
+ params.filter_length = 0;
+ tx_size = TX_4X4;
+ }
+
+ switch (params.filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ if (use_highbitdepth)
+ aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
+ params.mblim, params.lim, params.hev_thr,
+ bit_depth);
+ else
+ aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ case 6: // apply 6-tap filter for chroma plane only
+ assert(plane != 0);
+ if (use_highbitdepth)
+ aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
+ params.mblim, params.lim, params.hev_thr,
+ bit_depth);
+ else
+ aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ if (use_highbitdepth)
+ aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
+ params.mblim, params.lim, params.hev_thr,
+ bit_depth);
+ else
+ aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ if (use_highbitdepth)
+ aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
+ params.mblim, params.lim, params.hev_thr,
+ bit_depth);
+ else
+ aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+ // advance the destination pointer
+ advance_units = tx_size_wide_unit[tx_size];
+ x += advance_units;
+ p += advance_units * MI_SIZE;
+ }
+ }
+}
+
+void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row, const uint32_t mi_col) {
+ const int col_step = MI_SIZE >> MI_SIZE_LOG2;
+ const uint32_t scale_horz = plane_ptr->subsampling_x;
+ const uint32_t scale_vert = plane_ptr->subsampling_y;
+ uint8_t *const dst_ptr = plane_ptr->dst.buf;
+ const int dst_stride = plane_ptr->dst.stride;
+ const int y_range = (MAX_MIB_SIZE >> scale_vert);
+ const int x_range = (MAX_MIB_SIZE >> scale_horz);
+ const int use_highbitdepth = cm->seq_params.use_highbitdepth;
+ const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
+ for (int x = 0; x < x_range; x += col_step) {
+ uint8_t *p = dst_ptr + x * MI_SIZE;
+ for (int y = 0; y < y_range;) {
+ // The inner loop always filters horizontal edges in an MI block. If the
+ // MI size is 8x8, it first filters the horizontal edge aligned with the
+ // 8x8 block. If a 4x4 transform is used, it then filters the internal
+ // edge aligned with a 4x4 block.
+ const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+ const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+ uint32_t advance_units;
+ TX_SIZE tx_size;
+ AV1_DEBLOCKING_PARAMETERS params;
+ memset(&params, 0, sizeof(params));
+
+ tx_size =
+ set_lpf_parameters(&params, (cm->mi_stride << scale_vert), cm, xd,
+ HORZ_EDGE, curr_x, curr_y, plane, plane_ptr);
+ if (tx_size == TX_INVALID) {
+ params.filter_length = 0;
+ tx_size = TX_4X4;
+ }
+
+ switch (params.filter_length) {
+ // apply 4-tap filtering
+ case 4:
+ if (use_highbitdepth)
+ aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
+ params.mblim, params.lim,
+ params.hev_thr, bit_depth);
+ else
+ aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 6-tap filtering
+ case 6:
+ assert(plane != 0);
+ if (use_highbitdepth)
+ aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
+ params.mblim, params.lim,
+ params.hev_thr, bit_depth);
+ else
+ aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 8-tap filtering
+ case 8:
+ if (use_highbitdepth)
+ aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
+ params.mblim, params.lim,
+ params.hev_thr, bit_depth);
+ else
+ aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // apply 14-tap filtering
+ case 14:
+ if (use_highbitdepth)
+ aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
+ params.mblim, params.lim,
+ params.hev_thr, bit_depth);
+ else
+ aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
+ params.hev_thr);
+ break;
+ // no filtering
+ default: break;
+ }
+
+ // advance the destination pointer
+ advance_units = tx_size_high_unit[tx_size];
+ y += advance_units;
+ p += advance_units * dst_stride * MI_SIZE;
+ }
+ }
+}
+
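+// Filters the given range of superblock rows for the requested planes. When
+// LOOP_FILTER_BITMASK decoding is enabled, the bitmask path builds the masks
+// once per plane and sweeps the frame a single time; otherwise vertical and
+// horizontal edges are filtered superblock by superblock.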
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int start, int stop,
+#if LOOP_FILTER_BITMASK
+ int is_decoding,
+#endif
+ int plane_start, int plane_end) {
+ struct macroblockd_plane *pd = xd->plane;
+ const int col_start = 0;
+ const int col_end = cm->mi_cols;
+ int mi_row, mi_col;
+ int plane;
+
+#if LOOP_FILTER_BITMASK
+ if (is_decoding) {
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+ break;
+ else if (plane == 1 && !(cm->lf.filter_level_u))
+ continue;
+ else if (plane == 2 && !(cm->lf.filter_level_v))
+ continue;
+
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
+ plane, plane + 1);
+ av1_build_bitmask_vert_info(cm, &pd[plane], plane);
+ av1_build_bitmask_horz_info(cm, &pd[plane], plane);
+
+ // Apply loop filtering, which goes through the buffer only once.
+ for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
+ for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row, mi_col,
+ plane, plane + 1);
+ av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
+ mi_col);
+ if (mi_col - MI_SIZE_64X64 >= 0) {
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+ mi_col - MI_SIZE_64X64, plane, plane + 1);
+ av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+ mi_col - MI_SIZE_64X64);
+ }
+ }
+ av1_setup_dst_planes(pd, MI_SIZE_64X64, frame_buffer, mi_row,
+ mi_col - MI_SIZE_64X64, plane, plane + 1);
+ av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
+ mi_col - MI_SIZE_64X64);
+ }
+ }
+ return;
+ }
+#endif
+
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+ break;
+ else if (plane == 1 && !(cm->lf.filter_level_u))
+ continue;
+ else if (plane == 2 && !(cm->lf.filter_level_v))
+ continue;
+
+#if LOOP_FILTER_BITMASK
+ // Filter all vertical edges in every superblock (could be 128x128 or 64x64).
+ for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) {
+ for (mi_col = col_start; mi_col < col_end;
+ mi_col += cm->seq_params.mib_size) {
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+ mi_col, plane, plane + 1);
+
+ av1_setup_bitmask(cm, mi_row, mi_col, plane, pd[plane].subsampling_x,
+ pd[plane].subsampling_y, stop, col_end);
+ av1_filter_block_plane_ver(cm, &pd[plane], plane, mi_row, mi_col);
+ }
+ }
+
+ // Filter all horizontal edges in every superblock.
+ for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) {
+ for (mi_col = col_start; mi_col < col_end;
+ mi_col += cm->seq_params.mib_size) {
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+ mi_col, plane, plane + 1);
+
+ av1_filter_block_plane_hor(cm, &pd[plane], plane, mi_row, mi_col);
+ }
+ }
+#else
+ if (cm->lf.combine_vert_horz_lf) {
+ // filter all vertical and horizontal edges in every 128x128 super block
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
+ // filter vertical edges
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+ mi_col, plane, plane + 1);
+ av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+ // filter horizontal edges
+ if (mi_col - MAX_MIB_SIZE >= 0) {
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer,
+ mi_row, mi_col - MAX_MIB_SIZE, plane,
+ plane + 1);
+ av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+ mi_col - MAX_MIB_SIZE);
+ }
+ }
+ // filter horizontal edges
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+ mi_col - MAX_MIB_SIZE, plane, plane + 1);
+ av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+ mi_col - MAX_MIB_SIZE);
+ }
+ } else {
+ // filter all vertical edges in every 128x128 super block
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+ mi_col, plane, plane + 1);
+ av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+ }
+ }
+
+ // filter all horizontal edges in every 128x128 super block
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
+ av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
+ mi_col, plane, plane + 1);
+ av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+ mi_col);
+ }
+ }
+ }
+#endif // LOOP_FILTER_BITMASK
+ }
+}
+
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ MACROBLOCKD *xd,
+#if LOOP_FILTER_BITMASK
+ int is_decoding,
+#endif
+ int plane_start, int plane_end, int partial_frame) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ av1_loop_filter_frame_init(cm, plane_start, plane_end);
+ loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
+#if LOOP_FILTER_BITMASK
+ is_decoding,
+#endif
+ plane_start, plane_end);
+}
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
new file mode 100644
index 000000000..80ac61178
--- /dev/null
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER 63
+#define MAX_SHARPNESS 7
+
+#define SIMD_WIDTH 16
+
+enum lf_path {
+ LF_PATH_420,
+ LF_PATH_444,
+ LF_PATH_SLOW,
+};
+
+#if LOOP_FILTER_BITMASK
+typedef struct {
+ uint64_t bits[4];
+} FilterMask;
+
+// This structure holds bit masks for all 4x4 blocks in a 64x64 region.
+// Each 1 bit represents a position at which we want to apply the loop filter.
+// For the Y plane, the 4x4 units in a 64x64 region require 16x16 = 256 bits,
+// so we use 4 uint64_t values; for the U and V planes in 4:2:0 format the
+// plane size is 32x32, so a single uint64_t represents the bitmask.
+// left_ entries indicate whether we apply a filter on the border to the
+// left of the block; above_ entries indicate whether to apply a filter on
+// the border above it.
+// Since each transform is accompanied by a potentially different type of
+// loop filter, there is a separate entry in the array for each transform size.
+typedef struct {
+ FilterMask left_y[TX_SIZES];
+ FilterMask above_y[TX_SIZES];
+ FilterMask left_u[TX_SIZES];
+ FilterMask above_u[TX_SIZES];
+ FilterMask left_v[TX_SIZES];
+ FilterMask above_v[TX_SIZES];
+
+ // Y plane vertical edge and horizontal edge filter level
+ uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
+ uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+
+ // U plane filter level
+ uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64];
+
+ // V plane filter level
+ uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64];
+
+ // other info
+ FilterMask skip;
+ FilterMask is_vert_border;
+ FilterMask is_horz_border;
+ // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
+ FilterMask tx_size_ver[2][5];
+ FilterMask tx_size_hor[2][5];
+} LoopFilterMask;
+#endif // LOOP_FILTER_BITMASK
+
+struct loopfilter {
+ int filter_level[2];
+ int filter_level_u;
+ int filter_level_v;
+
+ int sharpness_level;
+
+ uint8_t mode_ref_delta_enabled;
+ uint8_t mode_ref_delta_update;
+
+ // 0 = Intra, Last, Last2+Last3,
+ // GF, BRF, ARF2, ARF
+ int8_t ref_deltas[REF_FRAMES];
+
+ // 0 = ZERO_MV, MV
+ int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+
+ int combine_vert_horz_lf;
+
+#if LOOP_FILTER_BITMASK
+ LoopFilterMask *lfm;
+ size_t lfm_num;
+ int lfm_stride;
+#endif // LOOP_FILTER_BITMASK
+};
+
+// Need to align this structure so when it is declared and
+// passed it can be loaded into vector registers.
+typedef struct {
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {
+ loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
+ uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS];
+} loop_filter_info_n;
+
+/* assorted loopfilter functions which get used elsewhere */
+struct AV1Common;
+struct macroblockd;
+struct AV1LfSyncData;
+
+void av1_loop_filter_init(struct AV1Common *cm);
+
+void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
+ int plane_end);
+
+#if LOOP_FILTER_BITMASK
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int is_decoding,
+ int plane_start, int plane_end, int partial_frame);
+#else
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int plane_start,
+ int plane_end, int partial_frame);
+#endif
+
+void av1_filter_block_plane_vert(const struct AV1Common *const cm,
+ const MACROBLOCKD *const xd, const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row, const uint32_t mi_col);
+
+void av1_filter_block_plane_horz(const struct AV1Common *const cm,
+ const MACROBLOCKD *const xd, const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row, const uint32_t mi_col);
+
+typedef struct LoopFilterWorkerData {
+ YV12_BUFFER_CONFIG *frame_buffer;
+ struct AV1Common *cm;
+ struct macroblockd_plane planes[MAX_MB_PLANE];
+ // TODO(Ranjit): When the filter functions are modified to use xd->lossless
+ // add lossless as a member here.
+ MACROBLOCKD *xd;
+} LFWorkerData;
+
+uint8_t get_filter_level(const struct AV1Common *cm,
+ const loop_filter_info_n *lfi_n, const int dir_idx,
+ int plane, const MB_MODE_INFO *mbmi);
+#if LOOP_FILTER_BITMASK
+void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
+ int plane, int subsampling_x, int subsampling_y,
+ int row_end, int col_end);
+
+void av1_filter_block_plane_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane_ptr,
+ int pl, int mi_row, int mi_col);
+
+void av1_filter_block_plane_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane, int pl,
+ int mi_row, int mi_col);
+LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
+ int mi_row, int mi_col);
+int get_index_shift(int mi_col, int mi_row, int *index);
+
+static const FilterMask left_txform_mask[TX_SIZES] = {
+ { { 0x0000000000000001ULL, // TX_4X4,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0000000000010001ULL, // TX_8X8,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_16X16,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_32X32,
+ 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_64X64,
+ 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
+};
+
+static const uint64_t above_txform_mask[2][TX_SIZES] = {
+ {
+ 0x0000000000000001ULL, // TX_4X4
+ 0x0000000000000003ULL, // TX_8X8
+ 0x000000000000000fULL, // TX_16X16
+ 0x00000000000000ffULL, // TX_32X32
+ 0x000000000000ffffULL, // TX_64X64
+ },
+ {
+ 0x0000000000000001ULL, // TX_4X4
+ 0x0000000000000005ULL, // TX_8X8
+ 0x0000000000000055ULL, // TX_16X16
+ 0x0000000000005555ULL, // TX_32X32
+ 0x0000000055555555ULL, // TX_64X64
+ },
+};
+
+extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL];
+
+extern const FilterMask left_mask_univariant_reordered[67];
+
+extern const FilterMask above_mask_univariant_reordered[67];
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_
diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c
new file mode 100644
index 000000000..a77a4d254
--- /dev/null
+++ b/third_party/aom/av1/common/av1_rtcd.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+void av1_rtcd() {
+ // TODO(JBB): Remove this aom_once by ensuring that both the encoder and
+ // decoder setup functions are protected by aom_once().
+ aom_once(setup_rtcd_internal);
+}
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
new file mode 100755
index 000000000..dee1f1c79
--- /dev/null
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -0,0 +1,398 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+sub av1_common_forward_decls() {
+print <<EOF
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers returned by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+EOF
+}
+forward_decls qw/av1_common_forward_decls/;
+
+# Functions that are 64-bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+}
+
+add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn";
+specialize qw/av1_convolve_horiz_rs sse4_1/;
+
+add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
+specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
+
+add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
+
+add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps";
+
+specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
+specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
+specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
+
+
+# directional intra predictor functions
+add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
+add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
+add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
+
+# FILTER_INTRA predictor functions
+add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
+specialize qw/av1_filter_intra_predictor sse4_1/;
+
+# High bitdepth functions
+
+#
+# Sub Pixel Filters
+#
+add_proto qw/void av1_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+add_proto qw/void av1_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+add_proto qw/void av1_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8/, "$sse2_x86_64";
+
+add_proto qw/void av1_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+add_proto qw/void av1_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
+
+#inv txfm
+add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
+
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
+
+add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+add_proto qw/void av1_inv_txfm2d_add_4x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
+add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
+add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+# directional intra predictor functions
+add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
+add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
+add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
+
+# build compound seg mask functions
+add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
+specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
+
+add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
+specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
+
+add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
+
+#
+# Encoder functions below this point.
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+
+ # ENCODEMB INVOKE
+
+  # The transform coefficients are held in 32-bit values, so the assembler
+  # code for av1_block_error can no longer be used.
+ add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/av1_block_error avx2/;
+
+ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp sse2 avx2/;
+
+ add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp_32x32 avx2/;
+
+ add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp_64x64 avx2/;
+
+ # fdct functions
+
+ add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+
+ #fwd txfm
+ add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
+ specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
+
+ add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+
+ add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+
+ #
+ # Motion search
+ #
+ add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
+
+ add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
+
+ add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+ specialize qw/av1_temporal_filter_apply sse2 msa/;
+
+ add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+
+ # ENCODEMB INVOKE
+
+ add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/av1_highbd_block_error sse2/;
+
+ add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+
+ add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+ specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
+
+ add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+
+ # End av1_high encoder functions
+
+ # txb
+ add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
+ specialize qw/av1_get_nz_map_contexts sse2/;
+ add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
+ specialize qw/av1_txb_init_levels sse4_1 avx2/;
+
+ add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
+ specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
+ add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
+ specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
+ add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
+ specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
+
+ # hash
+ add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
+ specialize qw/av1_get_crc32c_value sse4_2/;
+
+ add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, double *M, double *H";
+ specialize qw/av1_compute_stats sse4_1 avx2/;
+
+ add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
+}
+# end encoder functions
+
+# Deringing Functions
+
+add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
+add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift";
+
+add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
+add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
+
+# VS compiling for 32 bit targets does not support vector types in
+# structs as arguments, which makes the v256 type of the intrinsics
+# hard to support, so optimizations for this target are disabled.
+if ($opts{config} !~ /libs-x86-win32-vs.*/) {
+ specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+}
+
+# WARPED_MOTION / GLOBAL_MOTION functions
+
+add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+specialize qw/av1_warp_affine sse4_1 neon/;
+
+add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+specialize qw/av1_highbd_warp_affine sse4_1/;
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
+ specialize qw/compute_cross_correlation sse4_1/;
+}
+
+# LOOP_RESTORATION functions
+
+add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
+
+add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd";
+specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
+
+# CONVOLVE_ROUND/COMPOUND_ROUND functions
+
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+
+ add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
+ add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+
+ specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_2d_scale sse4_1/;
+ specialize qw/av1_jnt_convolve_2d ssse3 avx2 neon/;
+ specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/;
+ specialize qw/av1_jnt_convolve_x sse2 avx2 neon/;
+ specialize qw/av1_jnt_convolve_y sse2 avx2 neon/;
+ specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
+ specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
+ specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/;
+ specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/;
+ specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/;
+ specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/;
+
+# INTRA_EDGE functions
+add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
+specialize qw/av1_filter_intra_edge sse4_1/;
+add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
+specialize qw/av1_upsample_intra_edge sse4_1/;
+
+add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
+specialize qw/av1_filter_intra_edge_high sse4_1/;
+add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
+specialize qw/av1_upsample_intra_edge_high sse4_1/;
+
+# CFL
+add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size";
+specialize qw/get_subtract_average_fn sse2 avx2 neon vsx/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
+specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
+
+add_proto qw/cfl_predict_hbd_fn get_predict_hbd_fn/, "TX_SIZE tx_size";
+specialize qw/get_predict_hbd_fn ssse3 avx2 neon/;
+
+1;
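For context: in these RTCD (run-time CPU detection) definitions, add_proto declares the
C prototype of a function whose implementation is chosen at run time, and specialize
lists the SIMD flavours that exist for it. The build generates a dispatch header from
this file; as a rough, non-authoritative sketch with simplified names (not the exact
generated output), the dispatch for av1_inv_txfm_add would look something like:

    /* Sketch only -- an approximation of the generated dispatch, not upstream code. */
    void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                            const TxfmParam *txfm_param);
    void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                                const TxfmParam *txfm_param);
    void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                               const TxfmParam *txfm_param);
    /* Function pointer that setup code points at the fastest variant the CPU supports. */
    extern void (*av1_inv_txfm_add)(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                                    const TxfmParam *txfm_param);

Callers always go through av1_inv_txfm_add; on targets without the listed SIMD support
the pointer simply stays on the plain C version.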
diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c
new file mode 100644
index 000000000..bb70eab70
--- /dev/null
+++ b/third_party/aom/av1/common/av1_txfm.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+
+// av1_cospi_arr_data[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
+const int32_t av1_cospi_arr_data[7][64] = {
+ { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
+ 972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837,
+ 822, 807, 792, 775, 759, 742, 724, 706, 688, 669, 650, 630, 610,
+ 590, 569, 548, 526, 505, 483, 460, 438, 415, 392, 369, 345, 321,
+ 297, 273, 249, 224, 200, 175, 150, 125, 100, 75, 50, 25 },
+ { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987,
+ 1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782,
+ 1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448,
+ 1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009,
+ 965, 921, 876, 830, 784, 737, 690, 642, 595, 546, 498,
+ 449, 400, 350, 301, 251, 201, 151, 100, 50 },
+ { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+ 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+ 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+ 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+ 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+ 897, 799, 700, 601, 501, 401, 301, 201, 101 },
+ { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946,
+ 7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128,
+ 7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
+ 5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
+ 3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
+ 1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 },
+ { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
+ 15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
+ 14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
+ 11297, 11003, 10702, 10394, 10080, 9760, 9434, 9102, 8765, 8423, 8076,
+ 7723, 7366, 7005, 6639, 6270, 5897, 5520, 5139, 4756, 4370, 3981,
+ 3590, 3196, 2801, 2404, 2006, 1606, 1205, 804, 402 },
+ { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
+ 31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
+ 28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
+ 22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
+ 15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512, 8740, 7962,
+ 7180, 6393, 5602, 4808, 4011, 3212, 2411, 1608, 804 },
+ { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
+ 63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
+ 56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
+ 45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
+ 30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
+ 14359, 12785, 11204, 9616, 8022, 6424, 4821, 3216, 1608 }
+};
+
+// av1_sinpi_arr_data[i][j] = (int)round((sqrt(2) * sin(j*Pi/9) * 2 / 3) * (1
+// << (cos_bit_min + i))) modified so that elements j=1,2 sum to element j=4.
+const int32_t av1_sinpi_arr_data[7][5] = {
+ { 0, 330, 621, 836, 951 }, { 0, 660, 1241, 1672, 1901 },
+ { 0, 1321, 2482, 3344, 3803 }, { 0, 2642, 4964, 6689, 7606 },
+ { 0, 5283, 9929, 13377, 15212 }, { 0, 10566, 19858, 26755, 30424 },
+ { 0, 21133, 39716, 53510, 60849 }
+};
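The generating formulas in the comments above are enough to reproduce both tables. A
minimal stand-alone sketch, assuming only those formulas (this helper is illustrative
and not part of the upstream file):

    #include <math.h>
    #include <stdio.h>

    /* av1_cospi_arr_data[i][j] per the comment above, with cos_bit_min == 10. */
    static int cospi_entry(int i, int j) {
      return (int)round(cos(M_PI * j / 128) * (1 << (10 + i)));
    }

    int main(void) {
      /* Spot-check two entries against the table: expect 1024 and 2896. */
      printf("%d %d\n", cospi_entry(0, 0), cospi_entry(2, 32));
      return 0;
    }

Note that av1_sinpi_arr_data is additionally adjusted so that, within each row, the
j=1 and j=2 entries sum to the j=4 entry (for example 330 + 621 == 951 in the first
row), which the raw formula would not guarantee exactly.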
+
+void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
+ int i;
+ if (bit == 0) {
+ return;
+ } else {
+ if (bit > 0) {
+ for (i = 0; i < size; i++) {
+ arr[i] = round_shift(arr[i], bit);
+ }
+ } else {
+ for (i = 0; i < size; i++) {
+ arr[i] = (int32_t)clamp64(((int64_t)1 << (-bit)) * arr[i], INT32_MIN,
+ INT32_MAX);
+ }
+ }
+ }
+}
+
+const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = {
+ { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 },
+ { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 },
+ { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 },
+ { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID,
+ TXFM_TYPE_IDENTITY32 },
+ { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID }
+};
+
+const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = {
+ 4, // TXFM_TYPE_DCT4
+ 6, // TXFM_TYPE_DCT8
+ 8, // TXFM_TYPE_DCT16
+ 10, // TXFM_TYPE_DCT32
+ 12, // TXFM_TYPE_DCT64
+ 7, // TXFM_TYPE_ADST4
+ 8, // TXFM_TYPE_ADST8
+ 10, // TXFM_TYPE_ADST16
+ 1, // TXFM_TYPE_IDENTITY4
+ 1, // TXFM_TYPE_IDENTITY8
+ 1, // TXFM_TYPE_IDENTITY16
+ 1, // TXFM_TYPE_IDENTITY32
+};
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+ const int32_t *buf, int32_t size, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ const int64_t max_value = (1LL << (bit - 1)) - 1;
+ const int64_t min_value = -(1LL << (bit - 1));
+
+ int in_range = 1;
+
+ for (int i = 0; i < size; ++i) {
+ if (buf[i] < min_value || buf[i] > max_value) {
+ in_range = 0;
+ }
+ }
+
+ if (!in_range) {
+ fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+ fprintf(stderr, "size: %d\n", size);
+ fprintf(stderr, "stage: %d\n", stage);
+ fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
+ max_value);
+
+ fprintf(stderr, "coeffs: ");
+
+ fprintf(stderr, "[");
+ for (int j = 0; j < size; j++) {
+ if (j > 0) fprintf(stderr, ", ");
+ fprintf(stderr, "%d", input[j]);
+ }
+ fprintf(stderr, "]\n");
+
+ fprintf(stderr, " buf: ");
+
+ fprintf(stderr, "[");
+ for (int j = 0; j < size; j++) {
+ if (j > 0) fprintf(stderr, ", ");
+ fprintf(stderr, "%d", buf[j]);
+ }
+ fprintf(stderr, "]\n\n");
+ }
+
+ assert(in_range);
+#else
+ (void)stage;
+ (void)input;
+ (void)buf;
+ (void)size;
+ (void)bit;
+#endif
+}
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
new file mode 100644
index 000000000..59d64ca4a
--- /dev/null
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_TXFM_H_
+#define AOM_AV1_COMMON_AV1_TXFM_H_
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(DO_RANGE_CHECK_CLAMP)
+#define DO_RANGE_CHECK_CLAMP 0
+#endif
+
+extern const int32_t av1_cospi_arr_data[7][64];
+extern const int32_t av1_sinpi_arr_data[7][5];
+
+#define MAX_TXFM_STAGE_NUM 12
+
+static const int cos_bit_min = 10;
+static const int cos_bit_max = 16;
+
+#define NewSqrt2Bits ((int32_t)12)
+// 2^12 * sqrt(2)
+static const int32_t NewSqrt2 = 5793;
+// 2^12 / sqrt(2)
+static const int32_t NewInvSqrt2 = 2896;
+
+static INLINE const int32_t *cospi_arr(int n) {
+ return av1_cospi_arr_data[n - cos_bit_min];
+}
+
+static INLINE const int32_t *sinpi_arr(int n) {
+ return av1_sinpi_arr_data[n - cos_bit_min];
+}
+
+static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ const int64_t max_value = (1LL << (bit - 1)) - 1;
+ const int64_t min_value = -(1LL << (bit - 1));
+ if (value < min_value || value > max_value) {
+ fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit);
+ assert(0);
+ }
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+#if DO_RANGE_CHECK_CLAMP
+ bit = AOMMIN(bit, 31);
+ return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1);
+#endif // DO_RANGE_CHECK_CLAMP
+ (void)bit;
+ return value;
+}
+
+static INLINE int32_t round_shift(int64_t value, int bit) {
+ assert(bit >= 1);
+ return (int32_t)((value + (1ll << (bit - 1))) >> bit);
+}
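round_shift() is the rounding primitive used throughout these transforms: add half of
the divisor, then shift. As a quick arithmetic illustration (not upstream code):

    /* round_shift(5, 1)  == (5 + 1) >> 1  ==  3
     * round_shift(-5, 1) == (-5 + 1) >> 1 == -2   (ties round toward +infinity) */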
+
+static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+ int bit) {
+ int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
+ int64_t intermediate = result_64 + (1LL << (bit - 1));
+ // NOTE(david.barker): The value 'result_64' may not necessarily fit
+ // into 32 bits. However, the result of this function is nominally
+ // ROUND_POWER_OF_TWO_64(result_64, bit)
+ // and that is required to fit into stage_range[stage] many bits
+ // (checked by range_check_buf()).
+ //
+ // Here we've unpacked that rounding operation, and it can be shown
+ // that the value of 'intermediate' here *does* fit into 32 bits
+ // for any conformant bitstream.
+ // The upshot is that, if you do all this calculation using
+ // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic,
+ // then you'll still get the correct result.
+ // To provide a check on this logic, we assert that 'intermediate'
+ // would fit into an int32 if range checking is enabled.
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX);
+#endif
+ return (int32_t)(intermediate >> bit);
+}
+
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+ int bd) {
+ return clip_pixel_highbd(dest + (int)trans, bd);
+}
+
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+
+typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd);
+
+typedef enum TXFM_TYPE {
+ TXFM_TYPE_DCT4,
+ TXFM_TYPE_DCT8,
+ TXFM_TYPE_DCT16,
+ TXFM_TYPE_DCT32,
+ TXFM_TYPE_DCT64,
+ TXFM_TYPE_ADST4,
+ TXFM_TYPE_ADST8,
+ TXFM_TYPE_ADST16,
+ TXFM_TYPE_IDENTITY4,
+ TXFM_TYPE_IDENTITY8,
+ TXFM_TYPE_IDENTITY16,
+ TXFM_TYPE_IDENTITY32,
+ TXFM_TYPES,
+ TXFM_TYPE_INVALID,
+} TXFM_TYPE;
+
+typedef struct TXFM_2D_FLIP_CFG {
+ TX_SIZE tx_size;
+ int ud_flip; // flip upside down
+ int lr_flip; // flip left to right
+ const int8_t *shift;
+ int8_t cos_bit_col;
+ int8_t cos_bit_row;
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ TXFM_TYPE txfm_type_col;
+ TXFM_TYPE txfm_type_row;
+ int stage_num_col;
+ int stage_num_row;
+} TXFM_2D_FLIP_CFG;
+
+static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ *ud_flip = 0;
+ *lr_flip = 0;
+ break;
+ case IDTX:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ *ud_flip = 0;
+ *lr_flip = 0;
+ break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ *ud_flip = 1;
+ *lr_flip = 0;
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ *ud_flip = 0;
+ *lr_flip = 1;
+ break;
+ case FLIPADST_FLIPADST:
+ *ud_flip = 1;
+ *lr_flip = 1;
+ break;
+ default:
+ *ud_flip = 0;
+ *lr_flip = 0;
+ assert(0);
+ }
+}
+
+static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+ get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
+}
+
+// Utility function that returns the log of the ratio of the col and row
+// sizes.
+static INLINE int get_rect_tx_log_ratio(int col, int row) {
+ if (col == row) return 0;
+ if (col > row) {
+ if (col == row * 2) return 1;
+ if (col == row * 4) return 2;
+ assert(0 && "Unsupported transform size");
+ } else {
+ if (row == col * 2) return -1;
+ if (row == col * 4) return -2;
+ assert(0 && "Unsupported transform size");
+ }
+ return 0; // Invalid
+}
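In other words, the sign of the result encodes the orientation: wider-than-tall blocks
give a positive log ratio and taller-than-wide blocks a negative one. Reading the code
above directly:

    /* get_rect_tx_log_ratio(16, 8) == 1;  get_rect_tx_log_ratio(8, 32) == -2 */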
+
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, int bd);
+
+void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
+ int bd);
+
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg);
+void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg);
+extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D];
+extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES];
+static INLINE int get_txw_idx(TX_SIZE tx_size) {
+ return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
+}
+static INLINE int get_txh_idx(TX_SIZE tx_size) {
+ return tx_size_high_log2[tx_size] - tx_size_high_log2[0];
+}
+
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+ const int32_t *buf, int32_t size, int8_t bit);
+#define MAX_TXWH_IDX 5
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // AOM_AV1_COMMON_AV1_TXFM_H_
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
new file mode 100644
index 000000000..2e796b656
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
+ if (!left_mi) return DC_PRED;
+ assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi));
+ return left_mi->mode;
+}
+
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
+ if (!above_mi) return DC_PRED;
+ assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi));
+ return above_mi->mode;
+}
+
+void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+ int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int has_eob, int aoff, int loff) {
+ ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+ ENTROPY_CONTEXT *const l = pd->left_context + loff;
+ const int txs_wide = tx_size_wide_unit[tx_size];
+ const int txs_high = tx_size_high_unit[tx_size];
+
+ // above
+ if (has_eob && xd->mb_to_right_edge < 0) {
+ const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff);
+ memset(a, has_eob, sizeof(*a) * above_contexts);
+ memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts));
+ } else {
+ memset(a, has_eob, sizeof(*a) * txs_wide);
+ }
+
+ // left
+ if (has_eob && xd->mb_to_bottom_edge < 0) {
+ const int blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int left_contexts = AOMMIN(txs_high, blocks_high - loff);
+ memset(l, has_eob, sizeof(*l) * left_contexts);
+ memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts));
+ } else {
+ memset(l, has_eob, sizeof(*l) * txs_high);
+ }
+}
+void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, const int num_planes) {
+ int i;
+ int nplanes;
+ int chroma_ref;
+ chroma_ref =
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+ nplanes = 1 + (num_planes - 1) * chroma_ref;
+ for (i = 0; i < nplanes; i++) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int txs_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int txs_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide);
+ memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high);
+ }
+}
+
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) {
+ xd->delta_lf_from_base = 0;
+ const int frame_lf_count =
+ num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0;
+}
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) {
+ for (int p = 0; p < num_planes; ++p) {
+ set_default_wiener(xd->wiener_info + p);
+ set_default_sgrproj(xd->sgrproj_info + p);
+ }
+}
+
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+ const int num_planes) {
+ int i;
+
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].plane_type = get_plane_type(i);
+ xd->plane[i].subsampling_x = i ? ss_x : 0;
+ xd->plane[i].subsampling_y = i ? ss_y : 0;
+ }
+ for (i = num_planes; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].subsampling_x = 1;
+ xd->plane[i].subsampling_y = 1;
+ }
+}
+
+const int16_t dr_intra_derivative[90] = {
+ // More evenly spread out angles and limited to 10-bit
+ // Values that are 0 will never be used
+ // Approx angle
+ 0, 0, 0, //
+ 1023, 0, 0, // 3, ...
+ 547, 0, 0, // 6, ...
+ 372, 0, 0, 0, 0, // 9, ...
+ 273, 0, 0, // 14, ...
+ 215, 0, 0, // 17, ...
+ 178, 0, 0, // 20, ...
+ 151, 0, 0, // 23, ... (113 & 203 are base angles)
+ 132, 0, 0, // 26, ...
+ 116, 0, 0, // 29, ...
+ 102, 0, 0, 0, // 32, ...
+ 90, 0, 0, // 36, ...
+ 80, 0, 0, // 39, ...
+ 71, 0, 0, // 42, ...
+ 64, 0, 0, // 45, ... (45 & 135 are base angles)
+ 57, 0, 0, // 48, ...
+ 51, 0, 0, // 51, ...
+ 45, 0, 0, 0, // 54, ...
+ 40, 0, 0, // 58, ...
+ 35, 0, 0, // 61, ...
+ 31, 0, 0, // 64, ...
+ 27, 0, 0, // 67, ... (67 & 157 are base angles)
+ 23, 0, 0, // 70, ...
+ 19, 0, 0, // 73, ...
+ 15, 0, 0, 0, 0, // 76, ...
+ 11, 0, 0, // 81, ...
+ 7, 0, 0, // 84, ...
+ 3, 0, 0, // 87, ...
+};
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
new file mode 100644
index 000000000..a2311c1b0
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.h
@@ -0,0 +1,1176 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_BLOCKD_H_
+#define AOM_AV1_COMMON_BLOCKD_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define USE_B_QUANT_NO_TRELLIS 1
+
+#define MAX_MB_PLANE 3
+
+#define MAX_DIFFWTD_MASK_BITS 1
+
+// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
+typedef enum ATTRIBUTE_PACKED {
+ DIFFWTD_38 = 0,
+ DIFFWTD_38_INV,
+ DIFFWTD_MASK_TYPES,
+} DIFFWTD_MASK_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ KEY_FRAME = 0,
+ INTER_FRAME = 1,
+ INTRA_ONLY_FRAME = 2, // replaces intra-only
+ S_FRAME = 3,
+ FRAME_TYPES,
+} FRAME_TYPE;
+
+static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
+ return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
+}
+
+static INLINE int is_inter_mode(PREDICTION_MODE mode) {
+ return mode >= INTER_MODE_START && mode < INTER_MODE_END;
+}
+
+typedef struct {
+ uint8_t *plane[MAX_MB_PLANE];
+ int stride[MAX_MB_PLANE];
+} BUFFER_SET;
+
+static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
+ return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END;
+}
+static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
+ return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END;
+}
+
+static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
+ static PREDICTION_MODE lut[] = {
+ MB_MODE_COUNT, // DC_PRED
+ MB_MODE_COUNT, // V_PRED
+ MB_MODE_COUNT, // H_PRED
+ MB_MODE_COUNT, // D45_PRED
+ MB_MODE_COUNT, // D135_PRED
+ MB_MODE_COUNT, // D113_PRED
+ MB_MODE_COUNT, // D157_PRED
+ MB_MODE_COUNT, // D203_PRED
+ MB_MODE_COUNT, // D67_PRED
+ MB_MODE_COUNT, // SMOOTH_PRED
+ MB_MODE_COUNT, // SMOOTH_V_PRED
+ MB_MODE_COUNT, // SMOOTH_H_PRED
+ MB_MODE_COUNT, // PAETH_PRED
+ MB_MODE_COUNT, // NEARESTMV
+ MB_MODE_COUNT, // NEARMV
+ MB_MODE_COUNT, // GLOBALMV
+ MB_MODE_COUNT, // NEWMV
+ NEARESTMV, // NEAREST_NEARESTMV
+ NEARMV, // NEAR_NEARMV
+ NEARESTMV, // NEAREST_NEWMV
+ NEWMV, // NEW_NEARESTMV
+ NEARMV, // NEAR_NEWMV
+ NEWMV, // NEW_NEARMV
+ GLOBALMV, // GLOBAL_GLOBALMV
+ NEWMV, // NEW_NEWMV
+ };
+ assert(NELEMENTS(lut) == MB_MODE_COUNT);
+ assert(is_inter_compound_mode(mode));
+ return lut[mode];
+}
+
+static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
+ static PREDICTION_MODE lut[] = {
+ MB_MODE_COUNT, // DC_PRED
+ MB_MODE_COUNT, // V_PRED
+ MB_MODE_COUNT, // H_PRED
+ MB_MODE_COUNT, // D45_PRED
+ MB_MODE_COUNT, // D135_PRED
+ MB_MODE_COUNT, // D113_PRED
+ MB_MODE_COUNT, // D157_PRED
+ MB_MODE_COUNT, // D203_PRED
+ MB_MODE_COUNT, // D67_PRED
+ MB_MODE_COUNT, // SMOOTH_PRED
+ MB_MODE_COUNT, // SMOOTH_V_PRED
+ MB_MODE_COUNT, // SMOOTH_H_PRED
+ MB_MODE_COUNT, // PAETH_PRED
+ MB_MODE_COUNT, // NEARESTMV
+ MB_MODE_COUNT, // NEARMV
+ MB_MODE_COUNT, // GLOBALMV
+ MB_MODE_COUNT, // NEWMV
+ NEARESTMV, // NEAREST_NEARESTMV
+ NEARMV, // NEAR_NEARMV
+ NEWMV, // NEAREST_NEWMV
+ NEARESTMV, // NEW_NEARESTMV
+ NEWMV, // NEAR_NEWMV
+ NEARMV, // NEW_NEARMV
+ GLOBALMV, // GLOBAL_GLOBALMV
+ NEWMV, // NEW_NEWMV
+ };
+ assert(NELEMENTS(lut) == MB_MODE_COUNT);
+ assert(is_inter_compound_mode(mode));
+ return lut[mode];
+}
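Taken together, compound_ref0_mode() and compound_ref1_mode() split a compound inter
mode into the two per-reference single-reference modes. Reading the two lookup tables
above, for example:

    /* Values taken straight from the LUTs above; shown for illustration only. */
    assert(compound_ref0_mode(NEAREST_NEWMV) == NEARESTMV);
    assert(compound_ref1_mode(NEAREST_NEWMV) == NEWMV);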
+
+static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) {
+ return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV ||
+ mode == NEW_NEARMV);
+}
+
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+ return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV ||
+ mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV);
+}
+
+static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
+ return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD);
+}
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+typedef int8_t MV_REFERENCE_FRAME;
+
+typedef struct {
+ // Number of base colors for Y (0) and UV (1)
+ uint8_t palette_size[2];
+ // Value of base colors for Y, U, and V
+ uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
+} PALETTE_MODE_INFO;
+
+typedef struct {
+ uint8_t use_filter_intra;
+ FILTER_INTRA_MODE filter_intra_mode;
+} FILTER_INTRA_MODE_INFO;
+
+static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = {
+ DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED
+};
+
+#if CONFIG_RD_DEBUG
+#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE)
+#endif
+
+typedef struct RD_STATS {
+ int rate;
+ int64_t dist;
+  // Please be careful when using rdcost; it is not guaranteed to be set all
+  // the time.
+ // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. In
+ // these functions, make sure rdcost is always up-to-date according to
+ // rate/dist.
+ int64_t rdcost;
+ int64_t sse;
+  int skip;  // sse should equal dist when skip == 1
+ int64_t ref_rdcost;
+ int zero_rate;
+ uint8_t invalid_rate;
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost[MAX_MB_PLANE];
+ int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
+ [TXB_COEFF_COST_MAP_SIZE];
+#endif // CONFIG_RD_DEBUG
+} RD_STATS;
+
+// This struct is used to group function args that are commonly
+// sent together in functions related to interinter compound modes
+typedef struct {
+ int wedge_index;
+ int wedge_sign;
+ DIFFWTD_MASK_TYPE mask_type;
+ uint8_t *seg_mask;
+ COMPOUND_TYPE type;
+} INTERINTER_COMPOUND_DATA;
+
+#define INTER_TX_SIZE_BUF_LEN 16
+#define TXK_TYPE_BUF_LEN 64
+// This structure now relates to 4x4 block regions.
+typedef struct MB_MODE_INFO {
+ // Common for both INTER and INTRA blocks
+ BLOCK_SIZE sb_type;
+ PREDICTION_MODE mode;
+ TX_SIZE tx_size;
+ uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ int8_t skip;
+ int8_t skip_mode;
+ int8_t segment_id;
+ int8_t seg_id_predicted; // valid only when temporal_update is enabled
+
+ // Only for INTRA blocks
+ UV_PREDICTION_MODE uv_mode;
+
+ PALETTE_MODE_INFO palette_mode_info;
+ uint8_t use_intrabc;
+
+ // Only for INTER blocks
+ InterpFilters interp_filters;
+ MV_REFERENCE_FRAME ref_frame[2];
+
+ TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
+
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+
+ // The actual prediction angle is the base angle + (angle_delta * step).
+ int8_t angle_delta[PLANE_TYPES];
+
+ // interintra members
+ INTERINTRA_MODE interintra_mode;
+ // TODO(debargha): Consolidate these flags
+ int use_wedge_interintra;
+ int interintra_wedge_index;
+ int interintra_wedge_sign;
+ // interinter members
+ INTERINTER_COMPOUND_DATA interinter_comp;
+ MOTION_MODE motion_mode;
+ int overlappable_neighbors[2];
+ int_mv mv[2];
+ uint8_t ref_mv_idx;
+ PARTITION_TYPE partition;
+ /* deringing gain *per-superblock* */
+ int8_t cdef_strength;
+ int current_qindex;
+ int delta_lf_from_base;
+ int delta_lf[FRAME_LF_COUNT];
+#if CONFIG_RD_DEBUG
+ RD_STATS rd_stats;
+ int mi_row;
+ int mi_col;
+#endif
+ int num_proj_ref;
+ WarpedMotionParams wm_params;
+
+ // Index of the alpha Cb and alpha Cr combination
+ int cfl_alpha_idx;
+ // Joint sign of alpha Cb and alpha Cr
+ int cfl_alpha_signs;
+
+ int compound_idx;
+ int comp_group_idx;
+} MB_MODE_INFO;
+
+static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
+ return mbmi->use_intrabc;
+}
+
+static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
+ assert(mode < UV_INTRA_MODES);
+ static const PREDICTION_MODE uv2y[] = {
+ DC_PRED, // UV_DC_PRED
+ V_PRED, // UV_V_PRED
+ H_PRED, // UV_H_PRED
+ D45_PRED, // UV_D45_PRED
+ D135_PRED, // UV_D135_PRED
+ D113_PRED, // UV_D113_PRED
+ D157_PRED, // UV_D157_PRED
+ D203_PRED, // UV_D203_PRED
+ D67_PRED, // UV_D67_PRED
+ SMOOTH_PRED, // UV_SMOOTH_PRED
+ SMOOTH_V_PRED, // UV_SMOOTH_V_PRED
+ SMOOTH_H_PRED, // UV_SMOOTH_H_PRED
+ PAETH_PRED, // UV_PAETH_PRED
+ DC_PRED, // UV_CFL_PRED
+ INTRA_INVALID, // UV_INTRA_MODES
+ INTRA_INVALID, // UV_MODE_INVALID
+ };
+ return uv2y[mode];
+}
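One detail worth noting from the uv2y table above: UV_CFL_PRED maps to DC_PRED, so any
caller that only inspects the returned PREDICTION_MODE treats a CfL chroma block like a
DC-predicted one. For example:

    assert(get_uv_mode(UV_CFL_PRED) == DC_PRED);  /* per the table above */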
+
+static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
+ return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
+}
+
+static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
+ return mbmi->ref_frame[1] > INTRA_FRAME;
+}
+
+static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) {
+ return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^
+ (mbmi->ref_frame[1] >= BWDREF_FRAME)));
+}
+
+static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) {
+ static const MV_REFERENCE_FRAME lut[] = {
+ LAST_FRAME, // LAST_LAST2_FRAMES,
+ LAST_FRAME, // LAST_LAST3_FRAMES,
+ LAST_FRAME, // LAST_GOLDEN_FRAMES,
+ BWDREF_FRAME, // BWDREF_ALTREF_FRAMES,
+ LAST2_FRAME, // LAST2_LAST3_FRAMES
+ LAST2_FRAME, // LAST2_GOLDEN_FRAMES,
+ LAST3_FRAME, // LAST3_GOLDEN_FRAMES,
+ BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES,
+ ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES,
+ };
+ assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
+ return lut[ref_idx];
+}
+
+static INLINE MV_REFERENCE_FRAME comp_ref1(int ref_idx) {
+ static const MV_REFERENCE_FRAME lut[] = {
+ LAST2_FRAME, // LAST_LAST2_FRAMES,
+ LAST3_FRAME, // LAST_LAST3_FRAMES,
+ GOLDEN_FRAME, // LAST_GOLDEN_FRAMES,
+ ALTREF_FRAME, // BWDREF_ALTREF_FRAMES,
+ LAST3_FRAME, // LAST2_LAST3_FRAMES
+ GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES,
+ GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES,
+ ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES,
+ ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES,
+ };
+ assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
+ return lut[ref_idx];
+}
+
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi);
+
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi);
+
+static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
+ TransformationType type) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int block_size_allowed =
+ AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
+ return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
+ block_size_allowed;
+}
+
+#if CONFIG_MISMATCH_DEBUG
+static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
+ int mi_row, int tx_blk_col, int tx_blk_row,
+ int subsampling_x, int subsampling_y) {
+ *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) +
+ (tx_blk_col << tx_size_wide_log2[0]);
+ *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) +
+ (tx_blk_row << tx_size_high_log2[0]);
+}
+#endif
+
+enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+
+struct buf_2d {
+ uint8_t *buf;
+ uint8_t *buf0;
+ int width;
+ int height;
+ int stride;
+};
+
+typedef struct eob_info {
+ uint16_t eob;
+ uint16_t max_scan_line;
+} eob_info;
+
+typedef struct {
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]);
+ eob_info eob_data[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
+} CB_BUFFER;
+
+typedef struct macroblockd_plane {
+ tran_low_t *dqcoeff;
+ tran_low_t *dqcoeff_block;
+ eob_info *eob_data;
+ PLANE_TYPE plane_type;
+ int subsampling_x;
+ int subsampling_y;
+ struct buf_2d dst;
+ struct buf_2d pre[2];
+ ENTROPY_CONTEXT *above_context;
+ ENTROPY_CONTEXT *left_context;
+
+  // The dequantizers below are true dequantizers used only in the
+ // dequantization process. They have the same coefficient
+ // shift/scale as TX.
+ int16_t seg_dequant_QTX[MAX_SEGMENTS][2];
+ uint8_t *color_index_map;
+
+ // block size in pixels
+ uint8_t width, height;
+
+ qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+ // the 'dequantizers' below are not literal dequantizer values.
+ // They're used by encoder RDO to generate ad-hoc lambda values.
+ // They use a hardwired Q3 coeff shift and do not necessarily match
+ // the TX scale in use.
+ const int16_t *dequant_Q3;
+} MACROBLOCKD_PLANE;
+
+#define BLOCK_OFFSET(x, i) \
+ ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0])))
+
+typedef struct RefBuffer {
+ int idx; // frame buf idx
+ int map_idx; // frame map idx
+ YV12_BUFFER_CONFIG *buf;
+ struct scale_factors sf;
+} RefBuffer;
+
+typedef struct {
+ DECLARE_ALIGNED(16, InterpKernel, vfilter);
+ DECLARE_ALIGNED(16, InterpKernel, hfilter);
+} WienerInfo;
+
+typedef struct {
+ int ep;
+ int xqd[2];
+} SgrprojInfo;
+
+#if CONFIG_DEBUG
+#define CFL_SUB8X8_VAL_MI_SIZE (4)
+#define CFL_SUB8X8_VAL_MI_SQUARE \
+ (CFL_SUB8X8_VAL_MI_SIZE * CFL_SUB8X8_VAL_MI_SIZE)
+#endif // CONFIG_DEBUG
+#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32)
+#define CFL_BUF_LINE (32)
+#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
+#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
+#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE)
+typedef struct cfl_ctx {
+ // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid
+ // shifts)
+ uint16_t recon_buf_q3[CFL_BUF_SQUARE];
+ // Q3 AC contributions (reconstructed luma pixels - tx block avg)
+ int16_t ac_buf_q3[CFL_BUF_SQUARE];
+
+ // Cache the DC_PRED when performing RDO, so it does not have to be recomputed
+ // for every scaling parameter
+ int dc_pred_is_cached[CFL_PRED_PLANES];
+  // The DC_PRED cache is disabled when decoding
+ int use_dc_pred_cache;
+ // Only cache the first row of the DC_PRED
+ int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE];
+
+ // Height and width currently used in the CfL prediction buffer.
+ int buf_height, buf_width;
+
+ int are_parameters_computed;
+
+ // Chroma subsampling
+ int subsampling_x, subsampling_y;
+
+ int mi_row, mi_col;
+
+ // Whether the reconstructed luma pixels need to be stored
+ int store_y;
+
+#if CONFIG_DEBUG
+ int rate;
+#endif // CONFIG_DEBUG
+
+ int is_chroma_reference;
+} CFL_CTX;
+
+typedef struct jnt_comp_params {
+ int use_jnt_comp_avg;
+ int fwd_offset;
+ int bck_offset;
+} JNT_COMP_PARAMS;
+
+// Most/all of the pointers are mere pointers to actual arrays that are
+// allocated elsewhere. This is mostly for coding convenience.
+typedef struct macroblockd {
+ struct macroblockd_plane plane[MAX_MB_PLANE];
+
+ TileInfo tile;
+
+ int mi_stride;
+
+ MB_MODE_INFO **mi;
+ MB_MODE_INFO *left_mbmi;
+ MB_MODE_INFO *above_mbmi;
+ MB_MODE_INFO *chroma_left_mbmi;
+ MB_MODE_INFO *chroma_above_mbmi;
+
+ int up_available;
+ int left_available;
+ int chroma_up_available;
+ int chroma_left_available;
+
+ /* Distance of MB away from frame edges in subpixels (1/8th pixel) */
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+ /* pointers to reference frames */
+ const RefBuffer *block_refs[2];
+
+ /* pointer to current frame */
+ const YV12_BUFFER_CONFIG *cur_buf;
+
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][MAX_MIB_SIZE];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
+
+ TXFM_CONTEXT *above_txfm_context;
+ TXFM_CONTEXT *left_txfm_context;
+ TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+
+ WienerInfo wiener_info[MAX_MB_PLANE];
+ SgrprojInfo sgrproj_info[MAX_MB_PLANE];
+
+ // block dimension in the unit of mode_info.
+ uint8_t n4_w, n4_h;
+
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+ uint8_t is_sec_rect;
+
+ // Counts of each reference frame in the above and left neighboring blocks.
+ // NOTE: Take into account both single and comp references.
+ uint8_t neighbors_ref_counts[REF_FRAMES];
+
+ FRAME_CONTEXT *tile_ctx;
+ /* Bit depth: 8, 10, 12 */
+ int bd;
+
+ int qindex[MAX_SEGMENTS];
+ int lossless[MAX_SEGMENTS];
+ int corrupted;
+ int cur_frame_force_integer_mv;
+  // same as in AV1_COMMON
+ struct aom_internal_error_info *error_info;
+ const WarpedMotionParams *global_motion;
+ int delta_qindex;
+ int current_qindex;
+ // Since the actual frame-level loop filter level is not available at the
+ // beginning of a tile on the encoder side (it only becomes available during
+ // actual filtering), we record the delta_lf (against the frame-level loop
+ // filter level) and code the delta between the previous superblock's delta
+ // lf and the current delta lf. This is equivalent to the delta between the
+ // previous superblock's actual lf and the current lf.
+ int delta_lf_from_base;
+ // We have four frame filter levels for different planes and directions, so
+ // to support the per-superblock update we need a few more params, as below:
+ // 0: delta loop filter level for y plane vertical
+ // 1: delta loop filter level for y plane horizontal
+ // 2: delta loop filter level for u plane
+ // 3: delta loop filter level for v plane
+ // To make this consistent with how each filter level is referenced in a
+ // segment, subtract 1, since
+ // SEG_LVL_ALT_LF_Y_V = 1;
+ // SEG_LVL_ALT_LF_Y_H = 2;
+ // SEG_LVL_ALT_LF_U = 3;
+ // SEG_LVL_ALT_LF_V = 4;
+ int delta_lf[FRAME_LF_COUNT];
+ int cdef_preset[4];
+
+ DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+ uint8_t *mc_buf[2];
+ CFL_CTX cfl;
+
+ JNT_COMP_PARAMS jcp_param;
+
+ uint16_t cb_offset[MAX_MB_PLANE];
+ uint16_t txb_offset[MAX_MB_PLANE];
+ uint16_t color_index_map_offset[2];
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+} MACROBLOCKD;
+
+static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
+ return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
+}
+
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+ return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? CONVERT_TO_BYTEPTR(buf16)
+ : buf16;
+}
+
+static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_4X4: return 0;
+ case BLOCK_8X8: return 1;
+ case BLOCK_16X16: return 2;
+ case BLOCK_32X32: return 3;
+ case BLOCK_64X64: return 4;
+ case BLOCK_128X128: return 5;
+ default: return SQR_BLOCK_SIZES;
+ }
+}
+
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
+// Note: the input block size should be square.
+// Otherwise it's considered invalid.
+static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
+ PARTITION_TYPE partition) {
+ if (partition == PARTITION_INVALID) {
+ return BLOCK_INVALID;
+ } else {
+ const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
+ return sqr_bsize_idx >= SQR_BLOCK_SIZES
+ ? BLOCK_INVALID
+ : subsize_lookup[partition][sqr_bsize_idx];
+ }
+}
+
+static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi,
+ PLANE_TYPE plane_type) {
+ static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
+ DCT_DCT, // DC
+ ADST_DCT, // V
+ DCT_ADST, // H
+ DCT_DCT, // D45
+ ADST_ADST, // D135
+ ADST_DCT, // D117
+ DCT_ADST, // D153
+ DCT_ADST, // D207
+ ADST_DCT, // D63
+ ADST_ADST, // SMOOTH
+ ADST_DCT, // SMOOTH_V
+ DCT_ADST, // SMOOTH_H
+ ADST_ADST, // PAETH
+ };
+ const PREDICTION_MODE mode =
+ (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
+ assert(mode < INTRA_MODES);
+ return _intra_mode_to_tx_type[mode];
+}
+
+static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; }
+
+static INLINE int block_signals_txsize(BLOCK_SIZE bsize) {
+ return bsize > BLOCK_4X4;
+}
+
+// Number of transform types in each set type
+static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = {
+ 1, 2, 5, 7, 12, 16,
+};
+
+static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+};
+
+static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
+ 0x0001, // 0000 0000 0000 0001
+ 0x0201, // 0000 0010 0000 0001
+ 0x020F, // 0000 0010 0000 1111
+ 0x0E0F, // 0000 1110 0000 1111
+ 0x0FFF, // 0000 1111 1111 1111
+ 0xFFFF, // 1111 1111 1111 1111
+};
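+// Each flag above is the bitmask form of the corresponding row of
+// av1_ext_tx_used, e.g. row 2 enables tx types 0-3 and 9, i.e. 0x020F.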
+
+static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
+ int use_reduced_set) {
+ const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+ if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
+ if (tx_size_sqr_up == TX_32X32)
+ return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
+ if (use_reduced_set)
+ return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
+ const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
+ if (is_inter) {
+ return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
+ : EXT_TX_SET_ALL16);
+ } else {
+ return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT4_IDTX
+ : EXT_TX_SET_DTT4_IDTX_1DDCT);
+ }
+}
+
+// Maps tx set types to the indices.
+static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = {
+ { // Intra
+ 0, -1, 2, 1, -1, -1 },
+ { // Inter
+ 0, 3, -1, -1, 2, 1 },
+};
+
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter,
+ int use_reduced_set) {
+ const TxSetType set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
+ return ext_tx_set_index[is_inter][set_type];
+}
+
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter,
+ int use_reduced_set) {
+ const int set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
+ return av1_num_ext_tx_set[set_type];
+}
+
+#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2))
+#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? (t1) : (t2))
+
+static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) {
+ const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize];
+ if (bsize == BLOCK_4X4)
+ return AOMMIN(max_txsize_lookup[bsize], largest_tx_size);
+ if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size)
+ return max_rect_tx_size;
+ else
+ return largest_tx_size;
+}
+
+extern const int16_t dr_intra_derivative[90];
+static const uint8_t mode_to_angle_map[] = {
+ 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0,
+};
+
+// Converts block_index for a given transform size to the index of the block in
+// raster order.
+static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size,
+ int block_idx) {
+ // For transform size 4x8, the possible block_idx values are 0 & 2, because
+ // block_idx values are incremented in steps of size 'tx_width_unit x
+ // tx_height_unit'. But, for this transform size, block_idx = 2 corresponds to
+ // block number 1 in raster order, inside an 8x8 MI block.
+ // For any other transform size, the two indices are equivalent.
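+ // E.g. TX_4X8: 0 -> 0 and 2 -> 1; every other (tx_size, block_idx) pair is
+ // returned unchanged.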
+ return (tx_size == TX_4X8 && block_idx == 2) ? 1 : block_idx;
+}
+
+// Inverse of above function.
+// Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now.
+static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
+ int raster_order) {
+ assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4);
+ // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4.
+ return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0;
+}
+
+static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd,
+ TX_SIZE tx_size) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
+ xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+ return DCT_DCT;
+
+ return intra_mode_to_tx_type(mbmi, plane_type);
+}
+
+// Implements the get_plane_residual_size() function in the spec (Section
+// 5.11.38. Get plane residual size function).
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+ int subsampling_x,
+ int subsampling_y) {
+ if (bsize == BLOCK_INVALID) return BLOCK_INVALID;
+ return ss_size_lookup[bsize][subsampling_x][subsampling_y];
+}
+
+static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row,
+ int blk_col) {
+ TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
+ txs = sub_tx_size_map[txs];
+ const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ const int bw_log2 = mi_size_wide_log2[bsize];
+ const int stride_log2 = bw_log2 - tx_w_log2;
+ const int index =
+ ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+ assert(index < INTER_TX_SIZE_BUF_LEN);
+ return index;
+}
+
+static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row,
+ int blk_col) {
+ TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ for (int level = 0; level < MAX_VARTX_DEPTH; ++level)
+ txs = sub_tx_size_map[txs];
+ const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ const int bw_uint_log2 = mi_size_wide_log2[bsize];
+ const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ const int index =
+ ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+ assert(index < TXK_TYPE_BUF_LEN);
+ return index;
+}
+
+static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize,
+ int blk_row, int blk_col, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ const int txk_type_idx = av1_get_txk_type_index(bsize, blk_row, blk_col);
+ txk_type[txk_type_idx] = tx_type;
+
+ const int txw = tx_size_wide_unit[tx_size];
+ const int txh = tx_size_high_unit[tx_size];
+ // The 16x16 unit comes from the tx_64x64 constraint, which caps the maximum
+ // chroma tx size at 32x32. Coupled with the 4:1 transform block sizes, the
+ // constraint also takes effect for 32x16 / 16x32. To handle this, cover all
+ // the 16x16 units inside a 64-level transform.
+ if (txw == tx_size_wide_unit[TX_64X64] ||
+ txh == tx_size_high_unit[TX_64X64]) {
+ const int tx_unit = tx_size_wide_unit[TX_16X16];
+ for (int idy = 0; idy < txh; idy += tx_unit) {
+ for (int idx = 0; idx < txw; idx += tx_unit) {
+ const int this_index =
+ av1_get_txk_type_index(bsize, blk_row + idy, blk_col + idx);
+ txk_type[this_index] = tx_type;
+ }
+ }
+ }
+}
+
+static INLINE TX_TYPE av1_get_tx_type(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd, int blk_row,
+ int blk_col, TX_SIZE tx_size,
+ int reduced_tx_set) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane_type];
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
+
+ TX_TYPE tx_type;
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
+ tx_type = DCT_DCT;
+ } else {
+ if (plane_type == PLANE_TYPE_Y) {
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+ tx_type = mbmi->txk_type[txk_type_idx];
+ } else if (is_inter_block(mbmi)) {
+ // Scale back to the y plane's coordinates.
+ blk_row <<= pd->subsampling_y;
+ blk_col <<= pd->subsampling_x;
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+ tx_type = mbmi->txk_type[txk_type_idx];
+ } else {
+ // In intra mode, uv planes don't share the same prediction mode as y
+ // plane, so the tx_type should not be shared
+ tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
+ }
+ }
+ assert(tx_type < TX_TYPES);
+ if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT;
+ return tx_type;
+}
+
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+ const int num_planes);
+
+static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) {
+ TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ int depth = 0;
+ while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) {
+ depth++;
+ tx_size = sub_tx_size_map[tx_size];
+ }
+ return depth;
+}
+
+static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) {
+ TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ assert(tx_size != TX_4X4);
+ int depth = 0;
+ while (tx_size != TX_4X4) {
+ depth++;
+ tx_size = sub_tx_size_map[tx_size];
+ assert(depth < 10);
+ }
+ assert(depth <= MAX_TX_CATS);
+ return depth - 1;
+}
+
+static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) {
+ TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ TX_SIZE tx_size = max_tx_size;
+ for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size];
+ return tx_size;
+}
+
+static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
+ switch (tx_size) {
+ case TX_64X64:
+ case TX_64X32:
+ case TX_32X64: return TX_32X32;
+ case TX_64X16: return TX_32X16;
+ case TX_16X64: return TX_16X32;
+ default: return tx_size;
+ }
+}
+
+static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x,
+ int subsampling_y) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize];
+ return av1_get_adjusted_tx_size(uv_tx);
+}
+
+static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ if (xd->lossless[mbmi->segment_id]) return TX_4X4;
+ if (plane == 0) return mbmi->tx_size;
+ const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+ return av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y);
+}
+
+void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, const int num_planes);
+
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes);
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes);
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+ int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int has_eob, int aoff, int loff);
+
+#define MAX_INTERINTRA_SB_SQUARE (32 * 32)
+static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
+ return (mbmi->ref_frame[0] > INTRA_FRAME &&
+ mbmi->ref_frame[1] == INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+ return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
+}
+
+static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
+ return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
+}
+
+static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
+ return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
+ return is_interintra_allowed_bsize(mbmi->sb_type) &&
+ is_interintra_allowed_mode(mbmi->mode) &&
+ is_interintra_allowed_ref(mbmi->ref_frame);
+}
+
+static INLINE int is_interintra_allowed_bsize_group(int group) {
+ int i;
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ if (size_group_lookup[i] == group &&
+ is_interintra_allowed_bsize((BLOCK_SIZE)i)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
+ return mbmi->ref_frame[0] > INTRA_FRAME &&
+ mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi);
+}
+
+static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
+ if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+ const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
+ if (plane == 0) return max_txsize; // luma
+ return av1_get_adjusted_tx_size(max_txsize); // chroma
+}
+
+static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+ return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
+}
+
+static INLINE int is_motion_variation_allowed_compound(
+ const MB_MODE_INFO *mbmi) {
+ if (!has_second_ref(mbmi))
+ return 1;
+ else
+ return 0;
+}
+
+// input: log2 of length, 0(4), 1(8), ...
+static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 };
+
+static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
+ return !(mbmi->overlappable_neighbors[0] == 0 &&
+ mbmi->overlappable_neighbors[1] == 0);
+}
+
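+// Returns the most complex motion mode allowed for this block. Roughly:
+// global-MV blocks fall back to SIMPLE_TRANSLATION; otherwise, when overlapped
+// prediction is possible (block >= 8x8, single-reference inter with at least
+// one overlappable neighbor), WARPED_CAUSAL is allowed if warped motion is
+// enabled, at least one projection sample exists, the reference is unscaled
+// and integer MVs are not forced, else OBMC_CAUSAL; in all remaining cases
+// SIMPLE_TRANSLATION.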
+static INLINE MOTION_MODE
+motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi, int allow_warped_motion) {
+ if (xd->cur_frame_force_integer_mv == 0) {
+ const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
+ if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
+ }
+ if (is_motion_variation_allowed_bsize(mbmi->sb_type) &&
+ is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
+ is_motion_variation_allowed_compound(mbmi)) {
+ if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
+ assert(!has_second_ref(mbmi));
+ if (mbmi->num_proj_ref >= 1 &&
+ (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
+ if (xd->cur_frame_force_integer_mv) {
+ return OBMC_CAUSAL;
+ }
+ return WARPED_CAUSAL;
+ }
+ return OBMC_CAUSAL;
+ } else {
+ return SIMPLE_TRANSLATION;
+ }
+}
+
+static INLINE void assert_motion_mode_valid(MOTION_MODE mode,
+ const WarpedMotionParams *gm_params,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ int allow_warped_motion) {
+ const MOTION_MODE last_motion_mode_allowed =
+ motion_mode_allowed(gm_params, xd, mbmi, allow_warped_motion);
+
+ // Check that the input mode is not illegal
+ if (last_motion_mode_allowed < mode)
+ assert(0 && "Illegal motion mode selected");
+}
+
+static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
+ return (is_inter_block(mbmi));
+}
+
+static INLINE int av1_allow_palette(int allow_screen_content_tools,
+ BLOCK_SIZE sb_type) {
+ return allow_screen_content_tools && block_size_wide[sb_type] <= 64 &&
+ block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8;
+}
+
+// Returns sub-sampled dimensions of the given block.
+// The output values for 'rows_within_bounds' and 'cols_within_bounds' will
+// differ from 'height' and 'width' when part of the block is outside the right
+// and/or bottom image boundary.
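+// For example, a 16x16 luma block whose bottom 8 rows lie below the frame has
+// mb_to_bottom_edge == -64 (1/8-pel units), so block_rows = (-64 >> 3) + 16 =
+// 8, giving *height = 16 but *rows_within_bounds = 8.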
+static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane,
+ const MACROBLOCKD *xd, int *width,
+ int *height,
+ int *rows_within_bounds,
+ int *cols_within_bounds) {
+ const int block_height = block_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ const int block_rows = (xd->mb_to_bottom_edge >= 0)
+ ? block_height
+ : (xd->mb_to_bottom_edge >> 3) + block_height;
+ const int block_cols = (xd->mb_to_right_edge >= 0)
+ ? block_width
+ : (xd->mb_to_right_edge >> 3) + block_width;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0));
+ assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
+ assert(block_width >= block_cols);
+ assert(block_height >= block_rows);
+ const int plane_block_width = block_width >> pd->subsampling_x;
+ const int plane_block_height = block_height >> pd->subsampling_y;
+ // Special handling for chroma sub8x8.
+ const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
+ const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
+ if (width) *width = plane_block_width + 2 * is_chroma_sub8_x;
+ if (height) *height = plane_block_height + 2 * is_chroma_sub8_y;
+ if (rows_within_bounds) {
+ *rows_within_bounds =
+ (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
+ }
+ if (cols_within_bounds) {
+ *cols_within_bounds =
+ (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
+ }
+}
+
+/* clang-format off */
+typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+typedef const int (*ColorCost)[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+/* clang-format on */
+
+typedef struct {
+ int rows;
+ int cols;
+ int n_colors;
+ int plane_width;
+ int plane_height;
+ uint8_t *color_map;
+ MapCdf map_cdf;
+ ColorCost color_cost;
+} Av1ColorMapParam;
+
+static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ int ref;
+
+ // First check if all modes are GLOBALMV
+ if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0;
+
+ if (AOMMIN(mi_size_wide[mbmi->sb_type], mi_size_high[mbmi->sb_type]) < 2)
+ return 0;
+
+ // Now check if all global motion is non-translational
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0;
+ }
+ return 1;
+}
+
+static INLINE PLANE_TYPE get_plane_type(int plane) {
+ return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+}
+
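+// 64-point transforms only code the lower-frequency 32 coefficients in each
+// 64-sample dimension, so TX_64X64 / TX_64X32 / TX_32X64 carry at most
+// 32 * 32 = 1024 coefficients and TX_16X64 / TX_64X16 at most 16 * 32 = 512;
+// all other sizes use their full area.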
+static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
+ if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
+ return 1024;
+ }
+ if (tx_size == TX_16X64 || tx_size == TX_64X16) {
+ return 512;
+ }
+ return tx_size_2d[tx_size];
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_BLOCKD_H_
diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c
new file mode 100644
index 000000000..e9e2b0e42
--- /dev/null
+++ b/third_party/aom/av1/common/cdef.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/cdef.h"
+#include "av1/common/cdef_block.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+
+int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) {
+ int maxc, maxr;
+ int skip = 1;
+ maxc = cm->mi_cols - mi_col;
+ maxr = cm->mi_rows - mi_row;
+
+ maxr = AOMMIN(maxr, MI_SIZE_64X64);
+ maxc = AOMMIN(maxc, MI_SIZE_64X64);
+
+ for (int r = 0; r < maxr; r++) {
+ for (int c = 0; c < maxc; c++) {
+ skip =
+ skip &&
+ cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]->skip;
+ }
+ }
+ return skip;
+}
+
+static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
+ int mi_stride) {
+ int is_skip = 1;
+ for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r)
+ for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c)
+ is_skip &= grid[(mi_row + r) * mi_stride + (mi_col + c)]->skip;
+
+ return is_skip;
+}
+
+int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
+ cdef_list *dlist, BLOCK_SIZE bs) {
+ MB_MODE_INFO **grid = cm->mi_grid_visible;
+ int maxc = cm->mi_cols - mi_col;
+ int maxr = cm->mi_rows - mi_row;
+
+ if (bs == BLOCK_128X128 || bs == BLOCK_128X64)
+ maxc = AOMMIN(maxc, MI_SIZE_128X128);
+ else
+ maxc = AOMMIN(maxc, MI_SIZE_64X64);
+ if (bs == BLOCK_128X128 || bs == BLOCK_64X128)
+ maxr = AOMMIN(maxr, MI_SIZE_128X128);
+ else
+ maxr = AOMMIN(maxr, MI_SIZE_64X64);
+
+ const int r_step = mi_size_high[BLOCK_8X8];
+ const int c_step = mi_size_wide[BLOCK_8X8];
+ const int r_shift = (r_step == 2);
+ const int c_shift = (c_step == 2);
+
+ assert(r_step == 1 || r_step == 2);
+ assert(c_step == 1 || c_step == 2);
+
+ int count = 0;
+
+ for (int r = 0; r < maxr; r += r_step) {
+ for (int c = 0; c < maxc; c += c_step) {
+ if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) {
+ dlist[count].by = r >> r_shift;
+ dlist[count].bx = c >> c_shift;
+ dlist[count].skip = 0;
+ count++;
+ }
+ }
+ }
+ return count;
+}
+
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
+ int sstride, int v, int h) {
+ for (int i = 0; i < v; i++) {
+ for (int j = 0; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride, int v,
+ int h) {
+ for (int i = 0; i < v; i++) {
+ for (int j = 0; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+static void copy_sb8_16(AOM_UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
+ const uint8_t *src, int src_voffset, int src_hoffset,
+ int sstride, int vsize, int hsize) {
+ if (cm->seq_params.use_highbitdepth) {
+ const uint16_t *base =
+ &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
+ copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+ } else {
+ const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
+ copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+ }
+}
+
+static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
+ uint16_t x) {
+ for (int i = 0; i < v; i++) {
+ for (int j = 0; j < h; j++) {
+ dst[i * dstride + j] = x;
+ }
+ }
+}
+
+static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src,
+ int sstride, int v, int h) {
+ for (int i = 0; i < v; i++) {
+ for (int j = 0; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ MACROBLOCKD *xd) {
+ const int num_planes = av1_num_planes(cm);
+ DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
+ uint16_t *linebuf[3];
+ uint16_t *colbuf[3];
+ cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
+ unsigned char *row_cdef, *prev_row_cdef, *curr_row_cdef;
+ int cdef_count;
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ int mi_wide_l2[3];
+ int mi_high_l2[3];
+ int xdec[3];
+ int ydec[3];
+ int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
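+ // Number of 64x64 filter-block rows (nvfb) and columns (nhfb) in the frame.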
+ const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+ num_planes);
+ row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
+ memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
+ prev_row_cdef = row_cdef + 1;
+ curr_row_cdef = prev_row_cdef + nhfb + 2;
+ for (int pli = 0; pli < num_planes; pli++) {
+ xdec[pli] = xd->plane[pli].subsampling_x;
+ ydec[pli] = xd->plane[pli].subsampling_y;
+ mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+ mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+ }
+ const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
+ for (int pli = 0; pli < num_planes; pli++) {
+ linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
+ colbuf[pli] =
+ aom_malloc(sizeof(*colbuf) *
+ ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) *
+ CDEF_HBORDER);
+ }
+ for (int fbr = 0; fbr < nvfb; fbr++) {
+ for (int pli = 0; pli < num_planes; pli++) {
+ const int block_height =
+ (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER;
+ fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ int cdef_left = 1;
+ for (int fbc = 0; fbc < nhfb; fbc++) {
+ int level, sec_strength;
+ int uv_level, uv_sec_strength;
+ int nhb, nvb;
+ int cstart = 0;
+ curr_row_cdef[fbc] = 0;
+ if (cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+ MI_SIZE_64X64 * fbc] == NULL ||
+ cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+ MI_SIZE_64X64 * fbc]
+ ->cdef_strength == -1) {
+ cdef_left = 0;
+ continue;
+ }
+ if (!cdef_left) cstart = -CDEF_HBORDER;
+ nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
+ nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
+ int frame_top, frame_left, frame_bottom, frame_right;
+
+ int mi_row = MI_SIZE_64X64 * fbr;
+ int mi_col = MI_SIZE_64X64 * fbc;
+ // For the current filter block, its top-left corner mi structure (mi_tl)
+ // is first accessed to check whether the top and left boundaries are
+ // frame boundaries. Then the bottom-left and top-right mi structures are
+ // accessed to check whether the bottom and right boundaries
+ // (respectively) are frame boundaries.
+ //
+ // Note that we can't just check the bottom-right mi structure - e.g. if
+ // we're at the right-hand edge of the frame but not the bottom, then
+ // the bottom-right mi is NULL but the bottom-left is not.
+ frame_top = (mi_row == 0) ? 1 : 0;
+ frame_left = (mi_col == 0) ? 1 : 0;
+
+ if (fbr != nvfb - 1)
+ frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0;
+ else
+ frame_bottom = 1;
+
+ if (fbc != nhfb - 1)
+ frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0;
+ else
+ frame_right = 1;
+
+ const int mbmi_cdef_strength =
+ cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+ MI_SIZE_64X64 * fbc]
+ ->cdef_strength;
+ level = cm->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+ sec_strength =
+ cm->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+ sec_strength += sec_strength == 3;
+ uv_level = cm->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+ uv_sec_strength =
+ cm->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+ uv_sec_strength += uv_sec_strength == 3;
+ if ((level == 0 && sec_strength == 0 && uv_level == 0 &&
+ uv_sec_strength == 0) ||
+ (cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
+ fbc * MI_SIZE_64X64, dlist,
+ BLOCK_64X64)) == 0) {
+ cdef_left = 0;
+ continue;
+ }
+
+ curr_row_cdef[fbc] = 1;
+ for (int pli = 0; pli < num_planes; pli++) {
+ int coffset;
+ int rend, cend;
+ int pri_damping = cm->cdef_pri_damping;
+ int sec_damping = cm->cdef_sec_damping;
+ int hsize = nhb << mi_wide_l2[pli];
+ int vsize = nvb << mi_high_l2[pli];
+
+ if (pli) {
+ level = uv_level;
+ sec_strength = uv_sec_strength;
+ }
+
+ if (fbc == nhfb - 1)
+ cend = hsize;
+ else
+ cend = hsize + CDEF_HBORDER;
+
+ if (fbr == nvfb - 1)
+ rend = vsize;
+ else
+ rend = vsize + CDEF_VBORDER;
+
+ coffset = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
+ if (fbc == nhfb - 1) {
+ /* On the last superblock column, fill in the right border with
+ CDEF_VERY_LARGE to avoid filtering with the outside. */
+ fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE,
+ rend + CDEF_VBORDER, hsize + CDEF_HBORDER - cend,
+ CDEF_VERY_LARGE);
+ }
+ if (fbr == nvfb - 1) {
+ /* On the last superblock row, fill in the bottom border with
+ CDEF_VERY_LARGE to avoid filtering with the outside. */
+ fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
+ CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ /* Copy in the pixels we need from the current superblock for
+ deringing.*/
+ copy_sb8_16(cm,
+ &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
+ CDEF_BSTRIDE, xd->plane[pli].dst.buf,
+ (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr, coffset + cstart,
+ xd->plane[pli].dst.stride, rend, cend - cstart);
+ if (!prev_row_cdef[fbc]) {
+ copy_sb8_16(cm, &src[CDEF_HBORDER], CDEF_BSTRIDE,
+ xd->plane[pli].dst.buf,
+ (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
+ coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize);
+ } else if (fbr > 0) {
+ copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &linebuf[pli][coffset],
+ stride, CDEF_VBORDER, hsize);
+ } else {
+ fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
+ CDEF_VERY_LARGE);
+ }
+ if (!prev_row_cdef[fbc - 1]) {
+ copy_sb8_16(cm, src, CDEF_BSTRIDE, xd->plane[pli].dst.buf,
+ (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
+ coffset - CDEF_HBORDER, xd->plane[pli].dst.stride,
+ CDEF_VBORDER, CDEF_HBORDER);
+ } else if (fbr > 0 && fbc > 0) {
+ copy_rect(src, CDEF_BSTRIDE, &linebuf[pli][coffset - CDEF_HBORDER],
+ stride, CDEF_VBORDER, CDEF_HBORDER);
+ } else {
+ fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (!prev_row_cdef[fbc + 1]) {
+ copy_sb8_16(cm, &src[CDEF_HBORDER + (nhb << mi_wide_l2[pli])],
+ CDEF_BSTRIDE, xd->plane[pli].dst.buf,
+ (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
+ coffset + hsize, xd->plane[pli].dst.stride, CDEF_VBORDER,
+ CDEF_HBORDER);
+ } else if (fbr > 0 && fbc < nhfb - 1) {
+ copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+ &linebuf[pli][coffset + hsize], stride, CDEF_VBORDER,
+ CDEF_HBORDER);
+ } else {
+ fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
+ CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ if (cdef_left) {
+ /* If we deringed the superblock on the left then we need to copy in
+ saved pixels. */
+ copy_rect(src, CDEF_BSTRIDE, colbuf[pli], CDEF_HBORDER,
+ rend + CDEF_VBORDER, CDEF_HBORDER);
+ }
+ /* Saving pixels in case we need to dering the superblock on the
+ right. */
+ copy_rect(colbuf[pli], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
+ rend + CDEF_VBORDER, CDEF_HBORDER);
+ copy_sb8_16(
+ cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf,
+ (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER,
+ coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize);
+
+ if (frame_top) {
+ fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (frame_left) {
+ fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
+ CDEF_VERY_LARGE);
+ }
+ if (frame_bottom) {
+ fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
+ CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+ if (frame_right) {
+ fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+ vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+ }
+
+ if (cm->seq_params.use_highbitdepth) {
+ cdef_filter_fb(
+ NULL,
+ &CONVERT_TO_SHORTPTR(
+ xd->plane[pli]
+ .dst.buf)[xd->plane[pli].dst.stride *
+ (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
+ (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+ xd->plane[pli].dst.stride,
+ &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
+ ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+ sec_strength, pri_damping, sec_damping, coeff_shift);
+ } else {
+ cdef_filter_fb(
+ &xd->plane[pli]
+ .dst.buf[xd->plane[pli].dst.stride *
+ (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
+ (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+ NULL, xd->plane[pli].dst.stride,
+ &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
+ ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+ sec_strength, pri_damping, sec_damping, coeff_shift);
+ }
+ }
+ cdef_left = 1;
+ }
+ {
+ unsigned char *tmp = prev_row_cdef;
+ prev_row_cdef = curr_row_cdef;
+ curr_row_cdef = tmp;
+ }
+ }
+ aom_free(row_cdef);
+ for (int pli = 0; pli < num_planes; pli++) {
+ aom_free(linebuf[pli]);
+ aom_free(colbuf[pli]);
+ }
+}
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
new file mode 100644
index 000000000..3b2eac8a5
--- /dev/null
+++ b/third_party/aom/av1/common/cdef.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_CDEF_H_
+#define AOM_AV1_COMMON_CDEF_H_
+
+#define CDEF_STRENGTH_BITS 6
+
+#define CDEF_PRI_STRENGTHS 16
+#define CDEF_SEC_STRENGTHS 4
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/cdef_block.h"
+#include "av1/common/onyxc_int.h"
+
+static INLINE int sign(int i) { return i < 0 ? -1 : 1; }
+
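+// Constrains a tap difference to sign(diff) * min(|diff|, max(0, threshold -
+// (|diff| >> shift))). For example, threshold = 4 and damping = 5 give
+// shift = 3, so diff = 20 is constrained to min(20, 4 - (20 >> 3)) = 2.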
+static INLINE int constrain(int diff, int threshold, int damping) {
+ if (!threshold) return 0;
+
+ const int shift = AOMMAX(0, damping - get_msb(threshold));
+ return sign(diff) *
+ AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift)));
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
+int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
+ cdef_list *dlist, BLOCK_SIZE bsize);
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd, int fast);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_COMMON_CDEF_H_
diff --git a/third_party/aom/av1/common/cdef_block.c b/third_party/aom/av1/common/cdef_block.c
new file mode 100644
index 000000000..df1de89be
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef.h"
+
+/* Generated from gen_filter_tables.c. */
+DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = {
+ { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 },
+ { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
+};
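+/* Entry k holds the two tap offsets (in CDEF_BSTRIDE units) along direction k;
+ e.g. direction 2 is horizontal (offsets +1 and +2 within the same row) and
+ direction 6 is vertical (offsets one and two rows down). */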
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+ The search minimizes the weighted variance along all the lines in a
+ particular direction, i.e. the squared error between the input and a
+ "predicted" block where each pixel is replaced by the average along a line
+ in a particular direction. Since each direction has the same sum(x^2) term,
+ that term is never computed. See Section 2, step 2, of:
+ http://jmvalin.ca/notes/intra_paint.pdf */
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8] = { 0 };
+ int partial[8][15] = { { 0 } };
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+ The output is then 840 times larger, but we don't care since we are only
+ looking for the max. */
+ static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 };
+ for (i = 0; i < 8; i++) {
+ int j;
+ for (j = 0; j < 8; j++) {
+ int x;
+ /* We subtract 128 here to reduce the maximum range of the squared
+ partial sums. */
+ x = (img[i * stride + j] >> coeff_shift) - 128;
+ partial[0][i + j] += x;
+ partial[1][i + j / 2] += x;
+ partial[2][i] += x;
+ partial[3][3 + i - j / 2] += x;
+ partial[4][7 + i - j] += x;
+ partial[5][3 - i / 2 + j] += x;
+ partial[6][j] += x;
+ partial[7][i / 2 + j] += x;
+ }
+ }
+ for (i = 0; i < 8; i++) {
+ cost[2] += partial[2][i] * partial[2][i];
+ cost[6] += partial[6][i] * partial[6][i];
+ }
+ cost[2] *= div_table[8];
+ cost[6] *= div_table[8];
+ for (i = 0; i < 7; i++) {
+ cost[0] += (partial[0][i] * partial[0][i] +
+ partial[0][14 - i] * partial[0][14 - i]) *
+ div_table[i + 1];
+ cost[4] += (partial[4][i] * partial[4][i] +
+ partial[4][14 - i] * partial[4][14 - i]) *
+ div_table[i + 1];
+ }
+ cost[0] += partial[0][7] * partial[0][7] * div_table[8];
+ cost[4] += partial[4][7] * partial[4][7] * div_table[8];
+ for (i = 1; i < 8; i += 2) {
+ int j;
+ for (j = 0; j < 4 + 1; j++) {
+ cost[i] += partial[i][3 + j] * partial[i][3 + j];
+ }
+ cost[i] *= div_table[8];
+ for (j = 0; j < 4 - 1; j++) {
+ cost[i] += (partial[i][j] * partial[i][j] +
+ partial[i][10 - j] * partial[i][10 - j]) *
+ div_table[2 * j + 2];
+ }
+ }
+ for (i = 0; i < 8; i++) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ best_dir = i;
+ }
+ }
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
+
+const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
+const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
+
+/* Smooth in the direction detected. */
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int pri_strength, int sec_strength,
+ int dir, int pri_damping, int sec_damping, int bsize,
+ AOM_UNUSED int max_unused, int coeff_shift) {
+ int i, j, k;
+ const int s = CDEF_BSTRIDE;
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+ for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) {
+ for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) {
+ int16_t sum = 0;
+ int16_t y;
+ int16_t x = in[i * s + j];
+ int max = x;
+ int min = x;
+ for (k = 0; k < 2; k++) {
+ int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
+ int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
+ sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
+ sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
+ if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
+ if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
+ min = AOMMIN(p0, min);
+ min = AOMMIN(p1, min);
+ int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]];
+ int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]];
+ int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]];
+ int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]];
+ if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
+ if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
+ if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
+ if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max);
+ min = AOMMIN(s0, min);
+ min = AOMMIN(s1, min);
+ min = AOMMIN(s2, min);
+ min = AOMMIN(s3, min);
+ sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
+ }
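+ /* Divide the accumulated correction by 16 with rounding to nearest (the
+ extra -1 for negative sums keeps the rounding symmetric) and clamp to the
+ min/max of the sampled taps so filtering never creates a new extreme. */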
+ y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
+ if (dst8)
+ dst8[i * dstride + j] = (uint8_t)y;
+ else
+ dst16[i * dstride + j] = (uint16_t)y;
+ }
+ }
+}
+
+/* Compute the primary filter strength for an 8x8 block based on the
+ directional variance difference. A high variance difference means
+ that we have a highly directional pattern (e.g. a high contrast
+ edge), so we can apply more deringing. A low variance means that we
+ either have a low contrast edge, or a non-directional texture, so
+ we want to be careful not to blur. */
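+/* For example, var = 640 gives i = AOMMIN(get_msb(640 >> 6), 12) = 3, so a
+ strength of 16 is scaled to (16 * (4 + 3) + 8) >> 4 = 7. */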
+static INLINE int adjust_strength(int strength, int32_t var) {
+ const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0;
+ /* We use the variance of 8x8 blocks to adjust the strength. */
+ return var ? (strength * (4 + i) + 8) >> 4 : 0;
+}
+
+void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
+ int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
+ int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int pri_damping, int sec_damping,
+ int coeff_shift) {
+ int bi;
+ int bx;
+ int by;
+ int bsize, bsizex, bsizey;
+
+ int pri_strength = level << coeff_shift;
+ sec_strength <<= coeff_shift;
+ sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
+ pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
+ bsize =
+ ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
+ bsizex = 3 - xdec;
+ bsizey = 3 - ydec;
+ if (dirinit && pri_strength == 0 && sec_strength == 0) {
+ // If we're here, both primary and secondary strengths are 0, and
+ // we still haven't written anything to y[] yet, so we just copy
+ // the input to y[]. This is necessary only for av1_cdef_search()
+ // and only av1_cdef_search() sets dirinit.
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ int iy, ix;
+ // TODO(stemidts/jmvalin): SIMD optimisations
+ for (iy = 0; iy < 1 << bsizey; iy++)
+ for (ix = 0; ix < 1 << bsizex; ix++)
+ dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
+ in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix];
+ }
+ return;
+ }
+
+ if (pli == 0) {
+ if (!dirinit || !*dirinit) {
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
+ CDEF_BSTRIDE, &var[by][bx], coeff_shift);
+ }
+ if (dirinit) *dirinit = 1;
+ }
+ }
+ if (pli == 1 && xdec != ydec) {
+ for (bi = 0; bi < cdef_count; bi++) {
+ static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
+ static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
+ }
+ }
+
+ for (bi = 0; bi < cdef_count; bi++) {
+ int t = dlist[bi].skip ? 0 : pri_strength;
+ int s = dlist[bi].skip ? 0 : sec_strength;
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ if (dst8)
+ cdef_filter_block(&dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL,
+ dstride,
+ &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+ (pli ? t : adjust_strength(t, var[by][bx])), s,
+ t ? dir[by][bx] : 0, pri_damping, sec_damping, bsize,
+ (256 << coeff_shift) - 1, coeff_shift);
+ else
+ cdef_filter_block(
+ NULL,
+ &dst16[dirinit ? bi << (bsizex + bsizey)
+ : (by << bsizey) * dstride + (bx << bsizex)],
+ dirinit ? 1 << bsizex : dstride,
+ &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+ (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
+ pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1,
+ coeff_shift);
+ }
+}
diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h
new file mode 100644
index 000000000..6b4452cd6
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_H_
+
+#include "av1/common/odintrin.h"
+
+#define CDEF_BLOCKSIZE 64
+#define CDEF_BLOCKSIZE_LOG2 6
+#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8)
+#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2)
+
+/* We need to buffer three vertical lines. */
+#define CDEF_VBORDER (3)
+/* We only need to buffer three horizontal pixels too, but let's align to
+ 16 bytes (8 x 16 bits) to make vectorization easier. */
+#define CDEF_HBORDER (8)
+#define CDEF_BSTRIDE \
+ ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
+
+#define CDEF_VERY_LARGE (30000)
+#define CDEF_INBUF_SIZE \
+ (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
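+/* Assuming 128x128 superblocks (MAX_SB_SIZE_LOG2 == 7), CDEF_BSTRIDE works out
+ to ALIGN_POWER_OF_TWO(128 + 16, 3) = 144 and CDEF_INBUF_SIZE to
+ 144 * (128 + 6) = 19296 16-bit samples. */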
+
+extern const int cdef_pri_taps[2][2];
+extern const int cdef_sec_taps[2][2];
+DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
+
+typedef struct {
+ uint8_t by;
+ uint8_t bx;
+ uint8_t skip;
+} cdef_list;
+
+typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
+ int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength,
+ int dir, int pri_damping,
+ int sec_damping, int bsize, int max,
+ int coeff_shift);
+void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count, int bsize);
+
+void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
+ int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
+ int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int pri_damping, int sec_damping,
+ int coeff_shift);
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_
diff --git a/third_party/aom/av1/common/cdef_block_avx2.c b/third_party/aom/av1/common/cdef_block_avx2.c
new file mode 100644
index 000000000..e2b85b3e2
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_avx2.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_avx2
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_neon.c b/third_party/aom/av1/common/cdef_block_neon.c
new file mode 100644
index 000000000..2d6bc65e3
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_neon.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_neon
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h
new file mode 100644
index 000000000..14587a023
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_simd.h
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef_block.h"
+
+/* partial A is a 16-bit vector of the form:
+ [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+ [0 y1 y2 y3 y4 y5 y6 y7].
+ This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+ (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+ and const2. */
+static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
+ v128 const2) {
+ v128 tmp;
+ /* Reverse partial B. */
+ partialb = v128_shuffle_8(
+ partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
+ /* Interleave the x and y values of identical indices and pair x8 with 0. */
+ tmp = partiala;
+ partiala = v128_ziplo_16(partialb, partiala);
+ partialb = v128_ziphi_16(partialb, tmp);
+ /* Square and add the corresponding x and y values. */
+ partiala = v128_madd_s16(partiala, partiala);
+ partialb = v128_madd_s16(partialb, partialb);
+ /* Multiply by constant. */
+ partiala = v128_mullo_s32(partiala, const1);
+ partialb = v128_mullo_s32(partialb, const2);
+ /* Sum all results. */
+ partiala = v128_add_32(partiala, partialb);
+ return partiala;
+}
+
+static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
+ v128 t0, t1, t2, t3;
+ t0 = v128_ziplo_32(x1, x0);
+ t1 = v128_ziplo_32(x3, x2);
+ t2 = v128_ziphi_32(x1, x0);
+ t3 = v128_ziphi_32(x3, x2);
+ x0 = v128_ziplo_64(t1, t0);
+ x1 = v128_ziphi_64(t1, t0);
+ x2 = v128_ziplo_64(t3, t2);
+ x3 = v128_ziphi_64(t3, t2);
+ return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
+}
+
+/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
+ to compute the remaining directions. */
+static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
+ v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+ v128 partial6;
+ v128 tmp;
+ /* Partial sums for lines 0 and 1. */
+ partial4a = v128_shl_n_byte(lines[0], 14);
+ partial4b = v128_shr_n_byte(lines[0], 2);
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
+ tmp = v128_add_16(lines[0], lines[1]);
+ partial5a = v128_shl_n_byte(tmp, 10);
+ partial5b = v128_shr_n_byte(tmp, 6);
+ partial7a = v128_shl_n_byte(tmp, 4);
+ partial7b = v128_shr_n_byte(tmp, 12);
+ partial6 = tmp;
+
+ /* Partial sums for lines 2 and 3. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
+ tmp = v128_add_16(lines[2], lines[3]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 4 and 5. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
+ tmp = v128_add_16(lines[4], lines[5]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 6 and 7. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
+ partial4a = v128_add_16(partial4a, lines[7]);
+ tmp = v128_add_16(lines[6], lines[7]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Compute costs in terms of partial sums. */
+ partial4a =
+ fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
+ v128_from_32(105, 120, 140, 168));
+ partial7a =
+ fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ partial5a =
+ fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ partial6 = v128_madd_s16(partial6, partial6);
+ partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
+
+ partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
+ v128_store_unaligned(tmp_cost1, partial4a);
+ return partial4a;
+}
+
+/* transpose and reverse the order of the lines -- equivalent to a 90-degree
+ counter-clockwise rotation of the pixels. */
+static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
+ const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
+ const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
+ const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
+ const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
+ const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
+ const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
+ const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
+ const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
+
+ const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
+ const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
+ const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
+ const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
+ const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
+ const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
+ const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
+ const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
+
+ res[7] = v128_ziplo_64(tr1_1, tr1_0);
+ res[6] = v128_ziphi_64(tr1_1, tr1_0);
+ res[5] = v128_ziplo_64(tr1_3, tr1_2);
+ res[4] = v128_ziphi_64(tr1_3, tr1_2);
+ res[3] = v128_ziplo_64(tr1_5, tr1_4);
+ res[2] = v128_ziphi_64(tr1_5, tr1_4);
+ res[1] = v128_ziplo_64(tr1_7, tr1_6);
+ res[0] = v128_ziphi_64(tr1_7, tr1_6);
+}
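+
+/* The three interleave stages above operate at 16-, 32- and 64-bit
+   granularity; together they transpose the 8x8 block of 16-bit pixels, and
+   writing the results into res[7] down to res[0] reverses the row order,
+   which is what turns the combined operation into the 90-degree rotation
+   described above. */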
+
+int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8];
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ v128 lines[8];
+ for (i = 0; i < 8; i++) {
+ lines[i] = v128_load_unaligned(&img[i * stride]);
+ lines[i] =
+ v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
+ }
+
+ /* Compute "mostly vertical" directions. */
+ v128 dir47 = compute_directions(lines, cost + 4);
+
+ array_reverse_transpose_8x8(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ v128 dir03 = compute_directions(lines, cost);
+
+ v128 max = v128_max_s32(dir03, dir47);
+ max = v128_max_s32(max, v128_align(max, max, 8));
+ max = v128_max_s32(max, v128_align(max, max, 4));
+ best_cost = v128_low_u32(max);
+ v128 t =
+ v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
+ best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
+ best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros
+
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
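+
+/* Worked example of the trailing-zero trick above: if the movemask is 0x28
+   (binary 00101000), then best_dir - 1 = 0x27 (00100111) and the xor gives
+   0x0f (00001111).  get_msb(0x0f) == 3, which is the number of trailing
+   zeros of 0x28, i.e. the index of the first direction whose cost equals
+   the maximum. */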
+
+// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
+SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
+ unsigned int adjdamp) {
+ v256 diff = v256_sub_16(a, b);
+ const v256 sign = v256_shr_n_s16(diff, 15);
+ diff = v256_abs_s16(diff);
+ const v256 s =
+ v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
+ return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
+}
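+
+/* A scalar sketch of the same constraint (hypothetical helper, shown here
+   for reference only, not part of libaom):
+
+     static int constrain16_scalar(int a, int b, int threshold, int adjdamp) {
+       const int diff = a - b;
+       const int adiff = diff < 0 ? -diff : diff;
+       int s = threshold - (adiff >> adjdamp);
+       if (s < 0) s = 0;                     // max(0, ...)
+       const int m = adiff < s ? adiff : s;  // min(abs(a - b), s)
+       return diff < 0 ? -m : m;             // apply sign(a - b)
+     }
+
+   The SIMD version above computes the same value branch-free: the saturating
+   unsigned subtract implements max(0, ...), and the final add/xor with the
+   sign mask negates the result in lanes where a < b. */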
+
+// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
+SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
+ unsigned int adjdamp) {
+ const v256 diff16 = v256_sub_16(a, b);
+ v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
+ const v128 sign = v128_cmplt_s8(diff, v128_zero());
+ diff = v128_abs_s8(diff);
+ return v128_xor(
+ v128_add_8(sign,
+ v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
+ v128_shr_u8(diff, adjdamp)))),
+ sign);
+}
+
+void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ AOM_UNUSED int max_unused,
+ int coeff_shift) {
+ v128 p0, p1, p2, p3;
+ v256 sum, row, tap, res;
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+
+ if (pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+
+ sum = v256_zero();
+ row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
+ max = min = row;
+
+ if (pri_strength) {
+ // Primary near taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+ // Primary far taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+ }
+
+ if (sec_strength) {
+ // Secondary near taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // Secondary far taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+ }
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ res = v256_min_s16(v256_max_s16(res, min), max);
+ res = v256_pack_s16_u8(res, res);
+
+ p0 = v256_low_v128(res);
+ u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0)));
+ u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0)));
+ u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0)));
+ u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
+}
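+
+/* Note on the rounding used in these filter kernels: v256_cmplt_s16() is -1
+   in lanes where sum < 0, so adding it computes sum - (sum < 0).  Combined
+   with the +8 bias and the arithmetic shift by 4, this rounds sum/16 to the
+   nearest integer symmetrically around zero (e.g. +8 -> +1 and -8 -> -1),
+   rather than with the towards-plus-infinity bias a plain (sum + 8) >> 4
+   would give. */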
+
+void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ AOM_UNUSED int max_unused,
+ int coeff_shift) {
+ int i;
+ v128 p0, p1, p2, p3;
+ v256 sum, row, res, tap;
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+
+ if (pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+ for (i = 0; i < 8; i += 2) {
+ sum = v256_zero();
+ row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+
+ max = min = row;
+ // Primary near taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+ // Primary far taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+ // Secondary near taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // Secondary far taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ res = v256_min_s16(v256_max_s16(res, min), max);
+ res = v256_pack_s16_u8(res, res);
+
+ p0 = v256_low_v128(res);
+ v64_store_aligned(&dst[i * dstride], v128_high_v64(p0));
+ v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0));
+ }
+}
+
+void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ AOM_UNUSED int max_unused,
+ int coeff_shift) {
+ int i;
+ v256 p0, p1, p2, p3, sum, row, res;
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+
+ if (pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+ for (i = 0; i < 4; i += 4) {
+ sum = v256_zero();
+ row = v256_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
+ min = max = row;
+
+ // Primary near taps
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
+ p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ min = v256_min_s16(v256_min_s16(min, p0), p1);
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
+
+ // Primary far taps
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
+ p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ min = v256_min_s16(v256_min_s16(min, p0), p1);
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
+
+ // Secondary near taps
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
+ p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
+ p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
+ p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+ v256_andn(p3, v256_cmpeq_16(p3, large)));
+ min = v256_min_s16(
+ v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // Secondary far taps
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
+ p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
+ p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
+ p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+ v256_andn(p3, v256_cmpeq_16(p3, large)));
+ min = v256_min_s16(
+ v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ res = v256_min_s16(v256_max_s16(res, min), max);
+
+ v64_store_aligned(&dst[i * dstride], v128_high_v64(v256_high_v128(res)));
+ v64_store_aligned(&dst[(i + 1) * dstride],
+ v128_low_v64(v256_high_v128(res)));
+ v64_store_aligned(&dst[(i + 2) * dstride],
+ v128_high_v64(v256_low_v128(res)));
+ v64_store_aligned(&dst[(i + 3) * dstride],
+ v128_low_v64(v256_low_v128(res)));
+ }
+}
+
+void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ AOM_UNUSED int max_unused,
+ int coeff_shift) {
+ int i;
+ v256 sum, p0, p1, p2, p3, row, res;
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+
+ if (pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+
+ for (i = 0; i < 8; i += 2) {
+ sum = v256_zero();
+ row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+
+ min = max = row;
+ // Primary near taps
+ p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+ p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ min = v256_min_s16(v256_min_s16(min, p0), p1);
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
+
+ // Primary far taps
+ p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+ p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ min = v256_min_s16(v256_min_s16(min, p0), p1);
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
+
+ // Secondary near taps
+ p0 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+ p1 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+ p2 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+ p3 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+ v256_andn(p3, v256_cmpeq_16(p3, large)));
+ min = v256_min_s16(
+ v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // Secondary far taps
+ p0 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+ p1 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+ p2 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+ p3 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+ v256_andn(p3, v256_cmpeq_16(p3, large)));
+ min = v256_min_s16(
+ v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ res = v256_min_s16(v256_max_s16(res, min), max);
+ v128_store_unaligned(&dst[i * dstride], v256_high_v128(res));
+ v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res));
+ }
+}
+
+void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir, int pri_damping,
+ int sec_damping, int bsize, int max,
+ int coeff_shift) {
+ if (dst8) {
+ if (bsize == BLOCK_8X8) {
+ SIMD_FUNC(cdef_filter_block_8x8_8)
+ (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ } else if (bsize == BLOCK_4X8) {
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
+ } else if (bsize == BLOCK_8X4) {
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ } else {
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ }
+ } else {
+ if (bsize == BLOCK_8X8) {
+ SIMD_FUNC(cdef_filter_block_8x8_16)
+ (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ } else if (bsize == BLOCK_4X8) {
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
+ } else if (bsize == BLOCK_8X4) {
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ } else {
+ assert(bsize == BLOCK_4X4);
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ }
+ }
+}
+
+void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
+ const uint8_t *src, int sstride, int v,
+ int h) {
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < (h & ~0x7); j += 8) {
+ v64 row = v64_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
+ }
+ for (; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
+ const uint16_t *src, int sstride,
+ int v, int h) {
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < (h & ~0x7); j += 8) {
+ v128 row = v128_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], row);
+ }
+ for (; j < h; j++) {
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
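+/* This header is the size- and architecture-generic implementation.  It is
+   compiled once per target by the thin wrapper files below
+   (cdef_block_sse2.c, cdef_block_ssse3.c, cdef_block_sse4.c), each of which
+   defines SIMD_FUNC() to append its suffix before including this file, so
+   SIMD_FUNC(cdef_find_dir) expands to cdef_find_dir_sse2, cdef_find_dir_ssse3
+   and cdef_find_dir_sse4_1 respectively.  The v64/v128/v256 operations used
+   here are presumably mapped to the matching intrinsics by
+   aom_dsp/aom_simd.h, since each wrapper is built with that target's
+   compiler flags. */
+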
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
diff --git a/third_party/aom/av1/common/cdef_block_sse2.c b/third_party/aom/av1/common/cdef_block_sse2.c
new file mode 100644
index 000000000..73f115d17
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_sse2.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse2
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_sse4.c b/third_party/aom/av1/common/cdef_block_sse4.c
new file mode 100644
index 000000000..349329af6
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_sse4.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse4_1
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_ssse3.c b/third_party/aom/av1/common/cdef_block_ssse3.c
new file mode 100644
index 000000000..3a93b150f
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_ssse3.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c
new file mode 100644
index 000000000..ccc59b4eb
--- /dev/null
+++ b/third_party/aom/av1/common/cfl.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/cfl.h"
+#include "av1/common/common_data.h"
+#include "av1/common/onyxc_int.h"
+
+#include "config/av1_rtcd.h"
+
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {
+ assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
+ assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);
+
+ memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));
+ memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
+ cfl->subsampling_x = seq_params->subsampling_x;
+ cfl->subsampling_y = seq_params->subsampling_y;
+ cfl->are_parameters_computed = 0;
+ cfl->store_y = 0;
+ // The DC_PRED cache is disabled by default and is only enabled in
+ // cfl_rd_pick_alpha
+ cfl->use_dc_pred_cache = 0;
+ cfl->dc_pred_is_cached[CFL_PRED_U] = 0;
+ cfl->dc_pred_is_cached[CFL_PRED_V] = 0;
+}
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+ CFL_PRED_TYPE pred_plane, int width) {
+ assert(pred_plane < CFL_PRED_PLANES);
+ assert(width <= CFL_BUF_LINE);
+
+ if (get_bitdepth_data_path_index(xd)) {
+ uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
+ memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
+ return;
+ }
+
+ memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width);
+}
+
+static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst,
+ int dst_stride, int width, int height) {
+ for (int j = 0; j < height; j++) {
+ memcpy(dst, dc_pred_cache, width);
+ dst += dst_stride;
+ }
+}
+
+static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst,
+ int dst_stride, int width, int height) {
+ const size_t num_bytes = width << 1;
+ for (int j = 0; j < height; j++) {
+ memcpy(dst, dc_pred_cache, num_bytes);
+ dst += dst_stride;
+ }
+}
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) {
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ assert(pred_plane < CFL_PRED_PLANES);
+ assert(width <= CFL_BUF_LINE);
+ assert(height <= CFL_BUF_LINE);
+ if (get_bitdepth_data_path_index(xd)) {
+ uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+ cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
+ width, height);
+ return;
+ }
+ cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride,
+ width, height);
+}
+
+// Due to frame boundary issues, it is possible that the total area covered by
+// chroma exceeds that of luma. When this happens, we fill the missing pixels by
+// repeating the last columns and/or rows.
+static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) {
+ const int diff_width = width - cfl->buf_width;
+ const int diff_height = height - cfl->buf_height;
+
+ if (diff_width > 0) {
+ const int min_height = height - diff_height;
+ uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width);
+ for (int j = 0; j < min_height; j++) {
+ const uint16_t last_pixel = recon_buf_q3[-1];
+ assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
+ for (int i = 0; i < diff_width; i++) {
+ recon_buf_q3[i] = last_pixel;
+ }
+ recon_buf_q3 += CFL_BUF_LINE;
+ }
+ cfl->buf_width = width;
+ }
+ if (diff_height > 0) {
+ uint16_t *recon_buf_q3 =
+ cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE);
+ for (int j = 0; j < diff_height; j++) {
+ const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE;
+ assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
+ for (int i = 0; i < width; i++) {
+ recon_buf_q3[i] = last_row_q3[i];
+ }
+ recon_buf_q3 += CFL_BUF_LINE;
+ }
+ cfl->buf_height = height;
+ }
+}
+
+static void subtract_average_c(const uint16_t *src, int16_t *dst, int width,
+ int height, int round_offset, int num_pel_log2) {
+ int sum = round_offset;
+ const uint16_t *recon = src;
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ sum += recon[i];
+ }
+ recon += CFL_BUF_LINE;
+ }
+ const int avg = sum >> num_pel_log2;
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ dst[i] = src[i] - avg;
+ }
+ src += CFL_BUF_LINE;
+ dst += CFL_BUF_LINE;
+ }
+}
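+
+/* The size-specific wrappers generated by CFL_SUB_AVG_FN pass
+   round_offset = N / 2 and num_pel_log2 = log2(N), where N is the pixel
+   count, so avg = (sum + N/2) >> log2(N) is the rounded mean of the Q3
+   reconstructed luma samples.  For a 4x4 block, for instance, the wrapper
+   uses round_offset = 8 and num_pel_log2 = 4 (see CFL_SUB_AVG_X in cfl.h). */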
+
+CFL_SUB_AVG_FN(c)
+
+static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,
+ CFL_PRED_TYPE pred_type) {
+ const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign)
+ : CFL_SIGN_V(joint_sign);
+ if (alpha_sign == CFL_SIGN_ZERO) return 0;
+ const int abs_alpha_q3 =
+ (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx);
+ return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1;
+}
+
+static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3, int width,
+ int height) {
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]);
+ }
+ dst += dst_stride;
+ ac_buf_q3 += CFL_BUF_LINE;
+ }
+}
+
+// Null function used for invalid tx_sizes
+void cfl_predict_lbd_null(const int16_t *ac_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3) {
+ (void)ac_buf_q3;
+ (void)dst;
+ (void)dst_stride;
+ (void)alpha_q3;
+ assert(0);
+}
+
+CFL_PREDICT_FN(c, lbd)
+
+void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,
+ int alpha_q3, int bit_depth, int width, int height) {
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ dst[i] = clip_pixel_highbd(
+ get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth);
+ }
+ dst += dst_stride;
+ ac_buf_q3 += CFL_BUF_LINE;
+ }
+}
+
+// Null function used for invalid tx_sizes
+void cfl_predict_hbd_null(const int16_t *ac_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd) {
+ (void)ac_buf_q3;
+ (void)dst;
+ (void)dst_stride;
+ (void)alpha_q3;
+ (void)bd;
+ assert(0);
+}
+
+CFL_PREDICT_FN(c, hbd)
+
+static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {
+ CFL_CTX *const cfl = &xd->cfl;
+  // Do not call cfl_compute_parameters multiple times on the same values.
+ assert(cfl->are_parameters_computed == 0);
+
+ cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]);
+ get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);
+ cfl->are_parameters_computed = 1;
+}
+
+void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, int plane) {
+ CFL_CTX *const cfl = &xd->cfl;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(is_cfl_allowed(xd));
+
+ if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size);
+
+ const int alpha_q3 =
+ cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
+ assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
+ CFL_BUF_SQUARE);
+ if (get_bitdepth_data_path_index(xd)) {
+ uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+ get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
+ xd->bd);
+ return;
+ }
+ get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
+}
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
+ uint16_t *output_q3) {
+ (void)input;
+ (void)input_stride;
+ (void)output_q3;
+ assert(0);
+}
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
+ uint16_t *output_q3) {
+ (void)input;
+ (void)input_stride;
+ (void)output_q3;
+ assert(0);
+}
+
+static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input,
+ int input_stride,
+ uint16_t *output_q3, int width,
+ int height) {
+ for (int j = 0; j < height; j += 2) {
+ for (int i = 0; i < width; i += 2) {
+ const int bot = i + input_stride;
+ output_q3[i >> 1] =
+ (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;
+ }
+ input += input_stride << 1;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
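+
+/* The Q3 scaling falls out of the averaging: for 4:2:0 the mean of four luma
+   samples in Q3 is (a + b + c + d) * 8 / 4 = (a + b + c + d) << 1; for 4:2:2
+   (below) the mean of two samples is (a + b) * 8 / 2 = (a + b) << 2; and for
+   4:4:4 a single sample is simply shifted left by 3. */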
+
+static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input,
+ int input_stride,
+ uint16_t *output_q3, int width,
+ int height) {
+ assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i += 2) {
+ output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;
+ }
+ input += input_stride;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input,
+ int input_stride,
+ uint16_t *output_q3, int width,
+ int height) {
+ assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ output_q3[i] = input[i] << 3;
+ }
+ input += input_stride;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input,
+ int input_stride,
+ uint16_t *output_q3, int width,
+ int height) {
+ for (int j = 0; j < height; j += 2) {
+ for (int i = 0; i < width; i += 2) {
+ const int bot = i + input_stride;
+ output_q3[i >> 1] =
+ (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;
+ }
+ input += input_stride << 1;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input,
+ int input_stride,
+ uint16_t *output_q3, int width,
+ int height) {
+ assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i += 2) {
+ output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;
+ }
+ input += input_stride;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input,
+ int input_stride,
+ uint16_t *output_q3, int width,
+ int height) {
+ assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ output_q3[i] = input[i] << 3;
+ }
+ input += input_stride;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(c)
+
+static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
+ int sub_x, int sub_y) {
+ if (sub_x == 1) {
+ if (sub_y == 1) {
+ return cfl_get_luma_subsampling_420_hbd(tx_size);
+ }
+ return cfl_get_luma_subsampling_422_hbd(tx_size);
+ }
+ return cfl_get_luma_subsampling_444_hbd(tx_size);
+}
+
+static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
+ int sub_x, int sub_y) {
+ if (sub_x == 1) {
+ if (sub_y == 1) {
+ return cfl_get_luma_subsampling_420_lbd(tx_size);
+ }
+ return cfl_get_luma_subsampling_422_lbd(tx_size);
+ }
+ return cfl_get_luma_subsampling_444_lbd(tx_size);
+}
+
+static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
+ int row, int col, TX_SIZE tx_size, int use_hbd) {
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const int tx_off_log2 = tx_size_wide_log2[0];
+ const int sub_x = cfl->subsampling_x;
+ const int sub_y = cfl->subsampling_y;
+ const int store_row = row << (tx_off_log2 - sub_y);
+ const int store_col = col << (tx_off_log2 - sub_x);
+ const int store_height = height >> sub_y;
+ const int store_width = width >> sub_x;
+
+ // Invalidate current parameters
+ cfl->are_parameters_computed = 0;
+
+  // Store the surface of the pixel buffer that was written to; this way we
+  // can manage chroma overrun (e.g. when the chroma surface goes beyond the
+  // frame boundary).
+ if (col == 0 && row == 0) {
+ cfl->buf_width = store_width;
+ cfl->buf_height = store_height;
+ } else {
+ cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width);
+ cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height);
+ }
+
+ // Check that we will remain inside the pixel buffer.
+ assert(store_row + store_height <= CFL_BUF_LINE);
+ assert(store_col + store_width <= CFL_BUF_LINE);
+
+ // Store the input into the CfL pixel buffer
+ uint16_t *recon_buf_q3 =
+ cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
+
+ if (use_hbd) {
+ cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
+ input_stride, recon_buf_q3);
+ } else {
+ cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
+ recon_buf_q3);
+ }
+}
+
+// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
+// and non-chroma-referenced blocks are stored together in the CfL buffer.
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out,
+ int *col_out) {
+ // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
+ if ((cfl->mi_row & 0x01) && cfl->subsampling_y) {
+ assert(*row_out == 0);
+ (*row_out)++;
+ }
+
+ // Increment col index for right: 4x8, 4x16 or both right 4x4s.
+ if ((cfl->mi_col & 0x01) && cfl->subsampling_x) {
+ assert(*col_out == 0);
+ (*col_out)++;
+ }
+}
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+ BLOCK_SIZE bsize) {
+ CFL_CTX *const cfl = &xd->cfl;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ uint8_t *dst =
+ &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+
+ if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+ // Only dimensions of size 4 can have an odd offset.
+ assert(!((col & 1) && tx_size_wide[tx_size] != 4));
+ assert(!((row & 1) && tx_size_high[tx_size] != 4));
+ sub8x8_adjust_offset(cfl, &row, &col);
+ }
+ cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size,
+ get_bitdepth_data_path_index(xd));
+}
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ CFL_CTX *const cfl = &xd->cfl;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ int row = 0;
+ int col = 0;
+
+ if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+ sub8x8_adjust_offset(cfl, &row, &col);
+ }
+ const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
+ const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
+ tx_size = get_tx_size(width, height);
+ cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
+ get_bitdepth_data_path_index(xd));
+}
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
new file mode 100644
index 000000000..d627891bf
--- /dev/null
+++ b/third_party/aom/av1/common/cfl.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
+// Can we use CfL for the current block?
+static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ assert(bsize < BLOCK_SIZES_ALL);
+ if (xd->lossless[mbmi->segment_id]) {
+ // In lossless, CfL is available when the partition size is equal to the
+ // transform size.
+ const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+ const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+ const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+ return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
+ }
+  // Spec: CfL is available to luma partitions smaller than or equal to 32x32
+ return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
+ block_size_high[bsize] <= 32);
+}
+
+// Do we need to save the luma pixels from the current block,
+// for a possible future CfL prediction?
+static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+
+ if (cm->seq_params.monochrome) return CFL_DISALLOWED;
+
+ if (!xd->cfl.is_chroma_reference) {
+ // For non-chroma-reference blocks, we should always store the luma pixels,
+ // in case the corresponding chroma-reference block uses CfL.
+ // Note that this can only happen for block sizes which are <8 on
+ // their shortest side, as otherwise they would be chroma reference
+ // blocks.
+ return CFL_ALLOWED;
+ }
+
+ // If this block has chroma information, we know whether we're
+ // actually going to perform a CfL prediction
+ return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
+ mbmi->uv_mode == UV_CFL_PRED);
+}
+
+static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {
+ int scaled_luma_q6 = alpha_q3 * pred_buf_q3;
+ return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6);
+}
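+// alpha_q3 and pred_buf_q3 above are both in Q3, so their product is a Q6
+// value; rounding by 2^6 returns the scaled luma contribution in Q0.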
+
+static INLINE CFL_PRED_TYPE get_cfl_pred_type(PLANE_TYPE plane) {
+ assert(plane > 0);
+ return (CFL_PRED_TYPE)(plane - 1);
+}
+
+void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, int plane);
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+ BLOCK_SIZE bsize);
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+ CFL_PRED_TYPE pred_plane, int width);
+
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, CFL_PRED_TYPE pred_plane);
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth.
+#define CFL_lbd_TYPE uint8_t *cfl_type
+#define CFL_hbd_TYPE uint16_t *cfl_type
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size-generic function here; the advantage is that the size
+// becomes a compile-time constant, allowing for loop unrolling and other
+// constant-propagation optimizations.
+#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \
+ void subsample_##bd##_##sub##_##width##x##height##_##arch( \
+ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \
+ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \
+ output_q3, width, height); \
+ }
+
+// Declare size-specific wrappers for all valid CfL sizes.
+#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \
+ TX_SIZE tx_size) { \
+ CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
+ return subfn_##sub[tx_size]; \
+ }
+
+// Declare an architecture-specific array of function pointers for size-specific
+// wrappers.
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \
+ subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \
+ subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \
+ subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \
+ cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \
+ subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \
+ subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \
+ subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \
+ subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \
+ subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \
+ cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \
+ subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \
+ subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \
+ subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \
+ cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \
+ };
+
+// The RTCD script does not support passing in an array, so we wrap it in this
+// function.
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
+
+// Null function used for invalid tx_sizes
+static INLINE void cfl_subtract_average_null(const uint16_t *src,
+ int16_t *dst) {
+ (void)dst;
+ (void)src;
+ assert(0);
+}
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size-generic function here; the advantage is that the size
+// becomes a compile-time constant, allowing for loop unrolling and other
+// constant-propagation optimizations.
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
+ void subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+ int16_t *dst) { \
+ subtract_average_##arch(src, dst, width, height, round_offset, \
+ num_pel_log2); \
+ }
+
+// Declare size-specific wrappers for all valid CfL sizes.
+#define CFL_SUB_AVG_FN(arch) \
+ CFL_SUB_AVG_X(arch, 4, 4, 8, 4) \
+ CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \
+ CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \
+ CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \
+ CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \
+ CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \
+ CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \
+ CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \
+ CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \
+ CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \
+ CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \
+ CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \
+ CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \
+ CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \
+ cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \
+ subtract_average_4x4_##arch, /* 4x4 */ \
+ subtract_average_8x8_##arch, /* 8x8 */ \
+ subtract_average_16x16_##arch, /* 16x16 */ \
+ subtract_average_32x32_##arch, /* 32x32 */ \
+ cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ \
+ subtract_average_4x8_##arch, /* 4x8 */ \
+ subtract_average_8x4_##arch, /* 8x4 */ \
+ subtract_average_8x16_##arch, /* 8x16 */ \
+ subtract_average_16x8_##arch, /* 16x8 */ \
+ subtract_average_16x32_##arch, /* 16x32 */ \
+ subtract_average_32x16_##arch, /* 32x16 */ \
+ cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ \
+ cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ \
+      subtract_average_4x16_##arch, /* 4x16 */ \
+      subtract_average_16x4_##arch, /* 16x4 */ \
+      subtract_average_8x32_##arch, /* 8x32 */ \
+      subtract_average_32x8_##arch, /* 32x8 */ \
+ cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ \
+ cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ \
+ }; \
+ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+ /* index the function pointer array out of bounds. */ \
+ return sub_avg[tx_size % TX_SIZES_ALL]; \
+ }
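The (round_offset, num_pel_log2) pairs passed above follow one rule: num_pel_log2 is log2(width * height) and round_offset is 1 << (num_pel_log2 - 1), the usual half-step offset for a rounding right shift. A minimal standalone sketch of that arithmetic, assuming the size-generic subtract_average divides the pixel sum by the block area this way (names are illustrative):

#include <assert.h>
#include <stdint.h>

/* Rounding division of a pixel sum by the block area, matching the pattern
 * round_offset == 1 << (num_pel_log2 - 1) in the wrapper table above. */
static int block_mean(uint32_t sum, int round_offset, int num_pel_log2) {
  return (int)((sum + round_offset) >> num_pel_log2);
}

int main(void) {
  /* An 8x16 block has 128 pixels: num_pel_log2 = 7, round_offset = 64. */
  assert(block_mean(128 * 100, 64, 7) == 100);      /* exact mean */
  assert(block_mean(128 * 100 + 70, 64, 7) == 101); /* rounds to nearest */
  return 0;
}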
+
+// For VSX SIMD optimization, the C versions of the width == 4 subtract are
+// faster than the VSX ones, so the VSX code calls the C versions.
+void subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
+#define CFL_PREDICT_lbd(arch, width, height) \
+ void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \
+ uint8_t *dst, int dst_stride, \
+ int alpha_q3) { \
+ cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
+ height); \
+ }
+
+#define CFL_PREDICT_hbd(arch, width, height) \
+ void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \
+ uint16_t *dst, int dst_stride, \
+ int alpha_q3, int bd) { \
+ cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
+ height); \
+ }
+
+// This wrapper exists because clang-format does not like calling macros with
+// lowercase letters.
+#define CFL_PREDICT_X(arch, width, height, bd) \
+ CFL_PREDICT_##bd(arch, width, height)
+
+// Null function used for invalid tx_sizes
+void cfl_predict_lbd_null(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+// Null function used for invalid tx_sizes
+void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#define CFL_PREDICT_FN(arch, bd) \
+ CFL_PREDICT_X(arch, 4, 4, bd) \
+ CFL_PREDICT_X(arch, 4, 8, bd) \
+ CFL_PREDICT_X(arch, 4, 16, bd) \
+ CFL_PREDICT_X(arch, 8, 4, bd) \
+ CFL_PREDICT_X(arch, 8, 8, bd) \
+ CFL_PREDICT_X(arch, 8, 16, bd) \
+ CFL_PREDICT_X(arch, 8, 32, bd) \
+ CFL_PREDICT_X(arch, 16, 4, bd) \
+ CFL_PREDICT_X(arch, 16, 8, bd) \
+ CFL_PREDICT_X(arch, 16, 16, bd) \
+ CFL_PREDICT_X(arch, 16, 32, bd) \
+ CFL_PREDICT_X(arch, 32, 8, bd) \
+ CFL_PREDICT_X(arch, 32, 16, bd) \
+ CFL_PREDICT_X(arch, 32, 32, bd) \
+ cfl_predict_##bd##_fn get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \
+ static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \
+ predict_##bd##_4x4_##arch, /* 4x4 */ \
+ predict_##bd##_8x8_##arch, /* 8x8 */ \
+ predict_##bd##_16x16_##arch, /* 16x16 */ \
+ predict_##bd##_32x32_##arch, /* 32x32 */ \
+ cfl_predict_##bd##_null, /* 64x64 (invalid CFL size) */ \
+ predict_##bd##_4x8_##arch, /* 4x8 */ \
+ predict_##bd##_8x4_##arch, /* 8x4 */ \
+ predict_##bd##_8x16_##arch, /* 8x16 */ \
+ predict_##bd##_16x8_##arch, /* 16x8 */ \
+ predict_##bd##_16x32_##arch, /* 16x32 */ \
+ predict_##bd##_32x16_##arch, /* 32x16 */ \
+ cfl_predict_##bd##_null, /* 32x64 (invalid CFL size) */ \
+ cfl_predict_##bd##_null, /* 64x32 (invalid CFL size) */ \
+ predict_##bd##_4x16_##arch, /* 4x16 */ \
+ predict_##bd##_16x4_##arch, /* 16x4 */ \
+ predict_##bd##_8x32_##arch, /* 8x32 */ \
+ predict_##bd##_32x8_##arch, /* 32x8 */ \
+ cfl_predict_##bd##_null, /* 16x64 (invalid CFL size) */ \
+ cfl_predict_##bd##_null, /* 64x16 (invalid CFL size) */ \
+ }; \
+ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+ /* index the function pointer array out of bounds. */ \
+ return pred[tx_size % TX_SIZES_ALL]; \
+ }
+
+#endif // AOM_AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
new file mode 100644
index 000000000..bed6083db
--- /dev/null
+++ b/third_party/aom/av1/common/common.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/bitops.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PI 3.141592653589793238462643383279502884
+
+// Only needed for fixed-size arrays; for structs, just assign.
+#define av1_copy(dest, src) \
+ { \
+ assert(sizeof(dest) == sizeof(src)); \
+ memcpy(dest, src, sizeof(src)); \
+ }
+
+// Use this for variably-sized arrays.
+#define av1_copy_array(dest, src, n) \
+ { \
+ assert(sizeof(*(dest)) == sizeof(*(src))); \
+ memcpy(dest, src, n * sizeof(*(src))); \
+ }
+
+#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
+#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
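A short standalone illustration of the two copy helpers above; the macro bodies are reproduced verbatim from this header so the snippet compiles on its own, and the size assert is what separates the fixed-size case from the pointer-plus-length case.

#include <assert.h>
#include <string.h>

#define av1_copy(dest, src) \
  { \
    assert(sizeof(dest) == sizeof(src)); \
    memcpy(dest, src, sizeof(src)); \
  }

#define av1_copy_array(dest, src, n) \
  { \
    assert(sizeof(*(dest)) == sizeof(*(src))); \
    memcpy(dest, src, n * sizeof(*(src))); \
  }

int main(void) {
  int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, b[8] = { 0 };
  av1_copy(b, a);          /* both operands are real arrays: sizes must match */
  int c[4] = { 0 };
  int *p = c;
  av1_copy_array(p, a, 4); /* pointer destination: pass the element count */
  return (b[7] == 8 && p[3] == 4) ? 0 : 1;
}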
+
+static INLINE int get_unsigned_bits(unsigned int num_values) {
+ return num_values > 0 ? get_msb(num_values) + 1 : 0;
+}
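A couple of spot checks for the helper above, using a local stand-in for get_msb(), which is assumed (as in aom_ports/bitops.h) to return the index of the highest set bit:

#include <assert.h>

/* Local stand-in for get_msb(): index of the highest set bit of v (v > 0). */
static int msb_index(unsigned int v) {
  int n = -1;
  while (v) {
    v >>= 1;
    ++n;
  }
  return n;
}

static int unsigned_bits(unsigned int num_values) {
  return num_values > 0 ? msb_index(num_values) + 1 : 0;
}

int main(void) {
  assert(unsigned_bits(0) == 0);
  assert(unsigned_bits(1) == 1);   /* a single nonzero value needs one bit */
  assert(unsigned_bits(255) == 8); /* 255 fits in 8 bits */
  assert(unsigned_bits(256) == 9); /* 256 needs a 9th bit */
  return 0;
}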
+
+#define CHECK_MEM_ERROR(cm, lval, expr) \
+ AOM_CHECK_MEM_ERROR(&cm->error, lval, expr)
+
+#define AOM_FRAME_MARKER 0x2
+
+#define AV1_MIN_TILE_SIZE_BYTES 1
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_COMMON_H_
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
new file mode 100644
index 000000000..46e455fdb
--- /dev/null
+++ b/third_party/aom/av1/common/common_data.h
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_COMMON_DATA_H_
+#define AOM_AV1_COMMON_COMMON_DATA_H_
+
+#include "av1/common/enums.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Log 2 conversion lookup tables in units of mode info (4x4).
+// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables).
+static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4
+};
+// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables).
+static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = {
+ 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2
+};
+
+// Width/height lookup tables in units of mode info (4x4).
+// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables).
+static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
+ 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16
+};
+
+// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables).
+static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
+ 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4
+};
+
+// Width/height lookup tables in units of samples.
+// The Block_Width table in the spec (Section 9.3. Conversion tables).
+static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
+ 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32,
+ 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64
+};
+
+// The Block_Height table in the spec (Section 9.3. Conversion tables).
+static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
+ 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64,
+ 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16
+};
+
+// Maps a block size to a context.
+// The Size_Group table in the spec (Section 9.3. Conversion tables).
+// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize)))
+static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
+};
+
+static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = {
+ 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
+};
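The tables above are deliberately redundant: the pixel dimensions are the mode-info dimensions times four, and the pixel-count log2 is the sum of the two dimension log2s plus four. A small sanity-check sketch of those relationships, assuming this header is on the include path; it is not a standalone program and would be called from a test.

#include <assert.h>

#include "av1/common/common_data.h"

/* Cross-check the lookup tables above for every block size. */
static void check_block_size_tables(void) {
  for (int b = 0; b < BLOCK_SIZES_ALL; ++b) {
    assert(block_size_wide[b] == mi_size_wide[b] * 4);
    assert(block_size_high[b] == mi_size_high[b] * 4);
    assert(num_pels_log2_lookup[b] ==
           mi_size_wide_log2[b] + mi_size_high_log2[b] + 4);
  }
}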
+
+// A compressed version of the Partition_Subsize table in the spec (9.3.
+// Conversion tables), for square block sizes only.
+/* clang-format off */
+static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
+ { // PARTITION_NONE
+ BLOCK_4X4, BLOCK_8X8, BLOCK_16X16,
+ BLOCK_32X32, BLOCK_64X64, BLOCK_128X128
+ }, { // PARTITION_HORZ
+ BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_VERT
+ BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_SPLIT
+ BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8,
+ BLOCK_16X16, BLOCK_32X32, BLOCK_64X64
+ }, { // PARTITION_HORZ_A
+ BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_HORZ_B
+ BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_VERT_A
+ BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_VERT_B
+ BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_HORZ_4
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
+ BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID
+ }, { // PARTITION_VERT_4
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID
+ }
+};
+
+static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = {
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X4, TX_4X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X8, TX_8X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X16, TX_16X16, TX_32X32,
+ // 32X64, 64X32,
+ TX_32X32, TX_32X32,
+ // 64X64
+ TX_64X64,
+ // 64x128, 128x64, 128x128
+ TX_64X64, TX_64X64, TX_64X64,
+ // 4x16, 16x4, 8x32
+ TX_4X4, TX_4X4, TX_8X8,
+  // 32x8, 16x64, 64x16
+ TX_8X8, TX_16X16, TX_16X16
+};
+
+static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = {
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X8, TX_8X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X16, TX_16X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X32, TX_32X16, TX_32X32,
+ // 32X64, 64X32,
+ TX_32X64, TX_64X32,
+ // 64X64
+ TX_64X64,
+ // 64x128, 128x64, 128x128
+ TX_64X64, TX_64X64, TX_64X64,
+ // 4x16, 16x4,
+ TX_4X16, TX_16X4,
+ // 8x32, 32x8
+ TX_8X32, TX_32X8,
+ // 16x64, 64x16
+ TX_16X64, TX_64X16
+};
+
+static const TX_TYPE_1D vtx_tab[TX_TYPES] = {
+ DCT_1D, ADST_1D, DCT_1D, ADST_1D,
+ FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D,
+ DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D,
+};
+
+static const TX_TYPE_1D htx_tab[TX_TYPES] = {
+ DCT_1D, DCT_1D, ADST_1D, ADST_1D,
+ DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D,
+ IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D,
+};
+
+#define TXSIZE_CAT_INVALID (-1)
+
+/* clang-format on */
+
+static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_4X4, // TX_8X8
+ TX_8X8, // TX_16X16
+ TX_16X16, // TX_32X32
+ TX_32X32, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_4X8, // TX_4X16
+ TX_8X4, // TX_16X4
+ TX_8X16, // TX_8X32
+ TX_16X8, // TX_32X8
+ TX_16X32, // TX_16X64
+ TX_32X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_32X32, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_64X64, // TX_64X32
+ TX_4X4, // TX_4X16
+ TX_16X16, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_32X32, // TX_32X8
+ TX_16X16, // TX_16X64
+ TX_64X64, // TX_64X16
+};
+
+static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_8X8, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_64X64, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_16X16, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_32X32, // TX_8X32
+ TX_8X8, // TX_32X8
+ TX_64X64, // TX_16X64
+ TX_16X16, // TX_64X16
+};
+
+#define TX_SIZE_W_MIN 4
+
+// Transform block width in pixels
+static const int tx_size_wide[TX_SIZES_ALL] = {
+ 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64,
+};
+
+#define TX_SIZE_H_MIN 4
+
+// Transform block height in pixels
+static const int tx_size_high[TX_SIZES_ALL] = {
+ 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16,
+};
+
+// Transform block width in units of 4x4 blocks (mode-info units)
+static const int tx_size_wide_unit[TX_SIZES_ALL] = {
+ 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16,
+};
+
+// Transform block height in units of 4x4 blocks (mode-info units)
+static const int tx_size_high_unit[TX_SIZES_ALL] = {
+ 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4,
+};
+
+// Log2 of transform block width in pixels
+static const int tx_size_wide_log2[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6,
+};
+
+// Log2 of transform block height in pixels
+static const int tx_size_high_log2[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4,
+};
+
+static const int tx_size_2d[TX_SIZES_ALL + 1] = {
+ 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512,
+ 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024,
+};
+
+static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {
+ BLOCK_4X4, // TX_4X4
+ BLOCK_8X8, // TX_8X8
+ BLOCK_16X16, // TX_16X16
+ BLOCK_32X32, // TX_32X32
+ BLOCK_64X64, // TX_64X64
+ BLOCK_4X8, // TX_4X8
+ BLOCK_8X4, // TX_8X4
+ BLOCK_8X16, // TX_8X16
+ BLOCK_16X8, // TX_16X8
+ BLOCK_16X32, // TX_16X32
+ BLOCK_32X16, // TX_32X16
+ BLOCK_32X64, // TX_32X64
+ BLOCK_64X32, // TX_64X32
+ BLOCK_4X16, // TX_4X16
+ BLOCK_16X4, // TX_16X4
+ BLOCK_8X32, // TX_8X32
+ BLOCK_32X8, // TX_32X8
+ BLOCK_16X64, // TX_16X64
+ BLOCK_64X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_4X4, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_8X8, // TX_32X8
+ TX_16X16, // TX_16X64
+ TX_16X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_8X8, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_32X32, // TX_32X16
+ TX_64X64, // TX_32X64
+ TX_64X64, // TX_64X32
+ TX_16X16, // TX_4X16
+ TX_16X16, // TX_16X4
+ TX_32X32, // TX_8X32
+ TX_32X32, // TX_32X8
+ TX_64X64, // TX_16X64
+ TX_64X64, // TX_64X16
+};
+
+static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = {
+ 0, // TX_4X4
+ 2, // TX_8X8
+ 4, // TX_16X16
+ 6, // TX_32X32
+ 6, // TX_64X64
+ 1, // TX_4X8
+ 1, // TX_8X4
+ 3, // TX_8X16
+ 3, // TX_16X8
+ 5, // TX_16X32
+ 5, // TX_32X16
+ 6, // TX_32X64
+ 6, // TX_64X32
+ 2, // TX_4X16
+ 2, // TX_16X4
+ 4, // TX_8X32
+ 4, // TX_32X8
+ 5, // TX_16X64
+ 5, // TX_64X16
+};
+
+/* clang-format off */
+static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
+ TX_4X4, // ONLY_4X4
+ TX_64X64, // TX_MODE_LARGEST
+ TX_64X64, // TX_MODE_SELECT
+};
+
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
+static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
+ // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+ // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+ { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } },
+ { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
+ { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
+ { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
+ { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
+ { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
+ { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
+ { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
+ { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
+ { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
+ { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
+ { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
+ { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
+ { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } },
+ { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
+ { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
+ { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
+ { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
+};
+/* clang-format on */
+
+// Generates a 5-bit field in which each bit set to 1 represents a block-size
+// partition: 0b11111 means we split 128x128, 64x64, 32x32, 16x16 and 8x8;
+// 0b10000 means we only split the 128x128 down to 64x64.
+/* clang-format off */
+static const struct {
+ PARTITION_CONTEXT above;
+ PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES_ALL] = {
+ { 31, 31 }, // 4X4 - {0b11111, 0b11111}
+ { 31, 30 }, // 4X8 - {0b11111, 0b11110}
+ { 30, 31 }, // 8X4 - {0b11110, 0b11111}
+ { 30, 30 }, // 8X8 - {0b11110, 0b11110}
+ { 30, 28 }, // 8X16 - {0b11110, 0b11100}
+ { 28, 30 }, // 16X8 - {0b11100, 0b11110}
+ { 28, 28 }, // 16X16 - {0b11100, 0b11100}
+ { 28, 24 }, // 16X32 - {0b11100, 0b11000}
+ { 24, 28 }, // 32X16 - {0b11000, 0b11100}
+ { 24, 24 }, // 32X32 - {0b11000, 0b11000}
+ { 24, 16 }, // 32X64 - {0b11000, 0b10000}
+ { 16, 24 }, // 64X32 - {0b10000, 0b11000}
+ { 16, 16 }, // 64X64 - {0b10000, 0b10000}
+ { 16, 0 }, // 64X128- {0b10000, 0b00000}
+ { 0, 16 }, // 128X64- {0b00000, 0b10000}
+ { 0, 0 }, // 128X128-{0b00000, 0b00000}
+ { 31, 28 }, // 4X16 - {0b11111, 0b11100}
+ { 28, 31 }, // 16X4 - {0b11100, 0b11111}
+ { 30, 24 }, // 8X32 - {0b11110, 0b11000}
+ { 24, 30 }, // 32X8 - {0b11000, 0b11110}
+ { 28, 16 }, // 16X64 - {0b11100, 0b10000}
+ { 16, 28 }, // 64X16 - {0b10000, 0b11100}
+};
+/* clang-format on */
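The values in this table are fully determined by the width/height log2 tables in this header: the above value is 0b11111 with the low mi_size_wide_log2 bits cleared, and the left value is 0b11111 with the low mi_size_high_log2 bits cleared. A short check of that relationship, assuming this header is available to a test:

#include <assert.h>

#include "av1/common/common_data.h"

/* The bit fields above equal 0b11111 with the low width/height log2 bits
 * cleared, i.e. 32 - (1 << log2). Spot-check every block size. */
static void check_partition_context_lookup(void) {
  for (int b = 0; b < BLOCK_SIZES_ALL; ++b) {
    assert(partition_context_lookup[b].above ==
           31 - ((1 << mi_size_wide_log2[b]) - 1));
    assert(partition_context_lookup[b].left ==
           31 - ((1 << mi_size_high_log2[b]) - 1));
  }
}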
+
+static const int intra_mode_context[INTRA_MODES] = {
+ 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0,
+};
+
+// Note: this table is also used in unit tests, so whenever the table changes
+// the unit tests need to be updated accordingly.
+static const int quant_dist_weight[4][2] = {
+ { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
+};
+static const int quant_dist_lookup_table[2][4][2] = {
+ { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } },
+ { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } },
+};
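Each weight pair in quant_dist_lookup_table sums to 16, which is what makes the distance-weighted compound average in convolve.c below (tmp * fwd_offset + res * bck_offset, then >> DIST_PRECISION_BITS) a true weighted mean, assuming DIST_PRECISION_BITS is 4. A standalone check of that invariant:

#include <assert.h>

int main(void) {
  static const int table[2][4][2] = {
    { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } },
    { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } },
  };
  /* (w0 * a + w1 * b) >> 4 is an average only when w0 + w1 == 1 << 4. */
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 4; ++j) assert(table[i][j][0] + table[i][j][1] == 16);
  return 0;
}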
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_COMMON_DATA_H_
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
new file mode 100644
index 000000000..1f11126fc
--- /dev/null
+++ b/third_party/aom/av1/common/convolve.c
@@ -0,0 +1,1295 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn) {
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+ const int x_filter_idx =
+ (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ assert(x_filter_idx <= RS_SUBPEL_MASK);
+ const int16_t *const x_filter =
+ &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+ int sum = 0;
+ for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_qn += x_step_qn;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn, int bd) {
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+ const int x_filter_idx =
+ (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ assert(x_filter_idx <= RS_SUBPEL_MASK);
+ const int16_t *const x_filter =
+ &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+ int sum = 0;
+ for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ x_qn += x_step_qn;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bd = 8;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
+ }
+ }
+}
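The two terms subtracted from res at the end of the vertical pass exactly cancel the offsets injected earlier: the horizontal offset 1 << (bd + FILTER_BITS - 1), after the round_0 shift, the vertical taps (which sum to 1 << FILTER_BITS for AV1's normalized filters) and the round_1 shift, becomes 1 << (offset_bits - round_1 - 1), while the explicit 1 << offset_bits becomes 1 << (offset_bits - round_1). A small numeric sketch of that bookkeeping with illustrative parameter values (bd = 8, FILTER_BITS = 7, round_0 = 3, round_1 = 11):

#include <assert.h>

int main(void) {
  const int bd = 8, filter_bits = 7;   /* illustrative values */
  const int round_0 = 3, round_1 = 11; /* e.g. an 8-bit single-reference path */
  const int offset_bits = bd + 2 * filter_bits - round_0;

  /* Horizontal offset carried through the round_0 shift, the vertical taps
   * (summing to 1 << filter_bits) and the round_1 shift, plus the explicit
   * vertical offset 1 << offset_bits after the same round_1 shift... */
  const long long carried =
      ((((1LL << (bd + filter_bits - 1)) >> round_0) << filter_bits) +
       (1LL << offset_bits)) >>
      round_1;
  /* ...equals exactly what av1_convolve_2d_sr_c subtracts from res. */
  const long long subtracted = (1LL << (offset_bits - round_1)) +
                               (1LL << (offset_bits - round_1 - 1));
  assert(carried == subtracted);
  return 0;
}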
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ dst[y * dst_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
+ }
+ }
+}
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
+ }
+ }
+}
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ for (int y = 0; y < h; ++y) {
+ memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
+ }
+}
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ dst8[y * dst8_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+ } else {
+ dst[y * dst_stride + x] = res;
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ res *= (1 << bits);
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst8[y * dst8_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+ } else {
+ dst[y * dst_stride + x] = res;
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
+ res += round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst8[y * dst8_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+ } else {
+ dst[y * dst_stride + x] = res;
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
+ res += round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst[y * dst_stride + x] = res;
+ }
+ }
+ }
+}
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params) {
+ int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bd = 8;
+
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ for (int y = 0; y < im_h; ++y) {
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
+ const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(x_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *x_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_x[k - fo_horiz];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ src_horiz += src_stride;
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int x = 0; x < w; ++x) {
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
+ const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(y_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *y_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ }
+ }
+ src_vert++;
+ }
+}
+
+static void convolve_2d_scale_wrapper(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params) {
+ if (conv_params->is_compound) {
+ assert(conv_params->dst != NULL);
+ }
+ av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
+ y_step_qn, conv_params);
+}
+
+// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general, so
+// we could add optimized 2-tap filtering code for all bilinear filtering
+// usages, not just IntraBC.
+static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ int subpel_x_q4, int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const InterpFilterParams *filter_params_x =
+ subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+ const InterpFilterParams *filter_params_y =
+ subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+ if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0, conv_params);
+ } else if (subpel_x_q4 != 0) {
+ av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, 0, 0, conv_params);
+ } else {
+ av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, 0, 0, conv_params);
+ }
+}
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ InterpFilters interp_filters, const int subpel_x_q4,
+ int x_step_q4, const int subpel_y_q4, int y_step_q4,
+ int scaled, ConvolveParams *conv_params,
+ const struct scale_factors *sf, int is_intrabc) {
+ assert(IMPLIES(is_intrabc, !scaled));
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)dst;
+ (void)dst_stride;
+
+ if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+ convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
+ subpel_y_q4, conv_params);
+ return;
+ }
+
+ InterpFilter filter_x = 0;
+ InterpFilter filter_y = 0;
+ const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+ const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+ if (need_filter_params_x)
+ filter_x = av1_extract_interp_filter(interp_filters, 1);
+ if (need_filter_params_y)
+ filter_y = av1_extract_interp_filter(interp_filters, 0);
+ const InterpFilterParams *filter_params_x =
+ need_filter_params_x
+ ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+ : NULL;
+ const InterpFilterParams *filter_params_y =
+ need_filter_params_y
+ ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+ : NULL;
+
+ if (scaled) {
+ convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_q4,
+ x_step_q4, subpel_y_q4, y_step_q4, conv_params);
+ } else {
+ sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
+ src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+ }
+}
+
+void av1_highbd_convolve_2d_copy_sr_c(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+ (void)bd;
+
+ for (int y = 0; y < h; ++y) {
+ memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
+ }
+}
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+ }
+ }
+}
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
+ }
+ }
+}
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+
+ // horizontal filter
+ const uint16_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
+ uint16_t *dst16, int dst16_stride, int w,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int x, y, k;
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+
+ // horizontal filter
+ const uint16_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (y = 0; y < im_h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ (void)bd;
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;
+ for (k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ dst16[y * dst16_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ dst[y * dst_stride + x] = res;
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
+ uint16_t *dst16, int dst16_stride, int w,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+ assert(bits >= 0);
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
+ res += round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst16[y * dst16_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ dst[y * dst_stride + x] = res;
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
+ uint16_t *dst16, int dst16_stride, int w,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ assert(bits >= 0);
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ res *= (1 << bits);
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst16[y * dst16_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ dst[y * dst_stride + x] = res;
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_2d_copy_c(
+ const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+ int w, int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ assert(bits >= 0);
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
+ res += round_offset;
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst16[y * dst16_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ } else {
+ dst[y * dst_stride + x] = res;
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd) {
+ int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+ // horizontal filter
+ const uint16_t *src_horiz = src - fo_vert * src_stride;
+ for (int y = 0; y < im_h; ++y) {
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
+ const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(x_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *x_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_x[k - fo_horiz];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ src_horiz += src_stride;
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int x = 0; x < w; ++x) {
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
+ const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(y_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *y_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+          /* Remove the rounding offset, then apply the final rounding */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+        /* Remove the rounding offset, then apply the final rounding */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ }
+ }
+ src_vert++;
+ }
+}
+
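+// Editor's note: an illustrative sketch, not part of the upstream patch. It
+// assumes SCALE_SUBPEL_BITS == 10 and SCALE_EXTRA_BITS == SCALE_SUBPEL_BITS -
+// SUBPEL_BITS, and shows how the scaled loops above split each accumulated
+// position into a whole-pixel source offset plus one of 16 filter phases.
+#if 0
+static void scaled_position_example(int subpel_x_qn, int x_step_qn, int x) {
+  const int x_qn = subpel_x_qn + x * x_step_qn;      // position in 1/1024 pel
+  const int src_offset = x_qn >> SCALE_SUBPEL_BITS;  // whole-pixel part
+  const int phase = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;  // 0..15
+  (void)src_offset;
+  (void)phase;
+}
+#endif
+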
+static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, int subpel_x_q4,
+ int subpel_y_q4,
+ ConvolveParams *conv_params,
+ int bd) {
+ const InterpFilterParams *filter_params_x =
+ subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
+ const InterpFilterParams *filter_params_y =
+ subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
+ if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
+ av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ } else if (subpel_x_q4 != 0) {
+ av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ } else {
+ av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, 0, 0,
+ conv_params, bd);
+ }
+}
+
+void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
+ uint8_t *dst8, int dst_stride, int w, int h,
+ InterpFilters interp_filters,
+ const int subpel_x_q4, int x_step_q4,
+ const int subpel_y_q4, int y_step_q4,
+ int scaled, ConvolveParams *conv_params,
+ const struct scale_factors *sf,
+ int is_intrabc, int bd) {
+ assert(IMPLIES(is_intrabc, !scaled));
+ (void)x_step_q4;
+ (void)y_step_q4;
+ (void)dst_stride;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+ if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
+ subpel_x_q4, subpel_y_q4, conv_params, bd);
+ return;
+ }
+
+ InterpFilter filter_x = 0;
+ InterpFilter filter_y = 0;
+ const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
+ const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
+ if (need_filter_params_x)
+ filter_x = av1_extract_interp_filter(interp_filters, 1);
+ if (need_filter_params_y)
+ filter_y = av1_extract_interp_filter(interp_filters, 0);
+ const InterpFilterParams *filter_params_x =
+ need_filter_params_x
+ ? av1_get_interp_filter_params_with_block_size(filter_x, w)
+ : NULL;
+ const InterpFilterParams *filter_params_y =
+ need_filter_params_y
+ ? av1_get_interp_filter_params_with_block_size(filter_y, h)
+ : NULL;
+
+ if (scaled) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ if (conv_params->is_compound) {
+ assert(conv_params->dst != NULL);
+ }
+ av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_q4,
+ x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+ bd);
+ } else {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
+ 0][conv_params->is_compound](
+ src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+ }
+}
+
+// Note: The fixed-size intermediate buffers place limits on the parameters
+// of some functions. 2-D filtering proceeds in 2 steps:
+// (1) Interpolate horizontally into an intermediate buffer, temp.
+// (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (263):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 128x128 pixels.
+// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
+// original frame (in 1/16th pixel units).
+// --Must round up because the block may be located at a sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
+#define WIENER_MAX_EXT_SIZE 263
+
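+// Editor's note: a small worked sketch, not part of the upstream patch,
+// re-deriving the 263 above with the same assumptions: 128 rows, a maximum
+// y_step_q4 of 32 (x1/2 scaling), a worst-case start offset of 15/16 pel,
+// and SUBPEL_TAPS == 8 extra rows for the filter tails.
+#if 0
+static int wiener_max_ext_rows_example(void) {
+  const int max_rows = 128, max_y_step_q4 = 32, max_y0_q4 = 15, taps = 8;
+  const int span_q4 = (max_rows - 1) * max_y_step_q4 + max_y0_q4;
+  // Round the span up to whole rows, then add the tap tails: 255 + 8 == 263.
+  return ((span_q4 + 15) >> 4) + taps;
+}
+#endif
+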
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+ return sum;
+}
+
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+ const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+ return sum;
+}
+
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+ ptrdiff_t a_stride,
+ const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+ return sum;
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ // TODO(agrange) Modify to make independent of table alignment.
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+ return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
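+// Editor's note: an illustrative round-trip, not part of the upstream patch.
+// Assuming the kernel table is 256-byte aligned (16 kernels of 8 int16_t
+// taps), get_filter_base()/get_filter_offset() recover the table and the
+// sub-pixel phase from a bare tap pointer.
+#if 0
+static void filter_base_round_trip_example(const InterpKernel *table, int i) {
+  const int16_t *kernel = table[i];                    // pointer into table
+  const InterpKernel *base = get_filter_base(kernel);  // == table
+  const int offset = get_filter_offset(kernel, base);  // == i
+  assert(base == table && offset == i);
+}
+#endif
+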
+static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h,
+ int round0_bits) {
+ const int bd = 8;
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
+ (1 << (bd + FILTER_BITS - 1));
+ const int sum = horz_scalar_product(src_x, x_filter) + rounding;
+ dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
+ WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h,
+ int round1_bits) {
+ const int bd = 8;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (int x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ const int rounding =
+ ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
+ (1 << (bd + round1_bits - 1));
+ const int sum =
+ highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const ConvolveParams *conv_params) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+
+ convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
+ x_step_q4, w, intermediate_height,
+ conv_params->round_0);
+ convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h, conv_params->round_1);
+}
+
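+// Editor's note: a minimal sketch, not part of the upstream patch, of the
+// "add src" idea used above. It assumes the Wiener centre tap is stored with
+// (1 << FILTER_BITS) subtracted, so each pass adds the centre source sample
+// scaled by (1 << FILTER_BITS) back in; the real code also adds a bias and
+// clamps so the two-pass intermediate fits in 16 bits, which is omitted here.
+#if 0
+static int wiener_add_src_1d_example(const uint8_t *src, const int16_t *taps) {
+  int sum = (int)src[SUBPEL_TAPS / 2 - 1] << FILTER_BITS;  // identity part
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += taps[k] * src[k];  // correction
+  return ROUND_POWER_OF_TWO(sum, FILTER_BITS);
+}
+#endif
+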
+static void highbd_convolve_add_src_horiz_hip(
+ const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int round0_bits, int bd) {
+ const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
+ (1 << (bd + FILTER_BITS - 1));
+ const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
+ dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
+ extraprec_clamp_limit - 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_add_src_vert_hip(
+ const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int round1_bits, int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (int x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ const int rounding =
+ ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
+ (1 << (bd + round1_bits - 1));
+ const int sum =
+ highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+void av1_highbd_wiener_convolve_add_src_c(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+
+ highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, MAX_SB_SIZE, filters_x,
+ x0_q4, x_step_q4, w, intermediate_height,
+ conv_params->round_0, bd);
+ highbd_convolve_add_src_vert_hip(
+ temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
+ filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
+}
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
new file mode 100644
index 000000000..4109dd843
--- /dev/null
+++ b/third_party/aom/av1/common/convolve.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CONVOLVE_H_
+#define AOM_AV1_COMMON_CONVOLVE_H_
+#include "av1/common/filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint16_t CONV_BUF_TYPE;
+typedef struct ConvolveParams {
+ int do_average;
+ CONV_BUF_TYPE *dst;
+ int dst_stride;
+ int round_0;
+ int round_1;
+ int plane;
+ int is_compound;
+ int use_jnt_comp_avg;
+ int fwd_offset;
+ int bck_offset;
+} ConvolveParams;
+
+#define ROUND0_BITS 3
+#define COMPOUND_ROUND1_BITS 7
+#define WIENER_ROUND0_BITS 3
+
+#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0))
+
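+// Editor's note: a worked value, not part of the upstream patch, assuming
+// FILTER_BITS == 7. For 8-bit input with the default WIENER_ROUND0_BITS the
+// horizontal Wiener pass is clamped to [0, 8191] before the vertical pass.
+#if 0
+static INLINE int wiener_clamp_limit_example(void) {
+  return WIENER_CLAMP_LIMIT(WIENER_ROUND0_BITS, 8);  // 1 << 13 == 8192
+}
+#endif
+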
+typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params);
+
+typedef void (*aom_highbd_convolve_fn_t)(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+struct AV1Common;
+struct scale_factors;
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ InterpFilters interp_filters, const int subpel_x_q4,
+ int x_step_q4, const int subpel_y_q4, int y_step_q4,
+ int scaled, ConvolveParams *conv_params,
+ const struct scale_factors *sf, int is_intrabc);
+
+static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
+ CONV_BUF_TYPE *dst,
+ int dst_stride,
+ int is_compound, int bd) {
+ ConvolveParams conv_params;
+ conv_params.do_average = do_average;
+ assert(IMPLIES(do_average, is_compound));
+ conv_params.is_compound = is_compound;
+ conv_params.round_0 = ROUND0_BITS;
+ conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
+ : 2 * FILTER_BITS - conv_params.round_0;
+ const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+ assert(IMPLIES(bd < 12, intbufrange <= 16));
+ if (intbufrange > 16) {
+ conv_params.round_0 += intbufrange - 16;
+ if (!is_compound) conv_params.round_1 -= intbufrange - 16;
+ }
+  // TODO(yunqing): The following dst is only valid when is_compound == 1.
+ conv_params.dst = dst;
+ conv_params.dst_stride = dst_stride;
+ conv_params.plane = plane;
+ return conv_params;
+}
+
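+// Editor's note: an illustrative sketch, not part of the upstream patch,
+// assuming FILTER_BITS == 7. It shows how the rounding split above reacts to
+// bit depth: the intermediate range bd + FILTER_BITS - round_0 + 2 only
+// exceeds 16 bits at bd == 12, where round_0 is bumped from 3 to 5.
+#if 0
+static INLINE void conv_params_rounding_example(void) {
+  CONV_BUF_TYPE buf[1];
+  // 8-bit, single prediction: round_0 == 3, round_1 == 2 * 7 - 3 == 11.
+  ConvolveParams p8 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+  // 12-bit, single prediction: round_0 == 5, round_1 == 9.
+  ConvolveParams p12 = get_conv_params_no_round(0, 0, NULL, 0, 0, 12);
+  // 12-bit, compound: round_0 == 5, round_1 stays at COMPOUND_ROUND1_BITS.
+  ConvolveParams p12c = get_conv_params_no_round(1, 0, buf, 1, 1, 12);
+  (void)p8;
+  (void)p12;
+  (void)p12c;
+}
+#endif
+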
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
+ int bd) {
+ return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
+}
+
+static INLINE ConvolveParams get_conv_params_wiener(int bd) {
+ ConvolveParams conv_params;
+ (void)bd;
+ conv_params.do_average = 0;
+ conv_params.is_compound = 0;
+ conv_params.round_0 = WIENER_ROUND0_BITS;
+ conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
+ const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+ assert(IMPLIES(bd < 12, intbufrange <= 16));
+ if (intbufrange > 16) {
+ conv_params.round_0 += intbufrange - 16;
+ conv_params.round_1 -= intbufrange - 16;
+ }
+ conv_params.dst = NULL;
+ conv_params.dst_stride = 0;
+ conv_params.plane = 0;
+ return conv_params;
+}
+
+void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ InterpFilters interp_filters,
+ const int subpel_x_q4, int x_step_q4,
+ const int subpel_y_q4, int y_step_q4,
+ int scaled, ConvolveParams *conv_params,
+ const struct scale_factors *sf,
+ int is_intrabc, int bd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c
new file mode 100644
index 000000000..868f341b5
--- /dev/null
+++ b/third_party/aom/av1/common/debugmodes.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/onyxc_int.h"
+
+static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
+ fprintf(f, "%s", str);
+ fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
+ cm->show_frame, cm->base_qindex);
+}
+/* For every mode-info unit in the frame, print the integer MB_MODE_INFO
+ * member found at the given byte offset (see the offsetof() calls below).
+ */
+static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
+ size_t member_offset) {
+ int mi_row, mi_col;
+ MB_MODE_INFO **mi = cm->mi_grid_visible;
+ int rows = cm->mi_rows;
+ int cols = cm->mi_cols;
+ char prefix = descriptor[0];
+
+ log_frame_info(cm, descriptor, file);
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(file, "%c ", prefix);
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset)));
+ mi++;
+ }
+ fprintf(file, "\n");
+ mi += MAX_MIB_SIZE;
+ }
+ fprintf(file, "\n");
+}
+
+void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
+ int mi_row;
+ int mi_col;
+ FILE *mvs = fopen(file, "a");
+ MB_MODE_INFO **mi = cm->mi_grid_visible;
+ int rows = cm->mi_rows;
+ int cols = cm->mi_cols;
+
+ print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+ print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+ print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+ print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
+ print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+  // output skip information.
+ log_frame_info(cm, "Skips:", mvs);
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(mvs, "S ");
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ fprintf(mvs, "%2d ", mi[0]->skip);
+ mi++;
+ }
+ fprintf(mvs, "\n");
+ mi += MAX_MIB_SIZE;
+ }
+ fprintf(mvs, "\n");
+
+ // output motion vectors.
+ log_frame_info(cm, "Vectors ", mvs);
+ mi = cm->mi_grid_visible;
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(mvs, "V ");
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col);
+ mi++;
+ }
+ fprintf(mvs, "\n");
+ mi += MAX_MIB_SIZE;
+ }
+ fprintf(mvs, "\n");
+
+ fclose(mvs);
+}
+
+void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+ const char *filename) {
+ FILE *hdrFile = fopen(filename, "w");
+ fwrite(data, size, sizeof(uint8_t), hdrFile);
+ fclose(hdrFile);
+}
+
+void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) {
+ FILE *fcFile = fopen(filename, "w");
+ const uint16_t *fcp = (uint16_t *)fc;
+ const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t);
+ unsigned int i;
+
+ for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++);
+ fclose(fcFile);
+}
diff --git a/third_party/aom/av1/common/entropy.c b/third_party/aom/av1/common/entropy.c
new file mode 100644
index 000000000..4f95ef69b
--- /dev/null
+++ b/third_party/aom/av1/common/entropy.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/scan.h"
+#include "av1/common/token_cdfs.h"
+#include "av1/common/txb_common.h"
+
+static int get_q_ctx(int q) {
+ if (q <= 20) return 0;
+ if (q <= 60) return 1;
+ if (q <= 120) return 2;
+ return 3;
+}
+
+void av1_default_coef_probs(AV1_COMMON *cm) {
+ const int index = get_q_ctx(cm->base_qindex);
+#if CONFIG_ENTROPY_STATS
+ cm->coef_cdf_category = index;
+#endif
+
+ av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]);
+ av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]);
+ av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]);
+ av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]);
+ av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]);
+ av1_copy(cm->fc->coeff_base_eob_cdf,
+ av1_default_coeff_base_eob_multi_cdfs[index]);
+ av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]);
+ av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]);
+ av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]);
+ av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]);
+ av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]);
+ av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]);
+ av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]);
+}
+
+static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs,
+ int cdf_stride, int nsymbs) {
+ for (int i = 0; i < num_cdfs; i++) {
+ cdf_ptr[i * cdf_stride + nsymbs] = 0;
+ }
+}
+
+#define RESET_CDF_COUNTER(cname, nsymbs) \
+ RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
+
+#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride) \
+ do { \
+ aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname; \
+ int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob); \
+ int num_cdfs = array_size / cdf_stride; \
+ reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \
+ } while (0)
+
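+// Editor's note: a minimal sketch, not part of the upstream patch. Each CDF
+// row holds CDF_SIZE(nsymbs) == nsymbs + 1 entries; the trailing entry is
+// assumed to be the adaptation counter, which the reset above zeroes while
+// leaving the probability entries untouched.
+#if 0
+static void reset_cdf_counter_example(void) {
+  // Two binary CDFs; index [i][2] is the per-CDF counter slot.
+  aom_cdf_prob cdfs[2][CDF_SIZE(2)] = { { AOM_CDF2(16384) },
+                                        { AOM_CDF2(8192) } };
+  cdfs[0][2] = 5;  // pretend some symbols have already been coded
+  cdfs[1][2] = 9;
+  RESET_CDF_COUNTER(cdfs, 2);
+  // cdfs[0][2] and cdfs[1][2] are now 0; the probabilities are unchanged.
+}
+#endif
+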
+static void reset_nmv_counter(nmv_context *nmv) {
+ RESET_CDF_COUNTER(nmv->joints_cdf, 4);
+ for (int i = 0; i < 2; i++) {
+ RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2);
+ }
+}
+
+void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
+ RESET_CDF_COUNTER(fc->txb_skip_cdf, 2);
+ RESET_CDF_COUNTER(fc->eob_extra_cdf, 2);
+ RESET_CDF_COUNTER(fc->dc_sign_cdf, 2);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11);
+ RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3);
+ RESET_CDF_COUNTER(fc->coeff_base_cdf, 4);
+ RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE);
+ RESET_CDF_COUNTER(fc->newmv_cdf, 2);
+ RESET_CDF_COUNTER(fc->zeromv_cdf, 2);
+ RESET_CDF_COUNTER(fc->refmv_cdf, 2);
+ RESET_CDF_COUNTER(fc->drl_cdf, 2);
+ RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+ RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1);
+ RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
+ RESET_CDF_COUNTER(fc->interintra_cdf, 2);
+ RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
+ RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES);
+ RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES);
+ RESET_CDF_COUNTER(fc->obmc_cdf, 2);
+ RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES);
+ RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES);
+ for (int j = 0; j < PALETTE_SIZES; j++) {
+ int nsymbs = j + PALETTE_MIN_SIZE;
+ RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ }
+ RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2);
+ RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_inter_cdf, 2);
+ RESET_CDF_COUNTER(fc->single_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2);
+ RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2);
+ RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2);
+ RESET_CDF_COUNTER(fc->compound_index_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2);
+ RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2);
+ RESET_CDF_COUNTER(fc->skip_cdfs, 2);
+ RESET_CDF_COUNTER(fc->intra_inter_cdf, 2);
+ reset_nmv_counter(&fc->nmvc);
+ reset_nmv_counter(&fc->ndvc);
+ RESET_CDF_COUNTER(fc->intrabc_cdf, 2);
+ RESET_CDF_COUNTER(fc->seg.tree_cdf, MAX_SEGMENTS);
+ RESET_CDF_COUNTER(fc->seg.pred_cdf, 2);
+ RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+ RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2);
+ RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES);
+ RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES);
+ RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2);
+ RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2);
+ RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES);
+ RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1,
+ CDF_SIZE(UV_INTRA_MODES));
+ RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES);
+ for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+ if (i < 4) {
+ RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10));
+ } else if (i < 16) {
+ RESET_CDF_COUNTER(fc->partition_cdf[i], 10);
+ } else {
+ RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10));
+ }
+ }
+ RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS);
+ RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES);
+ RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1);
+ RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH,
+ CDF_SIZE(MAX_TX_DEPTH + 1));
+ RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1);
+ RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1);
+ for (int i = 0; i < FRAME_LF_COUNT; i++) {
+ RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1);
+ }
+ RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE);
+}
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
new file mode 100644
index 000000000..991692c2f
--- /dev/null
+++ b/third_party/aom/av1/common/entropy.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPY_H_
+#define AOM_AV1_COMMON_ENTROPY_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TOKEN_CDF_Q_CTXS 4
+
+#define TXB_SKIP_CONTEXTS 13
+
+#define EOB_COEF_CONTEXTS 9
+
+#define SIG_COEF_CONTEXTS_2D 26
+#define SIG_COEF_CONTEXTS_1D 16
+#define SIG_COEF_CONTEXTS_EOB 4
+#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D)
+
+#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS)
+#define DC_SIGN_CONTEXTS 3
+
+#define BR_TMP_OFFSET 12
+#define BR_REF_CAT 4
+#define LEVEL_CONTEXTS 21
+
+#define NUM_BASE_LEVELS 2
+
+#define BR_CDF_SIZE (4)
+#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
+
+#define COEFF_CONTEXT_BITS 6
+#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
+#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
+
+#define BASE_CONTEXT_POSITION_NUM 12
+
+typedef enum TX_CLASS {
+ TX_CLASS_2D = 0,
+ TX_CLASS_HORIZ = 1,
+ TX_CLASS_VERT = 2,
+ TX_CLASSES = 3,
+} TX_CLASS;
+
+#define DCT_MAX_VALUE 16384
+#define DCT_MAX_VALUE_HIGH10 65536
+#define DCT_MAX_VALUE_HIGH12 262144
+
+/* Coefficients are predicted via a 3-dimensional probability table indexed on
+ * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. */
+#define REF_TYPES 2 // intra=0, inter=1
+
+struct AV1Common;
+struct frame_contexts;
+void av1_reset_cdf_symbol_counters(struct frame_contexts *fc);
+void av1_default_coef_probs(struct AV1Common *cm);
+
+struct frame_contexts;
+
+typedef char ENTROPY_CONTEXT;
+
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+ ENTROPY_CONTEXT b) {
+ return (a != 0) + (b != 0);
+}
+
+static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l) {
+ ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
+
+ switch (tx_size) {
+ case TX_4X4:
+ above_ec = a[0] != 0;
+ left_ec = l[0] != 0;
+ break;
+ case TX_4X8:
+ above_ec = a[0] != 0;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_8X4:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = l[0] != 0;
+ break;
+ case TX_8X16:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_16X8:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X32:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_32X16:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_8X8:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X16:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_32X32:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_64X64:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+ case TX_32X64:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+ case TX_64X32:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_4X16:
+ above_ec = a[0] != 0;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_16X4:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = l[0] != 0;
+ break;
+ case TX_8X32:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_32X8:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X64:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8));
+ break;
+ case TX_64X16:
+ above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8));
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ default: assert(0 && "Invalid transform size."); break;
+ }
+ return combine_entropy_contexts(above_ec, left_ec);
+}
+
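+// Editor's note: an illustrative sketch, not part of the upstream patch.
+// Each ENTROPY_CONTEXT byte covers a 4-pixel unit, so an 8x8 transform spans
+// two above and two left entries; the switch above tests them with single
+// wider loads. This is the equivalent byte-wise check for TX_8X8.
+#if 0
+static INLINE int entropy_context_8x8_example(const ENTROPY_CONTEXT *a,
+                                              const ENTROPY_CONTEXT *l) {
+  const ENTROPY_CONTEXT above_ec = (a[0] != 0) || (a[1] != 0);
+  const ENTROPY_CONTEXT left_ec = (l[0] != 0) || (l[1] != 0);
+  return combine_entropy_contexts(above_ec, left_ec);
+}
+#endif
+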
+static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
+ return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >>
+ 1);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPY_H_
diff --git a/third_party/aom/av1/common/entropymode.c b/third_party/aom/av1/common/entropymode.c
new file mode 100644
index 000000000..41dc30ddb
--- /dev/null
+++ b/third_party/aom/av1/common/entropymode.c
@@ -0,0 +1,1103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/scan.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+
+static const aom_cdf_prob
+ default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE(
+ INTRA_MODES)] = {
+ { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
+ 24189, 28165, 29093, 30466) },
+ { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032,
+ 24434, 28658, 30172, 31409) },
+ { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620,
+ 26160, 29336, 29929, 31567) },
+ { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096,
+ 24746, 29585, 30958, 32462) },
+ { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583,
+ 26437, 30261, 31073, 32475) } },
+ { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023,
+ 25381, 29014, 30482, 31436) },
+ { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423,
+ 27610, 29905, 31276, 31794) },
+ { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405,
+ 24469, 27915, 29090, 30492) },
+ { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825,
+ 24649, 29153, 31096, 32210) },
+ { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516,
+ 26001, 29675, 30981, 31994) } },
+ { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055,
+ 25729, 29538, 30305, 32077) },
+ { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062,
+ 23219, 27743, 29211, 30907) },
+ { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555,
+ 30467, 30794, 32086) },
+ { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523,
+ 23878, 28975, 30287, 32252) },
+ { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561,
+ 30072, 30737, 32463) } },
+ { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419,
+ 25060, 29696, 30917, 32409) },
+ { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468,
+ 25225, 29485, 31158, 32342) },
+ { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605,
+ 29118, 30078, 32018) },
+ { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743,
+ 30389, 31536, 32528) },
+ { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718,
+ 25769, 29953, 30983, 32485) } },
+ { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449,
+ 26219, 30214, 31150, 32477) },
+ { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236,
+ 25380, 29653, 31143, 32277) },
+ { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466,
+ 29900, 30523, 32261) },
+ { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753,
+ 24615, 29489, 30883, 32482) },
+ { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180,
+ 31355, 31802, 32593) } }
+ };
+
+static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE(
+ 2 * MAX_ANGLE_DELTA + 1)] = {
+ { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) },
+ { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) },
+ { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) },
+ { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) },
+ { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) },
+ { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) },
+ { AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) },
+ { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) }
+};
+
+static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+ INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123,
+ 26606, 27418, 27945, 29228, 29685, 30349) },
+ { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649,
+ 25527, 27364, 28152, 29701, 29984, 30852) },
+ { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654,
+ 25136, 27073, 27830, 29360, 29730, 30659) },
+ { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533,
+ 23703, 24804, 25352, 26575, 27016, 28049) } };
+
+static const aom_cdf_prob
+ default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE(
+ UV_INTRA_MODES)] = {
+ { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923,
+ 28244, 30059, 30941, 31961) },
+ { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824,
+ 28359, 29505, 29800, 31796) },
+ { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854,
+ 30764, 31777, 32029) },
+ { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148,
+ 28577, 30612, 31355, 32493) },
+ { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243,
+ 31101, 31744, 32363) },
+ { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458,
+ 29711, 31161, 31441, 32550) },
+ { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200,
+ 30245, 31837, 32342, 32667) },
+ { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128,
+ 29267, 30643, 31961, 32461) },
+ { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273,
+ 28443, 30388, 30767, 32416) },
+ { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719,
+ 23174, 28861, 30379, 32175) },
+ { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119,
+ 23527, 27053, 31397, 32148) },
+ { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907,
+ 22482, 25896, 26541, 31819) },
+ { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166,
+ 15255, 15753, 16039, 16606) } },
+ { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656,
+ 15986, 20086, 20995, 22455, 24212) },
+ { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451,
+ 22099, 24228, 24693, 27032, 29472) },
+ { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774,
+ 23138, 24256, 24703, 26679) },
+ { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371,
+ 21520, 22206, 23389, 24182) },
+ { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411,
+ 24911, 25380, 26027, 26376) },
+ { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981,
+ 24780, 25386, 26517, 27176) },
+ { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803,
+ 23188, 23763, 24455, 24940) },
+ { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059,
+ 22336, 23204, 23964, 24793) },
+ { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898,
+ 22494, 23139, 24764, 25989) },
+ { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004,
+ 15534, 20714, 21789, 23443, 24861) },
+ { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235,
+ 15902, 20102, 22696, 23774, 25838) },
+ { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163,
+ 15636, 19676, 20474, 23519, 25208) },
+ { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248,
+ 9875, 10521, 29048) } }
+ };
+
+static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(
+ EXT_PARTITION_TYPES)] = {
+ { AOM_CDF4(19132, 25510, 30392) },
+ { AOM_CDF4(13928, 19855, 28540) },
+ { AOM_CDF4(12522, 23679, 28629) },
+ { AOM_CDF4(9896, 18783, 25853) },
+ { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) },
+ { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) },
+ { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) },
+ { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) },
+ { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) },
+ { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) },
+ { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) },
+ { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) },
+ { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) },
+ { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) },
+ { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) },
+ { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) },
+ { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
+ { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) },
+ { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) },
+ { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) },
+};
+
+static const aom_cdf_prob default_intra_ext_tx_cdf
+ [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = {
+ {
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ },
+ {
+ {
+ { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) },
+ { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) },
+ { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) },
+ { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) },
+ { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) },
+ { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) },
+ { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) },
+ { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) },
+ { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) },
+ { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) },
+ { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) },
+ { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) },
+ { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) },
+ },
+ {
+ { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) },
+ { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) },
+ { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) },
+ { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) },
+ { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) },
+ { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) },
+ { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) },
+ { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) },
+ { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) },
+ { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) },
+ { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) },
+ { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) },
+ { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) },
+ },
+ {
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ },
+ {
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ },
+ },
+ {
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ {
+ { AOM_CDF5(1127, 12814, 22772, 27483) },
+ { AOM_CDF5(145, 6761, 11980, 26667) },
+ { AOM_CDF5(362, 5887, 11678, 16725) },
+ { AOM_CDF5(385, 15213, 18587, 30693) },
+ { AOM_CDF5(25, 2914, 23134, 27903) },
+ { AOM_CDF5(60, 4470, 11749, 23991) },
+ { AOM_CDF5(37, 3332, 14511, 21448) },
+ { AOM_CDF5(157, 6320, 13036, 17439) },
+ { AOM_CDF5(119, 6719, 12906, 29396) },
+ { AOM_CDF5(47, 5537, 12576, 21499) },
+ { AOM_CDF5(269, 6076, 11258, 23115) },
+ { AOM_CDF5(83, 5615, 12001, 17228) },
+ { AOM_CDF5(1968, 5556, 12023, 18547) },
+ },
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ },
+ };
+
+static const aom_cdf_prob
+ default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE(
+ TX_TYPES)] = {
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504,
+ 22848, 23934, 25474, 27727, 28915, 30631) },
+ { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674,
+ 20408, 22517, 25010, 27116, 28856, 30749) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ },
+ {
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595,
+ 28526, 30529) },
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(4167) },
+ { AOM_CDF2(1998) },
+ { AOM_CDF2(748) },
+ },
+ };
+
+static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = {
+ AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294)
+};
+
+static const aom_cdf_prob
+ default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = {
+ { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700,
+ 32704, 32708, 32712, 32716, 32720, 32724) },
+ { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620,
+ 32647, 32668, 32672, 32676, 32680, 32684) },
+ { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673,
+ 32677, 32681, 32685, 32689, 32693, 32697) },
+ { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708,
+ 32712, 32716, 32720, 32724, 32728, 32732) },
+ { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394,
+ 32464, 32516, 32560, 32576, 32593, 32622) },
+ { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144,
+ 32413, 32520, 32594, 32622, 32656, 32660) }
+ };
+
+static const aom_cdf_prob
+ default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
+ SWITCHABLE_FILTERS)] = {
+ { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) },
+ { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) },
+ { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) },
+ { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) },
+ { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) },
+ { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) },
+ { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) },
+ { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) }
+ };
+
+static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] =
+ { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) },
+ { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } };
+
+static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } };
+
+static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] =
+ { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) },
+ { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } };
+
+static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) }
+};
+
+static const aom_cdf_prob
+ default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE(
+ INTER_COMPOUND_MODES)] = {
+ { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) },
+ { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
+ { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
+ { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
+ { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
+ { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
+ { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
+ { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) }
+ };
+
+static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(16384) },
+ { AOM_CDF2(26887) },
+ { AOM_CDF2(27597) },
+ { AOM_CDF2(30237) } };
+
+static const aom_cdf_prob
+ default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTERINTRA_MODES)] =
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(1875, 11082, 27332) },
+ { AOM_CDF4(2473, 9996, 26388) },
+ { AOM_CDF4(4238, 11537, 25926) } };
+
+static const aom_cdf_prob
+ default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) },
+ { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) },
+ { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }
+ };
+
+static const aom_cdf_prob
+ default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
+ { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) },
+ { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }
+ };
+
+static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] =
+ { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359,
+ 22362, 24127, 25702, 27752, 29450, 31171) },
+ { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367,
+ 18452, 19422, 22839, 26127, 29629) },
+ { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332,
+ 24520, 27470, 29456, 30529, 31656) },
+ { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163,
+ 20961, 22884, 24471, 26719, 28714, 30877) },
+ { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730,
+ 18114, 19313, 22521, 26012, 29550) },
+ { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270,
+ 20533, 23434, 25972, 27944, 29570, 31416) },
+ { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638,
+ 22038, 23963, 25311, 26988, 28766, 31012) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284,
+ 24985, 25684, 27259, 28883, 30911) },
+ { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057,
+ 27251, 29173, 30089, 30960, 31933) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) } };
+
+static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+ MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) },
+ { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) },
+ { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) },
+ { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) },
+ { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) },
+ { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) },
+ { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) },
+ { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) },
+ { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } };
+
+static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) },
+ { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) },
+ { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) },
+ { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) },
+ { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) },
+ { AOM_CDF2(26879) }
+};
+
+static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { AOM_CDF2(806) },
+ { AOM_CDF2(16662) },
+ { AOM_CDF2(20186) },
+ { AOM_CDF2(26538) }
+ };
+
+static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(26828) },
+ { AOM_CDF2(24035) },
+ { AOM_CDF2(12031) },
+ { AOM_CDF2(10640) },
+ { AOM_CDF2(2901) } };
+
+static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { AOM_CDF2(1198) },
+ { AOM_CDF2(2070) },
+ { AOM_CDF2(9166) },
+ { AOM_CDF2(7499) },
+ { AOM_CDF2(22475) }
+ };
+
+static const aom_cdf_prob
+ default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS -
+ 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } },
+ { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } },
+ { { AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } }
+ };
+
+static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1]
+ [CDF_SIZE(2)] = {
+ { { AOM_CDF2(4897) },
+ { AOM_CDF2(1555) },
+ { AOM_CDF2(4236) },
+ { AOM_CDF2(8650) },
+ { AOM_CDF2(904) },
+ { AOM_CDF2(1444) } },
+ { { AOM_CDF2(16973) },
+ { AOM_CDF2(16751) },
+ { AOM_CDF2(19647) },
+ { AOM_CDF2(24773) },
+ { AOM_CDF2(11014) },
+ { AOM_CDF2(15087) } },
+ { { AOM_CDF2(29744) },
+ { AOM_CDF2(30279) },
+ { AOM_CDF2(31194) },
+ { AOM_CDF2(31895) },
+ { AOM_CDF2(26875) },
+ { AOM_CDF2(30304) } }
+ };
+
+static const aom_cdf_prob
+ default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } },
+ { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } },
+ { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } }
+ };
+
+static const aom_cdf_prob
+ default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } },
+ { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } },
+ { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } }
+ };
+
+static const aom_cdf_prob
+ default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = {
+ { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) },
+ { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) },
+ { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) },
+ { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) },
+ { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) },
+ { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) },
+ { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) }
+ };
+
+static const aom_cdf_prob
+ default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = {
+ { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) },
+ { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) },
+ { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) },
+ { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) },
+ { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) },
+ { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) },
+ { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) }
+ };
+
+static const aom_cdf_prob default_palette_y_mode_cdf
+ [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } },
+ { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } },
+ { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } },
+ { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } },
+ { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } },
+ { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } },
+ { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } }
+ };
+
+static const aom_cdf_prob
+ default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(32461) }, { AOM_CDF2(21488) }
+ };
+
+static const aom_cdf_prob default_palette_y_color_index_cdf
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
+ {
+ { AOM_CDF2(28710) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(10553) },
+ { AOM_CDF2(27036) },
+ { AOM_CDF2(31603) },
+ },
+ {
+ { AOM_CDF3(27877, 30490) },
+ { AOM_CDF3(11532, 25697) },
+ { AOM_CDF3(6544, 30234) },
+ { AOM_CDF3(23018, 28072) },
+ { AOM_CDF3(31915, 32385) },
+ },
+ {
+ { AOM_CDF4(25572, 28046, 30045) },
+ { AOM_CDF4(9478, 21590, 27256) },
+ { AOM_CDF4(7248, 26837, 29824) },
+ { AOM_CDF4(19167, 24486, 28349) },
+ { AOM_CDF4(31400, 31825, 32250) },
+ },
+ {
+ { AOM_CDF5(24779, 26955, 28576, 30282) },
+ { AOM_CDF5(8669, 20364, 24073, 28093) },
+ { AOM_CDF5(4255, 27565, 29377, 31067) },
+ { AOM_CDF5(19864, 23674, 26716, 29530) },
+ { AOM_CDF5(31646, 31893, 32147, 32426) },
+ },
+ {
+ { AOM_CDF6(23132, 25407, 26970, 28435, 30073) },
+ { AOM_CDF6(7443, 17242, 20717, 24762, 27982) },
+ { AOM_CDF6(6300, 24862, 26944, 28784, 30671) },
+ { AOM_CDF6(18916, 22895, 25267, 27435, 29652) },
+ { AOM_CDF6(31270, 31550, 31808, 32059, 32353) },
+ },
+ {
+ { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) },
+ { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) },
+ { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) },
+ { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) },
+ { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) },
+ },
+ {
+ { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
+ { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) },
+ { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) },
+ { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
+ { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
+ },
+ };
+
+static const aom_cdf_prob default_palette_uv_color_index_cdf
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
+ {
+ { AOM_CDF2(29089) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(8713) },
+ { AOM_CDF2(29257) },
+ { AOM_CDF2(31610) },
+ },
+ {
+ { AOM_CDF3(25257, 29145) },
+ { AOM_CDF3(12287, 27293) },
+ { AOM_CDF3(7033, 27960) },
+ { AOM_CDF3(20145, 25405) },
+ { AOM_CDF3(30608, 31639) },
+ },
+ {
+ { AOM_CDF4(24210, 27175, 29903) },
+ { AOM_CDF4(9888, 22386, 27214) },
+ { AOM_CDF4(5901, 26053, 29293) },
+ { AOM_CDF4(18318, 22152, 28333) },
+ { AOM_CDF4(30459, 31136, 31926) },
+ },
+ {
+ { AOM_CDF5(22980, 25479, 27781, 29986) },
+ { AOM_CDF5(8413, 21408, 24859, 28874) },
+ { AOM_CDF5(2257, 29449, 30594, 31598) },
+ { AOM_CDF5(19189, 21202, 25915, 28620) },
+ { AOM_CDF5(31844, 32044, 32281, 32518) },
+ },
+ {
+ { AOM_CDF6(22217, 24567, 26637, 28683, 30548) },
+ { AOM_CDF6(7307, 16406, 19636, 24632, 28424) },
+ { AOM_CDF6(4441, 25064, 26879, 28942, 30919) },
+ { AOM_CDF6(17210, 20528, 23319, 26750, 29582) },
+ { AOM_CDF6(30674, 30953, 31396, 31735, 32207) },
+ },
+ {
+ { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) },
+ { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) },
+ { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) },
+ { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) },
+ { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) },
+ },
+ {
+ { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
+ { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) },
+ { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) },
+ { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
+ { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
+ },
+ };
+
+static const aom_cdf_prob
+ default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) },
+ { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) },
+ { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) },
+ { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) },
+ { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) },
+ { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) },
+ { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) }
+ };
+
+static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) }
+};
+
+static const aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } };
+
+static const aom_cdf_prob
+ default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) },
+ { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) }
+ };
+
+static const aom_cdf_prob
+ default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) },
+ { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) }
+ };
+
+static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 30531) };
+
+static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE(
+ FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) };
+
+static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(
+ 2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) },
+ { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) },
+ { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) },
+ { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) },
+ { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } };
+
+static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE(
+ RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) };
+
+static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 11570) };
+
+static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 16855) };
+
+static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = {
+ AOM_CDF4(28160, 32120, 32677)
+};
+
+static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(
+ DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) } };
+static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = {
+ AOM_CDF4(28160, 32120, 32677)
+};
+
+// FIXME(someone) need real defaults here
+static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = {
+ AOM_CDF8(4096, 8192, 12288, 16384, 20480, 24576, 28672)
+};
+
+static const aom_cdf_prob
+ default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = {
+ { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }
+ };
+
+static const aom_cdf_prob
+ default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE(
+ MAX_SEGMENTS)] = {
+ {
+ AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533),
+ },
+ {
+ AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344),
+ },
+ {
+ AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679),
+ },
+ };
+
+static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
+ [CDF_SIZE(MAX_TX_DEPTH + 1)] = {
+ { { AOM_CDF2(19968) },
+ { AOM_CDF2(19968) },
+ { AOM_CDF2(24320) } },
+ { { AOM_CDF3(12272, 30172) },
+ { AOM_CDF3(12272, 30172) },
+ { AOM_CDF3(18677, 30848) } },
+ { { AOM_CDF3(12986, 15180) },
+ { AOM_CDF3(12986, 15180) },
+ { AOM_CDF3(24302, 25602) } },
+ { { AOM_CDF3(5782, 11475) },
+ { AOM_CDF3(5782, 11475) },
+ { AOM_CDF3(16803, 22759) } },
+ };
+
+#define MAX_COLOR_CONTEXT_HASH 8
+// Negative values are invalid
+static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH +
+ 1] = { -1, -1, 0, -1, -1,
+ 4, 3, 2, 1 };
+
+#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top.
+int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
+ int r, int c, int palette_size,
+ uint8_t *color_order, int *color_idx) {
+ assert(palette_size <= PALETTE_MAX_SIZE);
+ assert(r > 0 || c > 0);
+
+ // Get color indices of neighbors.
+ int color_neighbors[NUM_PALETTE_NEIGHBORS];
+ color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
+ color_neighbors[1] =
+ (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
+ color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
+
+ // The +10 below should not be needed. But we get a warning "array subscript
+ // is above array bounds [-Werror=array-bounds]" without it, possibly due to
+ // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
+ int scores[PALETTE_MAX_SIZE + 10] = { 0 };
+ int i;
+ static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 };
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ if (color_neighbors[i] >= 0) {
+ scores[color_neighbors[i]] += weights[i];
+ }
+ }
+
+ int inverse_color_order[PALETTE_MAX_SIZE];
+ for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
+ color_order[i] = i;
+ inverse_color_order[i] = i;
+ }
+
+ // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small).
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ int max = scores[i];
+ int max_idx = i;
+ for (int j = i + 1; j < palette_size; ++j) {
+ if (scores[j] > max) {
+ max = scores[j];
+ max_idx = j;
+ }
+ }
+ if (max_idx != i) {
+ // Move the score at index 'max_idx' to index 'i', and shift the scores
+ // from 'i' to 'max_idx - 1' by 1.
+ const int max_score = scores[max_idx];
+ const uint8_t max_color_order = color_order[max_idx];
+ for (int k = max_idx; k > i; --k) {
+ scores[k] = scores[k - 1];
+ color_order[k] = color_order[k - 1];
+ inverse_color_order[color_order[k]] = k;
+ }
+ scores[i] = max_score;
+ color_order[i] = max_color_order;
+ inverse_color_order[color_order[i]] = i;
+ }
+ }
+
+ if (color_idx != NULL)
+ *color_idx = inverse_color_order[color_map[r * stride + c]];
+
+ // Get hash value of context.
+ int color_index_ctx_hash = 0;
+ static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ color_index_ctx_hash += scores[i] * hash_multipliers[i];
+ }
+ assert(color_index_ctx_hash > 0);
+ assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+ // Lookup context from hash.
+ const int color_index_ctx =
+ palette_color_index_context_lookup[color_index_ctx_hash];
+ assert(color_index_ctx >= 0);
+ assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+ return color_index_ctx;
+}
+#undef NUM_PALETTE_NEIGHBORS
+#undef MAX_COLOR_CONTEXT_HASH
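+
+// Editorial sketch (not part of the upstream source): a worked example of the
+// context derivation above. For a 2x2 color_map of { 0, 1, 0, 2 } (stride 2)
+// and palette_size 3, the neighbors of (r = 1, c = 1) are left = 0,
+// top-left = 0 and top = 1, giving scores { 3, 2, 0 }, a hash of
+// 3 * 1 + 2 * 2 + 0 * 2 = 7 and therefore context 2 from the lookup table.
+#if 0  // Illustrative example only; not compiled.
+static void palette_ctx_example(void) {
+  const uint8_t color_map[4] = { 0, 1, 0, 2 };
+  uint8_t color_order[PALETTE_MAX_SIZE];
+  int color_idx;
+  const int ctx = av1_get_palette_color_index_context(color_map, 2, 1, 1, 3,
+                                                      color_order, &color_idx);
+  assert(ctx == 2 && color_idx == 2);
+}
+#endif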
+
+static void init_mode_probs(FRAME_CONTEXT *fc) {
+ av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf);
+ av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf);
+ av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf);
+ av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf);
+ av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf);
+ av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf);
+ av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf);
+ av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf);
+ av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf);
+ av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf);
+ av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf);
+ av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf);
+ av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf);
+ av1_copy(fc->single_ref_cdf, default_single_ref_cdf);
+ av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf);
+ av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs);
+ av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs);
+ av1_copy(fc->newmv_cdf, default_newmv_cdf);
+ av1_copy(fc->zeromv_cdf, default_zeromv_cdf);
+ av1_copy(fc->refmv_cdf, default_refmv_cdf);
+ av1_copy(fc->drl_cdf, default_drl_cdf);
+ av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf);
+ av1_copy(fc->obmc_cdf, default_obmc_cdf);
+ av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf);
+ av1_copy(fc->compound_type_cdf, default_compound_type_cdf);
+ av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf);
+ av1_copy(fc->interintra_cdf, default_interintra_cdf);
+ av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf);
+ av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf);
+ av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf);
+ av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf);
+ av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs);
+ av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf);
+ av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf);
+ av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf);
+ av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf);
+ av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf);
+ av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf);
+ av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf);
+ av1_copy(fc->partition_cdf, default_partition_cdf);
+ av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
+ av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
+ av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
+ av1_copy(fc->skip_cdfs, default_skip_cdfs);
+ av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf);
+ for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++)
+ av1_copy(fc->seg.spatial_pred_seg_cdf[i],
+ default_spatial_pred_seg_tree_cdf[i]);
+ av1_copy(fc->tx_size_cdf, default_tx_size_cdf);
+ av1_copy(fc->delta_q_cdf, default_delta_q_cdf);
+ av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf);
+ av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf);
+ av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf);
+ av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf);
+ av1_copy(fc->intrabc_cdf, default_intrabc_cdf);
+}
+
+void av1_set_default_ref_deltas(int8_t *ref_deltas) {
+ assert(ref_deltas != NULL);
+
+ ref_deltas[INTRA_FRAME] = 1;
+ ref_deltas[LAST_FRAME] = 0;
+ ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME];
+ ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME];
+ ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME];
+ ref_deltas[GOLDEN_FRAME] = -1;
+ ref_deltas[ALTREF2_FRAME] = -1;
+ ref_deltas[ALTREF_FRAME] = -1;
+}
+
+void av1_set_default_mode_deltas(int8_t *mode_deltas) {
+ assert(mode_deltas != NULL);
+
+ mode_deltas[0] = 0;
+ mode_deltas[1] = 0;
+}
+
+static void set_default_lf_deltas(struct loopfilter *lf) {
+ lf->mode_ref_delta_enabled = 1;
+ lf->mode_ref_delta_update = 1;
+
+ av1_set_default_ref_deltas(lf->ref_deltas);
+ av1_set_default_mode_deltas(lf->mode_deltas);
+}
+
+void av1_setup_frame_contexts(AV1_COMMON *cm) {
+  // Store the frame context into a special slot (not associated with any
+  // reference buffer), so that we can set up cm->pre_fc correctly later.
+  // This function must ONLY be called when cm->fc has been initialized with
+  // default probs, either by av1_setup_past_independence or after manually
+  // initializing them.
+ cm->frame_contexts[FRAME_CONTEXT_DEFAULTS] = *cm->fc;
+ if (cm->large_scale_tile) {
+ for (int i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+ }
+}
+
+void av1_setup_past_independence(AV1_COMMON *cm) {
+ // Reset the segment feature data to the default stats:
+ // Features disabled, 0, with delta coding (Default state).
+ av1_clearall_segfeatures(&cm->seg);
+
+ cm->current_frame_seg_map = cm->cur_frame->seg_map;
+
+ if (cm->current_frame_seg_map)
+ memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ // reset mode ref deltas
+ av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
+ av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
+ set_default_lf_deltas(&cm->lf);
+
+ av1_default_coef_probs(cm);
+ init_mode_probs(cm->fc);
+ av1_init_mv_probs(cm);
+ av1_init_lv_map(cm);
+ cm->fc->initialized = 1;
+ av1_setup_frame_contexts(cm);
+
+ // prev_mip will only be allocated in encoder.
+ if (frame_is_intra_only(cm) && cm->prev_mip)
+ memset(cm->prev_mip, 0,
+ cm->mi_stride * cm->mi_rows * sizeof(*cm->prev_mip));
+}
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
new file mode 100644
index 000000000..7047f34d2
--- /dev/null
+++ b/third_party/aom/av1/common/entropymode.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_
+#define AOM_AV1_COMMON_ENTROPYMODE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/filter.h"
+#include "av1/common/seg_common.h"
+#include "aom_dsp/aom_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_SIZE_GROUPS 4
+
+#define TX_SIZE_CONTEXTS 3
+
+#define INTER_OFFSET(mode) ((mode)-NEARESTMV)
+#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV)
+
+// Number of possible contexts for a color index.
+// As can be seen from av1_get_palette_color_index_context(), the possible
+// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). These are mapped to
+// a value from 0 to 4 using the 'palette_color_index_context_lookup' table.
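+// (Editorial note, not from the upstream comment: with neighbor weights
+// {2, 1, 2} and hash multipliers {1, 2, 2}, those sorted score triples hash
+// to 2, 8, 7, 6 and 5 respectively, which the lookup table maps to contexts
+// 0, 1, 2, 3 and 4.)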
+#define PALETTE_COLOR_INDEX_CONTEXTS 5
+
+// Palette Y mode context for a block is determined by the number of
+// neighboring blocks (top and/or left) using a palette for the Y plane. So,
+// the possible Y mode context values are:
+// 0 if neither left nor top block uses palette for Y plane,
+// 1 if exactly one of left or top block uses palette for Y plane, and
+// 2 if both left and top blocks use palette for Y plane.
+#define PALETTE_Y_MODE_CONTEXTS 3
+
+// Palette UV mode context for a block is determined by whether this block uses
+// palette for the Y plane. So, possible values are:
+// 0 if this block doesn't use palette for Y plane.
+// 1 if this block uses palette for Y plane (i.e. Y palette size > 0).
+#define PALETTE_UV_MODE_CONTEXTS 2
+
+// Map the number of pixels in a block size to a context
+// 64(BLOCK_8X8, BLOCK_4X16, BLOCK_16X4) -> 0
+// 128(BLOCK_8X16, BLOCK_16X8) -> 1
+// ...
+// 4096(BLOCK_64X64) -> 6
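+// (Editorial note: equivalently, the context index is log2(pixel count) - 6.)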
+#define PALATTE_BSIZE_CTXS 7
+
+#define KF_MODE_CONTEXTS 5
+
+struct AV1Common;
+
+typedef struct {
+ const int16_t *scan;
+ const int16_t *iscan;
+ const int16_t *neighbors;
+} SCAN_ORDER;
+
+typedef struct frame_contexts {
+ aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]
+ [CDF_SIZE(2)];
+ aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)];
+ aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)];
+ aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)];
+ aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)];
+ aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)];
+ aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)];
+ aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)];
+ aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]
+ [CDF_SIZE(3)];
+ aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
+ [CDF_SIZE(4)];
+ aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+ [CDF_SIZE(BR_CDF_SIZE)];
+
+ aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)];
+
+ aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS]
+ [CDF_SIZE(INTER_COMPOUND_MODES)];
+ aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)];
+ aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)];
+ aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)];
+ aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS]
+ [CDF_SIZE(INTERINTRA_MODES)];
+ aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)];
+ aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)];
+ aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)];
+ aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+ aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+ aom_cdf_prob palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]
+ [CDF_SIZE(2)];
+ aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+ [CDF_SIZE(2)];
+ aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob skip_mode_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)];
+ nmv_context nmvc;
+ nmv_context ndvc;
+ aom_cdf_prob intrabc_cdf[CDF_SIZE(2)];
+ struct segmentation_probs seg;
+ aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)];
+ aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)];
+ aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)];
+ aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)];
+ aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)];
+ aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]
+ [CDF_SIZE(UV_INTRA_MODES)];
+ aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)];
+ aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]
+ [CDF_SIZE(SWITCHABLE_FILTERS)];
+ /* kf_y_cdf is discarded after use, so does not require persistent storage.
+ However, we keep it with the other CDFs in this struct since it needs to
+ be copied to each tile to support parallelism just like the others.
+ */
+ aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]
+ [CDF_SIZE(INTRA_MODES)];
+
+ aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES]
+ [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)];
+
+ aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
+ [CDF_SIZE(MAX_TX_DEPTH + 1)];
+ aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)];
+ aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)];
+ aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)];
+ aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [CDF_SIZE(TX_TYPES)];
+ aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES]
+ [CDF_SIZE(TX_TYPES)];
+ aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)];
+ aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)];
+ int initialized;
+} FRAME_CONTEXT;
+
+static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 },
+ { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 },
+ { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 },
+};
+
+static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 },
+ { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 },
+};
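+
+// Editorial note (not part of the upstream header): for the transform types
+// that are actually present in a given set, the two tables above are inverses
+// of each other. For example, in row 2 DCT_DCT is signalled as symbol 1 and
+// av1_ext_tx_inv[2][1] maps symbol 1 back to DCT_DCT.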
+
+void av1_set_default_ref_deltas(int8_t *ref_deltas);
+void av1_set_default_mode_deltas(int8_t *mode_deltas);
+void av1_setup_frame_contexts(struct AV1Common *cm);
+void av1_setup_past_independence(struct AV1Common *cm);
+
+// Returns (int)ceil(log2(n)).
+// NOTE: This implementation only works for n <= 2^30.
+static INLINE int av1_ceil_log2(int n) {
+ if (n < 2) return 0;
+ int i = 1, p = 2;
+ while (p < n) {
+ i++;
+ p = p << 1;
+ }
+ return i;
+}
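+
+// Editorial sketch (not part of the upstream header): a few spot checks of
+// av1_ceil_log2(), e.g. 1 -> 0, 2 -> 1, 9 -> 4 and (1 << 30) -> 30.
+#if 0  // Illustrative example only; not compiled.
+static INLINE void av1_ceil_log2_example(void) {
+  assert(av1_ceil_log2(1) == 0);
+  assert(av1_ceil_log2(2) == 1);
+  assert(av1_ceil_log2(9) == 4);
+  assert(av1_ceil_log2(1 << 30) == 30);
+}
+#endif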
+
+// Returns the context for palette color index at row 'r' and column 'c',
+// along with the 'color_order' of neighbors and the 'color_idx'.
+// The 'color_map' is a 2D array with the given 'stride'.
+int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
+ int r, int c, int palette_size,
+ uint8_t *color_order, int *color_idx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPYMODE_H_
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
new file mode 100644
index 000000000..491337387
--- /dev/null
+++ b/third_party/aom/av1/common/entropymv.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/entropymv.h"
+
+static const nmv_context default_nmv_context = {
+ { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf
+ { {
+ // Vertical component
+ { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757,
+ 32762, 32767) }, // class_cdf // fp
+ { { AOM_CDF4(16384, 24576, 26624) },
+ { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf
+ { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf
+ { AOM_CDF2(128 * 128) }, // sign_cdf
+ { AOM_CDF2(160 * 128) }, // class0_hp_cdf
+ { AOM_CDF2(128 * 128) }, // hp_cdf
+ { AOM_CDF2(216 * 128) }, // class0_cdf
+ { { AOM_CDF2(128 * 136) },
+ { AOM_CDF2(128 * 140) },
+ { AOM_CDF2(128 * 148) },
+ { AOM_CDF2(128 * 160) },
+ { AOM_CDF2(128 * 176) },
+ { AOM_CDF2(128 * 192) },
+ { AOM_CDF2(128 * 224) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 240) } }, // bits_cdf
+ },
+ {
+ // Horizontal component
+ { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757,
+ 32762, 32767) }, // class_cdf // fp
+ { { AOM_CDF4(16384, 24576, 26624) },
+ { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf
+ { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf
+ { AOM_CDF2(128 * 128) }, // sign_cdf
+ { AOM_CDF2(160 * 128) }, // class0_hp_cdf
+ { AOM_CDF2(128 * 128) }, // hp_cdf
+ { AOM_CDF2(216 * 128) }, // class0_cdf
+ { { AOM_CDF2(128 * 136) },
+ { AOM_CDF2(128 * 140) },
+ { AOM_CDF2(128 * 148) },
+ { AOM_CDF2(128 * 160) },
+ { AOM_CDF2(128 * 176) },
+ { AOM_CDF2(128 * 192) },
+ { AOM_CDF2(128 * 224) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 240) } }, // bits_cdf
+ } },
+};
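+
+// Editorial note (not part of the upstream source): the `128 * N` spellings
+// above express an N/256 probability on the 32768-wide CDF scale, e.g.
+// AOM_CDF2(128 * 128) is an even 50/50 split and AOM_CDF2(216 * 128) gives
+// the first symbol a probability of 216/256.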
+
+void av1_init_mv_probs(AV1_COMMON *cm) {
+ // NB: this sets CDFs too
+ cm->fc->nmvc = default_nmv_context;
+ cm->fc->ndvc = default_nmv_context;
+}
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
new file mode 100644
index 000000000..fa818a2c1
--- /dev/null
+++ b/third_party/aom/av1/common/entropymv.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPYMV_H_
+#define AOM_AV1_COMMON_ENTROPYMV_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/prob.h"
+
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+
+void av1_init_mv_probs(struct AV1Common *cm);
+
+#define MV_UPDATE_PROB 252
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS 4
+typedef enum {
+ MV_JOINT_ZERO = 0, /* Zero vector */
+ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */
+ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */
+ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
+} MV_JOINT_TYPE;
+
+static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
+ return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
+}
+
+static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
+ return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
+}
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES 11
+typedef enum {
+ MV_CLASS_0 = 0, /* (0, 2] integer pel */
+ MV_CLASS_1 = 1, /* (2, 4] integer pel */
+ MV_CLASS_2 = 2, /* (4, 8] integer pel */
+ MV_CLASS_3 = 3, /* (8, 16] integer pel */
+ MV_CLASS_4 = 4, /* (16, 32] integer pel */
+ MV_CLASS_5 = 5, /* (32, 64] integer pel */
+ MV_CLASS_6 = 6, /* (64, 128] integer pel */
+ MV_CLASS_7 = 7, /* (128, 256] integer pel */
+ MV_CLASS_8 = 8, /* (256, 512] integer pel */
+ MV_CLASS_9 = 9, /* (512, 1024] integer pel */
+ MV_CLASS_10 = 10, /* (1024,2048] integer pel */
+} MV_CLASS_TYPE;
+
+#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
+#define CLASS0_SIZE (1 << CLASS0_BITS)
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+#define MV_BITS_CONTEXTS 6
+#define MV_FP_SIZE 4
+
+#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2)
+#define MV_MAX ((1 << MV_MAX_BITS) - 1)
+#define MV_VALS ((MV_MAX << 1) + 1)
+
+#define MV_IN_USE_BITS 14
+#define MV_UPP (1 << MV_IN_USE_BITS)
+#define MV_LOW (-(1 << MV_IN_USE_BITS))
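+
+// Editorial note (not part of the upstream header): with MV_CLASSES == 11 and
+// CLASS0_BITS == 1, the derived constants evaluate to MV_OFFSET_BITS == 10,
+// MV_MAX_BITS == 14, MV_MAX == 16383, MV_VALS == 32767, MV_UPP == 16384 and
+// MV_LOW == -16384.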
+
+typedef struct {
+ aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)];
+ aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)];
+ aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)];
+ aom_cdf_prob sign_cdf[CDF_SIZE(2)];
+ aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)];
+ aom_cdf_prob hp_cdf[CDF_SIZE(2)];
+ aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)];
+ aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)];
+} nmv_component;
+
+typedef struct {
+ aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)];
+ nmv_component comps[2];
+} nmv_context;
+
+typedef enum {
+ MV_SUBPEL_NONE = -1,
+ MV_SUBPEL_LOW_PRECISION = 0,
+ MV_SUBPEL_HIGH_PRECISION,
+} MvSubpelPrecision;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPYMV_H_
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
new file mode 100644
index 000000000..869c06ef2
--- /dev/null
+++ b/third_party/aom/av1/common/enums.h
@@ -0,0 +1,619 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENUMS_H_
+#define AOM_AV1_COMMON_ENUMS_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef MAX_SB_SIZE
+
+// Max superblock size
+#define MAX_SB_SIZE_LOG2 7
+#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+// Min superblock size
+#define MIN_SB_SIZE_LOG2 6
+
+// Pixels per Mode Info (MI) unit
+#define MI_SIZE_LOG2 2
+#define MI_SIZE (1 << MI_SIZE_LOG2)
+
+// MI-units per max superblock (MI Block - MIB)
+#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
+
+// MI-units per min superblock
+#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+
+// Mask to extract MI offset within max MIB
+#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
+
+// Maximum number of tile rows and tile columns
+#define MAX_TILE_ROWS 64
+#define MAX_TILE_COLS 64
+
+#define MAX_VARTX_DEPTH 2
+
+#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
+#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2)
+
+#define MAX_PALETTE_SQUARE (64 * 64)
+// Maximum number of colors in a palette.
+#define PALETTE_MAX_SIZE 8
+// Minimum number of colors in a palette.
+#define PALETTE_MIN_SIZE 2
+
+#define FRAME_OFFSET_BITS 5
+#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
+
+#define REF_FRAMES_LOG2 3
+#define REF_FRAMES (1 << REF_FRAMES_LOG2)
+
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
+// of framebuffers.
+// TODO(jkoleszar): These 3 extra references could probably come from the
+// normal reference pool.
+#define FRAME_BUFFERS (REF_FRAMES + 7)
+
+// 4 frame filter levels: y plane vertical, y plane horizontal,
+// u plane, and v plane
+#define FRAME_LF_COUNT 4
+#define DEFAULT_DELTA_LF_MULTI 0
+#define MAX_MODE_LF_DELTAS 2
+
+#define DIST_PRECISION_BITS 4
+#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16
+
+// TODO(chengchen): Temporary flag serving as an experimental flag for WIP
+// bitmask construction.
+// Shall be removed when the bitmask code is completely checked in.
+#define LOOP_FILTER_BITMASK 0
+
+#define PROFILE_BITS 3
+// The following three profiles are currently defined.
+// Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only.
+// Profile 1. 8-bit and 10-bit 4:4:4
+// Profile 2. 8-bit and 10-bit 4:2:2
+// 12-bit 4:0:0, 4:2:2 and 4:4:4
+// Since we have three bits for the profiles, it can be extended later.
+typedef enum BITSTREAM_PROFILE {
+ PROFILE_0,
+ PROFILE_1,
+ PROFILE_2,
+ MAX_PROFILES,
+} BITSTREAM_PROFILE;
+
+#define LEVEL_MAJOR_BITS 3
+#define LEVEL_MINOR_BITS 2
+#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS)
+
+#define LEVEL_MAJOR_MIN 2
+#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN)
+#define LEVEL_MINOR_MIN 0
+#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1)
+
+#define OP_POINTS_CNT_MINUS_1_BITS 5
+#define OP_POINTS_IDC_BITS 12
+
+// Note: Some enums use the attribute 'packed' to use the smallest possible
+// integer type, so that we can save memory when they are used in
+// structs/arrays.
+
+typedef enum ATTRIBUTE_PACKED {
+ BLOCK_4X4,
+ BLOCK_4X8,
+ BLOCK_8X4,
+ BLOCK_8X8,
+ BLOCK_8X16,
+ BLOCK_16X8,
+ BLOCK_16X16,
+ BLOCK_16X32,
+ BLOCK_32X16,
+ BLOCK_32X32,
+ BLOCK_32X64,
+ BLOCK_64X32,
+ BLOCK_64X64,
+ BLOCK_64X128,
+ BLOCK_128X64,
+ BLOCK_128X128,
+ BLOCK_4X16,
+ BLOCK_16X4,
+ BLOCK_8X32,
+ BLOCK_32X8,
+ BLOCK_16X64,
+ BLOCK_64X16,
+ BLOCK_SIZES_ALL,
+ BLOCK_SIZES = BLOCK_4X16,
+ BLOCK_INVALID = 255,
+ BLOCK_LARGEST = (BLOCK_SIZES - 1)
+} BLOCK_SIZE;
+
+// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
+#define SQR_BLOCK_SIZES 6
+
+typedef enum ATTRIBUTE_PACKED {
+ PARTITION_NONE,
+ PARTITION_HORZ,
+ PARTITION_VERT,
+ PARTITION_SPLIT,
+ PARTITION_HORZ_A, // HORZ split and the top partition is split again
+ PARTITION_HORZ_B, // HORZ split and the bottom partition is split again
+ PARTITION_VERT_A, // VERT split and the left partition is split again
+ PARTITION_VERT_B, // VERT split and the right partition is split again
+ PARTITION_HORZ_4, // 4:1 horizontal partition
+ PARTITION_VERT_4, // 4:1 vertical partition
+ EXT_PARTITION_TYPES,
+ PARTITION_TYPES = PARTITION_SPLIT + 1,
+ PARTITION_INVALID = 255
+} PARTITION_TYPE;
+
+typedef char PARTITION_CONTEXT;
+#define PARTITION_PLOFFSET 4 // number of probability models per block size
+#define PARTITION_BLOCK_SIZES 5
+#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
+
+// block transform size
+#if defined(_MSC_VER)
+typedef uint8_t TX_SIZE;
+enum ATTRIBUTE_PACKED {
+#else
+typedef enum ATTRIBUTE_PACKED {
+#endif
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_64X64, // 64x64 transform
+ TX_4X8, // 4x8 transform
+ TX_8X4, // 8x4 transform
+ TX_8X16, // 8x16 transform
+ TX_16X8, // 16x8 transform
+ TX_16X32, // 16x32 transform
+ TX_32X16, // 32x16 transform
+ TX_32X64, // 32x64 transform
+ TX_64X32, // 64x32 transform
+ TX_4X16, // 4x16 transform
+ TX_16X4, // 16x4 transform
+ TX_8X32, // 8x32 transform
+ TX_32X8, // 32x8 transform
+ TX_16X64, // 16x64 transform
+ TX_64X16, // 64x16 transform
+ TX_SIZES_ALL, // Includes rectangular transforms
+ TX_SIZES = TX_4X8, // Does NOT include rectangular transforms
+ TX_SIZES_LARGEST = TX_64X64,
+ TX_INVALID = 255 // Invalid transform size
+#if defined(_MSC_VER)
+};
+#else
+} TX_SIZE;
+#endif
+
+#define TX_SIZE_LUMA_MIN (TX_4X4)
+/* We don't need to code a transform size unless the allowed size is at least
+ one more than the minimum. */
+#define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1)
+
+// Maximum tx_size categories
+#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN)
+#define MAX_TX_DEPTH 2
+
+#define MAX_TX_SIZE_LOG2 (6)
+#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
+#define MIN_TX_SIZE_LOG2 2
+#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
+#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE)
+
+// Pad 4 extra columns to remove horizontal availability check.
+#define TX_PAD_HOR_LOG2 2
+#define TX_PAD_HOR 4
+// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability
+// check.
+#define TX_PAD_TOP 2
+#define TX_PAD_BOTTOM 4
+#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
+// Pad 16 extra bytes to avoid reading overflow in SIMD optimization.
+#define TX_PAD_END 16
+#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END)
+
+// Number of maximum size transform blocks in the maximum size superblock
+#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
+#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
+
+// frame transform mode
+typedef enum ATTRIBUTE_PACKED {
+ ONLY_4X4, // use only 4x4 transform
+ TX_MODE_LARGEST, // transform size is the largest possible for pu size
+ TX_MODE_SELECT, // transform specified for each block
+ TX_MODES,
+} TX_MODE;
+
+// 1D tx types
+typedef enum ATTRIBUTE_PACKED {
+ DCT_1D,
+ ADST_1D,
+ FLIPADST_1D,
+ IDTX_1D,
+ TX_TYPES_1D,
+} TX_TYPE_1D;
+
+typedef enum ATTRIBUTE_PACKED {
+ DCT_DCT, // DCT in both horizontal and vertical
+ ADST_DCT, // ADST in vertical, DCT in horizontal
+ DCT_ADST, // DCT in vertical, ADST in horizontal
+ ADST_ADST, // ADST in both directions
+ FLIPADST_DCT,
+ DCT_FLIPADST,
+ FLIPADST_FLIPADST,
+ ADST_FLIPADST,
+ FLIPADST_ADST,
+ IDTX,
+ V_DCT,
+ H_DCT,
+ V_ADST,
+ H_ADST,
+ V_FLIPADST,
+ H_FLIPADST,
+ TX_TYPES,
+} TX_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ REG_REG,
+ REG_SMOOTH,
+ REG_SHARP,
+ SMOOTH_REG,
+ SMOOTH_SMOOTH,
+ SMOOTH_SHARP,
+ SHARP_REG,
+ SHARP_SMOOTH,
+ SHARP_SHARP,
+} DUAL_FILTER_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ // DCT only
+ EXT_TX_SET_DCTONLY,
+ // DCT + Identity only
+ EXT_TX_SET_DCT_IDTX,
+ // Discrete Trig transforms w/o flip (4) + Identity (1)
+ EXT_TX_SET_DTT4_IDTX,
+ // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_TYPES
+} TxSetType;
+
+#define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX)
+
+#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
+#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER
+#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA
+
+typedef enum ATTRIBUTE_PACKED {
+ AOM_LAST_FLAG = 1 << 0,
+ AOM_LAST2_FLAG = 1 << 1,
+ AOM_LAST3_FLAG = 1 << 2,
+ AOM_GOLD_FLAG = 1 << 3,
+ AOM_BWD_FLAG = 1 << 4,
+ AOM_ALT2_FLAG = 1 << 5,
+ AOM_ALT_FLAG = 1 << 6,
+ AOM_REFFRAME_ALL = (1 << 7) - 1
+} AOM_REFFRAME;
+
+typedef enum ATTRIBUTE_PACKED {
+ UNIDIR_COMP_REFERENCE,
+ BIDIR_COMP_REFERENCE,
+ COMP_REFERENCE_TYPES,
+} COMP_REFERENCE_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ PLANE_TYPE_Y,
+ PLANE_TYPE_UV,
+ PLANE_TYPES
+} PLANE_TYPE;
+
+#define CFL_ALPHABET_SIZE_LOG2 4
+#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
+#define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1)
+#define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
+#define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1))
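+// (Editorial note: a joint CfL alpha index therefore packs the U index into
+// its high 4 bits and the V index into its low 4 bits.)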
+
+typedef enum ATTRIBUTE_PACKED {
+ CFL_PRED_U,
+ CFL_PRED_V,
+ CFL_PRED_PLANES
+} CFL_PRED_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ CFL_SIGN_ZERO,
+ CFL_SIGN_NEG,
+ CFL_SIGN_POS,
+ CFL_SIGNS
+} CFL_SIGN_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ CFL_DISALLOWED,
+ CFL_ALLOWED,
+ CFL_ALLOWED_TYPES
+} CFL_ALLOWED_TYPE;
+
+// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid
+#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1)
+// CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8
+#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5)
+// CFL_SIGN_V is equivalent to (js + 1) % 3 for js in 0 to 8
+#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js))
+
+// There is no context when the alpha for a given plane is zero.
+// So there are 2 fewer contexts than joint signs.
+#define CFL_ALPHA_CONTEXTS (CFL_JOINT_SIGNS + 1 - CFL_SIGNS)
+#define CFL_CONTEXT_U(js) (js + 1 - CFL_SIGNS)
+// Also, the contexts are symmetric under swapping the planes.
+#define CFL_CONTEXT_V(js) \
+ (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
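+
+// Editorial check (not part of the upstream header): for js in 0..7,
+// ((js + 1) * 11) >> 5 equals (js + 1) / 3, so CFL_SIGN_U and CFL_SIGN_V
+// reproduce the quotient/remainder decomposition described above. E.g. for
+// js = 4: CFL_SIGN_U(4) = (5 * 11) >> 5 = 1 and CFL_SIGN_V(4) = 5 - 3 * 1 = 2.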
+
+typedef enum ATTRIBUTE_PACKED {
+ PALETTE_MAP,
+ COLOR_MAP_TYPES,
+} COLOR_MAP_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ TWO_COLORS,
+ THREE_COLORS,
+ FOUR_COLORS,
+ FIVE_COLORS,
+ SIX_COLORS,
+ SEVEN_COLORS,
+ EIGHT_COLORS,
+ PALETTE_SIZES
+} PALETTE_SIZE;
+
+typedef enum ATTRIBUTE_PACKED {
+ PALETTE_COLOR_ONE,
+ PALETTE_COLOR_TWO,
+ PALETTE_COLOR_THREE,
+ PALETTE_COLOR_FOUR,
+ PALETTE_COLOR_FIVE,
+ PALETTE_COLOR_SIX,
+ PALETTE_COLOR_SEVEN,
+ PALETTE_COLOR_EIGHT,
+ PALETTE_COLORS
+} PALETTE_COLOR;
+
+// Note: All directional predictors must be between V_PRED and D67_PRED (both
+// inclusive).
+typedef enum ATTRIBUTE_PACKED {
+ DC_PRED, // Average of above and left pixels
+ V_PRED, // Vertical
+ H_PRED, // Horizontal
+ D45_PRED, // Directional 45 degree
+ D135_PRED, // Directional 135 degree
+ D113_PRED, // Directional 113 degree
+ D157_PRED, // Directional 157 degree
+ D203_PRED, // Directional 203 degree
+ D67_PRED, // Directional 67 degree
+ SMOOTH_PRED, // Combination of horizontal and vertical interpolation
+ SMOOTH_V_PRED, // Vertical interpolation
+ SMOOTH_H_PRED, // Horizontal interpolation
+ PAETH_PRED, // Predict from the direction of smallest gradient
+ NEARESTMV,
+ NEARMV,
+ GLOBALMV,
+ NEWMV,
+ // Compound ref compound modes
+ NEAREST_NEARESTMV,
+ NEAR_NEARMV,
+ NEAREST_NEWMV,
+ NEW_NEARESTMV,
+ NEAR_NEWMV,
+ NEW_NEARMV,
+ GLOBAL_GLOBALMV,
+ NEW_NEWMV,
+ MB_MODE_COUNT,
+ INTRA_MODE_START = DC_PRED,
+ INTRA_MODE_END = NEARESTMV,
+ INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START,
+ SINGLE_INTER_MODE_START = NEARESTMV,
+ SINGLE_INTER_MODE_END = NEAREST_NEARESTMV,
+ SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START,
+ COMP_INTER_MODE_START = NEAREST_NEARESTMV,
+ COMP_INTER_MODE_END = MB_MODE_COUNT,
+ COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START,
+ INTER_MODE_START = NEARESTMV,
+ INTER_MODE_END = MB_MODE_COUNT,
+ INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode.
+ INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks
+} PREDICTION_MODE;
+
+// TODO(ltrudeau) Do we really want to pack this?
+// TODO(ltrudeau) Do we match with PREDICTION_MODE?
+typedef enum ATTRIBUTE_PACKED {
+ UV_DC_PRED, // Average of above and left pixels
+ UV_V_PRED, // Vertical
+ UV_H_PRED, // Horizontal
+ UV_D45_PRED, // Directional 45 degree
+ UV_D135_PRED, // Directional 135 degree
+ UV_D113_PRED, // Directional 113 degree
+ UV_D157_PRED, // Directional 157 degree
+ UV_D203_PRED, // Directional 203 degree
+ UV_D67_PRED, // Directional 67 degree
+ UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation
+ UV_SMOOTH_V_PRED, // Vertical interpolation
+ UV_SMOOTH_H_PRED, // Horizontal interpolation
+ UV_PAETH_PRED, // Predict from the direction of smallest gradient
+ UV_CFL_PRED, // Chroma-from-Luma
+ UV_INTRA_MODES,
+ UV_MODE_INVALID, // For uv_mode in inter blocks
+} UV_PREDICTION_MODE;
+
+typedef enum ATTRIBUTE_PACKED {
+ SIMPLE_TRANSLATION,
+ OBMC_CAUSAL, // 2-sided OBMC
+ WARPED_CAUSAL, // 2-sided WARPED
+ MOTION_MODES
+} MOTION_MODE;
+
+typedef enum ATTRIBUTE_PACKED {
+ II_DC_PRED,
+ II_V_PRED,
+ II_H_PRED,
+ II_SMOOTH_PRED,
+ INTERINTRA_MODES
+} INTERINTRA_MODE;
+
+typedef enum ATTRIBUTE_PACKED {
+ COMPOUND_AVERAGE,
+ COMPOUND_WEDGE,
+ COMPOUND_DIFFWTD,
+ COMPOUND_TYPES,
+} COMPOUND_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ FILTER_DC_PRED,
+ FILTER_V_PRED,
+ FILTER_H_PRED,
+ FILTER_D157_PRED,
+ FILTER_PAETH_PRED,
+ FILTER_INTRA_MODES,
+} FILTER_INTRA_MODE;
+
+#define DIRECTIONAL_MODES 8
+#define MAX_ANGLE_DELTA 3
+#define ANGLE_STEP 3
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+
+#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV)
+
+#define SKIP_CONTEXTS 3
+#define SKIP_MODE_CONTEXTS 3
+
+#define COMP_INDEX_CONTEXTS 6
+#define COMP_GROUP_IDX_CONTEXTS 6
+
+#define NMV_CONTEXTS 3
+
+#define NEWMV_MODE_CONTEXTS 6
+#define GLOBALMV_MODE_CONTEXTS 2
+#define REFMV_MODE_CONTEXTS 6
+#define DRL_MODE_CONTEXTS 3
+
+#define GLOBALMV_OFFSET 3
+#define REFMV_OFFSET 4
+
+#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
+#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
+#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
+
+#define COMP_NEWMV_CTXS 5
+#define INTER_MODE_CONTEXTS 8
+
+#define DELTA_Q_SMALL 3
+#define DELTA_Q_PROBS (DELTA_Q_SMALL)
+#define DEFAULT_DELTA_Q_RES 4
+#define DELTA_LF_SMALL 3
+#define DELTA_LF_PROBS (DELTA_LF_SMALL)
+#define DEFAULT_DELTA_LF_RES 2
+
+/* Segment Feature Masks */
+#define MAX_MV_REF_CANDIDATES 2
+
+#define MAX_REF_MV_STACK_SIZE 8
+#define REF_CAT_LEVEL 640
+
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 3
+
+#define COMP_REF_TYPE_CONTEXTS 5
+#define UNI_COMP_REF_CONTEXTS 3
+
+#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3)
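+// (Editorial note: this evaluates to (5 - 1) * 6 - 3 = 21 contexts, matching
+// the 21 entries of default_txfm_partition_cdf defined earlier in this patch.)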
+typedef uint8_t TXFM_CONTEXT;
+
+#define NONE_FRAME -1
+#define INTRA_FRAME 0
+#define LAST_FRAME 1
+#define LAST2_FRAME 2
+#define LAST3_FRAME 3
+#define GOLDEN_FRAME 4
+#define BWDREF_FRAME 5
+#define ALTREF2_FRAME 6
+#define ALTREF_FRAME 7
+#define EXTREF_FRAME REF_FRAMES
+#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
+
+#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
+
+#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
+#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
+#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
+#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
+
+#define SINGLE_REFS (FWD_REFS + BWD_REFS)
+
+typedef enum ATTRIBUTE_PACKED {
+ LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME }
+ LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME }
+ LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME }
+ BWDREF_ALTREF_FRAMES, // { BWDREF_FRAME, ALTREF_FRAME }
+ LAST2_LAST3_FRAMES, // { LAST2_FRAME, LAST3_FRAME }
+ LAST2_GOLDEN_FRAMES, // { LAST2_FRAME, GOLDEN_FRAME }
+ LAST3_GOLDEN_FRAMES, // { LAST3_FRAME, GOLDEN_FRAME }
+ BWDREF_ALTREF2_FRAMES, // { BWDREF_FRAME, ALTREF2_FRAME }
+ ALTREF2_ALTREF_FRAMES, // { ALTREF2_FRAME, ALTREF_FRAME }
+ TOTAL_UNIDIR_COMP_REFS,
+ // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
+ // that are explicitly signaled.
+ UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
+} UNIDIR_COMP_REF;
+
+#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
+
+#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS)
+
+// NOTE: A limited number of unidirectional reference pairs can be signalled for
+// compound prediction. The use of skip mode, on the other hand, makes it
+// possible to have a reference pair not listed for explicit signaling.
+#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS)
+
+typedef enum ATTRIBUTE_PACKED {
+ RESTORE_NONE,
+ RESTORE_WIENER,
+ RESTORE_SGRPROJ,
+ RESTORE_SWITCHABLE,
+ RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
+ RESTORE_TYPES = 4,
+} RestorationType;
+
+#define SUPERRES_SCALE_BITS 3
+#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
+
+// In large_scale_tile coding, external references are used.
+#define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENUMS_H_
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
new file mode 100644
index 000000000..571422d11
--- /dev/null
+++ b/third_party/aom/av1/common/filter.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_FILTER_TAP 8
+
+typedef enum ATTRIBUTE_PACKED {
+ EIGHTTAP_REGULAR,
+ EIGHTTAP_SMOOTH,
+ MULTITAP_SHARP,
+ BILINEAR,
+ INTERP_FILTERS_ALL,
+ SWITCHABLE_FILTERS = BILINEAR,
+ SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
+ EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+} InterpFilter;
+
+// With CONFIG_DUAL_FILTER, pack two InterpFilter values into a uint32_t:
+// since there are at most 10 filters, we can use 16 bits for each and have
+// more than enough space. This reduces argument passing and unifies the
+// operation of setting a (pair of) filters.
+typedef uint32_t InterpFilters;
+static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters,
+ int x_filter) {
+ return (InterpFilter)((filters >> (x_filter ? 16 : 0)) & 0xf);
+}
+
+static INLINE InterpFilters av1_make_interp_filters(InterpFilter y_filter,
+ InterpFilter x_filter) {
+ uint16_t y16 = y_filter & 0xf;
+ uint16_t x16 = x_filter & 0xf;
+ return y16 | ((uint32_t)x16 << 16);
+}
+
+static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) {
+ return av1_make_interp_filters(filter, filter);
+}
+
+static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
+ return filter == SWITCHABLE ? EIGHTTAP_REGULAR : filter;
+}
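+
+// Editorial sketch (not part of the upstream header): round-tripping a filter
+// pair through the packed representation described above; the horizontal (x)
+// filter lives in the high 16 bits and the vertical (y) filter in the low 16.
+#if 0  // Illustrative example only; not compiled.
+static INLINE void av1_interp_filters_example(void) {
+  const InterpFilters filters =
+      av1_make_interp_filters(EIGHTTAP_SMOOTH, MULTITAP_SHARP);
+  assert(av1_extract_interp_filter(filters, /*x_filter=*/1) == MULTITAP_SHARP);
+  assert(av1_extract_interp_filter(filters, /*x_filter=*/0) == EIGHTTAP_SMOOTH);
+}
+#endif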
+
+/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define LOG_SWITCHABLE_FILTERS 2
+
+#define MAX_SUBPEL_TAPS 12
+#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
+#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
+#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
+
+typedef struct InterpFilterParams {
+ const int16_t *filter_ptr;
+ uint16_t taps;
+ uint16_t subpel_shifts;
+ InterpFilter interp_filter;
+} InterpFilterParams;
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_bilinear_filters[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
+ { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+ { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
+ { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
+ { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
+ { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
+ { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
+ { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
+ { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
+ { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+ { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
+ { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
+ { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
+ { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+ { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
+ { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
+ { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+ { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+ { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+ { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+ { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+ { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
+ { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
+};
+
+static const InterpFilterParams
+ av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
+ SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ MULTITAP_SHARP },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR }
+ };
+
+// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full-pixel
+// MVs for luma. If sub-sampling exists, chroma may use half-pel MVs.
+DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
+ 64,
+ 64,
+};
+
+static const InterpFilterParams av1_intrabc_filter_params = {
+ av1_intrabc_bilinear_filter, 2, 0, BILINEAR
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
+ { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 },
+ { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
+ { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+ { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+ { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
+ { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
+ { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 }
+};
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
+ { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
+};
+
+// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
+static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR },
+};
+
+static INLINE const InterpFilterParams *
+av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
+ const int w) {
+ if (w <= 4) return &av1_interp_4tap[interp_filter];
+ return &av1_interp_filter_params_list[interp_filter];
+}
+
+static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params(
+ const InterpFilter interp_filter) {
+ return &av1_interp_4tap[interp_filter];
+}
+
+static INLINE const int16_t *av1_get_interp_filter_kernel(
+ const InterpFilter interp_filter) {
+ return av1_interp_filter_params_list[interp_filter].filter_ptr;
+}
+
+static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
+ const InterpFilterParams *const filter_params, const int subpel) {
+ return filter_params->filter_ptr + filter_params->taps * subpel;
+}
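+
+// Illustrative note (editorial, not part of the upstream libaom sources): a
+// small sketch of how the lookups above compose, using only the tables and
+// helpers defined in this header.
+//
+//   const InterpFilterParams *p =
+//       av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
+//   // p points at the 8-tap regular entry of av1_interp_filter_params_list.
+//   const int16_t *k = av1_get_interp_filter_subpel_kernel(p, /*subpel=*/8);
+//   // k is the half-pel phase of av1_sub_pel_filters_8:
+//   //   { 0, 2, -14, 76, 76, -14, 2, 0 }, which sums to 128 (unity gain in
+//   //   the tables' fixed-point scale).
+//   // For w <= 4 the same lookup would select av1_interp_4tap instead,
+//   // where MULTITAP_SHARP falls back to the regular 4-tap kernels.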
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_FILTER_H_
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
new file mode 100644
index 000000000..fd6c4bc79
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/frame_buffers.h"
+#include "aom_mem/aom_mem.h"
+
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+ assert(list != NULL);
+ av1_free_internal_frame_buffers(list);
+
+ list->num_internal_frame_buffers =
+ AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ list->int_fb = (InternalFrameBuffer *)aom_calloc(
+ list->num_internal_frame_buffers, sizeof(*list->int_fb));
+ return (list->int_fb == NULL);
+}
+
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
+ int i;
+
+ assert(list != NULL);
+
+ for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+ aom_free(list->int_fb[i].data);
+ list->int_fb[i].data = NULL;
+ }
+ aom_free(list->int_fb);
+ list->int_fb = NULL;
+}
+
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+ int i;
+
+ assert(list != NULL);
+
+ for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+ if (list->int_fb[i].data && !list->int_fb[i].in_use)
+ memset(list->int_fb[i].data, 0, list->int_fb[i].size);
+ }
+}
+
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ int i;
+ InternalFrameBufferList *const int_fb_list =
+ (InternalFrameBufferList *)cb_priv;
+ if (int_fb_list == NULL) return -1;
+
+ // Find a free frame buffer.
+ for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+ if (!int_fb_list->int_fb[i].in_use) break;
+ }
+
+ if (i == int_fb_list->num_internal_frame_buffers) return -1;
+
+ if (int_fb_list->int_fb[i].size < min_size) {
+ aom_free(int_fb_list->int_fb[i].data);
+    // The data must be zeroed to fix a valgrind error caused by the C loop
+    // filter accessing uninitialized memory in the frame border. The zeroing
+    // could be skipped if the border were removed entirely.
+ int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size);
+ if (!int_fb_list->int_fb[i].data) return -1;
+ int_fb_list->int_fb[i].size = min_size;
+ }
+
+ fb->data = int_fb_list->int_fb[i].data;
+ fb->size = int_fb_list->int_fb[i].size;
+ int_fb_list->int_fb[i].in_use = 1;
+
+ // Set the frame buffer's private data to point at the internal frame buffer.
+ fb->priv = &int_fb_list->int_fb[i];
+ return 0;
+}
+
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
+ InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
+ (void)cb_priv;
+ if (int_fb) int_fb->in_use = 0;
+ fb->priv = NULL;
+ return 0;
+}
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
new file mode 100644
index 000000000..16188e51c
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
+
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct InternalFrameBuffer {
+ uint8_t *data;
+ size_t size;
+ int in_use;
+} InternalFrameBuffer;
+
+typedef struct InternalFrameBufferList {
+ int num_internal_frame_buffers;
+ InternalFrameBuffer *int_fb;
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Free any data allocated to the frame buffers.
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the different width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libaom to request an external frame buffer. |cb_priv| is
+// the callback private data, which points to an InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| is a pointer to the frame buffer to fill in.
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb);
+
+// Callback used by libaom when there are no references left to the frame
+// buffer. |cb_priv| is not used. |fb| is a pointer to the frame buffer.
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
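+
+// Illustrative note (editorial, not part of the upstream libaom sources): a
+// hedged sketch of how these callbacks would typically be registered with a
+// decoder. It assumes the aom_codec_set_frame_buffer_functions() entry point
+// declared in aom/aom_decoder.h; only the functions declared in this header
+// are taken as given.
+//
+//   InternalFrameBufferList list = { 0, NULL };
+//   if (av1_alloc_internal_frame_buffers(&list)) { /* allocation failed */ }
+//   aom_codec_set_frame_buffer_functions(&codec_ctx, av1_get_frame_buffer,
+//                                        av1_release_frame_buffer, &list);
+//   ...
+//   av1_free_internal_frame_buffers(&list);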
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
new file mode 100644
index 000000000..2c1cb9827
--- /dev/null
+++ b/third_party/aom/av1/common/idct.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+
+int av1_get_tx_scale(const TX_SIZE tx_size) {
+ const int pels = tx_size_2d[tx_size];
+ // Largest possible pels is 4096 (64x64).
+ return (pels > 256) + (pels > 1024);
+}
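+
+// Illustrative note (editorial, not part of the upstream libaom sources):
+// the expression above maps the transform area onto a downshift of 0, 1 or 2:
+//   TX_4X4   ->   16 pels -> (16 > 256)   + (16 > 1024)   = 0
+//   TX_32X32 -> 1024 pels -> (1024 > 256) + (1024 > 1024) = 1
+//   TX_64X64 -> 4096 pels -> (4096 > 256) + (4096 > 1024) = 2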
+
+// NOTE: The implementations of all inverse transforms need to be aware of
+// the fact that input and output could be the same buffer.
+
+// idct
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ if (eob > 1)
+ av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
+ else
+ av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ int eob = txfm_param->eob;
+ int bd = txfm_param->bd;
+ int lossless = txfm_param->lossless;
+ const int32_t *src = cast_to_int32(input);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ return;
+ }
+
+ av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+
+ av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+
+ av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+}
+
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int32_t *src = cast_to_int32(input);
+ av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+}
+
+void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ const int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ assert(tx_type == DCT_DCT);
+ av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+}
+
+static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type, int eob, int reduced_tx_set,
+ TxfmParam *txfm_param) {
+ (void)plane;
+ txfm_param->tx_type = tx_type;
+ txfm_param->tx_size = tx_size;
+ txfm_param->eob = eob;
+ txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
+ txfm_param->bd = xd->bd;
+ txfm_param->is_hbd = get_bitdepth_data_path_index(xd);
+ txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+ txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+}
+
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:
+ av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ break;
+ case TX_16X64:
+ av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X16:
+ av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ // this is like av1_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
+ int tmp_stride = MAX_TX_SIZE;
+ int w = tx_size_wide[tx_size];
+ int h = tx_size_high[tx_size];
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < w; ++c) {
+ tmp[r * tmp_stride + c] = dst[r * stride + c];
+ }
+ }
+
+ av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+ txfm_param);
+
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < w; ++c) {
+ dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
+ }
+ }
+}
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+ const tran_low_t *dqcoeff, int plane,
+ TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+ int stride, int eob, int reduced_tx_set) {
+ if (!eob) return;
+
+ assert(eob <= av1_get_max_eob(tx_size));
+
+ TxfmParam txfm_param;
+ init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set,
+ &txfm_param);
+ assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
+
+ if (txfm_param.is_hbd) {
+ av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+ } else {
+ av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
new file mode 100644
index 000000000..d9454e73f
--- /dev/null
+++ b/third_party/aom/av1/common/idct.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "aom_dsp/txfm_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d)(const tran_low_t *, tran_low_t *);
+
+typedef struct {
+ transform_1d cols, rows; // vertical and horizontal
+} transform_2d;
+
+#define MAX_TX_SCALE 1
+int av1_get_tx_scale(const TX_SIZE tx_size);
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+ const tran_low_t *dqcoeff, int plane,
+ TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+ int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+ assert(sizeof(int32_t) == sizeof(tran_low_t));
+ return (const int32_t *)input;
+}
+
+typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *param);
+
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_IDCT_H_
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
new file mode 100644
index 000000000..5b0225192
--- /dev/null
+++ b/third_party/aom/av1/common/mv.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_MV_H_
+#define AOM_AV1_COMMON_MV_H_
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "aom_dsp/aom_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_MV 0x80008000
+
+typedef struct mv {
+ int16_t row;
+ int16_t col;
+} MV;
+
+static const MV kZeroMv = { 0, 0 };
+
+typedef union int_mv {
+ uint32_t as_int;
+ MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
+
+typedef struct mv32 {
+ int32_t row;
+ int32_t col;
+} MV32;
+
+// Bits of precision used for the model
+#define WARPEDMODEL_PREC_BITS 16
+#define WARPEDMODEL_ROW3HOMO_PREC_BITS 16
+
+#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
+#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
+#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 2))
+
+// Bits of subpel precision for warped interpolation
+#define WARPEDPIXEL_PREC_BITS 6
+#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
+
+#define WARP_PARAM_REDUCE_BITS 6
+
+#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
+
+/* clang-format off */
+typedef enum ATTRIBUTE_PACKED {
+ IDENTITY = 0, // identity transformation, 0-parameter
+ TRANSLATION = 1, // translational motion 2-parameter
+ ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
+ AFFINE = 3, // affine, 6-parameter
+ TRANS_TYPES,
+} TransformationType;
+/* clang-format on */
+
+// Number of types used for global motion (must be >= 3 and <= TRANS_TYPES)
+// The following can be useful:
+// GLOBAL_TRANS_TYPES 3 - up to rotation-zoom
+// GLOBAL_TRANS_TYPES 4 - up to affine
+// GLOBAL_TRANS_TYPES 6 - up to hor/ver trapezoids
+// GLOBAL_TRANS_TYPES 7 - up to full homography
+#define GLOBAL_TRANS_TYPES 4
+
+typedef struct {
+ int global_warp_allowed;
+ int local_warp_allowed;
+} WarpTypesAllowed;
+
+// number of parameters used by each transformation in TransformationTypes
+static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+
+// The order of values in the wmmat matrix below is best described
+// by the homography:
+// [x' (m2 m3 m0 [x
+// z . y' = m4 m5 m1 * y
+// 1] m6 m7 1) 1]
+typedef struct {
+ TransformationType wmtype;
+ int32_t wmmat[8];
+ int16_t alpha, beta, gamma, delta;
+ int8_t invalid;
+} WarpedMotionParams;
+
+/* clang-format off */
+static const WarpedMotionParams default_warp_params = {
+ IDENTITY,
+ { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0,
+ 0 },
+ 0, 0, 0, 0,
+ 0,
+};
+/* clang-format on */
+
+// The following constants describe the various precisions
+// of different parameters in the global motion experiment.
+//
+// Given the general homography:
+// [x' (a b c [x
+// z . y' = d e f * y
+// 1] g h i) 1]
+//
+// Constants using the name ALPHA here are related to parameters
+// a, b, d, e. Constants using the name TRANS are related
+// to parameters c and f.
+//
+// Anything ending in PREC_BITS is the number of bits of precision
+// to maintain when converting from double to integer.
+//
+// The ABS parameters are used to create an upper and lower bound
+// for each parameter. In other words, after a parameter is integerized
+// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS).
+//
+// XXX_PREC_DIFF and XXX_DECODE_FACTOR
+// are computed once here to prevent repetitive
+// computation on the decoder side. These are
+// to allow the global motion parameters to be encoded in a lower
+// precision than the warped model precision. This means that they
+// need to be changed to warped precision when they are decoded.
+//
+// XX_MIN, XX_MAX are also computed to avoid repeated computation
+
+#define SUBEXPFIN_K 3
+#define GM_TRANS_PREC_BITS 6
+#define GM_ABS_TRANS_BITS 12
+#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3)
+#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
+#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
+#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF)
+#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF)
+
+#define GM_ALPHA_PREC_BITS 15
+#define GM_ABS_ALPHA_BITS 12
+#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
+#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
+
+#define GM_ROW3HOMO_PREC_BITS 16
+#define GM_ABS_ROW3HOMO_BITS 11
+#define GM_ROW3HOMO_PREC_DIFF \
+ (WARPEDMODEL_ROW3HOMO_PREC_BITS - GM_ROW3HOMO_PREC_BITS)
+#define GM_ROW3HOMO_DECODE_FACTOR (1 << GM_ROW3HOMO_PREC_DIFF)
+
+#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS)
+#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
+#define GM_ROW3HOMO_MAX (1 << GM_ABS_ROW3HOMO_BITS)
+
+#define GM_TRANS_MIN -GM_TRANS_MAX
+#define GM_ALPHA_MIN -GM_ALPHA_MAX
+#define GM_ROW3HOMO_MIN -GM_ROW3HOMO_MAX
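+
+// Illustrative note (editorial, not part of the upstream libaom sources): a
+// short worked example of the PREC_DIFF / DECODE_FACTOR constants above.
+// Translation parameters are coded with GM_TRANS_PREC_BITS = 6 fractional
+// bits while the warped model keeps WARPEDMODEL_PREC_BITS = 16, so
+//   GM_TRANS_PREC_DIFF     = 16 - 6 = 10
+//   GM_TRANS_DECODE_FACTOR = 1 << 10 = 1024
+// i.e. a decoded translation value is multiplied by 1024 to reach warped
+// precision. Likewise GM_ALPHA_PREC_DIFF = 16 - 15 = 1, so the a/b/d/e
+// (ALPHA-class) parameters are doubled on conversion.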
+
+static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) {
+ const int bw = block_size_wide[bs];
+ return mi_col * MI_SIZE + bw / 2 - 1;
+}
+
+static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) {
+ const int bh = block_size_high[bs];
+ return mi_row * MI_SIZE + bh / 2 - 1;
+}
+
+static INLINE int convert_to_trans_prec(int allow_hp, int coor) {
+ if (allow_hp)
+ return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
+ else
+ return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
+}
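+
+// Illustrative note (editorial, not part of the upstream libaom sources):
+// convert_to_trans_prec() reduces a WARPEDMODEL_PREC_BITS (16 fractional
+// bits) value to the 3 fractional bits used for motion vectors. With
+// allow_hp it rounds away (16 - 3) = 13 bits; without allow_hp it rounds
+// away (16 - 2) = 14 bits and doubles the result, so the low (eighth-pel)
+// bit of the returned value is always zero.
+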
+static INLINE void integer_mv_precision(MV *mv) {
+ int mod = (mv->row % 8);
+ if (mod != 0) {
+ mv->row -= mod;
+ if (abs(mod) > 4) {
+ if (mod > 0) {
+ mv->row += 8;
+ } else {
+ mv->row -= 8;
+ }
+ }
+ }
+
+ mod = (mv->col % 8);
+ if (mod != 0) {
+ mv->col -= mod;
+ if (abs(mod) > 4) {
+ if (mod > 0) {
+ mv->col += 8;
+ } else {
+ mv->col -= 8;
+ }
+ }
+ }
+}
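+
+// Illustrative note (editorial, not part of the upstream libaom sources):
+// integer_mv_precision() rounds each eighth-pel component to the nearest
+// multiple of 8 (full-pel), with a remainder of exactly 4 rounding towards
+// zero. For example:
+//   row = 13  ->  mod =  5  ->  13 - 5 + 8 =  16
+//   row = 12  ->  mod =  4  ->  12 - 4     =   8
+//   row = -13 ->  mod = -5  -> -13 + 5 - 8 = -16
+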
+// Convert a global motion vector into a motion vector at the centre of the
+// given block.
+//
+// The resulting motion vector will have three fractional bits of precision. If
+// allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and
+// is_integer is true, the bottom three bits will be zero (so the motion vector
+// represents an integer)
+static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
+ int allow_hp, BLOCK_SIZE bsize,
+ int mi_col, int mi_row,
+ int is_integer) {
+ int_mv res;
+
+ if (gm->wmtype == IDENTITY) {
+ res.as_int = 0;
+ return res;
+ }
+
+ const int32_t *mat = gm->wmmat;
+ int x, y, tx, ty;
+
+ if (gm->wmtype == TRANSLATION) {
+ // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
+ // bits of fractional precision. The offset for a translation is stored in
+ // entries 0 and 1. For translations, all but the top three (two if
+ // cm->allow_high_precision_mv is false) fractional bits are always zero.
+ //
+ // After the right shifts, there are 3 fractional bits of precision. If
+ // allow_hp is false, the bottom bit is always zero (so we don't need a
+ // call to convert_to_trans_prec here)
+ res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
+ res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
+ assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
+ if (is_integer) {
+ integer_mv_precision(&res.as_mv);
+ }
+ return res;
+ }
+
+ x = block_center_x(mi_col, bsize);
+ y = block_center_y(mi_row, bsize);
+
+ if (gm->wmtype == ROTZOOM) {
+ assert(gm->wmmat[5] == gm->wmmat[2]);
+ assert(gm->wmmat[4] == -gm->wmmat[3]);
+ }
+
+ const int xc =
+ (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
+ const int yc =
+ mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
+ tx = convert_to_trans_prec(allow_hp, xc);
+ ty = convert_to_trans_prec(allow_hp, yc);
+
+ res.as_mv.row = ty;
+ res.as_mv.col = tx;
+
+ if (is_integer) {
+ integer_mv_precision(&res.as_mv);
+ }
+ return res;
+}
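+
+// Illustrative note (editorial, not part of the upstream libaom sources):
+// for a pure TRANSLATION model the conversion above is just a shift. With
+// WARPEDMODEL_PREC_BITS = 16 fractional bits in wmmat[0]/wmmat[1] and
+// GM_TRANS_ONLY_PREC_DIFF = 16 - 3 = 13, an offset of one full pel
+// (1 << 16 = 65536) becomes 65536 >> 13 = 8 eighth-pel units, matching the
+// 3-bit fractional MV precision described above.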
+
+static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
+ if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] &&
+ gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) {
+ return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION);
+ }
+ if (gm->wmmat[2] == gm->wmmat[5] && gm->wmmat[3] == -gm->wmmat[4])
+ return ROTZOOM;
+ else
+ return AFFINE;
+}
+
+typedef struct candidate_mv {
+ int_mv this_mv;
+ int_mv comp_mv;
+ int weight;
+} CANDIDATE_MV;
+
+static INLINE int is_zero_mv(const MV *mv) {
+ return *((const uint32_t *)mv) == 0;
+}
+
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+ return *((const uint32_t *)a) == *((const uint32_t *)b);
+}
+
+static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
+ int max_row) {
+ mv->col = clamp(mv->col, min_col, max_col);
+ mv->row = clamp(mv->row, min_row, max_row);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_MV_H_
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
new file mode 100644
index 000000000..7f24ab4e6
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -0,0 +1,1523 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "av1/common/mvref_common.h"
+#include "av1/common/warped_motion.h"
+
+// Although we assign 32 bit integers, all the values are strictly under 14
+// bits.
+static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340,
+ 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092,
+ 1024, 963, 910, 862, 819, 780, 744, 712,
+ 682, 655, 630, 606, 585, 564, 546, 528 };
+
+// TODO(jingning): Consider using a lookup table for (num / den) altogether.
+static void get_mv_projection(MV *output, MV ref, int num, int den) {
+ den = AOMMIN(den, MAX_FRAME_DISTANCE);
+ num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE)
+ : AOMMAX(num, -MAX_FRAME_DISTANCE);
+ const int mv_row =
+ ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
+ const int mv_col =
+ ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
+ const int clamp_max = MV_UPP - 1;
+ const int clamp_min = MV_LOW + 1;
+ output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max);
+ output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max);
+}
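+
+// Illustrative note (editorial, not part of the upstream libaom sources):
+// div_mult[den] approximates round(2^14 / den), so the projection above
+// computes ref * num / den without a division. For example:
+//   ref.row = 12, num = 2, den = 3:
+//     12 * 2 * div_mult[3] = 12 * 2 * 5461 = 131064
+//     ROUND_POWER_OF_TWO_SIGNED(131064, 14) = 8, matching 12 * 2 / 3.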
+
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis) {
+ const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
+ MV_REF *frame_mvs =
+ cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
+ x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
+ y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
+ int w, h;
+
+ for (h = 0; h < y_mis; h++) {
+ MV_REF *mv = frame_mvs;
+ for (w = 0; w < x_mis; w++) {
+ mv->ref_frame = NONE_FRAME;
+ mv->mv.as_int = 0;
+
+ for (int idx = 0; idx < 2; ++idx) {
+ MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx];
+ if (ref_frame > INTRA_FRAME) {
+ int8_t ref_idx = cm->ref_frame_side[ref_frame];
+ if (ref_idx) continue;
+ if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) ||
+ (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT))
+ continue;
+ mv->ref_frame = ref_frame;
+ mv->mv.as_int = mi->mv[idx].as_int;
+ }
+ }
+ mv++;
+ }
+ frame_mvs += frame_mvs_stride;
+ }
+}
+
+static void add_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
+ uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count,
+ CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates,
+ const WarpedMotionParams *gm_params, int col, int weight) {
+ if (!is_inter_block(candidate)) return; // for intrabc
+ int index = 0, ref;
+ assert(weight % 2 == 0);
+
+ if (rf[1] == NONE_FRAME) {
+ // single reference frame
+ for (ref = 0; ref < 2; ++ref) {
+ if (candidate->ref_frame[ref] == rf[0]) {
+ int_mv this_refmv;
+ if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype))
+ this_refmv = gm_mv_candidates[0];
+ else
+ this_refmv = get_sub_block_mv(candidate, ref, col);
+
+ for (index = 0; index < *refmv_count; ++index)
+ if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
+
+ if (index < *refmv_count) ref_mv_stack[index].weight += weight;
+
+ // Add a new item to the list.
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[index].this_mv = this_refmv;
+ ref_mv_stack[index].weight = weight;
+ ++(*refmv_count);
+ }
+ if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
+ ++*ref_match_count;
+ }
+ }
+ } else {
+ // compound reference frame
+ if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) {
+ int_mv this_refmv[2];
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
+ this_refmv[ref] = gm_mv_candidates[ref];
+ else
+ this_refmv[ref] = get_sub_block_mv(candidate, ref, col);
+ }
+
+ for (index = 0; index < *refmv_count; ++index)
+ if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
+ (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int))
+ break;
+
+ if (index < *refmv_count) ref_mv_stack[index].weight += weight;
+
+ // Add a new item to the list.
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[index].this_mv = this_refmv[0];
+ ref_mv_stack[index].comp_mv = this_refmv[1];
+ ref_mv_stack[index].weight = weight;
+ ++(*refmv_count);
+ }
+ if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
+ ++*ref_match_count;
+ }
+ }
+}
+
+static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const MV_REFERENCE_FRAME rf[2], int row_offset,
+ CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
+ uint8_t *ref_match_count, uint8_t *newmv_count,
+ int_mv *gm_mv_candidates, int max_row_offset,
+ int *processed_rows) {
+ int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col);
+ end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
+ const int n8_w_8 = mi_size_wide[BLOCK_8X8];
+ const int n8_w_16 = mi_size_wide[BLOCK_16X16];
+ int i;
+ int col_offset = 0;
+ // TODO(jingning): Revisit this part after cb4x4 is stable.
+ if (abs(row_offset) > 1) {
+ col_offset = 1;
+ if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset;
+ }
+ const int use_step_16 = (xd->n4_w >= 16);
+ MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
+ (void)mi_row;
+
+ for (i = 0; i < end_mi;) {
+ const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
+ const int candidate_bsize = candidate->sb_type;
+ const int n4_w = mi_size_wide[candidate_bsize];
+ int len = AOMMIN(xd->n4_w, n4_w);
+ if (use_step_16)
+ len = AOMMAX(n8_w_16, len);
+ else if (abs(row_offset) > 1)
+ len = AOMMAX(len, n8_w_8);
+
+ int weight = 2;
+ if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) {
+ int inc = AOMMIN(-max_row_offset + row_offset + 1,
+ mi_size_high[candidate_bsize]);
+ // Obtain range used in weight calculation.
+ weight = AOMMAX(weight, inc);
+ // Update processed rows.
+ *processed_rows = inc - row_offset - 1;
+ }
+
+ add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+ newmv_count, ref_mv_stack, gm_mv_candidates,
+ cm->global_motion, col_offset + i, len * weight);
+
+ i += len;
+ }
+}
+
+static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const MV_REFERENCE_FRAME rf[2], int col_offset,
+ CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
+ uint8_t *ref_match_count, uint8_t *newmv_count,
+ int_mv *gm_mv_candidates, int max_col_offset,
+ int *processed_cols) {
+ int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row);
+ end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
+ const int n8_h_8 = mi_size_high[BLOCK_8X8];
+ const int n8_h_16 = mi_size_high[BLOCK_16X16];
+ int i;
+ int row_offset = 0;
+ if (abs(col_offset) > 1) {
+ row_offset = 1;
+ if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset;
+ }
+ const int use_step_16 = (xd->n4_h >= 16);
+ (void)mi_col;
+
+ for (i = 0; i < end_mi;) {
+ const MB_MODE_INFO *const candidate =
+ xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
+ const int candidate_bsize = candidate->sb_type;
+ const int n4_h = mi_size_high[candidate_bsize];
+ int len = AOMMIN(xd->n4_h, n4_h);
+ if (use_step_16)
+ len = AOMMAX(n8_h_16, len);
+ else if (abs(col_offset) > 1)
+ len = AOMMAX(len, n8_h_8);
+
+ int weight = 2;
+ if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) {
+ int inc = AOMMIN(-max_col_offset + col_offset + 1,
+ mi_size_wide[candidate_bsize]);
+ // Obtain range used in weight calculation.
+ weight = AOMMAX(weight, inc);
+ // Update processed cols.
+ *processed_cols = inc - col_offset - 1;
+ }
+
+ add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+ newmv_count, ref_mv_stack, gm_mv_candidates,
+ cm->global_motion, col_offset, len * weight);
+
+ i += len;
+ }
+}
+
+static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const int mi_row, const int mi_col,
+ const MV_REFERENCE_FRAME rf[2], int row_offset,
+ int col_offset, CANDIDATE_MV *ref_mv_stack,
+ uint8_t *ref_match_count, uint8_t *newmv_count,
+ int_mv *gm_mv_candidates,
+ uint8_t refmv_count[MODE_CTX_REF_FRAMES]) {
+ const TileInfo *const tile = &xd->tile;
+ POSITION mi_pos;
+
+ mi_pos.row = row_offset;
+ mi_pos.col = col_offset;
+
+ if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
+ const MB_MODE_INFO *const candidate =
+ xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+ const int len = mi_size_wide[BLOCK_8X8];
+
+ add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+ newmv_count, ref_mv_stack, gm_mv_candidates,
+ cm->global_motion, mi_pos.col, 2 * len);
+ } // Analyze a single 8x8 block motion information.
+}
+
+static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col, int bs) {
+ const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+ const int mask_row = mi_row & (sb_mi_size - 1);
+ const int mask_col = mi_col & (sb_mi_size - 1);
+
+ if (bs > mi_size_wide[BLOCK_64X64]) return 0;
+
+  // In a split partition, all blocks apart from the bottom right have a top
+  // right.
+ int has_tr = !((mask_row & bs) && (mask_col & bs));
+
+ // bs > 0 and bs is a power of 2
+ assert(bs > 0 && !(bs & (bs - 1)));
+
+  // For each 4x4 group of blocks, when the bottom right is decoded the blocks
+  // to its right have not been decoded yet; therefore the bottom right does
+  // not have a top right.
+ while (bs < sb_mi_size) {
+ if (mask_col & bs) {
+ if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
+ has_tr = 0;
+ break;
+ }
+ } else {
+ break;
+ }
+ bs <<= 1;
+ }
+
+  // The left-hand one of two vertical rectangles always has a top right (as
+  // the block above will have been decoded).
+ if (xd->n4_w < xd->n4_h)
+ if (!xd->is_sec_rect) has_tr = 1;
+
+  // The bottom one of two horizontal rectangles never has a top right (as the
+  // block to the right won't have been decoded).
+ if (xd->n4_w > xd->n4_h)
+ if (xd->is_sec_rect) has_tr = 0;
+
+ // The bottom left square of a Vertical A (in the old format) does
+ // not have a top right as it is decoded before the right hand
+ // rectangle of the partition
+ if (xd->mi[0]->partition == PARTITION_VERT_A) {
+ if (xd->n4_w == xd->n4_h)
+ if (mask_row & bs) has_tr = 0;
+ }
+
+ return has_tr;
+}
+
+static int check_sb_border(const int mi_row, const int mi_col,
+ const int row_offset, const int col_offset) {
+ const int sb_mi_size = mi_size_wide[BLOCK_64X64];
+ const int row = mi_row & (sb_mi_size - 1);
+ const int col = mi_col & (sb_mi_size - 1);
+
+ if (row + row_offset < 0 || row + row_offset >= sb_mi_size ||
+ col + col_offset < 0 || col + col_offset >= sb_mi_size)
+ return 0;
+
+ return 1;
+}
+
+static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
+ int blk_row, int blk_col, int_mv *gm_mv_candidates,
+ uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE],
+ int16_t *mode_context) {
+ POSITION mi_pos;
+ int idx;
+ const int weight_unit = 1; // mi_size_wide[BLOCK_8X8];
+
+ mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
+ mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
+
+ if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
+
+ const TPL_MV_REF *prev_frame_mvs =
+ cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
+ ((mi_col + mi_pos.col) >> 1);
+
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+
+ if (rf[1] == NONE_FRAME) {
+ int cur_frame_index = cm->cur_frame->cur_frame_offset;
+ int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
+ int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+ int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
+ CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]];
+
+ if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
+ int_mv this_refmv;
+
+ get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_0, prev_frame_mvs->ref_frame_offset);
+ lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
+ cm->cur_frame_force_integer_mv);
+
+ if (blk_row == 0 && blk_col == 0)
+ if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+ abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+ for (idx = 0; idx < refmv_count[rf[0]]; ++idx)
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
+
+ if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit;
+
+ if (idx == refmv_count[rf[0]] &&
+ refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_stack[idx].weight = 2 * weight_unit;
+ ++(refmv_count[rf[0]]);
+ }
+ return 1;
+ }
+ } else {
+ // Process compound inter mode
+ int cur_frame_index = cm->cur_frame->cur_frame_offset;
+ int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
+ int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+
+ int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
+ int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx;
+ int frame1_index = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset;
+ int cur_offset_1 = get_relative_dist(cm, cur_frame_index, frame1_index);
+ CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame];
+
+ if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
+ int_mv this_refmv;
+ int_mv comp_refmv;
+ get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_0, prev_frame_mvs->ref_frame_offset);
+ get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_1, prev_frame_mvs->ref_frame_offset);
+
+ lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
+ cm->cur_frame_force_integer_mv);
+ lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv,
+ cm->cur_frame_force_integer_mv);
+
+ if (blk_row == 0 && blk_col == 0)
+ if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+ abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
+ abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
+ abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+ for (idx = 0; idx < refmv_count[ref_frame]; ++idx)
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
+ comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
+ break;
+
+ if (idx < refmv_count[ref_frame])
+ ref_mv_stack[idx].weight += 2 * weight_unit;
+
+ if (idx == refmv_count[ref_frame] &&
+ refmv_count[ref_frame] < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
+ ref_mv_stack[idx].weight = 2 * weight_unit;
+ ++(refmv_count[ref_frame]);
+ }
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static void process_compound_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
+ int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+ for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+ if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+ ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+ ++ref_id_count[cmp_idx];
+ } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[can_rf] !=
+ cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+ ++ref_diff_count[cmp_idx];
+ }
+ }
+ }
+}
+
+static void process_single_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) {
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+ cm->ref_frame_sign_bias[ref_frame]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ int stack_idx;
+ for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
+ const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+ if (this_mv.as_int == stack_mv.as_int) break;
+ }
+
+ if (stack_idx == refmv_count[ref_frame]) {
+ ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+
+ // TODO(jingning): Set an arbitrary small number here. The weight
+ // doesn't matter as long as it is properly initialized.
+ ref_mv_stack[ref_frame][stack_idx].weight = 2;
+ ++refmv_count[ref_frame];
+ }
+ }
+ }
+}
+
+static void setup_ref_mv_list(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
+ uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
+ int mi_row, int mi_col, int16_t *mode_context) {
+ const int bs = AOMMAX(xd->n4_w, xd->n4_h);
+ const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
+ MV_REFERENCE_FRAME rf[2];
+
+ const TileInfo *const tile = &xd->tile;
+ int max_row_offset = 0, max_col_offset = 0;
+ const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+ const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+ int processed_rows = 0;
+ int processed_cols = 0;
+
+ av1_set_ref_frame(rf, ref_frame);
+ mode_context[ref_frame] = 0;
+ refmv_count[ref_frame] = 0;
+
+ // Find valid maximum row/col offset.
+ if (xd->up_available) {
+ max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
+
+ if (xd->n4_h < mi_size_high[BLOCK_8X8])
+ max_row_offset = -(2 << 1) + row_adj;
+
+ max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
+ }
+
+ if (xd->left_available) {
+ max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
+
+ if (xd->n4_w < mi_size_wide[BLOCK_8X8])
+ max_col_offset = -(2 << 1) + col_adj;
+
+ max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
+ }
+
+ uint8_t col_match_count = 0;
+ uint8_t row_match_count = 0;
+ uint8_t newmv_count = 0;
+
+ // Scan the first above row mode info. row_offset = -1;
+ if (abs(max_row_offset) >= 1)
+ scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
+ &refmv_count[ref_frame], &row_match_count, &newmv_count,
+ gm_mv_candidates, max_row_offset, &processed_rows);
+ // Scan the first left column mode info. col_offset = -1;
+ if (abs(max_col_offset) >= 1)
+ scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
+ &refmv_count[ref_frame], &col_match_count, &newmv_count,
+ gm_mv_candidates, max_col_offset, &processed_cols);
+ // Check top-right boundary
+ if (has_tr)
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w,
+ ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
+ gm_mv_candidates, &refmv_count[ref_frame]);
+
+ const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
+ const uint8_t nearest_refmv_count = refmv_count[ref_frame];
+
+ // TODO(yunqing): for comp_search, do it for all 3 cases.
+ for (int idx = 0; idx < nearest_refmv_count; ++idx)
+ ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL;
+
+ if (cm->allow_ref_frame_mvs) {
+ int is_available = 0;
+ const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h);
+ const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w);
+ const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]);
+ const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]);
+
+ const int tpl_sample_pos[3][2] = {
+ { voffset, -2 },
+ { voffset, hoffset },
+ { voffset - 2, hoffset },
+ };
+ const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) &&
+ (xd->n4_h < mi_size_high[BLOCK_64X64]) &&
+ (xd->n4_w >= mi_size_wide[BLOCK_8X8]) &&
+ (xd->n4_w < mi_size_wide[BLOCK_64X64]);
+
+ const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64])
+ ? mi_size_high[BLOCK_16X16]
+ : mi_size_high[BLOCK_8X8];
+ const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64])
+ ? mi_size_wide[BLOCK_16X16]
+ : mi_size_wide[BLOCK_8X8];
+
+ for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
+ for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
+ int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
+ blk_col, gm_mv_candidates, refmv_count,
+ ref_mv_stack, mode_context);
+ if (blk_row == 0 && blk_col == 0) is_available = ret;
+ }
+ }
+
+ if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+ for (int i = 0; i < 3 && allow_extension; ++i) {
+ const int blk_row = tpl_sample_pos[i][0];
+ const int blk_col = tpl_sample_pos[i][1];
+
+ if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
+ add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
+ gm_mv_candidates, refmv_count, ref_mv_stack, mode_context);
+ }
+ }
+
+ uint8_t dummy_newmv_count = 0;
+
+ // Scan the second outer area.
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame],
+ &row_match_count, &dummy_newmv_count, gm_mv_candidates,
+ &refmv_count[ref_frame]);
+
+ for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
+ const int row_offset = -(idx << 1) + 1 + row_adj;
+ const int col_offset = -(idx << 1) + 1 + col_adj;
+
+ if (abs(row_offset) <= abs(max_row_offset) &&
+ abs(row_offset) > processed_rows)
+ scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset,
+ ref_mv_stack[ref_frame], &refmv_count[ref_frame],
+ &row_match_count, &dummy_newmv_count, gm_mv_candidates,
+ max_row_offset, &processed_rows);
+
+ if (abs(col_offset) <= abs(max_col_offset) &&
+ abs(col_offset) > processed_cols)
+ scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset,
+ ref_mv_stack[ref_frame], &refmv_count[ref_frame],
+ &col_match_count, &dummy_newmv_count, gm_mv_candidates,
+ max_col_offset, &processed_cols);
+ }
+
+ const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
+
+ switch (nearest_match) {
+ case 0:
+ mode_context[ref_frame] |= 0;
+ if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
+ if (ref_match_count == 1)
+ mode_context[ref_frame] |= (1 << REFMV_OFFSET);
+ else if (ref_match_count >= 2)
+ mode_context[ref_frame] |= (2 << REFMV_OFFSET);
+ break;
+ case 1:
+ mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
+ if (ref_match_count == 1)
+ mode_context[ref_frame] |= (3 << REFMV_OFFSET);
+ else if (ref_match_count >= 2)
+ mode_context[ref_frame] |= (4 << REFMV_OFFSET);
+ break;
+ case 2:
+ default:
+ if (newmv_count >= 1)
+ mode_context[ref_frame] |= 4;
+ else
+ mode_context[ref_frame] |= 5;
+
+ mode_context[ref_frame] |= (5 << REFMV_OFFSET);
+ break;
+ }
+
+ // Rank the likelihood and assign nearest and near mvs.
+ int len = nearest_refmv_count;
+ while (len > 0) {
+ int nr_len = 0;
+ for (int idx = 1; idx < len; ++idx) {
+ if (ref_mv_stack[ref_frame][idx - 1].weight <
+ ref_mv_stack[ref_frame][idx].weight) {
+ CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
+ ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
+ ref_mv_stack[ref_frame][idx] = tmp_mv;
+ nr_len = idx;
+ }
+ }
+ len = nr_len;
+ }
+
+ len = refmv_count[ref_frame];
+ while (len > nearest_refmv_count) {
+ int nr_len = nearest_refmv_count;
+ for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
+ if (ref_mv_stack[ref_frame][idx - 1].weight <
+ ref_mv_stack[ref_frame][idx].weight) {
+ CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
+ ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
+ ref_mv_stack[ref_frame][idx] = tmp_mv;
+ nr_len = idx;
+ }
+ }
+ len = nr_len;
+ }
+
+ if (rf[1] > NONE_FRAME) {
+ // TODO(jingning, yunqing): Refactor and consolidate the compound and
+ // single reference frame modes. Reduce unnecessary redundancy.
+ if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) {
+ int_mv ref_id[2][2], ref_diff[2][2];
+ int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
+
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
+ mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
+ mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
+ int mi_size = AOMMIN(mi_width, mi_height);
+
+ for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
+ const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_wide[candidate->sb_type];
+ }
+
+ for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
+ const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_high[candidate->sb_type];
+ }
+
+ // Build up the compound mv predictor
+ int_mv comp_list[3][2];
+
+ for (int idx = 0; idx < 2; ++idx) {
+ int comp_idx = 0;
+ for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2;
+ ++list_idx, ++comp_idx)
+ comp_list[comp_idx][idx] = ref_id[idx][list_idx];
+ for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2;
+ ++list_idx, ++comp_idx)
+ comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
+ for (; comp_idx < 3; ++comp_idx)
+ comp_list[comp_idx][idx] = gm_mv_candidates[idx];
+ }
+
+ if (refmv_count[ref_frame]) {
+ assert(refmv_count[ref_frame] == 1);
+ if (comp_list[0][0].as_int ==
+ ref_mv_stack[ref_frame][0].this_mv.as_int &&
+ comp_list[0][1].as_int ==
+ ref_mv_stack[ref_frame][0].comp_mv.as_int) {
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+ comp_list[1][0];
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+ comp_list[1][1];
+ } else {
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+ comp_list[0][0];
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+ comp_list[0][1];
+ }
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
+ ++refmv_count[ref_frame];
+ } else {
+ for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+ comp_list[idx][0];
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+ comp_list[idx][1];
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
+ ++refmv_count[ref_frame];
+ }
+ }
+ }
+
+ assert(refmv_count[ref_frame] >= 2);
+
+ for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+ clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+ }
+ } else {
+ // Handle single reference frame extension
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
+ mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
+ mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
+ int mi_size = AOMMIN(mi_width, mi_height);
+
+ for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
+ refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+ const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack);
+ idx += mi_size_wide[candidate->sb_type];
+ }
+
+ for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
+ refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+ const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack);
+ idx += mi_size_high[candidate->sb_type];
+ }
+
+ for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+ }
+
+ if (mv_ref_list != NULL) {
+ for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx)
+ mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int;
+
+ for (int idx = 0;
+ idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) {
+ mv_ref_list[rf[0]][idx].as_int =
+ ref_mv_stack[ref_frame][idx].this_mv.as_int;
+ }
+ }
+ }
+}
+
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+ int_mv *global_mvs, int mi_row, int mi_col,
+ int16_t *mode_context) {
+ int_mv zeromv[2];
+ BLOCK_SIZE bsize = mi->sb_type;
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+
+ if (ref_frame < REF_FRAMES) {
+ if (ref_frame != INTRA_FRAME) {
+ global_mvs[ref_frame] = gm_get_motion_vector(
+ &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, cm->cur_frame_force_integer_mv);
+ } else {
+ global_mvs[ref_frame].as_int = INVALID_MV;
+ }
+ }
+
+ if (ref_frame != INTRA_FRAME) {
+ zeromv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[rf[0]],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ cm->cur_frame_force_integer_mv)
+ .as_int;
+ zeromv[1].as_int =
+ (rf[1] != NONE_FRAME)
+ ? gm_get_motion_vector(&cm->global_motion[rf[1]],
+ cm->allow_high_precision_mv, bsize, mi_col,
+ mi_row, cm->cur_frame_force_integer_mv)
+ .as_int
+ : 0;
+ } else {
+ zeromv[0].as_int = zeromv[1].as_int = 0;
+ }
+
+ setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list,
+ zeromv, mi_row, mi_col, mode_context);
+}
+
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv, int is_integer) {
+ int i;
+ // Make sure all the candidates are properly clamped etc
+ for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
+ lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer);
+ }
+ *nearest_mv = mvlist[0];
+ *near_mv = mvlist[1];
+}
+
+void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
+ cm->cur_frame->cur_frame_offset = cm->frame_offset;
+
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+ if (buf_idx >= 0)
+ cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME] =
+ cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ }
+}
+
+void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+ if (cm->seq_params.enable_order_hint && buf_idx != INVALID_IDX) {
+ const int ref_frame_offset =
+ cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ cm->ref_frame_sign_bias[ref_frame] =
+ (get_relative_dist(cm, ref_frame_offset, (int)cm->frame_offset) <= 0)
+ ? 0
+ : 1;
+ } else {
+ cm->ref_frame_sign_bias[ref_frame] = 0;
+ }
+ }
+}
+
+#define MAX_OFFSET_WIDTH 64
+#define MAX_OFFSET_HEIGHT 0
+
+static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
+ int blk_col, MV mv, int sign_bias) {
+ const int base_blk_row = (blk_row >> 3) << 3;
+ const int base_blk_col = (blk_col >> 3) << 3;
+
+ const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2))
+ : -((-mv.row) >> (4 + MI_SIZE_LOG2));
+
+ const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
+ : -((-mv.col) >> (4 + MI_SIZE_LOG2));
+
+ const int row =
+ (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+ const int col =
+ (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
+
+ if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
+ col >= (cm->mi_cols >> 1))
+ return 0;
+
+ if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
+ row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) ||
+ col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) ||
+ col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3))
+ return 0;
+
+ *mi_r = row;
+ *mi_c = col;
+
+ return 1;
+}
+
+// Note: motion_field_projection finds the motion vectors of one of the
+// current frame's reference frames and projects them onto the current frame.
+// For clarity, call that reference frame of the current frame the "start
+// frame", call the start frame's own reference frames the "reference
+// frames", and call ref_offset the frame distance between the start frame
+// and each of its reference frames.
+static int motion_field_projection(AV1_COMMON *cm,
+ MV_REFERENCE_FRAME start_frame, int dir) {
+ TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+ int ref_offset[REF_FRAMES] = { 0 };
+
+ (void)dir;
+
+ const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx;
+ if (start_frame_idx < 0) return 0;
+
+ if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0;
+
+ if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows ||
+ cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols)
+ return 0;
+
+ const int start_frame_offset =
+ cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset;
+ const unsigned int *const ref_frame_offsets =
+ &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0];
+ const int cur_frame_offset = cm->cur_frame->cur_frame_offset;
+ int start_to_current_frame_offset =
+ get_relative_dist(cm, start_frame_offset, cur_frame_offset);
+
+ for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
+ ref_offset[rf] = get_relative_dist(cm, start_frame_offset,
+ ref_frame_offsets[rf - LAST_FRAME]);
+ }
+
+ if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
+
+ MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs;
+ const int mvs_rows = (cm->mi_rows + 1) >> 1;
+ const int mvs_cols = (cm->mi_cols + 1) >> 1;
+
+ for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
+ for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
+ MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col];
+ MV fwd_mv = mv_ref->mv.as_mv;
+
+ if (mv_ref->ref_frame > INTRA_FRAME) {
+ int_mv this_mv;
+ int mi_r, mi_c;
+ const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
+
+ int pos_valid =
+ abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+ ref_frame_offset > 0 &&
+ abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
+
+ if (pos_valid) {
+ get_mv_projection(&this_mv.as_mv, fwd_mv,
+ start_to_current_frame_offset, ref_frame_offset);
+ pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
+ this_mv.as_mv, dir >> 1);
+ }
+
+ if (pos_valid) {
+ const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
+
+ tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
+ tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
+ tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
+ }
+ }
+ }
+ }
+
+ return 1;
+}
+
+void av1_setup_motion_field(AV1_COMMON *cm) {
+ memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
+ if (!cm->seq_params.enable_order_hint) return;
+
+ TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+ int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+ for (int idx = 0; idx < size; ++idx) {
+ tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
+ tpl_mvs_base[idx].ref_frame_offset = 0;
+ }
+
+ const int cur_order_hint = cm->cur_frame->cur_frame_offset;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+ int ref_buf_idx[INTER_REFS_PER_FRAME];
+ int ref_order_hint[INTER_REFS_PER_FRAME];
+
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ const int ref_idx = ref_frame - LAST_FRAME;
+ const int buf_idx = cm->frame_refs[ref_idx].idx;
+ int order_hint = 0;
+
+ if (buf_idx >= 0) order_hint = frame_bufs[buf_idx].cur_frame_offset;
+
+ ref_buf_idx[ref_idx] = buf_idx;
+ ref_order_hint[ref_idx] = order_hint;
+
+ if (get_relative_dist(cm, order_hint, cur_order_hint) > 0)
+ cm->ref_frame_side[ref_frame] = 1;
+ else if (order_hint == cur_order_hint)
+ cm->ref_frame_side[ref_frame] = -1;
+ }
+
+ int ref_stamp = MFMV_STACK_SIZE - 1;
+
+ if (ref_buf_idx[LAST_FRAME - LAST_FRAME] >= 0) {
+ const int alt_of_lst_order_hint =
+ frame_bufs[ref_buf_idx[LAST_FRAME - LAST_FRAME]]
+ .ref_frame_offset[ALTREF_FRAME - LAST_FRAME];
+
+ const int is_lst_overlay =
+ (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]);
+ if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2);
+ --ref_stamp;
+ }
+
+ if (get_relative_dist(cm, ref_order_hint[BWDREF_FRAME - LAST_FRAME],
+ cur_order_hint) > 0) {
+ if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp;
+ }
+
+ if (get_relative_dist(cm, ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
+ cur_order_hint) > 0) {
+ if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp;
+ }
+
+ if (get_relative_dist(cm, ref_order_hint[ALTREF_FRAME - LAST_FRAME],
+ cur_order_hint) > 0 &&
+ ref_stamp >= 0)
+ if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
+
+ if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0)
+ if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp;
+}
+
+static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref,
+ int row_offset, int sign_r, int col_offset,
+ int sign_c) {
+ int bw = block_size_wide[mbmi->sb_type];
+ int bh = block_size_high[mbmi->sb_type];
+ int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1;
+ int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1;
+
+ pts[0] = (x * 8);
+ pts[1] = (y * 8);
+ pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col;
+ pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row;
+}
+
+// Select samples according to the motion vector difference.
+int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
+ int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 };
+ int i, j, k, l = len;
+ int ret = 0;
+ assert(len <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // Obtain the motion vector difference.
+ for (i = 0; i < len; ++i) {
+ pts_mvd[i] = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
+ abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
+
+ if (pts_mvd[i] > thresh)
+ pts_mvd[i] = -1;
+ else
+ ret++;
+ }
+
+ // Keep at least 1 sample.
+ if (!ret) return 1;
+
+ i = 0;
+ j = l - 1;
+ for (k = 0; k < l - ret; k++) {
+ while (pts_mvd[i] != -1) i++;
+ while (pts_mvd[j] == -1) j--;
+ assert(i != j);
+ if (i > j) break;
+
+    // Replace the discarded samples.
+ pts_mvd[i] = pts_mvd[j];
+ pts[2 * i] = pts[2 * j];
+ pts[2 * i + 1] = pts[2 * j + 1];
+ pts_inref[2 * i] = pts_inref[2 * j];
+ pts_inref[2 * i + 1] = pts_inref[2 * j + 1];
+ i++;
+ j--;
+ }
+
+ return ret;
+}
+
+// Note: The samples returned are at 1/8-pel precision.
+// Each sample is a neighboring block's center-point coordinate relative to
+// the top-left pixel of the current block.
+int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+ int *pts, int *pts_inref) {
+ MB_MODE_INFO *const mbmi0 = xd->mi[0];
+ int ref_frame = mbmi0->ref_frame[0];
+ int up_available = xd->up_available;
+ int left_available = xd->left_available;
+ int i, mi_step = 1, np = 0;
+
+ const TileInfo *const tile = &xd->tile;
+ int do_tl = 1;
+ int do_tr = 1;
+
+ // scan the nearest above rows
+ if (up_available) {
+ int mi_row_offset = -1;
+ MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
+ uint8_t n4_w = mi_size_wide[mbmi->sb_type];
+
+ if (xd->n4_w <= n4_w) {
+ // Handle "current block width <= above block width" case.
+ int col_offset = -mi_col % n4_w;
+
+ if (col_offset < 0) do_tl = 0;
+ if (col_offset + n4_w > xd->n4_w) do_tr = 0;
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ } else {
+ // Handle "current block width > above block width" case.
+ for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) {
+ int mi_col_offset = i;
+ mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ n4_w = mi_size_wide[mbmi->sb_type];
+ mi_step = AOMMIN(xd->n4_w, n4_w);
+
+ if (mbmi->ref_frame[0] == ref_frame &&
+ mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // scan the nearest left columns
+ if (left_available) {
+ int mi_col_offset = -1;
+
+ MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
+ uint8_t n4_h = mi_size_high[mbmi->sb_type];
+
+ if (xd->n4_h <= n4_h) {
+      // Handle "current block height <= left block height" case.
+ int row_offset = -mi_row % n4_h;
+
+ if (row_offset < 0) do_tl = 0;
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ } else {
+      // Handle "current block height > left block height" case.
+ for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) {
+ int mi_row_offset = i;
+ mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ n4_h = mi_size_high[mbmi->sb_type];
+ mi_step = AOMMIN(xd->n4_h, n4_h);
+
+ if (mbmi->ref_frame[0] == ref_frame &&
+ mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // Top-left block
+ if (do_tl && left_available && up_available) {
+ int mi_row_offset = -1;
+ int mi_col_offset = -1;
+
+ MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // Top-right block
+ if (do_tr &&
+ has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) {
+ POSITION trb_pos = { -1, xd->n4_w };
+
+ if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
+ int mi_row_offset = -1;
+ int mi_col_offset = xd->n4_w;
+
+ MB_MODE_INFO *mbmi =
+ xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1);
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ return np;
+}
+
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
+ cm->is_skip_mode_allowed = 0;
+ cm->ref_frame_idx_0 = cm->ref_frame_idx_1 = INVALID_IDX;
+
+ if (!cm->seq_params.enable_order_hint || frame_is_intra_only(cm) ||
+ cm->reference_mode == SINGLE_REFERENCE)
+ return;
+
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ const int cur_frame_offset = cm->frame_offset;
+ int ref_frame_offset[2] = { -1, INT_MAX };
+ int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
+
+ // Identify the nearest forward and backward references.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const int buf_idx = cm->frame_refs[i].idx;
+ if (buf_idx == INVALID_IDX) continue;
+
+ const int ref_offset = frame_bufs[buf_idx].cur_frame_offset;
+ if (get_relative_dist(cm, ref_offset, cur_frame_offset) < 0) {
+ // Forward reference
+ if (ref_frame_offset[0] == -1 ||
+ get_relative_dist(cm, ref_offset, ref_frame_offset[0]) > 0) {
+ ref_frame_offset[0] = ref_offset;
+ ref_idx[0] = i;
+ }
+ } else if (get_relative_dist(cm, ref_offset, cur_frame_offset) > 0) {
+ // Backward reference
+ if (ref_frame_offset[1] == INT_MAX ||
+ get_relative_dist(cm, ref_offset, ref_frame_offset[1]) < 0) {
+ ref_frame_offset[1] = ref_offset;
+ ref_idx[1] = i;
+ }
+ }
+ }
+
+ if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
+ // == Bi-directional prediction ==
+ cm->is_skip_mode_allowed = 1;
+ cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+ cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+ } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
+ // == Forward prediction only ==
+ // Identify the second nearest forward reference.
+ ref_frame_offset[1] = -1;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const int buf_idx = cm->frame_refs[i].idx;
+ if (buf_idx == INVALID_IDX) continue;
+
+ const int ref_offset = frame_bufs[buf_idx].cur_frame_offset;
+ if ((ref_frame_offset[0] != -1 &&
+ get_relative_dist(cm, ref_offset, ref_frame_offset[0]) < 0) &&
+ (ref_frame_offset[1] == -1 ||
+ get_relative_dist(cm, ref_offset, ref_frame_offset[1]) > 0)) {
+ // Second closest forward reference
+ ref_frame_offset[1] = ref_offset;
+ ref_idx[1] = i;
+ }
+ }
+ if (ref_frame_offset[1] != -1) {
+ cm->is_skip_mode_allowed = 1;
+ cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+ cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+ }
+ }
+}
+
+typedef struct {
+ int map_idx; // frame map index
+ int buf_idx; // frame buffer index
+ int sort_idx; // index based on the offset to be used for sorting
+} REF_FRAME_INFO;
+
+static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
+ const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a;
+ const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b;
+
+ if (info_a->sort_idx < info_b->sort_idx) return -1;
+ if (info_a->sort_idx > info_b->sort_idx) return 1;
+ return (info_a->map_idx < info_b->map_idx)
+ ? -1
+ : ((info_a->map_idx > info_b->map_idx) ? 1 : 0);
+}
+
+static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
+ REF_FRAME_INFO *ref_info) {
+ assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
+
+ const int buf_idx = ref_info->buf_idx;
+
+ cm->frame_refs[frame_idx].idx = buf_idx;
+ cm->frame_refs[frame_idx].buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ cm->frame_refs[frame_idx].map_idx = ref_info->map_idx;
+}
+
+void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
+ int gld_map_idx) {
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+ int lst_frame_sort_idx = -1;
+ int gld_frame_sort_idx = -1;
+
+ assert(cm->seq_params.enable_order_hint);
+ assert(cm->seq_params.order_hint_bits_minus_1 >= 0);
+ const int cur_frame_offset = (int)cm->frame_offset;
+ const int cur_frame_sort_idx = 1 << cm->seq_params.order_hint_bits_minus_1;
+
+ REF_FRAME_INFO ref_frame_info[REF_FRAMES];
+ int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ const int map_idx = i;
+
+ ref_frame_info[i].map_idx = map_idx;
+ ref_frame_info[i].sort_idx = -1;
+
+ const int buf_idx = cm->ref_frame_map[map_idx];
+ ref_frame_info[i].buf_idx = buf_idx;
+
+ if (buf_idx < 0 || buf_idx >= FRAME_BUFFERS) continue;
+ // TODO(zoeliu@google.com): To verify the checking on ref_count.
+ if (frame_bufs[buf_idx].ref_count <= 0) continue;
+
+ const int offset = (int)frame_bufs[buf_idx].cur_frame_offset;
+ ref_frame_info[i].sort_idx =
+ (offset == -1) ? -1
+ : cur_frame_sort_idx +
+ get_relative_dist(cm, offset, cur_frame_offset);
+ assert(ref_frame_info[i].sort_idx >= -1);
+
+ if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx;
+ if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx;
+ }
+
+ // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
+ // frames.
+ if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests a look-ahead frame as LAST");
+ }
+ if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests a look-ahead frame as GOLDEN");
+ }
+
+ // Sort ref frames based on their frame_offset values.
+ qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO),
+ compare_ref_frame_info);
+
+ // Identify forward and backward reference frames.
+ // Forward reference: offset < cur_frame_offset
+ // Backward reference: offset >= cur_frame_offset
+ int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1;
+
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (ref_frame_info[i].sort_idx == -1) {
+ fwd_start_idx++;
+ continue;
+ }
+
+ if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) {
+ fwd_end_idx = i - 1;
+ break;
+ }
+ }
+
+ int bwd_start_idx = fwd_end_idx + 1;
+ int bwd_end_idx = REF_FRAMES - 1;
+
+ // === Backward Reference Frames ===
+
+ // == ALTREF_FRAME ==
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_end_idx]);
+ ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
+ bwd_end_idx--;
+ }
+
+ // == BWDREF_FRAME ==
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_start_idx]);
+ ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
+ bwd_start_idx++;
+ }
+
+ // == ALTREF2_FRAME ==
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_start_idx]);
+ ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
+ }
+
+ // === Forward Reference Frames ===
+
+ for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
+ // == LAST_FRAME ==
+ if (ref_frame_info[i].map_idx == lst_map_idx) {
+ set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]);
+ ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
+ }
+
+ // == GOLDEN_FRAME ==
+ if (ref_frame_info[i].map_idx == gld_map_idx) {
+ set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]);
+ ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
+ }
+ }
+
+ assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 &&
+ ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1);
+
+ // == LAST2_FRAME ==
+ // == LAST3_FRAME ==
+ // == BWDREF_FRAME ==
+ // == ALTREF2_FRAME ==
+ // == ALTREF_FRAME ==
+
+ // Set up the reference frames in the anti-chronological order.
+ static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = {
+ LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME
+ };
+
+ int ref_idx;
+ for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+ const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+
+ if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+
+ while (fwd_start_idx <= fwd_end_idx &&
+ (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx ||
+ ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) {
+ fwd_end_idx--;
+ }
+ if (fwd_start_idx > fwd_end_idx) break;
+
+ set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+ &ref_frame_info[fwd_end_idx]);
+ ref_flag_list[ref_frame - LAST_FRAME] = 1;
+
+ fwd_end_idx--;
+ }
+
+ // Assign all the remaining frame(s), if any, to the earliest reference frame.
+ for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+ const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+ if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+ set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+ &ref_frame_info[fwd_start_idx]);
+ ref_flag_list[ref_frame - LAST_FRAME] = 1;
+ }
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ assert(ref_flag_list[i] == 1);
+ }
+}
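
Aside on the ordering step in setup_ref_mv_list() above: both passes over the candidate stack are a plain bubble sort on the weight field, first over the "nearest" prefix and then over the remaining entries, so higher-weighted candidates move toward index 0. A minimal standalone sketch of that pass, with a simplified stand-in for CANDIDATE_MV that keeps only the sort key (names below are illustrative, not part of libaom):

#include <stdio.h>

/* Simplified stand-in for CANDIDATE_MV: only the sort key is kept. */
typedef struct {
  int weight;
} cand_t;

/* Same descending bubble sort by weight as the loops in setup_ref_mv_list(),
 * applied to stack[start..count-1]. Pass start = 0 for the "nearest" prefix,
 * start = nearest_refmv_count for the remaining entries. */
static void sort_by_weight(cand_t *stack, int start, int count) {
  int len = count;
  while (len > start) {
    int nr_len = start;
    for (int idx = start + 1; idx < len; ++idx) {
      if (stack[idx - 1].weight < stack[idx].weight) {
        cand_t tmp = stack[idx - 1];
        stack[idx - 1] = stack[idx];
        stack[idx] = tmp;
        nr_len = idx; /* remember the last swap position */
      }
    }
    len = nr_len;
  }
}

int main(void) {
  cand_t stack[5] = { { 2 }, { 640 }, { 2 }, { 1280 }, { 2 } };
  sort_by_weight(stack, 0, 5);
  for (int i = 0; i < 5; ++i) printf("%d ", stack[i].weight);
  printf("\n"); /* prints: 1280 640 2 2 2 */
  return 0;
}
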
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
new file mode 100644
index 000000000..83f7a1ac0
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
+#define AOM_AV1_COMMON_MVREF_COMMON_H_
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MVREF_ROW_COLS 3
+
+// Set the upper limit of the motion vector component magnitude.
+// This makes a motion vector fit in 26 bits, plus 3 bits for the reference
+// frame index, so a motion vector tuple can be stored within a 32-bit range
+// for efficient load/store operations.
+#define REFMVS_LIMIT ((1 << 12) - 1)
+
+typedef struct position {
+ int row;
+ int col;
+} POSITION;
+
+// clamp_mv_ref
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+
+static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
+ if (!cm->seq_params.enable_order_hint) return 0;
+
+ const int bits = cm->seq_params.order_hint_bits_minus_1 + 1;
+
+ assert(bits >= 1);
+ assert(a >= 0 && a < (1 << bits));
+ assert(b >= 0 && b < (1 << bits));
+
+ int diff = a - b;
+ const int m = 1 << (bits - 1);
+ diff = (diff & (m - 1)) - (diff & m);
+ return diff;
+}
+
+static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
+ clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
+ xd->mb_to_right_edge + bw * 8 + MV_BORDER,
+ xd->mb_to_top_edge - bh * 8 - MV_BORDER,
+ xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
+}
+
+// Returns either the appropriate sub-block mv or the block's mv, depending
+// on whether block_size < 8x8 and check_sub_blocks is set.
+static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate,
+ int which_mv, int search_col) {
+ (void)search_col;
+ return candidate->mv[which_mv];
+}
+
+static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate,
+ int which_mv, int search_col) {
+ (void)search_col;
+ return candidate->mv[which_mv];
+}
+
+// Performs mv sign inversion if indicated by the reference frame combination.
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+ const MV_REFERENCE_FRAME this_ref_frame,
+ const int *ref_sign_bias) {
+ int_mv mv = mbmi->mv[ref];
+ if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
+ mv.as_mv.row *= -1;
+ mv.as_mv.col *= -1;
+ }
+ return mv;
+}
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
+ const POSITION *mi_pos) {
+ return !(mi_row + mi_pos->row < tile->mi_row_start ||
+ mi_col + mi_pos->col < tile->mi_col_start ||
+ mi_row + mi_pos->row >= tile->mi_row_end ||
+ mi_col + mi_pos->col >= tile->mi_col_end);
+}
+
+static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
+ int row_offset) {
+ return clamp(row_offset, tile->mi_row_start - mi_row,
+ tile->mi_row_end - mi_row - 1);
+}
+
+static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
+ int col_offset) {
+ return clamp(col_offset, tile->mi_col_start - mi_col,
+ tile->mi_col_end - mi_col - 1);
+}
+
+static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
+ if (is_integer) {
+ integer_mv_precision(mv);
+ } else {
+ if (!allow_hp) {
+ if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
+ if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
+ }
+ }
+}
+
+static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
+ // Single ref pred
+ if (rf[1] <= INTRA_FRAME) return -1;
+
+ // Bi-directional comp ref pred
+ if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1;
+
+ for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) {
+ if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx))
+ return ref_idx;
+ }
+ return -1;
+}
+
+static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
+ if (rf[1] > INTRA_FRAME) {
+ const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf);
+ if (uni_comp_ref_idx >= 0) {
+ assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
+ MODE_CTX_REF_FRAMES);
+ return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
+ } else {
+ return REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
+ BWD_RF_OFFSET(rf[1]) * FWD_REFS;
+ }
+ }
+
+ return rf[0];
+}
+
+// clang-format off
+static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = {
+ { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME },
+ { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME },
+
+ { LAST_FRAME, ALTREF2_FRAME }, { LAST2_FRAME, ALTREF2_FRAME },
+ { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME },
+
+ { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME },
+ { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
+
+ { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME },
+ { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
+
+  // NOTE: The following reference frame pairs cannot be explicitly
+  // signalled, but they may still be chosen through the use of skip_mode,
+  // which may use the most recent one-sided reference frame pair.
+  { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME },
+  { LAST3_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF2_FRAME },
+ { ALTREF2_FRAME, ALTREF_FRAME }
+};
+// clang-format on
+
+static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
+ int8_t ref_frame_type) {
+ if (ref_frame_type >= REF_FRAMES) {
+ rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0];
+ rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1];
+ } else {
+ rf[0] = ref_frame_type;
+ rf[1] = NONE_FRAME;
+ assert(ref_frame_type > NONE_FRAME);
+ }
+}
+
+static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = {
+ { 0, 1, 1, 1, 1 },
+ { 1, 2, 3, 4, 4 },
+ { 4, 4, 5, 6, 7 },
+};
+
+static INLINE int16_t av1_mode_context_analyzer(
+ const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) {
+ const int8_t ref_frame = av1_ref_frame_type(rf);
+
+ if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame];
+
+ const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK;
+ const int16_t refmv_ctx =
+ (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+ const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN(
+ newmv_ctx, COMP_NEWMV_CTXS - 1)];
+ return comp_ctx;
+}
+
+static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
+ int ref_idx) {
+ if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
+ ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL)
+ return 0;
+
+ if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
+ ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+ return 1;
+
+ if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL &&
+ ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+ return 2;
+
+ return 0;
+}
+
+void av1_setup_frame_buf_refs(AV1_COMMON *cm);
+void av1_setup_frame_sign_bias(AV1_COMMON *cm);
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
+void av1_setup_motion_field(AV1_COMMON *cm);
+void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx);
+
+static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
+ av1_zero(xd->neighbors_ref_counts);
+
+ uint8_t *const ref_counts = xd->neighbors_ref_counts;
+
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+ // Above neighbor
+ if (above_in_image && is_inter_block(above_mbmi)) {
+ ref_counts[above_mbmi->ref_frame[0]]++;
+ if (has_second_ref(above_mbmi)) {
+ ref_counts[above_mbmi->ref_frame[1]]++;
+ }
+ }
+
+ // Left neighbor
+ if (left_in_image && is_inter_block(left_mbmi)) {
+ ref_counts[left_mbmi->ref_frame[0]]++;
+ if (has_second_ref(left_mbmi)) {
+ ref_counts[left_mbmi->ref_frame[1]]++;
+ }
+ }
+}
+
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis);
+
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+ int_mv *global_mvs, int mi_row, int mi_col,
+ int16_t *mode_context);
+
+// Check a list of motion vectors by SAD score, using a number of rows of
+// pixels above and a number of columns of pixels to the left, to select the
+// one with the best score to use as the reference motion vector.
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv, int is_integer);
+
+int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize);
+int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+ int *pts, int *pts_inref);
+
+#define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels
+#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
+
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
+ int mib_size, int mi_row, int mi_col) {
+ (void)mi_col;
+ if (mi_row - mib_size < tile->mi_row_start) {
+ ref_dv->as_mv.row = 0;
+ ref_dv->as_mv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
+ } else {
+ ref_dv->as_mv.row = -MI_SIZE * mib_size;
+ ref_dv->as_mv.col = 0;
+ }
+ ref_dv->as_mv.row *= 8;
+ ref_dv->as_mv.col *= 8;
+}
+
+static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
+ const MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int mib_size_log2) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int SCALE_PX_TO_MV = 8;
+ // Disallow subpixel for now
+ // SUBPEL_MASK is not the correct scale
+ if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
+ return 0;
+
+ const TileInfo *const tile = &xd->tile;
+ // Is the source top-left inside the current tile?
+ const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row;
+ const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_top_edge < tile_top_edge) return 0;
+ const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col;
+ const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_left_edge < tile_left_edge) return 0;
+ // Is the bottom right inside the current tile?
+ const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row;
+ const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_bottom_edge > tile_bottom_edge) return 0;
+ const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col;
+ const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_right_edge > tile_right_edge) return 0;
+
+ // Special case for sub 8x8 chroma cases, to prevent referring to chroma
+ // pixels outside current tile.
+ for (int plane = 1; plane < av1_num_planes(cm); ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y)) {
+ if (bw < 8 && pd->subsampling_x)
+ if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;
+ if (bh < 8 && pd->subsampling_y)
+ if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;
+ }
+ }
+
+ // Is the bottom right within an already coded SB? Also consider additional
+ // constraints to facilitate HW decoder.
+ const int max_mib_size = 1 << mib_size_log2;
+ const int active_sb_row = mi_row >> mib_size_log2;
+ const int active_sb64_col = (mi_col * MI_SIZE) >> 6;
+ const int sb_size = max_mib_size * MI_SIZE;
+ const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size;
+ const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6;
+ const int total_sb64_per_row =
+ ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1;
+ const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col;
+ const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
+ if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;
+
+ // Wavefront constraint: use only top left area of frame for reference.
+ const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
+ const int wf_offset = gradient * (active_sb_row - src_sb_row);
+ if (src_sb_row > active_sb_row ||
+ src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
+ return 0;
+
+ return 1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_MVREF_COMMON_H_
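
Aside on get_relative_dist() above: with order hints stored in a fixed number of bits, the subtraction is folded back into the signed window [-2^(bits-1), 2^(bits-1) - 1], so wraparound is handled transparently. A small self-contained check of that arithmetic; the helper below duplicates only the folding, with the bit count passed explicitly instead of being read from cm->seq_params:

#include <stdio.h>

/* Same folding as get_relative_dist(), bit count passed explicitly. */
static int relative_dist(int a, int b, int bits) {
  int diff = a - b;
  const int m = 1 << (bits - 1);
  diff = (diff & (m - 1)) - (diff & m);
  return diff;
}

int main(void) {
  /* 7-bit order hints wrap at 128: hint 2 is 4 frames after hint 126. */
  printf("%d\n", relative_dist(2, 126, 7));  /* prints 4  */
  printf("%d\n", relative_dist(126, 2, 7));  /* prints -4 */
  printf("%d\n", relative_dist(70, 10, 7));  /* prints 60 */
  return 0;
}
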
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
new file mode 100644
index 000000000..1c90cd93f
--- /dev/null
+++ b/third_party/aom/av1/common/obmc.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_OBMC_H_
+#define AOM_AV1_COMMON_OBMC_H_
+
+typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,
+ uint8_t nb_mi_size,
+ MB_MODE_INFO *nb_mi, void *fun_ctxt,
+ const int num_planes);
+
+static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int mi_col,
+ int nb_max,
+ overlappable_nb_visitor_t fun,
+ void *fun_ctxt) {
+ const int num_planes = av1_num_planes(cm);
+ if (!xd->up_available) return;
+
+ int nb_count = 0;
+
+ // prev_row_mi points into the mi array, starting at the beginning of the
+ // previous row.
+ MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
+ const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);
+ uint8_t mi_step;
+ for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
+ above_mi_col += mi_step) {
+ MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+ mi_step =
+ AOMMIN(mi_size_wide[above_mi[0]->sb_type], mi_size_wide[BLOCK_64X64]);
+ // If we're considering a block with width 4, it should be treated as
+ // half of a pair of blocks with chroma information in the second. Move
+ // above_mi_col back to the start of the pair if needed, set above_mbmi
+ // to point at the block with chroma information, and set mi_step to 2 to
+ // step over the entire pair at the end of the iteration.
+ if (mi_step == 1) {
+ above_mi_col &= ~1;
+ above_mi = prev_row_mi + above_mi_col + 1;
+ mi_step = 2;
+ }
+ if (is_neighbor_overlappable(*above_mi)) {
+ ++nb_count;
+ fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi,
+ fun_ctxt, num_planes);
+ }
+ }
+}
+
+static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int mi_row,
+ int nb_max,
+ overlappable_nb_visitor_t fun,
+ void *fun_ctxt) {
+ const int num_planes = av1_num_planes(cm);
+ if (!xd->left_available) return;
+
+ int nb_count = 0;
+
+ // prev_col_mi points into the mi array, starting at the top of the
+ // previous column
+ MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+ const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);
+ uint8_t mi_step;
+ for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
+ left_mi_row += mi_step) {
+ MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+ mi_step =
+ AOMMIN(mi_size_high[left_mi[0]->sb_type], mi_size_high[BLOCK_64X64]);
+ if (mi_step == 1) {
+ left_mi_row &= ~1;
+ left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
+ mi_step = 2;
+ }
+ if (is_neighbor_overlappable(*left_mi)) {
+ ++nb_count;
+ fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi,
+ fun_ctxt, num_planes);
+ }
+ }
+}
+
+#endif // AOM_AV1_COMMON_OBMC_H_
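
Aside on the mi_step == 1 branch in the two helpers above: it is what keeps width-4 (or height-4) blocks paired, by snapping back to the even index of the pair, visiting the second block of the pair (which carries the chroma information), and then stepping over both. A toy standalone version of that stepping, with block widths kept in a plain array instead of a MB_MODE_INFO row (purely illustrative):

#include <stdio.h>

int main(void) {
  /* Widths (in 4x4 mi units) of the blocks in a hypothetical previous row. */
  const int widths[8] = { 1, 1, 2, 2, 4, 4, 4, 4 };
  int step;
  for (int col = 0; col < 8; col += step) {
    int visit = col;
    step = widths[col];
    if (step == 1) {           /* width-4 block: treat the pair as one unit */
      visit = (col & ~1) + 1;  /* second block of the pair carries chroma   */
      step = 2;                /* ...and step over the whole pair           */
    }
    printf("visit col %d, step %d\n", visit, step);
  }
  return 0;
}
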
diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c
new file mode 100644
index 000000000..823b700b1
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Returns 1 when OBU type is valid, and 0 otherwise.
+static int valid_obu_type(int obu_type) {
+ int valid_type = 0;
+ switch (obu_type) {
+ case OBU_SEQUENCE_HEADER:
+ case OBU_TEMPORAL_DELIMITER:
+ case OBU_FRAME_HEADER:
+ case OBU_TILE_GROUP:
+ case OBU_METADATA:
+ case OBU_FRAME:
+ case OBU_REDUNDANT_FRAME_HEADER:
+ case OBU_TILE_LIST:
+ case OBU_PADDING: valid_type = 1; break;
+ default: break;
+ }
+ return valid_type;
+}
+
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+ size_t bytes_available,
+ size_t *const obu_size,
+ size_t *const length_field_size) {
+ uint64_t u_obu_size = 0;
+ if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+ 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+ *obu_size = (size_t)u_obu_size;
+ return AOM_CODEC_OK;
+}
+
+// Parses OBU header and stores values in 'header'.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+ int is_annexb, ObuHeader *header) {
+ if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+ const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+ if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->size = 1;
+
+ if (aom_rb_read_bit(rb) != 0) {
+ // Forbidden bit. Must not be set.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
+
+ if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->has_extension = aom_rb_read_bit(rb);
+ header->has_size_field = aom_rb_read_bit(rb);
+
+ if (!header->has_size_field && !is_annexb) {
+    // Section 5 OBU streams must have the obu_size field set.
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+
+ if (aom_rb_read_bit(rb) != 0) {
+ // obu_reserved_1bit must be set to 0.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ if (header->has_extension) {
+ if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->size += 1;
+ header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+ header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+ if (aom_rb_read_literal(rb, 3) != 0) {
+ // extension_header_reserved_3bits must be set to 0.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+ size_t *consumed, ObuHeader *header,
+ int is_annexb) {
+ if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+
+ // TODO(tomfinegan): Set the error handler here and throughout this file, and
+ // confirm parsing work done via aom_read_bit_buffer is successful.
+ struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
+ NULL };
+ aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
+ if (parse_result == AOM_CODEC_OK) *consumed = header->size;
+ return parse_result;
+}
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read) {
+ size_t length_field_size = 0, obu_size = 0;
+ aom_codec_err_t status;
+
+ if (is_annexb) {
+ // Size field comes before the OBU header, and includes the OBU header
+ status =
+ read_obu_size(data, bytes_available, &obu_size, &length_field_size);
+
+ if (status != AOM_CODEC_OK) return status;
+ }
+
+ struct aom_read_bit_buffer rb = { data + length_field_size,
+ data + bytes_available, 0, NULL, NULL };
+
+ status = read_obu_header(&rb, is_annexb, obu_header);
+ if (status != AOM_CODEC_OK) return status;
+
+ if (is_annexb) {
+ // Derive the payload size from the data we've already read
+ if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;
+
+ *payload_size = obu_size - obu_header->size;
+ } else {
+ // Size field comes after the OBU header, and is just the payload size
+ status = read_obu_size(data + obu_header->size,
+ bytes_available - obu_header->size, payload_size,
+ &length_field_size);
+ if (status != AOM_CODEC_OK) return status;
+ }
+
+ *bytes_read = length_field_size + obu_header->size;
+ return AOM_CODEC_OK;
+}
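
Aside on read_obu_size() above: it delegates to aom_uleb_decode(), and the size field itself is an unsigned LEB128 value, i.e. 7 payload bits per byte, least-significant group first, with the top bit set on every byte except the last. A minimal decoder sketch for reference (not the library's implementation; an 8-byte cap on the field is assumed here):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal unsigned LEB128 decoder: 7 value bits per byte, least-significant
 * group first, bit 7 set on every byte except the last. Returns 0 on success
 * and fills in the decoded value and the number of bytes consumed. */
static int uleb_decode(const uint8_t *buf, size_t avail, uint64_t *value,
                       size_t *length) {
  *value = 0;
  for (size_t i = 0; i < avail && i < 8; ++i) {
    *value |= (uint64_t)(buf[i] & 0x7F) << (7 * i);
    if (!(buf[i] & 0x80)) {
      *length = i + 1;
      return 0;
    }
  }
  return -1; /* ran out of data, or the field is longer than 8 bytes */
}

int main(void) {
  const uint8_t data[] = { 0xE5, 0x8E, 0x26 }; /* encodes 624485 */
  uint64_t size;
  size_t len;
  if (uleb_decode(data, sizeof(data), &size, &len) == 0)
    printf("obu_size = %llu, length field = %zu bytes\n",
           (unsigned long long)size, len);
  return 0;
}
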
diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h
new file mode 100644
index 000000000..7c56904c8
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_OBU_UTIL_H_
+#define AOM_AV1_COMMON_OBU_UTIL_H_
+
+#include "aom/aom_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ size_t size; // Size (1 or 2 bytes) of the OBU header (including the
+ // optional OBU extension header) in the bitstream.
+ OBU_TYPE type;
+ int has_size_field;
+ int has_extension;
+ // The following fields come from the OBU extension header and therefore are
+ // only used if has_extension is true.
+ int temporal_layer_id;
+ int spatial_layer_id;
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+ size_t *consumed, ObuHeader *header,
+ int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_OBU_UTIL_H_
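
Aside on the two entry points declared above: for a low-overhead (non-Annex B) stream, aom_read_obu_header_and_size() reports both the combined header/length-field size (bytes_read) and the payload size, so a caller advances by their sum to reach the next OBU. A sketch of such a caller (illustrative only; it assumes data holds one complete temporal unit and keeps error handling minimal):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include "av1/common/obu_util.h"

/* Walk a low-overhead OBU stream and print each OBU's type and payload size. */
static void list_obus(const uint8_t *data, size_t data_size) {
  while (data_size > 0) {
    ObuHeader header;
    size_t payload_size = 0, bytes_read = 0;
    if (aom_read_obu_header_and_size(data, data_size, /*is_annexb=*/0, &header,
                                     &payload_size, &bytes_read) !=
        AOM_CODEC_OK) {
      fprintf(stderr, "corrupt OBU stream\n");
      return;
    }
    if (bytes_read + payload_size > data_size) {
      fprintf(stderr, "truncated OBU payload\n");
      return;
    }
    printf("OBU type %d, %zu payload bytes\n", (int)header.type, payload_size);
    data += bytes_read + payload_size;
    data_size -= bytes_read + payload_size;
  }
}

int main(void) {
  /* A single temporal-delimiter OBU: header byte 0x12 (type 2, has_size_field
   * set), followed by an obu_size of 0. Prints "OBU type 2, 0 payload bytes". */
  const uint8_t td[] = { 0x12, 0x00 };
  list_obus(td, sizeof(td));
  return 0;
}
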
diff --git a/third_party/aom/av1/common/odintrin.c b/third_party/aom/av1/common/odintrin.c
new file mode 100644
index 000000000..7584b2e52
--- /dev/null
+++ b/third_party/aom/av1/common/odintrin.c
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#include "av1/common/odintrin.h"
+
+/*Constants for use with OD_DIVU_SMALL().
+ See \cite{Rob05} for details on computing these constants.
+ @INPROCEEDINGS{Rob05,
+ author="Arch D. Robison",
+ title="{N}-bit Unsigned Division via {N}-bit Multiply-Add",
+ booktitle="Proc. of the 17th IEEE Symposium on Computer Arithmetic
+ (ARITH'05)",
+ pages="131--139",
+ address="Cape Cod, MA",
+ month=Jun,
+ year=2005
+ }*/
+uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = {
+ { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 },
+ { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 },
+ { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 },
+ { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 },
+ { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 },
+ { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 },
+ { 0x8D3DCB09, 0 }, { 0x88888889, 0 },
+ { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 },
+ { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 },
+ { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 },
+ { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 },
+ { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED },
+ { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 },
+ { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 },
+ { 0x8AD8F2FC, 0 }, { 0x88888889, 0 },
+ { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 },
+ { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 },
+ { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 },
+ { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD4C77B04, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 },
+ { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 },
+ { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 },
+ { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 },
+ { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 },
+ { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 },
+ { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED },
+ { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F },
+ { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 },
+ { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 },
+ { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 },
+ { 0x89AE408A, 0 }, { 0x88888889, 0 },
+ { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 },
+ { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 },
+ { 0x83126E98, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 },
+ { 0xF6603D99, 0 }, { 0xF4898D60, 0 },
+ { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEF2EB720, 0 }, { 0xED7303B6, 0 },
+ { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 },
+ { 0xE525982B, 0 }, { 0xE38E38E4, 0 },
+ { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDBEB61EF, 0 }, { 0xDA740DA8, 0 },
+ { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 },
+ { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 },
+ { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 },
+ { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 },
+ { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC6980C6A, 0 }, { 0xC565C87C, 0 },
+ { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 },
+ { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 },
+ { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 },
+ { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB30F6353, 0 }, { 0xB21642C9, 0 },
+ { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 },
+ { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 },
+ { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 },
+ { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 },
+ { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 },
+ { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 },
+ { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 },
+ { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 },
+ { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x905A3863, 0x905A3863 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 },
+ { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 },
+ { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 },
+ { 0x891AC73B, 0 }, { 0x88888889, 0 },
+ { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x86D90545, 0 }, { 0x864B8A7E, 0 },
+ { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 },
+ { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 },
+ { 0x83993052, 0x83993052 }, { 0x83126E98, 0 },
+ { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF00FF01, 0 }, { 0xFE03F810, 0 },
+ { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB188566, 0 }, { 0xFA232CF3, 0 },
+ { 0xF92FB222, 0 }, { 0xF83E0F84, 0 },
+ { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 },
+ { 0xF57403D6, 0 }, { 0xF4898D60, 0 },
+ { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 },
+ { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 },
+ { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 },
+ { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 },
+ { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9396520, 0 }, { 0xE865AC7C, 0 },
+ { 0xE79372E3, 0 }, { 0xE6C2B449, 0 },
+ { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 },
+ { 0xE45932D8, 0 }, { 0xE38E38E4, 0 },
+ { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 },
+ { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE27EB2D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 },
+ { 0xD9BA4257, 0 }, { 0xD901B204, 0 },
+ { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 },
+ { 0xD578E97D, 0 }, { 0xD4C77B04, 0 },
+ { 0xD417328A, 0 }, { 0xD3680D37, 0 },
+ { 0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 },
+ { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 },
+ { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC29786D, 0 }, { 0xCB8727C1, 0 },
+ { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 },
+ { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 },
+ { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 },
+ { 0xC73293D8, 0 }, { 0xC6980C6A, 0 },
+ { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 },
+ { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 },
+ { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 },
+ { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBCDD535E, 0 }, { 0xBC52640C, 0 },
+ { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 },
+ { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA },
+ { 0xB89BC36D, 0 }, { 0xB81702E1, 0 },
+ { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 },
+ { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 },
+ { 0xB2927C2A, 0 }, { 0xB21642C9, 0 },
+ { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 },
+ { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 },
+ { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC02B00B, 0 }, { 0xAB8F69E3, 0 },
+ { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA392F36, 0 }, { 0xA9C84A48, 0 },
+ { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA79C7B17, 0 }, { 0xA72F053A, 0 },
+ { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 },
+ { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 },
+ { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 },
+ { 0xA36E71A3, 0 }, { 0xA3065E40, 0 },
+ { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA1D13986, 0 }, { 0xA16B312F, 0 },
+ { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA03C1689, 0 }, { 0x9FD809FE, 0 },
+ { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 },
+ { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 },
+ { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 },
+ { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 },
+ { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 },
+ { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 },
+ { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED },
+ { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 },
+ { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x93459BE7, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 },
+ { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 },
+ { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 },
+ { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 },
+ { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 },
+ { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x89F8746A, 0 }, { 0x89AE408A, 0 },
+ { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 },
+ { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 },
+ { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 },
+ { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 },
+ { 0x869222B2, 0 }, { 0x864B8A7E, 0 },
+ { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 },
+ { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 },
+ { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 },
+ { 0x8355ACE4, 0 }, { 0x83126E98, 0 },
+ { 0x82CF7504, 0 }, { 0x828CBFBF, 0 },
+ { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 },
+ { 0x814327E4, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80C121B3, 0 }, { 0x80808081, 0 },
+ { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 },
+ { 0xFE823CA6, 0 }, { 0xFE03F810, 0 },
+ { 0xFD863087, 0 }, { 0xFD08E551, 0 },
+ { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB93E673, 0 }, { 0xFB188566, 0 },
+ { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 },
+ { 0xF9A9342D, 0 }, { 0xF92FB222, 0 },
+ { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 },
+ { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 },
+ { 0xF6D7054E, 0 }, { 0xF6603D99, 0 },
+ { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 },
+ { 0xF4FE9083, 0 }, { 0xF4898D60, 0 },
+ { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 },
+ { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 },
+ { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 },
+ { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 },
+ { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 },
+ { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 },
+ { 0xEDE155F4, 0 }, { 0xED7303B6, 0 },
+ { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 },
+ { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 },
+ { 0xEB5159A0, 0 }, { 0xEAE56404, 0 },
+ { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 },
+ { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 },
+ { 0xE7FC600F, 0 }, { 0xE79372E3, 0 },
+ { 0xE72AE476, 0 }, { 0xE6C2B449, 0 },
+ { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 },
+ { 0xE58C544A, 0 }, { 0xE525982B, 0 },
+ { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 },
+ { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 },
+ { 0xE32942FF, 0 }, { 0xE2C4A689, 0 },
+ { 0xE260630B, 0 }, { 0xE1FC780F, 0 },
+ { 0xE198E520, 0 }, { 0xE135A9CA, 0 },
+ { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 },
+ { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 },
+ { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 },
+ { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 },
+ { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 },
+ { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 },
+ { 0xD95DD300, 0 }, { 0xD901B204, 0 },
+ { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 },
+ { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 },
+ { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 },
+ { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 },
+ { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 },
+ { 0xD46F3235, 0 }, { 0xD417328A, 0 },
+ { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 },
+ { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 },
+ { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E },
+ { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 },
+ { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 },
+ { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 },
+ { 0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 },
+ { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 },
+ { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC7B0200, 0 }, { 0xCC29786D, 0 },
+ { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 },
+ { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F },
+ { 0xCA95906C, 0 }, { 0xCA4587E7, 0 },
+ { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 },
+ { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 },
+ { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 },
+ { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 },
+ { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 },
+ { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 },
+ { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 },
+ { 0xC5B200C6, 0 }, { 0xC565C87C, 0 },
+ { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 },
+ { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 },
+ { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 },
+ { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 },
+ { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C },
+ { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 },
+ { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF589280, 0 }, { 0xBF112A8B, 0 },
+ { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 },
+ { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBD231803, 0 }, { 0xBCDD535E, 0 },
+ { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 },
+ { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 },
+ { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 },
+ { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 },
+ { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A },
+ { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA },
+ { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 },
+ { 0xB8594B41, 0 }, { 0xB81702E1, 0 },
+ { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 },
+ { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 },
+ { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 },
+ { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 },
+ { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 },
+ { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 },
+ { 0xB34E1884, 0 }, { 0xB30F6353, 0 },
+ { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 },
+ { 0xB25449D7, 0 }, { 0xB21642C9, 0 },
+ { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 },
+ { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 },
+ { 0xB068BE31, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 },
+ { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 },
+ { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 },
+ { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 },
+ { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 },
+ { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 },
+ { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 },
+ { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 },
+ { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 },
+ { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 },
+ { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 },
+ { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 },
+ { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 },
+ { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 },
+ { 0xA765AE44, 0 }, { 0xA72F053A, 0 },
+ { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 },
+ { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED },
+ { 0xA5B4449D, 0 }, { 0xA57EB503, 0 },
+ { 0xA54947FE, 0 }, { 0xA513FD6C, 0 },
+ { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 },
+ { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 },
+ { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 },
+ { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 },
+ { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 },
+ { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 },
+ { 0xA19E2540, 0 }, { 0xA16B312F, 0 },
+ { 0xA1385D35, 0 }, { 0xA105A933, 0 },
+ { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 },
+ { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 },
+ { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 },
+ { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D },
+ { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 },
+ { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 },
+ { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 },
+ { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 },
+ { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 },
+ { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B },
+ { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E },
+ { 0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 },
+ { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 },
+ { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 },
+ { 0x9A056A31, 0 }, { 0x99D722DB, 0 },
+ { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B },
+ { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 },
+ { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 },
+ { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 },
+ { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97874039, 0 }, { 0x975A7510, 0 },
+ { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 },
+ { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 },
+ { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x95747844, 0 }, { 0x9548E498, 0 },
+ { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94C6C187, 0 }, { 0x949B92DE, 0 },
+ { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 },
+ { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 },
+ { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x9370049C, 0 }, { 0x93459BE7, 0 },
+ { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 },
+ { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 },
+ { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 },
+ { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 },
+ { 0x917952AF, 0 }, { 0x91500915, 0x91500915 },
+ { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 },
+ { 0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 },
+ { 0x9031910A, 0 }, { 0x90090090, 0x90090090 },
+ { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 },
+ { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 },
+ { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 },
+ { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 },
+ { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 },
+ { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF },
+ { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C7C057D, 0 }, { 0x8C55841D, 0 },
+ { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 },
+ { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B4A451A, 0 }, { 0x8B246A88, 0 },
+ { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 },
+ { 0x8AB355E0, 0x8AB355E0 }, { 0x8A8DCD20, 0 },
+ { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 },
+ { 0x89D3507D, 0 }, { 0x89AE408A, 0 },
+ { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F },
+ { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 },
+ { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD },
+ { 0x88ACFAEE, 0 }, { 0x88888889, 0 },
+ { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 },
+ { 0x881BA59E, 0 }, { 0x87F78088, 0 },
+ { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 },
+ { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC },
+ { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 },
+ { 0x86B58AA8, 0 }, { 0x869222B2, 0 },
+ { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 },
+ { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 },
+ { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 },
+ { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 },
+ { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 },
+ { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x8487A2D1, 0 }, { 0x84655D9C, 0 },
+ { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 },
+ { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 },
+ { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 },
+ { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 },
+ { 0x83340520, 0x83340520 }, { 0x83126E98, 0 },
+ { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 },
+ { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 },
+ { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 },
+ { 0x82292F08, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC },
+ { 0x81A55963, 0 }, { 0x81848DA9, 0 },
+ { 0x8163D283, 0 }, { 0x814327E4, 0 },
+ { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80E18AB3, 0 }, { 0x80C121B3, 0 },
+ { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 },
+ { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 },
+ { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF }
+};
diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h
new file mode 100644
index 000000000..e1db0f44d
--- /dev/null
+++ b/third_party/aom/av1/common/odintrin.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AOM_AV1_COMMON_ODINTRIN_H_
+#define AOM_AV1_COMMON_ODINTRIN_H_
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int od_coeff;
+
+#define OD_DIVU_DMAX (1024)
+
+extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
+
+/*Translate unsigned division by small divisors into multiplications.*/
+#define OD_DIVU_SMALL(_x, _d) \
+ ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \
+ OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >> \
+ 32) >> \
+ (OD_ILOG_NZ(_d) - 1))
+
+#define OD_DIVU(_x, _d) \
+ (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d)))
+
+#define OD_MINI AOMMIN
+#define OD_MAXI AOMMAX
+#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
+
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+ OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
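+
+/*Illustrative worked example (not taken from the aom sources): for _d = 7,
+  OD_DIVU_SMALL_CONSTS[6] = { 0x92492492, 0x92492492 } in odintrin.c, so
+  OD_DIVU_SMALL(100, 7) evaluates as
+    ((0x92492492 * (uint64_t)100 + 0x92492492) >> 32) >> (OD_ILOG_NZ(7) - 1)
+      = 57 >> 2
+      = 14,
+  matching 100 / 7 in integer arithmetic while avoiding a hardware divide.*/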
+
+/*Enable special features for gcc and compatible compilers.*/
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define OD_GNUC_PREREQ(maj, min, pat) \
+ ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \
+ ((maj) << 16) + ((min) << 8) + pat) // NOLINT
+#else
+#define OD_GNUC_PREREQ(maj, min, pat) (0)
+#endif
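+
+/*Illustrative note (not from upstream): with gcc 7.3.0 the compiler version
+  packs to (7 << 16) + (3 << 8) + 0 = 0x70300, which is >= (3 << 16) +
+  (4 << 8) + 0 = 0x30400, so OD_GNUC_PREREQ(3, 4, 0) below evaluates to 1 and
+  the attribute-based definitions are used.*/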
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#else
+#define OD_WARN_UNUSED_RESULT
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x)))
+#else
+#define OD_ARG_NONNULL(x)
+#endif
+
+/** Copy n elements of memory from src to dst. The 0* term provides
+ compile-time type checking */
+#if !defined(OVERRIDE_OD_COPY)
+#define OD_COPY(dst, src, n) \
+ (memcpy((dst), (src), sizeof(*(dst)) * (n) + 0 * ((dst) - (src))))
+#endif
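+
+/* Illustrative note (not from upstream): the 0 * ((dst) - (src)) term only
+   compiles when dst and src have compatible pointer types, e.g.
+     int a[4], b[4];
+     OD_COPY(a, b, 4);   // OK
+     short c[4];
+     OD_COPY(a, c, 4);   // error: invalid pointer subtraction (int * vs short *)
+   so element-type mismatches are rejected at build time instead of silently
+   copying the wrong number of bytes. */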
+
+/** Copy n elements of memory from src to dst, allowing overlapping regions.
+ The 0* term provides compile-time type checking */
+#if !defined(OVERRIDE_OD_MOVE)
+# define OD_MOVE(dst, src, n) \
+ (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) ))
+#endif
+
+/*All of these macros should expect floats as arguments.*/
+# define OD_SIGNMASK(a) (-((a) < 0))
+# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ODINTRIN_H_
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
new file mode 100644
index 000000000..ff011c89e
--- /dev/null
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -0,0 +1,1342 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ONYXC_INT_H_
+#define AOM_AV1_COMMON_ONYXC_INT_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/enums.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/mv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/timing.h"
+#include "av1/common/odintrin.h"
+#include "av1/encoder/hash_motion.h"
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_dsp/grain_table.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__clang__) && defined(__has_warning)
+#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
+#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT
+#endif
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT
+#endif
+
+#ifndef AOM_FALLTHROUGH_INTENDED
+#define AOM_FALLTHROUGH_INTENDED \
+ do { \
+ } while (0)
+#endif
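+
+/* Illustrative usage (not from upstream comments): a deliberate switch
+ * fall-through is annotated as
+ *
+ *   case PARTITION_SPLIT:
+ *     if (bsize != BLOCK_8X8) break;
+ *     AOM_FALLTHROUGH_INTENDED;
+ *   case PARTITION_NONE:
+ *     ...
+ *
+ * which silences -Wimplicit-fallthrough on compilers that support the
+ * attribute and expands to a harmless no-op elsewhere; see
+ * update_ext_partition_context() below for the in-tree use. */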
+
+#define CDEF_MAX_STRENGTHS 16
+
+/* Constant values while waiting for the sequence header */
+#define FRAME_ID_LENGTH 15
+#define DELTA_FRAME_ID_LENGTH 14
+
+#define FRAME_CONTEXTS (FRAME_BUFFERS + 1)
+// Extra frame context which is always kept at default values
+#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1)
+#define PRIMARY_REF_BITS 3
+#define PRIMARY_REF_NONE 7
+
+#define NUM_PING_PONG_BUFFERS 2
+
+#define MAX_NUM_TEMPORAL_LAYERS 8
+#define MAX_NUM_SPATIAL_LAYERS 4
+/* clang-format off */
+// clang-format seems to think this is a pointer dereference and not a
+// multiplication.
+#define MAX_NUM_OPERATING_POINTS \
+ MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS
+/* clang-format on */
+
+// TODO(jingning): Turn this on to set up the transform coefficient
+// processing timer.
+#define TXCOEFF_TIMER 0
+#define TXCOEFF_COST_TIMER 0
+
+typedef enum {
+ SINGLE_REFERENCE = 0,
+ COMPOUND_REFERENCE = 1,
+ REFERENCE_MODE_SELECT = 2,
+ REFERENCE_MODES = 3,
+} REFERENCE_MODE;
+
+typedef enum {
+ /**
+ * Frame context updates are disabled
+ */
+ REFRESH_FRAME_CONTEXT_DISABLED,
+ /**
+ * Update frame context to values resulting from backward probability
+ * updates based on entropy/counts in the decoded frame
+ */
+ REFRESH_FRAME_CONTEXT_BACKWARD,
+} REFRESH_FRAME_CONTEXT_MODE;
+
+#define MFMV_STACK_SIZE 3
+typedef struct {
+ int_mv mfmv0;
+ uint8_t ref_frame_offset;
+} TPL_MV_REF;
+
+typedef struct {
+ int_mv mv;
+ MV_REFERENCE_FRAME ref_frame;
+} MV_REF;
+
+typedef struct {
+ int ref_count;
+
+ unsigned int cur_frame_offset;
+ unsigned int ref_frame_offset[INTER_REFS_PER_FRAME];
+
+ MV_REF *mvs;
+ uint8_t *seg_map;
+ struct segmentation seg;
+ int mi_rows;
+ int mi_cols;
+ // Width and height give the size of the buffer (before any upscaling, unlike
+ // the sizes that can be derived from the buf structure)
+ int width;
+ int height;
+ WarpedMotionParams global_motion[REF_FRAMES];
+ int showable_frame; // frame can be used as show existing frame in future
+ int film_grain_params_present;
+ aom_film_grain_t film_grain_params;
+ aom_codec_frame_buffer_t raw_frame_buffer;
+ YV12_BUFFER_CONFIG buf;
+ hash_table hash_table;
+ uint8_t intra_only;
+ FRAME_TYPE frame_type;
+  // The following variables will only be used in frame parallel decode.
+
+ // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+ // that no FrameWorker owns, or is decoding, this buffer.
+ AVxWorker *frame_worker_owner;
+
+ // row and col indicate which position frame has been decoded to in real
+ // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
+ // when the frame is fully decoded.
+ int row;
+ int col;
+
+ // Inter frame reference frame delta for loop filter
+ int8_t ref_deltas[REF_FRAMES];
+
+ // 0 = ZERO_MV, MV
+ int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+} RefCntBuffer;
+
+typedef struct BufferPool {
+// Protect BufferPool from being accessed by several FrameWorkers at
+// the same time during frame parallel decode.
+// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t pool_mutex;
+#endif
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
+
+ aom_get_frame_buffer_cb_fn_t get_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+ // Frame buffers allocated internally by the codec.
+ InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+typedef struct {
+ int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/]
+ [BASE_CONTEXT_POSITION_NUM + 1];
+} LV_MAP_CTX_TABLE;
+typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/]
+ [BASE_CONTEXT_POSITION_NUM + 1];
+
+typedef struct BitstreamLevel {
+ uint8_t major;
+ uint8_t minor;
+} BitstreamLevel;
+
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency is checked by the are_seq_headers_consistent() function.
+typedef struct SequenceHeader {
+ int num_bits_width;
+ int num_bits_height;
+ int max_frame_width;
+ int max_frame_height;
+ int frame_id_numbers_present_flag;
+ int frame_id_length;
+ int delta_frame_id_length;
+ BLOCK_SIZE sb_size; // Size of the superblock used for this frame
+ int mib_size; // Size of the superblock in units of MI blocks
+ int mib_size_log2; // Log 2 of above.
+ int order_hint_bits_minus_1;
+ int force_screen_content_tools; // 0 - force off
+ // 1 - force on
+ // 2 - adaptive
+ int force_integer_mv; // 0 - Not to force. MV can be in 1/4 or 1/8
+ // 1 - force to integer
+ // 2 - adaptive
+ int still_picture; // Video is a single frame still picture
+ int reduced_still_picture_hdr; // Use reduced header for still picture
+ int enable_filter_intra; // enables/disables filterintra
+ int enable_intra_edge_filter; // enables/disables corner/edge/upsampling
+ int enable_interintra_compound; // enables/disables interintra_compound
+ int enable_masked_compound; // enables/disables masked compound
+ int enable_dual_filter; // 0 - disable dual interpolation filter
+ // 1 - enable vert/horiz filter selection
+ int enable_order_hint; // 0 - disable order hint, and related tools
+                          // jnt_comp, ref_frame_mvs, frame_sign_bias;
+                          // if 0, enable_jnt_comp and
+                          // enable_ref_frame_mvs must be set to 0.
+ int enable_jnt_comp; // 0 - disable joint compound modes
+ // 1 - enable it
+ int enable_ref_frame_mvs; // 0 - disable ref frame mvs
+ // 1 - enable it
+ int enable_warped_motion; // 0 - disable warped motion for sequence
+ // 1 - enable it for the sequence
+ int enable_superres; // 0 - Disable superres for the sequence, and disable
+ // transmitting per-frame superres enabled flag.
+ // 1 - Enable superres for the sequence, and also
+ // enable per-frame flag to denote if superres is
+ // enabled for that frame.
+ int enable_cdef; // To turn on/off CDEF
+ int enable_restoration; // To turn on/off loop restoration
+ BITSTREAM_PROFILE profile;
+
+ // Operating point info.
+ int operating_points_cnt_minus_1;
+ int operating_point_idc[MAX_NUM_OPERATING_POINTS];
+ int display_model_info_present_flag;
+ int decoder_model_info_present_flag;
+ BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
+ uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0
+ // or 1.
+
+ // Color config.
+ aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
+ // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+ int use_highbitdepth; // If true, we need to use 16bit frame buffers.
+  int monochrome;  // Monochrome video
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ int color_range;
+ int subsampling_x; // Chroma subsampling for x
+ int subsampling_y; // Chroma subsampling for y
+ aom_chroma_sample_position_t chroma_sample_position;
+ int separate_uv_delta_q;
+
+ int film_grain_params_present;
+} SequenceHeader;
+
+typedef struct AV1Common {
+ struct aom_internal_error_info error;
+ int width;
+ int height;
+ int render_width;
+ int render_height;
+ int last_width;
+ int last_height;
+ int timing_info_present;
+ aom_timing_info_t timing_info;
+ int buffer_removal_time_present;
+ aom_dec_model_info_t buffer_model;
+ aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+ aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
+ uint32_t frame_presentation_time;
+
+ int largest_tile_id;
+ size_t largest_tile_size;
+ int context_update_tile_id;
+
+ // Scale of the current frame with respect to itself.
+ struct scale_factors sf_identity;
+
+ YV12_BUFFER_CONFIG *frame_to_show;
+ RefCntBuffer *prev_frame;
+
+ // TODO(hkuang): Combine this with cur_buf in macroblockd.
+ RefCntBuffer *cur_frame;
+
+ int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
+
+ // Prepare ref_frame_map for the next frame.
+ // Only used in frame parallel decode.
+ int next_ref_frame_map[REF_FRAMES];
+
+ // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
+ // roll new_fb_idx into it.
+
+ // Each Inter frame can reference INTER_REFS_PER_FRAME buffers
+ RefBuffer frame_refs[INTER_REFS_PER_FRAME];
+ int is_skip_mode_allowed;
+ int skip_mode_flag;
+ int ref_frame_idx_0;
+ int ref_frame_idx_1;
+
+ int new_fb_idx;
+
+ FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+ int showable_frame; // frame can be used as show existing frame in future
+ int last_show_frame;
+ int show_existing_frame;
+ // Flag for a frame used as a reference - not written to the bitstream
+ int is_reference_frame;
+ int reset_decoder_state;
+
+ // Flag signaling that the frame is encoded using only INTRA modes.
+ uint8_t intra_only;
+ uint8_t last_intra_only;
+ uint8_t disable_cdf_update;
+ int allow_high_precision_mv;
+  int cur_frame_force_integer_mv;  // 0 - the default in AOM,
+                                   // 1 - use only integer MVs
+
+ int allow_screen_content_tools;
+ int allow_intrabc;
+ int allow_warped_motion;
+
+ // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
+ // MB_MODE_INFO (8-pixel) units.
+ int MBs;
+ int mb_rows, mi_rows;
+ int mb_cols, mi_cols;
+ int mi_stride;
+
+ /* profile settings */
+ TX_MODE tx_mode;
+
+#if CONFIG_ENTROPY_STATS
+ int coef_cdf_category;
+#endif
+
+ int base_qindex;
+ int y_dc_delta_q;
+ int u_dc_delta_q;
+ int v_dc_delta_q;
+ int u_ac_delta_q;
+ int v_ac_delta_q;
+
+  // The dequantizers below are true dequantizers used only in the
+ // dequantization process. They have the same coefficient
+ // shift/scale as TX.
+ int16_t y_dequant_QTX[MAX_SEGMENTS][2];
+ int16_t u_dequant_QTX[MAX_SEGMENTS][2];
+ int16_t v_dequant_QTX[MAX_SEGMENTS][2];
+
+ // Global quant matrix tables
+ const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+ const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+
+ // Local quant matrix tables for each frame
+ const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+ // Encoder
+ int using_qmatrix;
+ int qm_y;
+ int qm_u;
+ int qm_v;
+ int min_qmlevel;
+ int max_qmlevel;
+
+ /* We allocate a MB_MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+ int mi_alloc_size;
+ MB_MODE_INFO *mip; /* Base of allocated array */
+ MB_MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+
+ // TODO(agrange): Move prev_mi into encoder structure.
+ // prev_mip and prev_mi will only be allocated in encoder.
+ MB_MODE_INFO *prev_mip; /* MB_MODE_INFO array 'mip' from last decoded frame */
+ MB_MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+
+ // Separate mi functions between encoder and decoder.
+ int (*alloc_mi)(struct AV1Common *cm, int mi_size);
+ void (*free_mi)(struct AV1Common *cm);
+ void (*setup_mi)(struct AV1Common *cm);
+
+ // Grid of pointers to 8x8 MB_MODE_INFO structs. Any 8x8 not in the visible
+ // area will be NULL.
+ MB_MODE_INFO **mi_grid_base;
+ MB_MODE_INFO **mi_grid_visible;
+ MB_MODE_INFO **prev_mi_grid_base;
+ MB_MODE_INFO **prev_mi_grid_visible;
+
+ // Whether to use previous frames' motion vectors for prediction.
+ int allow_ref_frame_mvs;
+
+ uint8_t *last_frame_seg_map;
+ uint8_t *current_frame_seg_map;
+ int seg_map_alloc_size;
+
+ InterpFilter interp_filter;
+
+ int switchable_motion_mode;
+
+ loop_filter_info_n lf_info;
+ // The denominator of the superres scale; the numerator is fixed.
+ uint8_t superres_scale_denominator;
+ int superres_upscaled_width;
+ int superres_upscaled_height;
+ RestorationInfo rst_info[MAX_MB_PLANE];
+
+ // rst_end_stripe[i] is one more than the index of the bottom stripe
+ // for tile row i.
+ int rst_end_stripe[MAX_TILE_ROWS];
+
+ // Pointer to a scratch buffer used by self-guided restoration
+ int32_t *rst_tmpbuf;
+ RestorationLineBuffers *rlbs;
+
+ // Output of loop restoration
+ YV12_BUFFER_CONFIG rst_frame;
+
+ // Flag signaling how frame contexts should be updated at the end of
+ // a frame decode
+ REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+
+ int ref_frame_sign_bias[REF_FRAMES]; /* Two state 0, 1 */
+
+ struct loopfilter lf;
+ struct segmentation seg;
+ int coded_lossless; // frame is fully lossless at the coded resolution.
+ int all_lossless; // frame is fully lossless at the upscaled resolution.
+
+ int reduced_tx_set_used;
+
+ // Context probabilities for reference frame prediction
+ MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS];
+ MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
+ REFERENCE_MODE reference_mode;
+
+ FRAME_CONTEXT *fc; /* this frame entropy */
+ FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS
+ unsigned int frame_context_idx; /* Context to use/update */
+ int fb_of_context_type[REF_FRAMES];
+ int primary_ref_frame;
+
+ unsigned int frame_offset;
+
+ unsigned int current_video_frame;
+
+ aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer
+
+ int error_resilient_mode;
+ int force_primary_ref_none;
+
+ int tile_cols, tile_rows;
+ int last_tile_cols, last_tile_rows;
+
+ int max_tile_width_sb;
+ int min_log2_tile_cols;
+ int max_log2_tile_cols;
+ int max_log2_tile_rows;
+ int min_log2_tile_rows;
+ int min_log2_tiles;
+ int max_tile_height_sb;
+ int uniform_tile_spacing_flag;
+ int log2_tile_cols; // only valid for uniform tiles
+ int log2_tile_rows; // only valid for uniform tiles
+ int tile_col_start_sb[MAX_TILE_COLS + 1]; // valid for 0 <= i <= tile_cols
+ int tile_row_start_sb[MAX_TILE_ROWS + 1]; // valid for 0 <= i <= tile_rows
+ int tile_width, tile_height; // In MI units
+
+ unsigned int large_scale_tile;
+ unsigned int single_tile_decoding;
+
+ int byte_alignment;
+ int skip_loop_filter;
+ int skip_film_grain;
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
+ aom_get_frame_buffer_cb_fn_t get_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ // Handles memory for the codec.
+ InternalFrameBufferList int_frame_buffers;
+
+ // External BufferPool passed from outside.
+ BufferPool *buffer_pool;
+
+ PARTITION_CONTEXT **above_seg_context;
+ ENTROPY_CONTEXT **above_context[MAX_MB_PLANE];
+ TXFM_CONTEXT **above_txfm_context;
+ WarpedMotionParams global_motion[REF_FRAMES];
+ aom_film_grain_t film_grain_params;
+
+ int cdef_pri_damping;
+ int cdef_sec_damping;
+ int nb_cdef_strengths;
+ int cdef_strengths[CDEF_MAX_STRENGTHS];
+ int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+ int cdef_bits;
+
+ int delta_q_present_flag;
+ // Resolution of delta quant
+ int delta_q_res;
+ int delta_lf_present_flag;
+ // Resolution of delta lf level
+ int delta_lf_res;
+  // Flag controlling the number of loop filter level deltas:
+  // 0: use a single delta for y_vertical, y_horizontal, u, and v
+  // 1: use a separate delta for each filter level
+ int delta_lf_multi;
+ int num_tg;
+ SequenceHeader seq_params;
+ int current_frame_id;
+ int ref_frame_id[REF_FRAMES];
+ int valid_for_referencing[REF_FRAMES];
+ int invalid_delta_frame_id_minus_1;
+ LV_MAP_CTX_TABLE coeff_ctx_table;
+ TPL_MV_REF *tpl_mvs;
+ int tpl_mvs_mem_size;
+ // TODO(jingning): This can be combined with sign_bias later.
+ int8_t ref_frame_side[REF_FRAMES];
+
+ int is_annexb;
+
+ int frame_refs_short_signaling;
+ int temporal_layer_id;
+ int spatial_layer_id;
+ unsigned int number_temporal_layers;
+ unsigned int number_spatial_layers;
+ int num_allocated_above_context_mi_col;
+ int num_allocated_above_contexts;
+ int num_allocated_above_context_planes;
+
+#if TXCOEFF_TIMER
+ int64_t cum_txcoeff_timer;
+ int64_t txcoeff_timer;
+ int txb_count;
+#endif
+
+#if TXCOEFF_COST_TIMER
+ int64_t cum_txcoeff_cost_timer;
+ int64_t txcoeff_cost_timer;
+ int64_t txcoeff_cost_count;
+#endif
+ const cfg_options_t *options;
+} AV1_COMMON;
+
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+static void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
+
+static void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
+ if (index < 0 || index >= REF_FRAMES) return NULL;
+ if (cm->ref_frame_map[index] < 0) return NULL;
+ assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
+ return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(
+ const AV1_COMMON *const cm) {
+ return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
+}
+
+static INLINE int get_free_fb(AV1_COMMON *cm) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int i;
+
+ lock_buffer_pool(cm->buffer_pool);
+ for (i = 0; i < FRAME_BUFFERS; ++i)
+ if (frame_bufs[i].ref_count == 0) break;
+
+ if (i != FRAME_BUFFERS) {
+ if (frame_bufs[i].buf.use_external_reference_buffers) {
+      // If this frame buffer's y_buffer, u_buffer, and v_buffer point to
+      // external reference buffers, restore the buffer pointers to point to
+      // the internally allocated memory.
+ YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
+ ybf->y_buffer = ybf->store_buf_adr[0];
+ ybf->u_buffer = ybf->store_buf_adr[1];
+ ybf->v_buffer = ybf->store_buf_adr[2];
+ ybf->use_external_reference_buffers = 0;
+ }
+
+ frame_bufs[i].ref_count = 1;
+ } else {
+ // Reset i to be INVALID_IDX to indicate no free buffer found.
+ i = INVALID_IDX;
+ }
+
+ unlock_buffer_pool(cm->buffer_pool);
+ return i;
+}
+
+static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
+ const int ref_index = *idx;
+
+ if (ref_index >= 0 && bufs[ref_index].ref_count > 0)
+ bufs[ref_index].ref_count--;
+
+ *idx = new_idx;
+
+ bufs[new_idx].ref_count++;
+}
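+
+// Illustrative sketch (the exact call sites live in the encoder/decoder, not
+// in this header): retargeting a reference slot to the newly decoded buffer
+// would look something like
+//   ref_cnt_fb(cm->buffer_pool->frame_bufs,
+//              &cm->ref_frame_map[slot], cm->new_fb_idx);
+// which drops one reference on the buffer previously mapped at `slot` and
+// takes one on `new_fb_idx`.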
+
+static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
+ return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
+static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
+ return cm->frame_type == S_FRAME;
+}
+
+static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) {
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
+ cm->frame_refs[cm->primary_ref_frame].idx == INVALID_IDX) {
+ return NULL;
+ } else {
+ return &cm->buffer_pool
+ ->frame_bufs[cm->frame_refs[cm->primary_ref_frame].idx];
+ }
+}
+
+// Returns 1 if this frame might allow mvs from some reference frame.
+static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
+ return !cm->error_resilient_mode && cm->seq_params.enable_ref_frame_mvs &&
+ cm->seq_params.enable_order_hint && !frame_is_intra_only(cm);
+}
+
+// Returns 1 if this frame might use warped_motion
+static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
+ return !cm->error_resilient_mode && !frame_is_intra_only(cm) &&
+ cm->seq_params.enable_warped_motion;
+}
+
+static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
+ const int buf_rows = buf->mi_rows;
+ const int buf_cols = buf->mi_cols;
+
+ if (buf->mvs == NULL || buf_rows != cm->mi_rows || buf_cols != cm->mi_cols) {
+ aom_free(buf->mvs);
+ buf->mi_rows = cm->mi_rows;
+ buf->mi_cols = cm->mi_cols;
+ CHECK_MEM_ERROR(cm, buf->mvs,
+ (MV_REF *)aom_calloc(
+ ((cm->mi_rows + 1) >> 1) * ((cm->mi_cols + 1) >> 1),
+ sizeof(*buf->mvs)));
+ aom_free(buf->seg_map);
+ CHECK_MEM_ERROR(cm, buf->seg_map,
+ (uint8_t *)aom_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*buf->seg_map)));
+ }
+
+ const int mem_size =
+ ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+ int realloc = cm->tpl_mvs == NULL;
+ if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size;
+
+ if (realloc) {
+ aom_free(cm->tpl_mvs);
+ CHECK_MEM_ERROR(cm, cm->tpl_mvs,
+ (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
+ cm->tpl_mvs_mem_size = mem_size;
+ }
+}
+
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
+
+static INLINE int av1_num_planes(const AV1_COMMON *cm) {
+ return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
+}
+
+static INLINE void av1_init_above_context(AV1_COMMON *cm, MACROBLOCKD *xd,
+ const int tile_row) {
+ const int num_planes = av1_num_planes(cm);
+ for (int i = 0; i < num_planes; ++i) {
+ xd->above_context[i] = cm->above_context[i][tile_row];
+ }
+ xd->above_seg_context = cm->above_seg_context[tile_row];
+ xd->above_txfm_context = cm->above_txfm_context[tile_row];
+}
+
+static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
+ tran_low_t *dqcoeff) {
+ const int num_planes = av1_num_planes(cm);
+ for (int i = 0; i < num_planes; ++i) {
+ xd->plane[i].dqcoeff = dqcoeff;
+
+ if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
+ memcpy(xd->plane[i].seg_dequant_QTX, cm->y_dequant_QTX,
+ sizeof(cm->y_dequant_QTX));
+ memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix));
+
+ } else {
+ if (i == AOM_PLANE_U) {
+ memcpy(xd->plane[i].seg_dequant_QTX, cm->u_dequant_QTX,
+ sizeof(cm->u_dequant_QTX));
+ memcpy(xd->plane[i].seg_iqmatrix, cm->u_iqmatrix,
+ sizeof(cm->u_iqmatrix));
+ } else {
+ memcpy(xd->plane[i].seg_dequant_QTX, cm->v_dequant_QTX,
+ sizeof(cm->v_dequant_QTX));
+ memcpy(xd->plane[i].seg_iqmatrix, cm->v_iqmatrix,
+ sizeof(cm->v_iqmatrix));
+ }
+ }
+ }
+ xd->mi_stride = cm->mi_stride;
+ xd->error_info = &cm->error;
+ cfl_init(&xd->cfl, &cm->seq_params);
+}
+
+static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+ const int num_planes) {
+ int i;
+ int row_offset = mi_row;
+ int col_offset = mi_col;
+ for (i = 0; i < num_planes; ++i) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ // Offset the buffer pointer
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
+ row_offset = mi_row - 1;
+ if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
+ col_offset = mi_col - 1;
+ int above_idx = col_offset;
+ int left_idx = row_offset & MAX_MIB_MASK;
+ pd->above_context = &xd->above_context[i][above_idx >> pd->subsampling_x];
+ pd->left_context = &xd->left_context[i][left_idx >> pd->subsampling_y];
+ }
+}
+
+static INLINE int calc_mi_size(int len) {
+ // len is in mi units. Align to a multiple of SBs.
+ return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
+}
+
+static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh,
+ const int num_planes) {
+ int i;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x;
+ xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y;
+
+ xd->plane[i].width = AOMMAX(xd->plane[i].width, 4);
+ xd->plane[i].height = AOMMAX(xd->plane[i].height, 4);
+ }
+}
+
+static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+ int mi_row, int bh, int mi_col, int bw,
+ int mi_rows, int mi_cols) {
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
+
+ // Are edges available for intra prediction?
+ xd->up_available = (mi_row > tile->mi_row_start);
+
+ const int ss_x = xd->plane[1].subsampling_x;
+ const int ss_y = xd->plane[1].subsampling_y;
+
+ xd->left_available = (mi_col > tile->mi_col_start);
+ xd->chroma_up_available = xd->up_available;
+ xd->chroma_left_available = xd->left_available;
+ if (ss_x && bw < mi_size_wide[BLOCK_8X8])
+ xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
+ if (ss_y && bh < mi_size_high[BLOCK_8X8])
+ xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
+ if (xd->up_available) {
+ xd->above_mbmi = xd->mi[-xd->mi_stride];
+ } else {
+ xd->above_mbmi = NULL;
+ }
+
+ if (xd->left_available) {
+ xd->left_mbmi = xd->mi[-1];
+ } else {
+ xd->left_mbmi = NULL;
+ }
+
+ const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
+ ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
+ if (chroma_ref) {
+ // To help calculate the "above" and "left" chroma blocks, note that the
+    // current block may cover multiple luma blocks (e.g., if partitioned into
+ // 4x4 luma blocks).
+ // First, find the top-left-most luma block covered by this chroma block
+ MB_MODE_INFO **base_mi =
+ &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
+
+ // Then, we consider the luma region covered by the left or above 4x4 chroma
+ // prediction. We want to point to the chroma reference block in that
+ // region, which is the bottom-right-most mi unit.
+ // This leads to the following offsets:
+ MB_MODE_INFO *chroma_above_mi =
+ xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
+ xd->chroma_above_mbmi = chroma_above_mi;
+
+ MB_MODE_INFO *chroma_left_mi =
+ xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
+ xd->chroma_left_mbmi = chroma_left_mi;
+ }
+
+ xd->n4_h = bh;
+ xd->n4_w = bw;
+ xd->is_sec_rect = 0;
+ if (xd->n4_w < xd->n4_h) {
+ // Only mark is_sec_rect as 1 for the last block.
+ // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
+ // For other partitions, it would be (0, 1).
+ if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1;
+ }
+
+ if (xd->n4_w > xd->n4_h)
+ if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1;
+}
+
+static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi) {
+ const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[above];
+ const int left_ctx = intra_mode_context[left];
+ return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
+}
+
+static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize) {
+ PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
+ PARTITION_CONTEXT *const left_ctx =
+ xd->left_seg_context + (mi_row & MAX_MIB_MASK);
+
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ memset(above_ctx, partition_context_lookup[subsize].above, bw);
+ memset(left_ctx, partition_context_lookup[subsize].left, bh);
+}
+
+static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int subsampling_x, int subsampling_y) {
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) &&
+ ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x);
+ return ref_pos;
+}
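+
+// Worked example (illustrative, assuming 4:2:0 subsampling): a BLOCK_4X4 at
+// (mi_row, mi_col) = (0, 0) has bw = bh = 1, so is_chroma_reference() returns
+// 0 -- its chroma is signalled with the bottom-right 4x4 block of the 8x8
+// luma area, at odd (mi_row, mi_col). Any block whose width and height are
+// both at least 8 luma samples (even bw and bh) is always a chroma reference.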
+
+static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
+ int subsampling_y) {
+ BLOCK_SIZE bs = bsize;
+ switch (bsize) {
+ case BLOCK_4X4:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X4;
+ else if (subsampling_y == 1)
+ bs = BLOCK_4X8;
+ break;
+ case BLOCK_4X8:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_y == 1)
+ bs = BLOCK_4X8;
+ break;
+ case BLOCK_8X4:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X4;
+ else if (subsampling_y == 1)
+ bs = BLOCK_8X8;
+ break;
+ case BLOCK_4X16:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_8X16;
+ else if (subsampling_x == 1)
+ bs = BLOCK_8X16;
+ else if (subsampling_y == 1)
+ bs = BLOCK_4X16;
+ break;
+ case BLOCK_16X4:
+ if (subsampling_x == 1 && subsampling_y == 1)
+ bs = BLOCK_16X8;
+ else if (subsampling_x == 1)
+ bs = BLOCK_16X4;
+ else if (subsampling_y == 1)
+ bs = BLOCK_16X8;
+ break;
+ default: break;
+ }
+ return bs;
+}
+
+static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
+ size_t element) {
+ assert(cdf != NULL);
+ return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element];
+}
+
+static INLINE void partition_gather_horz_alike(aom_cdf_prob *out,
+ const aom_cdf_prob *const in,
+ BLOCK_SIZE bsize) {
+ (void)bsize;
+ out[0] = CDF_PROB_TOP;
+ out[0] -= cdf_element_prob(in, PARTITION_HORZ);
+ out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+ out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+ out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
+ out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+ if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
+ out[0] = AOM_ICDF(out[0]);
+ out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+static INLINE void partition_gather_vert_alike(aom_cdf_prob *out,
+ const aom_cdf_prob *const in,
+ BLOCK_SIZE bsize) {
+ (void)bsize;
+ out[0] = CDF_PROB_TOP;
+ out[0] -= cdf_element_prob(in, PARTITION_VERT);
+ out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+ out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+ out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+ out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
+ if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
+ out[0] = AOM_ICDF(out[0]);
+ out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize,
+ PARTITION_TYPE partition) {
+ if (bsize >= BLOCK_8X8) {
+ const int hbs = mi_size_wide[bsize] / 2;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+ switch (partition) {
+ case PARTITION_SPLIT:
+ if (bsize != BLOCK_8X8) break;
+ AOM_FALLTHROUGH_INTENDED;
+ case PARTITION_NONE:
+ case PARTITION_HORZ:
+ case PARTITION_VERT:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+ break;
+ case PARTITION_HORZ_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+ break;
+ case PARTITION_VERT_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+ break;
+ case PARTITION_VERT_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+ }
+}
+
+static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx =
+ xd->left_seg_context + (mi_row & MAX_MIB_MASK);
+ // Minimum partition point is 8x8. Offset the bsl accordingly.
+ const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
+ int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
+
+ assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
+ assert(bsl >= 0);
+
+ return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+}
+
+// Return the number of elements in the partition CDF when
+// partitioning the (square) block with luma block size of bsize.
+static INLINE int partition_cdf_length(BLOCK_SIZE bsize) {
+ if (bsize <= BLOCK_8X8)
+ return PARTITION_TYPES;
+ else if (bsize == BLOCK_128X128)
+ return EXT_PARTITION_TYPES - 2;
+ else
+ return EXT_PARTITION_TYPES;
+}
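+
+// For example, with PARTITION_TYPES == 4 and EXT_PARTITION_TYPES == 10 this
+// yields 4 symbols for 8x8 blocks, 8 symbols for 128x128 blocks (which cannot
+// use PARTITION_HORZ_4/PARTITION_VERT_4) and 10 symbols for the remaining
+// square sizes.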
+
+static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
+ int max_blocks_wide = block_size_wide[bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ if (xd->mb_to_right_edge < 0)
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ // Scale the width in the transform block unit.
+ return max_blocks_wide >> tx_size_wide_log2[0];
+}
+
+static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
+ int max_blocks_high = block_size_high[bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ if (xd->mb_to_bottom_edge < 0)
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+
+ // Scale the height in the transform block unit.
+ return max_blocks_high >> tx_size_high_log2[0];
+}
+
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+ BLOCK_SIZE plane_bsize, int plane,
+ TX_SIZE tx_size) {
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
+ << tx_size_wide_log2[0];
+ return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
+}
+
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+ BLOCK_SIZE plane_bsize, int plane,
+ TX_SIZE tx_size) {
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
+ << tx_size_high_log2[0];
+ return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
+}
+
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ int mi_col_start, int mi_col_end, const int tile_row) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ const int width = mi_col_end - mi_col_start;
+ const int aligned_width =
+ ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
+
+ const int offset_y = mi_col_start;
+ const int width_y = aligned_width;
+ const int offset_uv = offset_y >> seq_params->subsampling_x;
+ const int width_uv = width_y >> seq_params->subsampling_x;
+
+ av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y);
+ if (num_planes > 1) {
+ if (cm->above_context[1][tile_row] && cm->above_context[2][tile_row]) {
+ av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv);
+ av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv);
+ } else {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid value of planes");
+ }
+ }
+
+ av1_zero_array(cm->above_seg_context[tile_row] + mi_col_start, aligned_width);
+
+ memset(cm->above_txfm_context[tile_row] + mi_col_start,
+ tx_size_wide[TX_SIZES_LARGEST],
+ aligned_width * sizeof(TXFM_CONTEXT));
+}
+
+static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
+ av1_zero(xd->left_context);
+ av1_zero(xd->left_seg_context);
+
+ memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
+ sizeof(xd->left_txfm_context_buffer));
+}
+
+// Disable array-bounds checks as the TX_SIZE enum contains values larger than
+// TX_SIZES_ALL (TX_INVALID), which makes extending the array as a workaround
+// infeasible. The assert is enough for static analysis, and tools such as
+// asan or valgrind would catch out-of-bounds access at runtime.
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic warning "-Warray-bounds"
+#endif
+
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
+ int i;
+ for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
+}
+
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
+ const MACROBLOCKD *xd) {
+ uint8_t bw = tx_size_wide[tx_size];
+ uint8_t bh = tx_size_high[tx_size];
+
+ if (skip) {
+ bw = n4_w * MI_SIZE;
+ bh = n4_h * MI_SIZE;
+ }
+
+ set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
+ set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
+}
+
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+ TXFM_CONTEXT *left_ctx,
+ TX_SIZE tx_size, TX_SIZE txb_size) {
+ BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
+ int bh = mi_size_high[bsize];
+ int bw = mi_size_wide[bsize];
+ uint8_t txw = tx_size_wide[tx_size];
+ uint8_t txh = tx_size_high[tx_size];
+ int i;
+ for (i = 0; i < bh; ++i) left_ctx[i] = txh;
+ for (i = 0; i < bw; ++i) above_ctx[i] = txw;
+}
+
+static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
+ switch (tx_dim) {
+ case 128:
+ case 64: return TX_64X64; break;
+ case 32: return TX_32X32; break;
+ case 16: return TX_16X16; break;
+ case 8: return TX_8X8; break;
+ default: return TX_4X4;
+ }
+}
+
+static INLINE TX_SIZE get_tx_size(int width, int height) {
+ if (width == height) {
+ return get_sqr_tx_size(width);
+ }
+ if (width < height) {
+ if (width + width == height) {
+ switch (width) {
+ case 4: return TX_4X8; break;
+ case 8: return TX_8X16; break;
+ case 16: return TX_16X32; break;
+ case 32: return TX_32X64; break;
+ }
+ } else {
+ switch (width) {
+ case 4: return TX_4X16; break;
+ case 8: return TX_8X32; break;
+ case 16: return TX_16X64; break;
+ }
+ }
+ } else {
+ if (height + height == width) {
+ switch (height) {
+ case 4: return TX_8X4; break;
+ case 8: return TX_16X8; break;
+ case 16: return TX_32X16; break;
+ case 32: return TX_64X32; break;
+ }
+ } else {
+ switch (height) {
+ case 4: return TX_16X4; break;
+ case 8: return TX_32X8; break;
+ case 16: return TX_64X16; break;
+ }
+ }
+ }
+ assert(0);
+ return TX_4X4;
+}
+
+static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
+ TXFM_CONTEXT *left_ctx,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ const int above = *above_ctx < txw;
+ const int left = *left_ctx < txh;
+ int category = TXFM_PARTITION_CONTEXTS;
+
+  // Dummy return for TX_4X4; the value is not used by callers.
+ if (tx_size <= TX_4X4) return 0;
+
+ TX_SIZE max_tx_size =
+ get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
+
+ if (max_tx_size >= TX_8X8) {
+ category =
+ (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
+ (TX_SIZES - 1 - max_tx_size) * 2;
+ }
+ assert(category != TXFM_PARTITION_CONTEXTS);
+ return category * 3 + above + left;
+}
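+
+// Worked example (assuming the usual enum values TX_8X8 == 1, TX_16X16 == 2,
+// TX_32X32 == 3 and TX_SIZES == 5): for a 32x32 block coded with TX_16X16,
+// max_tx_size is TX_32X32, so category = 1 + (5 - 1 - 3) * 2 = 3 and the
+// returned context is 9, 10 or 11 depending on the above/left comparisons.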
+
+// Compute the next partition in the direction of the sb_type stored in the mi
+// array, starting with bsize.
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return PARTITION_INVALID;
+
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ MB_MODE_INFO **mi = cm->mi_grid_visible + offset;
+ const BLOCK_SIZE subsize = mi[0]->sb_type;
+
+ if (subsize == bsize) return PARTITION_NONE;
+
+ const int bhigh = mi_size_high[bsize];
+ const int bwide = mi_size_wide[bsize];
+ const int sshigh = mi_size_high[subsize];
+ const int sswide = mi_size_wide[subsize];
+
+ if (bsize > BLOCK_8X8 && mi_row + bwide / 2 < cm->mi_rows &&
+ mi_col + bhigh / 2 < cm->mi_cols) {
+ // In this case, the block might be using an extended partition
+ // type.
+ const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
+ const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * cm->mi_stride];
+
+ if (sswide == bwide) {
+ // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
+ // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
+ // half was split.
+ if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
+ assert(sshigh * 2 == bhigh);
+
+ if (mbmi_below->sb_type == subsize)
+ return PARTITION_HORZ;
+ else
+ return PARTITION_HORZ_B;
+ } else if (sshigh == bhigh) {
+ // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
+ // PARTITION_VERT_B. To distinguish the latter two, check if the right
+ // half was split.
+ if (sswide * 4 == bwide) return PARTITION_VERT_4;
+ assert(sswide * 2 == bhigh);
+
+ if (mbmi_right->sb_type == subsize)
+ return PARTITION_VERT;
+ else
+ return PARTITION_VERT_B;
+ } else {
+ // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
+ // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
+ // dimensions, we immediately know this is a split (which will recurse to
+ // get to subsize). Otherwise look down and to the right. With
+ // PARTITION_VERT_A, the right block will have height bhigh; with
+      // PARTITION_HORZ_A, the lower block will have width bwide. Otherwise
+ // it's PARTITION_SPLIT.
+ if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
+
+ if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
+ if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
+
+ return PARTITION_SPLIT;
+ }
+ }
+ const int vert_split = sswide < bwide;
+ const int horz_split = sshigh < bhigh;
+ const int split_idx = (vert_split << 1) | horz_split;
+ assert(split_idx != 0);
+
+ static const PARTITION_TYPE base_partitions[4] = {
+ PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
+ };
+
+ return base_partitions[split_idx];
+}
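+
+// For illustration, take a square 32x32 block whose top-left mi unit records
+// sb_type BLOCK_16X16 (halved in both dimensions): if the mi unit below spans
+// the full width (sb_type BLOCK_32X16) the result is PARTITION_HORZ_A, if the
+// mi unit to the right spans the full height (sb_type BLOCK_16X32) it is
+// PARTITION_VERT_A, and otherwise it is PARTITION_SPLIT.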
+
+static INLINE void set_use_reference_buffer(AV1_COMMON *const cm, int use) {
+ cm->seq_params.frame_id_numbers_present_flag = use;
+}
+
+static INLINE void set_sb_size(SequenceHeader *const seq_params,
+ BLOCK_SIZE sb_size) {
+ seq_params->sb_size = sb_size;
+ seq_params->mib_size = mi_size_wide[seq_params->sb_size];
+ seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
+}
+
+// Returns true if the frame is fully lossless at the coded resolution.
+// Note: If super-resolution is used, such a frame will still NOT be lossless at
+// the upscaled resolution.
+static INLINE int is_coded_lossless(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ int coded_lossless = 1;
+ if (cm->seg.enabled) {
+ for (int i = 0; i < MAX_SEGMENTS; ++i) {
+ if (!xd->lossless[i]) {
+ coded_lossless = 0;
+ break;
+ }
+ }
+ } else {
+ coded_lossless = xd->lossless[0];
+ }
+ return coded_lossless;
+}
+
+static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) {
+ return seq_level_idx < 24 || seq_level_idx == 31;
+}
+
+static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
+ assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX);
+  // Since bl.minor is unsigned, comparing it against LEVEL_MINOR_MIN (0) would
+  // trigger a "comparison is always true due to limited range of data type"
+  // warning, so assert LEVEL_MINOR_MIN == 0 and only check the upper bound.
+ assert(LEVEL_MINOR_MIN == 0);
+ assert(bl.minor <= LEVEL_MINOR_MAX);
+ return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ONYXC_INT_H_
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
new file mode 100644
index 000000000..026a07809
--- /dev/null
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <altivec.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#define OFF_0 0
+#define OFF_1 16
+#define OFF_2 32
+#define OFF_3 48
+#define CFL_BUF_LINE_BYTES 64
+#define CFL_LINE_1 64
+#define CFL_LINE_2 128
+#define CFL_LINE_3 192
+
+typedef vector signed char int8x16_t; // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int)
+typedef vector signed short int16x8_t; // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int)
+typedef vector signed int int32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int)
+
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+ int width, int height, int round_offset,
+ int num_pel_log2) {
+ // int16_t *dst = dst_ptr;
+ const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+ const int16_t *sum_buf = (const int16_t *)src_ptr;
+ const int16_t *end = sum_buf + height * CFL_BUF_LINE;
+ const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
+ const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+ const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+ 0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };
+
+ int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
+ int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
+ do {
+ sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
+ sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
+ if (width >= 16) {
+ sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
+ sum_32x4_1 =
+ vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
+ }
+ if (width == 32) {
+ sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
+ sum_32x4_1 =
+ vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
+ sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
+ sum_32x4_1 =
+ vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
+ }
+ } while ((sum_buf += (CFL_BUF_LINE * 2)) < end);
+ int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);
+
+ const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
+ sum_32x4 = vec_add(sum_32x4, perm_64);
+ const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
+ sum_32x4 = vec_add(sum_32x4, perm_32);
+ const int32x4_t avg = vec_sr(sum_32x4, div_shift);
+ const int16x8_t vec_avg = vec_pack(avg, avg);
+ do {
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg),
+ OFF_0 + CFL_BUF_LINE_BYTES, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg),
+ OFF_0 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg),
+ OFF_0 + CFL_LINE_3, dst);
+ if (width >= 16) {
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg),
+ OFF_1 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg),
+ OFF_1 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg),
+ OFF_1 + CFL_LINE_3, dst);
+ }
+ if (width == 32) {
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg),
+ OFF_2 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg),
+ OFF_2 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg),
+ OFF_2 + CFL_LINE_3, dst);
+
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg),
+ OFF_3 + CFL_LINE_1, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg),
+ OFF_3 + CFL_LINE_2, dst);
+ vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg),
+ OFF_3 + CFL_LINE_3, dst);
+ }
+ } while ((dst += CFL_BUF_LINE * 4) < dst_end);
+}
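+
+// Illustrative scalar sketch (not part of the upstream code): this is what
+// the VSX routine above computes. src holds width * height 16-bit samples
+// with a row stride of CFL_BUF_LINE; their (rounded) average is subtracted
+// from every dst sample, with round_offset == width * height / 2 and
+// num_pel_log2 == log2(width * height).
+static INLINE void subtract_average_scalar_sketch(const uint16_t *src_ptr,
+                                                  int16_t *dst, int width,
+                                                  int height, int round_offset,
+                                                  int num_pel_log2) {
+  const int16_t *src = (const int16_t *)src_ptr;
+  int sum = round_offset;
+  for (int j = 0; j < height; ++j)
+    for (int i = 0; i < width; ++i) sum += src[j * CFL_BUF_LINE + i];
+  const int avg = sum >> num_pel_log2;
+  for (int j = 0; j < height; ++j)
+    for (int i = 0; i < width; ++i) dst[j * CFL_BUF_LINE + i] -= avg;
+}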
+
+// Declare wrappers for VSX sizes
+CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
+CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
+CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
+CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
+CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
+CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
+CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
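+// The four numeric arguments above appear to be width, height, round_offset
+// and num_pel_log2 (matching the parameters of subtract_average_vsx), i.e.
+// round_offset == width * height / 2 and num_pel_log2 == log2(width * height)
+// for every wrapped block size.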
+
+// Based on observation, VSX does not outperform C for small blocks (there are
+// no 64-bit load and store intrinsics), so the C code is called for blocks of
+// width 4.
+cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) {
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+ subtract_average_4x4_c, /* 4x4 */
+ subtract_average_8x8_vsx, /* 8x8 */
+ subtract_average_16x16_vsx, /* 16x16 */
+ subtract_average_32x32_vsx, /* 32x32 */
+ cfl_subtract_average_null, /* 64x64 (invalid CFL size) */
+ subtract_average_4x8_c, /* 4x8 */
+ subtract_average_8x4_vsx, /* 8x4 */
+ subtract_average_8x16_vsx, /* 8x16 */
+ subtract_average_16x8_vsx, /* 16x8 */
+ subtract_average_16x32_vsx, /* 16x32 */
+ subtract_average_32x16_vsx, /* 32x16 */
+ cfl_subtract_average_null, /* 32x64 (invalid CFL size) */
+ cfl_subtract_average_null, /* 64x32 (invalid CFL size) */
+ subtract_average_4x16_c, /* 4x16 */
+ subtract_average_16x4_vsx, /* 16x4 */
+ subtract_average_8x32_vsx, /* 8x32 */
+ subtract_average_32x8_vsx, /* 32x8 */
+ cfl_subtract_average_null, /* 16x64 (invalid CFL size) */
+ cfl_subtract_average_null, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+ // index the function pointer array out of bounds.
+ return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
new file mode 100644
index 000000000..5952441d1
--- /dev/null
+++ b/third_party/aom/av1/common/pred_common.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+// Returns a context number for the given MB prediction signal
+static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi,
+ const MACROBLOCKD *xd, int dir,
+ MV_REFERENCE_FRAME ref_frame) {
+ (void)xd;
+
+ return ((ref_mbmi->ref_frame[0] == ref_frame ||
+ ref_mbmi->ref_frame[1] == ref_frame)
+ ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01)
+ : SWITCHABLE_FILTERS);
+}
+
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int ctx_offset =
+ (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
+ assert(dir == 0 || dir == 1);
+ const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET;
+ int left_type = SWITCHABLE_FILTERS;
+ int above_type = SWITCHABLE_FILTERS;
+
+ if (xd->left_available)
+ left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame);
+
+ if (xd->up_available)
+ above_type =
+ get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame);
+
+ if (left_type == above_type) {
+ filter_type_ctx += left_type;
+ } else if (left_type == SWITCHABLE_FILTERS) {
+ assert(above_type != SWITCHABLE_FILTERS);
+ filter_type_ctx += above_type;
+ } else if (above_type == SWITCHABLE_FILTERS) {
+ assert(left_type != SWITCHABLE_FILTERS);
+ filter_type_ctx += left_type;
+ } else {
+ filter_type_ctx += SWITCHABLE_FILTERS;
+ }
+
+ return filter_type_ctx;
+}
+
+static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) {
+ // Do not add an already existing value
+ if (*n > 0 && val == cache[*n - 1]) return;
+
+ cache[(*n)++] = val;
+}
+
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
+ uint16_t *cache) {
+ const int row = -xd->mb_to_top_edge >> 3;
+ // Do not refer to above SB row when on SB boundary.
+ const MB_MODE_INFO *const above_mi =
+ (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ int above_n = 0, left_n = 0;
+ if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0];
+ if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0];
+ if (above_n == 0 && left_n == 0) return 0;
+ int above_idx = plane * PALETTE_MAX_SIZE;
+ int left_idx = plane * PALETTE_MAX_SIZE;
+ int n = 0;
+ const uint16_t *above_colors =
+ above_mi ? above_mi->palette_mode_info.palette_colors : NULL;
+ const uint16_t *left_colors =
+ left_mi ? left_mi->palette_mode_info.palette_colors : NULL;
+ // Merge the sorted lists of base colors from above and left to get
+ // combined sorted color cache.
+ while (above_n > 0 && left_n > 0) {
+ uint16_t v_above = above_colors[above_idx];
+ uint16_t v_left = left_colors[left_idx];
+ if (v_left < v_above) {
+ palette_add_to_cache(cache, &n, v_left);
+ ++left_idx, --left_n;
+ } else {
+ palette_add_to_cache(cache, &n, v_above);
+ ++above_idx, --above_n;
+ if (v_left == v_above) ++left_idx, --left_n;
+ }
+ }
+ while (above_n-- > 0) {
+ uint16_t val = above_colors[above_idx++];
+ palette_add_to_cache(cache, &n, val);
+ }
+ while (left_n-- > 0) {
+ uint16_t val = left_colors[left_idx++];
+ palette_add_to_cache(cache, &n, val);
+ }
+ assert(n <= 2 * PALETTE_MAX_SIZE);
+ return n;
+}
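+
+// For illustration: with above colors { 10, 20, 30 } and left colors
+// { 20, 40 }, the merge above fills the cache with { 10, 20, 30, 40 } and
+// returns 4; duplicates are stored once and the result stays sorted.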
+
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+// 0 - inter/inter, inter/--, --/inter, --/--
+// 1 - intra/inter, inter/intra
+// 2 - intra/--, --/intra
+// 3 - intra/intra
+int av1_get_intra_inter_context(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ if (has_above && has_left) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+ return left_intra && above_intra ? 3 : left_intra || above_intra;
+ } else if (has_above || has_left) { // one edge available
+ return 2 * !is_inter_block(has_above ? above_mbmi : left_mbmi);
+ } else {
+ return 0;
+ }
+}
+
+#define CHECK_BACKWARD_REFS(ref_frame) \
+ (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
+#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
+
+int av1_get_reference_mode_context(const MACROBLOCKD *xd) {
+ int ctx;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ if (has_above && has_left) { // both edges available
+ if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
+ // neither edge uses comp pred (0/1)
+ ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^
+ IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]);
+ else if (!has_second_ref(above_mbmi))
+ // one of two edges uses comp pred (2/3)
+ ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ||
+ !is_inter_block(above_mbmi));
+ else if (!has_second_ref(left_mbmi))
+ // one of two edges uses comp pred (2/3)
+ ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) ||
+ !is_inter_block(left_mbmi));
+ else // both edges use comp pred (4)
+ ctx = 4;
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+ if (!has_second_ref(edge_mbmi))
+ // edge does not use comp pred (0/1)
+ ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]);
+ else
+ // edge uses comp pred (3)
+ ctx = 3;
+ } else { // no edges available (1)
+ ctx = 1;
+ }
+ assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+ return ctx;
+}
+
+int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+ if (above_in_image && left_in_image) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter
+ const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+ if (!has_second_ref(inter_mbmi)) // single pred
+ pred_context = 2;
+ else // comp pred
+ pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi);
+ } else { // inter/inter
+ const int a_sg = !has_second_ref(above_mbmi);
+ const int l_sg = !has_second_ref(left_mbmi);
+ const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0];
+
+ if (a_sg && l_sg) { // single/single
+ pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^
+ IS_BACKWARD_REF_FRAME(frfl)));
+ } else if (l_sg || a_sg) { // single/comp
+ const int uni_rfc =
+ a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi);
+
+ if (!uni_rfc) // comp bidir
+ pred_context = 1;
+ else // comp unidir
+ pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^
+ IS_BACKWARD_REF_FRAME(frfl)));
+ } else { // comp/comp
+ const int a_uni_rfc = has_uni_comp_refs(above_mbmi);
+ const int l_uni_rfc = has_uni_comp_refs(left_mbmi);
+
+ if (!a_uni_rfc && !l_uni_rfc) // bidir/bidir
+ pred_context = 0;
+ else if (!a_uni_rfc || !l_uni_rfc) // unidir/bidir
+ pred_context = 2;
+ else // unidir/unidir
+ pred_context =
+ 3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME)));
+ }
+ }
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi)) { // intra
+ pred_context = 2;
+ } else { // inter
+ if (!has_second_ref(edge_mbmi)) // single pred
+ pred_context = 2;
+ else // comp pred
+ pred_context = 4 * has_uni_comp_refs(edge_mbmi);
+ }
+ } else { // no edges available
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS);
+ return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+//
+// Signal the uni-directional compound reference frame pair as either
+// (BWDREF, ALTREF), or (LAST, LAST2) / (LAST, LAST3) / (LAST, GOLDEN),
+// conditioned on the pair being known to be uni-directional.
+//
+// 3 contexts: Voting is used to compare the count of forward references with
+// that of backward references from the spatial neighbors.
+int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Count of forward references (L, L2, L3, or G)
+ const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] +
+ ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
+ // Count of backward references (B or A)
+ const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] +
+ ref_counts[ALTREF_FRAME];
+
+ const int pred_context =
+ (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
+ return pred_context;
+}
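+
+// For illustration: if the spatial neighbors reference LAST twice and ALTREF
+// once, frf_count == 2 and brf_count == 1, so this function (and the
+// analogous voting helpers below) returns context 2; equal counts give 1 and
+// a backward-reference majority gives 0.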
+
+// Returns a context number for the given MB prediction signal
+//
+// Signal the uni-directional compound reference frame pair as
+// either (LAST, LAST2), or (LAST, LAST3) / (LAST, GOLDEN),
+// conditioned on the pair being known to be one of the above three.
+//
+// 3 contexts: Voting is used to compare the count of LAST2_FRAME with the
+// total count of LAST3/GOLDEN from the spatial neighbors.
+int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Count of LAST2
+ const int last2_count = ref_counts[LAST2_FRAME];
+ // Count of LAST3 or GOLDEN
+ const int last3_or_gld_count =
+ ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
+
+ const int pred_context = (last2_count == last3_or_gld_count)
+ ? 1
+ : ((last2_count < last3_or_gld_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
+ return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+//
+// Signal the uni-directional compound reference frame pair as
+// either (LAST, LAST3) or (LAST, GOLDEN),
+// conditioned on the pair being known to be one of the above two.
+//
+// 3 contexts: Voting is used to compare the count of LAST3_FRAME with the
+// total count of GOLDEN_FRAME from the spatial neighbors.
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Count of LAST3
+ const int last3_count = ref_counts[LAST3_FRAME];
+ // Count of GOLDEN
+ const int gld_count = ref_counts[GOLDEN_FRAME];
+
+ const int pred_context =
+ (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
+ return pred_context;
+}
+
+// == Common context functions for both comp and single ref ==
+//
+// Obtain contexts to signal a reference frame to be either LAST/LAST2 or
+// LAST3/GOLDEN.
+static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Count of LAST + LAST2
+ const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME];
+ // Count of LAST3 + GOLDEN
+ const int last3_gld_count =
+ ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
+
+ const int pred_context = (last_last2_count == last3_gld_count)
+ ? 1
+ : ((last_last2_count < last3_gld_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// Obtain contexts to signal a reference frame to be either LAST or LAST2.
+static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Count of LAST
+ const int last_count = ref_counts[LAST_FRAME];
+ // Count of LAST2
+ const int last2_count = ref_counts[LAST2_FRAME];
+
+ const int pred_context =
+ (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// Obtain contexts to signal a reference frame to be either LAST3 or GOLDEN.
+static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Count of LAST3
+ const int last3_count = ref_counts[LAST3_FRAME];
+ // Count of GOLDEN
+ const int gld_count = ref_counts[GOLDEN_FRAME];
+
+ const int pred_context =
+ (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// Obtain contexts to signal a reference frame to be either BWDREF/ALTREF2, or
+// ALTREF.
+static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A)
+ const int brfarf2_count =
+ ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME];
+ const int arf_count = ref_counts[ALTREF_FRAME];
+
+ const int pred_context =
+ (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// Obtain contexts to signal a reference frame to be either BWDREF or ALTREF2.
+static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Count of BWDREF frames (B)
+ const int brf_count = ref_counts[BWDREF_FRAME];
+ // Count of ALTREF2 frames (A2)
+ const int arf2_count = ref_counts[ALTREF2_FRAME];
+
+ const int pred_context =
+ (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// == Context functions for comp ref ==
+//
+// Returns a context number for the given MB prediction signal
+// Signal the first reference frame for a compound mode to be either
+// GOLDEN/LAST3, or LAST/LAST2.
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) {
+ return get_pred_context_ll2_or_l3gld(xd);
+}
+
+// Returns a context number for the given MB prediction signal
+// Signal the first reference frame for a compound mode to be LAST,
+// conditioned on it being known to be either LAST or LAST2.
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) {
+ return get_pred_context_last_or_last2(xd);
+}
+
+// Returns a context number for the given MB prediction signal
+// Signal the first reference frame for a compound mode to be GOLDEN,
+// conditioned on it being known to be either GOLDEN or LAST3.
+int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) {
+ return get_pred_context_last3_or_gld(xd);
+}
+
+// Signal the 2nd reference frame for a compound mode to be either
+// ALTREF, or ALTREF2/BWDREF.
+int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) {
+ return get_pred_context_brfarf2_or_arf(xd);
+}
+
+// Signal the 2nd reference frame for a compound mode to be either
+// ALTREF2 or BWDREF.
+int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) {
+ return get_pred_context_brf_or_arf2(xd);
+}
+
+// == Context functions for single ref ==
+//
+// For the bit to signal whether the single reference is a forward reference
+// frame or a backward reference frame.
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+ const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0];
+
+ // Count of forward reference frames
+ const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] +
+ ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME];
+ // Count of backward reference frames
+ const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] +
+ ref_counts[ALTREF_FRAME];
+
+ const int pred_context =
+ (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2);
+
+ assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+ return pred_context;
+}
+
+// For the bit to signal whether the single reference is ALTREF_FRAME or
+// non-ALTREF backward reference frame, knowing that it shall be either of
+// these 2 choices.
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+ return get_pred_context_brfarf2_or_arf(xd);
+}
+
+// For the bit to signal whether the single reference is LAST3/GOLDEN or
+// LAST2/LAST, knowing that it shall be either of these 2 choices.
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
+ return get_pred_context_ll2_or_l3gld(xd);
+}
+
+// For the bit to signal whether the single reference is LAST2_FRAME or
+// LAST_FRAME, knowing that it shall be either of these 2 choices.
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
+ return get_pred_context_last_or_last2(xd);
+}
+
+// For the bit to signal whether the single reference is GOLDEN_FRAME or
+// LAST3_FRAME, knowing that it shall be either of these 2 choices.
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
+ return get_pred_context_last3_or_gld(xd);
+}
+
+// For the bit to signal whether the single reference is ALTREF2_FRAME or
+// BWDREF_FRAME, knowing that it shall be either of these 2 choices.
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) {
+ return get_pred_context_brf_or_arf2(xd);
+}
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
new file mode 100644
index 000000000..6dba2322d
--- /dev/null
+++ b/third_party/aom/av1/common/pred_common.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/onyxc_int.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE int get_segment_id(const AV1_COMMON *const cm,
+ const uint8_t *segment_ids, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
+ int x, y, segment_id = MAX_SEGMENTS;
+
+ for (y = 0; y < ymis; ++y)
+ for (x = 0; x < xmis; ++x)
+ segment_id =
+ AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+ return segment_id;
+}
+
+static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd,
+ int mi_row, int mi_col,
+ int *cdf_index) {
+ int prev_ul = -1; // top left segment_id
+ int prev_l = -1; // left segment_id
+ int prev_u = -1; // top segment_id
+ if ((xd->up_available) && (xd->left_available)) {
+ prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+ mi_row - 1, mi_col - 1);
+ }
+ if (xd->up_available) {
+ prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+ mi_row - 1, mi_col - 0);
+ }
+ if (xd->left_available) {
+ prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+ mi_row - 0, mi_col - 1);
+ }
+
+ // Pick CDF index based on number of matching/out-of-bounds segment IDs.
+ if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */
+ *cdf_index = 0;
+ else if ((prev_ul == prev_u) && (prev_ul == prev_l))
+ *cdf_index = 2;
+ else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l))
+ *cdf_index = 1;
+ else
+ *cdf_index = 0;
+
+  // If two or more are identical, return that value as the predictor;
+  // otherwise return prev_l.
+ if (prev_u == -1) // edge case
+ return prev_l == -1 ? 0 : prev_l;
+ if (prev_l == -1) // edge case
+ return prev_u;
+ return (prev_ul == prev_u) ? prev_u : prev_l;
+}
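+
+// For illustration: with prev_ul == 3, prev_u == 3 and prev_l == 5, exactly
+// one pair matches, so *cdf_index is set to 1 and the predictor returned is
+// prev_u == 3; when a neighbor is unavailable, *cdf_index falls back to 0.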
+
+static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0;
+ const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0;
+
+ return above_sip + left_sip;
+}
+
+static INLINE int get_comp_index_context(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx;
+ int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx;
+ int bck_frame_index = 0, fwd_frame_index = 0;
+ int cur_frame_index = cm->cur_frame->cur_frame_offset;
+
+ if (bck_idx >= 0)
+ bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset;
+
+ if (fwd_idx >= 0)
+ fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset;
+ int fwd = abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index));
+ int bck = abs(get_relative_dist(cm, cur_frame_index, bck_frame_index));
+
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+
+ int above_ctx = 0, left_ctx = 0;
+ const int offset = (fwd == bck);
+
+ if (above_mi) {
+ if (has_second_ref(above_mi))
+ above_ctx = above_mi->compound_idx;
+ else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+ above_ctx = 1;
+ }
+
+ if (left_mi) {
+ if (has_second_ref(left_mi))
+ left_ctx = left_mi->compound_idx;
+ else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+ left_ctx = 1;
+ }
+
+ return above_ctx + left_ctx + 3 * offset;
+}
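+
+// For illustration: when the forward and backward reference distances are
+// equal (offset == 1) and both neighbors are compound blocks with
+// compound_idx == 1, the function returns 1 + 1 + 3 == 5, its largest value.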
+
+static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ int above_ctx = 0, left_ctx = 0;
+
+ if (above_mi) {
+ if (has_second_ref(above_mi))
+ above_ctx = above_mi->comp_group_idx;
+ else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+ above_ctx = 3;
+ }
+ if (left_mi) {
+ if (has_second_ref(left_mi))
+ left_ctx = left_mi->comp_group_idx;
+ else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+ left_ctx = 3;
+ }
+
+ return AOMMIN(5, above_ctx + left_ctx);
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id(
+ struct segmentation_probs *segp, const MACROBLOCKD *xd) {
+ return segp->pred_cdf[av1_get_pred_context_seg_id(xd)];
+}
+
+static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ const int above_skip_mode = above_mi ? above_mi->skip_mode : 0;
+ const int left_skip_mode = left_mi ? left_mi->skip_mode : 0;
+ return above_skip_mode + left_skip_mode;
+}
+
+static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ const int above_skip = above_mi ? above_mi->skip : 0;
+ const int left_skip = left_mi ? left_mi->skip : 0;
+ return above_skip + left_skip;
+}
+
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
+
+// Get a list of palette base colors that are used in the above and left blocks,
+// referred to as "color cache". The return value is the number of colors in the
+// cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored in "cache"
+// in ascending order.
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
+ uint16_t *cache);
+
+static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) {
+ return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8];
+}
+
+static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ int ctx = 0;
+ if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0);
+ if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0);
+ return ctx;
+}
+
+int av1_get_intra_inter_context(const MACROBLOCKD *xd);
+
+int av1_get_reference_mode_context(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) {
+ return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)];
+}
+
+int av1_get_comp_reference_type_context(const MACROBLOCKD *xd);
+
+// == Uni-directional contexts ==
+
+int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_comp_reference_type_context(xd);
+ return xd->tile_ctx->comp_ref_type_cdf[pred_context];
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd);
+ return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0];
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd);
+ return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1];
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd);
+ return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2];
+}
+
+// == Bi-directional contexts ==
+
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p(xd);
+ return xd->tile_ctx->comp_ref_cdf[pred_context][0];
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p1(xd);
+ return xd->tile_ctx->comp_ref_cdf[pred_context][1];
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p2(xd);
+ return xd->tile_ctx->comp_ref_cdf[pred_context][2];
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_bwdref_p(xd);
+ return xd->tile_ctx->comp_bwdref_cdf[pred_context][0];
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1(
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_bwdref_p1(xd);
+ return xd->tile_ctx->comp_bwdref_cdf[pred_context][1];
+}
+
+// == Single contexts ==
+
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6(
+ const MACROBLOCKD *xd) {
+ return xd->tile_ctx
+ ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5];
+}
+
+// Returns a context number for the given MB prediction signal
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real blocks.
+// The prediction flags in these dummy entries are initialized to 0.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->sb_type];
+ const int max_tx_wide = tx_size_wide[max_tx_size];
+ const int max_tx_high = tx_size_high[max_tx_size];
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
+
+ int above = xd->above_txfm_context[0] >= max_tx_wide;
+ int left = xd->left_txfm_context[0] >= max_tx_high;
+
+ if (has_above)
+ if (is_inter_block(above_mbmi))
+ above = block_size_wide[above_mbmi->sb_type] >= max_tx_wide;
+
+ if (has_left)
+ if (is_inter_block(left_mbmi))
+ left = block_size_high[left_mbmi->sb_type] >= max_tx_high;
+
+ if (has_above && has_left)
+ return (above + left);
+ else if (has_above)
+ return above;
+ else if (has_left)
+ return left;
+ else
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c
new file mode 100644
index 000000000..0e14da7a3
--- /dev/null
+++ b/third_party/aom/av1/common/quant_common.c
@@ -0,0 +1,13676 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/entropy.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/blockd.h"
+
+static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = {
+ 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18,
+ 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30,
+ 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42,
+ 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53,
+ 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65,
+ 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76,
+ 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88,
+ 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110,
+ 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164,
+ 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202,
+ 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300,
+ 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364,
+ 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441,
+ 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549,
+ 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736,
+ 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+ 1184, 1232, 1282, 1336,
+};
+
+static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = {
+ 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37,
+ 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82,
+ 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132,
+ 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182,
+ 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230,
+ 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276,
+ 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321,
+ 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387,
+ 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466,
+ 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567,
+ 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687,
+ 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831,
+ 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001,
+ 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202,
+ 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436,
+ 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717,
+ 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088,
+ 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675,
+ 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823,
+ 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
+};
+
+static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = {
+ 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91,
+ 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237,
+ 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405,
+ 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580,
+ 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752,
+ 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919,
+ 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080,
+ 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234,
+ 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419,
+ 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692,
+ 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957,
+ 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334,
+ 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746,
+ 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226,
+ 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788,
+ 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420,
+ 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153,
+ 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984,
+ 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966,
+ 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214,
+ 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031,
+ 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118,
+ 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949,
+ 19718, 20521, 21387,
+};
+
+static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = {
+ 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
+ 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144,
+ 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179,
+ 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223,
+ 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280,
+ 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353,
+ 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448,
+ 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571,
+ 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729,
+ 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933,
+ 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196,
+ 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537,
+ 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
+static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = {
+ 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40,
+ 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92,
+ 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149,
+ 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208,
+ 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267,
+ 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324,
+ 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379,
+ 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466,
+ 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571,
+ 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713,
+ 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889,
+ 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+ 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411,
+ 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791,
+ 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283,
+ 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915,
+ 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731,
+ 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784,
+ 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148,
+ 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
+};
+
+static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = {
+ 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99,
+ 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263,
+ 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456,
+ 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660,
+ 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865,
+ 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067,
+ 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264,
+ 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
+ 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693,
+ 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052,
+ 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411,
+ 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943,
+ 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555,
+ 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310,
+ 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256,
+ 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
+ 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867,
+ 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660,
+ 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885,
+ 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637,
+ 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062,
+ 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334,
+ 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599,
+ 28143, 28687, 29247,
+};
+
+// Coefficient scaling and quantization with AV1 TX are tailored to
+// the AV1 TX transforms. Regardless of the bit-depth of the input,
+// the transform stages scale the coefficient values up by a factor of
+// 8 (3 bits) over the scale of the pixel values. Thus, for 8-bit
+// input, the coefficients have effectively 11 bits of scale depth
+// (8+3), 10-bit input pixels result in 13-bit coefficient depth
+// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth.
+// All quantizers are built using this invariant of x8, 3-bit scaling,
+// thus the Q3 suffix.
+
+// A partial exception to this rule is large transforms; to avoid
+// overflow, TX blocks with > 256 pels (>16x16) are scaled only
+// 4-times unity (2 bits) over the pixel depth, and TX blocks with
+// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit).
+// This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16
+// and 32x32 transforms actually return Q2 coefficients, and 32x64,
+// 64x32 and 64x64 transforms return Q1 coefficients. However, the
+// quantizers are de-scaled down on-the-fly by the same amount
+// (av1_tx_get_scale()) during quantization, and as such the
+// dequantized/decoded coefficients, even for large TX blocks, are always
+// effectively Q3. Meanwhile, quantized/coded coefficients are Q0
+// because Qn quantizers are applied to Qn tx coefficients.
+
+// Note that encoder decision making (which uses the quantizer to
+// generate several bespoke lambdas for RDO and other heuristics)
+// expects quantizers to be larger for higher-bitdepth input. In
+// addition, the minimum allowable quantizer is 4; smaller values will
+// underflow to 0 in the actual quantization routines.
+
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
+ switch (bit_depth) {
+ case AOM_BITS_8: return dc_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_10: return dc_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_12: return dc_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)];
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+}
+
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
+ switch (bit_depth) {
+ case AOM_BITS_8: return ac_qlookup_Q3[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_10: return ac_qlookup_10_Q3[clamp(qindex + delta, 0, MAXQ)];
+ case AOM_BITS_12: return ac_qlookup_12_Q3[clamp(qindex + delta, 0, MAXQ)];
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+}
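+
+// Illustrative usage (editorial sketch, not part of the library itself):
+// for an 8-bit stream with base qindex 100 and no delta, the nominal step
+// sizes in the Q3 coefficient domain would be obtained as
+//
+//   int16_t dc_step = av1_dc_quant_Q3(100, 0, AOM_BITS_8);
+//   int16_t ac_step = av1_ac_quant_Q3(100, 0, AOM_BITS_8);
+//
+// Since coefficients carry 3 extra bits of scale (Q3), dividing a step size
+// by 8 gives its approximate magnitude in pixel units.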
+
+// In AV1 TX, the coefficients are always scaled up by a factor of 8 (3
+// bits), so QTX == Q3.
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+ return av1_dc_quant_Q3(qindex, delta, bit_depth);
+}
+
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+ return av1_ac_quant_Q3(qindex, delta, bit_depth);
+}
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+ int base_qindex) {
+ if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+ const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+ const int seg_qindex = base_qindex + data;
+ return clamp(seg_qindex, 0, MAXQ);
+ } else {
+ return base_qindex;
+ }
+}
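+
+// Worked example (editorial illustration): with base_qindex 60 and a
+// segment whose SEG_LVL_ALT_Q delta is +20, the effective qindex becomes
+// clamp(60 + 20, 0, MAXQ) == 80; a delta of -80 would clamp to 0 rather
+// than going negative.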
+
+const qm_val_t *av1_iqmatrix(AV1_COMMON *cm, int qmlevel, int plane,
+ TX_SIZE tx_size) {
+ return &cm->giqmatrix[qmlevel][plane][tx_size][0];
+}
+const qm_val_t *av1_qmatrix(AV1_COMMON *cm, int qmlevel, int plane,
+ TX_SIZE tx_size) {
+ return &cm->gqmatrix[qmlevel][plane][tx_size][0];
+}
+
+#define QM_TOTAL_SIZE 3344
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];
+
+void av1_qm_init(AV1_COMMON *cm) {
+ const int num_planes = av1_num_planes(cm);
+ int q, c, t;
+ int current;
+ for (q = 0; q < NUM_QM_LEVELS; ++q) {
+ for (c = 0; c < num_planes; ++c) {
+ current = 0;
+ for (t = 0; t < TX_SIZES_ALL; ++t) {
+ const int size = tx_size_2d[t];
+ const int qm_tx_size = av1_get_adjusted_tx_size(t);
+ if (q == NUM_QM_LEVELS - 1) {
+ cm->gqmatrix[q][c][t] = NULL;
+ cm->giqmatrix[q][c][t] = NULL;
+ } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size'
+ cm->gqmatrix[q][c][t] = cm->gqmatrix[q][c][qm_tx_size];
+ cm->giqmatrix[q][c][t] = cm->giqmatrix[q][c][qm_tx_size];
+ } else {
+ assert(current + size <= QM_TOTAL_SIZE);
+ cm->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current];
+ cm->giqmatrix[q][c][t] = &iwt_matrix_ref[q][c >= 1][current];
+ current += size;
+ }
+ }
+ }
+ }
+}
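+
+// Note on av1_qm_init() above (editorial): TX sizes that map to the same
+// adjusted size via av1_get_adjusted_tx_size() share one stored matrix, so
+// fresh storage in wt_matrix_ref/iwt_matrix_ref is consumed only for the
+// adjusted sizes. The highest QM level keeps NULL pointers, which callers
+// presumably treat as "no quantization matrix" (flat weighting).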
+
+/* Provide 16 sets of quantization matrices for chroma and luma
+ and each TX size. Matrices for different TX sizes are in fact
+ sub-sampled from the 32x32 and 16x16 sizes, but explicitly
+ defined here for convenience. Intra and inter matrix sets are the
+ same but changing DEFAULT_QM_INTER_OFFSET from zero allows
+ for different matrices for inter and intra blocks in the same
+ frame.
+ Matrices for different QM levels have been rescaled in the
+ frequency domain according to different nominal viewing
+ distances.
+ */
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200,
+ /* Size 8x8 */
+ 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102, 38,
+ 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121, 68, 63,
+ 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159, 168, 95, 89,
+ 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142, 168, 199, 220,
+ /* Size 16x16 */
+ 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119, 31,
+ 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112, 31, 32,
+ 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107, 34, 33, 35,
+ 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105, 36, 34, 36, 42, 48,
+ 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105, 44, 41, 42, 47, 54, 63,
+ 67, 75, 79, 90, 92, 95, 100, 102, 109, 112, 48, 44, 45, 51, 57, 67, 71,
+ 80, 85, 96, 99, 107, 108, 111, 117, 120, 59, 54, 54, 58, 64, 75, 80, 92,
+ 98, 110, 113, 115, 116, 122, 125, 130, 65, 59, 59, 63, 68, 79, 85, 98,
+ 105, 118, 121, 127, 130, 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110,
+ 118, 134, 137, 140, 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113,
+ 121, 137, 140, 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107,
+ 115, 127, 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100,
+ 108, 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96,
+ 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104, 100,
+ 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210, 220, 119,
+ 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179, 193, 210, 220,
+ 231,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65, 71,
+ 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123, 31, 32,
+ 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62, 68, 76, 77,
+ 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116, 31, 32, 32, 32, 32,
+ 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59, 65, 72, 73, 75, 80, 83,
+ 86, 90, 93, 97, 101, 104, 108, 112, 116, 31, 32, 32, 32, 33, 33, 34, 35,
+ 35, 38, 41, 43, 45, 49, 54, 56, 59, 64, 72, 73, 74, 79, 82, 85, 88, 91,
+ 94, 97, 101, 104, 107, 111, 31, 32, 32, 33, 33, 34, 35, 36, 36, 39, 42,
+ 44, 45, 50, 54, 56, 59, 64, 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100,
+ 104, 107, 111, 32, 32, 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49,
+ 53, 55, 58, 63, 69, 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106,
+ 109, 34, 34, 33, 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63,
+ 68, 74, 75, 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34,
+ 34, 35, 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77,
+ 79, 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36,
+ 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85, 88,
+ 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40, 45, 47,
+ 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92, 92, 95, 98,
+ 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58,
+ 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95, 97, 100, 101, 102,
+ 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67,
+ 69, 74, 78, 80, 82, 87, 93, 94, 95, 98, 100, 103, 102, 105, 108, 110,
+ 111, 113, 117, 121, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71,
+ 76, 80, 83, 85, 90, 96, 97, 99, 103, 107, 105, 108, 111, 111, 113, 117,
+ 119, 120, 122, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82,
+ 87, 89, 92, 97, 104, 105, 106, 111, 110, 111, 114, 113, 116, 120, 120,
+ 121, 125, 130, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87,
+ 92, 95, 98, 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125,
+ 129, 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89,
+ 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129, 131,
+ 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92,
+ 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130, 134, 133,
+ 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90,
+ 97, 103, 107, 111, 117, 125, 126, 128, 134, 132, 136, 133, 138, 137,
+ 140, 143, 142, 145, 150, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93,
+ 96, 104, 110, 114, 118, 125, 134, 135, 137, 139, 140, 139, 143, 142,
+ 144, 146, 146, 151, 152, 151, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85,
+ 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 138, 144, 147, 146, 148,
+ 149, 151, 150, 156, 155, 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81,
+ 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 147, 151, 156,
+ 152, 157, 155, 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82,
+ 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154,
+ 158, 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81,
+ 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154,
+ 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86, 85, 84,
+ 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146,
+ 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183, 190, 97, 92, 90,
+ 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143,
+ 148, 152, 163, 166, 168, 174, 176, 182, 187, 189, 188, 193, 191, 101,
+ 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130,
+ 138, 142, 149, 157, 159, 171, 174, 176, 183, 184, 191, 195, 199, 197,
+ 204, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122,
+ 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 193, 200,
+ 204, 210, 206, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110,
+ 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191,
+ 193, 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102,
+ 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173,
+ 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108, 104, 104,
+ 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151,
+ 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 222, 230, 119,
+ 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130,
+ 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220,
+ 222, 231, 232, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114,
+ 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191,
+ 204, 206, 222, 224, 230, 232, 242,
+ /* Size 4x8 */
+ 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110, 65,
+ 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112, 146, 190,
+ /* Size 8x4 */
+ 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112, 75,
+ 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152, 178, 190,
+ /* Size 8x16 */
+ 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32, 34,
+ 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38, 48, 60,
+ 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, 85, 102,
+ 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58, 68, 92, 105, 124,
+ 122, 124, 79, 70, 79, 104, 118, 141, 135, 135, 82, 72, 81, 106, 121,
+ 144, 149, 146, 91, 80, 88, 106, 130, 148, 162, 159, 97, 86, 94, 107,
+ 128, 157, 167, 171, 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101,
+ 117, 138, 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203,
+ /* Size 16x8 */
+ 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, 32,
+ 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, 36, 34,
+ 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, 53, 49, 50,
+ 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, 65, 59, 59,
+ 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, 87, 78, 77,
+ 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, 93, 86,
+ 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, 99,
+ 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203,
+ /* Size 16x32 */
+ 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102, 31, 32,
+ 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31, 32, 33, 33,
+ 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41,
+ 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, 50, 54,
+ 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, 38, 42, 49, 53, 58, 69,
+ 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42, 48, 54, 58, 63, 73, 79, 78,
+ 80, 83, 88, 92, 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84,
+ 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, 86, 89, 90, 90,
+ 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, 89, 89, 91, 91, 93, 96, 44, 41,
+ 43, 48, 53, 63, 71, 75, 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49,
+ 55, 65, 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56,
+ 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60,
+ 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63,
+ 75, 87, 92, 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66,
+ 77, 89, 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68,
+ 79, 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73,
+ 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79,
+ 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75,
+ 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, 72,
+ 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, 88, 80,
+ 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, 152, 154, 91,
+ 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, 162, 160, 159, 155,
+ 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, 153, 160, 165, 167, 166,
+ 168, 97, 88, 86, 85, 94, 100, 107, 123, 128, 140, 157, 161, 167, 173,
+ 171, 169, 100, 91, 89, 87, 97, 100, 111, 121, 127, 145, 152, 164, 173,
+ 178, 182, 181, 103, 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170,
+ 174, 180, 186, 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155,
+ 168, 177, 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138,
+ 141, 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117,
+ 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103, 105,
+ 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122, 111, 111,
+ 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203, 204, 217,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71,
+ 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32,
+ 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74,
+ 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34,
+ 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80,
+ 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41,
+ 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87,
+ 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53,
+ 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100,
+ 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67,
+ 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108,
+ 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82,
+ 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117,
+ 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87,
+ 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123,
+ 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92,
+ 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136,
+ 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96,
+ 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144,
+ 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98,
+ 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152,
+ 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89,
+ 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160,
+ 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84,
+ 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162,
+ 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83,
+ 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152,
+ 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89,
+ 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
+ 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93,
+ 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142,
+ 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217,
+ /* Size 4x16 */
+ 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34, 54,
+ 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60, 79, 118,
+ 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100,
+ 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197,
+ /* Size 16x4 */
+ 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, 44,
+ 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, 79, 72,
+ 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, 96, 90,
+ 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197,
+ /* Size 8x32 */
+ 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31, 33,
+ 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34, 36, 50,
+ 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79,
+ 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, 86, 90,
+ 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79, 95, 94, 97, 46, 44,
+ 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85, 102, 105, 105, 53, 50, 60,
+ 82, 92, 109, 107, 107, 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66,
+ 89, 101, 120, 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73,
+ 97, 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, 80,
+ 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, 88, 77,
+ 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94,
+ 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171,
+ 100, 89, 97, 111, 127, 152, 173, 182, 103, 93, 98, 114, 131, 150, 174,
+ 186, 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138,
+ 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107, 105,
+ 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179, 204,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71,
+ 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 32, 32,
+ 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71,
+ 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36, 35, 34, 35, 36,
+ 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88,
+ 91, 94, 97, 98, 100, 101, 103, 105, 107, 53, 51, 49, 49, 50, 49, 54, 57,
+ 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105,
+ 107, 111, 114, 117, 117, 117, 118, 119, 65, 62, 59, 59, 59, 58, 63, 65,
+ 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131,
+ 128, 127, 131, 136, 138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82,
+ 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148,
+ 153, 157, 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80,
+ 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153,
+ 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90, 89,
+ 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
+ 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109,
+ /* Size 8x8 */
+ 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47, 46,
+ 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52, 61, 72,
+ 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67, 75, 86, 95,
+ 104, 107, 71, 67, 68, 75, 84, 95, 107, 113,
+ /* Size 16x16 */
+ 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30, 32,
+ 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35, 39, 45,
+ 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45, 48, 50, 49,
+ 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50, 53, 53, 54, 55,
+ 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53, 58, 60, 62, 63, 67,
+ 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60, 61, 65, 67, 71, 71, 74,
+ 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65, 71, 73, 78, 79, 78, 77, 78,
+ 78, 78, 57, 52, 51, 53, 56, 63, 67, 73, 76, 82, 83, 84, 84, 84, 82, 83,
+ 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 90, 89, 88, 87, 88, 65, 58,
+ 57, 58, 61, 68, 71, 79, 83, 90, 91, 94, 93, 93, 92, 93, 68, 62, 60, 59,
+ 64, 67, 74, 78, 84, 90, 94, 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69,
+ 73, 77, 84, 89, 93, 99, 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73,
+ 78, 84, 88, 93, 98, 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78,
+ 82, 87, 92, 98, 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83,
+ 88, 93, 98, 104, 109, 112, 116,
+ /* Size 32x32 */
+ 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57, 60,
+ 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31, 31, 33,
+ 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60, 61, 61, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33, 35, 40, 42, 44,
+ 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58, 61, 62, 63, 64, 66,
+ 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41, 43, 45, 47, 46, 45, 46,
+ 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 33, 34, 35, 37, 39, 43, 45, 46, 47, 46, 45, 46, 46, 47, 49, 50,
+ 51, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 36, 38,
+ 40, 41, 43, 47, 47, 47, 48, 46, 45, 46, 46, 47, 48, 49, 50, 52, 54, 55,
+ 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47,
+ 48, 49, 50, 49, 49, 49, 50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59,
+ 60, 61, 61, 63, 64, 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51,
+ 51, 51, 52, 52, 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63,
+ 63, 64, 65, 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54,
+ 55, 56, 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66,
+ 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61,
+ 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45, 45,
+ 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67, 68, 69,
+ 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46, 46, 49, 51,
+ 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70, 71, 71, 70, 70,
+ 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60,
+ 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72, 73, 74, 73, 73, 74, 74,
+ 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 68, 69,
+ 70, 72, 75, 75, 75, 77, 75, 75, 76, 75, 75, 76, 75, 75, 76, 77, 54, 52,
+ 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78,
+ 79, 79, 78, 79, 77, 78, 78, 77, 78, 79, 78, 78, 56, 53, 51, 50, 50, 49,
+ 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80,
+ 81, 81, 79, 81, 80, 79, 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60,
+ 63, 65, 67, 70, 73, 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82,
+ 82, 84, 83, 82, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72,
+ 75, 77, 79, 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85,
+ 89, 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57,
+ 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91, 92,
+ 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55, 58, 60,
+ 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94, 96, 93, 94,
+ 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70,
+ 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98, 95, 97, 95, 96, 95,
+ 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82,
+ 84, 86, 90, 93, 94, 96, 98, 98, 99, 100, 98, 99, 98, 98, 98, 97, 69, 65,
+ 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92,
+ 96, 97, 98, 100, 100, 101, 102, 101, 101, 101, 100, 102, 70, 66, 64, 63,
+ 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98,
+ 99, 100, 102, 102, 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62,
+ 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100,
+ 101, 102, 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61,
+ 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102,
+ 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63, 63,
+ 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104,
+ 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66, 66, 64, 63, 66,
+ 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106,
+ 108, 108, 111, 111, 112, 113, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68,
+ 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108,
+ 110, 111, 113, 113, 115, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72,
+ 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111,
+ 112, 113, 116, 115, 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74,
+ 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113,
+ 115, 115, 118,
+ /* Size 4x8 */
+ 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54, 65,
+ 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105,
+ /* Size 8x4 */
+ 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60, 54,
+ 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105,
+ /* Size 8x16 */
+ 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33, 43,
+ 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48, 53, 54,
+ 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54, 64, 67, 73,
+ 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70, 76, 84, 80, 79,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83, 93, 93, 89, 68, 59,
+ 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75,
+ 83, 92, 101, 104, 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82,
+ 91, 101, 109,
+ /* Size 16x8 */
+ 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37, 40,
+ 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46, 47, 50,
+ 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47, 50, 54, 61,
+ 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53, 57, 64, 67, 73,
+ 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62, 69, 73, 80, 84, 92,
+ 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98,
+ 99, 101, 103, 101, 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98,
+ 104, 106, 109,
+ /* Size 16x32 */
+ 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31, 31,
+ 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32, 40, 42,
+ 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41, 44, 46, 45,
+ 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45, 47, 46, 47, 49,
+ 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47, 45, 47, 48, 50, 54,
+ 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58,
+ 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60,
+ 61, 61, 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62,
+ 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, 49, 45,
+ 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, 49, 46, 46, 49,
+ 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59,
+ 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68,
+ 71, 75, 77, 74, 73, 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78,
+ 80, 78, 76, 74, 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80,
+ 79, 78, 76, 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80,
+ 79, 77, 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81,
+ 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58,
+ 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56, 58,
+ 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60, 63, 69,
+ 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64, 71, 74, 78,
+ 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70, 72, 79, 85, 88,
+ 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97,
+ 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102,
+ 102, 102, 101, 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101,
+ 103, 104, 102, 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105,
+ 106, 107, 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106,
+ 107, 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109,
+ 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77,
+ 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113,
+ /* Size 32x16 */
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60,
+ 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34,
+ 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61,
+ 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47,
+ 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63,
+ 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49,
+ 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65,
+ 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56,
+ 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47,
+ 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68,
+ 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47,
+ 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72,
+ 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59,
+ 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76,
+ 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71,
+ 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85,
+ 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59,
+ 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95,
+ 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61,
+ 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98,
+ 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68,
+ 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103,
+ 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73,
+ 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105,
+ 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75,
+ 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109,
+ 108, 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77,
+ 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113,
+ /* Size 4x16 */
+ 31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46, 53,
+ 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64, 82, 80,
+ 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89, 101, 65, 68,
+ 89, 103, 67, 70, 86, 105, 69, 72, 88, 107,
+ /* Size 16x4 */
+ 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49, 45,
+ 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57, 56, 57,
+ 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62, 60, 63, 66,
+ 70, 74, 80, 85, 91, 96, 101, 103, 105, 107,
+ /* Size 8x32 */
+ 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30, 40,
+ 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43, 47, 47,
+ 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50, 50, 53, 60,
+ 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54, 57, 62, 62, 62,
+ 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64, 69, 66, 66, 49, 46,
+ 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66,
+ 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82,
+ 79, 76, 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, 64, 56,
+ 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91, 68, 59, 64, 74,
+ 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96,
+ 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101,
+ 104, 73, 65, 66, 75, 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106,
+ 75, 68, 66, 74, 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77,
+ 70, 67, 73, 81, 90, 99, 108,
+ /* Size 32x8 */
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60,
+ 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38, 40, 41,
+ 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58,
+ 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46, 47, 47, 50, 51,
+ 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67,
+ 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62,
+ 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74,
+ 73, 73, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75,
+ 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 66, 63,
+ 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92,
+ 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60,
+ 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99,
+ 99, 102, 101, 102, 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61,
+ 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102,
+ 104, 106, 106, 109, 109, 108 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184,
+ /* Size 8x8 */
+ 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96, 37, 39,
+ 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113, 62, 58, 71,
+ 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148, 155, 90, 84, 93,
+ 106, 129, 148, 169, 183, 102, 96, 100, 113, 132, 155, 183, 201,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111, 31,
+ 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105, 31, 32, 33,
+ 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100, 32, 33, 34, 36, 40,
+ 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36, 34, 36, 40, 48, 50, 56,
+ 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37, 39, 42, 50, 54, 60, 65, 70,
+ 78, 84, 89, 95, 96, 102, 105, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89,
+ 95, 100, 102, 104, 109, 112, 54, 50, 50, 51, 60, 65, 75, 82, 89, 97,
+ 104, 109, 110, 114, 117, 121, 61, 56, 56, 57, 65, 70, 81, 89, 97, 106,
+ 113, 119, 122, 126, 125, 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117,
+ 125, 131, 134, 134, 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125,
+ 134, 140, 142, 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119,
+ 131, 140, 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110,
+ 122, 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104,
+ 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95, 102,
+ 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105, 100, 98, 98,
+ 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61, 65,
+ 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114, 31, 32, 32,
+ 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58, 62, 68, 72, 76,
+ 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31, 32, 32, 32, 32, 32,
+ 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60, 65, 70, 73, 76, 79, 82,
+ 85, 88, 91, 95, 98, 101, 105, 109, 31, 32, 32, 32, 32, 33, 33, 34, 35,
+ 36, 38, 41, 44, 45, 49, 54, 56, 59, 65, 69, 72, 75, 78, 81, 84, 86, 89,
+ 92, 95, 98, 101, 104, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42,
+ 45, 46, 50, 54, 56, 59, 64, 68, 71, 74, 77, 79, 82, 85, 88, 91, 94, 97,
+ 100, 104, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49,
+ 53, 55, 58, 63, 66, 69, 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32,
+ 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65,
+ 68, 71, 73, 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35,
+ 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79,
+ 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42, 48,
+ 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86, 88, 90,
+ 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58,
+ 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93, 94, 95, 96, 100,
+ 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65,
+ 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96, 99, 102, 104, 105, 106,
+ 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 77, 79,
+ 84, 88, 90, 92, 95, 97, 97, 99, 102, 103, 103, 106, 109, 113, 47, 45,
+ 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92,
+ 95, 97, 100, 100, 102, 105, 104, 106, 109, 111, 112, 113, 49, 47, 46,
+ 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98,
+ 100, 103, 105, 107, 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49,
+ 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104,
+ 106, 109, 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54,
+ 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110,
+ 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56, 56,
+ 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106, 110, 113,
+ 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130, 65, 62, 60, 59,
+ 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 111, 115, 118,
+ 121, 124, 128, 125, 129, 128, 131, 133, 132, 135, 139, 71, 68, 65, 65,
+ 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 122,
+ 125, 128, 131, 131, 134, 132, 134, 136, 136, 140, 141, 140, 76, 72, 70,
+ 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122,
+ 127, 130, 133, 136, 136, 138, 139, 141, 140, 145, 143, 146, 151, 80, 76,
+ 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125,
+ 130, 134, 137, 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78,
+ 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121,
+ 128, 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162,
+ 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119,
+ 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160, 162, 165,
+ 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114,
+ 120, 128, 131, 136, 146, 147, 150, 155, 156, 161, 166, 165, 167, 169,
+ 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110,
+ 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 163, 168, 173, 174,
+ 174, 178, 176, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106,
+ 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 170, 176,
+ 180, 183, 181, 187, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102,
+ 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170,
+ 176, 178, 184, 188, 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95,
+ 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165,
+ 173, 176, 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90,
+ 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160,
+ 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101, 98, 97, 96,
+ 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150,
+ 155, 162, 169, 174, 183, 188, 192, 194, 201, 202, 210, 111, 105, 105,
+ 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135,
+ 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 211,
+ 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120,
+ 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203,
+ 204, 210, 211, 219,
+ /* Size 4x8 */
+ 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105, 60, 79,
+ 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136, 177,
+ /* Size 8x4 */
+ 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105, 69, 64,
+ 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167, 177,
+ /* Size 8x16 */
+ 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32, 34,
+ 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37, 48, 56,
+ 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, 84, 95,
+ 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56, 65, 81, 100, 113,
+ 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79, 70, 79, 95, 118, 133,
+ 142, 138, 86, 76, 84, 100, 124, 140, 153, 150, 92, 82, 89, 101, 121,
+ 148, 157, 161, 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110,
+ 129, 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188,
+ /* Size 16x8 */
+ 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, 32,
+ 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, 36, 35, 36,
+ 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47, 44, 45, 47, 56,
+ 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, 65, 60, 59, 60, 68, 73,
+ 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, 79, 72, 71, 71, 78, 84,
+ 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, 90, 84, 80, 78, 83, 91,
+ 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, 96, 90, 87, 85, 87,
+ 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188,
+ /* Size 16x32 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31, 32,
+ 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32, 32, 33,
+ 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41,
+ 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, 45, 50,
+ 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38, 42, 45, 49, 58, 64,
+ 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44, 47, 51, 60, 66, 71, 76,
+ 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81,
+ 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, 83, 86, 87, 87,
+ 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, 81, 86, 88, 88, 90, 93, 39, 37,
+ 40, 42, 50, 58, 60, 65, 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45,
+ 53, 63, 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66,
+ 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71,
+ 77, 86, 93, 97, 103, 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82,
+ 92, 99, 103, 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87,
+ 98, 105, 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89,
+ 100, 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
+ 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89, 97,
+ 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88, 92, 101,
+ 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71, 79, 90, 95, 104,
+ 118, 127, 133, 143, 142, 141, 138, 136, 82, 75, 73, 74, 81, 92, 97, 106,
+ 121, 130, 136, 146, 145, 144, 144, 145, 86, 78, 76, 77, 84, 95, 100,
+ 109, 124, 133, 140, 147, 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99,
+ 112, 124, 130, 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95,
+ 101, 116, 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92,
+ 95, 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85,
+ 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91,
+ 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95,
+ 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107,
+ 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188, 192,
+ 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169, 183, 188,
+ 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145, 145, 166, 166,
+ 189, 190, 201,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65,
+ 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32,
+ 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72,
+ 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35,
+ 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40,
+ 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85,
+ 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56,
+ 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98,
+ 100, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77,
+ 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45,
+ 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92,
+ 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49,
+ 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104,
+ 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59,
+ 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118,
+ 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66,
+ 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123,
+ 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72,
+ 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125,
+ 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83,
+ 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126,
+ 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166,
+ 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116,
+ 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169,
+ 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113,
+ 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177,
+ 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110,
+ 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181,
+ 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106,
+ 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185,
+ 186, 192, 193, 201,
+ /* Size 4x16 */
+ 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34, 54,
+ 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56, 77, 107,
+ 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132,
+ 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183,
+ /* Size 16x4 */
+ 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101, 44, 41,
+ 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, 73, 67, 65,
+ 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, 93, 87, 83,
+ 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183,
+ /* Size 8x32 */
+ 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31, 32,
+ 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34, 36, 45,
+ 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71,
+ 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, 83, 87,
+ 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73, 84, 91, 94, 44, 42,
+ 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95, 101, 101, 49, 47, 57,
+ 71, 86, 97, 103, 102, 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79,
+ 98, 110, 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84,
+ 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, 76, 92,
+ 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, 82, 73, 81, 97,
+ 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99,
+ 124, 145, 156, 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92,
+ 105, 120, 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91,
+ 94, 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107,
+ 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169, 188,
+ 114, 104, 100, 111, 127, 145, 166, 190,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65,
+ 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 32, 32, 32,
+ 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70,
+ 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36, 35, 35, 35, 36, 38,
+ 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87,
+ 89, 92, 93, 94, 95, 96, 98, 100, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58,
+ 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108,
+ 110, 110, 110, 111, 111, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79,
+ 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124,
+ 128, 129, 128, 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90,
+ 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141,
+ 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94,
+ 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163,
+ 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90,
+ 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161,
+ 171, 174, 179, 181, 188, 188, 190 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105,
+ /* Size 8x8 */
+ 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47, 47,
+ 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50, 58, 67,
+ 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65, 73, 84, 92,
+ 101, 103, 69, 65, 66, 73, 82, 92, 103, 109,
+ /* Size 16x16 */
+ 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30, 31,
+ 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35, 39, 44,
+ 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44, 47, 49, 48,
+ 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49, 53, 53, 54, 54,
+ 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53, 54, 56, 57, 59, 61,
+ 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72,
+ 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63, 66, 69, 72, 75, 76, 75, 76,
+ 76, 76, 55, 51, 50, 50, 56, 59, 65, 69, 73, 77, 79, 81, 81, 81, 80, 80,
+ 60, 55, 53, 53, 58, 61, 68, 72, 77, 82, 85, 87, 87, 85, 84, 85, 63, 58,
+ 56, 55, 60, 63, 70, 75, 79, 85, 89, 91, 91, 90, 89, 90, 66, 60, 58, 58,
+ 62, 65, 72, 76, 81, 87, 91, 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67,
+ 71, 75, 81, 87, 91, 96, 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71,
+ 76, 81, 85, 90, 95, 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76,
+ 80, 84, 89, 95, 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80,
+ 85, 90, 95, 100, 105, 108, 111,
+ /* Size 32x32 */
+ 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 31, 32,
+ 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57, 59, 60, 61,
+ 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32, 35, 39, 41, 42,
+ 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58, 59, 60, 62, 63, 64,
+ 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45,
+ 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59, 61, 62, 63, 63, 64, 65, 66,
+ 67, 68, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49,
+ 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 36, 38,
+ 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53,
+ 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47,
+ 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58,
+ 58, 59, 60, 61, 62, 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50,
+ 49, 49, 50, 50, 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61,
+ 62, 63, 63, 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54,
+ 54, 55, 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58,
+ 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46, 45,
+ 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 63, 64,
+ 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45, 45, 47, 49,
+ 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68, 69, 69, 68, 68,
+ 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 71, 71, 72, 72,
+ 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66,
+ 66, 67, 69, 70, 71, 72, 73, 73, 74, 73, 73, 74, 73, 73, 74, 75, 52, 50,
+ 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74,
+ 75, 75, 76, 77, 75, 76, 76, 75, 76, 77, 76, 75, 54, 52, 50, 49, 49, 48,
+ 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78,
+ 79, 78, 77, 78, 78, 77, 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58,
+ 59, 63, 65, 66, 69, 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80,
+ 80, 81, 80, 79, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67,
+ 70, 73, 74, 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83,
+ 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79,
+ 82, 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56,
+ 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87, 88,
+ 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54, 55, 57,
+ 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 91, 91,
+ 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68,
+ 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95, 92, 94, 92, 93, 92,
+ 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80,
+ 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95, 95, 95, 95, 95, 93, 67, 64,
+ 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89,
+ 93, 94, 95, 97, 97, 98, 99, 97, 97, 97, 96, 98, 68, 65, 63, 62, 60, 59,
+ 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97,
+ 99, 98, 100, 100, 100, 99, 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62,
+ 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101,
+ 100, 102, 102, 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65,
+ 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103,
+ 102, 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
+ 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 104,
+ 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73,
+ 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 106,
+ 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77,
+ 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 108,
+ 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80,
+ 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 110, 75,
+ 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84,
+ 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113,
+ /* Size 4x8 */
+ 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52, 64,
+ 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102,
+ /* Size 8x4 */
+ 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57, 52,
+ 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102,
+ /* Size 8x16 */
+ 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33, 41,
+ 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48, 53, 54,
+ 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54, 61, 66, 70,
+ 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65, 74, 79, 79, 78,
+ 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82, 89, 91, 87, 66, 58,
+ 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73,
+ 81, 89, 98, 101, 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79,
+ 89, 98, 105,
+ /* Size 16x8 */
+ 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35, 38,
+ 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46, 47, 48,
+ 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46, 47, 54, 56,
+ 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51, 57, 60, 66, 71,
+ 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60, 64, 70, 75, 79, 85,
+ 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95,
+ 97, 98, 100, 98, 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96,
+ 101, 103, 105,
+ /* Size 16x32 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31, 31,
+ 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32, 38, 40,
+ 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38, 41, 46, 45,
+ 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44, 47, 46, 46, 47,
+ 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47, 45, 46, 47, 50, 52,
+ 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57,
+ 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59,
+ 60, 60, 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61,
+ 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, 48, 46,
+ 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, 49, 45, 45, 47,
+ 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59,
+ 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64,
+ 68, 70, 71, 73, 72, 72, 70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73,
+ 75, 77, 75, 73, 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79,
+ 78, 76, 74, 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78,
+ 78, 75, 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79,
+ 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56,
+ 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55, 56,
+ 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57, 61, 68,
+ 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62, 69, 72, 76,
+ 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68, 71, 78, 83, 86,
+ 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95,
+ 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99,
+ 100, 98, 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101,
+ 99, 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104,
+ 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, 73,
+ 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, 74, 67,
+ 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68,
+ 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109,
+ /* Size 32x16 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33,
+ 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59,
+ 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46,
+ 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61,
+ 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47,
+ 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63,
+ 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55,
+ 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47,
+ 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46,
+ 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71,
+ 71, 72, 73, 73, 72, 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56,
+ 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74,
+ 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68,
+ 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78,
+ 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79,
+ 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57,
+ 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90,
+ 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60,
+ 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96,
+ 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67,
+ 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100,
+ 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73,
+ 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103,
+ 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78,
+ 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71,
+ 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80,
+ 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109,
+ /* Size 4x16 */
+ 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46, 53,
+ 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63, 77, 78,
+ 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, 62, 67, 86, 98, 64, 66,
+ 87, 100, 65, 68, 83, 102, 67, 70, 86, 103,
+ /* Size 16x4 */
+ 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49, 45,
+ 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55, 54, 54,
+ 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61, 59, 62, 65,
+ 68, 73, 78, 84, 89, 93, 98, 100, 102, 103,
+ /* Size 8x32 */
+ 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30, 38,
+ 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41, 47, 46,
+ 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48, 47, 51, 55,
+ 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54, 57, 60, 61, 61,
+ 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60, 64, 65, 65, 49, 45,
+ 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61,
+ 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78,
+ 78, 74, 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79,
+ 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, 63, 55,
+ 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, 66, 58, 62, 72,
+ 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94,
+ 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98,
+ 101, 71, 64, 65, 73, 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103,
+ 73, 66, 65, 72, 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75,
+ 68, 65, 71, 78, 87, 96, 105,
+ /* Size 32x8 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37, 38, 38,
+ 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56,
+ 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46, 47, 47, 48, 50,
+ 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65,
+ 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72,
+ 71, 71, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73,
+ 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 63, 60,
+ 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87,
+ 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59,
+ 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96,
+ 97, 99, 98, 99, 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61,
+ 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100,
+ 101, 103, 103, 105, 105, 105 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169,
+ /* Size 8x8 */
+ 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35, 37,
+ 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57, 54, 64,
+ 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144, 85, 79, 87,
+ 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144, 168, 184,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104, 31, 32,
+ 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31, 32, 33, 34,
+ 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32, 34, 35, 37, 40,
+ 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36, 37, 42, 47, 50, 53,
+ 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73,
+ 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50, 58, 63, 68, 74, 79, 84, 91,
+ 96, 98, 102, 104, 49, 46, 46, 46, 53, 62, 68, 73, 81, 87, 92, 99, 103,
+ 107, 109, 112, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114,
+ 118, 117, 121, 65, 60, 59, 58, 65, 73, 79, 87, 97, 105, 111, 120, 125,
+ 125, 126, 130, 71, 66, 64, 63, 70, 78, 84, 92, 102, 111, 117, 127, 133,
+ 134, 136, 141, 81, 75, 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143,
+ 145, 148, 152, 87, 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150,
+ 156, 160, 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156,
+ 163, 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148,
+ 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130, 141,
+ 152, 163, 177, 184, 191,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58, 59,
+ 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31, 32, 32,
+ 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57, 62, 66, 68,
+ 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60, 64, 66, 73, 75, 76, 81,
+ 83, 86, 89, 92, 95, 98, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 37,
+ 38, 41, 44, 45, 49, 53, 54, 59, 63, 65, 72, 74, 75, 79, 81, 84, 86, 89,
+ 91, 94, 97, 31, 32, 32, 32, 33, 33, 34, 35, 36, 36, 39, 40, 42, 45, 46,
+ 50, 53, 54, 59, 63, 64, 71, 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32,
+ 32, 32, 32, 33, 34, 34, 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58,
+ 62, 63, 70, 72, 73, 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34,
+ 34, 35, 37, 37, 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71,
+ 72, 75, 78, 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41,
+ 42, 45, 46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82,
+ 83, 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50,
+ 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90, 92,
+ 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57, 60, 63,
+ 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99, 39, 38, 38,
+ 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78,
+ 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39, 39, 38, 40, 40, 41,
+ 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75, 78, 80, 86, 87, 88, 91,
+ 93, 96, 97, 97, 99, 102, 105, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54,
+ 58, 59, 63, 66, 68, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 98, 98, 99,
+ 102, 104, 104, 105, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66,
+ 70, 71, 76, 79, 80, 85, 88, 90, 96, 97, 98, 101, 100, 102, 105, 105,
+ 105, 109, 112, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71,
+ 73, 77, 81, 82, 87, 90, 92, 98, 99, 100, 103, 106, 107, 106, 109, 112,
+ 112, 112, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77,
+ 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 110, 109, 113, 114, 113,
+ 116, 120, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81,
+ 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121,
+ 121, 120, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82,
+ 87, 91, 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122,
+ 125, 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87,
+ 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126, 130,
+ 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90,
+ 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131, 130, 134,
+ 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90,
+ 92, 97, 102, 104, 111, 115, 117, 125, 127, 128, 133, 136, 134, 139, 136,
+ 139, 141, 140, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96,
+ 98, 104, 109, 111, 118, 123, 125, 134, 136, 137, 142, 138, 143, 140,
+ 144, 144, 144, 149, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91,
+ 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 139, 143, 148, 145, 148,
+ 148, 150, 152, 149, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92,
+ 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153,
+ 153, 154, 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91,
+ 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151,
+ 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87,
+ 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+ 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83, 82, 80, 82, 85,
+ 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153,
+ 156, 157, 163, 164, 169, 172, 177, 172, 95, 90, 89, 86, 85, 85, 83, 83,
+ 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148,
+ 153, 159, 162, 164, 169, 170, 176, 179, 185, 98, 93, 92, 89, 88, 87, 86,
+ 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144,
+ 148, 154, 160, 166, 169, 170, 176, 177, 184, 186, 101, 96, 95, 91, 91,
+ 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139,
+ 144, 150, 155, 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94,
+ 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135,
+ 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107, 101,
+ 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129,
+ 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199,
+ /* Size 4x8 */
+ 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101, 54, 69,
+ 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127, 165,
+ /* Size 8x4 */
+ 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62, 58,
+ 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165,
+ /* Size 8x16 */
+ 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32, 33,
+ 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36, 44, 50,
+ 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, 74, 90,
+ 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63, 74, 90, 108, 111,
+ 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64, 73, 84, 102, 125, 135,
+ 131, 81, 72, 80, 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148,
+ 151, 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, 141,
+ 160, 169, 103, 94, 92, 103, 119, 137, 158, 175,
+ /* Size 16x8 */
+ 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, 32, 32,
+ 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36, 35, 36, 38,
+ 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41, 42, 42, 50, 58,
+ 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54, 53, 52, 59, 68, 74,
+ 81, 90, 97, 102, 110, 114, 117, 121, 119, 79, 73, 71, 69, 75, 84, 90,
+ 97, 108, 118, 125, 135, 140, 133, 141, 137, 88, 81, 78, 76, 81, 88, 97,
+ 104, 111, 123, 135, 145, 148, 153, 160, 158, 93, 88, 84, 82, 84, 90, 97,
+ 105, 113, 122, 131, 141, 151, 163, 169, 175,
+ /* Size 16x32 */
+ 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31, 32,
+ 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32, 32, 32,
+ 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37,
+ 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, 42, 50,
+ 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37, 40, 42, 49, 53, 58,
+ 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40, 42, 49, 52, 58, 69, 70,
+ 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79,
+ 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, 81, 83, 84, 84,
+ 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, 78, 80, 85, 85, 86, 89, 39, 37,
+ 39, 40, 50, 54, 58, 65, 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41,
+ 51, 55, 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58,
+ 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75,
+ 79, 85, 95, 97, 99, 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81,
+ 86, 97, 99, 104, 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92,
+ 103, 105, 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97,
+ 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98,
+ 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97, 105,
+ 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96, 100, 109,
+ 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84, 97, 102, 111,
+ 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84, 90, 104, 109, 118,
+ 133, 135, 137, 136, 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120,
+ 135, 137, 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121,
+ 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, 114, 125,
+ 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129,
+ 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117,
+ 128, 133, 150, 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105,
+ 120, 125, 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103,
+ 108, 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98,
+ 103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90, 92,
+ 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97, 97, 93,
+ 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59,
+ 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65,
+ 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34,
+ 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77,
+ 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40,
+ 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85,
+ 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57,
+ 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39,
+ 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73,
+ 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41,
+ 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90,
+ 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49,
+ 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105,
+ 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52,
+ 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111,
+ 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63,
+ 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121,
+ 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73,
+ 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136,
+ 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75,
+ 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139,
+ 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79,
+ 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145,
+ 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78,
+ 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136,
+ 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84,
+ 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131,
+ 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87,
+ 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128,
+ 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187,
+ /* Size 4x16 */
+ 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34, 47,
+ 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53, 68, 97,
+ 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125,
+ 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170,
+ /* Size 16x4 */
+ 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39, 38,
+ 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65, 60, 59, 58,
+ 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, 90, 84, 81, 78,
+ 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170,
+ /* Size 8x32 */
+ 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31, 32,
+ 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33, 36, 42,
+ 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69,
+ 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, 81, 84,
+ 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68, 84, 88, 90, 40, 40,
+ 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90, 97, 97, 47, 45, 56, 66,
+ 79, 95, 99, 98, 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86,
+ 103, 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, 91,
+ 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62, 71, 83, 100,
+ 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, 79, 71, 79, 90, 109,
+ 133, 137, 136, 81, 72, 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111,
+ 136, 147, 147, 87, 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113,
+ 135, 153, 160, 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103,
+ 120, 137, 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90,
+ 103, 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97,
+ 93, 104, 118, 135, 155, 176,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59,
+ 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32, 32, 32,
+ 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64,
+ 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35, 34, 36, 37, 38,
+ 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85,
+ 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58,
+ 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103,
+ 103, 103, 103, 104, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74,
+ 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120,
+ 121, 120, 119, 118, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90,
+ 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137,
+ 141, 139, 137, 135, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97,
+ 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153,
+ 155, 160, 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92,
+ 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163,
+ 168, 169, 175, 175, 176 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101,
+ /* Size 8x8 */
+ 31, 36, 47, 48, 52, 60, 64, 67, 36, 43, 47, 46, 49, 55, 59, 63, 47, 47,
+ 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49, 55, 65,
+ 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63, 71, 81, 89,
+ 97, 99, 67, 63, 64, 71, 79, 89, 99, 104,
+ /* Size 16x16 */
+ 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30, 31,
+ 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35, 39, 43,
+ 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43, 47, 47, 46,
+ 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47, 50, 51, 51, 51,
+ 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51, 54, 55, 56, 58, 60,
+ 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55, 58, 60, 62, 63, 65, 68,
+ 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60, 62, 65, 67, 69, 72, 73, 74,
+ 73, 73, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 77, 77,
+ 57, 53, 51, 50, 54, 60, 63, 67, 73, 76, 79, 82, 84, 83, 82, 82, 60, 55,
+ 53, 52, 56, 61, 65, 69, 75, 79, 82, 86, 88, 87, 86, 87, 64, 59, 57, 55,
+ 59, 64, 68, 72, 78, 82, 86, 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65,
+ 69, 73, 79, 84, 88, 93, 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74,
+ 79, 83, 87, 92, 96, 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77,
+ 82, 86, 91, 96, 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82,
+ 87, 92, 96, 101, 104, 106,
+ /* Size 32x32 */
+ 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54, 55,
+ 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31, 31, 31,
+ 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54, 56, 57, 60,
+ 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31, 35, 37, 39, 42,
+ 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55, 58, 59, 60, 61, 63,
+ 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37, 40, 42, 44, 46, 45, 45,
+ 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58, 58, 60, 61, 62, 63, 63, 64,
+ 65, 66, 33, 34, 35, 35, 39, 41, 43, 45, 46, 47, 46, 46, 45, 46, 47, 47,
+ 49, 49, 51, 53, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 35, 36,
+ 37, 37, 41, 43, 45, 46, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52,
+ 53, 55, 56, 56, 58, 59, 60, 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45,
+ 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55,
+ 57, 58, 58, 59, 61, 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50,
+ 49, 49, 49, 50, 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59,
+ 60, 61, 61, 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51,
+ 51, 52, 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55,
+ 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46, 45,
+ 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 63,
+ 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46, 46, 46, 49,
+ 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65, 65, 65, 66, 67,
+ 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56,
+ 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68, 69, 70, 69, 69, 69, 70,
+ 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 62, 63,
+ 64, 65, 66, 68, 68, 70, 71, 71, 72, 71, 71, 72, 71, 71, 71, 72, 51, 48,
+ 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 64, 65, 66, 67, 69,
+ 69, 71, 72, 72, 73, 74, 74, 72, 73, 74, 73, 73, 52, 50, 48, 47, 47, 47,
+ 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75,
+ 77, 76, 75, 76, 76, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55,
+ 58, 59, 62, 64, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78,
+ 77, 78, 77, 77, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65,
+ 66, 68, 70, 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73,
+ 76, 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53,
+ 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81, 84,
+ 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53, 52, 55,
+ 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86, 86, 88, 88,
+ 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65,
+ 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92, 89, 91, 89, 90, 89,
+ 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75,
+ 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92, 92, 91, 91, 92, 90, 65, 61,
+ 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85,
+ 86, 90, 91, 91, 93, 94, 95, 94, 94, 94, 93, 94, 67, 63, 61, 60, 59, 58,
+ 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93,
+ 95, 95, 96, 97, 96, 95, 96, 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63,
+ 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98,
+ 99, 99, 97, 99, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71,
+ 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101,
+ 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78,
+ 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67,
+ 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85,
+ 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67, 67, 64,
+ 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89,
+ 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68, 68, 65, 65, 64,
+ 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93,
+ 96, 97, 101, 102, 104, 104, 106, 106, 73, 69, 69, 66, 66, 64, 64, 62,
+ 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99,
+ 99, 104, 104, 106, 106, 108,
+ /* Size 4x8 */
+ 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50, 59,
+ 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99,
+ /* Size 8x4 */
+ 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54, 50,
+ 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99,
+ /* Size 8x16 */
+ 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33, 40,
+ 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46, 51, 51,
+ 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53, 58, 62, 67,
+ 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62, 70, 77, 77, 76,
+ 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75, 85, 89, 85, 64, 57,
+ 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71,
+ 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86,
+ 95, 102,
+ /* Size 16x8 */
+ 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34, 36,
+ 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46, 47, 47,
+ 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46, 45, 51, 56,
+ 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48, 53, 58, 62, 65,
+ 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59, 64, 67, 71, 77, 82,
+ 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93,
+ 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98,
+ 100, 102,
+ /* Size 16x32 */
+ 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31, 31,
+ 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, 31, 32, 36, 39,
+ 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36, 40, 46, 45,
+ 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43, 47, 46, 46, 47,
+ 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47, 46, 45, 47, 48, 50,
+ 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55,
+ 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58,
+ 59, 59, 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60,
+ 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, 48, 46,
+ 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, 48, 45, 46, 46,
+ 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56,
+ 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63,
+ 65, 66, 70, 71, 70, 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68,
+ 71, 72, 73, 71, 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75,
+ 76, 75, 73, 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77,
+ 76, 74, 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78,
+ 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54,
+ 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53, 52,
+ 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55, 60, 64,
+ 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61, 64, 68, 75,
+ 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65, 68, 75, 78, 83,
+ 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93,
+ 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97,
+ 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96,
+ 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, 70,
+ 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, 71, 65,
+ 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, 72, 65, 65,
+ 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, 73, 66, 66, 63,
+ 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54,
+ 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31, 32, 32,
+ 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57,
+ 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45,
+ 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60,
+ 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46,
+ 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61,
+ 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54,
+ 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47,
+ 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61,
+ 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45,
+ 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68,
+ 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54,
+ 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72,
+ 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65,
+ 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74,
+ 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57,
+ 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89,
+ 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58,
+ 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93,
+ 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66,
+ 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97,
+ 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75,
+ 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69,
+ 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81,
+ 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63,
+ 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86,
+ 87, 91, 91, 95, 96, 101, 101, 103, 103, 105,
+ /* Size 4x16 */
+ 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44, 51,
+ 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58, 73, 77,
+ 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84, 95, 62, 64,
+ 84, 97, 64, 66, 81, 99, 65, 68, 83, 100,
+ /* Size 16x4 */
+ 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48, 46,
+ 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53, 51, 50,
+ 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60, 58, 61, 64,
+ 67, 71, 77, 82, 87, 91, 95, 97, 99, 100,
+ /* Size 8x32 */
+ 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31, 36,
+ 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40, 47, 46,
+ 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47, 45, 48, 54,
+ 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51, 53, 59, 60, 61,
+ 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58, 64, 64, 64, 48, 46,
+ 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59,
+ 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75,
+ 76, 73, 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77,
+ 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, 60, 53,
+ 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, 64, 57, 61, 68,
+ 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91,
+ 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98,
+ 69, 62, 63, 71, 80, 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71,
+ 64, 63, 70, 78, 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66,
+ 63, 69, 76, 84, 93, 101,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54,
+ 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35, 36, 36,
+ 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56,
+ 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46, 47, 47, 47, 50,
+ 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63,
+ 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56,
+ 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70,
+ 69, 69, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68,
+ 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 63, 60,
+ 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84,
+ 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58,
+ 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94,
+ 94, 96, 96, 96, 97, 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62,
+ 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100,
+ 100, 102, 102, 101 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156,
+ /* Size 8x8 */
+ 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35, 36,
+ 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 92, 98, 53, 51, 60, 73,
+ 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78, 74, 80, 92,
+ 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31, 32,
+ 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32, 33, 33,
+ 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33, 35, 37, 38,
+ 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37, 39, 42, 46, 50,
+ 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42, 48, 52, 56, 60, 64,
+ 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52, 57, 62, 67, 71, 75, 83,
+ 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62, 69, 75, 79, 84, 91, 97, 100,
+ 102, 104, 54, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 106, 110,
+ 109, 112, 59, 55, 54, 53, 58, 64, 71, 79, 87, 92, 98, 106, 112, 117,
+ 117, 121, 65, 61, 59, 58, 63, 68, 75, 84, 92, 98, 105, 114, 120, 125,
+ 126, 130, 74, 69, 67, 65, 70, 75, 83, 91, 100, 106, 114, 123, 131, 135,
+ 137, 140, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144,
+ 148, 150, 87, 81, 78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150,
+ 155, 162, 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155,
+ 162, 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162,
+ 168, 174,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54, 56,
+ 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53, 56, 61, 62, 68,
+ 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60, 61, 67, 69, 74, 76, 77,
+ 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37,
+ 38, 41, 44, 44, 49, 51, 54, 58, 59, 65, 68, 72, 74, 75, 79, 81, 84, 86,
+ 88, 90, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45,
+ 49, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 83, 85, 88, 90, 31, 32,
+ 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58,
+ 59, 64, 67, 71, 73, 74, 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34,
+ 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69,
+ 71, 72, 75, 78, 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39,
+ 40, 42, 43, 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78,
+ 80, 82, 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47,
+ 50, 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86,
+ 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59, 60,
+ 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35, 35, 34,
+ 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64, 67, 68, 73,
+ 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37, 39, 39, 40, 42,
+ 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73, 78, 80, 84, 86, 86,
+ 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55,
+ 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83, 86, 88, 89, 92, 93, 95, 97,
+ 97, 98, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67,
+ 71, 73, 75, 79, 79, 84, 86, 90, 92, 92, 96, 98, 98, 98, 101, 104, 47,
+ 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79,
+ 83, 84, 89, 91, 95, 97, 97, 100, 99, 102, 105, 104, 104, 48, 46, 45, 44,
+ 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90,
+ 93, 96, 98, 99, 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50,
+ 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100,
+ 104, 106, 106, 110, 108, 109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51,
+ 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106,
+ 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53, 55,
+ 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106, 110, 112,
+ 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58, 58, 57, 59, 62,
+ 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103, 109, 112, 116, 118,
+ 119, 122, 121, 125, 123, 125, 128, 65, 62, 61, 59, 59, 59, 58, 60, 63,
+ 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 111, 114, 118, 120,
+ 121, 125, 129, 126, 129, 130, 129, 71, 68, 67, 65, 64, 64, 63, 65, 68,
+ 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 120, 125, 127,
+ 128, 133, 130, 134, 133, 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70,
+ 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131,
+ 131, 135, 137, 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74,
+ 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136,
+ 137, 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76,
+ 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139,
+ 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72, 73, 76,
+ 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139,
+ 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79, 78, 78, 75, 77, 80,
+ 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142,
+ 144, 145, 150, 151, 155, 158, 162, 158, 90, 85, 84, 81, 80, 80, 78, 78,
+ 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141,
+ 147, 150, 151, 156, 156, 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80,
+ 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137,
+ 142, 148, 152, 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83,
+ 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138,
+ 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88, 86, 86,
+ 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133,
+ 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100, 95, 95, 90, 90,
+ 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129,
+ 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181,
+ /* Size 4x8 */
+ 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51, 61,
+ 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154,
+ /* Size 8x4 */
+ 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59, 57,
+ 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154,
+ /* Size 8x16 */
+ 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31, 33,
+ 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35, 41, 48,
+ 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49, 60, 67, 76,
+ 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106,
+ 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120,
+ 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139,
+ 142, 87, 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150,
+ 157, 97, 88, 86, 97, 111, 128, 147, 163,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31, 32,
+ 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34, 35, 37,
+ 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42, 42, 48, 54,
+ 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49, 54, 60, 67, 75,
+ 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59, 58, 63, 68, 76, 84, 92,
+ 98, 105, 113, 120, 125, 132, 128, 82, 76, 73, 71, 76, 80, 88, 97, 106,
+ 112, 120, 131, 139, 144, 150, 147, 90, 85, 81, 79, 81, 87, 93, 101, 108,
+ 116, 124, 134, 142, 153, 157, 163,
+ /* Size 16x32 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31, 32,
+ 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32, 32, 32,
+ 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34,
+ 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, 42, 44,
+ 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36, 36, 42, 45, 50, 57,
+ 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38, 42, 45, 49, 56, 58, 69,
+ 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76,
+ 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, 76, 81, 81, 80,
+ 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, 67, 77, 79, 82, 83, 86, 36, 34,
+ 36, 38, 46, 48, 54, 56, 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40,
+ 48, 50, 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51,
+ 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66,
+ 71, 78, 79, 90, 92, 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82,
+ 84, 95, 97, 98, 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96,
+ 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106,
+ 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106, 108,
+ 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98, 110, 112,
+ 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100, 103, 116, 118,
+ 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120,
+ 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127,
+ 129, 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, 131,
+ 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, 118, 133, 136,
+ 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, 105, 117, 120, 136, 139,
+ 145, 142, 140, 82, 75, 74, 72, 79, 81, 92, 97, 106, 117, 121, 136, 139,
+ 148, 150, 149, 87, 79, 78, 76, 83, 85, 96, 100, 110, 120, 125, 141, 144,
+ 148, 153, 150, 89, 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145,
+ 153, 157, 161, 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150,
+ 153, 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151,
+ 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147,
+ 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144, 144,
+ 163, 163, 173,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55,
+ 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65,
+ 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34,
+ 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74,
+ 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40,
+ 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82,
+ 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54,
+ 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35,
+ 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67,
+ 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42,
+ 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90,
+ 92, 92, 96, 97, 97, 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55,
+ 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99,
+ 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67,
+ 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114,
+ 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78,
+ 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116,
+ 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79,
+ 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132,
+ 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90,
+ 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135,
+ 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92,
+ 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150,
+ 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97,
+ 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153,
+ 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93,
+ 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153,
+ 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91,
+ 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150,
+ 161, 162, 166, 167, 173,
+ /* Size 4x16 */
+ 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34, 42,
+ 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60, 90, 107,
+ 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145,
+ 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159,
+ /* Size 16x4 */
+ 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36, 35,
+ 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58, 57, 56,
+ 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88, 82, 79, 76,
+ 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159,
+ /* Size 8x32 */
+ 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31, 32,
+ 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33, 35, 42,
+ 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58,
+ 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, 76, 81,
+ 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60, 68, 80, 87, 39, 39,
+ 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76, 88, 93, 44, 42, 51, 63,
+ 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85,
+ 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, 84, 94, 108,
+ 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, 65, 78, 91, 103, 118,
+ 119, 65, 59, 66, 79, 92, 105, 120, 124, 71, 64, 71, 84, 97, 111, 127,
+ 129, 74, 67, 73, 86, 100, 113, 131, 134, 79, 71, 77, 90, 104, 118, 136,
+ 139, 82, 73, 79, 92, 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139,
+ 150, 87, 78, 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145,
+ 157, 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151,
+ 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144,
+ 163,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55,
+ 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32,
+ 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64,
+ 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34, 35, 36, 37, 39,
+ 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79,
+ 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58,
+ 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97,
+ 97, 97, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76,
+ 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111,
+ 110, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92,
+ 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
+ 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106,
+ 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147,
+ 144, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101,
+ 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163,
+ 163, 163 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97,
+ /* Size 8x8 */
+ 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46, 47,
+ 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48, 54, 62,
+ 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61, 68, 77, 86,
+ 91, 95, 65, 61, 62, 68, 76, 86, 95, 100,
+ /* Size 16x16 */
+ 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31, 31,
+ 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34, 37, 42,
+ 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42, 47, 47, 48,
+ 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47, 48, 50, 49, 50,
+ 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50, 53, 53, 54, 54, 55,
+ 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53, 55, 57, 59, 60, 61, 64,
+ 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57, 61, 63, 64, 66, 69, 70, 72,
+ 71, 71, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 75, 75,
+ 54, 51, 49, 48, 52, 55, 60, 64, 68, 71, 73, 76, 79, 80, 79, 79, 57, 53,
+ 51, 50, 53, 56, 61, 66, 70, 73, 76, 80, 82, 84, 83, 84, 61, 57, 55, 53,
+ 56, 59, 64, 69, 73, 76, 80, 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61,
+ 66, 70, 75, 79, 82, 87, 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72,
+ 77, 80, 84, 89, 93, 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79,
+ 83, 88, 93, 96, 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88,
+ 93, 97, 100, 102,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31, 31, 31,
+ 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52, 54, 54, 57,
+ 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31, 34, 35, 39, 40,
+ 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 56, 57, 59, 60, 60,
+ 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45,
+ 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 58, 60, 61, 62, 62,
+ 63, 64, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46,
+ 47, 48, 49, 51, 51, 53, 55, 56, 57, 57, 59, 60, 61, 62, 63, 64, 33, 34,
+ 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51,
+ 51, 53, 54, 56, 57, 57, 59, 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43,
+ 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54,
+ 55, 55, 57, 58, 59, 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48,
+ 49, 48, 47, 47, 47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58,
+ 58, 59, 60, 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49,
+ 50, 50, 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54,
+ 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47, 46,
+ 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58,
+ 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46, 46, 46, 48,
+ 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64,
+ 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
+ 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 65, 66, 66, 67, 67, 67, 68,
+ 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60,
+ 61, 62, 62, 63, 63, 65, 66, 67, 68, 68, 69, 70, 69, 68, 69, 70, 50, 48,
+ 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 64, 66,
+ 66, 68, 69, 70, 70, 71, 72, 70, 71, 72, 71, 70, 50, 48, 47, 46, 46, 46,
+ 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71,
+ 71, 71, 73, 74, 73, 72, 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53,
+ 54, 57, 59, 61, 63, 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75,
+ 75, 76, 75, 74, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62,
+ 64, 64, 67, 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78,
+ 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69,
+ 71, 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51,
+ 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76, 78,
+ 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51,
+ 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80, 82, 82, 83,
+ 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61,
+ 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86, 86, 88, 86, 87, 86,
+ 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69,
+ 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89, 89, 88, 88, 88, 86, 63, 60,
+ 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81,
+ 82, 85, 86, 89, 90, 90, 92, 91, 91, 90, 89, 91, 64, 61, 60, 58, 57, 57,
+ 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90,
+ 91, 91, 93, 93, 93, 92, 93, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61,
+ 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94,
+ 95, 95, 93, 95, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69,
+ 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95,
+ 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78,
+ 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62,
+ 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87,
+ 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62, 61, 60,
+ 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92,
+ 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63,
+ 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98,
+ 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67,
+ 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101,
+ 101, 104,
+ /* Size 4x8 */
+ 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49, 55,
+ 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95,
+ /* Size 8x4 */
+ 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53, 50,
+ 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95,
+ /* Size 8x16 */
+ 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33, 37,
+ 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44, 49, 49,
+ 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 57, 59, 61,
+ 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74,
+ 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70, 76, 83, 83, 61, 55,
+ 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69,
+ 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83,
+ 92, 98,
+ /* Size 16x8 */
+ 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33, 34,
+ 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45, 46, 47,
+ 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45, 45, 49, 53,
+ 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47, 50, 54, 59, 63,
+ 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53, 57, 61, 66, 71, 73,
+ 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87,
+ 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95,
+ 97, 98,
+ /* Size 16x32 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, 31,
+ 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, 34, 39,
+ 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, 40, 44, 46,
+ 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, 46, 47, 45, 46,
+ 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, 47, 46, 46, 47, 50,
+ 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, 45, 46, 47, 49, 50, 54,
+ 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, 47, 48, 50, 51, 55, 56, 57,
+ 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, 50, 53, 53, 57, 58, 60, 60, 59,
+ 47, 46, 46, 48, 51, 52, 53, 53, 53, 55, 56, 60, 61, 61, 61, 62, 49, 46,
+ 47, 48, 52, 53, 53, 54, 54, 56, 57, 60, 61, 63, 63, 62, 48, 46, 46, 47,
+ 51, 53, 56, 56, 57, 59, 60, 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53,
+ 57, 57, 59, 61, 61, 65, 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59,
+ 61, 63, 64, 67, 68, 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65,
+ 66, 70, 71, 70, 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71,
+ 71, 73, 71, 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75,
+ 74, 72, 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76,
+ 54, 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+ 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, 50,
+ 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, 57, 58,
+ 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, 59, 66, 69,
+ 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, 67, 70, 75, 80,
+ 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, 71, 75, 81, 83, 90,
+ 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 94,
+ 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, 82, 84, 92, 93, 94, 95, 93,
+ 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, 85, 90, 93, 96, 97, 97, 68, 62,
+ 61, 59, 61, 64, 68, 71, 77, 79, 86, 88, 94, 96, 97, 98, 69, 63, 62, 59,
+ 61, 65, 68, 72, 76, 80, 85, 88, 94, 95, 99, 99, 70, 63, 63, 60, 61, 66,
+ 67, 73, 75, 81, 83, 89, 92, 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74,
+ 74, 82, 82, 90, 90, 98, 98, 102,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32,
+ 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54,
+ 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43,
+ 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47,
+ 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59,
+ 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52,
+ 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47,
+ 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56,
+ 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46,
+ 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67,
+ 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53,
+ 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70,
+ 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61,
+ 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74,
+ 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71,
+ 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52,
+ 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79,
+ 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55,
+ 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90,
+ 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64,
+ 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94,
+ 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73,
+ 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64,
+ 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80,
+ 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60,
+ 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89,
+ 89, 93, 93, 97, 98, 99, 99, 102,
+ /* Size 4x16 */
+ 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43, 50,
+ 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54, 70, 75,
+ 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81, 93, 60, 63,
+ 82, 94, 62, 64, 79, 96, 63, 66, 81, 97,
+ /* Size 16x4 */
+ 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48, 46,
+ 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52, 50, 49,
+ 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60, 57, 60, 63,
+ 66, 70, 75, 80, 85, 89, 93, 94, 96, 97,
+ /* Size 8x32 */
+ 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31, 34,
+ 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37, 46, 45,
+ 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47, 45, 47, 50,
+ 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49, 50, 53, 58, 60,
+ 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46,
+ 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58,
+ 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67,
+ 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+ 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, 57, 51,
+ 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, 61, 55, 58, 66,
+ 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83,
+ 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95,
+ 67, 60, 61, 69, 78, 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62,
+ 61, 68, 76, 85, 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67,
+ 74, 82, 90, 98,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34, 34, 35,
+ 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44, 46, 46, 47, 48,
+ 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60,
+ 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56,
+ 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68,
+ 67, 67, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64,
+ 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 57, 54,
+ 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76,
+ 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57,
+ 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90,
+ 91, 91, 93, 93, 94, 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61,
+ 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97,
+ 97, 99, 98, 98 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140,
+ /* Size 8x8 */
+ 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34, 35,
+ 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48, 53, 65,
+ 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76, 71, 74, 86,
+ 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31, 32,
+ 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32, 32, 33,
+ 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33, 34, 36, 37,
+ 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36, 38, 42, 44, 46,
+ 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42, 48, 50, 54, 57, 60,
+ 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50, 54, 58, 61, 65, 69, 74,
+ 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58, 63, 67, 71, 75, 80, 86, 91,
+ 95, 97, 48, 46, 45, 45, 50, 57, 61, 67, 71, 76, 80, 86, 93, 98, 101,
+ 104, 54, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 112,
+ 59, 56, 54, 53, 57, 64, 69, 75, 80, 87, 92, 99, 106, 112, 116, 120, 66,
+ 63, 60, 59, 63, 69, 74, 80, 86, 93, 99, 107, 115, 121, 125, 129, 74, 70,
+ 67, 66, 69, 75, 80, 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77,
+ 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81,
+ 77, 75, 78, 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86,
+ 82, 80, 80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48, 52,
+ 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51, 56, 56, 62,
+ 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56, 61, 63, 67, 70, 75,
+ 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60, 65, 68, 72, 74, 75, 78, 80,
+ 82, 84, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43,
+ 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 77, 80, 82, 84, 31, 32,
+ 32, 32, 33, 33, 33, 34, 35, 35, 36, 36, 39, 40, 42, 44, 45, 48, 50, 53,
+ 54, 59, 60, 64, 67, 71, 73, 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33,
+ 34, 35, 36, 36, 37, 38, 40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63,
+ 66, 70, 71, 72, 75, 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37,
+ 38, 38, 40, 41, 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72,
+ 74, 77, 79, 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45,
+ 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81,
+ 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53,
+ 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35, 34,
+ 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63, 64, 68,
+ 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35, 36, 38, 38,
+ 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68, 69, 73, 75, 79,
+ 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51,
+ 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78, 80, 84, 85, 86, 89, 91,
+ 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 59, 61,
+ 63, 65, 67, 70, 70, 75, 76, 80, 82, 86, 87, 88, 91, 92, 94, 96, 44, 42,
+ 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, 65, 67, 70, 71, 75,
+ 75, 79, 80, 84, 86, 90, 91, 92, 95, 97, 97, 97, 46, 44, 44, 42, 43, 44,
+ 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87,
+ 89, 93, 94, 95, 98, 98, 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50,
+ 51, 57, 57, 61, 63, 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98,
+ 99, 101, 104, 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59,
+ 64, 65, 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106,
+ 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67,
+ 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112,
+ 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75,
+ 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111, 112, 115, 114, 115,
+ 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80,
+ 85, 87, 91, 92, 98, 99, 103, 106, 110, 112, 113, 116, 119, 120, 119, 65,
+ 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92,
+ 97, 98, 105, 106, 111, 114, 118, 120, 121, 124, 123, 123, 126, 66, 63,
+ 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98,
+ 99, 106, 107, 112, 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67,
+ 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103,
+ 103, 111, 112, 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70,
+ 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105,
+ 106, 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75,
+ 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110,
+ 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144, 81, 77, 77,
+ 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111,
+ 112, 120, 121, 127, 130, 136, 137, 139, 142, 145, 148, 144, 83, 78, 78,
+ 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112,
+ 113, 121, 122, 128, 131, 137, 139, 140, 144, 148, 150, 155, 86, 82, 81,
+ 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115,
+ 116, 124, 125, 131, 135, 140, 142, 144, 147, 149, 153, 155, 89, 84, 84,
+ 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114,
+ 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86,
+ 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115,
+ 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88,
+ 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118,
+ 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164,
+ /* Size 4x8 */
+ 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48, 59,
+ 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144,
+ /* Size 8x4 */
+ 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51, 50,
+ 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144,
+ /* Size 8x16 */
+ 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32,
+ 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34, 38, 44,
+ 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44, 54, 63, 73,
+ 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103,
+ 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66,
+ 60, 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81,
+ 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, 91,
+ 82, 80, 90, 103, 119, 137, 151,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31, 32,
+ 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33, 34, 36,
+ 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38, 40, 44, 51,
+ 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48, 52, 58, 63, 69,
+ 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58, 62, 68, 73, 79, 85,
+ 92, 98, 106, 113, 120, 124, 119, 79, 74, 71, 69, 72, 78, 84, 90, 96,
+ 103, 110, 119, 128, 135, 140, 137, 87, 82, 79, 77, 78, 84, 89, 96, 103,
+ 111, 118, 126, 134, 143, 147, 151,
+ /* Size 16x32 */
+ 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31, 32,
+ 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32, 32, 32,
+ 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34,
+ 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, 38, 41,
+ 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35, 36, 39, 42, 48, 50,
+ 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37, 40, 42, 48, 49, 58, 59,
+ 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70,
+ 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, 72, 74, 78, 78,
+ 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, 63, 64, 73, 75, 80, 83, 36, 34,
+ 35, 38, 42, 48, 51, 54, 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38,
+ 42, 48, 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50,
+ 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59,
+ 65, 67, 75, 76, 85, 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71,
+ 79, 80, 90, 91, 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83,
+ 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98,
+ 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105,
+ 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111, 107,
+ 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58,
+ 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, 65, 60,
+ 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, 122, 123, 66, 61,
+ 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119, 121, 126, 123, 71, 65,
+ 65, 63, 67, 73, 79, 84, 94, 97, 111, 112, 125, 127, 131, 132, 74, 68,
+ 67, 66, 69, 75, 81, 86, 97, 100, 113, 115, 128, 130, 134, 132, 79, 72,
+ 72, 70, 73, 79, 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74,
+ 73, 71, 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75,
+ 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, 86, 78,
+ 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81,
+ 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83,
+ 82, 79, 80, 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85,
+ 85, 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52,
+ 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60,
+ 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72,
+ 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38,
+ 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77,
+ 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48,
+ 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35,
+ 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63,
+ 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39,
+ 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79,
+ 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48,
+ 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92,
+ 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63,
+ 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105,
+ 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71,
+ 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114,
+ 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85,
+ 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
+ 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91,
+ 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79,
+ 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101,
+ 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103,
+ 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103,
+ 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151,
+ 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100,
+ 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155,
+ 155, 160,
+ /* Size 4x16 */
+ 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33, 41,
+ 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56, 76, 98,
+ 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, 74,
+ 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36, 35,
+ 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51, 49, 49,
+ 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76, 73, 71, 74,
+ 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148,
+ /* Size 8x32 */
+ 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31, 32,
+ 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32, 34, 38,
+ 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58,
+ 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, 72, 78,
+ 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58, 68, 78, 84, 36, 35,
+ 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73, 84, 89, 40, 39, 45, 56,
+ 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82,
+ 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, 78, 90, 101,
+ 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, 57, 69, 83, 97, 109, 113,
+ 58, 54, 57, 70, 84, 98, 110, 118, 65, 59, 62, 74, 89, 105, 118, 122, 66,
+ 60, 63, 75, 90, 106, 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74,
+ 67, 69, 81, 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81,
+ 73, 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86,
+ 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91,
+ 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52,
+ 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59,
+ 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33, 34, 35, 36, 36,
+ 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73,
+ 75, 75, 78, 80, 80, 81, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51,
+ 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90,
+ 90, 90, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72,
+ 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103,
+ 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90,
+ 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 79,
+ 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101,
+ 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103,
+ 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151,
+ 152 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91,
+ /* Size 8x8 */
+ 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42, 45,
+ 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47, 50, 58,
+ 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57, 65, 73, 82,
+ 89, 92, 64, 59, 60, 66, 74, 83, 92, 96,
+ /* Size 16x16 */
+ 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31, 31,
+ 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32, 34, 39,
+ 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39, 44, 46, 47,
+ 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46, 48, 50, 49, 48,
+ 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50, 53, 53, 53, 54, 54,
+ 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60,
+ 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55, 58, 60, 61, 62, 64, 66, 68,
+ 69, 69, 50, 48, 46, 46, 49, 54, 56, 60, 61, 63, 65, 67, 69, 71, 72, 72,
+ 52, 49, 47, 47, 49, 54, 57, 61, 63, 66, 68, 71, 73, 75, 76, 77, 54, 51,
+ 49, 48, 51, 55, 58, 62, 65, 68, 71, 74, 76, 78, 80, 81, 57, 54, 52, 51,
+ 53, 57, 60, 64, 67, 71, 74, 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59,
+ 62, 66, 69, 73, 76, 80, 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68,
+ 71, 75, 78, 83, 87, 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76,
+ 80, 84, 89, 92, 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85,
+ 89, 94, 96, 98,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50, 52,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31, 31, 31,
+ 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50, 52, 52, 54,
+ 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31, 32, 35, 37, 39,
+ 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51, 54, 54, 56, 57, 59,
+ 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35, 38, 40, 42, 42, 46, 46,
+ 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52, 54, 55, 57, 58, 58, 60, 61,
+ 61, 62, 31, 32, 32, 33, 34, 37, 39, 41, 43, 43, 46, 46, 46, 45, 45, 46,
+ 46, 47, 47, 49, 49, 51, 52, 54, 55, 57, 57, 58, 59, 60, 61, 62, 33, 34,
+ 35, 35, 37, 39, 41, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 47, 49,
+ 49, 51, 51, 53, 54, 56, 57, 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41,
+ 44, 46, 46, 46, 47, 47, 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52,
+ 53, 55, 56, 56, 57, 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47,
+ 48, 47, 46, 46, 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55,
+ 56, 57, 58, 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49,
+ 48, 49, 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59,
+ 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50,
+ 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47, 46,
+ 46, 47, 47, 48, 50, 50, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
+ 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46, 47, 47, 47,
+ 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57, 57, 59, 59, 61,
+ 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53,
+ 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66,
+ 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57,
+ 57, 58, 58, 59, 60, 61, 61, 63, 63, 65, 65, 65, 66, 66, 67, 68, 49, 47,
+ 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 59, 60, 61, 61, 62,
+ 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 69, 68, 50, 47, 47, 45, 46, 46,
+ 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67,
+ 68, 69, 69, 70, 70, 70, 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50,
+ 54, 54, 56, 57, 60, 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71,
+ 72, 73, 72, 71, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58,
+ 61, 62, 63, 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75,
+ 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65,
+ 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49,
+ 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70, 73,
+ 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49, 48, 48,
+ 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78,
+ 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57,
+ 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 83,
+ 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65,
+ 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83, 83, 84, 85, 85, 83, 60, 57,
+ 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75,
+ 75, 79, 79, 82, 83, 85, 86, 86, 87, 87, 86, 87, 61, 58, 57, 55, 55, 54,
+ 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83,
+ 84, 86, 87, 88, 89, 89, 89, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57,
+ 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90,
+ 91, 92, 90, 91, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65,
+ 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91,
+ 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74,
+ 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60,
+ 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84,
+ 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58, 57,
+ 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92,
+ 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64,
+ 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96,
+ 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71,
+ 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99,
+ /* Size 4x8 */
+ 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47, 54,
+ 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93,
+ /* Size 8x4 */
+ 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50, 47,
+ 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93,
+ /* Size 8x16 */
+ 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31, 35,
+ 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43, 48, 49,
+ 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49, 54, 57, 60,
+ 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57, 63, 67, 71, 73,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52,
+ 53, 61, 69, 77, 82, 85, 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65,
+ 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81,
+ 89, 95,
+ /* Size 16x8 */
+ 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32, 33,
+ 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41, 43, 46,
+ 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46, 46, 49, 53,
+ 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47, 49, 54, 57, 61,
+ 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52, 57, 60, 64, 67, 71,
+ 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82,
+ 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92,
+ 94, 95,
+ /* Size 16x32 */
+ 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, 31,
+ 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, 33, 38,
+ 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, 40, 42, 46,
+ 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, 43, 46, 46, 45,
+ 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, 47, 46, 46, 47, 47,
+ 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, 46, 45, 47, 47, 50, 51,
+ 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, 45, 46, 47, 50, 50, 54, 55,
+ 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, 49, 50, 52, 53, 57, 57, 59, 58,
+ 42, 43, 43, 47, 48, 50, 49, 49, 50, 50, 53, 54, 57, 58, 60, 61, 49, 46,
+ 47, 48, 50, 53, 53, 53, 54, 54, 57, 57, 60, 61, 62, 61, 49, 46, 47, 48,
+ 50, 53, 53, 54, 54, 55, 57, 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53,
+ 54, 56, 57, 57, 60, 60, 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56,
+ 58, 58, 61, 61, 65, 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61,
+ 64, 64, 67, 68, 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66,
+ 69, 69, 70, 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71,
+ 73, 71, 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74,
+ 52, 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+ 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, 49,
+ 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, 52, 56,
+ 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, 57, 61, 64,
+ 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, 62, 65, 71, 72,
+ 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, 66, 72, 73, 80, 81,
+ 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, 73, 75, 82, 82, 89, 90,
+ 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, 75, 82, 83, 89, 90, 92, 90,
+ 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, 83, 83, 90, 91, 94, 95, 66, 60,
+ 59, 57, 59, 62, 66, 69, 75, 76, 84, 85, 91, 92, 94, 95, 67, 61, 60, 58,
+ 59, 63, 66, 70, 74, 77, 82, 85, 91, 93, 96, 96, 68, 62, 61, 58, 59, 64,
+ 65, 71, 72, 78, 81, 86, 89, 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71,
+ 71, 79, 79, 87, 87, 95, 95, 98,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32,
+ 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52,
+ 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41,
+ 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57,
+ 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48,
+ 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58,
+ 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49,
+ 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47,
+ 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55,
+ 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46,
+ 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62,
+ 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49,
+ 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68,
+ 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58,
+ 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71,
+ 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66,
+ 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52,
+ 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76,
+ 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50,
+ 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82,
+ 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61,
+ 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91,
+ 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69,
+ 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63,
+ 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78,
+ 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60,
+ 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86,
+ 86, 90, 90, 95, 95, 96, 96, 98,
+ /* Size 4x16 */
+ 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42, 49,
+ 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54, 64, 71,
+ 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73, 87, 58, 61,
+ 75, 90, 60, 62, 76, 92, 62, 64, 78, 94,
+ /* Size 16x4 */
+ 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48, 47,
+ 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49, 48, 47,
+ 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57, 56, 57, 61,
+ 64, 68, 71, 75, 78, 83, 87, 90, 92, 94,
+ /* Size 8x32 */
+ 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31, 33,
+ 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35, 43, 46,
+ 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46, 46, 47, 50,
+ 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49, 49, 52, 57, 59,
+ 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54, 57, 60, 62, 49, 47,
+ 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55,
+ 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65,
+ 69, 70, 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, 54, 50,
+ 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, 57, 52, 53, 61,
+ 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80,
+ 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92,
+ 64, 58, 58, 65, 74, 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60,
+ 59, 66, 74, 82, 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65,
+ 71, 79, 87, 95,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33, 33, 33,
+ 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52,
+ 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42, 43, 44, 46, 47,
+ 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57,
+ 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53,
+ 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66,
+ 65, 65, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62,
+ 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 57, 54,
+ 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73,
+ 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56,
+ 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85,
+ 86, 89, 89, 90, 91, 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60,
+ 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94,
+ 94, 96, 95, 95 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134,
+ /* Size 8x8 */
+ 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34, 35,
+ 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45, 51, 61,
+ 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64, 68, 78, 90,
+ 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31, 32,
+ 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32, 32, 32,
+ 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32, 33, 34, 35,
+ 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34, 35, 37, 38, 40,
+ 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37, 39, 42, 45, 47, 51,
+ 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42, 48, 50, 54, 57, 60, 64,
+ 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50, 54, 58, 61, 65, 69, 73, 78,
+ 84, 86, 44, 42, 41, 42, 42, 47, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92,
+ 48, 46, 44, 45, 46, 51, 57, 61, 67, 71, 76, 80, 85, 90, 96, 99, 54, 51,
+ 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 59, 56, 54,
+ 54, 53, 58, 64, 69, 75, 80, 87, 92, 98, 103, 110, 113, 65, 62, 59, 59,
+ 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63,
+ 68, 73, 78, 84, 90, 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74,
+ 79, 84, 90, 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76,
+ 81, 86, 92, 99, 106, 113, 121, 128, 137, 140,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56,
+ 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68,
+ 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75,
+ 75, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41,
+ 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50,
+ 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33,
+ 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59,
+ 59, 64, 64, 71, 71, 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37,
+ 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69,
+ 69, 72, 72, 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40,
+ 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51,
+ 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34, 33,
+ 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58,
+ 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34, 36, 36, 38,
+ 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73,
+ 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81,
+ 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58,
+ 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 39, 38,
+ 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65,
+ 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 44, 42, 42, 41, 41, 42,
+ 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79,
+ 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47,
+ 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90,
+ 90, 92, 92, 96, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61,
+ 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102,
+ 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71,
+ 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51, 51, 49,
+ 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87,
+ 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51, 51, 49, 49, 50, 50,
+ 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92,
+ 97, 97, 104, 104, 106, 106, 109, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58,
+ 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103,
+ 110, 110, 113, 113, 116, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64,
+ 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110,
+ 113, 113, 116, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73,
+ 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121,
+ 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73,
+ 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121,
+ 124, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84,
+ 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132,
+ 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90,
+ 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 80,
+ 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96,
+ 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 80, 76,
+ 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104,
+ 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 83, 78, 78,
+ 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106,
+ 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78,
+ 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106,
+ 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83,
+ 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109,
+ 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+ /* Size 4x8 */
+ 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45, 56,
+ 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136,
+ /* Size 8x4 */
+ 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51, 50,
+ 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136,
+ /* Size 8x16 */
+ 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32,
+ 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38,
+ 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60,
+ 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54,
+ 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63,
+ 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81,
+ 92, 106, 121, 136,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31, 32,
+ 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32, 33, 34,
+ 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34, 36, 38, 42,
+ 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42, 42, 48, 54, 58,
+ 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76,
+ 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92,
+ 98, 105, 111, 118, 121, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103,
+ 110, 118, 125, 133, 136,
+ /* Size 16x32 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31, 32,
+ 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 32,
+ 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34,
+ 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, 34, 41,
+ 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50,
+ 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59,
+ 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69,
+ 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75,
+ 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 34, 34,
+ 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38,
+ 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48,
+ 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58,
+ 58, 65, 65, 73, 73, 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65,
+ 65, 73, 73, 84, 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79,
+ 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90,
+ 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102,
+ 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49,
+ 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, 49,
+ 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 58, 54, 54, 54,
+ 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 58, 54, 54, 54, 54,
+ 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 65, 60, 60, 58, 58, 68,
+ 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68,
+ 79, 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, 73, 84,
+ 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84,
+ 97, 97, 111, 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90,
+ 104, 104, 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90,
+ 104, 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92,
+ 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92,
+ 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, 96,
+ 109, 109, 124, 124, 141, 141, 149,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54,
+ 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65,
+ 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38,
+ 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72,
+ 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43,
+ 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35,
+ 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60,
+ 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68,
+ 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48,
+ 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90,
+ 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58,
+ 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
+ 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76,
+ 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51,
+ 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82,
+ 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59,
+ 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105,
+ 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58,
+ 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105,
+ 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69,
+ 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118,
+ 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69,
+ 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118,
+ 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75,
+ 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124,
+ 132, 132, 141, 141, 144, 144, 149,
+ /* Size 4x16 */
+ 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33, 38,
+ 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53, 71, 90,
+ 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68, 92, 118, 65,
+ 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36, 35,
+ 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51, 49, 50,
+ 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75, 72, 71, 69,
+ 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
+ /* Size 8x32 */
+ 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32,
+ 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32, 33, 34,
+ 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34, 36, 42, 50,
+ 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38, 42, 49, 58, 69,
+ 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34,
+ 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50,
+ 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71,
+ 79, 90, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 53, 49,
+ 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 58, 54, 54,
+ 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68,
+ 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84,
+ 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90,
+ 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92,
+ 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54,
+ 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63,
+ 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81,
+ 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63,
+ 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51,
+ 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59,
+ 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98,
+ 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71,
+ 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118,
+ 118, 125, 125, 133, 133, 136, 136, 141 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89,
+ /* Size 8x8 */
+ 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42, 45,
+ 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46, 50, 56,
+ 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55, 61, 68, 75,
+ 82, 86, 61, 57, 58, 64, 71, 79, 86, 91,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31, 31,
+ 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31, 32, 35,
+ 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35, 39, 43, 45,
+ 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43, 47, 47, 48, 46,
+ 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47, 48, 50, 49, 49, 50,
+ 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 55,
+ 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 61,
+ 63, 64, 49, 47, 45, 45, 45, 49, 53, 55, 58, 60, 61, 62, 63, 65, 67, 68,
+ 50, 48, 46, 46, 46, 50, 54, 56, 60, 61, 63, 65, 67, 68, 71, 71, 52, 50,
+ 47, 47, 47, 50, 54, 57, 61, 63, 66, 68, 70, 72, 75, 75, 54, 52, 49, 49,
+ 48, 52, 55, 58, 62, 65, 68, 71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53,
+ 56, 60, 63, 67, 70, 73, 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61,
+ 65, 68, 72, 75, 79, 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71,
+ 75, 78, 82, 85, 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79,
+ 83, 86, 90, 91,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31, 31, 31,
+ 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52,
+ 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31, 31, 34, 34, 38,
+ 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57,
+ 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46,
+ 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58,
+ 58, 60, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45,
+ 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 33, 34,
+ 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47,
+ 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39,
+ 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51,
+ 51, 53, 53, 56, 56, 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47,
+ 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54,
+ 54, 55, 55, 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46,
+ 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50,
+ 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 41, 42, 42, 42,
+ 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52,
+ 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46, 47, 47, 48,
+ 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58,
+ 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53,
+ 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61,
+ 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55,
+ 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 48, 47,
+ 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57,
+ 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 49, 47, 47, 45, 45, 45,
+ 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63,
+ 63, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49,
+ 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67,
+ 67, 68, 68, 69, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56,
+ 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61,
+ 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47,
+ 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68,
+ 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47, 47, 47,
+ 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72,
+ 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55,
+ 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79,
+ 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62,
+ 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 57, 54,
+ 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70,
+ 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76,
+ 76, 79, 79, 82, 82, 83, 83, 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55,
+ 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85,
+ 85, 86, 86, 88, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61,
+ 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71,
+ 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57,
+ 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78,
+ 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57, 55,
+ 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86,
+ 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61,
+ 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91,
+ 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69,
+ 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95,
+ /* Size 4x8 */
+ 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46, 54,
+ 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90,
+ /* Size 8x4 */
+ 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50, 47,
+ 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90,
+ /* Size 8x16 */
+ 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32,
+ 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47,
+ 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54,
+ 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50,
+ 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58,
+ 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75,
+ 83, 90,
+ /* Size 16x8 */
+ 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31, 31,
+ 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38, 40, 43,
+ 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46, 47, 47, 50,
+ 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46, 45, 49, 53, 56,
+ 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64,
+ 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73,
+ 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85,
+ 89, 90,
+ /* Size 16x32 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, 31,
+ 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, 31, 38,
+ 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, 40, 40, 46,
+ 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, 40, 46, 46, 45,
+ 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47,
+ 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, 47, 51,
+ 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54,
+ 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57,
+ 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 42, 43,
+ 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 49, 46, 46, 48,
+ 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53,
+ 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56,
+ 56, 57, 57, 60, 60, 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57,
+ 57, 60, 60, 64, 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64,
+ 64, 67, 67, 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67,
+ 67, 69, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73,
+ 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+ 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, 47,
+ 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, 49, 55,
+ 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, 55, 55, 62,
+ 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70,
+ 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76,
+ 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85,
+ 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88,
+ 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 63, 57,
+ 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 64, 59, 59, 56,
+ 56, 61, 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 64, 59, 59, 56, 56, 61,
+ 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69,
+ 69, 77, 77, 84, 84, 92, 92, 95,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50,
+ 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40,
+ 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54,
+ 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48,
+ 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56,
+ 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46,
+ 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47,
+ 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54,
+ 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47,
+ 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
+ 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49,
+ 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67,
+ 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56,
+ 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
+ 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64,
+ 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48,
+ 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68,
+ 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50,
+ 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79,
+ 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57,
+ 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83,
+ 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67,
+ 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60,
+ 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75,
+ 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59,
+ 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84,
+ 84, 88, 88, 92, 92, 93, 93, 95,
+ /* Size 4x16 */
+ 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40, 47,
+ 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53, 61, 67,
+ 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70, 82, 54, 58,
+ 72, 85, 57, 60, 75, 89, 59, 61, 75, 90,
+ /* Size 16x4 */
+ 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48, 47,
+ 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50, 48, 47,
+ 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57, 56, 54, 57,
+ 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
+ /* Size 8x32 */
+ 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31, 31,
+ 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32, 40, 46,
+ 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43, 47, 46, 47,
+ 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47, 45, 47, 50, 54,
+ 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46,
+ 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53,
+ 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61,
+ 64, 67, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 52, 48,
+ 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 54, 50, 49, 55,
+ 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70,
+ 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85,
+ 63, 57, 55, 60, 67, 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59,
+ 56, 61, 68, 75, 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63,
+ 69, 77, 84, 92,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50,
+ 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47,
+ 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52,
+ 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53,
+ 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61,
+ 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58,
+ 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50,
+ 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66,
+ 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76,
+ 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57,
+ 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89,
+ 89, 90, 90, 92 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108,
+ /* Size 8x8 */
+ 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32, 33,
+ 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41, 43, 54,
+ 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56, 66, 77, 89,
+ 98, 108, 69, 65, 64, 73, 85, 97, 108, 119,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31, 32,
+ 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32, 32, 32,
+ 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32, 33, 34, 35,
+ 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34, 35, 37, 38, 39,
+ 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48,
+ 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42, 46, 48, 50, 53, 55, 59,
+ 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48, 51, 54, 57, 59, 63, 67, 71,
+ 76, 82, 41, 40, 38, 40, 41, 46, 50, 54, 57, 60, 63, 67, 71, 75, 80, 86,
+ 45, 43, 41, 42, 43, 48, 53, 57, 60, 65, 68, 72, 76, 81, 85, 91, 48, 46,
+ 44, 45, 46, 51, 55, 59, 63, 68, 71, 76, 80, 85, 90, 96, 54, 51, 49, 50,
+ 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58,
+ 62, 67, 71, 76, 80, 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67,
+ 71, 75, 81, 85, 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76,
+ 80, 85, 90, 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86,
+ 91, 96, 104, 110, 118, 125, 134,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51,
+ 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 51, 55, 56, 59,
+ 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66,
+ 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 36, 37,
+ 38, 41, 41, 44, 44, 49, 49, 52, 54, 56, 59, 61, 65, 65, 72, 72, 31, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 37, 38, 39, 41, 42, 45,
+ 45, 49, 49, 52, 54, 56, 59, 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52,
+ 54, 56, 59, 60, 64, 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 35, 35, 36, 37, 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60,
+ 64, 64, 70, 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38,
+ 39, 40, 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69,
+ 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43,
+ 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33,
+ 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54,
+ 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 34, 34, 33, 33, 34, 35, 35,
+ 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60,
+ 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42,
+ 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61, 62, 64, 67, 68, 72, 72,
+ 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 50, 50,
+ 52, 54, 54, 57, 57, 59, 60, 62, 64, 66, 68, 69, 73, 73, 79, 79, 38, 37,
+ 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 52, 54, 56, 57, 59,
+ 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 39, 38, 38, 38, 37, 38,
+ 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67,
+ 69, 71, 73, 74, 78, 78, 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41,
+ 46, 46, 50, 52, 54, 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77,
+ 80, 81, 86, 86, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54,
+ 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64,
+ 65, 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45,
+ 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75,
+ 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45, 45, 45,
+ 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83,
+ 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54,
+ 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86, 89, 91, 93, 97, 97,
+ 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63,
+ 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
+ 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74,
+ 75, 79, 79, 85, 85, 89, 90, 93, 96, 98, 102, 102, 108, 108, 59, 56, 56,
+ 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80,
+ 86, 87, 90, 92, 95, 98, 99, 103, 104, 110, 110, 62, 59, 59, 57, 56, 56,
+ 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93,
+ 95, 98, 101, 103, 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59,
+ 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101,
+ 105, 106, 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60,
+ 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106,
+ 108, 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68,
+ 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113,
+ 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72,
+ 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118,
+ 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79,
+ 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126,
+ 134, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82,
+ 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126,
+ 134, 134,
+ /* Size 4x8 */
+ 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41, 48,
+ 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111,
+ /* Size 8x4 */
+ 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43, 42,
+ 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111,
+ /* Size 8x16 */
+ 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32,
+ 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33, 35, 38,
+ 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59,
+ 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51, 60, 67, 74, 81,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49,
+ 50, 60, 71, 82, 90, 99, 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68,
+ 79, 92, 102, 112, 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90,
+ 104, 115, 127,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31, 32,
+ 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32, 33, 34,
+ 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34, 36, 38, 42,
+ 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42, 42, 48, 52, 56,
+ 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49, 54, 59, 63, 67, 72,
+ 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90,
+ 95, 102, 108, 115, 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105,
+ 112, 119, 127,
+ /* Size 16x32 */
+ 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31, 32,
+ 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32, 32, 32,
+ 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32, 32, 32, 33,
+ 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32, 33, 33, 34, 36,
+ 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33, 34, 35, 37, 41, 42,
+ 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50,
+ 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59,
+ 65, 70, 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69,
+ 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, 34, 34,
+ 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 34, 34, 34, 36,
+ 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42,
+ 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49,
+ 54, 54, 60, 60, 66, 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57,
+ 63, 63, 69, 71, 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65,
+ 71, 73, 79, 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76,
+ 81, 86, 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90,
+ 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45,
+ 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45, 46,
+ 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49, 49, 54,
+ 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50, 50, 54, 60, 63,
+ 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52, 52, 57, 62, 65, 74, 75,
+ 85, 85, 94, 96, 103, 108, 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87,
+ 87, 95, 98, 105, 110, 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89,
+ 98, 101, 108, 114, 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102,
+ 105, 112, 118, 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106,
+ 114, 120, 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119,
+ 125, 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49,
+ 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57,
+ 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36,
+ 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64,
+ 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40,
+ 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34,
+ 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51,
+ 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35,
+ 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62,
+ 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40,
+ 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72,
+ 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54,
+ 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64,
+ 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50,
+ 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81,
+ 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50,
+ 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87,
+ 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56,
+ 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102,
+ 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63,
+ 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111,
+ 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74,
+ 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119,
+ 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81,
+ 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125,
+ 133, 133,
+ /* Size 4x16 */
+ 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32, 37,
+ 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46, 60, 76,
+ 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76, 98, 60, 63,
+ 80, 105, 66, 68, 85, 111, 73, 74, 91, 118,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34, 34,
+ 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43, 41, 43,
+ 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59, 59, 58, 63,
+ 67, 71, 76, 81, 85, 92, 98, 105, 111, 118,
+ /* Size 8x32 */
+ 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31, 32,
+ 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32, 33, 34,
+ 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50,
+ 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38, 42, 49, 56, 64,
+ 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 34, 34,
+ 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48,
+ 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65,
+ 71, 79, 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 48, 45,
+ 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, 53, 49, 50, 60,
+ 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87,
+ 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102,
+ 112, 67, 61, 60, 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119,
+ 72, 66, 64, 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79,
+ 72, 70, 79, 90, 104, 115, 127,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49,
+ 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56,
+ 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42,
+ 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73,
+ 79, 79, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58,
+ 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 53, 51,
+ 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76,
+ 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57,
+ 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90,
+ 94, 95, 98, 102, 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65,
+ 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103,
+ 105, 108, 112, 114, 119, 119, 127, 127 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78,
+ /* Size 8x8 */
+ 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38, 42,
+ 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45, 45, 53,
+ 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49, 56, 63, 69,
+ 73, 77, 57, 54, 52, 58, 65, 72, 77, 82,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31, 31,
+ 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31, 32, 35,
+ 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35, 39, 43, 45,
+ 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43, 47, 47, 47, 47,
+ 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47, 48, 50, 50, 49, 49,
+ 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50, 52, 52, 52, 52, 53, 53,
+ 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52, 53, 54, 55, 55, 56, 57, 58,
+ 60, 62, 49, 47, 45, 46, 46, 49, 52, 54, 55, 57, 58, 59, 60, 61, 63, 65,
+ 49, 47, 45, 45, 45, 49, 52, 55, 57, 59, 60, 61, 63, 64, 66, 68, 50, 48,
+ 46, 46, 46, 50, 53, 55, 58, 60, 61, 63, 65, 67, 68, 71, 52, 50, 47, 47,
+ 47, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52,
+ 55, 57, 60, 63, 65, 68, 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58,
+ 61, 64, 67, 70, 73, 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66,
+ 68, 72, 75, 79, 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75,
+ 78, 82, 85, 89,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49, 49,
+ 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31, 31, 31,
+ 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47, 48, 48, 50,
+ 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31, 31, 33, 34, 35,
+ 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53,
+ 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33, 35, 36, 39, 40, 42, 42,
+ 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55,
+ 58, 58, 30, 31, 31, 31, 32, 33, 35, 36, 40, 40, 42, 42, 45, 46, 46, 45,
+ 45, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 52, 54, 54, 57, 57, 32, 32,
+ 33, 33, 33, 35, 37, 38, 41, 42, 43, 43, 46, 47, 46, 46, 45, 45, 45, 46,
+ 46, 47, 47, 49, 49, 50, 51, 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37,
+ 39, 40, 43, 43, 45, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49,
+ 49, 50, 51, 52, 53, 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44,
+ 45, 45, 47, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51,
+ 53, 53, 55, 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48,
+ 47, 46, 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54,
+ 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45,
+ 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42, 42,
+ 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50,
+ 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42, 43, 45, 45,
+ 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52,
+ 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50,
+ 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54, 55, 55, 56, 56, 58, 58,
+ 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 53, 53,
+ 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58, 60, 60, 49, 47,
+ 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55,
+ 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 48, 47, 47, 46, 45, 46,
+ 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58,
+ 58, 59, 60, 60, 61, 62, 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+ 49, 49, 52, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62,
+ 63, 63, 65, 65, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53,
+ 55, 55, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59,
+ 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47,
+ 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63,
+ 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46, 46, 46,
+ 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66,
+ 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50,
+ 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68, 69, 70, 71, 72, 72,
+ 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57,
+ 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 54, 51,
+ 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65,
+ 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 77, 54, 52, 52, 50, 49, 49,
+ 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70,
+ 71, 72, 73, 74, 75, 76, 78, 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49,
+ 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75,
+ 77, 77, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56,
+ 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82,
+ 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64,
+ 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55,
+ 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72,
+ 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54, 53,
+ 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77,
+ 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57,
+ 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85,
+ 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63,
+ 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89,
+ /* Size 4x8 */
+ 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45, 49,
+ 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79,
+ /* Size 8x4 */
+ 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47, 45,
+ 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79,
+ /* Size 8x16 */
+ 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32,
+ 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40, 47, 47,
+ 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53,
+ 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53, 57, 59, 61, 63,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48,
+ 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56,
+ 64, 70, 75, 79, 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75,
+ 80, 86,
+ /* Size 16x8 */
+ 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31, 31,
+ 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38, 40, 43,
+ 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46, 47, 47, 50,
+ 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46, 45, 49, 53, 55,
+ 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47, 50, 53, 56, 59, 62,
+ 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70,
+ 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79,
+ 82, 86,
+ /* Size 16x32 */
+ 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, 31,
+ 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, 31, 36,
+ 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, 37, 39, 42,
+ 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, 40, 42, 46, 46,
+ 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, 44, 46, 46, 45, 45,
+ 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, 47, 46, 46, 46, 47, 47,
+ 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, 47, 45, 46, 47, 47, 50, 51,
+ 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, 45, 46, 47, 47, 49, 50, 52, 54,
+ 37, 40, 40, 45, 47, 47, 48, 47, 46, 46, 47, 47, 49, 50, 53, 55, 42, 43,
+ 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 42, 43, 43, 46,
+ 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50,
+ 52, 52, 53, 53, 53, 53, 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53,
+ 53, 54, 54, 54, 56, 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55,
+ 56, 56, 58, 58, 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57,
+ 59, 60, 62, 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61,
+ 63, 65, 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67,
+ 49, 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+ 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, 46,
+ 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, 47, 50,
+ 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, 50, 54, 56,
+ 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, 55, 57, 62, 62,
+ 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, 57, 62, 63, 68, 68,
+ 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, 63, 63, 69, 69, 74, 75,
+ 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, 64, 70, 70, 75, 76, 79, 82,
+ 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, 71, 71, 76, 77, 80, 83, 60, 55,
+ 54, 53, 52, 55, 58, 60, 65, 66, 72, 72, 77, 79, 82, 85, 60, 55, 55, 53,
+ 53, 55, 59, 60, 65, 66, 73, 73, 78, 79, 83, 85, 63, 58, 57, 56, 55, 58,
+ 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62,
+ 67, 68, 75, 75, 80, 82, 86, 89,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49,
+ 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32,
+ 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48,
+ 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37,
+ 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51,
+ 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46,
+ 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53,
+ 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47,
+ 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42,
+ 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50,
+ 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46,
+ 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55,
+ 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47,
+ 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59,
+ 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53,
+ 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59,
+ 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48,
+ 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66,
+ 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47,
+ 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69,
+ 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53,
+ 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78,
+ 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60,
+ 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58,
+ 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69,
+ 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57,
+ 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77,
+ 78, 80, 82, 83, 85, 85, 89, 89,
+ /* Size 4x16 */
+ 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40, 47,
+ 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49, 57, 61,
+ 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63, 73, 52, 53,
+ 64, 76, 55, 55, 66, 79, 58, 58, 68, 82,
+ /* Size 16x4 */
+ 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42, 42,
+ 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47, 45, 46,
+ 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52, 51, 50, 53,
+ 56, 58, 61, 64, 67, 71, 73, 76, 79, 82,
+ /* Size 8x32 */
+ 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31, 31,
+ 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32, 40, 46,
+ 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43, 47, 46, 47,
+ 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47, 45, 47, 49, 52,
+ 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49, 50, 53, 56, 42, 43,
+ 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53,
+ 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57,
+ 59, 62, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 50, 46,
+ 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, 52, 48, 47, 54,
+ 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68,
+ 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79,
+ 58, 53, 51, 57, 64, 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55,
+ 53, 59, 65, 73, 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60,
+ 67, 75, 80, 86,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49,
+ 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32,
+ 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48,
+ 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39, 40, 41, 43, 44,
+ 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49,
+ 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50,
+ 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59,
+ 60, 60, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56,
+ 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 52, 50,
+ 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64,
+ 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51,
+ 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71,
+ 72, 74, 75, 76, 77, 78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53,
+ 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80,
+ 82, 83, 86, 86 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92,
+ /* Size 8x8 */
+ 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32, 32,
+ 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37, 40, 47,
+ 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49, 56, 65, 75,
+ 82, 92, 63, 59, 58, 65, 73, 84, 92, 105,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31, 32,
+ 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32, 32, 32,
+ 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32, 33, 33, 33,
+ 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33, 34, 34, 36, 37,
+ 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34, 35, 37, 38, 40, 41,
+ 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37, 39, 42, 44, 46, 47, 51,
+ 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42, 48, 50, 52, 54, 57, 60, 63,
+ 65, 68, 38, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72,
+ 41, 40, 38, 40, 40, 41, 46, 52, 54, 57, 60, 63, 67, 70, 73, 75, 44, 42,
+ 41, 42, 42, 42, 47, 54, 57, 60, 63, 67, 71, 74, 77, 79, 48, 46, 44, 45,
+ 45, 46, 51, 57, 60, 63, 67, 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49,
+ 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63,
+ 67, 70, 74, 79, 86, 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73,
+ 77, 82, 89, 93, 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85,
+ 92, 97, 101, 105,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 46,
+ 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 45, 46, 48, 51, 51,
+ 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45, 47, 50, 50, 54, 55, 57, 61,
+ 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41,
+ 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47,
+ 49, 49, 53, 54, 56, 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 34, 35, 35, 36, 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54,
+ 56, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36,
+ 37, 37, 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40,
+ 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45,
+ 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33, 33, 34, 35,
+ 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49, 50, 51, 53, 53,
+ 56, 57, 59, 62, 62, 66, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
+ 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52, 54, 54, 57, 58, 60, 63,
+ 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45,
+ 46, 47, 48, 50, 50, 52, 53, 54, 56, 56, 59, 60, 62, 65, 65, 69, 36, 35,
+ 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54,
+ 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 36, 35, 35, 35, 34, 34,
+ 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58,
+ 60, 60, 63, 64, 65, 68, 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40,
+ 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68,
+ 69, 72, 72, 76, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47,
+ 50, 50, 53, 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77,
+ 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55,
+ 57, 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42,
+ 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66,
+ 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41, 42, 42,
+ 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71,
+ 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49,
+ 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75, 75, 78, 79, 81, 84,
+ 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57,
+ 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79, 80, 82, 85, 85, 89, 50, 49,
+ 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68,
+ 68, 72, 73, 75, 78, 78, 82, 83, 85, 88, 88, 92, 54, 52, 51, 50, 49, 49,
+ 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78,
+ 82, 82, 86, 87, 89, 92, 92, 96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49,
+ 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87,
+ 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59,
+ 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101,
+ 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69,
+ 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58, 57,
+ 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81,
+ 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61, 59, 59, 59,
+ 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92,
+ 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58,
+ 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98,
+ 101, 105, 105, 109, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67,
+ 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109,
+ 109, 114,
+ /* Size 4x8 */
+ 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38, 40,
+ 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97,
+ /* Size 8x4 */
+ 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42, 41,
+ 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97,
+ /* Size 8x16 */
+ 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31, 32,
+ 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36,
+ 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37, 41, 44, 48,
+ 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 48, 45,
+ 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60,
+ 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79,
+ 92, 105,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, 31, 32,
+ 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32, 33, 34,
+ 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34, 35, 36, 37,
+ 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38, 39, 40, 44, 50,
+ 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60,
+ 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76,
+ 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97,
+ 100, 105,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31, 32,
+ 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32, 32, 32,
+ 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32, 32, 32, 32,
+ 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32, 33, 33, 34, 34,
+ 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41,
+ 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48,
+ 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53,
+ 59, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 32, 32,
+ 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 33, 33, 33, 35,
+ 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37,
+ 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44,
+ 46, 50, 50, 55, 56, 59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54,
+ 54, 58, 60, 63, 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58,
+ 60, 63, 68, 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67,
+ 72, 72, 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73,
+ 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41,
+ 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41, 42,
+ 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44, 45, 45,
+ 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46, 46, 54, 56,
+ 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47, 55, 58, 61, 68,
+ 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79,
+ 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86,
+ 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97,
+ 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, 61, 57,
+ 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, 65, 61, 60,
+ 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 65, 61, 60, 59,
+ 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 70, 65, 64, 63, 62,
+ 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44,
+ 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49,
+ 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35,
+ 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59,
+ 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
+ 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32,
+ 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43,
+ 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34,
+ 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55,
+ 57, 57, 60, 61, 63, 66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38,
+ 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63,
+ 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46,
+ 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58,
+ 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 41,
+ 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66,
+ 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48,
+ 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79,
+ 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53,
+ 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92,
+ 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63,
+ 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63,
+ 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79,
+ 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59,
+ 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85,
+ 88, 92, 92, 97, 98, 100, 105, 105, 109,
+ /* Size 4x16 */
+ 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32, 34,
+ 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40, 57, 67,
+ 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71, 86, 54, 53,
+ 74, 90, 57, 56, 77, 93, 61, 58, 79, 97,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32, 32,
+ 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42, 41, 42,
+ 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53, 53, 53, 52,
+ 57, 63, 67, 70, 74, 79, 86, 90, 93, 97,
+ /* Size 8x32 */
+ 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31, 32,
+ 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32, 33, 34,
+ 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42,
+ 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36, 39, 42, 49, 58,
+ 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 33, 33,
+ 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43,
+ 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54,
+ 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 44, 41,
+ 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, 48, 45, 46, 54,
+ 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71,
+ 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97,
+ 58, 54, 54, 61, 68, 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60,
+ 58, 66, 72, 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62,
+ 70, 76, 83, 96, 109,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44,
+ 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32, 33, 33, 34, 34,
+ 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50,
+ 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40,
+ 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66,
+ 66, 70, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50,
+ 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42,
+ 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63,
+ 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49,
+ 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78,
+ 82, 82, 86, 87, 89, 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58,
+ 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98,
+ 100, 105, 105, 109 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71,
+ /* Size 8x8 */
+ 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38, 40,
+ 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45, 46, 51,
+ 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47, 52, 57, 63,
+ 66, 70, 55, 52, 50, 54, 60, 66, 70, 76,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31, 31,
+ 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31, 32, 34,
+ 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34, 37, 40, 42,
+ 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40, 42, 45, 46, 47,
+ 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45, 47, 47, 48, 47, 46,
+ 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47, 48, 50, 50, 49, 49, 50,
+ 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 55,
+ 56, 56, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59,
+ 49, 47, 45, 46, 46, 46, 49, 53, 54, 55, 57, 58, 59, 60, 60, 61, 49, 47,
+ 45, 45, 45, 45, 49, 53, 55, 57, 58, 60, 61, 62, 63, 63, 50, 48, 46, 46,
+ 46, 46, 50, 54, 56, 58, 60, 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47,
+ 50, 54, 57, 59, 61, 63, 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55,
+ 58, 60, 62, 65, 68, 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60,
+ 63, 66, 69, 71, 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67,
+ 70, 73, 74, 76,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48, 48,
+ 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31,
+ 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47, 47, 47, 48,
+ 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31, 31, 31, 34, 34,
+ 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 50,
+ 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41,
+ 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53,
+ 53, 55, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46,
+ 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 30, 31,
+ 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45,
+ 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34,
+ 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47,
+ 47, 47, 49, 49, 50, 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43,
+ 43, 44, 45, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49,
+ 50, 51, 51, 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46,
+ 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46,
+ 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38, 39,
+ 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46,
+ 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42, 42, 44, 44,
+ 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49, 49, 49, 49, 49,
+ 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48,
+ 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 53,
+ 53, 55, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 54, 54, 54, 56, 49, 48,
+ 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53,
+ 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 49, 48, 47, 47, 46, 46,
+ 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54,
+ 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47,
+ 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58,
+ 58, 59, 59, 60, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51,
+ 53, 53, 54, 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55,
+ 55, 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46,
+ 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59,
+ 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45, 45, 45,
+ 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61,
+ 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49,
+ 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63, 63, 64, 64, 65, 66,
+ 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54,
+ 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65, 65, 66, 67, 67, 68, 51, 49,
+ 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60,
+ 60, 62, 62, 63, 65, 65, 66, 66, 67, 68, 68, 70, 52, 50, 50, 49, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65,
+ 66, 66, 68, 68, 69, 70, 70, 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47,
+ 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68,
+ 69, 70, 70, 72, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53,
+ 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74,
+ 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58,
+ 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52,
+ 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65,
+ 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51,
+ 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70,
+ 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52,
+ 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76,
+ 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58,
+ 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80,
+ /* Size 4x8 */
+ 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46, 47,
+ 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73,
+ /* Size 8x4 */
+ 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47, 45,
+ 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73,
+ /* Size 8x16 */
+ 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30, 32,
+ 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37, 44, 46,
+ 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47, 49, 50, 49,
+ 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46,
+ 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54,
+ 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64,
+ 70, 76,
+ /* Size 16x8 */
+ 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31, 31,
+ 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38, 40, 42,
+ 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 44, 46, 46, 47,
+ 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46, 46, 47, 50, 53,
+ 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57,
+ 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64,
+ 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73,
+ 74, 76,
+ /* Size 16x32 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, 31,
+ 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, 31, 34,
+ 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, 34, 39, 39,
+ 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, 40, 40, 44, 46,
+ 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45,
+ 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, 46, 47, 46, 45, 45, 47,
+ 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, 47, 46, 46, 46, 47, 47, 49,
+ 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, 46, 45, 45, 47, 47, 48, 51, 51,
+ 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 37, 39,
+ 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 41, 42, 42, 44,
+ 47, 47, 49, 49, 49, 48, 48, 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47,
+ 49, 50, 50, 49, 49, 50, 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51,
+ 51, 51, 51, 52, 52, 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53,
+ 53, 54, 54, 55, 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54,
+ 54, 55, 57, 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58,
+ 59, 59, 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60,
+ 48, 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+ 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, 45,
+ 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, 46, 46,
+ 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, 46, 52, 54,
+ 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, 52, 54, 56, 60,
+ 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65,
+ 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68,
+ 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73,
+ 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 55, 51,
+ 51, 50, 49, 49, 54, 56, 58, 63, 63, 68, 69, 71, 74, 74, 57, 53, 52, 51,
+ 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 57, 53, 52, 51, 50, 50,
+ 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58,
+ 61, 65, 65, 70, 72, 74, 78, 78,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48,
+ 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31,
+ 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47,
+ 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36,
+ 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48,
+ 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44,
+ 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51,
+ 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
+ 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38,
+ 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46,
+ 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44,
+ 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52,
+ 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+ 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55,
+ 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51,
+ 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61,
+ 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56,
+ 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46,
+ 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59,
+ 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47,
+ 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65,
+ 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50,
+ 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70,
+ 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55,
+ 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55,
+ 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64,
+ 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52,
+ 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68,
+ 71, 71, 73, 73, 74, 76, 76, 78,
+ /* Size 4x16 */
+ 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37, 44,
+ 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47, 55, 58,
+ 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61, 68, 50, 48,
+ 62, 70, 51, 49, 63, 71, 53, 50, 64, 73,
+ /* Size 16x4 */
+ 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37, 38,
+ 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47, 45, 45,
+ 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49, 49, 48, 48,
+ 51, 55, 58, 60, 62, 65, 68, 70, 71, 73,
+ /* Size 8x32 */
+ 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31, 31,
+ 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32, 40, 44,
+ 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45,
+ 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51,
+ 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47, 45, 47, 50, 41, 42,
+ 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50,
+ 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53,
+ 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 49, 45,
+ 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, 50, 46, 46, 52,
+ 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61,
+ 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+ 54, 50, 49, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52,
+ 50, 55, 59, 64, 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57,
+ 61, 65, 72, 78,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48,
+ 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 32,
+ 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46,
+ 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39, 40, 40, 42, 43,
+ 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47,
+ 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49,
+ 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55,
+ 55, 57, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53,
+ 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47,
+ 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58,
+ 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48,
+ 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65,
+ 66, 66, 68, 68, 69, 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50,
+ 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73,
+ 74, 76, 76, 78 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81,
+ /* Size 8x8 */
+ 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32, 32,
+ 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34, 37, 42,
+ 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45, 51, 57, 65,
+ 71, 77, 53, 50, 51, 55, 61, 70, 77, 85,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31, 32,
+ 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32, 32, 32,
+ 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32, 32, 32, 33,
+ 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32, 33, 34, 34, 35,
+ 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44,
+ 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51,
+ 54, 58, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64,
+ 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 39, 38,
+ 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 44, 42, 41, 41,
+ 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45,
+ 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51,
+ 57, 60, 62, 68, 71, 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63,
+ 65, 71, 75, 77, 82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75,
+ 79, 81, 87, 92,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 39, 39, 41,
+ 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 43, 45, 46,
+ 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 42, 45, 46, 47, 51, 51, 52,
+ 56, 56, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37,
+ 37, 40, 41, 41, 44, 44, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41,
+ 44, 45, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 34, 35, 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49,
+ 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 36, 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42,
+ 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46,
+ 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45, 47, 47, 48, 51, 51, 52,
+ 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41,
+ 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 34, 34,
+ 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, 42, 42, 44, 45,
+ 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 35, 34, 34, 34, 34, 34,
+ 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51,
+ 53, 53, 54, 57, 57, 58, 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37,
+ 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59,
+ 60, 61, 64, 64, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42,
+ 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50,
+ 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38,
+ 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56,
+ 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37, 38, 39,
+ 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61,
+ 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42,
+ 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64, 65, 66, 69, 69, 70,
+ 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50,
+ 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68, 71, 71, 72, 75, 75, 44, 43,
+ 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58,
+ 58, 62, 64, 64, 66, 67, 68, 71, 72, 73, 76, 76, 47, 46, 45, 45, 44, 44,
+ 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66,
+ 69, 70, 71, 74, 75, 76, 79, 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45,
+ 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75,
+ 76, 77, 80, 80, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51,
+ 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81,
+ 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59,
+ 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51,
+ 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69,
+ 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50, 51,
+ 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77,
+ 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53,
+ 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88,
+ 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61,
+ 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92,
+ /* Size 4x8 */
+ 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35, 38,
+ 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83,
+ /* Size 8x4 */
+ 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37, 36,
+ 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83,
+ /* Size 8x16 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32,
+ 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 34, 34,
+ 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44,
+ 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 44, 41,
+ 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48,
+ 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75,
+ 79, 87,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31, 32,
+ 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32, 32, 33,
+ 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33, 33, 34, 36,
+ 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35, 36, 38, 40, 42,
+ 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56,
+ 58, 63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66,
+ 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77,
+ 82, 87,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, 32,
+ 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, 32, 32,
+ 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32,
+ 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 34,
+ 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36,
+ 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, 33, 35, 35, 36, 41, 41,
+ 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 45, 49,
+ 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, 36, 38, 42, 42, 45, 50, 50, 54,
+ 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 42, 42, 45, 49, 49, 54, 32, 32,
+ 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 32, 33, 33,
+ 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36,
+ 36, 39, 40, 41, 44, 44, 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42,
+ 42, 44, 48, 48, 50, 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44,
+ 48, 48, 50, 54, 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50,
+ 53, 57, 57, 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60,
+ 60, 64, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+ 38, 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+ 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, 38,
+ 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, 42, 42,
+ 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, 43, 45, 52,
+ 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, 45, 52, 54, 56,
+ 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, 54, 56, 58, 66, 66,
+ 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, 56, 59, 67, 67, 70, 76,
+ 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, 60, 67, 67, 71, 77, 77, 81,
+ 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 71, 71, 74, 81, 81, 86, 53, 51,
+ 49, 49, 50, 50, 51, 59, 60, 63, 71, 71, 75, 82, 82, 87, 55, 52, 51, 51,
+ 51, 51, 53, 60, 61, 64, 72, 72, 76, 83, 83, 88, 58, 55, 54, 54, 54, 54,
+ 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62,
+ 63, 67, 75, 75, 79, 87, 87, 92,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40,
+ 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45,
+ 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51,
+ 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37,
+ 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40,
+ 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33,
+ 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45,
+ 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37,
+ 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58,
+ 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42,
+ 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49,
+ 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42,
+ 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61,
+ 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42,
+ 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67,
+ 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45,
+ 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76,
+ 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57,
+ 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52,
+ 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65,
+ 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54,
+ 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76,
+ 79, 80, 81, 86, 87, 88, 92, 92,
+ /* Size 4x16 */
+ 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32, 34,
+ 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38, 49, 60,
+ 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58, 75, 47, 47,
+ 60, 77, 51, 50, 63, 82, 55, 54, 67, 87,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32, 32,
+ 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37, 36, 36,
+ 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49, 49, 50, 49,
+ 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
+ /* Size 8x32 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31, 32,
+ 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33,
+ 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33, 33, 35, 41,
+ 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34, 36, 42, 45, 50,
+ 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33,
+ 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38,
+ 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50,
+ 53, 57, 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 39, 37,
+ 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, 44, 41, 42, 45,
+ 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66,
+ 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77,
+ 53, 49, 49, 51, 59, 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51,
+ 51, 53, 61, 72, 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55,
+ 63, 75, 79, 87,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40,
+ 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46,
+ 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53,
+ 55, 55, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45,
+ 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 44, 43,
+ 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58,
+ 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44,
+ 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66,
+ 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49,
+ 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81,
+ 82, 83, 87, 87 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66,
+ /* Size 8x8 */
+ 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35, 36,
+ 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46, 47, 50,
+ 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46, 50, 54, 59,
+ 61, 64, 51, 48, 48, 51, 54, 60, 64, 68,
+ /* Size 16x16 */
+ 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31, 31,
+ 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31, 31, 32,
+ 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32, 33, 36, 40,
+ 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36, 39, 43, 44, 45,
+ 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47,
+ 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47,
+ 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50,
+ 50, 52, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 48, 47,
+ 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 49, 47, 45, 45,
+ 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46,
+ 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50,
+ 54, 55, 56, 60, 61, 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56,
+ 57, 61, 63, 64, 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62,
+ 64, 66, 68, 71,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49, 49,
+ 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47, 47, 47, 47,
+ 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31, 31, 31, 32, 34,
+ 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31, 32, 34, 34, 36, 38, 38,
+ 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46, 47, 48, 48, 48, 49, 49, 50,
+ 52, 52, 30, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 41, 42, 42, 44,
+ 46, 46, 46, 46, 46, 45, 45, 45, 46, 47, 47, 48, 48, 48, 50, 50, 30, 31,
+ 31, 31, 31, 32, 32, 35, 35, 36, 40, 40, 41, 42, 42, 44, 46, 46, 46, 45,
+ 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32,
+ 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45,
+ 46, 46, 46, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40,
+ 43, 43, 43, 44, 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47,
+ 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45,
+ 45, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49,
+ 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47,
+ 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38, 38,
+ 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45,
+ 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39, 40, 40, 43,
+ 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46,
+ 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47,
+ 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 48, 48,
+ 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49,
+ 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 41, 42,
+ 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49,
+ 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 45, 45, 45, 45, 44, 44,
+ 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51,
+ 52, 52, 52, 52, 52, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47,
+ 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
+ 54, 54, 55, 55, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53,
+ 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47,
+ 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55,
+ 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45, 45, 46,
+ 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56,
+ 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45,
+ 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 60,
+ 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51,
+ 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 61, 62, 62, 49, 47,
+ 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56,
+ 56, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59,
+ 61, 61, 61, 63, 63, 63, 64, 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46,
+ 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63,
+ 63, 64, 65, 65, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50,
+ 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66,
+ 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54,
+ 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49,
+ 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60,
+ 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48, 48,
+ 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64,
+ 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48,
+ 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69,
+ 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53,
+ 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71,
+ /* Size 4x8 */
+ 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47, 48,
+ 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67,
+ /* Size 8x4 */
+ 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47, 46,
+ 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67,
+ /* Size 8x16 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32,
+ 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36, 41, 44,
+ 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47,
+ 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45,
+ 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48,
+ 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62,
+ 65, 68,
+ /* Size 16x8 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31, 31,
+ 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37, 38, 38,
+ 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40, 41, 44, 47,
+ 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46, 47, 47, 48, 50,
+ 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55,
+ 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64,
+ 66, 68,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, 31,
+ 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, 31, 32,
+ 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, 32, 37, 38,
+ 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, 38, 39, 40, 45,
+ 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, 40, 41, 45, 46, 46,
+ 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, 41, 45, 46, 46, 45, 45,
+ 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, 46, 47, 46, 45, 45, 46, 47,
+ 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, 47, 46, 46, 46, 46, 47, 47, 49,
+ 34, 36, 37, 37, 42, 44, 45, 47, 47, 47, 45, 45, 46, 47, 47, 49, 37, 39,
+ 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 37, 39, 40, 41,
+ 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47,
+ 47, 48, 48, 48, 47, 47, 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50,
+ 50, 50, 49, 49, 50, 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50,
+ 49, 49, 50, 50, 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51,
+ 52, 52, 52, 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54,
+ 54, 55, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+ 48, 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+ 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, 46,
+ 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, 46, 46,
+ 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, 46, 47, 52,
+ 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, 47, 52, 53, 55,
+ 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59,
+ 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 64,
+ 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, 55, 60, 60, 61, 64, 64, 66,
+ 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 52, 49,
+ 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 53, 50, 48, 48,
+ 48, 48, 49, 54, 54, 56, 61, 61, 63, 67, 67, 69, 54, 51, 50, 50, 49, 49,
+ 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55,
+ 55, 57, 62, 62, 65, 68, 68, 71,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49,
+ 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46,
+ 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35,
+ 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+ 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41,
+ 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48,
+ 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47,
+ 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38,
+ 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47,
+ 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41,
+ 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47,
+ 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47,
+ 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53,
+ 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53,
+ 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47,
+ 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57,
+ 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45, 45, 45, 45,
+ 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59,
+ 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46,
+ 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63,
+ 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52,
+ 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50,
+ 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57,
+ 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50,
+ 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63,
+ 65, 65, 66, 68, 68, 69, 71, 71,
+ /* Size 4x16 */
+ 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35, 43,
+ 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48, 53, 54,
+ 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55, 63, 48, 47,
+ 55, 64, 49, 47, 56, 66, 51, 49, 57, 68,
+ /* Size 16x4 */
+ 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37, 38,
+ 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47, 46, 46,
+ 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48, 48, 47, 47,
+ 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
+ /* Size 8x32 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31, 31,
+ 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40,
+ 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45,
+ 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44, 47, 46, 46, 47,
+ 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 37, 40,
+ 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48,
+ 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51,
+ 52, 52, 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 48, 46,
+ 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, 49, 45, 45, 47,
+ 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59,
+ 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64,
+ 52, 48, 47, 48, 54, 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48,
+ 48, 49, 54, 61, 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50,
+ 55, 62, 65, 68,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49,
+ 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45,
+ 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41,
+ 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46,
+ 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47,
+ 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49,
+ 50, 50, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51,
+ 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48,
+ 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56,
+ 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60,
+ 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47,
+ 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66,
+ 66, 67, 68, 68 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65,
+ /* Size 8x8 */
+ 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 41, 44, 32, 32,
+ 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34, 35, 37,
+ 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42, 42, 49, 56,
+ 63, 67, 47, 44, 45, 46, 52, 59, 67, 71,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32,
+ 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32, 33, 33, 33, 34,
+ 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36,
+ 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40,
+ 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44,
+ 47, 47, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51,
+ 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 36, 35,
+ 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 39, 38, 38, 37,
+ 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40,
+ 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44,
+ 47, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55,
+ 56, 60, 62, 66, 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61,
+ 63, 67, 70, 71,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37,
+ 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42,
+ 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46,
+ 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 34, 35, 35, 36, 38, 38, 39, 42, 42, 42, 45, 45, 45, 48, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34,
+ 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37,
+ 38, 41, 41, 41, 44, 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42,
+ 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35,
+ 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38,
+ 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39, 40, 40, 41, 42,
+ 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46,
+ 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37,
+ 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40,
+ 40, 41, 42, 42, 43, 44, 44, 45, 47, 47, 47, 50, 34, 34, 34, 34, 34, 33,
+ 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45,
+ 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35,
+ 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48,
+ 50, 51, 51, 53, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 38, 40, 40, 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54,
+ 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43,
+ 46, 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35,
+ 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50,
+ 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34, 34, 35,
+ 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54,
+ 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39,
+ 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54, 56, 56, 57, 58, 59,
+ 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45,
+ 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 39, 39,
+ 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50,
+ 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 41, 40, 40, 40, 39, 38,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55,
+ 57, 60, 60, 60, 62, 63, 63, 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42,
+ 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64,
+ 66, 67, 67, 69, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42,
+ 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
+ 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49,
+ 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45,
+ 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58,
+ 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44, 45,
+ 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67,
+ 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45,
+ 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71,
+ 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53,
+ 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77,
+ /* Size 4x8 */
+ 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34, 37,
+ 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67,
+ /* Size 8x4 */
+ 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35, 34,
+ 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67,
+ /* Size 8x16 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32,
+ 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 33, 34,
+ 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34, 35, 37, 38,
+ 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 39, 37,
+ 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43,
+ 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56,
+ 67, 70,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32, 32, 32,
+ 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32, 33, 34, 34,
+ 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34, 35, 36, 37, 39,
+ 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47,
+ 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58,
+ 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66,
+ 69, 70,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, 31,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32,
+ 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38,
+ 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 35, 35, 38, 41, 41,
+ 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 39, 42, 42, 44, 49,
+ 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32,
+ 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32,
+ 33, 35, 35, 35, 37, 37, 37, 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35,
+ 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36,
+ 37, 38, 38, 41, 42, 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40,
+ 40, 42, 44, 44, 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45,
+ 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48,
+ 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55,
+ 35, 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+ 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, 34,
+ 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, 37, 40,
+ 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, 40, 40, 42,
+ 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50,
+ 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, 43, 49, 51, 51, 56,
+ 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63,
+ 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71,
+ 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, 54, 59, 64, 64, 67, 72, 47, 45,
+ 44, 44, 44, 45, 45, 47, 53, 56, 56, 61, 66, 66, 69, 75, 48, 46, 45, 45,
+ 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 48, 46, 45, 45, 45, 46,
+ 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50,
+ 56, 58, 58, 64, 69, 69, 73, 79,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37,
+ 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41,
+ 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45,
+ 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38,
+ 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40,
+ 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45,
+ 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37,
+ 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43,
+ 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35,
+ 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49,
+ 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39, 38, 38, 38,
+ 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59,
+ 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42,
+ 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67,
+ 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48,
+ 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46,
+ 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56,
+ 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49,
+ 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65,
+ 67, 71, 71, 72, 75, 76, 76, 79,
+ /* Size 4x16 */
+ 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32, 34,
+ 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37, 42, 48,
+ 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51, 60, 42, 43,
+ 53, 63, 45, 45, 56, 66, 46, 46, 56, 67,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32, 32,
+ 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35, 35, 34,
+ 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42,
+ 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
+ /* Size 8x32 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31, 32,
+ 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32,
+ 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 34,
+ 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44,
+ 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32,
+ 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35,
+ 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42,
+ 48, 50, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 36, 34,
+ 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, 39, 37, 39, 40,
+ 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51,
+ 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66,
+ 44, 42, 42, 43, 51, 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45,
+ 45, 46, 54, 56, 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48,
+ 56, 58, 69, 73,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42,
+ 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46,
+ 46, 48, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41,
+ 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35,
+ 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48,
+ 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41,
+ 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58,
+ 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45,
+ 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67,
+ 69, 70, 70, 73 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59,
+ /* Size 8x8 */
+ 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34, 35,
+ 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43, 45, 47,
+ 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45, 45, 50, 55,
+ 58, 60, 49, 46, 46, 46, 50, 55, 60, 61,
+ /* Size 16x16 */
+ 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31, 31,
+ 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31, 31, 31,
+ 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31, 32, 34, 35,
+ 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34, 37, 38, 42, 43,
+ 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47,
+ 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46,
+ 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47,
+ 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 49, 47,
+ 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 48, 47, 46, 45,
+ 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46,
+ 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47,
+ 49, 52, 53, 55, 57, 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53,
+ 54, 56, 57, 59, 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56,
+ 58, 60, 61, 61,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41, 43,
+ 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47, 48, 48, 48,
+ 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37,
+ 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42,
+ 42, 43, 46, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 30, 31,
+ 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46,
+ 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32,
+ 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45,
+ 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36,
+ 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45,
+ 46, 46, 46, 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42,
+ 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45,
+ 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34,
+ 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46,
+ 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37, 38, 38, 38,
+ 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 46, 46, 46, 45,
+ 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46,
+ 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46,
+ 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47,
+ 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 38, 39,
+ 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49,
+ 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42,
+ 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49,
+ 49, 49, 49, 49, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45,
+ 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49,
+ 50, 50, 50, 50, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47,
+ 48, 49, 49, 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51,
+ 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47,
+ 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46, 46, 46,
+ 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
+ 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55,
+ 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49,
+ 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 48, 47,
+ 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53,
+ 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 49, 47, 47, 47, 46, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55,
+ 55, 57, 57, 57, 57, 58, 58, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59,
+ 59, 60, 60, 60, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50,
+ 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48,
+ 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55,
+ 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60,
+ 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61,
+ 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50,
+ 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64,
+ /* Size 4x8 */
+ 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43, 47,
+ 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59,
+ /* Size 8x4 */
+ 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47, 46,
+ 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59,
+ /* Size 8x16 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32,
+ 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35, 37, 42,
+ 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43, 47, 47, 47,
+ 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46,
+ 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46,
+ 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54,
+ 59, 61,
+ /* Size 16x8 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31, 31,
+ 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34, 34, 35,
+ 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39, 40, 42, 43,
+ 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44, 46, 46, 47, 48,
+ 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52,
+ 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56,
+ 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,
+ 61, 61,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, 31,
+ 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, 31, 31,
+ 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, 31, 34, 38,
+ 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, 34, 39, 39, 40,
+ 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46,
+ 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45,
+ 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, 41, 45, 46, 46, 45, 45, 45,
+ 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, 46, 47, 47, 46, 45, 45, 46, 47,
+ 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 33, 35,
+ 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 35, 37, 38, 38,
+ 41, 45, 45, 46, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47,
+ 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 47, 47, 47, 46, 45, 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49,
+ 49, 49, 50, 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49,
+ 50, 50, 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51,
+ 47, 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+ 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, 46,
+ 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, 46, 47,
+ 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, 47, 47, 48,
+ 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53,
+ 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, 47, 51, 53, 53, 55,
+ 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58,
+ 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61,
+ 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, 53, 56, 58, 58, 60, 62, 50, 48,
+ 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 63, 50, 48, 46, 46,
+ 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 50, 48, 46, 46, 46, 46,
+ 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48,
+ 52, 54, 54, 58, 60, 60, 62, 65,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43,
+ 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47,
+ 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33,
+ 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45,
+ 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38,
+ 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46,
+ 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44,
+ 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38,
+ 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44,
+ 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47,
+ 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47,
+ 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52,
+ 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50,
+ 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45,
+ 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56,
+ 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45,
+ 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59,
+ 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49,
+ 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49,
+ 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54,
+ 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48,
+ 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57,
+ 59, 61, 61, 62, 63, 64, 64, 65,
+ /* Size 4x16 */
+ 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34, 42,
+ 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47, 50, 49,
+ 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53, 57, 46, 46,
+ 53, 58, 48, 46, 54, 59, 48, 46, 54, 59,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37, 38,
+ 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47, 46, 46,
+ 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46,
+ 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
+ /* Size 8x32 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31, 31,
+ 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39,
+ 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35, 40, 44, 46,
+ 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46,
+ 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46, 47, 46, 46, 35, 38,
+ 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47,
+ 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50,
+ 49, 50, 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 49, 46,
+ 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, 48, 46, 46, 47,
+ 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53,
+ 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59,
+ 49, 45, 45, 46, 52, 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46,
+ 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47,
+ 52, 54, 60, 62,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43,
+ 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46,
+ 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35,
+ 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45,
+ 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46,
+ 46, 47, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49,
+ 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47,
+ 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45,
+ 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56,
+ 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60,
+ 61, 61, 61, 62 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54,
+ /* Size 8x8 */
+ 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32, 32,
+ 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33, 34, 35,
+ 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37, 38, 41, 46,
+ 51, 54, 41, 39, 40, 41, 44, 49, 54, 58,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36,
+ 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40,
+ 40, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 34, 34,
+ 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 36, 35, 35, 34,
+ 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40,
+ 40, 45, 45, 50, 50, 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45,
+ 45, 50, 50, 54, 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54,
+ 54, 58, 58, 63,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37,
+ 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
+ 34, 34, 35, 35, 35, 36, 38, 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37,
+ 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36,
+ 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38,
+ 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40,
+ 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 35, 36, 36, 36, 36, 37, 37, 37, 38, 40, 40, 40, 41, 42, 42, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38,
+ 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40,
+ 40, 41, 42, 42, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35,
+ 36, 36, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38,
+ 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34,
+ 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41,
+ 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33,
+ 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44,
+ 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36,
+ 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 46, 47, 47, 47, 49,
+ 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+ 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42,
+ 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
+ 48, 49, 50, 50, 50, 52, 54, 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37,
+ 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52,
+ 52, 54, 56, 56, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40,
+ 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42,
+ 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38,
+ 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47,
+ 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39, 39,
+ 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54,
+ 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42,
+ 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60,
+ 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42,
+ 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ /* Size 4x8 */
+ 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33, 34,
+ 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56,
+ /* Size 8x4 */
+ 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34, 34,
+ 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56,
+ /* Size 8x16 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32,
+ 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33,
+ 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36,
+ 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 36, 34,
+ 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40,
+ 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53,
+ 53, 63,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32, 33, 33, 34,
+ 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42,
+ 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58,
+ 58, 63,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34,
+ 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37,
+ 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41,
+ 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 38, 41, 41, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32,
+ 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34,
+ 34, 36, 37, 37, 37, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37,
+ 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38,
+ 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45,
+ 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+ 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34,
+ 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, 34, 36,
+ 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, 36, 38, 38,
+ 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43,
+ 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48,
+ 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, 39, 44, 49, 49, 49, 52,
+ 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58,
+ 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38,
+ 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 41, 40, 39, 39,
+ 39, 40, 42, 42, 42, 46, 52, 52, 52, 56, 60, 60, 44, 42, 41, 41, 41, 42,
+ 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43,
+ 43, 48, 53, 53, 53, 58, 63, 63,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39,
+ 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38,
+ 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40,
+ 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38,
+ 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35,
+ 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45,
+ 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34,
+ 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49,
+ 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
+ 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52,
+ 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40,
+ 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43,
+ 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48,
+ 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41,
+ 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54,
+ 54, 56, 58, 58, 58, 60, 63, 63,
+ /* Size 4x16 */
+ 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32, 32,
+ 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34, 37, 40,
+ 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43, 51, 38, 39,
+ 45, 54, 38, 39, 45, 54, 42, 42, 48, 58,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34, 34, 33,
+ 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38, 37, 37, 39,
+ 39, 40, 40, 45, 45, 51, 51, 54, 54, 58,
+ /* Size 8x32 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31, 32,
+ 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32,
+ 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32, 33, 33, 34,
+ 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32,
+ 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34,
+ 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38,
+ 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34,
+ 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, 36, 34, 34, 38,
+ 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48,
+ 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39,
+ 39, 42, 42, 52, 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43,
+ 43, 53, 53, 63,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42,
+ 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42,
+ 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
+ 48, 49, 50, 50, 50, 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41,
+ 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58,
+ 58, 60, 63, 63 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54,
+ /* Size 8x8 */
+ 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32, 33,
+ 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41, 42, 45,
+ 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46, 47, 48, 51,
+ 53, 54, 48, 46, 45, 46, 47, 51, 54, 56,
+ /* Size 16x16 */
+ 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31, 31,
+ 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+ 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31, 32, 32, 35,
+ 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32, 32, 35, 35, 40,
+ 40, 42, 42, 46, 46, 45, 45, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45,
+ 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47,
+ 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46,
+ 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 41, 42,
+ 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 47, 47, 46,
+ 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47,
+ 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46,
+ 46, 49, 49, 53, 53, 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49,
+ 49, 53, 53, 54, 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53,
+ 53, 55, 55, 58,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36, 39,
+ 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34,
+ 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38,
+ 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 35, 35, 35, 37, 39, 39, 39, 41, 42, 42,
+ 42, 44, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46,
+ 46, 46, 45, 45, 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33,
+ 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45,
+ 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37,
+ 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42,
+ 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34,
+ 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46,
+ 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35,
+ 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47,
+ 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39,
+ 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46,
+ 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45,
+ 45, 45, 46, 46, 46, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 36, 37,
+ 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39,
+ 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41,
+ 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46,
+ 46, 46, 45, 45, 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45,
+ 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48,
+ 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49,
+ 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42,
+ 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50,
+ 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46,
+ 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48,
+ 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47,
+ 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46,
+ 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53,
+ 53, 54, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+ 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48,
+ 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47,
+ 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51,
+ 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54,
+ 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57,
+ 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58,
+ /* Size 4x8 */
+ 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40, 44,
+ 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55,
+ /* Size 8x4 */
+ 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42, 42,
+ 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55,
+ /* Size 8x16 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31,
+ 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40,
+ 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47,
+ 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 49, 46,
+ 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47,
+ 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53,
+ 53, 58,
+ /* Size 16x8 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31, 31,
+ 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38, 40, 40, 43,
+ 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40, 40, 43, 43, 47,
+ 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50,
+ 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53,
+ 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56,
+ 56, 58,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, 31,
+ 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, 31, 31,
+ 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34,
+ 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38,
+ 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, 35, 39, 39, 39, 42,
+ 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46,
+ 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45,
+ 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45,
+ 32, 33, 34, 34, 34, 37, 41, 41, 41, 44, 46, 46, 46, 46, 45, 45, 33, 34,
+ 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36,
+ 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39,
+ 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45,
+ 45, 46, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47,
+ 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47,
+ 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47,
+ 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+ 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43,
+ 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, 44, 46,
+ 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, 47, 48, 48,
+ 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50,
+ 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53,
+ 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, 47, 50, 53, 53, 53, 54,
+ 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56,
+ 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47,
+ 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 45, 45,
+ 45, 46, 46, 46, 46, 49, 53, 53, 53, 55, 57, 57, 49, 47, 45, 45, 45, 45,
+ 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46,
+ 46, 49, 53, 53, 53, 56, 58, 58,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39,
+ 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45,
+ 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46,
+ 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36,
+ 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45,
+ 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40,
+ 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45,
+ 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39,
+ 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41,
+ 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+ 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48,
+ 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47, 47,
+ 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51,
+ 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46,
+ 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46,
+ 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48,
+ 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49,
+ 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46,
+ 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53,
+ 53, 54, 56, 56, 56, 57, 58, 58,
+ /* Size 4x16 */
+ 31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, 31, 35,
+ 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43, 47, 46,
+ 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50, 53, 47, 46,
+ 50, 54, 47, 46, 50, 54, 47, 45, 49, 56,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33, 34,
+ 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42, 42, 42,
+ 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47, 45, 45, 46,
+ 46, 46, 46, 50, 50, 53, 53, 54, 54, 56,
+ /* Size 8x32 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31, 31,
+ 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38,
+ 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32, 40, 40, 46,
+ 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+ 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36,
+ 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45,
+ 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47,
+ 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43,
+ 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, 49, 46, 46, 48,
+ 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53,
+ 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45,
+ 45, 46, 46, 53, 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46,
+ 46, 53, 53, 58,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39,
+ 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44,
+ 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46,
+ 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43,
+ 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46,
+ 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 48, 47,
+ 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45,
+ 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56,
+ 56, 57, 58, 58 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46,
+ /* Size 8x8 */
+ 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31, 32,
+ 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 34,
+ 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33, 35, 36, 38,
+ 39, 42, 35, 35, 34, 36, 38, 40, 42, 48,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36,
+ 36, 36, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 33, 33, 33, 33,
+ 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33,
+ 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+ 36, 37, 37, 40, 41, 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38,
+ 38, 42, 42, 45, 48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42,
+ 42, 45, 48, 48,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35,
+ 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36,
+ 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 36, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37,
+ 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36,
+ 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37,
+ 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 40, 40,
+ 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35,
+ 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 34, 34,
+ 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37,
+ 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34,
+ 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39,
+ 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42,
+ 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37,
+ 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35,
+ 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40,
+ 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34,
+ 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42,
+ 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35,
+ 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48,
+ 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37,
+ 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50,
+ /* Size 4x8 */
+ 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32, 33,
+ 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48,
+ /* Size 8x4 */
+ 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32, 32,
+ 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32,
+ 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32,
+ 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 34, 34,
+ 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 33, 33,
+ 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35,
+ 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38,
+ 46, 48,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 34, 34,
+ 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35,
+ 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40,
+ 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44,
+ 48, 48,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35,
+ 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34,
+ 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
+ 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37,
+ 37, 37, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38,
+ 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+ 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33,
+ 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, 33, 33,
+ 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, 33, 35, 36,
+ 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37,
+ 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39,
+ 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42,
+ 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, 37, 37, 40, 43, 44, 44, 44,
+ 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, 38, 41, 45, 47, 47, 47, 36, 35,
+ 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34,
+ 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34,
+ 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38,
+ 39, 39, 39, 42, 46, 49, 49, 49,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37,
+ 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36,
+ 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39,
+ 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35,
+ 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46,
+ 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36,
+ 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38,
+ 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35,
+ 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42,
+ 42, 42, 44, 47, 48, 48, 48, 49,
+ /* Size 4x16 */
+ 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32, 32,
+ 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32, 34, 37,
+ 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37, 42, 34, 34,
+ 37, 44, 35, 34, 38, 48, 35, 34, 38, 48,
+ /* Size 16x4 */
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35, 35, 34, 34,
+ 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31, 32,
+ 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32,
+ 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32,
+ 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32,
+ 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33,
+ 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34,
+ 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32,
+ 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, 33, 33, 33, 35,
+ 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37,
+ 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+ 35, 34, 34, 36, 38, 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35,
+ 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37,
+ 39, 39, 46, 49,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35,
+ 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41,
+ 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34,
+ 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47,
+ 48, 48, 48, 49 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52,
+ /* Size 8x8 */
+ 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30, 31,
+ 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37, 39, 42,
+ 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42, 45, 47, 48,
+ 48, 50, 48, 47, 46, 47, 47, 49, 50, 53,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31, 31,
+ 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 31,
+ 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31, 31, 31, 31,
+ 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31, 32, 32, 34, 35,
+ 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40,
+ 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44,
+ 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46,
+ 47, 47, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 36, 38,
+ 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 40, 41, 41, 41,
+ 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42,
+ 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46,
+ 46, 47, 47, 49, 49, 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48,
+ 48, 50, 50, 51, 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50,
+ 50, 51, 53, 53,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34, 36,
+ 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37, 37, 37, 39,
+ 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42,
+ 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47,
+ 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34,
+ 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38,
+ 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42,
+ 42, 42, 44, 46, 47, 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46,
+ 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35,
+ 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39,
+ 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41,
+ 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42, 43, 43, 43, 43,
+ 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47,
+ 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43,
+ 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45,
+ 45, 45, 46, 47, 47, 47, 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37,
+ 37, 38, 40, 40, 40, 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47,
+ 47, 47, 47, 47, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42,
+ 42, 42, 44, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39,
+ 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42,
+ 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49,
+ 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44,
+ 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 41, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47,
+ 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48,
+ 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50,
+ 50, 50, 50, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46,
+ 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51,
+ 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47,
+ 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47,
+ 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49,
+ 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50,
+ 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53,
+ 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ /* Size 4x8 */
+ 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37, 39,
+ 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53,
+ /* Size 8x4 */
+ 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37, 38,
+ 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53,
+ /* Size 8x16 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31,
+ 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32, 32, 35,
+ 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35, 37, 42, 42,
+ 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 41, 42,
+ 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45,
+ 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48,
+ 52, 53,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31, 31,
+ 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31, 31, 32,
+ 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34, 34, 35, 35,
+ 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39, 40, 40, 42, 43,
+ 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47,
+ 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49,
+ 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51,
+ 53, 53,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, 31,
+ 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, 31, 31,
+ 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31,
+ 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37,
+ 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38,
+ 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, 34, 37, 39, 39, 39, 41,
+ 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, 38, 39, 39, 39, 42, 44, 46,
+ 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46,
+ 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31,
+ 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 31, 32, 33, 33,
+ 33, 33, 36, 39, 41, 41, 41, 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35,
+ 37, 40, 42, 42, 42, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41,
+ 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43,
+ 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44,
+ 46, 47, 47, 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47,
+ 47, 47, 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47,
+ 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+ 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40,
+ 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41,
+ 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, 42, 44, 46,
+ 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47,
+ 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48,
+ 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50,
+ 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, 47, 47, 49, 50, 51, 51, 51,
+ 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, 48, 49, 51, 52, 52, 52, 49, 48,
+ 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46,
+ 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46,
+ 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47,
+ 47, 47, 47, 49, 52, 53, 53, 53,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39,
+ 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42,
+ 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33,
+ 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46,
+ 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
+ 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40,
+ 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44,
+ 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38,
+ 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47,
+ 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+ 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42,
+ 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48,
+ 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45,
+ 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52,
+ 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48,
+ 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47,
+ 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50,
+ 50, 50, 51, 52, 53, 53, 53, 53,
+ /* Size 4x16 */
+ 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31, 32,
+ 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37, 44, 47,
+ 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47, 50, 44, 44,
+ 47, 51, 48, 46, 48, 53, 48, 46, 48, 53,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31,
+ 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 37, 38, 38, 39,
+ 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 46, 46, 46,
+ 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
+ /* Size 8x32 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31, 31,
+ 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34,
+ 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39,
+ 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 31, 33,
+ 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38,
+ 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43,
+ 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39,
+ 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, 41, 42, 42, 44,
+ 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47,
+ 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+ 47, 46, 46, 46, 48, 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47,
+ 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47,
+ 47, 47, 52, 53,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40,
+ 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47,
+ 47, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37,
+ 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47,
+ 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45,
+ 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49,
+ 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
+ 53, 53, 53, 53 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32,
+ 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 34, 34,
+ 35, 36, 33, 33, 33, 33, 35, 35, 36, 38,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 35, 36, 36, 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36,
+ 37, 37, 38, 39,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34,
+ 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35,
+ 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38,
+ 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39,
+ /* Size 4x8 */
+ 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32,
+ 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36,
+ /* Size 8x4 */
+ 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33,
+ 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32,
+ 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33,
+ 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37,
+ 37, 38,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+ 36, 38,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35,
+ 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35,
+ 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37,
+ 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 36, 38, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34,
+ 35, 36, 37, 37, 37, 37, 38, 39,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36,
+ 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34,
+ 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36,
+ 37, 37, 37, 37, 38, 38, 39, 39,
+ /* Size 4x16 */
+ 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32, 33, 34,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34, 35, 32, 33,
+ 34, 35, 33, 33, 35, 36, 34, 34, 36, 37,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32,
+ 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32,
+ 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33,
+ 33, 34, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32,
+ 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32,
+ 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35,
+ 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33,
+ 33, 33, 34, 36, 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34,
+ 35, 37, 37, 38,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+ 36, 37, 38, 38 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47,
+ /* Size 8x8 */
+ 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31, 31,
+ 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34, 35, 35,
+ 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39, 40, 43, 44,
+ 47, 47, 40, 41, 41, 42, 44, 45, 47, 48,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35,
+ 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38,
+ 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40,
+ 41, 43, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 33, 34,
+ 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 35, 36, 37, 37,
+ 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40,
+ 42, 43, 43, 46, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44,
+ 44, 46, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46,
+ 47, 47, 48, 48,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34, 34, 34, 35,
+ 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36, 38, 38, 38, 38, 38, 40, 41,
+ 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33,
+ 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34,
+ 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38,
+ 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39,
+ 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35,
+ 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36,
+ 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40,
+ 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42,
+ 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36,
+ 36, 37, 38, 40, 40, 40, 40, 41, 41, 42, 43, 43, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41,
+ 41, 41, 41, 42, 42, 43, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43,
+ 43, 44, 44, 44, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39,
+ 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40,
+ 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43,
+ 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36,
+ 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44, 44, 44, 44, 45, 45,
+ 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39,
+ 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43,
+ 43, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47,
+ 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39,
+ 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47,
+ 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40,
+ 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43,
+ 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40,
+ 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45,
+ 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41, 41,
+ 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47,
+ 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48,
+ 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43,
+ 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48,
+ /* Size 4x8 */
+ 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34, 36,
+ 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35, 36,
+ 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31,
+ 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32,
+ 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40,
+ 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35, 37, 42, 42, 43,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 35, 37,
+ 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40,
+ 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47,
+ 47, 48,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31, 31, 32, 32,
+ 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 35, 35, 35,
+ 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43,
+ 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45,
+ 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+ 47, 48,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36,
+ 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38,
+ 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38,
+ 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 39, 39, 39, 39, 40, 42,
+ 30, 31, 31, 32, 32, 32, 32, 32, 34, 37, 39, 39, 39, 39, 40, 42, 30, 31,
+ 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32,
+ 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32,
+ 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33,
+ 35, 37, 40, 40, 40, 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38,
+ 40, 40, 40, 40, 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41,
+ 41, 41, 42, 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42,
+ 43, 44, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45,
+ 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+ 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35,
+ 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, 37, 37,
+ 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38, 38, 38, 39,
+ 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, 39, 40, 42, 44,
+ 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47,
+ 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47,
+ 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47,
+ 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 39, 39,
+ 40, 41, 41, 41, 41, 42, 43, 45, 47, 47, 47, 47, 47, 48, 40, 41, 41, 42,
+ 42, 42, 42, 42, 44, 45, 47, 47, 47, 47, 47, 48, 42, 42, 42, 43, 43, 43,
+ 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43,
+ 44, 46, 47, 47, 47, 47, 48, 48,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35,
+ 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39,
+ 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42,
+ 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+ 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36,
+ 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39,
+ 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41,
+ 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44,
+ 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40,
+ 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38,
+ 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44,
+ 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38,
+ 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40,
+ 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41,
+ 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39,
+ 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44,
+ 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47,
+ 47, 47, 47, 47, 48, 48, 48, 48,
+ /* Size 4x16 */
+ 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31, 32,
+ 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35, 40, 42,
+ 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45, 47, 38, 40,
+ 45, 47, 39, 41, 45, 47, 42, 43, 46, 47,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31, 31,
+ 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35, 36, 36,
+ 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31, 31,
+ 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31,
+ 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38,
+ 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40,
+ 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31,
+ 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32,
+ 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41,
+ 41, 42, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35,
+ 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, 35, 37, 38, 38,
+ 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47,
+ 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41,
+ 42, 42, 44, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43,
+ 44, 47, 47, 48,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36,
+ 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40,
+ 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42,
+ 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36,
+ 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 37, 37,
+ 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43,
+ 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47,
+ 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40,
+ 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 4x8 */
+ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32,
+ 33, 34,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ /* Size 4x16 */
+ 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32,
+ 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32,
+ 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32,
+ 32, 32, 33, 34,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31,
+ 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31, 31, 31,
+ 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34, 35, 35, 36,
+ 39, 39, 33, 34, 34, 35, 35, 36, 39, 39,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
+ 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35,
+ 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37,
+ 38, 39, 39, 39,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31,
+ 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 32, 32, 33, 34,
+ 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34,
+ 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33,
+ 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34,
+ 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 35,
+ 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35,
+ 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35,
+ 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 36, 36, 36, 36, 36,
+ 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+ 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39,
+ 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39,
+ 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40,
+ /* Size 4x8 */
+ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32,
+ 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31, 31,
+ 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31,
+ 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31,
+ 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32,
+ 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 32,
+ 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36,
+ 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36,
+ 38, 41,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41,
+ 41, 41,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
+ 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35,
+ 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 33, 34, 36, 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36,
+ 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+ 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, 32, 33,
+ 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40,
+ 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43,
+ 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34,
+ 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35,
+ 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36,
+ 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36,
+ 36, 36, 36, 38, 39, 40, 42, 44,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34,
+ 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35,
+ 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36,
+ 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37,
+ 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38,
+ 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35,
+ 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38,
+ 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41,
+ 42, 43, 43, 43, 43, 43, 43, 44,
+ /* Size 4x16 */
+ 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31,
+ 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35, 39, 34, 36,
+ 36, 40, 34, 36, 36, 40, 34, 36, 36, 40,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35, 35, 35, 35,
+ 36, 36, 36, 36, 36, 37, 39, 40, 40, 40,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31, 31,
+ 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31,
+ 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31,
+ 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31,
+ 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32,
+ 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32,
+ 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31,
+ 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, 31, 32, 33, 33,
+ 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35,
+ 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34,
+ 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36,
+ 36, 36, 39, 42,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36,
+ 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37,
+ 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41,
+ 41, 41, 41, 42 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 30, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31,
+ 32, 32, 31, 31, 32, 32, 30, 31, 32, 32,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31,
+ 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ },
+};
+
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5,
+ /* Size 8x8 */
+ 32, 32, 27, 20, 15, 12, 11, 9, 32, 29, 26, 21, 16, 13, 12, 10, 27, 26,
+ 19, 16, 13, 11, 10, 10, 20, 21, 16, 12, 11, 9, 9, 8, 15, 16, 13, 11, 9,
+ 8, 7, 7, 12, 13, 11, 9, 8, 7, 6, 6, 11, 12, 10, 9, 7, 6, 6, 5, 9, 10,
+ 10, 8, 7, 6, 5, 5,
+ /* Size 16x16 */
+ 32, 33, 33, 30, 28, 23, 21, 17, 16, 13, 12, 11, 11, 10, 9, 9, 33, 32,
+ 32, 31, 30, 25, 23, 19, 17, 14, 14, 12, 11, 11, 10, 9, 33, 32, 31, 29,
+ 28, 24, 23, 19, 17, 14, 14, 13, 12, 11, 10, 10, 30, 31, 29, 26, 24, 22,
+ 20, 18, 16, 14, 13, 13, 12, 11, 11, 10, 28, 30, 28, 24, 21, 19, 18, 16,
+ 15, 13, 13, 12, 11, 11, 10, 10, 23, 25, 24, 22, 19, 16, 15, 14, 13, 11,
+ 11, 11, 10, 10, 9, 9, 21, 23, 23, 20, 18, 15, 14, 13, 12, 11, 10, 10, 9,
+ 9, 9, 9, 17, 19, 19, 18, 16, 14, 13, 11, 10, 9, 9, 9, 9, 8, 8, 8, 16,
+ 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13,
+ 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 7,
+ 7, 7, 7, 7, 6, 6, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 7, 6, 6, 6, 6, 6,
+ 11, 11, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 10, 11, 11, 11,
+ 11, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7,
+ 6, 6, 5, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 4,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 30, 29, 28, 26, 23, 22, 21, 19, 17, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32,
+ 32, 30, 30, 29, 27, 24, 23, 22, 20, 18, 17, 17, 15, 13, 13, 13, 12, 12,
+ 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 30, 28,
+ 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 11, 11, 11, 10,
+ 10, 9, 9, 9, 33, 32, 32, 32, 31, 31, 30, 29, 29, 27, 25, 24, 23, 21, 19,
+ 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33,
+ 32, 32, 31, 31, 30, 29, 28, 28, 26, 24, 23, 23, 20, 19, 18, 17, 16, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 32, 32, 32, 31, 30,
+ 29, 28, 28, 27, 26, 24, 23, 22, 21, 19, 19, 18, 16, 15, 15, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24,
+ 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 10, 10, 9, 29, 30, 30, 29, 28, 28, 25, 24, 23, 22, 20, 20, 19,
+ 18, 17, 16, 16, 15, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 26, 27, 28,
+ 27, 26, 26, 23, 22, 20, 19, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12,
+ 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 23, 24, 25, 25, 24, 24, 22,
+ 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 22, 23, 24, 24, 23, 23, 21, 20, 19, 17, 16, 15,
+ 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 8, 21, 22, 23, 23, 23, 22, 20, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 19, 20, 21, 21, 20,
+ 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9,
+ 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13,
+ 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 17,
+ 17, 18, 18, 18, 19, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9,
+ 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 16, 17, 17, 17, 17, 18, 16, 16,
+ 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 7, 7, 7, 14, 15, 16, 16, 16, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10,
+ 9, 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15,
+ 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 13, 14, 14,
+ 14, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 6, 6, 6, 6, 6, 12, 12, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12,
+ 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10,
+ 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12,
+ 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 11, 11, 10, 10, 10, 9,
+ 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 10, 11, 11,
+ 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6,
+ 5, 5, 5, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10,
+ 10, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5,
+ 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
+ 4, 4, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4,
+ /* Size 4x8 */
+ 32, 24, 14, 11, 31, 24, 15, 12, 28, 18, 12, 11, 21, 14, 10, 9, 16, 12,
+ 8, 8, 13, 11, 7, 7, 11, 10, 7, 6, 10, 9, 7, 5,
+ /* Size 8x4 */
+ 32, 31, 28, 21, 16, 13, 11, 10, 24, 24, 18, 14, 12, 11, 10, 9, 14, 15,
+ 12, 10, 8, 7, 7, 7, 11, 12, 11, 9, 8, 7, 6, 5,
+ /* Size 8x16 */
+ 32, 32, 28, 19, 16, 12, 11, 10, 33, 31, 30, 21, 17, 13, 12, 11, 32, 30,
+ 28, 20, 17, 13, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 28, 27, 21, 17,
+ 15, 12, 12, 11, 23, 24, 19, 14, 13, 11, 11, 11, 21, 22, 18, 13, 12, 10,
+ 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 10, 8, 8, 8, 13,
+ 15, 13, 10, 9, 7, 8, 8, 12, 14, 13, 10, 8, 7, 7, 7, 11, 13, 12, 10, 8,
+ 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 9, 10,
+ 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5,
+ /* Size 16x8 */
+ 32, 33, 32, 30, 28, 23, 21, 18, 16, 13, 12, 11, 11, 10, 9, 9, 32, 31,
+ 30, 28, 27, 24, 22, 19, 18, 15, 14, 13, 12, 11, 10, 10, 28, 30, 28, 24,
+ 21, 19, 18, 16, 15, 13, 13, 12, 11, 10, 10, 10, 19, 21, 20, 19, 17, 14,
+ 13, 12, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 16, 15, 13, 12, 10, 10,
+ 9, 8, 8, 8, 8, 7, 8, 12, 13, 13, 13, 12, 11, 10, 9, 8, 7, 7, 7, 7, 7, 6,
+ 7, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 12,
+ 12, 11, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5,
+ /* Size 16x32 */
+ 32, 33, 32, 30, 28, 23, 19, 17, 16, 13, 12, 11, 11, 11, 10, 10, 33, 32,
+ 32, 30, 29, 24, 20, 18, 17, 14, 12, 12, 12, 11, 11, 11, 33, 32, 31, 31,
+ 30, 25, 21, 19, 17, 14, 13, 12, 12, 11, 11, 11, 33, 32, 31, 30, 29, 25,
+ 21, 19, 17, 14, 13, 13, 12, 12, 11, 11, 32, 32, 30, 29, 28, 24, 20, 19,
+ 17, 14, 13, 13, 12, 12, 12, 11, 32, 31, 29, 28, 27, 24, 21, 19, 18, 15,
+ 14, 13, 12, 12, 12, 11, 30, 30, 28, 26, 24, 21, 19, 18, 16, 14, 13, 13,
+ 13, 12, 12, 11, 29, 30, 28, 25, 23, 20, 18, 17, 16, 13, 12, 12, 12, 12,
+ 12, 11, 28, 30, 27, 24, 21, 19, 17, 16, 15, 13, 12, 12, 12, 12, 11, 11,
+ 26, 28, 26, 23, 20, 18, 16, 15, 14, 12, 12, 12, 11, 11, 11, 11, 23, 25,
+ 24, 21, 19, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 11, 22, 24, 23, 21,
+ 19, 16, 14, 13, 12, 11, 10, 10, 10, 10, 10, 10, 21, 23, 22, 20, 18, 15,
+ 13, 13, 12, 11, 10, 10, 10, 10, 10, 10, 19, 21, 20, 19, 17, 14, 12, 12,
+ 11, 10, 9, 10, 10, 9, 10, 9, 18, 19, 19, 18, 16, 14, 12, 11, 10, 9, 9,
+ 9, 9, 9, 9, 9, 17, 18, 18, 17, 16, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9,
+ 16, 17, 18, 16, 15, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 15,
+ 14, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 10, 9,
+ 9, 8, 7, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 11, 10, 9, 9, 8, 7, 7, 7, 7,
+ 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13,
+ 13, 12, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 10, 10, 9,
+ 8, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 6, 6, 6,
+ 6, 6, 11, 12, 12, 12, 11, 10, 10, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 12,
+ 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9,
+ 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5,
+ 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 10, 10, 10,
+ 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 8, 7, 7,
+ 6, 6, 5, 5, 5, 8, 9, 9, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32,
+ 31, 30, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12,
+ 12, 12, 11, 11, 11, 10, 10, 9, 9, 32, 32, 31, 31, 30, 29, 28, 28, 27,
+ 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 21, 21, 20,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 23, 24, 25,
+ 25, 24, 24, 21, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 19, 18,
+ 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9,
+ 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12,
+ 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 16, 17, 17,
+ 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8,
+ 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 13, 13, 13, 13, 12, 12,
+ 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8,
+ 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12,
+ 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5,
+ 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9,
+ 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5,
+ 5, 5, 5, 5,
+ /* Size 4x16 */
+ 33, 23, 13, 11, 32, 25, 14, 11, 32, 24, 14, 12, 30, 21, 14, 12, 30, 19,
+ 13, 12, 25, 16, 11, 11, 23, 15, 11, 10, 19, 14, 9, 9, 17, 13, 9, 8, 14,
+ 11, 8, 8, 14, 11, 8, 7, 12, 10, 7, 6, 12, 10, 7, 6, 11, 10, 7, 6, 10, 9,
+ 7, 5, 9, 9, 7, 5,
+ /* Size 16x4 */
+ 33, 32, 32, 30, 30, 25, 23, 19, 17, 14, 14, 12, 12, 11, 10, 9, 23, 25,
+ 24, 21, 19, 16, 15, 14, 13, 11, 11, 10, 10, 10, 9, 9, 13, 14, 14, 14,
+ 13, 11, 11, 9, 9, 8, 8, 7, 7, 7, 7, 7, 11, 11, 12, 12, 12, 11, 10, 9, 8,
+ 8, 7, 6, 6, 6, 5, 5,
+ /* Size 8x32 */
+ 32, 32, 28, 19, 16, 12, 11, 10, 33, 32, 29, 20, 17, 12, 12, 11, 33, 31,
+ 30, 21, 17, 13, 12, 11, 33, 31, 29, 21, 17, 13, 12, 11, 32, 30, 28, 20,
+ 17, 13, 12, 12, 32, 29, 27, 21, 18, 14, 12, 12, 30, 28, 24, 19, 16, 13,
+ 13, 12, 29, 28, 23, 18, 16, 12, 12, 12, 28, 27, 21, 17, 15, 12, 12, 11,
+ 26, 26, 20, 16, 14, 12, 11, 11, 23, 24, 19, 14, 13, 11, 11, 11, 22, 23,
+ 19, 14, 12, 10, 10, 10, 21, 22, 18, 13, 12, 10, 10, 10, 19, 20, 17, 12,
+ 11, 9, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 17, 18, 16, 12, 10, 9, 9, 9,
+ 16, 18, 15, 11, 10, 8, 8, 8, 14, 16, 14, 11, 9, 8, 8, 8, 13, 15, 13, 10,
+ 9, 7, 8, 8, 13, 14, 13, 10, 9, 7, 7, 7, 12, 14, 13, 10, 8, 7, 7, 7, 12,
+ 13, 12, 9, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7,
+ 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 10,
+ 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9,
+ 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 8, 9, 10, 9, 8, 7, 6,
+ 5,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 32, 32, 31, 31, 30,
+ 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 28, 29, 30, 29, 28, 27, 24, 23, 21,
+ 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16,
+ 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 12, 12, 13, 13, 13, 14, 13, 12, 12,
+ 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7,
+ 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5,
+ 5, 5 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 16, 14, 11, 9,
+ /* Size 8x8 */
+ 33, 27, 22, 20, 18, 16, 15, 14, 27, 22, 22, 22, 20, 18, 17, 15, 22, 22,
+ 19, 18, 17, 16, 15, 15, 20, 22, 18, 16, 14, 13, 14, 14, 18, 20, 17, 14,
+ 12, 12, 12, 12, 16, 18, 16, 13, 12, 11, 11, 11, 15, 17, 15, 14, 12, 11,
+ 10, 10, 14, 15, 15, 14, 12, 11, 10, 9,
+ /* Size 16x16 */
+ 32, 34, 31, 25, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 34, 32,
+ 29, 24, 22, 23, 22, 21, 20, 18, 18, 17, 16, 15, 15, 14, 31, 29, 26, 23,
+ 22, 23, 22, 21, 20, 18, 18, 17, 17, 16, 16, 15, 25, 24, 23, 21, 20, 21,
+ 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 21, 22, 22, 20, 19, 19, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 23, 21, 19, 18, 17, 17, 16, 15,
+ 15, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 17, 16, 15, 14, 14, 14,
+ 14, 14, 14, 14, 19, 21, 21, 20, 19, 17, 16, 14, 14, 13, 13, 13, 13, 13,
+ 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12,
+ 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 18,
+ 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17,
+ 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 15, 16, 17, 17, 16, 15,
+ 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13,
+ 12, 12, 11, 10, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 9, 9, 9, 13, 14, 15, 15, 16, 15, 14, 13, 12, 12, 11, 10, 10,
+ 9, 9, 9,
+ /* Size 32x32 */
+ 32, 33, 34, 32, 31, 28, 25, 23, 21, 21, 21, 20, 20, 20, 19, 18, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 33, 31,
+ 30, 27, 24, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 34, 33, 32, 31, 29, 26, 24, 23,
+ 22, 23, 23, 23, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 15, 14, 14, 14, 32, 31, 31, 29, 28, 25, 24, 23, 22, 22, 23, 22,
+ 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15,
+ 15, 15, 31, 30, 29, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20,
+ 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 28, 27,
+ 26, 25, 24, 22, 22, 22, 21, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 25, 24, 24, 24, 23, 22,
+ 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 17, 16, 16, 16, 15, 15, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16,
+ 16, 16, 16, 16, 21, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 21, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 23, 23,
+ 23, 23, 21, 20, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 22, 23, 22, 22, 22, 21, 20,
+ 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15,
+ 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 20, 20, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15,
+ 15, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 19, 20,
+ 21, 21, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 21,
+ 20, 19, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13,
+ 13, 13, 13, 13, 13, 13, 13, 12, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 18, 18, 17, 16, 15, 15, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 18, 18, 18, 19, 18, 17,
+ 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 15, 16,
+ 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 16, 16, 15,
+ 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 16, 16, 15, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 14,
+ 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16,
+ 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 14, 14, 14, 15, 15, 15, 16, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14,
+ 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11,
+ 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ /* Size 4x8 */
+ 33, 22, 17, 16, 26, 23, 19, 17, 22, 18, 16, 16, 21, 17, 14, 14, 19, 16,
+ 12, 12, 17, 15, 11, 11, 16, 15, 11, 10, 15, 14, 12, 10,
+ /* Size 8x4 */
+ 33, 26, 22, 21, 19, 17, 16, 15, 22, 23, 18, 17, 16, 15, 15, 14, 17, 19,
+ 16, 14, 12, 11, 11, 12, 16, 17, 16, 14, 12, 11, 10, 10,
+ /* Size 8x16 */
+ 32, 28, 21, 20, 18, 16, 15, 14, 34, 26, 22, 21, 20, 17, 16, 16, 31, 24,
+ 22, 22, 20, 17, 17, 16, 24, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19,
+ 18, 17, 17, 17, 21, 22, 19, 17, 16, 15, 16, 16, 20, 22, 19, 16, 15, 14,
+ 14, 15, 19, 21, 19, 15, 14, 13, 13, 14, 18, 20, 18, 15, 13, 12, 13, 13,
+ 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 17,
+ 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14,
+ 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 13, 15, 15, 14, 12, 11,
+ 10, 9,
+ /* Size 16x8 */
+ 32, 34, 31, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 28, 26,
+ 24, 22, 21, 22, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 22, 20,
+ 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 20, 21, 22, 20, 19, 17,
+ 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 18, 20, 20, 19, 18, 16, 15, 14,
+ 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 17, 17, 15, 14, 13, 12, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 14, 13, 13, 12, 11, 10,
+ 10, 10, 10, 10, 14, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9,
+ /* Size 16x32 */
+ 32, 33, 28, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 14, 14, 33, 33,
+ 27, 24, 22, 22, 20, 20, 19, 17, 16, 16, 16, 16, 15, 15, 34, 32, 26, 24,
+ 22, 23, 21, 20, 20, 18, 17, 17, 16, 16, 16, 15, 32, 30, 25, 23, 22, 23,
+ 21, 21, 20, 18, 17, 17, 17, 16, 16, 16, 31, 28, 24, 23, 22, 22, 22, 21,
+ 20, 18, 17, 17, 17, 17, 16, 16, 28, 26, 22, 22, 22, 23, 22, 21, 20, 19,
+ 18, 18, 17, 17, 17, 16, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 17, 18,
+ 17, 17, 17, 16, 23, 23, 22, 21, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17,
+ 17, 17, 21, 22, 21, 20, 19, 19, 19, 19, 18, 17, 17, 16, 17, 16, 17, 17,
+ 21, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 23,
+ 22, 21, 19, 18, 17, 17, 16, 15, 15, 15, 16, 16, 16, 16, 21, 22, 22, 21,
+ 19, 17, 17, 16, 16, 15, 14, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17,
+ 16, 16, 15, 14, 14, 14, 14, 15, 15, 15, 20, 21, 22, 20, 19, 17, 16, 15,
+ 14, 14, 13, 14, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 15, 14, 14, 13,
+ 13, 13, 13, 14, 14, 14, 19, 20, 21, 20, 18, 16, 15, 14, 14, 13, 12, 13,
+ 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 13, 13,
+ 13, 13, 17, 19, 20, 19, 18, 16, 14, 14, 13, 12, 12, 12, 12, 12, 13, 13,
+ 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 12, 12, 12, 12, 13, 16, 18,
+ 19, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 17, 18, 18,
+ 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 12, 12, 15, 17, 18, 17, 16, 15,
+ 13, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 14, 14, 13,
+ 12, 11, 11, 11, 10, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 10, 11, 11, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11,
+ 10, 10, 10, 11, 14, 16, 16, 17, 15, 15, 14, 13, 12, 11, 11, 10, 10, 10,
+ 10, 10, 14, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10,
+ 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15,
+ 15, 16, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16,
+ 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 16, 15, 14,
+ 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 15, 15, 14, 14, 13,
+ 13, 11, 11, 10, 10, 9, 9, 9,
+ /* Size 32x16 */
+ 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 32, 30,
+ 28, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22,
+ 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 15, 15, 15, 15, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21,
+ 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16,
+ 16, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 21, 22,
+ 23, 23, 22, 23, 21, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15,
+ 15, 15, 14, 15, 15, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 22, 22,
+ 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 19, 20, 20, 21, 21, 21, 20, 19, 19, 17,
+ 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14,
+ 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13,
+ 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 12, 12, 11, 12, 12, 12, 12, 12, 11, 16, 16, 17, 17,
+ 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 11, 10,
+ 10, 10, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15,
+ 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11,
+ 10, 10, 10, 10, 9, 9, 9,
+ /* Size 4x16 */
+ 33, 21, 16, 15, 32, 23, 18, 16, 28, 22, 18, 17, 24, 21, 18, 17, 22, 19,
+ 17, 16, 23, 18, 15, 16, 22, 17, 14, 15, 20, 17, 13, 14, 20, 16, 12, 13,
+ 18, 15, 12, 12, 17, 15, 11, 11, 17, 14, 11, 11, 16, 15, 12, 10, 16, 15,
+ 12, 10, 15, 15, 12, 10, 15, 14, 12, 10,
+ /* Size 16x4 */
+ 33, 32, 28, 24, 22, 23, 22, 20, 20, 18, 17, 17, 16, 16, 15, 15, 21, 23,
+ 22, 21, 19, 18, 17, 17, 16, 15, 15, 14, 15, 15, 15, 14, 16, 18, 18, 18,
+ 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 15, 16, 17, 17, 16, 16,
+ 15, 14, 13, 12, 11, 11, 10, 10, 10, 10,
+ /* Size 8x32 */
+ 32, 28, 21, 20, 18, 16, 15, 14, 33, 27, 22, 20, 19, 16, 16, 15, 34, 26,
+ 22, 21, 20, 17, 16, 16, 32, 25, 22, 21, 20, 17, 17, 16, 31, 24, 22, 22,
+ 20, 17, 17, 16, 28, 22, 22, 22, 20, 18, 17, 17, 24, 22, 20, 20, 19, 17,
+ 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17,
+ 21, 22, 19, 18, 17, 16, 16, 16, 21, 22, 19, 17, 16, 15, 16, 16, 21, 22,
+ 19, 17, 16, 14, 15, 15, 20, 22, 19, 16, 15, 14, 14, 15, 20, 22, 19, 16,
+ 14, 13, 14, 14, 19, 21, 19, 15, 14, 13, 13, 14, 19, 21, 18, 15, 14, 12,
+ 13, 13, 18, 20, 18, 15, 13, 12, 13, 13, 17, 20, 18, 14, 13, 12, 12, 13,
+ 16, 19, 17, 14, 12, 11, 12, 12, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18,
+ 17, 14, 12, 11, 11, 12, 15, 18, 16, 13, 12, 11, 11, 11, 15, 17, 16, 14,
+ 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11,
+ 10, 10, 14, 16, 15, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10,
+ 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 14, 15,
+ 16, 14, 12, 11, 10, 9, 13, 15, 15, 14, 12, 11, 10, 9, 13, 15, 15, 14,
+ 13, 11, 10, 9,
+ /* Size 32x8 */
+ 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 28, 27, 26, 25,
+ 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15,
+ 16, 16, 16, 16, 15, 15, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 16,
+ 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17,
+ 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10,
+ 10, 9, 9, 9 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6,
+ /* Size 8x8 */
+ 32, 32, 28, 22, 17, 13, 11, 10, 32, 29, 26, 22, 18, 14, 12, 11, 28, 26,
+ 20, 17, 14, 12, 11, 10, 22, 22, 17, 14, 12, 10, 10, 9, 17, 18, 14, 12,
+ 10, 8, 8, 8, 13, 14, 12, 10, 8, 7, 7, 7, 11, 12, 11, 10, 8, 7, 6, 6, 10,
+ 11, 10, 9, 8, 7, 6, 5,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 33, 32,
+ 32, 31, 30, 28, 23, 20, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 31, 30,
+ 28, 26, 23, 20, 18, 16, 14, 13, 12, 12, 11, 10, 32, 31, 30, 28, 26, 24,
+ 22, 20, 18, 16, 14, 13, 13, 12, 11, 10, 28, 30, 28, 26, 21, 20, 18, 17,
+ 16, 14, 13, 12, 12, 11, 11, 10, 26, 28, 26, 24, 20, 19, 17, 16, 15, 13,
+ 12, 12, 11, 11, 10, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10,
+ 10, 10, 9, 9, 19, 20, 20, 20, 17, 16, 14, 12, 12, 11, 10, 9, 9, 9, 9, 8,
+ 17, 18, 18, 18, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16,
+ 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11,
+ 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 10, 9, 9, 8, 7, 7,
+ 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10,
+ 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 10, 10, 11, 11, 11,
+ 10, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 10, 9, 8, 8, 7, 7,
+ 6, 6, 5, 5, 5,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32,
+ 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 18, 17, 15, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 32, 31, 31, 30,
+ 28, 28, 25, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23,
+ 23, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 10, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 18,
+ 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 32, 32,
+ 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 19, 18, 16, 16, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28,
+ 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23,
+ 22, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11,
+ 11, 11, 10, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 27,
+ 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 15, 14, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 26, 27, 28, 27, 26,
+ 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 23, 22, 19,
+ 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9,
+ 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 12, 12,
+ 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 20, 21, 20,
+ 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9,
+ 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15,
+ 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 17, 18, 18, 18, 18, 19, 18, 17, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10,
+ 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 13, 14, 15, 15,
+ 15, 16, 15, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 14, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13,
+ 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 14, 13, 13, 12, 12,
+ 12, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 6, 6,
+ 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 13, 13, 12,
+ 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8,
+ 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 10, 11, 11, 12, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 5, 5, 5, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5,
+ 5, 5, 5, 9, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 10,
+ 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5,
+ 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5,
+ /* Size 4x8 */
+ 32, 24, 15, 12, 31, 24, 16, 12, 28, 18, 13, 12, 22, 15, 11, 10, 17, 13,
+ 9, 8, 14, 11, 8, 7, 12, 11, 8, 6, 10, 10, 8, 6,
+ /* Size 8x4 */
+ 32, 31, 28, 22, 17, 14, 12, 10, 24, 24, 18, 15, 13, 11, 11, 10, 15, 16,
+ 13, 11, 9, 8, 8, 8, 12, 12, 12, 10, 8, 7, 6, 6,
+ /* Size 8x16 */
+ 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 32, 30,
+ 28, 23, 17, 14, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 28, 28, 21, 18,
+ 15, 13, 12, 12, 26, 26, 20, 17, 14, 12, 11, 11, 22, 23, 18, 15, 12, 11,
+ 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 14,
+ 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8,
+ 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11,
+ 11, 9, 8, 7, 6, 6, 9, 10, 10, 9, 8, 7, 6, 5,
+ /* Size 16x8 */
+ 32, 33, 32, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 32, 32,
+ 30, 29, 28, 26, 23, 20, 18, 16, 15, 13, 12, 12, 11, 10, 28, 29, 28, 26,
+ 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 22, 23, 23, 22, 18, 17,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 17, 15, 14, 12, 11, 10,
+ 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7,
+ 7, 7, 11, 12, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 6, 11, 11, 12,
+ 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5,
+ /* Size 16x32 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 16, 14, 13, 12, 11, 11, 11, 10, 33, 32,
+ 32, 31, 29, 24, 23, 20, 17, 15, 14, 12, 12, 12, 11, 11, 33, 32, 32, 31,
+ 29, 25, 23, 21, 17, 15, 14, 13, 12, 12, 11, 11, 33, 32, 31, 31, 29, 25,
+ 23, 21, 17, 16, 14, 13, 12, 12, 12, 11, 32, 32, 30, 30, 28, 24, 23, 20,
+ 17, 16, 14, 13, 13, 12, 12, 11, 32, 31, 29, 28, 27, 24, 23, 21, 18, 16,
+ 15, 13, 13, 12, 12, 12, 32, 31, 29, 28, 26, 23, 22, 20, 17, 16, 14, 13,
+ 13, 13, 12, 12, 30, 30, 28, 27, 24, 21, 20, 19, 16, 15, 14, 13, 12, 13,
+ 12, 12, 28, 30, 28, 26, 21, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12,
+ 27, 28, 26, 25, 21, 18, 18, 16, 14, 13, 13, 12, 12, 12, 11, 11, 26, 28,
+ 26, 24, 20, 18, 17, 16, 14, 13, 12, 11, 11, 11, 11, 11, 23, 25, 24, 23,
+ 19, 16, 16, 14, 13, 12, 11, 11, 11, 11, 11, 10, 22, 23, 23, 22, 18, 16,
+ 15, 14, 12, 11, 11, 10, 10, 10, 10, 10, 21, 22, 22, 21, 18, 15, 14, 13,
+ 12, 11, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 14, 14, 12, 11, 10,
+ 10, 9, 9, 10, 9, 10, 18, 19, 19, 19, 16, 14, 13, 12, 10, 10, 9, 9, 9, 9,
+ 9, 9, 17, 18, 18, 18, 16, 13, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17,
+ 17, 17, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 12,
+ 12, 11, 9, 9, 8, 8, 8, 8, 8, 8, 13, 15, 15, 15, 13, 12, 11, 10, 9, 8, 8,
+ 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 8,
+ 12, 14, 14, 14, 13, 11, 11, 10, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13,
+ 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8,
+ 8, 7, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6,
+ 6, 11, 12, 12, 12, 11, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 12, 12,
+ 12, 11, 11, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 9, 9,
+ 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+ 6, 6, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 9, 10, 10,
+ 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 8,
+ 7, 7, 6, 6, 5, 5, 5,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 33, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 18, 17, 16, 15, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 32, 32, 32, 31, 30, 29, 29, 28,
+ 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12,
+ 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23,
+ 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16,
+ 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 23, 24,
+ 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12,
+ 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23,
+ 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16,
+ 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8,
+ 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 7, 7, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11,
+ 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13,
+ 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7,
+ 7, 7, 7, 6, 6, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11,
+ 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5,
+ 5, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8,
+ 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5,
+ /* Size 4x16 */
+ 33, 23, 14, 11, 32, 25, 15, 12, 32, 24, 16, 12, 31, 23, 16, 13, 30, 19,
+ 14, 12, 28, 18, 13, 11, 23, 16, 11, 10, 21, 14, 10, 10, 18, 13, 10, 9,
+ 16, 12, 9, 8, 14, 11, 8, 7, 13, 11, 8, 7, 12, 11, 8, 6, 12, 11, 8, 6,
+ 11, 10, 8, 6, 10, 9, 7, 6,
+ /* Size 16x4 */
+ 33, 32, 32, 31, 30, 28, 23, 21, 18, 16, 14, 13, 12, 12, 11, 10, 23, 25,
+ 24, 23, 19, 18, 16, 14, 13, 12, 11, 11, 11, 11, 10, 9, 14, 15, 16, 16,
+ 14, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 7, 11, 12, 12, 13, 12, 11, 10, 10,
+ 9, 8, 7, 7, 6, 6, 6, 6,
+ /* Size 8x32 */
+ 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 32,
+ 29, 23, 17, 14, 12, 11, 33, 31, 29, 23, 17, 14, 12, 12, 32, 30, 28, 23,
+ 17, 14, 13, 12, 32, 29, 27, 23, 18, 15, 13, 12, 32, 29, 26, 22, 17, 14,
+ 13, 12, 30, 28, 24, 20, 16, 14, 12, 12, 28, 28, 21, 18, 15, 13, 12, 12,
+ 27, 26, 21, 18, 14, 13, 12, 11, 26, 26, 20, 17, 14, 12, 11, 11, 23, 24,
+ 19, 16, 13, 11, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 21, 22, 18, 14,
+ 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 18, 19, 16, 13, 10, 9, 9,
+ 9, 17, 18, 16, 13, 10, 9, 9, 9, 16, 17, 15, 12, 10, 9, 8, 8, 14, 16, 14,
+ 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7,
+ 12, 14, 13, 11, 8, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 12, 13, 12, 10,
+ 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 11, 12, 11, 10, 9, 7, 6, 6, 10,
+ 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7,
+ 6, 6, 10, 11, 11, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9,
+ 8, 7, 6, 5,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 32, 32, 32, 31,
+ 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 28, 29, 29, 29, 28, 27, 26, 24,
+ 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 11, 10, 10, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 9, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15,
+ 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12,
+ 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6,
+ 6, 6, 6, 6, 5, 5, 5 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10,
+ /* Size 8x8 */
+ 33, 27, 22, 21, 19, 17, 16, 15, 27, 22, 22, 22, 20, 19, 17, 16, 22, 22,
+ 19, 19, 18, 16, 16, 16, 21, 22, 19, 17, 15, 14, 14, 14, 19, 20, 18, 15,
+ 13, 12, 12, 12, 17, 19, 16, 14, 12, 11, 11, 11, 16, 17, 16, 14, 12, 11,
+ 10, 10, 15, 16, 16, 14, 12, 11, 10, 9,
+ /* Size 16x16 */
+ 32, 34, 31, 27, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 34, 33,
+ 29, 25, 22, 22, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 31, 29, 26, 23,
+ 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 15, 27, 25, 23, 22, 21, 21,
+ 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 21, 22, 22, 21, 19, 19, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 21, 22, 22, 21, 19, 19, 18, 18, 17, 17,
+ 16, 16, 15, 16, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 14, 20, 21, 22, 21, 19, 18, 16, 16, 15, 14, 14, 13, 14, 13,
+ 13, 13, 19, 20, 20, 20, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13,
+ 17, 19, 19, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18,
+ 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 11, 16, 17, 18, 18,
+ 17, 16, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 15,
+ 14, 14, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13,
+ 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 13, 12,
+ 12, 11, 10, 10, 10, 9, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 11, 11,
+ 10, 10, 9, 9,
+ /* Size 32x32 */
+ 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 33, 32,
+ 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 17, 17, 17,
+ 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 34, 33, 33, 32, 29, 26, 25, 24,
+ 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 14, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23,
+ 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16,
+ 15, 15, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 22, 22, 21,
+ 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 28, 27,
+ 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 27, 26, 25, 25, 23, 22,
+ 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20,
+ 21, 21, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17,
+ 17, 16, 16, 16, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 17, 17, 16, 16, 16, 16, 16, 16,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18,
+ 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23,
+ 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 16, 16, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 22, 21,
+ 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20,
+ 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14,
+ 14, 14, 13, 13, 14, 13, 13, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21,
+ 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 19, 19, 20, 20, 20, 21, 20, 20, 18, 18,
+ 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 17, 18, 19, 19, 19, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 17, 18, 18,
+ 19, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12,
+ 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18,
+ 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 12, 11, 11, 12, 16, 17, 17, 18, 18, 19, 18, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 16, 16, 17, 17, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13,
+ 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16,
+ 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17,
+ 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16,
+ 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 15, 16,
+ 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16,
+ 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 9, 9, 14, 15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 14, 14, 14, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12,
+ 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ /* Size 4x8 */
+ 33, 22, 18, 16, 26, 23, 20, 17, 22, 19, 17, 16, 22, 17, 15, 14, 20, 16,
+ 13, 13, 17, 15, 12, 11, 16, 16, 12, 10, 16, 15, 12, 10,
+ /* Size 8x4 */
+ 33, 26, 22, 22, 20, 17, 16, 16, 22, 23, 19, 17, 16, 15, 16, 15, 18, 20,
+ 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 11, 10, 10,
+ /* Size 8x16 */
+ 32, 29, 21, 20, 18, 16, 15, 15, 34, 27, 22, 22, 20, 18, 16, 16, 31, 25,
+ 22, 22, 20, 18, 17, 16, 26, 22, 21, 22, 20, 19, 18, 17, 21, 21, 19, 19,
+ 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 20, 22, 19, 17, 16, 15,
+ 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 18, 16, 14, 13, 13, 13,
+ 17, 19, 18, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18,
+ 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14,
+ 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 13, 12,
+ 10, 10,
+ /* Size 16x8 */
+ 32, 34, 31, 26, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 29, 27,
+ 25, 22, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 16, 15, 21, 22, 22, 21,
+ 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 19, 18,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 18, 20, 20, 20, 18, 17, 16, 14,
+ 14, 13, 12, 12, 13, 13, 12, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12,
+ 12, 11, 11, 12, 11, 12, 15, 16, 17, 18, 17, 16, 14, 14, 13, 12, 11, 11,
+ 11, 10, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10,
+ 10, 10,
+ /* Size 16x32 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 18, 17, 16, 15, 15, 15, 15, 14, 33, 33,
+ 28, 26, 22, 22, 21, 20, 19, 18, 17, 16, 16, 16, 16, 15, 34, 32, 27, 26,
+ 22, 23, 22, 21, 20, 19, 18, 17, 16, 16, 16, 15, 33, 31, 27, 25, 22, 23,
+ 22, 21, 20, 19, 18, 17, 17, 17, 16, 16, 31, 28, 25, 23, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 17, 16, 16, 28, 26, 23, 22, 22, 23, 22, 22, 20, 20,
+ 19, 18, 17, 17, 17, 17, 26, 25, 22, 22, 21, 22, 22, 21, 20, 19, 19, 18,
+ 18, 17, 17, 17, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17,
+ 17, 17, 21, 22, 21, 21, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 17, 17,
+ 21, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22,
+ 22, 21, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22,
+ 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 16, 15, 20, 22, 22, 21, 19, 17,
+ 17, 16, 16, 15, 15, 14, 14, 15, 15, 15, 20, 22, 22, 21, 19, 17, 17, 16,
+ 15, 15, 14, 14, 14, 14, 15, 14, 20, 21, 22, 21, 19, 17, 16, 16, 14, 14,
+ 14, 13, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 16, 15, 14, 13, 13, 13,
+ 13, 13, 14, 14, 19, 20, 21, 20, 18, 16, 16, 15, 14, 13, 13, 13, 13, 13,
+ 13, 14, 18, 20, 20, 20, 18, 16, 16, 15, 13, 13, 12, 12, 12, 13, 13, 13,
+ 17, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 13, 17, 18,
+ 19, 19, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 19, 18,
+ 17, 15, 15, 14, 12, 12, 12, 11, 11, 12, 12, 12, 16, 17, 18, 18, 17, 15,
+ 14, 14, 12, 12, 11, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13,
+ 12, 12, 11, 11, 11, 11, 11, 12, 15, 17, 17, 18, 16, 15, 14, 13, 12, 12,
+ 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 13, 12, 11, 11,
+ 11, 10, 11, 11, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10,
+ 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10,
+ 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16,
+ 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 16,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15,
+ 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13,
+ 13, 12, 12, 11, 11, 10, 10, 9,
+ /* Size 32x16 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 32, 31,
+ 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 29, 28, 27, 27, 25, 23, 22, 22,
+ 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 15, 15, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22,
+ 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16,
+ 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22,
+ 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16,
+ 15, 15, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22,
+ 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18,
+ 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 14,
+ 14, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13,
+ 17, 18, 19, 19, 19, 20, 19, 18, 17, 17, 17, 16, 15, 15, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17,
+ 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16,
+ 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17,
+ 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 10, 10, 10, 10, 10, 10, 9,
+ /* Size 4x16 */
+ 33, 21, 17, 15, 32, 23, 19, 16, 28, 22, 19, 17, 25, 22, 19, 17, 22, 19,
+ 17, 17, 22, 18, 17, 16, 22, 17, 15, 15, 21, 17, 14, 14, 20, 16, 13, 13,
+ 19, 16, 12, 12, 18, 15, 12, 12, 17, 15, 12, 11, 17, 15, 12, 10, 16, 16,
+ 12, 10, 16, 15, 12, 10, 15, 15, 12, 10,
+ /* Size 16x4 */
+ 33, 32, 28, 25, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15, 21, 23,
+ 22, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 16, 15, 15, 17, 19, 19, 19,
+ 17, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 16,
+ 15, 14, 13, 12, 12, 11, 10, 10, 10, 10,
+ /* Size 8x32 */
+ 32, 29, 21, 20, 18, 16, 15, 15, 33, 28, 22, 21, 19, 17, 16, 16, 34, 27,
+ 22, 22, 20, 18, 16, 16, 33, 27, 22, 22, 20, 18, 17, 16, 31, 25, 22, 22,
+ 20, 18, 17, 16, 28, 23, 22, 22, 20, 19, 17, 17, 26, 22, 21, 22, 20, 19,
+ 18, 17, 24, 22, 20, 20, 19, 18, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17,
+ 21, 22, 19, 19, 18, 17, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23,
+ 19, 17, 16, 15, 15, 16, 20, 22, 19, 17, 16, 15, 14, 15, 20, 22, 19, 17,
+ 15, 14, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 19, 16, 14, 13,
+ 13, 14, 19, 21, 18, 16, 14, 13, 13, 13, 18, 20, 18, 16, 13, 12, 12, 13,
+ 17, 19, 18, 15, 13, 12, 12, 12, 17, 19, 17, 15, 13, 12, 12, 12, 16, 19,
+ 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 12, 16, 18, 17, 14,
+ 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11,
+ 11, 11, 15, 17, 16, 14, 13, 11, 10, 10, 15, 17, 16, 14, 13, 12, 10, 10,
+ 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16,
+ 16, 14, 13, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, 14, 15, 16, 14,
+ 13, 12, 11, 10,
+ /* Size 32x8 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 29, 28, 27, 27,
+ 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14,
+ 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 16, 17,
+ 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17,
+ 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 10, 10, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6,
+ /* Size 8x8 */
+ 32, 32, 29, 22, 18, 13, 12, 11, 32, 30, 28, 23, 19, 15, 13, 11, 29, 28,
+ 21, 18, 16, 13, 12, 11, 22, 23, 18, 15, 13, 11, 10, 10, 18, 19, 16, 13,
+ 11, 9, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 6, 11,
+ 11, 11, 10, 8, 7, 6, 6,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32,
+ 32, 32, 30, 27, 25, 22, 19, 17, 16, 14, 13, 12, 11, 10, 33, 32, 31, 30,
+ 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 32, 32, 30, 29, 28, 26,
+ 24, 22, 20, 18, 16, 14, 14, 13, 12, 11, 30, 30, 28, 28, 24, 22, 20, 19,
+ 17, 16, 15, 13, 12, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14,
+ 13, 12, 11, 11, 11, 10, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11,
+ 11, 10, 10, 10, 21, 22, 22, 22, 19, 17, 15, 14, 13, 12, 11, 10, 10, 10,
+ 9, 9, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 9, 8, 16, 17,
+ 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 15,
+ 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9,
+ 9, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7,
+ 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11,
+ 12, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6, 5,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 30, 28, 26, 25, 23, 21, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32,
+ 32, 32, 32, 30, 30, 29, 27, 26, 24, 22, 22, 20, 19, 18, 17, 16, 15, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 27, 26, 25, 23, 22, 20, 19, 19, 17, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 28, 27,
+ 25, 23, 23, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 33, 32, 32, 32, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20,
+ 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32,
+ 32, 32, 31, 30, 30, 28, 28, 28, 26, 26, 24, 23, 22, 21, 19, 19, 18, 17,
+ 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 30, 30,
+ 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14,
+ 14, 13, 13, 12, 12, 12, 11, 11, 30, 30, 31, 31, 29, 28, 28, 26, 25, 24,
+ 23, 22, 22, 20, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 12, 11, 11, 30, 30, 30, 30, 28, 28, 28, 25, 24, 23, 22, 21, 20, 19,
+ 19, 18, 17, 17, 16, 15, 15, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11,
+ 28, 29, 30, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 26, 27, 27, 28,
+ 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12,
+ 12, 12, 11, 12, 11, 11, 11, 11, 10, 10, 25, 26, 26, 27, 26, 26, 25, 22,
+ 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 22, 20, 19, 18, 17,
+ 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 21, 22, 23, 23, 23, 23, 23, 20, 19, 18, 17, 17, 16, 15, 14, 13,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 21, 22,
+ 22, 23, 22, 22, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 21,
+ 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9,
+ 9, 9, 9, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14,
+ 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 17, 18,
+ 19, 19, 19, 19, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 8, 9, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 18, 16, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 15, 16, 16, 16, 16, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11,
+ 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16,
+ 16, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 8, 8, 7, 8, 7, 7, 7, 13, 13, 14, 14, 14, 15, 15, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14,
+ 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12,
+ 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12,
+ 12, 13, 13, 13, 13, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12,
+ 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6,
+ 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 6, 6, 10, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11,
+ 11, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 6, 6, 6, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 10, 10, 10, 11,
+ 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 6, 6, 5, 5, 5,
+ /* Size 4x8 */
+ 32, 27, 17, 12, 32, 26, 18, 13, 30, 20, 15, 12, 23, 17, 12, 10, 19, 15,
+ 10, 9, 14, 12, 9, 8, 12, 12, 8, 7, 11, 10, 8, 6,
+ /* Size 8x4 */
+ 32, 32, 30, 23, 19, 14, 12, 11, 27, 26, 20, 17, 15, 12, 12, 10, 17, 18,
+ 15, 12, 10, 9, 8, 8, 12, 13, 12, 10, 9, 8, 7, 6,
+ /* Size 8x16 */
+ 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 25, 19, 14, 13, 12, 32, 31,
+ 28, 24, 19, 14, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 28, 23, 20,
+ 17, 14, 13, 12, 26, 26, 20, 18, 15, 12, 12, 11, 23, 24, 19, 16, 14, 11,
+ 11, 11, 21, 22, 18, 15, 13, 11, 10, 10, 18, 19, 16, 14, 11, 9, 9, 9, 16,
+ 17, 15, 13, 11, 9, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9,
+ 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, 10, 12,
+ 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6,
+ /* Size 16x8 */
+ 32, 33, 32, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 32, 32,
+ 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 28, 29, 28, 27,
+ 23, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 11, 23, 25, 24, 24, 20, 18,
+ 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 18, 19, 19, 20, 17, 15, 14, 13,
+ 11, 11, 10, 9, 9, 9, 8, 9, 13, 14, 14, 15, 14, 12, 11, 11, 9, 9, 8, 8,
+ 7, 8, 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11,
+ 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 16x32 */
+ 32, 33, 32, 32, 28, 26, 23, 19, 18, 16, 13, 13, 12, 11, 11, 11, 33, 32,
+ 32, 32, 29, 27, 24, 20, 19, 17, 14, 13, 12, 12, 12, 11, 33, 32, 32, 32,
+ 29, 27, 25, 20, 19, 17, 14, 14, 13, 12, 12, 11, 33, 32, 32, 31, 30, 28,
+ 25, 21, 19, 17, 14, 14, 13, 12, 12, 12, 32, 32, 31, 30, 28, 26, 24, 20,
+ 19, 17, 14, 14, 13, 13, 12, 12, 32, 32, 30, 30, 28, 26, 24, 21, 19, 18,
+ 15, 14, 13, 13, 12, 12, 32, 31, 30, 29, 27, 26, 24, 21, 20, 18, 15, 15,
+ 13, 13, 12, 12, 30, 30, 29, 28, 24, 23, 21, 19, 18, 16, 14, 14, 13, 13,
+ 13, 12, 30, 30, 28, 28, 23, 22, 20, 18, 17, 16, 14, 13, 13, 12, 12, 12,
+ 28, 30, 28, 27, 21, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 12, 26, 28,
+ 26, 26, 20, 19, 18, 16, 15, 14, 12, 12, 12, 12, 11, 12, 26, 27, 26, 25,
+ 20, 19, 17, 15, 15, 14, 12, 12, 11, 11, 11, 11, 23, 25, 24, 24, 19, 18,
+ 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 22, 23, 23, 22, 18, 17, 16, 14,
+ 13, 12, 11, 11, 10, 10, 10, 10, 21, 22, 22, 22, 18, 17, 15, 13, 13, 12,
+ 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 16, 14, 12, 12, 11, 10, 10,
+ 9, 9, 10, 9, 18, 19, 19, 19, 16, 15, 14, 12, 11, 11, 9, 9, 9, 9, 9, 9,
+ 17, 19, 19, 19, 16, 15, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17,
+ 18, 15, 14, 13, 11, 11, 10, 9, 9, 8, 8, 8, 9, 15, 16, 17, 17, 14, 13,
+ 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9,
+ 8, 8, 8, 8, 8, 8, 13, 14, 14, 15, 13, 12, 11, 10, 9, 9, 8, 8, 7, 8, 8,
+ 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 14, 14,
+ 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 11,
+ 9, 9, 8, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 12, 10, 9, 9, 8, 8, 7, 7,
+ 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 11, 12,
+ 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 10, 12, 12, 12, 12, 11,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 10, 9, 9, 8, 7,
+ 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6,
+ 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 28, 27, 25, 23, 22, 21, 19, 19, 17, 16, 16, 14,
+ 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 29,
+ 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 26, 25,
+ 24, 22, 22, 20, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 26, 27,
+ 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13,
+ 13, 12, 12, 12, 11, 12, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24,
+ 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 10, 10, 9,
+ 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 16, 17, 17, 17, 17,
+ 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12,
+ 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 13, 13,
+ 14, 14, 14, 14, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12,
+ 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7,
+ 11, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13,
+ 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5,
+ /* Size 4x16 */
+ 33, 26, 16, 11, 32, 27, 17, 12, 32, 26, 17, 13, 31, 26, 18, 13, 30, 22,
+ 16, 12, 28, 19, 14, 12, 25, 18, 13, 11, 22, 17, 12, 10, 19, 15, 11, 9,
+ 17, 14, 10, 8, 16, 13, 9, 8, 14, 12, 9, 7, 13, 11, 8, 7, 12, 11, 8, 6,
+ 12, 11, 8, 6, 11, 10, 8, 6,
+ /* Size 16x4 */
+ 33, 32, 32, 31, 30, 28, 25, 22, 19, 17, 16, 14, 13, 12, 12, 11, 26, 27,
+ 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 16, 17, 17, 18,
+ 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 11, 12, 13, 13, 12, 12, 11,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 8x32 */
+ 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 24, 19, 14, 12, 12, 33, 32,
+ 29, 25, 19, 14, 13, 12, 33, 32, 30, 25, 19, 14, 13, 12, 32, 31, 28, 24,
+ 19, 14, 13, 12, 32, 30, 28, 24, 19, 15, 13, 12, 32, 30, 27, 24, 20, 15,
+ 13, 12, 30, 29, 24, 21, 18, 14, 13, 13, 30, 28, 23, 20, 17, 14, 13, 12,
+ 28, 28, 21, 19, 16, 13, 12, 12, 26, 26, 20, 18, 15, 12, 12, 11, 26, 26,
+ 20, 17, 15, 12, 11, 11, 23, 24, 19, 16, 14, 11, 11, 11, 22, 23, 18, 16,
+ 13, 11, 10, 10, 21, 22, 18, 15, 13, 11, 10, 10, 19, 20, 17, 14, 12, 10,
+ 9, 10, 18, 19, 16, 14, 11, 9, 9, 9, 17, 19, 16, 14, 11, 9, 9, 9, 16, 17,
+ 15, 13, 11, 9, 8, 8, 15, 17, 14, 12, 10, 8, 8, 8, 14, 16, 14, 12, 10, 8,
+ 8, 8, 13, 14, 13, 11, 9, 8, 7, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 14,
+ 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 13, 12, 10, 9, 8, 7,
+ 6, 11, 12, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 7, 7, 6, 10, 12, 12,
+ 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6,
+ 10, 11, 11, 10, 9, 8, 7, 6,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 32, 32, 32, 32,
+ 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24,
+ 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17,
+ 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 13, 14, 14, 14, 14,
+ 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7,
+ 8, 8, 7, 7, 7, 7, 8, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11,
+ 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12,
+ 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7,
+ 7, 7, 6, 6, 6, 6, 6, 6, 6 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10,
+ /* Size 8x8 */
+ 33, 28, 22, 21, 20, 17, 16, 15, 28, 24, 22, 22, 21, 19, 17, 16, 22, 22,
+ 19, 19, 19, 17, 16, 16, 21, 22, 19, 17, 16, 15, 14, 14, 20, 21, 19, 16,
+ 14, 13, 13, 13, 17, 19, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 12,
+ 11, 10, 15, 16, 16, 14, 13, 12, 10, 10,
+ /* Size 16x16 */
+ 32, 34, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 15, 15, 15, 14, 34, 33,
+ 29, 26, 23, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 15, 31, 29, 26, 24,
+ 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 28, 26, 24, 22, 22, 22,
+ 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 23, 23, 22, 22, 20, 20, 20, 20,
+ 19, 19, 18, 17, 17, 17, 16, 17, 21, 22, 22, 22, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 16, 16, 16, 21, 22, 23, 23, 20, 19, 18, 17, 17, 16, 16, 15,
+ 15, 15, 15, 15, 20, 22, 22, 22, 20, 18, 17, 17, 16, 15, 15, 14, 14, 14,
+ 14, 14, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13,
+ 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 19,
+ 19, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 16, 17, 18, 19,
+ 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 12,
+ 12, 11, 11, 10, 10, 10, 14, 15, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11,
+ 11, 10, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 34, 34, 31, 29, 28, 25, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 33, 33, 33, 33,
+ 30, 28, 27, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 34, 33, 33, 33, 29, 28, 26, 24,
+ 23, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 34, 33, 33, 32, 29, 28, 26, 24, 23, 22, 23, 23,
+ 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 31, 30, 29, 29, 26, 25, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22,
+ 21, 21, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 29, 28,
+ 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 28, 27, 26, 26, 24, 23,
+ 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 25, 24, 24, 24, 23, 22, 22, 21, 21, 20,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17,
+ 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23,
+ 22, 22, 22, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21,
+ 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21,
+ 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15,
+ 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22,
+ 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14,
+ 13, 13, 14, 13, 13, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19,
+ 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19,
+ 19, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 19, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 11, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17,
+ 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12,
+ 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18,
+ 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10,
+ 10, 10, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+ 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 15, 15, 16, 16,
+ 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10,
+ 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9,
+ /* Size 4x8 */
+ 33, 22, 19, 16, 27, 22, 20, 17, 22, 19, 18, 17, 22, 18, 16, 14, 20, 17,
+ 14, 13, 18, 16, 12, 12, 17, 16, 12, 11, 16, 15, 12, 10,
+ /* Size 8x4 */
+ 33, 27, 22, 22, 20, 18, 17, 16, 22, 22, 19, 18, 17, 16, 16, 15, 19, 20,
+ 18, 16, 14, 12, 12, 12, 16, 17, 17, 14, 13, 12, 11, 10,
+ /* Size 8x16 */
+ 32, 30, 21, 21, 19, 16, 15, 15, 33, 28, 22, 22, 20, 18, 17, 16, 31, 26,
+ 22, 22, 21, 18, 17, 17, 28, 23, 22, 23, 21, 19, 18, 17, 23, 22, 20, 20,
+ 19, 17, 17, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 23, 19, 18, 17, 15,
+ 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 13, 13, 13,
+ 18, 20, 18, 16, 14, 12, 12, 13, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18,
+ 17, 15, 13, 12, 11, 12, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 14,
+ 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12,
+ 11, 10,
+ /* Size 16x8 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 30, 28,
+ 26, 23, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22,
+ 20, 19, 19, 19, 19, 18, 18, 17, 16, 16, 16, 16, 21, 22, 22, 23, 20, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 15, 15, 19, 20, 21, 21, 19, 18, 17, 16,
+ 15, 14, 14, 13, 13, 13, 13, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12,
+ 12, 12, 11, 12, 12, 12, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10,
+ 10, 10,
+ /* Size 16x32 */
+ 32, 33, 30, 28, 21, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 15, 33, 33,
+ 29, 27, 22, 22, 22, 20, 20, 19, 17, 17, 16, 16, 16, 16, 33, 32, 28, 26,
+ 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 16, 34, 32, 28, 26, 22, 23,
+ 23, 21, 21, 20, 18, 18, 17, 17, 17, 16, 31, 28, 26, 24, 22, 22, 22, 22,
+ 21, 20, 18, 18, 17, 17, 17, 16, 29, 27, 24, 23, 22, 22, 23, 22, 21, 20,
+ 19, 18, 18, 17, 17, 17, 28, 26, 23, 22, 22, 22, 23, 22, 21, 20, 19, 19,
+ 18, 18, 17, 17, 24, 24, 23, 22, 20, 20, 21, 20, 20, 19, 18, 18, 17, 18,
+ 17, 17, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17,
+ 21, 22, 22, 21, 19, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 16, 21, 22,
+ 22, 22, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 16, 16, 21, 23, 22, 22,
+ 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18,
+ 18, 17, 17, 16, 15, 15, 15, 15, 15, 16, 20, 22, 22, 22, 19, 18, 17, 16,
+ 16, 16, 15, 14, 15, 14, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15,
+ 14, 14, 14, 14, 14, 15, 20, 21, 22, 22, 19, 18, 17, 16, 15, 14, 14, 14,
+ 13, 14, 14, 14, 19, 21, 21, 21, 19, 18, 17, 15, 15, 14, 13, 13, 13, 13,
+ 13, 14, 19, 20, 21, 21, 19, 17, 17, 15, 15, 14, 13, 13, 13, 13, 13, 13,
+ 18, 20, 20, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 13, 13, 17, 19,
+ 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 17, 19, 19, 20,
+ 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 16, 15, 14, 13, 12,
+ 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 16, 16, 15, 13, 13, 12, 11, 11,
+ 11, 11, 11, 11, 15, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11,
+ 11, 11, 15, 17, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 11,
+ 15, 16, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16,
+ 16, 17, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17,
+ 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15,
+ 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 16, 16, 15, 15, 13,
+ 13, 12, 12, 11, 11, 10, 10, 10,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 33, 33, 32, 32,
+ 28, 27, 26, 24, 23, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 30, 29, 28, 28, 26, 24, 23, 23,
+ 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17,
+ 17, 17, 16, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22,
+ 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 17, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22,
+ 22, 23, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 21, 22, 22, 23, 22, 23,
+ 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15,
+ 15, 15, 14, 14, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 20, 20, 19,
+ 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 14,
+ 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16,
+ 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12,
+ 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16,
+ 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 10, 10, 10,
+ /* Size 4x16 */
+ 33, 21, 18, 15, 32, 22, 19, 16, 28, 22, 20, 17, 26, 22, 20, 18, 23, 20,
+ 19, 17, 22, 19, 17, 16, 23, 18, 16, 15, 22, 18, 15, 14, 21, 18, 14, 13,
+ 20, 17, 13, 12, 19, 17, 13, 12, 18, 16, 12, 11, 17, 16, 12, 11, 17, 16,
+ 12, 11, 16, 16, 13, 10, 16, 15, 12, 10,
+ /* Size 16x4 */
+ 33, 32, 28, 26, 23, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22,
+ 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 18, 19, 20, 20,
+ 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 13, 12, 15, 16, 17, 18, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 11, 10, 10,
+ /* Size 8x32 */
+ 32, 30, 21, 21, 19, 16, 15, 15, 33, 29, 22, 22, 20, 17, 16, 16, 33, 28,
+ 22, 22, 20, 18, 17, 16, 34, 28, 22, 23, 21, 18, 17, 17, 31, 26, 22, 22,
+ 21, 18, 17, 17, 29, 24, 22, 23, 21, 19, 18, 17, 28, 23, 22, 23, 21, 19,
+ 18, 17, 24, 23, 20, 21, 20, 18, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17,
+ 21, 22, 19, 19, 19, 17, 16, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 22,
+ 19, 18, 17, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 20, 22, 19, 17,
+ 16, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 20, 22, 19, 17, 15, 14,
+ 13, 14, 19, 21, 19, 17, 15, 13, 13, 13, 19, 21, 19, 17, 15, 13, 13, 13,
+ 18, 20, 18, 16, 14, 12, 12, 13, 17, 20, 18, 16, 14, 12, 12, 12, 17, 19,
+ 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15,
+ 13, 12, 11, 12, 16, 18, 17, 15, 13, 11, 11, 11, 16, 17, 16, 15, 13, 11,
+ 11, 11, 15, 17, 16, 15, 13, 12, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10,
+ 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16,
+ 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15,
+ 13, 12, 11, 10,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 30, 29, 28, 28,
+ 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18,
+ 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15,
+ 15, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15,
+ 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 16, 17,
+ 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 18,
+ 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7,
+ /* Size 8x8 */
+ 32, 32, 29, 24, 19, 15, 13, 11, 32, 31, 28, 24, 20, 16, 14, 12, 29, 28,
+ 22, 20, 17, 14, 13, 12, 24, 24, 20, 16, 14, 12, 11, 10, 19, 20, 17, 14,
+ 12, 10, 9, 9, 15, 16, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7,
+ 11, 12, 12, 10, 9, 8, 7, 6,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 30, 28, 25, 22, 19, 17, 16, 14, 12, 12, 11, 11, 33, 32,
+ 32, 32, 30, 29, 26, 23, 20, 19, 17, 15, 13, 13, 12, 11, 33, 32, 31, 31,
+ 29, 28, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 32, 32, 31, 29, 28, 27,
+ 25, 23, 21, 19, 18, 16, 14, 14, 13, 12, 30, 30, 29, 28, 26, 24, 22, 20,
+ 19, 18, 16, 15, 13, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16,
+ 15, 14, 13, 12, 11, 11, 25, 26, 26, 25, 22, 20, 18, 17, 15, 14, 14, 12,
+ 12, 11, 11, 11, 22, 23, 23, 23, 20, 18, 17, 15, 14, 13, 12, 11, 11, 10,
+ 10, 10, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 17,
+ 19, 19, 19, 18, 16, 14, 13, 12, 11, 10, 10, 9, 9, 9, 8, 16, 17, 17, 18,
+ 16, 15, 14, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 12,
+ 11, 10, 10, 9, 8, 8, 8, 7, 7, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9,
+ 8, 7, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6,
+ 11, 12, 12, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 11, 11, 12, 12,
+ 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 18,
+ 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 19, 18, 17, 17, 15,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 29, 27, 26, 24, 23, 23, 20, 20, 19, 17, 17, 15, 15, 14, 13, 13,
+ 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28,
+ 27, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12,
+ 12, 11, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23,
+ 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32,
+ 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 32, 32, 32, 32, 31, 30,
+ 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26,
+ 26, 24, 24, 23, 22, 22, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13,
+ 13, 12, 12, 12, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22,
+ 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12,
+ 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 17, 17,
+ 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 28, 29, 29, 30,
+ 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 24,
+ 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 11, 11, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 22, 23,
+ 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23,
+ 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11,
+ 10, 10, 10, 10, 10, 10, 9, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17,
+ 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9,
+ 9, 9, 9, 18, 19, 20, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19,
+ 19, 19, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 16, 17, 17, 18, 18, 18, 18, 17, 17, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8,
+ 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16,
+ 16, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 8, 8, 8, 8, 7, 14, 14, 15, 15, 15, 15, 16, 15, 15, 14, 14, 13,
+ 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13,
+ 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14,
+ 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7,
+ 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10,
+ 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13,
+ 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7,
+ 7, 7, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10,
+ 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12,
+ 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 7, 7, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11,
+ 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ /* Size 4x8 */
+ 32, 29, 17, 12, 32, 28, 18, 13, 30, 22, 16, 12, 25, 19, 13, 11, 20, 17,
+ 11, 9, 16, 14, 9, 8, 14, 13, 9, 7, 12, 11, 9, 7,
+ /* Size 8x4 */
+ 32, 32, 30, 25, 20, 16, 14, 12, 29, 28, 22, 19, 17, 14, 13, 11, 17, 18,
+ 16, 13, 11, 9, 9, 9, 12, 13, 12, 11, 9, 8, 7, 7,
+ /* Size 8x16 */
+ 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 30, 25, 20, 17, 13, 12, 33, 31,
+ 29, 24, 21, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 30, 29, 25, 21,
+ 19, 16, 13, 13, 28, 28, 22, 19, 17, 15, 13, 12, 25, 26, 21, 17, 15, 13,
+ 12, 11, 22, 23, 19, 16, 14, 12, 11, 10, 19, 20, 18, 14, 12, 11, 10, 9,
+ 18, 19, 17, 14, 12, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 15, 14,
+ 12, 10, 9, 8, 8, 12, 14, 13, 11, 10, 9, 7, 7, 12, 13, 12, 11, 9, 8, 7,
+ 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 30, 28, 25, 22, 19, 18, 16, 14, 12, 12, 11, 11, 33, 32,
+ 31, 30, 29, 28, 26, 23, 20, 19, 17, 15, 14, 13, 12, 12, 29, 30, 29, 28,
+ 25, 22, 21, 19, 18, 17, 16, 14, 13, 12, 12, 12, 23, 25, 24, 24, 21, 19,
+ 17, 16, 14, 14, 13, 12, 11, 11, 11, 11, 19, 20, 21, 21, 19, 17, 15, 14,
+ 12, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10,
+ 9, 9, 8, 8, 8, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7,
+ 11, 12, 13, 13, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6,
+ /* Size 16x32 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 19, 17, 16, 13, 12, 12, 11, 11, 33, 32,
+ 32, 32, 29, 29, 24, 23, 20, 17, 17, 14, 13, 12, 12, 12, 33, 32, 32, 32,
+ 30, 29, 25, 23, 20, 18, 17, 14, 13, 12, 12, 12, 33, 32, 32, 31, 30, 30,
+ 25, 23, 21, 18, 17, 14, 14, 13, 12, 12, 33, 32, 31, 30, 29, 28, 24, 23,
+ 21, 18, 17, 14, 14, 13, 13, 12, 32, 32, 31, 30, 28, 28, 24, 23, 20, 18,
+ 17, 14, 14, 13, 13, 12, 32, 31, 30, 29, 28, 27, 24, 23, 21, 18, 18, 15,
+ 14, 13, 13, 12, 32, 31, 30, 28, 26, 26, 23, 22, 20, 18, 17, 14, 14, 13,
+ 13, 13, 30, 30, 29, 28, 25, 24, 21, 20, 19, 17, 16, 14, 13, 13, 13, 13,
+ 29, 30, 28, 27, 23, 22, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 28, 30,
+ 28, 27, 22, 21, 19, 18, 17, 16, 15, 13, 13, 12, 12, 12, 26, 28, 26, 26,
+ 21, 20, 18, 17, 16, 14, 14, 12, 12, 12, 12, 11, 25, 26, 26, 25, 21, 20,
+ 17, 17, 15, 14, 13, 12, 12, 11, 11, 11, 23, 25, 24, 24, 20, 19, 16, 16,
+ 14, 13, 13, 11, 11, 11, 11, 11, 22, 23, 23, 23, 19, 18, 16, 15, 14, 12,
+ 12, 11, 11, 10, 10, 10, 21, 23, 23, 22, 19, 18, 15, 15, 13, 12, 12, 11,
+ 10, 10, 10, 10, 19, 21, 20, 20, 18, 17, 14, 14, 12, 11, 11, 10, 10, 10,
+ 9, 10, 19, 20, 20, 20, 17, 17, 14, 13, 12, 11, 11, 10, 9, 9, 9, 9, 18,
+ 19, 19, 19, 17, 16, 14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 16, 18, 18, 18,
+ 16, 15, 13, 12, 11, 10, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 13,
+ 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 16, 16, 16, 14, 14, 12, 12, 11, 9, 9,
+ 8, 8, 8, 8, 8, 14, 15, 15, 16, 14, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8,
+ 13, 14, 14, 15, 13, 13, 11, 11, 10, 9, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14,
+ 13, 13, 11, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11,
+ 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7,
+ 7, 7, 7, 12, 12, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 6, 11, 12,
+ 12, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 12, 12, 11,
+ 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8,
+ 7, 7, 6, 6, 6, 10, 11, 11, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19,
+ 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 21, 20, 19, 18, 17, 16,
+ 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30,
+ 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26,
+ 25, 24, 23, 22, 20, 20, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19,
+ 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 28, 29,
+ 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24,
+ 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14,
+ 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 17,
+ 17, 18, 18, 18, 18, 18, 18, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17,
+ 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 8, 8, 7, 7, 12, 13, 13, 14,
+ 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12,
+ 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12,
+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6,
+ /* Size 4x16 */
+ 33, 28, 17, 12, 32, 29, 18, 12, 32, 28, 18, 13, 31, 27, 18, 13, 30, 24,
+ 17, 13, 30, 21, 16, 12, 26, 20, 14, 11, 23, 18, 12, 10, 21, 17, 11, 10,
+ 19, 16, 11, 9, 17, 15, 10, 8, 15, 14, 9, 8, 14, 13, 9, 7, 13, 12, 9, 7,
+ 12, 12, 9, 7, 12, 11, 8, 6,
+ /* Size 16x4 */
+ 33, 32, 32, 31, 30, 30, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 28, 29,
+ 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 17, 18, 18, 18,
+ 17, 16, 14, 12, 11, 11, 10, 9, 9, 9, 9, 8, 12, 12, 13, 13, 13, 12, 11,
+ 10, 10, 9, 8, 8, 7, 7, 7, 6,
+ /* Size 8x32 */
+ 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 29, 24, 20, 17, 13, 12, 33, 32,
+ 30, 25, 20, 17, 13, 12, 33, 32, 30, 25, 21, 17, 14, 12, 33, 31, 29, 24,
+ 21, 17, 14, 13, 32, 31, 28, 24, 20, 17, 14, 13, 32, 30, 28, 24, 21, 18,
+ 14, 13, 32, 30, 26, 23, 20, 17, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13,
+ 29, 28, 23, 20, 17, 15, 13, 12, 28, 28, 22, 19, 17, 15, 13, 12, 26, 26,
+ 21, 18, 16, 14, 12, 12, 25, 26, 21, 17, 15, 13, 12, 11, 23, 24, 20, 16,
+ 14, 13, 11, 11, 22, 23, 19, 16, 14, 12, 11, 10, 21, 23, 19, 15, 13, 12,
+ 10, 10, 19, 20, 18, 14, 12, 11, 10, 9, 19, 20, 17, 14, 12, 11, 9, 9, 18,
+ 19, 17, 14, 12, 10, 9, 9, 16, 18, 16, 13, 11, 10, 9, 9, 16, 17, 16, 13,
+ 11, 10, 9, 8, 14, 16, 14, 12, 11, 9, 8, 8, 14, 15, 14, 12, 10, 9, 8, 8,
+ 13, 14, 13, 11, 10, 9, 8, 7, 12, 14, 13, 11, 10, 9, 7, 7, 12, 14, 13,
+ 11, 10, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7,
+ 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, 11, 12, 12, 11,
+ 9, 8, 7, 6, 10, 11, 12, 11, 9, 8, 7, 6,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19,
+ 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32,
+ 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 29, 29, 30, 30, 29, 28, 28, 26,
+ 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 12, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18,
+ 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17,
+ 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13,
+ 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7,
+ 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9,
+ 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11,
+ /* Size 8x8 */
+ 33, 30, 22, 22, 20, 18, 17, 16, 30, 26, 22, 23, 21, 19, 18, 17, 22, 22,
+ 20, 20, 19, 18, 17, 17, 22, 23, 20, 18, 17, 16, 15, 15, 20, 21, 19, 17,
+ 15, 14, 13, 13, 18, 19, 18, 16, 14, 12, 12, 12, 17, 18, 17, 15, 13, 12,
+ 11, 11, 16, 17, 17, 15, 13, 12, 11, 10,
+ /* Size 16x16 */
+ 32, 33, 31, 28, 25, 21, 21, 20, 20, 19, 18, 17, 16, 15, 15, 15, 33, 33,
+ 30, 26, 24, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 31, 30, 28, 24,
+ 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 28, 26, 24, 22, 22, 21,
+ 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 25, 24, 23, 22, 21, 20, 21, 20,
+ 20, 20, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19,
+ 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16,
+ 16, 15, 15, 15, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 16, 15, 15, 14,
+ 14, 14, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 14, 14,
+ 19, 20, 21, 21, 20, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19,
+ 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19,
+ 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17,
+ 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14,
+ 13, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13,
+ 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 13, 12, 12,
+ 11, 11, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 33, 33, 33, 33,
+ 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 33, 33, 33, 33, 30, 29, 26, 26,
+ 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23,
+ 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 16, 16, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30,
+ 29, 29, 27, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 28, 27, 26, 26, 24, 24,
+ 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19,
+ 19, 19, 18, 18, 17, 17, 17, 17, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21,
+ 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18,
+ 18, 17, 17, 17, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21,
+ 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17,
+ 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22,
+ 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+ 15, 15, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21,
+ 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 15, 14, 14, 14, 15, 20, 21, 22, 22, 22, 22,
+ 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 21, 20, 19,
+ 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14,
+ 14, 13, 14, 14, 19, 20, 20, 21, 21, 21, 22, 21, 20, 19, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 19, 20, 20, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20,
+ 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13,
+ 13, 13, 12, 12, 12, 13, 12, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17,
+ 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 11, 16, 17, 17, 18, 18, 18,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17,
+ 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17,
+ 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14,
+ 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ /* Size 4x8 */
+ 33, 22, 19, 16, 28, 22, 20, 17, 22, 20, 19, 17, 23, 19, 16, 15, 21, 19,
+ 14, 13, 19, 18, 13, 12, 17, 17, 13, 11, 16, 16, 13, 11,
+ /* Size 8x4 */
+ 33, 28, 22, 23, 21, 19, 17, 16, 22, 22, 20, 19, 19, 18, 17, 16, 19, 20,
+ 19, 16, 14, 13, 13, 13, 16, 17, 17, 15, 13, 12, 11, 11,
+ /* Size 8x16 */
+ 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 21, 19, 17, 16, 31, 28,
+ 22, 23, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 24, 23, 21, 21,
+ 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 17, 17,
+ 16, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14,
+ 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19,
+ 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15,
+ 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 16, 17, 15, 14, 12,
+ 11, 10,
+ /* Size 16x8 */
+ 32, 33, 31, 28, 24, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 31, 30,
+ 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 23, 23, 22, 22,
+ 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 23, 23, 21, 19,
+ 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 21, 22, 22, 20, 19, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 14, 18, 19, 20, 20, 19, 18, 17, 16, 14, 14,
+ 13, 13, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 14, 14, 13, 12, 12,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11,
+ 11, 10,
+ /* Size 16x32 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 15, 33, 33,
+ 30, 27, 23, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 16, 33, 32, 30, 26,
+ 23, 22, 22, 22, 21, 20, 19, 17, 17, 17, 16, 16, 34, 32, 29, 26, 23, 22,
+ 23, 22, 21, 20, 20, 18, 18, 17, 17, 17, 31, 29, 28, 24, 22, 22, 23, 22,
+ 22, 20, 20, 18, 18, 17, 17, 17, 31, 28, 27, 24, 22, 22, 22, 22, 22, 20,
+ 20, 18, 18, 17, 17, 17, 28, 26, 24, 22, 22, 22, 23, 22, 22, 21, 20, 19,
+ 19, 18, 17, 17, 26, 25, 24, 22, 21, 21, 22, 22, 21, 20, 20, 19, 18, 18,
+ 18, 17, 24, 24, 23, 22, 21, 20, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 18, 17, 17, 17, 17, 17, 21, 22,
+ 22, 21, 20, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 17, 21, 22, 22, 22,
+ 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 22, 22, 20, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 16, 21, 23, 23, 22, 20, 19, 18, 17,
+ 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16,
+ 16, 15, 14, 15, 14, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 15, 14,
+ 14, 14, 14, 14, 20, 21, 22, 22, 19, 19, 17, 16, 16, 15, 14, 14, 14, 14,
+ 14, 14, 19, 21, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 14, 13,
+ 19, 20, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 20,
+ 20, 20, 19, 18, 16, 16, 15, 14, 13, 13, 12, 13, 13, 13, 18, 20, 20, 20,
+ 19, 18, 16, 16, 15, 14, 13, 12, 12, 12, 12, 13, 17, 19, 19, 20, 18, 18,
+ 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15,
+ 14, 13, 13, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 17, 15, 15, 14, 13,
+ 12, 12, 11, 11, 12, 12, 16, 18, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11,
+ 11, 11, 11, 12, 16, 17, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, 11, 11,
+ 11, 11, 16, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11,
+ 15, 17, 17, 18, 17, 16, 15, 15, 13, 13, 12, 11, 11, 11, 11, 11, 15, 17,
+ 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 15, 16, 17, 17,
+ 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 15, 15, 14,
+ 14, 12, 12, 11, 11, 10, 10, 10,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 33, 33, 32, 32,
+ 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 31, 30, 30, 29, 28, 27, 24, 24,
+ 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17,
+ 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 23, 22,
+ 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 15,
+ 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14,
+ 18, 19, 20, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14,
+ 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 12, 18, 19, 19, 20,
+ 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11,
+ 11, 11, 12, 12, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16,
+ 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 15, 16,
+ 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17,
+ 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10,
+ /* Size 4x16 */
+ 33, 21, 18, 15, 32, 22, 20, 17, 29, 22, 20, 17, 26, 22, 21, 18, 24, 20,
+ 19, 17, 22, 19, 18, 16, 23, 19, 17, 16, 22, 19, 16, 15, 21, 19, 15, 14,
+ 20, 19, 14, 13, 20, 18, 14, 12, 18, 17, 13, 12, 18, 17, 13, 11, 17, 16,
+ 12, 11, 17, 16, 13, 11, 16, 16, 13, 11,
+ /* Size 16x4 */
+ 33, 32, 29, 26, 24, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 16, 21, 22,
+ 22, 22, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 18, 20, 20, 21,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 13, 13, 15, 17, 17, 18, 17, 16,
+ 16, 15, 14, 13, 12, 12, 11, 11, 11, 11,
+ /* Size 8x32 */
+ 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 20, 19, 17, 16, 33, 30,
+ 23, 22, 21, 19, 17, 16, 34, 29, 23, 23, 21, 20, 18, 17, 31, 28, 22, 23,
+ 22, 20, 18, 17, 31, 27, 22, 22, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20,
+ 19, 17, 26, 24, 21, 22, 21, 20, 18, 18, 24, 23, 21, 21, 20, 19, 18, 17,
+ 22, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22,
+ 20, 18, 18, 17, 16, 16, 21, 22, 20, 18, 17, 17, 16, 15, 21, 23, 20, 18,
+ 17, 16, 15, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 20, 17, 16, 15,
+ 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 14,
+ 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 13, 18, 20,
+ 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 17, 19, 18, 16,
+ 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 12, 16, 18, 17, 15, 14, 12,
+ 11, 11, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11,
+ 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17,
+ 17, 15, 13, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10, 15, 16, 17, 15,
+ 14, 12, 11, 10,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 31, 30, 30, 29,
+ 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 23, 23, 23, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19,
+ 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18,
+ 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 10, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7,
+ /* Size 8x8 */
+ 32, 32, 30, 26, 20, 17, 13, 12, 32, 31, 29, 26, 21, 17, 14, 13, 30, 29,
+ 26, 22, 19, 16, 14, 13, 26, 26, 22, 18, 16, 14, 12, 11, 20, 21, 19, 16,
+ 13, 11, 10, 10, 17, 17, 16, 14, 11, 10, 9, 8, 13, 14, 14, 12, 10, 9, 8,
+ 7, 12, 13, 13, 11, 10, 8, 7, 7,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 11, 33, 32,
+ 32, 32, 31, 29, 27, 24, 22, 20, 18, 16, 15, 13, 13, 12, 33, 32, 32, 31,
+ 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 32, 32, 31, 30, 28, 28,
+ 26, 24, 23, 21, 19, 17, 16, 14, 14, 13, 31, 31, 30, 28, 27, 24, 23, 22,
+ 20, 19, 18, 16, 15, 14, 13, 13, 28, 29, 29, 28, 24, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 12, 26, 27, 27, 26, 23, 20, 19, 18, 17, 16, 15, 14,
+ 13, 12, 12, 11, 23, 24, 25, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11,
+ 11, 11, 21, 22, 23, 23, 20, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 10,
+ 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 17, 18,
+ 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 16, 17, 17, 16,
+ 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12,
+ 11, 10, 10, 9, 8, 8, 8, 7, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8,
+ 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 11,
+ 12, 12, 13, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 31, 30, 28, 28, 26, 25, 23, 22, 21, 20,
+ 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 20, 18, 18, 17,
+ 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 29, 29, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 14,
+ 13, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30,
+ 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24,
+ 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32,
+ 32, 32, 31, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19,
+ 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 32, 32, 32, 32, 31, 31,
+ 30, 29, 28, 28, 28, 27, 26, 26, 24, 23, 23, 21, 21, 19, 19, 18, 17, 16,
+ 16, 15, 14, 14, 14, 13, 13, 12, 32, 32, 32, 32, 31, 30, 29, 29, 28, 28,
+ 27, 27, 26, 25, 24, 23, 22, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14,
+ 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23,
+ 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13,
+ 30, 30, 30, 31, 30, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 21, 20, 19,
+ 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 28, 29, 29, 30,
+ 29, 28, 28, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 28, 29, 29, 30, 29, 28, 27, 27,
+ 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 26, 27, 27, 28, 27, 26, 26, 26, 23, 23, 20, 20,
+ 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 25, 26, 26, 27, 26, 26, 26, 25, 23, 22, 20, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 23, 24,
+ 24, 25, 25, 24, 24, 24, 22, 22, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 24, 24, 23,
+ 23, 23, 21, 21, 19, 19, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 20, 20,
+ 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 20, 20, 21, 21, 21, 21, 21, 21, 20, 19, 17, 17, 16, 16,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9,
+ 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 18, 18, 19, 19, 19,
+ 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12,
+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 16, 16, 16, 17,
+ 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 8, 8, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 12, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 13, 13, 14, 14, 14,
+ 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+ 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12,
+ 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7,
+ 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 11, 12, 12, 12, 12,
+ 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 7, 7, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11,
+ 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6,
+ /* Size 4x8 */
+ 32, 29, 20, 13, 32, 28, 20, 14, 30, 24, 19, 14, 27, 20, 15, 12, 21, 17,
+ 13, 10, 17, 15, 11, 9, 14, 13, 10, 8, 13, 12, 9, 7,
+ /* Size 8x4 */
+ 32, 32, 30, 27, 21, 17, 14, 13, 29, 28, 24, 20, 17, 15, 13, 12, 20, 20,
+ 19, 15, 13, 11, 10, 9, 13, 14, 14, 12, 10, 9, 8, 7,
+ /* Size 8x16 */
+ 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32,
+ 30, 27, 22, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 31, 30, 27, 23,
+ 20, 17, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 26, 27, 23, 19, 16, 14,
+ 12, 12, 23, 25, 22, 17, 15, 13, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10,
+ 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 16,
+ 14, 11, 10, 9, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8,
+ 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 11, 33, 32,
+ 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 31, 31, 30, 28,
+ 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 26, 26, 27, 26, 23, 20,
+ 19, 17, 17, 16, 15, 14, 13, 12, 11, 11, 20, 21, 22, 21, 20, 18, 16, 15,
+ 14, 13, 12, 11, 11, 10, 10, 10, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11,
+ 10, 10, 9, 9, 8, 9, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8,
+ 7, 7, 12, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7,
+ /* Size 16x32 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 20, 19, 16, 16, 13, 13, 12, 11, 33, 32,
+ 32, 32, 31, 29, 26, 24, 21, 20, 17, 16, 14, 13, 12, 12, 33, 32, 32, 32,
+ 31, 29, 26, 24, 21, 20, 17, 17, 14, 13, 12, 12, 33, 32, 32, 31, 31, 30,
+ 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 33, 32, 32, 31, 30, 29, 27, 25,
+ 22, 21, 17, 17, 14, 14, 13, 13, 32, 32, 31, 30, 29, 28, 26, 24, 21, 20,
+ 17, 17, 14, 14, 13, 13, 32, 32, 31, 29, 28, 28, 26, 24, 21, 21, 18, 17,
+ 15, 14, 13, 13, 32, 31, 31, 29, 28, 27, 25, 24, 21, 21, 18, 17, 15, 15,
+ 14, 13, 31, 31, 30, 28, 27, 25, 23, 22, 20, 19, 17, 16, 14, 14, 13, 13,
+ 30, 30, 30, 28, 26, 24, 23, 21, 19, 19, 16, 16, 14, 14, 13, 12, 28, 30,
+ 29, 27, 24, 21, 20, 19, 18, 17, 15, 15, 13, 13, 12, 12, 28, 29, 29, 27,
+ 24, 21, 20, 19, 17, 17, 15, 15, 13, 13, 12, 12, 26, 28, 27, 26, 23, 20,
+ 19, 18, 16, 16, 14, 14, 12, 12, 12, 12, 26, 27, 26, 25, 23, 20, 18, 17,
+ 16, 15, 14, 13, 12, 12, 11, 11, 23, 25, 25, 24, 22, 19, 17, 16, 15, 14,
+ 13, 13, 11, 11, 11, 11, 22, 24, 24, 23, 21, 19, 17, 16, 14, 14, 12, 12,
+ 11, 11, 11, 10, 21, 23, 23, 22, 20, 18, 17, 15, 14, 13, 12, 12, 11, 10,
+ 10, 10, 20, 21, 21, 21, 20, 17, 16, 15, 13, 13, 11, 11, 10, 10, 10, 10,
+ 19, 21, 21, 20, 19, 17, 16, 14, 13, 12, 11, 11, 10, 10, 9, 10, 18, 19,
+ 19, 19, 18, 16, 15, 14, 12, 12, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 18,
+ 16, 15, 14, 12, 12, 10, 10, 9, 9, 9, 9, 16, 17, 17, 18, 17, 15, 14, 13,
+ 12, 11, 10, 10, 9, 9, 8, 8, 16, 17, 17, 17, 16, 15, 14, 13, 11, 11, 10,
+ 10, 9, 8, 8, 8, 14, 16, 16, 16, 15, 14, 13, 12, 11, 11, 9, 9, 8, 8, 8,
+ 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 13, 14, 14,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 13, 14, 14, 14, 14, 13,
+ 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 14, 13, 12, 11, 10,
+ 10, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 7,
+ 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 7, 11, 12,
+ 12, 13, 13, 12, 11, 10, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 11,
+ 11, 10, 10, 9, 9, 8, 8, 7, 7, 6,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20,
+ 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14,
+ 14, 14, 13, 13, 12, 12, 32, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27,
+ 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13,
+ 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21,
+ 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 28, 29,
+ 29, 30, 29, 28, 28, 27, 25, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 26, 26, 27, 27, 26,
+ 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 21,
+ 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16,
+ 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10,
+ 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17,
+ 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 9, 9, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8,
+ 8, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 13, 13, 13, 14, 14, 14,
+ 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12,
+ 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9,
+ 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6,
+ /* Size 4x16 */
+ 33, 28, 19, 13, 32, 29, 20, 13, 32, 29, 21, 14, 32, 28, 21, 14, 31, 25,
+ 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 10,
+ 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 8, 15, 14, 10, 8, 14, 13, 10,
+ 7, 13, 12, 9, 7, 12, 12, 9, 7,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 15, 14, 13, 12, 28, 29,
+ 29, 28, 25, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 19, 20, 21, 21,
+ 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 13, 13, 14, 14, 14, 13,
+ 12, 11, 10, 10, 9, 8, 8, 7, 7, 7,
+ /* Size 8x32 */
+ 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32,
+ 31, 26, 21, 17, 14, 12, 33, 32, 31, 27, 22, 17, 14, 13, 33, 32, 30, 27,
+ 22, 17, 14, 13, 32, 31, 29, 26, 21, 17, 14, 13, 32, 31, 28, 26, 21, 18,
+ 15, 13, 32, 31, 28, 25, 21, 18, 15, 14, 31, 30, 27, 23, 20, 17, 14, 13,
+ 30, 30, 26, 23, 19, 16, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 28, 29,
+ 24, 20, 17, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 26, 26, 23, 18,
+ 16, 14, 12, 11, 23, 25, 22, 17, 15, 13, 11, 11, 22, 24, 21, 17, 14, 12,
+ 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 20, 21, 20, 16, 13, 11, 10, 10,
+ 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 11, 9, 9, 18, 19, 18,
+ 15, 12, 10, 9, 9, 16, 17, 17, 14, 12, 10, 9, 8, 16, 17, 16, 14, 11, 10,
+ 9, 8, 14, 16, 15, 13, 11, 9, 8, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14,
+ 14, 12, 10, 9, 8, 7, 13, 14, 14, 12, 10, 9, 8, 7, 12, 14, 14, 12, 10, 8,
+ 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12,
+ 13, 11, 10, 9, 7, 7, 11, 12, 13, 11, 10, 9, 8, 7,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20,
+ 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 31, 31, 31, 31, 30, 29, 28, 28,
+ 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14,
+ 14, 14, 13, 13, 13, 13, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20,
+ 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14,
+ 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 16, 17,
+ 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11,
+ 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 13, 14, 14, 14, 14, 14, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11,
+ /* Size 8x8 */
+ 33, 30, 24, 22, 21, 19, 17, 16, 30, 26, 23, 22, 22, 20, 18, 17, 24, 23,
+ 21, 21, 20, 19, 18, 17, 22, 22, 21, 19, 18, 17, 16, 16, 21, 22, 20, 18,
+ 16, 15, 14, 14, 19, 20, 19, 17, 15, 13, 12, 12, 17, 18, 18, 16, 14, 12,
+ 12, 11, 16, 17, 17, 16, 14, 12, 11, 11,
+ /* Size 16x16 */
+ 32, 33, 33, 29, 26, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 33, 33,
+ 32, 28, 25, 22, 22, 22, 21, 21, 20, 19, 18, 17, 17, 16, 33, 32, 30, 26,
+ 24, 22, 22, 23, 22, 22, 21, 20, 19, 18, 17, 17, 29, 28, 26, 23, 22, 22,
+ 22, 23, 22, 22, 21, 20, 19, 18, 18, 17, 26, 25, 24, 22, 21, 20, 21, 21,
+ 21, 21, 20, 19, 19, 18, 17, 17, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19,
+ 19, 18, 17, 17, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17,
+ 17, 16, 16, 16, 21, 22, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15,
+ 15, 15, 20, 21, 22, 22, 21, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 19, 20,
+ 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 18, 19, 20, 20,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 17, 18, 19, 19, 19, 17,
+ 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 18, 18, 17, 16, 15,
+ 14, 14, 13, 12, 12, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13,
+ 13, 12, 12, 11, 11, 11, 15, 16, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12,
+ 12, 11, 11, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 26, 25, 21, 21, 21, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 33,
+ 32, 30, 28, 27, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 33, 33, 33, 32, 29, 28, 26,
+ 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 34, 33, 33, 32, 31, 29, 27, 26, 24, 24, 22, 22,
+ 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 33, 32, 32, 31, 30, 28, 26, 25, 24, 24, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 31, 30,
+ 29, 29, 28, 26, 25, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 29, 28, 28, 27, 26, 25,
+ 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 19, 19, 18, 18, 18, 18, 17, 17, 28, 27, 26, 26, 25, 24, 22, 22, 22, 22,
+ 21, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19,
+ 18, 18, 18, 18, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21,
+ 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 25, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22,
+ 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19,
+ 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22,
+ 22, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 23, 22, 22,
+ 22, 22, 21, 21, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14,
+ 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 20, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 14, 19, 20, 20, 21,
+ 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14,
+ 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21,
+ 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13,
+ 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18,
+ 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12,
+ 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18,
+ 18, 19, 19, 19, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19,
+ 19, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18,
+ 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11,
+ 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 16, 16, 17, 17,
+ 17, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 10, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10,
+ /* Size 4x8 */
+ 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 23, 19, 18, 16, 22, 19,
+ 16, 14, 20, 18, 15, 12, 18, 17, 14, 11, 17, 16, 13, 11,
+ /* Size 8x4 */
+ 33, 28, 24, 23, 22, 20, 18, 17, 22, 22, 20, 19, 19, 18, 17, 16, 20, 22,
+ 20, 18, 16, 15, 14, 13, 17, 18, 18, 16, 14, 12, 11, 11,
+ /* Size 8x16 */
+ 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 29,
+ 24, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21,
+ 21, 20, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 21, 19, 18, 17,
+ 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14,
+ 20, 21, 20, 18, 16, 14, 14, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20,
+ 19, 17, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16,
+ 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13,
+ 12, 11,
+ /* Size 16x8 */
+ 32, 33, 33, 29, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 32, 31,
+ 29, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 17, 26, 25, 24, 22,
+ 21, 20, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 20, 21, 22, 22, 21, 19, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 14, 18, 19, 20, 20, 20, 18, 17, 16, 15, 14,
+ 14, 13, 13, 12, 12, 13, 16, 17, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12,
+ 12, 12, 11, 12, 15, 16, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11,
+ 11, 11,
+ /* Size 16x32 */
+ 32, 33, 32, 28, 26, 21, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 33, 33,
+ 31, 27, 25, 22, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 33, 33, 31, 27,
+ 25, 22, 22, 22, 21, 21, 19, 19, 17, 17, 16, 16, 34, 32, 31, 26, 24, 22,
+ 23, 23, 22, 21, 20, 20, 18, 18, 17, 17, 33, 31, 29, 25, 24, 22, 22, 23,
+ 22, 21, 20, 20, 18, 18, 17, 17, 31, 28, 28, 24, 23, 22, 22, 22, 22, 22,
+ 20, 20, 18, 18, 17, 17, 29, 27, 26, 23, 22, 22, 22, 23, 22, 22, 20, 20,
+ 19, 18, 18, 17, 28, 26, 25, 22, 22, 22, 22, 23, 22, 22, 20, 20, 19, 19,
+ 18, 18, 25, 24, 24, 22, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 18,
+ 24, 24, 24, 22, 21, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 21, 22,
+ 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 22, 21,
+ 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 21, 22, 22, 22, 21, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 21, 23, 22, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 16, 16, 16, 15, 21, 23, 23, 22, 21, 19, 18, 18, 17, 17,
+ 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 21, 19, 18, 17, 17, 17, 16, 16,
+ 15, 15, 15, 15, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14,
+ 14, 14, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14,
+ 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 14, 14, 14, 14, 13, 14, 19, 20,
+ 21, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 19, 20, 20, 21,
+ 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 18, 20, 20, 20, 20, 18,
+ 17, 16, 15, 15, 13, 13, 12, 12, 12, 12, 18, 20, 20, 20, 19, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 12, 12, 17, 19, 19, 20, 19, 18, 17, 16, 14, 14,
+ 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 16, 14, 14, 13, 13,
+ 12, 12, 12, 12, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11,
+ 11, 11, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11,
+ 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 11, 16, 17,
+ 17, 18, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18,
+ 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16,
+ 16, 14, 14, 13, 13, 12, 12, 11, 11, 11, 15, 17, 17, 17, 17, 16, 16, 14,
+ 14, 13, 13, 12, 12, 11, 11, 10,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 32,
+ 31, 28, 27, 26, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 20,
+ 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 32, 31, 31, 31, 29, 28, 26, 25,
+ 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 28, 27, 27, 26, 25, 24, 23, 22, 22, 22, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 23, 22, 22,
+ 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17,
+ 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 23, 23, 21, 21,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 20, 20, 21, 21, 21, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20,
+ 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20,
+ 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16,
+ 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17,
+ 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 11, 10,
+ /* Size 4x16 */
+ 33, 21, 20, 16, 33, 22, 21, 17, 31, 22, 21, 18, 27, 22, 22, 18, 24, 21,
+ 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 22, 19, 16, 14,
+ 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 14, 12, 18, 17, 14, 12, 18, 17,
+ 14, 11, 17, 17, 13, 11, 17, 16, 13, 11,
+ /* Size 16x4 */
+ 33, 33, 31, 27, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 21, 22,
+ 22, 22, 21, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 20, 21, 21, 22,
+ 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 16, 17, 18, 18, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11, 11, 11,
+ /* Size 8x32 */
+ 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 31,
+ 25, 22, 21, 19, 17, 16, 34, 31, 24, 23, 22, 20, 18, 17, 33, 29, 24, 22,
+ 22, 20, 18, 17, 31, 28, 23, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20,
+ 19, 18, 28, 25, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17,
+ 24, 24, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22,
+ 20, 19, 19, 18, 17, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 22, 21, 19,
+ 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 21, 22, 21, 18, 17, 16,
+ 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 22, 21, 18, 16, 15, 14, 14,
+ 20, 21, 20, 18, 16, 14, 14, 13, 19, 21, 20, 17, 15, 14, 13, 13, 19, 20,
+ 20, 17, 15, 14, 13, 13, 18, 20, 20, 17, 15, 13, 12, 12, 18, 20, 19, 17,
+ 15, 13, 12, 12, 17, 19, 19, 17, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13,
+ 12, 12, 16, 18, 18, 16, 14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 12, 11,
+ 16, 18, 18, 16, 14, 12, 11, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17,
+ 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, 15, 17, 17, 16,
+ 14, 13, 12, 11,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 32, 31, 31, 31,
+ 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 26, 25, 25, 24, 24, 23, 22, 22,
+ 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18,
+ 18, 18, 17, 17, 17, 17, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19,
+ 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16,
+ 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19,
+ 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 16, 17, 17, 18, 18, 18,
+ 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 12, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8,
+ /* Size 8x8 */
+ 32, 32, 30, 27, 22, 18, 15, 13, 32, 31, 29, 26, 23, 19, 16, 14, 30, 29,
+ 26, 23, 20, 18, 15, 13, 27, 26, 23, 19, 17, 15, 13, 12, 22, 23, 20, 17,
+ 14, 13, 11, 10, 18, 19, 18, 15, 13, 11, 10, 9, 15, 16, 15, 13, 11, 10,
+ 9, 8, 13, 14, 13, 12, 10, 9, 8, 7,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 30, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 33, 32,
+ 32, 32, 32, 30, 29, 27, 24, 22, 20, 18, 17, 15, 13, 13, 33, 32, 32, 32,
+ 32, 31, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 33, 32, 32, 31, 30, 29,
+ 28, 26, 24, 23, 20, 19, 17, 16, 14, 14, 32, 32, 32, 30, 29, 28, 27, 26,
+ 24, 22, 21, 19, 18, 16, 15, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 20,
+ 19, 18, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13,
+ 12, 12, 23, 24, 25, 24, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11,
+ 21, 22, 23, 23, 22, 20, 18, 17, 15, 14, 13, 13, 12, 11, 11, 10, 19, 20,
+ 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 17, 18, 19, 19,
+ 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 14, 15, 16, 16, 16, 15, 14, 13, 12,
+ 11, 11, 10, 9, 9, 8, 8, 13, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9,
+ 9, 8, 8, 7, 12, 13, 14, 14, 14, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18,
+ 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, 18, 17, 17, 15,
+ 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14,
+ 14, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25,
+ 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20,
+ 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 31,
+ 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17,
+ 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28,
+ 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15,
+ 15, 14, 14, 14, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26,
+ 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14,
+ 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20,
+ 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 30, 30, 30, 31,
+ 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18,
+ 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 28, 29, 29, 30, 30, 28, 28, 27,
+ 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 13, 12, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 26, 27,
+ 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 23, 24, 24, 25, 25, 24,
+ 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22,
+ 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17,
+ 17, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10,
+ 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14,
+ 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 19, 20, 20, 21,
+ 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21,
+ 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14,
+ 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 16, 17, 17, 17,
+ 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12,
+ 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 14, 15, 15, 16,
+ 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10,
+ 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14,
+ 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7,
+ 7, 7, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 14,
+ 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ /* Size 4x8 */
+ 32, 29, 20, 14, 32, 28, 20, 14, 30, 24, 19, 14, 28, 20, 16, 12, 23, 18,
+ 13, 11, 19, 16, 12, 9, 16, 14, 11, 8, 14, 13, 10, 8,
+ /* Size 8x4 */
+ 32, 32, 30, 28, 23, 19, 16, 14, 29, 28, 24, 20, 18, 16, 14, 13, 20, 20,
+ 19, 16, 13, 12, 11, 10, 14, 14, 14, 12, 11, 9, 8, 8,
+ /* Size 8x16 */
+ 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32,
+ 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27,
+ 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17,
+ 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11,
+ 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19,
+ 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14,
+ 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8,
+ 8,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 32, 30, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 33, 32,
+ 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 32, 32, 31, 30,
+ 29, 28, 27, 26, 24, 22, 20, 19, 18, 16, 15, 14, 28, 29, 30, 28, 27, 24,
+ 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 23, 24, 25, 24, 24, 21, 19, 18,
+ 16, 15, 14, 14, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13,
+ 12, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10,
+ 10, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8,
+ /* Size 16x32 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 23, 19, 19, 16, 16, 13, 13, 12, 33, 32,
+ 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 32,
+ 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 31, 31, 30,
+ 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 33, 32, 32, 31, 31, 30, 30, 25,
+ 25, 21, 21, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20,
+ 20, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, 20, 17,
+ 17, 14, 14, 13, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15,
+ 15, 14, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14,
+ 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 30, 30,
+ 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 28, 30, 30, 27,
+ 27, 21, 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 28, 30, 30, 27, 27, 21,
+ 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 26, 28, 28, 26, 26, 20, 20, 18,
+ 18, 16, 16, 14, 14, 12, 12, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16,
+ 16, 14, 14, 12, 12, 12, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13,
+ 13, 11, 11, 11, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11,
+ 11, 11, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10,
+ 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 19, 21,
+ 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 19, 21, 21, 20,
+ 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 18, 19, 19, 19, 19, 16,
+ 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 18, 19, 19, 19, 19, 16, 16, 14, 14,
+ 12, 12, 10, 10, 9, 9, 9, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10,
+ 10, 9, 9, 8, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9,
+ 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 14, 16,
+ 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 13, 14, 14, 15, 15,
+ 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 13, 14, 14, 15, 15, 13, 13, 11,
+ 11, 10, 10, 9, 9, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10,
+ 8, 8, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8,
+ 7, 12, 13, 13, 13, 13, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 7,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19,
+ 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16,
+ 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27,
+ 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14,
+ 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24,
+ 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29,
+ 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28,
+ 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21,
+ 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18,
+ 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11,
+ 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21,
+ 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, 17, 18,
+ 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9,
+ 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13,
+ 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 14, 14, 14, 15,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 8, 8, 8, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 12,
+ 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ /* Size 4x16 */
+ 33, 28, 19, 13, 32, 29, 20, 14, 32, 30, 21, 14, 32, 28, 20, 14, 31, 27,
+ 21, 15, 30, 24, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11,
+ 23, 18, 13, 11, 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 9, 16, 14,
+ 11, 8, 14, 13, 10, 8, 14, 13, 10, 8,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 28, 29,
+ 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 19, 20, 21, 20,
+ 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 13, 14, 14, 14, 15, 14,
+ 13, 12, 11, 11, 10, 9, 9, 8, 8, 8,
+ /* Size 8x32 */
+ 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32,
+ 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 33, 32, 31, 30,
+ 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 32, 30, 28, 24, 20,
+ 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 32, 31, 29, 27, 24, 21, 18, 15,
+ 30, 30, 28, 24, 21, 19, 16, 14, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30,
+ 27, 21, 19, 17, 15, 13, 28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20,
+ 18, 16, 14, 12, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14,
+ 13, 11, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11,
+ 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 19, 21,
+ 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 18, 19, 19, 16,
+ 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 16, 17, 18, 15, 13, 11,
+ 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 14, 16, 16, 14, 12, 11, 9, 8, 13,
+ 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13,
+ 11, 10, 8, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 13, 13, 12, 11, 9, 8, 7,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19,
+ 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20,
+ 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17,
+ 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13,
+ 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12,
+ /* Size 8x8 */
+ 33, 30, 24, 22, 21, 20, 18, 17, 30, 26, 23, 22, 22, 21, 19, 18, 24, 23,
+ 21, 21, 20, 20, 19, 18, 22, 22, 21, 19, 18, 18, 17, 16, 21, 22, 20, 18,
+ 17, 16, 15, 14, 20, 21, 20, 18, 16, 14, 14, 13, 18, 19, 19, 17, 15, 14,
+ 12, 12, 17, 18, 18, 16, 14, 13, 12, 11,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 28, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33,
+ 33, 30, 27, 24, 22, 22, 22, 21, 20, 20, 19, 18, 17, 17, 34, 33, 32, 29,
+ 26, 24, 22, 23, 23, 22, 22, 21, 20, 19, 18, 18, 31, 30, 29, 26, 24, 23,
+ 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 28, 27, 26, 24, 22, 22, 21, 22,
+ 23, 22, 22, 21, 20, 20, 19, 19, 25, 24, 24, 23, 22, 21, 20, 21, 21, 20,
+ 20, 20, 19, 19, 18, 18, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19,
+ 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17,
+ 16, 16, 21, 22, 23, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15,
+ 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 20,
+ 22, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 19, 20, 21, 21,
+ 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 18, 19, 20, 20, 20, 19,
+ 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 17, 18, 19, 19, 20, 19, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14,
+ 14, 13, 12, 12, 12, 11, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13,
+ 12, 12, 11, 11,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 33, 33, 33, 33,
+ 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 33, 33, 33, 33, 33, 30, 30, 27,
+ 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22,
+ 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18,
+ 18, 17, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23,
+ 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30,
+ 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26,
+ 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 18, 18, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22,
+ 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 19, 19, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22,
+ 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18,
+ 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20,
+ 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 25, 24, 24, 24,
+ 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20,
+ 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 17, 17, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22,
+ 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23,
+ 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21,
+ 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14,
+ 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 22,
+ 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 22, 22, 22, 22, 22,
+ 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14,
+ 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13,
+ 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19,
+ 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12,
+ 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18,
+ 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ /* Size 4x8 */
+ 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 22, 19, 18, 16, 22, 19,
+ 16, 14, 20, 19, 15, 13, 19, 18, 14, 12, 17, 17, 14, 11,
+ /* Size 8x4 */
+ 33, 28, 24, 22, 22, 20, 19, 17, 22, 22, 20, 19, 19, 19, 18, 17, 20, 22,
+ 20, 18, 16, 15, 14, 14, 17, 18, 18, 16, 14, 13, 12, 11,
+ /* Size 8x16 */
+ 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32,
+ 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 22, 22,
+ 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19,
+ 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15,
+ 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20,
+ 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18,
+ 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14,
+ 12, 11,
+ /* Size 16x8 */
+ 32, 33, 34, 31, 28, 24, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33,
+ 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 28, 27, 26, 24,
+ 22, 22, 21, 22, 22, 22, 22, 21, 20, 20, 19, 18, 21, 22, 22, 22, 22, 20,
+ 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 23, 21, 19, 18,
+ 18, 17, 17, 17, 16, 16, 15, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 14, 14,
+ 13, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12,
+ 12, 11,
+ /* Size 16x32 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 21, 20, 20, 18, 18, 16, 16, 16, 33, 33,
+ 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 33, 33, 33, 27,
+ 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 34, 32, 32, 26, 26, 22,
+ 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 34, 32, 32, 26, 26, 22, 22, 23,
+ 23, 21, 21, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22,
+ 22, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20,
+ 20, 18, 18, 17, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19,
+ 19, 18, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18,
+ 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 24, 24,
+ 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 21, 22, 22, 21,
+ 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 21, 19,
+ 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 19, 19, 18,
+ 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18,
+ 18, 17, 17, 16, 16, 16, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16,
+ 16, 15, 15, 15, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 15, 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21,
+ 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 20, 21, 21, 22,
+ 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 19,
+ 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 19, 20, 20, 21, 21, 19, 19, 17,
+ 17, 15, 15, 14, 14, 13, 13, 13, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15,
+ 15, 13, 13, 12, 12, 12, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13,
+ 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 16, 18, 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 18,
+ 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 17, 17, 18,
+ 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 17,
+ 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 16, 16, 15,
+ 15, 13, 13, 12, 12, 11, 11, 11,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 33, 33, 33, 32, 32, 28, 28, 26,
+ 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18,
+ 18, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21,
+ 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+ 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 21,
+ 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17,
+ 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 16, 17, 17, 17,
+ 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 11,
+ /* Size 4x16 */
+ 33, 21, 20, 16, 33, 22, 20, 17, 32, 22, 21, 18, 28, 22, 22, 18, 26, 22,
+ 22, 19, 24, 20, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15,
+ 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 15, 12, 19, 18,
+ 14, 12, 18, 17, 14, 12, 17, 17, 14, 11,
+ /* Size 16x4 */
+ 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 21, 22,
+ 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 20, 20, 21, 22,
+ 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 16, 17, 18, 18, 19, 18,
+ 17, 16, 15, 14, 14, 13, 12, 12, 12, 11,
+ /* Size 8x32 */
+ 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 33, 33,
+ 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 34, 32, 26, 22,
+ 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 31, 28, 24, 22, 22, 22,
+ 20, 18, 28, 26, 22, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 20, 19,
+ 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22,
+ 21, 19, 19, 19, 18, 17, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19,
+ 18, 18, 17, 16, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17,
+ 16, 15, 21, 23, 22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14,
+ 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 20, 21,
+ 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 19,
+ 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 18, 20, 20, 18, 16, 15,
+ 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12,
+ 16, 18, 19, 17, 15, 14, 12, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17,
+ 18, 17, 15, 14, 12, 11, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 18, 16,
+ 15, 13, 12, 11,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20,
+ 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12,
+ 12, 11, 11, 11 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9,
+ /* Size 8x8 */
+ 33, 32, 32, 29, 24, 20, 17, 15, 32, 32, 31, 29, 25, 21, 18, 16, 32, 31,
+ 29, 27, 24, 21, 18, 16, 29, 29, 27, 21, 19, 17, 16, 14, 24, 25, 24, 19,
+ 16, 14, 13, 12, 20, 21, 21, 17, 14, 13, 12, 11, 17, 18, 18, 16, 13, 12,
+ 10, 9, 15, 16, 16, 14, 12, 11, 9, 9,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 30, 29, 27, 25, 23, 21, 19, 17, 16, 14, 13, 33, 32,
+ 32, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 17, 15, 13, 33, 32, 32, 32,
+ 32, 31, 30, 28, 27, 25, 23, 21, 19, 17, 16, 14, 33, 32, 32, 31, 30, 29,
+ 28, 27, 26, 24, 23, 20, 19, 17, 16, 14, 32, 32, 32, 30, 29, 28, 27, 26,
+ 25, 24, 22, 21, 19, 18, 16, 15, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21,
+ 20, 19, 18, 16, 15, 14, 29, 29, 30, 28, 27, 24, 22, 21, 20, 19, 19, 17,
+ 17, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14,
+ 13, 12, 25, 26, 27, 26, 25, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12,
+ 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 21, 22,
+ 23, 23, 22, 20, 19, 17, 16, 15, 14, 13, 13, 12, 11, 11, 19, 20, 21, 20,
+ 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 18, 19, 19, 19, 18,
+ 17, 15, 14, 13, 13, 12, 11, 10, 10, 9, 16, 17, 17, 17, 18, 16, 15, 14,
+ 14, 13, 12, 11, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12,
+ 11, 11, 10, 9, 9, 8, 13, 13, 14, 14, 15, 14, 13, 12, 12, 11, 11, 10, 9,
+ 9, 8, 8,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 13, 13, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20,
+ 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17,
+ 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 28, 27, 26, 25, 24, 23, 23, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 28, 28,
+ 27, 25, 25, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 23,
+ 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 20, 20,
+ 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27,
+ 26, 26, 25, 24, 24, 22, 22, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 15,
+ 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 25, 25, 24,
+ 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31,
+ 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19,
+ 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 30, 30, 30, 31, 31, 30, 29, 29,
+ 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 29, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24,
+ 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 15, 15, 14, 14,
+ 13, 13, 28, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28,
+ 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 26, 27, 27, 27, 28, 27,
+ 26, 26, 26, 25, 23, 23, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 25, 26, 26, 26, 27, 26, 26, 26, 25, 25,
+ 22, 22, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13,
+ 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 22, 22, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11,
+ 23, 24, 24, 24, 25, 24, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 21, 22, 22, 23,
+ 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23,
+ 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 19, 19,
+ 18, 17, 17, 16, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 18, 19,
+ 19, 19, 20, 20, 20, 20, 20, 20, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 17, 18, 18, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 9, 9, 17, 17, 17, 18, 18, 18, 18, 18, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 15, 16,
+ 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8,
+ 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14,
+ 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9,
+ 9, 9, 9, 9, 8, 8, 8, 8,
+ /* Size 4x8 */
+ 32, 30, 24, 17, 32, 30, 24, 17, 31, 28, 23, 18, 29, 24, 19, 15, 25, 21,
+ 16, 13, 21, 19, 14, 11, 18, 17, 13, 10, 16, 15, 12, 9,
+ /* Size 8x4 */
+ 32, 32, 31, 29, 25, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 24, 24,
+ 23, 19, 16, 14, 13, 12, 17, 17, 18, 15, 13, 11, 10, 9,
+ /* Size 8x16 */
+ 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32,
+ 31, 30, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 31, 29, 27,
+ 24, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17,
+ 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 25, 26, 25, 20, 17, 15, 14, 13,
+ 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21,
+ 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 16, 17, 18, 15,
+ 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 13, 14, 15, 13, 11, 10, 9,
+ 8,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 32, 30, 29, 27, 25, 23, 21, 19, 18, 16, 14, 13, 33, 32,
+ 32, 32, 31, 30, 30, 28, 26, 24, 23, 21, 19, 17, 16, 14, 32, 32, 31, 30,
+ 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 16, 15, 28, 29, 30, 28, 27, 24,
+ 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 23, 24, 25, 24, 24, 21, 20, 18,
+ 17, 16, 15, 14, 14, 13, 12, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 17, 17, 18, 18, 18, 17, 16, 15, 14, 13, 12, 11,
+ 11, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9,
+ 8,
+ /* Size 16x32 */
+ 32, 33, 33, 32, 32, 30, 28, 27, 23, 23, 19, 19, 17, 16, 14, 13, 33, 32,
+ 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32,
+ 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 31,
+ 29, 28, 25, 24, 20, 20, 18, 17, 15, 14, 33, 32, 32, 32, 31, 31, 30, 28,
+ 25, 25, 21, 21, 18, 17, 16, 14, 33, 32, 32, 31, 31, 30, 29, 28, 25, 24,
+ 21, 21, 18, 17, 16, 14, 32, 32, 32, 31, 30, 29, 28, 27, 24, 24, 20, 20,
+ 18, 17, 16, 14, 32, 32, 32, 30, 30, 29, 28, 27, 24, 24, 21, 21, 18, 17,
+ 16, 15, 32, 32, 31, 30, 29, 28, 27, 26, 24, 24, 21, 21, 18, 18, 16, 15,
+ 32, 31, 31, 30, 29, 28, 26, 26, 24, 23, 20, 20, 18, 18, 16, 15, 30, 30,
+ 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 30, 30, 30, 28,
+ 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 29, 30, 30, 28, 27, 24,
+ 22, 21, 20, 19, 17, 17, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 21,
+ 19, 19, 17, 17, 16, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 18, 18,
+ 16, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 20, 18, 18, 16, 16,
+ 14, 14, 13, 12, 25, 26, 26, 25, 25, 22, 20, 19, 17, 17, 15, 15, 14, 13,
+ 13, 12, 23, 25, 25, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11,
+ 23, 24, 24, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 21, 23,
+ 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 21, 23, 23, 22,
+ 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 19, 21, 21, 21, 21, 19,
+ 17, 17, 14, 14, 13, 13, 12, 11, 10, 10, 19, 20, 21, 20, 20, 19, 17, 16,
+ 14, 14, 12, 12, 11, 11, 10, 10, 18, 19, 20, 20, 20, 18, 17, 16, 14, 14,
+ 12, 12, 11, 11, 10, 9, 18, 19, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 9, 17, 18, 18, 18, 18, 17, 16, 15, 13, 13, 12, 12, 10, 10,
+ 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 11, 11, 10, 10, 9, 9, 15,
+ 17, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 14, 16, 16,
+ 16, 16, 15, 14, 13, 12, 12, 11, 11, 9, 9, 9, 8, 14, 16, 16, 16, 16, 15,
+ 14, 13, 12, 12, 10, 10, 9, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11,
+ 11, 10, 10, 9, 9, 8, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 10,
+ 9, 9, 8, 8,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 21,
+ 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18,
+ 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28,
+ 28, 28, 27, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 17, 17, 16, 16,
+ 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26,
+ 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30,
+ 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 28, 29, 29, 29, 30, 29,
+ 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26,
+ 23, 23, 21, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11,
+ 23, 24, 24, 24, 25, 24, 24, 24, 24, 23, 21, 21, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 19, 20, 20, 20,
+ 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21,
+ 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17,
+ 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15,
+ 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8,
+ 8, 8,
+ /* Size 4x16 */
+ 33, 30, 23, 16, 32, 30, 24, 17, 32, 31, 25, 17, 32, 29, 24, 17, 32, 28,
+ 24, 18, 30, 26, 21, 16, 30, 24, 19, 15, 28, 23, 18, 14, 26, 22, 17, 13,
+ 24, 21, 16, 13, 23, 20, 15, 12, 20, 19, 14, 11, 19, 18, 13, 10, 17, 16,
+ 13, 10, 16, 15, 12, 9, 14, 14, 11, 9,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 30, 30, 28, 26, 24, 23, 20, 19, 17, 16, 14, 30, 30,
+ 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 23, 24, 25, 24,
+ 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 16, 17, 17, 17, 18, 16,
+ 15, 14, 13, 13, 12, 11, 10, 10, 9, 9,
+ /* Size 8x32 */
+ 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32,
+ 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 25, 20, 18, 15, 33, 32, 31, 30,
+ 25, 21, 18, 16, 33, 32, 31, 29, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20,
+ 18, 16, 32, 32, 30, 28, 24, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16,
+ 32, 31, 29, 26, 24, 20, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 30, 30,
+ 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 28, 30, 27, 21,
+ 19, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 26, 28, 26, 20, 18, 16,
+ 14, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 25, 24, 19, 16, 14, 13, 12,
+ 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23,
+ 22, 18, 15, 13, 12, 11, 19, 21, 21, 17, 14, 13, 12, 10, 19, 21, 20, 17,
+ 14, 12, 11, 10, 18, 20, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12,
+ 11, 10, 17, 18, 18, 16, 13, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9,
+ 15, 17, 17, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 14, 16, 16,
+ 14, 12, 10, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10,
+ 9, 8,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21,
+ 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30,
+ 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18,
+ 18, 17, 16, 16, 15, 15, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 19, 20,
+ 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18,
+ 18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 8, 8 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13,
+ /* Size 8x8 */
+ 33, 32, 27, 21, 22, 20, 19, 18, 32, 29, 24, 22, 23, 22, 20, 19, 27, 24,
+ 22, 21, 23, 22, 21, 20, 21, 22, 21, 19, 19, 19, 18, 18, 22, 23, 23, 19,
+ 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 15, 14, 19, 20, 21, 18, 16, 15,
+ 14, 13, 18, 19, 20, 18, 16, 14, 13, 12,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 28, 25, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33,
+ 33, 30, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 18, 17, 34, 33, 32, 29,
+ 26, 24, 23, 22, 23, 23, 22, 22, 21, 20, 19, 18, 31, 30, 29, 26, 24, 23,
+ 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 28, 27, 26, 24, 22, 22, 22, 22,
+ 22, 23, 22, 22, 21, 20, 20, 19, 25, 24, 24, 23, 22, 21, 20, 20, 21, 21,
+ 20, 20, 20, 19, 19, 18, 22, 22, 23, 22, 22, 20, 20, 20, 20, 20, 19, 19,
+ 19, 18, 18, 17, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16,
+ 21, 22, 23, 23, 23, 21, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 21,
+ 22, 22, 22, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 20, 20, 22, 22,
+ 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 19, 20, 21, 21, 21, 20,
+ 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 18, 19, 20, 20, 20, 19, 18, 18,
+ 17, 16, 15, 15, 14, 13, 13, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 16,
+ 15, 14, 14, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 17, 16, 15, 14, 14,
+ 13, 12, 12, 12,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 25, 25, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 33, 33, 33, 33,
+ 33, 32, 30, 29, 27, 27, 24, 24, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 33, 33, 33, 33, 33, 31, 30, 29,
+ 27, 26, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 34, 33, 33, 33, 33, 31, 29, 28, 26, 26, 24, 24,
+ 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19,
+ 18, 18, 34, 33, 33, 33, 32, 31, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23,
+ 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 32, 32,
+ 31, 31, 31, 29, 28, 27, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 31, 30, 30, 29, 29, 28,
+ 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 19, 19, 18, 18, 30, 29, 29, 28, 28, 27, 26, 25, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 19, 19, 19, 19, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 21,
+ 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19,
+ 28, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23,
+ 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 25, 24, 24, 24,
+ 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 25, 24, 24, 24, 24, 24, 23, 23,
+ 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 18, 18, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18,
+ 17, 17, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 22,
+ 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22,
+ 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17,
+ 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 21, 22, 22, 22, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 15, 15, 15, 15, 14, 14, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14,
+ 14, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 19, 20,
+ 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 19, 20, 20, 20, 21, 21,
+ 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15,
+ 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21,
+ 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14,
+ 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18,
+ 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12,
+ 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 19,
+ 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19,
+ 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13,
+ 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+ /* Size 4x8 */
+ 33, 24, 22, 19, 31, 23, 23, 20, 26, 22, 22, 20, 22, 20, 19, 18, 23, 21,
+ 17, 16, 21, 20, 17, 15, 20, 20, 16, 14, 19, 19, 16, 13,
+ /* Size 8x4 */
+ 33, 31, 26, 22, 23, 21, 20, 19, 24, 23, 22, 20, 21, 20, 20, 19, 22, 23,
+ 22, 19, 17, 17, 16, 16, 19, 20, 20, 18, 16, 15, 14, 13,
+ /* Size 8x16 */
+ 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32,
+ 26, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 28, 26, 22, 22,
+ 23, 22, 21, 20, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19,
+ 19, 18, 21, 22, 22, 19, 19, 18, 18, 17, 21, 23, 22, 19, 18, 17, 17, 16,
+ 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21,
+ 22, 19, 17, 16, 15, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18,
+ 16, 15, 14, 13, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14,
+ 13, 12,
+ /* Size 16x8 */
+ 32, 33, 34, 31, 28, 24, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33,
+ 32, 28, 26, 24, 22, 22, 23, 23, 22, 21, 20, 20, 19, 18, 28, 27, 26, 24,
+ 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 21, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 23, 22, 23, 21, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 18, 19, 20, 20, 21, 19, 19, 18, 17, 16, 16, 15,
+ 14, 14, 13, 13, 17, 18, 19, 19, 20, 18, 18, 17, 16, 16, 15, 14, 13, 13,
+ 12, 12,
+ /* Size 16x32 */
+ 32, 33, 33, 29, 28, 24, 21, 21, 21, 21, 20, 20, 18, 18, 17, 16, 33, 33,
+ 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 33, 33, 33, 28,
+ 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 34, 32, 32, 28, 26, 24,
+ 22, 22, 22, 22, 21, 21, 20, 20, 18, 18, 34, 32, 32, 28, 26, 24, 22, 22,
+ 23, 23, 21, 21, 20, 20, 19, 18, 32, 31, 30, 26, 25, 23, 22, 22, 23, 23,
+ 21, 21, 20, 20, 19, 18, 31, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22,
+ 20, 20, 19, 18, 30, 28, 28, 24, 23, 23, 22, 22, 23, 22, 22, 22, 20, 20,
+ 19, 19, 28, 26, 26, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19,
+ 28, 26, 26, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 24, 24,
+ 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 24, 24, 24, 22,
+ 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 22, 22, 21, 20, 19, 19,
+ 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19,
+ 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18,
+ 17, 17, 17, 16, 21, 22, 23, 22, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17,
+ 16, 16, 21, 23, 23, 23, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15,
+ 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 22,
+ 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 22, 22, 22,
+ 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 21, 21, 22, 22, 20,
+ 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 21, 21, 22, 22, 20, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 17,
+ 15, 15, 14, 14, 14, 13, 19, 20, 20, 21, 21, 20, 19, 18, 17, 16, 15, 15,
+ 14, 14, 13, 13, 19, 20, 20, 20, 21, 20, 18, 18, 16, 16, 15, 15, 14, 14,
+ 13, 13, 18, 20, 20, 20, 20, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12,
+ 18, 19, 19, 20, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 13, 12, 17, 19,
+ 19, 19, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 17, 19, 19, 19,
+ 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 16, 18, 18, 18, 19, 18,
+ 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17,
+ 15, 15, 14, 14, 13, 12, 12, 12,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32,
+ 32, 31, 29, 28, 26, 26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 33, 33, 33, 32, 32, 30, 28, 28,
+ 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20,
+ 20, 19, 19, 19, 18, 18, 29, 28, 28, 28, 28, 26, 26, 24, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19,
+ 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 24, 24,
+ 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 21, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17,
+ 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19,
+ 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21,
+ 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 21, 21, 22, 22,
+ 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13,
+ 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18,
+ 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18,
+ 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 13, 12, 12, 12, 12, 12, 12,
+ /* Size 4x16 */
+ 33, 24, 21, 18, 33, 24, 22, 19, 32, 24, 23, 20, 29, 23, 22, 20, 26, 22,
+ 22, 20, 24, 21, 21, 19, 22, 20, 19, 18, 22, 20, 19, 18, 22, 21, 18, 17,
+ 22, 21, 17, 16, 22, 20, 17, 15, 21, 20, 17, 14, 20, 20, 16, 14, 20, 19,
+ 16, 13, 19, 19, 16, 13, 18, 18, 15, 12,
+ /* Size 16x4 */
+ 33, 33, 32, 29, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 24, 24,
+ 24, 23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 21, 22, 23, 22,
+ 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 18, 19, 20, 20, 20, 19,
+ 18, 18, 17, 16, 15, 14, 14, 13, 13, 12,
+ /* Size 8x32 */
+ 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 33, 33,
+ 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 22, 21, 20, 18, 34, 32, 26, 22,
+ 23, 21, 20, 19, 32, 30, 25, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22,
+ 20, 19, 30, 28, 23, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20,
+ 28, 26, 22, 21, 22, 22, 21, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24,
+ 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 19,
+ 19, 19, 18, 17, 21, 22, 22, 19, 19, 18, 18, 17, 21, 22, 22, 19, 18, 18,
+ 17, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16,
+ 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 22,
+ 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19,
+ 17, 16, 15, 14, 19, 21, 21, 19, 17, 15, 14, 14, 19, 20, 21, 19, 17, 15,
+ 14, 13, 19, 20, 21, 18, 16, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13,
+ 18, 19, 20, 18, 16, 14, 13, 13, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19,
+ 19, 17, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, 16, 18, 19, 17,
+ 15, 14, 13, 12,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32,
+ 32, 30, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21,
+ 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23,
+ 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17,
+ 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 20, 20,
+ 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20,
+ 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 13, 13, 13, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13,
+ 12, 12, 12, 12 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11,
+ /* Size 8x8 */
+ 33, 32, 32, 30, 27, 22, 20, 16, 32, 32, 32, 30, 28, 23, 21, 17, 32, 32,
+ 29, 28, 26, 23, 21, 18, 30, 30, 28, 24, 22, 20, 18, 16, 27, 28, 26, 22,
+ 19, 17, 16, 14, 22, 23, 23, 20, 17, 15, 14, 12, 20, 21, 21, 18, 16, 14,
+ 12, 11, 16, 17, 18, 16, 14, 12, 11, 10,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32,
+ 32, 32, 32, 32, 30, 29, 27, 26, 24, 22, 20, 19, 18, 17, 33, 32, 32, 32,
+ 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 18, 17, 33, 32, 32, 31, 31, 31,
+ 29, 28, 27, 26, 24, 23, 21, 19, 18, 17, 32, 32, 32, 31, 30, 30, 28, 28,
+ 26, 26, 24, 23, 21, 19, 19, 17, 32, 32, 32, 31, 30, 29, 28, 27, 26, 25,
+ 24, 22, 21, 20, 19, 18, 30, 30, 31, 29, 28, 28, 26, 24, 23, 22, 22, 20,
+ 19, 18, 17, 16, 28, 29, 30, 28, 28, 27, 24, 21, 20, 20, 19, 18, 17, 16,
+ 16, 15, 27, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14,
+ 25, 26, 27, 26, 26, 25, 22, 20, 19, 18, 17, 16, 15, 15, 14, 14, 23, 24,
+ 25, 24, 24, 24, 22, 19, 18, 17, 16, 15, 14, 14, 13, 13, 21, 22, 23, 23,
+ 23, 22, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 19, 20, 21, 21, 21, 21,
+ 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 18, 19, 19, 19, 19, 20, 18, 16,
+ 15, 15, 14, 13, 12, 11, 11, 11, 17, 18, 18, 18, 19, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 16, 17, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12,
+ 11, 11, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 22,
+ 22, 21, 20, 20, 18, 18, 17, 16, 16, 15, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 23, 22, 21, 20, 20,
+ 19, 18, 18, 17, 17, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17,
+ 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25,
+ 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22,
+ 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30,
+ 30, 29, 29, 28, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19,
+ 18, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28,
+ 28, 28, 26, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 16,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26,
+ 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23,
+ 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 31, 31, 31, 31, 31, 31, 30, 29,
+ 29, 28, 28, 27, 26, 26, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19,
+ 18, 18, 17, 17, 17, 16, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26,
+ 26, 25, 24, 24, 23, 23, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16,
+ 16, 15, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23,
+ 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 28, 29,
+ 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 28, 29, 29, 29, 30, 30,
+ 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 14, 27, 27, 27, 28, 28, 28, 27, 27, 26, 26,
+ 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15,
+ 15, 14, 14, 13, 26, 27, 27, 27, 28, 28, 26, 26, 26, 26, 26, 23, 23, 22,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 25, 26, 26, 26, 27, 27, 26, 26, 26, 25, 25, 23, 22, 21, 20, 20, 19, 19,
+ 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 23, 24, 24, 24,
+ 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 24, 25, 25, 24, 24,
+ 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14,
+ 14, 14, 13, 13, 13, 12, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 21,
+ 20, 20, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 21, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 18,
+ 17, 17, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 20, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21,
+ 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 11, 11, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21,
+ 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12,
+ 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17,
+ 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10,
+ 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 18, 18,
+ 18, 18, 18, 18, 19, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 16, 16, 17, 17, 17, 17, 17, 17,
+ 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 9, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 16, 15, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ /* Size 4x8 */
+ 32, 32, 24, 18, 32, 31, 25, 19, 32, 29, 24, 20, 30, 28, 20, 17, 27, 26,
+ 18, 15, 23, 23, 16, 13, 20, 20, 14, 12, 17, 18, 13, 11,
+ /* Size 8x4 */
+ 32, 32, 32, 30, 27, 23, 20, 17, 32, 31, 29, 28, 26, 23, 20, 18, 24, 25,
+ 24, 20, 18, 16, 14, 13, 18, 19, 20, 17, 15, 13, 12, 11,
+ /* Size 8x16 */
+ 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32,
+ 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28,
+ 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 30, 30, 28, 25, 23, 21,
+ 19, 16, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14,
+ 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 21, 23,
+ 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17,
+ 15, 14, 12, 11, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13,
+ 11, 10,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32,
+ 32, 32, 32, 31, 30, 30, 28, 26, 25, 23, 21, 19, 18, 17, 32, 32, 31, 30,
+ 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 29, 29, 30, 29, 28, 28,
+ 25, 22, 22, 21, 20, 19, 18, 17, 16, 16, 26, 27, 28, 27, 26, 26, 23, 20,
+ 20, 19, 18, 17, 16, 15, 15, 14, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17,
+ 16, 15, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13,
+ 12, 12, 12, 11, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11,
+ 10, 10,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 26, 23, 23, 20, 19, 18, 16, 16, 33, 32,
+ 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 18, 16, 16, 33, 32, 32, 32,
+ 32, 32, 29, 29, 27, 24, 24, 21, 20, 19, 17, 17, 33, 32, 32, 32, 32, 32,
+ 30, 29, 28, 25, 25, 21, 20, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30,
+ 28, 25, 25, 22, 21, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, 28, 25,
+ 25, 22, 21, 19, 17, 17, 33, 32, 32, 31, 30, 30, 29, 28, 27, 24, 24, 21,
+ 21, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 21, 20, 19,
+ 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 24, 24, 21, 21, 19, 18, 18,
+ 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 32, 32,
+ 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 31, 31, 31, 29,
+ 28, 28, 26, 25, 24, 22, 22, 20, 19, 18, 17, 17, 30, 30, 30, 29, 28, 28,
+ 25, 24, 23, 21, 21, 19, 19, 18, 16, 16, 30, 30, 30, 29, 28, 28, 24, 23,
+ 22, 20, 20, 19, 18, 17, 16, 16, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19,
+ 19, 18, 17, 16, 15, 15, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18,
+ 17, 16, 15, 15, 27, 28, 28, 27, 26, 26, 22, 20, 20, 18, 18, 17, 16, 15,
+ 14, 14, 26, 27, 28, 26, 26, 26, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14,
+ 25, 26, 26, 26, 25, 25, 21, 20, 19, 17, 17, 16, 15, 15, 13, 13, 23, 25,
+ 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 23, 25, 25, 24,
+ 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 22, 23, 23, 23, 23, 23,
+ 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 21, 23, 23, 23, 22, 22, 19, 18,
+ 17, 15, 15, 14, 13, 13, 12, 12, 20, 22, 22, 22, 22, 22, 19, 18, 17, 15,
+ 15, 13, 13, 12, 12, 12, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13,
+ 12, 12, 11, 11, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12,
+ 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11,
+ 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 10, 10, 17, 18,
+ 18, 18, 18, 18, 16, 16, 15, 13, 13, 12, 12, 11, 10, 10, 16, 17, 17, 17,
+ 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 18,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 15, 16, 16, 16, 17, 17, 15, 14,
+ 13, 12, 12, 11, 11, 10, 9, 9,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 25, 23,
+ 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21,
+ 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29,
+ 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17,
+ 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27,
+ 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 32, 32,
+ 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24,
+ 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30,
+ 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19,
+ 18, 18, 17, 17, 16, 16, 16, 15, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27,
+ 27, 25, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 14, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18,
+ 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 25,
+ 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 20, 21, 21, 21, 22, 22, 21, 21,
+ 21, 21, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19,
+ 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11,
+ 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16,
+ 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 16, 16,
+ 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17,
+ 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 9,
+ /* Size 4x16 */
+ 33, 32, 23, 18, 32, 32, 24, 19, 32, 31, 25, 19, 32, 30, 24, 19, 32, 30,
+ 24, 19, 32, 29, 24, 20, 30, 28, 21, 18, 29, 27, 19, 16, 28, 26, 18, 15,
+ 26, 25, 17, 15, 25, 24, 16, 14, 23, 22, 15, 13, 20, 20, 14, 12, 19, 19,
+ 14, 11, 18, 18, 13, 11, 17, 18, 13, 11,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 32, 30, 29, 28, 26, 25, 23, 20, 19, 18, 17, 32, 32,
+ 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 23, 24, 25, 24,
+ 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 18, 19, 19, 19, 19, 20,
+ 18, 16, 15, 15, 14, 13, 12, 11, 11, 11,
+ /* Size 8x32 */
+ 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 16, 33, 32,
+ 32, 29, 27, 24, 20, 17, 33, 32, 32, 30, 28, 25, 20, 17, 33, 32, 31, 30,
+ 28, 25, 21, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24,
+ 21, 17, 32, 32, 30, 28, 27, 24, 20, 17, 32, 32, 30, 28, 26, 24, 21, 18,
+ 32, 31, 29, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 31, 31,
+ 28, 26, 24, 22, 19, 17, 30, 30, 28, 25, 23, 21, 19, 16, 30, 30, 28, 24,
+ 22, 20, 18, 16, 28, 30, 27, 22, 20, 19, 17, 15, 28, 30, 27, 22, 20, 19,
+ 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 26, 28, 26, 21, 19, 18, 16, 14,
+ 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 23, 25,
+ 24, 20, 18, 16, 14, 13, 22, 23, 23, 19, 17, 16, 14, 12, 21, 23, 22, 19,
+ 17, 15, 13, 12, 20, 22, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14,
+ 12, 11, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11,
+ 18, 19, 19, 17, 15, 14, 12, 10, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17,
+ 18, 16, 14, 13, 11, 10, 16, 17, 18, 16, 14, 13, 11, 10, 15, 16, 17, 15,
+ 13, 12, 11, 9,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23,
+ 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30,
+ 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20,
+ 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24,
+ 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16,
+ 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 19, 20, 20, 20, 21, 21,
+ 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18,
+ 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 9 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14,
+ /* Size 8x8 */
+ 33, 33, 27, 23, 22, 21, 20, 19, 33, 32, 26, 23, 23, 22, 22, 20, 27, 26,
+ 22, 22, 22, 22, 22, 20, 23, 23, 22, 20, 20, 20, 20, 19, 22, 23, 22, 20,
+ 19, 18, 18, 17, 21, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 16,
+ 16, 15, 19, 20, 20, 19, 17, 16, 15, 13,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 30, 28, 25, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33,
+ 33, 30, 28, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 19, 34, 33, 32, 30,
+ 28, 26, 24, 22, 23, 23, 23, 22, 22, 21, 20, 20, 31, 30, 30, 28, 26, 24,
+ 23, 22, 22, 22, 23, 22, 22, 21, 20, 20, 30, 28, 28, 26, 24, 23, 22, 22,
+ 22, 22, 23, 22, 22, 21, 21, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22,
+ 23, 22, 22, 21, 21, 20, 25, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20,
+ 20, 20, 20, 19, 21, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17,
+ 21, 22, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22,
+ 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22,
+ 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 20, 20, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 19, 20, 21, 21, 21, 21, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 14, 19, 19, 20, 20, 21, 21, 20, 18, 18, 17,
+ 16, 16, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15,
+ 15, 14, 14, 13,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33,
+ 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 22, 22, 22, 22, 22, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 30,
+ 28, 27, 27, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20,
+ 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 19, 19, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22,
+ 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 34, 33,
+ 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 31, 30, 30, 30, 30,
+ 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22,
+ 22, 22, 21, 21, 20, 20, 20, 19, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21,
+ 20, 20, 20, 19, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20,
+ 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26,
+ 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 26, 25, 25, 25, 24, 24, 23, 23,
+ 23, 22, 22, 21, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 20, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21,
+ 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19,
+ 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18,
+ 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17,
+ 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 17, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16,
+ 16, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 20, 20, 20, 21, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 15, 15, 15, 15, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14,
+ 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 19, 19, 19, 20,
+ 20, 20, 20, 20, 21, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 18, 19, 19, 19, 20, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15,
+ 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13,
+ 13, 13, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13,
+ /* Size 4x8 */
+ 33, 27, 22, 20, 32, 26, 23, 21, 26, 22, 23, 21, 23, 22, 20, 19, 22, 22,
+ 18, 18, 22, 22, 17, 16, 21, 22, 17, 15, 19, 20, 16, 14,
+ /* Size 8x4 */
+ 33, 32, 26, 23, 22, 22, 21, 19, 27, 26, 22, 22, 22, 22, 22, 20, 22, 23,
+ 23, 20, 18, 17, 17, 16, 20, 21, 21, 19, 18, 16, 15, 14,
+ /* Size 8x16 */
+ 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 34, 32,
+ 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 29, 28, 23, 22,
+ 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 24, 24, 22, 21, 20, 21,
+ 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17,
+ 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22,
+ 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19,
+ 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16,
+ 15, 13,
+ /* Size 16x8 */
+ 32, 33, 34, 31, 29, 28, 24, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33,
+ 32, 29, 28, 26, 24, 22, 22, 23, 23, 22, 21, 21, 20, 20, 28, 27, 26, 24,
+ 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 23, 23, 23, 22, 22, 22,
+ 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 23, 22, 22, 22, 20, 19,
+ 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18,
+ 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16,
+ 16, 15, 15, 15, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 14,
+ 14, 13,
+ /* Size 16x32 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 21, 21, 21, 20, 20, 19, 18, 18, 33, 33,
+ 33, 30, 27, 27, 23, 22, 22, 22, 22, 20, 20, 20, 19, 19, 33, 33, 33, 30,
+ 27, 27, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 33, 33, 32, 30, 26, 26,
+ 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 34, 32, 32, 29, 26, 26, 23, 22,
+ 23, 23, 23, 22, 21, 21, 20, 20, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23,
+ 23, 22, 21, 21, 20, 20, 31, 30, 29, 28, 24, 24, 22, 22, 22, 23, 23, 22,
+ 22, 21, 20, 20, 31, 29, 28, 27, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21,
+ 20, 20, 29, 28, 28, 26, 23, 23, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20,
+ 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26,
+ 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 25, 24, 24, 23,
+ 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 24, 24, 24, 23, 22, 22,
+ 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 23, 23, 23, 23, 22, 22, 20, 20,
+ 20, 20, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 18, 17, 17,
+ 21, 22, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 21, 22,
+ 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23,
+ 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 22, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19,
+ 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 22, 22, 22, 22, 20, 19, 18, 17,
+ 17, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16,
+ 16, 15, 14, 14, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15,
+ 14, 14, 19, 20, 21, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14,
+ 19, 20, 20, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20,
+ 20, 20, 21, 21, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20,
+ 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 18, 19, 20, 20, 20, 20,
+ 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 17, 19, 19, 19, 20, 20, 18, 18,
+ 17, 16, 16, 15, 14, 14, 13, 13,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33,
+ 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 33, 33, 33, 32, 32, 32, 29, 28,
+ 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21,
+ 21, 20, 20, 20, 20, 19, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23,
+ 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20,
+ 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27,
+ 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17,
+ 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22,
+ 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 15, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20,
+ 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+ 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 18, 19,
+ 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15,
+ 14, 14, 14, 14, 14, 13, 13, 13,
+ /* Size 4x16 */
+ 33, 28, 21, 19, 33, 27, 22, 20, 32, 26, 23, 21, 30, 24, 23, 21, 28, 23,
+ 23, 21, 26, 22, 23, 21, 24, 22, 21, 20, 22, 21, 19, 19, 22, 22, 19, 18,
+ 22, 22, 18, 17, 22, 22, 18, 17, 22, 22, 17, 16, 21, 22, 17, 15, 20, 21,
+ 17, 15, 20, 21, 16, 14, 19, 20, 16, 14,
+ /* Size 16x4 */
+ 33, 33, 32, 30, 28, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 28, 27,
+ 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 21, 22, 23, 23,
+ 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 19, 20, 21, 21, 21, 21,
+ 20, 19, 18, 17, 17, 16, 15, 15, 14, 14,
+ /* Size 8x32 */
+ 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 33, 33,
+ 27, 23, 22, 22, 20, 19, 33, 32, 26, 23, 22, 22, 21, 19, 34, 32, 26, 23,
+ 23, 23, 21, 20, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23,
+ 22, 20, 31, 28, 24, 22, 22, 22, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20,
+ 28, 26, 22, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 25, 24,
+ 22, 21, 21, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 19, 23, 23, 22, 20,
+ 20, 20, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 20, 19, 19,
+ 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 22, 22, 20, 19, 18, 18, 17,
+ 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 21, 23,
+ 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20,
+ 18, 17, 16, 15, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17,
+ 16, 14, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14,
+ 19, 20, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20,
+ 20, 19, 17, 16, 15, 13, 18, 20, 20, 19, 17, 16, 15, 13, 17, 19, 20, 18,
+ 17, 16, 14, 13,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 32,
+ 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22,
+ 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22,
+ 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 20, 21, 21, 21,
+ 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 15, 15, 15, 15, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20,
+ 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14,
+ 14, 13, 13, 13 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13,
+ /* Size 8x8 */
+ 33, 32, 32, 30, 29, 25, 22, 19, 32, 32, 32, 31, 30, 26, 23, 20, 32, 32,
+ 30, 29, 28, 25, 23, 20, 30, 31, 29, 26, 24, 22, 20, 19, 29, 30, 28, 24,
+ 21, 19, 18, 17, 25, 26, 25, 22, 19, 17, 16, 15, 22, 23, 23, 20, 18, 16,
+ 14, 13, 19, 20, 20, 19, 17, 15, 13, 12,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 33, 32,
+ 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 33, 32, 32, 32,
+ 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 33, 32, 32, 32, 32, 31,
+ 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 33, 32, 32, 32, 31, 30, 30, 29,
+ 28, 27, 26, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26,
+ 26, 24, 23, 22, 21, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23,
+ 22, 21, 20, 19, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20,
+ 19, 18, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16,
+ 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 26, 27,
+ 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 23, 24, 25, 25,
+ 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23,
+ 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 21, 22, 22, 23, 22, 22, 21, 20,
+ 18, 17, 17, 15, 14, 14, 13, 13, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16,
+ 16, 14, 14, 13, 12, 12, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14,
+ 13, 13, 12, 11,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 26, 26, 25,
+ 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22,
+ 22, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, 22, 20, 20, 20,
+ 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30,
+ 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28,
+ 28, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25,
+ 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21,
+ 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 28, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24,
+ 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22,
+ 22, 21, 21, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28,
+ 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20,
+ 19, 19, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25,
+ 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 30, 30,
+ 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23,
+ 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 30, 30, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 18, 18, 18, 17, 17, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28,
+ 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 16, 16, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24,
+ 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 20, 20,
+ 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 26, 26, 27, 27,
+ 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26,
+ 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 24, 25, 25, 25, 26, 26, 26, 25, 25, 25, 24, 24,
+ 23, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15,
+ 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 20,
+ 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24,
+ 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18,
+ 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 14, 14, 14,
+ 13, 13, 13, 13, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20,
+ 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13,
+ 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17,
+ 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 19, 20, 20, 20,
+ 20, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 19, 19, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 11, 11, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11,
+ /* Size 4x8 */
+ 32, 32, 28, 20, 32, 31, 28, 21, 32, 30, 27, 21, 30, 28, 23, 19, 29, 27,
+ 21, 17, 26, 24, 19, 15, 22, 22, 17, 13, 20, 20, 16, 12,
+ /* Size 8x4 */
+ 32, 32, 32, 30, 29, 26, 22, 20, 32, 31, 30, 28, 27, 24, 22, 20, 28, 28,
+ 27, 23, 21, 19, 17, 16, 20, 21, 21, 19, 17, 15, 13, 12,
+ /* Size 8x16 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32,
+ 32, 31, 29, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 30, 30,
+ 28, 24, 23, 20, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23,
+ 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 28, 30, 28, 26, 21, 19, 18, 17,
+ 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 23, 25,
+ 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 22, 22, 21,
+ 18, 15, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 18, 19, 19, 19, 16, 14,
+ 13, 12,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 33, 32,
+ 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 32, 32, 32, 31,
+ 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 32, 31, 31, 31, 30, 28,
+ 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 28, 29, 29, 29, 28, 27, 26, 24,
+ 21, 21, 20, 19, 18, 18, 17, 16, 23, 24, 25, 25, 24, 24, 23, 21, 19, 18,
+ 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16,
+ 15, 14, 14, 13, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13,
+ 12, 12,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 32, 32, 32, 29, 28, 27, 23, 23, 22, 19, 19, 17, 33, 32,
+ 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 22, 20, 20, 18, 33, 32, 32, 32,
+ 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32,
+ 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 30,
+ 29, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28,
+ 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 31, 31, 31, 29, 29, 28, 25, 25,
+ 23, 21, 21, 19, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 23, 21,
+ 21, 19, 32, 32, 32, 31, 30, 30, 30, 28, 28, 27, 24, 24, 23, 20, 20, 19,
+ 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32,
+ 31, 31, 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 32, 31, 31,
+ 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 31, 31, 31, 29, 28,
+ 28, 26, 26, 25, 23, 23, 22, 20, 20, 19, 30, 30, 30, 30, 28, 28, 27, 24,
+ 24, 23, 21, 21, 20, 19, 19, 18, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23,
+ 21, 21, 20, 19, 19, 18, 29, 30, 30, 30, 28, 28, 26, 23, 23, 22, 20, 20,
+ 19, 18, 18, 17, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17,
+ 17, 16, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16,
+ 27, 28, 28, 28, 26, 26, 25, 21, 21, 20, 18, 18, 18, 16, 16, 15, 26, 27,
+ 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 26, 27, 28, 27,
+ 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 24, 26, 26, 26, 24, 24,
+ 23, 20, 20, 19, 17, 17, 16, 15, 15, 14, 23, 24, 25, 25, 24, 24, 23, 20,
+ 19, 18, 16, 16, 16, 14, 14, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18,
+ 16, 16, 16, 14, 14, 13, 22, 23, 23, 23, 23, 23, 22, 19, 18, 18, 16, 16,
+ 15, 14, 14, 13, 21, 22, 23, 23, 22, 22, 21, 19, 18, 17, 15, 15, 15, 13,
+ 13, 13, 21, 22, 22, 22, 22, 22, 21, 18, 18, 17, 15, 15, 14, 13, 13, 13,
+ 19, 20, 21, 21, 21, 21, 20, 18, 17, 17, 14, 14, 14, 13, 13, 12, 19, 20,
+ 21, 21, 20, 20, 20, 17, 17, 16, 14, 14, 14, 12, 12, 12, 19, 20, 20, 20,
+ 20, 20, 19, 17, 17, 16, 14, 14, 13, 12, 12, 12, 18, 19, 19, 19, 19, 19,
+ 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, 18, 19, 19, 19, 19, 19, 19, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26,
+ 24, 24, 23, 22, 22, 20, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23,
+ 22, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 22, 21, 21, 20,
+ 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28,
+ 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26,
+ 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31,
+ 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23,
+ 22, 21, 21, 20, 20, 19, 19, 19, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28,
+ 27, 27, 26, 24, 24, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 17, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24,
+ 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 21, 21,
+ 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 23, 24, 24, 24,
+ 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17,
+ 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24,
+ 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15,
+ 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13,
+ 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 20,
+ 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 11, 11,
+ /* Size 4x16 */
+ 33, 32, 27, 19, 32, 32, 28, 20, 32, 32, 28, 21, 32, 31, 28, 21, 32, 30,
+ 27, 20, 32, 29, 26, 21, 31, 28, 25, 20, 30, 28, 23, 19, 29, 27, 21, 17,
+ 28, 26, 20, 16, 27, 26, 20, 16, 24, 24, 18, 14, 23, 23, 18, 14, 22, 22,
+ 17, 13, 20, 20, 16, 12, 19, 19, 15, 12,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 19, 32, 32,
+ 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 20, 19, 27, 28, 28, 28,
+ 27, 26, 25, 23, 21, 20, 20, 18, 18, 17, 16, 15, 19, 20, 21, 21, 20, 21,
+ 20, 19, 17, 16, 16, 14, 14, 13, 12, 12,
+ /* Size 8x32 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 22, 20, 33, 32,
+ 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31,
+ 29, 25, 23, 21, 33, 32, 32, 31, 30, 25, 23, 21, 33, 32, 31, 31, 29, 25,
+ 23, 21, 32, 32, 31, 30, 28, 24, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20,
+ 32, 32, 30, 29, 28, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31,
+ 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27,
+ 24, 21, 20, 19, 30, 30, 28, 27, 24, 21, 20, 19, 29, 30, 28, 26, 23, 20,
+ 19, 18, 28, 30, 28, 26, 21, 19, 18, 17, 28, 30, 28, 26, 21, 19, 18, 17,
+ 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 26, 28,
+ 26, 24, 20, 18, 17, 16, 24, 26, 24, 23, 20, 17, 16, 15, 23, 25, 24, 23,
+ 19, 16, 16, 14, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16,
+ 15, 14, 21, 23, 22, 21, 18, 15, 15, 13, 21, 22, 22, 21, 18, 15, 14, 13,
+ 19, 21, 21, 20, 17, 14, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 19, 20,
+ 20, 19, 17, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12, 18, 19, 19, 19,
+ 16, 14, 13, 12,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26,
+ 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22,
+ 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28,
+ 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 21, 21, 20, 20, 19,
+ 19, 19, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23,
+ 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 23, 24,
+ 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18,
+ 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21,
+ 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16,
+ /* Size 8x8 */
+ 33, 33, 29, 24, 21, 22, 21, 20, 33, 32, 28, 24, 22, 23, 22, 21, 29, 28,
+ 25, 23, 22, 23, 22, 21, 24, 24, 23, 21, 20, 21, 20, 20, 21, 22, 22, 20,
+ 19, 19, 19, 19, 22, 23, 23, 21, 19, 18, 17, 17, 21, 22, 22, 20, 19, 17,
+ 17, 16, 20, 21, 21, 20, 19, 17, 16, 15,
+ /* Size 16x16 */
+ 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 34, 33, 33, 32,
+ 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 33, 32, 32, 31, 28, 26,
+ 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 31, 30, 29, 28, 26, 24, 23, 23,
+ 22, 22, 22, 23, 22, 22, 22, 21, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22,
+ 22, 23, 22, 22, 22, 21, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22,
+ 22, 22, 21, 21, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20,
+ 20, 20, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22,
+ 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23,
+ 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22,
+ 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20,
+ 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18,
+ 18, 17, 16, 16, 16, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 27, 25, 25, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 33, 33, 33, 30, 30, 29, 27, 27, 26, 24, 24, 23, 21, 21, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 33, 33, 33, 33, 33, 33, 32, 30,
+ 30, 29, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 32, 30, 30, 28, 27, 27,
+ 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20,
+ 20, 20, 34, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23,
+ 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 34, 33,
+ 33, 33, 33, 32, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23,
+ 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 33, 33, 32, 32, 32, 32,
+ 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 29, 27, 27, 26,
+ 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22,
+ 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 28, 27, 26, 26, 24, 24, 23, 23,
+ 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21,
+ 30, 29, 29, 28, 28, 28, 28, 26, 26, 25, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27,
+ 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 21, 21, 27, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21,
+ 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 25, 24,
+ 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22,
+ 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20,
+ 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 20, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 21, 21,
+ 21, 21, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 15, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
+ 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 14, 14, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14,
+ /* Size 4x8 */
+ 33, 27, 22, 20, 33, 26, 22, 21, 28, 23, 22, 22, 24, 22, 20, 20, 22, 21,
+ 19, 19, 22, 22, 19, 17, 21, 22, 19, 16, 20, 21, 18, 15,
+ /* Size 8x4 */
+ 33, 33, 28, 24, 22, 22, 21, 20, 27, 26, 23, 22, 21, 22, 22, 21, 22, 22,
+ 22, 20, 19, 19, 19, 18, 20, 21, 22, 20, 19, 17, 16, 15,
+ /* Size 8x16 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32,
+ 27, 26, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 28, 25, 23,
+ 22, 22, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22,
+ 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19,
+ 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23,
+ 23, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21,
+ 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 20, 21, 20, 19, 17,
+ 16, 15,
+ /* Size 16x8 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 29, 28, 27, 27,
+ 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 26, 25, 23, 22,
+ 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19,
+ 18, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16,
+ 16, 15,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 29, 28, 27, 22, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 33, 32, 28, 27, 26, 22, 22, 22, 21, 21, 21, 20, 20, 19, 33, 33, 33, 32,
+ 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 33, 33, 33, 32, 28, 27,
+ 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 34, 33, 32, 32, 27, 26, 26, 23,
+ 22, 22, 23, 23, 22, 21, 21, 20, 34, 33, 32, 31, 27, 26, 25, 23, 22, 22,
+ 23, 23, 22, 21, 21, 20, 33, 32, 31, 31, 27, 26, 25, 23, 22, 22, 23, 23,
+ 22, 21, 21, 20, 31, 29, 29, 28, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22,
+ 22, 21, 31, 29, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 30, 28, 28, 28, 24, 23, 23, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26,
+ 26, 25, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25,
+ 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 26, 26, 25, 24, 22, 22,
+ 22, 21, 21, 21, 22, 22, 22, 21, 21, 20, 24, 24, 24, 24, 22, 22, 21, 20,
+ 20, 20, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20,
+ 21, 21, 20, 20, 20, 20, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20,
+ 20, 20, 20, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 21, 22,
+ 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 22, 22,
+ 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 23, 23, 22, 22,
+ 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 21, 22, 23, 23, 23, 22, 22, 20,
+ 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19,
+ 18, 18, 17, 17, 17, 16, 20, 22, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17,
+ 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16,
+ 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16,
+ 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 20, 21,
+ 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 19, 20, 21, 21,
+ 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21,
+ 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 19, 20, 20, 20, 21, 21, 20, 19,
+ 19, 18, 17, 17, 16, 15, 15, 14,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 33, 33, 32, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 33, 33, 33, 33, 32, 32, 31, 29,
+ 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 21, 21, 21, 20, 20, 33, 32, 32, 32, 32, 31, 31, 28, 28, 28, 25, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21,
+ 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22,
+ 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27,
+ 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25,
+ 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 21, 21, 21, 21, 21, 21, 20, 20, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 21, 22, 22,
+ 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 21, 22, 22, 23, 23, 23, 23,
+ 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17,
+ 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20,
+ 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18,
+ 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 19, 20, 20, 20, 20,
+ 20, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 14, 14,
+ /* Size 4x16 */
+ 33, 28, 21, 20, 33, 27, 22, 20, 33, 26, 22, 21, 32, 26, 22, 21, 29, 24,
+ 22, 22, 26, 22, 22, 22, 26, 22, 21, 21, 24, 22, 20, 20, 22, 21, 19, 19,
+ 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 17, 22, 22, 19, 16, 21, 22,
+ 19, 16, 21, 22, 18, 16, 20, 21, 18, 15,
+ /* Size 16x4 */
+ 33, 33, 33, 32, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 28, 27,
+ 26, 26, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22,
+ 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 20, 20, 21, 21, 22, 22,
+ 21, 20, 19, 18, 18, 17, 16, 16, 16, 15,
+ /* Size 8x32 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 21, 21, 20, 33, 33,
+ 28, 26, 22, 22, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26,
+ 22, 23, 22, 21, 34, 32, 27, 25, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23,
+ 22, 21, 31, 29, 25, 24, 22, 23, 22, 22, 31, 28, 25, 23, 22, 22, 22, 22,
+ 30, 28, 24, 23, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 28, 26,
+ 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21,
+ 20, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 20, 23, 23, 22, 21, 20, 20,
+ 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 21, 21, 19, 19, 19, 19,
+ 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 22,
+ 22, 21, 19, 18, 18, 18, 21, 23, 22, 22, 19, 18, 18, 17, 21, 23, 23, 22,
+ 19, 18, 17, 17, 21, 23, 22, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17,
+ 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16,
+ 20, 21, 22, 21, 19, 17, 16, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 21,
+ 21, 21, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15, 19, 20, 21, 20,
+ 19, 17, 16, 15,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25,
+ 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21,
+ 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21,
+ 22, 22, 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 16, 15, 15, 15 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16,
+ /* Size 8x8 */
+ 33, 33, 32, 32, 30, 28, 24, 22, 33, 32, 32, 32, 30, 28, 25, 23, 32, 32,
+ 31, 30, 29, 27, 24, 23, 32, 32, 30, 29, 28, 26, 24, 22, 30, 30, 29, 28,
+ 25, 23, 21, 20, 28, 28, 27, 26, 23, 20, 18, 17, 24, 25, 24, 24, 21, 18,
+ 16, 15, 22, 23, 23, 22, 20, 17, 15, 14,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32,
+ 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 33, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 33, 32, 32, 32, 31, 31, 31, 30,
+ 29, 28, 28, 26, 26, 24, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28,
+ 28, 26, 26, 24, 23, 23, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26,
+ 25, 24, 23, 22, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23,
+ 22, 22, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20,
+ 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 28, 29,
+ 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 26, 27, 27, 28,
+ 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 25, 26, 26, 27, 26, 26,
+ 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 23, 24, 24, 25, 24, 24, 24, 23,
+ 22, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19,
+ 18, 17, 17, 16, 15, 15, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17,
+ 16, 15, 15, 14,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28,
+ 26, 26, 26, 24, 24, 23, 22, 22, 22, 20, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24,
+ 24, 24, 23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22,
+ 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 23, 23, 21, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30,
+ 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28,
+ 27, 25, 25, 25, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 24,
+ 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27,
+ 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 25, 24,
+ 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28,
+ 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26,
+ 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 20, 30, 30, 30, 30, 30, 31,
+ 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23,
+ 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29,
+ 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21,
+ 20, 20, 20, 19, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 27, 26, 26, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 29, 29, 29, 29, 30, 30, 30, 30, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24,
+ 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 28, 29, 29, 29,
+ 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29,
+ 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 19, 18, 18, 18, 18, 27, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 26,
+ 26, 26, 25, 23, 23, 23, 21, 20, 20, 20, 20, 20, 19, 18, 18, 18, 18, 17,
+ 17, 17, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23,
+ 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 26, 26,
+ 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20,
+ 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 25, 26, 26, 26, 26, 27,
+ 27, 26, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 20, 20, 20, 19, 19, 19,
+ 18, 17, 17, 17, 17, 16, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24,
+ 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16,
+ 16, 15, 15, 15, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24,
+ 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15,
+ 23, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21,
+ 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 22, 22, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18,
+ 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 21, 22, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15,
+ 15, 15, 15, 14, 14, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14,
+ 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 19,
+ 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13,
+ /* Size 4x8 */
+ 33, 32, 29, 24, 32, 31, 30, 25, 32, 30, 28, 24, 32, 29, 27, 24, 30, 28,
+ 24, 21, 28, 26, 21, 18, 24, 24, 19, 16, 22, 22, 18, 15,
+ /* Size 8x4 */
+ 33, 32, 32, 32, 30, 28, 24, 22, 32, 31, 30, 29, 28, 26, 24, 22, 29, 30,
+ 28, 27, 24, 21, 19, 18, 24, 25, 24, 24, 21, 18, 16, 15,
+ /* Size 8x16 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32,
+ 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 31, 30,
+ 29, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27,
+ 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20,
+ 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 26, 28,
+ 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24,
+ 20, 19, 16, 16, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18,
+ 15, 15,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 33, 32, 32, 32,
+ 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 32, 32, 32, 31, 30, 30,
+ 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 29, 29, 30, 30, 29, 28, 28, 26,
+ 25, 23, 22, 21, 21, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22,
+ 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18,
+ 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16,
+ 15, 15,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 23, 23, 22, 19, 33, 33,
+ 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 22, 20, 33, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32,
+ 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 26, 25, 25, 23, 20, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27,
+ 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 29, 29, 27, 25, 25,
+ 23, 21, 33, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 26, 24, 24, 23, 21,
+ 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32,
+ 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32,
+ 31, 29, 29, 29, 28, 28, 28, 26, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29,
+ 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28,
+ 28, 27, 27, 25, 24, 24, 23, 21, 32, 31, 31, 31, 30, 28, 28, 28, 26, 26,
+ 26, 24, 23, 23, 22, 20, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23,
+ 21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21,
+ 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 24, 24, 24, 22, 21, 21, 20, 19,
+ 29, 29, 30, 30, 28, 27, 27, 26, 23, 22, 22, 20, 20, 20, 19, 17, 28, 29,
+ 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 28, 29, 30, 30,
+ 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 27, 28, 28, 28, 28, 26,
+ 26, 25, 22, 21, 21, 19, 18, 18, 18, 16, 26, 27, 28, 28, 26, 26, 26, 24,
+ 21, 20, 20, 19, 18, 18, 17, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20,
+ 20, 19, 18, 18, 17, 16, 25, 26, 26, 26, 26, 25, 25, 24, 21, 20, 20, 18,
+ 17, 17, 17, 15, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16,
+ 16, 14, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14,
+ 23, 24, 24, 24, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 15, 14, 22, 23,
+ 23, 23, 23, 23, 23, 22, 19, 18, 18, 17, 16, 16, 15, 14, 21, 22, 23, 23,
+ 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 21, 22, 23, 23, 23, 22,
+ 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 20, 21, 22, 22, 21, 21, 21, 20,
+ 18, 18, 18, 16, 15, 15, 14, 13,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28,
+ 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25,
+ 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23,
+ 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 22, 21, 21, 20, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28,
+ 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18,
+ 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24,
+ 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29,
+ 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 26, 26, 26, 26, 26, 27, 27, 27,
+ 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 20, 20, 20, 19, 19, 19, 18, 17,
+ 17, 17, 17, 17, 17, 16, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24,
+ 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15,
+ 15, 15, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21,
+ 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 19, 20, 20, 20, 20, 21,
+ 21, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 19, 17, 17, 17, 16, 16, 16,
+ 15, 14, 14, 14, 14, 13, 13, 13,
+ /* Size 4x16 */
+ 33, 32, 28, 23, 32, 32, 29, 24, 32, 32, 29, 25, 32, 31, 30, 25, 32, 30,
+ 28, 24, 32, 30, 28, 24, 32, 29, 27, 24, 31, 28, 26, 23, 30, 28, 24, 21,
+ 29, 27, 22, 20, 29, 27, 21, 19, 27, 26, 20, 18, 26, 25, 20, 17, 24, 24,
+ 19, 16, 23, 23, 18, 16, 22, 22, 18, 15,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 32, 32,
+ 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 28, 29, 29, 30,
+ 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24,
+ 24, 23, 21, 20, 19, 18, 17, 16, 16, 15,
+ /* Size 8x32 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 22, 33, 32,
+ 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32,
+ 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 30,
+ 25, 23, 33, 32, 32, 31, 30, 29, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23,
+ 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32,
+ 31, 29, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 29,
+ 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24,
+ 21, 20, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 24, 24, 21, 20,
+ 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 28, 30,
+ 28, 27, 22, 21, 19, 18, 27, 28, 28, 26, 22, 21, 18, 18, 26, 28, 26, 26,
+ 21, 20, 18, 17, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20,
+ 17, 17, 23, 25, 24, 24, 20, 19, 16, 16, 23, 25, 24, 24, 20, 19, 16, 16,
+ 23, 24, 24, 24, 20, 19, 16, 15, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23,
+ 23, 22, 19, 18, 15, 15, 21, 23, 23, 22, 19, 18, 15, 15, 20, 22, 21, 21,
+ 18, 18, 15, 14,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24,
+ 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 21, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25,
+ 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29,
+ 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21,
+ 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 23, 24, 24, 24, 25, 25,
+ 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18,
+ 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15,
+ 15, 15, 15, 14 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17,
+ /* Size 8x8 */
+ 33, 33, 30, 28, 24, 21, 22, 21, 33, 32, 29, 26, 24, 22, 23, 22, 30, 29,
+ 26, 24, 23, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 24, 24, 23, 22,
+ 21, 20, 20, 20, 21, 22, 22, 22, 20, 19, 19, 19, 22, 23, 23, 23, 20, 19,
+ 18, 17, 21, 22, 22, 22, 20, 19, 17, 17,
+ /* Size 16x16 */
+ 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 33, 33,
+ 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 33, 33, 33, 33,
+ 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 32, 30, 29,
+ 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 31, 30, 30, 30, 28, 27, 24, 24,
+ 23, 22, 22, 22, 22, 23, 22, 22, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22,
+ 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22,
+ 22, 23, 22, 22, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22,
+ 22, 22, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20,
+ 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 21, 22,
+ 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23,
+ 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22,
+ 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 21, 22, 22, 23, 23, 23, 23, 22,
+ 21, 20, 19, 19, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19,
+ 19, 18, 18, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18,
+ 18, 17, 17, 17,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 25, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28,
+ 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24,
+ 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 34, 33,
+ 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 34, 33, 33, 33, 33, 32,
+ 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23,
+ 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28,
+ 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24,
+ 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22,
+ 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30,
+ 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 27, 27, 27,
+ 25, 25, 25, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 27, 26,
+ 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23,
+ 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20,
+ 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ /* Size 4x8 */
+ 33, 27, 22, 21, 33, 26, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 24, 22,
+ 20, 20, 22, 22, 19, 19, 22, 22, 19, 18, 21, 22, 19, 17,
+ /* Size 8x4 */
+ 33, 33, 29, 26, 24, 22, 22, 21, 27, 26, 24, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 20, 19, 19, 19, 21, 23, 22, 23, 20, 19, 18, 17,
+ /* Size 8x16 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32,
+ 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24,
+ 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 28, 26, 24, 22, 22, 22,
+ 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20,
+ 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22,
+ 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22,
+ 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19,
+ 17, 17,
+ /* Size 16x8 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 33, 33,
+ 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 31, 30, 30, 29,
+ 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17,
+ 17, 17,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 23, 21, 21, 21, 21, 21, 20, 20, 33, 33,
+ 33, 33, 31, 27, 27, 26, 23, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33,
+ 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 33, 33, 30, 27,
+ 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 32, 32, 30, 26, 26, 26,
+ 23, 22, 22, 22, 22, 22, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22,
+ 22, 23, 23, 23, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23,
+ 23, 23, 22, 21, 33, 32, 31, 31, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23,
+ 22, 21, 31, 30, 29, 29, 28, 24, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22,
+ 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 31, 29,
+ 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27,
+ 25, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22,
+ 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21,
+ 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21,
+ 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22,
+ 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22,
+ 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 21,
+ 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19,
+ 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 22, 22, 22, 22, 20, 19, 19, 19,
+ 18, 18, 18, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17,
+ 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 20, 21,
+ 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22,
+ 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22,
+ 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21,
+ 20, 19, 19, 18, 17, 17, 17, 16,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 32, 32, 32, 31,
+ 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23,
+ 23, 23, 22, 22, 22, 22, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27,
+ 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22,
+ 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27,
+ 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23,
+ 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+ 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23,
+ 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17,
+ 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 21, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 16, 16,
+ /* Size 4x16 */
+ 33, 28, 21, 21, 33, 27, 22, 22, 33, 26, 22, 22, 33, 26, 22, 23, 30, 24,
+ 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 26, 22, 21, 22, 24, 22, 20, 21,
+ 22, 21, 20, 19, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 19, 18, 22, 22,
+ 19, 18, 21, 22, 19, 17, 21, 22, 19, 17,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 28, 27,
+ 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22,
+ 23, 22, 21, 19, 19, 18, 18, 18, 17, 17,
+ /* Size 8x32 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 31, 27, 23, 22, 21, 21, 33, 33,
+ 30, 27, 23, 22, 22, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26,
+ 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 34, 32, 29, 26, 23, 22,
+ 23, 22, 33, 31, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22,
+ 31, 28, 27, 24, 22, 22, 22, 22, 31, 28, 27, 24, 22, 22, 22, 22, 29, 27,
+ 25, 23, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 28, 26, 24, 22,
+ 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20,
+ 21, 20, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 20, 20, 20, 20,
+ 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22,
+ 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 19, 19, 21, 22, 22, 22,
+ 20, 19, 18, 18, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19,
+ 18, 18, 21, 23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17,
+ 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22,
+ 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22,
+ 20, 19, 17, 17,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22,
+ 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23,
+ 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 17, 17 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19,
+ /* Size 8x8 */
+ 33, 33, 32, 32, 31, 30, 28, 25, 33, 32, 32, 32, 31, 30, 28, 26, 32, 32,
+ 32, 31, 30, 29, 28, 26, 32, 32, 31, 30, 29, 28, 27, 25, 31, 31, 30, 29,
+ 28, 26, 25, 23, 30, 30, 29, 28, 26, 24, 22, 21, 28, 28, 28, 27, 25, 22,
+ 20, 19, 25, 26, 26, 25, 23, 21, 19, 18,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 26, 26, 24, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28,
+ 28, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26,
+ 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24,
+ 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 30, 30,
+ 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 28, 29, 29, 30,
+ 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28,
+ 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 26, 27, 27, 28, 28, 26, 26, 26,
+ 26, 23, 23, 20, 20, 19, 19, 18, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23,
+ 23, 20, 20, 19, 19, 18, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19,
+ 19, 18, 18, 16,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28,
+ 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26,
+ 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30,
+ 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28,
+ 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26,
+ 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 26, 25, 24, 24, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30,
+ 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25,
+ 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31,
+ 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23,
+ 23, 23, 23, 22, 22, 22, 29, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28,
+ 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 22, 22, 22, 22, 21,
+ 20, 20, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27,
+ 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29,
+ 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24,
+ 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29,
+ 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 27, 27, 27, 27, 26, 26, 26, 25, 23, 23, 23, 22, 21, 21, 21, 20, 20, 20,
+ 20, 19, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26,
+ 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24,
+ 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27,
+ 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22,
+ 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 25, 25, 25, 25, 23, 22, 22, 22, 21, 20, 20, 20, 19,
+ 18, 18, 18, 18, 17, 17, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24,
+ 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17,
+ 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24,
+ 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16,
+ /* Size 4x8 */
+ 33, 32, 30, 26, 32, 32, 30, 27, 32, 31, 30, 27, 32, 31, 28, 26, 31, 30,
+ 27, 24, 30, 28, 25, 22, 28, 27, 23, 20, 26, 26, 22, 18,
+ /* Size 8x4 */
+ 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 31, 31, 30, 28, 27, 26, 30, 30,
+ 30, 28, 27, 25, 23, 22, 26, 27, 27, 26, 24, 22, 20, 18,
+ /* Size 8x16 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32,
+ 32, 32, 32, 29, 29, 24, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31,
+ 31, 30, 30, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28,
+ 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24,
+ 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 28, 30,
+ 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 26, 28, 28, 26,
+ 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 23, 25, 25, 24, 24, 19,
+ 19, 16,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 32, 32, 32, 31, 31, 30,
+ 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 27, 27, 26, 26, 24, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24,
+ 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18,
+ 18, 16,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 26, 23, 23, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 30, 29, 29, 29, 26, 24, 24, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30,
+ 29, 29, 29, 27, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28,
+ 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25,
+ 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 27, 25, 25, 32, 32,
+ 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32,
+ 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30,
+ 30, 28, 28, 28, 28, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28,
+ 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27,
+ 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26,
+ 24, 24, 31, 31, 31, 31, 31, 30, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23,
+ 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30,
+ 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30,
+ 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 29, 30, 30, 30, 30, 28,
+ 28, 28, 28, 25, 23, 23, 23, 22, 20, 20, 28, 29, 30, 30, 30, 28, 27, 27,
+ 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24,
+ 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21,
+ 21, 20, 19, 19, 28, 28, 28, 28, 28, 27, 26, 26, 26, 23, 21, 21, 21, 20,
+ 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18,
+ 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27,
+ 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 25, 26, 26, 26,
+ 26, 26, 24, 24, 24, 22, 20, 20, 20, 18, 17, 17, 23, 24, 25, 25, 25, 24,
+ 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, 23, 24, 25, 25, 25, 24, 24, 24,
+ 24, 21, 19, 19, 19, 18, 16, 16,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26,
+ 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24,
+ 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 21, 21, 28, 29, 29, 29,
+ 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23,
+ 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30,
+ 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21,
+ 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28,
+ 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20,
+ 19, 19, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26,
+ 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 23, 24,
+ 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21,
+ 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25,
+ 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19,
+ 19, 18, 18, 18, 18, 17, 16, 16,
+ /* Size 4x16 */
+ 33, 32, 30, 26, 32, 32, 30, 27, 32, 32, 30, 27, 32, 32, 31, 28, 32, 32,
+ 31, 28, 32, 31, 29, 26, 32, 31, 29, 26, 32, 30, 28, 26, 32, 30, 28, 26,
+ 30, 29, 26, 23, 30, 29, 26, 23, 29, 28, 24, 20, 29, 28, 24, 20, 27, 26,
+ 23, 19, 27, 26, 23, 19, 24, 24, 21, 18,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 30, 30, 30, 31,
+ 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 21, 26, 27, 27, 28, 28, 26,
+ 26, 26, 26, 23, 23, 20, 20, 19, 19, 18,
+ /* Size 8x32 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 33, 33, 33, 32, 32, 29, 29, 24, 33, 32,
+ 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32,
+ 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 25, 33, 32, 32, 31, 31, 30,
+ 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25,
+ 33, 32, 32, 31, 31, 29, 29, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32,
+ 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30,
+ 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27,
+ 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 31, 31, 31, 28, 28, 26, 26, 23,
+ 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30,
+ 30, 28, 28, 24, 24, 21, 29, 30, 30, 28, 28, 23, 23, 20, 28, 30, 30, 27,
+ 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21,
+ 21, 19, 28, 28, 28, 26, 26, 21, 21, 18, 26, 28, 28, 26, 26, 20, 20, 18,
+ 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 25, 26,
+ 26, 24, 24, 20, 20, 17, 23, 25, 25, 24, 24, 19, 19, 16, 23, 25, 25, 24,
+ 24, 19, 19, 16,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24,
+ 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 28, 29,
+ 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24,
+ 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29,
+ 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25,
+ 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 16, 16 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19,
+ /* Size 8x8 */
+ 33, 33, 32, 29, 26, 23, 21, 21, 33, 33, 31, 28, 25, 23, 22, 22, 32, 31,
+ 29, 26, 24, 23, 22, 23, 29, 28, 26, 24, 23, 22, 22, 22, 26, 25, 24, 23,
+ 22, 21, 21, 22, 23, 23, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 21, 20,
+ 19, 19, 21, 22, 23, 22, 22, 20, 19, 18,
+ /* Size 16x16 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 33, 33,
+ 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 33, 33, 33, 33,
+ 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 34, 33, 33, 32, 32, 29,
+ 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 34, 33, 33, 32, 32, 29, 29, 26,
+ 26, 24, 24, 22, 22, 23, 23, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23,
+ 23, 22, 22, 22, 22, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22,
+ 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22,
+ 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23,
+ 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 25, 24,
+ 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22,
+ 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21,
+ 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19,
+ 19, 19, 19, 18,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 25, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30,
+ 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27,
+ 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31,
+ 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23,
+ 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 24,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26,
+ 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 29, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28,
+ 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 23, 23, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, 23, 23, 23, 23,
+ 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20,
+ 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19,
+ 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ /* Size 4x8 */
+ 33, 30, 24, 21, 33, 29, 24, 22, 31, 28, 23, 22, 28, 25, 22, 22, 26, 23,
+ 21, 21, 23, 22, 21, 20, 22, 22, 20, 19, 22, 22, 21, 19,
+ /* Size 8x4 */
+ 33, 33, 31, 28, 26, 23, 22, 22, 30, 29, 28, 25, 23, 22, 22, 22, 24, 24,
+ 23, 22, 21, 21, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19,
+ /* Size 8x16 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33,
+ 33, 27, 27, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26,
+ 26, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22,
+ 22, 22, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23,
+ 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 21, 22,
+ 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22,
+ 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19,
+ 19, 18,
+ /* Size 16x8 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 33, 33,
+ 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24,
+ 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 28, 27, 27, 26, 26, 24, 24, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19,
+ 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18,
+ 18, 18,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 31, 28, 28, 28, 24, 21, 21, 21, 21, 21, 21, 33, 33,
+ 33, 33, 33, 30, 28, 28, 28, 24, 22, 22, 22, 21, 21, 21, 33, 33, 33, 33,
+ 33, 30, 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30,
+ 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27,
+ 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 32, 32, 32, 29, 26, 26, 26, 24,
+ 22, 22, 22, 22, 22, 22, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22,
+ 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23,
+ 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23,
+ 32, 31, 30, 30, 30, 28, 25, 25, 25, 23, 22, 22, 22, 22, 23, 23, 31, 30,
+ 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28,
+ 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 27, 25, 23, 23,
+ 23, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22,
+ 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 26, 26, 25, 25, 25, 23, 22, 22, 22, 21, 21, 21, 21, 21, 22, 22,
+ 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24,
+ 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24,
+ 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 21, 21,
+ 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19,
+ 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19,
+ 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23,
+ 23, 22, 22, 22, 22, 21, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 23,
+ 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22,
+ 22, 21, 19, 19, 19, 18, 18, 18,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23,
+ 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28,
+ 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26,
+ 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 21,
+ 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22,
+ 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19,
+ 19, 19, 18, 18, 18, 18, 18, 18,
+ /* Size 4x16 */
+ 33, 31, 24, 21, 33, 30, 24, 22, 33, 30, 24, 22, 33, 29, 24, 23, 33, 29,
+ 24, 23, 30, 26, 23, 22, 30, 26, 23, 22, 27, 24, 22, 22, 27, 24, 22, 22,
+ 24, 23, 21, 20, 24, 23, 21, 20, 21, 22, 20, 19, 21, 22, 20, 19, 22, 22,
+ 20, 19, 22, 22, 20, 19, 22, 23, 21, 18,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 21, 21, 22, 22, 22, 31, 30,
+ 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 23, 23, 22,
+ 22, 22, 22, 20, 20, 19, 19, 19, 19, 18,
+ /* Size 8x32 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 28, 28, 22, 22, 21, 33, 33,
+ 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27,
+ 27, 22, 22, 22, 33, 32, 32, 26, 26, 22, 22, 22, 34, 32, 32, 26, 26, 22,
+ 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23,
+ 32, 30, 30, 25, 25, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28,
+ 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 29, 27, 27, 23,
+ 23, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22,
+ 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 26, 25, 25, 22, 22, 21, 21, 22,
+ 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24,
+ 24, 22, 22, 20, 20, 21, 23, 23, 23, 22, 22, 20, 20, 20, 21, 22, 22, 21,
+ 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19,
+ 19, 19, 21, 22, 22, 22, 22, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18,
+ 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23,
+ 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22,
+ 22, 19, 19, 18,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23,
+ 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 18 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22,
+ /* Size 8x8 */
+ 33, 33, 33, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 31, 30, 29, 33, 32,
+ 32, 32, 32, 31, 31, 30, 32, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 30,
+ 29, 29, 28, 27, 32, 31, 31, 30, 29, 28, 27, 26, 30, 30, 31, 29, 28, 27,
+ 26, 24, 29, 29, 30, 28, 27, 26, 24, 21,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30,
+ 29, 29, 28, 28, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 31, 31, 31, 31,
+ 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 30, 30, 30, 30, 31, 31,
+ 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28,
+ 28, 28, 28, 26, 25, 24, 23, 23, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27,
+ 27, 24, 24, 23, 21, 21, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24,
+ 24, 23, 21, 21,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 28, 27, 27, 27, 27, 26, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26,
+ 26, 25, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29,
+ 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 30, 30,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28,
+ 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30,
+ 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26,
+ 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 23, 23, 23, 23, 23,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28,
+ 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 21, 28, 29, 29, 29,
+ 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26,
+ 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30,
+ 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24,
+ 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21,
+ 21, 21, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 21, 21, 21, 21, 20,
+ /* Size 4x8 */
+ 33, 33, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 30, 28, 32, 31,
+ 29, 27, 31, 31, 28, 26, 30, 30, 28, 24, 29, 30, 27, 21,
+ /* Size 8x4 */
+ 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 32, 32,
+ 31, 30, 29, 28, 28, 27, 29, 29, 30, 28, 27, 26, 24, 21,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32,
+ 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32,
+ 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 30, 30,
+ 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28,
+ 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 31, 31,
+ 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29,
+ 28, 28, 24, 23, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27,
+ 22, 21,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 30, 30,
+ 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29,
+ 29, 28, 28, 28, 27, 27, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 22, 22, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23,
+ 21, 21,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29,
+ 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28,
+ 28, 28, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27,
+ 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32,
+ 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31,
+ 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 31, 31, 31, 31, 31,
+ 30, 29, 28, 28, 28, 28, 26, 26, 26, 26, 31, 31, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 27, 26, 25, 25, 25, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28,
+ 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24,
+ 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 24, 23, 23, 23,
+ 29, 29, 30, 30, 30, 30, 28, 28, 27, 27, 27, 25, 23, 22, 22, 22, 28, 29,
+ 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30,
+ 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30,
+ 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 27,
+ 26, 26, 26, 24, 22, 21, 21, 21,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30,
+ 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29,
+ 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22,
+ 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28,
+ 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27,
+ 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29,
+ 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24,
+ 24, 24, 23, 22, 21, 21, 21, 21,
+ /* Size 4x16 */
+ 33, 33, 32, 28, 33, 32, 32, 29, 32, 32, 32, 29, 32, 32, 32, 29, 32, 32,
+ 31, 30, 32, 32, 31, 30, 32, 32, 30, 28, 32, 32, 30, 28, 32, 32, 30, 28,
+ 32, 31, 29, 27, 32, 31, 29, 27, 31, 31, 28, 25, 30, 30, 28, 24, 30, 30,
+ 28, 23, 29, 30, 27, 21, 29, 30, 27, 21,
+ /* Size 16x4 */
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 28, 29, 29, 29, 30, 30,
+ 28, 28, 28, 27, 27, 25, 24, 23, 21, 21,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 33, 33, 33, 32, 32, 32, 29, 29, 33, 32,
+ 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32,
+ 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32,
+ 30, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30,
+ 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32,
+ 32, 31, 31, 31, 29, 29, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31,
+ 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30,
+ 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 31, 29, 29, 28, 27,
+ 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32,
+ 31, 30, 29, 29, 28, 27, 32, 31, 31, 30, 28, 28, 26, 26, 31, 31, 31, 29,
+ 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28,
+ 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23,
+ 29, 30, 30, 28, 27, 27, 23, 22, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29,
+ 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 28, 28, 28,
+ 26, 26, 22, 21,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 29, 29, 29, 29, 29, 29,
+ 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25,
+ 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30,
+ 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22,
+ 21, 21, 21, 21 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20,
+ /* Size 8x8 */
+ 33, 33, 34, 30, 28, 26, 24, 21, 33, 33, 33, 30, 28, 26, 24, 22, 34, 33,
+ 32, 29, 26, 25, 24, 22, 30, 30, 29, 26, 24, 23, 23, 22, 28, 28, 26, 24,
+ 22, 22, 22, 22, 26, 26, 25, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 21,
+ 21, 20, 21, 22, 22, 22, 22, 21, 20, 19,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 33, 33,
+ 33, 33, 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 33,
+ 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 33, 33, 33, 33, 33, 33,
+ 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29,
+ 28, 26, 26, 24, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26,
+ 26, 24, 24, 23, 22, 22, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23,
+ 23, 23, 22, 22, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22,
+ 22, 22, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22,
+ 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27,
+ 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 26, 25, 25, 25,
+ 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 25, 24, 24, 24, 24, 24,
+ 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 19, 19,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 30, 28,
+ 28, 28, 28, 27, 26, 25, 25, 25, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 31, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24,
+ 23, 22, 21, 21, 21, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30,
+ 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27,
+ 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29,
+ 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22,
+ 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25,
+ 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 33, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 27, 27, 27, 26, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26,
+ 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24,
+ 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 27, 26, 26, 26, 26, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 22, 27, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24,
+ 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 25, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21,
+ 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20,
+ 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19,
+ 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ /* Size 4x8 */
+ 33, 33, 28, 21, 33, 33, 27, 22, 33, 32, 26, 22, 30, 28, 24, 22, 28, 26,
+ 22, 22, 26, 25, 22, 21, 24, 24, 22, 20, 21, 22, 21, 19,
+ /* Size 8x4 */
+ 33, 33, 33, 30, 28, 26, 24, 21, 33, 33, 32, 28, 26, 25, 24, 22, 28, 27,
+ 26, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 21, 20, 19,
+ /* Size 8x16 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33,
+ 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29,
+ 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 31, 30, 29, 28, 24, 24,
+ 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22,
+ 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 25, 24,
+ 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23,
+ 22, 22, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21,
+ 20, 19,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 33, 33,
+ 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 33, 33, 33, 32,
+ 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 31, 30, 30, 30, 29, 29,
+ 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22,
+ 22, 22, 22, 22, 21, 21, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 26, 23, 21, 21, 21, 33, 33,
+ 33, 33, 33, 33, 31, 28, 28, 28, 28, 25, 23, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33,
+ 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28,
+ 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27,
+ 27, 25, 23, 22, 22, 22, 33, 33, 33, 32, 32, 32, 30, 28, 26, 26, 26, 25,
+ 23, 22, 22, 22, 34, 33, 33, 32, 32, 32, 30, 27, 26, 26, 26, 24, 23, 22,
+ 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22,
+ 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33,
+ 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 33, 32, 31, 31,
+ 31, 31, 28, 26, 25, 25, 25, 24, 23, 22, 22, 22, 31, 30, 30, 29, 29, 29,
+ 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 22, 22,
+ 22, 22, 28, 28, 27, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27,
+ 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26,
+ 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25,
+ 24, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 19, 19, 19,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26,
+ 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31,
+ 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28,
+ 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30,
+ 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23,
+ 23, 23, 23, 22, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27,
+ 27, 26, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22,
+ 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 26, 25, 25, 25, 25, 25, 25, 24,
+ 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 21, 21, 20, 20, 20, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20,
+ 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 20, 20, 19, 19, 19, 19,
+ /* Size 4x16 */
+ 33, 33, 28, 21, 33, 33, 27, 22, 33, 33, 27, 22, 33, 32, 26, 22, 33, 32,
+ 26, 22, 33, 32, 26, 22, 30, 29, 24, 22, 30, 28, 24, 22, 28, 28, 23, 22,
+ 27, 26, 22, 22, 27, 26, 22, 22, 25, 24, 22, 21, 24, 24, 22, 20, 23, 23,
+ 22, 20, 21, 22, 21, 19, 21, 22, 21, 19,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33,
+ 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 28, 27, 27, 26,
+ 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ /* Size 8x32 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 31, 28, 28, 23, 21, 33, 33,
+ 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30,
+ 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26,
+ 23, 22, 34, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22,
+ 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 33, 31,
+ 31, 28, 25, 25, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27,
+ 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24,
+ 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 27, 26, 24, 22, 22, 22, 22,
+ 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26,
+ 26, 24, 22, 22, 22, 22, 26, 26, 25, 24, 22, 22, 21, 21, 25, 24, 24, 23,
+ 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22,
+ 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20,
+ 22, 22, 22, 22, 21, 21, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22,
+ 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22,
+ 22, 22, 20, 19,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26,
+ 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28,
+ 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
+ 31, 31, 30, 29, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 30, 30,
+ 29, 28, 31, 31, 31, 31, 29, 29, 28, 27,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 29, 28, 28, 28, 27, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28,
+ 28, 28, 27, 26,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30,
+ 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29,
+ 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 28, 27, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27,
+ 26, 26, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26,
+ /* Size 4x8 */
+ 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32,
+ 31, 30, 32, 32, 30, 30, 32, 31, 30, 29, 31, 31, 29, 28,
+ /* Size 8x4 */
+ 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 31, 30, 30, 29, 32, 32, 32, 31, 30, 30, 29, 28,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31,
+ 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 30, 30, 30,
+ 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32,
+ 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31,
+ 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28,
+ 28, 27,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28,
+ 28, 27,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30,
+ 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 29, 29, 29, 29, 29, 28, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30,
+ 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29,
+ 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29,
+ 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28,
+ 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 27, 30, 30, 30, 30, 30, 30,
+ 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 30, 30, 30, 30, 30, 30, 30, 30,
+ 29, 28, 28, 28, 28, 28, 27, 26,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 28, 28, 28, 27, 27, 26, 26,
+ /* Size 4x16 */
+ 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 31, 31, 32, 32, 31, 30,
+ 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 29, 32, 31, 30, 29, 32, 31,
+ 30, 29, 31, 31, 29, 28, 30, 30, 28, 28,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 28, 32, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 30, 29, 29, 29, 28, 28,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 31, 33, 33,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32,
+ 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31,
+ 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32,
+ 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32,
+ 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 31,
+ 31, 30, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30,
+ 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32,
+ 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32,
+ 31, 29, 29, 29, 32, 32, 31, 31, 31, 29, 29, 28, 32, 32, 31, 31, 30, 29,
+ 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28,
+ 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 31, 31,
+ 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, 30, 30, 30, 30,
+ 29, 28, 28, 27,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 27, 27 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22,
+ /* Size 8x8 */
+ 33, 33, 33, 34, 30, 29, 28, 26, 33, 33, 33, 33, 30, 29, 27, 25, 33, 33,
+ 33, 33, 29, 28, 26, 25, 34, 33, 33, 32, 29, 28, 26, 24, 30, 30, 29, 29,
+ 26, 26, 24, 23, 29, 29, 28, 28, 26, 25, 23, 23, 28, 27, 26, 26, 24, 23,
+ 22, 22, 26, 25, 25, 24, 23, 23, 22, 21,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 29, 29, 28, 26, 26, 26, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29,
+ 29, 27, 26, 26, 25, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27,
+ 26, 26, 25, 24, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26,
+ 25, 24, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23,
+ 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 31, 30,
+ 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 29, 28, 28, 28,
+ 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26,
+ 24, 24, 24, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23,
+ 23, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22,
+ 22, 22, 21, 21,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28,
+ 28, 27, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 27, 26, 25,
+ 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31,
+ 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30,
+ 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27,
+ 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 27, 26,
+ 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24,
+ 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29,
+ 29, 29, 29, 28, 28, 26, 26, 26, 26, 26, 26, 25, 24, 24, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28,
+ 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26,
+ 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28,
+ 28, 28, 27, 26, 26, 26, 26, 25, 25, 24, 24, 24, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25,
+ 25, 25, 25, 24, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24,
+ 24, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29,
+ 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23,
+ 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26,
+ 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26,
+ 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24,
+ 24, 24, 23, 23, 23, 23, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 26,
+ 25, 25, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26,
+ 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 28, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 26, 26, 26, 25, 25, 25, 25, 25,
+ 25, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21,
+ /* Size 4x8 */
+ 33, 33, 29, 28, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 30, 28,
+ 26, 24, 29, 28, 24, 23, 27, 26, 23, 22, 25, 24, 23, 22,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 32, 32, 28, 28, 26, 24, 29, 28,
+ 28, 28, 26, 24, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33,
+ 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32,
+ 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26,
+ 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 31, 30, 29, 29, 28, 24, 24, 24,
+ 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 29, 28,
+ 27, 27, 25, 23, 23, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26,
+ 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 24, 24, 24, 24, 23, 22,
+ 22, 21,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 33, 33, 33, 33, 32, 32,
+ 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 27, 27, 25, 24, 24, 24, 23, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24,
+ 24, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23,
+ 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22,
+ 22, 21,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 27, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 26, 24, 33, 33, 33, 33,
+ 33, 33, 33, 32, 31, 29, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28,
+ 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27,
+ 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27,
+ 26, 24, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24,
+ 34, 33, 33, 32, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33,
+ 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32,
+ 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32,
+ 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, 32, 31,
+ 29, 28, 26, 26, 26, 26, 25, 24, 33, 33, 32, 32, 31, 31, 31, 31, 29, 27,
+ 26, 26, 26, 26, 25, 24, 32, 32, 31, 31, 30, 30, 30, 30, 28, 26, 25, 25,
+ 25, 25, 24, 23, 31, 31, 30, 29, 29, 29, 29, 29, 28, 26, 24, 24, 24, 24,
+ 24, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23,
+ 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30,
+ 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29,
+ 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 30, 29, 28, 28, 28, 28,
+ 28, 28, 26, 24, 23, 23, 23, 23, 23, 23, 29, 28, 28, 27, 27, 27, 27, 26,
+ 25, 24, 23, 23, 23, 23, 22, 22, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23,
+ 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22,
+ 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22,
+ 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22,
+ 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 26, 26,
+ 26, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 21, 26, 25, 25, 24,
+ 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 21, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26,
+ 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30,
+ 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28,
+ 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26,
+ 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 25,
+ 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 26, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 28, 28, 27, 27,
+ 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27,
+ 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25,
+ 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 21, 21, 21, 21,
+ /* Size 4x16 */
+ 33, 33, 29, 28, 33, 33, 29, 27, 33, 33, 28, 27, 33, 33, 28, 27, 33, 32,
+ 28, 26, 33, 32, 28, 26, 33, 32, 28, 26, 33, 31, 27, 26, 31, 29, 26, 24,
+ 30, 28, 26, 24, 30, 28, 26, 24, 28, 27, 24, 23, 27, 26, 23, 22, 27, 26,
+ 23, 22, 26, 25, 23, 22, 24, 24, 22, 22,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 27, 27, 26, 24, 33, 33,
+ 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 29, 29, 28, 28,
+ 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, 23, 22, 28, 27, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 23, 22, 22, 22, 22,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 28, 28, 26, 33, 33,
+ 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33,
+ 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27,
+ 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26,
+ 34, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33,
+ 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32,
+ 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 32, 31, 30, 30, 28, 25,
+ 25, 24, 31, 30, 29, 29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23,
+ 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29,
+ 28, 28, 27, 24, 24, 23, 30, 28, 28, 28, 26, 23, 23, 23, 29, 28, 27, 27,
+ 25, 23, 23, 22, 28, 27, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22,
+ 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22,
+ 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 26, 25,
+ 24, 24, 23, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, 24, 24, 24, 24,
+ 23, 22, 22, 21,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28,
+ 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28,
+ 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 28, 28,
+ 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 4x8 */
+ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32,
+ 31, 30,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ /* Size 4x16 */
+ 33, 33, 33, 32, 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32,
+ 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30,
+ 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32,
+ 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32,
+ 32, 32, 31, 30,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 34, 33, 31, 31, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33,
+ 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 29, 29, 34, 33, 33, 33,
+ 32, 32, 29, 29, 33, 32, 32, 32, 32, 31, 28, 28, 31, 30, 30, 29, 29, 28,
+ 26, 26, 31, 30, 30, 29, 29, 28, 26, 26,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31,
+ 30, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29,
+ 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29,
+ 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28,
+ 27, 26, 26, 26,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33,
+ 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 32, 32, 31, 30,
+ 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30,
+ 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31,
+ 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30,
+ 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 29,
+ 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29,
+ 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29,
+ 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28,
+ 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29,
+ 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 28,
+ 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26,
+ 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28,
+ 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26,
+ 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26,
+ 26, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26,
+ /* Size 4x8 */
+ 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32,
+ 32, 28, 33, 31, 31, 28, 30, 28, 28, 26, 30, 28, 28, 26,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 30, 30, 33, 33, 33, 32, 32, 31, 28, 28, 33, 33,
+ 33, 32, 32, 31, 28, 28, 30, 29, 29, 28, 28, 28, 26, 26,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33,
+ 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33,
+ 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32,
+ 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27,
+ 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 32,
+ 31, 31, 31, 31, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28,
+ 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28,
+ 27, 25,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 31, 29, 28, 28, 28, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 27, 27, 27, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25,
+ 25, 25,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 28, 28, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 31, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31,
+ 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29,
+ 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 28, 28, 26, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 31, 30, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28,
+ 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26,
+ 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 33, 33, 33, 32, 32, 31,
+ 31, 31, 31, 31, 31, 30, 29, 28, 27, 26, 33, 32, 32, 31, 31, 31, 31, 31,
+ 31, 31, 31, 29, 28, 28, 26, 25, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 29, 28, 27, 26, 25, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28,
+ 28, 26, 26, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26,
+ 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24,
+ 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30,
+ 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29,
+ 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28,
+ 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 30, 30, 29, 29, 28, 28, 28, 28,
+ 28, 28, 28, 27, 26, 26, 24, 23,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30,
+ 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29,
+ 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28,
+ 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28,
+ 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27,
+ 27, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27,
+ 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, 28, 28, 28, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25,
+ 24, 24, 24, 24, 24, 24, 24, 23,
+ /* Size 4x16 */
+ 33, 33, 33, 30, 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33,
+ 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28,
+ 33, 32, 32, 28, 33, 32, 32, 28, 32, 31, 31, 28, 31, 29, 29, 26, 30, 28,
+ 28, 26, 30, 28, 28, 26, 30, 28, 28, 26,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 30, 30, 29, 29, 29, 29,
+ 28, 28, 28, 28, 28, 28, 26, 26, 26, 26,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 29, 33, 33,
+ 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33,
+ 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33,
+ 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28,
+ 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33,
+ 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 33, 33, 33, 32,
+ 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32,
+ 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27,
+ 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33,
+ 32, 32, 32, 32, 29, 27, 33, 33, 32, 31, 31, 31, 29, 27, 33, 32, 31, 31,
+ 31, 31, 28, 26, 32, 31, 30, 30, 30, 30, 28, 26, 31, 30, 30, 29, 29, 29,
+ 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25,
+ 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30,
+ 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 30, 29, 28, 28,
+ 28, 28, 26, 24,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29,
+ 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28,
+ 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25,
+ 25, 25, 25, 24 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 34, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 33, 32, 32,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33,
+ 32, 32, 33, 33, 32, 32, 34, 33, 32, 32,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32,
+ 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33,
+ 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ },
+};
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
new file mode 100644
index 000000000..d1f52a660
--- /dev/null
+++ b/third_party/aom/av1/common/quant_common.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
+
+#include "aom/aom_codec.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+// Total number of QM sets stored
+#define QM_LEVEL_BITS 4
+#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
+/* Range of QMs is between the first and last value, with an offset applied to
+ * inter blocks. */
+#define DEFAULT_QM_Y 10
+#define DEFAULT_QM_U 11
+#define DEFAULT_QM_V 12
+#define DEFAULT_QM_FIRST 5
+#define DEFAULT_QM_LAST 9
+
+struct AV1Common;
+
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+ int base_qindex);
+// Reduce the large number of quantizers to a smaller number of levels for which
+// different matrices may be defined
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
+ return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
+}
+void av1_qm_init(struct AV1Common *cm);
+const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp,
+ TX_SIZE tx_size);
+const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
+ TX_SIZE tx_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_QUANT_COMMON_H_
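
(Illustrative note, not part of the patch: aom_get_qmlevel() above collapses the
8-bit quantizer index onto the small set of QM levels between first and last.
The standalone sketch below reuses the formula and constants from
quant_common.h; the values 5 and 9 correspond to DEFAULT_QM_FIRST and
DEFAULT_QM_LAST, and main() exists only for demonstration.)

    #include <stdio.h>

    #define MINQ 0
    #define MAXQ 255
    #define QINDEX_RANGE (MAXQ - MINQ + 1)

    /* Same mapping as aom_get_qmlevel() in quant_common.h. */
    static int get_qmlevel(int qindex, int first, int last) {
      return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
    }

    int main(void) {
      /* With first = 5 and last = 9, qindex 0..255 maps onto levels 5..9. */
      for (int qindex = 0; qindex <= MAXQ; qindex += 51) {
        printf("qindex %3d -> qm level %d\n", qindex, get_qmlevel(qindex, 5, 9));
      }
      return 0;
    }
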
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
new file mode 100644
index 000000000..3203efce4
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.c
@@ -0,0 +1,1162 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/obmc.h"
+
+#define USE_PRECOMPUTED_WEDGE_MASK 1
+#define USE_PRECOMPUTED_WEDGE_SIGN 1
+
+// This function determines whether or not to create a warped
+// prediction.
+int av1_allow_warp(const MB_MODE_INFO *const mbmi,
+ const WarpTypesAllowed *const warp_types,
+ const WarpedMotionParams *const gm_params,
+ int build_for_obmc, int x_scale, int y_scale,
+ WarpedMotionParams *final_warp_params) {
+ if (x_scale != SCALE_SUBPEL_SHIFTS || y_scale != SCALE_SUBPEL_SHIFTS)
+ return 0;
+
+ if (final_warp_params != NULL) *final_warp_params = default_warp_params;
+
+ if (build_for_obmc) return 0;
+
+ if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
+ if (final_warp_params != NULL)
+ memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params));
+ return 1;
+ } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
+ if (final_warp_params != NULL)
+ memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+ return 1;
+ }
+
+ return 0;
+}
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ const MB_MODE_INFO *mi, int build_for_obmc,
+ const MACROBLOCKD *xd, int can_use_previous) {
+ // Make sure the selected motion mode is valid for this configuration
+ assert_motion_mode_valid(mi->motion_mode, xd->global_motion, xd, mi,
+ can_use_previous);
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ WarpedMotionParams final_warp_params;
+ const int do_warp =
+ (w >= 8 && h >= 8 &&
+ av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]],
+ build_for_obmc, subpel_params->xs, subpel_params->ys,
+ &final_warp_params));
+ const int is_intrabc = mi->use_intrabc;
+ assert(IMPLIES(is_intrabc, !do_warp));
+
+ if (do_warp && xd->cur_frame_force_integer_mv == 0) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const struct buf_2d *const pre_buf = &pd->pre[ref];
+ av1_warp_plane(&final_warp_params,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+ pre_buf->buf0, pre_buf->width, pre_buf->height,
+ pre_buf->stride, dst, p_col, p_row, w, h, dst_stride,
+ pd->subsampling_x, pd->subsampling_y, conv_params);
+ } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
+ w, h, conv_params, interp_filters, is_intrabc,
+ xd->bd);
+ } else {
+ inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
+ conv_params, interp_filters, is_intrabc);
+ }
+}
+
+#if USE_PRECOMPUTED_WEDGE_MASK
+static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18,
+ 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27,
+ 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21,
+ 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+
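+// Sketch of what shift_copy below does: a positive shift moves the master row
+// right by `shift` samples and pads the start with src[0]; a negative shift
+// moves it left and pads the end with src[width - 1]. For example,
+// shift_copy({a, b, c, d}, dst, 2, 4) yields {a, a, a, b}.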
+static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
+ if (shift >= 0) {
+ memcpy(dst + shift, src, width - shift);
+ memset(dst, src[0], shift);
+ } else {
+ shift = -shift;
+ memcpy(dst, src + shift, width - shift);
+ memset(dst + width - shift, src[width - 1], shift);
+ }
+}
+#endif // USE_PRECOMPUTED_WEDGE_MASK
+
+#if USE_PRECOMPUTED_WEDGE_SIGN
+/* clang-format off */
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+};
+/* clang-format on */
+#else
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]);
+#endif // USE_PRECOMPUTED_WEDGE_SIGN
+
+// [negative][direction]
+DECLARE_ALIGNED(
+ 16, static uint8_t,
+ wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+
+// 4 * MAX_WEDGE_SQUARE is an easy-to-compute and fairly tight upper bound
+// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
+
+static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
+
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = {
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
+ wedge_masks[BLOCK_8X8] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
+ wedge_masks[BLOCK_8X16] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
+ wedge_masks[BLOCK_16X8] },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
+ wedge_masks[BLOCK_16X16] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
+ wedge_masks[BLOCK_16X32] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
+ wedge_masks[BLOCK_32X16] },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
+ wedge_masks[BLOCK_32X32] },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
+ wedge_masks[BLOCK_8X32] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
+ wedge_masks[BLOCK_32X8] },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+};
+
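+// Illustrative note on the offsets computed below: the codebook offsets are in
+// 1/8 units of the block size and shift the sampling window into the 64x64
+// master mask. E.g. for BLOCK_32X32 with {x_offset, y_offset} = {4, 4},
+// woff = hoff = (4 * 32) >> 3 = 16, so the returned pointer starts
+// MASK_MASTER_SIZE / 2 - 16 = 16 rows and columns into the master and the
+// wedge boundary passes through the centre of the 32x32 block.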
+static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
+ BLOCK_SIZE sb_type) {
+ const uint8_t *master;
+ const int bh = block_size_high[sb_type];
+ const int bw = block_size_wide[sb_type];
+ const wedge_code_type *a =
+ wedge_params_lookup[sb_type].codebook + wedge_index;
+ int woff, hoff;
+ const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
+
+ assert(wedge_index >= 0 &&
+ wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
+ woff = (a->x_offset * bw) >> 3;
+ hoff = (a->y_offset * bh) >> 3;
+ master = wedge_mask_obl[neg ^ wsignflip][a->direction] +
+ MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
+ MASK_MASTER_SIZE / 2 - woff;
+ return master;
+}
+
+const uint8_t *av1_get_compound_type_mask(
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
+ assert(is_masked_compound_type(comp_data->type));
+ (void)sb_type;
+ switch (comp_data->type) {
+ case COMPOUND_WEDGE:
+ return av1_get_contiguous_soft_mask(comp_data->wedge_index,
+ comp_data->wedge_sign, sb_type);
+ case COMPOUND_DIFFWTD: return comp_data->seg_mask;
+ default: assert(0); return NULL;
+ }
+}
+
+static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base,
+ const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h,
+ int w, ConvolveParams *conv_params, int bd) {
+ int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ int i, j, m, diff;
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]);
+ diff = ROUND_POWER_OF_TWO(diff, round);
+ m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
+ mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+ }
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_c(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ switch (mask_type) {
+ case DIFFWTD_38:
+ diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w,
+ conv_params, bd);
+ break;
+ case DIFFWTD_38_INV:
+ diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w,
+ conv_params, bd);
+ break;
+ default: assert(0);
+ }
+}
+
+static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride, int h, int w) {
+ int i, j, m, diff;
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ diff =
+ abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
+ m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
+ mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
+ }
+ }
+}
+
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int h, int w) {
+ switch (mask_type) {
+ case DIFFWTD_38:
+ diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
+ break;
+ case DIFFWTD_38_INV:
+ diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
+ break;
+ default: assert(0);
+ }
+}
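+// Worked example for the DIFFWTD masks above (DIFF_FACTOR and
+// AOM_BLEND_A64_MAX_ALPHA are assumed here to be 16 and 64): with
+// mask_base = 38 and a per-pixel difference of 64,
+// m = clamp(38 + 64 / 16, 0, 64) = 42; the _INV variant stores 64 - 42 = 22.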
+
+static AOM_FORCE_INLINE void diffwtd_mask_highbd(
+ uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0,
+ int src0_stride, const uint16_t *src1, int src1_stride, int h, int w,
+ const unsigned int bd) {
+ assert(bd >= 8);
+ if (bd == 8) {
+ if (which_inverse) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ const unsigned int bd_shift = bd - 8;
+ if (which_inverse) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff =
+ (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff =
+ (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+}
+
+void av1_build_compound_diffwtd_mask_highbd_c(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ switch (mask_type) {
+ case DIFFWTD_38:
+ diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
+ CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
+ break;
+ case DIFFWTD_38_INV:
+ diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
+ CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
+ break;
+ default: assert(0);
+ }
+}
+
+static void init_wedge_master_masks() {
+ int i, j;
+ const int w = MASK_MASTER_SIZE;
+ const int h = MASK_MASTER_SIZE;
+ const int stride = MASK_MASTER_STRIDE;
+// Note: index [0] stores the master masks, and index [1] their complements.
+#if USE_PRECOMPUTED_WEDGE_MASK
+ // Generate prototype by shifting the masters
+ int shift = h / 4;
+ for (i = 0; i < h; i += 2) {
+ shift_copy(wedge_master_oblique_even,
+ &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift,
+ MASK_MASTER_SIZE);
+ shift--;
+ shift_copy(wedge_master_oblique_odd,
+ &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift,
+ MASK_MASTER_SIZE);
+ memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
+ wedge_master_vertical,
+ MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
+ memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
+ wedge_master_vertical,
+ MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
+ }
+#else
+ static const double smoother_param = 2.85;
+ const int a[2] = { 2, 1 };
+ const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; ++j) {
+ int x = (2 * j + 1 - w);
+ int y = (2 * i + 1 - h);
+ double d = (a[0] * x + a[1] * y) / asqrt;
+ const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32);
+ wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
+ const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32);
+ wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx;
+ }
+ }
+#endif // USE_PRECOMPUTED_WEDGE_MASK
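+  // The loop below derives the remaining directions from the OBLIQUE63 and
+  // VERTICAL prototypes: OBLIQUE27 and HORIZONTAL are transposes,
+  // OBLIQUE117/OBLIQUE153 are complemented reflections, and index [1] holds
+  // the (1 << WEDGE_WEIGHT_BITS) complements of index [0].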
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
+ wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
+ wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+ wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - msk;
+ wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
+ wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - msk;
+ wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+ wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
+ const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
+ wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
+ wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
+ wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - mskx;
+ }
+ }
+}
+
+#if !USE_PRECOMPUTED_WEDGE_SIGN
+// If the signs for the wedges of the various block sizes are
+// inconsistent, flip the sign flag. Do it only once for every
+// wedge codebook.
+static void init_wedge_signs() {
+ BLOCK_SIZE sb_type;
+ memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
+ for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) {
+ const int bw = block_size_wide[sb_type];
+ const int bh = block_size_high[sb_type];
+ const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
+ const int wbits = wedge_params.bits;
+ const int wtypes = 1 << wbits;
+ int i, w;
+ if (wbits) {
+ for (w = 0; w < wtypes; ++w) {
+ // Get the mask master, i.e. index [0]
+ const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
+ int avg = 0;
+ for (i = 0; i < bw; ++i) avg += mask[i];
+ for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE];
+ avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
+ // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
+ // If default sign is 1:
+ // If sign requested is 0, we need to flip the sign and return
+ // the complement i.e. index [1] instead. If sign requested is 1
+ // we need to flip the sign and return index [0] instead.
+ // If default sign is 0:
+ // If sign requested is 0, we need to return index [0] the master
+ // if sign requested is 1, we need to return the complement index [1]
+ // instead.
+ wedge_params.signflip[w] = (avg < 32);
+ }
+ }
+ }
+}
+#endif // !USE_PRECOMPUTED_WEDGE_SIGN
+
+static void init_wedge_masks() {
+ uint8_t *dst = wedge_mask_buf;
+ BLOCK_SIZE bsize;
+ memset(wedge_masks, 0, sizeof(wedge_masks));
+ for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ const uint8_t *mask;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const wedge_params_type *wedge_params = &wedge_params_lookup[bsize];
+ const int wbits = wedge_params->bits;
+ const int wtypes = 1 << wbits;
+ int w;
+ if (wbits == 0) continue;
+ for (w = 0; w < wtypes; ++w) {
+ mask = get_wedge_mask_inplace(w, 0, bsize);
+ aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+ bh);
+ wedge_params->masks[0][w] = dst;
+ dst += bw * bh;
+
+ mask = get_wedge_mask_inplace(w, 1, bsize);
+ aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+ bh);
+ wedge_params->masks[1][w] = dst;
+ dst += bw * bh;
+ }
+ assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
+ }
+}
+
+// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
+void av1_init_wedge_masks() {
+ init_wedge_master_masks();
+#if !USE_PRECOMPUTED_WEDGE_SIGN
+ init_wedge_signs();
+#endif // !USE_PRECOMPUTED_WEDGE_SIGN
+ init_wedge_masks();
+}
+
+static void build_masked_compound_no_round(
+ uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w, ConvolveParams *conv_params, MACROBLOCKD *xd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, block_size_wide[sb_type],
+ w, h, subw, subh, conv_params, xd->bd);
+ else
+ aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, block_size_wide[sb_type], w,
+ h, subw, subh, conv_params);
+}
+
+void av1_make_masked_inter_predictor(
+ const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
+ const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
+ MACROBLOCKD *xd, int can_use_previous) {
+ MB_MODE_INFO *mi = xd->mi[0];
+ (void)dst;
+ (void)dst_stride;
+ mi->interinter_comp.seg_mask = xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *comp_data = &mi->interinter_comp;
+
+// We're going to call av1_make_inter_predictor to generate a prediction into
+// a temporary buffer, and then blend that temporary buffer with the
+// prediction from the other reference.
+//
+#define INTER_PRED_BYTES_PER_PIXEL 2
+
+ DECLARE_ALIGNED(32, uint8_t,
+ tmp_buf[INTER_PRED_BYTES_PER_PIXEL * MAX_SB_SQUARE]);
+#undef INTER_PRED_BYTES_PER_PIXEL
+
+ uint8_t *tmp_dst = get_buf_by_bd(xd, tmp_buf);
+
+ const int tmp_buf_stride = MAX_SB_SIZE;
+ CONV_BUF_TYPE *org_dst = conv_params->dst;
+ int org_dst_stride = conv_params->dst_stride;
+ CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
+ conv_params->dst = tmp_buf16;
+ conv_params->dst_stride = tmp_buf_stride;
+ assert(conv_params->do_average == 0);
+
+ // This will generate a prediction in tmp_buf for the second reference
+ av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_params,
+ sf, w, h, conv_params, interp_filters, warp_types,
+ p_col, p_row, plane, ref, mi, 0, xd,
+ can_use_previous);
+
+ if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+ av1_build_compound_diffwtd_mask_d16(
+ comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
+ tmp_buf16, tmp_buf_stride, h, w, conv_params, xd->bd);
+ }
+ build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride,
+ tmp_buf16, tmp_buf_stride, comp_data,
+ mi->sb_type, h, w, conv_params, xd);
+}
+
+void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
+ int order_idx, int *fwd_offset, int *bck_offset,
+ int *use_jnt_comp_avg, int is_compound) {
+ assert(fwd_offset != NULL && bck_offset != NULL);
+ if (!is_compound || mbmi->compound_idx) {
+ *use_jnt_comp_avg = 0;
+ return;
+ }
+
+ *use_jnt_comp_avg = 1;
+ const int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx;
+ const int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx;
+ const int cur_frame_index = cm->cur_frame->cur_frame_offset;
+ int bck_frame_index = 0, fwd_frame_index = 0;
+
+ if (bck_idx >= 0) {
+ bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset;
+ }
+
+ if (fwd_idx >= 0) {
+ fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset;
+ }
+
+ int d0 = clamp(abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)),
+ 0, MAX_FRAME_DISTANCE);
+ int d1 = clamp(abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)),
+ 0, MAX_FRAME_DISTANCE);
+
+ const int order = d0 <= d1;
+
+ if (d0 == 0 || d1 == 0) {
+ *fwd_offset = quant_dist_lookup_table[order_idx][3][order];
+ *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order];
+ return;
+ }
+
+ int i;
+ for (i = 0; i < 3; ++i) {
+ int c0 = quant_dist_weight[i][order];
+ int c1 = quant_dist_weight[i][!order];
+ int d0_c0 = d0 * c0;
+ int d1_c1 = d1 * c1;
+ if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
+ }
+
+ *fwd_offset = quant_dist_lookup_table[order_idx][i][order];
+ *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
+}
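+// Sketch of the selection above (descriptive only): d0 and d1 are the clamped
+// temporal distances to the forward and backward references. If either is
+// zero, the last (index 3) entry of quant_dist_lookup_table is used;
+// otherwise the loop scans quant_dist_weight and stops at the first row whose
+// weighted products d0 * c0 and d1 * c1 cross over relative to the d0/d1
+// ordering, and that row index selects the offset pair.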
+
+void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const int plane_start, const int plane_end) {
+  // We use AOMMIN(plane_end, MAX_MB_PLANE) instead of plane_end to quiet
+  // the static analysis warnings.
+ for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) {
+ struct macroblockd_plane *const pd = &planes[i];
+ const int is_uv = i > 0;
+ setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv],
+ src->crop_heights[is_uv], src->strides[is_uv], mi_row,
+ mi_col, NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+}
+
+void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *sf,
+ const int num_planes) {
+ if (src != NULL) {
+ // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+ // the static analysis warnings.
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const int is_uv = i > 0;
+ setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[i],
+ src->crop_widths[is_uv], src->crop_heights[is_uv],
+ src->strides[is_uv], mi_row, mi_col, sf,
+ pd->subsampling_x, pd->subsampling_y);
+ }
+ }
+}
+
+// obmc_mask_N[overlap_position]
+static const uint8_t obmc_mask_1[1] = { 64 };
+
+static const uint8_t obmc_mask_2[2] = { 45, 64 };
+
+static const uint8_t obmc_mask_4[4] = { 39, 50, 59, 64 };
+
+static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 };
+
+static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54,
+ 56, 58, 60, 61, 64, 64, 64, 64 };
+
+static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44,
+ 45, 47, 48, 50, 51, 52, 53, 55,
+ 56, 57, 58, 59, 60, 60, 61, 62,
+ 64, 64, 64, 64, 64, 64, 64, 64 };
+
+static const uint8_t obmc_mask_64[64] = {
+ 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
+ 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
+ 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
+ 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+
+const uint8_t *av1_get_obmc_mask(int length) {
+ switch (length) {
+ case 1: return obmc_mask_1;
+ case 2: return obmc_mask_2;
+ case 4: return obmc_mask_4;
+ case 8: return obmc_mask_8;
+ case 16: return obmc_mask_16;
+ case 32: return obmc_mask_32;
+ case 64: return obmc_mask_64;
+ default: assert(0); return NULL;
+ }
+}
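+// Usage note (assuming the usual a64 blend semantics, i.e.
+// ROUND_POWER_OF_TWO(m * v0 + (64 - m) * v1, 6)): when one of these masks is
+// passed to aom_blend_a64_vmask/hmask below, entries equal to 64 keep the
+// current block's prediction unchanged, while the smaller leading entries mix
+// in more of the neighbouring block's prediction near the shared edge.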
+
+static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc,
+ uint8_t mi_hw, MB_MODE_INFO *mi,
+ void *fun_ctxt, const int num_planes) {
+ (void)xd;
+ (void)rel_mi_rc;
+ (void)mi_hw;
+ (void)mi;
+ ++*(int *)fun_ctxt;
+ (void)num_planes;
+}
+
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+
+ mbmi->overlappable_neighbors[0] = 0;
+ mbmi->overlappable_neighbors[1] = 0;
+
+ if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return;
+
+ foreach_overlappable_nb_above(cm, xd, mi_col, INT_MAX, increment_int_ptr,
+ &mbmi->overlappable_neighbors[0]);
+ foreach_overlappable_nb_left(cm, xd, mi_row, INT_MAX, increment_int_ptr,
+ &mbmi->overlappable_neighbors[1]);
+}
+
+// HW does not support < 4x4 prediction. To limit the bandwidth requirement,
+// if the block size of the current plane is smaller than 8x8, only blend with
+// the left neighbor(s) (skip blending with the above side).
+#define DISABLE_CHROMA_U8X8_OBMC 0 // 0: one-sided obmc; 1: disable
+
+int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd, int dir) {
+ assert(is_motion_variation_allowed_bsize(bsize));
+
+ const BLOCK_SIZE bsize_plane =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ switch (bsize_plane) {
+#if DISABLE_CHROMA_U8X8_OBMC
+ case BLOCK_4X4:
+ case BLOCK_8X4:
+ case BLOCK_4X8: return 1; break;
+#else
+ case BLOCK_4X4:
+ case BLOCK_8X4:
+ case BLOCK_4X8: return dir == 0; break;
+#endif
+ default: return 0;
+ }
+}
+
+void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+
+ return;
+}
+
+struct obmc_inter_pred_ctxt {
+ uint8_t **adjacent;
+ int *adjacent_stride;
+};
+
+static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
+ uint8_t above_mi_width,
+ MB_MODE_INFO *above_mi,
+ void *fun_ctxt,
+ const int num_planes) {
+ (void)above_mi;
+ struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int overlap =
+ AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+ const int bh = overlap >> pd->subsampling_y;
+ const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+
+ const int dst_stride = pd->dst.stride;
+ uint8_t *const dst = &pd->dst.buf[plane_col];
+ const int tmp_stride = ctxt->adjacent_stride[plane];
+ const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col];
+ const uint8_t *const mask = av1_get_obmc_mask(bh);
+
+ if (is_hbd)
+ aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bw, bh, xd->bd);
+ else
+ aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+ mask, bw, bh);
+ }
+}
+
+static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
+ uint8_t left_mi_height,
+ MB_MODE_INFO *left_mi,
+ void *fun_ctxt,
+ const int num_planes) {
+ (void)left_mi;
+ struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ const int overlap =
+ AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = overlap >> pd->subsampling_x;
+ const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y;
+ const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+
+ const int dst_stride = pd->dst.stride;
+ uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride];
+ const int tmp_stride = ctxt->adjacent_stride[plane];
+ const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride];
+ const uint8_t *const mask = av1_get_obmc_mask(bw);
+
+ if (is_hbd)
+ aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bw, bh, xd->bd);
+ else
+ aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+ mask, bw, bh);
+ }
+}
+
+// This function combines motion compensated predictions that are generated by
+// top/left neighboring blocks' inter predictors with the regular inter
+// prediction. We assume the original prediction (bmc) is stored in
+// xd->plane[].dst.buf
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *above[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
+ uint8_t *left[MAX_MB_PLANE],
+ int left_stride[MAX_MB_PLANE]) {
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ // handle above row
+ struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
+ foreach_overlappable_nb_above(cm, xd, mi_col,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ build_obmc_inter_pred_above, &ctxt_above);
+
+ // handle left column
+ struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride };
+ foreach_overlappable_nb_left(cm, xd, mi_row,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ build_obmc_inter_pred_left, &ctxt_left);
+}
+
+void av1_setup_build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
+ const int num_planes) {
+ const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
+ const int above_mi_col = ctxt->mi_col + rel_mi_col;
+
+ av1_modify_neighbor_predictor_for_obmc(above_mbmi);
+
+ for (int j = 0; j < num_planes; ++j) {
+ struct macroblockd_plane *const pd = &xd->plane[j];
+ setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+ ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col,
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+
+ const int num_refs = 1 + has_second_ref(above_mbmi);
+
+ for (int ref = 0; ref < num_refs; ++ref) {
+ const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+
+ const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col,
+ &ref_buf->sf, num_planes);
+ }
+
+ xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
+ xd->mb_to_right_edge = ctxt->mb_to_far_edge +
+ (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
+}
+
+void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
+ uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes) {
+ const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
+ const int left_mi_row = ctxt->mi_row + rel_mi_row;
+
+ av1_modify_neighbor_predictor_for_obmc(left_mbmi);
+
+ for (int j = 0; j < num_planes; ++j) {
+ struct macroblockd_plane *const pd = &xd->plane[j];
+ setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+ ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0,
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+
+ const int num_refs = 1 + has_second_ref(left_mbmi);
+
+ for (int ref = 0; ref < num_refs; ++ref) {
+ const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
+
+ const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col,
+ &ref_buf->sf, num_planes);
+ }
+
+ xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
+ xd->mb_to_bottom_edge =
+ ctxt->mb_to_far_edge +
+ (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
+}
+
+/* clang-format off */
+static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
+ 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
+ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4,
+ 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+static uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {
+ 32, 16, 16, 16, 8, 8, 8, 4,
+ 4, 4, 2, 2, 2, 1, 1, 1,
+ 8, 8, 4, 4, 2, 2
+};
+/* clang-format on */
+
+static void build_smooth_interintra_mask(uint8_t *mask, int stride,
+ BLOCK_SIZE plane_bsize,
+ INTERINTRA_MODE mode) {
+ int i, j;
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const int size_scale = ii_size_scales[plane_bsize];
+
+ switch (mode) {
+ case II_V_PRED:
+ for (i = 0; i < bh; ++i) {
+ memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
+ mask += stride;
+ }
+ break;
+
+ case II_H_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
+ mask += stride;
+ }
+ break;
+
+ case II_SMOOTH_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j)
+ mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
+ mask += stride;
+ }
+ break;
+
+ case II_DC_PRED:
+ default:
+ for (i = 0; i < bh; ++i) {
+ memset(mask, 32, bw * sizeof(mask[0]));
+ mask += stride;
+ }
+ break;
+ }
+}
+
+static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred, int compstride,
+ const uint8_t *interpred, int interstride,
+ const uint8_t *intrapred, int intrastride) {
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ if (use_wedge_interintra) {
+ if (is_interintra_wedge_used(bsize)) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ const int subw = 2 * mi_size_wide[bsize] == bw;
+ const int subh = 2 * mi_size_high[bsize] == bh;
+ aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
+ interpred, interstride, mask, block_size_wide[bsize],
+ bw, bh, subw, subh);
+ }
+ return;
+ }
+
+ uint8_t mask[MAX_SB_SQUARE];
+ build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
+ aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
+ interstride, mask, bw, bw, bh, 0, 0);
+}
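+// Note on the smooth (non-wedge) path above (assuming the usual a64 blend
+// convention that the mask weights the first source, here the intra
+// prediction): with II_V_PRED the ii_weights1d values start near 60/64 at the
+// top edge and decay downwards, so rows close to the above reference stay
+// mostly intra while lower rows become mostly inter.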
+
+static void combine_interintra_highbd(
+ INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
+ int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred8, int compstride, const uint8_t *interpred8,
+ int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ if (use_wedge_interintra) {
+ if (is_interintra_wedge_used(bsize)) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ const int subh = 2 * mi_size_high[bsize] == bh;
+ const int subw = 2 * mi_size_wide[bsize] == bw;
+ aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+ interpred8, interstride, mask,
+ block_size_wide[bsize], bw, bh, subw, subh, bd);
+ }
+ return;
+ }
+
+ uint8_t mask[MAX_SB_SQUARE];
+ build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
+ aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+ interpred8, interstride, mask, bw, bw, bh, 0, 0,
+ bd);
+}
+
+void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
+ MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane,
+ BUFFER_SET *ctx, uint8_t *dst,
+ int dst_stride) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ssx = xd->plane[plane].subsampling_x;
+ const int ssy = xd->plane[plane].subsampling_y;
+ BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+ PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
+ assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0);
+ assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
+ assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
+ assert(xd->mi[0]->use_intrabc == 0);
+
+ av1_predict_intra_block(cm, xd, pd->width, pd->height,
+ max_txsize_rect_lookup[plane_bsize], mode, 0, 0,
+ FILTER_INTRA_MODES, ctx->plane[plane],
+ ctx->stride[plane], dst, dst_stride, 0, 0, plane);
+}
+
+void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride) {
+ const int ssx = xd->plane[plane].subsampling_x;
+ const int ssy = xd->plane[plane].subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ combine_interintra_highbd(
+ xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
+ xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
+ bsize, plane_bsize, xd->plane[plane].dst.buf,
+ xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred,
+ intra_stride, xd->bd);
+ return;
+ }
+ combine_interintra(
+ xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
+ xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
+ bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+ inter_pred, inter_stride, intra_pred, intra_stride);
+}
+
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
+ BUFFER_SET *ctx, int plane,
+ BLOCK_SIZE bsize) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(
+ cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
+ MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, pred, stride,
+ CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
+ intrapredictor, MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
+ MAX_SB_SIZE);
+ }
+}
+
+void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *upred, uint8_t *vpred,
+ int ustride, int vstride,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+ av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
+}
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
new file mode 100644
index 000000000..db86c777e
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RECONINTER_H_
+#define AOM_AV1_COMMON_RECONINTER_H_
+
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+#include "aom/aom_integer.h"
+
+// Work out how many pixels off the edge of a reference frame we're allowed
+// to go when forming an inter prediction.
+// The outermost row/col of each reference frame is extended by
+// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep
+// at least AOM_INTERP_EXTEND pixels within that to account for filtering.
+//
+// We have to break this up into two macros to keep both clang-format and
+// tools/lint-hunks.py happy.
+#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \
+ ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND)
+#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \
+ (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set to (1 << 5) if the 32-ary codebooks are used for any block size
+#define MAX_WEDGE_TYPES (1 << 4)
+
+#define MAX_WEDGE_SIZE_LOG2 5 // 32x32
+#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
+#define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE)
+
+#define WEDGE_WEIGHT_BITS 6
+
+#define WEDGE_NONE -1
+
+// Angles are measured anti-clockwise with respect to horizontal
+typedef enum {
+ WEDGE_HORIZONTAL = 0,
+ WEDGE_VERTICAL = 1,
+ WEDGE_OBLIQUE27 = 2,
+ WEDGE_OBLIQUE63 = 3,
+ WEDGE_OBLIQUE117 = 4,
+ WEDGE_OBLIQUE153 = 5,
+ WEDGE_DIRECTIONS
+} WedgeDirectionType;
+
+// 3-tuple: {direction, x_offset, y_offset}
+typedef struct {
+ WedgeDirectionType direction;
+ int x_offset;
+ int y_offset;
+} wedge_code_type;
+
+typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES];
+
+typedef struct {
+ int bits;
+ const wedge_code_type *codebook;
+ uint8_t *signflip;
+ wedge_masks_type *masks;
+} wedge_params_type;
+
+extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL];
+
+typedef struct SubpelParams {
+ int xs;
+ int ys;
+ int subpel_x;
+ int subpel_y;
+} SubpelParams;
+
+struct build_prediction_ctxt {
+ const AV1_COMMON *cm;
+ int mi_row;
+ int mi_col;
+ uint8_t **tmp_buf;
+ int *tmp_width;
+ int *tmp_height;
+ int *tmp_stride;
+ int mb_to_far_edge;
+};
+
+static INLINE int has_scale(int xs, int ys) {
+ return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
+}
+
+static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
+ sp->subpel_x >>= SCALE_EXTRA_BITS;
+ sp->subpel_y >>= SCALE_EXTRA_BITS;
+ sp->xs >>= SCALE_EXTRA_BITS;
+ sp->ys >>= SCALE_EXTRA_BITS;
+ assert(sp->subpel_x < SUBPEL_SHIFTS);
+ assert(sp->subpel_y < SUBPEL_SHIFTS);
+ assert(sp->xs <= SUBPEL_SHIFTS);
+ assert(sp->ys <= SUBPEL_SHIFTS);
+}
+
+static INLINE void inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ int is_intrabc) {
+ assert(conv_params->do_average == 0 || conv_params->do_average == 1);
+ assert(sf);
+ const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+ assert(IMPLIES(is_intrabc, !is_scaled));
+ if (is_scaled) {
+ av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+ interp_filters, subpel_params->subpel_x,
+ subpel_params->xs, subpel_params->subpel_y,
+ subpel_params->ys, 1, conv_params, sf, is_intrabc);
+ } else {
+ SubpelParams sp = *subpel_params;
+ revert_scale_extra_bits(&sp);
+ av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+ interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
+ sp.ys, 0, conv_params, sf, is_intrabc);
+ }
+}
+
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ int is_intrabc, int bd) {
+ assert(conv_params->do_average == 0 || conv_params->do_average == 1);
+ assert(sf);
+ const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+ assert(IMPLIES(is_intrabc, !is_scaled));
+ if (is_scaled) {
+ av1_highbd_convolve_2d_facade(
+ src, src_stride, dst, dst_stride, w, h, interp_filters,
+ subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y,
+ subpel_params->ys, 1, conv_params, sf, is_intrabc, bd);
+ } else {
+ SubpelParams sp = *subpel_params;
+ revert_scale_extra_bits(&sp);
+ av1_highbd_convolve_2d_facade(
+ src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x,
+ sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd);
+ }
+}
+
+void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi);
+int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd, int dir);
+
+static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
+ BLOCK_SIZE sb_type) {
+ const int comp_allowed = is_comp_ref_allowed(sb_type);
+ switch (type) {
+ case COMPOUND_AVERAGE:
+ case COMPOUND_DIFFWTD: return comp_allowed;
+ case COMPOUND_WEDGE:
+ return comp_allowed && wedge_params_lookup[sb_type].bits > 0;
+ default: assert(0); return 0;
+ }
+}
+
+static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
+ COMPOUND_TYPE comp_type;
+ int i;
+ if (!is_comp_ref_allowed(sb_type)) return 0;
+ for (i = 0; i < COMPOUND_TYPES; i++) {
+ comp_type = (COMPOUND_TYPE)i;
+ if (is_masked_compound_type(comp_type) &&
+ is_interinter_compound_used(comp_type, sb_type))
+ return 1;
+ }
+ return 0;
+}
+
+static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits;
+}
+
+static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
+ const int wbits = wedge_params_lookup[sb_type].bits;
+ return (wbits > 0) ? wbits + 1 : 0;
+}
+
+static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits > 0;
+}
+
+static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits;
+}
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ const MB_MODE_INFO *mi, int build_for_obmc,
+ const MACROBLOCKD *xd, int can_use_previous);
+
+void av1_make_masked_inter_predictor(
+ const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
+ const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
+ MACROBLOCKD *xd, int can_use_previous);
+
+// TODO(jkoleszar): yet another mv clamping function :-(
+static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
+ const MV *src_mv, int bw, int bh,
+ int ss_x, int ss_y) {
+ // If the MV points so far into the UMV border that no visible pixels
+ // are used for reconstruction, the subpel part of the MV can be
+ // discarded and the MV limited to 16 pixels with equivalent results.
+ const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
+ const int spel_right = spel_left - SUBPEL_SHIFTS;
+ const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
+ const int spel_bottom = spel_top - SUBPEL_SHIFTS;
+ MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
+ (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
+ assert(ss_x <= 1);
+ assert(ss_y <= 1);
+
+ clamp_mv(&clamped_mv, xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+ xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+ xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+ xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
+
+ return clamped_mv;
+}
+
+static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+ const struct scale_factors *sf) {
+ const int x =
+ sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset;
+ const int y =
+ sf ? sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset;
+ return y * stride + x;
+}
+
+static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
+ uint8_t *src, int width, int height,
+ int stride, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ int subsampling_x, int subsampling_y) {
+ // Offset the buffer pointer
+ if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
+ mi_row -= 1;
+ if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
+ mi_col -= 1;
+
+ const int x = (MI_SIZE * mi_col) >> subsampling_x;
+ const int y = (MI_SIZE * mi_row) >> subsampling_y;
+ dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
+ dst->buf0 = src;
+ dst->width = width;
+ dst->height = height;
+ dst->stride = stride;
+}
+
+void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const int plane_start, const int plane_end);
+
+void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *sf, const int num_planes);
+
+static INLINE void set_default_interp_filters(
+ MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter));
+}
+
+static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ if (mbmi->skip_mode) return 0;
+ if (mbmi->motion_mode == WARPED_CAUSAL) return 0;
+ if (is_nontrans_global_motion(xd, xd->mi[0])) return 0;
+ return 1;
+}
+
+void av1_setup_build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
+ const int num_planes);
+void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
+ uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes);
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *above[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
+ uint8_t *left[MAX_MB_PLANE],
+ int left_stride[MAX_MB_PLANE]);
+
+const uint8_t *av1_get_obmc_mask(int length);
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+
+#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
+#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
+
+void av1_init_wedge_masks();
+
+static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
+ int wedge_sign,
+ BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
+}
+
+const uint8_t *av1_get_compound_type_mask(
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
+
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
+ BUFFER_SET *ctx, int plane,
+ BLOCK_SIZE bsize);
+
+void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *upred, uint8_t *vpred,
+ int ustride, int vstride,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize);
+
+void av1_build_intra_predictors_for_interintra(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+
+void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride);
+
+void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
+ int order_idx, int *fwd_offset, int *bck_offset,
+ int *use_jnt_comp_avg, int is_compound);
+int av1_allow_warp(const MB_MODE_INFO *const mbmi,
+ const WarpTypesAllowed *const warp_types,
+ const WarpedMotionParams *const gm_params,
+ int build_for_obmc, int x_scale, int y_scale,
+ WarpedMotionParams *final_warp_params);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RECONINTER_H_
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
new file mode 100644
index 000000000..71a52e73e
--- /dev/null
+++ b/third_party/aom/av1/common/reconintra.c
@@ -0,0 +1,1640 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_once.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/cfl.h"
+
+enum {
+ NEED_LEFT = 1 << 1,
+ NEED_ABOVE = 1 << 2,
+ NEED_ABOVERIGHT = 1 << 3,
+ NEED_ABOVELEFT = 1 << 4,
+ NEED_BOTTOMLEFT = 1 << 5,
+};
+
+#define INTRA_EDGE_FILT 3
+#define INTRA_EDGE_TAPS 5
+#define MAX_UPSAMPLE_SZ 16
+
+static const uint8_t extend_modes[INTRA_MODES] = {
+ NEED_ABOVE | NEED_LEFT, // DC
+ NEED_ABOVE, // V
+ NEED_LEFT, // H
+ NEED_ABOVE | NEED_ABOVERIGHT, // D45
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D113
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D157
+ NEED_LEFT | NEED_BOTTOMLEFT, // D203
+ NEED_ABOVE | NEED_ABOVERIGHT, // D67
+ NEED_LEFT | NEED_ABOVE, // SMOOTH
+ NEED_LEFT | NEED_ABOVE, // SMOOTH_V
+ NEED_LEFT | NEED_ABOVE, // SMOOTH_H
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // PAETH
+};
+
+// Tables that store whether the top-right reference pixels are available.
+// The flags are packed into 8-bit integers, one bit per block. E.g., for the
+// 32x32 blocks in a 128x128 superblock, the index of the "o" block below is
+// 10 (in raster order), so its flag is stored at the 3rd bit of the 2nd entry
+// in the table, i.e. (table[10 / 8] >> (10 % 8)) & 1.
+// . . . .
+// . . . .
+// . . o .
+// . . . .
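+// Worked example using has_tr_32x32 below (the table for 32x32 blocks in a
+// 128x128 superblock): has_tr_32x32[10 / 8] = has_tr_32x32[1] = 87 =
+// 0b1010111, and (87 >> (10 % 8)) & 1 = (87 >> 2) & 1 = 1, so the "o" block
+// does have its top-right reference pixels available.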
+static uint8_t has_tr_4x4[128] = {
+ 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+};
+static uint8_t has_tr_4x8[64] = {
+ 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119,
+ 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127,
+ 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119,
+ 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127,
+ 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119,
+};
+static uint8_t has_tr_8x4[64] = {
+ 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+};
+static uint8_t has_tr_8x8[32] = {
+ 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
+ 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
+};
+static uint8_t has_tr_8x16[16] = {
+ 255, 255, 119, 119, 127, 127, 119, 119,
+ 255, 127, 119, 119, 127, 127, 119, 119,
+};
+static uint8_t has_tr_16x8[16] = {
+ 255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0,
+};
+static uint8_t has_tr_16x16[8] = {
+ 255, 85, 119, 85, 127, 85, 119, 85,
+};
+static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 };
+static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 };
+static uint8_t has_tr_32x32[2] = { 95, 87 };
+static uint8_t has_tr_32x64[1] = { 127 };
+static uint8_t has_tr_64x32[1] = { 19 };
+static uint8_t has_tr_64x64[1] = { 7 };
+static uint8_t has_tr_64x128[1] = { 3 };
+static uint8_t has_tr_128x64[1] = { 1 };
+static uint8_t has_tr_128x128[1] = { 1 };
+static uint8_t has_tr_4x16[32] = {
+ 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255,
+ 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127,
+ 127, 127, 255, 127, 255, 127, 127, 127, 127, 127,
+};
+static uint8_t has_tr_16x4[32] = {
+ 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0,
+ 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0,
+};
+static uint8_t has_tr_8x32[8] = {
+ 255, 255, 127, 127, 255, 127, 127, 127,
+};
+static uint8_t has_tr_32x8[8] = {
+ 15, 0, 5, 0, 7, 0, 5, 0,
+};
+static uint8_t has_tr_16x64[2] = { 255, 127 };
+static uint8_t has_tr_64x16[2] = { 3, 1 };
+
+static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = {
+ // 4X4
+ has_tr_4x4,
+ // 4X8, 8X4, 8X8
+ has_tr_4x8, has_tr_8x4, has_tr_8x8,
+ // 8X16, 16X8, 16X16
+ has_tr_8x16, has_tr_16x8, has_tr_16x16,
+ // 16X32, 32X16, 32X32
+ has_tr_16x32, has_tr_32x16, has_tr_32x32,
+ // 32X64, 64X32, 64X64
+ has_tr_32x64, has_tr_64x32, has_tr_64x64,
+ // 64x128, 128x64, 128x128
+ has_tr_64x128, has_tr_128x64, has_tr_128x128,
+ // 4x16, 16x4, 8x32
+ has_tr_4x16, has_tr_16x4, has_tr_8x32,
+ // 32x8, 16x64, 64x16
+ has_tr_32x8, has_tr_16x64, has_tr_64x16
+};
+
+static uint8_t has_tr_vert_8x8[32] = {
+ 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
+ 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
+};
+static uint8_t has_tr_vert_16x16[8] = {
+ 255, 0, 119, 0, 127, 0, 119, 0,
+};
+static uint8_t has_tr_vert_32x32[2] = { 15, 7 };
+static uint8_t has_tr_vert_64x64[1] = { 3 };
+
+// The _vert_* tables are like the ordinary tables above, but describe the
+// order we visit square blocks when doing a PARTITION_VERT_A or
+// PARTITION_VERT_B. This is the same order as normal except on the last
+// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
+// as a pair of squares, which means that these tables work correctly for both
+// mixed vertical partition types.
+//
+// There are tables for each of the square sizes. Vertical rectangles (like
+// BLOCK_16X32) use their respective "non-vert" table
+static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = {
+ // 4X4
+ NULL,
+ // 4X8, 8X4, 8X8
+ has_tr_4x8, NULL, has_tr_vert_8x8,
+ // 8X16, 16X8, 16X16
+ has_tr_8x16, NULL, has_tr_vert_16x16,
+ // 16X32, 32X16, 32X32
+ has_tr_16x32, NULL, has_tr_vert_32x32,
+ // 32X64, 64X32, 64X64
+ has_tr_32x64, NULL, has_tr_vert_64x64,
+ // 64x128, 128x64, 128x128
+ has_tr_64x128, NULL, has_tr_128x128
+};
+
+static const uint8_t *get_has_tr_table(PARTITION_TYPE partition,
+ BLOCK_SIZE bsize) {
+ const uint8_t *ret = NULL;
+ // If this is a mixed vertical partition, look up bsize in orders_vert.
+ if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
+ assert(bsize < BLOCK_SIZES);
+ ret = has_tr_vert_tables[bsize];
+ } else {
+ ret = has_tr_tables[bsize];
+ }
+ assert(ret);
+ return ret;
+}
+
+static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int top_available, int right_available,
+ PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
+ int col_off, int ss_x, int ss_y) {
+ if (!top_available || !right_available) return 0;
+
+ const int bw_unit = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1);
+ const int top_right_count_unit = tx_size_wide_unit[txsz];
+
+ if (row_off > 0) { // Just need to check if there are enough pixels on the right.
+ if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) {
+ // Special case: For 128x128 blocks, the transform unit whose
+ // top-right corner is at the center of the block does in fact have
+ // pixels available at its top-right corner.
+ if (row_off == mi_size_high[BLOCK_64X64] >> ss_y &&
+ col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) {
+ return 1;
+ }
+ const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
+ const int col_off_64 = col_off % plane_bw_unit_64;
+ return col_off_64 + top_right_count_unit < plane_bw_unit_64;
+ }
+ return col_off + top_right_count_unit < plane_bw_unit;
+ } else {
+ // All top-right pixels are in the block above, which is already available.
+ if (col_off + top_right_count_unit < plane_bw_unit) return 1;
+
+ const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
+ const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+ const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+ const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
+ const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
+
+ // Top row of superblock: so top-right pixels are in the top and/or
+ // top-right superblocks, both of which are already available.
+ if (blk_row_in_sb == 0) return 1;
+
+ // Rightmost column of superblock (and not the top row): so top-right pixels
+ // fall in the right superblock, which is not available yet.
+ if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) {
+ return 0;
+ }
+
+ // General case (neither top row nor rightmost column): check if the
+ // top-right block is coded before the current block.
+ const int this_blk_index =
+ ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+ blk_col_in_sb + 0;
+ const int idx1 = this_blk_index / 8;
+ const int idx2 = this_blk_index % 8;
+ const uint8_t *has_tr_table = get_has_tr_table(partition, bsize);
+ return (has_tr_table[idx1] >> idx2) & 1;
+ }
+}
+
+// Similar to the has_tr_* tables, but these store whether the bottom-left
+// reference pixels are available.
+static uint8_t has_bl_4x4[128] = {
+ 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85,
+ 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17,
+ 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84,
+ 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85,
+ 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1,
+ 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85,
+ 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0,
+};
+static uint8_t has_bl_4x8[64] = {
+ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
+ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
+ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
+ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
+};
+static uint8_t has_bl_8x4[64] = {
+ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
+ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
+ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
+ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
+};
+static uint8_t has_bl_8x8[32] = {
+ 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
+ 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
+};
+static uint8_t has_bl_8x16[16] = {
+ 16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0,
+};
+static uint8_t has_bl_16x8[16] = {
+ 254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0,
+};
+static uint8_t has_bl_16x16[8] = {
+ 84, 16, 84, 0, 84, 16, 84, 0,
+};
+static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 };
+static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 };
+static uint8_t has_bl_32x32[2] = { 4, 4 };
+static uint8_t has_bl_32x64[1] = { 0 };
+static uint8_t has_bl_64x32[1] = { 34 };
+static uint8_t has_bl_64x64[1] = { 0 };
+static uint8_t has_bl_64x128[1] = { 0 };
+static uint8_t has_bl_128x64[1] = { 0 };
+static uint8_t has_bl_128x128[1] = { 0 };
+static uint8_t has_bl_4x16[32] = {
+ 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
+ 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
+};
+static uint8_t has_bl_16x4[32] = {
+ 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
+ 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
+};
+static uint8_t has_bl_8x32[8] = {
+ 0, 1, 0, 0, 0, 1, 0, 0,
+};
+static uint8_t has_bl_32x8[8] = {
+ 238, 78, 238, 14, 238, 78, 238, 14,
+};
+static uint8_t has_bl_16x64[2] = { 0, 0 };
+static uint8_t has_bl_64x16[2] = { 42, 42 };
+
+static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = {
+ // 4X4
+ has_bl_4x4,
+ // 4X8, 8X4, 8X8
+ has_bl_4x8, has_bl_8x4, has_bl_8x8,
+ // 8X16, 16X8, 16X16
+ has_bl_8x16, has_bl_16x8, has_bl_16x16,
+ // 16X32, 32X16, 32X32
+ has_bl_16x32, has_bl_32x16, has_bl_32x32,
+ // 32X64, 64X32, 64X64
+ has_bl_32x64, has_bl_64x32, has_bl_64x64,
+ // 64x128, 128x64, 128x128
+ has_bl_64x128, has_bl_128x64, has_bl_128x128,
+ // 4x16, 16x4, 8x32
+ has_bl_4x16, has_bl_16x4, has_bl_8x32,
+ // 32x8, 16x64, 64x16
+ has_bl_32x8, has_bl_16x64, has_bl_64x16
+};
+
+static uint8_t has_bl_vert_8x8[32] = {
+ 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
+ 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
+};
+static uint8_t has_bl_vert_16x16[8] = {
+ 254, 16, 254, 0, 254, 16, 254, 0,
+};
+static uint8_t has_bl_vert_32x32[2] = { 14, 14 };
+static uint8_t has_bl_vert_64x64[1] = { 2 };
+
+// The _vert_* tables are like the ordinary tables above, but describe the
+// order we visit square blocks when doing a PARTITION_VERT_A or
+// PARTITION_VERT_B. This is the same order as normal except on the last
+// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
+// as a pair of squares, which means that these tables work correctly for both
+// mixed vertical partition types.
+//
+// There are tables for each of the square sizes. Vertical rectangles (like
+// BLOCK_16X32) use their respective "non-vert" table
+static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = {
+ // 4X4
+ NULL,
+ // 4X8, 8X4, 8X8
+ has_bl_4x8, NULL, has_bl_vert_8x8,
+ // 8X16, 16X8, 16X16
+ has_bl_8x16, NULL, has_bl_vert_16x16,
+ // 16X32, 32X16, 32X32
+ has_bl_16x32, NULL, has_bl_vert_32x32,
+ // 32X64, 64X32, 64X64
+ has_bl_32x64, NULL, has_bl_vert_64x64,
+ // 64x128, 128x64, 128x128
+ has_bl_64x128, NULL, has_bl_128x128
+};
+
+static const uint8_t *get_has_bl_table(PARTITION_TYPE partition,
+ BLOCK_SIZE bsize) {
+ const uint8_t *ret = NULL;
+ // If this is a mixed vertical partition, look up bsize in orders_vert.
+ if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
+ assert(bsize < BLOCK_SIZES);
+ ret = has_bl_vert_tables[bsize];
+ } else {
+ ret = has_bl_tables[bsize];
+ }
+ assert(ret);
+ return ret;
+}
+
+static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int bottom_available, int left_available,
+ PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
+ int col_off, int ss_x, int ss_y) {
+ if (!bottom_available || !left_available) return 0;
+
+ // Special case for 128x* blocks, when col_off is half the block width.
+ // This is needed because 128x* superblocks are divided into 64x* blocks in
+ // raster order
+ if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) {
+ const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x;
+ const int col_off_64 = col_off % plane_bw_unit_64;
+ if (col_off_64 == 0) {
+ // We are at the left edge of top-right or bottom-right 64x* block.
+ const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y;
+ const int row_off_64 = row_off % plane_bh_unit_64;
+ const int plane_bh_unit =
+ AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64);
+ // Check if all bottom-left pixels are in the left 64x* block (which is
+ // already coded).
+ return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit;
+ }
+ }
+
+ if (col_off > 0) {
+ // Bottom-left pixels are in the bottom-left block, which is not available.
+ return 0;
+ } else {
+ const int bh_unit = block_size_high[bsize] >> tx_size_high_log2[0];
+ const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1);
+ const int bottom_left_count_unit = tx_size_high_unit[txsz];
+
+ // All bottom-left pixels are in the left block, which is already available.
+ if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
+
+ const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
+ const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+ const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+ const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
+ const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
+
+ // Leftmost column of superblock: so bottom-left pixels may be in the left
+ // and/or bottom-left superblocks. But only the left superblock is
+ // available, so check if all required pixels fall in that superblock.
+ if (blk_col_in_sb == 0) {
+ const int blk_start_row_off = blk_row_in_sb
+ << (bh_in_mi_log2 + MI_SIZE_LOG2 -
+ tx_size_wide_log2[0]) >>
+ ss_y;
+ const int row_off_in_sb = blk_start_row_off + row_off;
+ const int sb_height_unit = sb_mi_size >> ss_y;
+ return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
+ }
+
+ // Bottom row of superblock (and not the leftmost column): so bottom-left
+ // pixels fall in the bottom superblock, which is not available yet.
+ if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0;
+
+ // General case (neither leftmost column nor bottom row): check if the
+ // bottom-left block is coded before the current block.
+ const int this_blk_index =
+ ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+ blk_col_in_sb + 0;
+ const int idx1 = this_blk_index / 8;
+ const int idx2 = this_blk_index % 8;
+ const uint8_t *has_bl_table = get_has_bl_table(partition, bsize);
+ return (has_bl_table[idx1] >> idx2) & 1;
+ }
+}
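+// Note that, because blocks are coded in a top-to-bottom, left-to-right
+// recursive order, the bottom-left neighbor of a block is usually coded after
+// it; this is why the has_bl_* tables above are dominated by zero bits
+// compared with the has_tr_* tables.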
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];
+static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];
+
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL];
+static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL];
+
+static void init_intra_predictors_internal(void) {
+ assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
+
+#define INIT_RECTANGULAR(p, type) \
+ p[TX_4X8] = aom_##type##_predictor_4x8; \
+ p[TX_8X4] = aom_##type##_predictor_8x4; \
+ p[TX_8X16] = aom_##type##_predictor_8x16; \
+ p[TX_16X8] = aom_##type##_predictor_16x8; \
+ p[TX_16X32] = aom_##type##_predictor_16x32; \
+ p[TX_32X16] = aom_##type##_predictor_32x16; \
+ p[TX_32X64] = aom_##type##_predictor_32x64; \
+ p[TX_64X32] = aom_##type##_predictor_64x32; \
+ p[TX_4X16] = aom_##type##_predictor_4x16; \
+ p[TX_16X4] = aom_##type##_predictor_16x4; \
+ p[TX_8X32] = aom_##type##_predictor_8x32; \
+ p[TX_32X8] = aom_##type##_predictor_32x8; \
+ p[TX_16X64] = aom_##type##_predictor_16x64; \
+ p[TX_64X16] = aom_##type##_predictor_64x16;
+
+#define INIT_NO_4X4(p, type) \
+ p[TX_8X8] = aom_##type##_predictor_8x8; \
+ p[TX_16X16] = aom_##type##_predictor_16x16; \
+ p[TX_32X32] = aom_##type##_predictor_32x32; \
+ p[TX_64X64] = aom_##type##_predictor_64x64; \
+ INIT_RECTANGULAR(p, type)
+
+#define INIT_ALL_SIZES(p, type) \
+ p[TX_4X4] = aom_##type##_predictor_4x4; \
+ INIT_NO_4X4(p, type)
+
+ INIT_ALL_SIZES(pred[V_PRED], v);
+ INIT_ALL_SIZES(pred[H_PRED], h);
+ INIT_ALL_SIZES(pred[PAETH_PRED], paeth);
+ INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth);
+ INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v);
+ INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h);
+ INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+ INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+ INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+ INIT_ALL_SIZES(dc_pred[1][1], dc);
+
+ INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
+ INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
+ INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth);
+ INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth);
+ INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v);
+ INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h);
+ INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
+ INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
+ INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
+ INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+#undef INIT_ALL_SIZES
+#undef INIT_NO_4X4
+#undef INIT_RECTANGULAR
+}
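+// Note on the tables initialized above: dc_pred[0][0] is the "128" fallback
+// used when neither reference edge is available, dc_pred[0][1] and
+// dc_pred[1][0] use only the above or left edge, and dc_pred[1][1] is the
+// full DC predictor. They are indexed as dc_pred[n_left_px > 0][n_top_px > 0]
+// in build_intra_predictors() below (and likewise for dc_pred_high).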
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ int r, c, x, base, shift, val;
+
+ (void)left;
+ (void)dy;
+ assert(dy == 1);
+ assert(dx > 0);
+
+ const int max_base_x = ((bw + bh) - 1) << upsample_above;
+ const int frac_bits = 6 - upsample_above;
+ const int base_inc = 1 << upsample_above;
+ x = dx;
+ for (r = 0; r < bh; ++r, dst += stride, x += dx) {
+ base = x >> frac_bits;
+ shift = ((x << upsample_above) & 0x3F) >> 1;
+
+ if (base >= max_base_x) {
+ for (int i = r; i < bh; ++i) {
+ memset(dst, above[max_base_x], bw * sizeof(dst[0]));
+ dst += stride;
+ }
+ return;
+ }
+
+ for (c = 0; c < bw; ++c, base += base_inc) {
+ if (base < max_base_x) {
+ val = above[base] * (32 - shift) + above[base + 1] * shift;
+ dst[c] = ROUND_POWER_OF_TWO(val, 5);
+ } else {
+ dst[c] = above[max_base_x];
+ }
+ }
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ int r, c, x, y, shift1, shift2, val, base1, base2;
+
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int min_base_x = -(1 << upsample_above);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int base_inc_x = 1 << upsample_above;
+ x = -dx;
+ for (r = 0; r < bh; ++r, x -= dx, dst += stride) {
+ base1 = x >> frac_bits_x;
+ y = (r << 6) - dy;
+ for (c = 0; c < bw; ++c, base1 += base_inc_x, y -= dy) {
+ if (base1 >= min_base_x) {
+ shift1 = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+ val = above[base1] * (32 - shift1) + above[base1 + 1] * shift1;
+ val = ROUND_POWER_OF_TWO(val, 5);
+ } else {
+ base2 = y >> frac_bits_y;
+ assert(base2 >= -(1 << upsample_left));
+ shift2 = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+ val = left[base2] * (32 - shift2) + left[base2 + 1] * shift2;
+ val = ROUND_POWER_OF_TWO(val, 5);
+ }
+ dst[c] = val;
+ }
+ }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy) {
+ int r, c, y, base, shift, val;
+
+ (void)above;
+ (void)dx;
+
+ assert(dx == 1);
+ assert(dy > 0);
+
+ const int max_base_y = (bw + bh - 1) << upsample_left;
+ const int frac_bits = 6 - upsample_left;
+ const int base_inc = 1 << upsample_left;
+ y = dy;
+ for (c = 0; c < bw; ++c, y += dy) {
+ base = y >> frac_bits;
+ shift = ((y << upsample_left) & 0x3F) >> 1;
+
+ for (r = 0; r < bh; ++r, base += base_inc) {
+ if (base < max_base_y) {
+ val = left[base] * (32 - shift) + left[base + 1] * shift;
+ dst[r * stride + c] = val = ROUND_POWER_OF_TWO(val, 5);
+ } else {
+ for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y];
+ break;
+ }
+ }
+ }
+}
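+// Note: unlike zone 1, zone 3 walks the block column by column because all of
+// its reference pixels come from the left edge; once `base` runs past
+// max_base_y, the remainder of the column is filled with the last left sample.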
+
+static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int angle) {
+ const int dx = av1_get_dx(angle);
+ const int dy = av1_get_dy(angle);
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ assert(angle > 0 && angle < 270);
+
+ if (angle > 0 && angle < 90) {
+ av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx,
+ dy);
+ } else if (angle > 90 && angle < 180) {
+ av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above,
+ upsample_left, dx, dy);
+ } else if (angle > 180 && angle < 270) {
+ av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx,
+ dy);
+ } else if (angle == 90) {
+ pred[V_PRED][tx_size](dst, stride, above, left);
+ } else if (angle == 180) {
+ pred[H_PRED][tx_size](dst, stride, above, left);
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int dx, int dy, int bd) {
+ int r, c, x, base, shift, val;
+
+ (void)left;
+ (void)dy;
+ (void)bd;
+ assert(dy == 1);
+ assert(dx > 0);
+
+ const int max_base_x = ((bw + bh) - 1) << upsample_above;
+ const int frac_bits = 6 - upsample_above;
+ const int base_inc = 1 << upsample_above;
+ x = dx;
+ for (r = 0; r < bh; ++r, dst += stride, x += dx) {
+ base = x >> frac_bits;
+ shift = ((x << upsample_above) & 0x3F) >> 1;
+
+ if (base >= max_base_x) {
+ for (int i = r; i < bh; ++i) {
+ aom_memset16(dst, above[max_base_x], bw);
+ dst += stride;
+ }
+ return;
+ }
+
+ for (c = 0; c < bw; ++c, base += base_inc) {
+ if (base < max_base_x) {
+ val = above[base] * (32 - shift) + above[base + 1] * shift;
+ dst[c] = ROUND_POWER_OF_TWO(val, 5);
+ } else {
+ dst[c] = above[max_base_x];
+ }
+ }
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int upsample_left, int dx, int dy, int bd) {
+ int r, c, x, y, shift, val, base;
+
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int min_base_x = -(1 << upsample_above);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ for (r = 0; r < bh; ++r) {
+ for (c = 0; c < bw; ++c) {
+ y = r + 1;
+ x = (c << 6) - y * dx;
+ base = x >> frac_bits_x;
+ if (base >= min_base_x) {
+ shift = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+ val = above[base] * (32 - shift) + above[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 5);
+ } else {
+ x = c + 1;
+ y = (r << 6) - x * dy;
+ base = y >> frac_bits_y;
+ shift = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+ val = left[base] * (32 - shift) + left[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 5);
+ }
+ dst[c] = val;
+ }
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_left,
+ int dx, int dy, int bd) {
+ int r, c, y, base, shift, val;
+
+ (void)above;
+ (void)dx;
+ (void)bd;
+ assert(dx == 1);
+ assert(dy > 0);
+
+ const int max_base_y = (bw + bh - 1) << upsample_left;
+ const int frac_bits = 6 - upsample_left;
+ const int base_inc = 1 << upsample_left;
+ y = dy;
+ for (c = 0; c < bw; ++c, y += dy) {
+ base = y >> frac_bits;
+ shift = ((y << upsample_left) & 0x3F) >> 1;
+
+ for (r = 0; r < bh; ++r, base += base_inc) {
+ if (base < max_base_y) {
+ val = left[base] * (32 - shift) + left[base + 1] * shift;
+ dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5);
+ } else {
+ for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y];
+ break;
+ }
+ }
+ }
+}
+
+static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int upsample_left, int angle, int bd) {
+ const int dx = av1_get_dx(angle);
+ const int dy = av1_get_dy(angle);
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ assert(angle > 0 && angle < 270);
+
+ if (angle > 0 && angle < 90) {
+ av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left,
+ upsample_above, dx, dy, bd);
+ } else if (angle > 90 && angle < 180) {
+ av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left,
+ upsample_above, upsample_left, dx, dy, bd);
+ } else if (angle > 180 && angle < 270) {
+ av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left,
+ dx, dy, bd);
+ } else if (angle == 90) {
+ pred_high[V_PRED][tx_size](dst, stride, above, left, bd);
+ } else if (angle == 180) {
+ pred_high[H_PRED][tx_size](dst, stride, above, left, bd);
+ }
+}
+
+DECLARE_ALIGNED(16, const int8_t,
+ av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = {
+ {
+ { -6, 10, 0, 0, 0, 12, 0, 0 },
+ { -5, 2, 10, 0, 0, 9, 0, 0 },
+ { -3, 1, 1, 10, 0, 7, 0, 0 },
+ { -3, 1, 1, 2, 10, 5, 0, 0 },
+ { -4, 6, 0, 0, 0, 2, 12, 0 },
+ { -3, 2, 6, 0, 0, 2, 9, 0 },
+ { -3, 2, 2, 6, 0, 2, 7, 0 },
+ { -3, 1, 2, 2, 6, 3, 5, 0 },
+ },
+ {
+ { -10, 16, 0, 0, 0, 10, 0, 0 },
+ { -6, 0, 16, 0, 0, 6, 0, 0 },
+ { -4, 0, 0, 16, 0, 4, 0, 0 },
+ { -2, 0, 0, 0, 16, 2, 0, 0 },
+ { -10, 16, 0, 0, 0, 0, 10, 0 },
+ { -6, 0, 16, 0, 0, 0, 6, 0 },
+ { -4, 0, 0, 16, 0, 0, 4, 0 },
+ { -2, 0, 0, 0, 16, 0, 2, 0 },
+ },
+ {
+ { -8, 8, 0, 0, 0, 16, 0, 0 },
+ { -8, 0, 8, 0, 0, 16, 0, 0 },
+ { -8, 0, 0, 8, 0, 16, 0, 0 },
+ { -8, 0, 0, 0, 8, 16, 0, 0 },
+ { -4, 4, 0, 0, 0, 0, 16, 0 },
+ { -4, 0, 4, 0, 0, 0, 16, 0 },
+ { -4, 0, 0, 4, 0, 0, 16, 0 },
+ { -4, 0, 0, 0, 4, 0, 16, 0 },
+ },
+ {
+ { -2, 8, 0, 0, 0, 10, 0, 0 },
+ { -1, 3, 8, 0, 0, 6, 0, 0 },
+ { -1, 2, 3, 8, 0, 4, 0, 0 },
+ { 0, 1, 2, 3, 8, 2, 0, 0 },
+ { -1, 4, 0, 0, 0, 3, 10, 0 },
+ { -1, 3, 4, 0, 0, 4, 6, 0 },
+ { -1, 2, 3, 4, 0, 4, 4, 0 },
+ { -1, 2, 2, 3, 4, 3, 3, 0 },
+ },
+ {
+ { -12, 14, 0, 0, 0, 14, 0, 0 },
+ { -10, 0, 14, 0, 0, 12, 0, 0 },
+ { -9, 0, 0, 14, 0, 11, 0, 0 },
+ { -8, 0, 0, 0, 14, 10, 0, 0 },
+ { -10, 12, 0, 0, 0, 0, 14, 0 },
+ { -9, 1, 12, 0, 0, 0, 12, 0 },
+ { -8, 0, 0, 12, 0, 1, 11, 0 },
+ { -7, 0, 0, 1, 12, 1, 9, 0 },
+ },
+};
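+// Each row of av1_filter_intra_taps above holds the seven weights applied to
+// p0..p6 (the last entry is padding) for one of the eight output pixels of a
+// 4-wide by 2-tall patch (r_offset = k >> 2, c_offset = k & 3 below), and
+// every row sums to 16 = (1 << FILTER_INTRA_SCALE_BITS), so the filter has
+// unity gain before the ROUND_POWER_OF_TWO_SIGNED(.., 4) normalization.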
+
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) {
+ int r, c;
+ uint8_t buffer[33][33];
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ assert(bw <= 32 && bh <= 32);
+
+ // The initialization is just for silencing Jenkins static analysis warnings
+ for (r = 0; r < bh + 1; ++r)
+ memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+ memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+
+ for (r = 1; r < bh + 1; r += 2)
+ for (c = 1; c < bw + 1; c += 4) {
+ const uint8_t p0 = buffer[r - 1][c - 1];
+ const uint8_t p1 = buffer[r - 1][c];
+ const uint8_t p2 = buffer[r - 1][c + 1];
+ const uint8_t p3 = buffer[r - 1][c + 2];
+ const uint8_t p4 = buffer[r - 1][c + 3];
+ const uint8_t p5 = buffer[r][c - 1];
+ const uint8_t p6 = buffer[r + 1][c - 1];
+ for (int k = 0; k < 8; ++k) {
+ int r_offset = k >> 2;
+ int c_offset = k & 0x03;
+ buffer[r + r_offset][c + c_offset] =
+ clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
+ av1_filter_intra_taps[mode][k][0] * p0 +
+ av1_filter_intra_taps[mode][k][1] * p1 +
+ av1_filter_intra_taps[mode][k][2] * p2 +
+ av1_filter_intra_taps[mode][k][3] * p3 +
+ av1_filter_intra_taps[mode][k][4] * p4 +
+ av1_filter_intra_taps[mode][k][5] * p5 +
+ av1_filter_intra_taps[mode][k][6] * p6,
+ FILTER_INTRA_SCALE_BITS));
+ }
+ }
+
+ for (r = 0; r < bh; ++r) {
+ memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
+ dst += stride;
+ }
+}
+
+static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size,
+ const uint16_t *above,
+ const uint16_t *left, int mode,
+ int bd) {
+ int r, c;
+ uint16_t buffer[33][33];
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ assert(bw <= 32 && bh <= 32);
+
+ // The initialization is just for silencing Jenkins static analysis warnings
+ for (r = 0; r < bh + 1; ++r)
+ memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+ memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
+
+ for (r = 1; r < bh + 1; r += 2)
+ for (c = 1; c < bw + 1; c += 4) {
+ const uint16_t p0 = buffer[r - 1][c - 1];
+ const uint16_t p1 = buffer[r - 1][c];
+ const uint16_t p2 = buffer[r - 1][c + 1];
+ const uint16_t p3 = buffer[r - 1][c + 2];
+ const uint16_t p4 = buffer[r - 1][c + 3];
+ const uint16_t p5 = buffer[r][c - 1];
+ const uint16_t p6 = buffer[r + 1][c - 1];
+ for (int k = 0; k < 8; ++k) {
+ int r_offset = k >> 2;
+ int c_offset = k & 0x03;
+ buffer[r + r_offset][c + c_offset] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO_SIGNED(
+ av1_filter_intra_taps[mode][k][0] * p0 +
+ av1_filter_intra_taps[mode][k][1] * p1 +
+ av1_filter_intra_taps[mode][k][2] * p2 +
+ av1_filter_intra_taps[mode][k][3] * p3 +
+ av1_filter_intra_taps[mode][k][4] * p4 +
+ av1_filter_intra_taps[mode][k][5] * p5 +
+ av1_filter_intra_taps[mode][k][6] * p6,
+ FILTER_INTRA_SCALE_BITS),
+ bd);
+ }
+ }
+
+ for (r = 0; r < bh; ++r) {
+ memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0]));
+ dst += stride;
+ }
+}
+
+static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
+ if (plane == 0) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED ||
+ mode == SMOOTH_H_PRED);
+ } else {
+ // uv_mode is not set for inter blocks, so we need to detect that case
+ // explicitly.
+ if (is_inter_block(mbmi)) return 0;
+
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED ||
+ uv_mode == UV_SMOOTH_H_PRED);
+ }
+}
+
+static int get_filt_type(const MACROBLOCKD *xd, int plane) {
+ int ab_sm, le_sm;
+
+ if (plane == 0) {
+ const MB_MODE_INFO *ab = xd->above_mbmi;
+ const MB_MODE_INFO *le = xd->left_mbmi;
+ ab_sm = ab ? is_smooth(ab, plane) : 0;
+ le_sm = le ? is_smooth(le, plane) : 0;
+ } else {
+ const MB_MODE_INFO *ab = xd->chroma_above_mbmi;
+ const MB_MODE_INFO *le = xd->chroma_left_mbmi;
+ ab_sm = ab ? is_smooth(ab, plane) : 0;
+ le_sm = le ? is_smooth(le, plane) : 0;
+ }
+
+ return (ab_sm || le_sm) ? 1 : 0;
+}
+
+static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) {
+ const int d = abs(delta);
+ int strength = 0;
+
+ const int blk_wh = bs0 + bs1;
+ if (type == 0) {
+ if (blk_wh <= 8) {
+ if (d >= 56) strength = 1;
+ } else if (blk_wh <= 12) {
+ if (d >= 40) strength = 1;
+ } else if (blk_wh <= 16) {
+ if (d >= 40) strength = 1;
+ } else if (blk_wh <= 24) {
+ if (d >= 8) strength = 1;
+ if (d >= 16) strength = 2;
+ if (d >= 32) strength = 3;
+ } else if (blk_wh <= 32) {
+ if (d >= 1) strength = 1;
+ if (d >= 4) strength = 2;
+ if (d >= 32) strength = 3;
+ } else {
+ if (d >= 1) strength = 3;
+ }
+ } else {
+ if (blk_wh <= 8) {
+ if (d >= 40) strength = 1;
+ if (d >= 64) strength = 2;
+ } else if (blk_wh <= 16) {
+ if (d >= 20) strength = 1;
+ if (d >= 48) strength = 2;
+ } else if (blk_wh <= 24) {
+ if (d >= 4) strength = 3;
+ } else {
+ if (d >= 1) strength = 3;
+ }
+ }
+ return strength;
+}
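+// The strength returned above is 0 (no filtering) or 1..3; non-zero values
+// select kernel[strength - 1] in av1_filter_intra_edge() /
+// av1_filter_intra_edge_high() below. `type` is the smooth-neighbor flag
+// produced by get_filt_type(), and `delta` is the angle offset from the
+// nearest pure vertical/horizontal direction (p_angle - 90 or p_angle - 180).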
+
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
+ if (!strength) return;
+
+ const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
+ { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
+ };
+ const int filt = strength - 1;
+ uint8_t edge[129];
+
+ memcpy(edge, p, sz * sizeof(*p));
+ for (int i = 1; i < sz; i++) {
+ int s = 0;
+ for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
+ int k = i - 2 + j;
+ k = (k < 0) ? 0 : k;
+ k = (k > sz - 1) ? sz - 1 : k;
+ s += edge[k] * kernel[filt][j];
+ }
+ s = (s + 8) >> 4;
+ p[i] = s;
+ }
+}
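+// The three kernels above each sum to 16 and the result is rounded with
+// (s + 8) >> 4, so a flat edge passes through unchanged. The loop starts at
+// i = 1, leaving p[0] untouched: at the call sites below, p points at the
+// above-left corner sample, which may already have been smoothed by
+// filter_intra_edge_corner().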
+
+static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
+ const int kernel[3] = { 5, 6, 5 };
+
+ int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
+ (p_above[0] * kernel[2]);
+ s = (s + 8) >> 4;
+ p_above[-1] = s;
+ p_left[-1] = s;
+}
+
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
+ if (!strength) return;
+
+ const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
+ { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
+ };
+ const int filt = strength - 1;
+ uint16_t edge[129];
+
+ memcpy(edge, p, sz * sizeof(*p));
+ for (int i = 1; i < sz; i++) {
+ int s = 0;
+ for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
+ int k = i - 2 + j;
+ k = (k < 0) ? 0 : k;
+ k = (k > sz - 1) ? sz - 1 : k;
+ s += edge[k] * kernel[filt][j];
+ }
+ s = (s + 8) >> 4;
+ p[i] = s;
+ }
+}
+
+static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
+ const int kernel[3] = { 5, 6, 5 };
+
+ int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) +
+ (p_above[0] * kernel[2]);
+ s = (s + 8) >> 4;
+ p_above[-1] = s;
+ p_left[-1] = s;
+}
+
+void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
+ // interpolate half-sample positions
+ assert(sz <= MAX_UPSAMPLE_SZ);
+
+ uint8_t in[MAX_UPSAMPLE_SZ + 3];
+ // copy p[-1..(sz-1)] and extend first and last samples
+ in[0] = p[-1];
+ in[1] = p[-1];
+ for (int i = 0; i < sz; i++) {
+ in[i + 2] = p[i];
+ }
+ in[sz + 2] = p[sz - 1];
+
+ // interpolate half-sample edge positions
+ p[-2] = in[0];
+ for (int i = 0; i < sz; i++) {
+ int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3];
+ s = clip_pixel((s + 8) >> 4);
+ p[2 * i - 1] = s;
+ p[2 * i] = in[i + 2];
+ }
+}
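+// The upsampler above doubles the edge in place using a 4-tap
+// (-1, 9, 9, -1) / 16 half-pel filter: even output positions keep the
+// original samples, odd positions get the interpolated values, and the result
+// is written to p[-2] .. p[2 * sz - 2]. For a constant edge x the
+// interpolated value is (-x + 9x + 9x - x + 8) >> 4 = x, so flat regions are
+// preserved exactly.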
+
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
+ // interpolate half-sample positions
+ assert(sz <= MAX_UPSAMPLE_SZ);
+
+ uint16_t in[MAX_UPSAMPLE_SZ + 3];
+ // copy p[-1..(sz-1)] and extend first and last samples
+ in[0] = p[-1];
+ in[1] = p[-1];
+ for (int i = 0; i < sz; i++) {
+ in[i + 2] = p[i];
+ }
+ in[sz + 2] = p[sz - 1];
+
+ // interpolate half-sample edge positions
+ p[-2] = in[0];
+ for (int i = 0; i < sz; i++) {
+ int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3];
+ s = (s + 8) >> 4;
+ s = clip_pixel_highbd(s, bd);
+ p[2 * i - 1] = s;
+ p[2 * i] = in[i + 2];
+ }
+}
+
+static void build_intra_predictors_high(
+ const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
+ int dst_stride, PREDICTION_MODE mode, int angle_delta,
+ FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size,
+ int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px,
+ int n_bottomleft_px, int plane) {
+ int i;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 32]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 32]);
+ uint16_t *const above_row = above_data + 16;
+ uint16_t *const left_col = left_data + 16;
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ int need_left = extend_modes[mode] & NEED_LEFT;
+ int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ const uint16_t *above_ref = ref - ref_stride;
+ const uint16_t *left_ref = ref - 1;
+ int p_angle = 0;
+ const int is_dr_mode = av1_is_directional_mode(mode);
+ const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+ int base = 128 << (xd->bd - 8);
+
+ // The default values if ref pixels are not available:
+ // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
+ // base+1 A B .. Y Z
+ // base+1 C D .. W X
+ // base+1 E F .. U V
+ // base+1 G H .. S T T T T T
+
+ if (is_dr_mode) {
+ p_angle = mode_to_angle_map[mode] + angle_delta;
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+ if (use_filter_intra) need_left = need_above = need_above_left = 1;
+
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= 0);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= 0);
+
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ int val;
+ if (need_left) {
+ val = (n_top_px > 0) ? above_ref[0] : base + 1;
+ } else {
+ val = (n_left_px > 0) ? left_ref[0] : base - 1;
+ }
+ for (i = 0; i < txhpx; ++i) {
+ aom_memset16(dst, val, txwpx);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ // NEED_LEFT
+ if (need_left) {
+ int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+ if (use_filter_intra) need_bottom = 0;
+ if (is_dr_mode) need_bottom = p_angle > 180;
+ const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+ i = 0;
+ if (n_left_px > 0) {
+ for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+ if (need_bottom && n_bottomleft_px > 0) {
+ assert(i == txhpx);
+ for (; i < txhpx + n_bottomleft_px; i++)
+ left_col[i] = left_ref[i * ref_stride];
+ }
+ if (i < num_left_pixels_needed)
+ aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+ } else {
+ if (n_top_px > 0) {
+ aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
+ } else {
+ aom_memset16(left_col, base + 1, num_left_pixels_needed);
+ }
+ }
+ }
+
+ // NEED_ABOVE
+ if (need_above) {
+ int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+ if (use_filter_intra) need_right = 0;
+ if (is_dr_mode) need_right = p_angle < 90;
+ const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
+ i = n_top_px;
+ if (need_right && n_topright_px > 0) {
+ assert(n_top_px == txwpx);
+ memcpy(above_row + txwpx, above_ref + txwpx,
+ n_topright_px * sizeof(above_ref[0]));
+ i += n_topright_px;
+ }
+ if (i < num_top_pixels_needed)
+ aom_memset16(&above_row[i], above_row[i - 1],
+ num_top_pixels_needed - i);
+ } else {
+ if (n_left_px > 0) {
+ aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
+ } else {
+ aom_memset16(above_row, base - 1, num_top_pixels_needed);
+ }
+ }
+ }
+
+ if (need_above_left) {
+ if (n_top_px > 0 && n_left_px > 0) {
+ above_row[-1] = above_ref[-1];
+ } else if (n_top_px > 0) {
+ above_row[-1] = above_ref[0];
+ } else if (n_left_px > 0) {
+ above_row[-1] = left_ref[0];
+ } else {
+ above_row[-1] = base;
+ }
+ left_col[-1] = above_row[-1];
+ }
+
+ if (use_filter_intra) {
+ highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode, xd->bd);
+ return;
+ }
+
+ if (is_dr_mode) {
+ int upsample_above = 0;
+ int upsample_left = 0;
+ if (!disable_edge_filter) {
+ const int need_right = p_angle < 90;
+ const int need_bottom = p_angle > 180;
+ const int filt_type = get_filt_type(xd, plane);
+ if (p_angle != 90 && p_angle != 180) {
+ const int ab_le = need_above_left ? 1 : 0;
+ if (need_above && need_left && (txwpx + txhpx >= 24)) {
+ filter_intra_edge_corner_high(above_row, left_col);
+ }
+ if (need_above && n_top_px > 0) {
+ const int strength =
+ intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+ const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+ av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
+ }
+ if (need_left && n_left_px > 0) {
+ const int strength = intra_edge_filter_strength(
+ txhpx, txwpx, p_angle - 180, filt_type);
+ const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+ av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
+ }
+ }
+ upsample_above =
+ av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+ if (need_above && upsample_above) {
+ const int n_px = txwpx + (need_right ? txhpx : 0);
+ av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
+ }
+ upsample_left =
+ av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+ if (need_left && upsample_left) {
+ const int n_px = txhpx + (need_bottom ? txwpx : 0);
+ av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
+ }
+ }
+ highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ upsample_above, upsample_left, p_angle, xd->bd);
+ return;
+ }
+
+ // predict
+ if (mode == DC_PRED) {
+ dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
+ dst, dst_stride, above_row, left_col, xd->bd);
+ } else {
+ pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd);
+ }
+}
+
+static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ PREDICTION_MODE mode, int angle_delta,
+ FILTER_INTRA_MODE filter_intra_mode,
+ TX_SIZE tx_size, int disable_edge_filter,
+ int n_top_px, int n_topright_px,
+ int n_left_px, int n_bottomleft_px,
+ int plane) {
+ int i;
+ const uint8_t *above_ref = ref - ref_stride;
+ const uint8_t *left_ref = ref - 1;
+ DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]);
+ uint8_t *const above_row = above_data + 16;
+ uint8_t *const left_col = left_data + 16;
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ int need_left = extend_modes[mode] & NEED_LEFT;
+ int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ int p_angle = 0;
+ const int is_dr_mode = av1_is_directional_mode(mode);
+ const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+
+ // The default values if ref pixels are not available:
+ // 127 127 127 .. 127 127 127 127 127 127
+ // 129 A B .. Y Z
+ // 129 C D .. W X
+ // 129 E F .. U V
+ // 129 G H .. S T T T T T
+ // ..
+
+ if (is_dr_mode) {
+ p_angle = mode_to_angle_map[mode] + angle_delta;
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+ if (use_filter_intra) need_left = need_above = need_above_left = 1;
+
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= 0);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= 0);
+
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ int val;
+ if (need_left) {
+ val = (n_top_px > 0) ? above_ref[0] : 129;
+ } else {
+ val = (n_left_px > 0) ? left_ref[0] : 127;
+ }
+ for (i = 0; i < txhpx; ++i) {
+ memset(dst, val, txwpx);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ // NEED_LEFT
+ if (need_left) {
+ int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+ if (use_filter_intra) need_bottom = 0;
+ if (is_dr_mode) need_bottom = p_angle > 180;
+ const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+ i = 0;
+ if (n_left_px > 0) {
+ for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+ if (need_bottom && n_bottomleft_px > 0) {
+ assert(i == txhpx);
+ for (; i < txhpx + n_bottomleft_px; i++)
+ left_col[i] = left_ref[i * ref_stride];
+ }
+ if (i < num_left_pixels_needed)
+ memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+ } else {
+ if (n_top_px > 0) {
+ memset(left_col, above_ref[0], num_left_pixels_needed);
+ } else {
+ memset(left_col, 129, num_left_pixels_needed);
+ }
+ }
+ }
+
+ // NEED_ABOVE
+ if (need_above) {
+ int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+ if (use_filter_intra) need_right = 0;
+ if (is_dr_mode) need_right = p_angle < 90;
+ const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px);
+ i = n_top_px;
+ if (need_right && n_topright_px > 0) {
+ assert(n_top_px == txwpx);
+ memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
+ i += n_topright_px;
+ }
+ if (i < num_top_pixels_needed)
+ memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
+ } else {
+ if (n_left_px > 0) {
+ memset(above_row, left_ref[0], num_top_pixels_needed);
+ } else {
+ memset(above_row, 127, num_top_pixels_needed);
+ }
+ }
+ }
+
+ if (need_above_left) {
+ if (n_top_px > 0 && n_left_px > 0) {
+ above_row[-1] = above_ref[-1];
+ } else if (n_top_px > 0) {
+ above_row[-1] = above_ref[0];
+ } else if (n_left_px > 0) {
+ above_row[-1] = left_ref[0];
+ } else {
+ above_row[-1] = 128;
+ }
+ left_col[-1] = above_row[-1];
+ }
+
+ if (use_filter_intra) {
+ av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode);
+ return;
+ }
+
+ if (is_dr_mode) {
+ int upsample_above = 0;
+ int upsample_left = 0;
+ if (!disable_edge_filter) {
+ const int need_right = p_angle < 90;
+ const int need_bottom = p_angle > 180;
+ const int filt_type = get_filt_type(xd, plane);
+ if (p_angle != 90 && p_angle != 180) {
+ const int ab_le = need_above_left ? 1 : 0;
+ if (need_above && need_left && (txwpx + txhpx >= 24)) {
+ filter_intra_edge_corner(above_row, left_col);
+ }
+ if (need_above && n_top_px > 0) {
+ const int strength =
+ intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+ const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+ av1_filter_intra_edge(above_row - ab_le, n_px, strength);
+ }
+ if (need_left && n_left_px > 0) {
+ const int strength = intra_edge_filter_strength(
+ txhpx, txwpx, p_angle - 180, filt_type);
+ const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+ av1_filter_intra_edge(left_col - ab_le, n_px, strength);
+ }
+ }
+ upsample_above =
+ av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+ if (need_above && upsample_above) {
+ const int n_px = txwpx + (need_right ? txhpx : 0);
+ av1_upsample_intra_edge(above_row, n_px);
+ }
+ upsample_left =
+ av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+ if (need_left && upsample_left) {
+ const int n_px = txhpx + (need_bottom ? txwpx : 0);
+ av1_upsample_intra_edge(left_col, n_px);
+ }
+ }
+ dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
+ upsample_left, p_angle);
+ return;
+ }
+
+ // predict
+ if (mode == DC_PRED) {
+ dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row,
+ left_col);
+ } else {
+ pred[mode][tx_size](dst, dst_stride, above_row, left_col);
+ }
+}
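+// Both build_intra_predictors() variants follow the same sequence: extend the
+// left and above reference arrays, filling missing neighbors with the
+// 127/129/128 defaults (base - 1 / base + 1 / base in the high-bit-depth
+// variant); then either apply the recursive filter-intra predictor, run the
+// directional path (corner/edge filtering plus optional 2x edge upsampling),
+// or dispatch to the per-mode predictor tables, with DC picking a variant
+// based on which reference edges exist.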
+
+void av1_predict_intra_block(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
+ TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
+ FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ const int x = col_off << tx_size_wide_log2[0];
+ const int y = row_off << tx_size_high_log2[0];
+
+ if (use_palette) {
+ int r, c;
+ const uint8_t *const map = xd->plane[plane != 0].color_index_map +
+ xd->color_index_map_offset[plane != 0];
+ const uint16_t *const palette =
+ mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (r = 0; r < txhpx; ++r) {
+ for (c = 0; c < txwpx; ++c) {
+ dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]];
+ }
+ }
+ } else {
+ for (r = 0; r < txhpx; ++r) {
+ for (c = 0; c < txwpx; ++c) {
+ dst[r * dst_stride + c] =
+ (uint8_t)palette[map[(r + y) * wpx + c + x]];
+ }
+ }
+ }
+ return;
+ }
+
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide_unit[tx_size];
+ const int txh = tx_size_high_unit[tx_size];
+ const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available
+ : xd->up_available);
+ const int have_left =
+ col_off ||
+ (pd->subsampling_x ? xd->chroma_left_available : xd->left_available);
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ const int xr_chr_offset = 0;
+ const int yd_chr_offset = 0;
+
+ // Distance between the right edge of this prediction block to
+ // the frame right edge
+ const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
+ (wpx - x - txwpx) - xr_chr_offset;
+ // Distance between the bottom edge of this prediction block to
+ // the frame bottom edge
+ const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
+ (hpx - y - txhpx) - yd_chr_offset;
+ const int right_available =
+ mi_col + ((col_off + txw) << pd->subsampling_x) < xd->tile.mi_col_end;
+ const int bottom_available =
+ (yd > 0) &&
+ (mi_row + ((row_off + txh) << pd->subsampling_y) < xd->tile.mi_row_end);
+
+ const PARTITION_TYPE partition = mbmi->partition;
+
+ // force 4x4 chroma component block size.
+ bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ const int have_top_right = has_top_right(
+ cm, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size,
+ row_off, col_off, pd->subsampling_x, pd->subsampling_y);
+ const int have_bottom_left = has_bottom_left(
+ cm, bsize, mi_row, mi_col, bottom_available, have_left, partition,
+ tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y);
+
+ const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ build_intra_predictors_high(
+ xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
+ filter_intra_mode, tx_size, disable_edge_filter,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top_right ? AOMMIN(txwpx, xr) : 0,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+ return;
+ }
+
+ build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
+ angle_delta, filter_intra_mode, tx_size,
+ disable_edge_filter,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top_right ? AOMMIN(txwpx, xr) : 0,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+}
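+// Note: xr and yd above are the distances from this block's right/bottom edge
+// to the frame's right/bottom edge (in this plane's pixels), so the
+// AOMMIN(...) arguments clamp the reference-pixel counts handed to the
+// builders against the frame boundary as well as the transform size.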
+
+void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int blk_col, int blk_row,
+ TX_SIZE tx_size) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ const PREDICTION_MODE mode =
+ (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
+ const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0;
+ const FILTER_INTRA_MODE filter_intra_mode =
+ (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra)
+ ? mbmi->filter_intra_mode_info.filter_intra_mode
+ : FILTER_INTRA_MODES;
+ const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
+
+ if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
+#if CONFIG_DEBUG
+ assert(is_cfl_allowed(xd));
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+ (void)plane_bsize;
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ if (!xd->lossless[mbmi->segment_id]) {
+ assert(blk_col == 0);
+ assert(blk_row == 0);
+ assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+ assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
+ }
+#endif
+ CFL_CTX *const cfl = &xd->cfl;
+ CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
+ if (cfl->dc_pred_is_cached[pred_plane] == 0) {
+ av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
+ angle_delta, use_palette, filter_intra_mode, dst,
+ dst_stride, dst, dst_stride, blk_col, blk_row,
+ plane);
+ if (cfl->use_dc_pred_cache) {
+ cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
+ cfl->dc_pred_is_cached[pred_plane] = 1;
+ }
+ } else {
+ cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);
+ }
+ cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
+ return;
+ }
+ av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
+ angle_delta, use_palette, filter_intra_mode, dst,
+ dst_stride, dst, dst_stride, blk_col, blk_row, plane);
+}
+
+void av1_init_intra_predictors(void) {
+ aom_once(init_intra_predictors_internal);
+}
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
new file mode 100644
index 000000000..07853aba0
--- /dev/null
+++ b/third_party/aom/av1/common/reconintra.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
+
+#include <stdlib.h>
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_init_intra_predictors(void);
+void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int blk_col, int blk_row,
+ TX_SIZE tx_size);
+void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int bw, int bh, TX_SIZE tx_size,
+ PREDICTION_MODE mode, int angle_delta,
+ int use_palette,
+ FILTER_INTRA_MODE filter_intra_mode,
+ const uint8_t *ref, int ref_stride, uint8_t *dst,
+ int dst_stride, int aoff, int loff, int plane);
+
+// Mapping of interintra to intra mode for use in the intra component
+static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
+ DC_PRED, V_PRED, H_PRED, SMOOTH_PRED
+};
+
+// Mapping of intra mode to the interintra mode
+static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = {
+ II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED,
+ II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+
+static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) {
+ return mode >= V_PRED && mode <= D67_PRED;
+}
+
+static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {
+ return bsize >= BLOCK_8X8;
+}
+
+static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {
+ return frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
+ cm->allow_intrabc;
+}
+
+static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
+ BLOCK_SIZE bs) {
+ if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0;
+
+ return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;
+}
+
+static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *mbmi) {
+ return mbmi->mode == DC_PRED &&
+ mbmi->palette_mode_info.palette_size[0] == 0 &&
+ av1_filter_intra_allowed_bsize(cm, mbmi->sb_type);
+}
+
+extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+
+// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y.
+// If angle > 0 && angle < 90, dx = -((int)(256 / t));
+// If angle > 90 && angle < 180, dx = (int)(256 / t);
+// If angle > 180 && angle < 270, dx = 1;
+static INLINE int av1_get_dx(int angle) {
+ if (angle > 0 && angle < 90) {
+ return dr_intra_derivative[angle];
+ } else if (angle > 90 && angle < 180) {
+ return dr_intra_derivative[180 - angle];
+ } else {
+ // In this case, we are not really going to use dx. We may return any value.
+ return 1;
+ }
+}
+
+// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X.
+// If angle > 0 && angle < 90, dy = 1;
+// If angle > 90 && angle < 180, dy = (int)(256 * t);
+// If angle > 180 && angle < 270, dy = -((int)(256 * t));
+static INLINE int av1_get_dy(int angle) {
+ if (angle > 90 && angle < 180) {
+ return dr_intra_derivative[angle - 90];
+ } else if (angle > 180 && angle < 270) {
+ return dr_intra_derivative[270 - angle];
+ } else {
+ // In this case, we are not really going to use dy. We may return any value.
+ return 1;
+ }
+}
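+// Editor's note (illustrative sketch, not part of the upstream sources): for
+// a directional angle of 135 degrees both helpers read the same table entry,
+// while components outside their useful range fall back to the placeholder 1:
+//   av1_get_dx(135) == dr_intra_derivative[180 - 135];  // horizontal step
+//   av1_get_dy(135) == dr_intra_derivative[135 - 90];   // vertical step
+//   av1_get_dy(45)  == 1;  // dy is unused for angles in (0, 90)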
+
+static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
+ int type) {
+ const int d = abs(delta);
+ const int blk_wh = bs0 + bs1;
+ if (d <= 0 || d >= 40) return 0;
+ return type ? (blk_wh <= 8) : (blk_wh <= 16);
+}
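+// Editor's note (illustrative, not part of upstream): per the check above,
+// intra edge upsampling needs a non-zero |delta| below 40 and a small block:
+// with bs0 + bs1 == 8 it is allowed for either type, while with
+// bs0 + bs1 == 16 it is allowed only when type == 0.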
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
new file mode 100644
index 000000000..d61a20aa2
--- /dev/null
+++ b/third_party/aom/av1/common/resize.c
@@ -0,0 +1,1280 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+
+#include "config/aom_scale_rtcd.h"
+
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = {
+ { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 },
+ { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 },
+ { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 },
+ { -3, -1, 31, 63, 39, 2, -3, 0 }, { -2, -2, 30, 63, 40, 2, -3, 0 },
+ { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 29, 63, 41, 3, -4, 0 },
+ { -2, -2, 28, 63, 42, 3, -4, 0 }, { -2, -2, 27, 63, 43, 3, -4, 0 },
+ { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 26, 62, 44, 5, -4, 0 },
+ { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 },
+ { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 },
+ { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 22, 61, 48, 7, -4, -1 },
+ { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 49, 8, -4, 0 },
+ { -1, -4, 20, 60, 50, 8, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 },
+ { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 18, 58, 52, 10, -4, -1 },
+ { -1, -4, 17, 58, 52, 11, -4, -1 }, { -1, -4, 16, 58, 53, 11, -4, -1 },
+ { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 57, 54, 12, -4, -1 },
+ { -1, -4, 15, 56, 54, 13, -4, -1 }, { -1, -4, 14, 56, 55, 13, -4, -1 },
+ { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 55, 56, 14, -4, -1 },
+ { -1, -4, 13, 54, 56, 15, -4, -1 }, { -1, -4, 12, 54, 57, 15, -4, -1 },
+ { -1, -4, 12, 53, 57, 16, -4, -1 }, { -1, -4, 11, 53, 58, 16, -4, -1 },
+ { -1, -4, 11, 52, 58, 17, -4, -1 }, { -1, -4, 10, 52, 58, 18, -4, -1 },
+ { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 },
+ { -1, -4, 8, 50, 60, 20, -4, -1 }, { 0, -4, 8, 49, 60, 20, -4, -1 },
+ { 0, -4, 7, 49, 60, 21, -3, -2 }, { -1, -4, 7, 48, 61, 22, -3, -2 },
+ { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 },
+ { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 },
+ { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 44, 62, 26, -3, -2 },
+ { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 43, 63, 27, -2, -2 },
+ { 0, -4, 3, 42, 63, 28, -2, -2 }, { 0, -4, 3, 41, 63, 29, -2, -2 },
+ { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 40, 63, 30, -2, -2 },
+ { 0, -3, 2, 39, 63, 31, -1, -3 }, { 0, -3, 1, 39, 64, 31, -1, -3 },
+ { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 37, 64, 33, -1, -3 },
+ { 0, -3, 1, 36, 64, 34, -1, -3 }, { 0, -3, 0, 36, 64, 34, 0, -3 },
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = {
+ { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 },
+ { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 },
+ { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 },
+ { 0, -8, 26, 79, 39, -7, -2, 1 }, { 0, -8, 25, 79, 40, -7, -2, 1 },
+ { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 23, 78, 42, -6, -2, 1 },
+ { 0, -8, 22, 78, 43, -6, -2, 1 }, { 0, -8, 21, 78, 44, -6, -2, 1 },
+ { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 19, 77, 47, -5, -3, 1 },
+ { 0, -8, 18, 77, 48, -5, -3, 1 }, { 0, -8, 17, 77, 49, -5, -3, 1 },
+ { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 76, 51, -4, -3, 1 },
+ { 0, -8, 15, 75, 52, -3, -4, 1 }, { 0, -7, 14, 74, 53, -3, -4, 1 },
+ { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 12, 73, 55, -2, -4, 1 },
+ { 0, -7, 11, 73, 56, -2, -4, 1 }, { 0, -7, 10, 72, 57, -1, -4, 1 },
+ { 1, -7, 10, 71, 58, -1, -5, 1 }, { 0, -7, 9, 71, 59, 0, -5, 1 },
+ { 1, -7, 8, 70, 60, 0, -5, 1 }, { 1, -7, 7, 69, 61, 1, -5, 1 },
+ { 1, -6, 6, 68, 62, 1, -5, 1 }, { 0, -6, 6, 68, 62, 2, -5, 1 },
+ { 1, -6, 5, 67, 63, 2, -5, 1 }, { 1, -6, 5, 66, 64, 3, -6, 1 },
+ { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -6, 3, 64, 66, 5, -6, 1 },
+ { 1, -5, 2, 63, 67, 5, -6, 1 }, { 1, -5, 2, 62, 68, 6, -6, 0 },
+ { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 1, 61, 69, 7, -7, 1 },
+ { 1, -5, 0, 60, 70, 8, -7, 1 }, { 1, -5, 0, 59, 71, 9, -7, 0 },
+ { 1, -5, -1, 58, 71, 10, -7, 1 }, { 1, -4, -1, 57, 72, 10, -7, 0 },
+ { 1, -4, -2, 56, 73, 11, -7, 0 }, { 1, -4, -2, 55, 73, 12, -7, 0 },
+ { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 53, 74, 14, -7, 0 },
+ { 1, -4, -3, 52, 75, 15, -8, 0 }, { 1, -3, -4, 51, 76, 15, -8, 0 },
+ { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 49, 77, 17, -8, 0 },
+ { 1, -3, -5, 48, 77, 18, -8, 0 }, { 1, -3, -5, 47, 77, 19, -8, 0 },
+ { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 44, 78, 21, -8, 0 },
+ { 1, -2, -6, 43, 78, 22, -8, 0 }, { 1, -2, -6, 42, 78, 23, -8, 0 },
+ { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 40, 79, 25, -8, 0 },
+ { 1, -2, -7, 39, 79, 26, -8, 0 }, { 1, -2, -7, 38, 80, 27, -8, -1 },
+ { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -2, -7, 36, 80, 29, -8, -1 },
+ { 1, -1, -8, 35, 80, 30, -8, -1 }, { 1, -1, -8, 34, 80, 31, -8, -1 },
+};
+
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = {
+ { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 },
+ { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 },
+ { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 },
+ { 2, -10, 17, 95, 34, -12, 2, 0 }, { 2, -9, 15, 95, 35, -12, 2, 0 },
+ { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -9, 13, 94, 38, -12, 2, 0 },
+ { 2, -8, 12, 93, 40, -12, 1, 0 }, { 2, -8, 11, 93, 41, -12, 1, 0 },
+ { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -8, 8, 92, 44, -12, 1, 1 },
+ { 2, -7, 7, 91, 46, -12, 1, 0 }, { 2, -7, 6, 90, 47, -12, 1, 1 },
+ { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 4, 89, 50, -12, 1, 0 },
+ { 2, -6, 3, 88, 52, -12, 0, 1 }, { 2, -6, 2, 87, 54, -12, 0, 1 },
+ { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, 0, 85, 57, -12, 0, 1 },
+ { 2, -5, -1, 84, 58, -11, 0, 1 }, { 2, -5, -2, 83, 60, -11, 0, 1 },
+ { 2, -4, -2, 82, 61, -11, -1, 1 }, { 1, -4, -3, 81, 63, -10, -1, 1 },
+ { 2, -4, -4, 80, 64, -10, -1, 1 }, { 1, -4, -4, 79, 66, -10, -1, 1 },
+ { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 76, 69, -9, -1, 1 },
+ { 1, -3, -6, 75, 70, -8, -2, 1 }, { 1, -2, -7, 74, 71, -8, -2, 1 },
+ { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 71, 74, -7, -2, 1 },
+ { 1, -2, -8, 70, 75, -6, -3, 1 }, { 1, -1, -9, 69, 76, -6, -3, 1 },
+ { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 66, 79, -4, -4, 1 },
+ { 1, -1, -10, 64, 80, -4, -4, 2 }, { 1, -1, -10, 63, 81, -3, -4, 1 },
+ { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 60, 83, -2, -5, 2 },
+ { 1, 0, -11, 58, 84, -1, -5, 2 }, { 1, 0, -12, 57, 85, 0, -5, 2 },
+ { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 54, 87, 2, -6, 2 },
+ { 1, 0, -12, 52, 88, 3, -6, 2 }, { 0, 1, -12, 50, 89, 4, -6, 2 },
+ { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 47, 90, 6, -7, 2 },
+ { 0, 1, -12, 46, 91, 7, -7, 2 }, { 1, 1, -12, 44, 92, 8, -8, 2 },
+ { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 41, 93, 11, -8, 2 },
+ { 0, 1, -12, 40, 93, 12, -8, 2 }, { 0, 2, -12, 38, 94, 13, -9, 2 },
+ { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 35, 95, 15, -9, 2 },
+ { 0, 2, -12, 34, 95, 17, -10, 2 }, { 0, 2, -11, 32, 95, 18, -10, 2 },
+ { 0, 2, -12, 31, 96, 19, -10, 2 }, { 0, 2, -12, 29, 96, 21, -10, 2 },
+ { 0, 2, -11, 28, 96, 22, -11, 2 }, { 0, 2, -11, 26, 96, 24, -11, 2 },
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = {
+ { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 },
+ { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 },
+ { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 },
+ { 2, -5, 4, 111, 24, -10, 3, -1 }, { 2, -4, 3, 110, 26, -11, 3, -1 },
+ { 2, -4, 1, 110, 28, -11, 3, -1 }, { 2, -4, 0, 109, 30, -12, 4, -1 },
+ { 1, -3, -1, 108, 32, -12, 4, -1 }, { 1, -3, -2, 108, 34, -13, 4, -1 },
+ { 1, -2, -4, 107, 36, -13, 4, -1 }, { 1, -2, -5, 106, 38, -13, 4, -1 },
+ { 1, -1, -6, 105, 40, -14, 4, -1 }, { 1, -1, -7, 104, 42, -14, 4, -1 },
+ { 1, -1, -7, 103, 44, -15, 4, -1 }, { 1, 0, -8, 101, 46, -15, 4, -1 },
+ { 1, 0, -9, 100, 48, -15, 4, -1 }, { 1, 0, -10, 99, 50, -15, 4, -1 },
+ { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -11, 96, 55, -16, 4, -1 },
+ { 0, 1, -12, 95, 57, -16, 4, -1 }, { 0, 2, -13, 93, 59, -16, 4, -1 },
+ { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 90, 63, -16, 4, -1 },
+ { 0, 2, -14, 88, 65, -16, 4, -1 }, { 0, 2, -15, 86, 67, -16, 4, 0 },
+ { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 83, 71, -17, 4, 0 },
+ { 0, 3, -16, 81, 73, -16, 3, 0 }, { 0, 3, -16, 79, 75, -16, 3, 0 },
+ { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 75, 79, -16, 3, 0 },
+ { 0, 3, -16, 73, 81, -16, 3, 0 }, { 0, 4, -17, 71, 83, -16, 3, 0 },
+ { 0, 4, -17, 69, 84, -15, 3, 0 }, { 0, 4, -16, 67, 86, -15, 2, 0 },
+ { -1, 4, -16, 65, 88, -14, 2, 0 }, { -1, 4, -16, 63, 90, -14, 2, 0 },
+ { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 59, 93, -13, 2, 0 },
+ { -1, 4, -16, 57, 95, -12, 1, 0 }, { -1, 4, -16, 55, 96, -11, 1, 0 },
+ { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 50, 99, -10, 0, 1 },
+ { -1, 4, -15, 48, 100, -9, 0, 1 }, { -1, 4, -15, 46, 101, -8, 0, 1 },
+ { -1, 4, -15, 44, 103, -7, -1, 1 }, { -1, 4, -14, 42, 104, -7, -1, 1 },
+ { -1, 4, -14, 40, 105, -6, -1, 1 }, { -1, 4, -13, 38, 106, -5, -2, 1 },
+ { -1, 4, -13, 36, 107, -4, -2, 1 }, { -1, 4, -13, 34, 108, -2, -3, 1 },
+ { -1, 4, -12, 32, 108, -1, -3, 1 }, { -1, 4, -12, 30, 109, 0, -4, 2 },
+ { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -11, 26, 110, 3, -4, 2 },
+ { -1, 3, -10, 24, 111, 4, -5, 2 }, { -1, 3, -10, 22, 111, 6, -5, 2 },
+ { -1, 3, -10, 21, 112, 7, -6, 2 }, { -1, 3, -9, 19, 112, 8, -6, 2 },
+ { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 },
+};
+
+const int16_t av1_resize_filter_normative[(
+ 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
+#if UPSCALE_NORMATIVE_TAPS == 8
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 },
+ { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 },
+ { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 },
+ { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 },
+ { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 },
+ { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 },
+ { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 },
+ { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
+ { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
+ { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
+ { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
+ { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
+ { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 },
+ { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 },
+ { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 },
+ { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 },
+ { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 },
+ { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 },
+ { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 },
+ { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 },
+ { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 },
+ { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
+ { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
+ { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
+ { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
+ { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
+ { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 },
+ { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 },
+ { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 },
+ { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 },
+ { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 },
+ { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 },
+#else
+#error "Invalid value of UPSCALE_NORMATIVE_TAPS"
+#endif // UPSCALE_NORMATIVE_TAPS == 8
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+#define filteredinterp_filters1000 av1_resize_filter_normative
+
+// Filters for factor of 2 downsampling.
+static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
+static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
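+// Editor's note (illustrative, not part of upstream): the arrays above store
+// only half of each symmetric kernel. Fully expanded they are
+//   symeven: { -1, -3, 12, 56, 56, 12, -3, -1 }   (8 taps)
+//   symodd : { -3,  0, 35, 64, 35,  0, -3 }       (7 taps, centre tap 64)
+// and each sums to 128 == 1 << FILTER_BITS, so the down2_* routines below
+// preserve the DC level after the final rounding shift.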
+
+static const InterpKernel *choose_interp_filter(int in_length, int out_length) {
+ int out_length16 = out_length * 16;
+ if (out_length16 >= in_length * 16)
+ return filteredinterp_filters1000;
+ else if (out_length16 >= in_length * 13)
+ return filteredinterp_filters875;
+ else if (out_length16 >= in_length * 11)
+ return filteredinterp_filters750;
+ else if (out_length16 >= in_length * 9)
+ return filteredinterp_filters625;
+ else
+ return filteredinterp_filters500;
+}
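+// Editor's note (illustrative, not part of upstream): the comparisons above
+// amount to thresholds of 1, 13/16, 11/16 and 9/16 on out_length / in_length,
+// selecting a progressively narrower band as the downscale gets stronger:
+//   choose_interp_filter(1920, 1920);  // ratio 1.00  -> filteredinterp_filters1000
+//   choose_interp_filter(1920, 1440);  // ratio 0.75  -> filteredinterp_filters750
+//   choose_interp_filter(1920, 1280);  // ratio ~0.67 -> filteredinterp_filters625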
+
+static void interpolate_core(const uint8_t *const input, int in_length,
+ uint8_t *output, int out_length,
+ const int16_t *interp_filters, int interp_taps) {
+ const int32_t delta =
+ (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+ out_length;
+ const int32_t offset =
+ in_length > out_length
+ ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length
+ : -(((int32_t)(out_length - in_length)
+ << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length;
+ uint8_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int32_t y;
+
+ x = 0;
+ y = offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = out_length - 1;
+ y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+ in_length) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
+ ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k) {
+ const int pk = int_pel - interp_taps / 2 + 1 + k;
+ sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
+ }
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // End part.
+ for (; x < out_length; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] *
+ input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ }
+}
+
+static void interpolate(const uint8_t *const input, int in_length,
+ uint8_t *output, int out_length) {
+ const InterpKernel *interp_filters =
+ choose_interp_filter(in_length, out_length);
+
+ interpolate_core(input, in_length, output, out_length, &interp_filters[0][0],
+ SUBPEL_TAPS);
+}
+
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length) {
+ return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length;
+}
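+// Editor's note (illustrative, not part of upstream): the step is the
+// fixed-point source increment per destination pixel. For a 2x upscale such
+// as in_length = 960, out_length = 1920 it is exactly half a pixel:
+//   av1_get_upscale_convolve_step(960, 1920) == 1 << (RS_SCALE_SUBPEL_BITS - 1)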
+
+static int32_t get_upscale_convolve_x0(int in_length, int out_length,
+ int32_t x_step_qn) {
+ const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS);
+ const int32_t x0 =
+ (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length +
+ RS_SCALE_EXTRA_OFF - err / 2;
+ return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
+}
+
+#ifndef __clang_analyzer__
+static void down2_symeven(const uint8_t *const input, int length,
+ uint8_t *output) {
+ // Actual filter len = 2 * filter_len_half.
+ const int16_t *filter = av1_down2_symeven_half_filter;
+ const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2;
+ int i, j;
+ uint8_t *optr = output;
+ int l1 = filter_len_half;
+ int l2 = (length - filter_len_half);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum +=
+ (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + 1 + j, length - 1)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum +=
+ (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ }
+}
+#endif
+
+static void down2_symodd(const uint8_t *const input, int length,
+ uint8_t *output) {
+ // Actual filter len = 2 * filter_len_half - 1.
+ const int16_t *filter = av1_down2_symodd_half_filter;
+ const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2;
+ int i, j;
+ uint8_t *optr = output;
+ int l1 = filter_len_half - 1;
+ int l2 = (length - filter_len_half + 1);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ }
+}
+
+static int get_down2_length(int length, int steps) {
+ for (int s = 0; s < steps; ++s) length = (length + 1) >> 1;
+ return length;
+}
+
+static int get_down2_steps(int in_length, int out_length) {
+ int steps = 0;
+ int proj_in_length;
+ while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
+ ++steps;
+ in_length = proj_in_length;
+ if (in_length == 1) {
+ // Special case: we break because any further calls to get_down2_length()
+      // would be with length == 1, which returns 1, resulting in an infinite
+ // loop.
+ break;
+ }
+ }
+ return steps;
+}
+
+static void resize_multistep(const uint8_t *const input, int length,
+ uint8_t *output, int olength, uint8_t *otmp) {
+ if (length == olength) {
+ memcpy(output, input, sizeof(output[0]) * length);
+ return;
+ }
+ const int steps = get_down2_steps(length, olength);
+
+ if (steps > 0) {
+ uint8_t *out = NULL;
+ int filteredlength = length;
+
+ assert(otmp != NULL);
+ uint8_t *otmp2 = otmp + get_down2_length(length, 1);
+ for (int s = 0; s < steps; ++s) {
+ const int proj_filteredlength = get_down2_length(filteredlength, 1);
+ const uint8_t *const in = (s == 0 ? input : out);
+ if (s == steps - 1 && proj_filteredlength == olength)
+ out = output;
+ else
+ out = (s & 1 ? otmp2 : otmp);
+ if (filteredlength & 1)
+ down2_symodd(in, filteredlength, out);
+ else
+ down2_symeven(in, filteredlength, out);
+ filteredlength = proj_filteredlength;
+ }
+ if (filteredlength != olength) {
+ interpolate(out, filteredlength, output, olength);
+ }
+ } else {
+ interpolate(input, length, output, olength);
+ }
+}
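+// Editor's note (illustrative, not part of upstream): resize_multistep()
+// halves the length with the down2 filters while the result stays >= olength,
+// then finishes with a single interpolate() call. Resizing a 1920-sample row
+// down to 300 samples therefore proceeds as
+//   1920 -> 960 -> 480   (two down2 passes, steps == 2)
+//   480  -> 300          (interpolate(), which selects the 0.625-band kernel)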
+
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
+ int i;
+ uint8_t *iptr = img;
+ uint8_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *aptr++ = *iptr;
+ }
+}
+
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
+ int i;
+ uint8_t *iptr = img;
+ uint8_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *iptr = *aptr++;
+ }
+}
+
+static void resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride) {
+ int i;
+ uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height);
+ uint8_t *tmpbuf =
+ (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height));
+ uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height);
+ uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2);
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ goto Error;
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
+ for (i = 0; i < height; ++i)
+ resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2,
+ tmpbuf);
+ for (i = 0; i < width2; ++i) {
+ fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+ resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf);
+ fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
+ }
+
+Error:
+ aom_free(intbuf);
+ aom_free(tmpbuf);
+ aom_free(arrbuf);
+ aom_free(arrbuf2);
+}
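+// Editor's note (descriptive, not part of upstream): resize_plane() above is
+// a separable two-pass resize. Each input row is first resized horizontally
+// into intbuf (width2 x height); each intbuf column is then copied to arrbuf,
+// resized vertically into arrbuf2 and written back to the output, so only a
+// single scratch line of AOMMAX(width, height) samples (tmpbuf) is needed.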
+
+static void upscale_normative_rect(const uint8_t *const input, int height,
+ int width, int in_stride, uint8_t *output,
+ int height2, int width2, int out_stride,
+ int x_step_qn, int x0_qn, int pad_left,
+ int pad_right) {
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
+ assert(height2 == height);
+
+ // Extend the left/right pixels of the tile column if needed
+ // (either because we can't sample from other tiles, or because we're at
+ // a frame edge).
+ // Save the overwritten pixels into tmp_left and tmp_right.
+  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one more
+  // column of border pixels than we would otherwise expect.
+ const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
+ uint8_t *tmp_left =
+ NULL; // Silence spurious "may be used uninitialized" warnings
+ uint8_t *tmp_right = NULL;
+ uint8_t *const in_tl = (uint8_t *)(input - border_cols); // Cast off 'const'
+ uint8_t *const in_tr = (uint8_t *)(input + width);
+ if (pad_left) {
+ tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+ for (int i = 0; i < height; i++) {
+ memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols);
+ memset(in_tl + i * in_stride, input[i * in_stride], border_cols);
+ }
+ }
+ if (pad_right) {
+ tmp_right =
+ (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+ for (int i = 0; i < height; i++) {
+ memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols);
+ memset(in_tr + i * in_stride, input[i * in_stride + width - 1],
+ border_cols);
+ }
+ }
+
+ av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
+ height2, &av1_resize_filter_normative[0][0], x0_qn,
+ x_step_qn);
+
+ // Restore the left/right border pixels
+ if (pad_left) {
+ for (int i = 0; i < height; i++) {
+ memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols);
+ }
+ aom_free(tmp_left);
+ }
+ if (pad_right) {
+ for (int i = 0; i < height; i++) {
+ memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols);
+ }
+ aom_free(tmp_right);
+ }
+}
+
+static void highbd_interpolate_core(const uint16_t *const input, int in_length,
+ uint16_t *output, int out_length, int bd,
+ const int16_t *interp_filters,
+ int interp_taps) {
+ const int32_t delta =
+ (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+ out_length;
+ const int32_t offset =
+ in_length > out_length
+ ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length
+ : -(((int32_t)(out_length - in_length)
+ << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length;
+ uint16_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int32_t y;
+
+ x = 0;
+ y = offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = out_length - 1;
+ y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+ in_length) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
+ ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k) {
+ const int pk = int_pel - interp_taps / 2 + 1 + k;
+ sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
+ }
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // End part.
+ for (; x < out_length; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] *
+ input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ }
+}
+
+static void highbd_interpolate(const uint16_t *const input, int in_length,
+ uint16_t *output, int out_length, int bd) {
+ const InterpKernel *interp_filters =
+ choose_interp_filter(in_length, out_length);
+
+ highbd_interpolate_core(input, in_length, output, out_length, bd,
+ &interp_filters[0][0], SUBPEL_TAPS);
+}
+
+#ifndef __clang_analyzer__
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+ uint16_t *output, int bd) {
+ // Actual filter len = 2 * filter_len_half.
+ static const int16_t *filter = av1_down2_symeven_half_filter;
+ const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2;
+ int i, j;
+ uint16_t *optr = output;
+ int l1 = filter_len_half;
+ int l2 = (length - filter_len_half);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum +=
+ (input[AOMMAX(0, i - j)] + input[AOMMIN(i + 1 + j, length - 1)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[AOMMAX(0, i - j)] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum +=
+ (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ }
+}
+
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+ uint16_t *output, int bd) {
+ // Actual filter len = 2 * filter_len_half - 1.
+ static const int16_t *filter = av1_down2_symodd_half_filter;
+ const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2;
+ int i, j;
+ uint16_t *optr = output;
+ int l1 = filter_len_half - 1;
+ int l2 = (length - filter_len_half + 1);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + j, length - 1)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[AOMMAX(i - j, 0)] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[AOMMIN(i + j, length - 1)]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ }
+}
+#endif
+
+static void highbd_resize_multistep(const uint16_t *const input, int length,
+ uint16_t *output, int olength,
+ uint16_t *otmp, int bd) {
+ if (length == olength) {
+ memcpy(output, input, sizeof(output[0]) * length);
+ return;
+ }
+ const int steps = get_down2_steps(length, olength);
+
+ if (steps > 0) {
+ uint16_t *out = NULL;
+ int filteredlength = length;
+
+ assert(otmp != NULL);
+ uint16_t *otmp2 = otmp + get_down2_length(length, 1);
+ for (int s = 0; s < steps; ++s) {
+ const int proj_filteredlength = get_down2_length(filteredlength, 1);
+ const uint16_t *const in = (s == 0 ? input : out);
+ if (s == steps - 1 && proj_filteredlength == olength)
+ out = output;
+ else
+ out = (s & 1 ? otmp2 : otmp);
+ if (filteredlength & 1)
+ highbd_down2_symodd(in, filteredlength, out, bd);
+ else
+ highbd_down2_symeven(in, filteredlength, out, bd);
+ filteredlength = proj_filteredlength;
+ }
+ if (filteredlength != olength) {
+ highbd_interpolate(out, filteredlength, output, olength, bd);
+ }
+ } else {
+ highbd_interpolate(input, length, output, olength, bd);
+ }
+}
+
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+ uint16_t *arr) {
+ int i;
+ uint16_t *iptr = img;
+ uint16_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *aptr++ = *iptr;
+ }
+}
+
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+ uint16_t *arr) {
+ int i;
+ uint16_t *iptr = img;
+ uint16_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *iptr = *aptr++;
+ }
+}
+
+static void highbd_resize_plane(const uint8_t *const input, int height,
+ int width, int in_stride, uint8_t *output,
+ int height2, int width2, int out_stride,
+ int bd) {
+ int i;
+ uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height);
+ uint16_t *tmpbuf =
+ (uint16_t *)aom_malloc(sizeof(uint16_t) * AOMMAX(width, height));
+ uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height);
+ uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2);
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ goto Error;
+ for (i = 0; i < height; ++i) {
+ highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
+ intbuf + width2 * i, width2, tmpbuf, bd);
+ }
+ for (i = 0; i < width2; ++i) {
+ highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+ highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd);
+ highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
+ arrbuf2);
+ }
+
+Error:
+ aom_free(intbuf);
+ aom_free(tmpbuf);
+ aom_free(arrbuf);
+ aom_free(arrbuf2);
+}
+
+static void highbd_upscale_normative_rect(const uint8_t *const input,
+ int height, int width, int in_stride,
+ uint8_t *output, int height2,
+ int width2, int out_stride,
+ int x_step_qn, int x0_qn,
+ int pad_left, int pad_right, int bd) {
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
+ assert(height2 == height);
+
+ // Extend the left/right pixels of the tile column if needed
+ // (either because we can't sample from other tiles, or because we're at
+ // a frame edge).
+ // Save the overwritten pixels into tmp_left and tmp_right.
+  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one more
+  // column of border pixels than we would otherwise expect.
+ const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
+ const int border_size = border_cols * sizeof(uint16_t);
+ uint16_t *tmp_left =
+ NULL; // Silence spurious "may be used uninitialized" warnings
+ uint16_t *tmp_right = NULL;
+ uint16_t *const input16 = CONVERT_TO_SHORTPTR(input);
+ uint16_t *const in_tl = input16 - border_cols;
+ uint16_t *const in_tr = input16 + width;
+ if (pad_left) {
+ tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+ for (int i = 0; i < height; i++) {
+ memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size);
+ aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols);
+ }
+ }
+ if (pad_right) {
+ tmp_right =
+ (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+ for (int i = 0; i < height; i++) {
+ memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size);
+ aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1],
+ border_cols);
+ }
+ }
+
+ av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride,
+ CONVERT_TO_SHORTPTR(output), out_stride, width2,
+ height2, &av1_resize_filter_normative[0][0],
+ x0_qn, x_step_qn, bd);
+
+ // Restore the left/right border pixels
+ if (pad_left) {
+ for (int i = 0; i < height; i++) {
+ memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size);
+ }
+ aom_free(tmp_left);
+ }
+ if (pad_right) {
+ for (int i = 0; i < height; i++) {
+ memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size);
+ }
+ aom_free(tmp_right);
+ }
+}
+
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, owidth / 2,
+ ouv_stride);
+ resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, owidth / 2,
+ ouv_stride);
+}
+
+void av1_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
+ ouv_stride);
+ resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
+ ouv_stride);
+}
+
+void av1_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ resize_plane(u, height, width, uv_stride, ou, oheight, owidth, ouv_stride);
+ resize_plane(v, height, width, uv_stride, ov, oheight, owidth, ouv_stride);
+}
+
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+ owidth / 2, ouv_stride, bd);
+ highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+ owidth / 2, ouv_stride, bd);
+}
+
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
+ ouv_stride, bd);
+ highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
+ ouv_stride, bd);
+}
+
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+ ouv_stride, bd);
+ highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+ ouv_stride, bd);
+}
+
+void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd,
+ const int num_planes) {
+ // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
+
+ // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+ // the static analysis warnings.
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
+ const int is_uv = i > 0;
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH)
+ highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv],
+ src->crop_widths[is_uv], src->strides[is_uv],
+ dst->buffers[i], dst->crop_heights[is_uv],
+ dst->crop_widths[is_uv], dst->strides[is_uv], bd);
+ else
+ resize_plane(src->buffers[i], src->crop_heights[is_uv],
+ src->crop_widths[is_uv], src->strides[is_uv],
+ dst->buffers[i], dst->crop_heights[is_uv],
+ dst->crop_widths[is_uv], dst->strides[is_uv]);
+ }
+ aom_extend_frame_borders(dst, num_planes);
+}
+
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int plane, int rows) {
+ const int is_uv = (plane > 0);
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
+ const int upscaled_plane_width =
+ ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+ const int superres_denom = cm->superres_scale_denominator;
+
+ TileInfo tile_col;
+ const int32_t x_step_qn = av1_get_upscale_convolve_step(
+ downscaled_plane_width, upscaled_plane_width);
+ int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width,
+ upscaled_plane_width, x_step_qn);
+
+ for (int j = 0; j < cm->tile_cols; j++) {
+ av1_tile_set_col(&tile_col, cm, j);
+ // Determine the limits of this tile column in both the source
+ // and destination images.
+ // Note: The actual location which we start sampling from is
+ // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases
+ // by exactly dst_width * (x_step_qn/2^14) pixels each iteration.
+ const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x);
+ const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x);
+ const int src_width = downscaled_x1 - downscaled_x0;
+
+ const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR;
+ int upscaled_x1;
+ if (j == cm->tile_cols - 1) {
+ // Note that we can't just use AOMMIN here - due to rounding,
+ // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than
+ // upscaled_plane_width.
+ upscaled_x1 = upscaled_plane_width;
+ } else {
+ upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR;
+ }
+
+ const uint8_t *const src_ptr = src + downscaled_x0;
+ uint8_t *const dst_ptr = dst + upscaled_x0;
+ const int dst_width = upscaled_x1 - upscaled_x0;
+
+ const int pad_left = (j == 0);
+ const int pad_right = (j == cm->tile_cols - 1);
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
+ dst_ptr, rows, dst_width, dst_stride,
+ x_step_qn, x0_qn, pad_left, pad_right,
+ cm->seq_params.bit_depth);
+ else
+ upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
+ rows, dst_width, dst_stride, x_step_qn, x0_qn,
+ pad_left, pad_right);
+
+ // Update the fractional pixel offset to prepare for the next tile column.
+ x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
+ }
+}
+
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ const int num_planes = av1_num_planes(cm);
+ for (int i = 0; i < num_planes; ++i) {
+ const int is_uv = (i > 0);
+ av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv],
+ dst->buffers[i], dst->strides[is_uv], i,
+ src->crop_heights[is_uv]);
+ }
+
+ aom_extend_frame_borders(dst, num_planes);
+}
+
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled) {
+ const int num_planes = av1_num_planes(cm);
+ if (cm->width != unscaled->y_crop_width ||
+ cm->height != unscaled->y_crop_height) {
+ av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
+ num_planes);
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
+// Calculates the scaled dimension given the original dimension and the scale
+// denominator.
+static void calculate_scaled_size_helper(int *dim, int denom) {
+ if (denom != SCALE_NUMERATOR) {
+    // Use this version if we need *dim to be even:
+    //   *dim = (*dim * SCALE_NUMERATOR + denom) / (2 * denom);
+    //   *dim <<= 1;
+ *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom);
+ }
+}
+
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom) {
+ calculate_scaled_size_helper(width, resize_denom);
+ calculate_scaled_size_helper(height, resize_denom);
+}
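+// Editor's note (illustrative, not part of upstream; assumes SCALE_NUMERATOR
+// is 8 as defined elsewhere in libaom): a resize denominator of 12 maps a
+// 1920x1080 frame to
+//   width : (1920 * 8 + 6) / 12 == 1280
+//   height: (1080 * 8 + 6) / 12 ==  720
+// i.e. a 2/3 linear scaling with round-to-nearest division.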
+
+void av1_calculate_scaled_superres_size(int *width, int *height,
+ int superres_denom) {
+ (void)height;
+ calculate_scaled_size_helper(width, superres_denom);
+}
+
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) {
+ if (denom != SCALE_NUMERATOR) {
+ // Note: av1_calculate_scaled_superres_size() rounds *up* after division
+ // when the resulting dimensions are odd. So here, we round *down*.
+ *width = *width * denom / SCALE_NUMERATOR;
+ (void)height;
+ }
+}
+
+// Copy only the config data from 'src' to 'dst'.
+static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
+ YV12_BUFFER_CONFIG *const dst) {
+ dst->bit_depth = src->bit_depth;
+ dst->color_primaries = src->color_primaries;
+ dst->transfer_characteristics = src->transfer_characteristics;
+ dst->matrix_coefficients = src->matrix_coefficients;
+ dst->monochrome = src->monochrome;
+ dst->chroma_sample_position = src->chroma_sample_position;
+ dst->color_range = src->color_range;
+}
+
+// TODO(afergs): Look for in-place upscaling
+// TODO(afergs): aom_ vs av1_ functions? Which can I use?
+// Upscale decoded image.
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
+ const int num_planes = av1_num_planes(cm);
+ if (!av1_superres_scaled(cm)) return;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+
+ YV12_BUFFER_CONFIG copy_buffer;
+ memset(&copy_buffer, 0, sizeof(copy_buffer));
+
+ YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm);
+
+ const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
+ if (aom_alloc_frame_buffer(
+ &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate copy buffer for superres upscaling");
+
+ // Copy function assumes the frames are the same size.
+ // Note that it does not copy YV12_BUFFER_CONFIG config data.
+ aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
+
+ assert(copy_buffer.y_crop_width == aligned_width);
+ assert(copy_buffer.y_crop_height == cm->height);
+
+ // Realloc the current frame buffer at a higher resolution in place.
+ if (pool != NULL) {
+ // Use callbacks if on the decoder.
+ aom_codec_frame_buffer_t *fb =
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
+ aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
+ void *cb_priv = pool->cb_priv;
+
+ // Realloc with callback does not release the frame buffer - release first.
+ if (release_fb_cb(cb_priv, fb))
+ aom_internal_error(
+ &cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to free current frame buffer before superres upscaling");
+
+ // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
+ if (aom_realloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv))
+ aom_internal_error(
+ &cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate current frame buffer for superres upscaling");
+ } else {
+ // Make a copy of the config data for frame_to_show in copy_buffer
+ copy_buffer_config(frame_to_show, &copy_buffer);
+
+ // Don't use callbacks on the encoder.
+ // aom_alloc_frame_buffer() clears the config data for frame_to_show
+ if (aom_alloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+ aom_internal_error(
+ &cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate current frame buffer for superres upscaling");
+
+ // Restore config data back to frame_to_show
+ copy_buffer_config(&copy_buffer, frame_to_show);
+ }
+ // TODO(afergs): verify frame_to_show is correct after realloc
+ // encoder:
+ // decoder:
+
+ assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
+ assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
+
+ // Scale up and back into frame_to_show.
+ assert(frame_to_show->y_crop_width != cm->width);
+ av1_upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
+
+ // Free the copy buffer
+ aom_free_frame_buffer(&copy_buffer);
+}
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
new file mode 100644
index 000000000..9a59a8d63
--- /dev/null
+++ b/third_party/aom/av1/common/resize.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
+
+#include <stdio.h>
+#include "aom/aom_integer.h"
+#include "av1/common/onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2, int width2,
+ int out_stride);
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+void av1_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+void av1_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride, int bd);
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd,
+ const int num_planes);
+
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int plane, int rows);
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled);
+
+// Calculates the scaled dimensions from the given original dimensions and the
+// resize scale denominator.
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom);
+
+// Similar to above, but calculates scaled dimensions after superres from the
+// given original dimensions and superres scale denominator.
+void av1_calculate_scaled_superres_size(int *width, int *height,
+ int superres_denom);
+
+// Inverse of av1_calculate_scaled_superres_size() above: calculates the
+// original dimensions from the given scaled dimensions and the scale
+// denominator.
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
+
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool);
+
+// Returns 1 if superres is in effect for this frame, i.e. the coded frame
+// width differs from the superres upscaled width, and 0 otherwise.
+static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
+ // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
+ // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
+ // So, the following check is more accurate.
+ return !(cm->width == cm->superres_upscaled_width);
+}
+
+#define UPSCALE_NORMATIVE_TAPS 8
+extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS]
+ [UPSCALE_NORMATIVE_TAPS];
+
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
new file mode 100644
index 000000000..d276a915b
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.c
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+#include "aom_ports/mem.h"
+
+// The 's' values are calculated based on original 'r' and 'e' values in the
+// spec using GenSgrprojVtable().
+// Note: Setting r = 0 skips the filter; the corresponding s value is set to
+// -1 (invalid).
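+// Each entry below is laid out as { { r[0], r[1] }, { s[0], s[1] } }, matching
+// the params->r[] / params->s[] fields read later in this file.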
+const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
+ { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
+ { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
+ { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
+ { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } },
+ { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } },
+ { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } },
+ { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } },
+ { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
+};
+
+AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
+ AV1PixelRect rect;
+
+ int ss_x = is_uv && cm->seq_params.subsampling_x;
+ int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+ rect.top = 0;
+ rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
+ rect.left = 0;
+ rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+ return rect;
+}
+
+// Count the horizontal or vertical units per tile (pass a width or height
+// for tile_size, respectively). We want to divide the tile size by the size
+// of a restoration unit. Rather than rounding up unconditionally as you
+// might expect, we round to nearest, which models the way a right or bottom
+// restoration unit can extend to up to 150% of its normal width or height.
+// The max with 1 is to deal with tiles that are smaller than half of a
+// restoration unit.
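+//
+// For example (arbitrary sizes): with unit_size = 256, a 400-pixel tile gives
+// (400 + 128) / 256 = 2 units, a 300-pixel tile gives (300 + 128) / 256 = 1
+// unit (which then spans all 300 pixels, roughly 117% of nominal), and a
+// 100-pixel tile gives 0, which the AOMMAX() below clamps to 1.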
+int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
+ return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
+}
+
+void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
+ int is_uv) {
+ // We need to allocate enough space for the restoration units to cover the
+ // largest tile. The loop restoration code treats the (possibly superres
+ // upscaled) frame as a single tile, so the whole-frame rectangle gives us
+ // that size directly.
+ const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+ const int max_tile_w = tile_rect.right - tile_rect.left;
+ const int max_tile_h = tile_rect.bottom - tile_rect.top;
+
+ // hpertile and vpertile are the number of horizontal and vertical units per
+ // tile, computed with the rounding described above
+ // av1_lr_count_units_in_tile().
+ const int unit_size = rsi->restoration_unit_size;
+ const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
+ const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);
+
+ rsi->units_per_tile = hpertile * vpertile;
+ rsi->horz_units_per_tile = hpertile;
+ rsi->vert_units_per_tile = vpertile;
+
+ const int ntiles = 1;
+ const int nunits = ntiles * rsi->units_per_tile;
+
+ aom_free(rsi->unit_info);
+ CHECK_MEM_ERROR(cm, rsi->unit_info,
+ (RestorationUnitInfo *)aom_memalign(
+ 16, sizeof(*rsi->unit_info) * nunits));
+}
+
+void av1_free_restoration_struct(RestorationInfo *rst_info) {
+ aom_free(rst_info->unit_info);
+ rst_info->unit_info = NULL;
+}
+
+#if 0
+// Pair of values for each sgrproj parameter:
+// Index 0 corresponds to r[0], e[0]
+// Index 1 corresponds to r[1], e[1]
+int sgrproj_mtable[SGRPROJ_PARAMS][2];
+
+static void GenSgrprojVtable() {
+ for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
+ const sgr_params_type *const params = &sgr_params[i];
+ for (int j = 0; j < 2; ++j) {
+ const int e = params->e[j];
+ const int r = params->r[j];
+ if (r == 0) { // filter is disabled
+ sgrproj_mtable[i][j] = -1; // mark invalid
+ } else { // filter is enabled
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const int n2e = n * n * e;
+ assert(n2e != 0);
+ sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
+ }
+ }
+ }
+}
+#endif
+
+void av1_loop_restoration_precal() {
+#if 0
+ GenSgrprojVtable();
+#endif
+}
+
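+// Pad a plane by border_horz pixels on the left/right and border_vert rows on
+// the top/bottom, replicating the outermost pixels (8-bit and 16-bit variants
+// below, dispatched by extend_frame()).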
+static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert) {
+ uint8_t *data_p;
+ int i;
+ for (i = 0; i < height; ++i) {
+ data_p = data + i * stride;
+ memset(data_p - border_horz, data_p[0], border_horz);
+ memset(data_p + width, data_p[width - 1], border_horz);
+ }
+ data_p = data - border_horz;
+ for (i = -border_vert; i < 0; ++i) {
+ memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
+ }
+ for (i = height; i < height + border_vert; ++i) {
+ memcpy(data_p + i * stride, data_p + (height - 1) * stride,
+ width + 2 * border_horz);
+ }
+}
+
+static void extend_frame_highbd(uint16_t *data, int width, int height,
+ int stride, int border_horz, int border_vert) {
+ uint16_t *data_p;
+ int i, j;
+ for (i = 0; i < height; ++i) {
+ data_p = data + i * stride;
+ for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
+ for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
+ }
+ data_p = data - border_horz;
+ for (i = -border_vert; i < 0; ++i) {
+ memcpy(data_p + i * stride, data_p,
+ (width + 2 * border_horz) * sizeof(uint16_t));
+ }
+ for (i = height; i < height + border_vert; ++i) {
+ memcpy(data_p + i * stride, data_p + (height - 1) * stride,
+ (width + 2 * border_horz) * sizeof(uint16_t));
+ }
+}
+
+void extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert, int highbd) {
+ if (highbd)
+ extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
+ border_horz, border_vert);
+ else
+ extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
+}
+
+static void copy_tile_lowbd(int width, int height, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride) {
+ for (int i = 0; i < height; ++i)
+ memcpy(dst + i * dst_stride, src + i * src_stride, width);
+}
+
+static void copy_tile_highbd(int width, int height, const uint16_t *src,
+ int src_stride, uint16_t *dst, int dst_stride) {
+ for (int i = 0; i < height; ++i)
+ memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
+}
+
+static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int highbd) {
+ if (highbd)
+ copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
+ CONVERT_TO_SHORTPTR(dst), dst_stride);
+ else
+ copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
+}
+
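+// REAL_PTR() recovers the underlying byte address of a pixel buffer: for high
+// bit depth data the uint8_t* handle actually refers to uint16_t samples, so
+// CONVERT_TO_SHORTPTR() is applied before the byte-wise memcpy()/memset()
+// calls below.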
+#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
+
+// With striped loop restoration, the filtering for each 64-pixel stripe gets
+// most of its input from the output of CDEF (stored in data8), but we need to
+// fill out a border of 3 pixels above/below the stripe according to the
+// following rules:
+//
+// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
+// This extension is done by a call to extend_frame() at the start of the loop
+// restoration process, so the value of copy_above/copy_below doesn't strictly
+// matter.
+// However, by setting *copy_above = *copy_below = 1 whenever loop filtering
+// across tiles is disabled, we can allow
+// {setup,restore}_processing_stripe_boundary to assume that the top/bottom
+// data has always been copied, simplifying the behaviour at the left and
+// right edges of tiles.
+//
+// * If we're at a tile boundary and loop filtering across tiles is enabled,
+// then there is a logical stripe which is 64 pixels high, but which is split
+// into an 8px high and a 56px high stripe so that the processing (and
+// coefficient set usage) can be aligned to tiles.
+// In this case, we use the 3 rows of CDEF output across the boundary for
+// context; this corresponds to leaving the frame buffer as-is.
+//
+// * If we're at a tile boundary and loop filtering across tiles is disabled,
+// then we take the outermost row of CDEF pixels *within the current tile*
+// and copy it three times. Thus we behave exactly as if the tile were a full
+// frame.
+//
+// * Otherwise, we're at a stripe boundary within a tile. In that case, we
+// take 2 rows of deblocked pixels and extend them to 3 rows of context.
+//
+// The distinction between the latter two cases is handled by the
+// av1_loop_restoration_save_boundary_lines() function, so here we just need
+// to decide if we're overwriting the above/below boundary pixels or not.
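+//
+// For example, with the 64-pixel luma stripes described above and an 8-pixel
+// RESTORATION_UNIT_OFFSET, the first stripe of a tile is 56 rows high and gets
+// *copy_above = 0, interior stripes are 64 rows high with
+// *copy_above = *copy_below = 1, and the last stripe gets *copy_below = 0.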
+static void get_stripe_boundary_info(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect, int ss_y,
+ int *copy_above, int *copy_below) {
+ *copy_above = 1;
+ *copy_below = 1;
+
+ const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+ const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+ const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
+ const int this_stripe_height =
+ full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
+ const int last_stripe_in_tile =
+ (limits->v_start + this_stripe_height >= tile_rect->bottom);
+
+ if (first_stripe_in_tile) *copy_above = 0;
+ if (last_stripe_in_tile) *copy_below = 0;
+}
+
+// Overwrite the border pixels around a processing stripe so that the
+// conditions described in the comment above get_stripe_boundary_info() are
+// satisfied.
+// We save the pixels which get overwritten into a temporary buffer, so that
+// they can be restored by restore_processing_stripe_boundary() after we've
+// processed the stripe.
+//
+// limits gives the rectangular limits of the remaining stripes for the current
+// restoration unit. rsb is the stored stripe boundaries (taken from either
+// deblock or CDEF output as necessary).
+//
+// tile_rect is the limits of the current tile and tile_stripe0 is the index of
+// the first stripe in this tile (needed to convert the tile-relative stripe
+// index we get from limits into something we can look up in rsb).
+static void setup_processing_stripe_boundary(
+ const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
+ int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
+ RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
+ // Offsets within the line buffers. The buffer logically starts at column
+ // -RESTORATION_EXTRA_HORZ, so the pixel at frame column
+ // (x0 - RESTORATION_EXTRA_HORZ) is found at offset x0 within the buffer.
+ const int buf_stride = rsb->stripe_boundary_stride;
+ const int buf_x0_off = limits->h_start;
+ const int line_width =
+ (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+ const int line_size = line_width << use_highbd;
+
+ const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+ // Replace RESTORATION_BORDER pixels above the top of the stripe
+ // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
+ // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
+ // duplicating the topmost of the 2 lines (see the AOMMAX call when
+ // calculating buf_row, which gets the values 0, 0, 1 for i = -3, -2, -1,
+ // relative to rsb_row).
+ //
+ // Special case: If we're at the top of a tile, which isn't on the topmost
+ // tile row, and we're allowed to loop filter across tiles, then we have a
+ // logical 64-pixel-high stripe which has been split into an 8-pixel high
+ // stripe and a 56-pixel high stripe (the current one). So, in this case,
+ // we want to leave the boundary alone!
+ if (!opt) {
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+ const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
+ const int buf_off = buf_x0_off + buf_row * buf_stride;
+ const uint8_t *buf =
+ rsb->stripe_boundary_above + (buf_off << use_highbd);
+ uint8_t *dst8 = data8_tl + i * data_stride;
+ // Save old pixels, then replace with data from stripe_boundary_above
+ memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
+ REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
+ }
+ }
+
+ // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
+ // The second buffer row is repeated, so buf_row (relative to rsb_row) gets
+ // the values 0, 1, 1 for i = 0, 1, 2.
+ if (copy_below) {
+ const int stripe_end = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+ for (int i = 0; i < RESTORATION_BORDER; ++i) {
+ const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
+ const int buf_off = buf_x0_off + buf_row * buf_stride;
+ const uint8_t *src =
+ rsb->stripe_boundary_below + (buf_off << use_highbd);
+
+ uint8_t *dst8 = data8_bl + i * data_stride;
+ // Save old pixels, then replace with data from stripe_boundary_below
+ memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
+ }
+ }
+ } else {
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ // Only save and overwrite i=-RESTORATION_BORDER line.
+ uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+ // Save old pixels, then replace with data from stripe_boundary_above
+ memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8),
+ REAL_PTR(use_highbd,
+ data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
+ line_size);
+ }
+
+ if (copy_below) {
+ const int stripe_end = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+ // Only save and overwrite i=2 line.
+ uint8_t *dst8 = data8_bl + 2 * data_stride;
+ // Save old pixels, then replace with data from stripe_boundary_below
+ memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8),
+ REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
+ }
+ }
+}
+
+// This function restores the boundary lines modified by
+// setup_processing_stripe_boundary.
+//
+// Note: We need to be careful when handling the corners of the processing
+// unit, because (e.g.) the top-left corner is considered to be part of
+// both the left and top borders. This means that, depending on the
+// loop_filter_across_tiles_enabled flag, the corner pixels might get
+// overwritten twice, once as part of the "top" border and once as part
+// of the "left" border (or similar for other corners).
+//
+// Everything works out fine as long as we make sure to reverse the order
+// when restoring, i.e. we need to restore the left/right borders followed
+// by the top/bottom borders.
+static void restore_processing_stripe_boundary(
+ const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
+ int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
+ int copy_below, int opt) {
+ const int line_width =
+ (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+ const int line_size = line_width << use_highbd;
+
+ const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+ if (!opt) {
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+ for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+ uint8_t *dst8 = data8_tl + i * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8),
+ rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
+ }
+ }
+
+ if (copy_below) {
+ const int stripe_bottom = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+ for (int i = 0; i < RESTORATION_BORDER; ++i) {
+ if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
+
+ uint8_t *dst8 = data8_bl + i * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+ }
+ }
+ } else {
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ // Only restore i=-RESTORATION_BORDER line.
+ uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
+ }
+
+ if (copy_below) {
+ const int stripe_bottom = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+ // Only restore i=2 line.
+ if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
+ uint8_t *dst8 = data8_bl + 2 * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
+ }
+ }
+ }
+}
+
+static void wiener_filter_stripe(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf, int bit_depth) {
+ (void)tmpbuf;
+ (void)bit_depth;
+ assert(bit_depth == 8);
+ const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
+ const uint8_t *src_p = src + j;
+ uint8_t *dst_p = dst + j;
+ av1_wiener_convolve_add_src(
+ src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
+ rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
+ }
+}
+
+/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
+ over the input. The window is of size (2r + 1)x(2r + 1), and we
+ specialize to r = 1 and r = 2; boxsum() asserts for any other radius.
+
+ Each loop follows the same format: We keep a window's worth of input
+ in individual variables and select data out of that as appropriate.
+*/
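+//
+// For an interior pixel (i, j) and r = 1, the two passes below compute
+//   dst[i][j] = sum over di, dj in {-1, 0, 1} of f(src[i + di][j + dj])
+// where f(x) = x when sqr = 0 and f(x) = x * x when sqr = 1; edge rows and
+// columns use the truncated sums written out explicitly in boxsum1()/boxsum2().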
+static void boxsum1(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c;
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ // Vertical sum over 3-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[src_stride + j];
+ c = src[2 * src_stride + j];
+
+ dst[j] = a + b;
+ for (i = 1; i < height - 2; ++i) {
+ // Loop invariant: At the start of each iteration,
+ // a = src[(i - 1) * src_stride + j]
+ // b = src[(i ) * src_stride + j]
+ // c = src[(i + 1) * src_stride + j]
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = src[(i + 2) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[(i + 1) * dst_stride + j] = b + c;
+ }
+ } else {
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[src_stride + j] * src[src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+
+ dst[j] = a + b;
+ for (i = 1; i < height - 2; ++i) {
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[(i + 1) * dst_stride + j] = b + c;
+ }
+ }
+
+ // Horizontal sum over 3-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+
+ dst[i * dst_stride] = a + b;
+ for (j = 1; j < width - 2; ++j) {
+ // Loop invariant: At the start of each iteration,
+ // a = dst[i * dst_stride + (j - 1)]
+ // b = dst[i * dst_stride + (j )]
+ // c = dst[i * dst_stride + (j + 1)]
+ // (dst holds the vertical sums computed in the pass above)
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = dst[i * dst_stride + (j + 2)];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[i * dst_stride + (j + 1)] = b + c;
+ }
+}
+
+static void boxsum2(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c, d, e;
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ // Vertical sum over 5-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[src_stride + j];
+ c = src[2 * src_stride + j];
+ d = src[3 * src_stride + j];
+ e = src[4 * src_stride + j];
+
+ dst[j] = a + b + c;
+ dst[dst_stride + j] = a + b + c + d;
+ for (i = 2; i < height - 3; ++i) {
+ // Loop invariant: At the start of each iteration,
+ // a = src[(i - 2) * src_stride + j]
+ // b = src[(i - 1) * src_stride + j]
+ // c = src[(i ) * src_stride + j]
+ // d = src[(i + 1) * src_stride + j]
+ // e = src[(i + 2) * src_stride + j]
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = src[(i + 3) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e;
+ dst[(i + 2) * dst_stride + j] = c + d + e;
+ }
+ } else {
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[src_stride + j] * src[src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+ d = src[3 * src_stride + j] * src[3 * src_stride + j];
+ e = src[4 * src_stride + j] * src[4 * src_stride + j];
+
+ dst[j] = a + b + c;
+ dst[dst_stride + j] = a + b + c + d;
+ for (i = 2; i < height - 3; ++i) {
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e;
+ dst[(i + 2) * dst_stride + j] = c + d + e;
+ }
+ }
+
+ // Horizontal sum over 5-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+ d = dst[i * dst_stride + 3];
+ e = dst[i * dst_stride + 4];
+
+ dst[i * dst_stride] = a + b + c;
+ dst[i * dst_stride + 1] = a + b + c + d;
+ for (j = 2; j < width - 3; ++j) {
+ // Loop invariant: At the start of each iteration,
+ // a = dst[i * dst_stride + (j - 2)]
+ // b = dst[i * dst_stride + (j - 1)]
+ // c = dst[i * dst_stride + (j )]
+ // d = dst[i * dst_stride + (j + 1)]
+ // e = dst[i * dst_stride + (j + 2)]
+ // (dst holds the vertical sums computed in the pass above)
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = dst[i * dst_stride + (j + 3)];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[i * dst_stride + (j + 1)] = b + c + d + e;
+ dst[i * dst_stride + (j + 2)] = c + d + e;
+ }
+}
+
+static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
+ int sqr, int32_t *dst, int dst_stride) {
+ if (r == 1)
+ boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
+ else if (r == 2)
+ boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
+ else
+ assert(0 && "Invalid value of r in self-guided filter");
+}
+
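+// Decode the signalled xqd[] pair into the two self-guided projection weights
+// xq[], expressed in Q(SGRPROJ_PRJ_BITS). A filter that is disabled (its
+// radius is 0) gets weight 0; the weights are then consumed by
+// apply_selfguided_restoration_c() below.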
+void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
+ if (params->r[0] == 0) {
+ xq[0] = 0;
+ xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
+ } else if (params->r[1] == 0) {
+ xq[0] = xqd[0];
+ xq[1] = 0;
+ } else {
+ xq[0] = xqd[0];
+ xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
+ }
+}
+
+const int32_t x_by_xplus1[256] = {
+ // Special case: Map 0 -> 1 (corresponding to a value of 1/256) instead of 0.
+ // See the comments on A[k] in calculate_intermediate_result() for why.
+ 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
+ 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
+ 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
+ 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
+ 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 256,
+};
+
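+// one_by_x[n - 1] = round(2^12 / n), used below to divide the windowed sums
+// by the window size n = (2r + 1)^2.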
+const int32_t one_by_x[MAX_NELEM] = {
+ 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
+ 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
+};
+
+static void calculate_intermediate_result(int32_t *dgd, int width, int height,
+ int dgd_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx,
+ int pass, int32_t *A, int32_t *B) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ const int step = pass == 0 ? 1 : 2;
+ int i, j;
+
+ assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+ assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+ "Need SGRPROJ_BORDER_* >= r+1");
+
+ boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+ width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
+ boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+ width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
+ // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
+ for (i = -1; i < height + 1; i += step) {
+ for (j = -1; j < width + 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int n = (2 * r + 1) * (2 * r + 1);
+
+ // a < 2^16 * n < 2^22 regardless of bit depth
+ uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
+ // b < 2^8 * n < 2^14 regardless of bit depth
+ uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
+
+ // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // This bound on p is due to:
+ // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+ //
+ // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+ // This is an artefact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+
+ const uint32_t s = params->s[radius_idx];
+
+ // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
+ // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+ // Note: We have to be quite careful about the value of A[k].
+ // This is used as a blend factor between individual pixel values and the
+ // local mean. So it logically has a range of [0, 256], including both
+ // endpoints.
+ //
+ // This is a pain for hardware, as we'd like something which can be stored
+ // in exactly 8 bits.
+ // Further, in the calculation of B[k] below, if z == 0 and r == 2,
+ // then A[k] "should be" 0. But then we can end up setting B[k] to a value
+ // slightly above 2^(8 + bit depth), due to rounding in the value of
+ // one_by_x[25-1].
+ //
+ // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
+ // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
+ // overflow), without significantly affecting the final result: z == 0
+ // implies that the image is essentially "flat", so the local mean and
+ // individual pixel values are very similar.
+ //
+ // Note that saturating on the other side, i.e. requiring A[k] <= 255,
+ // would be a bad idea, as that corresponds to the case where the image
+ // is very variable, when we want to preserve the local pixel value as
+ // much as possible.
+ A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
+
+ // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
+ // one_by_x[n - 1] = round(2^12 / n)
+ // => the product here is < 2^(20 + bit_depth) <= 2^32,
+ // and B[k] is set to a value < 2^(8 + bit depth)
+ // This holds even with the rounding in one_by_x and in the overall
+ // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
+ B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+ (uint32_t)B[k] *
+ (uint32_t)one_by_x[n - 1],
+ SGRPROJ_RECIP_BITS);
+ }
+ }
+}
+
+static void selfguided_restoration_fast_internal(
+ int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int i, j;
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 1, A, B);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+ // Use the A[] and B[] arrays to calculate the filtered image
+ (void)r;
+ assert(r == 2);
+ for (i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+ const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 5;
+ const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 5;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ } else { // odd row
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 4;
+ const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
+ const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+ }
+}
+
+static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
+ int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth,
+ int sgr_params_idx,
+ int radius_idx) {
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int i, j;
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 0, A, B);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+ // Use the A[] and B[] arrays to calculate the filtered image
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+ const int32_t a =
+ (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
+ 4 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 3;
+ const int32_t b =
+ (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
+ 4 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 3;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+}
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
+ int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ int32_t *dgd32 =
+ dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+
+ if (highbd) {
+ const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
+ for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+ for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+ dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
+ }
+ }
+ } else {
+ for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+ for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+ dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
+ }
+ }
+ }
+
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ if (params->r[0] > 0)
+ selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
+ flt0, flt_stride, bit_depth,
+ sgr_params_idx, 0);
+ if (params->r[1] > 0)
+ selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
+ flt_stride, bit_depth, sgr_params_idx, 1);
+ return 0;
+}
+
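+// Combine the (up to) two self-guided filter outputs flt0/flt1 with the
+// source pixels. Writing u = pixel << SGRPROJ_RST_BITS, the output is
+//   clip(ROUND((u << SGRPROJ_PRJ_BITS)
+//              + xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u),
+//              SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS))
+// with the projection weights xq[] decoded by decode_xq() above.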
+void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
+ int stride, int eps, const int *xqd,
+ uint8_t *dst8, int dst_stride,
+ int32_t *tmpbuf, int bit_depth,
+ int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+
+ const int ret = av1_selfguided_restoration_c(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+ decode_xq(xqd, xq, params);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int k = i * width + j;
+ uint8_t *dst8ij = dst8 + i * dst_stride + j;
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+
+ const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
+ const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ // If params->r == 0 then we skipped the filtering in
+ // av1_selfguided_restoration_c, i.e. flt[k] == u
+ if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
+ if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ const uint16_t out = clip_pixel_highbd(w, bit_depth);
+ if (highbd)
+ *CONVERT_TO_SHORTPTR(dst8ij) = out;
+ else
+ *dst8ij = (uint8_t)out;
+ }
+ }
+}
+
+static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf, int bit_depth) {
+ (void)bit_depth;
+ assert(bit_depth == 8);
+
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, stripe_width - j);
+ apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
+ rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
+ dst + j, dst_stride, tmpbuf, bit_depth, 0);
+ }
+}
+
+static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src8,
+ int src_stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth) {
+ (void)tmpbuf;
+ const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
+
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
+ const uint8_t *src8_p = src8 + j;
+ uint8_t *dst8_p = dst8 + j;
+ av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
+ rui->wiener_info.hfilter, 16,
+ rui->wiener_info.vfilter, 16, w,
+ stripe_height, &conv_params, bit_depth);
+ }
+}
+
+static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width,
+ const uint8_t *src8, int src_stride,
+ uint8_t *dst8, int dst_stride,
+ int32_t *tmpbuf, int bit_depth) {
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, stripe_width - j);
+ apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
+ rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
+ dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+ }
+}
+
+typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf, int bit_depth);
+
+#define NUM_STRIPE_FILTERS 4
+
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+ wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
+ sgrproj_filter_stripe_highbd
+};
+
+// Filter one restoration unit
+void av1_loop_restoration_filter_unit(
+ const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+ const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+ const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+ int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf, int optimized_lr) {
+ RestorationType unit_rtype = rui->restoration_type;
+
+ int unit_h = limits->v_end - limits->v_start;
+ int unit_w = limits->h_end - limits->h_start;
+ uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
+ uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
+
+ if (unit_rtype == RESTORE_NONE) {
+ copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
+ return;
+ }
+
+ const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
+ assert(filter_idx < NUM_STRIPE_FILTERS);
+ const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
+
+ const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+
+ // Filter the restoration unit one processing stripe at a time
+ RestorationTileLimits remaining_stripes = *limits;
+ int i = 0;
+ while (i < unit_h) {
+ int copy_above, copy_below;
+ remaining_stripes.v_start = limits->v_start + i;
+
+ get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
+ &copy_below);
+
+ const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+ const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+ // Work out where this stripe's boundaries are within
+ // rsb->stripe_boundary_{above,below}
+ const int tile_stripe =
+ (remaining_stripes.v_start - tile_rect->top + runit_offset) /
+ full_stripe_height;
+ const int frame_stripe = tile_stripe0 + tile_stripe;
+ const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
+
+ // Calculate this stripe's height, based on two rules:
+ // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
+ // * We can't extend past the end of the current restoration unit
+ const int nominal_stripe_height =
+ full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
+ const int h = AOMMIN(nominal_stripe_height,
+ remaining_stripes.v_end - remaining_stripes.v_start);
+
+ setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
+ h, data8, stride, rlbs, copy_above,
+ copy_below, optimized_lr);
+
+ stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
+ dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
+
+ restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
+ data8, stride, copy_above, copy_below,
+ optimized_lr);
+
+ i += h;
+ }
+}
+
+static void filter_frame_on_tile(int tile_row, int tile_col, void *priv,
+ AV1_COMMON *cm) {
+ (void)tile_col;
+ FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
+ ctxt->tile_stripe0 = (tile_row == 0) ? 0 : cm->rst_end_stripe[tile_row - 1];
+}
+
+static void filter_frame_on_unit(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect,
+ int rest_unit_idx, void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
+ const RestorationInfo *rsi = ctxt->rsi;
+
+ av1_loop_restoration_filter_unit(
+ limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
+ ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
+ ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
+ rsi->optimized_lr);
+}
+
+void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
+ YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int optimized_lr,
+ int num_planes) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int bit_depth = seq_params->bit_depth;
+ const int highbd = seq_params->use_highbitdepth;
+ lr_ctxt->dst = &cm->rst_frame;
+
+ const int frame_width = frame->crop_widths[0];
+ const int frame_height = frame->crop_heights[0];
+ if (aom_realloc_frame_buffer(
+ lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL) < 0)
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate restoration dst buffer");
+
+ lr_ctxt->on_rest_unit = filter_frame_on_unit;
+ lr_ctxt->frame = frame;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ RestorationInfo *rsi = &cm->rst_info[plane];
+ RestorationType rtype = rsi->frame_restoration_type;
+ rsi->optimized_lr = optimized_lr;
+
+ if (rtype == RESTORE_NONE) {
+ continue;
+ }
+
+ const int is_uv = plane > 0;
+ const int plane_width = frame->crop_widths[is_uv];
+ const int plane_height = frame->crop_heights[is_uv];
+ FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
+
+ extend_frame(frame->buffers[plane], plane_width, plane_height,
+ frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER,
+ highbd);
+
+ lr_plane_ctxt->rsi = rsi;
+ lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
+ lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
+ lr_plane_ctxt->highbd = highbd;
+ lr_plane_ctxt->bit_depth = bit_depth;
+ lr_plane_ctxt->data8 = frame->buffers[plane];
+ lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
+ lr_plane_ctxt->data_stride = frame->strides[is_uv];
+ lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
+ lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
+ filter_frame_on_tile(LR_TILE_ROW, LR_TILE_COL, lr_plane_ctxt, cm);
+ }
+}
+
+void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
+ AV1_COMMON *cm, int num_planes) {
+ typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+ int vstart, int vend);
+ static const copy_fun copy_funs[3] = {
+ aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
+ };
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+ AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
+ copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
+ tile_rect.right, tile_rect.top, tile_rect.bottom);
+ }
+}
+
+static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
+ int num_planes) {
+ FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
+ continue;
+ }
+
+ av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
+ &ctxt[plane], &ctxt[plane].tile_rect,
+ cm->rst_tmpbuf, cm->rlbs);
+ }
+}
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int optimized_lr,
+ void *lr_ctxt) {
+ assert(!cm->all_lossless);
+ const int num_planes = av1_num_planes(cm);
+
+ AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
+
+ av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+ optimized_lr, num_planes);
+
+ foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
+
+ av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
+}
+
+void av1_foreach_rest_unit_in_row(
+ RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+ rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+ int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+ void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+ struct AV1LrSyncData *const lr_sync) {
+ const int tile_w = tile_rect->right - tile_rect->left;
+ const int ext_size = unit_size * 3 / 2;
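+ // If the width left in the tile is under 1.5 restoration units, the last
+ // unit absorbs all of it (so a unit can grow to at most 150% of unit_size),
+ // matching the rounding in av1_lr_count_units_in_tile().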
+ int x0 = 0, j = 0;
+ while (x0 < tile_w) {
+ int remaining_w = tile_w - x0;
+ int w = (remaining_w < ext_size) ? remaining_w : unit_size;
+
+ limits->h_start = tile_rect->left + x0;
+ limits->h_end = tile_rect->left + x0 + w;
+ assert(limits->h_end <= tile_rect->right);
+
+ const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
+
+ // No sync for even numbered rows
+ // For odd numbered rows, Loop Restoration of current block requires the LR
+ // of top-right and bottom-right blocks to be completed
+
+ // top-right sync
+ on_sync_read(lr_sync, row_number, j, plane);
+ if ((row_number + 1) < vunits_per_tile)
+ // bottom-right sync
+ on_sync_read(lr_sync, row_number + 2, j, plane);
+
+ on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
+
+ on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
+
+ x0 += w;
+ ++j;
+ }
+}
+
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)plane;
+}
+
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane) {
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ (void)plane;
+}
+
+static void foreach_rest_unit_in_tile(
+ const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
+ int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
+ int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
+ const int tile_h = tile_rect->bottom - tile_rect->top;
+ const int ext_size = unit_size * 3 / 2;
+
+ const int tile_idx = tile_col + tile_row * tile_cols;
+ const int unit_idx0 = tile_idx * units_per_tile;
+
+ int y0 = 0, i = 0;
+ while (y0 < tile_h) {
+ int remaining_h = tile_h - y0;
+ int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+
+ RestorationTileLimits limits;
+ limits.v_start = tile_rect->top + y0;
+ limits.v_end = tile_rect->top + y0 + h;
+ assert(limits.v_end <= tile_rect->bottom);
+ // Offset the tile upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
+ if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
+
+ av1_foreach_rest_unit_in_row(
+ &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
+ hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
+ av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
+
+ y0 += h;
+ ++i;
+ }
+}
+
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+ rest_unit_visitor_t on_rest_unit,
+ void *priv, AV1PixelRect *tile_rect,
+ int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+
+ foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
+ rsi->horz_units_per_tile, rsi->vert_units_per_tile,
+ rsi->units_per_tile, rsi->restoration_unit_size,
+ ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
+}
+
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rcol0, int *rcol1, int *rrow0,
+ int *rrow1) {
+ assert(rcol0 && rcol1 && rrow0 && rrow1);
+
+ if (bsize != cm->seq_params.sb_size) return 0;
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
+
+ assert(!cm->all_lossless);
+
+ const int is_uv = plane > 0;
+
+ const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+ const int tile_w = tile_rect.right - tile_rect.left;
+ const int tile_h = tile_rect.bottom - tile_rect.top;
+
+ const int mi_top = 0;
+ const int mi_left = 0;
+
+ // Compute the mi-unit corners of the superblock relative to the top-left of
+ // the tile
+ const int mi_rel_row0 = mi_row - mi_top;
+ const int mi_rel_col0 = mi_col - mi_left;
+ const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
+ const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
+
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ const int size = rsi->restoration_unit_size;
+
+ // Calculate the number of restoration units in this tile (which might be
+ // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
+ const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
+ const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
+
+ // The size of an MI-unit on this plane of the image
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+ const int mi_size_x = MI_SIZE >> ss_x;
+ const int mi_size_y = MI_SIZE >> ss_y;
+
+ // Write m for the relative mi column or row, D for the superres denominator
+ // and N for the superres numerator. If u is the upscaled pixel offset then
+ // we can write the downscaled pixel offset in two ways as:
+ //
+ // MI_SIZE * m = N / D u
+ //
+ // from which we get u = D * MI_SIZE * m / N
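+ //
+ // For example, if D is twice N (a 2x superres downscale), each MI column of
+ // MI_SIZE downscaled pixels corresponds to 2 * MI_SIZE upscaled pixels,
+ // which is what the mi_to_num_x / denom_x pair below encodes.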
+ const int mi_to_num_x = av1_superres_scaled(cm)
+ ? mi_size_x * cm->superres_scale_denominator
+ : mi_size_x;
+ const int mi_to_num_y = mi_size_y;
+ const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
+ const int denom_y = size;
+
+ const int rnd_x = denom_x - 1;
+ const int rnd_y = denom_y - 1;
+
+ // rcol0/rrow0 should be the first column/row of restoration units (relative
+ // to the top-left of the tile) that doesn't start to the left of mi_col /
+ // above mi_row. For this calculation, we need to round up the division (if
+ // the sb starts at runit column 10.1, the first matching runit has column
+ // index 11).
+ *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
+ *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
+
+ // rcol1/rrow1 is the equivalent calculation, but for the superblock's
+ // below-right corner. If we're at the bottom or right of the tile, this
+ // restoration unit might not exist, in which case we clamp accordingly.
+ *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
+ *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
+
+ return *rcol0 < *rcol1 && *rrow0 < *rrow1;
+}
+
+// Extend to left and right
+static void extend_lines(uint8_t *buf, int width, int height, int stride,
+ int extend, int use_highbitdepth) {
+ for (int i = 0; i < height; ++i) {
+ if (use_highbitdepth) {
+ uint16_t *buf16 = (uint16_t *)buf;
+ aom_memset16(buf16 - extend, buf16[0], extend);
+ aom_memset16(buf16 + width, buf16[width - 1], extend);
+ } else {
+ memset(buf - extend, buf[0], extend);
+ memset(buf + width, buf[width - 1], extend);
+ }
+ buf += stride;
+ }
+}
+
+static void save_deblock_boundary_lines(
+ const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
+ int stripe, int use_highbd, int is_above,
+ RestorationStripeBoundaries *boundaries) {
+ const int is_uv = plane > 0;
+ const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+ const int src_stride = frame->strides[is_uv] << use_highbd;
+ const uint8_t *src_rows = src_buf + row * src_stride;
+
+ uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+ : boundaries->stripe_boundary_below;
+ uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
+ const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+ uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
+
+ // There is a rare case in which a processing stripe can end 1px above the
+ // crop border. In this case, we do want to use deblocked pixels from below
+ // the stripe (hence why we ended up in this function), but instead of
+ // fetching 2 "below" rows we need to fetch one and duplicate it.
+ // This is equivalent to clamping the sample locations against the crop border
+ const int lines_to_save =
+ AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
+ assert(lines_to_save == 1 || lines_to_save == 2);
+
+ int upscaled_width;
+ int line_bytes;
+ if (av1_superres_scaled(cm)) {
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
+ line_bytes = upscaled_width << use_highbd;
+ if (use_highbd)
+ av1_upscale_normative_rows(
+ cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
+ CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
+ plane, lines_to_save);
+ else
+ av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
+ boundaries->stripe_boundary_stride, plane,
+ lines_to_save);
+ } else {
+ upscaled_width = frame->crop_widths[is_uv];
+ line_bytes = upscaled_width << use_highbd;
+ for (int i = 0; i < lines_to_save; i++) {
+ memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
+ line_bytes);
+ }
+ }
+ // If we only saved one line, then copy it into the second line buffer
+ if (lines_to_save == 1)
+ memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
+
+ extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+ RESTORATION_EXTRA_HORZ, use_highbd);
+}
+
+static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+ const AV1_COMMON *cm, int plane, int row,
+ int stripe, int use_highbd, int is_above,
+ RestorationStripeBoundaries *boundaries) {
+ const int is_uv = plane > 0;
+ const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+ const int src_stride = frame->strides[is_uv] << use_highbd;
+ const uint8_t *src_rows = src_buf + row * src_stride;
+
+ uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+ : boundaries->stripe_boundary_below;
+ uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
+ const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+ uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
+ const int src_width = frame->crop_widths[is_uv];
+
+ // At the point where this function is called, we've already applied
+ // superres. So we don't need to extend the lines here, we can just
+ // pull directly from the topmost row of the upscaled frame.
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int upscaled_width = av1_superres_scaled(cm)
+ ? (cm->superres_upscaled_width + ss_x) >> ss_x
+ : src_width;
+ const int line_bytes = upscaled_width << use_highbd;
+ for (int i = 0; i < RESTORATION_CTX_VERT; i++) {
+ // Copy the line at 'row' into both context lines. This is because
+ // we want to (effectively) extend the outermost row of CDEF data
+ // from this tile to produce a border, rather than using deblocked
+ // pixels from the tile above/below.
+ memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
+ }
+ extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+ RESTORATION_EXTRA_HORZ, use_highbd);
+}
+
+static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+ int use_highbd, int plane,
+ AV1_COMMON *cm, int after_cdef) {
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+ const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+ const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
+
+ // Get the tile rectangle, with height rounded up to the next multiple of 8
+ // luma pixels (only relevant for the bottom tile of the frame)
+ const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+ const int stripe0 = 0;
+
+ RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
+
+ const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
+
+ int tile_stripe;
+ for (tile_stripe = 0;; ++tile_stripe) {
+ const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
+ const int y0 = tile_rect.top + rel_y0;
+ if (y0 >= tile_rect.bottom) break;
+
+ const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
+ const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
+
+ const int frame_stripe = stripe0 + tile_stripe;
+
+ // In this case, we should only use CDEF pixels at the top
+ // and bottom of the frame as a whole; internal tile boundaries
+ // can use deblocked pixels from adjacent tiles for context.
+ const int use_deblock_above = (frame_stripe > 0);
+ const int use_deblock_below = (y1 < plane_height);
+
+ if (!after_cdef) {
+ // Save deblocked context where needed.
+ if (use_deblock_above) {
+ save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
+ frame_stripe, use_highbd, 1, boundaries);
+ }
+ if (use_deblock_below) {
+ save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
+ use_highbd, 0, boundaries);
+ }
+ } else {
+ // Save CDEF context where needed. Note that we need to save the CDEF
+ // context for a particular boundary iff we *didn't* save deblocked
+ // context for that boundary.
+ //
+ // In addition, we need to save copies of the outermost line within
+ // the tile, rather than using data from outside the tile.
+ if (!use_deblock_above) {
+ save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
+ 1, boundaries);
+ }
+ if (!use_deblock_below) {
+ save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
+ use_highbd, 0, boundaries);
+ }
+ }
+ }
+}
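+
+// Illustrative walk-through of the stripe loop above (hypothetical frame,
+// assuming a single luma tile with plane_height == 128, stripe_height == 64
+// and stripe_off == 8):
+//   stripe 0 covers rows [  0,  56): no deblocked context above (CDEF rows
+//                                    are saved there instead), deblocked
+//                                    context below.
+//   stripe 1 covers rows [ 56, 120): deblocked context above and below.
+//   stripe 2 covers rows [120, 128): deblocked context above; y1 reaches
+//                                    plane_height, so CDEF rows are saved
+//                                    below.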
+
+// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan
+// lines (RESTORATION_CTX_VERT rows above and below each stripe) to be used
+// as boundary context in the loop restoration process. The lines are saved
+// in the RestorationStripeBoundaries structs in cm->rst_info[].boundaries.
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int after_cdef) {
+ const int num_planes = av1_num_planes(cm);
+ const int use_highbd = cm->seq_params.use_highbitdepth;
+ for (int p = 0; p < num_planes; ++p) {
+ save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
+ }
+}
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
new file mode 100644
index 000000000..d834f9270
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
+
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
+
+#define RESTORATION_PROC_UNIT_SIZE 64
+
+// Filter tile grid offset upwards compared to the superblock grid
+#define RESTORATION_UNIT_OFFSET 8
+
+#define SGRPROJ_BORDER_VERT 3 // Vertical border used for Sgr
+#define SGRPROJ_BORDER_HORZ 3 // Horizontal border used for Sgr
+
+#define WIENER_BORDER_VERT 2 // Vertical border used for Wiener
+#define WIENER_HALFWIN 3
+#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener
+
+// RESTORATION_BORDER_VERT determines line buffer requirement for LR.
+// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
+// Note the line buffer needed is twice the value of this macro.
+#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
+#else
+#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT)
+#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+
+#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
+#else
+#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
+#endif // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+
+// How many border pixels do we need for each processing unit?
+#define RESTORATION_BORDER 3
+
+// How many rows of deblocked pixels do we save above/below each processing
+// stripe?
+#define RESTORATION_CTX_VERT 2
+
+// Additional pixels to the left and right in above/below buffers
+// It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment
+#define RESTORATION_EXTRA_HORZ 4
+
+// Pad up to 20 more (much less may be needed)
+#define RESTORATION_PADDING 20
+#define RESTORATION_PROC_UNIT_PELS \
+ ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \
+ RESTORATION_PADDING) * \
+ (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
+ RESTORATION_PADDING))
+
+#define RESTORATION_UNITSIZE_MAX 256
+#define RESTORATION_UNITPELS_HORZ_MAX \
+ (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
+#define RESTORATION_UNITPELS_VERT_MAX \
+ ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
+ RESTORATION_UNIT_OFFSET))
+#define RESTORATION_UNITPELS_MAX \
+ (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
+
+// Two 32-bit buffers needed for the restored versions from two filters
+// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
+// on the decoder side.
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t))
+
+#define SGRPROJ_EXTBUF_SIZE (0)
+#define SGRPROJ_PARAMS_BITS 4
+#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
+
+// Precision bits for projection
+#define SGRPROJ_PRJ_BITS 7
+// Restoration precision bits generated higher than source before projection
+#define SGRPROJ_RST_BITS 4
+// Internal precision bits for core selfguided_restoration
+#define SGRPROJ_SGR_BITS 8
+#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
+
+#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
+#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
+#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
+
+#define SGRPROJ_PRJ_SUBEXP_K 4
+
+#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
+
+#define MAX_RADIUS 2 // Only 1, 2, 3 allowed
+#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
+
+#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
+#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
+#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
+#define WIENER_TMPBUF_SIZE (0)
+#define WIENER_EXTBUF_SIZE (0)
+
+// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
+// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
+#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
+
+#define WIENER_FILT_PREC_BITS 7
+#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
+
+// Central values for the taps
+#define WIENER_FILT_TAP0_MIDV (3)
+#define WIENER_FILT_TAP1_MIDV (-7)
+#define WIENER_FILT_TAP2_MIDV (15)
+#define WIENER_FILT_TAP3_MIDV \
+ (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \
+ WIENER_FILT_TAP2_MIDV))
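+
+// Worked out with the values above: WIENER_FILT_STEP == 1 << 7 == 128, so
+// WIENER_FILT_TAP3_MIDV == 128 - 2 * (3 - 7 + 15) == 106, and the seven taps
+// (3, -7, 15, 106, 15, -7, 3) sum to WIENER_FILT_STEP, i.e. unit DC gain at
+// this precision.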
+
+#define WIENER_FILT_TAP0_BITS 4
+#define WIENER_FILT_TAP1_BITS 5
+#define WIENER_FILT_TAP2_BITS 6
+
+#define WIENER_FILT_BITS \
+ ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
+#define WIENER_FILT_TAP0_MINV \
+ (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MINV \
+ (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MINV \
+ (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_MAXV \
+ (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MAXV \
+ (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MAXV \
+ (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_SUBEXP_K 1
+#define WIENER_FILT_TAP1_SUBEXP_K 2
+#define WIENER_FILT_TAP2_SUBEXP_K 3
+
+// Max of SGRPROJ_TMPBUF_SIZE, WIENER_TMPBUF_SIZE
+#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE)
+
+// Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE
+#define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE)
+
+// Check the assumptions of the existing code
+#if SUBPEL_TAPS != WIENER_WIN + 1
+#error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1"
+#endif
+#if WIENER_FILT_PREC_BITS != 7
+#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
+#endif
+
+#define LR_TILE_ROW 0
+#define LR_TILE_COL 0
+#define LR_TILE_COLS 1
+
+typedef struct {
+ int r[2]; // radii
+ int s[2]; // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
+} sgr_params_type;
+
+typedef struct {
+ RestorationType restoration_type;
+ WienerInfo wiener_info;
+ SgrprojInfo sgrproj_info;
+} RestorationUnitInfo;
+
+// A restoration line buffer needs space for two lines plus a horizontal filter
+// margin of RESTORATION_EXTRA_HORZ on each side.
+#define RESTORATION_LINEBUFFER_WIDTH \
+ (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ)
+
+// Similarly, the column buffers (used when we're at a vertical tile edge
+// that we can't filter across) need space for one processing unit's worth
+// of pixels, plus the top/bottom border width
+#define RESTORATION_COLBUFFER_HEIGHT \
+ (RESTORATION_PROC_UNIT_SIZE + 2 * RESTORATION_BORDER)
+
+typedef struct {
+ // Temporary buffers to save/restore 3 lines above/below the restoration
+ // stripe.
+ uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
+ uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
+} RestorationLineBuffers;
+
+typedef struct {
+ uint8_t *stripe_boundary_above;
+ uint8_t *stripe_boundary_below;
+ int stripe_boundary_stride;
+ int stripe_boundary_size;
+} RestorationStripeBoundaries;
+
+typedef struct {
+ RestorationType frame_restoration_type;
+ int restoration_unit_size;
+
+ // Fields below here are allocated and initialised by
+ // av1_alloc_restoration_struct. (horz_)units_per_tile give the number of
+ // restoration units in (one row of) the largest tile in the frame. The data
+ // in unit_info is laid out with units_per_tile entries for each tile, which
+ // have stride horz_units_per_tile.
+ //
+ // Even if there are tiles of different sizes, the data in unit_info is laid
+ // out as if all tiles are of full size.
+ int units_per_tile;
+ int vert_units_per_tile, horz_units_per_tile;
+ RestorationUnitInfo *unit_info;
+ RestorationStripeBoundaries boundaries;
+ int optimized_lr;
+} RestorationInfo;
+
+static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
+ sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;
+ sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;
+}
+
+static INLINE void set_default_wiener(WienerInfo *wiener_info) {
+ wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV;
+ wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV;
+ wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV;
+ wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] =
+ -2 *
+ (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);
+ wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV;
+ wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV;
+ wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV;
+}
+
+typedef struct {
+ int h_start, h_end, v_start, v_end;
+} RestorationTileLimits;
+
+typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect,
+ int rest_unit_idx, void *priv,
+ int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs);
+
+typedef struct FilterFrameCtxt {
+ const RestorationInfo *rsi;
+ int tile_stripe0;
+ int ss_x, ss_y;
+ int highbd, bit_depth;
+ uint8_t *data8, *dst8;
+ int data_stride, dst_stride;
+ AV1PixelRect tile_rect;
+} FilterFrameCtxt;
+
+typedef struct AV1LrStruct {
+ rest_unit_visitor_t on_rest_unit;
+ FilterFrameCtxt ctxt[MAX_MB_PLANE];
+ YV12_BUFFER_CONFIG *frame;
+ YV12_BUFFER_CONFIG *dst;
+} AV1LrStruct;
+
+extern const sgr_params_type sgr_params[SGRPROJ_PARAMS];
+extern int sgrproj_mtable[SGRPROJ_PARAMS][2];
+extern const int32_t x_by_xplus1[256];
+extern const int32_t one_by_x[MAX_NELEM];
+
+void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi,
+ int is_uv);
+void av1_free_restoration_struct(RestorationInfo *rst_info);
+
+void extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert, int highbd);
+void decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
+
+// Filter a single loop restoration unit.
+//
+// limits is the limits of the unit. rui gives the mode to use for this unit
+// and its coefficients. If striped loop restoration is enabled, rsb contains
+// deblocked pixels to use for stripe boundaries; rlbs is just some space to
+// use as a scratch buffer. tile_rect gives the limits of the tile containing
+// this unit. tile_stripe0 is the index of the first stripe in this tile.
+//
+// ss_x and ss_y are flags which should be 1 if this is a plane with
+// horizontal/vertical subsampling, respectively. highbd is a flag which should
+// be 1 in high bit depth mode, in which case bit_depth is the bit depth.
+//
+// data8 is the frame data (pointing at the top-left corner of the frame, not
+// the restoration unit) and stride is its stride. dst8 is the buffer where the
+// results will be written and has stride dst_stride. Like data8, dst8 should
+// point at the top-left corner of the frame.
+//
+// Finally tmpbuf is a scratch buffer used by the sgrproj filter which should
+// be at least SGRPROJ_TMPBUF_SIZE big.
+void av1_loop_restoration_filter_unit(
+ const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+ const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+ const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+ int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf, int optimized_lr);
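+
+// Minimal call sketch (hypothetical caller; only the buffer handling is
+// shown, and the remaining arguments are assumed to have been set up as
+// described above):
+//
+//   int32_t *tmpbuf = (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE);
+//   RestorationLineBuffers rlbs;
+//   av1_loop_restoration_filter_unit(&limits, &rui, &rsb, &rlbs, &tile_rect,
+//                                    tile_stripe0, ss_x, ss_y, highbd,
+//                                    bit_depth, data8, stride, dst8,
+//                                    dst_stride, tmpbuf, optimized_lr);
+//   aom_free(tmpbuf);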
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm, int optimized_lr,
+ void *lr_ctxt);
+void av1_loop_restoration_precal();
+
+typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
+ void *priv);
+struct AV1LrSyncData;
+
+typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
+
+typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane);
+
+// Call on_rest_unit for each loop restoration unit in the plane.
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+ rest_unit_visitor_t on_rest_unit,
+ void *priv, AV1PixelRect *tile_rect,
+ int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs);
+
+// Return 1 iff the block at mi_row, mi_col with size bsize is a
+// top-level superblock containing the top-left corner of at least one
+// loop restoration unit.
+//
+// If the block is a top-level superblock, the function writes to
+// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of restoration unit
+// indices given by [*rcol0, *rcol1) x [*rrow0, *rrow1) is relative
+// to the current tile.
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rcol0, int *rcol1, int *rrow0,
+ int *rrow1);
+
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int after_cdef);
+void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
+ YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int optimized_lr, int num_planes);
+void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
+ struct AV1Common *cm, int num_planes);
+void av1_foreach_rest_unit_in_row(
+ RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+ rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+ int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+ void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+ struct AV1LrSyncData *const lr_sync);
+AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
+int av1_lr_count_units_in_tile(int unit_size, int tile_size);
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RESTORATION_H_
diff --git a/third_party/aom/av1/common/scale.c b/third_party/aom/av1/common/scale.c
new file mode 100644
index 000000000..c525fe229
--- /dev/null
+++ b/third_party/aom/av1/common/scale.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/scale.h"
+#include "aom_dsp/aom_filter.h"
+
+// Note: Expect val to be in q4 precision
+static INLINE int scaled_x(int val, const struct scale_factors *sf) {
+ const int off =
+ (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
+ const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
+ return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
+ REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
+}
+
+// Note: Expect val to be in q4 precision
+static INLINE int scaled_y(int val, const struct scale_factors *sf) {
+ const int off =
+ (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
+ const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
+ return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval,
+ REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
+}
+
+// Note: Expect val to be in q4 precision
+static int unscaled_value(int val, const struct scale_factors *sf) {
+ (void)sf;
+ return val << SCALE_EXTRA_BITS;
+}
+
+static int get_fixed_point_scale_factor(int other_size, int this_size) {
+ // Calculate scaling factor once for each reference frame
+ // and use fixed point scaling factors in decoding and encoding routines.
+ // Hardware implementations can calculate scale factor in device driver
+ // and use multiplication and shifting on hardware instead of division.
+ return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
+}
+
+// Given the fixed point scale, calculate coarse point scale.
+static int fixed_point_scale_to_coarse_point_scale(int scale_fp) {
+ return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
+}
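+
+// Worked example (hypothetical sizes, and assuming SCALE_SUBPEL_BITS == 10 as
+// defined elsewhere in this tree): a 3840-wide reference used by a 1920-wide
+// frame gives
+//   x_scale_fp = ((3840 << 14) + 960) / 1920 = 32768   (2.0 in Q14)
+//   x_step_q4  = ROUND_POWER_OF_TWO(32768, 14 - 10) = 2048   (2.0 in Q10)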
+
+// Note: x and y are integer precision, mvq4 is q4 precision.
+MV32 av1_scale_mv(const MV *mvq4, int x, int y,
+ const struct scale_factors *sf) {
+ const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf);
+ const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf);
+ const MV32 res = { scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,
+ scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4 };
+ return res;
+}
+
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h) {
+ if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
+ sf->x_scale_fp = REF_INVALID_SCALE;
+ sf->y_scale_fp = REF_INVALID_SCALE;
+ return;
+ }
+
+ sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+ sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+
+ sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
+ sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);
+
+ if (av1_is_scaled(sf)) {
+ sf->scale_value_x = scaled_x;
+ sf->scale_value_y = scaled_y;
+ } else {
+ sf->scale_value_x = unscaled_value;
+ sf->scale_value_y = unscaled_value;
+ }
+
+ // AV1 convolve functions
+ // Special case convolve functions should produce the same result as
+ // av1_convolve_2d.
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
+ // subpel_x_q4 == 0
+ sf->convolve[0][1][0] = av1_convolve_y_sr;
+ // subpel_y_q4 == 0
+ sf->convolve[1][0][0] = av1_convolve_x_sr;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->convolve[1][1][0] = av1_convolve_2d_sr;
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
+ // subpel_x_q4 == 0
+ sf->convolve[0][1][1] = av1_jnt_convolve_y;
+ // subpel_y_q4 == 0
+ sf->convolve[1][0][1] = av1_jnt_convolve_x;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->convolve[1][1][1] = av1_jnt_convolve_2d;
+ // AV1 High BD convolve functions
+ // Special case convolve functions should produce the same result as
+ // av1_highbd_convolve_2d.
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr;
+ // subpel_x_q4 == 0
+ sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr;
+ // subpel_y_q4 == 0
+ sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy;
+ // subpel_x_q4 == 0
+ sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y;
+ // subpel_y_q4 == 0
+ sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d;
+}
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
new file mode 100644
index 000000000..748e958c3
--- /dev/null
+++ b/third_party/aom/av1/common/scale.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SCALE_H_
+#define AOM_AV1_COMMON_SCALE_H_
+
+#include "av1/common/convolve.h"
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCALE_NUMERATOR 8
+
+#define REF_SCALE_SHIFT 14
+#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
+#define REF_INVALID_SCALE -1
+
+struct scale_factors {
+ int x_scale_fp; // horizontal fixed point scale factor
+ int y_scale_fp; // vertical fixed point scale factor
+ int x_step_q4;
+ int y_step_q4;
+
+ int (*scale_value_x)(int val, const struct scale_factors *sf);
+ int (*scale_value_y)(int val, const struct scale_factors *sf);
+
+ // convolve_fn_ptr[subpel_x != 0][subpel_y != 0][is_compound]
+ aom_convolve_fn_t convolve[2][2][2];
+ aom_highbd_convolve_fn_t highbd_convolve[2][2][2];
+};
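+
+// Example of the indexing convention noted above (hypothetical case): a block
+// with subpel_x != 0, subpel_y == 0 in a non-compound prediction uses
+// sf->convolve[1][0][0], which av1_setup_scale_factors_for_frame points at
+// av1_convolve_x_sr.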
+
+MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
+
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h);
+
+static INLINE int av1_is_valid_scale(const struct scale_factors *sf) {
+ return sf->x_scale_fp != REF_INVALID_SCALE &&
+ sf->y_scale_fp != REF_INVALID_SCALE;
+}
+
+static INLINE int av1_is_scaled(const struct scale_factors *sf) {
+ return av1_is_valid_scale(sf) &&
+ (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE);
+}
+
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+ int this_width, int this_height) {
+ return 2 * this_width >= ref_width && 2 * this_height >= ref_height &&
+ this_width <= 16 * ref_width && this_height <= 16 * ref_height;
+}
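+
+// Illustrative bounds (hypothetical sizes): for a 1920x1080 frame the checks
+// above accept reference frames from 120x68 (one sixteenth of each dimension,
+// rounded up) up to 3840x2160 (twice each dimension).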
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SCALE_H_
diff --git a/third_party/aom/av1/common/scan.c b/third_party/aom/av1/common/scan.c
new file mode 100644
index 000000000..31a787b53
--- /dev/null
+++ b/third_party/aom/av1/common/scan.c
@@ -0,0 +1,3735 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/common_data.h"
+#include "av1/common/scan.h"
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
+ 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19,
+ 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35,
+ 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39,
+ 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+ 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116,
+ 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125,
+ 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134,
+ 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143,
+ 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200,
+ 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209,
+ 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218,
+ 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227,
+ 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243,
+ 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226,
+ 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10,
+ 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43,
+ 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76,
+ 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109,
+ 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142,
+ 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175,
+ 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208,
+ 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241,
+ 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25,
+ 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58,
+ 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91,
+ 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124,
+ 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126,
+ 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232,
+ 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,
+ 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217,
+ 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67,
+ 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187,
+ 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52,
+ 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172,
+ 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22,
+ 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7,
+ 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225,
+ 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227,
+ 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229,
+ 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231,
+ 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233,
+ 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235,
+ 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237,
+ 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239,
+ 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241,
+ 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243,
+ 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245,
+ 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247,
+ 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249,
+ 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251,
+ 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
+ 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110,
+ 117, 124, 111, 118, 125, 119, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80,
+ 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67,
+ 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69,
+ 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71,
+ 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73,
+ 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75,
+ 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62,
+ 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
+ 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22,
+ 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227,
+ 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228,
+ 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229,
+ 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230,
+ 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+ 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+ 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+ 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+ 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+ 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+ 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+ 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+ 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+ 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+ 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+ 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+ 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+ 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+ 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+ 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+ 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+ 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8,
+ 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196,
+ 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104,
+ 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43,
+ 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13,
+ 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14,
+ 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46,
+ 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78,
+ 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110,
+ 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142,
+ 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174,
+ 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206,
+ 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238,
+ 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270,
+ 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302,
+ 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334,
+ 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366,
+ 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398,
+ 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430,
+ 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462,
+ 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494,
+ 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29,
+ 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61,
+ 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93,
+ 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125,
+ 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126,
+ 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500,
+ 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408,
+ 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285,
+ 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413,
+ 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510,
+ 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4,
+ 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22,
+ 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100,
+ 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27,
+ 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46,
+ 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+ 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94,
+ 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+ 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+ 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+ 255
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
+ 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66,
+ 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68,
+ 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194,
+ 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195,
+ 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41,
+ 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352,
+ 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12,
+ 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385,
+ 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107,
+ 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294,
+ 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326,
+ 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79,
+ 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482,
+ 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266,
+ 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143,
+ 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546,
+ 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330,
+ 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83,
+ 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486,
+ 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518,
+ 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115,
+ 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302,
+ 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705,
+ 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427,
+ 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24,
+ 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397,
+ 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800,
+ 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460,
+ 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57,
+ 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368,
+ 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771,
+ 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617,
+ 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214,
+ 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215,
+ 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618,
+ 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898,
+ 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495,
+ 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92,
+ 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341,
+ 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744,
+ 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869,
+ 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466,
+ 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63,
+ 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467,
+ 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870,
+ 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747,
+ 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344,
+ 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314,
+ 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717,
+ 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904,
+ 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501,
+ 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285,
+ 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688,
+ 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937,
+ 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534,
+ 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380,
+ 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783,
+ 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846,
+ 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443,
+ 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599,
+ 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002,
+ 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631,
+ 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539,
+ 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942,
+ 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695,
+ 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603,
+ 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006,
+ 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635,
+ 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791,
+ 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854,
+ 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700,
+ 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949,
+ 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733,
+ 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920,
+ 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890,
+ 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767,
+ 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893,
+ 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895,
+ 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023
+};
+
+// Neighborhood 2-tuples for various scans and blocksizes,
+// in {top, left} order for each position in corresponding scan order.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 4, 4, 1, 4, 1, 1, 2, 2, 2, 5, 5,
+ 8, 8, 8, 9, 12, 6, 9, 3, 6, 7, 10, 10, 13, 11, 14, 0, 0
+};
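/* A minimal sketch of how these {top, left} neighbor pairs are typically
 * consumed, added here for illustration only: the tables above store
 * MAX_NEIGHBORS (assumed to be 2, matching the array dimensions) entries per
 * scan position, so for scan position c the two causal neighbors sit at
 * nb[2 * c] and nb[2 * c + 1], and the entropy-coding context for the next
 * coefficient is derived from the token levels already coded at those two
 * positions. The helper name coef_context_sketch and the token_cache
 * parameter are hypothetical, not identifiers introduced by this patch.
 */
static int coef_context_sketch(const int16_t *nb, const uint8_t *token_cache,
                               int c) {
  /* Rounded-up average of the two previously coded neighbor levels. */
  return (1 + token_cache[nb[MAX_NEIGHBORS * c]] +
          token_cache[nb[MAX_NEIGHBORS * c + 1]]) >> 1;
}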
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1,
+ 1, 2, 5, 6, 9, 10, 13, 2, 2, 3, 6, 7, 10, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
+ 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6,
+ 9, 2, 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16,
+ 11, 14, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
+ 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 0,
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 1, 1,
+ 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 2, 2, 3,
+ 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
+ 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12,
+ 13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
+ 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 8, 1, 1, 8, 8, 2, 9, 9, 16, 10,
+ 17, 2, 2, 16, 16, 3, 10, 17, 24, 11, 18, 18, 25, 3, 3, 4, 11,
+ 19, 26, 12, 19, 4, 4, 20, 27, 5, 12, 13, 20, 21, 28, 5, 5, 6,
+ 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 0, 0, 1, 8, 9, 16, 17, 24, 1,
+ 1, 2, 9, 10, 17, 18, 25, 2, 2, 3, 10, 11, 18, 19, 26, 3, 3,
+ 4, 11, 12, 19, 20, 27, 4, 4, 5, 12, 13, 20, 21, 28, 5, 5, 6,
+ 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0,
+ 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8,
+ 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17,
+ 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6, 9, 2,
+ 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16, 11, 14, 14, 17,
+ 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24,
+ 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36,
+ 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43,
+ 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52,
+ 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 16, 1, 1, 16, 16, 2, 17, 17, 32, 18, 33, 2,
+ 2, 32, 32, 3, 18, 33, 48, 19, 34, 34, 49, 3, 3, 4, 19, 35, 50, 20, 35,
+ 4, 4, 36, 51, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6,
+ 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24,
+ 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
+ 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
+ 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5,
+ 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12, 13, 16, 14, 17,
+ 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24,
+ 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36,
+ 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43,
+ 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52,
+ 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 0, 0, 1, 16, 2, 17,
+ 3, 18, 4, 19, 5, 20, 6, 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12,
+ 27, 13, 28, 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36,
+ 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31,
+ 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54, 40, 55,
+ 41, 56, 42, 57, 43, 58, 44, 59, 45, 60, 46, 61, 47, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28, 32,
+ 32, 36, 36, 40, 40, 44, 44, 48, 48, 52, 52, 56, 56, 0, 0, 1, 4, 5, 8,
+ 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, 33, 36, 37, 40, 41, 44, 45,
+ 48, 49, 52, 53, 56, 57, 60, 1, 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21,
+ 22, 25, 26, 29, 30, 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58,
+ 61, 2, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, 34,
+ 35, 38, 39, 42, 43, 46, 47, 50, 51, 54, 55, 58, 59, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 1, 16, 17, 32, 33, 48, 1, 1, 2,
+ 17, 18, 33, 34, 49, 2, 2, 3, 18, 19, 34, 35, 50, 3, 3, 4, 19, 20, 35,
+ 36, 51, 4, 4, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6,
+ 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24,
+ 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
+ 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
+ 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
+ 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
+ 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
+ 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
+ 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
+ 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
+ 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
+ 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
+ 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
+ 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
+ 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
+ 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
+ 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
+ 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
+ 106, 113, 113, 120, 120, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107,
+ 114, 114, 121, 121, 128, 128, 128, 87, 94, 94, 101, 101, 108, 108, 115,
+ 115, 122, 122, 129, 129, 136, 136, 136, 95, 102, 102, 109, 109, 116, 116,
+ 123, 123, 130, 130, 137, 137, 144, 144, 144, 103, 110, 110, 117, 117, 124,
+ 124, 131, 131, 138, 138, 145, 145, 152, 152, 152, 111, 118, 118, 125, 125,
+ 132, 132, 139, 139, 146, 146, 153, 153, 160, 160, 160, 119, 126, 126, 133,
+ 133, 140, 140, 147, 147, 154, 154, 161, 161, 168, 168, 168, 127, 134, 134,
+ 141, 141, 148, 148, 155, 155, 162, 162, 169, 169, 176, 176, 176, 135, 142,
+ 142, 149, 149, 156, 156, 163, 163, 170, 170, 177, 177, 184, 184, 184, 143,
+ 150, 150, 157, 157, 164, 164, 171, 171, 178, 178, 185, 185, 192, 192, 192,
+ 151, 158, 158, 165, 165, 172, 172, 179, 179, 186, 186, 193, 193, 200, 200,
+ 200, 159, 166, 166, 173, 173, 180, 180, 187, 187, 194, 194, 201, 201, 208,
+ 208, 208, 167, 174, 174, 181, 181, 188, 188, 195, 195, 202, 202, 209, 209,
+ 216, 216, 216, 175, 182, 182, 189, 189, 196, 196, 203, 203, 210, 210, 217,
+ 217, 224, 224, 224, 183, 190, 190, 197, 197, 204, 204, 211, 211, 218, 218,
+ 225, 225, 232, 232, 232, 191, 198, 198, 205, 205, 212, 212, 219, 219, 226,
+ 226, 233, 233, 240, 240, 240, 199, 206, 206, 213, 213, 220, 220, 227, 227,
+ 234, 234, 241, 241, 248, 207, 214, 214, 221, 221, 228, 228, 235, 235, 242,
+ 242, 249, 215, 222, 222, 229, 229, 236, 236, 243, 243, 250, 223, 230, 230,
+ 237, 237, 244, 244, 251, 231, 238, 238, 245, 245, 252, 239, 246, 246, 253,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
+ 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
+ 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
+ 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
+ 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
+ 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 8, 8,
+ 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194, 225, 9,
+ 9, 9, 40, 40, 71, 71, 102, 102, 133, 133, 164, 164, 195, 195, 226,
+ 10, 10, 10, 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196,
+ 227, 11, 11, 11, 42, 42, 73, 73, 104, 104, 135, 135, 166, 166, 197,
+ 197, 228, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
+ 198, 198, 229, 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168,
+ 168, 199, 199, 230, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138,
+ 169, 169, 200, 200, 231, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139,
+ 139, 170, 170, 201, 201, 232, 16, 16, 16, 47, 47, 78, 78, 109, 109,
+ 140, 140, 171, 171, 202, 202, 233, 17, 17, 17, 48, 48, 79, 79, 110,
+ 110, 141, 141, 172, 172, 203, 203, 234, 18, 18, 18, 49, 49, 80, 80,
+ 111, 111, 142, 142, 173, 173, 204, 204, 235, 19, 19, 19, 50, 50, 81,
+ 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 20, 20, 20, 51, 51,
+ 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 21, 21, 21, 52,
+ 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238, 22, 22, 22,
+ 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208, 208, 239, 23, 23,
+ 23, 54, 54, 85, 85, 116, 116, 147, 147, 178, 178, 209, 209, 240, 24,
+ 24, 24, 55, 55, 86, 86, 117, 117, 148, 148, 179, 179, 210, 210, 241,
+ 25, 25, 25, 56, 56, 87, 87, 118, 118, 149, 149, 180, 180, 211, 211,
+ 242, 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212,
+ 212, 243, 27, 27, 27, 58, 58, 89, 89, 120, 120, 151, 151, 182, 182,
+ 213, 213, 244, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183,
+ 183, 214, 214, 245, 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153,
+ 184, 184, 215, 215, 246, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154,
+ 154, 185, 185, 216, 216, 247, 31, 62, 62, 93, 93, 124, 124, 155, 155,
+ 186, 186, 217, 217, 248, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218,
+ 218, 249, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 127, 158, 158,
+ 189, 189, 220, 220, 251, 159, 190, 190, 221, 221, 252, 191, 222, 222, 253,
+ 223, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
+ 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
+ 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
+ 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
+ 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
+ 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
+ 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
+ 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
+ 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
+ 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
+ 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
+ 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
+ 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
+ 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
+ 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
+ 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
+ 126, 120, 120, 121, 128, 122, 129, 123, 130, 124, 131, 125, 132, 126, 133,
+ 127, 134, 128, 128, 129, 136, 130, 137, 131, 138, 132, 139, 133, 140, 134,
+ 141, 135, 142, 136, 136, 137, 144, 138, 145, 139, 146, 140, 147, 141, 148,
+ 142, 149, 143, 150, 144, 144, 145, 152, 146, 153, 147, 154, 148, 155, 149,
+ 156, 150, 157, 151, 158, 152, 152, 153, 160, 154, 161, 155, 162, 156, 163,
+ 157, 164, 158, 165, 159, 166, 160, 160, 161, 168, 162, 169, 163, 170, 164,
+ 171, 165, 172, 166, 173, 167, 174, 168, 168, 169, 176, 170, 177, 171, 178,
+ 172, 179, 173, 180, 174, 181, 175, 182, 176, 176, 177, 184, 178, 185, 179,
+ 186, 180, 187, 181, 188, 182, 189, 183, 190, 184, 184, 185, 192, 186, 193,
+ 187, 194, 188, 195, 189, 196, 190, 197, 191, 198, 192, 192, 193, 200, 194,
+ 201, 195, 202, 196, 203, 197, 204, 198, 205, 199, 206, 200, 200, 201, 208,
+ 202, 209, 203, 210, 204, 211, 205, 212, 206, 213, 207, 214, 208, 208, 209,
+ 216, 210, 217, 211, 218, 212, 219, 213, 220, 214, 221, 215, 222, 216, 216,
+ 217, 224, 218, 225, 219, 226, 220, 227, 221, 228, 222, 229, 223, 230, 224,
+ 224, 225, 232, 226, 233, 227, 234, 228, 235, 229, 236, 230, 237, 231, 238,
+ 232, 232, 233, 240, 234, 241, 235, 242, 236, 243, 237, 244, 238, 245, 239,
+ 246, 240, 240, 241, 248, 242, 249, 243, 250, 244, 251, 245, 252, 246, 253,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
+ 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
+ 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
+ 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
+ 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
+ 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
+ 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
+ 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
+ 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
+ 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
+ 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
+ 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
+ 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
+ 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
+ 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
+ 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+ 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
+ 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
+ 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
+ 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
+ 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
+ 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
+ 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
+ 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
+ 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
+ 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
+ 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
+ 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
+ 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
+ 223, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48,
+ 48, 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104,
+ 112, 112, 120, 120, 128, 128, 136, 136, 144, 144, 152, 152, 160, 160, 168,
+ 168, 176, 176, 184, 184, 192, 192, 200, 200, 208, 208, 216, 216, 224, 224,
+ 232, 232, 240, 240, 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33,
+ 40, 41, 48, 49, 56, 57, 64, 65, 72, 73, 80, 81, 88, 89, 96,
+ 97, 104, 105, 112, 113, 120, 121, 128, 129, 136, 137, 144, 145, 152, 153,
+ 160, 161, 168, 169, 176, 177, 184, 185, 192, 193, 200, 201, 208, 209, 216,
+ 217, 224, 225, 232, 233, 240, 241, 248, 1, 1, 2, 9, 10, 17, 18,
+ 25, 26, 33, 34, 41, 42, 49, 50, 57, 58, 65, 66, 73, 74, 81,
+ 82, 89, 90, 97, 98, 105, 106, 113, 114, 121, 122, 129, 130, 137, 138,
+ 145, 146, 153, 154, 161, 162, 169, 170, 177, 178, 185, 186, 193, 194, 201,
+ 202, 209, 210, 217, 218, 225, 226, 233, 234, 241, 242, 249, 2, 2, 3,
+ 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59, 66,
+ 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122, 123,
+ 130, 131, 138, 139, 146, 147, 154, 155, 162, 163, 170, 171, 178, 179, 186,
+ 187, 194, 195, 202, 203, 210, 211, 218, 219, 226, 227, 234, 235, 242, 243,
+ 250, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51,
+ 52, 59, 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108,
+ 115, 116, 123, 124, 131, 132, 139, 140, 147, 148, 155, 156, 163, 164, 171,
+ 172, 179, 180, 187, 188, 195, 196, 203, 204, 211, 212, 219, 220, 227, 228,
+ 235, 236, 243, 244, 251, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36,
+ 37, 44, 45, 52, 53, 60, 61, 68, 69, 76, 77, 84, 85, 92, 93,
+ 100, 101, 108, 109, 116, 117, 124, 125, 132, 133, 140, 141, 148, 149, 156,
+ 157, 164, 165, 172, 173, 180, 181, 188, 189, 196, 197, 204, 205, 212, 213,
+ 220, 221, 228, 229, 236, 237, 244, 245, 252, 5, 5, 6, 13, 14, 21,
+ 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 62, 69, 70, 77, 78,
+ 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125, 126, 133, 134, 141,
+ 142, 149, 150, 157, 158, 165, 166, 173, 174, 181, 182, 189, 190, 197, 198,
+ 205, 206, 213, 214, 221, 222, 229, 230, 237, 238, 245, 246, 253, 6, 6,
+ 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63,
+ 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
+ 127, 134, 135, 142, 143, 150, 151, 158, 159, 166, 167, 174, 175, 182, 183,
+ 190, 191, 198, 199, 206, 207, 214, 215, 222, 223, 230, 231, 238, 239, 246,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192,
+ 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, 192, 193, 224,
+ 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, 162, 193, 194, 225,
+ 2, 2, 3, 34, 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226,
+ 3, 3, 4, 35, 36, 67, 68, 99, 100, 131, 132, 163, 164, 195, 196, 227,
+ 4, 4, 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
+ 5, 5, 6, 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229,
+ 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
+ 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200, 231,
+ 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201, 232,
+ 9, 9, 10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233,
+ 10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202, 203, 234,
+ 11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
+ 12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236,
+ 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237,
+ 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, 238,
+ 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207, 208, 239,
+ 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240,
+ 17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209, 210, 241,
+ 18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
+ 19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243,
+ 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244,
+ 21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
+ 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, 215, 246,
+ 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247,
+ 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216, 217, 248,
+ 25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
+ 26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218, 219, 250,
+ 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251,
+ 28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252,
+ 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, 253,
+ 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1,
+ 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17,
+ 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27,
+ 34, 35, 42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43,
+ 44, 51, 52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53,
+ 60, 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6,
+ 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1,
+ 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17,
+ 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20,
+ 27, 21, 28, 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36,
+ 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39,
+ 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48,
+ 49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 2, 2, 2, 9, 9, 16, 16,
+ 16, 24, 24, 17, 24, 10, 17, 3, 10, 3, 3, 4, 4, 4, 11, 11, 18, 18, 25,
+ 25, 32, 32, 32, 40, 40, 33, 40, 26, 33, 19, 26, 12, 19, 5, 12, 5, 5, 6,
+ 6, 6, 13, 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 49, 56, 42, 49,
+ 35, 42, 28, 35, 21, 28, 14, 21, 7, 14, 15, 22, 22, 29, 29, 36, 36, 43, 43,
+ 50, 50, 57, 51, 58, 44, 51, 37, 44, 30, 37, 23, 30, 31, 38, 38, 45, 45, 52,
+ 52, 59, 53, 60, 46, 53, 39, 46, 47, 54, 54, 61, 55, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
+ 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
+ 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
+ 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
+ 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
+ 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
+ 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
+ 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
+ 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
+ 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
+ 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
+ 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
+ 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
+ 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
+ 106, 113, 113, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, 114, 114,
+ 121, 87, 94, 94, 101, 101, 108, 108, 115, 115, 122, 95, 102, 102, 109,
+ 109, 116, 116, 123, 103, 110, 110, 117, 117, 124, 111, 118, 118, 125, 119,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
+ 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
+ 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
+ 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 8, 8,
+ 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, 113, 9,
+ 9, 9, 24, 24, 39, 39, 54, 54, 69, 69, 84, 84, 99, 99, 114,
+ 10, 10, 10, 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100,
+ 115, 11, 11, 11, 26, 26, 41, 41, 56, 56, 71, 71, 86, 86, 101,
+ 101, 116, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88,
+ 88, 103, 103, 118, 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74,
+ 89, 89, 104, 104, 119, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90,
+ 90, 105, 105, 120, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
+ 121, 47, 62, 62, 77, 77, 92, 92, 107, 107, 122, 63, 78, 78, 93,
+ 93, 108, 108, 123, 79, 94, 94, 109, 109, 124, 95, 110, 110, 125, 111,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48,
+ 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, 112, 112,
+ 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56,
+ 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, 97, 104, 105, 112, 113, 120,
+ 1, 1, 2, 9, 10, 17, 18, 25, 26, 33, 34, 41, 42, 49, 50, 57,
+ 58, 65, 66, 73, 74, 81, 82, 89, 90, 97, 98, 105, 106, 113, 114, 121,
+ 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58,
+ 59, 66, 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122,
+ 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59,
+ 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, 115, 116, 123,
+ 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60,
+ 61, 68, 69, 76, 77, 84, 85, 92, 93, 100, 101, 108, 109, 116, 117, 124,
+ 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61,
+ 62, 69, 70, 77, 78, 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125,
+ 6, 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62,
+ 63, 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96,
+ 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112,
+ 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113,
+ 2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114,
+ 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115,
+ 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116,
+ 5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117,
+ 6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+ 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119,
+ 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120,
+ 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121,
+ 10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+ 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123,
+ 12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124,
+ 13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
+ 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
+ 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
+ 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
+ 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
+ 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
+ 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
+ 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
+ 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
+ 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
+ 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
+ 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
+ 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
+ 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
+ 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
+ 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
+ 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
+ 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
+ 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 112, 112,
+ 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98,
+ 113, 113, 128, 128, 128, 9, 9, 9, 24, 24, 39, 39, 54, 54, 69,
+ 69, 84, 84, 99, 99, 114, 114, 129, 129, 144, 144, 144, 10, 10, 10,
+ 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130,
+ 130, 145, 145, 160, 160, 160, 11, 11, 11, 26, 26, 41, 41, 56, 56,
+ 71, 71, 86, 86, 101, 101, 116, 116, 131, 131, 146, 146, 161, 161, 176,
+ 176, 176, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
+ 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, 88, 103, 103,
+ 118, 118, 133, 133, 148, 148, 163, 163, 178, 178, 193, 193, 208, 208, 208,
+ 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104,
+ 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
+ 224, 224, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, 90, 105, 105,
+ 120, 120, 135, 135, 150, 150, 165, 165, 180, 180, 195, 195, 210, 210, 225,
+ 225, 240, 240, 240, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
+ 121, 121, 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226,
+ 226, 241, 241, 256, 256, 256, 47, 62, 62, 77, 77, 92, 92, 107, 107,
+ 122, 122, 137, 137, 152, 152, 167, 167, 182, 182, 197, 197, 212, 212, 227,
+ 227, 242, 242, 257, 257, 272, 272, 272, 63, 78, 78, 93, 93, 108, 108,
+ 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, 213, 213, 228,
+ 228, 243, 243, 258, 258, 273, 273, 288, 288, 288, 79, 94, 94, 109, 109,
+ 124, 124, 139, 139, 154, 154, 169, 169, 184, 184, 199, 199, 214, 214, 229,
+ 229, 244, 244, 259, 259, 274, 274, 289, 289, 304, 304, 304, 95, 110, 110,
+ 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
+ 230, 245, 245, 260, 260, 275, 275, 290, 290, 305, 305, 320, 320, 320, 111,
+ 126, 126, 141, 141, 156, 156, 171, 171, 186, 186, 201, 201, 216, 216, 231,
+ 231, 246, 246, 261, 261, 276, 276, 291, 291, 306, 306, 321, 321, 336, 336,
+ 336, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, 202, 217, 217, 232,
+ 232, 247, 247, 262, 262, 277, 277, 292, 292, 307, 307, 322, 322, 337, 337,
+ 352, 352, 352, 143, 158, 158, 173, 173, 188, 188, 203, 203, 218, 218, 233,
+ 233, 248, 248, 263, 263, 278, 278, 293, 293, 308, 308, 323, 323, 338, 338,
+ 353, 353, 368, 368, 368, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
+ 234, 249, 249, 264, 264, 279, 279, 294, 294, 309, 309, 324, 324, 339, 339,
+ 354, 354, 369, 369, 384, 384, 384, 175, 190, 190, 205, 205, 220, 220, 235,
+ 235, 250, 250, 265, 265, 280, 280, 295, 295, 310, 310, 325, 325, 340, 340,
+ 355, 355, 370, 370, 385, 385, 400, 400, 400, 191, 206, 206, 221, 221, 236,
+ 236, 251, 251, 266, 266, 281, 281, 296, 296, 311, 311, 326, 326, 341, 341,
+ 356, 356, 371, 371, 386, 386, 401, 401, 416, 416, 416, 207, 222, 222, 237,
+ 237, 252, 252, 267, 267, 282, 282, 297, 297, 312, 312, 327, 327, 342, 342,
+ 357, 357, 372, 372, 387, 387, 402, 402, 417, 417, 432, 432, 432, 223, 238,
+ 238, 253, 253, 268, 268, 283, 283, 298, 298, 313, 313, 328, 328, 343, 343,
+ 358, 358, 373, 373, 388, 388, 403, 403, 418, 418, 433, 433, 448, 448, 448,
+ 239, 254, 254, 269, 269, 284, 284, 299, 299, 314, 314, 329, 329, 344, 344,
+ 359, 359, 374, 374, 389, 389, 404, 404, 419, 419, 434, 434, 449, 449, 464,
+ 464, 464, 255, 270, 270, 285, 285, 300, 300, 315, 315, 330, 330, 345, 345,
+ 360, 360, 375, 375, 390, 390, 405, 405, 420, 420, 435, 435, 450, 450, 465,
+ 465, 480, 480, 480, 271, 286, 286, 301, 301, 316, 316, 331, 331, 346, 346,
+ 361, 361, 376, 376, 391, 391, 406, 406, 421, 421, 436, 436, 451, 451, 466,
+ 466, 481, 481, 496, 287, 302, 302, 317, 317, 332, 332, 347, 347, 362, 362,
+ 377, 377, 392, 392, 407, 407, 422, 422, 437, 437, 452, 452, 467, 467, 482,
+ 482, 497, 303, 318, 318, 333, 333, 348, 348, 363, 363, 378, 378, 393, 393,
+ 408, 408, 423, 423, 438, 438, 453, 453, 468, 468, 483, 483, 498, 319, 334,
+ 334, 349, 349, 364, 364, 379, 379, 394, 394, 409, 409, 424, 424, 439, 439,
+ 454, 454, 469, 469, 484, 484, 499, 335, 350, 350, 365, 365, 380, 380, 395,
+ 395, 410, 410, 425, 425, 440, 440, 455, 455, 470, 470, 485, 485, 500, 351,
+ 366, 366, 381, 381, 396, 396, 411, 411, 426, 426, 441, 441, 456, 456, 471,
+ 471, 486, 486, 501, 367, 382, 382, 397, 397, 412, 412, 427, 427, 442, 442,
+ 457, 457, 472, 472, 487, 487, 502, 383, 398, 398, 413, 413, 428, 428, 443,
+ 443, 458, 458, 473, 473, 488, 488, 503, 399, 414, 414, 429, 429, 444, 444,
+ 459, 459, 474, 474, 489, 489, 504, 415, 430, 430, 445, 445, 460, 460, 475,
+ 475, 490, 490, 505, 431, 446, 446, 461, 461, 476, 476, 491, 491, 506, 447,
+ 462, 462, 477, 477, 492, 492, 507, 463, 478, 478, 493, 493, 508, 479, 494,
+ 494, 509, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
+ 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
+ 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
+ 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
+ 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
+ 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224,
+ 8, 8, 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194,
+ 225, 225, 256, 256, 256, 9, 9, 9, 40, 40, 71, 71, 102, 102, 133,
+ 133, 164, 164, 195, 195, 226, 226, 257, 257, 288, 288, 288, 10, 10, 10,
+ 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, 227, 227, 258,
+ 258, 289, 289, 320, 320, 320, 11, 11, 11, 42, 42, 73, 73, 104, 104,
+ 135, 135, 166, 166, 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352,
+ 352, 352, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
+ 198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
+ 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, 168, 199, 199,
+ 230, 230, 261, 261, 292, 292, 323, 323, 354, 354, 385, 385, 416, 416, 416,
+ 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, 169, 200, 200,
+ 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, 386, 417, 417, 448,
+ 448, 448, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, 139, 170, 170,
+ 201, 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, 418,
+ 418, 449, 449, 480, 16, 16, 16, 47, 47, 78, 78, 109, 109, 140, 140,
+ 171, 171, 202, 202, 233, 233, 264, 264, 295, 295, 326, 326, 357, 357, 388,
+ 388, 419, 419, 450, 450, 481, 17, 17, 17, 48, 48, 79, 79, 110, 110,
+ 141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, 327, 358,
+ 358, 389, 389, 420, 420, 451, 451, 482, 18, 18, 18, 49, 49, 80, 80,
+ 111, 111, 142, 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328,
+ 328, 359, 359, 390, 390, 421, 421, 452, 452, 483, 19, 19, 19, 50, 50,
+ 81, 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298,
+ 298, 329, 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 20, 20, 20,
+ 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 237, 268,
+ 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, 21,
+ 21, 21, 52, 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238,
+ 238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, 424, 424, 455, 455,
+ 486, 22, 22, 22, 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208,
+ 208, 239, 239, 270, 270, 301, 301, 332, 332, 363, 363, 394, 394, 425, 425,
+ 456, 456, 487, 23, 23, 23, 54, 54, 85, 85, 116, 116, 147, 147, 178,
+ 178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, 364, 395, 395,
+ 426, 426, 457, 457, 488, 24, 24, 24, 55, 55, 86, 86, 117, 117, 148,
+ 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365,
+ 396, 396, 427, 427, 458, 458, 489, 25, 25, 25, 56, 56, 87, 87, 118,
+ 118, 149, 149, 180, 180, 211, 211, 242, 242, 273, 273, 304, 304, 335, 335,
+ 366, 366, 397, 397, 428, 428, 459, 459, 490, 26, 26, 26, 57, 57, 88,
+ 88, 119, 119, 150, 150, 181, 181, 212, 212, 243, 243, 274, 274, 305, 305,
+ 336, 336, 367, 367, 398, 398, 429, 429, 460, 460, 491, 27, 27, 27, 58,
+ 58, 89, 89, 120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275,
+ 306, 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 28, 28,
+ 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, 183, 214, 214, 245, 245,
+ 276, 276, 307, 307, 338, 338, 369, 369, 400, 400, 431, 431, 462, 462, 493,
+ 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, 184, 184, 215, 215,
+ 246, 246, 277, 277, 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463,
+ 463, 494, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185,
+ 216, 216, 247, 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433,
+ 433, 464, 464, 495, 31, 62, 62, 93, 93, 124, 124, 155, 155, 186, 186,
+ 217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, 403, 403, 434,
+ 434, 465, 465, 496, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, 218,
+ 249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, 435, 466,
+ 466, 497, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, 281,
+ 312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, 498, 127, 158,
+ 158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, 344, 344, 375, 375,
+ 406, 406, 437, 437, 468, 468, 499, 159, 190, 190, 221, 221, 252, 252, 283,
+ 283, 314, 314, 345, 345, 376, 376, 407, 407, 438, 438, 469, 469, 500, 191,
+ 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, 439,
+ 439, 470, 470, 501, 223, 254, 254, 285, 285, 316, 316, 347, 347, 378, 378,
+ 409, 409, 440, 440, 471, 471, 502, 255, 286, 286, 317, 317, 348, 348, 379,
+ 379, 410, 410, 441, 441, 472, 472, 503, 287, 318, 318, 349, 349, 380, 380,
+ 411, 411, 442, 442, 473, 473, 504, 319, 350, 350, 381, 381, 412, 412, 443,
+ 443, 474, 474, 505, 351, 382, 382, 413, 413, 444, 444, 475, 475, 506, 383,
+ 414, 414, 445, 445, 476, 476, 507, 415, 446, 446, 477, 477, 508, 447, 478,
+ 478, 509, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
+ 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
+ 224, 224, 240, 240, 256, 256, 272, 272, 288, 288, 304, 304, 320, 320, 336,
+ 336, 352, 352, 368, 368, 384, 384, 400, 400, 416, 416, 432, 432, 448, 448,
+ 464, 464, 480, 480, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65,
+ 80, 81, 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192,
+ 193, 208, 209, 224, 225, 240, 241, 256, 257, 272, 273, 288, 289, 304, 305,
+ 320, 321, 336, 337, 352, 353, 368, 369, 384, 385, 400, 401, 416, 417, 432,
+ 433, 448, 449, 464, 465, 480, 481, 496, 1, 1, 2, 17, 18, 33, 34,
+ 49, 50, 65, 66, 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161,
+ 162, 177, 178, 193, 194, 209, 210, 225, 226, 241, 242, 257, 258, 273, 274,
+ 289, 290, 305, 306, 321, 322, 337, 338, 353, 354, 369, 370, 385, 386, 401,
+ 402, 417, 418, 433, 434, 449, 450, 465, 466, 481, 482, 497, 2, 2, 3,
+ 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, 115, 130,
+ 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211, 226, 227, 242, 243,
+ 258, 259, 274, 275, 290, 291, 306, 307, 322, 323, 338, 339, 354, 355, 370,
+ 371, 386, 387, 402, 403, 418, 419, 434, 435, 450, 451, 466, 467, 482, 483,
+ 498, 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99,
+ 100, 115, 116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212,
+ 227, 228, 243, 244, 259, 260, 275, 276, 291, 292, 307, 308, 323, 324, 339,
+ 340, 355, 356, 371, 372, 387, 388, 403, 404, 419, 420, 435, 436, 451, 452,
+ 467, 468, 483, 484, 499, 4, 4, 5, 20, 21, 36, 37, 52, 53, 68,
+ 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, 149, 164, 165, 180, 181,
+ 196, 197, 212, 213, 228, 229, 244, 245, 260, 261, 276, 277, 292, 293, 308,
+ 309, 324, 325, 340, 341, 356, 357, 372, 373, 388, 389, 404, 405, 420, 421,
+ 436, 437, 452, 453, 468, 469, 484, 485, 500, 5, 5, 6, 21, 22, 37,
+ 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, 134, 149, 150,
+ 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 246, 261, 262, 277,
+ 278, 293, 294, 309, 310, 325, 326, 341, 342, 357, 358, 373, 374, 389, 390,
+ 405, 406, 421, 422, 437, 438, 453, 454, 469, 470, 485, 486, 501, 6, 6,
+ 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, 119,
+ 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, 246,
+ 247, 262, 263, 278, 279, 294, 295, 310, 311, 326, 327, 342, 343, 358, 359,
+ 374, 375, 390, 391, 406, 407, 422, 423, 438, 439, 454, 455, 470, 471, 486,
+ 487, 502, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88,
+ 103, 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215,
+ 216, 231, 232, 247, 248, 263, 264, 279, 280, 295, 296, 311, 312, 327, 328,
+ 343, 344, 359, 360, 375, 376, 391, 392, 407, 408, 423, 424, 439, 440, 455,
+ 456, 471, 472, 487, 488, 503, 8, 8, 9, 24, 25, 40, 41, 56, 57,
+ 72, 73, 88, 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184,
+ 185, 200, 201, 216, 217, 232, 233, 248, 249, 264, 265, 280, 281, 296, 297,
+ 312, 313, 328, 329, 344, 345, 360, 361, 376, 377, 392, 393, 408, 409, 424,
+ 425, 440, 441, 456, 457, 472, 473, 488, 489, 504, 9, 9, 10, 25, 26,
+ 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, 122, 137, 138, 153,
+ 154, 169, 170, 185, 186, 201, 202, 217, 218, 233, 234, 249, 250, 265, 266,
+ 281, 282, 297, 298, 313, 314, 329, 330, 345, 346, 361, 362, 377, 378, 393,
+ 394, 409, 410, 425, 426, 441, 442, 457, 458, 473, 474, 489, 490, 505, 10,
+ 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+ 123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219, 234, 235,
+ 250, 251, 266, 267, 282, 283, 298, 299, 314, 315, 330, 331, 346, 347, 362,
+ 363, 378, 379, 394, 395, 410, 411, 426, 427, 442, 443, 458, 459, 474, 475,
+ 490, 491, 506, 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91,
+ 92, 107, 108, 123, 124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204,
+ 219, 220, 235, 236, 251, 252, 267, 268, 283, 284, 299, 300, 315, 316, 331,
+ 332, 347, 348, 363, 364, 379, 380, 395, 396, 411, 412, 427, 428, 443, 444,
+ 459, 460, 475, 476, 491, 492, 507, 12, 12, 13, 28, 29, 44, 45, 60,
+ 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, 156, 157, 172, 173,
+ 188, 189, 204, 205, 220, 221, 236, 237, 252, 253, 268, 269, 284, 285, 300,
+ 301, 316, 317, 332, 333, 348, 349, 364, 365, 380, 381, 396, 397, 412, 413,
+ 428, 429, 444, 445, 460, 461, 476, 477, 492, 493, 508, 13, 13, 14, 29,
+ 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, 141, 142,
+ 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, 254, 269,
+ 270, 285, 286, 301, 302, 317, 318, 333, 334, 349, 350, 365, 366, 381, 382,
+ 397, 398, 413, 414, 429, 430, 445, 446, 461, 462, 477, 478, 493, 494, 509,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
+ 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
+ 239, 254, 255, 270, 271, 286, 287, 302, 303, 318, 319, 334, 335, 350, 351,
+ 366, 367, 382, 383, 398, 399, 414, 415, 430, 431, 446, 447, 462, 463, 478,
+ 479, 494, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192,
+ 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416, 416,
+ 448, 448, 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161,
+ 192, 193, 224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416,
+ 417, 448, 449, 480, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130,
+ 161, 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
+ 386, 417, 418, 449, 450, 481, 2, 2, 3, 34, 35, 66, 67, 98, 99,
+ 130, 131, 162, 163, 194, 195, 226, 227, 258, 259, 290, 291, 322, 323, 354,
+ 355, 386, 387, 418, 419, 450, 451, 482, 3, 3, 4, 35, 36, 67, 68,
+ 99, 100, 131, 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323,
+ 324, 355, 356, 387, 388, 419, 420, 451, 452, 483, 4, 4, 5, 36, 37,
+ 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, 229, 260, 261, 292,
+ 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, 453, 484, 5, 5, 6,
+ 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, 230, 261,
+ 262, 293, 294, 325, 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 6,
+ 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
+ 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, 423, 454, 455,
+ 486, 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199,
+ 200, 231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423, 424,
+ 455, 456, 487, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
+ 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, 393,
+ 424, 425, 456, 457, 488, 9, 9, 10, 41, 42, 73, 74, 105, 106, 137,
+ 138, 169, 170, 201, 202, 233, 234, 265, 266, 297, 298, 329, 330, 361, 362,
+ 393, 394, 425, 426, 457, 458, 489, 10, 10, 11, 42, 43, 74, 75, 106,
+ 107, 138, 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331,
+ 362, 363, 394, 395, 426, 427, 458, 459, 490, 11, 11, 12, 43, 44, 75,
+ 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, 236, 267, 268, 299, 300,
+ 331, 332, 363, 364, 395, 396, 427, 428, 459, 460, 491, 12, 12, 13, 44,
+ 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269,
+ 300, 301, 332, 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 13, 13,
+ 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, 238,
+ 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, 430, 461, 462, 493,
+ 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207,
+ 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399, 430, 431, 462,
+ 463, 494, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176,
+ 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400, 431,
+ 432, 463, 464, 495, 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145,
+ 176, 177, 208, 209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400,
+ 401, 432, 433, 464, 465, 496, 17, 17, 18, 49, 50, 81, 82, 113, 114,
+ 145, 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
+ 370, 401, 402, 433, 434, 465, 466, 497, 18, 18, 19, 50, 51, 82, 83,
+ 114, 115, 146, 147, 178, 179, 210, 211, 242, 243, 274, 275, 306, 307, 338,
+ 339, 370, 371, 402, 403, 434, 435, 466, 467, 498, 19, 19, 20, 51, 52,
+ 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307,
+ 308, 339, 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 20, 20, 21,
+ 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, 245, 276,
+ 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, 437, 468, 469, 500, 21,
+ 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
+ 246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470,
+ 501, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214,
+ 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407, 438, 439,
+ 470, 471, 502, 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183,
+ 184, 215, 216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408,
+ 439, 440, 471, 472, 503, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
+ 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377,
+ 408, 409, 440, 441, 472, 473, 504, 25, 25, 26, 57, 58, 89, 90, 121,
+ 122, 153, 154, 185, 186, 217, 218, 249, 250, 281, 282, 313, 314, 345, 346,
+ 377, 378, 409, 410, 441, 442, 473, 474, 505, 26, 26, 27, 58, 59, 90,
+ 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315,
+ 346, 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 27, 27, 28, 59,
+ 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, 252, 283, 284,
+ 315, 316, 347, 348, 379, 380, 411, 412, 443, 444, 475, 476, 507, 28, 28,
+ 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253,
+ 284, 285, 316, 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508,
+ 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222,
+ 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414, 445, 446, 477,
+ 478, 509, 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191,
+ 222, 223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415, 446,
+ 447, 478, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
+ 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+ 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
+ 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
+ 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
+ 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
+ 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
+ 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
+ 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
+ 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
+ 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
+ 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
+ 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
+ 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
+ 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
+ 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
+ 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
+ 239, 254, 240, 240, 241, 256, 242, 257, 243, 258, 244, 259, 245, 260, 246,
+ 261, 247, 262, 248, 263, 249, 264, 250, 265, 251, 266, 252, 267, 253, 268,
+ 254, 269, 255, 270, 256, 256, 257, 272, 258, 273, 259, 274, 260, 275, 261,
+ 276, 262, 277, 263, 278, 264, 279, 265, 280, 266, 281, 267, 282, 268, 283,
+ 269, 284, 270, 285, 271, 286, 272, 272, 273, 288, 274, 289, 275, 290, 276,
+ 291, 277, 292, 278, 293, 279, 294, 280, 295, 281, 296, 282, 297, 283, 298,
+ 284, 299, 285, 300, 286, 301, 287, 302, 288, 288, 289, 304, 290, 305, 291,
+ 306, 292, 307, 293, 308, 294, 309, 295, 310, 296, 311, 297, 312, 298, 313,
+ 299, 314, 300, 315, 301, 316, 302, 317, 303, 318, 304, 304, 305, 320, 306,
+ 321, 307, 322, 308, 323, 309, 324, 310, 325, 311, 326, 312, 327, 313, 328,
+ 314, 329, 315, 330, 316, 331, 317, 332, 318, 333, 319, 334, 320, 320, 321,
+ 336, 322, 337, 323, 338, 324, 339, 325, 340, 326, 341, 327, 342, 328, 343,
+ 329, 344, 330, 345, 331, 346, 332, 347, 333, 348, 334, 349, 335, 350, 336,
+ 336, 337, 352, 338, 353, 339, 354, 340, 355, 341, 356, 342, 357, 343, 358,
+ 344, 359, 345, 360, 346, 361, 347, 362, 348, 363, 349, 364, 350, 365, 351,
+ 366, 352, 352, 353, 368, 354, 369, 355, 370, 356, 371, 357, 372, 358, 373,
+ 359, 374, 360, 375, 361, 376, 362, 377, 363, 378, 364, 379, 365, 380, 366,
+ 381, 367, 382, 368, 368, 369, 384, 370, 385, 371, 386, 372, 387, 373, 388,
+ 374, 389, 375, 390, 376, 391, 377, 392, 378, 393, 379, 394, 380, 395, 381,
+ 396, 382, 397, 383, 398, 384, 384, 385, 400, 386, 401, 387, 402, 388, 403,
+ 389, 404, 390, 405, 391, 406, 392, 407, 393, 408, 394, 409, 395, 410, 396,
+ 411, 397, 412, 398, 413, 399, 414, 400, 400, 401, 416, 402, 417, 403, 418,
+ 404, 419, 405, 420, 406, 421, 407, 422, 408, 423, 409, 424, 410, 425, 411,
+ 426, 412, 427, 413, 428, 414, 429, 415, 430, 416, 416, 417, 432, 418, 433,
+ 419, 434, 420, 435, 421, 436, 422, 437, 423, 438, 424, 439, 425, 440, 426,
+ 441, 427, 442, 428, 443, 429, 444, 430, 445, 431, 446, 432, 432, 433, 448,
+ 434, 449, 435, 450, 436, 451, 437, 452, 438, 453, 439, 454, 440, 455, 441,
+ 456, 442, 457, 443, 458, 444, 459, 445, 460, 446, 461, 447, 462, 448, 448,
+ 449, 464, 450, 465, 451, 466, 452, 467, 453, 468, 454, 469, 455, 470, 456,
+ 471, 457, 472, 458, 473, 459, 474, 460, 475, 461, 476, 462, 477, 463, 478,
+ 464, 464, 465, 480, 466, 481, 467, 482, 468, 483, 469, 484, 470, 485, 471,
+ 486, 472, 487, 473, 488, 474, 489, 475, 490, 476, 491, 477, 492, 478, 493,
+ 479, 494, 480, 480, 481, 496, 482, 497, 483, 498, 484, 499, 485, 500, 486,
+ 501, 487, 502, 488, 503, 489, 504, 490, 505, 491, 506, 492, 507, 493, 508,
+ 494, 509, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
+ 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
+ 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
+ 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
+ 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
+ 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
+ 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
+ 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
+ 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
+ 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
+ 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
+ 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
+ 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
+ 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
+ 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
+ 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+ 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
+ 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
+ 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
+ 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
+ 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
+ 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
+ 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
+ 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
+ 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
+ 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
+ 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
+ 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
+ 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
+ 223, 254, 224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260, 230,
+ 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266, 236, 267, 237, 268,
+ 238, 269, 239, 270, 240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245,
+ 276, 246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282, 252, 283,
+ 253, 284, 254, 285, 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260,
+ 291, 261, 292, 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298,
+ 268, 299, 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275,
+ 306, 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
+ 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, 290,
+ 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, 297, 328,
+ 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, 304, 335, 305,
+ 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, 311, 342, 312, 343,
+ 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, 318, 349, 319, 350, 320,
+ 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356, 326, 357, 327, 358,
+ 328, 359, 329, 360, 330, 361, 331, 362, 332, 363, 333, 364, 334, 365, 335,
+ 366, 336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372, 342, 373,
+ 343, 374, 344, 375, 345, 376, 346, 377, 347, 378, 348, 379, 349, 380, 350,
+ 381, 351, 382, 352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388,
+ 358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365,
+ 396, 366, 397, 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403,
+ 373, 404, 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380,
+ 411, 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
+ 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395,
+ 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, 402, 433,
+ 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, 409, 440, 410,
+ 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, 416, 416, 417, 448,
+ 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, 423, 454, 424, 455, 425,
+ 456, 426, 457, 427, 458, 428, 459, 429, 460, 430, 461, 431, 462, 432, 463,
+ 433, 464, 434, 465, 435, 466, 436, 467, 437, 468, 438, 469, 439, 470, 440,
+ 471, 441, 472, 442, 473, 443, 474, 444, 475, 445, 476, 446, 477, 447, 478,
+ 448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484, 454, 485, 455,
+ 486, 456, 487, 457, 488, 458, 489, 459, 490, 460, 491, 461, 492, 462, 493,
+ 463, 494, 464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470,
+ 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508,
+ 478, 509, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
+ 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
+ 224, 224, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81,
+ 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208,
+ 209, 224, 225, 240, 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66,
+ 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, 162, 177, 178, 193,
+ 194, 209, 210, 225, 226, 241, 2, 2, 3, 18, 19, 34, 35, 50, 51,
+ 66, 67, 82, 83, 98, 99, 114, 115, 130, 131, 146, 147, 162, 163, 178,
+ 179, 194, 195, 210, 211, 226, 227, 242, 3, 3, 4, 19, 20, 35, 36,
+ 51, 52, 67, 68, 83, 84, 99, 100, 115, 116, 131, 132, 147, 148, 163,
+ 164, 179, 180, 195, 196, 211, 212, 227, 228, 243, 4, 4, 5, 20, 21,
+ 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116, 117, 132, 133, 148,
+ 149, 164, 165, 180, 181, 196, 197, 212, 213, 228, 229, 244, 5, 5, 6,
+ 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133,
+ 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 6,
+ 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+ 119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231,
+ 246, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103,
+ 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216,
+ 231, 232, 247, 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88,
+ 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201,
+ 216, 217, 232, 233, 248, 9, 9, 10, 25, 26, 41, 42, 57, 58, 73,
+ 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, 154, 169, 170, 185, 186,
+ 201, 202, 217, 218, 233, 234, 249, 10, 10, 11, 26, 27, 42, 43, 58,
+ 59, 74, 75, 90, 91, 106, 107, 122, 123, 138, 139, 154, 155, 170, 171,
+ 186, 187, 202, 203, 218, 219, 234, 235, 250, 11, 11, 12, 27, 28, 43,
+ 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, 124, 139, 140, 155, 156,
+ 171, 172, 187, 188, 203, 204, 219, 220, 235, 236, 251, 12, 12, 13, 28,
+ 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141,
+ 156, 157, 172, 173, 188, 189, 204, 205, 220, 221, 236, 237, 252, 13, 13,
+ 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126,
+ 141, 142, 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
+ 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
+ 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
+ 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+ 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
+ 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
+ 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
+ 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
+ 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
+ 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
+ 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
+ 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
+ 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
+ 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
+ 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
+ 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
+ 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
+ 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
+ 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
+ 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 2, 2, 2,
+ 17, 17, 32, 32, 32, 48, 48, 33, 48, 18, 33, 3, 18, 3, 3,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 80, 80, 65,
+ 80, 50, 65, 35, 50, 20, 35, 5, 20, 5, 5, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 112, 112, 97,
+ 112, 82, 97, 67, 82, 52, 67, 37, 52, 22, 37, 7, 22, 7, 7,
+ 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98,
+ 113, 113, 128, 128, 128, 144, 144, 129, 144, 114, 129, 99, 114, 84, 99,
+ 69, 84, 54, 69, 39, 54, 24, 39, 9, 24, 9, 9, 10, 10, 10,
+ 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130,
+ 130, 145, 145, 160, 160, 160, 176, 176, 161, 176, 146, 161, 131, 146, 116,
+ 131, 101, 116, 86, 101, 71, 86, 56, 71, 41, 56, 26, 41, 11, 26,
+ 11, 11, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
+ 208, 208, 193, 208, 178, 193, 163, 178, 148, 163, 133, 148, 118, 133, 103,
+ 118, 88, 103, 73, 88, 58, 73, 43, 58, 28, 43, 13, 28, 13, 13,
+ 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104,
+ 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
+ 224, 224, 225, 240, 210, 225, 195, 210, 180, 195, 165, 180, 150, 165, 135,
+ 150, 120, 135, 105, 120, 90, 105, 75, 90, 60, 75, 45, 60, 30, 45,
+ 15, 30, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, 121, 121,
+ 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, 226, 241,
+ 227, 242, 212, 227, 197, 212, 182, 197, 167, 182, 152, 167, 137, 152, 122,
+ 137, 107, 122, 92, 107, 77, 92, 62, 77, 47, 62, 63, 78, 78, 93,
+ 93, 108, 108, 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198,
+ 213, 213, 228, 228, 243, 229, 244, 214, 229, 199, 214, 184, 199, 169, 184,
+ 154, 169, 139, 154, 124, 139, 109, 124, 94, 109, 79, 94, 95, 110, 110,
+ 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
+ 230, 245, 231, 246, 216, 231, 201, 216, 186, 201, 171, 186, 156, 171, 141,
+ 156, 126, 141, 111, 126, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202,
+ 202, 217, 217, 232, 232, 247, 233, 248, 218, 233, 203, 218, 188, 203, 173,
+ 188, 158, 173, 143, 158, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
+ 234, 249, 235, 250, 220, 235, 205, 220, 190, 205, 175, 190, 191, 206, 206,
+ 221, 221, 236, 236, 251, 237, 252, 222, 237, 207, 222, 223, 238, 238, 253,
+ 239, 254, 0, 0
+};
+
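+/*
+ * Editor's note on sizing (derived from the declarations in this file): each
+ * neighbour table holds (width * height + 1) scan positions, so the 16x16
+ * tables above are declared with 257 * MAX_NEIGHBORS int16_t entries, the
+ * 16x32/32x16 tables with 513 * MAX_NEIGHBORS, and the 32x32 tables below
+ * with 1025 * MAX_NEIGHBORS; the extra position appears to be the trailing
+ * 0, 0 pair that terminates every table.
+ */
+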
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160,
+ 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384,
+ 416, 416, 448, 448, 480, 480, 512, 512, 544, 544, 576, 576, 608, 608,
+ 640, 640, 672, 672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832,
+ 864, 864, 896, 896, 928, 928, 960, 960, 0, 0, 1, 32, 33, 64,
+ 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, 225, 256, 257, 288,
+ 289, 320, 321, 352, 353, 384, 385, 416, 417, 448, 449, 480, 481, 512,
+ 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673, 704, 705, 736,
+ 737, 768, 769, 800, 801, 832, 833, 864, 865, 896, 897, 928, 929, 960,
+ 961, 992, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161,
+ 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
+ 386, 417, 418, 449, 450, 481, 482, 513, 514, 545, 546, 577, 578, 609,
+ 610, 641, 642, 673, 674, 705, 706, 737, 738, 769, 770, 801, 802, 833,
+ 834, 865, 866, 897, 898, 929, 930, 961, 962, 993, 2, 2, 3, 34,
+ 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, 227, 258,
+ 259, 290, 291, 322, 323, 354, 355, 386, 387, 418, 419, 450, 451, 482,
+ 483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675, 706,
+ 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898, 899, 930,
+ 931, 962, 963, 994, 3, 3, 4, 35, 36, 67, 68, 99, 100, 131,
+ 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, 324, 355,
+ 356, 387, 388, 419, 420, 451, 452, 483, 484, 515, 516, 547, 548, 579,
+ 580, 611, 612, 643, 644, 675, 676, 707, 708, 739, 740, 771, 772, 803,
+ 804, 835, 836, 867, 868, 899, 900, 931, 932, 963, 964, 995, 4, 4,
+ 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
+ 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420, 421, 452,
+ 453, 484, 485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676,
+ 677, 708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900,
+ 901, 932, 933, 964, 965, 996, 5, 5, 6, 37, 38, 69, 70, 101,
+ 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, 262, 293, 294, 325,
+ 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 486, 517, 518, 549,
+ 550, 581, 582, 613, 614, 645, 646, 677, 678, 709, 710, 741, 742, 773,
+ 774, 805, 806, 837, 838, 869, 870, 901, 902, 933, 934, 965, 966, 997,
+ 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198,
+ 199, 230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422,
+ 423, 454, 455, 486, 487, 518, 519, 550, 551, 582, 583, 614, 615, 646,
+ 647, 678, 679, 710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870,
+ 871, 902, 903, 934, 935, 966, 967, 998, 7, 7, 8, 39, 40, 71,
+ 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, 232, 263, 264, 295,
+ 296, 327, 328, 359, 360, 391, 392, 423, 424, 455, 456, 487, 488, 519,
+ 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680, 711, 712, 743,
+ 744, 775, 776, 807, 808, 839, 840, 871, 872, 903, 904, 935, 936, 967,
+ 968, 999, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
+ 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392,
+ 393, 424, 425, 456, 457, 488, 489, 520, 521, 552, 553, 584, 585, 616,
+ 617, 648, 649, 680, 681, 712, 713, 744, 745, 776, 777, 808, 809, 840,
+ 841, 872, 873, 904, 905, 936, 937, 968, 969, 1000, 9, 9, 10, 41,
+ 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, 234, 265,
+ 266, 297, 298, 329, 330, 361, 362, 393, 394, 425, 426, 457, 458, 489,
+ 490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682, 713,
+ 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905, 906, 937,
+ 938, 969, 970, 1001, 10, 10, 11, 42, 43, 74, 75, 106, 107, 138,
+ 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, 362,
+ 363, 394, 395, 426, 427, 458, 459, 490, 491, 522, 523, 554, 555, 586,
+ 587, 618, 619, 650, 651, 682, 683, 714, 715, 746, 747, 778, 779, 810,
+ 811, 842, 843, 874, 875, 906, 907, 938, 939, 970, 971, 1002, 11, 11,
+ 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
+ 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396, 427, 428, 459,
+ 460, 491, 492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683,
+ 684, 715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907,
+ 908, 939, 940, 971, 972, 1003, 12, 12, 13, 44, 45, 76, 77, 108,
+ 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, 300, 301, 332,
+ 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 493, 524, 525, 556,
+ 557, 588, 589, 620, 621, 652, 653, 684, 685, 716, 717, 748, 749, 780,
+ 781, 812, 813, 844, 845, 876, 877, 908, 909, 940, 941, 972, 973, 1004,
+ 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205,
+ 206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429,
+ 430, 461, 462, 493, 494, 525, 526, 557, 558, 589, 590, 621, 622, 653,
+ 654, 685, 686, 717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877,
+ 878, 909, 910, 941, 942, 973, 974, 1005, 14, 14, 15, 46, 47, 78,
+ 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, 239, 270, 271, 302,
+ 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, 463, 494, 495, 526,
+ 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687, 718, 719, 750,
+ 751, 782, 783, 814, 815, 846, 847, 878, 879, 910, 911, 942, 943, 974,
+ 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175,
+ 176, 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399,
+ 400, 431, 432, 463, 464, 495, 496, 527, 528, 559, 560, 591, 592, 623,
+ 624, 655, 656, 687, 688, 719, 720, 751, 752, 783, 784, 815, 816, 847,
+ 848, 879, 880, 911, 912, 943, 944, 975, 976, 1007, 16, 16, 17, 48,
+ 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, 241, 272,
+ 273, 304, 305, 336, 337, 368, 369, 400, 401, 432, 433, 464, 465, 496,
+ 497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689, 720,
+ 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912, 913, 944,
+ 945, 976, 977, 1008, 17, 17, 18, 49, 50, 81, 82, 113, 114, 145,
+ 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
+ 370, 401, 402, 433, 434, 465, 466, 497, 498, 529, 530, 561, 562, 593,
+ 594, 625, 626, 657, 658, 689, 690, 721, 722, 753, 754, 785, 786, 817,
+ 818, 849, 850, 881, 882, 913, 914, 945, 946, 977, 978, 1009, 18, 18,
+ 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
+ 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403, 434, 435, 466,
+ 467, 498, 499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690,
+ 691, 722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914,
+ 915, 946, 947, 978, 979, 1010, 19, 19, 20, 51, 52, 83, 84, 115,
+ 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, 308, 339,
+ 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 500, 531, 532, 563,
+ 564, 595, 596, 627, 628, 659, 660, 691, 692, 723, 724, 755, 756, 787,
+ 788, 819, 820, 851, 852, 883, 884, 915, 916, 947, 948, 979, 980, 1011,
+ 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212,
+ 213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405, 436,
+ 437, 468, 469, 500, 501, 532, 533, 564, 565, 596, 597, 628, 629, 660,
+ 661, 692, 693, 724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884,
+ 885, 916, 917, 948, 949, 980, 981, 1012, 21, 21, 22, 53, 54, 85,
+ 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, 246, 277, 278, 309,
+ 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, 501, 502, 533,
+ 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694, 725, 726, 757,
+ 758, 789, 790, 821, 822, 853, 854, 885, 886, 917, 918, 949, 950, 981,
+ 982, 1013, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182,
+ 183, 214, 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406,
+ 407, 438, 439, 470, 471, 502, 503, 534, 535, 566, 567, 598, 599, 630,
+ 631, 662, 663, 694, 695, 726, 727, 758, 759, 790, 791, 822, 823, 854,
+ 855, 886, 887, 918, 919, 950, 951, 982, 983, 1014, 23, 23, 24, 55,
+ 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, 248, 279,
+ 280, 311, 312, 343, 344, 375, 376, 407, 408, 439, 440, 471, 472, 503,
+ 504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696, 727,
+ 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919, 920, 951,
+ 952, 983, 984, 1015, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
+ 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376,
+ 377, 408, 409, 440, 441, 472, 473, 504, 505, 536, 537, 568, 569, 600,
+ 601, 632, 633, 664, 665, 696, 697, 728, 729, 760, 761, 792, 793, 824,
+ 825, 856, 857, 888, 889, 920, 921, 952, 953, 984, 985, 1016, 25, 25,
+ 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
+ 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410, 441, 442, 473,
+ 474, 505, 506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697,
+ 698, 729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921,
+ 922, 953, 954, 985, 986, 1017, 26, 26, 27, 58, 59, 90, 91, 122,
+ 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, 346,
+ 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 507, 538, 539, 570,
+ 571, 602, 603, 634, 635, 666, 667, 698, 699, 730, 731, 762, 763, 794,
+ 795, 826, 827, 858, 859, 890, 891, 922, 923, 954, 955, 986, 987, 1018,
+ 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219,
+ 220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412, 443,
+ 444, 475, 476, 507, 508, 539, 540, 571, 572, 603, 604, 635, 636, 667,
+ 668, 699, 700, 731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891,
+ 892, 923, 924, 955, 956, 987, 988, 1019, 28, 28, 29, 60, 61, 92,
+ 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, 284, 285, 316,
+ 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, 509, 540,
+ 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701, 732, 733, 764,
+ 765, 796, 797, 828, 829, 860, 861, 892, 893, 924, 925, 956, 957, 988,
+ 989, 1020, 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189,
+ 190, 221, 222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413,
+ 414, 445, 446, 477, 478, 509, 510, 541, 542, 573, 574, 605, 606, 637,
+ 638, 669, 670, 701, 702, 733, 734, 765, 766, 797, 798, 829, 830, 861,
+ 862, 893, 894, 925, 926, 957, 958, 989, 990, 1021, 30, 30, 31, 62,
+ 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, 255, 286,
+ 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, 447, 478, 479, 510,
+ 511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703, 734,
+ 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926, 927, 958,
+ 959, 990, 991, 1022, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
+ 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26,
+ 27, 27, 28, 28, 29, 29, 30, 30, 0, 0, 1, 32, 2, 33,
+ 3, 34, 4, 35, 5, 36, 6, 37, 7, 38, 8, 39, 9, 40,
+ 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46, 16, 47,
+ 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54,
+ 24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61,
+ 31, 62, 32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68,
+ 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, 43, 74, 44, 75,
+ 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, 81, 51, 82,
+ 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, 58, 89,
+ 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, 96,
+ 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110,
+ 80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117,
+ 87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124,
+ 94, 125, 95, 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131,
+ 101, 132, 102, 133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138,
+ 108, 139, 109, 140, 110, 141, 111, 142, 112, 143, 113, 144, 114, 145,
+ 115, 146, 116, 147, 117, 148, 118, 149, 119, 150, 120, 151, 121, 152,
+ 122, 153, 123, 154, 124, 155, 125, 156, 126, 157, 127, 158, 128, 128,
+ 129, 160, 130, 161, 131, 162, 132, 163, 133, 164, 134, 165, 135, 166,
+ 136, 167, 137, 168, 138, 169, 139, 170, 140, 171, 141, 172, 142, 173,
+ 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180,
+ 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186, 156, 187,
+ 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, 163, 194,
+ 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, 201,
+ 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215,
+ 185, 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222,
+ 192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229,
+ 199, 230, 200, 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236,
+ 206, 237, 207, 238, 208, 239, 209, 240, 210, 241, 211, 242, 212, 243,
+ 213, 244, 214, 245, 215, 246, 216, 247, 217, 248, 218, 249, 219, 250,
+ 220, 251, 221, 252, 222, 253, 223, 254, 224, 224, 225, 256, 226, 257,
+ 227, 258, 228, 259, 229, 260, 230, 261, 231, 262, 232, 263, 233, 264,
+ 234, 265, 235, 266, 236, 267, 237, 268, 238, 269, 239, 270, 240, 271,
+ 241, 272, 242, 273, 243, 274, 244, 275, 245, 276, 246, 277, 247, 278,
+ 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, 253, 284, 254, 285,
+ 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292,
+ 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, 268, 299,
+ 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, 306,
+ 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
+ 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320,
+ 290, 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327,
+ 297, 328, 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334,
+ 304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341,
+ 311, 342, 312, 343, 313, 344, 314, 345, 315, 346, 316, 347, 317, 348,
+ 318, 349, 319, 350, 320, 320, 321, 352, 322, 353, 323, 354, 324, 355,
+ 325, 356, 326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362,
+ 332, 363, 333, 364, 334, 365, 335, 366, 336, 367, 337, 368, 338, 369,
+ 339, 370, 340, 371, 341, 372, 342, 373, 343, 374, 344, 375, 345, 376,
+ 346, 377, 347, 378, 348, 379, 349, 380, 350, 381, 351, 382, 352, 352,
+ 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, 358, 389, 359, 390,
+ 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, 396, 366, 397,
+ 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404,
+ 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, 411,
+ 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
+ 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425,
+ 395, 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432,
+ 402, 433, 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439,
+ 409, 440, 410, 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446,
+ 416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452, 422, 453,
+ 423, 454, 424, 455, 425, 456, 426, 457, 427, 458, 428, 459, 429, 460,
+ 430, 461, 431, 462, 432, 463, 433, 464, 434, 465, 435, 466, 436, 467,
+ 437, 468, 438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474,
+ 444, 475, 445, 476, 446, 477, 447, 478, 448, 448, 449, 480, 450, 481,
+ 451, 482, 452, 483, 453, 484, 454, 485, 455, 486, 456, 487, 457, 488,
+ 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, 463, 494, 464, 495,
+ 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, 501, 471, 502,
+ 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, 478, 509,
+ 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
+ 486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522, 492, 523,
+ 493, 524, 494, 525, 495, 526, 496, 527, 497, 528, 498, 529, 499, 530,
+ 500, 531, 501, 532, 502, 533, 503, 534, 504, 535, 505, 536, 506, 537,
+ 507, 538, 508, 539, 509, 540, 510, 541, 511, 542, 512, 512, 513, 544,
+ 514, 545, 515, 546, 516, 547, 517, 548, 518, 549, 519, 550, 520, 551,
+ 521, 552, 522, 553, 523, 554, 524, 555, 525, 556, 526, 557, 527, 558,
+ 528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564, 534, 565,
+ 535, 566, 536, 567, 537, 568, 538, 569, 539, 570, 540, 571, 541, 572,
+ 542, 573, 543, 574, 544, 544, 545, 576, 546, 577, 547, 578, 548, 579,
+ 549, 580, 550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586,
+ 556, 587, 557, 588, 558, 589, 559, 590, 560, 591, 561, 592, 562, 593,
+ 563, 594, 564, 595, 565, 596, 566, 597, 567, 598, 568, 599, 569, 600,
+ 570, 601, 571, 602, 572, 603, 573, 604, 574, 605, 575, 606, 576, 576,
+ 577, 608, 578, 609, 579, 610, 580, 611, 581, 612, 582, 613, 583, 614,
+ 584, 615, 585, 616, 586, 617, 587, 618, 588, 619, 589, 620, 590, 621,
+ 591, 622, 592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628,
+ 598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634, 604, 635,
+ 605, 636, 606, 637, 607, 638, 608, 608, 609, 640, 610, 641, 611, 642,
+ 612, 643, 613, 644, 614, 645, 615, 646, 616, 647, 617, 648, 618, 649,
+ 619, 650, 620, 651, 621, 652, 622, 653, 623, 654, 624, 655, 625, 656,
+ 626, 657, 627, 658, 628, 659, 629, 660, 630, 661, 631, 662, 632, 663,
+ 633, 664, 634, 665, 635, 666, 636, 667, 637, 668, 638, 669, 639, 670,
+ 640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676, 646, 677,
+ 647, 678, 648, 679, 649, 680, 650, 681, 651, 682, 652, 683, 653, 684,
+ 654, 685, 655, 686, 656, 687, 657, 688, 658, 689, 659, 690, 660, 691,
+ 661, 692, 662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698,
+ 668, 699, 669, 700, 670, 701, 671, 702, 672, 672, 673, 704, 674, 705,
+ 675, 706, 676, 707, 677, 708, 678, 709, 679, 710, 680, 711, 681, 712,
+ 682, 713, 683, 714, 684, 715, 685, 716, 686, 717, 687, 718, 688, 719,
+ 689, 720, 690, 721, 691, 722, 692, 723, 693, 724, 694, 725, 695, 726,
+ 696, 727, 697, 728, 698, 729, 699, 730, 700, 731, 701, 732, 702, 733,
+ 703, 734, 704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740,
+ 710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746, 716, 747,
+ 717, 748, 718, 749, 719, 750, 720, 751, 721, 752, 722, 753, 723, 754,
+ 724, 755, 725, 756, 726, 757, 727, 758, 728, 759, 729, 760, 730, 761,
+ 731, 762, 732, 763, 733, 764, 734, 765, 735, 766, 736, 736, 737, 768,
+ 738, 769, 739, 770, 740, 771, 741, 772, 742, 773, 743, 774, 744, 775,
+ 745, 776, 746, 777, 747, 778, 748, 779, 749, 780, 750, 781, 751, 782,
+ 752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788, 758, 789,
+ 759, 790, 760, 791, 761, 792, 762, 793, 763, 794, 764, 795, 765, 796,
+ 766, 797, 767, 798, 768, 768, 769, 800, 770, 801, 771, 802, 772, 803,
+ 773, 804, 774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810,
+ 780, 811, 781, 812, 782, 813, 783, 814, 784, 815, 785, 816, 786, 817,
+ 787, 818, 788, 819, 789, 820, 790, 821, 791, 822, 792, 823, 793, 824,
+ 794, 825, 795, 826, 796, 827, 797, 828, 798, 829, 799, 830, 800, 800,
+ 801, 832, 802, 833, 803, 834, 804, 835, 805, 836, 806, 837, 807, 838,
+ 808, 839, 809, 840, 810, 841, 811, 842, 812, 843, 813, 844, 814, 845,
+ 815, 846, 816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852,
+ 822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858, 828, 859,
+ 829, 860, 830, 861, 831, 862, 832, 832, 833, 864, 834, 865, 835, 866,
+ 836, 867, 837, 868, 838, 869, 839, 870, 840, 871, 841, 872, 842, 873,
+ 843, 874, 844, 875, 845, 876, 846, 877, 847, 878, 848, 879, 849, 880,
+ 850, 881, 851, 882, 852, 883, 853, 884, 854, 885, 855, 886, 856, 887,
+ 857, 888, 858, 889, 859, 890, 860, 891, 861, 892, 862, 893, 863, 894,
+ 864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900, 870, 901,
+ 871, 902, 872, 903, 873, 904, 874, 905, 875, 906, 876, 907, 877, 908,
+ 878, 909, 879, 910, 880, 911, 881, 912, 882, 913, 883, 914, 884, 915,
+ 885, 916, 886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922,
+ 892, 923, 893, 924, 894, 925, 895, 926, 896, 896, 897, 928, 898, 929,
+ 899, 930, 900, 931, 901, 932, 902, 933, 903, 934, 904, 935, 905, 936,
+ 906, 937, 907, 938, 908, 939, 909, 940, 910, 941, 911, 942, 912, 943,
+ 913, 944, 914, 945, 915, 946, 916, 947, 917, 948, 918, 949, 919, 950,
+ 920, 951, 921, 952, 922, 953, 923, 954, 924, 955, 925, 956, 926, 957,
+ 927, 958, 928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964,
+ 934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970, 940, 971,
+ 941, 972, 942, 973, 943, 974, 944, 975, 945, 976, 946, 977, 947, 978,
+ 948, 979, 949, 980, 950, 981, 951, 982, 952, 983, 953, 984, 954, 985,
+ 955, 986, 956, 987, 957, 988, 958, 989, 959, 990, 960, 960, 961, 992,
+ 962, 993, 963, 994, 964, 995, 965, 996, 966, 997, 967, 998, 968, 999,
+ 969, 1000, 970, 1001, 971, 1002, 972, 1003, 973, 1004, 974, 1005, 975, 1006,
+ 976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981, 1012, 982, 1013,
+ 983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020,
+ 990, 1021, 991, 1022, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 2, 2,
+ 2, 33, 33, 64, 64, 64, 96, 96, 65, 96, 34, 65, 3, 34,
+ 3, 3, 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128,
+ 160, 160, 129, 160, 98, 129, 67, 98, 36, 67, 5, 36, 5, 5,
+ 6, 6, 6, 37, 37, 68, 68, 99, 99, 130, 130, 161, 161, 192,
+ 192, 192, 224, 224, 193, 224, 162, 193, 131, 162, 100, 131, 69, 100,
+ 38, 69, 7, 38, 7, 7, 8, 8, 8, 39, 39, 70, 70, 101,
+ 101, 132, 132, 163, 163, 194, 194, 225, 225, 256, 256, 256, 288, 288,
+ 257, 288, 226, 257, 195, 226, 164, 195, 133, 164, 102, 133, 71, 102,
+ 40, 71, 9, 40, 9, 9, 10, 10, 10, 41, 41, 72, 72, 103,
+ 103, 134, 134, 165, 165, 196, 196, 227, 227, 258, 258, 289, 289, 320,
+ 320, 320, 352, 352, 321, 352, 290, 321, 259, 290, 228, 259, 197, 228,
+ 166, 197, 135, 166, 104, 135, 73, 104, 42, 73, 11, 42, 11, 11,
+ 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, 198,
+ 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
+ 416, 416, 385, 416, 354, 385, 323, 354, 292, 323, 261, 292, 230, 261,
+ 199, 230, 168, 199, 137, 168, 106, 137, 75, 106, 44, 75, 13, 44,
+ 13, 13, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169,
+ 169, 200, 200, 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386,
+ 386, 417, 417, 448, 448, 448, 480, 480, 449, 480, 418, 449, 387, 418,
+ 356, 387, 325, 356, 294, 325, 263, 294, 232, 263, 201, 232, 170, 201,
+ 139, 170, 108, 139, 77, 108, 46, 77, 15, 46, 15, 15, 16, 16,
+ 16, 47, 47, 78, 78, 109, 109, 140, 140, 171, 171, 202, 202, 233,
+ 233, 264, 264, 295, 295, 326, 326, 357, 357, 388, 388, 419, 419, 450,
+ 450, 481, 481, 512, 512, 512, 544, 544, 513, 544, 482, 513, 451, 482,
+ 420, 451, 389, 420, 358, 389, 327, 358, 296, 327, 265, 296, 234, 265,
+ 203, 234, 172, 203, 141, 172, 110, 141, 79, 110, 48, 79, 17, 48,
+ 17, 17, 18, 18, 18, 49, 49, 80, 80, 111, 111, 142, 142, 173,
+ 173, 204, 204, 235, 235, 266, 266, 297, 297, 328, 328, 359, 359, 390,
+ 390, 421, 421, 452, 452, 483, 483, 514, 514, 545, 545, 576, 576, 576,
+ 608, 608, 577, 608, 546, 577, 515, 546, 484, 515, 453, 484, 422, 453,
+ 391, 422, 360, 391, 329, 360, 298, 329, 267, 298, 236, 267, 205, 236,
+ 174, 205, 143, 174, 112, 143, 81, 112, 50, 81, 19, 50, 19, 19,
+ 20, 20, 20, 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206,
+ 206, 237, 237, 268, 268, 299, 299, 330, 330, 361, 361, 392, 392, 423,
+ 423, 454, 454, 485, 485, 516, 516, 547, 547, 578, 578, 609, 609, 640,
+ 640, 640, 672, 672, 641, 672, 610, 641, 579, 610, 548, 579, 517, 548,
+ 486, 517, 455, 486, 424, 455, 393, 424, 362, 393, 331, 362, 300, 331,
+ 269, 300, 238, 269, 207, 238, 176, 207, 145, 176, 114, 145, 83, 114,
+ 52, 83, 21, 52, 21, 21, 22, 22, 22, 53, 53, 84, 84, 115,
+ 115, 146, 146, 177, 177, 208, 208, 239, 239, 270, 270, 301, 301, 332,
+ 332, 363, 363, 394, 394, 425, 425, 456, 456, 487, 487, 518, 518, 549,
+ 549, 580, 580, 611, 611, 642, 642, 673, 673, 704, 704, 704, 736, 736,
+ 705, 736, 674, 705, 643, 674, 612, 643, 581, 612, 550, 581, 519, 550,
+ 488, 519, 457, 488, 426, 457, 395, 426, 364, 395, 333, 364, 302, 333,
+ 271, 302, 240, 271, 209, 240, 178, 209, 147, 178, 116, 147, 85, 116,
+ 54, 85, 23, 54, 23, 23, 24, 24, 24, 55, 55, 86, 86, 117,
+ 117, 148, 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334,
+ 334, 365, 365, 396, 396, 427, 427, 458, 458, 489, 489, 520, 520, 551,
+ 551, 582, 582, 613, 613, 644, 644, 675, 675, 706, 706, 737, 737, 768,
+ 768, 768, 800, 800, 769, 800, 738, 769, 707, 738, 676, 707, 645, 676,
+ 614, 645, 583, 614, 552, 583, 521, 552, 490, 521, 459, 490, 428, 459,
+ 397, 428, 366, 397, 335, 366, 304, 335, 273, 304, 242, 273, 211, 242,
+ 180, 211, 149, 180, 118, 149, 87, 118, 56, 87, 25, 56, 25, 25,
+ 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212,
+ 212, 243, 243, 274, 274, 305, 305, 336, 336, 367, 367, 398, 398, 429,
+ 429, 460, 460, 491, 491, 522, 522, 553, 553, 584, 584, 615, 615, 646,
+ 646, 677, 677, 708, 708, 739, 739, 770, 770, 801, 801, 832, 832, 832,
+ 864, 864, 833, 864, 802, 833, 771, 802, 740, 771, 709, 740, 678, 709,
+ 647, 678, 616, 647, 585, 616, 554, 585, 523, 554, 492, 523, 461, 492,
+ 430, 461, 399, 430, 368, 399, 337, 368, 306, 337, 275, 306, 244, 275,
+ 213, 244, 182, 213, 151, 182, 120, 151, 89, 120, 58, 89, 27, 58,
+ 27, 27, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183,
+ 183, 214, 214, 245, 245, 276, 276, 307, 307, 338, 338, 369, 369, 400,
+ 400, 431, 431, 462, 462, 493, 493, 524, 524, 555, 555, 586, 586, 617,
+ 617, 648, 648, 679, 679, 710, 710, 741, 741, 772, 772, 803, 803, 834,
+ 834, 865, 865, 896, 896, 896, 928, 928, 897, 928, 866, 897, 835, 866,
+ 804, 835, 773, 804, 742, 773, 711, 742, 680, 711, 649, 680, 618, 649,
+ 587, 618, 556, 587, 525, 556, 494, 525, 463, 494, 432, 463, 401, 432,
+ 370, 401, 339, 370, 308, 339, 277, 308, 246, 277, 215, 246, 184, 215,
+ 153, 184, 122, 153, 91, 122, 60, 91, 29, 60, 29, 29, 30, 30,
+ 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185, 216, 216, 247,
+ 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433, 433, 464,
+ 464, 495, 495, 526, 526, 557, 557, 588, 588, 619, 619, 650, 650, 681,
+ 681, 712, 712, 743, 743, 774, 774, 805, 805, 836, 836, 867, 867, 898,
+ 898, 929, 929, 960, 960, 960, 961, 992, 930, 961, 899, 930, 868, 899,
+ 837, 868, 806, 837, 775, 806, 744, 775, 713, 744, 682, 713, 651, 682,
+ 620, 651, 589, 620, 558, 589, 527, 558, 496, 527, 465, 496, 434, 465,
+ 403, 434, 372, 403, 341, 372, 310, 341, 279, 310, 248, 279, 217, 248,
+ 186, 217, 155, 186, 124, 155, 93, 124, 62, 93, 31, 62, 63, 94,
+ 94, 125, 125, 156, 156, 187, 187, 218, 218, 249, 249, 280, 280, 311,
+ 311, 342, 342, 373, 373, 404, 404, 435, 435, 466, 466, 497, 497, 528,
+ 528, 559, 559, 590, 590, 621, 621, 652, 652, 683, 683, 714, 714, 745,
+ 745, 776, 776, 807, 807, 838, 838, 869, 869, 900, 900, 931, 931, 962,
+ 962, 993, 963, 994, 932, 963, 901, 932, 870, 901, 839, 870, 808, 839,
+ 777, 808, 746, 777, 715, 746, 684, 715, 653, 684, 622, 653, 591, 622,
+ 560, 591, 529, 560, 498, 529, 467, 498, 436, 467, 405, 436, 374, 405,
+ 343, 374, 312, 343, 281, 312, 250, 281, 219, 250, 188, 219, 157, 188,
+ 126, 157, 95, 126, 127, 158, 158, 189, 189, 220, 220, 251, 251, 282,
+ 282, 313, 313, 344, 344, 375, 375, 406, 406, 437, 437, 468, 468, 499,
+ 499, 530, 530, 561, 561, 592, 592, 623, 623, 654, 654, 685, 685, 716,
+ 716, 747, 747, 778, 778, 809, 809, 840, 840, 871, 871, 902, 902, 933,
+ 933, 964, 964, 995, 965, 996, 934, 965, 903, 934, 872, 903, 841, 872,
+ 810, 841, 779, 810, 748, 779, 717, 748, 686, 717, 655, 686, 624, 655,
+ 593, 624, 562, 593, 531, 562, 500, 531, 469, 500, 438, 469, 407, 438,
+ 376, 407, 345, 376, 314, 345, 283, 314, 252, 283, 221, 252, 190, 221,
+ 159, 190, 191, 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377,
+ 377, 408, 408, 439, 439, 470, 470, 501, 501, 532, 532, 563, 563, 594,
+ 594, 625, 625, 656, 656, 687, 687, 718, 718, 749, 749, 780, 780, 811,
+ 811, 842, 842, 873, 873, 904, 904, 935, 935, 966, 966, 997, 967, 998,
+ 936, 967, 905, 936, 874, 905, 843, 874, 812, 843, 781, 812, 750, 781,
+ 719, 750, 688, 719, 657, 688, 626, 657, 595, 626, 564, 595, 533, 564,
+ 502, 533, 471, 502, 440, 471, 409, 440, 378, 409, 347, 378, 316, 347,
+ 285, 316, 254, 285, 223, 254, 255, 286, 286, 317, 317, 348, 348, 379,
+ 379, 410, 410, 441, 441, 472, 472, 503, 503, 534, 534, 565, 565, 596,
+ 596, 627, 627, 658, 658, 689, 689, 720, 720, 751, 751, 782, 782, 813,
+ 813, 844, 844, 875, 875, 906, 906, 937, 937, 968, 968, 999, 969, 1000,
+ 938, 969, 907, 938, 876, 907, 845, 876, 814, 845, 783, 814, 752, 783,
+ 721, 752, 690, 721, 659, 690, 628, 659, 597, 628, 566, 597, 535, 566,
+ 504, 535, 473, 504, 442, 473, 411, 442, 380, 411, 349, 380, 318, 349,
+ 287, 318, 319, 350, 350, 381, 381, 412, 412, 443, 443, 474, 474, 505,
+ 505, 536, 536, 567, 567, 598, 598, 629, 629, 660, 660, 691, 691, 722,
+ 722, 753, 753, 784, 784, 815, 815, 846, 846, 877, 877, 908, 908, 939,
+ 939, 970, 970, 1001, 971, 1002, 940, 971, 909, 940, 878, 909, 847, 878,
+ 816, 847, 785, 816, 754, 785, 723, 754, 692, 723, 661, 692, 630, 661,
+ 599, 630, 568, 599, 537, 568, 506, 537, 475, 506, 444, 475, 413, 444,
+ 382, 413, 351, 382, 383, 414, 414, 445, 445, 476, 476, 507, 507, 538,
+ 538, 569, 569, 600, 600, 631, 631, 662, 662, 693, 693, 724, 724, 755,
+ 755, 786, 786, 817, 817, 848, 848, 879, 879, 910, 910, 941, 941, 972,
+ 972, 1003, 973, 1004, 942, 973, 911, 942, 880, 911, 849, 880, 818, 849,
+ 787, 818, 756, 787, 725, 756, 694, 725, 663, 694, 632, 663, 601, 632,
+ 570, 601, 539, 570, 508, 539, 477, 508, 446, 477, 415, 446, 447, 478,
+ 478, 509, 509, 540, 540, 571, 571, 602, 602, 633, 633, 664, 664, 695,
+ 695, 726, 726, 757, 757, 788, 788, 819, 819, 850, 850, 881, 881, 912,
+ 912, 943, 943, 974, 974, 1005, 975, 1006, 944, 975, 913, 944, 882, 913,
+ 851, 882, 820, 851, 789, 820, 758, 789, 727, 758, 696, 727, 665, 696,
+ 634, 665, 603, 634, 572, 603, 541, 572, 510, 541, 479, 510, 511, 542,
+ 542, 573, 573, 604, 604, 635, 635, 666, 666, 697, 697, 728, 728, 759,
+ 759, 790, 790, 821, 821, 852, 852, 883, 883, 914, 914, 945, 945, 976,
+ 976, 1007, 977, 1008, 946, 977, 915, 946, 884, 915, 853, 884, 822, 853,
+ 791, 822, 760, 791, 729, 760, 698, 729, 667, 698, 636, 667, 605, 636,
+ 574, 605, 543, 574, 575, 606, 606, 637, 637, 668, 668, 699, 699, 730,
+ 730, 761, 761, 792, 792, 823, 823, 854, 854, 885, 885, 916, 916, 947,
+ 947, 978, 978, 1009, 979, 1010, 948, 979, 917, 948, 886, 917, 855, 886,
+ 824, 855, 793, 824, 762, 793, 731, 762, 700, 731, 669, 700, 638, 669,
+ 607, 638, 639, 670, 670, 701, 701, 732, 732, 763, 763, 794, 794, 825,
+ 825, 856, 856, 887, 887, 918, 918, 949, 949, 980, 980, 1011, 981, 1012,
+ 950, 981, 919, 950, 888, 919, 857, 888, 826, 857, 795, 826, 764, 795,
+ 733, 764, 702, 733, 671, 702, 703, 734, 734, 765, 765, 796, 796, 827,
+ 827, 858, 858, 889, 889, 920, 920, 951, 951, 982, 982, 1013, 983, 1014,
+ 952, 983, 921, 952, 890, 921, 859, 890, 828, 859, 797, 828, 766, 797,
+ 735, 766, 767, 798, 798, 829, 829, 860, 860, 891, 891, 922, 922, 953,
+ 953, 984, 984, 1015, 985, 1016, 954, 985, 923, 954, 892, 923, 861, 892,
+ 830, 861, 799, 830, 831, 862, 862, 893, 893, 924, 924, 955, 955, 986,
+ 986, 1017, 987, 1018, 956, 987, 925, 956, 894, 925, 863, 894, 895, 926,
+ 926, 957, 957, 988, 988, 1019, 989, 1020, 958, 989, 927, 958, 959, 990,
+ 990, 1021, 991, 1022, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = {
+ 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
+ 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18,
+ 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
+ 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28,
+ 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = {
+ 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18,
+ 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34,
+ 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50,
+ 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = {
+ 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57,
+ 1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62,
+ 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29,
+ 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38,
+ 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47,
+ 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56,
+ 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65,
+ 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74,
+ 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83,
+ 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140,
+ 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149,
+ 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158,
+ 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167,
+ 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176,
+ 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185,
+ 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194,
+ 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203,
+ 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250,
+ 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91,
+ 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211,
+ 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73,
+ 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193,
+ 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64,
+ 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184,
+ 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55,
+ 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175,
+ 183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46,
+ 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166,
+ 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28,
+ 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148,
+ 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225,
+ 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227,
+ 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229,
+ 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231,
+ 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233,
+ 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235,
+ 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237,
+ 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239,
+ 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241,
+ 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243,
+ 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245,
+ 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247,
+ 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249,
+ 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251,
+ 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
+ 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232,
+ 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,
+ 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217,
+ 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67,
+ 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187,
+ 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52,
+ 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172,
+ 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22,
+ 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7,
+ 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42,
+ 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53,
+ 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
+ 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36,
+ 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52,
+ 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68,
+ 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84,
+ 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100,
+ 59, 66, 73, 80, 87, 94, 101, 107, 67, 74, 81, 88, 95, 102, 108, 113,
+ 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122,
+ 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99,
+ 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106,
+ 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112,
+ 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117,
+ 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121,
+ 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124,
+ 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126,
+ 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105,
+ 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106,
+ 121, 136, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107,
+ 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108,
+ 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109,
+ 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110,
+ 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111,
+ 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112,
+ 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113,
+ 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114,
+ 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115,
+ 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116,
+ 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117,
+ 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118,
+ 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119,
+ 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344,
+ 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345,
+ 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346,
+ 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347,
+ 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348,
+ 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349,
+ 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350,
+ 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351,
+ 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352,
+ 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353,
+ 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354,
+ 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355,
+ 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356,
+ 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357,
+ 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358,
+ 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359,
+ 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506,
+ 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507,
+ 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119,
+ 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359,
+ 375, 391, 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103,
+ 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342,
+ 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88,
+ 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325,
+ 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74,
+ 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308,
+ 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61,
+ 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291,
+ 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49,
+ 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274,
+ 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38,
+ 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257,
+ 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28,
+ 37, 47, 58, 70, 83, 97, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465,
+ 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223,
+ 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453,
+ 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206,
+ 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440,
+ 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189,
+ 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426,
+ 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172,
+ 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411,
+ 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155,
+ 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395,
+ 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138,
+ 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378,
+ 394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121,
+ 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361,
+ 377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510,
+ 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344,
+ 360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506,
+ 509, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119,
+ 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118,
+ 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117,
+ 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116,
+ 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115,
+ 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114,
+ 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113,
+ 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112,
+ 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111,
+ 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110,
+ 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109,
+ 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108,
+ 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107,
+ 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106,
+ 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105,
+ 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253,
+ 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254,
+ 255
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90,
+ 91, 119, 120, 152, 153, 189, 190, 230, 231, 275, 276, 324, 325,
+ 377, 378, 434, 435, 495, 496, 2, 4, 7, 13, 16, 26, 29,
+ 43, 46, 64, 67, 89, 92, 118, 121, 151, 154, 188, 191, 229,
+ 232, 274, 277, 323, 326, 376, 379, 433, 436, 494, 497, 558, 3,
+ 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, 122,
+ 150, 155, 187, 192, 228, 233, 273, 278, 322, 327, 375, 380, 432,
+ 437, 493, 498, 557, 559, 9, 11, 18, 24, 31, 41, 48, 62,
+ 69, 87, 94, 116, 123, 149, 156, 186, 193, 227, 234, 272, 279,
+ 321, 328, 374, 381, 431, 438, 492, 499, 556, 560, 617, 10, 19,
+ 23, 32, 40, 49, 61, 70, 86, 95, 115, 124, 148, 157, 185,
+ 194, 226, 235, 271, 280, 320, 329, 373, 382, 430, 439, 491, 500,
+ 555, 561, 616, 618, 20, 22, 33, 39, 50, 60, 71, 85, 96,
+ 114, 125, 147, 158, 184, 195, 225, 236, 270, 281, 319, 330, 372,
+ 383, 429, 440, 490, 501, 554, 562, 615, 619, 672, 21, 34, 38,
+ 51, 59, 72, 84, 97, 113, 126, 146, 159, 183, 196, 224, 237,
+ 269, 282, 318, 331, 371, 384, 428, 441, 489, 502, 553, 563, 614,
+ 620, 671, 673, 35, 37, 52, 58, 73, 83, 98, 112, 127, 145,
+ 160, 182, 197, 223, 238, 268, 283, 317, 332, 370, 385, 427, 442,
+ 488, 503, 552, 564, 613, 621, 670, 674, 723, 36, 53, 57, 74,
+ 82, 99, 111, 128, 144, 161, 181, 198, 222, 239, 267, 284, 316,
+ 333, 369, 386, 426, 443, 487, 504, 551, 565, 612, 622, 669, 675,
+ 722, 724, 54, 56, 75, 81, 100, 110, 129, 143, 162, 180, 199,
+ 221, 240, 266, 285, 315, 334, 368, 387, 425, 444, 486, 505, 550,
+ 566, 611, 623, 668, 676, 721, 725, 770, 55, 76, 80, 101, 109,
+ 130, 142, 163, 179, 200, 220, 241, 265, 286, 314, 335, 367, 388,
+ 424, 445, 485, 506, 549, 567, 610, 624, 667, 677, 720, 726, 769,
+ 771, 77, 79, 102, 108, 131, 141, 164, 178, 201, 219, 242, 264,
+ 287, 313, 336, 366, 389, 423, 446, 484, 507, 548, 568, 609, 625,
+ 666, 678, 719, 727, 768, 772, 813, 78, 103, 107, 132, 140, 165,
+ 177, 202, 218, 243, 263, 288, 312, 337, 365, 390, 422, 447, 483,
+ 508, 547, 569, 608, 626, 665, 679, 718, 728, 767, 773, 812, 814,
+ 104, 106, 133, 139, 166, 176, 203, 217, 244, 262, 289, 311, 338,
+ 364, 391, 421, 448, 482, 509, 546, 570, 607, 627, 664, 680, 717,
+ 729, 766, 774, 811, 815, 852, 105, 134, 138, 167, 175, 204, 216,
+ 245, 261, 290, 310, 339, 363, 392, 420, 449, 481, 510, 545, 571,
+ 606, 628, 663, 681, 716, 730, 765, 775, 810, 816, 851, 853, 135,
+ 137, 168, 174, 205, 215, 246, 260, 291, 309, 340, 362, 393, 419,
+ 450, 480, 511, 544, 572, 605, 629, 662, 682, 715, 731, 764, 776,
+ 809, 817, 850, 854, 887, 136, 169, 173, 206, 214, 247, 259, 292,
+ 308, 341, 361, 394, 418, 451, 479, 512, 543, 573, 604, 630, 661,
+ 683, 714, 732, 763, 777, 808, 818, 849, 855, 886, 888, 170, 172,
+ 207, 213, 248, 258, 293, 307, 342, 360, 395, 417, 452, 478, 513,
+ 542, 574, 603, 631, 660, 684, 713, 733, 762, 778, 807, 819, 848,
+ 856, 885, 889, 918, 171, 208, 212, 249, 257, 294, 306, 343, 359,
+ 396, 416, 453, 477, 514, 541, 575, 602, 632, 659, 685, 712, 734,
+ 761, 779, 806, 820, 847, 857, 884, 890, 917, 919, 209, 211, 250,
+ 256, 295, 305, 344, 358, 397, 415, 454, 476, 515, 540, 576, 601,
+ 633, 658, 686, 711, 735, 760, 780, 805, 821, 846, 858, 883, 891,
+ 916, 920, 945, 210, 251, 255, 296, 304, 345, 357, 398, 414, 455,
+ 475, 516, 539, 577, 600, 634, 657, 687, 710, 736, 759, 781, 804,
+ 822, 845, 859, 882, 892, 915, 921, 944, 946, 252, 254, 297, 303,
+ 346, 356, 399, 413, 456, 474, 517, 538, 578, 599, 635, 656, 688,
+ 709, 737, 758, 782, 803, 823, 844, 860, 881, 893, 914, 922, 943,
+ 947, 968, 253, 298, 302, 347, 355, 400, 412, 457, 473, 518, 537,
+ 579, 598, 636, 655, 689, 708, 738, 757, 783, 802, 824, 843, 861,
+ 880, 894, 913, 923, 942, 948, 967, 969, 299, 301, 348, 354, 401,
+ 411, 458, 472, 519, 536, 580, 597, 637, 654, 690, 707, 739, 756,
+ 784, 801, 825, 842, 862, 879, 895, 912, 924, 941, 949, 966, 970,
+ 987, 300, 349, 353, 402, 410, 459, 471, 520, 535, 581, 596, 638,
+ 653, 691, 706, 740, 755, 785, 800, 826, 841, 863, 878, 896, 911,
+ 925, 940, 950, 965, 971, 986, 988, 350, 352, 403, 409, 460, 470,
+ 521, 534, 582, 595, 639, 652, 692, 705, 741, 754, 786, 799, 827,
+ 840, 864, 877, 897, 910, 926, 939, 951, 964, 972, 985, 989, 1002,
+ 351, 404, 408, 461, 469, 522, 533, 583, 594, 640, 651, 693, 704,
+ 742, 753, 787, 798, 828, 839, 865, 876, 898, 909, 927, 938, 952,
+ 963, 973, 984, 990, 1001, 1003, 405, 407, 462, 468, 523, 532, 584,
+ 593, 641, 650, 694, 703, 743, 752, 788, 797, 829, 838, 866, 875,
+ 899, 908, 928, 937, 953, 962, 974, 983, 991, 1000, 1004, 1013, 406,
+ 463, 467, 524, 531, 585, 592, 642, 649, 695, 702, 744, 751, 789,
+ 796, 830, 837, 867, 874, 900, 907, 929, 936, 954, 961, 975, 982,
+ 992, 999, 1005, 1012, 1014, 464, 466, 525, 530, 586, 591, 643, 648,
+ 696, 701, 745, 750, 790, 795, 831, 836, 868, 873, 901, 906, 930,
+ 935, 955, 960, 976, 981, 993, 998, 1006, 1011, 1015, 1020, 465, 526,
+ 529, 587, 590, 644, 647, 697, 700, 746, 749, 791, 794, 832, 835,
+ 869, 872, 902, 905, 931, 934, 956, 959, 977, 980, 994, 997, 1007,
+ 1010, 1016, 1019, 1021, 527, 528, 588, 589, 645, 646, 698, 699, 747,
+ 748, 792, 793, 833, 834, 870, 871, 903, 904, 932, 933, 957, 958,
+ 978, 979, 995, 996, 1008, 1009, 1017, 1018, 1022, 1023
+};
+
+const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+ // Half of the tx64 coefficients at higher frequencies are set to zero,
+ // so tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+};
+
+const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
+ {
+ // TX_4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ },
+ {
+ // TX_8X8
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ },
+ {
+ // TX_16X16
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ },
+ {
+ // TX_32X32
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ },
+ {
+ // TX_64X64
+ // Half of the tx64 coefficients at higher frequencies are set to zero,
+ // so tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ },
+ {
+ // TX_4X8
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ },
+ {
+ // TX_8X4
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ },
+ {
+ // TX_8X16
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ },
+ {
+ // TX_16X8
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ },
+ {
+ // TX_16X32
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ },
+ {
+ // TX_32X16
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ },
+ {
+ // TX_32X64
+ // Half of the tx64 coefficients at higher frequencies are set to zero,
+ // so tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ },
+ {
+ // TX_64X32
+ // Half of the tx64 coefficients at higher frequencies are set to zero,
+ // so tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ },
+ {
+ // TX_4X16
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ },
+ {
+ // TX_16X4
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ },
+ {
+ // TX_8X32
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ },
+ {
+ // TX_32X8
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ },
+ {
+ // TX_16X64
+ // Half of the tx64 coefficients at higher frequencies are set to zero,
+ // so tx32's scan order is used.
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ },
+ {
+ // TX_64X16
+ // Half of the tx64 coefficients at higher frequencies are set to zero,
+ // so tx32's scan order is used.
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ },
+};
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
new file mode 100644
index 000000000..233dc0efa
--- /dev/null
+++ b/third_party/aom/av1/common/scan.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+typedef enum SCAN_MODE {
+ SCAN_MODE_ZIG_ZAG,
+ SCAN_MODE_COL_DIAG,
+ SCAN_MODE_ROW_DIAG,
+ SCAN_MODE_COL_1D,
+ SCAN_MODE_ROW_1D,
+ SCAN_MODES
+} SCAN_MODE;
+
+extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
+extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
+
+void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ return &av1_scan_orders[tx_size][tx_type];
+}
+
+static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
+ return get_default_scan(tx_size, tx_type);
+}
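+
+// Illustrative only: a minimal sketch (kept under #if 0, not compiled) of how
+// a caller could walk a block's coefficients in scan order via get_scan().
+// It assumes SCAN_ORDER's members are named scan / iscan / neighbors,
+// matching the order of the initializers in scan.c; the coeffs and eob
+// parameters are placeholders supplied by the caller.
+#if 0
+static INLINE int count_nonzero_in_scan_order(const int32_t *coeffs,
+                                              TX_SIZE tx_size, TX_TYPE tx_type,
+                                              int eob) {
+  const SCAN_ORDER *const so = get_scan(tx_size, tx_type);
+  int nzc = 0;
+  for (int i = 0; i < eob; ++i) {
+    // so->scan[i] maps the i-th scan position to a raster coefficient index.
+    if (coeffs[so->scan[i]] != 0) ++nzc;
+  }
+  return nzc;
+}
+#endif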
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.c b/third_party/aom/av1/common/seg_common.c
new file mode 100644
index 000000000..cd189ad76
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/quant_common.h"
+
+static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 1, 1, 1, 0, 0 };
+
+static const int seg_feature_data_max[SEG_LVL_MAX] = {
+ MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 7, 0
+};
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out", but for the moment
+// the coding mechanism is still subject to change, so they provide a
+// convenient single point of change.
+
+void av1_clearall_segfeatures(struct segmentation *seg) {
+ av1_zero(seg->feature_data);
+ av1_zero(seg->feature_mask);
+}
+
+void calculate_segdata(struct segmentation *seg) {
+ seg->segid_preskip = 0;
+ seg->last_active_segid = 0;
+ for (int i = 0; i < MAX_SEGMENTS; i++) {
+ for (int j = 0; j < SEG_LVL_MAX; j++) {
+ if (seg->feature_mask[i] & (1 << j)) {
+ seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME);
+ seg->last_active_segid = i;
+ }
+ }
+ }
+}
+
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] |= 1 << feature_id;
+}
+
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+ return seg_feature_data_max[feature_id];
+}
+
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+ return seg_feature_data_signed[feature_id];
+}
+
+// The 'seg_data' given for each segment can be either deltas (from the
+// default value chosen for the frame) or absolute values.
+//
+// The valid range for absolute values is 0-127 for MB_LVL_ALT_Q and
+// 0-63 for SEGMENT_ALT_LF; the valid range for delta values is
+// +/-127 for MB_LVL_ALT_Q and +/-63 for SEGMENT_ALT_LF.
+//
+// abs_delta = SEGMENT_DELTADATA means deltas are coded; abs_delta =
+// SEGMENT_ABSDATA means the absolute values given are used directly.
+
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id, int seg_data) {
+ if (seg_data < 0) {
+ assert(seg_feature_data_signed[feature_id]);
+ assert(-seg_data <= seg_feature_data_max[feature_id]);
+ } else {
+ assert(seg_data <= seg_feature_data_max[feature_id]);
+ }
+
+ seg->feature_data[segment_id][feature_id] = seg_data;
+}
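+
+// Illustrative only: a minimal sketch (kept under #if 0, not compiled) of the
+// call sequence a caller could use to give segment 1 a quantizer delta of
+// -10. It relies solely on the helpers above and on the field names of
+// struct segmentation from seg_common.h.
+#if 0
+static void example_configure_segment(struct segmentation *seg) {
+  av1_clearall_segfeatures(seg);
+  seg->enabled = 1;
+  seg->update_map = 1;
+  seg->update_data = 1;
+  av1_enable_segfeature(seg, /*segment_id=*/1, SEG_LVL_ALT_Q);
+  av1_set_segdata(seg, /*segment_id=*/1, SEG_LVL_ALT_Q, /*seg_data=*/-10);
+  calculate_segdata(seg);  // refresh segid_preskip / last_active_segid
+}
+#endif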
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
new file mode 100644
index 000000000..8c35bba86
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_SEGMENTS 8
+#define SEG_TREE_PROBS (MAX_SEGMENTS - 1)
+
+#define SEG_TEMPORAL_PRED_CTXS 3
+#define SPATIAL_PREDICTION_PROBS 3
+
+typedef enum {
+ SEG_LVL_ALT_Q, // Use alternate Quantizer ....
+ SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical
+ SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal
+ SEG_LVL_ALT_LF_U, // Use alternate loop filter value on u plane
+ SEG_LVL_ALT_LF_V, // Use alternate loop filter value on v plane
+ SEG_LVL_REF_FRAME, // Optional Segment reference frame
+ SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode
+ SEG_LVL_GLOBALMV,
+ SEG_LVL_MAX
+} SEG_LVL_FEATURES;
+
+struct segmentation {
+ uint8_t enabled;
+ uint8_t update_map;
+ uint8_t update_data;
+ uint8_t temporal_update;
+
+ int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+ unsigned int feature_mask[MAX_SEGMENTS];
+ int last_active_segid; // The highest numbered segment id that has some
+ // enabled feature.
+ uint8_t segid_preskip; // Whether the segment id will be read before the
+ // skip syntax element.
+ // 1: the segment id will be read first.
+ // 0: the skip syntax element will be read first.
+};
+
+struct segmentation_probs {
+ aom_cdf_prob tree_cdf[CDF_SIZE(MAX_SEGMENTS)];
+ aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)];
+ aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS]
+ [CDF_SIZE(MAX_SEGMENTS)];
+};
+
+static INLINE int segfeature_active(const struct segmentation *seg,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
+}
+
+static INLINE void segfeatures_copy(struct segmentation *dst,
+ const struct segmentation *src) {
+ int i, j;
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ dst->feature_mask[i] = src->feature_mask[i];
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ dst->feature_data[i][j] = src->feature_data[i][j];
+ }
+ }
+ dst->segid_preskip = src->segid_preskip;
+ dst->last_active_segid = src->last_active_segid;
+}
+
+void av1_clearall_segfeatures(struct segmentation *seg);
+
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void calculate_segdata(struct segmentation *seg);
+
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
+
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id, int seg_data);
+
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ return seg->feature_data[segment_id][feature_id];
+}
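+
+// Illustrative only: a minimal sketch (kept under #if 0, not compiled) of the
+// usual consumer pattern, where every get_segdata() read is gated on
+// segfeature_active(). The base_qindex parameter and the 0..255 clamp are
+// placeholders, not part of this header.
+#if 0
+static INLINE int example_segment_qindex(const struct segmentation *seg,
+                                         int segment_id, int base_qindex) {
+  if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    const int q = base_qindex + get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+    return q < 0 ? 0 : (q > 255 ? 255 : q);
+  }
+  return base_qindex;
+}
+#endif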
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
new file mode 100644
index 000000000..8df4c9a09
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.c
@@ -0,0 +1,786 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/reconinter.h"
+
+// Set up nsync by width.
+static INLINE int get_sync_range(int width) {
+ // nsync numbers are picked by testing. For example, for 4k
+ // video, using 4 gives best performance.
+ if (width < 640)
+ return 1;
+ else if (width <= 1280)
+ return 2;
+ else if (width <= 4096)
+ return 4;
+ else
+ return 8;
+}
+
+static INLINE int get_lr_sync_range(int width) {
+#if 0
+ // nsync numbers are picked by testing. For example, for 4k
+ // video, using 4 gives best performance.
+ if (width < 640)
+ return 1;
+ else if (width <= 1280)
+ return 2;
+ else if (width <= 4096)
+ return 4;
+ else
+ return 8;
+#else
+ (void)width;
+ return 1;
+#endif
+}
+
+// Allocate memory for lf row synchronization
+static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+ int width, int num_workers) {
+ lf_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i, j;
+
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ CHECK_MEM_ERROR(cm, lf_sync->mutex_[j],
+ aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows));
+ if (lf_sync->mutex_[j]) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->mutex_[j][i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->cond_[j],
+ aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows));
+ if (lf_sync->cond_[j]) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->cond_[j][i], NULL);
+ }
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->job_mutex,
+ aom_malloc(sizeof(*(lf_sync->job_mutex))));
+ if (lf_sync->job_mutex) {
+ pthread_mutex_init(lf_sync->job_mutex, NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ aom_malloc(num_workers * sizeof(*(lf_sync->lfdata))));
+ lf_sync->num_workers = num_workers;
+
+ for (int j = 0; j < MAX_MB_PLANE; j++) {
+ CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j],
+ aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows));
+ }
+ CHECK_MEM_ERROR(
+ cm, lf_sync->job_queue,
+ aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2));
+ // Set up nsync.
+ lf_sync->sync_range = get_sync_range(width);
+}
+
+// Deallocate lf synchronization related mutex and data
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
+ if (lf_sync != NULL) {
+ int j;
+#if CONFIG_MULTITHREAD
+ int i;
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ if (lf_sync->mutex_[j] != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_mutex_destroy(&lf_sync->mutex_[j][i]);
+ }
+ aom_free(lf_sync->mutex_[j]);
+ }
+ if (lf_sync->cond_[j] != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_cond_destroy(&lf_sync->cond_[j][i]);
+ }
+ aom_free(lf_sync->cond_[j]);
+ }
+ }
+ if (lf_sync->job_mutex != NULL) {
+ pthread_mutex_destroy(lf_sync->job_mutex);
+ aom_free(lf_sync->job_mutex);
+ }
+#endif // CONFIG_MULTITHREAD
+ aom_free(lf_sync->lfdata);
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ aom_free(lf_sync->cur_sb_col[j]);
+ }
+
+ aom_free(lf_sync->job_queue);
+ // Clear the structure, as the source of this call may be a resize, in
+ // which case it will be followed by an _alloc() that may fail.
+ av1_zero(*lf_sync);
+ }
+}
+
+static void loop_filter_data_reset(LFWorkerData *lf_data,
+ YV12_BUFFER_CONFIG *frame_buffer,
+ struct AV1Common *cm, MACROBLOCKD *xd) {
+ struct macroblockd_plane *pd = xd->plane;
+ lf_data->frame_buffer = frame_buffer;
+ lf_data->cm = cm;
+ lf_data->xd = xd;
+ for (int i = 0; i < MAX_MB_PLANE; i++) {
+ memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst));
+ lf_data->planes[i].subsampling_x = pd[i].subsampling_x;
+ lf_data->planes[i].subsampling_y = pd[i].subsampling_y;
+ }
+}
+
+static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c,
+ int plane) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) {
+ pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
+ const int sb_cols, int plane) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+ int cur;
+ // Only signal when enough SBs have been filtered for the next row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&lf_sync->mutex_[plane][r]);
+
+ lf_sync->cur_sb_col[plane][r] = cur;
+
+ pthread_cond_broadcast(&lf_sync->cond_[plane][r]);
+ pthread_mutex_unlock(&lf_sync->mutex_[plane][r]);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
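+
+// Worked example of the wavefront dependency enforced by sync_read() and
+// sync_write() above (illustrative numbers): with sync_range = 4, a worker
+// reaching column c = 8 of row r blocks in sync_read() until
+// cur_sb_col[plane][r - 1] >= c + 4, i.e. until the row above has published
+// at least column 12 via sync_write(). The final sync_write() of a row
+// stores sb_cols + nsync, which is large enough to release every reader of
+// the row below.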
+
+static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
+ int stop, int plane_start, int plane_end) {
+ int mi_row, plane, dir;
+ AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
+ lf_sync->jobs_enqueued = 0;
+ lf_sync->jobs_dequeued = 0;
+
+ for (dir = 0; dir < 2; dir++) {
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+ break;
+ else if (plane == 1 && !(cm->lf.filter_level_u))
+ continue;
+ else if (plane == 2 && !(cm->lf.filter_level_v))
+ continue;
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ lf_job_queue->mi_row = mi_row;
+ lf_job_queue->plane = plane;
+ lf_job_queue->dir = dir;
+ lf_job_queue++;
+ lf_sync->jobs_enqueued++;
+ }
+ }
+ }
+}
+
+AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
+ AV1LfMTInfo *cur_job_info = NULL;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lf_sync->job_mutex);
+
+ if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
+ cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
+ lf_sync->jobs_dequeued++;
+ }
+
+ pthread_mutex_unlock(lf_sync->job_mutex);
+#else
+ (void)lf_sync;
+#endif
+
+ return cur_job_info;
+}
+
+// Implement row loopfiltering for each thread.
+static INLINE void thread_loop_filter_rows(
+ const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+ struct macroblockd_plane *planes, MACROBLOCKD *xd,
+ AV1LfSync *const lf_sync) {
+ const int sb_cols =
+ ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+ int mi_row, mi_col, plane, dir;
+ int r, c;
+
+ while (1) {
+ AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
+
+ if (cur_job_info != NULL) {
+ mi_row = cur_job_info->mi_row;
+ plane = cur_job_info->plane;
+ dir = cur_job_info->dir;
+ r = mi_row >> MAX_MIB_SIZE_LOG2;
+
+ if (dir == 0) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+ av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+ mi_row, mi_col, plane, plane + 1);
+
+ av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+ mi_col);
+ sync_write(lf_sync, r, c, sb_cols, plane);
+ }
+ } else if (dir == 1) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+ // Wait for vertical edge filtering of the top-right block to be
+ // completed
+ sync_read(lf_sync, r, c, plane);
+
+ // Wait for vertical edge filtering of the right block to be
+ // completed
+ sync_read(lf_sync, r + 1, c, plane);
+
+ av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+ mi_row, mi_col, plane, plane + 1);
+ av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+ mi_col);
+ }
+ }
+ } else {
+ break;
+ }
+ }
+}
+
+// Row-based multi-threaded loopfilter hook
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+ LFWorkerData *const lf_data = (LFWorkerData *)arg2;
+ thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+ lf_data->xd, lf_sync);
+ return 1;
+}
+
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int start, int stop,
+ int plane_start, int plane_end,
+ AVxWorker *workers, int nworkers,
+ AV1LfSync *lf_sync) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ // Number of superblock rows
+ const int sb_rows =
+ ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+ const int num_workers = nworkers;
+ int i;
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_workers > lf_sync->num_workers) {
+ av1_loop_filter_dealloc(lf_sync);
+ loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+ }
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ memset(lf_sync->cur_sb_col[i], -1,
+ sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
+ }
+
+ enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end);
+
+ // Set up loopfilter thread data.
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = loop_filter_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ loop_filter_data_reset(lf_data, frame, cm, xd);
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+}
+
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane_start, int plane_end,
+ int partial_frame, AVxWorker *workers,
+ int num_workers, AV1LfSync *lf_sync) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+ loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
+ plane_end, workers, num_workers, lf_sync);
+}
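+
+// Illustrative only: a minimal sketch (kept under #if 0, not compiled) of
+// driving the multi-threaded loop filter over all three planes of a frame.
+// The frame, cm, xd, workers and lf_sync objects are assumed to have been
+// set up by the caller (e.g. by the decoder's frame worker).
+#if 0
+static void example_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                                    MACROBLOCKD *xd, AVxWorker *workers,
+                                    int num_workers, AV1LfSync *lf_sync) {
+  // plane_start = 0 and plane_end = MAX_MB_PLANE cover Y, U and V;
+  // partial_frame = 0 filters every superblock row.
+  av1_loop_filter_frame_mt(frame, cm, xd, /*plane_start=*/0,
+                           /*plane_end=*/MAX_MB_PLANE, /*partial_frame=*/0,
+                           workers, num_workers, lf_sync);
+}
+#endif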
+
+static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
+#if CONFIG_MULTITHREAD
+ AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
+ const int nsync = loop_res_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) {
+ pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+static INLINE void lr_sync_write(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane) {
+#if CONFIG_MULTITHREAD
+ AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
+ const int nsync = loop_res_sync->sync_range;
+ int cur;
+ // Only signal when enough SBs have been filtered for the next row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]);
+
+ loop_res_sync->cur_sb_col[plane][r] = cur;
+
+ pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]);
+ pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]);
+ }
+#else
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Allocate memory for loop restoration row synchronization
+static void loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+ int num_workers, int num_rows_lr,
+ int num_planes, int width) {
+ lr_sync->rows = num_rows_lr;
+ lr_sync->num_planes = num_planes;
+#if CONFIG_MULTITHREAD
+ {
+ int i, j;
+
+ for (j = 0; j < num_planes; j++) {
+ CHECK_MEM_ERROR(cm, lr_sync->mutex_[j],
+ aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr));
+ if (lr_sync->mutex_[j]) {
+ for (i = 0; i < num_rows_lr; ++i) {
+ pthread_mutex_init(&lr_sync->mutex_[j][i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lr_sync->cond_[j],
+ aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr));
+ if (lr_sync->cond_[j]) {
+ for (i = 0; i < num_rows_lr; ++i) {
+ pthread_cond_init(&lr_sync->cond_[j][i], NULL);
+ }
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lr_sync->job_mutex,
+ aom_malloc(sizeof(*(lr_sync->job_mutex))));
+ if (lr_sync->job_mutex) {
+ pthread_mutex_init(lr_sync->job_mutex, NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata,
+ aom_malloc(num_workers * sizeof(*(lr_sync->lrworkerdata))));
+
+ for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+ if (worker_idx < num_workers - 1) {
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf,
+ (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs,
+ aom_malloc(sizeof(RestorationLineBuffers)));
+
+ } else {
+ lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf;
+ lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs;
+ }
+ }
+
+ lr_sync->num_workers = num_workers;
+
+ for (int j = 0; j < num_planes; j++) {
+ CHECK_MEM_ERROR(
+ cm, lr_sync->cur_sb_col[j],
+ aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr));
+ }
+ CHECK_MEM_ERROR(
+ cm, lr_sync->job_queue,
+ aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes));
+ // Set up nsync.
+ lr_sync->sync_range = get_lr_sync_range(width);
+}
+
+// Deallocate loop restoration synchronization related mutex and data
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers) {
+ if (lr_sync != NULL) {
+ int j;
+#if CONFIG_MULTITHREAD
+ int i;
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ if (lr_sync->mutex_[j] != NULL) {
+ for (i = 0; i < lr_sync->rows; ++i) {
+ pthread_mutex_destroy(&lr_sync->mutex_[j][i]);
+ }
+ aom_free(lr_sync->mutex_[j]);
+ }
+ if (lr_sync->cond_[j] != NULL) {
+ for (i = 0; i < lr_sync->rows; ++i) {
+ pthread_cond_destroy(&lr_sync->cond_[j][i]);
+ }
+ aom_free(lr_sync->cond_[j]);
+ }
+ }
+ if (lr_sync->job_mutex != NULL) {
+ pthread_mutex_destroy(lr_sync->job_mutex);
+ aom_free(lr_sync->job_mutex);
+ }
+#endif // CONFIG_MULTITHREAD
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ aom_free(lr_sync->cur_sb_col[j]);
+ }
+
+ aom_free(lr_sync->job_queue);
+
+ if (lr_sync->lrworkerdata) {
+ for (int worker_idx = 0; worker_idx < num_workers - 1; worker_idx++) {
+ LRWorkerData *const workerdata_data =
+ lr_sync->lrworkerdata + worker_idx;
+
+ aom_free(workerdata_data->rst_tmpbuf);
+ aom_free(workerdata_data->rlbs);
+ }
+ aom_free(lr_sync->lrworkerdata);
+ }
+
+ // Clear the structure, as the source of this call may be a resize, in
+ // which case it will be followed by an _alloc() that may fail.
+ av1_zero(*lr_sync);
+ }
+}
+
+static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
+ AV1_COMMON *cm) {
+ FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+ const int num_planes = av1_num_planes(cm);
+ AV1LrMTInfo *lr_job_queue = lr_sync->job_queue;
+ int32_t lr_job_counter[2], num_even_lr_jobs = 0;
+ lr_sync->jobs_enqueued = 0;
+ lr_sync->jobs_dequeued = 0;
+
+ for (int plane = 0; plane < num_planes; plane++) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+ num_even_lr_jobs =
+ num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1);
+ }
+ lr_job_counter[0] = 0;
+ lr_job_counter[1] = num_even_lr_jobs;
+
+ for (int plane = 0; plane < num_planes; plane++) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+ AV1PixelRect tile_rect = ctxt[plane].tile_rect;
+ const int unit_size = ctxt[plane].rsi->restoration_unit_size;
+
+ const int tile_h = tile_rect.bottom - tile_rect.top;
+ const int ext_size = unit_size * 3 / 2;
+
+ int y0 = 0, i = 0;
+ while (y0 < tile_h) {
+ int remaining_h = tile_h - y0;
+ int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+
+ RestorationTileLimits limits;
+ limits.v_start = tile_rect.top + y0;
+ limits.v_end = tile_rect.top + y0 + h;
+ assert(limits.v_end <= tile_rect.bottom);
+ // Offset the tile upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset);
+ if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset;
+
+ assert(lr_job_counter[0] <= num_even_lr_jobs);
+
+ lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i;
+ lr_job_queue[lr_job_counter[i & 1]].plane = plane;
+ lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start;
+ lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end;
+ lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1;
+ if ((i & 1) == 0) {
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+ limits.v_start + RESTORATION_BORDER;
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+ limits.v_end - RESTORATION_BORDER;
+ if (i == 0) {
+ assert(limits.v_start == tile_rect.top);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top;
+ }
+ if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) {
+ assert(limits.v_end == tile_rect.bottom);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom;
+ }
+ } else {
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+ AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+ AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom);
+ }
+ lr_job_counter[i & 1]++;
+ lr_sync->jobs_enqueued++;
+
+ y0 += h;
+ ++i;
+ }
+ }
+}
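+
+// Illustrative example of the queue layout produced above (not normative):
+// for a single plane with 5 LR unit rows, num_even_lr_jobs = (5 + 1) >> 1 = 3,
+// so rows 0, 2 and 4 occupy queue slots 0-2 with sync_mode = 0 (write-only
+// sync), while rows 1 and 3 occupy slots 3-4 with sync_mode = 1 (read-only
+// sync). Workers therefore pick up all even rows before any odd row, and the
+// odd rows synchronize against the progress published by their neighbours.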
+
+AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
+ AV1LrMTInfo *cur_job_info = NULL;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lr_sync->job_mutex);
+
+ if (lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
+ cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued;
+ lr_sync->jobs_dequeued++;
+ }
+
+ pthread_mutex_unlock(lr_sync->job_mutex);
+#else
+ (void)lr_sync;
+#endif
+
+ return cur_job_info;
+}
+
+// Implement row loop restoration for each thread.
+static int loop_restoration_row_worker(void *arg1, void *arg2) {
+ AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
+ LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
+ AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
+ FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+ int lr_unit_row;
+ int plane;
+ const int tile_row = LR_TILE_ROW;
+ const int tile_col = LR_TILE_COL;
+ const int tile_cols = LR_TILE_COLS;
+ const int tile_idx = tile_col + tile_row * tile_cols;
+ typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+ int vstart, int vend);
+ static const copy_fun copy_funs[3] = {
+ aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
+ };
+
+ while (1) {
+ AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
+ if (cur_job_info != NULL) {
+ RestorationTileLimits limits;
+ sync_read_fn_t on_sync_read;
+ sync_write_fn_t on_sync_write;
+ limits.v_start = cur_job_info->v_start;
+ limits.v_end = cur_job_info->v_end;
+ lr_unit_row = cur_job_info->lr_unit_row;
+ plane = cur_job_info->plane;
+ const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile;
+
+ // sync_mode == 1 implies only sync read is required in LR Multi-threading
+ // sync_mode == 0 implies only sync write is required.
+ on_sync_read =
+ cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
+ on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
+ : av1_lr_sync_write_dummy;
+
+ av1_foreach_rest_unit_in_row(
+ &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row,
+ ctxt[plane].rsi->restoration_unit_size, unit_idx0,
+ ctxt[plane].rsi->horz_units_per_tile,
+ ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane],
+ lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
+ on_sync_write, lr_sync);
+
+ copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
+ ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
+ cur_job_info->v_copy_end);
+ } else {
+ break;
+ }
+ }
+ return 1;
+}
+
+static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
+ AVxWorker *workers, int nworkers,
+ AV1LrSync *lr_sync, AV1_COMMON *cm) {
+ FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+ const int num_planes = av1_num_planes(cm);
+
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int num_rows_lr = 0;
+
+ for (int plane = 0; plane < num_planes; plane++) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
+ const AV1PixelRect tile_rect = ctxt[plane].tile_rect;
+ const int max_tile_h = tile_rect.bottom - tile_rect.top;
+
+ const int unit_size = cm->rst_info[plane].restoration_unit_size;
+
+ num_rows_lr =
+ AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
+ }
+
+ const int num_workers = nworkers;
+ int i;
+ assert(MAX_MB_PLANE == 3);
+
+ if (!lr_sync->sync_range || num_rows_lr != lr_sync->rows ||
+ num_workers > lr_sync->num_workers || num_planes != lr_sync->num_planes) {
+ av1_loop_restoration_dealloc(lr_sync, num_workers);
+ loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes,
+ cm->width);
+ }
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ for (i = 0; i < num_planes; i++) {
+ memset(lr_sync->cur_sb_col[i], -1,
+ sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr);
+ }
+
+ enqueue_lr_jobs(lr_sync, lr_ctxt, cm);
+
+ // Set up looprestoration thread data.
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
+ worker->hook = loop_restoration_row_worker;
+ worker->data1 = lr_sync;
+ worker->data2 = &lr_sync->lrworkerdata[i];
+
+    // Start loop restoration filtering; the last worker runs synchronously on
+    // the current thread, the others are launched asynchronously.
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+  // Wait until all workers have finished.
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+}
+
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int optimized_lr,
+ AVxWorker *workers, int num_workers,
+ AV1LrSync *lr_sync, void *lr_ctxt) {
+ assert(!cm->all_lossless);
+
+ const int num_planes = av1_num_planes(cm);
+
+ AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
+
+ av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+ optimized_lr, num_planes);
+
+ foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync,
+ cm);
+}
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
new file mode 100644
index 000000000..23d61d72a
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_
+#define AOM_AV1_COMMON_THREAD_COMMON_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_loopfilter.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+
+typedef struct AV1LfMTInfo {
+ int mi_row;
+ int plane;
+ int dir;
+} AV1LfMTInfo;
+
+// Loopfilter row synchronization
+typedef struct AV1LfSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_[MAX_MB_PLANE];
+ pthread_cond_t *cond_[MAX_MB_PLANE];
+#endif
+ // Allocate memory to store the loop-filtered superblock index in each row.
+ int *cur_sb_col[MAX_MB_PLANE];
+  // The optimal sync_range for different resolutions and platforms should be
+  // determined by testing. Currently, it is chosen to be a power-of-two value.
+ int sync_range;
+ int rows;
+
+ // Row-based parallel loopfilter data
+ LFWorkerData *lfdata;
+ int num_workers;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex;
+#endif
+ AV1LfMTInfo *job_queue;
+ int jobs_enqueued;
+ int jobs_dequeued;
+} AV1LfSync;
+
+typedef struct AV1LrMTInfo {
+ int v_start;
+ int v_end;
+ int lr_unit_row;
+ int plane;
+ int sync_mode;
+ int v_copy_start;
+ int v_copy_end;
+} AV1LrMTInfo;
+
+typedef struct LoopRestorationWorkerData {
+ int32_t *rst_tmpbuf;
+ void *rlbs;
+ void *lr_ctxt;
+} LRWorkerData;
+
+// Looprestoration row synchronization
+typedef struct AV1LrSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_[MAX_MB_PLANE];
+ pthread_cond_t *cond_[MAX_MB_PLANE];
+#endif
+ // Allocate memory to store the loop-restoration block index in each row.
+ int *cur_sb_col[MAX_MB_PLANE];
+  // The optimal sync_range for different resolutions and platforms should be
+  // determined by testing. Currently, it is chosen to be a power-of-two value.
+ int sync_range;
+ int rows;
+ int num_planes;
+
+ int num_workers;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex;
+#endif
+  // Row-based parallel loop restoration data
+ LRWorkerData *lrworkerdata;
+
+ AV1LrMTInfo *job_queue;
+ int jobs_enqueued;
+ int jobs_dequeued;
+} AV1LrSync;
+
+// Deallocate loopfilter synchronization related mutex and data.
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
+
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int plane_start,
+ int plane_end, int partial_frame,
+ AVxWorker *workers, int num_workers,
+ AV1LfSync *lf_sync);
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int optimized_lr, AVxWorker *workers,
+ int num_workers, AV1LrSync *lr_sync,
+ void *lr_ctxt);
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_THREAD_COMMON_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
new file mode 100644
index 000000000..1b413487f
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/tile_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
+ av1_tile_set_row(tile, cm, row);
+ av1_tile_set_col(tile, cm, col);
+}
+
+// Find smallest k>=0 such that (blk_size << k) >= target
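+// For example, tile_log2(1, 5) == 3 (since 1 << 3 == 8 >= 5), and
+// tile_log2(4, 3) == 0 (since 4 << 0 == 4 >= 3).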
+static int tile_log2(int blk_size, int target) {
+ int k;
+ for (k = 0; (blk_size << k) < target; k++) {
+ }
+ return k;
+}
+
+void av1_get_tile_limits(AV1_COMMON *const cm) {
+ int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+ int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
+ int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+
+ int sb_size_log2 = cm->seq_params.mib_size_log2 + MI_SIZE_LOG2;
+ cm->max_tile_width_sb = MAX_TILE_WIDTH >> sb_size_log2;
+ int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2);
+
+ cm->min_log2_tile_cols = tile_log2(cm->max_tile_width_sb, sb_cols);
+ cm->max_log2_tile_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
+ cm->max_log2_tile_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
+ cm->min_log2_tiles = tile_log2(max_tile_area_sb, sb_cols * sb_rows);
+ cm->min_log2_tiles = AOMMAX(cm->min_log2_tiles, cm->min_log2_tile_cols);
+}
+
+void av1_calculate_tile_cols(AV1_COMMON *const cm) {
+ int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+ int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
+ int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+ int i;
+
+ if (cm->uniform_tile_spacing_flag) {
+ int start_sb;
+ int size_sb = ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols);
+ size_sb >>= cm->log2_tile_cols;
+ assert(size_sb > 0);
+ for (i = 0, start_sb = 0; start_sb < sb_cols; i++) {
+ cm->tile_col_start_sb[i] = start_sb;
+ start_sb += size_sb;
+ }
+ cm->tile_cols = i;
+ cm->tile_col_start_sb[i] = sb_cols;
+ cm->min_log2_tile_rows = AOMMAX(cm->min_log2_tiles - cm->log2_tile_cols, 0);
+ cm->max_tile_height_sb = sb_rows >> cm->min_log2_tile_rows;
+
+ cm->tile_width = size_sb << cm->seq_params.mib_size_log2;
+ cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+ } else {
+ int max_tile_area_sb = (sb_rows * sb_cols);
+ int widest_tile_sb = 1;
+ cm->log2_tile_cols = tile_log2(1, cm->tile_cols);
+ for (i = 0; i < cm->tile_cols; i++) {
+ int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
+ widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
+ }
+ if (cm->min_log2_tiles) {
+ max_tile_area_sb >>= (cm->min_log2_tiles + 1);
+ }
+ cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
+ }
+}
+
+void av1_calculate_tile_rows(AV1_COMMON *const cm) {
+ int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+ int start_sb, size_sb, i;
+
+ if (cm->uniform_tile_spacing_flag) {
+ size_sb = ALIGN_POWER_OF_TWO(sb_rows, cm->log2_tile_rows);
+ size_sb >>= cm->log2_tile_rows;
+ assert(size_sb > 0);
+ for (i = 0, start_sb = 0; start_sb < sb_rows; i++) {
+ cm->tile_row_start_sb[i] = start_sb;
+ start_sb += size_sb;
+ }
+ cm->tile_rows = i;
+ cm->tile_row_start_sb[i] = sb_rows;
+
+ cm->tile_height = size_sb << cm->seq_params.mib_size_log2;
+ cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
+ } else {
+ cm->log2_tile_rows = tile_log2(1, cm->tile_rows);
+ }
+}
+
+void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
+ assert(row < cm->tile_rows);
+ int mi_row_start = cm->tile_row_start_sb[row] << cm->seq_params.mib_size_log2;
+ int mi_row_end = cm->tile_row_start_sb[row + 1]
+ << cm->seq_params.mib_size_log2;
+ tile->tile_row = row;
+ tile->mi_row_start = mi_row_start;
+ tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_rows);
+ assert(tile->mi_row_end > tile->mi_row_start);
+}
+
+void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
+ assert(col < cm->tile_cols);
+ int mi_col_start = cm->tile_col_start_sb[col] << cm->seq_params.mib_size_log2;
+ int mi_col_end = cm->tile_col_start_sb[col + 1]
+ << cm->seq_params.mib_size_log2;
+ tile->tile_col = col;
+ tile->mi_col_start = mi_col_start;
+ tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_cols);
+ assert(tile->mi_col_end > tile->mi_col_start);
+}
+
+int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) {
+ int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
+ tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+ return sb_rows;
+}
+
+int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
+ int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
+ tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
+ int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+ return sb_cols;
+}
+
+int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
+ // Round the frame up to a whole number of max superblocks
+ mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
+
+  // Divide by the signalled number of tiles, rounding up to a multiple of
+  // the max superblock size. To do this, shift right (and round up) to get the
+  // tile size in max superblocks, then shift left again to convert it back to
+  // mi units.
+ const int shift = log2_tile_num + MAX_MIB_SIZE_LOG2;
+ const int max_sb_tile_size =
+ ALIGN_POWER_OF_TWO(mi_frame_size, shift) >> shift;
+ const int mi_tile_size = max_sb_tile_size << MAX_MIB_SIZE_LOG2;
+
+ // The actual number of tiles is the ceiling of the frame size in mi units
+ // divided by mi_size. This is at most 1 << log2_tile_num but might be
+ // strictly less if max_sb_tile_size got rounded up significantly.
+ if (ntiles) {
+ *ntiles = (mi_frame_size + mi_tile_size - 1) / mi_tile_size;
+ assert(*ntiles <= (1 << log2_tile_num));
+ }
+
+ return mi_tile_size;
+}
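+
+// Worked example (assuming MAX_MIB_SIZE_LOG2 == 5, i.e. 128x128 superblocks):
+// get_tile_size(67, 2, &ntiles) rounds 67 up to 96 mi units, computes
+// max_sb_tile_size = ALIGN_POWER_OF_TWO(96, 2 + 5) >> 7 = 1, returns
+// mi_tile_size = 1 << 5 = 32, and sets ntiles = (96 + 31) / 32 = 3, which is
+// at most 1 << 2 = 4 as asserted.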
+
+AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
+ int is_uv) {
+ AV1PixelRect r;
+
+ // Calculate position in the Y plane
+ r.left = tile_info->mi_col_start * MI_SIZE;
+ r.right = tile_info->mi_col_end * MI_SIZE;
+ r.top = tile_info->mi_row_start * MI_SIZE;
+ r.bottom = tile_info->mi_row_end * MI_SIZE;
+
+ // If upscaling is enabled, the tile limits need scaling to match the
+ // upscaled frame where the restoration units live. To do this, scale up the
+ // top-left and bottom-right of the tile.
+ if (av1_superres_scaled(cm)) {
+ av1_calculate_unscaled_superres_size(&r.left, &r.top,
+ cm->superres_scale_denominator);
+ av1_calculate_unscaled_superres_size(&r.right, &r.bottom,
+ cm->superres_scale_denominator);
+ }
+
+ const int frame_w = cm->superres_upscaled_width;
+ const int frame_h = cm->superres_upscaled_height;
+
+ // Make sure we don't fall off the bottom-right of the frame.
+ r.right = AOMMIN(r.right, frame_w);
+ r.bottom = AOMMIN(r.bottom, frame_h);
+
+ // Convert to coordinates in the appropriate plane
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+ r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
+ r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
+ r.top = ROUND_POWER_OF_TWO(r.top, ss_y);
+ r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y);
+
+ return r;
+}
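+
+// Illustrative case (assuming no superres scaling, 4:2:0 subsampling and a
+// sufficiently large frame): a tile covering mi columns [0, 32) and mi rows
+// [0, 16) yields a Y-plane rect of [0, 128) x [0, 64); for a UV plane
+// (is_uv = 1, ss_x = ss_y = 1) ROUND_POWER_OF_TWO halves it to
+// [0, 64) x [0, 32).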
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
new file mode 100644
index 000000000..c03553dc6
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+struct AV1Common;
+
+#define DEFAULT_MAX_NUM_TG 1
+
+typedef struct TileInfo {
+ int mi_row_start, mi_row_end;
+ int mi_col_start, mi_col_end;
+ int tg_horz_boundary;
+ int tile_row;
+ int tile_col;
+} TileInfo;
+
+// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on
+// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
+void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
+ int col);
+
+void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
+void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
+void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
+ int *max_log2_tile_cols);
+
+// Calculate the correct tile size (width or height) for (1 << log2_tile_num)
+// tiles horizontally or vertically in the frame.
+int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
+
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+
+typedef struct {
+ int left, top, right, bottom;
+} AV1PixelRect;
+
+// Return the pixel extents of the given tile
+AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
+ const struct AV1Common *cm, int is_uv);
+
+// Maximum tile width and area.
+// There is no explicit maximum height, since height is bounded by the area and
+// width limits. The minimum tile width or height is one superblock.
+#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels
+#define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels
+
+void av1_get_tile_limits(struct AV1Common *const cm);
+void av1_calculate_tile_cols(struct AV1Common *const cm);
+void av1_calculate_tile_rows(struct AV1Common *const cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c
new file mode 100644
index 000000000..49dbde78f
--- /dev/null
+++ b/third_party/aom/av1/common/timing.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/timing.h"
+
+/* Tables of AV1 maximum bitrates for the levels of Main and High tier.
+ * The specification lists these values in Mbps; the tables below use kbps.
+ * Note that, depending on the profile, a multiplier is applied.
+ */
+
+/* Max bitrates for levels of Main tier in kbps. The value in main_kbps[31] */
+/* is a dummy; the decoder model is not applicable for level 31. */
+static int32_t main_kbps[1 << LEVEL_BITS] = {
+ 1500, 3000, 0, 0, 6000, 10000, 0, 0, 12000, 20000, 0,
+ 0, 30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, (1 << 26)
+};
+
+/* Max bitrates for levels of High tier in kbps. The value in high_kbps[31] */
+/* is a dummy; the decoder model is not applicable for level 31. */
+static int32_t high_kbps[1 << LEVEL_BITS] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 30000, 50000, 0, 0, 100000, 160000, 240000, 240000,
+ 240000, 480000, 800000, 800000, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, (1 << 26)
+};
+
+/* BitrateProfileFactor */
+static int bitrate_profile_factor[1 << PROFILE_BITS] = {
+ 1, 2, 3, 0, 0, 0, 0, 0
+};
+
+int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+ int seq_tier) {
+ int64_t bitrate;
+
+ if (seq_tier) {
+ bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
+ } else {
+ bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
+ }
+
+ return bitrate * 1000;
+}
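+
+// For example, assuming PROFILE_0 == 0 and seq_level_idx 8 (level 4.0):
+// max_level_bitrate(PROFILE_0, 8, /*seq_tier=*/0) returns
+// main_kbps[8] * bitrate_profile_factor[0] * 1000 = 12000 * 1 * 1000
+// = 12,000,000 bits/s.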
+
+void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
+ decoder_model->encoder_decoder_buffer_delay_length = 16;
+ decoder_model->buffer_removal_time_length = 10;
+ decoder_model->frame_presentation_time_length = 10;
+}
+
+void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
+ op_params->decoder_model_param_present_flag = 1;
+ op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s
+ op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s
+ op_params->low_delay_mode_flag = 0;
+ op_params->display_model_param_present_flag = 1;
+ op_params->initial_display_delay = 8; // 8 frames delay
+}
+
+void set_resource_availability_parameters(
+ aom_dec_model_op_parameters_t *op_params) {
+ op_params->decoder_model_param_present_flag = 0;
+ op_params->decoder_buffer_delay =
+ 70000; // Resource availability mode default
+ op_params->encoder_buffer_delay =
+ 20000; // Resource availability mode default
+ op_params->low_delay_mode_flag = 0; // Resource availability mode default
+ op_params->display_model_param_present_flag = 1;
+ op_params->initial_display_delay = 8; // 8 frames delay
+}
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
new file mode 100644
index 000000000..06939ae43
--- /dev/null
+++ b/third_party/aom/av1/common/timing.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TIMING_H_
+#define AOM_AV1_COMMON_TIMING_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+#define MAX_NUM_OP_POINTS 32
+
+typedef struct aom_timing {
+ uint32_t num_units_in_display_tick;
+ uint32_t time_scale;
+ int equal_picture_interval;
+ uint32_t num_ticks_per_picture;
+} aom_timing_info_t;
+
+typedef struct aom_dec_model_info {
+ uint32_t num_units_in_decoding_tick;
+ int encoder_decoder_buffer_delay_length;
+ int buffer_removal_time_length;
+ int frame_presentation_time_length;
+} aom_dec_model_info_t;
+
+typedef struct aom_dec_model_op_parameters {
+ int decoder_model_param_present_flag;
+ int64_t bitrate;
+ int64_t buffer_size;
+ uint32_t decoder_buffer_delay;
+ uint32_t encoder_buffer_delay;
+ int low_delay_mode_flag;
+ int display_model_param_present_flag;
+ int initial_display_delay;
+} aom_dec_model_op_parameters_t;
+
+typedef struct aom_op_timing_info_t {
+ uint32_t buffer_removal_time;
+} aom_op_timing_info_t;
+
+void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
+
+void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);
+
+void set_resource_availability_parameters(
+ aom_dec_model_op_parameters_t *op_params);
+
+int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+ int seq_tier);
+
+#endif // AOM_AV1_COMMON_TIMING_H_
diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h
new file mode 100644
index 000000000..53e956450
--- /dev/null
+++ b/third_party/aom/av1/common/token_cdfs.h
@@ -0,0 +1,3555 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_
+#define AOM_AV1_COMMON_TOKEN_CDFS_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/entropy.h"
+
+static const aom_cdf_prob
+ av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { {
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ };
+
+static const aom_cdf_prob
+ av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS]
+ [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) },
+ { AOM_CDF2(5892) },
+ { AOM_CDF2(12112) },
+ { AOM_CDF2(21935) },
+ { AOM_CDF2(20289) },
+ { AOM_CDF2(27473) },
+ { AOM_CDF2(32487) },
+ { AOM_CDF2(7654) },
+ { AOM_CDF2(19473) },
+ { AOM_CDF2(29984) },
+ { AOM_CDF2(9961) },
+ { AOM_CDF2(30242) },
+ { AOM_CDF2(32117) } },
+ { { AOM_CDF2(31548) },
+ { AOM_CDF2(1549) },
+ { AOM_CDF2(10130) },
+ { AOM_CDF2(16656) },
+ { AOM_CDF2(18591) },
+ { AOM_CDF2(26308) },
+ { AOM_CDF2(32537) },
+ { AOM_CDF2(5403) },
+ { AOM_CDF2(18096) },
+ { AOM_CDF2(30003) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(29957) },
+ { AOM_CDF2(5391) },
+ { AOM_CDF2(18039) },
+ { AOM_CDF2(23566) },
+ { AOM_CDF2(22431) },
+ { AOM_CDF2(25822) },
+ { AOM_CDF2(32197) },
+ { AOM_CDF2(3778) },
+ { AOM_CDF2(15336) },
+ { AOM_CDF2(28981) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(17920) },
+ { AOM_CDF2(1818) },
+ { AOM_CDF2(7282) },
+ { AOM_CDF2(25273) },
+ { AOM_CDF2(10923) },
+ { AOM_CDF2(31554) },
+ { AOM_CDF2(32624) },
+ { AOM_CDF2(1366) },
+ { AOM_CDF2(15628) },
+ { AOM_CDF2(30462) },
+ { AOM_CDF2(146) },
+ { AOM_CDF2(5132) },
+ { AOM_CDF2(31657) } },
+ { { AOM_CDF2(6308) },
+ { AOM_CDF2(117) },
+ { AOM_CDF2(1638) },
+ { AOM_CDF2(2161) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(10923) },
+ { AOM_CDF2(30247) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(30371) },
+ { AOM_CDF2(7570) },
+ { AOM_CDF2(13155) },
+ { AOM_CDF2(20751) },
+ { AOM_CDF2(20969) },
+ { AOM_CDF2(27067) },
+ { AOM_CDF2(32013) },
+ { AOM_CDF2(5495) },
+ { AOM_CDF2(17942) },
+ { AOM_CDF2(28280) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31782) },
+ { AOM_CDF2(1836) },
+ { AOM_CDF2(10689) },
+ { AOM_CDF2(17604) },
+ { AOM_CDF2(21622) },
+ { AOM_CDF2(27518) },
+ { AOM_CDF2(32399) },
+ { AOM_CDF2(4419) },
+ { AOM_CDF2(16294) },
+ { AOM_CDF2(28345) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31901) },
+ { AOM_CDF2(10311) },
+ { AOM_CDF2(18047) },
+ { AOM_CDF2(24806) },
+ { AOM_CDF2(23288) },
+ { AOM_CDF2(27914) },
+ { AOM_CDF2(32296) },
+ { AOM_CDF2(4215) },
+ { AOM_CDF2(15756) },
+ { AOM_CDF2(28341) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(26726) },
+ { AOM_CDF2(1045) },
+ { AOM_CDF2(11703) },
+ { AOM_CDF2(20590) },
+ { AOM_CDF2(18554) },
+ { AOM_CDF2(25970) },
+ { AOM_CDF2(31938) },
+ { AOM_CDF2(5583) },
+ { AOM_CDF2(21313) },
+ { AOM_CDF2(29390) },
+ { AOM_CDF2(641) },
+ { AOM_CDF2(22265) },
+ { AOM_CDF2(31452) } },
+ { { AOM_CDF2(26584) },
+ { AOM_CDF2(188) },
+ { AOM_CDF2(8847) },
+ { AOM_CDF2(24519) },
+ { AOM_CDF2(22938) },
+ { AOM_CDF2(30583) },
+ { AOM_CDF2(32608) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(29614) },
+ { AOM_CDF2(9068) },
+ { AOM_CDF2(12924) },
+ { AOM_CDF2(19538) },
+ { AOM_CDF2(17737) },
+ { AOM_CDF2(24619) },
+ { AOM_CDF2(30642) },
+ { AOM_CDF2(4119) },
+ { AOM_CDF2(16026) },
+ { AOM_CDF2(25657) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31957) },
+ { AOM_CDF2(3230) },
+ { AOM_CDF2(11153) },
+ { AOM_CDF2(18123) },
+ { AOM_CDF2(20143) },
+ { AOM_CDF2(26536) },
+ { AOM_CDF2(31986) },
+ { AOM_CDF2(3050) },
+ { AOM_CDF2(14603) },
+ { AOM_CDF2(25155) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(32363) },
+ { AOM_CDF2(10692) },
+ { AOM_CDF2(19090) },
+ { AOM_CDF2(24357) },
+ { AOM_CDF2(24442) },
+ { AOM_CDF2(28312) },
+ { AOM_CDF2(32169) },
+ { AOM_CDF2(3648) },
+ { AOM_CDF2(15690) },
+ { AOM_CDF2(26815) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(30669) },
+ { AOM_CDF2(3832) },
+ { AOM_CDF2(11663) },
+ { AOM_CDF2(18889) },
+ { AOM_CDF2(19782) },
+ { AOM_CDF2(23313) },
+ { AOM_CDF2(31330) },
+ { AOM_CDF2(5124) },
+ { AOM_CDF2(18719) },
+ { AOM_CDF2(28468) },
+ { AOM_CDF2(3082) },
+ { AOM_CDF2(20982) },
+ { AOM_CDF2(29443) } },
+ { { AOM_CDF2(28573) },
+ { AOM_CDF2(3183) },
+ { AOM_CDF2(17802) },
+ { AOM_CDF2(25977) },
+ { AOM_CDF2(26677) },
+ { AOM_CDF2(27832) },
+ { AOM_CDF2(32387) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(26887) },
+ { AOM_CDF2(6729) },
+ { AOM_CDF2(10361) },
+ { AOM_CDF2(17442) },
+ { AOM_CDF2(15045) },
+ { AOM_CDF2(22478) },
+ { AOM_CDF2(29072) },
+ { AOM_CDF2(2713) },
+ { AOM_CDF2(11861) },
+ { AOM_CDF2(20773) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31903) },
+ { AOM_CDF2(2044) },
+ { AOM_CDF2(7528) },
+ { AOM_CDF2(14618) },
+ { AOM_CDF2(16182) },
+ { AOM_CDF2(24168) },
+ { AOM_CDF2(31037) },
+ { AOM_CDF2(2786) },
+ { AOM_CDF2(11194) },
+ { AOM_CDF2(20155) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(32510) },
+ { AOM_CDF2(8430) },
+ { AOM_CDF2(17318) },
+ { AOM_CDF2(24154) },
+ { AOM_CDF2(23674) },
+ { AOM_CDF2(28789) },
+ { AOM_CDF2(32139) },
+ { AOM_CDF2(3440) },
+ { AOM_CDF2(13117) },
+ { AOM_CDF2(22702) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31671) },
+ { AOM_CDF2(2056) },
+ { AOM_CDF2(11746) },
+ { AOM_CDF2(16852) },
+ { AOM_CDF2(18635) },
+ { AOM_CDF2(24715) },
+ { AOM_CDF2(31484) },
+ { AOM_CDF2(4656) },
+ { AOM_CDF2(16074) },
+ { AOM_CDF2(24704) },
+ { AOM_CDF2(1806) },
+ { AOM_CDF2(14645) },
+ { AOM_CDF2(25336) } },
+ { { AOM_CDF2(31539) },
+ { AOM_CDF2(8433) },
+ { AOM_CDF2(20576) },
+ { AOM_CDF2(27904) },
+ { AOM_CDF2(27852) },
+ { AOM_CDF2(30026) },
+ { AOM_CDF2(32441) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = {
+ { { {
+ { AOM_CDF2(16961) },
+ { AOM_CDF2(17223) },
+ { AOM_CDF2(7621) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(19069) },
+ { AOM_CDF2(22525) },
+ { AOM_CDF2(13377) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20401) },
+ { AOM_CDF2(17025) },
+ { AOM_CDF2(12845) },
+ { AOM_CDF2(12873) },
+ { AOM_CDF2(14094) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20681) },
+ { AOM_CDF2(20701) },
+ { AOM_CDF2(15250) },
+ { AOM_CDF2(15017) },
+ { AOM_CDF2(14928) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(23905) },
+ { AOM_CDF2(17194) },
+ { AOM_CDF2(16170) },
+ { AOM_CDF2(17695) },
+ { AOM_CDF2(13826) },
+ { AOM_CDF2(15810) },
+ { AOM_CDF2(12036) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(23959) },
+ { AOM_CDF2(20799) },
+ { AOM_CDF2(19021) },
+ { AOM_CDF2(16203) },
+ { AOM_CDF2(17886) },
+ { AOM_CDF2(14144) },
+ { AOM_CDF2(12010) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(27399) },
+ { AOM_CDF2(16327) },
+ { AOM_CDF2(18071) },
+ { AOM_CDF2(19584) },
+ { AOM_CDF2(20721) },
+ { AOM_CDF2(18432) },
+ { AOM_CDF2(19560) },
+ { AOM_CDF2(10150) },
+ { AOM_CDF2(8805) },
+ },
+ {
+ { AOM_CDF2(24932) },
+ { AOM_CDF2(20833) },
+ { AOM_CDF2(12027) },
+ { AOM_CDF2(16670) },
+ { AOM_CDF2(19914) },
+ { AOM_CDF2(15106) },
+ { AOM_CDF2(17662) },
+ { AOM_CDF2(13783) },
+ { AOM_CDF2(28756) },
+ } },
+ { {
+ { AOM_CDF2(23406) },
+ { AOM_CDF2(21845) },
+ { AOM_CDF2(18432) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(17096) },
+ { AOM_CDF2(12561) },
+ { AOM_CDF2(17320) },
+ { AOM_CDF2(22395) },
+ { AOM_CDF2(21370) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(17471) },
+ { AOM_CDF2(20223) },
+ { AOM_CDF2(11357) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20335) },
+ { AOM_CDF2(21667) },
+ { AOM_CDF2(14818) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20430) },
+ { AOM_CDF2(20662) },
+ { AOM_CDF2(15367) },
+ { AOM_CDF2(16970) },
+ { AOM_CDF2(14657) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22117) },
+ { AOM_CDF2(22028) },
+ { AOM_CDF2(18650) },
+ { AOM_CDF2(16042) },
+ { AOM_CDF2(15885) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(22409) },
+ { AOM_CDF2(21012) },
+ { AOM_CDF2(15650) },
+ { AOM_CDF2(17395) },
+ { AOM_CDF2(15469) },
+ { AOM_CDF2(20205) },
+ { AOM_CDF2(19511) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(24220) },
+ { AOM_CDF2(22480) },
+ { AOM_CDF2(17737) },
+ { AOM_CDF2(18916) },
+ { AOM_CDF2(19268) },
+ { AOM_CDF2(18412) },
+ { AOM_CDF2(18844) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(25991) },
+ { AOM_CDF2(20314) },
+ { AOM_CDF2(17731) },
+ { AOM_CDF2(19678) },
+ { AOM_CDF2(18649) },
+ { AOM_CDF2(17307) },
+ { AOM_CDF2(21798) },
+ { AOM_CDF2(17549) },
+ { AOM_CDF2(15630) },
+ },
+ {
+ { AOM_CDF2(26585) },
+ { AOM_CDF2(21469) },
+ { AOM_CDF2(20432) },
+ { AOM_CDF2(17735) },
+ { AOM_CDF2(19280) },
+ { AOM_CDF2(15235) },
+ { AOM_CDF2(20297) },
+ { AOM_CDF2(22471) },
+ { AOM_CDF2(28997) },
+ } },
+ { {
+ { AOM_CDF2(26605) },
+ { AOM_CDF2(11304) },
+ { AOM_CDF2(16726) },
+ { AOM_CDF2(16560) },
+ { AOM_CDF2(20866) },
+ { AOM_CDF2(23524) },
+ { AOM_CDF2(19878) },
+ { AOM_CDF2(13469) },
+ { AOM_CDF2(23084) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(18983) },
+ { AOM_CDF2(20512) },
+ { AOM_CDF2(14885) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20090) },
+ { AOM_CDF2(19444) },
+ { AOM_CDF2(17286) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19139) },
+ { AOM_CDF2(21487) },
+ { AOM_CDF2(18959) },
+ { AOM_CDF2(20910) },
+ { AOM_CDF2(19089) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20536) },
+ { AOM_CDF2(20664) },
+ { AOM_CDF2(20625) },
+ { AOM_CDF2(19123) },
+ { AOM_CDF2(14862) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19833) },
+ { AOM_CDF2(21502) },
+ { AOM_CDF2(17485) },
+ { AOM_CDF2(20267) },
+ { AOM_CDF2(18353) },
+ { AOM_CDF2(23329) },
+ { AOM_CDF2(21478) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22041) },
+ { AOM_CDF2(23434) },
+ { AOM_CDF2(20001) },
+ { AOM_CDF2(20554) },
+ { AOM_CDF2(20951) },
+ { AOM_CDF2(20145) },
+ { AOM_CDF2(15562) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(23312) },
+ { AOM_CDF2(21607) },
+ { AOM_CDF2(16526) },
+ { AOM_CDF2(18957) },
+ { AOM_CDF2(18034) },
+ { AOM_CDF2(18934) },
+ { AOM_CDF2(24247) },
+ { AOM_CDF2(16921) },
+ { AOM_CDF2(17080) },
+ },
+ {
+ { AOM_CDF2(26579) },
+ { AOM_CDF2(24910) },
+ { AOM_CDF2(18637) },
+ { AOM_CDF2(19800) },
+ { AOM_CDF2(20388) },
+ { AOM_CDF2(9887) },
+ { AOM_CDF2(15642) },
+ { AOM_CDF2(30198) },
+ { AOM_CDF2(24721) },
+ } },
+ { {
+ { AOM_CDF2(26998) },
+ { AOM_CDF2(16737) },
+ { AOM_CDF2(17838) },
+ { AOM_CDF2(18922) },
+ { AOM_CDF2(19515) },
+ { AOM_CDF2(18636) },
+ { AOM_CDF2(17333) },
+ { AOM_CDF2(15776) },
+ { AOM_CDF2(22658) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(20177) },
+ { AOM_CDF2(20789) },
+ { AOM_CDF2(20262) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(21416) },
+ { AOM_CDF2(20855) },
+ { AOM_CDF2(23410) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20238) },
+ { AOM_CDF2(21057) },
+ { AOM_CDF2(19159) },
+ { AOM_CDF2(22337) },
+ { AOM_CDF2(20159) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20125) },
+ { AOM_CDF2(20559) },
+ { AOM_CDF2(21707) },
+ { AOM_CDF2(22296) },
+ { AOM_CDF2(17333) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19941) },
+ { AOM_CDF2(20527) },
+ { AOM_CDF2(21470) },
+ { AOM_CDF2(22487) },
+ { AOM_CDF2(19558) },
+ { AOM_CDF2(22354) },
+ { AOM_CDF2(20331) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22752) },
+ { AOM_CDF2(25006) },
+ { AOM_CDF2(22075) },
+ { AOM_CDF2(21576) },
+ { AOM_CDF2(17740) },
+ { AOM_CDF2(21690) },
+ { AOM_CDF2(19211) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(21442) },
+ { AOM_CDF2(22358) },
+ { AOM_CDF2(18503) },
+ { AOM_CDF2(20291) },
+ { AOM_CDF2(19945) },
+ { AOM_CDF2(21294) },
+ { AOM_CDF2(21178) },
+ { AOM_CDF2(19400) },
+ { AOM_CDF2(10556) },
+ },
+ {
+ { AOM_CDF2(24648) },
+ { AOM_CDF2(24949) },
+ { AOM_CDF2(20708) },
+ { AOM_CDF2(23905) },
+ { AOM_CDF2(20501) },
+ { AOM_CDF2(9558) },
+ { AOM_CDF2(9423) },
+ { AOM_CDF2(30365) },
+ { AOM_CDF2(19253) },
+ } },
+ { {
+ { AOM_CDF2(26064) },
+ { AOM_CDF2(22098) },
+ { AOM_CDF2(19613) },
+ { AOM_CDF2(20525) },
+ { AOM_CDF2(17595) },
+ { AOM_CDF2(16618) },
+ { AOM_CDF2(20497) },
+ { AOM_CDF2(18989) },
+ { AOM_CDF2(15513) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } }
+ };
+
+static const aom_cdf_prob
+ av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) },
+ { AOM_CDF5(370, 671, 1883, 4471) } },
+ { { AOM_CDF5(3247, 4950, 9688, 14563) },
+ { AOM_CDF5(1904, 3354, 7763, 14647) } } },
+ { { { AOM_CDF5(2125, 2551, 5165, 8946) },
+ { AOM_CDF5(513, 765, 1859, 6339) } },
+ { { AOM_CDF5(7637, 9498, 14259, 19108) },
+ { AOM_CDF5(2497, 4096, 8866, 16993) } } },
+ { { { AOM_CDF5(4016, 4897, 8881, 14968) },
+ { AOM_CDF5(716, 1105, 2646, 10056) } },
+ { { AOM_CDF5(11139, 13270, 18241, 23566) },
+ { AOM_CDF5(3192, 5032, 10297, 19755) } } },
+ { { { AOM_CDF5(6708, 8958, 14746, 22133) },
+ { AOM_CDF5(1222, 2074, 4783, 15410) } },
+ { { AOM_CDF5(19575, 21766, 26044, 29709) },
+ { AOM_CDF5(7297, 10767, 19273, 28194) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) },
+ { AOM_CDF6(210, 405, 1315, 3326, 7537) } },
+ { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) },
+ { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } },
+ { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) },
+ { AOM_CDF6(313, 441, 1099, 2917, 8562) } },
+ { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) },
+ { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } },
+ { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) },
+ { AOM_CDF6(574, 821, 1836, 5089, 13128) } },
+ { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) },
+ { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } },
+ { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) },
+ { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } },
+ { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) },
+ { AOM_CDF6(7699, 10897, 20891, 26926, 31628) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) },
+ { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } },
+ { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) },
+ { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } },
+ { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) },
+ { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } },
+ { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) },
+ { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } },
+ { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) },
+ { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } },
+ { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) },
+ { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } },
+ { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) },
+ { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } },
+ { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) },
+ { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 8)] = {
+ { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) },
+ { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } },
+ { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) },
+ { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } },
+ { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) },
+ { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } },
+ { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) },
+ { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } },
+ { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) },
+ { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } },
+ { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) },
+ { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } },
+ { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) },
+ { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } },
+ { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) },
+ { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } }
+ };
+
+static const aom_cdf_prob
+ av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 9)] = {
+ { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) },
+ { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } },
+ { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) },
+ { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } },
+ { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) },
+ { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } },
+ { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) },
+ { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } },
+ { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) },
+ { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } },
+ { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) },
+ { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842,
+ 32708) } } },
+ { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) },
+ { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } },
+ { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) },
+ { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403,
+ 32695) } } }
+ };
+
+static const aom_cdf_prob
+ av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788,
+ 23412, 26061) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919,
+ 26129, 29140) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590,
+ 24584, 28749) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478,
+ 28396, 31811) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267,
+ 28410, 31078) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812,
+ 27300, 29219, 32114) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456,
+ 31142, 32060) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716,
+ 30073, 30820, 31956) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047,
+ 22571, 25830) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354,
+ 27255, 28546, 31784) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851,
+ 21856, 25692, 28034) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527,
+ 28027, 28377, 30876) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155,
+ 26682, 29229, 31045) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601,
+ 25483, 25843, 32056) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434,
+ 29326, 31082, 32050) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913,
+ 29486, 29724, 29807, 32570) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } } };
+
+static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+ [CDF_SIZE(BR_CDF_SIZE)] = {
+ { { { { AOM_CDF4(14298, 20718, 24174) },
+ { AOM_CDF4(12536, 19601, 23789) },
+ { AOM_CDF4(8712, 15051, 19503) },
+ { AOM_CDF4(6170, 11327, 15434) },
+ { AOM_CDF4(4742, 8926, 12538) },
+ { AOM_CDF4(3803, 7317, 10546) },
+ { AOM_CDF4(1696, 3317, 4871) },
+ { AOM_CDF4(14392, 19951, 22756) },
+ { AOM_CDF4(15978, 23218, 26818) },
+ { AOM_CDF4(12187, 19474, 23889) },
+ { AOM_CDF4(9176, 15640, 20259) },
+ { AOM_CDF4(7068, 12655, 17028) },
+ { AOM_CDF4(5656, 10442, 14472) },
+ { AOM_CDF4(2580, 4992, 7244) },
+ { AOM_CDF4(12136, 18049, 21426) },
+ { AOM_CDF4(13784, 20721, 24481) },
+ { AOM_CDF4(10836, 17621, 21900) },
+ { AOM_CDF4(8372, 14444, 18847) },
+ { AOM_CDF4(6523, 11779, 16000) },
+ { AOM_CDF4(5337, 9898, 13760) },
+ { AOM_CDF4(3034, 5860, 8462) } },
+ { { AOM_CDF4(15967, 22905, 26286) },
+ { AOM_CDF4(13534, 20654, 24579) },
+ { AOM_CDF4(9504, 16092, 20535) },
+ { AOM_CDF4(6975, 12568, 16903) },
+ { AOM_CDF4(5364, 10091, 14020) },
+ { AOM_CDF4(4357, 8370, 11857) },
+ { AOM_CDF4(2506, 4934, 7218) },
+ { AOM_CDF4(23032, 28815, 30936) },
+ { AOM_CDF4(19540, 26704, 29719) },
+ { AOM_CDF4(15158, 22969, 27097) },
+ { AOM_CDF4(11408, 18865, 23650) },
+ { AOM_CDF4(8885, 15448, 20250) },
+ { AOM_CDF4(7108, 12853, 17416) },
+ { AOM_CDF4(4231, 8041, 11480) },
+ { AOM_CDF4(19823, 26490, 29156) },
+ { AOM_CDF4(18890, 25929, 28932) },
+ { AOM_CDF4(15660, 23491, 27433) },
+ { AOM_CDF4(12147, 19776, 24488) },
+ { AOM_CDF4(9728, 16774, 21649) },
+ { AOM_CDF4(7919, 14277, 19066) },
+ { AOM_CDF4(5440, 10170, 14185) } } },
+ { { { AOM_CDF4(14406, 20862, 24414) },
+ { AOM_CDF4(11824, 18907, 23109) },
+ { AOM_CDF4(8257, 14393, 18803) },
+ { AOM_CDF4(5860, 10747, 14778) },
+ { AOM_CDF4(4475, 8486, 11984) },
+ { AOM_CDF4(3606, 6954, 10043) },
+ { AOM_CDF4(1736, 3410, 5048) },
+ { AOM_CDF4(14430, 20046, 22882) },
+ { AOM_CDF4(15593, 22899, 26709) },
+ { AOM_CDF4(12102, 19368, 23811) },
+ { AOM_CDF4(9059, 15584, 20262) },
+ { AOM_CDF4(6999, 12603, 17048) },
+ { AOM_CDF4(5684, 10497, 14553) },
+ { AOM_CDF4(2822, 5438, 7862) },
+ { AOM_CDF4(15785, 21585, 24359) },
+ { AOM_CDF4(18347, 25229, 28266) },
+ { AOM_CDF4(14974, 22487, 26389) },
+ { AOM_CDF4(11423, 18681, 23271) },
+ { AOM_CDF4(8863, 15350, 20008) },
+ { AOM_CDF4(7153, 12852, 17278) },
+ { AOM_CDF4(3707, 7036, 9982) } },
+ { { AOM_CDF4(15460, 21696, 25469) },
+ { AOM_CDF4(12170, 19249, 23191) },
+ { AOM_CDF4(8723, 15027, 19332) },
+ { AOM_CDF4(6428, 11704, 15874) },
+ { AOM_CDF4(4922, 9292, 13052) },
+ { AOM_CDF4(4139, 7695, 11010) },
+ { AOM_CDF4(2291, 4508, 6598) },
+ { AOM_CDF4(19856, 26920, 29828) },
+ { AOM_CDF4(17923, 25289, 28792) },
+ { AOM_CDF4(14278, 21968, 26297) },
+ { AOM_CDF4(10910, 18136, 22950) },
+ { AOM_CDF4(8423, 14815, 19627) },
+ { AOM_CDF4(6771, 12283, 16774) },
+ { AOM_CDF4(4074, 7750, 11081) },
+ { AOM_CDF4(19852, 26074, 28672) },
+ { AOM_CDF4(19371, 26110, 28989) },
+ { AOM_CDF4(16265, 23873, 27663) },
+ { AOM_CDF4(12758, 20378, 24952) },
+ { AOM_CDF4(10095, 17098, 21961) },
+ { AOM_CDF4(8250, 14628, 19451) },
+ { AOM_CDF4(5205, 9745, 13622) } } },
+ { { { AOM_CDF4(10563, 16233, 19763) },
+ { AOM_CDF4(9794, 16022, 19804) },
+ { AOM_CDF4(6750, 11945, 15759) },
+ { AOM_CDF4(4963, 9186, 12752) },
+ { AOM_CDF4(3845, 7435, 10627) },
+ { AOM_CDF4(3051, 6085, 8834) },
+ { AOM_CDF4(1311, 2596, 3830) },
+ { AOM_CDF4(11246, 16404, 19689) },
+ { AOM_CDF4(12315, 18911, 22731) },
+ { AOM_CDF4(10557, 17095, 21289) },
+ { AOM_CDF4(8136, 14006, 18249) },
+ { AOM_CDF4(6348, 11474, 15565) },
+ { AOM_CDF4(5196, 9655, 13400) },
+ { AOM_CDF4(2349, 4526, 6587) },
+ { AOM_CDF4(13337, 18730, 21569) },
+ { AOM_CDF4(19306, 26071, 28882) },
+ { AOM_CDF4(15952, 23540, 27254) },
+ { AOM_CDF4(12409, 19934, 24430) },
+ { AOM_CDF4(9760, 16706, 21389) },
+ { AOM_CDF4(8004, 14220, 18818) },
+ { AOM_CDF4(4138, 7794, 10961) } },
+ { { AOM_CDF4(10870, 16684, 20949) },
+ { AOM_CDF4(9664, 15230, 18680) },
+ { AOM_CDF4(6886, 12109, 15408) },
+ { AOM_CDF4(4825, 8900, 12305) },
+ { AOM_CDF4(3630, 7162, 10314) },
+ { AOM_CDF4(3036, 6429, 9387) },
+ { AOM_CDF4(1671, 3296, 4940) },
+ { AOM_CDF4(13819, 19159, 23026) },
+ { AOM_CDF4(11984, 19108, 23120) },
+ { AOM_CDF4(10690, 17210, 21663) },
+ { AOM_CDF4(7984, 14154, 18333) },
+ { AOM_CDF4(6868, 12294, 16124) },
+ { AOM_CDF4(5274, 8994, 12868) },
+ { AOM_CDF4(2988, 5771, 8424) },
+ { AOM_CDF4(19736, 26647, 29141) },
+ { AOM_CDF4(18933, 26070, 28984) },
+ { AOM_CDF4(15779, 23048, 27200) },
+ { AOM_CDF4(12638, 20061, 24532) },
+ { AOM_CDF4(10692, 17545, 22220) },
+ { AOM_CDF4(9217, 15251, 20054) },
+ { AOM_CDF4(5078, 9284, 12594) } } },
+ { { { AOM_CDF4(2331, 3662, 5244) },
+ { AOM_CDF4(2891, 4771, 6145) },
+ { AOM_CDF4(4598, 7623, 9729) },
+ { AOM_CDF4(3520, 6845, 9199) },
+ { AOM_CDF4(3417, 6119, 9324) },
+ { AOM_CDF4(2601, 5412, 7385) },
+ { AOM_CDF4(600, 1173, 1744) },
+ { AOM_CDF4(7672, 13286, 17469) },
+ { AOM_CDF4(4232, 7792, 10793) },
+ { AOM_CDF4(2915, 5317, 7397) },
+ { AOM_CDF4(2318, 4356, 6152) },
+ { AOM_CDF4(2127, 4000, 5554) },
+ { AOM_CDF4(1850, 3478, 5275) },
+ { AOM_CDF4(977, 1933, 2843) },
+ { AOM_CDF4(18280, 24387, 27989) },
+ { AOM_CDF4(15852, 22671, 26185) },
+ { AOM_CDF4(13845, 20951, 24789) },
+ { AOM_CDF4(11055, 17966, 22129) },
+ { AOM_CDF4(9138, 15422, 19801) },
+ { AOM_CDF4(7454, 13145, 17456) },
+ { AOM_CDF4(3370, 6393, 9013) } },
+ { { AOM_CDF4(5842, 9229, 10838) },
+ { AOM_CDF4(2313, 3491, 4276) },
+ { AOM_CDF4(2998, 6104, 7496) },
+ { AOM_CDF4(2420, 7447, 9868) },
+ { AOM_CDF4(3034, 8495, 10923) },
+ { AOM_CDF4(4076, 8937, 10975) },
+ { AOM_CDF4(1086, 2370, 3299) },
+ { AOM_CDF4(9714, 17254, 20444) },
+ { AOM_CDF4(8543, 13698, 17123) },
+ { AOM_CDF4(4918, 9007, 11910) },
+ { AOM_CDF4(4129, 7532, 10553) },
+ { AOM_CDF4(2364, 5533, 8058) },
+ { AOM_CDF4(1834, 3546, 5563) },
+ { AOM_CDF4(1473, 2908, 4133) },
+ { AOM_CDF4(15405, 21193, 25619) },
+ { AOM_CDF4(15691, 21952, 26561) },
+ { AOM_CDF4(12962, 19194, 24165) },
+ { AOM_CDF4(10272, 17855, 22129) },
+ { AOM_CDF4(8588, 15270, 20718) },
+ { AOM_CDF4(8682, 14669, 19500) },
+ { AOM_CDF4(4870, 9636, 13205) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(14995, 21341, 24749) },
+ { AOM_CDF4(13158, 20289, 24601) },
+ { AOM_CDF4(8941, 15326, 19876) },
+ { AOM_CDF4(6297, 11541, 15807) },
+ { AOM_CDF4(4817, 9029, 12776) },
+ { AOM_CDF4(3731, 7273, 10627) },
+ { AOM_CDF4(1847, 3617, 5354) },
+ { AOM_CDF4(14472, 19659, 22343) },
+ { AOM_CDF4(16806, 24162, 27533) },
+ { AOM_CDF4(12900, 20404, 24713) },
+ { AOM_CDF4(9411, 16112, 20797) },
+ { AOM_CDF4(7056, 12697, 17148) },
+ { AOM_CDF4(5544, 10339, 14460) },
+ { AOM_CDF4(2954, 5704, 8319) },
+ { AOM_CDF4(12464, 18071, 21354) },
+ { AOM_CDF4(15482, 22528, 26034) },
+ { AOM_CDF4(12070, 19269, 23624) },
+ { AOM_CDF4(8953, 15406, 20106) },
+ { AOM_CDF4(7027, 12730, 17220) },
+ { AOM_CDF4(5887, 10913, 15140) },
+ { AOM_CDF4(3793, 7278, 10447) } },
+ { { AOM_CDF4(15571, 22232, 25749) },
+ { AOM_CDF4(14506, 21575, 25374) },
+ { AOM_CDF4(10189, 17089, 21569) },
+ { AOM_CDF4(7316, 13301, 17915) },
+ { AOM_CDF4(5783, 10912, 15190) },
+ { AOM_CDF4(4760, 9155, 13088) },
+ { AOM_CDF4(2993, 5966, 8774) },
+ { AOM_CDF4(23424, 28903, 30778) },
+ { AOM_CDF4(20775, 27666, 30290) },
+ { AOM_CDF4(16474, 24410, 28299) },
+ { AOM_CDF4(12471, 20180, 24987) },
+ { AOM_CDF4(9410, 16487, 21439) },
+ { AOM_CDF4(7536, 13614, 18529) },
+ { AOM_CDF4(5048, 9586, 13549) },
+ { AOM_CDF4(21090, 27290, 29756) },
+ { AOM_CDF4(20796, 27402, 30026) },
+ { AOM_CDF4(17819, 25485, 28969) },
+ { AOM_CDF4(13860, 21909, 26462) },
+ { AOM_CDF4(11002, 18494, 23529) },
+ { AOM_CDF4(8953, 15929, 20897) },
+ { AOM_CDF4(6448, 11918, 16454) } } },
+ { { { AOM_CDF4(15999, 22208, 25449) },
+ { AOM_CDF4(13050, 19988, 24122) },
+ { AOM_CDF4(8594, 14864, 19378) },
+ { AOM_CDF4(6033, 11079, 15238) },
+ { AOM_CDF4(4554, 8683, 12347) },
+ { AOM_CDF4(3672, 7139, 10337) },
+ { AOM_CDF4(1900, 3771, 5576) },
+ { AOM_CDF4(15788, 21340, 23949) },
+ { AOM_CDF4(16825, 24235, 27758) },
+ { AOM_CDF4(12873, 20402, 24810) },
+ { AOM_CDF4(9590, 16363, 21094) },
+ { AOM_CDF4(7352, 13209, 17733) },
+ { AOM_CDF4(5960, 10989, 15184) },
+ { AOM_CDF4(3232, 6234, 9007) },
+ { AOM_CDF4(15761, 20716, 23224) },
+ { AOM_CDF4(19318, 25989, 28759) },
+ { AOM_CDF4(15529, 23094, 26929) },
+ { AOM_CDF4(11662, 18989, 23641) },
+ { AOM_CDF4(8955, 15568, 20366) },
+ { AOM_CDF4(7281, 13106, 17708) },
+ { AOM_CDF4(4248, 8059, 11440) } },
+ { { AOM_CDF4(14899, 21217, 24503) },
+ { AOM_CDF4(13519, 20283, 24047) },
+ { AOM_CDF4(9429, 15966, 20365) },
+ { AOM_CDF4(6700, 12355, 16652) },
+ { AOM_CDF4(5088, 9704, 13716) },
+ { AOM_CDF4(4243, 8154, 11731) },
+ { AOM_CDF4(2702, 5364, 7861) },
+ { AOM_CDF4(22745, 28388, 30454) },
+ { AOM_CDF4(20235, 27146, 29922) },
+ { AOM_CDF4(15896, 23715, 27637) },
+ { AOM_CDF4(11840, 19350, 24131) },
+ { AOM_CDF4(9122, 15932, 20880) },
+ { AOM_CDF4(7488, 13581, 18362) },
+ { AOM_CDF4(5114, 9568, 13370) },
+ { AOM_CDF4(20845, 26553, 28932) },
+ { AOM_CDF4(20981, 27372, 29884) },
+ { AOM_CDF4(17781, 25335, 28785) },
+ { AOM_CDF4(13760, 21708, 26297) },
+ { AOM_CDF4(10975, 18415, 23365) },
+ { AOM_CDF4(9045, 15789, 20686) },
+ { AOM_CDF4(6130, 11199, 15423) } } },
+ { { { AOM_CDF4(13549, 19724, 23158) },
+ { AOM_CDF4(11844, 18382, 22246) },
+ { AOM_CDF4(7919, 13619, 17773) },
+ { AOM_CDF4(5486, 10143, 13946) },
+ { AOM_CDF4(4166, 7983, 11324) },
+ { AOM_CDF4(3364, 6506, 9427) },
+ { AOM_CDF4(1598, 3160, 4674) },
+ { AOM_CDF4(15281, 20979, 23781) },
+ { AOM_CDF4(14939, 22119, 25952) },
+ { AOM_CDF4(11363, 18407, 22812) },
+ { AOM_CDF4(8609, 14857, 19370) },
+ { AOM_CDF4(6737, 12184, 16480) },
+ { AOM_CDF4(5506, 10263, 14262) },
+ { AOM_CDF4(2990, 5786, 8380) },
+ { AOM_CDF4(20249, 25253, 27417) },
+ { AOM_CDF4(21070, 27518, 30001) },
+ { AOM_CDF4(16854, 24469, 28074) },
+ { AOM_CDF4(12864, 20486, 25000) },
+ { AOM_CDF4(9962, 16978, 21778) },
+ { AOM_CDF4(8074, 14338, 19048) },
+ { AOM_CDF4(4494, 8479, 11906) } },
+ { { AOM_CDF4(13960, 19617, 22829) },
+ { AOM_CDF4(11150, 17341, 21228) },
+ { AOM_CDF4(7150, 12964, 17190) },
+ { AOM_CDF4(5331, 10002, 13867) },
+ { AOM_CDF4(4167, 7744, 11057) },
+ { AOM_CDF4(3480, 6629, 9646) },
+ { AOM_CDF4(1883, 3784, 5686) },
+ { AOM_CDF4(18752, 25660, 28912) },
+ { AOM_CDF4(16968, 24586, 28030) },
+ { AOM_CDF4(13520, 21055, 25313) },
+ { AOM_CDF4(10453, 17626, 22280) },
+ { AOM_CDF4(8386, 14505, 19116) },
+ { AOM_CDF4(6742, 12595, 17008) },
+ { AOM_CDF4(4273, 8140, 11499) },
+ { AOM_CDF4(22120, 27827, 30233) },
+ { AOM_CDF4(20563, 27358, 29895) },
+ { AOM_CDF4(17076, 24644, 28153) },
+ { AOM_CDF4(13362, 20942, 25309) },
+ { AOM_CDF4(10794, 17965, 22695) },
+ { AOM_CDF4(9014, 15652, 20319) },
+ { AOM_CDF4(5708, 10512, 14497) } } },
+ { { { AOM_CDF4(5705, 10930, 15725) },
+ { AOM_CDF4(7946, 12765, 16115) },
+ { AOM_CDF4(6801, 12123, 16226) },
+ { AOM_CDF4(5462, 10135, 14200) },
+ { AOM_CDF4(4189, 8011, 11507) },
+ { AOM_CDF4(3191, 6229, 9408) },
+ { AOM_CDF4(1057, 2137, 3212) },
+ { AOM_CDF4(10018, 17067, 21491) },
+ { AOM_CDF4(7380, 12582, 16453) },
+ { AOM_CDF4(6068, 10845, 14339) },
+ { AOM_CDF4(5098, 9198, 12555) },
+ { AOM_CDF4(4312, 8010, 11119) },
+ { AOM_CDF4(3700, 6966, 9781) },
+ { AOM_CDF4(1693, 3326, 4887) },
+ { AOM_CDF4(18757, 24930, 27774) },
+ { AOM_CDF4(17648, 24596, 27817) },
+ { AOM_CDF4(14707, 22052, 26026) },
+ { AOM_CDF4(11720, 18852, 23292) },
+ { AOM_CDF4(9357, 15952, 20525) },
+ { AOM_CDF4(7810, 13753, 18210) },
+ { AOM_CDF4(3879, 7333, 10328) } },
+ { { AOM_CDF4(8278, 13242, 15922) },
+ { AOM_CDF4(10547, 15867, 18919) },
+ { AOM_CDF4(9106, 15842, 20609) },
+ { AOM_CDF4(6833, 13007, 17218) },
+ { AOM_CDF4(4811, 9712, 13923) },
+ { AOM_CDF4(3985, 7352, 11128) },
+ { AOM_CDF4(1688, 3458, 5262) },
+ { AOM_CDF4(12951, 21861, 26510) },
+ { AOM_CDF4(9788, 16044, 20276) },
+ { AOM_CDF4(6309, 11244, 14870) },
+ { AOM_CDF4(5183, 9349, 12566) },
+ { AOM_CDF4(4389, 8229, 11492) },
+ { AOM_CDF4(3633, 6945, 10620) },
+ { AOM_CDF4(3600, 6847, 9907) },
+ { AOM_CDF4(21748, 28137, 30255) },
+ { AOM_CDF4(19436, 26581, 29560) },
+ { AOM_CDF4(16359, 24201, 27953) },
+ { AOM_CDF4(13961, 21693, 25871) },
+ { AOM_CDF4(11544, 18686, 23322) },
+ { AOM_CDF4(9372, 16462, 20952) },
+ { AOM_CDF4(6138, 11210, 15390) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(16138, 22223, 25509) },
+ { AOM_CDF4(15347, 22430, 26332) },
+ { AOM_CDF4(9614, 16736, 21332) },
+ { AOM_CDF4(6600, 12275, 16907) },
+ { AOM_CDF4(4811, 9424, 13547) },
+ { AOM_CDF4(3748, 7809, 11420) },
+ { AOM_CDF4(2254, 4587, 6890) },
+ { AOM_CDF4(15196, 20284, 23177) },
+ { AOM_CDF4(18317, 25469, 28451) },
+ { AOM_CDF4(13918, 21651, 25842) },
+ { AOM_CDF4(10052, 17150, 21995) },
+ { AOM_CDF4(7499, 13630, 18587) },
+ { AOM_CDF4(6158, 11417, 16003) },
+ { AOM_CDF4(4014, 7785, 11252) },
+ { AOM_CDF4(15048, 21067, 24384) },
+ { AOM_CDF4(18202, 25346, 28553) },
+ { AOM_CDF4(14302, 22019, 26356) },
+ { AOM_CDF4(10839, 18139, 23166) },
+ { AOM_CDF4(8715, 15744, 20806) },
+ { AOM_CDF4(7536, 13576, 18544) },
+ { AOM_CDF4(5413, 10335, 14498) } },
+ { { AOM_CDF4(17394, 24501, 27895) },
+ { AOM_CDF4(15889, 23420, 27185) },
+ { AOM_CDF4(11561, 19133, 23870) },
+ { AOM_CDF4(8285, 14812, 19844) },
+ { AOM_CDF4(6496, 12043, 16550) },
+ { AOM_CDF4(4771, 9574, 13677) },
+ { AOM_CDF4(3603, 6830, 10144) },
+ { AOM_CDF4(21656, 27704, 30200) },
+ { AOM_CDF4(21324, 27915, 30511) },
+ { AOM_CDF4(17327, 25336, 28997) },
+ { AOM_CDF4(13417, 21381, 26033) },
+ { AOM_CDF4(10132, 17425, 22338) },
+ { AOM_CDF4(8580, 15016, 19633) },
+ { AOM_CDF4(5694, 11477, 16411) },
+ { AOM_CDF4(24116, 29780, 31450) },
+ { AOM_CDF4(23853, 29695, 31591) },
+ { AOM_CDF4(20085, 27614, 30428) },
+ { AOM_CDF4(15326, 24335, 28575) },
+ { AOM_CDF4(11814, 19472, 24810) },
+ { AOM_CDF4(10221, 18611, 24767) },
+ { AOM_CDF4(7689, 14558, 20321) } } },
+ { { { AOM_CDF4(16214, 22380, 25770) },
+ { AOM_CDF4(14213, 21304, 25295) },
+ { AOM_CDF4(9213, 15823, 20455) },
+ { AOM_CDF4(6395, 11758, 16139) },
+ { AOM_CDF4(4779, 9187, 13066) },
+ { AOM_CDF4(3821, 7501, 10953) },
+ { AOM_CDF4(2293, 4567, 6795) },
+ { AOM_CDF4(15859, 21283, 23820) },
+ { AOM_CDF4(18404, 25602, 28726) },
+ { AOM_CDF4(14325, 21980, 26206) },
+ { AOM_CDF4(10669, 17937, 22720) },
+ { AOM_CDF4(8297, 14642, 19447) },
+ { AOM_CDF4(6746, 12389, 16893) },
+ { AOM_CDF4(4324, 8251, 11770) },
+ { AOM_CDF4(16532, 21631, 24475) },
+ { AOM_CDF4(20667, 27150, 29668) },
+ { AOM_CDF4(16728, 24510, 28175) },
+ { AOM_CDF4(12861, 20645, 25332) },
+ { AOM_CDF4(10076, 17361, 22417) },
+ { AOM_CDF4(8395, 14940, 19963) },
+ { AOM_CDF4(5731, 10683, 14912) } },
+ { { AOM_CDF4(14433, 21155, 24938) },
+ { AOM_CDF4(14658, 21716, 25545) },
+ { AOM_CDF4(9923, 16824, 21557) },
+ { AOM_CDF4(6982, 13052, 17721) },
+ { AOM_CDF4(5419, 10503, 15050) },
+ { AOM_CDF4(4852, 9162, 13014) },
+ { AOM_CDF4(3271, 6395, 9630) },
+ { AOM_CDF4(22210, 27833, 30109) },
+ { AOM_CDF4(20750, 27368, 29821) },
+ { AOM_CDF4(16894, 24828, 28573) },
+ { AOM_CDF4(13247, 21276, 25757) },
+ { AOM_CDF4(10038, 17265, 22563) },
+ { AOM_CDF4(8587, 14947, 20327) },
+ { AOM_CDF4(5645, 11371, 15252) },
+ { AOM_CDF4(22027, 27526, 29714) },
+ { AOM_CDF4(23098, 29146, 31221) },
+ { AOM_CDF4(19886, 27341, 30272) },
+ { AOM_CDF4(15609, 23747, 28046) },
+ { AOM_CDF4(11993, 20065, 24939) },
+ { AOM_CDF4(9637, 18267, 23671) },
+ { AOM_CDF4(7625, 13801, 19144) } } },
+ { { { AOM_CDF4(14438, 20798, 24089) },
+ { AOM_CDF4(12621, 19203, 23097) },
+ { AOM_CDF4(8177, 14125, 18402) },
+ { AOM_CDF4(5674, 10501, 14456) },
+ { AOM_CDF4(4236, 8239, 11733) },
+ { AOM_CDF4(3447, 6750, 9806) },
+ { AOM_CDF4(1986, 3950, 5864) },
+ { AOM_CDF4(16208, 22099, 24930) },
+ { AOM_CDF4(16537, 24025, 27585) },
+ { AOM_CDF4(12780, 20381, 24867) },
+ { AOM_CDF4(9767, 16612, 21416) },
+ { AOM_CDF4(7686, 13738, 18398) },
+ { AOM_CDF4(6333, 11614, 15964) },
+ { AOM_CDF4(3941, 7571, 10836) },
+ { AOM_CDF4(22819, 27422, 29202) },
+ { AOM_CDF4(22224, 28514, 30721) },
+ { AOM_CDF4(17660, 25433, 28913) },
+ { AOM_CDF4(13574, 21482, 26002) },
+ { AOM_CDF4(10629, 17977, 22938) },
+ { AOM_CDF4(8612, 15298, 20265) },
+ { AOM_CDF4(5607, 10491, 14596) } },
+ { { AOM_CDF4(13569, 19800, 23206) },
+ { AOM_CDF4(13128, 19924, 23869) },
+ { AOM_CDF4(8329, 14841, 19403) },
+ { AOM_CDF4(6130, 10976, 15057) },
+ { AOM_CDF4(4682, 8839, 12518) },
+ { AOM_CDF4(3656, 7409, 10588) },
+ { AOM_CDF4(2577, 5099, 7412) },
+ { AOM_CDF4(22427, 28684, 30585) },
+ { AOM_CDF4(20913, 27750, 30139) },
+ { AOM_CDF4(15840, 24109, 27834) },
+ { AOM_CDF4(12308, 20029, 24569) },
+ { AOM_CDF4(10216, 16785, 21458) },
+ { AOM_CDF4(8309, 14203, 19113) },
+ { AOM_CDF4(6043, 11168, 15307) },
+ { AOM_CDF4(23166, 28901, 30998) },
+ { AOM_CDF4(21899, 28405, 30751) },
+ { AOM_CDF4(18413, 26091, 29443) },
+ { AOM_CDF4(15233, 23114, 27352) },
+ { AOM_CDF4(12683, 20472, 25288) },
+ { AOM_CDF4(10702, 18259, 23409) },
+ { AOM_CDF4(8125, 14464, 19226) } } },
+ { { { AOM_CDF4(9040, 14786, 18360) },
+ { AOM_CDF4(9979, 15718, 19415) },
+ { AOM_CDF4(7913, 13918, 18311) },
+ { AOM_CDF4(5859, 10889, 15184) },
+ { AOM_CDF4(4593, 8677, 12510) },
+ { AOM_CDF4(3820, 7396, 10791) },
+ { AOM_CDF4(1730, 3471, 5192) },
+ { AOM_CDF4(11803, 18365, 22709) },
+ { AOM_CDF4(11419, 18058, 22225) },
+ { AOM_CDF4(9418, 15774, 20243) },
+ { AOM_CDF4(7539, 13325, 17657) },
+ { AOM_CDF4(6233, 11317, 15384) },
+ { AOM_CDF4(5137, 9656, 13545) },
+ { AOM_CDF4(2977, 5774, 8349) },
+ { AOM_CDF4(21207, 27246, 29640) },
+ { AOM_CDF4(19547, 26578, 29497) },
+ { AOM_CDF4(16169, 23871, 27690) },
+ { AOM_CDF4(12820, 20458, 25018) },
+ { AOM_CDF4(10224, 17332, 22214) },
+ { AOM_CDF4(8526, 15048, 19884) },
+ { AOM_CDF4(5037, 9410, 13118) } },
+ { { AOM_CDF4(12339, 17329, 20140) },
+ { AOM_CDF4(13505, 19895, 23225) },
+ { AOM_CDF4(9847, 16944, 21564) },
+ { AOM_CDF4(7280, 13256, 18348) },
+ { AOM_CDF4(4712, 10009, 14454) },
+ { AOM_CDF4(4361, 7914, 12477) },
+ { AOM_CDF4(2870, 5628, 7995) },
+ { AOM_CDF4(20061, 25504, 28526) },
+ { AOM_CDF4(15235, 22878, 26145) },
+ { AOM_CDF4(12985, 19958, 24155) },
+ { AOM_CDF4(9782, 16641, 21403) },
+ { AOM_CDF4(9456, 16360, 20760) },
+ { AOM_CDF4(6855, 12940, 18557) },
+ { AOM_CDF4(5661, 10564, 15002) },
+ { AOM_CDF4(25656, 30602, 31894) },
+ { AOM_CDF4(22570, 29107, 31092) },
+ { AOM_CDF4(18917, 26423, 29541) },
+ { AOM_CDF4(15940, 23649, 27754) },
+ { AOM_CDF4(12803, 20581, 25219) },
+ { AOM_CDF4(11082, 18695, 23376) },
+ { AOM_CDF4(7939, 14373, 19005) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(18315, 24289, 27551) },
+ { AOM_CDF4(16854, 24068, 27835) },
+ { AOM_CDF4(10140, 17927, 23173) },
+ { AOM_CDF4(6722, 12982, 18267) },
+ { AOM_CDF4(4661, 9826, 14706) },
+ { AOM_CDF4(3832, 8165, 12294) },
+ { AOM_CDF4(2795, 6098, 9245) },
+ { AOM_CDF4(17145, 23326, 26672) },
+ { AOM_CDF4(20733, 27680, 30308) },
+ { AOM_CDF4(16032, 24461, 28546) },
+ { AOM_CDF4(11653, 20093, 25081) },
+ { AOM_CDF4(9290, 16429, 22086) },
+ { AOM_CDF4(7796, 14598, 19982) },
+ { AOM_CDF4(6502, 12378, 17441) },
+ { AOM_CDF4(21681, 27732, 30320) },
+ { AOM_CDF4(22389, 29044, 31261) },
+ { AOM_CDF4(19027, 26731, 30087) },
+ { AOM_CDF4(14739, 23755, 28624) },
+ { AOM_CDF4(11358, 20778, 25511) },
+ { AOM_CDF4(10995, 18073, 24190) },
+ { AOM_CDF4(9162, 14990, 20617) } },
+ { { AOM_CDF4(21425, 27952, 30388) },
+ { AOM_CDF4(18062, 25838, 29034) },
+ { AOM_CDF4(11956, 19881, 24808) },
+ { AOM_CDF4(7718, 15000, 20980) },
+ { AOM_CDF4(5702, 11254, 16143) },
+ { AOM_CDF4(4898, 9088, 16864) },
+ { AOM_CDF4(3679, 6776, 11907) },
+ { AOM_CDF4(23294, 30160, 31663) },
+ { AOM_CDF4(24397, 29896, 31836) },
+ { AOM_CDF4(19245, 27128, 30593) },
+ { AOM_CDF4(13202, 19825, 26404) },
+ { AOM_CDF4(11578, 19297, 23957) },
+ { AOM_CDF4(8073, 13297, 21370) },
+ { AOM_CDF4(5461, 10923, 19745) },
+ { AOM_CDF4(27367, 30521, 31934) },
+ { AOM_CDF4(24904, 30671, 31940) },
+ { AOM_CDF4(23075, 28460, 31299) },
+ { AOM_CDF4(14400, 23658, 30417) },
+ { AOM_CDF4(13885, 23882, 28325) },
+ { AOM_CDF4(14746, 22938, 27853) },
+ { AOM_CDF4(5461, 16384, 27307) } } },
+ { { { AOM_CDF4(18274, 24813, 27890) },
+ { AOM_CDF4(15537, 23149, 27003) },
+ { AOM_CDF4(9449, 16740, 21827) },
+ { AOM_CDF4(6700, 12498, 17261) },
+ { AOM_CDF4(4988, 9866, 14198) },
+ { AOM_CDF4(4236, 8147, 11902) },
+ { AOM_CDF4(2867, 5860, 8654) },
+ { AOM_CDF4(17124, 23171, 26101) },
+ { AOM_CDF4(20396, 27477, 30148) },
+ { AOM_CDF4(16573, 24629, 28492) },
+ { AOM_CDF4(12749, 20846, 25674) },
+ { AOM_CDF4(10233, 17878, 22818) },
+ { AOM_CDF4(8525, 15332, 20363) },
+ { AOM_CDF4(6283, 11632, 16255) },
+ { AOM_CDF4(20466, 26511, 29286) },
+ { AOM_CDF4(23059, 29174, 31191) },
+ { AOM_CDF4(19481, 27263, 30241) },
+ { AOM_CDF4(15458, 23631, 28137) },
+ { AOM_CDF4(12416, 20608, 25693) },
+ { AOM_CDF4(10261, 18011, 23261) },
+ { AOM_CDF4(8016, 14655, 19666) } },
+ { { AOM_CDF4(17616, 24586, 28112) },
+ { AOM_CDF4(15809, 23299, 27155) },
+ { AOM_CDF4(10767, 18890, 23793) },
+ { AOM_CDF4(7727, 14255, 18865) },
+ { AOM_CDF4(6129, 11926, 16882) },
+ { AOM_CDF4(4482, 9704, 14861) },
+ { AOM_CDF4(3277, 7452, 11522) },
+ { AOM_CDF4(22956, 28551, 30730) },
+ { AOM_CDF4(22724, 28937, 30961) },
+ { AOM_CDF4(18467, 26324, 29580) },
+ { AOM_CDF4(13234, 20713, 25649) },
+ { AOM_CDF4(11181, 17592, 22481) },
+ { AOM_CDF4(8291, 18358, 24576) },
+ { AOM_CDF4(7568, 11881, 14984) },
+ { AOM_CDF4(24948, 29001, 31147) },
+ { AOM_CDF4(25674, 30619, 32151) },
+ { AOM_CDF4(20841, 26793, 29603) },
+ { AOM_CDF4(14669, 24356, 28666) },
+ { AOM_CDF4(11334, 23593, 28219) },
+ { AOM_CDF4(8922, 14762, 22873) },
+ { AOM_CDF4(8301, 13544, 20535) } } },
+ { { { AOM_CDF4(17113, 23733, 27081) },
+ { AOM_CDF4(14139, 21406, 25452) },
+ { AOM_CDF4(8552, 15002, 19776) },
+ { AOM_CDF4(5871, 11120, 15378) },
+ { AOM_CDF4(4455, 8616, 12253) },
+ { AOM_CDF4(3469, 6910, 10386) },
+ { AOM_CDF4(2255, 4553, 6782) },
+ { AOM_CDF4(18224, 24376, 27053) },
+ { AOM_CDF4(19290, 26710, 29614) },
+ { AOM_CDF4(14936, 22991, 27184) },
+ { AOM_CDF4(11238, 18951, 23762) },
+ { AOM_CDF4(8786, 15617, 20588) },
+ { AOM_CDF4(7317, 13228, 18003) },
+ { AOM_CDF4(5101, 9512, 13493) },
+ { AOM_CDF4(22639, 28222, 30210) },
+ { AOM_CDF4(23216, 29331, 31307) },
+ { AOM_CDF4(19075, 26762, 29895) },
+ { AOM_CDF4(15014, 23113, 27457) },
+ { AOM_CDF4(11938, 19857, 24752) },
+ { AOM_CDF4(9942, 17280, 22282) },
+ { AOM_CDF4(7167, 13144, 17752) } },
+ { { AOM_CDF4(15820, 22738, 26488) },
+ { AOM_CDF4(13530, 20885, 25216) },
+ { AOM_CDF4(8395, 15530, 20452) },
+ { AOM_CDF4(6574, 12321, 16380) },
+ { AOM_CDF4(5353, 10419, 14568) },
+ { AOM_CDF4(4613, 8446, 12381) },
+ { AOM_CDF4(3440, 7158, 9903) },
+ { AOM_CDF4(24247, 29051, 31224) },
+ { AOM_CDF4(22118, 28058, 30369) },
+ { AOM_CDF4(16498, 24768, 28389) },
+ { AOM_CDF4(12920, 21175, 26137) },
+ { AOM_CDF4(10730, 18619, 25352) },
+ { AOM_CDF4(10187, 16279, 22791) },
+ { AOM_CDF4(9310, 14631, 22127) },
+ { AOM_CDF4(24970, 30558, 32057) },
+ { AOM_CDF4(24801, 29942, 31698) },
+ { AOM_CDF4(22432, 28453, 30855) },
+ { AOM_CDF4(19054, 25680, 29580) },
+ { AOM_CDF4(14392, 23036, 28109) },
+ { AOM_CDF4(12495, 20947, 26650) },
+ { AOM_CDF4(12442, 20326, 26214) } } },
+ { { { AOM_CDF4(12162, 18785, 22648) },
+ { AOM_CDF4(12749, 19697, 23806) },
+ { AOM_CDF4(8580, 15297, 20346) },
+ { AOM_CDF4(6169, 11749, 16543) },
+ { AOM_CDF4(4836, 9391, 13448) },
+ { AOM_CDF4(3821, 7711, 11613) },
+ { AOM_CDF4(2228, 4601, 7070) },
+ { AOM_CDF4(16319, 24725, 28280) },
+ { AOM_CDF4(15698, 23277, 27168) },
+ { AOM_CDF4(12726, 20368, 25047) },
+ { AOM_CDF4(9912, 17015, 21976) },
+ { AOM_CDF4(7888, 14220, 19179) },
+ { AOM_CDF4(6777, 12284, 17018) },
+ { AOM_CDF4(4492, 8590, 12252) },
+ { AOM_CDF4(23249, 28904, 30947) },
+ { AOM_CDF4(21050, 27908, 30512) },
+ { AOM_CDF4(17440, 25340, 28949) },
+ { AOM_CDF4(14059, 22018, 26541) },
+ { AOM_CDF4(11288, 18903, 23898) },
+ { AOM_CDF4(9411, 16342, 21428) },
+ { AOM_CDF4(6278, 11588, 15944) } },
+ { { AOM_CDF4(13981, 20067, 23226) },
+ { AOM_CDF4(16922, 23580, 26783) },
+ { AOM_CDF4(11005, 19039, 24487) },
+ { AOM_CDF4(7389, 14218, 19798) },
+ { AOM_CDF4(5598, 11505, 17206) },
+ { AOM_CDF4(6090, 11213, 15659) },
+ { AOM_CDF4(3820, 7371, 10119) },
+ { AOM_CDF4(21082, 26925, 29675) },
+ { AOM_CDF4(21262, 28627, 31128) },
+ { AOM_CDF4(18392, 26454, 30437) },
+ { AOM_CDF4(14870, 22910, 27096) },
+ { AOM_CDF4(12620, 19484, 24908) },
+ { AOM_CDF4(9290, 16553, 22802) },
+ { AOM_CDF4(6668, 14288, 20004) },
+ { AOM_CDF4(27704, 31055, 31949) },
+ { AOM_CDF4(24709, 29978, 31788) },
+ { AOM_CDF4(21668, 29264, 31657) },
+ { AOM_CDF4(18295, 26968, 30074) },
+ { AOM_CDF4(16399, 24422, 29313) },
+ { AOM_CDF4(14347, 23026, 28104) },
+ { AOM_CDF4(12370, 19806, 24477) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } }
+ };
+
+static const aom_cdf_prob av1_default_coeff_base_multi_cdfs
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
+ [CDF_SIZE(NUM_BASE_LEVELS + 2)] =
+ { { { { { AOM_CDF4(4034, 8930, 12727) },
+ { AOM_CDF4(18082, 29741, 31877) },
+ { AOM_CDF4(12596, 26124, 30493) },
+ { AOM_CDF4(9446, 21118, 27005) },
+ { AOM_CDF4(6308, 15141, 21279) },
+ { AOM_CDF4(2463, 6357, 9783) },
+ { AOM_CDF4(20667, 30546, 31929) },
+ { AOM_CDF4(13043, 26123, 30134) },
+ { AOM_CDF4(8151, 18757, 24778) },
+ { AOM_CDF4(5255, 12839, 18632) },
+ { AOM_CDF4(2820, 7206, 11161) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(15736, 27553, 30604) },
+ { AOM_CDF4(11210, 23794, 28787) },
+ { AOM_CDF4(5947, 13874, 19701) },
+ { AOM_CDF4(4215, 9323, 13891) },
+ { AOM_CDF4(2833, 6462, 10059) },
+ { AOM_CDF4(19605, 30393, 31582) },
+ { AOM_CDF4(13523, 26252, 30248) },
+ { AOM_CDF4(8446, 18622, 24512) },
+ { AOM_CDF4(3818, 10343, 15974) },
+ { AOM_CDF4(1481, 4117, 6796) },
+ { AOM_CDF4(22649, 31302, 32190) },
+ { AOM_CDF4(14829, 27127, 30449) },
+ { AOM_CDF4(8313, 17702, 23304) },
+ { AOM_CDF4(3022, 8301, 12786) },
+ { AOM_CDF4(1536, 4412, 7184) },
+ { AOM_CDF4(22354, 29774, 31372) },
+ { AOM_CDF4(14723, 25472, 29214) },
+ { AOM_CDF4(6673, 13745, 18662) },
+ { AOM_CDF4(2068, 5766, 9322) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6302, 16444, 21761) },
+ { AOM_CDF4(23040, 31538, 32475) },
+ { AOM_CDF4(15196, 28452, 31496) },
+ { AOM_CDF4(10020, 22946, 28514) },
+ { AOM_CDF4(6533, 16862, 23501) },
+ { AOM_CDF4(3538, 9816, 15076) },
+ { AOM_CDF4(24444, 31875, 32525) },
+ { AOM_CDF4(15881, 28924, 31635) },
+ { AOM_CDF4(9922, 22873, 28466) },
+ { AOM_CDF4(6527, 16966, 23691) },
+ { AOM_CDF4(4114, 11303, 17220) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(20201, 30770, 32209) },
+ { AOM_CDF4(14754, 28071, 31258) },
+ { AOM_CDF4(8378, 20186, 26517) },
+ { AOM_CDF4(5916, 15299, 21978) },
+ { AOM_CDF4(4268, 11583, 17901) },
+ { AOM_CDF4(24361, 32025, 32581) },
+ { AOM_CDF4(18673, 30105, 31943) },
+ { AOM_CDF4(10196, 22244, 27576) },
+ { AOM_CDF4(5495, 14349, 20417) },
+ { AOM_CDF4(2676, 7415, 11498) },
+ { AOM_CDF4(24678, 31958, 32585) },
+ { AOM_CDF4(18629, 29906, 31831) },
+ { AOM_CDF4(9364, 20724, 26315) },
+ { AOM_CDF4(4641, 12318, 18094) },
+ { AOM_CDF4(2758, 7387, 11579) },
+ { AOM_CDF4(25433, 31842, 32469) },
+ { AOM_CDF4(18795, 29289, 31411) },
+ { AOM_CDF4(7644, 17584, 23592) },
+ { AOM_CDF4(3408, 9014, 15047) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4536, 10072, 14001) },
+ { AOM_CDF4(25459, 31416, 32206) },
+ { AOM_CDF4(16605, 28048, 30818) },
+ { AOM_CDF4(11008, 22857, 27719) },
+ { AOM_CDF4(6915, 16268, 22315) },
+ { AOM_CDF4(2625, 6812, 10537) },
+ { AOM_CDF4(24257, 31788, 32499) },
+ { AOM_CDF4(16880, 29454, 31879) },
+ { AOM_CDF4(11958, 25054, 29778) },
+ { AOM_CDF4(7916, 18718, 25084) },
+ { AOM_CDF4(3383, 8777, 13446) },
+ { AOM_CDF4(22720, 31603, 32393) },
+ { AOM_CDF4(14960, 28125, 31335) },
+ { AOM_CDF4(9731, 22210, 27928) },
+ { AOM_CDF4(6304, 15832, 22277) },
+ { AOM_CDF4(2910, 7818, 12166) },
+ { AOM_CDF4(20375, 30627, 32131) },
+ { AOM_CDF4(13904, 27284, 30887) },
+ { AOM_CDF4(9368, 21558, 27144) },
+ { AOM_CDF4(5937, 14966, 21119) },
+ { AOM_CDF4(2667, 7225, 11319) },
+ { AOM_CDF4(23970, 31470, 32378) },
+ { AOM_CDF4(17173, 29734, 32018) },
+ { AOM_CDF4(12795, 25441, 29965) },
+ { AOM_CDF4(8981, 19680, 25893) },
+ { AOM_CDF4(4728, 11372, 16902) },
+ { AOM_CDF4(24287, 31797, 32439) },
+ { AOM_CDF4(16703, 29145, 31696) },
+ { AOM_CDF4(10833, 23554, 28725) },
+ { AOM_CDF4(6468, 16566, 23057) },
+ { AOM_CDF4(2415, 6562, 10278) },
+ { AOM_CDF4(26610, 32395, 32659) },
+ { AOM_CDF4(18590, 30498, 32117) },
+ { AOM_CDF4(12420, 25756, 29950) },
+ { AOM_CDF4(7639, 18746, 24710) },
+ { AOM_CDF4(3001, 8086, 12347) },
+ { AOM_CDF4(25076, 32064, 32580) },
+ { AOM_CDF4(17946, 30128, 32028) },
+ { AOM_CDF4(12024, 24985, 29378) },
+ { AOM_CDF4(7517, 18390, 24304) },
+ { AOM_CDF4(3243, 8781, 13331) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6037, 16771, 21957) },
+ { AOM_CDF4(24774, 31704, 32426) },
+ { AOM_CDF4(16830, 28589, 31056) },
+ { AOM_CDF4(10602, 22828, 27760) },
+ { AOM_CDF4(6733, 16829, 23071) },
+ { AOM_CDF4(3250, 8914, 13556) },
+ { AOM_CDF4(25582, 32220, 32668) },
+ { AOM_CDF4(18659, 30342, 32223) },
+ { AOM_CDF4(12546, 26149, 30515) },
+ { AOM_CDF4(8420, 20451, 26801) },
+ { AOM_CDF4(4636, 12420, 18344) },
+ { AOM_CDF4(27581, 32362, 32639) },
+ { AOM_CDF4(18987, 30083, 31978) },
+ { AOM_CDF4(11327, 24248, 29084) },
+ { AOM_CDF4(7264, 17719, 24120) },
+ { AOM_CDF4(3995, 10768, 16169) },
+ { AOM_CDF4(25893, 31831, 32487) },
+ { AOM_CDF4(16577, 28587, 31379) },
+ { AOM_CDF4(10189, 22748, 28182) },
+ { AOM_CDF4(6832, 17094, 23556) },
+ { AOM_CDF4(3708, 10110, 15334) },
+ { AOM_CDF4(25904, 32282, 32656) },
+ { AOM_CDF4(19721, 30792, 32276) },
+ { AOM_CDF4(12819, 26243, 30411) },
+ { AOM_CDF4(8572, 20614, 26891) },
+ { AOM_CDF4(5364, 14059, 20467) },
+ { AOM_CDF4(26580, 32438, 32677) },
+ { AOM_CDF4(20852, 31225, 32340) },
+ { AOM_CDF4(12435, 25700, 29967) },
+ { AOM_CDF4(8691, 20825, 26976) },
+ { AOM_CDF4(4446, 12209, 17269) },
+ { AOM_CDF4(27350, 32429, 32696) },
+ { AOM_CDF4(21372, 30977, 32272) },
+ { AOM_CDF4(12673, 25270, 29853) },
+ { AOM_CDF4(9208, 20925, 26640) },
+ { AOM_CDF4(5018, 13351, 18732) },
+ { AOM_CDF4(27351, 32479, 32713) },
+ { AOM_CDF4(21398, 31209, 32387) },
+ { AOM_CDF4(12162, 25047, 29842) },
+ { AOM_CDF4(7896, 18691, 25319) },
+ { AOM_CDF4(4670, 12882, 18881) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5487, 10460, 13708) },
+ { AOM_CDF4(21597, 28303, 30674) },
+ { AOM_CDF4(11037, 21953, 26476) },
+ { AOM_CDF4(8147, 17962, 22952) },
+ { AOM_CDF4(5242, 13061, 18532) },
+ { AOM_CDF4(1889, 5208, 8182) },
+ { AOM_CDF4(26774, 32133, 32590) },
+ { AOM_CDF4(17844, 29564, 31767) },
+ { AOM_CDF4(11690, 24438, 29171) },
+ { AOM_CDF4(7542, 18215, 24459) },
+ { AOM_CDF4(2993, 8050, 12319) },
+ { AOM_CDF4(28023, 32328, 32591) },
+ { AOM_CDF4(18651, 30126, 31954) },
+ { AOM_CDF4(12164, 25146, 29589) },
+ { AOM_CDF4(7762, 18530, 24771) },
+ { AOM_CDF4(3492, 9183, 13920) },
+ { AOM_CDF4(27591, 32008, 32491) },
+ { AOM_CDF4(17149, 28853, 31510) },
+ { AOM_CDF4(11485, 24003, 28860) },
+ { AOM_CDF4(7697, 18086, 24210) },
+ { AOM_CDF4(3075, 7999, 12218) },
+ { AOM_CDF4(28268, 32482, 32654) },
+ { AOM_CDF4(19631, 31051, 32404) },
+ { AOM_CDF4(13860, 27260, 31020) },
+ { AOM_CDF4(9605, 21613, 27594) },
+ { AOM_CDF4(4876, 12162, 17908) },
+ { AOM_CDF4(27248, 32316, 32576) },
+ { AOM_CDF4(18955, 30457, 32075) },
+ { AOM_CDF4(11824, 23997, 28795) },
+ { AOM_CDF4(7346, 18196, 24647) },
+ { AOM_CDF4(3403, 9247, 14111) },
+ { AOM_CDF4(29711, 32655, 32735) },
+ { AOM_CDF4(21169, 31394, 32417) },
+ { AOM_CDF4(13487, 27198, 30957) },
+ { AOM_CDF4(8828, 21683, 27614) },
+ { AOM_CDF4(4270, 11451, 17038) },
+ { AOM_CDF4(28708, 32578, 32731) },
+ { AOM_CDF4(20120, 31241, 32482) },
+ { AOM_CDF4(13692, 27550, 31321) },
+ { AOM_CDF4(9418, 22514, 28439) },
+ { AOM_CDF4(4999, 13283, 19462) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(5673, 14302, 19711) },
+ { AOM_CDF4(26251, 30701, 31834) },
+ { AOM_CDF4(12782, 23783, 27803) },
+ { AOM_CDF4(9127, 20657, 25808) },
+ { AOM_CDF4(6368, 16208, 21462) },
+ { AOM_CDF4(2465, 7177, 10822) },
+ { AOM_CDF4(29961, 32563, 32719) },
+ { AOM_CDF4(18318, 29891, 31949) },
+ { AOM_CDF4(11361, 24514, 29357) },
+ { AOM_CDF4(7900, 19603, 25607) },
+ { AOM_CDF4(4002, 10590, 15546) },
+ { AOM_CDF4(29637, 32310, 32595) },
+ { AOM_CDF4(18296, 29913, 31809) },
+ { AOM_CDF4(10144, 21515, 26871) },
+ { AOM_CDF4(5358, 14322, 20394) },
+ { AOM_CDF4(3067, 8362, 13346) },
+ { AOM_CDF4(28652, 32470, 32676) },
+ { AOM_CDF4(17538, 30771, 32209) },
+ { AOM_CDF4(13924, 26882, 30494) },
+ { AOM_CDF4(10496, 22837, 27869) },
+ { AOM_CDF4(7236, 16396, 21621) },
+ { AOM_CDF4(30743, 32687, 32746) },
+ { AOM_CDF4(23006, 31676, 32489) },
+ { AOM_CDF4(14494, 27828, 31120) },
+ { AOM_CDF4(10174, 22801, 28352) },
+ { AOM_CDF4(6242, 15281, 21043) },
+ { AOM_CDF4(25817, 32243, 32720) },
+ { AOM_CDF4(18618, 31367, 32325) },
+ { AOM_CDF4(13997, 28318, 31878) },
+ { AOM_CDF4(12255, 26534, 31383) },
+ { AOM_CDF4(9561, 21588, 28450) },
+ { AOM_CDF4(28188, 32635, 32724) },
+ { AOM_CDF4(22060, 32365, 32728) },
+ { AOM_CDF4(18102, 30690, 32528) },
+ { AOM_CDF4(14196, 28864, 31999) },
+ { AOM_CDF4(12262, 25792, 30865) },
+ { AOM_CDF4(24176, 32109, 32628) },
+ { AOM_CDF4(18280, 29681, 31963) },
+ { AOM_CDF4(10205, 23703, 29664) },
+ { AOM_CDF4(7889, 20025, 27676) },
+ { AOM_CDF4(6060, 16743, 23970) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5141, 7096, 8260) },
+ { AOM_CDF4(27186, 29022, 29789) },
+ { AOM_CDF4(6668, 12568, 15682) },
+ { AOM_CDF4(2172, 6181, 8638) },
+ { AOM_CDF4(1126, 3379, 4531) },
+ { AOM_CDF4(443, 1361, 2254) },
+ { AOM_CDF4(26083, 31153, 32436) },
+ { AOM_CDF4(13486, 24603, 28483) },
+ { AOM_CDF4(6508, 14840, 19910) },
+ { AOM_CDF4(3386, 8800, 13286) },
+ { AOM_CDF4(1530, 4322, 7054) },
+ { AOM_CDF4(29639, 32080, 32548) },
+ { AOM_CDF4(15897, 27552, 30290) },
+ { AOM_CDF4(8588, 20047, 25383) },
+ { AOM_CDF4(4889, 13339, 19269) },
+ { AOM_CDF4(2240, 6871, 10498) },
+ { AOM_CDF4(28165, 32197, 32517) },
+ { AOM_CDF4(20735, 30427, 31568) },
+ { AOM_CDF4(14325, 24671, 27692) },
+ { AOM_CDF4(5119, 12554, 17805) },
+ { AOM_CDF4(1810, 5441, 8261) },
+ { AOM_CDF4(31212, 32724, 32748) },
+ { AOM_CDF4(23352, 31766, 32545) },
+ { AOM_CDF4(14669, 27570, 31059) },
+ { AOM_CDF4(8492, 20894, 27272) },
+ { AOM_CDF4(3644, 10194, 15204) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(2461, 7013, 9371) },
+ { AOM_CDF4(24749, 29600, 30986) },
+ { AOM_CDF4(9466, 19037, 22417) },
+ { AOM_CDF4(3584, 9280, 14400) },
+ { AOM_CDF4(1505, 3929, 5433) },
+ { AOM_CDF4(677, 1500, 2736) },
+ { AOM_CDF4(23987, 30702, 32117) },
+ { AOM_CDF4(13554, 24571, 29263) },
+ { AOM_CDF4(6211, 14556, 21155) },
+ { AOM_CDF4(3135, 10972, 15625) },
+ { AOM_CDF4(2435, 7127, 11427) },
+ { AOM_CDF4(31300, 32532, 32550) },
+ { AOM_CDF4(14757, 30365, 31954) },
+ { AOM_CDF4(4405, 11612, 18553) },
+ { AOM_CDF4(580, 4132, 7322) },
+ { AOM_CDF4(1695, 10169, 14124) },
+ { AOM_CDF4(30008, 32282, 32591) },
+ { AOM_CDF4(19244, 30108, 31748) },
+ { AOM_CDF4(11180, 24158, 29555) },
+ { AOM_CDF4(5650, 14972, 19209) },
+ { AOM_CDF4(2114, 5109, 8456) },
+ { AOM_CDF4(31856, 32716, 32748) },
+ { AOM_CDF4(23012, 31664, 32572) },
+ { AOM_CDF4(13694, 26656, 30636) },
+ { AOM_CDF4(8142, 19508, 26093) },
+ { AOM_CDF4(4253, 10955, 16724) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(601, 983, 1311) },
+ { AOM_CDF4(18725, 23406, 28087) },
+ { AOM_CDF4(5461, 8192, 10923) },
+ { AOM_CDF4(3781, 15124, 21425) },
+ { AOM_CDF4(2587, 7761, 12072) },
+ { AOM_CDF4(106, 458, 810) },
+ { AOM_CDF4(22282, 29710, 31894) },
+ { AOM_CDF4(8508, 20926, 25984) },
+ { AOM_CDF4(3726, 12713, 18083) },
+ { AOM_CDF4(1620, 7112, 10893) },
+ { AOM_CDF4(729, 2236, 3495) },
+ { AOM_CDF4(30163, 32474, 32684) },
+ { AOM_CDF4(18304, 30464, 32000) },
+ { AOM_CDF4(11443, 26526, 29647) },
+ { AOM_CDF4(6007, 15292, 21299) },
+ { AOM_CDF4(2234, 6703, 8937) },
+ { AOM_CDF4(30954, 32177, 32571) },
+ { AOM_CDF4(17363, 29562, 31076) },
+ { AOM_CDF4(9686, 22464, 27410) },
+ { AOM_CDF4(8192, 16384, 21390) },
+ { AOM_CDF4(1755, 8046, 11264) },
+ { AOM_CDF4(31168, 32734, 32748) },
+ { AOM_CDF4(22486, 31441, 32471) },
+ { AOM_CDF4(12833, 25627, 29738) },
+ { AOM_CDF4(6980, 17379, 23122) },
+ { AOM_CDF4(3111, 8887, 13479) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(6041, 11854, 15927) },
+ { AOM_CDF4(20326, 30905, 32251) },
+ { AOM_CDF4(14164, 26831, 30725) },
+ { AOM_CDF4(9760, 20647, 26585) },
+ { AOM_CDF4(6416, 14953, 21219) },
+ { AOM_CDF4(2966, 7151, 10891) },
+ { AOM_CDF4(23567, 31374, 32254) },
+ { AOM_CDF4(14978, 27416, 30946) },
+ { AOM_CDF4(9434, 20225, 26254) },
+ { AOM_CDF4(6658, 14558, 20535) },
+ { AOM_CDF4(3916, 8677, 12989) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(18088, 29545, 31587) },
+ { AOM_CDF4(13062, 25843, 30073) },
+ { AOM_CDF4(8940, 16827, 22251) },
+ { AOM_CDF4(7654, 13220, 17973) },
+ { AOM_CDF4(5733, 10316, 14456) },
+ { AOM_CDF4(22879, 31388, 32114) },
+ { AOM_CDF4(15215, 27993, 30955) },
+ { AOM_CDF4(9397, 19445, 24978) },
+ { AOM_CDF4(3442, 9813, 15344) },
+ { AOM_CDF4(1368, 3936, 6532) },
+ { AOM_CDF4(25494, 32033, 32406) },
+ { AOM_CDF4(16772, 27963, 30718) },
+ { AOM_CDF4(9419, 18165, 23260) },
+ { AOM_CDF4(2677, 7501, 11797) },
+ { AOM_CDF4(1516, 4344, 7170) },
+ { AOM_CDF4(26556, 31454, 32101) },
+ { AOM_CDF4(17128, 27035, 30108) },
+ { AOM_CDF4(8324, 15344, 20249) },
+ { AOM_CDF4(1903, 5696, 9469) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8455, 19003, 24368) },
+ { AOM_CDF4(23563, 32021, 32604) },
+ { AOM_CDF4(16237, 29446, 31935) },
+ { AOM_CDF4(10724, 23999, 29358) },
+ { AOM_CDF4(6725, 17528, 24416) },
+ { AOM_CDF4(3927, 10927, 16825) },
+ { AOM_CDF4(26313, 32288, 32634) },
+ { AOM_CDF4(17430, 30095, 32095) },
+ { AOM_CDF4(11116, 24606, 29679) },
+ { AOM_CDF4(7195, 18384, 25269) },
+ { AOM_CDF4(4726, 12852, 19315) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(22822, 31648, 32483) },
+ { AOM_CDF4(16724, 29633, 31929) },
+ { AOM_CDF4(10261, 23033, 28725) },
+ { AOM_CDF4(7029, 17840, 24528) },
+ { AOM_CDF4(4867, 13886, 21502) },
+ { AOM_CDF4(25298, 31892, 32491) },
+ { AOM_CDF4(17809, 29330, 31512) },
+ { AOM_CDF4(9668, 21329, 26579) },
+ { AOM_CDF4(4774, 12956, 18976) },
+ { AOM_CDF4(2322, 7030, 11540) },
+ { AOM_CDF4(25472, 31920, 32543) },
+ { AOM_CDF4(17957, 29387, 31632) },
+ { AOM_CDF4(9196, 20593, 26400) },
+ { AOM_CDF4(4680, 12705, 19202) },
+ { AOM_CDF4(2917, 8456, 13436) },
+ { AOM_CDF4(26471, 32059, 32574) },
+ { AOM_CDF4(18458, 29783, 31909) },
+ { AOM_CDF4(8400, 19464, 25956) },
+ { AOM_CDF4(3812, 10973, 17206) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(6779, 13743, 17678) },
+ { AOM_CDF4(24806, 31797, 32457) },
+ { AOM_CDF4(17616, 29047, 31372) },
+ { AOM_CDF4(11063, 23175, 28003) },
+ { AOM_CDF4(6521, 16110, 22324) },
+ { AOM_CDF4(2764, 7504, 11654) },
+ { AOM_CDF4(25266, 32367, 32637) },
+ { AOM_CDF4(19054, 30553, 32175) },
+ { AOM_CDF4(12139, 25212, 29807) },
+ { AOM_CDF4(7311, 18162, 24704) },
+ { AOM_CDF4(3397, 9164, 14074) },
+ { AOM_CDF4(25988, 32208, 32522) },
+ { AOM_CDF4(16253, 28912, 31526) },
+ { AOM_CDF4(9151, 21387, 27372) },
+ { AOM_CDF4(5688, 14915, 21496) },
+ { AOM_CDF4(2717, 7627, 12004) },
+ { AOM_CDF4(23144, 31855, 32443) },
+ { AOM_CDF4(16070, 28491, 31325) },
+ { AOM_CDF4(8702, 20467, 26517) },
+ { AOM_CDF4(5243, 13956, 20367) },
+ { AOM_CDF4(2621, 7335, 11567) },
+ { AOM_CDF4(26636, 32340, 32630) },
+ { AOM_CDF4(19990, 31050, 32341) },
+ { AOM_CDF4(13243, 26105, 30315) },
+ { AOM_CDF4(8588, 19521, 25918) },
+ { AOM_CDF4(4717, 11585, 17304) },
+ { AOM_CDF4(25844, 32292, 32582) },
+ { AOM_CDF4(19090, 30635, 32097) },
+ { AOM_CDF4(11963, 24546, 28939) },
+ { AOM_CDF4(6218, 16087, 22354) },
+ { AOM_CDF4(2340, 6608, 10426) },
+ { AOM_CDF4(28046, 32576, 32694) },
+ { AOM_CDF4(21178, 31313, 32296) },
+ { AOM_CDF4(13486, 26184, 29870) },
+ { AOM_CDF4(7149, 17871, 23723) },
+ { AOM_CDF4(2833, 7958, 12259) },
+ { AOM_CDF4(27710, 32528, 32686) },
+ { AOM_CDF4(20674, 31076, 32268) },
+ { AOM_CDF4(12413, 24955, 29243) },
+ { AOM_CDF4(6676, 16927, 23097) },
+ { AOM_CDF4(2966, 8333, 12919) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8639, 19339, 24429) },
+ { AOM_CDF4(24404, 31837, 32525) },
+ { AOM_CDF4(16997, 29425, 31784) },
+ { AOM_CDF4(11253, 24234, 29149) },
+ { AOM_CDF4(6751, 17394, 24028) },
+ { AOM_CDF4(3490, 9830, 15191) },
+ { AOM_CDF4(26283, 32471, 32714) },
+ { AOM_CDF4(19599, 31168, 32442) },
+ { AOM_CDF4(13146, 26954, 30893) },
+ { AOM_CDF4(8214, 20588, 26890) },
+ { AOM_CDF4(4699, 13081, 19300) },
+ { AOM_CDF4(28212, 32458, 32669) },
+ { AOM_CDF4(18594, 30316, 32100) },
+ { AOM_CDF4(11219, 24408, 29234) },
+ { AOM_CDF4(6865, 17656, 24149) },
+ { AOM_CDF4(3678, 10362, 16006) },
+ { AOM_CDF4(25825, 32136, 32616) },
+ { AOM_CDF4(17313, 29853, 32021) },
+ { AOM_CDF4(11197, 24471, 29472) },
+ { AOM_CDF4(6947, 17781, 24405) },
+ { AOM_CDF4(3768, 10660, 16261) },
+ { AOM_CDF4(27352, 32500, 32706) },
+ { AOM_CDF4(20850, 31468, 32469) },
+ { AOM_CDF4(14021, 27707, 31133) },
+ { AOM_CDF4(8964, 21748, 27838) },
+ { AOM_CDF4(5437, 14665, 21187) },
+ { AOM_CDF4(26304, 32492, 32698) },
+ { AOM_CDF4(20409, 31380, 32385) },
+ { AOM_CDF4(13682, 27222, 30632) },
+ { AOM_CDF4(8974, 21236, 26685) },
+ { AOM_CDF4(4234, 11665, 16934) },
+ { AOM_CDF4(26273, 32357, 32711) },
+ { AOM_CDF4(20672, 31242, 32441) },
+ { AOM_CDF4(14172, 27254, 30902) },
+ { AOM_CDF4(9870, 21898, 27275) },
+ { AOM_CDF4(5164, 13506, 19270) },
+ { AOM_CDF4(26725, 32459, 32728) },
+ { AOM_CDF4(20991, 31442, 32527) },
+ { AOM_CDF4(13071, 26434, 30811) },
+ { AOM_CDF4(8184, 20090, 26742) },
+ { AOM_CDF4(4803, 13255, 19895) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7555, 14942, 18501) },
+ { AOM_CDF4(24410, 31178, 32287) },
+ { AOM_CDF4(14394, 26738, 30253) },
+ { AOM_CDF4(8413, 19554, 25195) },
+ { AOM_CDF4(4766, 12924, 18785) },
+ { AOM_CDF4(2029, 5806, 9207) },
+ { AOM_CDF4(26776, 32364, 32663) },
+ { AOM_CDF4(18732, 29967, 31931) },
+ { AOM_CDF4(11005, 23786, 28852) },
+ { AOM_CDF4(6466, 16909, 23510) },
+ { AOM_CDF4(3044, 8638, 13419) },
+ { AOM_CDF4(29208, 32582, 32704) },
+ { AOM_CDF4(20068, 30857, 32208) },
+ { AOM_CDF4(12003, 25085, 29595) },
+ { AOM_CDF4(6947, 17750, 24189) },
+ { AOM_CDF4(3245, 9103, 14007) },
+ { AOM_CDF4(27359, 32465, 32669) },
+ { AOM_CDF4(19421, 30614, 32174) },
+ { AOM_CDF4(11915, 25010, 29579) },
+ { AOM_CDF4(6950, 17676, 24074) },
+ { AOM_CDF4(3007, 8473, 13096) },
+ { AOM_CDF4(29002, 32676, 32735) },
+ { AOM_CDF4(22102, 31849, 32576) },
+ { AOM_CDF4(14408, 28009, 31405) },
+ { AOM_CDF4(9027, 21679, 27931) },
+ { AOM_CDF4(4694, 12678, 18748) },
+ { AOM_CDF4(28216, 32528, 32682) },
+ { AOM_CDF4(20849, 31264, 32318) },
+ { AOM_CDF4(12756, 25815, 29751) },
+ { AOM_CDF4(7565, 18801, 24923) },
+ { AOM_CDF4(3509, 9533, 14477) },
+ { AOM_CDF4(30133, 32687, 32739) },
+ { AOM_CDF4(23063, 31910, 32515) },
+ { AOM_CDF4(14588, 28051, 31132) },
+ { AOM_CDF4(9085, 21649, 27457) },
+ { AOM_CDF4(4261, 11654, 17264) },
+ { AOM_CDF4(29518, 32691, 32748) },
+ { AOM_CDF4(22451, 31959, 32613) },
+ { AOM_CDF4(14864, 28722, 31700) },
+ { AOM_CDF4(9695, 22964, 28716) },
+ { AOM_CDF4(4932, 13358, 19502) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6465, 16958, 21688) },
+ { AOM_CDF4(25199, 31514, 32360) },
+ { AOM_CDF4(14774, 27149, 30607) },
+ { AOM_CDF4(9257, 21438, 26972) },
+ { AOM_CDF4(5723, 15183, 21882) },
+ { AOM_CDF4(3150, 8879, 13731) },
+ { AOM_CDF4(26989, 32262, 32682) },
+ { AOM_CDF4(17396, 29937, 32085) },
+ { AOM_CDF4(11387, 24901, 29784) },
+ { AOM_CDF4(7289, 18821, 25548) },
+ { AOM_CDF4(3734, 10577, 16086) },
+ { AOM_CDF4(29728, 32501, 32695) },
+ { AOM_CDF4(17431, 29701, 31903) },
+ { AOM_CDF4(9921, 22826, 28300) },
+ { AOM_CDF4(5896, 15434, 22068) },
+ { AOM_CDF4(3430, 9646, 14757) },
+ { AOM_CDF4(28614, 32511, 32705) },
+ { AOM_CDF4(19364, 30638, 32263) },
+ { AOM_CDF4(13129, 26254, 30402) },
+ { AOM_CDF4(8754, 20484, 26440) },
+ { AOM_CDF4(4378, 11607, 17110) },
+ { AOM_CDF4(30292, 32671, 32744) },
+ { AOM_CDF4(21780, 31603, 32501) },
+ { AOM_CDF4(14314, 27829, 31291) },
+ { AOM_CDF4(9611, 22327, 28263) },
+ { AOM_CDF4(4890, 13087, 19065) },
+ { AOM_CDF4(25862, 32567, 32733) },
+ { AOM_CDF4(20794, 32050, 32567) },
+ { AOM_CDF4(17243, 30625, 32254) },
+ { AOM_CDF4(13283, 27628, 31474) },
+ { AOM_CDF4(9669, 22532, 28918) },
+ { AOM_CDF4(27435, 32697, 32748) },
+ { AOM_CDF4(24922, 32390, 32714) },
+ { AOM_CDF4(21449, 31504, 32536) },
+ { AOM_CDF4(16392, 29729, 31832) },
+ { AOM_CDF4(11692, 24884, 29076) },
+ { AOM_CDF4(24193, 32290, 32735) },
+ { AOM_CDF4(18909, 31104, 32563) },
+ { AOM_CDF4(12236, 26841, 31403) },
+ { AOM_CDF4(8171, 21840, 29082) },
+ { AOM_CDF4(7224, 17280, 25275) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(3078, 6839, 9890) },
+ { AOM_CDF4(13837, 20450, 24479) },
+ { AOM_CDF4(5914, 14222, 19328) },
+ { AOM_CDF4(3866, 10267, 14762) },
+ { AOM_CDF4(2612, 7208, 11042) },
+ { AOM_CDF4(1067, 2991, 4776) },
+ { AOM_CDF4(25817, 31646, 32529) },
+ { AOM_CDF4(13708, 26338, 30385) },
+ { AOM_CDF4(7328, 18585, 24870) },
+ { AOM_CDF4(4691, 13080, 19276) },
+ { AOM_CDF4(1825, 5253, 8352) },
+ { AOM_CDF4(29386, 32315, 32624) },
+ { AOM_CDF4(17160, 29001, 31360) },
+ { AOM_CDF4(9602, 21862, 27396) },
+ { AOM_CDF4(5915, 15772, 22148) },
+ { AOM_CDF4(2786, 7779, 12047) },
+ { AOM_CDF4(29246, 32450, 32663) },
+ { AOM_CDF4(18696, 29929, 31818) },
+ { AOM_CDF4(10510, 23369, 28560) },
+ { AOM_CDF4(6229, 16499, 23125) },
+ { AOM_CDF4(2608, 7448, 11705) },
+ { AOM_CDF4(30753, 32710, 32748) },
+ { AOM_CDF4(21638, 31487, 32503) },
+ { AOM_CDF4(12937, 26854, 30870) },
+ { AOM_CDF4(8182, 20596, 26970) },
+ { AOM_CDF4(3637, 10269, 15497) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(5244, 12150, 16906) },
+ { AOM_CDF4(20486, 26858, 29701) },
+ { AOM_CDF4(7756, 18317, 23735) },
+ { AOM_CDF4(3452, 9256, 13146) },
+ { AOM_CDF4(2020, 5206, 8229) },
+ { AOM_CDF4(1801, 4993, 7903) },
+ { AOM_CDF4(27051, 31858, 32531) },
+ { AOM_CDF4(15988, 27531, 30619) },
+ { AOM_CDF4(9188, 21484, 26719) },
+ { AOM_CDF4(6273, 17186, 23800) },
+ { AOM_CDF4(3108, 9355, 14764) },
+ { AOM_CDF4(31076, 32520, 32680) },
+ { AOM_CDF4(18119, 30037, 31850) },
+ { AOM_CDF4(10244, 22969, 27472) },
+ { AOM_CDF4(4692, 14077, 19273) },
+ { AOM_CDF4(3694, 11677, 17556) },
+ { AOM_CDF4(30060, 32581, 32720) },
+ { AOM_CDF4(21011, 30775, 32120) },
+ { AOM_CDF4(11931, 24820, 29289) },
+ { AOM_CDF4(7119, 17662, 24356) },
+ { AOM_CDF4(3833, 10706, 16304) },
+ { AOM_CDF4(31954, 32731, 32748) },
+ { AOM_CDF4(23913, 31724, 32489) },
+ { AOM_CDF4(15520, 28060, 31286) },
+ { AOM_CDF4(11517, 23008, 28571) },
+ { AOM_CDF4(6193, 14508, 20629) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(1035, 2807, 4156) },
+ { AOM_CDF4(13162, 18138, 20939) },
+ { AOM_CDF4(2696, 6633, 8755) },
+ { AOM_CDF4(1373, 4161, 6853) },
+ { AOM_CDF4(1099, 2746, 4716) },
+ { AOM_CDF4(340, 1021, 1599) },
+ { AOM_CDF4(22826, 30419, 32135) },
+ { AOM_CDF4(10395, 21762, 26942) },
+ { AOM_CDF4(4726, 12407, 17361) },
+ { AOM_CDF4(2447, 7080, 10593) },
+ { AOM_CDF4(1227, 3717, 6011) },
+ { AOM_CDF4(28156, 31424, 31934) },
+ { AOM_CDF4(16915, 27754, 30373) },
+ { AOM_CDF4(9148, 20990, 26431) },
+ { AOM_CDF4(5950, 15515, 21148) },
+ { AOM_CDF4(2492, 7327, 11526) },
+ { AOM_CDF4(30602, 32477, 32670) },
+ { AOM_CDF4(20026, 29955, 31568) },
+ { AOM_CDF4(11220, 23628, 28105) },
+ { AOM_CDF4(6652, 17019, 22973) },
+ { AOM_CDF4(3064, 8536, 13043) },
+ { AOM_CDF4(31769, 32724, 32748) },
+ { AOM_CDF4(22230, 30887, 32373) },
+ { AOM_CDF4(12234, 25079, 29731) },
+ { AOM_CDF4(7326, 18816, 25353) },
+ { AOM_CDF4(3933, 10907, 16616) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(8896, 16227, 20630) },
+ { AOM_CDF4(23629, 31782, 32527) },
+ { AOM_CDF4(15173, 27755, 31321) },
+ { AOM_CDF4(10158, 21233, 27382) },
+ { AOM_CDF4(6420, 14857, 21558) },
+ { AOM_CDF4(3269, 8155, 12646) },
+ { AOM_CDF4(24835, 32009, 32496) },
+ { AOM_CDF4(16509, 28421, 31579) },
+ { AOM_CDF4(10957, 21514, 27418) },
+ { AOM_CDF4(7881, 15930, 22096) },
+ { AOM_CDF4(5388, 10960, 15918) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(20745, 30773, 32093) },
+ { AOM_CDF4(15200, 27221, 30861) },
+ { AOM_CDF4(13032, 20873, 25667) },
+ { AOM_CDF4(12285, 18663, 23494) },
+ { AOM_CDF4(11563, 17481, 21489) },
+ { AOM_CDF4(26260, 31982, 32320) },
+ { AOM_CDF4(15397, 28083, 31100) },
+ { AOM_CDF4(9742, 19217, 24824) },
+ { AOM_CDF4(3261, 9629, 15362) },
+ { AOM_CDF4(1480, 4322, 7499) },
+ { AOM_CDF4(27599, 32256, 32460) },
+ { AOM_CDF4(16857, 27659, 30774) },
+ { AOM_CDF4(9551, 18290, 23748) },
+ { AOM_CDF4(3052, 8933, 14103) },
+ { AOM_CDF4(2021, 5910, 9787) },
+ { AOM_CDF4(29005, 32015, 32392) },
+ { AOM_CDF4(17677, 27694, 30863) },
+ { AOM_CDF4(9204, 17356, 23219) },
+ { AOM_CDF4(2403, 7516, 12814) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(10808, 22056, 26896) },
+ { AOM_CDF4(25739, 32313, 32676) },
+ { AOM_CDF4(17288, 30203, 32221) },
+ { AOM_CDF4(11359, 24878, 29896) },
+ { AOM_CDF4(6949, 17767, 24893) },
+ { AOM_CDF4(4287, 11796, 18071) },
+ { AOM_CDF4(27880, 32521, 32705) },
+ { AOM_CDF4(19038, 31004, 32414) },
+ { AOM_CDF4(12564, 26345, 30768) },
+ { AOM_CDF4(8269, 19947, 26779) },
+ { AOM_CDF4(5674, 14657, 21674) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(25742, 32319, 32671) },
+ { AOM_CDF4(19557, 31164, 32454) },
+ { AOM_CDF4(13381, 26381, 30755) },
+ { AOM_CDF4(10101, 21466, 26722) },
+ { AOM_CDF4(9209, 19650, 26825) },
+ { AOM_CDF4(27107, 31917, 32432) },
+ { AOM_CDF4(18056, 28893, 31203) },
+ { AOM_CDF4(10200, 21434, 26764) },
+ { AOM_CDF4(4660, 12913, 19502) },
+ { AOM_CDF4(2368, 6930, 12504) },
+ { AOM_CDF4(26960, 32158, 32613) },
+ { AOM_CDF4(18628, 30005, 32031) },
+ { AOM_CDF4(10233, 22442, 28232) },
+ { AOM_CDF4(5471, 14630, 21516) },
+ { AOM_CDF4(3235, 10767, 17109) },
+ { AOM_CDF4(27696, 32440, 32692) },
+ { AOM_CDF4(20032, 31167, 32438) },
+ { AOM_CDF4(8700, 21341, 28442) },
+ { AOM_CDF4(5662, 14831, 21795) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(9704, 17294, 21132) },
+ { AOM_CDF4(26762, 32278, 32633) },
+ { AOM_CDF4(18382, 29620, 31819) },
+ { AOM_CDF4(10891, 23475, 28723) },
+ { AOM_CDF4(6358, 16583, 23309) },
+ { AOM_CDF4(3248, 9118, 14141) },
+ { AOM_CDF4(27204, 32573, 32699) },
+ { AOM_CDF4(19818, 30824, 32329) },
+ { AOM_CDF4(11772, 25120, 30041) },
+ { AOM_CDF4(6995, 18033, 25039) },
+ { AOM_CDF4(3752, 10442, 16098) },
+ { AOM_CDF4(27222, 32256, 32559) },
+ { AOM_CDF4(15356, 28399, 31475) },
+ { AOM_CDF4(8821, 20635, 27057) },
+ { AOM_CDF4(5511, 14404, 21239) },
+ { AOM_CDF4(2935, 8222, 13051) },
+ { AOM_CDF4(24875, 32120, 32529) },
+ { AOM_CDF4(15233, 28265, 31445) },
+ { AOM_CDF4(8605, 20570, 26932) },
+ { AOM_CDF4(5431, 14413, 21196) },
+ { AOM_CDF4(2994, 8341, 13223) },
+ { AOM_CDF4(28201, 32604, 32700) },
+ { AOM_CDF4(21041, 31446, 32456) },
+ { AOM_CDF4(13221, 26213, 30475) },
+ { AOM_CDF4(8255, 19385, 26037) },
+ { AOM_CDF4(4930, 12585, 18830) },
+ { AOM_CDF4(28768, 32448, 32627) },
+ { AOM_CDF4(19705, 30561, 32021) },
+ { AOM_CDF4(11572, 23589, 28220) },
+ { AOM_CDF4(5532, 15034, 21446) },
+ { AOM_CDF4(2460, 7150, 11456) },
+ { AOM_CDF4(29874, 32619, 32699) },
+ { AOM_CDF4(21621, 31071, 32201) },
+ { AOM_CDF4(12511, 24747, 28992) },
+ { AOM_CDF4(6281, 16395, 22748) },
+ { AOM_CDF4(3246, 9278, 14497) },
+ { AOM_CDF4(29715, 32625, 32712) },
+ { AOM_CDF4(20958, 31011, 32283) },
+ { AOM_CDF4(11233, 23671, 28806) },
+ { AOM_CDF4(6012, 16128, 22868) },
+ { AOM_CDF4(3427, 9851, 15414) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(11016, 22111, 26794) },
+ { AOM_CDF4(25946, 32357, 32677) },
+ { AOM_CDF4(17890, 30452, 32252) },
+ { AOM_CDF4(11678, 25142, 29816) },
+ { AOM_CDF4(6720, 17534, 24584) },
+ { AOM_CDF4(4230, 11665, 17820) },
+ { AOM_CDF4(28400, 32623, 32747) },
+ { AOM_CDF4(21164, 31668, 32575) },
+ { AOM_CDF4(13572, 27388, 31182) },
+ { AOM_CDF4(8234, 20750, 27358) },
+ { AOM_CDF4(5065, 14055, 20897) },
+ { AOM_CDF4(28981, 32547, 32705) },
+ { AOM_CDF4(18681, 30543, 32239) },
+ { AOM_CDF4(10919, 24075, 29286) },
+ { AOM_CDF4(6431, 17199, 24077) },
+ { AOM_CDF4(3819, 10464, 16618) },
+ { AOM_CDF4(26870, 32467, 32693) },
+ { AOM_CDF4(19041, 30831, 32347) },
+ { AOM_CDF4(11794, 25211, 30016) },
+ { AOM_CDF4(6888, 18019, 24970) },
+ { AOM_CDF4(4370, 12363, 18992) },
+ { AOM_CDF4(29578, 32670, 32744) },
+ { AOM_CDF4(23159, 32007, 32613) },
+ { AOM_CDF4(15315, 28669, 31676) },
+ { AOM_CDF4(9298, 22607, 28782) },
+ { AOM_CDF4(6144, 15913, 22968) },
+ { AOM_CDF4(28110, 32499, 32669) },
+ { AOM_CDF4(21574, 30937, 32015) },
+ { AOM_CDF4(12759, 24818, 28727) },
+ { AOM_CDF4(6545, 16761, 23042) },
+ { AOM_CDF4(3649, 10597, 16833) },
+ { AOM_CDF4(28163, 32552, 32728) },
+ { AOM_CDF4(22101, 31469, 32464) },
+ { AOM_CDF4(13160, 25472, 30143) },
+ { AOM_CDF4(7303, 18684, 25468) },
+ { AOM_CDF4(5241, 13975, 20955) },
+ { AOM_CDF4(28400, 32631, 32744) },
+ { AOM_CDF4(22104, 31793, 32603) },
+ { AOM_CDF4(13557, 26571, 30846) },
+ { AOM_CDF4(7749, 19861, 26675) },
+ { AOM_CDF4(4873, 14030, 21234) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(9800, 17635, 21073) },
+ { AOM_CDF4(26153, 31885, 32527) },
+ { AOM_CDF4(15038, 27852, 31006) },
+ { AOM_CDF4(8718, 20564, 26486) },
+ { AOM_CDF4(5128, 14076, 20514) },
+ { AOM_CDF4(2636, 7566, 11925) },
+ { AOM_CDF4(27551, 32504, 32701) },
+ { AOM_CDF4(18310, 30054, 32100) },
+ { AOM_CDF4(10211, 23420, 29082) },
+ { AOM_CDF4(6222, 16876, 23916) },
+ { AOM_CDF4(3462, 9954, 15498) },
+ { AOM_CDF4(29991, 32633, 32721) },
+ { AOM_CDF4(19883, 30751, 32201) },
+ { AOM_CDF4(11141, 24184, 29285) },
+ { AOM_CDF4(6420, 16940, 23774) },
+ { AOM_CDF4(3392, 9753, 15118) },
+ { AOM_CDF4(28465, 32616, 32712) },
+ { AOM_CDF4(19850, 30702, 32244) },
+ { AOM_CDF4(10983, 24024, 29223) },
+ { AOM_CDF4(6294, 16770, 23582) },
+ { AOM_CDF4(3244, 9283, 14509) },
+ { AOM_CDF4(30023, 32717, 32748) },
+ { AOM_CDF4(22940, 32032, 32626) },
+ { AOM_CDF4(14282, 27928, 31473) },
+ { AOM_CDF4(8562, 21327, 27914) },
+ { AOM_CDF4(4846, 13393, 19919) },
+ { AOM_CDF4(29981, 32590, 32695) },
+ { AOM_CDF4(20465, 30963, 32166) },
+ { AOM_CDF4(11479, 23579, 28195) },
+ { AOM_CDF4(5916, 15648, 22073) },
+ { AOM_CDF4(3031, 8605, 13398) },
+ { AOM_CDF4(31146, 32691, 32739) },
+ { AOM_CDF4(23106, 31724, 32444) },
+ { AOM_CDF4(13783, 26738, 30439) },
+ { AOM_CDF4(7852, 19468, 25807) },
+ { AOM_CDF4(3860, 11124, 16853) },
+ { AOM_CDF4(31014, 32724, 32748) },
+ { AOM_CDF4(23629, 32109, 32628) },
+ { AOM_CDF4(14747, 28115, 31403) },
+ { AOM_CDF4(8545, 21242, 27478) },
+ { AOM_CDF4(4574, 12781, 19067) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(9185, 19694, 24688) },
+ { AOM_CDF4(26081, 31985, 32621) },
+ { AOM_CDF4(16015, 29000, 31787) },
+ { AOM_CDF4(10542, 23690, 29206) },
+ { AOM_CDF4(6732, 17945, 24677) },
+ { AOM_CDF4(3916, 11039, 16722) },
+ { AOM_CDF4(28224, 32566, 32744) },
+ { AOM_CDF4(19100, 31138, 32485) },
+ { AOM_CDF4(12528, 26620, 30879) },
+ { AOM_CDF4(7741, 20277, 26885) },
+ { AOM_CDF4(4566, 12845, 18990) },
+ { AOM_CDF4(29933, 32593, 32718) },
+ { AOM_CDF4(17670, 30333, 32155) },
+ { AOM_CDF4(10385, 23600, 28909) },
+ { AOM_CDF4(6243, 16236, 22407) },
+ { AOM_CDF4(3976, 10389, 16017) },
+ { AOM_CDF4(28377, 32561, 32738) },
+ { AOM_CDF4(19366, 31175, 32482) },
+ { AOM_CDF4(13327, 27175, 31094) },
+ { AOM_CDF4(8258, 20769, 27143) },
+ { AOM_CDF4(4703, 13198, 19527) },
+ { AOM_CDF4(31086, 32706, 32748) },
+ { AOM_CDF4(22853, 31902, 32583) },
+ { AOM_CDF4(14759, 28186, 31419) },
+ { AOM_CDF4(9284, 22382, 28348) },
+ { AOM_CDF4(5585, 15192, 21868) },
+ { AOM_CDF4(28291, 32652, 32746) },
+ { AOM_CDF4(19849, 32107, 32571) },
+ { AOM_CDF4(14834, 26818, 29214) },
+ { AOM_CDF4(10306, 22594, 28672) },
+ { AOM_CDF4(6615, 17384, 23384) },
+ { AOM_CDF4(28947, 32604, 32745) },
+ { AOM_CDF4(25625, 32289, 32646) },
+ { AOM_CDF4(18758, 28672, 31403) },
+ { AOM_CDF4(10017, 23430, 28523) },
+ { AOM_CDF4(6862, 15269, 22131) },
+ { AOM_CDF4(23933, 32509, 32739) },
+ { AOM_CDF4(19927, 31495, 32631) },
+ { AOM_CDF4(11903, 26023, 30621) },
+ { AOM_CDF4(7026, 20094, 27252) },
+ { AOM_CDF4(5998, 18106, 24437) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4456, 11274, 15533) },
+ { AOM_CDF4(21219, 29079, 31616) },
+ { AOM_CDF4(11173, 23774, 28567) },
+ { AOM_CDF4(7282, 18293, 24263) },
+ { AOM_CDF4(4890, 13286, 19115) },
+ { AOM_CDF4(1890, 5508, 8659) },
+ { AOM_CDF4(26651, 32136, 32647) },
+ { AOM_CDF4(14630, 28254, 31455) },
+ { AOM_CDF4(8716, 21287, 27395) },
+ { AOM_CDF4(5615, 15331, 22008) },
+ { AOM_CDF4(2675, 7700, 12150) },
+ { AOM_CDF4(29954, 32526, 32690) },
+ { AOM_CDF4(16126, 28982, 31633) },
+ { AOM_CDF4(9030, 21361, 27352) },
+ { AOM_CDF4(5411, 14793, 21271) },
+ { AOM_CDF4(2943, 8422, 13163) },
+ { AOM_CDF4(29539, 32601, 32730) },
+ { AOM_CDF4(18125, 30385, 32201) },
+ { AOM_CDF4(10422, 24090, 29468) },
+ { AOM_CDF4(6468, 17487, 24438) },
+ { AOM_CDF4(2970, 8653, 13531) },
+ { AOM_CDF4(30912, 32715, 32748) },
+ { AOM_CDF4(20666, 31373, 32497) },
+ { AOM_CDF4(12509, 26640, 30917) },
+ { AOM_CDF4(8058, 20629, 27290) },
+ { AOM_CDF4(4231, 12006, 18052) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(10202, 20633, 25484) },
+ { AOM_CDF4(27336, 31445, 32352) },
+ { AOM_CDF4(12420, 24384, 28552) },
+ { AOM_CDF4(7648, 18115, 23856) },
+ { AOM_CDF4(5662, 14341, 19902) },
+ { AOM_CDF4(3611, 10328, 15390) },
+ { AOM_CDF4(30945, 32616, 32736) },
+ { AOM_CDF4(18682, 30505, 32253) },
+ { AOM_CDF4(11513, 25336, 30203) },
+ { AOM_CDF4(7449, 19452, 26148) },
+ { AOM_CDF4(4482, 13051, 18886) },
+ { AOM_CDF4(32022, 32690, 32747) },
+ { AOM_CDF4(18578, 30501, 32146) },
+ { AOM_CDF4(11249, 23368, 28631) },
+ { AOM_CDF4(5645, 16958, 22158) },
+ { AOM_CDF4(5009, 11444, 16637) },
+ { AOM_CDF4(31357, 32710, 32748) },
+ { AOM_CDF4(21552, 31494, 32504) },
+ { AOM_CDF4(13891, 27677, 31340) },
+ { AOM_CDF4(9051, 22098, 28172) },
+ { AOM_CDF4(5190, 13377, 19486) },
+ { AOM_CDF4(32364, 32740, 32748) },
+ { AOM_CDF4(24839, 31907, 32551) },
+ { AOM_CDF4(17160, 28779, 31696) },
+ { AOM_CDF4(12452, 24137, 29602) },
+ { AOM_CDF4(6165, 15389, 22477) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(2575, 7281, 11077) },
+ { AOM_CDF4(14002, 20866, 25402) },
+ { AOM_CDF4(6343, 15056, 19658) },
+ { AOM_CDF4(4474, 11858, 17041) },
+ { AOM_CDF4(2865, 8299, 12534) },
+ { AOM_CDF4(1344, 3949, 6391) },
+ { AOM_CDF4(24720, 31239, 32459) },
+ { AOM_CDF4(12585, 25356, 29968) },
+ { AOM_CDF4(7181, 18246, 24444) },
+ { AOM_CDF4(5025, 13667, 19885) },
+ { AOM_CDF4(2521, 7304, 11605) },
+ { AOM_CDF4(29908, 32252, 32584) },
+ { AOM_CDF4(17421, 29156, 31575) },
+ { AOM_CDF4(9889, 22188, 27782) },
+ { AOM_CDF4(5878, 15647, 22123) },
+ { AOM_CDF4(2814, 8665, 13323) },
+ { AOM_CDF4(30183, 32568, 32713) },
+ { AOM_CDF4(18528, 30195, 32049) },
+ { AOM_CDF4(10982, 24606, 29657) },
+ { AOM_CDF4(6957, 18165, 25231) },
+ { AOM_CDF4(3508, 10118, 15468) },
+ { AOM_CDF4(31761, 32736, 32748) },
+ { AOM_CDF4(21041, 31328, 32546) },
+ { AOM_CDF4(12568, 26732, 31166) },
+ { AOM_CDF4(8052, 20720, 27733) },
+ { AOM_CDF4(4336, 12192, 18396) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(7062, 16472, 22319) },
+ { AOM_CDF4(24538, 32261, 32674) },
+ { AOM_CDF4(13675, 28041, 31779) },
+ { AOM_CDF4(8590, 20674, 27631) },
+ { AOM_CDF4(5685, 14675, 22013) },
+ { AOM_CDF4(3655, 9898, 15731) },
+ { AOM_CDF4(26493, 32418, 32658) },
+ { AOM_CDF4(16376, 29342, 32090) },
+ { AOM_CDF4(10594, 22649, 28970) },
+ { AOM_CDF4(8176, 17170, 24303) },
+ { AOM_CDF4(5605, 12694, 19139) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(23888, 31902, 32542) },
+ { AOM_CDF4(18612, 29687, 31987) },
+ { AOM_CDF4(16245, 24852, 29249) },
+ { AOM_CDF4(15765, 22608, 27559) },
+ { AOM_CDF4(19895, 24699, 27510) },
+ { AOM_CDF4(28401, 32212, 32457) },
+ { AOM_CDF4(15274, 27825, 30980) },
+ { AOM_CDF4(9364, 18128, 24332) },
+ { AOM_CDF4(2283, 8193, 15082) },
+ { AOM_CDF4(1228, 3972, 7881) },
+ { AOM_CDF4(29455, 32469, 32620) },
+ { AOM_CDF4(17981, 28245, 31388) },
+ { AOM_CDF4(10921, 20098, 26240) },
+ { AOM_CDF4(3743, 11829, 18657) },
+ { AOM_CDF4(2374, 9593, 15715) },
+ { AOM_CDF4(31068, 32466, 32635) },
+ { AOM_CDF4(20321, 29572, 31971) },
+ { AOM_CDF4(10771, 20255, 27119) },
+ { AOM_CDF4(2795, 10410, 17361) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(9320, 22102, 27840) },
+ { AOM_CDF4(27057, 32464, 32724) },
+ { AOM_CDF4(16331, 30268, 32309) },
+ { AOM_CDF4(10319, 23935, 29720) },
+ { AOM_CDF4(6189, 16448, 24106) },
+ { AOM_CDF4(3589, 10884, 18808) },
+ { AOM_CDF4(29026, 32624, 32748) },
+ { AOM_CDF4(19226, 31507, 32587) },
+ { AOM_CDF4(12692, 26921, 31203) },
+ { AOM_CDF4(7049, 19532, 27635) },
+ { AOM_CDF4(7727, 15669, 23252) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(28056, 32625, 32748) },
+ { AOM_CDF4(22383, 32075, 32669) },
+ { AOM_CDF4(15417, 27098, 31749) },
+ { AOM_CDF4(18127, 26493, 27190) },
+ { AOM_CDF4(5461, 16384, 21845) },
+ { AOM_CDF4(27982, 32091, 32584) },
+ { AOM_CDF4(19045, 29868, 31972) },
+ { AOM_CDF4(10397, 22266, 27932) },
+ { AOM_CDF4(5990, 13697, 21500) },
+ { AOM_CDF4(1792, 6912, 15104) },
+ { AOM_CDF4(28198, 32501, 32718) },
+ { AOM_CDF4(21534, 31521, 32569) },
+ { AOM_CDF4(11109, 25217, 30017) },
+ { AOM_CDF4(5671, 15124, 26151) },
+ { AOM_CDF4(4681, 14043, 18725) },
+ { AOM_CDF4(28688, 32580, 32741) },
+ { AOM_CDF4(22576, 32079, 32661) },
+ { AOM_CDF4(10627, 22141, 28340) },
+ { AOM_CDF4(9362, 14043, 28087) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7754, 16948, 22142) },
+ { AOM_CDF4(25670, 32330, 32691) },
+ { AOM_CDF4(15663, 29225, 31994) },
+ { AOM_CDF4(9878, 23288, 29158) },
+ { AOM_CDF4(6419, 17088, 24336) },
+ { AOM_CDF4(3859, 11003, 17039) },
+ { AOM_CDF4(27562, 32595, 32725) },
+ { AOM_CDF4(17575, 30588, 32399) },
+ { AOM_CDF4(10819, 24838, 30309) },
+ { AOM_CDF4(7124, 18686, 25916) },
+ { AOM_CDF4(4479, 12688, 19340) },
+ { AOM_CDF4(28385, 32476, 32673) },
+ { AOM_CDF4(15306, 29005, 31938) },
+ { AOM_CDF4(8937, 21615, 28322) },
+ { AOM_CDF4(5982, 15603, 22786) },
+ { AOM_CDF4(3620, 10267, 16136) },
+ { AOM_CDF4(27280, 32464, 32667) },
+ { AOM_CDF4(15607, 29160, 32004) },
+ { AOM_CDF4(9091, 22135, 28740) },
+ { AOM_CDF4(6232, 16632, 24020) },
+ { AOM_CDF4(4047, 11377, 17672) },
+ { AOM_CDF4(29220, 32630, 32718) },
+ { AOM_CDF4(19650, 31220, 32462) },
+ { AOM_CDF4(13050, 26312, 30827) },
+ { AOM_CDF4(9228, 20870, 27468) },
+ { AOM_CDF4(6146, 15149, 21971) },
+ { AOM_CDF4(30169, 32481, 32623) },
+ { AOM_CDF4(17212, 29311, 31554) },
+ { AOM_CDF4(9911, 21311, 26882) },
+ { AOM_CDF4(4487, 13314, 20372) },
+ { AOM_CDF4(2570, 7772, 12889) },
+ { AOM_CDF4(30924, 32613, 32708) },
+ { AOM_CDF4(19490, 30206, 32107) },
+ { AOM_CDF4(11232, 23998, 29276) },
+ { AOM_CDF4(6769, 17955, 25035) },
+ { AOM_CDF4(4398, 12623, 19214) },
+ { AOM_CDF4(30609, 32627, 32722) },
+ { AOM_CDF4(19370, 30582, 32287) },
+ { AOM_CDF4(10457, 23619, 29409) },
+ { AOM_CDF4(6443, 17637, 24834) },
+ { AOM_CDF4(4645, 13236, 20106) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8626, 20271, 26216) },
+ { AOM_CDF4(26707, 32406, 32711) },
+ { AOM_CDF4(16999, 30329, 32286) },
+ { AOM_CDF4(11445, 25123, 30286) },
+ { AOM_CDF4(6411, 18828, 25601) },
+ { AOM_CDF4(6801, 12458, 20248) },
+ { AOM_CDF4(29918, 32682, 32748) },
+ { AOM_CDF4(20649, 31739, 32618) },
+ { AOM_CDF4(12879, 27773, 31581) },
+ { AOM_CDF4(7896, 21751, 28244) },
+ { AOM_CDF4(5260, 14870, 23698) },
+ { AOM_CDF4(29252, 32593, 32731) },
+ { AOM_CDF4(17072, 30460, 32294) },
+ { AOM_CDF4(10653, 24143, 29365) },
+ { AOM_CDF4(6536, 17490, 23983) },
+ { AOM_CDF4(4929, 13170, 20085) },
+ { AOM_CDF4(28137, 32518, 32715) },
+ { AOM_CDF4(18171, 30784, 32407) },
+ { AOM_CDF4(11437, 25436, 30459) },
+ { AOM_CDF4(7252, 18534, 26176) },
+ { AOM_CDF4(4126, 13353, 20978) },
+ { AOM_CDF4(31162, 32726, 32748) },
+ { AOM_CDF4(23017, 32222, 32701) },
+ { AOM_CDF4(15629, 29233, 32046) },
+ { AOM_CDF4(9387, 22621, 29480) },
+ { AOM_CDF4(6922, 17616, 25010) },
+ { AOM_CDF4(28838, 32265, 32614) },
+ { AOM_CDF4(19701, 30206, 31920) },
+ { AOM_CDF4(11214, 22410, 27933) },
+ { AOM_CDF4(5320, 14177, 23034) },
+ { AOM_CDF4(5049, 12881, 17827) },
+ { AOM_CDF4(27484, 32471, 32734) },
+ { AOM_CDF4(21076, 31526, 32561) },
+ { AOM_CDF4(12707, 26303, 31211) },
+ { AOM_CDF4(8169, 21722, 28219) },
+ { AOM_CDF4(6045, 19406, 27042) },
+ { AOM_CDF4(27753, 32572, 32745) },
+ { AOM_CDF4(20832, 31878, 32653) },
+ { AOM_CDF4(13250, 27356, 31674) },
+ { AOM_CDF4(7718, 21508, 29858) },
+ { AOM_CDF4(7209, 18350, 25559) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7876, 16901, 21741) },
+ { AOM_CDF4(24001, 31898, 32625) },
+ { AOM_CDF4(14529, 27959, 31451) },
+ { AOM_CDF4(8273, 20818, 27258) },
+ { AOM_CDF4(5278, 14673, 21510) },
+ { AOM_CDF4(2983, 8843, 14039) },
+ { AOM_CDF4(28016, 32574, 32732) },
+ { AOM_CDF4(17471, 30306, 32301) },
+ { AOM_CDF4(10224, 24063, 29728) },
+ { AOM_CDF4(6602, 17954, 25052) },
+ { AOM_CDF4(4002, 11585, 17759) },
+ { AOM_CDF4(30190, 32634, 32739) },
+ { AOM_CDF4(17497, 30282, 32270) },
+ { AOM_CDF4(10229, 23729, 29538) },
+ { AOM_CDF4(6344, 17211, 24440) },
+ { AOM_CDF4(3849, 11189, 17108) },
+ { AOM_CDF4(28570, 32583, 32726) },
+ { AOM_CDF4(17521, 30161, 32238) },
+ { AOM_CDF4(10153, 23565, 29378) },
+ { AOM_CDF4(6455, 17341, 24443) },
+ { AOM_CDF4(3907, 11042, 17024) },
+ { AOM_CDF4(30689, 32715, 32748) },
+ { AOM_CDF4(21546, 31840, 32610) },
+ { AOM_CDF4(13547, 27581, 31459) },
+ { AOM_CDF4(8912, 21757, 28309) },
+ { AOM_CDF4(5548, 15080, 22046) },
+ { AOM_CDF4(30783, 32540, 32685) },
+ { AOM_CDF4(17540, 29528, 31668) },
+ { AOM_CDF4(10160, 21468, 26783) },
+ { AOM_CDF4(4724, 13393, 20054) },
+ { AOM_CDF4(2702, 8174, 13102) },
+ { AOM_CDF4(31648, 32686, 32742) },
+ { AOM_CDF4(20954, 31094, 32337) },
+ { AOM_CDF4(12420, 25698, 30179) },
+ { AOM_CDF4(7304, 19320, 26248) },
+ { AOM_CDF4(4366, 12261, 18864) },
+ { AOM_CDF4(31581, 32723, 32748) },
+ { AOM_CDF4(21373, 31586, 32525) },
+ { AOM_CDF4(12744, 26625, 30885) },
+ { AOM_CDF4(7431, 20322, 26950) },
+ { AOM_CDF4(4692, 13323, 20111) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(7833, 18369, 24095) },
+ { AOM_CDF4(26650, 32273, 32702) },
+ { AOM_CDF4(16371, 29961, 32191) },
+ { AOM_CDF4(11055, 24082, 29629) },
+ { AOM_CDF4(6892, 18644, 25400) },
+ { AOM_CDF4(5006, 13057, 19240) },
+ { AOM_CDF4(29834, 32666, 32748) },
+ { AOM_CDF4(19577, 31335, 32570) },
+ { AOM_CDF4(12253, 26509, 31122) },
+ { AOM_CDF4(7991, 20772, 27711) },
+ { AOM_CDF4(5677, 15910, 23059) },
+ { AOM_CDF4(30109, 32532, 32720) },
+ { AOM_CDF4(16747, 30166, 32252) },
+ { AOM_CDF4(10134, 23542, 29184) },
+ { AOM_CDF4(5791, 16176, 23556) },
+ { AOM_CDF4(4362, 10414, 17284) },
+ { AOM_CDF4(29492, 32626, 32748) },
+ { AOM_CDF4(19894, 31402, 32525) },
+ { AOM_CDF4(12942, 27071, 30869) },
+ { AOM_CDF4(8346, 21216, 27405) },
+ { AOM_CDF4(6572, 17087, 23859) },
+ { AOM_CDF4(32035, 32735, 32748) },
+ { AOM_CDF4(22957, 31838, 32618) },
+ { AOM_CDF4(14724, 28572, 31772) },
+ { AOM_CDF4(10364, 23999, 29553) },
+ { AOM_CDF4(7004, 18433, 25655) },
+ { AOM_CDF4(27528, 32277, 32681) },
+ { AOM_CDF4(16959, 31171, 32096) },
+ { AOM_CDF4(10486, 23593, 27962) },
+ { AOM_CDF4(8192, 16384, 23211) },
+ { AOM_CDF4(8937, 17873, 20852) },
+ { AOM_CDF4(27715, 32002, 32615) },
+ { AOM_CDF4(15073, 29491, 31676) },
+ { AOM_CDF4(11264, 24576, 28672) },
+ { AOM_CDF4(2341, 18725, 23406) },
+ { AOM_CDF4(7282, 18204, 25486) },
+ { AOM_CDF4(28547, 32213, 32657) },
+ { AOM_CDF4(20788, 29773, 32239) },
+ { AOM_CDF4(6780, 21469, 30508) },
+ { AOM_CDF4(5958, 14895, 23831) },
+ { AOM_CDF4(16384, 21845, 27307) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5992, 14304, 19765) },
+ { AOM_CDF4(22612, 31238, 32456) },
+ { AOM_CDF4(13456, 27162, 31087) },
+ { AOM_CDF4(8001, 20062, 26504) },
+ { AOM_CDF4(5168, 14105, 20764) },
+ { AOM_CDF4(2632, 7771, 12385) },
+ { AOM_CDF4(27034, 32344, 32709) },
+ { AOM_CDF4(15850, 29415, 31997) },
+ { AOM_CDF4(9494, 22776, 28841) },
+ { AOM_CDF4(6151, 16830, 23969) },
+ { AOM_CDF4(3461, 10039, 15722) },
+ { AOM_CDF4(30134, 32569, 32731) },
+ { AOM_CDF4(15638, 29422, 31945) },
+ { AOM_CDF4(9150, 21865, 28218) },
+ { AOM_CDF4(5647, 15719, 22676) },
+ { AOM_CDF4(3402, 9772, 15477) },
+ { AOM_CDF4(28530, 32586, 32735) },
+ { AOM_CDF4(17139, 30298, 32292) },
+ { AOM_CDF4(10200, 24039, 29685) },
+ { AOM_CDF4(6419, 17674, 24786) },
+ { AOM_CDF4(3544, 10225, 15824) },
+ { AOM_CDF4(31333, 32726, 32748) },
+ { AOM_CDF4(20618, 31487, 32544) },
+ { AOM_CDF4(12901, 27217, 31232) },
+ { AOM_CDF4(8624, 21734, 28171) },
+ { AOM_CDF4(5104, 14191, 20748) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(11206, 21090, 26561) },
+ { AOM_CDF4(28759, 32279, 32671) },
+ { AOM_CDF4(14171, 27952, 31569) },
+ { AOM_CDF4(9743, 22907, 29141) },
+ { AOM_CDF4(6871, 17886, 24868) },
+ { AOM_CDF4(4960, 13152, 19315) },
+ { AOM_CDF4(31077, 32661, 32748) },
+ { AOM_CDF4(19400, 31195, 32515) },
+ { AOM_CDF4(12752, 26858, 31040) },
+ { AOM_CDF4(8370, 22098, 28591) },
+ { AOM_CDF4(5457, 15373, 22298) },
+ { AOM_CDF4(31697, 32706, 32748) },
+ { AOM_CDF4(17860, 30657, 32333) },
+ { AOM_CDF4(12510, 24812, 29261) },
+ { AOM_CDF4(6180, 19124, 24722) },
+ { AOM_CDF4(5041, 13548, 17959) },
+ { AOM_CDF4(31552, 32716, 32748) },
+ { AOM_CDF4(21908, 31769, 32623) },
+ { AOM_CDF4(14470, 28201, 31565) },
+ { AOM_CDF4(9493, 22982, 28608) },
+ { AOM_CDF4(6858, 17240, 24137) },
+ { AOM_CDF4(32543, 32752, 32756) },
+ { AOM_CDF4(24286, 32097, 32666) },
+ { AOM_CDF4(15958, 29217, 32024) },
+ { AOM_CDF4(10207, 24234, 29958) },
+ { AOM_CDF4(6929, 18305, 25652) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4137, 10847, 15682) },
+ { AOM_CDF4(17824, 27001, 30058) },
+ { AOM_CDF4(10204, 22796, 28291) },
+ { AOM_CDF4(6076, 15935, 22125) },
+ { AOM_CDF4(3852, 10937, 16816) },
+ { AOM_CDF4(2252, 6324, 10131) },
+ { AOM_CDF4(25840, 32016, 32662) },
+ { AOM_CDF4(15109, 28268, 31531) },
+ { AOM_CDF4(9385, 22231, 28340) },
+ { AOM_CDF4(6082, 16672, 23479) },
+ { AOM_CDF4(3318, 9427, 14681) },
+ { AOM_CDF4(30594, 32574, 32718) },
+ { AOM_CDF4(16836, 29552, 31859) },
+ { AOM_CDF4(9556, 22542, 28356) },
+ { AOM_CDF4(6305, 16725, 23540) },
+ { AOM_CDF4(3376, 9895, 15184) },
+ { AOM_CDF4(29383, 32617, 32745) },
+ { AOM_CDF4(18891, 30809, 32401) },
+ { AOM_CDF4(11688, 25942, 30687) },
+ { AOM_CDF4(7468, 19469, 26651) },
+ { AOM_CDF4(3909, 11358, 17012) },
+ { AOM_CDF4(31564, 32736, 32748) },
+ { AOM_CDF4(20906, 31611, 32600) },
+ { AOM_CDF4(13191, 27621, 31537) },
+ { AOM_CDF4(8768, 22029, 28676) },
+ { AOM_CDF4(5079, 14109, 20906) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } } };
+
+static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE(
+ NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) },
+ { AOM_CDF3(29600, 31446) },
+ { AOM_CDF3(30844, 31878) },
+ { AOM_CDF3(24926, 28948) } },
+ { { AOM_CDF3(21365, 30026) },
+ { AOM_CDF3(30512, 32423) },
+ { AOM_CDF3(31658, 32621) },
+ { AOM_CDF3(29630, 31881) } } },
+ { { { AOM_CDF3(5717, 26477) },
+ { AOM_CDF3(30491, 31703) },
+ { AOM_CDF3(31550, 32158) },
+ { AOM_CDF3(29648, 31491) } },
+ { { AOM_CDF3(12608, 27820) },
+ { AOM_CDF3(30680, 32225) },
+ { AOM_CDF3(30809, 32335) },
+ { AOM_CDF3(31299, 32423) } } },
+ { { { AOM_CDF3(1786, 12612) },
+ { AOM_CDF3(30663, 31625) },
+ { AOM_CDF3(32339, 32468) },
+ { AOM_CDF3(31148, 31833) } },
+ { { AOM_CDF3(18857, 23865) },
+ { AOM_CDF3(31428, 32428) },
+ { AOM_CDF3(31744, 32373) },
+ { AOM_CDF3(31775, 32526) } } },
+ { { { AOM_CDF3(1787, 2532) },
+ { AOM_CDF3(30832, 31662) },
+ { AOM_CDF3(31824, 32682) },
+ { AOM_CDF3(32133, 32569) } },
+ { { AOM_CDF3(13751, 22235) },
+ { AOM_CDF3(32089, 32409) },
+ { AOM_CDF3(27084, 27920) },
+ { AOM_CDF3(29291, 32594) } } },
+ { { { AOM_CDF3(1725, 3449) },
+ { AOM_CDF3(31102, 31935) },
+ { AOM_CDF3(32457, 32613) },
+ { AOM_CDF3(32412, 32649) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(17560, 29888) },
+ { AOM_CDF3(29671, 31549) },
+ { AOM_CDF3(31007, 32056) },
+ { AOM_CDF3(27286, 30006) } },
+ { { AOM_CDF3(26594, 31212) },
+ { AOM_CDF3(31208, 32582) },
+ { AOM_CDF3(31835, 32637) },
+ { AOM_CDF3(30595, 32206) } } },
+ { { { AOM_CDF3(15239, 29932) },
+ { AOM_CDF3(31315, 32095) },
+ { AOM_CDF3(32130, 32434) },
+ { AOM_CDF3(30864, 31996) } },
+ { { AOM_CDF3(26279, 30968) },
+ { AOM_CDF3(31142, 32495) },
+ { AOM_CDF3(31713, 32540) },
+ { AOM_CDF3(31929, 32594) } } },
+ { { { AOM_CDF3(2644, 25198) },
+ { AOM_CDF3(32038, 32451) },
+ { AOM_CDF3(32639, 32695) },
+ { AOM_CDF3(32166, 32518) } },
+ { { AOM_CDF3(17187, 27668) },
+ { AOM_CDF3(31714, 32550) },
+ { AOM_CDF3(32283, 32678) },
+ { AOM_CDF3(31930, 32563) } } },
+ { { { AOM_CDF3(1044, 2257) },
+ { AOM_CDF3(30755, 31923) },
+ { AOM_CDF3(32208, 32693) },
+ { AOM_CDF3(32244, 32615) } },
+ { { AOM_CDF3(21317, 26207) },
+ { AOM_CDF3(29133, 30868) },
+ { AOM_CDF3(29311, 31231) },
+ { AOM_CDF3(29657, 31087) } } },
+ { { { AOM_CDF3(478, 1834) },
+ { AOM_CDF3(31005, 31987) },
+ { AOM_CDF3(32317, 32724) },
+ { AOM_CDF3(30865, 32648) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(20092, 30774) },
+ { AOM_CDF3(30695, 32020) },
+ { AOM_CDF3(31131, 32103) },
+ { AOM_CDF3(28666, 30870) } },
+ { { AOM_CDF3(27258, 31095) },
+ { AOM_CDF3(31804, 32623) },
+ { AOM_CDF3(31763, 32528) },
+ { AOM_CDF3(31438, 32506) } } },
+ { { { AOM_CDF3(18049, 30489) },
+ { AOM_CDF3(31706, 32286) },
+ { AOM_CDF3(32163, 32473) },
+ { AOM_CDF3(31550, 32184) } },
+ { { AOM_CDF3(27116, 30842) },
+ { AOM_CDF3(31971, 32598) },
+ { AOM_CDF3(32088, 32576) },
+ { AOM_CDF3(32067, 32664) } } },
+ { { { AOM_CDF3(12854, 29093) },
+ { AOM_CDF3(32272, 32558) },
+ { AOM_CDF3(32667, 32729) },
+ { AOM_CDF3(32306, 32585) } },
+ { { AOM_CDF3(25476, 30366) },
+ { AOM_CDF3(32169, 32687) },
+ { AOM_CDF3(32479, 32689) },
+ { AOM_CDF3(31673, 32634) } } },
+ { { { AOM_CDF3(2809, 19301) },
+ { AOM_CDF3(32205, 32622) },
+ { AOM_CDF3(32338, 32730) },
+ { AOM_CDF3(31786, 32616) } },
+ { { AOM_CDF3(22737, 29105) },
+ { AOM_CDF3(30810, 32362) },
+ { AOM_CDF3(30014, 32627) },
+ { AOM_CDF3(30528, 32574) } } },
+ { { { AOM_CDF3(935, 3382) },
+ { AOM_CDF3(30789, 31909) },
+ { AOM_CDF3(32466, 32756) },
+ { AOM_CDF3(30860, 32513) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(22497, 31198) },
+ { AOM_CDF3(31715, 32495) },
+ { AOM_CDF3(31606, 32337) },
+ { AOM_CDF3(30388, 31990) } },
+ { { AOM_CDF3(27877, 31584) },
+ { AOM_CDF3(32170, 32728) },
+ { AOM_CDF3(32155, 32688) },
+ { AOM_CDF3(32219, 32702) } } },
+ { { { AOM_CDF3(21457, 31043) },
+ { AOM_CDF3(31951, 32483) },
+ { AOM_CDF3(32153, 32562) },
+ { AOM_CDF3(31473, 32215) } },
+ { { AOM_CDF3(27558, 31151) },
+ { AOM_CDF3(32020, 32640) },
+ { AOM_CDF3(32097, 32575) },
+ { AOM_CDF3(32242, 32719) } } },
+ { { { AOM_CDF3(19980, 30591) },
+ { AOM_CDF3(32219, 32597) },
+ { AOM_CDF3(32581, 32706) },
+ { AOM_CDF3(31803, 32287) } },
+ { { AOM_CDF3(26473, 30507) },
+ { AOM_CDF3(32431, 32723) },
+ { AOM_CDF3(32196, 32611) },
+ { AOM_CDF3(31588, 32528) } } },
+ { { { AOM_CDF3(24647, 30463) },
+ { AOM_CDF3(32412, 32695) },
+ { AOM_CDF3(32468, 32720) },
+ { AOM_CDF3(31269, 32523) } },
+ { { AOM_CDF3(28482, 31505) },
+ { AOM_CDF3(32152, 32701) },
+ { AOM_CDF3(31732, 32598) },
+ { AOM_CDF3(31767, 32712) } } },
+ { { { AOM_CDF3(12358, 24977) },
+ { AOM_CDF3(31331, 32385) },
+ { AOM_CDF3(32634, 32756) },
+ { AOM_CDF3(30411, 32548) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } } };
+
+#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_
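The CDF tables above are easier to read once the AOM_CDF3/AOM_CDF4 arguments are interpreted as cumulative frequencies out of 32768 (1 << 15): each symbol's probability is the difference between consecutive entries, and the recurring { AOM_CDF4(8192, 16384, 24576) } rows are simply the uniform placeholder distribution. A minimal standalone sketch of that reading (it assumes only this 32768 scaling; the stored tables additionally apply libaom's AOM_ICDF inversion and carry an adaptation counter, which the sketch ignores):

#include <stdio.h>

#define CDF_PROB_TOP 32768 /* cumulative frequencies are scaled to 1 << 15 */

/* Print the per-symbol probabilities implied by one AOM_CDF4(a0, a1, a2) row. */
static void print_cdf4_probs(int a0, int a1, int a2) {
  const int cum[4] = { a0, a1, a2, CDF_PROB_TOP };
  int prev = 0;
  for (int s = 0; s < 4; ++s) {
    printf("P(symbol %d) = %5d / 32768 = %.4f\n", s, cum[s] - prev,
           (double)(cum[s] - prev) / CDF_PROB_TOP);
    prev = cum[s];
  }
}

int main(void) {
  print_cdf4_probs(11772, 25120, 30041); /* a trained row from the tables above */
  print_cdf4_probs(8192, 16384, 24576);  /* the uniform placeholder row */
  return 0;
}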
diff --git a/third_party/aom/av1/common/txb_common.c b/third_party/aom/av1/common/txb_common.c
new file mode 100644
index 000000000..c96d37cca
--- /dev/null
+++ b/third_party/aom/av1/common/txb_common.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_integer.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 };
+
+const int8_t av1_coeff_band_8x8[64] = {
+ 0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 2, 2, 3, 3, 4, 4,
+ 7, 7, 8, 8, 9, 9, 10, 10, 7, 7, 8, 8, 9, 9, 10, 10,
+ 11, 11, 12, 12, 13, 13, 14, 14, 11, 11, 12, 12, 13, 13, 14, 14,
+ 15, 15, 16, 16, 17, 17, 18, 18, 15, 15, 16, 16, 17, 17, 18, 18,
+};
+
+const int8_t av1_coeff_band_16x16[256] = {
+ 0, 1, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 2, 3, 4,
+ 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7,
+ 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, 7, 7, 8,
+ 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12,
+ 13, 13, 13, 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13,
+ 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 10, 10,
+ 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15,
+ 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15,
+ 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
+ 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17,
+ 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18,
+ 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18,
+ 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18, 19, 19, 19,
+ 19, 20, 20, 20, 20, 21, 21, 21, 21,
+};
+
+const int8_t av1_coeff_band_32x32[1024] = {
+ 0, 1, 4, 4, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+ 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 2, 3, 4, 4, 7, 7,
+ 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+ 12, 12, 12, 12, 12, 12, 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10,
+ 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+ 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11,
+ 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, 9,
+ 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10,
+ 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11,
+ 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8,
+ 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
+ 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14,
+ 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16,
+ 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+ 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13,
+ 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14,
+ 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14,
+ 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13,
+ 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15,
+ 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13,
+ 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16,
+ 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14,
+ 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 17,
+ 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+ 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17,
+ 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20,
+ 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18,
+ 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19,
+ 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17,
+ 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20,
+ 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18,
+ 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20,
+ 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
+ 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17,
+ 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24,
+ 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21,
+ 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23,
+ 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24,
+ 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21,
+ 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24,
+ 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22,
+ 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
+};
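The av1_coeff_band_* tables map each position of a transform block to a coefficient band. Reprinting av1_coeff_band_8x8 as an 8-by-8 grid (a sketch that assumes only the natural raster layout implied by the table's name and size, with index = row * 8 + col) makes the grouping visible: the four top-left positions get individual bands and the remaining positions are banded in 2x2 groups.

#include <stdint.h>
#include <stdio.h>

/* Copied verbatim from av1_coeff_band_8x8 above. */
static const int8_t coeff_band_8x8[64] = {
  0,  1,  2,  2,  3,  3,  4,  4,  5,  6,  2,  2,  3,  3,  4,  4,
  7,  7,  8,  8,  9,  9,  10, 10, 7,  7,  8,  8,  9,  9,  10, 10,
  11, 11, 12, 12, 13, 13, 14, 14, 11, 11, 12, 12, 13, 13, 14, 14,
  15, 15, 16, 16, 17, 17, 18, 18, 15, 15, 16, 16, 17, 17, 18, 18,
};

int main(void) {
  for (int row = 0; row < 8; ++row) {
    for (int col = 0; col < 8; ++col) printf("%3d", coeff_band_8x8[row * 8 + col]);
    printf("\n");
  }
  return 0;
}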
+
+// The ctx offset table when TX is TX_CLASS_2D.
+// TX col and row indices are clamped to 4
+
+const int8_t av1_nz_map_ctx_offset_4x4[16] = {
+ 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x8[64] = {
+ 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21,
+ 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x16[256] = {
+ 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x32[1024] = {
+ 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x4[32] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21,
+ 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x16[128] = {
+ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21,
+ 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x8[128] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x32[512] = {
+ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x16[512] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x64[1024] = {
+ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_64x32[1024] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16,
+ 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_4x16[64] = {
+ 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x4[64] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x32[256] = {
+ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21,
+ 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x8[256] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t *av1_nz_map_ctx_offset[19] = {
+ av1_nz_map_ctx_offset_4x4, // TX_4x4
+ av1_nz_map_ctx_offset_8x8, // TX_8x8
+ av1_nz_map_ctx_offset_16x16, // TX_16x16
+ av1_nz_map_ctx_offset_32x32, // TX_32x32
+  av1_nz_map_ctx_offset_32x32,  // TX_64x64
+ av1_nz_map_ctx_offset_4x16, // TX_4x8
+ av1_nz_map_ctx_offset_8x4, // TX_8x4
+ av1_nz_map_ctx_offset_8x32, // TX_8x16
+ av1_nz_map_ctx_offset_16x8, // TX_16x8
+ av1_nz_map_ctx_offset_16x32, // TX_16x32
+ av1_nz_map_ctx_offset_32x16, // TX_32x16
+ av1_nz_map_ctx_offset_32x64, // TX_32x64
+ av1_nz_map_ctx_offset_64x32, // TX_64x32
+ av1_nz_map_ctx_offset_4x16, // TX_4x16
+ av1_nz_map_ctx_offset_16x4, // TX_16x4
+ av1_nz_map_ctx_offset_8x32, // TX_8x32
+ av1_nz_map_ctx_offset_32x8, // TX_32x8
+ av1_nz_map_ctx_offset_16x32, // TX_16x64
+ av1_nz_map_ctx_offset_64x32, // TX_64x16
+};
+
+void av1_init_lv_map(AV1_COMMON *cm) {
+ LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table;
+ for (int row = 0; row < 2; ++row) {
+ for (int col = 0; col < 2; ++col) {
+ for (int sig_mag = 0; sig_mag < 3; ++sig_mag) {
+ for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) {
+ if (row == 0 && col == 0 && count > 5) continue;
+ if ((row == 0 || col == 0) && count > 8) continue;
+
+ coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] =
+ get_base_ctx_from_count_mag(row, col, count, sig_mag);
+ }
+ }
+ }
+ }
+}
+
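+// k_eob_group_start[i] is the first eob value in group i and
+// k_eob_offset_bits[i] the number of extra offset bits for that group. For
+// example, an eob of 20 falls in the group starting at 17 (4 offset bits,
+// covering eobs 17..32) and is signaled as that group plus an offset of 3.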
+const int16_t k_eob_group_start[12] = { 0, 1, 2, 3, 5, 9,
+ 17, 33, 65, 129, 257, 513 };
+const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
new file mode 100644
index 000000000..1dda51f8b
--- /dev/null
+++ b/third_party/aom/av1/common/txb_common.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TXB_COMMON_H_
+#define AOM_AV1_COMMON_TXB_COMMON_H_
+
+extern const int16_t k_eob_group_start[12];
+extern const int16_t k_eob_offset_bits[12];
+
+extern const int8_t av1_coeff_band_4x4[16];
+
+extern const int8_t av1_coeff_band_8x8[64];
+
+extern const int8_t av1_coeff_band_16x16[256];
+
+extern const int8_t av1_coeff_band_32x32[1024];
+
+extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL];
+
+typedef struct txb_ctx {
+ int txb_skip_ctx;
+ int dc_sign_ctx;
+} TXB_CTX;
+
+static const int base_level_count_to_index[13] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
+};
+
+static const TX_CLASS tx_type_to_class[TX_TYPES] = {
+ TX_CLASS_2D, // DCT_DCT
+ TX_CLASS_2D, // ADST_DCT
+ TX_CLASS_2D, // DCT_ADST
+ TX_CLASS_2D, // ADST_ADST
+ TX_CLASS_2D, // FLIPADST_DCT
+ TX_CLASS_2D, // DCT_FLIPADST
+ TX_CLASS_2D, // FLIPADST_FLIPADST
+ TX_CLASS_2D, // ADST_FLIPADST
+ TX_CLASS_2D, // FLIPADST_ADST
+ TX_CLASS_2D, // IDTX
+ TX_CLASS_VERT, // V_DCT
+ TX_CLASS_HORIZ, // H_DCT
+ TX_CLASS_VERT, // V_ADST
+ TX_CLASS_HORIZ, // H_ADST
+ TX_CLASS_VERT, // V_FLIPADST
+ TX_CLASS_HORIZ, // H_FLIPADST
+};
+
+static INLINE int get_txb_bwl(TX_SIZE tx_size) {
+ tx_size = av1_get_adjusted_tx_size(tx_size);
+ return tx_size_wide_log2[tx_size];
+}
+
+static INLINE int get_txb_wide(TX_SIZE tx_size) {
+ tx_size = av1_get_adjusted_tx_size(tx_size);
+ return tx_size_wide[tx_size];
+}
+
+static INLINE int get_txb_high(TX_SIZE tx_size) {
+ tx_size = av1_get_adjusted_tx_size(tx_size);
+ return tx_size_high[tx_size];
+}
+
+static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) {
+ return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR);
+}
+
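+// Each row of the levels buffer returned by set_levels() is
+// (width + TX_PAD_HOR) entries wide, so a raster index within the
+// width x height block maps to idx + row * TX_PAD_HOR in the padded
+// buffer, with row = idx >> bwl.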
+static INLINE int get_padded_idx(const int idx, const int bwl) {
+ return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2);
+}
+
+static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
+ int sig_mag) {
+ const int ctx = base_level_count_to_index[count];
+ int ctx_idx = -1;
+
+ if (row == 0 && col == 0) {
+ if (sig_mag >= 2) return ctx_idx = 0;
+ if (sig_mag == 1) {
+ if (count >= 2)
+ ctx_idx = 1;
+ else
+ ctx_idx = 2;
+
+ return ctx_idx;
+ }
+
+ ctx_idx = 3 + ctx;
+ assert(ctx_idx <= 6);
+ return ctx_idx;
+ } else if (row == 0) {
+ if (sig_mag >= 2) return ctx_idx = 6;
+ if (sig_mag == 1) {
+ if (count >= 2)
+ ctx_idx = 7;
+ else
+ ctx_idx = 8;
+ return ctx_idx;
+ }
+
+ ctx_idx = 9 + ctx;
+ assert(ctx_idx <= 11);
+ return ctx_idx;
+ } else if (col == 0) {
+ if (sig_mag >= 2) return ctx_idx = 12;
+ if (sig_mag == 1) {
+ if (count >= 2)
+ ctx_idx = 13;
+ else
+ ctx_idx = 14;
+
+ return ctx_idx;
+ }
+
+ ctx_idx = 15 + ctx;
+ assert(ctx_idx <= 17);
+ // TODO(angiebird): turn this on once the optimization is finalized
+ // assert(ctx_idx < 28);
+ } else {
+ if (sig_mag >= 2) return ctx_idx = 18;
+ if (sig_mag == 1) {
+ if (count >= 2)
+ ctx_idx = 19;
+ else
+ ctx_idx = 20;
+ return ctx_idx;
+ }
+
+ ctx_idx = 21 + ctx;
+
+ assert(ctx_idx <= 24);
+ }
+ return ctx_idx;
+}
+
+static INLINE int get_br_ctx_2d(const uint8_t *const levels,
+ const int c, // raster order
+ const int bwl) {
+ assert(c > 0);
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ const int stride = (1 << bwl) + TX_PAD_HOR;
+ const int pos = row * stride + col;
+ int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) +
+ AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) +
+ AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE);
+ mag = AOMMIN((mag + 1) >> 1, 6);
+ //((row | col) < 2) is equivalent to ((row < 2) && (col < 2))
+ if ((row | col) < 2) return mag + 7;
+ return mag + 14;
+}
+
+static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
+ const int c, // raster order
+ const int bwl, const TX_CLASS tx_class) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ const int stride = (1 << bwl) + TX_PAD_HOR;
+ const int pos = row * stride + col;
+ int mag = levels[pos + 1];
+ mag += levels[pos + stride];
+ switch (tx_class) {
+ case TX_CLASS_2D:
+ mag += levels[pos + stride + 1];
+ mag = AOMMIN((mag + 1) >> 1, 6);
+ if (c == 0) return mag;
+ if ((row < 2) && (col < 2)) return mag + 7;
+ break;
+ case TX_CLASS_HORIZ:
+ mag += levels[pos + 2];
+ mag = AOMMIN((mag + 1) >> 1, 6);
+ if (c == 0) return mag;
+ if (col == 0) return mag + 7;
+ break;
+ case TX_CLASS_VERT:
+ mag += levels[pos + (stride << 1)];
+ mag = AOMMIN((mag + 1) >> 1, 6);
+ if (c == 0) return mag;
+ if (row == 0) return mag + 7;
+ break;
+ default: break;
+ }
+
+ return mag + 14;
+}
+
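+// clip_max3[x] == AOMMIN(x, 3) for any 8-bit x; a branch-free clamp used by
+// get_nz_mag() below.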
+static const uint8_t clip_max3[256] = {
+ 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
+static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels,
+ const int bwl, const TX_CLASS tx_class) {
+ int mag;
+
+ // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
+ mag = clip_max3[levels[1]]; // { 0, 1 }
+ mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR]]; // { 1, 0 }
+
+ if (tx_class == TX_CLASS_2D) {
+ mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR + 1]]; // { 1, 1 }
+ mag += clip_max3[levels[2]]; // { 0, 2 }
+ mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 }
+ } else if (tx_class == TX_CLASS_VERT) {
+ mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 }
+ mag += clip_max3[levels[(3 << bwl) + (3 << TX_PAD_HOR_LOG2)]]; // { 3, 0 }
+ mag += clip_max3[levels[(4 << bwl) + (4 << TX_PAD_HOR_LOG2)]]; // { 4, 0 }
+ } else {
+ mag += clip_max3[levels[2]]; // { 0, 2 }
+ mag += clip_max3[levels[3]]; // { 0, 3 }
+ mag += clip_max3[levels[4]]; // { 0, 4 }
+ }
+
+ return mag;
+}
+
+#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D
+#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5)
+#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10)
+
+static const int nz_map_ctx_offset_1d[32] = {
+ NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+};
+
+static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats(
+ const int stats,
+ const int coeff_idx, // raster order
+ const int bwl, const TX_SIZE tx_size, const TX_CLASS tx_class) {
+  // (tx_class | coeff_idx) == 0 iff tx_class == TX_CLASS_2D (0) and
+  // coeff_idx == 0 (the DC coefficient).
+ if ((tx_class | coeff_idx) == 0) return 0;
+ int ctx = (stats + 1) >> 1;
+ ctx = AOMMIN(ctx, 4);
+ switch (tx_class) {
+ case TX_CLASS_2D: {
+ // This is the algorithm to generate av1_nz_map_ctx_offset[][]
+ // const int width = tx_size_wide[tx_size];
+ // const int height = tx_size_high[tx_size];
+ // if (width < height) {
+ // if (row < 2) return 11 + ctx;
+ // } else if (width > height) {
+ // if (col < 2) return 16 + ctx;
+ // }
+ // if (row + col < 2) return ctx + 1;
+ // if (row + col < 4) return 5 + ctx + 1;
+ // return 21 + ctx;
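+      // For example, for a square (width == height) TX the generated offset
+      // is 1 when row + col < 2 (excluding the DC position), 6 when
+      // 2 <= row + col < 4, and 21 for everything farther from the DC
+      // position.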
+ return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
+ }
+ case TX_CLASS_HORIZ: {
+ const int row = coeff_idx >> bwl;
+ const int col = coeff_idx - (row << bwl);
+ return ctx + nz_map_ctx_offset_1d[col];
+ }
+ case TX_CLASS_VERT: {
+ const int row = coeff_idx >> bwl;
+ return ctx + nz_map_ctx_offset_1d[row];
+ }
+ default: break;
+ }
+ return 0;
+}
+
+typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)];
+typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)];
+
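+// Context for the end-of-block position: e.g. a 16x16 TX has
+// height << bwl == 256 coefficients, so the context is 0 for scan_idx 0,
+// 1 for scan_idx 1..32, 2 for 33..64 and 3 for the remaining positions.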
+static INLINE int get_lower_levels_ctx_eob(int bwl, int height, int scan_idx) {
+ if (scan_idx == 0) return 0;
+ if (scan_idx <= (height << bwl) / 8) return 1;
+ if (scan_idx <= (height << bwl) / 4) return 2;
+ return 3;
+}
+
+static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx,
+ int bwl, TX_SIZE tx_size) {
+ assert(coeff_idx > 0);
+ int mag;
+ // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
+ levels = levels + get_padded_idx(coeff_idx, bwl);
+ mag = AOMMIN(levels[1], 3); // { 0, 1 }
+ mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR], 3); // { 1, 0 }
+ mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR + 1], 3); // { 1, 1 }
+ mag += AOMMIN(levels[2], 3); // { 0, 2 }
+ mag += AOMMIN(levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)], 3); // { 2, 0 }
+
+ const int ctx = AOMMIN((mag + 1) >> 1, 4);
+ return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
+}
+static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels,
+ int coeff_idx, int bwl,
+ TX_SIZE tx_size,
+ TX_CLASS tx_class) {
+ const int stats =
+ get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class);
+ return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
+}
+
+static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx,
+ int bwl, int height,
+ const uint8_t *levels,
+ int coeff_idx, TX_SIZE tx_size,
+ TX_CLASS tx_class) {
+ if (is_last) {
+ if (scan_idx == 0) return 0;
+ if (scan_idx <= (height << bwl) >> 3) return 1;
+ if (scan_idx <= (height << bwl) >> 2) return 2;
+ return 3;
+ }
+ return get_lower_levels_ctx(levels, coeff_idx, bwl, tx_size, tx_class);
+}
+
+static INLINE void set_dc_sign(int *cul_level, int dc_val) {
+ if (dc_val < 0)
+ *cul_level |= 1 << COEFF_CONTEXT_BITS;
+ else if (dc_val > 0)
+ *cul_level += 2 << COEFF_CONTEXT_BITS;
+}
+
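+// The two-bit code stored above COEFF_CONTEXT_BITS by set_dc_sign()
+// (0 = zero DC, 1 = negative DC, 2 = positive DC) is read back below through
+// the >> COEFF_CONTEXT_BITS shift and the signs[] table.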
+static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
+ const TX_SIZE tx_size, const int plane,
+ const ENTROPY_CONTEXT *const a,
+ const ENTROPY_CONTEXT *const l,
+ TXB_CTX *const txb_ctx) {
+#define MAX_TX_SIZE_UNIT 16
+ static const int8_t signs[3] = { 0, -1, 1 };
+ static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ };
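+  // Index is the summed per-unit sign code plus 2 * MAX_TX_SIZE_UNIT: a net
+  // negative DC sign selects context 1, a balanced one 0, a net positive 2.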
+ const int txb_w_unit = tx_size_wide_unit[tx_size];
+ const int txb_h_unit = tx_size_high_unit[tx_size];
+ int dc_sign = 0;
+ int k = 0;
+
+ do {
+ const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;
+ assert(sign <= 2);
+ dc_sign += signs[sign];
+ } while (++k < txb_w_unit);
+
+ k = 0;
+ do {
+ const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;
+ assert(sign <= 2);
+ dc_sign += signs[sign];
+ } while (++k < txb_h_unit);
+
+ txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT];
+
+ if (plane == 0) {
+ if (plane_bsize == txsize_to_bsize[tx_size]) {
+ txb_ctx->txb_skip_ctx = 0;
+ } else {
+ // This is the algorithm to generate table skip_contexts[min][max].
+ // if (!max)
+ // txb_skip_ctx = 1;
+ // else if (!min)
+ // txb_skip_ctx = 2 + (max > 3);
+ // else if (max <= 3)
+ // txb_skip_ctx = 4;
+ // else if (min <= 3)
+ // txb_skip_ctx = 5;
+ // else
+ // txb_skip_ctx = 6;
+ static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },
+ { 1, 4, 4, 4, 5 },
+ { 1, 4, 4, 4, 5 },
+ { 1, 4, 4, 4, 5 },
+ { 1, 4, 4, 4, 6 } };
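+      // e.g. min = 0, max = 2 gives skip_contexts[0][2] = 2 (matching
+      // "2 + (max > 3)" above), and min = 2, max = 4 gives 5.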
+ int top = 0;
+ int left = 0;
+
+ k = 0;
+ do {
+ top |= a[k];
+ } while (++k < txb_w_unit);
+ top &= COEFF_CONTEXT_MASK;
+
+ k = 0;
+ do {
+ left |= l[k];
+ } while (++k < txb_h_unit);
+ left &= COEFF_CONTEXT_MASK;
+ const int max = AOMMIN(top | left, 4);
+ const int min = AOMMIN(AOMMIN(top, left), 4);
+
+ txb_ctx->txb_skip_ctx = skip_contexts[min][max];
+ }
+ } else {
+ const int ctx_base = get_entropy_context(tx_size, a, l);
+ const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >
+ num_pels_log2_lookup[txsize_to_bsize[tx_size]])
+ ? 10
+ : 7;
+ txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
+ }
+#undef MAX_TX_SIZE_UNIT
+}
+
+void av1_init_lv_map(AV1_COMMON *cm);
+
+#endif // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
new file mode 100644
index 000000000..4144c4389
--- /dev/null
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -0,0 +1,1148 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+#define WARP_ERROR_BLOCK 32
+
+/* clang-format off */
+static const int error_measure_lut[512] = {
+ // pow 0.7
+ 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+ 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+ 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+ 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+ 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+ 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+ 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+ 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+ 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+ 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+ 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+ 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+ 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+ 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+ 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+ 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+ 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
+ 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
+ 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
+ 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
+ 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
+ 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
+ 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
+ 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
+ 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
+ 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
+ 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
+ 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
+ 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
+ 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
+ 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
+ 1323, 1187, 1045, 894, 731, 550, 339, 0,
+ 339, 550, 731, 894, 1045, 1187, 1323, 1452,
+ 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
+ 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
+ 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
+ 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
+ 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
+ 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
+ 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
+ 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
+ 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
+ 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
+ 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
+ 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
+ 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
+ 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
+ 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
+ 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
+ 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
+ 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
+ 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
+ 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
+ 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
+ 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
+ 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
+ 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
+ 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
+ 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
+ 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
+ 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
+ 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
+ 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
+ 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
+};
+/* clang-format on */
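+// The LUT above approximates 16384 * (|err| / 255)^0.7 around the zero-error
+// entry at index 255, i.e. a sub-quadratic (robust) error weight; see
+// error_measure() and highbd_error_measure() below.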
+
+// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
+// at a time. The zoom/rotation/shear in the model are applied to the
+// "fractional" position of each pixel, which therefore varies within
+// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
+// We need an extra 2 taps to fit this in, for a total of 8 taps.
+/* clang-format off */
+const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
+ { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
+ { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
+ { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
+ { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
+ { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
+ { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
+ { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
+ { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
+ { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
+ { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
+ { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
+ { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
+ { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
+ { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
+ { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
+ { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
+ { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
+ { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
+ { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
+ { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
+ { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
+ { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
+ { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
+ { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
+ { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
+ { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
+ { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
+ { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
+ { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
+ { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
+ { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },
+
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
+ { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
+ { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
+ { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
+ { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},
+
+ // [1, 2)
+ { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
+ { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
+ { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
+ { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
+ { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
+ { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
+ { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
+ { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
+ { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
+ { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
+ { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
+ { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
+ { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
+ { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
+ { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
+ { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
+ { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
+ { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
+ { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
+ { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
+ { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
+ { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
+ { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
+ { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
+ { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
+ { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
+ { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
+ { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
+ { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
+ { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
+ { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
+ { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
+ // dummy (replicate row index 191)
+ { 0, 0, 0, 0, 2, 127, - 1, 0 },
+
+#elif WARPEDPIXEL_PREC_BITS == 5
+ // [-1, 0)
+ {0, 0, 127, 1, 0, 0, 0, 0}, {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0}, {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0}, {2, -11, 120, 22, -7, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0}, {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0}, {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0}, {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0}, {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0}, {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0}, {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0}, {4, -16, 63, 91, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0}, {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0}, {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0}, {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0}, {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0}, {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0}, {0, -1, 4, 127, -3, 1, 0, 0},
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0},
+ { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 2, -5, 13, 125, -8, 3, -1},
+ { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 4, 127, -3, 1, 0},
+ // [1, 2)
+ {0, 0, 0, 1, 127, 0, 0, 0}, {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -5, 126, 8, -3, 1}, {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1}, {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2}, {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 111, 37, -11, 3}, {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3}, {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 96, 58, -15, 3}, {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4}, {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4}, {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4}, {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4}, {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4}, {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3}, {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3}, {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2}, {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1},
+ // dummy (replicate row index 95)
+ {0, 0, 0, -1, 4, 127, -3, 1},
+
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+
+/* clang-format on */
+
+#define DIV_LUT_PREC_BITS 14
+#define DIV_LUT_BITS 8
+#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
+
+static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192,
+};
+
+// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
+// at precision of DIV_LUT_PREC_BITS along with the shift.
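+// Worked example: D = 5 gives shift = 2 and e = 1, so f = 1 << (DIV_LUT_BITS
+// - 2) = 64, and the function returns div_lut[64] = 13107 with shift = 16;
+// indeed 13107 / 2^16 is approximately 0.2 = 1/5.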
+static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
+ int64_t f;
+ *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
+ : get_msb((unsigned int)D));
+ // e is obtained from D after resetting the most significant 1 bit.
+ const int64_t e = D - ((uint64_t)1 << *shift);
+ // Get the most significant DIV_LUT_BITS (8) bits of e into f
+ if (*shift > DIV_LUT_BITS)
+ f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
+ else
+ f = e << (DIV_LUT_BITS - *shift);
+ assert(f <= DIV_LUT_NUM);
+ *shift += DIV_LUT_PREC_BITS;
+ // Use f as lookup into the precomputed table of multipliers
+ return div_lut[f];
+}
+
+static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
+ int32_t f;
+ *shift = get_msb(D);
+ // e is obtained from D after resetting the most significant 1 bit.
+ const int32_t e = D - ((uint32_t)1 << *shift);
+ // Get the most significant DIV_LUT_BITS (8) bits of e into f
+ if (*shift > DIV_LUT_BITS)
+ f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
+ else
+ f = e << (DIV_LUT_BITS - *shift);
+ assert(f <= DIV_LUT_NUM);
+ *shift += DIV_LUT_PREC_BITS;
+ // Use f as lookup into the precomputed table of multipliers
+ return div_lut[f];
+}
+
+static int is_affine_valid(const WarpedMotionParams *const wm) {
+ const int32_t *mat = wm->wmmat;
+ return (mat[2] > 0);
+}
+
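+// alpha, beta, gamma and delta are in units of 1 / (1 << WARPEDMODEL_PREC_BITS);
+// the warp filter derivation above av1_warp_affine_c requires roughly
+// 4*|alpha| + 7*|beta| and 4*|gamma| + 4*|delta| to stay below 1 in those units.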
+static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
+ int16_t delta) {
+ if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
+ (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
+ return 0;
+ else
+ return 1;
+}
+
+// Returns 1 on success or 0 on an invalid affine set
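+// With (a, b, c, d) = (wmmat[2], wmmat[3], wmmat[4], wmmat[5]) in WARPEDMODEL
+// fixed point, this computes alpha = a - (1 << WARPEDMODEL_PREC_BITS),
+// beta = b, gamma ~= (c << WARPEDMODEL_PREC_BITS) / a and
+// delta ~= d - b * c / a - (1 << WARPEDMODEL_PREC_BITS), matching the shear
+// factorization described above av1_warp_affine_c.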
+int get_shear_params(WarpedMotionParams *wm) {
+ const int32_t *mat = wm->wmmat;
+ if (!is_affine_valid(wm)) return 0;
+ wm->alpha =
+ clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
+ wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
+ int16_t shift;
+ int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
+ int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
+ wm->gamma =
+ clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
+ v = ((int64_t)mat[3] * mat[4]) * y;
+ wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
+ (1 << WARPEDMODEL_PREC_BITS),
+ INT16_MIN, INT16_MAX);
+
+ wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+
+ if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
+ return 0;
+
+ return 1;
+}
+
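+// Linearly interpolates the 8-bit error_measure_lut for higher bit depths:
+// the top 8 bits of |err| select two adjacent entries and the low (bd - 8)
+// bits act as the interpolation weight (the result is scaled by 1 << (bd - 8)).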
+static INLINE int highbd_error_measure(int err, int bd) {
+ const int b = bd - 8;
+ const int bmask = (1 << b) - 1;
+ const int v = (1 << b);
+ err = abs(err);
+ const int e1 = err >> b;
+ const int e2 = err & bmask;
+ return error_measure_lut[255 + e1] * (v - e2) +
+ error_measure_lut[256 + e1] * e2;
+}
+
+/* Note: For an explanation of the warp algorithm, and some notes on bit widths
+ for hardware implementations, see the comments above av1_warp_affine_c
+*/
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride, uint16_t *pred,
+ int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x,
+ int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ int32_t tmp[15 * 8];
+ const int reduce_bits_horiz =
+ conv_params->round_0 +
+ AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ (void)max_bits_horiz;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ for (int i = p_row; i < p_row + p_height; i += 8) {
+ for (int j = p_col; j < p_col + p_width; j += 8) {
+ // Calculate the center of this 8x8 block,
+ // project to luma coordinates (if in a subsampled chroma plane),
+ // apply the affine transformation,
+ // then convert back to the original coordinates (if necessary)
+ const int32_t src_x = (j + 4) << subsampling_x;
+ const int32_t src_y = (i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4);
+ sy4 += gamma * (-4) + delta * (-4);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ for (int k = -7; k < 8; ++k) {
+ const int iy = clamp(iy4 + k, 0, height - 1);
+
+ int sx = sx4 + beta * (k + 4);
+ for (int l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_horiz;
+ for (int m = 0; m < 8; ++m) {
+ const int sample_x = clamp(ix + m, 0, width - 1);
+ sum += ref[iy * stride + sample_x] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+ assert(0 <= sum && sum < (1 << max_bits_horiz));
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+ sx += alpha;
+ }
+ }
+
+ // Vertical filter
+ for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_vert;
+ for (int m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+
+ if (conv_params->is_compound) {
+ CONV_BUF_TYPE *p =
+ &conv_params
+ ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+ (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ if (conv_params->do_average) {
+ uint16_t *dst16 =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ int32_t tmp32 = *p;
+ if (conv_params->use_jnt_comp_avg) {
+ tmp32 = tmp32 * conv_params->fwd_offset +
+ sum * conv_params->bck_offset;
+ tmp32 = tmp32 >> DIST_PRECISION_BITS;
+ } else {
+ tmp32 += sum;
+ tmp32 = tmp32 >> 1;
+ }
+ tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ *dst16 =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
+ } else {
+ *p = sum;
+ }
+ } else {
+ uint16_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ assert(0 <= sum && sum < (1 << (bd + 2)));
+ *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
+ }
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
+static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
+ int width, int height, int stride,
+ const uint8_t *const pred8, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params) {
+ assert(wm->wmtype <= AFFINE);
+ if (wm->wmtype == ROTZOOM) {
+ wm->wmmat[5] = wm->wmmat[2];
+ wm->wmmat[4] = -wm->wmmat[3];
+ }
+ const int32_t *const mat = wm->wmmat;
+ const int16_t alpha = wm->alpha;
+ const int16_t beta = wm->beta;
+ const int16_t gamma = wm->gamma;
+ const int16_t delta = wm->delta;
+
+ const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, bd, conv_params, alpha, beta, gamma,
+ delta);
+}
+
+static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
+ const uint16_t *const dst, int p_width,
+ int p_height, int p_stride, int bd) {
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sum_error +=
+ highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
+ }
+ }
+ return sum_error;
+}
+
+static int64_t highbd_warp_error(
+ WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
+ int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
+ int64_t best_error) {
+ int64_t gm_sumerr = 0;
+ const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ conv_params.use_jnt_comp_avg = 0;
+ for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+ // avoid warping extra 8x8 blocks in the padded region of the frame
+ // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
+ const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
+ const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
+ highbd_warp_plane(wm, ref8, width, height, stride,
+ CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
+ WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
+ &conv_params);
+
+ gm_sumerr += highbd_frame_error(
+ tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
+ warp_w, warp_h, p_stride, bd);
+ if (gm_sumerr > best_error) return gm_sumerr;
+ }
+ }
+ return gm_sumerr;
+}
+
+static INLINE int error_measure(int err) {
+ return error_measure_lut[255 + err];
+}
+
+/* The warp filter for ROTZOOM and AFFINE models works as follows:
+ * Split the input into 8x8 blocks
+ * For each block, project the point (4, 4) within the block, to get the
+ overall block position. Split into integer and fractional coordinates,
+ maintaining full WARPEDMODEL precision
+ * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
+ variable horizontal offset. This means that, while the rows of the
+ intermediate buffer align with the rows of the *reference* image, the
+ columns align with the columns of the *destination* image.
+ * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
+ destination is too small we crop the output at this stage). Each pixel has
+ a variable vertical offset, so that the resulting rows are aligned with
+ the rows of the destination image.
+
+ To accomplish these alignments, we factor the warp matrix as a
+ product of two shear / asymmetric zoom matrices:
+     / a b \  = /     1       0     \ * / 1+alpha   beta \
+     \ c d /    \  gamma   1+delta  /   \    0       1   /
+ where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+ The horizontal shear (with alpha and beta) is applied first,
+ then the vertical shear (with gamma and delta) is applied second.
+
+ The only limitation is that, to fit this in a fixed 8-tap filter size,
+ the fractional pixel offsets must be at most +-1. Since the horizontal filter
+ generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
+ within the block, the parameters must satisfy
+ 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1
+ for this filter to be applicable.
+
+ Note: This function assumes that the caller has done all of the relevant
+ checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+ are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+ gamma, delta are all in range.
+
+ TODO(david.barker): Maybe support scaled references?
+*/
+/* A note on hardware implementation:
+ The warp filter is intended to be implementable using the same hardware as
+ the high-precision convolve filters from the loop-restoration and
+ convolve-round experiments.
+
+ For a single filter stage, considering all of the coefficient sets for the
+ warp filter and the regular convolution filter, an input in the range
+ [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
+ before rounding.
+
+ Allowing for some changes to the filter coefficient sets, call the range
+ [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
+ we can replace this by the range [0, 256 * 2^k], which can be stored in an
+ unsigned value with 8 + k bits.
+
+ This allows the derivation of the appropriate bit widths and offsets for
+ the various intermediate values: If
+
+ F := FILTER_BITS = 7 (or else the above ranges need adjusting)
+ So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
+ intermediate value.
+ H := ROUND0_BITS
+ V := VERSHEAR_REDUCE_PREC_BITS
+ (and note that we must have H + V = 2*F for the output to have the same
+ scale as the input)
+
+ then we end up with the following offsets and ranges:
+ Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
+ uint{bd + F + 1}
+ After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
+ Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
+ uint{bd + 2*F + 2 - H}
+ After rounding: The final value, before undoing the offset, fits into a
+ uint{bd + 2}.
+
+ Then we need to undo the offsets before clamping to a pixel. Note that,
+ if we do this at the end, the amount to subtract is actually independent
+ of H and V:
+
+ offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
+ (1 << ((bd + 2*F - H) - V))
+ == (1 << (bd - 1)) + (1 << bd)
+
+ This allows us to entirely avoid clamping in both the warp filter and
+ the convolve-round experiment. As of the time of writing, the Wiener filter
+ from loop-restoration can encode a central coefficient up to 216, which
+ leads to a maximum value of about 282 * 2^k after applying the offset.
+ So in that case we still need to clamp.
+*/
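+/* Worked example for bd = 8, F = 7: the horizontal stage adds an offset of
+   1 << (8 + 7 - 1) = 1 << 14, and the final amount to subtract is
+   (1 << 7) + (1 << 8) = 384, which is exactly the
+   "sum - (1 << (bd - 1)) - (1 << bd)" term in av1_warp_affine_c below. */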
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta) {
+ int32_t tmp[15 * 8];
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ (void)max_bits_horiz;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ for (int i = p_row; i < p_row + p_height; i += 8) {
+ for (int j = p_col; j < p_col + p_width; j += 8) {
+ // Calculate the center of this 8x8 block,
+ // project to luma coordinates (if in a subsampled chroma plane),
+ // apply the affine transformation,
+ // then convert back to the original coordinates (if necessary)
+ const int32_t src_x = (j + 4) << subsampling_x;
+ const int32_t src_y = (i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4);
+ sy4 += gamma * (-4) + delta * (-4);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ for (int k = -7; k < 8; ++k) {
+ // Clamp to top/bottom edge of the frame
+ const int iy = clamp(iy4 + k, 0, height - 1);
+
+ int sx = sx4 + beta * (k + 4);
+
+ for (int l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ // At this point, sx = sx4 + alpha * l + beta * k
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_horiz;
+ for (int m = 0; m < 8; ++m) {
+ // Clamp to left/right edge of the frame
+ const int sample_x = clamp(ix + m, 0, width - 1);
+
+ sum += ref[iy * stride + sample_x] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+ assert(0 <= sum && sum < (1 << max_bits_horiz));
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+ sx += alpha;
+ }
+ }
+
+ // Vertical filter
+ for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+ // At this point, sy = sy4 + gamma * l + delta * k
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_vert;
+ for (int m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+
+ if (conv_params->is_compound) {
+ CONV_BUF_TYPE *p =
+ &conv_params
+ ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+ (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ if (conv_params->do_average) {
+ uint8_t *dst8 =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ int32_t tmp32 = *p;
+ if (conv_params->use_jnt_comp_avg) {
+ tmp32 = tmp32 * conv_params->fwd_offset +
+ sum * conv_params->bck_offset;
+ tmp32 = tmp32 >> DIST_PRECISION_BITS;
+ } else {
+ tmp32 += sum;
+ tmp32 = tmp32 >> 1;
+ }
+ tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
+ } else {
+ *p = sum;
+ }
+ } else {
+ uint8_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ assert(0 <= sum && sum < (1 << (bd + 2)));
+ *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
+ }
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
+static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
+ int width, int height, int stride, uint8_t *pred,
+ int p_col, int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params) {
+ assert(wm->wmtype <= AFFINE);
+ if (wm->wmtype == ROTZOOM) {
+ wm->wmmat[5] = wm->wmmat[2];
+ wm->wmmat[4] = -wm->wmmat[3];
+ }
+ const int32_t *const mat = wm->wmmat;
+ const int16_t alpha = wm->alpha;
+ const int16_t beta = wm->beta;
+ const int16_t gamma = wm->gamma;
+ const int16_t delta = wm->delta;
+ av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y, conv_params,
+ alpha, beta, gamma, delta);
+}
+
+static int64_t frame_error(const uint8_t *const ref, int stride,
+ const uint8_t *const dst, int p_width, int p_height,
+ int p_stride) {
+ int64_t sum_error = 0;
+ for (int i = 0; i < p_height; ++i) {
+ for (int j = 0; j < p_width; ++j) {
+ sum_error +=
+ (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
+ }
+ }
+ return sum_error;
+}
+
+static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
+ int width, int height, int stride,
+ const uint8_t *const dst, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ int64_t best_error) {
+ int64_t gm_sumerr = 0;
+ int warp_w, warp_h;
+ int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+ int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+ uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+ ConvolveParams conv_params = get_conv_params(0, 0, 8);
+ conv_params.use_jnt_comp_avg = 0;
+
+ for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
+ for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
+ // avoid warping extra 8x8 blocks in the padded region of the frame
+ // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
+ warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
+ warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
+ warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
+ WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
+
+ gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
+ warp_w, warp_h, p_stride);
+ if (gm_sumerr > best_error) return gm_sumerr;
+ }
+ }
+ return gm_sumerr;
+}
+
+int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
+ uint8_t *dst, int p_width, int p_height, int p_stride) {
+ if (use_hbd) {
+ return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
+ CONVERT_TO_SHORTPTR(dst), p_width, p_height,
+ p_stride, bd);
+ }
+ return frame_error(ref, stride, dst, p_width, p_height, p_stride);
+}
+
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int width, int height, int stride,
+ uint8_t *dst, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int64_t best_error) {
+ if (wm->wmtype <= AFFINE)
+ if (!get_shear_params(wm)) return 1;
+ if (use_hbd)
+ return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, bd, best_error);
+ return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y,
+ best_error);
+}
+
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params) {
+ if (use_hbd)
+ highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x, subsampling_y,
+ bd, conv_params);
+ else
+ warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+}
+
+#define LS_MV_MAX 256 // max mv in 1/8-pel
+// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
+#define LS_STEP 8
+
+// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
+// the precision needed is:
+// (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
+// (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
+// 1 [for sign] +
+// LEAST_SQUARES_SAMPLES_MAX_BITS
+// [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
+// The value is 23
+#define LS_MAT_RANGE_BITS \
+ ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
+
+// Bit-depth reduction from the full-range
+#define LS_MAT_DOWN_BITS 2
+
+// bits range of A, Bx and By after downshifting
+#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
+#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
+#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
+
+// By setting LS_STEP = 8, the least significant 2 bits of every element in A,
+// Bx and By are 0, so we can reduce LS_MAT_RANGE_BITS by 2 bits here.
+#define LS_SQUARE(a) \
+ (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+ (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT1(a, b) \
+ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
+ (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT2(a, b) \
+ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+ (2 + LS_MAT_DOWN_BITS))
+
+#define USE_LIMITED_PREC_MULT 0
+
+#if USE_LIMITED_PREC_MULT
+
+#define MUL_PREC_BITS 16
+static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
+ int msb = 0;
+ uint16_t mult = 0;
+ *shift = 0;
+ if (D != 0) {
+ msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
+ : get_msb((unsigned int)D));
+ if (msb >= MUL_PREC_BITS) {
+ mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
+ *shift = msb + 1 - MUL_PREC_BITS;
+ } else {
+ mult = (uint16_t)D;
+ *shift = 0;
+ }
+ }
+ return mult;
+}
+
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+ int16_t mshift;
+ uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+ int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+ shift -= mshift;
+ if (shift > 0) {
+ return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
+ -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ } else {
+ return (int32_t)clamp(v * (1 << (-shift)),
+ -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ }
+}
+
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+ int16_t mshift;
+ uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+ int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+ shift -= mshift;
+ if (shift > 0) {
+ return (int32_t)clamp(
+ ROUND_POWER_OF_TWO_SIGNED(v, shift),
+ (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ } else {
+ return (int32_t)clamp(
+ v * (1 << (-shift)),
+ (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+ }
+}
+
+#else
+
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+ int64_t v = Px * (int64_t)iDet;
+ return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+ -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+ int64_t v = Px * (int64_t)iDet;
+ return (int32_t)clamp64(
+ ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+ (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+ (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+#endif // USE_LIMITED_PREC_MULT
+
+static int find_affine_int(int np, const int *pts1, const int *pts2,
+ BLOCK_SIZE bsize, int mvy, int mvx,
+ WarpedMotionParams *wm, int mi_row, int mi_col) {
+ int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
+ int32_t Bx[2] = { 0, 0 };
+ int32_t By[2] = { 0, 0 };
+ int i;
+
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
+ const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
+ const int suy = rsuy * 8;
+ const int sux = rsux * 8;
+ const int duy = suy + mvy;
+ const int dux = sux + mvx;
+ const int isuy = (mi_row * MI_SIZE + rsuy);
+ const int isux = (mi_col * MI_SIZE + rsux);
+
+ // Assume the center pixel of the block has exactly the same motion vector
+ // as transmitted for the block. First shift the origin of the source
+ // points to the block center, and the origin of the destination points to
+ // the block center added to the motion vector transmitted.
+ // Let (xi, yi) denote the source points and (xi', yi') denote destination
+  // points after origin shifting, for i = 0, 1, 2, ..., n-1.
+  // Then if P = [x0, y0,
+  //              x1, y1,
+  //              x2, y2,
+ // ....
+ // ]
+ // q = [x0', x1', x2', ... ]'
+ // r = [y0', y1', y2', ... ]'
+ // the least squares problems that need to be solved are:
+ // [h1, h2]' = inv(P'P)P'q and
+ // [h3, h4]' = inv(P'P)P'r
+ // where the affine transformation is given by:
+ // x' = h1.x + h2.y
+ // y' = h3.x + h4.y
+ //
+ // The loop below computes: A = P'P, Bx = P'q, By = P'r
+ // We need to just compute inv(A).Bx and inv(A).By for the solutions.
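+  // Written out, with S() denoting a sum over the retained samples (and up
+  // to the LS_STEP rounding offsets built into the LS_* macros):
+  //   A  = [ S(x.x)  S(x.y) ]    Bx = [ S(x.x') ]    By = [ S(x.y') ]
+  //        [ S(x.y)  S(y.y) ]         [ S(y.x') ]         [ S(y.y') ]
+  // so [h1, h2]' = adj(A).Bx / det(A) and [h3, h4]' = adj(A).By / det(A),
+  // which is what the Det, Px and Py computations after the loop evaluate.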
+ // Contribution from neighbor block
+ for (i = 0; i < np; i++) {
+ const int dx = pts2[i * 2] - dux;
+ const int dy = pts2[i * 2 + 1] - duy;
+ const int sx = pts1[i * 2] - sux;
+ const int sy = pts1[i * 2 + 1] - suy;
+    // TODO(yunqing): This comparison wouldn't be necessary if the sample
+    // selection is done in find_samples(). Also, the global offset can be
+    // removed while collecting samples.
+ if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+ A[0][0] += LS_SQUARE(sx);
+ A[0][1] += LS_PRODUCT1(sx, sy);
+ A[1][1] += LS_SQUARE(sy);
+ Bx[0] += LS_PRODUCT2(sx, dx);
+ Bx[1] += LS_PRODUCT1(sy, dx);
+ By[0] += LS_PRODUCT1(sx, dy);
+ By[1] += LS_PRODUCT2(sy, dy);
+ }
+ }
+
+ // Just for debugging, and can be removed later.
+ assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
+ assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
+ assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
+ assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
+ assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
+ assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
+ assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
+
+ int64_t Det;
+ int16_t iDet, shift;
+
+ // Compute Determinant of A
+ Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+ if (Det == 0) return 1;
+ iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+ shift -= WARPEDMODEL_PREC_BITS;
+ if (shift < 0) {
+ iDet <<= (-shift);
+ shift = 0;
+ }
+
+ int64_t Px[2], Py[2];
+
+  // These, divided by Det, are the least squares solutions.
+ Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
+ Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
+ Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
+ Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
+
+ wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
+ wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
+ wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
+ wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
+
+  // Note: In the vx, vy expressions below, the max value of each of the
+  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
+  // for the first term so that the overall sum in the worst case fits
+  // within 32 bits.
+ int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+ (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+ isuy * wm->wmmat[3]);
+ int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+ (isux * wm->wmmat[4] +
+ isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+ wm->wmmat[0] =
+ clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+ wm->wmmat[1] =
+ clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+
+ wm->wmmat[6] = wm->wmmat[7] = 0;
+ return 0;
+}
+
+int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
+ int mvx, WarpedMotionParams *wm_params, int mi_row,
+ int mi_col) {
+ assert(wm_params->wmtype == AFFINE);
+
+ if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
+ mi_col))
+ return 1;
+
+ // check compatibility with the fast warp filter
+ if (!get_shear_params(wm_params)) return 1;
+
+ return 0;
+}
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
new file mode 100644
index 000000000..a1a4f067d
--- /dev/null
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/mv.h"
+#include "av1/common/convolve.h"
+
+#define MAX_PARAMDIM 9
+#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
+#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
+#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
+#define WARPED_MOTION_DEBUG 0
+#define DEFAULT_WMTYPE AFFINE
+
+extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+
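+// Padding index tables used by the SIMD warp filters to replicate the
+// nearest valid sample into positions that fall outside the visible frame
+// on the left/right side of a 16-sample row.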
+static const uint8_t warp_pad_left[14][16] = {
+ { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 },
+ { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 },
+ { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 },
+ { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 },
+ { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 },
+ { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 },
+};
+
+static const uint8_t warp_pad_right[14][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 },
+ { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 },
+ { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 },
+ { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 },
+ { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
+};
+
+// Returns the error between the result of applying motion 'wm' to the frame
+// described by 'ref' and the frame described by 'dst'.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int width, int height, int stride,
+ uint8_t *dst, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int64_t best_error);
+
+// Returns the error between the frame described by 'ref' and the frame
+// described by 'dst'.
+int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
+ uint8_t *dst, int p_width, int p_height, int p_stride);
+
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params);
+
+int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
+ int mvx, WarpedMotionParams *wm_params, int mi_row,
+ int mi_col);
+
+int get_shear_params(WarpedMotionParams *wm);
+#endif // AOM_AV1_COMMON_WARPED_MOTION_H_
diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
new file mode 100644
index 000000000..8aa14696f
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const uint8_t *src_y;
+ uint8_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint8_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 8-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_8 = xx_loadl_64(src_x0);
+ const __m128i src1_8 = xx_loadl_64(src_x1);
+ const __m128i src2_8 = xx_loadl_64(src_x2);
+ const __m128i src3_8 = xx_loadl_64(src_x3);
+
+ // Now zero-extend up to 16-bit precision, i.e.
+ // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+ const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+ const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+ const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+ const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Pack 16-bit values into 8-bit values, i.e.
+ // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
+ // -> [ 0 0 0 0 0 0 DC BA ]
+ const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
+
+ // Write to the output
+ xx_storel_32(&dst_y[x], shifted_8);
+ }
+ }
+}
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ int x0_qn, int x_step_qn, int bd) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
+
+ const uint16_t *src_y;
+ uint16_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint16_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 16-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_16 = xx_loadu_128(src_x0);
+ const __m128i src1_16 = xx_loadu_128(src_x1);
+ const __m128i src2_16 = xx_loadu_128(src_x2);
+ const __m128i src3_16 = xx_loadu_128(src_x3);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Clip the values at (1 << bd) - 1
+ const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+ // Write to the output
+ xx_storel_64(&dst_y[x], clipped_16);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
new file mode 100644
index 000000000..d9fb53785
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
+ int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params, unsigned round) {
+ const int bd = 8;
+ const int ntaps = 8;
+
+ src -= ntaps / 2 - 1;
+
+ int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+ const __m128i round_add = _mm_set1_epi32(round_add32);
+ const __m128i round_shift = _mm_cvtsi32_si128(round);
+
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ // Load the filter coefficients
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+ const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
+
+ int y;
+ for (y = 0; y <= h - 4; y += 4) {
+ const uint8_t *const src0 = src_col + y * src_stride;
+ const uint8_t *const src1 = src0 + 1 * src_stride;
+ const uint8_t *const src2 = src0 + 2 * src_stride;
+ const uint8_t *const src3 = src0 + 3 * src_stride;
+
+ // Load up source data. This is 8-bit input data; each load is just
+ // loading the lower half of the register and gets 8 pixels
+ const __m128i data08 = _mm_loadl_epi64((__m128i *)src0);
+ const __m128i data18 = _mm_loadl_epi64((__m128i *)src1);
+ const __m128i data28 = _mm_loadl_epi64((__m128i *)src2);
+ const __m128i data38 = _mm_loadl_epi64((__m128i *)src3);
+
+ // Now zero-extend up to 16-bit precision by interleaving with
+ // zeros. Drop the upper half of each register (which just had zeros)
+ const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
+ const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
+ const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
+ const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
+
+ // Multiply by coefficients
+ const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+ const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+ const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+ const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+
+ // Reduce horizontally and add
+ const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+ const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+ const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+
+ // Divide down by (1 << round), rounding to nearest.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+ shifted = _mm_packus_epi32(shifted, shifted);
+ // Write transposed to the output
+ _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
+ }
+ for (; y < h; ++y) {
+ const uint8_t *const src_row = src_col + y * src_stride;
+
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < ntaps; ++k) {
+ sum += filter[k] * src_row[k];
+ }
+
+ dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+ }
+ }
+}
+
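+// Multiply eight 16-bit samples by eight 16-bit filter taps and sum adjacent
+// pairs of products, leaving four 32-bit partial sums that still need a
+// horizontal reduction.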
+static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
+ __m128i data = _mm_loadu_si128((__m128i *)src);
+ return _mm_madd_epi16(data, coeff);
+}
+
+// A specialised version of vfilter, the vertical filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h, int subpel_y_qn,
+ int y_step_qn, const InterpFilterParams *filter_params,
+ const ConvolveParams *conv_params, int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int ntaps = 8;
+
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ const __m128i sub = _mm_set1_epi16(sub32);
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+ const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
+ const __m128i round_shift_add =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+ int x;
+ for (x = 0; x <= w - 4; x += 4) {
+ const int16_t *const src0 = src_y + x * src_stride;
+ const int16_t *const src1 = src0 + 1 * src_stride;
+ const int16_t *const src2 = src0 + 2 * src_stride;
+ const int16_t *const src3 = src0 + 3 * src_stride;
+
+      // Load the source data for the four output pixels and convolve each
+      // with the filter coefficients, giving one register of four 32-bit
+      // partial sums per pixel (conv0..conv3).
+ const __m128i conv0 = convolve_16_8(src0, coeff0716);
+ const __m128i conv1 = convolve_16_8(src1, coeff0716);
+ const __m128i conv2 = convolve_16_8(src2, coeff0716);
+ const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+ // Now reduce horizontally to get one lane for each result
+ const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+ const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+ __m128i conv = _mm_hadd_epi32(conv01, conv23);
+
+ conv = _mm_add_epi32(conv, res_add_const);
+      // Divide down by (1 << round_1), rounding to nearest; the sub32 offset
+      // is subtracted further below.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+ uint8_t *dst_x = dst + y * dst_stride + x;
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+ __m128i result;
+ __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
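+          // Combine with the previously stored prediction: either a
+          // distance-weighted (jnt) blend using the fwd/bck offsets scaled
+          // down by DIST_PRECISION_BITS, or a simple average.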
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
+ }
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
+ } else {
+ _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+ }
+ } else {
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
+ }
+ }
+ for (; x < w; ++x) {
+ const int16_t *src_x = src_y + x * src_stride;
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - sub32;
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ }
+ }
+ }
+}
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params) {
+ // TODO(yaowu): remove unnecessary initializations
+ int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 };
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+
+ const int xtaps = filter_params_x->taps;
+ const int ytaps = filter_params_y->taps;
+ const int fo_vert = ytaps / 2 - 1;
+ assert((xtaps == 8) && (ytaps == 8));
+ (void)xtaps;
+
+ // horizontal filter
+ hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
+ x_step_qn, filter_params_x, conv_params->round_0);
+
+ // vertical filter (input is transposed)
+ vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params, 8);
+}
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
+ int w, int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params,
+ unsigned round, int bd) {
+ const int ntaps = 8;
+
+ src -= ntaps / 2 - 1;
+
+ int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+ const __m128i round_add = _mm_set1_epi32(round_add32);
+ const __m128i round_shift = _mm_cvtsi32_si128(round);
+
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ // Load the filter coefficients
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+
+ int y;
+ for (y = 0; y <= h - 4; y += 4) {
+ const uint16_t *const src0 = src_col + y * src_stride;
+ const uint16_t *const src1 = src0 + 1 * src_stride;
+ const uint16_t *const src2 = src0 + 2 * src_stride;
+ const uint16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load up source data. This is 16-bit input data, so each load gets the 8
+ // pixels we need.
+ const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
+ const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
+ const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
+ const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
+
+ // Multiply by coefficients
+ const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+ const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+ const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+ const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+
+ // Reduce horizontally and add
+ const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+ const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+ const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+
+ // Divide down by (1 << round), rounding to nearest.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+ shifted = _mm_packus_epi32(shifted, shifted);
+ // Write transposed to the output
+ _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
+ }
+ for (; y < h; ++y) {
+ const uint16_t *const src_row = src_col + y * src_stride;
+
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < ntaps; ++k) {
+ sum += filter[k] * src_row[k];
+ }
+
+ dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+ }
+ }
+}
+// A specialised version of vfilter, the vertical filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
+ int dst_stride, int w, int h, int subpel_y_qn,
+ int y_step_qn,
+ const InterpFilterParams *filter_params,
+ const ConvolveParams *conv_params, int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int ntaps = 8;
+
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ const __m128i sub = _mm_set1_epi32(sub32);
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const __m128i clip_pixel_ =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+ const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
+ const __m128i round_shift_add =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+ int x;
+ for (x = 0; x <= w - 4; x += 4) {
+ const int16_t *const src0 = src_y + x * src_stride;
+ const int16_t *const src1 = src0 + 1 * src_stride;
+ const int16_t *const src2 = src0 + 2 * src_stride;
+ const int16_t *const src3 = src0 + 3 * src_stride;
+
+      // Load the source data for the four output pixels and convolve each
+      // with the filter coefficients, giving one register of four 32-bit
+      // partial sums per pixel (conv0..conv3).
+ const __m128i conv0 = convolve_16_8(src0, coeff0716);
+ const __m128i conv1 = convolve_16_8(src1, coeff0716);
+ const __m128i conv2 = convolve_16_8(src2, coeff0716);
+ const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+ // Now reduce horizontally to get one lane for each result
+ const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+ const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+ __m128i conv = _mm_hadd_epi32(conv01, conv23);
+ conv = _mm_add_epi32(conv, res_add_const);
+
+      // Divide down by (1 << round_1), rounding to nearest; the sub32 offset
+      // is subtracted further below.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+ uint16_t *dst_x = dst + y * dst_stride + x;
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+
+ __m128i result;
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ __m128i p_32 =
+ _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
+
+ if (conv_params->use_jnt_comp_avg) {
+ shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+ _mm_mullo_epi32(shifted, wt1));
+ shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
+ } else {
+ shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
+ }
+ __m128i res32 = _mm_sub_epi32(shifted, sub);
+ res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const),
+ round_bits_shift);
+
+ __m128i res16 = _mm_packus_epi32(res32, res32);
+ res16 = _mm_min_epi16(res16, clip_pixel_);
+ _mm_storel_epi64((__m128i *)dst_x, res16);
+ } else {
+ __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+ _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+ }
+ } else {
+ const __m128i subbed = _mm_sub_epi32(shifted, sub);
+ result = _mm_sra_epi16(_mm_add_epi32(subbed, bits_const), bits_shift);
+ result = _mm_packus_epi32(result, result);
+ result = _mm_min_epi16(result, clip_pixel_);
+ _mm_storel_epi64((__m128i *)dst_x, result);
+ }
+ }
+
+ for (; x < w; ++x) {
+ const int16_t *src_x = src_y + x * src_stride;
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_2d_scale_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+ const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd) {
+ // TODO(yaowu): Move this out of stack
+ DECLARE_ALIGNED(16, int16_t,
+ tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
+ filter_params_y->taps;
+ const int xtaps = filter_params_x->taps;
+ const int ytaps = filter_params_y->taps;
+ const int fo_vert = ytaps / 2 - 1;
+
+ memset(tmp, 0, sizeof(tmp));
+ assert((xtaps == 8) && (ytaps == 8));
+ (void)xtaps;
+
+ // horizontal filter
+ highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
+ subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0,
+ bd);
+
+ // vertical filter (input is transposed)
+ highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+ filter_params_y, conv_params, bd);
+}
diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
new file mode 100644
index 000000000..212d3bd72
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/filter.h"
+
+typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
+ int src_stride, uint16_t *dst, int dst_stride,
+ int bd);
+
+// pixelsNum 0: write all 4 pixels
+// 1/2/3: residual pixels 1/2/3
+static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
+ int dst_stride) {
+ if (2 == width) {
+ if (0 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+ *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
+ } else if (1 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ } else if (2 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ } else if (3 == pixelsNum) {
+ *(int *)dst = _mm_cvtsi128_si32(u[0]);
+ *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+ *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+ }
+ } else {
+ if (0 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
+ } else if (1 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ } else if (2 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ } else if (3 == pixelsNum) {
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+ _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+ }
+ }
+}
+
+// 16-bit pixels clip with bd (10/12)
+static void highbd_clip(__m128i *p, int numVecs, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+ int i;
+
+ for (i = 0; i < numVecs; i++) {
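+  // Branchless clamp of every 16-bit lane to the range [0, (1 << bd) - 1].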
+ mask = _mm_cmpgt_epi16(p[i], max);
+ clamped = _mm_andnot_si128(mask, p[i]);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ p[i] = _mm_and_si128(clamped, mask);
+ }
+}
+
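+// Round and downshift a 4x4 block of 32-bit filter sums by FILTER_BITS, clip
+// the results to the bit depth, and transpose so that each of u[0..3] holds
+// one column of the block in its low 64 bits.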
+static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
+ __m128i v0, v1;
+ __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+
+ u[0] = _mm_loadu_si128((__m128i const *)src);
+ u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+ u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ u[0] = _mm_add_epi32(u[0], rnd);
+ u[1] = _mm_add_epi32(u[1], rnd);
+ u[2] = _mm_add_epi32(u[2], rnd);
+ u[3] = _mm_add_epi32(u[3], rnd);
+
+ u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+ u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
+ u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
+ u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
+
+ u[0] = _mm_packus_epi32(u[0], u[1]);
+ u[1] = _mm_packus_epi32(u[2], u[3]);
+
+ highbd_clip(u, 2, bd);
+
+ v0 = _mm_unpacklo_epi16(u[0], u[1]);
+ v1 = _mm_unpackhi_epi16(u[0], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(v0, v1);
+ u[2] = _mm_unpackhi_epi16(v0, v1);
+
+ u[1] = _mm_srli_si128(u[0], 8);
+ u[3] = _mm_srli_si128(u[2], 8);
+}
+
+// pixelsNum = 0 : all 4 rows of pixels will be saved.
+// pixelsNum = 1/2/3 : residual 1/2/3 rows of pixels will be saved.
+void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int bd) {
+ __m128i u[4];
+ transClipPixel(src, src_stride, u, bd);
+ writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
+ int src_stride, uint16_t *dst, int dst_stride,
+ int bd) {
+ __m128i u[4], v[4];
+ const __m128i ones = _mm_set1_epi16(1);
+
+ transClipPixel(src, src_stride, u, bd);
+
+ v[0] = _mm_loadl_epi64((__m128i const *)dst);
+ v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+ v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+ v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+ u[0] = _mm_add_epi16(u[0], v[0]);
+ u[1] = _mm_add_epi16(u[1], v[1]);
+ u[2] = _mm_add_epi16(u[2], v[2]);
+ u[3] = _mm_add_epi16(u[3], v[3]);
+
+ u[0] = _mm_add_epi16(u[0], ones);
+ u[1] = _mm_add_epi16(u[1], ones);
+ u[2] = _mm_add_epi16(u[2], ones);
+ u[3] = _mm_add_epi16(u[3], ones);
+
+ u[0] = _mm_srai_epi16(u[0], 1);
+ u[1] = _mm_srai_epi16(u[1], 1);
+ u[2] = _mm_srai_epi16(u[2], 1);
+ u[3] = _mm_srai_epi16(u[3], 1);
+
+ writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+// Vertical convolutional filter
+
+typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
+
+static void highbdRndingPacks(__m128i *u) {
+ __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+ u[0] = _mm_add_epi32(u[0], rnd);
+ u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+ u[0] = _mm_packus_epi32(u[0], u[0]);
+}
+
+static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
+}
+
+static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+ __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+ const __m128i ones = _mm_set1_epi16(1);
+
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+
+ v = _mm_add_epi16(v, u[0]);
+ v = _mm_add_epi16(v, ones);
+ v = _mm_srai_epi16(v, 1);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(v);
+}
+
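+// Index 0 overwrites dst with the new pixels; index 1 averages them with the
+// pixels already stored in dst.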
+WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum };
+
+static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+ _mm_storel_epi64((__m128i *)dst, u[0]);
+}
+
+static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+ __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+ const __m128i ones = _mm_set1_epi16(1);
+
+ highbdRndingPacks(u);
+ highbd_clip(u, 1, bd);
+
+ v = _mm_add_epi16(v, u[0]);
+ v = _mm_add_epi16(v, ones);
+ v = _mm_srai_epi16(v, 1);
+ _mm_storel_epi64((__m128i *)dst, v);
+}
+
+WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
new file mode 100644
index 000000000..5db2ccf6c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -0,0 +1,1945 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_inv_txfm_avx2.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
+static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+}
+
+static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+}
+
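+// Final butterfly: each output pair (k, 15 - k) is the saturating sum and
+// difference of the corresponding pair of stage-6 results.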
+static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
+ btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
+}
+
+static void idct16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
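+  // (inputs are gathered in bit-reversed index order: 0, 8, 4, 12, ...)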
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = input[8];
+ x1[2] = input[4];
+ x1[3] = input[12];
+ x1[4] = input[2];
+ x1[5] = input[10];
+ x1[6] = input[6];
+ x1[7] = input[14];
+ x1[8] = input[1];
+ x1[9] = input[9];
+ x1[10] = input[5];
+ x1[11] = input[13];
+ x1[12] = input[3];
+ x1[13] = input[11];
+ x1[14] = input[7];
+ x1[15] = input[15];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 3
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ idct16_stage5_avx2(x1, cospi, _r, cos_bit);
+ idct16_stage6_avx2(x1, cospi, _r, cos_bit);
+ idct16_stage7_avx2(output, x1);
+}
+
+static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[2] = input[4];
+ x1[4] = input[2];
+ x1[6] = input[6];
+ x1[8] = input[1];
+ x1[10] = input[5];
+ x1[12] = input[3];
+ x1[14] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ idct16_stage5_avx2(x1, cospi, _r, cos_bit);
+ idct16_stage6_avx2(x1, cospi, _r, cos_bit);
+ idct16_stage7_avx2(output, x1);
+}
+
+static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x1[2];
+ x1[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
+
+ // stage 5
+ // stage 6
+ output[0] = x1[0];
+ output[1] = x1[1];
+ output[2] = x1[1];
+ output[3] = x1[0];
+ output[4] = x1[0];
+ output[5] = x1[1];
+ output[6] = x1[1];
+ output[7] = x1[0];
+ output[8] = x1[0];
+ output[9] = x1[1];
+ output[10] = x1[1];
+ output[11] = x1[0];
+ output[12] = x1[0];
+ output[13] = x1[1];
+ output[14] = x1[1];
+ output[15] = x1[0];
+}
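+
+ // The _low8 and _low1 variants above assume that only the first 8 of the
+ // 16 input vectors (or just the DC vector) can be nonzero, so butterflies
+ // whose second operand is known to be zero collapse into plain scalings via
+ // btf_16_w16_0_avx2(w0, w1, in, out0, out1), which from its usage appears
+ // to compute out0 = round(w0 * in) and out1 = round(w1 * in). In the
+ // DC-only case everything reduces to the cospi[32] scaling in stage 4, so
+ // x1[0] and x1[1] end up holding the same scaled DC value and the
+ // alternating output assignments simply mirror the add/sub pattern of the
+ // full transform's final stage.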
+
+static INLINE void iadst16_stage3_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(&x[0], &x[8]);
+ btf_16_adds_subs_avx2(&x[1], &x[9]);
+ btf_16_adds_subs_avx2(&x[2], &x[10]);
+ btf_16_adds_subs_avx2(&x[3], &x[11]);
+ btf_16_adds_subs_avx2(&x[4], &x[12]);
+ btf_16_adds_subs_avx2(&x[5], &x[13]);
+ btf_16_adds_subs_avx2(&x[6], &x[14]);
+ btf_16_adds_subs_avx2(&x[7], &x[15]);
+}
+
+static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
+}
+
+static INLINE void iadst16_stage5_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(&x[0], &x[4]);
+ btf_16_adds_subs_avx2(&x[1], &x[5]);
+ btf_16_adds_subs_avx2(&x[2], &x[6]);
+ btf_16_adds_subs_avx2(&x[3], &x[7]);
+ btf_16_adds_subs_avx2(&x[8], &x[12]);
+ btf_16_adds_subs_avx2(&x[9], &x[13]);
+ btf_16_adds_subs_avx2(&x[10], &x[14]);
+ btf_16_adds_subs_avx2(&x[11], &x[15]);
+}
+
+static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
+}
+
+static INLINE void iadst16_stage7_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(&x[0], &x[2]);
+ btf_16_adds_subs_avx2(&x[1], &x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[6]);
+ btf_16_adds_subs_avx2(&x[5], &x[7]);
+ btf_16_adds_subs_avx2(&x[8], &x[10]);
+ btf_16_adds_subs_avx2(&x[9], &x[11]);
+ btf_16_adds_subs_avx2(&x[12], &x[14]);
+ btf_16_adds_subs_avx2(&x[13], &x[15]);
+}
+
+static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+}
+
+static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
+ const __m256i __zero = _mm256_setzero_si256();
+ output[0] = x1[0];
+ output[1] = _mm256_subs_epi16(__zero, x1[8]);
+ output[2] = x1[12];
+ output[3] = _mm256_subs_epi16(__zero, x1[4]);
+ output[4] = x1[6];
+ output[5] = _mm256_subs_epi16(__zero, x1[14]);
+ output[6] = x1[10];
+ output[7] = _mm256_subs_epi16(__zero, x1[2]);
+ output[8] = x1[3];
+ output[9] = _mm256_subs_epi16(__zero, x1[11]);
+ output[10] = x1[15];
+ output[11] = _mm256_subs_epi16(__zero, x1[7]);
+ output[12] = x1[5];
+ output[13] = _mm256_subs_epi16(__zero, x1[13]);
+ output[14] = x1[9];
+ output[15] = _mm256_subs_epi16(__zero, x1[1]);
+}
+
+static void iadst16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[15];
+ x1[1] = input[0];
+ x1[2] = input[13];
+ x1[3] = input[2];
+ x1[4] = input[11];
+ x1[5] = input[4];
+ x1[6] = input[9];
+ x1[7] = input[6];
+ x1[8] = input[7];
+ x1[9] = input[8];
+ x1[10] = input[5];
+ x1[11] = input[10];
+ x1[12] = input[3];
+ x1[13] = input[12];
+ x1[14] = input[1];
+ x1[15] = input[14];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+
+ iadst16_stage3_avx2(x1);
+ iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
+ iadst16_stage5_avx2(x1);
+ iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
+ iadst16_stage7_avx2(x1);
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
+ iadst16_stage9_avx2(output, x1);
+}
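+
+ // The 16-point inverse ADST is organized like the IDCT above: stage 2
+ // rotates the reordered inputs by the cospi[2]/cospi[62] ... cospi[58]/
+ // cospi[6] angle pairs, and stages 3-9 are factored into the
+ // iadst16_stageN_avx2 helpers so that the full, _low8 and _low1 variants
+ // can share them. Stage 9 also applies the final sign pattern by negating
+ // every odd output lane (computed as 0 - x).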
+
+static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x1[16];
+ x1[1] = input[0];
+ x1[3] = input[2];
+ x1[5] = input[4];
+ x1[7] = input[6];
+ x1[8] = input[7];
+ x1[10] = input[5];
+ x1[12] = input[3];
+ x1[14] = input[1];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+ btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
+ btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
+ btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
+ btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
+ btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
+ btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
+ btf_16_w16_0_avx2(cospi[58], cospi[6], x1[14], x1[14], x1[15]);
+
+ iadst16_stage3_avx2(x1);
+ iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
+ iadst16_stage5_avx2(x1);
+ iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
+ iadst16_stage7_avx2(x1);
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
+ iadst16_stage9_avx2(output, x1);
+}
+
+static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[1] = input[0];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+
+ // stage 3
+ x1[8] = x1[0];
+ x1[9] = x1[1];
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+
+ // stage 5
+ x1[4] = x1[0];
+ x1[5] = x1[1];
+
+ x1[12] = x1[8];
+ x1[13] = x1[9];
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+
+ // stage 7
+ x1[2] = x1[0];
+ x1[3] = x1[1];
+ x1[6] = x1[4];
+ x1[7] = x1[5];
+ x1[10] = x1[8];
+ x1[11] = x1[9];
+ x1[14] = x1[12];
+ x1[15] = x1[13];
+
+ iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
+ iadst16_stage9_avx2(output, x1);
+}
+
+static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
+}
+
+static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+}
+
+static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[19]);
+ btf_16_adds_subs_avx2(&x[17], &x[18]);
+ btf_16_adds_subs_avx2(&x[23], &x[20]);
+ btf_16_adds_subs_avx2(&x[22], &x[21]);
+ btf_16_adds_subs_avx2(&x[24], &x[27]);
+ btf_16_adds_subs_avx2(&x[25], &x[26]);
+ btf_16_adds_subs_avx2(&x[31], &x[28]);
+ btf_16_adds_subs_avx2(&x[30], &x[29]);
+}
+
+static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[16], &x[23]);
+ btf_16_adds_subs_avx2(&x[17], &x[22]);
+ btf_16_adds_subs_avx2(&x[18], &x[21]);
+ btf_16_adds_subs_avx2(&x[19], &x[20]);
+ btf_16_adds_subs_avx2(&x[31], &x[24]);
+ btf_16_adds_subs_avx2(&x[30], &x[25]);
+ btf_16_adds_subs_avx2(&x[29], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[27]);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[15]);
+ btf_16_adds_subs_avx2(&x[1], &x[14]);
+ btf_16_adds_subs_avx2(&x[2], &x[13]);
+ btf_16_adds_subs_avx2(&x[3], &x[12]);
+ btf_16_adds_subs_avx2(&x[4], &x[11]);
+ btf_16_adds_subs_avx2(&x[5], &x[10]);
+ btf_16_adds_subs_avx2(&x[6], &x[9]);
+ btf_16_adds_subs_avx2(&x[7], &x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
+ btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
+ btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
+ btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
+ btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
+ btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
+ btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
+ btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
+ btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
+ btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
+}
+
+static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ output[0] = x[0];
+ output[31] = x[0];
+ output[1] = x[1];
+ output[30] = x[1];
+ output[2] = x[1];
+ output[29] = x[1];
+ output[3] = x[0];
+ output[28] = x[0];
+ output[4] = x[0];
+ output[27] = x[0];
+ output[5] = x[1];
+ output[26] = x[1];
+ output[6] = x[1];
+ output[25] = x[1];
+ output[7] = x[0];
+ output[24] = x[0];
+ output[8] = x[0];
+ output[23] = x[0];
+ output[9] = x[1];
+ output[22] = x[1];
+ output[10] = x[1];
+ output[21] = x[1];
+ output[11] = x[0];
+ output[20] = x[0];
+ output[12] = x[0];
+ output[19] = x[0];
+ output[13] = x[1];
+ output[18] = x[1];
+ output[14] = x[1];
+ output[17] = x[1];
+ output[15] = x[0];
+ output[16] = x[0];
+}
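+
+ // As in idct16_low1_new_avx2(), a DC-only 32-point input reduces to the
+ // cospi[32] scaling in stage 5: x[0] and x[1] carry the same scaled DC
+ // value, so all 32 outputs are equal and the x[0]/x[1] alternation merely
+ // mirrors the add/sub structure of the full transform's last stages.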
+
+static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+ x[4] = input[4];
+ x[8] = input[2];
+ x[12] = input[6];
+ x[16] = input[1];
+ x[20] = input[5];
+ x[24] = input[3];
+ x[28] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
+
+ // stage 6
+ x[3] = x[0];
+ x[2] = x[1];
+ idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
+
+ idct32_stage7_avx2(x, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x, cospi, _r, cos_bit);
+ idct32_stage9_avx2(output, x);
+}
+
+static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+ x[2] = input[8];
+ x[4] = input[4];
+ x[6] = input[12];
+ x[8] = input[2];
+ x[10] = input[10];
+ x[12] = input[6];
+ x[14] = input[14];
+ x[16] = input[1];
+ x[18] = input[9];
+ x[20] = input[5];
+ x[22] = input[13];
+ x[24] = input[3];
+ x[26] = input[11];
+ x[28] = input[7];
+ x[30] = input[15];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ idct32_high16_stage3_avx2(x);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct32_high16_stage4_avx2(x, cospi, _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ idct32_high24_stage5_avx2(x, cospi, _r, cos_bit);
+
+ // stage 6
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ idct32_high28_stage6_avx2(x, cospi, _r, cos_bit);
+
+ idct32_stage7_avx2(x, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x, cospi, _r, cos_bit);
+ idct32_stage9_avx2(output, x);
+}
+
+static void idct32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)(cos_bit);
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m256i x1[32];
+ x1[0] = input[0];
+ x1[1] = input[16];
+ x1[2] = input[8];
+ x1[3] = input[24];
+ x1[4] = input[4];
+ x1[5] = input[20];
+ x1[6] = input[12];
+ x1[7] = input[28];
+ x1[8] = input[2];
+ x1[9] = input[18];
+ x1[10] = input[10];
+ x1[11] = input[26];
+ x1[12] = input[6];
+ x1[13] = input[22];
+ x1[14] = input[14];
+ x1[15] = input[30];
+ x1[16] = input[1];
+ x1[17] = input[17];
+ x1[18] = input[9];
+ x1[19] = input[25];
+ x1[20] = input[5];
+ x1[21] = input[21];
+ x1[22] = input[13];
+ x1[23] = input[29];
+ x1[24] = input[3];
+ x1[25] = input[19];
+ x1[26] = input[11];
+ x1[27] = input[27];
+ x1[28] = input[7];
+ x1[29] = input[23];
+ x1[30] = input[15];
+ x1[31] = input[31];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3
+ btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit);
+ idct32_high16_stage3_avx2(x1);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit);
+
+ // stage 6
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit);
+
+ idct32_stage7_avx2(x1, cospi, _r, cos_bit);
+ idct32_stage8_avx2(x1, cospi, _r, cos_bit);
+ idct32_stage9_avx2(output, x1);
+}
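+
+ // Stages 3-9 of the 32-point IDCT are shared between the full and
+ // reduced-eob variants through the idct32_*stage*_avx2 helpers above. The
+ // "highN" part of a helper's name indicates that it only touches the top N
+ // entries of the 32-element intermediate array, i.e. the portion of the
+ // stage that is identical across the variants; the callers handle the
+ // lower entries themselves with code specialized for how many of them can
+ // be nonzero. The idct64_*high*_avx2 helpers below follow the same
+ // convention for the 64-element case.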
+
+static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
+}
+
+static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[35]);
+ btf_16_adds_subs_avx2(&x[33], &x[34]);
+ btf_16_adds_subs_avx2(&x[39], &x[36]);
+ btf_16_adds_subs_avx2(&x[38], &x[37]);
+ btf_16_adds_subs_avx2(&x[40], &x[43]);
+ btf_16_adds_subs_avx2(&x[41], &x[42]);
+ btf_16_adds_subs_avx2(&x[47], &x[44]);
+ btf_16_adds_subs_avx2(&x[46], &x[45]);
+ btf_16_adds_subs_avx2(&x[48], &x[51]);
+ btf_16_adds_subs_avx2(&x[49], &x[50]);
+ btf_16_adds_subs_avx2(&x[55], &x[52]);
+ btf_16_adds_subs_avx2(&x[54], &x[53]);
+ btf_16_adds_subs_avx2(&x[56], &x[59]);
+ btf_16_adds_subs_avx2(&x[57], &x[58]);
+ btf_16_adds_subs_avx2(&x[63], &x[60]);
+ btf_16_adds_subs_avx2(&x[62], &x[61]);
+}
+
+static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
+}
+
+static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ btf_16_adds_subs_avx2(&x[16], &x[19]);
+ btf_16_adds_subs_avx2(&x[17], &x[18]);
+ btf_16_adds_subs_avx2(&x[23], &x[20]);
+ btf_16_adds_subs_avx2(&x[22], &x[21]);
+ btf_16_adds_subs_avx2(&x[24], &x[27]);
+ btf_16_adds_subs_avx2(&x[25], &x[26]);
+ btf_16_adds_subs_avx2(&x[31], &x[28]);
+ btf_16_adds_subs_avx2(&x[30], &x[29]);
+ idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
+}
+
+static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[39]);
+ btf_16_adds_subs_avx2(&x[33], &x[38]);
+ btf_16_adds_subs_avx2(&x[34], &x[37]);
+ btf_16_adds_subs_avx2(&x[35], &x[36]);
+ btf_16_adds_subs_avx2(&x[47], &x[40]);
+ btf_16_adds_subs_avx2(&x[46], &x[41]);
+ btf_16_adds_subs_avx2(&x[45], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[43]);
+ btf_16_adds_subs_avx2(&x[48], &x[55]);
+ btf_16_adds_subs_avx2(&x[49], &x[54]);
+ btf_16_adds_subs_avx2(&x[50], &x[53]);
+ btf_16_adds_subs_avx2(&x[51], &x[52]);
+ btf_16_adds_subs_avx2(&x[63], &x[56]);
+ btf_16_adds_subs_avx2(&x[62], &x[57]);
+ btf_16_adds_subs_avx2(&x[61], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[59]);
+}
+
+static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ btf_16_adds_subs_avx2(&x[16], &x[23]);
+ btf_16_adds_subs_avx2(&x[17], &x[22]);
+ btf_16_adds_subs_avx2(&x[18], &x[21]);
+ btf_16_adds_subs_avx2(&x[19], &x[20]);
+ btf_16_adds_subs_avx2(&x[31], &x[24]);
+ btf_16_adds_subs_avx2(&x[30], &x[25]);
+ btf_16_adds_subs_avx2(&x[29], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
+}
+
+static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[15]);
+ btf_16_adds_subs_avx2(&x[1], &x[14]);
+ btf_16_adds_subs_avx2(&x[2], &x[13]);
+ btf_16_adds_subs_avx2(&x[3], &x[12]);
+ btf_16_adds_subs_avx2(&x[4], &x[11]);
+ btf_16_adds_subs_avx2(&x[5], &x[10]);
+ btf_16_adds_subs_avx2(&x[6], &x[9]);
+ btf_16_adds_subs_avx2(&x[7], &x[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[32], &x[47]);
+ btf_16_adds_subs_avx2(&x[33], &x[46]);
+ btf_16_adds_subs_avx2(&x[34], &x[45]);
+ btf_16_adds_subs_avx2(&x[35], &x[44]);
+ btf_16_adds_subs_avx2(&x[36], &x[43]);
+ btf_16_adds_subs_avx2(&x[37], &x[42]);
+ btf_16_adds_subs_avx2(&x[38], &x[41]);
+ btf_16_adds_subs_avx2(&x[39], &x[40]);
+ btf_16_adds_subs_avx2(&x[63], &x[48]);
+ btf_16_adds_subs_avx2(&x[62], &x[49]);
+ btf_16_adds_subs_avx2(&x[61], &x[50]);
+ btf_16_adds_subs_avx2(&x[60], &x[51]);
+ btf_16_adds_subs_avx2(&x[59], &x[52]);
+ btf_16_adds_subs_avx2(&x[58], &x[53]);
+ btf_16_adds_subs_avx2(&x[57], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[55]);
+}
+
+static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
+ const __m256i _r, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_avx2(&x[0], &x[31]);
+ btf_16_adds_subs_avx2(&x[1], &x[30]);
+ btf_16_adds_subs_avx2(&x[2], &x[29]);
+ btf_16_adds_subs_avx2(&x[3], &x[28]);
+ btf_16_adds_subs_avx2(&x[4], &x[27]);
+ btf_16_adds_subs_avx2(&x[5], &x[26]);
+ btf_16_adds_subs_avx2(&x[6], &x[25]);
+ btf_16_adds_subs_avx2(&x[7], &x[24]);
+ btf_16_adds_subs_avx2(&x[8], &x[23]);
+ btf_16_adds_subs_avx2(&x[9], &x[22]);
+ btf_16_adds_subs_avx2(&x[10], &x[21]);
+ btf_16_adds_subs_avx2(&x[11], &x[20]);
+ btf_16_adds_subs_avx2(&x[12], &x[19]);
+ btf_16_adds_subs_avx2(&x[13], &x[18]);
+ btf_16_adds_subs_avx2(&x[14], &x[17]);
+ btf_16_adds_subs_avx2(&x[15], &x[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
+}
+
+static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
+ btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
+ btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
+ btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
+ btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
+ btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
+ btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
+ btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
+ btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
+ btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
+ btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
+ btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
+ btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
+ btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
+ btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
+ btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
+ btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
+ btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
+ btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
+ btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
+ btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
+ btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
+ btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
+ btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
+ btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
+ btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
+ btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
+ btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
+ btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
+ btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
+ btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
+ btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
+ btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
+}
+
+static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m256i x[32];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 7
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ output[0] = x[0];
+ output[63] = x[0];
+ output[1] = x[1];
+ output[62] = x[1];
+ output[2] = x[1];
+ output[61] = x[1];
+ output[3] = x[0];
+ output[60] = x[0];
+ output[4] = x[0];
+ output[59] = x[0];
+ output[5] = x[1];
+ output[58] = x[1];
+ output[6] = x[1];
+ output[57] = x[1];
+ output[7] = x[0];
+ output[56] = x[0];
+ output[8] = x[0];
+ output[55] = x[0];
+ output[9] = x[1];
+ output[54] = x[1];
+ output[10] = x[1];
+ output[53] = x[1];
+ output[11] = x[0];
+ output[52] = x[0];
+ output[12] = x[0];
+ output[51] = x[0];
+ output[13] = x[1];
+ output[50] = x[1];
+ output[14] = x[1];
+ output[49] = x[1];
+ output[15] = x[0];
+ output[48] = x[0];
+ output[16] = x[0];
+ output[47] = x[0];
+ output[17] = x[1];
+ output[46] = x[1];
+ output[18] = x[1];
+ output[45] = x[1];
+ output[19] = x[0];
+ output[44] = x[0];
+ output[20] = x[0];
+ output[43] = x[0];
+ output[21] = x[1];
+ output[42] = x[1];
+ output[22] = x[1];
+ output[41] = x[1];
+ output[23] = x[0];
+ output[40] = x[0];
+ output[24] = x[0];
+ output[39] = x[0];
+ output[25] = x[1];
+ output[38] = x[1];
+ output[26] = x[1];
+ output[37] = x[1];
+ output[27] = x[0];
+ output[36] = x[0];
+ output[28] = x[0];
+ output[35] = x[0];
+ output[29] = x[1];
+ output[34] = x[1];
+ output[30] = x[1];
+ output[33] = x[1];
+ output[31] = x[0];
+ output[32] = x[0];
+}
+
+static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
+
+ // stage 5
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 8
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
+ idct64_stage11_avx2(output, x);
+}
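+
+ // In the reduced-eob idct64 variants, a btf_16_adds_subs_avx2() pair whose
+ // second operand is known to be zero degenerates into a copy of the first
+ // operand, which is why the low-eob stages above replace those butterflies
+ // with plain x[a] = x[b] assignments and leave the rest of the work to the
+ // shared idct64_stage*_high*_avx2 helpers.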
+
+static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
+ idct64_stage11_avx2(output, x);
+}
+
+static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m256i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_avx2(&x[32], &x[33]);
+ btf_16_adds_subs_avx2(&x[35], &x[34]);
+ btf_16_adds_subs_avx2(&x[36], &x[37]);
+ btf_16_adds_subs_avx2(&x[39], &x[38]);
+ btf_16_adds_subs_avx2(&x[40], &x[41]);
+ btf_16_adds_subs_avx2(&x[43], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[45]);
+ btf_16_adds_subs_avx2(&x[47], &x[46]);
+ btf_16_adds_subs_avx2(&x[48], &x[49]);
+ btf_16_adds_subs_avx2(&x[51], &x[50]);
+ btf_16_adds_subs_avx2(&x[52], &x[53]);
+ btf_16_adds_subs_avx2(&x[55], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[57]);
+ btf_16_adds_subs_avx2(&x[59], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[61]);
+ btf_16_adds_subs_avx2(&x[63], &x[62]);
+
+ // stage 4
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
+ idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 6
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stages 9-11
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
+ idct64_stage11_avx2(output, x);
+}
+
+// 1D transform functions that process 16 pixels at a time.
+static const transform_1d_avx2
+ lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL },
+ { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2,
+ NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2,
+ idct32_new_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2,
+ idct64_low32_new_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
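+
+ // The table is indexed as [tx_size][1-D transform type][range index]. The
+ // last index picks a kernel specialized for how many leading input
+ // rows/columns can be nonzero (1, 8, 16 or all of them) and is derived
+ // from the eob via lowbd_txfm_all_1d_zeros_idx[] in the callers below. The
+ // NULL entries correspond to combinations this 16-wide path never selects,
+ // e.g. the 4- and 8-point sizes and 1-D types without an AVX2 kernel here.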
+
+// Only handles transform blocks with w >= 16 and h >= 16.
+static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m256i buf1[64 * 16];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div16 = txfm_size_col >> 4;
+ const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4;
+ const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
+ __m256i buf0[64];
+ const int32_t *input_row = input + (i << 4) * input_stride;
+ for (int j = 0; j < buf_size_nonzero_w_div16; ++j) {
+ __m256i *buf0_cur = buf0 + j * 16;
+ const int32_t *input_cur = input_row + j * 16;
+ load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur,
+ 16);
+ transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_avx2(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+
+ __m256i *buf1_cur = buf1 + (i << 4);
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ __m256i temp[16];
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
+ int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
+ transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ __m256i *buf1_cur = buf1 + i * txfm_size_row;
+ col_txfm(buf1_cur, buf1_cur, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf1_cur, txfm_size_row, shift[1]);
+ }
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
+ stride, ud_flip, txfm_size_row);
+ }
+}
+
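+// Horizontal (row) identity transform for a strip 16 coefficients wide:
+// loads the 32-bit input as 16-bit, applies the identity gain for this width
+// (NewSqrt2list[txw_idx]) together with the row round shift, and pre-scales
+// by 1/Sqrt2 for 2:1 / 1:2 rectangular sizes.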
+static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
+ int stride, int shift, int height,
+ int txw_idx, int rect_type) {
+ const int32_t *input_row = input;
+ const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
+ const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
+ if (rect_type != 1 && rect_type != -1) {
+ for (int i = 0; i < height; ++i) {
+ const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+ input_row += stride;
+ __m256i lo = _mm256_unpacklo_epi16(src, one);
+ __m256i hi = _mm256_unpackhi_epi16(src, one);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm256_packs_epi32(lo, hi);
+ }
+ } else {
+ const __m256i rect_scale =
+ _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+ for (int i = 0; i < height; ++i) {
+ __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+ src = _mm256_mulhrs_epi16(src, rect_scale);
+ input_row += stride;
+ __m256i lo = _mm256_unpacklo_epi16(src, one);
+ __m256i hi = _mm256_unpackhi_epi16(src, one);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm256_packs_epi32(lo, hi);
+ }
+ }
+}
+
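+// Vertical (column) identity transform for a strip 16 pixels wide: applies
+// the identity gain for this height (NewSqrt2list[txh_idx]) and the column
+// round shift, then adds the result to the prediction in `output`.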
+static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
+ __m256i *buf, int shift, int height,
+ int txh_idx) {
+ const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
+ const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
+ for (int h = 0; h < height; ++h) {
+ __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
+ __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
+ lo = _mm256_madd_epi16(lo, scale_coeff);
+ hi = _mm256_madd_epi16(hi, scale_coeff);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
+ lo = _mm256_add_epi32(lo, shift__r);
+ hi = _mm256_add_epi32(hi, shift__r);
+ lo = _mm256_srai_epi32(lo, -shift);
+ hi = _mm256_srai_epi32(hi, -shift);
+ const __m256i x = _mm256_packs_epi32(lo, hi);
+ write_recon_w16_avx2(x, output);
+ output += stride;
+ }
+}
+
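+// IDTX: identity transform in both directions, so only the per-axis identity
+// scaling and round shifts are applied before adding to the prediction.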
+static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_SIZE tx_size,
+ int32_t eob) {
+ (void)eob;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ __m256i buf[32];
+ for (int i = 0; i < input_stride; i += 16) {
+ iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max,
+ txw_idx, rect_type);
+ iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max,
+ txh_idx);
+ }
+}
+
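+// Horizontal identity combined with a regular vertical 1D transform
+// (V_DCT / V_ADST / V_FLIPADST).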
+static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col);
+ const int input_stride = txfm_size_col_notzero;
+ const int buf_size_w_div16 = (eobx + 16) >> 4;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 col_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_w_div16; i++) {
+ __m256i buf0[64];
+ iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0],
+ eoby + 1, txw_idx, rect_type);
+ col_txfm(buf0, buf0, cos_bit_col);
+ __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
+ int k = ud_flip ? (txfm_size_row - 1) : 0;
+ const int step = ud_flip ? -1 : 1;
+ for (int j = 0; j < txfm_size_row; ++j, k += step) {
+ __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
+ write_recon_w16_avx2(res, output + (i << 4) + j * stride);
+ }
+ }
+}
+
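+// Regular horizontal 1D transform combined with a vertical identity
+// (H_DCT / H_ADST / H_FLIPADST).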
+static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m256i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div16 = txfm_size_col >> 4;
+ const int buf_size_h_div16 = (eoby + 16) >> 4;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_avx2 row_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+
+ assert(row_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div16; i++) {
+ __m256i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 16;
+ for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) {
+ __m256i *buf0_cur = buf0 + j * 16;
+ load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
+ buf0_cur, 16);
+ transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_avx2(buf0, buf0, input_stride); // rectangular size: scale by 1/Sqrt2
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+ __m256i *_buf1 = buf1;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ __m256i temp[16];
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
+ transpose_16bit_16x16_avx2(temp,
+ _buf1 + 16 * (buf_size_w_div16 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
+ }
+ }
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
+ buf1 + j * 16, shift[1], 16, txh_idx);
+ }
+ }
+}
+
+// for 16x16,32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64
+static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ case ADST_ADST: // ADST in both directions
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case IDTX:
+ lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
+ break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ default:
+ av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ }
+}
+
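+// Dispatch: sizes with a dimension smaller than 16 fall back to the SSSE3
+// implementation; all remaining sizes use the 16-wide AVX2 kernels above.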
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ switch (tx_size) {
+ case TX_4X4:
+ case TX_8X8:
+ case TX_4X8:
+ case TX_8X4:
+ case TX_8X16:
+ case TX_16X8:
+ case TX_4X16:
+ case TX_16X4:
+ case TX_8X32:
+ case TX_32X8:
+ av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_16X16:
+ case TX_32X32:
+ case TX_64X64:
+ case TX_16X32:
+ case TX_32X16:
+ case TX_32X64:
+ case TX_64X32:
+ case TX_16X64:
+ case TX_64X16:
+ default:
+ lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (!txfm_param->lossless) {
+ av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ } else {
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
new file mode 100644
index 000000000..f74cbaeaa
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Butterfly for the case where half of the inputs are zero.
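+// The weights are pre-scaled by 8 so that _mm256_mulhrs_epi16, which computes
+// round(a * b / 2^15), effectively applies the Q12 (INV_COS_BIT) cosine
+// constants with rounding.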
+#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \
+ { \
+ const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \
+ const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \
+ const __m256i _in = in; \
+ out0 = _mm256_mulhrs_epi16(_in, _w0); \
+ out1 = _mm256_mulhrs_epi16(_in, _w1); \
+ }
+
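+// Scales each vector by 1/Sqrt2 (NewInvSqrt2 is 1/Sqrt2 in Q12); used to
+// normalize rectangular transforms whose width/height ratio is 2 or 1/2.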
+static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
+ int size) {
+ const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8);
+ for (int i = 0; i < size; ++i) {
+ output[i] = _mm256_mulhrs_epi16(input[i], scale);
+ }
+}
+
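+// Adds a row of 16 16-bit residuals to the 16 8-bit prediction pixels at
+// `output` and stores the result with unsigned saturation.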
+static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
+ __m128i pred = _mm_loadu_si128((__m128i const *)(output));
+ __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res);
+ __m128i y = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168));
+ _mm_storeu_si128((__m128i *)(output), y);
+}
+
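+// Writes `height` rows of 16 reconstructed pixels, walking the source rows in
+// reverse when flipud is set.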
+static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
+ int stride, int flipud,
+ int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ write_recon_w16_avx2(in[j], output + i * stride);
+ }
+}
+
+void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
new file mode 100644
index 000000000..995bc3da4
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -0,0 +1,2923 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5, in Q12 fixed point (1 << NewSqrt2Bits)
+static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
+ 4 * 5793 };
+
+// TODO(binpengsmail@gmail.com): replace some for loops with do {} while loops
+
+static void idct4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+}
+
+void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 4
+ // stage 5
+ output[0] = x[0];
+ output[7] = x[0];
+ output[1] = x[1];
+ output[6] = x[1];
+ output[2] = x[1];
+ output[5] = x[1];
+ output[3] = x[0];
+ output[4] = x[0];
+}
+
+void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+}
+
+static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+}
+
+static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+}
+
+static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
+ btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
+ btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
+ btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
+ btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
+ btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
+ btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
+ btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
+}
+
+static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ output[0] = x[0];
+ output[15] = x[0];
+ output[1] = x[1];
+ output[14] = x[1];
+ output[2] = x[1];
+ output[13] = x[1];
+ output[3] = x[0];
+ output[12] = x[0];
+ output[4] = x[0];
+ output[11] = x[0];
+ output[5] = x[1];
+ output[10] = x[1];
+ output[6] = x[1];
+ output[9] = x[1];
+ output[7] = x[0];
+ output[8] = x[0];
+}
+
+static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[2] = input[4];
+ x[4] = input[2];
+ x[6] = input[6];
+ x[8] = input[1];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+}
+
+void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5~7
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+}
+
+void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+
+ // stage 6
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+
+ // stage 7
+ idct16_stage7_sse2(output, x);
+}
+
+static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+}
+
+static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+}
+
+static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ btf_16_adds_subs_sse2(x[16], x[19]);
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);
+ btf_16_subs_adds_sse2(x[30], x[29]);
+}
+
+static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+}
+
+static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[23]);
+ btf_16_adds_subs_sse2(x[17], x[22]);
+ btf_16_adds_subs_sse2(x[18], x[21]);
+ btf_16_adds_subs_sse2(x[19], x[20]);
+ btf_16_subs_adds_sse2(x[31], x[24]);
+ btf_16_subs_adds_sse2(x[30], x[25]);
+ btf_16_subs_adds_sse2(x[29], x[26]);
+ btf_16_subs_adds_sse2(x[28], x[27]);
+}
+
+static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[15]);
+ btf_16_adds_subs_sse2(x[1], x[14]);
+ btf_16_adds_subs_sse2(x[2], x[13]);
+ btf_16_adds_subs_sse2(x[3], x[12]);
+ btf_16_adds_subs_sse2(x[4], x[11]);
+ btf_16_adds_subs_sse2(x[5], x[10]);
+ btf_16_adds_subs_sse2(x[6], x[9]);
+ btf_16_adds_subs_sse2(x[7], x[8]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+}
+
+static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]);
+ btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]);
+ btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]);
+ btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]);
+ btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]);
+ btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]);
+ btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]);
+ btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]);
+ btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]);
+ btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]);
+ btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]);
+ btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]);
+ btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]);
+ btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]);
+ btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]);
+ btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]);
+}
+
+static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[2];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ output[0] = x[0];
+ output[31] = x[0];
+ output[1] = x[1];
+ output[30] = x[1];
+ output[2] = x[1];
+ output[29] = x[1];
+ output[3] = x[0];
+ output[28] = x[0];
+ output[4] = x[0];
+ output[27] = x[0];
+ output[5] = x[1];
+ output[26] = x[1];
+ output[6] = x[1];
+ output[25] = x[1];
+ output[7] = x[0];
+ output[24] = x[0];
+ output[8] = x[0];
+ output[23] = x[0];
+ output[9] = x[1];
+ output[22] = x[1];
+ output[10] = x[1];
+ output[21] = x[1];
+ output[11] = x[0];
+ output[20] = x[0];
+ output[12] = x[0];
+ output[19] = x[0];
+ output[13] = x[1];
+ output[18] = x[1];
+ output[14] = x[1];
+ output[17] = x[1];
+ output[15] = x[0];
+ output[16] = x[0];
+}
+
+static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[4] = input[4];
+ x[8] = input[2];
+ x[12] = input[6];
+ x[16] = input[1];
+ x[20] = input[5];
+ x[24] = input[3];
+ x[28] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+
+ // stage 4
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ x[3] = x[0];
+ x[2] = x[1];
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[2] = input[8];
+ x[4] = input[4];
+ x[6] = input[12];
+ x[8] = input[2];
+ x[10] = input[10];
+ x[12] = input[6];
+ x[14] = input[14];
+ x[16] = input[1];
+ x[18] = input[9];
+ x[20] = input[5];
+ x[22] = input[13];
+ x[24] = input[3];
+ x[26] = input[11];
+ x[28] = input[7];
+ x[30] = input[15];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
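+ // stage 6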
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static void idct32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+ x[1] = input[16];
+ x[2] = input[8];
+ x[3] = input[24];
+ x[4] = input[4];
+ x[5] = input[20];
+ x[6] = input[12];
+ x[7] = input[28];
+ x[8] = input[2];
+ x[9] = input[18];
+ x[10] = input[10];
+ x[11] = input[26];
+ x[12] = input[6];
+ x[13] = input[22];
+ x[14] = input[14];
+ x[15] = input[30];
+ x[16] = input[1];
+ x[17] = input[17];
+ x[18] = input[9];
+ x[19] = input[25];
+ x[20] = input[5];
+ x[21] = input[21];
+ x[22] = input[13];
+ x[23] = input[29];
+ x[24] = input[3];
+ x[25] = input[19];
+ x[26] = input[11];
+ x[27] = input[27];
+ x[28] = input[7];
+ x[29] = input[23];
+ x[30] = input[15];
+ x[31] = input[31];
+
+ // stage 2
+ btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
+ btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
+
+ // stage 3
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_adds_subs_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7~9
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+}
+
+static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_adds_subs_sse2(x[32], x[35]);
+ btf_16_adds_subs_sse2(x[33], x[34]);
+ btf_16_subs_adds_sse2(x[39], x[36]);
+ btf_16_subs_adds_sse2(x[38], x[37]);
+ btf_16_adds_subs_sse2(x[40], x[43]);
+ btf_16_adds_subs_sse2(x[41], x[42]);
+ btf_16_subs_adds_sse2(x[47], x[44]);
+ btf_16_subs_adds_sse2(x[46], x[45]);
+ btf_16_adds_subs_sse2(x[48], x[51]);
+ btf_16_adds_subs_sse2(x[49], x[50]);
+ btf_16_subs_adds_sse2(x[55], x[52]);
+ btf_16_subs_adds_sse2(x[54], x[53]);
+ btf_16_adds_subs_sse2(x[56], x[59]);
+ btf_16_adds_subs_sse2(x[57], x[58]);
+ btf_16_subs_adds_sse2(x[63], x[60]);
+ btf_16_subs_adds_sse2(x[62], x[61]);
+}
+
+static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+}
+
+static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ btf_16_adds_subs_sse2(x[16], x[19]);
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);
+ btf_16_subs_adds_sse2(x[30], x[29]);
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+}
+
+static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+ btf_16_adds_subs_sse2(x[32], x[39]);
+ btf_16_adds_subs_sse2(x[33], x[38]);
+ btf_16_adds_subs_sse2(x[34], x[37]);
+ btf_16_adds_subs_sse2(x[35], x[36]);
+ btf_16_subs_adds_sse2(x[47], x[40]);
+ btf_16_subs_adds_sse2(x[46], x[41]);
+ btf_16_subs_adds_sse2(x[45], x[42]);
+ btf_16_subs_adds_sse2(x[44], x[43]);
+ btf_16_adds_subs_sse2(x[48], x[55]);
+ btf_16_adds_subs_sse2(x[49], x[54]);
+ btf_16_adds_subs_sse2(x[50], x[53]);
+ btf_16_adds_subs_sse2(x[51], x[52]);
+ btf_16_subs_adds_sse2(x[63], x[56]);
+ btf_16_subs_adds_sse2(x[62], x[57]);
+ btf_16_subs_adds_sse2(x[61], x[58]);
+ btf_16_subs_adds_sse2(x[60], x[59]);
+}
+
+static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_adds_subs_sse2(x[16], x[23]);
+ btf_16_adds_subs_sse2(x[17], x[22]);
+ btf_16_adds_subs_sse2(x[18], x[21]);
+ btf_16_adds_subs_sse2(x[19], x[20]);
+ btf_16_subs_adds_sse2(x[31], x[24]);
+ btf_16_subs_adds_sse2(x[30], x[25]);
+ btf_16_subs_adds_sse2(x[29], x[26]);
+ btf_16_subs_adds_sse2(x[28], x[27]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+}
+
+static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[15]);
+ btf_16_adds_subs_sse2(x[1], x[14]);
+ btf_16_adds_subs_sse2(x[2], x[13]);
+ btf_16_adds_subs_sse2(x[3], x[12]);
+ btf_16_adds_subs_sse2(x[4], x[11]);
+ btf_16_adds_subs_sse2(x[5], x[10]);
+ btf_16_adds_subs_sse2(x[6], x[9]);
+ btf_16_adds_subs_sse2(x[7], x[8]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[47]);
+ btf_16_adds_subs_sse2(x[33], x[46]);
+ btf_16_adds_subs_sse2(x[34], x[45]);
+ btf_16_adds_subs_sse2(x[35], x[44]);
+ btf_16_adds_subs_sse2(x[36], x[43]);
+ btf_16_adds_subs_sse2(x[37], x[42]);
+ btf_16_adds_subs_sse2(x[38], x[41]);
+ btf_16_adds_subs_sse2(x[39], x[40]);
+ btf_16_subs_adds_sse2(x[63], x[48]);
+ btf_16_subs_adds_sse2(x[62], x[49]);
+ btf_16_subs_adds_sse2(x[61], x[50]);
+ btf_16_subs_adds_sse2(x[60], x[51]);
+ btf_16_subs_adds_sse2(x[59], x[52]);
+ btf_16_subs_adds_sse2(x[58], x[53]);
+ btf_16_subs_adds_sse2(x[57], x[54]);
+ btf_16_subs_adds_sse2(x[56], x[55]);
+}
+
+static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[31]);
+ btf_16_adds_subs_sse2(x[1], x[30]);
+ btf_16_adds_subs_sse2(x[2], x[29]);
+ btf_16_adds_subs_sse2(x[3], x[28]);
+ btf_16_adds_subs_sse2(x[4], x[27]);
+ btf_16_adds_subs_sse2(x[5], x[26]);
+ btf_16_adds_subs_sse2(x[6], x[25]);
+ btf_16_adds_subs_sse2(x[7], x[24]);
+ btf_16_adds_subs_sse2(x[8], x[23]);
+ btf_16_adds_subs_sse2(x[9], x[22]);
+ btf_16_adds_subs_sse2(x[10], x[21]);
+ btf_16_adds_subs_sse2(x[11], x[20]);
+ btf_16_adds_subs_sse2(x[12], x[19]);
+ btf_16_adds_subs_sse2(x[13], x[18]);
+ btf_16_adds_subs_sse2(x[14], x[17]);
+ btf_16_adds_subs_sse2(x[15], x[16]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+}
+
+static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]);
+ btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]);
+ btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]);
+ btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]);
+ btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]);
+ btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]);
+ btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]);
+ btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]);
+ btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]);
+ btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]);
+ btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]);
+ btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]);
+ btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]);
+ btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]);
+ btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]);
+ btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]);
+ btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]);
+ btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]);
+ btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]);
+ btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]);
+ btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]);
+ btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]);
+ btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]);
+ btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]);
+ btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]);
+ btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]);
+ btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]);
+ btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]);
+ btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]);
+ btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]);
+ btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]);
+ btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]);
+}
+
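+// DC-only path: only input[0] is nonzero. The equal-weight cospi[32]
+// butterfly leaves x[0] and x[1] holding the same value, so all 64 outputs
+// reduce to the scaled DC term.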
+static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // stage 1
+ __m128i x[32];
+ x[0] = input[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // stage 7
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ output[0] = x[0];
+ output[63] = x[0];
+ output[1] = x[1];
+ output[62] = x[1];
+ output[2] = x[1];
+ output[61] = x[1];
+ output[3] = x[0];
+ output[60] = x[0];
+ output[4] = x[0];
+ output[59] = x[0];
+ output[5] = x[1];
+ output[58] = x[1];
+ output[6] = x[1];
+ output[57] = x[1];
+ output[7] = x[0];
+ output[56] = x[0];
+ output[8] = x[0];
+ output[55] = x[0];
+ output[9] = x[1];
+ output[54] = x[1];
+ output[10] = x[1];
+ output[53] = x[1];
+ output[11] = x[0];
+ output[52] = x[0];
+ output[12] = x[0];
+ output[51] = x[0];
+ output[13] = x[1];
+ output[50] = x[1];
+ output[14] = x[1];
+ output[49] = x[1];
+ output[15] = x[0];
+ output[48] = x[0];
+ output[16] = x[0];
+ output[47] = x[0];
+ output[17] = x[1];
+ output[46] = x[1];
+ output[18] = x[1];
+ output[45] = x[1];
+ output[19] = x[0];
+ output[44] = x[0];
+ output[20] = x[0];
+ output[43] = x[0];
+ output[21] = x[1];
+ output[42] = x[1];
+ output[22] = x[1];
+ output[41] = x[1];
+ output[23] = x[0];
+ output[40] = x[0];
+ output[24] = x[0];
+ output[39] = x[0];
+ output[25] = x[1];
+ output[38] = x[1];
+ output[26] = x[1];
+ output[37] = x[1];
+ output[27] = x[0];
+ output[36] = x[0];
+ output[28] = x[0];
+ output[35] = x[0];
+ output[29] = x[1];
+ output[34] = x[1];
+ output[30] = x[1];
+ output[33] = x[1];
+ output[31] = x[0];
+ output[32] = x[0];
+}
+
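+// eob within the top-left 8x8: only the first 8 one-dimensional input
+// coefficients can be nonzero, so stages that would only mix known zeros are
+// reduced to plain copies. idct64_low16/low32 below follow the same scheme
+// for 16 and 32 nonzero coefficients.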
+static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+
+ // stage 5
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1
+ __m128i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[33]);
+ btf_16_subs_adds_sse2(x[35], x[34]);
+ btf_16_adds_subs_sse2(x[36], x[37]);
+ btf_16_subs_adds_sse2(x[39], x[38]);
+ btf_16_adds_subs_sse2(x[40], x[41]);
+ btf_16_subs_adds_sse2(x[43], x[42]);
+ btf_16_adds_subs_sse2(x[44], x[45]);
+ btf_16_subs_adds_sse2(x[47], x[46]);
+ btf_16_adds_subs_sse2(x[48], x[49]);
+ btf_16_subs_adds_sse2(x[51], x[50]);
+ btf_16_adds_subs_sse2(x[52], x[53]);
+ btf_16_subs_adds_sse2(x[55], x[54]);
+ btf_16_adds_subs_sse2(x[56], x[57]);
+ btf_16_subs_adds_sse2(x[59], x[58]);
+ btf_16_adds_subs_sse2(x[60], x[61]);
+ btf_16_subs_adds_sse2(x[63], x[62]);
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 9~11
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
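+// 4-point inverse ADST. With x0..x3 the input rows, the outputs computed
+// below are:
+//   out0 = x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+//   out1 = x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+//   out2 = sin3 * (x0 - x2 + x3)
+//   out3 = x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1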
+void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+ const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+ const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+ const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+ const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+ const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+ const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
+ const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
+ __m128i x0[4];
+ x0[0] = input[0];
+ x0[1] = input[1];
+ x0[2] = input[2];
+ x0[3] = input[3];
+
+ __m128i u[4];
+ u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+ u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
+ u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
+ u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
+
+ __m128i x1[16];
+ x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
+ x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
+ x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
+ x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
+ x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2
+ x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
+ x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4
+ x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
+ x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
+ x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
+  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x3*sin3
+ x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
+ x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
+ x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02);
+ x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1
+ x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01);
+
+ __m128i x2[8];
+ x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2
+ x2[1] = _mm_add_epi32(x1[1], x1[5]);
+ x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4
+ x2[3] = _mm_add_epi32(x1[3], x1[7]);
+ x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3
+ x2[5] = _mm_add_epi32(x1[9], x1[11]);
+  x2[6] = _mm_add_epi32(x1[12], x1[14]);  // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1
+ x2[7] = _mm_add_epi32(x1[13], x1[15]);
+
+ const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ for (int i = 0; i < 4; ++i) {
+ __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
+ __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
+ out0 = _mm_srai_epi32(out0, INV_COS_BIT);
+ out1 = _mm_srai_epi32(out1, INV_COS_BIT);
+ output[i] = _mm_packs_epi32(out0, out1);
+ }
+}
+
+// TODO(binpengsmail@gmail.com):
+// Explore reusing the corresponding VP9 SSE2 functions and evaluate whether
+// they offer any further speedup.
+void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+ const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+ const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+ const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+ const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+ const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+ const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]);
+ const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]);
+ __m128i x0[4];
+ x0[0] = input[0];
+ x0[1] = input[1];
+ x0[2] = input[2];
+ x0[3] = input[3];
+
+ __m128i u[2];
+ u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+ u[1] = _mm_unpacklo_epi16(x0[1], x0[3]);
+
+ __m128i x1[8];
+ x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
+ x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
+ x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2
+ x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4
+ x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
+  x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03);   // x3*sin3
+ x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2
+ x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1
+
+ __m128i x2[4];
+ x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+ x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+ x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3
+ x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1
+
+ const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ for (int i = 0; i < 4; ++i) {
+ __m128i out0 = _mm_add_epi32(x2[i], rounding);
+ out0 = _mm_srai_epi32(out0, INV_COS_BIT);
+ output[i] = _mm_packs_epi32(out0, out0);
+ }
+}
+
+static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[1] = input[0];
+
+ // stage 2
+ btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
+
+ // stage 3
+ x[4] = x[0];
+ x[5] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+
+ // stage 5
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[7];
+ x[1] = input[0];
+ x[2] = input[5];
+ x[3] = input[2];
+ x[4] = input[3];
+ x[5] = input[4];
+ x[6] = input[1];
+ x[7] = input[6];
+
+ // stage 2
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+ // stage 3
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+
+ // stage 4
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[8];
+ x[0] = input[7];
+ x[1] = input[0];
+ x[2] = input[5];
+ x[3] = input[2];
+ x[4] = input[3];
+ x[5] = input[4];
+ x[6] = input[1];
+ x[7] = input[6];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+ // stage 3
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+ // stage 5
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+
+ // stage 6
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
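+// Stage helpers shared by the iadst16 variants. The add/sub and output
+// reordering stages are also reused by the 4-pixel-wide path; the cosine
+// butterfly stages here use the 8-pixel btf_16_sse2() macro, so
+// iadst16_w4_new_sse2 redoes those with btf_16_4p_sse2().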
+static INLINE void iadst16_stage3_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[8]);
+ btf_16_adds_subs_sse2(x[1], x[9]);
+ btf_16_adds_subs_sse2(x[2], x[10]);
+ btf_16_adds_subs_sse2(x[3], x[11]);
+ btf_16_adds_subs_sse2(x[4], x[12]);
+ btf_16_adds_subs_sse2(x[5], x[13]);
+ btf_16_adds_subs_sse2(x[6], x[14]);
+ btf_16_adds_subs_sse2(x[7], x[15]);
+}
+
+static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage5_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[4]);
+ btf_16_adds_subs_sse2(x[1], x[5]);
+ btf_16_adds_subs_sse2(x[2], x[6]);
+ btf_16_adds_subs_sse2(x[3], x[7]);
+ btf_16_adds_subs_sse2(x[8], x[12]);
+ btf_16_adds_subs_sse2(x[9], x[13]);
+ btf_16_adds_subs_sse2(x[10], x[14]);
+ btf_16_adds_subs_sse2(x[11], x[15]);
+}
+
+static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage7_ssse3(__m128i *x) {
+ btf_16_adds_subs_sse2(x[0], x[2]);
+ btf_16_adds_subs_sse2(x[1], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[6]);
+ btf_16_adds_subs_sse2(x[5], x[7]);
+ btf_16_adds_subs_sse2(x[8], x[10]);
+ btf_16_adds_subs_sse2(x[9], x[11]);
+ btf_16_adds_subs_sse2(x[12], x[14]);
+ btf_16_adds_subs_sse2(x[13], x[15]);
+}
+
+static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+}
+
+static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
+ const __m128i __zero = _mm_setzero_si128();
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[8]);
+ output[2] = x[12];
+ output[3] = _mm_subs_epi16(__zero, x[4]);
+ output[4] = x[6];
+ output[5] = _mm_subs_epi16(__zero, x[14]);
+ output[6] = x[10];
+ output[7] = _mm_subs_epi16(__zero, x[2]);
+ output[8] = x[3];
+ output[9] = _mm_subs_epi16(__zero, x[11]);
+ output[10] = x[15];
+ output[11] = _mm_subs_epi16(__zero, x[7]);
+ output[12] = x[5];
+ output[13] = _mm_subs_epi16(__zero, x[13]);
+ output[14] = x[9];
+ output[15] = _mm_subs_epi16(__zero, x[1]);
+}
+
+static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+
+ // stage 1
+ __m128i x[16];
+ x[1] = input[0];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+
+ // stage 3
+ x[8] = x[0];
+ x[9] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+
+ // stage 5
+ x[4] = x[0];
+ x[5] = x[1];
+ x[12] = x[8];
+ x[13] = x[9];
+
+ // stage 6
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+
+ // stage 7
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+ x[10] = x[8];
+ x[11] = x[9];
+ x[14] = x[12];
+ x[15] = x[13];
+
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+
+static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // stage 1
+ __m128i x[16];
+ x[1] = input[0];
+ x[3] = input[2];
+ x[5] = input[4];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[1];
+
+ // stage 2
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+ btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
+ btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
+ btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
+ btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
+ btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
+ btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
+ btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
+
+ // stage 3
+ iadst16_stage3_ssse3(x);
+ iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage5_ssse3(x);
+ iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage7_ssse3(x);
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3~9
+ iadst16_stage3_ssse3(x);
+ iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage5_ssse3(x);
+ iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage7_ssse3(x);
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+
+void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2
+ btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3
+ iadst16_stage3_ssse3(x);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+
+ // stage 5
+ iadst16_stage5_ssse3(x);
+
+ // stage 6
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+
+ // stage 7
+ iadst16_stage7_ssse3(x);
+
+ // stage 8
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+
+ // stage 9
+ iadst16_stage9_ssse3(output, x);
+}
+
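+// Identity (IDTX) 1-D transforms. The per-dimension gains are sqrt(2), 2 and
+// 2*sqrt(2) for sizes 4, 8 and 16; the fractional part of the gain is applied
+// with _mm_mulhrs_epi16 against a Q15 constant and the integer part with
+// saturating adds.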
+static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits));
+ const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+ for (int i = 0; i < 4; ++i) {
+ __m128i x = _mm_mulhrs_epi16(input[i], scale);
+ output[i] = _mm_adds_epi16(x, input[i]);
+ }
+}
+
+static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 8; ++i) {
+ output[i] = _mm_adds_epi16(input[i], input[i]);
+ }
+}
+
+static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
+ const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+ for (int i = 0; i < 16; ++i) {
+ __m128i x = _mm_mulhrs_epi16(input[i], scale);
+ __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
+ output[i] = _mm_adds_epi16(x, srcx2);
+ }
+}
+
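+// Reconstruction helpers: widen the 8-bit prediction to 16 bits, add the
+// residual with signed saturation, then pack back with unsigned saturation
+// so the result is clamped to [0, 255].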
+static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
+ __m128i res) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
+ return _mm_packus_epi16(x0, x0);
+}
+
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i, j += step) {
+ const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+ __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
+ u = _mm_packus_epi16(u, zero);
+ *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
+ }
+}
+
+static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
+ const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
+ _mm_storel_epi64((__m128i *)(output + i * stride), u);
+ }
+}
+
+// 1D functions that process 8 pixels at a time.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
+ { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
+ { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
+ { idct32_new_sse2, NULL, NULL },
+ { idct64_low32_new_ssse3, NULL, NULL },
+ };
+
+// Functions for blocks whose eob falls at DC or within the top-left 8x8,
+// 16x16 or 32x32 corner; the last array index selects the variant that
+// matches the eob class.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
+ { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
+ { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
+ },
+ { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
+ { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
+ { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
+ {
+ { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
+ NULL },
+ { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
+ NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
+ idct32_new_sse2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
+ idct64_low32_new_ssse3 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// 1D functions that process 4 pixels at a time; used for the 4x4, 4x8,
+// 4x16, 8x4 and 16x4 block sizes.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
+ { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
+ { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
+ { NULL, NULL, NULL },
+ { NULL, NULL, NULL },
+ };
+
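+// Row-pass identity: interleaving each source vector with a vector of ones
+// lets a single _mm_madd_epi16 apply the NewSqrt2 scale and add the rounding
+// constant in one step before the arithmetic shift. Blocks whose sides differ
+// by a factor of two (rect_type +/-1) are additionally pre-scaled by 1/sqrt(2).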
+static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
+ int stride, int shift, int height,
+ int txw_idx, int rect_type) {
+ const int32_t *input_row = input;
+ const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
+ const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
+ if (rect_type != 1 && rect_type != -1) {
+ for (int i = 0; i < height; ++i) {
+ const __m128i src = load_32bit_to_16bit(input_row);
+ input_row += stride;
+ __m128i lo = _mm_unpacklo_epi16(src, one);
+ __m128i hi = _mm_unpackhi_epi16(src, one);
+ lo = _mm_madd_epi16(lo, scale_rounding);
+ hi = _mm_madd_epi16(hi, scale_rounding);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm_packs_epi32(lo, hi);
+ }
+ } else {
+ const __m128i rect_scale =
+ _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+ for (int i = 0; i < height; ++i) {
+ __m128i src = load_32bit_to_16bit(input_row);
+ src = _mm_mulhrs_epi16(src, rect_scale);
+ input_row += stride;
+ __m128i lo = _mm_unpacklo_epi16(src, one);
+ __m128i hi = _mm_unpackhi_epi16(src, one);
+ lo = _mm_madd_epi16(lo, scale_rounding);
+ hi = _mm_madd_epi16(hi, scale_rounding);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm_packs_epi32(lo, hi);
+ }
+ }
+}
+
+static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
+ __m128i *buf, int shift, int height,
+ int txh_idx) {
+ const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
+ const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
+ const __m128i zero = _mm_setzero_si128();
+ for (int h = 0; h < height; ++h) {
+ __m128i lo = _mm_unpacklo_epi16(buf[h], one);
+ __m128i hi = _mm_unpackhi_epi16(buf[h], one);
+ lo = _mm_madd_epi16(lo, scale_coeff);
+ hi = _mm_madd_epi16(hi, scale_coeff);
+ lo = _mm_srai_epi32(lo, NewSqrt2Bits);
+ hi = _mm_srai_epi32(hi, NewSqrt2Bits);
+ lo = _mm_add_epi32(lo, shift_rounding);
+ hi = _mm_add_epi32(hi, shift_rounding);
+ lo = _mm_srai_epi32(lo, -shift);
+ hi = _mm_srai_epi32(hi, -shift);
+ __m128i x = _mm_packs_epi32(lo, hi);
+
+ const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
+ x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero));
+ const __m128i u = _mm_packus_epi16(x, x);
+ _mm_storel_epi64((__m128i *)(output), u);
+ output += stride;
+ }
+}
+
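+// Fully-identity (IDTX) 2-D path: each 8-column strip is scaled by the row
+// and column identity gains and added straight to the prediction, with no
+// butterfly stages involved.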
+static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_SIZE tx_size) {
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ __m128i buf[32];
+
+ for (int i = 0; i < (input_stride >> 3); ++i) {
+ iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max,
+ txw_idx, rect_type);
+ iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max,
+ txh_idx);
+ }
+}
+
+void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[4];
+ const TX_SIZE tx_size = TX_4X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_4x4(buf, buf);
+ row_txfm(buf, buf, cos_bit_row);
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x4(temp, buf);
+ } else {
+ transpose_16bit_4x4(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
+ __m128i res0, __m128i res1) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x0 = _mm_unpacklo_epi8(pred, zero);
+ __m128i x1 = _mm_unpackhi_epi8(pred, zero);
+ x0 = _mm_adds_epi16(res0, x0);
+ x1 = _mm_adds_epi16(res1, x1);
+ return _mm_packus_epi16(x0, x1);
+}
+
+static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]);
+ _mm_storeu_si128((__m128i *)(output + i * stride), u);
+ }
+}
+
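+// Multiplies each vector by 1/sqrt(2): NewInvSqrt2 is a Q12 constant and the
+// *8 promotes it to the Q15 domain expected by _mm_mulhrs_epi16. Used to
+// normalize rectangular (2:1 / 1:2) transform sizes.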
+static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
+ int size) {
+ const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8);
+ for (int i = 0; i < size; ++i) {
+ output[i] = _mm_mulhrs_epi16(input[i], scale);
+ }
+}
+
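+// Generic 2-D path: run the row transform over the top-left region that can
+// contain nonzero coefficients (derived from eob), transpose 8x8 tiles into
+// column order, run the column transform, then add the residual to the
+// prediction 8 or 16 pixels at a time.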
+static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[64 * 8];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ __m128i *buf0_cur = buf0 + j * 8;
+ load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
+ transpose_16bit_8x8(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ __m128i *_buf1 = buf1 + i * 8;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp,
+ _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
+ round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+ }
+
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
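+// Horizontal-identity path: the row pass is just the identity scaling above,
+// while the column pass uses a real 1-D transform picked by eoby. The
+// *_v_identity variant below is the mirror image (real rows, identity
+// columns).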
+static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = (eobx + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+ assert(fun_idx < 5);
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+
+ assert(col_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ __m128i buf0[64];
+ iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
+ eoby + 1, txw_idx, rect_type);
+ col_txfm(buf0, buf0, cos_bit_col);
+ __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
+ int k = ud_flip ? (txfm_size_row - 1) : 0;
+ const int step = ud_flip ? -1 : 1;
+ uint8_t *out = output + 8 * i;
+ for (int j = 0; j < txfm_size_row; ++j, k += step) {
+ const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
+ __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
+ const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
+ _mm_storel_epi64((__m128i *)(out), u);
+ out += stride;
+ }
+ }
+}
+
+static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div8; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
+ __m128i *buf0_cur = buf0 + j * 8;
+ load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
+ transpose_16bit_8x8(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ __m128i *_buf1 = buf1;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
+ }
+ }
+
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
+ buf1 + j * 8, shift[1], 8, txh_idx);
+ }
+ }
+}
+
+// for 32x32, 32x64, 64x32, 64x64, 32x8, 8x32, 16x32, 32x16, 64x16, 16x64
+static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_type) {
+ case DCT_DCT:
+ lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case IDTX:
+ lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
+ break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ default:
+ lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+
+void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_4X8;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_4x8(buf, buf);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf, cos_bit_row);
+  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);  // shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf);
+ } else {
+ transpose_16bit_8x4(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_8X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_8x4(buf, buf);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf, cos_bit_row);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x8(temp, buf);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_4X16;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ const int row_one_loop = 8;
+ for (int i = 0; i < 2; ++i) {
+ const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
+ row_one_loop);
+ transpose_16bit_4x8(buf_cur, buf_cur);
+ row_txfm(buf_cur, buf_cur, cos_bit_row);
+ round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf_cur, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf_cur);
+ } else {
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_16X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int row_one_loop = 8;
+ for (int i = 0; i < buf_size_w_div8; ++i) {
+ const int32_t *input_cur = input + i * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
+ txfm_size_row);
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+ if (lr_flip) {
+ __m128i temp[16];
+ flip_buf_sse2(buf, temp, 16);
+ transpose_16bit_4x8(temp, buf);
+ transpose_16bit_4x8(temp + 8, buf + 8);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
+ round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
+ }
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
+ lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
+}
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ switch (tx_size) {
+ case TX_4X4:
+ lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_4X8:
+ lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_8X4:
+ lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_4X16:
+ lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ case TX_16X4:
+ lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
+ eob);
+ break;
+ default:
+ lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type,
+ tx_size, eob);
+ break;
+ }
+}
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+ const TxfmParam *txfm_param) {
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (!txfm_param->lossless) {
+ av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ } else {
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
new file mode 100644
index 000000000..66bd339d1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h> // SSSE3
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define btf_16_ssse3(w0, w1, in, out0, out1) \
+ do { \
+ const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
+ const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
+ const __m128i _in = in; \
+ out0 = _mm_mulhrs_epi16(_in, _w0); \
+ out1 = _mm_mulhrs_epi16(_in, _w1); \
+ } while (0)
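
btf_16_ssse3 relies on _mm_mulhrs_epi16 computing (a * b + (1 << 14)) >> 15 per lane, so pre-scaling a Q12 cosine weight by 8 turns the multiply into a rounded shift by cos_bit == 12. A minimal scalar sketch of one lane under that assumption; the helper name and the example weight are illustrative, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one lane of btf_16_ssse3:
 * mulhrs(in, w * 8) == (in * w * 8 + (1 << 14)) >> 15, a rounded (in * w) >> 12. */
static int16_t btf_16_lane(int16_t in, int16_t w) {
  const int32_t prod = (int32_t)in * (w * 8);
  return (int16_t)((prod + (1 << 14)) >> 15);
}

int main(void) {
  /* Example Q12 weight: round(cos(pi/4) * 4096) == 2896. */
  printf("%d\n", btf_16_lane(100, 2896)); /* prints 71, i.e. round(100 * 2896 / 4096) */
  return 0;
}
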
+
+#define btf_16_adds_subs_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+#define btf_16_subs_adds_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ } while (0)
+
+#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ out0 = _mm_adds_epi16(_in0, _in1); \
+ out1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+ if (bit < 0) {
+ const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_mulhrs_epi16(in[i], scale);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_slli_epi16(in[i], bit);
+ }
+ }
+}
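
round_shift_16bit_ssse3 handles bit < 0 with the same mulhrs trick: multiplying by 1 << (15 + bit) and letting the instruction add (1 << 14) before shifting right by 15 is a rounded right shift by -bit. A scalar sketch of one lane; the helper name is illustrative.

#include <assert.h>
#include <stdint.h>

/* Scalar model of one lane of round_shift_16bit_ssse3 for bit < 0:
 * mulhrs(x, 1 << (15 + bit)) acts as a rounded right shift by -bit. */
static int16_t round_shift_lane(int16_t x, int bit) {
  const int32_t p = (int32_t)x * (1 << (15 + bit));
  return (int16_t)((p + (1 << 14)) >> 15);
}

int main(void) {
  assert(round_shift_lane(37, -2) == (37 + 2) >> 2); /* 9 */
  assert(round_shift_lane(64, -4) == 4);
  return 0;
}
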
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// Transform block width in log2 for eob (a size of 64 maps to 32)
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ if (eob == 1) {
+ *eobx = 0;
+ *eoby = 0;
+ return;
+ }
+
+ const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
+ const int eob_row = (eob - 1) >> tx_w_log2;
+ const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
+ *eobx = eobxy & 0xFF;
+ *eoby = eobxy >> 8;
+}
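
Each 16-bit table entry packs the two bounds as (eoby << 8) | eobx, which is exactly what get_eobx_eoby_scan_default unpacks. A small standalone check using values from the tables above:

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Entry used for eob rows 2..15 of av1_eob_to_eobxy_16x16_default. */
  const int16_t eobxy = 0x0f0f;
  const int eobx = eobxy & 0xFF; /* bound on the last nonzero column */
  const int eoby = eobxy >> 8;   /* bound on the last nonzero row */
  assert(eobx == 15 && eoby == 15);

  /* First entry (eob_row == 0) keeps both bounds inside an 8x8 corner. */
  assert((0x0707 & 0xFF) == 7 && (0x0707 >> 8) == 7);
  return 0;
}
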
+
+static int eob_fill[32] = {
+ 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
+ *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
+ const int temp_eoby = eob / (eobx_max + 1);
+ assert(temp_eoby < 32);
+ *eoby = eob_fill[temp_eoby];
+}
+
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+ TX_SIZE tx_size, int eob) {
+ eob -= 1;
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
+ *eobx = eob / (eoby_max + 1);
+ *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
+}
+
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
new file mode 100644
index 000000000..77aeb6eb1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void btf_16_w4_sse2(
+ const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
+ const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
+ __m128i *const out0, __m128i *const out1) {
+ const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
+ const __m128i u0 = _mm_madd_epi16(t0, *w0);
+ const __m128i v0 = _mm_madd_epi16(t0, *w1);
+ const __m128i a0 = _mm_add_epi32(u0, __rounding);
+ const __m128i b0 = _mm_add_epi32(v0, __rounding);
+ const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
+ const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
+
+ *out0 = _mm_packs_epi32(c0, c0);
+ *out1 = _mm_packs_epi32(d0, c0);
+}
+
+#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
+ { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ \
+ out0 = _mm_packs_epi32(c0, c0); \
+ out1 = _mm_packs_epi32(d0, d0); \
+ }
+
+#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
+ { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i t1 = _mm_unpackhi_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i u1 = _mm_madd_epi16(t1, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ __m128i v1 = _mm_madd_epi16(t1, w1); \
+ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i a1 = _mm_add_epi32(u1, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ __m128i b1 = _mm_add_epi32(v1, __rounding); \
+ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i c1 = _mm_srai_epi32(a1, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ __m128i d1 = _mm_srai_epi32(b1, cos_bit); \
+ \
+ out0 = _mm_packs_epi32(c0, c1); \
+ out1 = _mm_packs_epi32(d0, d1); \
+ }
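
With w0 = pair_set_epi16(a, b) and w1 = pair_set_epi16(c, d), the madd/srai/packs sequence above is a rounded fixed-point 2x2 butterfly per coefficient pair. A scalar sketch of one pair, assuming the usual rounding constant 1 << (cos_bit - 1); the weights in main() are only an example, not the patch's tables.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of one coefficient pair of btf_16_sse2:
 * out0 = (in0 * a + in1 * b + rounding) >> cos_bit
 * out1 = (in0 * c + in1 * d + rounding) >> cos_bit
 * (arithmetic shift of the rounded sum, as _mm_srai_epi32 does). */
static void btf_16_scalar(int16_t in0, int16_t in1, int a, int b, int c, int d,
                          int cos_bit, int16_t *out0, int16_t *out1) {
  const int32_t rounding = 1 << (cos_bit - 1);
  *out0 = (int16_t)((in0 * a + in1 * b + rounding) >> cos_bit);
  *out1 = (int16_t)((in0 * c + in1 * d + rounding) >> cos_bit);
}

int main(void) {
  int16_t y0, y1;
  /* Q12 rotation by 45 degrees: a = b = d = round(cos(pi/4) * 4096), c = -a. */
  btf_16_scalar(100, 40, 2896, 2896, -2896, 2896, 12, &y0, &y1);
  printf("%d %d\n", y0, y1); /* prints 99 -42 */
  return 0;
}
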
+
+static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
+ return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
+static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, a_low);
+}
+
+// Store 4 16-bit values. Sign extend the values.
+static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a);
+ const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
+ _mm_store_si128((__m128i *)b, a_1);
+}
+
+// Store 8 16-bit values. Sign extend the values.
+static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, a);
+ const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
+ const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
+ _mm_store_si128((__m128i *)b, a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+}
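
The unpack-then-srai pattern in store_16bit_to_32bit* is plain sign extension: duplicating the 16-bit lane into a 32-bit word and arithmetic-shifting right by 16 recovers the signed value. A scalar spot check (two's-complement arithmetic shift assumed, as the SIMD code also assumes):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Scalar model of one lane of store_16bit_to_32bit: unpack(a, a) places the
 * 16-bit value in both halves of a 32-bit word; srai by 16 sign extends. */
static int32_t widen_lane(int16_t v) {
  const uint32_t dup = ((uint32_t)(uint16_t)v << 16) | (uint16_t)v;
  int32_t lane;
  memcpy(&lane, &dup, sizeof(lane)); /* reinterpret, like a SIMD lane */
  return lane >> 16;                 /* arithmetic shift, as _mm_srai_epi32 */
}

int main(void) {
  const int16_t vals[3] = { 1234, -1234, -1 };
  for (int i = 0; i < 3; ++i) assert(widen_lane(vals[i]) == vals[i]);
  return 0;
}
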
+
+static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
+ const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
+ const __m128i b = _mm_madd_epi16(a, scale_rounding);
+ return _mm_srai_epi32(b, NewSqrt2Bits);
+}
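
store_rect_16bit_to_32bit* interleave each coefficient with a constant 1 so that the single madd in scale_round_sse2 yields x * scale + (1 << (NewSqrt2Bits - 1)) per lane, i.e. a rounded Q12 multiply by sqrt(2) for the rectangular-transform correction. A scalar sketch, assuming NewSqrt2 == 5793 and NewSqrt2Bits == 12 as in av1/common/av1_txfm.h:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of scale_round_sse2: rounded fixed-point multiply,
 * used here for the rectangular-transform sqrt(2) correction. */
static int32_t scale_round_scalar(int16_t x, int32_t scale, int bits) {
  return ((int32_t)x * scale + (1 << (bits - 1))) >> bits;
}

int main(void) {
  const int32_t kNewSqrt2 = 5793; /* assumed: round(sqrt(2) * 2^12) */
  printf("%d\n", scale_round_scalar(1000, kNewSqrt2, 12)); /* prints 1414 */
  return 0;
}
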
+
+static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
+ int32_t *const b) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ _mm_store_si128((__m128i *)b, b_lo);
+}
+
+static INLINE void store_rect_16bit_to_32bit(const __m128i a,
+ int32_t *const b) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, one);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+ _mm_store_si128((__m128i *)b, b_lo);
+ _mm_store_si128((__m128i *)(b + 4), b_hi);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+ const int stride,
+ __m128i *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+ const int stride,
+ __m128i *const out,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_16bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+ int stride, __m128i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w4(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
+ int stride, __m128i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_16bit_to_32bit_w4(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_16bit_to_32bit(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
+ int32_t *const out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit(in[i], out + i * stride);
+ }
+}
+
+static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
+ uint16_t *out,
+ const int stride) {
+ for (int i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)(out + i * stride), in[i]);
+ }
+}
+
+static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
+ if (bit < 0) {
+ bit = -bit;
+ __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_adds_epi16(in[i], rounding);
+ in[i] = _mm_srai_epi16(in[i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+typedef struct {
+ transform_1d_sse2 col, row; // vertical and horizontal
+} transform_2d_sse2;
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
new file mode 100644
index 000000000..90b9879cc
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) {
+ __m128i *const vec = (__m128i *)arr;
+ const int vec_size = size >> 2;
+ av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit);
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
new file mode 100644
index 000000000..6cad821b1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+
+#include <smmintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
+ __m128i tmp, round;
+ round = _mm_set1_epi32(1 << (bit - 1));
+ tmp = _mm_add_epi32(vec, round);
+ return _mm_srai_epi32(tmp, bit);
+}
+
+static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
+ __m128i *output,
+ const int size,
+ const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = av1_round_shift_32_sse4_1(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
+ __m128i *output,
+ const int size,
+ const int bit,
+ const int val) {
+ const __m128i sqrt2 = _mm_set1_epi32(val);
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit);
+ const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
+ output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m128i r0 = _mm_slli_epi32(input[i], -bit);
+ const __m128i r1 = _mm_mullo_epi32(sqrt2, r0);
+ output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits);
+ }
+ }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c
new file mode 100644
index 000000000..a8bfdcce6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_avx2.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \
+ TX_SIZE tx_size) { \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \
+ subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \
+ subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \
+ subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \
+ cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \
+ subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \
+ subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \
+ subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \
+ subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \
+ subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \
+ cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \
+ subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \
+ subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \
+ subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \
+ cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \
+ }; \
+ return subfn_##sub[tx_size]; \
+ }
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i twos = _mm256_set1_epi8(2); // Thirty two twos
+ const int luma_stride = input_stride << 1;
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
+ __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
+ __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
+
+ _mm256_storeu_si256(row, sum_16x16);
+
+ input += luma_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
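
In scalar terms each 4:2:0 output sample is (top-left + top-right + bottom-left + bottom-right) * 2, i.e. the 2x2 luma average scaled by 8 (Q3), which is what the maddubs-by-two plus row add computes. A minimal sketch of that arithmetic; the function name is illustrative.

#include <assert.h>
#include <stdint.h>

/* Scalar model of one 4:2:0 CfL subsample in Q3:
 * sum of the 2x2 luma block times 2 equals the average times 8. */
static uint16_t subsample_420_q3(const uint8_t *in, int stride, int x, int y) {
  const int tl = in[(2 * y) * stride + 2 * x];
  const int tr = in[(2 * y) * stride + 2 * x + 1];
  const int bl = in[(2 * y + 1) * stride + 2 * x];
  const int br = in[(2 * y + 1) * stride + 2 * x + 1];
  return (uint16_t)((tl + tr + bl + br) * 2);
}

int main(void) {
  const uint8_t luma[4] = { 10, 20, 30, 40 }; /* one 2x2 block, stride 2 */
  assert(subsample_420_q3(luma, 2, 0, 0) == 200); /* average 25, in Q3 */
  return 0;
}
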
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
+ _mm256_storeu_si256(row, top_16x16);
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
+ * performed on blocks of width 32.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ const __m256i zeros = _mm256_setzero_si256();
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0));
+
+ __m256i row_lo = _mm256_unpacklo_epi8(top, zeros);
+ row_lo = _mm256_slli_epi16(row_lo, 3);
+ __m256i row_hi = _mm256_unpackhi_epi8(top, zeros);
+ row_hi = _mm256_slli_epi16(row_hi, 3);
+
+ _mm256_storeu_si256(row, row_lo);
+ _mm256_storeu_si256(row + 1, row_hi);
+
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const int luma_stride = input_stride << 1;
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
+ __m256i sum = _mm256_add_epi16(top, bot);
+
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride));
+ __m256i sum_1 = _mm256_add_epi16(top_1, bot_1);
+
+ __m256i hsum = _mm256_hadd_epi16(sum, sum_1);
+ hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+ hsum = _mm256_add_epi16(hsum, hsum);
+
+ _mm256_storeu_si256(row, hsum);
+
+ input += luma_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ */
+static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ __m256i hsum = _mm256_hadd_epi16(top, top_1);
+ hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0));
+ hsum = _mm256_slli_epi16(hsum, 2);
+
+ _mm256_storeu_si256(row, hsum);
+
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd)
+
+static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16));
+ _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3));
+ _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3));
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
+
+static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
+ __m256i alpha_sign, __m256i dc_q0) {
+ __m256i ac_q3 = _mm256_loadu_si256(input);
+ __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
+ __m256i scaled_luma_q0 =
+ _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm256_add_epi16(scaled_luma_q0, dc_q0);
+}
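
Per lane, predict_unclipped computes dst = dc + alpha * ac with both alpha and ac in Q3: shifting |alpha| into Q12 lets mulhrs perform the rounded Q6 -> Q0 conversion, and the sign is re-applied afterwards. A scalar sketch; the function name and sample values are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of predict_unclipped: dc_q0 + round(alpha_q3 * ac_q3 / 64),
 * with the sign handled separately as in the SIMD code. */
static int cfl_predict_lane(int ac_q3, int alpha_q3, int dc_q0) {
  const int sign = ((ac_q3 < 0) != (alpha_q3 < 0)) ? -1 : 1;
  const int abs_ac = ac_q3 < 0 ? -ac_q3 : ac_q3;
  const int abs_alpha = alpha_q3 < 0 ? -alpha_q3 : alpha_q3;
  const int scaled = (abs_ac * (abs_alpha << 9) + (1 << 14)) >> 15;
  return dc_q0 + sign * scaled;
}

int main(void) {
  /* alpha = 2 (16 in Q3), ac = -3 pixels (-24 in Q3) -> dc - 6. */
  printf("%d\n", cfl_predict_lane(-24, 16, 128)); /* prints 122 */
  return 0;
}
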
+
+static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {
+ (void)width;
+ const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+ const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+ const __m256i dc_q0 = _mm256_set1_epi16(*dst);
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+
+ do {
+ __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ res = _mm256_packus_epi16(res, next);
+ res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
+ _mm256_storeu_si256((__m256i *)dst, res);
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 32, 8, lbd);
+CFL_PREDICT_X(avx2, 32, 16, lbd);
+CFL_PREDICT_X(avx2, 32, 32, lbd);
+
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
+ predict_lbd_4x4_ssse3, /* 4x4 */
+ predict_lbd_8x8_ssse3, /* 8x8 */
+ predict_lbd_16x16_ssse3, /* 16x16 */
+ predict_lbd_32x32_avx2, /* 32x32 */
+ cfl_predict_lbd_null, /* 64x64 (invalid CFL size) */
+ predict_lbd_4x8_ssse3, /* 4x8 */
+ predict_lbd_8x4_ssse3, /* 8x4 */
+ predict_lbd_8x16_ssse3, /* 8x16 */
+ predict_lbd_16x8_ssse3, /* 16x8 */
+ predict_lbd_16x32_ssse3, /* 16x32 */
+ predict_lbd_32x16_avx2, /* 32x16 */
+ cfl_predict_lbd_null, /* 32x64 (invalid CFL size) */
+ cfl_predict_lbd_null, /* 64x32 (invalid CFL size) */
+ predict_lbd_4x16_ssse3, /* 4x16 */
+ predict_lbd_16x4_ssse3, /* 16x4 */
+ predict_lbd_8x32_ssse3, /* 8x32 */
+ predict_lbd_32x8_avx2, /* 32x8 */
+ cfl_predict_lbd_null, /* 16x64 (invalid CFL size) */
+ cfl_predict_lbd_null, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
+ return pred[tx_size % TX_SIZES_ALL];
+}
+
+static __m256i highbd_max_epi16(int bd) {
+ const __m256i neg_one = _mm256_set1_epi16(-1);
+ // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+ return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one);
+}
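
The identity in the comment, (1 << bd) - 1 == ((-1) << bd) ^ (-1), is what the xor of the shifted all-ones vector computes, keeping everything inside 16-bit lanes. A quick scalar check over the supported bit depths:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int bds[3] = { 8, 10, 12 };
  for (int i = 0; i < 3; ++i) {
    const int bd = bds[i];
    /* All-ones shifted left by bd, xored with all-ones: turns on exactly the
     * low bd bits, i.e. (1 << bd) - 1, without forming 1 << bd directly. */
    const uint16_t via_xor = (uint16_t)((0xFFFFu << bd) ^ 0xFFFFu);
    assert(via_xor == (uint16_t)((1u << bd) - 1)); /* 0x03FF for bd == 10 */
  }
  return 0;
}
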
+
+static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
+ return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {
+ // Use SSSE3 version for smaller widths
+ assert(width == 16 || width == 32);
+ const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+ const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+ const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
+ const __m256i max = highbd_max_epi16(bd);
+
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ _mm256_storeu_si256((__m256i *)dst,
+ highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
+ if (width == 32) {
+ const __m256i res_1 =
+ predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ _mm256_storeu_si256(
+ (__m256i *)(dst + 16),
+ highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max));
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
+}
+
+CFL_PREDICT_X(avx2, 16, 4, hbd)
+CFL_PREDICT_X(avx2, 16, 8, hbd)
+CFL_PREDICT_X(avx2, 16, 16, hbd)
+CFL_PREDICT_X(avx2, 16, 32, hbd)
+CFL_PREDICT_X(avx2, 32, 8, hbd)
+CFL_PREDICT_X(avx2, 32, 16, hbd)
+CFL_PREDICT_X(avx2, 32, 32, hbd)
+
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
+ predict_hbd_4x4_ssse3, /* 4x4 */
+ predict_hbd_8x8_ssse3, /* 8x8 */
+ predict_hbd_16x16_avx2, /* 16x16 */
+ predict_hbd_32x32_avx2, /* 32x32 */
+ cfl_predict_hbd_null, /* 64x64 (invalid CFL size) */
+ predict_hbd_4x8_ssse3, /* 4x8 */
+ predict_hbd_8x4_ssse3, /* 8x4 */
+ predict_hbd_8x16_ssse3, /* 8x16 */
+ predict_hbd_16x8_avx2, /* 16x8 */
+ predict_hbd_16x32_avx2, /* 16x32 */
+ predict_hbd_32x16_avx2, /* 32x16 */
+ cfl_predict_hbd_null, /* 32x64 (invalid CFL size) */
+ cfl_predict_hbd_null, /* 64x32 (invalid CFL size) */
+ predict_hbd_4x16_ssse3, /* 4x16 */
+ predict_hbd_16x4_avx2, /* 16x4 */
+ predict_hbd_8x32_ssse3, /* 8x32 */
+ predict_hbd_32x8_avx2, /* 32x8 */
+ cfl_predict_hbd_null, /* 16x64 (invalid CFL size) */
+ cfl_predict_hbd_null, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
+ return pred[tx_size % TX_SIZES_ALL];
+}
+
+// Returns a vector where all the (32-bit) elements are the sum of all the
+// lanes in a.
+static INLINE __m256i fill_sum_epi32(__m256i a) {
+ // Given that a == [A, B, C, D, E, F, G, H]
+ a = _mm256_hadd_epi32(a, a);
+ // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H
+ // a == [A', C', A', C', E', G', E', G']
+ a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0));
+ // a == [A', C', E', G', A', C', E', G']
+ a = _mm256_hadd_epi32(a, a);
+ // Given that A'' == A' + C' and E'' == E' + G'
+ // a == [A'', E'', A'', E'', A'', E'', A'', E'']
+ return _mm256_hadd_epi32(a, a);
+ // Given that A''' == A'' + E''
+ // a == [A''', A''', A''', A''', A''', A''', A''', A''']
+}
+
+static INLINE __m256i _mm256_addl_epi16(__m256i a) {
+ return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()),
+ _mm256_unpackhi_epi16(a, _mm256_setzero_si256()));
+}
+
+static INLINE void subtract_average_avx2(const uint16_t *src_ptr,
+ int16_t *dst_ptr, int width,
+ int height, int round_offset,
+ int num_pel_log2) {
+ // Use SSE2 version for smaller widths
+ assert(width == 16 || width == 32);
+
+ const __m256i *src = (__m256i *)src_ptr;
+ const __m256i *const end = src + height * CFL_BUF_LINE_I256;
+ // To maximize usage of the AVX2 registers, we sum two rows per loop
+ // iteration
+ const int step = 2 * CFL_BUF_LINE_I256;
+
+ __m256i sum = _mm256_setzero_si256();
+ // For width 32, we use a second sum accumulator to reduce accumulator
+ // dependencies in the loop.
+ __m256i sum2;
+ if (width == 32) sum2 = _mm256_setzero_si256();
+
+ do {
+ // Add top row to the bottom row
+ __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
+ _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
+ sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
+    if (width == 32) { /* Don't worry, this branch gets optimized out. */
+ // Add the second part of the top row to the second part of the bottom row
+ __m256i l1 =
+ _mm256_add_epi16(_mm256_loadu_si256(src + 1),
+ _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
+ sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
+ }
+ src += step;
+ } while (src < end);
+ // Combine both sum accumulators
+ if (width == 32) sum = _mm256_add_epi32(sum, sum2);
+
+ __m256i fill = fill_sum_epi32(sum);
+
+ __m256i avg_epi16 = _mm256_srli_epi32(
+ _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
+ avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
+
+ // Store and subtract loop
+ src = (__m256i *)src_ptr;
+ __m256i *dst = (__m256i *)dst_ptr;
+ do {
+ _mm256_storeu_si256(dst,
+ _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
+ if (width == 32) {
+ _mm256_storeu_si256(
+ dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
+ }
+ src += CFL_BUF_LINE_I256;
+ dst += CFL_BUF_LINE_I256;
+ } while (src < end);
+}
+
+// Declare wrappers for AVX2 sizes
+CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
+CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
+CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
+CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
+CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
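
Stripped of the SIMD, subtract_average_avx2 sums all width * height Q3 samples, forms avg = (sum + round_offset) >> num_pel_log2, and subtracts that average from every sample. A scalar sketch using the 16x4 parameters (round_offset 32, num_pel_log2 6) from CFL_SUB_AVG_X(avx2, 16, 4, 32, 6) above; the contiguous layout is a simplification of the real CFL_BUF_LINE-strided buffer.

#include <stdint.h>
#include <stdio.h>

/* Scalar model of subtract_average_avx2 for a width x height block stored
 * contiguously. */
static void subtract_average_scalar(const uint16_t *src, int16_t *dst,
                                    int width, int height, int round_offset,
                                    int num_pel_log2) {
  int32_t sum = 0;
  for (int i = 0; i < width * height; ++i) sum += src[i];
  const int16_t avg = (int16_t)((sum + round_offset) >> num_pel_log2);
  for (int i = 0; i < width * height; ++i) dst[i] = (int16_t)(src[i] - avg);
}

int main(void) {
  uint16_t src[16 * 4];
  int16_t dst[16 * 4];
  for (int i = 0; i < 16 * 4; ++i) src[i] = (uint16_t)(i & 7);
  subtract_average_scalar(src, dst, 16, 4, 32, 6); /* 16x4: 2^6 pels */
  printf("%d %d\n", dst[0], dst[7]); /* prints -4 3, centered around zero */
  return 0;
}
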
+
+// Based on the observation that for small blocks AVX2 does not outperform
+// SSE2, we call the SSE2 code for block widths 4 and 8.
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) {
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+ subtract_average_4x4_sse2, /* 4x4 */
+ subtract_average_8x8_sse2, /* 8x8 */
+ subtract_average_16x16_avx2, /* 16x16 */
+ subtract_average_32x32_avx2, /* 32x32 */
+ cfl_subtract_average_null, /* 64x64 (invalid CFL size) */
+ subtract_average_4x8_sse2, /* 4x8 */
+ subtract_average_8x4_sse2, /* 8x4 */
+ subtract_average_8x16_sse2, /* 8x16 */
+ subtract_average_16x8_avx2, /* 16x8 */
+ subtract_average_16x32_avx2, /* 16x32 */
+ subtract_average_32x16_avx2, /* 32x16 */
+ cfl_subtract_average_null, /* 32x64 (invalid CFL size) */
+ cfl_subtract_average_null, /* 64x32 (invalid CFL size) */
+ subtract_average_4x16_sse2, /* 4x16 */
+ subtract_average_16x4_avx2, /* 16x4 */
+ subtract_average_8x32_sse2, /* 8x32 */
+ subtract_average_32x8_avx2, /* 32x8 */
+ cfl_subtract_average_null, /* 16x64 (invalid CFL size) */
+ cfl_subtract_average_null, /* 64x16 (invalid CFL size) */
+ };
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
+ // index the function pointer array out of bounds.
+ return sub_avg[tx_size % TX_SIZES_ALL];
+}
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
new file mode 100644
index 000000000..3b342cd4e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
+#define AOM_AV1_COMMON_X86_CFL_SIMD_H_
+
+#include "av1/common/blockd.h"
+
+// SSSE3 version is optimal for width == 4; we reuse them in AVX2
+void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8; we reuse it in AVX2
+void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16; we reuse it in AVX2
+void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 4; we reuse them in AVX2
+void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8; we reuse it in AVX2
+void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16; we reuse it in AVX2
+void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 4; we reuse them in AVX2
+void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8; we reuse it in AVX2
+void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16; we reuse it in AVX2
+void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8; we reuse it in AVX2
+void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16; we reuse it in AVX2
+void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8; we reuse it in AVX2
+void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16; we reuse it in AVX2
+void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8; we reuse it in AVX2
+void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16; we reuse it in AVX2
+void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSE2 version is optimal for width == 4; we reuse them in AVX2
+void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+
+// SSE2 version is optimal for width == 8; we reuse them in AVX2
+void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+
+void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c
new file mode 100644
index 000000000..4783fe098
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "av1/common/cfl.h"
+#include "config/av1_rtcd.h"
+
+static INLINE __m128i fill_sum_epi32(__m128i l0) {
+ l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2)));
+ return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1)));
+}
+
+static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
+ int16_t *dst_ptr, int width,
+ int height, int round_offset,
+ int num_pel_log2) {
+ const __m128i zeros = _mm_setzero_si128();
+ const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
+ const __m128i *src = (__m128i *)src_ptr;
+ const __m128i *const end = src + height * CFL_BUF_LINE_I128;
+ const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));
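+ // Note: `step` advances 4 buffer lines per iteration when width == 4 (four
+ // rows are summed at once below), 2 lines when width == 8 (two rows), and
+ // 1 line otherwise.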
+
+ __m128i sum = zeros;
+ do {
+ __m128i l0;
+ if (width == 4) {
+ l0 = _mm_add_epi16(_mm_loadl_epi64(src),
+ _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
+ __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
+ _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpacklo_epi16(l1, zeros)));
+ } else {
+ if (width == 8) {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src),
+ _mm_loadu_si128(src + CFL_BUF_LINE_I128));
+ } else {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));
+ }
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ if (width == 32) {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ }
+ }
+ src += step;
+ } while (src < end);
+
+ sum = fill_sum_epi32(sum);
+
+ __m128i avg_epi16 =
+ _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);
+ avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);
+
+ src = (__m128i *)src_ptr;
+ __m128i *dst = (__m128i *)dst_ptr;
+ do {
+ if (width == 4) {
+ _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));
+ } else {
+ _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
+ if (width > 8) {
+ _mm_storeu_si128(dst + 1,
+ _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
+ if (width == 32) {
+ _mm_storeu_si128(dst + 2,
+ _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
+ _mm_storeu_si128(dst + 3,
+ _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
+ }
+ }
+ }
+ src += CFL_BUF_LINE_I128;
+ dst += CFL_BUF_LINE_I128;
+ } while (src < end);
+}
+
+CFL_SUB_AVG_FN(sse2)
diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c
new file mode 100644
index 000000000..bbf007295
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_ssse3.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+// Load 32-bit integer from memory into the first element of dst.
+static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
+ return _mm_cvtsi32_si128(*((int *)mem_addr));
+}
+
+// Store 32-bit integer from the first element of a into memory.
+static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
+ *((int *)mem_addr) = _mm_cvtsi128_si32(a);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
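+// Scalar sketch (illustration only, not upstream code) of the value each
+// output sample receives, with in[][] the luma plane and (r, c) the chroma
+// position:
+//   pred_buf_q3[r * CFL_BUF_LINE + c] =
+//       2 * (in[2 * r][2 * c] + in[2 * r][2 * c + 1] +
+//            in[2 * r + 1][2 * c] + in[2 * r + 1][2 * c + 1]);
+// i.e. the 2x2 average scaled by 8 (Q3) without any division.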
+static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i twos = _mm_set1_epi8(2);
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ __m128i top = _mm_loadh_epi32((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storeh_epi32(pred_buf_m128i, sum);
+ } else if (width == 8) {
+ __m128i top = _mm_loadl_epi64((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storel_epi64(pred_buf_m128i, sum);
+ } else {
+ __m128i top = _mm_loadu_si128((__m128i *)input);
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storeu_si128(pred_buf_m128i, sum);
+ if (width == 32) {
+ __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ __m128i bot_1 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+ top_1 = _mm_maddubs_epi16(top_1, twos);
+ bot_1 = _mm_maddubs_epi16(bot_1, twos);
+ __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
+ _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+ }
+ }
+ input += luma_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
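+// Scalar sketch (illustration only): each output sample is
+//   pred_buf_q3[r * CFL_BUF_LINE + c] = 4 * (in[r][2 * c] + in[r][2 * c + 1]);
+// i.e. the horizontal pair average scaled by 8 (Q3).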
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i fours = _mm_set1_epi8(4);
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ __m128i top = _mm_loadh_epi32((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storeh_epi32(pred_buf_m128i, top);
+ } else if (width == 8) {
+ __m128i top = _mm_loadl_epi64((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storel_epi64(pred_buf_m128i, top);
+ } else {
+ __m128i top = _mm_loadu_si128((__m128i *)input);
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storeu_si128(pred_buf_m128i, top);
+ if (width == 32) {
+ __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ top_1 = _mm_maddubs_epi16(top_1, fours);
+ _mm_storeu_si128(pred_buf_m128i + 1, top_1);
+ }
+ }
+ input += input_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3).
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
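+// Scalar sketch (illustration only): no subsampling, just Q0 -> Q3 scaling:
+//   pred_buf_q3[r * CFL_BUF_LINE + c] = in[r][c] << 3;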
+static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i zeros = _mm_setzero_si128();
+ const int luma_stride = input_stride;
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ __m128i row = _mm_loadh_epi32((__m128i *)input);
+ row = _mm_unpacklo_epi8(row, zeros);
+ _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
+ } else if (width == 8) {
+ __m128i row = _mm_loadl_epi64((__m128i *)input);
+ row = _mm_unpacklo_epi8(row, zeros);
+ _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
+ } else {
+ __m128i row = _mm_loadu_si128((__m128i *)input);
+ const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
+ const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
+ _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
+ _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
+ if (width == 32) {
+ __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
+ const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
+ _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
+ _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
+ }
+ }
+ input += luma_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
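+// Unlike the low-bitdepth path above, the 16-bit samples cannot go through
+// _mm_maddubs_epi16 (an 8-bit multiply-add), so this path sums the two rows
+// with _mm_add_epi16, pairs columns with _mm_hadd_epi16, and doubles the
+// result to reach the same 2 * (top_l + top_r + bot_l + bot_r) Q3 value.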
+static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+ const int luma_stride = input_stride << 1;
+ do {
+ if (width == 4) {
+ const __m128i top = _mm_loadl_epi64((__m128i *)input);
+ const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+ __m128i sum = _mm_add_epi16(top, bot);
+ sum = _mm_hadd_epi16(sum, sum);
+ *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
+ } else {
+ const __m128i top = _mm_loadu_si128((__m128i *)input);
+ const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+ __m128i sum = _mm_add_epi16(top, bot);
+ if (width == 8) {
+ sum = _mm_hadd_epi16(sum, sum);
+ _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+ } else {
+ const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i bot_1 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+ sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
+ _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+ if (width == 32) {
+ const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ const __m128i bot_2 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
+ const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ const __m128i bot_3 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
+ const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
+ const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
+ __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
+ _mm_add_epi16(next_sum, next_sum));
+ }
+ }
+ }
+ input += luma_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
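+// Here _mm_hadd_epi16 pairs the columns and the shift left by 2 supplies the
+// x4 factor, giving 4 * (left + right) per output sample as in the
+// low-bitdepth 4:2:2 path.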
+static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ const __m128i top = _mm_loadl_epi64((__m128i *)input);
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+ _mm_storeh_epi32(pred_buf_m128i, sum);
+ } else {
+ const __m128i top = _mm_loadu_si128((__m128i *)input);
+ if (width == 8) {
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+ _mm_storel_epi64(pred_buf_m128i, sum);
+ } else {
+ const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
+ _mm_storeu_si128(pred_buf_m128i, sum);
+ if (width == 32) {
+ const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
+ _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+ }
+ }
+ }
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ input += input_stride;
+ } while (pred_buf_m128i < end);
+}
+
+static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3);
+ _mm_storel_epi64((__m128i *)pred_buf_q3, row);
+ } else {
+ const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3);
+ _mm_storeu_si128((__m128i *)pred_buf_q3, row);
+ if (width >= 16) {
+ __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+ row_1 = _mm_slli_epi16(row_1, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1);
+ if (width == 32) {
+ __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+ row_2 = _mm_slli_epi16(row_2, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2);
+ __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+ row_3 = _mm_slli_epi16(row_3, 3);
+ _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3);
+ }
+ }
+ }
+ input += input_stride;
+ pred_buf_q3 += CFL_BUF_LINE;
+ } while (pred_buf_q3 < end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
+
+static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ __m128i ac_q3 = _mm_loadu_si128(input);
+ __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
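+// Fixed-point sketch of the helper above (informal note, not upstream
+// commentary): the callers build alpha_q12 = |alpha_q3| << 9, and
+// _mm_mulhrs_epi16 returns round((a * b) >> 15), so the product is
+//   round((|ac_q3| * (|alpha_q3| << 9)) >> 15) == round((|ac_q3| * |alpha_q3|) >> 6)
+// i.e. a Q3 * Q3 -> Q6 multiply scaled back to Q0 before the DC is added;
+// the combined sign is applied separately via _mm_sign_epi16.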
+
+static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {
+ const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);
+ __m128i *row = (__m128i *)pred_buf_q3;
+ const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
+ do {
+ __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ if (width < 16) {
+ res = _mm_packus_epi16(res, res);
+ if (width == 4)
+ _mm_storeh_epi32((__m128i *)dst, res);
+ else
+ _mm_storel_epi64((__m128i *)dst, res);
+ } else {
+ __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ res = _mm_packus_epi16(res, next);
+ _mm_storeu_si128((__m128i *)dst, res);
+ if (width == 32) {
+ res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+ next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+ res = _mm_packus_epi16(res, next);
+ _mm_storeu_si128((__m128i *)(dst + 16), res);
+ }
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, lbd)
+
+static INLINE __m128i highbd_max_epi16(int bd) {
+ const __m128i neg_one = _mm_set1_epi16(-1);
+ // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd)
+ return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one);
+}
+
+static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
+ return _mm_max_epi16(_mm_min_epi16(u, max), zero);
+}
+
+static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {
+ const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);
+ const __m128i max = highbd_max_epi16(bd);
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i *row = (__m128i *)pred_buf_q3;
+ const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
+ do {
+ __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ res = highbd_clamp_epi16(res, zeros, max);
+ if (width == 4) {
+ _mm_storel_epi64((__m128i *)dst, res);
+ } else {
+ _mm_storeu_si128((__m128i *)dst, res);
+ }
+ if (width >= 16) {
+ const __m128i res_1 =
+ predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128(((__m128i *)dst) + 1,
+ highbd_clamp_epi16(res_1, zeros, max));
+ }
+ if (width == 32) {
+ const __m128i res_2 =
+ predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128((__m128i *)(dst + 16),
+ highbd_clamp_epi16(res_2, zeros, max));
+ const __m128i res_3 =
+ predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128((__m128i *)(dst + 24),
+ highbd_clamp_epi16(res_3, zeros, max));
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, hbd)
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
new file mode 100644
index 000000000..0acafd044
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/convolve.h"
+
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
+
+ const __m256i round_const_h = _mm256_set1_epi16(
+ ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i sum_round_v = _mm256_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
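+ // Rounding budget (informal note): the two passes add 2 * FILTER_BITS of
+ // filter gain; round_0 is shed after the horizontal pass, round_1 after the
+ // vertical sum, and the remaining `bits` (together with removal of the
+ // offsets that kept the intermediates non-negative) in the final shift, so
+ // the net scale is 2 * FILTER_BITS - round_0 - round_1 - bits == 0.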
+
+ for (j = 0; j < w; j += 8) {
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ // Load the next line
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data,
+ _mm_loadu_si128(
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+ 1);
+
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+
+ res =
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+
+ /* Vertical filter */
+ {
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ __m256i s[8];
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3);
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1);
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3);
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ __m256i res_a = convolve(s, coeffs_v);
+ __m256i res_b = convolve(s + 4, coeffs_v);
+
+ // Combine V round and 2F-H-V round into a single rounding
+ res_a =
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);
+ res_b =
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+
+ /* rounding code */
+ // 16 bit conversion
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ // 8 bit conversion and saturation to uint8
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ // Store values into the destination buffer
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ if (w - j > 4) {
+ _mm_storel_epi64(p_0, res_0);
+ _mm_storel_epi64(p_1, res_1);
+ } else if (w == 4) {
+ xx_storel_32(p_0, res_0);
+ xx_storel_32(p_1, res_1);
+ } else {
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
+}
+
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ memcpy(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m256i s[2];
+ s[0] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ s[1] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[0]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ src += src_stride;
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
new file mode 100644
index 000000000..b1a62a4f6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "av1/common/convolve.h"
+
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
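+ // The 8-tap filter is evaluated with four _mm_madd_epi16 ops on adjacent
+ // pixel pairs; even and odd output columns are computed separately and the
+ // intermediate rows are packed in the order 0 2 4 6 1 3 5 7, which the
+ // vertical pass below undoes after filtering.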
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i sum_round =
+ _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ // Store values into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+ if (w == 2) {
+ *(uint16_t *)p = _mm_cvtsi128_si32(res);
+ } else if (w == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res);
+ } else {
+ _mm_storel_epi64(p, res);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
+}
+
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ memcpy(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m128i s[4];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ src += src_stride;
+ s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ src += src_stride;
+ s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
+
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ int i, j;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ assert((w % 4) == 0);
+
+ if (!(w % 16)) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
+
+ const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
+ const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);
+
+ const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
+ const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
+
+ const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
+ const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
+
+ if (do_average) {
+ const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
+ const __m128i data_ref_0_hi =
+ _mm_loadu_si128((__m128i *)(&dst[j + 8]));
+
+ const __m128i comp_avg_res_lo =
+ comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i comp_avg_res_hi =
+ comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result_hi = convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 =
+ _mm_packus_epi16(round_result_lo, round_result_hi);
+
+ _mm_store_si128((__m128i *)(&dst0[j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
+ _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+ const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+
+ const __m128i res = _mm_sll_epi16(d16_0, left_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
+
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
+ else
+ *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
new file mode 100644
index 000000000..0e91ea947
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ // right shift is F - 1 because the filter coefficients are already
+ // divided by 2
+ const int right_shift_bits = (FILTER_BITS - 1);
+ const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+ const __m256i right_shift_const =
+ _mm256_set1_epi16((1 << right_shift_bits) >> 1);
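+ // Written out (per the note above, with pre-halved taps c'[k] = c[k] / 2):
+ //   (sum_k c'[k] * s[k] + (1 << (F - 2))) >> (F - 1)
+ // rounds the same way as (sum_k c[k] * s[k] + (1 << (F - 1))) >> F.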
+ __m256i coeffs[4], s[8];
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+}
+
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ __m256i filt[4], coeffs[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+ const __m256i round_0_const =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+ const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+ const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+ assert(conv_params->round_0 > 0);
+
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b =
+ _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
+
+      // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+        // lower lane: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+        // upper lane: 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+        // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
new file mode 100644
index 000000000..5016642de
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "av1/common/convolve.h"
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m128i *const coeffs /* [4] */) {
+ const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1
+ coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+ coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7
+}
+
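+// convolve(): sums the four 16-bit multiply-add partial products. s[k] holds
+// the pixel pairs for taps (2k, 2k + 1) interleaved per output position and
+// coeffs[k] the matching coefficient pair broadcast by prepare_coeffs() above,
+// so the result is the full 8-tap dot product in 32-bit lanes.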
+static INLINE __m128i convolve(const __m128i *const s,
+ const __m128i *const coeffs) {
+ const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
+ const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
+ const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
+ const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
+ const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
+ return d;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+ __m128i coeffs[4];
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+
+ if (w <= 4) {
+ __m128i s[8], src6, res, res_round, res16;
+ uint32_t res_int;
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+
+ res = convolve_lo_y(s + 0, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+ if (w == 2)
+ *(uint16_t *)dst = res_int;
+ else
+ *(uint32_t *)dst = res_int;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+
+ res = convolve_lo_y(s + 1, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+ if (w == 2)
+ *(uint16_t *)dst = res_int;
+ else
+ *(uint32_t *)dst = res_int;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;
+ do {
+ __m128i s[8], src6, res_lo, res_hi;
+ __m128i res_lo_round, res_hi_round, res16, res;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+}
+
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i round_0_const =
+ _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+ __m128i coeffs[4];
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+
+ if (w <= 4) {
+ do {
+ const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+ __m128i s[4];
+
+ s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+ const __m128i res_lo = convolve_lo_x(s, coeffs);
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ uint32_t r = _mm_cvtsi128_si32(res);
+ if (w == 2)
+ *(uint16_t *)dst = r;
+ else
+ *(uint32_t *)dst = r;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--h);
+ } else {
+ assert(!(w % 8));
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+
+ // Filter even-index pixels
+ s[0] = data;
+ s[1] = _mm_srli_si128(data, 2);
+ s[2] = _mm_srli_si128(data, 4);
+ s[3] = _mm_srli_si128(data, 6);
+ const __m128i res_even = convolve_lo_x(s, coeffs);
+
+ // Filter odd-index pixels
+ s[0] = _mm_srli_si128(data, 1);
+ s[1] = _mm_srli_si128(data, 3);
+ s[2] = _mm_srli_si128(data, 5);
+ s[3] = _mm_srli_si128(data, 7);
+ const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ j += 8;
+ } while (j < w);
+ } while (++i < h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 000000000..c11edc1d4
--- /dev/null
+++ b/third_party/aom/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) {
+ int r, c;
+ uint8_t buffer[33][33];
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ assert(bw <= 32 && bh <= 32);
+
+ // The initialization is just for silencing Jenkins static analysis warnings
+ for (r = 0; r < bh + 1; ++r)
+ memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+ memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+
+ const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]);
+ const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]);
+ const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]);
+ const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]);
+ const __m128i filter_intra_scale_bits =
+ _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS));
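+  // Multiplying by 1 << (15 - FILTER_INTRA_SCALE_BITS) via _mm_mulhrs_epi16
+  // below acts as a rounding right shift by FILTER_INTRA_SCALE_BITS:
+  // (x * 2^(15 - s) + 2^14) >> 15 == (x + 2^(s - 1)) >> s.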
+
+ for (r = 1; r < bh + 1; r += 2) {
+ for (c = 1; c < bw + 1; c += 4) {
+ DECLARE_ALIGNED(16, uint8_t, p[8]);
+ memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
+ p[5] = buffer[r][c - 1];
+ p[6] = buffer[r + 1][c - 1];
+ p[7] = 0;
+ const __m128i p_b = xx_loadl_64(p);
+ const __m128i in = _mm_unpacklo_epi64(p_b, p_b);
+ const __m128i out_01 = _mm_maddubs_epi16(in, f1f0);
+ const __m128i out_23 = _mm_maddubs_epi16(in, f3f2);
+ const __m128i out_45 = _mm_maddubs_epi16(in, f5f4);
+ const __m128i out_67 = _mm_maddubs_epi16(in, f7f6);
+ const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23);
+ const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67);
+ const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567);
+ // Rounding
+ const __m128i round_w =
+ _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits);
+ const __m128i out_r = _mm_packus_epi16(round_w, round_w);
+ const __m128i out_r1 = _mm_srli_si128(out_r, 4);
+ // Storing
+ xx_storel_32(&buffer[r][c], out_r);
+ xx_storel_32(&buffer[r + 1][c], out_r1);
+ }
+ }
+
+ for (r = 0; r < bh; ++r) {
+ memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
+ dst += stride;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
new file mode 100644
index 000000000..ae68f0bbb
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
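+  // With FILTER_BITS == 7 this amounts to round_0 >= bd - 7, i.e. a round_0 of
+  // at least 5 for 12-bit input (and at least 3 for 10-bit input).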
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
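+  // clip_pixel is (1 << bd) - 1, the largest legal sample value at this depth.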
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_set1_epi16(0);
+ if (i + 1 < im_h)
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+ res_b_round =
+ _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
+ round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+}
+
+static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+ __m256i s[8];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+ s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
+ s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
+ s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
+
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+ _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
+ _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
+ _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
+ _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
+}
+
+void av1_highbd_convolve_2d_copy_sr_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+ (void)bd;
+
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memcpy(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m256i s[2];
+ s[0] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ s[1] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[0]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ src += src_stride;
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
new file mode 100644
index 000000000..15f8872c1
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+
+static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+}
+
+static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+ __m128i s[16];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+ s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
+ s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
+ s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
+ s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
+ s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
+ s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
+ s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
+ s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+ _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
+ _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
+ _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
+ _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
+ _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
+ _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
+ _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
+ _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
+}
+
+void av1_highbd_convolve_2d_copy_sr_sse2(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+ (void)bd;
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ __m128i s = _mm_loadl_epi64((__m128i *)src);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(s);
+ src += src_stride;
+ dst += dst_stride;
+ s = _mm_loadl_epi64((__m128i *)src);
+ *(uint32_t *)dst = _mm_cvtsi128_si32(s);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[4];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ src += src_stride;
+ s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ src += src_stride;
+ s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
new file mode 100644
index 000000000..3f8dafb4b
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const __m128i offset_const_16b = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits <= 4);
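+  // The copy kernel left-shifts each pixel by the 'bits' amount computed above
+  // so copied pixels sit on the same scale as output of the two-stage filter
+  // path (gain 2^(2 * FILTER_BITS), reduced by round_0 + round_1), and
+  // offset_const adds the same bias that av1_highbd_jnt_convolve_2d_sse4_1
+  // below adds to keep its intermediate values non-negative in the uint16_t
+  // CONV_BUF. In the averaging path the same offset_const is handed to
+  // highbd_convolve_rounding_sse2, which presumably strips the bias again
+  // before the final rounding and clip to the bit depth.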
+
+ if (!(w % 8)) {
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_16bit =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_32b_lo, offset_const);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 4) {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+ const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);
+
+ const __m128i res = _mm_sll_epi16(src_10, left_shift);
+
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_1 = _mm_srli_si128(res_clip, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_2d_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_lo_round, offset_const);
+
+ if (w < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
+
+ const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ } else {
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_hi_round, offset_const);
+
+ if (do_average) {
+ const __m128i data_lo =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_hi =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));
+
+ const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
+ const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
new file mode 100644
index 000000000..1d029db39
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_convolve_2d_sr_ssse3(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+ __m128i coeffs_x[4], coeffs_y[4], s[16];
+
+ const __m128i round_const_x = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], res);
+ }
+ }
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
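+      // s[0..3] / s[4..7] hold the interleaved source-row pairs (0,1), (2,3),
+      // (4,5), (6,7) for output columns 0..3 / 4..7 of the even output row
+      // (the last pair is filled in inside the loop); s[8..11] / s[12..15]
+      // hold the pairs shifted down one row, (1,2) ... (7,8), for the odd row.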
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 =
+ _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y);
+ res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 =
+ _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y);
+ res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
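+        // Write 8, 4 or 2 output pixels per row, depending on the
+        // remaining output width.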
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+ res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+ res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((uint32_t *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
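+          // Slide the window of interleaved source rows down for the
+          // next pair of output rows.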
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
new file mode 100644
index 000000000..ade2af03e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -0,0 +1,1349 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+// Note:
+// A 32x32 block of coefficients is represented by 32x4 = 128 registers.
+// For high bit depth, each coefficient is 4 bytes, so each __m256i
+// register holds 8 coefficients and each row of the block needs 4
+// registers, giving 32 rows in total.
+// Register layout:
+// v0, v1, v2, v3,
+// v4, v5, v6, v7,
+// ... ...
+// v124, v125, v126, v127
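+// With this layout, coefficient (row, col) maps to register
+// v[row * 4 + (col >> 3)], lane (col & 7).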
+
+static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
+ __m256i clamped, mask;
+
+ mask = _mm256_cmpgt_epi16(u, max);
+ clamped = _mm256_andnot_si256(mask, u);
+ mask = _mm256_and_si256(mask, max);
+ clamped = _mm256_or_si256(mask, clamped);
+ mask = _mm256_cmpgt_epi16(clamped, zero);
+ clamped = _mm256_and_si256(clamped, mask);
+
+ return clamped;
+}
+
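+// Add two vectors of eight 32-bit residuals to sixteen 16-bit prediction
+// samples and clamp the reconstruction to the valid [0, 2^bd - 1] range.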
+static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
+ __m256i res0, __m256i res1,
+ const int bd) {
+ __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
+ __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
+
+ x0 = _mm256_add_epi32(res0, x0);
+ x1 = _mm256_add_epi32(res1, x1);
+ x0 = _mm256_packus_epi32(x0, x1);
+ x0 = _mm256_permute4x64_epi64(x0, 0xd8);
+ x0 = highbd_clamp_epi16_avx2(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
+ __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
+
+ _mm256_storeu_si256((__m256i *)(output + i * stride), u);
+ }
+}
+
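+// Round-to-nearest arithmetic right shift of eight 32-bit lanes by 'bit'.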
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+ __m256i tmp, round;
+ round = _mm256_set1_epi32(1 << (bit - 1));
+ tmp = _mm256_add_epi32(vec, round);
+ return _mm256_srai_epi32(tmp, bit);
+}
+
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+ __m256i *output,
+ const int size,
+ const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = av1_round_shift_32_avx2(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm256_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
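+// Transpose an 8x8 tile of 32-bit coefficients held in eight __m256i
+// registers, using 32/64-bit unpacks followed by 128-bit lane permutes.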
+static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[0], in[1]);
+ u1 = _mm256_unpackhi_epi32(in[0], in[1]);
+
+ u2 = _mm256_unpacklo_epi32(in[2], in[3]);
+ u3 = _mm256_unpackhi_epi32(in[2], in[3]);
+
+ u4 = _mm256_unpacklo_epi32(in[4], in[5]);
+ u5 = _mm256_unpackhi_epi32(in[4], in[5]);
+
+ u6 = _mm256_unpacklo_epi32(in[6], in[7]);
+ u7 = _mm256_unpackhi_epi32(in[6], in[7]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in,
+                              int input_stride, int size) {
+  int i;
+  for (i = 0; i < size; ++i) {
+    in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stride));
+  }
+}
+
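+// Half-butterfly helpers: compute (w0 * n0 [+ w1 * n1] + rounding) >> bit
+// in 32-bit lanes.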
+static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *rounding, int bit) {
+ __m256i x;
+ x = _mm256_mullo_epi32(*w0, *n0);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
+
+static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *w1, const __m256i *n1,
+ const __m256i *rounding, int bit) {
+ __m256i x, y;
+
+ x = _mm256_mullo_epi32(*w0, *n0);
+ y = _mm256_mullo_epi32(*w1, *n1);
+ x = _mm256_add_epi32(x, y);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
+
+static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
+ __m256i *out1, const __m256i *clamp_lo,
+ const __m256i *clamp_hi) {
+ __m256i a0 = _mm256_add_epi32(in0, in1);
+ __m256i a1 = _mm256_sub_epi32(in0, in1);
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,
+ __m256i *out0, __m256i *out1) {
+ __m256i a0 = _mm256_add_epi32(in0, in1);
+ __m256i a1 = _mm256_sub_epi32(in0, in1);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
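+// Butterfly add/sub with a rounding right shift applied to both outputs,
+// followed by clamping to [clamp_lo, clamp_hi].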
+static void addsub_shift_avx2(const __m256i in0, const __m256i in1,
+ __m256i *out0, __m256i *out1,
+ const __m256i *clamp_lo, const __m256i *clamp_hi,
+ int shift) {
+ __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
+ __m256i in0_w_offset = _mm256_add_epi32(in0, offset);
+ __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
+ __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
+
+ a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
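+// idct32_stage4..idct32_stage9 below implement the later stages of the
+// 32-point inverse DCT shared by the idct32_low8/idct32_low16 variants.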
+static INLINE void idct32_stage4_avx2(
+ __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
+ const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
+ const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_avx2(
+ __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
+ const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
+ const __m256i *clamp_hi, const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_avx2(
+ __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
+ const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+ const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
+ const __m256i *cospi32,
+ const __m256i *clamp_lo,
+ const __m256i *clamp_hi,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
+ const int do_cols, const int bd,
+ const int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ addsub_no_clamp_avx2(bf1[0], bf1[31], out + 0, out + 31);
+ addsub_no_clamp_avx2(bf1[1], bf1[30], out + 1, out + 30);
+ addsub_no_clamp_avx2(bf1[2], bf1[29], out + 2, out + 29);
+ addsub_no_clamp_avx2(bf1[3], bf1[28], out + 3, out + 28);
+ addsub_no_clamp_avx2(bf1[4], bf1[27], out + 4, out + 27);
+ addsub_no_clamp_avx2(bf1[5], bf1[26], out + 5, out + 26);
+ addsub_no_clamp_avx2(bf1[6], bf1[25], out + 6, out + 25);
+ addsub_no_clamp_avx2(bf1[7], bf1[24], out + 7, out + 24);
+ addsub_no_clamp_avx2(bf1[8], bf1[23], out + 8, out + 23);
+ addsub_no_clamp_avx2(bf1[9], bf1[22], out + 9, out + 22);
+ addsub_no_clamp_avx2(bf1[10], bf1[21], out + 10, out + 21);
+ addsub_no_clamp_avx2(bf1[11], bf1[20], out + 11, out + 20);
+ addsub_no_clamp_avx2(bf1[12], bf1[19], out + 12, out + 19);
+ addsub_no_clamp_avx2(bf1[13], bf1[18], out + 13, out + 18);
+ addsub_no_clamp_avx2(bf1[14], bf1[17], out + 14, out + 17);
+ addsub_no_clamp_avx2(bf1[15], bf1[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_avx2(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
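+// DC-only path: with only in[0] non-zero, the 32-point IDCT collapses to a
+// single multiply by cospi[32] plus rounding, broadcast to all 32 outputs.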
+static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i x;
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ x = _mm256_mullo_epi32(in[0], cospi32);
+ x = _mm256_add_epi32(x, rounding);
+ x = _mm256_srai_epi32(x, bit);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ if (do_cols) {
+ x = _mm256_max_epi32(x, clamp_lo);
+ x = _mm256_min_epi32(x, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+ x = _mm256_add_epi32(offset, x);
+ x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ x = _mm256_max_epi32(x, clamp_lo_out);
+ x = _mm256_min_epi32(x, clamp_hi_out);
+ }
+
+  for (int i = 0; i < 32; ++i) {
+    out[i] = x;
+  }
+}
+
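+// Path for blocks whose non-zero coefficients lie in the first 8 inputs.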
+static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
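+// Path for blocks whose non-zero coefficients lie in the first 16 inputs.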
+static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
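+// Full 32-point inverse DCT with every stage written out explicitly.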
+static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32], bf0[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ if (do_cols) {
+ addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
+ addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
+ addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
+ addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
+ addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
+ addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
+ addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
+ addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
+ addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
+ addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
+ addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
+ addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
+ addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
+ addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
+ addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
+ addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
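+// 1-D transform kernels indexed by transform size, 1-D transform type and
+// the number of non-zero input coefficients; only the 32-point DCT is
+// implemented here, so every other entry is NULL.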
+static const transform_1d_avx2
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+
+ { { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
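+// 2-D inverse transform: transpose and row-transform each group of 8 rows,
+// then column-transform, round-shift and add the residual to the output.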
+static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m256i buf1[64 * 2];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: row transform
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m256i buf0[32];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ __m256i *buf0_cur = buf0 + j * 8;
+ load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8);
+
+ transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
+ }
+
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m256i *_buf1 = buf1 + i * 8;
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ switch (tx_type) {
+ case DCT_DCT:
+ highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
+ stride, tx_type, tx_size, eob, bd);
+ break;
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ const int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ // Assembly version doesn't support IDTX, so use C version for it.
+ case IDTX:
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+
+ default: assert(0);
+ }
+}
+
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:
+ case TX_16X64:
+ case TX_64X16:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+ txfm_param->eob, txfm_param->bd);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 000000000..e29e0baf5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,5348 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+
+ mask = _mm_cmpgt_epi16(u, max);
+ clamped = _mm_andnot_si128(mask, u);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ clamped = _mm_and_si128(clamped, mask);
+
+ return clamped;
+}
+
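+// Add the 32-bit residuals res0/res1 to the widened 16-bit prediction and
+// clamp the packed result to the bit depth.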
+static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
+ __m128i res0, __m128i res1,
+ const int bd) {
+ __m128i x0 = _mm_cvtepi16_epi32(pred);
+ __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
+
+ x0 = _mm_add_epi32(res0, x0);
+ x1 = _mm_add_epi32(res1, x1);
+ x0 = _mm_packus_epi32(x0, x1);
+ x0 = highbd_clamp_epi16(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ for (int i = 0; i < height; ++i, j += step) {
+ __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
+ __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);
+
+ _mm_storeu_si128((__m128i *)(output + i * stride), u);
+ }
+}
+
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
+ }
+}
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+}
+
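+// Butterfly: out0 = in0 + in1, out1 = in0 - in1, both clamped to
+// [clamp_lo, clamp_hi].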
+static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
+ __m128i *out1, const __m128i *clamp_lo,
+ const __m128i *clamp_hi) {
+ __m128i a0 = _mm_add_epi32(in0, in1);
+ __m128i a1 = _mm_sub_epi32(in0, in1);
+
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a0 = _mm_min_epi32(a0, *clamp_hi);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ a1 = _mm_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
+ __m128i *out0, __m128i *out1) {
+ __m128i a0 = _mm_add_epi32(in0, in1);
+ __m128i a1 = _mm_sub_epi32(in0, in1);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
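+// Butterfly with a rounding right shift by 'shift' applied to both results
+// before clamping.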
+static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
+ __m128i *out0, __m128i *out1,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi, int shift) {
+ __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+ __m128i in0_w_offset = _mm_add_epi32(in0, offset);
+ __m128i a0 = _mm_add_epi32(in0_w_offset, in1);
+ __m128i a1 = _mm_sub_epi32(in0_w_offset, in1);
+
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a0 = _mm_min_epi32(a0, *clamp_hi);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ a1 = _mm_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
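+// Stage helpers for the 32-point inverse DCT: each idct32_stageN routine
+// applies that stage's butterflies and cosine rotations to the 32-entry
+// working array (stage 9 also produces the final, optionally shifted, output).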
+static INLINE void idct32_stage4_sse4_1(
+ __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
+ const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
+ const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
+ bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 =
+ half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_sse4_1(
+ __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
+ const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
+ const __m128i *clamp_hi, const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
+ bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_sse4_1(
+ __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+ bf1[29] =
+ half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+ bf1[28] =
+ half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 =
+ half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 =
+ half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[24] =
+ half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
+ const int do_cols, const int bd,
+ const int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
+ addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
+ addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
+ addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
+ addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
+ addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
+ addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
+ addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
+ addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
+ addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
+ addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
+ addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
+ addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
+ addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
+ addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
+ addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
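+// Rounding right shift of in0 and of -in1 (used for the sign flips in the
+// inverse ADST output stage), followed by clamping.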
+static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
+ __m128i *out0, __m128i *out1,
+ const __m128i *clamp_lo, const __m128i *clamp_hi,
+ int shift) {
+ __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+ __m128i a0 = _mm_add_epi32(offset, in0);
+ __m128i a1 = _mm_sub_epi32(offset, in1);
+
+ a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
+ a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a0 = _mm_min_epi32(a0, *clamp_hi);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ a1 = _mm_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
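+// 4-point inverse DCT on a 4x4 tile; the unpack sequence transposes the input
+// in place, and results are clamped only on the row pass (do_cols == 0).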
+static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ u0 = _mm_unpacklo_epi64(v0, v2);
+ u1 = _mm_unpackhi_epi64(v0, v2);
+ u2 = _mm_unpacklo_epi64(v1, v3);
+ u3 = _mm_unpackhi_epi64(v1, v3);
+
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u2, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u1, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u1, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
+ addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
+ } else {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+ }
+}
+
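+// 4-point inverse ADST using the sinpi basis; results are clamped only on the
+// row pass (do_cols == 0).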
+static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {
+ const int32_t *sinpi = sinpi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ x0 = _mm_unpacklo_epi64(v0, v2);
+ x1 = _mm_unpackhi_epi64(v0, v2);
+ x2 = _mm_unpacklo_epi64(v1, v3);
+ x3 = _mm_unpackhi_epi64(v1, v3);
+
+ s0 = _mm_mullo_epi32(x0, sinpi1);
+ s1 = _mm_mullo_epi32(x0, sinpi2);
+ s2 = _mm_mullo_epi32(x1, sinpi3);
+ s3 = _mm_mullo_epi32(x2, sinpi4);
+ s4 = _mm_mullo_epi32(x2, sinpi1);
+ s5 = _mm_mullo_epi32(x3, sinpi2);
+ s6 = _mm_mullo_epi32(x3, sinpi4);
+ t = _mm_sub_epi32(x0, x2);
+ s7 = _mm_add_epi32(t, x3);
+
+ t = _mm_add_epi32(s0, s3);
+ s0 = _mm_add_epi32(t, s5);
+ t = _mm_sub_epi32(s1, s4);
+ s1 = _mm_sub_epi32(t, s6);
+ s3 = s2;
+ s2 = _mm_mullo_epi32(s7, sinpi3);
+
+ u0 = _mm_add_epi32(s0, s3);
+ u1 = _mm_add_epi32(s1, s3);
+ u2 = s2;
+ t = _mm_add_epi32(s0, s1);
+ u3 = _mm_sub_epi32(t, s3);
+
+ u0 = _mm_add_epi32(u0, rnding);
+ u0 = _mm_srai_epi32(u0, bit);
+
+ u1 = _mm_add_epi32(u1, rnding);
+ u1 = _mm_srai_epi32(u1, bit);
+
+ u2 = _mm_add_epi32(u2, rnding);
+ u2 = _mm_srai_epi32(u2, bit);
+
+ u3 = _mm_add_epi32(u3, rnding);
+ u3 = _mm_srai_epi32(u3, bit);
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ u0 = _mm_max_epi32(u0, clamp_lo);
+ u0 = _mm_min_epi32(u0, clamp_hi);
+ u1 = _mm_max_epi32(u1, clamp_lo);
+ u1 = _mm_min_epi32(u1, clamp_hi);
+ u2 = _mm_max_epi32(u2, clamp_lo);
+ u2 = _mm_min_epi32(u2, clamp_hi);
+ u3 = _mm_max_epi32(u3, clamp_lo);
+ u3 = _mm_min_epi32(u3, clamp_hi);
+ }
+
+ in[0] = u0;
+ in[1] = u1;
+ in[2] = u2;
+ in[3] = u3;
+}
+
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+ __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[3] = _mm_add_epi32(in[3], rnding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+}
+
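+// Round-shift the 4x4 residual, apply the requested flips, add it to the
+// prediction rows, clamp to the bit depth and store.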
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ round_shift_4x4(in, shift);
+
+ v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
+
+ v0 = _mm_unpacklo_epi16(v0, zero);
+ v1 = _mm_unpacklo_epi16(v1, zero);
+ v2 = _mm_unpacklo_epi16(v2, zero);
+ v3 = _mm_unpacklo_epi16(v3, zero);
+
+ if (fliplr) {
+ in[0] = _mm_shuffle_epi32(in[0], 0x1B);
+ in[1] = _mm_shuffle_epi32(in[1], 0x1B);
+ in[2] = _mm_shuffle_epi32(in[2], 0x1B);
+ in[3] = _mm_shuffle_epi32(in[3], 0x1B);
+ }
+
+ if (flipud) {
+ u0 = _mm_add_epi32(in[3], v0);
+ u1 = _mm_add_epi32(in[2], v1);
+ u2 = _mm_add_epi32(in[1], v2);
+ u3 = _mm_add_epi32(in[0], v3);
+ } else {
+ u0 = _mm_add_epi32(in[0], v0);
+ u1 = _mm_add_epi32(in[1], v1);
+ u2 = _mm_add_epi32(in[2], v2);
+ u3 = _mm_add_epi32(in[3], v3);
+ }
+
+ v0 = _mm_packus_epi32(u0, u1);
+ v2 = _mm_packus_epi32(u2, u3);
+
+ u0 = highbd_clamp_epi16(v0, bd);
+ u2 = highbd_clamp_epi16(v2, bd);
+
+ v0 = _mm_unpacklo_epi64(u0, u0);
+ v1 = _mm_unpackhi_epi64(u0, u0);
+ v2 = _mm_unpacklo_epi64(u2, u2);
+ v3 = _mm_unpackhi_epi64(u2, u2);
+
+ _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
+ _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
+ _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
+ _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
+}
+
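+// 2-D 4x4 highbd inverse transform: row pass, column pass, then
+// reconstruction into the output buffer with the flips implied by tx_type.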
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[4];
+ const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case ADST_DCT:
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ idct4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+ iadst4x4_sse4_1(in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ default: assert(0);
+ }
+}
+
+// 8x8
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+ in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
+ in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
+ in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
+ in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
+ in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
+ in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
+ in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
+ in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
+ in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
+ in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
+ in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
+ in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
+}
+
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+  // Note:
+  // Even column: 0, 2, ..., 14
+  // Odd column: 1, 3, ..., 15
+  // One even column plus one odd column makes up one row (8 coeffs);
+  // in total there are 8 rows (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0 * 2 + col];
+ u1 = in[4 * 2 + col];
+ u2 = in[2 * 2 + col];
+ u3 = in[6 * 2 + col];
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
+ addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
+ addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
+ addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ }
+ }
+}
+
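+// 8x8 inverse ADST; the even-indexed __m128i slots (left 4 columns) and the
+// odd-indexed slots (right 4 columns) are transformed separately below.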
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // Even 8 points: 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[14], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[14], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[10], cospi20);
+ x = _mm_mullo_epi32(in[4], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[10], cospi44);
+ x = _mm_mullo_epi32(in[4], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[6], cospi36);
+ x = _mm_mullo_epi32(in[8], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[6], cospi28);
+ x = _mm_mullo_epi32(in[8], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[2], cospi52);
+ x = _mm_mullo_epi32(in[12], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[2], cospi12);
+ x = _mm_mullo_epi32(in[12], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[2] = _mm_sub_epi32(kZero, u[4]);
+ out[4] = u[6];
+ out[6] = _mm_sub_epi32(kZero, u[2]);
+ out[8] = u[3];
+ out[10] = _mm_sub_epi32(kZero, u[7]);
+ out[12] = u[5];
+ out[14] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[15], cospi4);
+ x = _mm_mullo_epi32(in[1], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[15], cospi60);
+ x = _mm_mullo_epi32(in[1], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[11], cospi20);
+ x = _mm_mullo_epi32(in[5], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[11], cospi44);
+ x = _mm_mullo_epi32(in[5], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[7], cospi36);
+ x = _mm_mullo_epi32(in[9], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[7], cospi28);
+ x = _mm_mullo_epi32(in[9], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[3], cospi52);
+ x = _mm_mullo_epi32(in[13], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[3], cospi12);
+ x = _mm_mullo_epi32(in[13], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[1] = u[0];
+ out[3] = _mm_sub_epi32(kZero, u[4]);
+ out[5] = u[6];
+ out[7] = _mm_sub_epi32(kZero, u[2]);
+ out[9] = u[3];
+ out[11] = _mm_sub_epi32(kZero, u[7]);
+ out[13] = u[5];
+ out[15] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+ round_shift_4x4(&in[0], shift);
+ round_shift_4x4(&in[4], shift);
+ round_shift_4x4(&in[8], shift);
+ round_shift_4x4(&in[12], shift);
+}
+
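+// Add one 8-wide row of 32-bit residuals (res_lo/res_hi) to the 16-bit
+// prediction row, honoring fliplr, and clamp to the bit depth.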
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
+ int fliplr, int bd) {
+ __m128i x0, x1;
+ const __m128i zero = _mm_setzero_si128();
+
+ x0 = _mm_unpacklo_epi16(pred, zero);
+ x1 = _mm_unpackhi_epi16(pred, zero);
+
+ if (fliplr) {
+ res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
+ res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
+ x0 = _mm_add_epi32(res_hi, x0);
+ x1 = _mm_add_epi32(res_lo, x1);
+
+ } else {
+ x0 = _mm_add_epi32(res_lo, x0);
+ x1 = _mm_add_epi32(res_hi, x1);
+ }
+
+ x0 = _mm_packus_epi32(x0, x1);
+ return highbd_clamp_epi16(x0, bd);
+}
+
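+// Round-shift the 8x8 residual, add it to the prediction (with optional
+// flips) and store the eight reconstructed rows.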
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ round_shift_8x8(in, shift);
+
+ v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
+ v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
+ v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
+ v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
+ v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
+ v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
+ v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
+ v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
+
+ if (flipud) {
+ u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+ } else {
+ u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+ }
+
+ _mm_store_si128((__m128i *)(output + 0 * stride), u0);
+ _mm_store_si128((__m128i *)(output + 1 * stride), u1);
+ _mm_store_si128((__m128i *)(output + 2 * stride), u2);
+ _mm_store_si128((__m128i *)(output + 3 * stride), u3);
+ _mm_store_si128((__m128i *)(output + 4 * stride), u4);
+ _mm_store_si128((__m128i *)(output + 5 * stride), u5);
+ _mm_store_si128((__m128i *)(output + 6 * stride), u6);
+ _mm_store_si128((__m128i *)(output + 7 * stride), u7);
+}
+
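+// 2-D 8x8 highbd inverse transform: transpose + row pass, transpose + column
+// pass, then reconstruction with the flips implied by tx_type.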
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[16], out[16];
+ const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+ -shift[0]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ default: assert(0);
+ }
+}
+
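+// DC-only ("low1") 8x8 inverse DCT: every output equals the scaled DC value.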
+static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ __m128i x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ x = _mm_mullo_epi32(in[0], cospi32);
+ x = _mm_add_epi32(x, rnding);
+ x = _mm_srai_epi32(x, bit);
+
+ // stage 4
+ // stage 5
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+ x = _mm_max_epi32(x, clamp_lo_out);
+ x = _mm_min_epi32(x, clamp_hi_out);
+ }
+
+ out[0] = x;
+ out[1] = x;
+ out[2] = x;
+ out[3] = x;
+ out[4] = x;
+ out[5] = x;
+ out[6] = x;
+ out[7] = x;
+}
+
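+// Full 8-point inverse DCT over 8 __m128i, i.e. four independent transforms,
+// one per 32-bit lane.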
+static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ x = _mm_mullo_epi32(in[1], cospi56);
+ y = _mm_mullo_epi32(in[7], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1], cospi8);
+ y = _mm_mullo_epi32(in[7], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi24);
+ y = _mm_mullo_epi32(in[3], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi40);
+ y = _mm_mullo_epi32(in[3], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
+ addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
+ addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
+ addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
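+// Single-coefficient ("low1") 8x8 inverse ADST: only in[0] is read.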
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(kZero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // stage 3
+ // stage 4
+ __m128i temp1, temp2;
+ temp1 = _mm_mullo_epi32(u[0], cospi16);
+ x = _mm_mullo_epi32(u[1], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+ u[4] = temp1;
+
+ temp2 = _mm_mullo_epi32(u[0], cospi48);
+ x = _mm_mullo_epi32(u[1], cospi16);
+ u[5] = _mm_sub_epi32(temp2, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // stage 5
+ // stage 6
+ temp1 = _mm_mullo_epi32(u[0], cospi32);
+ x = _mm_mullo_epi32(u[1], cospi32);
+ u[2] = _mm_add_epi32(temp1, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(temp1, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ temp1 = _mm_mullo_epi32(u[4], cospi32);
+ x = _mm_mullo_epi32(u[5], cospi32);
+ u[6] = _mm_add_epi32(temp1, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(temp1, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
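+// Full 8-point inverse ADST over 8 __m128i (four independent transforms, one
+// per 32-bit lane).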
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2
+
+ u[0] = _mm_mullo_epi32(in[7], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[7], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[5], cospi20);
+ x = _mm_mullo_epi32(in[2], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[5], cospi44);
+ x = _mm_mullo_epi32(in[2], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[3], cospi36);
+ x = _mm_mullo_epi32(in[4], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[3], cospi28);
+ x = _mm_mullo_epi32(in[4], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[1], cospi52);
+ x = _mm_mullo_epi32(in[6], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[1], cospi12);
+ x = _mm_mullo_epi32(in[6], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+
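+// 16x16 inverse DCT fast path: only the DC coefficient in[0] is used, so the
+// transform reduces to one cospi32 scaling whose result is replicated to all
+// sixteen output rows.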
+static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ in[0] = _mm_mullo_epi32(in[0], cospi32);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
+
+ // stage 5
+ // stage 6
+ // stage 7
+ if (do_cols) {
+ in[0] = _mm_max_epi32(in[0], clamp_lo);
+ in[0] = _mm_min_epi32(in[0], clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ in[0] = _mm_add_epi32(in[0], offset);
+ in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+ in[0] = _mm_max_epi32(in[0], clamp_lo_out);
+ in[0] = _mm_min_epi32(in[0], clamp_hi_out);
+ }
+
+ out[0] = in[0];
+ out[1] = in[0];
+ out[2] = in[0];
+ out[3] = in[0];
+ out[4] = in[0];
+ out[5] = in[0];
+ out[6] = in[0];
+ out[7] = in[0];
+ out[8] = in[0];
+ out[9] = in[0];
+ out[10] = in[0];
+ out[11] = in[0];
+ out[12] = in[0];
+ out[13] = in[0];
+ out[14] = in[0];
+ out[15] = in[0];
+ }
+}
+
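+// Reduced 16x16 inverse DCT that reads only the first eight coefficients of
+// the column (in[0]..in[7]); the remaining inputs are treated as zero.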
+static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+
+ addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+ u[1] = u[0];
+
+ u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+ x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5
+ addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[5], cospi32);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ u[10] = _mm_sub_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[13] = _mm_add_epi32(x, y);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_add_epi32(x, y);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+ // stage 7
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
+ addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
+ addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
+ addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
+ addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
+ addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
+ addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
+ addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
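+// 16x16 inverse ADST fast path: only in[0] is read, so most butterfly stages
+// reduce to copies of the two values derived from it.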
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v[16], x, y, temp1, temp2;
+
+ // Calculate columns 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(x, rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(zero, x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ // stage 3
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4
+ temp1 = _mm_mullo_epi32(v[8], cospi8);
+ x = _mm_mullo_epi32(v[9], cospi56);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[8], cospi56);
+ x = _mm_mullo_epi32(v[9], cospi8);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6
+ temp1 = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm_mullo_epi32(v[12], cospi16);
+ x = _mm_mullo_epi32(v[13], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[12], cospi48);
+ x = _mm_mullo_epi32(v[13], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8
+ y = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ y = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ y = _mm_mullo_epi32(v[10], cospi32);
+ x = _mm_mullo_epi32(v[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ y = _mm_mullo_epi32(v[14], cospi32);
+ x = _mm_mullo_epi32(v[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
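+// Reduced 16x16 inverse ADST that reads only in[0]..in[7]; the upper half of
+// the input column is treated as zero.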
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+
+ // Calculate columns 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ __m128i zero = _mm_setzero_si128();
+ x = _mm_mullo_epi32(in[0], cospi62);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ u[1] = _mm_sub_epi32(zero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi54);
+ u[2] = _mm_add_epi32(x, rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi10);
+ u[3] = _mm_sub_epi32(zero, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi46);
+ u[4] = _mm_add_epi32(x, rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi18);
+ u[5] = _mm_sub_epi32(zero, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi38);
+ u[6] = _mm_add_epi32(x, rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi26);
+ u[7] = _mm_sub_epi32(zero, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[8] = _mm_mullo_epi32(in[7], cospi34);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ u[9] = _mm_mullo_epi32(in[7], cospi30);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ u[10] = _mm_mullo_epi32(in[5], cospi42);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_mullo_epi32(in[5], cospi22);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_mullo_epi32(in[3], cospi50);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ u[13] = _mm_mullo_epi32(in[3], cospi14);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ u[14] = _mm_mullo_epi32(in[1], cospi58);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_mullo_epi32(in[1], cospi6);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 3
+ addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ y = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ u[8] = _mm_mullo_epi32(u[8], cospi8);
+ u[8] = _mm_add_epi32(u[8], x);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ x = _mm_mullo_epi32(u[9], cospi8);
+ u[9] = _mm_sub_epi32(y, x);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi24);
+ y = _mm_mullo_epi32(u[10], cospi24);
+ u[10] = _mm_mullo_epi32(u[10], cospi40);
+ u[10] = _mm_add_epi32(u[10], x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi40);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi8);
+ y = _mm_mullo_epi32(u[12], cospi8);
+ u[12] = _mm_mullo_epi32(u[12], cospim56);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospim56);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi40);
+ y = _mm_mullo_epi32(u[14], cospi40);
+ u[14] = _mm_mullo_epi32(u[14], cospim24);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim24);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 5
+ addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ x = _mm_mullo_epi32(u[5], cospi48);
+ y = _mm_mullo_epi32(u[4], cospi48);
+ u[4] = _mm_mullo_epi32(u[4], cospi16);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(u[5], cospi16);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(u[7], cospi16);
+ y = _mm_mullo_epi32(u[6], cospi16);
+ u[6] = _mm_mullo_epi32(u[6], cospim48);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(u[7], cospim48);
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi48);
+ y = _mm_mullo_epi32(u[12], cospi48);
+ u[12] = _mm_mullo_epi32(u[12], cospi16);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi16);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi16);
+ y = _mm_mullo_epi32(u[14], cospi16);
+ u[14] = _mm_mullo_epi32(u[14], cospim48);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim48);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 7
+ addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ u[2] = _mm_add_epi32(y, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(y, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ u[10] = _mm_add_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ u[14] = _mm_add_epi32(y, x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
+ out[2] = u[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
+ out[4] = u[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
+ out[6] = u[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
+ out[8] = u[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
+ out[10] = u[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
+ out[12] = u[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
+ out[14] = u[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
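+// Full 16x16 inverse DCT. Each __m128i carries four 32-bit coefficients, so
+// each call works on four columns' worth of data in parallel.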
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], v[16], x, y;
+
+ {
+ // stage 0
+ // stage 1
+ u[0] = in[0];
+ u[1] = in[8];
+ u[2] = in[4];
+ u[3] = in[12];
+ u[4] = in[2];
+ u[5] = in[10];
+ u[6] = in[6];
+ u[7] = in[14];
+ u[8] = in[1];
+ u[9] = in[9];
+ u[10] = in[5];
+ u[11] = in[13];
+ u[12] = in[3];
+ u[13] = in[11];
+ u[14] = in[7];
+ u[15] = in[15];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+ // stage 3
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+ u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+ addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ y = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(x, y);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(x, y);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ v[15] = u[15];
+
+ // stage 5
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+ u[4] = v[4];
+
+ x = _mm_mullo_epi32(v[5], cospi32);
+ y = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_sub_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_add_epi32(x, y);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_add_epi32(x, y);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(v[0], v[15], out + 0, out + 15);
+ addsub_no_clamp_sse4_1(v[1], v[14], out + 1, out + 14);
+ addsub_no_clamp_sse4_1(v[2], v[13], out + 2, out + 13);
+ addsub_no_clamp_sse4_1(v[3], v[12], out + 3, out + 12);
+ addsub_no_clamp_sse4_1(v[4], v[11], out + 4, out + 11);
+ addsub_no_clamp_sse4_1(v[5], v[10], out + 5, out + 10);
+ addsub_no_clamp_sse4_1(v[6], v[9], out + 6, out + 9);
+ addsub_no_clamp_sse4_1(v[7], v[8], out + 7, out + 8);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
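+// Full 16x16 inverse ADST using all sixteen coefficients of each column.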
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], v[16], x, y;
+
+ // Calculate columns 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ v[0] = _mm_mullo_epi32(in[15], cospi2);
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(v[0], x);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_mullo_epi32(in[15], cospi62);
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(v[1], x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(in[13], cospi10);
+ x = _mm_mullo_epi32(in[2], cospi54);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(in[13], cospi54);
+ x = _mm_mullo_epi32(in[2], cospi10);
+ v[3] = _mm_sub_epi32(v[3], x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_mullo_epi32(in[11], cospi18);
+ x = _mm_mullo_epi32(in[4], cospi46);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(in[11], cospi46);
+ x = _mm_mullo_epi32(in[4], cospi18);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(in[9], cospi26);
+ x = _mm_mullo_epi32(in[6], cospi38);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(in[9], cospi38);
+ x = _mm_mullo_epi32(in[6], cospi26);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = _mm_mullo_epi32(in[7], cospi34);
+ x = _mm_mullo_epi32(in[8], cospi30);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(in[7], cospi30);
+ x = _mm_mullo_epi32(in[8], cospi34);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(in[5], cospi42);
+ x = _mm_mullo_epi32(in[10], cospi22);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(in[5], cospi22);
+ x = _mm_mullo_epi32(in[10], cospi42);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(in[3], cospi50);
+ x = _mm_mullo_epi32(in[12], cospi14);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(in[3], cospi14);
+ x = _mm_mullo_epi32(in[12], cospi50);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(in[1], cospi58);
+ x = _mm_mullo_epi32(in[14], cospi6);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(in[1], cospi6);
+ x = _mm_mullo_epi32(in[14], cospi58);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi8);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi8);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi40);
+ x = _mm_mullo_epi32(u[11], cospi24);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(u[10], cospi24);
+ x = _mm_mullo_epi32(u[11], cospi40);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[12], cospim56);
+ x = _mm_mullo_epi32(u[13], cospi8);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi8);
+ x = _mm_mullo_epi32(u[13], cospim56);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim24);
+ x = _mm_mullo_epi32(u[15], cospi40);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi40);
+ x = _mm_mullo_epi32(u[15], cospim24);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 5
+ addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm_mullo_epi32(u[4], cospi16);
+ x = _mm_mullo_epi32(u[5], cospi48);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(u[4], cospi48);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(u[6], cospim48);
+ x = _mm_mullo_epi32(u[7], cospi16);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(u[6], cospi16);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm_mullo_epi32(u[12], cospi16);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi48);
+ x = _mm_mullo_epi32(u[13], cospi16);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim48);
+ x = _mm_mullo_epi32(u[15], cospi16);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi16);
+ x = _mm_mullo_epi32(u[15], cospim48);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 7
+ addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
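+// Helper applying stage 8 of the 64-point inverse DCT to the working array
+// u[]: cospi32/cospi16/cospi48 rotations plus add/sub butterflies over the
+// u[16..31] range.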
+static INLINE void idct64_stage8_sse4_1(
+ __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
+ u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
+ u[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
+ u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
+ u[11] = temp2;
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
+ clamp_hi);
+ }
+
+ temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
+ u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
+ u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
+ u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
+ u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
+ u[36] = temp1;
+ u[37] = temp2;
+ u[38] = temp3;
+ u[39] = temp4;
+
+ temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+}
+
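+// Helper applying stage 9 of the 64-point inverse DCT: butterflies on
+// u[0..15] and u[32..63], with cospi32 rotations on u[20..27].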
+static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ int i;
+ __m128i temp1, temp2, temp3, temp4;
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
+ u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
+ u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
+ u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
+ u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
+ u[20] = temp1;
+ u[21] = temp2;
+ u[22] = temp3;
+ u[23] = temp4;
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
+ }
+}
+
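+// Helper applying stage 10 of the 64-point inverse DCT: butterflies on
+// u[0..31] and cospi32 rotations on u[40..55].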
+static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rnding, int bit) {
+ __m128i temp1, temp2, temp3, temp4;
+ for (int i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
+ }
+
+ temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
+ u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
+ u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
+ u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
+ u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
+ u[40] = temp1;
+ u[41] = temp2;
+ u[42] = temp3;
+ u[43] = temp4;
+
+ temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
+ temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
+ temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
+ temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
+ u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
+ u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
+ u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
+ u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
+ u[44] = temp1;
+ u[45] = temp2;
+ u[46] = temp3;
+ u[47] = temp4;
+}
+
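+// Final output stage (stage 11) of the 64-point inverse DCT: add/sub pairs
+// (i, 63 - i) are written to out[], with an extra right shift and tighter
+// clamp applied when do_cols is zero.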
+static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
+ int bd, int out_shift,
+ const int log_range) {
+ if (do_cols) {
+ for (int i = 0; i < 32; i++) {
+ addsub_no_clamp_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)]);
+ }
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ for (int i = 0; i < 32; i++) {
+ addsub_shift_sse4_1(u[i], u[63 - i], &out[(i)], &out[(63 - i)],
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ }
+ }
+}
+
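+// 64x64 inverse DCT fast path: only the DC coefficient in[0] is used; a
+// single cospi32 scaling is replicated to all 64 output rows.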
+static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+ {
+ __m128i x;
+
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ // stage 6
+ x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
+ // stage 8
+ // stage 9
+ // stage 10
+ // stage 11
+ if (do_cols) {
+ x = _mm_max_epi32(x, clamp_lo);
+ x = _mm_min_epi32(x, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ x = _mm_add_epi32(x, offset);
+ x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+
+ x = _mm_max_epi32(x, clamp_lo_out);
+ x = _mm_min_epi32(x, clamp_hi_out);
+ }
+
+ out[0] = x;
+ out[63] = x;
+ out[1] = x;
+ out[62] = x;
+ out[2] = x;
+ out[61] = x;
+ out[3] = x;
+ out[60] = x;
+ out[4] = x;
+ out[59] = x;
+ out[5] = x;
+ out[58] = x;
+ out[6] = x;
+ out[57] = x;
+ out[7] = x;
+ out[56] = x;
+ out[8] = x;
+ out[55] = x;
+ out[9] = x;
+ out[54] = x;
+ out[10] = x;
+ out[53] = x;
+ out[11] = x;
+ out[52] = x;
+ out[12] = x;
+ out[51] = x;
+ out[13] = x;
+ out[50] = x;
+ out[14] = x;
+ out[49] = x;
+ out[15] = x;
+ out[48] = x;
+ out[16] = x;
+ out[47] = x;
+ out[17] = x;
+ out[46] = x;
+ out[18] = x;
+ out[45] = x;
+ out[19] = x;
+ out[44] = x;
+ out[20] = x;
+ out[43] = x;
+ out[21] = x;
+ out[42] = x;
+ out[22] = x;
+ out[41] = x;
+ out[23] = x;
+ out[40] = x;
+ out[24] = x;
+ out[39] = x;
+ out[25] = x;
+ out[38] = x;
+ out[26] = x;
+ out[37] = x;
+ out[27] = x;
+ out[36] = x;
+ out[28] = x;
+ out[35] = x;
+ out[29] = x;
+ out[34] = x;
+ out[30] = x;
+ out[33] = x;
+ out[31] = x;
+ out[32] = x;
+ }
+}
+
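+// Reduced 64x64 inverse DCT that reads only in[0]..in[7] and expands them
+// through the full 64-entry working array; the remaining inputs are treated
+// as zero.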
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+
+ {
+ __m128i u[64];
+
+ // stage 1
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
+
+ // stage 4
+ __m128i temp1, temp2;
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
+
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
+
+ // stage 6
+ temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
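+ // Add/sub butterflies within each block of 16: index j pairs with j ^ 7,
+ // and j ^ 15 pairs with j ^ 8.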
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
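+// 64-point inverse DCT (SSE4.1, four 32-bit lanes at a time) for the case
+// where only the first 16 inputs (in[0]..in[15]) are non-zero.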
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64];
+ __m128i tmp1, tmp2, tmp3, tmp4;
+ // stage 1
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
+
+ // stage 4
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+
+ tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
+
+ // stage 5
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
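+// Full 64-point inverse DCT (SSE4.1, four 32-bit lanes at a time). Only
+// in[0]..in[31] are read; the upper half of the coefficients is treated as zero.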
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64], v[64];
+
+ // stage 1
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
+
+ // stage 2
+ v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
+ v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
+ v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
+ v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
+ v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
+ v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
+ v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+ v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
+ v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
+ v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
+ v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
+ v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
+ v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
+ v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
+ v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
+ v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
+ v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
+ v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+
+ // stage 3
+ u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
+ u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
+ u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
+ u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
+ u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
+ u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
+ u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
+ u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
+ u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
+ u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
+
+ for (i = 32; i < 64; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4
+ v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+ v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+ // stage 5
+ u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+ u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+ v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+ // stage 7
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+ u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+ u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+ v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+ v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+ v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+ // stage 9
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10
+ for (i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];
+
+ v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];
+
+ // stage 11
+ if (do_cols) {
+ for (i = 0; i < 32; i++) {
+ addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
+ }
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ for (i = 0; i < 32; i++) {
+ addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ }
+ }
+ }
+}
+
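+// 32-point inverse DCT (SSE4.1) for the DC-only case: only in[0] is used and
+// the resulting value is broadcast to all 32 outputs.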
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1;
+
+ // stage 0
+ // stage 1
+ bf1 = in[0];
+
+ // stage 2
+ // stage 3
+ // stage 4
+ // stage 5
+ bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
+
+ // stage 6
+ // stage 7
+ // stage 8
+ // stage 9
+ if (do_cols) {
+ bf1 = _mm_max_epi32(bf1, clamp_lo);
+ bf1 = _mm_min_epi32(bf1, clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ bf1 = _mm_add_epi32(bf1, offset);
+ bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
+ bf1 = _mm_max_epi32(bf1, clamp_lo_out);
+ bf1 = _mm_min_epi32(bf1, clamp_hi_out);
+ }
+ out[0] = bf1;
+ out[1] = bf1;
+ out[2] = bf1;
+ out[3] = bf1;
+ out[4] = bf1;
+ out[5] = bf1;
+ out[6] = bf1;
+ out[7] = bf1;
+ out[8] = bf1;
+ out[9] = bf1;
+ out[10] = bf1;
+ out[11] = bf1;
+ out[12] = bf1;
+ out[13] = bf1;
+ out[14] = bf1;
+ out[15] = bf1;
+ out[16] = bf1;
+ out[17] = bf1;
+ out[18] = bf1;
+ out[19] = bf1;
+ out[20] = bf1;
+ out[21] = bf1;
+ out[22] = bf1;
+ out[23] = bf1;
+ out[24] = bf1;
+ out[25] = bf1;
+ out[26] = bf1;
+ out[27] = bf1;
+ out[28] = bf1;
+ out[29] = bf1;
+ out[30] = bf1;
+ out[31] = bf1;
+}
+
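+// 32-point inverse DCT (SSE4.1) for the case where only the first 8 inputs
+// (in[0]..in[7]) are non-zero.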
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
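+// 32-point inverse DCT (SSE4.1) for the case where only the first 16 inputs
+// (in[0]..in[15]) are non-zero.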
+static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1
+
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+ // stage 4
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
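+// Full 32-point inverse DCT (SSE4.1) over all 32 input coefficients.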
+static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32], bf0[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
+ addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
+ addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
+ addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
+ addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
+ addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
+ addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
+ addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
+ addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
+ addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
+ addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
+ addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
+ addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
+ addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
+ addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
+ addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+ default:
+ av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
+ uint8_t *dest, int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
+ uint8_t *dest, int stride,
+ const TxfmParam *txfm_param) {
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ // Assembly version doesn't support IDTX, so use C version for it.
+ case IDTX:
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default: assert(0);
+ }
+}
+
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ int eob = txfm_param->eob;
+ int bd = txfm_param->bd;
+ int lossless = txfm_param->lossless;
+ const int32_t *src = cast_to_int32(input);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ return;
+ }
+ switch (tx_type) {
+ // Assembly version doesn't support some transform types, so use C version
+ // for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+ default:
+ av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
+
+static const transform_1d_sse4_1
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
+ { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ {
+ { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
+ NULL },
+ { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
+ NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
+ idct32x32_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
+ idct64x64_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64 * 16];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 2;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
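+  // fun_idx_x / fun_idx_y map the eob position to one of the _low1 / _low8 /
+  // full 1-D kernels in highbd_txfm_all_1d_zeros_w8_arr, so blocks with few
+  // nonzero coefficients take the cheaper reduced-input paths.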
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // 1st stage: row transform
+ for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(
+ buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ highbd_inv_txfm2d_add_no_identity_sse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:
+ case TX_16X64:
+ case TX_64X16:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+ txfm_param->eob, txfm_param->bd);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
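
Reviewer note on the transform kernels above: per 32-bit lane, the butterfly helpers they call reduce to the scalar operations sketched below. This is an illustrative sketch only, not part of the patch; half_btf_scalar and addsub_scalar are made-up names, and the real helpers (half_btf_sse4_1, addsub_sse4_1) operate on four packed 32-bit lanes at a time.

/* Illustrative per-lane sketch; the 64-bit product is used here only for
 * clarity, the vector code multiplies within 32-bit lanes. */
static int32_t half_btf_scalar(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);  // round-shift
}

static void addsub_scalar(int32_t in0, int32_t in1, int32_t *out0,
                          int32_t *out1, int32_t clamp_lo, int32_t clamp_hi) {
  int32_t a = in0 + in1;  // butterfly sum
  int32_t b = in0 - in1;  // butterfly difference
  if (a < clamp_lo) a = clamp_lo;
  if (a > clamp_hi) a = clamp_hi;
  if (b < clamp_lo) b = clamp_lo;
  if (b > clamp_hi) b = clamp_hi;
  *out0 = a;
  *out1 = b;
}

Each IDCT stage above is a sequence of these two operations applied to a permutation of the 32 coefficients, with the rounding constant fixed at 1 << (bit - 1).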
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
new file mode 100644
index 000000000..e298cf653
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -0,0 +1,846 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_jnt_convolve_2d_copy_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
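+  // The offset keeps the compound intermediate non-negative so it can be
+  // stored in the CONV_BUF as an unsigned 16-bit value; it is subtracted
+  // again in highbd_convolve_rounding() before the final round-shift.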
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const __m256i offset_const_16b = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits <= 4);
+
+ if (!(w % 16)) {
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 16) {
+ const __m256i src_16bit =
+ _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+ if (do_average) {
+ const __m256i data_0 =
+ _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_row_0 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
+ // since not all compilers yet support _mm256_set_m128i()
+ const __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+
+ const __m256i res = _mm256_sll_epi16(src_10, left_shift);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b, offset_const);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_2d_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
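+  // e.g. bd = 12 with FILTER_BITS = 7 requires conv_params->round_0 >= 5.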
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_set1_epi16(0);
+ if (i + 1 < im_h)
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_x_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ int i, j;
+ __m256i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm256_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
+
+ __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
+ __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_y_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ assert(bits >= 0);
+ int i, j;
+ __m256i s[8], coeffs_y[4];
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i round_const_y =
+ _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m256i src6;
+ __m256i s01 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi16(s01, s12);
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ const __m256i s67 = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+ const __m256i s78 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
+ res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
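
Reviewer note on the jnt (distance-weighted) compound paths above: per pixel, the highbd_comp_avg() / highbd_convolve_rounding() pair reduces to the scalar sketch below. This is illustrative only, not part of the patch; jnt_comp_avg_scalar and convolve_rounding_scalar are made-up names, and DIST_PRECISION_BITS is libaom's distance-weighting precision constant.

/* Illustrative per-pixel sketch; the vector helpers used above apply the
 * same arithmetic to packed 32-bit lanes. */
static int32_t jnt_comp_avg_scalar(int32_t ref, int32_t res, int32_t w0,
                                   int32_t w1, int use_jnt_comp_avg) {
  if (use_jnt_comp_avg)
    return (ref * w0 + res * w1) >> DIST_PRECISION_BITS;  // weighted blend
  return (ref + res) >> 1;                                // plain average
}

static int32_t convolve_rounding_scalar(int32_t avg, int32_t offset,
                                        int32_t round_const, int shift) {
  // remove the unsigned-range offset, then round-shift back toward pixel range
  return (avg - offset + round_const) >> shift;
}

The rounded result is then packed to 16 bits and clamped to the bit-depth maximum (clip_pixel_to_bd) before being stored to dst0.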
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
new file mode 100644
index 000000000..1a29985b5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+
+void av1_highbd_jnt_convolve_y_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ assert(bits >= 0);
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i s[16], coeffs_y[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits);
+ res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y),
+ round_shift_y);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits);
+ res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y),
+ round_shift_y);
+
+ __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const);
+ __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const);
+
+ if (w - j < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(
+ &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_0, round_result_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_1, round_result_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+
+ } else {
+ __m128i res_16b_0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0);
+
+ __m128i res_16b_1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16b_1);
+ }
+ } else {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits);
+ res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round0, round_const_y), round_shift_y);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits);
+ res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round1, round_const_y), round_shift_y);
+
+ __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const);
+ __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const);
+
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+ const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero);
+ const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_lo_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
+ &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_lo_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_lo_0, round_result_hi_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_lo_1, round_result_hi_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+ } else {
+ __m128i res_16bit0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0);
+ __m128i res_16bit1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_16bit1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_x_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ int i, j;
+ __m128i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+ res_even = _mm_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm_sll_epi32(res_odd, round_shift_bits);
+
+ __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const);
+ if (w - j < 8) {
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+
+ const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b = _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b);
+ }
+ } else {
+ __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd);
+ __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const);
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+}
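In the SSE4.1 horizontal path above, each output sample is an 8-tap filter result that is rounded by round_0, scaled back up by bits = FILTER_BITS - round_1, and biased by the compound offset before being stored or averaged. A scalar sketch under those assumptions, ignoring the even/odd SIMD data layout:

#include <stdint.h>

/* Illustrative scalar version of one sample of the x-pass in
 * av1_highbd_jnt_convolve_x_sse4_1; names are not aom API. */
static int32_t jnt_convolve_x_sample(const uint16_t *src, const int16_t *filt,
                                     int round_0, int bits, int32_t offset) {
  int32_t sum = 0;
  for (int t = 0; t < 8; ++t) sum += (int32_t)src[t] * filt[t];
  /* First rounding stage, as done with round_const_x / round_shift_x. */
  sum = (sum + ((1 << round_0) >> 1)) >> round_0;
  /* Scale back by 'bits' and add the compound offset; this is the
   * res_unsigned value written to dst when do_average == 0. */
  return (sum << bits) + offset;
}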
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 000000000..6f24e5948
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { \
+ __m128i u0, u1, u2, u3; \
+ u0 = _mm_unpacklo_epi32(x0, x1); \
+ u1 = _mm_unpackhi_epi32(x0, x1); \
+ u2 = _mm_unpacklo_epi32(x2, x3); \
+ u3 = _mm_unpackhi_epi32(x2, x3); \
+ y0 = _mm_unpacklo_epi64(u0, u2); \
+ y1 = _mm_unpackhi_epi64(u0, u2); \
+ y2 = _mm_unpacklo_epi64(u1, u3); \
+ y3 = _mm_unpackhi_epi64(u1, u3); \
+ } while (0)
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+ TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+ TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+ TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+ TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+ out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+ // Upper left 8x8
+ TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+ TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+ out[28]);
+ TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+ out[13]);
+ TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+ out[29]);
+
+ // Upper right 8x8
+ TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+ out[44]);
+ TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+ out[60]);
+ TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+ out[45]);
+ TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+ out[61]);
+
+ // Lower left 8x8
+ TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+ out[14]);
+ TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+ out[30]);
+ TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+ out[15]);
+ TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+ out[31]);
+ // Lower right 8x8
+ TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+ out[46]);
+ TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+ out[62]);
+ TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+ out[47]);
+ TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+ out[63]);
+}
+
+static INLINE void transpose_32x32(const __m128i *input, __m128i *output) {
+ for (int j = 0; j < 8; j++) {
+ for (int i = 0; i < 8; i++) {
+ TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8],
+ input[i * 32 + j + 16], input[i * 32 + j + 24],
+ output[j * 32 + i + 0], output[j * 32 + i + 8],
+ output[j * 32 + i + 16], output[j * 32 + i + 24]);
+ }
+ }
+}
+
+// Note:
+// rounding = 1 << (bit - 1)
+static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
+ const __m128i *w1, const __m128i *n1,
+ const __m128i *rounding, int bit) {
+ __m128i x, y;
+
+ x = _mm_mullo_epi32(*w0, *n0);
+ y = _mm_mullo_epi32(*w1, *n1);
+ x = _mm_add_epi32(x, y);
+ x = _mm_add_epi32(x, *rounding);
+ x = _mm_srai_epi32(x, bit);
+ return x;
+}
+
+static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,
+ const __m128i *rounding, int bit) {
+ __m128i x;
+
+ x = _mm_mullo_epi32(*w0, *n0);
+ x = _mm_add_epi32(x, *rounding);
+ x = _mm_srai_epi32(x, bit);
+ return x;
+}
+
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ const int num_cols);
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd);
+
+#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
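half_btf_sse4_1 above is the basic lifting step of the AV1 integer transforms: two weighted inputs are summed and round-shifted by 'bit'. A scalar equivalent for a single lane, using the rounding noted in the header:

#include <stdint.h>

/* Scalar sketch of half_btf_sse4_1 for one lane. */
static int32_t half_btf_scalar(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  const int32_t rounding = 1 << (bit - 1);  /* as documented above */
  return (w0 * in0 + w1 * in1 + rounding) >> bit;
}

In the transform code this is typically invoked with paired cosine/sine constants, so each call produces one output of a butterfly rotation stage.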
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
new file mode 100644
index 000000000..4bcab0564
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+static const uint8_t warp_highbd_arrange_bytes[16] = {
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+};
+
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = {
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+ coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+ coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+ coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+ coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ // Filter odd-index pixels
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+ int sx, __m128i *coeff) {
+ // Filter coeff
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ coeff[0] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+ coeff[2] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+ coeff[4] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
+ coeff[6] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
+
+ coeff[1] = coeff[0];
+ coeff[3] = coeff[2];
+ coeff[5] = coeff[4];
+ coeff[7] = coeff[6];
+}
+
+static INLINE void highbd_filter_src_pixels(
+ const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
+ const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
+ const __m128i src_1 = *src;
+ const __m128i src2_1 = *src2;
+
+ const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+
+ const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);
+
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
+
+ __m128i res_odd =
+ _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ // Combine results into one register.
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+ // as this order helps with the vertical filter.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+}
+
+static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
+ __m128i *tmp, int sx, int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ (void)alpha;
+ int k;
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_beta0(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ int k;
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+}
+
+static INLINE void highbd_prepare_warp_horizontal_filter(
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)
+ highbd_warp_horizontal_filter_alpha0_beta0(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha == 0 && beta != 0)
+ highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha != 0 && beta == 0)
+ highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else
+ highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+}
+
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m128i tmp[15];
+ int i, j, k;
+ const int reduce_bits_horiz =
+ conv_params->round_0 +
+ AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+ assert(!(bd == 12 && reduce_bits_horiz < 5));
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const __m128i res_sub_const =
+ _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ const __m128i src_01 = _mm_shuffle_epi8(
+ src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+ const __m128i src2_01 = _mm_shuffle_epi8(
+ src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+
+ __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
+ __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);
+
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
+ }
+
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
+ }
+
+ const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
+ const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);
+
+ highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else {
+ highbd_prepare_warp_horizontal_filter(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo = _mm_add_epi32(res_lo, res_add_const);
+ res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+
+ if (conv_params->do_average) {
+ __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
+
+ if (conv_params->use_jnt_comp_avg) {
+ res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+ _mm_mullo_epi32(res_lo, wt1));
+ res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
+ } else {
+ res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
+ }
+
+ __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
+ res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
+ round_bits_shift);
+
+ __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
+ res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
+ _mm_storel_epi64(dst16, res16_lo);
+ } else {
+ res_lo = _mm_packus_epi32(res_lo, res_lo);
+ _mm_storel_epi64(p, res_lo);
+ }
+ if (p_width > 4) {
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = _mm_add_epi32(res_hi, res_add_const);
+ res_hi =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+ if (conv_params->do_average) {
+ __m128i *const dst16_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
+
+ if (conv_params->use_jnt_comp_avg) {
+ res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
+ _mm_mullo_epi32(res_hi, wt1));
+ res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
+ } else {
+ res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
+ }
+
+ __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
+ res32_hi = _mm_sra_epi32(
+ _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
+ __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
+ res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
+ _mm_storel_epi64(dst16_4, res16_hi);
+ } else {
+ res_hi = _mm_packus_epi32(res_hi, res_hi);
+ _mm_storel_epi64(p4, res_hi);
+ }
+ }
+ } else {
+ // Round and pack into 8 bits
+ const __m128i round_const =
+ _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ // Clamp res_16bit to the range [0, 2^bd - 1]
+ const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ _mm_storel_epi64(p, res_16bit);
+ } else {
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+ }
+ }
+}
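The warp kernel above walks 8x8 destination blocks, pushes each block centre through the affine model mat[], and splits the result into an integer pixel position plus a fraction that selects an 8-tap entry from warped_filter[]. A simplified scalar sketch of that position derivation; it ignores the chroma subsampling shifts and the +4 block-centre offset handled above, and WARPEDMODEL_PREC_BITS is assumed to be 16 as in libaom.

#include <stdint.h>

#define WARPEDMODEL_PREC_BITS_ 16  /* assumption */

/* Illustrative only: derive the sampling position for destination (x, y). */
static void warp_sample_pos(const int32_t *mat, int x, int y,
                            int *ix, int *sx, int *iy, int *sy) {
  /* Affine transform in WARPEDMODEL fixed-point precision. */
  const int32_t dst_x = mat[2] * x + mat[3] * y + mat[0];
  const int32_t dst_y = mat[4] * x + mat[5] * y + mat[1];
  /* Integer part picks the reference pixel; fractional part picks the
   * 8-tap warp filter. */
  *ix = dst_x >> WARPEDMODEL_PREC_BITS_;
  *sx = dst_x & ((1 << WARPEDMODEL_PREC_BITS_) - 1);
  *iy = dst_y >> WARPEDMODEL_PREC_BITS_;
  *sy = dst_y & ((1 << WARPEDMODEL_PREC_BITS_) - 1);
}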
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
new file mode 100644
index 000000000..0c8a8505b
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
+// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
+void av1_highbd_wiener_convolve_add_src_avx2(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bd) {
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 1;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
+
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+ const __m256i clamp_low = zero_256;
+
+ /* Horizontal filter */
+ {
+ const __m256i clamp_high_ep =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *src_ij = src_ptr + i * src_stride + j;
+
+ // Load 16-bit src data
+ const __m256i src_0 = yy_loadu_256(src_ij + 0);
+ const __m256i src_1 = yy_loadu_256(src_ij + 1);
+ const __m256i src_2 = yy_loadu_256(src_ij + 2);
+ const __m256i src_3 = yy_loadu_256(src_ij + 3);
+ const __m256i src_4 = yy_loadu_256(src_ij + 4);
+ const __m256i src_5 = yy_loadu_256(src_ij + 5);
+ const __m256i src_6 = yy_loadu_256(src_ij + 6);
+ const __m256i src_7 = yy_loadu_256(src_ij + 7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ // Calculate scalar product for even- and odd-indices separately,
+ // increasing to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
+ const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
+
+ // Filter the even-indices, increasing to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+ // Filter the odd-indices, increasing to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+ const __m256i res_16bit_clamped = _mm256_min_epi16(
+ _mm256_max_epi16(res_16bit, clamp_low), clamp_high);
+
+ // Store in the dst array
+ yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
+ }
+ }
+ }
+}
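The Wiener convolve above is a two-pass separable filter whose "add_src" behaviour comes from biasing the centre tap by 1 << FILTER_BITS, so the filtered output is the source pixel plus the Wiener correction. A scalar sketch of one horizontal-pass sample under that assumption (FILTER_BITS taken to be 7, as in libaom; names illustrative):

#include <stdint.h>

static int32_t wiener_horiz_sample(const uint16_t *src, const int16_t *filt,
                                   int round_0, int bd) {
  const int kFilterBits = 7;  /* assumption: FILTER_BITS */
  int32_t sum = 0;
  for (int t = 0; t < 8; ++t) {
    int32_t f = filt[t];
    if (t == 3) f += 1 << kFilterBits;  /* the 'offset' xmm added above */
    sum += (int32_t)src[t] * f;
  }
  /* Rounding constant also re-centres the value into unsigned range. */
  sum += (1 << (round_0 - 1)) + (1 << (bd + kFilterBits - 1));
  return sum >> round_0;  /* then clamped to [0, WIENER_CLAMP_LIMIT - 1] */
}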
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
new file mode 100644
index 000000000..818b1099c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+void av1_highbd_wiener_convolve_add_src_ssse3(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bd) {
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 1;
+ int i, j;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero = _mm_setzero_si128();
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+ /* Horizontal filter */
+ {
+ const __m128i coeffs_x =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ conv_params->round_0);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ conv_params->round_0);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ const __m128i maxval =
+ _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
+ _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m128i coeffs_y =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);
+
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c
new file mode 100644
index 000000000..0c857b583
--- /dev/null
+++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {
+ if (!strength) return;
+
+ DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
+ { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4
+ { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5
+ { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2
+ };
+
+ DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {
+ { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ };
+
+ // Extend the first and last samples to simplify the loop for the 5-tap case
+ p[-1] = p[0];
+ __m128i last = _mm_set1_epi8(p[sz - 1]);
+ _mm_storeu_si128((__m128i *)&p[sz], last);
+
+ // Adjust input pointer for filter support area
+ uint8_t *in = (strength == 3) ? p - 1 : p;
+
+ // Avoid modifying first sample
+ uint8_t *out = p + 1;
+ int len = sz - 1;
+
+ const int use_3tap_filter = (strength < 3);
+
+ if (use_3tap_filter) {
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+ d0 = _mm_maddubs_epi16(d0, coef0);
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srai_epi16(d0, 4);
+ d0 = _mm_packus_epi16(d0, d0);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
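+ // Write filtered values only to the first n_out lanes; the remaining lanes
+ // keep the original edge samples.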
+ __m128i n0 = _mm_set1_epi8(n_out);
+ __m128i mask = _mm_cmpgt_epi8(n0, iden);
+ out0 = _mm_blendv_epi8(out0, d0, mask);
+ _mm_storel_epi64((__m128i *)out, out0);
+ __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
+ in0 = _mm_alignr_epi8(in1, in0, 8);
+ in += 8;
+ out += 8;
+ len -= n_out;
+ }
+ } else { // 5-tap filter
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i two = _mm_set1_epi8(2);
+ __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);
+ __m128i shuf_b = _mm_add_epi8(shuf_a, two);
+ __m128i shuf_c = _mm_add_epi8(shuf_b, two);
+ __m128i shuf_d = _mm_add_epi8(shuf_c, two);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
+ __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
+ __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
+ d0 = _mm_maddubs_epi16(d0, coef0);
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ d2 = _mm_hadd_epi16(d2, d3);
+ d0 = _mm_hadd_epi16(d0, d2);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srai_epi16(d0, 4);
+ d0 = _mm_packus_epi16(d0, d0);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi8(n_out);
+ __m128i mask = _mm_cmpgt_epi8(n0, iden);
+ out0 = _mm_blendv_epi8(out0, d0, mask);
+ _mm_storel_epi64((__m128i *)out, out0);
+ __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));
+ in0 = _mm_alignr_epi8(in1, in0, 8);
+ in += 8;
+ out += 8;
+ len -= n_out;
+ }
+ }
+}
+
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {
+ if (!strength) return;
+
+ DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {
+ { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 (applied as 4*(p0+p2) + 8*p1)
+ { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 (applied as 5*(p0+p2) + 6*p1)
+ { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 (applied as 2*(p0+p4) + 4*(p1+p2+p3))
+ };
+
+ DECLARE_ALIGNED(16, static const int16_t,
+ v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
+
+ // Extend the first and last samples to simplify the loop for the 5-tap case
+ p[-1] = p[0];
+ __m128i last = _mm_set1_epi16(p[sz - 1]);
+ _mm_storeu_si128((__m128i *)&p[sz], last);
+
+ // Adjust input pointer for filter support area
+ uint16_t *in = (strength == 3) ? p - 1 : p;
+
+ // Avoid modifying first sample
+ uint16_t *out = p + 1;
+ int len = sz - 1;
+
+ const int use_3tap_filter = (strength < 3);
+
+ if (use_3tap_filter) {
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+ __m128i in02 = _mm_add_epi16(in0, in2);
+ __m128i d0 = _mm_unpacklo_epi16(in02, in1);
+ __m128i d1 = _mm_unpackhi_epi16(in02, in1);
+ d0 = _mm_mullo_epi16(d0, coef0);
+ d1 = _mm_mullo_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srli_epi16(d0, 4);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi16(n_out);
+ __m128i mask = _mm_cmpgt_epi16(n0, iden);
+ out0 = _mm_blendv_epi8(out0, d0, mask);
+ _mm_storeu_si128((__m128i *)out, out0);
+ in += 8;
+ in0 = in8;
+ in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ out += 8;
+ len -= n_out;
+ }
+ } else { // 5-tap filter
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ while (len > 0) {
+ int n_out = (len < 8) ? len : 8;
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+ __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
+ __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
+ __m128i in04 = _mm_add_epi16(in0, in4);
+ __m128i in123 = _mm_add_epi16(in1, in2);
+ in123 = _mm_add_epi16(in123, in3);
+ __m128i d0 = _mm_unpacklo_epi16(in04, in123);
+ __m128i d1 = _mm_unpackhi_epi16(in04, in123);
+ d0 = _mm_mullo_epi16(d0, coef0);
+ d1 = _mm_mullo_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srli_epi16(d0, 4);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi16(n_out);
+ __m128i mask = _mm_cmpgt_epi16(n0, iden);
+ out0 = _mm_blendv_epi8(out0, d0, mask);
+ _mm_storeu_si128((__m128i *)out, out0);
+ in += 8;
+ in0 = in8;
+ in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ out += 8;
+ len -= n_out;
+ }
+ }
+}
+
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {
+ // interpolate half-sample positions
+ assert(sz <= 24);
+
+ DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
+ { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }
+ };
+
+ DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = {
+ { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ };
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint8_t *in = &p[-2];
+ uint8_t *out = &p[-2];
+
+ int n = sz + 1; // Input length including upper-left sample
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
+
+ while (n > 0) {
+ __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+ __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
+ __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
+ d0 = _mm_maddubs_epi16(d0, coef0);
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ d2 = _mm_hadd_epi16(d2, d3);
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d2 = _mm_add_epi16(d2, eight);
+ d0 = _mm_srai_epi16(d0, 4);
+ d2 = _mm_srai_epi16(d2, 4);
+ d0 = _mm_packus_epi16(d0, d2);
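+ // Interleave the original edge samples with the interpolated half-sample values.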
+ __m128i in1 = _mm_alignr_epi8(in16, in0, 1);
+ __m128i out0 = _mm_unpacklo_epi8(in1, d0);
+ __m128i out1 = _mm_unpackhi_epi8(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[16], out1);
+ in0 = in16;
+ in16 = _mm_setzero_si128();
+ out += 32;
+ n -= 16;
+ }
+}
+
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) {
+ // interpolate half-sample positions
+ assert(sz <= 24);
+
+ DECLARE_ALIGNED(16, static const int16_t,
+ kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };
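+ // 4-tap upsampling filter (-1, 9, 9, -1), applied below as -(p0 + p3) + 9 * (p1 + p2).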
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint16_t *in = &p[-2];
+ uint16_t *out = in;
+ int n = sz + 1;
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+ __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
+
+ while (n > 0) {
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+ __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
+ __m128i sum0 = _mm_add_epi16(in0, in3);
+ __m128i sum1 = _mm_add_epi16(in1, in2);
+ __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
+ __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+ d0 = _mm_madd_epi16(d0, coef0);
+ d1 = _mm_madd_epi16(d1, coef0);
+ __m128i eight = _mm_set1_epi32(8);
+ d0 = _mm_add_epi32(d0, eight);
+ d1 = _mm_add_epi32(d1, eight);
+ d0 = _mm_srai_epi32(d0, 4);
+ d1 = _mm_srai_epi32(d1, 4);
+ d0 = _mm_packus_epi32(d0, d1);
+ __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
+ d0 = _mm_min_epi16(d0, max0);
+ __m128i out0 = _mm_unpacklo_epi16(in1, d0);
+ __m128i out1 = _mm_unpackhi_epi16(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[8], out1);
+ in0 = in8;
+ in8 = in16;
+ in16 = in24;
+ in24 = _mm_setzero_si128();
+ out += 16;
+ n -= 8;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
new file mode 100644
index 000000000..9f2e2b457
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
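+// Interleave the forward/backward compound weights into alternating 16-bit
+// lanes for use by comp_avg().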
+static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi16(w0);
+ const __m256i wt1 = _mm256_set1_epi16(w1);
+ const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1);
+ return wt;
+}
+
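+// Load two 16-byte rows into one 256-bit register: row a in the low 128 bits,
+// row b in the high 128 bits.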
+static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
+ return _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)),
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
+}
+
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
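+ // Bias added so the 16-bit intermediate compound values stay non-negative.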
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ __m256i filt[4], coeffs[4];
+
+ assert(bits >= 0);
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
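+ // prepare_coeffs_lowbd() halves the filter coefficients, so round by one bit
+ // less here to compensate.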
+ const __m256i round_const =
+ _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+ __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ // +1 to compensate for dividing the filter coeffs by 2
+ const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int offset_1 = (1 << (bd + FILTER_BITS - 2));
+ const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);
+ const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i coeffs[4], s[8];
+
+ assert((FILTER_BITS - conv_params->round_0) >= 0);
+
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+
+ (void)conv_params;
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src6 = src_a[6];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 7) * src_stride + j];
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
+
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo =
+ comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i comp_avg_res_hi =
+ comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi = convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0);
+
+ const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1);
+ }
+ }
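+ // Slide the vertical filter window down by the two rows processed above.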
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+}
+
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = 8;
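+ // The intermediate buffer holds a single 8-column strip; the outer loop below
+ // processes the block 8 columns at a time.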
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
+
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ const __m256i round_const_h = _mm256_set1_epi16(
+ ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ const uint8_t *src_h = src_ptr + j;
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+ src_h += (src_stride << 1);
+ __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+ round_shift_h);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
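+ // Scale the copied pixels up to the same intermediate precision that the
+ // two-round convolution paths produce.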
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const __m256i wt = unpack_weights_avx2(conv_params);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ int i, j;
+
+ if (!(w % 16)) {
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 16) {
+ const __m256i src_16bit = _mm256_cvtepu8_epi16(
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])));
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+ _mm256_castsi256_si128(res_0));
+ } else {
+ _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+ res_unsigned);
+ }
+ }
+ }
+ } else if (!(w % 4)) {
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+ // since not all compilers yet support _mm256_set_m128i()
+ const __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+
+ const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero);
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 = load_line2_avx2(
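+ // The negative term removes the offset that the horizontal pass added to keep
+ // the intermediate values non-negative.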
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
new file mode 100644
index 000000000..87dc3242e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ __m128i coeffs[4];
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+
+ if (w == 4) {
+ do {
+ const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+ __m128i s[4];
+
+ s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+ const __m128i res_lo = convolve_lo_x(s, coeffs);
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[0]), res_unsigned);
+ }
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ } while (--h);
+ } else {
+ assert(!(w % 8));
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+
+ // Filter even-index pixels
+ s[0] = data;
+ s[1] = _mm_srli_si128(data, 2);
+ s[2] = _mm_srli_si128(data, 4);
+ s[3] = _mm_srli_si128(data, 6);
+ const __m128i res_even = convolve_lo_x(s, coeffs);
+
+ // Filter odd-index pixels
+ s[0] = _mm_srli_si128(data, 1);
+ s[1] = _mm_srli_si128(data, 3);
+ s[2] = _mm_srli_si128(data, 5);
+ s[3] = _mm_srli_si128(data, 7);
+ const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+ const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift);
+ const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ j += 8;
+ } while (j < w);
+ } while (++i < h);
+ }
+}
+
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
+ const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ __m128i coeffs[4];
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+
+ if (w == 4) {
+ __m128i s[8], src6, res, res_shift;
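+ // Prime the first six row pairs of the vertical filter window; each loop
+ // iteration unpacks two new rows and produces two output rows.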
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+
+ res = convolve_lo_y(s + 0, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+
+ res = convolve_lo_y(s + 1, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ res_16b = _mm_packs_epi32(res_shift, res_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;
+ do {
+ __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+ res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
new file mode 100644
index 000000000..822772782
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
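+ // Zero-extend the 8-bit source pixels to 16 bits for the multiply-adds below.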
+ const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+ const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ else
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
new file mode 100644
index 000000000..f645e0454
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "av1/common/blockd.h"
+
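+// Scalar view of calc_mask_avx2 (illustrative): with d = |s0 - s1| >> 4
+// (DIFF_FACTOR_LOG2), it returns |mask_base + d|. For DIFFWTD_38 the caller
+// passes mask_base = 38, giving 38 + d; for DIFFWTD_38_INV it passes
+// 38 - AOM_BLEND_A64_MAX_ALPHA, and the abs() yields
+// AOM_BLEND_A64_MAX_ALPHA - (38 + d).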
+static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
+ const __m256i s1) {
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
+ return _mm256_abs_epi16(
+ _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4)));
+ // clamp(diff, 0, 64) can be skipped because diff is always in the range (38, 54)
+}
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
+ int i = 0;
+ if (4 == w) {
+ do {
+ const __m128i s0A = xx_loadl_32(src0);
+ const __m128i s0B = xx_loadl_32(src0 + stride0);
+ const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
+ const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
+ const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
+ const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);
+
+ const __m128i s1A = xx_loadl_32(src1);
+ const __m128i s1B = xx_loadl_32(src1 + stride1);
+ const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
+ const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
+ const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
+ const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
+ const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ const __m128i x_m8 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
+ xx_storeu_128(mask, x_m8);
+ src0 += (stride0 << 2);
+ src1 += (stride1 << 2);
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (8 == w) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + stride0);
+ const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
+ const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+ const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
+ const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + stride1);
+ const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
+ const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+ const __m256i s1AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
+ const __m256i s1BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
+ const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AC_w);
+ const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1BD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
+ yy_storeu_256(mask, m8);
+ src0 += stride0 << 2;
+ src1 += stride1 << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (16 == w) {
+ do {
+ const __m128i s0A = xx_load_128(src0);
+ const __m128i s0B = xx_load_128(src0 + stride0);
+ const __m128i s1A = xx_load_128(src1);
+ const __m128i s1B = xx_load_128(src1 + stride1);
+ const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
+ const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
+ const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
+ const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
+
+ const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
+ const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
+
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
+ yy_storeu_256(mask, m8);
+ src0 += stride0 << 1;
+ src1 += stride1 << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const __m256i s0 = yy_loadu_256(src0 + j);
+ const __m256i s1 = yy_loadu_256(src1 + j);
+ const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
+ const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
+ const __m256i s0H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
+ const __m256i s1H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
+ const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
+ const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
+ yy_storeu_256(mask + j, m8);
+ j += 32;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+
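+// Scalar view of the two d16 mask helpers below (illustrative):
+//   d = (|src0 - src1| + ((1 << round) >> 1)) >> round;
+//   m = min(mask_base + (d >> DIFF_FACTOR_LOG2), AOM_BLEND_A64_MAX_ALPHA);
+// calc_mask_d16_inv_avx2 returns AOM_BLEND_A64_MAX_ALPHA - m. The values are
+// non-negative and the ops saturate, so no explicit lower clamp is needed.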
+static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff, int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+ return diff_clamp;
+}
+
+static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
+ const __m256i *data_src1,
+ const __m256i *round_const,
+ const __m256i *mask_base_16,
+ const __m256i *clip_diff,
+ int round) {
+ const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1);
+ const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0);
+ const __m256i diff = _mm256_max_epu16(diffa, diffb);
+ const __m256i diff_round =
+ _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round);
+ const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16);
+ const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff);
+ const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp);
+ return diff_const_16;
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 =
+ calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ // Adding the rounding constant can overflow, but that much precision is not
+ // required. The code should also work for other values of DIFF_FACTOR_LOG2
+ // and AOM_BLEND_A64_MAX_ALPHA, but corner cases have not been verified.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+
+ if (mask_type == DIFFWTD_38) {
+ build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ } else {
+ build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_highbd_avx2(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 16) {
+ av1_build_compound_diffwtd_mask_highbd_ssse3(
+ mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd);
+ } else {
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ assert(bd >= 8);
+ assert((w % 16) == 0);
+ const __m256i y0 = _mm256_setzero_si256();
+ const __m256i yAOM_BLEND_A64_MAX_ALPHA =
+ _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m256i ymask_base = _mm256_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
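+ // Per-pixel formula (illustrative): with d = |s0 - s1| >> (bd - 8 +
+ // DIFF_FACTOR_LOG2), each mask value is clamp(38 + d, 0,
+ // AOM_BLEND_A64_MAX_ALPHA); DIFFWTD_38_INV stores
+ // AOM_BLEND_A64_MAX_ALPHA minus that. The bd == 8 branches fold the shift
+ // into an immediate srai by DIFF_FACTOR_LOG2.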
+ if (bd == 8) {
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c
new file mode 100644
index 000000000..5171ca493
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_sse4.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> // SSE4.1
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "av1/common/blockd.h"
+
+static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0,
+ const __m128i s1) {
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1));
+ return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4)));
+ // clamp(diff, 0, 64) can be skipped because diff is always in the range (38, 54)
+}
+
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m128i mask_base = _mm_set1_epi16(38 - mb);
+ int i = 0;
+ if (4 == w) {
+ do {
+ const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0);
+ const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0));
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
+
+ const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1);
+ const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1));
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
+
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+
+ *(uint32_t *)mask = _mm_cvtsi128_si32(m8);
+ *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += 8;
+ i += 2;
+ } while (i < h);
+ } else if (8 == w) {
+ do {
+ __m128i s0 = _mm_loadl_epi64((__m128i const *)src0);
+ __m128i s1 = _mm_loadl_epi64((__m128i const *)src1);
+ s0 = _mm_cvtepu8_epi16(s0);
+ s1 = _mm_cvtepu8_epi16(s1);
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+ _mm_storel_epi64((__m128i *)mask, m8);
+ src0 += stride0;
+ src1 += stride1;
+ mask += 8;
+ i += 1;
+ } while (i < h);
+ } else {
+ const __m128i zero = _mm_setzero_si128();
+ do {
+ int j = 0;
+ do {
+ const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j));
+ const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j));
+ const __m128i s0L = _mm_cvtepu8_epi16(s0);
+ const __m128i s1L = _mm_cvtepu8_epi16(s1);
+ const __m128i s0H = _mm_unpackhi_epi8(s0, zero);
+ const __m128i s1H = _mm_unpackhi_epi8(s1, zero);
+
+ const __m128i m16L = calc_mask(mask_base, s0L, s1L);
+ const __m128i m16H = calc_mask(mask_base, s0H, s1H);
+
+ const __m128i m8 = _mm_packus_epi16(m16L, m16H);
+ _mm_store_si128((__m128i *)(mask + j), m8);
+ j += 16;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_sse4_1(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1;
+ const int mask_base = 38;
+ int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ const __m128i round_const = _mm_set1_epi16((1 << round) >> 1);
+ const __m128i mask_base_16 = _mm_set1_epi16(mask_base);
+ const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i add_const =
+ _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0));
+ const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1));
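+ // For the inverse mask, 64 - m is folded into the loop as (-m) +
+ // AOM_BLEND_A64_MAX_ALPHA using _mm_sign_epi16 with add_sign = -1; for the
+ // normal mask add_sign = 1 and add_const = 0, so m passes through unchanged.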
+
+ int i, j;
+ // Adding the rounding constant can overflow, but that much precision is not
+ // required. The code should also work for other values of DIFF_FACTOR_LOG2
+ // and AOM_BLEND_A64_MAX_ALPHA, but corner cases have not been verified.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data_src0 =
+ _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]);
+ const __m128i data_src1 =
+ _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]);
+
+ const __m128i diffa = _mm_subs_epu16(data_src0, data_src1);
+ const __m128i diffb = _mm_subs_epu16(data_src1, data_src0);
+ const __m128i diff = _mm_max_epu16(diffa, diffb);
+ const __m128i diff_round =
+ _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round);
+ const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16);
+ __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff);
+ // clamping to 0 can be skipped since we use saturating add instructions
+
+ const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign);
+ const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const);
+
+ // 8 bit conversion and saturation to uint8
+ const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16);
+
+ // Store values into the destination buffer
+ __m128i *const dst = (__m128i *)&mask[i * w + j];
+
+ if ((w - j) > 4) {
+ _mm_storel_epi64(dst, res_8);
+ } else { // w==4
+ *(uint32_t *)dst = _mm_cvtsi128_si32(res_8);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c
new file mode 100644
index 000000000..cf684447c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/blockd.h"
+
+void av1_build_compound_diffwtd_mask_highbd_ssse3(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 8) {
+ av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride,
+ src1, src1_stride, h, w, bd);
+ } else {
+ assert(bd >= 8);
+ assert((w % 8) == 0);
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ const __m128i x0 = _mm_setzero_si128();
+ const __m128i xAOM_BLEND_A64_MAX_ALPHA =
+ _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m128i xmask_base = _mm_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
new file mode 100644
index 000000000..0aaf1f454
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -0,0 +1,724 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an AVX2 register.
+static __m256i yy256_load_extend_8_32(const void *p) {
+ return _mm256_cvtepu8_epi32(xx_loadl_64(p));
+}
+
+// Load 8 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an AVX2 register.
+static __m256i yy256_load_extend_16_32(const void *p) {
+ return _mm256_cvtepu16_epi32(xx_loadu_128(p));
+}
+
+// Compute the scan of an AVX2 register holding 8 32-bit integers. If the
+// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ...,
+// x0+x1+...+x7
+//
+// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
+// (assumed small enough to be able to add them without overflow).
+//
+// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
+//
+// x = [h g f e][d c b a]
+// x01 = [g f e 0][c b a 0]
+// x02 = [g+h f+g e+f e][c+d b+c a+b a]
+// x03 = [e+f e 0 0][a+b a 0 0]
+// x04 = [e->h e->g e->f e][a->d a->c a->b a]
+// s = a->d
+// s01 = [a->d a->d a->d a->d]
+// s02 = [a->d a->d a->d a->d][0 0 0 0]
+// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
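+//
+// Scalar equivalent (illustrative): out[0] = x[0]; out[k] = out[k - 1] + x[k]
+// for k = 1..7. The final add of s02 propagates the low-lane total into the
+// high lane, because _mm256_slli_si256 shifts each 128-bit lane
+// independently.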
+static __m256i scan_32(__m256i x) {
+ const __m256i x01 = _mm256_slli_si256(x, 4);
+ const __m256i x02 = _mm256_add_epi32(x, x01);
+ const __m256i x03 = _mm256_slli_si256(x02, 8);
+ const __m256i x04 = _mm256_add_epi32(x02, x03);
+ const int32_t s = _mm256_extract_epi32(x04, 3);
+ const __m128i s01 = _mm_set1_epi32(s);
+ const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);
+ return _mm256_add_epi32(x04, s02);
+}
+
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
+// of 8.
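+//
+// Equivalent scalar recurrence (illustrative):
+//   B[i + 1][j + 1] = B[i][j + 1] + B[i + 1][j] - B[i][j] + src[i][j]
+//   A[i + 1][j + 1] = A[i][j + 1] + A[i + 1][j] - A[i][j] + src[i][j] * src[i][j]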
+
+static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {
+ unsigned int i = 0;
+ for (i = 0; i < (count & 0xffffffe0); i += 32) {
+ _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
+ }
+ for (; i < (count & 0xfffffff8); i += 8) {
+ _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+ }
+ for (; i < count; i++) {
+ dest[i] = 0;
+ }
+ return dest;
+}
+
+static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));
+ memset_zero_avx(B, &zero, (width + 8));
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int ABj = 1 + j;
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+ const __m256i sc1 = scan_32(x1);
+ const __m256i sc2 = scan_32(x2);
+
+ const __m256i row1 =
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));
+ memset_zero_avx(B, &zero, (width + 8));
+
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int ABj = 1 + j;
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);
+
+ const __m256i sc1 = scan_32(x1);
+ const __m256i sc2 = scan_32(x2);
+
+ const __m256i row1 =
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Compute 8 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
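+//
+// This is the usual four-corner integral-image identity (illustrative):
+//   boxsum = II[bottom right] - II[bottom left] - II[top right] + II[top left]
+// evaluated so that it covers a (2r + 1) x (2r + 1) box.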
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+ const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
+ const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
+ const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
+ const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
+ const __m256i u = _mm256_sub_epi32(tr, tl);
+ const __m256i v = _mm256_sub_epi32(br, bl);
+ return _mm256_sub_epi32(v, u);
+}
+
+static __m256i round_for_shift(unsigned shift) {
+ return _mm256_set1_epi32((1 << shift) >> 1);
+}
+
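+// compute_p returns n * sum(x^2) - (sum x)^2 over the (2r + 1)^2 box, i.e.
+// n^2 times the box variance. For bit_depth > 8 both sums are downshifted
+// first so the products stay within 32 bits.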
+static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
+ __m256i an, bb;
+ if (bit_depth > 8) {
+ const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
+ const __m256i rounding_b = round_for_shift(bit_depth - 8);
+ const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+ const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+ const __m256i a =
+ _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
+ const __m256i b =
+ _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
+ // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+ // mullo to square it
+ bb = _mm256_madd_epi16(b, b);
+ an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
+ } else {
+ bb = _mm256_madd_epi16(sum1, sum1);
+ an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
+ }
+ return _mm256_sub_epi32(an, bb);
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
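+//
+// For each pixel the function derives the guided-filter pair (a, b): z is a
+// scaled measure of the box variance, a = x_by_xplus1[z] (roughly
+// SGRPROJ_SGR * z / (z + 1)), and b is roughly (SGRPROJ_SGR - a) times the
+// box mean, both in SGRPROJ fixed point.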
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m256i z = _mm256_min_epi32(
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement (<= 2^8) and
+ // one_over_n (<= 2^12) are small enough that multiplying them first with a
+ // 16-bit madd cannot overflow.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed as follows:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = (fours + threes) << 2 - threes
+static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
+ const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+ const __m256i xt = yy_loadu_256(buf - stride);
+ const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+ const __m256i xl = yy_loadu_256(buf - 1);
+ const __m256i x = yy_loadu_256(buf);
+ const __m256i xr = yy_loadu_256(buf + 1);
+ const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+ const __m256i xb = yy_loadu_256(buf + stride);
+ const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+ const __m256i fours = _mm256_add_epi32(
+ xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
+ const __m256i threes =
+ _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+
+ return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
+ threes);
+}
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride, const void *dgd8,
+ int dgd_stride, int width, int height, int highbd) {
+ const int nb = 5;
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
+ const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; i += 2) {
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m256i z = _mm256_min_epi32(
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement (<= 2^8) and
+ // one_over_n (<= 2^12) are small enough that multiplying them first with a
+ // 16-bit madd cannot overflow.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+// = 5 * (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ const __m256i xtl = yy_loadu_256(buf - 1 - stride);
+ const __m256i xt = yy_loadu_256(buf - stride);
+ const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+ const __m256i xbl = yy_loadu_256(buf - 1 + stride);
+ const __m256i xb = yy_loadu_256(buf + stride);
+ const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+ const __m256i fives =
+ _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+ const __m256i sixes = _mm256_add_epi32(xt, xb);
+ const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+ return _mm256_add_epi32(
+ _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+ fives_plus_sixes),
+ sixes);
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
+ const __m256i xl = yy_loadu_256(buf - 1);
+ const __m256i x = yy_loadu_256(buf);
+ const __m256i xr = yy_loadu_256(buf + 1);
+
+ const __m256i fives = _mm256_add_epi32(xl, xr);
+ const __m256i sixes = x;
+
+ const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);
+
+ return _mm256_add_epi32(
+ _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
+ fives_plus_sixes),
+ sixes);
+}
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
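+//
+// For each pixel this evaluates
+//   dst = (cross_sum(A) * src + cross_sum(B) + rounding)
+//             >> (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS),
+// where nb is 5 on even rows and 4 on odd rows, matching the total cross-sum
+// weights of 32 and 16 respectively.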
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;
+ const int nb1 = 4;
+
+ const __m256i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m256i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a =
+ cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+ const __m256i b =
+ cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 8) {
+ const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+ const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+}
+
+int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
+ // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
+ // Ctl and Dtl is 32-byte aligned.
+ const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
+
+ int32_t *buf = aom_memalign(
+ 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));
+ if (!buf) return -1;
+
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 32 bytes for efficiency.
+ int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
+
+  // The "tl" pointers point at the top-left of the initialised data for the
+  // array. Adding 7 here, together with the 32-byte aligned buffer and
+  // buf_elts being a multiple of 8, keeps column 1 of each array 32-byte
+  // aligned.
+ int32_t *Atl = buf + 0 * buf_elts + 7;
+ int32_t *Btl = buf + 1 * buf_elts + 7;
+ int32_t *Ctl = buf + 2 * buf_elts + 7;
+ int32_t *Dtl = buf + 3 * buf_elts + 7;
+
+ // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+ // there's a zero row and column in A, B (integral images), so we move down
+ // and right one for them.
+ const int buf_diag_border =
+ SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+ int32_t *A0 = Atl + 1 + buf_stride;
+ int32_t *B0 = Btl + 1 + buf_stride;
+ int32_t *C0 = Ctl + 1 + buf_stride;
+ int32_t *D0 = Dtl + 1 + buf_stride;
+
+ // Finally, A, B, C, D point at position (0, 0).
+ int32_t *A = A0 + buf_diag_border;
+ int32_t *B = B0 + buf_diag_border;
+ int32_t *C = C0 + buf_diag_border;
+ int32_t *D = D0 + buf_diag_border;
+
+ const int dgd_diag_border =
+ SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+ const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+ // Generate integral images from the input. C will contain sums of squares; D
+ // will contain just sums
+ if (highbd)
+ integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+ height_ext, Ctl, Dtl, buf_stride);
+ else
+ integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+ buf_stride);
+
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ // Write to flt0 and flt1
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+ assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+ assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ if (params->r[0] > 0) {
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+ sgr_params_idx, 0);
+ final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+ }
+
+ if (params->r[1] > 0) {
+ calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+ 1);
+ final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+ }
+ aom_free(buf);
+ return 0;
+}
+
+void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ const int ret = av1_selfguided_restoration_avx2(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+ decode_xq(xqd, xq, params);
+
+ __m256i xq0 = _mm256_set1_epi32(xq[0]);
+ __m256i xq1 = _mm256_set1_epi32(xq[1]);
+
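+  // For each pixel, with u = src << SGRPROJ_RST_BITS, the output is
+  //   clamp(((u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
+  //          + rounding) >> (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS)),
+  // with each xq[i] * (flt_i - u) term dropped when the corresponding radius
+  // is zero.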
+ for (int i = 0; i < height; ++i) {
+ // Calculate output in batches of 16 pixels
+ for (int j = 0; j < width; j += 16) {
+ const int k = i * width + j;
+ const int m = i * dst_stride + j;
+
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+ __m256i ep_0, ep_1;
+ __m128i src_0, src_1;
+ if (highbd) {
+ src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+ src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
+ ep_0 = _mm256_cvtepu16_epi32(src_0);
+ ep_1 = _mm256_cvtepu16_epi32(src_1);
+ } else {
+ src_0 = xx_loadu_128(dat8ij);
+ ep_0 = _mm256_cvtepu8_epi32(src_0);
+ ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
+ }
+
+ const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
+ const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
+
+ __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+ __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
+
+ const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
+ }
+
+ if (params->r[1] > 0) {
+ const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
+
+ const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
+ }
+
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_0 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_1 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ if (highbd) {
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+        // Note that the pack instruction operates on each 128-bit lane
+        // separately, so the output words come out in the wrong order; we use
+        // a permute function to correct this
+ const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
+ const __m256i res = _mm256_min_epi16(tmp2, max);
+ yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
+ } else {
+ // Pack into 8 bits and clamp to [0, 256)
+        // Note that each pack operates on 128-bit lanes separately, so the
+        // output comes out in the wrong order; permute functions correct this
+ const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i res =
+ _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
+ const __m128i res2 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
+ xx_storeu_128(dst8 + m, res2);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
new file mode 100644
index 000000000..ea3f6d942
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an SSE register.
+static __m128i xx_load_extend_8_32(const void *p) {
+ return _mm_cvtepu8_epi32(xx_loadl_32(p));
+}
+
+// Load 4 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an SSE register.
+static __m128i xx_load_extend_16_32(const void *p) {
+ return _mm_cvtepu16_epi32(xx_loadl_64(p));
+}
+
+// Compute the scan of an SSE register holding 4 32-bit integers. If the
+// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2,
+// x0+x1+x2+x3
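+// For example, an input of {1, 2, 3, 4} produces the scan {1, 3, 6, 10}.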
+static __m128i scan_32(__m128i x) {
+ const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4));
+ return _mm_add_epi32(x01, _mm_slli_si128(x01, 8));
+}
+
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple
+// of 4.
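+//
+// Equivalently, in scalar terms each output sample satisfies
+//   out[i + 1][j + 1] = in[i][j] + out[i][j + 1] + out[i + 1][j] - out[i][j];
+// the loop below computes this with a per-row prefix sum ("scan") plus a
+// running H - D correction carried across each group of four columns.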
+static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {
+ const int ABj = 1 + j;
+
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
+
+ const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride);
+ const __m128i x2 = _mm_madd_epi16(x1, x1);
+
+ const __m128i sc1 = scan_32(x1);
+ const __m128i sc2 = scan_32(x2);
+
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
+ }
+ }
+}
+
+// Compute two integral images from src. B sums elements; A sums their
+// squares. As above, the images are offset by one pixel, so the first row and
+// column of each will be zero.
+//
+// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple
+// of 4.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {
+ const int ABj = 1 + j;
+
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);
+
+ const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride);
+ const __m128i x2 = _mm_madd_epi16(x1, x1);
+
+ const __m128i sc1 = scan_32(x1);
+ const __m128i sc2 = scan_32(x2);
+
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
+ }
+ }
+}
+
+// Compute 4 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
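+//
+// The (2r + 1) x (2r + 1) box sum is recovered with the usual four-corner
+// rule for integral images: br - bl - tr + tl.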
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+ const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride);
+ const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride);
+ const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride);
+ const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride);
+ const __m128i u = _mm_sub_epi32(tr, tl);
+ const __m128i v = _mm_sub_epi32(br, bl);
+ return _mm_sub_epi32(v, u);
+}
+
+static __m128i round_for_shift(unsigned shift) {
+ return _mm_set1_epi32((1 << shift) >> 1);
+}
+
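+// Compute p = n * sum2 - sum1 * sum1 for each lane, i.e. n^2 times the local
+// variance of the window. For bit depths above 8 the sums are first rounded
+// and down-shifted so that the intermediate products fit in 32 bits.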
+static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) {
+ __m128i an, bb;
+ if (bit_depth > 8) {
+ const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8));
+ const __m128i rounding_b = round_for_shift(bit_depth - 8);
+ const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+ const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+ const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a);
+ const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b);
+ // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+ // mullo to square it
+ bb = _mm_madd_epi16(b, b);
+ an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb);
+ } else {
+ bb = _mm_madd_epi16(sum1, sum1);
+ an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n));
+ }
+ return _mm_sub_epi32(an, bb);
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
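+//
+// On exit, A holds the per-pixel filter weight a (on the 0..SGRPROJ_SGR
+// scale) and B the corresponding offset term that final_filter() below
+// combines with the source pixels.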
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+
+ const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
+
+ const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m128i z = _mm_min_epi32(
+ _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm_set1_epi32(255));
+
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+ x_by_xplus1[_mm_extract_epi32(z, 2)],
+ x_by_xplus1[_mm_extract_epi32(z, 1)],
+ x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+ xx_storeu_128(A + i * buf_stride + j, a_res);
+
+ const __m128i a_complement =
+ _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+      // sum1 might have lanes greater than 2^15, so we can't use madd to do
+      // multiplication involving sum1. However, a_complement and one_over_n
+      // both fit comfortably in 16 bits (with the upper halves of their
+      // 32-bit lanes zero), so we can multiply them together first.
+ const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+ const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+ const __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+ xx_storeu_128(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = (fours + threes) << 2 - threes
+static INLINE __m128i cross_sum(const int32_t *buf, int stride) {
+ const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+ const __m128i xt = xx_loadu_128(buf - stride);
+ const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+ const __m128i xl = xx_loadu_128(buf - 1);
+ const __m128i x = xx_loadu_128(buf);
+ const __m128i xr = xx_loadu_128(buf + 1);
+ const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+ const __m128i xb = xx_loadu_128(buf + stride);
+ const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+ const __m128i fours = _mm_add_epi32(
+ xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x))));
+ const __m128i threes =
+ _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+
+ return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes);
+}
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
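+//
+// nb is 5 because the cross-sum weights (five pixels with weight 4 and four
+// with weight 3) total 32 = 2^5, so the shift below also normalises the
+// weighted average.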
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride, const void *dgd8,
+ int dgd_stride, int width, int height, int highbd) {
+ const int nb = 5;
+ const __m128i rounding =
+ round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride);
+ const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ }
+}
+
+// Assumes that C, D are integral images for the original buffer which has been
+// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+// on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+ // one_over_n[n-1] is 2^12/n, so easily fits in an int16
+ const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+
+ const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; i += 2) {
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
+
+ const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m128i z = _mm_min_epi32(
+ _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm_set1_epi32(255));
+
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+ x_by_xplus1[_mm_extract_epi32(z, 2)],
+ x_by_xplus1[_mm_extract_epi32(z, 1)],
+ x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+ xx_storeu_128(A + i * buf_stride + j, a_res);
+
+ const __m128i a_complement =
+ _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+ const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+ const __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+ xx_storeu_128(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+//           = 5 * (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ const __m128i xtl = xx_loadu_128(buf - 1 - stride);
+ const __m128i xt = xx_loadu_128(buf - stride);
+ const __m128i xtr = xx_loadu_128(buf + 1 - stride);
+ const __m128i xbl = xx_loadu_128(buf - 1 + stride);
+ const __m128i xb = xx_loadu_128(buf + stride);
+ const __m128i xbr = xx_loadu_128(buf + 1 + stride);
+
+ const __m128i fives =
+ _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl)));
+ const __m128i sixes = _mm_add_epi32(xt, xb);
+ const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+ return _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+ sixes);
+}
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) {
+ const __m128i xl = xx_loadu_128(buf - 1);
+ const __m128i x = xx_loadu_128(buf);
+ const __m128i xr = xx_loadu_128(buf + 1);
+
+ const __m128i fives = _mm_add_epi32(xl, xr);
+ const __m128i sixes = x;
+
+ const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes);
+
+ return _mm_add_epi32(
+ _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes),
+ sixes);
+}
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;
+ const int nb1 = 4;
+
+ const __m128i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m128i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a =
+ cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+ const __m128i b =
+ cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 4) {
+ const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+ const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+}
+
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+ int height, int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
+ int32_t *buf = (int32_t *)aom_memalign(
+ 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+ if (!buf) return -1;
+ memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS);
+
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes for efficiency.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+
+ // The "tl" pointers point at the top-left of the initialised data for the
+ // array. Adding 3 here ensures that column 1 is 16-byte aligned.
+ int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3;
+ int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3;
+
+ // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+ // there's a zero row and column in A, B (integral images), so we move down
+ // and right one for them.
+ const int buf_diag_border =
+ SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+ int32_t *A0 = Atl + 1 + buf_stride;
+ int32_t *B0 = Btl + 1 + buf_stride;
+ int32_t *C0 = Ctl + 1 + buf_stride;
+ int32_t *D0 = Dtl + 1 + buf_stride;
+
+ // Finally, A, B, C, D point at position (0, 0).
+ int32_t *A = A0 + buf_diag_border;
+ int32_t *B = B0 + buf_diag_border;
+ int32_t *C = C0 + buf_diag_border;
+ int32_t *D = D0 + buf_diag_border;
+
+ const int dgd_diag_border =
+ SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+ const uint8_t *dgd0 = dgd8 - dgd_diag_border;
+
+ // Generate integral images from the input. C will contain sums of squares; D
+ // will contain just sums
+ if (highbd)
+ integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+ height_ext, Ctl, Dtl, buf_stride);
+ else
+ integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+ buf_stride);
+
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ // Write to flt0 and flt1
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+ assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+ assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ if (params->r[0] > 0) {
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+ sgr_params_idx, 0);
+ final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+ }
+
+ if (params->r[1] > 0) {
+ calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+ 1);
+ final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+ }
+ aom_free(buf);
+ return 0;
+}
+
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ const int ret = av1_selfguided_restoration_sse4_1(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+ decode_xq(xqd, xq, params);
+
+ __m128i xq0 = _mm_set1_epi32(xq[0]);
+ __m128i xq1 = _mm_set1_epi32(xq[1]);
+
+ for (int i = 0; i < height; ++i) {
+ // Calculate output in batches of 8 pixels
+ for (int j = 0; j < width; j += 8) {
+ const int k = i * width + j;
+ const int m = i * dst_stride + j;
+
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+ __m128i src;
+ if (highbd) {
+ src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+ } else {
+ src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij));
+ }
+
+ const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
+ const __m128i u_0 = _mm_cvtepu16_epi32(u);
+ const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8));
+
+ __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+ __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0);
+ v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0));
+
+ const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1);
+ v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1));
+ }
+
+ if (params->r[1] > 0) {
+ const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0);
+ v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0));
+
+ const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1);
+ v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1));
+ }
+
+ const __m128i rounding =
+ round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+ SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ if (highbd) {
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+ const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+ const __m128i res = _mm_min_epi16(tmp, max);
+ xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
+ } else {
+ // Pack into 8 bits and clamp to [0, 256)
+ const __m128i tmp = _mm_packs_epi32(w_0, w_1);
+ const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
+ xx_storel_64(dst8 + m, res);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
new file mode 100644
index 000000000..b810cea2e
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -0,0 +1,942 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+ * Each coefficient is stored in 8 bits instead of 16 bits
+ * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+ This is done in order to avoid overflow: Since the tap with the largest
+ coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ convolve functions.
+
+ Instead, we use the summation order
+ ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the
+ coefficients into the correct order more quickly.
+*/
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+ filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+
+#else
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
+ // dummy (replicate row index 95)
+ { 0, 0, 4, -3, 0, -1, 127, 1},
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
+
+// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
+// in an SSE register into two sequences:
+// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
+// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
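+//
+// The duplicated entries give the sliding (pixel, pixel + 2)-style pairs that
+// _mm_maddubs_epi16 multiplies against pairs of filter taps in
+// filter_src_pixels() below.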
+static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
+ 8, 10, 10, 12, 12, 14, 14, 0 };
+static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
+ 9, 11, 11, 13, 13, 15, 15, 0 };
+
+static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1 };
+
+static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
+ 2, 3, 2, 3, 2, 3, 2, 3 };
+
+static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
+ 4, 5, 4, 5, 4, 5, 4, 5 };
+
+static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
+ 6, 7, 6, 7, 6, 7, 6, 7 };
+
+static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3 };
+static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
+ 4, 5, 6, 7, 4, 5, 6, 7 };
+static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
+ 8, 9, 10, 11, 8, 9, 10, 11 };
+static const uint8_t shuffle_gamma0_mask3[16] = {
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz, int k) {
+ const __m128i src_even =
+ _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)even_mask));
+ const __m128i src_odd =
+ _mm_shuffle_epi8(src, _mm_loadu_si128((__m128i *)odd_mask));
+ // The pixel order we need for 'src' is:
+ // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+ const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
+ const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
+ // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+ const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
+ _mm_srli_si128(src_odd, 4));
+ const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
+ // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+ const __m128i src_13 =
+ _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
+ const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
+ // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+ const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
+ _mm_srli_si128(src_even, 6));
+ const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);
+
+ const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
+ ((1 << reduce_bits_horiz) >> 1));
+
+ // Note: The values res_02 + res_46 and res_13 + res_57 both
+ // fit into int16s at this point, but their sum may be too wide to fit
+ // into an int16. However, once we also add round_const, the sum of
+ // all of these fits into a uint16.
+ //
+ // The wrapping behaviour of _mm_add_* is used here to make sure we
+ // get the correct result despite converting between different
+ // (implicit) types.
+ const __m128i res_even = _mm_add_epi16(res_02, res_46);
+ const __m128i res_odd = _mm_add_epi16(res_13, res_57);
+ const __m128i res =
+ _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
+ tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_1 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_2 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_3 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_4 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_5 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_6 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
+ const __m128i tmp_7 = _mm_loadl_epi64(
+ (__m128i *)&filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);
+
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
+ const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
+ const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
+ const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
+ // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
+ const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);
+
+ // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
+ const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
+ // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
+ // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
+ const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
+ // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);
+
+ // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+ coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+ coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+ // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+ coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+ coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
+ __m128i *coeff) {
+ // Filter even-index pixels
+ const __m128i tmp_0 =
+ _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+
+ // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
+ coeff[0] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask01));
+ // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
+ coeff[1] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask23));
+ // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
+ coeff[2] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask45));
+ // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
+ coeff[3] = _mm_shuffle_epi8(
+ tmp_0, _mm_loadu_si128((__m128i *)shuffle_alpha0_mask67));
+}
+
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+ int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+}
+
+static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
+ int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta,
+ int p_height, int height, int i,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+}
+
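+// Specialised variants of the horizontal filter: alpha == 0 means all eight
+// output pixels in a row share one filter, so a single coefficient set can be
+// broadcast, and beta == 0 means sx does not change from row to row, so the
+// coefficients can be prepared once outside the row loop.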
+static INLINE void warp_horizontal_filter_alpha0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0(sx, coeff);
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void warp_horizontal_filter_beta0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ int k;
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff(alpha, sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void warp_horizontal_filter_alpha0_beta0(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;
+ (void)alpha;
+ int k;
+
+ __m128i coeff[4];
+ prepare_horizontal_filter_coeff_alpha0(sx4, coeff);
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void unpack_weights_and_set_round_const(
+ ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+ __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
+ *res_sub_const =
+ _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ *wt = _mm_unpacklo_epi16(wt0, wt1);
+}
+
+static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
+ __m128i *coeffs) {
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // even coeffs
+ coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ // odd coeffs
+ coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
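+// gamma == 0: all 8 columns share the same vertical filter, so a single load
+// plus a set of byte shuffles produces the coefficient registers; the
+// odd-column coefficients are simply copies of the even-column ones.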
+static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
+ __m128i *coeffs) {
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+ // even coeffs
+ coeffs[0] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask0));
+ coeffs[1] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask1));
+ coeffs[2] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask2));
+ coeffs[3] =
+ _mm_shuffle_epi8(tmp_0, _mm_loadu_si128((__m128i *)shuffle_gamma0_mask3));
+
+ // odd coeffs
+ coeffs[4] = coeffs[0];
+ coeffs[5] = coeffs[1];
+ coeffs[6] = coeffs[2];
+ coeffs[7] = coeffs[3];
+}
+
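+// Apply the 8-tap vertical filter to the 16-bit intermediate rows in 'tmp',
+// producing 32-bit sums for the low (pixels 0-3) and high (pixels 4-7)
+// halves of the output row.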
+static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
+ __m128i *res_lo, __m128i *res_hi,
+ int k) {
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4);
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+
+ const __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);
+
+ const __m128i res_odd =
+ _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+}
+
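+// Round the vertical filter output and store it. For compound prediction the
+// rounded 16-bit result is either written to conv_params->dst (first pass)
+// or, when averaging, combined with the prediction already stored there
+// (optionally using the distance weights) and reduced to 8-bit pixels in
+// 'pred'. In the non-compound case the result is rounded straight down to
+// 8 bits, writing only 4 pixels for 4-wide blocks.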
+static INLINE void store_vertical_filter_output(
+ __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
+ const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
+ uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
+ const int reduce_bits_vert, int p_stride, int p_width,
+ const int round_bits) {
+ __m128i res_lo_1 = *res_lo;
+ __m128i res_hi_1 = *res_hi;
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
+ __m128i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ const __m128i p_16 = _mm_loadl_epi64(p);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+ }
+
+ res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
+
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+ *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
+ } else {
+ _mm_storel_epi64(p, temp_lo_16);
+ }
+ if (p_width > 4) {
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+ res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
+ __m128i res_hi_16;
+
+ if (conv_params->do_average) {
+ __m128i *const dst8_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
+
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+ *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
+
+ } else {
+ _mm_storel_epi64(p4, temp_hi_16);
+ }
+ }
+ } else {
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+ } else {
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+}
+
+static INLINE void warp_vertical_filter(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs(gamma, sy, coeffs);
+
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ int k;
+ (void)gamma;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
+
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_delta0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ (void)delta;
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
+static INLINE void warp_vertical_filter_gamma0_delta0(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ (void)delta;
+ (void)gamma;
+ int k;
+ __m128i res_sub_const, round_bits_const, wt;
+ unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+ &res_sub_const, &round_bits_const, &wt);
+
+ __m128i coeffs[8];
+ prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ __m128i res_lo;
+ __m128i res_hi;
+ filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);
+
+ store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
+ &res_sub_const, &round_bits_const, pred,
+ conv_params, i, j, k, reduce_bits_vert,
+ p_stride, p_width, round_bits);
+ }
+}
+
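+// Pick the specialized vertical filter: gamma == 0 means the filter does not
+// vary across columns, and delta == 0 means it does not vary across rows (so
+// the coefficients can be hoisted out of the row loop).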
+static INLINE void prepare_warp_vertical_filter(
+ uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+ int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+ int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+ const int round_bits, const int offset_bits) {
+ if (gamma == 0 && delta == 0)
+ warp_vertical_filter_gamma0_delta0(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
+ sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+ else if (gamma == 0 && delta != 0)
+ warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+ else if (gamma != 0 && delta == 0)
+ warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+ else
+ warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
+ p_stride, p_width, i, j, sy4, reduce_bits_vert,
+ res_add_const, round_bits, offset_bits);
+}
+
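+// Same dispatch for the horizontal pass: alpha == 0 means one filter covers
+// all 8 output pixels of a row, and beta == 0 means the filter offset does
+// not change from row to row.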
+static INLINE void prepare_warp_horizontal_filter(
+ const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)
+ warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else if (alpha == 0 && beta != 0)
+ warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+ else if (alpha != 0 && beta == 0)
+ warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+ else
+ warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+}
+
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m128i tmp[15];
+ int i, j, k;
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+ __m128i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
+
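+  // Process the prediction block in 8x8 tiles. For each tile, the affine
+  // model (mat[2..5] plus the translation mat[0], mat[1]) maps the tile
+  // centre into the reference frame; ix4/iy4 hold the integer part of that
+  // position and sx4/sy4 the fractional part in WARPEDMODEL_PREC_BITS
+  // precision.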
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_right);
+ }
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+ } else {
+ prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+
+ // Vertical filter
+ prepare_warp_vertical_filter(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+ j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
new file mode 100644
index 000000000..87a6e1239
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
+// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const ConvolveParams *conv_params) {
+ const int bd = 8;
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 2;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
+
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+ const __m256i clamp_low = zero_256;
+ const __m256i clamp_high =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
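+  // Two-pass separable filter: the horizontal pass writes clamped 16-bit
+  // intermediates into 'temp', and the vertical pass below reduces them to
+  // 8-bit output pixels.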
+ /* Horizontal filter */
+ {
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint8_t *data_ij = src_ptr + i * src_stride + j;
+
+ // Load 8-bit src data
+ const __m128i data_0 = xx_loadu_128(data_ij + 0);
+ const __m128i data_1 = xx_loadu_128(data_ij + 1);
+ const __m128i data_2 = xx_loadu_128(data_ij + 2);
+ const __m128i data_3 = xx_loadu_128(data_ij + 3);
+ const __m128i data_4 = xx_loadu_128(data_ij + 4);
+ const __m128i data_5 = xx_loadu_128(data_ij + 5);
+ const __m128i data_6 = xx_loadu_128(data_ij + 6);
+ const __m128i data_7 = xx_loadu_128(data_ij + 7);
+
+ // (Zero-)Extend 8-bit data to 16-bit data
+ const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
+ const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
+ const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
+ const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
+ const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
+ const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
+ const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
+ const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        // Calculate the scalar product for even- and odd-index pixels
+        // separately, increasing to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
+ const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
+
+        // Filter the even-index pixels, increasing to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+        // Filter the odd-index pixels, increasing to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+
+ // Reduce to 8-bit precision. This messes up the order:
+ // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
+ // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
+ const __m256i res_8bit =
+ _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
+
+        // Swap the two central 64-bit values to get the order:
+ // [ - - - - - - - - - - - - - - - - ]
+ // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
+ const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
+
+ // Store the lower 128-bit lane in the dst array
+ xx_storeu_128(dst + i * dst_stride + j,
+ _mm256_castsi256_si128(res_8bit2));
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
new file mode 100644
index 000000000..f9d00b733
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const ConvolveParams *conv_params) {
+ const int bd = 8;
+ assert(x_step_q4 == 16 && y_step_q4 == 16);
+ assert(!(w & 7));
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 2;
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
+ int i, j;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+ const __m128i zero = _mm_setzero_si128();
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
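+  // Same two-pass structure as the AVX2 version: horizontal filter into the
+  // 16-bit 'temp' buffer (clamped to the Wiener intermediate range), then a
+  // vertical filter from 'temp' down to 8-bit output.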
+ /* Horizontal filter */
+ {
+ const __m128i coeffs_x =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ conv_params->round_0);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ conv_params->round_0);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_min_epi16(
+ _mm_max_epi16(res, zero),
+ _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1));
+ _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m128i coeffs_y =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/decoder/accounting.c b/third_party/aom/av1/decoder/accounting.c
new file mode 100644
index 000000000..8d8f3dfdb
--- /dev/null
+++ b/third_party/aom/av1/decoder/accounting.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "av1/decoder/accounting.h"
+
+static int aom_accounting_hash(const char *str) {
+ uint32_t val;
+ const unsigned char *ustr;
+ val = 0;
+ ustr = (const unsigned char *)str;
+ /* This is about the worst hash one can design, but it should be good enough
+ here. */
+ while (*ustr) val += *ustr++;
+ return val % AOM_ACCOUNTING_HASH_SIZE;
+}
+
+/* Dictionary lookup based on an open-addressing hash table. */
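+/* Collisions are resolved with linear probing and entries are never removed;
+   aom_accounting_init() asserts that the table has more than twice
+   MAX_SYMBOL_TYPES slots, so an empty slot is always found and the probe
+   loop terminates. */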
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) {
+ int hash;
+ size_t len;
+ AccountingDictionary *dictionary;
+ dictionary = &accounting->syms.dictionary;
+ hash = aom_accounting_hash(str);
+ while (accounting->hash_dictionary[hash] != -1) {
+ if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) {
+ return accounting->hash_dictionary[hash];
+ }
+ hash++;
+ if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0;
+ }
+ /* No match found. */
+ assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES);
+ accounting->hash_dictionary[hash] = dictionary->num_strs;
+ len = strlen(str);
+ dictionary->strs[dictionary->num_strs] = malloc(len + 1);
+ snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str);
+ dictionary->num_strs++;
+ return dictionary->num_strs - 1;
+}
+
+void aom_accounting_init(Accounting *accounting) {
+ int i;
+ accounting->num_syms_allocated = 1000;
+ accounting->syms.syms =
+ malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+ accounting->syms.dictionary.num_strs = 0;
+ assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES);
+ for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++)
+ accounting->hash_dictionary[i] = -1;
+ aom_accounting_reset(accounting);
+}
+
+void aom_accounting_reset(Accounting *accounting) {
+ accounting->syms.num_syms = 0;
+ accounting->syms.num_binary_syms = 0;
+ accounting->syms.num_multi_syms = 0;
+ accounting->context.x = -1;
+ accounting->context.y = -1;
+ accounting->last_tell_frac = 0;
+}
+
+void aom_accounting_clear(Accounting *accounting) {
+ int i;
+ AccountingDictionary *dictionary;
+ free(accounting->syms.syms);
+ dictionary = &accounting->syms.dictionary;
+ for (i = 0; i < dictionary->num_strs; i++) {
+ free(dictionary->strs[i]);
+ }
+}
+
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) {
+ accounting->context.x = x;
+ accounting->context.y = y;
+}
+
+void aom_accounting_record(Accounting *accounting, const char *str,
+ uint32_t bits) {
+ AccountingSymbol sym;
+ // Reuse previous symbol if it has the same context and symbol id.
+ if (accounting->syms.num_syms) {
+ AccountingSymbol *last_sym;
+ last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1];
+ if (memcmp(&last_sym->context, &accounting->context,
+ sizeof(AccountingSymbolContext)) == 0) {
+ uint32_t id;
+ id = aom_accounting_dictionary_lookup(accounting, str);
+ if (id == last_sym->id) {
+ last_sym->bits += bits;
+ last_sym->samples++;
+ return;
+ }
+ }
+ }
+ sym.context = accounting->context;
+ sym.samples = 1;
+ sym.bits = bits;
+ sym.id = aom_accounting_dictionary_lookup(accounting, str);
+ assert(sym.id <= 255);
+ if (accounting->syms.num_syms == accounting->num_syms_allocated) {
+ accounting->num_syms_allocated *= 2;
+ accounting->syms.syms =
+ realloc(accounting->syms.syms,
+ sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+ assert(accounting->syms.syms != NULL);
+ }
+ accounting->syms.syms[accounting->syms.num_syms++] = sym;
+}
+
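+// Symbol bit counts are accumulated in 1/8-bit units (AOM_ACCT_BITRES), so
+// the dump divides by 8 to report whole bits.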
+void aom_accounting_dump(Accounting *accounting) {
+ int i;
+ AccountingSymbol *sym;
+ printf("\n----- Number of recorded syntax elements = %d -----\n",
+ accounting->syms.num_syms);
+ printf("----- Total number of symbol calls = %d (%d binary) -----\n",
+ accounting->syms.num_multi_syms + accounting->syms.num_binary_syms,
+ accounting->syms.num_binary_syms);
+ for (i = 0; i < accounting->syms.num_syms; i++) {
+ sym = &accounting->syms.syms[i];
+ printf("%s x: %d, y: %d bits: %f samples: %d\n",
+ accounting->syms.dictionary.strs[sym->id], sym->context.x,
+ sym->context.y, (float)sym->bits / 8.0, sym->samples);
+ }
+}
diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h
new file mode 100644
index 000000000..288e5e63e
--- /dev/null
+++ b/third_party/aom/av1/decoder/accounting.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_DECODER_ACCOUNTING_H_
+#define AOM_AV1_DECODER_ACCOUNTING_H_
+#include <stdlib.h>
+#include "aom/aomdx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define AOM_ACCOUNTING_HASH_SIZE (1021)
+
+/* Max number of entries for symbol types in the dictionary (increase as
+ necessary). */
+#define MAX_SYMBOL_TYPES (256)
+
+/* The resolution of fractional-precision bit usage measurements, i.e.,
+   3 => 1/8th bits. */
+#define AOM_ACCT_BITRES (3)
+
+typedef struct {
+ int16_t x;
+ int16_t y;
+} AccountingSymbolContext;
+
+typedef struct {
+ AccountingSymbolContext context;
+ uint32_t id;
+ /** Number of bits in units of 1/8 bit. */
+ uint32_t bits;
+ uint32_t samples;
+} AccountingSymbol;
+
+/** Dictionary for translating strings into ids. */
+typedef struct {
+ char *(strs[MAX_SYMBOL_TYPES]);
+ int num_strs;
+} AccountingDictionary;
+
+typedef struct {
+ /** All recorded symbols decoded. */
+ AccountingSymbol *syms;
+  /** Number of syntax elements actually recorded. */
+ int num_syms;
+ /** Raw symbol decoding calls for non-binary values. */
+ int num_multi_syms;
+ /** Raw binary symbol decoding calls. */
+ int num_binary_syms;
+  /** Dictionary for translating strings into ids. */
+ AccountingDictionary dictionary;
+} AccountingSymbols;
+
+struct Accounting {
+ AccountingSymbols syms;
+ /** Size allocated for symbols (not all may be used). */
+ int num_syms_allocated;
+ int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE];
+ AccountingSymbolContext context;
+ uint32_t last_tell_frac;
+};
+
+void aom_accounting_init(Accounting *accounting);
+void aom_accounting_reset(Accounting *accounting);
+void aom_accounting_clear(Accounting *accounting);
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y);
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str);
+void aom_accounting_record(Accounting *accounting, const char *str,
+ uint32_t bits);
+void aom_accounting_dump(Accounting *accounting);
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AV1_DECODER_ACCOUNTING_H_
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
new file mode 100644
index 000000000..31f14b531
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -0,0 +1,5567 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_reader.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_util/aom_thread.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#if CONFIG_INSPECTION
+#include "av1/decoder/inspection.h"
+#endif
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/obmc.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodetxb.h"
+#include "av1/decoder/detokenize.h"
+
+#define ACCT_STR __func__
+
+// This is needed by ext_tile-related unit tests.
+#define EXT_TILE_DEBUG 1
+#define MC_TEMP_BUF_PELS \
+ (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \
+ ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2))
+
+// Checks that the remaining bits start with a 1 and end with 0s.
+// It consumes an additional byte if the reader is already byte-aligned before
+// the check.
+int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ // bit_offset is set to 0 (mod 8) when the reader is already byte aligned
+ int bits_before_alignment = 8 - rb->bit_offset % 8;
+ int trailing = aom_rb_read_literal(rb, bits_before_alignment);
+ if (trailing != (1 << (bits_before_alignment - 1))) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ return 0;
+}
+
+// Use only_chroma = 1 to only set the chroma planes
+static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params,
+ const YV12_BUFFER_CONFIG *const buf,
+ int only_chroma) {
+ if (seq_params->use_highbitdepth) {
+ const int val = 1 << (seq_params->bit_depth - 1);
+ for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+ const int is_uv = plane > 0;
+ uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
+ // Set the first row to neutral grey. Then copy the first row to all
+ // subsequent rows.
+ if (buf->crop_heights[is_uv] > 0) {
+ aom_memset16(base, val, buf->crop_widths[is_uv]);
+ for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+ memcpy(&base[row_idx * buf->strides[is_uv]], base,
+ sizeof(*base) * buf->crop_widths[is_uv]);
+ }
+ }
+ }
+ } else {
+ for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+ const int is_uv = plane > 0;
+ for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+        memset(&buf->buffers[plane][row_idx * buf->strides[is_uv]], 1 << 7,
+ buf->crop_widths[is_uv]);
+ }
+ }
+ }
+}
+
+static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd,
+ aom_reader *const r, int plane,
+ int runit_idx);
+
+static void setup_compound_reference_mode(AV1_COMMON *cm) {
+ cm->comp_fwd_ref[0] = LAST_FRAME;
+ cm->comp_fwd_ref[1] = LAST2_FRAME;
+ cm->comp_fwd_ref[2] = LAST3_FRAME;
+ cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+
+ cm->comp_bwd_ref[0] = BWDREF_FRAME;
+ cm->comp_bwd_ref[1] = ALTREF2_FRAME;
+ cm->comp_bwd_ref[2] = ALTREF_FRAME;
+}
+
+static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
+ return len != 0 && len <= (size_t)(end - start);
+}
+
+static TX_MODE read_tx_mode(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ if (cm->coded_lossless) return ONLY_4X4;
+ return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST;
+}
+
+static REFERENCE_MODE read_frame_reference_mode(
+ const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ if (frame_is_intra_only(cm)) {
+ return SINGLE_REFERENCE;
+ } else {
+ return aom_rb_read_bit(rb) ? REFERENCE_MODE_SELECT : SINGLE_REFERENCE;
+ }
+}
+
+static void inverse_transform_block(MACROBLOCKD *xd, int plane,
+ const TX_TYPE tx_type,
+ const TX_SIZE tx_size, uint8_t *dst,
+ int stride, int reduced_tx_set) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = pd->dqcoeff;
+ eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+ uint16_t scan_line = eob_data->max_scan_line;
+ uint16_t eob = eob_data->eob;
+
+ memcpy(dqcoeff, pd->dqcoeff_block + xd->cb_offset[plane],
+ (scan_line + 1) * sizeof(dqcoeff[0]));
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride,
+ eob, reduced_tx_set);
+ memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
+}
+
+static void read_coeffs_tx_intra_block(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,
+ const int row, const int col,
+ const TX_SIZE tx_size) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ if (!mbmi->skip) {
+#if TXCOEFF_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+ av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size);
+#if TXCOEFF_TIMER
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ cm->txcoeff_timer += elapsed_time;
+ ++cm->txb_count;
+#endif
+ }
+}
+
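+// No-op stand-ins for the per-block decode callbacks. ThreadData stores these
+// as visitor function pointers, so a pass that does not need coefficient
+// reading, inter prediction or CFL storage can still call through the same
+// hooks unconditionally.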
+static void decode_block_void(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,
+ const int row, const int col,
+ const TX_SIZE tx_size) {
+ (void)cm;
+ (void)xd;
+ (void)r;
+ (void)plane;
+ (void)row;
+ (void)col;
+ (void)tx_size;
+}
+
+static void predict_inter_block_void(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ (void)cm;
+ (void)xd;
+ (void)mi_row;
+ (void)mi_col;
+ (void)bsize;
+}
+
+static void cfl_store_inter_block_void(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd) {
+ (void)cm;
+ (void)xd;
+}
+
+static void predict_and_reconstruct_intra_block(
+ const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+ const int plane, const int row, const int col, const TX_SIZE tx_size) {
+ (void)r;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ PLANE_TYPE plane_type = get_plane_type(plane);
+
+ av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+
+ if (!mbmi->skip) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ // tx_type will be read out in av1_read_coeffs_txb_facade
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size,
+ cm->reduced_tx_set_used);
+ eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+ if (eob_data->eob) {
+ uint8_t *dst =
+ &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];
+ inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+ cm->reduced_tx_set_used);
+ }
+ }
+ if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
+ cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type);
+ }
+}
+
+static void inverse_transform_inter_block(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,
+ const int blk_row, const int blk_col,
+ const TX_SIZE tx_size) {
+ (void)r;
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ // tx_type will be read out in av1_read_coeffs_txb_facade
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+
+ uint8_t *dst =
+ &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+ inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+ cm->reduced_tx_set_used);
+#if CONFIG_MISMATCH_DEBUG
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+ pd->subsampling_x, pd->subsampling_y);
+ mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, pixel_c,
+ pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+#endif
+}
+
+static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size,
+ int plane) {
+ xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
+ xd->txb_offset[plane] =
+ xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+}
+
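+// Recursively read and reconstruct one inter transform block. If the coded
+// transform size matches the plane's transform size (chroma never splits),
+// the coefficients are read and inverse transformed through the visitor
+// hooks; otherwise the block is split into smaller transform sizes and each
+// sub-block is processed recursively.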
+static void decode_reconstruct_tx(AV1_COMMON *cm, ThreadData *const td,
+ aom_reader *r, MB_MODE_INFO *const mbmi,
+ int plane, BLOCK_SIZE plane_bsize,
+ int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int *eob_total) {
+ MACROBLOCKD *const xd = &td->xd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+ // Scale to match transform block unit.
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size || plane) {
+ td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+ tx_size);
+
+ td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+ tx_size);
+ eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+ *eob_total += eob_data->eob;
+ set_cb_buffer_offsets(xd, tx_size, plane);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+ assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int sub_step = bsw * bsh;
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr,
+ offsetc, block, sub_txs, eob_total);
+ block += sub_step;
+ }
+ }
+ }
+}
+
+static void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int bw,
+ int bh, int x_mis, int y_mis) {
+ const int num_planes = av1_num_planes(cm);
+
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ const TileInfo *const tile = &xd->tile;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = &cm->mi[offset];
+ // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+ // passing bsize from decode_partition().
+ xd->mi[0]->sb_type = bsize;
+#if CONFIG_RD_DEBUG
+ xd->mi[0]->mi_row = mi_row;
+ xd->mi[0]->mi_col = mi_col;
+#endif
+ xd->cfl.mi_row = mi_row;
+ xd->cfl.mi_col = mi_col;
+
+ assert(x_mis && y_mis);
+ for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0];
+ int idx = cm->mi_stride;
+ for (int y = 1; y < y_mis; ++y) {
+ memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0]));
+ idx += cm->mi_stride;
+ }
+
+ set_plane_n4(xd, bw, bh, num_planes);
+ set_skip_context(xd, mi_row, mi_col, num_planes);
+
+  // Distance of the MB to the various image edges. These are specified in
+  // 1/8th pel units, as they are always compared against values in 1/8th pel
+  // units.
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col, 0, num_planes);
+}
+
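+// Read the mode info for one block and check that the block size is valid
+// under the current chroma subsampling.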
+static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+
+#if CONFIG_ACCOUNTING
+ aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
+#endif
+ set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+ xd->mi[0]->partition = partition;
+ av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+ if (bsize >= BLOCK_8X8 &&
+ (seq_params->subsampling_x || seq_params->subsampling_y)) {
+ const BLOCK_SIZE uv_subsize =
+ ss_size_lookup[bsize][seq_params->subsampling_x]
+ [seq_params->subsampling_y];
+ if (uv_subsize == BLOCK_INVALID)
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid block size.");
+ }
+
+ int reader_corrupted_flag = aom_reader_has_error(r);
+ aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
+}
+
+typedef struct PadBlock {
+ int x0;
+ int x1;
+ int y0;
+ int y1;
+} PadBlock;
+
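+// Copy a reference block into a local buffer, replicating frame-edge pixels
+// where the block extends outside the reference frame (high bit depth
+// version; build_mc_border() below is the 8-bit equivalent).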
+static void highbd_build_mc_border(const uint8_t *src8, int src_stride,
+ uint8_t *dst8, int dst_stride, int x, int y,
+ int b_w, int b_h, int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w) left = b_w;
+
+ if (x + b_w > w) right = x + b_w - w;
+
+ if (right > b_w) right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left) aom_memset16(dst, ref_row[0], left);
+
+ if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+ if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ if (y > 0 && y < h) ref_row += src_stride;
+ } while (--b_h);
+}
+
+static void build_mc_border(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int x, int y, int b_w, int b_h,
+ int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint8_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w) left = b_w;
+
+ if (x + b_w > w) right = x + b_w - w;
+
+ if (right > b_w) right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left) memset(dst, ref_row[0], left);
+
+ if (copy) memcpy(dst + left, ref_row + x + left, copy);
+
+ if (right) memset(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ if (y > 0 && y < h) ref_row += src_stride;
+ } while (--b_h);
+}
+
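+// Decide whether motion-compensation border extension is needed: grow the
+// block by the interpolation filter extent and return 1 if the grown block
+// reaches outside the reference frame.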
+static INLINE int update_extend_mc_border_params(
+ const struct scale_factors *const sf, struct buf_2d *const pre_buf,
+ MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv,
+ int do_warp, int is_intrabc, int *x_pad, int *y_pad) {
+ const int is_scaled = av1_is_scaled(sf);
+ // Get reference width and height.
+ int frame_width = pre_buf->width;
+ int frame_height = pre_buf->height;
+
+ // Do border extension if there is motion or
+ // width/height is not a multiple of 8 pixels.
+ if ((!is_intrabc) && (!do_warp) &&
+ (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) ||
+ (frame_height & 0x7))) {
+ if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+ block->x0 -= AOM_INTERP_EXTEND - 1;
+ block->x1 += AOM_INTERP_EXTEND;
+ *x_pad = 1;
+ }
+
+ if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+ block->y0 -= AOM_INTERP_EXTEND - 1;
+ block->y1 += AOM_INTERP_EXTEND;
+ *y_pad = 1;
+ }
+
+    // Border extension is only needed if the (padded) block reaches outside
+    // the reference frame.
+ if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 ||
+ block->y1 > frame_height - 1) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
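+// If border extension is needed, build a padded copy of the reference block
+// in mc_buf and redirect the prediction source pointer and stride to it.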
+static INLINE void extend_mc_border(const struct scale_factors *const sf,
+ struct buf_2d *const pre_buf,
+ MV32 scaled_mv, PadBlock block,
+ int subpel_x_mv, int subpel_y_mv,
+ int do_warp, int is_intrabc, int highbd,
+ uint8_t *mc_buf, uint8_t **pre,
+ int *src_stride) {
+ int x_pad = 0, y_pad = 0;
+ if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block,
+ subpel_x_mv, subpel_y_mv, do_warp,
+ is_intrabc, &x_pad, &y_pad)) {
+ // Get reference block pointer.
+ const uint8_t *const buf_ptr =
+ pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+ int buf_stride = pre_buf->stride;
+ const int b_w = block.x1 - block.x0;
+ const int b_h = block.y1 - block.y0;
+
+ // Extend the border.
+ if (highbd) {
+ highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0,
+ block.y0, b_w, b_h, pre_buf->width,
+ pre_buf->height);
+ } else {
+ build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w,
+ b_h, pre_buf->width, pre_buf->height);
+ }
+ *src_stride = b_w;
+ *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w +
+ x_pad * (AOM_INTERP_EXTEND - 1);
+ }
+}
+
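+// Compute the sub-pixel interpolation parameters and the reference block
+// bounds for a motion vector, covering both scaled and unscaled references.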
+static INLINE void dec_calc_subpel_params(
+ MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+ int plane, const int pre_x, const int pre_y, int x, int y,
+ struct buf_2d *const pre_buf, SubpelParams *subpel_params, int bw, int bh,
+ PadBlock *block, int mi_x, int mi_y, MV32 *scaled_mv, int *subpel_x_mv,
+ int *subpel_y_mv) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int is_scaled = av1_is_scaled(sf);
+ if (is_scaled) {
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
+ orig_pos_y += mv.row * (1 << (1 - ssy));
+ int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+ subpel_params->xs = sf->x_step_q4;
+ subpel_params->ys = sf->y_step_q4;
+
+ // Get reference block top left coordinate.
+ block->x0 = pos_x >> SCALE_SUBPEL_BITS;
+ block->y0 = pos_y >> SCALE_SUBPEL_BITS;
+
+ // Get reference block bottom right coordinate.
+ block->x1 =
+ ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1;
+ block->y1 =
+ ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1;
+
+ MV temp_mv;
+ temp_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x,
+ pd->subsampling_y);
+ *scaled_mv = av1_scale_mv(&temp_mv, (mi_x + x), (mi_y + y), sf);
+ scaled_mv->row += SCALE_EXTRA_OFF;
+ scaled_mv->col += SCALE_EXTRA_OFF;
+
+ *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK;
+ *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK;
+ } else {
+ // Get block position in current frame.
+ int pos_x = (pre_x + x) << SUBPEL_BITS;
+ int pos_y = (pre_y + y) << SUBPEL_BITS;
+
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+ subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+
+ // Get reference block top left coordinate.
+ pos_x += mv_q4.col;
+ pos_y += mv_q4.row;
+ block->x0 = pos_x >> SUBPEL_BITS;
+ block->y0 = pos_y >> SUBPEL_BITS;
+
+ // Get reference block bottom right coordinate.
+ block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1;
+ block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1;
+
+ scaled_mv->row = mv_q4.row;
+ scaled_mv->col = mv_q4.col;
+ *subpel_x_mv = scaled_mv->col & SUBPEL_MASK;
+ *subpel_y_mv = scaled_mv->row & SUBPEL_MASK;
+ }
+}
+
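+// Build the inter prediction for one plane of a block, including the special
+// path for sub-8x8 chroma blocks whose prediction is drawn from several
+// neighboring luma blocks.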
+static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane,
+ const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw,
+ int bh, int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int is_compound = has_second_ref(mi);
+ int ref;
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ int is_global[2] = { 0, 0 };
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+ (block_size_high[bsize] < 8 && ss_y);
+
+ if (is_intrabc) sub8x8_inter = 0;
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ sub8x8_inter = sub8x8_inter && !build_for_obmc;
+ if (sub8x8_inter) {
+ for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+ if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+ }
+ }
+ }
+
+ if (sub8x8_inter) {
+ // block size
+ const int b4_w = block_size_wide[bsize] >> ss_x;
+ const int b4_h = block_size_high[bsize] >> ss_y;
+ const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+ const int b8_h = block_size_high[plane_bsize] >> ss_y;
+ assert(!is_compound);
+
+ const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
+
+ int row = row_start;
+ int src_stride;
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ is_compound = has_second_ref(this_mbmi);
+ int tmp_dst_stride = 8;
+ assert(bw < 8 || bh < 8);
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ ref = 0;
+ const RefBuffer *ref_buf =
+ &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+ pd->pre[ref].buf0 =
+ (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
+ pd->pre[ref].buf =
+ pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+ ref_buf->buf->uv_stride,
+ &ref_buf->sf);
+ pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+ pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+ pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+ const MV mv = this_mbmi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ PadBlock block;
+ MV32 scaled_mv;
+ int subpel_x_mv, subpel_y_mv;
+ int highbd;
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+ dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf,
+ &subpel_params, bw, bh, &block, mi_x, mi_y,
+ &scaled_mv, &subpel_x_mv, &subpel_y_mv);
+ pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+ src_stride = pre_buf->stride;
+ highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
+ subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
+ &pre, &src_stride);
+ conv_params.do_average = ref;
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ // masked compound type has its own average mechanism
+ conv_params.do_average = 0;
+ }
+
+ av1_make_inter_predictor(
+ pre, src_stride, dst, dst_buf->stride, &subpel_params, sf, b4_w,
+ b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+ (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+ plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+
+ ++col;
+ }
+ ++row;
+ }
+
+ for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
+ return;
+ }
+
+ {
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ uint8_t *pre[2];
+ SubpelParams subpel_params[2];
+ int src_stride[2];
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+ const MV mv = mi->mv[ref].as_mv;
+ PadBlock block;
+ MV32 scaled_mv;
+ int subpel_x_mv, subpel_y_mv;
+ int highbd;
+
+ dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf,
+ &subpel_params[ref], bw, bh, &block, mi_x, mi_y,
+ &scaled_mv, &subpel_x_mv, &subpel_y_mv);
+ pre[ref] = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+ src_stride[ref] = pre_buf->stride;
+ highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+ int do_warp = (bw >= 8 && bh >= 8 &&
+ av1_allow_warp(mi, &warp_types,
+ &xd->global_motion[mi->ref_frame[ref]],
+ build_for_obmc, subpel_params[ref].xs,
+ subpel_params[ref].ys, NULL));
+ do_warp = (do_warp && xd->cur_frame_force_integer_mv == 0);
+
+ extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv, subpel_y_mv,
+ do_warp, is_intrabc, highbd, xd->mc_buf[ref], &pre[ref],
+ &src_stride[ref]);
+ }
+
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+ av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+ &conv_params.bck_offset,
+ &conv_params.use_jnt_comp_avg, is_compound);
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+ conv_params.do_average = ref;
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ // masked compound type has its own average mechanism
+ conv_params.do_average = 0;
+ }
+
+ if (ref && is_masked_compound_type(mi->interinter_comp.type))
+ av1_make_masked_inter_predictor(
+ pre[ref], src_stride[ref], dst, dst_buf->stride,
+ &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters,
+ plane, &warp_types, mi_x >> pd->subsampling_x,
+ mi_y >> pd->subsampling_y, ref, xd, cm->allow_warped_motion);
+ else
+ av1_make_inter_predictor(
+ pre[ref], src_stride[ref], dst, dst_buf->stride,
+ &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters,
+ &warp_types, mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y,
+ plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+ }
+ }
+}
+
+static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm,
+ MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int plane_from,
+ int plane_to) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = pd->width;
+ const int bh = pd->height;
+
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+
+ dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
+ }
+}
+
+static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int mi_row,
+ int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
+
+ if (is_interintra_pred(xd->mi[0])) {
+ BUFFER_SET default_ctx = { { xd->plane[0].dst.buf, NULL, NULL },
+ { xd->plane[0].dst.stride, 0, 0 } };
+ if (!ctx) ctx = &default_ctx;
+ av1_build_interintra_predictors_sbp(cm, xd, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, ctx, 0, bsize);
+ }
+}
+
+static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int mi_row,
+ int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
+ MAX_MB_PLANE - 1);
+
+ if (is_interintra_pred(xd->mi[0])) {
+ BUFFER_SET default_ctx = {
+ { NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf },
+ { 0, xd->plane[1].dst.stride, xd->plane[2].dst.stride }
+ };
+ if (!ctx) ctx = &default_ctx;
+ av1_build_interintra_predictors_sbuv(
+ cm, xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+ xd->plane[1].dst.stride, xd->plane[2].dst.stride, ctx, bsize);
+ }
+}
+
+static void dec_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+ const int num_planes = av1_num_planes(cm);
+ dec_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+ if (num_planes > 1)
+ dec_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+}
+
+static INLINE void dec_build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int above_mi_col = ctxt->mi_col + rel_mi_col;
+ int mi_x, mi_y;
+ MB_MODE_INFO backup_mbmi = *above_mbmi;
+
+ av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+ &backup_mbmi, ctxt, num_planes);
+ mi_x = above_mi_col << MI_SIZE_LOG2;
+ mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+ int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+ block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+ dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
+ mi_y);
+ }
+}
+
+static void dec_build_prediction_by_above_preds(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->up_available) return;
+
+  // Adjust mb_to_bottom_edge to have the correct value for the OBMC
+  // prediction block. This is half the height of the original block,
+  // except for blocks 128 pixels tall, where we only use a height of 32.
+ int this_height = xd->n4_h * MI_SIZE;
+ int pred_height = AOMMIN(this_height / 2, 32);
+ xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
+
+ struct build_prediction_ctxt ctxt = { cm, mi_row,
+ mi_col, tmp_buf,
+ tmp_width, tmp_height,
+ tmp_stride, xd->mb_to_right_edge };
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ foreach_overlappable_nb_above(cm, xd, mi_col,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ dec_build_prediction_by_above_pred, &ctxt);
+
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+ xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+}
+
+static INLINE void dec_build_prediction_by_left_pred(
+ MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int left_mi_row = ctxt->mi_row + rel_mi_row;
+ int mi_x, mi_y;
+ MB_MODE_INFO backup_mbmi = *left_mbmi;
+
+ av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+ &backup_mbmi, ctxt, num_planes);
+ mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+ mi_y = left_mi_row << MI_SIZE_LOG2;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+ block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+ int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+ dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x,
+ mi_y);
+ }
+}
+
+static void dec_build_prediction_by_left_preds(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->left_available) return;
+
+ // Adjust mb_to_right_edge to have the correct value for the OBMC
+ // prediction block. This is half the width of the original block,
+ // except for 128-wide blocks, where we only use a width of 32.
+ int this_width = xd->n4_w * MI_SIZE;
+ int pred_width = AOMMIN(this_width / 2, 32);
+ xd->mb_to_right_edge += (this_width - pred_width) * 8;
+
+ struct build_prediction_ctxt ctxt = { cm, mi_row,
+ mi_col, tmp_buf,
+ tmp_width, tmp_height,
+ tmp_stride, xd->mb_to_bottom_edge };
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ foreach_overlappable_nb_left(cm, xd, mi_row,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ dec_build_prediction_by_left_pred, &ctxt);
+
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_right_edge -= (this_width - pred_width) * 8;
+ xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+}
+
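+// Build the OBMC prediction: form predictions from the above and left
+// neighbors in temporary buffers and blend them into the current block.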
+static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int mi_row,
+ int mi_col) {
+ const int num_planes = av1_num_planes(cm);
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+ dst_buf1[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+ dst_buf1[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+ dst_buf2[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+ dst_buf2[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+ } else {
+ dst_buf1[0] = xd->tmp_obmc_bufs[0];
+ dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+ dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+ dst_buf2[0] = xd->tmp_obmc_bufs[1];
+ dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+ dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
+ }
+ dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+ dst_width1, dst_height1, dst_stride1);
+ dec_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+ dst_width2, dst_height2, dst_stride2);
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+ mi_row, mi_col, 0, num_planes);
+ av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
+ dst_buf2, dst_stride2);
+}
+
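+// Store the block's luma samples for chroma-from-luma prediction when
+// required.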
+static void cfl_store_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ if (store_cfl_required(cm, xd)) {
+ cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
+ }
+}
+
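+// Set up the reference buffers for each reference frame and build the inter
+// (or intra block copy) prediction for the block, including the OBMC blend
+// when the block uses OBMC_CAUSAL motion.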
+static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ if (frame < LAST_FRAME) {
+ assert(is_intrabc_block(mbmi));
+ assert(frame == INTRA_FRAME);
+ assert(ref == 0);
+ } else {
+ RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf,
+ num_planes);
+ }
+ }
+
+ dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ }
+#if CONFIG_MISMATCH_DEBUG
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ int pixel_c, pixel_r;
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x,
+ pd->subsampling_y);
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
+ plane, pixel_c, pixel_r, pd->width, pd->height,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
+}
+
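+// Advance the palette color-index-map offset by the area of this plane's
+// block.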
+static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
+ aom_reader *r) {
+ (void)r;
+ Av1ColorMapParam params;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+ &params.plane_height, NULL, NULL);
+ xd->color_index_map_offset[plane] += params.plane_width * params.plane_height;
+}
+
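+// Decode the coefficients of one block and reconstruct it: intra blocks are
+// predicted and reconstructed per transform block; inter blocks are predicted
+// first and, unless skipped, reconstructed via decode_reconstruct_tx().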
+static void decode_token_recon_block(AV1Decoder *const pbi,
+ ThreadData *const td, int mi_row,
+ int mi_col, aom_reader *r,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &td->xd;
+ const int num_planes = av1_num_planes(cm);
+
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ CFL_CTX *const cfl = &xd->cfl;
+ cfl->is_chroma_reference = is_chroma_reference(
+ mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y);
+
+ if (!is_inter_block(mbmi)) {
+ int row, col;
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ int mu_blocks_wide =
+ block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high =
+ block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+
+ for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += stepr) {
+ for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += stepc) {
+ td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row,
+ blk_col, tx_size);
+ td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row,
+ blk_col, tx_size);
+ set_cb_buffer_offsets(xd, tx_size, plane);
+ }
+ }
+ }
+ }
+ }
+ } else {
+ td->predict_inter_block_visit(cm, xd, mi_row, mi_col, bsize);
+ // Reconstruction
+ if (!mbmi->skip) {
+ int eobtotal = 0;
+
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ int row, col;
+
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ assert(max_unit_bsize ==
+ get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ int mu_blocks_wide =
+ block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high =
+ block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ const BLOCK_SIZE bsizec =
+ scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsizec, pd->subsampling_x, pd->subsampling_y);
+
+ const TX_SIZE max_tx_size =
+ get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int bh_var_tx = tx_size_high_unit[max_tx_size];
+ const int bw_var_tx = tx_size_wide_unit[max_tx_size];
+ int block = 0;
+ int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ int blk_row, blk_col;
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high),
+ pd->subsampling_y);
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide),
+ pd->subsampling_x);
+
+ for (blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += bh_var_tx) {
+ for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += bw_var_tx) {
+ decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize,
+ blk_row, blk_col, block, max_tx_size,
+ &eobtotal);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+ }
+ td->cfl_store_inter_block_visit(cm, xd);
+ }
+
+ av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
+ set_color_index_map_offset);
+}
+
+#if LOOP_FILTER_BITMASK
+static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi);
+#endif
+
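+// Recursively read the transform split decisions for an inter block and
+// record the resulting transform size for every covered transform-block unit.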
+static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+ TX_SIZE tx_size, int depth,
+#if LOOP_FILTER_BITMASK
+ AV1_COMMON *cm, int mi_row, int mi_col,
+#endif
+ int blk_row, int blk_col, aom_reader *r) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ int is_split = 0;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+ assert(tx_size > TX_4X4);
+
+ if (depth == MAX_VARTX_DEPTH) {
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = tx_size;
+ }
+ }
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->sb_type, tx_size);
+ is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);
+
+ if (is_split) {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+ if (sub_txs == TX_4X4) {
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = sub_txs;
+ }
+ }
+ mbmi->tx_size = sub_txs;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, sub_txs, tx_size);
+#if LOOP_FILTER_BITMASK
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, BLOCK_8X8,
+ TX_4X4, mbmi);
+#endif
+ return;
+ }
+#if LOOP_FILTER_BITMASK
+ if (depth + 1 == MAX_VARTX_DEPTH) {
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+ txsize_to_bsize[tx_size], sub_txs, mbmi);
+ }
+#endif
+
+ assert(bsw > 0 && bsh > 0);
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = blk_row + row;
+ int offsetc = blk_col + col;
+ read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1,
+#if LOOP_FILTER_BITMASK
+ cm, mi_row, mi_col,
+#endif
+ offsetr, offsetc, r);
+ }
+ }
+ } else {
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = tx_size;
+ }
+ }
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+#if LOOP_FILTER_BITMASK
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+ txsize_to_bsize[tx_size], tx_size, mbmi);
+#endif
+ }
+}
+
+static TX_SIZE read_selected_tx_size(MACROBLOCKD *xd, aom_reader *r) {
+ // TODO(debargha): Clean up the logic here. This function should only
+ // be called for intra.
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+ const int ctx = get_tx_size_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx],
+ max_depths + 1, ACCT_STR);
+ assert(depth >= 0 && depth <= max_depths);
+ const TX_SIZE tx_size = depth_to_tx_size(depth, bsize);
+ return tx_size;
+}
+
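+// Return the transform size for a block: TX_4X4 for lossless segments, an
+// explicitly coded size when TX_MODE_SELECT applies, and otherwise the size
+// implied by the transform mode.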
+static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
+ int allow_select_inter, aom_reader *r) {
+ const TX_MODE tx_mode = cm->tx_mode;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+
+ if (block_signals_txsize(bsize)) {
+ if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) {
+ const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r);
+ return coded_tx_size;
+ } else {
+ return tx_size_from_tx_mode(bsize, tx_mode);
+ }
+ } else {
+ assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4));
+ return max_txsize_rect_lookup[bsize];
+ }
+}
+
+#if LOOP_FILTER_BITMASK
+static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi) {
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (tx_size > TX_64X64) & (tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
+ mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (tx_size - TX_4X16);
+ }
+ int index = 0;
+ const int row = mi_row % MI_SIZE_64X64;
+ const int col = mi_col % MI_SIZE_64X64;
+ const int shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
+static void store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+  // Use a lookup table that provides one bitmask for a given block size and
+  // a single ("univariant") transform size used across the whole block.
+ int index;
+ int shift;
+ int row;
+ int col;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = mbmi->tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (mbmi->tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size];
+ mask_id =
+ 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (mbmi->tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (mbmi->tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (mbmi->tx_size - TX_4X16);
+ }
+ row = mi_row % MI_SIZE_64X64;
+ col = mi_col % MI_SIZE_64X64;
+ shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
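+// Record the remaining loop filter bitmask information for a block: its
+// horizontal and vertical boundaries, skip flags, and per-row filter levels.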
+static void store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+ int index;
+ int shift;
+ int row;
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const int row_start = mi_row % MI_SIZE_64X64;
+ const int col_start = mi_col % MI_SIZE_64X64;
+ shift = get_index_shift(col_start, row_start, &index);
+ const uint64_t top_edge_mask =
+ ((uint64_t)1 << (shift + mi_size_wide[bsize])) - ((uint64_t)1 << shift);
+ lfm->is_horz_border.bits[index] |= top_edge_mask;
+ const int is_vert_border = mask_id_table_vert_border[bsize];
+ const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
+ for (int i = 0; i + index < 4; ++i) {
+ lfm->is_vert_border.bits[i + index] |=
+ (left_mask_univariant_reordered[is_vert_border].bits[i] << vert_shift);
+ }
+ const int is_skip = mbmi->skip && is_inter_block(mbmi);
+ if (is_skip) {
+ const int is_skip_mask = mask_id_table_tx_4x4[bsize];
+ for (int i = 0; i + index < 4; ++i) {
+ lfm->skip.bits[i + index] |=
+ (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
+ }
+ }
+ const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+ const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+ const uint8_t level_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+ const uint8_t level_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+ for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+ index = 0;
+ row = r % MI_SIZE_64X64;
+ memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_u[row][col_start], level_u,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ memset(&lfm->lfl_v[row][col_start], level_v,
+ sizeof(uint8_t) * mi_size_wide[bsize]);
+ }
+}
+#endif
+
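+// Parse one block: read its mode info, palette tokens, (variable) transform
+// sizes and, when present, per-segment dequantizers, then decode and
+// reconstruct it.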
+static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &td->xd;
+ decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize);
+
+ av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
+ av1_decode_palette_tokens);
+
+ AV1_COMMON *cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
+ if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+ !mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+
+ for (int idy = 0; idy < height; idy += bh)
+ for (int idx = 0; idx < width; idx += bw)
+ read_tx_size_vartx(xd, mbmi, max_tx_size, 0,
+#if LOOP_FILTER_BITMASK
+ cm, mi_row, mi_col,
+#endif
+ idy, idx, r);
+ } else {
+ mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r);
+ if (inter_block_tx)
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
+ mbmi->skip && is_inter_block(mbmi), xd);
+#if LOOP_FILTER_BITMASK
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+ store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
+ } else {
+ for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
+ for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+ store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
+ BLOCK_64X64, mbmi);
+ }
+ }
+ }
+#endif
+ }
+#if LOOP_FILTER_BITMASK
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+ store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi);
+ } else {
+ for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
+ for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+ store_bitmask_other_info(cm, mi_row + row, mi_col + col, BLOCK_64X64,
+ mbmi);
+ }
+ }
+ }
+#endif
+
+ if (cm->delta_q_present_flag) {
+ for (int i = 0; i < MAX_SEGMENTS; i++) {
+ const int current_qindex =
+ av1_get_qindex(&cm->seg, i, xd->current_qindex);
+ for (int j = 0; j < num_planes; ++j) {
+ const int dc_delta_q =
+ j == 0 ? cm->y_dc_delta_q
+ : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
+ const int ac_delta_q =
+ j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);
+ xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(
+ current_qindex, dc_delta_q, cm->seq_params.bit_depth);
+ xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(
+ current_qindex, ac_delta_q, cm->seq_params.bit_depth);
+ }
+ }
+ }
+ if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+
+ decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
+
+ int reader_corrupted_flag = aom_reader_has_error(r);
+ aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
+}
+
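+// Lighter-weight variant of set_offsets() used by the decode-only pass:
+// restore the mode-info pointer, plane sizes, edge distances and destination
+// buffers without re-reading any mode info.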
+static void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
+ ThreadData *const td, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &td->xd;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int num_planes = av1_num_planes(cm);
+
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ const TileInfo *const tile = &xd->tile;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->cfl.mi_row = mi_row;
+ xd->cfl.mi_col = mi_col;
+
+ set_plane_n4(xd, bw, bh, num_planes);
+
+  // Distance of the MB to the various image edges. These are specified in
+  // 1/8th pel units, as they are always compared against values in 1/8th pel
+  // units.
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col, 0, num_planes);
+}
+
+static void decode_block(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+ (void)partition;
+ set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
+ decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
+}
+
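+// Read the partition type for a block, restricting the choice to HORZ/SPLIT
+// or VERT/SPLIT when the block crosses the bottom or right frame edge.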
+static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
+ aom_reader *r, int has_rows, int has_cols,
+ BLOCK_SIZE bsize) {
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!has_rows && !has_cols) return PARTITION_SPLIT;
+
+ assert(ctx >= 0);
+ aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx];
+ if (has_rows && has_cols) {
+ return (PARTITION_TYPE)aom_read_symbol(
+ r, partition_cdf, partition_cdf_length(bsize), ACCT_STR);
+ } else if (!has_rows && has_cols) {
+ assert(bsize > BLOCK_8X8);
+ aom_cdf_prob cdf[2];
+ partition_gather_vert_alike(cdf, partition_cdf, bsize);
+ assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
+ return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ;
+ } else {
+ assert(has_rows && !has_cols);
+ assert(bsize > BLOCK_8X8);
+ aom_cdf_prob cdf[2];
+ partition_gather_horz_alike(cdf, partition_cdf, bsize);
+ assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
+ return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT;
+ }
+}
+
+// TODO(slavarnway): eliminate bsize and subsize in future commits
+static void decode_partition(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ BLOCK_SIZE bsize, int parse_decode_flag) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &td->xd;
+ const int bw = mi_size_wide[bsize];
+ const int hbs = bw >> 1;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+ const int quarter_step = bw / 4;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  // parse_decode_flag takes the following values:
+ // 01 - do parse only
+ // 10 - do decode only
+ // 11 - do parse and decode
+ static const block_visitor_fn_t block_visit[4] = {
+ NULL, parse_decode_block, decode_block, parse_decode_block
+ };
+
+ if (parse_decode_flag & 1) {
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units_per_tile;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;
+ loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx);
+ }
+ }
+ }
+ }
+
+ partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
+ : read_partition(xd, mi_row, mi_col, r,
+ has_rows, has_cols, bsize);
+ } else {
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ }
+ subsize = get_partition_subsize(bsize, partition);
+
+  // Check that the bitstream is conformant: if the chroma planes are
+  // subsampled, subsize must subsample to a valid block size.
+ const struct macroblockd_plane *const pd_u = &xd->plane[1];
+ if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
+ BLOCK_INVALID) {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Block size %dx%d invalid with this subsampling mode",
+ block_size_wide[subsize], block_size_high[subsize]);
+ }
+
+#define DEC_BLOCK_STX_ARG
+#define DEC_BLOCK_EPT_ARG partition,
+#define DEC_BLOCK(db_r, db_c, db_subsize) \
+ block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \
+ DEC_BLOCK_EPT_ARG(db_subsize))
+#define DEC_PARTITION(db_r, db_c, db_subsize) \
+ decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize), \
+ parse_decode_flag)
+
+ switch (partition) {
+ case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
+ case PARTITION_HORZ:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize);
+ break;
+ case PARTITION_VERT:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize);
+ break;
+ case PARTITION_SPLIT:
+ DEC_PARTITION(mi_row, mi_col, subsize);
+ DEC_PARTITION(mi_row, mi_col + hbs, subsize);
+ DEC_PARTITION(mi_row + hbs, mi_col, subsize);
+ DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize);
+ break;
+ case PARTITION_HORZ_A:
+ DEC_BLOCK(mi_row, mi_col, bsize2);
+ DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+ break;
+ case PARTITION_VERT_A:
+ DEC_BLOCK(mi_row, mi_col, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
+ DEC_BLOCK(mi_row, mi_col + hbs, subsize);
+ break;
+ case PARTITION_VERT_B:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+ break;
+ case PARTITION_HORZ_4:
+ for (int i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= cm->mi_rows) break;
+ DEC_BLOCK(this_mi_row, mi_col, subsize);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (int i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= cm->mi_cols) break;
+ DEC_BLOCK(mi_row, this_mi_col, subsize);
+ }
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+
+#undef DEC_PARTITION
+#undef DEC_BLOCK
+#undef DEC_BLOCK_EPT_ARG
+#undef DEC_BLOCK_STX_ARG
+
+ if (parse_decode_flag & 1)
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
+ const size_t read_size,
+ struct aom_internal_error_info *error_info,
+ aom_reader *r, uint8_t allow_update_cdf) {
+  // Validate the calculated partition length. If the buffer described by the
+  // partition can't be fully read, throw an error.
+ if (!read_is_valid(data, read_size, data_end))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+ if (aom_reader_init(r, data, read_size))
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate bool decoder %d", 1);
+
+ r->allow_update_cdf = allow_update_cdf;
+}
+
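+// Read the segmentation parameters from the frame header: the enable flag,
+// the map/data update flags and the per-segment feature data.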
+static void setup_segmentation(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb) {
+ struct segmentation *const seg = &cm->seg;
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+ seg->temporal_update = 0;
+
+ seg->enabled = aom_rb_read_bit(rb);
+ if (!seg->enabled) {
+ if (cm->cur_frame->seg_map)
+ memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ memset(seg, 0, sizeof(*seg));
+ segfeatures_copy(&cm->cur_frame->seg, seg);
+ return;
+ }
+ if (cm->seg.enabled && cm->prev_frame &&
+ (cm->mi_rows == cm->prev_frame->mi_rows) &&
+ (cm->mi_cols == cm->prev_frame->mi_cols)) {
+ cm->last_frame_seg_map = cm->prev_frame->seg_map;
+ } else {
+ cm->last_frame_seg_map = NULL;
+ }
+ // Read update flags
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+ // These frames can't use previous frames, so must signal map + features
+ seg->update_map = 1;
+ seg->temporal_update = 0;
+ seg->update_data = 1;
+ } else {
+ seg->update_map = aom_rb_read_bit(rb);
+ if (seg->update_map) {
+ seg->temporal_update = aom_rb_read_bit(rb);
+ } else {
+ seg->temporal_update = 0;
+ }
+ seg->update_data = aom_rb_read_bit(rb);
+ }
+
+ // Segmentation data update
+ if (seg->update_data) {
+ av1_clearall_segfeatures(seg);
+
+ for (int i = 0; i < MAX_SEGMENTS; i++) {
+ for (int j = 0; j < SEG_LVL_MAX; j++) {
+ int data = 0;
+ const int feature_enabled = aom_rb_read_bit(rb);
+ if (feature_enabled) {
+ av1_enable_segfeature(seg, i, j);
+
+ const int data_max = av1_seg_feature_data_max(j);
+ const int data_min = -data_max;
+ const int ubits = get_unsigned_bits(data_max);
+
+ if (av1_is_segfeature_signed(j)) {
+ data = aom_rb_read_inv_signed_literal(rb, ubits);
+ } else {
+ data = aom_rb_read_literal(rb, ubits);
+ }
+
+ data = clamp(data, data_min, data_max);
+ }
+ av1_set_segdata(seg, i, j, data);
+ }
+ }
+ calculate_segdata(seg);
+ } else if (cm->prev_frame) {
+ segfeatures_copy(seg, &cm->prev_frame->seg);
+ }
+ segfeatures_copy(&cm->cur_frame->seg, seg);
+}
+
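+// Read the per-plane loop restoration types and the restoration unit sizes
+// from the frame header.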
+static void decode_restoration_mode(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ assert(!cm->all_lossless);
+ const int num_planes = av1_num_planes(cm);
+ if (cm->allow_intrabc) return;
+ int all_none = 1, chroma_none = 1;
+ for (int p = 0; p < num_planes; ++p) {
+ RestorationInfo *rsi = &cm->rst_info[p];
+ if (aom_rb_read_bit(rb)) {
+ rsi->frame_restoration_type =
+ aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER;
+ } else {
+ rsi->frame_restoration_type =
+ aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE;
+ }
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ all_none = 0;
+ chroma_none &= p == 0;
+ }
+ }
+ if (!all_none) {
+ assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+ cm->seq_params.sb_size == BLOCK_128X128);
+ const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+
+ for (int p = 0; p < num_planes; ++p)
+ cm->rst_info[p].restoration_unit_size = sb_size;
+
+ RestorationInfo *rsi = &cm->rst_info[0];
+
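+    // The luma restoration unit starts at the superblock size and may be
+    // doubled by up to two coded bits (twice for 64x64 superblocks, once
+    // for 128x128 superblocks).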
+ if (sb_size == 64) {
+ rsi->restoration_unit_size <<= aom_rb_read_bit(rb);
+ }
+ if (rsi->restoration_unit_size > 64) {
+ rsi->restoration_unit_size <<= aom_rb_read_bit(rb);
+ }
+ } else {
+ const int size = RESTORATION_UNITSIZE_MAX;
+ for (int p = 0; p < num_planes; ++p)
+ cm->rst_info[p].restoration_unit_size = size;
+ }
+
+ if (num_planes > 1) {
+ int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
+ if (s && !chroma_none) {
+ cm->rst_info[1].restoration_unit_size =
+ cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s);
+ } else {
+ cm->rst_info[1].restoration_unit_size =
+ cm->rst_info[0].restoration_unit_size;
+ }
+ cm->rst_info[2].restoration_unit_size =
+ cm->rst_info[1].restoration_unit_size;
+ }
+}
+
+static void read_wiener_filter(int wiener_win, WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info, aom_reader *rb) {
+ memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter));
+ memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter));
+
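+  // The Wiener filter is symmetric, so only taps 0..2 are coded (relative to
+  // the previously decoded filter); the outermost tap is forced to zero for
+  // the smaller chroma window.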
+ if (wiener_win == WIENER_WIN)
+ wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) +
+ WIENER_FILT_TAP0_MINV;
+ else
+ wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = 0;
+ wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
+ WIENER_FILT_TAP1_MINV;
+ wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
+ WIENER_FILT_TAP2_MINV;
+ // The central element has an implicit +WIENER_FILT_STEP
+ wiener_info->vfilter[WIENER_HALFWIN] =
+ -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] +
+ wiener_info->vfilter[2]);
+
+ if (wiener_win == WIENER_WIN)
+ wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) +
+ WIENER_FILT_TAP0_MINV;
+ else
+ wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = 0;
+ wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
+ WIENER_FILT_TAP1_MINV;
+ wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] =
+ aom_read_primitive_refsubexpfin(
+ rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
+ WIENER_FILT_TAP2_MINV;
+ // The central element has an implicit +WIENER_FILT_STEP
+ wiener_info->hfilter[WIENER_HALFWIN] =
+ -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] +
+ wiener_info->hfilter[2]);
+ memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+}
+
+static void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info, aom_reader *rb) {
+ sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
+ const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+
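+  // A radius of zero disables the corresponding component, so its projection
+  // coefficient is not coded: xqd[0] defaults to 0 when r[0] == 0, and
+  // xqd[1] is derived from xqd[0] when r[1] == 0.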
+ if (params->r[0] == 0) {
+ sgrproj_info->xqd[0] = 0;
+ sgrproj_info->xqd[1] =
+ aom_read_primitive_refsubexpfin(
+ rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
+ SGRPROJ_PRJ_MIN1;
+ } else if (params->r[1] == 0) {
+ sgrproj_info->xqd[0] =
+ aom_read_primitive_refsubexpfin(
+ rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
+ SGRPROJ_PRJ_MIN0;
+ sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0],
+ SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
+ } else {
+ sgrproj_info->xqd[0] =
+ aom_read_primitive_refsubexpfin(
+ rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
+ SGRPROJ_PRJ_MIN0;
+ sgrproj_info->xqd[1] =
+ aom_read_primitive_refsubexpfin(
+ rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
+ SGRPROJ_PRJ_MIN1;
+ }
+
+ memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+}
+
+static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd,
+ aom_reader *const r, int plane,
+ int runit_idx) {
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
+ if (rsi->frame_restoration_type == RESTORE_NONE) return;
+
+ assert(!cm->all_lossless);
+
+ const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
+ WienerInfo *wiener_info = xd->wiener_info + plane;
+ SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;
+
+ if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
+ rui->restoration_type =
+ aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES, ACCT_STR);
+ switch (rui->restoration_type) {
+ case RESTORE_WIENER:
+ read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r);
+ break;
+ case RESTORE_SGRPROJ:
+ read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r);
+ break;
+ default: assert(rui->restoration_type == RESTORE_NONE); break;
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
+ if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) {
+ rui->restoration_type = RESTORE_WIENER;
+ read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r);
+ } else {
+ rui->restoration_type = RESTORE_NONE;
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) {
+ if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) {
+ rui->restoration_type = RESTORE_SGRPROJ;
+ read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r);
+ } else {
+ rui->restoration_type = RESTORE_NONE;
+ }
+ }
+}
+
+static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ const int num_planes = av1_num_planes(cm);
+ struct loopfilter *lf = &cm->lf;
+ if (cm->allow_intrabc || cm->coded_lossless) {
+ // write default deltas to frame buffer
+ av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
+ av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
+ return;
+ }
+ assert(!cm->coded_lossless);
+ if (cm->prev_frame) {
+    // inherit deltas from the previous frame's buffer
+ memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+ memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+ } else {
+ av1_set_default_ref_deltas(lf->ref_deltas);
+ av1_set_default_mode_deltas(lf->mode_deltas);
+ }
+ lf->filter_level[0] = aom_rb_read_literal(rb, 6);
+ lf->filter_level[1] = aom_rb_read_literal(rb, 6);
+ if (num_planes > 1) {
+ if (lf->filter_level[0] || lf->filter_level[1]) {
+ lf->filter_level_u = aom_rb_read_literal(rb, 6);
+ lf->filter_level_v = aom_rb_read_literal(rb, 6);
+ }
+ }
+ lf->sharpness_level = aom_rb_read_literal(rb, 3);
+
+ // Read in loop filter deltas applied at the MB level based on mode or ref
+ // frame.
+ lf->mode_ref_delta_update = 0;
+
+ lf->mode_ref_delta_enabled = aom_rb_read_bit(rb);
+ if (lf->mode_ref_delta_enabled) {
+ lf->mode_ref_delta_update = aom_rb_read_bit(rb);
+ if (lf->mode_ref_delta_update) {
+ for (int i = 0; i < REF_FRAMES; i++)
+ if (aom_rb_read_bit(rb))
+ lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
+
+ for (int i = 0; i < MAX_MODE_LF_DELTAS; i++)
+ if (aom_rb_read_bit(rb))
+ lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
+ }
+ }
+
+ // write deltas to frame buffer
+ memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES);
+ memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
+}
+
+static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ const int num_planes = av1_num_planes(cm);
+ if (cm->allow_intrabc) return;
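+  // Damping is coded with 2 bits and offset by 3 (range 3..6); the same
+  // value is used for both primary and secondary filtering.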
+ cm->cdef_pri_damping = cm->cdef_sec_damping = aom_rb_read_literal(rb, 2) + 3;
+ cm->cdef_bits = aom_rb_read_literal(rb, 2);
+ cm->nb_cdef_strengths = 1 << cm->cdef_bits;
+ for (int i = 0; i < cm->nb_cdef_strengths; i++) {
+ cm->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
+ cm->cdef_uv_strengths[i] =
+ num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0;
+ }
+}
+
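+// A delta-Q value is coded as a presence bit followed, when set, by a 6-bit
+// inverse-signed literal; when absent the delta is zero.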
+static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) {
+ return aom_rb_read_bit(rb) ? aom_rb_read_inv_signed_literal(rb, 6) : 0;
+}
+
+static void setup_quantization(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
+ cm->y_dc_delta_q = read_delta_q(rb);
+ if (num_planes > 1) {
+ int diff_uv_delta = 0;
+ if (seq_params->separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb);
+ cm->u_dc_delta_q = read_delta_q(rb);
+ cm->u_ac_delta_q = read_delta_q(rb);
+ if (diff_uv_delta) {
+ cm->v_dc_delta_q = read_delta_q(rb);
+ cm->v_ac_delta_q = read_delta_q(rb);
+ } else {
+ cm->v_dc_delta_q = cm->u_dc_delta_q;
+ cm->v_ac_delta_q = cm->u_ac_delta_q;
+ }
+ } else {
+ cm->u_dc_delta_q = 0;
+ cm->u_ac_delta_q = 0;
+ cm->v_dc_delta_q = 0;
+ cm->v_ac_delta_q = 0;
+ }
+ cm->dequant_bit_depth = seq_params->bit_depth;
+ cm->using_qmatrix = aom_rb_read_bit(rb);
+ if (cm->using_qmatrix) {
+ cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+ cm->qm_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+ if (!seq_params->separate_uv_delta_q)
+ cm->qm_v = cm->qm_u;
+ else
+ cm->qm_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+ } else {
+ cm->qm_y = 0;
+ cm->qm_u = 0;
+ cm->qm_v = 0;
+ }
+}
+
+// Build y/uv dequant values based on segmentation.
+static void setup_segmentation_dequant(AV1_COMMON *const cm) {
+ const int bit_depth = cm->seq_params.bit_depth;
+ const int using_qm = cm->using_qmatrix;
+  // When segmentation is disabled, only the first entry is used; the
+  // remaining entries are don't-cares.
+ const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
+ for (int i = 0; i < max_segments; ++i) {
+ const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex);
+ cm->y_dequant_QTX[i][0] =
+ av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, bit_depth);
+ cm->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
+ cm->u_dequant_QTX[i][0] =
+ av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, bit_depth);
+ cm->u_dequant_QTX[i][1] =
+ av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, bit_depth);
+ cm->v_dequant_QTX[i][0] =
+ av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, bit_depth);
+ cm->v_dequant_QTX[i][1] =
+ av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, bit_depth);
+ const int lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
+ cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+ cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
+    // NB: the QM level depends only on the base index, so there is a single
+    // set per frame. No quant weighting is applied when the segment is
+    // lossless or when the frame signals that no QM is used.
+ int qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_y;
+ for (int j = 0; j < TX_SIZES_ALL; ++j) {
+ cm->y_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_Y, j);
+ }
+ qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_u;
+ for (int j = 0; j < TX_SIZES_ALL; ++j) {
+ cm->u_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_U, j);
+ }
+ qmlevel = (lossless || using_qm == 0) ? NUM_QM_LEVELS - 1 : cm->qm_v;
+ for (int j = 0; j < TX_SIZES_ALL; ++j) {
+ cm->v_iqmatrix[i][j] = av1_iqmatrix(cm, qmlevel, AOM_PLANE_V, j);
+ }
+ }
+}
+
+static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) {
+ return aom_rb_read_bit(rb) ? SWITCHABLE
+ : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS);
+}
+
+static void setup_render_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ cm->render_width = cm->superres_upscaled_width;
+ cm->render_height = cm->superres_upscaled_height;
+ if (aom_rb_read_bit(rb))
+ av1_read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height);
+}
+
+// TODO(afergs): make "struct aom_read_bit_buffer *const rb"?
+static void setup_superres(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb,
+ int *width, int *height) {
+ cm->superres_upscaled_width = *width;
+ cm->superres_upscaled_height = *height;
+
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ if (!seq_params->enable_superres) return;
+
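+  // A set bit selects superres: read the scale denominator as an offset from
+  // SUPERRES_SCALE_DENOMINATOR_MIN and derive the coded (downscaled) frame
+  // size from it.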
+ if (aom_rb_read_bit(rb)) {
+ cm->superres_scale_denominator =
+ (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS);
+ cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN;
+ // Don't edit cm->width or cm->height directly, or the buffers won't get
+ // resized correctly
+ av1_calculate_scaled_superres_size(width, height,
+ cm->superres_scale_denominator);
+ } else {
+    // 1:1 scaling, i.e. no scaling; the scale is not signalled.
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ }
+}
+
+static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
+#if CONFIG_SIZE_LIMIT
+ if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Dimensions of %dx%d beyond allowed size of %dx%d.",
+ width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+#endif
+ if (cm->width != width || cm->height != height) {
+ const int new_mi_rows =
+ ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+ const int new_mi_cols =
+ ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+
+ // Allocations in av1_alloc_context_buffers() depend on individual
+ // dimensions as well as the overall size.
+ if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) {
+ if (av1_alloc_context_buffers(cm, width, height)) {
+ // The cm->mi_* values have been cleared and any existing context
+ // buffers have been freed. Clear cm->width and cm->height to be
+ // consistent and to force a realloc next time.
+ cm->width = 0;
+ cm->height = 0;
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+ } else {
+ av1_set_mb_mi(cm, width, height);
+ }
+ av1_init_context_buffers(cm);
+ cm->width = width;
+ cm->height = height;
+ }
+
+ ensure_mv_buffer(cm->cur_frame, cm);
+ cm->cur_frame->width = cm->width;
+ cm->cur_frame->height = cm->height;
+}
+
+static void setup_buffer_pool(AV1_COMMON *cm) {
+ BufferPool *const pool = cm->buffer_pool;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+
+ lock_buffer_pool(pool);
+ if (aom_realloc_frame_buffer(
+ get_frame_new_buffer(cm), cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment,
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool);
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ unlock_buffer_pool(pool);
+
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x =
+ seq_params->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y =
+ seq_params->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth =
+ (unsigned int)seq_params->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_primaries =
+ seq_params->color_primaries;
+ pool->frame_bufs[cm->new_fb_idx].buf.transfer_characteristics =
+ seq_params->transfer_characteristics;
+ pool->frame_bufs[cm->new_fb_idx].buf.matrix_coefficients =
+ seq_params->matrix_coefficients;
+ pool->frame_bufs[cm->new_fb_idx].buf.monochrome = seq_params->monochrome;
+ pool->frame_bufs[cm->new_fb_idx].buf.chroma_sample_position =
+ seq_params->chroma_sample_position;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_range = seq_params->color_range;
+ pool->frame_bufs[cm->new_fb_idx].buf.render_width = cm->render_width;
+ pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
+}
+
+static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
+ struct aom_read_bit_buffer *rb) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ int width, height;
+
+ if (frame_size_override_flag) {
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
+ av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
+ if (width > seq_params->max_frame_width ||
+ height > seq_params->max_frame_height) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Frame dimensions are larger than the maximum values");
+ }
+ } else {
+ width = seq_params->max_frame_width;
+ height = seq_params->max_frame_height;
+ }
+
+ setup_superres(cm, rb, &width, &height);
+ resize_context_buffers(cm, width, height);
+ setup_render_size(cm, rb);
+ setup_buffer_pool(cm);
+}
+
+static void setup_sb_size(SequenceHeader *seq_params,
+ struct aom_read_bit_buffer *rb) {
+ set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
+}
+
+static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth,
+ int ref_xss, int ref_yss,
+ aom_bit_depth_t this_bit_depth,
+ int this_xss, int this_yss) {
+ return ref_bit_depth == this_bit_depth && ref_xss == this_xss &&
+ ref_yss == this_yss;
+}
+
+static void setup_frame_size_with_refs(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ int width, height;
+ int found = 0;
+ int has_valid_ref_frame = 0;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ if (aom_rb_read_bit(rb)) {
+ YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
+ width = buf->y_crop_width;
+ height = buf->y_crop_height;
+ cm->render_width = buf->render_width;
+ cm->render_height = buf->render_height;
+ setup_superres(cm, rb, &width, &height);
+ resize_context_buffers(cm, width, height);
+ found = 1;
+ break;
+ }
+ }
+
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ if (!found) {
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
+
+ av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
+ setup_superres(cm, rb, &width, &height);
+ resize_context_buffers(cm, width, height);
+ setup_render_size(cm, rb);
+ }
+
+ if (width <= 0 || height <= 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid frame size");
+
+ // Check to make sure at least one of frames that this frame references
+ // has valid dimensions.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ has_valid_ref_frame |=
+ valid_ref_frame_size(ref_frame->buf->y_crop_width,
+ ref_frame->buf->y_crop_height, width, height);
+ }
+ if (!has_valid_ref_frame)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Referenced frame has invalid size");
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ if (!valid_ref_frame_img_fmt(
+ ref_frame->buf->bit_depth, ref_frame->buf->subsampling_x,
+ ref_frame->buf->subsampling_y, seq_params->bit_depth,
+ seq_params->subsampling_x, seq_params->subsampling_y))
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Referenced frame has incompatible color format");
+ }
+ setup_buffer_pool(cm);
+}
+
+// Same as av1_read_uniform(), but reads from the uncompressed header's bit
+// buffer (rb).
+static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
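+  // Uniform code: the first m values use l - 1 bits, the remaining n - m
+  // values use l bits. E.g. n = 5 gives l = 3, m = 3: values 0..2 take
+  // 2 bits and values 3..4 take 3 bits.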
+ const int v = aom_rb_read_literal(rb, l - 1);
+ assert(l != 0);
+ if (v < m)
+ return v;
+ else
+ return (v << 1) - m + aom_rb_read_bit(rb);
+}
+
+static void read_tile_info_max_tile(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *const rb) {
+ int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+ int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int width_sb = width_mi >> cm->seq_params.mib_size_log2;
+ int height_sb = height_mi >> cm->seq_params.mib_size_log2;
+
+ av1_get_tile_limits(cm);
+ cm->uniform_tile_spacing_flag = aom_rb_read_bit(rb);
+
+ // Read tile columns
+ if (cm->uniform_tile_spacing_flag) {
+ cm->log2_tile_cols = cm->min_log2_tile_cols;
+ while (cm->log2_tile_cols < cm->max_log2_tile_cols) {
+ if (!aom_rb_read_bit(rb)) {
+ break;
+ }
+ cm->log2_tile_cols++;
+ }
+ } else {
+ int i;
+ int start_sb;
+ for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) {
+ const int size_sb =
+ 1 + rb_read_uniform(rb, AOMMIN(width_sb, cm->max_tile_width_sb));
+ cm->tile_col_start_sb[i] = start_sb;
+ start_sb += size_sb;
+ width_sb -= size_sb;
+ }
+ cm->tile_cols = i;
+ cm->tile_col_start_sb[i] = start_sb + width_sb;
+ }
+ av1_calculate_tile_cols(cm);
+
+ // Read tile rows
+ if (cm->uniform_tile_spacing_flag) {
+ cm->log2_tile_rows = cm->min_log2_tile_rows;
+ while (cm->log2_tile_rows < cm->max_log2_tile_rows) {
+ if (!aom_rb_read_bit(rb)) {
+ break;
+ }
+ cm->log2_tile_rows++;
+ }
+ } else {
+ int i;
+ int start_sb;
+ for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) {
+ const int size_sb =
+ 1 + rb_read_uniform(rb, AOMMIN(height_sb, cm->max_tile_height_sb));
+ cm->tile_row_start_sb[i] = start_sb;
+ start_sb += size_sb;
+ height_sb -= size_sb;
+ }
+ cm->tile_rows = i;
+ cm->tile_row_start_sb[i] = start_sb + height_sb;
+ }
+ av1_calculate_tile_rows(cm);
+}
+
+void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) {
+ cm->single_tile_decoding = 0;
+ if (cm->large_scale_tile) {
+ struct loopfilter *lf = &cm->lf;
+
+    // Derive single_tile_decoding from the loop filter, CDEF and loop
+    // restoration settings.
+ const int no_loopfilter = !(lf->filter_level[0] || lf->filter_level[1]);
+ const int no_cdef = cm->cdef_bits == 0 && cm->cdef_strengths[0] == 0 &&
+ cm->cdef_uv_strengths[0] == 0;
+ const int no_restoration =
+ cm->rst_info[0].frame_restoration_type == RESTORE_NONE &&
+ cm->rst_info[1].frame_restoration_type == RESTORE_NONE &&
+ cm->rst_info[2].frame_restoration_type == RESTORE_NONE;
+ assert(IMPLIES(cm->coded_lossless, no_loopfilter && no_cdef));
+ assert(IMPLIES(cm->all_lossless, no_restoration));
+ cm->single_tile_decoding = no_loopfilter && no_cdef && no_restoration;
+ }
+}
+
+static void read_tile_info(AV1Decoder *const pbi,
+ struct aom_read_bit_buffer *const rb) {
+ AV1_COMMON *const cm = &pbi->common;
+
+ read_tile_info_max_tile(cm, rb);
+
+ cm->context_update_tile_id = 0;
+ if (cm->tile_rows * cm->tile_cols > 1) {
+ // tile to use for cdf update
+ cm->context_update_tile_id =
+ aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+ if (cm->context_update_tile_id >= cm->tile_rows * cm->tile_cols) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid context_update_tile_id");
+ }
+    // Number of bytes used to code each tile size (the 2-bit field codes
+    // bytes - 1).
+ pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ }
+}
+
+#if EXT_TILE_DEBUG
+static void read_ext_tile_info(AV1Decoder *const pbi,
+ struct aom_read_bit_buffer *const rb) {
+ AV1_COMMON *const cm = &pbi->common;
+
+  // This information starts on a byte boundary: skip any bits remaining in
+  // the current byte.
+ int mod = rb->bit_offset % CHAR_BIT;
+ if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod);
+ assert(rb->bit_offset % CHAR_BIT == 0);
+
+ if (cm->tile_cols * cm->tile_rows > 1) {
+ // Read the number of bytes used to store tile size
+ pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ }
+}
+#endif // EXT_TILE_DEBUG
+
+static size_t mem_get_varsize(const uint8_t *src, int sz) {
+ switch (sz) {
+ case 1: return src[0];
+ case 2: return mem_get_le16(src);
+ case 3: return mem_get_le24(src);
+ case 4: return mem_get_le32(src);
+ default: assert(0 && "Invalid size"); return -1;
+ }
+}
+
+#if EXT_TILE_DEBUG
+// Reads the next large-scale tile buffer, handling tile-copy mode. On
+// return, '*data' is updated to point to the end of the raw tile buffer in
+// the bit stream.
+static void get_ls_tile_buffer(
+ const uint8_t *const data_end, struct aom_internal_error_info *error_info,
+ const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+ int tile_size_bytes, int col, int row, int tile_copy_mode) {
+ size_t size;
+
+ size_t copy_size = 0;
+ const uint8_t *copy_data = NULL;
+
+ if (!read_is_valid(*data, tile_size_bytes, data_end))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+ size = mem_get_varsize(*data, tile_size_bytes);
+
+ // If tile_copy_mode = 1, then the top bit of the tile header indicates copy
+ // mode.
+ if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) {
+ // The remaining bits in the top byte signal the row offset
+ int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
+
+ // Currently, only use tiles in same column as reference tiles.
+ copy_data = tile_buffers[row - offset][col].data;
+ copy_size = tile_buffers[row - offset][col].size;
+ size = 0;
+ } else {
+ size += AV1_MIN_TILE_SIZE_BYTES;
+ }
+
+ *data += tile_size_bytes;
+
+ if (size > (size_t)(data_end - *data))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile size");
+
+ if (size > 0) {
+ tile_buffers[row][col].data = *data;
+ tile_buffers[row][col].size = size;
+ } else {
+ tile_buffers[row][col].data = copy_data;
+ tile_buffers[row][col].size = copy_size;
+ }
+
+ *data += size;
+}
+
+// Returns the end of the last tile buffer
+// (tile_buffers[cm->tile_rows - 1][cm->tile_cols - 1]).
+static const uint8_t *get_ls_tile_buffers(
+ AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int have_tiles = tile_cols * tile_rows > 1;
+ const uint8_t *raw_data_end; // The end of the last tile buffer
+
+ if (!have_tiles) {
+ const size_t tile_size = data_end - data;
+ tile_buffers[0][0].data = data;
+ tile_buffers[0][0].size = tile_size;
+ raw_data_end = NULL;
+ } else {
+ // We locate only the tile buffers that are required, which are the ones
+ // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always
+ // need the last (bottom right) tile buffer, as we need to know where the
+ // end of the compressed frame buffer is for proper superframe decoding.
+
+ const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL };
+ const uint8_t *const data_start = data;
+
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int tile_rows_start = single_row ? dec_tile_row : 0;
+ const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ const int tile_cols_start = single_col ? dec_tile_col : 0;
+ const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+
+ const int tile_col_size_bytes = pbi->tile_col_size_bytes;
+ const int tile_size_bytes = pbi->tile_size_bytes;
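+    // Tile copy mode is only permitted when the larger tile dimension is at
+    // most 256 pixels.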
+ const int tile_copy_mode =
+ ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1
+ : 0;
+ // Read tile column sizes for all columns (we need the last tile buffer)
+ for (int c = 0; c < tile_cols; ++c) {
+ const int is_last = c == tile_cols - 1;
+ size_t tile_col_size;
+
+ if (!is_last) {
+ tile_col_size = mem_get_varsize(data, tile_col_size_bytes);
+ data += tile_col_size_bytes;
+ tile_col_data_end[c] = data + tile_col_size;
+ } else {
+ tile_col_size = data_end - data;
+ tile_col_data_end[c] = data_end;
+ }
+ data += tile_col_size;
+ }
+
+ data = data_start;
+
+ // Read the required tile sizes.
+ for (int c = tile_cols_start; c < tile_cols_end; ++c) {
+ const int is_last = c == tile_cols - 1;
+
+ if (c > 0) data = tile_col_data_end[c - 1];
+
+ if (!is_last) data += tile_col_size_bytes;
+
+ // Get the whole of the last column, otherwise stop at the required tile.
+ for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
+ get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+ tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
+ }
+ }
+
+ // If we have not read the last column, then read it to get the last tile.
+ if (tile_cols_end != tile_cols) {
+ const int c = tile_cols - 1;
+
+ data = tile_col_data_end[c - 1];
+
+ for (int r = 0; r < tile_rows; ++r) {
+ get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+ tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
+ }
+ }
+ raw_data_end = data;
+ }
+ return raw_data_end;
+}
+#endif // EXT_TILE_DEBUG
+
+static const uint8_t *get_ls_single_tile_buffer(
+ AV1Decoder *pbi, const uint8_t *data,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+ assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0);
+ tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data = data;
+ tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size =
+ (size_t)pbi->coded_tile_data_size;
+ return data + pbi->coded_tile_data_size;
+}
+
+// Reads the next tile returning its size and adjusting '*data' accordingly
+// based on 'is_last'.
+static void get_tile_buffer(const uint8_t *const data_end,
+ const int tile_size_bytes, int is_last,
+ struct aom_internal_error_info *error_info,
+ const uint8_t **data, TileBufferDec *const buf) {
+ size_t size;
+
+ if (!is_last) {
+ if (!read_is_valid(*data, tile_size_bytes, data_end))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+ size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES;
+ *data += tile_size_bytes;
+
+ if (size > (size_t)(data_end - *data))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile size");
+ } else {
+ size = data_end - *data;
+ }
+
+ buf->data = *data;
+ buf->size = size;
+
+ *data += size;
+}
+
+static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+ int start_tile, int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tc = 0;
+ int first_tile_in_tg = 0;
+
+ for (int r = 0; r < tile_rows; ++r) {
+ for (int c = 0; c < tile_cols; ++c, ++tc) {
+ TileBufferDec *const buf = &tile_buffers[r][c];
+
+ const int is_last = (tc == end_tile);
+ const size_t hdr_offset = 0;
+
+ if (tc < start_tile || tc > end_tile) continue;
+
+ if (data + hdr_offset >= data_end)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Data ended before all tiles were read.");
+ first_tile_in_tg += tc == first_tile_in_tg ? pbi->tg_size : 0;
+ data += hdr_offset;
+ get_tile_buffer(data_end, pbi->tile_size_bytes, is_last,
+ &pbi->common.error, &data, buf);
+ }
+ }
+}
+
+static void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd,
+ CB_BUFFER *cb_buffer_base, const int num_planes,
+ int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &pbi->common;
+ int mib_size_log2 = cm->seq_params.mib_size_log2;
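+  // Select the CB_BUFFER for this superblock: cb_buffer_base is either a
+  // frame-wide per-superblock grid (row-MT decoding) or a single buffer
+  // addressed with mi_row = mi_col = 0.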
+ int stride = (cm->mi_cols >> mib_size_log2) + 1;
+ int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
+ CB_BUFFER *cb_buffer = cb_buffer_base + offset;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane];
+ xd->plane[plane].eob_data = cb_buffer->eob_data[plane];
+ xd->cb_offset[plane] = 0;
+ xd->txb_offset[plane] = 0;
+ }
+ xd->plane[0].color_index_map = cb_buffer->color_index_map[0];
+ xd->plane[1].color_index_map = cb_buffer->color_index_map[1];
+ xd->color_index_map_offset[0] = 0;
+ xd->color_index_map_offset[1] = 0;
+}
+
+static void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) {
+ AV1_COMMON *const cm = &pbi->common;
+ aom_free(pbi->tile_data);
+ CHECK_MEM_ERROR(cm, pbi->tile_data,
+ aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
+ pbi->allocated_tiles = n_tiles;
+ for (int i = 0; i < n_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_zero(tile_data->dec_row_mt_sync);
+ }
+ pbi->allocated_row_mt_sync_rows = 0;
+}
+
+// Set up nsync by width.
+static INLINE int get_sync_range(int width) {
+// nsync numbers are picked by testing.
+#if 0
+ if (width < 640)
+ return 1;
+ else if (width <= 1280)
+ return 2;
+ else if (width <= 4096)
+ return 4;
+ else
+ return 8;
+#else
+ (void)width;
+#endif
+ return 1;
+}
+
+// Allocate memory for decoder row synchronization
+static void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, AV1_COMMON *cm,
+ int rows) {
+ dec_row_mt_sync->allocated_sb_rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i;
+
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_,
+ aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows));
+ if (dec_row_mt_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_,
+ aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows));
+ if (dec_row_mt_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL);
+ }
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col,
+ aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows));
+
+ // Set up nsync.
+ dec_row_mt_sync->sync_range = get_sync_range(cm->width);
+}
+
+// Deallocate decoder row synchronization related mutex and data
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) {
+ if (dec_row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+ if (dec_row_mt_sync->mutex_ != NULL) {
+ for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
+ pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]);
+ }
+ aom_free(dec_row_mt_sync->mutex_);
+ }
+ if (dec_row_mt_sync->cond_ != NULL) {
+ for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) {
+ pthread_cond_destroy(&dec_row_mt_sync->cond_[i]);
+ }
+ aom_free(dec_row_mt_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ aom_free(dec_row_mt_sync->cur_sb_col);
+
+ // clear the structure as the source of this call may be a resize in which
+ // case this call will be followed by an _alloc() which may fail.
+ av1_zero(*dec_row_mt_sync);
+ }
+}
+
+static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+ int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = dec_row_mt_sync->sync_range;
+
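+  // Only every nsync-th column checks progress: wait until the superblock
+  // row above has decoded at least nsync columns beyond the current column.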
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync) {
+ pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)dec_row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+ int c, const int sb_cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = dec_row_mt_sync->sync_range;
+ int cur;
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]);
+
+ dec_row_mt_sync->cur_sb_col[r] = cur;
+
+ pthread_cond_signal(&dec_row_mt_sync->cond_[r]);
+ pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]);
+ }
+#else
+ (void)dec_row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
+ TileInfo tile_info, const int mi_row) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileDataDec *const tile_data =
+ pbi->tile_data + tile_info.tile_row * cm->tile_cols + tile_info.tile_col;
+ const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+ const int sb_row_in_tile =
+ (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
+ int sb_col_in_tile = 0;
+
+ for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->seq_params.mib_size, sb_col_in_tile++) {
+ set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+ mi_col);
+
+ sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile);
+
+ // Decoding of the super-block
+ decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+ cm->seq_params.sb_size, 0x2);
+
+ sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
+ sb_cols_in_tile);
+ }
+}
+
+static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
+ if (aom_reader_has_overflowed(r)) return -1;
+
+ uint32_t nb_bits = aom_reader_tell(r);
+ uint32_t nb_bytes = (nb_bits + 7) >> 3;
+ const uint8_t *p = aom_reader_find_begin(r) + nb_bytes;
+
+ // aom_reader_tell() returns 1 for a newly initialized decoder, and the
+ // return value only increases as values are decoded. So nb_bits > 0, and
+ // thus p > p_begin. Therefore accessing p[-1] is safe.
+ uint8_t last_byte = p[-1];
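+  // The last byte must contain the expected trailing-bit pattern: a 1 bit at
+  // the position indicated by the reader, followed only by zero bits within
+  // that byte.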
+ uint8_t pattern = 128 >> ((nb_bits - 1) & 7);
+ if ((last_byte & (2 * pattern - 1)) != pattern) return -1;
+
+ // Make sure that all padding bytes are zero as required by the spec.
+ const uint8_t *p_end = aom_reader_find_end(r);
+ while (p < p_end) {
+ if (*p != 0) return -1;
+ p++;
+ }
+ return 0;
+}
+
+static void set_decode_func_pointers(ThreadData *td, int parse_decode_flag) {
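+  // parse_decode_flag is a bit mask: bit 0 installs the bit-stream parsing
+  // (coefficient read) visitors and bit 1 installs the prediction /
+  // reconstruction visitors. All visitors start out as no-ops.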
+ td->read_coeffs_tx_intra_block_visit = decode_block_void;
+ td->predict_and_recon_intra_block_visit = decode_block_void;
+ td->read_coeffs_tx_inter_block_visit = decode_block_void;
+ td->inverse_tx_inter_block_visit = decode_block_void;
+ td->predict_inter_block_visit = predict_inter_block_void;
+ td->cfl_store_inter_block_visit = cfl_store_inter_block_void;
+
+ if (parse_decode_flag & 0x1) {
+ td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block;
+ td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade;
+ }
+ if (parse_decode_flag & 0x2) {
+ td->predict_and_recon_intra_block_visit =
+ predict_and_reconstruct_intra_block;
+ td->inverse_tx_inter_block_visit = inverse_transform_inter_block;
+ td->predict_inter_block_visit = predict_inter_block;
+ td->cfl_store_inter_block_visit = cfl_store_inter_block;
+ }
+}
+
+static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
+ int tile_col) {
+ TileInfo tile_info;
+
+ AV1_COMMON *const cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ av1_tile_set_col(&tile_info, cm, tile_col);
+ av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
+ tile_info.mi_col_end, tile_row);
+ av1_reset_loop_filter_delta(&td->xd, num_planes);
+ av1_reset_loop_restoration(&td->xd, num_planes);
+
+ for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->seq_params.mib_size) {
+ av1_zero_left_context(&td->xd);
+
+ for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->seq_params.mib_size) {
+ set_cb_buffer(pbi, &td->xd, &td->cb_buffer_base, num_planes, 0, 0);
+
+ // Bit-stream parsing and decoding of the superblock
+ decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+ cm->seq_params.sb_size, 0x3);
+
+ if (aom_reader_has_overflowed(td->bit_reader)) {
+ aom_merge_corrupted_flag(&td->xd.corrupted, 1);
+ return;
+ }
+ }
+ }
+
+ int corrupted =
+ (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
+ aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
+}
+
+static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end, int start_tile,
+ int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ ThreadData *const td = &pbi->td;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int inv_col_order;
+ int inv_row_order;
+ int tile_row, tile_col;
+ uint8_t allow_update_cdf;
+ const uint8_t *raw_data_end = NULL;
+
+ if (cm->large_scale_tile) {
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ inv_col_order = pbi->inv_tile_order && !single_col;
+ inv_row_order = pbi->inv_tile_order && !single_row;
+ allow_update_cdf = 0;
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
+ inv_col_order = pbi->inv_tile_order;
+ inv_row_order = pbi->inv_tile_order;
+ allow_update_cdf = 1;
+ }
+
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // First tile is larger than end_tile.
+ tile_rows_start * cm->tile_cols + tile_cols_start > end_tile ||
+ // Last tile is smaller than start_tile.
+ (tile_rows_end - 1) * cm->tile_cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+
+#if EXT_TILE_DEBUG
+ if (cm->large_scale_tile && !pbi->ext_tile_debug)
+ raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers);
+ else if (cm->large_scale_tile && pbi->ext_tile_debug)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+#endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
+
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ aom_accounting_reset(&pbi->accounting);
+ }
+#endif
+
+ set_decode_func_pointers(&pbi->td, 0x3);
+
+ // Load all tile information into thread_data.
+ td->xd = pbi->mb;
+ td->xd.corrupted = 0;
+ td->xd.mc_buf[0] = td->mc_buf[0];
+ td->xd.mc_buf[1] = td->mc_buf[1];
+ td->xd.tmp_conv_dst = td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
+ }
+
+ for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
+
+ for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+ const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
+ TileDataDec *const tile_data = pbi->tile_data + row * cm->tile_cols + col;
+ const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
+
+ if (row * cm->tile_cols + col < start_tile ||
+ row * cm->tile_cols + col > end_tile)
+ continue;
+
+ td->bit_reader = &tile_data->bit_reader;
+ av1_zero(td->dqcoeff);
+ av1_tile_init(&td->xd.tile, cm, row, col);
+ td->xd.current_qindex = cm->base_qindex;
+ setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
+ &cm->error, td->bit_reader, allow_update_cdf);
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ td->bit_reader->accounting = &pbi->accounting;
+ td->bit_reader->accounting->last_tell_frac =
+ aom_reader_tell_frac(td->bit_reader);
+ } else {
+ td->bit_reader->accounting = NULL;
+ }
+#endif
+ av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ av1_init_above_context(cm, &td->xd, row);
+
+ // Initialise the tile context from the frame context
+ tile_data->tctx = *cm->fc;
+ td->xd.tile_ctx = &tile_data->tctx;
+
+ // decode tile
+ decode_tile(pbi, td, row, col);
+ aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
+ if (pbi->mb.corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+ }
+ }
+
+ if (cm->large_scale_tile) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+ }
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+}
+
+static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
+ TileJobsDec *cur_job_info = NULL;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(tile_mt_info->job_mutex);
+
+ if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) {
+ cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued;
+ tile_mt_info->jobs_dequeued++;
+ }
+
+ pthread_mutex_unlock(tile_mt_info->job_mutex);
+#else
+ (void)tile_mt_info;
+#endif
+ return cur_job_info;
+}
+
+static void tile_worker_hook_init(AV1Decoder *const pbi,
+ DecWorkerData *const thread_data,
+ const TileBufferDec *const tile_buffer,
+ TileDataDec *const tile_data,
+ uint8_t allow_update_cdf) {
+ AV1_COMMON *cm = &pbi->common;
+ ThreadData *const td = thread_data->td;
+ int tile_row = tile_data->tile_info.tile_row;
+ int tile_col = tile_data->tile_info.tile_col;
+
+ td->bit_reader = &tile_data->bit_reader;
+ av1_zero(td->dqcoeff);
+ av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
+ td->xd.current_qindex = cm->base_qindex;
+ setup_bool_decoder(tile_buffer->data, thread_data->data_end,
+ tile_buffer->size, &thread_data->error_info,
+ td->bit_reader, allow_update_cdf);
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ td->bit_reader->accounting = &pbi->accounting;
+ td->bit_reader->accounting->last_tell_frac =
+ aom_reader_tell_frac(td->bit_reader);
+ } else {
+ td->bit_reader->accounting = NULL;
+ }
+#endif
+ av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ td->xd.error_info = &thread_data->error_info;
+ av1_init_above_context(cm, &td->xd, tile_row);
+
+ // Initialise the tile context from the frame context
+ tile_data->tctx = *cm->fc;
+ td->xd.tile_ctx = &tile_data->tctx;
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ tile_data->bit_reader.accounting->last_tell_frac =
+ aom_reader_tell_frac(&tile_data->bit_reader);
+ }
+#endif
+}
+
+static int tile_worker_hook(void *arg1, void *arg2) {
+ DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+ AV1Decoder *const pbi = (AV1Decoder *)arg2;
+ AV1_COMMON *cm = &pbi->common;
+ ThreadData *const td = thread_data->td;
+ uint8_t allow_update_cdf;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(thread_data->error_info.jmp)) {
+ thread_data->error_info.setjmp = 0;
+ thread_data->td->xd.corrupted = 1;
+ return 0;
+ }
+ thread_data->error_info.setjmp = 1;
+
+ allow_update_cdf = cm->large_scale_tile ? 0 : 1;
+ allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+
+ set_decode_func_pointers(td, 0x3);
+
+ assert(cm->tile_cols > 0);
+ while (1) {
+ TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
+
+ if (cur_job_info != NULL && !td->xd.corrupted) {
+ const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+ TileDataDec *const tile_data = cur_job_info->tile_data;
+ tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+ allow_update_cdf);
+ // decode tile
+ int tile_row = tile_data->tile_info.tile_row;
+ int tile_col = tile_data->tile_info.tile_col;
+ decode_tile(pbi, td, tile_row, tile_col);
+ } else {
+ break;
+ }
+ }
+ thread_data->error_info.setjmp = 0;
+ return !td->xd.corrupted;
+}
+
+static int get_next_job_info(AV1Decoder *const pbi,
+ AV1DecRowMTJobInfo *next_job_info,
+ int *end_of_frame) {
+ AV1_COMMON *cm = &pbi->common;
+ TileDataDec *tile_data;
+ AV1DecRowMTSync *dec_row_mt_sync;
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+ TileInfo tile_info;
+ const int tile_rows_start = frame_row_mt_info->tile_rows_start;
+ const int tile_rows_end = frame_row_mt_info->tile_rows_end;
+ const int tile_cols_start = frame_row_mt_info->tile_cols_start;
+ const int tile_cols_end = frame_row_mt_info->tile_cols_end;
+ const int start_tile = frame_row_mt_info->start_tile;
+ const int end_tile = frame_row_mt_info->end_tile;
+ const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+ int num_mis_to_decode, num_threads_working;
+ int num_mis_waiting_for_decode;
+ int min_threads_working = INT_MAX;
+ int max_mis_to_decode = 0;
+ int tile_row_idx, tile_col_idx;
+ int tile_row = 0;
+ int tile_col = 0;
+
+ memset(next_job_info, 0, sizeof(*next_job_info));
+
+ // Frame decode is completed or error is encountered.
+ *end_of_frame = (frame_row_mt_info->mi_rows_decode_started ==
+ frame_row_mt_info->mi_rows_to_decode) ||
+ (frame_row_mt_info->row_mt_exit == 1);
+ if (*end_of_frame) {
+ return 1;
+ }
+
+ // Decoding cannot start as bit-stream parsing is not complete.
+ if (frame_row_mt_info->mi_rows_parse_done -
+ frame_row_mt_info->mi_rows_decode_started ==
+ 0)
+ return 0;
+
+ // Choose the tile to decode.
+ for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end;
+ ++tile_row_idx) {
+ for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end;
+ ++tile_col_idx) {
+ if (tile_row_idx * cm->tile_cols + tile_col_idx < start_tile ||
+ tile_row_idx * cm->tile_cols + tile_col_idx > end_tile)
+ continue;
+
+ tile_data = pbi->tile_data + tile_row_idx * cm->tile_cols + tile_col_idx;
+ dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+ num_threads_working = dec_row_mt_sync->num_threads_working;
+ num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done -
+ dec_row_mt_sync->mi_rows_decode_started) *
+ dec_row_mt_sync->mi_cols;
+ num_mis_to_decode =
+ (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) *
+ dec_row_mt_sync->mi_cols;
+
+ assert(num_mis_to_decode >= num_mis_waiting_for_decode);
+
+      // Pick the tile with the minimum number of threads working on it;
+      // ties are broken in favour of the tile with the most MIs left to
+      // decode.
+ if (num_mis_waiting_for_decode > 0) {
+ if (num_threads_working < min_threads_working) {
+ min_threads_working = num_threads_working;
+ max_mis_to_decode = 0;
+ }
+ if (num_threads_working == min_threads_working &&
+ num_mis_to_decode > max_mis_to_decode) {
+ max_mis_to_decode = num_mis_to_decode;
+ tile_row = tile_row_idx;
+ tile_col = tile_col_idx;
+ }
+ }
+ }
+ }
+
+ tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+ tile_info = tile_data->tile_info;
+ dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+
+ next_job_info->tile_row = tile_row;
+ next_job_info->tile_col = tile_col;
+ next_job_info->mi_row =
+ dec_row_mt_sync->mi_rows_decode_started + tile_info.mi_row_start;
+
+ dec_row_mt_sync->num_threads_working++;
+ dec_row_mt_sync->mi_rows_decode_started += sb_mi_size;
+ frame_row_mt_info->mi_rows_decode_started += sb_mi_size;
+
+ return 1;
+}
+
+static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi,
+ TileDataDec *const tile_data,
+ const int sb_mi_size) {
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size;
+ frame_row_mt_info->mi_rows_parse_done += sb_mi_size;
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(pbi->row_mt_cond_);
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+}
+
+static int row_mt_worker_hook(void *arg1, void *arg2) {
+ DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+ AV1Decoder *const pbi = (AV1Decoder *)arg2;
+ AV1_COMMON *cm = &pbi->common;
+ ThreadData *const td = thread_data->td;
+ uint8_t allow_update_cdf;
+ const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+ td->xd.corrupted = 0;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(thread_data->error_info.jmp)) {
+ thread_data->error_info.setjmp = 0;
+ thread_data->td->xd.corrupted = 1;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ frame_row_mt_info->row_mt_exit = 1;
+#if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(pbi->row_mt_cond_);
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ return 0;
+ }
+ thread_data->error_info.setjmp = 1;
+
+ const int num_planes = av1_num_planes(cm);
+ allow_update_cdf = cm->large_scale_tile ? 0 : 1;
+ allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+
+ assert(cm->tile_cols > 0);
+ while (1) {
+ TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
+
+ if (cur_job_info != NULL && !td->xd.corrupted) {
+ const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+ TileDataDec *const tile_data = cur_job_info->tile_data;
+ tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+ allow_update_cdf);
+
+ set_decode_func_pointers(td, 0x1);
+
+ // decode tile
+ TileInfo tile_info = tile_data->tile_info;
+ int tile_row = tile_info.tile_row;
+
+ av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
+ tile_info.mi_col_end, tile_row);
+ av1_reset_loop_filter_delta(&td->xd, num_planes);
+ av1_reset_loop_restoration(&td->xd, num_planes);
+
+ for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->seq_params.mib_size) {
+ av1_zero_left_context(&td->xd);
+
+ for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->seq_params.mib_size) {
+ set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+ mi_col);
+
+ // Bit-stream parsing of the superblock
+ decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+ cm->seq_params.sb_size, 0x1);
+ }
+ signal_parse_sb_row_done(pbi, tile_data, sb_mi_size);
+ }
+
+ int corrupted =
+ (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
+ aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
+ } else {
+ break;
+ }
+ }
+
+ set_decode_func_pointers(td, 0x2);
+
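+  // Parsing of this thread's tiles is complete: switch to reconstruction-only
+  // visitors and consume per-superblock-row decode jobs until the frame is
+  // done.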
+ while (1) {
+ AV1DecRowMTJobInfo next_job_info;
+ int end_of_frame = 0;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) {
+#if CONFIG_MULTITHREAD
+ pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_);
+#endif
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+
+ if (end_of_frame) break;
+
+ int tile_row = next_job_info.tile_row;
+ int tile_col = next_job_info.tile_col;
+ int mi_row = next_job_info.mi_row;
+
+ TileDataDec *tile_data =
+ pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+ AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+ TileInfo tile_info = tile_data->tile_info;
+
+ av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
+ av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ td->xd.error_info = &thread_data->error_info;
+
+ decode_tile_sb_row(pbi, td, tile_info, mi_row);
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ dec_row_mt_sync->num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+ }
+ thread_data->error_info.setjmp = 0;
+ return !td->xd.corrupted;
+}
+
+// Sorts tile decode jobs in descending order of tile buffer size.
+static int compare_tile_buffers(const void *a, const void *b) {
+ const TileJobsDec *const buf1 = (const TileJobsDec *)a;
+ const TileJobsDec *const buf2 = (const TileJobsDec *)b;
+ return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size));
+}
+
+static void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
+ int tile_rows_start, int tile_rows_end,
+ int tile_cols_start, int tile_cols_end,
+ int startTile, int endTile) {
+ AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info;
+ TileJobsDec *tile_job_queue = tile_mt_info->job_queue;
+ tile_mt_info->jobs_enqueued = 0;
+ tile_mt_info->jobs_dequeued = 0;
+
+ for (int row = tile_rows_start; row < tile_rows_end; row++) {
+ for (int col = tile_cols_start; col < tile_cols_end; col++) {
+ if (row * cm->tile_cols + col < startTile ||
+ row * cm->tile_cols + col > endTile)
+ continue;
+ tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col];
+ tile_job_queue->tile_data = pbi->tile_data + row * cm->tile_cols + col;
+ tile_job_queue++;
+ tile_mt_info->jobs_enqueued++;
+ }
+ }
+}
+
+static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
+ int tile_rows, int tile_cols) {
+ tile_mt_info->alloc_tile_rows = tile_rows;
+ tile_mt_info->alloc_tile_cols = tile_cols;
+ int num_tiles = tile_rows * tile_cols;
+#if CONFIG_MULTITHREAD
+ {
+ CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex,
+ aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles));
+
+ for (int i = 0; i < num_tiles; i++) {
+ pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL);
+ }
+ }
+#endif
+ CHECK_MEM_ERROR(cm, tile_mt_info->job_queue,
+ aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles));
+}
+
+void av1_free_mc_tmp_buf(ThreadData *thread_data) {
+ int ref;
+ for (ref = 0; ref < 2; ref++) {
+ if (thread_data->mc_buf_use_highbd)
+ aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref]));
+ else
+ aom_free(thread_data->mc_buf[ref]);
+ thread_data->mc_buf[ref] = NULL;
+ }
+ thread_data->mc_buf_size = 0;
+ thread_data->mc_buf_use_highbd = 0;
+
+ aom_free(thread_data->tmp_conv_dst);
+ thread_data->tmp_conv_dst = NULL;
+ for (int i = 0; i < 2; ++i) {
+ aom_free(thread_data->tmp_obmc_bufs[i]);
+ thread_data->tmp_obmc_bufs[i] = NULL;
+ }
+}
+
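+// Allocates the per-thread motion-compensation scratch buffers: two
+// prediction buffers (stored as 16-bit samples when use_highbd is set), a
+// temporary convolve destination and two OBMC buffers.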
+static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
+ int buf_size, int use_highbd) {
+ for (int ref = 0; ref < 2; ref++) {
+ if (use_highbd) {
+ uint16_t *hbd_mc_buf;
+ CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size));
+ thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf);
+ } else {
+ CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref],
+ (uint8_t *)aom_memalign(16, buf_size));
+ }
+ }
+ thread_data->mc_buf_size = buf_size;
+ thread_data->mc_buf_use_highbd = use_highbd;
+
+ CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*thread_data->tmp_conv_dst)));
+ for (int i = 0; i < 2; ++i) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->tmp_obmc_bufs[i],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->tmp_obmc_bufs[i])));
+ }
+}
+
+static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
+ int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ // Reset tile decoding hook
+ for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+ AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ thread_data->td->xd = pbi->mb;
+ thread_data->td->xd.corrupted = 0;
+ thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0];
+ thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1];
+ thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j];
+ }
+ winterface->sync(worker);
+
+ worker->hook = worker_hook;
+ worker->data1 = thread_data;
+ worker->data2 = pbi;
+ }
+#if CONFIG_ACCOUNTING
+ if (pbi->acct_enabled) {
+ aom_accounting_reset(&pbi->accounting);
+ }
+#endif
+}
+
+static void launch_dec_workers(AV1Decoder *pbi, const uint8_t *data_end,
+ int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+ AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+ DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
+
+ thread_data->data_end = data_end;
+
+ worker->had_error = 0;
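+ // The last worker runs its hook synchronously on the calling thread; the
+ // remaining workers are launched on their own threads.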
+ if (worker_idx == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+}
+
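+// Waits for all workers to finish and folds their corruption status into
+// pbi->mb.corrupted.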
+static void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int corrupted = 0;
+
+ for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) {
+ AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1];
+ aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker));
+ }
+
+ pbi->mb.corrupted = corrupted;
+}
+
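+// Creates the worker threads and their thread data on first use, and
+// (re)allocates each worker's motion-compensation scratch buffers whenever
+// the required buffer size changes (e.g. after a bit-depth change).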
+static void decode_mt_init(AV1Decoder *pbi) {
+ AV1_COMMON *const cm = &pbi->common;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int worker_idx;
+
+ // Create workers and thread_data
+ if (pbi->num_workers == 0) {
+ const int num_threads = pbi->max_threads;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
+ CHECK_MEM_ERROR(cm, pbi->thread_data,
+ aom_malloc(num_threads * sizeof(*pbi->thread_data)));
+
+ for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
+ AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ ++pbi->num_workers;
+
+ winterface->init(worker);
+ if (worker_idx < num_threads - 1 && !winterface->reset(worker)) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+
+ if (worker_idx < num_threads - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+ } else {
+ // Main thread acts as a worker and uses the thread data in pbi
+ thread_data->td = &pbi->td;
+ }
+ thread_data->error_info.error_code = AOM_CODEC_OK;
+ thread_data->error_info.setjmp = 0;
+ }
+ }
+ const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+ const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
+ for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ if (thread_data->td->mc_buf_size != buf_size) {
+ av1_free_mc_tmp_buf(thread_data->td);
+ allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
+ }
+ }
+}
+
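+// Reallocates the job queue if the tile configuration changed, enqueues the
+// tile jobs for this tile group and sorts them so the largest tiles are
+// decoded first.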
+static void tile_mt_queue(AV1Decoder *pbi, int tile_cols, int tile_rows,
+ int tile_rows_start, int tile_rows_end,
+ int tile_cols_start, int tile_cols_end,
+ int start_tile, int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ if (pbi->tile_mt_info.alloc_tile_cols != tile_cols ||
+ pbi->tile_mt_info.alloc_tile_rows != tile_rows) {
+ av1_dealloc_dec_jobs(&pbi->tile_mt_info);
+ alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols);
+ }
+ enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
+ tile_cols_end, start_tile, end_tile);
+ qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued,
+ sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers);
+}
+
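+// Decodes the tiles in [start_tile, end_tile] with up to pbi->max_threads
+// workers, each worker pulling whole-tile jobs from the shared queue. Returns
+// a pointer to the end of the decoded tile data.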
+static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end, int start_tile,
+ int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int tile_count_tg;
+ int num_workers;
+ const uint8_t *raw_data_end = NULL;
+
+ if (cm->large_scale_tile) {
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
+ }
+ tile_count_tg = end_tile - start_tile + 1;
+ num_workers = AOMMIN(pbi->max_threads, tile_count_tg);
+
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // The index of the first tile is greater than end_tile.
+ tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+ // The index of the last tile is less than start_tile.
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+ assert(tile_count_tg > 0);
+ assert(num_workers > 0);
+ assert(start_tile <= end_tile);
+ assert(start_tile >= 0 && end_tile < n_tiles);
+
+ decode_mt_init(pbi);
+
+ // Get the data pointer and size of each tile buffer in the tile group
+#if EXT_TILE_DEBUG
+ if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
+ if (cm->large_scale_tile)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+#endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
+
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
+
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+ av1_tile_init(&tile_data->tile_info, cm, row, col);
+ }
+ }
+
+ tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+ tile_cols_start, tile_cols_end, start_tile, end_tile);
+
+ reset_dec_workers(pbi, tile_worker_hook, num_workers);
+ launch_dec_workers(pbi, data_end, num_workers);
+ sync_dec_workers(pbi, num_workers);
+
+ if (pbi->mb.corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+
+ if (cm->large_scale_tile) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+ }
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+}
+
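+// Allocates one coefficient buffer per superblock in the frame; in row-MT
+// decoding these buffers carry parsed data from the parse stage to the
+// reconstruction stage.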
+static void dec_alloc_cb_buf(AV1Decoder *pbi) {
+ AV1_COMMON *const cm = &pbi->common;
+ int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) *
+ ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1);
+
+ if (pbi->cb_buffer_alloc_size < size) {
+ av1_dec_free_cb_buf(pbi);
+ CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
+ aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size));
+ pbi->cb_buffer_alloc_size = size;
+ }
+}
+
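+// Initializes the frame-level row-MT bookkeeping and each tile's row sync
+// state (parse/decode progress counters, rows to decode), and lazily creates
+// the shared mutex and condition variable used by the worker hooks.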
+static void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
+ int tile_rows_end, int tile_cols_start,
+ int tile_cols_end, int start_tile, int end_tile,
+ int max_sb_rows) {
+ AV1_COMMON *const cm = &pbi->common;
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+
+ frame_row_mt_info->tile_rows_start = tile_rows_start;
+ frame_row_mt_info->tile_rows_end = tile_rows_end;
+ frame_row_mt_info->tile_cols_start = tile_cols_start;
+ frame_row_mt_info->tile_cols_end = tile_cols_end;
+ frame_row_mt_info->start_tile = start_tile;
+ frame_row_mt_info->end_tile = end_tile;
+ frame_row_mt_info->mi_rows_to_decode = 0;
+ frame_row_mt_info->mi_rows_parse_done = 0;
+ frame_row_mt_info->mi_rows_decode_started = 0;
+ frame_row_mt_info->row_mt_exit = 0;
+
+ for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+ if (tile_row * cm->tile_cols + tile_col < start_tile ||
+ tile_row * cm->tile_cols + tile_col > end_tile)
+ continue;
+
+ TileDataDec *const tile_data =
+ pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+ TileInfo tile_info = tile_data->tile_info;
+
+ tile_data->dec_row_mt_sync.mi_rows_parse_done = 0;
+ tile_data->dec_row_mt_sync.mi_rows_decode_started = 0;
+ tile_data->dec_row_mt_sync.num_threads_working = 0;
+ tile_data->dec_row_mt_sync.mi_rows =
+ ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start,
+ cm->seq_params.mib_size_log2);
+ tile_data->dec_row_mt_sync.mi_cols =
+ ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start,
+ cm->seq_params.mib_size_log2);
+
+ frame_row_mt_info->mi_rows_to_decode +=
+ tile_data->dec_row_mt_sync.mi_rows;
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(tile_data->dec_row_mt_sync.cur_sb_col, -1,
+ sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows);
+ }
+ }
+
+#if CONFIG_MULTITHREAD
+ if (pbi->row_mt_mutex_ == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_,
+ aom_malloc(sizeof(*(pbi->row_mt_mutex_))));
+ if (pbi->row_mt_mutex_) {
+ pthread_mutex_init(pbi->row_mt_mutex_, NULL);
+ }
+ }
+
+ if (pbi->row_mt_cond_ == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->row_mt_cond_,
+ aom_malloc(sizeof(*(pbi->row_mt_cond_))));
+ if (pbi->row_mt_cond_) {
+ pthread_cond_init(pbi->row_mt_cond_, NULL);
+ }
+ }
+#endif
+}
+
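+// Row-based multithreading counterpart of decode_tiles_mt(): parsing and
+// superblock-row reconstruction are dispatched as separate jobs, so rows can
+// be reconstructed as soon as their bit-stream has been parsed.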
+static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ int start_tile, int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int tile_count_tg;
+ int num_workers;
+ const uint8_t *raw_data_end = NULL;
+ int max_sb_rows = 0;
+
+ if (cm->large_scale_tile) {
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
+ }
+ tile_count_tg = end_tile - start_tile + 1;
+ num_workers = pbi->max_threads;
+
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // The index of the first tile is greater than end_tile.
+ tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+ // The index of the last tile is less than start_tile.
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+ assert(tile_count_tg > 0);
+ assert(num_workers > 0);
+ assert(start_tile <= end_tile);
+ assert(start_tile >= 0 && end_tile < n_tiles);
+
+ (void)tile_count_tg;
+
+ decode_mt_init(pbi);
+
+ // Get the data pointer and size of each tile buffer in the tile group
+#if EXT_TILE_DEBUG
+ if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
+ if (cm->large_scale_tile)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+#endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
+
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ for (int i = 0; i < pbi->allocated_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ }
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
+
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+ av1_tile_init(&tile_data->tile_info, cm, row, col);
+
+ max_sb_rows = AOMMAX(max_sb_rows,
+ av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
+ }
+ }
+
+ if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
+ for (int i = 0; i < n_tiles; ++i) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows);
+ }
+ pbi->allocated_row_mt_sync_rows = max_sb_rows;
+ }
+
+ tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+ tile_cols_start, tile_cols_end, start_tile, end_tile);
+
+ dec_alloc_cb_buf(pbi);
+
+ row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start,
+ tile_cols_end, start_tile, end_tile, max_sb_rows);
+
+ reset_dec_workers(pbi, row_mt_worker_hook, num_workers);
+ launch_dec_workers(pbi, data_end, num_workers);
+ sync_dec_workers(pbi, num_workers);
+
+ if (pbi->mb.corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+
+ if (cm->large_scale_tile) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+ }
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+}
+
+static void error_handler(void *data) {
+ AV1_COMMON *const cm = (AV1_COMMON *)data;
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
+}
+
+// Reads the high_bitdepth and twelve_bit fields in color_config() and sets
+// seq_params->bit_depth based on the values of those fields and
+// seq_params->profile. Reports errors by calling rb->error_handler() or
+// aom_internal_error().
+static void read_bitdepth(struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info) {
+ const int high_bitdepth = aom_rb_read_bit(rb);
+ if (seq_params->profile == PROFILE_2 && high_bitdepth) {
+ const int twelve_bit = aom_rb_read_bit(rb);
+ seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
+ } else if (seq_params->profile <= PROFILE_2) {
+ seq_params->bit_depth = high_bitdepth ? AOM_BITS_10 : AOM_BITS_8;
+ } else {
+ aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Unsupported profile/bit-depth combination");
+ }
+}
+
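+// Reads the film_grain_params() syntax: either inherits the parameters from a
+// reference frame (with a fresh random seed) or reads the scaling functions,
+// AR coefficients and chroma multipliers from the bit stream.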
+void av1_read_film_grain_params(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ aom_film_grain_t *pars = &cm->film_grain_params;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+
+ pars->apply_grain = aom_rb_read_bit(rb);
+ if (!pars->apply_grain) {
+ memset(pars, 0, sizeof(*pars));
+ return;
+ }
+
+ pars->random_seed = aom_rb_read_literal(rb, 16);
+ if (cm->frame_type == INTER_FRAME)
+ pars->update_parameters = aom_rb_read_bit(rb);
+ else
+ pars->update_parameters = 1;
+
+ pars->bit_depth = seq_params->bit_depth;
+
+ if (!pars->update_parameters) {
+ // inherit parameters from a previous reference frame
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3);
+ int buf_idx = cm->ref_frame_map[film_grain_params_ref_idx];
+ if (buf_idx == INVALID_IDX) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid Film grain reference idx");
+ }
+ if (!frame_bufs[buf_idx].film_grain_params_present) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Film grain reference parameters not available");
+ }
+ uint16_t random_seed = pars->random_seed;
+ *pars = frame_bufs[buf_idx].film_grain_params; // inherit parameters
+ pars->random_seed = random_seed; // with new random seed
+ return;
+ }
+
+ // Scaling function parameters
+ pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14
+ if (pars->num_y_points > 14)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain luma scaling function "
+ "exceeds the maximum value.");
+ for (int i = 0; i < pars->num_y_points; i++) {
+ pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8);
+ if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0])
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ if (!seq_params->monochrome)
+ pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
+ else
+ pars->chroma_scaling_from_luma = 0;
+
+ if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
+ ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
+ (pars->num_y_points == 0))) {
+ pars->num_cb_points = 0;
+ pars->num_cr_points = 0;
+ } else {
+ pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10
+ if (pars->num_cb_points > 10)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain cb scaling function "
+ "exceeds the maximum value.");
+ for (int i = 0; i < pars->num_cb_points; i++) {
+ pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8);
+ if (i &&
+ pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0])
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10
+ if (pars->num_cr_points > 10)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain cr scaling function "
+ "exceeds the maximum value.");
+ for (int i = 0; i < pars->num_cr_points; i++) {
+ pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8);
+ if (i &&
+ pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0])
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
+ (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
+ ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "In YCbCr 4:2:0, film grain shall be applied "
+ "to both chroma components or neither.");
+ }
+
+ pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8; // 8 + value
+
+ // AR coefficients
+ // Only sent if the corresponding scaling function has
+ // more than 0 points
+
+ pars->ar_coeff_lag = aom_rb_read_literal(rb, 2);
+
+ int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ int num_pos_chroma = num_pos_luma;
+ if (pars->num_y_points > 0) ++num_pos_chroma;
+
+ if (pars->num_y_points)
+ for (int i = 0; i < num_pos_luma; i++)
+ pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128;
+
+ if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128;
+
+ if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128;
+
+ pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6; // 6 + value
+
+ pars->grain_scale_shift = aom_rb_read_literal(rb, 2);
+
+ if (pars->num_cb_points) {
+ pars->cb_mult = aom_rb_read_literal(rb, 8);
+ pars->cb_luma_mult = aom_rb_read_literal(rb, 8);
+ pars->cb_offset = aom_rb_read_literal(rb, 9);
+ }
+
+ if (pars->num_cr_points) {
+ pars->cr_mult = aom_rb_read_literal(rb, 8);
+ pars->cr_luma_mult = aom_rb_read_literal(rb, 8);
+ pars->cr_offset = aom_rb_read_literal(rb, 9);
+ }
+
+ pars->overlap_flag = aom_rb_read_bit(rb);
+
+ pars->clip_to_restricted_range = aom_rb_read_bit(rb);
+}
+
+static void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ if (cm->seq_params.film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
+ av1_read_film_grain_params(cm, rb);
+ } else {
+ memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+ }
+ cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+ memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params,
+ sizeof(aom_film_grain_t));
+}
+
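+// Reads color_config(): bit depth, monochrome flag, optional CICP color
+// description, color range, chroma subsampling and chroma sample position.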
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+ int allow_lowbitdepth, SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info) {
+ read_bitdepth(rb, seq_params, error_info);
+
+ seq_params->use_highbitdepth =
+ seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
+ // monochrome bit (not needed for PROFILE_1)
+ const int is_monochrome =
+ seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
+ seq_params->monochrome = is_monochrome;
+ int color_description_present_flag = aom_rb_read_bit(rb);
+ if (color_description_present_flag) {
+ seq_params->color_primaries = aom_rb_read_literal(rb, 8);
+ seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8);
+ seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8);
+ } else {
+ seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED;
+ seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+ seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+ }
+ if (is_monochrome) {
+ // [16,235] (including xvycc) vs [0,255] range
+ seq_params->color_range = aom_rb_read_bit(rb);
+ seq_params->subsampling_y = seq_params->subsampling_x = 1;
+ seq_params->chroma_sample_position = AOM_CSP_UNKNOWN;
+ seq_params->separate_uv_delta_q = 0;
+ return;
+ }
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ // It would be good to remove this dependency.
+ seq_params->subsampling_y = seq_params->subsampling_x = 0;
+ seq_params->color_range = 1; // assume full color-range
+ if (!(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12))) {
+ aom_internal_error(
+ error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "sRGB colorspace not compatible with specified profile");
+ }
+ } else {
+ // [16,235] (including xvycc) vs [0,255] range
+ seq_params->color_range = aom_rb_read_bit(rb);
+ if (seq_params->profile == PROFILE_0) {
+ // 420 only
+ seq_params->subsampling_x = seq_params->subsampling_y = 1;
+ } else if (seq_params->profile == PROFILE_1) {
+ // 444 only
+ seq_params->subsampling_x = seq_params->subsampling_y = 0;
+ } else {
+ assert(seq_params->profile == PROFILE_2);
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ seq_params->subsampling_x = aom_rb_read_bit(rb);
+ if (seq_params->subsampling_x)
+ seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420
+ else
+ seq_params->subsampling_y = 0; // 444
+ } else {
+ // 422
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 0;
+ }
+ }
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
+ (seq_params->subsampling_x || seq_params->subsampling_y)) {
+ aom_internal_error(
+ error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Identity CICP Matrix incompatible with non 4:4:4 color sampling");
+ }
+ if (seq_params->subsampling_x && seq_params->subsampling_y) {
+ seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2);
+ }
+ }
+ seq_params->separate_uv_delta_q = aom_rb_read_bit(rb);
+}
+
+void av1_read_timing_info_header(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ cm->timing_info.num_units_in_display_tick = aom_rb_read_unsigned_literal(
+ rb, 32); // Number of units in a display tick
+ cm->timing_info.time_scale =
+ aom_rb_read_unsigned_literal(rb, 32); // Time scale
+ if (cm->timing_info.num_units_in_display_tick == 0 ||
+ cm->timing_info.time_scale == 0) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "num_units_in_display_tick and time_scale must be greater than 0.");
+ }
+ cm->timing_info.equal_picture_interval =
+ aom_rb_read_bit(rb); // Equal picture interval bit
+ if (cm->timing_info.equal_picture_interval) {
+ cm->timing_info.num_ticks_per_picture =
+ aom_rb_read_uvlc(rb) + 1; // ticks per picture
+ if (cm->timing_info.num_ticks_per_picture == 0) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.");
+ }
+ }
+}
+
+void av1_read_decoder_model_info(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ cm->buffer_model.encoder_decoder_buffer_delay_length =
+ aom_rb_read_literal(rb, 5) + 1;
+ cm->buffer_model.num_units_in_decoding_tick = aom_rb_read_unsigned_literal(
+ rb, 32); // Number of units in a decoding tick
+ cm->buffer_model.buffer_removal_time_length = aom_rb_read_literal(rb, 5) + 1;
+ cm->buffer_model.frame_presentation_time_length =
+ aom_rb_read_literal(rb, 5) + 1;
+}
+
+void av1_read_op_parameters_info(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb, int op_num) {
+ // The cm->op_params array has MAX_NUM_OPERATING_POINTS + 1 elements.
+ if (op_num > MAX_NUM_OPERATING_POINTS) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support %d decoder model operating points",
+ op_num + 1);
+ }
+
+ cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_unsigned_literal(
+ rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+ cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_unsigned_literal(
+ rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+ cm->op_params[op_num].low_delay_mode_flag = aom_rb_read_bit(rb);
+}
+
+static void av1_read_temporal_point_info(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb) {
+ cm->frame_presentation_time = aom_rb_read_unsigned_literal(
+ rb, cm->buffer_model.frame_presentation_time_length);
+}
+
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params) {
+ const int num_bits_width = aom_rb_read_literal(rb, 4) + 1;
+ const int num_bits_height = aom_rb_read_literal(rb, 4) + 1;
+ const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1;
+ const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1;
+
+ seq_params->num_bits_width = num_bits_width;
+ seq_params->num_bits_height = num_bits_height;
+ seq_params->max_frame_width = max_frame_width;
+ seq_params->max_frame_height = max_frame_height;
+
+ if (seq_params->reduced_still_picture_hdr) {
+ seq_params->frame_id_numbers_present_flag = 0;
+ } else {
+ seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ // We must always have delta_frame_id_length < frame_id_length,
+ // in order for a frame to be referenced with a unique delta.
+ // Avoid wasting bits by using a coding that enforces this restriction.
+ seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2;
+ seq_params->frame_id_length =
+ aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1;
+ if (seq_params->frame_id_length > 16)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid frame_id_length");
+ }
+
+ setup_sb_size(seq_params, rb);
+
+ seq_params->enable_filter_intra = aom_rb_read_bit(rb);
+ seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb);
+
+ if (seq_params->reduced_still_picture_hdr) {
+ seq_params->enable_interintra_compound = 0;
+ seq_params->enable_masked_compound = 0;
+ seq_params->enable_warped_motion = 0;
+ seq_params->enable_dual_filter = 0;
+ seq_params->enable_order_hint = 0;
+ seq_params->enable_jnt_comp = 0;
+ seq_params->enable_ref_frame_mvs = 0;
+ seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS
+ seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV
+ seq_params->order_hint_bits_minus_1 = -1;
+ } else {
+ seq_params->enable_interintra_compound = aom_rb_read_bit(rb);
+ seq_params->enable_masked_compound = aom_rb_read_bit(rb);
+ seq_params->enable_warped_motion = aom_rb_read_bit(rb);
+ seq_params->enable_dual_filter = aom_rb_read_bit(rb);
+
+ seq_params->enable_order_hint = aom_rb_read_bit(rb);
+ seq_params->enable_jnt_comp =
+ seq_params->enable_order_hint ? aom_rb_read_bit(rb) : 0;
+ seq_params->enable_ref_frame_mvs =
+ seq_params->enable_order_hint ? aom_rb_read_bit(rb) : 0;
+
+ if (aom_rb_read_bit(rb)) {
+ seq_params->force_screen_content_tools =
+ 2; // SELECT_SCREEN_CONTENT_TOOLS
+ } else {
+ seq_params->force_screen_content_tools = aom_rb_read_bit(rb);
+ }
+
+ if (seq_params->force_screen_content_tools > 0) {
+ if (aom_rb_read_bit(rb)) {
+ seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV
+ } else {
+ seq_params->force_integer_mv = aom_rb_read_bit(rb);
+ }
+ } else {
+ seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV
+ }
+ seq_params->order_hint_bits_minus_1 =
+ seq_params->enable_order_hint ? aom_rb_read_literal(rb, 3) : -1;
+ }
+
+ seq_params->enable_superres = aom_rb_read_bit(rb);
+ seq_params->enable_cdef = aom_rb_read_bit(rb);
+ seq_params->enable_restoration = aom_rb_read_bit(rb);
+}
+
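+// Reads the global motion parameters for one reference frame, coded
+// differentially against ref_params. Returns 0 if the resulting model has
+// invalid shear parameters, 1 otherwise.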
+static int read_global_motion_params(WarpedMotionParams *params,
+ const WarpedMotionParams *ref_params,
+ struct aom_read_bit_buffer *rb,
+ int allow_hp) {
+ TransformationType type = aom_rb_read_bit(rb);
+ if (type != IDENTITY) {
+ if (aom_rb_read_bit(rb))
+ type = ROTZOOM;
+ else
+ type = aom_rb_read_bit(rb) ? TRANSLATION : AFFINE;
+ }
+
+ *params = default_warp_params;
+ params->wmtype = type;
+
+ if (type >= ROTZOOM) {
+ params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS)) *
+ GM_ALPHA_DECODE_FACTOR +
+ (1 << WARPEDMODEL_PREC_BITS);
+ params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) *
+ GM_ALPHA_DECODE_FACTOR;
+ }
+
+ if (type >= AFFINE) {
+ params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) *
+ GM_ALPHA_DECODE_FACTOR;
+ params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS)) *
+ GM_ALPHA_DECODE_FACTOR +
+ (1 << WARPEDMODEL_PREC_BITS);
+ } else {
+ params->wmmat[4] = -params->wmmat[3];
+ params->wmmat[5] = params->wmmat[2];
+ }
+
+ if (type >= TRANSLATION) {
+ const int trans_bits = (type == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ const int trans_dec_factor =
+ (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp)
+ : GM_TRANS_DECODE_FACTOR;
+ const int trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff)) *
+ trans_dec_factor;
+ params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin(
+ rb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff)) *
+ trans_dec_factor;
+ }
+
+ if (params->wmtype <= AFFINE) {
+ int good_shear_params = get_shear_params(params);
+ if (!good_shear_params) return 0;
+ }
+
+ return 1;
+}
+
+static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ const WarpedMotionParams *ref_params =
+ cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+ : &default_warp_params;
+ int good_params = read_global_motion_params(
+ &cm->global_motion[frame], ref_params, rb, cm->allow_high_precision_mv);
+ if (!good_params) {
+#if WARPED_MOTION_DEBUG
+ printf("Warning: unexpected global motion shear params from aomenc\n");
+#endif
+ cm->global_motion[frame].invalid = 1;
+ }
+
+ // TODO(sarahparker, debargha): The logic in the commented out code below
+ // does not work currently and causes mismatches when resize is on. Fix it
+ // before turning the optimization back on.
+ /*
+ YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(cm, frame);
+ if (cm->width == ref_buf->y_crop_width &&
+ cm->height == ref_buf->y_crop_height) {
+ read_global_motion_params(&cm->global_motion[frame],
+ &cm->prev_frame->global_motion[frame], rb,
+ cm->allow_high_precision_mv);
+ } else {
+ cm->global_motion[frame] = default_warp_params;
+ }
+ */
+ /*
+ printf("Dec Ref %d [%d/%d]: %d %d %d %d\n",
+ frame, cm->current_video_frame, cm->show_frame,
+ cm->global_motion[frame].wmmat[0],
+ cm->global_motion[frame].wmmat[1],
+ cm->global_motion[frame].wmmat[2],
+ cm->global_motion[frame].wmmat[3]);
+ */
+ }
+ memcpy(cm->cur_frame->global_motion, cm->global_motion,
+ REF_FRAMES * sizeof(WarpedMotionParams));
+}
+
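+// Called when a show_existing_frame refers to a key frame: the frame is
+// treated as a key frame, every reference slot is refreshed with it and the
+// frame context that was saved for it is reloaded.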
+static void show_existing_frame_reset(AV1Decoder *const pbi,
+ int existing_frame_idx) {
+ AV1_COMMON *const cm = &pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+ assert(cm->show_existing_frame);
+
+ cm->frame_type = KEY_FRAME;
+
+ pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ cm->frame_refs[i].idx = INVALID_IDX;
+ cm->frame_refs[i].buf = NULL;
+ }
+
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+
+ cm->cur_frame->intra_only = 1;
+
+ if (cm->seq_params.frame_id_numbers_present_flag) {
+ /* If bitmask is set, update reference frame id values and
+ mark frames as valid for reference.
+ Note that the displayed frame must be valid for referencing
+ in order to have been selected.
+ */
+ int refresh_frame_flags = pbi->refresh_frame_flags;
+ int display_frame_id = cm->ref_frame_id[existing_frame_idx];
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if ((refresh_frame_flags >> i) & 1) {
+ cm->ref_frame_id[i] = display_frame_id;
+ cm->valid_for_referencing[i] = 1;
+ }
+ }
+ }
+
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ // Generate next_ref_frame_map.
+ lock_buffer_pool(pool);
+ int ref_index = 0;
+ for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ if (mask & 1) {
+ cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+ ++frame_bufs[cm->new_fb_idx].ref_count;
+ } else {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+ }
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ ++ref_index;
+ }
+
+ for (; ref_index < REF_FRAMES; ++ref_index) {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ }
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 1;
+
+ // Reload the adapted CDFs from when we originally coded this keyframe
+ *cm->fc = cm->frame_contexts[existing_frame_idx];
+}
+
+static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int i;
+
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+ lock_buffer_pool(cm->buffer_pool);
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ if (i != cm->new_fb_idx) {
+ frame_bufs[i].ref_count = 0;
+ cm->buffer_pool->release_fb_cb(cm->buffer_pool->cb_priv,
+ &frame_bufs[i].raw_frame_buffer);
+ } else {
+ assert(frame_bufs[i].ref_count == 1);
+ }
+ frame_bufs[i].cur_frame_offset = 0;
+ av1_zero(frame_bufs[i].ref_frame_offset);
+ }
+ av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers);
+ unlock_buffer_pool(cm->buffer_pool);
+}
+
+// On success, returns 0. On failure, calls aom_internal_error and does not
+// return.
+static int read_uncompressed_header(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ MACROBLOCKD *const xd = &pbi->mb;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+ if (!pbi->sequence_header_ready) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "No sequence header");
+ }
+
+ cm->last_frame_type = cm->frame_type;
+ cm->last_intra_only = cm->intra_only;
+
+ // NOTE: By default, all coded frames are treated as usable for reference
+ cm->is_reference_frame = 1;
+
+ if (seq_params->reduced_still_picture_hdr) {
+ cm->show_existing_frame = 0;
+ cm->show_frame = 1;
+ cm->frame_type = KEY_FRAME;
+ cm->error_resilient_mode = 1;
+ } else {
+ cm->show_existing_frame = aom_rb_read_bit(rb);
+ cm->reset_decoder_state = 0;
+
+ if (cm->show_existing_frame) {
+ if (pbi->sequence_header_changed) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "New sequence header starts with a show_existing_frame.");
+ }
+ // Show an existing frame directly.
+ const int existing_frame_idx = aom_rb_read_literal(rb, 3);
+ const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
+ if (seq_params->decoder_model_info_present_flag &&
+ cm->timing_info.equal_picture_interval == 0) {
+ av1_read_temporal_point_info(cm, rb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
+ /* Compare display_frame_id with ref_frame_id and check valid for
+ * referencing */
+ if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
+ cm->valid_for_referencing[existing_frame_idx] == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference buffer frame ID mismatch");
+ }
+ lock_buffer_pool(pool);
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+ unlock_buffer_pool(pool);
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer %d does not contain a decoded frame",
+ frame_to_show);
+ }
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+ cm->reset_decoder_state =
+ frame_bufs[frame_to_show].frame_type == KEY_FRAME;
+ unlock_buffer_pool(pool);
+
+ cm->lf.filter_level[0] = 0;
+ cm->lf.filter_level[1] = 0;
+ cm->show_frame = 1;
+
+ if (!frame_bufs[frame_to_show].showable_frame) {
+ aom_merge_corrupted_flag(&xd->corrupted, 1);
+ }
+ if (cm->reset_decoder_state) frame_bufs[frame_to_show].showable_frame = 0;
+
+ cm->film_grain_params = frame_bufs[frame_to_show].film_grain_params;
+
+ if (cm->reset_decoder_state) {
+ show_existing_frame_reset(pbi, existing_frame_idx);
+ } else {
+ pbi->refresh_frame_flags = 0;
+ }
+
+ return 0;
+ }
+
+ cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits
+ if (pbi->sequence_header_changed) {
+ if (pbi->common.frame_type == KEY_FRAME) {
+ // This is the start of a new coded video sequence.
+ pbi->sequence_header_changed = 0;
+ pbi->decoding_first_frame = 1;
+ reset_frame_buffers(&pbi->common);
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Sequence header has changed without a keyframe.");
+ }
+ }
+
+ cm->show_frame = aom_rb_read_bit(rb);
+ if (seq_params->still_picture &&
+ (cm->frame_type != KEY_FRAME || !cm->show_frame)) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Still pictures must be coded as shown keyframes");
+ }
+ cm->showable_frame = cm->frame_type != KEY_FRAME;
+ if (cm->show_frame) {
+ if (seq_params->decoder_model_info_present_flag &&
+ cm->timing_info.equal_picture_interval == 0)
+ av1_read_temporal_point_info(cm, rb);
+ } else {
+ // See if this frame can be used as show_existing_frame in future
+ cm->showable_frame = aom_rb_read_bit(rb);
+ }
+ cm->cur_frame->showable_frame = cm->showable_frame;
+ cm->intra_only = cm->frame_type == INTRA_ONLY_FRAME;
+ cm->error_resilient_mode =
+ frame_is_sframe(cm) || (cm->frame_type == KEY_FRAME && cm->show_frame)
+ ? 1
+ : aom_rb_read_bit(rb);
+ }
+
+ cm->disable_cdf_update = aom_rb_read_bit(rb);
+ if (seq_params->force_screen_content_tools == 2) {
+ cm->allow_screen_content_tools = aom_rb_read_bit(rb);
+ } else {
+ cm->allow_screen_content_tools = seq_params->force_screen_content_tools;
+ }
+
+ if (cm->allow_screen_content_tools) {
+ if (seq_params->force_integer_mv == 2) {
+ cm->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
+ } else {
+ cm->cur_frame_force_integer_mv = seq_params->force_integer_mv;
+ }
+ } else {
+ cm->cur_frame_force_integer_mv = 0;
+ }
+
+ cm->frame_refs_short_signaling = 0;
+ int frame_size_override_flag = 0;
+ cm->allow_intrabc = 0;
+ cm->primary_ref_frame = PRIMARY_REF_NONE;
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int prev_frame_id = 0;
+ int have_prev_frame_id = !pbi->decoding_first_frame &&
+ !(cm->frame_type == KEY_FRAME && cm->show_frame);
+ if (have_prev_frame_id) {
+ prev_frame_id = cm->current_frame_id;
+ }
+ cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length);
+
+ if (have_prev_frame_id) {
+ int diff_frame_id;
+ if (cm->current_frame_id > prev_frame_id) {
+ diff_frame_id = cm->current_frame_id - prev_frame_id;
+ } else {
+ diff_frame_id =
+ (1 << frame_id_length) + cm->current_frame_id - prev_frame_id;
+ }
+ /* Check current_frame_id for conformance */
+ if (prev_frame_id == cm->current_frame_id ||
+ diff_frame_id >= (1 << (frame_id_length - 1))) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid value of current_frame_id");
+ }
+ }
+ /* Check if some frames need to be marked as not valid for referencing */
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (cm->frame_type == KEY_FRAME && cm->show_frame) {
+ cm->valid_for_referencing[i] = 0;
+ } else if (cm->current_frame_id - (1 << diff_len) > 0) {
+ if (cm->ref_frame_id[i] > cm->current_frame_id ||
+ cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
+ cm->valid_for_referencing[i] = 0;
+ } else {
+ if (cm->ref_frame_id[i] > cm->current_frame_id &&
+ cm->ref_frame_id[i] < (1 << frame_id_length) +
+ cm->current_frame_id - (1 << diff_len))
+ cm->valid_for_referencing[i] = 0;
+ }
+ }
+ }
+
+ frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb);
+
+ cm->frame_offset =
+ aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
+ cm->current_video_frame = cm->frame_offset;
+
+ if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
+ cm->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
+ }
+ }
+
+ if (seq_params->decoder_model_info_present_flag) {
+ cm->buffer_removal_time_present = aom_rb_read_bit(rb);
+ if (cm->buffer_removal_time_present) {
+ for (int op_num = 0;
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
+ if (cm->op_params[op_num].decoder_model_param_present_flag) {
+ if ((((seq_params->operating_point_idc[op_num] >>
+ cm->temporal_layer_id) &
+ 0x1) &&
+ ((seq_params->operating_point_idc[op_num] >>
+ (cm->spatial_layer_id + 8)) &
+ 0x1)) ||
+ seq_params->operating_point_idc[op_num] == 0) {
+ cm->op_frame_timing[op_num].buffer_removal_time =
+ aom_rb_read_unsigned_literal(
+ rb, cm->buffer_model.buffer_removal_time_length);
+ } else {
+ cm->op_frame_timing[op_num].buffer_removal_time = 0;
+ }
+ } else {
+ cm->op_frame_timing[op_num].buffer_removal_time = 0;
+ }
+ }
+ }
+ }
+ if (cm->frame_type == KEY_FRAME) {
+ if (!cm->show_frame) // unshown keyframe (forward keyframe)
+ pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ else // shown keyframe
+ pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ cm->frame_refs[i].idx = INVALID_IDX;
+ cm->frame_refs[i].buf = NULL;
+ }
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+ } else {
+ if (cm->intra_only) {
+ pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ if (pbi->refresh_frame_flags == 0xFF) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Intra only frames cannot have refresh flags 0xFF");
+ }
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+ } else if (pbi->need_resync != 1) { /* Skip if need resync */
+ pbi->refresh_frame_flags =
+ frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES);
+ if (!pbi->refresh_frame_flags) {
+ // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame
+ // will not be used as a reference
+ cm->is_reference_frame = 0;
+ }
+ }
+ }
+
+ if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) {
+ // Read all ref frame order hints if error_resilient_mode == 1
+ if (cm->error_resilient_mode && seq_params->enable_order_hint) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ // Read order hint from bit stream
+ unsigned int frame_offset =
+ aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
+ // Get buffer index
+ int buf_idx = cm->ref_frame_map[ref_idx];
+ assert(buf_idx < FRAME_BUFFERS);
+ if (buf_idx == -1 ||
+ frame_offset != frame_bufs[buf_idx].cur_frame_offset) {
+ if (buf_idx >= 0) {
+ lock_buffer_pool(pool);
+ decrease_ref_count(buf_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ }
+ // If no corresponding buffer exists, allocate a new buffer with all
+ // pixels set to neutral grey.
+ buf_idx = get_free_fb(cm);
+ if (buf_idx == INVALID_IDX) {
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Unable to find free frame buffer");
+ }
+ lock_buffer_pool(pool);
+ if (aom_realloc_frame_buffer(
+ &frame_bufs[buf_idx].buf, seq_params->max_frame_width,
+ seq_params->max_frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool);
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ unlock_buffer_pool(pool);
+ set_planes_to_neutral_grey(seq_params, &frame_bufs[buf_idx].buf, 0);
+
+ cm->ref_frame_map[ref_idx] = buf_idx;
+ frame_bufs[buf_idx].cur_frame_offset = frame_offset;
+ }
+ }
+ }
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ setup_frame_size(cm, frame_size_override_flag, rb);
+
+ if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+ cm->allow_intrabc = aom_rb_read_bit(rb);
+ cm->allow_ref_frame_mvs = 0;
+ cm->prev_frame = NULL;
+ } else {
+ cm->allow_ref_frame_mvs = 0;
+
+ if (cm->intra_only) {
+ cm->cur_frame->film_grain_params_present =
+ seq_params->film_grain_params_present;
+ setup_frame_size(cm, frame_size_override_flag, rb);
+ if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+ cm->allow_intrabc = aom_rb_read_bit(rb);
+
+ } else if (pbi->need_resync != 1) { /* Skip if need resync */
+
+ // Frame refs short signaling is off when error resilient mode is on.
+ if (seq_params->enable_order_hint)
+ cm->frame_refs_short_signaling = aom_rb_read_bit(rb);
+
+ if (cm->frame_refs_short_signaling) {
+ // == LAST_FRAME ==
+ const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const int lst_idx = cm->ref_frame_map[lst_ref];
+
+ // == GOLDEN_FRAME ==
+ const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const int gld_idx = cm->ref_frame_map[gld_ref];
+
+ // Most of the time, streams start with a keyframe. In that case,
+ // ref_frame_map will have been filled in at that point and will not
+ // contain any -1's. However, streams are explicitly allowed to start
+ // with an intra-only frame, so long as they don't then signal a
+ // reference to a slot that hasn't been set yet. That's what we are
+ // checking here.
+ if (lst_idx == -1)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+ if (gld_idx == -1)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+
+ av1_set_frame_refs(cm, lst_ref, gld_ref);
+ }
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ int ref = 0;
+ if (!cm->frame_refs_short_signaling) {
+ ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const int idx = cm->ref_frame_map[ref];
+
+ // Most of the time, streams start with a keyframe. In that case,
+ // ref_frame_map will have been filled in at that point and will not
+ // contain any -1's. However, streams are explicitly allowed to start
+ // with an intra-only frame, so long as they don't then signal a
+ // reference to a slot that hasn't been set yet. That's what we are
+ // checking here.
+ if (idx == -1)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ ref_frame->idx = idx;
+ ref_frame->buf = &frame_bufs[idx].buf;
+ ref_frame->map_idx = ref;
+ } else {
+ ref = cm->frame_refs[i].map_idx;
+ }
+
+ cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len);
+ int ref_frame_id =
+ ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) +
+ (1 << frame_id_length)) %
+ (1 << frame_id_length));
+ // Compare values derived from delta_frame_id_minus_1 and
+ // refresh_frame_flags. Also, check valid for referencing
+ if (ref_frame_id != cm->ref_frame_id[ref] ||
+ cm->valid_for_referencing[ref] == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference buffer frame ID mismatch");
+ }
+ }
+
+ if (!cm->error_resilient_mode && frame_size_override_flag) {
+ setup_frame_size_with_refs(cm, rb);
+ } else {
+ setup_frame_size(cm, frame_size_override_flag, rb);
+ }
+
+ if (cm->cur_frame_force_integer_mv) {
+ cm->allow_high_precision_mv = 0;
+ } else {
+ cm->allow_high_precision_mv = aom_rb_read_bit(rb);
+ }
+ cm->interp_filter = read_frame_interp_filter(rb);
+ cm->switchable_motion_mode = aom_rb_read_bit(rb);
+ }
+
+ cm->prev_frame = get_prev_frame(cm);
+ if (cm->primary_ref_frame != PRIMARY_REF_NONE &&
+ cm->frame_refs[cm->primary_ref_frame].idx < 0) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference frame containing this frame's initial "
+ "frame context is unavailable.");
+ }
+
+ if (!cm->intra_only && pbi->need_resync != 1) {
+ if (frame_might_allow_ref_frame_mvs(cm))
+ cm->allow_ref_frame_mvs = aom_rb_read_bit(rb);
+ else
+ cm->allow_ref_frame_mvs = 0;
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_buf = &cm->frame_refs[i];
+ av1_setup_scale_factors_for_frame(
+ &ref_buf->sf, ref_buf->buf->y_crop_width,
+ ref_buf->buf->y_crop_height, cm->width, cm->height);
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ }
+ }
+ }
+
+ av1_setup_frame_buf_refs(cm);
+
+ av1_setup_frame_sign_bias(cm);
+
+ cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
+ cm->cur_frame->frame_type = cm->frame_type;
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ /* If bitmask is set, update reference frame id values and
+ mark frames as valid for reference */
+ int refresh_frame_flags = pbi->refresh_frame_flags;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if ((refresh_frame_flags >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ cm->valid_for_referencing[i] = 1;
+ }
+ }
+ }
+
+ const int might_bwd_adapt =
+ !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+ if (might_bwd_adapt) {
+ cm->refresh_frame_context = aom_rb_read_bit(rb)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ } else {
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+ }
+
+ get_frame_new_buffer(cm)->bit_depth = seq_params->bit_depth;
+ get_frame_new_buffer(cm)->color_primaries = seq_params->color_primaries;
+ get_frame_new_buffer(cm)->transfer_characteristics =
+ seq_params->transfer_characteristics;
+ get_frame_new_buffer(cm)->matrix_coefficients =
+ seq_params->matrix_coefficients;
+ get_frame_new_buffer(cm)->monochrome = seq_params->monochrome;
+ get_frame_new_buffer(cm)->chroma_sample_position =
+ seq_params->chroma_sample_position;
+ get_frame_new_buffer(cm)->color_range = seq_params->color_range;
+ get_frame_new_buffer(cm)->render_width = cm->render_width;
+ get_frame_new_buffer(cm)->render_height = cm->render_height;
+
+ if (pbi->need_resync) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Keyframe / intra-only frame required to reset decoder"
+ " state");
+ }
+
+ // Generate next_ref_frame_map.
+ lock_buffer_pool(pool);
+ int ref_index = 0;
+ for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ if (mask & 1) {
+ cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+ ++frame_bufs[cm->new_fb_idx].ref_count;
+ } else {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+ }
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ ++ref_index;
+ }
+
+ for (; ref_index < REF_FRAMES; ++ref_index) {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ }
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 1;
+
+ if (cm->allow_intrabc) {
+ // Set parameters corresponding to no filtering.
+ struct loopfilter *lf = &cm->lf;
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->nb_cdef_strengths = 1;
+ cm->cdef_uv_strengths[0] = 0;
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ }
+
+ read_tile_info(pbi, rb);
+ setup_quantization(cm, rb);
+ xd->bd = (int)seq_params->bit_depth;
+
+ if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
+ cm->num_allocated_above_context_mi_col < cm->mi_cols ||
+ cm->num_allocated_above_contexts < cm->tile_rows) {
+ av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+ if (av1_alloc_above_context_buffers(cm, cm->tile_rows))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+ av1_setup_past_independence(cm);
+ }
+
+ setup_segmentation(cm, rb);
+
+ cm->delta_q_res = 1;
+ cm->delta_lf_res = 1;
+ cm->delta_lf_present_flag = 0;
+ cm->delta_lf_multi = 0;
+ cm->delta_q_present_flag = cm->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
+ if (cm->delta_q_present_flag) {
+ xd->current_qindex = cm->base_qindex;
+ cm->delta_q_res = 1 << aom_rb_read_literal(rb, 2);
+ if (!cm->allow_intrabc) cm->delta_lf_present_flag = aom_rb_read_bit(rb);
+ if (cm->delta_lf_present_flag) {
+ cm->delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
+ cm->delta_lf_multi = aom_rb_read_bit(rb);
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
+
+ for (int i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
+ : cm->base_qindex;
+ xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
+ cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+ cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
+ xd->qindex[i] = qindex;
+ }
+ cm->coded_lossless = is_coded_lossless(cm, xd);
+ cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+ setup_segmentation_dequant(cm);
+ if (cm->coded_lossless) {
+ cm->lf.filter_level[0] = 0;
+ cm->lf.filter_level[1] = 0;
+ }
+ if (cm->coded_lossless || !seq_params->enable_cdef) {
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->cdef_uv_strengths[0] = 0;
+ }
+ if (cm->all_lossless || !seq_params->enable_restoration) {
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ }
+ setup_loopfilter(cm, rb);
+
+ if (!cm->coded_lossless && seq_params->enable_cdef) {
+ setup_cdef(cm, rb);
+ }
+ if (!cm->all_lossless && seq_params->enable_restoration) {
+ decode_restoration_mode(cm, rb);
+ }
+
+ cm->tx_mode = read_tx_mode(cm, rb);
+ cm->reference_mode = read_frame_reference_mode(cm, rb);
+ if (cm->reference_mode != SINGLE_REFERENCE) setup_compound_reference_mode(cm);
+
+ av1_setup_skip_mode_allowed(cm);
+ cm->skip_mode_flag = cm->is_skip_mode_allowed ? aom_rb_read_bit(rb) : 0;
+
+ if (frame_might_allow_warped_motion(cm))
+ cm->allow_warped_motion = aom_rb_read_bit(rb);
+ else
+ cm->allow_warped_motion = 0;
+
+ cm->reduced_tx_set_used = aom_rb_read_bit(rb);
+
+ if (cm->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Frame wrongly requests reference frame MVs");
+ }
+
+ if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
+
+ cm->cur_frame->film_grain_params_present =
+ seq_params->film_grain_params_present;
+ read_film_grain(cm, rb);
+
+#if EXT_TILE_DEBUG
+ if (pbi->ext_tile_debug && cm->large_scale_tile) {
+ read_ext_tile_info(pbi, rb);
+ av1_set_single_tile_decoding_mode(cm);
+ }
+#endif // EXT_TILE_DEBUG
+ return 0;
+}
+
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
+ AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+ const uint8_t *data_end) {
+ rb->bit_offset = 0;
+ rb->error_handler = error_handler;
+ rb->error_handler_data = &pbi->common;
+ rb->bit_buffer = data;
+ rb->bit_buffer_end = data_end;
+ return rb;
+}
+
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
+ int num_bits_height, int *width, int *height) {
+ *width = aom_rb_read_literal(rb, num_bits_width) + 1;
+ *height = aom_rb_read_literal(rb, num_bits_height) + 1;
+}
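+
+/* Annotation (not part of the upstream change): frame dimensions are coded
+ * minus one, so with num_bits_width = 16 a coded value of 1919 decodes to
+ * *width = 1920. */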
+
+BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) {
+ int profile = aom_rb_read_literal(rb, PROFILE_BITS);
+ return (BITSTREAM_PROFILE)profile;
+}
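+
+/* Annotation (not part of the upstream change): a minimal usage sketch of the
+ * bit-reader helpers above, assuming the caller has already isolated a
+ * frame-header payload in [data, data_end). Kept under #if 0 so it is never
+ * compiled; example_peek_profile is a hypothetical name. */
+#if 0
+static BITSTREAM_PROFILE example_peek_profile(AV1Decoder *pbi,
+                                              const uint8_t *data,
+                                              const uint8_t *data_end) {
+  struct aom_read_bit_buffer rb;
+  av1_init_read_bit_buffer(pbi, &rb, data, data_end);
+  return av1_read_profile(&rb);  // Consumes the first PROFILE_BITS bits.
+}
+#endif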
+
+void superres_post_decode(AV1Decoder *pbi) {
+ AV1_COMMON *const cm = &pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+
+ if (!av1_superres_scaled(cm)) return;
+ assert(!cm->all_lossless);
+
+ lock_buffer_pool(pool);
+ av1_superres_upscale(cm, pool);
+ unlock_buffer_pool(pool);
+}
+
+uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t **p_data_end,
+ int trailing_bits_present) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &pbi->mb;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_move_frame_idx_r();
+#endif
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ cm->global_motion[i] = default_warp_params;
+ cm->cur_frame->global_motion[i] = default_warp_params;
+ }
+ xd->global_motion = cm->global_motion;
+
+ read_uncompressed_header(pbi, rb);
+
+ if (trailing_bits_present) av1_check_trailing_bits(pbi, rb);
+
+  // If cm->single_tile_decoding is 0, independent decoding of a single tile
+  // or of a section of a frame is not allowed.
+ if (!cm->single_tile_decoding &&
+ (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) {
+ pbi->dec_tile_row = -1;
+ pbi->dec_tile_col = -1;
+ }
+
+ const uint32_t uncomp_hdr_size =
+ (uint32_t)aom_rb_bytes_read(rb); // Size of the uncompressed header
+ YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm);
+ xd->cur_buf = new_fb;
+ if (av1_allow_intrabc(cm)) {
+ av1_setup_scale_factors_for_frame(
+ &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height,
+ xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height);
+ }
+
+ if (cm->show_existing_frame) {
+ // showing a frame directly
+ *p_data_end = data + uncomp_hdr_size;
+ if (cm->reset_decoder_state) {
+ // Use the default frame context values.
+ *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
+ if (!cm->fc->initialized)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+ }
+ return uncomp_hdr_size;
+ }
+
+ cm->setup_mi(cm);
+
+ cm->current_frame_seg_map = cm->cur_frame->seg_map;
+
+ av1_setup_motion_field(cm);
+
+ av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y, num_planes);
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+ // use the default frame context values
+ *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
+ } else {
+ *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx];
+ }
+ if (!cm->fc->initialized)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+
+ xd->corrupted = 0;
+ return uncomp_hdr_size;
+}
+
+// Once-per-frame initialization
+static void setup_frame_info(AV1Decoder *pbi) {
+ AV1_COMMON *const cm = &pbi->common;
+
+ if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+ av1_alloc_restoration_buffers(cm);
+ }
+ const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+ const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
+ if (pbi->td.mc_buf_size != buf_size) {
+ av1_free_mc_tmp_buf(&pbi->td);
+ allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd);
+ }
+}
+
+void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end, int start_tile,
+ int end_tile, int initialize_flag) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ const int tile_count_tg = end_tile - start_tile + 1;
+
+ if (initialize_flag) setup_frame_info(pbi);
+ const int num_planes = av1_num_planes(cm);
+#if LOOP_FILTER_BITMASK
+ av1_loop_filter_frame_init(cm, 0, num_planes);
+ av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
+#endif
+
+ if (pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug) &&
+ pbi->row_mt)
+ *p_data_end =
+ decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile);
+ else if (pbi->max_threads > 1 && tile_count_tg > 1 &&
+ !(cm->large_scale_tile && !pbi->ext_tile_debug))
+ *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
+ else
+ *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
+
+ // If the bit stream is monochrome, set the U and V buffers to a constant.
+ if (num_planes < 3) {
+ set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
+ }
+
+ if (end_tile != cm->tile_rows * cm->tile_cols - 1) {
+ return;
+ }
+
+ if (!cm->allow_intrabc && !cm->single_tile_decoding) {
+ if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
+#if LOOP_FILTER_BITMASK
+ av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 1, 0,
+ num_planes, 0);
+#else
+ if (pbi->num_workers > 1) {
+ av1_loop_filter_frame_mt(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+ num_planes, 0, pbi->tile_workers,
+ pbi->num_workers, &pbi->lf_row_sync);
+ } else {
+ av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+ num_planes, 0);
+ }
+#endif
+ }
+
+ const int do_loop_restoration =
+ cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
+ const int do_cdef =
+ !cm->skip_loop_filter && !cm->coded_lossless &&
+ (cm->cdef_bits || cm->cdef_strengths[0] || cm->cdef_uv_strengths[0]);
+ const int do_superres = av1_superres_scaled(cm);
+ const int optimized_loop_restoration = !do_cdef && !do_superres;
+
+ if (!optimized_loop_restoration) {
+ if (do_loop_restoration)
+ av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 0);
+
+ if (do_cdef) av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
+
+ superres_post_decode(pbi);
+
+ if (do_loop_restoration) {
+ av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 1);
+ if (pbi->num_workers > 1) {
+ av1_loop_restoration_filter_frame_mt(
+ (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
+ pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
+ &pbi->lr_ctxt);
+ } else {
+ av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
+ cm, optimized_loop_restoration,
+ &pbi->lr_ctxt);
+ }
+ }
+ } else {
+      // No-CDEF, no-superres case: provide an optimized version of the loop
+      // restoration filter.
+ if (do_loop_restoration) {
+ if (pbi->num_workers > 1) {
+ av1_loop_restoration_filter_frame_mt(
+ (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration,
+ pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync,
+ &pbi->lr_ctxt);
+ } else {
+ av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
+ cm, optimized_loop_restoration,
+ &pbi->lr_ctxt);
+ }
+ }
+ }
+ }
+
+ if (!xd->corrupted) {
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ assert(cm->context_update_tile_id < pbi->allocated_tiles);
+ *cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx;
+ av1_reset_cdf_symbol_counters(cm->fc);
+ }
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data is corrupted.");
+ }
+
+#if CONFIG_INSPECTION
+ if (pbi->inspect_cb != NULL) {
+ (*pbi->inspect_cb)(pbi, pbi->inspect_ctx);
+ }
+#endif
+
+  // Update the frame context here (non-frame-parallel path).
+ if (!cm->large_scale_tile) {
+ cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
+ }
+}
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
new file mode 100644
index 000000000..ddad273f1
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODEFRAME_H_
+#define AOM_AV1_DECODER_DECODEFRAME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Decoder;
+struct aom_read_bit_buffer;
+struct ThreadData;
+
+// Reads the middle part of the sequence header OBU (from
+// frame_width_bits_minus_1 to enable_restoration) into seq_params.
+// Reports errors by calling rb->error_handler() or aom_internal_error().
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params);
+
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
+ int num_bits_height, int *width, int *height);
+BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb);
+
+// Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on
+// failure.
+int av1_check_trailing_bits(struct AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb);
+
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
+// TODO(wtc): Figure out and document the p_data_end parameter.
+uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t **p_data_end,
+ int trailing_bits_present);
+
+void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end, int startTile,
+ int endTile, int initialize_flag);
+
+// Implements the color_config() function in the spec. Reports errors by
+// calling rb->error_handler() or aom_internal_error().
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+ int allow_lowbitdepth, SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info);
+
+// Implements the timing_info() function in the spec. Reports errors by calling
+// rb->error_handler().
+void av1_read_timing_info_header(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb);
+
+// Implements the decoder_model_info() function in the spec. Reports errors by
+// calling rb->error_handler().
+void av1_read_decoder_model_info(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb);
+
+// Implements the operating_parameters_info() function in the spec. Reports
+// errors by calling rb->error_handler() or aom_internal_error().
+void av1_read_op_parameters_info(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb, int op_num);
+
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
+ struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+ const uint8_t *data_end);
+
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
+
+void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DECODEFRAME_H_
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
new file mode 100644
index 000000000..551e4d543
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -0,0 +1,1560 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define ACCT_STR __func__
+
+#define DEC_MISMATCH_DEBUG 0
+
+static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
+ return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR);
+}
+
+static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd,
+ int mi_col, int mi_row) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ if (cm->coded_lossless) return;
+ if (cm->allow_intrabc) {
+ assert(cm->cdef_bits == 0);
+ return;
+ }
+
+ if (!(mi_col & (cm->seq_params.mib_size - 1)) &&
+ !(mi_row & (cm->seq_params.mib_size - 1))) { // Top left?
+ xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] =
+ xd->cdef_preset[3] = -1;
+ }
+ // Read CDEF param at the first non-skip coding block
+ const int mask = (1 << (6 - MI_SIZE_LOG2));
+ const int m = ~(mask - 1);
+ const int index = cm->seq_params.sb_size == BLOCK_128X128
+ ? !!(mi_col & mask) + 2 * !!(mi_row & mask)
+ : 0;
+ cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)]
+ ->cdef_strength = xd->cdef_preset[index] =
+ xd->cdef_preset[index] == -1 && !mbmi->skip
+ ? aom_read_literal(r, cm->cdef_bits, ACCT_STR)
+ : xd->cdef_preset[index];
+}
+
+static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
+ aom_reader *r, MB_MODE_INFO *const mbmi,
+ int mi_col, int mi_row) {
+ int sign, abs, reduced_delta_qindex = 0;
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ const int b_col = mi_col & (cm->seq_params.mib_size - 1);
+ const int b_row = mi_row & (cm->seq_params.mib_size - 1);
+ const int read_delta_q_flag = (b_col == 0 && b_row == 0);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
+ read_delta_q_flag) {
+ abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
+ const int smallval = (abs < DELTA_Q_SMALL);
+
+ if (!smallval) {
+ const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
+ const int thr = (1 << rem_bits) + 1;
+ abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
+ }
+
+ if (abs) {
+ sign = aom_read_bit(r, ACCT_STR);
+ } else {
+ sign = 1;
+ }
+
+ reduced_delta_qindex = sign ? -abs : abs;
+ }
+ return reduced_delta_qindex;
+}
+static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r,
+ aom_cdf_prob *const cdf,
+ const MB_MODE_INFO *const mbmi, int mi_col,
+ int mi_row) {
+ int reduced_delta_lflevel = 0;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int b_col = mi_col & (cm->seq_params.mib_size - 1);
+ const int b_row = mi_row & (cm->seq_params.mib_size - 1);
+ const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
+
+ if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) &&
+ read_delta_lf_flag) {
+ int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
+ const int smallval = (abs < DELTA_LF_SMALL);
+ if (!smallval) {
+ const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
+ const int thr = (1 << rem_bits) + 1;
+ abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
+ }
+ const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1;
+ reduced_delta_lflevel = sign ? -abs : abs;
+ }
+ return reduced_delta_lflevel;
+}
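+
+/* Worked example (annotation, not upstream code) for the delta-q / delta-lf
+ * escape coding above: when the symbol reaches the escape value, a 3-bit
+ * field of 1 gives rem_bits = 2 and thr = 5, so a 2-bit literal of 2 decodes
+ * abs = 7; with sign == 1 the result is -7, which the caller scales by
+ * cm->delta_q_res (or cm->delta_lf_res). */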
+
+static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx,
+ aom_reader *r,
+ CFL_ALLOWED_TYPE cfl_allowed,
+ PREDICTION_MODE y_mode) {
+ const UV_PREDICTION_MODE uv_mode =
+ aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode],
+ UV_INTRA_MODES - !cfl_allowed, ACCT_STR);
+ return uv_mode;
+}
+
+static int read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r,
+ int *signs_out) {
+ const int joint_sign =
+ aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs");
+ int idx = 0;
+ // Magnitudes are only coded for nonzero values
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ idx = aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u")
+ << CFL_ALPHABET_SIZE_LOG2;
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ idx += aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v");
+ }
+ *signs_out = joint_sign;
+ return idx;
+}
+
+static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r,
+ int size_group) {
+ const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol(
+ r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES,
+ ACCT_STR);
+ return ii_mode;
+}
+
+static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r,
+ int16_t ctx) {
+ int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
+ int is_newmv, is_zeromv, is_refmv;
+ is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0;
+ if (is_newmv) return NEWMV;
+
+ mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ is_zeromv =
+ aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0;
+ if (is_zeromv) return GLOBALMV;
+
+ mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0;
+ if (is_refmv)
+ return NEARESTMV;
+ else
+ return NEARMV;
+}
+
+static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, aom_reader *r) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ mbmi->ref_mv_idx = 0;
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+ for (int idx = 0; idx < 2; ++idx) {
+ if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+ int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
+ mbmi->ref_mv_idx = idx + drl_idx;
+ if (!drl_idx) return;
+ }
+ }
+ }
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ // Offset the NEARESTMV mode.
+ // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
+ // mode is factored in.
+ for (int idx = 1; idx < 3; ++idx) {
+ if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+ int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
+ mbmi->ref_mv_idx = idx + drl_idx - 1;
+ if (!drl_idx) return;
+ }
+ }
+ }
+}
+
+static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, aom_reader *r) {
+ if (cm->switchable_motion_mode == 0) return SIMPLE_TRANSLATION;
+ if (mbmi->skip_mode) return SIMPLE_TRANSLATION;
+
+ const MOTION_MODE last_motion_mode_allowed =
+ motion_mode_allowed(xd->global_motion, xd, mbmi, cm->allow_warped_motion);
+ int motion_mode;
+
+ if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION;
+
+ if (last_motion_mode_allowed == OBMC_CAUSAL) {
+ motion_mode =
+ aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR);
+ return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
+ } else {
+ motion_mode =
+ aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
+ MOTION_MODES, ACCT_STR);
+ return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
+ }
+}
+
+static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r,
+ int16_t ctx) {
+ const int mode =
+ aom_read_symbol(r, xd->tile_ctx->inter_compound_mode_cdf[ctx],
+ INTER_COMPOUND_MODES, ACCT_STR);
+ assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode));
+ return NEAREST_NEARESTMV + mode;
+}
+
+int av1_neg_deinterleave(int diff, int ref, int max) {
+ if (!ref) return diff;
+ if (ref >= (max - 1)) return max - diff - 1;
+ if (2 * ref < max) {
+ if (diff <= 2 * ref) {
+ if (diff & 1)
+ return ref + ((diff + 1) >> 1);
+ else
+ return ref - (diff >> 1);
+ }
+ return diff;
+ } else {
+ if (diff <= 2 * (max - ref - 1)) {
+ if (diff & 1)
+ return ref + ((diff + 1) >> 1);
+ else
+ return ref - (diff >> 1);
+ }
+ return max - (diff + 1);
+ }
+}
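+
+/* Annotation (not upstream code): av1_neg_deinterleave() maps a coded index
+ * back to a segment id near the predictor `ref`, inverting the encoder-side
+ * interleaving that gives small codes to values close to the prediction.
+ * For example, with ref = 3 and max = 8 the codes 0, 1, 2, 3, 4 decode to
+ * 3, 4, 2, 5, 1. */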
+
+static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+ int mi_row, int mi_col, aom_reader *r, int skip) {
+ int cdf_num;
+ const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num);
+ if (skip) return pred;
+
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
+ const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR);
+ const int segment_id =
+ av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1);
+
+ if (segment_id < 0 || segment_id > seg->last_active_segid) {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Corrupted segment_ids");
+ }
+ return segment_id;
+}
+
+static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
+ int mi_offset, int x_mis, int y_mis) {
+ int segment_id = INT_MAX;
+
+ for (int y = 0; y < y_mis; y++)
+ for (int x = 0; x < x_mis; x++)
+ segment_id =
+ AOMMIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]);
+
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+ return segment_id;
+}
+
+static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis,
+ int segment_id) {
+ assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+
+ for (int y = 0; y < y_mis; y++)
+ for (int x = 0; x < x_mis; x++)
+ cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+}
+
+static int read_intra_segment_id(AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, int mi_row,
+ int mi_col, int bsize, aom_reader *r,
+ int skip) {
+ struct segmentation *const seg = &cm->seg;
+ if (!seg->enabled) return 0; // Default for disabled segmentation
+
+ assert(seg->update_map && !seg->temporal_update);
+
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
+ const int segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, skip);
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ return segment_id;
+}
+
+static void copy_segment_id(const AV1_COMMON *cm,
+ const uint8_t *last_segment_ids,
+ uint8_t *current_segment_ids, int mi_offset,
+ int x_mis, int y_mis) {
+ for (int y = 0; y < y_mis; y++)
+ for (int x = 0; x < x_mis; x++)
+ current_segment_ids[mi_offset + y * cm->mi_cols + x] =
+ last_segment_ids ? last_segment_ids[mi_offset + y * cm->mi_cols + x]
+ : 0;
+}
+
+static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset,
+ int x_mis, int y_mis) {
+ return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map,
+ mi_offset, x_mis, y_mis)
+ : 0;
+}
+
+static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, int preskip,
+ aom_reader *r) {
+ struct segmentation *const seg = &cm->seg;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[mbmi->sb_type];
+ const int bh = mi_size_high[mbmi->sb_type];
+
+ // TODO(slavarnway): move x_mis, y_mis into xd ?????
+ const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
+
+ if (!seg->enabled) return 0; // Default for disabled segmentation
+
+ if (!seg->update_map) {
+ copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+ mi_offset, x_mis, y_mis);
+ return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
+ }
+
+ int segment_id;
+ if (preskip) {
+ if (!seg->segid_preskip) return 0;
+ } else {
+ if (seg->segid_preskip) return mbmi->segment_id;
+ if (mbmi->skip) {
+ if (seg->temporal_update) {
+ mbmi->seg_id_predicted = 0;
+ }
+ segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 1);
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ return segment_id;
+ }
+ }
+
+ if (seg->temporal_update) {
+ const int ctx = av1_get_pred_context_seg_id(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx];
+ mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR);
+ if (mbmi->seg_id_predicted) {
+ segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
+ } else {
+ segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0);
+ }
+ } else {
+ segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0);
+ }
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ return segment_id;
+}
+
+static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+ aom_reader *r) {
+ if (!cm->skip_mode_flag) return 0;
+
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 0;
+ }
+
+ if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return 0;
+
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ // These features imply single-reference mode, while skip mode implies
+ // compound reference. Hence, the two are mutually exclusive.
+ // In other words, skip_mode is implicitly 0 here.
+ return 0;
+ }
+
+ const int ctx = av1_get_skip_mode_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const int skip_mode =
+ aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR);
+ return skip_mode;
+}
+
+static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+ aom_reader *r) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int ctx = av1_get_skip_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR);
+ return skip;
+ }
+}
+
+// Merge the sorted list of cached colors (cached_colors[0...n_cached_colors-1])
+// and the sorted list of transmitted colors (colors[n_cached_colors...n-1])
+// into a single sorted list (colors[...]).
+static void merge_colors(uint16_t *colors, uint16_t *cached_colors,
+ int n_colors, int n_cached_colors) {
+ if (n_cached_colors == 0) return;
+ int cache_idx = 0, trans_idx = n_cached_colors;
+ for (int i = 0; i < n_colors; ++i) {
+ if (cache_idx < n_cached_colors &&
+ (trans_idx >= n_colors ||
+ cached_colors[cache_idx] <= colors[trans_idx])) {
+ colors[i] = cached_colors[cache_idx++];
+ } else {
+ assert(trans_idx < n_colors);
+ colors[i] = colors[trans_idx++];
+ }
+ }
+}
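+
+/* Worked example (annotation, not upstream code): with n_colors = 4,
+ * n_cached_colors = 2, cached_colors = { 20, 30 } and the transmitted tail
+ * colors[2..3] = { 10, 40 }, merge_colors() produces
+ * colors = { 10, 20, 30, 40 }. */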
+
+static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth,
+ PALETTE_MODE_INFO *const pmi, aom_reader *r) {
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ uint16_t cached_colors[PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ const int n = pmi->palette_size[0];
+ int idx = 0;
+ for (int i = 0; i < n_cache && idx < n; ++i)
+ if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
+ if (idx < n) {
+ const int n_cached_colors = idx;
+ pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
+ if (idx < n) {
+ const int min_bits = bit_depth - 3;
+ int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
+ int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1;
+ for (; idx < n; ++idx) {
+ assert(range >= 0);
+ const int delta = aom_read_literal(r, bits, ACCT_STR) + 1;
+ pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
+ 0, (1 << bit_depth) - 1);
+ range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
+ bits = AOMMIN(bits, av1_ceil_log2(range));
+ }
+ }
+ merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors);
+ } else {
+ memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0]));
+ }
+}
+
+static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth,
+ PALETTE_MODE_INFO *const pmi,
+ aom_reader *r) {
+ const int n = pmi->palette_size[1];
+ // U channel colors.
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ uint16_t cached_colors[PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ int idx = 0;
+ for (int i = 0; i < n_cache && idx < n; ++i)
+ if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
+ if (idx < n) {
+ const int n_cached_colors = idx;
+ idx += PALETTE_MAX_SIZE;
+ pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
+ if (idx < PALETTE_MAX_SIZE + n) {
+ const int min_bits = bit_depth - 3;
+ int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
+ int range = (1 << bit_depth) - pmi->palette_colors[idx - 1];
+ for (; idx < PALETTE_MAX_SIZE + n; ++idx) {
+ assert(range >= 0);
+ const int delta = aom_read_literal(r, bits, ACCT_STR);
+ pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
+ 0, (1 << bit_depth) - 1);
+ range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
+ bits = AOMMIN(bits, av1_ceil_log2(range));
+ }
+ }
+ merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n,
+ n_cached_colors);
+ } else {
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors,
+ n * sizeof(cached_colors[0]));
+ }
+
+ // V channel colors.
+ if (aom_read_bit(r, ACCT_STR)) { // Delta encoding.
+ const int min_bits_v = bit_depth - 4;
+ const int max_val = 1 << bit_depth;
+ int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR);
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE] =
+ aom_read_literal(r, bit_depth, ACCT_STR);
+ for (int i = 1; i < n; ++i) {
+ int delta = aom_read_literal(r, bits, ACCT_STR);
+ if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta;
+ int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta;
+ if (val < 0) val += max_val;
+ if (val >= max_val) val -= max_val;
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val;
+ }
+ } else {
+ for (int i = 0; i < n; ++i) {
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
+ aom_read_literal(r, bit_depth, ACCT_STR);
+ }
+ }
+}
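+
+/* Annotation (not upstream code): the V-channel deltas above wrap around the
+ * bit-depth range, e.g. with bit_depth = 8 (max_val = 256) a previous color
+ * of 250 plus a decoded delta of +10 wraps to 4. */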
+
+static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, aom_reader *r) {
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ assert(av1_allow_palette(cm->allow_screen_content_tools, bsize));
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+ if (mbmi->mode == DC_PRED) {
+ const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
+ const int modev = aom_read_symbol(
+ r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2,
+ ACCT_STR);
+ if (modev) {
+ pmi->palette_size[0] =
+ aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
+ PALETTE_SIZES, ACCT_STR) +
+ 2;
+ read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r);
+ }
+ }
+ if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y)) {
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+ const int modev = aom_read_symbol(
+ r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR);
+ if (modev) {
+ pmi->palette_size[1] =
+ aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
+ PALETTE_SIZES, ACCT_STR) +
+ 2;
+ read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r);
+ }
+ }
+}
+
+static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) {
+ const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR);
+ return sym - MAX_ANGLE_DELTA;
+}
+
+static void read_filter_intra_mode_info(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, aom_reader *r) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
+ &mbmi->filter_intra_mode_info;
+
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ filter_intra_mode_info->use_filter_intra = aom_read_symbol(
+ r, xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2, ACCT_STR);
+ if (filter_intra_mode_info->use_filter_intra) {
+ filter_intra_mode_info->filter_intra_mode = aom_read_symbol(
+ r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR);
+ }
+ } else {
+ filter_intra_mode_info->use_filter_intra = 0;
+ }
+}
+
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
+ int blk_col, TX_SIZE tx_size, aom_reader *r) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+ TX_TYPE *tx_type = &mbmi->txk_type[txk_type_idx];
+ *tx_type = DCT_DCT;
+
+ // No need to read transform type if block is skipped.
+ if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ return;
+
+  // No need to read transform type for lossless mode (qindex == 0).
+ const int qindex =
+ cm->seg.enabled ? xd->qindex[mbmi->segment_id] : cm->base_qindex;
+ if (qindex <= 0) return;
+
+ const int inter_block = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) > 1) {
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used);
+ const int eset =
+ get_ext_tx_set(tx_size, inter_block, cm->reduced_tx_set_used);
+ // eset == 0 should correspond to a set with only DCT_DCT and
+ // there is no need to read the tx_type
+ assert(eset != 0);
+
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ if (inter_block) {
+ *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
+ r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+ av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
+ } else {
+ const PREDICTION_MODE intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra
+ ? fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode]
+ : mbmi->mode;
+ *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol(
+ r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode],
+ av1_num_ext_tx_set[tx_set_type], ACCT_STR)];
+ }
+ }
+}
+
+static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
+ nmv_context *ctx, MvSubpelPrecision precision);
+
+static INLINE int is_mv_valid(const MV *mv);
+
+static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv,
+ const int_mv *ref_mv, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, aom_reader *r) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE);
+ // DV should not have sub-pel.
+ assert((mv->as_mv.col & 7) == 0);
+ assert((mv->as_mv.row & 7) == 0);
+ mv->as_mv.col = (mv->as_mv.col >> 3) * 8;
+ mv->as_mv.row = (mv->as_mv.row >> 3) * 8;
+ int valid = is_mv_valid(&mv->as_mv) &&
+ av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize,
+ cm->seq_params.mib_size_log2);
+ return valid;
+}
+
+static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, aom_reader *r) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
+ if (mbmi->use_intrabc) {
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+ int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
+ int_mv global_mvs[REF_FRAMES];
+
+ av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count,
+ xd->ref_mv_stack, ref_mvs, global_mvs, mi_row, mi_col,
+ inter_mode_ctx);
+
+ int_mv nearestmv, nearmv;
+
+ av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0);
+ int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
+ if (dv_ref.as_int == 0)
+ av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, mi_row,
+ mi_col);
+ // Ref DV should not have sub-pel.
+ int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
+ dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
+ dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8;
+ valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, mi_row,
+ mi_col, bsize, r);
+ if (!valid_dv) {
+      // The intra BC motion vector is not valid - signal a corrupt frame.
+ aom_merge_corrupted_flag(&xd->corrupted, 1);
+ }
+ }
+}
+
+// If delta_q is present, reads the delta_q index.
+// Also reads the delta loop filter levels, if present.
+static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ const int mi_row, const int mi_col,
+ aom_reader *r) {
+ if (cm->delta_q_present_flag) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ xd->current_qindex +=
+ read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
+ /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
+ xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ if (cm->delta_lf_present_flag) {
+ if (cm->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ const int tmp_lvl =
+ xd->delta_lf[lf_id] +
+ read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi,
+ mi_col, mi_row) *
+ cm->delta_lf_res;
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] =
+ clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ } else {
+ const int tmp_lvl = xd->delta_lf_from_base +
+ read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf,
+ mbmi, mi_col, mi_row) *
+ cm->delta_lf_res;
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base =
+ clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ }
+ }
+}
+
+static void read_intra_frame_mode_info(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, aom_reader *r) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ struct segmentation *const seg = &cm->seg;
+
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (seg->segid_preskip)
+ mbmi->segment_id =
+ read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, 0);
+
+ mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+
+ if (!seg->segid_preskip)
+ mbmi->segment_id =
+ read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, mbmi->skip);
+
+ read_cdef(cm, r, xd, mi_col, mi_row);
+
+ read_delta_q_params(cm, xd, mi_row, mi_col, r);
+
+ mbmi->current_qindex = xd->current_qindex;
+
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ if (av1_allow_intrabc(cm)) {
+ read_intrabc_info(cm, xd, mi_row, mi_col, r);
+ if (is_intrabc_block(mbmi)) return;
+ }
+
+ mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi));
+
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ (use_angle_delta && av1_is_directional_mode(mbmi->mode))
+ ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
+ : 0;
+
+ if (!cm->seq_params.monochrome &&
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y)) {
+ xd->cfl.is_chroma_reference = 1;
+ mbmi->uv_mode =
+ read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
+ if (mbmi->uv_mode == UV_CFL_PRED) {
+ mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] =
+ (use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode)))
+ ? read_angle_delta(r,
+ ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+ : 0;
+ } else {
+    // Avoid decoding angle_info if there is no chroma prediction.
+ mbmi->uv_mode = UV_DC_PRED;
+ xd->cfl.is_chroma_reference = 0;
+ }
+ xd->cfl.store_y = store_cfl_required(cm, xd);
+
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
+ read_palette_mode_info(cm, xd, mi_row, mi_col, r);
+
+ read_filter_intra_mode_info(cm, xd, r);
+}
+
+static int read_mv_component(aom_reader *r, nmv_component *mvcomp,
+ int use_subpel, int usehp) {
+ int mag, d, fr, hp;
+ const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR);
+ const int mv_class =
+ aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR);
+ const int class0 = mv_class == MV_CLASS_0;
+
+ // Integer part
+ if (class0) {
+ d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR);
+ mag = 0;
+ } else {
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ d = 0;
+ for (int i = 0; i < n; ++i)
+ d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i;
+ mag = CLASS0_SIZE << (mv_class + 2);
+ }
+
+ if (use_subpel) {
+ // Fractional part
+ fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+ MV_FP_SIZE, ACCT_STR);
+
+    // High precision part (if hp is not used, it defaults to 1).
+ hp = usehp ? aom_read_symbol(
+ r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2,
+ ACCT_STR)
+ : 1;
+ } else {
+ fr = 3;
+ hp = 1;
+ }
+
+ // Result
+ mag += ((d << 3) | (fr << 1) | hp) + 1;
+ return sign ? -mag : mag;
+}
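+
+/* Worked example (annotation, not upstream code): for MV_CLASS_0 with d = 1,
+ * fr = 2 and hp = 1 the magnitude is ((1 << 3) | (2 << 1) | 1) + 1 = 14 in
+ * 1/8-pel units (1.75 pels), negated when sign is set. */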
+
+static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
+ nmv_context *ctx, MvSubpelPrecision precision) {
+ MV diff = kZeroMv;
+ const MV_JOINT_TYPE joint_type =
+ (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR);
+
+ if (mv_joint_vertical(joint_type))
+ diff.row = read_mv_component(r, &ctx->comps[0], precision > MV_SUBPEL_NONE,
+ precision > MV_SUBPEL_LOW_PRECISION);
+
+ if (mv_joint_horizontal(joint_type))
+ diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE,
+ precision > MV_SUBPEL_LOW_PRECISION);
+
+ mv->row = ref->row + diff.row;
+ mv->col = ref->col + diff.col;
+}
+
+static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm,
+ const MACROBLOCKD *xd,
+ aom_reader *r) {
+ if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return SINGLE_REFERENCE;
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ const int ctx = av1_get_reference_mode_context(xd);
+ const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol(
+ r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR);
+ return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE
+ } else {
+ assert(cm->reference_mode == SINGLE_REFERENCE);
+ return cm->reference_mode;
+ }
+}
+
+#define READ_REF_BIT(pname) \
+ aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
+
+static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd,
+ aom_reader *r) {
+ const int ctx = av1_get_comp_reference_type_context(xd);
+ const COMP_REFERENCE_TYPE comp_ref_type =
+ (COMP_REFERENCE_TYPE)aom_read_symbol(
+ r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR);
+ return comp_ref_type; // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE
+}
+
+static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm,
+ MV_REFERENCE_FRAME ref_frame[2]) {
+ ref_frame[0] = LAST_FRAME + cm->ref_frame_idx_0;
+ ref_frame[1] = LAST_FRAME + cm->ref_frame_idx_1;
+}
+
+// Read the reference frames.
+static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *r, int segment_id,
+ MV_REFERENCE_FRAME ref_frame[2]) {
+ if (xd->mi[0]->skip_mode) {
+ set_ref_frames_for_skip_mode(cm, ref_frame);
+ return;
+ }
+
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+ SEG_LVL_REF_FRAME);
+ ref_frame[1] = NONE_FRAME;
+ } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ ref_frame[0] = LAST_FRAME;
+ ref_frame[1] = NONE_FRAME;
+ } else {
+ const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
+
+ if (mode == COMPOUND_REFERENCE) {
+ const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r);
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = READ_REF_BIT(uni_comp_ref_p);
+ if (bit) {
+ ref_frame[0] = BWDREF_FRAME;
+ ref_frame[1] = ALTREF_FRAME;
+ } else {
+ const int bit1 = READ_REF_BIT(uni_comp_ref_p1);
+ if (bit1) {
+ const int bit2 = READ_REF_BIT(uni_comp_ref_p2);
+ if (bit2) {
+ ref_frame[0] = LAST_FRAME;
+ ref_frame[1] = GOLDEN_FRAME;
+ } else {
+ ref_frame[0] = LAST_FRAME;
+ ref_frame[1] = LAST3_FRAME;
+ }
+ } else {
+ ref_frame[0] = LAST_FRAME;
+ ref_frame[1] = LAST2_FRAME;
+ }
+ }
+
+ return;
+ }
+
+ assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+
+ const int idx = 1;
+ const int bit = READ_REF_BIT(comp_ref_p);
+ // Decode forward references.
+ if (!bit) {
+ const int bit1 = READ_REF_BIT(comp_ref_p1);
+ ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 1 : 0];
+ } else {
+ const int bit2 = READ_REF_BIT(comp_ref_p2);
+ ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2];
+ }
+
+ // Decode backward references.
+ const int bit_bwd = READ_REF_BIT(comp_bwdref_p);
+ if (!bit_bwd) {
+ const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1);
+ ref_frame[idx] = cm->comp_bwd_ref[bit1_bwd];
+ } else {
+ ref_frame[idx] = cm->comp_bwd_ref[2];
+ }
+ } else if (mode == SINGLE_REFERENCE) {
+ const int bit0 = READ_REF_BIT(single_ref_p1);
+ if (bit0) {
+ const int bit1 = READ_REF_BIT(single_ref_p2);
+ if (!bit1) {
+ const int bit5 = READ_REF_BIT(single_ref_p6);
+ ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME;
+ } else {
+ ref_frame[0] = ALTREF_FRAME;
+ }
+ } else {
+ const int bit2 = READ_REF_BIT(single_ref_p3);
+ if (bit2) {
+ const int bit4 = READ_REF_BIT(single_ref_p5);
+ ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
+ } else {
+ const int bit3 = READ_REF_BIT(single_ref_p4);
+ ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
+ }
+ }
+
+ ref_frame[1] = NONE_FRAME;
+ } else {
+ assert(0 && "Invalid prediction mode.");
+ }
+ }
+}
+
+static INLINE void read_mb_interp_filter(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ MB_MODE_INFO *const mbmi,
+ aom_reader *r) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!av1_is_interp_needed(xd)) {
+ set_default_interp_filters(mbmi, cm->interp_filter);
+ return;
+ }
+
+ if (cm->interp_filter != SWITCHABLE) {
+ mbmi->interp_filters = av1_broadcast_interp_filter(cm->interp_filter);
+ } else {
+ InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+ for (int dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ ref0_filter[dir] = (InterpFilter)aom_read_symbol(
+ r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR);
+ if (cm->seq_params.enable_dual_filter == 0) {
+ ref0_filter[1] = ref0_filter[0];
+ break;
+ }
+ }
+ // The index system works as: (0, 1) -> (vertical, horizontal) filter types
+ mbmi->interp_filters =
+ av1_make_interp_filters(ref0_filter[0], ref0_filter[1]);
+ }
+}
+
+static void read_intra_block_mode_info(AV1_COMMON *const cm, const int mi_row,
+ const int mi_col, MACROBLOCKD *const xd,
+ MB_MODE_INFO *const mbmi,
+ aom_reader *r) {
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]);
+
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ use_angle_delta && av1_is_directional_mode(mbmi->mode)
+ ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
+ : 0;
+ const int has_chroma =
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+ xd->cfl.is_chroma_reference = has_chroma;
+ if (!cm->seq_params.monochrome && has_chroma) {
+ mbmi->uv_mode =
+ read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
+ if (mbmi->uv_mode == UV_CFL_PRED) {
+ mbmi->cfl_alpha_idx =
+ read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs);
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] =
+ use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode))
+ ? read_angle_delta(r,
+ ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+ : 0;
+ } else {
+    // Avoid decoding angle_info if there is no chroma prediction.
+ mbmi->uv_mode = UV_DC_PRED;
+ }
+ xd->cfl.store_y = store_cfl_required(cm, xd);
+
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
+ read_palette_mode_info(cm, xd, mi_row, mi_col, r);
+
+ read_filter_intra_mode_info(cm, xd, r);
+}
+
+static INLINE int is_mv_valid(const MV *mv) {
+ return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW &&
+ mv->col < MV_UPP;
+}
+
+static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
+ PREDICTION_MODE mode,
+ MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2],
+ int_mv ref_mv[2], int_mv nearest_mv[2],
+ int_mv near_mv[2], int mi_row, int mi_col,
+ int is_compound, int allow_hp, aom_reader *r) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ if (cm->cur_frame_force_integer_mv) {
+ allow_hp = MV_SUBPEL_NONE;
+ }
+ switch (mode) {
+ case NEWMV: {
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
+ break;
+ }
+ case NEARESTMV: {
+ mv[0].as_int = nearest_mv[0].as_int;
+ break;
+ }
+ case NEARMV: {
+ mv[0].as_int = near_mv[0].as_int;
+ break;
+ }
+ case GLOBALMV: {
+ mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+ cm->allow_high_precision_mv, bsize, mi_col,
+ mi_row, cm->cur_frame_force_integer_mv)
+ .as_int;
+ break;
+ }
+ case NEW_NEWMV: {
+ assert(is_compound);
+ for (int i = 0; i < 2; ++i) {
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp);
+ }
+ break;
+ }
+ case NEAREST_NEARESTMV: {
+ assert(is_compound);
+ mv[0].as_int = nearest_mv[0].as_int;
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEAR_NEARMV: {
+ assert(is_compound);
+ mv[0].as_int = near_mv[0].as_int;
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case NEW_NEARESTMV: {
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
+ assert(is_compound);
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEAREST_NEWMV: {
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ mv[0].as_int = nearest_mv[0].as_int;
+ read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
+ assert(is_compound);
+ break;
+ }
+ case NEAR_NEWMV: {
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ mv[0].as_int = near_mv[0].as_int;
+ read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
+ assert(is_compound);
+ break;
+ }
+ case NEW_NEARMV: {
+ nmv_context *const nmvc = &ec_ctx->nmvc;
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
+ assert(is_compound);
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case GLOBAL_GLOBALMV: {
+ assert(is_compound);
+ mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame[0]],
+ cm->allow_high_precision_mv, bsize, mi_col,
+ mi_row, cm->cur_frame_force_integer_mv)
+ .as_int;
+ mv[1].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame[1]],
+ cm->allow_high_precision_mv, bsize, mi_col,
+ mi_row, cm->cur_frame_force_integer_mv)
+ .as_int;
+ break;
+ }
+ default: { return 0; }
+ }
+
+ int ret = is_mv_valid(&mv[0].as_mv);
+ if (is_compound) {
+ ret = ret && is_mv_valid(&mv[1].as_mv);
+ }
+ return ret;
+}
+
+static int read_is_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int segment_id, aom_reader *r) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (frame < LAST_FRAME) return 0;
+ return frame != INTRA_FRAME;
+ }
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ return 1;
+ }
+ const int ctx = av1_get_intra_inter_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const int is_inter =
+ aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR);
+ return is_inter;
+}
+
+#if DEC_MISMATCH_DEBUG
+static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, int16_t mode_ctx) {
+ int_mv mv[2] = { { 0 } };
+ for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
+ mv[ref].as_mv = mbmi->mv[ref].as_mv;
+
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+ int16_t zeromv_ctx = -1;
+ int16_t refmv_ctx = -1;
+ if (mbmi->mode != NEWMV) {
+ zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ if (mbmi->mode != GLOBALMV)
+ refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ }
+
+#define FRAME_TO_CHECK 11
+ if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+ printf(
+ "=== DECODER ===: "
+ "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
+ "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
+ "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+ "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+ cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode,
+ mbmi->sb_type, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+ mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+ mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx,
+ refmv_ctx, mbmi->tx_size);
+ }
+}
+#endif // DEC_MISMATCH_DEBUG
+
+static void read_inter_block_mode_info(AV1Decoder *const pbi,
+ MACROBLOCKD *const xd,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, aom_reader *r) {
+ AV1_COMMON *const cm = &pbi->common;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ int_mv nearestmv[2], nearmv[2];
+ int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
+ int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+
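+  // Gather neighboring reference-frame counts, read the reference frames, and
+  // build the motion-vector candidate list for this block.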
+ av1_collect_neighbors_ref_counts(xd);
+
+ read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
+ const int is_compound = has_second_ref(mbmi);
+
+ MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+ int_mv global_mvs[REF_FRAMES];
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack,
+ ref_mvs, global_mvs, mi_row, mi_col, inter_mode_ctx);
+
+ int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
+ mbmi->ref_mv_idx = 0;
+
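+  // skip_mode implies compound NEAREST_NEARESTMV; segment features can force
+  // GLOBALMV; otherwise the mode (and DRL index, when used) is read here.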
+ if (mbmi->skip_mode) {
+ assert(is_compound);
+ mbmi->mode = NEAREST_NEARESTMV;
+ } else {
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) {
+ mbmi->mode = GLOBALMV;
+ } else {
+ if (is_compound)
+ mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx);
+ else
+ mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx);
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(mbmi->mode))
+ read_drl_idx(ec_ctx, xd, mbmi, r);
+ }
+ }
+
+ if (is_compound != is_inter_compound_mode(mbmi->mode)) {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Prediction mode %d invalid with ref frame %d %d",
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ }
+
+ if (!is_compound && mbmi->mode != GLOBALMV) {
+ av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0],
+ &nearmv[0], cm->cur_frame_force_integer_mv);
+ }
+
+ if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) {
+ int ref_mv_idx = mbmi->ref_mv_idx + 1;
+ nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv;
+ nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv;
+ nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
+ nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
+ lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
+ cm->cur_frame_force_integer_mv);
+ lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
+ cm->cur_frame_force_integer_mv);
+ lower_mv_precision(&nearmv[0].as_mv, allow_hp,
+ cm->cur_frame_force_integer_mv);
+ lower_mv_precision(&nearmv[1].as_mv, allow_hp,
+ cm->cur_frame_force_integer_mv);
+ } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) {
+ int_mv cur_mv =
+ xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
+ nearmv[0] = cur_mv;
+ }
+
+ int_mv ref_mv[2];
+ ref_mv[0] = nearestmv[0];
+ ref_mv[1] = nearestmv[1];
+
+ if (is_compound) {
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+ // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+ // mbmi->ref_mv_idx (like NEWMV)
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+ ref_mv_idx = 1 + mbmi->ref_mv_idx;
+
+ // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here?
+ if (compound_ref0_mode(mbmi->mode) == NEWMV)
+ ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
+
+ if (compound_ref1_mode(mbmi->mode) == NEWMV)
+ ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
+ } else {
+ if (mbmi->mode == NEWMV) {
+ if (xd->ref_mv_count[ref_frame] > 1)
+ ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv;
+ }
+ }
+
+ if (mbmi->skip_mode) {
+ assert(mbmi->mode == NEAREST_NEARESTMV);
+ mbmi->mv[0].as_int = nearestmv[0].as_int;
+ mbmi->mv[1].as_int = nearestmv[1].as_int;
+ } else {
+ int mv_corrupted_flag =
+ !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
+ nearestmv, nearmv, mi_row, mi_col, is_compound, allow_hp, r);
+ aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag);
+ }
+
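+  // Inter-intra compound: optionally combine the inter predictor with an
+  // intra predictor, with an optional wedge mask.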
+ mbmi->use_wedge_interintra = 0;
+ if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode &&
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ const int interintra =
+ aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR);
+ assert(mbmi->ref_frame[1] == NONE_FRAME);
+ if (interintra) {
+ const INTERINTRA_MODE interintra_mode =
+ read_interintra_mode(xd, r, bsize_group);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->interintra_mode = interintra_mode;
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ if (is_interintra_wedge_used(bsize)) {
+ mbmi->use_wedge_interintra = aom_read_symbol(
+ r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR);
+ if (mbmi->use_wedge_interintra) {
+ mbmi->interintra_wedge_index =
+ aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR);
+ mbmi->interintra_wedge_sign = 0;
+ }
+ }
+ }
+ }
+
+ for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ }
+
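+  // Motion mode: gather warp projection samples and overlappable-neighbor
+  // counts, then read the motion mode unless this is an inter-intra block.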
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
+ !has_second_ref(mbmi))
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+ mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
+
+  // Initialize compound prediction fields to their defaults (plain averaging).
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+
+ if (has_second_ref(mbmi) && !mbmi->skip_mode) {
+    // Read the index for the current compound inter prediction mode group.
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params.enable_masked_compound;
+
+ if (masked_compound_used) {
+ const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+ mbmi->comp_group_idx = aom_read_symbol(
+ r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ if (cm->seq_params.enable_jnt_comp) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ mbmi->compound_idx = aom_read_symbol(
+ r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
+ } else {
+ // Distance-weighted compound is disabled, so always use average
+ mbmi->compound_idx = 1;
+ }
+ } else {
+ assert(cm->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+ assert(masked_compound_used);
+
+      // Masked compound: choose between COMPOUND_WEDGE and COMPOUND_DIFFWTD.
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+ mbmi->interinter_comp.type =
+ 1 + aom_read_symbol(r, ec_ctx->compound_type_cdf[bsize],
+ COMPOUND_TYPES - 1, ACCT_STR);
+ else
+ mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
+
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ mbmi->interinter_comp.wedge_index =
+ aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR);
+ mbmi->interinter_comp.wedge_sign = aom_read_bit(r, ACCT_STR);
+ } else {
+ assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+ mbmi->interinter_comp.mask_type =
+ aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR);
+ }
+ }
+ }
+
+ read_mb_interp_filter(cm, xd, mbmi, r);
+
+ if (mbmi->motion_mode == WARPED_CAUSAL) {
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mbmi->wm_params.invalid = 0;
+
+ if (mbmi->num_proj_ref > 1)
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
+
+ if (find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params, mi_row, mi_col)) {
+#if WARPED_MOTION_DEBUG
+ printf("Warning: unexpected warped model from aomenc\n");
+#endif
+ mbmi->wm_params.invalid = 1;
+ }
+ }
+
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
+ xd->cfl.store_y = store_cfl_required(cm, xd);
+
+#if DEC_MISMATCH_DEBUG
+  dec_dump_logs(cm, mbmi, mi_row, mi_col, mode_ctx);
+#endif // DEC_MISMATCH_DEBUG
+}
+
+static void read_inter_frame_mode_info(AV1Decoder *const pbi,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, aom_reader *r) {
+ AV1_COMMON *const cm = &pbi->common;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int inter_block = 1;
+
+ mbmi->mv[0].as_int = 0;
+ mbmi->mv[1].as_int = 0;
+ mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 1, r);
+
+ mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
+
+ if (mbmi->skip_mode)
+ mbmi->skip = 1;
+ else
+ mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+
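+  // The segment id may be coded either before or after the skip flag; this
+  // second call handles the post-skip case.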
+ mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r);
+
+ read_cdef(cm, r, xd, mi_col, mi_row);
+
+ read_delta_q_params(cm, xd, mi_row, mi_col, r);
+
+ if (!mbmi->skip_mode)
+ inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+
+ mbmi->current_qindex = xd->current_qindex;
+
+ xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ if (inter_block)
+ read_inter_block_mode_info(pbi, xd, mbmi, mi_row, mi_col, r);
+ else
+ read_intra_block_mode_info(cm, mi_row, mi_col, xd, mbmi, r);
+}
+
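+// Intra-only frames carry no motion vectors, so mark every entry of the
+// (2x2-downsampled) frame MV buffer as unreferenced.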
+static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col,
+ int x_mis, int y_mis) {
+ const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
+ MV_REF *frame_mvs =
+ cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
+ x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
+ y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
+
+ for (int h = 0; h < y_mis; h++) {
+ MV_REF *mv = frame_mvs;
+ for (int w = 0; w < x_mis; w++) {
+ mv->ref_frame = NONE_FRAME;
+ mv++;
+ }
+ frame_mvs += frame_mvs_stride;
+ }
+}
+
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
+ int mi_col, aom_reader *r, int x_mis, int y_mis) {
+ AV1_COMMON *const cm = &pbi->common;
+ MB_MODE_INFO *const mi = xd->mi[0];
+ mi->use_intrabc = 0;
+
+ if (frame_is_intra_only(cm)) {
+ read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+ intra_copy_frame_mvs(cm, mi_row, mi_col, x_mis, y_mis);
+ } else {
+ read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
+ av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+ }
+}
diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h
new file mode 100644
index 000000000..1625e5bd2
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODEMV_H_
+#define AOM_AV1_DECODER_DECODEMV_H_
+
+#include "aom_dsp/bitreader.h"
+
+#include "av1/decoder/decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
+                        int mi_row, int mi_col, aom_reader *r, int x_mis,
+                        int y_mis);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
+ int blk_col, TX_SIZE tx_size, aom_reader *r);
+
+#endif // AOM_AV1_DECODER_DECODEMV_H_
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
new file mode 100644
index 000000000..a5f4fd67f
--- /dev/null
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_ports/aom_once.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/detokenize.h"
+#include "av1/decoder/obu.h"
+
+static void initialize_dec(void) {
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_wedge_masks();
+}
+
+static void dec_setup_mi(AV1_COMMON *cm) {
+ cm->mi = cm->mip;
+ cm->mi_grid_visible = cm->mi_grid_base;
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base));
+}
+
+static int av1_dec_alloc_mi(AV1_COMMON *cm, int mi_size) {
+ cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip) return 1;
+ cm->mi_alloc_size = mi_size;
+ cm->mi_grid_base =
+ (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
+ if (!cm->mi_grid_base) return 1;
+ return 0;
+}
+
+static void dec_free_mi(AV1_COMMON *cm) {
+ aom_free(cm->mip);
+ cm->mip = NULL;
+ aom_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+ cm->mi_alloc_size = 0;
+}
+
+AV1Decoder *av1_decoder_create(BufferPool *const pool) {
+ AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi));
+ AV1_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
+
+ if (!cm) return NULL;
+
+ av1_zero(*pbi);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error.jmp)) {
+ cm->error.setjmp = 0;
+ av1_decoder_remove(pbi);
+ return NULL;
+ }
+
+ cm->error.setjmp = 1;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)aom_memalign(
+ 32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
+
+ pbi->need_resync = 1;
+ aom_once(initialize_dec);
+
+ // Initialize the references to not point to any frame buffers.
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+ cm->current_video_frame = 0;
+ pbi->decoding_first_frame = 1;
+ pbi->common.buffer_pool = pool;
+
+ cm->seq_params.bit_depth = AOM_BITS_8;
+ cm->dequant_bit_depth = AOM_BITS_8;
+
+ cm->alloc_mi = av1_dec_alloc_mi;
+ cm->free_mi = dec_free_mi;
+ cm->setup_mi = dec_setup_mi;
+
+ av1_loop_filter_init(cm);
+
+ av1_qm_init(cm);
+ av1_loop_restoration_precal();
+#if CONFIG_ACCOUNTING
+ pbi->acct_enabled = 1;
+ aom_accounting_init(&pbi->accounting);
+#endif
+
+ cm->error.setjmp = 0;
+
+ aom_get_worker_interface()->init(&pbi->lf_worker);
+
+ return pbi;
+}
+
+void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) {
+ if (tile_mt_info != NULL) {
+#if CONFIG_MULTITHREAD
+ if (tile_mt_info->job_mutex != NULL) {
+ pthread_mutex_destroy(tile_mt_info->job_mutex);
+ aom_free(tile_mt_info->job_mutex);
+ }
+#endif
+ aom_free(tile_mt_info->job_queue);
+    // Clear the structure: this call may come from a resize, in which case it
+    // will be followed by an _alloc() that may fail.
+ av1_zero(*tile_mt_info);
+ }
+}
+
+void av1_dec_free_cb_buf(AV1Decoder *pbi) {
+ aom_free(pbi->cb_buffer_base);
+ pbi->cb_buffer_base = NULL;
+ pbi->cb_buffer_alloc_size = 0;
+}
+
+void av1_decoder_remove(AV1Decoder *pbi) {
+ int i;
+
+ if (!pbi) return;
+
+ // Free the tile list output buffer.
+ if (pbi->tile_list_output != NULL) aom_free(pbi->tile_list_output);
+ pbi->tile_list_output = NULL;
+
+ aom_get_worker_interface()->end(&pbi->lf_worker);
+ aom_free(pbi->lf_worker.data1);
+
+ if (pbi->thread_data) {
+ for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) {
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ av1_free_mc_tmp_buf(thread_data->td);
+ aom_free(thread_data->td);
+ }
+ aom_free(pbi->thread_data);
+ }
+
+ for (i = 0; i < pbi->num_workers; ++i) {
+ AVxWorker *const worker = &pbi->tile_workers[i];
+ aom_get_worker_interface()->end(worker);
+ }
+#if CONFIG_MULTITHREAD
+ if (pbi->row_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(pbi->row_mt_mutex_);
+ aom_free(pbi->row_mt_mutex_);
+ }
+ if (pbi->row_mt_cond_ != NULL) {
+ pthread_cond_destroy(pbi->row_mt_cond_);
+ aom_free(pbi->row_mt_cond_);
+ }
+#endif
+ for (i = 0; i < pbi->allocated_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ }
+ aom_free(pbi->tile_data);
+ aom_free(pbi->tile_workers);
+
+ if (pbi->num_workers > 0) {
+ av1_loop_filter_dealloc(&pbi->lf_row_sync);
+ av1_loop_restoration_dealloc(&pbi->lr_row_sync, pbi->num_workers);
+ av1_dealloc_dec_jobs(&pbi->tile_mt_info);
+ }
+
+ av1_dec_free_cb_buf(pbi);
+#if CONFIG_ACCOUNTING
+ aom_accounting_clear(&pbi->accounting);
+#endif
+ av1_free_mc_tmp_buf(&pbi->td);
+
+ aom_free(pbi);
+}
+
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
+ int mi_col, aom_reader *r, BLOCK_SIZE bsize,
+ palette_visitor_fn_t visit) {
+ if (!is_inter_block(xd->mi[0])) {
+ for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common));
+ ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y)) {
+ if (xd->mi[0]->palette_mode_info.palette_size[plane])
+ visit(xd, plane, r);
+ } else {
+ assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0);
+ }
+ }
+ }
+}
+
+static int equal_dimensions(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ return a->y_height == b->y_height && a->y_width == b->y_width &&
+ a->uv_height == b->uv_height && a->uv_width == b->uv_width;
+}
+
+aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx,
+ YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx);
+ if (cfg == NULL) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame");
+ return AOM_CODEC_ERROR;
+ }
+ if (!equal_dimensions(cfg, sd))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ aom_yv12_copy_frame(cfg, sd, num_planes);
+
+ return cm->error.error_code;
+}
+
+static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ return a->y_height == b->y_height && a->y_width == b->y_width &&
+ a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+ a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+ a->border == b->border &&
+ (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+ (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
+ int use_external_ref,
+ YV12_BUFFER_CONFIG *sd) {
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *ref_buf = NULL;
+
+ // Get the destination reference buffer.
+ ref_buf = get_ref_frame(cm, idx);
+
+ if (ref_buf == NULL) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame");
+ return AOM_CODEC_ERROR;
+ }
+
+ if (!use_external_ref) {
+ if (!equal_dimensions(ref_buf, sd)) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ } else {
+ // Overwrite the reference frame buffer.
+ aom_yv12_copy_frame(sd, ref_buf, num_planes);
+ }
+ } else {
+ if (!equal_dimensions_and_border(ref_buf, sd)) {
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ } else {
+ // Overwrite the reference frame buffer pointers.
+ // Once we no longer need the external reference buffer, these pointers
+ // are restored.
+ ref_buf->store_buf_adr[0] = ref_buf->y_buffer;
+ ref_buf->store_buf_adr[1] = ref_buf->u_buffer;
+ ref_buf->store_buf_adr[2] = ref_buf->v_buffer;
+ ref_buf->y_buffer = sd->y_buffer;
+ ref_buf->u_buffer = sd->u_buffer;
+ ref_buf->v_buffer = sd->v_buffer;
+ ref_buf->use_external_reference_buffers = 1;
+ }
+ }
+
+ return cm->error.error_code;
+}
+
+aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd) {
+ const int num_planes = av1_num_planes(cm);
+
+ if (!equal_dimensions_and_border(new_frame, sd))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ aom_yv12_copy_frame(new_frame, sd, num_planes);
+
+ return cm->error.error_code;
+}
+
+/* If any buffer updating is signaled it should be done here.
+ Consumes a reference to cm->new_fb_idx.
+*/
+static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
+ int ref_index = 0, mask;
+ AV1_COMMON *const cm = &pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+ if (frame_decoded) {
+ lock_buffer_pool(pool);
+
+ // In ext-tile decoding, the camera frame header is only decoded once. So,
+ // we don't release the references here.
+ if (!pbi->camera_frame_header_ready) {
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+        // The current thread releases its hold on the reference frame.
+ decrease_ref_count(old_idx, frame_bufs, pool);
+
+        // Release the hold kept in the reference map when this slot is
+        // refreshed for decoding the next frame.
+ if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ ++ref_index;
+ }
+
+      // The current thread releases its hold on the reference frame.
+ const int check_on_show_existing_frame =
+ !cm->show_existing_frame || cm->reset_decoder_state;
+ for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
+ ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ }
+ }
+
+ YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
+
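+    // Queue the newly decoded frame for output, or drop it if it is not shown.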
+ if (cm->show_existing_frame || cm->show_frame) {
+ if (pbi->output_all_layers) {
+ // Append this frame to the output queue
+ if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) {
+ // We can't store the new frame anywhere, so drop it and return an
+ // error
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ } else {
+ pbi->output_frames[pbi->num_output_frames] = cur_frame;
+ pbi->output_frame_index[pbi->num_output_frames] = cm->new_fb_idx;
+ pbi->num_output_frames++;
+ }
+ } else {
+ // Replace any existing output frame
+ assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1);
+ if (pbi->num_output_frames > 0) {
+ decrease_ref_count((int)pbi->output_frame_index[0], frame_bufs, pool);
+ }
+ pbi->output_frames[0] = cur_frame;
+ pbi->output_frame_index[0] = cm->new_fb_idx;
+ pbi->num_output_frames = 1;
+ }
+ } else {
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ }
+
+ unlock_buffer_pool(pool);
+ } else {
+ // Nothing was decoded, so just drop this frame buffer
+ lock_buffer_pool(pool);
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ }
+
+ if (!pbi->camera_frame_header_ready) {
+ pbi->hold_ref_buf = 0;
+
+ // Invalidate these references until the next frame starts.
+ for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
+ cm->frame_refs[ref_index].idx = INVALID_IDX;
+ cm->frame_refs[ref_index].buf = NULL;
+ }
+ }
+}
+
+int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
+ const uint8_t **psource) {
+ AV1_COMMON *volatile const cm = &pbi->common;
+ BufferPool *volatile const pool = cm->buffer_pool;
+ RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
+ const uint8_t *source = *psource;
+ cm->error.error_code = AOM_CODEC_OK;
+
+ if (size == 0) {
+ // This is used to signal that we are missing frames.
+    // We do not know if the missing frame(s) were supposed to update
+    // any of the reference buffers, but we act conservatively and
+    // mark only the last buffer as corrupted.
+ //
+ // TODO(jkoleszar): Error concealment is undefined and non-normative
+ // at this point, but if it becomes so, [0] may not always be the correct
+ // thing to do here.
+ if (cm->frame_refs[0].idx > 0) {
+ assert(cm->frame_refs[0].buf != NULL);
+ cm->frame_refs[0].buf->corrupted = 1;
+ }
+ }
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+
+  // Find a free frame buffer. Return an error if none can be found.
+ cm->new_fb_idx = get_free_fb(cm);
+ if (cm->new_fb_idx == INVALID_IDX) {
+ cm->error.error_code = AOM_CODEC_MEM_ERROR;
+ return 1;
+ }
+
+ // Assign a MV array to the frame buffer.
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+ if (!pbi->camera_frame_header_ready) pbi->hold_ref_buf = 0;
+
+ pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error.jmp)) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int i;
+
+ cm->error.setjmp = 0;
+
+ // Synchronize all threads immediately as a subsequent decode call may
+ // cause a resize invalidating some allocations.
+ winterface->sync(&pbi->lf_worker);
+ for (i = 0; i < pbi->num_workers; ++i) {
+ winterface->sync(&pbi->tile_workers[i]);
+ }
+
+ lock_buffer_pool(pool);
+ // Release all the reference buffers if worker thread is holding them.
+ if (pbi->hold_ref_buf == 1) {
+ int ref_index = 0, mask;
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+        // The current thread releases its hold on the reference frame.
+ decrease_ref_count(old_idx, frame_bufs, pool);
+
+        // Release the hold kept in the reference map when this slot is
+        // refreshed for decoding the next frame.
+ if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
+ ++ref_index;
+ }
+
+      // The current thread releases its hold on the reference frame.
+ const int check_on_show_existing_frame =
+ !cm->show_existing_frame || cm->reset_decoder_state;
+ for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
+ ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ }
+ pbi->hold_ref_buf = 0;
+ }
+ // Release current frame.
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+
+ aom_clear_system_state();
+ return -1;
+ }
+
+ cm->error.setjmp = 1;
+
+ int frame_decoded =
+ aom_decode_frame_from_obus(pbi, source, source + size, psource);
+
+ if (cm->error.error_code != AOM_CODEC_OK) {
+ lock_buffer_pool(pool);
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ cm->error.setjmp = 0;
+ return 1;
+ }
+
+#if TXCOEFF_TIMER
+ cm->cum_txcoeff_timer += cm->txcoeff_timer;
+ fprintf(stderr,
+ "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n",
+ cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer);
+ cm->txcoeff_timer = 0;
+ cm->txb_count = 0;
+#endif
+
+ // Note: At this point, this function holds a reference to cm->new_fb_idx
+ // in the buffer pool. This reference is consumed by swap_frame_buffers().
+ swap_frame_buffers(pbi, frame_decoded);
+
+ if (frame_decoded) {
+ pbi->decoding_first_frame = 0;
+ }
+
+ if (cm->error.error_code != AOM_CODEC_OK) {
+ cm->error.setjmp = 0;
+ return 1;
+ }
+
+ aom_clear_system_state();
+
+ if (!cm->show_existing_frame) {
+ cm->last_show_frame = cm->show_frame;
+
+ if (cm->seg.enabled) {
+ if (cm->prev_frame && (cm->mi_rows == cm->prev_frame->mi_rows) &&
+ (cm->mi_cols == cm->prev_frame->mi_cols)) {
+ cm->last_frame_seg_map = cm->prev_frame->seg_map;
+ } else {
+ cm->last_frame_seg_map = NULL;
+ }
+ }
+ }
+
+ // Update progress in frame parallel decode.
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+ cm->last_tile_cols = cm->tile_cols;
+ cm->last_tile_rows = cm->tile_rows;
+ cm->error.setjmp = 0;
+
+ return 0;
+}
+
+// Get the frame at a particular index in the output queue
+int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
+ aom_film_grain_t **grain_params) {
+ RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
+
+ if (index >= pbi->num_output_frames) return -1;
+ *sd = pbi->output_frames[index];
+ *grain_params = &frame_bufs[pbi->output_frame_index[index]].film_grain_params;
+ aom_clear_system_state();
+ return 0;
+}
+
+// Get the highest-spatial-layer output
+// TODO(david.barker): What should this do?
+int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
+ if (pbi->num_output_frames == 0) return -1;
+
+ *frame = *pbi->output_frames[pbi->num_output_frames - 1];
+ return 0;
+}
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
new file mode 100644
index 000000000..5ca939c24
--- /dev/null
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODER_H_
+#define AOM_AV1_DECODER_DECODER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/thread_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/decoder/dthread.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+#if CONFIG_INSPECTION
+#include "av1/decoder/inspection.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,
+ const int row, const int col,
+ const TX_SIZE tx_size);
+
+typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd);
+
+typedef struct ThreadData {
+ aom_reader *bit_reader;
+ DECLARE_ALIGNED(32, MACROBLOCKD, xd);
+ /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+ CB_BUFFER cb_buffer_base;
+ uint8_t *mc_buf[2];
+ int32_t mc_buf_size;
+ int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in
+ // mc_buf were converted from highbd pointers.
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+
+ decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit;
+ decode_block_visitor_fn_t predict_and_recon_intra_block_visit;
+ decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit;
+ decode_block_visitor_fn_t inverse_tx_inter_block_visit;
+ predict_inter_block_visitor_fn_t predict_inter_block_visit;
+ cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit;
+} ThreadData;
+
+typedef struct AV1DecRowMTJobInfo {
+ int tile_row;
+ int tile_col;
+ int mi_row;
+} AV1DecRowMTJobInfo;
+
+typedef struct AV1DecRowMTSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ int allocated_sb_rows;
+ int *cur_sb_col;
+ int sync_range;
+ int mi_rows;
+ int mi_cols;
+ int mi_rows_parse_done;
+ int mi_rows_decode_started;
+ int num_threads_working;
+} AV1DecRowMTSync;
+
+typedef struct AV1DecRowMTInfo {
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int start_tile;
+ int end_tile;
+ int mi_rows_parse_done;
+ int mi_rows_decode_started;
+ int mi_rows_to_decode;
+ int row_mt_exit;
+} AV1DecRowMTInfo;
+
+typedef struct TileDataDec {
+ TileInfo tile_info;
+ aom_reader bit_reader;
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+ AV1DecRowMTSync dec_row_mt_sync;
+} TileDataDec;
+
+typedef struct TileBufferDec {
+ const uint8_t *data;
+ size_t size;
+} TileBufferDec;
+
+typedef struct DataBuffer {
+ const uint8_t *data;
+ size_t size;
+} DataBuffer;
+
+typedef struct EXTERNAL_REFERENCES {
+ YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES];
+ int num;
+} EXTERNAL_REFERENCES;
+
+typedef struct TileJobsDec {
+ TileBufferDec *tile_buffer;
+ TileDataDec *tile_data;
+} TileJobsDec;
+
+typedef struct AV1DecTileMTData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex;
+#endif
+ TileJobsDec *job_queue;
+ int jobs_enqueued;
+ int jobs_dequeued;
+ int alloc_tile_rows;
+ int alloc_tile_cols;
+} AV1DecTileMT;
+
+typedef struct AV1Decoder {
+ DECLARE_ALIGNED(32, MACROBLOCKD, mb);
+
+ DECLARE_ALIGNED(32, AV1_COMMON, common);
+
+ int refresh_frame_flags;
+
+ // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+ // the same.
+ RefCntBuffer *cur_buf; // Current decoding frame buffer.
+
+ AVxWorker *frame_worker_owner; // frame_worker that owns this pbi.
+ AVxWorker lf_worker;
+ AV1LfSync lf_row_sync;
+ AV1LrSync lr_row_sync;
+ AV1LrStruct lr_ctxt;
+ AVxWorker *tile_workers;
+ int num_workers;
+ DecWorkerData *thread_data;
+ ThreadData td;
+ TileDataDec *tile_data;
+ int allocated_tiles;
+
+ TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+ AV1DecTileMT tile_mt_info;
+
+ // Each time the decoder is called, we expect to receive a full temporal unit.
+ // This can contain up to one shown frame per spatial layer in the current
+ // operating point (note that some layers may be entirely omitted).
+ // If the 'output_all_layers' option is true, we save all of these shown
+ // frames so that they can be returned to the application. If the
+ // 'output_all_layers' option is false, then we only output one image per
+ // temporal unit.
+ //
+ // Note: The saved buffers are released at the start of the next time the
+ // application calls aom_codec_decode().
+ int output_all_layers;
+ YV12_BUFFER_CONFIG *output_frames[MAX_NUM_SPATIAL_LAYERS];
+ size_t output_frame_index[MAX_NUM_SPATIAL_LAYERS]; // Buffer pool indices
+ size_t num_output_frames; // How many frames are queued up so far?
+
+ // In order to properly support random-access decoding, we need
+ // to behave slightly differently for the very first frame we decode.
+ // So we track whether this is the first frame or not.
+ int decoding_first_frame;
+
+ int allow_lowbitdepth;
+ int max_threads;
+ int inv_tile_order;
+ int need_resync; // wait for key/intra-only frame.
+ int hold_ref_buf; // hold the reference buffer.
+
+ int tile_size_bytes;
+ int tile_col_size_bytes;
+ int dec_tile_row, dec_tile_col; // always -1 for non-VR tile encoding
+#if CONFIG_ACCOUNTING
+ int acct_enabled;
+ Accounting accounting;
+#endif
+ int tg_size; // Number of tiles in the current tilegroup
+ int tg_start; // First tile in the current tilegroup
+ int tg_size_bit_offset;
+ int sequence_header_ready;
+ int sequence_header_changed;
+#if CONFIG_INSPECTION
+ aom_inspect_cb inspect_cb;
+ void *inspect_ctx;
+#endif
+ int operating_point;
+ int current_operating_point;
+ int seen_frame_header;
+
+  // Whether the camera frame header has already been decoded when
+  // large_scale_tile = 1.
+ int camera_frame_header_ready;
+ size_t frame_header_size;
+ DataBuffer obu_size_hdr;
+ int output_frame_width_in_tiles_minus_1;
+ int output_frame_height_in_tiles_minus_1;
+ int tile_count_minus_1;
+ uint32_t coded_tile_data_size;
+ unsigned int ext_tile_debug; // for ext-tile software debug & testing
+ unsigned int row_mt;
+ EXTERNAL_REFERENCES ext_refs;
+ size_t tile_list_size;
+ uint8_t *tile_list_output;
+ size_t buffer_sz;
+
+ CB_BUFFER *cb_buffer_base;
+ int cb_buffer_alloc_size;
+
+ int allocated_row_mt_sync_rows;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *row_mt_mutex_;
+ pthread_cond_t *row_mt_cond_;
+#endif
+
+ AV1DecRowMTInfo frame_row_mt_info;
+} AV1Decoder;
+
+// Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error
+// code and returns a nonzero value on failure.
+int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
+ const uint8_t **dest);
+
+// Get the frame at a particular index in the output queue
+int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
+ aom_film_grain_t **grain_params);
+
+int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx,
+ YV12_BUFFER_CONFIG *sd);
+
+aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
+ int use_external_ref,
+ YV12_BUFFER_CONFIG *sd);
+aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd);
+
+struct AV1Decoder *av1_decoder_create(BufferPool *const pool);
+
+void av1_decoder_remove(struct AV1Decoder *pbi);
+void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_jobs_sync);
+
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync);
+
+void av1_dec_free_cb_buf(AV1Decoder *pbi);
+
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+ BufferPool *const pool) {
+ if (idx >= 0) {
+ --frame_bufs[idx].ref_count;
+    // A worker may only get a free framebuffer index when calling get_free_fb,
+    // but the private buffer is not set up until the frame header has been
+    // decoded. So if an error happens while decoding the header, the
+    // frame_bufs entry will not have a valid priv buffer.
+ if (frame_bufs[idx].ref_count == 0 &&
+ frame_bufs[idx].raw_frame_buffer.priv) {
+ pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+ }
+ }
+}
+
+#define ACCT_STR __func__
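+// Decode a value in [0, n) with a near-uniform code: the first m = (1 << l) - n
+// symbols use l - 1 bits, the remaining ones use l bits.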
+static INLINE int av1_read_uniform(aom_reader *r, int n) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ const int v = aom_read_literal(r, l - 1, ACCT_STR);
+ assert(l != 0);
+ if (v < m)
+ return v;
+ else
+ return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
+}
+
+typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane,
+ aom_reader *r);
+
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
+ int mi_col, aom_reader *r, BLOCK_SIZE bsize,
+ palette_visitor_fn_t visit);
+
+typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DECODER_H_
diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c
new file mode 100644
index 000000000..f3ef2d55e
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodetxb.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/decoder/decodetxb.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/idct.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "av1/decoder/decodemv.h"
+
+#define ACCT_STR __func__
+
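+// Read an Exp-Golomb coded value (used for coefficient levels beyond the base
+// range). The prefix length is capped to guard against corrupt bitstreams.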
+static int read_golomb(MACROBLOCKD *xd, aom_reader *r) {
+ int x = 1;
+ int length = 0;
+ int i = 0;
+
+ while (!i) {
+ i = aom_read_bit(r, ACCT_STR);
+ ++length;
+ if (length > 20) {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid length in read_golomb");
+ break;
+ }
+ }
+
+ for (i = 0; i < length - 1; ++i) {
+ x <<= 1;
+ x += aom_read_bit(r, ACCT_STR);
+ }
+
+ return x - 1;
+}
+
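+// Reconstruct the end-of-block position from its group token and extra offset.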
+static INLINE int rec_eob_pos(const int eob_token, const int extra) {
+ int eob = k_eob_group_start[eob_token];
+ if (eob > 2) {
+ eob += extra;
+ }
+ return eob;
+}
+
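+// Return the dequantizer for a coefficient, applying the quantization-matrix
+// weight when one is present.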
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+ const qm_val_t *iqmatrix) {
+ int dqv = dequant[!!coeff_idx];
+ if (iqmatrix != NULL)
+ dqv =
+ ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ return dqv;
+}
+
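+// Decode coefficient base levels (and any extra range increments) in reverse
+// scan order; the _2d variant uses contexts specialized to 2D transforms.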
+static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size,
+ int start_si, int end_si,
+ const int16_t *scan, int bwl,
+ uint8_t *levels,
+ base_cdf_arr base_cdf,
+ br_cdf_arr br_cdf) {
+ for (int c = end_si; c >= start_si; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bwl, tx_size);
+ const int nsymbs = 4;
+ int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR);
+ if (level > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx_2d(levels, pos, bwl);
+ aom_cdf_prob *cdf = br_cdf[br_ctx];
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
+ level += k;
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ levels[get_padded_idx(pos, bwl)] = level;
+ }
+}
+
+static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size,
+ TX_CLASS tx_class, int start_si,
+ int end_si, const int16_t *scan, int bwl,
+ uint8_t *levels, base_cdf_arr base_cdf,
+ br_cdf_arr br_cdf) {
+ for (int c = end_si; c >= start_si; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, pos, bwl, tx_size, tx_class);
+ const int nsymbs = 4;
+ int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR);
+ if (level > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ aom_cdf_prob *cdf = br_cdf[br_ctx];
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
+ level += k;
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ levels[get_padded_idx(pos, bwl)] = level;
+ }
+}
+
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *const r, const int blk_row,
+ const int blk_col, const int plane,
+ const TXB_CTX *const txb_ctx,
+ const TX_SIZE tx_size) {
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ const int32_t max_value = (1 << (7 + xd->bd)) - 1;
+ const int32_t min_value = -(1 << (7 + xd->bd));
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
+ tran_low_t *const tcoeffs = pd->dqcoeff_block + xd->cb_offset[plane];
+ const int shift = av1_get_tx_scale(tx_size);
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ int cul_level = 0;
+ int dc_val = 0;
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
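+  // txb_skip: when set, the transform block is all zero and nothing more is
+  // coded for it (the luma tx_type then defaults to DCT_DCT).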
+ const int all_zero = aom_read_symbol(
+ r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR);
+ eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+ uint16_t *const eob = &(eob_data->eob);
+ uint16_t *const max_scan_line = &(eob_data->max_scan_line);
+ *max_scan_line = 0;
+ *eob = 0;
+ if (all_zero) {
+ *max_scan_line = 0;
+ if (plane == 0) {
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+ mbmi->txk_type[txk_type_idx] = DCT_DCT;
+ }
+ return 0;
+ }
+
+ memset(levels_buf, 0,
+ sizeof(*levels_buf) *
+ ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
+ if (plane == AOM_PLANE_Y) {
+    // Only the Y plane's tx_type is transmitted.
+ av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
+ }
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+ const qm_val_t *iqmatrix =
+ IS_2D_TRANSFORM(tx_type)
+ ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
+ : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ int eob_extra = 0;
+ int eob_pt = 1;
+
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
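+  // Read the EOB group token; the CDF depends on the transform size, the
+  // plane type, and the transform class.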
+ switch (eob_multi_size) {
+ case 0:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx],
+ 5, ACCT_STR) +
+ 1;
+ break;
+ case 1:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx],
+ 6, ACCT_STR) +
+ 1;
+ break;
+ case 2:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx],
+ 7, ACCT_STR) +
+ 1;
+ break;
+ case 3:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx],
+ 8, ACCT_STR) +
+ 1;
+ break;
+ case 4:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx],
+ 9, ACCT_STR) +
+ 1;
+ break;
+ case 5:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx],
+ 10, ACCT_STR) +
+ 1;
+ break;
+ case 6:
+ default:
+ eob_pt = aom_read_symbol(
+ r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11,
+ ACCT_STR) +
+ 1;
+ break;
+ }
+
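+  // Read any extra offset bits that refine the position within the EOB group.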
+ if (k_eob_offset_bits[eob_pt] > 0) {
+ const int eob_ctx = eob_pt - 3;
+ int bit = aom_read_symbol(
+ r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR);
+ if (bit) {
+ eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1));
+ }
+
+ for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
+ bit = aom_read_bit(r, ACCT_STR);
+ if (bit) {
+ eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1 - i));
+ }
+ }
+ }
+ *eob = rec_eob_pos(eob_pt, eob_extra);
+
+ {
+ // Read the non-zero coefficient with scan index eob-1
+ // TODO(angiebird): Put this into a function
+ const int c = *eob - 1;
+ const int pos = scan[c];
+ const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, c);
+ const int nsymbs = 3;
+ aom_cdf_prob *cdf =
+ ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
+ int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1;
+ if (level > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = aom_read_symbol(
+ r,
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
+ BR_CDF_SIZE, ACCT_STR);
+ level += k;
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ levels[get_padded_idx(pos, bwl)] = level;
+ }
+ if (*eob > 1) {
+ base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type];
+ br_cdf_arr br_cdf =
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type];
+ if (tx_class == TX_CLASS_2D) {
+ read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bwl, levels,
+ base_cdf, br_cdf);
+ read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bwl, levels,
+ base_cdf, br_cdf);
+ } else {
+ read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bwl,
+ levels, base_cdf, br_cdf);
+ }
+ }
+
+ int16_t num_zero_coeffs = 0;
+ for (int c = 0; c < *eob; ++c) {
+ const int pos = scan[c];
+ num_zero_coeffs = AOMMAX(num_zero_coeffs, pos);
+ }
+ memset(tcoeffs, 0, (num_zero_coeffs + 1) * sizeof(tcoeffs[0]));
+
+ for (int c = 0; c < *eob; ++c) {
+ const int pos = scan[c];
+ uint8_t sign;
+ tran_low_t level = levels[get_padded_idx(pos, bwl)];
+ if (level) {
+ *max_scan_line = AOMMAX(*max_scan_line, pos);
+ if (c == 0) {
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+ 2, ACCT_STR);
+ } else {
+ sign = aom_read_bit(r, ACCT_STR);
+ }
+ if (level >= MAX_BASE_BR_RANGE) {
+ level += read_golomb(xd, r);
+ }
+
+ if (c == 0) dc_val = sign ? -level : level;
+
+ // Bitmasking to clamp level to valid range:
+      // The valid range for 8/10/12 bit video is at most 14/16/18 bit
+ level &= 0xfffff;
+ cul_level += level;
+ tran_low_t dq_coeff;
+ // Bitmasking to clamp dq_coeff to valid range:
+ // The valid range for 8/10/12 bit video is at most 17/19/21 bit
+ dq_coeff = (tran_low_t)(
+ (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff);
+ dq_coeff = dq_coeff >> shift;
+ if (sign) {
+ dq_coeff = -dq_coeff;
+ }
+ tcoeffs[pos] = clamp(dq_coeff, min_value, max_value);
+ }
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+
+ // DC value
+ set_dc_sign(&cul_level, dc_val);
+
+ return cul_level;
+}
+
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, aom_reader *const r,
+ const int plane, const int row, const int col,
+ const TX_SIZE tx_size) {
+#if TXCOEFF_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + col,
+ pd->left_context + row, &txb_ctx);
+ const uint8_t cul_level =
+ av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size);
+ av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row);
+
+ if (is_inter_block(mbmi)) {
+ PLANE_TYPE plane_type = get_plane_type(plane);
+    // tx_type has already been read inside av1_read_coeffs_txb above.
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, row, col, tx_size,
+ cm->reduced_tx_set_used);
+
+ if (plane == 0)
+ update_txk_array(mbmi->txk_type, mbmi->sb_type, row, col, tx_size,
+ tx_type);
+ }
+
+#if TXCOEFF_TIMER
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ cm->txcoeff_timer += elapsed_time;
+ ++cm->txb_count;
+#endif
+}
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
new file mode 100644
index 000000000..fe04f6abd
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODETXB_H_
+#define AOM_AV1_DECODER_DECODETXB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/bitreader.h"
+
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *const r, const int blk_row,
+ const int blk_col, const int plane,
+ const TXB_CTX *const txb_ctx,
+ const TX_SIZE tx_size);
+
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, aom_reader *const r,
+ const int plane, const int row, const int col,
+ const TX_SIZE tx_size);
+#endif // AOM_AV1_DECODER_DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/detokenize.c b/third_party/aom/av1/decoder/detokenize.c
new file mode 100644
index 000000000..9d54bd13d
--- /dev/null
+++ b/third_party/aom/av1/decoder/detokenize.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/decoder/detokenize.h"
+
+#define ACCT_STR __func__
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/idct.h"
+
+static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
+ uint8_t color_order[PALETTE_MAX_SIZE];
+ const int n = param->n_colors;
+ uint8_t *const color_map = param->color_map;
+ MapCdf color_map_cdf = param->map_cdf;
+ int plane_block_width = param->plane_width;
+ int plane_block_height = param->plane_height;
+ int rows = param->rows;
+ int cols = param->cols;
+
+ // The first color index.
+ color_map[0] = av1_read_uniform(r, n);
+ assert(color_map[0] < n);
+
+  // Decode the palette map indices in wavefront (anti-diagonal) order.
+ for (int i = 1; i < rows + cols - 1; ++i) {
+ for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) {
+ const int color_ctx = av1_get_palette_color_index_context(
+ color_map, plane_block_width, (i - j), j, n, color_order, NULL);
+ const int color_idx = aom_read_symbol(
+ r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR);
+ assert(color_idx >= 0 && color_idx < n);
+ color_map[(i - j) * plane_block_width + j] = color_order[color_idx];
+ }
+ }
+ // Copy last column to extra columns.
+ if (cols < plane_block_width) {
+ for (int i = 0; i < rows; ++i) {
+ memset(color_map + i * plane_block_width + cols,
+ color_map[i * plane_block_width + cols - 1],
+ (plane_block_width - cols));
+ }
+ }
+ // Copy last row to extra rows.
+ for (int i = rows; i < plane_block_height; ++i) {
+ memcpy(color_map + i * plane_block_width,
+ color_map + (rows - 1) * plane_block_width, plane_block_width);
+ }
+}
+
+void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
+ aom_reader *r) {
+ assert(plane == 0 || plane == 1);
+ Av1ColorMapParam params;
+ params.color_map =
+ xd->plane[plane].color_index_map + xd->color_index_map_offset[plane];
+ params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+ : xd->tile_ctx->palette_y_color_index_cdf;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ params.n_colors = mbmi->palette_mode_info.palette_size[plane];
+ av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+ &params.plane_height, &params.rows, &params.cols);
+ decode_color_map_tokens(&params, r);
+}
diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h
new file mode 100644
index 000000000..173b437a9
--- /dev/null
+++ b/third_party/aom/av1/decoder/detokenize.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DETOKENIZE_H_
+#define AOM_AV1_DECODER_DETOKENIZE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/scan.h"
+#include "av1/decoder/decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_DECODER_DETOKENIZE_H_
diff --git a/third_party/aom/av1/decoder/dthread.c b/third_party/aom/av1/decoder/dthread.c
new file mode 100644
index 000000000..3946c787a
--- /dev/null
+++ b/third_party/aom/av1/decoder/dthread.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+#include "av1/common/reconinter.h"
+#include "av1/decoder/dthread.h"
+#include "av1/decoder/decoder.h"
+
+// #define DEBUG_THREAD
+
+// TODO(hkuang): Clean up all the #ifdef in this file.
+void av1_frameworker_lock_stats(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const worker_data = worker->data1;
+ pthread_mutex_lock(&worker_data->stats_mutex);
+#else
+ (void)worker;
+#endif
+}
+
+void av1_frameworker_unlock_stats(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const worker_data = worker->data1;
+ pthread_mutex_unlock(&worker_data->stats_mutex);
+#else
+ (void)worker;
+#endif
+}
+
+void av1_frameworker_signal_stats(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const worker_data = worker->data1;
+
+// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+ pthread_cond_signal(&worker_data->stats_cond);
+#else
+ pthread_cond_broadcast(&worker_data->stats_cond);
+#endif
+
+#else
+ (void)worker;
+#endif
+}
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
+
+// TODO(hkuang): Remove worker parameter as it is only used in debug code.
+void av1_frameworker_wait(AVxWorker *const worker, RefCntBuffer *const ref_buf,
+ int row) {
+#if CONFIG_MULTITHREAD
+ if (!ref_buf) return;
+
+#ifndef BUILDING_WITH_TSAN
+ // The following check triggers a harmless tsan data-race report, but it is
+ // the key to getting the best performance.
+ if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#endif
+
+ {
+ // Find the worker thread that owns the reference frame. If the reference
+ // frame has been fully decoded, it may not have an owner.
+ AVxWorker *const ref_worker = ref_buf->frame_worker_owner;
+ FrameWorkerData *const ref_worker_data =
+ (FrameWorkerData *)ref_worker->data1;
+ const AV1Decoder *const pbi = ref_worker_data->pbi;
+
+#ifdef DEBUG_THREAD
+ {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n",
+ worker_data->worker_id, worker, ref_worker_data->worker_id,
+ ref_buf->frame_worker_owner, row, ref_buf->row);
+ }
+#endif
+
+ av1_frameworker_lock_stats(ref_worker);
+ while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
+ ref_buf->buf.corrupted != 1) {
+ pthread_cond_wait(&ref_worker_data->stats_cond,
+ &ref_worker_data->stats_mutex);
+ }
+
+ if (ref_buf->buf.corrupted == 1) {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ av1_frameworker_unlock_stats(ref_worker);
+ aom_internal_error(&worker_data->pbi->common.error,
+ AOM_CODEC_CORRUPT_FRAME,
+ "Worker %p failed to decode frame", worker);
+ }
+ av1_frameworker_unlock_stats(ref_worker);
+ }
+#else
+ (void)worker;
+ (void)ref_buf;
+ (void)row;
+#endif // CONFIG_MULTITHREAD
+}
+
+void av1_frameworker_broadcast(RefCntBuffer *const buf, int row) {
+#if CONFIG_MULTITHREAD
+ AVxWorker *worker = buf->frame_worker_owner;
+
+#ifdef DEBUG_THREAD
+ {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
+ buf->frame_worker_owner, row);
+ }
+#endif
+
+ av1_frameworker_lock_stats(worker);
+ buf->row = row;
+ av1_frameworker_signal_stats(worker);
+ av1_frameworker_unlock_stats(worker);
+#else
+ (void)buf;
+ (void)row;
+#endif // CONFIG_MULTITHREAD
+}
+
+void av1_frameworker_copy_context(AVxWorker *const dst_worker,
+ AVxWorker *const src_worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+ FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+ AV1_COMMON *const src_cm = &src_worker_data->pbi->common;
+ AV1_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+ int i;
+
+ // Wait until source frame's context is ready.
+ av1_frameworker_lock_stats(src_worker);
+ while (!src_worker_data->frame_context_ready) {
+ pthread_cond_wait(&src_worker_data->stats_cond,
+ &src_worker_data->stats_mutex);
+ }
+
+ dst_cm->last_frame_seg_map = src_cm->seg.enabled
+ ? src_cm->current_frame_seg_map
+ : src_cm->last_frame_seg_map;
+ dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
+ av1_frameworker_unlock_stats(src_worker);
+
+ dst_cm->seq_params.bit_depth = src_cm->seq_params.bit_depth;
+ dst_cm->seq_params.use_highbitdepth = src_cm->seq_params.use_highbitdepth;
+ // TODO(zoeliu): To handle parallel decoding
+ dst_cm->prev_frame =
+ src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame;
+ dst_cm->last_width =
+ !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width;
+ dst_cm->last_height =
+ !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height;
+ dst_cm->seq_params.subsampling_x = src_cm->seq_params.subsampling_x;
+ dst_cm->seq_params.subsampling_y = src_cm->seq_params.subsampling_y;
+ dst_cm->frame_type = src_cm->frame_type;
+ dst_cm->last_show_frame = !src_cm->show_existing_frame
+ ? src_cm->show_frame
+ : src_cm->last_show_frame;
+ for (i = 0; i < REF_FRAMES; ++i)
+ dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
+
+ memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+ (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+ dst_cm->lf.sharpness_level = src_cm->lf.sharpness_level;
+ dst_cm->lf.filter_level[0] = src_cm->lf.filter_level[0];
+ dst_cm->lf.filter_level[1] = src_cm->lf.filter_level[1];
+ memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, REF_FRAMES);
+ memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+ dst_cm->seg = src_cm->seg;
+ memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+ FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+ (void)dst_worker;
+ (void)src_worker;
+#endif // CONFIG_MULTITHREAD
+}
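av1_frameworker_wait() and av1_frameworker_broadcast() above implement a simple row-progress monitor: the owning worker publishes how many rows of the reference frame are decoded and wakes waiters, while consumers sleep on the condition variable until enough rows are available. A stripped-down sketch of the same pattern (hypothetical names, not the aom API):

  #include <pthread.h>

  typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    int row;  /* last fully decoded row of the reference frame */
  } RowProgress;

  static void wait_for_row(RowProgress *p, int row) {
    pthread_mutex_lock(&p->mutex);
    while (p->row < row) pthread_cond_wait(&p->cond, &p->mutex);
    pthread_mutex_unlock(&p->mutex);
  }

  static void publish_row(RowProgress *p, int row) {
    pthread_mutex_lock(&p->mutex);
    p->row = row;                      /* record progress */
    pthread_cond_broadcast(&p->cond);  /* wake every waiting worker */
    pthread_mutex_unlock(&p->mutex);
  }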
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
new file mode 100644
index 000000000..1d264b07e
--- /dev/null
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DTHREAD_H_
+#define AOM_AV1_DECODER_DTHREAD_H_
+
+#include "config/aom_config.h"
+
+#include "aom_util/aom_thread.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct AV1Decoder;
+struct ThreadData;
+
+typedef struct DecWorkerData {
+ struct ThreadData *td;
+ const uint8_t *data_end;
+ struct aom_internal_error_info error_info;
+} DecWorkerData;
+
+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+typedef struct FrameWorkerData {
+ struct AV1Decoder *pbi;
+ const uint8_t *data;
+ const uint8_t *data_end;
+ size_t data_size;
+ void *user_priv;
+ int worker_id;
+ int received_frame;
+
+ // scratch_buffer is used in frame parallel mode only.
+ // It is used to make a copy of the compressed data.
+ uint8_t *scratch_buffer;
+ size_t scratch_buffer_size;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t stats_mutex;
+ pthread_cond_t stats_cond;
+#endif
+
+ int frame_context_ready; // Current frame's context is ready to read.
+ int frame_decoded; // Finished decoding current frame.
+} FrameWorkerData;
+
+void av1_frameworker_lock_stats(AVxWorker *const worker);
+void av1_frameworker_unlock_stats(AVxWorker *const worker);
+void av1_frameworker_signal_stats(AVxWorker *const worker);
+
+// Wait until ref_buf has been decoded up to row (in real pixel units).
+// Note: the worker may have already finished decoding ref_buf and released it
+// in order to start decoding the next frame, so check whether the worker is
+// still decoding ref_buf.
+void av1_frameworker_wait(AVxWorker *const worker, RefCntBuffer *const ref_buf,
+ int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void av1_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void av1_frameworker_copy_context(AVxWorker *const dst_worker,
+ AVxWorker *const src_worker);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DTHREAD_H_
diff --git a/third_party/aom/av1/decoder/inspection.c b/third_party/aom/av1/decoder/inspection.c
new file mode 100644
index 000000000..e6c89298a
--- /dev/null
+++ b/third_party/aom/av1/decoder/inspection.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/inspection.h"
+#include "av1/common/enums.h"
+#include "av1/common/cdef.h"
+
+static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) {
+ fd->mi_cols = mi_cols;
+ fd->mi_rows = mi_rows;
+ fd->mi_grid = (insp_mi_data *)aom_malloc(sizeof(insp_mi_data) * fd->mi_rows *
+ fd->mi_cols);
+}
+
+void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) {
+ int mi_cols = ALIGN_POWER_OF_TWO(frame_width, 3) >> MI_SIZE_LOG2;
+ int mi_rows = ALIGN_POWER_OF_TWO(frame_height, 3) >> MI_SIZE_LOG2;
+ ifd_init_mi_rc(fd, mi_cols, mi_rows);
+}
+
+void ifd_clear(insp_frame_data *fd) {
+ aom_free(fd->mi_grid);
+ fd->mi_grid = NULL;
+}
+
+/* TODO(negge) This function may be called by more than one thread when using
+ a multi-threaded decoder and this may cause a data race. */
+int ifd_inspect(insp_frame_data *fd, void *decoder) {
+ struct AV1Decoder *pbi = (struct AV1Decoder *)decoder;
+ AV1_COMMON *const cm = &pbi->common;
+ if (fd->mi_rows != cm->mi_rows || fd->mi_cols != cm->mi_cols) {
+ ifd_clear(fd);
+ ifd_init_mi_rc(fd, cm->mi_rows, cm->mi_cols);
+ }
+ fd->show_frame = cm->show_frame;
+ fd->frame_type = cm->frame_type;
+ fd->base_qindex = cm->base_qindex;
+ // Set width and height of the first tile until generic support can be added
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, 0);
+ av1_tile_set_col(&tile_info, cm, 0);
+ fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start;
+ fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start;
+ fd->delta_q_present_flag = cm->delta_q_present_flag;
+ fd->delta_q_res = cm->delta_q_res;
+#if CONFIG_ACCOUNTING
+ fd->accounting = &pbi->accounting;
+#endif
+ // TODO(negge): copy per frame CDEF data
+ int i, j;
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ for (j = 0; j < 2; j++) {
+ fd->y_dequant[i][j] = cm->y_dequant_QTX[i][j];
+ fd->u_dequant[i][j] = cm->u_dequant_QTX[i][j];
+ fd->v_dequant[i][j] = cm->v_dequant_QTX[i][j];
+ }
+ }
+ for (j = 0; j < cm->mi_rows; j++) {
+ for (i = 0; i < cm->mi_cols; i++) {
+ const MB_MODE_INFO *mbmi = cm->mi_grid_visible[j * cm->mi_stride + i];
+ insp_mi_data *mi = &fd->mi_grid[j * cm->mi_cols + i];
+ // Segment
+ mi->segment_id = mbmi->segment_id;
+ // Motion Vectors
+ mi->mv[0].row = mbmi->mv[0].as_mv.row;
+ mi->mv[0].col = mbmi->mv[0].as_mv.col;
+ mi->mv[1].row = mbmi->mv[1].as_mv.row;
+ mi->mv[1].col = mbmi->mv[1].as_mv.col;
+ // Reference Frames
+ mi->ref_frame[0] = mbmi->ref_frame[0];
+ mi->ref_frame[1] = mbmi->ref_frame[1];
+ // Prediction Mode
+ mi->mode = mbmi->mode;
+ // Prediction Mode for Chromatic planes
+ if (mi->mode < INTRA_MODES) {
+ mi->uv_mode = mbmi->uv_mode;
+ } else {
+ mi->uv_mode = UV_MODE_INVALID;
+ }
+ // Block Size
+ mi->sb_type = mbmi->sb_type;
+ // Skip Flag
+ mi->skip = mbmi->skip;
+ mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0);
+ mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1);
+ mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1];
+ // Transform
+ // TODO(anyone): extract tx type info from mbmi->txk_type[].
+ mi->tx_type = DCT_DCT;
+ mi->tx_size = mbmi->tx_size;
+
+ mi->cdef_level =
+ cm->cdef_strengths[mbmi->cdef_strength] / CDEF_SEC_STRENGTHS;
+ mi->cdef_strength =
+ cm->cdef_strengths[mbmi->cdef_strength] % CDEF_SEC_STRENGTHS;
+ mi->cdef_strength += mi->cdef_strength == 3;
+ if (mbmi->uv_mode == UV_CFL_PRED) {
+ mi->cfl_alpha_idx = mbmi->cfl_alpha_idx;
+ mi->cfl_alpha_sign = mbmi->cfl_alpha_signs;
+ } else {
+ mi->cfl_alpha_idx = 0;
+ mi->cfl_alpha_sign = 0;
+ }
+ // delta_q
+ mi->current_qindex = mbmi->current_qindex;
+ }
+ }
+ return 1;
+}
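ifd_inspect() above unpacks cm->cdef_strengths[] into a primary level and a secondary strength. A small sketch of that decomposition, assuming CDEF_SEC_STRENGTHS == 4 as in libaom (the reserved secondary value 3 is reported as 4):

  static void split_cdef_strength(int packed, int *level, int *sec_strength) {
    const int sec_strengths = 4;  /* CDEF_SEC_STRENGTHS */
    *level = packed / sec_strengths;
    *sec_strength = packed % sec_strengths;
    *sec_strength += (*sec_strength == 3);  /* 3 is signalled as 4 */
  }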
diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h
new file mode 100644
index 000000000..7214a9bed
--- /dev/null
+++ b/third_party/aom/av1/decoder/inspection.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_DECODER_INSPECTION_H_
+#define AOM_AV1_DECODER_INSPECTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#include "av1/common/seg_common.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+
+#ifndef AOM_AOMDX_H_
+typedef void (*aom_inspect_cb)(void *decoder, void *data);
+#endif
+
+typedef struct insp_mv insp_mv;
+
+struct insp_mv {
+ int16_t row;
+ int16_t col;
+};
+
+typedef struct insp_mi_data insp_mi_data;
+
+struct insp_mi_data {
+ insp_mv mv[2];
+ int16_t ref_frame[2];
+ int16_t mode;
+ int16_t uv_mode;
+ int16_t sb_type;
+ int16_t skip;
+ int16_t segment_id;
+ int16_t dual_filter_type;
+ int16_t filter[2];
+ int16_t tx_type;
+ int16_t tx_size;
+ int16_t cdef_level;
+ int16_t cdef_strength;
+ int16_t cfl_alpha_idx;
+ int16_t cfl_alpha_sign;
+ int16_t current_qindex;
+};
+
+typedef struct insp_frame_data insp_frame_data;
+
+struct insp_frame_data {
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
+ insp_mi_data *mi_grid;
+ int show_frame;
+ int frame_type;
+ int base_qindex;
+ int mi_rows;
+ int mi_cols;
+ int tile_mi_rows;
+ int tile_mi_cols;
+ int16_t y_dequant[MAX_SEGMENTS][2];
+ int16_t u_dequant[MAX_SEGMENTS][2];
+ int16_t v_dequant[MAX_SEGMENTS][2];
+ // TODO(negge): add per frame CDEF data
+ int delta_q_present_flag;
+ int delta_q_res;
+};
+
+void ifd_init(insp_frame_data *fd, int frame_width, int frame_height);
+void ifd_clear(insp_frame_data *fd);
+int ifd_inspect(insp_frame_data *fd, void *decoder);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AV1_DECODER_INSPECTION_H_
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
new file mode 100644
index 000000000..44ecf818e
--- /dev/null
+++ b/third_party/aom/av1/decoder/obu.c
@@ -0,0 +1,839 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_ports/mem_ops.h"
+
+#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
+#include "av1/common/timing.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
+
+// Picture prediction structures (0-12 are predefined) in scalability metadata.
+typedef enum {
+ SCALABILITY_L1T2 = 0,
+ SCALABILITY_L1T3 = 1,
+ SCALABILITY_L2T1 = 2,
+ SCALABILITY_L2T2 = 3,
+ SCALABILITY_L2T3 = 4,
+ SCALABILITY_S2T1 = 5,
+ SCALABILITY_S2T2 = 6,
+ SCALABILITY_S2T3 = 7,
+ SCALABILITY_L2T1h = 8,
+ SCALABILITY_L2T2h = 9,
+ SCALABILITY_L2T3h = 10,
+ SCALABILITY_S2T1h = 11,
+ SCALABILITY_S2T2h = 12,
+ SCALABILITY_S2T3h = 13,
+ SCALABILITY_SS = 14
+} SCALABILITY_STRUCTURES;
+
+aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
+ int operating_point_idc, unsigned int *number_spatial_layers,
+ unsigned int *number_temporal_layers) {
+ // derive number of spatial/temporal layers from operating_point_idc
+
+ if (!number_spatial_layers || !number_temporal_layers)
+ return AOM_CODEC_INVALID_PARAM;
+
+ if (operating_point_idc == 0) {
+ *number_temporal_layers = 1;
+ *number_spatial_layers = 1;
+ } else {
+ *number_spatial_layers = 0;
+ *number_temporal_layers = 0;
+ for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) {
+ *number_spatial_layers +=
+ (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1;
+ }
+ for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) {
+ *number_temporal_layers += (operating_point_idc >> j) & 0x1;
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
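aom_get_num_layers_from_operating_point_idc() above treats operating_point_idc as a bit mask. A standalone sketch of the assumed layout, with the libaom limits of 8 temporal and 4 spatial layer slots (MAX_NUM_TEMPORAL_LAYERS and MAX_NUM_SPATIAL_LAYERS):

  /* Bits 0..7 flag the temporal layers, bits 8..11 flag the spatial layers;
     idc == 0 means a single layer of each kind. */
  static void count_layers_from_idc(int idc, unsigned int *spatial,
                                    unsigned int *temporal) {
    *spatial = 0;
    *temporal = 0;
    if (idc == 0) {
      *spatial = *temporal = 1;
      return;
    }
    for (int j = 0; j < 8; j++) *temporal += (idc >> j) & 1;
    for (int j = 0; j < 4; j++) *spatial += (idc >> (8 + j)) & 1;
  }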
+
+static int is_obu_in_current_operating_point(AV1Decoder *pbi,
+ ObuHeader obu_header) {
+ if (!pbi->current_operating_point) {
+ return 1;
+ }
+
+ if ((pbi->current_operating_point >> obu_header.temporal_layer_id) & 0x1 &&
+ (pbi->current_operating_point >> (obu_header.spatial_layer_id + 8)) &
+ 0x1) {
+ return 1;
+ }
+ return 0;
+}
+
+static int byte_alignment(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *const rb) {
+ while (rb->bit_offset & 7) {
+ if (aom_rb_read_bit(rb)) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static uint32_t read_temporal_delimiter_obu() { return 0; }
+
+// Returns a boolean that indicates success.
+static int read_bitstream_level(BitstreamLevel *bl,
+ struct aom_read_bit_buffer *rb) {
+ const uint8_t seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
+ if (!is_valid_seq_level_idx(seq_level_idx)) return 0;
+ bl->major = (seq_level_idx >> LEVEL_MINOR_BITS) + LEVEL_MAJOR_MIN;
+ bl->minor = seq_level_idx & ((1 << LEVEL_MINOR_BITS) - 1);
+ return 1;
+}
+
+// Returns whether two sequence headers are consistent with each other.
+// TODO(huisu,wtc@google.com): make sure the code matches the spec exactly.
+static int are_seq_headers_consistent(const SequenceHeader *seq_params_old,
+ const SequenceHeader *seq_params_new) {
+ return !memcmp(seq_params_old, seq_params_new, sizeof(SequenceHeader));
+}
+
+// On success, sets pbi->sequence_header_ready to 1 and returns the number of
+// bytes read from 'rb'.
+// On failure, sets pbi->common.error.error_code and returns 0.
+static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ const uint32_t saved_bit_offset = rb->bit_offset;
+
+ // Verify rb has been configured to report errors.
+ assert(rb->error_handler);
+
+ // Use a local variable to store the information as we decode. At the end,
+ // if no errors have occurred, cm->seq_params is updated.
+ SequenceHeader sh = cm->seq_params;
+ SequenceHeader *const seq_params = &sh;
+
+ seq_params->profile = av1_read_profile(rb);
+ if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+
+ // Still picture or not
+ seq_params->still_picture = aom_rb_read_bit(rb);
+ seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
+ // Video must have reduced_still_picture_hdr = 0
+ if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+
+ if (seq_params->reduced_still_picture_hdr) {
+ cm->timing_info_present = 0;
+ seq_params->decoder_model_info_present_flag = 0;
+ seq_params->display_model_info_present_flag = 0;
+ seq_params->operating_points_cnt_minus_1 = 0;
+ seq_params->operating_point_idc[0] = 0;
+ if (!read_bitstream_level(&seq_params->level[0], rb)) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+ seq_params->tier[0] = 0;
+ cm->op_params[0].decoder_model_param_present_flag = 0;
+ cm->op_params[0].display_model_param_present_flag = 0;
+ } else {
+ cm->timing_info_present = aom_rb_read_bit(rb); // timing_info_present_flag
+ if (cm->timing_info_present) {
+ av1_read_timing_info_header(cm, rb);
+
+ seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
+ if (seq_params->decoder_model_info_present_flag)
+ av1_read_decoder_model_info(cm, rb);
+ } else {
+ seq_params->decoder_model_info_present_flag = 0;
+ }
+ seq_params->display_model_info_present_flag = aom_rb_read_bit(rb);
+ seq_params->operating_points_cnt_minus_1 =
+ aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+ for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+ seq_params->operating_point_idc[i] =
+ aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+ if (!read_bitstream_level(&seq_params->level[i], rb)) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+ // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
+ // is equivalent to level 3.3.
+ if (seq_params->level[i].major > 3)
+ seq_params->tier[i] = aom_rb_read_bit(rb);
+ else
+ seq_params->tier[i] = 0;
+ if (seq_params->decoder_model_info_present_flag) {
+ cm->op_params[i].decoder_model_param_present_flag = aom_rb_read_bit(rb);
+ if (cm->op_params[i].decoder_model_param_present_flag)
+ av1_read_op_parameters_info(cm, rb, i);
+ } else {
+ cm->op_params[i].decoder_model_param_present_flag = 0;
+ }
+ if (cm->timing_info_present &&
+ (cm->timing_info.equal_picture_interval ||
+ cm->op_params[i].decoder_model_param_present_flag)) {
+ cm->op_params[i].bitrate = max_level_bitrate(
+ seq_params->profile,
+ major_minor_to_seq_level_idx(seq_params->level[i]),
+ seq_params->tier[i]);
+ // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
+ // the check
+ if (cm->op_params[i].bitrate == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support this combination of "
+ "profile, level, and tier.");
+ // Buffer size in bits is bitrate in bits/s * 1 s
+ cm->op_params[i].buffer_size = cm->op_params[i].bitrate;
+ }
+ if (cm->timing_info_present && cm->timing_info.equal_picture_interval &&
+ !cm->op_params[i].decoder_model_param_present_flag) {
+ // When the decoder_model_parameters are not sent for this op, set
+ // the default ones that can be used with the resource availability mode
+ cm->op_params[i].decoder_buffer_delay = 70000;
+ cm->op_params[i].encoder_buffer_delay = 20000;
+ cm->op_params[i].low_delay_mode_flag = 0;
+ }
+
+ if (seq_params->display_model_info_present_flag) {
+ cm->op_params[i].display_model_param_present_flag = aom_rb_read_bit(rb);
+ if (cm->op_params[i].display_model_param_present_flag) {
+ cm->op_params[i].initial_display_delay =
+ aom_rb_read_literal(rb, 4) + 1;
+ if (cm->op_params[i].initial_display_delay > 10)
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support more than 10 decoded frames delay");
+ } else {
+ cm->op_params[i].initial_display_delay = 10;
+ }
+ } else {
+ cm->op_params[i].display_model_param_present_flag = 0;
+ cm->op_params[i].initial_display_delay = 10;
+ }
+ }
+ }
+ // This decoder supports all levels. Choose operating point provided by
+ // external means
+ int operating_point = pbi->operating_point;
+ if (operating_point < 0 ||
+ operating_point > seq_params->operating_points_cnt_minus_1)
+ operating_point = 0;
+ pbi->current_operating_point =
+ seq_params->operating_point_idc[operating_point];
+ if (aom_get_num_layers_from_operating_point_idc(
+ pbi->current_operating_point, &cm->number_spatial_layers,
+ &cm->number_temporal_layers) != AOM_CODEC_OK) {
+ cm->error.error_code = AOM_CODEC_ERROR;
+ return 0;
+ }
+
+ av1_read_sequence_header(cm, rb, seq_params);
+
+ av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error);
+ if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
+ !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
+ !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
+ "%d %d subsampling is not supported.\n",
+ seq_params->subsampling_x, seq_params->subsampling_y);
+ }
+
+ seq_params->film_grain_params_present = aom_rb_read_bit(rb);
+
+ if (av1_check_trailing_bits(pbi, rb) != 0) {
+ // cm->error.error_code is already set.
+ return 0;
+ }
+
+ // If a sequence header has been decoded before, we check if the new
+ // one is consistent with the old one.
+ if (pbi->sequence_header_ready) {
+ if (!are_seq_headers_consistent(&cm->seq_params, seq_params))
+ pbi->sequence_header_changed = 1;
+ }
+
+ cm->seq_params = *seq_params;
+ pbi->sequence_header_ready = 1;
+
+ return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
+}
+
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
+static uint32_t read_frame_header_obu(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t **p_data_end,
+ int trailing_bits_present) {
+ return av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end,
+ trailing_bits_present);
+}
+
+static int32_t read_tile_group_header(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ int *start_tile, int *end_tile,
+ int tile_start_implicit) {
+ AV1_COMMON *const cm = &pbi->common;
+ uint32_t saved_bit_offset = rb->bit_offset;
+ int tile_start_and_end_present_flag = 0;
+ const int num_tiles = pbi->common.tile_rows * pbi->common.tile_cols;
+
+ if (!pbi->common.large_scale_tile && num_tiles > 1) {
+ tile_start_and_end_present_flag = aom_rb_read_bit(rb);
+ }
+ if (pbi->common.large_scale_tile || num_tiles == 1 ||
+ !tile_start_and_end_present_flag) {
+ *start_tile = 0;
+ *end_tile = num_tiles - 1;
+ return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
+ }
+ if (tile_start_implicit && tile_start_and_end_present_flag) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
+ return -1;
+ }
+ *start_tile =
+ aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+ *end_tile = aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+
+ return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
+}
+
+static uint32_t read_one_tile_group_obu(
+ AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg,
+ const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end,
+ int *is_last_tg, int tile_start_implicit) {
+ AV1_COMMON *const cm = &pbi->common;
+ int start_tile, end_tile;
+ int32_t header_size, tg_payload_size;
+
+ assert((rb->bit_offset & 7) == 0);
+ assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data);
+
+ header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile,
+ tile_start_implicit);
+ if (header_size == -1 || byte_alignment(cm, rb)) return 0;
+ if (start_tile > end_tile) return header_size;
+ data += header_size;
+ av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile,
+ end_tile, is_first_tg);
+
+ tg_payload_size = (uint32_t)(*p_data_end - data);
+
+ // TODO(shan): For now, assume all tile groups received in order
+ *is_last_tg = end_tile == cm->tile_rows * cm->tile_cols - 1;
+ return header_size + tg_payload_size;
+}
+
+static void alloc_tile_list_buffer(AV1Decoder *pbi) {
+ // TODO(yunqing): for now, copy each tile's decoded YUV data directly to the
+ // output buffer. This needs to be modified according to the application
+ // requirement.
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
+ const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+ const int ssy = cm->seq_params.subsampling_y;
+ const int ssx = cm->seq_params.subsampling_x;
+ const int num_planes = av1_num_planes(cm);
+ const size_t yplane_tile_size = tile_height_in_pixels * tile_width_in_pixels;
+ const size_t uvplane_tile_size =
+ (num_planes > 1)
+ ? (tile_height_in_pixels >> ssy) * (tile_width_in_pixels >> ssx)
+ : 0;
+ const size_t tile_size = (cm->seq_params.use_highbitdepth ? 2 : 1) *
+ (yplane_tile_size + 2 * uvplane_tile_size);
+ pbi->tile_list_size = tile_size * (pbi->tile_count_minus_1 + 1);
+
+ if (pbi->tile_list_size > pbi->buffer_sz) {
+ if (pbi->tile_list_output != NULL) aom_free(pbi->tile_list_output);
+ pbi->tile_list_output = NULL;
+
+ pbi->tile_list_output = (uint8_t *)aom_memalign(32, pbi->tile_list_size);
+ if (pbi->tile_list_output == NULL)
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate the tile list output buffer");
+ pbi->buffer_sz = pbi->tile_list_size;
+ }
+}
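alloc_tile_list_buffer() above sizes the output as one luma plane plus two subsampled chroma planes per tile, doubled when samples are stored as 16 bits. A sketch of that per-tile size computation (illustrative helper, not part of the patch):

  #include <stddef.h>

  static size_t tile_output_size(int width, int height, int ssx, int ssy,
                                 int num_planes, int use_highbitdepth) {
    const size_t y_size = (size_t)width * height;
    const size_t uv_size =
        (num_planes > 1) ? (size_t)(width >> ssx) * (height >> ssy) : 0;
    return (use_highbitdepth ? 2 : 1) * (y_size + 2 * uv_size);
  }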
+
+static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
+ uint8_t **output) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
+ const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+ const int ssy = cm->seq_params.subsampling_y;
+ const int ssx = cm->seq_params.subsampling_x;
+ const int num_planes = av1_num_planes(cm);
+
+ // Copy decoded tile to the tile list output buffer.
+ YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
+ const int mi_row = pbi->dec_tile_row * cm->tile_height;
+ const int mi_col = pbi->dec_tile_col * cm->tile_width;
+ const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL };
+ int strides[MAX_MB_PLANE] = { 0, 0, 0 };
+ int plane;
+
+ for (plane = 0; plane < num_planes; ++plane) {
+ int shift_x = plane > 0 ? ssx : 0;
+ int shift_y = plane > 0 ? ssy : 0;
+
+ bufs[plane] = cur_frame->buffers[plane];
+ strides[plane] =
+ (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0];
+
+ bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] +
+ mi_col * (MI_SIZE >> shift_x);
+
+ if (is_hbd) {
+ bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(bufs[plane]);
+ strides[plane] *= 2;
+ }
+
+ int w, h;
+ w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x)
+ : tile_width_in_pixels;
+ w *= (1 + is_hbd);
+ h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y)
+ : tile_height_in_pixels;
+ int j;
+
+ for (j = 0; j < h; ++j) {
+ memcpy(*output, bufs[plane], w);
+ bufs[plane] += strides[plane];
+ *output += w;
+ }
+ }
+}
+
+// Only called while large_scale_tile = 1.
+static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end,
+ int *frame_decoding_finished) {
+ AV1_COMMON *const cm = &pbi->common;
+ uint32_t tile_list_payload_size = 0;
+ const int num_tiles = cm->tile_cols * cm->tile_rows;
+ const int start_tile = 0;
+ const int end_tile = num_tiles - 1;
+ int i = 0;
+
+ // Process the tile list info.
+ pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+ pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+ pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
+ if (pbi->tile_count_minus_1 > MAX_TILES - 1) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ // Allocate output frame buffer for the tile list.
+ alloc_tile_list_buffer(pbi);
+
+ uint32_t tile_list_info_bytes = 4;
+ tile_list_payload_size += tile_list_info_bytes;
+ data += tile_list_info_bytes;
+ uint8_t *output = pbi->tile_list_output;
+
+ for (i = 0; i <= pbi->tile_count_minus_1; i++) {
+ // Process 1 tile.
+ // Reset the bit reader.
+ rb->bit_offset = 0;
+ rb->bit_buffer = data;
+
+ // Read out the tile info.
+ uint32_t tile_info_bytes = 5;
+ // Set reference for each tile.
+ int ref_idx = aom_rb_read_literal(rb, 8);
+ if (ref_idx >= MAX_EXTERNAL_REFERENCES) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+ av1_set_reference_dec(cm, 0, 1, &pbi->ext_refs.refs[ref_idx]);
+
+ pbi->dec_tile_row = aom_rb_read_literal(rb, 8);
+ pbi->dec_tile_col = aom_rb_read_literal(rb, 8);
+ if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 ||
+ pbi->dec_tile_row >= cm->tile_rows ||
+ pbi->dec_tile_col >= cm->tile_cols) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1;
+ data += tile_info_bytes;
+ if ((size_t)(data_end - data) < pbi->coded_tile_data_size) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size,
+ p_data_end, start_tile, end_tile, 0);
+ uint32_t tile_payload_size = (uint32_t)(*p_data_end - data);
+
+ tile_list_payload_size += tile_info_bytes + tile_payload_size;
+
+ // Update data ptr for next tile decoding.
+ data = *p_data_end;
+ assert(data <= data_end);
+
+ // Copy the decoded tile to the tile list output buffer.
+ copy_decoded_tile_to_tile_list_buffer(pbi, &output);
+ }
+
+ *frame_decoding_finished = 1;
+ return tile_list_payload_size;
+}
+
+static void read_metadata_itut_t35(const uint8_t *data, size_t sz) {
+ struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+ for (size_t i = 0; i < sz; i++) {
+ aom_rb_read_literal(&rb, 8);
+ }
+}
+
+static void read_metadata_hdr_cll(const uint8_t *data, size_t sz) {
+ struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+ aom_rb_read_literal(&rb, 16); // max_cll
+ aom_rb_read_literal(&rb, 16); // max_fall
+}
+
+static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) {
+ struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+ for (int i = 0; i < 3; i++) {
+ aom_rb_read_literal(&rb, 16); // primary_i_chromaticity_x
+ aom_rb_read_literal(&rb, 16); // primary_i_chromaticity_y
+ }
+
+ aom_rb_read_literal(&rb, 16); // white_point_chromaticity_x
+ aom_rb_read_literal(&rb, 16); // white_point_chromaticity_y
+
+ aom_rb_read_unsigned_literal(&rb, 32); // luminance_max
+ aom_rb_read_unsigned_literal(&rb, 32); // luminance_min
+}
+
+static void scalability_structure(struct aom_read_bit_buffer *rb) {
+ int spatial_layers_cnt = aom_rb_read_literal(rb, 2);
+ int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb);
+ int spatial_layer_description_present_flag = aom_rb_read_bit(rb);
+ int temporal_group_description_present_flag = aom_rb_read_bit(rb);
+ aom_rb_read_literal(rb, 3); // reserved
+
+ if (spatial_layer_dimensions_present_flag) {
+ int i;
+ for (i = 0; i < spatial_layers_cnt + 1; i++) {
+ aom_rb_read_literal(rb, 16);
+ aom_rb_read_literal(rb, 16);
+ }
+ }
+ if (spatial_layer_description_present_flag) {
+ int i;
+ for (i = 0; i < spatial_layers_cnt + 1; i++) {
+ aom_rb_read_literal(rb, 8);
+ }
+ }
+ if (temporal_group_description_present_flag) {
+ int i, j, temporal_group_size;
+ temporal_group_size = aom_rb_read_literal(rb, 8);
+ for (i = 0; i < temporal_group_size; i++) {
+ aom_rb_read_literal(rb, 3);
+ aom_rb_read_bit(rb);
+ aom_rb_read_bit(rb);
+ int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
+ for (j = 0; j < temporal_group_ref_cnt; j++) {
+ aom_rb_read_literal(rb, 8);
+ }
+ }
+ }
+}
+
+static void read_metadata_scalability(const uint8_t *data, size_t sz) {
+ struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+ int scalability_mode_idc = aom_rb_read_literal(&rb, 8);
+ if (scalability_mode_idc == SCALABILITY_SS) {
+ scalability_structure(&rb);
+ }
+}
+
+static void read_metadata_timecode(const uint8_t *data, size_t sz) {
+ struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+ aom_rb_read_literal(&rb, 5); // counting_type f(5)
+ int full_timestamp_flag = aom_rb_read_bit(&rb); // full_timestamp_flag f(1)
+ aom_rb_read_bit(&rb); // discontinuity_flag (f1)
+ aom_rb_read_bit(&rb); // cnt_dropped_flag f(1)
+ aom_rb_read_literal(&rb, 9); // n_frames f(9)
+ if (full_timestamp_flag) {
+ aom_rb_read_literal(&rb, 6); // seconds_value f(6)
+ aom_rb_read_literal(&rb, 6); // minutes_value f(6)
+ aom_rb_read_literal(&rb, 5); // hours_value f(5)
+ } else {
+ int seconds_flag = aom_rb_read_bit(&rb); // seconds_flag f(1)
+ if (seconds_flag) {
+ aom_rb_read_literal(&rb, 6); // seconds_value f(6)
+ int minutes_flag = aom_rb_read_bit(&rb); // minutes_flag f(1)
+ if (minutes_flag) {
+ aom_rb_read_literal(&rb, 6); // minutes_value f(6)
+ int hours_flag = aom_rb_read_bit(&rb); // hours_flag f(1)
+ if (hours_flag) {
+ aom_rb_read_literal(&rb, 5); // hours_value f(5)
+ }
+ }
+ }
+ }
+ // time_offset_length f(5)
+ int time_offset_length = aom_rb_read_literal(&rb, 5);
+ if (time_offset_length) {
+ aom_rb_read_literal(&rb, time_offset_length); // f(time_offset_length)
+ }
+}
+
+static size_t read_metadata(const uint8_t *data, size_t sz) {
+ size_t type_length;
+ uint64_t type_value;
+ OBU_METADATA_TYPE metadata_type;
+ if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {
+ return sz;
+ }
+ metadata_type = (OBU_METADATA_TYPE)type_value;
+ if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) {
+ read_metadata_itut_t35(data + type_length, sz - type_length);
+ } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) {
+ read_metadata_hdr_cll(data + type_length, sz - type_length);
+ } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) {
+ read_metadata_hdr_mdcv(data + type_length, sz - type_length);
+ } else if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) {
+ read_metadata_scalability(data + type_length, sz - type_length);
+ } else if (metadata_type == OBU_METADATA_TYPE_TIMECODE) {
+ read_metadata_timecode(data + type_length, sz - type_length);
+ }
+
+ return sz;
+}
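read_metadata() above relies on aom_uleb_decode() to pull the metadata type out of the payload. For reference, a generic unsigned LEB128 reader (a sketch with its own signature, not the aom_uleb_decode() API):

  #include <stddef.h>
  #include <stdint.h>

  /* 7 value bits per byte, least significant group first; the top bit of
     each byte flags continuation. Returns 0 on success, -1 on failure. */
  static int uleb128_decode(const uint8_t *buf, size_t available,
                            uint64_t *value, size_t *length) {
    *value = 0;
    for (size_t i = 0; i < available && i < 8; ++i) {
      *value |= (uint64_t)(buf[i] & 0x7f) << (7 * i);
      if (!(buf[i] & 0x80)) {
        *length = i + 1;
        return 0;
      }
    }
    return -1;  /* truncated, or encoded in more than 8 bytes */
  }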
+
+// On success, returns a boolean that indicates whether the decoding of the
+// current frame is finished. On failure, sets cm->error.error_code and
+// returns -1.
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end) {
+ AV1_COMMON *const cm = &pbi->common;
+ int frame_decoding_finished = 0;
+ int is_first_tg_obu_received = 1;
+ uint32_t frame_header_size = 0;
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+ pbi->seen_frame_header = 0;
+
+ if (data_end < data) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+
+ // Reset pbi->camera_frame_header_ready to 0 if cm->large_scale_tile = 0.
+ if (!cm->large_scale_tile) pbi->camera_frame_header_ready = 0;
+
+ // decode frame as a series of OBUs
+ while (!frame_decoding_finished && !cm->error.error_code) {
+ struct aom_read_bit_buffer rb;
+ size_t payload_size = 0;
+ size_t decoded_payload_size = 0;
+ size_t obu_payload_offset = 0;
+ size_t bytes_read = 0;
+ const size_t bytes_available = data_end - data;
+
+ if (bytes_available == 0 && !pbi->seen_frame_header) {
+ *p_data_end = data;
+ cm->error.error_code = AOM_CODEC_OK;
+ break;
+ }
+
+ aom_codec_err_t status =
+ aom_read_obu_header_and_size(data, bytes_available, cm->is_annexb,
+ &obu_header, &payload_size, &bytes_read);
+
+ if (status != AOM_CODEC_OK) {
+ cm->error.error_code = status;
+ return -1;
+ }
+
+ // Record obu size header information.
+ pbi->obu_size_hdr.data = data + obu_header.size;
+ pbi->obu_size_hdr.size = bytes_read - obu_header.size;
+
+ // Note: aom_read_obu_header_and_size() takes care of checking that this
+ // doesn't cause 'data' to advance past 'data_end'.
+ data += bytes_read;
+
+ if ((size_t)(data_end - data) < payload_size) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+
+ cm->temporal_layer_id = obu_header.temporal_layer_id;
+ cm->spatial_layer_id = obu_header.spatial_layer_id;
+
+ if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
+ obu_header.type != OBU_SEQUENCE_HEADER &&
+ obu_header.type != OBU_PADDING) {
+ // don't decode obu if it's not in current operating mode
+ if (!is_obu_in_current_operating_point(pbi, obu_header)) {
+ data += payload_size;
+ continue;
+ }
+ }
+
+ av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size);
+
+ switch (obu_header.type) {
+ case OBU_TEMPORAL_DELIMITER:
+ decoded_payload_size = read_temporal_delimiter_obu();
+ pbi->seen_frame_header = 0;
+ break;
+ case OBU_SEQUENCE_HEADER:
+ decoded_payload_size = read_sequence_header_obu(pbi, &rb);
+ if (cm->error.error_code != AOM_CODEC_OK) return -1;
+ break;
+ case OBU_FRAME_HEADER:
+ case OBU_REDUNDANT_FRAME_HEADER:
+ case OBU_FRAME:
+ // Only decode first frame header received
+ if (!pbi->seen_frame_header ||
+ (cm->large_scale_tile && !pbi->camera_frame_header_ready)) {
+ frame_header_size = read_frame_header_obu(
+ pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
+ pbi->seen_frame_header = 1;
+ if (!pbi->ext_tile_debug && cm->large_scale_tile)
+ pbi->camera_frame_header_ready = 1;
+ } else {
+ // TODO(wtc): Verify that the frame_header_obu is identical to the
+ // original frame_header_obu. For now just skip frame_header_size
+ // bytes in the bit buffer.
+ if (frame_header_size > payload_size) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ assert(rb.bit_offset == 0);
+ rb.bit_offset = 8 * frame_header_size;
+ }
+
+ decoded_payload_size = frame_header_size;
+ pbi->frame_header_size = frame_header_size;
+
+ if (cm->show_existing_frame) {
+ if (obu_header.type == OBU_FRAME) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return -1;
+ }
+ frame_decoding_finished = 1;
+ pbi->seen_frame_header = 0;
+ break;
+ }
+
+ // In large scale tile coding, decode the common camera frame header
+ // before any tile list OBU.
+ if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) {
+ frame_decoding_finished = 1;
+ // Skip the rest of the frame data.
+ decoded_payload_size = payload_size;
+ // Update data_end.
+ *p_data_end = data_end;
+ break;
+ }
+
+ if (obu_header.type != OBU_FRAME) break;
+ obu_payload_offset = frame_header_size;
+ // Byte align the reader before reading the tile group.
+ if (byte_alignment(cm, &rb)) return -1;
+ AOM_FALLTHROUGH_INTENDED; // fall through to read tile group.
+ case OBU_TILE_GROUP:
+ if (!pbi->seen_frame_header) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ if (obu_payload_offset > payload_size) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ decoded_payload_size += read_one_tile_group_obu(
+ pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset,
+ data + payload_size, p_data_end, &frame_decoding_finished,
+ obu_header.type == OBU_FRAME);
+ is_first_tg_obu_received = 0;
+ if (frame_decoding_finished) pbi->seen_frame_header = 0;
+ break;
+ case OBU_METADATA:
+ decoded_payload_size = read_metadata(data, payload_size);
+ break;
+ case OBU_TILE_LIST:
+ if (CONFIG_NORMAL_TILE_MODE) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return -1;
+ }
+
+ // This OBU type is purely for the large scale tile coding mode.
+ // The common camera frame header has to be already decoded.
+ if (!pbi->camera_frame_header_ready) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+
+ cm->large_scale_tile = 1;
+ av1_set_single_tile_decoding_mode(cm);
+ decoded_payload_size =
+ read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
+ p_data_end, &frame_decoding_finished);
+ if (cm->error.error_code != AOM_CODEC_OK) return -1;
+ break;
+ case OBU_PADDING:
+ default:
+ // Skip unrecognized OBUs
+ decoded_payload_size = payload_size;
+ break;
+ }
+
+ // Check that the signalled OBU size matches the actual amount of data read
+ if (decoded_payload_size > payload_size) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+
+ // If there are extra padding bytes, they should all be zero
+ while (decoded_payload_size < payload_size) {
+ uint8_t padding_byte = data[decoded_payload_size++];
+ if (padding_byte != 0) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return -1;
+ }
+ }
+
+ data += payload_size;
+ }
+
+ return frame_decoding_finished;
+}
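At the bottom of the OBU loop above, every OBU is checked against its signalled size: decoding must not run past the payload, and any trailing padding bytes must be zero. The same bookkeeping as a small standalone helper (illustrative only):

  #include <stddef.h>
  #include <stdint.h>

  /* Returns 0 if the OBU stayed within its payload and any padding is zero,
     -1 otherwise (the loop above maps this to AOM_CODEC_CORRUPT_FRAME). */
  static int check_obu_payload(const uint8_t *payload, size_t payload_size,
                               size_t decoded_payload_size) {
    if (decoded_payload_size > payload_size) return -1;
    while (decoded_payload_size < payload_size) {
      if (payload[decoded_payload_size++] != 0) return -1;
    }
    return 0;
  }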
diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h
new file mode 100644
index 000000000..5ab243fc9
--- /dev/null
+++ b/third_party/aom/av1/decoder/obu.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_OBU_H_
+#define AOM_AV1_DECODER_OBU_H_
+
+#include "aom/aom_codec.h"
+#include "av1/decoder/decoder.h"
+
+// Try to decode one frame from a buffer.
+// Returns 1 if we decoded a frame,
+// 0 if we didn't decode a frame but that's okay
+// (eg, if there was a frame but we skipped it),
+// or -1 on error
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end);
+
+aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
+ int operating_point_idc, unsigned int *num_spatial_layers,
+ unsigned int *num_temporal_layers);
+
+#endif // AOM_AV1_DECODER_OBU_H_
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
new file mode 100644
index 000000000..80f8e2e66
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/system_state.h"
+
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 1.75, 1.25, 1.05, 1.00, 0.90 },
+ { 2.00, 1.50, 1.15, 1.00, 0.85 },
+ { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 0.15, 0.30, 0.55, 2.00, 100.0 },
+ { 0.20, 0.40, 0.65, 2.00, 100.0 },
+ { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { -4.0, -3.0, -2.0, 100.00, 100.0 },
+ { -3.5, -2.5, -1.5, 100.00, 100.0 },
+ { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
+ // Approximate base quantizer (truncated to int)
+ const int base_quant = av1_ac_quant_Q3(q_index, 0, bit_depth) / 4;
+ return (base_quant > 10) + (base_quant > 25);
+}
+
+void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct segmentation *const seg = &cm->seg;
+ int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+
+ // Make SURE use of floating point in this function is safe.
+ aom_clear_system_state();
+
+ if (resolution_change) {
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_clearall_segfeatures(seg);
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ int segment;
+ const int aq_strength =
+ get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
+
+ // Clear down the segment map.
+ memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
+
+ av1_clearall_segfeatures(seg);
+
+ // Segmentation only makes sense if the target bits per SB is above a
+ // threshold. Below this the overheads will usually outweigh any benefit.
+ if (cpi->rc.sb64_target_rate < 256) {
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ av1_enable_segmentation(seg);
+
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment.
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG) continue;
+
+ qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth);
+
+ // For AQ complexity mode, we don't allow Q0 in a segment if the base
+ // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+ if ((cm->base_qindex + qindex_delta) > 0) {
+ av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+ }
+}
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, mi_size_high[bs]);
+ int x, y;
+ int i;
+ unsigned char segment;
+
+ if (0) {
+ segment = DEFAULT_AQ2_SEG;
+ } else {
+ // Rate depends on the fraction of the SB64 in frame: (xmis * ymis) / (bw * bh).
+ // It is converted to bits << AV1_PROB_COST_SHIFT units.
+ const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
+ << AV1_PROB_COST_SHIFT;
+ const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size;
+ const int target_rate = (int)(num / denom);
+ double logvar;
+ double low_var_thresh;
+ const int aq_strength =
+ get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
+
+ aom_clear_system_state();
+ low_var_thresh =
+ (cpi->oxcf.pass == 2)
+ ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes);
+ logvar = av1_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // Just in case no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
+ break;
+ }
+ }
+ }
+
+ // Fill in the entries in the segment map corresponding to this SB64.
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+ }
+ }
+}
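av1_caq_select_segment() above walks the transition tables and picks the first segment whose rate and variance thresholds the block meets. A condensed sketch of that search (illustrative helper; table shapes follow the AQ_C_STRENGTHS x AQ_C_SEGMENTS definitions earlier in this file):

  /* rate_ratio is projected_rate / target_rate; falls back to the
     highest-Q segment when no threshold pair is satisfied. */
  static int pick_aq_segment(double rate_ratio, double logvar,
                             double low_var_thresh, int strength,
                             const double transitions[3][5],
                             const double var_thresholds[3][5]) {
    for (int i = 0; i < 5; ++i) {  /* 5 == AQ_C_SEGMENTS */
      if (rate_ratio < transitions[strength][i] &&
          logvar < low_var_thresh + var_thresholds[strength][i])
        return i;
    }
    return 4;  /* AQ_C_SEGMENTS - 1 */
  }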
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
new file mode 100644
index 000000000..3421d74c9
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs, int mi_row, int mi_col,
+ int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
new file mode 100644
index 000000000..f532d48da
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/common/seg_common.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/system_state.h"
+
+struct CYCLIC_REFRESH {
+ // Percentage of blocks per frame that are targeted as candidates
+ // for cyclic refresh.
+ int percent_refresh;
+ // Maximum q-delta as percentage of base q.
+ int max_qdelta_perc;
+ // Superblock starting index for cycling through the frame.
+ int sb_index;
+ // Controls how long block will need to wait to be refreshed again, in
+ // excess of the cycle time, i.e., in the case of all zero motion, block
+ // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+ int time_for_refresh;
+ // Target number of (8x8) blocks that are set for delta-q.
+ int target_num_seg_blocks;
+ // Actual number of (8x8) blocks that were applied delta-q.
+ int actual_num_seg1_blocks;
+ int actual_num_seg2_blocks;
+ // RD mult. parameters for segment 1.
+ int rdmult;
+ // Cyclic refresh map.
+ int8_t *map;
+ // Map of the last q a block was coded at.
+ uint8_t *last_coded_q_map;
+ // Thresholds applied to the projected rate/distortion of the coding block,
+ // when deciding whether block should be refreshed.
+ int64_t thresh_rate_sb;
+ int64_t thresh_dist_sb;
+ // Threshold applied to the motion vector (in units of 1/8 pel) of the
+ // coding block, when deciding whether block should be refreshed.
+ int16_t motion_thresh;
+ // Rate target ratio to set q delta.
+ double rate_ratio_qdelta;
+ // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+ int rate_boost_fac;
+ double low_content_avg;
+ int qindex_delta[3];
+};
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+ size_t last_coded_q_map_size;
+ CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
+ if (cr == NULL) return NULL;
+
+ cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+ if (cr->map == NULL) {
+ av1_cyclic_refresh_free(cr);
+ return NULL;
+ }
+ last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
+ cr->last_coded_q_map = aom_malloc(last_coded_q_map_size);
+ if (cr->last_coded_q_map == NULL) {
+ av1_cyclic_refresh_free(cr);
+ return NULL;
+ }
+ assert(MAXQ <= 255);
+ memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
+
+ return cr;
+}
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+ if (cr != NULL) {
+ aom_free(cr->map);
+ aom_free(cr->last_coded_q_map);
+ aom_free(cr);
+ }
+}
+
+// Check if we should turn off cyclic refresh based on bitrate condition.
+static int apply_cyclic_refresh_bitrate(const AV1_COMMON *cm,
+ const RATE_CONTROL *rc) {
+ // Turn off cyclic refresh if the bits available per frame are not
+ // sufficiently larger than the bit cost of segmentation. The segment map bit
+ // cost should scale with the number of seg blocks, so compare the available
+ // bits to the number of blocks.
+ // Average bits available per frame = avg_frame_bandwidth
+ // Number of (8x8) blocks in frame = mi_rows * mi_cols;
+ const float factor = 0.25;
+ const int number_blocks = cm->mi_rows * cm->mi_cols;
+ // The condition below corresponds to turning off at target bitrates:
+ // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kbps for HD/720p.
+ // Also turn off at very small frame sizes, to avoid too large fraction of
+ // superblocks to be refreshed per frame. Threshold below is less than QCIF.
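+ // For example, per this 8x8-block accounting, VGA has 80 * 60 = 4800 blocks,
+ // so refresh is turned off once avg_frame_bandwidth drops below
+ // 0.25 * 4800 = 1200 bits per frame (~36kbps at 30fps).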
+ if (rc->avg_frame_bandwidth < factor * number_blocks ||
+ number_blocks / 64 < 5)
+ return 0;
+ else
+ return 1;
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). The decision can be based on various factors, such as
+// the size of the coding block (blocks below the minimum size are rejected),
+// the coding mode, and rate/distortion.
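+// For example, with motion_thresh = 32 (1/8-pel units, the value chosen in
+// av1_cyclic_refresh_update_parameters() for larger resolutions), any motion
+// vector component above 4 full pixels keeps a high-distortion block at
+// CR_SEGMENT_ID_BASE (no boost).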
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+ const MB_MODE_INFO *mbmi, int64_t rate,
+ int64_t dist, int bsize) {
+ MV mv = mbmi->mv[0].as_mv;
+ // Reject the block for lower-qp coding if projected distortion
+ // is above the threshold, and any of the following is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ // Otherwise accept for refresh.
+ if (dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mbmi)))
+ return CR_SEGMENT_ID_BASE;
+ else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+ is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+ cr->rate_boost_fac > 10)
+ // More aggressive delta-q for bigger blocks with zero motion.
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
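+// For example, with q = 60 and max_qdelta_perc = 50, the (negative) delta
+// returned below is clamped so its magnitude never exceeds 30.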
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int deltaq =
+ av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q, rate_factor,
+ cpi->common.seq_params.bit_depth);
+ if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+ deltaq = -cr->max_qdelta_perc * q / 100;
+ }
+ return deltaq;
+}
+
+// For the just-encoded frame, estimate the bits, incorporating the delta-q
+// from the non-base segments. For now, ignore the effect of multiple segments
+// (with different delta-q). Note this function is called in the postencode
+// stage (from rc_update_rate_correction_factors()).
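+// For example, with 8% of the 8x8 blocks in segment 1 and 2% in segment 2,
+// the estimate is 0.90 * bits(base_q) + 0.08 * bits(base_q + qindex_delta[1])
+// + 0.02 * bits(base_q + qindex_delta[2]), where bits() stands for
+// av1_estimate_bits_at_q().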
+int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int estimated_bits;
+ int mbs = cm->MBs;
+ int num8x8bl = mbs << 2;
+ // Weight for non-base segments: use actual number of blocks refreshed in
+ // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+ // Take segment weighted average for estimated bits.
+ estimated_bits =
+ (int)((1.0 - weight_segment1 - weight_segment2) *
+ av1_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+ correction_factor,
+ cm->seq_params.bit_depth) +
+ weight_segment1 * av1_estimate_bits_at_q(
+ cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[1], mbs,
+ correction_factor, cm->seq_params.bit_depth) +
+ weight_segment2 * av1_estimate_bits_at_q(
+ cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[2], mbs,
+ correction_factor, cm->seq_params.bit_depth));
+ return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (for segment 1). This function is called from
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int num8x8bl = cm->MBs << 2;
+ // Weight for segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual number from the previous
+ // frame.
+ double weight_segment =
+ (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
+ cr->actual_num_seg2_blocks) >>
+ 1) /
+ num8x8bl;
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ // Take segment weighted average for bits per mb.
+ bits_per_mb =
+ (int)((1.0 - weight_segment) *
+ av1_rc_bits_per_mb(cm->frame_type, i, correction_factor,
+ cm->seq_params.bit_depth) +
+ weight_segment * av1_rc_bits_per_mb(cm->frame_type, i + deltaq,
+ correction_factor,
+ cm->seq_params.bit_depth));
+ return bits_per_mb;
+}
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ const int refresh_this_block =
+ candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
+ // Default is to not update the refresh map.
+ int new_map_value = cr->map[block_index];
+ int x = 0;
+ int y = 0;
+
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id if will be skipped.
+ if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
+
+ // Update the cyclic refresh map, to be used for setting the segmentation map
+ // for the next frame. If the block will be refreshed this frame, mark it
+ // as clean. The magnitude of the negative value controls how long it will be
+ // before the block is considered for refresh again.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ new_map_value = -cr->time_for_refresh;
+ } else if (refresh_this_block) {
+ // Else if it is accepted as candidate for refresh, and has not already
+ // been refreshed (marked as 1) then mark it as a candidate for cleanup
+ // for future time (marked as 0), otherwise don't update it.
+ if (cr->map[block_index] == 1) new_map_value = 0;
+ } else {
+ // Leave it marked as block that is not candidate for refresh.
+ new_map_value = 1;
+ }
+
+ // Update entries in the cyclic refresh map with new_map_value, and
+ // copy mbmi->segment_id into global segmentation map.
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ int map_offset = block_index + y * cm->mi_cols + x;
+ cr->map[map_offset] = new_map_value;
+ cpi->segmentation_map[map_offset] = mbmi->segment_id;
+ // Inter skip blocks were clearly not coded at the current qindex, so
+ // don't update the map for them. For cases where motion is non-zero or
+ // the reference frame isn't the previous frame, the previous value in
+ // the map for this spatial location is not entirely correct.
+ if ((!is_inter_block(mbmi) || !skip) &&
+ mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] = clamp(
+ cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
+ } else if (is_inter_block(mbmi) && skip &&
+ mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] =
+ AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
+ 0, MAXQ),
+ cr->last_coded_q_map[map_offset]);
+ }
+ }
+}
+
+// Update the actual number of blocks to which the segment delta-q was applied.
+void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int mi_row, mi_col;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ if (cyclic_refresh_segment_id(seg_map[mi_row * cm->mi_cols + mi_col]) ==
+ CR_SEGMENT_ID_BOOST1)
+ cr->actual_num_seg1_blocks++;
+ else if (cyclic_refresh_segment_id(
+ seg_map[mi_row * cm->mi_cols + mi_col]) ==
+ CR_SEGMENT_ID_BOOST2)
+ cr->actual_num_seg2_blocks++;
+ }
+}
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Set minimum gf_interval for GF update to a multiple (== 4) of the refresh
+ // period. Depending on past encoding stats, GF flag may be reset and update
+ // may not occur until next baseline_gf_interval.
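+ // For example, with percent_refresh = 10 this gives a baseline_gf_interval
+ // of 4 * (100 / 10) = 40 frames.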
+ if (cr->percent_refresh > 0)
+ rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh);
+ else
+ rc->baseline_gf_interval = 40;
+}
+
+// Update some encoding stats (from the just-encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is scheduled to be updated, check whether that update
+// should be skipped.
+void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int mi_row, mi_col;
+ double fraction_low = 0.0;
+ int low_content_frame = 0;
+
+ MB_MODE_INFO **mi;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int rows = cm->mi_rows, cols = cm->mi_cols;
+ int cnt1 = 0, cnt2 = 0;
+ int force_gf_refresh = 0;
+
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0
+ ? mi[0]->mv[0].as_mv.row
+ : -1 * mi[0]->mv[0].as_mv.row;
+ int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0
+ ? mi[0]->mv[0].as_mv.col
+ : -1 * mi[0]->mv[0].as_mv.col;
+
+ // Calculate the motion of the background.
+ if (abs_mvr <= 16 && abs_mvc <= 16) {
+ cnt1++;
+ if (abs_mvr == 0 && abs_mvc == 0) cnt2++;
+ }
+ mi++;
+
+ // Accumulate low_content_frame.
+ if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++;
+ }
+ }
+
+ // For video-conference clips, if the background of the current frame has
+ // high motion because of camera movement, set this frame as the golden
+ // frame. Use 70% and 5% as the thresholds for golden frame refreshing.
+ if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) {
+ av1_cyclic_refresh_set_golden_update(cpi);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ force_gf_refresh = 1;
+ }
+
+ fraction_low = (double)low_content_frame / (rows * cols);
+ // Update average.
+ cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+ if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+ // Don't update golden reference if the amount of low_content for the
+ // current encoded frame is small, or if the recursive average of the
+ // low_content over the update interval window falls below threshold.
+ if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+ cpi->refresh_golden_frame = 0;
+ // Reset for the next interval.
+ cr->low_content_avg = fraction_low;
+ }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+ sb_cols =
+ (cm->mi_cols + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size;
+ sb_rows =
+ (cm->mi_rows + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through whole frame.
+ if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0;
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * cm->seq_params.mib_size;
+ int mi_col = sb_col_index * cm->seq_params.mib_size;
+ int qindex_thresh =
+ cpi->oxcf.content == AOM_CONTENT_SCREEN
+ ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+ : 0;
+ assert(mi_row >= 0 && mi_row < cm->mi_rows);
+ assert(mi_col >= 0 && mi_col < cm->mi_cols);
+ bl_index = mi_row * cm->mi_cols + mi_col;
+ // Loop through all MI blocks in superblock and update map.
+ xmis = AOMMIN(cm->mi_cols - mi_col, cm->seq_params.mib_size);
+ ymis = AOMMIN(cm->mi_rows - mi_row, cm->seq_params.mib_size);
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ const int bl_index2 = bl_index + y * cm->mi_cols + x;
+ // If the block is a candidate for cleanup, mark it for possible
+ // boost/refresh (segment 1). The segment id may get reset to 0 later
+ // if the block gets coded with anything other than GLOBALMV.
+ if (cr->map[bl_index2] == 0) {
+ if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++;
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ if (sum_map >= xmis * ymis / 2) {
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+ }
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;
+}
+
+// Set cyclic refresh parameters.
+void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ cr->percent_refresh = 10;
+ cr->max_qdelta_perc = 50;
+ cr->time_for_refresh = 0;
+ // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
+ // periods of the refresh cycle, after a key frame.
+ if (rc->frames_since_key < 4 * cr->percent_refresh)
+ cr->rate_ratio_qdelta = 3.0;
+ else
+ cr->rate_ratio_qdelta = 2.0;
+ // Adjust some parameters for low resolutions at low bitrates.
+ if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) {
+ cr->motion_thresh = 4;
+ cr->rate_boost_fac = 10;
+ } else {
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac = 17;
+ }
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ struct segmentation *const seg = &cm->seg;
+ const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+ int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+ if (resolution_change) {
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_clearall_segfeatures(seg);
+ aom_clear_system_state();
+ av1_disable_segmentation(seg);
+ return;
+ }
+ if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
+ // Don't apply refresh on key frame or enhancement layer frames.
+ if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) {
+ // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->segmentation_map;
+ memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_disable_segmentation(&cm->seg);
+ if (cm->frame_type == KEY_FRAME) {
+ memset(cr->last_coded_q_map, MAXQ,
+ cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+ cr->sb_index = 0;
+ }
+ return;
+ } else {
+ int qindex_delta = 0;
+ int qindex2;
+ const double q =
+ av1_convert_qindex_to_q(cm->base_qindex, cm->seq_params.bit_depth);
+ aom_clear_system_state();
+ // Set rate threshold to some multiple (currently 4) of the target
+ // rate (target is given by sb64_target_rate and scaled by 256).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+ // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+
+ // Set up segmentation.
+ // Clear down the segment map.
+ av1_enable_segmentation(&cm->seg);
+ av1_clearall_segfeatures(seg);
+
+ // Note: setting temporal_update has no effect, as the seg-map coding method
+ // (temporal or spatial) is determined in av1_choose_segmap_coding_method(),
+ // based on the coding cost of each method. When error_resilient mode is on,
+ // the last_frame_seg_map is set to 0, so if temporal coding is used, it is
+ // relative to an all-zero previous map.
+ // seg->temporal_update = 0;
+
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta[1] = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
+ qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
+ cr->rdmult = av1_compute_rd_mult(cpi, qindex2);
+
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Set a more aggressive (higher) q delta for segment BOOST2.
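+ // For example, with rate_boost_fac = 17 (larger resolutions) and
+ // rate_ratio_qdelta = 2.0 (past the first few refresh periods after a key
+ // frame), the BOOST2 target ratio is AOMMIN(4.0, 0.1 * 17 * 2.0) = 3.4.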
+ qindex_delta = compute_deltaq(
+ cpi, cm->base_qindex,
+ AOMMIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ cr->qindex_delta[2] = qindex_delta;
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ cyclic_refresh_update_map(cpi);
+ }
+}
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+ return cr->rdmult;
+}
+
+void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+ cr->sb_index = 0;
+ cpi->refresh_golden_frame = 1;
+}
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
new file mode 100644
index 000000000..b45781983
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+struct AV1_COMP;
+
+struct CYCLIC_REFRESH;
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
+ double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
+ double correction_factor);
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip);
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+void av1_cyclic_refresh_update_map(struct AV1_COMP *const cpi);
+
+// Update the actual number of blocks to which the segment delta-q was applied.
+void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void av1_cyclic_refresh_check_golden_update(struct AV1_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
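+
+// A rough sketch of the per-frame call order implied by the comments above
+// (the actual call sites are in the encoder/rate-control code, not this file):
+//   av1_cyclic_refresh_update_parameters(cpi);  // refresh %, thresholds
+//   av1_cyclic_refresh_setup(cpi);              // delta-q + segmentation map
+//   ...encode, calling av1_cyclic_refresh_update_segment() per coded block...
+//   av1_cyclic_refresh_postencode(cpi);         // count refreshed blocks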
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+ return segment_id == CR_SEGMENT_ID_BOOST1 ||
+ segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+ if (segment_id == CR_SEGMENT_ID_BOOST1)
+ return CR_SEGMENT_ID_BOOST1;
+ else if (segment_id == CR_SEGMENT_ID_BOOST2)
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
new file mode 100644
index 000000000..58f906bdc
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
+#include "aom_ports/system_state.h"
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+ 0.9, .8, .7, .6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+DECLARE_ALIGNED(16, static const uint16_t,
+ av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
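+
+// With the tables above, block energy maps to a segment id as:
+//   energy:  -4 -3 -2 -1  0  1
+//   segment:  0  1  1  2  3  4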
+
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ struct segmentation *seg = &cm->seg;
+ int i;
+
+ int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+ int avg_energy = (int)(cpi->twopass.mb_av_energy - 2);
+ double avg_ratio;
+ if (avg_energy > 7) avg_energy = 7;
+ if (avg_energy < 0) avg_energy = 0;
+ avg_ratio = rate_ratio[avg_energy];
+
+ if (resolution_change) {
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_clearall_segfeatures(seg);
+ aom_clear_system_state();
+ av1_disable_segmentation(seg);
+ return;
+ }
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ cpi->vaq_refresh = 1;
+
+ av1_enable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ aom_clear_system_state();
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ // Set up avg segment id to be 1.0 and adjust the other segments around
+ // it.
+ int qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex, rate_ratio[i] / avg_ratio,
+ cm->seq_params.bit_depth);
+
+ // We don't allow qindex 0 in a segment if the base value is not 0.
+ // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+
+ av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+ av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+ }
+}
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ // This function returns a score for the block's local variance, calculated
+ // as the average of the log of the 4x4 variances of each subblock of the
+ // current block (x, bs).
+ // This is used for segmentation to avoid situations in which a large block
+ // with a gentle gradient gets marked as high variance even though each
+ // subblock has a low variance. It lets us assign the same segment number
+ // to the same sort of area regardless of how the partitioning goes.
+
+ MACROBLOCKD *xd = &x->e_mbd;
+ double var = 0;
+ unsigned int sse;
+ int i, j;
+
+ int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+ const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+ const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+ aom_clear_system_state();
+
+ for (i = 0; i < bh; i += 4) {
+ for (j = 0; j < bw; j += 4) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ var +=
+ log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+ 16);
+ } else {
+ var +=
+ log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+ x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+ x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+ 16);
+ }
+ }
+ }
+ // Use the average of the 4x4 log variances. For 8-bit input the range is 0 - 9.704121561.
+ var /= (bw / 4 * bh / 4);
+ if (var > 7) var = 7;
+
+ aom_clear_system_state();
+ return (int)(var);
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+
+unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int stride = x->plane[0].src.stride;
+ uint8_t *buf = x->plane[0].src.buf;
+ const int bw = MI_SIZE * mi_size_wide[bs];
+ const int bh = MI_SIZE * mi_size_high[bs];
+ int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+
+ int var = 0;
+ for (int r = 0; r < bh; r += 8)
+ for (int c = 0; c < bw; c += 8) {
+ var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd);
+ }
+
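+ // Normalize by the block's pixel count: for a 64x64 block
+ // (num_pels_log2_lookup[bs] == 12) this amounts to var * 256 / 4096 = var / 16.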
+ return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
+}
+
+double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+ unsigned int haar_sad = haar_ac_energy(x, bs);
+ aom_clear_system_state();
+ return log(haar_sad + 1.0);
+}
+
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ double energy, energy_midpoint;
+ aom_clear_system_state();
+ energy_midpoint = (cpi->oxcf.pass == 2) ? cpi->twopass.frame_avg_haar_energy
+ : DEFAULT_E_MIDPOINT;
+ energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint;
+ return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
+}
+
+int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
+ int block_var_level) {
+ int rate_level;
+ const AV1_COMMON *const cm = &cpi->common;
+
+ if (DELTAQ_MODULATION == 1) {
+ ENERGY_IN_BOUNDS(block_var_level);
+ rate_level = SEGMENT_ID(block_var_level);
+ } else {
+ rate_level = block_var_level;
+ }
+ int qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex, deltaq_rate_ratio[rate_level],
+ cm->seq_params.bit_depth);
+
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+ return qindex_delta;
+}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
new file mode 100644
index 000000000..2d22b663e
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
+ int block_var_level);
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 000000000..36e7d3370
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+void av1_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ if (!skip_block) {
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ int i;
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_one = vdupq_n_s16(1);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+ int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+ int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ // adjust for dc
+ v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ // process dc and the first seven ac coeffs
+ {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
+ v_round = vmovq_n_s16(round_ptr[1]);
+ v_quant = vmovq_n_s16(quant_ptr[1]);
+ v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ }
+ // now process the rest of the ac coeffs
+ for (i = 8; i < count; i += 8) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ }
+ {
+ const int16x4_t v_eobmax_3210 = vmax_s16(
+ vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ }
+ } else {
+ memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
new file mode 100644
index 000000000..98505e0b1
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -0,0 +1,1885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[3];
+ bf1[1] = input[1] + input[2];
+ bf1[2] = -input[2] + input[1];
+ bf1[3] = -input[3] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
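+ // Final stage: permute the butterfly outputs (bit-reversed order) into
+ // natural frequency order.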
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[1];
+ bf1[3] = bf0[3];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[7];
+ bf1[1] = input[1] + input[6];
+ bf1[2] = input[2] + input[5];
+ bf1[3] = input[3] + input[4];
+ bf1[4] = -input[4] + input[3];
+ bf1[5] = -input[5] + input[2];
+ bf1[6] = -input[6] + input[1];
+ bf1[7] = -input[7] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[4];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[6];
+ bf1[4] = bf0[1];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[3];
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[15];
+ bf1[1] = input[1] + input[14];
+ bf1[2] = input[2] + input[13];
+ bf1[3] = input[3] + input[12];
+ bf1[4] = input[4] + input[11];
+ bf1[5] = input[5] + input[10];
+ bf1[6] = input[6] + input[9];
+ bf1[7] = input[7] + input[8];
+ bf1[8] = -input[8] + input[7];
+ bf1[9] = -input[9] + input[6];
+ bf1[10] = -input[10] + input[5];
+ bf1[11] = -input[11] + input[4];
+ bf1[12] = -input[12] + input[3];
+ bf1[13] = -input[13] + input[2];
+ bf1[14] = -input[14] + input[1];
+ bf1[15] = -input[15] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[8];
+ bf1[2] = bf0[4];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[2];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[14];
+ bf1[8] = bf0[1];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[5];
+ bf1[11] = bf0[13];
+ bf1[12] = bf0[3];
+ bf1[13] = bf0[11];
+ bf1[14] = bf0[7];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[31];
+ bf1[1] = input[1] + input[30];
+ bf1[2] = input[2] + input[29];
+ bf1[3] = input[3] + input[28];
+ bf1[4] = input[4] + input[27];
+ bf1[5] = input[5] + input[26];
+ bf1[6] = input[6] + input[25];
+ bf1[7] = input[7] + input[24];
+ bf1[8] = input[8] + input[23];
+ bf1[9] = input[9] + input[22];
+ bf1[10] = input[10] + input[21];
+ bf1[11] = input[11] + input[20];
+ bf1[12] = input[12] + input[19];
+ bf1[13] = input[13] + input[18];
+ bf1[14] = input[14] + input[17];
+ bf1[15] = input[15] + input[16];
+ bf1[16] = -input[16] + input[15];
+ bf1[17] = -input[17] + input[14];
+ bf1[18] = -input[18] + input[13];
+ bf1[19] = -input[19] + input[12];
+ bf1[20] = -input[20] + input[11];
+ bf1[21] = -input[21] + input[10];
+ bf1[22] = -input[22] + input[9];
+ bf1[23] = -input[23] + input[8];
+ bf1[24] = -input[24] + input[7];
+ bf1[25] = -input[25] + input[6];
+ bf1[26] = -input[26] + input[5];
+ bf1[27] = -input[27] + input[4];
+ bf1[28] = -input[28] + input[3];
+ bf1[29] = -input[29] + input[2];
+ bf1[30] = -input[30] + input[1];
+ bf1[31] = -input[31] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[16];
+ bf1[2] = bf0[8];
+ bf1[3] = bf0[24];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[20];
+ bf1[6] = bf0[12];
+ bf1[7] = bf0[28];
+ bf1[8] = bf0[2];
+ bf1[9] = bf0[18];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[26];
+ bf1[12] = bf0[6];
+ bf1[13] = bf0[22];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[30];
+ bf1[16] = bf0[1];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[9];
+ bf1[19] = bf0[25];
+ bf1[20] = bf0[5];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[13];
+ bf1[23] = bf0[29];
+ bf1[24] = bf0[3];
+ bf1[25] = bf0[19];
+ bf1[26] = bf0[11];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[7];
+ bf1[29] = bf0[23];
+ bf1[30] = bf0[15];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
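
The butterfly stages in av1_fdct32_new above, and in the kernels that follow, are built almost entirely from half_btf() and round_shift(), which come from av1/common/av1_txfm.h and are not part of this diff. For reference, a minimal standalone sketch of their conventional behaviour, assuming libaom's usual fixed-point convention cospi_arr(b)[i] ~= round(cos(i * pi / 128) * 2^b); this is an illustration, not the library's exact source:

    #include <stdint.h>

    // Sketch: add half an LSB and drop 'bit' fractional bits (round to nearest).
    static int32_t round_shift_sketch(int64_t value, int bit) {
      return (int32_t)((value + (1LL << (bit - 1))) >> bit);
    }

    // Sketch: one butterfly output, a fixed-point weighted sum of two inputs.
    // w0 and w1 are cospi[] weights scaled by 2^bit, so the final shift
    // restores the original magnitude.
    static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
      return round_shift_sketch((int64_t)w0 * in0 + (int64_t)w1 * in1, bit);
    }

Read this way, a call such as half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit) computes (bf0[0] + bf0[1]) * cos(pi/4) to within rounding.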
+
+void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ int bit = cos_bit;
+ const int32_t *sinpi = sinpi_arr(bit);
+ int32_t x0, x1, x2, x3;
+ int32_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // stage 0
+ av1_range_check_buf(0, input, input, 4, stage_range[0]);
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]);
+ s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]);
+ s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]);
+ s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]);
+ s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]);
+ s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]);
+ s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]);
+ s7 = range_check_value(x0 + x1, stage_range[1]);
+
+ // stage 2
+ s7 = range_check_value(s7 - x3, stage_range[2]);
+
+ // stage 3
+ x0 = range_check_value(s0 + s2, bit + stage_range[3]);
+ x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]);
+ x2 = range_check_value(s1 - s3, bit + stage_range[3]);
+ x3 = range_check_value(s4, bit + stage_range[3]);
+
+ // stage 4
+ x0 = range_check_value(x0 + s5, bit + stage_range[4]);
+ x2 = range_check_value(x2 + s6, bit + stage_range[4]);
+
+ // stage 5
+ s0 = range_check_value(x0 + x3, bit + stage_range[5]);
+ s1 = range_check_value(x1, bit + stage_range[5]);
+ s2 = range_check_value(x2 - x3, bit + stage_range[5]);
+ s3 = range_check_value(x2 - x0, bit + stage_range[5]);
+
+ // stage 6
+ s3 = range_check_value(s3 + x3, bit + stage_range[6]);
+
+ // 1-D transform scaling factor is sqrt(2).
+ output[0] = round_shift(s0, bit);
+ output[1] = round_shift(s1, bit);
+ output[2] = round_shift(s2, bit);
+ output[3] = round_shift(s3, bit);
+ av1_range_check_buf(6, input, output, 4, stage_range[6]);
+}
+
+void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+  // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ assert(output != input);
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[7];
+ bf1[2] = -input[3];
+ bf1[3] = input[4];
+ bf1[4] = -input[1];
+ bf1[5] = input[6];
+ bf1[6] = input[2];
+ bf1[7] = -input[5];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[6];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[4];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[2];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
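
The ADST kernels differ from the DCT kernels mainly at the ends: stage 1 is a signed permutation of the inputs, the final stage is a plain permutation of the outputs, and the stages in between use the same add/subtract and half_btf butterflies. Stage 1 of av1_fadst8_new above can be written equivalently as a small table-driven loop; this is an illustrative reformulation, not the library's code:

    #include <stdint.h>

    // Equivalent to stage 1 of av1_fadst8_new: bf1[i] = sign[i] * input[perm[i]].
    static void fadst8_stage1_sketch(const int32_t *input, int32_t *bf1) {
      static const int perm[8] = { 0, 7, 3, 4, 1, 6, 2, 5 };
      static const int sign[8] = { +1, -1, -1, +1, -1, +1, +1, -1 };
      for (int i = 0; i < 8; ++i) bf1[i] = sign[i] * input[perm[i]];
    }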
+
+void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+  // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ assert(output != input);
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[15];
+ bf1[2] = -input[7];
+ bf1[3] = input[8];
+ bf1[4] = -input[3];
+ bf1[5] = input[12];
+ bf1[6] = input[4];
+ bf1[7] = -input[11];
+ bf1[8] = -input[1];
+ bf1[9] = input[14];
+ bf1[10] = input[6];
+ bf1[11] = -input[9];
+ bf1[12] = input[2];
+ bf1[13] = -input[13];
+ bf1[14] = -input[5];
+ bf1[15] = input[10];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[14];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[8];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[6];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[4];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[2];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 4; ++i)
+ output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits);
+ assert(stage_range[0] + NewSqrt2Bits <= 32);
+ av1_range_check_buf(0, input, output, 4, stage_range[0]);
+}
+
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 8; ++i) output[i] = input[i] * 2;
+ av1_range_check_buf(0, input, output, 8, stage_range[0]);
+}
+
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 16; ++i)
+ output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits);
+ assert(stage_range[0] + NewSqrt2Bits <= 32);
+ av1_range_check_buf(0, input, output, 16, stage_range[0]);
+}
+
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
+ av1_range_check_buf(0, input, output, 32, stage_range[0]);
+}
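
The identity kernels scale rather than simply copy, which keeps them on roughly the same scale as the DCT/ADST kernels of matching length: assuming the usual libaom constants NewSqrt2 = 5793 and NewSqrt2Bits = 12 (defined in av1/common/av1_txfm.h, outside this diff), the four functions above multiply by about sqrt(2), 2, 2*sqrt(2) and 4 respectively. A small worked check of the 4-point case:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      // Assumed values: NewSqrt2 = round(sqrt(2) * 2^12), NewSqrt2Bits = 12.
      const int64_t kNewSqrt2 = 5793;
      const int kNewSqrt2Bits = 12;
      const int64_t x = 1000;
      // Same arithmetic as av1_fidentity4_c: round_shift(x * NewSqrt2, 12).
      const int32_t y =
          (int32_t)((x * kNewSqrt2 + (1 << (kNewSqrt2Bits - 1))) >> kNewSqrt2Bits);
      // Prints "1000 -> 1414", i.e. roughly 1000 * sqrt(2).
      printf("%lld -> %d\n", (long long)x, y);
      return 0;
    }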
+
+void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+  // stage 0
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[63];
+ bf1[1] = input[1] + input[62];
+ bf1[2] = input[2] + input[61];
+ bf1[3] = input[3] + input[60];
+ bf1[4] = input[4] + input[59];
+ bf1[5] = input[5] + input[58];
+ bf1[6] = input[6] + input[57];
+ bf1[7] = input[7] + input[56];
+ bf1[8] = input[8] + input[55];
+ bf1[9] = input[9] + input[54];
+ bf1[10] = input[10] + input[53];
+ bf1[11] = input[11] + input[52];
+ bf1[12] = input[12] + input[51];
+ bf1[13] = input[13] + input[50];
+ bf1[14] = input[14] + input[49];
+ bf1[15] = input[15] + input[48];
+ bf1[16] = input[16] + input[47];
+ bf1[17] = input[17] + input[46];
+ bf1[18] = input[18] + input[45];
+ bf1[19] = input[19] + input[44];
+ bf1[20] = input[20] + input[43];
+ bf1[21] = input[21] + input[42];
+ bf1[22] = input[22] + input[41];
+ bf1[23] = input[23] + input[40];
+ bf1[24] = input[24] + input[39];
+ bf1[25] = input[25] + input[38];
+ bf1[26] = input[26] + input[37];
+ bf1[27] = input[27] + input[36];
+ bf1[28] = input[28] + input[35];
+ bf1[29] = input[29] + input[34];
+ bf1[30] = input[30] + input[33];
+ bf1[31] = input[31] + input[32];
+ bf1[32] = -input[32] + input[31];
+ bf1[33] = -input[33] + input[30];
+ bf1[34] = -input[34] + input[29];
+ bf1[35] = -input[35] + input[28];
+ bf1[36] = -input[36] + input[27];
+ bf1[37] = -input[37] + input[26];
+ bf1[38] = -input[38] + input[25];
+ bf1[39] = -input[39] + input[24];
+ bf1[40] = -input[40] + input[23];
+ bf1[41] = -input[41] + input[22];
+ bf1[42] = -input[42] + input[21];
+ bf1[43] = -input[43] + input[20];
+ bf1[44] = -input[44] + input[19];
+ bf1[45] = -input[45] + input[18];
+ bf1[46] = -input[46] + input[17];
+ bf1[47] = -input[47] + input[16];
+ bf1[48] = -input[48] + input[15];
+ bf1[49] = -input[49] + input[14];
+ bf1[50] = -input[50] + input[13];
+ bf1[51] = -input[51] + input[12];
+ bf1[52] = -input[52] + input[11];
+ bf1[53] = -input[53] + input[10];
+ bf1[54] = -input[54] + input[9];
+ bf1[55] = -input[55] + input[8];
+ bf1[56] = -input[56] + input[7];
+ bf1[57] = -input[57] + input[6];
+ bf1[58] = -input[58] + input[5];
+ bf1[59] = -input[59] + input[4];
+ bf1[60] = -input[60] + input[3];
+ bf1[61] = -input[61] + input[2];
+ bf1[62] = -input[62] + input[1];
+ bf1[63] = -input[63] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = -bf0[16] + bf0[15];
+ bf1[17] = -bf0[17] + bf0[14];
+ bf1[18] = -bf0[18] + bf0[13];
+ bf1[19] = -bf0[19] + bf0[12];
+ bf1[20] = -bf0[20] + bf0[11];
+ bf1[21] = -bf0[21] + bf0[10];
+ bf1[22] = -bf0[22] + bf0[9];
+ bf1[23] = -bf0[23] + bf0[8];
+ bf1[24] = -bf0[24] + bf0[7];
+ bf1[25] = -bf0[25] + bf0[6];
+ bf1[26] = -bf0[26] + bf0[5];
+ bf1[27] = -bf0[27] + bf0[4];
+ bf1[28] = -bf0[28] + bf0[3];
+ bf1[29] = -bf0[29] + bf0[2];
+ bf1[30] = -bf0[30] + bf0[1];
+ bf1[31] = -bf0[31] + bf0[0];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = -bf0[40] + bf0[39];
+ bf1[41] = -bf0[41] + bf0[38];
+ bf1[42] = -bf0[42] + bf0[37];
+ bf1[43] = -bf0[43] + bf0[36];
+ bf1[44] = -bf0[44] + bf0[35];
+ bf1[45] = -bf0[45] + bf0[34];
+ bf1[46] = -bf0[46] + bf0[33];
+ bf1[47] = -bf0[47] + bf0[32];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[56] + bf0[55];
+ bf1[57] = bf0[57] + bf0[54];
+ bf1[58] = bf0[58] + bf0[53];
+ bf1[59] = bf0[59] + bf0[52];
+ bf1[60] = bf0[60] + bf0[51];
+ bf1[61] = bf0[61] + bf0[50];
+ bf1[62] = bf0[62] + bf0[49];
+ bf1[63] = bf0[63] + bf0[48];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = -bf0[36] + bf0[35];
+ bf1[37] = -bf0[37] + bf0[34];
+ bf1[38] = -bf0[38] + bf0[33];
+ bf1[39] = -bf0[39] + bf0[32];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[44] + bf0[43];
+ bf1[45] = bf0[45] + bf0[42];
+ bf1[46] = bf0[46] + bf0[41];
+ bf1[47] = bf0[47] + bf0[40];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = -bf0[52] + bf0[51];
+ bf1[53] = -bf0[53] + bf0[50];
+ bf1[54] = -bf0[54] + bf0[49];
+ bf1[55] = -bf0[55] + bf0[48];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[60] + bf0[59];
+ bf1[61] = bf0[61] + bf0[58];
+ bf1[62] = bf0[62] + bf0[57];
+ bf1[63] = bf0[63] + bf0[56];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = -bf0[34] + bf0[33];
+ bf1[35] = -bf0[35] + bf0[32];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[38] + bf0[37];
+ bf1[39] = bf0[39] + bf0[36];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = -bf0[42] + bf0[41];
+ bf1[43] = -bf0[43] + bf0[40];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[46] + bf0[45];
+ bf1[47] = bf0[47] + bf0[44];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = -bf0[50] + bf0[49];
+ bf1[51] = -bf0[51] + bf0[48];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[54] + bf0[53];
+ bf1[55] = bf0[55] + bf0[52];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = -bf0[58] + bf0[57];
+ bf1[59] = -bf0[59] + bf0[56];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[62] + bf0[61];
+ bf1[63] = bf0[63] + bf0[60];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = -bf0[33] + bf0[32];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[35] + bf0[34];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = -bf0[37] + bf0[36];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[39] + bf0[38];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = -bf0[41] + bf0[40];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[43] + bf0[42];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = -bf0[45] + bf0[44];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[47] + bf0[46];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = -bf0[49] + bf0[48];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[51] + bf0[50];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = -bf0[53] + bf0[52];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[55] + bf0[54];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = -bf0[57] + bf0[56];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[59] + bf0[58];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = -bf0[61] + bf0[60];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[63] + bf0[62];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
+ bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
+ bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
+ bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
+ bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
+ bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
+ bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
+ bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
+ bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
+ bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
+ bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
+ bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
+ bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
+ bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
+ bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
+ bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
+ bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[32];
+ bf1[2] = bf0[16];
+ bf1[3] = bf0[48];
+ bf1[4] = bf0[8];
+ bf1[5] = bf0[40];
+ bf1[6] = bf0[24];
+ bf1[7] = bf0[56];
+ bf1[8] = bf0[4];
+ bf1[9] = bf0[36];
+ bf1[10] = bf0[20];
+ bf1[11] = bf0[52];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[44];
+ bf1[14] = bf0[28];
+ bf1[15] = bf0[60];
+ bf1[16] = bf0[2];
+ bf1[17] = bf0[34];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[50];
+ bf1[20] = bf0[10];
+ bf1[21] = bf0[42];
+ bf1[22] = bf0[26];
+ bf1[23] = bf0[58];
+ bf1[24] = bf0[6];
+ bf1[25] = bf0[38];
+ bf1[26] = bf0[22];
+ bf1[27] = bf0[54];
+ bf1[28] = bf0[14];
+ bf1[29] = bf0[46];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[62];
+ bf1[32] = bf0[1];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[17];
+ bf1[35] = bf0[49];
+ bf1[36] = bf0[9];
+ bf1[37] = bf0[41];
+ bf1[38] = bf0[25];
+ bf1[39] = bf0[57];
+ bf1[40] = bf0[5];
+ bf1[41] = bf0[37];
+ bf1[42] = bf0[21];
+ bf1[43] = bf0[53];
+ bf1[44] = bf0[13];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[29];
+ bf1[47] = bf0[61];
+ bf1[48] = bf0[3];
+ bf1[49] = bf0[35];
+ bf1[50] = bf0[19];
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[11];
+ bf1[53] = bf0[43];
+ bf1[54] = bf0[27];
+ bf1[55] = bf0[59];
+ bf1[56] = bf0[7];
+ bf1[57] = bf0[39];
+ bf1[58] = bf0[23];
+ bf1[59] = bf0[55];
+ bf1[60] = bf0[15];
+ bf1[61] = bf0[47];
+ bf1[62] = bf0[31];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
new file mode 100644
index 000000000..9dcf16552
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
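
These prototypes are the C reference kernels that the 2-D wrapper in av1_fwd_txfm2d.c (added below) dispatches to through fwd_txfm_type_to_func(). They can also be called directly; the sketch below is illustrative only and assumes it is built inside the aom source tree. cos_bit = 13 is a typical value, and the stage_range entries are placeholders that are only consulted by the coefficient range checks, which are normally compiled out.

    #include <stdint.h>
    #include <stdio.h>
    #include "av1/encoder/av1_fwd_txfm1d.h"

    int main(void) {
      const int32_t in[4] = { 64, 64, 64, 64 };
      int32_t out[4];
      // Placeholder stage ranges; real callers derive them from the transform
      // configuration via av1_gen_fwd_stage_range().
      const int8_t stage_range[8] = { 14, 15, 16, 16, 16, 16, 16, 16 };
      av1_fdct4_new(in, out, /*cos_bit=*/13, stage_range);
      // Flat input: out[0] should be 181 (about 64 * 2 * sqrt(2), the scaled
      // DC term) and the remaining coefficients 0.
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
      return 0;
    }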
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
new file mode 100644
index 000000000..98b6530db
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#include "av1/common/enums.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL];
+extern const int8_t fwd_cos_bit_col[5][5];
+extern const int8_t fwd_cos_bit_row[5][5];
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
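
The shift and cos-bit tables declared here parameterise the 2-D wrapper added below (the wiring through av1_get_fwd_txfm_cfg() is outside this diff): fwd_txfm2d_c() applies -shift[0] before the column pass, -shift[1] after it, and -shift[2] after the row pass, all via av1_round_shift_array(). Assuming that helper's usual av1_txfm.h semantics, a minimal sketch (not the library's source) of what those calls do:

    #include <stdint.h>

    // Assumed semantics of av1_round_shift_array(): a positive 'bit' divides by
    // 2^bit with rounding, a negative 'bit' multiplies by 2^(-bit).
    static void round_shift_array_sketch(int32_t *arr, int size, int bit) {
      if (bit == 0) return;
      for (int i = 0; i < size; ++i) {
        if (bit > 0)
          arr[i] = (int32_t)(((int64_t)arr[i] + (1LL << (bit - 1))) >> bit);
        else
          arr[i] = arr[i] * (1 << (-bit));
      }
    }

Because the wrapper negates the table entries, a positive shift[0] scales the residual block up before the column transform, while negative shift[1]/shift[2] entries scale the intermediate and final results back down.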
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
new file mode 100644
index 000000000..f25a667cf
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_fdct4_new;
+ case TXFM_TYPE_DCT8: return av1_fdct8_new;
+ case TXFM_TYPE_DCT16: return av1_fdct16_new;
+ case TXFM_TYPE_DCT32: return av1_fdct32_new;
+ case TXFM_TYPE_DCT64: return av1_fdct64_new;
+ case TXFM_TYPE_ADST4: return av1_fadst4_new;
+ case TXFM_TYPE_ADST8: return av1_fadst8_new;
+ case TXFM_TYPE_ADST16: return av1_fadst16_new;
+ case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
+ case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
+ case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
+ case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c;
+ default: assert(0); return NULL;
+ }
+}
+
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, int bd) {
+ // Take the shift from the larger dimension in the rectangular case.
+ const int8_t *shift = cfg->shift;
+  // Bounding i by MAX_TXFM_STAGE_NUM silences a spurious array-bounds warning.
+ for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1;
+ }
+
+  // Bounding i by MAX_TXFM_STAGE_NUM silences a spurious array-bounds warning.
+ for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1;
+ }
+}
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *buf, int bd) {
+ int c, r;
+  // txfm_size_col is the block width (the number of columns) and
+  // txfm_size_row is the block height (the number of rows). The column
+  // (vertical) transform therefore runs over txfm_size_row samples and the
+  // row (horizontal) transform over txfm_size_col samples, which is what
+  // makes rectangular transforms come out correctly; for square transforms
+  // the two lengths are identical.
+ const int txfm_size_col = tx_size_wide[cfg->tx_size];
+ const int txfm_size_row = tx_size_high[cfg->tx_size];
+ // Take the shift from the larger dimension in the rectangular case.
+ const int8_t *shift = cfg->shift;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+ assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+ av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd);
+
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ // use output buffer as temp buffer
+ int32_t *temp_in = output;
+ int32_t *temp_out = output + txfm_size_row;
+
+ // Columns
+ for (c = 0; c < txfm_size_col; ++c) {
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // flip upside down
+ temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
+ }
+ av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ buf[r * txfm_size_col + c] = temp_out[r];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // flip from left to right
+ buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
+ }
+ }
+
+ // Rows
+ for (r = 0; r < txfm_size_row; ++r) {
+ txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col,
+ cos_bit_row, stage_range_row);
+ av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
+ if (abs(rect_type) == 1) {
+ // Multiply everything by Sqrt2 if the transform is rectangular and the
+ // size difference is a factor of 2.
+ for (c = 0; c < txfm_size_col; ++c) {
+ output[r * txfm_size_col + c] = round_shift(
+ (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits);
+ }
+ }
+ }
+}
+
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[8 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[4 * 4];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[8 * 8];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[16 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[32 * 32];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 64];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
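+ // Only the top-left 32x32 block of a 64x64 transform carries coded
+ // coefficients in AV1, so clear the rest and pack the survivors into the
+ // first 32x32 indices below.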
+ // Zero out top-right 32x32 area.
+ for (int row = 0; row < 32; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Zero out the bottom 64x32 area.
+ memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
+}
+
+void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 32x32 area.
+ memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
+ // Note: no repacking needed here.
+}
+
+void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 32];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
+ // Zero out right 32x32 area.
+ for (int row = 0; row < 32; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
+}
+
+void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+ // Note: no repacking needed here.
+}
+
+void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {
+ int32_t txfm_buf[64 * 16];
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out right 32x16 area.
+ for (int row = 0; row < 16; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack non-zero coeffs in the first 32x16 indices.
+ for (int row = 1; row < 16; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));
+ }
+}
+
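+// Per-size shift triples consumed by fwd_txfm2d_c() above: shift[0] is
+// applied (negated) to the input before the column transform, shift[1]
+// between the column and row passes, and shift[2] after the row pass.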
+static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 };
+static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 };
+static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 };
+static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 };
+
+const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
+ fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
+ fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
+ fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
+ fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
+ fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
+};
+
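+// Cosine-constant bit precision for the 1-D column and row transforms,
+// indexed by the txw_idx/txh_idx values computed in av1_get_fwd_txfm_cfg()
+// below (log2 of the transform width/height minus 2). Zero entries mark
+// width/height combinations that are not valid transform sizes.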
+const int8_t fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 13, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 13, 12, 13 },
+ { 0, 13, 13, 12, 13 },
+ { 0, 0, 13, 12, 13 }
+ };
+
+const int8_t fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 12, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 12, 13, 12 },
+ { 0, 12, 13, 12, 11 },
+ { 0, 0, 12, 11, 10 }
+ };
+
+static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 };
+static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 };
+static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 };
+static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 };
+static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10,
+ 11, 11, 11, 11, 11, 11 };
+
+static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 };
+static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 };
+static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 };
+
+static const int8_t max_fwd_range_mult2_col[5] = { 3, 5, 7, 9, 11 };
+
+static const int8_t fidtx4_range_mult2[1] = { 1 };
+static const int8_t fidtx8_range_mult2[1] = { 2 };
+static const int8_t fidtx16_range_mult2[1] = { 3 };
+static const int8_t fidtx32_range_mult2[1] = { 4 };
+
+#if 0
+const int8_t fwd_idtx_range_row[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = { { 2, 4, 5, 0, 0 },
+ { 3, 4, 5, 6, 0 },
+ { 4, 5, 6, 7, 8 },
+ { 0, 5, 6, 7, 8 },
+ { 0, 0, 7, 8,
+ 9 } };
+#endif
+
+const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
+ fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2,
+ fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2,
+ fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2,
+ fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2
+};
+
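+// Converts the *_range_mult2 tables above (worst-case log2 growth per 1-D
+// transform stage, stored in half-bit units) into per-stage bit ranges;
+// the row ranges also fold in the maximum accumulated column growth for
+// the given transform height.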
+static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) {
+ const int txh_idx = get_txh_idx(cfg->tx_size);
+ av1_zero(cfg->stage_range_col);
+ av1_zero(cfg->stage_range_row);
+
+ if (cfg->txfm_type_col != TXFM_TYPE_INVALID) {
+ int stage_num_col = cfg->stage_num_col;
+ const int8_t *range_mult2_col =
+ fwd_txfm_range_mult2_list[cfg->txfm_type_col];
+ for (int i = 0; i < stage_num_col; ++i)
+ cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1;
+ }
+
+ if (cfg->txfm_type_row != TXFM_TYPE_INVALID) {
+ int stage_num_row = cfg->stage_num_row;
+ const int8_t *range_mult2_row =
+ fwd_txfm_range_mult2_list[cfg->txfm_type_row];
+ for (int i = 0; i < stage_num_row; ++i)
+ cfg->stage_range_row[i] =
+ (max_fwd_range_mult2_col[txh_idx] + range_mult2_row[i] + 1) >> 1;
+ }
+}
+
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg) {
+ assert(cfg != NULL);
+ cfg->tx_size = tx_size;
+ set_flip_cfg(tx_type, cfg);
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ const int txw_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
+ const int txh_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0];
+ cfg->shift = fwd_txfm_shift_ls[tx_size];
+ cfg->cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ cfg->cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
+ cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
+ cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+ cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+ set_fwd_txfm_non_scale_range(cfg);
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
new file mode 100644
index 000000000..a0a926005
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -0,0 +1,738 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ *eob_ptr = 0;
+}
+
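+// Shared "fp" quantizer core: a coefficient passes the dead-zone check when
+// (|coeff| << (1 + log_scale)) >= dequant (weighted by the quant matrix if
+// one is present), is then rounded and multiplied by the per-position quant
+// value with a (16 - log_scale) shift, and the dequantized value is rebuilt
+// as (qcoeff * dequant) >> log_scale. *eob_ptr is set to the last nonzero
+// scan position plus one.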
+static void quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i, eob = -1;
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (qm_ptr == NULL && iqm_ptr == NULL) {
+ const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ { // rc == 0
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) {
+ abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX);
+ const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale));
+ if (tmp32) {
+ qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale;
+ dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ eob = 0;
+ }
+ }
+ }
+ const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int32_t thresh1 = (int32_t)(dequant_ptr[1]);
+ for (i = 1; i < n_coeffs; i++) {
+ const int coeff = coeff_ptr[i];
+ const int coeff_sign = (coeff >> 31);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ if ((abs_coeff << (1 + log_scale)) >= thresh1) {
+ abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX);
+ const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - log_scale));
+ if (tmp32) {
+ qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale;
+ dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ eob = AOMMAX(iscan[i], eob);
+ }
+ }
+ }
+ } else {
+ // Quantization-matrix path: weight each coefficient's quant and dequant
+ // steps by the per-position (i)qm values before the dead-zone check.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const int coeff_sign = (coeff >> 31);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32 = 0;
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
+ tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ }
+
+ if (tmp32) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+static void highbd_quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i;
+ int eob = -1;
+ const int shift = 16 - log_scale;
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ if (qm_ptr || iqm_ptr) {
+ // Quantization-matrix path: weight each coefficient's quant and dequant
+ // steps by the per-position (i)qm values before the dead-zone check.
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const int coeff_sign = (coeff >> 31);
+ const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int abs_qcoeff = 0;
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ const int64_t tmp =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_qcoeff =
+ (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = i;
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ } else {
+ const int log_scaled_round_arr[2] = {
+ ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale),
+ };
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int rc01 = (rc != 0);
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int log_scaled_round = log_scaled_round_arr[rc01];
+ if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) {
+ const int quant = quant_ptr[rc01];
+ const int dequant = dequant_ptr[rc01];
+ const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+ const int abs_qcoeff = (int)((tmp * quant) >> shift);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ if (abs_qcoeff) eob = i;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
+}
+
+void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
+}
+
+void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
+}
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ if (n_coeffs < 16) {
+ // TODO(jingning): Need SIMD implementation for smaller block size
+ // quantization.
+ quantize_fp_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX, eob_ptr, sc->scan, sc->iscan, NULL, NULL, 0);
+ } else {
+ av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ }
+ break;
+ case 1:
+ av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp;
+ int eob = -1;
+ int32_t tmp32;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (tmp32) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ (void)sc;
+ assert(qparam->log_scale >= 0 && qparam->log_scale < (3));
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
+ p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0],
+ eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale);
+}
+
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ highbd_quantize_fp_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ if (n_coeffs < 16) {
+ // TODO(jingning): Need SIMD implementation for smaller block size
+ // quantization.
+ av1_highbd_quantize_fp_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qparam->log_scale);
+ return;
+ }
+ av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qparam->log_scale);
+ }
+}
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ if (LIKELY(n_coeffs >= 8)) {
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ } else {
+ // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+ // quantization
+ aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX,
+ p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ }
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
+}
+
+static INLINE void highbd_quantize_dc(
+ const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ const int64_t tmpw = tmp * wt;
+ const int abs_qcoeff =
+ (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int dequant =
+ (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ const qm_val_t *qm_ptr = qparam->qmatrix;
+ const qm_val_t *iqm_ptr = qparam->iqmatrix;
+ (void)sc;
+
+ highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX,
+ p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr,
+ qparam->log_scale);
+}
+
+void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ int log_scale) {
+ highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan, iscan, NULL, NULL,
+ log_scale);
+}
+
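+// Precomputes a fixed-point reciprocal of the quant step d: the resulting
+// (quant, shift) pair lets the quantizers replace division by d with
+// multiplies and shifts.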
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+ uint32_t t;
+ int l, m;
+ t = d;
+ for (l = 0; t > 1; l++) t >>= 1;
+ m = 1 + (1 << (16 + l)) / d;
+ *quant = (int16_t)(m - (1 << 16));
+ *shift = 1 << (16 - l);
+}
+
+static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
+ const int quant = av1_dc_quant_Q3(q, 0, bit_depth);
+ switch (bit_depth) {
+ case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+ case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+ case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+}
+
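+// Builds the per-qindex quantizer tables: for every qindex, index 0 holds
+// the DC values and index 1 the AC values for the Y/U/V planes (dequant in
+// both QTX and Q3 scales), and indices 2..7 replicate the AC entries so
+// SIMD code can load full 8-lane vectors.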
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq) {
+ int i, q, quant_Q3, quant_QTX;
+
+ for (q = 0; q < QINDEX_RANGE; q++) {
+ const int qzbin_factor = get_qzbin_factor(q, bit_depth);
+ const int qrounding_factor = q == 0 ? 64 : 48;
+
+ for (i = 0; i < 2; ++i) {
+ int qrounding_factor_fp = 64;
+ // y quantizer setup with original coeff shift of Q3
+ quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, y_dc_delta_q, bit_depth)
+ : av1_ac_quant_Q3(q, 0, bit_depth);
+ // y quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, 0, bit_depth);
+ invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i],
+ quant_QTX);
+ quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->y_dequant_QTX[q][i] = quant_QTX;
+ deq->y_dequant_Q3[q][i] = quant_Q3;
+
+ // u quantizer setup with original coeff shift of Q3
+ quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, u_dc_delta_q, bit_depth)
+ : av1_ac_quant_Q3(q, u_ac_delta_q, bit_depth);
+ // u quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth);
+ invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i],
+ quant_QTX);
+ quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->u_dequant_QTX[q][i] = quant_QTX;
+ deq->u_dequant_Q3[q][i] = quant_Q3;
+
+ // v quantizer setup with original coeff shift of Q3
+ quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, v_dc_delta_q, bit_depth)
+ : av1_ac_quant_Q3(q, v_ac_delta_q, bit_depth);
+ // v quantizer with TX scale
+ quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+ : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
+ invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
+ quant_QTX);
+ quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
+ quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+ quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+ quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+ deq->v_dequant_QTX[q][i] = quant_QTX;
+ deq->v_dequant_Q3[q][i] = quant_Q3;
+ }
+
+ for (i = 2; i < 8; i++) { // 8: SIMD width
+ quants->y_quant[q][i] = quants->y_quant[q][1];
+ quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+ quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+ quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+ quants->y_zbin[q][i] = quants->y_zbin[q][1];
+ quants->y_round[q][i] = quants->y_round[q][1];
+ deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
+ deq->y_dequant_Q3[q][i] = deq->y_dequant_Q3[q][1];
+
+ quants->u_quant[q][i] = quants->u_quant[q][1];
+ quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
+ quants->u_round_fp[q][i] = quants->u_round_fp[q][1];
+ quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1];
+ quants->u_zbin[q][i] = quants->u_zbin[q][1];
+ quants->u_round[q][i] = quants->u_round[q][1];
+ deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
+ deq->u_dequant_Q3[q][i] = deq->u_dequant_Q3[q][1];
+ quants->v_quant[q][i] = quants->v_quant[q][1];
+ quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
+ quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
+ quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
+ quants->v_zbin[q][i] = quants->v_zbin[q][1];
+ quants->v_round[q][i] = quants->v_round[q][1];
+ deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
+ deq->v_dequant_Q3[q][i] = deq->v_dequant_Q3[q][1];
+ }
+ }
+}
+
+void av1_init_quantizer(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ QUANTS *const quants = &cpi->quants;
+ Dequants *const dequants = &cpi->dequants;
+ av1_build_quantizer(cm->seq_params.bit_depth, cm->y_dc_delta_q,
+ cm->u_dc_delta_q, cm->u_ac_delta_q, cm->v_dc_delta_q,
+ cm->v_ac_delta_q, quants, dequants);
+}
+
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const QUANTS *const quants = &cpi->quants;
+
+ int current_qindex = AOMMAX(
+ 0, AOMMIN(QINDEX_RANGE - 1, cpi->oxcf.deltaq_mode != NO_DELTA_Q
+ ? cm->base_qindex + xd->delta_qindex
+ : cm->base_qindex));
+ const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
+ const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+ int qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
+ ? NUM_QM_LEVELS - 1
+ : cm->qm_y;
+
+ // Y
+ x->plane[0].quant_QTX = quants->y_quant[qindex];
+ x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
+ x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
+ x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
+ x->plane[0].zbin_QTX = quants->y_zbin[qindex];
+ x->plane[0].round_QTX = quants->y_round[qindex];
+ x->plane[0].dequant_QTX = cpi->dequants.y_dequant_QTX[qindex];
+ memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0],
+ sizeof(cm->gqmatrix[qmlevel][0]));
+ memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0],
+ sizeof(cm->giqmatrix[qmlevel][0]));
+ xd->plane[0].dequant_Q3 = cpi->dequants.y_dequant_Q3[qindex];
+
+ // U
+ qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
+ ? NUM_QM_LEVELS - 1
+ : cm->qm_u;
+ {
+ x->plane[1].quant_QTX = quants->u_quant[qindex];
+ x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
+ x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
+ x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
+ x->plane[1].zbin_QTX = quants->u_zbin[qindex];
+ x->plane[1].round_QTX = quants->u_round[qindex];
+ x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex];
+ memcpy(&xd->plane[1].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
+ sizeof(cm->gqmatrix[qmlevel][1]));
+ memcpy(&xd->plane[1].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1],
+ sizeof(cm->giqmatrix[qmlevel][1]));
+ x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex];
+ xd->plane[1].dequant_Q3 = cpi->dequants.u_dequant_Q3[qindex];
+ }
+ // V
+ qmlevel = (xd->lossless[segment_id] || cm->using_qmatrix == 0)
+ ? NUM_QM_LEVELS - 1
+ : cm->qm_v;
+ {
+ x->plane[2].quant_QTX = quants->v_quant[qindex];
+ x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
+ x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
+ x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
+ x->plane[2].zbin_QTX = quants->v_zbin[qindex];
+ x->plane[2].round_QTX = quants->v_round[qindex];
+ x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex];
+ memcpy(&xd->plane[2].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][2],
+ sizeof(cm->gqmatrix[qmlevel][2]));
+ memcpy(&xd->plane[2].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][2],
+ sizeof(cm->giqmatrix[qmlevel][2]));
+ x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex];
+ xd->plane[2].dequant_Q3 = cpi->dequants.v_dequant_Q3[qindex];
+ }
+ x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+ x->qindex = qindex;
+
+ set_error_per_bit(x, rdmult);
+
+ av1_initialize_me_consts(cpi, x, qindex);
+}
+
+void av1_frame_init_quantizer(AV1_COMP *cpi) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
+}
+
+void av1_set_quantizer(AV1_COMMON *cm, int q) {
+ // quantizer has to be reinitialized with av1_init_quantizer() if any
+ // delta_q changes.
+ cm->base_qindex = AOMMAX(cm->delta_q_present_flag, q);
+ cm->y_dc_delta_q = 0;
+ cm->u_dc_delta_q = 0;
+ cm->u_ac_delta_q = 0;
+ cm->v_dc_delta_q = 0;
+ cm->v_ac_delta_q = 0;
+ cm->qm_y = aom_get_qmlevel(cm->base_qindex, cm->min_qmlevel, cm->max_qmlevel);
+ cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q,
+ cm->min_qmlevel, cm->max_qmlevel);
+
+ if (!cm->seq_params.separate_uv_delta_q)
+ cm->qm_v = cm->qm_u;
+ else
+ cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q,
+ cm->min_qmlevel, cm->max_qmlevel);
+}
+
+// Table that converts the 0-63 Q-range values passed in from outside to the
+// qindex range used internally.
+static const int quantizer_to_qindex[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48,
+ 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100,
+ 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+ 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+ 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int av1_quantizer_to_qindex(int quantizer) {
+ return quantizer_to_qindex[quantizer];
+}
+
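+// Inverse mapping of the table above: returns the smallest external 0-63
+// quantizer whose qindex is at least the given qindex.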
+int av1_qindex_to_quantizer(int qindex) {
+ int quantizer;
+
+ for (quantizer = 0; quantizer < 64; ++quantizer)
+ if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+
+ return 63;
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
new file mode 100644
index 000000000..35af9a67a
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct QUANT_PARAM {
+ int log_scale;
+ TX_SIZE tx_size;
+ const qm_val_t *qmatrix;
+ const qm_val_t *iqmatrix;
+} QUANT_PARAM;
+
+typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+// The QUANTS structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// All of its fields use the same coefficient shift/scaling as the TX stage.
+typedef struct {
+ // 0: dc, 1: ac, 2-7: ac repeated to fill the SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+ // TODO(jingning): in progress of re-working the quantization. will decide
+ // if we want to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]);
+
+ DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+// The Dequants structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// Fields are suffixed according to whether or not they're expressed in
+// the same coefficient shift/precision as TX or a fixed Q3 format.
+typedef struct {
+ DECLARE_ALIGNED(16, int16_t,
+ y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, u_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, v_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width
+} Dequants;
+
+struct AV1_COMP;
+struct AV1Common;
+
+void av1_frame_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id);
+
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq);
+
+void av1_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_set_quantizer(struct AV1Common *cm, int q);
+
+int av1_quantizer_to_qindex(int quantizer);
+
+int av1_qindex_to_quantizer(int qindex);
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
new file mode 100644
index 000000000..2c4acdb02
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -0,0 +1,3999 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+#define ENC_MISMATCH_DEBUG 0
+
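+// Writes v in [0, n) with a near-uniform (truncated binary) code: values
+// below m = (1 << l) - n take l - 1 bits, the rest take l bits.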
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_literal(w, (v - m) & 1, 1);
+ }
+}
+
+static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd,
+ const RestorationUnitInfo *rui,
+ aom_writer *const w, int plane,
+ FRAME_COUNTS *counts);
+
+static void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+ const MB_MODE_INFO *mi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi,
+ PREDICTION_MODE mode, aom_writer *w) {
+ assert(!is_intrabc_block(mi));
+ (void)mi;
+ aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
+ INTRA_MODES);
+}
+
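+// Signals a single-reference inter mode as a cascade of context-coded
+// binary decisions: NEWMV vs. the rest, then GLOBALMV vs. the rest, then
+// NEARESTMV vs. NEARMV.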
+static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+ FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) {
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+
+ aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
+
+ if (mode != NEWMV) {
+ const int16_t zeromv_ctx =
+ (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2);
+
+ if (mode != GLOBALMV) {
+ int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2);
+ }
+ }
+}
+
+static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
+ const MB_MODE_INFO_EXT *mbmi_ext, aom_writer *w) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+
+ assert(mbmi->ref_mv_idx < 3);
+
+ const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+ if (new_mv) {
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+
+ aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx],
+ 2);
+ if (mbmi->ref_mv_idx == idx) return;
+ }
+ }
+ return;
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ int idx;
+ // TODO(jingning): Temporary solution to compensate the NEARESTMV offset.
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1),
+ ec_ctx->drl_cdf[drl_ctx], 2);
+ if (mbmi->ref_mv_idx == (idx - 1)) return;
+ }
+ }
+ return;
+ }
+}
+
+static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
+ PREDICTION_MODE mode,
+ const int16_t mode_ctx) {
+ assert(is_inter_compound_mode(mode));
+ aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode),
+ xd->tile_ctx->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_MODES);
+}
+
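+// Recursively signals the variable transform-size partition tree: a 0
+// symbol keeps the current tx_size for this block, a 1 symbol splits it
+// into sub_tx_size_map[tx_size] units and recurses, stopping at
+// MAX_VARTX_DEPTH or TX_4X4.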
+static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi,
+ TX_SIZE tx_size, int depth, int blk_row,
+ int blk_col, aom_writer *w) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (depth == MAX_VARTX_DEPTH) {
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->sb_type, tx_size);
+ const int txb_size_index =
+ av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col);
+ const int write_txfm_partition =
+ tx_size == mbmi->inter_tx_size[txb_size_index];
+ if (write_txfm_partition) {
+ aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2);
+
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ // TODO(yuec): set correct txfm partition update for qttx
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+ aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2);
+
+ if (sub_txs == TX_4X4) {
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, sub_txs, tx_size);
+ return;
+ }
+
+ assert(bsw > 0 && bsh > 0);
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh)
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = blk_row + row;
+ int offsetc = blk_col + col;
+ write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w);
+ }
+ }
+}
+
+static void write_selected_tx_size(const MACROBLOCKD *xd, aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ if (block_signals_txsize(bsize)) {
+ const TX_SIZE tx_size = mbmi->tx_size;
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+
+ assert(depth >= 0 && depth <= max_depths);
+ assert(!is_inter_block(mbmi));
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+
+ aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ max_depths + 1);
+ }
+}
+
+static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, const MB_MODE_INFO *mi, aom_writer *w) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int skip = mi->skip;
+ const int ctx = av1_get_skip_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2);
+ return skip;
+ }
+}
+
+static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, const MB_MODE_INFO *mi,
+ aom_writer *w) {
+ if (!cm->skip_mode_flag) return 0;
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 0;
+ }
+ const int skip_mode = mi->skip_mode;
+ if (!is_comp_ref_allowed(mi->sb_type)) {
+ assert(!skip_mode);
+ return 0;
+ }
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ // These features imply single-reference mode, while skip mode implies
+ // compound reference. Hence, the two are mutually exclusive.
+ // In other words, skip_mode is implicitly 0 here.
+ assert(!skip_mode);
+ return 0;
+ }
+ const int ctx = av1_get_skip_mode_context(xd);
+ aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2);
+ return skip_mode;
+}
+
+static void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, aom_writer *w, const int is_inter) {
+ if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(is_inter);
+ return;
+ }
+ const int ctx = av1_get_intra_inter_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2);
+ }
+}
+
+static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi, aom_writer *w) {
+ MOTION_MODE last_motion_mode_allowed =
+ cm->switchable_motion_mode
+ ? motion_mode_allowed(cm->global_motion, xd, mbmi,
+ cm->allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ assert(mbmi->motion_mode <= last_motion_mode_allowed);
+ switch (last_motion_mode_allowed) {
+ case SIMPLE_TRANSLATION: break;
+ case OBMC_CAUSAL:
+ aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,
+ xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2);
+ break;
+ default:
+ aom_write_symbol(w, mbmi->motion_mode,
+ xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
+ MOTION_MODES);
+ }
+}
+
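+// Codes min(|delta|, DELTA_Q_SMALL) as a symbol; larger magnitudes append a
+// 3-bit length followed by the low bits of |delta| - 1, and any nonzero
+// delta is terminated by a sign bit. The same scheme is used for the loop
+// filter deltas below.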
+static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex,
+ aom_writer *w) {
+ int sign = delta_qindex < 0;
+ int abs = sign ? -delta_qindex : delta_qindex;
+ int rem_bits, thr;
+ int smallval = abs < DELTA_Q_SMALL ? 1 : 0;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf,
+ DELTA_Q_PROBS + 1);
+
+ if (!smallval) {
+ rem_bits = get_msb(abs - 1);
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits - 1, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);
+ }
+}
+
+static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int lf_id, int delta_lflevel, aom_writer *w) {
+ int sign = delta_lflevel < 0;
+ int abs = sign ? -delta_lflevel : delta_lflevel;
+ int rem_bits, thr;
+ int smallval = abs < DELTA_LF_SMALL ? 1 : 0;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (cm->delta_lf_multi) {
+ assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
+ : FRAME_LF_COUNT - 2));
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL),
+ ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1);
+ } else {
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,
+ DELTA_LF_PROBS + 1);
+ }
+
+ if (!smallval) {
+ rem_bits = get_msb(abs - 1);
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits - 1, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);
+ }
+}
+
+static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
+ int num) {
+ const TOKENEXTRA *p = *tp;
+ write_uniform(w, n, p->token); // The first color index.
+ ++p;
+ --num;
+ for (int i = 0; i < num; ++i) {
+ aom_write_symbol(w, p->token, p->color_map_cdf, n);
+ ++p;
+ }
+ *tp = p;
+}
+
+static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x,
+ const TOKENEXTRA **tp,
+ const TOKENEXTRA *const tok_end, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, int plane,
+ BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ if (tx_size == plane_tx_size || plane) {
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ const uint16_t eob = x->mbmi_ext->eobs[plane][block];
+ TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
+ x->mbmi_ext->dc_sign_ctx[plane][block] };
+ av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff,
+ eob, &txb_ctx);
+#if CONFIG_RD_DEBUG
+ TOKEN_STATS tmp_token_stats;
+ init_token_stats(&tmp_token_stats);
+ token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost;
+ token_stats->cost += tmp_token_stats.cost;
+#endif
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
+ for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) {
+ const int offsetr = blk_row + r;
+ const int offsetc = blk_col + c;
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+ pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize,
+ bit_depth, block, offsetr, offsetc, sub_txs,
+ token_stats);
+ block += step;
+ }
+ }
+ }
+}
+
+static INLINE void set_spatial_segment_id(const AV1_COMMON *const cm,
+ uint8_t *segment_ids,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int segment_id) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
+ int x, y;
+
+ for (y = 0; y < ymis; ++y)
+ for (x = 0; x < xmis; ++x)
+ segment_ids[mi_offset + y * cm->mi_cols + x] = segment_id;
+}
+
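+// Maps a segment id x to a code relative to the spatial prediction "ref" so
+// that values near the prediction get small codes. For example, with ref = 3
+// and max = 8: x = 3 -> 0, x = 4 -> 1, x = 2 -> 2, x = 5 -> 3, x = 1 -> 4.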
+int av1_neg_interleave(int x, int ref, int max) {
+ assert(x < max);
+ const int diff = x - ref;
+ if (!ref) return x;
+ if (ref >= (max - 1)) return -x + max - 1;
+ if (2 * ref < max) {
+ if (abs(diff) <= ref) {
+ if (diff > 0)
+ return (diff << 1) - 1;
+ else
+ return ((-diff) << 1);
+ }
+ return x;
+ } else {
+ if (abs(diff) < (max - ref)) {
+ if (diff > 0)
+ return (diff << 1) - 1;
+ else
+ return ((-diff) << 1);
+ }
+ return (max - x) - 1;
+ }
+}
+
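+// Writes the segment id of a block. For skipped blocks nothing is coded: the
+// block takes the spatially predicted segment id and both segment maps are
+// updated to it. Otherwise the id is coded relative to the spatial prediction
+// via av1_neg_interleave.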
+static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi,
+ aom_writer *w, const struct segmentation *seg,
+ struct segmentation_probs *segp, int mi_row,
+ int mi_col, int skip) {
+ if (!seg->enabled || !seg->update_map) return;
+
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int cdf_num;
+ const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num);
+
+ if (skip) {
+ // Still need to transmit tx size for intra blocks even if skip is
+ // true. Changing segment_id may make the tx size become invalid, e.g.
+ // when changing from lossless to lossy.
+ assert(is_inter_block(mbmi) || !cpi->has_lossless_segment);
+
+ set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+ mi_col, pred);
+ set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row,
+ mi_col, pred);
+ /* mbmi is read only but we need to update segment_id */
+ ((MB_MODE_INFO *)mbmi)->segment_id = pred;
+ return;
+ }
+
+ const int coded_id =
+ av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
+ aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
+ aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
+ set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+ mi_col, mbmi->segment_id);
+}
+
+#define WRITE_REF_BIT(bname, pname) \
+ aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2)
+
+// This function encodes the reference frame(s) of the block.
+static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ const int segment_id = mbmi->segment_id;
+
+ // When segment-level coding of this signal is disabled, or the segment
+ // allows multiple reference frame options, the reference frames are
+ // signaled explicitly in the final branch below.
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] ==
+ get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+ } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ } else {
+ // Signal whether the block uses compound prediction
+ // (if not specified at the frame/segment level).
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ if (is_comp_ref_allowed(mbmi->sb_type))
+ aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2);
+ } else {
+ assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
+ }
+
+ if (is_compound) {
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd),
+ 2);
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = mbmi->ref_frame[0] == BWDREF_FRAME;
+ WRITE_REF_BIT(bit, uni_comp_ref_p);
+
+ if (!bit) {
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit1, uni_comp_ref_p1);
+ if (bit1) {
+ const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, uni_comp_ref_p2);
+ }
+ } else {
+ assert(mbmi->ref_frame[1] == ALTREF_FRAME);
+ }
+
+ return;
+ }
+
+ assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+
+ const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME);
+ WRITE_REF_BIT(bit, comp_ref_p);
+
+ if (!bit) {
+ const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME;
+ WRITE_REF_BIT(bit1, comp_ref_p1);
+ } else {
+ const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, comp_ref_p2);
+ }
+
+ const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit_bwd, comp_bwdref_p);
+
+ if (!bit_bwd) {
+ WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1);
+ }
+
+ } else {
+ const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME &&
+ mbmi->ref_frame[0] >= BWDREF_FRAME);
+ WRITE_REF_BIT(bit0, single_ref_p1);
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit1, single_ref_p2);
+
+ if (!bit1) {
+ WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6);
+ }
+ } else {
+ const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[0] == GOLDEN_FRAME);
+ WRITE_REF_BIT(bit2, single_ref_p3);
+
+ if (!bit2) {
+ const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+ WRITE_REF_BIT(bit3, single_ref_p4);
+ } else {
+ const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+ WRITE_REF_BIT(bit4, single_ref_p5);
+ }
+ }
+ }
+ }
+}
+
+static void write_filter_intra_mode_info(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w) {
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra,
+ xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2);
+ if (mbmi->filter_intra_mode_info.use_filter_intra) {
+ const FILTER_INTRA_MODE mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode;
+ aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf,
+ FILTER_INTRA_MODES);
+ }
+ }
+}
+
+static void write_angle_delta(aom_writer *w, int angle_delta,
+ aom_cdf_prob *cdf) {
+ aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf,
+ 2 * MAX_ANGLE_DELTA + 1);
+}
+
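+// Writes the interpolation filter(s) of an inter block when the frame-level
+// filter is switchable and filtering is actually needed; with dual filters
+// enabled, one symbol is written per direction.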
+static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!av1_is_interp_needed(xd)) {
+ assert(mbmi->interp_filters ==
+ av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter)));
+ return;
+ }
+ if (cm->interp_filter == SWITCHABLE) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter =
+ av1_extract_interp_filter(mbmi->interp_filters, dir);
+ aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
+ SWITCHABLE_FILTERS);
+ ++cpi->interp_filter_selected[0][filter];
+ if (cm->seq_params.enable_dual_filter == 0) return;
+ }
+ }
+}
+
+// Transmit color values with delta encoding. Write the first value as a
+// literal, and then the delta between each value and the previous one.
+// "min_val" is the smallest possible value of the deltas.
+static void delta_encode_palette_colors(const int *colors, int num,
+ int bit_depth, int min_val,
+ aom_writer *w) {
+ if (num <= 0) return;
+ assert(colors[0] < (1 << bit_depth));
+ aom_write_literal(w, colors[0], bit_depth);
+ if (num == 1) return;
+ int max_delta = 0;
+ int deltas[PALETTE_MAX_SIZE];
+ memset(deltas, 0, sizeof(deltas));
+ for (int i = 1; i < num; ++i) {
+ assert(colors[i] < (1 << bit_depth));
+ const int delta = colors[i] - colors[i - 1];
+ deltas[i - 1] = delta;
+ assert(delta >= min_val);
+ if (delta > max_delta) max_delta = delta;
+ }
+ const int min_bits = bit_depth - 3;
+ int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits);
+ assert(bits <= bit_depth);
+ int range = (1 << bit_depth) - colors[0] - min_val;
+ aom_write_literal(w, bits - min_bits, 2);
+ for (int i = 0; i < num - 1; ++i) {
+ aom_write_literal(w, deltas[i] - min_val, bits);
+ range -= deltas[i];
+ bits = AOMMIN(bits, av1_ceil_log2(range));
+ }
+}
+
+// Transmit luma palette color values. First signal, for each color in the
+// color cache, whether it is used. Colors that are not in the cache are then
+// transmitted with delta encoding.
+static void write_palette_colors_y(const MACROBLOCKD *const xd,
+ const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[0];
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache =
+ av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n,
+ cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);
+ n_in_cache += found;
+ }
+ assert(n_in_cache + n_out_cache == n);
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w);
+}
+
+// Write chroma palette color values. The U channel is handled like the luma
+// channel. For the V channel, either delta encoding or raw values are used,
+// whichever costs fewer bits.
+static void write_palette_colors_uv(const MACROBLOCKD *const xd,
+ const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[1];
+ const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;
+ const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;
+ // U channel colors.
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache = av1_index_color_cache(
+ color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);
+ n_in_cache += found;
+ }
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w);
+
+ // V channel colors. Don't use color cache as the colors are not sorted.
+ const int max_val = 1 << bit_depth;
+ int zero_count = 0, min_bits_v = 0;
+ int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+ const int rate_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int rate_using_raw = bit_depth * n;
+ if (rate_using_delta < rate_using_raw) { // delta encoding
+ assert(colors_v[0] < (1 << bit_depth));
+ aom_write_bit(w, 1);
+ aom_write_literal(w, bits_v - min_bits_v, 2);
+ aom_write_literal(w, colors_v[0], bit_depth);
+ for (int i = 1; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit.
+ aom_write_literal(w, 0, bits_v);
+ continue;
+ }
+ const int delta = abs((int)colors_v[i] - colors_v[i - 1]);
+ const int sign_bit = colors_v[i] < colors_v[i - 1];
+ if (delta <= max_val - delta) {
+ aom_write_literal(w, delta, bits_v);
+ aom_write_bit(w, sign_bit);
+ } else {
+ aom_write_literal(w, max_val - delta, bits_v);
+ aom_write_bit(w, !sign_bit);
+ }
+ }
+ } else { // Transmit raw values.
+ aom_write_bit(w, 0);
+ for (int i = 0; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ aom_write_literal(w, colors_v[i], bit_depth);
+ }
+ }
+}
+
+static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, aom_writer *w) {
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ assert(av1_allow_palette(cm->allow_screen_content_tools, bsize));
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+ if (mbmi->mode == DC_PRED) {
+ const int n = pmi->palette_size[0];
+ const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd);
+ aom_write_symbol(
+ w, n > 0,
+ xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2);
+ if (n > 0) {
+ aom_write_symbol(w, n - PALETTE_MIN_SIZE,
+ xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
+ PALETTE_SIZES);
+ write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w);
+ }
+ }
+
+ const int uv_dc_pred =
+ num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+ if (uv_dc_pred) {
+ const int n = pmi->palette_size[1];
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+ aom_write_symbol(w, n > 0,
+ xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2);
+ if (n > 0) {
+ aom_write_symbol(w, n - PALETTE_MIN_SIZE,
+ xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
+ PALETTE_SIZES);
+ write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w);
+ }
+ }
+}
+
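+// Writes the transform type of a luma transform block when more than one
+// transform type is allowed (extended transform set, nonzero qindex, block
+// not skipped), using separate CDFs for inter and intra blocks.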
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ int blk_row, int blk_col, int plane, TX_SIZE tx_size,
+ aom_writer *w) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ // Only the Y plane's tx_type is transmitted.
+ if (plane > 0) return;
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
+ cm->reduced_tx_set_used);
+
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+ if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+ ((!cm->seg.enabled && cm->base_qindex > 0) ||
+ (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) &&
+ !mbmi->skip &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+ const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+ // eset == 0 should correspond to a set with only DCT_DCT and there
+ // is no need to send the tx_type
+ assert(eset > 0);
+ assert(av1_ext_tx_used[tx_set_type][tx_type]);
+ if (is_inter) {
+ aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type],
+ ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+ av1_num_ext_tx_set[tx_set_type]);
+ } else {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir =
+ fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+ aom_write_symbol(
+ w, av1_ext_tx_ind[tx_set_type][tx_type],
+ ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+ }
+}
+
+static void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
+ PREDICTION_MODE mode, aom_writer *w) {
+ aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]],
+ INTRA_MODES);
+}
+
+static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
+ UV_PREDICTION_MODE uv_mode,
+ PREDICTION_MODE y_mode,
+ CFL_ALLOWED_TYPE cfl_allowed, aom_writer *w) {
+ aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode],
+ UV_INTRA_MODES - !cfl_allowed);
+}
+
+static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx,
+ int joint_sign, aom_writer *w) {
+ aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ // Magnitudes are only signaled for nonzero codes.
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE);
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE);
+ }
+}
+
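+// Writes the CDEF strength index of the 64x64 CDEF unit containing this
+// block, at most once per unit and only at the first non-skip coded block.
+// Nothing is written for coded-lossless or intra-BC frames.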
+static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w,
+ int skip, int mi_col, int mi_row) {
+ if (cm->coded_lossless || cm->allow_intrabc) {
+ // Initialize to indicate no CDEF for safety.
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->nb_cdef_strengths = 1;
+ cm->cdef_uv_strengths[0] = 0;
+ return;
+ }
+
+ const int m = ~((1 << (6 - MI_SIZE_LOG2)) - 1);
+ const MB_MODE_INFO *mbmi =
+ cm->mi_grid_visible[(mi_row & m) * cm->mi_stride + (mi_col & m)];
+ // Initialize the presets at the top-left block of the superblock.
+ if (!(mi_row & (cm->seq_params.mib_size - 1)) &&
+ !(mi_col & (cm->seq_params.mib_size - 1))) {
+ xd->cdef_preset[0] = xd->cdef_preset[1] = xd->cdef_preset[2] =
+ xd->cdef_preset[3] = -1;
+ }
+
+ // Emit the CDEF strength at the first non-skip coded block of the unit.
+ const int mask = 1 << (6 - MI_SIZE_LOG2);
+ const int index = cm->seq_params.sb_size == BLOCK_128X128
+ ? !!(mi_col & mask) + 2 * !!(mi_row & mask)
+ : 0;
+ if (xd->cdef_preset[index] == -1 && !skip) {
+ aom_write_literal(w, mbmi->cdef_strength, cm->cdef_bits);
+ xd->cdef_preset[index] = mbmi->cdef_strength;
+ }
+}
+
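+// Writes the segment id in an inter frame, either before or after the skip
+// flag depending on seg->segid_preskip. Skipped blocks fall back to the
+// spatially predicted id; with temporal updates enabled, a predicted-id flag
+// is coded first.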
+static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
+ const struct segmentation *const seg,
+ struct segmentation_probs *const segp,
+ int mi_row, int mi_col, int skip,
+ int preskip) {
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ AV1_COMMON *const cm = &cpi->common;
+
+ if (seg->update_map) {
+ if (preskip) {
+ if (!seg->segid_preskip) return;
+ } else {
+ if (seg->segid_preskip) return;
+ if (skip) {
+ write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1);
+ if (seg->temporal_update) ((MB_MODE_INFO *)mbmi)->seg_id_predicted = 0;
+ return;
+ }
+ }
+ if (seg->temporal_update) {
+ const int pred_flag = mbmi->seg_id_predicted;
+ aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
+ aom_write_symbol(w, pred_flag, pred_cdf, 2);
+ if (!pred_flag) {
+ write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+ }
+ if (pred_flag) {
+ set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type,
+ mi_row, mi_col, mbmi->segment_id);
+ }
+ } else {
+ write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+ }
+ }
+}
+
+// If delta q is present, writes the delta_q index for the superblock.
+// Also writes the delta loop filter levels, if present.
+static void write_delta_q_params(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, int skip, aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cm->delta_q_present_flag) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int super_block_upper_left =
+ ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+ ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+
+ if ((bsize != cm->seq_params.sb_size || skip == 0) &&
+ super_block_upper_left) {
+ assert(mbmi->current_qindex > 0);
+ const int reduced_delta_qindex =
+ (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
+ write_delta_qindex(xd, reduced_delta_qindex, w);
+ xd->current_qindex = mbmi->current_qindex;
+ if (cm->delta_lf_present_flag) {
+ if (cm->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ int reduced_delta_lflevel =
+ (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
+ cm->delta_lf_res;
+ write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ }
+ } else {
+ int reduced_delta_lflevel =
+ (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+ cm->delta_lf_res;
+ write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+ }
+}
+
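+// Writes all intra prediction side information for a block: the luma mode
+// (and angle delta), the chroma mode (with CfL parameters and angle delta
+// where applicable), palette info, and filter-intra info.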
+static void write_intra_prediction_modes(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, int is_keyframe,
+ aom_writer *w) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PREDICTION_MODE mode = mbmi->mode;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+
+ // Y mode.
+ if (is_keyframe) {
+ const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+ write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w);
+ } else {
+ write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w);
+ }
+
+ // Y angle delta.
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+ if (use_angle_delta && av1_is_directional_mode(mode)) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+ ec_ctx->angle_delta_cdf[mode - V_PRED]);
+ }
+
+ // UV mode and UV angle delta.
+ if (!cm->seq_params.monochrome &&
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y)) {
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
+ if (uv_mode == UV_CFL_PRED)
+ write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+ if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) {
+ write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+ ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
+ }
+ }
+
+ // Palette.
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+ write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+ }
+
+ // Filter intra.
+ write_filter_intra_mode_info(cm, xd, mbmi, w);
+}
+
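+// Writes the full mode info of a block in an inter frame: segment id, skip
+// flags, CDEF and delta-q parameters, the intra/inter flag, and for inter
+// blocks the reference frames, prediction mode, motion vectors, inter-intra
+// and compound prediction data, motion mode, and interpolation filter.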
+static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int ref;
+
+ write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1);
+
+ write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+ assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
+ const int skip =
+ mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+ write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0);
+
+ write_cdef(cm, xd, w, skip, mi_col, mi_row);
+
+ write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+
+ if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+
+ if (mbmi->skip_mode) return;
+
+ if (!is_inter) {
+ write_intra_prediction_modes(cpi, mi_row, mi_col, 0, w);
+ } else {
+ int16_t mode_ctx;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ write_ref_frames(cm, xd, w);
+
+ mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+ // If segment skip is not enabled, code the prediction mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (is_inter_compound_mode(mode))
+ write_inter_compound_mode(xd, w, mode, mode_ctx);
+ else if (is_inter_singleref_mode(mode))
+ write_inter_mode(w, mode, ec_ctx, mode_ctx);
+
+ if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode))
+ write_drl_idx(ec_ctx, mbmi, mbmi_ext, w);
+ else
+ assert(mbmi->ref_mv_idx == 0);
+ }
+
+ if (mode == NEWMV || mode == NEW_NEWMV) {
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = av1_get_ref_mv(x, 1);
+ av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+ }
+
+ if (cpi->common.reference_mode != COMPOUND_REFERENCE &&
+ cpi->common.seq_params.enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+ const int bsize_group = size_group_lookup[bsize];
+ aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2);
+ if (interintra) {
+ aom_write_symbol(w, mbmi->interintra_mode,
+ ec_ctx->interintra_mode_cdf[bsize_group],
+ INTERINTRA_MODES);
+ if (is_interintra_wedge_used(bsize)) {
+ aom_write_symbol(w, mbmi->use_wedge_interintra,
+ ec_ctx->wedge_interintra_cdf[bsize], 2);
+ if (mbmi->use_wedge_interintra) {
+ aom_write_symbol(w, mbmi->interintra_wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], 16);
+ assert(mbmi->interintra_wedge_sign == 0);
+ }
+ }
+ }
+ }
+
+ if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w);
+
+ // First write an index indicating the current compound inter prediction
+ // mode group:
+ // Group 0: jnt_comp, compound_average
+ // Group 1: compound_diffwtd, wedge
+ if (has_second_ref(mbmi)) {
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params.enable_masked_compound;
+
+ if (masked_compound_used) {
+ const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+ aom_write_symbol(w, mbmi->comp_group_idx,
+ ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2);
+ } else {
+ assert(mbmi->comp_group_idx == 0);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ if (mbmi->compound_idx)
+ assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
+
+ if (cm->seq_params.enable_jnt_comp) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ aom_write_symbol(w, mbmi->compound_idx,
+ ec_ctx->compound_index_cdf[comp_index_ctx], 2);
+ } else {
+ assert(mbmi->compound_idx == 1);
+ }
+ } else {
+ assert(cpi->common.reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+ assert(masked_compound_used);
+ // compound_diffwtd, wedge
+ assert(mbmi->interinter_comp.type == COMPOUND_WEDGE ||
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+ aom_write_symbol(w, mbmi->interinter_comp.type - 1,
+ ec_ctx->compound_type_cdf[bsize],
+ COMPOUND_TYPES - 1);
+
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], 16);
+ aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
+ } else {
+ assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+ aom_write_literal(w, mbmi->interinter_comp.mask_type,
+ MAX_DIFFWTD_MASK_BITS);
+ }
+ }
+ }
+
+ write_mb_interp_filter(cpi, xd, w);
+ }
+}
+
+static void write_intrabc_info(MACROBLOCKD *xd,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ int use_intrabc = is_intrabc_block(mbmi);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2);
+ if (use_intrabc) {
+ assert(mbmi->mode == DC_PRED);
+ assert(mbmi->uv_mode == UV_DC_PRED);
+ assert(mbmi->motion_mode == SIMPLE_TRANSLATION);
+ int_mv dv_ref = mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv;
+ av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc);
+ }
+}
+
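+// Writes the mode info of a block in an intra-only frame: segment id, skip,
+// CDEF and delta-q parameters, intra block copy info (when allowed), and the
+// intra prediction modes.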
+static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ const int mi_row, const int mi_col,
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ if (seg->segid_preskip && seg->update_map)
+ write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+
+ const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
+
+ if (!seg->segid_preskip && seg->update_map)
+ write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, skip);
+
+ write_cdef(cm, xd, w, skip, mi_col, mi_row);
+
+ write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+
+ if (av1_allow_intrabc(cm)) {
+ write_intrabc_info(xd, mbmi_ext, w);
+ if (is_intrabc_block(mbmi)) return;
+ }
+
+ write_intra_prediction_modes(cpi, mi_row, mi_col, 1, w);
+}
+
+#if CONFIG_RD_DEBUG
+static void dump_mode_info(MB_MODE_INFO *mi) {
+ printf("\nmi->mi_row == %d\n", mi->mi_row);
+ printf("&& mi->mi_col == %d\n", mi->mi_col);
+ printf("&& mi->sb_type == %d\n", mi->sb_type);
+ printf("&& mi->tx_size == %d\n", mi->tx_size);
+ printf("&& mi->mode == %d\n", mi->mode);
+}
+static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
+ int plane) {
+ if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) {
+ int r, c;
+ printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
+ plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
+ printf("rd txb_coeff_cost_map\n");
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ printf("%d ", rd_stats->txb_coeff_cost_map[plane][r][c]);
+ }
+ printf("\n");
+ }
+
+ printf("pack txb_coeff_cost_map\n");
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ printf("%d ", token_stats->txb_coeff_cost_map[r][c]);
+ }
+ printf("\n");
+ }
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+#if ENC_MISMATCH_DEBUG
+static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ if (is_inter_block(mbmi)) {
+#define FRAME_TO_CHECK 11
+ if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+
+ int_mv mv[2];
+ int is_comp_ref = has_second_ref(mbmi);
+ int ref;
+
+ for (ref = 0; ref < 1 + is_comp_ref; ++ref)
+ mv[ref].as_mv = mbmi->mv[ref].as_mv;
+
+ if (!is_comp_ref) {
+ mv[1].as_int = 0;
+ }
+
+ MACROBLOCK *const x = &cpi->td.mb;
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int16_t mode_ctx =
+ is_comp_ref ? mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]
+ : av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame);
+
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+ int16_t zeromv_ctx = -1;
+ int16_t refmv_ctx = -1;
+
+ if (mbmi->mode != NEWMV) {
+ zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ if (mbmi->mode != GLOBALMV)
+ refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ }
+
+ printf(
+ "=== ENCODER ===: "
+ "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
+ "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
+ "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+ "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+ cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode,
+ bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+ mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+ mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx,
+ zeromv_ctx, refmv_ctx, mbmi->tx_size);
+ }
+ }
+}
+#endif // ENC_MISMATCH_DEBUG
+
+static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile,
+ aom_writer *w, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int bh, bw;
+ xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+ MB_MODE_INFO *m = xd->mi[0];
+
+ assert(m->sb_type <= cm->seq_params.sb_size ||
+ (m->sb_type >= BLOCK_SIZES && m->sb_type < BLOCK_SIZES_ALL));
+
+ bh = mi_size_high[m->sb_type];
+ bw = mi_size_wide[m->sb_type];
+
+ cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ if (frame_is_intra_only(cm)) {
+ write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext, mi_row, mi_col, w);
+ } else {
+ // The reference frame buffers must be set up before writing inter mode
+ // info, so that code reached from pack_inter_mode_mvs (e.g. the
+ // interpolation filter signaling) can look up whether the references are
+ // scaled.
+ set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]);
+
+#if ENC_MISMATCH_DEBUG
+ enc_dump_logs(cpi, mi_row, mi_col);
+#endif // ENC_MISMATCH_DEBUG
+
+ pack_inter_mode_mvs(cpi, mi_row, mi_col, w);
+ }
+}
+
+static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x,
+ MB_MODE_INFO *const mbmi, aom_writer *w,
+ const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end,
+ TOKEN_STATS *token_stats, const int row,
+ const int col, int *block, const int plane) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsizec =
+ scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ const int bkw = tx_size_wide_unit[max_tx_size];
+ const int bkh = tx_size_high_unit[max_tx_size];
+
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ int blk_row, blk_col;
+
+ const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+
+ const int unit_height =
+ AOMMIN(mu_blocks_high + (row >> pd->subsampling_y), num_4x4_h);
+ const int unit_width =
+ AOMMIN(mu_blocks_wide + (col >> pd->subsampling_x), num_4x4_w);
+ for (blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += bkh) {
+ for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += bkw) {
+ pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
+ cm->seq_params.bit_depth, *block, blk_row, blk_col,
+ max_tx_size, token_stats);
+ *block += step;
+ }
+ }
+}
+
+static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
+ aom_writer *w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MB_MODE_INFO *const mbmi = *(cm->mi_grid_visible + mi_offset);
+ int plane;
+ int bh, bw;
+ MACROBLOCK *const x = &cpi->td.mb;
+ (void)tok;
+ (void)tok_end;
+ xd->mi = cm->mi_grid_visible + mi_offset;
+
+ assert(mbmi->sb_type <= cm->seq_params.sb_size ||
+ (mbmi->sb_type >= BLOCK_SIZES && mbmi->sb_type < BLOCK_SIZES_ALL));
+
+ bh = mi_size_high[mbmi->sb_type];
+ bw = mi_size_wide[mbmi->sb_type];
+ cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ if (!mbmi->skip) {
+ if (!is_inter_block(mbmi))
+ av1_write_coeffs_mb(cm, x, mi_row, mi_col, w, mbmi->sb_type);
+
+ if (is_inter_block(mbmi)) {
+ int block[MAX_MB_PLANE] = { 0 };
+ const BLOCK_SIZE plane_bsize = mbmi->sb_type;
+ assert(plane_bsize == get_plane_block_size(mbmi->sb_type,
+ xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int num_4x4_w =
+ block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h =
+ block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ int row, col;
+ TOKEN_STATS token_stats;
+ init_token_stats(&token_stats);
+
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ assert(max_unit_bsize ==
+ get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ int mu_blocks_wide =
+ block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high =
+ block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high);
+
+ for (row = 0; row < num_4x4_h; row += mu_blocks_high) {
+ for (col = 0; col < num_4x4_w; col += mu_blocks_wide) {
+ for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
+ pd->subsampling_x, pd->subsampling_y)) {
+ continue;
+ }
+ write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats,
+ row, col, &block[plane], plane);
+ }
+ }
+#if CONFIG_RD_DEBUG
+ if (mbmi->sb_type >= BLOCK_8X8 &&
+ rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+ dump_mode_info(mbmi);
+ assert(0);
+ }
+#endif // CONFIG_RD_DEBUG
+ }
+ }
+ }
+}
+
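+// Writes one coded block: its mode info, palette color-index maps (when
+// palette is used), transform-size signaling, and residual tokens.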
+static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
+ aom_writer *w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end, int mi_row,
+ int mi_col) {
+ write_mbmi_b(cpi, tile, w, mi_row, mi_col);
+
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
+ const uint8_t palette_size_plane =
+ mbmi->palette_mode_info.palette_size[plane];
+ assert(!mbmi->skip_mode || !palette_size_plane);
+ if (palette_size_plane > 0) {
+ assert(mbmi->use_intrabc == 0);
+ assert(av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type));
+ int rows, cols;
+ av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
+ &cols);
+ assert(*tok < tok_end);
+ pack_map_tokens(w, tok, palette_size_plane, rows * cols);
+ }
+ }
+
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ int is_inter_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
+ int skip = mbmi->skip;
+ int segment_id = mbmi->segment_id;
+ if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+ !(is_inter_tx && skip) && !xd->lossless[segment_id]) {
+ if (is_inter_tx) { // This implies skip flag is 0.
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+ const int txbh = tx_size_high_unit[max_tx_size];
+ const int txbw = tx_size_wide_unit[max_tx_size];
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+ int idx, idy;
+ for (idy = 0; idy < height; idy += txbh)
+ for (idx = 0; idx < width; idx += txbw)
+ write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
+ } else {
+ write_selected_tx_size(xd, w);
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, 0, xd);
+ }
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
+ skip && is_inter_block(mbmi), xd);
+ }
+
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+}
+
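+// Writes the partition type of a block. When the block extends past the
+// bottom or right frame boundary, only a binary decision (split vs. the
+// forced horizontal/vertical partition) is coded, from a CDF gathered from
+// the full partition CDF.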
+static void write_partition(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, int hbs, int mi_row,
+ int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize,
+ aom_writer *w) {
+ const int is_partition_point = bsize >= BLOCK_8X8;
+
+ if (!is_partition_point) return;
+
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!has_rows && !has_cols) {
+ assert(p == PARTITION_SPLIT);
+ return;
+ }
+
+ if (has_rows && has_cols) {
+ aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx],
+ partition_cdf_length(bsize));
+ } else if (!has_rows && has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ assert(bsize > BLOCK_8X8);
+ aom_cdf_prob cdf[2];
+ partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
+ aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
+ } else {
+ assert(has_rows && !has_cols);
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ assert(bsize > BLOCK_8X8);
+ aom_cdf_prob cdf[2];
+ partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
+ aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
+ }
+}
+
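+// Recursively writes a superblock: loop-restoration coefficients for any
+// restoration units anchored in this block, the partition type, and then each
+// sub-block according to the chosen partition.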
+static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
+ aom_writer *const w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int quarter_step = mi_size_wide[bsize] / 4;
+ int i;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units_per_tile;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;
+ const RestorationUnitInfo *rui =
+ &cm->rst_info[plane].unit_info[runit_idx];
+ loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane,
+ cpi->td.counts);
+ }
+ }
+ }
+ }
+
+ write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_row + hbs < cm->mi_rows)
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_col + hbs < cm->mi_cols)
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+ subsize);
+ break;
+ case PARTITION_HORZ_A:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= cm->mi_rows) break;
+
+ write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= cm->mi_cols) break;
+
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col);
+ }
+ break;
+ default: assert(0);
+ }
+
+ // Update the partition context.
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
+ aom_writer *const w, int tile_row, int tile_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const int mi_row_start = tile->mi_row_start;
+ const int mi_row_end = tile->mi_row_end;
+ const int mi_col_start = tile->mi_col_start;
+ const int mi_col_end = tile->mi_col_end;
+ int mi_row, mi_col, sb_row_in_tile;
+
+ av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row);
+ av1_init_above_context(cm, xd, tile->tile_row);
+
+ if (cpi->common.delta_q_present_flag) {
+ xd->current_qindex = cpi->common.base_qindex;
+ if (cpi->common.delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ for (mi_row = mi_row_start; mi_row < mi_row_end;
+ mi_row += cm->seq_params.mib_size) {
+ sb_row_in_tile =
+ (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
+ const TOKENEXTRA *tok =
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start;
+ const TOKENEXTRA *tok_end =
+ tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count;
+
+ av1_zero_left_context(xd);
+
+ for (mi_col = mi_col_start; mi_col < mi_col_end;
+ mi_col += cm->seq_params.mib_size) {
+ write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
+ cm->seq_params.sb_size);
+ }
+ assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop);
+ }
+}
+
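+// Writes the frame-level loop-restoration configuration: the restoration type
+// of each plane and, when any plane uses restoration, the restoration unit
+// sizes (including whether chroma uses a smaller unit).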
+static void encode_restoration_mode(AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ assert(!cm->all_lossless);
+ if (!cm->seq_params.enable_restoration) return;
+ if (cm->allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ int all_none = 1, chroma_none = 1;
+ for (int p = 0; p < num_planes; ++p) {
+ RestorationInfo *rsi = &cm->rst_info[p];
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ all_none = 0;
+ chroma_none &= p == 0;
+ }
+ switch (rsi->frame_restoration_type) {
+ case RESTORE_NONE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_WIENER:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_SGRPROJ:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 1);
+ break;
+ case RESTORE_SWITCHABLE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 1);
+ break;
+ default: assert(0);
+ }
+ }
+ if (!all_none) {
+ assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+ cm->seq_params.sb_size == BLOCK_128X128);
+ const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+
+ RestorationInfo *rsi = &cm->rst_info[0];
+
+ assert(rsi->restoration_unit_size >= sb_size);
+ assert(RESTORATION_UNITSIZE_MAX == 256);
+
+ if (sb_size == 64) {
+ aom_wb_write_bit(wb, rsi->restoration_unit_size > 64);
+ }
+ if (rsi->restoration_unit_size > 64) {
+ aom_wb_write_bit(wb, rsi->restoration_unit_size > 128);
+ }
+ }
+
+ if (num_planes > 1) {
+ int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
+ if (s && !chroma_none) {
+ aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
+ cm->rst_info[0].restoration_unit_size);
+ assert(cm->rst_info[1].restoration_unit_size ==
+ cm->rst_info[0].restoration_unit_size ||
+ cm->rst_info[1].restoration_unit_size ==
+ (cm->rst_info[0].restoration_unit_size >> s));
+ assert(cm->rst_info[2].restoration_unit_size ==
+ cm->rst_info[1].restoration_unit_size);
+ } else if (!s) {
+ assert(cm->rst_info[1].restoration_unit_size ==
+ cm->rst_info[0].restoration_unit_size);
+ assert(cm->rst_info[2].restoration_unit_size ==
+ cm->rst_info[1].restoration_unit_size);
+ }
+ }
+}
+
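+// Writes the Wiener filter taps differentially against the previously coded
+// filter using subexponential codes; the outermost taps are only coded for
+// the full-size window. The reference filter is then updated for the next
+// unit.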
+static void write_wiener_filter(int wiener_win, const WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info, aom_writer *wb) {
+ if (wiener_win == WIENER_WIN)
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+ else
+ assert(wiener_info->vfilter[0] == 0 &&
+ wiener_info->vfilter[WIENER_WIN - 1] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+ if (wiener_win == WIENER_WIN)
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+ else
+ assert(wiener_info->hfilter[0] == 0 &&
+ wiener_info->hfilter[WIENER_WIN - 1] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+ aom_write_primitive_refsubexpfin(
+ wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+ memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+}
+
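+// Writes the self-guided restoration parameters: the parameter-set index (ep)
+// followed by the xqd projection coefficients that the set does not fix,
+// coded differentially against the previous unit's values.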
+static void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info,
+ aom_writer *wb) {
+ aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
+ const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+
+ if (params->r[0] == 0) {
+ assert(sgrproj_info->xqd[0] == 0);
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ } else if (params->r[1] == 0) {
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ } else {
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ aom_write_primitive_refsubexpfin(
+ wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ }
+
+ memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+}
+
+static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd,
+ const RestorationUnitInfo *rui,
+ aom_writer *const w, int plane,
+ FRAME_COUNTS *counts) {
+ const RestorationInfo *rsi = cm->rst_info + plane;
+ RestorationType frame_rtype = rsi->frame_restoration_type;
+ if (frame_rtype == RESTORE_NONE) return;
+
+ (void)counts;
+ assert(!cm->all_lossless);
+
+ const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
+ WienerInfo *wiener_info = xd->wiener_info + plane;
+ SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;
+ RestorationType unit_rtype = rui->restoration_type;
+
+ if (frame_rtype == RESTORE_SWITCHABLE) {
+ aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES);
+#if CONFIG_ENTROPY_STATS
+ ++counts->switchable_restore[unit_rtype];
+#endif
+ switch (unit_rtype) {
+ case RESTORE_WIENER:
+ write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w);
+ break;
+ case RESTORE_SGRPROJ:
+ write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w);
+ break;
+ default: assert(unit_rtype == RESTORE_NONE); break;
+ }
+ } else if (frame_rtype == RESTORE_WIENER) {
+ aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+ xd->tile_ctx->wiener_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->wiener_restore[unit_rtype != RESTORE_NONE];
+#endif
+ if (unit_rtype != RESTORE_NONE) {
+ write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w);
+ }
+ } else if (frame_rtype == RESTORE_SGRPROJ) {
+ aom_write_symbol(w, unit_rtype != RESTORE_NONE,
+ xd->tile_ctx->sgrproj_restore_cdf, 2);
+#if CONFIG_ENTROPY_STATS
+ ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
+#endif
+ if (unit_rtype != RESTORE_NONE) {
+ write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w);
+ }
+ }
+}
+
+static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+ assert(!cm->coded_lossless);
+ if (cm->allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ int i;
+ struct loopfilter *lf = &cm->lf;
+
+ // Encode the loop filter levels and sharpness
+ aom_wb_write_literal(wb, lf->filter_level[0], 6);
+ aom_wb_write_literal(wb, lf->filter_level[1], 6);
+ if (num_planes > 1) {
+ if (lf->filter_level[0] || lf->filter_level[1]) {
+ aom_wb_write_literal(wb, lf->filter_level_u, 6);
+ aom_wb_write_literal(wb, lf->filter_level_v, 6);
+ }
+ }
+ aom_wb_write_literal(wb, lf->sharpness_level, 3);
+
+ // Write out loop filter deltas applied at the MB level based on mode or
+ // ref frame (if they are enabled).
+ aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+
+ if (lf->mode_ref_delta_enabled) {
+ aom_wb_write_bit(wb, lf->mode_ref_delta_update);
+
+ if (lf->mode_ref_delta_update) {
+ const int prime_idx = cm->primary_ref_frame;
+ const int buf_idx =
+ prime_idx == PRIMARY_REF_NONE ? -1 : cm->frame_refs[prime_idx].idx;
+ int8_t last_ref_deltas[REF_FRAMES];
+ if (prime_idx == PRIMARY_REF_NONE || buf_idx < 0) {
+ av1_set_default_ref_deltas(last_ref_deltas);
+ } else {
+ memcpy(last_ref_deltas, cm->buffer_pool->frame_bufs[buf_idx].ref_deltas,
+ REF_FRAMES);
+ }
+ for (i = 0; i < REF_FRAMES; i++) {
+ const int delta = lf->ref_deltas[i];
+ const int changed = delta != last_ref_deltas[i];
+ aom_wb_write_bit(wb, changed);
+ if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+ }
+
+ int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
+ if (prime_idx == PRIMARY_REF_NONE || buf_idx < 0) {
+ av1_set_default_mode_deltas(last_mode_deltas);
+ } else {
+ memcpy(last_mode_deltas,
+ cm->buffer_pool->frame_bufs[buf_idx].mode_deltas,
+ MAX_MODE_LF_DELTAS);
+ }
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ const int delta = lf->mode_deltas[i];
+ const int changed = delta != last_mode_deltas[i];
+ aom_wb_write_bit(wb, changed);
+ if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+ }
+ }
+ }
+}
+
+static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+ assert(!cm->coded_lossless);
+ if (!cm->seq_params.enable_cdef) return;
+ if (cm->allow_intrabc) return;
+ const int num_planes = av1_num_planes(cm);
+ int i;
+ aom_wb_write_literal(wb, cm->cdef_pri_damping - 3, 2);
+ assert(cm->cdef_pri_damping == cm->cdef_sec_damping);
+ aom_wb_write_literal(wb, cm->cdef_bits, 2);
+ for (i = 0; i < cm->nb_cdef_strengths; i++) {
+ aom_wb_write_literal(wb, cm->cdef_strengths[i], CDEF_STRENGTH_BITS);
+ if (num_planes > 1)
+ aom_wb_write_literal(wb, cm->cdef_uv_strengths[i], CDEF_STRENGTH_BITS);
+ }
+}
+
+static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) {
+ if (delta_q != 0) {
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_inv_signed_literal(wb, delta_q, 6);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ }
+}
+
+static void encode_quantization(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ const int num_planes = av1_num_planes(cm);
+
+ aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+ write_delta_q(wb, cm->y_dc_delta_q);
+ if (num_planes > 1) {
+ int diff_uv_delta = (cm->u_dc_delta_q != cm->v_dc_delta_q) ||
+ (cm->u_ac_delta_q != cm->v_ac_delta_q);
+ if (cm->seq_params.separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta);
+ write_delta_q(wb, cm->u_dc_delta_q);
+ write_delta_q(wb, cm->u_ac_delta_q);
+ if (diff_uv_delta) {
+ write_delta_q(wb, cm->v_dc_delta_q);
+ write_delta_q(wb, cm->v_ac_delta_q);
+ }
+ }
+ aom_wb_write_bit(wb, cm->using_qmatrix);
+ if (cm->using_qmatrix) {
+ aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS);
+ aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS);
+ if (!cm->seq_params.separate_uv_delta_q)
+ assert(cm->qm_u == cm->qm_v);
+ else
+ aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS);
+ }
+}
+
+static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
+ struct aom_write_bit_buffer *wb) {
+ int i, j;
+ struct segmentation *seg = &cm->seg;
+
+ aom_wb_write_bit(wb, seg->enabled);
+ if (!seg->enabled) return;
+
+ // Write update flags
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+ assert(seg->update_map == 1);
+ seg->temporal_update = 0;
+ assert(seg->update_data == 1);
+ } else {
+ aom_wb_write_bit(wb, seg->update_map);
+ if (seg->update_map) {
+ // Select the coding strategy (temporal or spatial)
+ av1_choose_segmap_coding_method(cm, xd);
+ aom_wb_write_bit(wb, seg->temporal_update);
+ }
+ aom_wb_write_bit(wb, seg->update_data);
+ }
+
+ // Segmentation data
+ if (seg->update_data) {
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ const int active = segfeature_active(seg, i, j);
+ aom_wb_write_bit(wb, active);
+ if (active) {
+ const int data_max = av1_seg_feature_data_max(j);
+ const int data_min = -data_max;
+ const int ubits = get_unsigned_bits(data_max);
+ const int data = clamp(get_segdata(seg, i, j), data_min, data_max);
+
+ if (av1_is_segfeature_signed(j)) {
+ aom_wb_write_inv_signed_literal(wb, data, ubits);
+ } else {
+ aom_wb_write_literal(wb, data, ubits);
+ }
+ }
+ }
+ }
+ }
+}
+
+static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode,
+ struct aom_write_bit_buffer *wb) {
+ if (cm->coded_lossless) {
+ *mode = ONLY_4X4;
+ return;
+ }
+ aom_wb_write_bit(wb, *mode == TX_MODE_SELECT);
+}
+
+static void write_frame_interp_filter(InterpFilter filter,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_bit(wb, filter == SWITCHABLE);
+ if (filter != SWITCHABLE)
+ aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+}
+
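+// If only one of the switchable filters was actually used for the frame, and
+// that filter is EIGHTTAP_REGULAR, set the frame-level interp_filter to it
+// instead of signaling SWITCHABLE.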
+static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
+ if (cm->interp_filter == SWITCHABLE) {
+ // Check to see if only one of the filters is actually used
+ int count[SWITCHABLE_FILTERS];
+ int i, j, c = 0;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ count[i] = 0;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ count[i] += counts->switchable_interp[j][i];
+ c += (count[i] > 0);
+ }
+ if (c == 1) {
+ // Only one filter is used. So set the filter at frame level
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ if (count[i]) {
+ if (i == EIGHTTAP_REGULAR) cm->interp_filter = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+// Same function as write_uniform, but writing to the uncompressed header wb.
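+// With l = floor(log2(n)) + 1 and m = 2^l - n, values below m are written in
+// l - 1 bits; the remaining values take l bits.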
+static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return;
+ if (v < m) {
+ aom_wb_write_literal(wb, v, l - 1);
+ } else {
+ aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
+ aom_wb_write_literal(wb, (v - m) & 1, 1);
+ }
+}
+
+static void write_tile_info_max_tile(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ int width_mi = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+ int height_mi = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int width_sb = width_mi >> cm->seq_params.mib_size_log2;
+ int height_sb = height_mi >> cm->seq_params.mib_size_log2;
+ int size_sb, i;
+
+ aom_wb_write_bit(wb, cm->uniform_tile_spacing_flag);
+
+ if (cm->uniform_tile_spacing_flag) {
+ // Uniformly spaced tiles with power-of-two numbers of rows and columns
+ // tile columns
+ int ones = cm->log2_tile_cols - cm->min_log2_tile_cols;
+ while (ones--) {
+ aom_wb_write_bit(wb, 1);
+ }
+ if (cm->log2_tile_cols < cm->max_log2_tile_cols) {
+ aom_wb_write_bit(wb, 0);
+ }
+
+ // rows
+ ones = cm->log2_tile_rows - cm->min_log2_tile_rows;
+ while (ones--) {
+ aom_wb_write_bit(wb, 1);
+ }
+ if (cm->log2_tile_rows < cm->max_log2_tile_rows) {
+ aom_wb_write_bit(wb, 0);
+ }
+ } else {
+ // Explicit tiles with configurable tile widths and heights
+ // columns
+ for (i = 0; i < cm->tile_cols; i++) {
+ size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
+ wb_write_uniform(wb, AOMMIN(width_sb, cm->max_tile_width_sb),
+ size_sb - 1);
+ width_sb -= size_sb;
+ }
+ assert(width_sb == 0);
+
+ // rows
+ for (i = 0; i < cm->tile_rows; i++) {
+ size_sb = cm->tile_row_start_sb[i + 1] - cm->tile_row_start_sb[i];
+ wb_write_uniform(wb, AOMMIN(height_sb, cm->max_tile_height_sb),
+ size_sb - 1);
+ height_sb -= size_sb;
+ }
+ assert(height_sb == 0);
+ }
+}
+
+static void write_tile_info(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ write_tile_info_max_tile(cm, wb);
+
+ *saved_wb = *wb;
+ if (cm->tile_rows * cm->tile_cols > 1) {
+ // tile id used for cdf update
+ aom_wb_write_literal(wb, 0, cm->log2_tile_cols + cm->log2_tile_rows);
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(wb, 3, 2);
+ }
+}
+
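+// Byte-aligns the writer and, when the frame has more than one tile, writes
+// two 2-bit placeholders for the tile column size bytes and tile size bytes
+// that are overwritten once the final sizes are known.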
+static void write_ext_tile_info(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ // This information is stored as a separate byte.
+ int mod = wb->bit_offset % CHAR_BIT;
+ if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod);
+ assert(aom_wb_is_byte_aligned(wb));
+
+ *saved_wb = *wb;
+ if (cm->tile_rows * cm->tile_cols > 1) {
+ // Note that the last item in the uncompressed header is the data
+ // describing tile configuration.
+ // Number of bytes in tile column size - 1
+ aom_wb_write_literal(wb, 0, 2);
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(wb, 0, 2);
+ }
+}
+
+static int get_refresh_mask(AV1_COMP *cpi) {
+ if ((cpi->common.frame_type == KEY_FRAME && cpi->common.show_frame) ||
+ frame_is_sframe(&cpi->common))
+ return 0xFF;
+
+ int refresh_mask = 0;
+
+ // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
+ // notified to get LAST3_FRAME refreshed and then the virtual indexes for all
+ // the 3 LAST reference frames will be updated accordingly, i.e.:
+ // (1) The original virtual index for LAST3_FRAME will become the new virtual
+ // index for LAST_FRAME; and
+ // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME will be
+ // shifted and become the new virtual indexes for LAST2_FRAME and
+ // LAST3_FRAME.
+ refresh_mask |=
+ (cpi->refresh_last_frame << cpi->ref_fb_idx[LAST_REF_FRAMES - 1]);
+#if USE_SYMM_MULTI_LAYER
+ refresh_mask |=
+ (cpi->new_bwdref_update_rule == 1)
+ ? (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[EXTREF_FRAME - 1])
+ : (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]);
+#else
+ refresh_mask |=
+ (cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1]);
+#endif
+ refresh_mask |=
+ (cpi->refresh_alt2_ref_frame << cpi->ref_fb_idx[ALTREF2_FRAME - 1]);
+
+ if (av1_preserve_existing_gf(cpi)) {
+ // We have decided to preserve the previously existing golden frame as our
+ // new ARF frame. However, in the short term we leave it in the GF slot and,
+ // if we're updating the GF with the current decoded frame, we save it
+ // instead to the ARF slot.
+ // Later, in the function av1_encoder.c:av1_update_reference_frames() we
+ // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it
+ // there so that it can be done outside of the recode loop.
+ // Note: This is highly specific to the use of ARF as a forward reference,
+ // and this needs to be generalized as other uses are implemented
+ // (like RTC/temporal scalability).
+
+ if (cpi->preserve_arf_as_gld) {
+ return refresh_mask;
+ } else {
+ return refresh_mask |
+ (cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1]);
+ }
+ } else {
+ const int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+ return refresh_mask |
+ (cpi->refresh_golden_frame << cpi->ref_fb_idx[GOLDEN_FRAME - 1]) |
+ (cpi->refresh_alt_ref_frame << arf_idx);
+ }
+}
+
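+// For large-scale tiles: searches for an identical, already-coded tile (the
+// tile directly above, possibly following that tile's own copy offset) and
+// returns the row offset to it for tile copy mode, or 0 if none is found.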
+static INLINE int find_identical_tile(
+ const int tile_row, const int tile_col,
+ TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
+ const MV32 candidate_offset[1] = { { 1, 0 } };
+ const uint8_t *const cur_tile_data =
+ tile_buffers[tile_row][tile_col].data + 4;
+ const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+ int i;
+
+ if (tile_row == 0) return 0;
+
+ // TODO(yunqingwang): For now, only the above tile is checked and used.
+ // More candidates, such as the left tile, can be added later.
+ for (i = 0; i < 1; i++) {
+ int row_offset = candidate_offset[0].row;
+ int col_offset = candidate_offset[0].col;
+ int row = tile_row - row_offset;
+ int col = tile_col - col_offset;
+ uint8_t tile_hdr;
+ const uint8_t *tile_data;
+ TileBufferEnc *candidate;
+
+ if (row < 0 || col < 0) continue;
+
+ tile_hdr = *(tile_buffers[row][col].data);
+
+ // Read out the tile copy mode (tcm) bit
+ if ((tile_hdr >> 7) == 1) {
+ // The candidate is a copy tile itself
+ row_offset += tile_hdr & 0x7f;
+ row = tile_row - row_offset;
+ }
+
+ candidate = &tile_buffers[row][col];
+
+ if (row_offset >= 128 || candidate->size != cur_tile_size) continue;
+
+ tile_data = candidate->data + 4;
+
+ if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue;
+
+ // Identical tile found
+ assert(row_offset > 0);
+ return row_offset;
+ }
+
+ // No identical tile found
+ return 0;
+}
+
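+// Writes a flag indicating whether an explicit render size is coded and, if
+// it is, the render width and height minus one as 16-bit literals.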
+static void write_render_size(const AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ const int scaling_active = av1_resize_scaled(cm);
+ aom_wb_write_bit(wb, scaling_active);
+ if (scaling_active) {
+ aom_wb_write_literal(wb, cm->render_width - 1, 16);
+ aom_wb_write_literal(wb, cm->render_height - 1, 16);
+ }
+}
+
+static void write_superres_scale(const AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ if (!seq_params->enable_superres) {
+ assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
+ return;
+ }
+
+ // First bit is whether to scale or not
+ if (cm->superres_scale_denominator == SCALE_NUMERATOR) {
+ aom_wb_write_bit(wb, 0); // no scaling
+ } else {
+ aom_wb_write_bit(wb, 1); // scaling, write scale factor
+ assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
+ assert(cm->superres_scale_denominator <
+ SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
+ aom_wb_write_literal(
+ wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
+ SUPERRES_SCALE_BITS);
+ }
+}
+
+static void write_frame_size(const AV1_COMMON *cm, int frame_size_override,
+ struct aom_write_bit_buffer *wb) {
+ const int coded_width = cm->superres_upscaled_width - 1;
+ const int coded_height = cm->superres_upscaled_height - 1;
+
+ if (frame_size_override) {
+ const SequenceHeader *seq_params = &cm->seq_params;
+ int num_bits_width = seq_params->num_bits_width;
+ int num_bits_height = seq_params->num_bits_height;
+ aom_wb_write_literal(wb, coded_width, num_bits_width);
+ aom_wb_write_literal(wb, coded_height, num_bits_height);
+ }
+
+ write_superres_scale(cm, wb);
+ write_render_size(cm, wb);
+}
+
+static void write_frame_size_with_refs(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ int found = 0;
+
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
+
+ if (cfg != NULL) {
+ found = cm->superres_upscaled_width == cfg->y_crop_width &&
+ cm->superres_upscaled_height == cfg->y_crop_height;
+ found &= cm->render_width == cfg->render_width &&
+ cm->render_height == cfg->render_height;
+ }
+ aom_wb_write_bit(wb, found);
+ if (found) {
+ write_superres_scale(cm, wb);
+ break;
+ }
+ }
+
+ if (!found) {
+ int frame_size_override = 1; // Always equal to 1 in this function
+ write_frame_size(cm, frame_size_override, wb);
+ }
+}
+
+static void write_profile(BITSTREAM_PROFILE profile,
+ struct aom_write_bit_buffer *wb) {
+ assert(profile >= PROFILE_0 && profile < MAX_PROFILES);
+ aom_wb_write_literal(wb, profile, PROFILE_BITS);
+}
+
+static void write_bitdepth(const SequenceHeader *const seq_params,
+ struct aom_write_bit_buffer *wb) {
+ // Profile 0/1: [0] 8-bit, [1] 10-bit
+ // Profile 2: [0] 8-bit, [10] 10-bit, [11] 12-bit
+ aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ? 0 : 1);
+ if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) {
+ aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1);
+ }
+}
+
+static void write_color_config(const SequenceHeader *const seq_params,
+ struct aom_write_bit_buffer *wb) {
+ write_bitdepth(seq_params, wb);
+ const int is_monochrome = seq_params->monochrome;
+ // monochrome bit
+ if (seq_params->profile != PROFILE_1)
+ aom_wb_write_bit(wb, is_monochrome);
+ else
+ assert(!is_monochrome);
+ if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) {
+ aom_wb_write_bit(wb, 0); // No color description present
+ } else {
+ aom_wb_write_bit(wb, 1); // Color description present
+ aom_wb_write_literal(wb, seq_params->color_primaries, 8);
+ aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8);
+ aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8);
+ }
+ if (is_monochrome) {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, seq_params->color_range);
+ return;
+ }
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients ==
+ AOM_CICP_MC_IDENTITY) { // it would be better to remove this
+ // dependency too
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ assert(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12));
+ } else {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ aom_wb_write_bit(wb, seq_params->color_range);
+ if (seq_params->profile == PROFILE_0) {
+ // 420 only
+ assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1);
+ } else if (seq_params->profile == PROFILE_1) {
+ // 444 only
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ } else if (seq_params->profile == PROFILE_2) {
+ if (seq_params->bit_depth == AOM_BITS_12) {
+ // 420, 444 or 422
+ aom_wb_write_bit(wb, seq_params->subsampling_x);
+ if (seq_params->subsampling_x == 0) {
+ assert(seq_params->subsampling_y == 0 &&
+ "4:4:0 subsampling not allowed in AV1");
+ } else {
+ aom_wb_write_bit(wb, seq_params->subsampling_y);
+ }
+ } else {
+ // 422 only
+ assert(seq_params->subsampling_x == 1 &&
+ seq_params->subsampling_y == 0);
+ }
+ }
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+ }
+ if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) {
+ aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2);
+ }
+ }
+ aom_wb_write_bit(wb, seq_params->separate_uv_delta_q);
+}
+
+static void write_timing_info_header(AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_unsigned_literal(wb, cm->timing_info.num_units_in_display_tick,
+ 32); // Number of units in tick
+ aom_wb_write_unsigned_literal(wb, cm->timing_info.time_scale,
+ 32); // Time scale
+ aom_wb_write_bit(
+ wb,
+ cm->timing_info.equal_picture_interval); // Equal picture interval bit
+ if (cm->timing_info.equal_picture_interval) {
+ aom_wb_write_uvlc(
+ wb,
+ cm->timing_info.num_ticks_per_picture - 1); // ticks per picture
+ }
+}
+
+static void write_decoder_model_info(AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_literal(
+ wb, cm->buffer_model.encoder_decoder_buffer_delay_length - 1, 5);
+ aom_wb_write_unsigned_literal(wb, cm->buffer_model.num_units_in_decoding_tick,
+ 32); // Number of units in decoding tick
+ aom_wb_write_literal(wb, cm->buffer_model.buffer_removal_time_length - 1, 5);
+ aom_wb_write_literal(wb, cm->buffer_model.frame_presentation_time_length - 1,
+ 5);
+}
+
+static void write_dec_model_op_parameters(AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb,
+ int op_num) {
+ if (op_num > MAX_NUM_OPERATING_POINTS)
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Encoder does not support %d decoder model operating points", op_num);
+
+ // aom_wb_write_bit(wb, cm->op_params[op_num].has_parameters);
+ // if (!cm->op_params[op_num].has_parameters) return;
+
+ aom_wb_write_unsigned_literal(
+ wb, cm->op_params[op_num].decoder_buffer_delay,
+ cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+ aom_wb_write_unsigned_literal(
+ wb, cm->op_params[op_num].encoder_buffer_delay,
+ cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+ aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag);
+
+ cm->op_frame_timing[op_num].buffer_removal_time =
+ 0; // reset the decoded frame counter
+}
+
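+// Writes frame_presentation_time using the bit length signaled in the
+// decoder model info.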
+static void write_tu_pts_info(AV1_COMMON *const cm,
+ struct aom_write_bit_buffer *wb) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->frame_presentation_time,
+ cm->buffer_model.frame_presentation_time_length);
+}
+
+static void write_film_grain_params(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ aom_film_grain_t *pars = &cm->film_grain_params;
+
+ cm->cur_frame->film_grain_params = *pars;
+
+ aom_wb_write_bit(wb, pars->apply_grain);
+ if (!pars->apply_grain) return;
+
+ aom_wb_write_literal(wb, pars->random_seed, 16);
+
+ pars->random_seed += 3381; // Changing random seed for film grain
+ if (!pars->random_seed) // Random seed should not be zero
+ pars->random_seed += 7391;
+ if (cm->frame_type == INTER_FRAME)
+ aom_wb_write_bit(wb, pars->update_parameters);
+ else
+ pars->update_parameters = 1;
+ if (!pars->update_parameters) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int ref_frame, ref_idx, buf_idx;
+ for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {
+ ref_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ assert(ref_idx != INVALID_IDX);
+ buf_idx = cm->ref_frame_map[ref_idx];
+ if (frame_bufs[buf_idx].film_grain_params_present &&
+ memcmp(pars, &frame_bufs[buf_idx].film_grain_params, sizeof(*pars))) {
+ break;
+ }
+ }
+ assert(ref_frame < REF_FRAMES);
+ aom_wb_write_literal(wb, ref_idx, 3);
+ return;
+ }
+
+ // Scaling functions parameters
+ aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14
+ for (int i = 0; i < pars->num_y_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
+ }
+
+ if (!cm->seq_params.monochrome)
+ aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
+ else
+ pars->chroma_scaling_from_luma = 0; // for monochrome override to 0
+
+ if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
+ ((cm->seq_params.subsampling_x == 1) &&
+ (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) {
+ pars->num_cb_points = 0;
+ pars->num_cr_points = 0;
+ } else {
+ aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10
+ for (int i = 0; i < pars->num_cb_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8);
+ }
+
+ aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10
+ for (int i = 0; i < pars->num_cr_points; i++) {
+ aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8);
+ aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8);
+ }
+ }
+
+ aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value
+
+ // AR coefficients
+ // Only sent if the corresponding scaling function has
+ // more than 0 points
+
+ aom_wb_write_literal(wb, pars->ar_coeff_lag, 2);
+
+ int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ int num_pos_chroma = num_pos_luma;
+ if (pars->num_y_points > 0) ++num_pos_chroma;
+
+ if (pars->num_y_points)
+ for (int i = 0; i < num_pos_luma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8);
+
+ if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8);
+
+ if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8);
+
+ aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2); // 6 + value
+
+ aom_wb_write_literal(wb, pars->grain_scale_shift, 2);
+
+ if (pars->num_cb_points) {
+ aom_wb_write_literal(wb, pars->cb_mult, 8);
+ aom_wb_write_literal(wb, pars->cb_luma_mult, 8);
+ aom_wb_write_literal(wb, pars->cb_offset, 9);
+ }
+
+ if (pars->num_cr_points) {
+ aom_wb_write_literal(wb, pars->cr_mult, 8);
+ aom_wb_write_literal(wb, pars->cr_luma_mult, 8);
+ aom_wb_write_literal(wb, pars->cr_offset, 9);
+ }
+
+ aom_wb_write_bit(wb, pars->overlap_flag);
+
+ aom_wb_write_bit(wb, pars->clip_to_restricted_range);
+}
+
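+// Writes the superblock size as a single bit: 1 for 128x128, 0 for 64x64.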
+static void write_sb_size(SequenceHeader *seq_params,
+ struct aom_write_bit_buffer *wb) {
+ (void)seq_params;
+ (void)wb;
+ assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]);
+ assert(seq_params->mib_size == 1 << seq_params->mib_size_log2);
+ assert(seq_params->sb_size == BLOCK_128X128 ||
+ seq_params->sb_size == BLOCK_64X64);
+ aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
+}
+
+static void write_sequence_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *seq_params = &cm->seq_params;
+
+ int max_frame_width = cpi->oxcf.forced_max_frame_width
+ ? cpi->oxcf.forced_max_frame_width
+ : cpi->oxcf.width;
+ int max_frame_height = cpi->oxcf.forced_max_frame_height
+ ? cpi->oxcf.forced_max_frame_height
+ : cpi->oxcf.height;
+ // max((int)ceil(log2(max_frame_width)), 1)
+ const int num_bits_width =
+ (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1;
+ // max((int)ceil(log2(max_frame_height)), 1)
+ const int num_bits_height =
+ (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1;
+ assert(num_bits_width <= 16);
+ assert(num_bits_height <= 16);
+
+ seq_params->num_bits_width = num_bits_width;
+ seq_params->num_bits_height = num_bits_height;
+ seq_params->max_frame_width = max_frame_width;
+ seq_params->max_frame_height = max_frame_height;
+
+ aom_wb_write_literal(wb, num_bits_width - 1, 4);
+ aom_wb_write_literal(wb, num_bits_height - 1, 4);
+ aom_wb_write_literal(wb, max_frame_width - 1, num_bits_width);
+ aom_wb_write_literal(wb, max_frame_height - 1, num_bits_height);
+
+ /* Placeholder for actually writing to the bitstream */
+ if (!seq_params->reduced_still_picture_hdr) {
+ seq_params->frame_id_numbers_present_flag =
+ cm->large_scale_tile ? 0 : cm->error_resilient_mode;
+ seq_params->frame_id_length = FRAME_ID_LENGTH;
+ seq_params->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
+ aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
+ if (seq_params->frame_id_numbers_present_flag) {
+ // We must always have delta_frame_id_length < frame_id_length,
+ // in order for a frame to be referenced with a unique delta.
+ // Avoid wasting bits by using a coding that enforces this restriction.
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4);
+ aom_wb_write_literal(
+ wb,
+ seq_params->frame_id_length - seq_params->delta_frame_id_length - 1,
+ 3);
+ }
+ }
+
+ write_sb_size(seq_params, wb);
+
+ aom_wb_write_bit(wb, seq_params->enable_filter_intra);
+ aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter);
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ aom_wb_write_bit(wb, seq_params->enable_interintra_compound);
+ aom_wb_write_bit(wb, seq_params->enable_masked_compound);
+ aom_wb_write_bit(wb, seq_params->enable_warped_motion);
+ aom_wb_write_bit(wb, seq_params->enable_dual_filter);
+
+ aom_wb_write_bit(wb, seq_params->enable_order_hint);
+
+ if (seq_params->enable_order_hint) {
+ aom_wb_write_bit(wb, seq_params->enable_jnt_comp);
+ aom_wb_write_bit(wb, seq_params->enable_ref_frame_mvs);
+ }
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, 1);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_screen_content_tools);
+ }
+ if (seq_params->force_screen_content_tools > 0) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, 1);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_integer_mv);
+ }
+ } else {
+ assert(seq_params->force_integer_mv == 2);
+ }
+ if (seq_params->enable_order_hint)
+ aom_wb_write_literal(wb, seq_params->order_hint_bits_minus_1, 3);
+ }
+
+ aom_wb_write_bit(wb, seq_params->enable_superres);
+ aom_wb_write_bit(wb, seq_params->enable_cdef);
+ aom_wb_write_bit(wb, seq_params->enable_restoration);
+}
+
+static void write_global_motion_params(const WarpedMotionParams *params,
+ const WarpedMotionParams *ref_params,
+ struct aom_write_bit_buffer *wb,
+ int allow_hp) {
+ const TransformationType type = params->wmtype;
+
+ aom_wb_write_bit(wb, type != IDENTITY);
+ if (type != IDENTITY) {
+ aom_wb_write_bit(wb, type == ROTZOOM);
+ if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION);
+ }
+
+ if (type >= ROTZOOM) {
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ }
+
+ if (type >= AFFINE) {
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+
+ if (type >= TRANSLATION) {
+ const int trans_bits = (type == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ const int trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff),
+ (params->wmmat[0] >> trans_prec_diff));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff),
+ (params->wmmat[1] >> trans_prec_diff));
+ }
+}
+
+static void write_global_motion(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ int frame;
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ const WarpedMotionParams *ref_params =
+ cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+ : &default_warp_params;
+ write_global_motion_params(&cm->global_motion[frame], ref_params, wb,
+ cm->allow_high_precision_mv);
+ // TODO(sarahparker, debargha): The logic in the commented out code below
+ // does not work currently and causes mismatches when resize is on.
+ // Fix it before turning the optimization back on.
+ /*
+ YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_buffer(cpi, frame);
+ if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
+ cpi->source->y_crop_height == ref_buf->y_crop_height) {
+ write_global_motion_params(&cm->global_motion[frame],
+ &cm->prev_frame->global_motion[frame], wb,
+ cm->allow_high_precision_mv);
+ } else {
+ assert(cm->global_motion[frame].wmtype == IDENTITY &&
+ "Invalid warp type for frames of different resolutions");
+ }
+ */
+ /*
+ printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n",
+ cm->current_video_frame, cm->show_frame, frame,
+ cm->global_motion[frame].wmmat[0],
+ cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2],
+ cm->global_motion[frame].wmmat[3]);
+ */
+ }
+}
+
+static void check_frame_refs_short_signaling(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!cm->frame_refs_short_signaling) return;
+
+ // Check whether all references are distinct frames.
+ int buf_markers[FRAME_BUFFERS] = { 0 };
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ if (buf_idx != INVALID_IDX) {
+ assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
+ buf_markers[buf_idx] = 1;
+ }
+ }
+
+ int num_refs = 0;
+ for (int buf_idx = 0; buf_idx < FRAME_BUFFERS; ++buf_idx) {
+ num_refs += buf_markers[buf_idx];
+ }
+
+ // We only turn on frame_refs_short_signaling when all references are
+ // distinct.
+ if (num_refs < INTER_REFS_PER_FRAME) {
+ // This indicates that more than one reference frame points to the same
+ // reference buffer, i.e. two or more references are duplicates.
+ cm->frame_refs_short_signaling = 0;
+ return;
+ }
+
+ // Check whether the encoder-side ref frame choices are aligned with those
+ // to be derived at the decoder side.
+ RefBuffer frame_refs_copy[INTER_REFS_PER_FRAME];
+
+ // Backup the frame refs info
+ memcpy(frame_refs_copy, cm->frame_refs,
+ INTER_REFS_PER_FRAME * sizeof(RefBuffer));
+
+ const int lst_map_idx = get_ref_frame_map_idx(cpi, LAST_FRAME);
+ const int gld_map_idx = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+
+ // Set up the frame refs mapping indexes according to the
+ // frame_refs_short_signaling policy.
+ av1_set_frame_refs(cm, lst_map_idx, gld_map_idx);
+
+ // We only turn on frame_refs_short_signaling when the encoder side decision
+ // on ref frames is identical to that at the decoder side.
+ for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) {
+ // Compare the buffer index between two reference frames indexed
+ // respectively by the encoder and the decoder side decisions.
+ if (cm->frame_refs[ref_idx].idx != frame_refs_copy[ref_idx].idx) {
+ cm->frame_refs_short_signaling = 0;
+ break;
+ }
+ }
+
+#if 0 // For debug
+ printf("\nFrame=%d: \n", cm->current_video_frame);
+ printf("***frame_refs_short_signaling=%d\n", cm->frame_refs_short_signaling);
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ printf("enc_ref(map_idx=%d, buf_idx=%d)=%d, vs. "
+ "dec_ref(map_idx=%d, buf_idx=%d)=%d\n",
+ get_ref_frame_map_idx(cpi, ref_frame),
+ get_ref_frame_buf_idx(cpi, ref_frame), ref_frame,
+ cm->frame_refs[ref_frame - LAST_FRAME].map_idx,
+ cm->frame_refs[ref_frame - LAST_FRAME].idx, ref_frame);
+ }
+#endif // 0
+
+ // Restore the frame refs info if frame_refs_short_signaling is off.
+ if (!cm->frame_refs_short_signaling)
+ memcpy(cm->frame_refs, frame_refs_copy,
+ INTER_REFS_PER_FRAME * sizeof(RefBuffer));
+}
+
+// New function based on HLS R18
+static void write_uncompressed_header_obu(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ // NOTE: By default all coded frames are to be used as references
+ cm->is_reference_frame = 1;
+ cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type;
+
+ if (seq_params->still_picture) {
+ assert(cm->show_existing_frame == 0);
+ assert(cm->show_frame == 1);
+ assert(cm->frame_type == KEY_FRAME);
+ }
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (encode_show_existing_frame(cm)) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer %d does not contain a reconstructed frame",
+ frame_to_show);
+ }
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+
+ aom_wb_write_bit(wb, 1); // show_existing_frame
+ aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+ if (seq_params->decoder_model_info_present_flag &&
+ cm->timing_info.equal_picture_interval == 0) {
+ write_tu_pts_info(cm, wb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+ }
+
+ if (cm->reset_decoder_state &&
+ frame_bufs[frame_to_show].frame_type != KEY_FRAME) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "show_existing_frame to reset state on KEY_FRAME only");
+ }
+
+ return;
+ } else {
+ aom_wb_write_bit(wb, 0); // show_existing_frame
+ }
+
+ aom_wb_write_literal(wb, cm->frame_type, 2);
+
+ aom_wb_write_bit(wb, cm->show_frame);
+ if (cm->show_frame) {
+ if (seq_params->decoder_model_info_present_flag &&
+ cm->timing_info.equal_picture_interval == 0)
+ write_tu_pts_info(cm, wb);
+ } else {
+ aom_wb_write_bit(wb, cm->showable_frame);
+ }
+ if (frame_is_sframe(cm)) {
+ assert(cm->error_resilient_mode);
+ } else if (!(cm->frame_type == KEY_FRAME && cm->show_frame)) {
+ aom_wb_write_bit(wb, cm->error_resilient_mode);
+ }
+ }
+ aom_wb_write_bit(wb, cm->disable_cdf_update);
+
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+ } else {
+ assert(cm->allow_screen_content_tools ==
+ seq_params->force_screen_content_tools);
+ }
+
+ if (cm->allow_screen_content_tools) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv);
+ } else {
+ assert(cm->cur_frame_force_integer_mv == seq_params->force_integer_mv);
+ }
+ } else {
+ assert(cm->cur_frame_force_integer_mv == 0);
+ }
+
+ cm->invalid_delta_frame_id_minus_1 = 0;
+ int frame_size_override_flag = 0;
+ cm->frame_refs_short_signaling = 0;
+
+ if (seq_params->reduced_still_picture_hdr) {
+ assert(cm->width == seq_params->max_frame_width &&
+ cm->height == seq_params->max_frame_height);
+ } else {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+ }
+
+ if (cm->width > seq_params->max_frame_width ||
+ cm->height > seq_params->max_frame_height) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Frame dimensions are larger than the maximum values");
+ }
+
+ frame_size_override_flag =
+ frame_is_sframe(cm) ? 1
+ : (cm->width != seq_params->max_frame_width ||
+ cm->height != seq_params->max_frame_height);
+ if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
+
+ if (seq_params->enable_order_hint)
+ aom_wb_write_literal(wb, cm->frame_offset,
+ seq_params->order_hint_bits_minus_1 + 1);
+
+ if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
+ aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS);
+ }
+ }
+
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(wb, cm->buffer_removal_time_present);
+ if (cm->buffer_removal_time_present) {
+ for (int op_num = 0;
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
+ if (cm->op_params[op_num].decoder_model_param_present_flag) {
+ if (((seq_params->operating_point_idc[op_num] >>
+ cm->temporal_layer_id) &
+ 0x1 &&
+ (seq_params->operating_point_idc[op_num] >>
+ (cm->spatial_layer_id + 8)) &
+ 0x1) ||
+ seq_params->operating_point_idc[op_num] == 0) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->op_frame_timing[op_num].buffer_removal_time,
+ cm->buffer_model.buffer_removal_time_length);
+ cm->op_frame_timing[op_num].buffer_removal_time++;
+ if (cm->op_frame_timing[op_num].buffer_removal_time == 0) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "buffer_removal_time overflowed");
+ }
+ }
+ }
+ }
+ }
+ }
+ cpi->refresh_frame_mask = get_refresh_mask(cpi);
+ if (cm->frame_type == KEY_FRAME) {
+ if (!cm->show_frame) { // unshown keyframe (forward keyframe)
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+ } else {
+ assert(cpi->refresh_frame_mask == 0xFF);
+ }
+ } else {
+ if (cm->frame_type == INTRA_ONLY_FRAME) {
+ assert(cpi->refresh_frame_mask != 0xFF);
+ int updated_fb = -1;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ // If more than one frame is refreshed, it doesn't matter which one
+ // we pick, so pick the first.
+ if (cpi->refresh_frame_mask & (1 << i)) {
+ updated_fb = i;
+ break;
+ }
+ }
+ assert(updated_fb >= 0);
+ cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+ } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) {
+ if (cm->frame_type == INTER_FRAME) {
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+ } else {
+ assert(frame_is_sframe(cm) && cpi->refresh_frame_mask == 0xFF);
+ }
+ int updated_fb = -1;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ // If more than one frame is refreshed, it doesn't matter which one
+ // we pick, so pick the first.
+ if (cpi->refresh_frame_mask & (1 << i)) {
+ updated_fb = i;
+ break;
+ }
+ }
+ // large scale tile sometimes won't refresh any fbs
+ if (updated_fb >= 0) {
+ cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
+ }
+
+ if (!cpi->refresh_frame_mask) {
+ // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+ // will not be used as a reference
+ cm->is_reference_frame = 0;
+ }
+ }
+ }
+
+ if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) {
+ // Write all ref frame order hints if error_resilient_mode == 1
+ if (cm->error_resilient_mode && seq_params->enable_order_hint) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ // Get buffer index
+ const int buf_idx = cm->ref_frame_map[ref_idx];
+ assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
+
+ // Write order hint to bit stream
+ aom_wb_write_literal(wb, frame_bufs[buf_idx].cur_frame_offset,
+ seq_params->order_hint_bits_minus_1 + 1);
+ }
+ }
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
+ if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, cm->allow_intrabc);
+ // all eight fbs are refreshed, pick one that will live long enough
+ cm->fb_of_context_type[REGULAR_FRAME] = 0;
+ } else {
+ if (cm->frame_type == INTRA_ONLY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
+ if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, cm->allow_intrabc);
+ } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) {
+ MV_REFERENCE_FRAME ref_frame;
+
+ // NOTE: Error resilient mode turns off frame_refs_short_signaling
+ // automatically.
+#define FRAME_REFS_SHORT_SIGNALING 0
+#if FRAME_REFS_SHORT_SIGNALING
+ cm->frame_refs_short_signaling = seq_params->enable_order_hint;
+#endif // FRAME_REFS_SHORT_SIGNALING
+
+ if (cm->frame_refs_short_signaling) {
+ // NOTE(zoeliu@google.com):
+ // An example encoder-side implementation of frame refs short signaling,
+ // which is only turned on when the encoder-side decision on ref frames
+ // is identical to the one derived at the decoder side.
+ check_frame_refs_short_signaling(cpi);
+ }
+
+ if (seq_params->enable_order_hint)
+ aom_wb_write_bit(wb, cm->frame_refs_short_signaling);
+
+ if (cm->frame_refs_short_signaling) {
+ const int lst_ref = get_ref_frame_map_idx(cpi, LAST_FRAME);
+ aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2);
+
+ const int gld_ref = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+ aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2);
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+ if (!cm->frame_refs_short_signaling)
+ aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+ REF_FRAMES_LOG2);
+ if (seq_params->frame_id_numbers_present_flag) {
+ int i = get_ref_frame_map_idx(cpi, ref_frame);
+ int frame_id_len = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int delta_frame_id_minus_1 =
+ ((cm->current_frame_id - cm->ref_frame_id[i] +
+ (1 << frame_id_len)) %
+ (1 << frame_id_len)) -
+ 1;
+ if (delta_frame_id_minus_1 < 0 ||
+ delta_frame_id_minus_1 >= (1 << diff_len))
+ cm->invalid_delta_frame_id_minus_1 = 1;
+ aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
+ }
+ }
+
+ if (!cm->error_resilient_mode && frame_size_override_flag) {
+ write_frame_size_with_refs(cpi, wb);
+ } else {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ }
+
+ if (cm->cur_frame_force_integer_mv) {
+ cm->allow_high_precision_mv = 0;
+ } else {
+ aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+ }
+ fix_interp_filter(cm, cpi->td.counts);
+ write_frame_interp_filter(cm->interp_filter, wb);
+ aom_wb_write_bit(wb, cm->switchable_motion_mode);
+ if (frame_might_allow_ref_frame_mvs(cm)) {
+ aom_wb_write_bit(wb, cm->allow_ref_frame_mvs);
+ } else {
+ assert(cm->allow_ref_frame_mvs == 0);
+ }
+ }
+ }
+
+ const int might_bwd_adapt =
+ !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+ if (cm->large_scale_tile)
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (might_bwd_adapt) {
+ aom_wb_write_bit(
+ wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+ }
+
+ write_tile_info(cm, saved_wb, wb);
+ encode_quantization(cm, wb);
+ encode_segmentation(cm, xd, wb);
+
+ if (cm->delta_q_present_flag) assert(cm->base_qindex > 0);
+ if (cm->base_qindex > 0) {
+ aom_wb_write_bit(wb, cm->delta_q_present_flag);
+ if (cm->delta_q_present_flag) {
+ aom_wb_write_literal(wb, get_msb(cm->delta_q_res), 2);
+ xd->current_qindex = cm->base_qindex;
+ if (cm->allow_intrabc)
+ assert(cm->delta_lf_present_flag == 0);
+ else
+ aom_wb_write_bit(wb, cm->delta_lf_present_flag);
+ if (cm->delta_lf_present_flag) {
+ aom_wb_write_literal(wb, get_msb(cm->delta_lf_res), 2);
+ aom_wb_write_bit(wb, cm->delta_lf_multi);
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+ }
+
+ if (cm->all_lossless) {
+ assert(!av1_superres_scaled(cm));
+ } else {
+ if (!cm->coded_lossless) {
+ encode_loopfilter(cm, wb);
+ encode_cdef(cm, wb);
+ }
+ encode_restoration_mode(cm, wb);
+ }
+
+ write_tx_mode(cm, &cm->tx_mode, wb);
+
+ if (cpi->allow_comp_inter_inter) {
+ const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+
+ aom_wb_write_bit(wb, use_hybrid_pred);
+ }
+
+ if (cm->is_skip_mode_allowed) aom_wb_write_bit(wb, cm->skip_mode_flag);
+
+ if (frame_might_allow_warped_motion(cm))
+ aom_wb_write_bit(wb, cm->allow_warped_motion);
+ else
+ assert(!cm->allow_warped_motion);
+
+ aom_wb_write_bit(wb, cm->reduced_tx_set_used);
+
+ if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
+
+ if (seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
+ int flip_back_update_parameters_flag = 0;
+ if (cm->frame_type != INTER_FRAME &&
+ cm->film_grain_params.update_parameters == 0) {
+ cm->film_grain_params.update_parameters = 1;
+ flip_back_update_parameters_flag = 1;
+ }
+ write_film_grain_params(cpi, wb);
+
+ if (flip_back_update_parameters_flag)
+ cm->film_grain_params.update_parameters = 0;
+ }
+
+ if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb);
+}
+
+static int choose_size_bytes(uint32_t size, int spare_msbs) {
+ // Choose the number of bytes required to represent size, without
+ // using the 'spare_msbs' number of most significant bits.
+
+ // Make sure we will fit in 4 bytes to start with.
+ if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1;
+
+ // Normalise to 32 bits
+ size <<= spare_msbs;
+
+ if (size >> 24 != 0)
+ return 4;
+ else if (size >> 16 != 0)
+ return 3;
+ else if (size >> 8 != 0)
+ return 2;
+ else
+ return 1;
+}
+
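+// Stores 'val' at 'dst' as a little-endian integer of 'sz' bytes (1 to 4).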
+static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) {
+ switch (sz) {
+ case 1: dst[0] = (uint8_t)(val & 0xff); break;
+ case 2: mem_put_le16(dst, val); break;
+ case 3: mem_put_le24(dst, val); break;
+ case 4: mem_put_le32(dst, val); break;
+ default: assert(0 && "Invalid size"); break;
+ }
+}
+
+static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes) {
+ // Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+ int tsb;
+ int tcsb;
+
+ if (cm->large_scale_tile) {
+ // The top bit in the tile size field indicates tile copy mode, so we
+ // have 1 less bit to code the tile size
+ tsb = choose_size_bytes(max_tile_size, 1);
+ tcsb = choose_size_bytes(max_tile_col_size, 0);
+ } else {
+ tsb = choose_size_bytes(max_tile_size, 0);
+ tcsb = 4; // This is ignored
+ (void)max_tile_col_size;
+ }
+
+ assert(tsb > 0);
+ assert(tcsb > 0);
+
+ *tile_size_bytes = tsb;
+ *tile_col_size_bytes = tcsb;
+ if (tsb == 4 && tcsb == 4) return data_size;
+
+ uint32_t wpos = 0;
+ uint32_t rpos = 0;
+
+ if (cm->large_scale_tile) {
+ int tile_row;
+ int tile_col;
+
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+ // Every column but the last has a column header
+ if (tile_col < cm->tile_cols - 1) {
+ uint32_t tile_col_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // Adjust the tile column size by the number of bytes removed
+ // from the tile size fields.
+ tile_col_size -= (4 - tsb) * cm->tile_rows;
+
+ mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+ wpos += tcsb;
+ }
+
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ // Every row, including the last, has a header
+ uint32_t tile_header = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // If this is a copy tile, we need to shift the MSB to the
+ // top bit of the new width, and there is no data to copy.
+ if (tile_header >> 31 != 0) {
+ if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+ } else {
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+
+ tile_header += AV1_MIN_TILE_SIZE_BYTES;
+ memmove(dst + wpos, dst + rpos, tile_header);
+ rpos += tile_header;
+ wpos += tile_header;
+ }
+ }
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+ const int n_tiles = cm->tile_cols * cm->tile_rows;
+ int n;
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_size;
+
+ if (n == n_tiles - 1) {
+ tile_size = data_size - rpos;
+ } else {
+ tile_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+ mem_put_varsize(dst + wpos, tsb, tile_size);
+ tile_size += AV1_MIN_TILE_SIZE_BYTES;
+ wpos += tsb;
+ }
+
+ memmove(dst + wpos, dst + rpos, tile_size);
+
+ rpos += tile_size;
+ wpos += tile_size;
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+}
+
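+// Writes the OBU header (forbidden bit, type, extension flag, the
+// has-payload-length-field flag and, if present, the extension byte) and
+// returns the number of header bytes written.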
+uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
+ uint8_t *const dst) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ aom_wb_write_literal(&wb, 0, 1); // forbidden bit.
+ aom_wb_write_literal(&wb, (int)obu_type, 4);
+ aom_wb_write_literal(&wb, obu_extension ? 1 : 0, 1);
+ aom_wb_write_literal(&wb, 1, 1); // obu_has_payload_length_field
+ aom_wb_write_literal(&wb, 0, 1); // reserved
+
+ if (obu_extension) {
+ aom_wb_write_literal(&wb, obu_extension & 0xFF, 8);
+ }
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
+
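+// ULEB-encodes obu_payload_size into the buffer at dest + obu_header_size;
+// returns AOM_CODEC_OK on success and AOM_CODEC_ERROR if encoding fails.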
+int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
+ uint8_t *dest) {
+ const uint32_t obu_size = obu_payload_size;
+ const uint32_t offset = obu_header_size;
+ size_t coded_obu_size = 0;
+
+ if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset,
+ &coded_obu_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+
+ return AOM_CODEC_OK;
+}
+
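+// Moves the OBU payload forward to make room for the ULEB-encoded payload
+// size that goes between the OBU header and the payload; returns the size of
+// that length field in bytes.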
+static size_t obu_memmove(uint32_t obu_header_size, uint32_t obu_payload_size,
+ uint8_t *data) {
+ const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
+ const uint32_t move_dst_offset =
+ (uint32_t)length_field_size + obu_header_size;
+ const uint32_t move_src_offset = obu_header_size;
+ const uint32_t move_size = obu_payload_size;
+ memmove(data + move_dst_offset, data + move_src_offset, move_size);
+ return length_field_size;
+}
+
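+// Appends the trailing-bits pattern: a single 1 bit followed by zeros up to
+// the next byte boundary (a full 0x80 byte when already byte-aligned).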
+static void add_trailing_bits(struct aom_write_bit_buffer *wb) {
+ if (aom_wb_is_byte_aligned(wb)) {
+ aom_wb_write_literal(wb, 0x80, 8);
+ } else {
+ // assumes that the other bits are already 0s
+ aom_wb_write_bit(wb, 1);
+ }
+}
+
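+// Converts the (major, minor) level pair to a sequence level index and writes
+// it as a LEVEL_BITS-wide literal.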
+static void write_bitstream_level(BitstreamLevel bl,
+ struct aom_write_bit_buffer *wb) {
+ uint8_t seq_level_idx = major_minor_to_seq_level_idx(bl);
+ assert(is_valid_seq_level_idx(seq_level_idx));
+ aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
+}
+
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ write_profile(cm->seq_params.profile, &wb);
+
+ // Still picture or not
+ aom_wb_write_bit(&wb, cm->seq_params.still_picture);
+ assert(IMPLIES(!cm->seq_params.still_picture,
+ !cm->seq_params.reduced_still_picture_hdr));
+ // whether to use reduced still picture header
+ aom_wb_write_bit(&wb, cm->seq_params.reduced_still_picture_hdr);
+
+ if (cm->seq_params.reduced_still_picture_hdr) {
+ assert(cm->timing_info_present == 0);
+ assert(cm->seq_params.decoder_model_info_present_flag == 0);
+ assert(cm->seq_params.display_model_info_present_flag == 0);
+ write_bitstream_level(cm->seq_params.level[0], &wb);
+ } else {
+ aom_wb_write_bit(&wb, cm->timing_info_present); // timing info present flag
+
+ if (cm->timing_info_present) {
+ // timing_info
+ write_timing_info_header(cm, &wb);
+ aom_wb_write_bit(&wb, cm->seq_params.decoder_model_info_present_flag);
+ if (cm->seq_params.decoder_model_info_present_flag) {
+ write_decoder_model_info(cm, &wb);
+ }
+ }
+ aom_wb_write_bit(&wb, cm->seq_params.display_model_info_present_flag);
+ aom_wb_write_literal(&wb, cm->seq_params.operating_points_cnt_minus_1,
+ OP_POINTS_CNT_MINUS_1_BITS);
+ int i;
+ for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) {
+ aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i],
+ OP_POINTS_IDC_BITS);
+ write_bitstream_level(cm->seq_params.level[i], &wb);
+ if (cm->seq_params.level[i].major > 3)
+ aom_wb_write_bit(&wb, cm->seq_params.tier[i]);
+ if (cm->seq_params.decoder_model_info_present_flag) {
+ aom_wb_write_bit(&wb,
+ cm->op_params[i].decoder_model_param_present_flag);
+ if (cm->op_params[i].decoder_model_param_present_flag)
+ write_dec_model_op_parameters(cm, &wb, i);
+ }
+ if (cm->seq_params.display_model_info_present_flag) {
+ aom_wb_write_bit(&wb,
+ cm->op_params[i].display_model_param_present_flag);
+ if (cm->op_params[i].display_model_param_present_flag) {
+ assert(cm->op_params[i].initial_display_delay <= 10);
+ aom_wb_write_literal(&wb, cm->op_params[i].initial_display_delay - 1,
+ 4);
+ }
+ }
+ }
+ }
+ write_sequence_header(cpi, &wb);
+
+ write_color_config(&cm->seq_params, &wb);
+
+ aom_wb_write_bit(&wb, cm->seq_params.film_grain_params_present);
+
+ add_trailing_bits(&wb);
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
+
+static uint32_t write_frame_header_obu(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *saved_wb,
+ uint8_t *const dst,
+ int append_trailing_bits) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ write_uncompressed_header_obu(cpi, saved_wb, &wb);
+ if (append_trailing_bits) add_trailing_bits(&wb);
+ return aom_wb_bytes_written(&wb);
+}
+
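+// Does nothing when the frame has only a single tile; otherwise writes
+// tile_start_and_end_present_flag and, when that flag is set, the start and
+// end tile indices. Returns the number of bytes written.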
+static uint32_t write_tile_group_header(uint8_t *const dst, int startTile,
+ int endTile, int tiles_log2,
+ int tile_start_and_end_present_flag) {
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ if (!tiles_log2) return size;
+
+ aom_wb_write_bit(&wb, tile_start_and_end_present_flag);
+
+ if (tile_start_and_end_present_flag) {
+ aom_wb_write_literal(&wb, startTile, tiles_log2);
+ aom_wb_write_literal(&wb, endTile, tiles_log2);
+ }
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+}
+
+typedef struct {
+ uint8_t *frame_header;
+ size_t obu_header_byte_offset;
+ size_t total_length;
+} FrameHeaderInfo;
+
+static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ uint8_t obu_extension_header,
+ const FrameHeaderInfo *fh_info) {
+ AV1_COMMON *const cm = &cpi->common;
+ aom_writer mode_bc;
+ int tile_row, tile_col;
+ TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
+ uint32_t total_size = 0;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ unsigned int tile_size = 0;
+ unsigned int max_tile_size = 0;
+ unsigned int max_tile_col_size = 0;
+ const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cm->num_tg;
+ const int tg_size =
+ (cm->large_scale_tile)
+ ? 1
+ : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+ int tile_count = 0;
+ int curr_tg_data_size = 0;
+ uint8_t *data = dst;
+ int new_tg = 1;
+ const int have_tiles = tile_cols * tile_rows > 1;
+ int first_tg = 1;
+
+ cm->largest_tile_id = 0;
+
+ if (cm->large_scale_tile) {
+ // For large_scale_tile case, we always have only one tile group, so it can
+ // be written as an OBU_FRAME.
+ const OBU_TYPE obu_type = OBU_FRAME;
+ const uint32_t tg_hdr_size = write_obu_header(obu_type, 0, data);
+ data += tg_hdr_size;
+
+ const uint32_t frame_header_size =
+ write_frame_header_obu(cpi, saved_wb, data, 0);
+ data += frame_header_size;
+ total_size += frame_header_size;
+
+#define EXT_TILE_DEBUG 0
+#if EXT_TILE_DEBUG
+ {
+ char fn[20] = "./fh";
+ fn[4] = cm->current_video_frame / 100 + '0';
+ fn[5] = (cm->current_video_frame % 100) / 10 + '0';
+ fn[6] = (cm->current_video_frame % 10) + '0';
+ fn[7] = '\0';
+ av1_print_uncompressed_frame_header(data - frame_header_size,
+ frame_header_size, fn);
+ }
+#endif // EXT_TILE_DEBUG
+#undef EXT_TILE_DEBUG
+
+ int tile_size_bytes = 0;
+ int tile_col_size_bytes = 0;
+
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile_info;
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const uint32_t col_offset = total_size;
+
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ // The last column does not have a column header
+ if (!is_last_col) total_size += 4;
+
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ const int data_offset = have_tiles ? 4 : 0;
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ buf->data = dst + total_size + tg_hdr_size;
+
+ // With CONFIG_EXT_TILE = 1, every tile in the row has a header,
+ // even the last one, unless no tiling is used at all.
+ total_size += data_offset;
+ // Initialise tile context from the frame context
+ this_tile->tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ mode_bc.allow_update_cdf = !cm->large_scale_tile;
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->disable_cdf_update;
+ aom_start_encode(&mode_bc, buf->data + data_offset);
+ write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+ buf->size = tile_size;
+
+ // Record the maximum tile size we see, so we can compact headers later.
+ if (tile_size > max_tile_size) {
+ max_tile_size = tile_size;
+ cm->largest_tile_id = tile_cols * tile_row + tile_col;
+ }
+
+ if (have_tiles) {
+ // tile header: size of this tile, or copy offset
+ uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
+ const int tile_copy_mode =
+ ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256)
+ ? 1
+ : 0;
+
+ // If tile_copy_mode = 1, check if this tile is a copy tile.
+ // Copy tiles are very unlikely on key frames, so skip the search on
+ // key frames to avoid unnecessary work.
+ if (cm->frame_type != KEY_FRAME && tile_copy_mode) {
+ const int identical_tile_offset =
+ find_identical_tile(tile_row, tile_col, tile_buffers);
+
+ if (identical_tile_offset > 0) {
+ tile_size = 0;
+ tile_header = identical_tile_offset | 0x80;
+ tile_header <<= 24;
+ }
+ }
+
+ mem_put_le32(buf->data, tile_header);
+ }
+
+ total_size += tile_size;
+ }
+
+ if (!is_last_col) {
+ uint32_t col_size = total_size - col_offset - 4;
+ mem_put_le32(dst + col_offset + tg_hdr_size, col_size);
+
+ // Record the maximum tile column size we see.
+ max_tile_col_size = AOMMAX(max_tile_col_size, col_size);
+ }
+ }
+
+ if (have_tiles) {
+ total_size = remux_tiles(cm, data, total_size - frame_header_size,
+ max_tile_size, max_tile_col_size,
+ &tile_size_bytes, &tile_col_size_bytes);
+ total_size += frame_header_size;
+ }
+
+    // In the EXT_TILE case only one tile group is used. Following the OBU
+    // syntax, write the current tile group size before the tile data
+    // (including the tile column headers). The tile group size does not
+    // include the bytes that store the tg size itself.
+ total_size += tg_hdr_size;
+ const uint32_t obu_payload_size = total_size - tg_hdr_size;
+ const size_t length_field_size =
+ obu_memmove(tg_hdr_size, obu_payload_size, dst);
+ if (write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) !=
+ AOM_CODEC_OK) {
+ assert(0);
+ }
+ total_size += (uint32_t)length_field_size;
+ saved_wb->bit_buffer += length_field_size;
+
+ // Now fill in the gaps in the uncompressed header.
+ if (have_tiles) {
+ assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+ aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2);
+
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+ aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+ }
+ return total_size;
+ }
+
+ uint32_t obu_header_size = 0;
+ uint8_t *tile_data_start = dst + total_size;
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ int is_last_tile_in_tg = 0;
+
+ if (new_tg) {
+ data = dst + total_size;
+
+ // A new tile group begins at this tile. Write the obu header and
+ // tile group header
+ const OBU_TYPE obu_type =
+ (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP;
+ curr_tg_data_size =
+ write_obu_header(obu_type, obu_extension_header, data);
+ obu_header_size = curr_tg_data_size;
+
+ if (num_tg_hdrs == 1) {
+ curr_tg_data_size += write_frame_header_obu(
+ cpi, saved_wb, data + curr_tg_data_size, 0);
+ }
+ curr_tg_data_size += write_tile_group_header(
+ data + curr_tg_data_size, tile_idx,
+ AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1),
+ n_log2_tiles, cm->num_tg > 1);
+ total_size += curr_tg_data_size;
+ tile_data_start += curr_tg_data_size;
+ new_tg = 0;
+ tile_count = 0;
+ }
+ tile_count++;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) {
+ is_last_tile_in_tg = 1;
+ new_tg = 1;
+ } else {
+ is_last_tile_in_tg = 0;
+ }
+
+ buf->data = dst + total_size;
+
+ // The last tile of the tile group does not have a header.
+ if (!is_last_tile_in_tg) total_size += 4;
+
+ // Initialise tile context from the frame context
+ this_tile->tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ mode_bc.allow_update_cdf = 1;
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->disable_cdf_update;
+ const int num_planes = av1_num_planes(cm);
+ av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
+
+ aom_start_encode(&mode_bc, dst + total_size);
+ write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+ assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
+
+ curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4));
+ buf->size = tile_size;
+ if (tile_size > max_tile_size) {
+ cm->largest_tile_id = tile_cols * tile_row + tile_col;
+ max_tile_size = tile_size;
+ }
+
+ if (!is_last_tile_in_tg) {
+ // size of this tile
+ mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES);
+ } else {
+ // write current tile group size
+ const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size;
+ const size_t length_field_size =
+ obu_memmove(obu_header_size, obu_payload_size, data);
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ assert(0);
+ }
+ curr_tg_data_size += (int)length_field_size;
+ total_size += (uint32_t)length_field_size;
+ tile_data_start += length_field_size;
+ if (num_tg_hdrs == 1) {
+          // If this tg is combined with the frame header, update the saved
+          // frame header base offset according to the length field size.
+ saved_wb->bit_buffer += length_field_size;
+ }
+
+ if (!first_tg && cm->error_resilient_mode) {
+ // Make room for a duplicate Frame Header OBU.
+ memmove(data + fh_info->total_length, data, curr_tg_data_size);
+
+ // Insert a copy of the Frame Header OBU.
+ memcpy(data, fh_info->frame_header, fh_info->total_length);
+
+          // Force the context update tile to be the first tile in error
+          // resilient mode, as the duplicate frame headers will have
+          // context_update_tile_id set to 0.
+ cm->largest_tile_id = 0;
+
+ // Rewrite the OBU header to change the OBU type to Redundant Frame
+ // Header.
+ write_obu_header(OBU_REDUNDANT_FRAME_HEADER, obu_extension_header,
+ &data[fh_info->obu_header_byte_offset]);
+
+ data += fh_info->total_length;
+
+ curr_tg_data_size += (int)(fh_info->total_length);
+ total_size += (uint32_t)(fh_info->total_length);
+ }
+ first_tg = 0;
+ }
+
+ total_size += tile_size;
+ }
+ }
+
+ if (have_tiles) {
+    // Fill in context_update_tile_id, indicating the tile to use for the
+    // cdf update. The encoder currently sets it to the largest tile
+    // (but this is up to the encoder).
+ aom_wb_overwrite_literal(saved_wb, cm->largest_tile_id,
+ cm->log2_tile_cols + cm->log2_tile_rows);
+    // If there is more than one tile group, tile_size_bytes takes the default
+    // value 4 and does not need to be set. For a single tile group it is set
+    // in the section below.
+ if (num_tg_hdrs == 1) {
+ int tile_size_bytes = 4, unused;
+ const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst);
+ const uint32_t tile_data_size = total_size - tile_data_offset;
+
+ total_size =
+ remux_tiles(cm, tile_data_start, tile_data_size, max_tile_size,
+ max_tile_col_size, &tile_size_bytes, &unused);
+ total_size += tile_data_offset;
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+
+ aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+
+ // Update the OBU length if remux_tiles() reduced the size.
+ uint64_t payload_size;
+ size_t length_field_size;
+ int res =
+ aom_uleb_decode(dst + obu_header_size, total_size - obu_header_size,
+ &payload_size, &length_field_size);
+ assert(res == 0);
+ (void)res;
+
+ const uint64_t new_payload_size =
+ total_size - obu_header_size - length_field_size;
+ if (new_payload_size != payload_size) {
+ size_t new_length_field_size;
+ res = aom_uleb_encode(new_payload_size, length_field_size,
+ dst + obu_header_size, &new_length_field_size);
+ assert(res == 0);
+ if (new_length_field_size < length_field_size) {
+ const size_t src_offset = obu_header_size + length_field_size;
+ const size_t dst_offset = obu_header_size + new_length_field_size;
+ memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size);
+ total_size -= (int)(length_field_size - new_length_field_size);
+ }
+ }
+ }
+ }
+ return total_size;
+}
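+
+/* Editor's sketch (not part of upstream libaom): the length-field pattern used
+ * repeatedly above. Every OBU is written as header -> payload first, and only
+ * then is the ULEB-encoded payload size spliced in between the two, because
+ * the size of the length field is not known until the payload has been
+ * written. The helper below mirrors the sequence-header case and assumes the
+ * static helpers of this file are in scope. */
+#if 0 /* illustrative only, never compiled */
+static uint32_t write_example_obu(AV1_COMP *cpi, uint8_t *dst) {
+  // 1) OBU header first.
+  const uint32_t hdr_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, dst);
+  // 2) Payload written directly after the header.
+  const uint32_t payload_size =
+      write_sequence_header_obu(cpi, dst + hdr_size);
+  // 3) Shift the payload forward to make room for the length field ...
+  const size_t length_field_size = obu_memmove(hdr_size, payload_size, dst);
+  // 4) ... and write the ULEB size into the gap just created.
+  if (write_uleb_obu_size(hdr_size, payload_size, dst) != AOM_CODEC_OK)
+    return 0;
+  return hdr_size + (uint32_t)length_field_size + payload_size;
+}
+#endif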
+
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
+ uint8_t *data = dst;
+ uint32_t data_size;
+ AV1_COMMON *const cm = &cpi->common;
+ uint32_t obu_header_size = 0;
+ uint32_t obu_payload_size = 0;
+ FrameHeaderInfo fh_info = { NULL, 0, 0 };
+ const uint8_t obu_extension_header =
+ cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_reset_write();
+#endif
+
+ // The TD is now written outside the frame encode loop
+
+  // Write the sequence header OBU on shown key frames, with its ULEB-encoded
+  // size field inserted after the OBU header.
+ if (cm->frame_type == KEY_FRAME && cm->show_frame) {
+ obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data);
+
+ obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size);
+ const size_t length_field_size =
+ obu_memmove(obu_header_size, obu_payload_size, data);
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ data += obu_header_size + obu_payload_size + length_field_size;
+ }
+
+ const int write_frame_header =
+ (cm->num_tg > 1 || encode_show_existing_frame(cm));
+ struct aom_write_bit_buffer saved_wb;
+ if (write_frame_header) {
+ // Write Frame Header OBU.
+ fh_info.frame_header = data;
+ obu_header_size =
+ write_obu_header(OBU_FRAME_HEADER, obu_extension_header, data);
+ obu_payload_size =
+ write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
+
+ const size_t length_field_size =
+ obu_memmove(obu_header_size, obu_payload_size, data);
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ fh_info.obu_header_byte_offset = 0;
+ fh_info.total_length =
+ obu_header_size + obu_payload_size + length_field_size;
+ data += fh_info.total_length;
+
+ // Since length_field_size is determined adaptively after frame header
+ // encoding, saved_wb must be adjusted accordingly.
+ saved_wb.bit_buffer += length_field_size;
+ }
+
+ if (encode_show_existing_frame(cm)) {
+ data_size = 0;
+ } else {
+    // Each tile group OBU carries a ULEB-encoded size field (inserted after
+    // its OBU header) giving the size of the tile group payload.
+ data_size = write_tiles_in_tg_obus(cpi, data, &saved_wb,
+ obu_extension_header, &fh_info);
+ }
+ data += data_size;
+ *size = data - dst;
+ return AOM_CODEC_OK;
+}
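+
+/* Editor's sketch (not part of upstream libaom): av1_pack_bitstream() is the
+ * top-level packing entry point. Assuming `cpi` is a fully initialised
+ * AV1_COMP and `buf` is large enough to hold the coded frame, a caller uses
+ * it roughly as follows: */
+#if 0 /* illustrative only, never compiled */
+static size_t pack_one_frame(AV1_COMP *cpi, uint8_t *buf) {
+  size_t frame_size = 0;
+  if (av1_pack_bitstream(cpi, buf, &frame_size) != AOM_CODEC_OK) return 0;
+  // `buf` now holds, in order: an OBU_SEQUENCE_HEADER on shown key frames, a
+  // standalone OBU_FRAME_HEADER when num_tg > 1 or a frame is re-shown, and
+  // then the tile group / frame OBUs written above.
+  return frame_size;
+}
+#endif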
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
new file mode 100644
index 000000000..465ccaed5
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct aom_write_bit_buffer;
+
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst);
+
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
+uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
+ uint8_t *const dst);
+
+int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
+ uint8_t *dest);
+
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
+
+static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
+ // Do not swap gf and arf indices for internal overlay frames
+ return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf;
+}
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ int blk_row, int blk_col, int plane, TX_SIZE tx_size,
+ aom_writer *w);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
new file mode 100644
index 000000000..0bc5dea82
--- /dev/null
+++ b/third_party/aom/av1/encoder/block.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_BLOCK_H_
+#define AOM_AV1_ENCODER_BLOCK_H_
+
+#include "av1/common/entropymv.h"
+#include "av1/common/entropy.h"
+#include "av1/common/mvref_common.h"
+#include "av1/encoder/hash.h"
+#if CONFIG_DIST_8X8
+#include "aom/aomcx.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ unsigned int sse;
+ int sum;
+ unsigned int var;
+} DIFF;
+
+typedef struct macroblock_plane {
+ DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);
+ tran_low_t *qcoeff;
+ tran_low_t *coeff;
+ uint16_t *eobs;
+ uint8_t *txb_entropy_ctx;
+ struct buf_2d src;
+
+  // Quantizer settings.
+  // These are used/accessed only in the quantization process;
+  // RDO does not / must not depend on any of these values.
+  // All values below share the coefficient scale/shift used in TX.
+ const int16_t *quant_fp_QTX;
+ const int16_t *round_fp_QTX;
+ const int16_t *quant_QTX;
+ const int16_t *quant_shift_QTX;
+ const int16_t *zbin_QTX;
+ const int16_t *round_QTX;
+ const int16_t *dequant_QTX;
+} MACROBLOCK_PLANE;
+
+typedef struct {
+ int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+ int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+ int base_cost[SIG_COEF_CONTEXTS][4];
+ int eob_extra_cost[EOB_COEF_CONTEXTS][2];
+ int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+ int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1];
+} LV_MAP_COEFF_COST;
+
+typedef struct {
+ int eob_cost[2][11];
+} LV_MAP_EOB_COST;
+
+typedef struct {
+ tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
+ uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ uint8_t txb_skip_ctx[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ int dc_sign_ctx[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+} CB_COEFF_BUFFER;
+
+typedef struct {
+ int16_t mode_context[MODE_CTX_REF_FRAMES];
+ // TODO(angiebird): Reduce the buffer size according to sb_type
+ tran_low_t *tcoeff[MAX_MB_PLANE];
+ uint16_t *eobs[MAX_MB_PLANE];
+ uint8_t *txb_skip_ctx[MAX_MB_PLANE];
+ int *dc_sign_ctx[MAX_MB_PLANE];
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+ int_mv global_mvs[REF_FRAMES];
+ int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+
+typedef struct {
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} MvLimits;
+
+typedef struct {
+ uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+ int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];
+} PALETTE_BUFFER;
+
+typedef struct {
+ TX_SIZE tx_size;
+ TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
+ RD_STATS rd_stats;
+ uint32_t hash_value;
+} MB_RD_INFO;
+
+#define RD_RECORD_BUFFER_LEN 8
+typedef struct {
+ MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer.
+ int index_start;
+ int num;
+ CRC32C crc_calculator; // Hash function.
+} MB_RD_RECORD;
+
+typedef struct {
+ int64_t dist;
+ int64_t sse;
+ int rate;
+ uint16_t eob;
+ TX_TYPE tx_type;
+ uint16_t entropy_context;
+ uint8_t txb_entropy_ctx;
+ uint8_t valid;
+ uint8_t fast; // This is not being used now.
+} TXB_RD_INFO;
+
+#define TX_SIZE_RD_RECORD_BUFFER_LEN 256
+typedef struct {
+ uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];
+ TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN];
+ int index_start;
+ int num;
+} TXB_RD_RECORD;
+
+typedef struct tx_size_rd_info_node {
+ TXB_RD_INFO *rd_info_array; // Points to array of size TX_TYPES.
+ struct tx_size_rd_info_node *children[4];
+} TXB_RD_INFO_NODE;
+
+// Region size for mode decision sampling in the first pass of partition
+// search (two_pass_partition_search speed feature), in units of mi size (4).
+// Used by the mode_pruning_based_on_two_pass_partition_search speed feature.
+#define FIRST_PARTITION_PASS_SAMPLE_REGION 8
+#define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3
+#define FIRST_PARTITION_PASS_STATS_TABLES \
+ (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) * \
+ (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
+#define FIRST_PARTITION_PASS_STATS_STRIDE \
+ (MAX_MIB_SIZE_LOG2 - FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
+
+static INLINE int av1_first_partition_pass_stats_index(int mi_row, int mi_col) {
+ const int row =
+ (mi_row & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2;
+ const int col =
+ (mi_col & MAX_MIB_MASK) >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2;
+ return (row << FIRST_PARTITION_PASS_STATS_STRIDE) + col;
+}
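+
+/* Editor's note (illustrative, not upstream): assuming 128x128 superblocks
+ * (MAX_MIB_SIZE == 32), the stats grid is (32 >> 3) x (32 >> 3) = 16 tables of
+ * 32x32-pixel regions and FIRST_PARTITION_PASS_STATS_STRIDE is 5 - 3 = 2.
+ * Worked example: mi_row = 40, mi_col = 12 gives
+ *   row = (40 & 31) >> 3 = 1,  col = (12 & 31) >> 3 = 1,
+ *   index = (1 << 2) + 1 = 5. */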
+
+typedef struct {
+ uint8_t ref0_counts[REF_FRAMES]; // Counters for ref_frame[0].
+ uint8_t ref1_counts[REF_FRAMES]; // Counters for ref_frame[1].
+ int sample_counts; // Number of samples collected.
+} FIRST_PARTITION_PASS_STATS;
+
+#define MAX_INTERP_FILTER_STATS 64
+typedef struct {
+ InterpFilters filters;
+ int_mv mv[2];
+ int8_t ref_frames[2];
+ COMPOUND_TYPE comp_type;
+} INTERPOLATION_FILTER_STATS;
+
+typedef struct macroblock MACROBLOCK;
+struct macroblock {
+ struct macroblock_plane plane[MAX_MB_PLANE];
+
+  // Determines whether to use the reduced-complexity transform block search
+  // model to select prediction modes, or the full-complexity model to select
+  // the transform kernel.
+ int rd_model;
+
+  // Indicates whether the encoder is running the first pass of partition
+  // search. In that case, certain speed features are applied to reduce the
+  // overhead cost of the first pass search.
+ int cb_partition_scan;
+
+ FIRST_PARTITION_PASS_STATS
+ first_partition_pass_stats[FIRST_PARTITION_PASS_STATS_TABLES];
+
+ // [comp_idx][saved stat_idx]
+ INTERPOLATION_FILTER_STATS interp_filter_stats[2][MAX_INTERP_FILTER_STATS];
+ int interp_filter_stats_idx[2];
+
+ // Activate constrained coding block partition search range.
+ int use_cb_search_range;
+
+ // Inter macroblock RD search info.
+ MB_RD_RECORD mb_rd_record;
+
+ // Inter transform block RD search info. for square TX sizes.
+ TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)];
+ TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)];
+ TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)];
+ TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)];
+
+ // Intra transform block RD search info. for square TX sizes.
+ TXB_RD_RECORD txb_rd_record_intra;
+
+ MACROBLOCKD e_mbd;
+ MB_MODE_INFO_EXT *mbmi_ext;
+ int skip_block;
+ int qindex;
+
+ // The equivalent error at the current rdmult of one whole bit (not one
+ // bitcost unit).
+ int errorperbit;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for large blocks.
+ int sadperbit16;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for sub-8x8 blocks.
+ int sadperbit4;
+ int rdmult;
+ int mb_energy;
+ int sb_energy_level;
+ int *m_search_count_ptr;
+ int *ex_search_count_ptr;
+
+ unsigned int txb_split_count;
+
+ // These are set to their default values at the beginning, and then adjusted
+ // further in the encoding process.
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
+
+ unsigned int max_mv_context[REF_FRAMES];
+ unsigned int source_variance;
+ unsigned int pred_sse[REF_FRAMES];
+ int pred_mv_sad[REF_FRAMES];
+
+ int *nmvjointcost;
+ int nmv_vec_cost[MV_JOINTS];
+ int *nmvcost[2];
+ int *nmvcost_hp[2];
+ int **mv_cost_stack;
+ int **mvcost;
+
+ int32_t *wsrc_buf;
+ int32_t *mask_buf;
+ uint8_t *above_pred_buf;
+ uint8_t *left_pred_buf;
+
+ PALETTE_BUFFER *palette_buffer;
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+
+ // buffer for hash value calculation of a block
+ // used only in av1_get_block_hash_value()
+ // [first hash/second hash]
+ // [two buffers used ping-pong]
+ uint32_t *hash_value_buffer[2][2];
+
+ CRC_CALCULATOR crc_calculator1;
+ CRC_CALCULATOR crc_calculator2;
+ int g_crc_initialized;
+
+ // These define limits to motion vector components to prevent them
+ // from extending outside the UMV borders
+ MvLimits mv_limits;
+
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ int skip;
+ int skip_chroma_rd;
+ int skip_cost[SKIP_CONTEXTS][2];
+
+ int skip_mode; // 0: off; 1: on
+ int skip_mode_cost[SKIP_CONTEXTS][2];
+
+ int compound_idx;
+
+ LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+ LV_MAP_EOB_COST eob_costs[7][2];
+ uint16_t cb_offset;
+
+ // mode costs
+ int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+
+ int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+ int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
+ int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+ int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+
+ int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+ int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
+ [CDF_SIZE(COMP_REFERENCE_TYPES)];
+ int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+ [CDF_SIZE(2)];
+ // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or
+ // GOLDEN_FRAME) in bidir-comp mode.
+ int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
+ // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or
+ // BWDREF_FRAME) in bidir-comp mode.
+ int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
+ int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+ int wedge_idx_cost[BLOCK_SIZES_ALL][16];
+ int interintra_cost[BLOCK_SIZE_GROUPS][2];
+ int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
+ int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
+ int motion_mode_cost1[BLOCK_SIZES_ALL][2];
+ int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ int filter_intra_cost[BLOCK_SIZES_ALL][2];
+ int filter_intra_mode_cost[FILTER_INTRA_MODES];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
+ // The rate associated with each alpha codeword
+ int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+ int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
+ int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+ int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+ int wiener_restore_cost[2];
+ int sgrproj_restore_cost[2];
+ int intrabc_cost[2];
+
+  // Used to store sub-partition choices.
+ MV pred_mv[REF_FRAMES];
+
+ // Store the best motion vector during motion search
+ int_mv best_mv;
+ // Store the second best motion vector during full-pixel motion search
+ int_mv second_best_mv;
+
+ // use default transform and skip transform type search for intra modes
+ int use_default_intra_tx_type;
+ // use default transform and skip transform type search for inter modes
+ int use_default_inter_tx_type;
+#if CONFIG_DIST_8X8
+ int using_dist_8x8;
+ aom_tune_metric tune_metric;
+#endif // CONFIG_DIST_8X8
+ int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
+ int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+ // Bit flags for pruning tx type search, tx split, etc.
+ int tx_search_prune[EXT_TX_SET_TYPES];
+ int must_find_valid_partition;
+ int tx_split_prune_flag; // Flag to skip tx split RD search.
+ int recalc_luma_mc_data; // Flag to indicate recalculation of MC data during
+ // interpolation filter search
+};
+
+static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
+ static const char LUT[BLOCK_SIZES_ALL] = {
+ 0, // BLOCK_4X4
+ 1, // BLOCK_4X8
+ 1, // BLOCK_8X4
+ 0, // BLOCK_8X8
+ 1, // BLOCK_8X16
+ 1, // BLOCK_16X8
+ 0, // BLOCK_16X16
+ 1, // BLOCK_16X32
+ 1, // BLOCK_32X16
+ 0, // BLOCK_32X32
+ 1, // BLOCK_32X64
+ 1, // BLOCK_64X32
+ 0, // BLOCK_64X64
+ 0, // BLOCK_64X128
+ 0, // BLOCK_128X64
+ 0, // BLOCK_128X128
+ 1, // BLOCK_4X16
+ 1, // BLOCK_16X4
+ 1, // BLOCK_8X32
+ 1, // BLOCK_32X8
+ 1, // BLOCK_16X64
+ 1, // BLOCK_64X16
+ };
+
+ return LUT[bsize];
+}
+
+static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ return is_rect_tx_allowed_bsize(mbmi->sb_type) &&
+ !xd->lossless[mbmi->segment_id];
+}
+
+static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
+ TX_SIZE ctx_size = max_txsize_rect_lookup[bsize];
+ int depth = 0;
+ while (tx_size != ctx_size) {
+ depth++;
+ ctx_size = sub_tx_size_map[ctx_size];
+ assert(depth <= MAX_TX_DEPTH);
+ }
+ return depth;
+}
+
+static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx,
+ int skip) {
+ if (skip)
+ x->blk_skip[blk_idx] |= 1UL << plane;
+ else
+ x->blk_skip[blk_idx] &= ~(1UL << plane);
+  // Set chroma planes to an uninitialized state when luma is set, so that we
+  // can later check whether they have been set.
+ // it will be set later
+ if (plane == 0) {
+ x->blk_skip[blk_idx] |= 1UL << (1 + 4);
+ x->blk_skip[blk_idx] |= 1UL << (2 + 4);
+ }
+
+ // Clear the initialization checking bit
+ x->blk_skip[blk_idx] &= ~(1UL << (plane + 4));
+#endif
+}
+
+static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) {
+#ifndef NDEBUG
+ // Check if this is initialized
+ assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4))));
+
+  // Valid bits all lie within the mask 0x77; any bit set in 0x88 indicates
+  // garbage data.
+ assert((x->blk_skip[blk_idx] & 0x88) == 0);
+#endif
+ return (x->blk_skip[blk_idx] >> plane) & 1;
+}
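+
+/* Editor's note (illustrative, not upstream): blk_skip[] bit layout implied by
+ * set_blk_skip() / is_blk_skip():
+ *   bits 0-2 : skip flag for plane 0 (Y), 1 (U), 2 (V)
+ *   bit  3   : always 0
+ *   bits 4-6 : debug-only "not yet initialised" markers for the same planes
+ *   bit  7   : always 0
+ * which is why the debug assert masks with 0x88 to catch stray writes. */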
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
new file mode 100644
index 000000000..f7cff9e53
--- /dev/null
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+static int horizontal_filter(const uint8_t *s) {
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
+static int variance(int sum, int sum_squared, int size) {
+ return sum_squared / size - (sum / size) * (sum / size);
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+// p0 p1 p2 p3
+// q0 q1 q2 q3
+// block edge ->
+// r0 r1 r2 r3
+// s0 s1 s2 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = abs(blockiness from reconstructed buffer -
+// blockiness from source buffer,0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2) - ((q0 + q1 + q2 + q3) / 4 )^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2) - ((r0 + r1 + r2 + r3) / 4 )^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-1];
+ sum_sq_1 += s[-1] * s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge
+// same as above.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// This function currently returns the blockiness for the entire frame by
+// looking at all block borders in steps of 4 pixels.
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch, int width,
+ int height) {
+ double blockiness = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4) {
+ if (i > 0 && i < height && j > 0 && j < width) {
+ blockiness +=
+ blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j,
+ img2_pitch, 4);
+ }
+ }
+ }
+ blockiness /= width * height / 16;
+ return blockiness;
+}
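+
+/* Editor's sketch (not part of upstream libaom): av1_get_blockiness() compares
+ * a source and a reconstructed luma plane of the same size; lower is better,
+ * and 0 means the reconstruction adds no block edges relative to the source.
+ * Assuming two YV12_BUFFER_CONFIG frames are available: */
+#if 0 /* illustrative only, never compiled */
+static double frame_blockiness(const YV12_BUFFER_CONFIG *src,
+                               const YV12_BUFFER_CONFIG *recon) {
+  return av1_get_blockiness(src->y_buffer, src->y_stride, recon->y_buffer,
+                            recon->y_stride, src->y_width, src->y_height);
+}
+#endif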
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
new file mode 100644
index 000000000..57f59f304
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
+ BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
+};
+
+static void alloc_mode_context(AV1_COMMON *cm, int num_pix,
+ PICK_MODE_CONTEXT *ctx) {
+ const int num_planes = av1_num_planes(cm);
+ int i;
+ const int num_blk = num_pix / 16;
+ ctx->num_4x4_blk = num_blk;
+
+ CHECK_MEM_ERROR(cm, ctx->blk_skip, aom_calloc(num_blk, sizeof(uint8_t)));
+ for (i = 0; i < num_planes; ++i) {
+ CHECK_MEM_ERROR(cm, ctx->coeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->coeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->qcoeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->qcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->dqcoeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->eobs[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+ CHECK_MEM_ERROR(
+ cm, ctx->txb_entropy_ctx[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+ }
+
+ if (num_pix <= MAX_PALETTE_SQUARE) {
+ for (i = 0; i < 2; ++i) {
+ CHECK_MEM_ERROR(
+ cm, ctx->color_index_map[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+ }
+ }
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx, const int num_planes) {
+ int i;
+ aom_free(ctx->blk_skip);
+ ctx->blk_skip = 0;
+ for (i = 0; i < num_planes; ++i) {
+ aom_free(ctx->coeff[i]);
+ ctx->coeff[i] = 0;
+ aom_free(ctx->qcoeff[i]);
+ ctx->qcoeff[i] = 0;
+ aom_free(ctx->dqcoeff[i]);
+ ctx->dqcoeff[i] = 0;
+ aom_free(ctx->eobs[i]);
+ ctx->eobs[i] = 0;
+ aom_free(ctx->txb_entropy_ctx[i]);
+ ctx->txb_entropy_ctx[i] = 0;
+ }
+
+ for (i = 0; i < 2; ++i) {
+ aom_free(ctx->color_index_map[i]);
+ ctx->color_index_map[i] = 0;
+ }
+}
+
+static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix,
+ int is_leaf) {
+ alloc_mode_context(cm, num_pix, &tree->none);
+
+ if (is_leaf) return;
+
+ alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0]);
+ alloc_mode_context(cm, num_pix / 2, &tree->vertical[0]);
+
+ alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]);
+ alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]);
+
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0]);
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1]);
+ alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2]);
+
+ alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0]);
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1]);
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2]);
+
+ alloc_mode_context(cm, num_pix / 4, &tree->verticala[0]);
+ alloc_mode_context(cm, num_pix / 4, &tree->verticala[1]);
+ alloc_mode_context(cm, num_pix / 2, &tree->verticala[2]);
+
+ alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0]);
+ alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1]);
+ alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2]);
+
+ for (int i = 0; i < 4; ++i) {
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i]);
+ alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i]);
+ }
+}
+
+static void free_tree_contexts(PC_TREE *tree, const int num_planes) {
+ int i;
+ for (i = 0; i < 3; i++) {
+ free_mode_context(&tree->horizontala[i], num_planes);
+ free_mode_context(&tree->horizontalb[i], num_planes);
+ free_mode_context(&tree->verticala[i], num_planes);
+ free_mode_context(&tree->verticalb[i], num_planes);
+ }
+ for (i = 0; i < 4; ++i) {
+ free_mode_context(&tree->horizontal4[i], num_planes);
+ free_mode_context(&tree->vertical4[i], num_planes);
+ }
+ free_mode_context(&tree->none, num_planes);
+ free_mode_context(&tree->horizontal[0], num_planes);
+ free_mode_context(&tree->horizontal[1], num_planes);
+ free_mode_context(&tree->vertical[0], num_planes);
+ free_mode_context(&tree->vertical[1], num_planes);
+}
+
+// This function sets up a tree of contexts such that at each square partition
+// level there are contexts for none, horizontal, vertical, and split, along
+// with a block_size value and a selected block_size which represent the state
+// of our search.
+void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
+ int i, j;
+ const int tree_nodes_inc = 1024;
+ const int leaf_factor = 4;
+ const int leaf_nodes = 256 * leaf_factor;
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+ int pc_tree_index = 0;
+ PC_TREE *this_pc;
+ int square_index = 1;
+ int nodes;
+
+ aom_free(td->pc_tree);
+ CHECK_MEM_ERROR(cm, td->pc_tree,
+ aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
+ this_pc = &td->pc_tree[0];
+
+ // Sets up all the leaf nodes in the tree.
+ for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ tree->block_size = square[0];
+ alloc_tree_contexts(cm, tree, 16, 1);
+ }
+
+  // Each node has 4 child nodes; fill in each block_size level of the tree
+  // from the leaves to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (i = 0; i < nodes; ++i) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0);
+ tree->block_size = square[square_index];
+ for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+ ++pc_tree_index;
+ }
+ ++square_index;
+ }
+
+ // Set up the root node for the largest superblock size
+ i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+ td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+ td->pc_root[i]->none.best_mode_index = 2;
+ // Set up the root nodes for the rest of the possible superblock sizes
+ while (--i >= 0) {
+ td->pc_root[i] = td->pc_root[i + 1]->split[0];
+ td->pc_root[i]->none.best_mode_index = 2;
+ }
+}
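+
+/* Editor's note (illustrative, not upstream): for a 128x128 superblock the
+ * tree built above holds one node per square block size:
+ *   1024 x BLOCK_4X4 (the leaf_nodes = 256 * 4 entries),
+ *    256 x BLOCK_8X8, 64 x BLOCK_16X16, 16 x BLOCK_32X32,
+ *      4 x BLOCK_64X64 and 1 x BLOCK_128X128,
+ * i.e. exactly tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1 = 1365 entries, with
+ * td->pc_root[] pointing at the subtree root for each supported SB size. */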
+
+void av1_free_pc_tree(ThreadData *td, const int num_planes) {
+ if (td->pc_tree != NULL) {
+ const int tree_nodes_inc = 1024;
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;
+ for (int i = 0; i < tree_nodes; ++i) {
+ free_tree_contexts(&td->pc_tree[i], num_planes);
+ }
+ aom_free(td->pc_tree);
+ td->pc_tree = NULL;
+ }
+}
+
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx) {
+ dst_ctx->mic = src_ctx->mic;
+ dst_ctx->mbmi_ext = src_ctx->mbmi_ext;
+
+ dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
+ dst_ctx->skip = src_ctx->skip;
+ dst_ctx->skippable = src_ctx->skippable;
+ dst_ctx->best_mode_index = src_ctx->best_mode_index;
+
+ memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,
+ sizeof(uint8_t) * src_ctx->num_4x4_blk);
+
+ dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff;
+ dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff;
+ dst_ctx->single_pred_diff = src_ctx->single_pred_diff;
+
+ dst_ctx->rate = src_ctx->rate;
+ dst_ctx->dist = src_ctx->dist;
+ dst_ctx->rdcost = src_ctx->rdcost;
+ dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+
+ memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES);
+ dst_ctx->pred_interp_filter = src_ctx->pred_interp_filter;
+
+ dst_ctx->partition = src_ctx->partition;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
new file mode 100644
index 000000000..4efc34985
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct AV1Common;
+struct ThreadData;
+
+typedef enum {
+ // Search all the partition types in this plane.
+ SEARCH_FULL_PLANE = 0,
+ // Only search none_partition coding block.
+ NONE_PARTITION_PLANE = 1,
+ // Search all the partition types in this plane except split.
+ SEARCH_SAME_PLANE = 2,
+ // Skip search partition on this plane. Go split directly.
+ SPLIT_PLANE = 3,
+} CB_TREE_SEARCH;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct {
+ MB_MODE_INFO mic;
+ MB_MODE_INFO_EXT mbmi_ext;
+ uint8_t *color_index_map[2];
+ uint8_t *blk_skip;
+
+ tran_low_t *coeff[MAX_MB_PLANE];
+ tran_low_t *qcoeff[MAX_MB_PLANE];
+ tran_low_t *dqcoeff[MAX_MB_PLANE];
+ uint16_t *eobs[MAX_MB_PLANE];
+ uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+
+ int num_4x4_blk;
+ int skip;
+  // For the current partition, skippable is set to 1 only if the coefficients
+  // of all Y, U, and V transform blocks are quantized to 0.
+ int skippable;
+ int best_mode_index;
+ int hybrid_pred_diff;
+ int comp_pred_diff;
+ int single_pred_diff;
+ // Skip certain ref frames during RD search of rectangular partitions.
+ int skip_ref_frame_mask;
+
+  // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+  // scope of refactoring.
+ int rate;
+ int64_t dist;
+ int64_t rdcost;
+ int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has
+ // been made.
+
+ // motion vector cache for adaptive motion search control in partition
+ // search loop
+ MV pred_mv[REF_FRAMES];
+ InterpFilter pred_interp_filter;
+ PARTITION_TYPE partition;
+} PICK_MODE_CONTEXT;
+
+typedef struct {
+ int valid;
+ int split;
+ int skip;
+ int64_t rdcost;
+ int sub_block_split[4];
+ int sub_block_skip[4];
+ int64_t sub_block_rdcost[4];
+} PC_TREE_STATS;
+
+typedef struct PC_TREE {
+ int index;
+ PARTITION_TYPE partitioning;
+ BLOCK_SIZE block_size;
+ PICK_MODE_CONTEXT none;
+ PICK_MODE_CONTEXT horizontal[2];
+ PICK_MODE_CONTEXT vertical[2];
+ PICK_MODE_CONTEXT horizontala[3];
+ PICK_MODE_CONTEXT horizontalb[3];
+ PICK_MODE_CONTEXT verticala[3];
+ PICK_MODE_CONTEXT verticalb[3];
+ PICK_MODE_CONTEXT horizontal4[4];
+ PICK_MODE_CONTEXT vertical4[4];
+ CB_TREE_SEARCH cb_search_range;
+ struct PC_TREE *split[4];
+ PC_TREE_STATS pc_tree_stats;
+} PC_TREE;
+
+void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
+void av1_free_pc_tree(struct ThreadData *td, const int num_planes);
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/corner_detect.c b/third_party/aom/av1/encoder/corner_detect.c
new file mode 100644
index 000000000..e4c59dd9c
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_detect.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "av1/encoder/corner_detect.h"
+
+// Fast_9 wrapper
+#define FAST_BARRIER 18
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points) {
+ int num_points;
+ xy *const frm_corners_xy = fast9_detect_nonmax(buf, width, height, stride,
+ FAST_BARRIER, &num_points);
+ num_points = (num_points <= max_points ? num_points : max_points);
+ if (num_points > 0 && frm_corners_xy) {
+ memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points);
+ free(frm_corners_xy);
+ return num_points;
+ }
+ free(frm_corners_xy);
+ return 0;
+}
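+
+/* Editor's sketch (not part of upstream libaom): the `points` array receives
+ * interleaved x,y pairs, so it must have room for 2 * max_points ints. The
+ * cap below is a hypothetical value chosen for the example only. */
+#if 0 /* illustrative only, never compiled */
+static int detect_corners_example(unsigned char *luma, int width, int height,
+                                  int stride) {
+  enum { kMaxCornersExample = 4096 }; /* hypothetical cap for this sketch */
+  int corners[2 * kMaxCornersExample];
+  const int n = fast_corner_detect(luma, width, height, stride, corners,
+                                   kMaxCornersExample);
+  for (int i = 0; i < n; ++i) {
+    const int x = corners[2 * i];      // corner column
+    const int y = corners[2 * i + 1];  // corner row
+    (void)x;
+    (void)y;
+  }
+  return n;
+}
+#endif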
diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h
new file mode 100644
index 000000000..cab59a774
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_detect.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_
+#define AOM_AV1_ENCODER_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points);
+
+#endif // AOM_AV1_ENCODER_CORNER_DETECT_H_
diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c
new file mode 100644
index 000000000..29e934deb
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_match.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/corner_match.h"
+
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.75
+
+/* Compute var(im) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of im,
+ centered at (x, y).
+*/
+static double compute_variance(unsigned char *im, int stride, int x, int y) {
+ int sum = 0;
+ int sumsq = 0;
+ int var;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {
+ sum += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] *
+ im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ }
+ var = sumsq * MATCH_SZ_SQ - sum * sum;
+ return (double)var;
+}
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+ correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+ of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1,
+ int y1, unsigned char *im2, int stride2,
+ int x2, int y2) {
+ int v1, v2;
+ int sum1 = 0;
+ int sum2 = 0;
+ int sumsq2 = 0;
+ int cross = 0;
+ int var2, cov;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {
+ v1 = im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)];
+ v2 = im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
+ sum1 += v1;
+ sum2 += v2;
+ sumsq2 += v2 * v2;
+ cross += v1 * v2;
+ }
+ var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+ cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+ return cov / sqrt((double)var2);
+}
+
+static int is_eligible_point(int pointx, int pointy, int width, int height) {
+ return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 &&
+ pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height);
+}
+
+static int is_eligible_distance(int point1x, int point1y, int point2x,
+ int point2y, int width, int height) {
+ const int thresh = (width < height ? height : width) >> 4;
+ return ((point1x - point2x) * (point1x - point2x) +
+ (point1y - point2y) * (point1y - point2y)) <= thresh * thresh;
+}
+
+static void improve_correspondence(unsigned char *frm, unsigned char *ref,
+ int width, int height, int frm_stride,
+ int ref_stride,
+ Correspondence *correspondences,
+ int num_correspondences) {
+ int i;
+ for (i = 0; i < num_correspondences; ++i) {
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) {
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(correspondences[i].rx + x,
+ correspondences[i].ry + y, width, height))
+ continue;
+ if (!is_eligible_distance(correspondences[i].x, correspondences[i].y,
+ correspondences[i].rx + x,
+ correspondences[i].ry + y, width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ frm, frm_stride, correspondences[i].x, correspondences[i].y, ref,
+ ref_stride, correspondences[i].rx + x, correspondences[i].ry + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ }
+ correspondences[i].rx += best_x;
+ correspondences[i].ry += best_y;
+ }
+ for (i = 0; i < num_correspondences; ++i) {
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y)
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(correspondences[i].x + x,
+ correspondences[i].y + y, width, height))
+ continue;
+ if (!is_eligible_distance(
+ correspondences[i].x + x, correspondences[i].y + y,
+ correspondences[i].rx, correspondences[i].ry, width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ ref, ref_stride, correspondences[i].rx, correspondences[i].ry, frm,
+ frm_stride, correspondences[i].x + x, correspondences[i].y + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ correspondences[i].x += best_x;
+ correspondences[i].y += best_y;
+ }
+}
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ int *correspondence_pts) {
+ // TODO(sarahparker) Improve this to include 2-way match
+ int i, j;
+ Correspondence *correspondences = (Correspondence *)correspondence_pts;
+ int num_correspondences = 0;
+ for (i = 0; i < num_frm_corners; ++i) {
+ double best_match_ncc = 0.0;
+ double template_norm;
+ int best_match_j = -1;
+ if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width,
+ height))
+ continue;
+ for (j = 0; j < num_ref_corners; ++j) {
+ double match_ncc;
+ if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
+ height))
+ continue;
+ if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1],
+ ref_corners[2 * j], ref_corners[2 * j + 1],
+ width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref,
+ ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_match_j = j;
+ }
+ }
+ // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
+ // but need to account for the normalization in compute_cross_correlation.
+ template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
+ frm_corners[2 * i + 1]);
+ if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
+ correspondences[num_correspondences].x = frm_corners[2 * i];
+ correspondences[num_correspondences].y = frm_corners[2 * i + 1];
+ correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
+ correspondences[num_correspondences].ry =
+ ref_corners[2 * best_match_j + 1];
+ num_correspondences++;
+ }
+ }
+ improve_correspondence(frm, ref, width, height, frm_stride, ref_stride,
+ correspondences, num_correspondences);
+ return num_correspondences;
+}
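+
+/* Editor's note (illustrative, not upstream): why the threshold test above
+ * reduces to a plain NCC check. With N = MATCH_SZ_SQ window samples,
+ * compute_cross_correlation() returns
+ *   (N*cross - sum1*sum2) / sqrt(N*sumsq2 - sum2*sum2) = N * stddev(im1) * NCC
+ * and compute_variance() returns N*sumsq1 - sum1*sum1 = (N * stddev(im1))^2,
+ * so best_match_ncc > THRESHOLD_NCC * sqrt(template_norm) is equivalent to
+ * NCC > THRESHOLD_NCC (= 0.75). */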
diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h
new file mode 100644
index 000000000..535d2faed
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_match.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_
+#define AOM_AV1_ENCODER_CORNER_MATCH_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#define MATCH_SZ 13
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+
+typedef struct {
+ int x, y;
+ int rx, ry;
+} Correspondence;
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ int *correspondence_pts);
+
+#endif // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
new file mode 100644
index 000000000..323e2aed5
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+// round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255.
+const uint16_t av1_prob_cost[128] = {
+ 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+ 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+ 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+ 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+ 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+ 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+ 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73,
+ 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26,
+ 23, 20, 18, 15, 12, 9, 6, 3,
+};
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map) {
+ int i;
+ aom_cdf_prob prev_cdf = 0;
+ for (i = 0;; ++i) {
+ aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;
+ p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15;
+ prev_cdf = AOM_ICDF(cdf[i]);
+
+ if (inv_map)
+ costs[inv_map[i]] = av1_cost_symbol(p15);
+ else
+ costs[i] = av1_cost_symbol(p15);
+
+ // Stop once we reach the end of the CDF
+ if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break;
+ }
+}
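+
+/* Editor's sketch (not part of upstream libaom): typical use is to turn a
+ * frame-context CDF into a per-symbol bit-cost table once per frame, e.g. for
+ * the skip flag. This assumes `fc` is the current FRAME_CONTEXT and `x` the
+ * MACROBLOCK whose rate tables are being filled. */
+#if 0 /* illustrative only, never compiled */
+static void fill_skip_costs_example(MACROBLOCK *x, const FRAME_CONTEXT *fc) {
+  for (int ctx = 0; ctx < SKIP_CONTEXTS; ++ctx) {
+    // Costs come out in units of 1 << AV1_PROB_COST_SHIFT (512 per bit) and
+    // can be added directly into RD rate terms.
+    av1_cost_tokens_from_cdf(x->skip_cost[ctx], fc->skip_cdfs[ctx], NULL);
+  }
+}
+#endif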
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
new file mode 100644
index 000000000..af5b09837
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
+
+#include "aom_dsp/prob.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t av1_prob_cost[128];
+
+// The factor to scale from cost in bits to cost in av1_prob_cost units.
+#define AV1_PROB_COST_SHIFT 9
+
+// Cost of coding an n bit literal, using 128 (i.e. 50%) probability
+// for each bit.
+#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
+
+// Calculate the cost of a symbol with probability p15 / 2^15
+static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
+ assert(0 < p15 && p15 < CDF_PROB_TOP);
+ const int shift = CDF_PROB_BITS - 1 - get_msb(p15);
+ const int prob = get_prob(p15 << shift, CDF_PROB_TOP);
+ assert(prob >= 128);
+ return av1_prob_cost[prob - 128] + av1_cost_literal(shift);
+}
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_COST_H_
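A worked example for av1_cost_symbol() above (derived from the code, not taken from the patch): for p15 = 4096, i.e. a symbol probability of 1/8, get_msb(4096) = 12, so shift = 15 - 1 - 12 = 2; p15 << 2 = 16384 scales via get_prob() to prob = 128, and the result is av1_prob_cost[0] + av1_cost_literal(2) = 512 + 1024 = 1536. That is exactly 3 bits expressed in the 1/512-bit units implied by AV1_PROB_COST_SHIFT = 9, matching -log2(1/8) * 512.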
diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c
new file mode 100644
index 000000000..04088b25f
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {
+ *a++ = (r = *x++) * 2;
+ *b++ = *x - ((r + x[1] + 1) >> 1);
+ x++;
+ }
+ *a = (r = *x++) * 2;
+ *b = *x - r;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {
+ *a++ = (r = *x++);
+ *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+ x++;
+ }
+ *a = (r = *x++);
+ *b = (*x - r + 1) >> 1;
+
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+ uint8_t *x, int pitch_x,
+ tran_low_t *c, int pitch_c,
+ int dwt_scale_bits, int hbd) {
+ int lv, i, j, nh, nw, hh = height, hw = width;
+ tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+ if (hbd) {
+ uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ } else {
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ }
+
+ for (lv = 0; lv < levels; lv++) {
+ nh = hh;
+ hh = (hh + 1) >> 1;
+ nw = hw;
+ hw = (hw + 1) >> 1;
+ if ((nh < 2) || (nw < 2)) return;
+ for (i = 0; i < nh; i++) {
+ memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+ analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+ }
+ for (j = 0; j < nw; j++) {
+ for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+ analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+ for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+ }
+ }
+}
+
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
+ int hbd) {
+ dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+}
+
+int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+ int acsad = 0;
+
+ for (int r = 0; r < bh; ++r)
+ for (int c = 0; c < bw; ++c) {
+ if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]);
+ }
+ return acsad;
+}
+
+uint64_t av1_dct_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+ uint64_t acsad = 0;
+
+ for (int r = 0; r < bh; ++r)
+ for (int c = 0; c < bw; ++c) {
+ if (r > 0 || c > 0) acsad += abs(output[r * stride + c]);
+ }
+
+ return acsad;
+}
+
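+// Returns bw * bh times the block variance: the sum of squared samples minus
+// the squared sum divided by the sample count.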
+uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) {
+ int sum = 0;
+ uint32_t sse = 0;
+
+ for (int r = 0; r < bh; ++r)
+ for (int c = 0; c < bw; ++c) {
+ sum += input[r * stride + c];
+ sse += input[r * stride + c] * input[r * stride + c];
+ }
+ return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh));
+}
+
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) {
+ tran_low_t output[64];
+
+ av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+ return av1_haar_ac_sad(output, 8, 8, 8);
+}
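A minimal usage sketch for the 8x8 Haar AC helper defined above (illustrative only; it assumes dwt.c above is compiled and linked in):

  #include <stdint.h>
  #include <stdio.h>

  /* Declared in av1/encoder/dwt.h; implemented in dwt.c above. */
  int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);

  int main(void) {
    uint8_t flat[64], checker[64];
    for (int i = 0; i < 64; ++i) {
      flat[i] = 128;                               /* constant block: no AC energy */
      checker[i] = ((i + (i >> 3)) & 1) ? 255 : 0; /* checkerboard: mostly AC energy */
    }
    /* hbd = 0: plain 8-bit input, stride 8 for a packed 8x8 block. */
    printf("flat:    %d\n", av1_haar_ac_sad_8x8_uint8_input(flat, 8, 0));
    printf("checker: %d\n", av1_haar_ac_sad_8x8_uint8_input(checker, 8, 0));
    return 0;
  }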
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
new file mode 100644
index 000000000..37306c6a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_DWT_H_
+#define AOM_AV1_ENCODER_DWT_H_
+
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+
+#define DWT_MAX_LENGTH 64
+
+void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride);
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
+ int hbd);
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);
+
+#endif // AOM_AV1_ENCODER_DWT_H_
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
new file mode 100644
index 000000000..cb226c59e
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -0,0 +1,5739 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/partition_model_weights.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rate);
+
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+};
+
+static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16
+};
+
+#if CONFIG_FP_MB_STATS
+static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 1, 1, 1, 2, 2, 4
+};
+static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = {
+ 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 1, 1, 2, 1, 4, 2
+};
+#endif // CONFIG_FP_MB_STATS
+
+unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
+ unsigned int sse;
+ const unsigned int var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse);
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+
+unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd) {
+ unsigned int var, sse;
+ switch (bd) {
+ case 10:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10), 0, &sse);
+ break;
+ case 12:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12), 0, &sse);
+ break;
+ case 8:
+ default:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8), 0, &sse);
+ break;
+ }
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+
+static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
+ const struct buf_2d *ref,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bs) {
+ unsigned int sse, var;
+ uint8_t *last_y;
+ const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
+
+ assert(last != NULL);
+ last_y =
+ &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
+ var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
+ return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
+}
+
+static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x,
+ int mi_row, int mi_col) {
+ unsigned int var = get_sby_perpixel_diff_variance(
+ cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
+ if (var < 8)
+ return BLOCK_64X64;
+ else if (var < 128)
+ return BLOCK_32X32;
+ else if (var < 2048)
+ return BLOCK_16X16;
+ else
+ return BLOCK_8X8;
+}
+
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static void set_mode_info_offsets(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, MACROBLOCKD *const xd,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int idx_str = xd->mi_stride * mi_row + mi_col;
+ xd->mi = cm->mi_grid_visible + idx_str;
+ xd->mi[0] = cm->mi + idx_str;
+ x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+}
+
+static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+ set_skip_context(xd, mi_row, mi_col, num_planes);
+ xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col, 0, num_planes);
+
+ // Set up limit values for MV components.
+ // MVs beyond this range do not produce a new/different prediction block.
+ x->mv_limits.row_min =
+ -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
+ x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+
+ set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
+ cm->mi_cols);
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+
+ // R/D setup.
+ x->rdmult = cpi->rd.RDMULT;
+
+ // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
+ xd->tile = *tile;
+}
+
+static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ const struct segmentation *const seg = &cm->seg;
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+ mbmi = xd->mi[0];
+ xd->cfl.mi_row = mi_row;
+ xd->cfl.mi_col = mi_col;
+
+ mbmi->segment_id = 0;
+
+ // Setup segment ID.
+ if (seg->enabled) {
+ if (seg->enabled && !cpi->vaq_refresh) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mbmi->segment_id =
+ map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0;
+ }
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
+ }
+}
+
+static void reset_intmv_filter_type(MB_MODE_INFO *mbmi) {
+ InterpFilter filters[2];
+
+ for (int dir = 0; dir < 2; ++dir) {
+ filters[dir] = av1_extract_interp_filter(mbmi->interp_filters, dir);
+ }
+ mbmi->interp_filters = av1_make_interp_filters(filters[0], filters[1]);
+}
+
+static void update_filter_type_count(uint8_t allow_update_cdf,
+ FRAME_COUNTS *counts,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir);
+ ++counts->switchable_interp[ctx][filter];
+ if (allow_update_cdf) {
+ update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter,
+ SWITCHABLE_FILTERS);
+ }
+ }
+}
+
+static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize,
+ const MB_MODE_INFO *mbmi,
+ RD_COUNTS *rdc) {
+ if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) {
+ const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize];
+ int ref;
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s;
+ }
+ }
+}
+
+static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
+ const TX_MODE tx_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ if (xd->lossless[mbmi->segment_id]) {
+ mbmi->tx_size = TX_4X4;
+ } else if (tx_mode != TX_MODE_SELECT) {
+ mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode);
+ } else {
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
+ mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size);
+ }
+ if (is_inter_block(mbmi)) {
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ }
+ memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
+ av1_zero(x->blk_skip);
+ x->skip = 0;
+}
+
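+// Commit the mode decision stored in the pick-mode context: copy its mode
+// info into the frame-level mi grid, point the per-plane coefficient buffers
+// at the context's buffers, and (when not a dry run) update the encoder's
+// mode, filter and global-motion statistics.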
+static void update_state(const AV1_COMP *const cpi,
+ const TileDataEnc *const tile_data, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int i, x_idx, y;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const MB_MODE_INFO *const mi = &ctx->mic;
+ MB_MODE_INFO *const mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ const int bw = mi_size_wide[mi->sb_type];
+ const int bh = mi_size_high[mi->sb_type];
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ assert(mi->sb_type == bsize);
+
+ *mi_addr = *mi;
+ *x->mbmi_ext = ctx->mbmi_ext;
+
+ reset_intmv_filter_type(mi_addr);
+
+ memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+
+ x->skip = ctx->skip;
+
+ // If segmentation is in use.
+ if (seg->enabled) {
+ // For in frame complexity AQ copy the segment id from the segment map.
+ if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi_addr->segment_id =
+ map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0;
+ reset_tx_size(x, mi_addr, cm->tx_mode);
+ }
+ // Else for cyclic refresh mode update the segment map, set the segment id
+ // and then update the quantizer.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
+ ctx->rate, ctx->dist, x->skip);
+ reset_tx_size(x, mi_addr, cm->tx_mode);
+ }
+ if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+ mi_addr->uv_mode = UV_DC_PRED;
+ }
+
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+ // Restore the coding context of the MB to the state that was in place
+ // when the mode was picked for it.
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+ xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
+
+ if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+ {
+ unsigned int *const mode_chosen_counts =
+ (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
+ if (frame_is_intra_only(cm)) {
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/,
+ THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/,
+ THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/,
+ THR_D113_PRED /*D113_PRED*/,
+ THR_D157_PRED /*D157_PRED*/,
+ THR_D203_PRED /*D203_PRED*/,
+ THR_D67_PRED /*D67_PRED*/,
+ THR_SMOOTH, /*SMOOTH_PRED*/
+ THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+ THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+ THR_PAETH /*PAETH_PRED*/,
+ };
+ ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+ } else {
+ // Note how often each mode is chosen as best.
+ ++mode_chosen_counts[ctx->best_mode_index];
+ }
+ }
+#endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(mi_addr)) {
+ // TODO(sarahparker): global motion stats need to be handled per-tile
+ // to be compatible with tile-based threading.
+ update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
+ }
+
+ if (cm->interp_filter == SWITCHABLE &&
+ mi_addr->motion_mode != WARPED_CAUSAL &&
+ !is_nontrans_global_motion(xd, xd->mi[0])) {
+ update_filter_type_count(tile_data->allow_update_cdf, td->counts, xd,
+ mi_addr);
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ }
+
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+ av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+ int mi_row, int mi_col, const int num_planes) {
+ // Set current frame pointer.
+ x->e_mbd.cur_buf = src;
+
+ // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+ // the static analysis warnings.
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) {
+ const int is_uv = i > 0;
+ setup_pred_plane(&x->plane[i].src, x->e_mbd.mi[0]->sb_type, src->buffers[i],
+ src->crop_widths[is_uv], src->crop_heights[is_uv],
+ src->strides[is_uv], mi_row, mi_col, NULL,
+ x->e_mbd.plane[i].subsampling_x,
+ x->e_mbd.plane[i].subsampling_y);
+ }
+}
+
+static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ int8_t segment_id) {
+ const AV1_COMMON *const cm = &cpi->common;
+ av1_init_plane_quantizers(cpi, x, segment_id);
+ aom_clear_system_state();
+ int segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+}
+
+static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ return av1_compute_rd_mult(
+ cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q);
+}
+
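+// Run the rate-distortion mode search for one block: set up the coding
+// context and rdmult for the active AQ / delta-q mode, pick the best intra
+// or inter mode, and store the resulting rate, distortion and RD cost in ctx.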
+static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, PARTITION_TYPE partition,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ MB_MODE_INFO *ctx_mbmi = &ctx->mic;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+ const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
+ int i, orig_rdmult;
+
+ if (best_rd < 0) {
+ ctx->rdcost = INT64_MAX;
+ ctx->skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
+ aom_clear_system_state();
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ mbmi = xd->mi[0];
+
+ if (ctx->rd_mode_is_ready) {
+ assert(ctx_mbmi->sb_type == bsize);
+ assert(ctx_mbmi->partition == partition);
+ *mbmi = *ctx_mbmi;
+ rd_cost->rate = ctx->rate;
+ rd_cost->dist = ctx->dist;
+ rd_cost->rdcost = ctx->rdcost;
+ } else {
+ mbmi->sb_type = bsize;
+ mbmi->partition = partition;
+ }
+
+#if CONFIG_RD_DEBUG
+ mbmi->mi_row = mi_row;
+ mbmi->mi_col = mi_col;
+#endif
+
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+ if (!ctx->rd_mode_is_ready) {
+ ctx->skippable = 0;
+
+ // Set to zero to make sure we do not use the previously encoded frame stats
+ mbmi->skip = 0;
+
+ // Reset skip mode flag.
+ mbmi->skip_mode = 0;
+ }
+
+ x->skip_chroma_rd =
+ !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+
+ if (ctx->rd_mode_is_ready) {
+ x->skip = ctx->skip;
+ *x->mbmi_ext = ctx->mbmi_ext;
+ return;
+ }
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->source_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ x->source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ orig_rdmult = x->rdmult;
+
+ if (aq_mode == VARIANCE_AQ) {
+ if (cpi->vaq_refresh) {
+ const int energy = bsize <= BLOCK_16X16
+ ? x->mb_energy
+ : av1_log_block_var(cpi, x, bsize);
+ mbmi->segment_id = energy;
+ }
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == COMPLEXITY_AQ) {
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+ x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+ }
+
+ if (deltaq_mode > 0) x->rdmult = set_deltaq_rdmult(cpi, xd);
+
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+ av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx,
+ best_rd);
+ } else {
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx, best_rd);
+ } else {
+ av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ bsize, ctx, best_rd);
+ }
+ }
+
+ // Examine the resulting rate and for AQ mode 2 make a segment choice.
+ if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) &&
+ (bsize >= BLOCK_16X16) &&
+ (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+ cpi->refresh_alt2_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+ av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+ }
+
+ x->rdmult = orig_rdmult;
+
+ // TODO(jingning) The rate-distortion optimization flow needs to be
+ // refactored to provide proper exit/return handling.
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
+ ctx->rdcost = rd_cost->rdcost;
+}
+
+static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+ PREDICTION_MODE mode, int16_t mode_context,
+ uint8_t allow_update_cdf) {
+ (void)counts;
+
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+ if (mode == NEWMV) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->newmv_mode[mode_ctx][0];
+#endif
+ if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 0, 2);
+ return;
+ } else {
+#if CONFIG_ENTROPY_STATS
+ ++counts->newmv_mode[mode_ctx][1];
+#endif
+ if (allow_update_cdf) update_cdf(fc->newmv_cdf[mode_ctx], 1, 2);
+
+ mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ if (mode == GLOBALMV) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->zeromv_mode[mode_ctx][0];
+#endif
+ if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2);
+ return;
+ } else {
+#if CONFIG_ENTROPY_STATS
+ ++counts->zeromv_mode[mode_ctx][1];
+#endif
+ if (allow_update_cdf) update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2);
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+ ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2);
+ }
+ }
+}
+
+static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ FRAME_COUNTS *counts, uint8_t allow_update_cdf) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+ (void)counts;
+
+ if (mbmi->mode == DC_PRED) {
+ const int n = pmi->palette_size[0];
+ const int palette_mode_ctx = av1_get_palette_mode_ctx(xd);
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0];
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx],
+ n > 0, 2);
+ if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx],
+ n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+ }
+ }
+ }
+
+ if (mbmi->uv_mode == UV_DC_PRED) {
+ const int n = pmi->palette_size[1];
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0];
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2);
+
+ if (n > 0) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx],
+ n - PALETTE_MIN_SIZE, PALETTE_SIZES);
+ }
+ }
+ }
+}
+
+static void sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+ MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi, const int intraonly,
+ const int mi_row, const int mi_col,
+ uint8_t allow_update_cdf) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const PREDICTION_MODE y_mode = mbmi->mode;
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ (void)counts;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+
+ if (intraonly) {
+#if CONFIG_ENTROPY_STATS
+ const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[above];
+ const int left_ctx = intra_mode_context[left];
+ ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+ } else {
+#if CONFIG_ENTROPY_STATS
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+ }
+
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra;
+#if CONFIG_ENTROPY_STATS
+ ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode];
+ if (use_filter_intra_mode) {
+ ++counts
+ ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
+ }
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode,
+ 2);
+ if (use_filter_intra_mode) {
+ update_cdf(fc->filter_intra_mode_cdf,
+ mbmi->filter_intra_mode_info.filter_intra_mode,
+ FILTER_INTRA_MODES);
+ }
+ }
+ }
+ if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[mbmi->mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+ }
+
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y))
+ return;
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->uv_mode[is_cfl_allowed(xd)][y_mode][uv_mode];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+ update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+ UV_INTRA_MODES - !cfl_allowed);
+ }
+ if (uv_mode == UV_CFL_PRED) {
+ const int joint_sign = mbmi->cfl_alpha_signs;
+ const int idx = mbmi->cfl_alpha_idx;
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_sign[joint_sign];
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
+#endif
+ if (allow_update_cdf)
+ update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
+#endif
+ if (allow_update_cdf)
+ update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+ }
+ }
+ if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
+ av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[uv_mode - UV_V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+ }
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
+ update_palette_cdf(xd, mbmi, counts, allow_update_cdf);
+}
+
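+// Update entropy-coding statistics (CDFs and, with CONFIG_ENTROPY_STATS, raw
+// counts) for the block's final coding decisions: skip flags, delta-q and
+// delta-lf, intra modes, reference frames, motion modes and inter modes.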
+static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data,
+ ThreadData *td, int mi_row, int mi_col) {
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const uint8_t allow_update_cdf = tile_data->allow_update_cdf;
+
+ // delta quant applies to both intra and inter
+ const int super_block_upper_left =
+ ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+ ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
+ if (cm->skip_mode_flag && !seg_ref_active && is_comp_ref_allowed(bsize)) {
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+ }
+
+ if (!mbmi->skip_mode) {
+ if (!seg_ref_active) {
+ const int skip_ctx = av1_get_skip_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip[skip_ctx][mbmi->skip]++;
+#endif
+ if (allow_update_cdf) update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2);
+ }
+ }
+
+ if (cm->delta_q_present_flag &&
+ (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
+ super_block_upper_left) {
+#if CONFIG_ENTROPY_STATS
+ const int dq =
+ (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
+ const int absdq = abs(dq);
+ for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+ td->counts->delta_q[i][1]++;
+ }
+ if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+#endif
+ xd->current_qindex = mbmi->current_qindex;
+ if (cm->delta_lf_present_flag) {
+ if (cm->delta_lf_multi) {
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+#if CONFIG_ENTROPY_STATS
+ const int delta_lf =
+ (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf_multi[lf_id][i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
+#endif
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ }
+ } else {
+#if CONFIG_ENTROPY_STATS
+ const int delta_lf =
+ (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+ cm->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf[i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf[abs_delta_lf][0]++;
+#endif
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+
+ if (!is_inter_block(mbmi)) {
+ sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
+ frame_is_intra_only(cm), mi_row, mi_col,
+ tile_data->allow_update_cdf);
+ }
+
+ if (av1_allow_intrabc(cm)) {
+ if (allow_update_cdf)
+ update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intrabc[is_intrabc_block(mbmi)];
+#endif // CONFIG_ENTROPY_STATS
+ }
+
+ if (!frame_is_intra_only(cm)) {
+ RD_COUNTS *rdc = &td->rd_counts;
+
+ FRAME_COUNTS *const counts = td->counts;
+
+ if (mbmi->skip_mode) {
+ rdc->skip_mode_used_flag = 1;
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ assert(has_second_ref(mbmi));
+ rdc->compound_ref_used_flag = 1;
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ return;
+ }
+
+ const int inter_block = is_inter_block(mbmi);
+
+ if (!seg_ref_active) {
+#if CONFIG_ENTROPY_STATS
+ counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+ inter_block, 2);
+ }
+ // If the segment reference feature is enabled, we have only a single
+ // reference frame allowed for the segment, so exclude it from
+ // the reference frame counts used to work out probabilities.
+ if (inter_block) {
+ const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ if (has_second_ref(mbmi))
+ // This flag is also updated for 4x4 blocks
+ rdc->compound_ref_used_flag = 1;
+ if (is_comp_ref_allowed(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->comp_inter[av1_get_reference_mode_context(xd)]
+ [has_second_ref(mbmi)]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi),
+ 2);
+ }
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ if (allow_update_cdf) {
+ update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+ COMP_REFERENCE_TYPES);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+ [comp_ref_type]++;
+#endif // CONFIG_ENTROPY_STATS
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = (ref0 == BWDREF_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0]
+ [bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+ [bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit1) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+ ref1 == GOLDEN_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)]
+ [2][ref1 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
+ const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_comp_ref_p1(xd),
+ ref0 == LAST2_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+ [ref0 == LAST2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_comp_ref_p2(xd),
+ ref0 == GOLDEN_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
+ [ref0 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd),
+ ref1 == ALTREF_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
+ [ref1 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref1 != ALTREF_FRAME) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+ ref1 == ALTREF2_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+ [ref1 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
+ const int bit = (ref0 >= BWDREF_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit) {
+ assert(ref0 <= ALTREF_FRAME);
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_single_ref_p2(xd),
+ ref0 == ALTREF_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref0 != ALTREF_FRAME) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+ ref0 == ALTREF2_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+ [ref0 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ } else {
+ const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts
+ ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit1) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_single_ref_p4(xd),
+ ref0 != LAST_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+ [ref0 != LAST_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_single_ref_p5(xd),
+ ref0 != LAST3_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+ [ref0 != LAST3_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ }
+
+ if (cm->seq_params.enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][1]++;
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->interintra_mode_cdf[bsize_group],
+ mbmi->interintra_mode, INTERINTRA_MODES);
+ }
+ if (is_interintra_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->wedge_interintra_cdf[bsize],
+ mbmi->use_wedge_interintra, 2);
+ }
+ if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->wedge_idx_cdf[bsize],
+ mbmi->interintra_wedge_index, 16);
+ }
+ }
+ }
+ } else {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][0]++;
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+ }
+ }
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ const MOTION_MODE motion_allowed =
+ cm->switchable_motion_mode
+ ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ if (mbmi->ref_frame[1] != INTRA_FRAME) {
+ if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->motion_mode[bsize][mbmi->motion_mode]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
+ MOTION_MODES);
+ }
+ } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL,
+ 2);
+ }
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ assert(cm->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+ const int masked_compound_used =
+ is_any_masked_compound_used(bsize) &&
+ cm->seq_params.enable_masked_compound;
+ if (masked_compound_used) {
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+ mbmi->comp_group_idx, 2);
+ }
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->compound_index_cdf[comp_index_ctx],
+ mbmi->compound_idx, 2);
+ }
+ } else {
+ assert(masked_compound_used);
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_type[bsize][mbmi->interinter_comp.type - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->compound_type_cdf[bsize],
+ mbmi->interinter_comp.type - 1, COMPOUND_TYPES - 1);
+ }
+ }
+ }
+ }
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->wedge_idx_cdf[bsize],
+ mbmi->interinter_comp.wedge_index, 16);
+ }
+ }
+ }
+ }
+ }
+
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ int16_t mode_ctx;
+ const PREDICTION_MODE mode = mbmi->mode;
+
+ mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+ if (has_second_ref(mbmi)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
+ } else {
+ update_inter_mode_stats(fc, counts, mode, mode_ctx, allow_update_cdf);
+ }
+
+ int mode_allowed = (mbmi->mode == NEWMV);
+ mode_allowed |= (mbmi->mode == NEW_NEWMV);
+ if (mode_allowed) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
+
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+#if CONFIG_ENTROPY_STATS
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif
+
+ if (mbmi->ref_mv_idx == idx) break;
+ }
+ }
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
+
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+#if CONFIG_ENTROPY_STATS
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+#endif
+
+ if (mbmi->ref_mv_idx == idx - 1) break;
+ }
+ }
+ }
+ }
+ }
+}
+
+typedef struct {
+ ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
+ PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+ PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+ TXFM_CONTEXT *p_ta;
+ TXFM_CONTEXT *p_tl;
+ TXFM_CONTEXT ta[MAX_MIB_SIZE];
+ TXFM_CONTEXT tl[MAX_MIB_SIZE];
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
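+// Restore the above/left entropy, partition and txfm contexts buffered by
+// save_context() below, so that a discarded partition search leaves the
+// coding state untouched.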
+static void restore_context(MACROBLOCK *x,
+ const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ const int num_planes) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int p;
+ const int num_4x4_blocks_wide =
+ block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_blocks_high =
+ block_size_high[bsize] >> tx_size_high_log2[0];
+ int mi_width = mi_size_wide[bsize];
+ int mi_height = mi_size_high[bsize];
+ for (p = 0; p < num_planes; p++) {
+ int tx_col = mi_col;
+ int tx_row = mi_row & MAX_MIB_MASK;
+ memcpy(xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+ ctx->a + num_4x4_blocks_wide * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ memcpy(xd->left_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+ ctx->l + num_4x4_blocks_high * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
+ }
+ memcpy(xd->above_seg_context + mi_col, ctx->sa,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl,
+ sizeof(xd->left_seg_context[0]) * mi_height);
+ xd->above_txfm_context = ctx->p_ta;
+ xd->left_txfm_context = ctx->p_tl;
+ memcpy(xd->above_txfm_context, ctx->ta,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(xd->left_txfm_context, ctx->tl,
+ sizeof(*xd->left_txfm_context) * mi_height);
+}
+
+static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ const int num_planes) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ int p;
+ const int num_4x4_blocks_wide =
+ block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_blocks_high =
+ block_size_high[bsize] >> tx_size_high_log2[0];
+ int mi_width = mi_size_wide[bsize];
+ int mi_height = mi_size_high[bsize];
+
+ // buffer the above/left context information of the block in search.
+ for (p = 0; p < num_planes; ++p) {
+ int tx_col = mi_col;
+ int tx_row = mi_row & MAX_MIB_MASK;
+ memcpy(ctx->a + num_4x4_blocks_wide * p,
+ xd->above_context[p] + (tx_col >> xd->plane[p].subsampling_x),
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ memcpy(ctx->l + num_4x4_blocks_high * p,
+ xd->left_context[p] + (tx_row >> xd->plane[p].subsampling_y),
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
+ }
+ memcpy(ctx->sa, xd->above_seg_context + mi_col,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
+ sizeof(xd->left_seg_context[0]) * mi_height);
+ memcpy(ctx->ta, xd->above_txfm_context,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(ctx->tl, xd->left_txfm_context,
+ sizeof(*xd->left_txfm_context) * mi_height);
+ ctx->p_ta = xd->above_txfm_context;
+ ctx->p_tl = xd->left_txfm_context;
+}
+
+static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PARTITION_TYPE partition,
+ const PICK_MODE_CONTEXT *const ctx, int *rate) {
+ TileInfo *const tile = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->partition = partition;
+ update_state(cpi, tile_data, td, ctx, mi_row, mi_col, bsize, dry_run);
+
+ if (!dry_run) av1_set_coeff_buffer(cpi, x, mi_row, mi_col);
+
+ encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize,
+ rate);
+
+ if (dry_run == 0)
+ x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
+
+ if (!dry_run) {
+ if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 &&
+ cpi->common.delta_lf_present_flag) {
+ const int frame_lf_count = av1_num_planes(&cpi->common) > 1
+ ? FRAME_LF_COUNT
+ : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
+ mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
+ mbmi->delta_lf_from_base = xd->delta_lf_from_base;
+ }
+ if (has_second_ref(mbmi)) {
+ if (mbmi->compound_idx == 0 ||
+ mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+ mbmi->comp_group_idx = 0;
+ else
+ mbmi->comp_group_idx = 1;
+ }
+ update_stats(&cpi->common, tile_data, td, mi_row, mi_col);
+ }
+}
+
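+// Recursively encode a block according to the partitioning recorded in
+// pc_tree, updating partition statistics and dispatching each leaf block to
+// encode_b().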
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row,
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int is_partition_root = bsize >= BLOCK_8X8;
+ const int ctx = is_partition_root
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : -1;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ int quarter_step = mi_size_wide[bsize] / 4;
+ int i;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (!dry_run && ctx >= 0) {
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+ if (has_rows && has_cols) {
+#if CONFIG_ENTROPY_STATS
+ td->counts->partition[ctx][partition]++;
+#endif
+
+ if (tile_data->allow_update_cdf) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ update_cdf(fc->partition_cdf[ctx], partition,
+ partition_cdf_length(bsize));
+ }
+ }
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->none, rate);
+ break;
+ case PARTITION_VERT:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->vertical[0], rate);
+ if (mi_col + hbs < cm->mi_cols) {
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, &pc_tree->vertical[1], rate);
+ }
+ break;
+ case PARTITION_HORZ:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontal[0], rate);
+ if (mi_row + hbs < cm->mi_rows) {
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontal[1], rate);
+ }
+ break;
+ case PARTITION_SPLIT:
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+ pc_tree->split[0], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ pc_tree->split[1], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ pc_tree->split[2], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ subsize, pc_tree->split[3], rate);
+ break;
+
+ case PARTITION_HORZ_A:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+ partition, &pc_tree->horizontala[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontala[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontala[2], rate);
+ break;
+ case PARTITION_HORZ_B:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontalb[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ bsize2, partition, &pc_tree->horizontalb[2], rate);
+ break;
+ case PARTITION_VERT_A:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+ partition, &pc_tree->verticala[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->verticala[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, &pc_tree->verticala[2], rate);
+
+ break;
+ case PARTITION_VERT_B:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->verticalb[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ bsize2, partition, &pc_tree->verticalb[2], rate);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= cm->mi_rows) break;
+
+ encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontal4[i], rate);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= cm->mi_cols) break;
+
+ encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
+ partition, &pc_tree->vertical4[i], rate);
+ }
+ break;
+ default: assert(0 && "Invalid partition type."); break;
+ }
+
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not, return the largest allowed partition size.
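+// Note: square block sizes are spaced three entries apart in the BLOCK_SIZE
+// enum, so the "bsize -= 3" step below walks down through the square sizes
+// (e.g. 64x64 -> 32x32 -> 16x16 -> 8x8 -> 4x4).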
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+ int cols_left, int *bh, int *bw) {
+ if (rows_left <= 0 || cols_left <= 0) {
+ return AOMMIN(bsize, BLOCK_8X8);
+ } else {
+ for (; bsize > 0; bsize -= 3) {
+ *bh = mi_size_high[bsize];
+ *bw = mi_size_wide[bsize];
+ if ((*bh <= rows_left) && (*bw <= cols_left)) {
+ break;
+ }
+ }
+ }
+ return bsize;
+}
+
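+// Fills the mode info entries of a partially visible superblock, shrinking
+// the requested block size wherever rows or columns run off the edge of the
+// frame.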
+static void set_partial_sb_partition(const AV1_COMMON *const cm,
+ MB_MODE_INFO *mi, int bh_in, int bw_in,
+ int mi_rows_remaining,
+ int mi_cols_remaining, BLOCK_SIZE bsize,
+ MB_MODE_INFO **mib) {
+ int bh = bh_in;
+ int r, c;
+ for (r = 0; r < cm->seq_params.mib_size; r += bh) {
+ int bw = bw_in;
+ for (c = 0; c < cm->seq_params.mib_size; c += bw) {
+ const int index = r * cm->mi_stride + c;
+ mib[index] = mi + index;
+ mib[index]->sb_type = find_partition_size(
+ bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
+ }
+ }
+}
+
+// This function attempts to set all mode info entries in a given superblock
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed, in which case this code attempts to choose the largest
+// allowable partition.
+static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MB_MODE_INFO **mib, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
+ int block_row, block_col;
+ MB_MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col;
+ int bh = mi_size_high[bsize];
+ int bw = mi_size_wide[bsize];
+
+ assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
+
+ // Apply the requested partition size to the SB if it is all "in image"
+ if ((mi_cols_remaining >= cm->seq_params.mib_size) &&
+ (mi_rows_remaining >= cm->seq_params.mib_size)) {
+ for (block_row = 0; block_row < cm->seq_params.mib_size; block_row += bh) {
+ for (block_col = 0; block_col < cm->seq_params.mib_size;
+ block_col += bw) {
+ int index = block_row * cm->mi_stride + block_col;
+ mib[index] = mi_upper_left + index;
+ mib[index]->sb_type = bsize;
+ }
+ }
+ } else {
+ // Else this is a partial SB.
+ set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+ mi_cols_remaining, bsize, mib);
+ }
+}
+
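+// Rate-distortion evaluates the partitioning already stored in mib, optionally
+// compares it against PARTITION_NONE and a forced four-way split, keeps the
+// cheapest choice, and encodes the superblock when do_recon is set.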
+static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate, int64_t *dist,
+ int do_recon, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ int i;
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS last_part_rdc, none_rdc, chosen_rdc;
+ BLOCK_SIZE sub_subsize = BLOCK_4X4;
+ int splits_below = 0;
+ BLOCK_SIZE bs_type = mib[0]->sb_type;
+ int do_partition_search = 1;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_invalid_rd_stats(&last_part_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_invalid_rd_stats(&chosen_rdc);
+
+ pc_tree->partitioning = partition;
+
+ xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+ }
+
+ if (do_partition_search &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ cpi->sf.adjust_partitioning_from_last_frame) {
+ // Check if any of the sub blocks are further split.
+ if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+ sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
+ splits_below = 1;
+ for (i = 0; i < 4; i++) {
+ int jj = i >> 1, ii = i & 0x01;
+ MB_MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
+ if (this_mi && this_mi->sb_type >= sub_subsize) {
+ splits_below = 0;
+ }
+ }
+ }
+
+    // If the partition is not PARTITION_NONE, also try PARTITION_NONE unless
+    // each of the 4 sub-blocks is split even further.
+ if (partition != PARTITION_NONE && !splits_below &&
+ mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ pc_tree->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+ PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+
+ if (none_rdc.rate < INT_MAX) {
+ none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ mib[0]->sb_type = bs_type;
+ pc_tree->partitioning = partition;
+ }
+ }
+ for (int b = 0; b < 2; ++b) {
+ pc_tree->horizontal[b].skip_ref_frame_mask = 0;
+ pc_tree->vertical[b].skip_ref_frame_mask = 0;
+ }
+ for (int b = 0; b < 3; ++b) {
+ pc_tree->horizontala[b].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[b].skip_ref_frame_mask = 0;
+ pc_tree->verticala[b].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[b].skip_ref_frame_mask = 0;
+ }
+ for (int b = 0; b < 4; ++b) {
+ pc_tree->horizontal4[b].skip_ref_frame_mask = 0;
+ pc_tree->vertical4[b].skip_ref_frame_mask = 0;
+ }
+ switch (partition) {
+ case PARTITION_NONE:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+ break;
+ case PARTITION_HORZ:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
+ INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + hbs < cm->mi_rows) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+ mi_col, subsize, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+ INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_VERT:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[0],
+ INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + hbs < cm->mi_cols) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+ mi_col, subsize, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+ PARTITION_VERT, subsize,
+ &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_SPLIT:
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ RD_STATS tmp_rdc;
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ av1_init_rd_stats(&tmp_rdc);
+ rd_use_partition(cpi, td, tile_data,
+ mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, i != 3, pc_tree->split[i]);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
+ default: assert(0); break;
+ }
+
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += x->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+ }
+
+ if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+ (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) &&
+ (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) {
+ BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ chosen_rdc.rate = 0;
+ chosen_rdc.dist = 0;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Split partition.
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ RD_STATS tmp_rdc;
+
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &tmp_rdc, PARTITION_SPLIT, split_subsize,
+ &pc_tree->split[i]->none, INT64_MAX);
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&chosen_rdc);
+ break;
+ }
+
+ chosen_rdc.rate += tmp_rdc.rate;
+ chosen_rdc.dist += tmp_rdc.dist;
+
+ if (i != 3)
+ encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+ chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+ }
+ if (chosen_rdc.rate < INT_MAX) {
+ chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
+ }
+ }
+
+  // If last_part is better, set the partitioning to that.
+ if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+ mib[0]->sb_type = bsize;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+ chosen_rdc = last_part_rdc;
+ }
+  // If none was better, set the partitioning to that.
+ if (none_rdc.rdcost < chosen_rdc.rdcost) {
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ chosen_rdc = none_rdc;
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  // We must have chosen a partitioning and encoding, or we'll fail later on.
+  // There are no other opportunities for success.
+ if (bsize == cm->seq_params.sb_size)
+ assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+ if (do_recon) {
+ if (bsize == cm->seq_params.sb_size) {
+      // NOTE: To get an estimate of the rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ x->cb_offset = 0;
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ *rate = chosen_rdc.rate;
+ *dist = chosen_rdc.dist;
+}
+
+/* clang-format off */
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = {
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 64x128, 128x64, 128x128
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16
+};
+
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = {
+ BLOCK_8X8, // 4x4
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8
+ BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16
+ BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 64x128, 128x64, 128x128
+ BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 4x16, 16x4, 8x32
+ BLOCK_32X32, BLOCK_LARGEST, BLOCK_LARGEST, // 32x8, 16x64, 64x16
+};
+
+// Next square block size less than or equal to the current block size.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = {
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16
+ BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32
+ BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64
+ BLOCK_64X64, BLOCK_64X64, BLOCK_128X128, // 64x128, 128x64, 128x128
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16
+};
+/* clang-format on */
+
+// Look at all the mode_info entries for blocks that are part of this
+// partition and find the min and max values for sb_type.
+// At the moment this is designed to work on a superblock but could be
+// adjusted to use a size parameter.
+//
+// The min and max are assumed to have been initialized prior to calling this
+// function, so repeated calls can accumulate a min and max over more than one
+// superblock.
+static void get_sb_partition_size_range(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd, MB_MODE_INFO **mib,
+ BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size) {
+ int i, j;
+ int index = 0;
+
+ // Check the sb_type for each block that belongs to this region.
+ for (i = 0; i < cm->seq_params.mib_size; ++i) {
+ for (j = 0; j < cm->seq_params.mib_size; ++j) {
+ MB_MODE_INFO *mi = mib[index + j];
+ BLOCK_SIZE sb_type = mi ? mi->sb_type : BLOCK_4X4;
+ *min_block_size = AOMMIN(*min_block_size, sb_type);
+ *max_block_size = AOMMAX(*max_block_size, sb_type);
+ }
+ index += xd->mi_stride;
+ }
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+ int top_edge = 0;
+ int bottom_edge = cpi->common.mi_rows;
+ int is_active_h_edge = 0;
+
+  // For two-pass encoding, account for any formatting bars detected.
+ if (cpi->oxcf.pass == 2) {
+ const TWO_PASS *const twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs, not mi units.
+    // The image edge is in the following MB row.
+ top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+ bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+ bottom_edge = AOMMAX(top_edge, bottom_edge);
+ }
+
+ if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+ ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+ is_active_h_edge = 1;
+ }
+ return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+ int left_edge = 0;
+ int right_edge = cpi->common.mi_cols;
+ int is_active_v_edge = 0;
+
+  // For two-pass encoding, account for any formatting bars detected.
+ if (cpi->oxcf.pass == 2) {
+ const TWO_PASS *const twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs, not mi units.
+    // The image edge is in the following MB column.
+ left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+
+ right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+ right_edge = AOMMAX(left_edge, right_edge);
+ }
+
+ if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+ ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+ is_active_v_edge = 1;
+ }
+ return is_active_v_edge;
+}
+
+// Checks to see if a super block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
+ return active_h_edge(cpi, mi_row, cpi->common.seq_params.mib_size) ||
+ active_v_edge(cpi, mi_col, cpi->common.seq_params.mib_size);
+}
+
+// Look at neighboring blocks and set a min and max partition size based on
+// what they chose.
+static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ MB_MODE_INFO **mi = xd->mi;
+ const int left_in_image = xd->left_available && mi[-1];
+ const int above_in_image = xd->up_available && mi[-xd->mi_stride];
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
+ int bh, bw;
+ BLOCK_SIZE min_size = BLOCK_4X4;
+ BLOCK_SIZE max_size = BLOCK_LARGEST;
+
+ // Trap case where we do not have a prediction.
+ if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
+ // Default "min to max" and "max to min"
+ min_size = BLOCK_LARGEST;
+ max_size = BLOCK_4X4;
+
+    // NOTE: each call to get_sb_partition_size_range() uses the previously
+    // passed-in values for min and max as a starting point.
+    // Find the min and max partition sizes used in the previous frame at this
+    // location.
+ if (cm->frame_type != KEY_FRAME) {
+ MB_MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
+ get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
+ }
+ // Find the min and max partition sizes used in the left superblock
+ if (left_in_image) {
+ MB_MODE_INFO **left_sb_mi = &mi[-cm->seq_params.mib_size];
+ get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
+ }
+    // Find the min and max partition sizes used in the above superblock.
+ if (above_in_image) {
+ MB_MODE_INFO **above_sb_mi =
+ &mi[-xd->mi_stride * cm->seq_params.mib_size];
+ get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
+ }
+
+ // Adjust observed min and max for "relaxed" auto partition case.
+ if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+ }
+
+ // Check border cases where max and min from neighbors may not be legal.
+ max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining,
+ &bh, &bw);
+ min_size = AOMMIN(min_size, max_size);
+
+ // Test for blocks at the edge of the active image.
+ // This may be the actual edge of the image or where there are formatting
+ // bars.
+ if (active_edge_sb(cpi, mi_row, mi_col)) {
+ min_size = BLOCK_4X4;
+ } else {
+ min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
+ }
+
+ // When use_square_partition_only is true, make sure at least one square
+ // partition is allowed by selecting the next smaller square size as
+ // *min_block_size.
+ if (min_size >= cpi->sf.use_square_partition_only_threshold) {
+ min_size = AOMMIN(min_size, next_square_size[max_size]);
+ }
+
+ *min_block_size = AOMMIN(min_size, cm->seq_params.sb_size);
+ *max_block_size = AOMMIN(max_size, cm->seq_params.sb_size);
+}
+
+// TODO(jingning) refactor functions setting partition search range
+static void set_partition_range(const AV1_COMMON *const cm,
+ const MACROBLOCKD *const xd, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ BLOCK_SIZE *const min_bs,
+ BLOCK_SIZE *const max_bs) {
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ int idx, idy;
+
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MB_MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str];
+ BLOCK_SIZE min_size = cm->seq_params.sb_size; // default values
+ BLOCK_SIZE max_size = BLOCK_4X4;
+
+ if (prev_mi) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ const MB_MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx];
+ const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
+ min_size = AOMMIN(min_size, bs);
+ max_size = AOMMAX(max_size, bs);
+ }
+ }
+ }
+
+ if (xd->left_available) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ const MB_MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1];
+ const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
+ min_size = AOMMIN(min_size, bs);
+ max_size = AOMMAX(max_size, bs);
+ }
+ }
+
+ if (xd->up_available) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ const MB_MODE_INFO *const mi = xd->mi[idx - cm->mi_stride];
+ const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
+ min_size = AOMMIN(min_size, bs);
+ max_size = AOMMAX(max_size, bs);
+ }
+ }
+
+ if (min_size == max_size) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+
+ *min_bs = AOMMIN(min_size, cm->seq_params.sb_size);
+ *max_bs = AOMMIN(max_size, cm->seq_params.sb_size);
+}
+
+static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+ memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
+}
+
+static INLINE void load_pred_mv(MACROBLOCK *x,
+ const PICK_MODE_CONTEXT *const ctx) {
+ memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
+}
+
+#if CONFIG_FP_MB_STATS
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
+ 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120,
+ // TODO(debargha): What are the correct numbers here?
+ 130, 130, 150
+};
+const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
+ 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120,
+ // TODO(debargha): What are the correct numbers here?
+ 160, 160, 240
+};
+const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6,
+ // TODO(debargha): What are the correct numbers here?
+ 8, 8, 10
+};
+
+typedef enum {
+ MV_ZERO = 0,
+ MV_LEFT = 1,
+ MV_UP = 2,
+ MV_RIGHT = 3,
+ MV_DOWN = 4,
+ MV_INVALID
+} MOTION_DIRECTION;
+
+static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
+ if (fp_byte & FPMB_MOTION_ZERO_MASK) {
+ return MV_ZERO;
+ } else if (fp_byte & FPMB_MOTION_LEFT_MASK) {
+ return MV_LEFT;
+ } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) {
+ return MV_RIGHT;
+ } else if (fp_byte & FPMB_MOTION_UP_MASK) {
+ return MV_UP;
+ } else {
+ return MV_DOWN;
+ }
+}
+
+static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
+ MOTION_DIRECTION that_mv) {
+ if (this_mv == that_mv) {
+ return 0;
+ } else {
+ return abs(this_mv - that_mv) == 2 ? 2 : 1;
+ }
+}
+#endif
+
+// Try searching for an encoding for the given subblock. Returns zero if the
+// rdcost is already too high (to tell the caller not to bother searching for
+// encodings of further subblocks).
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last,
+ int mi_row, int mi_col, BLOCK_SIZE subsize,
+ RD_STATS *best_rdc, RD_STATS *sum_rdc,
+ RD_STATS *this_rdc, PARTITION_TYPE partition,
+ PICK_MODE_CONTEXT *prev_ctx,
+ PICK_MODE_CONTEXT *this_ctx) {
+#define RTS_X_RATE_NOCOEF_ARG
+#define RTS_MAX_RDCOST best_rdc->rdcost
+
+ MACROBLOCK *const x = &td->mb;
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
+
+ const int64_t rdcost_remaining = best_rdc->rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc->rdcost - sum_rdc->rdcost);
+
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
+ RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
+ rdcost_remaining);
+
+ if (this_rdc->rate == INT_MAX) {
+ sum_rdc->rdcost = INT64_MAX;
+ } else {
+ sum_rdc->rate += this_rdc->rate;
+ sum_rdc->dist += this_rdc->dist;
+ sum_rdc->rdcost += this_rdc->rdcost;
+ }
+
+ if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0;
+
+ if (!is_last) {
+ update_state(cpi, tile_data, td, this_ctx, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+ subsize, NULL);
+ }
+
+ return 1;
+
+#undef RTS_X_RATE_NOCOEF_ARG
+#undef RTS_MAX_RDCOST
+}
+
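+// Rate-distortion tests a three-way (AB) partition: the three sub-blocks given
+// by (mi_row0, mi_col0, subsize0) .. (mi_row2, mi_col2, subsize2) are searched
+// in order, and *best_rdc plus pc_tree->partitioning are updated if the
+// combined cost beats the current best.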
+static void rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ PC_TREE *pc_tree, RD_STATS *best_rdc,
+ PICK_MODE_CONTEXT ctxs[3],
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PARTITION_TYPE partition,
+ int mi_row0, int mi_col0, BLOCK_SIZE subsize0,
+ int mi_row1, int mi_col1, BLOCK_SIZE subsize1,
+ int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_STATS sum_rdc, this_rdc;
+#define RTP_STX_TRY_ARGS
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = x->partition_cost[pl][partition];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0,
+ best_rdc, &sum_rdc, &this_rdc,
+ RTP_STX_TRY_ARGS partition, ctx, &ctxs[0]))
+ return;
+
+ if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1,
+ best_rdc, &sum_rdc, &this_rdc,
+ RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1]))
+ return;
+
+ // With the new layout of mixed partitions for PARTITION_HORZ_B and
+ // PARTITION_VERT_B, the last subblock might start past halfway through the
+ // main block, so we might signal it even though the subblock lies strictly
+ // outside the image. In that case, we won't spend any bits coding it and the
+ // difference (obviously) doesn't contribute to the error.
+ const int try_block2 = 1;
+ if (try_block2 &&
+ !rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
+ best_rdc, &sum_rdc, &this_rdc,
+ RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2]))
+ return;
+
+ if (sum_rdc.rdcost >= best_rdc->rdcost) return;
+
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+
+ if (sum_rdc.rdcost >= best_rdc->rdcost) return;
+
+ *best_rdc = sum_rdc;
+ pc_tree->partitioning = partition;
+
+#undef RTP_STX_TRY_ARGS
+}
+
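+// Recursively resets the partitioning decisions in pc_tree to PARTITION_NONE
+// and restores the default full-plane search range.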
+static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+ pc_tree->partitioning = PARTITION_NONE;
+ pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+ pc_tree->none.skip = 0;
+
+ if (bsize >= BLOCK_8X8) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ for (int idx = 0; idx < 4; ++idx)
+ reset_partition(pc_tree->split[idx], subsize);
+ }
+}
+
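+// Evaluates only the square partitions (PARTITION_NONE and PARTITION_SPLIT)
+// for this block and records the results in pc_tree->pc_tree_stats, which the
+// two-pass ML pruning model can use later.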
+static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int64_t best_rd,
+ PC_TREE *pc_tree, int64_t *none_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_step = mi_size_wide[bsize] / 2;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TOKENEXTRA *const tp_orig = *tp;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+ int tmp_partition_cost[PARTITION_TYPES];
+ BLOCK_SIZE subsize;
+ RD_STATS this_rdc, sum_rdc, best_rdc, pn_rdc;
+ const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+ int do_square_split = bsize_at_least_8x8;
+ const int pl = bsize_at_least_8x8
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const int *partition_cost =
+ pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
+ const int num_planes = av1_num_planes(cm);
+
+ int64_t split_rd[4] = { 0, 0, 0, 0 };
+
+ // Override skipping rectangular partition operations for edge blocks
+ const int has_rows = (mi_row + mi_step < cm->mi_rows);
+ const int has_cols = (mi_col + mi_step < cm->mi_cols);
+
+ if (none_rd) *none_rd = 0;
+
+ int partition_none_allowed = has_rows && has_cols;
+
+ (void)*tp_orig;
+ (void)split_rd;
+
+ if (best_rd < 0) {
+ pc_tree->none.rdcost = INT64_MAX;
+ pc_tree->none.skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+ pc_tree->pc_tree_stats.valid = 1;
+
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c)
+ if (!(has_rows && has_cols)) {
+ assert(bsize_at_least_8x8 && pl >= 0);
+ const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
+ for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX;
+ if (has_cols) {
+ // At the bottom, the two possibilities are HORZ and SPLIT
+ aom_cdf_prob bot_cdf[2];
+ partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
+ static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
+ } else if (has_rows) {
+ // At the right, the two possibilities are VERT and SPLIT
+ aom_cdf_prob rhs_cdf[2];
+ partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
+ static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
+ } else {
+ // At the bottom right, we always split
+ tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+
+ partition_cost = tmp_partition_cost;
+ }
+
+#ifndef NDEBUG
+  // Nothing should rely on the default value of this array (which is just
+  // leftover from encoding the previous block), so set it to a fixed pattern
+  // when debugging.
+  // Bits 0, 1, 2 are the blk_skip flags of each plane;
+  // bits 4, 5, 6 are the initialization-checking bits of each plane.
+ memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
+#endif // NDEBUG
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_init_rd_stats(&this_rdc);
+ av1_init_rd_stats(&sum_rdc);
+ av1_invalid_rd_stats(&best_rdc);
+ best_rdc.rdcost = best_rd;
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+ xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+ do_square_split = 0;
+ }
+#endif
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ int pt_cost = 0;
+ if (bsize_at_least_8x8) {
+ pc_tree->partitioning = PARTITION_NONE;
+ pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+ ? partition_cost[PARTITION_NONE]
+ : 0;
+ }
+ int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - partition_rd_cost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+
+ pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
+ pc_tree->pc_tree_stats.skip = ctx_none->skip;
+
+ if (none_rd) *none_rd = this_rdc.rdcost;
+ if (this_rdc.rate != INT_MAX) {
+ if (bsize_at_least_8x8) {
+ this_rdc.rate += pt_cost;
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ }
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ // Adjust dist breakout threshold according to the partition size.
+ const int64_t dist_breakout_thr =
+ cpi->sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+ const int rate_breakout_thr =
+ cpi->sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+
+ best_rdc = this_rdc;
+ if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+ pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+
+ // If all y, u, v transform blocks in this partition are skippable, and
+ // the dist & rate are within the thresholds, the partition search is
+        // terminated for the current branch of the partition search tree.
+ // The dist & rate thresholds are set to 0 at speed 0 to disable the
+ // early termination at that speed.
+ if (!x->e_mbd.lossless[xd->mi[0]->segment_id] &&
+ (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr)) {
+ do_square_split = 0;
+ }
+ }
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // store estimated motion vector
+ if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
+
+ int64_t temp_best_rdcost = best_rdc.rdcost;
+ pn_rdc = best_rdc;
+
+ // PARTITION_SPLIT
+ if (do_square_split) {
+ int reached_last_index = 0;
+ subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ int idx;
+
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) {
+ const int x_idx = (idx & 1) * mi_step;
+ const int y_idx = (idx >> 1) * mi_step;
+
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ pc_tree->split[idx]->index = idx;
+ int64_t *p_split_rd = &split_rd[idx];
+ // TODO(Cherma) : Account for partition cost while passing best rd to
+ // rd_pick_sqr_partition()
+ rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc,
+ temp_best_rdcost - sum_rdc.rdcost,
+ pc_tree->split[idx], p_split_rd);
+
+ pc_tree->pc_tree_stats.sub_block_rdcost[idx] = this_rdc.rdcost;
+ pc_tree->pc_tree_stats.sub_block_skip[idx] =
+ pc_tree->split[idx]->none.skip;
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ break;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+ reached_last_index = (idx == 4);
+
+ if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ }
+
+ int has_split = 0;
+ if (pc_tree->partitioning == PARTITION_SPLIT) {
+ for (int cb_idx = 0; cb_idx <= AOMMIN(idx, 3); ++cb_idx) {
+ if (pc_tree->split[cb_idx]->partitioning == PARTITION_SPLIT)
+ ++has_split;
+ }
+
+ if (has_split >= 3 || sum_rdc.rdcost < (pn_rdc.rdcost >> 1)) {
+ pc_tree->cb_search_range = SPLIT_PLANE;
+ }
+ }
+
+ if (pc_tree->partitioning == PARTITION_NONE) {
+ pc_tree->cb_search_range = SEARCH_SAME_PLANE;
+ if (pn_rdc.dist <= sum_rdc.dist)
+ pc_tree->cb_search_range = NONE_PARTITION_PLANE;
+ }
+
+ if (pn_rdc.rate == INT_MAX) pc_tree->cb_search_range = NONE_PARTITION_PLANE;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ } // if (do_split)
+
+ pc_tree->pc_tree_stats.split = pc_tree->partitioning == PARTITION_SPLIT;
+ if (do_square_split) {
+ for (int i = 0; i < 4; ++i) {
+ pc_tree->pc_tree_stats.sub_block_split[i] =
+ pc_tree->split[i]->partitioning == PARTITION_SPLIT;
+ }
+ }
+
+  // TODO(jbb): This code was added so that we avoid a static analysis
+  // warning related to the fact that best_rd isn't used after this
+ // point. This code should be refactored so that the duplicate
+ // checks occur in some sub function and thus are used...
+ (void)best_rd;
+ *rd_cost = best_rdc;
+
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+ pc_tree->index != 3) {
+ if (bsize == cm->seq_params.sb_size) {
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ if (bsize == cm->seq_params.sb_size) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+
+#define FEATURE_SIZE 19
+static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
+ 2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
+ 0.125296f, -1.134961f, 0.862757f, -0.418799f, -0.637666f,
+ 0.016232f, 0.345013f, 0.018823f, -0.393394f, -1.130700f,
+ 0.695357f, 0.112569f, -0.341975f, -0.513882f, 5.7488966f,
+};
+
+static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
+ 2.990993f, 0.423273f, -0.926544f, 0.454646f, -0.292698f,
+ -1.311632f, -0.284432f, 0.717141f, -0.419257f, -0.574760f,
+ -0.674444f, 0.669047f, -0.374255f, 0.380624f, -0.804036f,
+ 0.264021f, 0.004163f, 1.896802f, 0.924287f, 0.13490619f,
+};
+
+static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
+ 2.795181f, -0.136943f, -0.924842f, 0.405330f, -0.463505f,
+ -0.584076f, -0.831472f, 0.382985f, -0.597544f, -0.138915f,
+ -1.354350f, 0.466035f, -0.553961f, 0.213202f, -1.166429f,
+ 0.010776f, -0.096236f, 2.335084f, 1.699857f, -0.58178353f,
+};
+
+static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
+ 1.987888f, -0.431100f, -1.687703f, 0.262602f, -0.425298f,
+ -0.463870f, -1.493457f, 0.470917f, -0.528457f, -0.087700f,
+ -1.815092f, 0.152883f, -0.337908f, 0.093679f, -1.548267f,
+ -0.042387f, -0.000861f, 2.556746f, 1.619192f, 0.03643292f,
+};
+
+static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
+ 2.188344f, -0.817528f, -2.119219f, 0.000000f, -0.348167f,
+ -0.658074f, -1.960362f, 0.000000f, -0.403080f, 0.282699f,
+ -2.061088f, 0.000000f, -0.431919f, -0.127960f, -1.099550f,
+ 0.000000f, 0.121622f, 2.017455f, 2.058228f, -0.15475988f,
+};
+
+static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
+ -1.006689f, 0.777908f, 4.461072f, -0.395782f, -0.014610f,
+ -0.853863f, 0.729997f, -0.420477f, 0.282429f, -1.194595f,
+ 3.181220f, -0.511416f, 0.117084f, -1.149348f, 1.507990f,
+ -0.477212f, 0.202963f, -1.469581f, 0.624461f, -0.89081228f,
+};
+
+static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
+ -1.241117f, 0.844878f, 5.638803f, -0.489780f, -0.108796f,
+ -4.576821f, 1.540624f, -0.477519f, 0.227791f, -1.443968f,
+ 1.586911f, -0.505125f, 0.140764f, -0.464194f, 1.466658f,
+ -0.641166f, 0.195412f, 1.427905f, 2.080007f, -1.98272777f,
+};
+
+static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
+ -2.130825f, 0.476023f, 5.907343f, -0.516002f, -0.097471f,
+ -2.662754f, 0.614858f, -0.576728f, 0.085261f, -0.031901f,
+ 0.727842f, -0.600034f, 0.079326f, 0.324328f, 0.504502f,
+ -0.547105f, -0.037670f, 0.304995f, 0.369018f, -2.66299987f,
+};
+
+static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
+ -1.626410f, 0.872047f, 5.414965f, -0.554781f, -0.084514f,
+ -3.020550f, 0.467632f, -0.382280f, 0.199568f, 0.426220f,
+ 0.829426f, -0.467100f, 0.153098f, 0.662994f, 0.327545f,
+ -0.560106f, -0.141610f, 0.403372f, 0.523991f, -3.02891231f,
+};
+
+static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
+ -1.463349f, 0.375376f, 4.751430f, 0.000000f, -0.184451f,
+ -1.655447f, 0.443214f, 0.000000f, 0.127961f, 0.152435f,
+ 0.083288f, 0.000000f, 0.143105f, 0.438012f, 0.073238f,
+ 0.000000f, -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
+};
+
+// split_score indicates the confidence of picking the split partition;
+// none_score indicates the confidence of picking the none partition.
+static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
+ BLOCK_SIZE bsize, int *split_score,
+ int *none_score) {
+ if (!pc_tree_stats->valid) return 0;
+ const float *split_weights = NULL;
+ const float *none_weights = NULL;
+ switch (bsize) {
+ case BLOCK_4X4: break;
+ case BLOCK_8X8:
+ split_weights = two_pass_split_partition_weights_8;
+ none_weights = two_pass_none_partition_weights_8;
+ break;
+ case BLOCK_16X16:
+ split_weights = two_pass_split_partition_weights_16;
+ none_weights = two_pass_none_partition_weights_16;
+ break;
+ case BLOCK_32X32:
+ split_weights = two_pass_split_partition_weights_32;
+ none_weights = two_pass_none_partition_weights_32;
+ break;
+ case BLOCK_64X64:
+ split_weights = two_pass_split_partition_weights_64;
+ none_weights = two_pass_none_partition_weights_64;
+ break;
+ case BLOCK_128X128:
+ split_weights = two_pass_split_partition_weights_128;
+ none_weights = two_pass_none_partition_weights_128;
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!split_weights || !none_weights) return 0;
+
+ aom_clear_system_state();
+
+ float features[FEATURE_SIZE];
+ int feature_index = 0;
+ features[feature_index++] = (float)pc_tree_stats->split;
+ features[feature_index++] = (float)pc_tree_stats->skip;
+ const int rdcost = (int)AOMMIN(INT_MAX, pc_tree_stats->rdcost);
+ const int rd_valid = rdcost > 0 && rdcost < 1000000000;
+ features[feature_index++] = (float)rd_valid;
+ for (int i = 0; i < 4; ++i) {
+ features[feature_index++] = (float)pc_tree_stats->sub_block_split[i];
+ features[feature_index++] = (float)pc_tree_stats->sub_block_skip[i];
+ const int sub_rdcost =
+ (int)AOMMIN(INT_MAX, pc_tree_stats->sub_block_rdcost[i]);
+ const int sub_rd_valid = sub_rdcost > 0 && sub_rdcost < 1000000000;
+ features[feature_index++] = (float)sub_rd_valid;
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (rd_valid && sub_rd_valid && sub_rdcost < rdcost)
+ rd_ratio = (float)sub_rdcost / (float)rdcost;
+ features[feature_index++] = rd_ratio;
+ }
+ assert(feature_index == FEATURE_SIZE);
+
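+  // Each model is a simple linear classifier: score = bias + sum(w[i] * f[i]),
+  // with the bias stored as the last element of each weight array.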
+ float score_1 = split_weights[FEATURE_SIZE];
+ float score_2 = none_weights[FEATURE_SIZE];
+ for (int i = 0; i < FEATURE_SIZE; ++i) {
+ score_1 += features[i] * split_weights[i];
+ score_2 += features[i] * none_weights[i];
+ }
+ *split_score = (int)(score_1 * 100);
+ *none_score = (int)(score_2 * 100);
+ return 1;
+}
+#undef FEATURE_SIZE
+
+static void ml_prune_rect_partition(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int64_t best_rd, int64_t none_rd,
+ int64_t *split_rd,
+ int *const dst_prune_horz,
+ int *const dst_prune_vert) {
+ if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+ best_rd = AOMMAX(best_rd, 1);
+ const NN_CONFIG *nn_config = NULL;
+ const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f };
+ float cur_thresh = 0.0f;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_rect_partition_nnconfig_8;
+ cur_thresh = prob_thresholds[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_rect_partition_nnconfig_16;
+ cur_thresh = prob_thresholds[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_rect_partition_nnconfig_32;
+ cur_thresh = prob_thresholds[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_rect_partition_nnconfig_64;
+ cur_thresh = prob_thresholds[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_rect_partition_nnconfig_128;
+ cur_thresh = prob_thresholds[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+ aom_clear_system_state();
+
+ // 1. Compute input features
+ float features[9];
+
+ // RD cost ratios
+ for (int i = 0; i < 5; i++) features[i] = 1.0f;
+ if (none_rd > 0 && none_rd < 1000000000)
+ features[0] = (float)none_rd / (float)best_rd;
+ for (int i = 0; i < 4; i++) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ features[1 + i] = (float)split_rd[i] / (float)best_rd;
+ }
+
+ // Variance ratios
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int whole_block_variance;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ whole_block_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ whole_block_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+ whole_block_variance = AOMMAX(whole_block_variance, 1);
+
+ int split_variance[4];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ struct buf_2d buf;
+ buf.stride = x->plane[0].src.stride;
+ const int bw = block_size_wide[bsize];
+ for (int i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bw / 2;
+ const int y_idx = (i >> 1) * bw / 2;
+ buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ split_variance[i] =
+ av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
+ } else {
+ split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize);
+ }
+ }
+
+ for (int i = 0; i < 4; i++)
+ features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
+
+ // 2. Do the prediction and prune 0-2 partitions based on their probabilities
+ float raw_scores[3] = { 0.0f };
+ av1_nn_predict(features, nn_config, raw_scores);
+ float probs[3] = { 0.0f };
+ av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability that both rectangular partitions are worse
+  // than the current best_rd.
+ if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1;
+ if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1;
+}
+
+// Use an ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
+ int64_t best_rd, int64_t horz_rd[2],
+ int64_t vert_rd[2], int64_t split_rd[4],
+ int *const horza_partition_allowed,
+ int *const horzb_partition_allowed,
+ int *const verta_partition_allowed,
+ int *const vertb_partition_allowed) {
+ if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+ const NN_CONFIG *nn_config = NULL;
+ switch (bsize) {
+ case BLOCK_8X8: nn_config = NULL; break;
+ case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break;
+ case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break;
+ case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break;
+ case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+
+ aom_clear_system_state();
+
+ // Generate features.
+ float features[10];
+ int feature_index = 0;
+ features[feature_index++] = (float)part_ctx;
+ features[feature_index++] = (float)var_ctx;
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };
+ int rd_index = 0;
+ for (int i = 0; i < 2; ++i) {
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 4; ++i) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features[feature_index++] = rd_ratio;
+ }
+ assert(feature_index == 10);
+
+ // Calculate scores using the NN model.
+ float score[16] = { 0.0f };
+ av1_nn_predict(features, nn_config, score);
+ int int_score[16];
+ int max_score = -1000;
+ for (int i = 0; i < 16; ++i) {
+ int_score[i] = (int)(100 * score[i]);
+ max_score = AOMMAX(int_score[i], max_score);
+ }
+
+ // Make decisions based on the model scores.
+ int thresh = max_score;
+ switch (bsize) {
+ case BLOCK_16X16: thresh -= 150; break;
+ case BLOCK_32X32: thresh -= 100; break;
+ default: break;
+ }
+ *horza_partition_allowed = 0;
+ *horzb_partition_allowed = 0;
+ *verta_partition_allowed = 0;
+ *vertb_partition_allowed = 0;
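+  // Each of the 16 labels encodes a 4-bit mask of allowed AB partitions:
+  // bit 0 -> HORZ_A, bit 1 -> HORZ_B, bit 2 -> VERT_A, bit 3 -> VERT_B.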
+ for (int i = 0; i < 16; ++i) {
+ if (int_score[i] >= thresh) {
+ if ((i >> 0) & 1) *horza_partition_allowed = 1;
+ if ((i >> 1) & 1) *horzb_partition_allowed = 1;
+ if ((i >> 2) & 1) *verta_partition_allowed = 1;
+ if ((i >> 3) & 1) *vertb_partition_allowed = 1;
+ }
+ }
+}
+
+#define FEATURES 18
+#define LABELS 4
+// Use an ML model to predict if horz4 and vert4 should be considered.
+static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int part_ctx,
+ int64_t best_rd, int64_t horz_rd[2],
+ int64_t vert_rd[2], int64_t split_rd[4],
+ int *const partition_horz4_allowed,
+ int *const partition_vert4_allowed,
+ unsigned int pb_source_variance, int mi_row,
+ int mi_col) {
+ if (best_rd >= 1000000000) return;
+ const NN_CONFIG *nn_config = NULL;
+ switch (bsize) {
+ case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
+ case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break;
+ case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config) return;
+
+ aom_clear_system_state();
+
+ // Generate features.
+ float features[FEATURES];
+ int feature_index = 0;
+ features[feature_index++] = (float)part_ctx;
+ features[feature_index++] = (float)get_unsigned_bits(pb_source_variance);
+
+ const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+ int sub_block_rdcost[8] = { 0 };
+ int rd_index = 0;
+ for (int i = 0; i < 2; ++i) {
+ if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)horz_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)vert_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 4; ++i) {
+ if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+ sub_block_rdcost[rd_index] = (int)split_rd[i];
+ ++rd_index;
+ }
+ for (int i = 0; i < 8; ++i) {
+ // Ratio between the sub-block RD and the whole-block RD.
+ float rd_ratio = 1.0f;
+ if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+ rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+ features[feature_index++] = rd_ratio;
+ }
+
+ // Get variance of the 1:4 and 4:1 sub-blocks.
+ unsigned int horz_4_source_var[4] = { 0 };
+ unsigned int vert_4_source_var[4] = { 0 };
+ {
+ BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+ av1_num_planes(&cpi->common));
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *src = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ for (int i = 0; i < 4; ++i) {
+ const uint8_t *horz_src =
+ src + i * block_size_high[horz_4_bs] * src_stride;
+ const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs];
+ unsigned int horz_var, vert_var, sse;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (xd->bd) {
+ case 10:
+ horz_var = cpi->fn_ptr[horz_4_bs].vf(
+ horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10),
+ 0, &sse);
+ vert_var = cpi->fn_ptr[vert_4_bs].vf(
+ vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10),
+ 0, &sse);
+ break;
+ case 12:
+ horz_var = cpi->fn_ptr[horz_4_bs].vf(
+ horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12),
+ 0, &sse);
+ vert_var = cpi->fn_ptr[vert_4_bs].vf(
+ vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12),
+ 0, &sse);
+ break;
+ case 8:
+ default:
+ horz_var = cpi->fn_ptr[horz_4_bs].vf(
+ horz_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8),
+ 0, &sse);
+ vert_var = cpi->fn_ptr[vert_4_bs].vf(
+ vert_src, src_stride, CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8),
+ 0, &sse);
+ break;
+ }
+ horz_4_source_var[i] =
+ ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
+ vert_4_source_var[i] =
+ ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
+ } else {
+ horz_var = cpi->fn_ptr[horz_4_bs].vf(horz_src, src_stride, AV1_VAR_OFFS,
+ 0, &sse);
+ vert_var = cpi->fn_ptr[vert_4_bs].vf(vert_src, src_stride, AV1_VAR_OFFS,
+ 0, &sse);
+ horz_4_source_var[i] =
+ ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
+ vert_4_source_var[i] =
+ ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
+ }
+ }
+ }
+
+ const float denom = (float)(pb_source_variance + 1);
+ const float low_b = 0.1f;
+ const float high_b = 10.0f;
+ for (int i = 0; i < 4; ++i) {
+ // Ratio between the 4:1 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features[feature_index++] = var_ratio;
+ }
+ for (int i = 0; i < 4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+ float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+ if (var_ratio < low_b) var_ratio = low_b;
+ if (var_ratio > high_b) var_ratio = high_b;
+ features[feature_index++] = var_ratio;
+ }
+ assert(feature_index == FEATURES);
+
+ // Calculate scores using the NN model.
+ float score[LABELS] = { 0.0f };
+ av1_nn_predict(features, nn_config, score);
+ int int_score[LABELS];
+ int max_score = -1000;
+ for (int i = 0; i < LABELS; ++i) {
+ int_score[i] = (int)(100 * score[i]);
+ max_score = AOMMAX(int_score[i], max_score);
+ }
+
+ // Make decisions based on the model scores.
+ int thresh = max_score;
+ switch (bsize) {
+ case BLOCK_16X16: thresh -= 500; break;
+ case BLOCK_32X32: thresh -= 500; break;
+ case BLOCK_64X64: thresh -= 200; break;
+ default: break;
+ }
+ *partition_horz4_allowed = 0;
+ *partition_vert4_allowed = 0;
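+  // Each label encodes a 2-bit mask: bit 0 allows PARTITION_HORZ_4 and
+  // bit 1 allows PARTITION_VERT_4.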
+ for (int i = 0; i < LABELS; ++i) {
+ if (int_score[i] >= thresh) {
+ if ((i >> 0) & 1) *partition_horz4_allowed = 1;
+ if ((i >> 1) & 1) *partition_vert4_allowed = 1;
+ }
+ }
+}
+#undef FEATURES
+#undef LABELS
+
+#define FEATURES 4
+// ML-based partition search breakout.
+static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ unsigned int pb_source_variance) {
+ const NN_CONFIG *nn_config = NULL;
+ int thresh = 0;
+ switch (bsize) {
+ case BLOCK_8X8:
+ nn_config = &av1_partition_breakout_nnconfig_8;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[0];
+ break;
+ case BLOCK_16X16:
+ nn_config = &av1_partition_breakout_nnconfig_16;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &av1_partition_breakout_nnconfig_32;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &av1_partition_breakout_nnconfig_64;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[3];
+ break;
+ case BLOCK_128X128:
+ nn_config = &av1_partition_breakout_nnconfig_128;
+ thresh = cpi->sf.ml_partition_search_breakout_thresh[4];
+ break;
+ default: assert(0 && "Unexpected bsize.");
+ }
+ if (!nn_config || thresh < 0) return 0;
+
+ // Generate feature values.
+ float features[FEATURES];
+ int feature_index = 0;
+ aom_clear_system_state();
+
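+ // Features, in order: rate normalized by rdmult and block size, per-pixel
+ // distortion, the source variance of the block, and the squared dc
+ // quantizer.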
+ const int num_pels_log2 = num_pels_log2_lookup[bsize];
+ float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+ rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+ rate_f;
+ features[feature_index++] = rate_f;
+
+ const float dist_f =
+ (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+ features[feature_index++] = dist_f;
+
+ features[feature_index++] = (float)pb_source_variance;
+
+ const int dc_q = (int)x->plane[0].dequant_QTX[0];
+ features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
+ assert(feature_index == FEATURES);
+
+ // Calculate score using the NN model.
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, &score);
+
+ // Make decision.
+ return (int)(score * 100) >= thresh;
+}
+#undef FEATURES
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previous rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int64_t best_rd,
+ PC_TREE *pc_tree, int64_t *none_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_step = mi_size_wide[bsize] / 2;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TOKENEXTRA *const tp_orig = *tp;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+ int tmp_partition_cost[PARTITION_TYPES];
+ BLOCK_SIZE subsize;
+ RD_STATS this_rdc, sum_rdc, best_rdc;
+ const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+ int do_square_split = bsize_at_least_8x8;
+ const int pl = bsize_at_least_8x8
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const int *partition_cost =
+ pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
+
+ int do_rectangular_split = 1;
+ int64_t cur_none_rd = 0;
+ int64_t split_rd[4] = { 0, 0, 0, 0 };
+ int64_t horz_rd[2] = { 0, 0 };
+ int64_t vert_rd[2] = { 0, 0 };
+
+ int split_ctx_is_ready[2] = { 0, 0 };
+ int horz_ctx_is_ready = 0;
+ int vert_ctx_is_ready = 0;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+
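+ // A negative best_rd means there is no rd budget left for this block:
+ // invalidate the output stats and return without searching.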
+ if (best_rd < 0) {
+ pc_tree->none.rdcost = INT64_MAX;
+ pc_tree->none.skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+ if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
+
+ // Override skipping rectangular partition operations for edge blocks
+ const int has_rows = (mi_row + mi_step < cm->mi_rows);
+ const int has_cols = (mi_col + mi_step < cm->mi_cols);
+ const int xss = x->e_mbd.plane[1].subsampling_x;
+ const int yss = x->e_mbd.plane[1].subsampling_y;
+
+ BLOCK_SIZE min_size = x->min_partition_size;
+ BLOCK_SIZE max_size = x->max_partition_size;
+
+ if (none_rd) *none_rd = 0;
+
+#if CONFIG_FP_MB_STATS
+ unsigned int src_diff_var = UINT_MAX;
+ int none_complexity = 0;
+#endif
+
+ int partition_none_allowed = has_rows && has_cols;
+ int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+ int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+
+ (void)*tp_orig;
+
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c)
+ if (!(has_rows && has_cols)) {
+ assert(bsize_at_least_8x8 && pl >= 0);
+ const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
+ for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX;
+ if (has_cols) {
+ // At the bottom, the two possibilities are HORZ and SPLIT
+ aom_cdf_prob bot_cdf[2];
+ partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
+ static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
+ } else if (has_rows) {
+ // At the right, the two possibilities are VERT and SPLIT
+ aom_cdf_prob rhs_cdf[2];
+ partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
+ static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
+ } else {
+ // At the bottom right, we always split
+ tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+
+ partition_cost = tmp_partition_cost;
+ }
+
+#ifndef NDEBUG
+ // Nothing should rely on the default value of this array (which is just
+ // leftover from encoding the previous block). Set it to a fixed pattern
+ // when debugging.
+ // bit 0, 1, 2 are blk_skip of each plane
+ // bit 4, 5, 6 are initialization checking of each plane
+ memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
+#endif // NDEBUG
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_init_rd_stats(&this_rdc);
+ av1_invalid_rd_stats(&best_rdc);
+ best_rdc.rdcost = best_rd;
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+ if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+ const int cb_partition_search_ctrl =
+ ((pc_tree->index == 0 || pc_tree->index == 3) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1;
+
+ if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+ set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+ }
+
+ // Determine partition types in search according to the speed features.
+ // The threshold set here has to be of square block size.
+ if (cpi->sf.auto_min_max_partition_size) {
+ const int no_partition_allowed = (bsize <= max_size && bsize >= min_size);
+ // Note: Further partitioning is NOT allowed when bsize == min_size already.
+ const int partition_allowed = (bsize <= max_size && bsize > min_size);
+ partition_none_allowed &= no_partition_allowed;
+ partition_horz_allowed &= partition_allowed || !has_rows;
+ partition_vert_allowed &= partition_allowed || !has_cols;
+ do_square_split &= bsize > min_size;
+ }
+
+ if (bsize > cpi->sf.use_square_partition_only_threshold) {
+ partition_horz_allowed &= !has_rows;
+ partition_vert_allowed &= !has_cols;
+ }
+
+ if (bsize > BLOCK_4X4 && x->use_cb_search_range &&
+ cpi->sf.auto_min_max_partition_size == 0) {
+ int split_score = 0;
+ int none_score = 0;
+ const int score_valid = ml_prune_2pass_split_partition(
+ &pc_tree->pc_tree_stats, bsize, &split_score, &none_score);
+ if (score_valid) {
+ {
+ const int only_split_thresh = 300;
+ const int no_none_thresh = 250;
+ const int no_split_thresh = 0;
+ if (split_score > only_split_thresh) {
+ partition_none_allowed = 0;
+ partition_horz_allowed = 0;
+ partition_vert_allowed = 0;
+ } else if (split_score > no_none_thresh) {
+ partition_none_allowed = 0;
+ }
+ if (split_score < no_split_thresh) do_square_split = 0;
+ }
+ {
+ const int no_split_thresh = 120;
+ const int no_none_thresh = -120;
+ if (none_score > no_split_thresh && partition_none_allowed)
+ do_square_split = 0;
+ if (none_score < no_none_thresh) partition_none_allowed = 0;
+ }
+ } else {
+ if (pc_tree->cb_search_range == SPLIT_PLANE) {
+ partition_none_allowed = 0;
+ partition_horz_allowed = 0;
+ partition_vert_allowed = 0;
+ }
+ if (pc_tree->cb_search_range == SEARCH_SAME_PLANE) do_square_split = 0;
+ if (pc_tree->cb_search_range == NONE_PARTITION_PLANE) {
+ do_square_split = 0;
+ partition_horz_allowed = 0;
+ partition_vert_allowed = 0;
+ }
+ }
+
+ // Fall back to default values in case all partition modes are rejected.
+ if (partition_none_allowed == 0 && do_square_split == 0 &&
+ partition_horz_allowed == 0 && partition_vert_allowed == 0) {
+ do_square_split = bsize_at_least_8x8;
+ partition_none_allowed = has_rows && has_cols;
+ partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+ partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+ }
+ }
+
+ xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
+ mi_col, bsize);
+ }
+
+ // Decide whether we shall split directly and skip searching NONE by using
+ // the first pass block statistics
+ if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split &&
+ partition_none_allowed && src_diff_var > 4 &&
+ cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ // Compute a complexity measure: basically, measure the inconsistency of
+ // the motion vectors obtained from the first pass in the current block.
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+
+ MOTION_DIRECTION this_mv;
+ MOTION_DIRECTION right_mv;
+ MOTION_DIRECTION bottom_mv;
+
+ this_mv =
+ get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
+
+ // to its right
+ if (c != mb_col_end - 1) {
+ right_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + 1]);
+ none_complexity += get_motion_inconsistency(this_mv, right_mv);
+ }
+
+ // to its bottom
+ if (r != mb_row_end - 1) {
+ bottom_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
+ none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
+ }
+
+ // do not count its left and top neighbors to avoid double counting
+ }
+ }
+
+ if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
+ partition_none_allowed = 0;
+ }
+ }
+#endif
+
+ // Ref frames picked in the i-th quarter sub-block during the square
+ // partition RD search. They may be used to prune the ref frame selection
+ // of the rectangular partitions.
+ int ref_frames_used[4] = {
+ 0,
+ };
+
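+ // The partition search restarts from the label below with relaxed
+ // partition restrictions if the first attempt fails to find any valid
+ // partition (see must_find_valid_partition below).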
+BEGIN_PARTITION_SEARCH:
+ if (x->must_find_valid_partition) {
+ partition_none_allowed = has_rows && has_cols;
+ partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+ partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+ }
+
+ // Partition block source pixel variance.
+ unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
+ if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0;
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+ do_square_split = 0;
+ }
+#endif
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ int pt_cost = 0;
+ if (bsize_at_least_8x8) {
+ pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+ ? partition_cost[PARTITION_NONE]
+ : 0;
+ }
+ int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+ int64_t best_remain_rdcost = (best_rdc.rdcost == INT64_MAX)
+ ? INT64_MAX
+ : (best_rdc.rdcost - partition_rd_cost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+ pb_source_variance = x->source_variance;
+ if (none_rd) *none_rd = this_rdc.rdcost;
+ cur_none_rd = this_rdc.rdcost;
+ if (this_rdc.rate != INT_MAX) {
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
+ for (int i = 0; i < 4; ++i) {
+ ref_frames_used[i] |= (1 << ref_type);
+ }
+ }
+ if (bsize_at_least_8x8) {
+ this_rdc.rate += pt_cost;
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ }
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ // Adjust dist breakout threshold according to the partition size.
+ const int64_t dist_breakout_thr =
+ cpi->sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+ const int rate_breakout_thr =
+ cpi->sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+
+ best_rdc = this_rdc;
+ if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+ if ((do_square_split || do_rectangular_split) &&
+ !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+ const int use_ml_based_breakout =
+ bsize <= cpi->sf.use_square_partition_only_threshold &&
+ bsize > BLOCK_4X4 && xd->bd == 8;
+ if (use_ml_based_breakout) {
+ if (ml_predict_breakout(cpi, bsize, x, &this_rdc,
+ pb_source_variance)) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+
+ // If all y, u, v transform blocks in this partition are skippable,
+ // and the dist & rate are within the thresholds, the partition
+ // search is terminated for the current branch of the partition search
+ // tree. The dist & rate thresholds are set to 0 at speed 0 to
+ // disable the early termination at that speed.
+ if (best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+
+#if CONFIG_FP_MB_STATS
+ // Check whether every 16x16 first-pass block has zero motion and a
+ // sufficiently small first-pass residue. If so, check the difference
+ // variance between the current frame and the last frame; if that
+ // variance is small enough, stop further splitting in the RD
+ // optimization.
+ if (cpi->use_fp_mb_stats && do_square_split &&
+ cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ int skip = 1;
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+ if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_MOTION_ZERO_MASK) ||
+ !(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_ERROR_SMALL_MASK)) {
+ skip = 0;
+ break;
+ }
+ }
+ if (skip == 0) {
+ break;
+ }
+ }
+ if (skip) {
+ if (src_diff_var == UINT_MAX) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(
+ cpi, &x->plane[0].src, mi_row, mi_col, bsize);
+ }
+ if (src_diff_var < 8) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+ }
+#endif
+ }
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // store estimated motion vector
+ if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
+
+ // PARTITION_SPLIT
+ if (do_square_split) {
+ av1_init_rd_stats(&sum_rdc);
+ subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ sum_rdc.rate = partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ int idx;
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
+ const int x_idx = (idx & 1) * mi_step;
+ const int y_idx = (idx >> 1) * mi_step;
+
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ pc_tree->split[idx]->index = idx;
+ int64_t *p_split_rd = &split_rd[idx];
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ if (cpi->sf.prune_ref_frame_for_rect_partitions)
+ pc_tree->split[idx]->none.rate = INT_MAX;
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+ subsize, &this_rdc, best_remain_rdcost,
+ pc_tree->split[idx], p_split_rd);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ break;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+ pc_tree->split[idx]->none.rate != INT_MAX) {
+ const int ref_type =
+ av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame);
+ ref_frames_used[idx] |= (1 << ref_type);
+ }
+ if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+ pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+ const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
+ }
+ }
+ }
+ }
+ const int reached_last_index = (idx == 4);
+
+ if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ } else if (cpi->sf.less_rectangular_check_level > 0) {
+ // Skip the rectangular partition test when a larger block size
+ // gives a better rd cost.
+ if (cpi->sf.less_rectangular_check_level == 2 || idx <= 2)
+ do_rectangular_split &= !partition_none_allowed;
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ } // if (do_square_split)
+
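+ // Reset the rectangular-partition ref masks; if ref pruning is enabled,
+ // restrict each rectangular sub-block to the reference frames picked by
+ // the corresponding square sub-blocks above.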
+ pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+ pc_tree->vertical[0].skip_ref_frame_mask = 0;
+ pc_tree->vertical[1].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames;
+ }
+
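+ // Optionally let the ML model prune the HORZ/VERT searches based on the
+ // NONE and SPLIT rd costs gathered above.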
+ int prune_horz = 0;
+ int prune_vert = 0;
+ if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+ (partition_horz_allowed || partition_vert_allowed)) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+ ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
+ split_rd, &prune_horz, &prune_vert);
+ }
+
+ // PARTITION_HORZ
+ if (partition_horz_allowed && !prune_horz &&
+ (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+ av1_init_rd_stats(&sum_rdc);
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed) {
+ pc_tree->horizontal[0].pred_interp_filter =
+ av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
+ }
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ sum_rdc.rate = partition_cost[PARTITION_HORZ];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
+ best_remain_rdcost);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ horz_rd[0] = this_rdc.rdcost;
+
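+ // Search the bottom half only if it lies inside the frame and the top
+ // half stayed within the rd budget; the top half is encoded first (dry
+ // run) so that its reconstruction is available as context.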
+ if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+ const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
+ }
+ update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+ subsize, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
+
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed) {
+ pc_tree->horizontal[1].pred_interp_filter =
+ av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
+ }
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+ best_rdc.rdcost - sum_rdc.rdcost);
+ horz_rd[1] = this_rdc.rdcost;
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_HORZ;
+ }
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // PARTITION_VERT
+ if (partition_vert_allowed && !prune_vert &&
+ (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) {
+ av1_init_rd_stats(&sum_rdc);
+ subsize = get_partition_subsize(bsize, PARTITION_VERT);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed) {
+ pc_tree->vertical[0].pred_interp_filter =
+ av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
+ }
+ sum_rdc.rate = partition_cost[PARTITION_VERT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[0],
+ best_remain_rdcost);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ vert_rd[0] = this_rdc.rdcost;
+ const int64_t vert_max_rdcost = best_rdc.rdcost;
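+ // Likewise, search the right half only if it lies inside the frame and
+ // the left half stayed within the rd budget.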
+ if (sum_rdc.rdcost < vert_max_rdcost && has_cols) {
+ const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
+ }
+ update_state(cpi, tile_data, td, &pc_tree->vertical[0], mi_row, mi_col,
+ subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+ subsize, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed) {
+ pc_tree->vertical[1].pred_interp_filter =
+ av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
+ }
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[1],
+ best_rdc.rdcost - sum_rdc.rdcost);
+ vert_rd[1] = this_rdc.rdcost;
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_VERT;
+ }
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
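+ // If PARTITION_NONE was skipped, pb_source_variance has not been set
+ // yet; compute it here since the pruning logic below depends on it.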
+ if (pb_source_variance == UINT_MAX) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ pb_source_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ pb_source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+ }
+
+ const int ext_partition_allowed =
+ do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
+
+ // The standard AB partitions are allowed whenever ext-partition-types are
+ // allowed
+ int horzab_partition_allowed = ext_partition_allowed;
+ int vertab_partition_allowed = ext_partition_allowed;
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) {
+ horzab_partition_allowed = 0;
+ vertab_partition_allowed = 0;
+ }
+ }
+#endif
+
+ if (cpi->sf.prune_ext_partition_types_search_level) {
+ if (cpi->sf.prune_ext_partition_types_search_level == 1) {
+ // TODO(debargha,huisu@google.com): may need to tune the threshold for
+ // pb_source_variance.
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ (pc_tree->partitioning == PARTITION_NONE &&
+ pb_source_variance < 32) ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ (pc_tree->partitioning == PARTITION_NONE &&
+ pb_source_variance < 32) ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ } else {
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ }
+ horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
+ horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
+ vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
+ vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
+ split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
+ split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
+ split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
+ split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+ }
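+ // Gate HORZ_A/HORZ_B by approximating each one's rd cost from the HORZ
+ // and SPLIT sub-block rd costs and requiring the slightly discounted
+ // estimate (14/16 or 15/16) to beat the best rd so far.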
+ int horza_partition_allowed = horzab_partition_allowed;
+ int horzb_partition_allowed = horzab_partition_allowed;
+ if (cpi->sf.prune_ext_partition_types_search_level) {
+ const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
+ const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
+ switch (cpi->sf.prune_ext_partition_types_search_level) {
+ case 1:
+ horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost);
+ horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost);
+ break;
+ case 2:
+ default:
+ horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost);
+ horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost);
+ break;
+ }
+ }
+
+ int verta_partition_allowed = vertab_partition_allowed;
+ int vertb_partition_allowed = vertab_partition_allowed;
+ if (cpi->sf.prune_ext_partition_types_search_level) {
+ const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
+ const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
+ switch (cpi->sf.prune_ext_partition_types_search_level) {
+ case 1:
+ verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost);
+ vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost);
+ break;
+ case 2:
+ default:
+ verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost);
+ vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost);
+ break;
+ }
+ }
+
+ if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
+ partition_horz_allowed && partition_vert_allowed) {
+ // TODO(huisu@google.com): x->source_variance may not be the current block's
+ // variance. The correct one to use is pb_source_variance.
+ // Need to re-train the model to fix it.
+ ml_prune_ab_partition(bsize, pc_tree->partitioning,
+ get_unsigned_bits(x->source_variance),
+ best_rdc.rdcost, horz_rd, vert_rd, split_rd,
+ &horza_partition_allowed, &horzb_partition_allowed,
+ &verta_partition_allowed, &vertb_partition_allowed);
+ }
+
+ // PARTITION_HORZ_A
+ if (partition_horz_allowed && horza_partition_allowed) {
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ_A);
+ pc_tree->horizontala[0].rd_mode_is_ready = 0;
+ pc_tree->horizontala[1].rd_mode_is_ready = 0;
+ pc_tree->horizontala[2].rd_mode_is_ready = 0;
+ if (split_ctx_is_ready[0]) {
+ av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none);
+ pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A;
+ pc_tree->horizontala[0].rd_mode_is_ready = 1;
+ if (split_ctx_is_ready[1]) {
+ av1_copy_tree_context(&pc_tree->horizontala[1],
+ &pc_tree->split[1]->none);
+ pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A;
+ pc_tree->horizontala[1].rd_mode_is_ready = 1;
+ }
+ }
+ pc_tree->horizontala[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontala[1].skip_ref_frame_mask = 0;
+ pc_tree->horizontala[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0];
+ if (used_frames)
+ pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1];
+ if (used_frames)
+ pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames)
+ pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
+ }
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
+ mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
+ subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+ // PARTITION_HORZ_B
+ if (partition_horz_allowed && horzb_partition_allowed) {
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ_B);
+ pc_tree->horizontalb[0].rd_mode_is_ready = 0;
+ pc_tree->horizontalb[1].rd_mode_is_ready = 0;
+ pc_tree->horizontalb[2].rd_mode_is_ready = 0;
+ if (horz_ctx_is_ready) {
+ av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]);
+ pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
+ pc_tree->horizontalb[0].rd_mode_is_ready = 1;
+ }
+ pc_tree->horizontalb[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[1].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames)
+ pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2];
+ if (used_frames)
+ pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[3];
+ if (used_frames)
+ pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
+ }
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_B, mi_row, mi_col, subsize,
+ mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
+ mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // PARTITION_VERT_A
+ if (partition_vert_allowed && verta_partition_allowed) {
+ subsize = get_partition_subsize(bsize, PARTITION_VERT_A);
+ pc_tree->verticala[0].rd_mode_is_ready = 0;
+ pc_tree->verticala[1].rd_mode_is_ready = 0;
+ pc_tree->verticala[2].rd_mode_is_ready = 0;
+ if (split_ctx_is_ready[0]) {
+ av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none);
+ pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
+ pc_tree->verticala[0].rd_mode_is_ready = 1;
+ }
+ pc_tree->verticala[0].skip_ref_frame_mask = 0;
+ pc_tree->verticala[1].skip_ref_frame_mask = 0;
+ pc_tree->verticala[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0];
+ if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2];
+ if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
+ }
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_A, mi_row, mi_col, bsize2,
+ mi_row + mi_step, mi_col, bsize2, mi_row,
+ mi_col + mi_step, subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+ // PARTITION_VERT_B
+ if (partition_vert_allowed && vertb_partition_allowed) {
+ subsize = get_partition_subsize(bsize, PARTITION_VERT_B);
+ pc_tree->verticalb[0].rd_mode_is_ready = 0;
+ pc_tree->verticalb[1].rd_mode_is_ready = 0;
+ pc_tree->verticalb[2].rd_mode_is_ready = 0;
+ if (vert_ctx_is_ready) {
+ av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]);
+ pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
+ pc_tree->verticalb[0].rd_mode_is_ready = 1;
+ }
+ pc_tree->verticalb[0].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[1].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1];
+ if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[3];
+ if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
+ }
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
+ mi_col + mi_step, bsize2, mi_row + mi_step,
+ mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+ // PARTITION_VERT_4 for this block. This is almost the same as
+ // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
+ // so we require that bsize is not BLOCK_128X128.
+ const int partition4_allowed =
+ ext_partition_allowed && bsize != BLOCK_128X128;
+ int partition_horz4_allowed = partition4_allowed && partition_horz_allowed;
+ int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
+ if (cpi->sf.prune_ext_partition_types_search_level == 2) {
+ partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_HORZ_A ||
+ pc_tree->partitioning == PARTITION_HORZ_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_VERT_A ||
+ pc_tree->partitioning == PARTITION_VERT_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ }
+ if (cpi->sf.ml_prune_4_partition && partition4_allowed &&
+ partition_horz_allowed && partition_vert_allowed) {
+ ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost,
+ horz_rd, vert_rd, split_rd, &partition_horz4_allowed,
+ &partition_vert4_allowed, pb_source_variance, mi_row,
+ mi_col);
+ }
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) {
+ partition_horz4_allowed = 0;
+ partition_vert4_allowed = 0;
+ }
+ }
+#endif
+
+ // PARTITION_HORZ_4
+ if (partition_horz4_allowed && has_rows &&
+ (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+ av1_init_rd_stats(&sum_rdc);
+ const int quarter_step = mi_size_high[bsize] / 4;
+ PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ for (int i = 0; i < 4; ++i) {
+ const int this_mi_row = mi_row + i * quarter_step;
+
+ if (i > 0 && this_mi_row >= cm->mi_rows) break;
+
+ PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
+
+ ctx_this->rd_mode_is_ready = 0;
+ ctx_this->skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int used_frames = i <= 1
+ ? (ref_frames_used[0] | ref_frames_used[1])
+ : (ref_frames_used[2] | ref_frames_used[3]);
+ if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+ }
+ if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
+ mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
+ PARTITION_HORZ_4, ctx_prev, ctx_this))
+ break;
+
+ ctx_prev = ctx_this;
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_HORZ_4;
+ }
+ }
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // PARTITION_VERT_4
+ if (partition_vert4_allowed && has_cols &&
+ (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) {
+ av1_init_rd_stats(&sum_rdc);
+ const int quarter_step = mi_size_wide[bsize] / 4;
+ PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+ subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
+ sum_rdc.rate = partition_cost[PARTITION_VERT_4];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ for (int i = 0; i < 4; ++i) {
+ const int this_mi_col = mi_col + i * quarter_step;
+
+ if (i > 0 && this_mi_col >= cm->mi_cols) break;
+
+ PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
+
+ ctx_this->rd_mode_is_ready = 0;
+ ctx_this->skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int used_frames = i <= 1
+ ? (ref_frames_used[0] | ref_frames_used[2])
+ : (ref_frames_used[1] | ref_frames_used[3]);
+ if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+ }
+ if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
+ this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
+ PARTITION_VERT_4, ctx_prev, ctx_this))
+ break;
+
+ ctx_prev = ctx_this;
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_VERT_4;
+ }
+ }
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ if (bsize == cm->seq_params.sb_size && best_rdc.rate == INT_MAX) {
+ // No valid partition was found; go back and search again with fewer
+ // constraints on which partition types to search.
+ x->must_find_valid_partition = 1;
+ goto BEGIN_PARTITION_SEARCH;
+ }
+
+ // TODO(jbb): This code was added so that we avoid a static analysis
+ // warning about best_rd not being used after this point. It should be
+ // refactored so that the duplicate checks occur in some sub-function and
+ // are thus actually used...
+ (void)best_rd;
+ *rd_cost = best_rdc;
+
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+ pc_tree->index != 3) {
+ if (bsize == cm->seq_params.sb_size) {
+ x->cb_offset = 0;
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ if (bsize == cm->seq_params.sb_size) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+
+// Set all the counters to their maximum values.
+static void init_first_partition_pass_stats_tables(
+ FIRST_PARTITION_PASS_STATS *stats) {
+ for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+ memset(stats[i].ref0_counts, 0xff, sizeof(stats[i].ref0_counts));
+ memset(stats[i].ref1_counts, 0xff, sizeof(stats[i].ref1_counts));
+ stats[i].sample_counts = INT_MAX;
+ }
+}
+
+// Recursively clear pc_tree_stats.
+static INLINE void clear_pc_tree_stats(PC_TREE *pt) {
+ if (pt == NULL) return;
+ pt->pc_tree_stats.valid = 0;
+ for (int i = 0; i < 4; ++i) {
+ clear_pc_tree_stats(pt->split[i]);
+ }
+}
+
+// Minimum number of samples to trigger the
+// mode_pruning_based_on_two_pass_partition_search feature.
+#define FIRST_PARTITION_PASS_MIN_SAMPLES 16
+
+static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TOKENEXTRA **tp) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const int leaf_nodes = 256;
+
+ // Initialize the left context for the new SB row
+ av1_zero_left_context(xd);
+
+ // Reset delta for every tile
+ if (mi_row == tile_info->mi_row_start) {
+ if (cm->delta_q_present_flag) xd->current_qindex = cm->base_qindex;
+ if (cm->delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ PC_TREE *const pc_root =
+ td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
+ // Code each SB in the row
+ for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+ mi_col += cm->seq_params.mib_size) {
+ av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
+ av1_fill_mode_rates(cm, x, xd->tile_ctx);
+
+ if (sf->adaptive_pred_interp_filter) {
+ for (int i = 0; i < leaf_nodes; ++i) {
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
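+ // Reset the per-superblock rd record caches (mode and transform-block
+ // rd hashes) and the predicted motion vectors.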
+ x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+
+ av1_zero(x->txb_rd_record_8X8);
+ av1_zero(x->txb_rd_record_16X16);
+ av1_zero(x->txb_rd_record_32X32);
+ av1_zero(x->txb_rd_record_64X64);
+ av1_zero(x->txb_rd_record_intra);
+
+ av1_zero(x->pred_mv);
+ pc_root->index = 0;
+
+ const struct segmentation *const seg = &cm->seg;
+ int seg_skip = 0;
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ const int segment_id =
+ map ? get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col)
+ : 0;
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+ xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
+
+ x->sb_energy_level = 0;
+ if (cm->delta_q_present_flag) {
+ // Delta-q modulation based on variance
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+
+ int offset_qindex;
+ if (DELTAQ_MODULATION == 1) {
+ const int block_wavelet_energy_level =
+ av1_block_wavelet_energy_level(cpi, x, cm->seq_params.sb_size);
+ x->sb_energy_level = block_wavelet_energy_level;
+ offset_qindex = av1_compute_deltaq_from_energy_level(
+ cpi, block_wavelet_energy_level);
+ } else {
+ const int block_var_level =
+ av1_log_block_var(cpi, x, cm->seq_params.sb_size);
+ x->sb_energy_level = block_var_level;
+ offset_qindex =
+ av1_compute_deltaq_from_energy_level(cpi, block_var_level);
+ }
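+ // Clamp the modulated qindex, then round its offset from base_qindex to
+ // a multiple of delta_q_res.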
+ const int qmask = ~(cm->delta_q_res - 1);
+ int current_qindex = clamp(cm->base_qindex + offset_qindex,
+ cm->delta_q_res, 256 - cm->delta_q_res);
+ current_qindex =
+ ((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) +
+ cm->base_qindex;
+ assert(current_qindex > 0);
+
+ xd->delta_qindex = current_qindex - cm->base_qindex;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
+ if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
+ const int lfmask = ~(cm->delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((offset_qindex / 2 + cm->delta_lf_res / 2) & lfmask);
+
+ // Pre-set the loop-filter delta. Note that this value is set before
+ // mi is assigned for each block in the current superblock.
+ for (int j = 0;
+ j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); j++) {
+ for (int k = 0;
+ k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); k++) {
+ cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
+ .delta_lf_from_base =
+ clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
+ .delta_lf[lf_id] =
+ clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ }
+ }
+ }
+ }
+
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ x->source_variance = UINT_MAX;
+ if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+ const BLOCK_SIZE bsize =
+ seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
+ pc_root);
+ } else if (cpi->partition_search_skippable_frame) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+ const BLOCK_SIZE bsize =
+ get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
+ pc_root);
+ } else {
+ // If required, set upper and lower partition size limits.
+ if (sf->auto_min_max_partition_size) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+ rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+ &x->min_partition_size, &x->max_partition_size);
+ }
+
+ reset_partition(pc_root, cm->seq_params.sb_size);
+ x->use_cb_search_range = 0;
+ init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
+ // Do the first pass if two-pass partition search is needed.
+ if (cpi->sf.two_pass_partition_search &&
+ cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 &&
+ mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
+ mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
+ cm->frame_type != KEY_FRAME) {
+ x->cb_partition_scan = 1;
+ // Reset the stats tables.
+ if (sf->mode_pruning_based_on_two_pass_partition_search)
+ av1_zero(x->first_partition_pass_stats);
+ clear_pc_tree_stats(pc_root);
+ rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
+ pc_root, NULL);
+ x->cb_partition_scan = 0;
+
+ x->source_variance = UINT_MAX;
+ if (sf->adaptive_pred_interp_filter) {
+ for (int i = 0; i < leaf_nodes; ++i) {
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
+ x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+ av1_zero(x->txb_rd_record_8X8);
+ av1_zero(x->txb_rd_record_16X16);
+ av1_zero(x->txb_rd_record_32X32);
+ av1_zero(x->txb_rd_record_64X64);
+ av1_zero(x->txb_rd_record_intra);
+ av1_zero(x->pred_mv);
+ pc_root->index = 0;
+
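+ // Clear the mi grid for this superblock so that the final partition
+ // search below starts from a clean state.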
+ for (int idy = 0; idy < mi_size_high[cm->seq_params.sb_size]; ++idy) {
+ for (int idx = 0; idx < mi_size_wide[cm->seq_params.sb_size]; ++idx) {
+ const int offset = cm->mi_stride * (mi_row + idy) + (mi_col + idx);
+ cm->mi_grid_visible[offset] = 0;
+ }
+ }
+
+ x->use_cb_search_range = 1;
+
+ if (sf->mode_pruning_based_on_two_pass_partition_search) {
+ for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+ FIRST_PARTITION_PASS_STATS *const stat =
+ &x->first_partition_pass_stats[i];
+ if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
+ // If not enough samples were collected, keep all reference frames
+ // available.
+ memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
+ memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
+ } else if (sf->selective_ref_frame < 2) {
+ // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
+ // initial partition scan, so we don't eliminate them.
+ stat->ref0_counts[ALTREF2_FRAME] = 0xff;
+ stat->ref1_counts[ALTREF2_FRAME] = 0xff;
+ stat->ref0_counts[BWDREF_FRAME] = 0xff;
+ stat->ref1_counts[BWDREF_FRAME] = 0xff;
+ }
+ }
+ }
+ }
+
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root,
+ NULL);
+ }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
+ if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+ cm->tile_rows == 1) {
+ av1_inter_mode_data_fit(tile_data, x->rdmult);
+ }
+#endif
+ }
+}
+
+static void init_encode_frame_mb_context(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // Copy data over into macro block data structures.
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
+
+ av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y, num_planes);
+}
+
+static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
+ if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
+ // We will not update the golden frame with an internal overlay frame
+ else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
+ cpi->rc.is_src_frame_ext_arf)
+ return ALTREF_FRAME;
+ else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+ cpi->refresh_alt_ref_frame)
+ return GOLDEN_FRAME;
+ else
+ // TODO(zoeliu): Investigate whether a frame_type other than
+ // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
+ return LAST_FRAME;
+}
+
+static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
+ if (cpi->common.coded_lossless) return ONLY_4X4;
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
+ return TX_MODE_LARGEST;
+ else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
+ cpi->sf.tx_size_search_method == USE_FAST_RD)
+ return TX_MODE_SELECT;
+ else
+ return cpi->common.tx_mode;
+}
+
+void av1_alloc_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tile_col, tile_row;
+
+ if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ cpi->allocated_tiles = tile_cols * tile_rows;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ int i, j;
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ for (j = 0; j < MAX_MODES; ++j) {
+ tile_data->thresh_freq_fact[i][j] = 32;
+ tile_data->mode_map[i][j] = j;
+ }
+ }
+ }
+}
+
+void av1_init_tile_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tile_col, tile_row;
+ TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+ TOKENLIST *tplist = cpi->tplist[0][0];
+ unsigned int tile_tok = 0;
+ int tplist_count = 0;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *const tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileInfo *const tile_info = &tile_data->tile_info;
+ av1_tile_init(tile_info, cm, tile_row, tile_col);
+
+ cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+ pre_tok = cpi->tile_tok[tile_row][tile_col];
+ tile_tok = allocated_tokens(
+ *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+ cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
+ tplist = cpi->tplist[tile_row][tile_col];
+ tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
+ tile_data->allow_update_cdf = !cm->large_scale_tile;
+ tile_data->allow_update_cdf =
+ tile_data->allow_update_cdf && !cm->disable_cdf_update;
+ }
+ }
+}
+
+void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col, int mi_row) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int tile_cols = cm->tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ TOKENEXTRA *tok = NULL;
+ int sb_row_in_tile;
+ int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+
+ int num_mb_rows_in_sb =
+ ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
+
+ sb_row_in_tile =
+ (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2;
+
+ get_start_tok(cpi, tile_row, tile_col, mi_row, &tok,
+ cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
+
+ encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
+ (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop -
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start);
+
+ assert(
+ (unsigned int)(tok -
+ cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <=
+ get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
+ cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes));
+
+ (void)tile_mb_cols;
+ (void)num_mb_rows_in_sb;
+}
+
+void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ int mi_row;
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ av1_inter_mode_data_init(this_tile);
+#endif
+
+ av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
+ tile_info->mi_col_end, tile_row);
+ av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
+
+ // Set up pointers to per thread motion search counters.
+ this_tile->m_search_count = 0; // Count of motion search hits.
+ this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
+ td->mb.m_search_count_ptr = &this_tile->m_search_count;
+ td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
+ this_tile->tctx = *cm->fc;
+ td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+
+ cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
+
+ av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator);
+
+ td->intrabc_used_this_tile = 0;
+
+ for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += cm->seq_params.mib_size) {
+ av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
+ }
+}
+
+static void encode_tiles(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int tile_col, tile_row;
+
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
+ av1_alloc_tile_data(cpi);
+
+ av1_init_tile_data(cpi);
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+ cpi->intrabc_used |= cpi->td.intrabc_used_this_tile;
+ }
+ }
+}
+
+#if CONFIG_FP_MB_STATS
+static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
+ AV1_COMMON *cm, uint8_t **this_frame_mb_stats) {
+ uint8_t *mb_stats_in = firstpass_mb_stats->mb_stats_start +
+ cm->current_video_frame * cm->MBs * sizeof(uint8_t);
+
+ if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF;
+
+ *this_frame_mb_stats = mb_stats_in;
+
+ return 1;
+}
+#endif
+
+#define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search
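+// Returns the rate cost, in AV1_PROB_COST_SHIFT precision, of coding the
+// global motion parameters gm relative to the reference parameters ref_gm.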
+static int gm_get_params_cost(const WarpedMotionParams *gm,
+ const WarpedMotionParams *ref_gm, int allow_hp) {
+ int params_cost = 0;
+ int trans_bits, trans_prec_diff;
+ switch (gm->wmtype) {
+ case AFFINE:
+ case ROTZOOM:
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ if (gm->wmtype >= AFFINE) {
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ trans_bits = (gm->wmtype == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_prec_diff = (gm->wmtype == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[0] >> trans_prec_diff),
+ (gm->wmmat[0] >> trans_prec_diff));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[1] >> trans_prec_diff),
+ (gm->wmmat[1] >> trans_prec_diff));
+ AOM_FALLTHROUGH_INTENDED;
+ case IDENTITY: break;
+ default: assert(0);
+ }
+ return (params_cost << AV1_PROB_COST_SHIFT);
+}
+
+static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm,
+ int frame) {
+ (void)num_refs_using_gm;
+ (void)frame;
+ switch (sf->gm_search_type) {
+ case GM_FULL_SEARCH: return 1;
+ case GM_REDUCED_REF_SEARCH:
+ return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
+ case GM_DISABLE_SEARCH: return 0;
+ default: assert(0);
+ }
+ return 1;
+}
+
+// Estimate whether the source frame is screen content, based on the
+// proportion of 16x16 blocks that have no more than 4 (experimentally
+// selected) distinct luma colors.
+static int is_screen_content(const uint8_t *src, int use_hbd, int bd,
+ int stride, int width, int height) {
+ assert(src != NULL);
+ int counts = 0;
+ const int blk_w = 16;
+ const int blk_h = 16;
+ const int limit = 4;
+ for (int r = 0; r + blk_h <= height; r += blk_h) {
+ for (int c = 0; c + blk_w <= width; c += blk_w) {
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ const int n_colors =
+ use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w,
+ blk_h, bd, count_buf)
+ : av1_count_colors(src + r * stride + c, stride, blk_w, blk_h,
+ count_buf);
+ if (n_colors > 1 && n_colors <= limit) counts++;
+ }
+ }
+  // Flag the frame as screen content if such blocks cover more than 10% of
+  // the frame area.
+ return counts * blk_h * blk_w * 10 > width * height;
+}
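+
+/* Illustrative sketch (not part of the aom sources): the decision above flags
+ * the frame as screen content when the 16x16 blocks having 2..4 distinct luma
+ * values cover more than 10% of the frame area. The same integer inequality
+ * in isolation, with hypothetical inputs:
+ */
+static INLINE int example_screen_content_decision(int low_color_blocks,
+                                                  int width, int height) {
+  const int blk_area = 16 * 16;
+  // low_color_blocks * blk_area is the area covered by low-color blocks;
+  // compare it against 10% of the frame area in pure integer arithmetic.
+  return low_color_blocks * blk_area * 10 > width * height;
+}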
+
+static const uint8_t ref_frame_flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+
+// Limit the number of active references for any given frame to
+// (INTER_REFS_PER_FRAME - 1).
+static void enforce_max_ref_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+ int total_valid_refs = 0;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])
+ total_valid_refs++;
+ }
+
+  // NOTE(zoeliu): When all the possible reference frames are available, we
+  // reduce the number of reference frames by 1, following these rules:
+  // (1) Retain GOLDEN_FRAME / ALTREF_FRAME;
+  // (2) Check the two earliest remaining reference frames and remove the one
+  //     coded with the lower quality factor; if both were coded at the same
+  //     quality level, remove the earlier reference frame.
+
+ if (total_valid_refs == INTER_REFS_PER_FRAME) {
+ unsigned int min_ref_offset = UINT_MAX;
+ unsigned int second_min_ref_offset = UINT_MAX;
+ MV_REFERENCE_FRAME earliest_ref_frames[2] = { LAST3_FRAME, LAST2_FRAME };
+ int earliest_buf_idxes[2] = { 0 };
+
+ // Locate the earliest two reference frames except GOLDEN/ALTREF.
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      // Retain GOLDEN/ALTREF.
+ if (ref_frame == GOLDEN_FRAME || ref_frame == ALTREF_FRAME) continue;
+
+ const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+ if (buf_idx >= 0) {
+ const unsigned int ref_offset =
+ cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+
+ if (min_ref_offset == UINT_MAX) {
+ min_ref_offset = ref_offset;
+ earliest_ref_frames[0] = ref_frame;
+ earliest_buf_idxes[0] = buf_idx;
+ } else {
+ if (get_relative_dist(cm, ref_offset, min_ref_offset) < 0) {
+ second_min_ref_offset = min_ref_offset;
+ earliest_ref_frames[1] = earliest_ref_frames[0];
+ earliest_buf_idxes[1] = earliest_buf_idxes[0];
+
+ min_ref_offset = ref_offset;
+ earliest_ref_frames[0] = ref_frame;
+ earliest_buf_idxes[0] = buf_idx;
+ } else if (second_min_ref_offset == UINT_MAX ||
+ get_relative_dist(cm, ref_offset, second_min_ref_offset) <
+ 0) {
+ second_min_ref_offset = ref_offset;
+ earliest_ref_frames[1] = ref_frame;
+ earliest_buf_idxes[1] = buf_idx;
+ }
+ }
+ }
+ }
+ // Check the coding quality factors of the two earliest reference frames.
+ RATE_FACTOR_LEVEL ref_rf_level[2];
+ double ref_rf_deltas[2];
+ for (int i = 0; i < 2; ++i) {
+ ref_rf_level[i] = cpi->frame_rf_level[earliest_buf_idxes[i]];
+ ref_rf_deltas[i] = rate_factor_deltas[ref_rf_level[i]];
+ }
+ (void)ref_rf_level;
+ (void)ref_rf_deltas;
+
+#define USE_RF_LEVEL_TO_ENFORCE 1
+#if USE_RF_LEVEL_TO_ENFORCE
+    // If the two earliest reference frames were coded with the same rate
+    // factor, disable the earlier one; otherwise disable the reference frame
+    // that uses the lower rate-factor delta.
+ const MV_REFERENCE_FRAME ref_frame_to_disable =
+ (ref_rf_deltas[0] <= ref_rf_deltas[1]) ? earliest_ref_frames[0]
+ : earliest_ref_frames[1];
+#else
+ // Always disable the earliest reference frame
+ const MV_REFERENCE_FRAME ref_frame_to_disable = earliest_ref_frames[0];
+#endif // USE_RF_LEVEL_TO_ENFORCE
+#undef USE_RF_LEVEL_TO_ENFORCE
+
+ switch (ref_frame_to_disable) {
+ case LAST_FRAME: cpi->ref_frame_flags &= ~AOM_LAST_FLAG; break;
+ case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+ case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+ case BWDREF_FRAME: cpi->ref_frame_flags &= ~AOM_BWD_FLAG; break;
+ case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+ default: break;
+ }
+ }
+}
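+
+/* Illustrative sketch (not part of the aom sources): the core of the pruning
+ * above is a single pass that remembers the two smallest frame offsets seen
+ * so far; the frame to drop is then picked from those two. A stand-alone
+ * analogue over a plain offset array (hypothetical helper, ignoring the
+ * order-hint wraparound that get_relative_dist() handles in the real code):
+ */
+static INLINE void example_two_earliest(const unsigned int *offsets, int n,
+                                        int *earliest, int *second_earliest) {
+  unsigned int min_off = UINT_MAX, second_min_off = UINT_MAX;
+  *earliest = *second_earliest = -1;
+  for (int i = 0; i < n; ++i) {
+    if (offsets[i] < min_off) {
+      // New minimum: the previous minimum becomes the second earliest.
+      second_min_off = min_off;
+      *second_earliest = *earliest;
+      min_off = offsets[i];
+      *earliest = i;
+    } else if (offsets[i] < second_min_off) {
+      second_min_off = offsets[i];
+      *second_earliest = i;
+    }
+  }
+}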
+
+static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) {
+ assert(!frame_is_intra_only(cm));
+
+ int one_sided_refs = 1;
+ for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+ const int buf_idx = cm->frame_refs[ref].idx;
+ if (buf_idx == INVALID_IDX) continue;
+
+ const int ref_offset =
+ cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ if (get_relative_dist(cm, ref_offset, (int)cm->frame_offset) > 0) {
+ one_sided_refs = 0; // bwd reference
+ break;
+ }
+ }
+ return one_sided_refs;
+}
+
+static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm,
+ int ref_offset[2]) {
+ ref_offset[0] = ref_offset[1] = 0;
+ if (!cm->is_skip_mode_allowed) return;
+
+ const int buf_idx_0 = cm->frame_refs[cm->ref_frame_idx_0].idx;
+ const int buf_idx_1 = cm->frame_refs[cm->ref_frame_idx_1].idx;
+ assert(buf_idx_0 != INVALID_IDX && buf_idx_1 != INVALID_IDX);
+
+ ref_offset[0] = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+ ref_offset[1] = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset;
+}
+
+static int check_skip_mode_enabled(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ av1_setup_skip_mode_allowed(cm);
+ if (!cm->is_skip_mode_allowed) return 0;
+
+  // Turn off skip mode if the temporal distances from the reference pair to
+  // the current frame differ by more than 1 frame.
+ const int cur_offset = (int)cm->frame_offset;
+ int ref_offset[2];
+ get_skip_mode_ref_offsets(cm, ref_offset);
+ const int cur_to_ref0 = get_relative_dist(cm, cur_offset, ref_offset[0]);
+ const int cur_to_ref1 = abs(get_relative_dist(cm, cur_offset, ref_offset[1]));
+ if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0;
+
+ // High Latency: Turn off skip mode if all refs are fwd.
+ if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0;
+
+ static const int flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+ const int ref_frame[2] = { cm->ref_frame_idx_0 + LAST_FRAME,
+ cm->ref_frame_idx_1 + LAST_FRAME };
+ if (!(cpi->ref_frame_flags & flag_list[ref_frame[0]]) ||
+ !(cpi->ref_frame_flags & flag_list[ref_frame[1]]))
+ return 0;
+
+ return 1;
+}
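+
+/* Illustrative sketch (not part of the aom sources): ignoring order-hint
+ * wraparound, the distance test above keeps skip mode only when the two
+ * skip-mode references sit (nearly) symmetrically around the current frame:
+ */
+static INLINE int example_skip_mode_distance_ok(int cur, int ref0, int ref1) {
+  const int d0 = cur - ref0;       // typically a past reference: positive
+  const int d1 = abs(cur - ref1);  // typically a future reference: folded
+  // e.g. cur=10, ref0=8, ref1=12 -> d0=2, d1=2 -> kept;
+  //      cur=10, ref0=6, ref1=12 -> d0=4, d1=2 -> rejected.
+  return abs(d0 - d1) <= 1;
+}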
+
+// Function to decide if we can skip the global motion parameter computation
+// for a particular ref frame
+static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
+ if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) &&
+ cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) {
+ return get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME],
+ cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0;
+ }
+ return 0;
+}
+
+static void set_default_interp_skip_flags(AV1_COMP *cpi) {
+ const int num_planes = av1_num_planes(&cpi->common);
+ cpi->default_interp_skip_flags = (num_planes == 1)
+ ? DEFAULT_LUMA_INTERP_SKIP_FLAG
+ : DEFAULT_INTERP_SKIP_FLAG;
+}
+
+static void encode_frame_internal(AV1_COMP *cpi) {
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ int i;
+
+ x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
+ x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
+#if CONFIG_DIST_8X8
+ x->using_dist_8x8 = cpi->oxcf.using_dist_8x8;
+ x->tune_metric = cpi->oxcf.tuning;
+#endif
+ cm->setup_mi(cm);
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+ av1_zero(*td->counts);
+ av1_zero(rdc->comp_pred_diff);
+
+ if (frame_is_intra_only(cm)) {
+ if (cm->seq_params.force_screen_content_tools == 2) {
+ cm->allow_screen_content_tools =
+ cpi->oxcf.content == AOM_CONTENT_SCREEN ||
+ is_screen_content(cpi->source->y_buffer,
+ cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+ cpi->source->y_stride, cpi->source->y_width,
+ cpi->source->y_height);
+ } else {
+ cm->allow_screen_content_tools =
+ cm->seq_params.force_screen_content_tools;
+ }
+ }
+
+ // Allow intrabc when screen content tools are enabled.
+ cm->allow_intrabc = cm->allow_screen_content_tools;
+ // Reset the flag.
+ cpi->intrabc_used = 0;
+ // Need to disable intrabc when superres is selected
+ if (av1_superres_scaled(cm)) {
+ cm->allow_intrabc = 0;
+ }
+
+ if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) {
+ // add to hash table
+ const int pic_width = cpi->source->y_crop_width;
+ const int pic_height = cpi->source->y_crop_height;
+ uint32_t *block_hash_values[2][2];
+ int8_t *is_block_same[2][3];
+ int k, j;
+
+ for (k = 0; k < 2; k++) {
+ for (j = 0; j < 2; j++) {
+ CHECK_MEM_ERROR(cm, block_hash_values[k][j],
+ aom_malloc(sizeof(uint32_t) * pic_width * pic_height));
+ }
+
+ for (j = 0; j < 3; j++) {
+ CHECK_MEM_ERROR(cm, is_block_same[k][j],
+ aom_malloc(sizeof(int8_t) * pic_width * pic_height));
+ }
+ }
+
+ av1_hash_table_create(&cm->cur_frame->hash_table);
+ av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0],
+ is_block_same[0], &cpi->td.mb);
+ av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0],
+ block_hash_values[1], is_block_same[0],
+ is_block_same[1], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+ pic_width, pic_height, 4);
+ av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1],
+ block_hash_values[0], is_block_same[1],
+ is_block_same[0], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+ pic_width, pic_height, 8);
+ av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0],
+ block_hash_values[1], is_block_same[0],
+ is_block_same[1], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+ pic_width, pic_height, 16);
+ av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1],
+ block_hash_values[0], is_block_same[1],
+ is_block_same[0], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+ pic_width, pic_height, 32);
+ av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0],
+ block_hash_values[1], is_block_same[0],
+ is_block_same[1], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+ pic_width, pic_height, 64);
+
+ av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1],
+ block_hash_values[0], is_block_same[1],
+ is_block_same[0], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+ pic_width, pic_height, 128);
+
+ for (k = 0; k < 2; k++) {
+ for (j = 0; j < 2; j++) {
+ aom_free(block_hash_values[k][j]);
+ }
+
+ for (j = 0; j < 3; j++) {
+ aom_free(is_block_same[k][j]);
+ }
+ }
+ }
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
+ : cm->base_qindex;
+ xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
+ cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+ cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
+ if (xd->lossless[i]) cpi->has_lossless_segment = 1;
+ xd->qindex[i] = qindex;
+ if (xd->lossless[i]) {
+ cpi->optimize_seg_arr[i] = 0;
+ } else {
+ cpi->optimize_seg_arr[i] = cpi->optimize_speed_feature;
+ }
+ }
+ cm->coded_lossless = is_coded_lossless(cm, xd);
+ cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+
+ cm->tx_mode = select_tx_mode(cpi);
+
+ // Fix delta q resolution for the moment
+ cm->delta_q_res = DEFAULT_DELTA_Q_RES;
+ // Set delta_q_present_flag before it is used for the first time
+ cm->delta_lf_res = DEFAULT_DELTA_LF_RES;
+ cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
+ cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
+ cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+ // update delta_q_present_flag and delta_lf_present_flag based on base_qindex
+ cm->delta_q_present_flag &= cm->base_qindex > 0;
+ cm->delta_lf_present_flag &= cm->base_qindex > 0;
+
+ av1_frame_init_quantizer(cpi);
+
+ av1_initialize_rd_consts(cpi);
+ av1_initialize_me_consts(cpi, x, cm->base_qindex);
+ init_encode_frame_mb_context(cpi);
+ set_default_interp_skip_flags(cpi);
+ if (cm->prev_frame)
+ cm->last_frame_seg_map = cm->prev_frame->seg_map;
+ else
+ cm->last_frame_seg_map = NULL;
+ cm->current_frame_seg_map = cm->cur_frame->seg_map;
+ if (cm->allow_intrabc || cm->coded_lossless) {
+ av1_set_default_ref_deltas(cm->lf.ref_deltas);
+ av1_set_default_mode_deltas(cm->lf.mode_deltas);
+ } else if (cm->prev_frame) {
+ memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+ memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
+ memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+
+ // Special case: set prev_mi to NULL when the previous mode info
+ // context cannot be used.
+ cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL;
+
+ x->txb_split_count = 0;
+
+ av1_zero(rdc->global_motion_used);
+ av1_zero(cpi->gmparams_cost);
+#if !CONFIG_GLOBAL_MOTION_SEARCH
+ cpi->global_motion_search_done = 1;
+#endif // !CONFIG_GLOBAL_MOTION_SEARCH
+ if (cpi->common.frame_type == INTER_FRAME && cpi->source &&
+ !cpi->global_motion_search_done) {
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
+ int frame;
+ double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)];
+ const double *params_this_motion;
+ int inliers_by_motion[RANSAC_NUM_MOTIONS];
+ WarpedMotionParams tmp_wm_params;
+ static const double kIdentityParams[MAX_PARAMDIM - 1] = {
+ 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
+ };
+ int num_refs_using_gm = 0;
+
+ for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
+ ref_buf[frame] = get_ref_frame_buffer(cpi, frame);
+ int pframe;
+ cm->global_motion[frame] = default_warp_params;
+ const WarpedMotionParams *ref_params =
+ cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+ : &default_warp_params;
+ // check for duplicate buffer
+ for (pframe = ALTREF_FRAME; pframe > frame; --pframe) {
+ if (ref_buf[frame] == ref_buf[pframe]) break;
+ }
+ if (pframe > frame) {
+ memcpy(&cm->global_motion[frame], &cm->global_motion[pframe],
+ sizeof(WarpedMotionParams));
+ } else if (ref_buf[frame] &&
+ ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
+ ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
+ do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) &&
+ !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
+ TransformationType model;
+ const int64_t ref_frame_error =
+ av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+ ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
+ cpi->source->y_buffer, cpi->source->y_width,
+ cpi->source->y_height, cpi->source->y_stride);
+
+ if (ref_frame_error == 0) continue;
+
+ aom_clear_system_state();
+ for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
+ int64_t best_warp_error = INT64_MAX;
+ // Initially set all params to identity.
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ memcpy(params_by_motion + (MAX_PARAMDIM - 1) * i, kIdentityParams,
+ (MAX_PARAMDIM - 1) * sizeof(*params_by_motion));
+ }
+
+ compute_global_motion_feature_based(
+ model, cpi->source, ref_buf[frame],
+ cpi->common.seq_params.bit_depth, inliers_by_motion,
+ params_by_motion, RANSAC_NUM_MOTIONS);
+
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ if (inliers_by_motion[i] == 0) continue;
+
+ params_this_motion = params_by_motion + (MAX_PARAMDIM - 1) * i;
+ convert_model_to_params(params_this_motion, &tmp_wm_params);
+
+ if (tmp_wm_params.wmtype != IDENTITY) {
+ const int64_t warp_error = refine_integerized_param(
+ &tmp_wm_params, tmp_wm_params.wmtype,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+ ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
+ ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
+ cpi->source->y_buffer, cpi->source->y_width,
+ cpi->source->y_height, cpi->source->y_stride, 5,
+ best_warp_error);
+ if (warp_error < best_warp_error) {
+ best_warp_error = warp_error;
+ // Save the wm_params modified by refine_integerized_param()
+ // rather than motion index to avoid rerunning refine() below.
+ memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+ sizeof(WarpedMotionParams));
+ }
+ }
+ }
+ if (cm->global_motion[frame].wmtype <= AFFINE)
+ if (!get_shear_params(&cm->global_motion[frame]))
+ cm->global_motion[frame] = default_warp_params;
+
+ if (cm->global_motion[frame].wmtype == TRANSLATION) {
+ cm->global_motion[frame].wmmat[0] =
+ convert_to_trans_prec(cm->allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[0]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ cm->global_motion[frame].wmmat[1] =
+ convert_to_trans_prec(cm->allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[1]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ }
+
+ // If the best error advantage found doesn't meet the threshold for
+ // this motion type, revert to IDENTITY.
+ if (!is_enough_erroradvantage(
+ (double)best_warp_error / ref_frame_error,
+ gm_get_params_cost(&cm->global_motion[frame], ref_params,
+ cm->allow_high_precision_mv),
+ cpi->sf.gm_erroradv_type)) {
+ cm->global_motion[frame] = default_warp_params;
+ }
+ if (cm->global_motion[frame].wmtype != IDENTITY) break;
+ }
+ aom_clear_system_state();
+ }
+ if (cm->global_motion[frame].wmtype != IDENTITY) num_refs_using_gm++;
+ cpi->gmparams_cost[frame] =
+ gm_get_params_cost(&cm->global_motion[frame], ref_params,
+ cm->allow_high_precision_mv) +
+ cpi->gmtype_cost[cm->global_motion[frame].wmtype] -
+ cpi->gmtype_cost[IDENTITY];
+ }
+ // clear disabled ref_frames
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ const int ref_disabled =
+ !(cpi->ref_frame_flags & ref_frame_flag_list[frame]);
+ if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) {
+ cpi->gmparams_cost[frame] = 0;
+ cm->global_motion[frame] = default_warp_params;
+ }
+ }
+ cpi->global_motion_search_done = 1;
+ }
+ memcpy(cm->cur_frame->global_motion, cm->global_motion,
+ REF_FRAMES * sizeof(WarpedMotionParams));
+
+ av1_setup_motion_field(cm);
+
+ cpi->all_one_sided_refs =
+ frame_is_intra_only(cm) ? 0 : av1_refs_are_one_sided(cm);
+
+ cm->skip_mode_flag = check_skip_mode_enabled(cpi);
+
+ {
+ struct aom_usec_timer emr_timer;
+ aom_usec_timer_start(&emr_timer);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
+ &cpi->twopass.this_frame_mb_stats);
+ }
+#endif
+
+ if (cpi->row_mt && (cpi->oxcf.max_threads > 1))
+ av1_encode_tiles_mt(cpi);
+ else if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1)
+ av1_encode_tiles_mt(cpi);
+ else
+ encode_tiles(cpi);
+
+ aom_usec_timer_mark(&emr_timer);
+ cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
+ }
+
+ // If intrabc is allowed but never selected, reset the allow_intrabc flag.
+ if (cm->allow_intrabc && !cpi->intrabc_used) cm->allow_intrabc = 0;
+ if (cm->allow_intrabc) cm->delta_lf_present_flag = 0;
+}
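+
+/* Illustrative sketch (not part of the aom sources): the global-motion loop
+ * above keeps, per reference, the candidate warp with the lowest refinement
+ * error and falls back to the identity model when that error is not
+ * sufficiently below the unwarped reference error. A simplified acceptance
+ * test (the real is_enough_erroradvantage() also weighs the parameter
+ * signalling cost and a speed-feature-dependent threshold):
+ */
+static INLINE int example_accept_global_motion(int64_t best_warp_error,
+                                               int64_t ref_frame_error,
+                                               double max_error_ratio) {
+  // Accept the warp only when its error is a sufficiently small fraction of
+  // the error obtained without warping; otherwise revert to identity.
+  return ref_frame_error > 0 &&
+         (double)best_warp_error < max_error_ratio * (double)ref_frame_error;
+}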
+
+void av1_encode_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ // Indicates whether or not to use a default reduced set for ext-tx
+ // rather than the potential full set of 16 transforms
+ cm->reduced_tx_set_used = 0;
+
+ if (cm->show_frame == 0) {
+ int arf_offset = AOMMIN(
+ (MAX_GF_INTERVAL - 1),
+ cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
+ int brf_offset =
+ cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
+ arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
+ cm->frame_offset = cm->current_video_frame + arf_offset;
+ } else {
+ cm->frame_offset = cm->current_video_frame;
+ }
+ cm->frame_offset %= (1 << (cm->seq_params.order_hint_bits_minus_1 + 1));
+
+ // Make sure segment_id is no larger than last_active_segid.
+ if (cm->seg.enabled && cm->seg.update_map) {
+ const int mi_rows = cm->mi_rows;
+ const int mi_cols = cm->mi_cols;
+ const int last_active_segid = cm->seg.last_active_segid;
+ uint8_t *map = cpi->segmentation_map;
+ for (int mi_row = 0; mi_row < mi_rows; ++mi_row) {
+ for (int mi_col = 0; mi_col < mi_cols; ++mi_col) {
+ map[mi_col] = AOMMIN(map[mi_col], last_active_segid);
+ }
+ map += mi_cols;
+ }
+ }
+
+ av1_setup_frame_buf_refs(cm);
+ if (cpi->sf.selective_ref_frame >= 2) enforce_max_ref_frames(cpi);
+ av1_setup_frame_sign_bias(cm);
+
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_reset_frame(num_planes);
+#else
+ (void)num_planes;
+#endif
+
+ cpi->allow_comp_inter_inter = !frame_is_intra_only(cm);
+
+ if (cpi->sf.frame_parameter_update) {
+ int i;
+ RD_OPT *const rd_opt = &cpi->rd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+
+ // This code does a single RD pass over the whole frame assuming
+ // either compound, single or hybrid prediction as per whatever has
+ // worked best for that type of frame in the past.
+ // It also predicts whether another coding mode would have worked
+ // better than this coding mode. If that is the case, it remembers
+ // that for subsequent frames.
+ // It does the same analysis for transform size selection also.
+ //
+    // TODO(zoeliu): Investigate whether a frame_type other than
+    // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
+ const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
+ int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
+ const int is_alt_ref = frame_type == ALTREF_FRAME;
+
+ /* prediction (compound, single or hybrid) mode selection */
+ // NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
+ cm->reference_mode = SINGLE_REFERENCE;
+ else
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+
+ cm->interp_filter = SWITCHABLE;
+ if (cm->large_scale_tile) cm->interp_filter = EIGHTTAP_REGULAR;
+
+ cm->switchable_motion_mode = 1;
+
+ rdc->compound_ref_used_flag = 0;
+ rdc->skip_mode_used_flag = 0;
+
+ encode_frame_internal(cpi);
+
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ // Use a flag that includes 4x4 blocks
+ if (rdc->compound_ref_used_flag == 0) {
+ cm->reference_mode = SINGLE_REFERENCE;
+#if CONFIG_ENTROPY_STATS
+ av1_zero(cpi->td.counts->comp_inter);
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+    // Re-check the skip mode status, as the reference mode may have changed.
+ if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) {
+ cm->is_skip_mode_allowed = 0;
+ cm->skip_mode_flag = 0;
+ }
+ if (cm->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+ cm->skip_mode_flag = 0;
+
+ if (!cm->large_scale_tile) {
+ if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
+ cm->tx_mode = TX_MODE_LARGEST;
+ }
+ } else {
+ encode_frame_internal(cpi);
+ }
+}
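+
+/* Illustrative sketch (not part of the aom sources): frame_offset in
+ * av1_encode_frame() is kept inside the order-hint range with a power-of-two
+ * modulo. With order_hint_bits = 7 (order_hint_bits_minus_1 = 6) the offsets
+ * wrap at 128:
+ */
+static INLINE unsigned int example_wrap_order_hint(unsigned int offset,
+                                                   int order_hint_bits) {
+  // e.g. example_wrap_order_hint(130, 7) == 2
+  return offset % (1u << order_hint_bits);
+}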
+
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->sb_type, tx_size);
+ const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+ assert(tx_size > TX_4X4);
+
+ if (depth == MAX_VARTX_DEPTH) {
+ // Don't add to counts in this case
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->txfm_partition[ctx][0];
+#endif
+ if (allow_update_cdf)
+ update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->txfm_partition[ctx][1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+ ++x->txb_split_count;
+
+ if (sub_txs == TX_4X4) {
+ mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+ return;
+ }
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = row;
+ int offsetc = col;
+
+ update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+ blk_col + offsetc, allow_update_cdf);
+ }
+ }
+ }
+}
+
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+ BLOCK_SIZE plane_bsize, int mi_row,
+ int mi_col, FRAME_COUNTS *td_counts,
+ uint8_t allow_update_cdf) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+
+ xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ for (idy = 0; idy < mi_height; idy += bh)
+ for (idx = 0; idx < mi_width; idx += bw)
+ update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+ allow_update_cdf);
+}
+
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+ int blk_col) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+
+ } else {
+ if (tx_size == TX_8X8) {
+ mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+ return;
+ }
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+ set_txfm_context(xd, sub_txs, offsetr, offsetc);
+ }
+ }
+ }
+}
+
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
+ int mi_row, int mi_col) {
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+
+ xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ for (idy = 0; idy < mi_height; idy += bh)
+ for (idx = 0; idx < mi_width; idx += bw)
+ set_txfm_context(xd, max_tx_size, idy, idx);
+}
+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO **mi_4x4 = xd->mi;
+ MB_MODE_INFO *mbmi = mi_4x4[0];
+ const int seg_skip =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int is_inter = is_inter_block(mbmi);
+
+ if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
+ x->cb_partition_scan) {
+ for (int row = mi_row; row < mi_row + mi_width;
+ row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+ for (int col = mi_col; col < mi_col + mi_height;
+ col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+ const int index = av1_first_partition_pass_stats_index(row, col);
+ FIRST_PARTITION_PASS_STATS *const stats =
+ &x->first_partition_pass_stats[index];
+ // Increase the counter of data samples.
+ ++stats->sample_counts;
+ // Increase the counter for ref_frame[0] and ref_frame[1].
+ if (stats->ref0_counts[mbmi->ref_frame[0]] < 255)
+ ++stats->ref0_counts[mbmi->ref_frame[0]];
+ if (mbmi->ref_frame[1] >= 0 &&
+            stats->ref1_counts[mbmi->ref_frame[1]] < 255)
+ ++stats->ref1_counts[mbmi->ref_frame[1]];
+ }
+ }
+ }
+
+ if (!is_inter) {
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
+ xd->cfl.store_y = store_cfl_required(cm, xd);
+ mbmi->skip = 1;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ av1_encode_intra_block_plane(cpi, x, bsize, plane,
+ cpi->optimize_seg_arr[mbmi->segment_id],
+ mi_row, mi_col);
+ }
+
+    // If there is at least one lossless segment, force skip to 0 for intra
+    // blocks, in order to avoid the segment_id being changed by
+    // write_segment_id().
+ if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
+ cpi->has_lossless_segment)
+ mbmi->skip = 0;
+
+ xd->cfl.store_y = 0;
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+ for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
+ if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+ if (!dry_run) {
+ av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
+ PALETTE_MAP, tile_data->allow_update_cdf,
+ td->counts);
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+            *rate +=
+                av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
+ }
+ }
+ }
+ }
+
+ av1_update_txb_context(cpi, td, dry_run, bsize, rate, mi_row, mi_col,
+ tile_data->allow_update_cdf);
+ } else {
+ int ref;
+ const int is_compound = has_second_ref(mbmi);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+ assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+ av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ &xd->block_refs[ref]->sf, num_planes);
+ }
+
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ int pixel_c, pixel_r;
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+ pd->subsampling_x, pd->subsampling_y);
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
+ plane, pixel_c, pixel_r, pd->width,
+ pd->height,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+ }
+#else
+ (void)num_planes;
+#endif
+
+ av1_encode_sb(cpi, x, bsize, mi_row, mi_col, dry_run);
+ av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, bsize, rate,
+ tile_data->allow_update_cdf);
+ }
+
+ if (!dry_run) {
+ if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi))
+ td->intrabc_used_this_tile = 1;
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] &&
+ mbmi->sb_type > BLOCK_4X4 && !(is_inter && (mbmi->skip || seg_skip))) {
+ if (is_inter) {
+ tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts,
+ tile_data->allow_update_cdf);
+ } else {
+ if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
+ ++x->txb_split_count;
+ if (block_signals_txsize(bsize)) {
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+
+ if (tile_data->allow_update_cdf)
+ update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ depth, max_depths + 1);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+ }
+ }
+ assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
+ } else {
+ int i, j;
+ TX_SIZE intra_tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ intra_tx_size = TX_4X4;
+ } else {
+ intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+ }
+ } else {
+ intra_tx_size = mbmi->tx_size;
+ }
+
+ for (j = 0; j < mi_height; j++)
+ for (i = 0; i < mi_width; i++)
+ if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
+ mi_4x4[mis * j + i]->tx_size = intra_tx_size;
+
+ if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count;
+ }
+ }
+
+ if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type) &&
+ is_inter && !(mbmi->skip || seg_skip) &&
+ !xd->lossless[mbmi->segment_id]) {
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+ } else {
+ TX_SIZE tx_size = mbmi->tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter) {
+ if (xd->lossless[mbmi->segment_id]) {
+ tx_size = TX_4X4;
+ } else {
+ tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+ }
+ } else {
+ tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+ }
+ mbmi->tx_size = tx_size;
+ set_txfm_ctxs(tx_size, xd->n4_w, xd->n4_h,
+ (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
+ }
+ CFL_CTX *const cfl = &xd->cfl;
+ if (is_inter_block(mbmi) &&
+ !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
+ cfl->subsampling_y) &&
+ is_cfl_allowed(xd)) {
+ cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
+ }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
new file mode 100644
index 000000000..e8cf9b468
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DELTAQ_MODULATION 1 // 0: variance based, 1: wavelet AC energy based
+
+struct macroblock;
+struct yv12_buffer_config;
+struct AV1_COMP;
+struct ThreadData;
+
+void av1_setup_src_planes(struct macroblock *x,
+ const struct yv12_buffer_config *src, int mi_row,
+ int mi_col, const int num_planes);
+
+void av1_encode_frame(struct AV1_COMP *cpi);
+
+void av1_alloc_tile_data(struct AV1_COMP *cpi);
+void av1_init_tile_data(struct AV1_COMP *cpi);
+void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
+ int tile_col);
+void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
new file mode 100644
index 000000000..ad12577e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -0,0 +1,649 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+
+// Check if we need to use the plain C version of the block subtraction.
+static int check_subtract_block_size(int w, int h) { return w < 4 || h < 4; }
+
+static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src8, ptrdiff_t src_stride,
+ const uint8_t *pred8, ptrdiff_t pred_stride) {
+ if (check_subtract_block_size(rows, cols)) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
+ src_stride, pred8, pred_stride, xd->bd);
+ return;
+ }
+ aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+
+ return;
+ }
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+ pred8, pred_stride, xd->bd);
+ return;
+ }
+ aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+ pred_stride);
+}
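+
+/* Illustrative sketch (not part of the aom sources): for the low-bit-depth
+ * path the call above reduces to a per-pixel residual, diff = src - pred,
+ * over a rows x cols block with independent strides:
+ */
+static INLINE void example_subtract_block(int rows, int cols, int16_t *diff,
+                                          ptrdiff_t diff_stride,
+                                          const uint8_t *src,
+                                          ptrdiff_t src_stride,
+                                          const uint8_t *pred,
+                                          ptrdiff_t pred_stride) {
+  for (int r = 0; r < rows; ++r) {
+    for (int c = 0; c < cols; ++c) {
+      // The residual may be negative, hence the 16-bit signed destination.
+      diff[r * diff_stride + c] =
+          (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
+    }
+  }
+}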
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int tx1d_width = tx_size_wide[tx_size];
+ const int tx1d_height = tx_size_high[tx_size];
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ uint8_t *src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ int16_t *src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+ subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src,
+ src_stride, dst, dst_stride);
+}
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+}
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int fast_mode,
+ int *rate_cost) {
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ const int eob = p->eobs[block];
+ const int segment_id = xd->mi[0]->segment_id;
+
+ if (eob == 0 || !cpi->optimize_seg_arr[segment_id] ||
+ xd->lossless[segment_id]) {
+ *rate_cost = av1_cost_skip_txb(mb, txb_ctx, plane, tx_size);
+ return eob;
+ }
+
+ (void)fast_mode;
+ return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx,
+ rate_cost, cpi->oxcf.sharpness);
+}
+
+typedef enum QUANT_FUNC {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_HIGHBD = 1,
+ QUANT_FUNC_TYPES = 2
+} QUANT_FUNC;
+
+static AV1_QUANT_FACADE
+ quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+ { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+ { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+ { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+ { NULL, NULL }
+ };
+
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ AV1_XFORM_QUANT xform_quant_idx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = block_size_wide[plane_bsize];
+ int seg_id = mbmi->segment_id;
+ const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+ // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
+ const qm_val_t *qmatrix =
+ IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][qm_tx_size]
+ : cm->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+ const qm_val_t *iqmatrix =
+ IS_2D_TRANSFORM(tx_type)
+ ? pd->seg_iqmatrix[seg_id][qm_tx_size]
+ : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+
+ const int src_offset = (blk_row * diff_stride + blk_col);
+ const int16_t *src_diff = &p->src_diff[src_offset << tx_size_wide_log2[0]];
+ QUANT_PARAM qparam;
+ qparam.log_scale = av1_get_tx_scale(tx_size);
+ qparam.tx_size = tx_size;
+ qparam.qmatrix = qmatrix;
+ qparam.iqmatrix = iqmatrix;
+ TxfmParam txfm_param;
+ txfm_param.tx_type = tx_type;
+ txfm_param.tx_size = tx_size;
+ txfm_param.lossless = xd->lossless[mbmi->segment_id];
+ txfm_param.tx_set_type = av1_get_ext_tx_set_type(
+ txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used);
+
+ txfm_param.bd = xd->bd;
+ txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+
+ av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
+
+ if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ const int n_coeffs = av1_get_max_eob(tx_size);
+ if (LIKELY(!x->skip_block)) {
+ quant_func_list[xform_quant_idx][txfm_param.is_hbd](
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
+ } else {
+ av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
+ }
+ }
+  // NOTE: optimize_b_following == true means av1_optimize_b() will be called.
+  // If the condition for running optimize_b changes, this flag needs to be
+  // updated accordingly.
+ const int optimize_b_following =
+ (xform_quant_idx != AV1_XFORM_QUANT_FP) || (txfm_param.lossless);
+ if (optimize_b_following) {
+ p->txb_entropy_ctx[block] =
+ (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+ } else {
+ p->txb_entropy_ctx[block] = 0;
+ }
+ return;
+}
+
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg,
+ int mi_row, int mi_col, RUN_TYPE dry_run) {
+ (void)mi_row;
+ (void)mi_col;
+ (void)dry_run;
+ struct encode_b_args *const args = arg;
+ const AV1_COMMON *const cm = &args->cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *dst;
+ ENTROPY_CONTEXT *a, *l;
+ int dummy_rate_cost = 0;
+
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ dst = &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+
+ if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
+ TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ if (args->enable_optimize_b) {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, tx_type, AV1_XFORM_QUANT_FP);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+ &dummy_rate_cost);
+ } else {
+ av1_xform_quant(
+ cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+ }
+ } else {
+ p->eobs[block] = 0;
+ p->txb_entropy_ctx[block] = 0;
+ }
+
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ if (p->eobs[block]) {
+ *(args->skip) = 0;
+
+ TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ pd->dst.stride, p->eobs[block],
+ cm->reduced_tx_set_used);
+ }
+
+ if (p->eobs[block] == 0 && plane == 0) {
+ // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
+ // case. It is possible that certain collision in hash index would cause
+ // the assertion failure. To further optimize the rate-distortion
+ // performance, we need to re-visit this part and enable this assert
+ // again.
+#if 0
+ if (args->cpi->oxcf.aq_mode == NO_AQ &&
+ args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+ // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
+ // enable_optimize_b is true to detect potential RD bug.
+ const uint8_t disable_txk_check = args->enable_optimize_b;
+ if (!disable_txk_check) {
+ assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row,
+ blk_col)] == DCT_DCT);
+ }
+ }
+#endif
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ }
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+ pd->subsampling_x, pd->subsampling_y);
+ mismatch_record_block_tx(dst, pd->dst.stride, cm->frame_offset, plane,
+ pixel_c, pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
+}
+
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg, int mi_row, int mi_col,
+ RUN_TYPE dry_run) {
+ (void)mi_row;
+ (void)mi_col;
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+ if (!plane) {
+ assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] &&
+ tx_size_high[tx_size] >= tx_size_high[plane_tx_size]);
+ }
+
+ if (tx_size == plane_tx_size || plane) {
+ encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
+ mi_row, mi_col, dry_run);
+ } else {
+ assert(tx_size < TX_SIZES_ALL);
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+ assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+ // This is the square transform block partition entry point.
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+ arg, mi_row, mi_col, dry_run);
+ block += step;
+ }
+ }
+ }
+}
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Block and transform sizes, expressed as log2 of the number of 4x4 units
+  // ("*_b"): 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8. The transform size
+  // varies per plane, so look it up in a common way.
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
+ int i = 0, r, c;
+
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+
+ int blk_row, blk_col;
+
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += mu_blocks_high) {
+ const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high);
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (c = 0; c < max_blocks_wide; c += mu_blocks_wide) {
+ const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide);
+ for (blk_row = r; blk_row < unit_height; blk_row += txh_unit) {
+ for (blk_col = c; blk_col < unit_width; blk_col += txw_unit) {
+ visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg);
+ i += step;
+ }
+ }
+ }
+ }
+}
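+
+/* Illustrative sketch (not part of the aom sources): the visitor above walks
+ * the plane in txw_unit x txh_unit steps (all sizes in 4x4 units) and
+ * advances the linear block index by txw_unit * txh_unit per visit. A
+ * flattened analogue with a hypothetical callback type:
+ */
+typedef void (*example_tx_visitor)(int block, int blk_row, int blk_col,
+                                   void *arg);
+
+static INLINE void example_foreach_tx_block(int max_blocks_wide,
+                                            int max_blocks_high, int txw_unit,
+                                            int txh_unit,
+                                            example_tx_visitor visit,
+                                            void *arg) {
+  const int step = txw_unit * txh_unit;
+  int block = 0;
+  for (int blk_row = 0; blk_row < max_blocks_high; blk_row += txh_unit) {
+    for (int blk_col = 0; blk_col < max_blocks_wide; blk_col += txw_unit) {
+      visit(block, blk_row, blk_col, arg);
+      // Advance by the number of 4x4 units covered by one transform block.
+      block += step;
+    }
+  }
+}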
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg, const int num_planes) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ continue;
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
+ }
+}
+
+typedef struct encode_block_pass1_args {
+ AV1_COMMON *cm;
+ MACROBLOCK *x;
+} encode_block_pass1_args;
+
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ encode_block_pass1_args *args = (encode_block_pass1_args *)arg;
+ AV1_COMMON *cm = args->cm;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ TxfmParam txfm_param;
+ uint8_t *dst;
+ dst = &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ DCT_DCT, AV1_XFORM_QUANT_B);
+
+ if (p->eobs[block] > 0) {
+ txfm_param.bd = xd->bd;
+ txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+ txfm_param.tx_type = DCT_DCT;
+ txfm_param.tx_size = tx_size;
+ txfm_param.eob = p->eobs[block];
+ txfm_param.lossless = xd->lossless[xd->mi[0]->segment_id];
+ txfm_param.tx_set_type = av1_get_ext_tx_set_type(
+ txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used);
+ if (txfm_param.is_hbd) {
+ av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+ return;
+ }
+ av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+ }
+}
+
+void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
+ encode_block_pass1_args args = { cm, x };
+ av1_subtract_plane(x, bsize, 0);
+ av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+ encode_block_pass1, &args);
+}
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, RUN_TYPE dry_run) {
+ (void)dry_run;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx ctx;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct encode_b_args arg = { cpi,
+ x,
+ &ctx,
+ &mbmi->skip,
+ NULL,
+ NULL,
+ cpi->optimize_seg_arr[mbmi->segment_id] };
+ int plane;
+
+ mbmi->skip = 1;
+
+ if (x->skip) return;
+
+ for (plane = 0; plane < num_planes; ++plane) {
+ const int subsampling_x = xd->plane[plane].subsampling_x;
+ const int subsampling_y = xd->plane[plane].subsampling_y;
+
+ if (!is_chroma_reference(mi_row, mi_col, bsize, subsampling_x,
+ subsampling_y))
+ continue;
+
+ const BLOCK_SIZE bsizec =
+ scale_chroma_bsize(bsize, subsampling_x, subsampling_y);
+
+ // TODO(jingning): Clean this up.
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ const int bh = block_size_high[txb_size] >> tx_size_high_log2[0];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ av1_get_entropy_contexts(bsizec, pd, ctx.ta[plane], ctx.tl[plane]);
+
+ av1_subtract_plane(x, bsizec, plane);
+
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
+
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ int mu_blocks_wide =
+ block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high =
+ block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+ for (idy = 0; idy < mi_height; idy += mu_blocks_high) {
+ for (idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+ int blk_row, blk_col;
+ const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+ for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ encode_block_inter(plane, block, blk_row, blk_col, plane_bsize,
+ max_tx_size, &arg, mi_row, mi_col, dry_run);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void encode_block_intra_and_set_context(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ arg);
+
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *x = args->x;
+ ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ const AV1_COMMON *const cm = &args->cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ uint16_t *eob = &p->eobs[block];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ int dummy_rate_cost = 0;
+
+ av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) {
+ *eob = 0;
+ p->txb_entropy_ctx[block] = 0;
+ } else {
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+ const ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ const ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ if (args->enable_optimize_b) {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, tx_type, AV1_XFORM_QUANT_FP);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+ &dummy_rate_cost);
+ } else {
+ av1_xform_quant(
+ cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+ }
+ }
+
+ if (*eob) {
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ dst_stride, *eob, cm->reduced_tx_set_used);
+ }
+
+ if (*eob == 0 && plane == 0) {
+ // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
+ // It is possible that certain collision in hash index would cause
+ // the assertion failure. To further optimize the rate-distortion
+ // performance, we need to re-visit this part and enable this assert
+ // again.
+#if 0
+ if (args->cpi->oxcf.aq_mode == NO_AQ
+ && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+ assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row,
+ blk_col)] == DCT_DCT);
+ }
+#endif
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ }
+
+ // For intra mode, skipped blocks are so rare that transmitting skip=1 is
+ // very expensive.
+ *(args->skip) = 0;
+
+ if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+ cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+ }
+}
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane,
+ int enable_optimize_b, int mi_row,
+ int mi_col) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 };
+ ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 };
+
+ struct encode_b_args arg = {
+ cpi, x, NULL, &(xd->mi[0]->skip), ta, tl, enable_optimize_b
+ };
+
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y))
+ return;
+
+ if (enable_optimize_b) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ av1_get_entropy_contexts(bsize, pd, ta, tl);
+ }
+ av1_foreach_transformed_block_in_plane(
+ xd, bsize, plane, encode_block_intra_and_set_context, &arg);
+}
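For orientation, the loops in av1_foreach_transformed_block_in_plane() and av1_encode_sb() above both walk transform blocks in raster order inside 64x64 processing units, and walk the units themselves in raster order across the plane block. The following standalone sketch (not part of the patch) reproduces only that traversal order, using assumed sizes (a 128x128 plane block tiled by 16x16 transform blocks) rather than values taken from the encoder:

    /* Standalone sketch of the transform-block traversal order used above. */
    #include <stdio.h>

    int main(void) {
      const int plane_w = 128, plane_h = 128; /* plane block size, pixels (assumed) */
      const int tx_w = 16, tx_h = 16;         /* transform block size (assumed) */
      const int unit = 64;                    /* max processing unit, as in the code */
      int i = 0;

      for (int r = 0; r < plane_h; r += unit) {
        for (int c = 0; c < plane_w; c += unit) {
          /* Visit all transform blocks inside this 64x64 unit in raster order. */
          for (int y = r; y < r + unit && y < plane_h; y += tx_h) {
            for (int x = c; x < c + unit && x < plane_w; x += tx_w) {
              printf("visit #%d at (row=%d, col=%d)\n", i++, y, x);
            }
          }
        }
      }
      return 0;
    }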
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 000000000..39080de59
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct optimize_ctx {
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
+};
+
+struct encode_b_args {
+ const struct AV1_COMP *cpi;
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;
+ int8_t *skip;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ int8_t enable_optimize_b;
+};
+
+typedef enum AV1_XFORM_QUANT {
+ AV1_XFORM_QUANT_FP = 0,
+ AV1_XFORM_QUANT_B = 1,
+ AV1_XFORM_QUANT_DC = 2,
+ AV1_XFORM_QUANT_SKIP_QUANT,
+ AV1_XFORM_QUANT_TYPES,
+} AV1_XFORM_QUANT;
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg);
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg, const int num_planes);
+
+void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ AV1_XFORM_QUANT xform_quant_idx);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size);
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size, ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l) {
+ const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block];
+ memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a));
+ memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l));
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane,
+ int enable_optimize_b, int mi_row,
+ int mi_col);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMB_H_
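The inline av1_set_txb_context() declared in the header above propagates a single per-block entropy-context value into the above-row and left-column context arrays, one entry per 4x4 unit covered by the transform block. A minimal self-contained sketch of the same idea, with a locally defined stand-in type (not the real ENTROPY_CONTEXT) and an assumed 16x8 transform block:

    #include <stdint.h>
    #include <string.h>

    typedef uint8_t ex_entropy_ctx; /* stand-in for ENTROPY_CONTEXT */

    /* Replicate one context value across the above-row entries (one per 4x4
     * column) and left-column entries (one per 4x4 row) covered by a block. */
    static void ex_set_txb_context(ex_entropy_ctx ctx, int tx_w_units,
                                   int tx_h_units, ex_entropy_ctx *a,
                                   ex_entropy_ctx *l) {
      memset(a, ctx, tx_w_units * sizeof(*a));
      memset(l, ctx, tx_h_units * sizeof(*l));
    }

    int main(void) {
      ex_entropy_ctx above[4] = { 0 }, left[2] = { 0 };
      ex_set_txb_context(3, 4, 2, above, left); /* 16x8 block: 4x2 units of 4x4 */
      return (above[3] == 3 && left[1] == 3) ? 0 : 1;
    }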
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
new file mode 100644
index 000000000..42eb5abf6
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+static INLINE int mv_class_base(MV_CLASS_TYPE c) {
+ return c ? CLASS0_SIZE << (c + 2) : 0;
+}
+
+// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0.
+static INLINE uint8_t log_in_base_2(unsigned int n) {
+ // get_msb() is only valid when n != 0.
+ return n == 0 ? 0 : get_msb(n);
+}
+
+static INLINE MV_CLASS_TYPE get_mv_class(int z, int *offset) {
+ const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096)
+ ? MV_CLASS_10
+ : (MV_CLASS_TYPE)log_in_base_2(z >> 3);
+ if (offset) *offset = z - mv_class_base(c);
+ return c;
+}
+
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+ MvSubpelPrecision precision) {
+ assert(comp != 0);
+ int offset;
+ const int sign = comp < 0;
+ const int mag = sign ? -comp : comp;
+ const int mv_class = get_mv_class(mag - 1, &offset);
+ const int d = offset >> 3; // int mv data
+ const int fr = (offset >> 1) & 3; // fractional mv data
+ const int hp = offset & 1; // high precision mv data
+
+ // Sign
+ aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
+
+ // Class
+ aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES);
+
+ // Integer bits
+ if (mv_class == MV_CLASS_0) {
+ aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE);
+ } else {
+ int i;
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (i = 0; i < n; ++i)
+ aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2);
+ }
+ // Fractional bits
+ if (precision > MV_SUBPEL_NONE) {
+ aom_write_symbol(
+ w, fr,
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+ MV_FP_SIZE);
+ }
+
+ // High precision bit
+ if (precision > MV_SUBPEL_LOW_PRECISION)
+ aom_write_symbol(
+ w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf,
+ 2);
+}
+
+static void build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ MvSubpelPrecision precision) {
+ int i, v;
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+ int class0_hp_cost[2], hp_cost[2];
+
+ av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
+ av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
+ av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL);
+ }
+
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
+ av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+ }
+ mvcost[0] = 0;
+ for (v = 1; v <= MV_MAX; ++v) {
+ int z, c, o, d, e, f, cost = 0;
+ z = v - 1;
+ c = get_mv_class(z, &o);
+ cost += class_cost[c];
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ e = (o & 1); /* high precision mv data */
+ if (c == MV_CLASS_0) {
+ cost += class0_cost[d];
+ } else {
+ const int b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)];
+ }
+ if (precision > MV_SUBPEL_NONE) {
+ if (c == MV_CLASS_0) {
+ cost += class0_fp_cost[d][f];
+ } else {
+ cost += fp_cost[f];
+ }
+ if (precision > MV_SUBPEL_LOW_PRECISION) {
+ if (c == MV_CLASS_0) {
+ cost += class0_hp_cost[e];
+ } else {
+ cost += hp_cost[e];
+ }
+ }
+ }
+ mvcost[v] = cost + sign_cost[0];
+ mvcost[-v] = cost + sign_cost[1];
+ }
+}
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx, int usehp) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+ if (cpi->common.cur_frame_force_integer_mv) {
+ usehp = MV_SUBPEL_NONE;
+ }
+ aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+ // If auto_mv_step_size is enabled then keep track of the largest
+ // motion vector component used.
+ if (cpi->sf.mv.auto_mv_step_size) {
+ unsigned int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;
+ cpi->max_mv_magnitude = AOMMAX(maxv, cpi->max_mv_magnitude);
+ }
+}
+
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx) {
+ // DV and ref DV should not have sub-pel.
+ assert((mv->col & 7) == 0);
+ assert((mv->row & 7) == 0);
+ assert((ref->col & 7) == 0);
+ assert((ref->row & 7) == 0);
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
+
+ aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS);
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE);
+}
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *ctx,
+ MvSubpelPrecision precision) {
+ av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL);
+ build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision);
+ build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision);
+}
+
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext) {
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const CANDIDATE_MV *curr_ref_mv_stack =
+ mbmi_ext->ref_mv_stack[ref_frame_type];
+ int_mv ref_mv;
+ ref_mv.as_int = INVALID_MV;
+
+ if (ref_frame[1] > INTRA_FRAME) {
+ if (ref_idx == 0) {
+ ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
+ } else {
+ assert(ref_idx == 1);
+ ref_mv = curr_ref_mv_stack[ref_mv_idx].comp_mv;
+ }
+ } else {
+ assert(ref_idx == 0);
+ if (ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]) {
+ ref_mv = curr_ref_mv_stack[ref_mv_idx].this_mv;
+ } else {
+ ref_mv = mbmi_ext->global_mvs[ref_frame_type];
+ }
+ }
+ return ref_mv;
+}
+
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int ref_mv_idx = mbmi->ref_mv_idx;
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) {
+ assert(has_second_ref(mbmi));
+ ref_mv_idx += 1;
+ }
+ return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx,
+ x->mbmi_ext);
+}
+
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *nearest_mv, int_mv *near_mv,
+ int is_integer) {
+ const int ref_idx = 0;
+ MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+ *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext);
+ lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer);
+ *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext);
+ lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer);
+}
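To make the magnitude split in encode_mv_component() above concrete: a non-zero component is coded as a sign, a class, the class-local integer offset bits, two fractional (1/2 and 1/4 pel) bits and one high-precision (1/8 pel) bit. The sketch below redoes that split for one assumed value; CLASS0_SIZE is restated locally as 2, which mirrors libaom's definition but is an assumption here so the example stands alone:

    #include <stdio.h>

    #define EX_CLASS0_SIZE 2 /* assumed, mirrors libaom's CLASS0_SIZE */

    static int ex_class_base(int c) { return c ? EX_CLASS0_SIZE << (c + 2) : 0; }

    /* floor(log2(n)); returns 0 for n == 0, like log_in_base_2() above. */
    static int ex_log2(unsigned int n) {
      int msb = 0;
      while (n >>= 1) ++msb;
      return msb;
    }

    int main(void) {
      const int comp = 37; /* example component, in 1/8-pel units */
      const int mag = comp < 0 ? -comp : comp;
      const int z = mag - 1;
      /* The MV_CLASS_10 clamp for very large values is omitted in this sketch. */
      const int mv_class = ex_log2(z >> 3);
      const int offset = z - ex_class_base(mv_class);
      /* Prints "class=2 int=0 frac=2 hp=0" for comp == 37. */
      printf("class=%d int=%d frac=%d hp=%d\n", mv_class, offset >> 3,
             (offset >> 1) & 3, offset & 1);
      return 0;
    }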
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
new file mode 100644
index 000000000..37ff547c8
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMV_H_
+#define AOM_AV1_ENCODER_ENCODEMV_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx, int usehp);
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *mvctx,
+ MvSubpelPrecision precision);
+
+void av1_update_mv_count(ThreadData *td);
+
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx);
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx);
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext);
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *nearest_mv, int_mv *near_mv,
+ int is_integer);
+
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+ if (mv->row == 0) {
+ return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
+ } else {
+ return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HNZVNZ;
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMV_H_
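The av1_get_mv_joint() helper in the header above classifies a motion vector difference into one of four joint types depending on which of its components are non-zero; the joint type is signalled first so the decoder knows which components follow. A small illustrative restatement (enum names mirror the AV1 ones but are local to the example):

    #include <stdio.h>

    typedef enum {
      EX_MV_JOINT_ZERO = 0,  /* row == 0, col == 0 */
      EX_MV_JOINT_HNZVZ = 1, /* row == 0, col != 0 */
      EX_MV_JOINT_HZVNZ = 2, /* row != 0, col == 0 */
      EX_MV_JOINT_HNZVNZ = 3 /* row != 0, col != 0 */
    } ex_mv_joint_type;

    static ex_mv_joint_type ex_get_mv_joint(int row, int col) {
      if (row == 0) return col == 0 ? EX_MV_JOINT_ZERO : EX_MV_JOINT_HNZVZ;
      return col == 0 ? EX_MV_JOINT_HZVNZ : EX_MV_JOINT_HNZVNZ;
    }

    int main(void) {
      /* Prints "0 1 2 3". */
      printf("%d %d %d %d\n", ex_get_mv_joint(0, 0), ex_get_mv_joint(0, 5),
             ex_get_mv_joint(-3, 0), ex_get_mv_joint(-3, 5));
      return 0;
    }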
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
new file mode 100644
index 000000000..a2da2df89
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -0,0 +1,6437 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/cdef.h"
+#include "av1/common/filter.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "av1/encoder/hash_motion.h"
+#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
+
+// AV1 uses a time stamp resolution of 10,000,000 ticks per second.
+#define TICKS_PER_SEC 10000000LL
+
+#if CONFIG_ENTROPY_STATS
+FRAME_COUNTS aggregate_fc;
+#endif // CONFIG_ENTROPY_STATS
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+
+// Whether to use high precision mv for altref computation.
+#define ALTREF_HIGH_PRECISION_MV 1
+
+// Q threshold for high precision mv. Choose a very high value for now so that
+// HIGH_PRECISION is always chosen.
+#define HIGH_PRECISION_MV_QTHRESH 200
+
+// #define OUTPUT_YUV_REC
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#define FILE_NAME_LEN 100
+#endif
+
+static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
+ switch (mode) {
+ case NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ case ONETWO:
+ *hr = 1;
+ *hs = 2;
+ break;
+ default:
+ *hr = 1;
+ *hs = 1;
+ assert(0);
+ break;
+ }
+}
+
+// Mark all inactive blocks as active. Other segmentation features may be set
+// so memset cannot be used; instead, only inactive blocks should be reset.
+static void suppress_active_map(AV1_COMP *cpi) {
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i;
+ if (cpi->active_map.enabled || cpi->active_map.update)
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+ seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+static void apply_active_map(AV1_COMP *cpi) {
+ struct segmentation *const seg = &cpi->common.seg;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ const unsigned char *const active_map = cpi->active_map.map;
+ int i;
+
+ assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+ if (frame_is_intra_only(&cpi->common)) {
+ cpi->active_map.enabled = 0;
+ cpi->active_map.update = 1;
+ }
+
+ if (cpi->active_map.update) {
+ if (cpi->active_map.enabled) {
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+ av1_enable_segmentation(seg);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U,
+ -MAX_LOOP_FILTER);
+ av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V,
+ -MAX_LOOP_FILTER);
+ } else {
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U);
+ av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V);
+ if (seg->enabled) {
+ seg->update_data = 1;
+ seg->update_map = 1;
+ }
+ }
+ cpi->active_map.update = 0;
+ }
+}
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+ unsigned char *const active_map_8x8 = cpi->active_map.map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+ const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+ cpi->active_map.update = 1;
+ if (new_map_16x16) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ active_map_8x8[r * mi_cols + c] =
+ new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ cpi->active_map.enabled = 1;
+ } else {
+ cpi->active_map.enabled = 0;
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+ new_map_16x16) {
+ unsigned char *const seg_map_8x8 = cpi->segmentation_map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+ const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+
+ memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+ if (cpi->active_map.enabled) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ // Cyclic refresh segments are considered active despite not having
+ // AM_SEGMENT_ID_ACTIVE
+ new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)] |=
+ seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv,
+ int cur_frame_force_integer_mv) {
+ MACROBLOCK *const mb = &cpi->td.mb;
+ cpi->common.allow_high_precision_mv =
+ allow_high_precision_mv && cur_frame_force_integer_mv == 0;
+ const int copy_hp =
+ cpi->common.allow_high_precision_mv && cur_frame_force_integer_mv == 0;
+ int *(*src)[2] = copy_hp ? &mb->nmvcost_hp : &mb->nmvcost;
+ mb->mv_cost_stack = *src;
+}
+
+static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
+ return BLOCK_64X64;
+#if CONFIG_FILEOPTIONS
+ if (cm->options && cm->options->ext_partition)
+#endif
+ if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
+ return BLOCK_128X128;
+
+ assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+// TODO(any): Possibly could improve this with a heuristic.
+#if CONFIG_FILEOPTIONS
+ if (cm->options && !cm->options->ext_partition) return BLOCK_64X64;
+#endif
+
+ // When superres / resize is on, 'cm->width / height' can change between
+ // calls, so we don't apply this heuristic there. Also, this heuristic gives
+ // compression gain for speed >= 2 only.
+ if (cpi->oxcf.superres_mode == SUPERRES_NONE &&
+ cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) {
+ return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128
+ : BLOCK_64X64;
+ }
+
+ return BLOCK_128X128;
+}
+
+static void setup_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
+
+ cm->primary_ref_frame = PRIMARY_REF_NONE;
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cm->force_primary_ref_none) {
+ av1_setup_past_independence(cm);
+ for (int i = 0; i < REF_FRAMES; i++) {
+ cm->fb_of_context_type[i] = -1;
+ }
+ cm->fb_of_context_type[REGULAR_FRAME] =
+ cm->show_frame ? get_ref_frame_map_idx(cpi, GOLDEN_FRAME)
+ : get_ref_frame_map_idx(cpi, ALTREF_FRAME);
+ cm->frame_context_idx = REGULAR_FRAME;
+ } else {
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
+ cm->frame_context_idx = EXT_ARF_FRAME;
+ else if (cpi->refresh_alt_ref_frame)
+ cm->frame_context_idx = ARF_FRAME;
+ else if (cpi->rc.is_src_frame_alt_ref)
+ cm->frame_context_idx = OVERLAY_FRAME;
+ else if (cpi->refresh_golden_frame)
+ cm->frame_context_idx = GLD_FRAME;
+ else if (cpi->refresh_bwd_ref_frame)
+ cm->frame_context_idx = BRF_FRAME;
+ else
+ cm->frame_context_idx = REGULAR_FRAME;
+ int wanted_fb = cm->fb_of_context_type[cm->frame_context_idx];
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ int fb = get_ref_frame_map_idx(cpi, ref_frame);
+ if (fb == wanted_fb) {
+ cm->primary_ref_frame = ref_frame - LAST_FRAME;
+ }
+ }
+ }
+
+ if (cm->frame_type == KEY_FRAME && cm->show_frame) {
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ av1_zero(cpi->interp_filter_selected);
+ set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ set_use_reference_buffer(cm, 0);
+ } else if (frame_is_sframe(cm)) {
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ av1_zero(cpi->interp_filter_selected);
+ set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ } else {
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
+ cm->frame_refs[cm->primary_ref_frame].idx < 0) {
+ av1_setup_past_independence(cm);
+ cm->seg.update_map = 1;
+ cm->seg.update_data = 1;
+ } else {
+ *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx];
+ }
+ av1_zero(cpi->interp_filter_selected[0]);
+ }
+
+ cm->prev_frame = get_prev_frame(cm);
+ cpi->vaq_refresh = 0;
+}
+
+static void enc_setup_mi(AV1_COMMON *cm) {
+ int i;
+ int mi_rows_sb_aligned = calc_mi_size(cm->mi_rows);
+ cm->mi = cm->mip;
+ memset(cm->mip, 0, cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mip));
+ cm->prev_mi = cm->prev_mip;
+ // Clear top border row
+ memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
+ // Clear left border column
+ for (i = 0; i < mi_rows_sb_aligned; ++i)
+ memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
+ cm->mi_grid_visible = cm->mi_grid_base;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
+
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * mi_rows_sb_aligned * sizeof(*cm->mi_grid_base));
+}
+
+static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
+ cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip) return 1;
+ cm->prev_mip = aom_calloc(mi_size, sizeof(*cm->prev_mip));
+ if (!cm->prev_mip) return 1;
+ cm->mi_alloc_size = mi_size;
+
+ cm->mi_grid_base =
+ (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
+ if (!cm->mi_grid_base) return 1;
+ cm->prev_mi_grid_base =
+ (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(MB_MODE_INFO *));
+ if (!cm->prev_mi_grid_base) return 1;
+
+ return 0;
+}
+
+static void enc_free_mi(AV1_COMMON *cm) {
+ aom_free(cm->mip);
+ cm->mip = NULL;
+ aom_free(cm->prev_mip);
+ cm->prev_mip = NULL;
+ aom_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+ aom_free(cm->prev_mi_grid_base);
+ cm->prev_mi_grid_base = NULL;
+ cm->mi_alloc_size = 0;
+}
+
+static void swap_mi_and_prev_mi(AV1_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MB_MODE_INFO **temp_base = cm->prev_mi_grid_base;
+ MB_MODE_INFO *temp = cm->prev_mip;
+ cm->prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip;
+ cm->prev_mi = cm->prev_mip;
+
+ cm->prev_mi_grid_base = cm->mi_grid_base;
+ cm->mi_grid_base = temp_base;
+ cm->mi_grid_visible = cm->mi_grid_base;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
+}
+
+void av1_initialize_enc(void) {
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_me_luts();
+ av1_rc_init_minq_luts();
+ av1_init_wedge_masks();
+}
+
+static void dealloc_context_buffers_ext(AV1_COMP *cpi) {
+ if (cpi->mbmi_ext_base) {
+ aom_free(cpi->mbmi_ext_base);
+ cpi->mbmi_ext_base = NULL;
+ }
+}
+
+static void alloc_context_buffers_ext(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ int mi_size = cm->mi_cols * cm->mi_rows;
+
+ dealloc_context_buffers_ext(cpi);
+ CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
+ aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
+}
+
+static void update_film_grain_parameters(struct AV1_COMP *cpi,
+ const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ cpi->oxcf = *oxcf;
+
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ aom_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
+ }
+
+ if (oxcf->film_grain_test_vector) {
+ cm->seq_params.film_grain_params_present = 1;
+ if (cm->frame_type == KEY_FRAME) {
+ memcpy(&cm->film_grain_params,
+ film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
+ sizeof(cm->film_grain_params));
+
+ cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+ if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
+ cm->film_grain_params.clip_to_restricted_range = 0;
+ }
+ }
+ } else if (oxcf->film_grain_table_filename) {
+ cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+ memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
+
+ aom_film_grain_table_read(cpi->film_grain_table,
+ oxcf->film_grain_table_filename, &cm->error);
+ } else {
+ cm->seq_params.film_grain_params_present = 0;
+ memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+ }
+}
+
+static void dealloc_compressor_data(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ dealloc_context_buffers_ext(cpi);
+
+ aom_free(cpi->tile_data);
+ cpi->tile_data = NULL;
+
+  // Delete the segmentation map
+ aom_free(cpi->segmentation_map);
+ cpi->segmentation_map = NULL;
+
+ av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ cpi->cyclic_refresh = NULL;
+
+ aom_free(cpi->active_map.map);
+ cpi->active_map.map = NULL;
+
+ aom_free(cpi->td.mb.above_pred_buf);
+ cpi->td.mb.above_pred_buf = NULL;
+
+ aom_free(cpi->td.mb.left_pred_buf);
+ cpi->td.mb.left_pred_buf = NULL;
+
+ aom_free(cpi->td.mb.wsrc_buf);
+ cpi->td.mb.wsrc_buf = NULL;
+
+ for (int i = 0; i < 2; i++)
+ for (int j = 0; j < 2; j++) {
+ aom_free(cpi->td.mb.hash_value_buffer[i][j]);
+ cpi->td.mb.hash_value_buffer[i][j] = NULL;
+ }
+ aom_free(cpi->td.mb.mask_buf);
+ cpi->td.mb.mask_buf = NULL;
+
+ aom_free(cm->tpl_mvs);
+ cm->tpl_mvs = NULL;
+
+ av1_free_ref_frame_buffers(cm->buffer_pool);
+ av1_free_txb_buf(cpi);
+ av1_free_context_buffers(cm);
+
+ aom_free_frame_buffer(&cpi->last_frame_uf);
+ av1_free_restoration_buffers(cm);
+ aom_free_frame_buffer(&cpi->trial_frame_rst);
+ aom_free_frame_buffer(&cpi->scaled_source);
+ aom_free_frame_buffer(&cpi->scaled_last_source);
+ aom_free_frame_buffer(&cpi->alt_ref_buffer);
+ av1_lookahead_destroy(cpi->lookahead);
+
+ aom_free(cpi->tile_tok[0][0]);
+ cpi->tile_tok[0][0] = 0;
+
+ aom_free(cpi->tplist[0][0]);
+ cpi->tplist[0][0] = NULL;
+
+ av1_free_pc_tree(&cpi->td, num_planes);
+
+ aom_free(cpi->td.mb.palette_buffer);
+
+ aom_free(cpi->td.mb.tmp_conv_dst);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(cpi->td.mb.tmp_obmc_bufs[j]);
+ }
+
+#if CONFIG_DENOISE
+ if (cpi->denoise_and_model) {
+ aom_denoise_and_model_free(cpi->denoise_and_model);
+ cpi->denoise_and_model = NULL;
+ }
+#endif
+ if (cpi->film_grain_table) {
+ aom_film_grain_table_free(cpi->film_grain_table);
+ cpi->film_grain_table = NULL;
+ }
+}
+
+static void save_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+
+ // Stores a snapshot of key state variables which can subsequently be
+ // restored with a call to av1_restore_coding_context. These functions are
+ // intended for use in a re-code loop in av1_compress_frame where the
+ // quantizer value is adjusted between loop iterations.
+ av1_copy(cc->nmv_vec_cost, cpi->td.mb.nmv_vec_cost);
+ av1_copy(cc->nmv_costs, cpi->nmv_costs);
+ av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp);
+
+ cc->fc = *cm->fc;
+}
+
+static void restore_coding_context(AV1_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ AV1_COMMON *cm = &cpi->common;
+
+ // Restore key state variables to the snapshot state stored in the
+ // previous call to av1_save_coding_context.
+ av1_copy(cpi->td.mb.nmv_vec_cost, cc->nmv_vec_cost);
+ av1_copy(cpi->nmv_costs, cc->nmv_costs);
+ av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
+
+ *cm->fc = cc->fc;
+}
+
+static void configure_static_seg_features(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ struct segmentation *const seg = &cm->seg;
+
+ int high_q = (int)(rc->avg_q > 48.0);
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation
+ av1_disable_segmentation(seg);
+
+ // Clear down the segment features.
+ av1_clearall_segfeatures(seg);
+ } else if (cpi->refresh_alt_ref_frame) {
+ // If this is an alt ref frame
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation and individual segment features by default
+ av1_disable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ // Scan frames from current to arf frame.
+ // This function re-enables segmentation if appropriate.
+ av1_update_mbgraph_stats(cpi);
+
+ // If segmentation was enabled set those features needed for the
+ // arf itself.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+
+ qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+ cm->seq_params.bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+ }
+ } else if (seg->enabled) {
+ // All other frames if segmentation has been enabled
+
+ // First normal frame in a valid gf or alt ref group
+ if (rc->frames_since_golden == 0) {
+ // Set up segment features for normal frames in an arf group
+ if (rc->source_alt_ref_active) {
+ seg->update_map = 0;
+ seg->update_data = 1;
+
+ qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
+ cm->seq_params.bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+ // Segment coding disabled for compred testing
+ if (high_q || (cpi->static_mb_pct == 100)) {
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ } else {
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+
+ av1_disable_segmentation(seg);
+
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ av1_clearall_segfeatures(seg);
+ }
+ } else if (rc->is_src_frame_alt_ref) {
+ // Special case where we are coding over the top of a previous
+ // alt ref frame.
+ // Segment coding disabled for compred testing
+
+ // Enable ref frame features for segment 0 as well
+ av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+ // All mbs should use ALTREF_FRAME
+ av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+ // Skip all MBs if high Q (0,0 mv and skip coeffs)
+ if (high_q) {
+ av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ // Enable data update
+ seg->update_data = 1;
+ } else {
+ // All other frames.
+
+      // No updates; leave things as they are.
+ seg->update_map = 0;
+ seg->update_data = 0;
+ }
+ }
+}
+
+static void update_reference_segmentation_map(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MB_MODE_INFO **mi_4x4_ptr = cm->mi_grid_visible;
+ uint8_t *cache_ptr = cm->current_frame_seg_map;
+ int row, col;
+
+ for (row = 0; row < cm->mi_rows; row++) {
+ MB_MODE_INFO **mi_4x4 = mi_4x4_ptr;
+ uint8_t *cache = cache_ptr;
+ for (col = 0; col < cm->mi_cols; col++, mi_4x4++, cache++)
+ cache[0] = mi_4x4[0]->segment_id;
+ mi_4x4_ptr += cm->mi_stride;
+ cache_ptr += cm->mi_cols;
+ }
+}
+
+static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+
+ if (!cpi->lookahead)
+ cpi->lookahead =
+ av1_lookahead_init(oxcf->width, oxcf->height, seq_params->subsampling_x,
+ seq_params->subsampling_y,
+ seq_params->use_highbitdepth, oxcf->lag_in_frames);
+ if (!cpi->lookahead)
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate lag buffers");
+
+ // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+ if (aom_realloc_frame_buffer(
+ &cpi->alt_ref_buffer, oxcf->width, oxcf->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate altref buffer");
+}
+
+static void alloc_util_frame_buffers(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ if (aom_realloc_frame_buffer(
+ &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+ if (aom_realloc_frame_buffer(
+ &cpi->trial_frame_rst, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate trial restored frame buffer");
+
+ if (aom_realloc_frame_buffer(
+ &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled source buffer");
+
+ if (aom_realloc_frame_buffer(
+ &cpi->scaled_last_source, cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate scaled last source buffer");
+}
+
+static void alloc_compressor_data(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ av1_alloc_context_buffers(cm, cm->width, cm->height);
+
+ int mi_rows_aligned_to_sb =
+ ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+
+ av1_alloc_txb_buf(cpi);
+
+ alloc_context_buffers_ext(cpi);
+
+ aom_free(cpi->tile_tok[0][0]);
+
+ {
+ unsigned int tokens =
+ get_token_alloc(cm->mb_rows, cm->mb_cols, MAX_SB_SIZE_LOG2, num_planes);
+ CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+ aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
+ }
+ aom_free(cpi->tplist[0][0]);
+
+ CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
+ aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+ sizeof(*cpi->tplist[0][0])));
+
+ av1_setup_pc_tree(&cpi->common, &cpi->td);
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate) {
+ cpi->framerate = framerate < 0.1 ? 30 : framerate;
+ av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
+}
+
+static void set_tile_info(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i, start_sb;
+
+ av1_get_tile_limits(cm);
+
+ // configure tile columns
+ if (cpi->oxcf.tile_width_count == 0 || cpi->oxcf.tile_height_count == 0) {
+ cm->uniform_tile_spacing_flag = 1;
+ cm->log2_tile_cols = AOMMAX(cpi->oxcf.tile_columns, cm->min_log2_tile_cols);
+ cm->log2_tile_cols = AOMMIN(cm->log2_tile_cols, cm->max_log2_tile_cols);
+ } else {
+ int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+ int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
+ int size_sb, j = 0;
+ cm->uniform_tile_spacing_flag = 0;
+ for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
+ cm->tile_col_start_sb[i] = start_sb;
+ size_sb = cpi->oxcf.tile_widths[j++];
+ if (j >= cpi->oxcf.tile_width_count) j = 0;
+ start_sb += AOMMIN(size_sb, cm->max_tile_width_sb);
+ }
+ cm->tile_cols = i;
+ cm->tile_col_start_sb[i] = sb_cols;
+ }
+ av1_calculate_tile_cols(cm);
+
+ // configure tile rows
+ if (cm->uniform_tile_spacing_flag) {
+ cm->log2_tile_rows = AOMMAX(cpi->oxcf.tile_rows, cm->min_log2_tile_rows);
+ cm->log2_tile_rows = AOMMIN(cm->log2_tile_rows, cm->max_log2_tile_rows);
+ } else {
+ int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+ int size_sb, j = 0;
+ for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
+ cm->tile_row_start_sb[i] = start_sb;
+ size_sb = cpi->oxcf.tile_heights[j++];
+ if (j >= cpi->oxcf.tile_height_count) j = 0;
+ start_sb += AOMMIN(size_sb, cm->max_tile_height_sb);
+ }
+ cm->tile_rows = i;
+ cm->tile_row_start_sb[i] = sb_rows;
+ }
+ av1_calculate_tile_rows(cm);
+}
+
+static void update_frame_size(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ av1_set_mb_mi(cm, cm->width, cm->height);
+ av1_init_context_buffers(cm);
+ av1_init_macroblockd(cm, xd, NULL);
+ memset(cpi->mbmi_ext_base, 0,
+ cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+ set_tile_info(cpi);
+}
+
+static void init_buffer_indices(AV1_COMP *cpi) {
+ int fb_idx;
+ for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
+ cpi->ref_fb_idx[fb_idx] = fb_idx;
+ cpi->rate_index = 0;
+ cpi->rate_size = 0;
+ cpi->cur_poc = -1;
+}
+
+static INLINE int does_level_match(int width, int height, double fps,
+ int lvl_width, int lvl_height,
+ double lvl_fps, int lvl_dim_mult) {
+ const int64_t lvl_luma_pels = lvl_width * lvl_height;
+ const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps;
+ const int64_t luma_pels = width * height;
+ const double display_sample_rate = luma_pels * fps;
+ return luma_pels <= lvl_luma_pels &&
+ display_sample_rate <= lvl_display_sample_rate &&
+ width <= lvl_width * lvl_dim_mult &&
+ height <= lvl_height * lvl_dim_mult;
+}
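As a worked example of the does_level_match() check above: an assumed 1920x1080 at 30 fps source fits the 2048x1152 at 30 fps row used for level 4.0 in the chain below (dimension multiplier 3), but exceeds the luma sample count of the 1376x774 row for level 3.1, so the chain selects 4.0. The check is restated locally so the snippet compiles on its own:

    #include <stdint.h>
    #include <stdio.h>

    /* Local restatement of the does_level_match() predicate above. */
    static int ex_level_match(int w, int h, double fps, int lw, int lh,
                              double lfps, int mult) {
      const int64_t lvl_pels = (int64_t)lw * lh;
      const int64_t pels = (int64_t)w * h;
      return pels <= lvl_pels && pels * fps <= lvl_pels * lfps &&
             w <= lw * mult && h <= lh * mult;
    }

    int main(void) {
      /* Prints "level 4.0 row: 1, level 3.1 row: 0". */
      printf("level 4.0 row: %d, level 3.1 row: %d\n",
             ex_level_match(1920, 1080, 30.0, 2048, 1152, 30.0, 3),
             ex_level_match(1920, 1080, 30.0, 1376, 774, 30.0, 4));
      return 0;
    }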
+
+static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
+ const AV1EncoderConfig *oxcf) {
+ // TODO(any): This is a placeholder function that only addresses dimensions
+ // and max display sample rates.
+ // Need to add checks for max bit rate, max decoded luma sample rate, header
+ // rate, etc. that are not covered by this function.
+ (void)oxcf;
+ BitstreamLevel bl = { 9, 3 };
+ if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512,
+ 288, 30.0, 4)) {
+ bl.major = 2;
+ bl.minor = 0;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 704, 396, 30.0, 4)) {
+ bl.major = 2;
+ bl.minor = 1;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 1088, 612, 30.0, 4)) {
+ bl.major = 3;
+ bl.minor = 0;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 1376, 774, 30.0, 4)) {
+ bl.major = 3;
+ bl.minor = 1;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 2048, 1152, 30.0, 3)) {
+ bl.major = 4;
+ bl.minor = 0;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 2048, 1152, 60.0, 3)) {
+ bl.major = 4;
+ bl.minor = 1;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 4096, 2176, 30.0, 2)) {
+ bl.major = 5;
+ bl.minor = 0;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 4096, 2176, 60.0, 2)) {
+ bl.major = 5;
+ bl.minor = 1;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 4096, 2176, 120.0, 2)) {
+ bl.major = 5;
+ bl.minor = 2;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 8192, 4352, 30.0, 2)) {
+ bl.major = 6;
+ bl.minor = 0;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 8192, 4352, 60.0, 2)) {
+ bl.major = 6;
+ bl.minor = 1;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 8192, 4352, 120.0, 2)) {
+ bl.major = 6;
+ bl.minor = 2;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 16384, 8704, 30.0, 2)) {
+ bl.major = 7;
+ bl.minor = 0;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 16384, 8704, 60.0, 2)) {
+ bl.major = 7;
+ bl.minor = 1;
+ } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+ 16384, 8704, 120.0, 2)) {
+ bl.major = 7;
+ bl.minor = 2;
+ }
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ seq->level[i] = bl;
+ seq->tier[i] = 0; // setting main tier by default
+ // Set the maximum parameters for bitrate and buffer size for this profile,
+ // level, and tier
+ cm->op_params[i].bitrate = max_level_bitrate(
+ cm->seq_params.profile, major_minor_to_seq_level_idx(seq->level[i]),
+ seq->tier[i]);
+ // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
+ // check
+ if (cm->op_params[i].bitrate == 0)
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support this combination of profile, level, and tier.");
+    // Buffer size in bits is bitrate in bits/s * 1 s
+ cm->op_params[i].buffer_size = cm->op_params[i].bitrate;
+ }
+}
+
+static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
+ const AV1EncoderConfig *oxcf) {
+ seq->still_picture = (oxcf->limit == 1);
+ seq->reduced_still_picture_hdr = seq->still_picture;
+ seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr;
+ seq->force_screen_content_tools = 2;
+ seq->force_integer_mv = 2;
+ seq->enable_order_hint = oxcf->enable_order_hint;
+ seq->frame_id_numbers_present_flag = oxcf->large_scale_tile;
+ if (seq->still_picture && seq->reduced_still_picture_hdr) {
+ seq->enable_order_hint = 0;
+ seq->frame_id_numbers_present_flag = 0;
+ seq->force_screen_content_tools = 2;
+ seq->force_integer_mv = 2;
+ }
+ seq->order_hint_bits_minus_1 =
+ seq->enable_order_hint ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 : -1;
+
+ seq->enable_dual_filter = oxcf->enable_dual_filter;
+ seq->enable_jnt_comp = oxcf->enable_jnt_comp;
+ seq->enable_jnt_comp &= seq->enable_order_hint;
+ seq->enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs;
+ seq->enable_ref_frame_mvs &= seq->enable_order_hint;
+ seq->enable_superres = oxcf->enable_superres;
+ seq->enable_cdef = oxcf->enable_cdef;
+ seq->enable_restoration = oxcf->enable_restoration;
+ seq->enable_warped_motion = oxcf->enable_warped_motion;
+ seq->enable_interintra_compound = 1;
+ seq->enable_masked_compound = 1;
+ seq->enable_intra_edge_filter = 1;
+ seq->enable_filter_intra = 1;
+
+ set_bitstream_level_tier(seq, cm, oxcf);
+
+ if (seq->operating_points_cnt_minus_1 == 0) {
+ seq->operating_point_idc[0] = 0;
+ } else {
+ // Set operating_point_idc[] such that for the i-th operating point the
+ // first (operating_points_cnt-i) spatial layers and the first temporal
+    // layer are decoded. Note that the highest quality operating point should
+    // come first.
+ for (int i = 0; i < seq->operating_points_cnt_minus_1 + 1; i++)
+ seq->operating_point_idc[i] =
+ (~(~0u << (seq->operating_points_cnt_minus_1 + 1 - i)) << 8) | 1;
+ }
+}
+
+static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ cpi->oxcf = *oxcf;
+ cpi->framerate = oxcf->init_framerate;
+
+ cm->seq_params.profile = oxcf->profile;
+ cm->seq_params.bit_depth = oxcf->bit_depth;
+ cm->seq_params.use_highbitdepth = oxcf->use_highbitdepth;
+ cm->seq_params.color_primaries = oxcf->color_primaries;
+ cm->seq_params.transfer_characteristics = oxcf->transfer_characteristics;
+ cm->seq_params.matrix_coefficients = oxcf->matrix_coefficients;
+ cm->seq_params.monochrome = oxcf->monochrome;
+ cm->seq_params.chroma_sample_position = oxcf->chroma_sample_position;
+ cm->seq_params.color_range = oxcf->color_range;
+ cm->timing_info_present = oxcf->timing_info_present;
+ cm->timing_info.num_units_in_display_tick =
+ oxcf->timing_info.num_units_in_display_tick;
+ cm->timing_info.time_scale = oxcf->timing_info.time_scale;
+ cm->timing_info.equal_picture_interval =
+ oxcf->timing_info.equal_picture_interval;
+ cm->timing_info.num_ticks_per_picture =
+ oxcf->timing_info.num_ticks_per_picture;
+
+ cm->seq_params.display_model_info_present_flag =
+ oxcf->display_model_info_present_flag;
+ cm->seq_params.decoder_model_info_present_flag =
+ oxcf->decoder_model_info_present_flag;
+ if (oxcf->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ cm->buffer_model.num_units_in_decoding_tick =
+ oxcf->buffer_model.num_units_in_decoding_tick;
+ cm->buffer_removal_time_present = 1;
+ set_aom_dec_model_info(&cm->buffer_model);
+ set_dec_model_op_parameters(&cm->op_params[0]);
+ } else if (cm->timing_info_present &&
+ cm->timing_info.equal_picture_interval &&
+ !cm->seq_params.decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ set_resource_availability_parameters(&cm->op_params[0]);
+ } else {
+ cm->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
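+  // Derive chroma subsampling: monochrome and profile 0 imply 4:2:0, the
+  // sRGB color configuration and profile 1 imply 4:4:4, and profile 2 uses
+  // the configured subsampling at 12-bit or 4:2:2 otherwise.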
+ if (cm->seq_params.monochrome) {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 1;
+ } else if (cm->seq_params.color_primaries == AOM_CICP_CP_BT_709 &&
+ cm->seq_params.transfer_characteristics == AOM_CICP_TC_SRGB &&
+ cm->seq_params.matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ cm->seq_params.subsampling_x = 0;
+ cm->seq_params.subsampling_y = 0;
+ } else {
+ if (cm->seq_params.profile == 0) {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 1;
+ } else if (cm->seq_params.profile == 1) {
+ cm->seq_params.subsampling_x = 0;
+ cm->seq_params.subsampling_y = 0;
+ } else {
+ if (cm->seq_params.bit_depth == AOM_BITS_12) {
+ cm->seq_params.subsampling_x = oxcf->chroma_subsampling_x;
+ cm->seq_params.subsampling_y = oxcf->chroma_subsampling_y;
+ } else {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 0;
+ }
+ }
+ }
+
+ cm->width = oxcf->width;
+ cm->height = oxcf->height;
+ set_sb_size(&cm->seq_params,
+ select_sb_size(cpi)); // set sb size before allocations
+ alloc_compressor_data(cpi);
+
+ update_film_grain_parameters(cpi, oxcf);
+
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cpi->counts;
+
+  // av1_change_config() handles all configuration shared with reconfiguration.
+ av1_change_config(cpi, oxcf);
+
+ cpi->static_mb_pct = 0;
+ cpi->ref_frame_flags = 0;
+
+ // Reset resize pending flags
+ cpi->resize_pending_width = 0;
+ cpi->resize_pending_height = 0;
+
+ init_buffer_indices(cpi);
+}
+
+static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+ const AV1EncoderConfig *oxcf) {
+ const int64_t bandwidth = oxcf->target_bandwidth;
+ const int64_t starting = oxcf->starting_buffer_level_ms;
+ const int64_t optimal = oxcf->optimal_buffer_level_ms;
+ const int64_t maximum = oxcf->maximum_buffer_size_ms;
+
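+  // Buffer levels are specified in milliseconds; convert them to bits at the
+  // target bandwidth. A value of 0 for the optimal or maximum level falls
+  // back to bandwidth / 8.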
+ rc->starting_buffer_level = starting * bandwidth / 1000;
+ rc->optimal_buffer_level =
+ (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
+ rc->maximum_buffer_size =
+ (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
+}
+
+#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].jsdaf = JSDAF; \
+ cpi->fn_ptr[BT].jsvaf = JSVAF;
+
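+// The highbd SAD wrappers below normalize 10- and 12-bit results back to an
+// 8-bit equivalent range by right-shifting 2 and 4 bits respectively.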
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+ }
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 4; \
+ }
+
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const JNT_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const JNT_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred, \
+ const JNT_COMP_PARAMS *jcp_param) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
+ jcp_param) >> \
+ 4; \
+ }
+
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x16_avg)
+
+#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+ cpi->fn_ptr[BT].msdf = MCSDF; \
+ cpi->fn_ptr[BT].msvf = MCSVF;
+
+#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \
+ int m_stride, int invert_mask) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+ second_pred_ptr, m, m_stride, invert_mask) >> \
+ 4; \
+ }
+
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+
+#define MAKE_OBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk); \
+ } \
+ static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 2; \
+ } \
+ static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *msk) { \
+ return fnname(ref, ref_stride, wsrc, msk) >> 4; \
+ }
+
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
+
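+// Select the bit-depth specific SAD, variance, masked-SAD and OBMC function
+// pointers used by the encoder for each block size.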
+static void highbd_set_var_fns(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cm->seq_params.use_highbitdepth) {
+ switch (cm->seq_params.bit_depth) {
+ case AOM_BITS_8:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8,
+ aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16,
+ aom_highbd_8_sub_pixel_variance64x16,
+ aom_highbd_8_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits8,
+ aom_highbd_jnt_sad64x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance64x16)
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8,
+ aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64,
+ aom_highbd_8_sub_pixel_variance16x64,
+ aom_highbd_8_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits8,
+ aom_highbd_jnt_sad16x64_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x64)
+
+ HIGHBD_BFP(
+ BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8,
+ aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8,
+ aom_highbd_8_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits8, aom_highbd_jnt_sad32x8_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance32x8)
+
+ HIGHBD_BFP(
+ BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8,
+ aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32,
+ aom_highbd_8_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits8, aom_highbd_jnt_sad8x32_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance8x32)
+
+ HIGHBD_BFP(
+ BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8,
+ aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4,
+ aom_highbd_8_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits8, aom_highbd_jnt_sad16x4_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x4)
+
+ HIGHBD_BFP(
+ BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8,
+ aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16,
+ aom_highbd_8_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits8, aom_highbd_jnt_sad4x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance4x16)
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
+ aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
+ aom_highbd_8_sub_pixel_variance32x16,
+ aom_highbd_8_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits8,
+ aom_highbd_jnt_sad32x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance32x16)
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
+ aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
+ aom_highbd_8_sub_pixel_variance16x32,
+ aom_highbd_8_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits8,
+ aom_highbd_jnt_sad16x32_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x32)
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
+ aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
+ aom_highbd_8_sub_pixel_variance64x32,
+ aom_highbd_8_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits8,
+ aom_highbd_jnt_sad64x32_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance64x32)
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
+ aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
+ aom_highbd_8_sub_pixel_variance32x64,
+ aom_highbd_8_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits8,
+ aom_highbd_jnt_sad32x64_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance32x64)
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
+ aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
+ aom_highbd_8_sub_pixel_variance32x32,
+ aom_highbd_8_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits8,
+ aom_highbd_jnt_sad32x32_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance32x32)
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
+ aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
+ aom_highbd_8_sub_pixel_variance64x64,
+ aom_highbd_8_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits8,
+ aom_highbd_jnt_sad64x64_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance64x64)
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
+ aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
+ aom_highbd_8_sub_pixel_variance16x16,
+ aom_highbd_8_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits8,
+ aom_highbd_jnt_sad16x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x16)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
+ aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
+ aom_highbd_8_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits8, aom_highbd_jnt_sad16x8_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x8)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
+ aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
+ aom_highbd_8_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits8, aom_highbd_jnt_sad8x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance8x16)
+
+ HIGHBD_BFP(BLOCK_8X8, aom_highbd_sad8x8_bits8,
+ aom_highbd_sad8x8_avg_bits8, aom_highbd_8_variance8x8,
+ aom_highbd_8_sub_pixel_variance8x8,
+ aom_highbd_8_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits8, aom_highbd_jnt_sad8x8_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance8x8)
+
+ HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8,
+ aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4,
+ aom_highbd_8_sub_pixel_variance8x4,
+ aom_highbd_8_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits8, aom_highbd_jnt_sad8x4_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance8x4)
+
+ HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8,
+ aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8,
+ aom_highbd_8_sub_pixel_variance4x8,
+ aom_highbd_8_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits8, aom_highbd_jnt_sad4x8_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance4x8)
+
+ HIGHBD_BFP(BLOCK_4X4, aom_highbd_sad4x4_bits8,
+ aom_highbd_sad4x4_avg_bits8, aom_highbd_8_variance4x4,
+ aom_highbd_8_sub_pixel_variance4x4,
+ aom_highbd_8_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits8, aom_highbd_jnt_sad4x4_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance4x4)
+
+ HIGHBD_BFP(
+ BLOCK_128X128, aom_highbd_sad128x128_bits8,
+ aom_highbd_sad128x128_avg_bits8, aom_highbd_8_variance128x128,
+ aom_highbd_8_sub_pixel_variance128x128,
+ aom_highbd_8_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits8, aom_highbd_jnt_sad128x128_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance128x128)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
+ aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
+ aom_highbd_8_sub_pixel_variance128x64,
+ aom_highbd_8_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits8,
+ aom_highbd_jnt_sad128x64_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance128x64)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
+ aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
+ aom_highbd_8_sub_pixel_variance64x128,
+ aom_highbd_8_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits8,
+ aom_highbd_jnt_sad64x128_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance64x128)
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
+ aom_highbd_8_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
+ aom_highbd_8_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
+ aom_highbd_8_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
+ aom_highbd_8_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
+ aom_highbd_8_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
+ aom_highbd_8_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
+ aom_highbd_8_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
+ aom_highbd_8_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
+ aom_highbd_8_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
+ aom_highbd_8_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
+ aom_highbd_8_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8,
+ aom_highbd_8_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8,
+ aom_highbd_8_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
+ aom_highbd_obmc_variance128x128,
+ aom_highbd_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8,
+ aom_highbd_obmc_variance128x64,
+ aom_highbd_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8,
+ aom_highbd_obmc_variance64x128,
+ aom_highbd_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8,
+ aom_highbd_obmc_variance64x64,
+ aom_highbd_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8,
+ aom_highbd_obmc_variance64x32,
+ aom_highbd_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8,
+ aom_highbd_obmc_variance32x64,
+ aom_highbd_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8,
+ aom_highbd_obmc_variance32x32,
+ aom_highbd_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8,
+ aom_highbd_obmc_variance32x16,
+ aom_highbd_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8,
+ aom_highbd_obmc_variance16x32,
+ aom_highbd_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8,
+ aom_highbd_obmc_variance16x16,
+ aom_highbd_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8,
+ aom_highbd_obmc_variance8x16,
+ aom_highbd_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8,
+ aom_highbd_obmc_variance16x8,
+ aom_highbd_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8,
+ aom_highbd_obmc_variance8x8,
+ aom_highbd_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8,
+ aom_highbd_obmc_variance4x8,
+ aom_highbd_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8,
+ aom_highbd_obmc_variance8x4,
+ aom_highbd_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
+ aom_highbd_obmc_variance4x4,
+ aom_highbd_obmc_sub_pixel_variance4x4)
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8,
+ aom_highbd_obmc_variance64x16,
+ aom_highbd_obmc_sub_pixel_variance64x16)
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8,
+ aom_highbd_obmc_variance16x64,
+ aom_highbd_obmc_sub_pixel_variance16x64)
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8,
+ aom_highbd_obmc_variance32x8,
+ aom_highbd_obmc_sub_pixel_variance32x8)
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8,
+ aom_highbd_obmc_variance8x32,
+ aom_highbd_obmc_sub_pixel_variance8x32)
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8,
+ aom_highbd_obmc_variance16x4,
+ aom_highbd_obmc_sub_pixel_variance16x4)
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8,
+ aom_highbd_obmc_variance4x16,
+ aom_highbd_obmc_sub_pixel_variance4x16)
+ break;
+
+ case AOM_BITS_10:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10,
+ aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16,
+ aom_highbd_10_sub_pixel_variance64x16,
+ aom_highbd_10_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits10,
+ aom_highbd_jnt_sad64x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance64x16);
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
+ aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
+ aom_highbd_10_sub_pixel_variance16x64,
+ aom_highbd_10_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits10,
+ aom_highbd_jnt_sad16x64_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x64);
+
+ HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
+ aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
+ aom_highbd_10_sub_pixel_variance32x8,
+ aom_highbd_10_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits10,
+ aom_highbd_jnt_sad32x8_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance32x8);
+
+ HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
+ aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
+ aom_highbd_10_sub_pixel_variance8x32,
+ aom_highbd_10_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits10,
+ aom_highbd_jnt_sad8x32_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance8x32);
+
+ HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
+ aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
+ aom_highbd_10_sub_pixel_variance16x4,
+ aom_highbd_10_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits10,
+ aom_highbd_jnt_sad16x4_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x4);
+
+ HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
+ aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
+ aom_highbd_10_sub_pixel_variance4x16,
+ aom_highbd_10_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits10,
+ aom_highbd_jnt_sad4x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance4x16);
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
+ aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
+ aom_highbd_10_sub_pixel_variance32x16,
+ aom_highbd_10_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits10,
+ aom_highbd_jnt_sad32x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance32x16);
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
+ aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
+ aom_highbd_10_sub_pixel_variance16x32,
+ aom_highbd_10_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits10,
+ aom_highbd_jnt_sad16x32_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x32);
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
+ aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
+ aom_highbd_10_sub_pixel_variance64x32,
+ aom_highbd_10_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits10,
+ aom_highbd_jnt_sad64x32_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance64x32);
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
+ aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
+ aom_highbd_10_sub_pixel_variance32x64,
+ aom_highbd_10_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits10,
+ aom_highbd_jnt_sad32x64_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance32x64);
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
+ aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
+ aom_highbd_10_sub_pixel_variance32x32,
+ aom_highbd_10_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits10,
+ aom_highbd_jnt_sad32x32_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance32x32);
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
+ aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
+ aom_highbd_10_sub_pixel_variance64x64,
+ aom_highbd_10_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits10,
+ aom_highbd_jnt_sad64x64_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance64x64);
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
+ aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
+ aom_highbd_10_sub_pixel_variance16x16,
+ aom_highbd_10_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits10,
+ aom_highbd_jnt_sad16x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x16);
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
+ aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
+ aom_highbd_10_sub_pixel_variance16x8,
+ aom_highbd_10_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits10,
+ aom_highbd_jnt_sad16x8_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x8);
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
+ aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
+ aom_highbd_10_sub_pixel_variance8x16,
+ aom_highbd_10_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits10,
+ aom_highbd_jnt_sad8x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance8x16);
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
+ aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
+ aom_highbd_10_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits10, aom_highbd_jnt_sad8x8_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance8x8);
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
+ aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
+ aom_highbd_10_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits10, aom_highbd_jnt_sad8x4_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance8x4);
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
+ aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
+ aom_highbd_10_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits10, aom_highbd_jnt_sad4x8_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance4x8);
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
+ aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
+ aom_highbd_10_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits10, aom_highbd_jnt_sad4x4_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance4x4);
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
+ aom_highbd_sad128x128_avg_bits10,
+ aom_highbd_10_variance128x128,
+ aom_highbd_10_sub_pixel_variance128x128,
+ aom_highbd_10_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits10,
+ aom_highbd_jnt_sad128x128_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(
+ BLOCK_128X64, aom_highbd_sad128x64_bits10,
+ aom_highbd_sad128x64_avg_bits10, aom_highbd_10_variance128x64,
+ aom_highbd_10_sub_pixel_variance128x64,
+ aom_highbd_10_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits10, aom_highbd_jnt_sad128x64_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(
+ BLOCK_64X128, aom_highbd_sad64x128_bits10,
+ aom_highbd_sad64x128_avg_bits10, aom_highbd_10_variance64x128,
+ aom_highbd_10_sub_pixel_variance64x128,
+ aom_highbd_10_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits10, aom_highbd_jnt_sad64x128_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance64x128);
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
+ aom_highbd_10_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
+ aom_highbd_10_obmc_variance128x128,
+ aom_highbd_10_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
+ aom_highbd_10_obmc_variance128x64,
+ aom_highbd_10_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
+ aom_highbd_10_obmc_variance64x128,
+ aom_highbd_10_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
+ aom_highbd_10_obmc_variance64x64,
+ aom_highbd_10_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10,
+ aom_highbd_10_obmc_variance64x32,
+ aom_highbd_10_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10,
+ aom_highbd_10_obmc_variance32x64,
+ aom_highbd_10_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10,
+ aom_highbd_10_obmc_variance32x32,
+ aom_highbd_10_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10,
+ aom_highbd_10_obmc_variance32x16,
+ aom_highbd_10_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10,
+ aom_highbd_10_obmc_variance16x32,
+ aom_highbd_10_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10,
+ aom_highbd_10_obmc_variance16x16,
+ aom_highbd_10_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10,
+ aom_highbd_10_obmc_variance8x16,
+ aom_highbd_10_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10,
+ aom_highbd_10_obmc_variance16x8,
+ aom_highbd_10_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10,
+ aom_highbd_10_obmc_variance8x8,
+ aom_highbd_10_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10,
+ aom_highbd_10_obmc_variance4x8,
+ aom_highbd_10_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10,
+ aom_highbd_10_obmc_variance8x4,
+ aom_highbd_10_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
+ aom_highbd_10_obmc_variance4x4,
+ aom_highbd_10_obmc_sub_pixel_variance4x4)
+
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10,
+ aom_highbd_10_obmc_variance64x16,
+ aom_highbd_10_obmc_sub_pixel_variance64x16)
+
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits10,
+ aom_highbd_10_obmc_variance16x64,
+ aom_highbd_10_obmc_sub_pixel_variance16x64)
+
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits10,
+ aom_highbd_10_obmc_variance32x8,
+ aom_highbd_10_obmc_sub_pixel_variance32x8)
+
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits10,
+ aom_highbd_10_obmc_variance8x32,
+ aom_highbd_10_obmc_sub_pixel_variance8x32)
+
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits10,
+ aom_highbd_10_obmc_variance16x4,
+ aom_highbd_10_obmc_sub_pixel_variance16x4)
+
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10,
+ aom_highbd_10_obmc_variance4x16,
+ aom_highbd_10_obmc_sub_pixel_variance4x16)
+ break;
+
+ case AOM_BITS_12:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12,
+ aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16,
+ aom_highbd_12_sub_pixel_variance64x16,
+ aom_highbd_12_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits12,
+ aom_highbd_jnt_sad64x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance64x16);
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
+ aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
+ aom_highbd_12_sub_pixel_variance16x64,
+ aom_highbd_12_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits12,
+ aom_highbd_jnt_sad16x64_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x64);
+
+ HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
+ aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
+ aom_highbd_12_sub_pixel_variance32x8,
+ aom_highbd_12_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits12,
+ aom_highbd_jnt_sad32x8_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance32x8);
+
+ HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
+ aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
+ aom_highbd_12_sub_pixel_variance8x32,
+ aom_highbd_12_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits12,
+ aom_highbd_jnt_sad8x32_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance8x32);
+
+ HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
+ aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
+ aom_highbd_12_sub_pixel_variance16x4,
+ aom_highbd_12_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits12,
+ aom_highbd_jnt_sad16x4_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x4);
+
+ HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
+ aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
+ aom_highbd_12_sub_pixel_variance4x16,
+ aom_highbd_12_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits12,
+ aom_highbd_jnt_sad4x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance4x16);
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
+ aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
+ aom_highbd_12_sub_pixel_variance32x16,
+ aom_highbd_12_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits12,
+ aom_highbd_jnt_sad32x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance32x16);
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
+ aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
+ aom_highbd_12_sub_pixel_variance16x32,
+ aom_highbd_12_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits12,
+ aom_highbd_jnt_sad16x32_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x32);
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
+ aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
+ aom_highbd_12_sub_pixel_variance64x32,
+ aom_highbd_12_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits12,
+ aom_highbd_jnt_sad64x32_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance64x32);
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
+ aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
+ aom_highbd_12_sub_pixel_variance32x64,
+ aom_highbd_12_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits12,
+ aom_highbd_jnt_sad32x64_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance32x64);
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
+ aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
+ aom_highbd_12_sub_pixel_variance32x32,
+ aom_highbd_12_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits12,
+ aom_highbd_jnt_sad32x32_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance32x32);
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
+ aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
+ aom_highbd_12_sub_pixel_variance64x64,
+ aom_highbd_12_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits12,
+ aom_highbd_jnt_sad64x64_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance64x64);
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
+ aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
+ aom_highbd_12_sub_pixel_variance16x16,
+ aom_highbd_12_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits12,
+ aom_highbd_jnt_sad16x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x16);
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
+ aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
+ aom_highbd_12_sub_pixel_variance16x8,
+ aom_highbd_12_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits12,
+ aom_highbd_jnt_sad16x8_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x8);
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
+ aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
+ aom_highbd_12_sub_pixel_variance8x16,
+ aom_highbd_12_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits12,
+ aom_highbd_jnt_sad8x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance8x16);
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
+ aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
+ aom_highbd_12_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits12, aom_highbd_jnt_sad8x8_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance8x8);
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
+ aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
+ aom_highbd_12_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits12, aom_highbd_jnt_sad8x4_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance8x4);
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
+ aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
+ aom_highbd_12_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits12, aom_highbd_jnt_sad4x8_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance4x8);
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
+ aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
+ aom_highbd_12_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits12, aom_highbd_jnt_sad4x4_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance4x4);
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
+ aom_highbd_sad128x128_avg_bits12,
+ aom_highbd_12_variance128x128,
+ aom_highbd_12_sub_pixel_variance128x128,
+ aom_highbd_12_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits12,
+ aom_highbd_jnt_sad128x128_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(
+ BLOCK_128X64, aom_highbd_sad128x64_bits12,
+ aom_highbd_sad128x64_avg_bits12, aom_highbd_12_variance128x64,
+ aom_highbd_12_sub_pixel_variance128x64,
+ aom_highbd_12_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits12, aom_highbd_jnt_sad128x64_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(
+ BLOCK_64X128, aom_highbd_sad64x128_bits12,
+ aom_highbd_sad64x128_avg_bits12, aom_highbd_12_variance64x128,
+ aom_highbd_12_sub_pixel_variance64x128,
+ aom_highbd_12_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits12, aom_highbd_jnt_sad64x128_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance64x128);
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
+ aom_highbd_12_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
+ aom_highbd_12_obmc_variance128x128,
+ aom_highbd_12_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
+ aom_highbd_12_obmc_variance128x64,
+ aom_highbd_12_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
+ aom_highbd_12_obmc_variance64x128,
+ aom_highbd_12_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
+ aom_highbd_12_obmc_variance64x64,
+ aom_highbd_12_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12,
+ aom_highbd_12_obmc_variance64x32,
+ aom_highbd_12_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12,
+ aom_highbd_12_obmc_variance32x64,
+ aom_highbd_12_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12,
+ aom_highbd_12_obmc_variance32x32,
+ aom_highbd_12_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12,
+ aom_highbd_12_obmc_variance32x16,
+ aom_highbd_12_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12,
+ aom_highbd_12_obmc_variance16x32,
+ aom_highbd_12_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12,
+ aom_highbd_12_obmc_variance16x16,
+ aom_highbd_12_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12,
+ aom_highbd_12_obmc_variance8x16,
+ aom_highbd_12_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12,
+ aom_highbd_12_obmc_variance16x8,
+ aom_highbd_12_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12,
+ aom_highbd_12_obmc_variance8x8,
+ aom_highbd_12_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12,
+ aom_highbd_12_obmc_variance4x8,
+ aom_highbd_12_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12,
+ aom_highbd_12_obmc_variance8x4,
+ aom_highbd_12_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
+ aom_highbd_12_obmc_variance4x4,
+ aom_highbd_12_obmc_sub_pixel_variance4x4)
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12,
+ aom_highbd_12_obmc_variance64x16,
+ aom_highbd_12_obmc_sub_pixel_variance64x16)
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12,
+ aom_highbd_12_obmc_variance16x64,
+ aom_highbd_12_obmc_sub_pixel_variance16x64)
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12,
+ aom_highbd_12_obmc_variance32x8,
+ aom_highbd_12_obmc_sub_pixel_variance32x8)
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12,
+ aom_highbd_12_obmc_variance8x32,
+ aom_highbd_12_obmc_sub_pixel_variance8x32)
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12,
+ aom_highbd_12_obmc_variance16x4,
+ aom_highbd_12_obmc_sub_pixel_variance16x4)
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12,
+ aom_highbd_12_obmc_variance4x16,
+ aom_highbd_12_obmc_sub_pixel_variance4x16)
+ break;
+
+ default:
+ assert(0 &&
+ "cm->seq_params.bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+
+static void realloc_segmentation_maps(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ // Create the encoder segmentation map and set all entries to 0
+ aom_free(cpi->segmentation_map);
+ CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+ aom_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+ av1_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ aom_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ aom_calloc(cm->mi_rows * cm->mi_cols, 1));
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ RATE_CONTROL *const rc = &cpi->rc;
+ MACROBLOCK *const x = &cpi->td.mb;
+
+ if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->bit_depth;
+ seq_params->color_primaries = oxcf->color_primaries;
+ seq_params->transfer_characteristics = oxcf->transfer_characteristics;
+ seq_params->matrix_coefficients = oxcf->matrix_coefficients;
+ seq_params->monochrome = oxcf->monochrome;
+ seq_params->chroma_sample_position = oxcf->chroma_sample_position;
+ seq_params->color_range = oxcf->color_range;
+
+ assert(IMPLIES(seq_params->profile <= PROFILE_1,
+ seq_params->bit_depth <= AOM_BITS_10));
+
+ cm->timing_info_present = oxcf->timing_info_present;
+ cm->timing_info.num_units_in_display_tick =
+ oxcf->timing_info.num_units_in_display_tick;
+ cm->timing_info.time_scale = oxcf->timing_info.time_scale;
+ cm->timing_info.equal_picture_interval =
+ oxcf->timing_info.equal_picture_interval;
+ cm->timing_info.num_ticks_per_picture =
+ oxcf->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ oxcf->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ oxcf->decoder_model_info_present_flag;
+ if (oxcf->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ cm->buffer_model.num_units_in_decoding_tick =
+ oxcf->buffer_model.num_units_in_decoding_tick;
+ cm->buffer_removal_time_present = 1;
+ set_aom_dec_model_info(&cm->buffer_model);
+ set_dec_model_op_parameters(&cm->op_params[0]);
+ } else if (cm->timing_info_present &&
+ cm->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ set_resource_availability_parameters(&cm->op_params[0]);
+ } else {
+ cm->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
+ update_film_grain_parameters(cpi, oxcf);
+
+ cpi->oxcf = *oxcf;
+ cpi->common.options = oxcf->cfg;
+ cpi->row_mt = oxcf->row_mt;
+ x->e_mbd.bd = (int)seq_params->bit_depth;
+ x->e_mbd.global_motion = cm->global_motion;
+
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
+ rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+
+ cm->refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->large_scale_tile)
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (x->palette_buffer == NULL) {
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+
+ if (x->tmp_conv_dst == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_obmc_bufs[i] == NULL) {
+ CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_obmc_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i];
+ }
+ }
+
+ av1_reset_segment_features(cm);
+ set_high_precision_mv(cpi, 1, 0);
+
+ set_rc_buffer_sizes(rc, &cpi->oxcf);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size);
+
+  // Set up the frame rate and the related rate control parameters.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = cpi->oxcf.worst_allowed_q;
+ rc->best_quality = cpi->oxcf.best_allowed_q;
+
+ cm->interp_filter = oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+ cm->switchable_motion_mode = 1;
+
+ if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+ cm->render_width = cpi->oxcf.render_width;
+ cm->render_height = cpi->oxcf.render_height;
+ } else {
+ cm->render_width = cpi->oxcf.width;
+ cm->render_height = cpi->oxcf.height;
+ }
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
+
+ int sb_size = seq_params->sb_size;
+ // Superblock size should not be updated after the first key frame.
+ if (!cpi->seq_params_locked) {
+ set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ }
+
+ if (cpi->initial_width || sb_size != seq_params->sb_size) {
+ if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
+ seq_params->sb_size != sb_size) {
+ av1_free_context_buffers(cm);
+ av1_free_pc_tree(&cpi->td, num_planes);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ }
+ }
+ update_frame_size(cpi);
+
+ cpi->alt_ref_source = NULL;
+ rc->is_src_frame_alt_ref = 0;
+
+ rc->is_bwd_ref_frame = 0;
+ rc->is_last_bipred_frame = 0;
+ rc->is_bipred_frame = 0;
+
+ set_tile_info(cpi);
+
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->ext_refresh_frame_context_pending = 0;
+
+ highbd_set_var_fns(cpi);
+
+ // Init sequence level coding tools
+ // This should not be called after the first key frame.
+ if (!cpi->seq_params_locked) {
+ seq_params->operating_points_cnt_minus_1 =
+ cm->number_spatial_layers > 1 ? cm->number_spatial_layers - 1 : 0;
+ init_seq_coding_tools(&cm->seq_params, cm, oxcf);
+ }
+}
+
+AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+ BufferPool *const pool) {
+ unsigned int i;
+ AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+ AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+ if (!cm) return NULL;
+
+ av1_zero(*cpi);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error.jmp)) {
+ cm->error.setjmp = 0;
+ av1_remove_compressor(cpi);
+ return 0;
+ }
+
+ cm->error.setjmp = 1;
+ cm->alloc_mi = enc_alloc_mi;
+ cm->free_mi = enc_free_mi;
+ cm->setup_mi = enc_setup_mi;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)aom_memalign(
+ 32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
+
+ cpi->resize_state = 0;
+ cpi->resize_avg_qp = 0;
+ cpi->resize_buffer_underflow = 0;
+
+ cpi->common.buffer_pool = pool;
+
+ init_config(cpi, oxcf);
+ av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+
+ cm->current_video_frame = 0;
+ cpi->seq_params_locked = 0;
+ cpi->partition_search_skippable_frame = 0;
+ cpi->tile_data = NULL;
+ cpi->last_show_frame_buf_idx = INVALID_IDX;
+
+ realloc_segmentation_maps(cpi);
+
+ memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+ memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
+
+ for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
+ i++) {
+ CHECK_MEM_ERROR(
+ cm, cpi->mbgraph_stats[i].mb_stats,
+ aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+ }
+
+#if CONFIG_FP_MB_STATS
+ cpi->use_fp_mb_stats = 0;
+ if (cpi->use_fp_mb_stats) {
+    // Placeholder used to store the per-macroblock stats of the first pass.
+ CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
+ aom_calloc(cm->MBs * sizeof(uint8_t), 1));
+ } else {
+ cpi->twopass.frame_mb_stats_buf = NULL;
+ }
+#endif
+
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_blockiness = 1;
+ cpi->b_calculate_consistency = 1;
+ cpi->total_inconsistency = 0;
+ cpi->psnr.worst = 100.0;
+ cpi->worst_ssim = 100.0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+
+ if (cpi->b_calculate_psnr) {
+ cpi->total_sq_error = 0;
+ cpi->total_samples = 0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ cpi->fastssim.worst = 100.0;
+ cpi->psnrhvs.worst = 100.0;
+
+ if (cpi->b_calculate_blockiness) {
+ cpi->total_blockiness = 0;
+ cpi->worst_blockiness = 0.0;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+ aom_malloc(sizeof(*cpi->ssim_vars) * 4 *
+ cpi->common.mi_rows * cpi->common.mi_cols));
+ cpi->worst_consistency = 100.0;
+ }
+#endif
+#if CONFIG_ENTROPY_STATS
+ av1_zero(aggregate_fc);
+#endif // CONFIG_ENTROPY_STATS
+
+ cpi->first_time_stamp_ever = INT64_MAX;
+
+ cpi->td.mb.nmvcost[0] = &cpi->nmv_costs[0][MV_MAX];
+ cpi->td.mb.nmvcost[1] = &cpi->nmv_costs[1][MV_MAX];
+ cpi->td.mb.nmvcost_hp[0] = &cpi->nmv_costs_hp[0][MV_MAX];
+ cpi->td.mb.nmvcost_hp[1] = &cpi->nmv_costs_hp[1][MV_MAX];
+
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+ if (oxcf->pass == 1) {
+ av1_init_first_pass(cpi);
+ } else if (oxcf->pass == 2) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ const size_t psz = cpi->common.MBs * sizeof(uint8_t);
+ const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
+
+ cpi->twopass.firstpass_mb_stats.mb_stats_start =
+ oxcf->firstpass_mb_stats_in.buf;
+ cpi->twopass.firstpass_mb_stats.mb_stats_end =
+ cpi->twopass.firstpass_mb_stats.mb_stats_start +
+ (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
+ }
+#endif
+
+ cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+ cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+
+ av1_init_second_pass(cpi);
+ }
+
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.above_pred_buf,
+ (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.above_pred_buf)));
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.left_pred_buf,
+ (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.left_pred_buf)));
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.hash_value_buffer[0][0])));
+
+ cpi->td.mb.g_crc_initialized = 0;
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
+
+ av1_set_speed_features_framesize_independent(cpi);
+ av1_set_speed_features_framesize_dependent(cpi);
+
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].jsdaf = JSDAF; \
+ cpi->fn_ptr[BT].jsvaf = JSVAF;
+
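+  // The BFP() entries below populate the per-block-size function pointer
+  // table cpi->fn_ptr[]: whole-pixel SAD (sdf/sdaf/sdx4df), variance (vf),
+  // sub-pixel variance (svf/svaf) and the jnt_* compound-prediction variants
+  // (jsdaf/jsvaf) used by motion search and RD evaluation.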
+ BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
+ aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
+ aom_sad4x16x4d, aom_jnt_sad4x16_avg, aom_jnt_sub_pixel_avg_variance4x16)
+
+ BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
+ aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
+ aom_sad16x4x4d, aom_jnt_sad16x4_avg, aom_jnt_sub_pixel_avg_variance16x4)
+
+ BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
+ aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
+ aom_sad8x32x4d, aom_jnt_sad8x32_avg, aom_jnt_sub_pixel_avg_variance8x32)
+
+ BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
+ aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
+ aom_sad32x8x4d, aom_jnt_sad32x8_avg, aom_jnt_sub_pixel_avg_variance32x8)
+
+ BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
+ aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
+ aom_sad16x64x4d, aom_jnt_sad16x64_avg,
+ aom_jnt_sub_pixel_avg_variance16x64)
+
+ BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
+ aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
+ aom_sad64x16x4d, aom_jnt_sad64x16_avg,
+ aom_jnt_sub_pixel_avg_variance64x16)
+
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x4d, aom_jnt_sad128x128_avg,
+ aom_jnt_sub_pixel_avg_variance128x128)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
+ aom_sad128x64x4d, aom_jnt_sad128x64_avg,
+ aom_jnt_sub_pixel_avg_variance128x64)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
+ aom_sad64x128x4d, aom_jnt_sad64x128_avg,
+ aom_jnt_sub_pixel_avg_variance64x128)
+
+ BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+ aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
+ aom_sad32x16x4d, aom_jnt_sad32x16_avg,
+ aom_jnt_sub_pixel_avg_variance32x16)
+
+ BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+ aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
+ aom_sad16x32x4d, aom_jnt_sad16x32_avg,
+ aom_jnt_sub_pixel_avg_variance16x32)
+
+ BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+ aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
+ aom_sad64x32x4d, aom_jnt_sad64x32_avg,
+ aom_jnt_sub_pixel_avg_variance64x32)
+
+ BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+ aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
+ aom_sad32x64x4d, aom_jnt_sad32x64_avg,
+ aom_jnt_sub_pixel_avg_variance32x64)
+
+ BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+ aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+ aom_sad32x32x4d, aom_jnt_sad32x32_avg,
+ aom_jnt_sub_pixel_avg_variance32x32)
+
+ BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+ aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+ aom_sad64x64x4d, aom_jnt_sad64x64_avg,
+ aom_jnt_sub_pixel_avg_variance64x64)
+
+ BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+ aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+ aom_sad16x16x4d, aom_jnt_sad16x16_avg,
+ aom_jnt_sub_pixel_avg_variance16x16)
+
+ BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+ aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
+ aom_sad16x8x4d, aom_jnt_sad16x8_avg, aom_jnt_sub_pixel_avg_variance16x8)
+
+ BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+ aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
+ aom_sad8x16x4d, aom_jnt_sad8x16_avg, aom_jnt_sub_pixel_avg_variance8x16)
+
+ BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+ aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
+ aom_jnt_sad8x8_avg, aom_jnt_sub_pixel_avg_variance8x8)
+
+ BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+ aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
+ aom_jnt_sad8x4_avg, aom_jnt_sub_pixel_avg_variance8x4)
+
+ BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+ aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
+ aom_jnt_sad4x8_avg, aom_jnt_sub_pixel_avg_variance4x8)
+
+ BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+ aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
+ aom_jnt_sad4x4_avg, aom_jnt_sub_pixel_avg_variance4x4)
+
+#define OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+
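+  // OBFP() wires up the overlapped-block (OBMC) SAD and variance kernels
+  // (osdf/ovf/osvf) for each block size.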
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+ OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+ aom_obmc_sub_pixel_variance64x64)
+ OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+ aom_obmc_sub_pixel_variance64x32)
+ OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+ aom_obmc_sub_pixel_variance32x64)
+ OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+ aom_obmc_sub_pixel_variance32x32)
+ OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+ aom_obmc_sub_pixel_variance32x16)
+ OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+ aom_obmc_sub_pixel_variance16x32)
+ OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+ aom_obmc_sub_pixel_variance16x16)
+ OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+ aom_obmc_sub_pixel_variance16x8)
+ OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+ aom_obmc_sub_pixel_variance8x16)
+ OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+ aom_obmc_sub_pixel_variance8x8)
+ OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+ aom_obmc_sub_pixel_variance4x8)
+ OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+ aom_obmc_sub_pixel_variance8x4)
+ OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+ aom_obmc_sub_pixel_variance4x4)
+ OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
+ aom_obmc_sub_pixel_variance4x16)
+ OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
+ aom_obmc_sub_pixel_variance16x4)
+ OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
+ aom_obmc_sub_pixel_variance8x32)
+ OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
+ aom_obmc_sub_pixel_variance32x8)
+ OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
+ aom_obmc_sub_pixel_variance16x64)
+ OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
+ aom_obmc_sub_pixel_variance64x16)
+
+#define MBFP(BT, MCSDF, MCSVF) \
+ cpi->fn_ptr[BT].msdf = MCSDF; \
+ cpi->fn_ptr[BT].msvf = MCSVF;
+
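+  // MBFP() wires up the masked SAD and masked sub-pixel variance kernels
+  // (msdf/msvf) used when searching masked compound predictions.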
+ MBFP(BLOCK_128X128, aom_masked_sad128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
+  MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
+  MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
+  MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
+  MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
+  MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
+  MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+
+ highbd_set_var_fns(cpi);
+
+ /* av1_init_quantizer() is first called here. Add check in
+ * av1_frame_init_quantizer() so that av1_init_quantizer is only
+ * called later when needed. This will avoid unnecessary calls of
+ * av1_init_quantizer() for every frame.
+ */
+ av1_init_quantizer(cpi);
+ av1_qm_init(cm);
+
+ av1_loop_filter_init(cm);
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ cm->superres_upscaled_width = oxcf->width;
+ cm->superres_upscaled_height = oxcf->height;
+ av1_loop_restoration_precal();
+
+ cm->error.setjmp = 0;
+
+ return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif // CONFIG_INTERNAL_STATS
+
+void av1_remove_compressor(AV1_COMP *cpi) {
+ AV1_COMMON *cm;
+ unsigned int i;
+ int t;
+
+ if (!cpi) return;
+
+ cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ if (cm->current_video_frame > 0) {
+#if CONFIG_ENTROPY_STATS
+ if (cpi->oxcf.pass != 1) {
+ fprintf(stderr, "Writing counts.stt\n");
+ FILE *f = fopen("counts.stt", "wb");
+ fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
+ fclose(f);
+ }
+#endif // CONFIG_ENTROPY_STATS
+#if CONFIG_INTERNAL_STATS
+ aom_clear_system_state();
+
+ if (cpi->oxcf.pass != 1) {
+ char headings[512] = { 0 };
+ char results[512] = { 0 };
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded =
+ (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+ 10000000.000;
+ double total_encode_time =
+ (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+ const double dr =
+ (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
+ const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+ const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+ const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+ if (cpi->b_calculate_psnr) {
+ const double total_psnr = aom_sse_to_psnr(
+ (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
+ const double total_ssim =
+ 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+                 "AVPsnrY\tAPsnrCb\tAPsnrCr");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f",
+ dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr,
+ cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim,
+ total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count,
+ cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst,
+ cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
+ cpi->psnr.stat[STAT_Y] / cpi->count,
+ cpi->psnr.stat[STAT_U] / cpi->count,
+ cpi->psnr.stat[STAT_V] / cpi->count);
+
+ if (cpi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+ }
+
+ if (cpi->b_calculate_consistency) {
+ double consistency =
+ aom_sse_to_psnr((double)cpi->total_samples, peak,
+ (double)cpi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+ }
+ fprintf(f, "%s\t Time\tRcErr\tAbsErr\n", headings);
+ fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time,
+ rate_err, fabs(rate_err));
+ }
+
+ fclose(f);
+ }
+#endif // CONFIG_INTERNAL_STATS
+ }
+
+ for (t = 0; t < cpi->num_workers; ++t) {
+ AVxWorker *const worker = &cpi->workers[t];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+ // Deallocate allocated threads.
+ aom_get_worker_interface()->end(worker);
+
+ // Deallocate allocated thread data.
+ if (t < cpi->num_workers - 1) {
+ aom_free(thread_data->td->palette_buffer);
+ aom_free(thread_data->td->tmp_conv_dst);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(thread_data->td->tmp_obmc_bufs[j]);
+ }
+ aom_free(thread_data->td->above_pred_buf);
+ aom_free(thread_data->td->left_pred_buf);
+ aom_free(thread_data->td->wsrc_buf);
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ aom_free(thread_data->td->hash_value_buffer[x][y]);
+ thread_data->td->hash_value_buffer[x][y] = NULL;
+ }
+ }
+ aom_free(thread_data->td->mask_buf);
+ aom_free(thread_data->td->counts);
+ av1_free_pc_tree(thread_data->td, num_planes);
+ aom_free(thread_data->td);
+ }
+ }
+ aom_free(cpi->tile_thr_data);
+ aom_free(cpi->workers);
+
+ if (cpi->num_workers > 1) {
+ av1_loop_filter_dealloc(&cpi->lf_row_sync);
+ av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers);
+ }
+
+ dealloc_compressor_data(cpi);
+
+ for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]);
+ ++i) {
+ aom_free(cpi->mbgraph_stats[i].mb_stats);
+ }
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ aom_free(cpi->twopass.frame_mb_stats_buf);
+ cpi->twopass.frame_mb_stats_buf = NULL;
+ }
+#endif
+#if CONFIG_INTERNAL_STATS
+ aom_free(cpi->ssim_vars);
+ cpi->ssim_vars = NULL;
+#endif // CONFIG_INTERNAL_STATS
+
+ av1_remove_common(cm);
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ av1_hash_table_destroy(&cm->buffer_pool->frame_bufs[i].hash_table);
+ }
+ if (cpi->sf.use_hash_based_trellis) hbt_destroy();
+ av1_free_ref_frame_buffers(cm->buffer_pool);
+ aom_free(cpi);
+
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+ fclose(yuv_rec_file);
+#endif
+}
+
+static void generate_psnr_packet(AV1_COMP *cpi) {
+ struct aom_codec_cx_pkt pkt;
+ int i;
+ PSNR_STATS psnr;
+ aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr,
+ cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+
+ for (i = 0; i < 4; ++i) {
+ pkt.data.psnr.samples[i] = psnr.samples[i];
+ pkt.data.psnr.sse[i] = psnr.sse[i];
+ pkt.data.psnr.psnr[i] = psnr.psnr[i];
+ }
+ pkt.kind = AOM_CODEC_PSNR_PKT;
+ aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
+ if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
+
+ cpi->ext_ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_upd_flags) {
+ cpi->ext_refresh_last_frame = (ref_frame_upd_flags & AOM_LAST_FLAG) != 0;
+ cpi->ext_refresh_golden_frame = (ref_frame_upd_flags & AOM_GOLD_FLAG) != 0;
+ cpi->ext_refresh_alt_ref_frame = (ref_frame_upd_flags & AOM_ALT_FLAG) != 0;
+ cpi->ext_refresh_bwd_ref_frame = (ref_frame_upd_flags & AOM_BWD_FLAG) != 0;
+ cpi->ext_refresh_alt2_ref_frame = (ref_frame_upd_flags & AOM_ALT2_FLAG) != 0;
+ cpi->ext_refresh_frame_flags_pending = 1;
+}
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(cfg, sd, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(sd, cfg, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_update_entropy(AV1_COMP *cpi, int update) {
+ cpi->ext_refresh_frame_context = update;
+ cpi->ext_refresh_frame_context_pending = 1;
+ return 0;
+}
+
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
+// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
+// not denoise the UV channels at this time. If ever we implement UV channel
+// denoising we will have to modify this.
+void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+ uint8_t *src = s->y_buffer;
+ int h = s->y_height;
+
+ do {
+ fwrite(src, s->y_width, 1, f);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
+
+static void check_show_existing_frame(AV1_COMP *cpi) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ AV1_COMMON *const cm = &cpi->common;
+ const FRAME_UPDATE_TYPE next_frame_update_type =
+ gf_group->update_type[gf_group->index];
+#if USE_SYMM_MULTI_LAYER
+ const int which_arf = (cpi->new_bwdref_update_rule == 1)
+ ? gf_group->arf_update_idx[gf_group->index] > 0
+ : gf_group->arf_update_idx[gf_group->index];
+#else
+ const int which_arf = gf_group->arf_update_idx[gf_group->index];
+#endif
+
+ if (cm->show_existing_frame == 1) {
+ cm->show_existing_frame = 0;
+ } else if (cpi->rc.is_last_bipred_frame) {
+#if USE_SYMM_MULTI_LAYER
+    // NOTE: When the new structure is used, every bwdref will have one overlay
+ // frame. Therefore, there is no need to find out which frame to
+ // show in advance.
+ if (cpi->new_bwdref_update_rule == 0) {
+#endif
+      // NOTE: If the current frame is a last bi-predictive frame, the
+      //       BWDREF_FRAME needs to be shown next; it is pointed to by
+      //       ref_fb_idx[0] after the reference frame buffer update.
+ cpi->rc.is_last_bipred_frame = 0;
+ cm->show_existing_frame = 1;
+ cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0];
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ } else if (cpi->is_arf_filter_off[which_arf] &&
+ (next_frame_update_type == OVERLAY_UPDATE ||
+ next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
+#if USE_SYMM_MULTI_LAYER
+ const int bwdref_to_show =
+ (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+#else
+ const int bwdref_to_show = ALTREF2_FRAME;
+#endif
+ // Other parameters related to OVERLAY_UPDATE will be taken care of
+ // in av1_rc_get_second_pass_params(cpi)
+ cm->show_existing_frame = 1;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
+ ? cpi->ref_fb_idx[ALTREF_FRAME - 1]
+ : cpi->ref_fb_idx[bwdref_to_show - 1];
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0)
+#endif
+ cpi->is_arf_filter_off[which_arf] = 0;
+ }
+ cpi->rc.is_src_frame_ext_arf = 0;
+}
+
+#ifdef OUTPUT_YUV_REC
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+ uint8_t *src = s->y_buffer;
+ int h = cm->height;
+ if (yuv_rec_file == NULL) return;
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {
+ fwrite(src16, s->y_width, 2, yuv_rec_file);
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+}
+#endif // OUTPUT_YUV_REC
+
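+// A per-reference global motion model is only worth keeping if enough 4x4
+// blocks use it to amortize the cost of signaling its parameters. The factor
+// below scales the usage count before comparing it against gmparams_cost;
+// models that fail the test are reset to IDENTITY and the frame is flagged
+// for a recode.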
+#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
+static int recode_loop_test_global_motion(AV1_COMP *cpi) {
+ int i;
+ int recode = 0;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ AV1_COMMON *const cm = &cpi->common;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ if (cm->global_motion[i].wmtype != IDENTITY &&
+ rdc->global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
+ cpi->gmparams_cost[i]) {
+ cm->global_motion[i] = default_warp_params;
+ assert(cm->global_motion[i].wmtype == IDENTITY);
+ cpi->gmparams_cost[i] = 0;
+ recode = 1;
+ // TODO(sarahparker): The earlier condition for recoding here was:
+ // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something
+ // similar to that back to speed up global motion?
+ }
+ }
+ return recode;
+}
+
+// Function to test for conditions that indicate we should loop
+// back and recode a frame.
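+// Recoding is only considered when the projected size hits the per-frame
+// bandwidth cap or the recode_loop speed setting allows recoding for this
+// frame type; a recode is then forced if the frame overshoots/undershoots the
+// supplied size limits while q still has room to move, or, in constrained
+// quality (AOM_CQ) mode, if it undershoots 7/8 of its target at a q above the
+// configured cq level.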
+static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q,
+ int maxq, int minq) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
+ int force_recode = 0;
+
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ (cpi->sf.recode_loop == ALLOW_RECODE) ||
+ (frame_is_kfgfarf && (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+ // TODO(agrange) high_limit could be greater than the scale-down threshold.
+ if ((rc->projected_frame_size > high_limit && q < maxq) ||
+ (rc->projected_frame_size < low_limit && q > minq)) {
+ force_recode = 1;
+ } else if (cpi->oxcf.rc_mode == AOM_CQ) {
+ // Deal with frame undershoot and whether or not we are
+ // below the automatically set cq level.
+ if (q > oxcf->cq_level &&
+ rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+ force_recode = 1;
+ }
+ }
+ }
+ return force_recode;
+}
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+static int dump_one_image(AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *const ref_buf,
+ char *file_name) {
+ int h;
+ FILE *f_ref = NULL;
+
+ if (ref_buf == NULL) {
+ printf("Frame data buffer is NULL.\n");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if ((f_ref = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+
+ fclose(f_ref);
+
+ return AOM_CODEC_OK;
+}
+
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ char file_name[256] = "";
+ snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+ cm->current_video_frame, ref_frame);
+ dump_one_image(cm, get_ref_frame_buffer(cpi, ref_frame), file_name);
+ }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+// This function is used to shift the virtual indices of last reference frames
+// as follows:
+// LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+// when the LAST_FRAME is updated.
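+// For example, if ref_fb_idx[0..2] hold buffers {A, B, C} for
+// LAST/LAST2/LAST3, they hold {A, A, B} after the shift; slot [0] is then
+// free to be overwritten with the newly coded frame by the caller.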
+static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
+ // TODO(isbs): shift the scaled indices as well
+ int ref_frame;
+ for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) {
+ cpi->ref_fb_idx[ref_frame] = cpi->ref_fb_idx[ref_frame - 1];
+
+ // [0] is allocated to the current coded frame. The statistics for the
+ // reference frames start at [LAST_FRAME], i.e. [1].
+ if (!cpi->rc.is_src_frame_alt_ref) {
+ memcpy(cpi->interp_filter_selected[ref_frame + LAST_FRAME],
+ cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME],
+ sizeof(cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME]));
+ }
+ }
+}
+
+#if USE_SYMM_MULTI_LAYER
+// This function is used to shift the virtual indices of bwd reference
+// frames as follows:
+// BWD_REF -> ALT2_REF -> EXT_REF
+// to clear a space to store the closest bwdref
+static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
+ // TODO(isbs): shift the scaled indices as well
+ static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1,
+ EXTREF_FRAME - 1 };
+
+ for (int i = 2; i > 0; --i) {
+ // [0] is allocated to the current coded frame, i.e. bwdref
+ memcpy(
+ cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
+ cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME],
+ sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1] + LAST_FRAME]));
+
+ cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i - 1]];
+ }
+}
+
+// This function is used to shift the virtual indices of bwd reference
+// frames as follows:
+// BWD_REF <- ALT2_REF <- EXT_REF
+// to update the bwd reference frame for coding the next frame.
+static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
+ // TODO(isbs): shift the scaled indices as well
+ static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1,
+ EXTREF_FRAME - 1 };
+
+ for (int i = 0; i < 2; ++i) {
+ // [0] is allocated to the current coded frame, i.e. bwdref
+ memcpy(
+ cpi->interp_filter_selected[ordered_bwd[i] + LAST_FRAME],
+ cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME],
+ sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1] + LAST_FRAME]));
+
+ cpi->ref_fb_idx[ordered_bwd[i]] = cpi->ref_fb_idx[ordered_bwd[i + 1]];
+ }
+}
+#endif // USE_SYMM_MULTI_LAYER
+
+static void update_reference_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+  //       to verify that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
+
+  // In the case of a show_existing frame, we do not send refresh flags to the
+  // decoder. Any change in the reference frame buffer can be done by
+ // switching the virtual indices.
+ if (cm->show_existing_frame) {
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
+ }
+
+ BufferPool *const pool = cm->buffer_pool;
+
+ // At this point the new frame has been encoded.
+ // If any buffer copy / swapping is signaled it should be done here.
+
+ // Only update all of the reference buffers if a KEY_FRAME is also a
+ // show_frame. This ensures a fwd keyframe does not update all of the buffers
+ if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
+ cm->new_fb_idx);
+ }
+ return;
+ }
+
+ if (av1_preserve_existing_gf(cpi)) {
+ // We have decided to preserve the previously existing golden frame as our
+ // new ARF frame. However, in the short term in function
+ // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
+ // we're updating the GF with the current decoded frame, we save it to the
+ // ARF slot instead.
+ // We now have to update the ARF with the current frame and swap gld_fb_idx
+ // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
+ // slot and, if we're updating the GF, the current frame becomes the new GF.
+ int tmp;
+
+    // ARF in general is a better reference than overlay. We should keep ARF as
+ // reference instead of replacing it with overlay.
+
+ if (!cpi->preserve_arf_as_gld) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
+ cm->new_fb_idx);
+ }
+
+ tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+ cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
+ cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp;
+
+ // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
+ // cpi->interp_filter_selected[GOLDEN_FRAME]?
+ } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
+#if CONFIG_DEBUG
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
+#endif
+#if USE_SYMM_MULTI_LAYER
+ const int bwdref_to_show =
+ (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+#else
+ const int bwdref_to_show = ALTREF2_FRAME;
+#endif
+ // Deal with the special case for showing existing internal ALTREF_FRAME
+ // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
+ // by updating the virtual indices.
+ const int tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
+ shift_last_ref_frames(cpi);
+
+ cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[bwdref_to_show - 1];
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[bwdref_to_show],
+ sizeof(cpi->interp_filter_selected[bwdref_to_show]));
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 1) {
+ lshift_bwd_ref_frames(cpi);
+ // pass outdated forward reference frame (previous LAST3) to the
+ // spared space
+ cpi->ref_fb_idx[EXTREF_FRAME - 1] = tmp;
+ } else {
+#endif
+ cpi->ref_fb_idx[bwdref_to_show - 1] = tmp;
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ } else { /* For non key/golden frames */
+ // === ALTREF_FRAME ===
+ if (cpi->refresh_alt_ref_frame) {
+ int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ // === GOLDEN_FRAME ===
+ if (cpi->refresh_golden_frame) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
+ cm->new_fb_idx);
+
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ // === BWDREF_FRAME ===
+ if (cpi->refresh_bwd_ref_frame) {
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule) {
+ // We shift the backward reference frame as follows:
+ // BWDREF -> ALTREF2 -> EXTREF
+ // and assign the newly coded frame to BWDREF so that it always
+ // keeps the nearest future frame
+ int tmp = cpi->ref_fb_idx[EXTREF_FRAME - 1];
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[tmp], cm->new_fb_idx);
+
+ rshift_bwd_ref_frames(cpi);
+ cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp;
+ } else {
+#endif // USE_SYMM_MULTI_LAYER
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]],
+ cm->new_fb_idx);
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ // === ALTREF2_FRAME ===
+ if (cpi->refresh_alt2_ref_frame) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]],
+ cm->new_fb_idx);
+
+ memcpy(cpi->interp_filter_selected[ALTREF2_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+ }
+
+ if (cpi->refresh_last_frame) {
+ // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame
+ // reference to the reference frame buffer virtual index; and then (2) from
+ // the virtual index to the reference frame buffer physical index:
+ //
+ // LAST_FRAME, ..., LAST3_FRAME, ..., ALTREF_FRAME
+ // | | |
+ // v v v
+ // ref_fb_idx[0], ..., ref_fb_idx[2], ..., ref_fb_idx[ALTREF_FRAME-1]
+ // | | |
+ // v v v
+ // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[]
+ //
+ // When refresh_last_frame is set, it is intended to retire LAST3_FRAME,
+ // have the other 2 LAST reference frames shifted as follows:
+ // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+ // , and then have LAST_FRAME refreshed by the newly coded frame.
+ //
+    // To fulfill this, the decoder will be notified to execute the following
+    // two steps:
+ //
+ // (a) To change ref_frame_map[] and have the virtual index of LAST3_FRAME
+ // to point to the newly coded frame, i.e.
+    //     ref_frame_map[ref_fb_idx[2]] => new_fb_idx;
+ //
+ // (b) To change the 1st layer mapping to have LAST_FRAME mapped to the
+ // original virtual index of LAST3_FRAME and have the other mappings
+ // shifted as follows:
+ // LAST_FRAME, LAST2_FRAME, LAST3_FRAME
+ // | | |
+ // v v v
+ // ref_fb_idx[2], ref_fb_idx[0], ref_fb_idx[1]
+ int tmp;
+
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[LAST_REF_FRAMES - 1]],
+ cm->new_fb_idx);
+
+ tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->ref_fb_idx[0] = tmp;
+
+ assert(cm->show_existing_frame == 0);
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+
+ // If the new structure is used, we will always have overlay frames coupled
+ // with bwdref frames. Therefore, we won't have to perform this update
+ // in advance (we do this update when the overlay frame shows up).
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0 && cpi->rc.is_last_bipred_frame) {
+#else
+ if (cpi->rc.is_last_bipred_frame) {
+#endif
+ // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the
+ // LAST3_FRAME by updating the virtual indices.
+ //
+      // NOTE: The source frame for BWDREF does not have a holding position
+      //       the way the OVERLAY frame for ALTREF does. Hence, to resolve
+      //       the reference
+ // virtual index reshuffling for BWDREF, the encoder always
+ // specifies a LAST_BIPRED right before BWDREF and completes the
+ // reshuffling job accordingly.
+ tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->ref_fb_idx[0] = cpi->ref_fb_idx[BWDREF_FRAME - 1];
+ cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[BWDREF_FRAME],
+ sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
+ }
+ }
+
+#if DUMP_REF_FRAME_IMAGES == 1
+ // Dump out all reference frame images.
+ dump_ref_frame_images(cpi);
+#endif // DUMP_REF_FRAME_IMAGES
+}
+
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
+ assert(buffer_idx != INVALID_IDX);
+ RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+ ensure_mv_buffer(new_fb_ptr, cm);
+ new_fb_ptr->width = cm->width;
+ new_fb_ptr->height = cm->height;
+}
+
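+// For each active reference whose crop dimensions differ from the current
+// coded frame size, (re)allocate a scratch buffer, rescale the reference into
+// it and record the buffer in scaled_ref_idx[]; references that already match
+// are reused directly with their reference count bumped.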
+static void scale_references(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MV_REFERENCE_FRAME ref_frame;
+ const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = {
+ AOM_LAST_FLAG, AOM_LAST2_FLAG, AOM_LAST3_FLAG, AOM_GOLD_FLAG,
+ AOM_BWD_FLAG, AOM_ALT2_FLAG, AOM_ALT_FLAG
+ };
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+ if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_buffer(cpi, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ continue;
+ }
+
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ RefCntBuffer *new_fb_ptr = NULL;
+ int force_scaling = 0;
+ int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+ if (new_fb == INVALID_IDX) {
+ new_fb = get_free_fb(cm);
+ force_scaling = 1;
+ }
+ if (new_fb == INVALID_IDX) return;
+ new_fb_ptr = &pool->frame_bufs[new_fb];
+ if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(
+ &new_fb_ptr->buf, cm->width, cm->height,
+ cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+ cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ av1_resize_and_extend_frame(
+ ref, &new_fb_ptr->buf, (int)cm->seq_params.bit_depth, num_planes);
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+ } else {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+ buf->buf.y_crop_width = ref->y_crop_width;
+ buf->buf.y_crop_height = ref->y_crop_height;
+ cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+ ++buf->ref_count;
+ }
+ } else {
+ if (cpi->oxcf.pass != 0) cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ }
+ }
+}
+
+static void release_scaled_references(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ int i;
+ // TODO(isbs): only refresh the necessary frames, rather than all of them
+ for (i = 0; i < REF_FRAMES; ++i) {
+ const int idx = cpi->scaled_ref_idx[i];
+ RefCntBuffer *const buf =
+ idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
+ if (buf != NULL) {
+ --buf->ref_count;
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+ }
+}
+
+static void set_mv_search_params(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const unsigned int max_mv_def = AOMMIN(cm->width, cm->height);
+
+ // Default based on max resolution.
+ cpi->mv_step_param = av1_init_search_range(max_mv_def);
+
+ if (cpi->sf.mv.auto_mv_step_size) {
+ if (frame_is_intra_only(cm)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame) {
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param = av1_init_search_range(
+ AOMMIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ }
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+}
+
+static void set_size_independent_vars(AV1_COMP *cpi) {
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ cpi->common.global_motion[i] = default_warp_params;
+ }
+ cpi->global_motion_search_done = 0;
+ av1_set_speed_features_framesize_independent(cpi);
+ av1_set_rd_speed_thresholds(cpi);
+ av1_set_rd_speed_thresholds_sub8x8(cpi);
+ cpi->common.interp_filter = SWITCHABLE;
+ cpi->common.switchable_motion_mode = 1;
+}
+
+static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+ int *top_index) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Setup variables that depend on the dimensions of the frame.
+ av1_set_speed_features_framesize_dependent(cpi);
+
+ // Decide q and q bounds.
+ *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index,
+ top_index);
+
+ if (!frame_is_intra_only(cm)) {
+ set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH,
+ cpi->common.cur_frame_force_integer_mv);
+ }
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in the second pass of a two pass encode, as it requires
+ // lagged coding, and if the relevant speed feature flag is set.
+ if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+ configure_static_seg_features(cpi);
+}
+
+static void init_motion_estimation(AV1_COMP *cpi) {
+ int y_stride = cpi->scaled_source.y_stride;
+
+ if (cpi->sf.mv.search_method == NSTEP) {
+ av1_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+ } else if (cpi->sf.mv.search_method == DIAMOND) {
+ av1_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+ }
+}
+
+#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
+static void set_restoration_unit_size(int width, int height, int sx, int sy,
+ RestorationInfo *rst) {
+ (void)width;
+ (void)height;
+ (void)sx;
+ (void)sy;
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+ int s = AOMMIN(sx, sy);
+#else
+ int s = 0;
+#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION
+
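+  // Frames larger than CIF (352x288) use the maximum restoration unit size
+  // for the luma plane; smaller frames use half of it. The chroma planes
+  // inherit the luma size, optionally reduced by the subsampling shift when
+  // COUPLED_CHROMA_FROM_LUMA_RESTORATION is enabled.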
+ if (width * height > 352 * 288)
+ rst[0].restoration_unit_size = RESTORATION_UNITSIZE_MAX;
+ else
+ rst[0].restoration_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
+ rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s;
+ rst[2].restoration_unit_size = rst[1].restoration_unit_size;
+}
+
+static void init_ref_frame_bufs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i;
+ BufferPool *const pool = cm->buffer_pool;
+ cm->new_fb_idx = INVALID_IDX;
+ for (i = 0; i < REF_FRAMES; ++i) {
+ cm->ref_frame_map[i] = INVALID_IDX;
+ pool->frame_bufs[i].ref_count = 0;
+ }
+ if (cm->seq_params.force_screen_content_tools) {
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ av1_hash_table_init(&pool->frame_bufs[i].hash_table, &cpi->td.mb);
+ }
+ }
+}
+
+static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+ int subsampling_x, int subsampling_y) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
+
+ if (!cpi->initial_width || seq_params->use_highbitdepth != use_highbitdepth ||
+ seq_params->subsampling_x != subsampling_x ||
+ seq_params->subsampling_y != subsampling_y) {
+ seq_params->subsampling_x = subsampling_x;
+ seq_params->subsampling_y = subsampling_y;
+ seq_params->use_highbitdepth = use_highbitdepth;
+
+ alloc_raw_frame_buffers(cpi);
+ init_ref_frame_bufs(cpi);
+ alloc_util_frame_buffers(cpi);
+
+ init_motion_estimation(cpi); // TODO(agrange) This can be removed.
+
+ cpi->initial_width = cm->width;
+ cpi->initial_height = cm->height;
+ cpi->initial_mbs = cm->MBs;
+ }
+}
+
+// Returns 1 if the assigned width or height was <= 0.
+static int set_size_literal(AV1_COMP *cpi, int width, int height) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ check_initial_width(cpi, cm->seq_params.use_highbitdepth,
+ cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
+
+ if (width <= 0 || height <= 0) return 1;
+
+ cm->width = width;
+ cm->height = height;
+
+ if (cpi->initial_width && cpi->initial_height &&
+ (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
+ av1_free_context_buffers(cm);
+ av1_free_pc_tree(&cpi->td, num_planes);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ }
+ update_frame_size(cpi);
+
+ return 0;
+}
+
+static void set_frame_size(AV1_COMP *cpi, int width, int height) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int ref_frame;
+
+ if (width != cm->width || height != cm->height) {
+ // There has been a change in the encoded frame size
+ set_size_literal(cpi, width, height);
+ set_mv_search_params(cpi);
+ // Recalculate 'all_lossless' in case super-resolution was (un)selected.
+ cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+ }
+
+ if (cpi->oxcf.pass == 2) {
+ av1_set_target_rate(cpi, cm->width, cm->height);
+ }
+
+ alloc_frame_mvs(cm, cm->new_fb_idx);
+
+ // Allocate above context buffers
+ if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
+ cm->num_allocated_above_context_mi_col < cm->mi_cols ||
+ cm->num_allocated_above_contexts < cm->tile_rows) {
+ av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+ if (av1_alloc_above_context_buffers(cm, cm->tile_rows))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ // Reset the frame pointers to the current frame size.
+ if (aom_realloc_frame_buffer(
+ get_frame_new_buffer(cm), cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ const int frame_width = cm->superres_upscaled_width;
+ const int frame_height = cm->superres_upscaled_height;
+ set_restoration_unit_size(frame_width, frame_height,
+ seq_params->subsampling_x,
+ seq_params->subsampling_y, cm->rst_info);
+ for (int i = 0; i < num_planes; ++i)
+ cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
+ av1_alloc_restoration_buffers(cm);
+ alloc_util_frame_buffers(cpi); // TODO(afergs): Remove? Gets called anyways.
+ init_motion_estimation(cpi);
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+ ref_buf->idx = buf_idx;
+
+ if (buf_idx != INVALID_IDX) {
+ YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ ref_buf->buf = buf;
+ av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width,
+ buf->y_crop_height, cm->width,
+ cm->height);
+ if (av1_is_scaled(&ref_buf->sf))
+ aom_extend_frame_borders(buf, num_planes);
+ } else {
+ ref_buf->buf = NULL;
+ }
+ }
+
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+ cm->width, cm->height);
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
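+// The resize and superres denominators below are expressed relative to
+// SCALE_NUMERATOR: a denominator equal to SCALE_NUMERATOR leaves the
+// dimension unchanged, while 2 * SCALE_NUMERATOR halves it.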
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Fixed arbitrary seed for the pseudo-random resize scale selection below.
+ static unsigned int seed = 56789;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ if (oxcf->pass == 1) return SCALE_NUMERATOR;
+ uint8_t new_denom = SCALE_NUMERATOR;
+
+ if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
+ switch (oxcf->resize_mode) {
+ case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
+ case RESIZE_FIXED:
+ if (cpi->common.frame_type == KEY_FRAME)
+ new_denom = oxcf->resize_kf_scale_denominator;
+ else
+ new_denom = oxcf->resize_scale_denominator;
+ break;
+ case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+ default: assert(0);
+ }
+ return new_denom;
+}
+
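+// In SUPERRES_QTHRESH mode the picked q is mapped onto a denominator in the
+// range (SCALE_NUMERATOR, 2 * SCALE_NUMERATOR]: q below the threshold keeps
+// full resolution, and every denom_step = ((MAXQ - qthresh + 1) >> 3) q units
+// above it raise the denominator by one, capped at a 2x downscale. As an
+// illustration (assuming MAXQ == 255 and SCALE_NUMERATOR == 8, as defined
+// elsewhere in the codebase), qthresh == 128 gives denom_step == 16, so
+// q == 160 selects a denominator of 9 + (160 - 128) / 16 == 11.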
+static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
+  // Fixed arbitrary seed for the pseudo-random superres scale selection below.
+ static unsigned int seed = 34567;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ if (oxcf->pass == 1) return SCALE_NUMERATOR;
+ uint8_t new_denom = SCALE_NUMERATOR;
+
+ // Make sure that superres mode of the frame is consistent with the
+ // sequence-level flag.
+ assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE,
+ cpi->common.seq_params.enable_superres));
+ assert(IMPLIES(!cpi->common.seq_params.enable_superres,
+ oxcf->superres_mode == SUPERRES_NONE));
+
+ switch (oxcf->superres_mode) {
+ case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
+ case SUPERRES_FIXED:
+ if (cpi->common.frame_type == KEY_FRAME)
+ new_denom = oxcf->superres_kf_scale_denominator;
+ else
+ new_denom = oxcf->superres_scale_denominator;
+ break;
+ case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+ case SUPERRES_QTHRESH: {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const RATE_FACTOR_LEVEL rf_level = gf_group->rf_level[gf_group->index];
+ const double rate_factor_delta = rate_factor_deltas[rf_level];
+ const int qthresh = (rate_factor_delta <= 1.0)
+ ? oxcf->superres_qthresh
+ : oxcf->superres_kf_qthresh;
+ av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
+ int bottom_index, top_index;
+ const int q = av1_rc_pick_q_and_bounds(
+ cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index);
+ if (q < qthresh) {
+ new_denom = SCALE_NUMERATOR;
+ } else {
+ const uint8_t min_denom = SCALE_NUMERATOR + 1;
+ const uint8_t denom_step = (MAXQ - qthresh + 1) >> 3;
+
+ if (q == qthresh) {
+ new_denom = min_denom;
+ } else if (denom_step == 0) {
+ new_denom = SCALE_NUMERATOR << 1;
+ } else {
+ const uint8_t additional_denom = (q - qthresh) / denom_step;
+ new_denom =
+ AOMMIN(min_denom + additional_denom, SCALE_NUMERATOR << 1);
+ }
+ }
+ break;
+ }
+ default: assert(0);
+ }
+ return new_denom;
+}
+
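+// A size configuration is accepted only if the dimension, after both resize
+// and superres scaling, is at least half of the original dimension, i.e. the
+// combined downscale factor never exceeds 2x.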
+static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
+ return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2);
+}
+
+static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
+ // Only need to check the width, as scaling is horizontal only.
+ (void)oheight;
+ return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom);
+}
+
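+// If the combined scales are out of range, whichever of the two is in RANDOM
+// mode is relaxed (recomputing the resize dimensions where needed) until the
+// result passes dimensions_are_ok(); when neither scale may be altered, the
+// function reports failure and the caller asserts.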
+static int validate_size_scales(RESIZE_MODE resize_mode,
+ SUPERRES_MODE superres_mode, int owidth,
+ int oheight, size_params_type *rsz) {
+ if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do.
+ return 1;
+ }
+
+ // Calculate current resize scale.
+ int resize_denom =
+ AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
+ DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
+
+ if (resize_mode != RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
+ // Alter superres scale as needed to enforce conformity.
+ rsz->superres_denom =
+ (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
+ if (!dimensions_are_ok(owidth, oheight, rsz)) {
+ if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
+ }
+ } else if (resize_mode == RESIZE_RANDOM && superres_mode != SUPERRES_RANDOM) {
+ // Alter resize scale as needed to enforce conformity.
+ resize_denom =
+ (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ if (!dimensions_are_ok(owidth, oheight, rsz)) {
+ if (resize_denom > SCALE_NUMERATOR) {
+ --resize_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ }
+ }
+ } else if (resize_mode == RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
+ // Alter both resize and superres scales as needed to enforce conformity.
+ do {
+ if (resize_denom > rsz->superres_denom)
+ --resize_denom;
+ else
+ --rsz->superres_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ } while (!dimensions_are_ok(owidth, oheight, rsz) &&
+ (resize_denom > SCALE_NUMERATOR ||
+ rsz->superres_denom > SCALE_NUMERATOR));
+ } else { // We are allowed to alter neither resize scale nor superres
+ // scale.
+ return 0;
+ }
+ return dimensions_are_ok(owidth, oheight, rsz);
+}
+
+// Calculates resize and superres params for next frame
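+// The resize scale is chosen first, then the superres scale; the combination
+// is then validated (and adjusted where the modes allow) by
+// validate_size_scales().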
+size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR };
+ int resize_denom;
+ if (oxcf->pass == 1) return rsz;
+ if (cpi->resize_pending_width && cpi->resize_pending_height) {
+ rsz.resize_width = cpi->resize_pending_width;
+ rsz.resize_height = cpi->resize_pending_height;
+ cpi->resize_pending_width = cpi->resize_pending_height = 0;
+ } else {
+ resize_denom = calculate_next_resize_scale(cpi);
+ rsz.resize_width = cpi->oxcf.width;
+ rsz.resize_height = cpi->oxcf.height;
+ av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
+ resize_denom);
+ }
+ rsz.superres_denom = calculate_next_superres_scale(cpi);
+ if (!validate_size_scales(oxcf->resize_mode, oxcf->superres_mode, oxcf->width,
+ oxcf->height, &rsz))
+ assert(0 && "Invalid scale parameters");
+ return rsz;
+}
+
+static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) {
+ int encode_width = rsz->resize_width;
+ int encode_height = rsz->resize_height;
+
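+  // rsz->resize_* is the superres "upscaled" output size; the actual encode
+  // size is obtained by applying the superres denominator on top of it.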
+ AV1_COMMON *cm = &cpi->common;
+ cm->superres_upscaled_width = encode_width;
+ cm->superres_upscaled_height = encode_height;
+ cm->superres_scale_denominator = rsz->superres_denom;
+ av1_calculate_scaled_superres_size(&encode_width, &encode_height,
+ rsz->superres_denom);
+ set_frame_size(cpi, encode_width, encode_height);
+}
+
+static void setup_frame_size(AV1_COMP *cpi) {
+ size_params_type rsz = av1_calculate_next_size_params(cpi);
+ setup_frame_size_from_params(cpi, &rsz);
+}
+
+static void superres_post_encode(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ if (!av1_superres_scaled(cm)) return;
+
+ assert(cpi->oxcf.enable_superres);
+ assert(!is_lossless_requested(&cpi->oxcf));
+ assert(!cm->all_lossless);
+
+ av1_superres_upscale(cm, NULL);
+
+  // If regular resizing is occurring, the source will need to be downscaled
+  // to match the upscaled superres resolution. Otherwise the original source
+  // is used.
+ if (!av1_resize_scaled(cm)) {
+ cpi->source = cpi->unscaled_source;
+ if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
+ } else {
+ assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
+ assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
+ // Do downscale. cm->(width|height) has been updated by
+ // av1_superres_upscale
+ if (aom_realloc_frame_buffer(
+ &cpi->scaled_source, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(
+ &cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate scaled source buffer for superres");
+ assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width);
+ assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height);
+ av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
+ (int)cm->seq_params.bit_depth, num_planes);
+ cpi->source = &cpi->scaled_source;
+ }
+}
+
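+// Applies the in-loop filters in order: deblocking filter, CDEF, superres
+// upscaling, then loop restoration.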
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+ assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
+ cm->coded_lossless && cm->all_lossless));
+
+ const int no_loopfilter = cm->coded_lossless || cm->large_scale_tile;
+ const int no_cdef =
+ !cm->seq_params.enable_cdef || cm->coded_lossless || cm->large_scale_tile;
+ const int no_restoration = !cm->seq_params.enable_restoration ||
+ cm->all_lossless || cm->large_scale_tile;
+
+ struct loopfilter *lf = &cm->lf;
+
+ if (no_loopfilter) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ } else {
+ struct aom_usec_timer timer;
+
+ aom_clear_system_state();
+
+ aom_usec_timer_start(&timer);
+
+ av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick);
+
+ aom_usec_timer_mark(&timer);
+ cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
+ }
+
+ if (lf->filter_level[0] || lf->filter_level[1]) {
+#if LOOP_FILTER_BITMASK
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, 0, num_planes, 0);
+#else
+ if (cpi->num_workers > 1)
+ av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
+ cpi->workers, cpi->num_workers,
+ &cpi->lf_row_sync);
+ else
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0);
+#endif
+ }
+
+ if (!no_restoration)
+ av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 0);
+
+ if (no_cdef) {
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->nb_cdef_strengths = 1;
+ cm->cdef_uv_strengths[0] = 0;
+ } else {
+ // Find CDEF parameters
+ av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd,
+ cpi->sf.fast_cdef_search);
+
+ // Apply the filter
+ av1_cdef_frame(cm->frame_to_show, cm, xd);
+ }
+
+ superres_post_encode(cpi);
+
+ if (no_restoration) {
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ } else {
+ av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 1);
+ av1_pick_filter_restoration(cpi->source, cpi);
+ if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+ if (cpi->num_workers > 1)
+ av1_loop_restoration_filter_frame_mt(cm->frame_to_show, cm, 0,
+ cpi->workers, cpi->num_workers,
+ &cpi->lr_row_sync, &cpi->lr_ctxt);
+ else
+ av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0,
+ &cpi->lr_ctxt);
+ }
+ }
+}
+
+static int encode_without_recode_loop(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int q = 0, bottom_index = 0, top_index = 0; // Dummy variables.
+
+ aom_clear_system_state();
+
+ set_size_independent_vars(cpi);
+
+ setup_frame_size(cpi);
+
+ assert(cm->width == cpi->scaled_source.y_crop_width);
+ assert(cm->height == cpi->scaled_source.y_crop_height);
+
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ cpi->source =
+ av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+ cpi->source->buf_8bit_valid = 0;
+ if (frame_is_intra_only(cm) == 0) {
+ scale_references(cpi);
+ }
+
+ av1_set_quantizer(cm, q);
+ setup_frame(cpi);
+ suppress_active_map(cpi);
+
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_setup(cpi);
+ }
+ apply_active_map(cpi);
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ } else {
+ calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+
+ // transform / motion compensation build reconstruction frame
+ av1_encode_frame(cpi);
+
+  // Update some stats from cyclic refresh, and check whether we should skip
+  // updating the golden reference, for 1-pass CBR.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME &&
+ (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR))
+ av1_cyclic_refresh_check_golden_update(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+ aom_clear_system_state();
+ return AOM_CODEC_OK;
+}
+
+static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int bottom_index, top_index;
+ int loop_count = 0;
+ int loop_at_this_size = 0;
+ int loop = 0;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+ int q = 0, q_low = 0, q_high = 0;
+
+ set_size_independent_vars(cpi);
+
+ cpi->source->buf_8bit_valid = 0;
+
+ aom_clear_system_state();
+ setup_frame_size(cpi);
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
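+  // Main recode loop: encode the frame, estimate its size via a dummy
+  // bitstream pack (when recoding is allowed), and adjust Q within
+  // [q_low, q_high] until the projected size falls inside the frame size
+  // bounds or no further recode is needed.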
+ do {
+ aom_clear_system_state();
+
+ if (loop_count == 0) {
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+
+ // Reset the loop state for new frame size.
+ overshoot_seen = 0;
+ undershoot_seen = 0;
+
+ q_low = bottom_index;
+ q_high = top_index;
+
+ loop_at_this_size = 0;
+
+ // Decide frame size bounds first time through.
+ av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ }
+
+    // If the frame was scaled, recompute global motion search if it has
+    // already been done for this frame.
+ if (loop_count > 0 && cpi->source && cpi->global_motion_search_done)
+ if (cpi->source->y_crop_width != cm->width ||
+ cpi->source->y_crop_height != cm->height)
+ cpi->global_motion_search_done = 0;
+ cpi->source =
+ av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+
+ if (frame_is_intra_only(cm) == 0) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ scale_references(cpi);
+ }
+ av1_set_quantizer(cm, q);
+ // printf("Frame %d/%d: q = %d, frame_type = %d\n", cm->current_video_frame,
+ // cm->show_frame, q, cm->frame_type);
+
+ if (loop_count == 0) setup_frame(cpi);
+
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
+ cm->frame_refs[cm->primary_ref_frame].idx < 0) {
+ av1_default_coef_probs(cm);
+ av1_setup_frame_contexts(cm);
+ }
+
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ }
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ } else {
+ calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+
+ // transform / motion compensation build reconstruction frame
+ save_coding_context(cpi);
+ av1_encode_frame(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+
+ aom_clear_system_state();
+
+ // Dummy pack of the bitstream using up to date stats to get an
+ // accurate estimate of output frame size to determine if we need
+ // to recode.
+ if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+ restore_coding_context(cpi);
+
+ if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ rc->projected_frame_size = (int)(*size) << 3;
+ restore_coding_context(cpi);
+
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+ }
+
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ loop = 0;
+ } else {
+ if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced &&
+ (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+ int last_q = q;
+ int64_t kf_err;
+
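+        // cpi->ambient_err holds the reconstruction error of the frame just
+        // before this forced key frame; the targets below aim to keep the key
+        // frame's error between half and the full ambient error.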
+ int64_t high_err_target = cpi->ambient_err;
+ int64_t low_err_target = cpi->ambient_err >> 1;
+
+ if (cm->seq_params.use_highbitdepth) {
+ kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ } else {
+ kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ }
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ q_high = q > q_low ? q - 1 : q_low;
+
+ // Adjust Q
+ q = (int)((q * high_err_target) / kf_err);
+ q = AOMMIN(q, (q_high + q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ q_low = q < q_high ? q + 1 : q_high;
+
+ // Adjust Q
+ q = (int)((q * low_err_target) / kf_err);
+ q = AOMMIN(q, (q_high + q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = q != last_q;
+ } else if (recode_loop_test(cpi, frame_over_shoot_limit,
+ frame_under_shoot_limit, q,
+ AOMMAX(q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+ int last_q = q;
+ int retries = 0;
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+ q_high = rc->worst_quality;
+
+          // Raise q_low to at least the current value.
+ q_low = q < q_high ? q + 1 : q_high;
+
+ if (undershoot_seen || loop_at_this_size > 1) {
+            // Update the rate correction factor and move Q to the midpoint
+            // of the current range.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ q = (q_high + q_low + 1) / 2;
+ } else {
+            // Update the rate correction factor and re-estimate Q from the
+            // rate control model.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width,
+ cm->height);
+
+ while (q < q_low && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width,
+ cm->height);
+ retries++;
+ }
+ }
+
+ overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ q_high = q > q_low ? q - 1 : q_low;
+
+ if (overshoot_seen || loop_at_this_size > 1) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q = (q_high + q_low) / 2;
+ } else {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+            // Special case reset for q_low for constrained quality.
+            // This should only trigger where there is very substantial
+            // undershoot on a frame and the auto cq level is above
+            // the user passed in value.
+ if (cpi->oxcf.rc_mode == AOM_CQ && q < q_low) {
+ q_low = q;
+ }
+
+ while (q > q_high && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+ retries++;
+ }
+ }
+
+ undershoot_seen = 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = (q != last_q);
+ } else {
+ loop = 0;
+ }
+ }
+
+ // Special case for overlay frame.
+ if (rc->is_src_frame_alt_ref &&
+ rc->projected_frame_size < rc->max_frame_bandwidth)
+ loop = 0;
+
+ if (!cpi->sf.gm_disable_recode) {
+ if (recode_loop_test_global_motion(cpi)) loop = 1;
+ }
+
+ if (loop) {
+ ++loop_count;
+ ++loop_at_this_size;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ }
+ } while (loop);
+
+ return AOM_CODEC_OK;
+}
+
+static int get_ref_frame_flags(const AV1_COMP *cpi) {
+ const int *const map = cpi->common.ref_frame_map;
+
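+  // References whose buffer duplicates a higher-priority reference are masked
+  // out of the flags below, so the same frame is not searched twice.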
+ // No.1 Priority: LAST_FRAME
+ const int last2_is_last = map[cpi->ref_fb_idx[1]] == map[cpi->ref_fb_idx[0]];
+ const int last3_is_last = map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[0]];
+ const int gld_is_last =
+ map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[0]];
+ const int bwd_is_last =
+ map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[0]];
+ const int alt2_is_last =
+ map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[0]];
+ const int alt_is_last =
+ map[cpi->ref_fb_idx[ALTREF_FRAME - 1]] == map[cpi->ref_fb_idx[0]];
+
+ // No.2 Priority: ALTREF_FRAME
+ const int last2_is_alt =
+ map[cpi->ref_fb_idx[1]] == map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+ const int last3_is_alt =
+ map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+ const int gld_is_alt = map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] ==
+ map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+ const int bwd_is_alt = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] ==
+ map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+ const int alt2_is_alt = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] ==
+ map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+
+ // No.3 Priority: LAST2_FRAME
+ const int last3_is_last2 = map[cpi->ref_fb_idx[2]] == map[cpi->ref_fb_idx[1]];
+ const int gld_is_last2 =
+ map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[1]];
+ const int bwd_is_last2 =
+ map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[1]];
+ const int alt2_is_last2 =
+ map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[1]];
+
+ // No.4 Priority: LAST3_FRAME
+ const int gld_is_last3 =
+ map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]] == map[cpi->ref_fb_idx[2]];
+ const int bwd_is_last3 =
+ map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] == map[cpi->ref_fb_idx[2]];
+ const int alt2_is_last3 =
+ map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] == map[cpi->ref_fb_idx[2]];
+
+ // No.5 Priority: GOLDEN_FRAME
+ const int bwd_is_gld = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]] ==
+ map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]];
+ const int alt2_is_gld = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] ==
+ map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]];
+
+ // No.6 Priority: BWDREF_FRAME
+ const int alt2_is_bwd = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]] ==
+ map[cpi->ref_fb_idx[BWDREF_FRAME - 1]];
+
+ // No.7 Priority: ALTREF2_FRAME
+
+ // After av1_apply_encoding_flags() is called, cpi->ref_frame_flags might be
+ // adjusted according to external encoder flags.
+ int flags = cpi->ext_ref_frame_flags;
+
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
+
+ if (alt_is_last) flags &= ~AOM_ALT_FLAG;
+
+ if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
+
+ if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG;
+
+ if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3)
+ flags &= ~AOM_GOLD_FLAG;
+
+ if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 ||
+ bwd_is_gld) &&
+ (flags & AOM_BWD_FLAG))
+ flags &= ~AOM_BWD_FLAG;
+
+ if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 ||
+ alt2_is_gld || alt2_is_bwd) &&
+ (flags & AOM_ALT2_FLAG))
+ flags &= ~AOM_ALT2_FLAG;
+
+ return flags;
+}
+
+static void set_ext_overrides(AV1_COMP *cpi) {
+  // Overrides the defaults with the values supplied externally via the
+  // av1_update_reference() and av1_update_entropy() calls.
+  // Note: The overrides are valid only for the next frame passed to the
+  // encode_frame_to_data_rate() function.
+ if (cpi->ext_use_s_frame) cpi->common.frame_type = S_FRAME;
+ cpi->common.force_primary_ref_none = cpi->ext_use_primary_ref_none;
+
+ if (cpi->ext_refresh_frame_context_pending) {
+ cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
+ cpi->ext_refresh_frame_context_pending = 0;
+ }
+ if (cpi->ext_refresh_frame_flags_pending) {
+ cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+ cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+ cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+ cpi->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame;
+ cpi->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame;
+ cpi->ext_refresh_frame_flags_pending = 0;
+ }
+ cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
+ // A keyframe is already error resilient and keyframes with
+  // error_resilient_mode interfere with the use of show_existing_frame
+ // when forward reference keyframes are enabled.
+ cpi->common.error_resilient_mode =
+ cpi->ext_use_error_resilient && cpi->common.frame_type != KEY_FRAME;
+}
+
+#define DUMP_RECON_FRAMES 0
+
+#if DUMP_RECON_FRAMES == 1
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+static void dump_filtered_recon_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+
+ if (recon_buf == NULL) {
+ printf("Frame %d is not ready.\n", cm->current_video_frame);
+ return;
+ }
+
+ static const int flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+ printf(
+ "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
+ "show_existing_frame=%d) "
+ "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
+ cm->current_video_frame, cm->frame_offset, cm->show_frame,
+ cm->show_existing_frame);
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+ const int ref_offset =
+ (buf_idx >= 0)
+ ? (int)cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset
+ : -1;
+ printf(
+ " %d(%c-%d-%4.2f)", ref_offset,
+ (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N',
+ (buf_idx >= 0) ? (int)cpi->frame_rf_level[buf_idx] : -1,
+ (buf_idx >= 0) ? rate_factor_deltas[cpi->frame_rf_level[buf_idx]] : -1);
+ }
+ printf(" ]\n");
+
+ if (!cm->show_frame) {
+ printf("Frame %d is a no show frame, so no image dump.\n",
+ cm->current_video_frame);
+ return;
+ }
+
+ int h;
+ char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+ FILE *f_recon = NULL;
+
+ if (cm->current_video_frame == 0) {
+ if ((f_recon = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return;
+ }
+ } else {
+ if ((f_recon = fopen(file_name, "ab")) == NULL) {
+ printf("Unable to open file %s to append.\n", file_name);
+ return;
+ }
+ }
+ printf(
+ "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
+ "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
+ "refresh_alt_ref_frame=%d, rf_level=%d, "
+ "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
+ cm->current_video_frame, cpi->twopass.gf_group.index,
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+ cm->frame_offset, cm->show_frame, cm->show_existing_frame,
+ cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame,
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index],
+ recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+#if 0
+ int ref_frame;
+ printf("get_ref_frame_map_idx: [");
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ printf(" %d", get_ref_frame_map_idx(cpi, ref_frame));
+ printf(" ]\n");
+ printf("cm->new_fb_idx = %d\n", cm->new_fb_idx);
+ printf("cm->ref_frame_map = [");
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ printf(" %d", cm->ref_frame_map[ref_frame - LAST_FRAME]);
+ }
+ printf(" ]\n");
+#endif // 0
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+ f_recon);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+
+ fclose(f_recon);
+}
+#endif // DUMP_RECON_FRAMES
+
+static INLINE int is_frame_droppable(AV1_COMP *cpi) {
+ return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
+ cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame ||
+ cpi->refresh_last_frame);
+}
+
+static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+ int skip_adapt,
+ unsigned int *frame_flags) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
+
+ set_ext_overrides(cpi);
+ aom_clear_system_state();
+
+ // frame type has been decided outside of this function call
+ cm->cur_frame->intra_only = frame_is_intra_only(cm);
+ cm->cur_frame->frame_type = cm->frame_type;
+
+ // S_FRAMEs are always error resilient
+ cm->error_resilient_mode |= frame_is_sframe(cm);
+
+ cm->large_scale_tile = cpi->oxcf.large_scale_tile;
+ cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
+ if (cm->large_scale_tile) seq_params->frame_id_numbers_present_flag = 0;
+
+ cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
+  // cm->allow_ref_frame_mvs needs to be written into the frame header while
+  // cm->large_scale_tile is 1; therefore, the cm->large_scale_tile == 1 case
+  // is handled separately from frame_might_allow_ref_frame_mvs().
+ cm->allow_ref_frame_mvs &= !cm->large_scale_tile;
+
+ cm->allow_warped_motion =
+ cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
+
+ // Reset the frame packet stamp index.
+ if (cm->frame_type == KEY_FRAME && cm->show_frame)
+ cm->current_video_frame = 0;
+
+ // NOTE:
+  // (1) The setup of the ref_frame_flags is moved upfront as it is
+  //     determined by the current frame properties;
+  // (2) The setup of the ref_frame_flags applies both to the
+  //     show_existing_frame cases and to all other cases.
+ if (cm->current_video_frame > 0)
+ cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+
+ if (encode_show_existing_frame(cm)) {
+ // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
+ // BWDREF_FRAME in the reference frame buffer.
+ if (cm->frame_type == KEY_FRAME) {
+ cm->reset_decoder_state = 1;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ cm->show_frame = 1;
+ cpi->frame_flags = *frame_flags;
+
+ restore_coding_context(cpi);
+
+ // Build the bitstream
+ if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ cpi->seq_params_locked = 1;
+
+ // Set up frame to show to get ready for stats collection.
+ cm->frame_to_show = get_frame_new_buffer(cm);
+
+ // Update current frame offset.
+ cm->frame_offset =
+ cm->buffer_pool->frame_bufs[cm->new_fb_idx].cur_frame_offset;
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ // Update the LAST_FRAME in the reference frame buffer.
+ // NOTE:
+ // (1) For BWDREF_FRAME as the show_existing_frame, the reference frame
+ // update has been done previously when handling the LAST_BIPRED_FRAME
+ // right before BWDREF_FRAME (in the display order);
+    // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame
+    //     update will be done when the following is called, which will
+    //     exchange the virtual indexes between LAST_FRAME and ALTREF2_FRAME,
+    //     so that LAST3 will get retired, LAST2 becomes LAST3, LAST becomes
+    //     LAST2, and ALTREF2_FRAME will serve as the new LAST_FRAME.
+ update_reference_frames(cpi);
+
+ // Update frame flags
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+ cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+
+ // Update the frame type
+ cm->last_frame_type = cm->frame_type;
+
+ // Since we allocate a spot for the OVERLAY frame in the gf group, we need
+ // to do post-encoding update accordingly.
+ if (cpi->rc.is_src_frame_alt_ref) {
+ av1_set_target_rate(cpi, cm->width, cm->height);
+ av1_rc_postencode_update(cpi, *size);
+ }
+
+ ++cm->current_video_frame;
+
+ return AOM_CODEC_OK;
+ }
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+  // Set various flags etc. to their special state if this is an intra-only
+  // frame or an S-frame.
+ if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
+ // Reset the loop filter deltas and segmentation map.
+ av1_reset_segment_features(cm);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame.
+ cpi->rc.source_alt_ref_active = 0;
+ }
+ if (cpi->oxcf.mtu == 0) {
+ cm->num_tg = cpi->oxcf.num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cm->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+
+ // For 1 pass CBR, check if we are dropping this frame.
+ // Never drop on key frame.
+ if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR &&
+ cm->frame_type != KEY_FRAME) {
+ if (av1_rc_drop_frame(cpi)) {
+ av1_rc_postencode_update_drop_frame(cpi);
+ return AOM_CODEC_OK;
+ }
+ }
+
+ aom_clear_system_state();
+
+#if CONFIG_INTERNAL_STATS
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ /* Non-normative definition of current_frame_id ("frame counter" with
+ * wraparound) */
+ const int frame_id_length = FRAME_ID_LENGTH;
+ if (cm->current_frame_id == -1) {
+ int lsb, msb;
+ /* quasi-random initialization of current_frame_id for a key frame */
+ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+ msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
+ } else {
+ lsb = cpi->source->y_buffer[0] & 0xff;
+ msb = cpi->source->y_buffer[1] & 0xff;
+ }
+ cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length);
+
+      // S-frames are meant for stitching together different streams of
+      // different resolutions of the same content, so current_frame_id must
+      // be the same across those streams rather than random. 0x37 is an
+      // arbitrarily chosen starting point.
+ if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37;
+ } else {
+ cm->current_frame_id =
+ (cm->current_frame_id + 1 + (1 << frame_id_length)) %
+ (1 << frame_id_length);
+ }
+ }
+
+ switch (cpi->oxcf.cdf_update_mode) {
+    case 0:  // No CDF update for any frames (4~6% compression loss).
+ cm->disable_cdf_update = 1;
+ break;
+ case 1: // Enable CDF update for all frames.
+ cm->disable_cdf_update = 0;
+ break;
+ case 2:
+ // Strategically determine at which frames to do CDF update.
+      // Currently only enable CDF update for all-intra and no-show frames
+      // (1.5% compression loss).
+ // TODO(huisu@google.com): design schemes for various trade-offs between
+ // compression quality and decoding speed.
+ cm->disable_cdf_update =
+ (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+ break;
+ }
+ cm->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+ if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+ if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR;
+ } else {
+ if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+ }
+
+ cm->last_tile_cols = cm->tile_cols;
+ cm->last_tile_rows = cm->tile_rows;
+
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ av1_compute_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif // OUTPUT_YUV_SKINMAP
+
+ // Special case code to reduce pulsing when key frames are forced at a
+ // fixed interval. Note the reconstruction error if it is the frame before
+  // the forced key frame.
+ if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+ if (seq_params->use_highbitdepth) {
+ cpi->ambient_err =
+ aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ } else {
+ cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ }
+ }
+
+ // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME
+ if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) {
+ cpi->refresh_last_frame = 1;
+ }
+
+ cm->frame_to_show = get_frame_new_buffer(cm);
+ cm->frame_to_show->color_primaries = seq_params->color_primaries;
+ cm->frame_to_show->transfer_characteristics =
+ seq_params->transfer_characteristics;
+ cm->frame_to_show->matrix_coefficients = seq_params->matrix_coefficients;
+ cm->frame_to_show->monochrome = seq_params->monochrome;
+ cm->frame_to_show->chroma_sample_position =
+ seq_params->chroma_sample_position;
+ cm->frame_to_show->color_range = seq_params->color_range;
+ cm->frame_to_show->render_width = cm->render_width;
+ cm->frame_to_show->render_height = cm->render_height;
+
+ // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
+ // off.
+
+ // Pick the loop filter level for the frame.
+ if (!cm->allow_intrabc) {
+ loopfilter_frame(cpi, cm);
+ } else {
+ cm->lf.filter_level[0] = 0;
+ cm->lf.filter_level[1] = 0;
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->nb_cdef_strengths = 1;
+ cm->cdef_uv_strengths[0] = 0;
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ }
+
+ // TODO(debargha): Fix mv search range on encoder side
+ // aom_extend_frame_inner_borders(cm->frame_to_show, av1_num_planes(cm));
+ aom_extend_frame_borders(cm->frame_to_show, av1_num_planes(cm));
+
+#ifdef OUTPUT_YUV_REC
+ aom_write_one_yuv_frame(cm, cm->frame_to_show);
+#endif
+
+ // Build the bitstream
+ if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ cpi->seq_params_locked = 1;
+
+ if (skip_adapt) return AOM_CODEC_OK;
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ int i;
+ // Update reference frame id values based on the value of refresh_frame_mask
+ for (i = 0; i < REF_FRAMES; i++) {
+ if ((cpi->refresh_frame_mask >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ }
+ }
+ }
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ if (cm->seg.enabled) {
+ if (cm->seg.update_map) {
+ update_reference_segmentation_map(cpi);
+ } else if (cm->last_frame_seg_map) {
+ memcpy(cm->current_frame_seg_map, cm->last_frame_seg_map,
+ cm->mi_cols * cm->mi_rows * sizeof(uint8_t));
+ }
+ }
+
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
+
+ update_reference_frames(cpi);
+
+#if CONFIG_ENTROPY_STATS
+ av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts);
+#endif // CONFIG_ENTROPY_STATS
+
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ *cm->fc = cpi->tile_data[cm->largest_tile_id].tctx;
+ av1_reset_cdf_symbol_counters(cm->fc);
+ }
+
+ if (cpi->refresh_golden_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+
+ if (cpi->refresh_alt_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_ALTREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+ if (cpi->refresh_bwd_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_BWDREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+
+ cm->last_frame_type = cm->frame_type;
+
+ av1_rc_postencode_update(cpi, *size);
+
+ if (cm->frame_type == KEY_FRAME) {
+ // Tell the caller that the frame was coded as a key frame
+ *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
+ } else {
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+ }
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
+
+ // A droppable frame might not be shown but it always
+ // takes a space in the gf group. Therefore, even when
+  // it is not shown, we still need to update the count down.
+
+ if (cm->show_frame) {
+    // TODO(zoeliu): We may only swap mi and prev_mi for those frames that
+    // are being used as reference.
+ swap_mi_and_prev_mi(cm);
+ // Don't increment frame counters if this was an altref buffer
+ // update not a real frame
+
+ ++cm->current_video_frame;
+ }
+
+ // NOTE: Shall not refer to any frame not used as reference.
+ if (cm->is_reference_frame) {
+ // keep track of the last coded dimensions
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+
+ // reset to normal state now that we are done.
+ cm->last_show_frame = cm->show_frame;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+ // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+ // differently here for rc->avg_frame_bandwidth.
+ if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) {
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+ cpi->common.frame_type == KEY_FRAME) {
+      // For a show_existing_frame that is neither an altref overlay nor a
+      // keyframe, these counters were already incremented when the frame was
+      // originally encoded, so such frames are excluded by the condition
+      // above.
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ }
+ }
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+ // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is dropped.
+  // We should fix the cpi->common.show_frame flag instead of checking this
+  // other condition to update the counter properly.
+ if (cpi->common.show_frame || is_frame_droppable(cpi)) {
+ // Decrement count down till next gf
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
+ }
+}
+
+static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame. For a
+  // show_existing_frame that is neither an altref overlay nor a keyframe,
+  // the index was already incremented when the frame was originally encoded,
+  // so it is not incremented again here.
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+ cpi->common.frame_type == KEY_FRAME) {
+ ++cpi->twopass.gf_group.index;
+ }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+ update_keyframe_counters(cpi);
+ update_frames_till_gf_update(cpi);
+ if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
+}
+
+static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+ int skip_adapt, unsigned int *frame_flags) {
+ if (cpi->oxcf.rc_mode == AOM_CBR) {
+ av1_rc_get_one_pass_cbr_params(cpi);
+ } else {
+ av1_rc_get_one_pass_vbr_params(cpi);
+ }
+ if (encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ update_rc_counts(cpi);
+ check_show_existing_frame(cpi);
+ return AOM_CODEC_OK;
+}
+
+static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+ unsigned int *frame_flags) {
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+ AV1_COMMON *cm = &cpi->common;
+ cm->txcoeff_cost_timer = 0;
+ cm->txcoeff_cost_count = 0;
+#endif
+
+ if (encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+#if TXCOEFF_COST_TIMER
+ cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+ fprintf(stderr,
+ "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+ "in us\n",
+ cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+ cm->cum_txcoeff_cost_timer);
+#endif
+
+ av1_twopass_postencode_update(cpi);
+ update_rc_counts(cpi);
+ check_show_existing_frame(cpi);
+ return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
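+// Estimates and removes film grain noise from the raw source frame and
+// appends the modelled grain parameters to the encoder's film grain table.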
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+ int block_size, float noise_level,
+ int64_t time_stamp, int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!cpi->denoise_and_model) {
+ cpi->denoise_and_model = aom_denoise_and_model_alloc(
+ cm->seq_params.bit_depth, block_size, noise_level);
+ if (!cpi->denoise_and_model) {
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating denoise and model");
+ return -1;
+ }
+ }
+ if (!cpi->film_grain_table) {
+ cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+ if (!cpi->film_grain_table) {
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Error allocating grain table");
+ return -1;
+ }
+ memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+ }
+ if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+ &cm->film_grain_params)) {
+ if (cm->film_grain_params.apply_grain) {
+ aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+ &cm->film_grain_params);
+ }
+ }
+ return 0;
+}
+#endif
+
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ struct aom_usec_timer timer;
+ int res = 0;
+ const int subsampling_x = sd->subsampling_x;
+ const int subsampling_y = sd->subsampling_y;
+ const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+ check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+
+ aom_usec_timer_start(&timer);
+
+#if CONFIG_DENOISE
+ if (cpi->oxcf.noise_level > 0)
+ if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
+ cpi->oxcf.noise_level, time_stamp, end_time) < 0)
+ res = -1;
+#endif // CONFIG_DENOISE
+
+ if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+ use_highbitdepth, frame_flags))
+ res = -1;
+ aom_usec_timer_mark(&timer);
+ cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
+
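+  // Validate the chroma subsampling against the signalled profile: profile 0
+  // requires 4:2:0 (or monochrome), profile 1 requires 4:4:4, and profile 2
+  // at 10 bits or below requires 4:2:2.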
+ if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
+ (subsampling_x != 1 || subsampling_y != 1)) {
+ aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+ "Non-4:2:0 color format requires profile 1 or 2");
+ res = -1;
+ }
+ if ((seq_params->profile == PROFILE_1) &&
+ !(subsampling_x == 0 && subsampling_y == 0)) {
+ aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+ "Profile 1 requires 4:4:4 color format");
+ res = -1;
+ }
+ if ((seq_params->profile == PROFILE_2) &&
+ (seq_params->bit_depth <= AOM_BITS_10) &&
+ !(subsampling_x == 1 && subsampling_y == 0)) {
+ aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+ "Profile 2 bit-depth < 10 requires 4:2:2 color format");
+ res = -1;
+ }
+
+ return res;
+}
+
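+// A frame counts as a reference if it refreshes any reference buffer or
+// updates state (frame context, loop filter deltas, segmentation) that later
+// frames may depend on.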
+static int frame_is_reference(const AV1_COMP *cpi) {
+ const AV1_COMMON *cm = &cpi->common;
+
+ return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
+ cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame ||
+ cpi->refresh_alt2_ref_frame || cpi->refresh_alt_ref_frame ||
+ !cm->error_resilient_mode || cm->lf.mode_ref_delta_update ||
+ cm->seg.update_map || cm->seg.update_data;
+}
+
+static void adjust_frame_rate(AV1_COMP *cpi,
+ const struct lookahead_entry *source) {
+ int64_t this_duration;
+ int step = 0;
+
+ if (source->ts_start == cpi->first_time_stamp_ever) {
+ this_duration = source->ts_end - source->ts_start;
+ step = 1;
+ } else {
+ int64_t last_duration =
+ cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+ this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+ // do a step update if the duration changes by 10%
+ if (last_duration)
+ step = (int)((this_duration - last_duration) * 10 / last_duration);
+ }
+
+ if (this_duration) {
+ if (step) {
+ av1_new_framerate(cpi, 10000000.0 / this_duration);
+ } else {
+ // Average this frame's rate into the last second's average
+ // frame rate. If we haven't seen 1 second yet, then average
+ // over the whole interval seen.
+ const double interval = AOMMIN(
+ (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+ double avg_duration = 10000000.0 / cpi->framerate;
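+      // Replace one avg_duration's worth of the averaging interval with this
+      // frame's duration, giving a running average over roughly the last
+      // second of input.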
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
+
+ av1_new_framerate(cpi, 10000000.0 / avg_duration);
+ }
+ }
+ cpi->last_time_stamp_seen = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// Returns 0 if this is not an alt-ref frame; otherwise returns the offset of
+// the source frame used as the ARF midpoint.
+static int get_arf_src_index(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int arf_src_index = 0;
+ if (is_altref_enabled(cpi)) {
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ arf_src_index = gf_group->arf_src_offset[gf_group->index];
+ }
+ } else if (rc->source_alt_ref_pending) {
+ arf_src_index = rc->frames_till_gf_update_due;
+ }
+ }
+ return arf_src_index;
+}
+
+static int get_brf_src_index(AV1_COMP *cpi) {
+ int brf_src_index = 0;
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+ // TODO(zoeliu): We need to add the check on the -bwd_ref command line setup
+ // flag.
+ if (gf_group->bidir_pred_enabled[gf_group->index]) {
+ if (cpi->oxcf.pass == 2) {
+ if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
+ brf_src_index = gf_group->brf_src_offset[gf_group->index];
+ } else {
+ // TODO(zoeliu): To re-visit the setup for this scenario
+ brf_src_index = cpi->rc.bipred_group_interval - 1;
+ }
+ }
+
+ return brf_src_index;
+}
+
+// Returns 0 if this is not an alt-ref frame; otherwise returns the offset of
+// the source frame used as the ARF midpoint.
+static int get_arf2_src_index(AV1_COMP *cpi) {
+ int arf2_src_index = 0;
+ if (is_altref_enabled(cpi) && cpi->num_extra_arfs) {
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+ arf2_src_index = gf_group->arf_src_offset[gf_group->index];
+ }
+ }
+ }
+ return arf2_src_index;
+}
+
+static void check_src_altref(AV1_COMP *cpi,
+ const struct lookahead_entry *source) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // If pass == 2, the parameters set here will be reset in
+ // av1_rc_get_second_pass_params()
+
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ rc->is_src_frame_alt_ref =
+ (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
+ (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
+ rc->is_src_frame_ext_arf =
+ gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
+ } else {
+ rc->is_src_frame_alt_ref =
+ cpi->alt_ref_source && (source == cpi->alt_ref_source);
+ }
+
+ if (rc->is_src_frame_alt_ref) {
+ // Current frame is an ARF overlay frame.
+ cpi->alt_ref_source = NULL;
+
+ if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) {
+      // For INTNL_OVERLAY, when show_existing_frame == 0, we do need to
+ // refresh the LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3,
+ // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST.
+ cpi->refresh_last_frame = 1;
+ } else {
+ // Don't refresh the last buffer for an ARF overlay frame. It will
+ // become the GF so preserve last as an alternative prediction option.
+ cpi->refresh_last_frame = 0;
+ }
+ }
+}
+
+#if CONFIG_INTERNAL_STATS
+extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+
+static void adjust_image_stat(double y, double u, double v, double all,
+ ImageStat *s) {
+ s->stat[STAT_Y] += y;
+ s->stat[STAT_U] += u;
+ s->stat[STAT_V] += v;
+ s->stat[STAT_ALL] += all;
+ s->worst = AOMMIN(s->worst, all);
+}
+
+static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
+ AV1_COMMON *const cm = &cpi->common;
+ double samples = 0.0;
+ uint32_t in_bit_depth = 8;
+ uint32_t bit_depth = 8;
+
+#if CONFIG_INTER_STATS_ONLY
+ if (cm->frame_type == KEY_FRAME) return; // skip key frame
+#endif
+ cpi->bytes += frame_bytes;
+
+ if (cm->seq_params.use_highbitdepth) {
+ in_bit_depth = cpi->oxcf.input_bit_depth;
+ bit_depth = cm->seq_params.bit_depth;
+ }
+ if (cm->show_frame) {
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ double y, u, v, frame_all;
+
+ cpi->count++;
+ if (cpi->b_calculate_psnr) {
+ PSNR_STATS psnr;
+ double frame_ssim2 = 0.0, weight = 0.0;
+ aom_clear_system_state();
+ // TODO(yaowu): unify these two versions into one.
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
+ &cpi->psnr);
+ cpi->total_sq_error += psnr.sse[0];
+ cpi->total_samples += psnr.samples[0];
+ samples = psnr.samples[0];
+ // TODO(yaowu): unify these two versions into one.
+ if (cm->seq_params.use_highbitdepth)
+ frame_ssim2 =
+ aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
+ else
+ frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
+
+ cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2);
+ cpi->summed_quality += frame_ssim2 * weight;
+ cpi->summed_weights += weight;
+
+#if 0
+ {
+ FILE *f = fopen("q_used.stt", "a");
+ double y2 = psnr.psnr[1];
+ double u2 = psnr.psnr[2];
+ double v2 = psnr.psnr[3];
+ double frame_psnr2 = psnr.psnr[0];
+ fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+ cm->current_video_frame, y2, u2, v2,
+ frame_psnr2, frame_ssim2);
+ fclose(f);
+ }
+#endif
+ }
+ if (cpi->b_calculate_blockiness) {
+ if (!cm->seq_params.use_highbitdepth) {
+ const double frame_blockiness =
+ av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
+ recon->y_stride, orig->y_width, orig->y_height);
+ cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness);
+ cpi->total_blockiness += frame_blockiness;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ if (!cm->seq_params.use_highbitdepth) {
+ const double this_inconsistency = aom_get_ssim_metrics(
+ orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+ orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
+
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const double consistency =
+ aom_sse_to_psnr(samples, peak, cpi->total_inconsistency);
+ if (consistency > 0.0)
+ cpi->worst_consistency =
+ AOMMIN(cpi->worst_consistency, consistency);
+ cpi->total_inconsistency += this_inconsistency;
+ }
+ }
+ }
+
+ frame_all =
+ aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+ frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ hash_table *last_hash_table) {
+ aom_clear_system_state();
+  // Check whether hash-based ME (integer MV) should be used for this frame.
+ int k;
+ uint32_t hash_value_1;
+ uint32_t hash_value_2;
+
+ const int block_size = 8;
+ const double threshold_current = 0.8;
+ const double threshold_average = 0.95;
+ const int max_history_size = 32;
+  int T = 0;  // total number of blocks
+  int C = 0;  // blocks that match the collocated block
+  int S = 0;  // smooth-region blocks that do not match the collocated block
+  int M = 0;  // blocks that match some other block
+
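+  // Classify every 8x8 block: identical to the collocated block in the last
+  // picture (C), lying in a smooth region (S), or matching some other block
+  // via the hash table (M). Integer-MV coding is chosen when these categories
+  // dominate the frame.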
+ const int pic_width = cur_picture->y_width;
+ const int pic_height = cur_picture->y_height;
+ for (int i = 0; i + block_size <= pic_height; i += block_size) {
+ for (int j = 0; j + block_size <= pic_width; j += block_size) {
+ const int x_pos = j;
+ const int y_pos = i;
+ int match = 1;
+ T++;
+
+      // Check whether the collocated block matches the current block.
+ uint8_t *p_cur = cur_picture->y_buffer;
+ uint8_t *p_ref = last_picture->y_buffer;
+ int stride_cur = cur_picture->y_stride;
+ int stride_ref = last_picture->y_stride;
+ p_cur += (y_pos * stride_cur + x_pos);
+ p_ref += (y_pos * stride_ref + x_pos);
+
+ if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+ uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p16_cur[tmpX] != p16_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p16_cur += stride_cur;
+ p16_ref += stride_ref;
+ }
+ } else {
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p_cur[tmpX] != p_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p_cur += stride_cur;
+ p_ref += stride_ref;
+ }
+ }
+
+ if (match) {
+ C++;
+ continue;
+ }
+
+ if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+ y_pos) ||
+ av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+ S++;
+ continue;
+ }
+
+ av1_get_block_hash_value(
+ cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
+ block_size, &hash_value_1, &hash_value_2,
+ (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
+ // Hashing does not work for highbitdepth currently.
+ // TODO(Roger): Make it work for highbitdepth.
+ if (av1_use_hash_me(&cpi->common)) {
+ if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
+ M++;
+ }
+ }
+ }
+ }
+
+ assert(T > 0);
+ double csm_rate = ((double)(C + S + M)) / ((double)(T));
+ double m_rate = ((double)(M)) / ((double)(T));
+
+ cpi->csm_rate_array[cpi->rate_index] = csm_rate;
+ cpi->m_rate_array[cpi->rate_index] = m_rate;
+
+ cpi->rate_index = (cpi->rate_index + 1) % max_history_size;
+ cpi->rate_size++;
+ cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size);
+
+ if (csm_rate < threshold_current) {
+ return 0;
+ }
+
+ if (C == T) {
+ return 1;
+ }
+
+ double csm_average = 0.0;
+ double m_average = 0.0;
+
+ for (k = 0; k < cpi->rate_size; k++) {
+ csm_average += cpi->csm_rate_array[k];
+ m_average += cpi->m_rate_array[k];
+ }
+ csm_average /= cpi->rate_size;
+ m_average /= cpi->rate_size;
+
+ if (csm_average < threshold_average) {
+ return 0;
+ }
+
+ if (M > (T - C - S) / 3) {
+ return 1;
+ }
+
+ if (csm_rate > 0.99 && m_rate > 0.01) {
+ return 1;
+ }
+
+ if (csm_average + m_average > 1.01) {
+ return 1;
+ }
+
+ return 0;
+}
+
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush,
+ const aom_rational_t *timebase) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ BufferPool *const pool = cm->buffer_pool;
+ RATE_CONTROL *const rc = &cpi->rc;
+ struct aom_usec_timer cmptimer;
+ YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+ struct lookahead_entry *last_source = NULL;
+ struct lookahead_entry *source = NULL;
+ int arf_src_index;
+ int brf_src_index;
+ int i;
+
+#if CONFIG_BITSTREAM_DEBUG
+ assert(cpi->oxcf.max_threads == 0 &&
+ "bitstream debug tool does not support multithreading");
+ bitstream_queue_record_write();
+ bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+
+ cm->showable_frame = 0;
+ aom_usec_timer_start(&cmptimer);
+
+ set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
+
+ // Normal defaults
+ cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->large_scale_tile)
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ // default reference buffers update config
+ av1_configure_buffer_updates_firstpass(cpi, LF_UPDATE);
+
+ // Initialize fields related to forward keyframes
+ cpi->no_show_kf = 0;
+ cm->reset_decoder_state = 0;
+
+ // Don't allow a show_existing_frame to coincide with an error resilient or
+ // S-Frame. An exception can be made in the case of a keyframe, since it
+ // does not depend on any previous frames. We must make this exception here
+ // because of the use of show_existing_frame with forward coded keyframes.
+ struct lookahead_entry *lookahead_src = NULL;
+ if (cm->current_video_frame > 0)
+ lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
+
+ int use_show_existing = 1;
+ if (lookahead_src != NULL) {
+ const int is_error_resilient =
+ cpi->oxcf.error_resilient_mode ||
+ (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+ const int is_s_frame = cpi->oxcf.s_frame_mode ||
+ (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+ const int is_key_frame =
+ (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY);
+ use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame;
+ }
+
+ if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) {
+    // Manage the source buffer and flush out the source frame that has
+    // already been coded; also prepare for PSNR calculation if needed.
+ if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
+ *size = 0;
+ return -1;
+ }
+ av1_apply_encoding_flags(cpi, source->flags);
+ cpi->source = &source->img;
+    // TODO(zoeliu): Track down whether the frame rate needs to be adjusted
+    // here.
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+
+ // We need to adjust frame rate for an overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source);
+
+    // Find a free buffer for the new frame, releasing the reference
+    // previously held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ // Start with a 0 size frame.
+ *size = 0;
+
+ // We need to update the gf_group for show_existing overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi);
+
+ if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ if (cpi->b_calculate_psnr) generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ compute_internal_stats(cpi, (int)(*size));
+#endif // CONFIG_INTERNAL_STATS
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ cm->show_existing_frame = 0;
+ return 0;
+ }
+
+ // Should we encode an arf frame.
+ arf_src_index = get_arf_src_index(cpi);
+ if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ if (arf_src_index) {
+ assert(arf_src_index <= rc->frames_to_key);
+
+ if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+ cm->showable_frame = 1;
+ cpi->alt_ref_source = source;
+ // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+ if (arf_src_index == rc->frames_to_key) {
+ // Skip temporal filtering and mark as intra_only if we have a fwd_kf
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int which_arf = gf_group->arf_update_idx[gf_group->index];
+ cpi->is_arf_filter_off[which_arf] = 1;
+ cpi->no_show_kf = 1;
+ } else {
+ if (oxcf->arnr_max_frames > 0) {
+ // Produce the filtered ARF frame.
+ av1_temporal_filter(cpi, arf_src_index);
+ aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+ }
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+
+ if (oxcf->pass < 2) {
+        // In the second pass, the buffer update configuration will be set
+        // in av1_rc_get_second_pass_params().
+ av1_configure_buffer_updates_firstpass(cpi, ARF_UPDATE);
+ }
+ }
+ rc->source_alt_ref_pending = 0;
+ }
+
+ // Should we encode an arf2 frame.
+ arf_src_index = get_arf2_src_index(cpi);
+ if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ if (arf_src_index) {
+ assert(arf_src_index <= rc->frames_to_key);
+
+ if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+ cm->showable_frame = 1;
+ cpi->alt_ref_source = source;
+
+ if (oxcf->arnr_max_frames > 0) {
+ // Produce the filtered ARF frame.
+ av1_temporal_filter(cpi, arf_src_index);
+ aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+
+ if (oxcf->pass < 2) {
+        // In the second pass, the buffer update configuration will be set
+        // in av1_rc_get_second_pass_params().
+ av1_configure_buffer_updates_firstpass(cpi, INTNL_ARF_UPDATE);
+ }
+ }
+ rc->source_alt_ref_pending = 0;
+ }
+
+ rc->is_bwd_ref_frame = 0;
+ brf_src_index = get_brf_src_index(cpi);
+ if (brf_src_index) {
+ assert(brf_src_index <= rc->frames_to_key);
+ if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
+ cm->showable_frame = 1;
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+
+ if (oxcf->pass < 2) {
+        // In the second pass, the buffer update configuration will be set
+        // in av1_rc_get_second_pass_params().
+ av1_configure_buffer_updates_firstpass(cpi, BIPRED_UPDATE);
+ }
+ }
+ }
+
+ if (!source) {
+ // Get last frame source.
+ if (cm->current_video_frame > 0) {
+ if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL)
+ return -1;
+ }
+ if (cm->current_video_frame > 0) assert(last_source != NULL);
+ // Read in the source frame.
+ source = av1_lookahead_pop(cpi->lookahead, flush);
+
+ if (source != NULL) {
+ cm->show_frame = 1;
+ cm->intra_only = 0;
+
+ // Check to see if the frame should be encoded as an arf overlay.
+ check_src_altref(cpi, source);
+ }
+ }
+ if (source) {
+ cpi->unscaled_source = cpi->source =
+ force_src_buffer ? force_src_buffer : &source->img;
+ cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ av1_apply_encoding_flags(cpi, source->flags);
+ *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+ } else {
+ *size = 0;
+ if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->twopass.first_pass_done = 1;
+ }
+ return -1;
+ }
+
+ if (source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_start;
+ }
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ // adjust frame rates based on timestamps given
+ if (cm->show_frame) adjust_frame_rate(cpi, source);
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;
+
+ // Retain the RF_LEVEL for the current newly coded frame.
+ cpi->frame_rf_level[cm->new_fb_idx] =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+ cm->cur_frame->buf.buf_8bit_valid = 0;
+
+ if (cpi->film_grain_table) {
+ cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup(
+ cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+ &cm->film_grain_params);
+ }
+ cm->cur_frame->film_grain_params_present =
+ cm->seq_params.film_grain_params_present;
+
+ // only one operating point supported now
+ const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
+ if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+ cpi->common.frame_presentation_time = (uint32_t)pts64;
+
+ // Start with a 0 size frame.
+ *size = 0;
+
+ cpi->frame_flags = *frame_flags;
+
+ if (oxcf->pass == 2) {
+ av1_rc_get_second_pass_params(cpi);
+ } else if (oxcf->pass == 1) {
+ setup_frame_size(cpi);
+ }
+
+ if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
+ for (i = 0; i < REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+
+ cm->using_qmatrix = cpi->oxcf.using_qm;
+ cm->min_qmlevel = cpi->oxcf.qm_minlevel;
+ cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
+
+ if (cm->seq_params.frame_id_numbers_present_flag) {
+ if (*time_stamp == 0) {
+ cpi->common.current_frame_id = -1;
+ }
+ }
+
+ cpi->cur_poc++;
+ if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools &&
+ !frame_is_intra_only(cm)) {
+ if (cpi->common.seq_params.force_integer_mv == 2) {
+ struct lookahead_entry *previous_entry =
+ av1_lookahead_peek(cpi->lookahead, cpi->previous_index);
+ if (!previous_entry)
+ cpi->common.cur_frame_force_integer_mv = 0;
+ else
+ cpi->common.cur_frame_force_integer_mv = is_integer_mv(
+ cpi, cpi->source, &previous_entry->img, cpi->previous_hash_table);
+ } else {
+ cpi->common.cur_frame_force_integer_mv =
+ cpi->common.seq_params.force_integer_mv;
+ }
+ } else {
+ cpi->common.cur_frame_force_integer_mv = 0;
+ }
+
+ if (oxcf->pass == 1) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
+ av1_first_pass(cpi, source);
+ } else if (oxcf->pass == 2) {
+ if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+ } else {
+ // One pass encode
+ if (Pass0Encode(cpi, size, dest, 0, frame_flags) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+ }
+ if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
+ cpi->previous_hash_table = &cm->cur_frame->hash_table;
+ {
+ int l;
+ for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) {
+ if ((cpi->lookahead->buf + l) == source) {
+ cpi->previous_index = l;
+ break;
+ }
+ }
+
+ if (l == cpi->lookahead->max_sz) {
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to find last frame original buffer");
+ }
+ }
+ }
+
+ if (!cm->large_scale_tile) {
+ cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
+ }
+
+#define EXT_TILE_DEBUG 0
+#if EXT_TILE_DEBUG
+ if (cm->large_scale_tile && oxcf->pass == 2) {
+ char fn[20] = "./fc";
+ fn[4] = cm->current_video_frame / 100 + '0';
+ fn[5] = (cm->current_video_frame % 100) / 10 + '0';
+ fn[6] = (cm->current_video_frame % 10) + '0';
+ fn[7] = '\0';
+ av1_print_frame_contexts(cm->fc, fn);
+ }
+#endif // EXT_TILE_DEBUG
+#undef EXT_TILE_DEBUG
+
+ cm->showable_frame = !cm->show_frame && cm->showable_frame;
+
+ // No frame encoded, or frame was dropped, release scaled references.
+ if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
+ release_scaled_references(cpi);
+ }
+
+ if (*size > 0) {
+ cpi->droppable = !frame_is_reference(cpi);
+ }
+
+ aom_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
+ generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ if (oxcf->pass != 1) {
+ compute_internal_stats(cpi, (int)(*size));
+ }
+#endif // CONFIG_INTERNAL_STATS
+
+ aom_clear_system_state();
+
+ return 0;
+}
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
+ AV1_COMMON *cm = &cpi->common;
+ if (!cm->show_frame) {
+ return -1;
+ } else {
+ int ret;
+ if (cm->frame_to_show) {
+ *dest = *cm->frame_to_show;
+ dest->y_width = cm->width;
+ dest->y_height = cm->height;
+ dest->uv_width = cm->width >> cm->seq_params.subsampling_x;
+ dest->uv_height = cm->height >> cm->seq_params.subsampling_y;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+ aom_clear_system_state();
+ return ret;
+ }
+}
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+ if (cpi->last_show_frame_buf_idx == INVALID_IDX) return -1;
+
+ *frame =
+ cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+ return 0;
+}
+
+static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ return a->y_height == b->y_height && a->y_width == b->y_width &&
+ a->uv_height == b->uv_height && a->uv_width == b->uv_width &&
+ a->y_stride == b->y_stride && a->uv_stride == b->uv_stride &&
+ a->border == b->border &&
+ (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+ (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd) {
+ const int num_planes = av1_num_planes(cm);
+ if (!equal_dimensions_and_border(new_frame, sd))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ aom_yv12_copy_frame(new_frame, sd, num_planes);
+
+ return cm->error.error_code;
+}
+
+int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
+ AOM_SCALING vert_mode) {
+ int hr = 0, hs = 0, vr = 0, vs = 0;
+
+ if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
+
+ Scale2Ratio(horiz_mode, &hr, &hs);
+ Scale2Ratio(vert_mode, &vr, &vs);
+
+  // always round up to the next whole number
+ cpi->resize_pending_width = (hs - 1 + cpi->oxcf.width * hr) / hs;
+ cpi->resize_pending_height = (vs - 1 + cpi->oxcf.height * vr) / vs;
+
+ return 0;
+}
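+
+// Example of the rounding above, assuming Scale2Ratio maps ONETWO to the
+// ratio 1/2 (hr = 1, hs = 2): with oxcf.width = 1921,
+//   resize_pending_width = (2 - 1 + 1921 * 1) / 2 = 961,
+// i.e. 960.5 is rounded up to the next whole number.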
+
+int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.base_qindex; }
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
+ size_t output_size = 0;
+ size_t total_bytes_read = 0;
+ size_t remaining_size = *frame_size;
+ uint8_t *buff_ptr = buffer;
+
+  // go through each OBU
+ while (total_bytes_read < *frame_size) {
+ uint8_t saved_obu_header[2];
+ uint64_t obu_payload_size;
+ size_t length_of_payload_size;
+ size_t length_of_obu_size;
+ uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1;
+ size_t obu_bytes_read = obu_header_size; // bytes read for current obu
+
+ // save the obu header (1 or 2 bytes)
+ memmove(saved_obu_header, buff_ptr, obu_header_size);
+ // clear the obu_has_size_field
+ saved_obu_header[0] = saved_obu_header[0] & (~0x2);
+
+ // get the payload_size and length of payload_size
+ if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size,
+ &obu_payload_size, &length_of_payload_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+ obu_bytes_read += length_of_payload_size;
+
+    // calculate the length of the size field (obu header plus payload)
+ length_of_obu_size =
+ aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size));
+
+    // move the rest of the data to its new location
+ memmove(buff_ptr + length_of_obu_size + obu_header_size,
+ buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read);
+ obu_bytes_read += (size_t)obu_payload_size;
+
+ // write the new obu size
+ const uint64_t obu_size = obu_header_size + obu_payload_size;
+ size_t coded_obu_size;
+ if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr,
+ &coded_obu_size) != 0) {
+ return AOM_CODEC_ERROR;
+ }
+
+ // write the saved (modified) obu_header following obu size
+ memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size);
+
+ total_bytes_read += obu_bytes_read;
+ remaining_size -= obu_bytes_read;
+ buff_ptr += length_of_obu_size + obu_size;
+ output_size += length_of_obu_size + (size_t)obu_size;
+ }
+
+ *frame_size = output_size;
+ return AOM_CODEC_OK;
+}
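+
+// Sketch of the per-OBU rewrite performed above into the length-prefixed
+// (Annex B style) layout; the header is 1 or 2 bytes and sizes are LEB128:
+//   input : [obu header, has_size=1][leb128(payload_size)][payload]
+//   output: [leb128(header_size + payload_size)][obu header, has_size=0]
+//           [payload]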
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
+  // TODO(yunqingwang): The external encoding flags that select which
+  // references to use should be consistent with the internal reference frame
+  // selection. Need to ensure that there is no conflict between the two. In
+  // the AV1 encoder, the priority rank of the 7 reference frames is: LAST,
+  // ALTREF, LAST2, LAST3, GOLDEN, BWDREF, ALTREF2. If only one reference
+  // frame is used, it must be LAST.
+ cpi->ext_ref_frame_flags = AOM_REFFRAME_ALL;
+ if (flags &
+ (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+ AOM_EFLAG_NO_REF_ARF2)) {
+ if (flags & AOM_EFLAG_NO_REF_LAST) {
+ cpi->ext_ref_frame_flags = 0;
+ } else {
+ int ref = AOM_REFFRAME_ALL;
+
+ if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG;
+ if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG;
+
+ if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
+
+ if (flags & AOM_EFLAG_NO_REF_ARF) {
+ ref ^= AOM_ALT_FLAG;
+ ref ^= AOM_BWD_FLAG;
+ ref ^= AOM_ALT2_FLAG;
+ } else {
+ if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG;
+ if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG;
+ }
+
+ av1_use_as_reference(cpi, ref);
+ }
+ }
+
+ if (flags &
+ (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) {
+ int upd = AOM_REFFRAME_ALL;
+
+ // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag.
+ if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG;
+
+ if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG;
+
+ if (flags & AOM_EFLAG_NO_UPD_ARF) {
+ upd ^= AOM_ALT_FLAG;
+ upd ^= AOM_BWD_FLAG;
+ upd ^= AOM_ALT2_FLAG;
+ }
+
+ av1_update_reference(cpi, upd);
+ }
+
+ cpi->ext_use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs &
+ ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0);
+ cpi->ext_use_error_resilient = cpi->oxcf.error_resilient_mode |
+ ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0);
+ cpi->ext_use_s_frame =
+ cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0);
+ cpi->ext_use_primary_ref_none = (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
+
+ if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
+ av1_update_entropy(cpi, 0);
+ }
+}
+
+int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n) {
+ return n * TICKS_PER_SEC * timebase->num / timebase->den;
+}
+
+int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) {
+ const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+ return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
+}
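+
+// Worked example for the pair above, assuming TICKS_PER_SEC is 10,000,000
+// and a 1/30 timebase (num = 1, den = 30):
+//   timebase_units_to_ticks(tb, 3) = 3 * 10000000 * 1 / 30 = 1000000
+//   ticks_to_timebase_units(tb, 1000000)
+//     = (1000000 * 30 + (10000000 * 1 / 2 - 1)) / 1 / 10000000 = 3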
+
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
+ if (!cpi) return NULL;
+
+ uint8_t header_buf[512] = { 0 };
+ const uint32_t sequence_header_size =
+ write_sequence_header_obu(cpi, &header_buf[0]);
+ assert(sequence_header_size <= sizeof(header_buf));
+ if (sequence_header_size == 0) return NULL;
+
+ const size_t obu_header_size = 1;
+ const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size);
+ const size_t payload_offset = obu_header_size + size_field_size;
+
+ if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
+ memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
+
+ if (write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
+ obu_header_size) {
+ return NULL;
+ }
+
+ size_t coded_size_field_size = 0;
+ if (aom_uleb_encode(sequence_header_size, size_field_size,
+ &header_buf[obu_header_size],
+ &coded_size_field_size) != 0) {
+ return NULL;
+ }
+ assert(coded_size_field_size == size_field_size);
+
+ aom_fixed_buf_t *global_headers =
+ (aom_fixed_buf_t *)malloc(sizeof(*global_headers));
+ if (!global_headers) return NULL;
+
+ const size_t global_header_buf_size =
+ obu_header_size + size_field_size + sequence_header_size;
+
+ global_headers->buf = malloc(global_header_buf_size);
+ if (!global_headers->buf) {
+ free(global_headers);
+ return NULL;
+ }
+
+ memcpy(global_headers->buf, &header_buf[0], global_header_buf_size);
+ global_headers->sz = global_header_buf_size;
+ return global_headers;
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
new file mode 100644
index 000000000..ee7fc4637
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -0,0 +1,985 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/timing.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/tokenize.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ int nmv_vec_cost[MV_JOINTS];
+ int nmv_costs[2][MV_VALS];
+ int nmv_costs_hp[2][MV_VALS];
+
+ FRAME_CONTEXT fc;
+} CODING_CONTEXT;
+
+typedef enum {
+ // regular inter frame
+ REGULAR_FRAME = 0,
+ // alternate reference frame
+ ARF_FRAME = 1,
+ // overlay frame
+ OVERLAY_FRAME = 2,
+ // golden frame
+ GLD_FRAME = 3,
+ // backward reference frame
+ BRF_FRAME = 4,
+ // extra alternate reference frame
+ EXT_ARF_FRAME = 5,
+ FRAME_CONTEXT_INDEXES
+} FRAME_CONTEXT_INDEX;
+
+typedef enum {
+ NORMAL = 0,
+ FOURFIVE = 1,
+ THREEFIVE = 2,
+ ONETWO = 3
+} AOM_SCALING;
+
+typedef enum {
+ // Good Quality Fast Encoding. The encoder balances quality with the amount of
+ // time it takes to encode the output. Speed setting controls how fast.
+ GOOD
+} MODE;
+
+typedef enum {
+ FRAMEFLAGS_KEY = 1 << 0,
+ FRAMEFLAGS_GOLDEN = 1 << 1,
+ FRAMEFLAGS_BWDREF = 1 << 2,
+ // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
+ FRAMEFLAGS_ALTREF = 1 << 3,
+} FRAMETYPE_FLAGS;
+
+typedef enum {
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ CYCLIC_REFRESH_AQ = 3,
+ AQ_MODE_COUNT // This should always be the last member of the enum
+} AQ_MODE;
+typedef enum {
+ NO_DELTA_Q = 0,
+ DELTA_Q_ONLY = 1,
+ DELTA_Q_LF = 2,
+ DELTAQ_MODE_COUNT // This should always be the last member of the enum
+} DELTAQ_MODE;
+
+typedef enum {
+ RESIZE_NONE = 0, // No frame resizing allowed.
+ RESIZE_FIXED = 1, // All frames are coded at the specified scale.
+ RESIZE_RANDOM = 2, // All frames are coded at a random scale.
+ RESIZE_MODES
+} RESIZE_MODE;
+
+typedef enum {
+ SUPERRES_NONE = 0, // No frame superres allowed
+ SUPERRES_FIXED = 1, // All frames are coded at the specified scale,
+ // and super-resolved.
+ SUPERRES_RANDOM = 2, // All frames are coded at a random scale,
+ // and super-resolved.
+ SUPERRES_QTHRESH = 3, // Superres scale for a frame is determined based on
+ // q_index
+ SUPERRES_MODES
+} SUPERRES_MODE;
+
+typedef struct AV1EncoderConfig {
+ BITSTREAM_PROFILE profile;
+ aom_bit_depth_t bit_depth; // Codec bit-depth.
+ int width; // width of data passed to the compressor
+ int height; // height of data passed to the compressor
+ int forced_max_frame_width; // forced maximum width of frame (if != 0)
+ int forced_max_frame_height; // forced maximum height of frame (if != 0)
+ unsigned int input_bit_depth; // Input bit depth.
+ double init_framerate; // set to passed in framerate
+ int64_t target_bandwidth; // bandwidth to be used in bits per second
+
+  int noise_sensitivity;  // pre-processing blur: recommendation 0
+  int sharpness;          // sharpening output: recommendation 0
+ int speed;
+ // maximum allowed bitrate for any intra frame in % of bitrate target.
+ unsigned int rc_max_intra_bitrate_pct;
+ // maximum allowed bitrate for any inter frame in % of bitrate target.
+ unsigned int rc_max_inter_bitrate_pct;
+ // percent of rate boost for golden frame in CBR mode.
+ unsigned int gf_cbr_boost_pct;
+
+ MODE mode;
+ int pass;
+
+ // Key Framing Operations
+ int auto_key; // autodetect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
+ int sframe_dist;
+ int sframe_mode;
+ int sframe_enabled;
+ int lag_in_frames; // how many frames lag before we start encoding
+ int fwd_kf_enabled;
+
+ // ----------------------------------------------------------------
+ // DATARATE CONTROL OPTIONS
+
+ // vbr, cbr, constrained quality or constant quality
+ enum aom_rc_mode rc_mode;
+
+ // buffer targeting aggressiveness
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ // buffering parameters
+ int64_t starting_buffer_level_ms;
+ int64_t optimal_buffer_level_ms;
+ int64_t maximum_buffer_size_ms;
+
+ // Frame drop threshold.
+ int drop_frames_water_mark;
+
+ // controlling quality
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+ AQ_MODE aq_mode; // Adaptive Quantization mode
+ DELTAQ_MODE deltaq_mode;
+ int enable_cdef;
+ int enable_restoration;
+ int disable_trellis_quant;
+ int using_qm;
+ int qm_y;
+ int qm_u;
+ int qm_v;
+ int qm_minlevel;
+ int qm_maxlevel;
+#if CONFIG_DIST_8X8
+ int using_dist_8x8;
+#endif
+ unsigned int num_tile_groups;
+ unsigned int mtu;
+
+ // Internal frame size scaling.
+ RESIZE_MODE resize_mode;
+ uint8_t resize_scale_denominator;
+ uint8_t resize_kf_scale_denominator;
+
+ // Frame Super-Resolution size scaling.
+ SUPERRES_MODE superres_mode;
+ uint8_t superres_scale_denominator;
+ uint8_t superres_kf_scale_denominator;
+ int superres_qthresh;
+ int superres_kf_qthresh;
+
+ // Enable feature to reduce the frame quantization every x frames.
+ int frame_periodic_boost;
+
+ // two pass datarate control
+ int two_pass_vbrbias; // two pass datarate control tweaks
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+ // END DATARATE CONTROL OPTIONS
+ // ----------------------------------------------------------------
+
+ int enable_auto_arf;
+ int enable_auto_brf; // (b)ackward (r)ef (f)rame
+
+ /* Bitfield defining the error resiliency features to enable.
+ * Can provide decodable frames after losses in previous
+ * frames and decodable partitions after losses in the same frame.
+ */
+ unsigned int error_resilient_mode;
+
+ unsigned int s_frame_mode;
+
+ /* Bitfield defining the parallel decoding mode where the
+ * decoding in successive frames may be conducted in parallel
+ * just by decoding the frame headers.
+ */
+ unsigned int frame_parallel_decoding_mode;
+
+ unsigned int limit;
+
+ int arnr_max_frames;
+ int arnr_strength;
+
+ int min_gf_interval;
+ int max_gf_interval;
+
+ int row_mt;
+ int tile_columns;
+ int tile_rows;
+ int tile_width_count;
+ int tile_height_count;
+ int tile_widths[MAX_TILE_COLS];
+ int tile_heights[MAX_TILE_ROWS];
+
+ int max_threads;
+
+ aom_fixed_buf_t two_pass_stats_in;
+ struct aom_codec_pkt_list *output_pkt_list;
+
+#if CONFIG_FP_MB_STATS
+ aom_fixed_buf_t firstpass_mb_stats_in;
+#endif
+
+ aom_tune_metric tuning;
+ aom_tune_content content;
+ int use_highbitdepth;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ aom_chroma_sample_position_t chroma_sample_position;
+ int color_range;
+ int render_width;
+ int render_height;
+ aom_timing_info_type_t timing_info_type;
+ int timing_info_present;
+ aom_timing_info_t timing_info;
+ int decoder_model_info_present_flag;
+ int display_model_info_present_flag;
+ int buffer_removal_time_present;
+ aom_dec_model_info_t buffer_model;
+ aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+ aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
+ int film_grain_test_vector;
+ const char *film_grain_table_filename;
+
+ uint8_t cdf_update_mode;
+ aom_superblock_size_t superblock_size;
+ unsigned int large_scale_tile;
+ unsigned int single_tile_decoding;
+ int monochrome;
+ unsigned int full_still_picture_hdr;
+ int enable_dual_filter;
+ unsigned int motion_vector_unit_test;
+ const cfg_options_t *cfg;
+ int enable_order_hint;
+ int enable_jnt_comp;
+ int enable_ref_frame_mvs;
+ unsigned int allow_ref_frame_mvs;
+ int enable_warped_motion;
+ int allow_warped_motion;
+ int enable_superres;
+ unsigned int save_as_annexb;
+
+#if CONFIG_DENOISE
+ float noise_level;
+ int noise_block_size;
+#endif
+
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
+} AV1EncoderConfig;
+
+static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
+ return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
+}
+
+typedef struct FRAME_COUNTS {
+// Note: This structure should only contain 'unsigned int' fields, or
+// aggregates built solely from 'unsigned int' fields/elements
+#if CONFIG_ENTROPY_STATS
+ unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES];
+ unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+ unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ unsigned int cfl_sign[CFL_JOINT_SIGNS];
+ unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE];
+ unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2];
+ unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_y_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int palette_uv_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2];
+ unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [EOB_COEF_CONTEXTS][2];
+ unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2];
+ unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS]
+ [2];
+ unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2];
+ unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5];
+ unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6];
+ unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7];
+ unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8];
+ unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9];
+ unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10];
+ unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11];
+ unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [LEVEL_CONTEXTS][BR_CDF_SIZE];
+ unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2];
+ unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1];
+ unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+ unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2];
+ unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+ unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+ unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ unsigned int wedge_idx[BLOCK_SIZES_ALL][16];
+ unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+ unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
+ unsigned int compound_type[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+ unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
+ unsigned int obmc[BLOCK_SIZES_ALL][2];
+ unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+ unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+ unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
+ unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
+ unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+ unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+ unsigned int intrabc[2];
+
+ unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+ unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+ unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+ unsigned int skip[SKIP_CONTEXTS][2];
+ unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+ unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+ unsigned int delta_q[DELTA_Q_PROBS][2];
+ unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+ unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+ unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+ unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+ unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+ unsigned int wiener_restore[2];
+ unsigned int sgrproj_restore[2];
+#endif // CONFIG_ENTROPY_STATS
+
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];
+} FRAME_COUNTS;
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+ int ready;
+ double a;
+ double b;
+ double dist_mean;
+ double ld_mean;
+ double sse_mean;
+ double sse_sse_mean;
+ double sse_ld_mean;
+ int num;
+ double dist_sum;
+ double ld_sum;
+ double sse_sum;
+ double sse_sse_sum;
+ double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+ int idx;
+ int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out
+// the maximum number of modes.
+#define MAX_INTER_MODES 1024
+typedef struct inter_modes_info {
+ int num;
+ MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+ int mode_rate_arr[MAX_INTER_MODES];
+ int64_t sse_arr[MAX_INTER_MODES];
+ int64_t est_rd_arr[MAX_INTER_MODES];
+ RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+} InterModesInfo;
+#endif
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+ TileInfo tile_info;
+ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+ int mode_map[BLOCK_SIZES_ALL][MAX_MODES];
+ int m_search_count;
+ int ex_search_count;
+ CFL_CTX cfl;
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+ uint8_t allow_update_cdf;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
+ InterModesInfo inter_modes_info;
+#endif
+} TileDataEnc;
+
+typedef struct {
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;
+ unsigned int count;
+} TOKENLIST;
+
+typedef struct RD_COUNTS {
+ int64_t comp_pred_diff[REFERENCE_MODES];
+ // Stores number of 4x4 blocks using global motion per reference frame.
+ int global_motion_used[REF_FRAMES];
+ int compound_ref_used_flag;
+ int skip_mode_used_flag;
+} RD_COUNTS;
+
+typedef struct ThreadData {
+ MACROBLOCK mb;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+ PC_TREE *pc_tree;
+ PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+ uint32_t *hash_value_buffer[2][2];
+ int32_t *wsrc_buf;
+ int32_t *mask_buf;
+ uint8_t *above_pred_buf;
+ uint8_t *left_pred_buf;
+ PALETTE_BUFFER *palette_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+ int intrabc_used_this_tile;
+} ThreadData;
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {
+ int enabled;
+ int update;
+ unsigned char *map;
+} ActiveMap;
+
+#if CONFIG_INTERNAL_STATS
+// types of stats
+typedef enum {
+ STAT_Y,
+ STAT_U,
+ STAT_V,
+ STAT_ALL,
+ NUM_STAT_TYPES // This should always be the last member of the enum
+} StatType;
+
+typedef struct IMAGE_STAT {
+ double stat[NUM_STAT_TYPES];
+ double worst;
+} ImageStat;
+#endif // CONFIG_INTERNAL_STATS
+
+typedef struct {
+ int ref_count;
+ YV12_BUFFER_CONFIG buf;
+} EncRefCntBuffer;
+
+typedef struct TileBufferEnc {
+ uint8_t *data;
+ size_t size;
+} TileBufferEnc;
+
+typedef struct AV1_COMP {
+ QUANTS quants;
+ ThreadData td;
+ FRAME_COUNTS counts;
+ MB_MODE_INFO_EXT *mbmi_ext_base;
+ CB_COEFF_BUFFER *coeff_buffer_base;
+ Dequants dequants;
+ AV1_COMMON common;
+ AV1EncoderConfig oxcf;
+ struct lookahead_ctx *lookahead;
+ struct lookahead_entry *alt_ref_source;
+ int no_show_kf;
+
+ int optimize_speed_feature;
+ int optimize_seg_arr[MAX_SEGMENTS];
+
+ YV12_BUFFER_CONFIG *source;
+ YV12_BUFFER_CONFIG *last_source; // NULL for first frame and alt_ref frames
+ YV12_BUFFER_CONFIG *unscaled_source;
+ YV12_BUFFER_CONFIG scaled_source;
+ YV12_BUFFER_CONFIG *unscaled_last_source;
+ YV12_BUFFER_CONFIG scaled_last_source;
+
+ // For a still frame, this flag is set to 1 to skip partition search.
+ int partition_search_skippable_frame;
+ double csm_rate_array[32];
+ double m_rate_array[32];
+ int rate_size;
+ int rate_index;
+ hash_table *previous_hash_table;
+ int previous_index;
+ int cur_poc; // DebugInfo
+
+ unsigned int row_mt;
+ int scaled_ref_idx[REF_FRAMES];
+ int ref_fb_idx[REF_FRAMES];
+ int refresh_fb_idx; // ref frame buffer index to refresh
+
+ int last_show_frame_buf_idx; // last show frame buffer index
+
+ int refresh_last_frame;
+ int refresh_golden_frame;
+ int refresh_bwd_ref_frame;
+ int refresh_alt2_ref_frame;
+ int refresh_alt_ref_frame;
+#if USE_SYMM_MULTI_LAYER
+ int new_bwdref_update_rule;
+#endif
+
+ int ext_refresh_frame_flags_pending;
+ int ext_refresh_last_frame;
+ int ext_refresh_golden_frame;
+ int ext_refresh_bwd_ref_frame;
+ int ext_refresh_alt2_ref_frame;
+ int ext_refresh_alt_ref_frame;
+
+ int ext_refresh_frame_context_pending;
+ int ext_refresh_frame_context;
+ int ext_use_ref_frame_mvs;
+ int ext_use_error_resilient;
+ int ext_use_s_frame;
+ int ext_use_primary_ref_none;
+
+ YV12_BUFFER_CONFIG last_frame_uf;
+ YV12_BUFFER_CONFIG trial_frame_rst;
+
+ // Ambient reconstruction err target for force key frames
+ int64_t ambient_err;
+
+ RD_OPT rd;
+
+ CODING_CONTEXT coding_context;
+
+ int gmtype_cost[TRANS_TYPES];
+ int gmparams_cost[REF_FRAMES];
+
+ int nmv_costs[2][MV_VALS];
+ int nmv_costs_hp[2][MV_VALS];
+
+ int64_t last_time_stamp_seen;
+ int64_t last_end_time_stamp_seen;
+ int64_t first_time_stamp_ever;
+
+ RATE_CONTROL rc;
+ double framerate;
+
+  // NOTE(zoeliu): Any inter frame allows a maximum of REF_FRAMES inter
+  // references; plus the currently coded frame itself, sufficient space must
+  // be allocated for the maximum possible number of frames.
+ int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE];
+
+ struct aom_codec_pkt_list *output_pkt_list;
+
+ MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+ int mbgraph_n_frames; // number of frames filled in the above
+ int static_mb_pct; // % forced skip mbs by segmentation
+ int ref_frame_flags;
+ int ext_ref_frame_flags;
+ RATE_FACTOR_LEVEL frame_rf_level[FRAME_BUFFERS];
+
+ SPEED_FEATURES sf;
+
+ unsigned int max_mv_magnitude;
+ int mv_step_param;
+
+ int allow_comp_inter_inter;
+ int all_one_sided_refs;
+
+ uint8_t *segmentation_map;
+
+ CYCLIC_REFRESH *cyclic_refresh;
+ ActiveMap active_map;
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ av1_diamond_search_fn_t diamond_search_sad;
+ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+ uint64_t time_receive_data;
+ uint64_t time_compress_data;
+ uint64_t time_pick_lpf;
+ uint64_t time_encode_sb_row;
+
+#if CONFIG_FP_MB_STATS
+ int use_fp_mb_stats;
+#endif
+
+ TWO_PASS twopass;
+
+ YV12_BUFFER_CONFIG alt_ref_buffer;
+
+#if CONFIG_INTERNAL_STATS
+ unsigned int mode_chosen_counts[MAX_MODES];
+
+ int count;
+ uint64_t total_sq_error;
+ uint64_t total_samples;
+ ImageStat psnr;
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ unsigned int tot_recode_hits;
+ double worst_ssim;
+
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_blockiness;
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+#endif
+ int b_calculate_psnr;
+
+ int droppable;
+
+ int initial_width;
+ int initial_height;
+ int initial_mbs; // Number of MBs in the full-size frame; to be used to
+ // normalize the firstpass stats. This will differ from the
+ // number of MBs in the current frame when the frame is
+ // scaled.
+
+ // When resize is triggered through external control, the desired width/height
+ // are stored here until use in the next frame coded. They are effective only
+  // for one frame and are reset after use.
+ int resize_pending_width;
+ int resize_pending_height;
+
+ int frame_flags;
+
+ search_site_config ss_cfg;
+
+ TileDataEnc *tile_data;
+ int allocated_tiles; // Keep track of memory allocated for tiles.
+
+ TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+ unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+ TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ int resize_state;
+ int resize_avg_qp;
+ int resize_buffer_underflow;
+ int resize_count;
+
+  // Whether the sequence parameters have already been transmitted and are
+  // locked. Once locked, av1_change_config cannot change the seq parameters.
+ int seq_params_locked;
+
+ // VARIANCE_AQ segment map refresh
+ int vaq_refresh;
+
+ // Multi-threading
+ int num_workers;
+ AVxWorker *workers;
+ struct EncWorkerData *tile_thr_data;
+ int refresh_frame_mask;
+ int existing_fb_idx_to_show;
+ int is_arf_filter_off[MAX_EXT_ARFS + 1];
+ int num_extra_arfs;
+ int arf_pos_in_gf[MAX_EXT_ARFS + 1];
+ int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
+ int global_motion_search_done;
+ tran_low_t *tcoeff_buf[MAX_MB_PLANE];
+ int extra_arf_allowed;
+ // A flag to indicate if intrabc is ever used in current frame.
+ int intrabc_used;
+ int dv_cost[2][MV_VALS];
+ // TODO(huisu@google.com): we can update dv_joint_cost per SB.
+ int dv_joint_cost[MV_JOINTS];
+ int has_lossless_segment;
+
+ // For frame refs short signaling:
+ // A mapping of each reference frame from its encoder side value to the
+ // decoder side value obtained following the short signaling procedure.
+ int ref_conv[REF_FRAMES];
+
+ AV1LfSync lf_row_sync;
+ AV1LrSync lr_row_sync;
+ AV1LrStruct lr_ctxt;
+
+ aom_film_grain_table_t *film_grain_table;
+#if CONFIG_DENOISE
+ struct aom_denoise_and_model_t *denoise_and_model;
+#endif
+ // Stores the default value of skip flag depending on chroma format
+ // Set as 1 for monochrome and 3 for other color formats
+ int default_interp_skip_flags;
+ int preserve_arf_as_gld;
+} AV1_COMP;
+
+// Must not be called more than once.
+void av1_initialize_enc(void);
+
+struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+ BufferPool *const pool);
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made and not just a copy of the pointer.
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush,
+ const aom_rational_t *timebase);
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
+
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_update_entropy(AV1_COMP *cpi, int update);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
+ AOM_SCALING vert_mode);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n);
+int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n);
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+ return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
+ return (ref_frame >= 1) ? cpi->ref_fb_idx[ref_frame - 1] : INVALID_IDX;
+}
+
+static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
+}
+
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) {
+ return cm->allow_screen_content_tools;
+}
+
+static INLINE hash_table *av1_get_ref_frame_hash_map(
+ const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return buf_idx != INVALID_IDX
+ ? &cm->buffer_pool->frame_bufs[buf_idx].hash_table
+ : NULL;
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+ const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf
+ : NULL;
+}
+
+static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
+ MV_REFERENCE_FRAME ref_frame;
+ AV1_COMMON *const cm = &cpi->common;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ if (buf_idx == INVALID_IDX) continue;
+ if (frame_buf == &cm->buffer_pool->frame_bufs[buf_idx]) break;
+ }
+ return (ref_frame <= ALTREF_FRAME);
+}
+
+// Token buffer is only used for palette tokens.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+ int sb_size_log2,
+ const int num_planes) {
+  // Calculate the maximum number of superblocks in the image.
+ const int shift = sb_size_log2 - 4;
+ const int sb_size = 1 << sb_size_log2;
+ const int sb_size_square = sb_size * sb_size;
+ const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift;
+ const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift;
+
+ // One palette token for each pixel. There can be palettes on two planes.
+ const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
+
+ return sb_rows * sb_cols * sb_palette_toks;
+}
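+
+// For example, with 128x128 superblocks (sb_size_log2 = 7) and 3 planes,
+// sb_size_square = 16384 and sb_palette_toks = 2 * 16384 = 32768; a frame of
+// 8x8 MB rows/cols then needs 1 * 1 * 32768 = 32768 palette tokens.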
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
+static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
+ int num_planes) {
+ int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2;
+ int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2;
+
+ return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
+}
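+
+// For example, a tile spanning 64 mi units in each direction converts to
+// (64 + 2) >> 2 = 16 MB rows and 16 MB columns before the call above.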
+
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+ int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+ int num_planes) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+
+ const int tile_mb_cols =
+ (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+ const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+ *tok = cpi->tile_tok[tile_row][tile_col] +
+ get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+#define ALT_MIN_LAG 3
+static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
+ return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf;
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME ref0,
+ MV_REFERENCE_FRAME ref1) {
+ xd->block_refs[0] =
+ &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0];
+ xd->block_refs[1] =
+ &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0];
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+ return frame_index & 0x1;
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+ return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+// Update up-sampled reference frame index.
+static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
+ int new_uidx) {
+ const int ref_index = *uidx;
+
+ if (ref_index >= 0 && ubufs[ref_index].ref_count > 0)
+ ubufs[ref_index].ref_count--;
+
+ *uidx = new_uidx;
+ ubufs[new_uidx].ref_count++;
+}
+
+// Returns 1 if a frame is scaled and 0 otherwise.
+static INLINE int av1_resize_scaled(const AV1_COMMON *cm) {
+ return !(cm->superres_upscaled_width == cm->render_width &&
+ cm->superres_upscaled_height == cm->render_height);
+}
+
+static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
+ return !av1_superres_scaled(cm) && av1_resize_scaled(cm);
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+ return cm->show_existing_frame &&
+ (!cm->error_resilient_mode || cm->frame_type == KEY_FRAME);
+}
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
+// function, the memory must be freed by the caller: both the buf member of
+// the aom_fixed_buf_t and the aom_fixed_buf_t pointer itself must be released
+// via calls to free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
+// the obu_has_size_field bit is set, and the buffer contains the obu_size
+// field.
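+//
+// A minimal usage sketch (illustrative only; "hdr" is just a local name):
+//   aom_fixed_buf_t *hdr = av1_get_global_headers(cpi);
+//   if (hdr != NULL) {
+//     // ... emit hdr->buf (hdr->sz bytes) to the container ...
+//     free(hdr->buf);
+//     free(hdr);
+//   }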
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
new file mode 100644
index 000000000..5a31d93d7
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -0,0 +1,2062 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodetxb.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static int hbt_needs_init = 1;
+static CRC32C crc_calculator;
+static const int HBT_EOB = 16; // also the length in opt_qcoeff
+static const int HBT_TABLE_SIZE = 65536; // 16 bit: holds 65536 'arrays'
+static const int HBT_ARRAY_LENGTH = 256; // 8 bit: 256 entries
+// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type
+static const int HBT_KICKOUT = 3;
+
+typedef struct OptTxbQcoeff {
+ // Use larger type if larger/no kickout value is used in hbt_create_hashes
+ int8_t deltas[16];
+ uint32_t hbt_qc_hash;
+ uint32_t hbt_ctx_hash;
+ int init;
+ int rate_cost;
+} OptTxbQcoeff;
+
+OptTxbQcoeff *hbt_hash_table;
+
+typedef struct LevelDownStats {
+ int update;
+ tran_low_t low_qc;
+ tran_low_t low_dqc;
+ int64_t dist0;
+ int rate;
+ int rate_low;
+ int64_t dist;
+ int64_t dist_low;
+ int64_t rd;
+ int64_t rd_low;
+ int64_t nz_rd;
+ int64_t rd_diff;
+ int cost_diff;
+ int64_t dist_diff;
+ int new_eob;
+} LevelDownStats;
+
+void av1_alloc_txb_buf(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ int size = ((cm->mi_rows >> cm->seq_params.mib_size_log2) + 1) *
+ ((cm->mi_cols >> cm->seq_params.mib_size_log2) + 1);
+
+ av1_free_txb_buf(cpi);
+ // TODO(jingning): This should be further reduced.
+ CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
+ aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size));
+}
+
+void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
+
+void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ int mib_size_log2 = cm->seq_params.mib_size_log2;
+ int stride = (cm->mi_cols >> mib_size_log2) + 1;
+ int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
+ CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset];
+ const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size]));
+ for (int plane = 0; plane < num_planes; ++plane) {
+ x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset;
+ x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset;
+ x->mbmi_ext->txb_skip_ctx[plane] =
+ coeff_buf->txb_skip_ctx[plane] + txb_offset;
+ x->mbmi_ext->dc_sign_ctx[plane] =
+ coeff_buf->dc_sign_ctx[plane] + txb_offset;
+ }
+}
+
+static void write_golomb(aom_writer *w, int level) {
+ int x = level + 1;
+ int i = x;
+ int length = 0;
+
+ while (i) {
+ i >>= 1;
+ ++length;
+ }
+ assert(length > 0);
+
+ for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0);
+
+ for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01);
+}
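+
+// For example, level = 3 gives x = 4 (binary 100, length 3), so the writer
+// emits two leading zero bits followed by the bits of x: 0 0 1 0 0.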
+
+static INLINE tran_low_t get_lower_coeff(tran_low_t qc) {
+ if (qc == 0) {
+ return 0;
+ }
+ return qc > 0 ? qc - 1 : qc + 1;
+}
+
+static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx,
+ int dqv, int shift,
+ const qm_val_t *iqmatrix) {
+ int sign = qc < 0 ? -1 : 1;
+ if (iqmatrix != NULL)
+ dqv =
+ ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ return sign * ((abs(qc) * dqv) >> shift);
+}
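+
+// For example, with iqmatrix == NULL, qc = -5, dqv = 32 and shift = 0 the
+// result is -1 * ((5 * 32) >> 0) = -160.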
+
+static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
+ int shift) {
+ const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
+ const int64_t error = diff * diff;
+ return error;
+}
+
+static const int8_t eob_to_pos_small[33] = {
+ 0, 1, 2, // 0-2
+ 3, 3, // 3-4
+ 4, 4, 4, 4, // 5-8
+ 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {
+ 6, // place holder
+ 7, // 33-64
+ 8, 8, // 65-128
+ 9, 9, 9, 9, // 129-256
+ 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
+ 11 // 513-
+};
+
+static INLINE int get_eob_pos_token(const int eob, int *const extra) {
+ int t;
+
+ if (eob < 33) {
+ t = eob_to_pos_small[eob];
+ } else {
+ const int e = AOMMIN((eob - 1) >> 5, 16);
+ t = eob_to_pos_large[e];
+ }
+
+ *extra = eob - k_eob_group_start[t];
+
+ return t;
+}
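+
+// For example, eob = 10 falls in the 9-16 bucket of eob_to_pos_small, so the
+// returned token is 5 and *extra is 10 minus the group's first eob (i.e. 1,
+// assuming k_eob_group_start[5] == 9).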
+
+#if CONFIG_ENTROPY_STATS
+void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
+ TX_CLASS tx_class, PLANE_TYPE plane,
+ FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+#else
+void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx,
+ uint8_t allow_update_cdf) {
+#endif
+ int eob_extra;
+ const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+ TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+
+ switch (eob_multi_size) {
+ case 0:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5);
+ break;
+ case 1:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6);
+ break;
+ case 2:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7);
+ break;
+ case 3:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1,
+ 8);
+ }
+ break;
+ case 4:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1,
+ 9);
+ }
+ break;
+ case 5:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1,
+ 10);
+ }
+ break;
+ case 6:
+ default:
+#if CONFIG_ENTROPY_STATS
+ ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1,
+ 11);
+ }
+ break;
+ }
+
+ if (k_eob_offset_bits[eob_pt] > 0) {
+ int eob_ctx = eob_pt - 3;
+ int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+#if CONFIG_ENTROPY_STATS
+ counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2);
+ }
+}
+
+static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
+ int eob_extra;
+ const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+ int eob_cost = 0;
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
+
+ if (k_eob_offset_bits[eob_pt] > 0) {
+ const int eob_ctx = eob_pt - 3;
+ const int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+ const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
+ const int offset_bits = k_eob_offset_bits[eob_pt];
+ if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
+ }
+ return eob_cost;
+}
+
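+// Added note: the DC sign is context coded, every other sign costs one raw
+// bit.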
+static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
+ const int (*dc_sign_cost)[2],
+ int dc_sign_ctx) {
+ if (coeff_idx == 0) {
+ const int sign = (qc < 0) ? 1 : 0;
+ return dc_sign_cost[dc_sign_ctx][sign];
+ }
+ return av1_cost_literal(1);
+}
+
+static INLINE int get_br_cost(tran_low_t abs_qc, int ctx,
+ const int *coeff_lps) {
+ const tran_low_t min_level = 1 + NUM_BASE_LEVELS;
+ const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE;
+ (void)ctx;
+ if (abs_qc >= min_level) {
+ if (abs_qc >= max_level) {
+ return coeff_lps[COEFF_BASE_RANGE]; // COEFF_BASE_RANGE * cost0;
+ } else {
+ return coeff_lps[(abs_qc - min_level)]; // * cost0 + cost1;
+ }
+ }
+ return 0;
+}
+
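+// Added note: rate of the exp-Golomb tail (counted as raw bits via
+// av1_cost_literal) that is written once a level exceeds
+// NUM_BASE_LEVELS + COEFF_BASE_RANGE.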
+static INLINE int get_golomb_cost(int abs_qc) {
+ if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
+ const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ const int length = get_msb(r) + 1;
+ return av1_cost_literal(2 * length - 1);
+ }
+ return 0;
+}
+
+static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
+ const int is_eob, const TxbInfo *const txb_info,
+ const LV_MAP_COEFF_COST *const txb_costs,
+ const int coeff_ctx, const TX_CLASS tx_class) {
+ const TXB_CTX *const txb_ctx = txb_info->txb_ctx;
+ const int is_nz = (qc != 0);
+ const tran_low_t abs_qc = abs(qc);
+ int cost = 0;
+ const int16_t *const scan = txb_info->scan_order->scan;
+ const int pos = scan[scan_idx];
+
+ if (is_eob) {
+ cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+ } else {
+ cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ }
+ if (is_nz) {
+ cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost,
+ txb_ctx->dc_sign_ctx);
+
+ if (abs_qc > NUM_BASE_LEVELS) {
+ const int ctx =
+ get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
+ cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
+ cost += get_golomb_cost(abs_qc);
+ }
+ }
+ return cost;
+}
+
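+// Added note: base-level coding context. For the last (eob) coefficient it
+// depends only on the scan position within the block; otherwise it is derived
+// from the magnitudes of neighboring levels.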
+static INLINE int get_nz_map_ctx(const uint8_t *const levels,
+ const int coeff_idx, const int bwl,
+ const int height, const int scan_idx,
+ const int is_eob, const TX_SIZE tx_size,
+ const TX_CLASS tx_class) {
+ if (is_eob) {
+ if (scan_idx == 0) return 0;
+ if (scan_idx <= (height << bwl) / 8) return 1;
+ if (scan_idx <= (height << bwl) / 4) return 2;
+ return 3;
+ }
+ const int stats =
+ get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class);
+ return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
+}
+
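+// Added note: collects per-coefficient rate/distortion statistics, namely the
+// RD cost of keeping the current quantized value and of lowering its
+// magnitude by one (low_qc).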
+static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx,
+ const int is_eob,
+ const LV_MAP_COEFF_COST *const txb_costs,
+ const TxbInfo *const txb_info,
+ const TX_CLASS tx_class) {
+ const int16_t *const scan = txb_info->scan_order->scan;
+ const int coeff_idx = scan[scan_idx];
+ const tran_low_t qc = txb_info->qcoeff[coeff_idx];
+ const uint8_t *const levels = txb_info->levels;
+ stats->new_eob = -1;
+ stats->update = 0;
+ stats->rd_low = 0;
+ stats->rd = 0;
+ stats->nz_rd = 0;
+ stats->dist_low = 0;
+ stats->rate_low = 0;
+ stats->low_qc = 0;
+
+ const tran_low_t tqc = txb_info->tcoeff[coeff_idx];
+ const int dqv = txb_info->dequant[coeff_idx != 0];
+ const int coeff_ctx =
+ get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height,
+ scan_idx, is_eob, txb_info->tx_size, tx_class);
+ const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs,
+ coeff_ctx, tx_class);
+ assert(qc != 0);
+ const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift,
+ txb_info->iqmatrix);
+ const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift);
+
+ // distortion difference when coefficient is quantized to 0
+ const tran_low_t dqc0 =
+ qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
+
+ stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift);
+ stats->dist = dqc_dist - stats->dist0;
+ stats->rate = qc_cost;
+
+ stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist);
+
+ stats->low_qc = get_lower_coeff(qc);
+
+ if (is_eob && stats->low_qc == 0) {
+ stats->rd_low = stats->rd; // disable selection of low_qc in this case.
+ } else {
+ if (stats->low_qc == 0) {
+ stats->dist_low = 0;
+ } else {
+ stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv,
+ txb_info->shift, txb_info->iqmatrix);
+ const int64_t low_dqc_dist =
+ get_coeff_dist(tqc, stats->low_dqc, txb_info->shift);
+ stats->dist_low = low_dqc_dist - stats->dist0;
+ }
+ const int low_qc_cost =
+ get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs,
+ coeff_ctx, tx_class);
+ stats->rate_low = low_qc_cost;
+ stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low);
+ }
+}
+
+static void get_dist_cost_stats_with_eob(
+ LevelDownStats *const stats, const int scan_idx,
+ const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info,
+ const TX_CLASS tx_class) {
+ const int is_eob = 0;
+ get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class);
+
+ const int16_t *const scan = txb_info->scan_order->scan;
+ const int coeff_idx = scan[scan_idx];
+ const tran_low_t qc = txb_info->qcoeff[coeff_idx];
+ const int coeff_ctx_temp = get_nz_map_ctx(
+ txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1,
+ txb_info->tx_size, tx_class);
+ const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs,
+ coeff_ctx_temp, tx_class);
+ int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist);
+ if (stats->low_qc != 0) {
+ const int low_qc_eob_cost =
+ get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs,
+ coeff_ctx_temp, tx_class);
+ int64_t rd_eob_low =
+ RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low);
+ rd_eob = (rd_eob > rd_eob_low) ? rd_eob_low : rd_eob;
+ }
+
+ stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob;
+}
+
+static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc,
+ const TxbInfo *const txb_info) {
+ txb_info->qcoeff[coeff_idx] = qc;
+ txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] =
+ (uint8_t)clamp(abs(qc), 0, INT8_MAX);
+}
+
+static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc,
+ const TxbInfo *const txb_info) {
+ update_qcoeff(coeff_idx, qc, txb_info);
+ const int dqv = txb_info->dequant[coeff_idx != 0];
+ txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff(
+ qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
+}
+
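+// Added note: builds the padded level map. Each entry is the coefficient
+// magnitude clamped to INT8_MAX, with zeroed padding so that neighbor
+// look-ups in the context derivation never read outside the block.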
+void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = width + TX_PAD_HOR;
+ uint8_t *ls = levels;
+
+ memset(levels - TX_PAD_TOP * stride, 0,
+ sizeof(*levels) * TX_PAD_TOP * stride);
+ memset(levels + stride * height, 0,
+ sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+ for (int i = 0; i < height; i++) {
+ for (int j = 0; j < width; j++) {
+ *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, INT8_MAX);
+ }
+ for (int j = 0; j < TX_PAD_HOR; j++) {
+ *ls++ = 0;
+ }
+ }
+}
+
+void av1_get_nz_map_contexts_c(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size, const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int bwl = get_txb_bwl(tx_size);
+ const int height = get_txb_high(tx_size);
+ for (int i = 0; i < eob; ++i) {
+ const int pos = scan[i];
+ coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bwl, height, i,
+ i == eob - 1, tx_size, tx_class);
+ }
+}
+
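+// Added note: entropy codes one transform block: skip flag, tx_type, eob,
+// base levels in reverse scan order, base-range symbols, and finally the
+// signs and Golomb tails.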
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ TX_SIZE tx_size, const tran_low_t *tcoeff,
+ uint16_t eob, TXB_CTX *txb_ctx) {
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, eob == 0,
+ ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+ if (eob == 0) return;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ int c;
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+
+ av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w);
+
+ int eob_extra;
+ const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ switch (eob_multi_size) {
+ case 0:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5);
+ break;
+ case 1:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6);
+ break;
+ case 2:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7);
+ break;
+ case 3:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8);
+ break;
+ case 4:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9);
+ break;
+ case 5:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10);
+ break;
+ default:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11);
+ break;
+ }
+
+ if (k_eob_offset_bits[eob_pt] > 0) {
+ const int eob_ctx = eob_pt - 3;
+ int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_symbol(w, bit,
+ ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
+ for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
+ eob_shift = k_eob_offset_bits[eob_pt] - 1 - i;
+ bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_bit(w, bit);
+ }
+ }
+
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ for (c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = tcoeff[pos];
+ const tran_low_t level = abs(v);
+
+ if (c == eob - 1) {
+ aom_write_symbol(
+ w, AOMMIN(level, 3) - 1,
+ ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3);
+ } else {
+ aom_write_symbol(w, AOMMIN(level, 3),
+ ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx],
+ 4);
+ }
+ if (level > NUM_BASE_LEVELS) {
+ // level is above 1.
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ aom_write_symbol(
+ w, k,
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
+ BR_CDF_SIZE);
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+
+ // Loop to code all signs in the transform block,
+ // starting with the sign of DC (if applicable)
+ for (c = 0; c < eob; ++c) {
+ const tran_low_t v = tcoeff[scan[c]];
+ const tran_low_t level = abs(v);
+ const int sign = (v < 0) ? 1 : 0;
+ if (level) {
+ if (c == 0) {
+ aom_write_symbol(
+ w, sign, ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2);
+ } else {
+ aom_write_bit(w, sign);
+ }
+ if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
+ }
+ }
+}
+
+typedef struct encode_txb_args {
+ const AV1_COMMON *cm;
+ MACROBLOCK *x;
+ aom_writer *w;
+} ENCODE_TXB_ARGS;
+
+static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x,
+ aom_writer *w, int plane, int block,
+ int blk_row, int blk_col, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ uint16_t eob = x->mbmi_ext->eobs[plane][block];
+ TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
+ x->mbmi_ext->dc_sign_ctx[plane][block] };
+ av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob,
+ &txb_ctx);
+}
+
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
+ int mi_col, aom_writer *w, BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_planes = av1_num_planes(cm);
+ int block[MAX_MB_PLANE] = { 0 };
+ int row, col;
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int step = stepr * stepc;
+
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+ for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += stepr) {
+ for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += stepc) {
+ write_coeffs_txb_wrap(cm, x, w, plane, block[plane], blk_row,
+ blk_col, tx_size);
+ block[plane] += step;
+ }
+ }
+ }
+ }
+ }
+}
+
+// TODO(angiebird): use this function whenever it's possible
+static int get_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ if (plane > 0) return 0;
+
+ const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+ !xd->lossless[xd->mi[0]->segment_id]) {
+ const int ext_tx_set =
+ get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
+ } else {
+ if (ext_tx_set > 0) {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+ return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir]
+ [tx_type];
+ }
+ }
+ }
+ return 0;
+}
+
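+// Added note: computes the full rate of coding this transform block: skip
+// flag, tx_type, eob, and per-coefficient base / base-range / sign / Golomb
+// costs.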
+static AOM_FORCE_INLINE int warehouse_efficients_txb(
+ const AV1_COMMON *const cm, const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+ const struct macroblock_plane *p, const int eob,
+ const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+ const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+ const TX_CLASS tx_class) {
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *const eob_costs =
+ &x->eob_costs[eob_multi_size][plane_type];
+ int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+ av1_txb_init_levels(qcoeff, width, height, levels);
+
+ cost += get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+
+ cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost;
+ int c = eob - 1;
+ {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int sign = v >> 31;
+ const int level = (v ^ sign) - sign;
+ const int coeff_ctx = coeff_contexts[pos];
+ cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
+
+ if (v) {
+ // sign bit cost
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ const int base_range =
+ AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ cost += lps_cost[ctx][base_range];
+ cost += get_golomb_cost(level);
+ }
+ if (c) {
+ cost += av1_cost_literal(1);
+ } else {
+ const int sign01 = (sign ^ sign) - sign;
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ return cost;
+ }
+ }
+ }
+ const int(*base_cost)[4] = coeff_costs->base_cost;
+ for (c = eob - 2; c >= 1; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const int level = abs(v);
+ const int cost0 = base_cost[coeff_ctx][AOMMIN(level, 3)];
+ if (v) {
+ // sign bit cost
+ cost += av1_cost_literal(1);
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ const int base_range =
+ AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ cost += lps_cost[ctx][base_range];
+ cost += get_golomb_cost(level);
+ }
+ }
+ cost += cost0;
+ }
+ if (c == 0) {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int coeff_ctx = coeff_contexts[pos];
+ const int sign = v >> 31;
+ const int level = (v ^ sign) - sign;
+ cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+
+ if (v) {
+ // sign bit cost
+ const int sign01 = (sign ^ sign) - sign;
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ const int base_range =
+ AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ cost += lps_cost[ctx][base_range];
+ cost += get_golomb_cost(level);
+ }
+ }
+ }
+ return cost;
+}
+
+int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
+ const int plane, const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type, const TXB_CTX *const txb_ctx) {
+ const struct macroblock_plane *p = &x->plane[plane];
+ const int eob = p->eobs[block];
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs =
+ &x->coeff_costs[txs_ctx][plane_type];
+ if (eob == 0) {
+ return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ }
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+#define WAREHOUSE_EFFICIENTS_TXB_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p, \
+ eob, plane_type, coeff_costs, xd, tx_type, \
+ tx_class_literal);
+ switch (tx_class) {
+ WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_2D);
+ WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_HORIZ);
+ WAREHOUSE_EFFICIENTS_TXB_CASE(TX_CLASS_VERT);
+#undef WAREHOUSE_EFFICIENTS_TXB_CASE
+ default: assert(false); return 0;
+ }
+}
+
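+// Added note: level-map trellis. Walks the scan order backwards, lowering
+// individual quantized levels by one or shortening the eob whenever that
+// reduces the RD cost, and finally checks whether coding the whole block as
+// skipped is cheaper.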
+static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+ const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) {
+ int update = 0;
+ if (txb_info->eob == 0) return update;
+ const int16_t *const scan = txb_info->scan_order->scan;
+  // forward optimize the nz_map
+ const int init_eob = txb_info->eob;
+ const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type];
+ const int eob_cost =
+ get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class);
+
+ // backward optimize the level-k map
+ int accu_rate = eob_cost;
+ int64_t accu_dist = 0;
+ int64_t prev_eob_rd_cost = INT64_MAX;
+ int64_t cur_eob_rd_cost = 0;
+
+ {
+ const int si = init_eob - 1;
+ const int coeff_idx = scan[si];
+ LevelDownStats stats;
+ get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info,
+ tx_class);
+ if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
+ update = 1;
+ update_coeff(coeff_idx, stats.low_qc, txb_info);
+ accu_rate += stats.rate_low;
+ accu_dist += stats.dist_low;
+ } else {
+ accu_rate += stats.rate;
+ accu_dist += stats.dist;
+ }
+ }
+
+ int si = init_eob - 2;
+ int8_t has_nz_tail = 0;
+ // eob is not fixed
+ for (; si >= 0 && has_nz_tail < 2; --si) {
+ assert(si != init_eob - 1);
+ const int coeff_idx = scan[si];
+ tran_low_t qc = txb_info->qcoeff[coeff_idx];
+
+ if (qc == 0) {
+ const int coeff_ctx =
+ get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
+ txb_info->tx_size, tx_class);
+ accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ LevelDownStats stats;
+ get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class);
+ // check if it is better to make this the last significant coefficient
+ int cur_eob_rate =
+ get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class);
+ cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0);
+ prev_eob_rd_cost =
+ RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd;
+ if (cur_eob_rd_cost <= prev_eob_rd_cost) {
+ update = 1;
+ for (int j = si + 1; j < txb_info->eob; j++) {
+ const int coeff_pos_j = scan[j];
+ update_coeff(coeff_pos_j, 0, txb_info);
+ }
+ txb_info->eob = si + 1;
+
+ // rerun cost calculation due to change of eob
+ accu_rate = cur_eob_rate;
+ accu_dist = 0;
+ get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class);
+ if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
+ update = 1;
+ update_coeff(coeff_idx, stats.low_qc, txb_info);
+ accu_rate += stats.rate_low;
+ accu_dist += stats.dist_low;
+ } else {
+ accu_rate += stats.rate;
+ accu_dist += stats.dist;
+ }
+
+ // reset non zero tail when new eob is found
+ has_nz_tail = 0;
+ } else {
+ int bUpdCoeff = 0;
+ if (stats.rd_low < stats.rd) {
+ if ((si < txb_info->eob - 1)) {
+ bUpdCoeff = 1;
+ update = 1;
+ }
+ } else {
+ ++has_nz_tail;
+ }
+
+ if (bUpdCoeff) {
+ update_coeff(coeff_idx, stats.low_qc, txb_info);
+ accu_rate += stats.rate_low;
+ accu_dist += stats.dist_low;
+ } else {
+ accu_rate += stats.rate;
+ accu_dist += stats.dist;
+ }
+ }
+ }
+ } // for (si)
+
+ // eob is fixed
+ for (; si >= 0; --si) {
+ assert(si != init_eob - 1);
+ const int coeff_idx = scan[si];
+ tran_low_t qc = txb_info->qcoeff[coeff_idx];
+
+ if (qc == 0) {
+ const int coeff_ctx =
+ get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
+ txb_info->tx_size, tx_class);
+ accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ LevelDownStats stats;
+ get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class);
+
+ int bUpdCoeff = 0;
+ if (stats.rd_low < stats.rd) {
+ if ((si < txb_info->eob - 1)) {
+ bUpdCoeff = 1;
+ update = 1;
+ }
+ }
+ if (bUpdCoeff) {
+ update_coeff(coeff_idx, stats.low_qc, txb_info);
+ accu_rate += stats.rate_low;
+ accu_dist += stats.dist_low;
+ } else {
+ accu_rate += stats.rate;
+ accu_dist += stats.dist;
+ }
+ }
+ } // for (si)
+
+ int non_zero_blk_rate =
+ txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0];
+ prev_eob_rd_cost =
+ RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist);
+
+ int zero_blk_rate =
+ txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1];
+ int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0);
+ if (zero_blk_rd_cost <= prev_eob_rd_cost) {
+ update = 1;
+ for (int j = 0; j < txb_info->eob; j++) {
+ const int coeff_pos_j = scan[j];
+ update_coeff(coeff_pos_j, 0, txb_info);
+ }
+ txb_info->eob = 0;
+ }
+
+ // record total rate cost
+ *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost
+ ? zero_blk_rate
+ : accu_rate + non_zero_blk_rate;
+
+ if (txb_info->eob > 0) {
+ *rate_cost += txb_info->tx_type_cost;
+ }
+
+ return update;
+}
+
+// These numbers are empirically obtained.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+ { 17, 13 },
+ { 16, 10 },
+};
+
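+// Added note: hash-based trellis (hbt). optimize_txb results are cached in a
+// table indexed by a hash of the coding context (row) and a hash of the
+// quantized coefficients (column); on a hit the stored per-coefficient deltas
+// are replayed instead of re-running the trellis.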
+void hbt_init() {
+ hbt_hash_table =
+ aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
+ memset(hbt_hash_table, 0,
+ sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
+ av1_crc32c_calculator_init(&crc_calculator); // 31 bit: qc & ctx
+
+ hbt_needs_init = 0;
+}
+
+void hbt_destroy() { aom_free(hbt_hash_table); }
+
+int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
+ TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+ const LV_MAP_EOB_COST *txb_eob_costs,
+ const struct macroblock_plane *p, int block, int fast_mode,
+ int *rate_cost) {
+ (void)fast_mode;
+ const int16_t *scan = txb_info->scan_order->scan;
+ int prev_eob = txb_info->eob;
+ assert(HBT_EOB <= 16); // Lengthen array if allowing longer eob.
+ int32_t prev_coeff[16];
+ for (int i = 0; i < prev_eob; i++) {
+ prev_coeff[i] = txb_info->qcoeff[scan[i]];
+ }
+ for (int i = prev_eob; i < HBT_EOB; i++) {
+    prev_coeff[i] = 0;  // For compiler peace of mind.
+ }
+
+ av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
+ txb_info->levels);
+
+ const int update =
+ optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
+
+ // Overwrite old entry
+ uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
+ uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .rate_cost = *rate_cost;
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1;
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .hbt_qc_hash = hbt_qc_hash;
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .hbt_ctx_hash = hbt_ctx_hash;
+ assert(prev_eob >= txb_info->eob); // eob can't get longer
+ for (int i = 0; i < txb_info->eob; i++) {
+ // Record how coeff changed. Convention: towards zero is negative.
+ if (txb_info->qcoeff[scan[i]] > 0)
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i];
+ else
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]];
+ }
+ for (int i = txb_info->eob; i < prev_eob; i++) {
+ // If eob got shorter, record that all after it changed to zero.
+ if (prev_coeff[i] > 0)
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .deltas[i] = -prev_coeff[i];
+ else
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .deltas[i] = prev_coeff[i];
+ }
+ for (int i = prev_eob; i < HBT_EOB; i++) {
+ // Record 'no change' after optimized coefficients run out.
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .deltas[i] = 0;
+ }
+
+ if (update) {
+ p->eobs[block] = txb_info->eob;
+ p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+ txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+ }
+ return txb_info->eob;
+}
+
+int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
+ TxbInfo *txb_info, const struct macroblock_plane *p, int block,
+ int *rate_cost) {
+ const int16_t *scan = txb_info->scan_order->scan;
+ int new_eob = 0;
+ int update = 0;
+
+ for (int i = 0; i < txb_info->eob; i++) {
+    // Delta convention: negative deltas move a coefficient towards zero, so
+    // only those are applied here.
+ if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .deltas[i] < 0) {
+ if (txb_info->qcoeff[scan[i]] > 0)
+ txb_info->qcoeff[scan[i]] +=
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .deltas[i];
+ else
+ txb_info->qcoeff[scan[i]] -=
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .deltas[i];
+
+ update = 1;
+ update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info);
+ }
+ if (txb_info->qcoeff[scan[i]]) new_eob = i + 1;
+ }
+
+  // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but it is
+  // expensive and gives little benefit as long as qc_hash has a high enough
+  // bit depth.
+ *rate_cost =
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .rate_cost;
+
+ if (update) {
+ txb_info->eob = new_eob;
+ p->eobs[block] = txb_info->eob;
+ p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+ txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+ }
+
+ return txb_info->eob;
+}
+
+int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
+ TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+ const LV_MAP_EOB_COST *txb_eob_costs,
+ const struct macroblock_plane *p, int block, int fast_mode,
+ int *rate_cost) {
+ // Check for qcoeff match
+ int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
+ int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
+
+ if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .hbt_qc_hash == hbt_qc_hash &&
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .hbt_ctx_hash == hbt_ctx_hash &&
+ hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
+ .init) {
+ return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block,
+ rate_cost);
+ } else {
+ return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
+ txb_eob_costs, p, block, fast_mode, rate_cost);
+ }
+}
+
+int hbt_create_hashes(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+ const LV_MAP_EOB_COST *txb_eob_costs,
+ const struct macroblock_plane *p, int block,
+ int fast_mode, int *rate_cost) {
+ // Initialize hash table if needed.
+ if (hbt_needs_init) {
+ hbt_init();
+ }
+
+ //// Hash creation
+ uint8_t txb_hash_data[256]; // Asserts below to ensure enough space.
+ const int16_t *scan = txb_info->scan_order->scan;
+ uint8_t chunk = 0;
+ int hash_data_index = 0;
+
+ // Make qc_hash.
+ int packing_index = 0; // needed for packing.
+ for (int i = 0; i < txb_info->eob; i++) {
+ tran_low_t prechunk = txb_info->qcoeff[scan[i]];
+
+ // Softening: Improves speed. Aligns with signed deltas.
+ if (prechunk < 0) prechunk *= -1;
+
+ // Early kick out: Don't apply feature if there are large coeffs:
+ // If this kickout value is removed or raised beyond int8_t,
+ // widen deltas type in OptTxbQcoeff struct.
+ assert((int8_t)HBT_KICKOUT == HBT_KICKOUT); // If not, widen types.
+ if (prechunk > HBT_KICKOUT) {
+ av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
+ txb_info->levels);
+
+ const int update =
+ optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
+
+ if (update) {
+ p->eobs[block] = txb_info->eob;
+ p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+ txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+ }
+ return txb_info->eob;
+ }
+
+ // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes
+ if (packing_index == 0) txb_hash_data[hash_data_index] = 0;
+ chunk = prechunk << packing_index;
+ packing_index += 2;
+ txb_hash_data[hash_data_index] |= chunk;
+
+ // Full byte:
+ if (packing_index == 8) {
+ packing_index = 0;
+ hash_data_index++;
+ }
+ }
+ // Needed when packing_index != 0, to include final byte.
+ hash_data_index++;
+ assert(hash_data_index <= 64);
+ // 31 bit qc_hash: index to array
+ uint32_t hbt_qc_hash =
+ av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
+
+ // Make ctx_hash.
+ hash_data_index = 0;
+ tran_low_t prechunk;
+
+ for (int i = 0; i < txb_info->eob; i++) {
+ // Save as magnitudes towards or away from zero.
+ if (txb_info->tcoeff[scan[i]] >= 0)
+ prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]];
+ else
+ prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]];
+
+ chunk = prechunk & 0xff;
+ txb_hash_data[hash_data_index++] = chunk;
+ }
+
+ // Extra ctx data:
+ // Include dequants.
+ txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff;
+ txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff;
+ chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff;
+ txb_hash_data[hash_data_index++] = chunk;
+ chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff;
+ txb_hash_data[hash_data_index++] = chunk;
+ // eob
+ chunk = txb_info->eob & 0xff;
+ txb_hash_data[hash_data_index++] = chunk;
+ // rdmult (int64)
+ chunk = txb_info->rdmult & 0xff;
+ txb_hash_data[hash_data_index++] = chunk;
+ // tx_type
+ chunk = txb_info->tx_type & 0xff;
+ txb_hash_data[hash_data_index++] = chunk;
+ // base_eob_cost
+ for (int i = 1; i < 3; i++) { // i = 0 are softened away
+ for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) {
+ chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8;
+ txb_hash_data[hash_data_index++] = chunk;
+ }
+ }
+ // eob_cost
+ for (int i = 0; i < 11; i++) {
+ for (int j = 0; j < 2; j++) {
+ chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8;
+ txb_hash_data[hash_data_index++] = chunk;
+ }
+ }
+ // dc_sign_cost
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < DC_SIGN_CONTEXTS; j++) {
+ chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8;
+ txb_hash_data[hash_data_index++] = chunk;
+ }
+ }
+
+ assert(hash_data_index <= 256);
+ // 31 bit ctx_hash: used to index table
+ uint32_t hbt_ctx_hash =
+ av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
+ //// End hash creation
+
+ return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
+ txb_eob_costs, p, block, fast_mode, rate_cost);
+}
+
+static AOM_FORCE_INLINE int get_coeff_cost_simple(
+ int ci, tran_low_t abs_qc, int coeff_ctx,
+ const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
+ const uint8_t *levels) {
+ // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+ // and not the last (scan_idx != eob - 1)
+ assert(ci > 0);
+ int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ if (abs_qc) {
+ cost += av1_cost_literal(1);
+ if (abs_qc > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+ cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
+ cost += get_golomb_cost(abs_qc);
+ }
+ }
+ return cost;
+}
+
+static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
+ int sign, int coeff_ctx,
+ int dc_sign_ctx,
+ const LV_MAP_COEFF_COST *txb_costs,
+ int bwl, TX_CLASS tx_class,
+ const uint8_t *levels) {
+ int cost = 0;
+ if (is_last) {
+ cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+ } else {
+ cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ }
+ if (abs_qc != 0) {
+ if (ci == 0) {
+ cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+ } else {
+ cost += av1_cost_literal(1);
+ }
+ if (abs_qc > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+ cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
+ cost += get_golomb_cost(abs_qc);
+ }
+ }
+ return cost;
+}
+
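+// Added note: computes the quantized/dequantized pair for a magnitude lowered
+// by one; the (-sign ^ x) + sign expression re-applies the original sign
+// without a branch.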
+static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
+ int shift, tran_low_t *qc_low,
+ tran_low_t *dqc_low) {
+ tran_low_t abs_qc_low = abs_qc - 1;
+ *qc_low = (-sign ^ abs_qc_low) + sign;
+ assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low);
+ tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+ *dqc_low = (-sign ^ abs_dqc_low) + sign;
+ assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low);
+}
+
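+// Added note: general per-coefficient update (valid for DC and for the last
+// coefficient). Keeps either the current quantized level or the level lowered
+// by one, whichever gives the lower RD cost, and accumulates rate and
+// distortion.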
+static INLINE void update_coeff_general(
+ int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
+ TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift,
+ int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels) {
+ const int dqv = dequant[si != 0];
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int is_last = si == (eob - 1);
+ const int coeff_ctx = get_lower_levels_ctx_general(
+ is_last, si, bwl, height, levels, ci, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ const int sign = (qc < 0) ? 1 : 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+ const int rate =
+ get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+ const int64_t rd = RDCOST(rdmult, rate, dist);
+
+ tran_low_t qc_low, dqc_low;
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ const tran_low_t abs_qc_low = abs_qc - 1;
+ const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
+ const int rate_low =
+ get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+ const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+ if (rd_low < rd) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ *accu_dist += dist_low - dist0;
+ } else {
+ *accu_rate += rate;
+ *accu_dist += dist - dist0;
+ }
+ }
+}
+
+static AOM_FORCE_INLINE void update_coeff_simple(
+ int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+ int bwl, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+ const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ uint8_t *levels) {
+ const int dqv = dequant[1];
+ (void)eob;
+ // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
+ // and not the last (scan_idx != eob - 1)
+ assert(si != eob - 1);
+ assert(si > 0);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs,
+ bwl, tx_class, levels);
+ if (abs(dqc) < abs(tqc)) {
+ *accu_rate += rate;
+ return;
+ }
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+ const int64_t rd = RDCOST(rdmult, rate, dist);
+
+ const int sign = (qc < 0) ? 1 : 0;
+ tran_low_t qc_low, dqc_low;
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ const tran_low_t abs_qc_low = abs_qc - 1;
+ const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
+ const int rate_low = get_coeff_cost_simple(
+ ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels);
+ const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+ if (rd_low < rd) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ *accu_rate += rate_low;
+ } else {
+ *accu_rate += rate;
+ }
+ }
+}
+
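+// Added note: coefficient update used while the eob can still move. Besides
+// lowering the level, it evaluates making this position the new last
+// significant coefficient and zeroing the remaining nonzero coefficients
+// after it.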
+static AOM_FORCE_INLINE void update_coeff_eob(
+ int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+ int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
+ int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness) {
+ const int dqv = dequant[si != 0];
+ assert(si != *eob - 1);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ int lower_level = 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int sign = (qc < 0) ? 1 : 0;
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+ int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0;
+ int rate =
+ get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+ txb_costs, bwl, tx_class, levels);
+ int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
+
+ tran_low_t qc_low, dqc_low;
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ const tran_low_t abs_qc_low = abs_qc - 1;
+ const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
+ const int rate_low =
+ get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx,
+ txb_costs, bwl, tx_class, levels);
+ const int64_t rd_low =
+ RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+
+ int lower_level_new_eob = 0;
+ const int new_eob = si + 1;
+ uint8_t tmp_levels[3];
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ const int last_ci = nz_ci[ni];
+ tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)];
+ levels[get_padded_idx(last_ci, bwl)] = 0;
+ }
+
+ const int coeff_ctx_new_eob = get_lower_levels_ctx_general(
+ 1, si, bwl, height, levels, ci, tx_size, tx_class);
+ const int new_eob_cost =
+ get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
+ int rate_coeff_eob =
+ new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign,
+ coeff_ctx_new_eob, dc_sign_ctx,
+ txb_costs, bwl, tx_class, levels);
+ int64_t dist_new_eob = dist;
+ int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
+
+ if (abs_qc_low > 0) {
+ const int rate_coeff_eob_low =
+ new_eob_cost +
+ get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob,
+ dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+ const int64_t dist_new_eob_low = dist_low;
+ const int64_t rd_new_eob_low =
+ RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
+ if (rd_new_eob_low < rd_new_eob) {
+ lower_level_new_eob = 1;
+ rd_new_eob = rd_new_eob_low;
+ rate_coeff_eob = rate_coeff_eob_low;
+ dist_new_eob = dist_new_eob_low;
+ }
+ }
+
+ if (rd_low < rd) {
+ lower_level = 1;
+ rd = rd_low;
+ rate = rate_low;
+ dist = dist_low;
+ }
+
+ if (sharpness == 0 && rd_new_eob < rd) {
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ int last_ci = nz_ci[ni];
+ // levels[get_padded_idx(last_ci, bwl)] = 0;
+ qcoeff[last_ci] = 0;
+ dqcoeff[last_ci] = 0;
+ }
+ *eob = new_eob;
+ *nz_num = 0;
+ *accu_rate = rate_coeff_eob;
+ *accu_dist = dist_new_eob;
+ lower_level = lower_level_new_eob;
+ } else {
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ const int last_ci = nz_ci[ni];
+ levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni];
+ }
+ *accu_rate += rate;
+ *accu_dist += dist;
+ }
+
+ if (lower_level) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ }
+ if (qcoeff[ci]) {
+ nz_ci[*nz_num] = ci;
+ ++*nz_num;
+ }
+ }
+}
+
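+// Added note: final skip decision (only when sharpness == 0). If signaling
+// the block as all-zero is cheaper in RD terms, zero the remaining nonzero
+// coefficients and reset eob to 0.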
+static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
+ int nz_num, int *nz_ci, int64_t rdmult,
+ int skip_cost, int non_skip_cost,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ int sharpness) {
+ const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
+ const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
+ if (sharpness == 0 && rd_new_eob < rd) {
+ for (int i = 0; i < nz_num; ++i) {
+ const int ci = nz_ci[i];
+ qcoeff[ci] = 0;
+ dqcoeff[ci] = 0;
+ // no need to set up levels because this is the last step
+ // levels[get_padded_idx(ci, bwl)] = 0;
+ }
+ *accu_rate = 0;
+ *eob = 0;
+ }
+}
+
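+// Added note: trellis-style coefficient optimization over the level map.
+// Coefficients are processed in reverse scan order with update_coeff_eob,
+// update_coeff_simple and update_coeff_general, followed by a final skip
+// decision.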
+int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
+ const int16_t *dequant = p->dequant_QTX;
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ assert(width == (1 << bwl));
+ const int is_inter = is_inter_block(mbmi);
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *txb_eob_costs =
+ &x->eob_costs[eob_multi_size][plane_type];
+
+ const int shift = av1_get_tx_scale(tx_size);
+ const int64_t rdmult =
+ ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+ 2) >>
+ (sharpness +
+ (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
+ ? 7 - mbmi->segment_id
+ : 2) +
+ (cpi->oxcf.aq_mode != VARIANCE_AQ &&
+ cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0
+ ? (3 - x->sb_energy_level)
+ : 0));
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+
+ av1_txb_init_levels(qcoeff, width, height, levels);
+
+  // TODO(angiebird): check iqmatrix
+
+ const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+ const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ int eob = p->eobs[block];
+ const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
+ int accu_rate = eob_cost;
+ int64_t accu_dist = 0;
+ int si = eob - 1;
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const tran_low_t abs_qc = abs(qc);
+ const int sign = qc < 0;
+ const int max_nz_num = 2;
+ int nz_num = 1;
+ int nz_ci[3] = { ci, 0, 0 };
+ if (abs_qc >= 2) {
+ update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
+ bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels);
+ --si;
+ } else {
+ assert(abs_qc == 1);
+ const int coeff_ctx = get_lower_levels_ctx_general(
+ 1, si, bwl, height, levels, ci, tx_size, tx_class);
+ accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx,
+ txb_ctx->dc_sign_ctx, txb_costs, bwl,
+ tx_class, levels);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+ accu_dist += dist - dist0;
+ --si;
+ }
+
+#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 0 && nz_num <= max_nz_num; --si) { \
+ update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \
+ tx_size, tx_class_literal, bwl, height, \
+ txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
+ txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \
+ levels, sharpness); \
+ } \
+ break;
+ switch (tx_class) {
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_EOB_CASE
+ default: assert(false);
+ }
+
+ if (si == -1 && nz_num <= max_nz_num) {
+ update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
+ non_skip_cost, qcoeff, dqcoeff, sharpness);
+ }
+
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 1; --si) { \
+ update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \
+ rdmult, shift, dequant, scan, txb_costs, tcoeff, \
+ qcoeff, dqcoeff, levels); \
+ } \
+ break;
+ switch (tx_class) {
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_SIMPLE_CASE
+ default: assert(false);
+ }
+
+ // DC position
+ if (si == 0) {
+ // no need to update accu_dist because it's not used after this point
+ int64_t dummy_dist = 0;
+ update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
+ bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels);
+ }
+
+ const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+ if (eob == 0)
+ accu_rate += skip_cost;
+ else
+ accu_rate += non_skip_cost + tx_type_cost;
+
+ p->eobs[block] = eob;
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
+
+ *rate_cost = accu_rate;
+ return eob;
+}
+
+// This function is deprecated, but we keep it here because hash trellis
+// is not integrated with av1_optimize_txb_new yet
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int blk_row, int blk_col, int block, TX_SIZE tx_size,
+ TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
+ tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
+ const int16_t *dequant = p->dequant_QTX;
+ const int seg_eob = av1_get_max_eob(tx_size);
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int is_inter = is_inter_block(mbmi);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST txb_eob_costs =
+ x->eob_costs[eob_multi_size][plane_type];
+
+ const int shift = av1_get_tx_scale(tx_size);
+ const int64_t rdmult =
+ ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+ 2) >>
+ 2;
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+ const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+ const qm_val_t *iqmatrix =
+ IS_2D_TRANSFORM(tx_type)
+ ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
+ : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+ assert(width == (1 << bwl));
+ const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+ TxbInfo txb_info = {
+ qcoeff, levels, dqcoeff, tcoeff, dequant, shift,
+ tx_size, txs_ctx, tx_type, bwl, width, height,
+ eob, seg_eob, scan_order, txb_ctx, rdmult, &cm->coeff_ctx_table,
+ iqmatrix, tx_type_cost,
+ };
+
+ // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
+ // by storing the coefficient deltas in a hash table.
+ // Currently disabled in speedfeatures.c
+ if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) {
+ return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block,
+ fast_mode, rate_cost);
+ }
+
+ av1_txb_init_levels(qcoeff, width, height, levels);
+
+ const int update =
+ optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost);
+
+ if (update) {
+ p->eobs[block] = txb_info.eob;
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob);
+ }
+ return txb_info.eob;
+}
+
+int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob) {
+ const int16_t *const scan = scan_order->scan;
+ int cul_level = 0;
+ int c;
+
+ if (eob == 0) return 0;
+ for (c = 0; c < eob; ++c) {
+ cul_level += abs(qcoeff[scan[c]]);
+ if (cul_level > COEFF_CONTEXT_MASK) break;
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+ set_dc_sign(&cul_level, qcoeff[0]);
+
+ return cul_level;
+}
+
+void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const uint16_t eob = p->eobs[block];
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+ av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
+ blk_row);
+}
+
+static void update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int blk_row, int blk_col, int plane,
+ TX_SIZE tx_size, FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int is_inter = is_inter_block(mbmi);
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+#if !CONFIG_ENTROPY_STATS
+ (void)counts;
+#endif // !CONFIG_ENTROPY_STATS
+
+ // Only y plane's tx_type is updated
+ if (plane > 0) return;
+ TX_TYPE tx_type = av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, tx_size,
+ cm->reduced_tx_set_used);
+ if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+ cm->base_qindex > 0 && !mbmi->skip &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+ if (eset > 0) {
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+ if (is_inter) {
+ if (allow_update_cdf) {
+ update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+#if CONFIG_ENTROPY_STATS
+ ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(
+ fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+ }
+ }
+ }
+}
+
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int eob = p->eobs[block];
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col,
+ pd->left_context + blk_row, &txb_ctx);
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const uint8_t allow_update_cdf = args->allow_update_cdf;
+ const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if CONFIG_ENTROPY_STATS
+ int cdf_idx = cm->coef_cdf_category;
+#endif // CONFIG_ENTROPY_STATS
+
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0,
+ 2);
+ }
+
+ x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
+ x->mbmi_ext->eobs[plane][block] = eob;
+
+ if (eob == 0) {
+ av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row);
+ return;
+ }
+
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ const int segment_id = mbmi->segment_id;
+ const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ update_tx_type_count(cm, xd, blk_row, blk_col, plane, tx_size, td->counts,
+ allow_update_cdf);
+
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+#if CONFIG_ENTROPY_STATS
+ av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+ td->counts, allow_update_cdf);
+#else
+ av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
+ allow_update_cdf);
+#endif
+
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ for (int c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const tran_low_t level = abs(v);
+
+ if (allow_update_cdf) {
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+ update_cdf(
+ ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3) - 1, 3);
+ } else {
+ update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3), 4);
+ }
+ }
+ {
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3) - 1];
+ } else {
+ ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3)];
+#endif
+ }
+ }
+ if (level > NUM_BASE_LEVELS) {
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx],
+ k, BR_CDF_SIZE);
+ }
+ for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type][lps]
+ [br_ctx][lps == k];
+#endif // CONFIG_ENTROPY_STATS
+ if (lps == k) break;
+ }
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx][k];
+#endif
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+
+ // Update the context needed to code the DC sign (if applicable)
+ if (tcoeff[0] != 0) {
+ const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+ const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
+ x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+ }
+
+ const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+ av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
+ blk_row);
+}
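
Most of the work above is the stream of update_cdf() calls that adapt the per-context coding distributions toward the symbols that were actually coded. Below is a small standalone sketch of that kind of adaptive-CDF update, not part of the patch; the 15-bit scale, the fixed adaptation rate and the CDF orientation are simplifying assumptions rather than libaom's exact update rule.

#include <stdio.h>

#define PROB_TOP 32768  /* assumed 15-bit fixed-point probability scale */

/* cdf[i] holds P(symbol <= i) scaled to PROB_TOP; cdf[nsymbs-1] is implicit. */
static void toy_update_cdf(unsigned short *cdf, int symbol, int nsymbs) {
  const int rate = 5;  /* assumed fixed adaptation speed */
  for (int i = 0; i < nsymbs - 1; i++) {
    if (i >= symbol)
      cdf[i] += (PROB_TOP - cdf[i]) >> rate;  /* pull toward certainty */
    else
      cdf[i] -= cdf[i] >> rate;               /* pull toward zero */
  }
}

int main(void) {
  unsigned short cdf[3] = { 8192, 16384, 24576 };  /* uniform over 4 symbols */
  for (int n = 0; n < 100; n++) toy_update_cdf(cdf, 0, 4);  /* keep coding 0 */
  printf("P(symbol <= 0) ~= %.2f\n", (double)cdf[0] / PROB_TOP);
  return 0;
}

When CONFIG_ENTROPY_STATS is enabled, the same symbols are additionally tallied into td->counts for offline analysis, as the #if blocks above show.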
+
+void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ int mi_row, int mi_col, uint8_t allow_update_cdf) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct tokenize_b_args arg = { cpi, td, NULL, 0, allow_update_cdf };
+ (void)rate;
+ (void)mi_row;
+ (void)mi_col;
+ if (mbmi->skip) {
+ av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+ return;
+ }
+
+ if (!dry_run) {
+ av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
+ av1_update_and_record_txb_context, &arg,
+ num_planes);
+ } else if (dry_run == DRY_RUN_NORMAL) {
+ av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
+ av1_update_txb_context_b, &arg, num_planes);
+ } else {
+ printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
+ assert(0);
+ }
+}
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
new file mode 100644
index 000000000..40ae343b0
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODETXB_H_
+#define AOM_AV1_ENCODER_ENCODETXB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/bitwriter.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct TxbInfo {
+ tran_low_t *qcoeff;
+ uint8_t *levels; // absolute values and clamped to 255.
+ tran_low_t *dqcoeff;
+ const tran_low_t *tcoeff;
+ const int16_t *dequant;
+ int shift;
+ TX_SIZE tx_size;
+ TX_SIZE txs_ctx;
+ TX_TYPE tx_type;
+ int bwl;
+ int width;
+ int height;
+ int eob;
+ int seg_eob;
+ const SCAN_ORDER *scan_order;
+ TXB_CTX *txb_ctx;
+ int64_t rdmult;
+ const LV_MAP_CTX_TABLE *coeff_ctx_table;
+ const qm_val_t *iqmatrix;
+ int tx_type_cost;
+} TxbInfo;
+
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+void av1_free_txb_buf(AV1_COMP *cpi);
+int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
+ const int plane, const int block, const TX_SIZE tx_size,
+ const TX_TYPE tx_type, const TXB_CTX *const txb_ctx);
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ TX_SIZE tx_size, const tran_low_t *tcoeff,
+ uint16_t eob, TXB_CTX *txb_ctx);
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
+ int mi_col, aom_writer *w, BLOCK_SIZE bsize);
+int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+ const SCAN_ORDER *scan_order, int eob);
+void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+ int mi_row, int mi_col, uint8_t allow_update_cdf);
+
+void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg);
+
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col);
+
+void hbt_destroy(void);
+int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
new file mode 100644
index 000000000..e8ac30bb5
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+ for (int i = 0; i < REFERENCE_MODES; i++)
+ td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
+
+ for (int i = 0; i < REF_FRAMES; i++)
+ td->rd_counts.global_motion_used[i] +=
+ td_t->rd_counts.global_motion_used[i];
+
+ td->rd_counts.compound_ref_used_flag |=
+ td_t->rd_counts.compound_ref_used_flag;
+ td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
+}
+
+static int enc_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ AV1_COMP *const cpi = thread_data->cpi;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int t;
+
+ (void)unused;
+
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ return 1;
+}
+
+static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ CHECK_MEM_ERROR(cm, cpi->workers,
+ aom_malloc(num_workers * sizeof(*cpi->workers)));
+
+ CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ thread_data->cpi = cpi;
+
+ if (i < num_workers - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->pc_tree = NULL;
+ av1_setup_pc_tree(cm, thread_data->td);
+
+ CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->above_pred_buf)));
+ CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->left_pred_buf)));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0])));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Allocate buffers used by palette coding mode.
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->palette_buffer,
+ aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*thread_data->td->tmp_conv_dst)));
+ for (int j = 0; j < 2; ++j) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->tmp_obmc_bufs[j],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->tmp_obmc_bufs[j])));
+ }
+
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->td = &cpi->td;
+ }
+ winterface->sync(worker);
+ }
+}
+
+static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ // Encode a frame
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i == cpi->num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+}
+
+static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ // Encoding ends.
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+}
+
+static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+ cpi->intrabc_used |= thread_data->td->intrabc_used_this_tile;
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
+ }
+ }
+}
+
+static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
+ thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
+ thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ memcpy(thread_data->td->hash_value_buffer[x][y],
+ cpi->td.mb.hash_value_buffer[x][y],
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0]));
+ thread_data->td->mb.hash_value_buffer[x][y] =
+ thread_data->td->hash_value_buffer[x][y];
+ }
+ }
+ thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
+ }
+ if (thread_data->td->counts != &cpi->counts) {
+ memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
+ }
+
+ if (i < num_workers - 1) {
+ thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.tmp_obmc_bufs[j] =
+ thread_data->td->tmp_obmc_bufs[j];
+ }
+
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
+ thread_data->td->mb.tmp_obmc_bufs[j];
+ }
+ }
+ }
+}
+
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows);
+
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows)
+ av1_alloc_tile_data(cpi);
+
+ av1_init_tile_data(cpi);
+ // Only run once to create threads and allocate thread data.
+ if (cpi->num_workers == 0) {
+ create_enc_workers(cpi, num_workers);
+ } else {
+ num_workers = AOMMIN(num_workers, cpi->num_workers);
+ }
+ prepare_enc_workers(cpi, enc_worker_hook, num_workers);
+ launch_enc_workers(cpi, num_workers);
+ sync_enc_workers(cpi, num_workers);
+ accumulate_counters_enc_workers(cpi, num_workers);
+}
+
+// Accumulate frame counts. FRAME_COUNTS consists solely of 'unsigned int'
+// members, so we treat it as an array, and sum over the whole length.
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
+ const FRAME_COUNTS *counts) {
+ unsigned int *const acc = (unsigned int *)acc_counts;
+ const unsigned int *const cnt = (const unsigned int *)counts;
+
+ const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
+
+ for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i];
+}
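
The accumulation above works only because FRAME_COUNTS is made up exclusively of unsigned int members, so the whole struct can be walked as one flat array. Here is a self-contained sketch of the same trick on a toy counts struct; the struct layout and the numbers are made up for illustration.

#include <stddef.h>
#include <stdio.h>

typedef struct {
  unsigned int intra_mode[4];
  unsigned int skip[2];
  unsigned int dc_sign[2];
} ToyCounts;

static void accumulate_counts(ToyCounts *acc, const ToyCounts *add) {
  unsigned int *a = (unsigned int *)acc;
  const unsigned int *b = (const unsigned int *)add;
  const size_t n = sizeof(ToyCounts) / sizeof(unsigned int);
  for (size_t i = 0; i < n; i++) a[i] += b[i];  /* member-order independent */
}

int main(void) {
  ToyCounts total = { { 4, 3, 2, 1 }, { 9, 1 }, { 5, 5 } };
  ToyCounts tile = { { 1, 1, 1, 1 }, { 2, 0 }, { 1, 3 } };
  accumulate_counts(&total, &tile);
  printf("%u %u %u\n", total.intra_mode[0], total.skip[0], total.dc_sign[1]);
  return 0;
}

accumulate_counters_enc_workers() above relies on exactly this to merge each tile thread's counts back into cpi->counts after the workers finish.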
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
new file mode 100644
index 000000000..5de4b4803
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ETHREAD_H_
+#define AOM_AV1_ENCODER_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+ struct AV1_COMP *cpi;
+ struct ThreadData *td;
+ int start;
+} EncWorkerData;
+
+void av1_encode_tiles_mt(struct AV1_COMP *cpi);
+
+void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
+ const struct FRAME_COUNTS *counts);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ETHREAD_H_
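
av1_encode_tiles_mt() hands tiles to workers in the round-robin order used by enc_worker_hook(): worker i takes tiles i, i + N, i + 2N, and so on, for N workers. A minimal sketch of that schedule follows; the tile and worker counts are made up.

#include <stdio.h>

int main(void) {
  const int tile_rows = 2, tile_cols = 3, num_workers = 2;  /* illustrative */
  for (int w = 0; w < num_workers; w++) {
    printf("worker %d:", w);
    for (int t = w; t < tile_rows * tile_cols; t += num_workers)
      printf(" (row %d, col %d)", t / tile_cols, t % tile_cols);
    printf("\n");
  }
  return 0;
}

In the code above, the last worker slot runs synchronously on the main thread (winterface->execute()), and num_workers is capped at the tile count, so threads are never left with nothing to encode.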
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
new file mode 100644
index 000000000..e9621a574
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/extend.h"
+
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+ uint8_t *dst, int dst_pitch, int w, int h,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+
+ // Copy the leftmost and rightmost columns out.
+ const uint8_t *src_ptr1 = src;
+ const uint8_t *src_ptr2 = src + w - 1;
+ uint8_t *dst_ptr1 = dst - extend_left;
+ uint8_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+ memset(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+ dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize);
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize);
+ dst_ptr2 += dst_pitch;
+ }
+}
+
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+ uint8_t *dst8, int dst_pitch, int w,
+ int h, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ // Copy the leftmost and rightmost columns out.
+ const uint16_t *src_ptr1 = src;
+ const uint16_t *src_ptr2 = src + w - 1;
+ uint16_t *dst_ptr1 = dst - extend_left;
+ uint16_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0]));
+ aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+ dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0]));
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0]));
+ dst_ptr2 += dst_pitch;
+ }
+}
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ // Extend src frame in buffer
+ // Altref filtering assumes 16 pixel extension
+ const int et_y = 16;
+ const int el_y = 16;
+ // Motion estimation may use src block variance with the block size up
+ // to 64x64, so the right and bottom need to be extended to a multiple of
+ // 64, or by at least 16 pixels, whichever is greater.
+ const int er_y =
+ AOMMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+ src->y_crop_width;
+ const int eb_y =
+ AOMMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+ src->y_crop_height;
+ const int uv_width_subsampling = (src->uv_width != src->y_width);
+ const int uv_height_subsampling = (src->uv_height != src->y_height);
+ const int et_uv = et_y >> uv_height_subsampling;
+ const int el_uv = el_y >> uv_width_subsampling;
+ const int eb_uv = eb_y >> uv_height_subsampling;
+ const int er_uv = er_y >> uv_width_subsampling;
+
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width,
+ src->y_crop_height, et_y, el_y, eb_y, er_y);
+
+ highbd_copy_and_extend_plane(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+
+ highbd_copy_and_extend_plane(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+ return;
+ }
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width, src->y_crop_height,
+ et_y, el_y, eb_y, er_y);
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
+ et_uv, el_uv, eb_uv, er_uv);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
+ et_uv, el_uv, eb_uv, er_uv);
+}
+
+void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw) {
+ // If the side is not touching the boundary then don't extend.
+ const int et_y = srcy ? 0 : dst->border;
+ const int el_y = srcx ? 0 : dst->border;
+ const int eb_y = srcy + srch != src->y_height
+ ? 0
+ : dst->border + dst->y_height - src->y_height;
+ const int er_y = srcx + srcw != src->y_width
+ ? 0
+ : dst->border + dst->y_width - src->y_width;
+ const int src_y_offset = srcy * src->y_stride + srcx;
+ const int dst_y_offset = srcy * dst->y_stride + srcx;
+
+ const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+ const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+ const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+ const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
+ const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+ const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+ const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
+ dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
+ et_y, el_y, eb_y, er_y);
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
+ dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
+ srch_uv, et_uv, el_uv, eb_uv, er_uv);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
+ dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
+ srch_uv, et_uv, el_uv, eb_uv, er_uv);
+}
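
copy_and_extend_plane() above replicates the outermost source pixels into the border: each row's leftmost and rightmost samples are first smeared sideways, then the fully extended top and bottom rows are copied upward and downward. Below is a self-contained sketch of that scheme on a tiny plane, not part of the patch; the 4x3 plane and the 2-pixel border are arbitrary.

#include <stdio.h>
#include <string.h>

#define W 4
#define H 3
#define B 2                       /* border on every side (assumed) */
#define STRIDE (W + 2 * B)

int main(void) {
  static const unsigned char src[H][W] = {
    { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 }
  };
  unsigned char dst[(H + 2 * B) * STRIDE];
  unsigned char *base = dst + B * STRIDE + B;  /* top-left of the visible area */

  /* Rows: copy the source, then smear the first/last column into the border. */
  for (int r = 0; r < H; r++) {
    memcpy(base + r * STRIDE, src[r], W);
    memset(base + r * STRIDE - B, src[r][0], B);
    memset(base + r * STRIDE + W, src[r][W - 1], B);
  }
  /* Columns: replicate the fully extended top and bottom rows outward. */
  for (int r = 0; r < B; r++) {
    memcpy(base - (r + 1) * STRIDE - B, base - B, STRIDE);
    memcpy(base + (H + r) * STRIDE - B, base + (H - 1) * STRIDE - B, STRIDE);
  }
  for (int r = 0; r < H + 2 * B; r++) {
    for (int c = 0; c < STRIDE; c++) printf("%3d", dst[r * STRIDE + c]);
    printf("\n");
  }
  return 0;
}

The high-bit-depth variant above does the same with uint16_t samples, using aom_memset16() in place of memset().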
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
new file mode 100644
index 000000000..e0432cc97
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTEND_H_
+#define AOM_AV1_ENCODER_EXTEND_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_EXTEND_H_
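
av1_copy_and_extend_frame() sizes the right and bottom extension so the padded plane reaches the next multiple of 64 or grows by at least 16 pixels, whichever is larger. The small sketch below walks through that arithmetic; it assumes the aligned and cropped widths coincide, which is not always the case in the encoder.

#include <stdio.h>

#define ALIGN_POWER_OF_TWO(v, n) (((v) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
  const int widths[] = { 1280, 1920, 1000 };  /* illustrative frame widths */
  for (int i = 0; i < 3; i++) {
    const int w = widths[i];
    const int er = MAX(w + 16, ALIGN_POWER_OF_TWO(w, 6)) - w;
    printf("width %4d -> right extension %2d\n", w, er);
  }
  return 0;
}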
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
new file mode 100644
index 000000000..69dd20c52
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -0,0 +1,3480 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+#include "aom_dsp/variance.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h" // av1_setup_dst_planes()
+#include "av1/common/txb_common.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#define OUTPUT_FPF 0
+#define ARF_STATS_OUTPUT 0
+
+#define GROUP_ADAPTIVE_MAXQ 1
+
+#define BOOST_BREAKOUT 12.5
+#define BOOST_FACTOR 12.5
+#define FACTOR_PT_LOW 0.70
+#define FACTOR_PT_HIGH 0.90
+#define FIRST_PASS_Q 10.0
+#define GF_MAX_BOOST 90.0
+#define INTRA_MODE_PENALTY 1024
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_ARF_GF_BOOST 240
+#define MIN_DECAY_FACTOR 0.01
+#define MIN_KF_BOOST 300
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+#define DEFAULT_GRP_WEIGHT 1.0
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.75
+#define MIN_FWD_KF_INTERVAL 8
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+// Resets the first pass stats read position to the given entry in the stats
+// buffer.
+static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
+ p->stats_in = position;
+}
+
+// Read frame stats at an offset from the current position.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+ if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
+ (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
+ return NULL;
+ }
+
+ return &p->stats_in[offset];
+}
+
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+ if (p->stats_in >= p->stats_in_end) return EOF;
+
+ *fps = *p->stats_in;
+ ++p->stats_in;
+ return 1;
+}
+
+static void output_stats(FIRSTPASS_STATS *stats,
+ struct aom_codec_pkt_list *pktlist) {
+ struct aom_codec_cx_pkt pkt;
+ pkt.kind = AOM_CODEC_STATS_PKT;
+ pkt.data.twopass_stats.buf = stats;
+ pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+ aom_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile,
+ "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n",
+ stats->frame, stats->weight, stats->intra_error, stats->coded_error,
+ stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion,
+ stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct,
+ stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr,
+ stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv,
+ stats->MVcv, stats->mv_in_out_count, stats->new_mv_count,
+ stats->count, stats->duration);
+ fclose(fpfile);
+ }
+#endif
+}
+
+#if CONFIG_FP_MB_STATS
+static void output_fpmb_stats(uint8_t *this_frame_mb_stats, int stats_size,
+ struct aom_codec_pkt_list *pktlist) {
+ struct aom_codec_cx_pkt pkt;
+ pkt.kind = AOM_CODEC_FPMB_STATS_PKT;
+ pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
+ pkt.data.firstpass_mb_stats.sz = stats_size * sizeof(*this_frame_mb_stats);
+ aom_codec_pkt_list_add(pktlist, &pkt);
+}
+#endif
+
+static void zero_stats(FIRSTPASS_STATS *section) {
+ section->frame = 0.0;
+ section->weight = 0.0;
+ section->intra_error = 0.0;
+ section->frame_avg_wavelet_energy = 0.0;
+ section->coded_error = 0.0;
+ section->sr_coded_error = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->intra_skip_pct = 0.0;
+ section->inactive_zone_rows = 0.0;
+ section->inactive_zone_cols = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->new_mv_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame += frame->frame;
+ section->weight += frame->weight;
+ section->intra_error += frame->intra_error;
+ section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy;
+ section->coded_error += frame->coded_error;
+ section->sr_coded_error += frame->sr_coded_error;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->intra_skip_pct += frame->intra_skip_pct;
+ section->inactive_zone_rows += frame->inactive_zone_rows;
+ section->inactive_zone_cols += frame->inactive_zone_cols;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->new_mv_count += frame->new_mv_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->weight -= frame->weight;
+ section->intra_error -= frame->intra_error;
+ section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->intra_skip_pct -= frame->intra_skip_pct;
+ section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+// Calculate the linear size relative to a baseline of 1080P
+#define BASE_SIZE 2073600.0 // 1920x1080
+static double get_linear_size_factor(const AV1_COMP *cpi) {
+ const double this_area = cpi->initial_width * cpi->initial_height;
+ return pow(this_area / BASE_SIZE, 0.5);
+}
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *this_frame) {
+ double active_pct;
+
+ active_pct =
+ 1.0 -
+ ((this_frame->intra_skip_pct / 2) +
+ ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+ return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err(const AV1_COMP *cpi,
+ const TWO_PASS *twopass,
+ const AV1EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
+ const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+ const double av_weight = stats->weight / stats->count;
+ const double av_err = (stats->coded_error * av_weight) / stats->count;
+ double modified_error =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ oxcf->two_pass_vbrbias / 100.0);
+
+ // Correction for active area. Frames with a reduced active area
+ // (eg due to formatting bars) have a higher error per mb for the
+ // remaining active MBs. The correction here assumes that coding
+ // 0.5N blocks of complexity 2X is a little easier than coding N
+ // blocks of complexity X.
+ modified_error *=
+ pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+ return fclamp(modified_error, twopass->modified_error_min,
+ twopass->modified_error_max);
+}
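
calculate_modified_err() above reshapes each frame's coded error around the clip average with a VBR bias exponent and then clamps it, so bit allocation is pulled toward or pushed away from a flat distribution. The following standalone sketch shows that curve with made-up numbers; it ignores the weight and active-area corrections and is not part of the patch.

#include <math.h>
#include <stdio.h>

int main(void) {
  const double av_err = 5000.0;   /* assumed average coded error per frame */
  const double vbr_bias = 50.0;   /* assumed oxcf->two_pass_vbrbias, percent */
  const double err_min = 1000.0, err_max = 40000.0;  /* assumed clamp range */
  const double frame_err[] = { 1250.0, 5000.0, 20000.0, 80000.0 };

  for (int i = 0; i < 4; i++) {
    double modified = av_err * pow(frame_err[i] / av_err, vbr_bias / 100.0);
    if (modified < err_min) modified = err_min;
    if (modified > err_max) modified = err_max;
    printf("coded error %7.0f -> modified %7.0f\n", frame_err[i], modified);
  }
  return 0;
}

With a bias exponent below 1.0 this pulls outliers toward the mean, flattening the per-frame allocation.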
+
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+ const AV1EncoderConfig *oxcf) {
+ int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+ (int64_t)oxcf->two_pass_vbrmax_section) /
+ 100;
+ if (max_bits < 0)
+ max_bits = 0;
+ else if (max_bits > rc->max_frame_bandwidth)
+ max_bits = rc->max_frame_bandwidth;
+
+ return (int)max_bits;
+}
+
+void av1_init_first_pass(AV1_COMP *cpi) {
+ zero_stats(&cpi->twopass.total_stats);
+}
+
+void av1_end_first_pass(AV1_COMP *cpi) {
+ output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
+}
+
+static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_8X8: return aom_mse8x8;
+ case BLOCK_16X8: return aom_mse16x8;
+ case BLOCK_8X16: return aom_mse8x16;
+ default: return aom_mse16x16;
+ }
+}
+
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref) {
+ unsigned int sse;
+ const aom_variance_fn_t fn = get_block_variance_fn(bsize);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+
+static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+ int bd) {
+ switch (bd) {
+ default:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_8_mse8x8;
+ case BLOCK_16X8: return aom_highbd_8_mse16x8;
+ case BLOCK_8X16: return aom_highbd_8_mse8x16;
+ default: return aom_highbd_8_mse16x16;
+ }
+ break;
+ case 10:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_10_mse8x8;
+ case BLOCK_16X8: return aom_highbd_10_mse16x8;
+ case BLOCK_8X16: return aom_highbd_10_mse8x16;
+ default: return aom_highbd_10_mse16x16;
+ }
+ break;
+ case 12:
+ switch (bsize) {
+ case BLOCK_8X8: return aom_highbd_12_mse8x8;
+ case BLOCK_16X8: return aom_highbd_12_mse16x8;
+ case BLOCK_8X16: return aom_highbd_12_mse8x16;
+ default: return aom_highbd_12_mse16x16;
+ }
+ break;
+ }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref,
+ int bd) {
+ unsigned int sse;
+ const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+
+// Refine the motion search range according to the frame dimensions
+// for the first pass.
+static int get_search_range(const AV1_COMP *cpi) {
+ int sr = 0;
+ const int dim = AOMMIN(cpi->initial_width, cpi->initial_height);
+
+ while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
+ return sr;
+}
+
+static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+ const MV *ref_mv, MV *best_mv,
+ int *best_motion_err) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV tmp_mv = kZeroMv;
+ MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int num00, tmp_err, n;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+ const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+
+ int step_param = 3;
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+ const int sr = get_search_range(cpi);
+ step_param += sr;
+ further_steps -= sr;
+
+ // Override the default variance function to use MSE.
+ v_fn_ptr.vf = get_block_variance_fn(bsize);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+ }
+
+ // Center the initial step/diamond search on best mv.
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+ step_param, x->sadperbit16, &num00,
+ &v_fn_ptr, ref_mv);
+ if (tmp_err < INT_MAX)
+ tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = tmp_mv;
+ }
+
+ // Carry out further step/diamond searches as necessary.
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ --num00;
+ } else {
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+ step_param + n, x->sadperbit16, &num00,
+ &v_fn_ptr, ref_mv);
+ if (tmp_err < INT_MAX)
+ tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty)
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = tmp_mv;
+ }
+ }
+ }
+}
+
+static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) {
+ if (mi_size_wide[BLOCK_16X16] * mb_col + mi_size_wide[BLOCK_8X8] <
+ cm->mi_cols) {
+ return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] <
+ cm->mi_rows
+ ? BLOCK_16X16
+ : BLOCK_16X8;
+ } else {
+ return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] <
+ cm->mi_rows
+ ? BLOCK_8X16
+ : BLOCK_8X8;
+ }
+}
+
+static int find_fp_qindex(aom_bit_depth_t bit_depth) {
+ int i;
+
+ for (i = 0; i < QINDEX_RANGE; ++i)
+ if (av1_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q) break;
+
+ if (i == QINDEX_RANGE) i--;
+
+ return i;
+}
+
+static void set_first_pass_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (!cpi->refresh_alt_ref_frame &&
+ (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) {
+ cm->frame_type = KEY_FRAME;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ // Do not use periodic key frames.
+ cpi->rc.frames_to_key = INT_MAX;
+}
+
+static double raw_motion_error_stdev(int *raw_motion_err_list,
+ int raw_motion_err_counts) {
+ int64_t sum_raw_err = 0;
+ double raw_err_avg = 0;
+ double raw_err_stdev = 0;
+ if (raw_motion_err_counts == 0) return 0;
+
+ int i;
+ for (i = 0; i < raw_motion_err_counts; i++) {
+ sum_raw_err += raw_motion_err_list[i];
+ }
+ raw_err_avg = (double)sum_raw_err / raw_motion_err_counts;
+ for (i = 0; i < raw_motion_err_counts; i++) {
+ raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) *
+ (raw_motion_err_list[i] - raw_err_avg);
+ }
+ // Standard deviation of the 0,0-motion error over all the inter blocks,
+ // using the last source frame as the reference.
+ raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts);
+ return raw_err_stdev;
+}
+
+#define UL_INTRA_THRESH 50
+#define INVALID_ROW -1
+void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+ int mb_row, mb_col;
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo tile;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const PICK_MODE_CONTEXT *ctx =
+ &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
+ int i;
+
+ int recon_yoffset, recon_uvoffset;
+ int64_t intra_error = 0;
+ int64_t frame_avg_wavelet_energy = 0;
+ int64_t coded_error = 0;
+ int64_t sr_coded_error = 0;
+
+ int sum_mvr = 0, sum_mvc = 0;
+ int sum_mvr_abs = 0, sum_mvc_abs = 0;
+ int64_t sum_mvrs = 0, sum_mvcs = 0;
+ int mvcount = 0;
+ int intercount = 0;
+ int second_ref_count = 0;
+ const int intrapenalty = INTRA_MODE_PENALTY;
+ double neutral_count;
+ int intra_skip_count = 0;
+ int image_data_start_row = INVALID_ROW;
+ int new_mv_count = 0;
+ int sum_in_vectors = 0;
+ MV lastmv = kZeroMv;
+ TWO_PASS *twopass = &cpi->twopass;
+ int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+ YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+ const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+ double intra_factor;
+ double brightness_factor;
+ BufferPool *const pool = cm->buffer_pool;
+ const int qindex = find_fp_qindex(seq_params->bit_depth);
+ const int mb_scale = mi_size_wide[BLOCK_16X16];
+
+ int *raw_motion_err_list;
+ int raw_motion_err_counts = 0;
+ CHECK_MEM_ERROR(
+ cm, raw_motion_err_list,
+ aom_calloc(cm->mb_rows * cm->mb_cols, sizeof(*raw_motion_err_list)));
+ // First pass code requires valid last and new frame buffers.
+ assert(new_yv12 != NULL);
+ assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ av1_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs);
+ }
+#endif
+
+ aom_clear_system_state();
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+ x->e_mbd.mi[0]->sb_type = BLOCK_16X16;
+
+ intra_factor = 0.0;
+ brightness_factor = 0.0;
+ neutral_count = 0.0;
+
+ set_first_pass_params(cpi);
+ av1_set_quantizer(cm, qindex);
+
+ av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x,
+ seq_params->subsampling_y, num_planes);
+
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
+ av1_setup_dst_planes(xd->plane, seq_params->sb_size, new_yv12, 0, 0, 0,
+ num_planes);
+
+ if (!frame_is_intra_only(cm)) {
+ av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL, num_planes);
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+ // Don't store luma on the first pass since chroma is not computed.
+ xd->cfl.store_y = 0;
+ av1_frame_init_quantizer(cpi);
+
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+
+ av1_init_mv_probs(cm);
+ av1_init_lv_map(cm);
+ av1_initialize_rd_consts(cpi);
+
+ // Tiling is ignored in the first pass.
+ av1_tile_init(&tile, cm, 0, 0);
+
+ recon_y_stride = new_yv12->y_stride;
+ recon_uv_stride = new_yv12->uv_stride;
+ uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ MV best_ref_mv = kZeroMv;
+
+ // Reset above block coeffs.
+ xd->up_available = (mb_row != 0);
+ recon_yoffset = (mb_row * recon_y_stride * 16);
+ recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.row_max =
+ ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ int this_error;
+ const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+ const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+ double log_intra;
+ int level_sample;
+
+#if CONFIG_FP_MB_STATS
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
+
+ aom_clear_system_state();
+
+ const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale;
+ xd->mi = cm->mi_grid_visible + idx_str;
+ xd->mi[0] = cm->mi + idx_str;
+ xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+ xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+ xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+ xd->left_available = (mb_col != 0);
+ xd->mi[0]->sb_type = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize],
+ mb_col * mb_scale, mi_size_wide[bsize], cm->mi_rows,
+ cm->mi_cols);
+
+ set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
+
+ // Do intra 16x16 prediction.
+ xd->mi[0]->segment_id = 0;
+ xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
+ xd->mi[0]->mode = DC_PRED;
+ xd->mi[0]->tx_size =
+ use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+ av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
+ this_error = aom_get_mb_ss(x->plane[0].src_diff);
+
+ // Keep a record of blocks that have almost no intra error residual
+ // (i.e. are in effect completely flat and untextured in the intra
+ // domain). In natural videos this is uncommon, but it is much more
+ // common in animations, graphics and screen content, so may be used
+ // as a signal to detect these types of content.
+ if (this_error < UL_INTRA_THRESH) {
+ ++intra_skip_count;
+ } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
+ image_data_start_row = mb_row;
+ }
+
+ if (seq_params->use_highbitdepth) {
+ switch (seq_params->bit_depth) {
+ case AOM_BITS_8: break;
+ case AOM_BITS_10: this_error >>= 4; break;
+ case AOM_BITS_12: this_error >>= 8; break;
+ default:
+ assert(0 &&
+ "seq_params->bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ return;
+ }
+ }
+
+ aom_clear_system_state();
+ log_intra = log(this_error + 1.0);
+ if (log_intra < 10.0)
+ intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+ else
+ intra_factor += 1.0;
+
+ if (seq_params->use_highbitdepth)
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ else
+ level_sample = x->plane[0].src.buf[0];
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+ brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ else
+ brightness_factor += 1.0;
+
+ // Intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+ // We do not have special cases in first pass for 0,0 and nearest etc so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_error += intrapenalty;
+
+ // Accumulate the intra error.
+ intra_error += (int64_t)this_error;
+
+ int stride = x->plane[0].src.stride;
+ uint8_t *buf = x->plane[0].src.buf;
+ for (int r8 = 0; r8 < 2; ++r8)
+ for (int c8 = 0; c8 < 2; ++c8) {
+ int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
+ buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
+ }
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // initialization
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ }
+#endif
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.col_max =
+ ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+ if (!frame_is_intra_only(cm)) { // Do a motion search
+ int tmp_err, motion_error, raw_motion_error;
+ // Assume 0,0 motion with no mv overhead.
+ MV mv = kZeroMv, tmp_mv = kZeroMv;
+ struct buf_2d unscaled_last_source_buf_2d;
+
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+ }
+
+ // Compute the motion error of the 0,0 motion using the last source
+ // frame as the reference. Skip the further motion search on
+ // reconstructed frame if this error is small.
+ unscaled_last_source_buf_2d.buf =
+ cpi->unscaled_last_source->y_buffer + recon_yoffset;
+ unscaled_last_source_buf_2d.stride =
+ cpi->unscaled_last_source->y_stride;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ raw_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+ } else {
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+ }
+
+ // TODO(pengchong): Replace the hard-coded threshold
+ if (raw_motion_error > 25) {
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
+
+ // If the current best reference mv is not centered on 0,0 then do a
+ // 0,0 based search as well.
+ if (!is_zero_mv(&best_ref_mv)) {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
+
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv = tmp_mv;
+ }
+ }
+
+ // Search in an older reference frame.
+ if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
+ // Assume 0,0 motion with no mv overhead.
+ int gf_motion_error;
+
+ xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ gf_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+ }
+
+ first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv,
+ &gf_motion_error);
+
+ if (gf_motion_error < motion_error && gf_motion_error < this_error)
+ ++second_ref_count;
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+ xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+ // In accumulating a score for the older reference frame, take the
+ // best of the motion-predicted score and the intra-coded error
+ // (just as is done when accumulating "coded_error" for the last
+ // frame).
+ if (gf_motion_error < this_error)
+ sr_coded_error += gf_motion_error;
+ else
+ sr_coded_error += this_error;
+ } else {
+ sr_coded_error += motion_error;
+ }
+ } else {
+ sr_coded_error += motion_error;
+ }
+
+ // Start by assuming that intra mode is best.
+ best_ref_mv.row = 0;
+ best_ref_mv.col = 0;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // intra prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+ }
+ }
+#endif
+
+ if (motion_error <= this_error) {
+ aom_clear_system_state();
+
+ // Keep a count of cases where the inter and intra were very close
+ // and very low. This helps with scene cut detection for example in
+ // cropped clips with black bars at the sides or top and bottom.
+ if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+ (this_error < (2 * intrapenalty))) {
+ neutral_count += 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+ (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ neutral_count +=
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+ }
+
+ mv.row *= 8;
+ mv.col *= 8;
+ this_error = motion_error;
+ xd->mi[0]->mode = NEWMV;
+ xd->mi[0]->mv[0].as_mv = mv;
+ xd->mi[0]->tx_size = TX_4X4;
+ xd->mi[0]->ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+ av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale,
+ mb_col * mb_scale, NULL, bsize);
+ av1_encode_sby_pass1(cm, x, bsize);
+ sum_mvr += mv.row;
+ sum_mvr_abs += abs(mv.row);
+ sum_mvc += mv.col;
+ sum_mvc_abs += abs(mv.col);
+ sum_mvrs += mv.row * mv.row;
+ sum_mvcs += mv.col * mv.col;
+ ++intercount;
+
+ best_ref_mv = mv;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ // inter prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+ if (this_error > FPMB_ERROR_LARGE_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LARGE_MASK;
+ } else if (this_error < FPMB_ERROR_SMALL_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_SMALL_MASK;
+ }
+ }
+#endif
+
+ if (!is_zero_mv(&mv)) {
+ ++mvcount;
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] &=
+ ~FPMB_MOTION_ZERO_MASK;
+ // check estimated motion direction
+ if (mv.col > 0 && mv.col >= abs(mv.row)) {
+ // right direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_RIGHT_MASK;
+ } else if (mv.row < 0 && abs(mv.row) >= abs(mv.col)) {
+ // up direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_UP_MASK;
+ } else if (mv.col < 0 && abs(mv.col) >= abs(mv.row)) {
+ // left direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_LEFT_MASK;
+ } else {
+ // down direction
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_MOTION_DOWN_MASK;
+ }
+ }
+#endif
+
+ // Non-zero vector, was it different from the last non zero vector?
+ if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count;
+ lastmv = mv;
+
+ // Does the row vector point inwards or outwards?
+ if (mb_row < cm->mb_rows / 2) {
+ if (mv.row > 0)
+ --sum_in_vectors;
+ else if (mv.row < 0)
+ ++sum_in_vectors;
+ } else if (mb_row > cm->mb_rows / 2) {
+ if (mv.row > 0)
+ ++sum_in_vectors;
+ else if (mv.row < 0)
+ --sum_in_vectors;
+ }
+
+ // Does the col vector point inwards or outwards?
+ if (mb_col < cm->mb_cols / 2) {
+ if (mv.col > 0)
+ --sum_in_vectors;
+ else if (mv.col < 0)
+ ++sum_in_vectors;
+ } else if (mb_col > cm->mb_cols / 2) {
+ if (mv.col > 0)
+ ++sum_in_vectors;
+ else if (mv.col < 0)
+ --sum_in_vectors;
+ }
+ }
+ }
+ raw_motion_err_list[raw_motion_err_counts++] = raw_motion_error;
+ } else {
+ sr_coded_error += (int64_t)this_error;
+ }
+ coded_error += (int64_t)this_error;
+
+ // Adjust to the next column of MBs.
+ x->plane[0].src.buf += 16;
+ x->plane[1].src.buf += uv_mb_height;
+ x->plane[2].src.buf += uv_mb_height;
+
+ recon_yoffset += 16;
+ recon_uvoffset += uv_mb_height;
+ }
+ // Adjust to the next row of MBs.
+ x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
+ x->plane[1].src.buf +=
+ uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+    x->plane[2].src.buf +=
+        uv_mb_height * x->plane[2].src.stride - uv_mb_height * cm->mb_cols;
+
+ aom_clear_system_state();
+ }
+ const double raw_err_stdev =
+ raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts);
+ aom_free(raw_motion_err_list);
+
+  // Clamp the image data start row to mb_rows / 2. Because this many rows are
+  // discarded as dead data at both the top and the bottom, a value of
+  // mb_rows / 2 means the whole frame is blank.
+ if ((image_data_start_row > cm->mb_rows / 2) ||
+ (image_data_start_row == INVALID_ROW)) {
+ image_data_start_row = cm->mb_rows / 2;
+ }
+ // Exclude any image dead zone
+ if (image_data_start_row > 0) {
+ intra_skip_count =
+ AOMMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+ }
+
+ {
+ FIRSTPASS_STATS fps;
+    // The minimum error here ensures some bit allocation to frames even
+ // in static regions. The allocation per MB declines for larger formats
+ // where the typical "real" energy per MB also falls.
+ // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+ // number of mbs is proportional to the image area.
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const double min_err = 200 * sqrt(num_mbs);
+
+ intra_factor = intra_factor / (double)num_mbs;
+ brightness_factor = brightness_factor / (double)num_mbs;
+ fps.weight = intra_factor * brightness_factor;
+
+ fps.frame = cm->current_video_frame;
+ fps.coded_error = (double)(coded_error >> 8) + min_err;
+ fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
+ fps.intra_error = (double)(intra_error >> 8) + min_err;
+ fps.frame_avg_wavelet_energy = (double)frame_avg_wavelet_energy;
+ fps.count = 1.0;
+ fps.pcnt_inter = (double)intercount / num_mbs;
+ fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+ fps.pcnt_neutral = (double)neutral_count / num_mbs;
+ fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+ fps.inactive_zone_rows = (double)image_data_start_row;
+ fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix
+ fps.raw_error_stdev = raw_err_stdev;
+
+ if (mvcount > 0) {
+ fps.MVr = (double)sum_mvr / mvcount;
+ fps.mvr_abs = (double)sum_mvr_abs / mvcount;
+ fps.MVc = (double)sum_mvc / mvcount;
+ fps.mvc_abs = (double)sum_mvc_abs / mvcount;
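+      // MVrv / MVcv below are the (population) variances of the row and
+      // column motion vector components over the non-zero vectors.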
+ fps.MVrv =
+ ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
+ fps.MVcv =
+ ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
+ fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
+ fps.new_mv_count = new_mv_count;
+ fps.pcnt_motion = (double)mvcount / num_mbs;
+ } else {
+ fps.MVr = 0.0;
+ fps.mvr_abs = 0.0;
+ fps.MVc = 0.0;
+ fps.mvc_abs = 0.0;
+ fps.MVrv = 0.0;
+ fps.MVcv = 0.0;
+ fps.mv_in_out_count = 0.0;
+ fps.new_mv_count = 0.0;
+ fps.pcnt_motion = 0.0;
+ }
+
+ // TODO(paulwilkins): Handle the case when duration is set to 0, or
+ // something less than the full time between subsequent values of
+ // cpi->source_time_stamp.
+ fps.duration = (double)(source->ts_end - source->ts_start);
+
+ // Don't want to do output stats with a stack variable!
+ twopass->this_frame_stats = fps;
+ output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+ accumulate_stats(&twopass->total_stats, &fps);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ output_fpmb_stats(twopass->frame_mb_stats_buf, cpi->initial_mbs,
+ cpi->output_pkt_list);
+ }
+#endif
+ }
+
+  // Copy the previous Last Frame back into gf and arf buffers if
+ // the prediction is good enough... but also don't allow it to lag too far.
+ if ((twopass->sr_update_lag > 3) ||
+ ((cm->current_video_frame > 0) &&
+ (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+ ((twopass->this_frame_stats.intra_error /
+ DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+ if (gld_yv12 != NULL) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
+ cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]);
+ }
+ twopass->sr_update_lag = 1;
+ } else {
+ ++twopass->sr_update_lag;
+ }
+
+ aom_extend_frame_borders(new_yv12, num_planes);
+
+ // The frame we just compressed now becomes the last frame.
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]],
+ cm->new_fb_idx);
+
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
+ if (cm->current_video_frame == 0 &&
+ cpi->ref_fb_idx[GOLDEN_FRAME - 1] != INVALID_IDX) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
+ cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]);
+ }
+
+ // Use this to see what the first pass reconstruction looks like.
+ if (0) {
+ char filename[512];
+ FILE *recon_file;
+ snprintf(filename, sizeof(filename), "enc%04d.yuv",
+ (int)cm->current_video_frame);
+
+ if (cm->current_video_frame == 0)
+ recon_file = fopen(filename, "wb");
+ else
+ recon_file = fopen(filename, "ab");
+
+ (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+
+ ++cm->current_video_frame;
+}
+
+static double calc_correction_factor(double err_per_mb, double err_divisor,
+ double pt_low, double pt_high, int q,
+ aom_bit_depth_t bit_depth) {
+ const double error_term = err_per_mb / err_divisor;
+
+ // Adjustment based on actual quantizer to power term.
+ const double power_term =
+ AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+
+ // Calculate correction factor.
+ if (power_term < 1.0) assert(error_term >= 0.0);
+
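+  // e.g. error_term = 4.0 with power_term = 0.8 gives
+  // fclamp(pow(4.0, 0.8), 0.05, 5.0) ~= 3.03.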
+ return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
+
+#define ERR_DIVISOR 100.0
+static int get_twopass_worst_quality(const AV1_COMP *cpi,
+ const double section_err,
+ double inactive_zone,
+ int section_target_bandwidth,
+ double group_weight_factor) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+ if (section_target_bandwidth <= 0) {
+ return rc->worst_quality; // Highest value allowed
+ } else {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+ const double av_err_per_mb = section_err / active_mbs;
+ const double speed_term = 1.0;
+ double ediv_size_correction;
+ const int target_norm_bits_per_mb =
+ (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
+ active_mbs;
+ int q;
+
+ // Larger image formats are expected to be a little harder to code
+ // relatively given the same prediction error score. This in part at
+ // least relates to the increased size and hence coding overheads of
+ // motion vectors. Some account of this is made through adjustment of
+ // the error divisor.
+ ediv_size_correction =
+ AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
+ if (ediv_size_correction < 1.0)
+ ediv_size_correction = -(1.0 / ediv_size_correction);
+ ediv_size_correction *= 4.0;
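+    // e.g. a linear size factor of 2.0 yields a correction of 8.0 and an
+    // effective divisor of ERR_DIVISOR - 8.0 = 92.0, while a factor of 0.5
+    // maps to -(1.0 / 0.5) * 4.0 = -8.0, i.e. a divisor of 108.0.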
+
+ // Try and pick a max Q that will be high enough to encode the
+ // content at the given rate.
+ for (q = rc->best_quality; q < rc->worst_quality; ++q) {
+ const double factor = calc_correction_factor(
+ av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW,
+ FACTOR_PT_HIGH, q, cpi->common.seq_params.bit_depth);
+ const int bits_per_mb = av1_rc_bits_per_mb(
+ INTER_FRAME, q, factor * speed_term * group_weight_factor,
+ cpi->common.seq_params.bit_depth);
+ if (bits_per_mb <= target_norm_bits_per_mb) break;
+ }
+
+ // Restriction on active max q for constrained quality mode.
+ if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
+ return q;
+ }
+}
+
+static void setup_rf_level_maxq(AV1_COMP *cpi) {
+ int i;
+ RATE_CONTROL *const rc = &cpi->rc;
+ for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
+ int qdelta = av1_frame_type_qdelta(cpi, i, rc->worst_quality);
+ rc->rf_level_maxq[i] = AOMMAX(rc->worst_quality + qdelta, rc->best_quality);
+ }
+}
+
+void av1_init_second_pass(AV1_COMP *cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ double frame_rate;
+ FIRSTPASS_STATS *stats;
+
+ zero_stats(&twopass->total_stats);
+ zero_stats(&twopass->total_left_stats);
+
+ if (!twopass->stats_in_end) return;
+
+ stats = &twopass->total_stats;
+
+ *stats = *twopass->stats_in_end;
+ twopass->total_left_stats = *stats;
+
+ frame_rate = 10000000.0 * stats->count / stats->duration;
+ // Each frame can have a different duration, as the frame rate in the source
+ // isn't guaranteed to be constant. The frame rate prior to the first frame
+ // encoded in the second pass is a guess. However, the sum duration is not.
+ // It is calculated based on the actual durations of all frames from the
+ // first pass.
+ av1_new_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
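+  // e.g. 300 first pass frames whose summed duration is 1e8 (ten seconds in
+  // the 1 / 10,000,000 second timebase implied by the constant above) give a
+  // frame rate of 30 and ten seconds' worth of target_bandwidth as bits_left.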
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
+ // Scan the first pass file and calculate a modified total error based upon
+ // the bias/power function used to allocate bits.
+ {
+ const double avg_error =
+ stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
+ const FIRSTPASS_STATS *s = twopass->stats_in;
+ double modified_error_total = 0.0;
+ twopass->modified_error_min =
+ (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+ twopass->modified_error_max =
+ (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+ while (s < twopass->stats_in_end) {
+ modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+ ++s;
+ }
+ twopass->modified_error_left = modified_error_total;
+ }
+
+ // Reset the vbr bits off target counters
+ cpi->rc.vbr_bits_off_target = 0;
+ cpi->rc.vbr_bits_off_target_fast = 0;
+
+ cpi->rc.rate_error_estimate = 0;
+
+ // Static sequence monitor variables.
+ twopass->kf_zeromotion_pct = 100;
+ twopass->last_kfgroup_zeromotion_pct = 100;
+
+ if (oxcf->resize_mode != RESIZE_NONE) {
+ setup_rf_level_maxq(cpi);
+ }
+}
+
+#define SR_DIFF_PART 0.0015
+#define MOTION_AMP_PART 0.003
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define SR_DIFF_MAX 128.0
+
+static double get_sr_decay_rate(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *frame) {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+ double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+ double sr_decay = 1.0;
+ double modified_pct_inter;
+ double modified_pcnt_intra;
+ const double motion_amplitude_factor =
+ frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+
+ modified_pct_inter = frame->pcnt_inter;
+ if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+ (double)NCOUNT_FRAME_II_THRESH) {
+ modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+ }
+ modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+ if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
+ sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
+ sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
+ (MOTION_AMP_PART * motion_amplitude_factor) -
+ (INTRA_PART * modified_pcnt_intra);
+ }
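+  // e.g. sr_diff = 20.0, a motion amplitude factor of 4.0 and a modified
+  // intra pct of 10.0 give sr_decay = 1.0 - 0.03 - 0.012 - 0.05 = 0.908
+  // before the clamp below.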
+ return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *frame) {
+ const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
+ double sr_decay = get_sr_decay_rate(cpi, frame);
+ return AOMMIN(sr_decay, zero_motion_pct);
+}
+
+#define ZM_POWER_FACTOR 0.75
+
+static double get_prediction_decay_rate(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *next_frame) {
+ const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
+ const double zero_motion_factor =
+ (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
+ ZM_POWER_FACTOR));
+
+ return AOMMAX(zero_motion_factor,
+ (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
+
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
+ int still_interval,
+ double loop_decay_rate,
+ double last_decay_rate) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Break clause to detect very still sections after motion
+ // For example a static image after a fade or other transition
+ // instead of a clean scene cut.
+ if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
+ last_decay_rate < 0.9) {
+ int j;
+
+ // Look ahead a few frames to see if static condition persists...
+ for (j = 0; j < still_interval; ++j) {
+ const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
+ if (stats >= twopass->stats_in_end) break;
+
+ if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+ }
+
+ // Only if it does do we signal a transition to still.
+ return j == still_interval;
+ }
+
+ return 0;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+ const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+
+ // What we are looking for here is a situation where there is a
+ // brief break in prediction (such as a flash) but subsequent frames
+ // are reasonably well predicted by an earlier (pre flash) frame.
+ // The recovery after a flash is indicated by a high pcnt_second_ref
+ // compared to pcnt_inter.
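+  // e.g. a recovery frame with pcnt_second_ref = 0.7 and pcnt_inter = 0.3
+  // satisfies both pcnt clauses below and is flagged as following a flash.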
+ return next_frame != NULL &&
+ next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
+ next_frame->pcnt_second_ref >= 0.5;
+}
+
+// Update the motion related elements to the GF arf boost calculation.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+ double *mv_in_out,
+ double *mv_in_out_accumulator,
+ double *abs_mv_in_out_accumulator,
+ double *mv_ratio_accumulator) {
+ const double pct = stats->pcnt_motion;
+
+ // Accumulate Motion In/Out of frame stats.
+ *mv_in_out = stats->mv_in_out_count * pct;
+ *mv_in_out_accumulator += *mv_in_out;
+ *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+ // Accumulate a measure of how uniform (or conversely how random) the motion
+ // field is (a ratio of abs(mv) / mv).
+ if (pct > 0.05) {
+ const double mvr_ratio =
+ fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+ const double mvc_ratio =
+ fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+ *mv_ratio_accumulator +=
+ pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
+ *mv_ratio_accumulator +=
+ pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
+ }
+}
+
+#define BASELINE_ERR_PER_MB 1000.0
+static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+ double this_frame_mv_in_out, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(
+ cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
+ const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
+ int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+
+ // Correct for any inactive region in the image
+ num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase boost for frames where new data is coming into the frame
+  // (e.g. zoom out).
+ // Slightly reduce boost if there is a net balance of motion out of the frame
+ // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0.
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+ // In the extreme case the boost is halved.
+ else
+ frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
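+  // (At the positive extreme of +1.0 the boost is tripled by the first
+  // branch above.)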
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
+ int *f_boost, int *b_boost) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ int i;
+ double boost_score = 0.0;
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ int arf_boost;
+ int flash_detected = 0;
+
+ // Search forward from the proposed arf/next gf position.
+ for (i = 0; i < f_frames; ++i) {
+ const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, i + offset) ||
+ detect_flash(twopass, i + offset + 1);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+ decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : decay_accumulator;
+ }
+
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ }
+
+ *f_boost = (int)boost_score;
+
+ // Reset for backward looking loop.
+ boost_score = 0.0;
+ mv_ratio_accumulator = 0.0;
+ decay_accumulator = 1.0;
+ this_frame_mv_in_out = 0.0;
+ mv_in_out_accumulator = 0.0;
+ abs_mv_in_out_accumulator = 0.0;
+
+ // Search backward towards last gf position.
+ for (i = -1; i >= -b_frames; --i) {
+ const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+ if (this_frame == NULL) break;
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, i + offset) ||
+ detect_flash(twopass, i + offset + 1);
+
+ // Cumulative effect of prediction quality decay.
+ if (!flash_detected) {
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+ decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : decay_accumulator;
+ }
+
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ }
+ *b_boost = (int)boost_score;
+
+ arf_boost = (*f_boost + *b_boost);
+ if (arf_boost < ((b_frames + f_frames) * 20))
+ arf_boost = ((b_frames + f_frames) * 20);
+ arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
+
+ return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+ const FIRSTPASS_STATS *end,
+ int section_length) {
+ const FIRSTPASS_STATS *s = begin;
+ double intra_error = 0.0;
+ double coded_error = 0.0;
+ int i = 0;
+
+ while (s < end && i < section_length) {
+ intra_error += s->intra_error;
+ coded_error += s->coded_error;
+ ++s;
+ ++i;
+ }
+
+ return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
+}
+
+// Calculate the total bits to allocate in this GF/ARF group.
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+ double gf_group_err) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+ int64_t total_group_bits;
+
+ // Calculate the bits to be allocated to the group as a whole.
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+ total_group_bits = (int64_t)(twopass->kf_group_bits *
+ (gf_group_err / twopass->kf_group_error_left));
+ } else {
+ total_group_bits = 0;
+ }
+
+ // Clamp odd edge cases.
+ total_group_bits = (total_group_bits < 0)
+ ? 0
+ : (total_group_bits > twopass->kf_group_bits)
+ ? twopass->kf_group_bits
+ : total_group_bits;
+
+ // Clip based on user supplied data rate variability limit.
+ if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+ total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+
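+  // e.g. kf_group_bits = 8,000,000 with kf_group_error_left = 400 and a
+  // group error of 100 yields 2,000,000 bits before the clamps above.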
+ return total_group_bits;
+}
+
+// Calculate the number bits extra to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count, int boost,
+ int64_t total_group_bits) {
+ int allocation_chunks;
+
+ // return 0 for invalid inputs (could arise e.g. through rounding errors)
+ if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
+
+ allocation_chunks = (frame_count * 100) + boost;
+
+ // Prevent overflow.
+ if (boost > 1023) {
+ int divisor = boost >> 10;
+ boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ // Calculate the number of extra bits for use in the boosted frame or frames.
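+  // e.g. a boost of 200 over 16 frames with 1,000,000 group bits gives
+  // (200 * 1,000,000) / (16 * 100 + 200) ~= 111,111 extra bits.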
+ return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
+ 0);
+}
+
+#if USE_SYMM_MULTI_LAYER
+// #define CHCEK_GF_PARAMETER
+#ifdef CHCEK_GF_PARAMETER
+void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
+ int frame_nums) {
+ static const char *update_type_strings[] = {
+ "KF_UPDATE", "LF_UPDATE", "GF_UPDATE",
+ "ARF_UPDATE", "OVERLAY_UPDATE", "BRF_UPDATE",
+ "LAST_BIPRED_UPDATE", "BIPRED_UPDATE", "INTNL_OVERLAY_UPDATE",
+ "INTNL_ARF_UPDATE"
+ };
+ FILE *fid = fopen("GF_PARAMS.txt", "a");
+
+ fprintf(fid, "\n{%d}\n", gf_interval);
+ for (int i = 0; i <= frame_nums; ++i) {
+ fprintf(fid, "%s %d %d %d %d\n",
+ update_type_strings[gf_group->update_type[i]],
+ gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
+ gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
+ }
+
+ fprintf(fid, "number of nodes in each level: \n");
+ for (int i = 0; i < MAX_PYRAMID_LVL; ++i) {
+ fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
+ }
+ fprintf(fid, "\n");
+ fclose(fid);
+}
+#endif // CHCEK_GF_PARAMETER
+static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
+ // Derive rf_level from update_type
+ switch (update_type) {
+ case LF_UPDATE: return INTER_NORMAL;
+ case ARF_UPDATE: return GF_ARF_STD;
+ case OVERLAY_UPDATE: return INTER_NORMAL;
+ case BRF_UPDATE: return GF_ARF_LOW;
+ case LAST_BIPRED_UPDATE: return INTER_NORMAL;
+ case BIPRED_UPDATE: return INTER_NORMAL;
+ case INTNL_ARF_UPDATE: return GF_ARF_LOW;
+ case INTNL_OVERLAY_UPDATE: return INTER_NORMAL;
+ default: return INTER_NORMAL;
+ }
+}
+
+static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
+ int *frame_ind, int arf_ind, int level) {
+ if (r - l < 4) {
+ while (++l < r) {
+ // leaf nodes, not a look-ahead frame
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->arf_pos_in_gf[*frame_ind] = 0;
+ gf_group->arf_update_idx[*frame_ind] = arf_ind;
+ gf_group->pyramid_level[*frame_ind] = 0;
+ ++gf_group->pyramid_lvl_nodes[0];
+ ++(*frame_ind);
+ }
+ } else {
+ int m = (l + r) / 2;
+ int arf_pos_in_gf = *frame_ind;
+
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = m - l - 1;
+ gf_group->arf_pos_in_gf[*frame_ind] = 0;
+ gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1
+ gf_group->pyramid_level[*frame_ind] = level;
+ ++gf_group->pyramid_lvl_nodes[level];
+ ++(*frame_ind);
+
+ // set parameters for frames displayed before this frame
+ set_multi_layer_params(gf_group, l, m, frame_ind, 1, level - 1);
+
+    // For overlay frames, we need to record the position of the corresponding
+    // ARF frame for bit allocation.
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf;
+ gf_group->arf_update_idx[*frame_ind] = 1;
+ gf_group->pyramid_level[*frame_ind] = 0;
+ ++(*frame_ind);
+
+ // set parameters for frames displayed after this frame
+ set_multi_layer_params(gf_group, m, r, frame_ind, arf_ind, level - 1);
+ }
+}
+
+static INLINE unsigned char get_pyramid_height(int pyramid_width) {
+ assert(pyramid_width <= 16 && pyramid_width >= 4 &&
+ "invalid gf interval for pyramid structure");
+
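+  // Widths of 4-6 map to height 2, 7-12 to height 3 and 13-16 to height 4.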
+ return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2);
+}
+
+static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
+ const int gf_interval) {
+ int frame_index = 0;
+ gf_group->pyramid_height = get_pyramid_height(gf_interval);
+
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL);
+
+ av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
+
+  // The first frame of each GF group is a key frame or an overlay frame.
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->pyramid_level[frame_index] = 0;
+ ++frame_index;
+
+ // ALT0
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = gf_interval - 1;
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->pyramid_level[frame_index] = gf_group->pyramid_height;
+ ++frame_index;
+
+ // set parameters for the rest of the frames
+ set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
+ gf_group->pyramid_height - 1);
+ return frame_index;
+}
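+// e.g. a gf_interval of 8 (pyramid height 3) gives the coding order
+// OVERLAY/KF, ARF(+7), INTNL_ARF(+3), INTNL_ARF(+1), LF, INTNL_OVERLAY, LF,
+// INTNL_OVERLAY, INTNL_ARF(+1), LF, INTNL_OVERLAY, LF, where (+n) is the
+// arf_src_offset of each look-ahead frame.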
+
+void define_customized_gf_group_structure(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+ assert(rc->baseline_gf_interval >= 4 &&
+ rc->baseline_gf_interval <= MAX_PYRAMID_SIZE);
+
+ const int gf_update_frames =
+ construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
+ int frame_index;
+
+ cpi->num_extra_arfs = 0;
+
+ for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
+ // Set unused variables to default values
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+
+    // Special handling for the first frame when assigning the update_type
+ if (frame_index == 0) {
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ if (key_frame) {
+ gf_group->update_type[frame_index] = KF_UPDATE;
+ continue;
+ }
+
+ if (rc->source_alt_ref_active) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ }
+ } else {
+ if (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+ ++cpi->num_extra_arfs;
+ }
+
+ // Assign rf level based on update type
+ gf_group->rf_level[frame_index] =
+ update_type_2_rf_level(gf_group->update_type[frame_index]);
+ }
+
+ // NOTE: We need to configure the frame at the end of the sequence + 1 that
+ // will be the start frame for the next group. Otherwise prior to the
+ // call to av1_rc_get_second_pass_params() the data will be undefined.
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ gf_group->arf_update_idx[frame_index] = 0;
+ // This value is only used for INTNL_OVERLAY_UPDATE
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+
+ // This parameter is useless?
+ gf_group->arf_ref_idx[frame_index] = 0;
+#ifdef CHCEK_GF_PARAMETER
+ check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames);
+#endif
+}
+
+// This is an example of how to define a GF structure manually. The function
+// will result in exactly the same GF group structure as
+// define_customized_gf_group_structure() when rc->baseline_gf_interval == 4
+#if USE_MANUAL_GF4_STRUCT
+#define GF_INTERVAL_4 4
+static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = {
+ {
+ // gf_group->index == 0 (Frame 0)
+ // It can also be KEY frame. Will assign the proper value
+ // in define_gf_group_structure
+ OVERLAY_UPDATE, // update_type (default value)
+ 0, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+ {
+ // gf_group->index == 1 (Frame 4)
+ ARF_UPDATE, // update_type
+ GF_INTERVAL_4 - 1, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+ {
+ // gf_group->index == 2 (Frame 2)
+ INTNL_ARF_UPDATE, // update_type
+ (GF_INTERVAL_4 >> 1) - 1, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+ {
+ // gf_group->index == 3 (Frame 1)
+ LAST_BIPRED_UPDATE, // update_type
+ 0, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+
+ {
+ // gf_group->index == 4 (Frame 2 - OVERLAY)
+ INTNL_OVERLAY_UPDATE, // update_type
+ 0, // arf_src_offset
+ 2, // arf_pos_in_gf
+ 0 // arf_update_idx
+ },
+ {
+ // gf_group->index == 5 (Frame 3)
+ LF_UPDATE, // update_type
+ 0, // arf_src_offset
+ 0, // arf_pos_in_gf
+ 1 // arf_update_idx
+ }
+};
+
+static int define_gf_group_structure_4(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+ assert(rc->baseline_gf_interval == GF_INTERVAL_4);
+
+ const int gf_update_frames = rc->baseline_gf_interval + 2;
+ int frame_index;
+
+ for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
+ int param_idx = 0;
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+
+ if (frame_index == 0) {
+ // gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ if (key_frame) continue;
+
+ gf_group->update_type[frame_index] =
+ gf4_multi_layer_params[frame_index][param_idx++];
+
+ if (rc->source_alt_ref_active) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ }
+ param_idx++;
+ } else {
+ gf_group->update_type[frame_index] =
+ gf4_multi_layer_params[frame_index][param_idx++];
+ }
+
+ // setup other parameters
+ gf_group->rf_level[frame_index] =
+ update_type_2_rf_level(gf_group->update_type[frame_index]);
+
+ // == arf_src_offset ==
+ gf_group->arf_src_offset[frame_index] =
+ gf4_multi_layer_params[frame_index][param_idx++];
+
+ // == arf_pos_in_gf ==
+ gf_group->arf_pos_in_gf[frame_index] =
+ gf4_multi_layer_params[frame_index][param_idx++];
+
+ // == arf_update_idx ==
+ gf_group->brf_src_offset[frame_index] =
+ gf4_multi_layer_params[frame_index][param_idx];
+ }
+
+ // NOTE: We need to configure the frame at the end of the sequence + 1 that
+ // will be the start frame for the next group. Otherwise prior to the
+ // call to av1_rc_get_second_pass_params() the data will be undefined.
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+
+ // This value is only used for INTNL_OVERLAY_UPDATE
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+
+ return gf_update_frames;
+}
+#endif // USE_MANUAL_GF4_STRUCT
+#endif // USE_SYMM_MULTI_LAYER
+
+static void define_gf_group_structure(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+#if USE_SYMM_MULTI_LAYER
+ const int valid_customized_gf_length =
+ rc->baseline_gf_interval >= 4 &&
+ rc->baseline_gf_interval <= MAX_PYRAMID_SIZE;
+  // Use the new structure only if extra ARFs are allowed.
+ if (valid_customized_gf_length && rc->source_alt_ref_pending &&
+ cpi->extra_arf_allowed > 0) {
+#if USE_MANUAL_GF4_STRUCT
+ if (rc->baseline_gf_interval == 4)
+ define_gf_group_structure_4(cpi);
+ else
+#endif
+ define_customized_gf_group_structure(cpi);
+ cpi->new_bwdref_update_rule = 1;
+ return;
+ } else {
+ cpi->new_bwdref_update_rule = 0;
+ }
+#endif
+
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ int i;
+ int frame_index = 0;
+ const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  // The use of bi-predictive frames is only enabled when the following 3
+  // conditions are met:
+ // (1) ALTREF is enabled;
+ // (2) The bi-predictive group interval is at least 2; and
+ // (3) The bi-predictive group interval is strictly smaller than the
+ // golden group interval.
+ const int is_bipred_enabled =
+ cpi->extra_arf_allowed && rc->source_alt_ref_pending &&
+ rc->bipred_group_interval &&
+ rc->bipred_group_interval <=
+ (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+ int bipred_group_end = 0;
+ int bipred_frame_index = 0;
+
+ const unsigned char ext_arf_interval =
+ (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1);
+ int which_arf = cpi->num_extra_arfs;
+ int subgroup_interval[MAX_EXT_ARFS + 1];
+ int is_sg_bipred_enabled = is_bipred_enabled;
+ int accumulative_subgroup_interval = 0;
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ if (!key_frame) {
+ if (rc->source_alt_ref_active) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ }
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+
+ frame_index++;
+
+ bipred_frame_index++;
+
+ // === [frame_index == 1] ===
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ gf_group->arf_src_offset[frame_index] =
+ (unsigned char)(rc->baseline_gf_interval - 1);
+
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
+
+ // Work out the ARFs' positions in this gf group
+    // NOTE(weitinglin): ALT_REFs are indexed inversely, but coded in display
+    // order (except for the original ARF). In the example of three ALT_REFs,
+    // we index them as: KEY ----- ALT2 ----- ALT1 ----- ALT0
+ // but code them in the following order:
+ // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0
+ //
+ // arf_pos_for_ovrly[]: Position for OVERLAY
+ // arf_pos_in_gf[]: Position for ALTREF
+ cpi->arf_pos_for_ovrly[0] = frame_index + cpi->num_extra_arfs +
+ gf_group->arf_src_offset[frame_index] + 1;
+ for (i = 0; i < cpi->num_extra_arfs; ++i) {
+ cpi->arf_pos_for_ovrly[i + 1] =
+ frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2);
+ subgroup_interval[i] = cpi->arf_pos_for_ovrly[i] -
+ cpi->arf_pos_for_ovrly[i + 1] - (i == 0 ? 1 : 2);
+ }
+ subgroup_interval[cpi->num_extra_arfs] =
+ cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index -
+ (cpi->num_extra_arfs == 0 ? 1 : 2);
+
+ ++frame_index;
+
+ // Insert an extra ARF
+ // === [frame_index == 2] ===
+ if (cpi->num_extra_arfs) {
+ gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
+ }
+
+ for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = which_arf;
+
+ // If we are going to have ARFs, check whether we can have BWDREF in this
+ // subgroup, and further, whether we can have ARF subgroup which contains
+ // the BWDREF subgroup but contained within the GF group:
+ //
+ // GF group --> ARF subgroup --> BWDREF subgroup
+ if (rc->source_alt_ref_pending) {
+ is_sg_bipred_enabled =
+ is_bipred_enabled &&
+ (subgroup_interval[which_arf] > rc->bipred_group_interval);
+ }
+
+ // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive
+ // frame group interval is strictly smaller than that of the GOLDEN
+ // FRAME group interval.
+ // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
+ if (is_sg_bipred_enabled && !bipred_group_end) {
+ const int cur_brf_src_offset = rc->bipred_group_interval - 1;
+
+ if (bipred_frame_index == 1) {
+ // --- BRF_UPDATE ---
+ gf_group->update_type[frame_index] = BRF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
+ } else if (bipred_frame_index == rc->bipred_group_interval) {
+ // --- LAST_BIPRED_UPDATE ---
+ gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->brf_src_offset[frame_index] = 0;
+
+ // Reset the bi-predictive frame index.
+ bipred_frame_index = 0;
+ } else {
+ // --- BIPRED_UPDATE ---
+ gf_group->update_type[frame_index] = BIPRED_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->brf_src_offset[frame_index] = 0;
+ }
+ gf_group->bidir_pred_enabled[frame_index] = 1;
+
+ bipred_frame_index++;
+ // Check whether the next bi-predictive frame group would entirely be
+ // included within the current golden frame group.
+ // In addition, we need to avoid coding a BRF right before an ARF.
+ if (bipred_frame_index == 1 &&
+ (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) {
+ bipred_group_end = 1;
+ }
+ } else {
+ gf_group->update_type[frame_index] = LF_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ }
+
+ ++frame_index;
+
+ // Check if we need to update the ARF.
+ if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 &&
+ frame_index > cpi->arf_pos_for_ovrly[which_arf]) {
+ --which_arf;
+ accumulative_subgroup_interval += subgroup_interval[which_arf] + 1;
+
+ // Meet the new subgroup; Reset the bipred_group_end flag.
+ bipred_group_end = 0;
+ // Insert another extra ARF after the overlay frame
+ if (which_arf) {
+ gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ }
+ }
+
+ // NOTE: We need to configure the frame at the end of the sequence + 1 that
+ // will be the start frame for the next group. Otherwise prior to the
+ // call to av1_rc_get_second_pass_params() the data will be undefined.
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+ cpi->arf_pos_in_gf[0] = 1;
+ if (cpi->num_extra_arfs) {
+ // Overwrite the update_type for extra-ARF's corresponding internal
+ // OVERLAY's: Change from LF_UPDATE to INTNL_OVERLAY_UPDATE.
+ for (i = cpi->num_extra_arfs; i > 0; --i) {
+ cpi->arf_pos_in_gf[i] =
+ (i == cpi->num_extra_arfs ? 2 : cpi->arf_pos_for_ovrly[i + 1] + 1);
+
+ gf_group->update_type[cpi->arf_pos_for_ovrly[i]] = INTNL_OVERLAY_UPDATE;
+ gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL;
+ }
+ }
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+}
+
+#if USE_SYMM_MULTI_LAYER
+#define LEAF_REDUCTION_FACTOR 0.75f
+#define LVL_3_BOOST_FACTOR 0.8f
+#define LVL_2_BOOST_FACTOR 0.3f
+
+static float_t lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
+ { 1, 0, 0 },
+ { LVL_3_BOOST_FACTOR, 0, 0 }, // Leaking budget works better
+ { LVL_3_BOOST_FACTOR, (1 - LVL_3_BOOST_FACTOR) * LVL_2_BOOST_FACTOR,
+ (1 - LVL_3_BOOST_FACTOR) * (1 - LVL_2_BOOST_FACTOR) }
+};
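+// Rows of the table are indexed by pyramid_height - 2 and columns by an
+// internal ARF's distance from the top pyramid level; see the
+// INTNL_OVERLAY_UPDATE handling in allocate_gf_group_bits() below.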
+#endif // USE_SYMM_MULTI_LAYER
+static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
+ double group_error, int gf_arf_bits) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ FIRSTPASS_STATS frame_stats;
+ int i;
+ int frame_index = 0;
+ int target_frame_size;
+ int key_frame;
+ const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+ int64_t total_group_bits = gf_group_bits;
+ double modified_err = 0.0;
+ double err_fraction;
+ int ext_arf_boost[MAX_EXT_ARFS];
+
+ define_gf_group_structure(cpi);
+
+ av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
+
+ key_frame = cpi->common.frame_type == KEY_FRAME;
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ if (!key_frame) {
+ if (rc->source_alt_ref_active)
+ gf_group->bit_allocation[frame_index] = 0;
+ else
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ // Step over the golden frame / overlay frame
+ if (EOF == input_stats(twopass, &frame_stats)) return;
+ }
+
+ // Deduct the boost bits for arf (or gf if it is not a key frame)
+ // from the group total.
+ if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+ frame_index++;
+
+ // Store the bits to spend on the ARF if there is one.
+ // === [frame_index == 1] ===
+ if (rc->source_alt_ref_pending) {
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ ++frame_index;
+
+    // Skip all the extra ARFs right after the ARF at the start of the
+    // current GF group.
+ if (cpi->num_extra_arfs) {
+ while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+ ++frame_index;
+ }
+ }
+
+ // Allocate bits to the other frames in the group.
+ for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+ if (EOF == input_stats(twopass, &frame_stats)) break;
+
+ modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+
+ if (group_error > 0)
+ err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
+ else
+ err_fraction = 0.0;
+
+ target_frame_size = (int)((double)total_group_bits * err_fraction);
+
+ target_frame_size =
+ clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits));
+
+ if (gf_group->update_type[frame_index] == BRF_UPDATE) {
+ // Boost up the allocated bits on BWDREF_FRAME
+ gf_group->bit_allocation[frame_index] =
+ target_frame_size + (target_frame_size >> 2);
+ } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
+      // Reduce the allocated bits on LAST_BIPRED_UPDATE frames
+ gf_group->bit_allocation[frame_index] =
+ target_frame_size - (target_frame_size >> 1);
+ } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
+ // TODO(zoeliu): To investigate whether the allocated bits on
+ // BIPRED_UPDATE frames need to be further adjusted.
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+#if USE_SYMM_MULTI_LAYER
+ } else if (cpi->new_bwdref_update_rule &&
+ gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+ gf_group->pyramid_height >= 0 &&
+ "non-valid height for a pyramid structure");
+
+ int arf_pos = gf_group->arf_pos_in_gf[frame_index];
+ gf_group->bit_allocation[frame_index] = 0;
+
+ gf_group->bit_allocation[arf_pos] = target_frame_size;
+#if MULTI_LVL_BOOST_VBR_CQ
+ const int pyr_h = gf_group->pyramid_height - 2;
+ const int this_lvl = gf_group->pyramid_level[arf_pos];
+ const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
+
+ const float_t budget =
+ LEAF_REDUCTION_FACTOR * gf_group->pyramid_lvl_nodes[0];
+ const float_t lvl_boost = budget * lvl_budget_factor[pyr_h][dist2top] /
+ gf_group->pyramid_lvl_nodes[this_lvl];
+
+ gf_group->bit_allocation[arf_pos] += (int)(target_frame_size * lvl_boost);
+#endif // MULTI_LVL_BOOST_VBR_CQ
+#endif // USE_SYMM_MULTI_LAYER
+ } else {
+ assert(gf_group->update_type[frame_index] == LF_UPDATE ||
+ gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+#if MULTI_LVL_BOOST_VBR_CQ
+ if (cpi->new_bwdref_update_rule) {
+ gf_group->bit_allocation[frame_index] -=
+ (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
+ }
+#endif // MULTI_LVL_BOOST_VBR_CQ
+ }
+
+ ++frame_index;
+
+ // Skip all the extra-ARF's.
+ if (cpi->num_extra_arfs) {
+ while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+ ++frame_index;
+ }
+ }
+
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) {
+#else
+ if (rc->source_alt_ref_pending) {
+#endif
+ if (cpi->num_extra_arfs) {
+ // NOTE: For bit allocation, move the allocated bits associated with
+ // INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
+ // i > 0 for extra-ARF's and i == 0 for ARF:
+ // arf_pos_for_ovrly[i]: Position for INTNL_OVERLAY_UPDATE
+ // arf_pos_in_gf[i]: Position for INTNL_ARF_UPDATE
+ for (i = cpi->num_extra_arfs; i > 0; --i) {
+ assert(gf_group->update_type[cpi->arf_pos_for_ovrly[i]] ==
+ INTNL_OVERLAY_UPDATE);
+
+ // Encoder's choice:
+ // Set show_existing_frame == 1 for all extra-ARF's, and hence
+        // allocate zero bits to all internal OVERLAY frames.
+ gf_group->bit_allocation[cpi->arf_pos_in_gf[i]] =
+ gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]];
+ gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0;
+ }
+ }
+ }
+}
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+ int i;
+
+ double boost_score = 0.0;
+#if !CONFIG_FIX_GF_LENGTH
+ double old_boost_score = 0.0;
+ double mv_ratio_accumulator_thresh;
+ int active_max_gf_interval;
+ int active_min_gf_interval;
+#endif
+ double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+ double gf_group_raw_error = 0.0;
+#endif
+ double gf_group_skip_pct = 0.0;
+ double gf_group_inactive_zone_rows = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double zero_motion_accumulator = 1.0;
+
+ double loop_decay_rate = 1.00;
+ double last_loop_decay_rate = 1.00;
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+
+ unsigned int allow_alt_ref = is_altref_enabled(cpi);
+
+ int f_boost = 0;
+ int b_boost = 0;
+ int flash_detected;
+ int64_t gf_group_bits;
+ double gf_group_error_left;
+ int gf_arf_bits;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+
+ cpi->extra_arf_allowed = 1;
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (is_key_frame == 0) {
+ av1_zero(twopass->gf_group);
+ }
+
+ aom_clear_system_state();
+ av1_zero(next_frame);
+
+ // Load stats for the current frame.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Note the error of the frame at the start of the group. This will be
+ // the GF frame error if we code a normal gf.
+ gf_first_frame_err = mod_frame_err;
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ if (arf_active_or_kf) {
+ gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error -= this_frame->coded_error;
+#endif
+ gf_group_skip_pct -= this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+ }
+#if !CONFIG_FIX_GF_LENGTH
+ // Motion breakout threshold for loop below depends on image size.
+ mv_ratio_accumulator_thresh =
+ (cpi->initial_height + cpi->initial_width) / 4.0;
+ // Set a maximum and minimum interval for the GF group.
+ // If the image appears almost completely static we can extend beyond this.
+ {
+ int int_max_q = (int)(av1_convert_qindex_to_q(
+ twopass->active_worst_quality, cpi->common.seq_params.bit_depth));
+ int int_lbq = (int)(av1_convert_qindex_to_q(
+ rc->last_boosted_qindex, cpi->common.seq_params.bit_depth));
+
+ active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
+ if (active_min_gf_interval > rc->max_gf_interval)
+ active_min_gf_interval = rc->max_gf_interval;
+
+ // The value chosen depends on the active Q range. At low Q we have
+ // bits to spare and are better with a smaller interval and smaller boost.
+ // At high Q when there are few bits to spare we are better with a longer
+ // interval to spread the cost of the GF.
+ active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
+
+ // We have: active_min_gf_interval <= rc->max_gf_interval
+ if (active_max_gf_interval < active_min_gf_interval)
+ active_max_gf_interval = active_min_gf_interval;
+ else if (active_max_gf_interval > rc->max_gf_interval)
+ active_max_gf_interval = rc->max_gf_interval;
+ }
+#endif // !CONFIG_FIX_GF_LENGTH
+ double avg_sr_coded_error = 0;
+ double avg_raw_err_stdev = 0;
+ int non_zero_stdev_count = 0;
+
+ i = 0;
+ while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+ ++i;
+
+ // Accumulate error score of frames in this gf group.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
+ gf_group_skip_pct += this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(twopass, 0);
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ // sum up the metric values of current gf group
+ avg_sr_coded_error += next_frame.sr_coded_error;
+ if (fabs(next_frame.raw_error_stdev) > 0.000001) {
+ non_zero_stdev_count++;
+ avg_raw_err_stdev += next_frame.raw_error_stdev;
+ }
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ last_loop_decay_rate = loop_decay_rate;
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = AOMMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+ last_loop_decay_rate)) {
+ allow_alt_ref = 0;
+ break;
+ }
+ }
+
+ // Calculate a boost number for this frame.
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+#if CONFIG_FIX_GF_LENGTH
+ if (i == (FIXED_GF_LENGTH + 1)) break;
+#else
+    // These break-out conditions are not used with CONFIG_FIX_GF_LENGTH.
+ // Break out conditions.
+ if (
+ // Break at active_max_gf_interval unless almost totally static.
+ (i >= (active_max_gf_interval + arf_active_or_kf) &&
+ zero_motion_accumulator < 0.995) ||
+ (
+ // Don't break out with a very short interval.
+ (i >= active_min_gf_interval + arf_active_or_kf) &&
+ (!flash_detected) &&
+ ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ||
+ ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
+ // If GF group interval is < 12, we force it to be 8. Otherwise,
+ // if it is >= 12, we keep it as is.
+ // NOTE: 'i' is 1 more than the GF group interval candidate that is being
+ // checked.
+ if (i == (8 + 1) || i >= (12 + 1)) {
+ boost_score = old_boost_score;
+ break;
+ }
+ }
+ old_boost_score = boost_score;
+#endif // CONFIG_FIX_GF_LENGTH
+ *this_frame = next_frame;
+ }
+ twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+
+ // Was the group length constrained by the requirement for a new KF?
+ rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+ assert(num_mbs > 0);
+ if (i) avg_sr_coded_error /= i;
+
+ if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
+
+ // Disable extra altrefs and backward refs for "still" gf group:
+ // zero_motion_accumulator: minimum percentage of (0,0) motion;
+ // avg_sr_coded_error: average of the SSE per pixel of each frame;
+ // avg_raw_err_stdev: average of the standard deviation of (0,0)
+ // motion error per block of each frame.
+ const int disable_bwd_extarf =
+ (zero_motion_accumulator > MIN_ZERO_MOTION &&
+ avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+ avg_raw_err_stdev < MAX_RAW_ERR_VAR);
+
+ if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+ int alt_offset = 0;
+#if REDUCE_LAST_GF_LENGTH
+  // TODO(weitinglin): The length reduction strategy is tuned using AOM_Q
+  // mode and hurts the performance of VBR mode. We need to investigate how
+  // to adjust the GF length for other modes.
+
+ int allow_gf_length_reduction =
+ cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0;
+
+  // We are going to have an alt ref, but we don't do the adjustment for
+  // lossless mode.
+ if (allow_alt_ref && allow_gf_length_reduction &&
+ (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval) &&
+ !is_lossless_requested(&cpi->oxcf)) {
+ // Adjust the length of this gf group if one of the following conditions
+ // is met:
+ // 1: only one overlay frame is left and this gf group is too long.
+ // 2: the next gf group would be too short to have an arf compared to this
+ //    one.
+
+ // maximum length of next gf group
+ const int next_gf_len = rc->frames_to_key - i;
+ const int single_overlay_left =
+ next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+ // The next gf group is probably going to have an ARF but it will be
+ // shorter than this gf group.
+ const int unbalanced_gf =
+ i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 >= rc->min_gf_interval;
+
+ if (single_overlay_left || unbalanced_gf) {
+ // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but it does not work
+ // better in the current setting.
+ const int roll_back = REDUCE_GF_LENGTH_BY;
+ alt_offset = -roll_back;
+ i -= roll_back;
+ }
+ }
+#endif
+
+ // Should we use the alternate reference frame?
+ if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+ (i >= rc->min_gf_interval)) {
+ // Calculate the boost for alt ref.
+ rc->gfu_boost =
+ calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
+ rc->source_alt_ref_pending = 1;
+
+ // Do not replace ARFs with overlay frames; keep them as GOLDEN_REF.
+ cpi->preserve_arf_as_gld = 1;
+ } else {
+ rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
+ rc->source_alt_ref_pending = 0;
+ cpi->preserve_arf_as_gld = 0;
+ }
+
+ // Set the interval until the next gf.
+ // If forward keyframes are enabled, ensure the final gf group obeys the
+ // MIN_FWD_KF_INTERVAL.
+ if (cpi->oxcf.fwd_kf_enabled &&
+ ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
+ if (i == rc->frames_to_key) {
+ rc->baseline_gf_interval = i;
+ // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
+ } else if ((rc->frames_to_key - i <
+ AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
+ (rc->frames_to_key != i)) {
+ // if possible, merge the last two gf groups
+ if (rc->frames_to_key <= MAX_PYRAMID_SIZE) {
+ rc->baseline_gf_interval = rc->frames_to_key;
+ // if merging the last two gf groups creates a group that is too long,
+ // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
+ } else {
+ rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+ }
+ } else {
+ rc->baseline_gf_interval =
+ i - (is_key_frame || rc->source_alt_ref_pending);
+ }
+ } else {
+ rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+ }
+
+#if REDUCE_LAST_ALT_BOOST
+#define LAST_ALR_BOOST_FACTOR 0.2f
+ rc->arf_boost_factor = 1.0;
+ if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+ // Reduce the boost of altref in the last gf group
+ if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+ rc->frames_to_key - i == 0) {
+ rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+ }
+ }
+#endif
+
+ if (!cpi->extra_arf_allowed) {
+ cpi->num_extra_arfs = 0;
+ } else {
+#if USE_SYMM_MULTI_LAYER
+ if (rc->baseline_gf_interval == 4 && rc->source_alt_ref_pending)
+ cpi->num_extra_arfs = 1;
+ else
+ cpi->num_extra_arfs = get_number_of_extra_arfs(
+ rc->baseline_gf_interval, rc->source_alt_ref_pending);
+#else
+ // Compute how many extra alt_refs we can have
+ cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
+ rc->source_alt_ref_pending);
+#endif // USE_SYMM_MULTI_LAYER
+ }
+
+#if !USE_SYMM_MULTI_LAYER
+ // Currently at most two extra ARFs are allowed.
+ assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
+#endif
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+ rc->bipred_group_interval = BFG_INTERVAL;
+ // The minimum bi-predictive frame group interval is 2; intervals below
+ // that disable bi-predictive groups.
+ if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame =
+ (int)(gf_group_bits / rc->baseline_gf_interval);
+ const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+ const double group_av_skip_pct =
+ gf_group_skip_pct / rc->baseline_gf_interval;
+ const double group_av_inactive_zone =
+ ((gf_group_inactive_zone_rows * 2) /
+ (rc->baseline_gf_interval * (double)cm->mb_rows));
+
+ int tmp_q;
+ // rc factor is a weight factor that corrects for local rate control drift.
+ double rc_factor = 1.0;
+ if (rc->rate_error_estimate > 0) {
+ rc_factor = AOMMAX(RC_FACTOR_MIN,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ } else {
+ rc_factor = AOMMIN(RC_FACTOR_MAX,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ }
+ tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
+ twopass->active_worst_quality =
+ AOMMAX(tmp_q, twopass->active_worst_quality >> 1);
+ }
+#endif
+
+ // Calculate the extra bits to be used for boosted frame(s)
+ gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
+ gf_group_bits);
+
+ // Adjust KF group bits and error remaining.
+ twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+ // If this is an arf update we want to remove the score for the overlay
+ // frame at the end which will usually be very cheap to code.
+ // The overlay frame has already, in effect, been coded so we want to spread
+ // the remaining bits among the other frames.
+ // For normal GFs remove the score for the GF itself unless this is
+ // also a key frame in which case it has already been accounted for.
+ if (rc->source_alt_ref_pending) {
+ gf_group_error_left = gf_group_err - mod_frame_err;
+ } else if (is_key_frame == 0) {
+ gf_group_error_left = gf_group_err - gf_first_frame_err;
+ } else {
+ gf_group_error_left = gf_group_err;
+ }
+
+ // Allocate bits to each of the frames in the GF group.
+ allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
+ }
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+ // way helps catch scene cuts in clips with very flat areas or letterbox
+ // format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+ // In such a case, even if the frame is not a scene cut, coding a key frame
+ // may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+ // For real scene cuts we expect an improvement in the intra/inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+
+static int test_candidate_kf(TWO_PASS *twopass,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *next_frame) {
+ int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+ double modified_pcnt_inter =
+ this_frame->pcnt_inter - this_frame->pcnt_neutral;
+
+ // Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
+ // If so, then examine how well it predicts subsequent frames.
+ if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+ ((this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ ((next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
+ int i;
+ const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+ FIRSTPASS_STATS local_next_frame = *next_frame;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+
+ // Examine how well the key frame predicts subsequent frames.
+ for (i = 0; i < 16; ++i) {
+ double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+ if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+ // Cumulative effect of decay in prediction quality.
+ if (local_next_frame.pcnt_inter > 0.85)
+ decay_accumulator *= local_next_frame.pcnt_inter;
+ else
+ decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+ // Keep a running total.
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses.
+ if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+ (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+ 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (local_next_frame.intra_error < 200)) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ // Get the next frame details
+ if (EOF == input_stats(twopass, &local_next_frame)) break;
+ }
+
+ // If there is tolerable prediction for at least the next 3 frames then
+ // accept it as a viable key frame, else discard this candidate and move on.
+ if (boost_score > 30.0 && (i > 3)) {
+ is_viable_kf = 1;
+ } else {
+ // Reset the file position
+ reset_fpf_position(twopass, start_pos);
+
+ is_viable_kf = 0;
+ }
+ }
+
+ return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ int i, j;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const FIRSTPASS_STATS first_frame = *this_frame;
+ const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS last_frame;
+ int kf_bits = 0;
+ int loop_decay_counter = 0;
+ double decay_accumulator = 1.0;
+ double av_decay_accumulator = 0.0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0.0;
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+
+ av1_zero(next_frame);
+
+ cpi->common.frame_type = KEY_FRAME;
+
+ // Reset the GF group data structures.
+ av1_zero(*gf_group);
+
+ // Is this a forced key frame by interval?
+ rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ rc->frames_to_key = 1;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0; // Group modified error score.
+
+ kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+ // Find the next keyframe.
+ i = 0;
+ while (twopass->stats_in < twopass->stats_in_end &&
+ rc->frames_to_key < cpi->oxcf.key_freq) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Load the next frame's stats.
+ last_frame = *this_frame;
+ input_stats(twopass, this_frame);
+
+ // Provided that we are not at the end of the file...
+ if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+ double loop_decay_rate;
+
+ // Check for a scene cut.
+ if (test_candidate_kf(twopass, &last_frame, this_frame,
+ twopass->stats_in))
+ break;
+
+ // How fast is the prediction quality decaying?
+ loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+
+ // We want to know something about the recent past... rather than, as used
+ // elsewhere, the decay in prediction quality since the last GF or KF.
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
+ loop_decay_rate, decay_accumulator))
+ break;
+
+ // Step on to the next frame.
+ ++rc->frames_to_key;
+
+ // If we don't have a real key frame within the next two
+ // key_freq intervals then break out of the loop.
+ if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+ } else {
+ ++rc->frames_to_key;
+ }
+ ++i;
+ }
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already break out of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+ FIRSTPASS_STATS tmp_frame = first_frame;
+
+ rc->frames_to_key /= 2;
+
+ // Reset to the start of the group.
+ reset_fpf_position(twopass, start_position);
+
+ kf_group_err = 0.0;
+
+ // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+ input_stats(twopass, &tmp_frame);
+ }
+ rc->next_key_frame_forced = 1;
+ } else if (twopass->stats_in == twopass->stats_in_end ||
+ rc->frames_to_key >= cpi->oxcf.key_freq) {
+ rc->next_key_frame_forced = 1;
+ } else {
+ rc->next_key_frame_forced = 0;
+ }
+
+ // Special case for the last key frame of the file.
+ if (twopass->stats_in >= twopass->stats_in_end) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits = (int64_t)(
+ twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+ // Reset the first pass file position.
+ reset_fpf_position(twopass, start_position);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ const double kf_max_boost =
+ cpi->oxcf.rc_mode == AOM_Q
+ ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+ KF_MAX_FRAME_BOOST)
+ : KF_MAX_FRAME_BOOST;
+ for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = AOMMIN(zero_motion_accumulator,
+ get_zero_motion_factor(cpi, &next_frame));
+
+ // Not all frames in the group are necessarily used in calculating boost.
+ if ((i <= rc->max_gf_interval) ||
+ ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+ const double frame_boost =
+ calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
+
+ // How fast is prediction quality decaying?
+ if (!detect_flash(twopass, 0)) {
+ const double loop_decay_rate =
+ get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator *= loop_decay_rate;
+ decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
+ av_decay_accumulator += decay_accumulator;
+ ++loop_decay_counter;
+ }
+ boost_score += (decay_accumulator * frame_boost);
+ }
+ }
+ if (loop_decay_counter > 0)
+ av_decay_accumulator /= (double)loop_decay_counter;
+
+ reset_fpf_position(twopass, start_position);
+
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_in_end, rc->frames_to_key);
+
+ // Apply various clamps for min and max boost
+ rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+ rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
+ rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+
+ // Work out how many bits to allocate for the key frame itself.
+ kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+ twopass->kf_group_bits);
+ // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
+ // kf_bits, twopass->kf_zeromotion_pct);
+
+ // Work out the fraction of the kf group bits reserved for the inter frames
+ // within the group after discounting the bits for the kf itself.
+ if (twopass->kf_group_bits) {
+ twopass->kfgroup_inter_fraction =
+ (double)(twopass->kf_group_bits - kf_bits) /
+ (double)twopass->kf_group_bits;
+ } else {
+ twopass->kfgroup_inter_fraction = 1.0;
+ }
+
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+ gf_group->rf_level[0] = KF_STD;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->modified_error_left -= kf_group_err;
+}
+
+// Define the reference buffers that will be updated post encode.
+static void configure_buffer_updates(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+
+ // NOTE(weitinglin): Should we define another function to take care of
+ // cpi->rc.is_$Source_Type so that this function matches its comment?
+
+ cpi->rc.is_src_frame_alt_ref = 0;
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
+ cpi->rc.is_src_frame_ext_arf = 0;
+
+ switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
+ case KF_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_alt2_ref_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ break;
+
+ case LF_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ case GF_UPDATE:
+ // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
+ // needed.
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ case OVERLAY_UPDATE:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case ARF_UPDATE:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 1;
+ break;
+
+ case BRF_UPDATE:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_bwd_ref_frame = 1;
+ break;
+
+ case LAST_BIPRED_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_last_bipred_frame = 1;
+ break;
+
+ case BIPRED_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_bipred_frame = 1;
+ break;
+
+ case INTNL_OVERLAY_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->rc.is_src_frame_ext_arf = 1;
+ break;
+
+ case INTNL_ARF_UPDATE:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 1) {
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_alt2_ref_frame = 0;
+ } else {
+#endif
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 1;
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ default: assert(0); break;
+ }
+}
+
+void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi,
+ FRAME_UPDATE_TYPE update_type) {
+ RATE_CONTROL *rc = &cpi->rc;
+
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ rc->is_bwd_ref_frame = 0;
+
+ switch (update_type) {
+ case ARF_UPDATE:
+ cpi->refresh_alt_ref_frame = 1;
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+
+ rc->is_src_frame_alt_ref = 0;
+ break;
+ case INTNL_ARF_UPDATE:
+ cpi->refresh_alt2_ref_frame = 1;
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ rc->is_src_frame_alt_ref = 0;
+ rc->is_src_frame_ext_arf = 0;
+
+ break;
+ case BIPRED_UPDATE:
+ cpi->refresh_bwd_ref_frame = 1;
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ rc->is_bwd_ref_frame = 1;
+ break;
+ default: break;
+ }
+}
+
+static int is_skippable_frame(const AV1_COMP *cpi) {
+ // If the current frame has no non-zero motion vectors detected in the
+ // first pass, and neither do its previous and forward frames, then this
+ // frame can skip the partition check, and the partition size is assigned
+ // according to the variance.
+ const TWO_PASS *const twopass = &cpi->twopass;
+
+ return (!frame_is_intra_only(&cpi->common) &&
+ twopass->stats_in - 2 > twopass->stats_in_start &&
+ twopass->stats_in < twopass->stats_in_end &&
+ (twopass->stats_in - 1)->pcnt_inter -
+ (twopass->stats_in - 1)->pcnt_motion ==
+ 1 &&
+ (twopass->stats_in - 2)->pcnt_inter -
+ (twopass->stats_in - 2)->pcnt_motion ==
+ 1 &&
+ twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
+
+void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ int frames_left;
+ FIRSTPASS_STATS this_frame;
+
+ int target_rate;
+
+ frames_left = (int)(twopass->total_stats.count - cm->current_video_frame);
+
+ if (!twopass->stats_in) return;
+
+ // If this is an arf frame then we don't want to read the stats file or
+ // advance the input pointer as we already have what we need.
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+ gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+ configure_buffer_updates(cpi);
+ target_rate = gf_group->bit_allocation[gf_group->index];
+ target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
+ rc->base_frame_target = target_rate;
+
+ if (cpi->no_show_kf) {
+ assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
+ cm->frame_type = KEY_FRAME;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
+ return;
+ }
+
+ aom_clear_system_state();
+
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ twopass->active_worst_quality = cpi->oxcf.cq_level;
+ } else if (cm->current_video_frame == 0) {
+ // Special case code for first frame.
+ const int section_target_bandwidth =
+ (int)(twopass->bits_left / frames_left);
+ const double section_length = twopass->total_left_stats.count;
+ const double section_error =
+ twopass->total_left_stats.coded_error / section_length;
+ const double section_intra_skip =
+ twopass->total_left_stats.intra_skip_pct / section_length;
+ const double section_inactive_zone =
+ (twopass->total_left_stats.inactive_zone_rows * 2) /
+ ((double)cm->mb_rows * section_length);
+ const int tmp_q = get_twopass_worst_quality(
+ cpi, section_error, section_intra_skip + section_inactive_zone,
+ section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+ twopass->active_worst_quality = tmp_q;
+ twopass->baseline_active_worst_quality = tmp_q;
+ rc->ni_av_qi = tmp_q;
+ rc->last_q[INTER_FRAME] = tmp_q;
+ rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
+ rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+ rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+ rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+ }
+
+ av1_zero(this_frame);
+ if (EOF == input_stats(twopass, &this_frame)) return;
+
+ // Set the frame content type flag.
+ if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+ twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+ else
+ twopass->fr_content_type = FC_NORMAL;
+
+ // Keyframe and section processing.
+ if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
+ // Define next KF group and assign bits to it.
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (rc->frames_till_gf_update_due == 0) {
+ define_gf_group(cpi, &this_frame);
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if ARF_STATS_OUTPUT
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10d %10d %10d %10d\n", cm->current_video_frame,
+ rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
+ rc->gfu_boost);
+
+ fclose(fpfile);
+ }
+#endif
+ }
+
+ configure_buffer_updates(cpi);
+
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
+ target_rate = gf_group->bit_allocation[gf_group->index];
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
+ else
+ target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
+
+ rc->base_frame_target = target_rate;
+
+ {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ // Convert the accumulated frame error values into average per-MB energy
+ // values in the log domain.
+ twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
+ twopass->frame_avg_haar_energy =
+ log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
+ }
+
+ // Update the total stats remaining structure.
+ subtract_stats(&twopass->total_left_stats, &this_frame);
+}
+
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+void av1_twopass_postencode_update(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int bits_used = rc->base_frame_target;
+
+ // VBR correction is done through rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
+
+ // Calculate the pct rc error.
+ if (rc->total_actual_bits) {
+ rc->rate_error_estimate =
+ (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+ rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+ } else {
+ rc->rate_error_estimate = 0;
+ }
+
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= bits_used;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((cpi->oxcf.rc_mode != AOM_Q) &&
+ (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
+ !cpi->rc.is_src_frame_alt_ref) {
+ const int maxq_adj_limit =
+ rc->worst_quality - twopass->active_worst_quality;
+ const int minq_adj_limit =
+ (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+ // Undershoot.
+ if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+ --twopass->extend_maxq;
+ if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+ ++twopass->extend_minq;
+ // Overshoot.
+ } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+ --twopass->extend_minq;
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ ++twopass->extend_maxq;
+ } else {
+ // Adjustment for extreme local overshoot.
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+
+ // Unwind undershoot or overshoot adjustment.
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ --twopass->extend_minq;
+ else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+
+ twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+ twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ rc->vbr_bits_off_target_fast =
+ AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+ // Fast adaptation of minQ if necessary to use up the extra bits.
+ if (rc->avg_frame_bandwidth) {
+ twopass->extend_minq_fast =
+ (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+ }
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else if (rc->vbr_bits_off_target_fast) {
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else {
+ twopass->extend_minq_fast = 0;
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
new file mode 100644
index 000000000..4b7325ae2
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
+
+#include "av1/common/enums.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if CONFIG_FP_MB_STATS
+
+#define FPMB_DCINTRA_MASK 0x01
+
+#define FPMB_MOTION_ZERO_MASK 0x02
+#define FPMB_MOTION_LEFT_MASK 0x04
+#define FPMB_MOTION_RIGHT_MASK 0x08
+#define FPMB_MOTION_UP_MASK 0x10
+#define FPMB_MOTION_DOWN_MASK 0x20
+
+#define FPMB_ERROR_SMALL_MASK 0x40
+#define FPMB_ERROR_LARGE_MASK 0x80
+#define FPMB_ERROR_SMALL_TH 2000
+#define FPMB_ERROR_LARGE_TH 48000
+
+typedef struct {
+ uint8_t *mb_stats_start;
+ uint8_t *mb_stats_end;
+} FIRSTPASS_MB_STATS;
+#endif
+
+// Length of the bi-predictive frame group (BFG)
+// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
+// number of bi-predictive frames.
+#define BFG_INTERVAL 2
+ // The maximum number of extra ALTREFs besides ALTREF_FRAME
+#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
+
+#define MIN_EXT_ARF_INTERVAL 4
+
+#define MIN_ZERO_MOTION 0.95
+#define MAX_SR_CODED_ERROR 40
+#define MAX_RAW_ERR_VAR 2000
+#define MIN_MV_IN_OUT 0.4
+
+#define VLOW_MOTION_THRESHOLD 950
+
+typedef struct {
+ double frame;
+ double weight;
+ double intra_error;
+ double frame_avg_wavelet_energy;
+ double coded_error;
+ double sr_coded_error;
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ double intra_skip_pct;
+ double inactive_zone_rows; // Image mask rows top and bottom.
+ double inactive_zone_cols; // Image mask columns at left and right edges.
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double new_mv_count;
+ double duration;
+ double count;
+ // standard deviation for (0, 0) motion prediction error
+ double raw_error_stdev;
+} FIRSTPASS_STATS;
+
+typedef enum {
+ KF_UPDATE = 0,
+ LF_UPDATE = 1,
+ GF_UPDATE = 2,
+ ARF_UPDATE = 3,
+ OVERLAY_UPDATE = 4,
+ BRF_UPDATE = 5, // Backward Reference Frame
+ LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame
+ BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one
+ INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame
+ INTNL_ARF_UPDATE = 9, // Internal Altref Frame (candidate for ALTREF2)
+ FRAME_UPDATE_TYPES = 10
+} FRAME_UPDATE_TYPE;
+
+#define FC_ANIMATION_THRESH 0.15
+typedef enum {
+ FC_NORMAL = 0,
+ FC_GRAPHICS_ANIMATION = 1,
+ FRAME_CONTENT_TYPES = 2
+} FRAME_CONTENT_TYPE;
+
+typedef struct {
+ unsigned char index;
+ RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
+ FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+#if USE_SYMM_MULTI_LAYER
+ unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char pyramid_height;
+ unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
+#endif
+ unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES];
+ unsigned char refresh_idx[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char refresh_flag[(MAX_LAG_BUFFERS * 2) + 1];
+ int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+} GF_GROUP;
+
+typedef struct {
+ unsigned int section_intra_rating;
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
+ const FIRSTPASS_STATS *stats_in;
+ const FIRSTPASS_STATS *stats_in_start;
+ const FIRSTPASS_STATS *stats_in_end;
+ FIRSTPASS_STATS total_left_stats;
+ int first_pass_done;
+ int64_t bits_left;
+ double modified_error_min;
+ double modified_error_max;
+ double modified_error_left;
+ double mb_av_energy;
+ double frame_avg_haar_energy;
+
+#if CONFIG_FP_MB_STATS
+ uint8_t *frame_mb_stats_buf;
+ uint8_t *this_frame_mb_stats;
+ FIRSTPASS_MB_STATS firstpass_mb_stats;
+#endif
+ // An indication of the content type of the current frame
+ FRAME_CONTENT_TYPE fr_content_type;
+
+ // Projected total bits available for a key frame group of frames
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ int64_t kf_group_error_left;
+
+ // The fraction of a kf group's total bits allocated to the inter frames
+ double kfgroup_inter_fraction;
+
+ int sr_update_lag;
+
+ int kf_zeromotion_pct;
+ int last_kfgroup_zeromotion_pct;
+ int gf_zeromotion_pct;
+ int active_worst_quality;
+ int baseline_active_worst_quality;
+ int extend_minq;
+ int extend_maxq;
+ int extend_minq_fast;
+
+ GF_GROUP gf_group;
+} TWO_PASS;
+
+struct AV1_COMP;
+
+void av1_init_first_pass(struct AV1_COMP *cpi);
+void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
+void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source);
+void av1_end_first_pass(struct AV1_COMP *cpi);
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
+void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
+ FRAME_UPDATE_TYPE update_type);
+
+// Post encode update of the rate control parameters for 2-pass
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
+ if (arf_pending && MAX_EXT_ARFS > 0)
+ return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
+ ? MAX_EXT_ARFS
+ : interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS
+ ? MAX_EXT_ARFS - 1
+ : 0;
+ else
+ return 0;
+}
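+
+// For illustration, assuming MAX_EXT_ARFS works out to 2 (its value depends
+// on the REF_FRAMES/BWDREF_FRAME enums, which are defined elsewhere) and with
+// MIN_EXT_ARF_INTERVAL == 4 as defined above:
+//   get_number_of_extra_arfs(12, 1) -> 2   (12 >= 4 * (2 + 1))
+//   get_number_of_extra_arfs(9, 1)  -> 1   (9 >= 4 * 2 but below 12)
+//   get_number_of_extra_arfs(6, 1)  -> 0   (interval too short)
+//   get_number_of_extra_arfs(12, 0) -> 0   (no ARF pending)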
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
new file mode 100644
index 000000000..e9f8b0bb4
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "av1/encoder/global_motion.h"
+
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/corner_match.h"
+#include "av1/encoder/ransac.h"
+
+#define MAX_CORNERS 4096
+#define MIN_INLIER_PROB 0.1
+
+#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
+
+// Border over which to compute the global motion
+#define ERRORADV_BORDER 0
+
+static const double erroradv_tr[] = { 0.65, 0.60, 0.55 };
+static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
+
+int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
+ int erroradv_type) {
+ assert(erroradv_type < GM_ERRORADV_TR_TYPES);
+ return best_erroradvantage < erroradv_tr[erroradv_type] &&
+ best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type];
+}
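+
+// For illustration, with erroradv_type == 0 a candidate model is accepted only
+// if best_erroradvantage < 0.65 and best_erroradvantage * params_cost < 20000.
+// A model with params_cost == 40000 therefore needs an error advantage below
+// 0.5, while one with params_cost == 20000 is limited by the 0.65 threshold.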
+
+static void convert_to_params(const double *params, int32_t *model) {
+ int i;
+ int alpha_present = 0;
+ model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+ model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+
+ for (i = 2; i < 6; ++i) {
+ const int diag_value = ((i == 2 || i == 5) ? (1 << GM_ALPHA_PREC_BITS) : 0);
+ model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
+ model[i] =
+ (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX);
+ alpha_present |= (model[i] != 0);
+ model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR;
+ }
+ for (; i < 8; ++i) {
+ model[i] = (int32_t)floor(params[i] * (1 << GM_ROW3HOMO_PREC_BITS) + 0.5);
+ model[i] = (int32_t)clamp(model[i], GM_ROW3HOMO_MIN, GM_ROW3HOMO_MAX) *
+ GM_ROW3HOMO_DECODE_FACTOR;
+ alpha_present |= (model[i] != 0);
+ }
+
+ if (!alpha_present) {
+ if (abs(model[0]) < MIN_TRANS_THRESH && abs(model[1]) < MIN_TRANS_THRESH) {
+ model[0] = 0;
+ model[1] = 0;
+ }
+ }
+}
+
+void convert_model_to_params(const double *params, WarpedMotionParams *model) {
+ convert_to_params(params, model->wmmat);
+ model->wmtype = get_gmtype(model);
+ model->invalid = 0;
+}
+
+// Adds some offset to a global motion parameter and handles
+// all of the necessary precision shifts, clamping, and
+// zero-centering.
+static int32_t add_param_offset(int param_index, int32_t param_value,
+ int32_t offset) {
+ const int scale_vals[3] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF,
+ GM_ROW3HOMO_PREC_DIFF };
+ const int clamp_vals[3] = { GM_TRANS_MAX, GM_ALPHA_MAX, GM_ROW3HOMO_MAX };
+ // type of param: 0 - translation, 1 - affine, 2 - homography
+ const int param_type = (param_index < 2 ? 0 : (param_index < 6 ? 1 : 2));
+ const int is_one_centered = (param_index == 2 || param_index == 5);
+
+ // Make parameter zero-centered and offset the shift that was done to make
+ // it compatible with the warped model
+ param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >>
+ scale_vals[param_type];
+ // Add desired offset to the rescaled/zero-centered parameter
+ param_value += offset;
+ // Clamp the parameter so it does not overflow the number of bits allotted
+ // to it in the bitstream
+ param_value = (int32_t)clamp(param_value, -clamp_vals[param_type],
+ clamp_vals[param_type]);
+ // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible
+ // with the warped motion library
+ param_value *= (1 << scale_vals[param_type]);
+
+ // Undo the zero-centering step if necessary
+ return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
+}
+
+static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
+ switch (wmtype) {
+ case IDENTITY:
+ wm->wmmat[0] = 0;
+ wm->wmmat[1] = 0;
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS;
+ wm->wmmat[3] = 0;
+ AOM_FALLTHROUGH_INTENDED;
+ case ROTZOOM:
+ wm->wmmat[4] = -wm->wmmat[3];
+ wm->wmmat[5] = wm->wmmat[2];
+ AOM_FALLTHROUGH_INTENDED;
+ case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break;
+ default: assert(0);
+ }
+ wm->wmtype = wmtype;
+}
+
+int64_t refine_integerized_param(WarpedMotionParams *wm,
+ TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height,
+ int r_stride, uint8_t *dst, int d_width,
+ int d_height, int d_stride, int n_refinements,
+ int64_t best_frame_error) {
+ static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+ const int border = ERRORADV_BORDER;
+ int i = 0, p;
+ int n_params = max_trans_model_params[wmtype];
+ int32_t *param_mat = wm->wmmat;
+ int64_t step_error, best_error;
+ int32_t step;
+ int32_t *param;
+ int32_t curr_param;
+ int32_t best_param;
+
+ force_wmtype(wm, wmtype);
+ best_error = av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border, border,
+ d_width - 2 * border, d_height - 2 * border,
+ d_stride, 0, 0, best_frame_error);
+ best_error = AOMMIN(best_error, best_frame_error);
+ step = 1 << (n_refinements - 1);
+ for (i = 0; i < n_refinements; i++, step >>= 1) {
+ for (p = 0; p < n_params; ++p) {
+ int step_dir = 0;
+ // Skip searches for parameters that are forced to be 0
+ param = param_mat + p;
+ curr_param = *param;
+ best_param = curr_param;
+ // look to the left
+ *param = add_param_offset(p, curr_param, -step);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border, border,
+ d_width - 2 * border, d_height - 2 * border, d_stride,
+ 0, 0, best_error);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = -1;
+ }
+
+ // look to the right
+ *param = add_param_offset(p, curr_param, step);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border, border,
+ d_width - 2 * border, d_height - 2 * border, d_stride,
+ 0, 0, best_error);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = 1;
+ }
+ *param = best_param;
+
+ // Keep stepping in the direction chosen above, at the current step size,
+ // until the error stops improving.
+ while (step_dir) {
+ *param = add_param_offset(p, best_param, step * step_dir);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border, border,
+ d_width - 2 * border, d_height - 2 * border,
+ d_stride, 0, 0, best_error);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ } else {
+ *param = best_param;
+ step_dir = 0;
+ }
+ }
+ }
+ }
+ force_wmtype(wm, wmtype);
+ wm->wmtype = get_gmtype(wm);
+ return best_error;
+}
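+
+// For illustration, with n_refinements == 5 the refinement loop above tries
+// offsets with step sizes 16, 8, 4, 2 and 1 (step starts at
+// 1 << (n_refinements - 1) and is halved on every pass).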
+
+static INLINE RansacFunc get_ransac_type(TransformationType type) {
+ switch (type) {
+ case AFFINE: return ransac_affine;
+ case ROTZOOM: return ransac_rotzoom;
+ case TRANSLATION: return ransac_translation;
+ default: assert(0); return NULL;
+ }
+}
+
+static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
+ int bit_depth) {
+ int i, j;
+ uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer);
+ uint8_t *buf_8bit = frm->y_buffer_8bit;
+ assert(buf_8bit);
+ if (!frm->buf_8bit_valid) {
+ for (i = 0; i < frm->y_height; ++i) {
+ for (j = 0; j < frm->y_width; ++j) {
+ buf_8bit[i * frm->y_stride + j] =
+ orig_buf[i * frm->y_stride + j] >> (bit_depth - 8);
+ }
+ }
+ frm->buf_8bit_valid = 1;
+ }
+ return buf_8bit;
+}
+
+int compute_global_motion_feature_based(TransformationType type,
+ YV12_BUFFER_CONFIG *frm,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ int *num_inliers_by_motion,
+ double *params_by_motion,
+ int num_motions) {
+ int i;
+ int num_frm_corners, num_ref_corners;
+ int num_correspondences;
+ int *correspondences;
+ int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS];
+ unsigned char *frm_buffer = frm->y_buffer;
+ unsigned char *ref_buffer = ref->y_buffer;
+ RansacFunc ransac = get_ransac_type(type);
+
+ if (frm->flags & YV12_FLAG_HIGHBITDEPTH) {
+ // The frame buffer is 16-bit, so we need to convert to 8 bits for the
+ // following code. We cache the result until the frame is released.
+ frm_buffer = downconvert_frame(frm, bit_depth);
+ }
+ if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
+ ref_buffer = downconvert_frame(ref, bit_depth);
+ }
+
+ // compute interest points in images using FAST features
+ num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height,
+ frm->y_stride, frm_corners, MAX_CORNERS);
+ num_ref_corners = fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
+ ref->y_stride, ref_corners, MAX_CORNERS);
+
+ // find correspondences between the two images
+ correspondences =
+ (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+ num_correspondences = determine_correspondence(
+ frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
+ (int *)ref_corners, num_ref_corners, frm->y_width, frm->y_height,
+ frm->y_stride, ref->y_stride, correspondences);
+
+ ransac(correspondences, num_correspondences, num_inliers_by_motion,
+ params_by_motion, num_motions);
+
+ free(correspondences);
+
+ // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+ for (i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
+ num_inliers_by_motion[i] = 0;
+ }
+ }
+
+ // Return true if any one of the motions has inliers.
+ for (i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] > 0) return 1;
+ }
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
new file mode 100644
index 000000000..c7c016c43
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANSAC_NUM_MOTIONS 1
+
+void convert_model_to_params(const double *params, WarpedMotionParams *model);
+
+int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
+ int erroradv_type);
+
+// Returns the av1_warp_error between "dst" and the result of applying the
+// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
+// modified in place.
+int64_t refine_integerized_param(WarpedMotionParams *wm,
+ TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height,
+ int r_stride, uint8_t *dst, int d_width,
+ int d_height, int d_stride, int n_refinements,
+ int64_t best_frame_error);
+
+/*
+ Computes "num_motions" candidate global motion parameters between two frames.
+ The array "params_by_motion" should be length 8 * "num_motions". The ordering
+ of each set of parameters is best described by the homography:
+
+       [x'     (m2 m3 m0   [x
+   z .  y'  =   m4 m5 m1 *  y
+        1]      m6 m7 1)    1]
+
+ where m{i} represents the ith value in any given set of parameters.
+
+ "num_inliers" should be length "num_motions", and will be populated with the
+ number of inlier feature points for each motion. Params for which the
+ num_inliers entry is 0 should be ignored by the caller.
+*/
+int compute_global_motion_feature_based(TransformationType type,
+ YV12_BUFFER_CONFIG *frm,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ int *num_inliers_by_motion,
+ double *params_by_motion,
+ int num_motions);
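+
+/* A minimal calling sketch (illustrative only; "frm" and "ref" are assumed to
+   be already-populated YV12_BUFFER_CONFIG pointers for an 8-bit source):
+
+     int inliers[RANSAC_NUM_MOTIONS];
+     double params[8 * RANSAC_NUM_MOTIONS];
+     WarpedMotionParams wm;
+     if (compute_global_motion_feature_based(ROTZOOM, frm, ref, 8, inliers,
+                                             params, RANSAC_NUM_MOTIONS) &&
+         inliers[0] > 0) {
+       convert_model_to_params(params, &wm);
+     }
+*/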
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
new file mode 100644
index 000000000..945dc3733
--- /dev/null
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+
+/* Test vectors for emulation of different film grain types.
+ * Note that bit depth would be derived from the bitstream and
+ * not signaled in film grain metadata. The parameters are valid
+ * for any bit depth.
+ */
+static aom_film_grain_t film_grain_test_vectors[16] = {
+ /* Test 1 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 16, 0 },
+ { 25, 136 },
+ { 33, 144 },
+ { 41, 160 },
+ { 48, 168 },
+ { 56, 136 },
+ { 67, 128 },
+ { 82, 144 },
+ { 97, 152 },
+ { 113, 144 },
+ { 128, 176 },
+ { 143, 168 },
+ { 158, 176 },
+ { 178, 184 } },
+ 14 /* num_points_y */,
+ { { 16, 0 },
+ { 20, 64 },
+ { 28, 88 },
+ { 60, 104 },
+ { 90, 136 },
+ { 105, 160 },
+ { 134, 168 },
+ { 168, 208 } },
+ 8 /* num_cb_points */,
+ { { 16, 0 },
+ { 28, 96 },
+ { 56, 80 },
+ { 66, 96 },
+ { 80, 104 },
+ { 108, 96 },
+ { 122, 112 },
+ { 137, 112 },
+ { 169, 176 } },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 247 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 18 /* cb_offset */,
+ 229 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 54 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /* chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 2 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cb_points */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 3 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 192 }, { 255, 192 } },
+ 2 /* num_points_y */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cb_points */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 1 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 4 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 137 },
+ { 53, 146 },
+ { 63, 155 },
+ { 78, 155 },
+ { 107, 150 },
+ { 122, 147 },
+ { 136, 147 },
+ { 166, 153 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 72 },
+ { 27, 82 },
+ { 33, 91 },
+ { 69, 121 },
+ { 95, 143 },
+ { 108, 154 },
+ { 134, 169 },
+ { 147, 177 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 24, 95 },
+ { 54, 93 },
+ { 65, 94 },
+ { 79, 98 },
+ { 109, 107 },
+ { 124, 119 },
+ { 139, 136 },
+ { 169, 170 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 5 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 96 },
+ { 32, 90 },
+ { 64, 83 },
+ { 96, 76 },
+ { 128, 68 },
+ { 159, 59 },
+ { 191, 48 },
+ { 223, 34 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 34 },
+ { 64, 48 },
+ { 96, 59 },
+ { 128, 68 },
+ { 159, 76 },
+ { 191, 83 },
+ { 223, 90 },
+ { 255, 96 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2,
+ -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0,
+ },
+ {
+ -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2,
+ 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1063 /* random_seed */
+ },
+ /* Test 6 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 96 },
+ { 20, 92 },
+ { 39, 88 },
+ { 59, 84 },
+ { 78, 80 },
+ { 98, 75 },
+ { 118, 70 },
+ { 137, 65 },
+ { 157, 60 },
+ { 177, 53 },
+ { 196, 46 },
+ { 216, 38 },
+ { 235, 27 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 2754 /* random_seed */
+ },
+ /* Test 7 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 27 },
+ { 39, 38 },
+ { 59, 46 },
+ { 78, 53 },
+ { 98, 60 },
+ { 118, 65 },
+ { 137, 70 },
+ { 157, 75 },
+ { 177, 80 },
+ { 196, 84 },
+ { 216, 88 },
+ { 235, 92 },
+ { 255, 96 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 8 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cb_points */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 9 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 10 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 11 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 48 },
+ { 32, 45 },
+ { 64, 42 },
+ { 96, 38 },
+ { 128, 34 },
+ { 159, 29 },
+ { 191, 24 },
+ { 223, 17 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 17 },
+ { 64, 24 },
+ { 96, 29 },
+ { 128, 34 },
+ { 159, 38 },
+ { 191, 42 },
+ { 223, 45 },
+ { 255, 48 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1357 /* random_seed */
+ },
+ /* Test 12 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 49 },
+ { 39, 69 },
+ { 46, 84 },
+ { 53, 91 },
+ { 63, 100 },
+ { 78, 114 },
+ { 92, 134 },
+ { 164, 139 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 31 },
+ { 26, 42 },
+ { 33, 54 },
+ { 40, 65 },
+ { 47, 72 },
+ { 56, 85 },
+ { 84, 123 },
+ { 152, 157 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 25, 14 },
+ { 39, 33 },
+ { 47, 40 },
+ { 54, 47 },
+ { 64, 62 },
+ { 79, 76 },
+ { 94, 83 },
+ { 167, 101 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 13 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 48 },
+ { 20, 46 },
+ { 39, 44 },
+ { 59, 42 },
+ { 78, 40 },
+ { 98, 38 },
+ { 118, 35 },
+ { 137, 33 },
+ { 157, 30 },
+ { 177, 27 },
+ { 196, 23 },
+ { 216, 19 },
+ { 235, 13 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 14 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 13 },
+ { 39, 19 },
+ { 59, 23 },
+ { 78, 27 },
+ { 98, 30 },
+ { 118, 33 },
+ { 137, 35 },
+ { 157, 38 },
+ { 177, 40 },
+ { 196, 42 },
+ { 216, 44 },
+ { 235, 46 },
+ { 255, 48 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 15 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 1 /* num_points_y */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cb_points */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 },
+ { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 },
+ { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 1 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 16 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 58, 126 },
+ { 87, 120 },
+ { 97, 122 },
+ { 112, 125 },
+ { 126, 131 },
+ { 141, 139 },
+ { 199, 153 },
+ },
+ 8 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 59, 68 },
+ { 66, 76 },
+ { 73, 82 },
+ { 79, 85 },
+ { 86, 86 },
+ { 151, 95 },
+ { 192, 101 },
+ },
+ 8 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 59, 64 },
+ { 89, 80 },
+ { 99, 86 },
+ { 114, 90 },
+ { 129, 93 },
+ { 144, 97 },
+ { 203, 85 },
+ },
+ 8 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 2 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+};
+#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
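As a quick illustration (not part of the patch itself), the sketch below shows how a test harness might pick one of the vectors above and patch in the stream-derived bit depth, which the header comment says is not carried in the grain metadata. The helper name, the 1-based index and the stream_bit_depth parameter are illustrative assumptions.

    /* Illustrative sketch; assumes grain_test_vectors.h and its
     * aom_film_grain_t type are in scope. */
    static aom_film_grain_t select_grain_test_vector(int test_index, /* 1..16 */
                                                     int stream_bit_depth) {
      aom_film_grain_t grain = film_grain_test_vectors[test_index - 1];
      /* Bit depth comes from the bitstream, so override the table entry. */
      grain.bit_depth = stream_bit_depth;
      return grain;
    }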
diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c
new file mode 100644
index 000000000..180115d9f
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/hash.h"
+
+static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
+ uint8_t *pData, uint32_t dataLength) {
+ for (uint32_t i = 0; i < dataLength; i++) {
+ const uint8_t index =
+ (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^
+ pData[i];
+ p_crc_calculator->remainder <<= 8;
+ p_crc_calculator->remainder ^= p_crc_calculator->table[index];
+ }
+}
+
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+ p_crc_calculator->remainder = 0;
+}
+
+static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) {
+ return p_crc_calculator->remainder & p_crc_calculator->final_result_mask;
+}
+
+static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) {
+ const uint32_t high_bit = 1 << (p_crc_calculator->bits - 1);
+ const uint32_t byte_high_bit = 1 << (8 - 1);
+
+ for (uint32_t value = 0; value < 256; value++) {
+ uint32_t remainder = 0;
+ for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) {
+ if (value & mask) {
+ remainder ^= high_bit;
+ }
+
+ if (remainder & high_bit) {
+ remainder <<= 1;
+ remainder ^= p_crc_calculator->trunc_poly;
+ } else {
+ remainder <<= 1;
+ }
+ }
+ p_crc_calculator->table[value] = remainder;
+ }
+}
+
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+ uint32_t truncPoly) {
+ p_crc_calculator->remainder = 0;
+ p_crc_calculator->bits = bits;
+ p_crc_calculator->trunc_poly = truncPoly;
+ p_crc_calculator->final_result_mask = (1 << bits) - 1;
+ crc_calculator_init_table(p_crc_calculator);
+}
+
+uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length) {
+ CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator;
+ crc_calculator_reset(p_crc_calculator);
+ crc_calculator_process_data(p_crc_calculator, p, length);
+ return crc_calculator_get_crc(p_crc_calculator);
+}
+
+/* CRC-32C (iSCSI) polynomial in reversed bit order. */
+#define POLY 0x82f63b78
+
+/* Construct table for software CRC-32C calculation. */
+void av1_crc32c_calculator_init(CRC32C *p_crc32c) {
+ uint32_t crc;
+
+ for (int n = 0; n < 256; n++) {
+ crc = n;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ p_crc32c->table[0][n] = crc;
+ }
+ for (int n = 0; n < 256; n++) {
+ crc = p_crc32c->table[0][n];
+ for (int k = 1; k < 8; k++) {
+ crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8);
+ p_crc32c->table[k][n] = crc;
+ }
+ }
+}
+
+/* Table-driven software version used as a fall-back. It is about 15 times
+   slower than using the hardware instructions. It assumes little-endian
+   integers, as is the case on the Intel processors that the accelerated
+   implementations target. */
+uint32_t av1_get_crc32c_value_c(CRC32C *p, uint8_t *buf, size_t len) {
+ const uint8_t *next = (const uint8_t *)(buf);
+ uint64_t crc;
+
+ crc = 0 ^ 0xffffffff;
+ while (len && ((uintptr_t)next & 7) != 0) {
+ crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ len--;
+ }
+ while (len >= 8) {
+ crc ^= *(uint64_t *)next;
+ crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^
+ p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^
+ p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^
+ p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56];
+ next += 8;
+ len -= 8;
+ }
+ while (len) {
+ crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ len--;
+ }
+ return (uint32_t)crc ^ 0xffffffff;
+}
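For orientation (this is not part of the patch), a minimal sketch of calling the software CRC-32C fall-back defined above; av1_get_crc32c_value_c is prototyped locally because, in this patch, hash.h only declares the table initializer, and the buffer contents are arbitrary test data.

    #include <stdint.h>
    #include <string.h>
    #include "av1/encoder/hash.h"

    /* Prototype mirrors the definition in hash.c above. */
    uint32_t av1_get_crc32c_value_c(CRC32C *p, uint8_t *buf, size_t len);

    static uint32_t crc32c_example(void) {
      CRC32C crc32c;
      uint8_t buf[64];
      memset(buf, 0xA5, sizeof(buf));
      av1_crc32c_calculator_init(&crc32c); /* build the 8x256 tables once */
      return av1_get_crc32c_value_c(&crc32c, buf, sizeof(buf));
    }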
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
new file mode 100644
index 000000000..826c004d6
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_H_
+#define AOM_AV1_ENCODER_HASH_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _crc_calculator {
+ uint32_t remainder;
+ uint32_t trunc_poly;
+ uint32_t bits;
+ uint32_t table[256];
+ uint32_t final_result_mask;
+} CRC_CALCULATOR;
+
+// Initialize the crc calculator. It must be executed at least once before
+// calling av1_get_crc_value().
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+ uint32_t truncPoly);
+uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length);
+
+// CRC32C: POLY = 0x82f63b78;
+typedef struct _CRC32C {
+ /* Table for a quadword-at-a-time software crc. */
+ uint32_t table[8][256];
+} CRC32C;
+
+// Initialize the table for the software CRC-32C implementation.
+void av1_crc32c_calculator_init(CRC32C *p_crc32c);
+
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_H_
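A short usage sketch (not part of the patch) for the generic CRC calculator declared above; the 24-bit width and the 0x5D6DCB polynomial mirror the first calculator that hash_motion.c sets up later in this patch, and the input bytes are dummies.

    #include <stdint.h>
    #include "av1/encoder/hash.h"

    static uint32_t crc24_example(void) {
      CRC_CALCULATOR calc;
      uint8_t pixels[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
      /* Must run before av1_get_crc_value(), as noted above. */
      av1_crc_calculator_init(&calc, 24, 0x5D6DCB);
      return av1_get_crc_value(&calc, pixels, (int)sizeof(pixels));
    }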
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
new file mode 100644
index 000000000..e85a516e8
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
+
+static const int crc_bits = 16;
+static const int block_size_bits = 3;
+
+static void hash_table_clear_all(hash_table *p_hash_table) {
+ if (p_hash_table->p_lookup_table == NULL) {
+ return;
+ }
+ int max_addr = 1 << (crc_bits + block_size_bits);
+ for (int i = 0; i < max_addr; i++) {
+ if (p_hash_table->p_lookup_table[i] != NULL) {
+ aom_vector_destroy(p_hash_table->p_lookup_table[i]);
+ aom_free(p_hash_table->p_lookup_table[i]);
+ p_hash_table->p_lookup_table[i] = NULL;
+ }
+ }
+}
+
+// TODO(youzhou@microsoft.com): is screen content with a bit depth higher than
+// 8 bits supported? If so, fix this function.
+static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride,
+ uint8_t *p_pixels_in1D) {
+ uint8_t *p_pel = y_src;
+ int index = 0;
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ p_pixels_in1D[index++] = p_pel[j];
+ }
+ p_pel += stride;
+ }
+}
+
+static void get_pixels_in_1D_short_array_by_block_2x2(uint16_t *y_src,
+ int stride,
+ uint16_t *p_pixels_in1D) {
+ uint16_t *p_pel = y_src;
+ int index = 0;
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 2; j++) {
+ p_pixels_in1D[index++] = p_pel[j];
+ }
+ p_pel += stride;
+ }
+}
+
+static int is_block_2x2_row_same_value(uint8_t *p) {
+ if (p[0] != p[1] || p[2] != p[3]) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block16_2x2_row_same_value(uint16_t *p) {
+ if (p[0] != p[1] || p[2] != p[3]) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block_2x2_col_same_value(uint8_t *p) {
+ if ((p[0] != p[2]) || (p[1] != p[3])) {
+ return 0;
+ }
+ return 1;
+}
+
+static int is_block16_2x2_col_same_value(uint16_t *p) {
+ if ((p[0] != p[2]) || (p[1] != p[3])) {
+ return 0;
+ }
+ return 1;
+}
+
+// The hash value hash_value1 consists of two parts: the upper 3 bits encode
+// the block size and the remaining 16 bits are the CRC value. This function
+// is used to get the 3-bit block-size index.
+static int hash_block_size_to_index(int block_size) {
+ switch (block_size) {
+ case 4: return 0;
+ case 8: return 1;
+ case 16: return 2;
+ case 32: return 3;
+ case 64: return 4;
+ case 128: return 5;
+ default: return -1;
+ }
+}
+
+void av1_hash_table_init(hash_table *p_hash_table, MACROBLOCK *x) {
+ if (x->g_crc_initialized == 0) {
+ av1_crc_calculator_init(&x->crc_calculator1, 24, 0x5D6DCB);
+ av1_crc_calculator_init(&x->crc_calculator2, 24, 0x864CFB);
+ x->g_crc_initialized = 1;
+ }
+ p_hash_table->p_lookup_table = NULL;
+}
+
+void av1_hash_table_destroy(hash_table *p_hash_table) {
+ hash_table_clear_all(p_hash_table);
+ aom_free(p_hash_table->p_lookup_table);
+ p_hash_table->p_lookup_table = NULL;
+}
+
+void av1_hash_table_create(hash_table *p_hash_table) {
+ if (p_hash_table->p_lookup_table != NULL) {
+ hash_table_clear_all(p_hash_table);
+ return;
+ }
+ const int max_addr = 1 << (crc_bits + block_size_bits);
+ p_hash_table->p_lookup_table =
+ (Vector **)aom_malloc(sizeof(p_hash_table->p_lookup_table[0]) * max_addr);
+ memset(p_hash_table->p_lookup_table, 0,
+ sizeof(p_hash_table->p_lookup_table[0]) * max_addr);
+}
+
+static void hash_table_add_to_table(hash_table *p_hash_table,
+ uint32_t hash_value,
+ block_hash *curr_block_hash) {
+ if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+ p_hash_table->p_lookup_table[hash_value] =
+ aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0]));
+ aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10,
+ sizeof(curr_block_hash[0]));
+ aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+ curr_block_hash);
+ } else {
+ aom_vector_push_back(p_hash_table->p_lookup_table[hash_value],
+ curr_block_hash);
+ }
+}
+
+int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) {
+ if (p_hash_table->p_lookup_table[hash_value] == NULL) {
+ return 0;
+ } else {
+ return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size);
+ }
+}
+
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+ uint32_t hash_value) {
+ assert(av1_hash_table_count(p_hash_table, hash_value) > 0);
+ return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]);
+}
+
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+ uint32_t hash_value2) {
+ if (p_hash_table->p_lookup_table[hash_value1] == NULL) {
+ return 0;
+ }
+ Iterator iterator =
+ aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]);
+ Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]);
+ for (; !iterator_equals(&iterator, &last); iterator_increment(&iterator)) {
+ if ((*(block_hash *)iterator_get(&iterator)).hash_value2 == hash_value2) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
+ uint32_t *pic_block_hash[2],
+ int8_t *pic_block_same_info[3],
+ MACROBLOCK *x) {
+ const int width = 2;
+ const int height = 2;
+ const int x_end = picture->y_crop_width - width + 1;
+ const int y_end = picture->y_crop_height - height + 1;
+
+ const int length = width * 2;
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t p[4];
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ get_pixels_in_1D_short_array_by_block_2x2(
+ CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride +
+ x_pos,
+ picture->y_stride, p);
+ pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p);
+ pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p);
+
+ pic_block_hash[0][pos] = av1_get_crc_value(
+ &x->crc_calculator1, (uint8_t *)p, length * sizeof(p[0]));
+ pic_block_hash[1][pos] = av1_get_crc_value(
+ &x->crc_calculator2, (uint8_t *)p, length * sizeof(p[0]));
+ pos++;
+ }
+ pos += width - 1;
+ }
+ } else {
+ uint8_t p[4];
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ get_pixels_in_1D_char_array_by_block_2x2(
+ picture->y_buffer + y_pos * picture->y_stride + x_pos,
+ picture->y_stride, p);
+ pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p);
+ pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p);
+
+ pic_block_hash[0][pos] =
+ av1_get_crc_value(&x->crc_calculator1, p, length * sizeof(p[0]));
+ pic_block_hash[1][pos] =
+ av1_get_crc_value(&x->crc_calculator2, p, length * sizeof(p[0]));
+ pos++;
+ }
+ pos += width - 1;
+ }
+ }
+}
+
+void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
+ int block_size,
+ uint32_t *src_pic_block_hash[2],
+ uint32_t *dst_pic_block_hash[2],
+ int8_t *src_pic_block_same_info[3],
+ int8_t *dst_pic_block_same_info[3],
+ MACROBLOCK *x) {
+ const int pic_width = picture->y_crop_width;
+ const int x_end = picture->y_crop_width - block_size + 1;
+ const int y_end = picture->y_crop_height - block_size + 1;
+
+ const int src_size = block_size >> 1;
+ const int quad_size = block_size >> 2;
+
+ uint32_t p[4];
+ const int length = sizeof(p);
+
+ int pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ p[0] = src_pic_block_hash[0][pos];
+ p[1] = src_pic_block_hash[0][pos + src_size];
+ p[2] = src_pic_block_hash[0][pos + src_size * pic_width];
+ p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size];
+ dst_pic_block_hash[0][pos] =
+ av1_get_crc_value(&x->crc_calculator1, (uint8_t *)p, length);
+
+ p[0] = src_pic_block_hash[1][pos];
+ p[1] = src_pic_block_hash[1][pos + src_size];
+ p[2] = src_pic_block_hash[1][pos + src_size * pic_width];
+ p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size];
+ dst_pic_block_hash[1][pos] =
+ av1_get_crc_value(&x->crc_calculator2, (uint8_t *)p, length);
+
+ dst_pic_block_same_info[0][pos] =
+ src_pic_block_same_info[0][pos] &&
+ src_pic_block_same_info[0][pos + quad_size] &&
+ src_pic_block_same_info[0][pos + src_size] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] &&
+ src_pic_block_same_info[0][pos + src_size * pic_width + src_size];
+
+ dst_pic_block_same_info[1][pos] =
+ src_pic_block_same_info[1][pos] &&
+ src_pic_block_same_info[1][pos + src_size] &&
+ src_pic_block_same_info[1][pos + quad_size * pic_width] &&
+ src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] &&
+ src_pic_block_same_info[1][pos + src_size * pic_width] &&
+ src_pic_block_same_info[1][pos + src_size * pic_width + src_size];
+ pos++;
+ }
+ pos += block_size - 1;
+ }
+
+ if (block_size >= 4) {
+ const int size_minus_1 = block_size - 1;
+ pos = 0;
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ dst_pic_block_same_info[2][pos] =
+ (!dst_pic_block_same_info[0][pos] &&
+ !dst_pic_block_same_info[1][pos]) ||
+ (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0));
+ pos++;
+ }
+ pos += block_size - 1;
+ }
+ }
+}
+
+void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+ uint32_t *pic_hash[2],
+ int8_t *pic_is_same,
+ int pic_width, int pic_height,
+ int block_size) {
+ const int x_end = pic_width - block_size + 1;
+ const int y_end = pic_height - block_size + 1;
+
+ const int8_t *src_is_added = pic_is_same;
+ const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] };
+
+ int add_value = hash_block_size_to_index(block_size);
+ assert(add_value >= 0);
+ add_value <<= crc_bits;
+ const int crc_mask = (1 << crc_bits) - 1;
+
+ for (int x_pos = 0; x_pos < x_end; x_pos++) {
+ for (int y_pos = 0; y_pos < y_end; y_pos++) {
+ const int pos = y_pos * pic_width + x_pos;
+ // valid data
+ if (src_is_added[pos]) {
+ block_hash curr_block_hash;
+ curr_block_hash.x = x_pos;
+ curr_block_hash.y = y_pos;
+
+ const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value;
+ curr_block_hash.hash_value2 = src_hash[1][pos];
+
+ hash_table_add_to_table(p_hash_table, hash_value1, &curr_block_hash);
+ }
+ }
+ }
+}
+
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start) {
+ const int stride = picture->y_stride;
+ const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
+
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p16[j] != p16[0]) {
+ return 0;
+ }
+ }
+ p16 += stride;
+ }
+ } else {
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p[j] != p[0]) {
+ return 0;
+ }
+ }
+ p += stride;
+ }
+ }
+
+ return 1;
+}
+
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start) {
+ const int stride = picture->y_stride;
+ const uint8_t *p = picture->y_buffer + y_start * stride + x_start;
+
+ if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const uint16_t *p16 = CONVERT_TO_SHORTPTR(p);
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p16[j * stride + i] != p16[i]) {
+ return 0;
+ }
+ }
+ }
+ } else {
+ for (int i = 0; i < block_size; i++) {
+ for (int j = 1; j < block_size; j++) {
+ if (p[j * stride + i] != p[i]) {
+ return 0;
+ }
+ }
+ }
+ }
+ return 1;
+}
+
+void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
+ uint32_t *hash_value1, uint32_t *hash_value2,
+ int use_highbitdepth, MACROBLOCK *x) {
+ uint32_t to_hash[4];
+ const int add_value = hash_block_size_to_index(block_size) << crc_bits;
+ assert(add_value >= 0);
+ const int crc_mask = (1 << crc_bits) - 1;
+
+  // Compute the 2x2 subblock hash values within the current block.
+ int sub_block_in_width = (block_size >> 1);
+ if (use_highbitdepth) {
+ uint16_t pixel_to_hash[4];
+ uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src);
+ for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+ for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+ int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+ get_pixels_in_1D_short_array_by_block_2x2(
+ y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
+ assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ x->hash_value_buffer[0][0][pos] =
+ av1_get_crc_value(&x->crc_calculator1, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ x->hash_value_buffer[1][0][pos] =
+ av1_get_crc_value(&x->crc_calculator2, (uint8_t *)pixel_to_hash,
+ sizeof(pixel_to_hash));
+ }
+ }
+ } else {
+ uint8_t pixel_to_hash[4];
+ for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+ for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+ int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+ get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
+ stride, pixel_to_hash);
+ assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ x->hash_value_buffer[0][0][pos] = av1_get_crc_value(
+ &x->crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
+ x->hash_value_buffer[1][0][pos] = av1_get_crc_value(
+ &x->crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+ }
+ }
+ }
+
+ int src_sub_block_in_width = sub_block_in_width;
+ sub_block_in_width >>= 1;
+
+ int src_idx = 1;
+ int dst_idx = 0;
+
+  // Combine subblock hash values, starting from the 4x4 level, up into the
+  // hash values for the current block.
+ for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) {
+ src_idx = 1 - src_idx;
+ dst_idx = 1 - dst_idx;
+
+ int dst_pos = 0;
+ for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) {
+ for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) {
+ int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1);
+
+ assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ assert(srcPos + src_sub_block_in_width + 1 <
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+ to_hash[0] = x->hash_value_buffer[0][src_idx][srcPos];
+ to_hash[1] = x->hash_value_buffer[0][src_idx][srcPos + 1];
+ to_hash[2] =
+ x->hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = x->hash_value_buffer[0][src_idx]
+ [srcPos + src_sub_block_in_width + 1];
+
+ x->hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
+ &x->crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
+
+ to_hash[0] = x->hash_value_buffer[1][src_idx][srcPos];
+ to_hash[1] = x->hash_value_buffer[1][src_idx][srcPos + 1];
+ to_hash[2] =
+ x->hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
+ to_hash[3] = x->hash_value_buffer[1][src_idx]
+ [srcPos + src_sub_block_in_width + 1];
+ x->hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
+ &x->crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
+ dst_pos++;
+ }
+ }
+
+ src_sub_block_in_width = sub_block_in_width;
+ sub_block_in_width >>= 1;
+ }
+
+ *hash_value1 = (x->hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
+ *hash_value2 = x->hash_value_buffer[1][dst_idx][0];
+}
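To make the hash layout used above concrete (this sketch is not part of the patch): hash_value1 keeps the 16-bit CRC in its low bits and the 3-bit block-size index just above them, so for a 16x16 block (index 2) the composition looks like the following.

    #include <stdint.h>

    /* Mirrors crc_bits and hash_block_size_to_index() in hash_motion.c. */
    static uint32_t compose_hash_value1_16x16(uint32_t crc) {
      const int crc_bits = 16;
      const uint32_t crc_mask = (1u << crc_bits) - 1; /* keep the low 16 bits */
      const uint32_t size_index = 2;                  /* 16x16 maps to 2 */
      return (crc & crc_mask) + (size_index << crc_bits);
    }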
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
new file mode 100644
index 000000000..df3ec3215
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "third_party/vector/vector.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Stores a block's hash info.
+// x and y give the block's position from the top-left of the picture;
+// hash_value2 stores the second hash value.
+typedef struct _block_hash {
+ int16_t x;
+ int16_t y;
+ uint32_t hash_value2;
+} block_hash;
+
+typedef struct _hash_table {
+ Vector **p_lookup_table;
+} hash_table;
+
+void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
+void av1_hash_table_destroy(hash_table *p_hash_table);
+void av1_hash_table_create(hash_table *p_hash_table);
+int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value);
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+ uint32_t hash_value);
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+ uint32_t hash_value2);
+void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
+ uint32_t *pic_block_hash[2],
+ int8_t *pic_block_same_info[3],
+ struct macroblock *x);
+void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
+ int block_size,
+ uint32_t *src_pic_block_hash[2],
+ uint32_t *dst_pic_block_hash[2],
+ int8_t *src_pic_block_same_info[3],
+ int8_t *dst_pic_block_same_info[3],
+ struct macroblock *x);
+void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+ uint32_t *pic_hash[2],
+ int8_t *pic_is_same,
+ int pic_width, int pic_height,
+ int block_size);
+
+// Check whether the block starting at (x_start, y_start), of size
+// block_size x block_size, has the same color in every row.
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start);
+// Check whether the block starting at (x_start, y_start), of size
+// block_size x block_size, has the same color in every column.
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+ int block_size, int x_start, int y_start);
+void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
+ uint32_t *hash_value1, uint32_t *hash_value2,
+ int use_highbitdepth, struct macroblock *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_MOTION_H_
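As a hedged sketch of how a caller might use the lookup API declared above (not part of the patch; the wrapper name is made up): count the bucket addressed by hash_value1, then confirm the second hash before treating a stored block as a match.

    #include <stdint.h>
    #include "av1/encoder/hash_motion.h"

    /* Returns 1 if any stored block shares both hash values. */
    static int has_hash_candidate(hash_table *table, uint32_t hash_value1,
                                  uint32_t hash_value2) {
      if (av1_hash_table_count(table, hash_value1) == 0) return 0;
      return av1_has_exact_match(table, hash_value1, hash_value2) ? 1 : 0;
    }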
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 000000000..67898fd18
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/idct.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+ pixel. */
+void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
+
+ a1 += b1;
+ d1 = d1 - c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)a1;
+ op[4] = (tran_low_t)c1;
+ op[8] = (tran_low_t)d1;
+ op[12] = (tran_low_t)b1;
+
+ ip_pass0++;
+ op++;
+ }
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0];
+ b1 = ip[1];
+ c1 = ip[2];
+ d1 = ip[3];
+
+ a1 += b1;
+ d1 -= c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ av1_fwht4x4_c(input, output, stride);
+}
+
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ if (txfm_param->lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
+ return;
+ }
+ switch (tx_type) {
+ // use the c version for anything including identity for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_4x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
+}
+
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_4x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_8x4_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+ // use the c version for anything including identity for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
+}
+
+static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+ // use the c version for anything including identity for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
+}
+
+static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_16x32_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_32x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_16x4_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_4x16_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_32x8_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ av1_fwd_txfm2d_8x32_c(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
+
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+ // use the c version for anything including identity for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_8x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
+}
+
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+ // use the c version for anything including identity for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_16x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
+}
+
+static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int bd = txfm_param->bd;
+ switch (tx_type) {
+ // use the c version for anything including identity for now
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_fwd_txfm2d_32x32_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ default:
+ av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd);
+ break;
+ }
+}
+
+static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_32x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x32_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_16x64_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x16_c(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(txfm_param->tx_type == DCT_DCT);
+ int32_t *dst_coeff = (int32_t *)coeff;
+ const int bd = txfm_param->bd;
+ av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd);
+}
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ TxfmParam *txfm_param) {
+ if (txfm_param->bd == 8)
+ av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+ else
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {
+ case TX_64X64:
+ highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X64:
+ highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_64X32:
+ highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X64:
+ highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_64X16:
+ highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X32:
+ highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X16:
+ highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X8:
+ highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_4X8:
+ highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X4:
+ highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X16:
+ highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X8:
+ highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X32:
+ highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X16:
+ highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_4X4:
+ highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_4X16:
+ highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_16X4:
+ highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_8X32:
+ highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ case TX_32X8:
+ highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
+ break;
+ default: assert(0); break;
+ }
+}
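A minimal sketch of driving the dispatch above (not part of the patch): the caller fills a TxfmParam and av1_fwd_txfm routes to the low- or high-bit-depth path based on bd. It assumes the common transform headers defining TxfmParam, tran_low_t, DCT_DCT and TX_8X8 are already included (hybrid_fwd_txfm.h alone does not pull them in), and that the zeroed tx_set_type is a valid DCT-only set for the assert in av1_highbd_fwd_txfm.

    #include <stdint.h>
    #include <string.h>
    #include "av1/encoder/hybrid_fwd_txfm.h"

    /* Assumes TxfmParam, tran_low_t, DCT_DCT and TX_8X8 are in scope via the
     * common transform headers included before this one. */
    static void fwd_txfm_8x8_example(const int16_t *src_diff /* 8x8 residual */,
                                     tran_low_t *coeff /* 64 coefficients */) {
      TxfmParam param;
      memset(&param, 0, sizeof(param));  /* zeroed tx_set_type assumed valid */
      param.tx_type = DCT_DCT;
      param.tx_size = TX_8X8;
      param.bd = 8;                      /* 8-bit: av1_lowbd_fwd_txfm path */
      param.lossless = 0;
      av1_fwd_txfm(src_diff, coeff, /*diff_stride=*/8, &param);
    }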
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 000000000..daabc7119
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ TxfmParam *txfm_param);
+
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
new file mode 100644
index 000000000..9e526b88b
--- /dev/null
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+static int RENAME(calc_dist)(const int *p1, const int *p2) {
+ int dist = 0;
+ for (int i = 0; i < AV1_K_MEANS_DIM; ++i) {
+ const int diff = p1[i] - p2[i];
+ dist += diff * diff;
+ }
+ return dist;
+}
+
+void RENAME(av1_calc_indices)(const int *data, const int *centroids,
+ uint8_t *indices, int n, int k) {
+ for (int i = 0; i < n; ++i) {
+ int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
+ indices[i] = 0;
+ for (int j = 1; j < k; ++j) {
+ const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+ centroids + j * AV1_K_MEANS_DIM);
+ if (this_dist < min_dist) {
+ min_dist = this_dist;
+ indices[i] = j;
+ }
+ }
+ }
+}
+
+static void RENAME(calc_centroids)(const int *data, int *centroids,
+ const uint8_t *indices, int n, int k) {
+ int i, j;
+ int count[PALETTE_MAX_SIZE] = { 0 };
+ unsigned int rand_state = (unsigned int)data[0];
+ assert(n <= 32768);
+ memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+
+ for (i = 0; i < n; ++i) {
+ const int index = indices[i];
+ assert(index < k);
+ ++count[index];
+ for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+ centroids[index * AV1_K_MEANS_DIM + j] += data[i * AV1_K_MEANS_DIM + j];
+ }
+ }
+
+ for (i = 0; i < k; ++i) {
+ if (count[i] == 0) {
+ memcpy(centroids + i * AV1_K_MEANS_DIM,
+ data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM,
+ sizeof(centroids[0]) * AV1_K_MEANS_DIM);
+ } else {
+ for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+ centroids[i * AV1_K_MEANS_DIM + j] =
+ DIVIDE_AND_ROUND(centroids[i * AV1_K_MEANS_DIM + j], count[i]);
+ }
+ }
+ }
+}
+
+static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids,
+ const uint8_t *indices, int n, int k) {
+ int64_t dist = 0;
+ (void)k;
+ for (int i = 0; i < n; ++i) {
+ dist += RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+ centroids + indices[i] * AV1_K_MEANS_DIM);
+ }
+ return dist;
+}
+
+void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
+ int n, int k, int max_itr) {
+ int pre_centroids[2 * PALETTE_MAX_SIZE];
+ uint8_t pre_indices[MAX_SB_SQUARE];
+
+ RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+ int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+ for (int i = 0; i < max_itr; ++i) {
+ const int64_t pre_dist = this_dist;
+ memcpy(pre_centroids, centroids,
+ sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
+ memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+
+ RENAME(calc_centroids)(data, centroids, indices, n, k);
+ RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+ this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+ if (this_dist > pre_dist) {
+ memcpy(centroids, pre_centroids,
+ sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
+ memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+ break;
+ }
+ if (!memcmp(centroids, pre_centroids,
+ sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM))
+ break;
+ }
+}
+#undef RENAME_
+#undef RENAME
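The file above is a textual template keyed on AV1_K_MEANS_DIM, with the public names produced by AV1_K_MEANS_RENAME (expected to come from av1/encoder/palette.h, which the template includes). A hedged sketch of how a translation unit would typically instantiate the 1-D and 2-D variants, without spelling out the generated names:

    /* Instantiation sketch only; the emitted symbol names depend on how
     * palette.h defines AV1_K_MEANS_RENAME. */
    #define AV1_K_MEANS_DIM 1
    #include "av1/encoder/k_means_template.h" /* 1-D variant (single channel) */
    #undef AV1_K_MEANS_DIM

    #define AV1_K_MEANS_DIM 2
    #include "av1/encoder/k_means_template.h" /* 2-D variant (channel pairs) */
    #undef AV1_K_MEANS_DIM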
diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c
new file mode 100644
index 000000000..1bf8ecbac
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+ int index = *idx;
+ struct lookahead_entry *buf = ctx->buf + index;
+
+ assert(index < ctx->max_sz);
+ if (++index >= ctx->max_sz) index -= ctx->max_sz;
+ *idx = index;
+ return buf;
+}
+
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+ if (ctx) {
+ if (ctx->buf) {
+ int i;
+
+ for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img);
+ free(ctx->buf);
+ }
+ free(ctx);
+ }
+}
+
+struct lookahead_ctx *av1_lookahead_init(
+ unsigned int width, unsigned int height, unsigned int subsampling_x,
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth) {
+ struct lookahead_ctx *ctx = NULL;
+
+ // Clamp the lookahead queue depth
+ depth = clamp(depth, 1, MAX_LAG_BUFFERS);
+
+ // Allocate memory to keep previous source frames available.
+ depth += MAX_PRE_FRAMES;
+
+ // Allocate the lookahead structures
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx) {
+ const int legacy_byte_alignment = 0;
+ unsigned int i;
+ ctx->max_sz = depth;
+ ctx->buf = calloc(depth, sizeof(*ctx->buf));
+ if (!ctx->buf) goto bail;
+ for (i = 0; i < depth; i++)
+ if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x,
+ subsampling_y, use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, legacy_byte_alignment))
+ goto bail;
+ }
+ return ctx;
+bail:
+ av1_lookahead_destroy(ctx);
+ return NULL;
+}
+
+#define USE_PARTIAL_COPY 0
+
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ aom_enc_frame_flags_t flags) {
+ struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
+ int row, col, active_end;
+ int mb_rows = (src->y_height + 15) >> 4;
+ int mb_cols = (src->y_width + 15) >> 4;
+#endif
+ int width = src->y_crop_width;
+ int height = src->y_crop_height;
+ int uv_width = src->uv_crop_width;
+ int uv_height = src->uv_crop_height;
+ int subsampling_x = src->subsampling_x;
+ int subsampling_y = src->subsampling_y;
+ int larger_dimensions, new_dimensions;
+
+ if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1;
+ ctx->sz++;
+ buf = pop(ctx, &ctx->write_idx);
+
+ new_dimensions = width != buf->img.y_crop_width ||
+ height != buf->img.y_crop_height ||
+ uv_width != buf->img.uv_crop_width ||
+ uv_height != buf->img.uv_crop_height;
+ larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+ uv_width > buf->img.uv_width ||
+ uv_height > buf->img.uv_height;
+ assert(!larger_dimensions || new_dimensions);
+
+#if USE_PARTIAL_COPY
+ // TODO(jkoleszar): This is disabled for now, as
+ // av1_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+
+ // Only do this partial copy if the following conditions are all met:
+  // 1. Lookahead queue has a size of 1.
+ // 2. Active map is provided.
+ // 3. This is not a key frame, golden nor altref frame.
+ if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
+ for (row = 0; row < mb_rows; ++row) {
+ col = 0;
+
+ while (1) {
+ // Find the first active macroblock in this row.
+ for (; col < mb_cols; ++col) {
+ if (active_map[col]) break;
+ }
+
+        // No more active macroblocks in this row.
+ if (col == mb_cols) break;
+
+ // Find the end of active region in this row.
+ active_end = col;
+
+ for (; active_end < mb_cols; ++active_end) {
+ if (!active_map[active_end]) break;
+ }
+
+ // Only copy this active region.
+ av1_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
+ 16, (active_end - col) << 4);
+
+ // Start again from the end of this active region.
+ col = active_end;
+ }
+
+ active_map += mb_cols;
+ }
+ } else {
+#endif
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+ subsampling_y, use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, 0))
+ return 1;
+ aom_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
+ }
+ // Partial copy not implemented yet
+ av1_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
+ }
+#endif
+
+ buf->ts_start = ts_start;
+ buf->ts_end = ts_end;
+ buf->flags = flags;
+ return 0;
+}
+
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain) {
+ struct lookahead_entry *buf = NULL;
+
+ if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+ buf = pop(ctx, &ctx->read_idx);
+ ctx->sz--;
+ }
+ return buf;
+}
+
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
+ int index) {
+ struct lookahead_entry *buf = NULL;
+
+ if (index >= 0) {
+ // Forward peek
+ if (index < ctx->sz) {
+ index += ctx->read_idx;
+ if (index >= ctx->max_sz) index -= ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ } else if (index < 0) {
+ // Backward peek
+ if (-index <= MAX_PRE_FRAMES) {
+ index += (int)(ctx->read_idx);
+ if (index < 0) index += (int)(ctx->max_sz);
+ buf = ctx->buf + index;
+ }
+ }
+
+ return buf;
+}
+
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; }
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
new file mode 100644
index 000000000..e55224cf7
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 25
+
+struct lookahead_entry {
+ YV12_BUFFER_CONFIG img;
+ int64_t ts_start;
+ int64_t ts_end;
+ aom_enc_frame_flags_t flags;
+};
+
+// The maximum number of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+struct lookahead_ctx {
+ int max_sz; /* Absolute size of the queue */
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index */
+ int write_idx; /* Write index */
+ struct lookahead_entry *buf; /* Buffer list */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *av1_lookahead_init(
+ unsigned int width, unsigned int height, unsigned int subsampling_x,
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth);
+
+/**\brief Destroys the lookahead stage
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * If an active map is in use and there is only one frame in the queue, only
+ * the active macroblocks are copied (that partial-copy path is currently
+ * compiled out in lookahead.c).
+ *
+ * \param[in] ctx               Pointer to the lookahead context
+ * \param[in] src               Pointer to the image to enqueue
+ * \param[in] ts_start          Timestamp for the start of this frame
+ * \param[in] ts_end            Timestamp for the end of this frame
+ * \param[in] use_highbitdepth  Nonzero if the source frame is high bitdepth
+ * \param[in] flags             Flags set on this frame
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain);
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
+ int index);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_
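The header above exposes the lookahead as a small fixed-depth ring buffer of copied source frames. A hedged sketch of the intended call sequence (the resolution, depth and timestamp values are illustrative placeholders, not values taken from the encoder):

#include "av1/encoder/lookahead.h"

static void lookahead_example(YV12_BUFFER_CONFIG *raw, int64_t pts) {
  struct lookahead_ctx *la =
      av1_lookahead_init(1280, 720, /*subsampling_x=*/1, /*subsampling_y=*/1,
                         /*use_highbitdepth=*/0, /*depth=*/16);
  if (!la) return;

  /* Enqueue: copies *raw into an internal buffer with the encoder's
   * stride/border; returns nonzero when the queue is full. */
  if (!av1_lookahead_push(la, raw, pts, pts + 1, /*use_highbitdepth=*/0, 0)) {
    /* Peek at the frame that would be encoded next without removing it. */
    struct lookahead_entry *next = av1_lookahead_peek(la, 0);
    (void)next;

    /* Dequeue once enough frames are buffered, or unconditionally with
     * drain != 0 (e.g. at end of stream). */
    struct lookahead_entry *to_encode = av1_lookahead_pop(la, /*drain=*/1);
    (void)to_encode;
  }
  av1_lookahead_destroy(la);
}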
diff --git a/third_party/aom/av1/encoder/mathutils.h b/third_party/aom/av1/encoder/mathutils.h
new file mode 100644
index 000000000..64f936176
--- /dev/null
+++ b/third_party/aom/av1/encoder/mathutils.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
+#define AOM_AV1_ENCODER_MATHUTILS_H_
+
+#include <memory.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+static const double TINY_NEAR_ZERO = 1.0E-16;
+
+// Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn
+static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) {
+ int i, j, k;
+ double c;
+ // Forward elimination
+ for (k = 0; k < n - 1; k++) {
+ // Bring the largest magnitude to the diagonal position
+ for (i = n - 1; i > k; i--) {
+ if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
+ for (j = 0; j < n; j++) {
+ c = A[i * stride + j];
+ A[i * stride + j] = A[(i - 1) * stride + j];
+ A[(i - 1) * stride + j] = c;
+ }
+ c = b[i];
+ b[i] = b[i - 1];
+ b[i - 1] = c;
+ }
+ }
+ for (i = k; i < n - 1; i++) {
+ if (fabs(A[k * stride + k]) < TINY_NEAR_ZERO) return 0;
+ c = A[(i + 1) * stride + k] / A[k * stride + k];
+ for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
+ b[i + 1] -= c * b[k];
+ }
+ }
+ // Backward substitution
+ for (i = n - 1; i >= 0; i--) {
+ if (fabs(A[i * stride + i]) < TINY_NEAR_ZERO) return 0;
+ c = 0;
+ for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
+ x[i] = (b[i] - c) / A[i * stride + i];
+ }
+
+ return 1;
+}
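/* Hedged usage check for linsolve() above, with illustrative values: it
 * performs Gaussian elimination with partial pivoting and overwrites both
 * A and b, so callers must keep copies if they still need them. */
static int linsolve_example(void) {
  double A[4] = { 2.0, 1.0,  /* row-major 2x2, stride == n == 2 */
                  1.0, 3.0 };
  double b[2] = { 3.0, 5.0 };
  double x[2];
  if (!linsolve(2, A, 2, b, x)) return 0;  /* 0 means (near-)singular */
  /* The system 2x + y = 3, x + 3y = 5 has the solution x = 0.8, y = 1.4,
   * so expect x[0] ~= 0.8 and x[1] ~= 1.4 here. */
  return 1;
}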
+
+////////////////////////////////////////////////////////////////////////////////
+// Least-squares
+// Solves for n-dim x in a least squares sense to minimize |Ax - b|^2
+// The solution is simply x = (A'A)^-1 A'b or simply the solution for
+// the system: A'A x = A'b
+static INLINE int least_squares(int n, double *A, int rows, int stride,
+ double *b, double *scratch, double *x) {
+ int i, j, k;
+ double *scratch_ = NULL;
+ double *AtA, *Atb;
+ if (!scratch) {
+ scratch_ = (double *)aom_malloc(sizeof(*scratch) * n * (n + 1));
+ scratch = scratch_;
+ }
+ AtA = scratch;
+ Atb = scratch + n * n;
+
+ for (i = 0; i < n; ++i) {
+ for (j = i; j < n; ++j) {
+ AtA[i * n + j] = 0.0;
+ for (k = 0; k < rows; ++k)
+ AtA[i * n + j] += A[k * stride + i] * A[k * stride + j];
+ AtA[j * n + i] = AtA[i * n + j];
+ }
+ Atb[i] = 0;
+ for (k = 0; k < rows; ++k) Atb[i] += A[k * stride + i] * b[k];
+ }
+ int ret = linsolve(n, AtA, n, Atb, x);
+ if (scratch_) aom_free(scratch_);
+ return ret;
+}
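/* Hedged sketch for least_squares() above, with illustrative values: fit
 * y = a*x + c to the points (0,1), (1,3), (2,5); the exact fit is a = 2,
 * c = 1.  A caller-provided scratch buffer of n*(n+1) doubles avoids the
 * internal aom_malloc() path. */
static int least_squares_example(void) {
  double A[6] = { 0.0, 1.0,  /* each row is { x_i, 1 }, stride == 2 */
                  1.0, 1.0,
                  2.0, 1.0 };
  double b[3] = { 1.0, 3.0, 5.0 };
  double scratch[2 * (2 + 1)];
  double x[2];
  if (!least_squares(/*n=*/2, A, /*rows=*/3, /*stride=*/2, b, scratch, x))
    return 0;
  /* Expect x[0] ~= 2.0 (slope) and x[1] ~= 1.0 (intercept). */
  return 1;
}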
+
+// Matrix multiply
+static INLINE void multiply_mat(const double *m1, const double *m2, double *res,
+ const int m1_rows, const int inner_dim,
+ const int m2_cols) {
+ double sum;
+
+ int row, col, inner;
+ for (row = 0; row < m1_rows; ++row) {
+ for (col = 0; col < m2_cols; ++col) {
+ sum = 0;
+ for (inner = 0; inner < inner_dim; ++inner)
+ sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col];
+ *(res++) = sum;
+ }
+ }
+}
+
+//
+// The functions below are needed only for homography computation
+// Remove if the homography models are not used.
+//
+///////////////////////////////////////////////////////////////////////////////
+// svdcmp
+// Adapted from Numerical Recipes in C
+
+static INLINE double sign(double a, double b) {
+ return ((b) >= 0 ? fabs(a) : -fabs(a));
+}
+
+static INLINE double pythag(double a, double b) {
+ double ct;
+ const double absa = fabs(a);
+ const double absb = fabs(b);
+
+ if (absa > absb) {
+ ct = absb / absa;
+ return absa * sqrt(1.0 + ct * ct);
+ } else {
+ ct = absa / absb;
+ return (absb == 0) ? 0 : absb * sqrt(1.0 + ct * ct);
+ }
+}
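/* Hedged note: pythag() returns sqrt(a*a + b*b) but factors out the larger
 * magnitude first, so the intermediate square cannot overflow or underflow;
 * e.g. pythag(3, 4) is evaluated as 4 * sqrt(1 + (3/4)^2) == 5. */
static void pythag_example(void) {
  assert(fabs(pythag(3.0, 4.0) - 5.0) < 1e-12);  /* illustrative check */
}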
+
+static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
+ const int max_its = 30;
+ int flag, i, its, j, jj, k, l, nm;
+ double anorm, c, f, g, h, s, scale, x, y, z;
+ double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
+ g = scale = anorm = 0.0;
+ for (i = 0; i < n; i++) {
+ l = i + 1;
+ rv1[i] = scale * g;
+ g = s = scale = 0.0;
+ if (i < m) {
+ for (k = i; k < m; k++) scale += fabs(u[k][i]);
+ if (scale != 0.) {
+ for (k = i; k < m; k++) {
+ u[k][i] /= scale;
+ s += u[k][i] * u[k][i];
+ }
+ f = u[i][i];
+ g = -sign(sqrt(s), f);
+ h = f * g - s;
+ u[i][i] = f - g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
+ f = s / h;
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+ }
+ for (k = i; k < m; k++) u[k][i] *= scale;
+ }
+ }
+ w[i] = scale * g;
+ g = s = scale = 0.0;
+ if (i < m && i != n - 1) {
+ for (k = l; k < n; k++) scale += fabs(u[i][k]);
+ if (scale != 0.) {
+ for (k = l; k < n; k++) {
+ u[i][k] /= scale;
+ s += u[i][k] * u[i][k];
+ }
+ f = u[i][l];
+ g = -sign(sqrt(s), f);
+ h = f * g - s;
+ u[i][l] = f - g;
+ for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+ for (j = l; j < m; j++) {
+ for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
+ for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+ }
+ for (k = l; k < n; k++) u[i][k] *= scale;
+ }
+ }
+ anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
+ }
+
+ for (i = n - 1; i >= 0; i--) {
+ if (i < n - 1) {
+ if (g != 0.) {
+ for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
+ for (k = l; k < n; k++) v[k][j] += s * v[k][i];
+ }
+ }
+ for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
+ }
+ v[i][i] = 1.0;
+ g = rv1[i];
+ l = i;
+ }
+ for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
+ l = i + 1;
+ g = w[i];
+ for (j = l; j < n; j++) u[i][j] = 0.0;
+ if (g != 0.) {
+ g = 1.0 / g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
+ f = (s / u[i][i]) * g;
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+ }
+ for (j = i; j < m; j++) u[j][i] *= g;
+ } else {
+ for (j = i; j < m; j++) u[j][i] = 0.0;
+ }
+ ++u[i][i];
+ }
+ for (k = n - 1; k >= 0; k--) {
+ for (its = 0; its < max_its; its++) {
+ flag = 1;
+ for (l = k; l >= 0; l--) {
+ nm = l - 1;
+ if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) {
+ flag = 0;
+ break;
+ }
+ if ((double)(fabs(w[nm]) + anorm) == anorm) break;
+ }
+ if (flag) {
+ c = 0.0;
+ s = 1.0;
+ for (i = l; i <= k; i++) {
+ f = s * rv1[i];
+ rv1[i] = c * rv1[i];
+ if ((double)(fabs(f) + anorm) == anorm) break;
+ g = w[i];
+ h = pythag(f, g);
+ w[i] = h;
+ h = 1.0 / h;
+ c = g * h;
+ s = -f * h;
+ for (j = 0; j < m; j++) {
+ y = u[j][nm];
+ z = u[j][i];
+ u[j][nm] = y * c + z * s;
+ u[j][i] = z * c - y * s;
+ }
+ }
+ }
+ z = w[k];
+ if (l == k) {
+ if (z < 0.0) {
+ w[k] = -z;
+ for (j = 0; j < n; j++) v[j][k] = -v[j][k];
+ }
+ break;
+ }
+ if (its == max_its - 1) {
+ aom_free(rv1);
+ return 1;
+ }
+ assert(k > 0);
+ x = w[l];
+ nm = k - 1;
+ y = w[nm];
+ g = rv1[nm];
+ h = rv1[k];
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+ g = pythag(f, 1.0);
+ f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x;
+ c = s = 1.0;
+ for (j = l; j <= nm; j++) {
+ i = j + 1;
+ g = rv1[i];
+ y = w[i];
+ h = s * g;
+ g = c * g;
+ z = pythag(f, h);
+ rv1[j] = z;
+ c = f / z;
+ s = h / z;
+ f = x * c + g * s;
+ g = g * c - x * s;
+ h = y * s;
+ y *= c;
+ for (jj = 0; jj < n; jj++) {
+ x = v[jj][j];
+ z = v[jj][i];
+ v[jj][j] = x * c + z * s;
+ v[jj][i] = z * c - x * s;
+ }
+ z = pythag(f, h);
+ w[j] = z;
+ if (z != 0.) {
+ z = 1.0 / z;
+ c = f * z;
+ s = h * z;
+ }
+ f = c * g + s * y;
+ x = c * y - s * g;
+ for (jj = 0; jj < m; jj++) {
+ y = u[jj][j];
+ z = u[jj][i];
+ u[jj][j] = y * c + z * s;
+ u[jj][i] = z * c - y * s;
+ }
+ }
+ rv1[l] = 0.0;
+ rv1[k] = f;
+ w[k] = x;
+ }
+ }
+ aom_free(rv1);
+ return 0;
+}
+
+static INLINE int SVD(double *U, double *W, double *V, double *matx, int M,
+ int N) {
+ // Assumes allocation for U is MxN
+ double **nrU = (double **)aom_malloc((M) * sizeof(*nrU));
+ double **nrV = (double **)aom_malloc((N) * sizeof(*nrV));
+ int problem, i;
+
+ problem = !(nrU && nrV);
+ if (!problem) {
+ for (i = 0; i < M; i++) {
+ nrU[i] = &U[i * N];
+ }
+ for (i = 0; i < N; i++) {
+ nrV[i] = &V[i * N];
+ }
+ } else {
+ if (nrU) aom_free(nrU);
+ if (nrV) aom_free(nrV);
+ return 1;
+ }
+
+ /* copy from given matx into nrU */
+ for (i = 0; i < M; i++) {
+ memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx));
+ }
+
+ /* HERE IT IS: do SVD */
+ if (svdcmp(nrU, M, N, W, nrV)) {
+ aom_free(nrU);
+ aom_free(nrV);
+ return 1;
+ }
+
+ /* aom_free Numerical Recipes arrays */
+ aom_free(nrU);
+ aom_free(nrV);
+
+ return 0;
+}
+
+#endif // AOM_AV1_ENCODER_MATHUTILS_H_
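SVD() above wraps svdcmp() (the Numerical Recipes routine): it factors an M x N matrix into U * diag(W) * V^T, writing U over the caller-supplied M x N buffer and returning nonzero on failure. A hedged sketch on a 2x2 diagonal matrix (values are illustrative):

#include "av1/encoder/mathutils.h"

static int svd_example(void) {
  double matx[4] = { 3.0, 0.0,
                     0.0, 2.0 };  /* M = N = 2, row-major */
  double U[4], W[2], V[4];
  if (SVD(U, W, V, matx, 2, 2)) return 0;  /* nonzero return means failure */
  /* W should now hold the singular values 3 and 2; svdcmp() does not sort
   * them, so callers must not assume descending order. */
  return 1;
}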
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
new file mode 100644
index 000000000..1a35ff77c
--- /dev/null
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/blockd.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+
+static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
+ int mb_row, int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ const aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ MV ref_full;
+ int cost_list[5];
+
+ // Further step/diamond searches as necessary
+ int step_param = mv_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ av1_set_mv_search_range(&x->mv_limits, ref_mv);
+
+ ref_full.col = ref_mv->col >> 3;
+ ref_full.row = ref_mv->row >> 3;
+
+ /*cpi->sf.search_method == HEX*/
+ av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+ cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
+
+ // Try sub-pixel MC
+ // if (bestsme > error_thresh && bestsme < INT_MAX)
+ if (cpi->common.cur_frame_force_integer_mv == 1) {
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+ } else {
+ int distortion;
+ unsigned int sse;
+ cpi->find_fractional_mv_step(
+ x, &cpi->common, mb_row, mb_col, ref_mv,
+ cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0,
+ mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL,
+ NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0);
+ }
+
+ if (has_second_ref(xd->mi[0]))
+ xd->mi[0]->mode = NEW_NEWMV;
+ else
+ xd->mi[0]->mode = NEWMV;
+
+ xd->mi[0]->mv[0] = x->best_mv;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+
+ av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL,
+ BLOCK_16X16);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ return aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+}
+
+static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row,
+ int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int err, tmp_err;
+ MV best_mv;
+
+ // Try zero MV first
+ // FIXME should really use something like near/nearest MV and/or MV prediction
+ err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+ best_mv.col = best_mv.row = 0;
+
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search
+ tmp_err = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
+ if (tmp_err < err) {
+ err = tmp_err;
+ best_mv = x->best_mv.as_mv;
+ }
+
+ // If the current best reference mv is not centered on 0,0 then do a 0,0
+ // based search as well.
+ if (ref_mv->row != 0 || ref_mv->col != 0) {
+ MV zero_ref_mv = kZeroMv;
+
+ tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col);
+ if (tmp_err < err) {
+ err = tmp_err;
+ best_mv = x->best_mv.as_mv;
+ }
+ }
+
+ x->best_mv.as_mv = best_mv;
+ return err;
+}
+
+static int do_16x16_zerozero_search(AV1_COMP *cpi, int_mv *dst_mv) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int err;
+
+ // Try zero MV first
+ // FIXME should really use something like near/nearest MV and/or MV prediction
+ err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+
+ dst_mv->as_int = 0;
+
+ return err;
+}
+static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ PREDICTION_MODE best_mode = -1, mode;
+ unsigned int best_err = INT_MAX;
+
+ // calculate SATD for each intra prediction mode;
+ // we're intentionally not doing 4x4, we just want a rough estimate
+ for (mode = INTRA_MODE_START; mode < INTRA_MODE_END; mode++) {
+ unsigned int err;
+
+ xd->mi[0]->mode = mode;
+ av1_predict_intra_block(cm, xd, 16, 16, TX_16X16, mode, 0, 0,
+ FILTER_INTRA_MODES, x->plane[0].src.buf,
+ x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, 0, 0, 0);
+ err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+
+ // find best
+ if (err < best_err) {
+ best_err = err;
+ best_mode = mode;
+ }
+ }
+
+ if (pbest_mode) *pbest_mode = best_mode;
+
+ return best_err;
+}
+
+static void update_mbgraph_mb_stats(AV1_COMP *cpi, MBGRAPH_MB_STATS *stats,
+ YV12_BUFFER_CONFIG *buf, int mb_y_offset,
+ YV12_BUFFER_CONFIG *golden_ref,
+ const MV *prev_golden_ref_mv,
+ YV12_BUFFER_CONFIG *alt_ref, int mb_row,
+ int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int intra_error;
+ AV1_COMMON *cm = &cpi->common;
+
+ // FIXME in practice we're completely ignoring chroma here
+ x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+ x->plane[0].src.stride = buf->y_stride;
+
+ xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+ xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
+
+ // do intra 16x16 prediction
+ intra_error = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode);
+ if (intra_error <= 0) intra_error = 1;
+ stats->ref[INTRA_FRAME].err = intra_error;
+
+ // Golden frame MV search, if it exists and is different than last frame
+ if (golden_ref) {
+ int g_motion_error;
+ xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+ xd->plane[0].pre[0].stride = golden_ref->y_stride;
+ g_motion_error =
+ do_16x16_motion_search(cpi, prev_golden_ref_mv, mb_row, mb_col);
+ stats->ref[GOLDEN_FRAME].m.mv = x->best_mv;
+ stats->ref[GOLDEN_FRAME].err = g_motion_error;
+ } else {
+ stats->ref[GOLDEN_FRAME].err = INT_MAX;
+ stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+ }
+
+ // Do an Alt-ref frame MV search, if it exists and is different than
+ // last/golden frame.
+ if (alt_ref) {
+ int a_motion_error;
+ xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+ xd->plane[0].pre[0].stride = alt_ref->y_stride;
+ a_motion_error =
+ do_16x16_zerozero_search(cpi, &stats->ref[ALTREF_FRAME].m.mv);
+
+ stats->ref[ALTREF_FRAME].err = a_motion_error;
+ } else {
+ stats->ref[ALTREF_FRAME].err = INT_MAX;
+ stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+ }
+}
+
+static void update_mbgraph_frame_stats(AV1_COMP *cpi,
+ MBGRAPH_FRAME_STATS *stats,
+ YV12_BUFFER_CONFIG *buf,
+ YV12_BUFFER_CONFIG *golden_ref,
+ YV12_BUFFER_CONFIG *alt_ref) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ AV1_COMMON *const cm = &cpi->common;
+
+ int mb_col, mb_row, offset = 0;
+ int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+ MV gld_top_mv = kZeroMv;
+ MB_MODE_INFO mi_local;
+
+ av1_zero(mi_local);
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
+ x->mv_limits.row_min = -BORDER_MV_PIXELS_B16;
+ x->mv_limits.row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
+ xd->up_available = 0;
+ xd->plane[0].dst.stride = buf->y_stride;
+ xd->plane[0].pre[0].stride = buf->y_stride;
+ xd->plane[1].dst.stride = buf->uv_stride;
+ xd->mi[0] = &mi_local;
+ mi_local.sb_type = BLOCK_16X16;
+ mi_local.ref_frame[0] = LAST_FRAME;
+ mi_local.ref_frame[1] = NONE_FRAME;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ MV gld_left_mv = gld_top_mv;
+ int mb_y_in_offset = mb_y_offset;
+ int arf_y_in_offset = arf_y_offset;
+ int gld_y_in_offset = gld_y_offset;
+
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
+ x->mv_limits.col_min = -BORDER_MV_PIXELS_B16;
+ x->mv_limits.col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
+ xd->left_available = 0;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+ update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref,
+ &gld_left_mv, alt_ref, mb_row, mb_col);
+ gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
+ if (mb_col == 0) {
+ gld_top_mv = gld_left_mv;
+ }
+ xd->left_available = 1;
+ mb_y_in_offset += 16;
+ gld_y_in_offset += 16;
+ arf_y_in_offset += 16;
+ x->mv_limits.col_min -= 16;
+ x->mv_limits.col_max -= 16;
+ }
+ xd->up_available = 1;
+ mb_y_offset += buf->y_stride * 16;
+ gld_y_offset += golden_ref->y_stride * 16;
+ if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
+ x->mv_limits.row_min -= 16;
+ x->mv_limits.row_max -= 16;
+ offset += cm->mb_cols;
+ }
+}
+
+// void separate_arf_mbs_byzz
+static void separate_arf_mbs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int mb_col, mb_row, offset, i;
+ int mi_row, mi_col;
+ int ncnt[4] = { 0 };
+ int n_frames = cpi->mbgraph_n_frames;
+
+ int *arf_not_zz;
+
+ CHECK_MEM_ERROR(
+ cm, arf_not_zz,
+ aom_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+
+ // We are not interested in results beyond the alt ref itself.
+ if (n_frames > cpi->rc.frames_till_gf_update_due)
+ n_frames = cpi->rc.frames_till_gf_update_due;
+
+ // defer cost to reference frames
+ for (i = n_frames - 1; i >= 0; i--) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+ for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+ offset += cm->mb_cols, mb_row++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
+
+ int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+ int intra_err = mb_stats->ref[INTRA_FRAME].err;
+ int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+        // Flag the MB if the alt-ref zero-mv error is large or worse than
+        // the intra or golden error.
+ if (altref_err > 1000 || altref_err > intra_err ||
+ altref_err > golden_err) {
+ arf_not_zz[offset + mb_col]++;
+ }
+ }
+ }
+ }
+
+  // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid
+  // out-of-bounds access in segmentation_map
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ // If any of the blocks in the sequence failed then the MB
+ // goes in segment 0
+ if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
+ ncnt[0]++;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
+ } else {
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
+ ncnt[1]++;
+ }
+ }
+ }
+
+  // Only bother with segmentation if over 10% of the MBs are in the static
+  // segment.
+ // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+ if (1) {
+ // Note % of blocks that are marked as static
+ if (cm->MBs)
+ cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+
+ // This error case should not be reachable as this function should
+ // never be called with the common data structure uninitialized.
+ else
+ cpi->static_mb_pct = 0;
+
+ av1_enable_segmentation(&cm->seg);
+ } else {
+ cpi->static_mb_pct = 0;
+ av1_disable_segmentation(&cm->seg);
+ }
+
+  // Free locally allocated storage
+ aom_free(arf_not_zz);
+}
+
+void av1_update_mbgraph_stats(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i, n_frames = av1_lookahead_depth(cpi->lookahead);
+ YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+
+ assert(golden_ref != NULL);
+
+ // we need to look ahead beyond where the ARF transitions into
+ // being a GF - so exit if we don't look ahead beyond that
+ if (n_frames <= cpi->rc.frames_till_gf_update_due) return;
+
+ if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS;
+
+ cpi->mbgraph_n_frames = n_frames;
+ for (i = 0; i < n_frames; i++) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+ memset(frame_stats->mb_stats, 0,
+ cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+ }
+
+ // do motion search to find contribution of each reference to data
+ // later on in this GF group
+ // FIXME really, the GF/last MC search should be done forward, and
+ // the ARF MC search backwards, to get optimal results for MV caching
+ for (i = 0; i < n_frames; i++) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+ struct lookahead_entry *q_cur = av1_lookahead_peek(cpi->lookahead, i);
+
+ assert(q_cur != NULL);
+
+ update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img, golden_ref,
+ cpi->source);
+ }
+
+ aom_clear_system_state();
+
+ separate_arf_mbs(cpi);
+}
diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h
new file mode 100644
index 000000000..ba08476f7
--- /dev/null
+++ b/third_party/aom/av1/encoder/mbgraph.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MBGRAPH_H_
+#define AOM_AV1_ENCODER_MBGRAPH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ struct {
+ int err;
+ union {
+ int_mv mv;
+ PREDICTION_MODE mode;
+ } m;
+ } ref[REF_FRAMES];
+} MBGRAPH_MB_STATS;
+
+typedef struct {
+ MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
+
+struct AV1_COMP;
+
+void av1_update_mbgraph_stats(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MBGRAPH_H_
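Each MBGRAPH_MB_STATS slot records, per reference, an error score plus either a motion vector (for the inter references) or a prediction mode (for INTRA_FRAME), sharing the m union; mbgraph.c above fills m.mode for INTRA_FRAME and m.mv for GOLDEN_FRAME/ALTREF_FRAME. A hedged access sketch (the helper name is illustrative):

static int mb_prefers_intra(const MBGRAPH_MB_STATS *s) {
  /* err is always valid; which union member is meaningful depends on the
   * reference slot, as filled in by update_mbgraph_mb_stats(). */
  return s->ref[INTRA_FRAME].err <= s->ref[GOLDEN_FRAME].err;
}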
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
new file mode 100644
index 000000000..8f6de9b53
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -0,0 +1,2885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+// #define NEW_DIAMOND_SEARCH
+
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+ const MV *mv) {
+ return &buf->buf[mv->row * buf->stride + mv->col];
+}
+
+void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
+ int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+ int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+ int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+ int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+
+ col_min = AOMMAX(col_min, (MV_LOW >> 3) + 1);
+ row_min = AOMMAX(row_min, (MV_LOW >> 3) + 1);
+ col_max = AOMMIN(col_max, (MV_UPP >> 3) - 1);
+ row_max = AOMMIN(row_max, (MV_UPP >> 3) - 1);
+
+ // Get intersection of UMV window and valid MV window to reduce # of checks
+ // in diamond search.
+ if (mv_limits->col_min < col_min) mv_limits->col_min = col_min;
+ if (mv_limits->col_max > col_max) mv_limits->col_max = col_max;
+ if (mv_limits->row_min < row_min) mv_limits->row_min = row_min;
+ if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
+}
+
+static void set_subpel_mv_search_range(const MvLimits *mv_limits, int *col_min,
+ int *col_max, int *row_min, int *row_max,
+ const MV *ref_mv) {
+ const int max_mv = MAX_FULL_PEL_VAL * 8;
+ const int minc = AOMMAX(mv_limits->col_min * 8, ref_mv->col - max_mv);
+ const int maxc = AOMMIN(mv_limits->col_max * 8, ref_mv->col + max_mv);
+ const int minr = AOMMAX(mv_limits->row_min * 8, ref_mv->row - max_mv);
+ const int maxr = AOMMIN(mv_limits->row_max * 8, ref_mv->row + max_mv);
+
+ *col_min = AOMMAX(MV_LOW + 1, minc);
+ *col_max = AOMMIN(MV_UPP - 1, maxc);
+ *row_min = AOMMAX(MV_LOW + 1, minr);
+ *row_max = AOMMIN(MV_UPP - 1, maxr);
+}
+
+int av1_init_search_range(int size) {
+ int sr = 0;
+  // Minimum search size regardless of the passed-in value.
+ size = AOMMAX(16, size);
+
+ while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
+
+ sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
+ return sr;
+}
+
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+ int *const comp_cost[2]) {
+ return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+ comp_cost[1][mv->col];
+}
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int weight) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
+}
+
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int error_per_bit) {
+ if (mvcost) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ return (int)ROUND_POWER_OF_TWO_64(
+ (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+ RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
+ PIXEL_TRANSFORM_ERROR_SCALE);
+ }
+ return 0;
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
+ int sad_per_bit) {
+ const MV diff = { (mv->row - ref->row) * 8, (mv->col - ref->col) * 8 };
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(&diff, x->nmvjointcost, x->mvcost) * sad_per_bit,
+ AV1_PROB_COST_SHIFT);
+}
+
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
+ int len, ss_count = 1;
+
+ cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
+ cfg->ss[0].offset = 0;
+
+ for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
+ // Generate offsets for 4 search sites per step.
+ const MV ss_mvs[] = { { -len, 0 }, { len, 0 }, { 0, -len }, { 0, len } };
+ int i;
+ for (i = 0; i < 4; ++i) {
+ search_site *const ss = &cfg->ss[ss_count++];
+ ss->mv = ss_mvs[i];
+ ss->offset = ss->mv.row * stride + ss->mv.col;
+ }
+ }
+
+ cfg->ss_count = ss_count;
+ cfg->searches_per_step = 4;
+}
+
+void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
+ int len, ss_count = 1;
+
+ cfg->ss[0].mv.col = cfg->ss[0].mv.row = 0;
+ cfg->ss[0].offset = 0;
+
+ for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
+ // Generate offsets for 8 search sites per step.
+ const MV ss_mvs[8] = { { -len, 0 }, { len, 0 }, { 0, -len },
+ { 0, len }, { -len, -len }, { -len, len },
+ { len, -len }, { len, len } };
+ int i;
+ for (i = 0; i < 8; ++i) {
+ search_site *const ss = &cfg->ss[ss_count++];
+ ss->mv = ss_mvs[i];
+ ss->offset = ss->mv.row * stride + ss->mv.col;
+ }
+ }
+
+ cfg->ss_count = ss_count;
+ cfg->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty of a read crossing a cache line, preload the reference
+ * area into a small, aligned buffer so that reads from it never straddle a
+ * cache line. This reduces the CPU cycles spent reading reference data in the
+ * sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 ~ 3, copy a
+ * 22-row x 32-col area, which is enough for a 16x16 macroblock. Later, for
+ * SPLITMV, the area could be reduced.
+ */
+
+// convert motion vector component to offset for sv[a]f calc
+static INLINE int sp(int x) { return x & 7; }
+
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+ const int offset = (r >> 3) * stride + (c >> 3);
+ return buf + offset;
+}
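/* Hedged worked example for pre()/sp() above: MV components are in 1/8-pel
 * units, so col = -5 means -5/8 pel.  pre() takes -5 >> 3 == -1 full pel
 * (assuming the usual arithmetic right shift for negative ints) and sp()
 * takes -5 & 7 == 3, i.e. the sample one full pel to the left filtered at
 * the 3/8-pel phase; likewise row = 13 splits into 1 full pel (13 >> 3)
 * plus a 5/8-pel phase (13 & 7). */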
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ if (second_pred == NULL) { \
+ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse); \
+ } else if (mask) { \
+ thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, second_pred, mask, \
+ mask_stride, invert_mask, &sse); \
+ } else { \
+ if (xd->jcp_param.use_jnt_comp_avg) \
+ thismse = vfp->jsvaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse, second_pred, \
+ &xd->jcp_param); \
+ else \
+ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse, second_pred); \
+ } \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ thismse = upsampled_pref_error( \
+ xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, \
+ pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \
+ mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#define FIRST_LEVEL_CHECKS \
+ { \
+ unsigned int left, right, up, down, diag; \
+ CHECK_BETTER(left, tr, tc - hstep); \
+ CHECK_BETTER(right, tr, tc + hstep); \
+ CHECK_BETTER(up, tr - hstep, tc); \
+ CHECK_BETTER(down, tr + hstep, tc); \
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); \
+ switch (whichdir) { \
+ case 0: CHECK_BETTER(diag, tr - hstep, tc - hstep); break; \
+ case 1: CHECK_BETTER(diag, tr - hstep, tc + hstep); break; \
+ case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
+ case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
+ } \
+ }
+
+#define SECOND_LEVEL_CHECKS \
+ { \
+ int kr, kc; \
+ unsigned int second; \
+ if (tr != br && tc != bc) { \
+ kr = br - tr; \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + kr, tc + 2 * kc); \
+ CHECK_BETTER(second, tr + 2 * kr, tc + kc); \
+ } else if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \
+ CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \
+ switch (whichdir) { \
+ case 0: \
+ case 1: CHECK_BETTER(second, tr + hstep, tc + kc); break; \
+ case 2: \
+ case 3: CHECK_BETTER(second, tr - hstep, tc + kc); break; \
+ } \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \
+ CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \
+ switch (whichdir) { \
+ case 0: \
+ case 2: CHECK_BETTER(second, tr + kr, tc + hstep); break; \
+ case 1: \
+ case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
+ } \
+ } \
+ }
+
+// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of
+// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+// later in the same way.
+#define SECOND_LEVEL_CHECKS_BEST(k) \
+ { \
+ unsigned int second; \
+ int br0 = br; \
+ int bc0 = bc; \
+ assert(tr == br || tc == bc); \
+ if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ } \
+ CHECK_BETTER##k(second, br0 + kr, bc0); \
+ CHECK_BETTER##k(second, br0, bc0 + kc); \
+ if (br0 != br || bc0 != bc) { \
+ CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \
+ } \
+ }
+
+#define SETUP_SUBPEL_SEARCH \
+ const uint8_t *const src_address = x->plane[0].src.buf; \
+ const int src_stride = x->plane[0].src.stride; \
+ const MACROBLOCKD *xd = &x->e_mbd; \
+ unsigned int besterr = INT_MAX; \
+ unsigned int sse; \
+ unsigned int whichdir; \
+ int thismse; \
+ MV *bestmv = &x->best_mv.as_mv; \
+ const unsigned int halfiters = iters_per_step; \
+ const unsigned int quarteriters = iters_per_step; \
+ const unsigned int eighthiters = iters_per_step; \
+ const int y_stride = xd->plane[0].pre[0].stride; \
+ const int offset = bestmv->row * y_stride + bestmv->col; \
+ const uint8_t *const y = xd->plane[0].pre[0].buf; \
+ \
+ int br = bestmv->row * 8; \
+ int bc = bestmv->col * 8; \
+ int hstep = 4; \
+ int minc, maxc, minr, maxr; \
+ int tr = br; \
+ int tc = bc; \
+ \
+ set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \
+ ref_mv); \
+ \
+ bestmv->row *= 8; \
+ bestmv->col *= 8;
+
+static unsigned int setup_center_error(
+ const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride, const uint8_t *const y,
+ int y_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost,
+ int *mvcost[2], unsigned int *sse1, int *distortion) {
+ unsigned int besterr;
+ if (second_pred != NULL) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+ uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
+ if (mask) {
+ aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride, mask, mask_stride, invert_mask);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride, &xd->jcp_param);
+ else
+ aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ if (mask) {
+ aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride,
+ mask, mask_stride, invert_mask);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ aom_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride, &xd->jcp_param);
+ else
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+static INLINE int divide_and_round(int n, int d) {
+ return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+ return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
+ cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
+}
+
+// Returns surface minima estimate at given precision in 1/2^n bits.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
+ *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
+ (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
+ *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
+ (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+}
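/* Hedged worked example with illustrative costs: for
 *   cost_list = { 100, 108, 112, 104, 120 }  at (0,0),(0,-1),(1,0),(0,1),(-1,0)
 * and bits = 2 (so the result is in quarter-pel steps),
 *   ic = divide_and_round((108 - 104) * 2, 108 - 2 * 100 + 104)
 *      = divide_and_round(8, 12)  = 1
 *   ir = divide_and_round((120 - 112) * 2, 120 - 2 * 100 + 112)
 *      = divide_and_round(16, 32) = 1
 * i.e. the fitted minimum is estimated one quarter-pel below and to the
 * right of the current best point, which the callers then probe with
 * CHECK_BETTER. */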
+
+int av1_find_best_sub_pixel_tree_pruned_evenmore(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion);
+ (void)halfiters;
+ (void)quarteriters;
+ (void)eighthiters;
+ (void)whichdir;
+ (void)allow_hp;
+ (void)forced_stop;
+ (void)hstep;
+ (void)use_accurate_subpel_search;
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ int ir, ic;
+ unsigned int minpt;
+ get_cost_surf_min(cost_list, &ir, &ic, 2);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+
+ tr = br;
+ tc = bc;
+
+    // Each subsequent iteration checks at least one point in common with
+    // the last iteration (two if the diagonal was selected), at 1/4-pel
+    // precision.
+    // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_accurate_subpel_search;
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ unsigned int minpt;
+ int ir, ic;
+ get_cost_surf_min(cost_list, &ir, &ic, 1);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (two if the diagonal was selected), at 1/4-pel
+  // precision.
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+  // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_accurate_subpel_search;
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX) {
+ unsigned int left, right, up, down, diag;
+ whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+ (cost_list[2] < cost_list[4] ? 0 : 2);
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (two if the diagonal was selected), at 1/4-pel
+  // precision.
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+  // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+/* clang-format off */
+static const MV search_step_table[12] = {
+ // left, right, up, down
+ { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 },
+ { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 },
+ { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
+};
+/* clang-format on */
+
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *const mv,
+ const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride,
+ const uint8_t *const y, int y_stride,
+ int subpel_x_q3, int subpel_y_q3,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ unsigned int *sse, int subpel_search) {
+ unsigned int besterr;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+ if (second_pred != NULL) {
+ if (mask) {
+ aom_highbd_comp_mask_upsampled_pred(
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd,
+ subpel_search);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ aom_highbd_jnt_comp_avg_upsampled_pred(
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param, subpel_search);
+ else
+ aom_highbd_comp_avg_upsampled_pred(
+ xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, xd->bd, subpel_search);
+ }
+ } else {
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+ subpel_search);
+ }
+ besterr = vfp->vf(pred8, w, src, src_stride, sse);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ if (second_pred != NULL) {
+ if (mask) {
+ aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+ second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, mask,
+ mask_stride, invert_mask, subpel_search);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ aom_jnt_comp_avg_upsampled_pred(
+ xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, &xd->jcp_param, subpel_search);
+ else
+ aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+ second_pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, subpel_search);
+ }
+ } else {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, subpel_search);
+ }
+
+ besterr = vfp->vf(pred, w, src, src_stride, sse);
+ }
+ return besterr;
+}
+
+static unsigned int upsampled_setup_center_error(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *bestmv, const MV *ref_mv, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+ const int src_stride, const uint8_t *const y, int y_stride,
+ const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
+ int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2],
+ unsigned int *sse1, int *distortion, int subpel_search) {
+ unsigned int besterr =
+ upsampled_pref_error(xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride,
+ y + offset, y_stride, 0, 0, second_pred, mask,
+ mask_stride, invert_mask, w, h, sse1, subpel_search);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+// when use_accurate_subpel_search == 0
+static INLINE unsigned int estimate_upsampled_pref_error(
+ MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+ const int src_stride, const uint8_t *const pre, int y_stride,
+ int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride, int invert_mask, unsigned int *sse) {
+ if (second_pred == NULL) {
+ return vfp->svf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ sse);
+ } else if (mask) {
+ return vfp->msvf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ second_pred, mask, mask_stride, invert_mask, sse);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ return vfp->jsvaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src,
+ src_stride, sse, second_pred, &xd->jcp_param);
+ else
+ return vfp->svaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ sse, second_pred);
+ }
+}
+
+int av1_find_best_sub_pixel_tree(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ const uint8_t *const src_address = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int thismse;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ MV *bestmv = &x->best_mv.as_mv;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter, round = 3 - forced_stop;
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+ int kr, kc;
+ int minc, maxc, minr, maxr;
+
+ set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv);
+
+ if (!allow_hp)
+ if (round == 3) round = 2;
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
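+  // From here on, motion vectors are in 1/8-pel units. Each of the (up to
+  // three) rounds below halves hstep, so the search refines through half-,
+  // quarter- and eighth-pel positions; forced_stop and allow_hp limit the
+  // number of rounds.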
+
+ if (use_accurate_subpel_search)
+ besterr = upsampled_setup_center_error(
+ xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address,
+ src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion,
+ use_accurate_subpel_search);
+ else
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion);
+
+ (void)cost_list; // to silence compiler warning
+
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_accurate_subpel_search) {
+ thismse = upsampled_pref_error(
+ xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
+ mask, mask_stride, invert_mask, w, h, &sse,
+ use_accurate_subpel_search);
+ } else {
+ thismse = estimate_upsampled_pref_error(
+ xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
+ y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride,
+ invert_mask, &sse);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
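+    // The diagonal candidate is placed in the quadrant suggested by the
+    // cheaper of the two column candidates (kc) and the cheaper of the two
+    // row candidates (kr) evaluated above.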
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_accurate_subpel_search) {
+ thismse = upsampled_pref_error(
+ xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
+ mask, mask_stride, invert_mask, w, h, &sse,
+ use_accurate_subpel_search);
+ } else {
+ thismse = estimate_upsampled_pref_error(
+ xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
+ y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride,
+ invert_mask, &sse);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
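+      // idx equals 4 here (the loop above ran to completion), so this marks
+      // the diagonal entry of cost_array as unusable.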
+ cost_array[idx] = INT_MAX;
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
+ }
+
+ if (iters_per_step > 1 && best_idx != -1) {
+ if (use_accurate_subpel_search) {
+ SECOND_LEVEL_CHECKS_BEST(1);
+ } else {
+ SECOND_LEVEL_CHECKS_BEST(0);
+ }
+ }
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+#undef PRE
+#undef CHECK_BETTER
+
+unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ const MV *this_mv) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const uint8_t *const src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ uint8_t *const dst = xd->plane[0].dst.buf;
+ const int dst_stride = xd->plane[0].dst.stride;
+ const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize];
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ unsigned int mse;
+ unsigned int sse;
+
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+ mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
+ mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit);
+ return mse;
+}
+
+// Refine MV in a small range
+unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int *pts0, int *pts_inref0,
+ int total_samples) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
+ { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } };
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ int16_t br = mbmi->mv[0].as_mv.row;
+ int16_t bc = mbmi->mv[0].as_mv.col;
+ int16_t *tr = &mbmi->mv[0].as_mv.row;
+ int16_t *tc = &mbmi->mv[0].as_mv.col;
+ WarpedMotionParams best_wm_params = mbmi->wm_params;
+ int best_num_proj_ref = mbmi->num_proj_ref;
+ unsigned int bestmse;
+ int minc, maxc, minr, maxr;
+ const int start = cm->allow_high_precision_mv ? 0 : 4;
+ int ite;
+
+ set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+ &ref_mv.as_mv);
+
+ // Calculate the center position's error
+ assert(bc >= minc && bc <= maxc && br >= minr && br <= maxr);
+ bestmse = av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col,
+ &mbmi->mv[0].as_mv);
+
+ // MV search
+ for (ite = 0; ite < 2; ++ite) {
+ int best_idx = -1;
+ int idx;
+
+ for (idx = start; idx < start + 4; ++idx) {
+ unsigned int thismse;
+
+ *tr = br + neighbors[idx].row;
+ *tc = bc + neighbors[idx].col;
+
+ if (*tc >= minc && *tc <= maxc && *tr >= minr && *tr <= maxr) {
+ MV this_mv = { *tr, *tc };
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ if (total_samples > 1)
+ mbmi->num_proj_ref =
+ selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+
+ if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, *tr,
+ *tc, &mbmi->wm_params, mi_row, mi_col)) {
+ thismse =
+ av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv);
+
+ if (thismse < bestmse) {
+ best_idx = idx;
+ best_wm_params = mbmi->wm_params;
+ best_num_proj_ref = mbmi->num_proj_ref;
+ bestmse = thismse;
+ }
+ }
+ }
+ }
+
+ if (best_idx == -1) break;
+
+ if (best_idx >= 0) {
+ br += neighbors[best_idx].row;
+ bc += neighbors[best_idx].col;
+ }
+ }
+
+ *tr = br;
+ *tc = bc;
+ mbmi->wm_params = best_wm_params;
+ mbmi->num_proj_ref = best_num_proj_ref;
+ return bestmse;
+}
+
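+// Note: check_bounds() below uses bitwise & (rather than &&) so that all four
+// range tests are evaluated without short-circuit branches.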
+static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
+ int range) {
+ return ((row - range) >= mv_limits->row_min) &
+ ((row + range) <= mv_limits->row_max) &
+ ((col - range) >= mv_limits->col_min) &
+ ((col + range) <= mv_limits->col_max);
+}
+
+static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
+ return (mv->col >= mv_limits->col_min) && (mv->col <= mv_limits->col_max) &&
+ (mv->row >= mv_limits->row_min) && (mv->row <= mv_limits->row_max);
+}
+
+#define CHECK_BETTER \
+ { \
+ if (thissad < bestsad) { \
+ if (use_mvcost) \
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); \
+ if (thissad < bestsad) { \
+ bestsad = thissad; \
+ best_site = i; \
+ } \
+ } \
+ }
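+// Note on CHECK_BETTER above: the raw SAD is compared first and the mv rate
+// cost (mvsad_err_cost) is only added when the candidate might already beat
+// bestsad, which avoids computing the mv cost for clearly worse points.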
+
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+// Calculate and return a sad+mvcost list around an integer best pel.
+static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+ const MV *const ref_mv, int sadpb,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *best_mv, int *cost_list) {
+ static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+ const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
+ int i;
+ unsigned int sse;
+ const MV this_mv = { br, bc };
+
+ cost_list[0] =
+ fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv),
+ in_what->stride, &sse) +
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ if (check_bounds(&x->mv_limits, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &neighbor_mv),
+ in_what->stride, &sse) +
+ mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmvjointcost,
+ x->mvcost, x->errorperbit);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV neighbor_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (!is_mv_in(&x->mv_limits, &neighbor_mv))
+ cost_list[i + 1] = INT_MAX;
+ else
+ cost_list[i + 1] =
+ fn_ptr->vf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &neighbor_mv), in_what->stride,
+ &sse) +
+ mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit);
+ }
+ }
+}
+
+static INLINE void calc_int_sad_list(const MACROBLOCK *x,
+ const MV *const ref_mv, int sadpb,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *best_mv, int *cost_list,
+ const int use_mvcost, const int bestsad) {
+ static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+ const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int i;
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
+
+ if (cost_list[0] == INT_MAX) {
+ cost_list[0] = bestsad;
+ if (check_bounds(&x->mv_limits, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ cost_list[i + 1] =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv))
+ cost_list[i + 1] = INT_MAX;
+ else
+ cost_list[i + 1] =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ }
+ }
+ } else {
+ if (use_mvcost) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (cost_list[i + 1] != INT_MAX) {
+ cost_list[i + 1] += mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ }
+ }
+ }
+ }
+}
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number of candidates and a different
+// candidate shape, as indicated by the num_candidates and candidates
+// arrays passed into this function.
+//
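+// For example, the hex pattern used by av1_hex_search() below has 8
+// candidates at scale 0 and 6 candidates at scales 1..10, with candidate
+// offsets that double at each scale; the search starts at scale
+// search_param_to_steps[search_param] and works down one scale at a time.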
+static int pattern_search(
+ MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit,
+ int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int last_is_4 = num_candidates[0] == 4;
+ int br, bc;
+ int bestsad = INT_MAX;
+ int thissad;
+ int k = -1;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ assert(search_param < MAX_MVSEARCH_STEPS);
+ int best_init_s = search_param_to_steps[search_param];
+ // adjust ref_mv to make sure it is within MV range
+ clamp_mv(start_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ br = start_mv->row;
+ bc = start_mv->col;
+ if (cost_list != NULL) {
+ cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+ INT_MAX;
+ }
+
+ // Work out the start point for the search
+ bestsad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, start_mv), in_what->stride) +
+ mvsad_err_cost(x, start_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales up to the search param around the center point
+  // and pick the best-scoring scale as the starting scale for further steps
+  // around it.
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ if (check_bounds(&x->mv_limits, br, bc, 1 << t)) {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ if (best_init_s != -1) {
+ br += candidates[best_init_s][k].row;
+ bc += candidates[best_init_s][k].col;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ const int last_s = (last_is_4 && cost_list != NULL);
+ int best_site = -1;
+ s = best_init_s;
+
+ for (; s >= last_s; s--) {
+ // No need to search all points the 1st time if initial search was used
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ } while (best_site != -1);
+ }
+
+ // Note: If we enter the if below, then cost_list must be non-NULL.
+ if (s == 0) {
+ cost_list[0] = bestsad;
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+ while (best_site != -1) {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+ cost_list[((k + 2) % 4) + 1] = cost_list[0];
+ cost_list[0] = bestsad;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) {
+ cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+ continue;
+ }
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ }
+ }
+ }
+
+  // Fill cost_list with the one-away integer pel cost/sad around the best as
+  // follows:
+ // cost_list[0]: cost/sad at the best integer pel
+ // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel
+ if (cost_list) {
+ const MV best_int_mv = { br, bc };
+ if (last_is_4) {
+ calc_int_sad_list(x, center_mv, sad_per_bit, vfp, &best_int_mv, cost_list,
+ use_mvcost, bestsad);
+ } else {
+ calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_int_mv,
+ cost_list);
+ }
+ }
+ x->best_mv.as_mv.row = br;
+ x->best_mv.as_mv.col = bc;
+ return bestsad;
+}
+
+int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ if (xd->jcp_param.use_jnt_comp_avg)
+ return vfp->jsvaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+ what->buf, what->stride, &unused, second_pred,
+ &xd->jcp_param) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+ else
+ return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+ what->buf, what->stride, &unused, second_pred) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->msvf(what->buf, what->stride, 0, 0,
+ get_buf_from_mv(in_what, best_mv), in_what->stride,
+ second_pred, mask, mask_stride, invert_mask, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+  // The first scale has the 8 closest points; the rest have 6 points in a
+  // hex shape at increasing scales.
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6 };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 },
+ { -1, 0 } },
+ { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+ { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+ { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+ { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+ { -32, 0 } },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+ { -64, 0 } },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 },
+ { -128, 0 } },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 },
+ { -256, 0 } },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 },
+ { -512, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ hex_num_candidates, hex_candidates);
+}
+
+static int bigdia_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+  // The first scale has the 4 closest points; the rest have 8 points in a
+  // diamond shape at increasing scales.
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 },
+ { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 },
+ { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ bigdia_num_candidates, bigdia_candidates);
+}
+
+static int square_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+  // Every scale has 8 points arranged in a square shape.
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 },
+ { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 },
+ { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 },
+ { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ square_num_candidates, square_candidates);
+}
+
+static int fast_hex_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit,
+ int do_init_search, // must be zero for fast_hex
+ int *cost_list, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv) {
+ return av1_hex_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv);
+}
+
+static int fast_dia_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ return bigdia_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv);
+}
+
+#undef CHECK_BETTER
+
+// Exhaustive motion search around a given centre position with a given
+// step size.
+static int exhuastive_mesh_search(MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+ int range, int step, int sad_per_bit,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ MV fcenter_mv = { center_mv->row, center_mv->col };
+ unsigned int best_sad = INT_MAX;
+ int r, c, i;
+ int start_col, end_col, start_row, end_row;
+ int col_step = (step > 1) ? step : 4;
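+  // When step == 1, columns advance 4 at a time so that sdx4df() can compute
+  // 4 SADs per call in the inner loop below.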
+
+ assert(step >= 1);
+
+ clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ *best_mv = fcenter_mv;
+ best_sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+ mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+ start_row = AOMMAX(-range, x->mv_limits.row_min - fcenter_mv.row);
+ start_col = AOMMAX(-range, x->mv_limits.col_min - fcenter_mv.col);
+ end_row = AOMMIN(range, x->mv_limits.row_max - fcenter_mv.row);
+ end_col = AOMMIN(range, x->mv_limits.col_max - fcenter_mv.col);
+
+ for (r = start_row; r <= end_row; r += step) {
+ for (c = start_col; c <= end_col; c += col_step) {
+ // Step > 1 means we are not checking every location in this pass.
+ if (step > 1) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c };
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ } else {
+ // 4 sads in a single call if we are checking every location
+ if (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ addrs[i] = get_buf_from_mv(in_what, &mv);
+ }
+ fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ if (sads[i] < best_sad) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ const unsigned int sad =
+ sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < end_col - c; ++i) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
+ *best_mv = mv;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return best_sad;
+}
+
+int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
+ MV *ref_mv, MV *best_mv, int search_param,
+ int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ int i, j, step;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *what = x->plane[0].src.buf;
+ const int what_stride = x->plane[0].src.stride;
+ const uint8_t *in_what;
+ const int in_what_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *best_address;
+
+ unsigned int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
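+  // For example, search_param == 2 starts with a step of MAX_FIRST_STEP/4 and
+  // runs two fewer diamond steps than search_param == 0.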
+ const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ ref_row = ref_mv->row;
+ ref_col = ref_mv->col;
+ *num00 = 0;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // Work out the start point for the search
+ in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+ best_address = in_what;
+
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+ i = 1;
+
+ for (step = 0; step < tot_steps; step++) {
+ int all_in = 1, t;
+
+    // all_in is true if every one of the points we are checking is within
+    // the bounds of the image.
+ all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_limits.row_min);
+ all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_limits.row_max);
+ all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_limits.col_min);
+ all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_limits.col_max);
+
+    // If all the points are within the bounds we don't need to check each
+    // search point for validity in this loop; otherwise every point is
+    // checked individually.
+ if (all_in) {
+ unsigned int sad_array[4];
+
+ for (j = 0; j < cfg->searches_per_step; j += 4) {
+ unsigned char const *block_offset[4];
+
+ for (t = 0; t < 4; t++)
+ block_offset[t] = ss[i + t].offset + best_address;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
+
+ for (t = 0; t < 4; t++, i++) {
+ if (sad_array[t] < bestsad) {
+ const MV this_mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ sad_array[t] +=
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad_array[t] < bestsad) {
+ bestsad = sad_array[t];
+ best_site = i;
+ }
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ // Trap illegal vectors
+ const MV this_mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss[i].offset + best_address;
+ unsigned int thissad =
+ fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+ if (best_site != last_site) {
+ x->second_best_mv.as_mv = *best_mv;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss[best_site].offset + best_address;
+ unsigned int thissad =
+ fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what) {
+ (*num00)++;
+ }
+ }
+ return bestsad;
+}
+
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+ point as the best match, we will do a final 1-away diamond
+ refining search */
+static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine, int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv) {
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+ step_param, sadpb, &n, fn_ptr, ref_mv);
+ if (bestsme < INT_MAX)
+ bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ x->best_mv.as_mv = temp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
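+  // num00 counts the diamond steps at which the search stayed on the centre
+  // point; that many of the follow-up searches below are skipped.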
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+ step_param + n, sadpb, &num00, fn_ptr,
+ ref_mv);
+ if (thissme < INT_MAX)
+ thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ x->best_mv.as_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = x->best_mv.as_mv;
+ thissme = av1_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
+ ref_mv);
+ if (thissme < INT_MAX)
+ thissme = av1_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list);
+ }
+ return bestsme;
+}
+
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs a limited-range exhaustive mesh search using a pattern set chosen
+// according to the encode speed profile.
+static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const MV *centre_mv_full, int sadpb,
+ int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
+ MV f_ref_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int bestsme;
+ int i;
+ int interval = sf->mesh_patterns[0].interval;
+ int range = sf->mesh_patterns[0].range;
+ int baseline_interval_divisor;
+
+ // Keep track of number of exhaustive calls (this frame in this thread).
+ ++(*x->ex_search_count_ptr);
+
+ // Trap illegal values for interval and range for this function.
+ if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
+ (interval > range))
+ return INT_MAX;
+
+ baseline_interval_divisor = range / interval;
+
+ // Check size of proposed first range against magnitude of the centre
+ // value used as a starting point.
+ range = AOMMAX(range, (5 * AOMMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+ range = AOMMIN(range, MAX_RANGE);
+ interval = AOMMAX(interval, range / baseline_interval_divisor);
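+  // Widening the range scales the interval up by the same factor
+  // (baseline_interval_divisor), keeping the number of mesh points roughly
+  // constant.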
+
+ // initial search
+ bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
+ sadpb, fn_ptr, &temp_mv);
+
+ if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+ // Progressive searches with range and step size decreasing each time
+ // till we reach a step size of 1. Then break out.
+ for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // Each successive pass uses a shorter range and a finer step.
+ bestsme = exhuastive_mesh_search(
+ x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range,
+ sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv);
+
+ if (sf->mesh_patterns[i].interval == 1) break;
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ *dst_mv = temp_mv;
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+ }
+ return bestsme;
+}
+
+int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
+ unsigned int best_sad =
+ fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ int i, j;
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+ const int all_in = ((ref_mv->row - 1) > x->mv_limits.row_min) &
+ ((ref_mv->row + 1) < x->mv_limits.row_max) &
+ ((ref_mv->col - 1) > x->mv_limits.col_min) &
+ ((ref_mv->col + 1) < x->mv_limits.col_max);
+
+ if (all_in) {
+ unsigned int sads[4];
+ const uint8_t *const positions[4] = { best_address - in_what->stride,
+ best_address - 1, best_address + 1,
+ best_address + in_what->stride };
+
+ fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+
+ for (j = 0; j < 4; ++j) {
+ if (sads[j] < best_sad) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+ sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sads[j] < best_sad) {
+ best_sad = sads[j];
+ best_site = j;
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < 4; ++j) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ x->second_best_mv.as_mv = *ref_mv;
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
+ best_address = get_buf_from_mv(in_what, ref_mv);
+ }
+ }
+
+ return best_sad;
+}
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode, or when searching for one component of an ext-inter compound mode.
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask, const MV *center_mv,
+ const uint8_t *second_pred) {
+ static const search_neighbors neighbors[8] = {
+ { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+ { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+ { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+ { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+ };
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ MV *best_mv = &x->best_mv.as_mv;
+ unsigned int best_sad = INT_MAX;
+ int i, j;
+ uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] =
+ { 0 };
+ int grid_center = SEARCH_GRID_CENTER_8P;
+ int grid_coord = grid_center;
+
+ clamp_mv(best_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ if (mask) {
+ best_sad = fn_ptr->msdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, best_mv), in_what->stride,
+ second_pred, mask, mask_stride, invert_mask) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ best_sad = fn_ptr->jsdaf(what->buf, what->stride,
+ get_buf_from_mv(in_what, best_mv),
+ in_what->stride, second_pred, &xd->jcp_param) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
+ else
+ best_sad = fn_ptr->sdaf(what->buf, what->stride,
+ get_buf_from_mv(in_what, best_mv),
+ in_what->stride, second_pred) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
+ }
+
+ do_refine_search_grid[grid_coord] = 1;
+
+ for (i = 0; i < search_range; ++i) {
+ int best_site = -1;
+
+ for (j = 0; j < 8; ++j) {
+ grid_coord = grid_center + neighbors[j].coord_offset;
+ if (do_refine_search_grid[grid_coord] == 1) {
+ continue;
+ }
+ const MV mv = { best_mv->row + neighbors[j].coord.row,
+ best_mv->col + neighbors[j].coord.col };
+
+ do_refine_search_grid[grid_coord] = 1;
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad;
+ if (mask) {
+ sad = fn_ptr->msdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride,
+ second_pred, mask, mask_stride, invert_mask);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ sad = fn_ptr->jsdaf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride,
+ second_pred, &xd->jcp_param);
+ else
+ sad = fn_ptr->sdaf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride,
+ second_pred);
+ }
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ best_mv->row += neighbors[best_site].coord.row;
+ best_mv->col += neighbors[best_site].coord.col;
+ grid_center += neighbors[best_site].coord_offset;
+ }
+ }
+ return best_sad;
+}
+
+#define MIN_EX_SEARCH_LIMIT 128
+static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
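+  // Allow at most MIN_EX_SEARCH_LIMIT exhaustive searches, or
+  // max_exaustive_pct percent of the motion searches made so far in this
+  // frame, whichever is larger.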
+ const int max_ex =
+ AOMMAX(MIN_EX_SEARCH_LIMIT,
+ (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+ return sf->allow_exhaustive_searches &&
+ (sf->exhaustive_searches_thresh < INT_MAX) &&
+ (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
+}
+
+int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ MV *mvp_full, int step_param, int method,
+ int run_mesh_search, int error_per_bit,
+ int *cost_list, const MV *ref_mv, int var_max, int rd,
+ int x_pos, int y_pos, int intra) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+ int var = 0;
+
+ if (cost_list) {
+ cost_list[0] = INT_MAX;
+ cost_list[1] = INT_MAX;
+ cost_list[2] = INT_MAX;
+ cost_list[3] = INT_MAX;
+ cost_list[4] = INT_MAX;
+ }
+
+ // Keep track of number of searches (this frame in this thread).
+ ++(*x->m_search_count_ptr);
+
+ switch (method) {
+ case FAST_DIAMOND:
+ var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv);
+ break;
+ case FAST_HEX:
+ var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv);
+ break;
+ case HEX:
+ var = av1_hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case SQUARE:
+ var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case BIGDIA:
+ var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case NSTEP:
+ var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+ cost_list, fn_ptr, ref_mv);
+
+      // Should we allow a follow-on exhaustive search?
+ if (is_exhaustive_allowed(cpi, x)) {
+ int exhuastive_thr = sf->exhaustive_searches_thresh;
+ exhuastive_thr >>=
+ 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+
+ // Threshold variance for an exhaustive full search.
+ if (var > exhuastive_thr) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex =
+ full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+ cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+
+ if (var_ex < var) {
+ var = var_ex;
+ x->best_mv.as_mv = tmp_mv_ex;
+ }
+ }
+ }
+ break;
+ default: assert(0 && "Invalid search method.");
+ }
+
+  // Should we allow a follow-on exhaustive search?
+ if (!run_mesh_search) {
+ if (method == NSTEP) {
+ if (is_exhaustive_allowed(cpi, x)) {
+ int exhuastive_thr = sf->exhaustive_searches_thresh;
+ exhuastive_thr >>=
+ 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ // Threshold variance for an exhaustive full search.
+ if (var > exhuastive_thr) run_mesh_search = 1;
+ }
+ }
+ }
+
+ if (run_mesh_search) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+ cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+ if (var_ex < var) {
+ var = var_ex;
+ x->best_mv.as_mv = tmp_mv_ex;
+ }
+ }
+
+ if (method != NSTEP && rd && var < var_max)
+ var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
+
+ do {
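+    // The do { ... } while (0) wrapper lets the hash-ME code below bail out
+    // early with break.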
+ if (!intra || !av1_use_hash_me(&cpi->common)) break;
+
+    // Hash-based match search; only single-reference ME reaches this point.
+    // Get the block size and the original source buffer of the current block.
+ const int block_height = block_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ if (block_height == block_width && x_pos >= 0 && y_pos >= 0) {
+ if (block_width == 4 || block_width == 8 || block_width == 16 ||
+ block_width == 32 || block_width == 64 || block_width == 128) {
+ uint8_t *what = x->plane[0].src.buf;
+ const int what_stride = x->plane[0].src.stride;
+ uint32_t hash_value1, hash_value2;
+ MV best_hash_mv;
+ int best_hash_cost = INT_MAX;
+
+        // Pick the hash table: the current frame's own table for intra,
+        // otherwise the reference frame's hash map.
+ hash_table *ref_frame_hash =
+ intra
+ ? &cpi->common.cur_frame->hash_table
+ : av1_get_ref_frame_hash_map(cpi, x->e_mbd.mi[0]->ref_frame[0]);
+
+ av1_get_block_hash_value(
+ what, what_stride, block_width, &hash_value1, &hash_value2,
+ x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x);
+
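+        // hash_value1 selects the bucket in the hash table; hash_value2 is
+        // compared below to reject candidates that merely collide on
+        // hash_value1.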
+ const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
+        // For intra, at least one match can always be found: the block
+        // itself.
+ if (count <= (intra ? 1 : 0)) {
+ break;
+ }
+
+ Iterator iterator =
+ av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
+ for (int i = 0; i < count; i++, iterator_increment(&iterator)) {
+ block_hash ref_block_hash = *(block_hash *)(iterator_get(&iterator));
+ if (hash_value2 == ref_block_hash.hash_value2) {
+            // For intra, make sure the prediction comes from a valid area.
+ if (intra) {
+ const int mi_col = x_pos / MI_SIZE;
+ const int mi_row = y_pos / MI_SIZE;
+ const MV dv = { 8 * (ref_block_hash.y - y_pos),
+ 8 * (ref_block_hash.x - x_pos) };
+ if (!av1_is_dv_valid(dv, &cpi->common, &x->e_mbd, mi_row, mi_col,
+ bsize, cpi->common.seq_params.mib_size_log2))
+ continue;
+ }
+ MV hash_mv;
+ hash_mv.col = ref_block_hash.x - x_pos;
+ hash_mv.row = ref_block_hash.y - y_pos;
+ if (!is_mv_in(&x->mv_limits, &hash_mv)) continue;
+ const int refCost =
+ av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1);
+ if (refCost < best_hash_cost) {
+ best_hash_cost = refCost;
+ best_hash_mv = hash_mv;
+ }
+ }
+ }
+ if (best_hash_cost < var) {
+ x->second_best_mv = x->best_mv;
+ x->best_mv.as_mv = best_hash_mv;
+ var = best_hash_cost;
+ }
+ }
+ }
+ } while (0);
+
+ return var;
+}
+
+/* Returns the OBMC sub-pixel variance error at offset (r, c). */
+#define DIST(r, c) \
+ vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse)
+
+/* Estimates the rate cost of the motion vector (r, c) relative to (rr, rc). */
+#define MVC(r, c) \
+ (unsigned int)(mvcost \
+ ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
+ mvcost[0][((r)-rr)] + (int64_t)mvcost[1][((c)-rc)]) * \
+ error_per_bit + \
+ 4096) >> \
+ 13 \
+ : 0)
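+// In MVC above, the joint/row/column mv costs are summed, scaled by
+// error_per_bit and divided by 2^13 with rounding (the +4096 before >> 13) to
+// bring the rate term into the same units as the distortion term.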
+
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = (DIST(r, c)); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv, \
+ mask, vfp, z, pre(y, y_stride, r, c), \
+ y_stride, sp(c), sp(r), w, h, &sse, \
+ use_accurate_subpel_search); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+static unsigned int setup_obmc_center_error(
+ const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+ const uint8_t *const y, int y_stride, int offset, int *mvjcost,
+ int *mvcost[2], unsigned int *sse1, int *distortion) {
+ unsigned int besterr;
+ besterr = vfp->ovf(y + offset, y_stride, wsrc, mask, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+static int upsampled_obmc_pref_error(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
+ const int32_t *const wsrc, const uint8_t *const y, int y_stride,
+ int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
+ int subpel_search) {
+ unsigned int besterr;
+
+ DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
+ aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+ subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+ subpel_search);
+ besterr = vfp->ovf(pred8, w, wsrc, mask, sse);
+ } else {
+ aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
+ subpel_y_q3, y, y_stride, subpel_search);
+
+ besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+ }
+ return besterr;
+}
+
+static unsigned int upsampled_setup_obmc_center_error(
+ MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+ const uint8_t *const y, int y_stride, int w, int h, int offset,
+ int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion,
+ int subpel_search) {
+ unsigned int besterr = upsampled_obmc_pref_error(
+ xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0,
+ 0, w, h, sse1, subpel_search);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+int av1_find_best_obmc_sub_pixel_tree_up(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second, int use_accurate_subpel_search) {
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ const int *const z = wsrc;
+ const int *const src_address = z;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int thismse;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter;
+ int round = 3 - forced_stop;
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+ int kr, kc;
+ const int w = block_size_wide[mbmi->sb_type];
+ const int h = block_size_high[mbmi->sb_type];
+ int offset;
+ int y_stride;
+ const uint8_t *y;
+
+ int minc, maxc, minr, maxr;
+
+ set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv);
+
+ y = pd->pre[is_second].buf;
+ y_stride = pd->pre[is_second].stride;
+ offset = bestmv->row * y_stride + bestmv->col;
+
+ if (!allow_hp)
+ if (round == 3) round = 2;
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+ // use_accurate_subpel_search can be 0 or 1 or 2
+ if (use_accurate_subpel_search)
+ besterr = upsampled_setup_obmc_center_error(
+ xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y,
+ y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion,
+ use_accurate_subpel_search);
+ else
+ besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
+ z, y, y_stride, offset, mvjcost, mvcost,
+ sse1, distortion);
+
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+ if (use_accurate_subpel_search) {
+ thismse = upsampled_obmc_pref_error(
+ xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+ use_accurate_subpel_search);
+ } else {
+ thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
+ sp(tr), src_address, mask, &sse);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_accurate_subpel_search) {
+ thismse = upsampled_obmc_pref_error(
+ xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+ use_accurate_subpel_search);
+ } else {
+ thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
+ src_address, mask, &sse);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+      cost_array[4] = INT_MAX;
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
+ }
+
+ if (iters_per_step > 1 && best_idx != -1) {
+ if (use_accurate_subpel_search) {
+ SECOND_LEVEL_CHECKS_BEST(1);
+ } else {
+ SECOND_LEVEL_CHECKS_BEST(0);
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+  // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
+static int get_obmc_mvpred_var(const MACROBLOCK *x, const int32_t *wsrc,
+ const int32_t *mask, const MV *best_mv,
+ const MV *center_mv,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->ovf(get_buf_from_mv(in_what, best_mv), in_what->stride, wsrc,
+ mask, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+int obmc_refining_search_sad(const MACROBLOCK *x, const int32_t *wsrc,
+ const int32_t *mask, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ unsigned int best_sad = fn_ptr->osdf(get_buf_from_mv(in_what, ref_mv),
+ in_what->stride, wsrc, mask) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ int i, j;
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+
+ for (j = 0; j < 4; j++) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad = fn_ptr->osdf(get_buf_from_mv(in_what, &mv),
+ in_what->stride, wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+}
+
+int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg,
+ const int32_t *wsrc, const int32_t *mask,
+ MV *ref_mv, MV *best_mv, int search_param,
+ int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  // search_param determines the length of the initial step and hence the
+  // number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel, etc.
+ const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ const uint8_t *best_address, *in_what_ref;
+ int best_sad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+ int i, j, step;
+
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ in_what_ref = in_what->buf + ref_mv->row * in_what->stride + ref_mv->col;
+ best_address = in_what_ref;
+ *num00 = 0;
+ *best_mv = *ref_mv;
+
+ // Check the starting position
+ best_sad = fn_ptr->osdf(best_address, in_what->stride, wsrc, mask) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+ i = 1;
+
+ for (step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ const MV mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ int sad = fn_ptr->osdf(best_address + ss[i].offset, in_what->stride,
+ wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site) {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ int sad = fn_ptr->osdf(best_address + ss[best_site].offset,
+ in_what->stride, wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what_ref) {
+ (*num00)++;
+ }
+ }
+ return best_sad;
+}
+
+static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv,
+ int is_second) {
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme =
+ obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, &temp_mv,
+ step_param, sadpb, &n, fn_ptr, ref_mv, is_second);
+ if (bestsme < INT_MAX)
+ bestsme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ *dst_mv = temp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full,
+ &temp_mv, step_param + n, sadpb, &num00,
+ fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr,
+ 1, is_second);
+
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = *dst_mv;
+ thissme = obmc_refining_search_sad(x, wsrc, mask, &best_mv, sadpb,
+ search_range, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, &best_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = best_mv;
+ }
+ }
+ return bestsme;
+}
+
+int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
+ int step_param, int sadpb, int further_steps,
+ int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second) {
+ if (cpi->sf.obmc_full_pixel_search_level == 0) {
+ return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb,
+ further_steps, do_refine, fn_ptr, ref_mv,
+ dst_mv, is_second);
+ } else {
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ const int search_range = 8;
+ *dst_mv = *mvp_full;
+ clamp_mv(dst_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ int thissme = obmc_refining_search_sad(
+ x, wsrc, mask, dst_mv, sadpb, search_range, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_obmc_mvpred_var(x, wsrc, mask, dst_mv, ref_mv, fn_ptr, 1,
+ is_second);
+ return thissme;
+ }
+}
+
+// Note(yunqingwang): The following 2 functions are only used in the motion
+// vector unit test; they return the extreme motion vectors allowed by the
+// MV limits.
+#define COMMON_MV_TEST \
+ SETUP_SUBPEL_SEARCH; \
+ \
+ (void)error_per_bit; \
+ (void)vfp; \
+ (void)src_address; \
+ (void)src_stride; \
+ (void)y; \
+ (void)y_stride; \
+ (void)second_pred; \
+ (void)w; \
+ (void)h; \
+ (void)use_accurate_subpel_search; \
+ (void)offset; \
+ (void)mvjcost; \
+ (void)mvcost; \
+ (void)sse1; \
+ (void)distortion; \
+ \
+ (void)halfiters; \
+ (void)quarteriters; \
+ (void)eighthiters; \
+ (void)whichdir; \
+ (void)forced_stop; \
+ (void)hstep; \
+ \
+ (void)tr; \
+ (void)tc; \
+ (void)sse; \
+ (void)thismse; \
+ (void)cost_list;
+// Return the maximum MV.
+int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *ref_mv,
+ int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ COMMON_MV_TEST;
+ (void)mask;
+ (void)mask_stride;
+ (void)invert_mask;
+ (void)minr;
+ (void)minc;
+
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ bestmv->row = maxr;
+ bestmv->col = maxc;
+ besterr = 0;
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp, 0);
+ return besterr;
+}
+// Return the minimum MV.
+int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *ref_mv,
+ int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ COMMON_MV_TEST;
+ (void)maxr;
+ (void)maxc;
+ (void)mask;
+ (void)mask_stride;
+ (void)invert_mask;
+
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ bestmv->row = minr;
+ bestmv->col = minc;
+ besterr = 0;
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp, 0);
+ return besterr;
+}
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
new file mode 100644
index 000000000..a975218b0
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
+
+#include "av1/encoder/block.h"
+#include "aom_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 11
+// Maximum full-pel MV, specified in units of full pixels.
+// Enables the use of motion vectors in the range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
+// Allowed motion vector pixel distance outside image border
+// for Block_16x16
+#define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND)
+
+#define SEARCH_RANGE_8P 3
+#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
+#define SEARCH_GRID_CENTER_8P \
+ (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)
+
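For a quick sanity check of the arithmetic encoded in the macros above: with MAX_MVSEARCH_STEPS equal to 11, MAX_FULL_PEL_VAL works out to 1023 full pels, MAX_FIRST_STEP to 1024, and the 8-point refinement grid is 7x7 with its center at offset 24. The following standalone sketch re-derives those values; the macro definitions are repeated here only so the sketch compiles on its own, outside the libaom tree.

#include <stdio.h>

/* Copies of the definitions above, repeated only to keep this sketch
 * self-contained; the canonical ones live in av1/encoder/mcomp.h. */
#define MAX_MVSEARCH_STEPS 11
#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
#define SEARCH_RANGE_8P 3
#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
#define SEARCH_GRID_CENTER_8P \
  (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)

int main(void) {
  printf("MAX_FULL_PEL_VAL      = %d\n", MAX_FULL_PEL_VAL);       /* 1023 */
  printf("MAX_FIRST_STEP        = %d\n", MAX_FIRST_STEP);         /* 1024 */
  printf("SEARCH_GRID_STRIDE_8P = %d\n", SEARCH_GRID_STRIDE_8P);  /* 7 */
  printf("SEARCH_GRID_CENTER_8P = %d\n", SEARCH_GRID_CENTER_8P);  /* 24 */
  return 0;
}
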
+// motion search site
+typedef struct search_site {
+ MV mv;
+ int offset;
+} search_site;
+
+typedef struct search_site_config {
+ search_site ss[8 * MAX_MVSEARCH_STEPS + 1];
+ int ss_count;
+ int searches_per_step;
+} search_site_config;
+
+typedef struct {
+ MV coord;
+ int coord_offset;
+} search_neighbors;
+
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+
+void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv);
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int weight);
+
+// Utility to compute variance + MV rate cost for a given MV
+int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost);
+int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost);
+int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost);
+
+struct AV1_COMP;
+struct SPEED_FEATURES;
+
+int av1_init_search_range(int size);
+
+int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit,
+ int distance, const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv);
+
+// Runs a sequence of diamond searches in smaller steps for RD.
+int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine, int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv);
+
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv);
+
+typedef int(fractional_mv_step_fp)(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1, const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search);
+
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore;
+extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
+
+typedef int (*av1_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv);
+
+typedef int (*av1_diamond_search_fn_t)(
+ MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
+
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask, const MV *center_mv,
+ const uint8_t *second_pred);
+
+int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+ int method, int run_mesh_search, int error_per_bit,
+ int *cost_list, const MV *ref_mv, int var_max, int rd,
+ int x_pos, int y_pos, int intra);
+
+int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
+int av1_find_best_obmc_sub_pixel_tree_up(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second, int use_accurate_subpel_search);
+
+unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, const MV *this_mv);
+unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, int *pts0,
+ int *pts_inref0, int total_samples);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c
new file mode 100644
index 000000000..2e86dee43
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/mips/macros_msa.h"
+
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
+ static int64_t block_error_##BSize##size_msa( \
+ const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \
+ int64_t err = 0; \
+ uint32_t loop_cnt; \
+ v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \
+ v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \
+ v2i64 sq_coeff_r, sq_coeff_l; \
+ v2i64 err0, err_dup0, err1, err_dup1; \
+ \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \
+ sq_coeff_l); \
+ DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ \
+ for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ } \
+ \
+ err_dup0 = __msa_splati_d(sq_coeff_r, 1); \
+ err_dup1 = __msa_splati_d(sq_coeff_l, 1); \
+ sq_coeff_r += err_dup0; \
+ sq_coeff_l += err_dup1; \
+ *ssz = __msa_copy_s_d(sq_coeff_r, 0); \
+ *ssz += __msa_copy_s_d(sq_coeff_l, 0); \
+ \
+ err_dup0 = __msa_splati_d(err0, 1); \
+ err_dup1 = __msa_splati_d(err1, 1); \
+ err0 += err_dup0; \
+ err1 += err_dup1; \
+ err = __msa_copy_s_d(err0, 0); \
+ err += __msa_copy_s_d(err1, 0); \
+ \
+ return err; \
+ }
+
+/* clang-format off */
+BLOCK_ERROR_BLOCKSIZE_MSA(16)
+BLOCK_ERROR_BLOCKSIZE_MSA(64)
+BLOCK_ERROR_BLOCKSIZE_MSA(256)
+BLOCK_ERROR_BLOCKSIZE_MSA(1024)
+/* clang-format on */
+
+int64_t av1_block_error_msa(const tran_low_t *coeff_ptr,
+ const tran_low_t *dq_coeff_ptr, intptr_t blk_size,
+ int64_t *ssz) {
+ int64_t err;
+ const int16_t *coeff = (const int16_t *)coeff_ptr;
+ const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
+
+ switch (blk_size) {
+ case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break;
+ case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break;
+ case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break;
+ case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break;
+ default:
+ err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+ break;
+ }
+
+ return err;
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
new file mode 100644
index 000000000..085c08bfb
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+
+void av1_fwht4x4_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3, in4;
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ in0 += in1;
+ in3 -= in2;
+ in4 = (in0 - in3) >> 1;
+ SUB2(in4, in1, in4, in2, in1, in2);
+ in0 -= in2;
+ in3 += in1;
+
+ TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+
+ in0 += in2;
+ in1 -= in3;
+ in4 = (in0 - in1) >> 1;
+ SUB2(in4, in2, in4, in3, in2, in3);
+ in0 -= in3;
+ in1 += in2;
+
+ SLLI_4V(in0, in1, in2, in3, 2);
+
+ TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+ ST4x2_UB(in0, output, 4);
+ ST4x2_UB(in3, output + 4, 4);
+ ST4x2_UB(in1, output + 8, 4);
+ ST4x2_UB(in2, output + 12, 4);
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 000000000..531ae090a
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/mips/macros_msa.h"
+
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
+ uint8_t *frm2_ptr, int32_t filt_sth,
+ int32_t filt_wgt, uint32_t *acc,
+ uint16_t *cnt) {
+ uint32_t row;
+ uint64_t f0, f1, f2, f3;
+ v16i8 frm2, frm1 = { 0 };
+ v16i8 frm4, frm3 = { 0 };
+ v16u8 frm_r, frm_l;
+ v8i16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
+ for (row = 2; row--;) {
+ LD4(frm1_ptr, stride, f0, f1, f2, f3);
+ frm1_ptr += (4 * stride);
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 32;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ INSERT_D2_SB(f0, f1, frm1);
+ INSERT_D2_SB(f2, f3, frm3);
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+ UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+ }
+}
+
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
+ uint8_t *frm2_ptr,
+ int32_t filt_sth, int32_t filt_wgt,
+ uint32_t *acc, uint16_t *cnt) {
+ uint32_t row;
+ v16i8 frm1, frm2, frm3, frm4;
+ v16u8 frm_r, frm_l;
+ v16i8 zero = { 0 };
+ v8u16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
+ for (row = 8; row--;) {
+ LD_SB2(frm1_ptr, stride, frm1, frm3);
+ frm1_ptr += stride;
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 16;
+
+ LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ frm1_ptr += stride;
+ frm2_ptr += 16;
+ }
+}
+
+void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+ uint8_t *frame2_ptr, uint32_t blk_w,
+ uint32_t blk_h, int32_t strength,
+ int32_t filt_wgt, uint32_t *accu,
+ uint16_t *cnt) {
+  if (8 == blk_w && 8 == blk_h) {
+ temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
+ filt_wgt, accu, cnt);
+  } else if (16 == blk_w && 16 == blk_h) {
+ temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength,
+ filt_wgt, accu, cnt);
+ } else {
+ av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+ strength, filt_wgt, accu, cnt);
+ }
+}
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
new file mode 100644
index 000000000..d21def43a
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/ml.h"
+
+void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
+ float *output) {
+ int num_input_nodes = nn_config->num_inputs;
+ int buf_index = 0;
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ const float *input_nodes = features;
+
+ // Propagate hidden layers.
+ const int num_layers = nn_config->num_hidden_layers;
+ assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+ for (int layer = 0; layer < num_layers; ++layer) {
+ const float *weights = nn_config->weights[layer];
+ const float *bias = nn_config->bias[layer];
+ float *output_nodes = buf[buf_index];
+ const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+ assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+ for (int node = 0; node < num_output_nodes; ++node) {
+ float val = 0.0f;
+ for (int i = 0; i < num_input_nodes; ++i)
+ val += weights[i] * input_nodes[i];
+ val += bias[node];
+ // ReLU as activation function.
+ val = val > 0.0f ? val : 0.0f; // Could use AOMMAX().
+ output_nodes[node] = val;
+ weights += num_input_nodes;
+ }
+ num_input_nodes = num_output_nodes;
+ input_nodes = output_nodes;
+ buf_index = 1 - buf_index;
+ }
+
+ // Final output layer.
+ const float *weights = nn_config->weights[num_layers];
+ for (int node = 0; node < nn_config->num_outputs; ++node) {
+ const float *bias = nn_config->bias[num_layers];
+ float val = 0.0f;
+ for (int i = 0; i < num_input_nodes; ++i)
+ val += weights[i] * input_nodes[i];
+ output[node] = val + bias[node];
+ weights += num_input_nodes;
+ }
+}
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+ // Softmax function is invariant to adding the same constant
+ // to all input values, so we subtract the maximum input to avoid
+ // possible overflow.
+ float max_inp = input[0];
+ for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
+ float sum_out = 0.0f;
+ for (int i = 0; i < n; i++) {
+ output[i] = (float)exp(input[i] - max_inp);
+ sum_out += output[i];
+ }
+ for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
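
A minimal usage sketch of av1_nn_predict() and av1_nn_softmax(), assuming only the NN_CONFIG layout declared in av1/encoder/ml.h and a build linked against the encoder objects: one hidden layer of two ReLU nodes feeding two output nodes, with each node's weights stored contiguously over its inputs, which is the order the propagation loop above consumes them in. The numeric weights are made-up illustration values, not trained model data.

#include <stdio.h>

#include "av1/encoder/ml.h"

int main(void) {
  /* Hidden layer: 2 nodes, each with 2 input weights (one row per node). */
  static const float hidden_weights[4] = { 0.5f, -0.25f, 1.0f, 0.75f };
  static const float hidden_bias[2] = { 0.1f, -0.2f };
  /* Output layer: 2 nodes, each weighting the 2 hidden-node activations. */
  static const float out_weights[4] = { 1.0f, -1.0f, 0.5f, 0.5f };
  static const float out_bias[2] = { 0.0f, 0.0f };

  /* weights[1] / bias[1] (index == num_hidden_layers) hold the output layer. */
  const NN_CONFIG nn_config = {
    .num_inputs = 2,
    .num_outputs = 2,
    .num_hidden_layers = 1,
    .num_hidden_nodes = { 2 },
    .weights = { hidden_weights, out_weights },
    .bias = { hidden_bias, out_bias },
  };

  const float features[2] = { 1.0f, -1.0f };
  float logits[2], probs[2];

  av1_nn_predict(features, &nn_config, logits);
  av1_nn_softmax(logits, probs, 2);

  printf("p0 = %f  p1 = %f\n", probs[0], probs[1]);
  return 0;
}
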
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
new file mode 100644
index 000000000..cb8ef2871
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+typedef struct {
+ int num_inputs; // Number of input nodes, i.e. features.
+ int num_outputs; // Number of output nodes.
+ int num_hidden_layers; // Number of hidden layers, maximum 10.
+ // Number of nodes for each hidden layer.
+ int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+ // Weight parameters, indexed by layer.
+ const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+ // Bias parameters, indexed by layer.
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+} NN_CONFIG;
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
+ float *output);
+
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ML_H_
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
new file mode 100644
index 000000000..e61cd02ce
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+
+static int int_comparer(const void *a, const void *b) {
+ return (*(int *)a - *(int *)b);
+}
+
+int av1_remove_duplicates(int *centroids, int num_centroids) {
+ int num_unique; // number of unique centroids
+ int i;
+ qsort(centroids, num_centroids, sizeof(*centroids), int_comparer);
+ // Remove duplicates.
+ num_unique = 1;
+ for (i = 1; i < num_centroids; ++i) {
+ if (centroids[i] != centroids[i - 1]) { // found a new unique centroid
+ centroids[num_unique++] = centroids[i];
+ }
+ }
+ return num_unique;
+}
+
+static int delta_encode_cost(const int *colors, int num, int bit_depth,
+ int min_val) {
+ if (num <= 0) return 0;
+ int bits_cost = bit_depth;
+ if (num == 1) return bits_cost;
+ bits_cost += 2;
+ int max_delta = 0;
+ int deltas[PALETTE_MAX_SIZE];
+ const int min_bits = bit_depth - 3;
+ for (int i = 1; i < num; ++i) {
+ const int delta = colors[i] - colors[i - 1];
+ deltas[i - 1] = delta;
+ assert(delta >= min_val);
+ if (delta > max_delta) max_delta = delta;
+ }
+ int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits);
+ assert(bits_per_delta <= bit_depth);
+ int range = (1 << bit_depth) - colors[0] - min_val;
+ for (int i = 0; i < num - 1; ++i) {
+ bits_cost += bits_per_delta;
+ range -= deltas[i];
+ bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range));
+ }
+ return bits_cost;
+}
+
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+ const uint16_t *colors, int n_colors,
+ uint8_t *cache_color_found, int *out_cache_colors) {
+ if (n_cache <= 0) {
+ for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i];
+ return n_colors;
+ }
+ memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found));
+ int n_in_cache = 0;
+ int in_cache_flags[PALETTE_MAX_SIZE];
+ memset(in_cache_flags, 0, sizeof(in_cache_flags));
+ for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) {
+ for (int j = 0; j < n_colors; ++j) {
+ if (colors[j] == color_cache[i]) {
+ in_cache_flags[j] = 1;
+ cache_color_found[i] = 1;
+ ++n_in_cache;
+ break;
+ }
+ }
+ }
+ int j = 0;
+ for (int i = 0; i < n_colors; ++i)
+ if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i];
+ assert(j == n_colors - n_in_cache);
+ return j;
+}
+
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count,
+ int *min_bits) {
+ const int n = pmi->palette_size[1];
+ const int max_val = 1 << bit_depth;
+ int max_d = 0;
+ *min_bits = bit_depth - 4;
+ *zero_count = 0;
+ for (int i = 1; i < n; ++i) {
+ const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] -
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1];
+ const int v = abs(delta);
+ const int d = AOMMIN(v, max_val - v);
+ if (d > max_d) max_d = d;
+ if (d == 0) ++(*zero_count);
+ }
+ return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits);
+}
+
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ uint16_t *color_cache, int n_cache,
+ int bit_depth) {
+ const int n = pmi->palette_size[0];
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache =
+ av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n,
+ cache_color_found, out_cache_colors);
+ const int total_bits =
+ n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1);
+ return av1_cost_literal(total_bits);
+}
+
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ uint16_t *color_cache, int n_cache,
+ int bit_depth) {
+ const int n = pmi->palette_size[1];
+ int total_bits = 0;
+ // U channel palette color cost.
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache = av1_index_color_cache(
+ color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n,
+ cache_color_found, out_cache_colors);
+ total_bits +=
+ n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0);
+
+ // V channel palette color cost.
+ int zero_count = 0, min_bits_v = 0;
+ const int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+ const int bits_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;
+ const int bits_using_raw = bit_depth * n;
+ total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw);
+ return av1_cost_literal(total_bits);
+}
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
new file mode 100644
index 000000000..8b88c4755
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
+
+void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data,
+ const int *centroids,
+ uint8_t *indices, int n, int k);
+void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
+ const int *centroids,
+ uint8_t *indices, int n, int k);
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);
+
+// Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim',
+// calculate the centroid 'indices' for the data points.
+static INLINE void av1_calc_indices(const int *data, const int *centroids,
+ uint8_t *indices, int n, int k, int dim) {
+ if (dim == 1) {
+ AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
+ } else if (dim == 2) {
+ AV1_K_MEANS_RENAME(av1_calc_indices, 2)(data, centroids, indices, n, k);
+ } else {
+ assert(0 && "Untemplated k means dimension");
+ }
+}
+
+// Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of
+// dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get
+// updated 'centroids' and the centroid 'indices' for elements in 'data'.
+// Note: the output centroids are rounded off to nearest integers.
+static INLINE void av1_k_means(const int *data, int *centroids,
+ uint8_t *indices, int n, int k, int dim,
+ int max_itr) {
+ if (dim == 1) {
+ AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k, max_itr);
+ } else if (dim == 2) {
+ AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k, max_itr);
+ } else {
+ assert(0 && "Untemplated k means dimension");
+ }
+}
+
+// Given a list of centroids, returns the number of unique centroids 'k', and
+// puts these unique centroids in the first 'k' indices of the 'centroids'
+// array. Ideally, the centroids should be rounded to integers before calling
+// this method.
+int av1_remove_duplicates(int *centroids, int num_centroids);
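
A minimal 1-D usage sketch tying av1_k_means() and av1_remove_duplicates() together, assuming a build linked against the encoder objects: cluster a handful of luma sample values into two candidate palette colors and then drop any duplicate centroids. The sample values, initial guesses, and iteration count are illustrative only.

#include <stdint.h>
#include <stdio.h>

#include "av1/encoder/palette.h"

int main(void) {
  /* A few 1-D (luma) sample values that form two obvious clusters. */
  int data[8] = { 10, 12, 11, 200, 205, 13, 198, 202 };
  int centroids[2] = { 0, 255 };  /* initial guesses for k = 2 */
  uint8_t indices[8];

  /* dim = 1, at most 50 iterations; the centroids come back rounded. */
  av1_k_means(data, centroids, indices, 8, 2, 1, 50);

  const int k = av1_remove_duplicates(centroids, 2);
  for (int i = 0; i < k; ++i) printf("palette color %d: %d\n", i, centroids[i]);
  return 0;
}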
+
+// Given a color cache and a set of base colors, determine whether each cache
+// color is present in the base colors and record the binary results in
+// "cache_color_found". Record the colors that are not in the color cache in
+// "out_cache_colors" and return their count.
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+ const uint16_t *colors, int n_colors,
+ uint8_t *cache_color_found, int *out_cache_colors);
+
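Worked example for av1_index_color_cache() above: with a color cache of { 10, 50 } and base colors { 10, 30, 50, 200 }, both cache entries are found, so cache_color_found becomes { 1, 1 }, the two colors missing from the cache are written to out_cache_colors as { 30, 200 }, and the return value is 2. A minimal sketch, assuming a build linked against the encoder objects:

#include <stdint.h>
#include <stdio.h>

#include "av1/encoder/palette.h"

int main(void) {
  const uint16_t color_cache[2] = { 10, 50 };
  const uint16_t colors[4] = { 10, 30, 50, 200 };
  uint8_t cache_color_found[2];
  int out_cache_colors[4];

  const int n_out = av1_index_color_cache(color_cache, 2, colors, 4,
                                          cache_color_found, out_cache_colors);
  /* Expect n_out == 2, out_cache_colors == { 30, 200 }, and
   * cache_color_found == { 1, 1 } for this input. */
  printf("%d colors not found in the cache\n", n_out);
  return 0;
}
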
+// Return the number of bits used to transmit each v palette color delta;
+// set *zero_count to the number of deltas that are 0.
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count, int *min_bits);
+
+// Return the rate cost for transmitting luma palette color values.
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ uint16_t *color_cache, int n_cache, int bit_depth);
+
+// Return the rate cost for transmitting chroma palette color values.
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ uint16_t *color_cache, int n_cache,
+ int bit_depth);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
new file mode 100644
index 000000000..437ea43f9
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -0,0 +1,2448 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define FEATURE_SIZE 10
+#define LABEL_SIZE 16
+// nn model for ab partition pruning, 128x128.
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = {
+ -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f,
+ 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f,
+ 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f,
+ -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f,
+ -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f,
+ -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f,
+ 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f,
+ 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f,
+ 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f,
+ 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f,
+ -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f,
+ -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f,
+ 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f,
+ 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f,
+ 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f,
+ 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f,
+ 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f,
+ -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f,
+ 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f,
+ -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f,
+ -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f,
+ -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f,
+ -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f,
+ -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f,
+ 0.033223f, 0.977341f, -0.372982f, -0.052337f, 0.154124f, 0.396787f,
+ 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f,
+ 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f,
+ 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f,
+ -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f,
+ -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f,
+ -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f,
+ -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f,
+ 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f,
+ 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f,
+ 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f,
+ 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f,
+ -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f,
+ 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f,
+ -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f,
+ 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f,
+ 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f,
+ 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f,
+ 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f,
+ -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f,
+ -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f,
+ 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f,
+ -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f,
+ 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f,
+ -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f,
+ -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f,
+ -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f,
+ -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f,
+ -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f,
+ -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f,
+ 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f,
+ 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f,
+ 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f,
+ -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f,
+ -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f,
+ 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f,
+ -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f,
+ 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f,
+ 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f,
+ -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f,
+ -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f,
+ 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f,
+ -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f,
+ 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f,
+ -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f,
+ -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f,
+ 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f,
+ -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f,
+ 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f,
+ 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f,
+ -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f, -0.159378f,
+ 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f,
+ 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f,
+ -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f,
+ 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f,
+ -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f,
+ 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f,
+ -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f,
+ 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f,
+ -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f,
+ 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f,
+ -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f,
+ -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f,
+ -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f,
+ 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f,
+ -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f,
+ -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f,
+ -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f,
+ 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f,
+ 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f,
+ -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f,
+ 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f,
+ -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f,
+ 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f,
+ 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f,
+ -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f,
+ 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f,
+ -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f,
+ 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f,
+ 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f,
+ -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f,
+ 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f,
+ 0.420104f, -0.072042f, -0.006404f, 0.171680f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer0[64] = {
+ 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f,
+ -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f,
+ 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f,
+ -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f,
+ 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f,
+ 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f,
+ 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f,
+ 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f,
+ 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f,
+ 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f,
+ 0.853918f, 0.002504f, -0.190403f, 0.452050f,
+};
+
+static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = {
+ 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f,
+ -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f,
+ 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f,
+ 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f,
+ 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f,
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f,
+ -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f,
+ -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f,
+ -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f,
+ 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f,
+ -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f,
+ 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f,
+ 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f,
+ 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f,
+ 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f,
+ 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f,
+ 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f,
+ 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f,
+ -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f,
+ 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f,
+ 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f,
+ -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f,
+ 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f,
+ 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f,
+ -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f,
+ -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f,
+ -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f,
+ 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f,
+ 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f,
+ 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f,
+ -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f,
+ -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f,
+ 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f,
+ 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f,
+ 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f,
+ 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f,
+ -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f,
+ -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f,
+ 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f,
+ 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f,
+ 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f,
+ 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f,
+ 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f,
+ -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f,
+ -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f,
+ -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f,
+ 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f,
+ -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f,
+ -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f,
+ 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f,
+ -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f,
+ -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f,
+ 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f,
+ -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f,
+ -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f,
+ -0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f,
+ 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f,
+ 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f,
+ 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f,
+ -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f,
+ -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f,
+ -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f,
+ 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f,
+ -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f,
+ -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f,
+ 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f,
+ -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f,
+ -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f,
+ -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f,
+ -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f,
+ -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f,
+ 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f,
+ 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f,
+ -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f,
+ -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f,
+ 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f,
+ -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f,
+ -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f,
+ 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f,
+ 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f,
+ 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f,
+ -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f,
+ 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f,
+ -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f,
+ -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f,
+ -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f,
+ 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f,
+ 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f,
+ -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f,
+ 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f,
+ 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f,
+ -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f,
+ -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f,
+ -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f,
+ -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f,
+ 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f,
+ -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f,
+ 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f,
+ -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f,
+ 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f,
+ 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f,
+ 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f,
+ -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f,
+ 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f,
+ -0.521336f, 0.176206f, 0.142733f, 0.139248f, 0.411682f, 0.181544f,
+ 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f,
+ -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f,
+ 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f,
+ -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f,
+ -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f,
+ -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f,
+ -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f,
+ -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f,
+ 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f,
+ -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f,
+ -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f,
+ 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f,
+ -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f,
+ -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f,
+ -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f,
+ -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f,
+ -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f,
+ -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f,
+ -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f,
+ -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f,
+ -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f,
+ -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f,
+ -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f,
+ -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f,
+ -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f,
+ -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f,
+ -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f,
+ -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f,
+ -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f,
+ -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f,
+ 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f,
+ 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f,
+ -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f,
+ 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f,
+ -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f,
+ -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f,
+ -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f,
+ 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f,
+ -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f,
+ -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f,
+ -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f,
+ 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f,
+ -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f,
+ -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f,
+ -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f,
+ 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f,
+ -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f,
+ -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f,
+ -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f,
+ 0.533229f, -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f,
+ 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f,
+ -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f,
+ 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f,
+ 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f,
+ 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f,
+ 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f,
+ -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f,
+ -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f,
+ -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f,
+ 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f,
+ -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f,
+ 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f,
+ 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f,
+ 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f,
+ -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f,
+ 0.823643f, -0.119781f, -0.098359f, 0.093119f,
+};
+
+static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+ -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f,
+ -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f,
+ 0.325655f, -0.107123f, 0.591049f, 0.358744f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_128_layer0,
+ av1_ab_partition_nn_weights_128_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_128_layer0,
+ av1_ab_partition_nn_bias_128_layer1,
+ },
+};
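The NN_CONFIG above bundles the 128x128 layer-0/layer-1 weights and biases into a single one-hidden-layer network description (FEATURE_SIZE inputs, 64 hidden nodes, LABEL_SIZE outputs). As a rough illustration of how such a table-driven model might be evaluated, here is a minimal, self-contained forward-pass sketch; the struct field names, the scratch-buffer size, and the ReLU-hidden/linear-output activation pattern are assumptions for illustration only, since the encoder's actual prediction routine is not part of this patch.

/* Hypothetical mirror of the NN_CONFIG layout shown above; names are
 * illustrative, not the encoder's real types. */
#define EXAMPLE_MAX_LAYERS 2 /* assumed cap: one hidden layer + output */

typedef struct {
  int num_inputs;
  int num_outputs;
  int num_hidden_layers;
  int num_hidden_nodes[EXAMPLE_MAX_LAYERS];
  const float *weights[EXAMPLE_MAX_LAYERS + 1];
  const float *bias[EXAMPLE_MAX_LAYERS + 1];
} ExampleNnConfig;

/* Plain fully-connected forward pass: ReLU on hidden layers, linear output. */
static void example_nn_predict(const float *features,
                               const ExampleNnConfig *cfg, float *output) {
  float buf[2][128]; /* scratch; assumes no layer exceeds 128 nodes */
  const float *in = features;
  int num_in = cfg->num_inputs;
  for (int layer = 0; layer <= cfg->num_hidden_layers; ++layer) {
    const int is_output = (layer == cfg->num_hidden_layers);
    const int num_out =
        is_output ? cfg->num_outputs : cfg->num_hidden_nodes[layer];
    const float *w = cfg->weights[layer];
    float *out = is_output ? output : buf[layer & 1];
    for (int node = 0; node < num_out; ++node) {
      float val = cfg->bias[layer][node];
      for (int i = 0; i < num_in; ++i) val += w[node * num_in + i] * in[i];
      /* Assumed activation: ReLU for hidden nodes, identity at the output. */
      out[node] = is_output ? val : (val > 0.0f ? val : 0.0f);
    }
    in = out;
    num_in = num_out;
  }
}

In this sketch, calling example_nn_predict with FEATURE_SIZE feature values and a config filled from the tables above would presumably yield one raw score per partition label, which a later thresholding or softmax step could use to prune ab-partition candidates.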
+
+// nn model for ab partition pruning, 64x64.
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = {
+ -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f,
+ -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f,
+ -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f,
+ -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f,
+ 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f,
+ -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f,
+ -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f,
+ 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f,
+ 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f,
+ -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f,
+ 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f,
+ 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f,
+ -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f,
+ 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f,
+ -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f,
+ 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f,
+ 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f,
+ -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f,
+ -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f,
+ -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f,
+ 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f,
+ 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f,
+ -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f,
+ -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f,
+ -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f,
+ -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f,
+ 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f,
+ 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f,
+ 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f,
+ -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f,
+ -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f,
+ -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f,
+ -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f,
+ -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f,
+ -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f,
+ -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f,
+ -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f,
+ -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f,
+ 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f,
+ -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f,
+ -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f,
+ -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f,
+ -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f,
+ -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f,
+ -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f,
+ 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f,
+ -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f,
+ -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f,
+ 0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f,
+ -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f,
+ -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f,
+ -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f,
+ -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f,
+ -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f,
+ -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f,
+ -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f,
+ -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f,
+ 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f,
+ 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f,
+ -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f,
+ -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f,
+ -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f,
+ -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f,
+ -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f,
+ 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f,
+ -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f,
+ -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f,
+ 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f,
+ 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f,
+ 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f,
+ -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f,
+ -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f,
+ -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f,
+ -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f,
+ -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f,
+ -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f,
+ -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f,
+ -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f,
+ 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f,
+ -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f,
+ -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f,
+ -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f,
+ 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f,
+ -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f,
+ -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f,
+ -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f,
+ -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f,
+ -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f,
+ 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f,
+ -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f,
+ 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f,
+ -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f,
+ -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f,
+ 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f,
+ 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f,
+ -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f,
+ -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f,
+ -1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f,
+ 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f,
+ 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f,
+ -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f,
+ -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f,
+ -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f,
+ -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f,
+ -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f,
+ -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f,
+ 0.230343f, -0.034318f, -0.022687f, -0.047090f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer0[64] = {
+ -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f,
+ -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f,
+ -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f,
+ -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f,
+ 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f,
+ -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f,
+ -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f,
+ 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f,
+ -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f,
+ -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f,
+ -0.125366f, -0.063576f, -0.212169f, -0.047463f,
+};
+
+static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = {
+ -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f,
+ -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f,
+ 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f,
+ -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f,
+ 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f,
+ 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f,
+ 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f,
+ -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f,
+ 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f,
+ 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f,
+ 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f,
+ -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f,
+ 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f,
+ -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f,
+ -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f,
+ -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f,
+ 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f,
+ 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f,
+ 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f,
+ -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f,
+ 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f,
+ 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f,
+ 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f,
+ 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f,
+ 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f,
+ -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f,
+ -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f,
+ -0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f,
+ 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f,
+ -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f,
+ -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f,
+ -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f,
+ -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f,
+ 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f,
+ -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f,
+ 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f,
+ -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f,
+ -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f,
+ 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f,
+ 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f,
+ -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f,
+ -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f,
+ 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f,
+ 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f,
+ 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f,
+ 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f,
+ -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f,
+ -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f,
+ 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f,
+ 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f,
+ -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f,
+ 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f,
+ -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f,
+ -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f,
+ -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f,
+ -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f,
+ -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f,
+ -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f,
+ -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f,
+ -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f,
+ -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f,
+ -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f,
+ -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f,
+ -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f,
+ -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f,
+ 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f,
+ 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f,
+ -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f,
+ 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f,
+ 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f,
+ -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f,
+ 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f,
+ 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f,
+ -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f,
+ -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f,
+ -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f,
+ 3.613420f, -1.525951f, -0.026738f, 0.209150f, -2.103534f, 2.019689f,
+ -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f,
+ -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f,
+ -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f,
+ 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f,
+ 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f,
+ -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f,
+ 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f,
+ 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f,
+ 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f,
+ -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f,
+ -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f,
+ -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f,
+ -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f,
+ -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f,
+ -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f,
+ -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f,
+ -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f,
+ -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f,
+ 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f,
+ -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f,
+ -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f,
+ -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f,
+ -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f,
+ -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f,
+ -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f,
+ -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f,
+ 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f,
+ 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f,
+ -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f,
+ 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f,
+ -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f,
+ 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f,
+ -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f,
+ -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f,
+ 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f,
+ -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f,
+ -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f,
+ 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f,
+ 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f,
+ -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f,
+ -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f,
+ 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f,
+ -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f,
+ -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f,
+ -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f,
+ -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f,
+ -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f,
+ -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f,
+ 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 0.135496f,
+ -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f,
+ 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f,
+ -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f,
+ 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f,
+ 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f,
+ 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f,
+ -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f,
+ 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f,
+ -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f,
+ 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f,
+ 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f,
+ -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f,
+ -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f,
+ 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f,
+ 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f,
+ -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f,
+ -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f,
+ 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f,
+ 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f,
+ -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f,
+ -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f,
+ -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f,
+ -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f,
+ -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f,
+ -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f,
+ -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f,
+ 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f,
+ -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f,
+ -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f,
+ -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f,
+ -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f,
+ -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f,
+ 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f,
+ 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f,
+ -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f,
+ -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f,
+ -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f,
+ -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f,
+ -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f,
+ -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f,
+ -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f,
+ 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f,
+ 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f,
+ -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f,
+ -0.114126f, -0.193834f, -0.025759f, 0.263183f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f,
+ -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f,
+ -0.872737f, 0.718723f, 0.703398f, 2.560015f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_64_layer0,
+ av1_ab_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_64_layer0,
+ av1_ab_partition_nn_bias_64_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 32x32.
+static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = {
+ -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f,
+ -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f,
+ 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f,
+ 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f,
+ -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f,
+ 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f,
+ -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f,
+ 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f,
+ 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f,
+ 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f,
+ -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f,
+ -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f,
+ -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f,
+ -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f,
+ 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f,
+ -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f,
+ -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f,
+ 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f,
+ -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f,
+ -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f,
+ -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f,
+ -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f,
+ 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f,
+ -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f,
+ -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f,
+ -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f,
+ -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f,
+ 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f,
+ 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f,
+ -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f,
+ -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f,
+ -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f,
+ -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f,
+ -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f,
+ 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f,
+ -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f,
+ -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f,
+ -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f,
+ -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f,
+ -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f,
+ -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f,
+ 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f,
+ -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f,
+ -0.064569f, -0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f,
+ 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f,
+ -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f,
+ -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f,
+ 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f,
+ 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f,
+ -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f,
+ -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f,
+ -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f,
+ -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f,
+ -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f,
+ -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f,
+ 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f,
+ -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f,
+ -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f,
+ -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f,
+ -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f,
+ -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f,
+ -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f,
+ -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f,
+ -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f,
+ -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f,
+ -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f,
+ -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f,
+ 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f,
+ 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f,
+ -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f,
+ -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f,
+ -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f,
+ -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f,
+ 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f,
+ -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f,
+ -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f,
+ -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f,
+ -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f,
+ -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f,
+ -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f,
+ -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f,
+ -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f,
+ -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f,
+ -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f,
+ 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f,
+ -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f,
+ -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f,
+ -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f,
+ -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f,
+ -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f,
+ 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f,
+ -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f,
+ -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f,
+ -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f,
+ 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f,
+ -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f,
+ -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f,
+ -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f,
+ -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f,
+ -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f,
+ 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f,
+ 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f,
+ -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f,
+ -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f,
+ -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f,
+ -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f,
+ -0.827145f, -0.225277f, 0.275800f, 1.696635f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer0[64] = {
+ -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f,
+ 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f,
+ -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f,
+ 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f,
+ 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f,
+ -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f,
+ 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f,
+ -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f,
+ -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f,
+ 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f,
+ -1.384604f, -0.201713f, -0.271948f, 0.372351f,
+};
+
+static const float av1_ab_partition_nn_weights_32_layer1[64 * LABEL_SIZE] = {
+ -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f,
+ -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f,
+ -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f,
+ 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f,
+ 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f,
+ -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f,
+ 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f,
+ -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f,
+ 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f,
+ -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f,
+ 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f,
+ -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f,
+ -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f,
+ 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f,
+ -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f,
+ 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f,
+ 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f,
+ -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f,
+ 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f,
+ 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f,
+ -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f,
+ 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f,
+ 0.195889f, 0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f,
+ 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f,
+ -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f,
+ 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f,
+ -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f,
+ 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f,
+ 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f,
+ 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f,
+ -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f,
+ -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f,
+ -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f,
+ 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f,
+ -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f,
+ -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f,
+ -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f,
+ -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f,
+ -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f,
+ -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f,
+ -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f,
+ 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f,
+ 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f,
+ 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f,
+ 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f,
+ 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f,
+ -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f,
+ 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f,
+ -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f,
+ -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f,
+ -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f,
+ -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f,
+ 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f,
+ -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f,
+ 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f,
+ -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f,
+ 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f,
+ -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f,
+ -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f,
+ 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f,
+ 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f,
+ -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f,
+ -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f,
+ -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f,
+ -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f,
+ 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f,
+ -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f,
+ -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f,
+ -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f,
+ -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f,
+ -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f,
+ -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f,
+ -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f,
+ 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f,
+ -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f,
+ -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f,
+ 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f,
+ -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f,
+ -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f,
+ -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f,
+ 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f,
+ -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f,
+ -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f,
+ 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f,
+ -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f,
+ -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f,
+ -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f,
+ 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f,
+ -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f,
+ -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f,
+ 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f,
+ 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f,
+ -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f,
+ -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f,
+ -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f,
+ -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f,
+ -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f,
+ 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f,
+ -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f,
+ -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f,
+ 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f,
+ -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f,
+ -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f,
+ -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f,
+ -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f,
+ 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f,
+ 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f,
+ -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f,
+ -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f,
+ -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f,
+ 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f,
+ -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f,
+ -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f,
+ -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f,
+ 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f,
+ -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f,
+ 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f,
+ 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f,
+ -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f,
+ -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f,
+ 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f,
+ 0.340622f, 0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f,
+ 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f,
+ -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f,
+ 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f,
+ -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f,
+ -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f,
+ -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f,
+ -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f,
+ -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f,
+ 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f,
+ -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f,
+ -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f,
+ -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f,
+ -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f,
+ -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f,
+ -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f,
+ -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f,
+ -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f,
+ -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f,
+ -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f,
+ 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f,
+ -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f,
+ 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f,
+ -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f,
+ -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f,
+ -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f,
+ 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f,
+ -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f,
+ -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f,
+ -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f,
+ -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f,
+ 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f,
+ -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f,
+ -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f,
+ -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f,
+ 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f,
+ -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f,
+ -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f,
+ -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f,
+ -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f,
+ 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f,
+ 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f,
+ -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f,
+ 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f,
+ -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f,
+ -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f,
+ -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f,
+ -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f,
+ 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f,
+ -1.251640f, -0.353531f, -0.304009f, -0.153376f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f,
+ 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f,
+ -0.010198f, 0.130597f, 1.276752f, 2.028188f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_32_layer0,
+ av1_ab_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_32_layer0,
+ av1_ab_partition_nn_bias_32_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 16x16.
+static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = {
+ 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f,
+ 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f,
+ 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f,
+ 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f,
+ -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f,
+ 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f,
+ -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f,
+ 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f,
+ -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f,
+ 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f,
+ 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f,
+ 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f,
+ -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f,
+ 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f,
+ -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f,
+ -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f,
+ 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f,
+ -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f,
+ 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f,
+ -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f,
+ 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f,
+ -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f,
+ -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f,
+ -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f,
+ -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f,
+ -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f,
+ -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f,
+ -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f,
+ -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f,
+ -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f,
+ -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f,
+ -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f,
+ 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f,
+ -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f,
+ -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f,
+ 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f,
+ -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f,
+ -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f,
+ 0.146090f, -0.357530f, 0.097644f, -0.000932f, 0.446603f, -0.066793f,
+ 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f,
+ -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f,
+ 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f,
+ -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f,
+ 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f,
+ 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f,
+ 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f,
+ -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f,
+ -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f,
+ -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f,
+ -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f,
+ 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f,
+ -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f,
+ 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f,
+ 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f,
+ -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f,
+ -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f,
+ -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f,
+ 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f,
+ 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f,
+ -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f,
+ -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f,
+ -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f,
+ -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f,
+ -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f,
+ -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f,
+ -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f,
+ -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f,
+ 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f,
+ -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f,
+ 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f,
+ -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f,
+ -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f,
+ -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f,
+ -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f,
+ -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f,
+ -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f,
+ -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f,
+ -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f,
+ -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f,
+ -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f,
+ 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f,
+ -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f,
+ 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f,
+ 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f,
+ 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f,
+ -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f,
+ -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f,
+ -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f,
+ -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f,
+ -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f,
+ -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f,
+ -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f,
+ 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f,
+ -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f,
+ -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f,
+ 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f,
+ -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f,
+ 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f,
+ 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f,
+ -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f,
+ -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f,
+ -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f,
+ -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f,
+ -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f,
+ -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f,
+ -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f,
+ -0.021087f, 0.110220f, -0.128671f, 0.044219f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer0[64] = {
+ 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f,
+ -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f,
+ 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f,
+ 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f,
+ -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f,
+ -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f,
+ -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f,
+ 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f,
+ -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f,
+ -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f,
+ 0.123809f, -0.109797f, 0.200510f, -0.147542f,
+};
+
+static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = {
+ -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f,
+ -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f,
+ -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f,
+ -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f,
+ 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f,
+ 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f,
+ 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f,
+ -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f,
+ 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f,
+ 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f,
+ -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f,
+ 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f,
+ -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f,
+ 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f,
+ 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f,
+ 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f,
+ -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f,
+ 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f,
+ -0.145549f, 0.075592f, 0.172336f, 0.108175f, 0.333751f, 1.090501f,
+ 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f,
+ 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f,
+ -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f,
+ 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f,
+ 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f,
+ 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f,
+ 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f,
+ -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f,
+ -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f,
+ 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f,
+ 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f,
+ 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f,
+ 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f,
+ 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f,
+ -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f,
+ 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f,
+ 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f,
+ 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f,
+ -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f,
+ 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f,
+ 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f,
+ 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f,
+ -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f,
+ -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f,
+ -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f,
+ -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f,
+ -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f,
+ -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f,
+ 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f,
+ 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f,
+ -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f,
+ -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f,
+ 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f,
+ -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f,
+ -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f,
+ -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f,
+ -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f,
+ -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f,
+ -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f,
+ 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f,
+ 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f,
+ -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f,
+ -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f,
+ -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f,
+ -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f,
+ 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f,
+ 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f,
+ -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f,
+ 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f,
+ 0.133889f, -0.238673f, -0.094726f, 1.683835f, -0.215629f, -0.198890f,
+ -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f,
+ -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f,
+ -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f,
+ 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f,
+ 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f,
+ 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f,
+ 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f,
+ -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f,
+ 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f,
+ -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f,
+ -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f,
+ -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f,
+ -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f,
+ -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f,
+ 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f,
+ -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f,
+ 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f,
+ -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f,
+ 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f,
+ 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f,
+ -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f,
+ -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f,
+ 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f,
+ -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f,
+ -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f,
+ 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f,
+ -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f,
+ 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f,
+ -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f,
+ -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f,
+ -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f,
+ 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f,
+ 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f,
+ 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f,
+ 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f,
+ -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f,
+ -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f,
+ -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f,
+ 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f,
+ -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f,
+ 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f,
+ -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f,
+ -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f,
+ -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f,
+ 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f,
+ 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f,
+ -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f,
+ -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f,
+ -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f,
+ -0.295260f, 0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f,
+ -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f,
+ -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f,
+ -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f,
+ -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f,
+ -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f,
+ -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f,
+ -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f,
+ 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f,
+ 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f,
+ 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f,
+ 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f,
+ -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f,
+ -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f,
+ 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f,
+ 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f,
+ 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f,
+ 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f,
+ -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f,
+ -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f,
+ -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f,
+ -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f,
+ -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f,
+ -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f,
+ -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f,
+ 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f,
+ -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f,
+ -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f,
+ 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f,
+ 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f,
+ -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f,
+ -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f,
+ -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f,
+ 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f,
+ 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f,
+ 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f,
+ 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f,
+ 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f,
+ -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f,
+ -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f,
+ -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f,
+ -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f,
+ 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f,
+ -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f,
+ -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f,
+ -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f,
+ -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f,
+ 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f,
+ -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f,
+ 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, -0.075932f,
+ -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f,
+ -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f,
+ 0.172790f, -0.172982f, 0.041258f, -0.299379f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f,
+ 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f,
+ -0.658522f, 0.723479f, 0.544264f, 1.035225f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_16_layer0,
+ av1_ab_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_16_layer0,
+ av1_ab_partition_nn_bias_16_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 18
+#define LABEL_SIZE 4
+
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = {
+ -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f,
+ 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f,
+ 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f,
+ 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f,
+ -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f,
+ -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f,
+ 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f,
+ 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f,
+ -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f,
+ -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f,
+ -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f,
+ -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f,
+ 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f,
+ 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f,
+ -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f,
+ -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f,
+ -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f,
+ -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f,
+ 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f,
+ -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f,
+ -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f,
+ -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f,
+ -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f,
+ -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f,
+ -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f,
+ 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f,
+ 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f,
+ -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f,
+ 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f,
+ -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f,
+ 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f,
+ 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f,
+ -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f,
+ -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f,
+ 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f,
+ -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f,
+ 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f,
+ -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f,
+ 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f,
+ -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f,
+ 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f,
+ 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f,
+ -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f,
+ 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f,
+ 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f,
+ 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f,
+ 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f,
+ -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f,
+ -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f,
+ -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f,
+ 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f,
+ 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f,
+ -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f,
+ -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f,
+ 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f,
+ -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f,
+ -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f,
+ -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f,
+ 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f,
+ -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f,
+ -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f,
+ 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f,
+ 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f,
+ 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f,
+ 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f,
+ -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f,
+ 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f,
+ -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f,
+ -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f,
+ -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f,
+ 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f,
+ 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[24] = {
+ 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f,
+ -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f,
+ 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f,
+ -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = {
+ -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f,
+ 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f,
+ -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f,
+ -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f,
+ 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f,
+ -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f,
+ -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f,
+ 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f,
+ 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f,
+ -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f,
+ 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f,
+ -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f,
+ 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f,
+ -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f,
+ -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f,
+ -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.462133f,
+ 0.465060f,
+ 0.062211f,
+ 0.401786f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_16_layer0,
+ av1_4_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_16_layer0,
+ av1_4_partition_nn_bias_16_layer1,
+ },
+};
+
+static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
+ -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f,
+ 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f,
+ -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f,
+ 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f,
+ -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f,
+ -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f,
+ -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f,
+ -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f,
+ -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f,
+ -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f,
+ 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f,
+ -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f,
+ -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f,
+ 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f,
+ -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f,
+ -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f,
+ -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f,
+ -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f,
+ -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f,
+ -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f,
+ 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f,
+ -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f,
+ -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f,
+ 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f,
+ 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f,
+ -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f,
+ 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f,
+ 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f,
+ -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f,
+ -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f,
+ -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f,
+ 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f,
+ -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f,
+ 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f,
+ -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f,
+ -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f,
+ 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f,
+ -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f,
+ 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f,
+ -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f,
+ -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f,
+ -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f,
+ -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f,
+ 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f,
+ 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f,
+ -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f,
+ -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f,
+ 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f,
+ 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f,
+ 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f,
+ 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f,
+ -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f,
+ 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f,
+ 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f,
+ -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f,
+ -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f,
+ -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f,
+ -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f,
+ -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f,
+ -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f,
+ -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f,
+ -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f,
+ 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f,
+ -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f,
+ 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f,
+ -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f,
+ 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f,
+ 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f,
+ 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f,
+ -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f,
+ 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f,
+ 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f,
+ 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f,
+ -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f,
+ 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f,
+ 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f,
+ -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f,
+ 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f,
+ -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f,
+ -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f,
+ -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f,
+ -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f,
+ -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f,
+ 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f,
+ -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f,
+ 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f,
+ -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f,
+ -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f,
+ 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f,
+ -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f,
+ -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f,
+ 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f,
+ -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f,
+ -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f,
+ -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f,
+ 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer0[32] = {
+ 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f,
+ -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f,
+ -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f,
+ -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f,
+ -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f,
+ 0.109579f, -0.082685f,
+};
+
+static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
+ 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f,
+ 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f,
+ 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f,
+ -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f,
+ 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f,
+ 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f,
+ -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f,
+ 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f,
+ 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f,
+ 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f,
+ -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f,
+ 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f,
+ -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f,
+ -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f,
+ 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f,
+ -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f,
+ 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f,
+ 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f,
+ 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f,
+ 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f,
+ -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f,
+ -0.800926f, -0.134132f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.019518f,
+ 0.198546f,
+ 0.339015f,
+ -0.261961f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_32_layer0,
+ av1_4_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_32_layer0,
+ av1_4_partition_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = {
+ -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f,
+ -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f,
+ 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f,
+ -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f,
+ -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f,
+ 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f,
+ 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f,
+ 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f,
+ 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f,
+ -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f,
+ -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f,
+ 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f,
+ -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f,
+ 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f,
+ -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f,
+ -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f,
+ 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f,
+ -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f,
+ 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f,
+ -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f,
+ -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f,
+ -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f,
+ -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f,
+ -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f,
+ -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f,
+ -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f,
+ -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f,
+ 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f,
+ 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f,
+ -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f,
+ -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f,
+ 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f,
+ -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f,
+ 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f,
+ -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f,
+ 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f,
+ 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f,
+ -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f,
+ -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f,
+ 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f,
+ 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f,
+ 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f,
+ 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f,
+ -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f,
+ -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f,
+ 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f,
+ -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f,
+ 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f,
+ -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f,
+ 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f,
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f,
+ -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f,
+ 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f,
+ -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f,
+ -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f,
+ -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f,
+ -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f,
+ -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f,
+ 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f,
+ 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f,
+ -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f,
+ -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f,
+ -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f,
+ 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f,
+ 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f,
+ -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f,
+ -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f,
+ 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f,
+ 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f,
+ 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f,
+ -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f,
+ 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[24] = {
+ 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f,
+ -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f,
+ -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f,
+ -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = {
+ -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f,
+ 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f,
+ 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f,
+ -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f,
+ -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f,
+ 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f,
+ -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f,
+ 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f,
+ 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f,
+ -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f,
+ -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f,
+ -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f,
+ 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f,
+ -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f,
+ -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f,
+ -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.478735f,
+ 0.292948f,
+ 0.293172f,
+ 0.040013f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_64_layer0,
+ av1_4_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_64_layer0,
+ av1_4_partition_nn_bias_64_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 4
+static const float
+ av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = {
+ -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f,
+ -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f,
+ 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f,
+ -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f,
+ -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f,
+ -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f,
+ -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f,
+ -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f,
+ 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f,
+ 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f,
+ -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f,
+ -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f,
+ 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f,
+ -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f,
+ -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f,
+ -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f,
+ 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f,
+ -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f,
+ -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f,
+ -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f,
+ 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f,
+ -0.007193f, -0.257836f,
+ };
+
+static const float av1_partition_breakout_nn_bias_128_layer0[32] = {
+ 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f,
+ -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f,
+ 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f,
+ 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f,
+ -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f,
+ 0.429660f, -8.439470f,
+};
+
+static const float av1_partition_breakout_nn_weights_128_layer1[32] = {
+ -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f,
+ 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f,
+ 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f,
+ -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f,
+ -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f,
+ -0.039662f, 0.131499f,
+};
+
+static const float av1_partition_breakout_nn_bias_128_layer1[1] = {
+ 0.86678213f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_128_layer0,
+ av1_partition_breakout_nn_weights_128_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_128_layer0,
+ av1_partition_breakout_nn_bias_128_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+ 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f,
+ -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f,
+ 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f,
+ 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f,
+ -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f,
+ 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f,
+ 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f,
+ -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f,
+ 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f,
+ -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f,
+ -2.407131f, -0.062304f, 0.000874f, 0.108786f,
+ };
+
+static const float av1_partition_breakout_nn_bias_64_layer0[16] = {
+ 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f,
+ -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f,
+ -0.337413f, 4.492778f, 0.000000f, 17.043072f,
+};
+
+static const float av1_partition_breakout_nn_weights_64_layer1[16] = {
+ -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f,
+ 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f,
+ -0.038572f, 0.307899f, -0.294283f, 0.118323f,
+};
+
+static const float av1_partition_breakout_nn_bias_64_layer1[1] = {
+ -1.33438122f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_64_layer0,
+ av1_partition_breakout_nn_weights_64_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_64_layer0,
+ av1_partition_breakout_nn_bias_64_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = {
+ -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f,
+ 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f,
+ -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f,
+ -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f,
+ -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f,
+ 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f,
+ 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f,
+ -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f,
+ -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f,
+ -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f,
+ -0.520814f, -0.045386f, -0.443123f, -0.484209f,
+ };
+
+static const float av1_partition_breakout_nn_bias_32_layer0[16] = {
+ 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f,
+ 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f,
+ -0.423808f, 0.000000f, 6.352258f, -0.155787f,
+};
+
+static const float av1_partition_breakout_nn_weights_32_layer1[16] = {
+ 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f,
+ 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f,
+ -0.004171f, 0.157694f, 0.117845f, 0.272115f,
+};
+
+static const float av1_partition_breakout_nn_bias_32_layer1[1] = {
+ 0.09049262f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_32_layer0,
+ av1_partition_breakout_nn_weights_32_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_32_layer0,
+ av1_partition_breakout_nn_bias_32_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = {
+ 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f,
+ -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f,
+ -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f,
+ -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f,
+ -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f,
+ -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f,
+ -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f,
+ -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f,
+ -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f,
+ -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f,
+ -0.509287f, -0.048877f, -0.001512f, 0.077086f,
+ };
+
+static const float av1_partition_breakout_nn_bias_16_layer0[16] = {
+ 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f,
+ 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f,
+ 5.625762f, 0.615822f, 0.040057f, 16.668884f,
+};
+
+static const float av1_partition_breakout_nn_weights_16_layer1[16] = {
+ -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f,
+ 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f,
+ 0.269773f, -0.021105f, -0.146698f, 0.188764f,
+};
+
+static const float av1_partition_breakout_nn_bias_16_layer1[1] = {
+ 1.60751927f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_16_layer0,
+ av1_partition_breakout_nn_weights_16_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_16_layer0,
+ av1_partition_breakout_nn_bias_16_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = {
+ -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f,
+ 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f,
+ -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f,
+ -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f,
+ 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f,
+ -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f,
+ -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f,
+ -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f,
+ -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f,
+ -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f,
+ -0.596269f, 0.098494f, -0.005765f, 0.173652f,
+ };
+
+static const float av1_partition_breakout_nn_bias_8_layer0[16] = {
+ 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f,
+ 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f,
+ 2.336705f, -0.278834f, 0.231905f, 7.954366f,
+};
+
+static const float av1_partition_breakout_nn_weights_8_layer1[16] = {
+ -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f,
+ -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f,
+ 0.055858f, 0.230970f, -0.056466f, 0.119780f,
+};
+
+static const float av1_partition_breakout_nn_bias_8_layer1[1] = {
+ 1.27784479f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_8_layer0,
+ av1_partition_breakout_nn_weights_8_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_8_layer0,
+ av1_partition_breakout_nn_bias_8_layer1,
+ },
+};
+#undef FEATURE_SIZE
+
+#define FEATURE_SIZE 9 // Input layer size
+#define NUM_NODES 32 // Hidden layer size
+#define LABEL_SIZE 3 // Output layer size
+
+static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f,
+ -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f,
+ 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f,
+ -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f,
+ 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f,
+ 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f,
+ 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f,
+ -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f,
+ 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f,
+ 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f,
+ -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f,
+ -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f,
+ 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f,
+ 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f,
+ -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f,
+ 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f,
+ -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f,
+ 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f,
+ 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f,
+ -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f,
+ -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f,
+ -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f,
+ 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f,
+ 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f,
+ -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f,
+ 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f,
+ -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f,
+ -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f,
+ -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f,
+ 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f,
+ -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f,
+ -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f,
+ -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f,
+ 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f,
+ 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f,
+ -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f,
+ 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f,
+ 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f,
+ 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f,
+ 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f,
+ -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f,
+ -1.08228f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = {
+ 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f,
+ -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f,
+ 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f,
+ -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f,
+ -0.22638f, 1.40940f, -0.09309f, 0.05828f,
+};
+
+static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f,
+ -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f,
+ -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f,
+ -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f,
+ -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f,
+ 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f,
+ -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f,
+ 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f,
+ 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f,
+ -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f,
+ -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f,
+ 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f,
+ -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f,
+ -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = {
+ 1.70665f,
+ -0.77954f,
+ -0.92709f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_8_layer0,
+ av1_rect_partition_nn_weights_8_layer1 },
+ { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f,
+ -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f,
+ 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f,
+ -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f,
+ 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f,
+ -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f,
+ 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f,
+ 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f,
+ 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f,
+ -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f,
+ 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f,
+ 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f,
+ 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f,
+ 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f,
+ 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f,
+ -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f,
+ -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f,
+ 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f,
+ -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f,
+ -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f,
+ -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f,
+ 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f,
+ 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f,
+ -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f,
+ -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f,
+ -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f,
+ 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f,
+ 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f,
+ -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f,
+ -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f,
+ -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f,
+ -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f,
+ -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f,
+ 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f,
+ 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f,
+ 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f,
+ -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f,
+ -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f,
+ 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f,
+ -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f,
+ -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f,
+ -0.05573f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = {
+ -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f,
+ 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f,
+ 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f,
+ -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f,
+ -0.12044f, 1.65478f, -0.75153f, 1.18441f,
+};
+
+static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f,
+ 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f,
+ 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f,
+ 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f,
+ -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f,
+ 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f,
+ 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f,
+ 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f,
+ 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f,
+ -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f,
+ -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f,
+ -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f,
+ 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f,
+ -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer1[3] = {
+ 2.68750f,
+ -1.31894f,
+ -1.36768f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_16_layer0,
+ av1_rect_partition_nn_weights_16_layer1 },
+ { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f,
+ -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f,
+ -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f,
+ -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f,
+ -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f,
+ -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f,
+ -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f,
+ -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f,
+ -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f,
+ 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f,
+ -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f,
+ -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f,
+ 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f,
+ 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f,
+ -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f,
+ -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f,
+ 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f,
+ 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f,
+ 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f,
+ 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f,
+ 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f,
+ -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f,
+ 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f,
+ 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f,
+ -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f,
+ -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f,
+ -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f,
+ -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f,
+ -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f,
+ -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f,
+ 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f,
+ -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f,
+ -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f,
+ 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f,
+ 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f,
+ -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f,
+ 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f,
+ 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f,
+ 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f,
+ -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f,
+ -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f,
+ 0.33984f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = {
+ -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f,
+ 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f,
+ 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f,
+ -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f,
+ -0.27602f, -1.98063f, 0.20816f, -0.01315f,
+};
+
+static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f,
+ -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f,
+ 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f,
+ -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f,
+ 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f,
+ 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f,
+ 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f,
+ 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f,
+ 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f,
+ -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f,
+ 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f,
+ -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f,
+ -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f,
+ -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer1[3] = {
+ 2.47332f,
+ -1.65756f,
+ -0.81573f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_32_layer0,
+ av1_rect_partition_nn_weights_32_layer1 },
+ { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f,
+ 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f,
+ 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f,
+ 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f,
+ 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f,
+ 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f,
+ 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f,
+ -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f,
+ 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f,
+ 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f,
+ -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f,
+ -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f,
+ -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f,
+ -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f,
+ 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f,
+ 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f,
+ 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f,
+ -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f,
+ -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f,
+ -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f,
+ -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f,
+ -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f,
+ 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f,
+ 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f,
+ 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f,
+ -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f,
+ -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f,
+ 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f,
+ 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f,
+ 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f,
+ -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f,
+ -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f,
+ -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f,
+ 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f,
+ -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f,
+ -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f,
+ 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f,
+ -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f,
+ -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f,
+ 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f,
+ -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f,
+ 0.09101f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = {
+ 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f,
+ -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f,
+ -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f,
+ -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f,
+ 0.59835f, -0.31269f, -0.30585f, -1.66212f,
+};
+
+static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f,
+ -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f,
+ 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f,
+ 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f,
+ 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f,
+ -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f,
+ -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f,
+ 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f,
+ -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f,
+ 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f,
+ -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f,
+ -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f,
+ -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f,
+ 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer1[3] = {
+ 0.32215f,
+ -0.57522f,
+ 0.25314f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_64_layer0,
+ av1_rect_partition_nn_weights_64_layer1 },
+ { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f,
+ 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f,
+ 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f,
+ 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f,
+ -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f,
+ 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f,
+ 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f,
+ 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f,
+ 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f,
+ 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f,
+ -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f,
+ 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f,
+ -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f,
+ -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f,
+ 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f,
+ -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f,
+ -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f,
+ -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f,
+ -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f,
+ -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f,
+ -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f,
+ -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f,
+ -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f,
+ 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f,
+ 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f,
+ -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f,
+ -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f,
+ 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f,
+ 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f,
+ 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f,
+ -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f,
+ 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f,
+ -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f,
+ 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f,
+ -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f,
+ -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f,
+ 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f,
+ -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f,
+ -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f,
+ -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f,
+ 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f,
+ 2.02519f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = {
+ 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f,
+ 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f,
+ -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f,
+ -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f,
+ 0.66120f, 0.61119f, -1.42293f, 0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+ 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+ -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+ 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+ 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+ 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+ 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+ 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+ -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+ -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+ 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+ 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+ 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+ 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[3] = {
+ 1.09014f,
+ -0.53317f,
+ -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_128_layer0,
+ av1_rect_partition_nn_weights_128_layer1 },
+ { av1_rect_partition_nn_bias_128_layer0,
+ av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
new file mode 100644
index 000000000..6d154a7d2
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/cdef.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+
+#define REDUCED_PRI_STRENGTHS 8
+#define REDUCED_TOTAL_STRENGTHS (REDUCED_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 5, 7, 10, 13 };
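+/* In fast mode only these 8 primary strengths are searched; priconv maps the
+ reduced primary-strength index back to the actual primary strength, both
+ while filtering and when the final strengths are stored in cm. */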
+
+/* Search for the best strength to add as an option, knowing we
+ already selected nb_strengths options. */
+static uint64_t search_one(int *lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS], int sb_count,
+ int fast) {
+ uint64_t tot_mse[TOTAL_STRENGTHS];
+ const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ int best_id = 0;
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ if (mse[i][lev[gi]] < best_mse) {
+ best_mse = mse[i][lev[gi]];
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < total_strengths; j++) {
+ uint64_t best = best_mse;
+ if (mse[i][j] < best) best = mse[i][j];
+ tot_mse[j] += best;
+ }
+ }
+ for (j = 0; j < total_strengths; j++) {
+ if (tot_mse[j] < best_tot_mse) {
+ best_tot_mse = tot_mse[j];
+ best_id = j;
+ }
+ }
+ lev[nb_strengths] = best_id;
+ return best_tot_mse;
+}
+
+/* Search for the best luma+chroma strength to add as an option, knowing we
+ already selected nb_strengths options. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
+ int fast) {
+ uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+ int i, j;
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ int best_id0 = 0;
+ int best_id1 = 0;
+ const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+ memset(tot_mse, 0, sizeof(tot_mse));
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ /* Find best mse among already selected options. */
+ for (gi = 0; gi < nb_strengths; gi++) {
+ uint64_t curr = mse[0][i][lev0[gi]];
+ curr += mse[1][i][lev1[gi]];
+ if (curr < best_mse) {
+ best_mse = curr;
+ }
+ }
+ /* Find best mse when adding each possible new option. */
+ for (j = 0; j < total_strengths; j++) {
+ int k;
+ for (k = 0; k < total_strengths; k++) {
+ uint64_t best = best_mse;
+ uint64_t curr = mse[0][i][j];
+ curr += mse[1][i][k];
+ if (curr < best) best = curr;
+ tot_mse[j][k] += best;
+ }
+ }
+ }
+ for (j = 0; j < total_strengths; j++) {
+ int k;
+ for (k = 0; k < total_strengths; k++) {
+ if (tot_mse[j][k] < best_tot_mse) {
+ best_tot_mse = tot_mse[j][k];
+ best_id0 = j;
+ best_id1 = k;
+ }
+ }
+ }
+ lev0[nb_strengths] = best_id0;
+ lev1[nb_strengths] = best_id1;
+ return best_tot_mse;
+}
+
+/* Search for the set of strengths that minimizes mse. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+ uint64_t mse[][TOTAL_STRENGTHS],
+ int sb_count, int fast) {
+ uint64_t best_tot_mse;
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse = search_one(best_lev, i, mse, sb_count, fast);
+ }
+ /* Trying to refine the greedy search by reconsidering each
+ already-selected option. */
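+ /* Each pass drops the oldest selected strength (shift left by one) and
+ re-runs the single-option search on the freed slot. */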
+ if (!fast) {
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
+ best_tot_mse =
+ search_one(best_lev, nb_strengths - 1, mse, sb_count, fast);
+ }
+ }
+ return best_tot_mse;
+}
+
+/* Search for the set of luma+chroma strengths that minimizes mse. */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+ int nb_strengths,
+ uint64_t (**mse)[TOTAL_STRENGTHS],
+ int sb_count, int fast) {
+ uint64_t best_tot_mse;
+ int i;
+ best_tot_mse = (uint64_t)1 << 63;
+ /* Greedy search: add one strength option at a time. */
+ for (i = 0; i < nb_strengths; i++) {
+ best_tot_mse =
+ search_one_dual(best_lev0, best_lev1, i, mse, sb_count, fast);
+ }
+ /* Trying to refine the greedy search by reconsidering each
+ already-selected option. */
+ for (i = 0; i < 4 * nb_strengths; i++) {
+ int j;
+ for (j = 0; j < nb_strengths - 1; j++) {
+ best_lev0[j] = best_lev0[j + 1];
+ best_lev1[j] = best_lev1[j + 1];
+ }
+ best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse,
+ sb_count, fast);
+ }
+ return best_tot_mse;
+}
+
+/* FIXME: SSE-optimize this. */
+static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src,
+ int src_voffset, int src_hoffset, int sstride,
+ int vsize, int hsize) {
+ int r, c;
+ const uint16_t *base = &src[src_voffset * sstride + src_hoffset];
+ for (r = 0; r < vsize; r++) {
+ for (c = 0; c < hsize; c++) {
+ dst[r * dstride + c] = base[r * sstride + c];
+ }
+ }
+}
+
+static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride, int coeff_shift) {
+ uint64_t svar = 0;
+ uint64_t dvar = 0;
+ uint64_t sum_s = 0;
+ uint64_t sum_d = 0;
+ uint64_t sum_s2 = 0;
+ uint64_t sum_d2 = 0;
+ uint64_t sum_sd = 0;
+ int i, j;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ sum_s += src[i * sstride + j];
+ sum_d += dst[i * dstride + j];
+ sum_s2 += src[i * sstride + j] * src[i * sstride + j];
+ sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
+ sum_sd += src[i * sstride + j] * dst[i * dstride + j];
+ }
+ }
+ /* Compute the variance -- the calculation cannot go negative. */
+ svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
+ dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
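+ /* sum_d2 + sum_s2 - 2 * sum_sd is the plain 8x8 SSE; it is scaled by an
+ activity factor derived from the source and reconstruction variances
+ instead of being returned directly. */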
+ return (uint64_t)floor(
+ .5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
+ (svar + dvar + (400 << 2 * coeff_shift)) /
+ (sqrt((20000 << 4 * coeff_shift) + svar * (double)dvar)));
+}
+
+static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride) {
+ uint64_t sum = 0;
+ int i, j;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int e = dst[i * dstride + j] - src[i * sstride + j];
+ sum += e * e;
+ }
+ }
+ return sum;
+}
+
+static INLINE uint64_t mse_4x4_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride) {
+ uint64_t sum = 0;
+ int i, j;
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ int e = dst[i * dstride + j] - src[i * sstride + j];
+ sum += e * e;
+ }
+ }
+ return sum;
+}
+
+/* Compute MSE only on the blocks we filtered. */
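+/* Only 8x8 luma blocks use the activity-weighted dist_8x8_16bit() metric;
+ all other cases fall back to plain MSE. */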
+uint64_t compute_cdef_dist(uint16_t *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count, BLOCK_SIZE bsize,
+ int coeff_shift, int pli) {
+ uint64_t sum = 0;
+ int bi, bx, by;
+ if (bsize == BLOCK_8X8) {
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ if (pli == 0) {
+ sum += dist_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+ &src[bi << (3 + 3)], 8, coeff_shift);
+ } else {
+ sum += mse_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
+ &src[bi << (3 + 3)], 8);
+ }
+ }
+ } else if (bsize == BLOCK_4X8) {
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += mse_4x4_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
+ &src[bi << (3 + 2)], 4);
+ sum += mse_4x4_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)], dstride,
+ &src[(bi << (3 + 2)) + 4 * 4], 4);
+ }
+ } else if (bsize == BLOCK_8X4) {
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
+ &src[bi << (2 + 3)], 8);
+ sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride,
+ &src[(bi << (2 + 3)) + 4], 8);
+ }
+ } else {
+ assert(bsize == BLOCK_4X4);
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
+ &src[bi << (2 + 2)], 4);
+ }
+ }
+ return sum >> 2 * coeff_shift;
+}
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd, int fast) {
+ int r, c;
+ int fbr, fbc;
+ uint16_t *src[3];
+ uint16_t *ref_coeff[3];
+ static cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
+ int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+ int stride[3];
+ int bsize[3];
+ int mi_wide_l2[3];
+ int mi_high_l2[3];
+ int xdec[3];
+ int ydec[3];
+ int pli;
+ int cdef_count;
+ int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+ uint64_t best_tot_mse = (uint64_t)1 << 63;
+ uint64_t tot_mse;
+ int sb_count;
+ int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+ int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
+ int *selected_strength = aom_malloc(nvfb * nhfb * sizeof(*selected_strength));
+ uint64_t(*mse[2])[TOTAL_STRENGTHS];
+ int pri_damping = 3 + (cm->base_qindex >> 6);
+ int sec_damping = 3 + (cm->base_qindex >> 6);
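+ /* Damping grows with the quantizer: 3 + (base_qindex >> 6) ranges from 3
+ (base_qindex < 64) to 6 (base_qindex >= 192). */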
+ int i;
+ int nb_strengths;
+ int nb_strength_bits;
+ int quantizer;
+ double lambda;
+ const int num_planes = av1_num_planes(cm);
+ const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+ DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
+ uint16_t *in;
+ DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+ quantizer = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth) >>
+ (cm->seq_params.bit_depth - 8);
+ lambda = .12 * quantizer * quantizer / 256.;
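+ /* lambda converts signalling bits into the same units as the accumulated
+ MSE so rate and distortion can be summed directly in the searches below. */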
+
+ av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+ num_planes);
+ mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
+ mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
+ for (pli = 0; pli < num_planes; pli++) {
+ uint8_t *ref_buffer;
+ int ref_stride;
+ switch (pli) {
+ case 0:
+ ref_buffer = ref->y_buffer;
+ ref_stride = ref->y_stride;
+ break;
+ case 1:
+ ref_buffer = ref->u_buffer;
+ ref_stride = ref->uv_stride;
+ break;
+ case 2:
+ ref_buffer = ref->v_buffer;
+ ref_stride = ref->uv_stride;
+ break;
+ }
+ src[pli] = aom_memalign(
+ 32, sizeof(*src) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
+ ref_coeff[pli] = aom_memalign(
+ 32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
+ xdec[pli] = xd->plane[pli].subsampling_x;
+ ydec[pli] = xd->plane[pli].subsampling_y;
+ bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+ : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
+ stride[pli] = cm->mi_cols << MI_SIZE_LOG2;
+ mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+ mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+
+ const int frame_height =
+ (cm->mi_rows * MI_SIZE) >> xd->plane[pli].subsampling_y;
+ const int frame_width =
+ (cm->mi_cols * MI_SIZE) >> xd->plane[pli].subsampling_x;
+
+ for (r = 0; r < frame_height; ++r) {
+ for (c = 0; c < frame_width; ++c) {
+ if (cm->seq_params.use_highbitdepth) {
+ src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
+ xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
+ ref_coeff[pli][r * stride[pli] + c] =
+ CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
+ } else {
+ src[pli][r * stride[pli] + c] =
+ xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
+ ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
+ }
+ }
+ }
+ }
+ in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
+ sb_count = 0;
+ for (fbr = 0; fbr < nvfb; ++fbr) {
+ for (fbc = 0; fbc < nhfb; ++fbc) {
+ int nvb, nhb;
+ int gi;
+ int dirinit = 0;
+ nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
+ nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
+ int hb_step = 1;
+ int vb_step = 1;
+ BLOCK_SIZE bs = BLOCK_64X64;
+ MB_MODE_INFO *const mbmi =
+ cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+ MI_SIZE_64X64 * fbc];
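+ /* CDEF strengths are signalled per superblock. For 128-wide or 128-tall
+ superblocks, skip the right / bottom 64x64 halves here; they are covered
+ by extending nhb / nvb when the top-left 64x64 unit is processed. */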
+ if (((fbc & 1) &&
+ (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) ||
+ ((fbr & 1) &&
+ (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128)))
+ continue;
+ if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 ||
+ mbmi->sb_type == BLOCK_64X128)
+ bs = mbmi->sb_type;
+ if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+ nhb = AOMMIN(MI_SIZE_128X128, cm->mi_cols - MI_SIZE_64X64 * fbc);
+ hb_step = 2;
+ }
+ if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+ nvb = AOMMIN(MI_SIZE_128X128, cm->mi_rows - MI_SIZE_64X64 * fbr);
+ vb_step = 2;
+ }
+ // No filtering if the entire filter block is skipped
+ if (sb_all_skip(cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) continue;
+ cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
+ fbc * MI_SIZE_64X64, dlist, bs);
+ for (pli = 0; pli < num_planes; pli++) {
+ for (i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
+ for (gi = 0; gi < total_strengths; gi++) {
+ int threshold;
+ uint64_t curr_mse;
+ int sec_strength;
+ threshold = gi / CDEF_SEC_STRENGTHS;
+ if (fast) threshold = priconv[threshold];
+ /* We avoid filtering the pixels for which some of the pixels to average
+ are outside the frame. We could change the filter instead, but it would
+ add special cases for any future vectorization. */
+ int yoff = CDEF_VBORDER * (fbr != 0);
+ int xoff = CDEF_HBORDER * (fbc != 0);
+ int ysize = (nvb << mi_high_l2[pli]) +
+ CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
+ int xsize = (nhb << mi_wide_l2[pli]) +
+ CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
+ sec_strength = gi % CDEF_SEC_STRENGTHS;
+ copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+ src[pli],
+ (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
+ (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
+ stride[pli], ysize, xsize);
+ cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
+ dir, &dirinit, var, pli, dlist, cdef_count, threshold,
+ sec_strength + (sec_strength == 3), pri_damping,
+ sec_damping, coeff_shift);
+ curr_mse = compute_cdef_dist(
+ ref_coeff[pli] +
+ (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
+ (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
+ stride[pli], tmp_dst, dlist, cdef_count, bsize[pli], coeff_shift,
+ pli);
+ if (pli < 2)
+ mse[pli][sb_count][gi] = curr_mse;
+ else
+ mse[1][sb_count][gi] += curr_mse;
+ sb_index[sb_count] =
+ MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc;
+ }
+ }
+ sb_count++;
+ }
+ }
+ nb_strength_bits = 0;
+ /* Search over different numbers of signalling bits. */
+ for (i = 0; i <= 3; i++) {
+ int j;
+ int best_lev0[CDEF_MAX_STRENGTHS];
+ int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
+ nb_strengths = 1 << i;
+ if (num_planes >= 3)
+ tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
+ mse, sb_count, fast);
+ else
+ tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count,
+ fast);
+ /* Count superblock signalling cost. */
+ tot_mse += (uint64_t)(sb_count * lambda * i);
+ /* Count header signalling cost. */
+ tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
+ if (tot_mse < best_tot_mse) {
+ best_tot_mse = tot_mse;
+ nb_strength_bits = i;
+ for (j = 0; j < 1 << nb_strength_bits; j++) {
+ cm->cdef_strengths[j] = best_lev0[j];
+ cm->cdef_uv_strengths[j] = best_lev1[j];
+ }
+ }
+ }
+ nb_strengths = 1 << nb_strength_bits;
+
+ cm->cdef_bits = nb_strength_bits;
+ cm->nb_cdef_strengths = nb_strengths;
+ for (i = 0; i < sb_count; i++) {
+ int gi;
+ int best_gi;
+ uint64_t best_mse = (uint64_t)1 << 63;
+ best_gi = 0;
+ for (gi = 0; gi < cm->nb_cdef_strengths; gi++) {
+ uint64_t curr = mse[0][i][cm->cdef_strengths[gi]];
+ if (num_planes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]];
+ if (curr < best_mse) {
+ best_gi = gi;
+ best_mse = curr;
+ }
+ }
+ selected_strength[i] = best_gi;
+ cm->mi_grid_visible[sb_index[i]]->cdef_strength = best_gi;
+ }
+
+ if (fast) {
+ for (int j = 0; j < nb_strengths; j++) {
+ cm->cdef_strengths[j] =
+ priconv[cm->cdef_strengths[j] / CDEF_SEC_STRENGTHS] *
+ CDEF_SEC_STRENGTHS +
+ (cm->cdef_strengths[j] % CDEF_SEC_STRENGTHS);
+ cm->cdef_uv_strengths[j] =
+ priconv[cm->cdef_uv_strengths[j] / CDEF_SEC_STRENGTHS] *
+ CDEF_SEC_STRENGTHS +
+ (cm->cdef_uv_strengths[j] % CDEF_SEC_STRENGTHS);
+ }
+ }
+ cm->cdef_pri_damping = pri_damping;
+ cm->cdef_sec_damping = sec_damping;
+ aom_free(mse[0]);
+ aom_free(mse[1]);
+ for (pli = 0; pli < num_planes; pli++) {
+ aom_free(src[pli]);
+ aom_free(ref_coeff[pli]);
+ }
+ aom_free(sb_index);
+ aom_free(selected_strength);
+}
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
new file mode 100644
index 000000000..c5508e25c
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
+ YV12_BUFFER_CONFIG *dst_bc, int plane) {
+ switch (plane) {
+ case 0: aom_yv12_copy_y(src_bc, dst_bc); break;
+ case 1: aom_yv12_copy_u(src_bc, dst_bc); break;
+ case 2: aom_yv12_copy_v(src_bc, dst_bc); break;
+ default: assert(plane >= 0 && plane <= 2); break;
+ }
+}
+
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+ if (cpi->oxcf.pass == 2) {
+ return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+ : MAX_LOOP_FILTER;
+ } else {
+ return MAX_LOOP_FILTER;
+ }
+}
+
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+ AV1_COMP *const cpi, int filt_level,
+ int partial_frame, int plane, int dir) {
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+
+ assert(plane >= 0 && plane <= 2);
+ int filter_level[2] = { filt_level, filt_level };
+ if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1];
+ if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0];
+
+ // set base filters for use of get_filter_level when in DELTA_Q_LF mode
+ switch (plane) {
+ case 0:
+ cm->lf.filter_level[0] = filter_level[0];
+ cm->lf.filter_level[1] = filter_level[1];
+ break;
+ case 1: cm->lf.filter_level_u = filter_level[0]; break;
+ case 2: cm->lf.filter_level_v = filter_level[0]; break;
+ }
+
+ // TODO(any): Enable multi-threading and remove this flag once the loop
+ // filter mask is compatible with multi-threading.
+#if LOOP_FILTER_BITMASK
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, 0, plane,
+ plane + 1, partial_frame);
+#else
+ if (cpi->num_workers > 1)
+ av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+ plane + 1, partial_frame, cpi->workers,
+ cpi->num_workers, &cpi->lf_row_sync);
+ else
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+ plane + 1, partial_frame);
+#endif
+
+ filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane,
+ cm->seq_params.use_highbitdepth);
+
+ // Re-instate the unfiltered frame
+ yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane);
+
+ return filt_err;
+}
+
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame,
+ const int *last_frame_filter_level,
+ double *best_cost_ret, int plane, int dir) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ int filt_direction = 0;
+ int64_t best_err;
+ int filt_best;
+ MACROBLOCK *x = &cpi->td.mb;
+
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
+ int lvl;
+ switch (plane) {
+ case 0: lvl = last_frame_filter_level[dir]; break;
+ case 1: lvl = last_frame_filter_level[2]; break;
+ case 2: lvl = last_frame_filter_level[3]; break;
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+ int filt_mid = clamp(lvl, min_filter_level, max_filter_level);
+ int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+ // Sum squared error at each filter level
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
+
+ // Set each entry to -1
+ memset(ss_err, 0xFF, sizeof(ss_err));
+ yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane);
+ best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
+ filt_best = filt_mid;
+ ss_err[filt_mid] = best_err;
+
+ while (filter_step > 0) {
+ const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level);
+ const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level);
+
+ // Bias against raising loop filter in favor of lowering it.
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
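+ // The bias scales with the current best error and the step size, and the
+ // shift shrinks as filt_mid grows, so higher filter levels need a larger
+ // gain before the search moves further up.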
+
+ if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+ bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+ // Bias less for large block sizes.
+ if (cm->tx_mode != ONLY_4X4) bias >>= 1;
+
+ if (filt_direction <= 0 && filt_low != filt_mid) {
+ // Get the low filter error score.
+ if (ss_err[filt_low] < 0) {
+ ss_err[filt_low] =
+ try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir);
+ }
+ // If the value is close to the best so far, then bias towards a lower loop
+ // filter value.
+ if (ss_err[filt_low] < (best_err + bias)) {
+ // Was it actually better than the previous best?
+ if (ss_err[filt_low] < best_err) {
+ best_err = ss_err[filt_low];
+ }
+ filt_best = filt_low;
+ }
+ }
+
+ // Now look at filt_high
+ if (filt_direction >= 0 && filt_high != filt_mid) {
+ if (ss_err[filt_high] < 0) {
+ ss_err[filt_high] =
+ try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir);
+ }
+ // Only accept the higher filter value if it beats the previous best by
+ // more than the bias against raising the filter value.
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
+ filt_best = filt_high;
+ }
+ }
+
+ // Halve the step distance if the best filter value was the same as last time.
+ if (filt_best == filt_mid) {
+ filter_step /= 2;
+ filt_direction = 0;
+ } else {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ // Update best error
+ best_err = ss_err[filt_best];
+
+ if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
+ return filt_best;
+}
+
+void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ LPF_PICK_METHOD method) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ struct loopfilter *const lf = &cm->lf;
+ (void)sd;
+
+ lf->sharpness_level = 0;
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
+
+ if (method == LPF_PICK_MINIMAL_LPF) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ } else if (method >= LPF_PICK_FROM_Q) {
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth);
+ // These values were determined by linear fitting the result of the
+ // searched level for 8 bit depth:
+ // Keyframes: filt_guess = q * 0.06699 - 1.60817
+ // Other frames: filt_guess = q * 0.02295 + 2.48225
+ //
+ // And high bit depth separately:
+ // filt_guess = q * 0.316206 + 3.87252
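+ // The integer constants below are those fits in fixed point; for 8 bit,
+ // 17563 / 2^18 ~ 0.06699, 421574 / 2^18 ~ 1.608, 6017 / 2^18 ~ 0.02295 and
+ // 650707 / 2^18 ~ 2.482.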
+ int filt_guess;
+ switch (cm->seq_params.bit_depth) {
+ case AOM_BITS_8:
+ filt_guess = (cm->frame_type == KEY_FRAME)
+ ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
+ : ROUND_POWER_OF_TWO(q * 6017 + 650707, 18);
+ break;
+ case AOM_BITS_10:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+ break;
+ case AOM_BITS_12:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+ break;
+ default:
+ assert(0 &&
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
+ "or AOM_BITS_12");
+ return;
+ }
+ if (cm->seq_params.bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME)
+ filt_guess -= 4;
+ // TODO(chengchen): retrain the model for Y, U, V filter levels
+ lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
+ lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
+ } else {
+ const int last_frame_filter_level[4] = { lf->filter_level[0],
+ lf->filter_level[1],
+ lf->filter_level_u,
+ lf->filter_level_v };
+
+ lf->filter_level[0] = lf->filter_level[1] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, NULL, 0, 2);
+ lf->filter_level[0] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, NULL, 0, 0);
+ lf->filter_level[1] =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, NULL, 0, 1);
+
+ if (num_planes > 1) {
+ lf->filter_level_u =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, NULL, 1, 0);
+ lf->filter_level_v =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ last_frame_filter_level, NULL, 2, 0);
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
new file mode 100644
index 000000000..357097ae1
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PICKLPF_H_
+#define AOM_AV1_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+int av1_get_max_filter_level(const AV1_COMP *cpi);
+void av1_pick_filter_level(const struct yv12_buffer_config *sd,
+ struct AV1_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
new file mode 100644
index 000000000..e7804f6b4
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -0,0 +1,1362 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/mathutils.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+
+// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed.
+// When set to RESTORE_TYPES we allow switchable.
+static const RestorationType force_restore_type = RESTORE_TYPES;
+
+// Number of Wiener iterations
+#define NUM_WIENER_ITERS 5
+
+// Penalty factor for use of dual sgr
+#define DUAL_SGR_PENALTY_MULT 0.01
+
+const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 };
+
+typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ int hstart, int width, int vstart,
+ int height);
+
+#define NUM_EXTRACTORS (3 * (1 + 1))
+
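+// Indexed as 3 * highbd + plane; see sse_restoration_unit() below.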
+static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {
+ aom_get_y_sse_part, aom_get_u_sse_part,
+ aom_get_v_sse_part, aom_highbd_get_y_sse_part,
+ aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part,
+};
+
+static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
+ const YV12_BUFFER_CONFIG *src,
+ const YV12_BUFFER_CONFIG *dst, int plane,
+ int highbd) {
+ return sse_part_extractors[3 * highbd + plane](
+ src, dst, limits->h_start, limits->h_end - limits->h_start,
+ limits->v_start, limits->v_end - limits->v_start);
+}
+
+typedef struct {
+ // The best coefficients for Wiener or Sgrproj restoration
+ WienerInfo wiener;
+ SgrprojInfo sgrproj;
+
+ // The sum of squared errors for this rtype.
+ int64_t sse[RESTORE_SWITCHABLE_TYPES];
+
+ // The rtype to use for this unit given a frame rtype as
+ // index. Indices: WIENER, SGRPROJ, SWITCHABLE.
+ RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+typedef struct {
+ const YV12_BUFFER_CONFIG *src;
+ YV12_BUFFER_CONFIG *dst;
+
+ const AV1_COMMON *cm;
+ const MACROBLOCK *x;
+ int plane;
+ int plane_width;
+ int plane_height;
+ RestUnitSearchInfo *rusi;
+
+ // Speed features
+ const SPEED_FEATURES *sf;
+
+ uint8_t *dgd_buffer;
+ int dgd_stride;
+ const uint8_t *src_buffer;
+ int src_stride;
+
+ // sse and bits are initialised by reset_rsc in search_rest_type
+ int64_t sse;
+ int64_t bits;
+ int tile_y0, tile_stripe0;
+
+ // sgrproj and wiener are initialised by rsc_on_tile when starting the first
+ // tile in the frame.
+ SgrprojInfo sgrproj;
+ WienerInfo wiener;
+ AV1PixelRect tile_rect;
+} RestSearchCtxt;
+
+static void rsc_on_tile(int tile_row, int tile_col, void *priv) {
+ (void)tile_col;
+
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ set_default_sgrproj(&rsc->sgrproj);
+ set_default_wiener(&rsc->wiener);
+
+ rsc->tile_stripe0 =
+ (tile_row == 0) ? 0 : rsc->cm->rst_end_stripe[tile_row - 1];
+}
+
+static void reset_rsc(RestSearchCtxt *rsc) {
+ rsc->sse = 0;
+ rsc->bits = 0;
+}
+
+static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm,
+ const MACROBLOCK *x, const SPEED_FEATURES *sf, int plane,
+ RestUnitSearchInfo *rusi, YV12_BUFFER_CONFIG *dst,
+ RestSearchCtxt *rsc) {
+ rsc->src = src;
+ rsc->dst = dst;
+ rsc->cm = cm;
+ rsc->x = x;
+ rsc->plane = plane;
+ rsc->rusi = rusi;
+ rsc->sf = sf;
+
+ const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ const int is_uv = plane != AOM_PLANE_Y;
+ rsc->plane_width = src->crop_widths[is_uv];
+ rsc->plane_height = src->crop_heights[is_uv];
+ rsc->src_buffer = src->buffers[plane];
+ rsc->src_stride = src->strides[is_uv];
+ rsc->dgd_buffer = dgd->buffers[plane];
+ rsc->dgd_stride = dgd->strides[is_uv];
+ rsc->tile_rect = av1_whole_frame_rect(cm, is_uv);
+ assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]);
+ assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]);
+}
+
+static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect,
+ const RestorationUnitInfo *rui) {
+ const AV1_COMMON *const cm = rsc->cm;
+ const int plane = rsc->plane;
+ const int is_uv = plane > 0;
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ RestorationLineBuffers rlbs;
+ const int bit_depth = cm->seq_params.bit_depth;
+ const int highbd = cm->seq_params.use_highbitdepth;
+
+ const YV12_BUFFER_CONFIG *fts = cm->frame_to_show;
+  // TODO(yunqing): For now, the optimized LR filter is only used in the
+  // decoder. It could also be used in the encoder.
+ const int optimized_lr = 0;
+
+ av1_loop_restoration_filter_unit(
+ limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
+ is_uv && cm->seq_params.subsampling_x,
+ is_uv && cm->seq_params.subsampling_y, highbd, bit_depth,
+ fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
+ rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
+
+ return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
+}
+
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int32_t *flt0,
+ int flt0_stride, int32_t *flt1,
+ int flt1_stride, int xq[2],
+ const sgr_params_type *params) {
+ int i, j;
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ }
+ } else if (params->r[0] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[0] * (flt0[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ }
+ } else if (params->r[1] > 0) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15));
+ const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS);
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ v += xq[1] * (flt1[j] - u);
+ const int32_t e =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ }
+ } else {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t e = (int32_t)(dat[j]) - src[j];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+
+ return err;
+}
+
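+// A sketch of the arithmetic implemented above and below, for reference: with
+// u = dat << SGRPROJ_RST_BITS, the restored sample (in SGRPROJ_RST_BITS
+// precision) is
+//   r = u + (xq[0]*(flt0 - u) + xq[1]*(flt1 - u)) / (1 << SGRPROJ_PRJ_BITS)
+// and the accumulated error is the sum over pixels of
+//   (ROUND(r >> SGRPROJ_RST_BITS) - src)^2.
+// The separate branches simply drop the term whose radius params->r[i] is 0.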
+static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int use_highbitdepth,
+ int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int *xqd,
+ const sgr_params_type *params) {
+ int i, j;
+ int64_t err = 0;
+ int xq[2];
+ decode_xq(xqd, xq, params);
+ if (!use_highbitdepth) {
+ err = av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, flt0, flt0_stride, flt1,
+ flt1_stride, xq, params);
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ int xq0 = xq[0];
+ int xq1 = xq[1];
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+ int32_t v0 = flt0[j] - u;
+ int32_t v1 = flt1[j] - u;
+ int32_t v = half;
+ v += xq0 * v0;
+ v += xq1 * v1;
+ const int32_t e =
+ (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+ err += e * e;
+ }
+ dat += dat_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ src += src_stride;
+ }
+ } else if (params->r[0] > 0 || params->r[1] > 0) {
+ int exq;
+ int32_t *flt;
+ int flt_stride;
+ if (params->r[0] > 0) {
+ exq = xq[0];
+ flt = flt0;
+ flt_stride = flt0_stride;
+ } else {
+ exq = xq[1];
+ flt = flt1;
+ flt_stride = flt1_stride;
+ }
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+ int32_t v = half;
+ v += exq * (flt[j] - u);
+ const int32_t e =
+ (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+ err += e * e;
+ }
+ dat += dat_stride;
+ flt += flt_stride;
+ src += src_stride;
+ }
+ } else {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int32_t d = dat[j];
+ const int32_t s = src[j];
+ const int32_t e = d - s;
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ }
+ }
+ return err;
+}
+
+#define USE_SGRPROJ_REFINEMENT_SEARCH 1
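+// Greedy coordinate refinement of the quantized projection weights: the step
+// size is halved from start_step down to 1, and at each step a tap is nudged
+// down or up whenever doing so lowers the pixel-domain error.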
+static int64_t finer_search_pixel_proj_error(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0,
+ int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd,
+ const sgr_params_type *params) {
+ int64_t err = get_pixel_proj_error(
+ src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ (void)start_step;
+#if USE_SGRPROJ_REFINEMENT_SEARCH
+ int64_t err2;
+ int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 };
+ int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 };
+ for (int s = start_step; s >= 1; s >>= 1) {
+ for (int p = 0; p < 2; ++p) {
+ if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) {
+ continue;
+ }
+ int skip = 0;
+ do {
+ if (xqd[p] - s >= tap_min[p]) {
+ xqd[p] -= s;
+ err2 =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ if (err2 > err) {
+ xqd[p] += s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (xqd[p] + s <= tap_max[p]) {
+ xqd[p] += s;
+ err2 =
+ get_pixel_proj_error(src8, width, height, src_stride, dat8,
+ dat_stride, use_highbitdepth, flt0,
+ flt0_stride, flt1, flt1_stride, xqd, params);
+ if (err2 > err) {
+ xqd[p] -= s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ }
+#endif // USE_SGRPROJ_REFINEMENT_SEARCH
+ return err;
+}
+
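+// Closed-form fit of the two projection weights. With f1 = flt0 - u and
+// f2 = flt1 - u as predictors of s = (src - dat) << SGRPROJ_RST_BITS, the
+// normal equations H * x = C of the least-squares problem are accumulated
+// below (and normalized by the pixel count), then solved with 2x2 Cramer's
+// rule; near-singular systems fall back to the default xq = {0, 0}.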
+static void get_proj_subspace(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int use_highbitdepth,
+ int32_t *flt0, int flt0_stride, int32_t *flt1,
+ int flt1_stride, int *xq,
+ const sgr_params_type *params) {
+ int i, j;
+ double H[2][2] = { { 0, 0 }, { 0, 0 } };
+ double C[2] = { 0, 0 };
+ double Det;
+ double x[2];
+ const int size = width * height;
+
+ aom_clear_system_state();
+
+ // Default
+ xq[0] = 0;
+ xq[1] = 0;
+ if (!use_highbitdepth) {
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const double s =
+ (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const double f1 =
+ (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0;
+ const double f2 =
+ (params->r[1] > 0) ? (double)flt1[i * flt1_stride + j] - u : 0;
+ H[0][0] += f1 * f1;
+ H[1][1] += f2 * f2;
+ H[0][1] += f1 * f2;
+ C[0] += f1 * s;
+ C[1] += f2 * s;
+ }
+ }
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const double s =
+ (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const double f1 =
+ (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0;
+ const double f2 =
+ (params->r[1] > 0) ? (double)flt1[i * flt1_stride + j] - u : 0;
+ H[0][0] += f1 * f1;
+ H[1][1] += f2 * f2;
+ H[0][1] += f1 * f2;
+ C[0] += f1 * s;
+ C[1] += f2 * s;
+ }
+ }
+ }
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];
+ C[0] /= size;
+ C[1] /= size;
+ if (params->r[0] == 0) {
+ // H matrix is now only the scalar H[1][1]
+ // C vector is now only the scalar C[1]
+ Det = H[1][1];
+ if (Det < 1e-8) return; // ill-posed, return default values
+ x[0] = 0;
+ x[1] = C[1] / Det;
+
+ xq[0] = 0;
+ xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+ } else if (params->r[1] == 0) {
+ // H matrix is now only the scalar H[0][0]
+ // C vector is now only the scalar C[0]
+ Det = H[0][0];
+ if (Det < 1e-8) return; // ill-posed, return default values
+ x[0] = C[0] / Det;
+ x[1] = 0;
+
+ xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
+ xq[1] = 0;
+ } else {
+ Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]);
+ if (Det < 1e-8) return; // ill-posed, return default values
+ x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det;
+ x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det;
+
+ xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
+ xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+ }
+}
+
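+// encode_xq() maps the fitted weights xq[] to the signalled values xqd[]:
+// ignoring the clamps, xqd[0] = xq[0] and
+//   xqd[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1],
+// which decode_xq() (defined elsewhere) is presumably expected to invert.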
+void encode_xq(int *xq, int *xqd, const sgr_params_type *params) {
+ if (params->r[0] == 0) {
+ xqd[0] = 0;
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ } else if (params->r[1] == 0) {
+ xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ } else {
+ xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+ xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1,
+ SGRPROJ_PRJ_MAX1);
+ }
+}
+
+// Apply the self-guided filter across an entire restoration unit.
+static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width,
+ int height, int dat_stride, int use_highbd, int bit_depth,
+ int pu_width, int pu_height, int32_t *flt0, int32_t *flt1,
+ int flt_stride) {
+ for (int i = 0; i < height; i += pu_height) {
+ const int h = AOMMIN(pu_height, height - i);
+ int32_t *flt0_row = flt0 + i * flt_stride;
+ int32_t *flt1_row = flt1 + i * flt_stride;
+ const uint8_t *dat8_row = dat8 + i * dat_stride;
+
+ // Iterate over the stripe in blocks of width pu_width
+ for (int j = 0; j < width; j += pu_width) {
+ const int w = AOMMIN(pu_width, width - j);
+ const int ret = av1_selfguided_restoration(
+ dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j,
+ flt_stride, sgr_params_idx, bit_depth, use_highbd);
+ (void)ret;
+ assert(!ret);
+ }
+ }
+}
+
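+// Exhaustive search over the SGRPROJ_PARAMS predefined self-guided parameter
+// sets: for each set, run the self-guided filter on the degraded data, fit
+// the two projection weights in closed form, refine them with a small local
+// search, and keep the combination with the lowest pixel-domain error.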
+static SgrprojInfo search_selfguided_restoration(
+ const uint8_t *dat8, int width, int height, int dat_stride,
+ const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
+ int pu_width, int pu_height, int32_t *rstbuf) {
+ int32_t *flt0 = rstbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ int ep, bestep = 0;
+ int64_t besterr = -1;
+ int exqd[2], bestxqd[2] = { 0, 0 };
+ int flt_stride = ((width + 7) & ~7) + 8;
+ assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+ pu_width == RESTORATION_PROC_UNIT_SIZE);
+ assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+ pu_height == RESTORATION_PROC_UNIT_SIZE);
+
+ for (ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+ int exq[2];
+ apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
+ pu_width, pu_height, flt0, flt1, flt_stride);
+ aom_clear_system_state();
+ const sgr_params_type *const params = &sgr_params[ep];
+ get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
+ use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
+ params);
+ aom_clear_system_state();
+ encode_xq(exq, exqd, params);
+ int64_t err = finer_search_pixel_proj_error(
+ src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth,
+ flt0, flt_stride, flt1, flt_stride, 2, exqd, params);
+ if (besterr == -1 || err < besterr) {
+ bestep = ep;
+ besterr = err;
+ bestxqd[0] = exqd[0];
+ bestxqd[1] = exqd[1];
+ }
+ }
+
+ SgrprojInfo ret;
+ ret.ep = bestep;
+ ret.xqd[0] = bestxqd[0];
+ ret.xqd[1] = bestxqd[1];
+ return ret;
+}
+
+static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
+ SgrprojInfo *ref_sgrproj_info) {
+ int bits = SGRPROJ_PARAMS_BITS;
+ const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+ if (params->r[0] > 0)
+ bits += aom_count_primitive_refsubexpfin(
+ SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+ sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+ if (params->r[1] > 0)
+ bits += aom_count_primitive_refsubexpfin(
+ SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+ ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+ sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+ return bits;
+}
+
+static void search_sgrproj(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile, int rest_unit_idx,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ (void)rlbs;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+ const AV1_COMMON *const cm = rsc->cm;
+ const int highbd = cm->seq_params.use_highbitdepth;
+ const int bit_depth = cm->seq_params.bit_depth;
+
+ uint8_t *dgd_start =
+ rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
+ const uint8_t *src_start =
+ rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
+
+ const int is_uv = rsc->plane > 0;
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+ const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+ const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+
+ rusi->sgrproj = search_selfguided_restoration(
+ dgd_start, limits->h_end - limits->h_start,
+ limits->v_end - limits->v_start, rsc->dgd_stride, src_start,
+ rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height,
+ tmpbuf);
+
+ RestorationUnitInfo rui;
+ rui.restoration_type = RESTORE_SGRPROJ;
+ rui.sgrproj_info = rusi->sgrproj;
+
+ rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui);
+
+ const int64_t bits_none = x->sgrproj_restore_cost[0];
+ const int64_t bits_sgr = x->sgrproj_restore_cost[1] +
+ (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj)
+ << AV1_PROB_COST_SHIFT);
+
+ double cost_none =
+ RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
+ double cost_sgr =
+ RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]);
+ if (rusi->sgrproj.ep < 10)
+ cost_sgr *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level);
+
+ RestorationType rtype =
+ (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype;
+
+ rsc->sse += rusi->sse[rtype];
+ rsc->bits += (cost_sgr < cost_none) ? bits_sgr : bits_none;
+ if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj;
+}
+
+void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l;
+ double Y[WIENER_WIN2];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const double X = (double)src[i * src_stride + j] - avg;
+ int idx = 0;
+ for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+ for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
+ Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+ idx++;
+ }
+ }
+ assert(idx == wiener_win2);
+ for (k = 0; k < wiener_win2; ++k) {
+ M[k] += Y[k] * X;
+ for (l = k; l < wiener_win2; ++l) {
+ // H is a symmetric matrix, so we only need to fill out the upper
+ // triangle here. We can copy it down to the lower triangle outside
+ // the (i, j) loops.
+ H[k * wiener_win2 + l] += Y[k] * Y[l];
+ }
+ }
+ }
+ }
+ for (k = 0; k < wiener_win2; ++k) {
+ for (l = k + 1; l < wiener_win2; ++l) {
+ H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+ }
+ }
+}
+
+static double find_average_highbd(const uint16_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
+static AOM_FORCE_INLINE void compute_stats_highbd(
+ int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start,
+ int h_end, int v_start, int v_end, int dgd_stride, int src_stride,
+ double *M, double *H) {
+ int i, j, k, l;
+ double Y[WIENER_WIN2];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const double avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const double X = (double)src[i * src_stride + j] - avg;
+ int idx = 0;
+ for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {
+ for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {
+ Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+ idx++;
+ }
+ }
+ assert(idx == wiener_win2);
+ for (k = 0; k < wiener_win2; ++k) {
+ double Yk = Y[k];
+ M[k] += Yk * X;
+ double *H2 = &H[k * wiener_win2];
+ H2[k] += Yk * Yk;
+ for (l = k + 1; l < wiener_win2; ++l) {
+ // H is a symmetric matrix, so we only need to fill out the upper
+ // triangle here. We can copy it down to the lower triangle outside
+ // the (i, j) loops.
+ H2[l] += Yk * Y[l];
+ }
+ }
+ }
+ }
+ for (k = 0; k < wiener_win2; ++k) {
+ for (l = k + 1; l < wiener_win2; ++l) {
+ H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+ }
+ }
+}
+
+static INLINE int wrap_index(int i, int wiener_win) {
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ return (i >= wiener_halfwin1 ? wiener_win - 1 - i : i);
+}
+
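+// Separable Wiener solve: the 2D filter is modelled as the product of a
+// vertical filter a and a horizontal filter b, each symmetric and summing to
+// 1. The two updates below perform one step of alternating minimization:
+// with one factor fixed, the quadratic error expressed through the statistics
+// M and H reduces to a small linear system in the free half-window taps,
+// which linsolve() solves once the symmetry and unit-sum constraints have
+// been folded into the equations.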
+// Fix vector b, update vector a
+static void update_a_sep_sym(int wiener_win, double **Mc, double **Hc,
+ double *a, double *b) {
+ int i, j;
+ double S[WIENER_WIN];
+ double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; ++j) {
+ const int jj = wrap_index(j, wiener_win);
+ A[jj] += Mc[i][j] * b[i];
+ }
+ }
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; j++) {
+ int k, l;
+ for (k = 0; k < wiener_win; ++k)
+ for (l = 0; l < wiener_win; ++l) {
+ const int kk = wrap_index(k, wiener_win);
+ const int ll = wrap_index(l, wiener_win);
+ B[ll * wiener_halfwin1 + kk] +=
+ Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] * b[j];
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ for (i = 0; i < wiener_halfwin1 - 1; ++i)
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ for (i = 0; i < wiener_halfwin1 - 1; ++i)
+ for (j = 0; j < wiener_halfwin1 - 1; ++j)
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ if (linsolve(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ S[wiener_halfwin1 - 1] = 1.0;
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ memcpy(a, S, wiener_win * sizeof(*a));
+ }
+}
+
+// Fix vector a, update vector b
+static void update_b_sep_sym(int wiener_win, double **Mc, double **Hc,
+ double *a, double *b) {
+ int i, j;
+ double S[WIENER_WIN];
+ double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < wiener_win; i++) {
+ const int ii = wrap_index(i, wiener_win);
+ for (j = 0; j < wiener_win; j++) A[ii] += Mc[i][j] * a[j];
+ }
+
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; j++) {
+ const int ii = wrap_index(i, wiener_win);
+ const int jj = wrap_index(j, wiener_win);
+ int k, l;
+ for (k = 0; k < wiener_win; ++k)
+ for (l = 0; l < wiener_win; ++l)
+ B[jj * wiener_halfwin1 + ii] +=
+ Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] * a[l];
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ for (i = 0; i < wiener_halfwin1 - 1; ++i)
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ for (i = 0; i < wiener_halfwin1 - 1; ++i)
+ for (j = 0; j < wiener_halfwin1 - 1; ++j)
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ if (linsolve(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ S[wiener_halfwin1 - 1] = 1.0;
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ memcpy(b, S, wiener_win * sizeof(*b));
+ }
+}
+
+static int wiener_decompose_sep_sym(int wiener_win, double *M, double *H,
+ double *a, double *b) {
+ static const int init_filt[WIENER_WIN] = {
+ WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV,
+ WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV,
+ WIENER_FILT_TAP0_MIDV,
+ };
+ double *Hc[WIENER_WIN2];
+ double *Mc[WIENER_WIN];
+ int i, j, iter;
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ const int wiener_win2 = wiener_win * wiener_win;
+ for (i = 0; i < wiener_win; i++) {
+ a[i] = b[i] = (double)init_filt[i + plane_off] / WIENER_FILT_STEP;
+ }
+ for (i = 0; i < wiener_win; i++) {
+ Mc[i] = M + i * wiener_win;
+ for (j = 0; j < wiener_win; j++) {
+ Hc[i * wiener_win + j] =
+ H + i * wiener_win * wiener_win2 + j * wiener_win;
+ }
+ }
+
+ iter = 1;
+ while (iter < NUM_WIENER_ITERS) {
+ update_a_sep_sym(wiener_win, Mc, Hc, a, b);
+ update_b_sep_sym(wiener_win, Mc, Hc, a, b);
+ iter++;
+ }
+ return 1;
+}
+
+// Computes the function x'*H*x - 2*x'*M for the learned 2D filter x and
+// compares it against the identity filter; the final score is the difference
+// between the two function values.
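+// (For reference: with M[k] = sum Y[k]*X and H[k][l] = sum Y[k]*Y[l] as
+// accumulated by av1_compute_stats(), the squared prediction error of a
+// filter x is  sum X^2 - 2*x'*M + x'*H*x,  so comparing x'*H*x - 2*x'*M for
+// two filters compares their errors; the constant term cancels.)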
+static double compute_score(int wiener_win, double *M, double *H,
+ InterpKernel vfilt, InterpKernel hfilt) {
+ double ab[WIENER_WIN * WIENER_WIN];
+ int i, k, l;
+ double P = 0, Q = 0;
+ double iP = 0, iQ = 0;
+ double Score, iScore;
+ double a[WIENER_WIN], b[WIENER_WIN];
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ aom_clear_system_state();
+
+ a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = 1.0;
+ for (i = 0; i < WIENER_HALFWIN; ++i) {
+ a[i] = a[WIENER_WIN - i - 1] = (double)vfilt[i] / WIENER_FILT_STEP;
+ b[i] = b[WIENER_WIN - i - 1] = (double)hfilt[i] / WIENER_FILT_STEP;
+ a[WIENER_HALFWIN] -= 2 * a[i];
+ b[WIENER_HALFWIN] -= 2 * b[i];
+ }
+ memset(ab, 0, sizeof(ab));
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l)
+ ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off];
+ }
+ for (k = 0; k < wiener_win2; ++k) {
+ P += ab[k] * M[k];
+ for (l = 0; l < wiener_win2; ++l)
+ Q += ab[k] * H[k * wiener_win2 + l] * ab[l];
+ }
+ Score = Q - 2 * P;
+
+ iP = M[wiener_win2 >> 1];
+ iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)];
+ iScore = iQ - 2 * iP;
+
+ return Score - iScore;
+}
+
+static void quantize_sym_filter(int wiener_win, double *f, InterpKernel fi) {
+ int i;
+ const int wiener_halfwin = (wiener_win >> 1);
+ for (i = 0; i < wiener_halfwin; ++i) {
+ fi[i] = RINT(f[i] * WIENER_FILT_STEP);
+ }
+ // Specialize for 7-tap filter
+ if (wiener_win == WIENER_WIN) {
+ fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+ fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+ fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+ } else {
+ fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+ fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+ fi[0] = 0;
+ }
+ // Satisfy filter constraints
+ fi[WIENER_WIN - 1] = fi[0];
+ fi[WIENER_WIN - 2] = fi[1];
+ fi[WIENER_WIN - 3] = fi[2];
+ // The central element has an implicit +WIENER_FILT_STEP
+ fi[3] = -2 * (fi[0] + fi[1] + fi[2]);
+}
+
+static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
+ WienerInfo *ref_wiener_info) {
+ int bits = 0;
+ if (wiener_win == WIENER_WIN)
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+ if (wiener_win == WIENER_WIN)
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+ WIENER_FILT_TAP0_SUBEXP_K,
+ ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+ wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+ WIENER_FILT_TAP1_SUBEXP_K,
+ ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+ wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+ bits += aom_count_primitive_refsubexpfin(
+ WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+ WIENER_FILT_TAP2_SUBEXP_K,
+ ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+ wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+ return bits;
+}
+
+#define USE_WIENER_REFINEMENT_SEARCH 1
+static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ const AV1PixelRect *tile,
+ RestorationUnitInfo *rui,
+ int wiener_win) {
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ int64_t err = try_restoration_unit(rsc, limits, tile, rui);
+#if USE_WIENER_REFINEMENT_SEARCH
+ int64_t err2;
+ int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP2_MINV };
+ int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV,
+ WIENER_FILT_TAP2_MAXV };
+
+ WienerInfo *plane_wiener = &rui->wiener_info;
+
+ // printf("err pre = %"PRId64"\n", err);
+ const int start_step = 4;
+ for (int s = start_step; s >= 1; s >>= 1) {
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;
+ do {
+ if (plane_wiener->hfilter[p] - s >= tap_min[p]) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ err2 = try_restoration_unit(rsc, limits, tile, rui);
+ if (err2 > err) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (plane_wiener->hfilter[p] + s <= tap_max[p]) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ err2 = try_restoration_unit(rsc, limits, tile, rui);
+ if (err2 > err) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;
+ do {
+ if (plane_wiener->vfilter[p] - s >= tap_min[p]) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ err2 = try_restoration_unit(rsc, limits, tile, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ if (skip) break;
+ do {
+ if (plane_wiener->vfilter[p] + s <= tap_max[p]) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ err2 = try_restoration_unit(rsc, limits, tile, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;
+ } while (1);
+ }
+ }
+// printf("err post = %"PRId64"\n", err);
+#endif // USE_WIENER_REFINEMENT_SEARCH
+ return err;
+}
+
+static void search_wiener(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect, int rest_unit_idx,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ (void)tmpbuf;
+ (void)rlbs;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const int wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+ double M[WIENER_WIN2];
+ double H[WIENER_WIN2 * WIENER_WIN2];
+ double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
+
+ const AV1_COMMON *const cm = rsc->cm;
+ if (cm->seq_params.use_highbitdepth) {
+ compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ limits->h_start, limits->h_end, limits->v_start,
+ limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
+ } else {
+ av1_compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ limits->h_start, limits->h_end, limits->v_start,
+ limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
+ }
+
+ const MACROBLOCK *const x = rsc->x;
+ const int64_t bits_none = x->wiener_restore_cost[0];
+
+ if (!wiener_decompose_sep_sym(wiener_win, M, H, vfilterd, hfilterd)) {
+ rsc->bits += bits_none;
+ rsc->sse += rusi->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rusi->sse[RESTORE_WIENER] = INT64_MAX;
+ return;
+ }
+
+ RestorationUnitInfo rui;
+ memset(&rui, 0, sizeof(rui));
+ rui.restoration_type = RESTORE_WIENER;
+ quantize_sym_filter(wiener_win, vfilterd, rui.wiener_info.vfilter);
+ quantize_sym_filter(wiener_win, hfilterd, rui.wiener_info.hfilter);
+
+  // The filter score computes the value of the function x'*H*x - 2*x'*M for
+  // the learned filter and compares it against the identity filter. If there
+  // is no reduction in the function value, the filter is reverted back to
+  // identity.
+ if (compute_score(wiener_win, M, H, rui.wiener_info.vfilter,
+ rui.wiener_info.hfilter) > 0) {
+ rsc->bits += bits_none;
+ rsc->sse += rusi->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rusi->sse[RESTORE_WIENER] = INT64_MAX;
+ return;
+ }
+
+ aom_clear_system_state();
+
+ rusi->sse[RESTORE_WIENER] =
+ finer_tile_search_wiener(rsc, limits, tile_rect, &rui, wiener_win);
+ rusi->wiener = rui.wiener_info;
+
+ if (wiener_win != WIENER_WIN) {
+ assert(rui.wiener_info.vfilter[0] == 0 &&
+ rui.wiener_info.vfilter[WIENER_WIN - 1] == 0);
+ assert(rui.wiener_info.hfilter[0] == 0 &&
+ rui.wiener_info.hfilter[WIENER_WIN - 1] == 0);
+ }
+
+ const int64_t bits_wiener =
+ x->wiener_restore_cost[1] +
+ (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener)
+ << AV1_PROB_COST_SHIFT);
+
+ double cost_none =
+ RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
+ double cost_wiener =
+ RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]);
+
+ RestorationType rtype =
+ (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_WIENER - 1] = rtype;
+
+ rsc->sse += rusi->sse[rtype];
+ rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none;
+ if (cost_wiener < cost_none) rsc->wiener = rusi->wiener;
+}
+
+static void search_norestore(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect, int rest_unit_idx,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ (void)tile_rect;
+ (void)tmpbuf;
+ (void)rlbs;
+
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const int highbd = rsc->cm->seq_params.use_highbitdepth;
+ rusi->sse[RESTORE_NONE] = sse_restoration_unit(
+ limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd);
+
+ rsc->sse += rusi->sse[RESTORE_NONE];
+}
+
+static void search_switchable(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect, int rest_unit_idx,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ (void)limits;
+ (void)tile_rect;
+ (void)tmpbuf;
+ (void)rlbs;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const MACROBLOCK *const x = rsc->x;
+
+ const int wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+ double best_cost = 0;
+ int64_t best_bits = 0;
+ RestorationType best_rtype = RESTORE_NONE;
+
+ for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+    // Check whether the Wiener or sgrproj search failed to find a solution,
+    // or found one that was worse than RESTORE_NONE. In either case
+    // best_rtype is set to RESTORE_NONE and that type is skipped in the test
+    // below.
+ if (r > RESTORE_NONE) {
+ if (rusi->best_rtype[r - 1] == RESTORE_NONE) continue;
+ }
+
+ const int64_t sse = rusi->sse[r];
+ int64_t coeff_pcost = 0;
+ switch (r) {
+ case RESTORE_NONE: coeff_pcost = 0; break;
+ case RESTORE_WIENER:
+ coeff_pcost =
+ count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener);
+ break;
+ case RESTORE_SGRPROJ:
+ coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj);
+ break;
+ default: assert(0); break;
+ }
+ const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT;
+ const int64_t bits = x->switchable_restore_cost[r] + coeff_bits;
+ double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse);
+ if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
+ cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level);
+ if (r == 0 || cost < best_cost) {
+ best_cost = cost;
+ best_bits = bits;
+ best_rtype = r;
+ }
+ }
+
+ rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype;
+
+ rsc->sse += rusi->sse[best_rtype];
+ rsc->bits += best_bits;
+ if (best_rtype == RESTORE_WIENER) rsc->wiener = rusi->wiener;
+ if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj;
+}
+
+static void copy_unit_info(RestorationType frame_rtype,
+ const RestUnitSearchInfo *rusi,
+ RestorationUnitInfo *rui) {
+ assert(frame_rtype > 0);
+ rui->restoration_type = rusi->best_rtype[frame_rtype - 1];
+ if (rui->restoration_type == RESTORE_WIENER)
+ rui->wiener_info = rusi->wiener;
+ else
+ rui->sgrproj_info = rusi->sgrproj;
+}
+
+static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) {
+ static const rest_unit_visitor_t funs[RESTORE_TYPES] = {
+ search_norestore, search_wiener, search_sgrproj, search_switchable
+ };
+
+ reset_rsc(rsc);
+ rsc_on_tile(LR_TILE_ROW, LR_TILE_COL, rsc);
+ av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc,
+ &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
+ return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse);
+}
+
+static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ return rsi->units_per_tile;
+}
+
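+// Top-level loop-restoration search: for each plane, every candidate
+// frame-level restoration type is evaluated by visiting all restoration
+// units (search_rest_type), the type with the lowest RD cost is chosen, and
+// the per-unit decisions gathered in rusi[] are copied into
+// cm->rst_info[plane].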
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ assert(!cm->all_lossless);
+
+ int ntiles[2];
+ for (int is_uv = 0; is_uv < 2; ++is_uv)
+ ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv);
+
+ assert(ntiles[1] <= ntiles[0]);
+ RestUnitSearchInfo *rusi =
+ (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]);
+
+ // If the restoration unit dimensions are not multiples of
+ // rsi->restoration_unit_size then some elements of the rusi array may be
+ // left uninitialised when we reach copy_unit_info(...). This is not a
+ // problem, as these elements are ignored later, but in order to quiet
+ // Valgrind's warnings we initialise the array below.
+ memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
+
+ RestSearchCtxt rsc;
+ const int plane_start = AOM_PLANE_Y;
+ const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y;
+ for (int plane = plane_start; plane <= plane_end; ++plane) {
+ init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi,
+ &cpi->trial_frame_rst, &rsc);
+
+ const int plane_ntiles = ntiles[plane > 0];
+ const RestorationType num_rtypes =
+ (plane_ntiles > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+
+ double best_cost = 0;
+ RestorationType best_rtype = RESTORE_NONE;
+
+ const int highbd = rsc.cm->seq_params.use_highbitdepth;
+ extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
+ rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
+ highbd);
+
+ for (RestorationType r = 0; r < num_rtypes; ++r) {
+ if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
+ (r != force_restore_type))
+ continue;
+
+ double cost = search_rest_type(&rsc, r);
+
+ if (r == 0 || cost < best_cost) {
+ best_cost = cost;
+ best_rtype = r;
+ }
+ }
+
+ cm->rst_info[plane].frame_restoration_type = best_rtype;
+ if (force_restore_type != RESTORE_TYPES)
+ assert(best_rtype == force_restore_type || best_rtype == RESTORE_NONE);
+
+ if (best_rtype != RESTORE_NONE) {
+ for (int u = 0; u < plane_ntiles; ++u) {
+ copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]);
+ }
+ }
+ }
+
+ aom_free(rusi);
+}
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
new file mode 100644
index 000000000..3fec0c34b
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+#include "aom_ports/system_state.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+static const uint8_t g_shuffle_stats_data[16] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+};
+
+static INLINE double find_average(const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
new file mode 100644
index 000000000..40dd46768
--- /dev/null
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PUSTATS_H_
+#define AOM_AV1_ENCODER_PUSTATS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define NUM_FEATURES_PUSTATS 8
+#define NUM_HIDDEN_LAYERS 2
+#define HIDDEN_LAYERS_0_NODES 12
+#define HIDDEN_LAYERS_1_NODES 10
+#define LOGITS_NODES 1
+
+static const float
+ av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f,
+ -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f,
+ 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f,
+ 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f,
+ -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f,
+ -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f,
+ -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f,
+ -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f,
+ 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f,
+ -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f,
+ -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f,
+ -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f,
+ 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f,
+ -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f,
+ };
+
+static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
+ {
+ 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f,
+ 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f,
+ 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f,
+ -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f,
+ 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f,
+ 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f,
+ -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f,
+ -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f,
+ -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f,
+ 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f,
+ 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f,
+ -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f,
+ -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f,
+ -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f,
+ 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f,
+ -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f,
+ -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f,
+ 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f,
+ -2.7566f,
+ };
+
+static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
+ {
+ 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
+ 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f,
+ };
+
+static const float
+ av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f,
+ 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f,
+ };
+
+static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
+ 4.5103f,
+};
+
+static const NN_CONFIG av1_pustats_rate_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ {
+ av1_pustats_rate_hiddenlayer_0_kernel,
+ av1_pustats_rate_hiddenlayer_1_kernel,
+ av1_pustats_rate_logits_kernel,
+ },
+ {
+ av1_pustats_rate_hiddenlayer_0_bias,
+ av1_pustats_rate_hiddenlayer_1_bias,
+ av1_pustats_rate_logits_bias,
+ },
+};
+
+static const float
+ av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f,
+ 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f,
+ 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f,
+ 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f,
+ 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f,
+ -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f,
+ -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f,
+ -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f,
+ 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f,
+ 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f,
+ -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f,
+ -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f,
+ 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f,
+ -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f,
+ };
+
+static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
+ {
+ 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
+ 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f,
+ -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f,
+ 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f,
+ 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f,
+ -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f,
+ -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f,
+ -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f,
+ 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f,
+ -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f,
+ 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f,
+ 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f,
+ -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f,
+ 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f,
+ 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f,
+ 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f,
+ -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f,
+ -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f,
+ -0.4164f,
+ };
+
+static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
+ {
+ -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
+ 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f,
+ };
+
+static const float
+ av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f,
+ 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f,
+ };
+
+static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
+ 2.3371f,
+};
+
+static const NN_CONFIG av1_pustats_dist_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ {
+ av1_pustats_dist_hiddenlayer_0_kernel,
+ av1_pustats_dist_hiddenlayer_1_kernel,
+ av1_pustats_dist_logits_kernel,
+ },
+ {
+ av1_pustats_dist_hiddenlayer_0_bias,
+ av1_pustats_dist_hiddenlayer_1_bias,
+ av1_pustats_dist_logits_bias,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef HIDDEN_LAYERS_0_NODES
+#undef HIDDEN_LAYERS_1_NODES
+#undef LOGITS_NODES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
new file mode 100644
index 000000000..0bca39102
--- /dev/null
+++ b/third_party/aom/av1/encoder/random.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Generate a random number in the range [0, 32768).
+static INLINE unsigned int lcg_rand16(unsigned int *state) {
+ *state = (unsigned int)(*state * 1103515245ULL + 12345);
+ return *state / 65536 % 32768;
+}
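+// (Usage sketch: callers keep their own state and pass its address on every
+// call, e.g.
+//   unsigned int seed = 1;
+//   int r = lcg_rand16(&seed);  // r in [0, 32768)
+// The multiplier/increment pair matches the classic ANSI C rand() LCG.)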
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c
new file mode 100644
index 000000000..781f528eb
--- /dev/null
+++ b/third_party/aom/av1/encoder/ransac.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "av1/encoder/ransac.h"
+#include "av1/encoder/mathutils.h"
+#include "av1/encoder/random.h"
+
+#define MAX_MINPTS 4
+#define MAX_DEGENERATE_ITER 10
+#define MINPTS_MULTIPLIER 5
+
+#define INLIER_THRESHOLD 1.0
+#define MIN_TRIALS 20
+
+////////////////////////////////////////////////////////////////////////////////
+// ransac
+typedef int (*IsDegenerateFunc)(double *p);
+typedef void (*NormalizeFunc)(double *p, int np, double *T);
+typedef void (*DenormalizeFunc)(double *params, double *T1, double *T2);
+typedef int (*FindTransformationFunc)(int points, double *points1,
+ double *points2, double *params);
+typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj);
+
+static void project_points_double_translation(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = x + mat[0];
+ *(proj++) = y + mat[1];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_rotzoom(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = mat[2] * x + mat[3] * y + mat[0];
+ *(proj++) = -mat[3] * x + mat[2] * y + mat[1];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_affine(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = mat[2] * x + mat[3] * y + mat[0];
+ *(proj++) = mat[4] * x + mat[5] * y + mat[1];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
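+// Hartley-style normalization: translate the point set to zero mean and scale
+// it so that the average distance from the origin is sqrt(2). T receives the
+// 3x3 transform that performs this, so it can later be undone on the fitted
+// model by the denormalize_*() helpers below.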
+static void normalize_homography(double *pts, int n, double *T) {
+ double *p = pts;
+ double mean[2] = { 0, 0 };
+ double msqe = 0;
+ double scale;
+ int i;
+
+ assert(n > 0);
+ for (i = 0; i < n; ++i, p += 2) {
+ mean[0] += p[0];
+ mean[1] += p[1];
+ }
+ mean[0] /= n;
+ mean[1] /= n;
+ for (p = pts, i = 0; i < n; ++i, p += 2) {
+ p[0] -= mean[0];
+ p[1] -= mean[1];
+ msqe += sqrt(p[0] * p[0] + p[1] * p[1]);
+ }
+ msqe /= n;
+ scale = (msqe == 0 ? 1.0 : sqrt(2) / msqe);
+ T[0] = scale;
+ T[1] = 0;
+ T[2] = -scale * mean[0];
+ T[3] = 0;
+ T[4] = scale;
+ T[5] = -scale * mean[1];
+ T[6] = 0;
+ T[7] = 0;
+ T[8] = 1;
+ for (p = pts, i = 0; i < n; ++i, p += 2) {
+ p[0] *= scale;
+ p[1] *= scale;
+ }
+}
+
+static void invnormalize_mat(double *T, double *iT) {
+ double is = 1.0 / T[0];
+ double m0 = -T[2] * is;
+ double m1 = -T[5] * is;
+ iT[0] = is;
+ iT[1] = 0;
+ iT[2] = m0;
+ iT[3] = 0;
+ iT[4] = is;
+ iT[5] = m1;
+ iT[6] = 0;
+ iT[7] = 0;
+ iT[8] = 1;
+}
+
+static void denormalize_homography(double *params, double *T1, double *T2) {
+ double iT2[9];
+ double params2[9];
+ invnormalize_mat(T2, iT2);
+ multiply_mat(params, T1, params2, 3, 3, 3);
+ multiply_mat(iT2, params2, params, 3, 3, 3);
+}
+
+static void denormalize_affine_reorder(double *params, double *T1, double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = params[0];
+ params_denorm[1] = params[1];
+ params_denorm[2] = params[4];
+ params_denorm[3] = params[2];
+ params_denorm[4] = params[3];
+ params_denorm[5] = params[5];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[2];
+ params[1] = params_denorm[5];
+ params[2] = params_denorm[0];
+ params[3] = params_denorm[1];
+ params[4] = params_denorm[3];
+ params[5] = params_denorm[4];
+ params[6] = params[7] = 0;
+}
+
+static void denormalize_rotzoom_reorder(double *params, double *T1,
+ double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = params[0];
+ params_denorm[1] = params[1];
+ params_denorm[2] = params[2];
+ params_denorm[3] = -params[1];
+ params_denorm[4] = params[0];
+ params_denorm[5] = params[3];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[2];
+ params[1] = params_denorm[5];
+ params[2] = params_denorm[0];
+ params[3] = params_denorm[1];
+ params[4] = -params[3];
+ params[5] = params[2];
+ params[6] = params[7] = 0;
+}
+
+static void denormalize_translation_reorder(double *params, double *T1,
+ double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = 1;
+ params_denorm[1] = 0;
+ params_denorm[2] = params[0];
+ params_denorm[3] = 0;
+ params_denorm[4] = 1;
+ params_denorm[5] = params[1];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[2];
+ params[1] = params_denorm[5];
+ params[2] = params[5] = 1;
+ params[3] = params[4] = 0;
+ params[6] = params[7] = 0;
+}
+
+static int find_translation(int np, double *pts1, double *pts2, double *mat) {
+ int i;
+ double sx, sy, dx, dy;
+ double sumx, sumy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ sumx = 0;
+ sumy = 0;
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ sumx += dx - sx;
+ sumy += dy - sy;
+ }
+ mat[0] = sumx / np;
+ mat[1] = sumy / np;
+ denormalize_translation_reorder(mat, T1, T2);
+ return 0;
+}
+
+static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) {
+ const int np2 = np * 2;
+ double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 5 + 20));
+ double *b = a + np2 * 4;
+ double *temp = b + np2;
+ int i;
+ double sx, sy, dx, dy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 2 * 4 + 0] = sx;
+ a[i * 2 * 4 + 1] = sy;
+ a[i * 2 * 4 + 2] = 1;
+ a[i * 2 * 4 + 3] = 0;
+ a[(i * 2 + 1) * 4 + 0] = sy;
+ a[(i * 2 + 1) * 4 + 1] = -sx;
+ a[(i * 2 + 1) * 4 + 2] = 0;
+ a[(i * 2 + 1) * 4 + 3] = 1;
+
+ b[2 * i] = dx;
+ b[2 * i + 1] = dy;
+ }
+ if (!least_squares(4, a, np2, 4, b, temp, mat)) {
+ aom_free(a);
+ return 1;
+ }
+ denormalize_rotzoom_reorder(mat, T1, T2);
+ aom_free(a);
+ return 0;
+}
+
+static int find_affine(int np, double *pts1, double *pts2, double *mat) {
+ const int np2 = np * 2;
+ double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 7 + 42));
+ double *b = a + np2 * 6;
+ double *temp = b + np2;
+ int i;
+ double sx, sy, dx, dy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 2 * 6 + 0] = sx;
+ a[i * 2 * 6 + 1] = sy;
+ a[i * 2 * 6 + 2] = 0;
+ a[i * 2 * 6 + 3] = 0;
+ a[i * 2 * 6 + 4] = 1;
+ a[i * 2 * 6 + 5] = 0;
+ a[(i * 2 + 1) * 6 + 0] = 0;
+ a[(i * 2 + 1) * 6 + 1] = 0;
+ a[(i * 2 + 1) * 6 + 2] = sx;
+ a[(i * 2 + 1) * 6 + 3] = sy;
+ a[(i * 2 + 1) * 6 + 4] = 0;
+ a[(i * 2 + 1) * 6 + 5] = 1;
+
+ b[2 * i] = dx;
+ b[2 * i + 1] = dy;
+ }
+ if (!least_squares(6, a, np2, 6, b, temp, mat)) {
+ aom_free(a);
+ return 1;
+ }
+ denormalize_affine_reorder(mat, T1, T2);
+ aom_free(a);
+ return 0;
+}
+
+static int get_rand_indices(int npoints, int minpts, int *indices,
+ unsigned int *seed) {
+ int i, j;
+ int ptr = lcg_rand16(seed) % npoints;
+ if (minpts > npoints) return 0;
+ indices[0] = ptr;
+ ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+ i = 1;
+ while (i < minpts) {
+ int index = lcg_rand16(seed) % npoints;
+ while (index) {
+ ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+ for (j = 0; j < i; ++j) {
+ if (indices[j] == ptr) break;
+ }
+ if (j == i) index--;
+ }
+ indices[i++] = ptr;
+ }
+ return 1;
+}
+
+typedef struct {
+ int num_inliers;
+ double variance;
+ int *inlier_indices;
+} RANSAC_MOTION;
+
+// Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise.
+static int compare_motions(const void *arg_a, const void *arg_b) {
+ const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a;
+ const RANSAC_MOTION *motion_b = (RANSAC_MOTION *)arg_b;
+
+ if (motion_a->num_inliers > motion_b->num_inliers) return -1;
+ if (motion_a->num_inliers < motion_b->num_inliers) return 1;
+ if (motion_a->variance < motion_b->variance) return -1;
+ if (motion_a->variance > motion_b->variance) return 1;
+ return 0;
+}
+
+static int is_better_motion(const RANSAC_MOTION *motion_a,
+ const RANSAC_MOTION *motion_b) {
+ return compare_motions(motion_a, motion_b) < 0;
+}
+
+static void copy_points_at_indices(double *dest, const double *src,
+ const int *indices, int num_points) {
+ for (int i = 0; i < num_points; ++i) {
+ const int index = indices[i];
+ dest[i * 2] = src[index * 2];
+ dest[i * 2 + 1] = src[index * 2 + 1];
+ }
+}
+
+static const double kInfiniteVariance = 1e12;
+
+static void clear_motion(RANSAC_MOTION *motion, int num_points) {
+ motion->num_inliers = 0;
+ motion->variance = kInfiniteVariance;
+  memset(motion->inlier_indices, 0,
+         sizeof(*motion->inlier_indices) * num_points);
+}
+
+static int ransac(const int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_desired_motions, const int minpts,
+ IsDegenerateFunc is_degenerate,
+ FindTransformationFunc find_transformation,
+ ProjectPointsDoubleFunc projectpoints) {
+ static const double PROBABILITY_REQUIRED = 0.9;
+ static const double EPS = 1e-12;
+
+ int N = 10000, trial_count = 0;
+ int i = 0;
+ int ret_val = 0;
+
+ unsigned int seed = (unsigned int)npoints;
+
+ int indices[MAX_MINPTS] = { 0 };
+
+ double *points1, *points2;
+ double *corners1, *corners2;
+ double *image1_coord;
+
+ // Store information for the num_desired_motions best transformations found
+ // and the worst motion among them, as well as the motion currently under
+ // consideration.
+ RANSAC_MOTION *motions, *worst_kept_motion = NULL;
+ RANSAC_MOTION current_motion;
+
+ // Store the parameters and the indices of the inlier points for the motion
+ // currently under consideration.
+ double params_this_motion[MAX_PARAMDIM];
+
+ double *cnp1, *cnp2;
+
+ for (i = 0; i < num_desired_motions; ++i) {
+ num_inliers_by_motion[i] = 0;
+ }
+ if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) {
+ return 1;
+ }
+
+ points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2);
+ points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2);
+ corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2);
+ corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2);
+ image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2);
+
+ motions =
+ (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions);
+ for (i = 0; i < num_desired_motions; ++i) {
+ motions[i].inlier_indices =
+ (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints);
+ clear_motion(motions + i, npoints);
+ }
+ current_motion.inlier_indices =
+ (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints);
+ clear_motion(&current_motion, npoints);
+
+ worst_kept_motion = motions;
+
+ if (!(points1 && points2 && corners1 && corners2 && image1_coord && motions &&
+ current_motion.inlier_indices)) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+
+ cnp1 = corners1;
+ cnp2 = corners2;
+ for (i = 0; i < npoints; ++i) {
+ *(cnp1++) = *(matched_points++);
+ *(cnp1++) = *(matched_points++);
+ *(cnp2++) = *(matched_points++);
+ *(cnp2++) = *(matched_points++);
+ }
+
+ while (N > trial_count) {
+ double sum_distance = 0.0;
+ double sum_distance_squared = 0.0;
+
+ clear_motion(&current_motion, npoints);
+
+ int degenerate = 1;
+ int num_degenerate_iter = 0;
+
+ while (degenerate) {
+ num_degenerate_iter++;
+ if (!get_rand_indices(npoints, minpts, indices, &seed)) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+
+ copy_points_at_indices(points1, corners1, indices, minpts);
+ copy_points_at_indices(points2, corners2, indices, minpts);
+
+ degenerate = is_degenerate(points1);
+ if (num_degenerate_iter > MAX_DEGENERATE_ITER) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+ }
+
+ if (find_transformation(minpts, points1, points2, params_this_motion)) {
+ trial_count++;
+ continue;
+ }
+
+ projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2);
+
+ for (i = 0; i < npoints; ++i) {
+ double dx = image1_coord[i * 2] - corners2[i * 2];
+ double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1];
+ double distance = sqrt(dx * dx + dy * dy);
+
+ if (distance < INLIER_THRESHOLD) {
+ current_motion.inlier_indices[current_motion.num_inliers++] = i;
+ sum_distance += distance;
+ sum_distance_squared += distance * distance;
+ }
+ }
+
+ if (current_motion.num_inliers >= worst_kept_motion->num_inliers &&
+ current_motion.num_inliers > 1) {
+ int temp;
+ double fracinliers, pNoOutliers, mean_distance, dtemp;
+ mean_distance = sum_distance / ((double)current_motion.num_inliers);
+ current_motion.variance =
+ sum_distance_squared / ((double)current_motion.num_inliers - 1.0) -
+ mean_distance * mean_distance * ((double)current_motion.num_inliers) /
+ ((double)current_motion.num_inliers - 1.0);
+ if (is_better_motion(&current_motion, worst_kept_motion)) {
+ // This motion is better than the worst currently kept motion. Remember
+ // the inlier points and variance. The parameters for each kept motion
+ // will be recomputed later using only the inliers.
+ worst_kept_motion->num_inliers = current_motion.num_inliers;
+ worst_kept_motion->variance = current_motion.variance;
+ memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices,
+ sizeof(*current_motion.inlier_indices) * npoints);
+
+ assert(npoints > 0);
+ fracinliers = (double)current_motion.num_inliers / (double)npoints;
+ pNoOutliers = 1 - pow(fracinliers, minpts);
+ pNoOutliers = fmax(EPS, pNoOutliers);
+ pNoOutliers = fmin(1 - EPS, pNoOutliers);
+ dtemp = log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers);
+ temp = (dtemp > (double)INT32_MAX)
+ ? INT32_MAX
+ : dtemp < (double)INT32_MIN ? INT32_MIN : (int)dtemp;
+
+ if (temp > 0 && temp < N) {
+ N = AOMMAX(temp, MIN_TRIALS);
+ }
+
+ // Determine the new worst kept motion and its num_inliers and variance.
+ for (i = 0; i < num_desired_motions; ++i) {
+ if (is_better_motion(worst_kept_motion, &motions[i])) {
+ worst_kept_motion = &motions[i];
+ }
+ }
+ }
+ }
+ trial_count++;
+ }
+
+ // Sort the motions, best first.
+ qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions);
+
+ // Recompute the motions using only the inliers.
+ for (i = 0; i < num_desired_motions; ++i) {
+ if (motions[i].num_inliers >= minpts) {
+ copy_points_at_indices(points1, corners1, motions[i].inlier_indices,
+ motions[i].num_inliers);
+ copy_points_at_indices(points2, corners2, motions[i].inlier_indices,
+ motions[i].num_inliers);
+
+ find_transformation(motions[i].num_inliers, points1, points2,
+ params_by_motion + (MAX_PARAMDIM - 1) * i);
+ }
+ num_inliers_by_motion[i] = motions[i].num_inliers;
+ }
+
+finish_ransac:
+ aom_free(points1);
+ aom_free(points2);
+ aom_free(corners1);
+ aom_free(corners2);
+ aom_free(image1_coord);
+ aom_free(current_motion.inlier_indices);
+ for (i = 0; i < num_desired_motions; ++i) {
+ aom_free(motions[i].inlier_indices);
+ }
+ aom_free(motions);
+
+ return ret_val;
+}
+
+static int is_collinear3(double *p1, double *p2, double *p3) {
+ static const double collinear_eps = 1e-3;
+ const double v =
+ (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]);
+ return fabs(v) < collinear_eps;
+}
+
+static int is_degenerate_translation(double *p) {
+ return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2;
+}
+
+static int is_degenerate_affine(double *p) {
+ return is_collinear3(p, p + 2, p + 4);
+}
+
+int ransac_translation(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3,
+ is_degenerate_translation, find_translation,
+ project_points_double_translation);
+}
+
+int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3, is_degenerate_affine,
+ find_rotzoom, project_points_double_rotzoom);
+}
+
+int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3, is_degenerate_affine,
+ find_affine, project_points_double_affine);
+}
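
The adaptive trial count computed inside ransac() follows the standard RANSAC bound N = log(1 - p) / log(1 - w^m), where p is PROBABILITY_REQUIRED and w is the inlier fraction of the current best sample. A minimal standalone sketch of that bound, assuming an illustrative inlier fraction of 0.5 and the minpts = 3 that the ransac_* wrappers above pass in (the encoder additionally clamps the result to MIN_TRIALS and the int32 range, which this sketch omits):

#include <math.h>
#include <stdio.h>

int main(void) {
  const double p = 0.9; /* PROBABILITY_REQUIRED */
  const double w = 0.5; /* assumed inlier fraction, purely illustrative */
  const int m = 3;      /* minimal sample size used by the ransac_* wrappers */
  /* Probability that a random minimal sample contains at least one outlier. */
  const double p_no_outliers = 1.0 - pow(w, m);
  /* Trials needed so that, with probability p, some sample is all inliers. */
  const double trials = log(1.0 - p) / log(p_no_outliers);
  printf("w=%.2f, m=%d -> about %.0f trials\n", w, m, ceil(trials));
  return 0;
}

With these numbers the bound works out to roughly 18 trials, which is why a high inlier fraction lets the main loop stop long before the initial N = 10000.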
diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h
new file mode 100644
index 000000000..c429f2ce5
--- /dev/null
+++ b/third_party/aom/av1/encoder/ransac.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RANSAC_H_
+#define AOM_AV1_ENCODER_RANSAC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <memory.h>
+
+#include "av1/common/warped_motion.h"
+
+typedef int (*RansacFunc)(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+
+/* Each of these functions fits a motion model from a set of
+ corresponding points in 2 frames using RANSAC. */
+int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_motions);
+int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_motions);
+int ransac_translation(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+#endif // AOM_AV1_ENCODER_RANSAC_H_
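
A hedged usage sketch of these entry points, not taken from the encoder itself: correspondences are packed as (x1, y1, x2, y2) integer quadruples, and each returned candidate occupies MAX_PARAMDIM - 1 doubles in params_by_motion, matching the stride used inside ransac(). MAX_PARAMDIM should be visible through the warped_motion.h include, and fit_rotzoom_example is a hypothetical helper, not part of the library.

#include "av1/encoder/ransac.h"

/* Hypothetical helper: fit up to two rotzoom candidates from npoints
 * correspondences laid out as x1, y1, x2, y2 per point. */
static int fit_rotzoom_example(int *matched_points, int npoints) {
  const int num_motions = 2;
  int num_inliers[2] = { 0, 0 };
  /* Each candidate motion occupies MAX_PARAMDIM - 1 doubles (see ransac()). */
  double params[2 * (MAX_PARAMDIM - 1)] = { 0 };

  if (ransac_rotzoom(matched_points, npoints, num_inliers, params,
                     num_motions))
    return 1; /* too few points, degenerate samples, or allocation failure */

  /* Candidates are sorted best first; params + 0 and
   * params + (MAX_PARAMDIM - 1) hold their 8 model parameters, and
   * num_inliers[i] says how well each candidate is supported. */
  return 0;
}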
diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h
new file mode 100644
index 000000000..7cd0962c5
--- /dev/null
+++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#define AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// 22 float features +
+// 2 categorical features with 4 possible values, converted to one-hot vectors.
+// So, total 22 + 2 * 4 = 30 features.
+#define NUM_FEATURES 30
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_HIDDEN_NODES 96
+#define NUM_OUTPUTS 1
+
+//------------------------------------------------------------------------------
+// RDCost model
+
+static const float
+ av1_rdcost_model_nn_weights_layer0[NUM_FEATURES * NUM_HIDDEN_NODES] = {
+ -0.0699f, 0.2790f, 0.1915f, 0.2669f, 0.4637f, 0.4095f,
+ 0.2129f, 0.0634f, 0.2306f, -0.2232f, -0.5711f, -0.6493f,
+ -0.7406f, -0.8440f, 0.4105f, 0.1392f, 0.5218f, -0.1618f,
+ -0.1719f, 0.3409f, 0.1111f, -0.3609f, -0.2929f, 0.3869f,
+ -0.5373f, 0.0700f, 0.2572f, 0.2483f, -0.0314f, 0.5228f,
+ 0.0169f, -0.1357f, 0.0419f, -0.1722f, 0.1303f, 0.1198f,
+ -0.0013f, 0.1309f, 0.0293f, -0.1941f, 0.0668f, -0.0643f,
+ -0.0381f, 0.1249f, -0.0731f, -0.1649f, 0.0964f, 0.0270f,
+ 0.1354f, 0.0538f, -0.2064f, -0.2067f, -0.0569f, 0.0449f,
+ 0.1680f, -0.0732f, -0.0785f, 0.1884f, -0.2137f, -0.0189f,
+ 0.2976f, 0.2818f, -0.0222f, 0.2658f, 0.0488f, 0.2778f,
+ -0.1110f, 0.2069f, -0.0072f, -0.0095f, -0.1105f, -0.1365f,
+ -0.4245f, -0.4751f, -0.0736f, 0.2333f, 0.0653f, -0.0249f,
+ 0.0055f, -0.0838f, -0.0489f, -0.2597f, 0.2621f, -0.0251f,
+ -0.0545f, 0.0816f, -0.0816f, 0.3396f, -0.1047f, 0.3678f,
+ 0.1487f, -0.0270f, 0.2574f, 0.1018f, 0.2560f, -0.0598f,
+ -0.0446f, -0.1792f, 0.5336f, -0.1590f, -0.9820f, -0.6514f,
+ -0.6304f, -0.8359f, -0.0699f, 0.0295f, -0.0057f, -0.3088f,
+ -0.1466f, 0.2220f, -0.1980f, -0.3400f, -0.1228f, 0.2667f,
+ -0.4816f, 0.0155f, -0.0194f, 0.2051f, 0.0513f, 0.1575f,
+ -121.4240f, -126.6840f, -124.1106f, -127.6184f, -85.0333f, -26.6396f,
+ 2.7020f, 102.0452f, -85.5128f, 0.0076f, 122.2206f, 107.5265f,
+ 108.3773f, 93.4847f, 20.3705f, -89.6993f, -176.9070f, -41.7543f,
+ -123.0293f, -91.6437f, -205.7099f, -62.5346f, -83.2987f, 21.3830f,
+ 56.6341f, -120.8647f, -127.7562f, -121.6688f, -127.4225f, -74.8045f,
+ -15.9247f, -14.6468f, -14.7788f, -15.4498f, -18.5514f, -11.1579f,
+ -5.8164f, -3.4318f, 0.8100f, 0.0642f, 203.5111f, 189.6872f,
+ 190.4776f, 176.4784f, -4.9427f, -12.5324f, -7.6861f, 21.9182f,
+ -6.7864f, -7.1906f, -8.1292f, 21.4780f, -7.8016f, -5.2653f,
+ 61.8526f, -15.5105f, -14.6900f, -14.1459f, -15.4350f, -19.1379f,
+ -0.7876f, -1.8558f, -4.6035f, -6.8405f, -0.2904f, 2.3202f,
+ 1.8127f, -2.9397f, -0.8187f, -0.6098f, 22.6173f, 10.3668f,
+ 12.9363f, 2.4541f, 6.6700f, 0.3804f, -3.3117f, 8.5464f,
+ -25.8348f, 1.8698f, -9.5753f, 8.5558f, -16.3576f, 7.2217f,
+ 35.3115f, -1.1447f, -2.6530f, -4.7027f, -5.7024f, -0.9513f,
+ 0.8393f, 0.7085f, 0.7879f, 0.3728f, 3.0574f, 1.1360f,
+ 26.0531f, 4.1084f, -1.7340f, 0.1683f, -450.7927f, -444.5818f,
+ -442.5239f, -438.1168f, 2.4924f, -0.0147f, -0.0797f, -47.5322f,
+ -1.7638f, -0.8608f, -0.6500f, -44.4326f, -0.9027f, 2.5560f,
+ -267.6517f, 0.2642f, 0.9457f, 0.7944f, 0.3609f, 3.2742f,
+ -74.3400f, -81.6894f, -76.2162f, -69.2979f, -90.2476f, -39.7389f,
+ 2.2545f, 36.5095f, -60.1129f, -1.0383f, 87.0348f, 83.9940f,
+ 83.7199f, 80.8609f, 14.9075f, -78.7405f, -74.3549f, -4.2382f,
+ -23.9739f, -91.8469f, -67.2654f, -21.5293f, -9.9857f, 11.8391f,
+ 35.8223f, -74.2551f, -81.0729f, -73.8347f, -70.3798f, -86.8052f,
+ 0.1701f, -0.1136f, 0.0060f, -0.0496f, -0.1727f, 0.0195f,
+ -0.1040f, 0.1027f, 0.0467f, -0.2538f, -0.1322f, 0.0860f,
+ 0.0093f, -0.2801f, -0.0958f, 0.0497f, -0.0582f, -0.0311f,
+ 0.1840f, 0.0752f, 0.0282f, 0.0297f, 0.0607f, 0.0650f,
+ 0.0893f, 0.1297f, 0.0373f, 0.0040f, -0.0973f, 0.0248f,
+ -0.1419f, 0.0322f, -0.0712f, 0.0860f, -0.0426f, -0.1989f,
+ 0.1393f, -0.1183f, 0.0735f, -0.1895f, 0.1447f, -0.0056f,
+ -0.1833f, 0.0884f, 0.0949f, 0.0476f, 0.0551f, 0.2125f,
+ -0.1537f, -0.0141f, -0.2182f, 0.1567f, 0.0457f, -0.1485f,
+ -0.1177f, 0.0391f, 0.1982f, -0.1288f, 0.1165f, -0.2019f,
+ 0.4550f, 0.5179f, 0.4311f, 0.1861f, 0.6199f, 0.4542f,
+ 0.2034f, 0.1128f, 1.3489f, -0.2525f, -2.1139f, -2.2444f,
+ -2.3679f, -2.3378f, 0.5682f, 0.1348f, 0.3032f, -1.5835f,
+ 0.2883f, 0.1693f, 0.0439f, -1.4556f, 0.3818f, 0.4875f,
+ -1.8899f, 0.2510f, 0.6450f, 0.6082f, 0.5962f, 0.8131f,
+ 12.0281f, 13.3899f, 13.6249f, 15.8068f, -1.5453f, 6.7456f,
+ -6.0877f, 26.2596f, 6.2223f, -0.5922f, 134.1428f, 128.8985f,
+ 128.7538f, 123.0920f, 1.3207f, 18.3069f, 15.7436f, 46.5230f,
+ 24.7455f, 15.0688f, 19.9965f, 34.7236f, 19.7171f, 1.2018f,
+ 49.7274f, 11.8957f, 13.1578f, 14.0451f, 15.3544f, -3.5601f,
+ 1.0048f, 0.9479f, 1.1832f, 2.0635f, -2.9808f, 2.0803f,
+ -7.5815f, 8.4733f, -4.2008f, 0.1217f, 226.5257f, 210.7018f,
+ 211.6235f, 195.2605f, 0.8283f, 1.0977f, 1.4858f, 41.1242f,
+ 1.5822f, 0.8742f, 2.0440f, 33.6213f, 1.6177f, 0.9661f,
+ 65.0014f, 1.4197f, 1.0109f, 1.3153f, 1.5470f, -3.2833f,
+ 2.0858f, 2.0012f, 2.1088f, 2.5593f, -0.9422f, 1.8554f,
+ -6.5378f, 0.6780f, 2.3186f, 0.0506f, 218.3285f, 203.4055f,
+ 204.0362f, 188.7854f, 0.3701f, 2.5257f, 3.5172f, 28.8144f,
+ 2.1511f, 3.4676f, 2.6337f, 28.5113f, 2.4254f, -0.0548f,
+ 59.4511f, 2.0757f, 2.1551f, 2.2271f, 2.5300f, -1.4173f,
+ 91.9240f, 88.2142f, 83.6155f, 82.2482f, -9.2566f, 10.9654f,
+ -2.6974f, 62.6750f, -3.6298f, -0.1245f, 69.6721f, 67.1340f,
+ 66.9162f, 64.1994f, -83.6778f, 76.8107f, 69.7832f, 64.9261f,
+ 68.4901f, 76.3615f, 70.8108f, 63.5435f, 69.1973f, -83.6034f,
+ 24.8275f, 90.1923f, 87.6831f, 82.9783f, 81.8558f, -7.1010f,
+ 95.1656f, 88.3853f, 80.5835f, 79.5990f, -3.0720f, 8.1290f,
+ -0.6151f, 63.6425f, -4.5833f, -0.0063f, 70.1861f, 66.6250f,
+ 66.6148f, 63.0886f, -89.2863f, 74.7684f, 64.8897f, 60.4134f,
+ 62.5241f, 78.7076f, 61.7234f, 60.1688f, 61.9509f, -89.4098f,
+ 30.3361f, 92.9144f, 88.5954f, 79.6336f, 79.2453f, -0.4101f,
+ 0.6287f, 0.8050f, 0.4417f, 0.5419f, 0.5972f, 1.3037f,
+ 0.4316f, -0.0013f, -0.3673f, -0.4952f, 6.1773f, 5.7825f,
+ 6.1705f, 5.3848f, 1.7607f, -0.0152f, -0.2924f, 0.8199f,
+ 1.3326f, 0.7197f, -0.6332f, 1.1127f, 1.0472f, 1.8468f,
+ 3.4419f, 0.8233f, 0.7175f, 0.8514f, 0.6372f, 0.9472f,
+ -0.0813f, -0.0197f, -0.0096f, -0.2015f, 0.1133f, -0.0305f,
+ 0.0578f, 0.1375f, -0.0750f, -0.1702f, 0.1246f, -0.1782f,
+ 0.2017f, 0.0425f, -0.0602f, 0.1837f, 0.1044f, -0.1273f,
+ -0.1431f, 0.0672f, -0.1807f, -0.1045f, -0.1355f, -0.0497f,
+ -0.0561f, -0.0633f, 0.1907f, -0.0777f, 0.1203f, 0.0754f,
+ 0.4079f, 0.2001f, 0.0558f, 0.0622f, 0.2951f, 0.6541f,
+ -0.0068f, 0.1070f, 0.4469f, -0.1266f, -1.3035f, -1.3324f,
+ -1.3612f, -0.9966f, 0.7986f, 0.3192f, -0.5028f, -0.3844f,
+ -0.4079f, 0.6690f, -0.5109f, -0.2719f, -0.4958f, 1.0310f,
+ -0.8044f, 0.1447f, 0.4221f, 0.3194f, 0.3063f, 0.5520f,
+ 0.4667f, -5.7238f, -0.5602f, 12.6339f, -15.1865f, -14.9035f,
+ -3.0726f, 9.5347f, -24.6225f, -2.7086f, 89.8557f, 95.0657f,
+ 93.8693f, 99.1085f, -35.9483f, -18.0363f, -1.6298f, 25.3484f,
+ 39.3975f, -15.3199f, 5.7664f, 17.2367f, 25.2788f, -36.5648f,
+ 29.1426f, 0.3857f, -5.2117f, 0.0533f, 12.1707f, -11.1735f,
+ 0.2673f, 0.0090f, 0.1574f, 0.0904f, 0.0281f, 0.1144f,
+ 0.1123f, -0.0061f, 0.0954f, -0.0094f, -0.4387f, -0.5006f,
+ -0.2560f, -0.2326f, -0.1769f, 0.0465f, 0.1273f, -0.1627f,
+ 0.2987f, -0.3041f, 0.1131f, -0.3620f, 0.0932f, -0.0649f,
+ -0.4597f, 0.2535f, -0.0994f, 0.1390f, 0.1279f, 0.4207f,
+ -39.1159f, -42.6382f, -38.4225f, -31.2301f, -28.2382f, -28.1176f,
+ -9.5822f, 1.1886f, -1.2964f, -0.7908f, 154.9819f, 147.1914f,
+ 147.0482f, 138.7535f, -21.7014f, -35.7117f, -28.8802f, -3.8968f,
+ -21.5007f, -28.2213f, -28.4878f, -3.7558f, -26.8317f, -22.8491f,
+ 50.9464f, -37.0918f, -42.8811f, -39.3079f, -32.1904f, -26.6354f,
+ -72.5346f, -75.5751f, -72.6896f, -71.3671f, -35.3279f, -21.6077f,
+ -5.8259f, 38.7516f, -6.8012f, 0.0172f, 170.0685f, 157.4452f,
+ 158.2334f, 145.0102f, 10.0653f, -45.1775f, -56.4571f, -5.1165f,
+ -75.8980f, -46.8672f, -55.3642f, -6.5631f, -81.0258f, 10.1348f,
+ 55.9786f, -70.8124f, -75.7040f, -73.9831f, -70.8786f, -34.9723f,
+ 88.6239f, 86.5330f, 80.9333f, 79.6833f, -10.0096f, 10.6312f,
+ -4.2350f, 62.6230f, -3.2991f, -0.0843f, 75.8659f, 72.7886f,
+ 72.5301f, 68.8265f, -81.8276f, 70.3025f, 62.9511f, 62.5706f,
+ 69.1842f, 69.3637f, 65.4820f, 65.4357f, 71.5347f, -82.1064f,
+ 24.1925f, 86.2418f, 85.4985f, 80.4091f, 79.5378f, -9.3877f,
+ -7.6594f, -4.9581f, -10.6385f, -20.2307f, -44.2261f, -13.7557f,
+ -4.5344f, 18.1793f, -10.5522f, -1.5878f, 110.3187f, 102.4945f,
+ 102.3305f, 94.1324f, -25.2665f, 9.8172f, -4.4791f, 69.4972f,
+ -6.7571f, 5.8378f, -11.6101f, 70.7066f, -4.9327f, -24.0513f,
+ 41.4598f, -7.0600f, -7.0940f, -10.2478f, -18.9616f, -46.7505f,
+ 90.9365f, 86.0260f, 73.2934f, 69.3406f, 3.3863f, 3.8524f,
+ 0.6536f, 63.2150f, -10.6304f, 0.0291f, 73.0071f, 69.7660f,
+ 69.0457f, 65.5611f, -92.3379f, 74.2756f, 54.5025f, 84.3183f,
+ 53.7481f, 73.5624f, 55.3827f, 82.3242f, 53.5432f, -92.5355f,
+ 25.3457f, 89.1858f, 84.4763f, 72.9840f, 69.1889f, 4.6719f,
+ -0.0129f, 0.1995f, 0.2069f, 0.0358f, 0.1209f, -0.1185f,
+ -0.1217f, -0.1456f, 0.0125f, -0.1354f, 0.0510f, -0.0572f,
+ 0.1397f, 0.1453f, -0.0086f, 0.0107f, 0.0232f, 0.1508f,
+ 0.0884f, -0.0967f, -0.1786f, 0.1361f, -0.1399f, -0.2021f,
+ -0.0242f, -0.2169f, 0.0133f, 0.0116f, -0.1489f, -0.0093f,
+ -0.0796f, 0.1507f, 0.0906f, 0.0228f, -0.0166f, -0.1875f,
+ 0.0471f, 0.1184f, -0.0007f, -0.2732f, -0.1386f, -0.2057f,
+ -0.0213f, -0.1699f, 0.0996f, 0.1562f, 0.1850f, -0.0362f,
+ -0.2059f, 0.0258f, -0.0135f, -0.1276f, 0.0034f, 0.2023f,
+ 0.0857f, -0.0085f, -0.1955f, -0.1666f, -0.0920f, 0.0971f,
+ -0.0292f, -0.0512f, -0.0753f, -0.0739f, -0.0873f, -0.1200f,
+ 0.0220f, -0.1359f, 0.2013f, -0.0445f, 0.1143f, -0.1484f,
+ -0.1556f, -0.0003f, 0.1711f, -0.0724f, -0.0531f, 0.1126f,
+ 0.0476f, -0.0057f, 0.0088f, 0.0792f, -0.0438f, -0.1118f,
+ -0.0244f, 0.0712f, 0.0930f, -0.0203f, 0.1662f, -0.0695f,
+ -12.3872f, -18.7022f, -13.4237f, -1.4731f, -18.6843f, -14.1515f,
+ -7.5057f, 40.2090f, -2.7774f, -1.8433f, 123.6006f, 119.0557f,
+ 118.2758f, 113.6423f, -32.6216f, -19.5865f, -16.2897f, 17.2068f,
+ 6.3559f, -17.8742f, 0.7098f, 11.5970f, -10.1104f, -33.1830f,
+ 39.5617f, -10.5499f, -17.8137f, -14.7185f, -2.6172f, -14.6004f,
+ 0.3893f, 0.4443f, 0.5305f, 0.3049f, 0.8316f, 0.8679f,
+ 0.2265f, 0.2393f, 1.1970f, -0.2891f, -1.8666f, -1.8266f,
+ -1.6984f, -1.8787f, 0.8706f, 0.4208f, 0.5076f, -0.8436f,
+ -0.1623f, 0.8008f, 0.1512f, -1.0839f, -0.3002f, 0.9263f,
+ -1.3031f, 0.5964f, 0.3413f, 0.5551f, 0.2618f, 0.7018f,
+ -0.1320f, -0.1944f, -0.0209f, -0.0877f, 0.0721f, -0.0840f,
+ 0.0589f, 0.1019f, 0.1927f, -0.2011f, -0.1117f, 0.1575f,
+ 0.1080f, -0.0516f, 0.2154f, -0.1231f, 0.0426f, -0.0522f,
+ -0.1824f, -0.1923f, -0.1206f, -0.1724f, -0.0798f, 0.0401f,
+ -0.2170f, 0.0293f, -0.0853f, 0.1517f, 0.2128f, -0.1934f,
+ 0.0406f, 0.0517f, 0.0822f, -0.0150f, 0.0943f, -0.0989f,
+ -0.1802f, -0.1453f, -0.1967f, -0.1797f, 0.1545f, -0.1217f,
+ 0.1755f, -0.1604f, -0.0515f, 0.0509f, 0.0310f, -0.1220f,
+ -0.1770f, -0.0157f, 0.1989f, -0.0069f, 0.1766f, 0.1267f,
+ -0.0517f, -0.0396f, 0.0346f, 0.1946f, 0.1162f, -0.1345f,
+ -106.6179f, -110.5917f, -107.5476f, -108.0601f, -61.1687f, -22.4247f,
+ 2.6632f, 109.5208f, -66.1177f, 0.0062f, 159.9339f, 144.7755f,
+ 145.5032f, 128.9872f, 18.9180f, -75.3569f, -105.0866f, -52.0704f,
+ -119.1299f, -74.7543f, -109.9468f, -59.0682f, -104.5754f, 19.2878f,
+ 67.2573f, -104.8061f, -111.8610f, -106.6751f, -107.3537f, -56.4758f,
+ -0.6967f, -0.8495f, -0.9586f, -1.0461f, 1.4522f, -0.2762f,
+ 28.2828f, 2.9157f, -2.1062f, 0.1566f, -467.2388f, -461.0685f,
+ -459.0092f, -453.8370f, 1.5422f, -0.8186f, -0.4884f, -53.0399f,
+ -2.0255f, -1.1348f, -1.1039f, -50.2489f, -1.4821f, 1.8021f,
+ -258.0319f, -1.0865f, -0.5542f, -1.0443f, -1.2732f, 1.8413f,
+ 0.2377f, 0.1937f, -0.0116f, 0.0935f, -0.0599f, 0.0118f,
+ -0.0875f, 0.0455f, -0.1301f, -0.1081f, -0.2622f, -0.1960f,
+ 0.0393f, -0.1490f, 0.1852f, -0.0964f, -0.0741f, 0.0419f,
+ 0.1162f, -0.0274f, 0.1200f, -0.0333f, -0.1337f, 0.2141f,
+ 0.0664f, 0.1044f, -0.1744f, 0.1060f, -0.1468f, 0.0679f,
+ 0.0218f, 0.0494f, 0.1064f, 0.1363f, 0.0013f, 0.1331f,
+ -0.2095f, 0.2088f, -0.0399f, -0.1811f, 0.0678f, -0.1974f,
+ 0.1855f, -0.0968f, -0.2008f, 0.0162f, -0.0096f, -0.1493f,
+ 0.2170f, -0.1248f, -0.2055f, 0.1276f, -0.0269f, -0.1697f,
+ -0.0662f, 0.1073f, -0.0029f, -0.1051f, -0.1573f, 0.2106f,
+ -0.2020f, -0.1565f, 0.0335f, -0.1818f, -0.1665f, 0.2169f,
+ 0.1974f, -0.1470f, -0.1738f, -0.2038f, 0.0558f, -0.0441f,
+ 0.0065f, -0.1485f, -0.1366f, -0.2131f, 0.1042f, 0.0349f,
+ -0.1804f, -0.1361f, -0.0116f, -0.1012f, -0.0860f, 0.0606f,
+ -0.2077f, 0.1826f, -0.1014f, -0.0721f, -0.1517f, 0.1022f,
+ -0.1110f, -0.0186f, 0.1505f, 0.1797f, 0.0911f, 0.0340f,
+ 0.1702f, -0.1404f, -0.0566f, -0.2744f, -0.1943f, -0.1871f,
+ 0.0046f, 0.0306f, -0.0436f, 0.1625f, -0.1302f, 0.0175f,
+ 0.1570f, -0.1425f, 0.0779f, 0.1398f, 0.0929f, 0.0897f,
+ 0.0458f, -0.0936f, 0.1321f, -0.1355f, 0.0974f, 0.0457f,
+ -73.3516f, -75.0655f, -72.1062f, -72.4624f, -34.8640f, -14.3727f,
+ -4.4720f, 66.4982f, -18.8358f, 0.0397f, 174.2172f, 160.4959f,
+ 161.1034f, 147.3250f, 9.5507f, -45.0180f, -73.1609f, -1.5230f,
+ -74.8677f, -43.8559f, -68.7622f, -4.8971f, -82.1922f, 9.6490f,
+ 64.7115f, -71.8566f, -75.3879f, -72.5479f, -71.7161f, -34.8056f,
+ 0.1442f, 0.1558f, 0.1267f, -0.1261f, -0.0506f, -0.0823f,
+ -0.1807f, -0.0889f, -0.2098f, -0.1295f, -0.2046f, -0.1749f,
+ -0.1197f, -0.1380f, 0.0799f, -0.0889f, -0.1209f, 0.1919f,
+ 0.1947f, -0.2086f, -0.1042f, -0.0468f, 0.0232f, 0.1052f,
+ -0.0535f, 0.1398f, 0.1713f, -0.1522f, 0.1453f, 0.0286f,
+ -64.8503f, -67.6746f, -63.6497f, -60.4614f, -35.6091f, -20.1605f,
+ -3.6082f, 84.2801f, -37.8552f, -2.2371f, 132.4947f, 123.5057f,
+ 123.5776f, 113.9060f, -14.8772f, -40.7130f, -79.1391f, -10.7024f,
+ -65.7831f, -43.6078f, -79.6847f, -13.0743f, -69.2533f, -16.0171f,
+ 50.4868f, -64.3678f, -68.7061f, -64.0823f, -59.3413f, -28.9405f,
+ 77.1601f, 75.4899f, 69.8696f, 67.8764f, -22.7548f, 5.9814f,
+ -3.2826f, 57.9754f, -5.9500f, -0.0014f, 77.2251f, 74.0737f,
+ 73.7004f, 70.5072f, -80.9661f, 69.3065f, 55.8337f, 76.8831f,
+ 57.9902f, 63.4765f, 56.4748f, 70.0282f, 61.0874f, -81.3960f,
+ 26.2594f, 76.0367f, 74.9115f, 69.2361f, 66.9262f, -20.1637f,
+ 0.1886f, -0.1108f, 0.1262f, 0.0189f, 0.1382f, 0.0859f,
+ -0.1874f, -0.1986f, -0.0171f, -0.1400f, -0.2944f, -0.0750f,
+ -0.0395f, -0.2092f, -0.0878f, 0.1216f, -0.0870f, -0.1613f,
+ 0.2495f, 0.0754f, 0.0244f, -0.1205f, -0.0196f, -0.1729f,
+ 0.1170f, 0.1585f, 0.1482f, -0.1705f, -0.1337f, 0.0199f,
+ 13.0897f, 9.1111f, 6.7413f, 6.3907f, -28.1187f, 0.4556f,
+ -5.3116f, 30.7293f, -16.3644f, -0.0365f, 118.9118f, 111.6125f,
+ 111.3227f, 103.4680f, -30.1883f, 8.9328f, -4.1876f, 79.3936f,
+ -9.0522f, 12.7861f, -1.2736f, 78.0446f, -5.9485f, -30.5716f,
+ 27.8951f, 13.9613f, 6.7173f, 5.2345f, 8.3271f, -27.3705f,
+ 1.0488f, 1.0864f, 1.0710f, 1.7332f, -3.0561f, 1.1622f,
+ -7.6688f, 3.0491f, -1.3865f, 0.0769f, 222.5451f, 207.8170f,
+ 208.1767f, 193.1396f, 0.4447f, 2.1654f, 1.8929f, 35.1469f,
+ 1.1783f, 2.6199f, 1.1611f, 26.2989f, 3.4446f, 0.1551f,
+ 65.6529f, 1.2229f, 0.9851f, 1.0241f, 1.4373f, -3.3421f,
+ 0.1388f, 0.0756f, 0.2047f, 0.1140f, 0.0945f, 0.2038f,
+ 0.1038f, -0.2068f, -0.0626f, -0.1937f, 0.1347f, -0.0464f,
+ -0.0866f, 0.0250f, 0.0264f, -0.1556f, -0.1625f, 0.1028f,
+ -0.1255f, -0.0854f, 0.1033f, 0.0008f, -0.2133f, -0.0317f,
+ 0.1725f, -0.1054f, -0.1900f, 0.0383f, 0.0440f, -0.1900f,
+ -30.0811f, -30.9929f, -29.3194f, -26.8347f, -20.5957f, -4.1595f,
+ -1.9066f, 42.4707f, -9.0435f, 0.0064f, 175.7328f, 163.1350f,
+ 163.5085f, 151.1648f, 4.4620f, -20.6011f, -19.3402f, 1.5468f,
+ -32.0920f, -25.4581f, -12.3706f, -2.1636f, -32.4569f, 3.9365f,
+ 61.0117f, -28.4195f, -31.0837f, -30.2749f, -27.5522f, -22.8688f,
+ -0.3000f, 0.0092f, -0.3675f, -0.4113f, 0.0033f, 0.1138f,
+ 0.2182f, -0.5803f, 0.7507f, -0.2529f, -1.7724f, -1.4702f,
+ -1.5805f, -1.4294f, 0.1435f, -0.0168f, 0.2356f, -0.4373f,
+ -0.4500f, -0.4803f, -0.0041f, -0.3878f, 0.1321f, 0.2761f,
+ -1.1975f, -0.3509f, -0.0465f, -0.4050f, -0.1110f, 0.2233f,
+ 0.0950f, 0.0974f, -0.1600f, -0.1753f, -0.0328f, 0.0741f,
+ -0.0706f, 0.1839f, -0.0833f, -0.1367f, -0.1094f, -0.1739f,
+ -0.1069f, 0.0370f, -0.1404f, 0.1631f, -0.1570f, 0.2117f,
+ -0.1891f, 0.0395f, 0.1081f, 0.1760f, 0.0997f, 0.0853f,
+ -0.1018f, 0.1306f, -0.0924f, -0.2078f, 0.0801f, -0.0949f,
+ 0.5803f, 0.5578f, 0.4089f, 0.1912f, 0.6774f, 0.3145f,
+ 0.3992f, -0.1316f, 1.3142f, -0.2457f, -2.3536f, -2.4939f,
+ -2.3165f, -2.4879f, 0.2321f, 0.1901f, 0.1789f, -1.5215f,
+ 0.2645f, 0.2231f, 0.2411f, -1.2361f, 0.2971f, 0.1421f,
+ -1.6715f, 0.3158f, 0.2476f, 0.3596f, 0.3029f, 0.9297f,
+ -88.8401f, -89.5209f, -86.1926f, -87.4196f, -39.6504f, -17.9684f,
+ -4.2702f, 80.2017f, -29.1676f, -0.4190f, 150.2820f, 138.4751f,
+ 139.1087f, 126.6569f, 13.7188f, -57.0739f, -80.3383f, -18.8351f,
+ -87.4103f, -56.0072f, -82.7707f, -23.1871f, -93.6787f, 13.9287f,
+ 59.6213f, -87.4843f, -90.4227f, -86.2635f, -86.6841f, -37.9086f,
+ 0.1184f, -0.2169f, -0.1915f, 0.0543f, 0.1253f, -0.1370f,
+ 0.0836f, -0.1198f, 0.1544f, -0.2004f, -0.1118f, -0.0786f,
+ 0.1517f, -0.1000f, -0.1055f, 0.0936f, -0.1579f, 0.1098f,
+ -0.0234f, -0.0499f, 0.0951f, -0.1711f, 0.0186f, -0.2008f,
+ 0.1777f, 0.1386f, -0.1495f, -0.0684f, -0.2149f, -0.1198f,
+ -0.6205f, -0.7209f, -0.5487f, -0.9080f, 1.3400f, 0.0085f,
+ 28.2837f, 3.2217f, -1.8463f, 0.1620f, -464.3599f, -458.4327f,
+ -455.9967f, -451.0393f, 1.6619f, -0.6944f, -0.3167f, -52.3630f,
+ -1.6971f, -0.7340f, -0.8923f, -49.2771f, -1.1177f, 1.8810f,
+ -258.9386f, -1.0765f, -0.7279f, -0.5208f, -0.8839f, 1.8175f,
+ -78.8510f, -80.5740f, -77.8843f, -77.9798f, -36.5560f, -16.0818f,
+ -5.5362f, 66.4228f, -16.8150f, 0.0036f, 181.8365f, 167.7181f,
+ 168.2344f, 153.9725f, 11.2659f, -47.5786f, -92.6978f, 6.7573f,
+ -68.7704f, -48.3850f, -95.3637f, 8.8888f, -76.9497f, 11.2243f,
+ 60.9020f, -77.6515f, -80.7610f, -78.4537f, -77.4659f, -36.2872f,
+ -0.0936f, 0.1966f, -0.2121f, 0.0193f, 0.0489f, -0.1445f,
+ 0.0060f, 0.0358f, -0.0783f, -0.0985f, -0.2072f, -0.0802f,
+ -0.0185f, 0.1868f, -0.0631f, 0.1260f, -0.0675f, 0.2167f,
+ -0.2174f, -0.1085f, 0.1483f, -0.1655f, -0.1040f, 0.1605f,
+ -0.1673f, -0.0148f, -0.1856f, -0.1454f, 0.1603f, -0.1620f,
+ -0.9205f, -1.2716f, -3.6561f, -5.0834f, -0.7934f, 1.8710f,
+ 2.2999f, -2.9516f, -1.7631f, -0.3804f, 41.2998f, 26.2358f,
+ 28.9763f, 15.7315f, 5.2164f, 3.2963f, -5.4457f, 18.6310f,
+ -25.0076f, 5.4368f, -12.0085f, 17.1462f, -14.6992f, 5.6365f,
+ 48.6207f, -1.0921f, -1.8723f, -3.5354f, -5.1774f, -1.0200f,
+ -0.1065f, -0.2021f, 0.0332f, 0.1692f, -0.1239f, 0.1325f,
+ -0.0660f, -0.0567f, 0.2107f, -0.2084f, -0.0263f, 0.1411f,
+ 0.0178f, 0.0451f, 0.2024f, -0.1756f, -0.0771f, -0.1690f,
+ -0.2097f, -0.2130f, 0.0714f, 0.0172f, -0.0310f, 0.0649f,
+ -0.1550f, 0.0701f, 0.0306f, -0.1750f, -0.1988f, -0.2060f,
+ 0.0005f, -0.1325f, -0.1823f, -0.0900f, -0.1291f, -0.1817f,
+ 0.0144f, 0.0951f, -0.1954f, -0.0171f, -0.1985f, 0.0875f,
+ 0.0901f, -0.0857f, 0.1681f, 0.0465f, 0.1023f, 0.0985f,
+ -0.2152f, -0.1723f, -0.0825f, 0.0203f, -0.1206f, -0.1431f,
+ -0.1552f, 0.1344f, 0.0398f, 0.0169f, 0.2180f, -0.1530f,
+ 2.7964f, 2.7312f, 2.8831f, 3.4729f, -3.1366f, 2.4043f,
+ -7.2004f, 1.4128f, 2.8648f, 0.0578f, 225.5640f, 210.3712f,
+ 210.6907f, 195.0339f, 0.3140f, 1.8060f, 2.7355f, 33.6917f,
+ 3.3542f, 3.3682f, 1.7371f, 31.2424f, 3.4094f, -0.1192f,
+ 63.0864f, 3.0562f, 2.8633f, 2.6777f, 3.5495f, -4.2616f,
+ -1.4034f, 0.3930f, -4.6756f, -9.9870f, -27.8511f, 5.6071f,
+ -1.0862f, 34.4907f, -10.4831f, -0.0281f, 117.2617f, 104.9590f,
+ 106.1515f, 93.9707f, -16.8801f, 5.3036f, -21.7458f, 98.5306f,
+ -20.7596f, 6.4733f, -17.6440f, 98.3097f, -31.9540f, -17.0600f,
+ 27.4543f, -0.6140f, -1.6182f, -4.9167f, -8.9017f, -26.2485f,
+ -0.1952f, -0.0462f, -0.1958f, 0.1679f, -0.1592f, -0.1634f,
+ -0.0507f, -0.0542f, 0.0038f, -0.0343f, 0.0567f, -0.1983f,
+ 0.0250f, -0.0762f, 0.0902f, -0.0343f, 0.1240f, 0.1161f,
+ 0.1237f, 0.1870f, 0.0346f, 0.0340f, 0.0625f, -0.0355f,
+ 0.0278f, -0.1043f, 0.1755f, 0.0253f, 0.1750f, -0.2070f,
+ -5.5531f, -5.3122f, -4.9348f, -4.4782f, -7.5686f, -1.5478f,
+ -5.4341f, 0.5087f, -2.1382f, 0.0798f, 208.3677f, 194.0083f,
+ 194.4168f, 179.3082f, 1.4443f, -1.5038f, -1.4021f, 25.9363f,
+ -4.0635f, -2.6785f, -1.6640f, 22.2589f, -1.4910f, 1.4715f,
+ 59.1972f, -4.9638f, -5.1920f, -4.9193f, -5.2649f, -8.0556f,
+ 20.1226f, 12.0195f, 9.7385f, 10.7058f, -27.4201f, 8.4869f,
+ -5.0826f, 32.9212f, -2.0674f, -0.0290f, 120.5002f, 112.3222f,
+ 112.3287f, 104.1107f, -20.6293f, 14.8534f, -0.8748f, 103.1141f,
+ -1.1368f, 15.3716f, 2.7653f, 91.7285f, -0.5991f, -20.7338f,
+ 35.9363f, 20.5104f, 11.1988f, 9.0368f, 10.6355f, -26.5309f,
+ -0.2058f, -0.2176f, 0.1331f, -0.1415f, -0.0825f, -0.0470f,
+ -0.0615f, 0.1274f, 0.0076f, -0.0575f, -0.2065f, 0.0866f,
+ 0.2166f, -0.1942f, -0.1952f, 0.1323f, -0.1016f, 0.1803f,
+ -0.0424f, 0.1555f, 0.1118f, 0.1559f, 0.0337f, -0.0341f,
+ -0.0430f, 0.1988f, -0.0553f, -0.0255f, 0.1817f, 0.0608f,
+ 0.1431f, 0.0686f, -0.0245f, -0.2107f, 0.2001f, -0.0964f,
+ -0.0090f, 0.1151f, -0.0365f, -0.1986f, 0.1740f, -0.2098f,
+ 0.0013f, 0.1369f, 0.1910f, 0.1801f, -0.2019f, 0.0348f,
+ -0.1175f, 0.0627f, -0.1929f, -0.0099f, 0.1349f, 0.1804f,
+ -0.1071f, -0.1651f, -0.1146f, -0.0259f, 0.1626f, -0.0271f,
+ 0.1393f, 0.1304f, -0.0200f, 0.0924f, -0.0839f, -0.0031f,
+ -0.1311f, 0.0350f, -0.1330f, -0.0911f, 0.1949f, -0.0209f,
+ -0.1883f, 0.0269f, 0.2040f, 0.1552f, 0.1532f, 0.1157f,
+ -0.1102f, -0.1220f, -0.0808f, -0.1050f, 0.1716f, 0.0846f,
+ -0.0180f, -0.1037f, 0.2063f, 0.1237f, 0.1253f, -0.0496f,
+ -0.0183f, 0.0491f, 0.1703f, -0.0824f, -0.0702f, -0.1100f,
+ -0.0965f, 0.0130f, -0.1222f, -0.1081f, 0.0329f, 0.2115f,
+ -0.1438f, 0.0799f, -0.1602f, -0.0330f, 0.0501f, 0.1072f,
+ -0.0744f, -0.1783f, -0.0240f, 0.0777f, -0.1944f, 0.0438f,
+ -0.0033f, -0.1873f, 0.0984f, -0.0318f, 0.0773f, 0.1489f,
+ 0.3966f, 0.4711f, 0.3972f, 0.0623f, 0.5970f, 0.1018f,
+ 0.1375f, -0.1881f, 0.8921f, -0.1854f, -2.1138f, -2.1178f,
+ -1.8295f, -2.1703f, 0.5784f, -0.1937f, -0.0728f, -0.9953f,
+ 0.2442f, -0.4074f, -0.1591f, -1.1660f, 0.4832f, 0.2203f,
+ -1.4957f, 0.1544f, 0.1810f, 0.2275f, 0.4075f, 0.8153f,
+ 0.0715f, 0.0222f, 0.0463f, -0.0201f, 0.0396f, 0.5951f,
+ -0.2779f, -0.0306f, 0.7532f, -0.1596f, -4.1080f, -3.7925f,
+ -3.8522f, -3.2468f, 0.7728f, 0.0188f, -0.1448f, 0.4084f,
+ -0.4666f, -0.1036f, -1.1469f, 0.4243f, 0.2778f, 0.9023f,
+ -3.0216f, 0.0384f, -0.3348f, -0.0314f, -0.2788f, 0.0479f,
+ 139.0773f, 131.6164f, 115.0392f, 111.1817f, 41.7596f, 9.5379f,
+ 1.8542f, 46.9890f, -12.8221f, 0.0241f, 52.9779f, 51.5268f,
+ 50.8060f, 48.7028f, -132.9665f, 118.3478f, 101.1239f, 81.4608f,
+ 75.4251f, 121.0643f, 97.8947f, 86.8911f, 74.5576f, -133.7606f,
+ 29.2657f, 135.8916f, 131.3661f, 114.1687f, 111.0784f, 31.3790f,
+ -0.0807f, -0.0657f, -0.0027f, 0.0410f, 0.0765f, 0.1194f,
+ 0.0953f, -0.0060f, 0.1531f, -0.2339f, 0.1488f, -0.0615f,
+ -0.0579f, 0.0761f, 0.1250f, -0.0469f, 0.1480f, 0.0683f,
+ -0.0049f, 0.1558f, 0.2168f, -0.0736f, 0.1135f, -0.1244f,
+ 0.0725f, -0.1297f, -0.0215f, -0.0412f, -0.1632f, -0.0200f,
+ -0.1346f, -0.1954f, 0.0053f, 0.0151f, 0.1379f, -0.1497f,
+ -0.0102f, -0.0336f, 0.0900f, -0.1706f, -0.0932f, -0.2084f,
+ 0.1242f, -0.2027f, 0.0849f, -0.2139f, -0.2015f, 0.0944f,
+ -0.0984f, 0.2082f, 0.1625f, -0.0227f, -0.1676f, 0.1021f,
+ 0.1516f, 0.0245f, 0.0955f, -0.1488f, -0.0057f, 0.1783f,
+ -0.8568f, -0.8175f, -0.6282f, -1.3107f, 1.5712f, 0.1044f,
+ 28.2289f, 3.0885f, -1.9829f, 0.1600f, -465.9583f, -459.5893f,
+ -457.5055f, -452.7600f, 1.7229f, -0.6620f, -0.1065f, -52.8017f,
+ -2.0293f, -0.8224f, -1.0389f, -49.9049f, -1.2250f, 1.7647f,
+ -259.2465f, -1.0978f, -0.5169f, -0.8721f, -0.8197f, 1.9158f,
+ 16.2234f, 15.8523f, 13.8343f, 9.8509f, -21.4326f, 15.7650f,
+ -6.4451f, 34.8575f, 1.1387f, -0.0223f, 117.7213f, 109.8494f,
+ 109.7624f, 101.8532f, -20.3275f, 16.0812f, 4.9165f, 92.4919f,
+ 4.1615f, 13.8451f, 9.2112f, 97.1580f, -8.7037f, -20.4420f,
+ 27.1105f, 17.4922f, 13.9998f, 12.3888f, 11.4705f, -20.9568f,
+ 0.5457f, 0.5322f, 0.2823f, 0.3581f, 0.5359f, 0.1576f,
+ 0.1969f, -0.0136f, -0.2748f, -0.3168f, -0.3918f, -0.2167f,
+ -0.1797f, -0.1869f, 0.2986f, -0.2116f, -0.4226f, -0.2022f,
+ 0.9452f, 0.5474f, -0.1218f, 0.2067f, -0.1600f, 0.1937f,
+ 0.0808f, 0.4877f, 0.5106f, 0.2626f, 0.5076f, 0.6228f,
+ 0.5124f, 0.4044f, 0.4023f, 0.1222f, 2.5446f, 0.9623f,
+ 24.9875f, 4.7442f, -2.0551f, 0.1642f, -449.9478f, -444.1841f,
+ -442.0153f, -437.1498f, 2.3209f, -0.6986f, -0.3456f, -47.4074f,
+ -1.2374f, -1.0939f, -0.9112f, -41.1851f, -0.5064f, 2.4209f,
+ -263.4446f, -0.0433f, 0.3460f, 0.1475f, 0.3770f, 2.9154f,
+ 0.2032f, 0.1527f, 0.2161f, -0.1981f, 0.1893f, -0.2003f,
+ 0.1734f, 0.1713f, 0.1207f, -0.2073f, -0.1018f, 0.0770f,
+ 0.0728f, 0.1665f, 0.0689f, 0.1884f, -0.1399f, -0.1326f,
+ -0.0518f, -0.1948f, 0.1576f, -0.1835f, 0.1436f, 0.0497f,
+ 0.0883f, -0.1253f, -0.0417f, -0.0507f, -0.1555f, 0.2076f,
+ -2.4080f, 6.1616f, -0.8564f, -13.6773f, -32.7238f, -16.3144f,
+ -1.9828f, 20.5110f, -17.0191f, -1.7154f, 103.6642f, 95.3675f,
+ 95.5662f, 86.9504f, -35.5340f, 19.6681f, -2.4900f, 65.0847f,
+ -15.8119f, 13.7256f, -4.6753f, 63.4713f, -6.5992f, -34.2369f,
+ 41.3959f, -1.5528f, 3.8106f, -0.7762f, -12.3204f, -35.1734f,
+ -83.9509f, -87.4861f, -83.5925f, -81.5047f, -54.1256f, -45.7506f,
+ -13.5325f, -6.0331f, -8.5062f, 0.0261f, 189.9450f, 177.7870f,
+ 178.6945f, 164.9762f, 9.8521f, -68.0619f, -68.6145f, 6.5056f,
+ -55.9651f, -66.9540f, -65.3349f, -2.1954f, -57.2408f, 8.6577f,
+ 60.6966f, -82.1056f, -88.5245f, -83.3057f, -80.7283f, -50.5285f,
+ -0.1397f, 0.1862f, -0.0691f, -0.0906f, 0.1560f, 0.1377f,
+ -0.0066f, -0.0213f, 0.0708f, -0.0386f, -0.0015f, -0.0020f,
+ -0.2122f, 0.0747f, 0.0795f, 0.0229f, 0.1923f, -0.1661f,
+ 0.0895f, 0.1176f, 0.1398f, -0.0443f, 0.0934f, 0.0638f,
+ -0.1924f, 0.0602f, 0.0404f, 0.1597f, 0.1387f, -0.0601f,
+ -28.3967f, -21.8483f, -25.5175f, -29.9252f, 2.0161f, -3.0092f,
+ 7.7435f, 28.2367f, -35.0188f, -0.1578f, 105.0164f, 93.4495f,
+ 94.9134f, 81.0315f, 4.3602f, 8.1303f, -37.7665f, -16.6986f,
+ -40.8902f, 8.2542f, -33.3215f, -2.0457f, -69.0245f, 4.1016f,
+ 47.2770f, -25.8268f, -23.6034f, -26.4339f, -27.8305f, 8.4468f,
+ 13.8742f, 8.3874f, 4.2044f, 1.4619f, -40.2909f, -0.6358f,
+ -0.7982f, 36.1931f, -17.3147f, -0.3348f, 106.8135f, 96.5298f,
+ 97.8829f, 86.9994f, -25.8170f, 15.0652f, -0.9181f, 85.8544f,
+ 2.5475f, 9.8009f, -3.5931f, 89.2017f, -3.7252f, -25.2986f,
+ 22.5505f, 14.0434f, 7.0708f, 4.6646f, 1.5807f, -39.4024f,
+ -0.1436f, 0.0256f, 0.0274f, -0.2126f, 0.0401f, 0.0745f,
+ -0.0379f, -0.0357f, 0.0777f, -0.0709f, -0.1093f, -0.2047f,
+ -0.0713f, -0.0478f, -0.0908f, 0.1963f, 0.1282f, 0.0977f,
+ 0.1304f, 0.2058f, 0.0700f, 0.0518f, 0.0239f, 0.0686f,
+ -0.1909f, 0.0828f, -0.1243f, -0.1920f, 0.1908f, -0.0808f,
+ 90.8028f, 89.2894f, 84.5339f, 83.3491f, -13.3838f, 12.0240f,
+ -3.9443f, 63.0867f, -2.5321f, -0.0099f, 68.9140f, 66.3206f,
+ 66.0278f, 63.1498f, -83.7261f, 74.3448f, 73.4998f, 64.8477f,
+ 69.7701f, 74.5878f, 71.0331f, 63.2116f, 74.3162f, -83.9282f,
+ 20.8163f, 89.6818f, 88.6452f, 83.7338f, 82.9360f, -13.2357f,
+ 0.1299f, -0.1765f, -0.0168f, -0.1372f, -0.1183f, 0.0472f,
+ 0.1312f, 0.0267f, 0.0194f, -0.1593f, 0.0059f, 0.1775f,
+ 0.0668f, -0.1239f, -0.1982f, -0.1415f, -0.1659f, -0.1148f,
+ 0.0136f, 0.0913f, -0.1254f, -0.0357f, 0.0892f, 0.0835f,
+ -0.0554f, 0.1969f, -0.0888f, -0.0623f, -0.0236f, -0.1492f,
+ 0.4196f, 0.3218f, 0.2287f, 0.5095f, 0.7210f, 0.2279f,
+ 0.4523f, -0.1832f, 1.3095f, -0.2041f, -2.1443f, -2.1947f,
+ -1.9292f, -2.1142f, 0.5840f, 0.1018f, 0.1011f, -1.6565f,
+ 0.4325f, 0.0424f, 0.2836f, -1.7183f, 0.2595f, 0.2686f,
+ -1.8784f, 0.3891f, 0.3050f, 0.6195f, 0.2896f, 0.5905f,
+ -5.3024f, -3.2518f, -12.5192f, -29.1732f, 1.6538f, -1.8315f,
+ 9.9788f, 10.5155f, 6.3234f, -0.3460f, 76.9925f, 51.3785f,
+ 55.7120f, 29.0432f, 5.5901f, 25.6578f, -3.9565f, 13.0509f,
+ -106.0371f, 23.2124f, -18.2004f, 8.4618f, -69.3585f, 5.5651f,
+ 80.0565f, -6.4941f, -5.3742f, -14.4209f, -24.1565f, 6.6801f,
+ -22.0585f, -20.9909f, -26.7939f, -29.6890f, -14.5085f, 2.1866f,
+ -4.2608f, 17.3977f, -30.8824f, -0.4017f, 135.6957f, 126.9320f,
+ 127.0044f, 118.1835f, -1.8768f, -0.8629f, -32.0882f, 44.7862f,
+ -23.9174f, 1.6485f, -27.9940f, 51.9078f, -48.5279f, -1.7550f,
+ 49.9230f, -19.9785f, -22.4647f, -27.6911f, -27.3197f, -10.6545f,
+ -0.1922f, -0.1999f, -0.1396f, 0.1065f, 0.0085f, -0.1940f,
+ 0.0351f, 0.1285f, -0.0292f, -0.1296f, 0.1543f, -0.2082f,
+ -0.1758f, 0.0719f, 0.0764f, 0.1394f, -0.0255f, -0.0370f,
+ 0.1615f, -0.0568f, 0.1920f, -0.1631f, 0.0199f, 0.1884f,
+ 0.0693f, 0.1074f, -0.0273f, 0.1540f, 0.0098f, 0.2111f,
+ 0.1805f, -0.0555f, 0.1159f, 0.0469f, 0.1789f, -0.1711f,
+ -0.1304f, 0.1912f, -0.0737f, -0.1408f, 0.1804f, -0.2023f,
+ -0.0467f, -0.1019f, -0.0136f, 0.0691f, 0.1454f, -0.0213f,
+ 0.0929f, -0.0958f, 0.1299f, 0.1137f, 0.1175f, 0.1042f,
+ -0.2081f, -0.0737f, 0.0582f, 0.1640f, 0.2120f, -0.0646f,
+ -0.0326f, 0.1976f, 0.1182f, -0.1365f, -0.1784f, 0.2113f,
+ 0.0469f, 0.0763f, -0.0197f, -0.1902f, 0.1259f, 0.1598f,
+ -0.0180f, -0.1339f, -0.1675f, -0.1884f, -0.1973f, 0.1529f,
+ 0.1160f, 0.2154f, -0.1446f, -0.1395f, 0.0355f, 0.1513f,
+ -0.2086f, -0.1135f, -0.1502f, -0.0018f, 0.0486f, -0.0110f,
+ -0.0843f, -0.0716f, -0.1367f, 0.0753f, 0.0114f, 0.0475f,
+ -0.0632f, 0.2045f, -0.0512f, -0.0906f, -0.1071f, -0.1957f,
+ 0.1361f, 0.1821f, -0.1684f, -0.1383f, 0.1059f, 0.1579f,
+ -0.0064f, -0.1205f, -0.0718f, -0.1323f, -0.0174f, -0.1092f,
+ -0.1915f, 0.1978f, -0.1245f, 0.1297f, -0.1542f, 0.1556f,
+ -0.1752f, 0.0718f, -0.1020f, -0.1970f, 0.0518f, -0.0888f,
+ 0.0541f, -0.1922f, -0.1467f, -0.0653f, -0.1940f, -0.0800f,
+ -0.1096f, -0.0796f, -0.1310f, 0.0191f, -0.1077f, -0.0973f,
+ 0.1566f, 0.0074f, 0.0500f, -0.0415f, -0.2116f, 0.0227f,
+ 0.0895f, 0.1528f, 0.1404f, 0.0467f, 0.0462f, -0.0973f,
+ -0.1669f, 0.0551f, 0.1167f, -0.1470f, -0.0542f, -0.1006f,
+ 0.2104f, 0.1039f, -0.0211f, -0.1726f, -0.0694f, -0.0270f,
+ 0.0277f, -0.0715f, -0.2055f, -0.1502f, -0.1718f, -0.0043f,
+ 0.0174f, 0.1019f, -0.0233f, -0.1518f, -0.1331f, -0.0001f,
+ -0.1483f, -0.2115f, 0.0666f, 0.0014f, 0.1601f, -0.0690f,
+ };
+
+static const float av1_rdcost_model_nn_biases_layer0[NUM_HIDDEN_NODES] = {
+ 0.156824f, 0.f, 0.130013f, 0.084482f, -129.058197f, -15.090252f,
+ -3.859116f, 0.736356f, -81.361557f, -0.001922f, -0.000713f, 0.440181f,
+ 14.982646f, 1.282223f, 2.23122f, 94.26635f, 93.920929f, 0.614672f,
+ 0.f, 0.315858f, 4.746014f, 0.116901f, -35.661354f, -75.148285f,
+ 92.006989f, -14.112332f, 86.673157f, -0.000307f, -0.000544f, 0.f,
+ -7.851313f, 0.505186f, 0.f, 0.f, -111.681091f, -0.937782f,
+ 0.035789f, 0.f, 0.f, -0.00102f, -75.180527f, 0.f,
+ -63.821148f, 79.592392f, 0.085068f, 11.184906f, 1.25406f, 0.f,
+ -29.779242f, -0.181732f, 0.f, 0.425554f, -90.78405f, 0.f,
+ -0.828326f, -81.132179f, 0.f, -2.757063f, 0.f, 0.f,
+ 2.967951f, -4.440599f, 0.f, -5.105355f, 14.734543f, 0.f,
+ 0.f, 0.f, 0.f, 0.295342f, -0.026907f, 133.375412f,
+ -0.000855f, 0.f, -0.875029f, 15.665165f, 0.437296f, 0.321257f,
+ -0.001932f, -4.235782f, -87.187782f, 0.f, -28.84696f, 7.055514f,
+ 0.f, 95.548302f, -0.000425f, 0.38969f, -13.88008f, -27.347931f,
+ 0.f, 0.f, 0.f, -0.000026f, 0.f, 0.f,
+};
+
+static const float
+ av1_rdcost_model_nn_weights_layer1[NUM_HIDDEN_NODES * NUM_OUTPUTS] = {
+ -0.101706f, -0.14411f, -0.139118f, -0.132945f, 118.811302f,
+ 3.137232f, -32.969776f, -4.150725f, 26.263071f, 0.092841f,
+ 0.174125f, -0.028195f, 15.712872f, 17.722702f, 5.666006f,
+ -121.143929f, -131.933731f, -3.000318f, -0.032063f, -0.380065f,
+ -1.660653f, -0.164802f, 7.177527f, 87.759155f, -119.564224f,
+ -98.051651f, -110.581116f, -0.069982f, 0.023906f, 0.183792f,
+ 40.606274f, -0.080804f, -0.053744f, -0.187848f, 157.44313f,
+ -4.820149f, 0.089499f, 0.070232f, -0.043038f, 0.072996f,
+ 93.347313f, 0.225259f, 103.223228f, -110.682541f, 0.14314f,
+ -89.827538f, 6.505952f, -0.076949f, 73.816132f, -0.063416f,
+ -0.23736f, -0.066059f, 116.049599f, 0.120871f, -4.708246f,
+ 107.501671f, -0.206708f, -32.688675f, 0.047608f, -0.105907f,
+ 6.505825f, -75.461891f, -0.160341f, 6.532121f, -84.868111f,
+ -0.065622f, 0.044756f, 0.008672f, 0.017155f, 0.046108f,
+ -0.218818f, -126.507957f, 0.028271f, 0.180625f, -4.707376f,
+ -121.524307f, -0.03853f, -4.103166f, -0.018947f, -95.768463f,
+ 15.941695f, 0.147154f, -102.863029f, -72.521698f, -0.037133f,
+ -138.1492f, 0.210016f, -0.084692f, -68.693665f, -52.523472f,
+ -0.133385f, -0.17438f, 0.008654f, -0.035642f, -0.145202f,
+ 0.211135f,
+ };
+
+static const float av1_rdcost_model_nn_biases_layer1[NUM_OUTPUTS] = {
+ 0.251909f
+};
+
+static const NN_CONFIG av1_rdcost_model_nnconfig = {
+ NUM_FEATURES,
+ NUM_OUTPUTS,
+ NUM_HIDDEN_LAYERS,
+ {
+ NUM_HIDDEN_NODES,
+ },
+ {
+ av1_rdcost_model_nn_weights_layer0,
+ av1_rdcost_model_nn_weights_layer1,
+ },
+ {
+ av1_rdcost_model_nn_biases_layer0,
+ av1_rdcost_model_nn_biases_layer1,
+ },
+};
+
+//------------------------------------------------------------------------------
+
+#undef NUM_FEATURES
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_HIDDEN_NODES
+#undef NUM_OUTPUTS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
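
The header above only stores the trained weights; nothing in it runs the network. A minimal sketch of how the model could be evaluated, assuming the three-argument av1_nn_predict() declared in av1/encoder/ml.h and a caller-assembled 30-entry feature vector (the exact feature ordering is not documented here, and rdcost_model_eval is a hypothetical helper):

#include "av1/encoder/ml.h"
#include "av1/encoder/rate_distortion_model_params.h"

/* Hypothetical helper: run the RD-cost model on one feature vector.
 * The 30 features (22 floats plus two one-hot encoded categories) must be
 * assembled by the caller in the order the model was trained with. */
static float rdcost_model_eval(const float features[30]) {
  float output = 0.0f;
  av1_nn_predict(features, &av1_rdcost_model_nnconfig, &output);
  return output;
}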
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
new file mode 100644
index 000000000..2597fb990
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -0,0 +1,1776 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+
+// Max rate target for 1080P and below encodes under normal circumstances
+// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
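
As a quick check of the formula in the comment: 1920 * 1080 / (16 * 16) = 8100 16x16 blocks, and 8100 * MAX_MB_RATE = 8100 * 250 = 2,025,000 bits, which is exactly the MAXRATE_1080P value defined above.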
+
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#define FRAME_OVERHEAD_BITS 200
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case AOM_BITS_8: name = name##_8; break; \
+ case AOM_BITS_10: name = name##_10; break; \
+ case AOM_BITS_12: name = name##_12; break; \
+ default: \
+ assert(0 && \
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10" \
+ " or AOM_BITS_12"); \
+ name = NULL; \
+ } \
+ } while (0)
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 5000;
+static int kf_low = 400;
+
+// How many times fewer pixels there are to encode given the current scaling.
+// Temporary replacement for rcf_mult and rate_thresh_mult.
+static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) {
+ return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height);
+}
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+ aom_bit_depth_t bit_depth) {
+ int i;
+ const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+
+ // Special case handling to deal with the step from q2.0
+ // down to lossless mode represented by q 1.0.
+ if (minqtarget <= 2.0) return 0;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ if (minqtarget <= av1_convert_qindex_to_q(i, bit_depth)) return i;
+ }
+
+ return QINDEX_RANGE - 1;
+}
+
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+ int *arfgf_high, int *inter, int *rtc,
+ aom_bit_depth_t bit_depth) {
+ int i;
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ const double maxq = av1_convert_qindex_to_q(i, bit_depth);
+ kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+ kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
+ arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+ arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+ inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+ rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+ }
+}
+
+void av1_rc_init_minq_luts(void) {
+ init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+ arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+ inter_minq_8, rtc_minq_8, AOM_BITS_8);
+ init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+ arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+ inter_minq_10, rtc_minq_10, AOM_BITS_10);
+ init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+ arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+ inter_minq_12, rtc_minq_12, AOM_BITS_12);
+}
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
+ // Convert the index to a real Q value (scaled down to match old Q values)
+ switch (bit_depth) {
+ case AOM_BITS_8: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 4.0;
+ case AOM_BITS_10: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 16.0;
+ case AOM_BITS_12: return av1_ac_quant_Q3(qindex, 0, bit_depth) / 64.0;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1.0;
+ }
+}
+
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor, aom_bit_depth_t bit_depth) {
+ const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000;
+
+ assert(correction_factor <= MAX_BPB_FACTOR &&
+ correction_factor >= MIN_BPB_FACTOR);
+
+ // q based adjustment to baseline enumerator
+ enumerator += (int)(enumerator * q) >> 12;
+ return (int)(enumerator * correction_factor / q);
+}
+
+int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+ double correction_factor,
+ aom_bit_depth_t bit_depth) {
+ const int bpm =
+ (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
+ return AOMMAX(FRAME_OVERHEAD_BITS,
+ (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+}
+
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const int min_frame_target =
+ AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+ // Clip the frame target to the minimum setup value.
+ if (cpi->rc.is_src_frame_alt_ref) {
+ // If there is an active ARF at this location use the minimum
+ // bits on this frame even if it is a constructed arf.
+    // The active maximum quantizer ensures that an appropriate
+ // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target;
+ } else if (target < min_frame_target) {
+ target = min_frame_target;
+ }
+
+ // Clip the frame target to the maximum allowed value.
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+
+ return target;
+}
+
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ if (oxcf->rc_max_intra_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ return target;
+}
+
+// Update the buffer level: leaky bucket model.
+static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Non-viewable frames are a special case and are treated as pure overhead.
+ // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME
+ // differently, since it is a no-show frame.
+ if (!cm->show_frame && !rc->is_bwd_ref_frame)
+ rc->bits_off_target -= encoded_frame_size;
+ else
+ rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
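+ // E.g. with avg_frame_bandwidth = 100000 bits, a frame coded at 120000
+ // bits drains the bucket by 20000 bits, while an 80000 bit frame tops it
+ // up by the same amount.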
+
+ // Clip the buffer level to the maximum specified buffer size.
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = rc->bits_off_target;
+}
+
+int av1_rc_get_default_min_gf_interval(int width, int height,
+ double framerate) {
+ // Assume we do not need any constraint lower than 4K 20 fps
+ static const double factor_safe = 3840 * 2160 * 20.0;
+ const double factor = width * height * framerate;
+ const int default_interval =
+ clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+ if (factor <= factor_safe)
+ return default_interval;
+ else
+ return AOMMAX(default_interval,
+ (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+ // Note this logic makes:
+ // 4K24: 5
+ // 4K30: 6
+ // 4K60: 12
+}
+
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+ int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+ interval += (interval & 0x01); // Round to even value
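+ // E.g. at 15 fps this gives AOMMIN(16, 11) = 11, rounded up to 12.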
+#if CONFIG_FIX_GF_LENGTH
+ interval = AOMMAX(FIXED_GF_LENGTH, interval);
+#endif
+ return AOMMAX(interval, min_gf_interval);
+}
+
+void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+ int i;
+
+ if (pass == 0 && oxcf->rc_mode == AOM_CBR) {
+ rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+ rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+ } else {
+ rc->avg_frame_qindex[KEY_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ }
+
+ rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+ rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+
+ rc->buffer_level = rc->starting_buffer_level;
+ rc->bits_off_target = rc->starting_buffer_level;
+
+ rc->rolling_target_bits = rc->avg_frame_bandwidth;
+ rc->rolling_actual_bits = rc->avg_frame_bandwidth;
+ rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
+ rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
+
+ rc->total_actual_bits = 0;
+ rc->total_target_bits = 0;
+ rc->total_target_vs_actual = 0;
+
+ rc->frames_since_key = 8; // Sensible default for first frame.
+ rc->this_key_frame_forced = 0;
+ rc->next_key_frame_forced = 0;
+ rc->source_alt_ref_pending = 0;
+ rc->source_alt_ref_active = 0;
+
+ rc->frames_till_gf_update_due = 0;
+ rc->ni_av_qi = oxcf->worst_allowed_q;
+ rc->ni_tot_qi = 0;
+ rc->ni_frames = 0;
+
+ rc->tot_q = 0.0;
+ rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
+
+ for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ rc->rate_correction_factors[i] = 0.7;
+ }
+ rc->rate_correction_factors[KF_STD] = 1.0;
+ rc->min_gf_interval = oxcf->min_gf_interval;
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, oxcf->init_framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->init_framerate, rc->min_gf_interval);
+ rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+}
+
+int av1_rc_drop_frame(AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ if (!oxcf->drop_frames_water_mark) {
+ return 0;
+ } else {
+ if (rc->buffer_level < 0) {
+ // Always drop if buffer is below 0.
+ return 1;
+ } else {
+ // If buffer is below drop_mark, for now just drop every other frame
+ // (starting with the next frame) until it increases back over drop_mark.
+ int drop_mark =
+ (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100);
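+ // E.g. drop_frames_water_mark = 30 with an optimal_buffer_level of
+ // 600000 bits gives a drop_mark of 180000 bits.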
+ if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
+ --rc->decimation_factor;
+ } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
+ rc->decimation_factor = 1;
+ }
+ if (rc->decimation_factor > 0) {
+ if (rc->decimation_count > 0) {
+ --rc->decimation_count;
+ return 1;
+ } else {
+ rc->decimation_count = rc->decimation_factor;
+ return 0;
+ }
+ } else {
+ rc->decimation_count = 0;
+ return 0;
+ }
+ }
+ }
+}
+
+static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
+ int height) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ double rcf;
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ rcf = rc->rate_correction_factors[KF_STD];
+ } else if (cpi->oxcf.pass == 2) {
+ RATE_FACTOR_LEVEL rf_lvl =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+ rcf = rc->rate_correction_factors[rf_lvl];
+ } else {
+ if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+ !rc->is_src_frame_alt_ref &&
+ (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+ rcf = rc->rate_correction_factors[GF_ARF_STD];
+ else
+ rcf = rc->rate_correction_factors[INTER_NORMAL];
+ }
+ rcf *= resize_rate_factor(cpi, width, height);
+ return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width,
+ int height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Normalize RCF to account for the size-dependent scaling factor.
+ factor /= resize_rate_factor(cpi, width, height);
+
+ factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ rc->rate_correction_factors[KF_STD] = factor;
+ } else if (cpi->oxcf.pass == 2) {
+ RATE_FACTOR_LEVEL rf_lvl =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+ rc->rate_correction_factors[rf_lvl] = factor;
+ } else {
+ if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+ !rc->is_src_frame_alt_ref &&
+ (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+ rc->rate_correction_factors[GF_ARF_STD] = factor;
+ else
+ rc->rate_correction_factors[INTER_NORMAL] = factor;
+ }
+}
+
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
+ int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int correction_factor = 100;
+ double rate_correction_factor =
+ get_rate_correction_factor(cpi, width, height);
+ double adjustment_limit;
+ const int MBs = av1_get_MBs(width, height);
+
+ int projected_size_based_on_q = 0;
+
+ // Do not update the rate factors for arf overlay frames.
+ if (cpi->rc.is_src_frame_alt_ref) return;
+
+ // Clear down mmx registers to allow floating point in what follows
+ aom_clear_system_state();
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+ projected_size_based_on_q =
+ av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q = av1_estimate_bits_at_q(
+ cpi->common.frame_type, cm->base_qindex, MBs, rate_correction_factor,
+ cm->seq_params.bit_depth);
+ }
+ // Work out a size correction factor.
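+ // E.g. a frame predicted at 80000 bits that actually coded to 100000
+ // bits gives a correction factor of 125, i.e. a 25% overshoot.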
+ if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+ correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+ projected_size_based_on_q);
+
+ // Use a more heavily damped adjustment if we have been oscillating on either
+ // side of the target.
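+ // E.g. correction_factor = 125 gives an adjustment_limit of about
+ // 0.25 + 0.5 * log10(1.25) ~= 0.30, so only ~30% of the measured
+ // deviation is applied below.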
+ if (correction_factor > 0) {
+ adjustment_limit =
+ 0.25 + 0.5 * AOMMIN(1, fabs(log10(0.01 * correction_factor)));
+ } else {
+ adjustment_limit = 0.75;
+ }
+
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 110)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 90)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
+
+ if (correction_factor > 102) {
+ // We are not already at the worst allowable quality
+ correction_factor =
+ (int)(100 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ } else if (correction_factor < 99) {
+ // We are not already at the best allowable quality
+ correction_factor =
+ (int)(100 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ set_rate_correction_factor(cpi, rate_correction_factor, width, height);
+}
+
+int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality,
+ int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int q = active_worst_quality;
+ int last_error = INT_MAX;
+ int i, target_bits_per_mb, bits_per_mb_at_this_q;
+ const int MBs = av1_get_MBs(width, height);
+ const double correction_factor =
+ get_rate_correction_factor(cpi, width, height);
+
+ // Calculate required scaling factor based on target frame size and size of
+ // frame produced using previous Q.
+ target_bits_per_mb =
+ (int)((uint64_t)(target_bits_per_frame) << BPER_MB_NORMBITS) / MBs;
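+ // Note target_bits_per_mb is in the same 1/512-bit units as
+ // av1_rc_bits_per_mb(), so the two can be compared directly below.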
+
+ i = active_best_quality;
+
+ do {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ bits_per_mb_at_this_q =
+ (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+ } else {
+ bits_per_mb_at_this_q = (int)av1_rc_bits_per_mb(
+ cm->frame_type, i, correction_factor, cm->seq_params.bit_depth);
+ }
+
+ if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+ if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+ q = i;
+ else
+ q = i - 1;
+
+ break;
+ } else {
+ last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+ }
+ } while (++i <= active_worst_quality);
+
+ // In CBR mode, this makes sure q is between oscillating Qs to prevent
+ // resonance.
+ if (cpi->oxcf.rc_mode == AOM_CBR &&
+ (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+ cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+ q = clamp(q, AOMMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+ AOMMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+ }
+ return q;
+}
+
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+ int *low_motion_minq, int *high_motion_minq) {
+ if (gfu_boost > high) {
+ return low_motion_minq[q];
+ } else if (gfu_boost < low) {
+ return high_motion_minq[q];
+ } else {
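+ // In between, linearly interpolate between the two tables according to
+ // where gfu_boost falls in the [low, high] range (with rounding).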
+ const int gap = high - low;
+ const int offset = high - gfu_boost;
+ const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ return low_motion_minq[q] + adjustment;
+ }
+}
+
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+ aom_bit_depth_t bit_depth) {
+ int *kf_low_motion_minq;
+ int *kf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+ return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+ kf_low_motion_minq, kf_high_motion_minq);
+}
+
+static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+ aom_bit_depth_t bit_depth) {
+ int *arfgf_low_motion_minq;
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+ arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+#if REDUCE_LAST_ALT_BOOST
+static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return arfgf_high_motion_minq[q];
+}
+#endif
+
+static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const unsigned int curr_frame = cpi->common.current_video_frame;
+ int active_worst_quality;
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ active_worst_quality =
+ curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2;
+ } else {
+ if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+ cpi->refresh_alt_ref_frame)) {
+ active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
+ : rc->last_q[INTER_FRAME];
+ } else {
+ active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2
+ : rc->last_q[INTER_FRAME] * 2;
+ }
+ }
+ return AOMMIN(active_worst_quality, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
+ // Adjust active_worst_quality: If buffer is above the optimal/target level,
+ // bring active_worst_quality down depending on fullness of buffer.
+ // If buffer is below the optimal level, let the active_worst_quality go from
+ // ambient Q (at buffer = optimal level) to worst_quality level
+ // (at buffer = critical level).
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *rc = &cpi->rc;
+ // Buffer level below which we push active_worst to worst_quality.
+ int64_t critical_level = rc->optimal_buffer_level >> 3;
+ int64_t buff_lvl_step = 0;
+ int adjustment = 0;
+ int active_worst_quality;
+ int ambient_qp;
+ if (cm->frame_type == KEY_FRAME) return rc->worst_quality;
+ // For ambient_qp we use the minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
+ // for the first few frames following a key frame. These are both initialized
+ // to worst_quality and updated with a (3/4, 1/4) average in postencode_update.
+ // So for the first few frames following a key frame, the qp of that key frame
+ // is weighted into the active_worst_quality setting.
+ ambient_qp = (cm->current_video_frame < 5)
+ ? AOMMIN(rc->avg_frame_qindex[INTER_FRAME],
+ rc->avg_frame_qindex[KEY_FRAME])
+ : rc->avg_frame_qindex[INTER_FRAME];
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
+ if (rc->buffer_level > rc->optimal_buffer_level) {
+ // Adjust down.
+ // Maximum limit for down adjustment, ~30%.
+ int max_adjustment_down = active_worst_quality / 3;
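+ // E.g. an ambient active_worst of 120 allows at most 40 qindex steps of
+ // downward adjustment.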
+ if (max_adjustment_down) {
+ buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
+ max_adjustment_down);
+ if (buff_lvl_step)
+ adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
+ buff_lvl_step);
+ active_worst_quality -= adjustment;
+ }
+ } else if (rc->buffer_level > critical_level) {
+ // Adjust up from ambient Q.
+ if (critical_level) {
+ buff_lvl_step = (rc->optimal_buffer_level - critical_level);
+ if (buff_lvl_step) {
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (rc->optimal_buffer_level - rc->buffer_level) /
+ buff_lvl_step);
+ }
+ active_worst_quality = ambient_qp + adjustment;
+ }
+ } else {
+ // Set to worst_quality if buffer is below critical level.
+ active_worst_quality = rc->worst_quality;
+ }
+ return active_worst_quality;
+}
+
+static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+ int q;
+ int *rtc_minq;
+ const int bit_depth = cm->seq_params.bit_depth;
+ ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
+
+ if (frame_is_intra_only(cm)) {
+ active_best_quality = rc->best_quality;
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) {
+ int qindex = rc->last_boosted_qindex;
+ double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ (last_boosted_q * 0.75), bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (cm->current_video_frame > 0) {
+ // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ active_best_quality =
+ get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ if (cm->current_video_frame > 1) {
+ if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ } else {
+ if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {
+ int qdelta = 0;
+ aom_clear_system_state();
+ qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+ active_worst_quality, 2.0, bit_depth);
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ // Special case code to try and match quality with forced key frames
+ if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+static int get_active_cq_level(const RATE_CONTROL *rc,
+ const AV1EncoderConfig *const oxcf) {
+ static const double cq_adjust_threshold = 0.1;
+ int active_cq_level = oxcf->cq_level;
+ if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) {
+ const double x = (double)rc->total_actual_bits / rc->total_target_bits;
+ if (x < cq_adjust_threshold) {
+ active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
+ }
+ }
+ return active_cq_level;
+}
+
+static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level = get_active_cq_level(rc, oxcf);
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+ int q;
+ int *inter_minq;
+ const int bit_depth = cm->seq_params.bit_depth;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality =
+ get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ q = (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? rc->avg_frame_qindex[INTER_FRAME]
+ : rc->avg_frame_qindex[KEY_FRAME];
+ // For constrained quality don't allow Q less than the cq level
+ if (oxcf->rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ // Constrained quality use slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ (cpi->refresh_alt_ref_frame)
+ ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
+ : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ }
+ } else {
+ if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
+ 0.70, 1.0, 0.85, 1.0 };
+ const int delta_qindex = av1_compute_qdelta(
+ rc, q_val,
+ q_val * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
+ bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ active_best_quality = (cm->current_video_frame > 1)
+ ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]]
+ : inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ {
+ int qdelta = 0;
+ aom_clear_system_state();
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {
+ qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+ active_worst_quality, 2.0, bit_depth);
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ qdelta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 1.75, bit_depth);
+ }
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ if (oxcf->rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
+ static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
+ INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
+ };
+ const AV1_COMMON *const cm = &cpi->common;
+ int qdelta = av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
+ rate_factor_deltas[rf_level],
+ cm->seq_params.bit_depth);
+ return qdelta;
+}
+
+#define STATIC_MOTION_THRESH 95
+static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index, int *arf_q) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ const int cq_level = get_active_cq_level(rc, oxcf);
+ int active_best_quality;
+ int active_worst_quality = cpi->twopass.active_worst_quality;
+ int q;
+ int *inter_minq;
+ const int bit_depth = cm->seq_params.bit_depth;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+#if CUSTOMIZED_GF
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+#else
+ const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame;
+#endif // CUSTOMIZED_GF
+
+ if (frame_is_intra_only(cm)) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) {
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, bit_depth);
+ active_worst_quality =
+ AOMMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ qindex = rc->last_boosted_qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.5, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ // Baseline value derived from cpi->active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(rc, active_worst_quality, bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || is_intrl_arf_boost ||
+ cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+ // For constrained quality don't allow Q less than the cq level
+ if (oxcf->rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+ (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) {
+#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+
+ // Constrained quality use slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+#if REDUCE_LAST_ALT_BOOST
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+ }
+#endif
+ *arf_q = active_best_quality;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ } else {
+ active_best_quality = rc->arf_q;
+ int this_height = gf_group->pyramid_level[gf_group->index];
+ while (this_height < gf_group->pyramid_height) {
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
+ ++this_height;
+ }
+ }
+#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+ } else if (oxcf->rc_mode == AOM_Q) {
+ if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
+ active_best_quality = cq_level;
+ } else {
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ *arf_q = active_best_quality;
+#if REDUCE_LAST_ALT_BOOST
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
+ } else {
+ active_best_quality = rc->arf_q;
+ }
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+ int this_height = gf_group->pyramid_level[gf_group->index];
+ while (this_height < gf_group->pyramid_height) {
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
+ ++this_height;
+ }
+ } else {
+#endif
+ // Modify best quality for second level arfs. For mode AOM_Q this
+ // becomes the baseline frame q.
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ }
+ } else {
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if REDUCE_LAST_ALT_BOOST
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+ int this_height = gf_group->pyramid_level[gf_group->index];
+ while (this_height < gf_group->pyramid_height) {
+ active_best_quality =
+ (active_best_quality + active_worst_quality + 1) / 2;
+ ++this_height;
+ }
+ }
+#endif
+ }
+ } else {
+ if (oxcf->rc_mode == AOM_Q) {
+ active_best_quality = cq_level;
+ } else {
+ active_best_quality = inter_minq[active_worst_quality];
+
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Extension to max or min Q if undershoot or overshoot is outside
+ // the permitted range.
+ if ((cpi->oxcf.rc_mode != AOM_Q) &&
+ (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+ if (frame_is_intra_only(cm) ||
+ (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || is_intrl_arf_boost ||
+ cpi->refresh_alt_ref_frame))) {
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+ active_worst_quality += (cpi->twopass.extend_maxq / 2);
+ } else {
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+ active_worst_quality += cpi->twopass.extend_maxq;
+ }
+ }
+
+ aom_clear_system_state();
+ // Static forced key frames Q restrictions dealt with elsewhere.
+ if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
+ (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+ int qdelta = av1_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+ active_worst_quality);
+ active_worst_quality =
+ AOMMAX(active_worst_quality + qdelta, active_best_quality);
+ }
+
+ // Modify active_best_quality for downscaled normal frames.
+ if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = av1_compute_qdelta_by_rate(
+ rc, cm->frame_type, active_best_quality, 2.0, bit_depth);
+ active_best_quality =
+ AOMMAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ if (oxcf->rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames.
+ } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+ // If static since last kf use better of last boosted and last kf q.
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ } else {
+ q = AOMMIN(rc->last_boosted_qindex,
+ (active_best_quality + active_worst_quality) / 2);
+ }
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > active_worst_quality) {
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ active_worst_quality = q;
+ else
+ q = active_worst_quality;
+ }
+ }
+ q = clamp(q, active_best_quality, active_worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height,
+ int *bottom_index, int *top_index) {
+ int q;
+ if (cpi->oxcf.pass == 0) {
+ if (cpi->oxcf.rc_mode == AOM_CBR)
+ q = rc_pick_q_and_bounds_one_pass_cbr(cpi, width, height, bottom_index,
+ top_index);
+ else
+ q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
+ top_index);
+ } else {
+ assert(cpi->oxcf.pass == 2 && "invalid encode pass");
+
+ GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int arf_q = 0;
+
+ q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index,
+ top_index, &arf_q);
+
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ cpi->rc.arf_q = arf_q;
+ }
+ }
+
+ return q;
+}
+
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+ // For very small rate targets where the fractional adjustment
+ // may be tiny, make sure there is at least a minimum range.
+ const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
+ *frame_under_shoot_limit = AOMMAX(frame_target - tolerance - 200, 0);
+ *frame_over_shoot_limit =
+ AOMMIN(frame_target + tolerance + 200, cpi->rc.max_frame_bandwidth);
+ }
+}
+
+static void rc_set_frame_target(AV1_COMP *cpi, int target, int width,
+ int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ rc->this_frame_target = target;
+
+ // Modify frame size target when down-scaled.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target =
+ (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height));
+
+ // Target rate per SB64 (including partial SB64s).
+ rc->sb64_target_rate =
+ (int)((int64_t)rc->this_frame_target * 64 * 64) / (width * height);
+}
+
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+ // A refresh here means later frames don't refresh unless the user asks.
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->frames_since_golden = 0;
+
+ // Mark the alt ref as done (setting to 0 means no further alt refs pending).
+ rc->source_alt_ref_pending = 0;
+
+ // Set the alternate reference frame active flag
+ rc->source_alt_ref_active = 1;
+}
+
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+#if CUSTOMIZED_GF
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const GF_GROUP *const gf_group = &twopass->gf_group;
+ const int is_intrnl_arf =
+ cpi->oxcf.pass == 2
+ ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+ : cpi->refresh_alt2_ref_frame;
+#else
+ const int is_intnl_arf = cpi->refresh_alt2_ref_frame;
+#endif
+
+ // Update the Golden frame usage counts.
+ // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
+ // only the virtual indices for the reference frame will be
+ // updated and cpi->refresh_golden_frame will still be zero.
+ if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+ // We will not use internal overlay frames to replace the golden frame
+ if (!rc->is_src_frame_ext_arf)
+ // A refresh here means later frames don't refresh unless the user asks.
+ rc->frames_since_golden = 0;
+
+ // If we are not using alt ref in the upcoming group, clear the arf
+ // active flag. In the multi-arf group case, if the index is not 0 then
+ // we are overlaying a mid-group arf, so we should not reset the flag.
+ if (cpi->oxcf.pass == 2) {
+ if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+ rc->source_alt_ref_active = 0;
+ } else if (!rc->source_alt_ref_pending) {
+ rc->source_alt_ref_active = 0;
+ }
+ } else if (!cpi->refresh_alt_ref_frame && !is_intrnl_arf) {
+ rc->frames_since_golden++;
+ }
+}
+
+void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+#if CUSTOMIZED_GF
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const GF_GROUP *const gf_group = &twopass->gf_group;
+ const int is_intrnl_arf =
+ cpi->oxcf.pass == 2
+ ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+ : cpi->refresh_alt2_ref_frame;
+#else
+ const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
+#endif
+
+ const int qindex = cm->base_qindex;
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ av1_cyclic_refresh_postencode(cpi);
+ }
+
+ // Update rate control heuristics
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ // Keep a record of last Q and ambient average Q.
+ if (cm->frame_type == KEY_FRAME) {
+ rc->last_q[KEY_FRAME] = qindex;
+ rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ } else {
+ if (!rc->is_src_frame_alt_ref &&
+ !(cpi->refresh_golden_frame || is_intrnl_arf ||
+ cpi->refresh_alt_ref_frame)) {
+ rc->last_q[INTER_FRAME] = qindex;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ rc->ni_frames++;
+ rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth);
+ rc->avg_q = rc->tot_q / rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ }
+ }
+
+ // Keep record of last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping
+ if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame || is_intrnl_arf ||
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ rc->last_boosted_qindex = qindex;
+ }
+ if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+
+ // Rolling monitors of whether we are over- or under-spending, used to help
+ // regulate min and max Q in two-pass mode.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target =
+ (int)(rc->this_frame_target /
+ resize_rate_factor(cpi, cm->width, cm->height));
+ if (cm->frame_type != KEY_FRAME) {
+ rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+ rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+ }
+
+ // Actual bits spent
+ rc->total_actual_bits += rc->projected_frame_size;
+ // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+ // differently here for rc->avg_frame_bandwidth.
+ rc->total_target_bits +=
+ (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0;
+
+ rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
+
+ if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
+ (cm->frame_type != KEY_FRAME))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+ if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+ // if (cm->current_video_frame == 1 && cm->show_frame)
+ /*
+ rc->this_frame_target =
+ (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width,
+ cm->height));
+ */
+}
+
+void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
+ // Update buffer level with zero size, update frame counters, and return.
+ update_buffer_level(cpi, 0);
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+}
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 1
+
+static int calc_pframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+ static const int af_ratio = 10;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+#if USE_ALTREF_FOR_ONE_PASS
+ target =
+ (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
+ ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
+ (rc->baseline_gf_interval + af_ratio - 1)
+ : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+#else
+ target = rc->avg_frame_bandwidth;
+#endif
+ return av1_rc_clamp_pframe_target_size(cpi, target);
+}
+
+static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+ static const int kf_ratio = 25;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int target = rc->avg_frame_bandwidth * kf_ratio;
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ int altref_enabled = is_altref_enabled(cpi);
+ int sframe_dist = cpi->oxcf.sframe_dist;
+ int sframe_mode = cpi->oxcf.sframe_mode;
+ int sframe_enabled = cpi->oxcf.sframe_enabled;
+ // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+ if (!cpi->refresh_alt_ref_frame &&
+ (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+ cm->frame_type = KEY_FRAME;
+ rc->this_key_frame_forced =
+ cm->current_video_frame != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.key_freq;
+ rc->kf_boost = DEFAULT_KF_BOOST;
+ rc->source_alt_ref_active = 0;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ if (sframe_enabled) {
+ if (altref_enabled) {
+ if (sframe_mode == 1) {
+ // sframe_mode == 1: insert sframe if it matches altref frame.
+
+ if (cm->current_video_frame % sframe_dist == 0 &&
+ cm->frame_type != KEY_FRAME && cm->current_video_frame != 0 &&
+ cpi->refresh_alt_ref_frame) {
+ cm->frame_type = S_FRAME;
+ }
+ } else {
+ // sframe_mode != 1: the sframe will be inserted at the next available
+ // altref frame.
+
+ if (cm->current_video_frame % sframe_dist == 0 &&
+ cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) {
+ rc->sframe_due = 1;
+ }
+
+ if (rc->sframe_due && cpi->refresh_alt_ref_frame) {
+ cm->frame_type = S_FRAME;
+ rc->sframe_due = 0;
+ }
+ }
+ } else {
+ if (cm->current_video_frame % sframe_dist == 0 &&
+ cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) {
+ cm->frame_type = S_FRAME;
+ }
+ }
+ }
+ }
+ if (rc->frames_till_gf_update_due == 0) {
+ rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key) {
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ rc->constrained_gf_group = 1;
+ } else {
+ rc->constrained_gf_group = 0;
+ }
+ cpi->refresh_golden_frame = 1;
+ rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_update_parameters(cpi);
+
+ if (cm->frame_type == KEY_FRAME)
+ target = calc_iframe_target_size_one_pass_vbr(cpi);
+ else
+ target = calc_pframe_target_size_one_pass_vbr(cpi);
+ rc_set_frame_target(cpi, target, cm->width, cm->height);
+}
+
+static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+ const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+ int min_frame_target =
+ AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+ int target;
+
+ if (oxcf->gf_cbr_boost_pct) {
+ const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+ target = cpi->refresh_golden_frame
+ ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+ af_ratio_pct) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100)
+ : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
+
+ if (diff > 0) {
+ // Lower the target bandwidth for this frame.
+ const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+ target -= (target * pct_low) / 200;
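+ // E.g. a buffer 10% below the optimal level (with under_shoot_pct >= 10)
+ // trims the target by 5%; the divide by 200 halves the percentage to damp
+ // the reaction.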
+ } else if (diff < 0) {
+ // Increase the target bandwidth for this frame.
+ const int pct_high =
+ (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+ target += (target * pct_high) / 200;
+ }
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+ target = AOMMIN(target, max_rate);
+ }
+ return AOMMAX(min_frame_target, target);
+}
+
+static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ int target;
+ if (cpi->common.current_video_frame == 0) {
+ target = ((rc->starting_buffer_level / 2) > INT_MAX)
+ ? INT_MAX
+ : (int)(rc->starting_buffer_level / 2);
+ } else {
+ int kf_boost = 32;
+ double framerate = cpi->framerate;
+
+ kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+ if (rc->frames_since_key < framerate / 2) {
+ kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+ }
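+ // E.g. at 30 fps kf_boost starts at AOMMAX(32, 44) = 44 and is scaled
+ // down when the previous key frame was less than half a second ago.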
+ target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+ }
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+ if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+ cm->frame_type = KEY_FRAME;
+ rc->this_key_frame_forced =
+ cm->current_video_frame != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.key_freq;
+ rc->kf_boost = DEFAULT_KF_BOOST;
+ rc->source_alt_ref_active = 0;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ if (rc->frames_till_gf_update_due == 0) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_set_golden_update(cpi);
+ else
+ rc->baseline_gf_interval =
+ (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_update_parameters(cpi);
+
+ if (cm->frame_type == KEY_FRAME)
+ target = calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = calc_pframe_target_size_one_pass_cbr(cpi);
+
+ rc_set_frame_target(cpi, target, cm->width, cm->height);
+ // TODO(afergs): Decide whether to scale up, down, or not at all
+}
+
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth) {
+ int start_index = rc->worst_quality;
+ int target_index = rc->worst_quality;
+ int i;
+
+ // Convert the average q value to an index.
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ start_index = i;
+ if (av1_convert_qindex_to_q(i, bit_depth) >= qstart) break;
+ }
+
+ // Convert the q target to an index
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ target_index = i;
+ if (av1_convert_qindex_to_q(i, bit_depth) >= qtarget) break;
+ }
+
+ return target_index - start_index;
+}
+
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio,
+ aom_bit_depth_t bit_depth) {
+ int target_index = rc->worst_quality;
+ int i;
+
+ // Look up the current projected bits per block for the base index
+ const int base_bits_per_mb =
+ av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
+
+ // Find the target bits per mb based on the base value and given ratio.
+ const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+ // Convert the q target to an index
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ if (av1_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <=
+ target_bits_per_mb) {
+ target_index = i;
+ break;
+ }
+ }
+ return target_index - qindex;
+}
+
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Special case code for 1 pass fixed Q mode tests
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
+ rc->max_gf_interval = FIXED_GF_INTERVAL;
+ rc->min_gf_interval = FIXED_GF_INTERVAL;
+ rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ // Set Maximum gf/arf interval
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ rc->min_gf_interval = oxcf->min_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, cpi->framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ cpi->framerate, rc->min_gf_interval);
+
+ // Extended interval for genuinely static scenes
+ rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+
+ if (is_altref_enabled(cpi)) {
+ if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
+ rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
+ }
+
+ if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+ rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+ }
+}
+
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int vbr_max_bits;
+ const int MBs = av1_get_MBs(width, height);
+
+ rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+ rc->min_frame_bandwidth =
+ (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+
+ rc->min_frame_bandwidth =
+ AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+ // A maximum bitrate for a frame is defined.
+ // The baseline for this aligns with HW implementations that
+ // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+ // per 16x16 MB (averaged over a frame). However, this limit is extended if
+ // a very high rate is given on the command line or the rate cannot
+ // be achieved because of a user-specified max q (e.g. when the user
+ // specifies lossless encoding).
+ vbr_max_bits =
+ (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+ 100);
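+ // E.g. avg_frame_bandwidth = 100000 bits with two_pass_vbrmax_section =
+ // 2000 (i.e. 2000%) gives a vbr_max_bits of 2000000 bits per frame.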
+ rc->max_frame_bandwidth =
+ AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+ av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR...adjustment to the frame target based on error from previous frames
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+ int max_delta;
+ double position_factor = 1.0;
+
+ // How far through the clip are we?
+ // This number is used to damp the per-frame rate correction.
+ // Range: 0 - 1.0
+ if (cpi->twopass.total_stats.count != 0.) {
+ position_factor = sqrt((double)cpi->common.current_video_frame /
+ cpi->twopass.total_stats.count);
+ }
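+ // E.g. halfway through the clip position_factor is sqrt(0.5) ~= 0.71, so
+ // corrections are damped early in the clip and approach full strength at
+ // the end.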
+ max_delta = (int)(position_factor *
+ ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+ // vbr_bits_off_target > 0 means we have extra bits to spend
+ if (vbr_bits_off_target > 0) {
+ *this_frame_target += (vbr_bits_off_target > max_delta)
+ ? max_delta
+ : (int)vbr_bits_off_target;
+ } else {
+ *this_frame_target -= (vbr_bits_off_target < -max_delta)
+ ? max_delta
+ : (int)-vbr_bits_off_target;
+ }
+
+ // Fast redistribution of bits arising from massive local undershoot.
+ // Don't do it for kf, arf, gf or overlay frames.
+ if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+ rc->vbr_bits_off_target_fast) {
+ int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+ int fast_extra_bits;
+ fast_extra_bits = (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits = (int)AOMMIN(
+ fast_extra_bits,
+ AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+ *this_frame_target += (int)fast_extra_bits;
+ rc->vbr_bits_off_target_fast -= fast_extra_bits;
+ }
+}
+
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ // Correction to rate target based on prior over or under shoot.
+ if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ)
+ vbr_rate_correction(cpi, &target_rate);
+ rc_set_frame_target(cpi, target_rate, width, height);
+}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
new file mode 100644
index 000000000..198ecab97
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RATECTRL_H_
+#define AOM_AV1_ENCODER_RATECTRL_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+#define CUSTOMIZED_GF 1
+
+#if CONFIG_FIX_GF_LENGTH
+#define FIXED_GF_LENGTH 16
+#define MAX_PYRAMID_LVL 4
+// We allow a frame to have at most two left/right descendants before changing
+// them into a subtree, i.e., we allow the following structure:
+/* OUT_OF_ORDER_FRAME
+ / / \ \
+(two left children) F F F F (two right children) */
+// Therefore the max gf size supported by a 4-layer structure is
+// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both sides of their parent)
+#define MAX_PYRAMID_SIZE 24
+#define USE_SYMM_MULTI_LAYER 1
+#define REDUCE_LAST_ALT_BOOST 1
+#define REDUCE_LAST_GF_LENGTH 1
+#define MULTI_LVL_BOOST_VBR_CQ 1
+#else
+#define USE_SYMM_MULTI_LAYER 0
+#define REDUCE_LAST_ALT_BOOST 0
+#define REDUCE_LAST_GF_LENGTH 0
+#define MULTI_LVL_BOOST_VBR_CQ 0
+#endif
+
+#if USE_SYMM_MULTI_LAYER
+#define USE_MANUAL_GF4_STRUCT 0
+#endif
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 16
+#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
+
+typedef enum {
+ INTER_NORMAL = 0,
+ INTER_LOW = 1,
+ INTER_HIGH = 2,
+ GF_ARF_LOW = 3,
+ GF_ARF_STD = 4,
+ KF_STD = 5,
+ RATE_FACTOR_LEVELS = 6
+} RATE_FACTOR_LEVEL;
+
+static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+ 1.00, // INTER_NORMAL
+ 0.80, // INTER_LOW
+ 1.50, // INTER_HIGH
+ 1.25, // GF_ARF_LOW
+ 2.00, // GF_ARF_STD
+ 2.00, // KF_STD
+};
+
+typedef struct {
+ int resize_width;
+ int resize_height;
+ uint8_t superres_denom;
+} size_params_type;
+
+typedef struct {
+ // Rate targeting variables
+ int base_frame_target; // A baseline frame target before adjustment
+ // for previous under or over shoot.
+ int this_frame_target; // Actual frame target after rc adjustment.
+ int projected_frame_size;
+ int sb64_target_rate;
+ int last_q[FRAME_TYPES]; // Separate values for Intra/Inter
+ int last_boosted_qindex; // Last boosted GF/KF/ARF q
+ int last_kf_qindex; // Q index of the last key frame coded.
+
+ int gfu_boost;
+ int last_boost;
+ int kf_boost;
+
+ double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ int frames_since_golden;
+ int frames_till_gf_update_due;
+ int min_gf_interval;
+ int max_gf_interval;
+ int static_scene_max_gf_interval;
+ int baseline_gf_interval;
+ int constrained_gf_group;
+ int frames_to_key;
+ int frames_since_key;
+ int this_key_frame_forced;
+ int next_key_frame_forced;
+ int source_alt_ref_pending;
+ int source_alt_ref_active;
+ int is_src_frame_alt_ref;
+ int sframe_due;
+
+ // Length of the bi-predictive frame group interval
+ int bipred_group_interval;
+
+ // NOTE: Different frame types may be allocated different numbers of bits,
+ // aiming to achieve the overall optimal RD performance.
+ int is_bwd_ref_frame;
+ int is_last_bipred_frame;
+ int is_bipred_frame;
+ int is_src_frame_ext_arf;
+
+ int avg_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex[FRAME_TYPES];
+ double tot_q;
+ double avg_q;
+
+ int64_t buffer_level;
+ int64_t bits_off_target;
+ int64_t vbr_bits_off_target;
+ int64_t vbr_bits_off_target_fast;
+
+ int decimation_factor;
+ int decimation_count;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int rate_error_estimate;
+
+ int64_t total_actual_bits;
+ int64_t total_target_bits;
+ int64_t total_target_vs_actual;
+
+ int worst_quality;
+ int best_quality;
+
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+
+ // Rate control history for the last frame (1) and the frame before (2).
+ // -1: undershoot
+ //  1: overshoot
+ //  0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ // Auto frame-scaling variables.
+ int rf_level_maxq[RATE_FACTOR_LEVELS];
+ float_t arf_boost_factor;
+ // Q index used for ALT frame
+ int arf_q;
+} RATE_CONTROL;
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass,
+ RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+ double correction_factor, aom_bit_depth_t bit_depth);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note: av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+// av1_rc_get_one_pass_vbr_params()
+// av1_rc_get_one_pass_cbr_params()
+// av1_rc_get_first_pass_params()
+// av1_rc_get_second_pass_params()
+// depending on the usage, to set the desired rate-control encode parameters.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+// av1_rc_postencode_update()
+// av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_rc_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are the av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
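+// The flow above, condensed into an illustrative pseudo-code sketch (the
+// encode_* functions are referenced here by name only):
+//
+//   av1_rc_get_one_pass_vbr_params(cpi);        // or the cbr / first pass /
+//                                               // second pass variant
+//   encode_frame_to_data_rate(...);             // calls encode_frame() 1+ times
+//   av1_rc_postencode_update(cpi, bytes_used);  // or ..._drop_frame() if dropped
+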
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi);
+void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi);
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int width,
+ int height);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
+ int *bottom_index, int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality,
+ int width, int height);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor, aom_bit_depth_t bit_depth);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+ int target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+ int target);
+// Utility to set frame_target into the RATE_CONTROL structure
+// This function is called only from the av1_rc_get_..._params() functions.
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio,
+ aom_bit_depth_t bit_depth);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RATECTRL_H_
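+
+/*
+ * A standalone sketch (hypothetical numbers, not from the aom sources) of the
+ * BPER_MB_NORMBITS convention declared above: per-MB bit figures are kept in
+ * Q9 fixed point (multiplied by 512), so a whole-frame estimate is obtained
+ * with a right shift by 9.
+ *
+ *   int bits_per_mb_q9 = 1536;  // 3.0 bits per MB in Q9
+ *   int num_mbs = 8160;         // 1920x1088 in 16x16 MBs
+ *   int frame_bits = (int)(((int64_t)bits_per_mb_q9 * num_mbs) >> 9);  // 24480
+ */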
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
new file mode 100644
index 000000000..b87d89e50
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.c
@@ -0,0 +1,1512 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/tokenize.h"
+
+#define RD_THRESH_POW 1.25
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
+};
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][EXT_TX_SIZES] =
+ {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][EXT_TX_SIZES] =
+ {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ { 0, 0, 0, 1 },
+ };
+
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+ EXT_TX_SETS_INTER)] = {
+ {
+ // Intra
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ EXT_TX_SET_DTT4_IDTX,
+ },
+ {
+ // Inter
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ EXT_TX_SET_DCT_IDTX,
+ },
+};
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+ FRAME_CONTEXT *fc) {
+ int i, j;
+
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL);
+
+ if (cm->skip_mode_flag) {
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i],
+ NULL);
+ }
+ }
+
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+ for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL);
+ for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j],
+ fc->uv_mode_cdf[i][j], NULL);
+
+ av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf,
+ NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (av1_filter_intra_allowed_bsize(cm, i))
+ av1_cost_tokens_from_cdf(x->filter_intra_cost[i],
+ fc->filter_intra_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(x->switchable_interp_costs[i],
+ fc->switchable_interp_cdf[i], NULL);
+
+ for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
+ av1_cost_tokens_from_cdf(x->palette_y_size_cost[i],
+ fc->palette_y_size_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i],
+ fc->palette_uv_size_cdf[i], NULL);
+ for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j],
+ fc->palette_y_mode_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i],
+ fc->palette_uv_mode_cdf[i], NULL);
+ }
+
+ for (i = 0; i < PALETTE_SIZES; ++i) {
+ for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(x->palette_y_color_cost[i][j],
+ fc->palette_y_color_index_cdf[i][j], NULL);
+ av1_cost_tokens_from_cdf(x->palette_uv_color_cost[i][j],
+ fc->palette_uv_color_index_cdf[i][j], NULL);
+ }
+ }
+
+ int sign_cost[CFL_JOINT_SIGNS];
+ av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
+ for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+ int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U];
+ int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V];
+ if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) {
+ memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
+ } else {
+ const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL);
+ }
+ if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) {
+ memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v));
+ } else {
+ const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL);
+ }
+ for (int u = 0; u < CFL_ALPHABET_SIZE; u++)
+ cost_u[u] += sign_cost[joint_sign];
+ }
+
+ for (i = 0; i < MAX_TX_CATS; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j],
+ NULL);
+
+ for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->txfm_partition_cost[i],
+ fc->txfm_partition_cdf[i], NULL);
+ }
+
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ av1_cost_tokens_from_cdf(
+ x->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j) {
+ av1_cost_tokens_from_cdf(
+ x->intra_tx_type_costs[s][i][j], fc->intra_ext_tx_cdf[s][i][j],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]);
+ }
+ }
+ }
+ }
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) {
+ av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i],
+ NULL);
+ }
+ av1_cost_tokens_from_cdf(x->switchable_restore_cost,
+ fc->switchable_restore_cdf, NULL);
+ av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf,
+ NULL);
+ av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf,
+ NULL);
+ av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL);
+
+ if (!frame_is_intra_only(cm)) {
+ for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < SINGLE_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(x->single_ref_cost[i][j],
+ fc->single_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i],
+ fc->comp_ref_type_cdf[i], NULL);
+ }
+
+ for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
+ for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j],
+ fc->uni_comp_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < FWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j],
+ NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < BWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j],
+ fc->comp_bwdref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL);
+ }
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i],
+ fc->inter_compound_mode_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i)
+ av1_cost_tokens_from_cdf(x->compound_type_cost[i],
+ fc->compound_type_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (get_interinter_wedge_bits(i)) {
+ av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i],
+ NULL);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i],
+ NULL);
+ av1_cost_tokens_from_cdf(x->interintra_mode_cost[i],
+ fc->interintra_mode_cdf[i], NULL);
+ }
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i],
+ fc->wedge_interintra_cdf[i], NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+ av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i],
+ NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {
+ av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL);
+ }
+ for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i],
+ NULL);
+ }
+ for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i],
+ fc->comp_group_idx_cdf[i], NULL);
+ }
+ }
+}
+
+// Values are now correlated to quantizer.
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+ aom_bit_depth_t bit_depth) {
+ int i;
+ // Initialize the sad lut tables using a formulaic calculation for now.
+ // This is to make it easier to resolve the impact of experimental changes
+ // to the quantizer tables.
+ for (i = 0; i < range; i++) {
+ const double q = av1_convert_qindex_to_q(i, bit_depth);
+ bit16lut[i] = (int)(0.0418 * q + 2.4107);
+ bit4lut[i] = (int)(0.063 * q + 2.742);
+ }
+}
+
+void av1_init_me_luts(void) {
+ init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+ AOM_BITS_8);
+ init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+ AOM_BITS_10);
+ init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+ AOM_BITS_12);
+}
+
+static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
+ 8, 8, 4, 4, 2, 2, 1, 0 };
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
+ 128, 144, 128, 128, 144,
+ // TODO(zoeliu): Further adjust the following factor values.
+ 128, 128, 128,
+ // TODO(weitinglin): We should investigate if the values should be the same
+ // as the value used by OVERLAY frame
+ 144, // INTNL_OVERLAY_UPDATE
+ 128 // INTNL_ARF_UPDATE
+};
+
+int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
+ const int64_t q =
+ av1_dc_quant_Q3(qindex, 0, cpi->common.seq_params.bit_depth);
+ int64_t rdmult = 0;
+ switch (cpi->common.seq_params.bit_depth) {
+ case AOM_BITS_8: rdmult = 88 * q * q / 24; break;
+ case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break;
+ case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
+ const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
+
+ rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;
+ rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
+ }
+ if (rdmult < 1) rdmult = 1;
+ return (int)rdmult;
+}
+
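+/*
+ * A worked instance of the 8-bit branch above, with a hypothetical dequant
+ * value (av1_dc_quant_Q3() returns the DC dequantizer in Q3 units; 64 is just
+ * an example, not a real table entry):
+ *
+ *   int64_t q = 64;
+ *   int64_t rdmult = 88 * q * q / 24;  // 15018
+ *
+ * The two-pass branch then scales rdmult by rd_frame_type_factor[] (>> 7,
+ * i.e. 128 == unity) and adds a boost term scaled by rd_boost_factor[].
+ */
+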
+static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {
+ double q;
+ switch (bit_depth) {
+ case AOM_BITS_8: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_8) / 4.0; break;
+ case AOM_BITS_10: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_10) / 16.0; break;
+ case AOM_BITS_12: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_12) / 64.0; break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ // TODO(debargha): Adjust the function below.
+ return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+}
+
+void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {
+ switch (cpi->common.seq_params.bit_depth) {
+ case AOM_BITS_8:
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
+ break;
+ case AOM_BITS_10:
+ x->sadperbit16 = sad_per_bit16lut_10[qindex];
+ x->sadperbit4 = sad_per_bit4lut_10[qindex];
+ break;
+ case AOM_BITS_12:
+ x->sadperbit16 = sad_per_bit16lut_12[qindex];
+ x->sadperbit4 = sad_per_bit4lut_12[qindex];
+ break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ }
+}
+
+static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {
+ int i, bsize, segment_id;
+
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ const int qindex =
+ clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
+ cm->y_dc_delta_q,
+ 0, MAXQ);
+ const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth);
+
+ for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ // The threshold here seems unnecessarily harsh, but is fine given the
+ // actual range of values used for cpi->sf.thresh_mult[].
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ const int thresh_max = INT_MAX / t;
+
+ for (i = 0; i < MAX_MODES; ++i)
+ rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
+ ? rd->thresh_mult[i] * t / 4
+ : INT_MAX;
+ }
+ }
+}
+
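+/*
+ * A worked instance (made-up inputs) of the threshold formula above. For a
+ * block size whose rd_thresh_block_size_factor[] entry is 32 (i.e. x8, since
+ * that table is stored << 2) and a per-segment q factor of 100:
+ *
+ *   int t = 100 * 32;          // 3200
+ *   int thresh = 500 * t / 4;  // thresh_mult of 500 -> 400000
+ *
+ * The final / 4 undoes the << 2 scaling of rd_thresh_block_size_factor[].
+ */
+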
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx) {
+ (void)ref;
+ (void)ref_mv_idx;
+ x->mvcost = x->mv_cost_stack;
+ x->nmvjointcost = x->nmv_vec_cost;
+}
+
+void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+ const int num_planes) {
+ const int nplanes = AOMMIN(num_planes, PLANE_TYPES);
+ for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane];
+
+ for (int ctx = 0; ctx < 2; ++ctx) {
+ aom_cdf_prob *pcdf;
+ switch (eob_multi_size) {
+ case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break;
+ case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break;
+ case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break;
+ case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break;
+ case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break;
+ case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break;
+ case 6:
+ default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break;
+ }
+ av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL);
+ }
+ }
+ }
+ for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane];
+
+ for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
+ fc->txb_skip_cdf[tx_size][ctx], NULL);
+
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx],
+ fc->coeff_base_eob_cdf[tx_size][plane][ctx],
+ NULL);
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
+ fc->coeff_base_cdf[tx_size][plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
+ fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx],
+ fc->dc_sign_cdf[plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ int br_rate[BR_CDF_SIZE];
+ int prev_cost = 0;
+ int i, j;
+ av1_cost_tokens_from_cdf(br_rate, fc->coeff_br_cdf[tx_size][plane][ctx],
+ NULL);
+ // printf("br_rate: ");
+ // for(j = 0; j < BR_CDF_SIZE; j++)
+ // printf("%4d ", br_rate[j]);
+ // printf("\n");
+ for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) {
+ for (j = 0; j < BR_CDF_SIZE - 1; j++) {
+ pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j];
+ }
+ prev_cost += br_rate[j];
+ }
+ pcost->lps_cost[ctx][i] = prev_cost;
+ // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx);
+ // for (i = 0; i <= COEFF_BASE_RANGE; i++)
+ // printf("%5d ", pcost->lps_cost[ctx][i]);
+ // printf("\n");
+ }
+ }
+ }
+}
+
+void av1_initialize_rd_consts(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ RD_OPT *const rd = &cpi->rd;
+
+ aom_clear_system_state();
+
+ rd->RDMULT = av1_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+
+ set_error_per_bit(x, rd->RDMULT);
+
+ set_block_thresholds(cm, rd);
+
+ if (cm->cur_frame_force_integer_mv) {
+ av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &cm->fc->nmvc,
+ MV_SUBPEL_NONE);
+ } else {
+ av1_build_nmv_cost_table(
+ x->nmv_vec_cost,
+ cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc,
+ cm->allow_high_precision_mv);
+ }
+
+ x->mvcost = x->mv_cost_stack;
+ x->nmvjointcost = x->nmv_vec_cost;
+
+ if (frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
+ cpi->oxcf.pass != 1) {
+ int *dvcost[2] = { &cpi->dv_cost[0][MV_MAX], &cpi->dv_cost[1][MV_MAX] };
+ av1_build_nmv_cost_table(cpi->dv_joint_cost, dvcost, &cm->fc->ndvc,
+ MV_SUBPEL_NONE);
+ }
+
+ if (cpi->oxcf.pass != 1) {
+ for (int i = 0; i < TRANS_TYPES; ++i)
+ // IDENTITY: 1 bit
+ // TRANSLATION: 3 bits
+ // ROTZOOM: 2 bits
+ // AFFINE: 3 bits
+ cpi->gmtype_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0))
+ << AV1_PROB_COST_SHIFT;
+ }
+}
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
+ // NOTE: The tables below must be of the same size.
+
+ // The functions described below are sampled at the four most significant
+ // bits of x^2 + 8 / 256.
+
+ // Normalized rate:
+ // This table models the rate for a Laplacian source with given variance
+ // when quantized with a uniform quantizer with given stepsize. The
+ // closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+ // and H(x) is the binary entropy function.
+ static const int rate_tab_q10[] = {
+ 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
+ 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
+ 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
+ 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
+ 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
+ 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424,
+ 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87,
+ 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6,
+ 5, 3, 2, 1, 1, 1, 0, 0,
+ };
+ // Normalized distortion:
+ // This table models the normalized distortion for a Laplacian source
+ // with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance).
+ // Note the actual distortion is Dn * variance.
+ static const int dist_tab_q10[] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5,
+ 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17,
+ 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54,
+ 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142,
+ 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351,
+ 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659,
+ 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936,
+ 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017,
+ 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+ };
+ static const int xsq_iq_q10[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32,
+ 40, 48, 56, 64, 72, 80, 88, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 256, 288,
+ 320, 352, 384, 416, 448, 480, 544, 608, 672,
+ 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
+ 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
+ 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
+ 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
+ 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
+ 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
+ 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
+ 180192, 196576, 212960, 229344, 245728,
+ };
+ const int tmp = (xsq_q10 >> 2) + 8;
+ const int k = get_msb(tmp) - 3;
+ const int xq = (k << 3) + ((tmp >> k) & 0x7);
+ const int one_q10 = 1 << 10;
+ const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+ const int b_q10 = one_q10 - a_q10;
+ *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+ *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+ unsigned int qstep, int *rate,
+ int64_t *dist) {
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expressions are in:
+ // Hang and Chen, "Source Model for transform video coder and its
+ // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+ // Sys. for Video Tech., April 1997.
+ if (var == 0) {
+ *rate = 0;
+ *dist = 0;
+ } else {
+ int d_q10, r_q10;
+ static const uint32_t MAX_XSQ_Q10 = 245727;
+ const uint64_t xsq_q10_64 =
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
+ const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10);
+ model_rd_norm(xsq_q10, &r_q10, &d_q10);
+ *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT);
+ *dist = (var * (int64_t)d_q10 + 512) >> 10;
+ }
+}
+
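+/*
+ * A quick numeric trace (made-up inputs) of the fixed-point setup above, with
+ * qstep = 32, var = 4096 and n_log2 = 4 (i.e. 16 samples):
+ *
+ *   xsq_q10 = ((32 * 32 << (4 + 10)) + 4096 / 2) / 4096;  // 4096, i.e. 4.0 in Q10
+ *
+ * model_rd_norm() then interpolates rate_tab_q10[] and dist_tab_q10[] at that
+ * normalized x^2; the rate result is rescaled by n_log2 and the distortion by
+ * var.
+ */
+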
+static double interp_cubic(const double *p, double x) {
+ return p[1] + 0.5 * x *
+ (p[2] - p[0] +
+ x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
+ x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
+}
+
+static double interp_bicubic(const double *p, int p_stride, double x,
+ double y) {
+ double q[4];
+ q[0] = interp_cubic(p, x);
+ q[1] = interp_cubic(p + p_stride, x);
+ q[2] = interp_cubic(p + 2 * p_stride, x);
+ q[3] = interp_cubic(p + 3 * p_stride, x);
+ return interp_cubic(q, y);
+}
+
+static const double interp_rgrid_surf[65 * 18] = {
+ 0.104019, 0.245714, 0.293686, 0.358635, 0.382167, 0.412446,
+ 0.419955, 0.421388, 0.426672, 0.427990, 0.428531, 0.456868,
+ 0.569880, 0.638822, 1.016319, 2.143453, 3.565229, 4.720880,
+ 0.124618, 0.294211, 0.352023, 0.429991, 0.458206, 0.494510,
+ 0.503513, 0.505232, 0.511566, 0.513234, 0.519365, 0.570225,
+ 0.697373, 0.840624, 1.462198, 3.289054, 6.256517, 6.852788,
+ 0.118630, 0.269669, 0.346620, 0.430999, 0.459385, 0.495783,
+ 0.504808, 0.506532, 0.512884, 0.514988, 0.543437, 0.662772,
+ 0.795876, 1.313596, 2.403841, 4.163098, 7.440589, 8.616275,
+ 0.093329, 0.168205, 0.321320, 0.430607, 0.459385, 0.495783,
+ 0.504813, 0.506548, 0.512975, 0.520662, 0.571659, 0.701841,
+ 1.010727, 2.138851, 3.460626, 6.317955, 10.098127, 14.418553,
+ 0.087021, 0.142905, 0.315011, 0.430509, 0.459385, 0.495787,
+ 0.505075, 0.507599, 0.513584, 0.543182, 0.669941, 0.825620,
+ 1.362800, 2.572187, 4.205047, 7.498399, 12.303118, 16.641735,
+ 0.086923, 0.142513, 0.314913, 0.430508, 0.459385, 0.495803,
+ 0.506126, 0.511816, 0.514810, 0.549705, 0.725350, 1.127334,
+ 2.168597, 3.463686, 6.318605, 10.162284, 18.556041, 19.847042,
+ 0.086923, 0.142513, 0.314913, 0.430506, 0.459376, 0.495805,
+ 0.506388, 0.512954, 0.520772, 0.580215, 0.810474, 1.391548,
+ 2.579442, 4.205160, 7.498399, 12.381597, 21.703618, 24.015457,
+ 0.086923, 0.142513, 0.314911, 0.430353, 0.458765, 0.495652,
+ 0.506391, 0.513406, 0.544098, 0.702950, 1.121860, 2.168961,
+ 3.463798, 6.318607, 10.162284, 18.685361, 28.188192, 37.638872,
+ 0.086923, 0.142513, 0.314901, 0.429742, 0.456313, 0.495045,
+ 0.506484, 0.519195, 0.580104, 0.810126, 1.391462, 2.579441,
+ 4.205160, 7.498399, 12.381597, 21.848607, 33.367199, 42.623190,
+ 0.086923, 0.142513, 0.314899, 0.429589, 0.455706, 0.495155,
+ 0.507882, 0.542426, 0.702360, 1.119921, 2.168478, 3.463791,
+ 6.318607, 10.162284, 18.685361, 28.345760, 47.802028, 49.163533,
+ 0.086924, 0.142548, 0.315086, 0.429842, 0.455870, 0.496336,
+ 0.512412, 0.556953, 0.773373, 1.266396, 2.548277, 4.204676,
+ 7.498399, 12.381597, 21.848607, 33.548250, 54.301011, 56.262859,
+ 0.087067, 0.144957, 0.327436, 0.446616, 0.466362, 0.505706,
+ 0.522077, 0.610747, 0.972543, 1.666916, 3.338812, 6.316669,
+ 10.162284, 18.685361, 28.345760, 48.065311, 66.145302, 78.396020,
+ 0.094295, 0.164235, 0.393722, 0.534219, 0.530922, 0.579308,
+ 0.603889, 0.760870, 1.229961, 2.423214, 4.173513, 7.497916,
+ 12.381597, 21.848607, 33.548250, 54.589585, 74.875848, 86.468182,
+ 0.124096, 0.213005, 0.497188, 0.665176, 0.685973, 0.800200,
+ 0.911394, 1.077971, 1.677290, 3.332129, 6.314960, 10.162257,
+ 18.685361, 28.345760, 48.065311, 66.453506, 98.275189, 96.862588,
+ 0.140999, 0.270140, 0.658212, 0.867661, 0.970183, 1.149516,
+ 1.480599, 1.664833, 2.421893, 3.857981, 7.418830, 12.380371,
+ 21.848607, 33.548250, 54.589585, 75.188867, 106.657971, 99.762997,
+ 0.178353, 0.398001, 0.988462, 1.241473, 1.340967, 1.713568,
+ 2.335030, 2.701432, 3.348532, 5.077158, 9.829903, 18.676528,
+ 28.345700, 48.065311, 66.453506, 98.588283, 117.057193, 101.130722,
+ 0.281079, 0.548300, 1.395825, 1.780770, 2.000508, 2.702964,
+ 3.638454, 4.573843, 5.051641, 7.079129, 11.293332, 21.594861,
+ 33.544335, 54.589585, 75.188867, 106.971065, 119.957601, 101.466632,
+ 0.476762, 0.842189, 2.019678, 2.723895, 3.188467, 4.011610,
+ 5.545111, 7.508984, 8.176339, 9.774504, 14.720782, 27.334416,
+ 48.049609, 66.453506, 98.588283, 117.370357, 121.329855, 101.509242,
+ 0.993999, 1.520111, 3.013605, 4.203530, 4.982992, 6.074944,
+ 8.583581, 11.818375, 14.192544, 14.937517, 21.258160, 33.305953,
+ 54.585735, 75.188867, 106.971135, 120.279824, 121.976055, 102.690130,
+ 1.776487, 2.613655, 4.356487, 6.161726, 7.622196, 9.464193,
+ 13.077233, 18.051656, 23.221051, 24.080068, 30.085038, 48.345269,
+ 66.457698, 98.588353, 117.379415, 121.976128, 124.356210, 107.713202,
+ 3.191085, 4.495201, 5.686033, 8.365566, 11.275339, 14.706437,
+ 20.300969, 28.152237, 35.688355, 39.341382, 41.030743, 55.752262,
+ 75.211764, 106.980285, 120.608403, 124.680746, 130.222528, 112.260098,
+ 6.136611, 7.305215, 7.272532, 10.646713, 15.630815, 22.383168,
+ 31.349131, 42.419822, 52.301680, 58.983454, 58.915405, 69.161305,
+ 98.992460, 117.713855, 124.344836, 130.623638, 138.442401, 127.846670,
+ 11.707980, 13.490761, 11.640845, 14.176132, 22.131124, 33.776462,
+ 47.365711, 61.603834, 75.281056, 83.463985, 85.510533, 86.026513,
+ 108.787480, 123.031136, 130.607284, 138.954406, 160.867784, 158.958882,
+ 27.062874, 32.195139, 24.147297, 22.114632, 35.580506, 52.551674,
+ 71.652956, 88.606776, 102.107193, 110.703186, 114.398733, 111.118539,
+ 121.503578, 132.455924, 139.490806, 161.412674, 193.563210, 172.203945,
+ 35.625692, 47.953028, 42.639820, 42.276254, 58.815664, 84.977282,
+ 110.656412, 126.168446, 134.658126, 140.604482, 144.006012, 141.702382,
+ 140.125323, 153.122630, 164.748041, 194.156197, 206.854650, 174.013079,
+ 49.516447, 65.335381, 71.738306, 81.872819, 98.400740, 136.840488,
+ 163.775802, 169.440078, 172.747876, 171.222919, 171.679604, 172.173550,
+ 168.200129, 187.617133, 199.683394, 207.768200, 210.062520, 175.478356,
+ 60.341673, 92.487135, 119.907299, 136.068010, 144.778950, 189.443534,
+ 220.120077, 219.641635, 214.616503, 205.894657, 198.453924, 200.013069,
+ 195.938103, 206.118661, 210.447375, 212.061379, 216.078218, 181.162805,
+ 78.422159, 112.242899, 158.416312, 181.404320, 193.188690, 229.296967,
+ 270.461799, 275.168977, 256.511701, 244.706786, 231.344608, 226.065087,
+ 222.248618, 218.662324, 217.966722, 218.248574, 218.818588, 182.740573,
+ 88.713664, 123.594164, 172.928179, 213.781414, 245.800351, 252.063414,
+ 313.283141, 331.703831, 305.866639, 285.177142, 269.759635, 251.988739,
+ 245.998388, 232.688076, 230.588702, 230.882657, 230.319053, 192.120741,
+ 102.540561, 152.905927, 189.137131, 241.806756, 273.868497, 284.258017,
+ 339.689853, 373.561104, 362.657463, 326.291984, 311.922687, 290.460189,
+ 276.774381, 273.012072, 277.751792, 279.123748, 278.820447, 233.813798,
+ 132.983118, 176.307242, 197.415684, 243.307787, 280.893995, 332.922370,
+ 340.329043, 404.530166, 419.475405, 375.775209, 351.300889, 340.042759,
+ 315.683832, 306.123530, 306.359319, 306.733063, 307.609556, 261.647847,
+ 149.579109, 185.925581, 207.937033, 245.159084, 301.890957, 350.040480,
+ 352.250771, 418.742329, 458.112686, 430.125208, 386.460441, 380.346839,
+ 354.679150, 337.305620, 334.504124, 335.889932, 341.060725, 286.898578,
+ 153.576812, 202.105624, 219.366967, 248.524506, 314.255692, 350.607526,
+ 390.567688, 408.629209, 488.000213, 480.563823, 432.461799, 410.412624,
+ 398.607371, 400.188740, 402.780916, 408.853470, 430.449735, 363.777088,
+ 161.353129, 214.848904, 231.549852, 258.536466, 313.163177, 368.140577,
+ 412.136393, 413.409032, 499.838438, 519.571063, 485.833867, 444.562715,
+ 435.738129, 442.358549, 450.166531, 453.208524, 458.424358, 385.823139,
+ 175.109034, 227.608058, 250.069563, 286.101747, 312.256740, 378.421485,
+ 413.344147, 435.058646, 476.960941, 542.448886, 530.189154, 495.408402,
+ 475.326752, 465.017144, 464.694045, 465.144689, 466.905382, 398.669138,
+ 184.750180, 240.766694, 283.240772, 305.480150, 322.409001, 374.526162,
+ 427.141326, 452.840323, 472.604139, 545.366105, 567.676694, 541.666203,
+ 509.591873, 492.044219, 492.778569, 493.765684, 493.235693, 413.684325,
+ 194.728357, 254.928927, 289.991157, 300.193195, 324.194589, 371.563147,
+ 439.226438, 468.295088, 495.654854, 533.506353, 587.476353, 578.298989,
+ 548.041942, 527.393885, 538.965146, 545.070442, 544.295454, 454.012211,
+ 205.195287, 283.135677, 297.921431, 319.295927, 355.621830, 392.466463,
+ 446.696167, 485.053519, 516.426615, 532.264584, 588.481600, 615.906737,
+ 589.319634, 555.754316, 558.389367, 569.094521, 569.779764, 475.384946,
+ 218.552054, 298.511016, 319.188338, 351.781666, 372.789510, 412.827434,
+ 464.569387, 506.270203, 533.049810, 553.347364, 580.644599, 632.759854,
+ 622.235843, 569.960552, 580.799340, 586.553714, 579.488366, 491.826482,
+ 244.803348, 299.790203, 324.187975, 363.280782, 403.710443, 441.724083,
+ 492.732682, 534.722691, 552.193622, 575.112647, 586.097705, 635.224970,
+ 644.642944, 606.017786, 640.321218, 642.316989, 616.397020, 548.300111,
+ 256.957358, 318.638991, 355.063346, 389.889307, 433.607315, 468.209001,
+ 515.178157, 573.556591, 578.113115, 587.246475, 601.762801, 638.454644,
+ 656.574853, 641.184609, 676.908189, 684.198162, 678.387412, 574.805864,
+ 251.211502, 323.448532, 364.227424, 411.792704, 462.226488, 503.572288,
+ 549.299249, 599.124071, 601.227977, 597.118176, 613.247552, 633.278532,
+ 658.074755, 664.930719, 685.731531, 693.632845, 693.076350, 578.326477,
+ 267.695377, 354.273736, 389.976833, 438.518178, 493.332686, 544.343027,
+ 588.895829, 620.206193, 628.327410, 606.067827, 620.998532, 657.985256,
+ 683.936059, 691.345257, 693.894723, 695.175306, 693.618786, 578.517148,
+ 274.290725, 363.465288, 411.808596, 463.369805, 515.310226, 581.009306,
+ 613.070738, 636.638714, 647.333929, 629.867603, 644.646319, 687.796202,
+ 702.859596, 713.495479, 704.068069, 704.991807, 704.188594, 587.283658,
+ 302.538449, 389.174737, 438.518422, 493.398902, 547.662399, 601.981814,
+ 624.773046, 641.629484, 644.699451, 645.848784, 668.033340, 703.643523,
+ 707.422408, 717.329600, 726.298973, 744.127507, 745.365167, 617.954068,
+ 310.328188, 410.984766, 463.369805, 515.315010, 581.309832, 613.787792,
+ 634.988538, 654.145284, 662.632978, 668.413496, 706.494057, 750.545471,
+ 730.724808, 730.002100, 743.625262, 750.801609, 745.308457, 606.505800,
+ 329.948756, 437.600191, 493.398902, 547.661910, 601.917884, 622.557745,
+ 633.244395, 644.055898, 648.224221, 665.062911, 763.555733, 812.391078,
+ 769.063582, 744.865168, 727.579796, 724.950408, 722.179707, 598.564510,
+ 350.848328, 462.437458, 515.315010, 581.309823, 613.779123, 634.465309,
+ 652.056257, 662.179143, 671.466297, 726.881256, 819.824030, 880.232789,
+ 810.371672, 754.246481, 725.053473, 724.253390, 723.503395, 603.394909,
+ 373.704088, 492.408266, 547.661910, 601.917884, 622.557620, 633.236320,
+ 644.023513, 648.232514, 666.381639, 785.498283, 929.441612, 999.772800,
+ 890.339033, 775.852504, 731.840181, 726.905100, 725.251844, 604.899901,
+ 394.473422, 514.261306, 581.309823, 613.779123, 634.465309, 652.056257,
+ 662.179143, 671.466557, 727.134512, 835.764144, 981.747089, 1018.462934,
+ 939.686967, 811.276731, 739.398459, 727.365647, 725.285425, 604.923525,
+ 419.976505, 546.538939, 601.917884, 622.557620, 633.236320, 644.023513,
+ 648.232514, 666.381639, 785.545191, 932.841398, 1036.609617, 1026.945092,
+ 963.822765, 840.827315, 755.532423, 730.241865, 725.366847, 604.924155,
+ 437.281359, 580.116337, 613.779123, 634.465309, 652.056257, 662.179143,
+ 671.466557, 727.134512, 835.764859, 981.996194, 1031.896881, 1002.544732,
+ 881.157178, 828.151494, 799.340975, 751.314325, 728.316587, 605.005504,
+ 464.713920, 600.649281, 622.557620, 633.236320, 644.023513, 648.232514,
+ 666.381639, 785.545191, 932.841398, 1036.735329, 1035.037004, 995.478339,
+ 858.093733, 823.471976, 819.881754, 798.749289, 749.440463, 607.955244,
+ 495.880237, 612.473139, 634.465309, 652.056257, 662.179143, 671.466557,
+ 727.134512, 835.764859, 981.996194, 1032.339788, 1031.105117, 995.303259,
+ 857.733663, 823.435877, 822.822791, 819.873050, 796.882480, 629.038445,
+ 510.391280, 621.158273, 633.236320, 644.023513, 648.232514, 666.381639,
+ 785.545191, 932.841398, 1036.735329, 1035.566013, 1029.599350, 994.926093,
+ 857.645648, 823.435143, 822.904139, 822.822791, 817.965681, 673.856962,
+ 514.588176, 632.947715, 652.056257, 662.179143, 671.466557, 727.134512,
+ 835.764859, 981.996194, 1032.339788, 1031.547475, 1023.835377, 972.158629,
+ 851.968626, 823.347128, 822.904770, 822.904139, 820.752301, 684.418900,
+ 520.013294, 631.668183, 644.023513, 648.232514, 666.381639, 785.545191,
+ 932.841398, 1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721,
+ 829.201546, 822.994150, 822.904770, 822.904770, 820.792975, 684.582020,
+ 531.253628, 650.479606, 662.179143, 671.466557, 727.134512, 835.764859,
+ 981.996194, 1032.339788, 1031.636855, 1029.601779, 995.366703, 858.086641,
+ 823.524524, 822.906135, 822.904770, 822.904770, 820.792975, 684.582020,
+ 528.531744, 642.424501, 648.232514, 666.381639, 785.545191, 932.841398,
+ 1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687, 857.733663,
+ 823.436508, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 545.401164, 660.550678, 671.508859, 727.304161, 835.807162, 981.996850,
+ 1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709, 857.645648,
+ 823.435143, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 537.684760, 646.650947, 669.110131, 796.487512, 935.569890, 1036.777631,
+ 1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629, 851.968626,
+ 823.347128, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 552.408370, 670.001885, 738.246482, 879.690154, 992.939171, 1032.509436,
+ 1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721, 829.201546,
+ 822.994150, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 539.835902, 667.496388, 799.216004, 946.512211, 1039.506123, 1035.609680,
+ 1030.219103, 1030.107964, 1029.577207, 995.366703, 858.086641, 823.524524,
+ 822.906135, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 558.362529, 734.277451, 877.197218, 990.478243, 1029.908393, 1028.993978,
+ 1027.488620, 1027.464048, 1026.933674, 992.724534, 855.532488, 821.323349,
+ 820.792975, 820.792975, 820.792975, 820.792975, 818.686600, 682.825198,
+ 453.127195, 649.075095, 780.278390, 867.165890, 862.469711, 857.067460,
+ 856.956321, 856.955937, 856.513579, 827.981461, 713.556496, 685.024378,
+ 684.582020, 684.582020, 684.582020, 684.582020, 682.825198, 569.510056,
+};
+
+static const double interp_dgrid_surf[65 * 18] = {
+ 10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583,
+ 12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431,
+ 12.092165, 11.602421, 11.141559, 8.864495, 12.770003, 14.634889, 14.437149,
+ 14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265,
+ 14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168,
+ 10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600,
+ 14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798,
+ 14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726,
+ 14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361,
+ 14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657,
+ 12.176082, 9.228999, 12.979992, 15.382918, 14.651428, 14.238693, 14.239028,
+ 14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189,
+ 14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683, 12.980449,
+ 15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998,
+ 14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359,
+ 12.201859, 10.891931, 8.482221, 12.980449, 15.384750, 14.651886, 14.238801,
+ 14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647,
+ 14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672,
+ 12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211,
+ 14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358,
+ 12.201859, 10.911285, 9.730570, 6.696921, 12.980449, 15.384750, 14.652393,
+ 14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605,
+ 14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398,
+ 6.215460, 12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035,
+ 14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358,
+ 12.201859, 10.911285, 9.747361, 7.779960, 5.617541, 12.980448, 15.384731,
+ 14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044,
+ 14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817,
+ 7.210003, 5.164575, 12.980446, 15.383448, 14.647073, 14.277541, 14.403813,
+ 14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118,
+ 12.201859, 10.911285, 9.747361, 7.790897, 6.322998, 3.931551, 12.981550,
+ 15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111,
+ 14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817,
+ 7.219566, 5.781392, 3.486081, 12.991899, 15.376201, 14.579444, 14.296898,
+ 14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579,
+ 12.201867, 10.911285, 9.747361, 7.790897, 6.331506, 4.480348, 2.923138,
+ 13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174,
+ 14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817,
+ 7.219566, 5.789642, 4.018194, 2.766222, 13.028558, 15.315782, 14.439141,
+ 14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897,
+ 12.274375, 10.912967, 9.747371, 7.790897, 6.331506, 4.488594, 3.454993,
+ 2.692682, 12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072,
+ 13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409,
+ 7.219566, 5.789642, 4.026440, 3.298077, 2.674624, 12.945493, 15.276596,
+ 14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525,
+ 12.288592, 11.511693, 9.900227, 7.793270, 6.331506, 4.488594, 3.463236,
+ 3.224318, 2.672433, 12.757570, 15.056661, 14.095011, 13.722362, 13.812624,
+ 13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860,
+ 7.220151, 5.789642, 4.026437, 3.305882, 3.191260, 2.615317, 12.581293,
+ 14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229,
+ 10.936895, 10.619912, 9.634779, 7.763570, 6.331082, 4.488590, 3.462798,
+ 3.216460, 3.076315, 2.373499, 12.283499, 14.455760, 13.890593, 13.427587,
+ 13.183783, 12.763833, 11.861006, 10.740618, 9.820756, 9.354945, 8.669862,
+ 7.123268, 5.787860, 4.025994, 3.290000, 3.084410, 2.810905, 2.222916,
+ 12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265,
+ 9.631040, 8.594396, 8.003736, 7.561587, 6.274418, 4.466637, 3.446574,
+ 3.102467, 2.816989, 2.598688, 1.951541, 11.581477, 13.831132, 13.632027,
+ 13.380414, 12.807880, 11.665651, 10.218236, 8.562237, 7.222614, 6.611808,
+ 6.261676, 5.402793, 3.938544, 3.174375, 2.818166, 2.602758, 2.213911,
+ 1.434763, 11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854,
+ 9.109699, 7.421701, 5.965603, 5.272129, 4.991435, 4.423000, 3.369988,
+ 2.800371, 2.593901, 2.217431, 1.670917, 1.215265, 10.641194, 11.766277,
+ 10.777082, 10.972917, 10.689298, 9.701545, 7.719947, 6.145654, 4.872442,
+ 4.099600, 3.880934, 3.514159, 2.786474, 2.368963, 2.162376, 1.673670,
+ 1.450770, 1.185424, 10.071964, 11.107701, 9.172361, 8.551313, 8.412080,
+ 7.641397, 6.174246, 4.853916, 3.904549, 3.246810, 2.959903, 2.785066,
+ 2.240001, 1.793166, 1.585520, 1.449824, 1.405368, 1.168856, 9.213182,
+ 9.173278, 7.219231, 6.242951, 5.626013, 5.768007, 4.908666, 3.809589,
+ 3.115109, 2.617899, 2.274793, 2.172960, 1.838597, 1.505915, 1.414333,
+ 1.392666, 1.338173, 1.105611, 7.365015, 7.471370, 5.622346, 4.520127,
+ 3.936272, 4.208822, 3.623024, 2.977794, 2.450003, 2.097261, 1.824090,
+ 1.643270, 1.473525, 1.351388, 1.327504, 1.323865, 1.307894, 1.088234,
+ 6.198210, 6.580712, 4.682511, 3.416952, 2.941929, 2.766637, 2.650686,
+ 2.315439, 1.925838, 1.659784, 1.464419, 1.252806, 1.162722, 1.197518,
+ 1.199875, 1.197365, 1.194040, 0.995797, 5.402507, 5.055466, 3.728724,
+ 2.624359, 2.165810, 1.943189, 1.918190, 1.738078, 1.516328, 1.290520,
+ 1.155793, 1.015962, 0.881900, 0.807203, 0.754242, 0.743378, 0.740288,
+ 0.614158, 3.937867, 3.862507, 2.884664, 2.088147, 1.648496, 1.473584,
+ 1.340123, 1.291769, 1.165381, 1.000224, 0.893316, 0.821333, 0.691363,
+ 0.610501, 0.586766, 0.583762, 0.577840, 0.468733, 3.104660, 3.181078,
+ 2.420208, 1.747442, 1.297956, 1.109835, 0.970385, 0.943229, 0.876923,
+ 0.777584, 0.678183, 0.628623, 0.553745, 0.523430, 0.519490, 0.514394,
+ 0.492259, 0.403172, 2.593833, 2.533720, 2.010452, 1.480944, 1.060302,
+ 0.846383, 0.738703, 0.673144, 0.658010, 0.592449, 0.518236, 0.470335,
+ 0.425088, 0.393168, 0.378116, 0.355846, 0.275469, 0.213128, 2.176988,
+ 2.089575, 1.671284, 1.225008, 0.895382, 0.672008, 0.566241, 0.496746,
+ 0.488005, 0.449874, 0.400899, 0.354002, 0.318150, 0.281533, 0.238545,
+ 0.224159, 0.202399, 0.160681, 1.874679, 1.769165, 1.430124, 1.068727,
+ 0.780272, 0.557801, 0.441643, 0.377256, 0.352957, 0.338452, 0.304965,
+ 0.273172, 0.240052, 0.208724, 0.193431, 0.190845, 0.185025, 0.138166,
+ 1.590226, 1.502830, 1.193127, 0.917885, 0.670432, 0.474546, 0.355420,
+ 0.292305, 0.259035, 0.249937, 0.232079, 0.208943, 0.181936, 0.160038,
+ 0.152257, 0.151235, 0.149583, 0.120747, 1.331730, 1.255907, 1.012871,
+ 0.778422, 0.578977, 0.412432, 0.293155, 0.231824, 0.197187, 0.183921,
+ 0.174876, 0.157252, 0.140263, 0.127050, 0.110244, 0.105041, 0.104323,
+ 0.086944, 1.153994, 1.118771, 0.822355, 0.612321, 0.478249, 0.348222,
+ 0.247408, 0.186141, 0.152714, 0.135445, 0.129810, 0.119994, 0.115619,
+ 0.131626, 0.095612, 0.079343, 0.077502, 0.064550, 0.946317, 0.925894,
+ 0.677969, 0.499906, 0.397101, 0.297931, 0.214467, 0.152333, 0.120731,
+ 0.102686, 0.095062, 0.090361, 0.122319, 0.240194, 0.112687, 0.070690,
+ 0.070461, 0.054194, 0.824155, 0.787241, 0.581856, 0.419228, 0.313167,
+ 0.245582, 0.183500, 0.128101, 0.096577, 0.080267, 0.071022, 0.066851,
+ 0.085754, 0.154163, 0.075884, 0.052401, 0.054270, 0.026656, 0.716310,
+ 0.671378, 0.489580, 0.349569, 0.256155, 0.206343, 0.157853, 0.111950,
+ 0.079271, 0.062518, 0.053441, 0.049660, 0.051400, 0.063778, 0.039993,
+ 0.029133, 0.023382, 0.013725, 0.614125, 0.579096, 0.417126, 0.299465,
+ 0.217849, 0.165515, 0.129040, 0.093127, 0.065612, 0.049543, 0.041429,
+ 0.036850, 0.034416, 0.033989, 0.024216, 0.017377, 0.014833, 0.011987,
+ 0.520407, 0.487239, 0.349473, 0.251741, 0.184897, 0.135813, 0.107098,
+ 0.073607, 0.053938, 0.040531, 0.032931, 0.028876, 0.025759, 0.022168,
+ 0.016739, 0.014638, 0.014333, 0.011947, 0.449954, 0.415124, 0.299452,
+ 0.216942, 0.158874, 0.115334, 0.088821, 0.060105, 0.042610, 0.032566,
+ 0.026903, 0.023123, 0.019913, 0.016835, 0.014306, 0.013625, 0.013535,
+ 0.011284, 0.377618, 0.347773, 0.251741, 0.184839, 0.132857, 0.095439,
+ 0.070462, 0.052244, 0.036078, 0.026025, 0.021518, 0.018487, 0.015361,
+ 0.012905, 0.011470, 0.010569, 0.010283, 0.008297, 0.319953, 0.297976,
+ 0.216942, 0.158842, 0.113280, 0.080426, 0.057367, 0.041987, 0.030135,
+ 0.022295, 0.017901, 0.015121, 0.012224, 0.010035, 0.009353, 0.009108,
+ 0.008695, 0.006139, 0.267864, 0.250502, 0.184839, 0.132851, 0.095039,
+ 0.068220, 0.049135, 0.035315, 0.025144, 0.018237, 0.013857, 0.012094,
+ 0.009715, 0.007743, 0.006937, 0.006446, 0.006243, 0.004929, 0.230449,
+ 0.215895, 0.158842, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015673, 0.012133, 0.010083, 0.007801, 0.006053, 0.005401,
+ 0.003834, 0.003429, 0.002851, 0.193984, 0.183963, 0.132851, 0.095039,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013175, 0.010422,
+ 0.008491, 0.006397, 0.004567, 0.003494, 0.002933, 0.002825, 0.002355,
+ 0.167298, 0.158088, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009257, 0.007051, 0.005543, 0.003905,
+ 0.002984, 0.002825, 0.002814, 0.002347, 0.143228, 0.132220, 0.095039,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008403, 0.006661, 0.005378, 0.003545, 0.002876, 0.002818, 0.002814,
+ 0.002347, 0.122934, 0.112735, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007182, 0.006012, 0.003762,
+ 0.002866, 0.002739, 0.002788, 0.002810, 0.002347, 0.101934, 0.094569,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006797, 0.005845, 0.003333, 0.002703, 0.002695, 0.002723,
+ 0.002781, 0.002343, 0.086702, 0.080014, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006533, 0.005839,
+ 0.003326, 0.002700, 0.002690, 0.002694, 0.002716, 0.002314, 0.073040,
+ 0.067886, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006468, 0.005831, 0.003325, 0.002700, 0.002690,
+ 0.002690, 0.002687, 0.002253, 0.061685, 0.056890, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006542, 0.006360,
+ 0.005416, 0.003221, 0.002698, 0.002690, 0.002690, 0.002683, 0.002238,
+ 0.052465, 0.048894, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006472, 0.005943, 0.003748, 0.002805, 0.002692,
+ 0.002690, 0.002690, 0.002683, 0.002238, 0.043838, 0.041101, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006543, 0.006465,
+ 0.005839, 0.003333, 0.002702, 0.002690, 0.002690, 0.002690, 0.002683,
+ 0.002238, 0.037824, 0.035133, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006480, 0.006464, 0.005838, 0.003326, 0.002700,
+ 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.031865, 0.029815,
+ 0.021866, 0.015668, 0.011955, 0.009258, 0.007190, 0.006543, 0.006475,
+ 0.006462, 0.005831, 0.003325, 0.002700, 0.002690, 0.002690, 0.002690,
+ 0.002683, 0.002238, 0.027150, 0.025016, 0.018128, 0.013083, 0.010371,
+ 0.008405, 0.006807, 0.006480, 0.006472, 0.006359, 0.005416, 0.003221,
+ 0.002698, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.023094,
+ 0.021760, 0.015577, 0.011590, 0.009167, 0.007188, 0.006543, 0.006475,
+ 0.006466, 0.005943, 0.003748, 0.002805, 0.002692, 0.002690, 0.002690,
+ 0.002690, 0.002683, 0.002238, 0.019269, 0.018038, 0.013060, 0.010280,
+ 0.008382, 0.006806, 0.006480, 0.006474, 0.006464, 0.005839, 0.003333,
+ 0.002702, 0.002690, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238,
+ 0.016874, 0.015472, 0.011566, 0.009148, 0.007171, 0.006527, 0.006458,
+ 0.006457, 0.006447, 0.005823, 0.003318, 0.002693, 0.002683, 0.002683,
+ 0.002683, 0.002683, 0.002676, 0.002232, 0.011968, 0.011056, 0.008762,
+ 0.007219, 0.005717, 0.005391, 0.005386, 0.005386, 0.005377, 0.004856,
+ 0.002767, 0.002246, 0.002238, 0.002238, 0.002238, 0.002238, 0.002232,
+ 0.001862,
+};
+
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+ double *dist_f) {
+ const double x_start = -0.5;
+ const double x_end = 16.5;
+ const double x_step = 1;
+ const double y_start = -15.5;
+ const double y_end = 16.5;
+ const double y_step = 0.5;
+ const double epsilon = 1e-6;
+ const int stride = (int)rint((x_end - x_start) / x_step) + 1;
+ (void)y_end;
+
+ xm = AOMMAX(xm, x_start + x_step + epsilon);
+ xm = AOMMIN(xm, x_end - x_step - epsilon);
+ yl = AOMMAX(yl, y_start + y_step + epsilon);
+ yl = AOMMIN(yl, y_end - y_step - epsilon);
+
+ const double y = (yl - y_start) / y_step;
+ const double x = (xm - x_start) / x_step;
+
+ const int yi = (int)floor(y);
+ const int xi = (int)floor(x);
+ assert(xi > 0);
+ assert(yi > 0);
+
+ const double yo = y - yi;
+ const double xo = x - xi;
+ const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)];
+ const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)];
+ *rate_f = interp_bicubic(prate, stride, xo, yo);
+ *dist_f = interp_bicubic(pdist, stride, xo, yo);
+}
+
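av1_model_rd_surffit() clamps its two inputs into the tabulated range, converts them to fractional grid coordinates, and reads rate and distortion estimates off the two precomputed surfaces with interp_bicubic(), a helper defined earlier in this file (outside this excerpt). As a rough, self-contained illustration of the same clamp / floor / fractional-offset indexing, the sketch below uses plain bilinear interpolation on a small made-up grid; the grid values and names are hypothetical and only the indexing pattern is meant to match.

#include <math.h>
#include <stdio.h>

#define DEMO_W 4
#define DEMO_H 3

/* Made-up 3x4 surface; values are illustrative only. */
static const double demo_surf[DEMO_H][DEMO_W] = {
  { 1.0, 2.0, 3.0, 4.0 },
  { 2.0, 4.0, 6.0, 8.0 },
  { 3.0, 6.0, 9.0, 12.0 },
};

/* Clamp (x, y) into the grid interior, split into an integer cell and a
 * fractional offset, and interpolate, following the same pattern as the
 * function above but with a bilinear instead of a bicubic kernel. */
static double demo_lookup(double x, double y) {
  const double eps = 1e-6;
  x = fmax(eps, fmin(x, DEMO_W - 1 - eps));
  y = fmax(eps, fmin(y, DEMO_H - 1 - eps));
  const int xi = (int)floor(x), yi = (int)floor(y);
  const double xo = x - xi, yo = y - yi;
  const double top = demo_surf[yi][xi] * (1 - xo) + demo_surf[yi][xi + 1] * xo;
  const double bot =
      demo_surf[yi + 1][xi] * (1 - xo) + demo_surf[yi + 1][xi + 1] * xo;
  return top * (1 - yo) + bot * yo;
}

int main(void) {
  printf("%f\n", demo_lookup(1.25, 0.5)); /* 3.375 for this grid */
  return 0;
}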
+static const double interp_rgrid_curv[65] = {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.759876,
+ 8.132086, 13.651828, 21.908271, 33.522054, 48.782376, 71.530983,
+ 106.728649, 151.942795, 199.893011, 242.850965, 283.933923, 322.154203,
+ 360.684608, 394.801656, 426.879017, 460.234313, 484.103987, 508.261495,
+ 536.486763, 558.196737, 586.285894, 614.764511, 634.166333, 647.706472,
+ 658.211478, 681.360407, 701.052141, 727.007310, 768.663973, 804.407660,
+ 884.627751, 1065.658131, 1238.875214, 1440.185176, 1678.377931, 1962.243390,
+ 2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184, 5116.798028,
+ 5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839,
+};
+
+static const double interp_dgrid_curv[65] = {
+ 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855,
+ 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692,
+ 14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773,
+ 10.728960, 9.861975, 8.643612, 6.916021, 5.154769, 3.734940, 2.680051,
+ 1.925506, 1.408410, 1.042223, 0.767641, 0.565392, 0.420116, 0.310427,
+ 0.231711, 0.172999, 0.128293, 0.094992, 0.072171, 0.052972, 0.039354,
+ 0.029555, 0.022857, 0.016832, 0.013297, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000,
+};
+
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {
+ const double x_start = -15.5;
+ const double x_end = 16.5;
+ const double x_step = 0.5;
+ const double epsilon = 1e-6;
+ (void)x_end;
+
+ xqr = AOMMAX(xqr, x_start + x_step + epsilon);
+ xqr = AOMMIN(xqr, x_end - x_step - epsilon);
+ const double x = (xqr - x_start) / x_step;
+ const int xi = (int)floor(x);
+ const double xo = x - xi;
+
+ assert(xi > 0);
+
+ const double *prate = &interp_rgrid_curv[(xi - 1)];
+ const double *pdist = &interp_dgrid_curv[(xi - 1)];
+ *rate_f = interp_cubic(prate, xo);
+ *distbysse_f = interp_cubic(pdist, xo);
+}
+
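av1_model_rd_curvfit() is the one-dimensional counterpart: with x_start = -15.5, x_end = 16.5 and x_step = 0.5, the table spans (16.5 - (-15.5)) / 0.5 + 1 = 65 samples, matching the 65-entry arrays above. The clamped input is mapped to a fractional index and the four neighbouring samples starting at (xi - 1) are combined by interp_cubic(), which is defined earlier in this file, outside what is shown here. A common choice for such a 4-tap helper is the Catmull-Rom cubic, sketched below purely for illustration (an assumption about the kernel shape, not taken from this diff):

/* Catmull-Rom cubic through four consecutive samples p[0..3], evaluated at a
 * fractional offset x in [0, 1) between p[1] and p[2]. Shown as a plausible
 * shape for the 4-tap interp_cubic() used above. */
static double catmull_rom(const double p[4], double x) {
  return p[1] +
         0.5 * x *
             (p[2] - p[0] +
              x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
                   x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
}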
+static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
+ const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const ENTROPY_CONTEXT *const above = pd->above_context;
+ const ENTROPY_CONTEXT *const left = pd->left_context;
+
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+}
+
+void av1_get_entropy_contexts(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left);
+}
+
+void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+ int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
+ int i;
+ int zero_seen = 0;
+ int best_sad = INT_MAX;
+ int this_sad = INT_MAX;
+ int max_mv = 0;
+ uint8_t *src_y_ptr = x->plane[0].src.buf;
+ uint8_t *ref_y_ptr;
+ MV pred_mv[MAX_MV_REF_CANDIDATES + 1];
+ int num_mv_refs = 0;
+ const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+ const int_mv ref_mv =
+ av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext);
+ const int_mv ref_mv1 =
+ av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext);
+
+ pred_mv[num_mv_refs++] = ref_mv.as_mv;
+ if (ref_mv.as_int != ref_mv1.as_int) {
+ pred_mv[num_mv_refs++] = ref_mv1.as_mv;
+ }
+ if (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size)
+ pred_mv[num_mv_refs++] = x->pred_mv[ref_frame];
+
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+
+ // Get the sad for each candidate reference mv.
+ for (i = 0; i < num_mv_refs; ++i) {
+ const MV *this_mv = &pred_mv[i];
+ int fp_row, fp_col;
+ fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+ fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+
+ if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+
+ ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
+ // Find sad for current vector.
+ this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
+ ref_y_ptr, ref_y_stride);
+ // Note if it is the best so far.
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ }
+ }
+
+ // Record the largest candidate MV magnitude and the best SAD found over
+ // the reference list.
+ x->max_mv_context[ref_frame] = max_mv;
+ x->pred_mv_sad[ref_frame] = best_sad;
+}
+
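In av1_mv_pred(), the expression (mv + 3 + (mv >= 0)) >> 3 rounds a 1/8-pel motion-vector component to the nearest full pel, with halves rounded away from zero: row = 12 (+1.5 px) gives (12 + 3 + 1) >> 3 = 2, while row = -12 gives (-12 + 3) >> 3 = -2. The function then stores, per reference frame, the largest candidate MV magnitude (max_mv_context) and the smallest full-pel SAD over the candidates (pred_mv_sad); other parts of the encoder consult these when deciding how hard to search that reference.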
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv,
+ const int num_planes) {
+ int i;
+
+ dst[0].buf = src->y_buffer;
+ dst[0].stride = src->y_stride;
+ dst[1].buf = src->u_buffer;
+ dst[2].buf = src->v_buffer;
+ dst[1].stride = dst[2].stride = src->uv_stride;
+
+ for (i = 0; i < num_planes; ++i) {
+ setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf,
+ i ? src->uv_crop_width : src->y_crop_width,
+ i ? src->uv_crop_height : src->y_crop_height,
+ dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
+ xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
+ }
+}
+
+int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+ int stride) {
+ const int bw = mi_size_wide_log2[plane_bsize];
+ const int y = 4 * (raster_block >> bw);
+ const int x = 4 * (raster_block & ((1 << bw) - 1));
+ return y * stride + x;
+}
+
+int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+ int16_t *base) {
+ const int stride = block_size_wide[plane_bsize];
+ return base + av1_raster_block_offset(plane_bsize, raster_block, stride);
+}
+
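A quick worked example of av1_raster_block_offset(): mi_size_wide_log2[] gives the block width in 4-sample units as a log2, so the raster index splits into a row (upper bits) and a column (lower bits), each scaled back to samples. The numbers below are hypothetical.

#include <stdio.h>

int main(void) {
  /* Hypothetical 16x16 plane block: 4 units of 4 samples wide, so bw = 2. */
  const int bw = 2, raster_block = 6, stride = 32;
  const int y = 4 * (raster_block >> bw);             /* row 1 -> 4 samples */
  const int x = 4 * (raster_block & ((1 << bw) - 1)); /* col 2 -> 8 samples */
  printf("offset = %d\n", y * stride + x);            /* 4 * 32 + 8 = 136  */
  return 0;
}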
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
+ int ref_frame) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
+ const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
+ ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
+ : NULL;
+}
+
+int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
+ const MACROBLOCKD *xd) {
+ if (cm->interp_filter == SWITCHABLE) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ int inter_filter_cost = 0;
+ int dir;
+
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ const InterpFilter filter =
+ av1_extract_interp_filter(mbmi->interp_filters, dir);
+ inter_filter_cost += x->switchable_interp_costs[ctx][filter];
+ }
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+ } else {
+ return 0;
+ }
+}
+
+void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
+ int i;
+ RD_OPT *const rd = &cpi->rd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+
+ // Set baseline threshold values.
+ for (i = 0; i < MAX_MODES; ++i) rd->thresh_mult[i] = cpi->oxcf.mode == 0;
+
+ if (sf->adaptive_rd_thresh) {
+ rd->thresh_mult[THR_NEARESTMV] = 300;
+ rd->thresh_mult[THR_NEARESTL2] = 300;
+ rd->thresh_mult[THR_NEARESTL3] = 300;
+ rd->thresh_mult[THR_NEARESTB] = 300;
+ rd->thresh_mult[THR_NEARESTA2] = 300;
+ rd->thresh_mult[THR_NEARESTA] = 300;
+ rd->thresh_mult[THR_NEARESTG] = 300;
+ } else {
+ rd->thresh_mult[THR_NEARESTMV] = 0;
+ rd->thresh_mult[THR_NEARESTL2] = 0;
+ rd->thresh_mult[THR_NEARESTL3] = 0;
+ rd->thresh_mult[THR_NEARESTB] = 0;
+ rd->thresh_mult[THR_NEARESTA2] = 0;
+ rd->thresh_mult[THR_NEARESTA] = 0;
+ rd->thresh_mult[THR_NEARESTG] = 0;
+ }
+
+ rd->thresh_mult[THR_NEWMV] += 1000;
+ rd->thresh_mult[THR_NEWL2] += 1000;
+ rd->thresh_mult[THR_NEWL3] += 1000;
+ rd->thresh_mult[THR_NEWB] += 1000;
+ rd->thresh_mult[THR_NEWA2] = 1000;
+ rd->thresh_mult[THR_NEWA] += 1000;
+ rd->thresh_mult[THR_NEWG] += 1000;
+
+ rd->thresh_mult[THR_NEARMV] += 1000;
+ rd->thresh_mult[THR_NEARL2] += 1000;
+ rd->thresh_mult[THR_NEARL3] += 1000;
+ rd->thresh_mult[THR_NEARB] += 1000;
+ rd->thresh_mult[THR_NEARA2] = 1000;
+ rd->thresh_mult[THR_NEARA] += 1000;
+ rd->thresh_mult[THR_NEARG] += 1000;
+
+ rd->thresh_mult[THR_GLOBALMV] += 2000;
+ rd->thresh_mult[THR_GLOBALL2] += 2000;
+ rd->thresh_mult[THR_GLOBALL3] += 2000;
+ rd->thresh_mult[THR_GLOBALB] += 2000;
+ rd->thresh_mult[THR_GLOBALA2] = 2000;
+ rd->thresh_mult[THR_GLOBALG] += 2000;
+ rd->thresh_mult[THR_GLOBALA] += 2000;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] += 1000;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 2000;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA2] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A2] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
+
+ rd->thresh_mult[THR_DC] += 1000;
+ rd->thresh_mult[THR_PAETH] += 1000;
+ rd->thresh_mult[THR_SMOOTH] += 2000;
+ rd->thresh_mult[THR_SMOOTH_V] += 2000;
+ rd->thresh_mult[THR_SMOOTH_H] += 2000;
+ rd->thresh_mult[THR_H_PRED] += 2000;
+ rd->thresh_mult[THR_V_PRED] += 2000;
+ rd->thresh_mult[THR_D135_PRED] += 2500;
+ rd->thresh_mult[THR_D203_PRED] += 2500;
+ rd->thresh_mult[THR_D157_PRED] += 2500;
+ rd->thresh_mult[THR_D67_PRED] += 2500;
+ rd->thresh_mult[THR_D113_PRED] += 2500;
+ rd->thresh_mult[THR_D45_PRED] += 2500;
+}
+
+void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
+ static const int thresh_mult[MAX_REFS] = { 2500, 2500, 2500, 2500, 2500,
+ 2500, 2500, 4500, 4500, 4500,
+ 4500, 4500, 4500, 4500, 4500,
+ 4500, 4500, 4500, 4500, 2500 };
+ RD_OPT *const rd = &cpi->rd;
+ memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult));
+}
+
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*factor_buf)[MAX_MODES], int rd_thresh,
+ int bsize, int best_mode_index) {
+ if (rd_thresh > 0) {
+ const int top_mode = MAX_MODES;
+ int mode;
+ for (mode = 0; mode < top_mode; ++mode) {
+ const BLOCK_SIZE min_size = AOMMAX(bsize - 1, BLOCK_4X4);
+ const BLOCK_SIZE max_size =
+ AOMMIN(bsize + 2, (int)cm->seq_params.sb_size);
+ BLOCK_SIZE bs;
+ for (bs = min_size; bs <= max_size; ++bs) {
+ int *const fact = &factor_buf[bs][mode];
+ if (mode == best_mode_index) {
+ *fact -= (*fact >> 4);
+ } else {
+ *fact = AOMMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
+ }
+ }
+ }
+ }
+}
+
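av1_update_rd_thresh_fact() adapts the per-block-size, per-mode factors that rd_less_than_thresh() (defined in rd.h below) multiplies against the thresh_mult tables: the winning mode's factor decays geometrically by 1/16 of its value per update, keeping that mode cheap to re-test, while every other mode's factor creeps up by RD_THRESH_INC until it saturates at rd_thresh * RD_THRESH_MAX_FACT, so modes that rarely win are pruned more and more aggressively. A tiny standalone sketch of that evolution; the neutral start value of 32 is an assumption (it corresponds to the >> 5 scaling in rd_less_than_thresh()), not something stated in this diff.

#include <stdio.h>

#define RD_THRESH_INC 1
#define RD_THRESH_MAX_FACT 64
#define AOMMIN(a, b) ((a) < (b) ? (a) : (b)) /* re-declared for the sketch */

int main(void) {
  int fact_winner = 32, fact_loser = 32; /* 32 assumed as the neutral factor */
  const int rd_thresh = 4;
  for (int i = 0; i < 8; ++i) {
    fact_winner -= fact_winner >> 4;                     /* geometric decay */
    fact_loser = AOMMIN(fact_loser + RD_THRESH_INC,
                        rd_thresh * RD_THRESH_MAX_FACT); /* slow growth     */
  }
  printf("winner=%d loser=%d\n", fact_winner, fact_loser); /* 23 and 40 */
  return 0;
}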
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth) {
+ const int q = av1_dc_quant_Q3(qindex, qdelta, bit_depth);
+ switch (bit_depth) {
+ case AOM_BITS_8: return 20 * q;
+ case AOM_BITS_10: return 5 * q;
+ case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2);
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+}
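The bit-depth switch in av1_get_intra_cost_penalty() keeps the penalty roughly constant in source terms: the Q3 quantizer value q grows by about 4x for each additional 2 bits of depth, and the multipliers fall off in the same ratio, 20 * q at 8-bit, 5 * q at 10-bit and ROUND_POWER_OF_TWO(5 * q, 2), i.e. roughly 5 * q / 4, at 12-bit, so the three scale factors 20 : 5 : 1.25 mirror the inverse of the approximate 1 : 4 : 16 growth in q.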
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
new file mode 100644
index 000000000..755b61df5
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.h
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, R, D) \
+ (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+ ((D) * (1 << RDDIV_BITS)))
+
+#define RDCOST_DBL(RM, R, D) \
+ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+ ((double)(D) * (1 << RDDIV_BITS)))
+
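RDCOST is the integer form of the Lagrangian cost J = lambda * R + D: the rate R (already in the encoder's fixed-point bit units) is scaled by the multiplier RM and rounded by AV1_PROB_COST_SHIFT, and the distortion is scaled up by 2^RDDIV_BITS = 128 so both terms share one fixed-point scale. A plain 64-bit restatement as a function, with the shift passed in symbolically since AV1_PROB_COST_SHIFT is defined elsewhere in libaom:

#include <stdint.h>

/* Sketch of the RDCOST macro as a function; prob_cost_shift stands in for
 * AV1_PROB_COST_SHIFT, whose value is defined outside this excerpt. */
static int64_t rdcost_sketch(int64_t rdmult, int64_t rate, int64_t dist,
                             int prob_cost_shift) {
  const int64_t rate_term = /* ROUND_POWER_OF_TWO(rate * rdmult, shift) */
      (rate * rdmult + (INT64_C(1) << (prob_cost_shift - 1))) >>
      prob_cost_shift;
  return rate_term + (dist << 7 /* RDDIV_BITS */);
}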
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+ THR_NEARESTMV,
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+ THR_NEARESTA2,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_NEWMV,
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+ THR_NEWA2,
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+ THR_NEARA2,
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_GLOBALMV,
+ THR_GLOBALL2,
+ THR_GLOBALL3,
+ THR_GLOBALB,
+ THR_GLOBALA2,
+ THR_GLOBALA,
+ THR_GLOBALG,
+
+ THR_COMP_NEAREST_NEARESTLA,
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+ THR_COMP_NEAREST_NEARESTGA,
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTLA2,
+ THR_COMP_NEAREST_NEARESTL2A2,
+ THR_COMP_NEAREST_NEARESTL3A2,
+ THR_COMP_NEAREST_NEARESTGA2,
+ THR_COMP_NEAREST_NEARESTLL2,
+ THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG,
+ THR_COMP_NEAREST_NEARESTBA,
+
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_GLOBAL_GLOBALLA,
+
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_GLOBAL_GLOBALL2A,
+
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_GLOBAL_GLOBALL3A,
+
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_GLOBAL_GLOBALGA,
+
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_GLOBAL_GLOBALLB,
+
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_GLOBAL_GLOBALL2B,
+
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_GLOBAL_GLOBALL3B,
+
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_GLOBAL_GLOBALGB,
+
+ THR_COMP_NEAR_NEARLA2,
+ THR_COMP_NEW_NEARESTLA2,
+ THR_COMP_NEAREST_NEWLA2,
+ THR_COMP_NEW_NEARLA2,
+ THR_COMP_NEAR_NEWLA2,
+ THR_COMP_NEW_NEWLA2,
+ THR_COMP_GLOBAL_GLOBALLA2,
+
+ THR_COMP_NEAR_NEARL2A2,
+ THR_COMP_NEW_NEARESTL2A2,
+ THR_COMP_NEAREST_NEWL2A2,
+ THR_COMP_NEW_NEARL2A2,
+ THR_COMP_NEAR_NEWL2A2,
+ THR_COMP_NEW_NEWL2A2,
+ THR_COMP_GLOBAL_GLOBALL2A2,
+
+ THR_COMP_NEAR_NEARL3A2,
+ THR_COMP_NEW_NEARESTL3A2,
+ THR_COMP_NEAREST_NEWL3A2,
+ THR_COMP_NEW_NEARL3A2,
+ THR_COMP_NEAR_NEWL3A2,
+ THR_COMP_NEW_NEWL3A2,
+ THR_COMP_GLOBAL_GLOBALL3A2,
+
+ THR_COMP_NEAR_NEARGA2,
+ THR_COMP_NEW_NEARESTGA2,
+ THR_COMP_NEAREST_NEWGA2,
+ THR_COMP_NEW_NEARGA2,
+ THR_COMP_NEAR_NEWGA2,
+ THR_COMP_NEW_NEWGA2,
+ THR_COMP_GLOBAL_GLOBALGA2,
+
+ THR_COMP_NEAR_NEARLL2,
+ THR_COMP_NEW_NEARESTLL2,
+ THR_COMP_NEAREST_NEWLL2,
+ THR_COMP_NEW_NEARLL2,
+ THR_COMP_NEAR_NEWLL2,
+ THR_COMP_NEW_NEWLL2,
+ THR_COMP_GLOBAL_GLOBALLL2,
+
+ THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEAR_NEWLL3,
+ THR_COMP_NEW_NEWLL3,
+ THR_COMP_GLOBAL_GLOBALLL3,
+
+ THR_COMP_NEAR_NEARLG,
+ THR_COMP_NEW_NEARESTLG,
+ THR_COMP_NEAREST_NEWLG,
+ THR_COMP_NEW_NEARLG,
+ THR_COMP_NEAR_NEWLG,
+ THR_COMP_NEW_NEWLG,
+ THR_COMP_GLOBAL_GLOBALLG,
+
+ THR_COMP_NEAR_NEARBA,
+ THR_COMP_NEW_NEARESTBA,
+ THR_COMP_NEAREST_NEWBA,
+ THR_COMP_NEW_NEARBA,
+ THR_COMP_NEAR_NEWBA,
+ THR_COMP_NEW_NEWBA,
+ THR_COMP_GLOBAL_GLOBALBA,
+
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+
+ MAX_MODES,
+
+ LAST_SINGLE_REF_MODES = THR_GLOBALG,
+ MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1,
+ LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA,
+ MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1
+} THR_MODES;
+
+typedef enum {
+ THR_LAST,
+ THR_LAST2,
+ THR_LAST3,
+ THR_BWDR,
+ THR_ALTR2,
+ THR_GOLD,
+ THR_ALTR,
+
+ THR_COMP_LA,
+ THR_COMP_L2A,
+ THR_COMP_L3A,
+ THR_COMP_GA,
+
+ THR_COMP_LB,
+ THR_COMP_L2B,
+ THR_COMP_L3B,
+ THR_COMP_GB,
+
+ THR_COMP_LA2,
+ THR_COMP_L2A2,
+ THR_COMP_L3A2,
+ THR_COMP_GA2,
+
+ THR_INTRA,
+
+ MAX_REFS
+} THR_MODES_SUB8X8;
+
+typedef struct RD_OPT {
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size, and thresh_freq_fact
+ // to pick a threshold.
+ int thresh_mult[MAX_MODES];
+ int thresh_mult_sub8x8[MAX_REFS];
+
+ int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
+
+ int64_t prediction_type_threshes[REF_FRAMES][REFERENCE_MODES];
+
+ int RDMULT;
+} RD_OPT;
+
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->rdcost = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip = 1;
+ rd_stats->zero_rate = 0;
+ rd_stats->invalid_rate = 0;
+ rd_stats->ref_rdcost = INT64_MAX;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats->txb_coeff_cost[plane] = 0;
+ {
+ int r, c;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
+ rd_stats->txb_coeff_cost_map[plane][r][c] = 0;
+ }
+ }
+#endif
+}
+
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = INT_MAX;
+ rd_stats->dist = INT64_MAX;
+ rd_stats->rdcost = INT64_MAX;
+ rd_stats->sse = INT64_MAX;
+ rd_stats->skip = 0;
+ rd_stats->zero_rate = 0;
+ rd_stats->invalid_rate = 1;
+ rd_stats->ref_rdcost = INT64_MAX;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats->txb_coeff_cost[plane] = INT_MAX;
+ {
+ int r, c;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
+ rd_stats->txb_coeff_cost_map[plane][r][c] = INT_MAX;
+ }
+ }
+#endif
+}
+
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+ const RD_STATS *rd_stats_src) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats_dst->rate += rd_stats_src->rate;
+ if (!rd_stats_dst->zero_rate)
+ rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
+ rd_stats_dst->dist += rd_stats_src->dist;
+ rd_stats_dst->sse += rd_stats_src->sse;
+ rd_stats_dst->skip &= rd_stats_src->skip;
+ rd_stats_dst->invalid_rate &= rd_stats_src->invalid_rate;
+#if CONFIG_RD_DEBUG
+ // This may run into problems when monochrome video is
+ // encoded, as there will only be 1 plane
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
+ {
+ // TODO(angiebird): optimize this part
+ int r, c;
+ int ref_txb_coeff_cost = 0;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ rd_stats_dst->txb_coeff_cost_map[plane][r][c] +=
+ rd_stats_src->txb_coeff_cost_map[plane][r][c];
+ ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c];
+ }
+ assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]);
+ }
+ }
+#endif
+}
+
+struct TileInfo;
+struct TileDataEnc;
+struct AV1_COMP;
+struct macroblock;
+
+int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex);
+
+void av1_initialize_rd_consts(struct AV1_COMP *cpi);
+
+void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int qindex);
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
+ unsigned int qstep, int *rate, int64_t *dist);
+
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+ double *distbysse_f);
+
+int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
+ const MACROBLOCKD *xd);
+
+int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+ int stride);
+
+int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+ int16_t *base);
+
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
+ int ref_frame);
+
+void av1_init_me_luts(void);
+
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx);
+
+void av1_get_entropy_contexts(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]);
+
+void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
+
+void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi);
+
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*fact)[MAX_MODES], int rd_thresh, int bsize,
+ int best_mode_index);
+
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+ int thresh_fact) {
+ return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
+}
+
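rd_less_than_thresh() is the pruning test that the thresh_mult tables and av1_update_rd_thresh_fact() feed. thresh_fact is a per-mode factor in 1/32 units (hence the >> 5), so, for example, thresh = 2000 and thresh_fact = 48 give a cutoff of (2000 * 48) >> 5 = 3000; the test returns true, which callers typically treat as "skip this mode", once the best rd found so far drops below that cutoff, and thresh == INT_MAX marks a mode that is never searched.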
+void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
+ BLOCK_SIZE block_size);
+
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+ x->errorperbit = rdmult >> RD_EPB_SHIFT;
+ x->errorperbit += (x->errorperbit == 0);
+}
+
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv,
+ const int num_planes);
+
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth);
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+ FRAME_CONTEXT *fc);
+
+void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+ const int num_planes);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
new file mode 100644
index 000000000..c2d15534f
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -0,0 +1,12199 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tx_prune_model_weights.h"
+
+typedef void (*model_rd_for_sb_type)(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+
+static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+ int plane_to, int mi_row, int mi_col,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse,
+ int64_t *plane_dist);
+static void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_surffit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_dnn(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_fullrdy(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist);
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist);
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+
+typedef enum {
+ MODELRD_LEGACY,
+ MODELRD_CURVFIT,
+ MODELRD_SUFFIT,
+ MODELRD_DNN,
+ MODELRD_FULLRDY,
+ MODELRD_TYPES
+} ModelRdType;
+
+static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
+ model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
+ model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy
+};
+
+static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
+ model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit,
+ model_rd_with_dnn, NULL
+};
+
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 2
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_JNT_COMPOUND 1
+
+#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
+static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
+ 0x00000000, 0x00010000, 0x00020000, // y = 0
+ 0x00000001, 0x00010001, 0x00020001, // y = 1
+ 0x00000002, 0x00010002, 0x00020002, // y = 2
+};
+
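Each filter_sets[] entry packs the two directional interpolation-filter choices of a dual-filter pair into one 32-bit InterpFilters word: reading the table, the index labelled "y" in the row comments sits in the low 16 bits and the other direction in the high 16 bits, giving SWITCHABLE_FILTERS * SWITCHABLE_FILTERS = 3 * 3 = 9 combinations. A trivial sketch of that packing (the helper name is made up):

/* Pack two 16-bit filter indices the way filter_sets[] lays them out. */
static unsigned int pack_dual_filters(unsigned int low16, unsigned int high16) {
  return (high16 << 16) | low16;
}
/* pack_dual_filters(1, 2) == 0x00020001, the last entry of the "y = 1" row. */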
+#define SECOND_REF_FRAME_MASK \
+ ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \
+ (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01)
+
+#define ANGLE_SKIP_THRESH 10
+
+static const double ADST_FLIP_SVM[8] = {
+ /* vertical */
+ -6.6623, -2.8062, -3.2531, 3.1671,
+ /* horizontal */
+ -7.7051, -3.2234, -3.6193, 3.4533
+};
+
+typedef struct {
+ PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
+
+typedef enum {
+ FTXS_NONE = 0,
+ FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
+ FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
+ FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
+} FAST_TX_SEARCH_MODE;
+
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int64_t ref_best_rd);
+
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t non_skip_ref_best_rd,
+ int64_t skip_ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode);
+
+struct rdcost_block_args {
+ const AV1_COMP *cpi;
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE];
+ RD_STATS rd_stats;
+ int64_t this_rd;
+ int64_t best_rd;
+ int exit_early;
+ int incomplete_exit;
+ int use_fast_coef_costing;
+ FAST_TX_SEARCH_MODE ftxs_mode;
+};
+
+#define LAST_NEW_MV_INDEX 6
+static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
+ { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEWMV, { LAST_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { NEARMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+ { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+
+ // TODO(zoeliu): May need to reconsider the order on the modes to check
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
+
+ { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
+
+ { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ // intra modes
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+};
+
+static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = {
+ 7, // DC_PRED,
+ 134, // V_PRED,
+ 133, // H_PRED,
+ 140, // D45_PRED,
+ 135, // D135_PRED,
+ 139, // D113_PRED,
+ 137, // D157_PRED,
+ 136, // D203_PRED,
+ 138, // D67_PRED,
+ 46, // SMOOTH_PRED,
+ 47, // SMOOTH_V_PRED,
+ 48, // SMOOTH_H_PRED,
+ 45, // PAETH_PRED,
+};
+
+/* clang-format off */
+static const int16_t single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
+ [REF_FRAMES] = {
+ // NEARESTMV,
+ { -1, 0, 1, 2, 6, 3, 4, 5, },
+ // NEARMV,
+ { -1, 15, 16, 17, 21, 18, 19, 20, },
+ // GLOBALMV,
+ { -1, 22, 23, 24, 27, 25, 26, 28, },
+ // NEWMV,
+ { -1, 8, 9, 10, 14, 11, 12, 13, },
+};
+/* clang-format on */
+
+/* clang-format off */
+static const int16_t comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
+ [REF_FRAMES] = {
+ // NEAREST_NEARESTMV,
+ {
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, 41, 42, 43, 33, 37, 29, },
+ { -1, -1, -1, -1, -1, 34, 38, 30, },
+ { -1, -1, -1, -1, -1, 35, 39, 31, },
+ { -1, -1, -1, -1, -1, 36, 40, 32, },
+ { -1, -1, -1, -1, -1, -1, -1, 44, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ },
+ // NEAR_NEARMV,
+ {
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, 141, 148, 155, 77, 105, 49, },
+ { -1, -1, -1, -1, -1, 84, 112, 56, },
+ { -1, -1, -1, -1, -1, 91, 119, 63, },
+ { -1, -1, -1, -1, -1, 98, 126, 70, },
+ { -1, -1, -1, -1, -1, -1, -1, 162, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ },
+ // NEAREST_NEWMV,
+ {
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, 143, 150, 157, 79, 107, 51, },
+ { -1, -1, -1, -1, -1, 86, 114, 58, },
+ { -1, -1, -1, -1, -1, 93, 121, 65, },
+ { -1, -1, -1, -1, -1, 100, 128, 72, },
+ { -1, -1, -1, -1, -1, -1, -1, 164, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ },
+ // NEW_NEARESTMV,
+ {
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, 142, 149, 156, 78, 106, 50, },
+ { -1, -1, -1, -1, -1, 85, 113, 57, },
+ { -1, -1, -1, -1, -1, 92, 120, 64, },
+ { -1, -1, -1, -1, -1, 99, 127, 71, },
+ { -1, -1, -1, -1, -1, -1, -1, 163, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ },
+ // NEAR_NEWMV,
+ {
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, 145, 152, 159, 81, 109, 53, },
+ { -1, -1, -1, -1, -1, 88, 116, 60, },
+ { -1, -1, -1, -1, -1, 95, 123, 67, },
+ { -1, -1, -1, -1, -1, 102, 130, 74, },
+ { -1, -1, -1, -1, -1, -1, -1, 166, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ },
+ // NEW_NEARMV,
+ {
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, 144, 151, 158, 80, 108, 52, },
+ { -1, -1, -1, -1, -1, 87, 115, 59, },
+ { -1, -1, -1, -1, -1, 94, 122, 66, },
+ { -1, -1, -1, -1, -1, 101, 129, 73, },
+ { -1, -1, -1, -1, -1, -1, -1, 165, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ },
+ // GLOBAL_GLOBALMV,
+ {
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, 147, 154, 161, 83, 111, 55, },
+ { -1, -1, -1, -1, -1, 90, 118, 62, },
+ { -1, -1, -1, -1, -1, 97, 125, 69, },
+ { -1, -1, -1, -1, -1, 104, 132, 76, },
+ { -1, -1, -1, -1, -1, -1, -1, 168, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ },
+ // NEW_NEWMV,
+ {
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, 146, 153, 160, 82, 110, 54, },
+ { -1, -1, -1, -1, -1, 89, 117, 61, },
+ { -1, -1, -1, -1, -1, 96, 124, 68, },
+ { -1, -1, -1, -1, -1, 103, 131, 75, },
+ { -1, -1, -1, -1, -1, -1, -1, 167, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ { -1, -1, -1, -1, -1, -1, -1, -1, },
+ },
+};
+/* clang-format on */
+
+static int get_prediction_mode_idx(PREDICTION_MODE this_mode,
+ MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME second_ref_frame) {
+ if (this_mode < INTRA_MODE_END) {
+ assert(ref_frame == INTRA_FRAME);
+ assert(second_ref_frame == NONE_FRAME);
+ return intra_to_mode_idx[this_mode - INTRA_MODE_START];
+ }
+ if (this_mode >= SINGLE_INTER_MODE_START &&
+ this_mode < SINGLE_INTER_MODE_END) {
+ assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+ return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
+ [ref_frame];
+ }
+ if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) {
+ assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+ assert((second_ref_frame > INTRA_FRAME) &&
+ (second_ref_frame <= ALTREF_FRAME));
+ return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame]
+ [second_ref_frame];
+ }
+ assert(0);
+ return -1;
+}
+
+static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
+ DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED,
+ SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED,
+ D67_PRED, D113_PRED, D45_PRED,
+};
+
+static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
+ UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED,
+ UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
+ UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED,
+ UV_D113_PRED, UV_D45_PRED,
+};
+
+typedef struct SingleInterModeState {
+ int64_t rd;
+ MV_REFERENCE_FRAME ref_frame;
+ int valid;
+} SingleInterModeState;
+
+typedef struct InterModeSearchState {
+ int64_t best_rd;
+ MB_MODE_INFO best_mbmode;
+ int best_rate_y;
+ int best_rate_uv;
+ int best_mode_skippable;
+ int best_skip2;
+ int best_mode_index;
+ int skip_intra_modes;
+ int num_available_refs;
+ int64_t dist_refs[REF_FRAMES];
+ int dist_order_refs[REF_FRAMES];
+ int64_t mode_threshold[MAX_MODES];
+ PREDICTION_MODE best_intra_mode;
+ int64_t best_intra_rd;
+ int angle_stats_ready;
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+ unsigned int best_pred_sse;
+ int rate_uv_intra[TX_SIZES_ALL];
+ int rate_uv_tokenonly[TX_SIZES_ALL];
+ int64_t dist_uvs[TX_SIZES_ALL];
+ int skip_uvs[TX_SIZES_ALL];
+ UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL];
+ PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
+ int8_t uv_angle_delta[TX_SIZES_ALL];
+ int64_t best_pred_rd[REFERENCE_MODES];
+ int64_t best_pred_diff[REFERENCE_MODES];
+ // Save a set of single_newmv for each checked ref_mv.
+ int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES];
+ int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES];
+ int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES];
+ int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+ // The rd of simple translation in single inter modes
+ int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+
+ // Single search results by [directions][modes][reference frames]
+ SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
+ SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
+ [FWD_REFS];
+ int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
+
+ MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+} InterModeSearchState;
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
+ if (bsize == BLOCK_8X8) return 1;
+ if (bsize == BLOCK_16X16) return 2;
+ if (bsize == BLOCK_32X32) return 3;
+ return -1;
+}
+
+void av1_inter_mode_data_init(TileDataEnc *tile_data) {
+ for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[i];
+ md->ready = 0;
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
+ }
+}
+
+static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int *est_residue_cost,
+ int64_t *est_dist) {
+ aom_clear_system_state();
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (md->ready) {
+ const double est_ld = md->a * sse + md->b;
+ if (sse < md->dist_mean) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ } else {
+ *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld);
+ *est_dist = (int64_t)round(md->dist_mean);
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult,
+ int64_t sse, int curr_cost) {
+ int est_residue_cost;
+ int64_t est_dist;
+ if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) {
+ int rate = est_residue_cost + curr_cost;
+ int64_t est_rd = RDCOST(rdmult, rate, est_dist);
+ return est_rd;
+ }
+ return 0;
+}
+
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
+ aom_clear_system_state();
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ const int block_idx = inter_mode_data_block_idx(bsize);
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ if (block_idx == -1) continue;
+ if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) {
+ continue;
+ } else {
+ if (md->ready == 0) {
+ md->dist_mean = md->dist_sum / md->num;
+ md->ld_mean = md->ld_sum / md->num;
+ md->sse_mean = md->sse_sum / md->num;
+ md->sse_sse_mean = md->sse_sse_sum / md->num;
+ md->sse_ld_mean = md->sse_ld_sum / md->num;
+ } else {
+ const double factor = 3;
+ md->dist_mean =
+ (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1);
+ md->ld_mean =
+ (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1);
+ md->sse_mean =
+ (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1);
+ md->sse_sse_mean =
+ (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) /
+ (factor + 1);
+ md->sse_ld_mean =
+ (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) /
+ (factor + 1);
+ }
+
+ const double my = md->ld_mean;
+ const double mx = md->sse_mean;
+ const double dx = sqrt(md->sse_sse_mean);
+ const double dxy = md->sse_ld_mean;
+
+ md->a = (dxy - mx * my) / (dx * dx - mx * mx);
+ md->b = my - md->a * mx;
+ md->ready = 1;
+
+ md->num = 0;
+ md->dist_sum = 0;
+ md->ld_sum = 0;
+ md->sse_sum = 0;
+ md->sse_sse_sum = 0;
+ md->sse_ld_sum = 0;
+ }
+ (void)rdmult;
+ }
+}
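+// The update above is an ordinary least-squares fit of ld ~= a * sse + b over
+// the (smoothed) sample means collected by inter_mode_data_push():
+//   a = (E[sse * ld] - E[sse] * E[ld]) / (E[sse^2] - E[sse]^2)
+//   b = E[ld] - a * E[sse]
+// with my = E[ld], mx = E[sse], dxy = E[sse * ld] and dx * dx = E[sse^2].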
+
+static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+ int64_t sse, int64_t dist, int residue_cost) {
+ if (residue_cost == 0 || sse == dist) return;
+ const int block_idx = inter_mode_data_block_idx(bsize);
+ if (block_idx == -1) return;
+ InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize];
+ if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) {
+ aom_clear_system_state();
+ const double ld = (sse - dist) * 1. / residue_cost;
+ ++rd_model->num;
+ rd_model->dist_sum += dist;
+ rd_model->ld_sum += ld;
+ rd_model->sse_sum += sse;
+ rd_model->sse_sse_sum += sse * sse;
+ rd_model->sse_ld_sum += sse * ld;
+ }
+}
+
+static void inter_modes_info_push(InterModesInfo *inter_modes_info,
+ int mode_rate, int64_t sse, int64_t est_rd,
+ const MB_MODE_INFO *mbmi) {
+ const int num = inter_modes_info->num;
+ assert(num < MAX_INTER_MODES);
+ inter_modes_info->mbmi_arr[num] = *mbmi;
+ inter_modes_info->mode_rate_arr[num] = mode_rate;
+ inter_modes_info->sse_arr[num] = sse;
+ inter_modes_info->est_rd_arr[num] = est_rd;
+ ++inter_modes_info->num;
+}
+
+static int compare_rd_idx_pair(const void *a, const void *b) {
+ if (((const RdIdxPair *)a)->rd == ((const RdIdxPair *)b)->rd) {
+ return 0;
+ } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) {
+ return 1;
+ } else {
+ return -1;
+ }
+}
+
+static void inter_modes_info_sort(const InterModesInfo *inter_modes_info,
+ RdIdxPair *rd_idx_pair_arr) {
+ if (inter_modes_info->num == 0) {
+ return;
+ }
+ for (int i = 0; i < inter_modes_info->num; ++i) {
+ rd_idx_pair_arr[i].idx = i;
+ rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i];
+ }
+ qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]),
+ compare_rd_idx_pair);
+}
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+
+static INLINE int write_uniform_cost(int n, int v) {
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;
+ if (l == 0) return 0;
+ if (v < m)
+ return av1_cost_literal(l - 1);
+ else
+ return av1_cost_literal(l);
+}
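+// This is the cost of a (near-)uniform, truncated-binary style code for a
+// symbol v in [0, n): the first m = (1 << l) - n symbols use l - 1 bits and
+// the rest use l bits. For example, with n = 5 (assuming get_unsigned_bits(5)
+// returns 3), m = 3, so v in {0, 1, 2} costs 2 bits and v in {3, 4} costs 3
+// bits, scaled by av1_cost_literal().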
+
+// Similar to store_cfl_required(), but for use during the RDO process,
+// where we haven't yet determined whether this block uses CfL.
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+ const MACROBLOCK *x) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED;
+
+ if (!xd->cfl.is_chroma_reference) {
+ // For non-chroma-reference blocks, we should always store the luma pixels,
+ // in case the corresponding chroma-reference block uses CfL.
+ // Note that this can only happen for block sizes which are <8 on
+ // their shortest side, as otherwise they would be chroma reference
+ // blocks.
+ return CFL_ALLOWED;
+ }
+
+ // For chroma reference blocks, we should store data in the encoder iff we're
+ // allowed to try out CfL.
+ return is_cfl_allowed(xd);
+}
+
+// constants for prune 1 and prune 2 decision boundaries
+#define FAST_EXT_TX_CORR_MID 0.0
+#define FAST_EXT_TX_EDST_MID 0.1
+#define FAST_EXT_TX_CORR_MARGIN 0.5
+#define FAST_EXT_TX_EDST_MARGIN 0.3
+
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode);
+
+static unsigned pixel_dist_visible_only(
+ const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+ const int src_stride, const uint8_t *dst, const int dst_stride,
+ const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+ int visible_cols) {
+ unsigned sse;
+
+ if (txb_rows == visible_rows && txb_cols == visible_cols) {
+ cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ return sse;
+ }
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
+ visible_cols, visible_rows);
+ return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
+ }
+ sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols,
+ visible_rows);
+ return sse;
+}
+
+#if CONFIG_DIST_8X8
+static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride, int coeff_shift) {
+ uint64_t svar = 0;
+ uint64_t dvar = 0;
+ uint64_t sum_s = 0;
+ uint64_t sum_d = 0;
+ uint64_t sum_s2 = 0;
+ uint64_t sum_d2 = 0;
+ uint64_t sum_sd = 0;
+ uint64_t dist = 0;
+
+ int i, j;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ sum_s += src[i * sstride + j];
+ sum_d += dst[i * dstride + j];
+ sum_s2 += src[i * sstride + j] * src[i * sstride + j];
+ sum_d2 += dst[i * dstride + j] * dst[i * dstride + j];
+ sum_sd += src[i * sstride + j] * dst[i * dstride + j];
+ }
+ }
+ /* Compute the variance -- the calculation cannot go negative. */
+ svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
+ dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
+
+ // Tuning of jm's original deringing distortion metric as used in the CDEF
+ // tool, suggested by jm.
+ const uint64_t a = 4;
+ const uint64_t b = 2;
+ const uint64_t c1 = (400 * a << 2 * coeff_shift);
+ const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift);
+
+ dist = (uint64_t)floor(.5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
+ (svar + dvar + c1) /
+ (sqrt(svar * (double)dvar + c2)));
+
+ // Calibrate dist to have similar rate for the same QP with MSE only
+ // distortion (as in master branch)
+ dist = (uint64_t)((float)dist * 0.75);
+
+ return dist;
+}
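+// Since sum_s2 + sum_d2 - 2 * sum_sd is just the plain SSE between src and
+// dst, the metric above is roughly
+//   dist ~= 0.75 * (SSE / 2) * (svar + dvar + c1) / sqrt(svar * dvar + c2),
+// i.e. an SSE term reweighted by how well the source and reconstructed
+// variances agree.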
+
+static int od_compute_var_4x4(uint16_t *x, int stride) {
+ int sum;
+ int s2;
+ int i;
+ sum = 0;
+ s2 = 0;
+ for (i = 0; i < 4; i++) {
+ int j;
+ for (j = 0; j < 4; j++) {
+ int t;
+
+ t = x[i * stride + j];
+ sum += t;
+ s2 += t * t;
+ }
+ }
+
+ return (s2 - (sum * sum >> 4)) >> 4;
+}
+
+/* OD_DIST_LP_MID controls the frequency weighting filter used for computing
+ the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
+ is applied both horizontally and vertically. For X=5, the filter is
+ a good approximation for the OD_QM8_Q4_HVS quantization matrix. */
+#define OD_DIST_LP_MID (5)
+#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)
+
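+// With OD_DIST_LP_MID = 5 the 1-D kernel is [1 5 1] (DC gain 7 =
+// OD_DIST_LP_NORM). The code below applies it unnormalized, first horizontally
+// (tmp) and then vertically (e_lp); od_compute_dist_8x8() then divides the
+// accumulated squared output by OD_DIST_LP_NORM^4, which restores a unit DC
+// response after the two passes and the squaring.
+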
+static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x,
+ uint16_t *y, od_coeff *e_lp, int stride) {
+ double sum;
+ int min_var;
+ double mean_var;
+ double var_stat;
+ double activity;
+ double calibration;
+ int i;
+ int j;
+ double vardist;
+
+ vardist = 0;
+
+#if 1
+ min_var = INT_MAX;
+ mean_var = 0;
+ for (i = 0; i < 3; i++) {
+ for (j = 0; j < 3; j++) {
+ int varx;
+ int vary;
+ varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
+ vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
+ min_var = OD_MINI(min_var, varx);
+ mean_var += 1. / (1 + varx);
+ /* The cast to (double) is to avoid an overflow before the sqrt. */
+ vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
+ }
+ }
+ /* We use a different variance statistic depending on whether activity
+ masking is used, since the harmonic mean appeared slightly worse with
+ masking off. The calibration constant just ensures that we preserve the
+ rate compared to activity=1. */
+ if (use_activity_masking) {
+ calibration = 1.95;
+ var_stat = 9. / mean_var;
+ } else {
+ calibration = 1.62;
+ var_stat = min_var;
+ }
+ /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
+ activity masking constant. */
+ activity = calibration * pow(.25 + var_stat, -1. / 6);
+#else
+ activity = 1;
+#endif // 1
+ sum = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++)
+ sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
+ }
+ /* Normalize the filter to unit DC response. */
+ sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
+ OD_DIST_LP_NORM);
+ return activity * activity * (sum + vardist);
+}
+
+// Note: inputs x and y are in the pixel domain.
+static double od_compute_dist_common(int activity_masking, uint16_t *x,
+ uint16_t *y, int bsize_w, int bsize_h,
+ int qindex, od_coeff *tmp,
+ od_coeff *e_lp) {
+ int i, j;
+ double sum = 0;
+ const int mid = OD_DIST_LP_MID;
+
+ for (j = 0; j < bsize_w; j++) {
+ e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
+ e_lp[(bsize_h - 1) * bsize_w + j] = mid * tmp[(bsize_h - 1) * bsize_w + j] +
+ 2 * tmp[(bsize_h - 2) * bsize_w + j];
+ }
+ for (i = 1; i < bsize_h - 1; i++) {
+ for (j = 0; j < bsize_w; j++) {
+ e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
+ tmp[(i - 1) * bsize_w + j] +
+ tmp[(i + 1) * bsize_w + j];
+ }
+ }
+ for (i = 0; i < bsize_h; i += 8) {
+ for (j = 0; j < bsize_w; j += 8) {
+ sum += od_compute_dist_8x8(activity_masking, &x[i * bsize_w + j],
+ &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
+ bsize_w);
+ }
+ }
+ /* Scale according to linear regression against SSE, for 8x8 blocks. */
+ if (activity_masking) {
+ sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) +
+ (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0);
+ } else {
+ sum *= qindex >= 128
+ ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128)
+ : qindex <= 43 ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43)
+ : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43);
+ }
+
+ return sum;
+}
+
+static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
+ int bsize_h, int qindex) {
+ assert(bsize_w >= 8 && bsize_h >= 8);
+
+ int activity_masking = 0;
+
+ int i, j;
+ DECLARE_ALIGNED(16, od_coeff, e[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
+ for (i = 0; i < bsize_h; i++) {
+ for (j = 0; j < bsize_w; j++) {
+ e[i * bsize_w + j] = x[i * bsize_w + j] - y[i * bsize_w + j];
+ }
+ }
+ int mid = OD_DIST_LP_MID;
+ for (i = 0; i < bsize_h; i++) {
+ tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+ tmp[i * bsize_w + bsize_w - 1] =
+ mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+ for (j = 1; j < bsize_w - 1; j++) {
+ tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
+ e[i * bsize_w + j + 1];
+ }
+ }
+ return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
+ qindex, tmp, e_lp);
+}
+
+static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
+ int bsize_h, int qindex) {
+ assert(bsize_w >= 8 && bsize_h >= 8);
+
+ int activity_masking = 0;
+
+ DECLARE_ALIGNED(16, uint16_t, y[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
+ int i, j;
+ for (i = 0; i < bsize_h; i++) {
+ for (j = 0; j < bsize_w; j++) {
+ y[i * bsize_w + j] = x[i * bsize_w + j] - e[i * bsize_w + j];
+ }
+ }
+ int mid = OD_DIST_LP_MID;
+ for (i = 0; i < bsize_h; i++) {
+ tmp[i * bsize_w] = mid * e[i * bsize_w] + 2 * e[i * bsize_w + 1];
+ tmp[i * bsize_w + bsize_w - 1] =
+ mid * e[i * bsize_w + bsize_w - 1] + 2 * e[i * bsize_w + bsize_w - 2];
+ for (j = 1; j < bsize_w - 1; j++) {
+ tmp[i * bsize_w + j] = mid * e[i * bsize_w + j] + e[i * bsize_w + j - 1] +
+ e[i * bsize_w + j + 1];
+ }
+ }
+ return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
+ qindex, tmp, e_lp);
+}
+
+int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
+ const uint8_t *src, int src_stride, const uint8_t *dst,
+ int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
+ int bsh, int visible_w, int visible_h, int qindex) {
+ int64_t d = 0;
+ int i, j;
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, rec[MAX_SB_SQUARE]);
+
+ assert(bsw >= 8);
+ assert(bsh >= 8);
+ assert((bsw & 0x07) == 0);
+ assert((bsh & 0x07) == 0);
+
+ if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
+ x->tune_metric == AOM_TUNE_DAALA_DIST) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+
+ if ((bsw == visible_w) && (bsh == visible_h)) {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+ } else {
+ for (j = 0; j < visible_h; j++)
+ for (i = 0; i < visible_w; i++)
+ rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+
+ if (visible_w < bsw) {
+ for (j = 0; j < bsh; j++)
+ for (i = visible_w; i < bsw; i++)
+ rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+ }
+
+ if (visible_h < bsh) {
+ for (j = visible_h; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+ }
+ }
+ } else {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
+
+ if ((bsw == visible_w) && (bsh == visible_h)) {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
+ } else {
+ for (j = 0; j < visible_h; j++)
+ for (i = 0; i < visible_w; i++)
+ rec[j * bsw + i] = dst[j * dst_stride + i];
+
+ if (visible_w < bsw) {
+ for (j = 0; j < bsh; j++)
+ for (i = visible_w; i < bsw; i++)
+ rec[j * bsw + i] = src[j * src_stride + i];
+ }
+
+ if (visible_h < bsh) {
+ for (j = visible_h; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ rec[j * bsw + i] = src[j * src_stride + i];
+ }
+ }
+ }
+ }
+
+ if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
+ d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
+ } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
+ int coeff_shift = AOMMAX(xd->bd - 8, 0);
+
+ for (i = 0; i < bsh; i += 8) {
+ for (j = 0; j < bsw; j += 8) {
+ d += cdef_dist_8x8_16bit(&rec[i * bsw + j], bsw, &orig[i * bsw + j],
+ bsw, coeff_shift);
+ }
+ }
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ d = ((uint64_t)d) >> 2 * coeff_shift;
+ } else {
+ // Otherwise, MSE by default
+ d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
+ tx_bsize, bsh, bsw, visible_h, visible_w);
+ }
+
+ return d;
+}
+
+static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
+ int src_stride, const int16_t *diff,
+ int diff_stride, int bsw, int bsh, int visible_w,
+ int visible_h, int qindex) {
+ int64_t d = 0;
+ int i, j;
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, int16_t, diff16[MAX_SB_SQUARE]);
+
+ assert(bsw >= 8);
+ assert(bsh >= 8);
+ assert((bsw & 0x07) == 0);
+ assert((bsh & 0x07) == 0);
+
+ if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
+ x->tune_metric == AOM_TUNE_DAALA_DIST) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+ } else {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
+ }
+
+ if ((bsw == visible_w) && (bsh == visible_h)) {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ diff16[j * bsw + i] = diff[j * diff_stride + i];
+ } else {
+ for (j = 0; j < visible_h; j++)
+ for (i = 0; i < visible_w; i++)
+ diff16[j * bsw + i] = diff[j * diff_stride + i];
+
+ if (visible_w < bsw) {
+ for (j = 0; j < bsh; j++)
+ for (i = visible_w; i < bsw; i++) diff16[j * bsw + i] = 0;
+ }
+
+ if (visible_h < bsh) {
+ for (j = visible_h; j < bsh; j++)
+ for (i = 0; i < bsw; i++) diff16[j * bsw + i] = 0;
+ }
+ }
+ }
+
+ if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
+ d = (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
+ } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
+ int coeff_shift = AOMMAX(xd->bd - 8, 0);
+ DECLARE_ALIGNED(16, uint16_t, dst16[MAX_SB_SQUARE]);
+
+ for (i = 0; i < bsh; i++) {
+ for (j = 0; j < bsw; j++) {
+ dst16[i * bsw + j] = orig[i * bsw + j] - diff16[i * bsw + j];
+ }
+ }
+
+ for (i = 0; i < bsh; i += 8) {
+ for (j = 0; j < bsw; j += 8) {
+ d += cdef_dist_8x8_16bit(&dst16[i * bsw + j], bsw, &orig[i * bsw + j],
+ bsw, coeff_shift);
+ }
+ }
+ // Don't scale 'd' for HBD since the caller will do it for diff input.
+ } else {
+ // Otherwise, MSE by default
+ d = aom_sum_squares_2d_i16(diff, diff_stride, visible_w, visible_h);
+ }
+
+ return d;
+}
+#endif // CONFIG_DIST_8X8
+
+static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ int need_4th, double *hordist,
+ double *verdist) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
+ // Special cases: calculate 'esq' values manually, as we don't have 'vf'
+ // functions for the 16 (very small) sub-blocks of this block.
+ const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;
+ const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;
+ assert(bw <= 32);
+ assert(bh <= 32);
+ assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
+ if (cpi->common.seq_params.use_highbitdepth) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] +=
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+ }
+ } else {
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+ (src[j + i * src_stride] - dst[j + i * dst_stride]);
+ }
+ }
+ } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
+ const int f_index =
+ (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
+ assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
+ const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
+ assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
+ assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[1]);
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[2]);
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[3]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[5]);
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[6]);
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[7]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[9]);
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[10]);
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[11]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[13]);
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[14]);
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[15]);
+ }
+
+ double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
+ esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
+ esq[12] + esq[13] + esq[14] + esq[15];
+ if (total > 0) {
+ const double e_recip = 1.0 / total;
+ hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
+ hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
+ hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+ if (need_4th) {
+ hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip;
+ }
+ verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
+ verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
+ verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+ if (need_4th) {
+ verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip;
+ }
+ } else {
+ hordist[0] = verdist[0] = 0.25;
+ hordist[1] = verdist[1] = 0.25;
+ hordist[2] = verdist[2] = 0.25;
+ if (need_4th) {
+ hordist[3] = verdist[3] = 0.25;
+ }
+ }
+}
+
+static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride) {
+ int prune_bitmask = 0;
+ double svm_proj_h = 0, svm_proj_v = 0;
+ double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
+ get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, 0,
+ hdist, vdist);
+
+ svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
+ vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
+ svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] +
+ hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
+ if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << FLIPADST_1D;
+ else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << ADST_1D;
+
+ if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << (FLIPADST_1D + 8);
+ else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << (ADST_1D + 8);
+
+ return prune_bitmask;
+}
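+// The projections above act as a small linear classifier on the 3-bin energy
+// distributions: values clearly above FAST_EXT_TX_EDST_MID + MARGIN flag
+// FLIPADST for pruning in that direction, values clearly below MID - MARGIN
+// flag ADST, and anything inside the margin prunes neither. Bits 0..7 of the
+// returned mask carry the vertical decision and bits 8 and up the horizontal
+// one.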
+
+static void get_horver_correlation(const int16_t *diff, int stride, int w,
+ int h, double *hcorr, double *vcorr) {
+ // Returns the horizontal/vertical correlation coefficients via *hcorr/*vcorr.
+ const int num = (h - 1) * (w - 1);
+ double num_r;
+ int i, j;
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, y_sum = 0, z_sum = 0;
+ int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0;
+ double x_var_n, y_var_n, z_var_n, xy_var_n, xz_var_n;
+ *hcorr = *vcorr = 1;
+
+ assert(num > 0);
+ num_r = 1.0 / num;
+ for (i = 1; i < h; ++i) {
+ for (j = 1; j < w; ++j) {
+ const int16_t x = diff[i * stride + j];
+ const int16_t y = diff[i * stride + j - 1];
+ const int16_t z = diff[(i - 1) * stride + j];
+ xy_sum += x * y;
+ xz_sum += x * z;
+ x_sum += x;
+ y_sum += y;
+ z_sum += z;
+ x2_sum += x * x;
+ y2_sum += y * y;
+ z2_sum += z * z;
+ }
+ }
+ x_var_n = x2_sum - (x_sum * x_sum) * num_r;
+ y_var_n = y2_sum - (y_sum * y_sum) * num_r;
+ z_var_n = z2_sum - (z_sum * z_sum) * num_r;
+ xy_var_n = xy_sum - (x_sum * y_sum) * num_r;
+ xz_var_n = xz_sum - (x_sum * z_sum) * num_r;
+ if (x_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrt(x_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ }
+ if (x_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrt(x_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ }
+}
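+// hcorr/vcorr are (clamped) Pearson correlation coefficients between each
+// pixel and its left/top neighbor respectively:
+//   hcorr = cov(x, y) / sqrt(var(x) * var(y)), and similarly for vcorr with z.
+// Negative correlations are clamped to 0, and degenerate (zero-variance)
+// blocks keep the initial value of 1.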
+
+static int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
+ double hcorr, vcorr;
+ int prune_bitmask = 0;
+ get_horver_correlation(diff, stride, w, h, &hcorr, &vcorr);
+
+ if (vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << IDTX_1D;
+ else if (vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << DCT_1D;
+
+ if (hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << (IDTX_1D + 8);
+ else if (hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << (DCT_1D + 8);
+ return prune_bitmask;
+}
+
+// Performance drop: 0.5%, Speed improvement: 24%
+static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, const MACROBLOCKD *xd,
+ int adst_flipadst, int dct_idtx) {
+ int prune = 0;
+
+ if (adst_flipadst) {
+ const struct macroblock_plane *const p = &x->plane[0];
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+ }
+ if (dct_idtx) {
+ av1_subtract_plane(x, bsize, 0);
+ const struct macroblock_plane *const p = &x->plane[0];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ prune |= dct_vs_idtx(p->src_diff, bw, bw, bh);
+ }
+
+ return prune;
+}
+
+// Performance drop: 0.3%, Speed improvement: 5%
+static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ const MACROBLOCK *x, const MACROBLOCKD *xd) {
+ const struct macroblock_plane *const p = &x->plane[0];
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride);
+}
+
+// 1D transforms used in the inter set; this needs to be changed if
+// ext_tx_used_inter is changed.
+static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
+ { 1, 0, 0, 0 },
+ { 1, 1, 1, 1 },
+ { 1, 1, 1, 1 },
+ { 1, 0, 0, 1 },
+};
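+// The columns above are indexed by TX_TYPE_1D, i.e. { DCT_1D, ADST_1D,
+// FLIPADST_1D, IDTX_1D }, and the rows by the inter ext-tx set index, matching
+// how prune_tx() reads tx_set_1D[] below.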
+
+static void get_energy_distribution_finer(const int16_t *diff, int stride,
+ int bw, int bh, float *hordist,
+ float *verdist) {
+ // First compute downscaled block energy values (esq); downscale factors
+ // are defined by w_shift and h_shift.
+ unsigned int esq[256];
+ const int w_shift = bw <= 8 ? 0 : 1;
+ const int h_shift = bh <= 8 ? 0 : 1;
+ const int esq_w = bw >> w_shift;
+ const int esq_h = bh >> h_shift;
+ const int esq_sz = esq_w * esq_h;
+ int i, j;
+ memset(esq, 0, esq_sz * sizeof(esq[0]));
+ if (w_shift) {
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j += 2) {
+ cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] +
+ cur_diff_row[j + 1] * cur_diff_row[j + 1]);
+ }
+ }
+ } else {
+ for (i = 0; i < bh; i++) {
+ unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w;
+ const int16_t *cur_diff_row = diff + i * stride;
+ for (j = 0; j < bw; j++) {
+ cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j];
+ }
+ }
+ }
+
+ uint64_t total = 0;
+ for (i = 0; i < esq_sz; i++) total += esq[i];
+
+ // Output hordist and verdist arrays are normalized 1D projections of esq
+ if (total == 0) {
+ float hor_val = 1.0f / esq_w;
+ for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val;
+ float ver_val = 1.0f / esq_h;
+ for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val;
+ return;
+ }
+
+ const float e_recip = 1.0f / (float)total;
+ memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0]));
+ memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0]));
+ const unsigned int *cur_esq_row;
+ for (i = 0; i < esq_h - 1; i++) {
+ cur_esq_row = esq + i * esq_w;
+ for (j = 0; j < esq_w - 1; j++) {
+ hordist[j] += (float)cur_esq_row[j];
+ verdist[i] += (float)cur_esq_row[j];
+ }
+ verdist[i] += (float)cur_esq_row[j];
+ }
+ cur_esq_row = esq + i * esq_w;
+ for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j];
+
+ for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip;
+ for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
+}
+
+// Similar to get_horver_correlation, but also takes the first row/column into
+// account when computing the horizontal/vertical correlation.
+static void get_horver_correlation_full(const int16_t *diff, int stride, int w,
+ int h, float *hcorr, float *vcorr) {
+ const float num_hor = (float)(h * (w - 1));
+ const float num_ver = (float)((h - 1) * w);
+ int i, j;
+
+ // The following notation is used:
+ // x - current pixel
+ // y - left neighbor pixel
+ // z - top neighbor pixel
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t xhor_sum = 0, xver_sum = 0, y_sum = 0, z_sum = 0;
+ int64_t x2hor_sum = 0, x2ver_sum = 0, y2_sum = 0, z2_sum = 0;
+
+ int16_t x, y, z;
+ for (j = 1; j < w; ++j) {
+ x = diff[j];
+ y = diff[j - 1];
+ xy_sum += x * y;
+ xhor_sum += x;
+ y_sum += y;
+ x2hor_sum += x * x;
+ y2_sum += y * y;
+ }
+ for (i = 1; i < h; ++i) {
+ x = diff[i * stride];
+ z = diff[(i - 1) * stride];
+ xz_sum += x * z;
+ xver_sum += x;
+ z_sum += z;
+ x2ver_sum += x * x;
+ z2_sum += z * z;
+ for (j = 1; j < w; ++j) {
+ x = diff[i * stride + j];
+ y = diff[i * stride + j - 1];
+ z = diff[(i - 1) * stride + j];
+ xy_sum += x * y;
+ xz_sum += x * z;
+ xhor_sum += x;
+ xver_sum += x;
+ y_sum += y;
+ z_sum += z;
+ x2hor_sum += x * x;
+ x2ver_sum += x * x;
+ y2_sum += y * y;
+ z2_sum += z * z;
+ }
+ }
+ const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+ const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+ const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+ const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+ const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+ const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+ *hcorr = *vcorr = 1;
+ if (xhor_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ }
+ if (xver_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ }
+}
+
+// Transforms raw scores into a probability distribution across 16 TX types
+static void score_2D_transform_pow8(float *scores_2D, float shift) {
+ float sum = 0.0f;
+ int i;
+
+ for (i = 0; i < 16; i++) {
+ float v, v2, v4;
+ v = AOMMAX(scores_2D[i] + shift, 0.0f);
+ v2 = v * v;
+ v4 = v2 * v2;
+ scores_2D[i] = v4 * v4;
+ sum += scores_2D[i];
+ }
+ for (i = 0; i < 16; i++) scores_2D[i] /= sum;
+}
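+// Each shifted score is clamped at 0 and raised to the 8th power ((v^2)^2
+// squared), then the scores are renormalized to sum to 1. This acts like a
+// sharpened softmax, concentrating the probability mass on the best-scoring
+// TX types.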
+
+// These thresholds were calibrated to prune a certain number of TX types by
+// the model on average, i.e. selecting the threshold with index i will lead
+// to pruning i+1 TX types on average.
+static const float *prune_2D_adaptive_thresholds[] = {
+ // TX_4X4
+ (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+ 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+ 0.09778f, 0.11780f },
+ // TX_8X8
+ (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+ 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+ 0.10803f, 0.14124f },
+ // TX_16X16
+ (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+ 0.06897f, 0.07629f, 0.08875f, 0.11169f },
+ // TX_32X32
+ NULL,
+ // TX_64X64
+ NULL,
+ // TX_4X8
+ (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+ 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+ 0.10168f, 0.12585f },
+ // TX_8X4
+ (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+ 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+ 0.10583f, 0.13123f },
+ // TX_8X16
+ (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+ 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+ 0.10730f, 0.14221f },
+ // TX_16X8
+ (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+ 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+ 0.10339f, 0.13464f },
+ // TX_16X32
+ NULL,
+ // TX_32X16
+ NULL,
+ // TX_32X64
+ NULL,
+ // TX_64X32
+ NULL,
+ // TX_4X16
+ (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+ 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+ 0.10242f, 0.12878f },
+ // TX_16X4
+ (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+ 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+ 0.10217f, 0.12610f },
+ // TX_8X32
+ NULL,
+ // TX_32X8
+ NULL,
+ // TX_16X64
+ NULL,
+ // TX_64X16
+ NULL,
+};
+
+static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int blk_row, int blk_col, TxSetType tx_set_type,
+ TX_TYPE_PRUNE_MODE prune_mode) {
+ static const int tx_type_table_2D[16] = {
+ DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT,
+ ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST,
+ FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+ H_DCT, H_ADST, H_FLIPADST, IDTX
+ };
+ if (tx_set_type != EXT_TX_SET_ALL16 &&
+ tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT)
+ return 0;
+ const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+ const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+ if (!nn_config_hor || !nn_config_ver) return 0; // Model not established yet.
+
+ aom_clear_system_state();
+ float hfeatures[16], vfeatures[16];
+ float hscores[4], vscores[4];
+ float scores_2D[16];
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ const int hfeatures_num = bw <= 8 ? bw : bw / 2;
+ const int vfeatures_num = bh <= 8 ? bh : bh / 2;
+ assert(hfeatures_num <= 16);
+ assert(vfeatures_num <= 16);
+
+ const struct macroblock_plane *const p = &x->plane[0];
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
+ vfeatures);
+ get_horver_correlation_full(diff, diff_stride, bw, bh,
+ &hfeatures[hfeatures_num - 1],
+ &vfeatures[vfeatures_num - 1]);
+ av1_nn_predict(hfeatures, nn_config_hor, hscores);
+ av1_nn_predict(vfeatures, nn_config_ver, vscores);
+
+ float score_2D_average = 0.0f;
+ for (int i = 0; i < 4; i++) {
+ float *cur_scores_2D = scores_2D + i * 4;
+ cur_scores_2D[0] = vscores[i] * hscores[0];
+ cur_scores_2D[1] = vscores[i] * hscores[1];
+ cur_scores_2D[2] = vscores[i] * hscores[2];
+ cur_scores_2D[3] = vscores[i] * hscores[3];
+ score_2D_average += cur_scores_2D[0] + cur_scores_2D[1] + cur_scores_2D[2] +
+ cur_scores_2D[3];
+ }
+ score_2D_average /= 16;
+
+ const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } };
+ int pruning_aggressiveness = 1;
+ if (tx_set_type == EXT_TX_SET_ALL16) {
+ score_2D_transform_pow8(scores_2D, (10 - score_2D_average));
+ pruning_aggressiveness =
+ prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0];
+ } else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) {
+ score_2D_transform_pow8(scores_2D, (20 - score_2D_average));
+ pruning_aggressiveness =
+ prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1];
+ }
+
+ // Always keep the TX type with the highest score, prune all others with
+ // score below score_thresh.
+ int max_score_i = 0;
+ float max_score = 0.0f;
+ for (int i = 0; i < 16; i++) {
+ if (scores_2D[i] > max_score &&
+ av1_ext_tx_used[tx_set_type][tx_type_table_2D[i]]) {
+ max_score = scores_2D[i];
+ max_score_i = i;
+ }
+ }
+
+ const float score_thresh =
+ prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
+
+ uint16_t prune_bitmask = 0;
+ for (int i = 0; i < 16; i++) {
+ if (scores_2D[i] < score_thresh && i != max_score_i)
+ prune_bitmask |= (1 << tx_type_table_2D[i]);
+ }
+ return prune_bitmask;
+}
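+// For example, with prune_mode == PRUNE_2D_ACCURATE (row 0 of
+// prune_aggr_table) and tx_set_type == EXT_TX_SET_ALL16,
+// pruning_aggressiveness is 6, so the threshold is
+// prune_2D_adaptive_thresholds[tx_size][5]; every TX type whose normalized
+// score falls below it, except the single best one, gets its bit set in the
+// returned prune mask.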
+
+// ((prune >> vtx_tab[tx_type]) & 1)
+static const uint16_t prune_v_mask[] = {
+ 0x0000, 0x0425, 0x108a, 0x14af, 0x4150, 0x4575, 0x51da, 0x55ff,
+ 0xaa00, 0xae25, 0xba8a, 0xbeaf, 0xeb50, 0xef75, 0xfbda, 0xffff,
+};
+
+// ((prune >> (htx_tab[tx_type] + 8)) & 1)
+static const uint16_t prune_h_mask[] = {
+ 0x0000, 0x0813, 0x210c, 0x291f, 0x80e0, 0x88f3, 0xa1ec, 0xa9ff,
+ 0x5600, 0x5e13, 0x770c, 0x7f1f, 0xd6e0, 0xdef3, 0xf7ec, 0xffff,
+};
+
+static INLINE uint16_t gen_tx_search_prune_mask(int tx_search_prune) {
+ uint8_t prune_v = tx_search_prune & 0x0F;
+ uint8_t prune_h = (tx_search_prune >> 8) & 0x0F;
+ return (prune_v_mask[prune_v] & prune_h_mask[prune_h]);
+}
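+// Roughly: bit t of the result is set iff the vertical 1-D component of TX
+// type t (vtx_tab[t]) is flagged in the low nibble of tx_search_prune and its
+// horizontal component (htx_tab[t]) is flagged in bits 8..11; the two tables
+// above simply enumerate these conditions for every 4-bit flag pattern.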
+
+static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
+ const MACROBLOCKD *const xd, int tx_set_type) {
+ x->tx_search_prune[tx_set_type] = 0;
+ x->tx_split_prune_flag = 0;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
+ x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] ||
+ x->cb_partition_scan)
+ return;
+ int tx_set = ext_tx_set_index[1][tx_set_type];
+ assert(tx_set >= 0);
+ const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
+ int prune = 0;
+ switch (cpi->sf.tx_type_search.prune_mode) {
+ case NO_PRUNE: return;
+ case PRUNE_ONE:
+ if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) return;
+ prune = prune_one_for_sby(cpi, bsize, x, xd);
+ x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
+ break;
+ case PRUNE_TWO:
+ if (!(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
+ if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return;
+ prune = prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
+ } else if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) {
+ prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
+ } else {
+ prune = prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
+ }
+ x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
+ break;
+ case PRUNE_2D_ACCURATE:
+ case PRUNE_2D_FAST: break;
+ default: assert(0);
+ }
+}
+
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ (void)num_samples;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+
+ // Fast approximation of the modelling function.
+ if (cpi->sf.simple_model_rd_from_var) {
+ const int64_t square_error = sse;
+ int quantizer = pd->dequant_Q3[1] >> dequant_shift;
+ if (quantizer < 120)
+ *rate = (int)AOMMIN(
+ (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+ INT_MAX);
+ else
+ *rate = 0;
+ assert(*rate >= 0);
+ *dist = (square_error * quantizer) >> 8;
+ } else {
+ av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
+ pd->dequant_Q3[1] >> dequant_shift, rate,
+ dist);
+ }
+ *dist <<= 4;
+}
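+// In the fast path above, with effective quantizer q = dequant_Q3[1] >>
+// dequant_shift, the model is roughly rate ~= sse * (280 - q) >>
+// (16 - AV1_PROB_COST_SHIFT) for q < 120 (0 otherwise) and dist ~=
+// (sse * q) >> 8; the final `*dist <<= 4` puts the distortion on the same
+// x16 scale used by the other distortion values in this file.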
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ int64_t total_sse = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y);
+ unsigned int sse;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ &sse);
+ total_sse += sse;
+ }
+ total_sse <<= 4;
+ return total_sse;
+}
+#endif
+
+static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+ int plane_to, int mi_row, int mi_col,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse,
+ int64_t *plane_dist) {
+ // Note that our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is also scaled by 8. To get the effective
+ // quantizer we need to divide by 8 before sending it to the modeling
+ // function.
+ int plane;
+ (void)mi_row;
+ (void)mi_col;
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ int64_t sse;
+ int rate;
+ int64_t dist;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+ model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ assert(rate_sum >= 0);
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ rate_sum = AOMMIN(rate_sum, INT_MAX);
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+ int plane_to, int *skip_txfm_sb) {
+ *skip_txfm_sb = 1;
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bs =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ unsigned int sse;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ // Since the fast HBD variance functions scale down sse by 4 bits, we first
+ // use the fast vf implementation to rule out blocks with a non-zero scaled
+ // sse. Then, only if the source is HBD and the scaled sse is 0, an accurate
+ // sse computation is applied to determine whether the sse is really 0. This
+ // step is necessary for HBD lossless coding.
+ cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ &sse);
+ if (sse) {
+ *skip_txfm_sb = 0;
+ return;
+ } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint64_t sse64 = aom_highbd_sse_odd_size(
+ p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ block_size_wide[bs], block_size_high[bs]);
+
+ if (sse64) {
+ *skip_txfm_sb = 0;
+ return;
+ }
+ }
+ }
+ return;
+}
+
+int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += coeff[i] * coeff[i];
+ }
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bd) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int64_t diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
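+// For high bit depth the error and sqcoeff sums are rounded and shifted down
+// by 2 * (bd - 8) bits, so 10- and 12-bit block errors end up on the same
+// scale as 8-bit ones.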
+
+// Get transform block visible dimensions cropped to the MI units.
+static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
+ BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+ BLOCK_SIZE tx_bsize, int *width, int *height,
+ int *visible_width, int *visible_height) {
+ assert(tx_bsize <= plane_bsize);
+ int txb_height = block_size_high[tx_bsize];
+ int txb_width = block_size_wide[tx_bsize];
+ const int block_height = block_size_high[plane_bsize];
+ const int block_width = block_size_wide[plane_bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // TODO(aconverse@google.com): Investigate using crop_width/height here rather
+ // than the MI size
+ const int block_rows =
+ (xd->mb_to_bottom_edge >= 0)
+ ? block_height
+ : (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height;
+ const int block_cols =
+ (xd->mb_to_right_edge >= 0)
+ ? block_width
+ : (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width;
+ const int tx_unit_size = tx_size_wide_log2[0];
+ if (width) *width = txb_width;
+ if (height) *height = txb_height;
+ *visible_width = clamp(block_cols - (blk_col << tx_unit_size), 0, txb_width);
+ *visible_height =
+ clamp(block_rows - (blk_row << tx_unit_size), 0, txb_height);
+}
+
+// Compute the pixel domain distortion from src and dst on all visible 4x4s in
+// the transform block.
+static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
+ int plane, const uint8_t *src, const int src_stride,
+ const uint8_t *dst, const int dst_stride,
+ int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ int txb_rows, txb_cols, visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+ &txb_cols, &txb_rows, &visible_cols, &visible_rows);
+ assert(visible_rows > 0);
+ assert(visible_cols > 0);
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8 && plane == 0)
+ return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
+ tx_bsize, txb_cols, txb_rows, visible_cols,
+ visible_rows, x->qindex);
+#endif // CONFIG_DIST_8X8
+
+ unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst,
+ dst_stride, tx_bsize, txb_rows,
+ txb_cols, visible_rows, visible_cols);
+
+ return sse;
+}
+
+// Compute the pixel domain distortion from diff on all visible 4x4s in the
+// transform block.
+static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
+ int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ int visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+#if CONFIG_DIST_8X8
+ int txb_height = block_size_high[tx_bsize];
+ int txb_width = block_size_wide[tx_bsize];
+ if (x->using_dist_8x8 && plane == 0) {
+ const int src_stride = x->plane[plane].src.stride;
+ const int src_idx = (blk_row * src_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int diff_idx = (blk_row * diff_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ return dist_8x8_diff(x, src, src_stride, diff + diff_idx, diff_stride,
+ txb_width, txb_height, visible_cols, visible_rows,
+ x->qindex);
+ }
+#endif
+ diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
+ return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
+}
+
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count) {
+ const int max_pix_val = 1 << 8;
+ memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int this_val = src[r * stride + c];
+ assert(this_val < max_pix_val);
+ ++val_count[this_val];
+ }
+ }
+ int n = 0;
+ for (int i = 0; i < max_pix_val; ++i) {
+ if (val_count[i]) ++n;
+ }
+ return n;
+}
+
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+ int bit_depth, int *val_count) {
+ assert(bit_depth <= 12);
+ const int max_pix_val = 1 << bit_depth;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ const int this_val = src[r * stride + c];
+ assert(this_val < max_pix_val);
+ if (this_val >= max_pix_val) return 0;
+ ++val_count[this_val];
+ }
+ }
+ int n = 0;
+ for (int i = 0; i < max_pix_val; ++i) {
+ if (val_count[i]) ++n;
+ }
+ return n;
+}
+
+static void inverse_transform_block_facade(MACROBLOCKD *xd, int plane,
+ int block, int blk_row, int blk_col,
+ int eob, int reduced_tx_set) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, reduced_tx_set);
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ dst_stride, eob, reduced_tx_set);
+}
+
+static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash);
+
+static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size) {
+ int16_t tmp_data[64 * 64];
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+ const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ const int txb_w = tx_size_wide[tx_size];
+ const int txb_h = tx_size_high[tx_size];
+ uint8_t *hash_data = (uint8_t *)cur_diff_row;
+ if (txb_w != diff_stride) {
+ int16_t *cur_hash_row = tmp_data;
+ for (int i = 0; i < txb_h; i++) {
+ memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w);
+ cur_hash_row += txb_w;
+ cur_diff_row += diff_stride;
+ }
+ hash_data = (uint8_t *)tmp_data;
+ }
+ CRC32C *crc = &x->mb_rd_record.crc_calculator;
+ const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
+ return (hash << 5) + tx_size;
+}
+
+static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
+ TX_SIZE tx_size, int64_t *out_dist,
+ int64_t *out_sse) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // Transform domain distortion computation is more efficient as it does
+ // not involve an inverse transform, but it is less accurate.
+ const int buffer_length = av1_get_max_eob(tx_size);
+ int64_t this_sse;
+ // TX-domain results need to shift down to Q2/D10 to match pixel
+ // domain distortion values which are in Q2^2
+ int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
+ xd->bd);
+ else
+ *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+
+ *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
+ *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
+}
+
+static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
+ int plane, BLOCK_SIZE plane_bsize,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const uint16_t eob = p->eobs[block];
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const int bsw = block_size_wide[tx_bsize];
+ const int bsh = block_size_high[tx_bsize];
+ const int src_stride = x->plane[plane].src.stride;
+ const int dst_stride = xd->plane[plane].dst.stride;
+ // Scale the transform block index to pixel units.
+ const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
+ const int dst_idx = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0];
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+ const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+
+ assert(cpi != NULL);
+ assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+ uint8_t *recon;
+ DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ recon = CONVERT_TO_BYTEPTR(recon16);
+ av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
+ CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
+ bsh, NULL, NULL, 0, 0, NULL, xd->bd);
+ } else {
+ recon = (uint8_t *)recon16;
+ av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
+ NULL, 0, 0, NULL);
+ }
+
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
+ cpi->common.reduced_tx_set_used);
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
+ MAX_TX_SIZE, eob,
+ cpi->common.reduced_tx_set_used);
+
+ return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+}
+
+static double get_mean(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ sum += diff[j * stride + i];
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ const int err = diff[j * stride + i];
+ sum += err * err;
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ sum += abs(diff[j * stride + i]);
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static void get_2x2_normalized_sses_and_sads(
+ const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src,
+ int src_stride, const uint8_t *const dst, int dst_stride,
+ const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr,
+ double *const sad_norm_arr) {
+ const BLOCK_SIZE tx_bsize_half =
+ get_partition_subsize(tx_bsize, PARTITION_SPLIT);
+ if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats
+ const int half_width = block_size_wide[tx_bsize] / 2;
+ const int half_height = block_size_high[tx_bsize] / 2;
+ for (int row = 0; row < 2; ++row) {
+ for (int col = 0; col < 2; ++col) {
+ const int16_t *const this_src_diff =
+ src_diff + row * half_height * diff_stride + col * half_width;
+ if (sse_norm_arr) {
+ sse_norm_arr[row * 2 + col] =
+ get_sse_norm(this_src_diff, diff_stride, half_width, half_height);
+ }
+ if (sad_norm_arr) {
+ sad_norm_arr[row * 2 + col] =
+ get_sad_norm(this_src_diff, diff_stride, half_width, half_height);
+ }
+ }
+ }
+ } else { // use function pointers to calculate stats
+ const int half_width = block_size_wide[tx_bsize_half];
+ const int half_height = block_size_high[tx_bsize_half];
+ const int num_samples_half = half_width * half_height;
+ for (int row = 0; row < 2; ++row) {
+ for (int col = 0; col < 2; ++col) {
+ const uint8_t *const this_src =
+ src + row * half_height * src_stride + col * half_width;
+ const uint8_t *const this_dst =
+ dst + row * half_height * dst_stride + col * half_width;
+
+ if (sse_norm_arr) {
+ unsigned int this_sse;
+ cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+ dst_stride, &this_sse);
+ sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
+ }
+
+ if (sad_norm_arr) {
+ const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
+ this_src, src_stride, this_dst, dst_stride);
+ sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
+ }
+ }
+ }
+ }
+}
+
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
+#if CONFIG_COLLECT_RD_STATS
+
+#if CONFIG_COLLECT_RD_STATS == 1
+static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const RD_STATS *const rd_stats, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int64_t rd) {
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 21743;
+ if (lcg_rand16(&seed) % 256 > 0) return;
+
+ const char output_file[] = "tu_stats.txt";
+ FILE *fout = fopen(output_file, "a");
+ if (!fout) return;
+
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide[tx_size];
+ const int txh = tx_size_high[tx_size];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int q_step = pd->dequant_Q3[1] >> dequant_shift;
+ const double num_samples = txw * txh;
+
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+
+ fprintf(fout, "%g %g", rate_norm, dist_norm);
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ unsigned int sse;
+ cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm = (double)sad / num_samples;
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *const src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+
+ fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size],
+ tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col);
+
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+
+ const double mean = get_mean(src_diff, diff_stride, txw, txh);
+ double hor_corr, vert_corr;
+ get_horver_correlation(src_diff, diff_stride, txw, txh, &hor_corr,
+ &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride,
+ 1, hdist, vdist);
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ fprintf(fout, " %d %" PRId64, x->rdmult, rd);
+
+ fprintf(fout, "\n");
+ fclose(fout);
+}
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+#if CONFIG_COLLECT_RD_STATS >= 2
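+// Appends one line of per-prediction-unit rate/distortion statistics to
+// pu_stats.txt, again sampling roughly 1/256 of calls to limit output size.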
+static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const RD_STATS *const rd_stats,
+ BLOCK_SIZE plane_bsize) {
+ if (rd_stats->invalid_rate) return;
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+ // Generate small sample to restrict output size.
+ static unsigned int seed = 95014;
+ if (lcg_rand16(&seed) % 256 > 0) return;
+
+ const char output_file[] = "pu_stats.txt";
+ FILE *fout = fopen(output_file, "a");
+ if (!fout) return;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+ const int num_samples = bw * bh;
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int q_step = pd->dequant_Q3[1] >> dequant_shift;
+
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+ const double rdcost_norm =
+ (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
+
+ fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src = p->src.buf;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst = pd->dst.buf;
+ const int16_t *const src_diff = p->src_diff;
+ const int shift = (xd->bd - 8);
+
+ int64_t sse = aom_sum_squares_2d_i16(src_diff, diff_stride, bw, bh);
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm =
+ (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ if (shift) {
+ for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+ for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
+
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rdcost_norm =
+ (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+ model_rdcost_norm);
+
+ double mean = get_mean(src_diff, diff_stride, bw, bh);
+ mean /= (1 << shift);
+ double hor_corr, vert_corr;
+ get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, 1, hdist, vdist);
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ fprintf(fout, "\n");
+ fclose(fout);
+}
+#endif // CONFIG_COLLECT_RD_STATS >= 2
+#endif // CONFIG_COLLECT_RD_STATS
+
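+// Estimates rate and distortion for the given plane with a small neural
+// network (av1_pustats_rate/dist_nnconfig) evaluated on features derived from
+// the prediction residual. Chroma planes fall back to the curve-fit model,
+// and a skip (all-zero) outcome is substituted when it has a lower RD cost.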
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int log_numpels = num_pels_log2_lookup[plane_bsize];
+
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
+ const struct macroblock_plane *const p = &x->plane[plane];
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+ const int src_stride = p->src.stride;
+ const uint8_t *const src = p->src.buf;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst = pd->dst.buf;
+ const int16_t *const src_diff = p->src_diff;
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int shift = (xd->bd - 8);
+
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ if (plane) {
+ int model_rate;
+ int64_t model_dist;
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ if (rate) *rate = model_rate;
+ if (dist) *dist = model_dist;
+ return;
+ }
+
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+
+ double sse_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, NULL);
+ double mean = get_mean(src_diff, bw, bw, bh);
+ if (shift) {
+ for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+ mean /= (1 << shift);
+ }
+ double sse_norm_sum = 0.0, sse_frac_arr[3];
+ for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k];
+ for (int k = 0; k < 3; ++k)
+ sse_frac_arr[k] =
+ sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25;
+ const double q_sqr = (double)(q_step * q_step);
+ const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0);
+ const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0);
+ double hor_corr, vert_corr;
+ get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
+
+ float features[NUM_FEATURES_PUSTATS];
+ features[0] = (float)hor_corr;
+ features[1] = (float)log_numpels;
+ features[2] = (float)mean_sqr_by_sse_norm;
+ features[3] = (float)q_sqr_by_sse_norm;
+ features[4] = (float)sse_frac_arr[0];
+ features[5] = (float)sse_frac_arr[1];
+ features[6] = (float)sse_frac_arr[2];
+ features[7] = (float)vert_corr;
+
+ float rate_f, dist_by_sse_norm_f;
+ av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
+ av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
+ const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+ return;
+}
+
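+// Applies model_rd_with_dnn() to every coded plane of the block and
+// accumulates the per-plane rate, distortion and SSE into the outputs.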
+static void model_rd_for_sb_with_dnn(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
+  // Note that our transform coefficients are scaled by 8 relative to an
+  // orthogonal transform, so the quantizer step is also 8 times larger. To
+  // get the effective quantizer we need to divide by 8 before sending it to
+  // the modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+
+ model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+// Fits a surface for rate and distortion using as features:
+// log2(sse_norm + 1) and log2(sse_norm/qstep^2)
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist) {
+ (void)cpi;
+ (void)plane_bsize;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+ const double qstepsqr = (double)qstep * qstep;
+ const double xm = log(sse_norm + 1.0) / log(2.0);
+ const double yl = log(sse_norm / qstepsqr) / log(2.0);
+ double rate_f, dist_by_sse_norm_f;
+
+ av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f);
+
+ const double dist_f = dist_by_sse_norm_f * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+}
+
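+// Block-level wrapper around model_rd_with_surffit(): computes the SSE of
+// each coded plane and accumulates the per-plane rate/distortion estimates.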
+static void model_rd_for_sb_with_surffit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
+  // Note that our transform coefficients are scaled by 8 relative to an
+  // orthogonal transform, so the quantizer step is also 8 times larger. To
+  // get the effective quantizer we need to divide by 8 before sending it to
+  // the modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+
+ model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
+// Fits a curve for rate and distortion using as feature:
+// log2(sse_norm/qstep^2)
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist) {
+ (void)cpi;
+ (void)plane_bsize;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
+ if (sse == 0) {
+ if (rate) *rate = 0;
+ if (dist) *dist = 0;
+ return;
+ }
+ aom_clear_system_state();
+ const double sse_norm = (double)sse / num_samples;
+ const double qstepsqr = (double)qstep * qstep;
+ const double xqr = log(sse_norm / qstepsqr) / log(2.0);
+
+ double rate_f, dist_by_sse_norm_f;
+ av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f);
+
+ const double dist_f = dist_by_sse_norm_f * sse_norm;
+ int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+ int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+ aom_clear_system_state();
+
+ // Check if skip is better
+ if (rate_i == 0) {
+ dist_i = sse << 4;
+ } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+ RDCOST(x->rdmult, 0, sse << 4)) {
+ rate_i = 0;
+ dist_i = sse << 4;
+ }
+
+ if (rate) *rate = rate_i;
+ if (dist) *dist = dist_i;
+}
+
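+// Block-level wrapper around model_rd_with_curvfit(): computes the SSE of
+// each coded plane and accumulates the per-plane rate/distortion estimates.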
+static void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ (void)mi_row;
+ (void)mi_col;
+  // Note that our transform coefficients are scaled by 8 relative to an
+  // orthogonal transform, so the quantizer step is also 8 times larger. To
+  // get the effective quantizer we need to divide by 8 before sending it to
+  // the modeling function.
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ int64_t dist, sse;
+ int rate;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ int bw, bh;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int shift = (xd->bd - 8);
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+ &bw, &bh);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
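+// Like the model-based wrappers above, but runs a full transform-type RD
+// search (select_tx_type_yrd) for the luma plane; chroma planes still use
+// the curve-fit model.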
+static void model_rd_for_sb_with_fullrdy(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ const int ref = xd->mi[0]->ref_frame[0];
+
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
+
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ int64_t sse;
+ int rate;
+ int64_t dist;
+
+ if (x->skip_chroma_rd && plane) continue;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+ RD_STATS rd_stats;
+ if (plane == 0) {
+ select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
+ if (rd_stats.invalid_rate) {
+ rate = 0;
+ dist = sse << 4;
+ } else {
+ rate = rd_stats.rate;
+ dist = rd_stats.dist;
+ }
+ } else {
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
+ }
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
+ }
+
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
+}
+
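+// Searches over the allowed transform types for a single transform block and
+// returns the best RD cost found. The winning type's rate/distortion, eob,
+// entropy context and dequantized coefficients are stored so they can be
+// reused, and intra results may be cached and retrieved via a block hash.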
+static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ const TXB_CTX *const txb_ctx,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ int use_fast_coef_costing, int64_t ref_best_rd,
+ RD_STATS *best_rd_stats) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ int64_t best_rd = INT64_MAX;
+ uint16_t best_eob = 0;
+ TX_TYPE best_tx_type = DCT_DCT;
+ TX_TYPE last_tx_type = TX_TYPES;
+ const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
+  // Buffer used to swap dqcoeff in macroblockd_plane so that the dqcoeff of
+  // the best tx_type can be kept.
+ DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
+ tran_low_t *orig_dqcoeff = pd->dqcoeff;
+ tran_low_t *best_dqcoeff = this_dqcoeff;
+ const int txk_type_idx =
+ av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+ av1_invalid_rd_stats(best_rd_stats);
+
+ TXB_RD_INFO *intra_txb_rd_info = NULL;
+ uint16_t cur_joint_ctx = 0;
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ const int within_border =
+ mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+ if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
+ !is_inter && plane == 0 &&
+ tx_size_wide[tx_size] == tx_size_high[tx_size]) {
+ const uint32_t intra_hash =
+ get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
+ const int intra_hash_idx =
+ find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
+ intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
+
+ cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+ if (intra_txb_rd_info->entropy_context == cur_joint_ctx &&
+ x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
+ mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type;
+ const TX_TYPE ref_tx_type =
+ av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
+ tx_size, cpi->common.reduced_tx_set_used);
+ if (ref_tx_type == intra_txb_rd_info->tx_type) {
+ best_rd_stats->rate = intra_txb_rd_info->rate;
+ best_rd_stats->dist = intra_txb_rd_info->dist;
+ best_rd_stats->sse = intra_txb_rd_info->sse;
+ best_rd_stats->skip = intra_txb_rd_info->eob == 0;
+ x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
+ x->plane[plane].txb_entropy_ctx[block] =
+ intra_txb_rd_info->txb_entropy_ctx;
+ best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist);
+ best_eob = intra_txb_rd_info->eob;
+ best_tx_type = intra_txb_rd_info->tx_type;
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ best_tx_type);
+ goto RECON_INTRA;
+ }
+ }
+ }
+
+ int rate_cost = 0;
+ TX_TYPE txk_start = DCT_DCT;
+ TX_TYPE txk_end = TX_TYPES - 1;
+ if ((!is_inter && x->use_default_intra_tx_type) ||
+ (is_inter && x->use_default_inter_tx_type)) {
+ txk_start = txk_end = get_default_tx_type(0, xd, tx_size);
+ } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
+ if (plane == 0) txk_end = DCT_DCT;
+ }
+
+ uint8_t best_txb_ctx = 0;
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+
+ TX_TYPE uv_tx_type = DCT_DCT;
+ if (plane) {
+ // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
+ uv_tx_type = txk_start = txk_end =
+ av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
+ cm->reduced_tx_set_used);
+ }
+ const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+ ext_tx_used_flag == 0x0001) {
+ txk_start = txk_end = DCT_DCT;
+ }
+ uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip.
+ if (txk_start == txk_end) {
+ allowed_tx_mask = 1 << txk_start;
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else if (fast_tx_search) {
+ allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT
+ allowed_tx_mask &= ext_tx_used_flag;
+ } else {
+ assert(plane == 0);
+ allowed_tx_mask = ext_tx_used_flag;
+ // !fast_tx_search && txk_end != txk_start && plane == 0
+ const int do_prune = cpi->sf.tx_type_search.prune_mode > NO_PRUNE;
+ if (do_prune && is_inter) {
+ if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) {
+ const uint16_t prune =
+ prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+ cpi->sf.tx_type_search.prune_mode);
+ allowed_tx_mask &= (~prune);
+ } else {
+ allowed_tx_mask &= (~x->tx_search_prune[tx_set_type]);
+ }
+ }
+ }
+ // Need to have at least one transform type allowed.
+ if (allowed_tx_mask == 0) {
+ txk_start = txk_end = (plane ? uv_tx_type : DCT_DCT);
+ allowed_tx_mask = (1 << txk_start);
+ }
+
+ int use_transform_domain_distortion =
+ (cpi->sf.use_transform_domain_distortion > 0) &&
+      // Any 64-pt transform only preserves half of the coefficients.
+      // Therefore transform-domain distortion is not valid for these
+      // transform sizes.
+ txsize_sqr_up_map[tx_size] != TX_64X64;
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) use_transform_domain_distortion = 0;
+#endif
+ int calc_pixel_domain_distortion_final =
+ cpi->sf.use_transform_domain_distortion == 1 &&
+ use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
+ !x->cb_partition_scan;
+ if (calc_pixel_domain_distortion_final &&
+ (txk_start == txk_end || allowed_tx_mask == 0x0001))
+ calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+ const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ int64_t block_sse =
+ pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+ block_sse *= 16;
+
+ for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
+ if (!(allowed_tx_mask & (1 << tx_type))) continue;
+ if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
+ RD_STATS this_rd_stats;
+ av1_invalid_rd_stats(&this_rd_stats);
+
+ if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+ av1_xform_quant(
+ cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+ rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
+ txb_ctx, use_fast_coef_costing);
+ } else {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, tx_type, AV1_XFORM_QUANT_FP);
+ if (cpi->sf.optimize_b_precheck && best_rd < INT64_MAX &&
+ eobs_ptr[block] >= 4) {
+ // Calculate distortion quickly in transform domain.
+ dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+
+ const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
+ const int64_t dist_cost_estimate =
+ RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
+ if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
+
+ rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
+ txb_ctx, use_fast_coef_costing);
+ const int64_t rd_estimate =
+ AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist),
+ RDCOST(x->rdmult, 0, this_rd_stats.sse));
+ if (rd_estimate - (rd_estimate >> 3) > best_rd_) continue;
+ }
+ av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
+ &rate_cost);
+ }
+ if (eobs_ptr[block] == 0) {
+ // When eob is 0, pixel domain distortion is more efficient and accurate.
+ this_rd_stats.dist = this_rd_stats.sse = block_sse;
+ } else if (use_transform_domain_distortion) {
+ dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+ } else {
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ this_rd_stats.sse = block_sse;
+ }
+
+ this_rd_stats.rate = rate_cost;
+
+ const int64_t rd =
+ RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+
+ if (rd < best_rd) {
+ best_rd = rd;
+ *best_rd_stats = this_rd_stats;
+ best_tx_type = tx_type;
+ best_txb_ctx = x->plane[plane].txb_entropy_ctx[block];
+ best_eob = x->plane[plane].eobs[block];
+ last_tx_type = best_tx_type;
+
+      // Swap dqcoeff buffers to keep the dqcoeff of the best tx_type.
+ tran_low_t *const tmp_dqcoeff = best_dqcoeff;
+ best_dqcoeff = pd->dqcoeff;
+ pd->dqcoeff = tmp_dqcoeff;
+ }
+
+#if CONFIG_COLLECT_RD_STATS == 1
+ if (plane == 0) {
+ PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
+ plane_bsize, tx_size, tx_type, rd);
+ }
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+ if (cpi->sf.adaptive_txb_search_level) {
+ if ((best_rd - (best_rd >> cpi->sf.adaptive_txb_search_level)) >
+ ref_best_rd) {
+ break;
+ }
+ }
+
+    // Stop the transform type search when the block has been quantized to
+    // all zeros and, at the same time, it has a better rdcost than coding
+    // the transform.
+ if (cpi->sf.tx_type_search.skip_tx_search && !best_eob) break;
+ }
+
+ assert(best_rd != INT64_MAX);
+
+ best_rd_stats->skip = best_eob == 0;
+ if (plane == 0) {
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ best_tx_type);
+ }
+ x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
+ x->plane[plane].eobs[block] = best_eob;
+
+ pd->dqcoeff = best_dqcoeff;
+
+ if (calc_pixel_domain_distortion_final && best_eob) {
+ best_rd_stats->dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ best_rd_stats->sse = block_sse;
+ }
+
+ if (intra_txb_rd_info != NULL) {
+ intra_txb_rd_info->valid = 1;
+ intra_txb_rd_info->entropy_context = cur_joint_ctx;
+ intra_txb_rd_info->rate = best_rd_stats->rate;
+ intra_txb_rd_info->dist = best_rd_stats->dist;
+ intra_txb_rd_info->sse = best_rd_stats->sse;
+ intra_txb_rd_info->eob = best_eob;
+ intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
+ if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
+ }
+
+RECON_INTRA:
+ if (!is_inter && best_eob &&
+ (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] ||
+ blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) {
+    // Intra mode needs the decoded result so that the next transform block
+    // can use it for prediction.
+    // If the last searched tx_type is also the best tx_type, we don't need
+    // to redo the transform.
+ if (best_tx_type != last_tx_type) {
+ if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+ av1_xform_quant(
+ cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ best_tx_type,
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+ } else {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, best_tx_type, AV1_XFORM_QUANT_FP);
+ av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1,
+ &rate_cost);
+ }
+ }
+
+ inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+ x->plane[plane].eobs[block],
+ cm->reduced_tx_set_used);
+
+    // This may happen because of a hash collision: the eob stored in the hash
+    // table is non-zero, but the real eob is zero. We need to make sure
+    // tx_type is DCT_DCT in this case.
+ if (plane == 0 && x->plane[plane].eobs[block] == 0 &&
+ best_tx_type != DCT_DCT) {
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ }
+ }
+ pd->dqcoeff = orig_dqcoeff;
+
+ return best_rd;
+}
+
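+// Callback invoked for each transform block of a plane: computes the block's
+// RD cost via search_txk_type() and accumulates it into rdcost_block_args,
+// setting exit_early once the running cost exceeds the best RD so far.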
+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const AV1_COMP *cpi = args->cpi;
+ ENTROPY_CONTEXT *a = args->t_above + blk_col;
+ ENTROPY_CONTEXT *l = args->t_left + blk_row;
+ const AV1_COMMON *cm = &cpi->common;
+ int64_t rd1, rd2, rd;
+ RD_STATS this_rd_stats;
+
+ av1_init_rd_stats(&this_rd_stats);
+
+ if (args->exit_early) {
+ args->incomplete_exit = 1;
+ return;
+ }
+
+ if (!is_inter_block(mbmi)) {
+ av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+ }
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
+ args->best_rd - args->this_rd, &this_rd_stats);
+
+ if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+ assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
+ cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+ }
+
+#if CONFIG_RD_DEBUG
+ av1_update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col,
+ this_rd_stats.rate);
+#endif // CONFIG_RD_DEBUG
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ const int blk_idx =
+ blk_row * (block_size_wide[plane_bsize] >> tx_size_wide_log2[0]) +
+ blk_col;
+
+ if (plane == 0)
+ set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0);
+ else
+ set_blk_skip(x, plane, blk_idx, 0);
+
+ rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+
+ // TODO(jingning): temporarily enabled only for luma component
+ rd = AOMMIN(rd1, rd2);
+
+ this_rd_stats.skip &= !x->plane[plane].eobs[block];
+
+ av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
+
+ args->this_rd += rd;
+
+ if (args->this_rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+}
+
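+// Accumulates RD stats over all transform blocks of the given plane for a
+// fixed transform size by walking them with block_rd_txfm().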
+static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+ RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int use_fast_coef_costing,
+ FAST_TX_SEARCH_MODE ftxs_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct rdcost_block_args args;
+ av1_zero(args);
+ args.x = x;
+ args.cpi = cpi;
+ args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_costing;
+ args.ftxs_mode = ftxs_mode;
+ av1_init_rd_stats(&args.rd_stats);
+
+ if (plane == 0) xd->mi[0]->tx_size = tx_size;
+
+ av1_get_entropy_contexts(bsize, pd, args.t_above, args.t_left);
+
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
+ &args);
+
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ const int invalid_rd = is_inter ? args.incomplete_exit : args.exit_early;
+
+ if (invalid_rd) {
+ av1_invalid_rd_stats(rd_stats);
+ } else {
+ *rd_stats = args.rd_stats;
+ }
+}
+
+static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) {
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const int tx_size_ctx = get_tx_size_context(xd);
+ int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+ return r_tx_size;
+ } else {
+ return 0;
+ }
+}
+
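+// Returns the RD cost of coding the luma plane with the given transform size,
+// including the skip flag cost and, when the transform size is signaled, the
+// tx_size (or txfm partition) rate.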
+static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
+ TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int64_t rd = INT64_MAX;
+ const int skip_ctx = av1_get_skip_context(xd);
+ int s0, s1;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_select =
+ cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type);
+ int ctx = txfm_partition_context(
+ xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
+ const int r_tx_size = is_inter ? x->txfm_partition_cost[ctx][0]
+ : tx_size_cost(cm, x, bs, tx_size);
+
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+
+ s0 = x->skip_cost[skip_ctx][0];
+ s1 = x->skip_cost[skip_ctx][1];
+
+ mbmi->tx_size = tx_size;
+ txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs, tx_size,
+ cpi->sf.use_fast_coef_costing, ftxs_mode);
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ if (rd_stats->skip) {
+ if (is_inter) {
+ rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ } else {
+ rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse);
+ }
+ } else {
+ rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select,
+ rd_stats->dist);
+ }
+
+ if (tx_select) rd_stats->rate += r_tx_size;
+
+ if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip))
+ rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
+
+ return rd;
+}
+
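+// Fast luma RD estimate for a block: uses the largest rectangular transform
+// size together with the low-complexity transform RD model.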
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+ MACROBLOCK *x, int *r, int64_t *d, int *s,
+ int64_t *sse, int64_t ref_best_rd) {
+ RD_STATS rd_stats;
+ av1_subtract_plane(x, bs, 0);
+ x->rd_model = LOW_TXFM_RD;
+ int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs,
+ max_txsize_rect_lookup[bs], FTXS_NONE);
+ x->rd_model = FULL_TXFM_RD;
+ *r = rd_stats.rate;
+ *d = rd_stats.dist;
+ *s = rd_stats.skip;
+ *sse = rd_stats.sse;
+ return rd;
+}
+
+static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(mbmi->tx_size, is_inter, cm->reduced_tx_set_used);
+ prune_tx(cpi, bs, x, xd, tx_set_type);
+ txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs,
+ mbmi->tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ // Reset the pruning flags.
+ av1_zero(x->tx_search_prune);
+ x->tx_split_prune_flag = 0;
+}
+
+static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ mbmi->tx_size = TX_4X4;
+ txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, bs, mbmi->tx_size,
+ cpi->sf.use_fast_coef_costing, FTXS_NONE);
+}
+
+static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
+ int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * tx_size_wide_log2[0]);
+ return num_blk;
+}
+
+static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+ const SPEED_FEATURES *sf) {
+ if (sf->tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+ if (sf->tx_size_search_lgr_block) {
+ if (mi_width > mi_size_wide[BLOCK_64X64] ||
+ mi_height > mi_size_high[BLOCK_64X64])
+ return MAX_VARTX_DEPTH;
+ }
+
+ if (is_inter) {
+ return (mi_height != mi_width) ? sf->inter_tx_size_search_init_depth_rect
+ : sf->inter_tx_size_search_init_depth_sqr;
+ } else {
+ return (mi_height != mi_width) ? sf->intra_tx_size_search_init_depth_rect
+ : sf->intra_tx_size_search_init_depth_sqr;
+ }
+}
+
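+// Searches candidate transform sizes, starting from the largest rectangular
+// size when TX_MODE_SELECT is active, and keeps the one with the lowest luma
+// RD cost along with its transform types and block-skip flags.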
+static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x, RD_STATS *rd_stats,
+ int64_t ref_best_rd, BLOCK_SIZE bs) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int64_t rd = INT64_MAX;
+ int n;
+ int start_tx;
+ int depth;
+ int64_t best_rd = INT64_MAX;
+ const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
+ TX_SIZE best_tx_size = max_rect_tx_size;
+ TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int n4 = bsize_to_num_blk(bs);
+ const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+
+ av1_invalid_rd_stats(rd_stats);
+
+ if (tx_select) {
+ start_tx = max_rect_tx_size;
+ depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+ is_inter_block(mbmi), &cpi->sf);
+ } else {
+ const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
+ start_tx = chosen_tx_size;
+ depth = MAX_TX_DEPTH;
+ }
+
+ prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
+
+ for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) {
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue;
+ }
+#endif
+ RD_STATS this_rd_stats;
+ if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
+ rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE);
+ x->rd_model = FULL_TXFM_RD;
+
+ if (rd < best_rd) {
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+ memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
+ best_tx_size = n;
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ }
+ if (n == TX_4X4) break;
+ }
+
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+ memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+ }
+
+ // Reset the pruning flags.
+ av1_zero(x->tx_search_prune);
+ x->tx_split_prune_flag = 0;
+}
+
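+// Dispatches the luma transform size decision: 4x4 for lossless blocks,
+// largest-only when USE_LARGESTALL is selected, and a full RD-based search
+// otherwise.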
+static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bs,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ av1_init_rd_stats(rd_stats);
+
+ assert(bs == xd->mi[0]->sb_type);
+
+ if (xd->lossless[xd->mi[0]->segment_id]) {
+ choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+ } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+ choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+ } else {
+ choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+ }
+}
+
+// Return the rate cost for luma prediction mode info. of intra blocks.
+static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+ int mode_cost) {
+ int total_rate = mode_cost;
+ const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0;
+ const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+ const int use_intrabc = mbmi->use_intrabc;
+ // Can only activate one mode.
+ assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+ use_filter_intra) <= 1);
+ const int try_palette =
+ av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+ if (try_palette && mbmi->mode == DC_PRED) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+ const int mode_ctx = av1_get_palette_mode_ctx(xd);
+ total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+ if (use_palette) {
+ const uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ const int plt_size = mbmi->palette_mode_info.palette_size[0];
+ int palette_mode_cost =
+ x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+ write_uniform_cost(plt_size, color_map[0]);
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ palette_mode_cost +=
+ av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
+ n_cache, cpi->common.seq_params.bit_depth);
+ palette_mode_cost +=
+ av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+ total_rate += palette_mode_cost;
+ }
+ }
+ if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+ total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra];
+ if (use_filter_intra) {
+ total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ }
+ }
+ if (av1_is_directional_mode(mbmi->mode)) {
+ if (av1_use_angle_delta(bsize)) {
+ total_rate += x->angle_delta_cost[mbmi->mode - V_PRED]
+ [MAX_ANGLE_DELTA +
+ mbmi->angle_delta[PLANE_TYPE_Y]];
+ }
+ }
+ if (av1_allow_intrabc(&cpi->common))
+ total_rate += x->intrabc_cost[use_intrabc];
+ return total_rate;
+}
+
+// Return the rate cost for chroma prediction mode info. of intra blocks.
+static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+ int mode_cost) {
+ int total_rate = mode_cost;
+ const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0;
+ const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+ // Can only activate one mode.
+ assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+ const int try_palette =
+ av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+ if (try_palette && mode == UV_DC_PRED) {
+ const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+ total_rate +=
+ x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+ if (use_palette) {
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+ const int plt_size = pmi->palette_size[1];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const uint8_t *const color_map = xd->plane[1].color_index_map;
+ int palette_mode_cost =
+ x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+ write_uniform_cost(plt_size, color_map[0]);
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ palette_mode_cost += av1_palette_color_cost_uv(
+ pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+ palette_mode_cost +=
+ av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+ total_rate += palette_mode_cost;
+ }
+ }
+ if (av1_is_directional_mode(get_uv_mode(mode))) {
+ if (av1_use_angle_delta(bsize)) {
+ total_rate +=
+ x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] +
+ MAX_ANGLE_DELTA];
+ }
+ }
+ return total_rate;
+}
+
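+// Pruning helper: returns 1 if the given diagonal intra mode can be skipped
+// because neither of its neighboring primary directions is the best intra
+// mode found so far.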
+static int conditional_skipintra(PREDICTION_MODE mode,
+ PREDICTION_MODE best_intra_mode) {
+ if (mode == D113_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ if (mode == D67_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D203_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D157_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ return 0;
+}
+
+// Model based RD estimation for luma intra blocks.
+static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mode_cost, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ RD_STATS this_rd_stats;
+ int row, col;
+ int64_t temp_sse, this_rd;
+ TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ mbmi->tx_size = tx_size;
+ // Prediction.
+ for (row = 0; row < max_blocks_high; row += stepr) {
+ for (col = 0; col < max_blocks_wide; col += stepc) {
+ av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size);
+ }
+ }
+ // RD estimation.
+ model_rd_sb_fn[MODELRD_TYPE_INTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &this_rd_stats.rate,
+ &this_rd_stats.dist, &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
+ if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+ mode_cost +=
+ x->angle_delta_cost[mbmi->mode - V_PRED]
+ [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]];
+ }
+ if (mbmi->mode == DC_PRED &&
+ av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
+ if (mbmi->filter_intra_mode_info.use_filter_intra) {
+ const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+ mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
+ x->filter_intra_mode_cost[mode];
+ } else {
+ mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
+ }
+ }
+ this_rd =
+ RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
+ return this_rd;
+}
+
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height'. Extra rows and columns are filled in by copying last valid
+// row/column.
+static void extend_palette_color_map(uint8_t *const color_map, int orig_width,
+ int orig_height, int new_width,
+ int new_height) {
+ int j;
+ assert(new_width >= orig_width);
+ assert(new_height >= orig_height);
+ if (new_width == orig_width && new_height == orig_height) return;
+
+ for (j = orig_height - 1; j >= 0; --j) {
+ memmove(color_map + j * new_width, color_map + j * orig_width, orig_width);
+ // Copy last column to extra columns.
+ memset(color_map + j * new_width + orig_width,
+ color_map[j * new_width + orig_width - 1], new_width - orig_width);
+ }
+ // Copy last row to extra rows.
+ for (j = orig_height; j < new_height; ++j) {
+ memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width,
+ new_width);
+ }
+}
+
+// Bias toward using colors in the cache.
+// TODO(huisu): Try other schemes to improve compression.
+static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
+ int n_colors, int stride, int *centroids) {
+ if (n_cache <= 0) return;
+ for (int i = 0; i < n_colors * stride; i += stride) {
+ int min_diff = abs(centroids[i] - (int)color_cache[0]);
+ int idx = 0;
+ for (int j = 1; j < n_cache; ++j) {
+ const int this_diff = abs(centroids[i] - color_cache[j]);
+ if (this_diff < min_diff) {
+ min_diff = this_diff;
+ idx = j;
+ }
+ }
+ if (min_diff <= 1) centroids[i] = color_cache[idx];
+ }
+}
+
+// Given the base colors as specified in centroids[], calculate the RD cost
+// of palette mode.
+static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x,
+ MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int dc_mode_cost, const int *data,
+ int *centroids, int n, uint16_t *color_cache,
+ int n_cache, MB_MODE_INFO *best_mbmi,
+ uint8_t *best_palette_color_map, int64_t *best_rd,
+ int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+ int *rate_overhead, int64_t *distortion,
+ int *skippable, PICK_MODE_CONTEXT *ctx,
+ uint8_t *blk_skip) {
+ optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
+ int k = av1_remove_duplicates(centroids, n);
+ if (k < PALETTE_MIN_SIZE) {
+    // Too few unique colors to create a palette; DC_PRED will work well for
+    // that case anyway, so skip.
+ return;
+ }
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ if (cpi->common.seq_params.use_highbitdepth)
+ for (int i = 0; i < k; ++i)
+ pmi->palette_colors[i] = clip_pixel_highbd(
+ (int)centroids[i], cpi->common.seq_params.bit_depth);
+ else
+ for (int i = 0; i < k; ++i)
+ pmi->palette_colors[i] = clip_pixel(centroids[i]);
+ pmi->palette_size[0] = k;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+ av1_calc_indices(data, centroids, color_map, rows * cols, k, 1);
+ extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+ const int palette_mode_cost =
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
+ int64_t this_model_rd =
+ intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col);
+ if (*best_model_rd != INT64_MAX &&
+ this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+ return;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ RD_STATS tokenonly_rd_stats;
+ super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) return;
+ int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
+ int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+ tokenonly_rd_stats.rate -=
+ tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
+ }
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ memcpy(best_palette_color_map, color_map,
+ block_width * block_height * sizeof(color_map[0]));
+ *best_mbmi = *mbmi;
+ memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ *rate_overhead = this_rate - tokenonly_rd_stats.rate;
+ if (rate) *rate = this_rate;
+ if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+ if (distortion) *distortion = tokenonly_rd_stats.dist;
+ if (skippable) *skippable = tokenonly_rd_stats.skip;
+ }
+}
+
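+// Searches palette sizes and color sets for the luma plane, trying both the
+// dominant colors and k-means clustering results, and updates the best mode
+// if a palette beats the current best RD cost. Returns the rate overhead of
+// the chosen palette, or 0 if no palette was chosen.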
+static int rd_pick_palette_intra_sby(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi,
+ uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+ int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+ PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) {
+ int rate_overhead = 0;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize));
+ const SequenceHeader *const seq_params = &cpi->common.seq_params;
+ int colors, n;
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *const src = x->plane[0].src.buf;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ int block_width, block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+ &cols);
+
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ if (seq_params->use_highbitdepth)
+ colors = av1_count_colors_highbd(src, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf);
+ else
+ colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ if (colors > 1 && colors <= 64) {
+ int r, c, i;
+ const int max_itr = 50;
+ int *const data = x->palette_buffer->kmeans_data_buf;
+ int centroids[PALETTE_MAX_SIZE];
+ int lb, ub, val;
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ if (seq_params->use_highbitdepth)
+ lb = ub = src16[0];
+ else
+ lb = ub = src[0];
+
+ if (seq_params->use_highbitdepth) {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ val = src16[r * src_stride + c];
+ data[r * cols + c] = val;
+ if (val < lb)
+ lb = val;
+ else if (val > ub)
+ ub = val;
+ }
+ }
+ } else {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ val = src[r * src_stride + c];
+ data[r * cols + c] = val;
+ if (val < lb)
+ lb = val;
+ else if (val > ub)
+ ub = val;
+ }
+ }
+ }
+
+ mbmi->mode = DC_PRED;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+ // Find the dominant colors, stored in top_colors[].
+ int top_colors[PALETTE_MAX_SIZE] = { 0 };
+ for (i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) {
+ int max_count = 0;
+ for (int j = 0; j < (1 << seq_params->bit_depth); ++j) {
+ if (count_buf[j] > max_count) {
+ max_count = count_buf[j];
+ top_colors[i] = j;
+ }
+ }
+ assert(max_count > 0);
+ count_buf[top_colors[i]] = 0;
+ }
+
+ // Try the dominant colors directly.
+ // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+ // where the dominant colors and the k-means results are similar.
+ for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
+ for (i = 0; i < n; ++i) centroids[i] = top_colors[i];
+ palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+ centroids, n, color_cache, n_cache, best_mbmi,
+ best_palette_color_map, best_rd, best_model_rd, rate,
+ rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+ best_blk_skip);
+ }
+
+ // K-means clustering.
+ for (n = AOMMIN(colors, PALETTE_MAX_SIZE); n >= 2; --n) {
+ if (colors == PALETTE_MIN_SIZE) {
+ // Special case: These colors automatically become the centroids.
+ assert(colors == n);
+ assert(colors == 2);
+ centroids[0] = lb;
+ centroids[1] = ub;
+ } else {
+ for (i = 0; i < n; ++i) {
+ centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
+ }
+ palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+ centroids, n, color_cache, n_cache, best_mbmi,
+ best_palette_color_map, best_rd, best_model_rd, rate,
+ rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+ best_blk_skip);
+ }
+ }
+
+ if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ block_width * block_height * sizeof(best_palette_color_map[0]));
+ }
+ *mbmi = *best_mbmi;
+ return rate_overhead;
+}
+
+// Return 1 if a filter intra mode is selected; return 0 otherwise.
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize,
+ int mode_cost, int64_t *best_rd,
+ int64_t *best_model_rd,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int filter_intra_selected_flag = 0;
+ FILTER_INTRA_MODE mode;
+ TX_SIZE best_tx_size = TX_8X8;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+ (void)ctx;
+ av1_zero(filter_intra_mode_info);
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+
+ for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+ int64_t this_rd, this_model_rd;
+ RD_STATS tokenonly_rd_stats;
+ mbmi->filter_intra_mode_info.filter_intra_mode = mode;
+ this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
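+    // Prune this filter-intra mode when its model-based RD estimate is more
+    // than 50% worse than the best model RD seen so far.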
+ if (*best_model_rd != INT64_MAX &&
+ this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+ continue;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ const int this_rate =
+ tokenonly_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ best_tx_size = mbmi->tx_size;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip;
+ filter_intra_selected_flag = 1;
+ }
+ }
+
+ if (filter_intra_selected_flag) {
+ mbmi->mode = DC_PRED;
+ mbmi->tx_size = best_tx_size;
+ mbmi->filter_intra_mode_info = filter_intra_mode_info;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+// Run RD calculation with the given luma intra prediction angle and return
+// the RD cost. Update the best mode info if the RD cost is the best so far.
+static int64_t calc_rd_given_intra_angle(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta,
+ int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta,
+ TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd,
+ TX_TYPE *best_txk_type, uint8_t *best_blk_skip) {
+ RD_STATS tokenonly_rd_stats;
+ int64_t this_rd, this_model_rd;
+ MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+ const int n4 = bsize_to_num_blk(bsize);
+ assert(!is_inter_block(mbmi));
+ mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
+ this_model_rd = intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
+ if (*best_model_rd != INT64_MAX &&
+ this_model_rd > *best_model_rd + (*best_model_rd >> 1))
+ return INT64_MAX;
+ if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+ super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
+ if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
+
+ int this_rate =
+ mode_cost + tokenonly_rd_stats.rate +
+ x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < *best_rd) {
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
+ *best_rd = this_rd;
+ *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
+ *best_tx_size = mbmi->tx_size;
+ *rate = this_rate;
+ rd_stats->rate = tokenonly_rd_stats.rate;
+ rd_stats->dist = tokenonly_rd_stats.dist;
+ rd_stats->skip = tokenonly_rd_stats.skip;
+ }
+ return this_rd;
+}
+
+// Given a luma directional intra prediction mode, pick the best angle delta.
+// Return the RD cost corresponding to the best angle delta.
+static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col, int *rate,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int mode_cost, int64_t best_rd,
+ int64_t *best_model_rd) {
+ MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+ assert(!is_inter_block(mbmi));
+
+ int best_angle_delta = 0;
+ int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+ TX_SIZE best_tx_size = mbmi->tx_size;
+ TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+ for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
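+  // Coarse-to-fine search: first evaluate the even angle deltas (0, +/-2,
+  // ...), then try each odd delta only if at least one of its even neighbors
+  // came within best_rd + (best_rd >> 5) of the current best.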
+ int first_try = 1;
+ for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (int i = 0; i < 2; ++i) {
+ const int64_t best_rd_in =
+ (best_rd == INT64_MAX) ? INT64_MAX
+ : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+ const int64_t this_rd = calc_rd_given_intra_angle(
+ cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in,
+ (1 - 2 * i) * angle_delta, MAX_ANGLE_DELTA, rate, rd_stats,
+ &best_angle_delta, &best_tx_size, &best_rd, best_model_rd,
+ best_txk_type, best_blk_skip);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (first_try && this_rd == INT64_MAX) return best_rd;
+ first_try = 0;
+ if (angle_delta == 0) {
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+ }
+
+ assert(best_rd != INT64_MAX);
+ for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (int i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ const int64_t rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost,
+ best_rd, (1 - 2 * i) * angle_delta,
+ MAX_ANGLE_DELTA, rate, rd_stats,
+ &best_angle_delta, &best_tx_size, &best_rd,
+ best_model_rd, best_txk_type, best_blk_skip);
+ }
+ }
+ }
+
+ if (rd_stats->rate != INT_MAX) {
+ mbmi->tx_size = best_tx_size;
+ mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+ }
+ return best_rd;
+}
+
+// Indices are the sign, integer part, and fractional part (in 1/16 units) of
+// the gradient value |dx| / |dy|.
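+// For example, dx = 3, dy = 8 with matching signs gives sn = 0, quot = 0 and
+// remd = (3 * 16) / 8 = 6, which maps to gradient_to_angle_bin[0][0][6] = 7.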
+static const uint8_t gradient_to_angle_bin[2][7][16] = {
+ {
+ { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+ {
+ { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
+ { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+};
+
+/* clang-format off */
+static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
+ 0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
+ 0,
+};
+/* clang-format on */
+
+static void angle_estimation(const uint8_t *src, int src_stride, int rows,
+ int cols, BLOCK_SIZE bsize,
+ uint8_t *directional_mode_skip_mask) {
+ memset(directional_mode_skip_mask, 0,
+ INTRA_MODES * sizeof(*directional_mode_skip_mask));
+ // Check if angle_delta is used
+ if (!av1_use_angle_delta(bsize)) return;
+ uint64_t hist[DIRECTIONAL_MODES];
+ memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
+ src += src_stride;
+ int r, c, dx, dy;
+ for (r = 1; r < rows; ++r) {
+ for (c = 1; c < cols; ++c) {
+ dx = src[c] - src[c - 1];
+ dy = src[c] - src[c - src_stride];
+ int index;
+ const int temp = dx * dx + dy * dy;
+ if (dy == 0) {
+ index = 2;
+ } else {
+ const int sn = (dx > 0) ^ (dy > 0);
+ dx = abs(dx);
+ dy = abs(dy);
+ const int remd = (dx % dy) * 16 / dy;
+ const int quot = dx / dy;
+ index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
+ }
+ hist[index] += temp;
+ }
+ src += src_stride;
+ }
+
+ int i;
+ uint64_t hist_sum = 0;
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
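+  // Skip a directional mode when its weighted histogram score (2x its own
+  // bin plus its neighboring bins) falls below
+  // hist_sum * weight / ANGLE_SKIP_THRESH.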
+ for (i = 0; i < INTRA_MODES; ++i) {
+ if (av1_is_directional_mode(i)) {
+ const uint8_t angle_bin = mode_to_angle_bin[i];
+ uint64_t score = 2 * hist[angle_bin];
+ int weight = 2;
+ if (angle_bin > 0) {
+ score += hist[angle_bin - 1];
+ ++weight;
+ }
+ if (angle_bin < DIRECTIONAL_MODES - 1) {
+ score += hist[angle_bin + 1];
+ ++weight;
+ }
+ if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
+ directional_mode_skip_mask[i] = 1;
+ }
+ }
+}
+
+static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
+ int rows, int cols, BLOCK_SIZE bsize,
+ uint8_t *directional_mode_skip_mask) {
+ memset(directional_mode_skip_mask, 0,
+ INTRA_MODES * sizeof(*directional_mode_skip_mask));
+ // Check if angle_delta is used
+ if (!av1_use_angle_delta(bsize)) return;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint64_t hist[DIRECTIONAL_MODES];
+ memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
+ src += src_stride;
+ int r, c, dx, dy;
+ for (r = 1; r < rows; ++r) {
+ for (c = 1; c < cols; ++c) {
+ dx = src[c] - src[c - 1];
+ dy = src[c] - src[c - src_stride];
+ int index;
+ const int temp = dx * dx + dy * dy;
+ if (dy == 0) {
+ index = 2;
+ } else {
+ const int sn = (dx > 0) ^ (dy > 0);
+ dx = abs(dx);
+ dy = abs(dy);
+ const int remd = (dx % dy) * 16 / dy;
+ const int quot = dx / dy;
+ index = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
+ }
+ hist[index] += temp;
+ }
+ src += src_stride;
+ }
+
+ int i;
+ uint64_t hist_sum = 0;
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
+ for (i = 0; i < INTRA_MODES; ++i) {
+ if (av1_is_directional_mode(i)) {
+ const uint8_t angle_bin = mode_to_angle_bin[i];
+ uint64_t score = 2 * hist[angle_bin];
+ int weight = 2;
+ if (angle_bin > 0) {
+ score += hist[angle_bin - 1];
+ ++weight;
+ }
+ if (angle_bin < DIRECTIONAL_MODES - 1) {
+ score += hist[angle_bin + 1];
+ ++weight;
+ }
+ if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
+ directional_mode_skip_mask[i] = 1;
+ }
+ }
+}
+
+// Given selected prediction mode, search for the best tx type and size.
+static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, const int *bmode_costs,
+ int64_t *best_rd, int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ MB_MODE_INFO *best_mbmi, PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ RD_STATS rd_stats;
+ super_block_yrd(cpi, x, &rd_stats, bsize, *best_rd);
+ if (rd_stats.rate == INT_MAX) return;
+ int this_rate_tokenonly = rd_stats.rate;
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -= tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
+ }
+ const int this_rate =
+ rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+ const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_mbmi = *mbmi;
+ *best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = rd_stats.dist;
+ *skippable = rd_stats.skip;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+}
+
+// This function is used only for intra_only frames
+static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize,
+ int64_t best_rd, PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int64_t best_model_rd = INT64_MAX;
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+ int is_directional_mode;
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *src = x->plane[0].src.buf;
+ int beat_best_rd = 0;
+ const int *bmode_costs;
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+ uint8_t *best_palette_color_map =
+ try_palette ? x->palette_buffer->best_palette_color_map : NULL;
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[A];
+ const int left_ctx = intra_mode_context[L];
+ bmode_costs = x->y_mode_costs[above_ctx][left_ctx];
+
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ highbd_angle_estimation(src, src_stride, rows, cols, bsize,
+ directional_mode_skip_mask);
+ else
+ angle_estimation(src, src_stride, rows, cols, bsize,
+ directional_mode_skip_mask);
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ pmi->palette_size[0] = 0;
+
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+
+ MB_MODE_INFO best_mbmi = *mbmi;
+ /* Y Search for intra prediction mode */
+ for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
+ RD_STATS this_rd_stats;
+ int this_rate, this_rate_tokenonly, s;
+ int64_t this_distortion, this_rd, this_model_rd;
+ mbmi->mode = intra_rd_search_mode_order[mode_idx];
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ this_model_rd =
+ intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col);
+ if (best_model_rd != INT64_MAX &&
+ this_model_rd > best_model_rd + (best_model_rd >> 1))
+ continue;
+ if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
+ is_directional_mode = av1_is_directional_mode(mbmi->mode);
+ if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
+ if (is_directional_mode && av1_use_angle_delta(bsize)) {
+ this_rd_stats.rate = INT_MAX;
+ rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate,
+ &this_rd_stats, bsize, bmode_costs[mbmi->mode],
+ best_rd, &best_model_rd);
+ } else {
+ super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+ }
+ this_rate_tokenonly = this_rd_stats.rate;
+ this_distortion = this_rd_stats.dist;
+ s = this_rd_stats.skip;
+
+ if (this_rate_tokenonly == INT_MAX) continue;
+
+ if (!xd->lossless[mbmi->segment_id] &&
+ block_signals_txsize(mbmi->sb_type)) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -=
+ tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
+ }
+ this_rate =
+ this_rd_stats.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+ this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+ beat_best_rd = 1;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+
+ if (try_palette) {
+ rd_pick_palette_intra_sby(
+ cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi,
+ best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly,
+ distortion, skippable, ctx, ctx->blk_skip);
+ }
+
+ if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+ if (rd_pick_filter_intra_sby(
+ cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable,
+ bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) {
+ best_mbmi = *mbmi;
+ }
+ }
+
+ // If previous searches use only the default tx type, do an extra search for
+ // the best tx type.
+ if (x->use_default_intra_tx_type) {
+ *mbmi = best_mbmi;
+ x->use_default_intra_tx_type = 0;
+ intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly,
+ distortion, skippable, &best_mbmi, ctx);
+ }
+
+ *mbmi = best_mbmi;
+ return best_rd;
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+// 1: rd cost values are valid.
+static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
+ const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ int plane;
+ int is_cost_valid = 1;
+ av1_init_rd_stats(rd_stats);
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ if (x->skip_chroma_rd) return is_cost_valid;
+
+ bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ if (is_inter_block(mbmi) && is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+ av1_subtract_plane(x, bsize, plane);
+ }
+
+ if (is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ RD_STATS pn_rd_stats;
+ txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize,
+ uv_tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ if (pn_rd_stats.rate == INT_MAX) {
+ is_cost_valid = 0;
+ break;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd &&
+ RDCOST(x->rdmult, 0, rd_stats->sse) > ref_best_rd) {
+ is_cost_valid = 0;
+ break;
+ }
+ }
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+
+ return is_cost_valid;
+}
+
+static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+ int blk_row, int blk_col, int plane, int block,
+ int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
+ FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
+ TXB_RD_INFO *rd_info_array) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const uint16_t cur_joint_ctx =
+ (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+ const int txk_type_idx =
+ av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+  // Look up the cached RD result and terminate early when we have already
+  // processed exactly the same residual with the same entropy context.
+ if (rd_info_array != NULL && rd_info_array->valid &&
+ rd_info_array->entropy_context == cur_joint_ctx) {
+ if (plane == 0)
+ x->e_mbd.mi[0]->txk_type[txk_type_idx] = rd_info_array->tx_type;
+ const TX_TYPE ref_tx_type =
+ av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
+ tx_size, cpi->common.reduced_tx_set_used);
+ if (ref_tx_type == rd_info_array->tx_type) {
+ rd_stats->rate += rd_info_array->rate;
+ rd_stats->dist += rd_info_array->dist;
+ rd_stats->sse += rd_info_array->sse;
+ rd_stats->skip &= rd_info_array->eob == 0;
+ p->eobs[block] = rd_info_array->eob;
+ p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
+ return;
+ }
+ }
+
+ RD_STATS this_rd_stats;
+ search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+
+ av1_merge_rd_stats(rd_stats, &this_rd_stats);
+
+ // Save RD results for possible reuse in future.
+ if (rd_info_array != NULL) {
+ rd_info_array->valid = 1;
+ rd_info_array->entropy_context = cur_joint_ctx;
+ rd_info_array->rate = this_rd_stats.rate;
+ rd_info_array->dist = this_rd_stats.dist;
+ rd_info_array->sse = this_rd_stats.sse;
+ rd_info_array->eob = p->eobs[block];
+ rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
+ if (plane == 0) {
+ rd_info_array->tx_type = x->e_mbd.mi[0]->txk_type[txk_type_idx];
+ }
+ }
+}
+
+static void get_mean_and_dev(const int16_t *data, int stride, int bw, int bh,
+ float *mean, float *dev) {
+ int x_sum = 0;
+ uint64_t x2_sum = 0;
+ for (int i = 0; i < bh; ++i) {
+ for (int j = 0; j < bw; ++j) {
+ const int val = data[j];
+ x_sum += val;
+ x2_sum += val * val;
+ }
+ data += stride;
+ }
+
+ const int num = bw * bh;
+ const float e_x = (float)x_sum / num;
+ const float e_x2 = (float)((double)x2_sum / num);
+ const float diff = e_x2 - e_x * e_x;
+ *dev = (diff > 0) ? sqrtf(diff) : 0;
+ *mean = e_x;
+}
+
+static void get_mean_and_dev_float(const float *data, int stride, int bw,
+ int bh, float *mean, float *dev) {
+ float x_sum = 0;
+ float x2_sum = 0;
+ for (int i = 0; i < bh; ++i) {
+ for (int j = 0; j < bw; ++j) {
+ const float val = data[j];
+ x_sum += val;
+ x2_sum += val * val;
+ }
+ data += stride;
+ }
+
+ const int num = bw * bh;
+ const float e_x = x_sum / num;
+ const float e_x2 = x2_sum / num;
+ const float diff = e_x2 - e_x * e_x;
+ *dev = (diff > 0) ? sqrtf(diff) : 0;
+ *mean = e_x;
+}
+
+// Features used by the model to predict tx split: the mean and standard
+// deviation values of the block and sub-blocks.
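+// Level 0 covers the whole block; each later level halves the longer
+// dimension (or both for square blocks). For every level with more than one
+// sub-block, the deviation of the sub-block means and the mean of the
+// sub-block deviations are appended as two extra features.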
+static void get_mean_dev_features(const int16_t *data, int stride, int bw,
+ int bh, int levels, float *feature) {
+ int feature_idx = 0;
+ int width = bw;
+ int height = bh;
+ const int16_t *const data_ptr = &data[0];
+ for (int lv = 0; lv < levels; ++lv) {
+ if (width < 2 || height < 2) break;
+ float mean_buf[16];
+ float dev_buf[16];
+ int blk_idx = 0;
+ for (int row = 0; row < bh; row += height) {
+ for (int col = 0; col < bw; col += width) {
+ float mean, dev;
+ get_mean_and_dev(data_ptr + row * stride + col, stride, width, height,
+ &mean, &dev);
+ feature[feature_idx++] = mean;
+ feature[feature_idx++] = dev;
+ mean_buf[blk_idx] = mean;
+ dev_buf[blk_idx++] = dev;
+ }
+ }
+ if (blk_idx > 1) {
+ float mean, dev;
+ // Deviation of means.
+ get_mean_and_dev_float(mean_buf, 1, 1, blk_idx, &mean, &dev);
+ feature[feature_idx++] = dev;
+ // Mean of deviations.
+ get_mean_and_dev_float(dev_buf, 1, 1, blk_idx, &mean, &dev);
+ feature[feature_idx++] = mean;
+ }
+ // Reduce the block size when proceeding to the next level.
+ if (height == width) {
+ height = height >> 1;
+ width = width >> 1;
+ } else if (height > width) {
+ height = height >> 1;
+ } else {
+ width = width >> 1;
+ }
+ }
+}
+
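+// Maps a small neural-net score, computed from the mean/std-dev features of
+// the residual, to a 0-100 confidence that splitting this TX block is
+// worthwhile (sigmoid, saturated beyond |score| > 8). Returns -1 when no
+// model is available for the given tx_size.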
+static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
+ int blk_col, TX_SIZE tx_size) {
+ const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+ if (!nn_config) return -1;
+
+ const int diff_stride = block_size_wide[bsize];
+ const int16_t *diff =
+ x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ aom_clear_system_state();
+
+ float features[64] = { 0.0f };
+ get_mean_dev_features(diff, diff_stride, bw, bh, 2, features);
+
+ float score = 0.0f;
+ av1_nn_predict(features, nn_config, &score);
+ if (score > 8.0f) return 100;
+ if (score < -8.0f) return 0;
+ score = 1.0f / (1.0f + (float)exp(-score));
+ return (int)(score * 100);
+}
+
+typedef struct {
+ int64_t rd;
+ int txb_entropy_ctx;
+ TX_TYPE tx_type;
+} TxCandidateInfo;
+
+static void try_tx_block_no_split(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+ const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
+ int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+ TxCandidateInfo *no_split) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+
+ no_split->rd = INT64_MAX;
+ no_split->txb_entropy_ctx = 0;
+ no_split->tx_type = TX_TYPES;
+
+ const ENTROPY_CONTEXT *const pta = ta + blk_col;
+ const ENTROPY_CONTEXT *const ptl = tl + blk_row;
+
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+
+ rd_stats->ref_rdcost = ref_best_rd;
+ rd_stats->zero_rate = zero_blk_rate;
+ const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
+ mbmi->inter_tx_size[index] = tx_size;
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, ref_best_rd,
+ rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+ assert(rd_stats->rate < INT_MAX);
+
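+  // If coding the searched coefficients is no cheaper in RD terms than
+  // coding an all-zero block (or the block already skips), and we are not in
+  // lossless mode, force the block to all-zero / skip.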
+ if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_RD_DEBUG
+    av1_update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
+ zero_blk_rate - rd_stats->rate);
+#endif // CONFIG_RD_DEBUG
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ set_blk_skip(x, 0, blk_row * bw + blk_col, 1);
+ p->eobs[block] = 0;
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ } else {
+ set_blk_skip(x, 0, blk_row * bw + blk_col, 0);
+ rd_stats->skip = 0;
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0];
+
+ no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
+ const int txk_type_idx =
+ av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+ no_split->tx_type = mbmi->txk_type[txk_type_idx];
+}
+
+static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int block, TX_SIZE tx_size, int depth,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
+ TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
+ int64_t ref_best_rd, int *is_cost_valid,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ TXB_RD_INFO_NODE *rd_info_node);
+
+static void try_tx_block_split(
+ const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+ RD_STATS *split_rd_stats, int64_t *split_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int sub_step = bsw * bsh;
+ RD_STATS this_rd_stats;
+ int this_cost_valid = 1;
+ int64_t tmp_rd = 0;
+
+ split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ int blk_idx = 0;
+ for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
+ for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
+ const int offsetr = blk_row + r;
+ const int offsetc = blk_col + c;
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+ assert(blk_idx < 4);
+ select_tx_block(
+ cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
+ tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd,
+ &this_cost_valid, ftxs_mode,
+ (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
+
+ if (!this_cost_valid) goto LOOP_EXIT;
+
+ av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
+
+ tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+
+ if (no_split_rd < tmp_rd) {
+ this_cost_valid = 0;
+ goto LOOP_EXIT;
+ }
+ block += sub_step;
+ }
+ }
+
+LOOP_EXIT : {}
+
+ if (this_cost_valid) *split_rd = tmp_rd;
+}
+
+// Search for the best tx partition/type for a given luma block.
+static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int block, TX_SIZE tx_size, int depth,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
+ TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
+ int64_t ref_best_rd, int *is_cost_valid,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ TXB_RD_INFO_NODE *rd_info_node) {
+ assert(tx_size < TX_SIZES_ALL);
+ av1_init_rd_stats(rd_stats);
+ if (ref_best_rd < 0) {
+ *is_cost_valid = 0;
+ return;
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->sb_type, tx_size);
+ struct macroblock_plane *const p = &x->plane[0];
+
+ const int try_no_split = 1;
+ int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8)
+ try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16;
+#endif
+ TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
+
+ // TX no split
+ if (try_no_split) {
+ try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
+ ftxs_mode, rd_info_node, &no_split);
+
+ if (cpi->sf.adaptive_txb_search_level &&
+ (no_split.rd -
+ (no_split.rd >> (1 + cpi->sf.adaptive_txb_search_level))) >
+ ref_best_rd) {
+ *is_cost_valid = 0;
+ return;
+ }
+
+ if (cpi->sf.txb_split_cap) {
+ if (p->eobs[block] == 0) try_split = 0;
+ }
+ }
+
+ if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) {
+ const int threshold = cpi->sf.tx_type_search.ml_tx_split_thresh;
+ if (threshold >= 0) {
+ const int split_score =
+ ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size);
+ if (split_score >= 0 && split_score < threshold) try_split = 0;
+ }
+ }
+
+ // TX split
+ int64_t split_rd = INT64_MAX;
+ RD_STATS split_rd_stats;
+ av1_init_rd_stats(&split_rd_stats);
+ if (try_split) {
+ try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
+ AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
+ rd_info_node, &split_rd_stats, &split_rd);
+ }
+
+ if (no_split.rd < split_rd) {
+ ENTROPY_CONTEXT *pta = ta + blk_col;
+ ENTROPY_CONTEXT *ptl = tl + blk_row;
+ const TX_SIZE tx_size_selected = tx_size;
+ p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
+ av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = tx_size_selected;
+ }
+ }
+ mbmi->tx_size = tx_size_selected;
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ no_split.tx_type);
+ set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
+ } else {
+ *rd_stats = split_rd_stats;
+ if (split_rd == INT64_MAX) *is_cost_valid = 0;
+ }
+}
+
+static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ TXB_RD_INFO_NODE *rd_info_tree) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int is_cost_valid = 1;
+ int64_t this_rd = 0, skip_rd = 0;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+ if (is_cost_valid) {
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+
+ RD_STATS pn_rd_stats;
+ const int init_depth =
+ get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
+ av1_init_rd_stats(&pn_rd_stats);
+
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+
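+    // Track two running RD costs in parallel: this_rd assumes the residual
+    // is coded (skip-flag cost s0 plus rate/distortion so far), skip_rd
+    // assumes the whole block is signalled as skip (flag cost s1 plus sse as
+    // the distortion). The cheaper of the two decides rd_stats->skip below.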
+ skip_rd = RDCOST(x->rdmult, s1, 0);
+ this_rd = RDCOST(x->rdmult, s0, 0);
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd)));
+ select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
+ plane_bsize, ctxa, ctxl, tx_above, tx_left,
+ &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode,
+ rd_info_tree);
+ if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+ block += step;
+ if (rd_info_tree != NULL) rd_info_tree += 1;
+ }
+ }
+ if (skip_rd <= this_rd) {
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ } else {
+ rd_stats->skip = 0;
+ }
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+}
+
+static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd,
+ TXB_RD_INFO_NODE *rd_info_tree) {
+ const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(mbmi);
+ const int skip_ctx = av1_get_skip_context(xd);
+ int s0 = x->skip_cost[skip_ctx][0];
+ int s1 = x->skip_cost[skip_ctx][1];
+ int64_t rd;
+
+ // TODO(debargha): enable this as a speed feature where the
+ // select_inter_block_yrd() function above will use a simplified search
+ // such as not using full optimize, but the inter_block_yrd() function
+ // will use more complex search given that the transform partitions have
+ // already been decided.
+
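+  // When only the fast DCT / 1D-DCT search runs first, loosen the pruning
+  // threshold by ~12.5% (rd_thresh >> 3, with an overflow guard) so that
+  // candidates the later full tx-type refinement could still improve are not
+  // pruned prematurely.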
+ int64_t rd_thresh = ref_best_rd;
+ if (fast_tx_search && rd_thresh < INT64_MAX) {
+ if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
+ }
+ assert(rd_thresh > 0);
+
+ FAST_TX_SEARCH_MODE ftxs_mode =
+ fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
+ select_inter_block_yrd(cpi, x, rd_stats, bsize, rd_thresh, ftxs_mode,
+ rd_info_tree);
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ // If fast_tx_search is true, only DCT and 1D DCT were tested in
+ // select_inter_block_yrd() above. Do a better search for tx type with
+ // tx sizes already decided.
+ if (fast_tx_search) {
+ if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE))
+ return INT64_MAX;
+ }
+
+ if (rd_stats->skip)
+ rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ else
+ rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+
+ if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip))
+ rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
+
+ return rd;
+}
+
+// Finds rd cost for a y block, given the transform size partitions
+static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int block, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, int depth,
+ ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+ TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ int64_t ref_best_rd, RD_STATS *rd_stats,
+ FAST_TX_SEARCH_MODE ftxs_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+ plane_bsize, blk_row, blk_col)];
+
+ int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->sb_type, tx_size);
+
+ av1_init_rd_stats(rd_stats);
+ if (tx_size == plane_tx_size) {
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
+
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->zero_rate = zero_blk_rate;
+ rd_stats->ref_rdcost = ref_best_rd;
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) {
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
+ x->plane[0].eobs[block] = 0;
+ x->plane[0].txb_entropy_ctx[block] = 0;
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ } else {
+ rd_stats->skip = 0;
+ set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
+ }
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->txfm_partition_cost[ctx][0];
+ av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+ RD_STATS pn_rd_stats;
+ int64_t this_rd = 0;
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+ depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+ ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+ block += step;
+ }
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->txfm_partition_cost[ctx][1];
+ }
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+// 1: rd cost values are valid.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int is_cost_valid = 1;
+ int64_t this_rd = 0;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+ if (is_cost_valid) {
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int init_depth =
+ get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+ RD_STATS pn_rd_stats;
+
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, plane_bsize,
+ init_depth, ctxa, ctxl, tx_above, tx_left,
+ ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd +=
+ AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+ RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+ block += step;
+ }
+ }
+ }
+
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+ int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+ if (skip_rd < this_rd) {
+ this_rd = skip_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ }
+ if (this_rd > ref_best_rd) is_cost_valid = 0;
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+ return is_cost_valid;
+}
+
+static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+ const int16_t *diff = x->plane[0].src_diff;
+ const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+ (uint8_t *)diff, 2 * rows * cols);
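+  // Fold bsize into the low bits so that identical residues of different
+  // block sizes map to different hash values.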
+ return (hash << 5) + bsize;
+}
+
+static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ MB_RD_RECORD *tx_rd_record) {
+ int index;
+ if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+ index =
+ (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+ ++tx_rd_record->num;
+ } else {
+ index = tx_rd_record->index_start;
+ tx_rd_record->index_start =
+ (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+ }
+ MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ tx_rd_info->hash_value = hash;
+ tx_rd_info->tx_size = mbmi->tx_size;
+ memcpy(tx_rd_info->blk_skip, x->blk_skip,
+ sizeof(tx_rd_info->blk_skip[0]) * n4);
+ av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
+ av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
+ tx_rd_info->rd_stats = *rd_stats;
+}
+
+static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
+ RD_STATS *const rd_stats, MACROBLOCK *const x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ mbmi->tx_size = tx_rd_info->tx_size;
+ memcpy(x->blk_skip, tx_rd_info->blk_skip,
+ sizeof(tx_rd_info->blk_skip[0]) * n4);
+ av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
+ av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
+ *rd_stats = tx_rd_info->rd_stats;
+}
+
+static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
+ const uint32_t hash) {
+ // Linear search through the circular buffer to find matching hash.
+ for (int i = cur_record->index_start - 1; i >= 0; i--) {
+ if (cur_record->hash_vals[i] == hash) return i;
+ }
+ for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
+ if (cur_record->hash_vals[i] == hash) return i;
+ }
+ int index;
+  // If not found, add a new RD info entry to the buffer and return its index.
+ if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
+ index = (cur_record->index_start + cur_record->num) %
+ TX_SIZE_RD_RECORD_BUFFER_LEN;
+ cur_record->num++;
+ } else {
+ index = cur_record->index_start;
+ cur_record->index_start =
+ (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
+ }
+
+ cur_record->hash_vals[index] = hash;
+ av1_zero(cur_record->tx_rd_info[index]);
+ return index;
+}
+
+typedef struct {
+ int leaf;
+ int8_t children[4];
+} RD_RECORD_IDX_NODE;
+
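+// Static encodings of the TX-size search quadtree for each block size. Nodes
+// are listed in the order they are generated by find_tx_size_rd_records():
+// 'leaf' marks nodes without children, and 'children' holds the indices of
+// the four child nodes in that same flattened order (-1 where absent).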
+static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
+ { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 1, { 0, 0, 0, 0 } },
+ { 1, { 0, 0, 0, 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 1, { 0 } },
+ { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
+ { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, 5, 6 } },
+ { 0, { 7, 8, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, 7, 8 } },
+ { 0, { 5, 6, 9, 10 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
+ { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } },
+ { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
+ { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } },
+ { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
+ { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
+ { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
+ { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
+ { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } },
+ { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
+ { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
+ { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
+ { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
+ { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } },
+ { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
+ { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
+ { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
+ { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
+ { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
+ { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
+ { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
+ { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
+ { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
+ { 0, { 1, -1, 2, -1 } },
+ { 0, { 3, 4, -1, -1 } },
+ { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, -1, -1 } },
+ { 0, { 5, 6, -1, -1 } },
+};
+
+static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
+ NULL, // BLOCK_4X4
+ NULL, // BLOCK_4X8
+ NULL, // BLOCK_8X4
+ rd_record_tree_8x8, // BLOCK_8X8
+ rd_record_tree_8x16, // BLOCK_8X16
+ rd_record_tree_16x8, // BLOCK_16X8
+ rd_record_tree_16x16, // BLOCK_16X16
+ rd_record_tree_1_2, // BLOCK_16X32
+ rd_record_tree_2_1, // BLOCK_32X16
+ rd_record_tree_sqr, // BLOCK_32X32
+ rd_record_tree_1_2, // BLOCK_32X64
+ rd_record_tree_2_1, // BLOCK_64X32
+ rd_record_tree_sqr, // BLOCK_64X64
+ rd_record_tree_64x128, // BLOCK_64X128
+ rd_record_tree_128x64, // BLOCK_128X64
+ rd_record_tree_128x128, // BLOCK_128X128
+ NULL, // BLOCK_4X16
+ NULL, // BLOCK_16X4
+ rd_record_tree_1_4, // BLOCK_8X32
+ rd_record_tree_4_1, // BLOCK_32X8
+ rd_record_tree_1_4, // BLOCK_16X64
+ rd_record_tree_4_1, // BLOCK_64X16
+};
+
+static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
+ 0, // BLOCK_4X4
+ 0, // BLOCK_4X8
+ 0, // BLOCK_8X4
+ sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8
+ sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16
+ sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8
+ sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16
+ sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32
+ sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16
+ sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32
+ sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64
+ sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32
+ sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64
+ sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128
+ sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64
+ sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128
+ 0, // BLOCK_4X16
+ 0, // BLOCK_16X4
+ sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32
+ sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8
+ sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64
+ sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16
+};
+
+static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
+ BLOCK_SIZE bsize) {
+ const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
+ const int size = rd_record_tree_size[bsize];
+ for (int i = 0; i < size; ++i) {
+ if (rd_record[i].leaf) {
+ av1_zero(tree[i].children);
+ } else {
+ for (int j = 0; j < 4; ++j) {
+ const int8_t idx = rd_record[i].children[j];
+ tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
+ }
+ }
+ }
+}
+
+// Go through all TX blocks that could be used in TX size search, compute
+// residual hash values for them and find matching RD info that stores previous
+// RD search results for these TX blocks. The idea is to prevent repeated
+// rate/distortion computations that happen because of the combination of
+// partition and TX size search. The resulting RD info records are returned in
+// the form of a quadtree for easier access in actual TX size search.
+static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, TXB_RD_INFO_NODE *dst_rd_info) {
+ TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8,
+ x->txb_rd_record_16X16,
+ x->txb_rd_record_32X32,
+ x->txb_rd_record_64X64 };
+ const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+
+ // Hashing is performed only for square TX sizes larger than TX_4X4
+ if (max_square_tx_size < TX_8X8) return 0;
+ const int diff_stride = bw;
+ const struct macroblock_plane *const p = &x->plane[0];
+ const int16_t *diff = &p->src_diff[0];
+ init_rd_record_tree(dst_rd_info, bsize);
+  // Coordinates of the top-left corner of the current block within the
+  // superblock, measured in pixels:
+ const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
+ const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
+ int cur_rd_info_idx = 0;
+ int cur_tx_depth = 0;
+ TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
+ while (cur_tx_depth <= MAX_VARTX_DEPTH) {
+ const int cur_tx_bw = tx_size_wide[cur_tx_size];
+ const int cur_tx_bh = tx_size_high[cur_tx_size];
+ if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
+ const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
+ const int tx_size_idx = cur_tx_size - TX_8X8;
+ for (int row = 0; row < bh; row += cur_tx_bh) {
+ for (int col = 0; col < bw; col += cur_tx_bw) {
+ if (cur_tx_bw != cur_tx_bh) {
+ // Use dummy nodes for all rectangular transforms within the
+ // TX size search tree.
+ dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
+ } else {
+ // Get spatial location of this TX block within the superblock
+ // (measured in cur_tx_bsize units).
+ const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
+ const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
+
+ int16_t hash_data[MAX_SB_SQUARE];
+ int16_t *cur_hash_row = hash_data;
+ const int16_t *cur_diff_row = diff + row * diff_stride + col;
+ for (int i = 0; i < cur_tx_bh; i++) {
+ memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw);
+ cur_hash_row += cur_tx_bw;
+ cur_diff_row += diff_stride;
+ }
+ const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+ (uint8_t *)hash_data,
+ 2 * cur_tx_bw * cur_tx_bh);
+ // Find corresponding RD info based on the hash value.
+ const int record_idx =
+ row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
+ TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
+ int idx = find_tx_size_rd_info(records, hash);
+ dst_rd_info[cur_rd_info_idx].rd_info_array =
+ &records->tx_rd_info[idx];
+ }
+ ++cur_rd_info_idx;
+ }
+ }
+ cur_tx_size = next_tx_size;
+ ++cur_tx_depth;
+ }
+ return 1;
+}
+
+// origin_threshold * 128 / 100
+static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
+ {
+ 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+ },
+ {
+ 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+ },
+ {
+ 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+ },
+};
+
+// lookup table for predict_skip_flag
+// int max_tx_size = max_txsize_rect_lookup[bsize];
+// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
+ TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4,
+ TX_8X8, TX_8X8, TX_16X16, TX_16X16,
+};
+
+// Uses simple features on top of DCT coefficients to quickly predict
+// whether optimal RD decision is to skip encoding the residual.
+// The sse value is stored in dist.
+static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+ int reduced_tx_set) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+ *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
+ const int64_t mse = *dist / bw / bh;
+ // Normalized quantizer takes the transform upscaling factor (8 for tx size
+ // smaller than 32) into account.
+ const int16_t normalized_dc_q = dc_q >> 3;
+ const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
+ // Predict not to skip when mse is larger than threshold.
+ if (mse > mse_thresh) return 0;
+
+ const int max_tx_size = max_predict_sf_tx_size[bsize];
+ const int tx_h = tx_size_high[max_tx_size];
+ const int tx_w = tx_size_wide[max_tx_size];
+ DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+ TxfmParam param;
+ param.tx_type = DCT_DCT;
+ param.tx_size = max_tx_size;
+ param.bd = xd->bd;
+ param.is_hbd = get_bitdepth_data_path_index(xd);
+ param.lossless = 0;
+ param.tx_set_type = av1_get_ext_tx_set_type(
+ param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+ const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+ const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+ const int16_t *src_diff = x->plane[0].src_diff;
+ const int n_coeff = tx_w * tx_h;
+ const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+ const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+ const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
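+  // The << 7 below matches the 128-based scaling of skip_pred_threshold
+  // (origin_threshold * 128 / 100): skip is rejected as soon as any
+  // coefficient magnitude exceeds roughly (origin_threshold / 100) times its
+  // quantizer step.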
+ for (int row = 0; row < bh; row += tx_h) {
+ for (int col = 0; col < bw; col += tx_w) {
+ av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+ // Operating on TX domain, not pixels; we want the QTX quantizers
+ const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+ if (dc_coef >= dc_thresh) return 0;
+ for (int i = 1; i < n_coeff; ++i) {
+ const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+ if (ac_coef >= ac_thresh) return 0;
+ }
+ }
+ src_diff += tx_h * bw;
+ }
+ return 1;
+}
+
+// Used to set proper context for early termination with skip = 1.
+static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
+ int64_t dist) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int n4 = bsize_to_num_blk(bsize);
+ const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
+ memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+ mbmi->tx_size = tx_size;
+ for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
+ rd_stats->skip = 1;
+ rd_stats->rate = 0;
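+ // Normalize high bit-depth SSE back to an 8-bit scale, then store it with
+ // the x16 scaling used for distortion elsewhere in the RD search.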
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+ rd_stats->dist = rd_stats->sse = (dist << 4);
+}
+
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int64_t rd = INT64_MAX;
+ int64_t best_rd = INT64_MAX;
+ const int is_inter = is_inter_block(mbmi);
+ const int n4 = bsize_to_num_blk(bsize);
+ // Get the tx_size 1 level down
+ const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(min_tx_size, is_inter, cm->reduced_tx_set_used);
+ const int within_border =
+ mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+
+ av1_invalid_rd_stats(rd_stats);
+
+ if (cpi->sf.model_based_prune_tx_search_level && ref_best_rd != INT64_MAX) {
+ int model_rate;
+ int64_t model_dist;
+ int model_skip;
+ model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist,
+ &model_skip, NULL, NULL, NULL, NULL);
+ const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
+ // If the modeled rd is a lot worse than the best so far, break out.
+ // TODO(debargha, urvang): Improve the model and make the check below
+ // tighter.
+ assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
+ cpi->sf.model_based_prune_tx_search_level <= 2);
+ static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE,
+ 4 + MODELRD_TYPE_TX_SEARCH_PRUNE };
+ if (!model_skip &&
+ ((model_rd *
+ prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >>
+ 3) > ref_best_rd)
+ return;
+ }
+
+ const uint32_t hash = get_block_residue_hash(x, bsize);
+ MB_RD_RECORD *mb_rd_record = &x->mb_rd_record;
+
+ if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_mb_rd_hash) {
+ for (int i = 0; i < mb_rd_record->num; ++i) {
+ const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+ // If there is a match in the tx_rd_record, fetch the RD decision and
+ // terminate early.
+ if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
+ MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[index];
+ fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+ return;
+ }
+ }
+ }
+
+ // If we predict that skip is the optimal RD decision, set the respective
+ // context and terminate early.
+ int64_t dist;
+ if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
+ predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) {
+ set_skip_flag(x, rd_stats, bsize, dist);
+ // Save the RD search results into tx_rd_record.
+ if (within_border) save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ return;
+ }
+
+ // Precompute residual hashes and find existing or add new RD records to
+ // store and reuse rate and distortion values to speed up TX size search.
+ TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
+ int found_rd_info = 0;
+ if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) {
+ found_rd_info =
+ find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info);
+ }
+
+ prune_tx(cpi, bsize, x, xd, tx_set_type);
+
+ int found = 0;
+
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+ found_rd_info ? matched_rd_info : NULL);
+ assert(IMPLIES(this_rd_stats.skip && !this_rd_stats.invalid_rate,
+ this_rd_stats.rate == 0));
+
+ ref_best_rd = AOMMIN(rd, ref_best_rd);
+ if (rd < best_rd) {
+ *rd_stats = this_rd_stats;
+ found = 1;
+ }
+
+ // Reset the pruning flags.
+ av1_zero(x->tx_search_prune);
+ x->tx_split_prune_flag = 0;
+
+ // We should always find at least one candidate unless ref_best_rd is less
+ // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type
+ // might have failed to find something better)
+ assert(IMPLIES(!found, ref_best_rd != INT64_MAX));
+ if (!found) return;
+
+ // Save the RD search results into tx_rd_record.
+ if (within_border && cpi->sf.use_mb_rd_hash)
+ save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+}
+
+static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int plane, int block, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
+ ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats,
+ FAST_TX_SEARCH_MODE ftxs_mode) {
+ assert(plane > 0);
+ assert(tx_size < TX_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL);
+
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int blk_idx = blk_row * mi_width + blk_col;
+
+ av1_set_txb_context(x, plane, block, tx_size, ta, tl);
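+ // Fall back to the all-zero block rate/distortion when coding the
+ // coefficients is no better in RD terms (or the block is already skipped),
+ // except in lossless mode.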
+ if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) {
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ }
+
+ // Set chroma blk_skip to 0
+ set_blk_skip(x, plane, blk_idx, 0);
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+// 1: rd cost values are valid.
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t non_skip_ref_best_rd,
+ int64_t skip_ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int plane;
+ int is_cost_valid = 1;
+ int64_t this_rd = 0;
+ int64_t skip_rd = 0;
+
+ if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+ if (x->skip_chroma_rd) {
+ if (!is_cost_valid) av1_invalid_rd_stats(rd_stats);
+
+ return is_cost_valid;
+ }
+
+ const BLOCK_SIZE bsizec = scale_chroma_bsize(
+ bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+
+ if (is_inter_block(mbmi) && is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+ av1_subtract_plane(x, bsizec, plane);
+ }
+
+ if (is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height =
+ block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+ int block = 0;
+ const int step = bh * bw;
+ ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsizec, pd, ta, tl);
+
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
+ plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
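+ // Terminate early once both the regular and the skip RD exceed their
+ // respective reference thresholds.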
+ if ((this_rd > non_skip_ref_best_rd) &&
+ (skip_rd > skip_ref_best_rd)) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ block += step;
+ }
+ }
+ }
+ } else {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+
+ return is_cost_valid;
+}
+
+static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(
+ av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type));
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const SequenceHeader *const seq_params = &cpi->common.seq_params;
+ int this_rate;
+ int64_t this_rd;
+ int colors_u, colors_v, colors;
+ const int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ RD_STATS tokenonly_rd_stats;
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+
+ mbmi->uv_mode = UV_DC_PRED;
+
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ if (seq_params->use_highbitdepth) {
+ colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf);
+ colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf);
+ } else {
+ colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
+ colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
+ }
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+ colors = colors_u > colors_v ? colors_u : colors_v;
+ if (colors > 1 && colors <= 64) {
+ int r, c, n, i, j;
+ const int max_itr = 50;
+ int lb_u, ub_u, val_u;
+ int lb_v, ub_v, val_v;
+ int *const data = x->palette_buffer->kmeans_data_buf;
+ int centroids[2 * PALETTE_MAX_SIZE];
+
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+ if (seq_params->use_highbitdepth) {
+ lb_u = src_u16[0];
+ ub_u = src_u16[0];
+ lb_v = src_v16[0];
+ ub_v = src_v16[0];
+ } else {
+ lb_u = src_u[0];
+ ub_u = src_u[0];
+ lb_v = src_v[0];
+ ub_v = src_v[0];
+ }
+
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ if (seq_params->use_highbitdepth) {
+ val_u = src_u16[r * src_stride + c];
+ val_v = src_v16[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ } else {
+ val_u = src_u[r * src_stride + c];
+ val_v = src_v[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ }
+ if (val_u < lb_u)
+ lb_u = val_u;
+ else if (val_u > ub_u)
+ ub_u = val_u;
+ if (val_v < lb_v)
+ lb_v = val_v;
+ else if (val_v > ub_v)
+ ub_v = val_v;
+ }
+ }
+
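+ // Try palette sizes from min(colors, PALETTE_MAX_SIZE) down to 2: cluster
+ // the joint (U, V) samples with k-means, evaluate the RD cost of the
+ // resulting palette and keep the best.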
+ for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+ --n) {
+ for (i = 0; i < n; ++i) {
+ centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+ centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+ optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
+ // Sort the U channel colors in ascending order.
+ for (i = 0; i < 2 * (n - 1); i += 2) {
+ int min_idx = i;
+ int min_val = centroids[i];
+ for (j = i + 2; j < 2 * n; j += 2)
+ if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+ if (min_idx != i) {
+ int temp_u = centroids[i], temp_v = centroids[i + 1];
+ centroids[i] = centroids[min_idx];
+ centroids[i + 1] = centroids[min_idx + 1];
+ centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+ }
+ }
+ av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+ pmi->palette_size[1] = n;
+ for (i = 1; i < 3; ++i) {
+ for (j = 0; j < n; ++j) {
+ if (seq_params->use_highbitdepth)
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+ (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+ else
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+ clip_pixel((int)centroids[j * 2 + i - 1]);
+ }
+ }
+
+ super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_mbmi = *mbmi;
+ memcpy(best_palette_color_map, color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ *rate = this_rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *skippable = tokenonly_rd_stats.skip;
+ }
+ }
+ }
+ if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ }
+}
+
+// Run RD calculation with the given chroma intra prediction angle and return
+// the RD cost. Update the best mode info if the RD cost is the best so far.
+static int64_t pick_intra_angle_routine_sbuv(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+ int *best_angle_delta, int64_t *best_rd) {
+ MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+ assert(!is_inter_block(mbmi));
+ int this_rate;
+ int64_t this_rd;
+ RD_STATS tokenonly_rd_stats;
+
+ if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+ return INT64_MAX;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+ *rate = this_rate;
+ rd_stats->rate = tokenonly_rd_stats.rate;
+ rd_stats->dist = tokenonly_rd_stats.dist;
+ rd_stats->skip = tokenonly_rd_stats.skip;
+ }
+ return this_rd;
+}
+
+// With given chroma directional intra prediction mode, pick the best angle
+// delta. Return true if an RD cost that is smaller than the input one is found.
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int rate_overhead,
+ int64_t best_rd, int *rate,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int i, angle_delta, best_angle_delta = 0;
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+ rd_stats->rate = INT_MAX;
+ rd_stats->skip = 0;
+ rd_stats->dist = INT64_MAX;
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (i = 0; i < 2; ++i) {
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+ best_rd_in, rate, rd_stats,
+ &best_angle_delta, &best_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (angle_delta == 0) {
+ if (this_rd == INT64_MAX) return 0;
+ rd_cost[1] = this_rd;
+ break;
+ }
+ }
+ }
+
+ assert(best_rd != INT64_MAX);
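+ // Second pass over the odd angle deltas; skip a delta when both of its even
+ // neighbours were already clearly worse than the current best RD.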
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ int64_t rd_thresh;
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ rate, rd_stats, &best_angle_delta,
+ &best_rd);
+ }
+ }
+ }
+
+ mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+ return rd_stats->rate != INT_MAX;
+}
+
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+ (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+ TX_SIZE tx_size, int64_t best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_DEBUG
+ assert(is_cfl_allowed(xd));
+ const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+ const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy);
+ (void)plane_bsize;
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ if (!xd->lossless[mbmi->segment_id]) {
+ assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+ assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
+ }
+#endif // CONFIG_DEBUG
+
+ xd->cfl.use_dc_pred_cache = 1;
+ const int64_t mode_rd =
+ RDCOST(x->rdmult,
+ x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
+ int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+ int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#if CONFIG_DEBUG
+ int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#endif // CONFIG_DEBUG
+
+ for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+ RD_STATS rd_stats;
+ av1_init_rd_stats(&rd_stats);
+ for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+ best_rd_uv[joint_sign][plane] = INT64_MAX;
+ best_c[joint_sign][plane] = 0;
+ }
+ // Collect RD stats for an alpha value of zero in this plane.
+ // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid.
+ for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) {
+ const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i);
+ if (i == CFL_SIGN_NEG) {
+ mbmi->cfl_alpha_idx = 0;
+ mbmi->cfl_alpha_signs = joint_sign;
+ txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize, tx_size,
+ cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ if (rd_stats.rate == INT_MAX) break;
+ }
+ const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
+ best_rd_uv[joint_sign][plane] =
+ RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+#if CONFIG_DEBUG
+ best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif // CONFIG_DEBUG
+ }
+ }
+
+ int best_joint_sign = -1;
+
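+ // For each plane and sign combination, greedily search increasing alpha
+ // magnitudes and stop early once larger magnitudes no longer improve the RD
+ // cost.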
+ for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+ for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) {
+ int progress = 0;
+ for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
+ int flag = 0;
+ RD_STATS rd_stats;
+ if (c > 2 && progress < c) break;
+ av1_init_rd_stats(&rd_stats);
+ for (int i = 0; i < CFL_SIGNS; i++) {
+ const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i);
+ if (i == 0) {
+ mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
+ mbmi->cfl_alpha_signs = joint_sign;
+ txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize,
+ tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ if (rd_stats.rate == INT_MAX) break;
+ }
+ const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
+ int64_t this_rd =
+ RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+ if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
+ best_rd_uv[joint_sign][plane] = this_rd;
+ best_c[joint_sign][plane] = c;
+#if CONFIG_DEBUG
+ best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif // CONFIG_DEBUG
+ flag = 2;
+ if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue;
+ this_rd += mode_rd + best_rd_uv[joint_sign][!plane];
+ if (this_rd >= best_rd) continue;
+ best_rd = this_rd;
+ best_joint_sign = joint_sign;
+ }
+ progress += flag;
+ }
+ }
+ }
+
+ int best_rate_overhead = INT_MAX;
+ int ind = 0;
+ if (best_joint_sign >= 0) {
+ const int u = best_c[best_joint_sign][CFL_PRED_U];
+ const int v = best_c[best_joint_sign][CFL_PRED_V];
+ ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
+ best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
+ x->cfl_cost[best_joint_sign][CFL_PRED_V][v];
+#if CONFIG_DEBUG
+ xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
+ best_rate_overhead +
+ best_rate_uv[best_joint_sign][CFL_PRED_U] +
+ best_rate_uv[best_joint_sign][CFL_PRED_V];
+#endif // CONFIG_DEBUG
+ } else {
+ best_joint_sign = 0;
+ }
+
+ mbmi->cfl_alpha_idx = ind;
+ mbmi->cfl_alpha_signs = best_joint_sign;
+ xd->cfl.use_dc_pred_cache = 0;
+ xd->cfl.dc_pred_is_cached[0] = 0;
+ xd->cfl.dc_pred_is_cached[1] = 0;
+ return best_rate_overhead;
+}
+
+static void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+}
+
+static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int64_t best_rd = INT64_MAX, this_rd;
+
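+ // Evaluate every allowed chroma intra mode (including CfL and directional
+ // modes with angle-delta refinement) and keep the best RD choice; palette
+ // mode is tried after this loop.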
+ for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
+ int this_rate;
+ RD_STATS tokenonly_rd_stats;
+ UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
+ const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
+ if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
+ (1 << mode)))
+ continue;
+
+ mbmi->uv_mode = mode;
+ int cfl_alpha_rate = 0;
+ if (mode == UV_CFL_PRED) {
+ if (!is_cfl_allowed(xd)) continue;
+ assert(!is_directional_mode);
+ const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
+ if (cfl_alpha_rate == INT_MAX) continue;
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
+ const int rate_overhead =
+ x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
+ if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ &this_rate, &tokenonly_rd_stats))
+ continue;
+ } else {
+ if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+ continue;
+ }
+ }
+ const int mode_cost =
+ x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
+ cfl_alpha_rate;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
+ if (mode == UV_CFL_PRED) {
+ assert(is_cfl_allowed(xd));
+#if CONFIG_DEBUG
+ if (!xd->lossless[mbmi->segment_id])
+ assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
+#endif // CONFIG_DEBUG
+ }
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip;
+ }
+ }
+
+ const int try_palette =
+ av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+ if (try_palette) {
+ uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
+ rd_pick_palette_intra_sbuv(
+ cpi, x,
+ x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
+ best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
+ distortion, skippable);
+ }
+
+ *mbmi = best_mbmi;
+ // Make sure we actually chose a mode
+ assert(best_rd < INT64_MAX);
+ return best_rd;
+}
+
+static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size,
+ int *rate_uv, int *rate_uv_tokenonly,
+ int64_t *dist_uv, int *skip_uv,
+ UV_PREDICTION_MODE *mode_uv) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ // Use an estimated rd for uv_intra based on DC_PRED if the
+ // appropriate speed flag is set.
+ init_sbuv_mode(mbmi);
+ if (x->skip_chroma_rd) {
+ *rate_uv = 0;
+ *rate_uv_tokenonly = 0;
+ *dist_uv = 0;
+ *skip_uv = 1;
+ *mode_uv = UV_DC_PRED;
+ return;
+ }
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
+ bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ // Only store reconstructed luma when there's chroma RDO. When there's no
+ // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+ xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+ if (xd->cfl.store_y) {
+ // Restore reconstructed luma values.
+ av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
+ cpi->optimize_seg_arr[mbmi->segment_id],
+ mi_row, mi_col);
+ xd->cfl.store_y = 0;
+ }
+ rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+ bsize, max_tx_size);
+ *mode_uv = mbmi->uv_mode;
+}
+
+static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
+ int16_t mode_context) {
+ if (is_inter_compound_mode(mode)) {
+ return x
+ ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+ }
+
+ int mode_cost = 0;
+ int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+
+ assert(is_inter_mode(mode));
+
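+ // Single-reference modes are signalled as a tree: NEWMV vs. the rest, then
+ // GLOBALMV vs. the rest, then NEARESTMV vs. NEARMV.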
+ if (mode == NEWMV) {
+ mode_cost = x->newmv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost = x->newmv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+
+ if (mode == GLOBALMV) {
+ mode_cost += x->zeromv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost += x->zeromv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+ return mode_cost;
+ }
+ }
+}
+
+static int get_interinter_compound_mask_rate(const MACROBLOCK *const x,
+ const MB_MODE_INFO *const mbmi) {
+ switch (mbmi->interinter_comp.type) {
+ case COMPOUND_AVERAGE: return 0;
+ case COMPOUND_WEDGE:
+ return get_interinter_wedge_bits(mbmi->sb_type) > 0
+ ? av1_cost_literal(1) +
+ x->wedge_idx_cost[mbmi->sb_type]
+ [mbmi->interinter_comp.wedge_index]
+ : 0;
+ case COMPOUND_DIFFWTD: return av1_cost_literal(1);
+ default: assert(0); return 0;
+ }
+}
+
+typedef struct {
+ int eobs;
+ int brate;
+ int byrate;
+ int64_t bdist;
+ int64_t bsse;
+ int64_t brdcost;
+ int_mv mvs[2];
+ int_mv pred_mv[2];
+ int_mv ref_mv[2];
+
+ ENTROPY_CONTEXT ta[2];
+ ENTROPY_CONTEXT tl[2];
+} SEG_RDSTAT;
+
+typedef struct {
+ int_mv *ref_mv[2];
+ int_mv mvp;
+
+ int64_t segment_rd;
+ int r;
+ int64_t d;
+ int64_t sse;
+ int segment_yrate;
+ PREDICTION_MODE modes[4];
+ SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
+ int mvthresh;
+} BEST_SEG_INFO;
+
+static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
+ return (mv->row >> 3) < mv_limits->row_min ||
+ (mv->row >> 3) > mv_limits->row_max ||
+ (mv->col >> 3) < mv_limits->col_min ||
+ (mv->col >> 3) > mv_limits->col_max;
+}
+
+static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
+ int ref_idx, int is_comp_pred) {
+ PREDICTION_MODE single_mode;
+ if (is_comp_pred) {
+ single_mode =
+ ref_idx ? compound_ref1_mode(this_mode) : compound_ref0_mode(this_mode);
+ } else {
+ single_mode = this_mode;
+ }
+ return single_mode;
+}
+
+static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row,
+ int mi_col, int_mv *ref_mv_sub8x8[2],
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, const int block) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ const int plane = 0;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+ const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
+ const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+ int_mv ref_mv[2];
+ int ite, ref;
+ // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
+ const int ic = block & 1;
+ const int ir = (block - ic) >> 1;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
+ const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
+
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ WarpTypesAllowed warp_types[2];
+ for (ref = 0; ref < 2; ++ref) {
+ const WarpedMotionParams *const wm =
+ &xd->global_motion[xd->mi[0]->ref_frame[ref]];
+ const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
+ warp_types[ref].global_warp_allowed = is_global;
+ warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
+ }
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ int last_besterr[2] = { INT_MAX, INT_MAX };
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ av1_get_scaled_ref_frame(cpi, refs[0]),
+ av1_get_scaled_ref_frame(cpi, refs[1])
+ };
+
+ // Prediction buffer from second frame.
+ DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
+ uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
+ (void)ref_mv_sub8x8;
+
+ const int have_newmv = have_nearmv_in_inter_mode(mbmi->mode);
+ const int ref_mv_idx = mbmi->ref_mv_idx + (have_newmv ? 1 : 0);
+ MV *const best_mv = &x->best_mv.as_mv;
+ const int search_range = SEARCH_RANGE_8P;
+ const int sadpb = x->sadperbit16;
+ // Iterate the joint search over the two reference frames, and break out of
+ // the loop once no better mv can be found.
+ for (ite = 0; ite < 4; ite++) {
+ struct buf_2d ref_yv12[2];
+ int bestsme = INT_MAX;
+ MvLimits tmp_mv_limits = x->mv_limits;
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
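+ // From the third iteration onwards, stop early if the other reference's mv
+ // has not moved and this reference's mv is unchanged, at least at full-pel
+ // precision.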
+ if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
+ if (cur_mv[id].as_int == init_mv[id].as_int) {
+ break;
+ } else {
+ int_mv cur_int_mv, init_int_mv;
+ cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
+ cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
+ init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
+ if (cur_int_mv.as_int == init_int_mv.as_int) {
+ break;
+ }
+ }
+ }
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = av1_get_ref_mv(x, ref);
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ if (scaled_ref_frame[ref]) {
+ int i;
+ for (i = 0; i < num_planes; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL, num_planes);
+ }
+ }
+
+ assert(IMPLIES(scaled_ref_frame[0] != NULL,
+ cm->width == scaled_ref_frame[0]->y_crop_width &&
+ cm->height == scaled_ref_frame[0]->y_crop_height));
+ assert(IMPLIES(scaled_ref_frame[1] != NULL,
+ cm->width == scaled_ref_frame[1]->y_crop_width &&
+ cm->height == scaled_ref_frame[1]->y_crop_height));
+
+ // Initialize based on (possibly scaled) prediction buffers.
+ ref_yv12[0] = xd->plane[plane].pre[0];
+ ref_yv12[1] = xd->plane[plane].pre[1];
+
+ // Get the prediction block from the 'other' reference frame.
+ const InterpFilters interp_filters = EIGHTTAP_REGULAR;
+
+ // Since we have scaled the reference frames to match the size of the
+ // current frame we must use a unit scaling factor during mode selection.
+ av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+ second_pred, pw, &cur_mv[!id].as_mv,
+ &cm->sf_identity, pw, ph, &conv_params,
+ interp_filters, &warp_types[!id], p_col, p_row,
+ plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
+ mi_row * MI_SIZE, xd, cm->allow_warped_motion);
+
+ const int order_idx = id != 0;
+ av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
+ &xd->jcp_param.bck_offset,
+ &xd->jcp_param.use_jnt_comp_avg, 1);
+
+ // Do full-pixel compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
+
+ // Use the mv result from the single mode as mv predictor.
+ *best_mv = cur_mv[id].as_mv;
+
+ best_mv->col >>= 3;
+ best_mv->row >>= 3;
+
+ av1_set_mvcost(x, id, ref_mv_idx);
+
+ // Small-range full-pixel motion search.
+ bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
+ &cpi->fn_ptr[bsize], mask, mask_stride,
+ id, &ref_mv[id].as_mv, second_pred);
+ if (bestsme < INT_MAX) {
+ if (mask)
+ bestsme = av1_get_mvpred_mask_var(x, best_mv, &ref_mv[id].as_mv,
+ second_pred, mask, mask_stride, id,
+ &cpi->fn_ptr[bsize], 1);
+ else
+ bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
+ second_pred, &cpi->fn_ptr[bsize], 1);
+ }
+
+ x->mv_limits = tmp_mv_limits;
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+ // Re-initialize based on unscaled prediction buffers.
+ ref_yv12[ref] = xd->plane[plane].pre[ref];
+ }
+ }
+
+ // Do sub-pixel compound motion search on the current reference frame.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+ if (cpi->common.cur_frame_force_integer_mv) {
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+ }
+ if (bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ bestsme = cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &ref_mv[id].as_mv,
+ cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
+ x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
+ mask_stride, id, pw, ph, cpi->sf.use_accurate_subpel_search);
+ }
+
+ // Restore the pointer to the first prediction buffer.
+ if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+ if (bestsme < last_besterr[id]) {
+ cur_mv[id].as_mv = *best_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ *rate_mv = 0;
+
+ for (ref = 0; ref < 2; ++ref) {
+ av1_set_mvcost(x, ref, ref_mv_idx);
+ const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
+ *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+}
+
+static void estimate_ref_frame_costs(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
+ int segment_id, unsigned int *ref_costs_single,
+ unsigned int (*ref_costs_comp)[REF_FRAMES]) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+ int ref_frame;
+ for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+ memset(ref_costs_comp[ref_frame], 0,
+ REF_FRAMES * sizeof((*ref_costs_comp)[0]));
+ } else {
+ int intra_inter_ctx = av1_get_intra_inter_context(xd);
+ ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
+ unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ ref_costs_single[i] = base_cost;
+
+ const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
+ const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
+ const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
+ const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
+ const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
+ const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
+
+ // Determine cost of a single ref frame, where frame types are represented
+ // by a tree:
+ // Level 0: add cost whether this ref is a forward or backward ref
+ ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+
+ // Level 1: if this ref is forward ref,
+ // add cost whether it is last/last2 or last3/golden
+ ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+ ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+
+ // Level 1: if this ref is backward ref
+ // then add cost whether this ref is altref or backward ref
+ ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
+
+ // Level 2: further add cost whether this ref is last or last2
+ ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
+ ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
+
+ // Level 2: last3 or golden
+ ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
+ ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
+
+ // Level 2: bwdref or altref2
+ ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
+ ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];
+
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ // Similar to single ref, determine cost of compound ref frames.
+ // cost_compound_refs = cost_first_ref + cost_second_ref
+ const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
+ const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
+ const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
+ const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
+ const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
+
+ const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+ unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
+
+ ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
+ ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
+ ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
+ ref_bicomp_costs[ALTREF_FRAME] = 0;
+
+ // cost of first ref frame
+ ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+ ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];
+
+ ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
+ ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];
+
+ // cost of second ref frame
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+
+ // cost: if one ref frame is forward ref, the other ref is backward ref
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
+ ref_costs_comp[ref0][ref1] =
+ ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
+ }
+ }
+
+ // cost: if both ref frames are the same side.
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
+ const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
+ const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+ } else {
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
+ ref_costs_comp[ref0][ref1] = 512;
+ }
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
+ }
+ }
+}
+
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ int mode_index,
+ int64_t comp_pred_diff[REFERENCE_MODES],
+ int skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->skip = x->skip;
+ ctx->skippable = skippable;
+ ctx->best_mode_index = mode_index;
+ ctx->mic = *xd->mi[0];
+ ctx->mbmi_ext = *x->mbmi_ext;
+ ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+ ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
+ ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
+}
+
+static void setup_buffer_ref_mvs_inter(
+ const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ BLOCK_SIZE block_size, int mi_row, int mi_col,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+
+ assert(yv12 != NULL);
+
+ // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+ // use the UV scaling factors.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+ num_planes);
+
+ // Gets an initial list of candidate vectors from neighbours and orders them
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+ mi_col, mbmi_ext->mode_context);
+
+ // Further refinement that is encode side only to test the top few candidates
+ // in full and choose the best as the centre point for subsequent searches.
+ // The current implementation doesn't support scaling.
+ (void)block_size;
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ block_size);
+}
+
+static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int ref_idx, int *rate_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int bestsme = INT_MAX;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ MV mvp_full;
+ int ref = mbmi->ref_frame[ref_idx];
+ MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+
+ MvLimits tmp_mv_limits = x->mv_limits;
+ int cost_list[5];
+
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ for (int i = 0; i < num_planes; i++) {
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ }
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ av1_set_mvcost(
+ x, ref_idx,
+ mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+
+ // Work out the size of the first step in the mv step search.
+ // A step_param of 0 gives the maximum-length first step; each increment
+ // halves it.
+ if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+ // Take the weighted average of the step_params based on the last frame's
+ // max mv magnitude and that based on the best ref mvs of the current
+ // block for the given reference.
+ step_param =
+ (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
+ 2;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+
+ if (cpi->sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) {
+ int boffset =
+ 2 * (mi_size_wide_log2[cm->seq_params.sb_size] -
+ AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize]));
+ step_param = AOMMAX(step_param, boffset);
+ }
+
+ if (cpi->sf.adaptive_motion_search) {
+ int bwl = mi_size_wide_log2[bsize];
+ int bhl = mi_size_high_log2[bsize];
+ int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+ if (tlevel < 5) {
+ step_param += 2;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1);
+ }
+
+ // prev_mv_sad is not set up for dynamically scaled frames.
+ if (cpi->oxcf.resize_mode != RESIZE_RANDOM) {
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+ if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+ x->pred_mv[ref].row = 0;
+ x->pred_mv[ref].col = 0;
+ x->best_mv.as_int = INVALID_MV;
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers before returning.
+ for (int j = 0; j < num_planes; ++j)
+ xd->plane[j].pre[ref_idx] = backup_yv12[j];
+ }
+ return;
+ }
+ }
+ }
+ }
+
+ // Note: MV limits are modified here. Always restore the original values
+ // after full-pixel motion search.
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+ if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+ mvp_full = mbmi->mv[0].as_mv;
+ else
+ mvp_full = ref_mv;
+
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+
+ x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
+
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION:
+ bestsme = av1_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+ sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+ (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0);
+ break;
+ case OBMC_CAUSAL:
+ bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb,
+ MAX_MVSEARCH_STEPS - 1 - step_param,
+ 1, &cpi->fn_ptr[bsize], &ref_mv,
+ &(x->best_mv.as_mv), 0);
+ break;
+ default: assert(0 && "Invalid motion mode!\n");
+ }
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+ }
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (cpi->common.cur_frame_force_integer_mv) {
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+ }
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
+ if (use_fractional_mv) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ switch (mbmi->motion_mode) {
+ case SIMPLE_TRANSLATION:
+ if (cpi->sf.use_accurate_subpel_search) {
+ int best_mv_var;
+ const int try_second = x->second_best_mv.as_int != INVALID_MV &&
+ x->second_best_mv.as_int != x->best_mv.as_int;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+
+ best_mv_var = cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
+ 0, 0, pw, ph, 1);
+
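+ // Optionally refine around the second-best full-pel mv as well and keep
+ // whichever sub-pel result gives the lower error.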
+ if (try_second) {
+ const int minc =
+ AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
+ const int maxc =
+ AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
+ const int minr =
+ AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
+ const int maxr =
+ AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
+ int this_var;
+ MV best_mv = x->best_mv.as_mv;
+
+ x->best_mv = x->second_best_mv;
+ if (x->best_mv.as_mv.row * 8 <= maxr &&
+ x->best_mv.as_mv.row * 8 >= minr &&
+ x->best_mv.as_mv.col * 8 <= maxc &&
+ x->best_mv.as_mv.col * 8 >= minc) {
+ this_var = cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+ &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1);
+ if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+ } else {
+ cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
+ 0, 0, 0, 0, 0);
+ }
+ break;
+ case OBMC_CAUSAL:
+ av1_find_best_obmc_sub_pixel_tree_up(
+ x, cm, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
+ cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
+ cpi->sf.use_accurate_subpel_search);
+ break;
+ default: assert(0 && "Invalid motion mode!\n");
+ }
+ }
+ *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+
+ if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
+ x->pred_mv[ref] = x->best_mv.as_mv;
+}
+
+static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst,
+ const int num_planes) {
+ int i;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].dst.buf = dst.plane[i];
+ xd->plane[i].dst.stride = dst.stride[i];
+ }
+}
+
+static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, const MV *other_mv,
+ int mi_row, int mi_col, const int block,
+ int ref_idx, uint8_t *second_pred) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int other_ref = mbmi->ref_frame[!ref_idx];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
+ const int ic = block & 1;
+ const int ir = (block - ic) >> 1;
+ const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
+ const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
+ const WarpedMotionParams *const wm = &xd->global_motion[other_ref];
+ int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
+
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(mbmi));
+
+ const int plane = 0;
+ struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx];
+
+ struct scale_factors sf;
+ av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height,
+ cm->width, cm->height);
+
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global;
+ warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
+
+ // Get the prediction block from the 'other' reference frame.
+ av1_build_inter_predictor(ref_yv12.buf, ref_yv12.stride, second_pred, pw,
+ other_mv, &sf, pw, ph, &conv_params,
+ mbmi->interp_filters, &warp_types, p_col, p_row,
+ plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE,
+ mi_row * MI_SIZE, xd, cm->allow_warped_motion);
+
+ av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
+ &xd->jcp_param.bck_offset,
+ &xd->jcp_param.use_jnt_comp_avg, 1);
+}
+
+// Search for the best mv for one component of a compound,
+// given that the other component is fixed.
+static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *this_mv,
+ int mi_row, int mi_col,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, int ref_idx) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int ref = mbmi->ref_frame[ref_idx];
+ const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+ struct macroblockd_plane *const pd = &xd->plane[0];
+
+ struct buf_2d backup_yv12[MAX_MB_PLANE];
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+
+ // Check that this is either an interinter or an interintra block
+ assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
+
+ // Store the first prediction buffer.
+ struct buf_2d orig_yv12;
+ if (ref_idx) {
+ orig_yv12 = pd->pre[0];
+ pd->pre[0] = pd->pre[ref_idx];
+ }
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ for (i = 0; i < num_planes; i++) backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit16;
+ MV *const best_mv = &x->best_mv.as_mv;
+ int search_range = SEARCH_RANGE_8P;
+
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+ // Do compound motion search on the current reference frame.
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv);
+
+ // Use the mv result from the single mode as mv predictor.
+ *best_mv = *this_mv;
+
+ best_mv->col >>= 3;
+ best_mv->row >>= 3;
+
+ av1_set_mvcost(
+ x, ref_idx,
+ mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+
+ // Small-range full-pixel motion search.
+ bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
+ &cpi->fn_ptr[bsize], mask, mask_stride,
+ ref_idx, &ref_mv.as_mv, second_pred);
+ if (bestsme < INT_MAX) {
+ if (mask)
+ bestsme =
+ av1_get_mvpred_mask_var(x, best_mv, &ref_mv.as_mv, second_pred, mask,
+ mask_stride, ref_idx, &cpi->fn_ptr[bsize], 1);
+ else
+ bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv.as_mv, second_pred,
+ &cpi->fn_ptr[bsize], 1);
+ }
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+ }
+
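+ // If only integer mvs are allowed, convert the full-pel result back to
+ // 1/8-pel units; the fractional search below is skipped in that case.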
+ if (cpi->common.cur_frame_force_integer_mv) {
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+ }
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
+ if (use_fractional_mv) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ bestsme = cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &ref_mv.as_mv,
+ cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
+ x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride,
+ ref_idx, pw, ph, cpi->sf.use_accurate_subpel_search);
+ }
+
+ // Restore the pointer to the first unscaled prediction buffer.
+ if (ref_idx) pd->pre[0] = orig_yv12;
+
+ if (bestsme < INT_MAX) *this_mv = *best_mv;
+
+ *rate_mv = 0;
+
+ av1_set_mvcost(
+ x, ref_idx,
+ mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+ *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+}
+
+// Wrapper for compound_single_motion_search, for the common case
+// where the second prediction is also an inter mode.
+static void compound_single_motion_search_interinter(
+ const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+ int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv,
+ const int block, int ref_idx) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ // This function should only ever be called for compound modes
+ assert(has_second_ref(xd->mi[0]));
+
+ // Prediction buffer from second frame.
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
+ uint8_t *second_pred;
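+ // In high bit-depth mode the 16-bit buffer is wrapped with
+ // CONVERT_TO_BYTEPTR so it can be passed around as a uint8_t pointer.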
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+ else
+ second_pred = (uint8_t *)second_pred_alloc_16;
+
+ MV *this_mv = &cur_mv[ref_idx].as_mv;
+ const MV *other_mv = &cur_mv[!ref_idx].as_mv;
+
+ build_second_inter_pred(cpi, x, bsize, other_mv, mi_row, mi_col, block,
+ ref_idx, second_pred);
+
+ compound_single_motion_search(cpi, x, bsize, this_mv, mi_row, mi_col,
+ second_pred, mask, mask_stride, rate_mv,
+ ref_idx);
+}
+
+static void do_masked_motion_search_indexed(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
+ // NOTE: 'which' selects the reference(s) to search: 0 - ref 0 only,
+ // 1 - ref 1 only, 2 - both
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ BLOCK_SIZE sb_type = mbmi->sb_type;
+ const uint8_t *mask;
+ const int mask_stride = block_size_wide[bsize];
+
+ mask = av1_get_compound_type_mask(comp_data, sb_type);
+
+ tmp_mv[0].as_int = cur_mv[0].as_int;
+ tmp_mv[1].as_int = cur_mv[1].as_int;
+ if (which == 0 || which == 1) {
+ compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mi_row,
+ mi_col, mask, mask_stride, rate_mv,
+ 0, which);
+ } else if (which == 2) {
+ joint_motion_search(cpi, x, bsize, tmp_mv, mi_row, mi_col, NULL, mask,
+ mask_stride, rate_mv, 0);
+ }
+}
+
+#define USE_DISCOUNT_NEWMV_TEST 0
+#if USE_DISCOUNT_NEWMV_TEST
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field, and especially where there is
+// low spatial complexity, it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established, that vector may be usable through the nearest and
+// near mv modes to reduce distortion in subsequent blocks and also improve
+// visual quality.
+#define NEW_MV_DISCOUNT_FACTOR 8
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ const MB_MODE_INFO_EXT *mbmi_ext);
+static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x,
+ PREDICTION_MODE this_mode, int_mv this_mv) {
+ if (this_mode == NEWMV && this_mv.as_int != 0 &&
+ !cpi->rc.is_src_frame_alt_ref) {
+ // Only discount new_mv when nearest_mv and all near_mv are zero, and the
+ // new_mv is not equal to global_mv
+ const AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MV_REFERENCE_FRAME tmp_ref_frames[2] = { mbmi->ref_frame[0],
+ NONE_FRAME };
+ const uint8_t ref_frame_type = av1_ref_frame_type(tmp_ref_frames);
+ int_mv nearest_mv;
+ get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ int ret = nearest_mv.as_int == 0;
+ for (int ref_mv_idx = 0;
+ ref_mv_idx < x->mbmi_ext->ref_mv_count[ref_frame_type]; ++ref_mv_idx) {
+ int_mv near_mv;
+ get_this_mv(&near_mv, NEARMV, 0, ref_mv_idx, tmp_ref_frames, x->mbmi_ext);
+ ret &= near_mv.as_int == 0;
+ }
+ if (cm->global_motion[tmp_ref_frames[0]].wmtype <= TRANSLATION) {
+ int_mv global_mv;
+ get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ ret &= global_mv.as_int != this_mv.as_int;
+ }
+ return ret;
+ }
+ return 0;
+}
+#endif
+
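+// Margins, in 1/8-pel units (hence the << 3), that keep clamped mvs within
+// the border area available for interpolation.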
+#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+ clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+}
+
+static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const BLOCK_SIZE bsize, const uint8_t *pred0,
+ int stride0, const uint8_t *pred1, int stride1) {
+ static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+ // 64x128, 128x64, 128x128
+ BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+ // 4X16, 16X4, 8X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ // 32X8, 16X64, 64X16
+ BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+ };
+ const struct macroblock_plane *const p = &x->plane[0];
+ const uint8_t *src = p->src.buf;
+ int src_stride = p->src.stride;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ uint32_t esq[2][4];
+ int64_t tl, br;
+
+ const BLOCK_SIZE f_index = split_qtr[bsize];
+ assert(f_index != BLOCK_INVALID);
+
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ pred0 = CONVERT_TO_BYTEPTR(pred0);
+ pred1 = CONVERT_TO_BYTEPTR(pred1);
+ }
+
+ cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+ cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0,
+ &esq[0][1]);
+ cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+ pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
+ cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+ pred0 + bh / 2 * stride0 + bw / 2, stride0,
+ &esq[0][3]);
+ cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+ cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2, stride1,
+ &esq[1][1]);
+ cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+ pred1 + bh / 2 * stride1, stride0, &esq[1][2]);
+ cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+ pred1 + bh / 2 * stride1 + bw / 2, stride0,
+ &esq[1][3]);
+
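+ // esq[i][q] is the SSE of prediction i over quadrant q (TL, TR, BL, BR).
+ // A positive tl + br suggests pred1 fits the top-left region better and
+ // pred0 the bottom-right, so sign 1 is returned.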
+ tl = ((int64_t)esq[0][0] + esq[0][1] + esq[0][2]) -
+ ((int64_t)esq[1][0] + esq[1][1] + esq[1][2]);
+ br = ((int64_t)esq[1][3] + esq[1][1] + esq[1][2]) -
+ ((int64_t)esq[0][3] + esq[0][1] + esq[0][2]);
+ return (tl + br > 0);
+}
+
+// Choose the best wedge index and sign
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ int *const best_wedge_sign,
+ int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ assert(N >= 64);
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_sign;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+
+ DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+ }
+
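+ // sign_limit is the difference in total residual energy, scaled to the
+ // wedge-weight domain; av1_wedge_sign_from_residuals() compares the
+ // mask-weighted delta squares against it to choose the sign.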
+ int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+ (int64_t)aom_sum_squares_i16(residual1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
+ int16_t *ds = residual0;
+
+ av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ // int rate2;
+ // int64_t dist2;
+ // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+ // printf("sse %"PRId64": leagacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+ // sse, rate, dist, rate2, dist2); dist = dist2;
+ // rate = rate2;
+
+ rate += x->wedge_idx_cost[bsize][wedge_index];
+ rd = RDCOST(x->rdmult, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ *best_wedge_sign = wedge_sign;
+ best_rd = rd;
+ }
+ }
+
+ return best_rd -
+ RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ const int wedge_sign,
+ int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ assert(N >= 64);
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ rate += x->wedge_idx_cost[bsize][wedge_index];
+ rd = RDCOST(x->rdmult, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ best_rd = rd;
+ }
+ }
+ return best_rd -
+ RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+static int64_t pick_interinter_wedge(
+ const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0, const uint8_t *const p1,
+ const int16_t *const residual1, const int16_t *const diff10) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+
+ int64_t rd;
+ int wedge_index = -1;
+ int wedge_sign = 0;
+
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ assert(cpi->common.seq_params.enable_masked_compound);
+
+ if (cpi->sf.fast_wedge_sign_estimate) {
+ wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
+ rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign,
+ &wedge_index);
+ } else {
+ rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign,
+ &wedge_index);
+ }
+
+ mbmi->interinter_comp.wedge_sign = wedge_sign;
+ mbmi->interinter_comp.wedge_index = wedge_index;
+ return rd;
+}
+
+static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1,
+ const int16_t *const residual1,
+ const int16_t *const diff10) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = 1 << num_pels_log2_lookup[bsize];
+ int rate;
+ int64_t dist;
+ DIFFWTD_MASK_TYPE cur_mask_type;
+ int64_t best_rd = INT64_MAX;
+ DIFFWTD_MASK_TYPE best_mask_type = 0;
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+ DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
+ // try each mask type and its inverse
+ for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) {
+ // build mask and inverse
+ if (hbd)
+ av1_build_compound_diffwtd_mask_highbd(
+ tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw,
+ CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd);
+ else
+ av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type,
+ p0, bw, p1, bw, bh, bw);
+
+ // compute rd for mask
+ uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10,
+ tmp_mask[cur_mask_type], N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ const int64_t rd0 = RDCOST(x->rdmult, rate, dist);
+
+ if (rd0 < best_rd) {
+ best_mask_type = cur_mask_type;
+ best_rd = rd0;
+ }
+ }
+ mbmi->interinter_comp.mask_type = best_mask_type;
+ if (best_mask_type == DIFFWTD_38_INV) {
+ memcpy(xd->seg_mask, seg_mask, N * 2);
+ }
+ return best_rd;
+}
+
+static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(is_interintra_wedge_used(bsize));
+ assert(cpi->common.seq_params.enable_interintra_compound);
+
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1
+ DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0
+ if (get_bitdepth_data_path_index(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw);
+ }
+ int wedge_index = -1;
+ int64_t rd =
+ pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, &wedge_index);
+
+ mbmi->interintra_wedge_sign = 0;
+ mbmi->interintra_wedge_index = wedge_index;
+ return rd;
+}
+
+static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1,
+ const int16_t *const residual1,
+ const int16_t *const diff10) {
+ const COMPOUND_TYPE compound_type = x->e_mbd.mi[0]->interinter_comp.type;
+ switch (compound_type) {
+ case COMPOUND_WEDGE:
+ return pick_interinter_wedge(cpi, x, bsize, p0, p1, residual1, diff10);
+ case COMPOUND_DIFFWTD:
+ return pick_interinter_seg(cpi, x, bsize, p0, p1, residual1, diff10);
+ default: assert(0); return 0;
+ }
+}
+
+static int interinter_compound_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize,
+ const PREDICTION_MODE this_mode,
+ int mi_row, int mi_col) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int_mv tmp_mv[2];
+ int tmp_rate_mv = 0;
+ mbmi->interinter_comp.seg_mask = xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
+
+ if (this_mode == NEW_NEWMV) {
+ do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+ mi_row, mi_col, tmp_mv, &tmp_rate_mv, 2);
+ mbmi->mv[0].as_int = tmp_mv[0].as_int;
+ mbmi->mv[1].as_int = tmp_mv[1].as_int;
+ } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+ do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+ mi_row, mi_col, tmp_mv, &tmp_rate_mv, 0);
+ mbmi->mv[0].as_int = tmp_mv[0].as_int;
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize,
+ mi_row, mi_col, tmp_mv, &tmp_rate_mv, 1);
+ mbmi->mv[1].as_int = tmp_mv[1].as_int;
+ }
+ return tmp_rate_mv;
+}
+
+static void get_inter_predictors_masked_compound(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+ int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1,
+ int16_t *residual1, int16_t *diff10, int *strides) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ int can_use_previous = cm->allow_warped_motion;
+ // get inter predictors to use for masked compound modes
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides, can_use_previous);
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
+ const struct buf_2d *const src = &x->plane[0].src;
+ if (get_bitdepth_data_path_index(xd)) {
+ aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
+ bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1,
+ bw);
+ aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw);
+ }
+}
+
+static int64_t build_and_cost_compound_type(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+ int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+ uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+ int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd,
+ int *calc_pred_masked_compound) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int rate_sum;
+ int64_t dist_sum;
+ int64_t best_rd_cur = INT64_MAX;
+ int64_t rd = INT64_MAX;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+
+ if (*calc_pred_masked_compound) {
+ get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0,
+ preds1, residual1, diff10, strides);
+ *calc_pred_masked_compound = 0;
+ }
+
+ best_rd_cur =
+ pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10);
+ *rs2 += get_interinter_compound_mask_rate(x, mbmi);
+ best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+
+ // Although the true rate_mv might differ after the motion search, it is
+ // unlikely that this would become the best mode once the transform rd cost
+ // and other mode overhead costs are taken into account.
+ int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+ if (mode_rd > ref_best_rd) return INT64_MAX;
+
+ if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) {
+ *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
+ this_mode, mi_row, mi_col);
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+ if (rd >= best_rd_cur) {
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+ preds1, strides);
+ }
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+ best_rd_cur = rd;
+
+ } else {
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+ preds1, strides);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, *rs2 + rate_mv + rate_sum, dist_sum);
+ best_rd_cur = rd;
+ }
+ return best_rd_cur;
+}
+
+typedef struct {
+ // OBMC secondary prediction buffers and respective strides
+ uint8_t *above_pred_buf[MAX_MB_PLANE];
+ int above_pred_stride[MAX_MB_PLANE];
+ uint8_t *left_pred_buf[MAX_MB_PLANE];
+ int left_pred_stride[MAX_MB_PLANE];
+ int_mv (*single_newmv)[REF_FRAMES];
+ // Pointer to array of motion vectors to use for each ref and their rates
+ // Should point to first of 2 arrays in 2D array
+ int (*single_newmv_rate)[REF_FRAMES];
+ int (*single_newmv_valid)[REF_FRAMES];
+ // Pointer to array of predicted rate-distortion
+ // Should point to first of 2 arrays in 2D array
+ int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
+ InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES];
+ int ref_frame_cost;
+ int single_comp_cost;
+ int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
+ int skip_motion_mode;
+ INTERINTRA_MODE *inter_intra_mode;
+} HandleInterModeArgs;
+
+/* If the current mode shares the same mv as another mode but has a higher
+ * cost, skip this mode. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+ const MACROBLOCK *const x,
+ PREDICTION_MODE this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2],
+ InterModeSearchState *search_state) {
+ const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ PREDICTION_MODE compare_mode = MB_MODE_COUNT;
+ if (!is_comp_pred) {
+ if (this_mode == NEARMV) {
+ if (ref_mv_count == 0) {
+ // NEARMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // NEARMV has the same motion vector as GLOBALMV
+ compare_mode = GLOBALMV;
+ }
+ }
+ if (this_mode == GLOBALMV) {
+ if (ref_mv_count == 0 &&
+ cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+ // GLOBALMV has the same motion vector as NEARESTMV
+ compare_mode = NEARESTMV;
+ }
+ if (ref_mv_count == 1) {
+ // GLOBALMV has the same motion vector as NEARMV
+ compare_mode = NEARMV;
+ }
+ }
+
+ if (compare_mode != MB_MODE_COUNT) {
+ // Use modelled_rd to check whether compare mode was searched
+ if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] !=
+ INT64_MAX) {
+ const int16_t mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
+ const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx);
+ const int this_cost = cost_mv_ref(x, this_mode, mode_ctx);
+
+ // Only skip if the mode cost is larger than compare mode cost
+ if (this_cost > compare_cost) {
+ search_state->modelled_rd[this_mode][0][ref_frames[0]] =
+ search_state->modelled_rd[compare_mode][0][ref_frames[0]];
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
+ const AV1_COMMON *cm,
+ const MACROBLOCK *x) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ *out_mv = in_mv;
+ lower_mv_precision(&out_mv->as_mv, cm->allow_high_precision_mv,
+ cm->cur_frame_force_integer_mv);
+ clamp_mv2(&out_mv->as_mv, xd);
+ return !mv_check_bounds(&x->mv_limits, &out_mv->as_mv);
+}
+
+static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, int_mv *cur_mv,
+ const int mi_row, const int mi_col,
+ int *const rate_mv,
+ HandleInterModeArgs *const args) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int refs[2] = { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ int i;
+
+ (void)args;
+
+ if (is_comp_pred) {
+ if (this_mode == NEW_NEWMV) {
+ cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+ cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+
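+ // At or above the joint-search threshold, refine both mvs jointly;
+ // otherwise keep the single-mode mvs and only add their rate cost.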
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL,
+ 0, rate_mv, 0);
+ } else {
+ *rate_mv = 0;
+ for (i = 0; i < 2; ++i) {
+ const int_mv ref_mv = av1_get_ref_mv(x, i);
+ av1_set_mvcost(x, i, mbmi->ref_mv_idx);
+ *rate_mv +=
+ av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+ }
+ }
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ compound_single_motion_search_interinter(
+ cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1);
+ } else {
+ av1_set_mvcost(x, 1,
+ mbmi->ref_mv_idx + (this_mode == NEAR_NEWMV ? 1 : 0));
+ const int_mv ref_mv = av1_get_ref_mv(x, 1);
+ *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+ } else {
+ assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
+ cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ compound_single_motion_search_interinter(
+ cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0);
+ } else {
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ av1_set_mvcost(x, 0,
+ mbmi->ref_mv_idx + (this_mode == NEW_NEARMV ? 1 : 0));
+ *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+ }
+ } else {
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
+ if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+ args->single_newmv[ref_mv_idx][refs[0]] = x->best_mv;
+ args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+ args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
+
+ cur_mv[0].as_int = x->best_mv.as_int;
+
+#if USE_DISCOUNT_NEWMV_TEST
+ // Estimate the rate implications of a new mv but discount this
+ // under certain circumstances where we want to help initiate a weak
+ // motion field, where the distortion gain for a single block may not
+ // be enough to overcome the cost of a new mv.
+ if (discount_newmv_test(cpi, x, this_mode, x->best_mv)) {
+ *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
+ }
+#endif
+ }
+
+ return 0;
+}
+
+static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2],
+ int num_planes) {
+ const BUFFER_SET *buf0 = dst_bufs[0];
+ dst_bufs[0] = dst_bufs[1];
+ dst_bufs[1] = buf0;
+ restore_dst_buf(xd, *dst_bufs[0], num_planes);
+}
+
+static INLINE int get_switchable_rate(MACROBLOCK *const x,
+ const InterpFilters filters,
+ const int ctx[2]) {
+ int inter_filter_cost;
+ const InterpFilter filter0 = av1_extract_interp_filter(filters, 0);
+ const InterpFilter filter1 = av1_extract_interp_filter(filters, 1);
+ inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0];
+ inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1];
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+}
+
+// Calculate the rd cost of the given interpolation filter
+static INLINE int64_t interpolation_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+ int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx,
+ const int switchable_ctx[2], const int skip_pred, int *rate,
+ int64_t *dist) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
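+ // Index 0 holds the luma-only estimates; index 1 accumulates the estimates
+ // over all planes.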
+ int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 };
+ int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 };
+
+ const InterpFilters last_best = mbmi->interp_filters;
+ mbmi->interp_filters = filter_sets[filter_idx];
+ const int tmp_rs =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+
+ assert(skip_pred != 2);
+ assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags));
+ assert(rate[0] >= 0);
+ assert(dist[0] >= 0);
+ assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1));
+ assert(skip_sse_sb[0] >= 0);
+ assert(rate[1] >= 0);
+ assert(dist[1] >= 0);
+ assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1));
+ assert(skip_sse_sb[1] >= 0);
+
+ if (skip_pred != cpi->default_interp_skip_flags) {
+ if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+ &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL);
+ tmp_rate[1] = tmp_rate[0];
+ tmp_dist[1] = tmp_dist[0];
+ } else {
+ // only luma MC is skipped
+ tmp_rate[1] = rate[0];
+ tmp_dist[1] = dist[0];
+ }
+ if (num_planes > 1) {
+ for (int plane = 1; plane < num_planes; ++plane) {
+ int tmp_rate_uv, tmp_skip_sb_uv;
+ int64_t tmp_dist_uv, tmp_skip_sse_uv;
+ int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+ if (tmp_rd >= *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane);
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv,
+ &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL);
+ tmp_rate[1] =
+ (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX);
+ tmp_dist[1] += tmp_dist_uv;
+ tmp_skip_sb[1] &= tmp_skip_sb_uv;
+ tmp_skip_sse[1] += tmp_skip_sse_uv;
+ }
+ }
+ } else {
+ // both luma and chroma MC are skipped
+ tmp_rate[1] = rate[1];
+ tmp_dist[1] = dist[1];
+ }
+ int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+
+ if (tmp_rd < *rd) {
+ *rd = tmp_rd;
+ *switchable_rate = tmp_rs;
+ if (skip_pred != cpi->default_interp_skip_flags) {
+ if (skip_pred == 0) {
+ // Overwrite the data as current filter is the best one
+ tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1];
+ tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1];
+ memcpy(rate, tmp_rate, sizeof(*rate) * 2);
+ memcpy(dist, tmp_dist, sizeof(*dist) * 2);
+ memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2);
+ memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2);
+ // As luma MC data is computed, no need to recompute after the search
+ x->recalc_luma_mc_data = 0;
+ } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+ // As luma MC data is not computed, update of luma data can be skipped
+ rate[1] = tmp_rate[1];
+ dist[1] = tmp_dist[1];
+ skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1];
+ skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1];
+ // As luma MC data is not recomputed and current filter is the best,
+ // indicate the possibility of recomputing MC data
+ // If current buffer contains valid MC data, toggle to indicate that
+ // luma MC data needs to be recomputed
+ x->recalc_luma_mc_data ^= 1;
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ return 1;
+ }
+ mbmi->interp_filters = last_best;
+ return 0;
+}
+
+// Find the best rd filter in horizontal direction
+static INLINE int find_best_horiz_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+ int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+ const int switchable_ctx[2], const int skip_hor, int *rate, int64_t *dist,
+ int best_dual_mode) {
+ int i;
+ const int bw = block_size_wide[bsize];
+ assert(best_dual_mode == 0);
+ if ((bw <= 4) && (skip_hor != cpi->default_interp_skip_flags)) {
+ int skip_pred = cpi->default_interp_skip_flags;
+ // Process the filters in reverse order to enable reusing rate and
+ // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
+ for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+ if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb,
+ dst_bufs, i, switchable_ctx, skip_pred, rate,
+ dist)) {
+ best_dual_mode = i;
+ }
+ skip_pred = skip_hor;
+ }
+ } else {
+ for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+ if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb,
+ dst_bufs, i, switchable_ctx, skip_hor, rate,
+ dist)) {
+ best_dual_mode = i;
+ }
+ }
+ }
+ return best_dual_mode;
+}
+
+// Find the best rd filter in vertical direction
+static INLINE void find_best_vert_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+ int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+ const int switchable_ctx[2], const int skip_ver, int *rate, int64_t *dist,
+ int best_dual_mode, int filter_set_size) {
+ int i;
+ const int bh = block_size_high[bsize];
+ if ((bh <= 4) && (skip_ver != cpi->default_interp_skip_flags)) {
+ int skip_pred = cpi->default_interp_skip_flags;
+ // Process the filters in reverse order to enable reusing rate and
+ // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP
+ assert(filter_set_size == DUAL_FILTER_SET_SIZE);
+ for (i = (filter_set_size - SWITCHABLE_FILTERS + best_dual_mode);
+ i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+ interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb,
+ dst_bufs, i, switchable_ctx, skip_pred, rate,
+ dist);
+ skip_pred = skip_ver;
+ }
+ } else {
+ for (i = best_dual_mode + SWITCHABLE_FILTERS; i < filter_set_size;
+ i += SWITCHABLE_FILTERS) {
+ interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb,
+ dst_bufs, i, switchable_ctx, skip_ver, rate,
+ dist);
+ }
+ }
+}
+
+// Check whether a saved result matches this search
+static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
+ MB_MODE_INFO *const mi) {
+ for (int i = 0; i < 2; ++i) {
+ if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+ (st->mv[i].as_int != mi->mv[i].as_int)) {
+ return 0;
+ }
+ }
+ if (has_second_ref(mi) && st->comp_type != mi->interinter_comp.type) return 0;
+ return 1;
+}
+
+static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
+ MB_MODE_INFO *const mbmi) {
+ const int comp_idx = mbmi->compound_idx;
+ const int offset = x->interp_filter_stats_idx[comp_idx];
+ for (int j = 0; j < offset; ++j) {
+ const INTERPOLATION_FILTER_STATS *st = &x->interp_filter_stats[comp_idx][j];
+ if (is_interp_filter_match(st, mbmi)) {
+ mbmi->interp_filters = st->filters;
+ return j;
+ }
+ }
+ return -1; // no match result found
+}
+
+static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
+ MB_MODE_INFO *const mbmi) {
+ const int comp_idx = mbmi->compound_idx;
+ const int offset = x->interp_filter_stats_idx[comp_idx];
+ if (offset < MAX_INTERP_FILTER_STATS) {
+ INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters,
+ { mbmi->mv[0], mbmi->mv[1] },
+ { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] },
+ mbmi->interinter_comp.type };
+ x->interp_filter_stats[comp_idx][offset] = stat;
+ x->interp_filter_stats_idx[comp_idx]++;
+ }
+}
+
+static int64_t interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
+ BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES],
+ int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const int skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int need_search =
+ av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+ int i;
+ // Index 0 corresponds to luma rd data and index 1 corresponds to cumulative
+ // data of all planes
+ int tmp_rate[2] = { 0, 0 };
+ int64_t tmp_dist[2] = { 0, 0 };
+ int best_skip_txfm_sb[2] = { 1, 1 };
+ int64_t best_skip_sse_sb[2] = { 0, 0 };
+ const int ref_frame = xd->mi[0]->ref_frame[0];
+
+ (void)single_filter;
+ int match_found = -1;
+ const InterpFilter assign_filter = cm->interp_filter;
+ if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+ match_found = find_interp_filter_in_stats(x, mbmi);
+ }
+ if (!need_search || match_found == -1) {
+ set_default_interp_filters(mbmi, assign_filter);
+ }
+ int switchable_ctx[2];
+ switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+ switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+ *switchable_rate =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+ if (!skip_build_pred)
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+ &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL);
+ if (num_planes > 1)
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1],
+ &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL,
+ NULL);
+ tmp_rate[1] =
+ (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX);
+ assert(tmp_rate[1] >= 0);
+ tmp_dist[1] = tmp_dist[0] + tmp_dist[1];
+ best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1];
+ best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1];
+ *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]);
+ *skip_txfm_sb = best_skip_txfm_sb[1];
+ *skip_sse_sb = best_skip_sse_sb[1];
+ x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
+
+ if (assign_filter != SWITCHABLE || match_found != -1) {
+ return 0;
+ }
+ if (!need_search) {
+ assert(mbmi->interp_filters ==
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
+ return 0;
+ }
+ if (args->modelled_rd != NULL) {
+ if (has_second_ref(mbmi)) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ const int mode0 = compound_ref0_mode(mbmi->mode);
+ const int mode1 = compound_ref1_mode(mbmi->mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ }
+
+ x->recalc_luma_mc_data = 0;
+ // skip_flag is a two-bit value: bit 0 corresponds to skipping luma MC and
+ // bit 1 corresponds to skipping chroma MC.
+ // skip_flag=0 corresponds to "Don't skip luma or chroma MC"
+ // skip_flag=1 corresponds to "Skip luma MC only"
+ // skip_flag=2 is not a valid case
+ // skip_flag=3 corresponds to "Skip both luma and chroma MC"
+ int skip_hor = cpi->default_interp_skip_flags;
+ int skip_ver = cpi->default_interp_skip_flags;
+ const int is_compound = has_second_ref(mbmi);
+ assert(is_intrabc_block(mbmi) == 0);
+ for (int j = 0; j < 1 + is_compound; ++j) {
+ const RefBuffer *ref_buf = &cm->frame_refs[mbmi->ref_frame[j] - LAST_FRAME];
+ const struct scale_factors *const sf = &ref_buf->sf;
+ // TODO(any): Refine skip flag calculation considering scaling
+ if (av1_is_scaled(sf)) {
+ skip_hor = 0;
+ skip_ver = 0;
+ break;
+ }
+ const MV mv = mbmi->mv[j].as_mv;
+ int skip_hor_plane = 0;
+ int skip_ver_plane = 0;
+ for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) {
+ struct macroblockd_plane *const pd = &xd->plane[k];
+ const int bw = pd->width;
+ const int bh = pd->height;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
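+ // A zero sub-pel offset in a direction means no interpolation is needed
+ // there, so MC in that direction can potentially be skipped for this plane.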
+ skip_hor_plane |= ((sub_x == 0) << k);
+ skip_ver_plane |= ((sub_y == 0) << k);
+ }
+ skip_hor = skip_hor & skip_hor_plane;
+ skip_ver = skip_ver & skip_ver_plane;
+ // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+ assert(skip_hor != 2);
+ assert(skip_ver != 2);
+ }
+ // When the compound prediction type is a compound segment (DIFFWTD) mask,
+ // luma MC and chroma MC need to go hand in hand, as the mask generated
+ // during luma MC is required for chroma MC. If skip_hor = 0 and
+ // skip_ver = 1, the mask used for chroma MC during the vertical filter
+ // decision may be incorrect because a temporary MC evaluation overwrites it.
+ // Set skip_ver to 0 in this case so that the mask is populated during luma
+ // MC.
+ if (is_compound && mbmi->compound_idx == 1 &&
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+ assert(mbmi->comp_group_idx == 1);
+ if (skip_hor == 0 && skip_ver == 1) skip_ver = 0;
+ }
+ // do interp_filter search
+ const int filter_set_size = DUAL_FILTER_SET_SIZE;
+ restore_dst_buf(xd, *tmp_dst, num_planes);
+ const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+ if (cpi->sf.use_fast_interpolation_filter_search &&
+ cm->seq_params.enable_dual_filter) {
+ // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR
+ int best_dual_mode = 0;
+ // Find best of {R}x{R,Sm,Sh}
+ // EIGHTTAP_REGULAR mode is calculated beforehand
+ best_dual_mode = find_best_horiz_interp_filter_rd(
+ x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+ best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
+ tmp_rate, tmp_dist, best_dual_mode);
+
+ // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
+ find_best_vert_interp_filter_rd(
+ x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+ best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
+ tmp_rate, tmp_dist, best_dual_mode, filter_set_size);
+ } else {
+ // EIGHTTAP_REGULAR mode is calculated beforehand
+ for (i = 1; i < filter_set_size; ++i) {
+ if (cm->seq_params.enable_dual_filter == 0) {
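+ // filter_sets[i] packs the y filter in the low 16 bits and the x filter in
+ // the high 16 bits; without dual filters, only entries where the two match
+ // are searched.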
+ const int16_t filter_y = filter_sets[i] & 0xffff;
+ const int16_t filter_x = filter_sets[i] >> 16;
+ if (filter_x != filter_y) continue;
+ }
+ interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, best_skip_txfm_sb,
+ best_skip_sse_sb, dst_bufs, i, switchable_ctx, 0,
+ tmp_rate, tmp_dist);
+ assert(x->recalc_luma_mc_data == 0);
+ }
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ // Recompute final MC data if required
+ if (x->recalc_luma_mc_data == 1) {
+ // Recomputing the final luma MC data is required only if it was skipped in
+ // either direction. The condition below is necessary, but not sufficient.
+ assert((skip_hor == 1) || (skip_ver == 1));
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ *skip_txfm_sb = best_skip_txfm_sb[1];
+ *skip_sse_sb = best_skip_sse_sb[1];
+ x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
+
+ // save search results
+ if (cpi->sf.skip_repeat_interpolation_filter_search) {
+ assert(match_found == -1);
+ save_interp_filter_search_stat(x, mbmi);
+ }
+ return 0;
+}
+
+static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, RD_STATS *rd_stats,
+ RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int mode_rate, int64_t ref_best_rd) {
+ /*
+ * This function combines the y and uv planes' transform search processes,
+ * once the prediction has been generated. It first performs subtraction to
+ * obtain the prediction error. Then it calls
+ * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and
+ * handles the early terminations that happen in those functions. At the
+ * end, it computes rd_stats/_y/_uv accordingly.
+ */
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int skip_txfm_sb = 0;
+ const int num_planes = av1_num_planes(cm);
+ const int ref_frame_1 = mbmi->ref_frame[1];
+ const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+ const int64_t rd_thresh =
+ ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int64_t min_header_rate =
+ mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+ // Account for minimum skip and non_skip rd.
+ // Eventually either one of them will be added to mode_rate
+ const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+
+ if (min_header_rd_possible > ref_best_rd) {
+ av1_invalid_rd_stats(rd_stats_y);
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_y);
+ av1_init_rd_stats(rd_stats_uv);
+ rd_stats->rate = mode_rate;
+
+ if (!cpi->common.all_lossless)
+ check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
+ if (!skip_txfm_sb) {
+ int64_t non_skip_rdcosty = INT64_MAX;
+ int64_t skip_rdcosty = INT64_MAX;
+ int64_t min_rdcosty = INT64_MAX;
+ int is_cost_valid_uv = 0;
+
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ // Motion mode
+ select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+ PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 2
+ } else {
+ super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y->skip);
+ }
+
+ if (rd_stats_y->rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ // TODO(angiebird): check if we need this
+ // restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ non_skip_rdcosty = RDCOST(
+ x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist);
+ skip_rdcosty =
+ RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse);
+ min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+
+ if (min_rdcosty > ref_best_rd) {
+ int64_t tokenonly_rdy =
+ AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
+ RDCOST(x->rdmult, 0, rd_stats_y->sse));
+ // Invalidate rd_stats_y to skip the rest of the motion modes search
+ if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) >
+ rd_thresh)
+ av1_invalid_rd_stats(rd_stats_y);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+
+ if (num_planes > 1) {
+ /* clang-format off */
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, rd_stats_uv, bsize,
+ ref_best_rd - non_skip_rdcosty,
+ ref_best_rd - skip_rdcosty, FTXS_NONE);
+ if (!is_cost_valid_uv) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ /* clang-format on */
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+ } else {
+ av1_init_rd_stats(rd_stats_uv);
+ }
+ if (rd_stats->skip) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ mbmi->skip = 0;
+ // Here mbmi->skip temporarily plays the role of this_skip2.
+
+ int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ } else if (!xd->lossless[mbmi->segment_id] &&
+ (RDCOST(x->rdmult,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][0],
+ rd_stats->dist) >=
+ RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ mbmi->skip = 1;
+ } else {
+ rd_stats->rate += x->skip_cost[skip_ctx][0];
+ mbmi->skip = 0;
+ }
+ } else {
+ x->skip = 1;
+ mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+ // The cost of skip bit needs to be added.
+ mbmi->skip = 0;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->skip = 1;
+ int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int handle_inter_intra_mode(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args,
+ int64_t ref_best_rd, int *rate_mv,
+ int *tmp_rate2, BUFFER_SET *orig_dst) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+ int64_t rd, best_interintra_rd = INT64_MAX;
+ int rmode, rate_sum;
+ int64_t dist_sum;
+ int tmp_rate_mv = 0;
+ int tmp_skip_txfm_sb;
+ int bw = block_size_wide[bsize];
+ int64_t tmp_skip_sse_sb;
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
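+ // The buffers are sized 2x so they can also hold 16-bit samples in high
+ // bit-depth mode; get_buf_by_bd() returns a pointer of the appropriate type.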
+ uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ const int *const interintra_mode_cost =
+ x->interintra_mode_cost[size_group_lookup[bsize]];
+ const int_mv mv0 = mbmi->mv[0];
+ const int is_wedge_used = is_interintra_wedge_used(bsize);
+ int rwedge = is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ xd->plane[0].dst.buf = tmp_buf;
+ xd->plane[0].dst.stride = bw;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]];
+ int j = 0;
+ if (cpi->sf.reuse_inter_intra_mode == 0 ||
+ best_interintra_mode == INTERINTRA_MODES) {
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+ }
+ if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
+ mbmi->interintra_mode = best_interintra_mode;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum);
+ best_interintra_rd = rd;
+ if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
+ return -1;
+ }
+ if (is_wedge_used) {
+ int64_t best_interintra_rd_nowedge = rd;
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int_mv tmp_mv;
+ // Disable wedge search if source variance is small
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+ mbmi->use_wedge_interintra = 1;
+
+ rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+ x->wedge_interintra_cost[bsize][1];
+
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+ best_interintra_rd_wedge +=
+ RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0);
+ rd = INT64_MAX;
+ // Refine motion vector.
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ // Get the inverted mask
+ const uint8_t *mask = av1_get_contiguous_soft_mask(
+ mbmi->interintra_wedge_index, 1, bsize);
+ tmp_mv = mbmi->mv[0];
+ compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
+ mi_col, intrapred, mask, bw, &tmp_rate_mv,
+ 0);
+ if (mbmi->mv[0].as_int != tmp_mv.as_int) {
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
+ bsize);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
+ dist_sum);
+ }
+ }
+ if (rd >= best_interintra_rd_wedge) {
+ tmp_mv.as_int = mv0.as_int;
+ tmp_rate_mv = *rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Evaluate closer to true rd
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
+ dist_sum);
+ best_interintra_rd_wedge = rd;
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->use_wedge_interintra = 1;
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ *tmp_rate2 += tmp_rate_mv - *rate_mv;
+ *rate_mv = tmp_rate_mv;
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ mbmi->mv[0].as_int = mv0.as_int;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ }
+ } // if (is_interintra_wedge_used(bsize))
+ if (num_planes > 1) {
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ return 0;
+}
+
+// TODO(afergs): Refactor the MBMI references in here - there are four
+// TODO(afergs): Refactor optional args - add them to a struct or remove
+static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int *disable_skip, int mi_row, int mi_col,
+ HandleInterModeArgs *const args,
+ int64_t ref_best_rd, const int *refs,
+ int *rate_mv, BUFFER_SET *orig_dst
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ ,
+ TileDataEnc *tile_data, int64_t *best_est_rd,
+ int do_tx_search, InterModesInfo *inter_modes_info
+#endif
+) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int rate2_nocoeff = rd_stats->rate;
+ int best_xskip, best_disable_skip = 0;
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ MB_MODE_INFO base_mbmi, best_mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int rate_mv0 = *rate_mv;
+
+ int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+ is_interintra_allowed(mbmi) && mbmi->compound_idx;
+ int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
+
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+ av1_invalid_rd_stats(&best_rd_stats);
+ aom_clear_system_state();
+ mbmi->num_proj_ref = 1; // assume num_proj_ref >=1
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+ if (cm->switchable_motion_mode) {
+ last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->allow_warped_motion);
+ }
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
+ }
+ int total_samples = mbmi->num_proj_ref;
+ if (total_samples == 0) {
+ last_motion_mode_allowed = OBMC_CAUSAL;
+ }
+ base_mbmi = *mbmi;
+
+ const int switchable_rate =
+ av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0;
+ int64_t best_rd = INT64_MAX;
+ int best_rate_mv = rate_mv0;
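+ // Mode indices 0..last_motion_mode_allowed are the motion modes; when
+ // inter-intra is allowed, one extra index past the end evaluates the
+ // inter-intra candidate.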
+ for (int mode_index = (int)SIMPLE_TRANSLATION;
+ mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
+ mode_index++) {
+ if (args->skip_motion_mode && mode_index) continue;
+ int64_t tmp_rd = INT64_MAX;
+ int tmp_rate2 = rate2_nocoeff;
+ int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
+ int skip_txfm_sb = 0;
+ int tmp_rate_mv = rate_mv0;
+
+ *mbmi = base_mbmi;
+ if (is_interintra_mode) {
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ } else {
+ mbmi->motion_mode = (MOTION_MODE)mode_index;
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ }
+
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) {
+ // SIMPLE_TRANSLATION mode: no need to recalculate.
+ // The prediction is calculated before motion_mode_rd() is called in
+ // handle_inter_mode()
+ } else if (mbmi->motion_mode == OBMC_CAUSAL) {
+ uint32_t cur_mv = mbmi->mv[0].as_int;
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
+ mbmi->mv[0].as_int = x->best_mv.as_int;
+#if USE_DISCOUNT_NEWMV_TEST
+ if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+ tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ }
+#endif
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ }
+ if (mbmi->mv[0].as_int != cur_mv) {
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ av1_build_obmc_inter_prediction(
+ cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
+ args->left_pred_buf, args->left_pred_stride);
+ } else if (mbmi->motion_mode == WARPED_CAUSAL) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->motion_mode = WARPED_CAUSAL;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mbmi->interp_filters = av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter));
+
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
+ }
+
+ if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params, mi_row, mi_col)) {
+ // Refine MV for NEWMV mode
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ const int_mv mv0 = mbmi->mv[0];
+ const WarpedMotionParams wm_params0 = mbmi->wm_params;
+ int num_proj_ref0 = mbmi->num_proj_ref;
+
+ // Refine MV in a small range.
+ av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
+ total_samples);
+
+ // Keep the refined MV and WM parameters.
+ if (mv0.as_int != mbmi->mv[0].as_int) {
+ const int ref = refs[0];
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ tmp_rate_mv =
+ av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+ if (cpi->sf.adaptive_motion_search)
+ x->pred_mv[ref] = mbmi->mv[0].as_mv;
+
+#if USE_DISCOUNT_NEWMV_TEST
+ if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+ tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ }
+#endif
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ } else {
+ // Restore the old MV and WM parameters.
+ mbmi->mv[0] = mv0;
+ mbmi->wm_params = wm_params0;
+ mbmi->num_proj_ref = num_proj_ref0;
+ }
+ }
+
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ } else {
+ continue;
+ }
+ } else if (is_interintra_mode) {
+ const int ret = handle_inter_intra_mode(
+ cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv,
+ &tmp_rate2, orig_dst);
+ if (ret < 0) continue;
+ }
+
+ if (!cpi->common.all_lossless)
+ check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
+
+ x->skip = 0;
+
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip = 1;
+ rd_stats->rate = tmp_rate2;
+ if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
+ if (interintra_allowed) {
+ rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
+ [mbmi->ref_frame[1] == INTRA_FRAME];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+ rd_stats->rate += x->interintra_mode_cost[size_group_lookup[bsize]]
+ [mbmi->interintra_mode];
+ if (is_interintra_wedge_used(bsize)) {
+ rd_stats->rate +=
+ x->wedge_interintra_cost[bsize][mbmi->use_wedge_interintra];
+ if (mbmi->use_wedge_interintra) {
+ rd_stats->rate +=
+ av1_cost_literal(get_interintra_wedge_bits(bsize));
+ }
+ }
+ }
+ }
+ if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
+ (mbmi->ref_frame[1] != INTRA_FRAME)) {
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode];
+ } else {
+ rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
+ }
+ }
+
+ if (!skip_txfm_sb) {
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ int64_t est_rd = 0;
+ int est_skip = 0;
+ if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+ cm->tile_rows == 1) {
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type];
+ if (md->ready) {
+ const int64_t curr_sse = get_sse(cpi, x);
+ est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse,
+ rd_stats->rate);
+ est_skip = est_rd * 0.8 > *best_est_rd;
+ if (est_skip) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ continue;
+ } else {
+ if (est_rd < *best_est_rd) {
+ *best_est_rd = est_rd;
+ }
+ }
+ }
+ }
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ }
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (!do_tx_search) {
+ const int64_t curr_sse = get_sse(cpi, x);
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+ &est_residue_cost, &est_dist);
+ (void)has_est_rd;
+ assert(has_est_rd);
+ const int mode_rate = rd_stats->rate;
+ rd_stats->rate += est_residue_cost;
+ rd_stats->dist = est_dist;
+ rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (cm->reference_mode == SINGLE_REFERENCE) {
+ if (!is_comp_pred) {
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, mbmi);
+ }
+ } else {
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, mbmi);
+ }
+ } else {
+#endif
+ int mode_rate = rd_stats->rate;
+ if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y,
+ rd_stats_uv, mode_rate, ref_best_rd)) {
+ if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
+ return INT64_MAX;
+ }
+ continue;
+ }
+ if (!skip_txfm_sb) {
+ const int64_t curr_rd =
+ RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (curr_rd < ref_best_rd) {
+ ref_best_rd = curr_rd;
+ }
+ *disable_skip = 0;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (cpi->sf.inter_mode_rd_model_estimation) {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
+ rd_stats->dist,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ } else {
+ *disable_skip = 1;
+ }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ }
+#endif
+
+ if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ mbmi->interp_filters = av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter));
+ }
+ }
+
+ tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (mode_index == 0)
+ args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
+ if ((mode_index == 0) || (tmp_rd < best_rd)) {
+ best_mbmi = *mbmi;
+ best_rd = tmp_rd;
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rate_mv = tmp_rate_mv;
+ if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ best_xskip = x->skip;
+ best_disable_skip = *disable_skip;
+ if (best_xskip) break;
+ }
+ }
+ mbmi->ref_frame[1] = ref_frame_1;
+ *rate_mv = best_rate_mv;
+ if (best_rd == INT64_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return INT64_MAX;
+ }
+ *mbmi = best_mbmi;
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ x->skip = best_xskip;
+ *disable_skip = best_disable_skip;
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 0;
+}
+
+static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, BUFFER_SET *const orig_dst) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+
+ int64_t total_sse = 0;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ av1_subtract_plane(x, bsize, plane);
+ int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
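+ // Scale the pixel-domain SSE up by 16, the distortion scale used elsewhere
+ // in the RD search.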
+ sse = sse << 4;
+ total_sse += sse;
+ }
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+ rd_stats->dist = rd_stats->sse = total_sse;
+ rd_stats->rate = x->skip_mode_cost[skip_mode_ctx][1];
+ rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 0;
+}
+
+static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode,
+ uint8_t ref_mv_idx) {
+ assert(is_inter_singleref_mode(single_mode));
+ int ref_mv_offset;
+ if (single_mode == NEARESTMV) {
+ ref_mv_offset = 0;
+ } else if (single_mode == NEARMV) {
+ ref_mv_offset = ref_mv_idx + 1;
+ } else {
+ ref_mv_offset = -1;
+ }
+ return ref_mv_offset;
+}
+
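+// Fetch the motion vector implied by this_mode for reference ref_idx: NEWMV is
+// left as INVALID_MV (to be filled by motion search), GLOBALMV takes the
+// frame's global motion vector, and NEAR/NEARESTMV read the reference MV stack
+// (falling back to the global MV when the stack holds too few candidates).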
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ const MB_MODE_INFO_EXT *mbmi_ext) {
+ const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int is_comp_pred = ref_frame[1] > INTRA_FRAME;
+ const PREDICTION_MODE single_mode =
+ get_single_mode(this_mode, ref_idx, is_comp_pred);
+ assert(is_inter_singleref_mode(single_mode));
+ if (single_mode == NEWMV) {
+ this_mv->as_int = INVALID_MV;
+ } else if (single_mode == GLOBALMV) {
+ *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+ } else {
+ assert(single_mode == NEARMV || single_mode == NEARESTMV);
+ const int ref_mv_offset = get_ref_mv_offset(single_mode, ref_mv_idx);
+ if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
+ assert(ref_mv_offset >= 0);
+ if (ref_idx == 0) {
+ *this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv;
+ } else {
+ *this_mv =
+ mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
+ }
+ } else {
+ *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+ }
+ }
+}
+
+// This function updates the non-NEWMV motion vectors for the current
+// prediction mode.
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
+ const AV1_COMMON *cm, const MACROBLOCK *x) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ int ret = 1;
+ for (int i = 0; i < is_comp_pred + 1; ++i) {
+ int_mv this_mv;
+ get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx, mbmi->ref_frame,
+ x->mbmi_ext);
+ const PREDICTION_MODE single_mode =
+ get_single_mode(this_mode, i, is_comp_pred);
+ if (single_mode == NEWMV) {
+ cur_mv[i] = this_mv;
+ } else {
+ ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x);
+ }
+ }
+ return ret;
+}
+
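+// Rate cost of signaling ref_mv_idx (the dynamic reference list index).
+// NEWMV-type modes code up to two bins against stack entries 0..1, NEAR-type
+// modes against entries 1..2; a bin is only coded when the reference MV stack
+// holds enough candidates.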
+static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ int (*drl_mode_cost0)[2],
+ int8_t ref_frame_type) {
+ int cost = 0;
+ if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+ for (int idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx];
+ if (mbmi->ref_mv_idx == idx) return cost;
+ }
+ }
+ return cost;
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ for (int idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)];
+ if (mbmi->ref_mv_idx == (idx - 1)) return cost;
+ }
+ }
+ return cost;
+ }
+ return cost;
+}
+
+// Struct for buffers used by compound_type_rd() function.
+// For sizes and alignment of these arrays, refer to
+// alloc_compound_type_rd_buffers() function.
+typedef struct {
+ uint8_t *pred0;
+ uint8_t *pred1;
+ int16_t *residual1; // src - pred1
+ int16_t *diff10; // pred1 - pred0
+ uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask
+} CompoundTypeRdBuffers;
+
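+// RD search over the inter-inter compound types allowed for this block (simple
+// average first, then the masked types when masked_compound_used). Stores the
+// best RD in *rd, updates mbmi and the MVs for the winning type, and returns
+// the rate cost of signaling that compound mode.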
+static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_col, int mi_row,
+ int_mv *cur_mv, int masked_compound_used,
+ BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
+ CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats,
+ int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int bw = block_size_wide[bsize];
+ int rate_sum, rs2;
+ int64_t dist_sum;
+
+ int_mv best_mv[2];
+ int best_tmp_rate_mv = *rate_mv;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ INTERINTER_COMPOUND_DATA best_compound_data;
+ best_compound_data.type = COMPOUND_AVERAGE;
+ uint8_t *preds0[1] = { buffers->pred0 };
+ uint8_t *preds1[1] = { buffers->pred1 };
+ int strides[1] = { bw };
+ int tmp_rate_mv;
+ const int num_pix = 1 << num_pels_log2_lookup[bsize];
+ const int mask_len = 2 * num_pix * sizeof(uint8_t);
+ COMPOUND_TYPE cur_type;
+ int best_compmode_interinter_cost = 0;
+ int calc_pred_masked_compound = 1;
+
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ *rd = INT64_MAX;
+ for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
+ if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
+ if (!is_interinter_compound_used(cur_type, bsize)) continue;
+ tmp_rate_mv = *rate_mv;
+ int64_t best_rd_cur = INT64_MAX;
+ mbmi->interinter_comp.type = cur_type;
+ int masked_type_cost = 0;
+
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ mbmi->compound_idx = 1;
+ if (cur_type == COMPOUND_AVERAGE) {
+ mbmi->comp_group_idx = 0;
+ if (masked_compound_used) {
+ masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
+ }
+ masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
+ rs2 = masked_type_cost;
+ const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd < ref_best_rd) {
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ int64_t est_rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (est_rd != INT64_MAX)
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ }
+ // Use the spare buffer for the following compound-type trials.
+ restore_dst_buf(xd, *tmp_dst, 1);
+ } else {
+ mbmi->comp_group_idx = 1;
+ masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
+ masked_type_cost += x->compound_type_cost[bsize][cur_type - 1];
+ rs2 = masked_type_cost;
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+ *rd / 3 < ref_best_rd) {
+ best_rd_cur = build_and_cost_compound_type(
+ cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+ &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+ strides, mi_row, mi_col, rd_stats->rate, ref_best_rd,
+ &calc_pred_masked_compound);
+ }
+ }
+ if (best_rd_cur < *rd) {
+ *rd = best_rd_cur;
+ best_compound_data = mbmi->interinter_comp;
+ if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {
+ memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
+ }
+ best_compmode_interinter_cost = rs2;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ if (cur_type == COMPOUND_WEDGE) {
+ best_tmp_rate_mv = tmp_rate_mv;
+ best_mv[0].as_int = mbmi->mv[0].as_int;
+ best_mv[1].as_int = mbmi->mv[1].as_int;
+ } else {
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+ }
+ // reset to original mvs for next iteration
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ if (mbmi->interinter_comp.type != best_compound_data.type) {
+ mbmi->comp_group_idx =
+ (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
+ mbmi->interinter_comp = best_compound_data;
+ memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
+ }
+ if (have_newmv_in_inter_mode(this_mode)) {
+ mbmi->mv[0].as_int = best_mv[0].as_int;
+ mbmi->mv[1].as_int = best_mv[1].as_int;
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+ *rate_mv = best_tmp_rate_mv;
+ }
+ }
+ restore_dst_buf(xd, *orig_dst, 1);
+ return best_compmode_interinter_cost;
+}
+
+static INLINE int is_single_newmv_valid(HandleInterModeArgs *args,
+ MB_MODE_INFO *mbmi,
+ PREDICTION_MODE this_mode) {
+ for (int ref_idx = 0; ref_idx < 2; ++ref_idx) {
+ const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx, 1);
+ const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx];
+ if (single_mode == NEWMV &&
+ args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
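+// Number of ref_mv_idx candidates to evaluate for this mode: modes with a
+// dynamic reference list (NEWMV/NEW_NEWMV with more than one stack entry, or
+// NEAR-type modes with more than two) search up to MAX_REF_MV_SERCH indices;
+// otherwise only a single index is tried.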
+static int get_drl_refmv_count(const MACROBLOCK *const x,
+ const MV_REFERENCE_FRAME *ref_frame,
+ PREDICTION_MODE mode) {
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+ const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0;
+ const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+ const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV);
+ const int has_drl =
+ (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1);
+ const int ref_set =
+ has_drl ? AOMMIN(MAX_REF_MV_SERCH, ref_mv_count - has_nearmv) : 1;
+
+ return ref_set;
+}
+
+typedef struct {
+ int64_t rd;
+ int drl_cost;
+ int rate_mv;
+ int_mv mv;
+} inter_mode_info;
+
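+// Evaluate one inter prediction mode / reference combination: loop over the
+// available ref_mv_idx candidates and, when joint compound search is enabled,
+// both compound_idx values, running the new-MV search, compound type and
+// interpolation filter searches, then motion_mode_rd() on the survivors.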
+static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int *disable_skip, int mi_row, int mi_col,
+ HandleInterModeArgs *args, int64_t ref_best_rd,
+ uint8_t *const tmp_buf,
+ CompoundTypeRdBuffers *rd_buffers
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ ,
+ TileDataEnc *tile_data, int64_t *best_est_rd,
+ const int do_tx_search,
+ InterModesInfo *inter_modes_info
+#endif
+) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ int i;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ int rate_mv = 0;
+ int64_t rd = INT64_MAX;
+
+ // Do the first prediction into the destination buffer and the next into a
+ // temporary buffer. Keep track of which one currently holds the best
+ // predictor and use the other for future predictions. At the end, copy from
+ // tmp_buf to dst if necessary.
+ struct macroblockd_plane *p = xd->plane;
+ BUFFER_SET orig_dst = {
+ { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+ { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+ };
+ const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+ tmp_buf + 2 * MAX_SB_SQUARE },
+ { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+
+ int skip_txfm_sb = 0;
+ int64_t skip_sse_sb = INT64_MAX;
+ int16_t mode_ctx;
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params.enable_masked_compound;
+ int64_t ret_val = INT64_MAX;
+ const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ int64_t best_rd = INT64_MAX;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int best_disable_skip;
+ int best_xskip;
+ int64_t newmv_ret_val = INT64_MAX;
+ int_mv backup_mv[2] = { { 0 } };
+ int backup_rate_mv = 0;
+ inter_mode_info mode_info[MAX_REF_MV_SERCH];
+
+ int comp_idx;
+ const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp &
+ (mbmi->mode != GLOBAL_GLOBALMV);
+
+ // TODO(jingning): This should be deprecated shortly.
+ const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
+ mode_info[ref_mv_idx].rd = INT64_MAX;
+
+ if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) {
+ if (mbmi->ref_frame[0] == LAST2_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == LAST2_FRAME ||
+ mbmi->ref_frame[1] == LAST3_FRAME) {
+ if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv]
+ .weight < REF_CAT_LEVEL) {
+ continue;
+ }
+ }
+ }
+
+ av1_init_rd_stats(rd_stats);
+
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+
+ mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->ref_mv_idx = ref_mv_idx;
+
+ if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) {
+ continue;
+ }
+
+ rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+ const int drl_cost =
+ get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+ rd_stats->rate += drl_cost;
+ mode_info[ref_mv_idx].drl_cost = drl_cost;
+
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ continue;
+ }
+
+ int64_t best_rd2 = INT64_MAX;
+
+ const RD_STATS backup_rd_stats = *rd_stats;
+ // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
+ for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
+ int rs = 0;
+ int compmode_interinter_cost = 0;
+ mbmi->compound_idx = comp_idx;
+ if (is_comp_pred && comp_idx == 0) {
+ *rd_stats = backup_rd_stats;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->comp_group_idx = 0;
+
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ if (masked_compound_used) {
+ compmode_interinter_cost +=
+ x->comp_group_idx_cost[comp_group_idx_ctx][0];
+ }
+ compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
+ }
+
+ int_mv cur_mv[2];
+ if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
+ continue;
+ }
+ if (have_newmv_in_inter_mode(this_mode)) {
+ if (comp_idx == 0) {
+ cur_mv[0] = backup_mv[0];
+ cur_mv[1] = backup_mv[1];
+ rate_mv = backup_rate_mv;
+ }
+
+ // When the jnt_comp_skip_mv_search flag is on, the new MV is searched only
+ // once (for comp_idx == 1) and reused for comp_idx == 0.
+ if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search &&
+ comp_idx == 0)) {
+ newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col,
+ &rate_mv, args);
+
+ // Store cur_mv and rate_mv so that they can be restored in the next
+ // iteration of the loop
+ backup_mv[0] = cur_mv[0];
+ backup_mv[1] = cur_mv[1];
+ backup_rate_mv = rate_mv;
+ }
+
+ if (newmv_ret_val != 0) {
+ continue;
+ } else {
+ rd_stats->rate += rate_mv;
+ }
+
+ if (cpi->sf.skip_repeated_newmv) {
+ if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
+ int skip = 0;
+ int this_rate_mv = 0;
+ for (i = 0; i < ref_mv_idx; ++i) {
+ // Check if this motion search result is the same as a previous result
+ if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) {
+ // If the compared mode has no valid rd, it is unlikely this
+ // mode will be the best mode
+ if (mode_info[i].rd == INT64_MAX) {
+ skip = 1;
+ break;
+ }
+ // Compare the costs, including the DRL cost and the MV cost
+ if (mode_info[i].mv.as_int != INVALID_MV) {
+ const int compare_cost =
+ mode_info[i].rate_mv + mode_info[i].drl_cost;
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ this_rate_mv = av1_mv_bit_cost(&mode_info[i].mv.as_mv,
+ &ref_mv.as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+ const int this_cost = this_rate_mv + drl_cost;
+
+ if (compare_cost < this_cost) {
+ skip = 1;
+ break;
+ } else {
+ // If the cost is less than current best result, make this
+ // the best and update corresponding variables
+ if (best_mbmi.ref_mv_idx == i) {
+ assert(best_rd != INT64_MAX);
+ best_mbmi.ref_mv_idx = ref_mv_idx;
+ best_rd_stats.rate += this_cost - compare_cost;
+ best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
+ best_rd_stats.dist);
+ if (best_rd < ref_best_rd) ref_best_rd = best_rd;
+
+ skip = 1;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (skip) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
+ args->modelled_rd[this_mode][i][refs[0]];
+ args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
+ args->simple_rd[this_mode][i][refs[0]];
+ mode_info[ref_mv_idx].rd = mode_info[i].rd;
+ mode_info[ref_mv_idx].rate_mv = this_rate_mv;
+ mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
+
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
+ }
+ }
+ for (i = 0; i < is_comp_pred + 1; ++i) {
+ mbmi->mv[i].as_int = cur_mv[i].as_int;
+ }
+ const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
+#if USE_DISCOUNT_NEWMV_TEST
+ // We don't include the cost of the second reference here, because there
+ // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in
+ // other words if you present them in that order, the second one is always
+ // known if the first is known.
+ //
+ // Under some circumstances we discount the cost of new mv mode to
+ // encourage initiation of a motion field.
+ if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+ // discount_newmv_test only applies discount on NEWMV mode.
+ assert(this_mode == NEWMV);
+ rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx),
+ cost_mv_ref(x, NEARESTMV, mode_ctx));
+ } else {
+ rd_stats->rate += ref_mv_cost;
+ }
+#else
+ rd_stats->rate += ref_mv_cost;
+#endif
+
+ if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+ continue;
+ }
+
+ int skip_build_pred = 0;
+ if (is_comp_pred && comp_idx) {
+ // Find matching interp filter or set to default interp filter
+ const int need_search =
+ av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+ int match_found = -1;
+ const InterpFilter assign_filter = cm->interp_filter;
+ if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+ match_found = find_interp_filter_in_stats(x, mbmi);
+ }
+ if (!need_search || match_found == -1) {
+ set_default_interp_filters(mbmi, assign_filter);
+ }
+
+ int64_t best_rd_compound;
+ compmode_interinter_cost = compound_type_rd(
+ cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
+ &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
+ rd_stats, ref_best_rd);
+ if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ // No need to call av1_build_inter_predictors_sby() if COMPOUND_AVERAGE
+ // is selected: it is the first candidate tried in compound_type_rd(), and
+ // the subsequent compound-type search uses the tmp_dst buffer.
+ if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) {
+ if (num_planes > 1)
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize);
+ skip_build_pred = 1;
+ }
+ }
+
+ ret_val = interpolation_filter_search(
+ x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+ args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb,
+ skip_build_pred, args, ref_best_rd);
+ if (args->modelled_rd != NULL && !is_comp_pred) {
+ args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+ }
+ if (ret_val != 0) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ } else if (cpi->sf.model_based_post_interp_filter_breakout &&
+ ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ if ((rd >> 3) * 2 > ref_best_rd) break;
+ continue;
+ }
+
+ if (search_jnt_comp) {
+ // When searching joint compound, skip this candidate if half of the
+ // model rd already exceeds best_rd, saving the additional search.
+ if ((rd >> 3) * 4 > best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
+
+ if (!is_comp_pred)
+ args->single_filter[this_mode][refs[0]] =
+ av1_extract_interp_filter(mbmi->interp_filters, 0);
+
+ if (args->modelled_rd != NULL) {
+ if (is_comp_pred) {
+ const int mode0 = compound_ref0_mode(this_mode);
+ const int mode1 = compound_ref1_mode(this_mode);
+ const int64_t mrd =
+ AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ }
+ }
+ rd_stats->rate += compmode_interinter_cost;
+
+ if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
+ // TODO(chengchen): this speed feature introduces big loss.
+ // Need better estimation of rate distortion.
+ int dummy_rate;
+ int64_t dummy_dist;
+ int plane_rate[MAX_MB_PLANE] = { 0 };
+ int64_t plane_sse[MAX_MB_PLANE] = { 0 };
+ int64_t plane_dist[MAX_MB_PLANE] = { 0 };
+
+ model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND](
+ cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate,
+ &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
+ plane_dist);
+
+ rd_stats->rate += rs;
+ rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
+ rd_stats_y->rate = plane_rate[0];
+ rd_stats_uv->rate = plane_rate[1] + plane_rate[2];
+ rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2];
+ rd_stats_y->sse = plane_sse[0];
+ rd_stats_uv->sse = plane_sse[1] + plane_sse[2];
+ rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2];
+ rd_stats_y->dist = plane_dist[0];
+ rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
+ } else {
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ ret_val = motion_mode_rd(
+ cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip,
+ mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst,
+ tile_data, best_est_rd, do_tx_search, inter_modes_info);
+#else
+ ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
+ rd_stats_uv, disable_skip, mi_row, mi_col,
+ args, ref_best_rd, refs, &rate_mv, &orig_dst);
+#endif
+ }
+ mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
+ mode_info[ref_mv_idx].rate_mv = rate_mv;
+ if (ret_val != INT64_MAX) {
+ int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ mode_info[ref_mv_idx].rd = tmp_rd;
+ if (tmp_rd < best_rd) {
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rd_stats_uv = *rd_stats_uv;
+ best_rd = tmp_rd;
+ best_mbmi = *mbmi;
+ best_disable_skip = *disable_skip;
+ best_xskip = x->skip;
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
+ }
+
+ if (tmp_rd < best_rd2) {
+ best_rd2 = tmp_rd;
+ }
+
+ if (tmp_rd < ref_best_rd) {
+ ref_best_rd = tmp_rd;
+ }
+ }
+ restore_dst_buf(xd, orig_dst, num_planes);
+ }
+ }
+
+ if (best_rd == INT64_MAX) return INT64_MAX;
+
+ // re-instate status of the best choice
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ *rd_stats_uv = best_rd_stats_uv;
+ *mbmi = best_mbmi;
+ *disable_skip = best_disable_skip;
+ x->skip = best_xskip;
+ assert(IMPLIES(mbmi->comp_group_idx == 1,
+ mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
+
+ return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+}
+
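+// RD search for intra block copy (IntraBC): try displacement vectors limited
+// to the already-coded area above and to the left of the current block,
+// keeping the best over both the skip and no-skip variants.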
+static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (!av1_allow_intrabc(cm)) return INT64_MAX;
+ const int num_planes = av1_num_planes(cm);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TileInfo *tile = &xd->tile;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);
+ const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int sb_row = mi_row >> cm->seq_params.mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params.mib_size_log2;
+
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+ mi_col, mbmi_ext->mode_context);
+
+ int_mv nearestmv, nearmv;
+ av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
+ 0);
+
+ if (nearestmv.as_int == INVALID_MV) {
+ nearestmv.as_int = 0;
+ }
+ if (nearmv.as_int == INVALID_MV) {
+ nearmv.as_int = 0;
+ }
+
+ int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
+ if (dv_ref.as_int == 0)
+ av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col);
+ // Ref DV should not have sub-pel.
+ assert((dv_ref.as_mv.col & 7) == 0);
+ assert((dv_ref.as_mv.row & 7) == 0);
+ mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref;
+
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL,
+ num_planes);
+ for (int i = 0; i < num_planes; ++i) {
+ xd->plane[i].pre[0] = yv12_mb[i];
+ }
+
+ enum IntrabcMotionDirection {
+ IBC_MOTION_ABOVE,
+ IBC_MOTION_LEFT,
+ IBC_MOTION_DIRECTIONS
+ };
+
+ MB_MODE_INFO best_mbmi = *mbmi;
+ RD_STATS best_rdcost = *rd_cost;
+ int best_skip = x->skip;
+
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+ for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
+ dir < IBC_MOTION_DIRECTIONS; ++dir) {
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ switch (dir) {
+ case IBC_MOTION_ABOVE:
+ x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
+ x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w;
+ x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
+ x->mv_limits.row_max =
+ (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h;
+ break;
+ case IBC_MOTION_LEFT:
+ x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
+ x->mv_limits.col_max =
+ (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w;
+ // TODO(aconverse@google.com): Minimize the overlap between above and
+ // left areas.
+ x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
+ int bottom_coded_mi_edge =
+ AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end);
+ x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+ break;
+ default: assert(0);
+ }
+ assert(x->mv_limits.col_min >= tmp_mv_limits.col_min);
+ assert(x->mv_limits.col_max <= tmp_mv_limits.col_max);
+ assert(x->mv_limits.row_min >= tmp_mv_limits.row_min);
+ assert(x->mv_limits.row_max <= tmp_mv_limits.row_max);
+ av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv);
+
+ if (x->mv_limits.col_max < x->mv_limits.col_min ||
+ x->mv_limits.row_max < x->mv_limits.row_min) {
+ x->mv_limits = tmp_mv_limits;
+ continue;
+ }
+
+ int step_param = cpi->mv_step_param;
+ MV mvp_full = dv_ref.as_mv;
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+ int sadpb = x->sadperbit16;
+ int cost_list[5];
+ int bestsme = av1_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+ sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
+ (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
+
+ x->mv_limits = tmp_mv_limits;
+ if (bestsme == INT_MAX) continue;
+ mvp_full = x->best_mv.as_mv;
+ MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
+ if (mv_check_bounds(&x->mv_limits, &dv)) continue;
+ if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+ cm->seq_params.mib_size_log2))
+ continue;
+
+ // DV should not have sub-pel.
+ assert((dv.col & 7) == 0);
+ assert((dv.row & 7) == 0);
+ memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->use_intrabc = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->mv[0].as_mv = dv;
+ mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ mbmi->skip = 0;
+ x->skip = 0;
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+
+ int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX],
+ (int *)&cpi->dv_cost[1][MV_MAX] };
+ // TODO(aconverse@google.com): The full motion field defining discount
+ // in MV_COST_WEIGHT is too large. Explore other values.
+ int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost,
+ dvcost, MV_COST_WEIGHT_SUB);
+ const int rate_mode = x->intrabc_cost[1];
+ RD_STATS rd_stats, rd_stats_uv;
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ // Intrabc
+ select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
+ } else {
+ super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats.skip);
+ }
+ if (num_planes > 1) {
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
+ }
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = rd_stats;
+#endif
+
+ const int skip_ctx = av1_get_skip_context(xd);
+
+ RD_STATS rdc_noskip;
+ av1_init_rd_stats(&rdc_noskip);
+ rdc_noskip.rate =
+ rate_mode + rate_mv + rd_stats.rate + x->skip_cost[skip_ctx][0];
+ rdc_noskip.dist = rd_stats.dist;
+ rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist);
+ if (rdc_noskip.rdcost < best_rd) {
+ best_rd = rdc_noskip.rdcost;
+ best_mbmi = *mbmi;
+ best_skip = x->skip;
+ best_rdcost = rdc_noskip;
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ }
+
+ if (!xd->lossless[mbmi->segment_id]) {
+ x->skip = 1;
+ mbmi->skip = 1;
+ RD_STATS rdc_skip;
+ av1_init_rd_stats(&rdc_skip);
+ rdc_skip.rate = rate_mode + rate_mv + x->skip_cost[skip_ctx][1];
+ rdc_skip.dist = rd_stats.sse;
+ rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist);
+ if (rdc_skip.rdcost < best_rd) {
+ best_rd = rdc_skip.rdcost;
+ best_mbmi = *mbmi;
+ best_skip = x->skip;
+ best_rdcost = rdc_skip;
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ }
+ }
+ }
+ *mbmi = best_mbmi;
+ *rd_cost = best_rdcost;
+ x->skip = best_skip;
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ return best_rd;
+}
+
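+// Intra mode RD search for a block: pick the best luma intra mode, then the
+// chroma mode when chroma RD is handled here, and finally consider IntraBC as
+// an alternative.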
+void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+ int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+ int y_skip = 0, uv_skip = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+ TX_SIZE max_uv_tx_size;
+
+ ctx->skip = 0;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->use_intrabc = 0;
+ mbmi->mv[0].as_int = 0;
+
+ const int64_t intra_yrd =
+ rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly,
+ &dist_y, &y_skip, bsize, best_rd, ctx);
+
+ if (intra_yrd < best_rd) {
+ // Only store reconstructed luma when there's chroma RDO. When there's no
+ // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
+ xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+ if (xd->cfl.store_y) {
+ // Restore reconstructed luma values.
+ memcpy(x->blk_skip, ctx->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y,
+ cpi->optimize_seg_arr[mbmi->segment_id],
+ mi_row, mi_col);
+ xd->cfl.store_y = 0;
+ }
+ if (num_planes > 1) {
+ max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ init_sbuv_mode(mbmi);
+ if (!x->skip_chroma_rd)
+ rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+ &uv_skip, bsize, max_uv_tx_size);
+ }
+
+ if (y_skip && (uv_skip || x->skip_chroma_rd)) {
+ rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+ x->skip_cost[av1_get_skip_context(xd)][1];
+ rd_cost->dist = dist_y + dist_uv;
+ } else {
+ rd_cost->rate =
+ rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0];
+ rd_cost->dist = dist_y + dist_uv;
+ }
+ rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ } else {
+ rd_cost->rate = INT_MAX;
+ }
+
+ if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd)
+ best_rd = rd_cost->rdcost;
+ if (rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd) < best_rd) {
+ ctx->skip = x->skip;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ assert(rd_cost->rate != INT_MAX);
+ }
+ if (rd_cost->rate == INT_MAX) return;
+
+ ctx->mic = *xd->mi[0];
+ ctx->mbmi_ext = *x->mbmi_ext;
+}
+
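+// Recompute the chroma palette color index map for the stored U/V palette:
+// gather the source chroma samples, assign each to its nearest palette
+// centroid, and extend the map to the full plane block size.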
+static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ int *const data = x->palette_buffer->kmeans_data_buf;
+ int centroids[2 * PALETTE_MAX_SIZE];
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ int r, c;
+ const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+ const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c) {
+ if (cpi->common.seq_params.use_highbitdepth) {
+ data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+ } else {
+ data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+ data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+ }
+ }
+ }
+
+ for (r = 1; r < 3; ++r) {
+ for (c = 0; c < pmi->palette_size[1]; ++c) {
+ centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+ }
+ }
+
+ av1_calc_indices(data, centroids, color_map, rows * cols,
+ pmi->palette_size[1], 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+}
+
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, int mi_row,
+ int mi_col, const uint8_t *above,
+ int above_stride, const uint8_t *left,
+ int left_stride);
+
+static const int ref_frame_flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+
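+// Evaluate skip_mode (NEAREST_NEARESTMV with the frame-level skip-mode
+// reference pair and no coded residual) and adopt it when its RD cost beats
+// the best intra/inter mode found so far.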
+static void rd_pick_skip_mode(RD_STATS *rd_cost,
+ InterModeSearchState *search_state,
+ const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ x->compound_idx = 1; // COMPOUND_AVERAGE
+ RD_STATS skip_mode_rd_stats;
+ av1_invalid_rd_stats(&skip_mode_rd_stats);
+
+ if (cm->ref_frame_idx_0 == INVALID_IDX ||
+ cm->ref_frame_idx_1 == INVALID_IDX) {
+ return;
+ }
+
+ const MV_REFERENCE_FRAME ref_frame = LAST_FRAME + cm->ref_frame_idx_0;
+ const MV_REFERENCE_FRAME second_ref_frame = LAST_FRAME + cm->ref_frame_idx_1;
+ const PREDICTION_MODE this_mode = NEAREST_NEARESTMV;
+ const int mode_index =
+ get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame);
+
+ if (mode_index == -1) {
+ return;
+ }
+
+ mbmi->mode = this_mode;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = ref_frame;
+ mbmi->ref_frame[1] = second_ref_frame;
+ const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) {
+ if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+ x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+ return;
+ }
+ MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
+ mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+ mi_col, mbmi_ext->mode_context);
+ }
+
+ assert(this_mode == NEAREST_NEARESTMV);
+ if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) {
+ return;
+ }
+
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = x->compound_idx;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->ref_mv_idx = 0;
+ mbmi->skip_mode = mbmi->skip = 1;
+
+ set_default_interp_filters(mbmi, cm->interp_filter);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ BUFFER_SET orig_dst;
+ for (int i = 0; i < num_planes; i++) {
+ orig_dst.plane[i] = xd->plane[i].dst.buf;
+ orig_dst.stride[i] = xd->plane[i].dst.stride;
+ }
+
+ // Obtain the rdcost for skip_mode.
+ skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, mi_row, mi_col, &orig_dst);
+
+ // Compare the use of skip_mode with the best intra/inter mode obtained.
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+ const int64_t best_intra_inter_mode_cost =
+ (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX)
+ ? RDCOST(x->rdmult,
+ rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
+ rd_cost->dist)
+ : INT64_MAX;
+
+ if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost) {
+ assert(mode_index != -1);
+ search_state->best_mbmode.skip_mode = 1;
+ search_state->best_mbmode = *mbmi;
+
+ search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1;
+ search_state->best_mbmode.mode = NEAREST_NEARESTMV;
+ search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0];
+ search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1];
+ search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int;
+ search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int;
+ search_state->best_mbmode.ref_mv_idx = 0;
+
+ // Set up tx_size related variables for skip-specific loop filtering.
+ search_state->best_mbmode.tx_size =
+ block_signals_txsize(bsize) ? tx_size_from_tx_mode(bsize, cm->tx_mode)
+ : max_txsize_rect_lookup[bsize];
+ memset(search_state->best_mbmode.inter_tx_size,
+ search_state->best_mbmode.tx_size,
+ sizeof(search_state->best_mbmode.inter_tx_size));
+ set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h,
+ search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
+
+ // Set up color-related variables for skip mode.
+ search_state->best_mbmode.uv_mode = UV_DC_PRED;
+ search_state->best_mbmode.palette_mode_info.palette_size[0] = 0;
+ search_state->best_mbmode.palette_mode_info.palette_size[1] = 0;
+
+ search_state->best_mbmode.comp_group_idx = 0;
+ search_state->best_mbmode.compound_idx = x->compound_idx;
+ search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE;
+ search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION;
+
+ search_state->best_mbmode.interintra_mode =
+ (INTERINTRA_MODE)(II_DC_PRED - 1);
+ search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0;
+
+ set_default_interp_filters(&search_state->best_mbmode, cm->interp_filter);
+
+ search_state->best_mode_index = mode_index;
+
+ // Update rd_cost
+ rd_cost->rate = skip_mode_rd_stats.rate;
+ rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist;
+ rd_cost->rdcost = skip_mode_rd_stats.rdcost;
+
+ search_state->best_rd = rd_cost->rdcost;
+ search_state->best_skip2 = 1;
+ search_state->best_mode_skippable = (skip_mode_rd_stats.sse == 0);
+
+ x->skip = 1;
+ }
+}
+
+// Speed feature: fast intra/inter transform type search.
+// Used for speed >= 2.
+// When this speed feature is on, only DCT is used during the RD mode search.
+// After the best mode is determined, this function is called to select the
+// transform types and obtain an accurate rdcost.
+static void sf_refine_fast_tx_type_search(
+ const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int best_mode_index, MB_MODE_INFO *best_mbmode,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int best_rate_y,
+ int best_rate_uv, int *best_skip2) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+
+ if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
+ ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
+ is_inter_mode(best_mbmode->mode)) ||
+ (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
+ !is_inter_mode(best_mbmode->mode)))) {
+ int skip_blk = 0;
+ RD_STATS rd_stats_y, rd_stats_uv;
+
+ x->use_default_inter_tx_type = 0;
+ x->use_default_intra_tx_type = 0;
+
+ *mbmi = *best_mbmode;
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ // av1_rd_pick_inter_mode_sb
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
+ INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX);
+ } else {
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y.skip);
+ }
+ if (num_planes > 1) {
+ inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX,
+ FTXS_NONE);
+ } else {
+ av1_init_rd_stats(&rd_stats_uv);
+ }
+ } else {
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ if (num_planes > 1) {
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ } else {
+ av1_init_rd_stats(&rd_stats_uv);
+ }
+ }
+
+ if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) {
+ skip_blk = 1;
+ rd_stats_y.rate = x->skip_cost[av1_get_skip_context(xd)][1];
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ } else {
+ skip_blk = 0;
+ rd_stats_y.rate += x->skip_cost[av1_get_skip_context(xd)][0];
+ }
+
+ if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) >
+ RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist))) {
+ best_mbmode->tx_size = mbmi->tx_size;
+ av1_copy(best_mbmode->inter_tx_size, mbmi->inter_tx_size);
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy(best_mbmode->txk_type, mbmi->txk_type);
+ rd_cost->rate +=
+ (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
+ rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+ rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ *best_skip2 = skip_blk;
+ }
+ }
+}
+
+// Please add/modify parameter setting in this function, making it consistent
+// and easy to read and maintain.
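+// Shared set-up for the inter mode search: OBMC neighbor prediction buffers,
+// reference frame cost estimates, reference MV stacks, the overlapped
+// above/left predictions used for the weighted prediction target, and the
+// reference/mode skip masks.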
+static void set_params_rd_pick_inter_mode(
+ const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2],
+ uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask,
+ unsigned int ref_costs_single[REF_FRAMES],
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const struct segmentation *const seg = &cm->seg;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ unsigned char segment_id = mbmi->segment_id;
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ for (int i = 0; i < MB_MODE_COUNT; ++i)
+ for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+ args->above_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+ args->above_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
+ args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+ args->left_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+ args->left_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
+ } else {
+ args->above_pred_buf[0] = x->above_pred_buf;
+ args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
+ args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
+ args->left_pred_buf[0] = x->left_pred_buf;
+ args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
+ args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
+ }
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+ x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ int skip = 1;
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_frame || rf[1] == ref_frame) {
+ skip = 0;
+ break;
+ }
+ }
+ }
+ if (skip) continue;
+ }
+ }
+ assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+ setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ yv12_mb);
+ }
+ }
+  // Process the compound reference frame types (ref_frame continues from
+  // ALTREF_FRAME + 1).
+ for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+ if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) &&
+ (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) {
+ continue;
+ }
+
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ continue;
+ }
+ }
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+ mi_col, mbmi_ext->mode_context);
+ }
+
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+
+ if (check_num_overlappable_neighbors(mbmi) &&
+ is_motion_variation_allowed_bsize(bsize)) {
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+ args->above_pred_buf, dst_width1,
+ dst_height1, args->above_pred_stride);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+ args->left_pred_buf, dst_width2,
+ dst_height2, args->left_pred_stride);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col, 0, num_planes);
+ calc_target_weighted_pred(
+ cm, x, xd, mi_row, mi_col, args->above_pred_buf[0],
+ args->above_pred_stride[0], args->left_pred_buf[0],
+ args->left_pred_stride[0]);
+ }
+
+ int min_pred_mv_sad = INT_MAX;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+
+ for (int i = 0; i < 2; ++i) {
+ ref_frame_skip_mask[i] = 0;
+ }
+ memset(mode_skip_mask, 0, REF_FRAMES * sizeof(*mode_skip_mask));
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])) {
+ // Skip checking missing references in both single and compound reference
+ // modes. Note that a mode will be skipped iff both reference frames
+ // are masked out.
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ } else {
+ // Skip fixed mv modes for poor references
+ if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
+ mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+    // If the segment reference frame feature is enabled, mask out any
+    // reference frame that the segment does not allow.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
+ (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) |
+ (1 << ALTREF2_FRAME) | (1 << GOLDEN_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      // TODO(zoeliu): To further explore whether the following needs to be
+      // done for BWDREF_FRAME as well.
+ mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+ const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
+ int_mv near_mv, nearest_mv, global_mv;
+ get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+
+ if (near_mv.as_int != global_mv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
+ if (nearest_mv.as_int != global_mv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+ }
+ }
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (sf->alt_ref_search_fp) {
+ assert(cpi->ref_frame_flags & ref_frame_flag_list[ALTREF_FRAME]);
+ mode_skip_mask[ALTREF_FRAME] = 0;
+ ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ if (sf->alt_ref_search_fp)
+ if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+ if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+ mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+ if (sf->adaptive_mode_search) {
+ if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+ cpi->rc.frames_since_golden >= 3)
+ if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
+ mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
+ }
+
+ if (bsize > sf->max_intra_bsize) {
+ ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+ ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+ }
+
+ mode_skip_mask[INTRA_FRAME] |=
+ ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+
+ if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+ x->use_default_inter_tx_type = 1;
+ else
+ x->use_default_inter_tx_type = 0;
+ if (cpi->sf.skip_repeat_interpolation_filter_search) {
+ x->interp_filter_stats_idx[0] = 0;
+ x->interp_filter_stats_idx[1] = 0;
+ }
+}
+
+static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, RD_STATS *rd_cost,
+ PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+ MB_MODE_INFO *const mbmi,
+ PALETTE_MODE_INFO *const pmi,
+ unsigned int *ref_costs_single,
+ InterModeSearchState *search_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int rate2 = 0;
+ int64_t distortion2 = 0, best_rd_palette = search_state->best_rd, this_rd,
+ best_model_rd_palette = INT64_MAX;
+ int skippable = 0, rate_overhead_palette = 0;
+ RD_STATS rd_stats_y;
+ TX_SIZE uv_tx = TX_4X4;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = *mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ rate_overhead_palette = rd_pick_palette_intra_sby(
+ cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
+ &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip);
+ if (pmi->palette_size[0] == 0) return;
+
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, search_state->best_rd);
+ if (rd_stats_y.rate == INT_MAX) return;
+
+ skippable = rd_stats_y.skip;
+ distortion2 = rd_stats_y.dist;
+ rate2 = rd_stats_y.rate + rate_overhead_palette;
+ rate2 += ref_costs_single[INTRA_FRAME];
+ if (num_planes > 1) {
+ uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(
+ cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
+ &search_state->rate_uv_tokenonly[uv_tx],
+ &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx],
+ &search_state->mode_uv[uv_tx]);
+ search_state->pmi_uv[uv_tx] = *pmi;
+ search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+ }
+ mbmi->uv_mode = search_state->mode_uv[uv_tx];
+ pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1];
+ if (pmi->palette_size[1] > 0) {
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
+ skippable = skippable && search_state->skip_uvs[uv_tx];
+ distortion2 += search_state->dist_uvs[uv_tx];
+ rate2 += search_state->rate_uv_intra[uv_tx];
+ }
+
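+  // If the whole block is skippable, drop the coefficient rates and charge
+  // the skip flag instead; otherwise charge the no-skip flag.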
+ if (skippable) {
+ rate2 -= rd_stats_y.rate;
+ if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx];
+ rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
+ } else {
+ rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
+ }
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+ if (this_rd < search_state->best_rd) {
+ search_state->best_mode_index = 3;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ search_state->best_rd = this_rd;
+ search_state->best_mbmode = *mbmi;
+ search_state->best_skip2 = 0;
+ search_state->best_mode_skippable = skippable;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+}
+
+static void init_inter_mode_search_state(InterModeSearchState *search_state,
+ const AV1_COMP *cpi,
+ const TileDataEnc *tile_data,
+ const MACROBLOCK *x, BLOCK_SIZE bsize,
+ int64_t best_rd_so_far) {
+ search_state->best_rd = best_rd_so_far;
+
+ av1_zero(search_state->best_mbmode);
+
+ search_state->best_rate_y = INT_MAX;
+
+ search_state->best_rate_uv = INT_MAX;
+
+ search_state->best_mode_skippable = 0;
+
+ search_state->best_skip2 = 0;
+
+ search_state->best_mode_index = -1;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const unsigned char segment_id = mbmi->segment_id;
+
+ search_state->skip_intra_modes = 0;
+
+ search_state->num_available_refs = 0;
+ memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs));
+ memset(search_state->dist_order_refs, -1,
+ sizeof(search_state->dist_order_refs));
+
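+  // Per-mode RD thresholds: modes up to LAST_NEW_MV_INDEX are never pruned
+  // by threshold, while later modes get the per-segment base threshold
+  // scaled by the tile's adaptive frequency factor (>> 5, i.e. divided by 32).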
+ for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+ search_state->mode_threshold[i] = 0;
+ const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
+ for (int i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+ search_state->mode_threshold[i] =
+ ((int64_t)rd_threshes[i] * tile_data->thresh_freq_fact[bsize][i]) >> 5;
+
+ search_state->best_intra_mode = DC_PRED;
+ search_state->best_intra_rd = INT64_MAX;
+
+ search_state->angle_stats_ready = 0;
+
+ search_state->best_pred_sse = UINT_MAX;
+
+ for (int i = 0; i < TX_SIZES_ALL; i++)
+ search_state->rate_uv_intra[i] = INT_MAX;
+
+ av1_zero(search_state->pmi_uv);
+
+ for (int i = 0; i < REFERENCE_MODES; ++i)
+ search_state->best_pred_rd[i] = INT64_MAX;
+
+ av1_zero(search_state->single_newmv);
+ av1_zero(search_state->single_newmv_rate);
+ av1_zero(search_state->single_newmv_valid);
+ for (int i = 0; i < MB_MODE_COUNT; ++i) {
+ for (int j = 0; j < MAX_REF_MV_SERCH; ++j) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ search_state->modelled_rd[i][j][ref_frame] = INT64_MAX;
+ search_state->simple_rd[i][j][ref_frame] = INT64_MAX;
+ }
+ }
+ }
+
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+ SingleInterModeState *state;
+
+ state = &search_state->single_state[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+
+ state = &search_state->single_state_modelled[dir][mode][ref_frame];
+ state->ref_frame = NONE_FRAME;
+ state->rd = INT64_MAX;
+ }
+ }
+ }
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) {
+ search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME;
+ }
+ }
+ }
+ av1_zero(search_state->single_state_cnt);
+ av1_zero(search_state->single_state_modelled_cnt);
+}
+
+// Case 1: return 0, means don't skip this mode
+// Case 2: return 1, means skip this mode completely
+// Case 3: return 2, means skip the motion mode search for this mode, but
+//         still try its simple translation mode
+static int inter_mode_search_order_independent_skip(
+ const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col,
+ uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask,
+ InterModeSearchState *search_state) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const unsigned char segment_id = mbmi->segment_id;
+ const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
+ const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+ int skip_motion_mode = 0;
+ if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+ const int ref_type = av1_ref_frame_type(ref_frame);
+ int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type);
+ if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // Compound ref modes depend on the motion estimation results of the two
+      // single ref modes (the best MVs of the single ref modes are used as the
+      // starting points). So if the current single ref mode is marked skip, we
+      // need to check whether it will still be needed by a compound ref mode.
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(ctx->skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_type || rf[1] == ref_type) {
+              // Found a non-skipped compound ref mode that contains the
+              // current single ref, so this single ref can't be skipped
+              // completely. Just skip its motion mode search and still try
+              // its simple translation mode.
+ skip_motion_mode = 1;
+ skip_ref = 0;
+ break;
+ }
+ }
+ }
+ }
+ if (skip_ref) return 1;
+ }
+
+ if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
+ !x->cb_partition_scan) {
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ int found = 0;
+ // Search in the stats table to see if the ref frames have been used in the
+ // first pass of partition search.
+ for (int row = mi_row; row < mi_row + mi_width && !found;
+ row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+ for (int col = mi_col; col < mi_col + mi_height && !found;
+ col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+ const int index = av1_first_partition_pass_stats_index(row, col);
+ const FIRST_PARTITION_PASS_STATS *const stats =
+ &x->first_partition_pass_stats[index];
+ if (stats->ref0_counts[ref_frame[0]] &&
+ (ref_frame[1] < 0 || stats->ref1_counts[ref_frame[1]])) {
+ found = 1;
+ break;
+ }
+ }
+ }
+ if (!found) return 1;
+ }
+
+ if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
+    // Mode must be compatible
+ if (!is_interintra_allowed_mode(this_mode)) return 1;
+ if (!is_interintra_allowed_bsize(bsize)) return 1;
+ }
+
+  // This is only used in the motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
+ return 1;
+
+ if (ref_frame[0] == INTRA_FRAME) {
+ if (this_mode != DC_PRED) {
+ // Disable intra modes other than DC_PRED for blocks with low variance
+ // Threshold for intra skipping based on source variance
+ // TODO(debargha): Specialize the threshold for super block sizes
+ const unsigned int skip_intra_var_thresh = 64;
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+ x->source_variance < skip_intra_var_thresh)
+ return 1;
+ }
+ } else {
+ if (!is_comp_ref_allowed(bsize) && ref_frame[1] > INTRA_FRAME) return 1;
+ }
+
+ const int comp_pred = ref_frame[1] > INTRA_FRAME;
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) return 1;
+
+ if (cm->reference_mode == SINGLE_REFERENCE) return 1;
+
+ // Skip compound inter modes if ARF is not available.
+ if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1;
+
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+ }
+
+ if (sf->selective_ref_frame) {
+ if (sf->selective_ref_frame >= 2 || x->cb_partition_scan) {
+ if (ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME)
+ if (get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[ALTREF2_FRAME - LAST_FRAME],
+ cm->frame_offset) < 0)
+ return 1;
+ if (ref_frame[0] == BWDREF_FRAME || ref_frame[1] == BWDREF_FRAME)
+ if (get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[BWDREF_FRAME - LAST_FRAME],
+ cm->frame_offset) < 0)
+ return 1;
+ }
+ if (ref_frame[0] == LAST3_FRAME || ref_frame[1] == LAST3_FRAME)
+ if (get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[LAST3_FRAME - LAST_FRAME],
+ cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0)
+ return 1;
+ if (ref_frame[0] == LAST2_FRAME || ref_frame[1] == LAST2_FRAME)
+ if (get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[LAST2_FRAME - LAST_FRAME],
+ cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0)
+ return 1;
+ }
+
+ // One-sided compound is used only when all reference frames are one-sided.
+ if (sf->selective_ref_frame && comp_pred && !cpi->all_one_sided_refs) {
+ unsigned int ref_offsets[2];
+ for (int i = 0; i < 2; ++i) {
+ const int buf_idx = cm->frame_refs[ref_frame[i] - LAST_FRAME].idx;
+ assert(buf_idx >= 0);
+ ref_offsets[i] = cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ }
+ if ((get_relative_dist(cm, ref_offsets[0], cm->frame_offset) <= 0 &&
+ get_relative_dist(cm, ref_offsets[1], cm->frame_offset) <= 0) ||
+ (get_relative_dist(cm, ref_offsets[0], cm->frame_offset) > 0 &&
+ get_relative_dist(cm, ref_offsets[1], cm->frame_offset) > 0))
+ return 1;
+ }
+
+ if (mode_skip_mask[ref_frame[0]] & (1 << this_mode)) {
+ return 1;
+ }
+
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame[0])) &&
+ (ref_frame_skip_mask[1] & (1 << AOMMAX(0, ref_frame[1])))) {
+ return 1;
+ }
+
+ if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) {
+ return 1;
+ }
+ if (skip_motion_mode) {
+ return 2;
+ }
+ return 0;
+}
+
+static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index,
+ const AV1_COMMON *cm) {
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+ mbmi->ref_mv_idx = 0;
+ mbmi->mode = this_mode;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = av1_mode_order[mode_index].ref_frame[0];
+ mbmi->ref_frame[1] = av1_mode_order[mode_index].ref_frame[1];
+ pmi->palette_size[0] = 0;
+ pmi->palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+ set_default_interp_filters(mbmi, cm->interp_filter);
+}
+
+static int64_t handle_intra_mode(InterModeSearchState *search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, int disable_skip,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv) {
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->ref_frame[0] == INTRA_FRAME);
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
+ const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->seq_params.bit_depth);
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+ const int num_planes = av1_num_planes(cm);
+ const int skip_ctx = av1_get_skip_context(xd);
+
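+  // A conservative lower bound on this intra mode's rate: mode cost,
+  // reference frame cost, the intra cost penalty (for non-DC/PAETH modes)
+  // and the cheaper of the two skip-flag costs. If even this bound's RD
+  // exceeds the best RD so far, the remaining intra modes are flagged to be
+  // skipped.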
+ int known_rate = intra_mode_cost[mbmi->mode];
+ known_rate += ref_frame_cost;
+ if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
+ known_rate += intra_cost_penalty;
+ known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+ const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+ if (known_rd > search_state->best_rd) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
+
+ TX_SIZE uv_tx;
+ int is_directional_mode = av1_is_directional_mode(mbmi->mode);
+ if (is_directional_mode && av1_use_angle_delta(bsize)) {
+ int rate_dummy;
+ int64_t model_rd = INT64_MAX;
+ if (!search_state->angle_stats_ready) {
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *src = x->plane[0].src.buf;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ highbd_angle_estimation(src, src_stride, rows, cols, bsize,
+ search_state->directional_mode_skip_mask);
+ else
+ angle_estimation(src, src_stride, rows, cols, bsize,
+ search_state->directional_mode_skip_mask);
+ search_state->angle_stats_ready = 1;
+ }
+ if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX;
+ av1_init_rd_stats(rd_stats_y);
+ rd_stats_y->rate = INT_MAX;
+ rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y,
+ bsize, intra_mode_cost[mbmi->mode],
+ search_state->best_rd, &model_rd);
+ } else {
+ av1_init_rd_stats(rd_stats_y);
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd);
+ }
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ int try_filter_intra = 0;
+ int64_t best_rd_tmp = INT64_MAX;
+ if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ if (rd_stats_y->rate != INT_MAX) {
+ const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
+ intra_mode_cost[mbmi->mode];
+ best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+ try_filter_intra = !((best_rd_tmp / 2) > search_state->best_rd);
+ } else {
+ try_filter_intra = !(search_state->best_mbmode.skip);
+ }
+ }
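+  // If the DC_PRED result (or the lack of one) looks promising, try each
+  // FILTER_INTRA mode on top of it and keep whichever luma RD is best.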
+ if (try_filter_intra) {
+ RD_STATS rd_stats_y_fi;
+ int filter_intra_selected_flag = 0;
+ TX_SIZE best_tx_size = mbmi->tx_size;
+ TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
+
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
+ fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
+ int64_t this_rd_tmp;
+ mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+ super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd);
+ if (rd_stats_y_fi.rate == INT_MAX) {
+ continue;
+ }
+ const int this_rate_tmp =
+ rd_stats_y_fi.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize,
+ intra_mode_cost[mbmi->mode]);
+ this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
+ if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) {
+ break;
+ }
+ if (this_rd_tmp < best_rd_tmp) {
+ best_tx_size = mbmi->tx_size;
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ best_fi_mode = fi_mode;
+ *rd_stats_y = rd_stats_y_fi;
+ filter_intra_selected_flag = 1;
+ best_rd_tmp = this_rd_tmp;
+ }
+ }
+
+ mbmi->tx_size = best_tx_size;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+
+ if (filter_intra_selected_flag) {
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+ } else {
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ }
+ }
+ if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
+ const int mode_cost_y =
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_uv);
+ if (num_planes > 1) {
+ uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
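+      // Before running the chroma mode search, bail out if the luma-only RD
+      // (with a 25% margin) already exceeds the best RD so far.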
+ int rate_y =
+ rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
+ const int64_t rdy =
+ RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
+ if (search_state->best_rd < (INT64_MAX / 2) &&
+ rdy > (search_state->best_rd + (search_state->best_rd >> 2))) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
+ choose_intra_uv_mode(
+ cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
+ &search_state->rate_uv_tokenonly[uv_tx],
+ &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx],
+ &search_state->mode_uv[uv_tx]);
+ if (try_palette) search_state->pmi_uv[uv_tx] = *pmi;
+ search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+
+ const int uv_rate = search_state->rate_uv_tokenonly[uv_tx];
+ const int64_t uv_dist = search_state->dist_uvs[uv_tx];
+ const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+ if (uv_rd > search_state->best_rd) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
+ }
+
+ rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx];
+ rd_stats_uv->dist = search_state->dist_uvs[uv_tx];
+ rd_stats_uv->skip = search_state->skip_uvs[uv_tx];
+ rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip;
+ mbmi->uv_mode = search_state->mode_uv[uv_tx];
+ if (try_palette) {
+ pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
+ }
+ rd_stats->rate = rd_stats_y->rate + mode_cost_y;
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ rd_stats_y->rate -= tx_size_cost(cm, x, bsize, mbmi->tx_size);
+ }
+ if (num_planes > 1 && !x->skip_chroma_rd) {
+ const int uv_mode_cost =
+ x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mbmi->uv_mode];
+ rd_stats->rate +=
+ rd_stats_uv->rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+ }
+ if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
+ rd_stats->rate += intra_cost_penalty;
+ rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist;
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rd_stats->rate += ref_frame_cost;
+ if (rd_stats->skip) {
+ // Back out the coefficient coding costs
+ rd_stats->rate -= (rd_stats_y->rate + rd_stats_uv->rate);
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ // Cost the skip mb case
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ } else {
+ // Add in the cost of the no skip flag.
+ rd_stats->rate += x->skip_cost[skip_ctx][0];
+ }
+ // Calculate the final RD estimate for this mode.
+ const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ // Keep record of best intra rd
+ if (this_rd < search_state->best_intra_rd) {
+ search_state->best_intra_rd = this_rd;
+ search_state->best_intra_mode = mbmi->mode;
+ }
+
+ if (sf->skip_intra_in_interframe) {
+ if (search_state->best_rd < (INT64_MAX / 2) &&
+ this_rd > (search_state->best_rd + (search_state->best_rd >> 1)))
+ search_state->skip_intra_modes = 1;
+ }
+
+ if (!disable_skip) {
+ for (int i = 0; i < REFERENCE_MODES; ++i)
+ search_state->best_pred_rd[i] =
+ AOMMIN(search_state->best_pred_rd[i], this_rd);
+ }
+ return this_rd;
+}
+
+static void collect_single_states(MACROBLOCK *x,
+ InterModeSearchState *search_state,
+ const MB_MODE_INFO *const mbmi) {
+ int i, j;
+ const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1;
+ const int mode_offset = INTER_OFFSET(this_mode);
+ const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
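+  // For this single-reference mode, record the best simple RD and the best
+  // modelled RD over all ref_mv candidates, keeping the per-direction state
+  // lists sorted in increasing RD order by insertion sort.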
+ // Simple rd
+ int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ int64_t rd = search_state->simple_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < simple_rd) simple_rd = rd;
+ }
+
+ // Insertion sort of single_state
+ SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 };
+ SingleInterModeState *state_s = search_state->single_state[dir][mode_offset];
+ i = search_state->single_state_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j)
+ state_s[j] = state_s[j - 1];
+ state_s[j] = this_state_s;
+ search_state->single_state_cnt[dir][mode_offset]++;
+
+ // Modelled rd
+ int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame];
+ for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) {
+ int64_t rd = search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame];
+ if (rd < modelled_rd) modelled_rd = rd;
+ }
+
+ // Insertion sort of single_state_modelled
+ SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 };
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode_offset];
+ i = search_state->single_state_modelled_cnt[dir][mode_offset];
+ for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j)
+ state_m[j] = state_m[j - 1];
+ state_m[j] = this_state_m;
+ search_state->single_state_modelled_cnt[dir][mode_offset]++;
+}
+
+static void analyze_single_states(const AV1_COMP *cpi,
+ InterModeSearchState *search_state) {
+ int i, j, dir, mode;
+ if (cpi->sf.prune_comp_search_by_single_result >= 1) {
+ for (dir = 0; dir < 2; ++dir) {
+ int64_t best_rd;
+ SingleInterModeState(*state)[FWD_REFS];
+
+      // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+      // reference frames for all the modes (NEARESTMV and NEARMV may not
+      // have the same motion vectors). Always keep the best of each mode
+      // because it might form the best possible combination with another mode.
+ state = search_state->single_state[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 1) > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+
+ state = search_state->single_state_modelled[dir];
+ best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+ state[INTER_OFFSET(GLOBALMV)][0].rd);
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode];
+ ++i) {
+ if (state[mode][i].rd != INT64_MAX &&
+ (state[mode][i].rd >> 1) > best_rd) {
+ state[mode][i].valid = 0;
+ }
+ }
+ }
+ }
+ }
+
+ // Ordering by simple rd first, then by modelled rd
+ for (dir = 0; dir < 2; ++dir) {
+ for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+ const int state_cnt_m =
+ search_state->single_state_modelled_cnt[dir][mode];
+ SingleInterModeState *state_s = search_state->single_state[dir][mode];
+ SingleInterModeState *state_m =
+ search_state->single_state_modelled[dir][mode];
+ int count = 0;
+ const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+ for (i = 0; i < state_cnt_s; ++i) {
+ if (state_s[i].rd == INT64_MAX) break;
+ if (state_s[i].valid)
+ search_state->single_rd_order[dir][mode][count++] =
+ state_s[i].ref_frame;
+ }
+ if (count < max_candidates) {
+ for (i = 0; i < state_cnt_m; ++i) {
+ if (state_m[i].rd == INT64_MAX) break;
+ if (state_m[i].valid) {
+ int ref_frame = state_m[i].ref_frame;
+ int match = 0;
+          // Check whether this ref_frame is already in the list
+ for (j = 0; j < count; ++j) {
+ if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) {
+            // Check whether this ref_frame was invalidated in the simple rd
+            // list
+ int valid = 1;
+ for (j = 0; j < state_cnt_s; j++) {
+ if (ref_frame == state_s[j].ref_frame && !state_s[j].valid) {
+ valid = 0;
+ break;
+ }
+ }
+ if (valid)
+ search_state->single_rd_order[dir][mode][count++] = ref_frame;
+ }
+ if (count >= max_candidates) break;
+ }
+ }
+ }
+ }
+ }
+}
+
+static int compound_skip_get_candidates(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const int dir, const PREDICTION_MODE mode) {
+ const int mode_offset = INTER_OFFSET(mode);
+ const SingleInterModeState *state =
+ search_state->single_state[dir][mode_offset];
+ const SingleInterModeState *state_modelled =
+ search_state->single_state_modelled[dir][mode_offset];
+ int max_candidates = 0;
+ int candidates;
+
+ for (int i = 0; i < FWD_REFS; ++i) {
+ if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME) break;
+ max_candidates++;
+ }
+
+ candidates = max_candidates;
+ if (cpi->sf.prune_comp_search_by_single_result >= 2) {
+ candidates = AOMMIN(2, max_candidates);
+ }
+ if (cpi->sf.prune_comp_search_by_single_result >= 3) {
+ if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+ state[0].ref_frame == state_modelled[0].ref_frame)
+ candidates = 1;
+ if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
+ }
+ return candidates;
+}
+
+static int compound_skip_by_single_states(
+ const AV1_COMP *cpi, const InterModeSearchState *search_state,
+ const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+ const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+ const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+ const int mode[2] = { compound_ref0_mode(this_mode),
+ compound_ref1_mode(this_mode) };
+ const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+ const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+ refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+ int ref_searched[2] = { 0, 0 };
+ int ref_mv_match[2] = { 1, 1 };
+ int i, j;
+
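+  // Prune this compound mode if, for either of its two references, the
+  // reference was already searched as a single-reference mode with matching
+  // MVs but did not rank among the top single-reference candidates.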
+ for (i = 0; i < 2; ++i) {
+ const SingleInterModeState *state =
+ search_state->single_state[mode_dir[i]][mode_offset[i]];
+ const int state_cnt =
+ search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+ for (j = 0; j < state_cnt; ++j) {
+ if (state[j].ref_frame == refs[i]) {
+ ref_searched[i] = 1;
+ break;
+ }
+ }
+ }
+
+ const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+ for (i = 0; i < 2; ++i) {
+ if (mode[i] == NEARESTMV || mode[i] == NEARMV) {
+ const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+      int identical = 1;
+ for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+ int_mv single_mv;
+ int_mv comp_mv;
+ get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, single_refs,
+ x->mbmi_ext);
+ get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, refs, x->mbmi_ext);
+
+        identical &= (single_mv.as_int == comp_mv.as_int);
+        if (!identical) {
+ ref_mv_match[i] = 0;
+ break;
+ }
+ }
+ }
+ }
+
+ for (i = 0; i < 2; ++i) {
+ if (ref_searched[i] && ref_mv_match[i]) {
+ const int candidates =
+ compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+ const MV_REFERENCE_FRAME *ref_order =
+ search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+ int match = 0;
+ for (j = 0; j < candidates; ++j) {
+ if (refs[i] == ref_order[j]) {
+ match = 1;
+ break;
+ }
+ }
+ if (!match) return 1;
+ }
+ }
+
+ return 0;
+}
+
+static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode,
+ InterModeSearchState *search_state) {
+ const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+ const MV_REFERENCE_FRAME second_ref_frame = mode->ref_frame[1];
+ if (search_state->num_available_refs > 2) {
+ if ((ref_frame == search_state->dist_order_refs[0] &&
+ second_ref_frame == search_state->dist_order_refs[1]) ||
+ (ref_frame == search_state->dist_order_refs[1] &&
+ second_ref_frame == search_state->dist_order_refs[0]))
+ return 1; // drop this pair of refs
+ }
+ return 0;
+}
+
+static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state,
+ const MODE_DEFINITION *mode,
+ int64_t distortion2) {
+ const PREDICTION_MODE this_mode = mode->mode;
+ MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+ const int idx = ref_frame - LAST_FRAME;
+ if (idx && distortion2 > search_state->dist_refs[idx]) {
+ search_state->dist_refs[idx] = distortion2;
+ search_state->dist_order_refs[idx] = ref_frame;
+ }
+
+  // Reached the last single ref prediction mode
+ if (ref_frame == ALTREF_FRAME && this_mode == GLOBALMV) {
+ // bubble sort dist_refs and the order index
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ for (int k = i + 1; k < REF_FRAMES; ++k) {
+ if (search_state->dist_refs[i] < search_state->dist_refs[k]) {
+ int64_t tmp_dist = search_state->dist_refs[i];
+ search_state->dist_refs[i] = search_state->dist_refs[k];
+ search_state->dist_refs[k] = tmp_dist;
+
+ int tmp_idx = search_state->dist_order_refs[i];
+ search_state->dist_order_refs[i] = search_state->dist_order_refs[k];
+ search_state->dist_order_refs[k] = tmp_idx;
+ }
+ }
+ }
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ if (search_state->dist_refs[i] == -1) break;
+ search_state->num_available_refs = i;
+ }
+ search_state->num_available_refs++;
+ }
+}
+
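+// Allocate the scratch buffers reused across the compound type RD search:
+// two prediction buffers, a residual buffer, a prediction-difference buffer,
+// and a temporary buffer for the best mask found so far.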
+static void alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+ CompoundTypeRdBuffers *const bufs) {
+ CHECK_MEM_ERROR(
+ cm, bufs->pred0,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+ CHECK_MEM_ERROR(
+ cm, bufs->pred1,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->residual1,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->diff10,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+ CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+ (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+ sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+static void release_compound_type_rd_buffers(
+ CompoundTypeRdBuffers *const bufs) {
+ aom_free(bufs->pred0);
+ aom_free(bufs->pred1);
+ aom_free(bufs->residual1);
+ aom_free(bufs->diff10);
+ aom_free(bufs->tmp_best_mask_buf);
+ av1_zero(*bufs); // Set all pointers to NULL for safety.
+}
+
+void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int try_palette =
+ av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const struct segmentation *const seg = &cm->seg;
+ PREDICTION_MODE this_mode;
+ unsigned char segment_id = mbmi->segment_id;
+ int i;
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+ int *mode_map = tile_data->mode_map[bsize];
+ uint32_t mode_skip_mask[REF_FRAMES];
+ uint16_t ref_frame_skip_mask[2];
+
+ InterModeSearchState search_state;
+ init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
+ best_rd_so_far);
+ INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+ };
+ HandleInterModeArgs args = {
+ { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+ { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
+ NULL, NULL,
+ NULL, search_state.modelled_rd,
+ { { 0 } }, INT_MAX,
+ INT_MAX, search_state.simple_rd,
+ 0, interintra_modes
+ };
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+ av1_invalid_rd_stats(rd_cost);
+
+ // init params, set frame modes, speed features
+ set_params_rd_pick_inter_mode(
+ cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask,
+ ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb);
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ int64_t best_est_rd = INT64_MAX;
+ // TODO(angiebird): Turn this on when this speed feature is well tested
+#if 1
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ const int do_tx_search = !md->ready;
+#else
+ const int do_tx_search = 1;
+#endif
+ InterModesInfo *inter_modes_info = &tile_data->inter_modes_info;
+ inter_modes_info->num = 0;
+#endif
+
+ int intra_mode_num = 0;
+ int intra_mode_idx_ls[MAX_MODES];
+ int reach_first_comp_mode = 0;
+
+  // Temporary buffers used by handle_inter_mode().
+  // We allocate them once and reuse them in every call to that function.
+  // Note: Must be allocated on the heap due to the large size of the arrays.
+ uint8_t *tmp_buf_orig;
+ CHECK_MEM_ERROR(
+ cm, tmp_buf_orig,
+ (uint8_t *)aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE));
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, tmp_buf_orig);
+
+ CompoundTypeRdBuffers rd_buffers;
+ alloc_compound_type_rd_buffers(cm, &rd_buffers);
+
+ for (int midx = 0; midx < MAX_MODES; ++midx) {
+ int mode_index = mode_map[midx];
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+ const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index];
+ const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
+ const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+ this_mode = mode_order->mode;
+
+ init_mbmi(mbmi, mode_index, cm);
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+    // Reached the first compound prediction mode
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
+ reach_first_comp_mode == 0) {
+ analyze_single_states(cpi, &search_state);
+ reach_first_comp_mode = 1;
+ }
+ const int ret = inter_mode_search_order_independent_skip(
+ cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask,
+ ref_frame_skip_mask, &search_state);
+ if (ret == 1) continue;
+ args.skip_motion_mode = (ret == 2);
+
+ if (sf->drop_ref && comp_pred) {
+ if (sf_check_is_drop_ref(mode_order, &search_state)) {
+ continue;
+ }
+ }
+
+ if (search_state.best_rd < search_state.mode_threshold[mode_index])
+ continue;
+
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (compound_skip_by_single_states(cpi, &search_state, this_mode,
+ ref_frame, second_ref_frame, x))
+ continue;
+ }
+
+ const int ref_frame_cost = comp_pred
+ ? ref_costs_comp[ref_frame][second_ref_frame]
+ : ref_costs_single[ref_frame];
+ const int compmode_cost =
+ is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
+ const int real_compmode_cost =
+ cm->reference_mode == REFERENCE_MODE_SELECT ? compmode_cost : 0;
+
+ if (comp_pred) {
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ search_state.best_mode_index >= 0 &&
+ search_state.best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ if (sf->adaptive_mode_search)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+ search_state.best_pred_sse)
+ continue;
+
+ if (this_mode != DC_PRED) {
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+ if (search_state.best_mode_index >= 0 &&
+ search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(this_mode, search_state.best_intra_mode))
+ continue;
+ }
+ }
+ }
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ intra_mode_idx_ls[intra_mode_num++] = mode_index;
+ continue;
+ } else {
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->ref_mv_idx = 0;
+ int64_t ref_best_rd = search_state.best_rd;
+ {
+ RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+ rd_stats.rate = rate2;
+
+ // Point to variables that are maintained between loop iterations
+ args.single_newmv = search_state.single_newmv;
+ args.single_newmv_rate = search_state.single_newmv_rate;
+ args.single_newmv_valid = search_state.single_newmv_valid;
+ args.single_comp_cost = real_compmode_cost;
+ args.ref_frame_cost = ref_frame_cost;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ this_rd = handle_inter_mode(
+ cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip,
+ mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data,
+ &best_est_rd, do_tx_search, inter_modes_info);
+#else
+ this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, &disable_skip, mi_row, mi_col,
+ &args, ref_best_rd, tmp_buf, &rd_buffers);
+#endif
+ rate2 = rd_stats.rate;
+ skippable = rd_stats.skip;
+ distortion2 = rd_stats.dist;
+ rate_y = rd_stats_y.rate;
+ rate_uv = rd_stats_uv.rate;
+ }
+
+ if (sf->prune_comp_search_by_single_result > 0 &&
+ is_inter_singleref_mode(this_mode)) {
+ collect_single_states(x, &search_state, mbmi);
+ }
+
+ if (this_rd == INT64_MAX) continue;
+
+ this_skip2 = mbmi->skip;
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+ if (this_skip2) {
+ rate_y = 0;
+ rate_uv = 0;
+ }
+ }
+
+    // Did this mode help, i.e. is it the new best mode?
+ if (this_rd < search_state.best_rd || x->skip) {
+ int mode_excluded = 0;
+ if (comp_pred) {
+ mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+ }
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ search_state.best_mode_index = mode_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ } else {
+ search_state.best_pred_sse = x->pred_sse[ref_frame];
+ }
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ search_state.best_rd = this_rd;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = this_skip2;
+ search_state.best_mode_skippable = skippable;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (do_tx_search) {
+          // When do_tx_search == 0, handle_inter_mode won't provide correct
+          // rate_y and rate_uv because the txfm_search process is replaced by
+          // rd estimation.
+          // Therefore, we should avoid updating best_rate_y and best_rate_uv
+          // here. These two values will be updated when txfm_search is called.
+ search_state.best_rate_y =
+ rate_y +
+ x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+ search_state.best_rate_uv = rate_uv;
+ }
+#else // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ search_state.best_rate_y =
+ rate_y +
+ x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+ search_state.best_rate_uv = rate_uv;
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
+
+ if (!comp_pred) {
+ if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE])
+ search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ } else {
+ if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE])
+ search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+ }
+ if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
+ search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+ }
+ if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+      // Collect data from this single ref mode and analyze it once the last
+      // single ref mode is reached.
+ sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+ }
+
+ if (x->skip && !comp_pred) break;
+ }
+
+ aom_free(tmp_buf_orig);
+ tmp_buf_orig = NULL;
+ release_compound_type_rd_buffers(&rd_buffers);
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (!do_tx_search) {
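+    // The transform search was deferred during the mode loop. Sort the
+    // collected inter modes by estimated RD and run the full transform search
+    // only on candidates whose estimated RD is close to the best estimate.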
+ inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+ search_state.best_rd = INT64_MAX;
+
+ int64_t top_est_rd =
+ inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx];
+ for (int j = 0; j < inter_modes_info->num; ++j) {
+ const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+ *mbmi = inter_modes_info->mbmi_arr[data_idx];
+ int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+ if (curr_est_rd * 0.9 > top_est_rd) {
+ continue;
+ }
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_rate, search_state.best_rd)) {
+ continue;
+ } else {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+ rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+ if (rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = rd_stats.rdcost;
+ // Note index of best mode so far
+ const int mode_index = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ search_state.best_mode_index = mode_index;
+ *rd_cost = rd_stats;
+ search_state.best_rd = rd_stats.rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = mbmi->skip;
+ search_state.best_mode_skippable = rd_stats.skip;
+ search_state.best_rate_y =
+ rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+ search_state.best_rate_uv = rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ }
+#endif
+
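+  // Evaluate the intra modes that were deferred during the main mode loop,
+  // unless the intra search has been flagged to be skipped.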
+ for (int j = 0; j < intra_mode_num; ++j) {
+ const int mode_index = intra_mode_idx_ls[j];
+ const MV_REFERENCE_FRAME ref_frame =
+ av1_mode_order[mode_index].ref_frame[0];
+ assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+ assert(ref_frame == INTRA_FRAME);
+ if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+ init_mbmi(mbmi, mode_index, cm);
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ }
+
+ RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+ const int ref_frame_cost = ref_costs_single[ref_frame];
+ intra_rd_stats.rdcost = handle_intra_mode(
+ &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+ &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+ if (intra_rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = intra_rd_stats.rdcost;
+ // Note index of best mode so far
+ search_state.best_mode_index = mode_index;
+ *rd_cost = intra_rd_stats;
+ search_state.best_rd = intra_rd_stats.rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = 0;
+ search_state.best_mode_skippable = intra_rd_stats.skip;
+ search_state.best_rate_y =
+ intra_rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+ search_state.best_rate_uv = intra_rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+
+ // In effect only when speed >= 2.
+ sf_refine_fast_tx_type_search(
+ cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, search_state.best_mode_index,
+ &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
+ search_state.best_rate_uv, &search_state.best_skip2);
+
+ // Only try palette mode when the best mode so far is an intra mode.
+ if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) {
+ search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
+ ref_costs_single, &search_state);
+ }
+
+ search_state.best_mbmode.skip_mode = 0;
+ if (cm->skip_mode_flag &&
+ !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ is_comp_ref_allowed(bsize)) {
+ rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col,
+ yv12_mb);
+ }
+
+ // Make sure that the ref_mv_idx is only nonzero when we're
+ // using a mode which can support ref_mv_idx
+ if (search_state.best_mbmode.ref_mv_idx != 0 &&
+ !(search_state.best_mbmode.mode == NEWMV ||
+ search_state.best_mbmode.mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+ search_state.best_mbmode.ref_mv_idx = 0;
+ }
+
+ if (search_state.best_mode_index < 0 ||
+ search_state.best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert(
+ (cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter ==
+ av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) ||
+ !is_inter_block(&search_state.best_mbmode));
+ assert(
+ (cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter ==
+ av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) ||
+ !is_inter_block(&search_state.best_mbmode));
+
+ if (!cpi->rc.is_src_frame_alt_ref)
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize,
+ search_state.best_mode_index);
+
+ // macroblock modes
+ *mbmi = search_state.best_mbmode;
+ x->skip |= search_state.best_skip2;
+
+ // Note: this section is needed since the mode may have been forced to
+ // GLOBALMV by the all-zero mode handling of ref-mv.
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+ // Correct the interp filters for GLOBALMV
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ assert(mbmi->interp_filters ==
+ av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter)));
+ }
+ }
+
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (search_state.best_pred_rd[i] == INT64_MAX)
+ search_state.best_pred_diff[i] = INT_MIN;
+ else
+ search_state.best_pred_diff[i] =
+ search_state.best_rd - search_state.best_pred_rd[i];
+ }
+
+ x->skip |= search_state.best_mode_skippable;
+
+ assert(search_state.best_mode_index >= 0);
+
+ store_coding_context(x, ctx, search_state.best_mode_index,
+ search_state.best_pred_diff,
+ search_state.best_mode_skippable);
+
+ if (pmi->palette_size[1] > 0) {
+ assert(try_palette);
+ restore_uv_color_map(cpi, x);
+ }
+}
+
+void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
+ TileDataEnc *tile_data, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ unsigned char segment_id = mbmi->segment_id;
+ const int comp_pred = 0;
+ int i;
+ int64_t best_pred_diff[REFERENCE_MODES];
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+ InterpFilter best_filter = SWITCHABLE;
+ int64_t this_rd = INT64_MAX;
+ int rate2 = 0;
+ const int64_t distortion2 = 0;
+ (void)mi_row;
+ (void)mi_col;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+
+ assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->mode = GLOBALMV;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->uv_mode = UV_DC_PRED;
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME))
+ mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ else
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ cm->cur_frame_force_integer_mv)
+ .as_int;
+ mbmi->tx_size = max_txsize_lookup[bsize];
+ x->skip = 1;
+
+ mbmi->ref_mv_idx = 0;
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+ if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1)
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
+ }
+
+ set_default_interp_filters(mbmi, cm->interp_filter);
+
+ if (cm->interp_filter != SWITCHABLE) {
+ best_filter = cm->interp_filter;
+ } else {
+ best_filter = EIGHTTAP_REGULAR;
+ if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd) &&
+ x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+ int rs;
+ int best_rs = INT_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ mbmi->interp_filters = av1_broadcast_interp_filter(i);
+ rs = av1_get_switchable_rate(cm, x, xd);
+ if (rs < best_rs) {
+ best_rs = rs;
+ best_filter = av1_extract_interp_filter(mbmi->interp_filters, 0);
+ }
+ }
+ }
+ }
+ // Set the appropriate filter
+ mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
+ rate2 += av1_get_switchable_rate(cm, x, xd);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ rate2 += comp_inter_cost[comp_pred];
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs_single[LAST_FRAME];
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+
+ if (this_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter ==
+ av1_extract_interp_filter(mbmi->interp_filters, 0)));
+
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ cpi->sf.adaptive_rd_thresh, bsize, THR_GLOBALMV);
+
+ av1_zero(best_pred_diff);
+
+ store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0);
+}
+
+struct calc_target_weighted_pred_ctxt {
+ const MACROBLOCK *x;
+ const uint8_t *tmp;
+ int tmp_stride;
+ int overlap;
+};
+
+static INLINE void calc_target_weighted_pred_above(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t nb_mi_width, MB_MODE_INFO *nb_mi,
+ void *fun_ctxt, const int num_planes) {
+ (void)nb_mi;
+ (void)num_planes;
+
+ struct calc_target_weighted_pred_ctxt *ctxt =
+ (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
+ const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+ int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
+ int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
+ const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+
+ if (!is_hbd) {
+ for (int row = 0; row < ctxt->overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
+ wsrc[col] = m1 * tmp[col];
+ mask[col] = m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp += ctxt->tmp_stride;
+ }
+ } else {
+ const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+ for (int row = 0; row < ctxt->overlap; ++row) {
+ const uint8_t m0 = mask1d[row];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ for (int col = 0; col < nb_mi_width * MI_SIZE; ++col) {
+ wsrc[col] = m1 * tmp16[col];
+ mask[col] = m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp16 += ctxt->tmp_stride;
+ }
+ }
+}
+
+static INLINE void calc_target_weighted_pred_left(
+ MACROBLOCKD *xd, int rel_mi_row, uint8_t nb_mi_height, MB_MODE_INFO *nb_mi,
+ void *fun_ctxt, const int num_planes) {
+ (void)nb_mi;
+ (void)num_planes;
+
+ struct calc_target_weighted_pred_ctxt *ctxt =
+ (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
+ const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+ int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
+ int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
+ const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+
+ if (!is_hbd) {
+ for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
+ for (int col = 0; col < ctxt->overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp += ctxt->tmp_stride;
+ }
+ } else {
+ const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+ for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
+ for (int col = 0; col < ctxt->overlap; ++col) {
+ const uint8_t m0 = mask1d[col];
+ const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+ wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+ (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+ mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+ }
+ wsrc += bw;
+ mask += bw;
+ tmp16 += ctxt->tmp_stride;
+ }
+ }
+}
+
+// This function has a structure similar to av1_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+// PObmc(x,y) =
+// AOM_BLEND_A64(Mh(x),
+// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+// PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
+// Mh(x) * Mv(y) * P(x,y) +
+// Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where :
+//
+// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+// wsrc(x, y) =
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+// Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+// error(x, y) =
+// wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
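+// Worked example (illustrative only; it assumes AOM_BLEND_A64_MAX_ALPHA is
+// 64, as in aom_dsp/blend.h): if at some pixel Mh(x) = 48 and Mv(y) = 40,
+// then mask(x, y) = 48 * 40 = 1920, which weights a candidate predictor by
+// 1920 / 4096 ~= 0.47, so P(x, y) = 100 contributes about 47 to the blended
+// value. The residual against wsrc(x, y) can therefore be evaluated for any
+// candidate P without re-blending the above/left predictors each time.
+//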
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, int mi_row,
+ int mi_col, const uint8_t *above,
+ int above_stride, const uint8_t *left,
+ int left_stride) {
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ const int bw = xd->n4_w << MI_SIZE_LOG2;
+ const int bh = xd->n4_h << MI_SIZE_LOG2;
+ int32_t *mask_buf = x->mask_buf;
+ int32_t *wsrc_buf = x->wsrc_buf;
+
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+
+ // plane 0 should not be subsampled
+ assert(xd->plane[0].subsampling_x == 0);
+ assert(xd->plane[0].subsampling_y == 0);
+
+ av1_zero_array(wsrc_buf, bw * bh);
+ for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+ // handle above row
+ if (xd->up_available) {
+ const int overlap =
+ AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+ struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
+ overlap };
+ foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ calc_target_weighted_pred_above, &ctxt);
+ }
+
+ for (int i = 0; i < bw * bh; ++i) {
+ wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+ }
+
+ // handle left column
+ if (xd->left_available) {
+ const int overlap =
+ AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+ struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
+ overlap };
+ foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ calc_target_weighted_pred_left, &ctxt);
+ }
+
+ if (!is_hbd) {
+ const uint8_t *src = x->plane[0].src.buf;
+
+ for (int row = 0; row < bh; ++row) {
+ for (int col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += bw;
+ src += x->plane[0].src.stride;
+ }
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+ for (int row = 0; row < bh; ++row) {
+ for (int col = 0; col < bw; ++col) {
+ wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+ }
+ wsrc_buf += bw;
+ src += x->plane[0].src.stride;
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
new file mode 100644
index 000000000..4c11f90b8
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_REF_MV_SERCH 3
+#define DEFAULT_LUMA_INTERP_SKIP_FLAG 1
+#define DEFAULT_CHROMA_INTERP_SKIP_FLAG 2
+#define DEFAULT_INTERP_SKIP_FLAG \
+ (DEFAULT_LUMA_INTERP_SKIP_FLAG | DEFAULT_CHROMA_INTERP_SKIP_FLAG)
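+// Note: with the values above DEFAULT_INTERP_SKIP_FLAG == (1 | 2) == 3,
+// i.e. the luma and chroma skip bits are both set in the default mask.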
+
+struct TileInfo;
+struct macroblock;
+struct RD_STATS;
+
+#if CONFIG_RD_DEBUG
+static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+ TX_SIZE tx_size, int blk_row,
+ int blk_col, int txb_coeff_cost) {
+ (void)blk_row;
+ (void)blk_col;
+ (void)tx_size;
+ rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+
+ {
+ const int txb_h = tx_size_high_unit[tx_size];
+ const int txb_w = tx_size_wide_unit[tx_size];
+ int idx, idy;
+ for (idy = 0; idy < txb_h; ++idy)
+ for (idx = 0; idx < txb_w; ++idx)
+ rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0;
+
+ rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost;
+ }
+ assert(blk_row < TXB_COEFF_COST_MAP_SIZE);
+ assert(blk_col < TXB_COEFF_COST_MAP_SIZE);
+}
+#endif
+
+// Returns the number of colors in 'src'.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count);
+// Same as av1_count_colors(), but for high-bitdepth mode.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+ int bit_depth, int *val_count);
+
+#if CONFIG_DIST_8X8
+int64_t av1_dist_8x8(const struct AV1_COMP *const cpi, const MACROBLOCK *x,
+ const uint8_t *src, int src_stride, const uint8_t *dst,
+ int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
+ int bsh, int visible_w, int visible_h, int qindex);
+#endif
+
+static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
+ int plane, TX_SIZE tx_size) {
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const LV_MAP_COEFF_COST *const coeff_costs =
+ &x->coeff_costs[txs_ctx][plane_type];
+ return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+}
+
+static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
+ int plane, int block, TX_SIZE tx_size,
+ const TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx,
+ int use_fast_coef_costing) {
+#if TXCOEFF_COST_TIMER
+ struct aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+#endif
+ (void)use_fast_coef_costing;
+ const int cost =
+ av1_cost_coeffs_txb(cm, x, plane, block, tx_size, tx_type, txb_ctx);
+#if TXCOEFF_COST_TIMER
+  AV1_COMMON *tmp_cm = (AV1_COMMON *)cm;  // cast away const for the counters
+ aom_usec_timer_mark(&timer);
+ const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+ tmp_cm->txcoeff_cost_timer += elapsed_time;
+ ++tmp_cm->txcoeff_cost_count;
+#endif
+ return cost;
+}
+
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd);
+
+unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs);
+unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd);
+
+void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+ const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 000000000..23d920fc3
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/obmc.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void calc_subpel_params(
+ MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+ int plane, const int pre_x, const int pre_y, int x, int y,
+ struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
+ int bw, int bh) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int is_scaled = av1_is_scaled(sf);
+ if (is_scaled) {
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
+ orig_pos_y += mv.row * (1 << (1 - ssy));
+ int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+ subpel_params->xs = sf->x_step_q4;
+ subpel_params->ys = sf->y_step_q4;
+ } else {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+ subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+ (x + (mv_q4.col >> SUBPEL_BITS));
+ }
+}
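+// Sketch of the unscaled path above (for illustration only): an mv component
+// v splits into an integer pixel offset (v >> SUBPEL_BITS) that moves the
+// source pointer and a fractional phase (v & SUBPEL_MASK) that is shifted
+// left by SCALE_EXTRA_BITS so it lands on the same fixed-point grid as the
+// scaled path.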
+
+static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh,
+ int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int is_compound = has_second_ref(mi);
+ int ref;
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ int is_global[2] = { 0, 0 };
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+ (block_size_high[bsize] < 8 && ss_y);
+
+ if (is_intrabc) sub8x8_inter = 0;
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ sub8x8_inter = sub8x8_inter && !build_for_obmc;
+ if (sub8x8_inter) {
+ for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+ if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+ }
+ }
+ }
+
+ if (sub8x8_inter) {
+ // block size
+ const int b4_w = block_size_wide[bsize] >> ss_x;
+ const int b4_h = block_size_high[bsize] >> ss_y;
+ const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+ const int b8_h = block_size_high[plane_bsize] >> ss_y;
+ assert(!is_compound);
+
+ const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
+
+ int row = row_start;
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ is_compound = has_second_ref(this_mbmi);
+ int tmp_dst_stride = 8;
+ assert(bw < 8 || bh < 8);
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ ref = 0;
+ const RefBuffer *ref_buf =
+ &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+ pd->pre[ref].buf0 =
+ (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
+ pd->pre[ref].buf =
+ pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+ ref_buf->buf->uv_stride,
+ &ref_buf->sf);
+ pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+ pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+ pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+ const MV mv = this_mbmi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+ &subpel_params, bw, bh);
+ conv_params.do_average = ref;
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ // masked compound type has its own average mechanism
+ conv_params.do_average = 0;
+ }
+
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
+ b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+ (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+ plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+
+ ++col;
+ }
+ ++row;
+ }
+
+ for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
+ return;
+ }
+
+ {
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+ av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+ &conv_params.bck_offset,
+ &conv_params.use_jnt_comp_avg, is_compound);
+
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+ const MV mv = mi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
+ &subpel_params, bw, bh);
+
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
+ // masked compound type has its own average mechanism
+ conv_params.do_average = 0;
+ av1_make_masked_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+ bh, &conv_params, mi->interp_filters, plane, &warp_types,
+ mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
+ cm->allow_warped_motion);
+ } else {
+ conv_params.do_average = ref;
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+ bh, &conv_params, mi->interp_filters, &warp_types,
+ mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
+ mi, build_for_obmc, xd, cm->allow_warped_motion);
+ }
+ }
+ }
+}
+
+static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int mi_row, int mi_col,
+ int plane_from, int plane_to) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = pd->width;
+ const int bh = pd->height;
+
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+
+ build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
+ }
+}
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0);
+}
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) {
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize,
+ plane_idx);
+ }
+}
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize, int plane_idx) {
+ build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx,
+ plane_idx);
+
+ if (is_interintra_pred(xd->mi[0])) {
+ BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
+ if (!ctx) {
+ default_ctx.plane[plane_idx] = xd->plane[plane_idx].dst.buf;
+ default_ctx.stride[plane_idx] = xd->plane[plane_idx].dst.stride;
+ ctx = &default_ctx;
+ }
+ av1_build_interintra_predictors_sbp(cm, xd, xd->plane[plane_idx].dst.buf,
+ xd->plane[plane_idx].dst.stride, ctx,
+ plane_idx, bsize);
+ }
+}
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
+ const int num_planes = av1_num_planes(cm);
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+ if (num_planes > 1)
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+}
+
+// TODO(sarahparker):
+// av1_build_inter_predictor should be combined with
+// av1_make_inter_predictor
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd, int can_use_previous) {
+ const int is_q4 = precision == MV_PRECISION_Q4;
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ MV32 mv = av1_scale_mv(&mv_q4, x, y, sf);
+ mv.col += SCALE_EXTRA_OFF;
+ mv.row += SCALE_EXTRA_OFF;
+
+ const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+ mv.col & SCALE_SUBPEL_MASK,
+ mv.row & SCALE_SUBPEL_MASK };
+ src += (mv.row >> SCALE_SUBPEL_BITS) * src_stride +
+ (mv.col >> SCALE_SUBPEL_BITS);
+
+ av1_make_inter_predictor(src, src_stride, dst, dst_stride, &subpel_params, sf,
+ w, h, conv_params, interp_filters, warp_types, p_col,
+ p_row, plane, ref, xd->mi[0], 0, xd,
+ can_use_previous);
+}
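+// Note on the precision handling above: when precision is not
+// MV_PRECISION_Q4 the row/col components are doubled, so av1_scale_mv()
+// always receives the vector at Q4 precision before SCALE_EXTRA_OFF is added.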
+
+static INLINE void build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int above_mi_col = ctxt->mi_col + rel_mi_col;
+ int mi_x, mi_y;
+ MB_MODE_INFO backup_mbmi = *above_mbmi;
+
+ av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+ above_mbmi, ctxt, num_planes);
+ mi_x = above_mi_col << MI_SIZE_LOG2;
+ mi_y = ctxt->mi_row << MI_SIZE_LOG2;
+
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+ int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4,
+ block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1));
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+ build_inter_predictors(ctxt->cm, xd, j, above_mbmi, 1, bw, bh, mi_x, mi_y);
+ }
+ *above_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->up_available) return;
+
+ // Adjust mb_to_bottom_edge to have the correct value for the OBMC
+ // prediction block. This is half the height of the original block,
+  // except for blocks taller than 64, where we only use a height of 32.
+ int this_height = xd->n4_h * MI_SIZE;
+ int pred_height = AOMMIN(this_height / 2, 32);
+ xd->mb_to_bottom_edge += (this_height - pred_height) * 8;
+
+ struct build_prediction_ctxt ctxt = { cm, mi_row,
+ mi_col, tmp_buf,
+ tmp_width, tmp_height,
+ tmp_stride, xd->mb_to_right_edge };
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ foreach_overlappable_nb_above(cm, xd, mi_col,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ build_prediction_by_above_pred, &ctxt);
+
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+ xd->mb_to_bottom_edge -= (this_height - pred_height) * 8;
+}
+
+static INLINE void build_prediction_by_left_pred(
+ MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+ struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt;
+ const int left_mi_row = ctxt->mi_row + rel_mi_row;
+ int mi_x, mi_y;
+ MB_MODE_INFO backup_mbmi = *left_mbmi;
+
+ av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+ left_mbmi, ctxt, num_planes);
+ mi_x = ctxt->mi_col << MI_SIZE_LOG2;
+ mi_y = left_mi_row << MI_SIZE_LOG2;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ for (int j = 0; j < num_planes; ++j) {
+ const struct macroblockd_plane *pd = &xd->plane[j];
+ int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4,
+ block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1));
+ int bh = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+ if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+ build_inter_predictors(ctxt->cm, xd, j, left_mbmi, 1, bw, bh, mi_x, mi_y);
+ }
+ *left_mbmi = backup_mbmi;
+}
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
+ if (!xd->left_available) return;
+
+ // Adjust mb_to_right_edge to have the correct value for the OBMC
+ // prediction block. This is half the width of the original block,
+ // except for 128-wide blocks, where we only use a width of 32.
+ int this_width = xd->n4_w * MI_SIZE;
+ int pred_width = AOMMIN(this_width / 2, 32);
+ xd->mb_to_right_edge += (this_width - pred_width) * 8;
+
+ struct build_prediction_ctxt ctxt = { cm, mi_row,
+ mi_col, tmp_buf,
+ tmp_width, tmp_height,
+ tmp_stride, xd->mb_to_bottom_edge };
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ foreach_overlappable_nb_left(cm, xd, mi_row,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ build_prediction_by_left_pred, &ctxt);
+
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_right_edge -= (this_width - pred_width) * 8;
+ xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+}
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+ const int num_planes = av1_num_planes(cm);
+ uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
+ dst_buf1[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len);
+ dst_buf1[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]);
+ dst_buf2[1] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len);
+ dst_buf2[2] =
+ CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
+ } else {
+ dst_buf1[0] = xd->tmp_obmc_bufs[0];
+ dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
+ dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
+ dst_buf2[0] = xd->tmp_obmc_bufs[1];
+ dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
+ dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
+ }
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+ dst_width1, dst_height1, dst_stride1);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+ dst_width2, dst_height2, dst_stride2);
+ av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+ mi_row, mi_col, 0, num_planes);
+ av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
+ dst_buf2, dst_stride2);
+}
+
+// Builds the inter-predictor for the single ref case
+// for use in the encoder to search the wedges efficiently.
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
+ int bw, int bh, int x, int y,
+ int w, int h, int mi_x, int mi_y,
+ int ref, uint8_t *const ext_dst,
+ int ext_dst_stride,
+ int can_use_previous) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MB_MODE_INFO *mi = xd->mi[0];
+
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
+ const MV mv = mi->mv[ref].as_mv;
+
+ ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+ WarpTypesAllowed warp_types;
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype);
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+ const int pre_x = (mi_x) >> pd->subsampling_x;
+ const int pre_y = (mi_y) >> pd->subsampling_y;
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+ &subpel_params, bw, bh);
+
+ av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
+ &subpel_params, sf, w, h, &conv_params,
+ mi->interp_filters, &warp_types, pre_x + x,
+ pre_y + y, plane, ref, mi, 0, xd, can_use_previous);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+ int can_use_previous) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_inter_predictors_single_buf(xd, plane, bw, bh, 0, 0, bw, bh, mi_x,
+ mi_y, ref, ext_dst[plane],
+ ext_dst_stride[plane], can_use_previous);
+ }
+}
+
+static void build_masked_compound(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, block_size_wide[sb_type], w, h, subw, subh);
+}
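+// Example of the subsampling derivation above (values assumed for
+// illustration): for sb_type == BLOCK_8X8, mi_size_high_log2[] is 1, so
+// (2 << 1) == 4 matches h only when the plane height was halved by chroma
+// subsampling, giving subh == 1; an unsubsampled plane (h == 8) gives 0.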
+
+static void build_masked_compound_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+ int w, int bd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+ const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+ const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
+ // const uint8_t *mask =
+ // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, block_size_wide[sb_type], w, h,
+ subw, subh, bd);
+}
+
+static void build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+ int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ mbmi->interinter_comp.seg_mask = xd->seg_mask;
+ const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+
+ if (is_compound && is_masked_compound_type(comp_data->type)) {
+ if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ av1_build_compound_diffwtd_mask_highbd(
+ comp_data->seg_mask, comp_data->mask_type,
+ CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
+ else
+ av1_build_compound_diffwtd_mask(
+ comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+ ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+ }
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
+ mbmi->sb_type, h, w, xd->bd);
+ else
+ build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
+ ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+ h, w);
+ } else {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
+ xd->bd);
+ else
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
+ 0, NULL, 0, w, h);
+ }
+}
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]) {
+ int plane;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
+ ext_dst1[plane], ext_dst_stride1[plane]);
+ }
+}
diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h
new file mode 100644
index 000000000..10d5e8c28
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_
+#define AOM_AV1_ENCODER_RECONINTER_ENC_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/filter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize, int plane_idx);
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd, int can_use_previous);
+
+// Detect whether the block has a sub-pixel motion vector in the given
+// component (dir selects the reference frame and the row/col component).
+#define CHECK_SUBPEL 0
+static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
+ const MACROBLOCKD *const xd,
+ int dir) {
+#if CHECK_SUBPEL
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int plane;
+ int ref = (dir >> 1);
+
+ if (dir & 0x01) {
+ if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
+ } else {
+ if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
+ }
+
+ return 0;
+#else
+ (void)mbmi;
+ (void)xd;
+ (void)dir;
+ return 1;
+#endif
+}
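+// Note: with CHECK_SUBPEL defined to 0 above, has_subpel_mv_component()
+// unconditionally returns 1, so av1_is_interp_search_needed() below reports
+// that a filter search is needed for every block it is asked about.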
+
+static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
+ MB_MODE_INFO *const mi = xd->mi[0];
+ const int is_compound = has_second_ref(mi);
+ int ref;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ int row_col;
+ for (row_col = 0; row_col < 2; ++row_col) {
+ const int dir = (ref << 1) + row_col;
+ if (has_subpel_mv_component(mi, xd, dir)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+ int can_use_previous);
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
new file mode 100644
index 000000000..2e9102745
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+
+void av1_enable_segmentation(struct segmentation *seg) {
+ seg->enabled = 1;
+ seg->update_map = 1;
+ seg->update_data = 1;
+ seg->temporal_update = 0;
+}
+
+void av1_disable_segmentation(struct segmentation *seg) {
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+ seg->temporal_update = 0;
+}
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] &= ~(1 << feature_id);
+}
+
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_data[segment_id][feature_id] = 0;
+}
+
+static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MB_MODE_INFO **mi,
+ unsigned *no_pred_segcounts,
+ unsigned (*temporal_predictor_count)[2],
+ unsigned *t_unpred_seg_counts, int bw, int bh,
+ int mi_row, int mi_col) {
+ int segment_id;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ xd->mi = mi;
+ segment_id = xd->mi[0]->segment_id;
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ // Count the number of hits on each segment with no prediction
+ no_pred_segcounts[segment_id]++;
+
+ // Temporal prediction not allowed on key frames
+ if (cm->frame_type != KEY_FRAME) {
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ // Test to see if the segment id matches the predicted value.
+ const int pred_segment_id =
+ cm->last_frame_seg_map
+ ? get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col)
+ : 0;
+ const int pred_flag = pred_segment_id == segment_id;
+ const int pred_context = av1_get_pred_context_seg_id(xd);
+
+ // Store the prediction status for this mb and update counts
+ // as appropriate
+ xd->mi[0]->seg_id_predicted = pred_flag;
+ temporal_predictor_count[pred_context][pred_flag]++;
+
+ // Update the "unpredicted" segment count
+ if (!pred_flag) t_unpred_seg_counts[segment_id]++;
+ }
+}
+
+static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MB_MODE_INFO **mi,
+ unsigned *no_pred_segcounts,
+ unsigned (*temporal_predictor_count)[2],
+ unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int mis = cm->mi_stride;
+ const int bs = mi_size_wide[bsize], hbs = bs / 2;
+ PARTITION_TYPE partition;
+ const int qbs = bs / 4;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+#define CSEGS(cs_bw, cs_bh, cs_rowoff, cs_coloff) \
+ count_segs(cm, xd, tile, mi + mis * (cs_rowoff) + (cs_coloff), \
+ no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \
+ (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff));
+
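+  // Example expansion of the macro above (illustrative): for PARTITION_HORZ,
+  // CSEGS(bs, hbs, 0, 0) counts the top half and CSEGS(bs, hbs, hbs, 0) the
+  // bottom half, i.e. full-width, half-height regions offset by hbs rows in
+  // both the mi grid and the mi_row coordinate.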
+ if (bsize == BLOCK_8X8)
+ partition = PARTITION_NONE;
+ else
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ switch (partition) {
+ case PARTITION_NONE: CSEGS(bs, bs, 0, 0); break;
+ case PARTITION_HORZ:
+ CSEGS(bs, hbs, 0, 0);
+ CSEGS(bs, hbs, hbs, 0);
+ break;
+ case PARTITION_VERT:
+ CSEGS(hbs, bs, 0, 0);
+ CSEGS(hbs, bs, 0, hbs);
+ break;
+ case PARTITION_HORZ_A:
+ CSEGS(hbs, hbs, 0, 0);
+ CSEGS(hbs, hbs, 0, hbs);
+ CSEGS(bs, hbs, hbs, 0);
+ break;
+ case PARTITION_HORZ_B:
+ CSEGS(bs, hbs, 0, 0);
+ CSEGS(hbs, hbs, hbs, 0);
+ CSEGS(hbs, hbs, hbs, hbs);
+ break;
+ case PARTITION_VERT_A:
+ CSEGS(hbs, hbs, 0, 0);
+ CSEGS(hbs, hbs, hbs, 0);
+ CSEGS(hbs, bs, 0, hbs);
+ break;
+ case PARTITION_VERT_B:
+ CSEGS(hbs, bs, 0, 0);
+ CSEGS(hbs, hbs, 0, hbs);
+ CSEGS(hbs, hbs, hbs, hbs);
+ break;
+ case PARTITION_HORZ_4:
+ CSEGS(bs, qbs, 0, 0);
+ CSEGS(bs, qbs, qbs, 0);
+ CSEGS(bs, qbs, 2 * qbs, 0);
+ if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);
+ break;
+
+ case PARTITION_VERT_4:
+ CSEGS(qbs, bs, 0, 0);
+ CSEGS(qbs, bs, 0, qbs);
+ CSEGS(qbs, bs, 0, 2 * qbs);
+ if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);
+ break;
+
+ case PARTITION_SPLIT: {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ int n;
+
+ for (n = 0; n < 4; n++) {
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
+
+ count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
+ }
+ } break;
+ default: assert(0);
+ }
+
+#undef CSEGS
+}
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) {
+ struct segmentation *seg = &cm->seg;
+ struct segmentation_probs *segp = &cm->fc->seg;
+ int no_pred_cost;
+ int t_pred_cost = INT_MAX;
+ int tile_col, tile_row, mi_row, mi_col;
+ unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } };
+ unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
+ unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
+ (void)xd;
+
+ // First of all generate stats regarding how well the last segment map
+ // predicts this one
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+ MB_MODE_INFO **mi_ptr;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+ mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
+ tile_info.mi_col_start;
+ for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->seq_params.mib_size,
+ mi_ptr += cm->seq_params.mib_size * cm->mi_stride) {
+ MB_MODE_INFO **mi = mi_ptr;
+ for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->seq_params.mib_size, mi += cm->seq_params.mib_size) {
+ count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, mi_row,
+ mi_col, cm->seq_params.sb_size);
+ }
+ }
+ }
+ }
+
+ int seg_id_cost[MAX_SEGMENTS];
+ av1_cost_tokens_from_cdf(seg_id_cost, segp->tree_cdf, NULL);
+ no_pred_cost = 0;
+ for (int i = 0; i < MAX_SEGMENTS; ++i)
+ no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i];
+
+ // Frames without past dependency cannot use temporal prediction
+ if (cm->primary_ref_frame != PRIMARY_REF_NONE) {
+ int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2];
+ for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i)
+ av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL);
+ t_pred_cost = 0;
+ // Cost for signaling the prediction flag.
+ for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) {
+ for (int j = 0; j < 2; ++j)
+ t_pred_cost += temporal_predictor_count[i][j] * pred_flag_cost[i][j];
+ }
+ // Cost for signaling the unpredicted segment id.
+ for (int i = 0; i < MAX_SEGMENTS; ++i)
+ t_pred_cost += t_unpred_seg_counts[i] * seg_id_cost[i];
+ }
+
+ // Now choose which coding method to use.
+ if (t_pred_cost < no_pred_cost) {
+ assert(!cm->error_resilient_mode);
+ seg->temporal_update = 1;
+ } else {
+ seg->temporal_update = 0;
+ }
+}
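+// Illustrative cost comparison (numbers made up): if coding every segment id
+// explicitly would cost no_pred_cost = 1200 units while signaling the
+// prediction flags plus the mispredicted ids costs t_pred_cost = 950 units,
+// the function above selects temporal_update = 1 for this frame.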
+
+void av1_reset_segment_features(AV1_COMMON *cm) {
+ struct segmentation *seg = &cm->seg;
+
+ // Set up default state for MB feature flags
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+ av1_clearall_segfeatures(seg);
+}
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
new file mode 100644
index 000000000..1ad13d66a
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_enable_segmentation(struct segmentation *seg);
+void av1_disable_segmentation(struct segmentation *seg);
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_reset_segment_features(AV1_COMMON *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 000000000..4c35baae0
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,564 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+// Setting this to 1 will disable trellis optimization completely.
+// Setting this to 2 will disable trellis optimization within the
+// transform search. Trellis optimization will still be applied
+// in the final encode.
+#define DISABLE_TRELLISQ_SEARCH 0
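+// With the value 0 used here, trellis optimization stays enabled in both the
+// transform search and the final encode.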
+
+#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+static MESH_PATTERN
+ good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = {
+ 50, 50, 25, 15, 5, 1
+};
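+// Reading the tables above (field meanings assumed from their use elsewhere
+// in the encoder): each MESH_PATTERN entry appears to pair a search range
+// with a step interval, so e.g. { 64, 8 } sweeps roughly +/-64 pixels in
+// steps of 8, and good_quality_max_mesh_pct[] caps how often mesh search is
+// attempted, shrinking from 50% at speed 0 to 1% at speed 5.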
+
+// TODO(huisu@google.com): These settings are pretty relaxed, tune them for
+// each speed setting
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};
+static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100,
+ 25, 25, 10 };
+
+// Intra only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher than ambient quality
+static int frame_is_boosted(const AV1_COMP *cpi) {
+ return frame_is_kf_gf_arf(cpi);
+}
+
+// Sets a partition size down to which the auto partition code will always
+// search (can go lower), based on the image dimensions. The logic here
+// is that how offensive ringing artefacts are depends partly on the screen
+// area over which they propagate. Propagation is limited by transform block
+// size, but the screen area taken up by a given block size will be larger
+// for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
+ unsigned int screen_area = (cm->width * cm->height);
+
+ // Select block size based on image format size.
+ if (screen_area < 1280 * 720) {
+ // Formats smaller in area than 720P
+ return BLOCK_4X4;
+ } else if (screen_area < 1920 * 1080) {
+ // Format >= 720P and < 1080P
+ return BLOCK_8X8;
+ } else {
+ // Formats 1080P and up
+ return BLOCK_16X16;
+ }
+}
+
+// Do we have an internal image edge (e.g. formatting bars).
+static int has_internal_image_edge(const AV1_COMP *cpi) {
+ return (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+ (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
+static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+
+ if (is_480p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
+ } else {
+ sf->use_square_partition_only_threshold = BLOCK_64X64;
+ }
+
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
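+    // A threshold of -1 leaves the ML based breakout disabled for that block
+    // size (matching the default initialisation in
+    // av1_set_speed_features_framesize_independent()).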
+ }
+
+ if (speed >= 1) {
+ if (is_720p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
+ sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+ }
+
+ if (speed >= 2) {
+ if (is_720p_or_larger) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ sf->partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ sf->partition_search_breakout_dist_thr = (1 << 22);
+ sf->partition_search_breakout_rate_thr = 100;
+ }
+ sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+ }
+
+ if (speed >= 3) {
+ if (is_720p_or_larger) {
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ sf->partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->max_intra_bsize = BLOCK_32X32;
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->partition_search_breakout_rate_thr = 120;
+ }
+ }
+
+ // If this is a two pass clip that fits the criteria for animated or
+ // graphics content then reset disable_split_mask for speeds 2+.
+ // Also if the image edge is internal to the coded area.
+ if ((speed >= 2) && (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ (has_internal_image_edge(cpi)))) {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ }
+
+ if (speed >= 4) {
+ if (is_720p_or_larger) {
+ sf->partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ }
+}
+
+static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+
+  // Speed 0: enable all speed features that give a neutral coding
+  // performance change.
+ sf->reduce_inter_modes = 1;
+ sf->prune_ext_partition_types_search_level = 1;
+ sf->ml_prune_rect_partition = 1;
+ sf->ml_prune_ab_partition = 1;
+ sf->ml_prune_4_partition = 1;
+ sf->adaptive_txb_search_level = 1;
+ sf->jnt_comp_skip_mv_search = 1;
+ sf->model_based_prune_tx_search_level = 1;
+ sf->model_based_post_interp_filter_breakout = 1;
+ sf->inter_mode_rd_model_estimation = 1;
+ sf->prune_ref_frame_for_rect_partitions =
+ !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
+ sf->less_rectangular_check_level = 1;
+ sf->gm_search_type = GM_REDUCED_REF_SEARCH;
+ sf->gm_disable_recode = 1;
+
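+  // The blocks below are cumulative: each speed level inherits everything
+  // enabled at the previous level and then overrides or tightens it further.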
+ if (speed >= 1) {
+ sf->gm_erroradv_type = GM_ERRORADV_TR_1;
+ sf->selective_ref_frame = 1;
+ sf->inter_tx_size_search_init_depth_rect = 1;
+ sf->inter_tx_size_search_init_depth_sqr = 1;
+ sf->intra_tx_size_search_init_depth_rect = 1;
+ sf->intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_size_search_lgr_block = 1;
+ if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) {
+ sf->two_pass_partition_search = 1;
+ sf->mode_pruning_based_on_two_pass_partition_search = 1;
+ }
+ sf->prune_ext_partition_types_search_level = 2;
+ sf->use_fast_interpolation_filter_search = 1;
+ sf->skip_repeat_interpolation_filter_search = 1;
+ sf->tx_type_search.skip_tx_search = 1;
+ sf->tx_type_search.ml_tx_split_thresh = 40;
+ sf->model_based_prune_tx_search_level = 0;
+ sf->model_based_post_interp_filter_breakout = 0;
+ // TODO(angiebird): Re-evaluate the impact of inter_mode_rd_model_estimation
+ // on speed 1
+ sf->inter_mode_rd_model_estimation = 0;
+ sf->adaptive_txb_search_level = 2;
+ sf->use_intra_txb_hash = 1;
+ sf->optimize_b_precheck = 1;
+ sf->dual_sgr_penalty_level = 1;
+ sf->use_accurate_subpel_search = 1;
+ sf->reuse_inter_intra_mode = 1;
+ sf->prune_comp_search_by_single_result = 1;
+ sf->skip_repeated_newmv = 1;
+ sf->obmc_full_pixel_search_level = 1;
+ }
+
+ if (speed >= 2) {
+ sf->gm_erroradv_type = GM_ERRORADV_TR_2;
+
+ sf->selective_ref_frame = 2;
+ sf->fast_cdef_search = 1;
+
+ sf->adaptive_rd_thresh = 1;
+ sf->mv.auto_mv_step_size = 1;
+ sf->mv.subpel_iters_per_step = 1;
+ sf->disable_filter_search_var_thresh = 100;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+
+ sf->partition_search_breakout_rate_thr = 80;
+    // Note: This speed feature is disabled as it seems to be worse in
+ // compression/quality and is also slower.
+ // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ sf->allow_partition_search_skip = 1;
+ sf->disable_wedge_search_var_thresh = 100;
+ sf->fast_wedge_sign_estimate = 1;
+ }
+
+ if (speed >= 3) {
+ sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
+ sf->less_rectangular_check_level = 2;
+ sf->adaptive_pred_interp_filter = 1;
+ // adaptive_motion_search breaks encoder multi-thread tests.
+ // The values in x->pred_mv[] differ for single and multi-thread cases.
+ // See aomedia:1778.
+ // sf->adaptive_motion_search = 1;
+ sf->recode_loop = ALLOW_RECODE_KFARFGF;
+ sf->use_transform_domain_distortion = 1;
+ sf->use_accurate_subpel_search = 0;
+ sf->adaptive_rd_thresh = 2;
+ sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
+ sf->gm_search_type = GM_DISABLE_SEARCH;
+ sf->prune_comp_search_by_single_result = 2;
+ }
+
+ if (speed >= 4) {
+ sf->tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_type_search.fast_inter_tx_type_search = 1;
+ sf->use_square_partition_only_threshold =
+ boosted ? BLOCK_128X128 : BLOCK_4X4;
+ sf->tx_size_search_method =
+ frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 1;
+ sf->cb_partition_search = !boosted;
+ sf->alt_ref_search_fp = 1;
+ }
+
+ if (speed >= 5) {
+ sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+ sf->use_square_partition_only_threshold = BLOCK_4X4;
+ sf->tx_size_search_method = USE_LARGESTALL;
+ sf->mv.search_method = BIGDIA;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_search_skip_flags =
+ (cm->frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+ FLAG_EARLY_TERMINATE;
+ sf->disable_filter_search_var_thresh = 200;
+ sf->use_fast_coef_costing = 1;
+ sf->partition_search_breakout_rate_thr = 300;
+ sf->use_transform_domain_distortion = 2;
+ }
+
+ if (speed >= 6) {
+ int i;
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+ sf->mv.search_method = HEX;
+ sf->disable_filter_search_var_thresh = 500;
+ for (i = 0; i < TX_SIZES; ++i) {
+ sf->intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+ }
+ sf->partition_search_breakout_rate_thr = 500;
+ sf->mv.reduce_first_step_size = 1;
+ sf->simple_model_rd_from_var = 1;
+ }
+ if (speed >= 7) {
+ sf->default_max_partition_size = BLOCK_32X32;
+ sf->default_min_partition_size = BLOCK_8X8;
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->frame_parameter_update = 0;
+ sf->mv.search_method = FAST_HEX;
+ sf->partition_search_type = REFERENCE_PARTITION;
+ sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ }
+ if (speed >= 8) {
+ sf->mv.search_method = FAST_DIAMOND;
+ sf->mv.subpel_force_stop = 2;
+ sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+ }
+}
+
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+
+ if (oxcf->mode == GOOD) {
+ set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+ }
+
+ if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
+ sf->adaptive_pred_interp_filter = 0;
+ }
+
+ // Check for masked out split cases.
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (sf->disable_split_mask & (1 << i)) {
+ rd->thresh_mult_sub8x8[i] = INT_MAX;
+ }
+ }
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+}
+
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCK *const x = &cpi->td.mb;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int i;
+
+ // best quality defaults
+ sf->frame_parameter_update = 1;
+ sf->mv.search_method = NSTEP;
+ sf->recode_loop = ALLOW_RECODE;
+ sf->mv.subpel_search_method = SUBPEL_TREE;
+ sf->mv.subpel_iters_per_step = 2;
+ sf->mv.subpel_force_stop = 0;
+#if DISABLE_TRELLISQ_SEARCH == 2
+ sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+ ? FINAL_PASS_TRELLIS_OPT
+ : NO_TRELLIS_OPT;
+#elif DISABLE_TRELLISQ_SEARCH == 1
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+#else
+ if (is_lossless_requested(&cpi->oxcf))
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+ else
+ sf->optimize_coefficients = FULL_TRELLIS_OPT;
+#endif // DISABLE_TRELLISQ_SEARCH
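+  // With the default DISABLE_TRELLISQ_SEARCH of 0, this selects
+  // FULL_TRELLIS_OPT unless lossless coding was requested, in which case
+  // trellis optimization is disabled.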
+ sf->gm_erroradv_type = GM_ERRORADV_TR_0;
+ sf->mv.reduce_first_step_size = 0;
+ sf->mv.auto_mv_step_size = 0;
+ sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+ sf->adaptive_rd_thresh = 0;
+ sf->tx_size_search_method = USE_FULL_RD;
+ sf->inter_tx_size_search_init_depth_sqr = 0;
+ sf->inter_tx_size_search_init_depth_rect = 0;
+ sf->intra_tx_size_search_init_depth_rect = 0;
+ sf->intra_tx_size_search_init_depth_sqr = 0;
+ sf->tx_size_search_lgr_block = 0;
+ sf->model_based_prune_tx_search_level = 0;
+ sf->model_based_post_interp_filter_breakout = 0;
+ sf->reduce_inter_modes = 0;
+ sf->selective_ref_gm = 1;
+ sf->adaptive_motion_search = 0;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 0;
+ sf->cb_partition_search = 0;
+ sf->alt_ref_search_fp = 0;
+ sf->partition_search_type = SEARCH_PARTITION;
+ sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
+ sf->tx_type_search.ml_tx_split_thresh = 30;
+ sf->tx_type_search.use_skip_flag_prediction = 1;
+ sf->tx_type_search.fast_intra_tx_type_search = 0;
+ sf->tx_type_search.fast_inter_tx_type_search = 0;
+ sf->tx_type_search.skip_tx_search = 0;
+ sf->selective_ref_frame = 0;
+ sf->less_rectangular_check_level = 0;
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
+ sf->prune_ref_frame_for_rect_partitions = 0;
+ sf->auto_min_max_partition_size = NOT_IN_USE;
+ sf->rd_auto_partition_min_limit = BLOCK_4X4;
+ sf->default_max_partition_size = BLOCK_LARGEST;
+ sf->default_min_partition_size = BLOCK_4X4;
+ sf->adjust_partitioning_from_last_frame = 0;
+ sf->disable_split_mask = 0;
+ sf->mode_search_skip_flags = 0;
+ sf->disable_filter_search_var_thresh = 0;
+ sf->allow_partition_search_skip = 0;
+ sf->use_accurate_subpel_search = 2;
+ sf->disable_wedge_search_var_thresh = 0;
+ sf->fast_wedge_sign_estimate = 0;
+ sf->drop_ref = 0;
+ sf->skip_intra_in_interframe = 1;
+ sf->txb_split_cap = 1;
+ sf->adaptive_txb_search_level = 0;
+ sf->two_pass_partition_search = 0;
+ sf->mode_pruning_based_on_two_pass_partition_search = 0;
+ sf->use_intra_txb_hash = 0;
+ sf->use_inter_txb_hash = 1;
+ sf->use_mb_rd_hash = 1;
+ sf->optimize_b_precheck = 0;
+ sf->jnt_comp_fast_tx_search = 0;
+ sf->jnt_comp_skip_mv_search = 0;
+ sf->reuse_inter_intra_mode = 0;
+
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_ALL;
+ sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
+ }
+ sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+ sf->use_fast_coef_costing = 0;
+ sf->max_intra_bsize = BLOCK_LARGEST;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ sf->always_this_block_size = BLOCK_16X16;
+ // Recode loop tolerance %.
+ sf->recode_tolerance = 25;
+ sf->partition_search_breakout_dist_thr = 0;
+ sf->partition_search_breakout_rate_thr = 0;
+ sf->simple_model_rd_from_var = 0;
+ sf->prune_ext_partition_types_search_level = 0;
+ sf->ml_prune_rect_partition = 0;
+ sf->ml_prune_ab_partition = 0;
+ sf->ml_prune_4_partition = 0;
+ sf->fast_cdef_search = 0;
+ for (i = 0; i < PARTITION_BLOCK_SIZES; ++i)
+ sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled.
+
+ // Set this at the appropriate speed levels
+ sf->use_transform_domain_distortion = 0;
+ sf->gm_search_type = GM_FULL_SEARCH;
+ sf->gm_disable_recode = 0;
+ sf->use_fast_interpolation_filter_search = 0;
+ sf->skip_repeat_interpolation_filter_search = 0;
+ sf->use_hash_based_trellis = 0;
+ sf->prune_comp_search_by_single_result = 0;
+ sf->skip_repeated_newmv = 0;
+
+ // Set decoder side speed feature to use less dual sgr modes
+ sf->dual_sgr_penalty_level = 0;
+
+ sf->inter_mode_rd_model_estimation = 0;
+ sf->obmc_full_pixel_search_level = 0;
+
+ if (oxcf->mode == GOOD)
+ set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
+
+ // sf->partition_search_breakout_dist_thr is set assuming max 64x64
+ // blocks. Normalise this if the blocks are bigger.
+ if (MAX_SB_SIZE_LOG2 > 6) {
+ sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6);
+ }
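+  // e.g. with 128x128 superblocks (MAX_SB_SIZE_LOG2 == 7) the distance
+  // threshold is scaled up by a factor of 4.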
+
+ cpi->diamond_search_sad = av1_diamond_search_sad;
+
+ sf->allow_exhaustive_searches = 1;
+ int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+ sf->exhaustive_searches_thresh = (1 << 24);
+ else
+ sf->exhaustive_searches_thresh = (1 << 25);
+ sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+ if (speed > 0)
+ sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[speed][i].interval;
+ }
+ if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval;
+ }
+ sf->max_exaustive_pct = intrabc_max_mesh_pct[speed];
+ }
+
+ // Slow quant, dct and trellis not worthwhile for first pass
+ // so make sure they are always turned off.
+ if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT;
+
+ // No recode for 1 pass.
+ if (oxcf->pass == 0) {
+ sf->recode_loop = DISALLOW_RECODE;
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+ }
+
+ if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_more;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore;
+ }
+
+ cpi->optimize_speed_feature =
+ oxcf->pass != 1 ? sf->optimize_coefficients : NO_TRELLIS_OPT;
+ // FIXME: trellis not very efficient for quantisation matrices
+ if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
+ if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
+
+ x->min_partition_size = sf->default_min_partition_size;
+ x->max_partition_size = sf->default_max_partition_size;
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+
+#if CONFIG_DIST_8X8
+ if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
+
+ if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8;
+#endif // CONFIG_DIST_8X8
+}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
new file mode 100644
index 000000000..41013b2e7
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_
+#define AOM_AV1_ENCODER_SPEED_FEATURES_H_
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
+ (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) |
+ (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) |
+ (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED),
+ UV_INTRA_ALL =
+ (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) |
+ (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) |
+ (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) |
+ (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC = (1 << UV_DC_PRED),
+ UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED),
+ UV_INTRA_DC_PAETH_CFL =
+ (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) |
+ (1 << UV_H_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_CFL_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED),
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_PAETH_H_V =
+ (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED)
+};
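+// These bit masks are indexed by intra prediction mode; e.g. a mask of
+// INTRA_DC_H_V restricts the search to DC, vertical and horizontal
+// prediction only.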
+
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
+ (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
+ (1 << NEAR_NEARMV),
+};
+
+enum {
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD)
+};
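+// disable_split_mask values are bit sets over the THR_* mode/reference
+// indices; av1_set_speed_features_framesize_dependent() raises the
+// corresponding sub-8x8 threshold to INT_MAX for every bit that is set.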
+
+typedef enum {
+ TXFM_CODING_SF = 1,
+ INTER_PRED_SF = 2,
+ INTRA_PRED_SF = 4,
+ PARTITION_SF = 8,
+ LOOP_FILTER_SF = 16,
+ RD_SKIP_SF = 32,
+ RESERVE_2_SF = 64,
+ RESERVE_3_SF = 128,
+} DEV_SPEED_FEATURES;
+
+typedef enum {
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2,
+ BIGDIA = 3,
+ SQUARE = 4,
+ FAST_HEX = 5,
+ FAST_DIAMOND = 6
+} SEARCH_METHODS;
+
+typedef enum {
+ // No recode.
+ DISALLOW_RECODE = 0,
+ // Allow recode for KF and exceeding maximum frame bandwidth.
+ ALLOW_RECODE_KFMAXBW = 1,
+ // Allow recode only for KF/ARF/GF frames.
+ ALLOW_RECODE_KFARFGF = 2,
+ // Allow recode for all frames based on bitrate constraints.
+ ALLOW_RECODE = 3,
+} RECODE_LOOP_TYPE;
+
+typedef enum {
+ SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
+ SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
+ SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches
+ // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum {
+ USE_FULL_RD = 0,
+ USE_FAST_RD,
+ USE_LARGESTALL,
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+ NOT_IN_USE = 0,
+ RELAXED_NEIGHBORING_MIN_MAX = 1
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
+ // Try the full image with different values.
+ LPF_PICK_FROM_FULL_IMAGE,
+ // Try a small portion of the image with different values.
+ LPF_PICK_FROM_SUBIMAGE,
+ // Estimate the level based on quantizer and frame type
+ LPF_PICK_FROM_Q,
+ // Pick 0 to disable LPF if LPF was enabled last frame
+ LPF_PICK_MINIMAL_LPF
+} LPF_PICK_METHOD;
+
+typedef enum {
+ // Terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1 << 0,
+
+ // Skips comp inter modes if the best so far is an intra mode.
+ FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+ // Skips oblique intra modes if the best so far is an inter mode.
+ FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+ // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions.
+ FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+ // Skips intra modes other than DC_PRED if the source variance is small
+ FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
+
+typedef enum {
+ NO_PRUNE = 0,
+ // eliminates one tx type in vertical and horizontal direction
+ PRUNE_ONE = 1,
+ // eliminates two tx types in each direction
+ PRUNE_TWO = 2,
+ // adaptively prunes the least perspective tx types out of all 16
+ // (tuned to provide negligible quality loss)
+ PRUNE_2D_ACCURATE = 3,
+ // similar, but applies much more aggressive pruning to get better speed-up
+ PRUNE_2D_FAST = 4,
+} TX_TYPE_PRUNE_MODE;
+
+typedef struct {
+ TX_TYPE_PRUNE_MODE prune_mode;
+ int fast_intra_tx_type_search;
+ int fast_inter_tx_type_search;
+
+ // Use a skip flag prediction model to detect blocks with skip = 1 early
+ // and avoid doing full TX type search for such blocks.
+ int use_skip_flag_prediction;
+
+ // Threshold used by the ML based method to predict TX block split decisions.
+ int ml_tx_split_thresh;
+
+ // skip remaining transform type search when we found the rdcost of skip is
+ // better than applying transform
+ int skip_tx_search;
+} TX_TYPE_SEARCH;
+
+typedef enum {
+ // Search partitions using RD criterion
+ SEARCH_PARTITION,
+
+ // Always use a fixed size partition
+ FIXED_PARTITION,
+
+ REFERENCE_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef struct MV_SPEED_FEATURES {
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+ SEARCH_METHODS search_method;
+
+ // This parameter controls which step in the n-step process we start at.
+ // It's changed adaptively based on circumstances.
+ int reduce_first_step_size;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
+ int auto_mv_step_size;
+
+  // Subpel search method. SUBPEL_TREE does a subpixel logarithmic search
+  // that keeps stepping at 1/2 pixel units until you stop getting a gain,
+  // and then goes on to 1/4 and repeats the same process. Along the way it
+  // skips many diagonals; the pruned variants skip further searches (see
+  // SUBPEL_SEARCH_METHODS).
+ SUBPEL_SEARCH_METHODS subpel_search_method;
+
+ // Maximum number of steps in logarithmic subpel search before giving up.
+ int subpel_iters_per_step;
+
+ // Control when to stop subpel search
+ int subpel_force_stop;
+} MV_SPEED_FEATURES;
+
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+  int range;     // Search range covered by this mesh stage
+  int interval;  // Step size between points sampled within the range
+} MESH_PATTERN;
+
+typedef enum {
+ GM_FULL_SEARCH,
+ GM_REDUCED_REF_SEARCH,
+ GM_DISABLE_SEARCH
+} GM_SEARCH_TYPE;
+
+typedef enum {
+ GM_ERRORADV_TR_0,
+ GM_ERRORADV_TR_1,
+ GM_ERRORADV_TR_2,
+ GM_ERRORADV_TR_TYPES,
+} GM_ERRORADV_TYPE;
+
+typedef enum {
+ NO_TRELLIS_OPT, // No trellis optimization
+ FULL_TRELLIS_OPT, // Trellis optimization in all stages
+ FINAL_PASS_TRELLIS_OPT // Trellis optimization in only the final encode pass
+} TRELLIS_OPT_TYPE;
+
+typedef enum {
+ FULL_TXFM_RD,
+ LOW_TXFM_RD,
+} TXFM_RD_MODEL;
+
+typedef struct SPEED_FEATURES {
+ MV_SPEED_FEATURES mv;
+
+ // Frame level coding parameter update
+ int frame_parameter_update;
+
+ RECODE_LOOP_TYPE recode_loop;
+
+ // Trellis (dynamic programming) optimization of quantized values
+ TRELLIS_OPT_TYPE optimize_coefficients;
+
+ // Global motion warp error threshold
+ GM_ERRORADV_TYPE gm_erroradv_type;
+
+ // Always set to 0. If on it enables 0 cost background transmission
+ // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
+ int static_segmentation;
+
+ // Limit the inter mode tested in the RD loop
+ int reduce_inter_modes;
+
+ // Do not compute the global motion parameters for a LAST2_FRAME or
+ // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity
+ // global model.
+ int selective_ref_gm;
+
+ // If 1 we iterate finding a best reference for 2 ref frames together - via
+ // a log search that iterates 4 times (check around mv for last for best
+  // error of combined predictor then check around mv for alt). If 0 we
+  // just use the best motion vector found for each frame by itself.
+ BLOCK_SIZE comp_inter_joint_search_thresh;
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
+ int adaptive_rd_thresh;
+
+ // Determine which method we use to determine transform size. We can choose
+ // between options like full rd, largest for prediction size, largest
+ // for intra and model coefs for the rest.
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+ // Init search depth for square and rectangular transform partitions.
+ // Values:
+ // 0 - search full tree, 1: search 1 level, 2: search the highest level only
+ int inter_tx_size_search_init_depth_sqr;
+ int inter_tx_size_search_init_depth_rect;
+ int intra_tx_size_search_init_depth_sqr;
+ int intra_tx_size_search_init_depth_rect;
+  // If any dimension of a coding block size is above 64, always search the
+ // largest transform only, since the largest transform block size is 64x64.
+ int tx_size_search_lgr_block;
+
+ PARTITION_SEARCH_TYPE partition_search_type;
+
+ TX_TYPE_SEARCH tx_type_search;
+
+ // Skip split transform block partition when the collocated bigger block
+ // is selected as all zero coefficients.
+ int txb_split_cap;
+
+ // Shortcut the transform block partition and type search when the target
+ // rdcost is relatively lower.
+  // Values are 0 (not used), or 1 - 2 with progressively increasing
+ // aggressiveness
+ int adaptive_txb_search_level;
+
+ // Prune level for tx_size_type search for inter based on rd model
+ // 0: no pruning
+ // 1-2: progressively increasing aggressiveness of pruning
+ int model_based_prune_tx_search_level;
+
+ // Model based breakout after interpolation filter search
+ // 0: no breakout
+ // 1: use model based rd breakout
+ int model_based_post_interp_filter_breakout;
+
+  // Used if partition_search_type = FIXED_PARTITION
+ BLOCK_SIZE always_this_block_size;
+
+ // Drop less likely to be picked reference frames in the RD search.
+ // Has three levels for now: 0, 1 and 2, where higher levels prune more
+ // aggressively than lower ones. (0 means no pruning).
+ int selective_ref_frame;
+
+ // Prune extended partition types search
+ // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
+ // aggressiveness of pruning in order.
+ int prune_ext_partition_types_search_level;
+
+ // Use a ML model to prune horz and vert partitions
+ int ml_prune_rect_partition;
+
+ // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
+ int ml_prune_ab_partition;
+
+ // Use a ML model to prune horz4 and vert4 partitions.
+ int ml_prune_4_partition;
+
+ int fast_cdef_search;
+
+ // 2-pass coding block partition search
+ int two_pass_partition_search;
+
+ // Use the mode decisions made in the initial partition search to prune mode
+ // candidates, e.g. ref frames.
+ int mode_pruning_based_on_two_pass_partition_search;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split. Can take values 0 - 2, 0 referring to no
+ // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+ int less_rectangular_check_level;
+
+ // Use square partition only beyond this block size.
+ BLOCK_SIZE use_square_partition_only_threshold;
+
+ // Prune reference frames for rectangular partitions.
+ int prune_ref_frame_for_rect_partitions;
+
+ // Sets min and max partition sizes for this superblock based on the
+ // same superblock in last encoded frame, and the left and above neighbor.
+ AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+ // Ensures the rd based auto partition search will always
+ // go down at least to the specified level.
+ BLOCK_SIZE rd_auto_partition_min_limit;
+
+ // Min and max partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
+
+ // Whether or not we allow partitions one smaller or one greater than the last
+ // frame's partitioning. Only used if use_lastframe_partitioning is set.
+ int adjust_partitioning_from_last_frame;
+
+ // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+ // it always, to allow it for only Last frame and Intra, disable it for all
+ // inter modes or to enable it always.
+ int disable_split_mask;
+
+ // TODO(jingning): combine the related motion search speed features
+ // This allows us to use motion search at other sizes as a starting
+ // point for this motion search and limits the search range around it.
+ int adaptive_motion_search;
+
+  // Flag for allowing some use of exhaustive searches.
+ int allow_exhaustive_searches;
+
+  // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Maximum number of exhaustive searches for a frame.
+ int max_exaustive_pct;
+
+ // Pattern to be used for any exhaustive mesh searches.
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+ // Allows sub 8x8 modes to use the prediction filter that was determined
+  // best for 8x8 mode. If set to 0 we always re-check all the filters for
+ // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+ // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
+ int adaptive_pred_interp_filter;
+
+ // Adaptive prediction mode search
+ int adaptive_mode_search;
+
+ int cb_partition_search;
+
+ int alt_ref_search_fp;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+  // defined in the MODE_SEARCH_SKIP_LOGIC enum
+ unsigned int mode_search_skip_flags;
+
+ // A source variance threshold below which filter search is disabled
+ // Choose a very large value (UINT_MAX) to use 8-tap always
+ unsigned int disable_filter_search_var_thresh;
+
+ // A source variance threshold below which wedge search is disabled
+ unsigned int disable_wedge_search_var_thresh;
+
+ // Whether fast wedge sign estimate is used
+ int fast_wedge_sign_estimate;
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
+
+ // This feature controls how the loop filter level is determined.
+ LPF_PICK_METHOD lpf_pick;
+
+ // This feature controls whether we do the expensive context update and
+ // calculation in the rd coefficient costing loop.
+ int use_fast_coef_costing;
+
+  // This feature controls the tolerance vs target used in deciding whether to
+ // recode a frame. It has no meaning if recode is disabled.
+ int recode_tolerance;
+
+ // This variable controls the maximum block size where intra blocks can be
+ // used in inter frames.
+ // TODO(aconverse): Fold this into one of the other many mode skips
+ BLOCK_SIZE max_intra_bsize;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
+
+ // Thresholds for ML based partition search breakout.
+ int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+ // Allow skipping partition search for still image frame
+ int allow_partition_search_skip;
+
+ // Fast approximation of av1_model_rd_from_var_lapndz
+ int simple_model_rd_from_var;
+
+ // If true, sub-pixel search uses the exact convolve function used for final
+ // encoding and decoding; otherwise, it uses bilinear interpolation.
+ int use_accurate_subpel_search;
+
+ // Whether to compute distortion in the image domain (slower but
+  // more accurate), or in the transform domain (faster but less accurate).
+ // 0: use image domain
+ // 1: use transform domain in tx_type search, and use image domain for
+ // RD_STATS
+ // 2: use transform domain
+ int use_transform_domain_distortion;
+
+ GM_SEARCH_TYPE gm_search_type;
+
+ // whether to disable the global motion recode loop
+ int gm_disable_recode;
+
+ // Do limited interpolation filter search for dual filters, since best choice
+ // usually includes EIGHTTAP_REGULAR.
+ int use_fast_interpolation_filter_search;
+
+ // Save results of interpolation_filter_search for a block
+  // Check mv and ref_frames before search; if they are the same as previously
+  // saved results, the search can be skipped.
+ int skip_repeat_interpolation_filter_search;
+
+ // Use a hash table to store previously computed optimized qcoeffs from
+ // expensive calls to optimize_txb.
+ int use_hash_based_trellis;
+
+ // flag to drop some ref frames in compound motion search
+ int drop_ref;
+
+ // flag to allow skipping intra mode for inter frame prediction
+ int skip_intra_in_interframe;
+
+  // Use hash table to store intra (keyframe only) txb transform search results
+ // to avoid repeated search on the same residue signal.
+ int use_intra_txb_hash;
+
+ // Use hash table to store inter txb transform search results
+ // to avoid repeated search on the same residue signal.
+ int use_inter_txb_hash;
+
+ // Use hash table to store macroblock RD search results
+ // to avoid repeated search on the same residue signal.
+ int use_mb_rd_hash;
+
+ // Calculate RD cost before doing optimize_b, and skip if the cost is large.
+ int optimize_b_precheck;
+
+ // Use model rd instead of transform search in jnt_comp
+ int jnt_comp_fast_tx_search;
+
+ // Skip mv search in jnt_comp
+ int jnt_comp_skip_mv_search;
+
+ // Decoder side speed feature to add penalty for use of dual-sgr filters.
+ // Takes values 0 - 10, 0 indicating no penalty and each additional level
+ // adding a penalty of 1%
+ int dual_sgr_penalty_level;
+
+ // Dynamically estimate final rd from prediction error and mode cost
+ int inter_mode_rd_model_estimation;
+
+ // Skip some ref frames in compound motion search by single motion search
+ // result. Has three levels for now: 0 referring to no skipping, and 1 - 3
+ // increasing aggressiveness of skipping in order.
+  // Note: The search order might affect the result. It is better to search
+  // the same single inter mode as a group.
+ int prune_comp_search_by_single_result;
+
+ // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+ // single ref modes
+ int reuse_inter_intra_mode;
+
+ // Set the full pixel search level of obmc
+ // 0: obmc_full_pixel_diamond
+ // 1: obmc_refining_search_sad (faster)
+ int obmc_full_pixel_search_level;
+
+ // flag to skip NEWMV mode in drl if the motion search result is the same
+ int skip_repeated_newmv;
+} SPEED_FEATURES;
+
+struct AV1_COMP;
+
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi);
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
new file mode 100644
index 000000000..75fdf02a5
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/odintrin.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_scale/aom_scale.h"
+
+static void temporal_filter_predictors_mb_c(
+ MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
+ int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
+ uint8_t *pred, struct scale_factors *scale, int x, int y,
+ int can_use_previous, int num_planes) {
+ const MV mv = { mv_row, mv_col };
+ enum mv_precision mv_precision_uv;
+ int uv_stride;
+ // TODO(angiebird): change plane setting accordingly
+ ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
+ const InterpFilters interp_filters = xd->mi[0]->interp_filters;
+ WarpTypesAllowed warp_types;
+ memset(&warp_types, 0, sizeof(WarpTypesAllowed));
+
+ if (uv_block_width == 8) {
+ uv_stride = (stride + 1) >> 1;
+ mv_precision_uv = MV_PRECISION_Q4;
+ } else {
+ uv_stride = stride;
+ mv_precision_uv = MV_PRECISION_Q3;
+ }
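+  // The prediction buffer is laid out as the 16x16 Y block at pred[0], U at
+  // pred[256] and V at pred[512], matching the accumulator layout used by
+  // the caller.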
+ av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
+ &conv_params, interp_filters, &warp_types, x, y, 0,
+ 0, MV_PRECISION_Q3, x, y, xd, can_use_previous);
+
+ if (num_planes > 1) {
+ av1_build_inter_predictor(
+ u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
+ uv_block_width, uv_block_height, &conv_params, interp_filters,
+ &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
+
+ av1_build_inter_predictor(
+ v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
+ uv_block_width, uv_block_height, &conv_params, interp_filters,
+ &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
+ }
+}
+
+void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
+ uint8_t *frame2, unsigned int block_width,
+ unsigned int block_height, int strength,
+ int filter_weight, unsigned int *accumulator,
+ uint16_t *count) {
+ unsigned int i, j, k;
+ int modifier;
+ int byte = 0;
+ const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
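+  // Per-pixel weight derivation (summarising the loop below):
+  //   modifier = ((3 * sum(diff^2 over the valid 3x3 neighbourhood)) / index
+  //               + rounding) >> strength
+  //   weight   = (16 - min(modifier, 16)) * filter_weight
+  // so predictor pixels whose neighbourhood closely matches the source block
+  // get weights near 16 * filter_weight, while poor matches contribute little
+  // or nothing to the accumulator.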
+ for (i = 0, k = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++, k++) {
+ int pixel_value = *frame2;
+
+ // non-local mean approach
+ int diff_sse[9] = { 0 };
+ int idx, idy, index = 0;
+
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ int row = (int)i + idy;
+ int col = (int)j + idx;
+
+ if (row >= 0 && row < (int)block_height && col >= 0 &&
+ col < (int)block_width) {
+ int diff = frame1[byte + idy * (int)stride + idx] -
+ frame2[idy * (int)block_width + idx];
+ diff_sse[index] = diff * diff;
+ ++index;
+ }
+ }
+ }
+
+ assert(index > 0);
+
+ modifier = 0;
+ for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
+
+ modifier *= 3;
+ modifier /= index;
+
+ ++frame2;
+
+ modifier += rounding;
+ modifier >>= strength;
+
+ if (modifier > 16) modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_width;
+ }
+}
+
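+// High bit-depth variant of av1_temporal_filter_apply_c(): identical
+// per-pixel logic operating on 16-bit (CONVERT_TO_SHORTPTR) buffers.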
+void av1_highbd_temporal_filter_apply_c(
+ uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
+ unsigned int block_width, unsigned int block_height, int strength,
+ int filter_weight, unsigned int *accumulator, uint16_t *count) {
+ uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+ uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+ unsigned int i, j, k;
+ int modifier;
+ int byte = 0;
+ const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+
+ for (i = 0, k = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++, k++) {
+ int pixel_value = *frame2;
+
+ // non-local mean approach
+ int diff_sse[9] = { 0 };
+ int idx, idy, index = 0;
+
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ int row = (int)i + idy;
+ int col = (int)j + idx;
+
+ if (row >= 0 && row < (int)block_height && col >= 0 &&
+ col < (int)block_width) {
+ int diff = frame1[byte + idy * (int)stride + idx] -
+ frame2[idy * (int)block_width + idx];
+ diff_sse[index] = diff * diff;
+ ++index;
+ }
+ }
+ }
+
+ assert(index > 0);
+
+ modifier = 0;
+ for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
+
+ modifier *= 3;
+ modifier /= index;
+
+ ++frame2;
+
+ modifier += rounding;
+ modifier >>= strength;
+
+ if (modifier > 16) modifier = 16;
+
+ modifier = 16 - modifier;
+ modifier *= filter_weight;
+
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+
+ byte++;
+ }
+
+ byte += stride - block_width;
+ }
+}
+
+static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
+ uint8_t *arf_frame_buf,
+ uint8_t *frame_ptr_buf,
+ int stride, int x_pos,
+ int y_pos) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ int bestsme = INT_MAX;
+ int distortion;
+ unsigned int sse;
+ int cost_list[5];
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+ MV best_ref_mv1 = kZeroMv;
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ // Save input state
+ struct buf_2d src = x->plane[0].src;
+ struct buf_2d pre = xd->plane[0].pre[0];
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = arf_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = frame_ptr_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+ x->mvcost = x->mv_cost_stack;
+ x->nmvjointcost = x->nmv_vec_cost;
+
+ av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
+ NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list),
+ &best_ref_mv1, 0, 0, x_pos, y_pos, 0);
+ x->mv_limits = tmp_mv_limits;
+
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ if (cpi->common.cur_frame_force_integer_mv == 1) {
+ const uint8_t *const src_address = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *const y = xd->plane[0].pre[0].buf;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = x->best_mv.as_mv.row * y_stride + x->best_mv.as_mv.col;
+
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+
+ bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address,
+ src_stride, &sse);
+ } else {
+ bestsme = cpi->find_fractional_mv_step(
+ x, &cpi->common, 0, 0, &best_ref_mv1,
+ cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
+ NULL, 0, 0, 0, 0, 0);
+ }
+
+ x->e_mbd.mi[0]->mv[0] = x->best_mv;
+
+ // Restore input state
+ x->plane[0].src = src;
+ xd->plane[0].pre[0] = pre;
+
+ return bestsme;
+}
+
+static void temporal_filter_iterate_c(AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG **frames,
+ int frame_count, int alt_ref_index,
+ int strength,
+ struct scale_factors *scale) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ int byte;
+ int frame;
+ int mb_col, mb_row;
+ unsigned int filter_weight;
+ int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
+ int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
+ int mb_y_offset = 0;
+ int mb_uv_offset = 0;
+ DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
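+  // accumulator/count hold the running weighted pixel sums and weight totals
+  // for one 16x16 Y block (indices 0..255), the U block (256..511) and the
+  // V block (512..767).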
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+ uint8_t *dst1, *dst2;
+ DECLARE_ALIGNED(32, uint16_t, predictor16[16 * 16 * 3]);
+ DECLARE_ALIGNED(32, uint8_t, predictor8[16 * 16 * 3]);
+ uint8_t *predictor;
+ const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+ const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
+
+ // Save input state
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ int i;
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ } else {
+ predictor = predictor8;
+ }
+
+ for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+ // Source frames are extended to 16 pixels. This is different than
+ // L/A/G reference frames that have a border of 32 (AV1ENCBORDERINPIXELS)
+ // A 6/8 tap filter is used for motion search. This requires 2 pixels
+ // before and 3 pixels after. So the largest Y mv on a border would
+ // then be 16 - AOM_INTERP_EXTEND. The UV blocks are half the size of the
+ // Y and therefore only extended by 8. The largest mv that a UV block
+ // can support is 8 - AOM_INTERP_EXTEND. A UV mv is half of a Y mv.
+ // (16 - AOM_INTERP_EXTEND) >> 1 which is greater than
+ // 8 - AOM_INTERP_EXTEND.
+ // To keep the mv in play for both Y and UV planes the max that it
+ // can be on a border is therefore 16 - (2*AOM_INTERP_EXTEND+1).
+ cpi->td.mb.mv_limits.row_min =
+ -((mb_row * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ cpi->td.mb.mv_limits.row_max =
+ ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+ int j, k;
+ int stride;
+
+ memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+ memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+
+ cpi->td.mb.mv_limits.col_min =
+ -((mb_col * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ cpi->td.mb.mv_limits.col_max =
+ ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (frame = 0; frame < frame_count; frame++) {
+ const int thresh_low = 10000;
+ const int thresh_high = 20000;
+
+ if (frames[frame] == NULL) continue;
+
+ mbd->mi[0]->mv[0].as_mv.row = 0;
+ mbd->mi[0]->mv[0].as_mv.col = 0;
+ mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+ if (frame == alt_ref_index) {
+ filter_weight = 2;
+ } else {
+ // Find best match in this frame by MC
+ int err = temporal_filter_find_matching_mb_c(
+ cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
+ frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
+ mb_col * 16, mb_row * 16);
+
+          // Assign a higher weight to the matching MB if its error
+          // score is lower. If not applying MC, the default behavior
+          // is to weight all MBs equally.
+ filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0;
+ }
+
+ if (filter_weight != 0) {
+ // Construct the predictors
+ temporal_filter_predictors_mb_c(
+ mbd, frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->u_buffer + mb_uv_offset,
+ frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
+ mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row,
+ mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16,
+ mb_row * 16, cm->allow_warped_motion, num_planes);
+
+ // Apply the filter (YUV)
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int adj_strength = strength + 2 * (mbd->bd - 8);
+ av1_highbd_temporal_filter_apply(
+ f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
+ adj_strength, filter_weight, accumulator, count);
+ if (num_planes > 1) {
+ av1_highbd_temporal_filter_apply(
+ f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_highbd_temporal_filter_apply(
+ f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+ accumulator + 512, count + 512);
+ }
+ } else {
+ av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16, strength,
+ filter_weight, accumulator, count);
+ if (num_planes > 1) {
+ av1_temporal_filter_apply_c(
+ f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height, strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_temporal_filter_apply_c(
+ f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height, strength, filter_weight,
+ accumulator + 512, count + 512);
+ }
+ }
+ }
+ }
+
+ // Normalize filter output to produce AltRef frame
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst1_16;
+ uint16_t *dst2_16;
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+ if (num_planes > 1) {
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+ // U
+ dst1_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+ // V
+ dst2_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+ }
+ } else {
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - 16;
+ }
+ if (num_planes > 1) {
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+ // U
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+ // V
+ dst2[byte] =
+ (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+ }
+ }
+ mb_y_offset += 16;
+ mb_uv_offset += mb_uv_width;
+ }
+ mb_y_offset += 16 * (f->y_stride - mb_cols);
+ mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+ }
+
+ // Restore input state
+ for (i = 0; i < num_planes; i++) mbd->plane[i].pre[0].buf = input_buffer[i];
+}
+
+// Apply buffer limits and context specific adjustments to arnr filter.
+static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
+ int *arnr_frames, int *arnr_strength) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frames_after_arf =
+ av1_lookahead_depth(cpi->lookahead) - distance - 1;
+ int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
+ int frames_bwd;
+ int q, frames, strength;
+
+ // Define the forward and backwards filter limits for this arnr group.
+ if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
+ if (frames_fwd > distance) frames_fwd = distance;
+
+ frames_bwd = frames_fwd;
+
+ // For even length filter there is one more frame backward
+ // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+ if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
+
+ // Set the baseline active filter size.
+ frames = frames_bwd + 1 + frames_fwd;
+
+ // Adjust the strength based on active max q.
+ if (cpi->common.current_video_frame > 1)
+ q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
+ cpi->common.seq_params.bit_depth));
+ else
+ q = ((int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME],
+ cpi->common.seq_params.bit_depth));
+ if (q > 16) {
+ strength = oxcf->arnr_strength;
+ } else {
+ strength = oxcf->arnr_strength - ((16 - q) / 2);
+ if (strength < 0) strength = 0;
+ }
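+  // e.g. with arnr_strength = 5 and an active q of 10, the else branch gives
+  // strength = 5 - (16 - 10) / 2 = 2.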
+
+ // Adjust number of frames in filter and strength based on gf boost level.
+ if (frames > group_boost / 150) {
+ frames = group_boost / 150;
+ frames += !(frames & 1);
+ }
+
+ if (strength > group_boost / 300) {
+ strength = group_boost / 300;
+ }
+
+ *arnr_frames = frames;
+ *arnr_strength = strength;
+}
+
+void av1_temporal_filter(AV1_COMP *cpi, int distance) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int frame;
+ int frames_to_blur;
+ int start_frame;
+ int strength;
+ int frames_to_blur_backward;
+ int frames_to_blur_forward;
+ struct scale_factors sf;
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+ // Apply context specific adjustments to the arnr filter parameters.
+ adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
+ // TODO(weitinglin): Currently, we enforce the filtering strength on
+  // extra ARFs to be zero. We should investigate in which
+ // case it is more beneficial to use non-zero strength
+ // filtering.
+ if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+ strength = 0;
+ frames_to_blur = 1;
+ }
+
+ int which_arf = gf_group->arf_update_idx[gf_group->index];
+
+ // Set the temporal filtering status for the corresponding OVERLAY frame
+ if (strength == 0 && frames_to_blur == 1)
+ cpi->is_arf_filter_off[which_arf] = 1;
+ else
+ cpi->is_arf_filter_off[which_arf] = 0;
+ cpi->common.showable_frame = cpi->is_arf_filter_off[which_arf];
+
+ frames_to_blur_backward = (frames_to_blur / 2);
+ frames_to_blur_forward = ((frames_to_blur - 1) / 2);
+ start_frame = distance + frames_to_blur_forward;
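+  // e.g. frames_to_blur == 5 gives 2 backward and 2 forward frames around the
+  // ARF itself, and start_frame indexes the furthest forward frame used.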
+
+ // Setup frame pointers, NULL indicates frame not included in filter.
+ for (frame = 0; frame < frames_to_blur; ++frame) {
+ const int which_buffer = start_frame - frame;
+ struct lookahead_entry *buf =
+ av1_lookahead_peek(cpi->lookahead, which_buffer);
+ frames[frames_to_blur - 1 - frame] = &buf->img;
+ }
+
+ if (frames_to_blur > 0) {
+ // Set up scaling factors. Scaling of the individual arnr frames is not
+ // supported.
+ // The ARF is produced at the native frame size and resized when coded.
+ av1_setup_scale_factors_for_frame(
+ &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+ frames[0]->y_crop_width, frames[0]->y_crop_height);
+ }
+
+ temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+ frames_to_blur_backward, strength, &sf);
+}
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
new file mode 100644
index 000000000..2ddc68b2c
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_temporal_filter(AV1_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
new file mode 100644
index 000000000..16a6a9a35
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
+ int plane, int calc_rate, int allow_update_cdf,
+ FRAME_COUNTS *counts) {
+ const uint8_t *const color_map = param->color_map;
+ MapCdf map_cdf = param->map_cdf;
+ ColorCost color_cost = param->color_cost;
+ const int plane_block_width = param->plane_width;
+ const int rows = param->rows;
+ const int cols = param->cols;
+ const int n = param->n_colors;
+ const int palette_size_idx = n - PALETTE_MIN_SIZE;
+ int this_rate = 0;
+ uint8_t color_order[PALETTE_MAX_SIZE];
+
+ (void)plane;
+ (void)counts;
+
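+ // Visit the color index map in anti-diagonal (wavefront) order, skipping
+ // the top-left entry, so each index's left/above context is already
+ // available.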
+ for (int k = 1; k < rows + cols - 1; ++k) {
+ for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+ int i = k - j;
+ int color_new_idx;
+ const int color_ctx = av1_get_palette_color_index_context(
+ color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
+ assert(color_new_idx >= 0 && color_new_idx < n);
+ if (calc_rate) {
+ this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx];
+ } else {
+ (*t)->token = color_new_idx;
+ (*t)->color_map_cdf = map_cdf[palette_size_idx][color_ctx];
+ ++(*t);
+ if (allow_update_cdf)
+ update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
+#if CONFIG_ENTROPY_STATS
+ if (plane) {
+ ++counts->palette_uv_color_index[palette_size_idx][color_ctx]
+ [color_new_idx];
+ } else {
+ ++counts->palette_y_color_index[palette_size_idx][color_ctx]
+ [color_new_idx];
+ }
+#endif
+ }
+ }
+ }
+ if (calc_rate) return this_rate;
+ return 0;
+}
+
+static void get_palette_params(const MACROBLOCK *const x, int plane,
+ BLOCK_SIZE bsize, Av1ColorMapParam *params) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ params->color_map = xd->plane[plane].color_index_map;
+ params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+ : xd->tile_ctx->palette_y_color_index_cdf;
+ params->color_cost =
+ plane ? &x->palette_uv_color_cost : &x->palette_y_color_cost;
+ params->n_colors = pmi->palette_size[plane];
+ av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
+ &params->rows, &params->cols);
+}
+
+static void get_color_map_params(const MACROBLOCK *const x, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type,
+ Av1ColorMapParam *params) {
+ (void)tx_size;
+ memset(params, 0, sizeof(*params));
+ switch (type) {
+ case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break;
+ default: assert(0 && "Invalid color map type"); return;
+ }
+}
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+ assert(plane == 0 || plane == 1);
+ Av1ColorMapParam color_map_params;
+ get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+ return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL);
+}
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+ TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type, int allow_update_cdf,
+ FRAME_COUNTS *counts) {
+ assert(plane == 0 || plane == 1);
+ Av1ColorMapParam color_map_params;
+ get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+ // The first color index does not use context or entropy.
+ (*t)->token = color_map_params.color_map[0];
+ (*t)->color_map_cdf = NULL;
+ ++(*t);
+ cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf,
+ counts);
+}
+
+void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+ TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
+ int blk_col, int block, int plane, void *arg) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
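+ // Chroma uses a single max-size transform for the whole block; luma follows
+ // the recursive inter tx-size partitioning recorded for this block position.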
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ if (tx_size == plane_tx_size || plane) {
+ plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y);
+ if (!dry_run) {
+ av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+ plane_bsize, tx_size, arg);
+ } else if (dry_run == DRY_RUN_NORMAL) {
+ av1_update_txb_context_b(plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, arg);
+ } else {
+ printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
+ assert(0);
+ }
+ } else {
+ // Half the block size, in transform block units.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsw * bsh;
+
+ assert(bsw > 0 && bsh > 0);
+
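+ // Recurse into each sub-transform block of the split.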
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc,
+ block, plane, arg);
+ block += step;
+ }
+ }
+ }
+}
+
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ RUN_TYPE dry_run, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ (void)t;
+ struct tokenize_b_args arg = { cpi, td, t, 0, allow_update_cdf };
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (mbmi->skip) {
+ av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+ return;
+ }
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y)) {
+ continue;
+ }
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsizec =
+ scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ int bh = block_size_high[txb_size] >> tx_size_high_log2[0];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+
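+ // Process the plane in units no larger than 64x64 luma samples.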
+ const BLOCK_SIZE max_unit_bsize =
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ int mu_blocks_wide =
+ block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high =
+ block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+ for (idy = 0; idy < mi_height; idy += mu_blocks_high) {
+ for (idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+ int blk_row, blk_col;
+ const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+ for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, blk_row,
+ blk_col, block, plane, &arg);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+ if (rate) *rate += arg.this_rate;
+}
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
new file mode 100644
index 000000000..63b505f36
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/encoder/block.h"
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ aom_cdf_prob *color_map_cdf;
+ // TODO(yaowu): use packed enum type if appropriate.
+ uint8_t token;
+} TOKENEXTRA;
+
+struct AV1_COMP;
+struct ThreadData;
+struct FRAME_COUNTS;
+
+struct tokenize_b_args {
+ const struct AV1_COMP *cpi;
+ struct ThreadData *td;
+ TOKENEXTRA **tp;
+ int this_rate;
+ uint8_t allow_update_cdf;
+};
+
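+// Controls what a tokenization pass produces: real tokens (OUTPUT_ENABLED),
+// context updates only (DRY_RUN_NORMAL), or a rate estimate only
+// (DRY_RUN_COSTCOEFFS).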
+typedef enum {
+ OUTPUT_ENABLED = 0,
+ DRY_RUN_NORMAL,
+ DRY_RUN_COSTCOEFFS,
+} RUN_TYPE;
+
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented
+// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf);
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, COLOR_MAP_TYPE type);
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+ TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type, int allow_update_cdf,
+ struct FRAME_COUNTS *counts);
+
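+// Returns the maximum end-of-block position for tx_size, or 0 when the
+// segment is coded with SEG_LVL_SKIP.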
+static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
+ TX_SIZE tx_size) {
+ const int eob_max = av1_get_max_eob(tx_size);
+ return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
new file mode 100644
index 000000000..405bc9e6e
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -0,0 +1,1944 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
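+// Neural-network weights and biases used to prune the transform-type and
+// transform-split searches; separate horizontal and vertical models are
+// provided for most transform sizes.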
+// Tx type model for 4x4 block.
+static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_hor_layer0,
+ av1_tx_type_nn_weights_4x4_hor_layer1 },
+ { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_ver_layer0,
+ av1_tx_type_nn_weights_4x4_ver_layer1 },
+ { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 4x8 block.
+static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_hor_layer0,
+ av1_tx_type_nn_weights_4x8_hor_layer1 },
+ { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_ver_layer0,
+ av1_tx_type_nn_weights_4x8_ver_layer1 },
+ { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_hor_layer0,
+ av1_tx_type_nn_weights_8x4_hor_layer1 },
+ { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_ver_layer0,
+ av1_tx_type_nn_weights_8x4_ver_layer1 },
+ { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_hor_layer0,
+ av1_tx_type_nn_weights_8x8_hor_layer1 },
+ { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_ver_layer0,
+ av1_tx_type_nn_weights_8x8_ver_layer1 },
+ { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_hor_layer0,
+ av1_tx_type_nn_weights_8x16_hor_layer1 },
+ { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_ver_layer0,
+ av1_tx_type_nn_weights_8x16_ver_layer1 },
+ { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x8 block.
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_hor_layer0,
+ av1_tx_type_nn_weights_16x8_hor_layer1 },
+ { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_ver_layer0,
+ av1_tx_type_nn_weights_16x8_ver_layer1 },
+ { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = {
+ 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f,
+ 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f,
+ -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+ -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f,
+ 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f,
+ 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f,
+ 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f,
+ 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f,
+ -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f,
+ 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f,
+ 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f,
+ 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f,
+ -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f,
+ 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f,
+ 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f,
+ -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+ -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f,
+ 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f,
+ 0.50355f, 0.08592f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer0[16] = {
+ -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f,
+ -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f,
+ -0.14062f, -0.42120f, 0.94573f, -0.09287f,
+};
+
+static const float av1_tx_type_nn_weights_16x16_layer1[64] = {
+ -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f,
+ 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f,
+ 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+ 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f,
+ 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+ 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f,
+ -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f,
+ 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f,
+ -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f,
+ 1.08829f,
+};
+
+static const float av1_tx_type_nn_bias_16x16_layer1[4] = {
+ 0.81986f,
+ 1.26865f,
+ 0.11118f,
+ 2.48404f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_type_nn_weights_16x16_layer0,
+ av1_tx_type_nn_weights_16x16_layer1,
+ },
+ {
+ av1_tx_type_nn_bias_16x16_layer0,
+ av1_tx_type_nn_bias_16x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_hor_layer0,
+ av1_tx_type_nn_weights_4x16_hor_layer1 },
+ { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x16_ver_layer0,
+ av1_tx_type_nn_weights_4x16_ver_layer1 },
+ { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x4_hor_layer0,
+ av1_tx_type_nn_weights_16x4_hor_layer1 },
+ { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x4_ver_layer0,
+ av1_tx_type_nn_weights_16x4_ver_layer1 },
+ { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx type prediction.
+static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = {
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = {
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+// Tx split model for 4x8 block.
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = {
+ 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f,
+ -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f,
+ 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f,
+ -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f,
+ -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f,
+ 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f,
+ 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f,
+ 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f,
+ 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f,
+ 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f,
+ -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f,
+ 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f,
+ -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f,
+ -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f,
+ 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f,
+ -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f,
+ -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f,
+ 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f,
+ -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f,
+ -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f,
+ 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f,
+ -0.792429f, -0.385862f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer0[16] = {
+ 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f,
+ -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f,
+ 0.262171f, -1.598153f, -1.427340f, -1.602306f,
+};
+
+static const float av1_tx_split_nn_weights_4x8_layer1[16] = {
+ -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f,
+ -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f,
+ 0.085082f, 0.614986f, 0.847904f, 0.637578f,
+};
+
+static const float av1_tx_split_nn_bias_4x8_layer1[1] = {
+ 0.20586078f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x8 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_4x8_layer0,
+ av1_tx_split_nn_weights_4x8_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_4x8_layer0,
+ av1_tx_split_nn_bias_4x8_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x8 block.
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = {
+ 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f,
+ -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f,
+ -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f,
+ -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f,
+ 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f,
+ 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f,
+ 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f,
+ -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f,
+ 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f,
+ 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f,
+ 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f,
+ 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f,
+ -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f,
+ -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f,
+ 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f,
+ -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f,
+ 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f,
+ 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f,
+ -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f,
+ -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f,
+ 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f,
+ -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f,
+ -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f,
+ 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer0[12] = {
+ 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f,
+ 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f,
+};
+
+static const float av1_tx_split_nn_weights_8x8_layer1[12] = {
+ 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f,
+ -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f,
+};
+
+static const float av1_tx_split_nn_bias_8x8_layer1[1] = {
+ -0.156294f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x8 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 12,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x8_layer0,
+ av1_tx_split_nn_weights_8x8_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x8_layer0,
+ av1_tx_split_nn_bias_8x8_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x16 block.
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = {
+ 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f,
+ 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f,
+ -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f,
+ -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f,
+ -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f,
+ -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f,
+ 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f,
+ 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f,
+ -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f,
+ -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f,
+ -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f,
+ -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f,
+ 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f,
+ 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f,
+ -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f,
+ 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f,
+ 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f,
+ 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f,
+ 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f,
+ -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f,
+ 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f,
+ 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f,
+ 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f,
+ -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f,
+ -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f,
+ 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f,
+ -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f,
+ 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f,
+ 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f,
+ 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f,
+ 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f,
+ 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f,
+ -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f,
+ -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f,
+ 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f,
+ 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f,
+ -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f,
+ -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f,
+ 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f,
+ 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f,
+ 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f,
+ 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f,
+ -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f,
+ -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f,
+ 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f,
+ -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f,
+ 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f,
+ -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f,
+ -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, -0.028900f,
+ 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f,
+ 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f,
+ -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f,
+ 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f,
+ -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f,
+ -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f,
+ 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f,
+ 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f,
+ 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f,
+ -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f,
+ -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f,
+ -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f,
+ 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f,
+ -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f,
+ 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f,
+ -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f,
+ -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f,
+ -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f,
+ -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f,
+ 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f,
+ 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f,
+ 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f,
+ -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f,
+ -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f,
+ -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f,
+ 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f,
+ 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f,
+ 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f,
+ -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f,
+ 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f,
+ -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f,
+ 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f,
+ 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f,
+ 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f,
+ -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f,
+ -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f,
+ -0.408768f, 0.184693f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer0[64] = {
+ -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f,
+ -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f,
+ 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f,
+ 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f,
+ 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f,
+ 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f,
+ -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f,
+ -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f,
+ 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f,
+ -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f,
+ -0.255844f, -0.078400f, 0.476752f, 0.643001f,
+};
+
+static const float av1_tx_split_nn_weights_8x16_layer1[64] = {
+ -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, -0.087735f,
+ 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f,
+ 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f,
+ -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f,
+ 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f,
+ 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f,
+ 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f,
+ -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f,
+ 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f,
+ -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f,
+ -0.256734f, 0.177370f, 0.213522f, -0.530158f,
+};
+
+static const float av1_tx_split_nn_bias_8x16_layer1[1] = {
+ 0.14910713f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x16_layer0,
+ av1_tx_split_nn_weights_8x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x16_layer0,
+ av1_tx_split_nn_bias_8x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x16 block.
+static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = {
+ -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f,
+ 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f,
+ 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f,
+ -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f,
+ 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f,
+ -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f,
+ -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f,
+ -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f,
+ 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f,
+ -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f,
+ 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f,
+ -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f,
+ -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f,
+ 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f,
+ 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f,
+ -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f,
+ -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f,
+ -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f,
+ -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f,
+ -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f,
+ -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f,
+ -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f,
+ 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f,
+ -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f,
+ -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f,
+ -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f,
+ 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f,
+ -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f,
+ 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f,
+ -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f,
+ -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f,
+ 0.087491f, -0.186415f, 0.177785f, -0.195121f, -1.191148f, -0.477102f,
+ 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f,
+ 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f,
+ -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f,
+ -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f,
+ 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f,
+ 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f,
+ -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f,
+ -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f,
+ 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f,
+ 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f,
+ 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f,
+ -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f,
+ -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f,
+ -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f,
+ 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f,
+ 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer0[24] = {
+ -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f,
+ 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f,
+ -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f,
+ -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f,
+};
+
+static const float av1_tx_split_nn_weights_16x16_layer1[24] = {
+ -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f,
+ -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f,
+ -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f,
+ -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f,
+};
+
+static const float av1_tx_split_nn_bias_16x16_layer1[1] = {
+ 0.184803f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x16 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x16_layer0,
+ av1_tx_split_nn_weights_16x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x16_layer0,
+ av1_tx_split_nn_bias_16x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x32 block.
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = {
+ -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f,
+ -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f,
+ 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f,
+ -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f,
+ -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f,
+ 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f,
+ -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f,
+ -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f,
+ -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f,
+ 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f,
+ -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f,
+ 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f,
+ -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f,
+ -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f,
+ -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f,
+ 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f,
+ -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f,
+ 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f,
+ -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f,
+ -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f,
+ 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f,
+ -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f,
+ 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f,
+ -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f,
+ 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f,
+ -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f,
+ -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f,
+ -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f,
+ 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f,
+ -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f,
+ 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f,
+ 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f,
+ 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f,
+ 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f,
+ -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f,
+ 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f,
+ 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f,
+ 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f,
+ 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f,
+ 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f,
+ -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f,
+ 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f,
+ 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f,
+ -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f,
+ -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f,
+ -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f,
+ -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f,
+ -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f,
+ -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 0.009070f,
+ 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f,
+ -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f,
+ -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f,
+ 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f,
+ 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f,
+ -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f,
+ -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f,
+ -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f,
+ 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f,
+ -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f,
+ -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f,
+ 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f,
+ 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f,
+ 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f,
+ 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer0[32] = {
+ 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f,
+ -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f,
+ -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f,
+ -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f,
+ 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f,
+ 0.254942f, -0.017796f,
+};
+
+static const float av1_tx_split_nn_weights_32x32_layer1[32] = {
+ -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f,
+ -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f,
+ 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f,
+ -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f,
+ 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f,
+ -0.068547f, -0.154148f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer1[1] = {
+ 0.316622f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x32 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x32_layer0,
+ av1_tx_split_nn_weights_32x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x32_layer0,
+ av1_tx_split_nn_bias_32x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 64x64 block.
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = {
+ -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f,
+ 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f,
+ 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f,
+ 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f,
+ -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f,
+ -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f,
+ 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f,
+ -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f,
+ -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f,
+ 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f,
+ -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f,
+ 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f,
+ 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f,
+ -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f,
+ -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f,
+ 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f,
+ 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f,
+ 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f,
+ -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f,
+ -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f,
+ 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f,
+ -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f,
+ 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f,
+ 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f,
+ 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f,
+ 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f,
+ 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f,
+ -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f,
+ 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f,
+ 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f,
+ 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f,
+ 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f,
+ 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f,
+ 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f,
+ -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f,
+ -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f,
+ 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f,
+ -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f,
+ -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f,
+ -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f,
+ -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f,
+ -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f,
+ -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f,
+ -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f,
+ 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f,
+ 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f,
+ 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f,
+ -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f,
+ -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f,
+ 0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f,
+ 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f,
+ 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f,
+ -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f,
+ -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f,
+ -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f,
+ 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f,
+ -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f,
+ -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f,
+ -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f,
+ -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f,
+ 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f,
+ -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f,
+ 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f,
+ 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer0[32] = {
+ 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f,
+ -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f,
+ 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f,
+ 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f,
+ -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f,
+ -0.365437f, 0.229255f,
+};
+
+static const float av1_tx_split_nn_weights_64x64_layer1[32] = {
+ 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f,
+ -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f,
+ -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f,
+ -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f,
+ 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f,
+ 0.207812f, 0.513560f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f };
+
+static const NN_CONFIG av1_tx_split_nnconfig_64x64 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_64x64_layer0,
+ av1_tx_split_nn_weights_64x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_64x64_layer0,
+ av1_tx_split_nn_bias_64x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 4x16 block.
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = {
+ -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f,
+ -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f,
+ -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f,
+ -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f,
+ -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f,
+ -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f,
+ -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f,
+ 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f,
+ 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f,
+ 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f,
+ 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f,
+ -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f,
+ -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f,
+ -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f,
+ -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f,
+ -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f,
+ -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f,
+ -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f,
+ 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f,
+ 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f,
+ 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f,
+ -1.416451f, -0.166467f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer0[16] = {
+ 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f,
+ -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f,
+ -1.191704f, -3.800073f, 4.121552f, -1.399397f,
+};
+
+static const float av1_tx_split_nn_weights_4x16_layer1[16] = {
+ -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f,
+ -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f,
+ 0.462109f, 0.343315f, 1.092593f, 0.483152f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer1[1] = {
+ 0.8205083f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_4x16_layer0,
+ av1_tx_split_nn_weights_4x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_4x16_layer0,
+ av1_tx_split_nn_bias_4x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x32 block.
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = {
+ 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f,
+ 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f,
+ 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f,
+ 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f,
+ -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f,
+ 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f,
+ -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f,
+ -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f,
+ 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f,
+ -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f,
+ 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f,
+ -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f,
+ -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f,
+ 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f,
+ 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f,
+ 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f,
+ -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f,
+ 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f,
+ 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f,
+ 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f,
+ -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f,
+ -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f,
+ -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f,
+ -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f,
+ -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f,
+ -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f,
+ -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f,
+ 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f,
+ 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f,
+ -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f,
+ 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f,
+ -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f,
+ -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f,
+ 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f,
+ -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f,
+ 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f,
+ 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f,
+ -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f,
+ -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f,
+ 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f,
+ 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f,
+ 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f,
+ -0.129147f, 0.045916f, -0.606865f, -0.101378f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer0[32] = {
+ 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f,
+ 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f,
+ 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f,
+ 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f,
+ 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f,
+ -0.294389f, 1.456413f,
+};
+
+static const float av1_tx_split_nn_weights_16x32_layer1[32] = {
+ 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f,
+ -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f,
+ -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f,
+ -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f,
+ -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f,
+ 0.418904f, 1.792187f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer1[1] = {
+ -0.29233751f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x32_layer0,
+ av1_tx_split_nn_weights_16x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x32_layer0,
+ av1_tx_split_nn_bias_16x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x64 block.
+static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = {
+ 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f,
+ -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f,
+ 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f,
+ 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f,
+ 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f,
+ 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f,
+ -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f,
+ 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f,
+ 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f,
+ -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f,
+ -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f,
+ 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f,
+ -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f,
+ -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f,
+ 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f,
+ -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f,
+ -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f,
+ -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f,
+ 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f,
+ 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f,
+ 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f,
+ 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f,
+ -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f,
+ 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f,
+ 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f,
+ -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f,
+ -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f,
+ 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f,
+ -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f,
+ -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f,
+ -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f,
+ -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f,
+ -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f,
+ -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f,
+ 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f,
+ 0.270948f, -0.187966f, 0.342503f, 0.174420f, -0.352105f, 0.088080f,
+ 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f,
+ -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f,
+ 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f,
+ -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f,
+ -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f,
+ 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f,
+ 0.440626f, -0.158048f, -0.461031f, -0.146280f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer0[32] = {
+ 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f,
+ -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f,
+ -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f,
+ 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f,
+ 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f,
+ 0.552712f, 0.299410f,
+};
+
+static const float av1_tx_split_nn_weights_32x64_layer1[32] = {
+ 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f,
+ 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f,
+ -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f,
+ -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f,
+ 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f,
+ 0.352981f, 0.111265f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer1[1] = {
+ -0.18160765f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x64_layer0,
+ av1_tx_split_nn_weights_32x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x64_layer0,
+ av1_tx_split_nn_bias_32x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x32 block.
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = {
+ -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f,
+ -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f,
+ 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f,
+ 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f,
+ -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f,
+ 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f,
+ 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f,
+ 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f,
+ 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f,
+ 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f,
+ 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f,
+ 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f,
+ 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f,
+ 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f,
+ 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f,
+ 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f,
+ 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f,
+ 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f,
+ -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f,
+ 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f,
+ 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f,
+ -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f,
+ 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f,
+ -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f,
+ 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f,
+ 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f,
+ 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f,
+ 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f,
+ 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f,
+ 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f,
+ -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f,
+ 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer0[24] = {
+ -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f,
+ -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f,
+ -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f,
+ 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f,
+};
+
+static const float av1_tx_split_nn_weights_8x32_layer1[24] = {
+ 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f,
+ -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f,
+ -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f,
+ 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer1[1] = {
+ 0.13435879f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x32_layer0,
+ av1_tx_split_nn_weights_8x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x32_layer0,
+ av1_tx_split_nn_bias_8x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x64 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+ -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+ -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+ -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f,
+ 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f,
+ -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+ -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+ -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f,
+ 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f,
+ -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+ -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+ -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f,
+ 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f,
+ -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f,
+ -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f,
+ 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f,
+ 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f,
+ -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f,
+ -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f,
+ -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f,
+ -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f,
+ 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f,
+ 0.101996f, 0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+ 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f,
+ -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f,
+ -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+ -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f,
+ 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f,
+ 0.348337f, -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+ 0.04230947f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x64_layer0,
+ av1_tx_split_nn_weights_16x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x64_layer0,
+ av1_tx_split_nn_bias_16x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx split prediction.
+static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
+ NULL, // TX_4X4,
+ &av1_tx_split_nnconfig_8x8, // TX_8X8,
+ &av1_tx_split_nnconfig_16x16, // TX_16X16,
+ &av1_tx_split_nnconfig_32x32, // TX_32X32,
+ &av1_tx_split_nnconfig_64x64, // TX_64X64,
+ &av1_tx_split_nnconfig_4x8, // TX_4X8,
+ &av1_tx_split_nnconfig_4x8, // TX_8X4,
+ &av1_tx_split_nnconfig_8x16, // TX_8X16,
+ &av1_tx_split_nnconfig_8x16, // TX_16X8,
+ &av1_tx_split_nnconfig_16x32, // TX_16X32,
+ &av1_tx_split_nnconfig_16x32, // TX_32X16,
+ &av1_tx_split_nnconfig_32x64, // TX_32X64,
+ &av1_tx_split_nnconfig_32x64, // TX_64X32,
+ &av1_tx_split_nnconfig_4x16, // TX_4X16,
+ &av1_tx_split_nnconfig_4x16, // TX_16X4,
+ &av1_tx_split_nnconfig_8x32, // TX_8X32,
+ &av1_tx_split_nnconfig_8x32, // TX_32X8,
+ &av1_tx_split_nnconfig_16x64, // TX_16X64,
+ &av1_tx_split_nnconfig_16x64, // TX_64X16,
+};
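+
+// Illustrative sketch (not part of upstream libaom): how one of the tx-split
+// models mapped above could be evaluated as a single-hidden-layer fully
+// connected network, assuming a ReLU hidden activation and a linear output.
+// The helper name is hypothetical and the block is exposition only (not
+// compiled).
+#if 0
+static float tx_split_nn_eval_sketch(const NN_CONFIG *cfg,
+                                     const float *features) {
+  float hidden[64];  // 64 is the largest hidden-layer size in the map above.
+  const int nh = cfg->num_hidden_nodes[0];
+  // Hidden layer: affine transform of the features, followed by ReLU.
+  for (int j = 0; j < nh; ++j) {
+    float sum = cfg->bias[0][j];
+    for (int i = 0; i < cfg->num_inputs; ++i)
+      sum += cfg->weights[0][j * cfg->num_inputs + i] * features[i];
+    hidden[j] = sum > 0.0f ? sum : 0.0f;
+  }
+  // Output layer: a single linear unit (num_outputs == 1 for split models).
+  float out = cfg->bias[1][0];
+  for (int j = 0; j < nh; ++j) out += cfg->weights[1][j] * hidden[j];
+  return out;
+}
+#endif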
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c
new file mode 100644
index 000000000..e6edbb6af
--- /dev/null
+++ b/third_party/aom/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes the SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 by blending them with the mask m.
+ *
+ * r1: Residuals of p1.
+ * (source - p1)
+ * d: Difference of p1 and p0.
+ * (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ * where r0 is (source - p0), and r1 is (source - p1), which in turn is
+ * equivalent to:
+ * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ * which is the SSE of the residuals of the compound predictor scaled up by
+ * MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * done to facilitate an equivalent SIMD implementation. It has no effect as
+ * long as the residuals fit within 16 - WEDGE_WEIGHT_BITS (= 10) signed bits,
+ * which is always the case for 8-bit input and should hold practically always
+ * for real input, since residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ uint64_t csse = 0;
+ int i;
+
+ for (i = 0; i < N; i++) {
+ int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+ t = clamp(t, INT16_MIN, INT16_MAX);
+ csse += t * t;
+ }
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
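+
+/* Illustrative usage sketch (not part of upstream libaom): building the 'r1'
+ * and 'd' inputs from a source block and two single predictors before calling
+ * av1_wedge_sse_from_residuals_c(). The helper name and the fixed maximum
+ * block size are hypothetical; the block is exposition only (not compiled).
+ */
+#if 0
+static uint64_t wedge_sse_sketch(const uint8_t *src, const uint8_t *p0,
+                                 const uint8_t *p1, const uint8_t *mask,
+                                 int N) {
+  int16_t r1[64 * 64], d[64 * 64];
+  assert(N <= 64 * 64);
+  for (int i = 0; i < N; i++) {
+    r1[i] = (int16_t)(src[i] - p1[i]);  // Residuals of p1: (source - p1).
+    d[i] = (int16_t)(p1[i] - p0[i]);    // Predictor difference: (p1 - p0).
+  }
+  // The MAX_MASK_VALUE**2 scaling mentioned above is divided back out by the
+  // final ROUND_POWER_OF_TWO inside the function.
+  return av1_wedge_sse_from_residuals_c(r1, d, mask, N);
+}
+#endif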
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ * r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has a lower SSE than the positive mask.
+ * The computation is based on:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ * >
+ * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ * which can be simplified to:
+ *
+ * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * The right hand side does not depend on the mask, and needs to be passed as
+ * the 'limit' parameter.
+ *
+ * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
+ * hand side is simply a scalar product between an int16_t and uint8_t vector.
+ *
+ * Note that, for efficiency, ds is stored in 16 bits. Since real input
+ * residuals are small, this should not cause a noticeable issue.
+ */
+int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N,
+ int64_t limit) {
+ int64_t acc = 0;
+
+ do {
+ acc += *ds++ * *m++;
+ } while (--N);
+
+ return acc > limit;
+}
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ int i;
+
+ for (i = 0; i < N; i++)
+ d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
+}
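+
+/* Illustrative usage sketch (not part of upstream libaom): choosing the mask
+ * sign by combining av1_wedge_compute_delta_squares_c() with the 'limit'
+ * formula documented above. The helper name, the residual buffers r0/r1 and
+ * the fixed maximum size are hypothetical; exposition only (not compiled).
+ */
+#if 0
+static int wedge_sign_sketch(const int16_t *r0, const int16_t *r1,
+                             const uint8_t *mask, int N) {
+  int16_t ds[64 * 64];
+  int64_t sum0 = 0, sum1 = 0;
+  assert(N <= 64 * 64);
+  for (int i = 0; i < N; i++) {
+    sum0 += (int64_t)r0[i] * r0[i];
+    sum1 += (int64_t)r1[i] * r1[i];
+  }
+  // ds[i] = r0[i]**2 - r1[i]**2, saturated to signed 16 bits.
+  av1_wedge_compute_delta_squares_c(ds, r0, r1, N);
+  // limit = MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)), as described above.
+  const int64_t limit = (MAX_MASK_VALUE / 2) * (sum0 - sum1);
+  return av1_wedge_sign_from_residuals_c(ds, mask, N, limit);
+}
+#endif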
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 000000000..07615543c
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,1217 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ __m128i buf0[32];
+ __m128i buf1[32];
+ const int32_t *cospi;
+ // stage 0
+ // stage 1
+ buf1[0] = _mm_add_epi32(input[0], input[31]);
+ buf1[31] = _mm_sub_epi32(input[0], input[31]);
+ buf1[1] = _mm_add_epi32(input[1], input[30]);
+ buf1[30] = _mm_sub_epi32(input[1], input[30]);
+ buf1[2] = _mm_add_epi32(input[2], input[29]);
+ buf1[29] = _mm_sub_epi32(input[2], input[29]);
+ buf1[3] = _mm_add_epi32(input[3], input[28]);
+ buf1[28] = _mm_sub_epi32(input[3], input[28]);
+ buf1[4] = _mm_add_epi32(input[4], input[27]);
+ buf1[27] = _mm_sub_epi32(input[4], input[27]);
+ buf1[5] = _mm_add_epi32(input[5], input[26]);
+ buf1[26] = _mm_sub_epi32(input[5], input[26]);
+ buf1[6] = _mm_add_epi32(input[6], input[25]);
+ buf1[25] = _mm_sub_epi32(input[6], input[25]);
+ buf1[7] = _mm_add_epi32(input[7], input[24]);
+ buf1[24] = _mm_sub_epi32(input[7], input[24]);
+ buf1[8] = _mm_add_epi32(input[8], input[23]);
+ buf1[23] = _mm_sub_epi32(input[8], input[23]);
+ buf1[9] = _mm_add_epi32(input[9], input[22]);
+ buf1[22] = _mm_sub_epi32(input[9], input[22]);
+ buf1[10] = _mm_add_epi32(input[10], input[21]);
+ buf1[21] = _mm_sub_epi32(input[10], input[21]);
+ buf1[11] = _mm_add_epi32(input[11], input[20]);
+ buf1[20] = _mm_sub_epi32(input[11], input[20]);
+ buf1[12] = _mm_add_epi32(input[12], input[19]);
+ buf1[19] = _mm_sub_epi32(input[12], input[19]);
+ buf1[13] = _mm_add_epi32(input[13], input[18]);
+ buf1[18] = _mm_sub_epi32(input[13], input[18]);
+ buf1[14] = _mm_add_epi32(input[14], input[17]);
+ buf1[17] = _mm_sub_epi32(input[14], input[17]);
+ buf1[15] = _mm_add_epi32(input[15], input[16]);
+ buf1[16] = _mm_sub_epi32(input[15], input[16]);
+
+ // stage 2
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], cos_bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+
+ // stage 9
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 4;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[4];
+ __m128i buf1[4];
+ int col_num = txfm_size / num_per_128;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0;
+ int32_t stage_idx = 0;
+ int j;
+ for (j = 0; j < 4; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+
+ // stage 1
+ stage_idx++;
+ buf1[0] = buf0[3];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[1];
+ buf1[3] = buf0[2];
+
+ // stage 2
+ stage_idx++;
+
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ cos_bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+ buf0[3], cos_bit);
+
+ // stage 3
+ stage_idx++;
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+ // stage 4
+ stage_idx++;
+
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], cos_bit);
+
+ // stage 5
+ stage_idx++;
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]);
+ buf1[2] = buf0[3];
+ buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]);
+
+ for (j = 0; j < 4; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit, const int instride,
+ const int outstride) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]);
+ __m128i cospi_p32 = _mm_set1_epi32(cospi[32]);
+ __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]);
+ __m128i cospi_p48 = _mm_set1_epi32(cospi[48]);
+ __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]);
+ __m128i cospi_p16 = _mm_set1_epi32(cospi[16]);
+ __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]);
+ __m128i cospi_p56 = _mm_set1_epi32(cospi[56]);
+ __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]);
+ __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]);
+ __m128i cospi_p24 = _mm_set1_epi32(cospi[24]);
+ __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]);
+ __m128i cospi_p08 = _mm_set1_epi32(cospi[8]);
+ __m128i cospi_p40 = _mm_set1_epi32(cospi[40]);
+ __m128i cospi_p60 = _mm_set1_epi32(cospi[60]);
+ __m128i cospi_p04 = _mm_set1_epi32(cospi[4]);
+ __m128i cospi_p28 = _mm_set1_epi32(cospi[28]);
+ __m128i cospi_p36 = _mm_set1_epi32(cospi[36]);
+ __m128i cospi_p44 = _mm_set1_epi32(cospi[44]);
+ __m128i cospi_p20 = _mm_set1_epi32(cospi[20]);
+ __m128i cospi_p12 = _mm_set1_epi32(cospi[12]);
+ __m128i cospi_p52 = _mm_set1_epi32(cospi[52]);
+ __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]);
+ __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]);
+ __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]);
+ __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]);
+ __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]);
+ __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]);
+ __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]);
+ __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]);
+ __m128i cospi_p62 = _mm_set1_epi32(cospi[62]);
+ __m128i cospi_p02 = _mm_set1_epi32(cospi[2]);
+ __m128i cospi_p30 = _mm_set1_epi32(cospi[30]);
+ __m128i cospi_p34 = _mm_set1_epi32(cospi[34]);
+ __m128i cospi_p46 = _mm_set1_epi32(cospi[46]);
+ __m128i cospi_p18 = _mm_set1_epi32(cospi[18]);
+ __m128i cospi_p14 = _mm_set1_epi32(cospi[14]);
+ __m128i cospi_p50 = _mm_set1_epi32(cospi[50]);
+ __m128i cospi_p54 = _mm_set1_epi32(cospi[54]);
+ __m128i cospi_p10 = _mm_set1_epi32(cospi[10]);
+ __m128i cospi_p22 = _mm_set1_epi32(cospi[22]);
+ __m128i cospi_p42 = _mm_set1_epi32(cospi[42]);
+ __m128i cospi_p38 = _mm_set1_epi32(cospi[38]);
+ __m128i cospi_p26 = _mm_set1_epi32(cospi[26]);
+ __m128i cospi_p06 = _mm_set1_epi32(cospi[6]);
+ __m128i cospi_p58 = _mm_set1_epi32(cospi[58]);
+ __m128i cospi_p63 = _mm_set1_epi32(cospi[63]);
+ __m128i cospi_p01 = _mm_set1_epi32(cospi[1]);
+ __m128i cospi_p31 = _mm_set1_epi32(cospi[31]);
+ __m128i cospi_p33 = _mm_set1_epi32(cospi[33]);
+ __m128i cospi_p47 = _mm_set1_epi32(cospi[47]);
+ __m128i cospi_p17 = _mm_set1_epi32(cospi[17]);
+ __m128i cospi_p15 = _mm_set1_epi32(cospi[15]);
+ __m128i cospi_p49 = _mm_set1_epi32(cospi[49]);
+ __m128i cospi_p55 = _mm_set1_epi32(cospi[55]);
+ __m128i cospi_p09 = _mm_set1_epi32(cospi[9]);
+ __m128i cospi_p23 = _mm_set1_epi32(cospi[23]);
+ __m128i cospi_p41 = _mm_set1_epi32(cospi[41]);
+ __m128i cospi_p39 = _mm_set1_epi32(cospi[39]);
+ __m128i cospi_p25 = _mm_set1_epi32(cospi[25]);
+ __m128i cospi_p07 = _mm_set1_epi32(cospi[7]);
+ __m128i cospi_p57 = _mm_set1_epi32(cospi[57]);
+ __m128i cospi_p59 = _mm_set1_epi32(cospi[59]);
+ __m128i cospi_p05 = _mm_set1_epi32(cospi[5]);
+ __m128i cospi_p27 = _mm_set1_epi32(cospi[27]);
+ __m128i cospi_p37 = _mm_set1_epi32(cospi[37]);
+ __m128i cospi_p43 = _mm_set1_epi32(cospi[43]);
+ __m128i cospi_p21 = _mm_set1_epi32(cospi[21]);
+ __m128i cospi_p11 = _mm_set1_epi32(cospi[11]);
+ __m128i cospi_p53 = _mm_set1_epi32(cospi[53]);
+ __m128i cospi_p51 = _mm_set1_epi32(cospi[51]);
+ __m128i cospi_p13 = _mm_set1_epi32(cospi[13]);
+ __m128i cospi_p19 = _mm_set1_epi32(cospi[19]);
+ __m128i cospi_p45 = _mm_set1_epi32(cospi[45]);
+ __m128i cospi_p35 = _mm_set1_epi32(cospi[35]);
+ __m128i cospi_p29 = _mm_set1_epi32(cospi[29]);
+ __m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
+ __m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
+
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
+ x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
+ x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
+ x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
+ x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
+ x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
+ x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
+ x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
+ x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
+ x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
+ x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
+ x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
+ x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
+ x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
+ x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
+ x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
+ x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
+ x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
+ x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
+ x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
+ x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
+ x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
+ x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
+ x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
+ x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
+ x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
+ x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
+ x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
+ x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
+ x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
+ x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
+ x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
+ x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
+ x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
+ x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
+ x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
+ x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
+ x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
+ x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
+ x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
+ x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
+ x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
+ x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
+ x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
+ x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
+ x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
+ x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
+ x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
+ x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
+ x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
+ x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
+ x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
+ x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
+ x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
+ x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
+ x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
+ x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
+ x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
+ x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
+ x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
+ x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
+ x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
+ x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
+ x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ __rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ __rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm_add_epi32(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ __rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ __rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ __rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ __rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm_add_epi32(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3],
+ __rounding, cos_bit);
+ x6[4] = _mm_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ __rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ __rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ __rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6],
+ __rounding, cos_bit);
+ x7[8] = _mm_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ __rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ __rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm_add_epi32(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12],
+ __rounding, cos_bit);
+ x8[16] = _mm_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ __rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ __rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ __rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ __rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24],
+ __rounding, cos_bit);
+ x9[32] = _mm_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm_add_epi32(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32],
+ x10[63], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33],
+ x10[62], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34],
+ x10[61], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35],
+ x10[60], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36],
+ x10[59], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37],
+ x10[58], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38],
+ x10[57], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39],
+ x10[56], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40],
+ x10[55], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41],
+ x10[54], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42],
+ x10[53], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43],
+ x10[52], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44],
+ x10[51], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45],
+ x10[50], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46],
+ x10[49], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
+ x10[48], __rounding, cos_bit);
+
+ // stage 11
+ output[0 * outstride] = x10[0];
+ output[1 * outstride] = x10[32];
+ output[2 * outstride] = x10[16];
+ output[3 * outstride] = x10[48];
+ output[4 * outstride] = x10[8];
+ output[5 * outstride] = x10[40];
+ output[6 * outstride] = x10[24];
+ output[7 * outstride] = x10[56];
+ output[8 * outstride] = x10[4];
+ output[9 * outstride] = x10[36];
+ output[10 * outstride] = x10[20];
+ output[11 * outstride] = x10[52];
+ output[12 * outstride] = x10[12];
+ output[13 * outstride] = x10[44];
+ output[14 * outstride] = x10[28];
+ output[15 * outstride] = x10[60];
+ output[16 * outstride] = x10[2];
+ output[17 * outstride] = x10[34];
+ output[18 * outstride] = x10[18];
+ output[19 * outstride] = x10[50];
+ output[20 * outstride] = x10[10];
+ output[21 * outstride] = x10[42];
+ output[22 * outstride] = x10[26];
+ output[23 * outstride] = x10[58];
+ output[24 * outstride] = x10[6];
+ output[25 * outstride] = x10[38];
+ output[26 * outstride] = x10[22];
+ output[27 * outstride] = x10[54];
+ output[28 * outstride] = x10[14];
+ output[29 * outstride] = x10[46];
+ output[30 * outstride] = x10[30];
+ output[31 * outstride] = x10[62];
+ output[32 * outstride] = x10[1];
+ output[33 * outstride] = x10[33];
+ output[34 * outstride] = x10[17];
+ output[35 * outstride] = x10[49];
+ output[36 * outstride] = x10[9];
+ output[37 * outstride] = x10[41];
+ output[38 * outstride] = x10[25];
+ output[39 * outstride] = x10[57];
+ output[40 * outstride] = x10[5];
+ output[41 * outstride] = x10[37];
+ output[42 * outstride] = x10[21];
+ output[43 * outstride] = x10[53];
+ output[44 * outstride] = x10[13];
+ output[45 * outstride] = x10[45];
+ output[46 * outstride] = x10[29];
+ output[47 * outstride] = x10[61];
+ output[48 * outstride] = x10[3];
+ output[49 * outstride] = x10[35];
+ output[50 * outstride] = x10[19];
+ output[51 * outstride] = x10[51];
+ output[52 * outstride] = x10[11];
+ output[53 * outstride] = x10[43];
+ output[54 * outstride] = x10[27];
+ output[55 * outstride] = x10[59];
+ output[56 * outstride] = x10[7];
+ output[57 * outstride] = x10[39];
+ output[58 * outstride] = x10[23];
+ output[59 * outstride] = x10[55];
+ output[60 * outstride] = x10[15];
+ output[61 * outstride] = x10[47];
+ output[62 * outstride] = x10[31];
+ output[63 * outstride] = x10[63];
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
new file mode 100644
index 000000000..592462e20
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -0,0 +1,2068 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+
+ // stage 1
+ __m256i x1[16];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 7
+ output[0] = x1[0];
+ output[1] = x1[8];
+ output[2] = x1[4];
+ output[3] = x1[12];
+ output[4] = x1[2];
+ output[5] = x1[10];
+ output[6] = x1[6];
+ output[7] = x1[14];
+ output[8] = x1[1];
+ output[9] = x1[9];
+ output[10] = x1[5];
+ output[11] = x1[13];
+ output[12] = x1[3];
+ output[13] = x1[11];
+ output[14] = x1[7];
+ output[15] = x1[15];
+}
+
+static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+
+ // stage 1
+ __m256i x1[32];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9
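+ // Copy to the output in bit-reversed index order (the 32- and 64-point
+ // DCTs below use the same pattern).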
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
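+// 64-point forward DCT operating on 16-bit coefficients, 16 lanes per __m256i.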
+static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+ __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
+ __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
+ __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
+ __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
+ __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
+ __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
+ __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
+ __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
+ __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
+ __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
+ __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
+ __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
+ __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
+ __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
+ __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
+ __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
+ __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
+ __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
+ __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
+ __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
+ __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
+ __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
+ __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
+ __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
+ __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
+ __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
+ __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
+ __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
+ __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
+ __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
+ __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
+ __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[31]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[30]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[47]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[37], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[48]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[58], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[39]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[45], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[55]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[50], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[56]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[35]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[41], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[54], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[59]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[60]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[33]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[43], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[52], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[61]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
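+// 32-point forward DCT operating on 32-bit coefficients, 8 lanes per __m256i.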
+static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ __m256i x1[32];
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+ // stage 0
+ // stage 1
+ btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+
+ // stage 4
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5
+ btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+
+ // stage 6
+ btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7
+ btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+
+ // stage 8
+ btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
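+// 64-point forward DCT on 32-bit coefficients; the cosine constants are
+// broadcast once up front and reused by the btf_32_*_new butterflies.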
+static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_32_add_sub_avx2(&x1[0], &x1[31]);
+ btf_32_add_sub_avx2(&x1[1], &x1[30]);
+ btf_32_add_sub_avx2(&x1[2], &x1[29]);
+ btf_32_add_sub_avx2(&x1[3], &x1[28]);
+ btf_32_add_sub_avx2(&x1[4], &x1[27]);
+ btf_32_add_sub_avx2(&x1[5], &x1[26]);
+ btf_32_add_sub_avx2(&x1[6], &x1[25]);
+ btf_32_add_sub_avx2(&x1[7], &x1[24]);
+ btf_32_add_sub_avx2(&x1[8], &x1[23]);
+ btf_32_add_sub_avx2(&x1[9], &x1[22]);
+ btf_32_add_sub_avx2(&x1[10], &x1[21]);
+ btf_32_add_sub_avx2(&x1[11], &x1[20]);
+ btf_32_add_sub_avx2(&x1[12], &x1[19]);
+ btf_32_add_sub_avx2(&x1[13], &x1[18]);
+ btf_32_add_sub_avx2(&x1[14], &x1[17]);
+ btf_32_add_sub_avx2(&x1[15], &x1[16]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[47]);
+ btf_32_add_sub_avx2(&x1[33], &x1[46]);
+ btf_32_add_sub_avx2(&x1[34], &x1[45]);
+ btf_32_add_sub_avx2(&x1[35], &x1[44]);
+ btf_32_add_sub_avx2(&x1[36], &x1[43]);
+ btf_32_add_sub_avx2(&x1[37], &x1[42]);
+ btf_32_add_sub_avx2(&x1[38], &x1[41]);
+ btf_32_add_sub_avx2(&x1[39], &x1[40]);
+ btf_32_add_sub_avx2(&x1[63], &x1[48]);
+ btf_32_add_sub_avx2(&x1[62], &x1[49]);
+ btf_32_add_sub_avx2(&x1[61], &x1[50]);
+ btf_32_add_sub_avx2(&x1[60], &x1[51]);
+ btf_32_add_sub_avx2(&x1[59], &x1[52]);
+ btf_32_add_sub_avx2(&x1[58], &x1[53]);
+ btf_32_add_sub_avx2(&x1[57], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[39]);
+ btf_32_add_sub_avx2(&x1[33], &x1[38]);
+ btf_32_add_sub_avx2(&x1[34], &x1[37]);
+ btf_32_add_sub_avx2(&x1[35], &x1[36]);
+ btf_32_add_sub_avx2(&x1[47], &x1[40]);
+ btf_32_add_sub_avx2(&x1[46], &x1[41]);
+ btf_32_add_sub_avx2(&x1[45], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[43]);
+ btf_32_add_sub_avx2(&x1[48], &x1[55]);
+ btf_32_add_sub_avx2(&x1[49], &x1[54]);
+ btf_32_add_sub_avx2(&x1[50], &x1[53]);
+ btf_32_add_sub_avx2(&x1[51], &x1[52]);
+ btf_32_add_sub_avx2(&x1[63], &x1[56]);
+ btf_32_add_sub_avx2(&x1[62], &x1[57]);
+ btf_32_add_sub_avx2(&x1[61], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[35]);
+ btf_32_add_sub_avx2(&x1[33], &x1[34]);
+ btf_32_add_sub_avx2(&x1[39], &x1[36]);
+ btf_32_add_sub_avx2(&x1[38], &x1[37]);
+ btf_32_add_sub_avx2(&x1[40], &x1[43]);
+ btf_32_add_sub_avx2(&x1[41], &x1[42]);
+ btf_32_add_sub_avx2(&x1[47], &x1[44]);
+ btf_32_add_sub_avx2(&x1[46], &x1[45]);
+ btf_32_add_sub_avx2(&x1[48], &x1[51]);
+ btf_32_add_sub_avx2(&x1[49], &x1[50]);
+ btf_32_add_sub_avx2(&x1[55], &x1[52]);
+ btf_32_add_sub_avx2(&x1[54], &x1[53]);
+ btf_32_add_sub_avx2(&x1[56], &x1[59]);
+ btf_32_add_sub_avx2(&x1[57], &x1[58]);
+ btf_32_add_sub_avx2(&x1[63], &x1[60]);
+ btf_32_add_sub_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+ btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[33]);
+ btf_32_add_sub_avx2(&x1[35], &x1[34]);
+ btf_32_add_sub_avx2(&x1[36], &x1[37]);
+ btf_32_add_sub_avx2(&x1[39], &x1[38]);
+ btf_32_add_sub_avx2(&x1[40], &x1[41]);
+ btf_32_add_sub_avx2(&x1[43], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[45]);
+ btf_32_add_sub_avx2(&x1[47], &x1[46]);
+ btf_32_add_sub_avx2(&x1[48], &x1[49]);
+ btf_32_add_sub_avx2(&x1[51], &x1[50]);
+ btf_32_add_sub_avx2(&x1[52], &x1[53]);
+ btf_32_add_sub_avx2(&x1[55], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[57]);
+ btf_32_add_sub_avx2(&x1[59], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[61]);
+ btf_32_add_sub_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
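+// 16-point forward ADST on 16-bit coefficients; stage 1 permutes the inputs
+// and negates half of them before the butterfly stages.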
+static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[15]);
+ x1[2] = _mm256_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm256_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm256_subs_epi16(__zero, input[11]);
+ x1[8] = _mm256_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm256_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm256_subs_epi16(__zero, input[13]);
+ x1[14] = _mm256_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[2]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[15]);
+
+ // stage 4
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[4]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[15]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 7
+ btf_16_adds_subs_avx2(&x1[0], &x1[8]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[15]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 9
+ output[0] = x1[1];
+ output[1] = x1[14];
+ output[2] = x1[3];
+ output[3] = x1[12];
+ output[4] = x1[5];
+ output[5] = x1[10];
+ output[6] = x1[7];
+ output[7] = x1[8];
+ output[8] = x1[9];
+ output[9] = x1[6];
+ output[10] = x1[11];
+ output[11] = x1[4];
+ output[12] = x1[13];
+ output[13] = x1[2];
+ output[14] = x1[15];
+ output[15] = x1[0];
+}
+
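+// Input lanes are interleaved with 1s, so the madd yields v * scale + round,
+// which is then shifted down by NewSqrt2Bits.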
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+ const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
+ const __m256i b = _mm256_madd_epi16(a, scale__r);
+ return _mm256_srai_epi32(b, NewSqrt2Bits);
+}
+
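+// Identity transform for 16x16: every coefficient is scaled by
+// 2 * NewSqrt2 / (1 << NewSqrt2Bits), i.e. 2*sqrt(2).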
+static INLINE void fidentity16x16_new_avx2(const __m256i *input,
+ __m256i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
+ const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
+ output[i] = _mm256_packs_epi32(b_lo, b_hi);
+ }
+}
+
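+// Identity transform for 16x32: every coefficient is scaled by 4.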
+static INLINE void fidentity16x32_new_avx2(const __m256i *input,
+ __m256i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) {
+ output[i] = _mm256_slli_epi16(input[i], 2);
+ }
+}
+
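+// Rounding right shift by 'bit'; a non-positive 'bit' means a left shift instead.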
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+ __m256i *output,
+ const int size,
+ const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = av1_round_shift_32_avx2(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm256_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
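+// Same as above, but each result is additionally multiplied by
+// NewSqrt2 / (1 << NewSqrt2Bits) (~sqrt(2)), the rectangular-block correction.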
+static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
+ __m256i *output,
+ const int size,
+ const int bit) {
+ const __m256i sqrt2 = _mm256_set1_epi32(NewSqrt2);
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m256i r0 = av1_round_shift_32_avx2(input[i], bit);
+ const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
+ output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m256i r0 = _mm256_slli_epi32(input[i], -bit);
+ const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
+ output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits);
+ }
+ }
+}
+
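+// Transpose an 8x8 block of 32-bit values: unpack within 128-bit lanes, then
+// permute2x128 to exchange lane halves. 'stride' is in units of __m256i.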
+static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA,
+ __m256i *output) {
+ __m256i temp0 = _mm256_unpacklo_epi32(inputA[0], inputA[2]);
+ __m256i temp1 = _mm256_unpackhi_epi32(inputA[0], inputA[2]);
+ __m256i temp2 = _mm256_unpacklo_epi32(inputA[1], inputA[3]);
+ __m256i temp3 = _mm256_unpackhi_epi32(inputA[1], inputA[3]);
+ __m256i temp4 = _mm256_unpacklo_epi32(inputA[4], inputA[6]);
+ __m256i temp5 = _mm256_unpackhi_epi32(inputA[4], inputA[6]);
+ __m256i temp6 = _mm256_unpacklo_epi32(inputA[5], inputA[7]);
+ __m256i temp7 = _mm256_unpackhi_epi32(inputA[5], inputA[7]);
+
+ __m256i t0 = _mm256_unpacklo_epi32(temp0, temp2);
+ __m256i t1 = _mm256_unpackhi_epi32(temp0, temp2);
+ __m256i t2 = _mm256_unpacklo_epi32(temp1, temp3);
+ __m256i t3 = _mm256_unpackhi_epi32(temp1, temp3);
+ __m256i t4 = _mm256_unpacklo_epi32(temp4, temp6);
+ __m256i t5 = _mm256_unpackhi_epi32(temp4, temp6);
+ __m256i t6 = _mm256_unpacklo_epi32(temp5, temp7);
+ __m256i t7 = _mm256_unpackhi_epi32(temp5, temp7);
+
+ output[0 * stride] = _mm256_permute2x128_si256(t0, t4, 0x20);
+ output[1 * stride] = _mm256_permute2x128_si256(t1, t5, 0x20);
+ output[2 * stride] = _mm256_permute2x128_si256(t2, t6, 0x20);
+ output[3 * stride] = _mm256_permute2x128_si256(t3, t7, 0x20);
+ output[4 * stride] = _mm256_permute2x128_si256(t0, t4, 0x31);
+ output[5 * stride] = _mm256_permute2x128_si256(t1, t5, 0x31);
+ output[6 * stride] = _mm256_permute2x128_si256(t2, t6, 0x31);
+ output[7 * stride] = _mm256_permute2x128_si256(t3, t7, 0x31);
+}
+
+// Store rows of 16 16-bit values, sign-extended to 32 bits.
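+// Rescale one row of 16 values by NewSqrt2 / 2^NewSqrt2Bits and store the
+// results as 32-bit integers (output path for rectangular transforms).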
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+ int32_t *out,
+ const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out),
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
+ _mm256_store_si256(
+ (__m256i *)(out + 8),
+ _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
+ out += stride;
+ }
+}
+
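+// Store one row for a rectangular transform, applying the sqrt(2) rescale.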
+static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
+ int32_t *const b) {
+ const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_reorder = _mm256_permute4x64_epi64(a, 0xd8);
+  const __m256i a_lo = _mm256_unpacklo_epi16(a_reorder, one);
+  const __m256i a_hi = _mm256_unpackhi_epi16(a_reorder, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+ _mm256_store_si256((__m256i *)b, b_lo);
+ _mm256_store_si256((__m256i *)(b + 8), b_hi);
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
+ const __m256i *const in, int32_t *const out, const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
+ }
+}
+
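+// 1-D column/row transform lookup tables, indexed by TX_TYPE. NULL entries
+// correspond to transform types that are never used at these block sizes.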
+static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_new_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_new_avx2, // IDTX
+ fdct16x32_new_avx2, // V_DCT
+ fidentity16x32_new_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_new_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_new_avx2, // IDTX
+ fidentity16x32_new_avx2, // V_DCT
+ fdct16x32_new_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fadst16x16_new_avx2, // ADST_DCT
+ fdct16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fadst16x16_new_avx2, // FLIPADST_DCT
+ fdct16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fdct16x16_new_avx2, // V_DCT
+ fidentity16x16_new_avx2, // H_DCT
+ fadst16x16_new_avx2, // V_ADST
+ fidentity16x16_new_avx2, // H_ADST
+ fadst16x16_new_avx2, // V_FLIPADST
+ fidentity16x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fdct16x16_new_avx2, // ADST_DCT
+ fadst16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fdct16x16_new_avx2, // FLIPADST_DCT
+ fadst16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fidentity16x16_new_avx2, // V_DCT
+ fdct16x16_new_avx2, // H_DCT
+ fidentity16x16_new_avx2, // V_ADST
+ fadst16x16_new_avx2, // H_ADST
+ fidentity16x16_new_avx2, // V_FLIPADST
+ fadst16x16_new_avx2 // H_FLIPADST
+};
+
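+// 2-D forward transform: load 16-wide columns, round-shift, column transform,
+// round-shift, transpose, row transform, round-shift, transpose back and store
+// the coefficients sign-extended to 32 bits.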
+static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_16X16;
+ __m256i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int32_t i = 0;
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ transpose_16bit_16x16_avx2(buf, buf);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, 16);
+}
+
+static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_32X32;
+ __m256i buf0[32], buf1[128];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
+ transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ transpose_16bit_16x16_avx2(buf, buf);
+ store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width,
+ 16);
+ transpose_16bit_16x16_avx2(buf + 16, buf + 16);
+ store_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16 * width * i + 16,
+ width, 16);
+ }
+}
+
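+// 64x64 forward transform (DCT_DCT only). The row pass widens to 32-bit, runs
+// a 64-point DCT and keeps only the top-left 32x32 block of coefficients,
+// matching AV1's zeroing of the high-frequency half for 64-point transforms.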
+static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[64];
+ __m256i bufB[64];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 16 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m256i *out = (__m256i *)(output8 + 8 * j);
+ transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+ transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_16X32;
+ __m256i buf0[32], buf1[32];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1);
+ transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
+
+ for (int i = 0; i < 2; i++) {
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ transpose_16bit_16x16_avx2(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i,
+ width, 16);
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[32], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+ }
+
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ transpose_16bit_16x16_avx2(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16);
+
+ transpose_16bit_16x16_avx2(buf + 16, buf + 16);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16);
+}
+
+static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_64X32;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+ assert(tx_type == DCT_DCT);
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[64];
+ __m256i bufB[64];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 16 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m256i *out = (__m256i *)(output8 + 8 * j);
+ transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+ transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[32];
+ __m256i bufB[32];
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ av1_fdct32_new_avx2(bufA, bufA, cos_bit_row);
+ av1_fdct32_new_avx2(bufB, bufB, cos_bit_row);
+ av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 16 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m256i *out = (__m256i *)(output8 + 8 * j);
+ transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+ transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div16); i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ int32_t *output16 = output + 16 * width * i;
+ for (int j = 0; j < width_div16; ++j) {
+ __m256i *buf16 = buf + 16 * j;
+ transpose_16bit_16x16_avx2(buf16, buf16);
+ store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16);
+ }
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < height_div16; i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ int32_t *output16 = output + 16 * 32 * i;
+ for (int j = 0; j < 2; ++j) {
+ __m256i *buf16 = buf + 16 * j;
+ transpose_16bit_16x16_avx2(buf16, buf16);
+ store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16);
+ }
+ }
+}
+
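+// Dispatch table indexed by TX_SIZE. Sizes without an AVX2 kernel here fall
+// back to the SSE2 implementations.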
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform
+ lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform
+ lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform
+ lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+ if ((fwd_txfm2d_func == NULL) ||
+ (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 000000000..8ec0256eb
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+ const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+ int r, c;
+ for (r = 0; r < txfm1d_size; r++) {
+ for (c = 0; c < txfm1d_size; c++) {
+ output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+ }
+ }
+}
+
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ __m128i buf0[32];
+ __m128i buf1[32];
+ int col_num = txfm_size / num_per_128;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ int j;
+ for (j = 0; j < 32; ++j) {
+ buf0[j] = input[j * col_num + col];
+ }
+ av1_fdct32_new_sse4_1(buf0, buf1, cos_bit);
+ for (j = 0; j < 32; ++j) {
+ output[j * col_num + col] = buf1[j];
+ }
+ }
+}
+
+static void fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 64;
+ const int num_per_128 = 4;
+ int col_num = txfm_size / num_per_128;
+ (void)stage_range;
+ for (int col = 0; col < col_num; col++) {
+ av1_fdct64_new_sse4_1((input + col), (output + col), cos_bit, col_num,
+ col_num);
+ }
+}
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+    case TXFM_TYPE_DCT32: return fdct32_new_sse4_1;
+    case TXFM_TYPE_DCT64: return fdct64_new_sse4_1;
+ default: assert(0);
+ }
+ return NULL;
+}
+
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+ const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+  // TODO(sarahparker) This does not currently support rectangular transforms
+  // and will break without splitting txfm_size out into row and col sizes.
+  // Rectangular transforms use C code only, so this is fine for now; it will
+  // be corrected once there are SSE implementations for rectangular
+  // transforms.
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->stage_range_row;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+ int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+ txfm_size);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+ txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+ av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32(txfm_size, buf_128, out_128);
+}
+
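+// 64x64 forward transform: the row pass runs on only half the columns and
+// keeps 32 of the 64 outputs, so only the low-frequency 32x32 block of
+// coefficients is produced.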
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+ int32_t *output, const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+
+ const int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+ int col_num = txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+ txfm_size);
+  /* column-wise transform */
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+
+  /* row-wise transform */
+ for (int col = 0; col < (col_num >> 1); col++) {
+ av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
+ col_num, (col_num >> 1));
+ }
+
+ txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+ av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32x32(buf_128, out_128);
+}
+
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);
+ (void)bd;
+ fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf);
+}
+
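+// Transpose two 4x4 blocks of 32-bit values: inputA into output rows 0..3 and
+// inputB into rows 4..7. 'stride' is in units of __m128i.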
+static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
+ const __m128i *inputB, __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
+ __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]);
+ __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]);
+ __m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+
+ temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]);
+ temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]);
+ temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]);
+ temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]);
+
+ output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m128i buf0[64], buf1[512];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < width_div8; ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ const TX_SIZE tx_size = TX_64X32;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ assert(tx_type == DCT_DCT);
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < width_div8; ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[32];
+ __m128i bufB[32];
+ __m128i *buf = buf1 + width * i;
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row);
+ av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < (32 / 4); ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+ if ((fwd_txfm2d_func == NULL) ||
+ (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ } else {
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 000000000..38707137c
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
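+// Rounded arithmetic right shift: (vec + (1 << (bit - 1))) >> bit.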
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+ __m256i tmp, round;
+ round = _mm256_set1_epi32(1 << (bit - 1));
+ tmp = _mm256_add_epi32(vec, round);
+ return _mm256_srai_epi32(tmp, bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r, const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i ww0 = _mm256_set1_epi32(w0);
+ const __m256i ww1 = _mm256_set1_epi32(w1);
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
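+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1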
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r, const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i ww0 = _mm256_set1_epi32(w0);
+ const __m256i ww1 = _mm256_set1_epi32(w1);
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
+ __m256i *in0, __m256i *in1,
+ const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i _in0 = *in0;
+ __m256i _in1 = *in1;
+ const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+ const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+ __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+ temp0 = _mm256_add_epi32(temp0, _r);
+ *in0 = _mm256_srai_epi32(temp0, cos_bit);
+ const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+ const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+ __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+ temp1 = _mm256_add_epi32(temp1, _r);
+ *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
new file mode 100644
index 000000000..6aae7ce1e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -0,0 +1,2889 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).
+
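+// 4-point DCT on a 4x4 block: the input rows are interleaved with unpacklo so
+// each output row is produced by a single _mm_madd_epi16 against a packed
+// cosine pair.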
+static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ __m128i u[4], v[4];
+
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[3], input[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+ u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0
+ u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2
+ u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1
+ u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3
+
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[3], __rounding);
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u[0], u[1]);
+ output[1] = _mm_packs_epi32(u[2], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
+static void fdct8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+
+ // stage 1
+ __m128i x1[4];
+ x1[0] = _mm_adds_epi16(input[0], input[3]);
+ x1[3] = _mm_subs_epi16(input[0], input[3]);
+ x1[1] = _mm_adds_epi16(input[1], input[2]);
+ x1[2] = _mm_subs_epi16(input[1], input[2]);
+
+ // stage 2
+ __m128i x2[4];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]);
+
+ // stage 3
+ output[0] = x2[0];
+ output[1] = x2[2];
+ output[2] = x2[1];
+ output[3] = x2[3];
+}
+
+static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5],
+ &x1[6], &x2[5], &x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0],
+ &x2[1], &x3[0], &x3[1]);
+ btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2],
+ &x2[3], &x3[2], &x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4],
+ &x3[7], &x4[4], &x4[7]);
+ btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5],
+ &x3[6], &x4[5], &x4[6]);
+
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+static void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3
+ __m128i x3[8];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]);
+
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ // stage 1
+ __m128i x1[16];
+ x1[0] = _mm_adds_epi16(input[0], input[15]);
+ x1[15] = _mm_subs_epi16(input[0], input[15]);
+ x1[1] = _mm_adds_epi16(input[1], input[14]);
+ x1[14] = _mm_subs_epi16(input[1], input[14]);
+ x1[2] = _mm_adds_epi16(input[2], input[13]);
+ x1[13] = _mm_subs_epi16(input[2], input[13]);
+ x1[3] = _mm_adds_epi16(input[3], input[12]);
+ x1[12] = _mm_subs_epi16(input[3], input[12]);
+ x1[4] = _mm_adds_epi16(input[4], input[11]);
+ x1[11] = _mm_subs_epi16(input[4], input[11]);
+ x1[5] = _mm_adds_epi16(input[5], input[10]);
+ x1[10] = _mm_subs_epi16(input[5], input[10]);
+ x1[6] = _mm_adds_epi16(input[6], input[9]);
+ x1[9] = _mm_subs_epi16(input[6], input[9]);
+ x1[7] = _mm_adds_epi16(input[7], input[8]);
+ x1[8] = _mm_subs_epi16(input[7], input[8]);
+
+ // stage 2
+ __m128i x2[16];
+ x2[0] = _mm_adds_epi16(x1[0], x1[7]);
+ x2[7] = _mm_subs_epi16(x1[0], x1[7]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[6]);
+ x2[6] = _mm_subs_epi16(x1[1], x1[6]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[5]);
+ x2[5] = _mm_subs_epi16(x1[2], x1[5]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[4]);
+ x2[4] = _mm_subs_epi16(x1[3], x1[4]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]);
+ x2[14] = x1[14];
+ x2[15] = x1[15];
+
+ // stage 3
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[0], x2[3]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[1], x2[2]);
+ x3[4] = x2[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]);
+ x3[7] = x2[7];
+ x3[8] = _mm_adds_epi16(x2[8], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[8], x2[11]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[9], x2[10]);
+ x3[12] = _mm_subs_epi16(x2[15], x2[12]);
+ x3[15] = _mm_adds_epi16(x2[15], x2[12]);
+ x3[13] = _mm_subs_epi16(x2[14], x2[13]);
+ x3[14] = _mm_adds_epi16(x2[14], x2[13]);
+
+ // stage 4
+ __m128i x4[16];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]);
+ x4[4] = _mm_adds_epi16(x3[4], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[4], x3[5]);
+ x4[6] = _mm_subs_epi16(x3[7], x3[6]);
+ x4[7] = _mm_adds_epi16(x3[7], x3[6]);
+ x4[8] = x3[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]);
+ x4[11] = x3[11];
+ x4[12] = x3[12];
+ x4[15] = x3[15];
+
+ // stage 5
+ __m128i x5[16];
+ x5[0] = x4[0];
+ x5[1] = x4[1];
+ x5[2] = x4[2];
+ x5[3] = x4[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[9]);
+ x5[9] = _mm_subs_epi16(x4[8], x4[9]);
+ x5[10] = _mm_subs_epi16(x4[11], x4[10]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[10]);
+ x5[12] = _mm_adds_epi16(x4[12], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[12], x4[13]);
+ x5[14] = _mm_subs_epi16(x4[15], x4[14]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[14]);
+
+ // stage 6
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]);
+
+ // stage 7
+ output[0] = x6[0];
+ output[1] = x6[8];
+ output[2] = x6[4];
+ output[3] = x6[12];
+ output[4] = x6[2];
+ output[5] = x6[10];
+ output[6] = x6[6];
+ output[7] = x6[14];
+ output[8] = x6[1];
+ output[9] = x6[9];
+ output[10] = x6[5];
+ output[11] = x6[13];
+ output[12] = x6[3];
+ output[13] = x6[11];
+ output[14] = x6[7];
+ output[15] = x6[15];
+}
+
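+// 32-point forward DCT operating on columns of 8 16-bit lanes per __m128i.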
+void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+
+ // stage 1
+ __m128i x1[32];
+ x1[0] = _mm_adds_epi16(input[0], input[31]);
+ x1[31] = _mm_subs_epi16(input[0], input[31]);
+ x1[1] = _mm_adds_epi16(input[1], input[30]);
+ x1[30] = _mm_subs_epi16(input[1], input[30]);
+ x1[2] = _mm_adds_epi16(input[2], input[29]);
+ x1[29] = _mm_subs_epi16(input[2], input[29]);
+ x1[3] = _mm_adds_epi16(input[3], input[28]);
+ x1[28] = _mm_subs_epi16(input[3], input[28]);
+ x1[4] = _mm_adds_epi16(input[4], input[27]);
+ x1[27] = _mm_subs_epi16(input[4], input[27]);
+ x1[5] = _mm_adds_epi16(input[5], input[26]);
+ x1[26] = _mm_subs_epi16(input[5], input[26]);
+ x1[6] = _mm_adds_epi16(input[6], input[25]);
+ x1[25] = _mm_subs_epi16(input[6], input[25]);
+ x1[7] = _mm_adds_epi16(input[7], input[24]);
+ x1[24] = _mm_subs_epi16(input[7], input[24]);
+ x1[8] = _mm_adds_epi16(input[8], input[23]);
+ x1[23] = _mm_subs_epi16(input[8], input[23]);
+ x1[9] = _mm_adds_epi16(input[9], input[22]);
+ x1[22] = _mm_subs_epi16(input[9], input[22]);
+ x1[10] = _mm_adds_epi16(input[10], input[21]);
+ x1[21] = _mm_subs_epi16(input[10], input[21]);
+ x1[11] = _mm_adds_epi16(input[11], input[20]);
+ x1[20] = _mm_subs_epi16(input[11], input[20]);
+ x1[12] = _mm_adds_epi16(input[12], input[19]);
+ x1[19] = _mm_subs_epi16(input[12], input[19]);
+ x1[13] = _mm_adds_epi16(input[13], input[18]);
+ x1[18] = _mm_subs_epi16(input[13], input[18]);
+ x1[14] = _mm_adds_epi16(input[14], input[17]);
+ x1[17] = _mm_subs_epi16(input[14], input[17]);
+ x1[15] = _mm_adds_epi16(input[15], input[16]);
+ x1[16] = _mm_subs_epi16(input[15], input[16]);
+
+ // stage 2
+ __m128i x2[32];
+ x2[0] = _mm_adds_epi16(x1[0], x1[15]);
+ x2[15] = _mm_subs_epi16(x1[0], x1[15]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[14]);
+ x2[14] = _mm_subs_epi16(x1[1], x1[14]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[13]);
+ x2[13] = _mm_subs_epi16(x1[2], x1[13]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[12]);
+ x2[12] = _mm_subs_epi16(x1[3], x1[12]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[11]);
+ x2[11] = _mm_subs_epi16(x1[4], x1[11]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[10]);
+ x2[10] = _mm_subs_epi16(x1[5], x1[10]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[9]);
+ x2[9] = _mm_subs_epi16(x1[6], x1[9]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[8]);
+ x2[8] = _mm_subs_epi16(x1[7], x1[8]);
+ x2[16] = x1[16];
+ x2[17] = x1[17];
+ x2[18] = x1[18];
+ x2[19] = x1[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]);
+ x2[28] = x1[28];
+ x2[29] = x1[29];
+ x2[30] = x1[30];
+ x2[31] = x1[31];
+
+ // stage 3
+ __m128i x3[32];
+ x3[0] = _mm_adds_epi16(x2[0], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[0], x2[7]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[1], x2[6]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[2], x2[5]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[4]);
+ x3[4] = _mm_subs_epi16(x2[3], x2[4]);
+ x3[8] = x2[8];
+ x3[9] = x2[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]);
+ x3[14] = x2[14];
+ x3[15] = x2[15];
+ x3[16] = _mm_adds_epi16(x2[16], x2[23]);
+ x3[23] = _mm_subs_epi16(x2[16], x2[23]);
+ x3[17] = _mm_adds_epi16(x2[17], x2[22]);
+ x3[22] = _mm_subs_epi16(x2[17], x2[22]);
+ x3[18] = _mm_adds_epi16(x2[18], x2[21]);
+ x3[21] = _mm_subs_epi16(x2[18], x2[21]);
+ x3[19] = _mm_adds_epi16(x2[19], x2[20]);
+ x3[20] = _mm_subs_epi16(x2[19], x2[20]);
+ x3[24] = _mm_subs_epi16(x2[31], x2[24]);
+ x3[31] = _mm_adds_epi16(x2[31], x2[24]);
+ x3[25] = _mm_subs_epi16(x2[30], x2[25]);
+ x3[30] = _mm_adds_epi16(x2[30], x2[25]);
+ x3[26] = _mm_subs_epi16(x2[29], x2[26]);
+ x3[29] = _mm_adds_epi16(x2[29], x2[26]);
+ x3[27] = _mm_subs_epi16(x2[28], x2[27]);
+ x3[28] = _mm_adds_epi16(x2[28], x2[27]);
+
+ // stage 4
+ __m128i x4[32];
+ x4[0] = _mm_adds_epi16(x3[0], x3[3]);
+ x4[3] = _mm_subs_epi16(x3[0], x3[3]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[2]);
+ x4[2] = _mm_subs_epi16(x3[1], x3[2]);
+ x4[4] = x3[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]);
+ x4[7] = x3[7];
+ x4[8] = _mm_adds_epi16(x3[8], x3[11]);
+ x4[11] = _mm_subs_epi16(x3[8], x3[11]);
+ x4[9] = _mm_adds_epi16(x3[9], x3[10]);
+ x4[10] = _mm_subs_epi16(x3[9], x3[10]);
+ x4[12] = _mm_subs_epi16(x3[15], x3[12]);
+ x4[15] = _mm_adds_epi16(x3[15], x3[12]);
+ x4[13] = _mm_subs_epi16(x3[14], x3[13]);
+ x4[14] = _mm_adds_epi16(x3[14], x3[13]);
+ x4[16] = x3[16];
+ x4[17] = x3[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]);
+ x4[22] = x3[22];
+ x4[23] = x3[23];
+ x4[24] = x3[24];
+ x4[25] = x3[25];
+ x4[30] = x3[30];
+ x4[31] = x3[31];
+
+ // stage 5
+ __m128i x5[32];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]);
+ x5[4] = _mm_adds_epi16(x4[4], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[4], x4[5]);
+ x5[6] = _mm_subs_epi16(x4[7], x4[6]);
+ x5[7] = _mm_adds_epi16(x4[7], x4[6]);
+ x5[8] = x4[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]);
+ x5[11] = x4[11];
+ x5[12] = x4[12];
+ x5[15] = x4[15];
+ x5[16] = _mm_adds_epi16(x4[16], x4[19]);
+ x5[19] = _mm_subs_epi16(x4[16], x4[19]);
+ x5[17] = _mm_adds_epi16(x4[17], x4[18]);
+ x5[18] = _mm_subs_epi16(x4[17], x4[18]);
+ x5[20] = _mm_subs_epi16(x4[23], x4[20]);
+ x5[23] = _mm_adds_epi16(x4[23], x4[20]);
+ x5[21] = _mm_subs_epi16(x4[22], x4[21]);
+ x5[22] = _mm_adds_epi16(x4[22], x4[21]);
+ x5[24] = _mm_adds_epi16(x4[24], x4[27]);
+ x5[27] = _mm_subs_epi16(x4[24], x4[27]);
+ x5[25] = _mm_adds_epi16(x4[25], x4[26]);
+ x5[26] = _mm_subs_epi16(x4[25], x4[26]);
+ x5[28] = _mm_subs_epi16(x4[31], x4[28]);
+ x5[31] = _mm_adds_epi16(x4[31], x4[28]);
+ x5[29] = _mm_subs_epi16(x4[30], x4[29]);
+ x5[30] = _mm_adds_epi16(x4[30], x4[29]);
+
+ // stage 6
+ __m128i x6[32];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]);
+ x6[8] = _mm_adds_epi16(x5[8], x5[9]);
+ x6[9] = _mm_subs_epi16(x5[8], x5[9]);
+ x6[10] = _mm_subs_epi16(x5[11], x5[10]);
+ x6[11] = _mm_adds_epi16(x5[11], x5[10]);
+ x6[12] = _mm_adds_epi16(x5[12], x5[13]);
+ x6[13] = _mm_subs_epi16(x5[12], x5[13]);
+ x6[14] = _mm_subs_epi16(x5[15], x5[14]);
+ x6[15] = _mm_adds_epi16(x5[15], x5[14]);
+ x6[16] = x5[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]);
+ x6[19] = x5[19];
+ x6[20] = x5[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]);
+ x6[23] = x5[23];
+ x6[24] = x5[24];
+ x6[27] = x5[27];
+ x6[28] = x5[28];
+ x6[31] = x5[31];
+
+ // stage 7
+ __m128i x7[32];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ x7[4] = x6[4];
+ x7[5] = x6[5];
+ x7[6] = x6[6];
+ x7[7] = x6[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]);
+ x7[16] = _mm_adds_epi16(x6[16], x6[17]);
+ x7[17] = _mm_subs_epi16(x6[16], x6[17]);
+ x7[18] = _mm_subs_epi16(x6[19], x6[18]);
+ x7[19] = _mm_adds_epi16(x6[19], x6[18]);
+ x7[20] = _mm_adds_epi16(x6[20], x6[21]);
+ x7[21] = _mm_subs_epi16(x6[20], x6[21]);
+ x7[22] = _mm_subs_epi16(x6[23], x6[22]);
+ x7[23] = _mm_adds_epi16(x6[23], x6[22]);
+ x7[24] = _mm_adds_epi16(x6[24], x6[25]);
+ x7[25] = _mm_subs_epi16(x6[24], x6[25]);
+ x7[26] = _mm_subs_epi16(x6[27], x6[26]);
+ x7[27] = _mm_adds_epi16(x6[27], x6[26]);
+ x7[28] = _mm_adds_epi16(x6[28], x6[29]);
+ x7[29] = _mm_subs_epi16(x6[28], x6[29]);
+ x7[30] = _mm_subs_epi16(x6[31], x6[30]);
+ x7[31] = _mm_adds_epi16(x6[31], x6[30]);
+
+ // stage 8
+ __m128i x8[32];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ x8[8] = x7[8];
+ x8[9] = x7[9];
+ x8[10] = x7[10];
+ x8[11] = x7[11];
+ x8[12] = x7[12];
+ x8[13] = x7[13];
+ x8[14] = x7[14];
+ x8[15] = x7[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]);
+
+ // stage 9
+ output[0] = x8[0];
+ output[1] = x8[16];
+ output[2] = x8[8];
+ output[3] = x8[24];
+ output[4] = x8[4];
+ output[5] = x8[20];
+ output[6] = x8[12];
+ output[7] = x8[28];
+ output[8] = x8[2];
+ output[9] = x8[18];
+ output[10] = x8[10];
+ output[11] = x8[26];
+ output[12] = x8[6];
+ output[13] = x8[22];
+ output[14] = x8[14];
+ output[15] = x8[30];
+ output[16] = x8[1];
+ output[17] = x8[17];
+ output[18] = x8[9];
+ output[19] = x8[25];
+ output[20] = x8[5];
+ output[21] = x8[21];
+ output[22] = x8[13];
+ output[23] = x8[29];
+ output[24] = x8[3];
+ output[25] = x8[19];
+ output[26] = x8[11];
+ output[27] = x8[27];
+ output[28] = x8[7];
+ output[29] = x8[23];
+ output[30] = x8[15];
+ output[31] = x8[31];
+}
+
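+// Forward 64-point DCT over 8 columns of 16-bit coefficients, structured as
+// eleven stages of adds/subs and btf_16_sse2 butterflies; stage 11 reorders
+// the intermediate results into the final coefficient order.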
+void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+ __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]);
+ __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]);
+ __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]);
+ __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]);
+ __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]);
+ __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]);
+ __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]);
+ __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]);
+ __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]);
+ __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]);
+ __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]);
+ __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]);
+ __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]);
+ __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]);
+ __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]);
+ __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]);
+ __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]);
+ __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]);
+ __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]);
+ __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]);
+ __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]);
+ __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]);
+ __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]);
+ __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]);
+ __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]);
+ __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]);
+ __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]);
+ __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]);
+ __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]);
+ __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]);
+ __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]);
+ __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_adds_epi16(input[0], input[63]);
+ x1[63] = _mm_subs_epi16(input[0], input[63]);
+ x1[1] = _mm_adds_epi16(input[1], input[62]);
+ x1[62] = _mm_subs_epi16(input[1], input[62]);
+ x1[2] = _mm_adds_epi16(input[2], input[61]);
+ x1[61] = _mm_subs_epi16(input[2], input[61]);
+ x1[3] = _mm_adds_epi16(input[3], input[60]);
+ x1[60] = _mm_subs_epi16(input[3], input[60]);
+ x1[4] = _mm_adds_epi16(input[4], input[59]);
+ x1[59] = _mm_subs_epi16(input[4], input[59]);
+ x1[5] = _mm_adds_epi16(input[5], input[58]);
+ x1[58] = _mm_subs_epi16(input[5], input[58]);
+ x1[6] = _mm_adds_epi16(input[6], input[57]);
+ x1[57] = _mm_subs_epi16(input[6], input[57]);
+ x1[7] = _mm_adds_epi16(input[7], input[56]);
+ x1[56] = _mm_subs_epi16(input[7], input[56]);
+ x1[8] = _mm_adds_epi16(input[8], input[55]);
+ x1[55] = _mm_subs_epi16(input[8], input[55]);
+ x1[9] = _mm_adds_epi16(input[9], input[54]);
+ x1[54] = _mm_subs_epi16(input[9], input[54]);
+ x1[10] = _mm_adds_epi16(input[10], input[53]);
+ x1[53] = _mm_subs_epi16(input[10], input[53]);
+ x1[11] = _mm_adds_epi16(input[11], input[52]);
+ x1[52] = _mm_subs_epi16(input[11], input[52]);
+ x1[12] = _mm_adds_epi16(input[12], input[51]);
+ x1[51] = _mm_subs_epi16(input[12], input[51]);
+ x1[13] = _mm_adds_epi16(input[13], input[50]);
+ x1[50] = _mm_subs_epi16(input[13], input[50]);
+ x1[14] = _mm_adds_epi16(input[14], input[49]);
+ x1[49] = _mm_subs_epi16(input[14], input[49]);
+ x1[15] = _mm_adds_epi16(input[15], input[48]);
+ x1[48] = _mm_subs_epi16(input[15], input[48]);
+ x1[16] = _mm_adds_epi16(input[16], input[47]);
+ x1[47] = _mm_subs_epi16(input[16], input[47]);
+ x1[17] = _mm_adds_epi16(input[17], input[46]);
+ x1[46] = _mm_subs_epi16(input[17], input[46]);
+ x1[18] = _mm_adds_epi16(input[18], input[45]);
+ x1[45] = _mm_subs_epi16(input[18], input[45]);
+ x1[19] = _mm_adds_epi16(input[19], input[44]);
+ x1[44] = _mm_subs_epi16(input[19], input[44]);
+ x1[20] = _mm_adds_epi16(input[20], input[43]);
+ x1[43] = _mm_subs_epi16(input[20], input[43]);
+ x1[21] = _mm_adds_epi16(input[21], input[42]);
+ x1[42] = _mm_subs_epi16(input[21], input[42]);
+ x1[22] = _mm_adds_epi16(input[22], input[41]);
+ x1[41] = _mm_subs_epi16(input[22], input[41]);
+ x1[23] = _mm_adds_epi16(input[23], input[40]);
+ x1[40] = _mm_subs_epi16(input[23], input[40]);
+ x1[24] = _mm_adds_epi16(input[24], input[39]);
+ x1[39] = _mm_subs_epi16(input[24], input[39]);
+ x1[25] = _mm_adds_epi16(input[25], input[38]);
+ x1[38] = _mm_subs_epi16(input[25], input[38]);
+ x1[26] = _mm_adds_epi16(input[26], input[37]);
+ x1[37] = _mm_subs_epi16(input[26], input[37]);
+ x1[27] = _mm_adds_epi16(input[27], input[36]);
+ x1[36] = _mm_subs_epi16(input[27], input[36]);
+ x1[28] = _mm_adds_epi16(input[28], input[35]);
+ x1[35] = _mm_subs_epi16(input[28], input[35]);
+ x1[29] = _mm_adds_epi16(input[29], input[34]);
+ x1[34] = _mm_subs_epi16(input[29], input[34]);
+ x1[30] = _mm_adds_epi16(input[30], input[33]);
+ x1[33] = _mm_subs_epi16(input[30], input[33]);
+ x1[31] = _mm_adds_epi16(input[31], input[32]);
+ x1[32] = _mm_subs_epi16(input[31], input[32]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_adds_epi16(x1[0], x1[31]);
+ x2[31] = _mm_subs_epi16(x1[0], x1[31]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[30]);
+ x2[30] = _mm_subs_epi16(x1[1], x1[30]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[29]);
+ x2[29] = _mm_subs_epi16(x1[2], x1[29]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[28]);
+ x2[28] = _mm_subs_epi16(x1[3], x1[28]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[27]);
+ x2[27] = _mm_subs_epi16(x1[4], x1[27]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[26]);
+ x2[26] = _mm_subs_epi16(x1[5], x1[26]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[25]);
+ x2[25] = _mm_subs_epi16(x1[6], x1[25]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[24]);
+ x2[24] = _mm_subs_epi16(x1[7], x1[24]);
+ x2[8] = _mm_adds_epi16(x1[8], x1[23]);
+ x2[23] = _mm_subs_epi16(x1[8], x1[23]);
+ x2[9] = _mm_adds_epi16(x1[9], x1[22]);
+ x2[22] = _mm_subs_epi16(x1[9], x1[22]);
+ x2[10] = _mm_adds_epi16(x1[10], x1[21]);
+ x2[21] = _mm_subs_epi16(x1[10], x1[21]);
+ x2[11] = _mm_adds_epi16(x1[11], x1[20]);
+ x2[20] = _mm_subs_epi16(x1[11], x1[20]);
+ x2[12] = _mm_adds_epi16(x1[12], x1[19]);
+ x2[19] = _mm_subs_epi16(x1[12], x1[19]);
+ x2[13] = _mm_adds_epi16(x1[13], x1[18]);
+ x2[18] = _mm_subs_epi16(x1[13], x1[18]);
+ x2[14] = _mm_adds_epi16(x1[14], x1[17]);
+ x2[17] = _mm_subs_epi16(x1[14], x1[17]);
+ x2[15] = _mm_adds_epi16(x1[15], x1[16]);
+ x2[16] = _mm_subs_epi16(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_adds_epi16(x2[0], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[0], x2[15]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[1], x2[14]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[13]);
+ x3[13] = _mm_subs_epi16(x2[2], x2[13]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[12]);
+ x3[12] = _mm_subs_epi16(x2[3], x2[12]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[4], x2[11]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[5], x2[10]);
+ x3[6] = _mm_adds_epi16(x2[6], x2[9]);
+ x3[9] = _mm_subs_epi16(x2[6], x2[9]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[8]);
+ x3[8] = _mm_subs_epi16(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_adds_epi16(x2[32], x2[47]);
+ x3[47] = _mm_subs_epi16(x2[32], x2[47]);
+ x3[33] = _mm_adds_epi16(x2[33], x2[46]);
+ x3[46] = _mm_subs_epi16(x2[33], x2[46]);
+ x3[34] = _mm_adds_epi16(x2[34], x2[45]);
+ x3[45] = _mm_subs_epi16(x2[34], x2[45]);
+ x3[35] = _mm_adds_epi16(x2[35], x2[44]);
+ x3[44] = _mm_subs_epi16(x2[35], x2[44]);
+ x3[36] = _mm_adds_epi16(x2[36], x2[43]);
+ x3[43] = _mm_subs_epi16(x2[36], x2[43]);
+ x3[37] = _mm_adds_epi16(x2[37], x2[42]);
+ x3[42] = _mm_subs_epi16(x2[37], x2[42]);
+ x3[38] = _mm_adds_epi16(x2[38], x2[41]);
+ x3[41] = _mm_subs_epi16(x2[38], x2[41]);
+ x3[39] = _mm_adds_epi16(x2[39], x2[40]);
+ x3[40] = _mm_subs_epi16(x2[39], x2[40]);
+ x3[48] = _mm_subs_epi16(x2[63], x2[48]);
+ x3[63] = _mm_adds_epi16(x2[63], x2[48]);
+ x3[49] = _mm_subs_epi16(x2[62], x2[49]);
+ x3[62] = _mm_adds_epi16(x2[62], x2[49]);
+ x3[50] = _mm_subs_epi16(x2[61], x2[50]);
+ x3[61] = _mm_adds_epi16(x2[61], x2[50]);
+ x3[51] = _mm_subs_epi16(x2[60], x2[51]);
+ x3[60] = _mm_adds_epi16(x2[60], x2[51]);
+ x3[52] = _mm_subs_epi16(x2[59], x2[52]);
+ x3[59] = _mm_adds_epi16(x2[59], x2[52]);
+ x3[53] = _mm_subs_epi16(x2[58], x2[53]);
+ x3[58] = _mm_adds_epi16(x2[58], x2[53]);
+ x3[54] = _mm_subs_epi16(x2[57], x2[54]);
+ x3[57] = _mm_adds_epi16(x2[57], x2[54]);
+ x3[55] = _mm_subs_epi16(x2[56], x2[55]);
+ x3[56] = _mm_adds_epi16(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_adds_epi16(x3[0], x3[7]);
+ x4[7] = _mm_subs_epi16(x3[0], x3[7]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[6]);
+ x4[6] = _mm_subs_epi16(x3[1], x3[6]);
+ x4[2] = _mm_adds_epi16(x3[2], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[2], x3[5]);
+ x4[3] = _mm_adds_epi16(x3[3], x3[4]);
+ x4[4] = _mm_subs_epi16(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_adds_epi16(x3[16], x3[23]);
+ x4[23] = _mm_subs_epi16(x3[16], x3[23]);
+ x4[17] = _mm_adds_epi16(x3[17], x3[22]);
+ x4[22] = _mm_subs_epi16(x3[17], x3[22]);
+ x4[18] = _mm_adds_epi16(x3[18], x3[21]);
+ x4[21] = _mm_subs_epi16(x3[18], x3[21]);
+ x4[19] = _mm_adds_epi16(x3[19], x3[20]);
+ x4[20] = _mm_subs_epi16(x3[19], x3[20]);
+ x4[24] = _mm_subs_epi16(x3[31], x3[24]);
+ x4[31] = _mm_adds_epi16(x3[31], x3[24]);
+ x4[25] = _mm_subs_epi16(x3[30], x3[25]);
+ x4[30] = _mm_adds_epi16(x3[30], x3[25]);
+ x4[26] = _mm_subs_epi16(x3[29], x3[26]);
+ x4[29] = _mm_adds_epi16(x3[29], x3[26]);
+ x4[27] = _mm_subs_epi16(x3[28], x3[27]);
+ x4[28] = _mm_adds_epi16(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_adds_epi16(x4[0], x4[3]);
+ x5[3] = _mm_subs_epi16(x4[0], x4[3]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[2]);
+ x5[2] = _mm_subs_epi16(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]);
+ x5[7] = x4[7];
+ x5[8] = _mm_adds_epi16(x4[8], x4[11]);
+ x5[11] = _mm_subs_epi16(x4[8], x4[11]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[10]);
+ x5[10] = _mm_subs_epi16(x4[9], x4[10]);
+ x5[12] = _mm_subs_epi16(x4[15], x4[12]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[12]);
+ x5[13] = _mm_subs_epi16(x4[14], x4[13]);
+ x5[14] = _mm_adds_epi16(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_adds_epi16(x4[32], x4[39]);
+ x5[39] = _mm_subs_epi16(x4[32], x4[39]);
+ x5[33] = _mm_adds_epi16(x4[33], x4[38]);
+ x5[38] = _mm_subs_epi16(x4[33], x4[38]);
+ x5[34] = _mm_adds_epi16(x4[34], x4[37]);
+ x5[37] = _mm_subs_epi16(x4[34], x4[37]);
+ x5[35] = _mm_adds_epi16(x4[35], x4[36]);
+ x5[36] = _mm_subs_epi16(x4[35], x4[36]);
+ x5[40] = _mm_subs_epi16(x4[47], x4[40]);
+ x5[47] = _mm_adds_epi16(x4[47], x4[40]);
+ x5[41] = _mm_subs_epi16(x4[46], x4[41]);
+ x5[46] = _mm_adds_epi16(x4[46], x4[41]);
+ x5[42] = _mm_subs_epi16(x4[45], x4[42]);
+ x5[45] = _mm_adds_epi16(x4[45], x4[42]);
+ x5[43] = _mm_subs_epi16(x4[44], x4[43]);
+ x5[44] = _mm_adds_epi16(x4[44], x4[43]);
+ x5[48] = _mm_adds_epi16(x4[48], x4[55]);
+ x5[55] = _mm_subs_epi16(x4[48], x4[55]);
+ x5[49] = _mm_adds_epi16(x4[49], x4[54]);
+ x5[54] = _mm_subs_epi16(x4[49], x4[54]);
+ x5[50] = _mm_adds_epi16(x4[50], x4[53]);
+ x5[53] = _mm_subs_epi16(x4[50], x4[53]);
+ x5[51] = _mm_adds_epi16(x4[51], x4[52]);
+ x5[52] = _mm_subs_epi16(x4[51], x4[52]);
+ x5[56] = _mm_subs_epi16(x4[63], x4[56]);
+ x5[63] = _mm_adds_epi16(x4[63], x4[56]);
+ x5[57] = _mm_subs_epi16(x4[62], x4[57]);
+ x5[62] = _mm_adds_epi16(x4[62], x4[57]);
+ x5[58] = _mm_subs_epi16(x4[61], x4[58]);
+ x5[61] = _mm_adds_epi16(x4[61], x4[58]);
+ x5[59] = _mm_subs_epi16(x4[60], x4[59]);
+ x5[60] = _mm_adds_epi16(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]);
+ x6[4] = _mm_adds_epi16(x5[4], x5[5]);
+ x6[5] = _mm_subs_epi16(x5[4], x5[5]);
+ x6[6] = _mm_subs_epi16(x5[7], x5[6]);
+ x6[7] = _mm_adds_epi16(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_adds_epi16(x5[16], x5[19]);
+ x6[19] = _mm_subs_epi16(x5[16], x5[19]);
+ x6[17] = _mm_adds_epi16(x5[17], x5[18]);
+ x6[18] = _mm_subs_epi16(x5[17], x5[18]);
+ x6[20] = _mm_subs_epi16(x5[23], x5[20]);
+ x6[23] = _mm_adds_epi16(x5[23], x5[20]);
+ x6[21] = _mm_subs_epi16(x5[22], x5[21]);
+ x6[22] = _mm_adds_epi16(x5[22], x5[21]);
+ x6[24] = _mm_adds_epi16(x5[24], x5[27]);
+ x6[27] = _mm_subs_epi16(x5[24], x5[27]);
+ x6[25] = _mm_adds_epi16(x5[25], x5[26]);
+ x6[26] = _mm_subs_epi16(x5[25], x5[26]);
+ x6[28] = _mm_subs_epi16(x5[31], x5[28]);
+ x6[31] = _mm_adds_epi16(x5[31], x5[28]);
+ x6[29] = _mm_subs_epi16(x5[30], x5[29]);
+ x6[30] = _mm_adds_epi16(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]);
+ x7[8] = _mm_adds_epi16(x6[8], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[8], x6[9]);
+ x7[10] = _mm_subs_epi16(x6[11], x6[10]);
+ x7[11] = _mm_adds_epi16(x6[11], x6[10]);
+ x7[12] = _mm_adds_epi16(x6[12], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[12], x6[13]);
+ x7[14] = _mm_subs_epi16(x6[15], x6[14]);
+ x7[15] = _mm_adds_epi16(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_adds_epi16(x6[32], x6[35]);
+ x7[35] = _mm_subs_epi16(x6[32], x6[35]);
+ x7[33] = _mm_adds_epi16(x6[33], x6[34]);
+ x7[34] = _mm_subs_epi16(x6[33], x6[34]);
+ x7[36] = _mm_subs_epi16(x6[39], x6[36]);
+ x7[39] = _mm_adds_epi16(x6[39], x6[36]);
+ x7[37] = _mm_subs_epi16(x6[38], x6[37]);
+ x7[38] = _mm_adds_epi16(x6[38], x6[37]);
+ x7[40] = _mm_adds_epi16(x6[40], x6[43]);
+ x7[43] = _mm_subs_epi16(x6[40], x6[43]);
+ x7[41] = _mm_adds_epi16(x6[41], x6[42]);
+ x7[42] = _mm_subs_epi16(x6[41], x6[42]);
+ x7[44] = _mm_subs_epi16(x6[47], x6[44]);
+ x7[47] = _mm_adds_epi16(x6[47], x6[44]);
+ x7[45] = _mm_subs_epi16(x6[46], x6[45]);
+ x7[46] = _mm_adds_epi16(x6[46], x6[45]);
+ x7[48] = _mm_adds_epi16(x6[48], x6[51]);
+ x7[51] = _mm_subs_epi16(x6[48], x6[51]);
+ x7[49] = _mm_adds_epi16(x6[49], x6[50]);
+ x7[50] = _mm_subs_epi16(x6[49], x6[50]);
+ x7[52] = _mm_subs_epi16(x6[55], x6[52]);
+ x7[55] = _mm_adds_epi16(x6[55], x6[52]);
+ x7[53] = _mm_subs_epi16(x6[54], x6[53]);
+ x7[54] = _mm_adds_epi16(x6[54], x6[53]);
+ x7[56] = _mm_adds_epi16(x6[56], x6[59]);
+ x7[59] = _mm_subs_epi16(x6[56], x6[59]);
+ x7[57] = _mm_adds_epi16(x6[57], x6[58]);
+ x7[58] = _mm_subs_epi16(x6[57], x6[58]);
+ x7[60] = _mm_subs_epi16(x6[63], x6[60]);
+ x7[63] = _mm_adds_epi16(x6[63], x6[60]);
+ x7[61] = _mm_subs_epi16(x6[62], x6[61]);
+ x7[62] = _mm_adds_epi16(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]);
+ x8[16] = _mm_adds_epi16(x7[16], x7[17]);
+ x8[17] = _mm_subs_epi16(x7[16], x7[17]);
+ x8[18] = _mm_subs_epi16(x7[19], x7[18]);
+ x8[19] = _mm_adds_epi16(x7[19], x7[18]);
+ x8[20] = _mm_adds_epi16(x7[20], x7[21]);
+ x8[21] = _mm_subs_epi16(x7[20], x7[21]);
+ x8[22] = _mm_subs_epi16(x7[23], x7[22]);
+ x8[23] = _mm_adds_epi16(x7[23], x7[22]);
+ x8[24] = _mm_adds_epi16(x7[24], x7[25]);
+ x8[25] = _mm_subs_epi16(x7[24], x7[25]);
+ x8[26] = _mm_subs_epi16(x7[27], x7[26]);
+ x8[27] = _mm_adds_epi16(x7[27], x7[26]);
+ x8[28] = _mm_adds_epi16(x7[28], x7[29]);
+ x8[29] = _mm_subs_epi16(x7[28], x7[29]);
+ x8[30] = _mm_subs_epi16(x7[31], x7[30]);
+ x8[31] = _mm_adds_epi16(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]);
+ x9[32] = _mm_adds_epi16(x8[32], x8[33]);
+ x9[33] = _mm_subs_epi16(x8[32], x8[33]);
+ x9[34] = _mm_subs_epi16(x8[35], x8[34]);
+ x9[35] = _mm_adds_epi16(x8[35], x8[34]);
+ x9[36] = _mm_adds_epi16(x8[36], x8[37]);
+ x9[37] = _mm_subs_epi16(x8[36], x8[37]);
+ x9[38] = _mm_subs_epi16(x8[39], x8[38]);
+ x9[39] = _mm_adds_epi16(x8[39], x8[38]);
+ x9[40] = _mm_adds_epi16(x8[40], x8[41]);
+ x9[41] = _mm_subs_epi16(x8[40], x8[41]);
+ x9[42] = _mm_subs_epi16(x8[43], x8[42]);
+ x9[43] = _mm_adds_epi16(x8[43], x8[42]);
+ x9[44] = _mm_adds_epi16(x8[44], x8[45]);
+ x9[45] = _mm_subs_epi16(x8[44], x8[45]);
+ x9[46] = _mm_subs_epi16(x8[47], x8[46]);
+ x9[47] = _mm_adds_epi16(x8[47], x8[46]);
+ x9[48] = _mm_adds_epi16(x8[48], x8[49]);
+ x9[49] = _mm_subs_epi16(x8[48], x8[49]);
+ x9[50] = _mm_subs_epi16(x8[51], x8[50]);
+ x9[51] = _mm_adds_epi16(x8[51], x8[50]);
+ x9[52] = _mm_adds_epi16(x8[52], x8[53]);
+ x9[53] = _mm_subs_epi16(x8[52], x8[53]);
+ x9[54] = _mm_subs_epi16(x8[55], x8[54]);
+ x9[55] = _mm_adds_epi16(x8[55], x8[54]);
+ x9[56] = _mm_adds_epi16(x8[56], x8[57]);
+ x9[57] = _mm_subs_epi16(x8[56], x8[57]);
+ x9[58] = _mm_subs_epi16(x8[59], x8[58]);
+ x9[59] = _mm_adds_epi16(x8[59], x8[58]);
+ x9[60] = _mm_adds_epi16(x8[60], x8[61]);
+ x9[61] = _mm_subs_epi16(x8[60], x8[61]);
+ x9[62] = _mm_subs_epi16(x8[63], x8[62]);
+ x9[63] = _mm_adds_epi16(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]);
+ btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]);
+ btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]);
+ btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]);
+ btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]);
+ btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]);
+ btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]);
+ btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]);
+ btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]);
+ btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]);
+ btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]);
+ btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]);
+ btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]);
+ btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]);
+ btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]);
+ btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]);
+
+ // stage 11
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
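+// Forward 4-point ADST on 4-wide columns using the sinpi-based formulation:
+// products are accumulated in 32 bits with _mm_madd_epi16, rounded and
+// shifted by cos_bit, then packed back to 16 bits.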
+static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+ const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+ const __m128i __zero = _mm_set1_epi16(0);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+ __m128i u[8], v[8];
+
+ u[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u[1] = _mm_unpacklo_epi16(input[2], input[3]);
+ u[2] = _mm_unpacklo_epi16(in7, __zero);
+ u[3] = _mm_unpacklo_epi16(input[2], __zero);
+ u[4] = _mm_unpacklo_epi16(input[3], __zero);
+
+ v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03);
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], __rounding);
+ v[1] = _mm_add_epi32(u[1], __rounding);
+ v[2] = _mm_add_epi32(u[2], __rounding);
+ v[3] = _mm_add_epi32(u[6], __rounding);
+
+ u[0] = _mm_srai_epi32(v[0], cos_bit);
+ u[1] = _mm_srai_epi32(v[1], cos_bit);
+ u[2] = _mm_srai_epi32(v[2], cos_bit);
+ u[3] = _mm_srai_epi32(v[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u[0], u[2]);
+ output[1] = _mm_packs_epi32(u[1], u[3]);
+ output[2] = _mm_srli_si128(output[0], 8);
+ output[3] = _mm_srli_si128(output[1], 8);
+}
+
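+// Forward 8-point ADST on 4-wide columns.  Same butterfly network as the
+// 8-wide fadst8x8_new_sse2 below, but uses the width-4 butterfly helper
+// btf_16_w4_sse2 in place of btf_16_sse2.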
+static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2],
+ &x1[3], &x2[2], &x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6],
+ &x1[7], &x2[6], &x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4],
+ &x3[5], &x4[4], &x4[5]);
+ btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6],
+ &x3[7], &x4[6], &x4[7]);
+
+ // stage 5
+ __m128i x5[8];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m128i x6[8];
+ btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0],
+ &x5[1], &x6[0], &x6[1]);
+ btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2],
+ &x5[3], &x6[2], &x6[3]);
+ btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4],
+ &x5[5], &x6[4], &x6[5]);
+ btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6],
+ &x5[7], &x6[6], &x6[7]);
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
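+// Forward 4-point ADST on 8 columns: the same sinpi arithmetic as
+// fadst4x4_new_sse2, duplicated for the low and high halves of each
+// __m128i (u_lo/u_hi, v_lo/v_hi) so that all eight columns are covered.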
+static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+ const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+ const __m128i __zero = _mm_set1_epi16(0);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ const __m128i in7 = _mm_add_epi16(input[0], input[1]);
+ __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8];
+
+ u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]);
+ u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]);
+ u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]);
+ u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]);
+ u_lo[2] = _mm_unpacklo_epi16(in7, __zero);
+ u_hi[2] = _mm_unpackhi_epi16(in7, __zero);
+ u_lo[3] = _mm_unpacklo_epi16(input[2], __zero);
+ u_hi[3] = _mm_unpackhi_epi16(input[2], __zero);
+ u_lo[4] = _mm_unpacklo_epi16(input[3], __zero);
+ u_hi[4] = _mm_unpackhi_epi16(input[3], __zero);
+
+ v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2
+ v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2
+ v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5
+ v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5
+ v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1
+ v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1
+ v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3
+ v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3
+ v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6
+ v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6
+ v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4
+ v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4
+ v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03);
+ v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03);
+
+ u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]);
+ u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]);
+ u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]);
+ u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]);
+ u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]);
+ u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]);
+ u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]);
+ u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]);
+ u_lo[4] = _mm_slli_epi32(v_lo[5], 2);
+ u_hi[4] = _mm_slli_epi32(v_hi[5], 2);
+ u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]);
+ u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]);
+ u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]);
+ u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]);
+
+ v_lo[0] = _mm_add_epi32(u_lo[0], __rounding);
+ v_hi[0] = _mm_add_epi32(u_hi[0], __rounding);
+ v_lo[1] = _mm_add_epi32(u_lo[1], __rounding);
+ v_hi[1] = _mm_add_epi32(u_hi[1], __rounding);
+ v_lo[2] = _mm_add_epi32(u_lo[2], __rounding);
+ v_hi[2] = _mm_add_epi32(u_hi[2], __rounding);
+ v_lo[3] = _mm_add_epi32(u_lo[6], __rounding);
+ v_hi[3] = _mm_add_epi32(u_hi[6], __rounding);
+
+ u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit);
+ u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit);
+ u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit);
+ u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit);
+ u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit);
+ u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit);
+ u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit);
+ u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit);
+
+ output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]);
+ output[1] = _mm_packs_epi32(u_lo[1], u_hi[1]);
+ output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]);
+ output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]);
+}
+
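+// Forward 8-point ADST on 8 columns of 16-bit coefficients: sign flips and
+// input reordering in stage 1, then alternating butterfly (btf_16_sse2) and
+// add/sub stages, with the output permutation in stage 7.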
+static void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+
+ // stage 3
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+
+ // stage 5
+ __m128i x5[8];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m128i x6[8];
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]);
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
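+// Forward 16-point ADST on 8 columns, following the same pattern as
+// fadst8x8_new_sse2 but with nine stages and a wider set of cospi pairs.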
+static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ __m128i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[15]);
+ x1[2] = _mm_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm_subs_epi16(__zero, input[11]);
+ x1[8] = _mm_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm_subs_epi16(__zero, input[13]);
+ x1[14] = _mm_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ __m128i x2[16];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]);
+ x2[12] = x1[12];
+ x2[13] = x1[13];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]);
+
+ // stage 3
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+ x3[8] = _mm_adds_epi16(x2[8], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[8], x2[10]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[9], x2[11]);
+ x3[12] = _mm_adds_epi16(x2[12], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[12], x2[14]);
+ x3[13] = _mm_adds_epi16(x2[13], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[13], x2[15]);
+
+ // stage 4
+ __m128i x4[16];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ x4[10] = x3[10];
+ x4[11] = x3[11];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]);
+
+ // stage 5
+ __m128i x5[16];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[12]);
+ x5[12] = _mm_subs_epi16(x4[8], x4[12]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[9], x4[13]);
+ x5[10] = _mm_adds_epi16(x4[10], x4[14]);
+ x5[14] = _mm_subs_epi16(x4[10], x4[14]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[15]);
+ x5[15] = _mm_subs_epi16(x4[11], x4[15]);
+
+ // stage 6
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]);
+
+ // stage 7
+ __m128i x7[16];
+ x7[0] = _mm_adds_epi16(x6[0], x6[8]);
+ x7[8] = _mm_subs_epi16(x6[0], x6[8]);
+ x7[1] = _mm_adds_epi16(x6[1], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[1], x6[9]);
+ x7[2] = _mm_adds_epi16(x6[2], x6[10]);
+ x7[10] = _mm_subs_epi16(x6[2], x6[10]);
+ x7[3] = _mm_adds_epi16(x6[3], x6[11]);
+ x7[11] = _mm_subs_epi16(x6[3], x6[11]);
+ x7[4] = _mm_adds_epi16(x6[4], x6[12]);
+ x7[12] = _mm_subs_epi16(x6[4], x6[12]);
+ x7[5] = _mm_adds_epi16(x6[5], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[5], x6[13]);
+ x7[6] = _mm_adds_epi16(x6[6], x6[14]);
+ x7[14] = _mm_subs_epi16(x6[6], x6[14]);
+ x7[7] = _mm_adds_epi16(x6[7], x6[15]);
+ x7[15] = _mm_subs_epi16(x6[7], x6[15]);
+
+ // stage 8
+ __m128i x8[16];
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]);
+
+ // stage 9
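+ // Reorder the stage-8 butterfly results into the natural ADST-16 output order.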
+ output[0] = x8[1];
+ output[1] = x8[14];
+ output[2] = x8[3];
+ output[3] = x8[12];
+ output[4] = x8[5];
+ output[5] = x8[10];
+ output[6] = x8[7];
+ output[7] = x8[8];
+ output[8] = x8[9];
+ output[9] = x8[6];
+ output[10] = x8[11];
+ output[11] = x8[4];
+ output[12] = x8[13];
+ output[13] = x8[2];
+ output[14] = x8[15];
+ output[15] = x8[0];
+}
+
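+// 1-D kernel lookup tables: for each 2-D TX_TYPE, select the column and row
+// kernels (DCT, ADST, or identity; the FLIPADST variants reuse the ADST
+// kernels together with flipped loads/stores in the 2-D wrappers).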
+static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fadst4x4_new_sse2, // ADST_DCT
+ fdct4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fadst4x4_new_sse2, // FLIPADST_DCT
+ fdct4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fdct4x4_new_sse2, // V_DCT
+ fidentity4x4_new_sse2, // H_DCT
+ fadst4x4_new_sse2, // V_ADST
+ fidentity4x4_new_sse2, // H_ADST
+ fadst4x4_new_sse2, // V_FLIPADST
+ fidentity4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fdct4x4_new_sse2, // ADST_DCT
+ fadst4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fdct4x4_new_sse2, // FLIPADST_DCT
+ fadst4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fidentity4x4_new_sse2, // V_DCT
+ fdct4x4_new_sse2, // H_DCT
+ fidentity4x4_new_sse2, // V_ADST
+ fadst4x4_new_sse2, // H_ADST
+ fidentity4x4_new_sse2, // V_FLIPADST
+ fadst4x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fadst4x8_new_sse2, // ADST_DCT
+ fdct4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fadst4x8_new_sse2, // FLIPADST_DCT
+ fdct4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct4x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst4x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst4x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fdct8x4_new_sse2, // ADST_DCT
+ fadst8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fdct8x4_new_sse2, // FLIPADST_DCT
+ fadst8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fidentity8x4_new_sse2, // V_DCT
+ fdct8x4_new_sse2, // H_DCT
+ fidentity8x4_new_sse2, // V_ADST
+ fadst8x4_new_sse2, // H_ADST
+ fidentity8x4_new_sse2, // V_FLIPADST
+ fadst8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fadst8x4_new_sse2, // ADST_DCT
+ fdct8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fadst8x4_new_sse2, // FLIPADST_DCT
+ fdct8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fdct8x4_new_sse2, // V_DCT
+ fidentity8x4_new_sse2, // H_DCT
+ fadst8x4_new_sse2, // V_ADST
+ fidentity8x4_new_sse2, // H_ADST
+ fadst8x4_new_sse2, // V_FLIPADST
+ fidentity8x4_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fdct4x8_new_sse2, // ADST_DCT
+ fadst4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fdct4x8_new_sse2, // FLIPADST_DCT
+ fadst4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct4x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst4x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst4x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2 // H_FLIPADST

+};
+
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fadst8x16_new_sse2, // ADST_DCT
+ fdct8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fadst8x16_new_sse2, // FLIPADST_DCT
+ fdct8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fdct8x16_new_sse2, // V_DCT
+ fidentity8x16_new_sse2, // H_DCT
+ fadst8x16_new_sse2, // V_ADST
+ fidentity8x16_new_sse2, // H_ADST
+ fadst8x16_new_sse2, // V_FLIPADST
+ fidentity8x16_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fdct8x16_new_sse2, // ADST_DCT
+ fadst8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fdct8x16_new_sse2, // FLIPADST_DCT
+ fadst8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fidentity8x16_new_sse2, // V_DCT
+ fdct8x16_new_sse2, // H_DCT
+ fidentity8x16_new_sse2, // V_ADST
+ fadst8x16_new_sse2, // H_ADST
+ fidentity8x16_new_sse2, // V_FLIPADST
+ fadst8x16_new_sse2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fidentity8x32_new_sse2, // V_DCT
+ fdct8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
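+// Each 2-D wrapper below follows the same pattern: load the residual block
+// (flipped vertically for FLIPADST column types), pre-shift, run the column
+// kernel, shift, transpose, optionally flip horizontally, run the row kernel,
+// shift again, then transpose back and store the 32-bit output.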
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[4], buf1[4], *buf;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x4(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_4x4(buf, buf);
+ store_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)stride;
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_4X8];
+ const int txw_idx = get_txw_idx(TX_4X8);
+ const int txh_idx = get_txh_idx(TX_4X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x4(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_4X16];
+ const int txw_idx = get_txw_idx(TX_4X16);
+ const int txh_idx = get_txh_idx(TX_4X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 4;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_4x8(buf0, buf1);
+ transpose_16bit_4x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + 8 * i, buf, width);
+ } else {
+ buf = buf1 + 8 * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x4(buf, buf);
+ store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X4];
+ const int txw_idx = get_txw_idx(TX_8X4);
+ const int txh_idx = get_txh_idx(TX_8X4);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[8], buf1[8], *buf;
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X32];
+ const int txw_idx = get_txw_idx(TX_8X32);
+ const int txh_idx = get_txh_idx(TX_8X32);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+ transpose_16bit_8x8(buf0 + 16, buf1 + 16);
+ transpose_16bit_8x8(buf0 + 24, buf1 + 24);
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X4];
+ const int txw_idx = get_txw_idx(TX_16X4);
+ const int txh_idx = get_txh_idx(TX_16X4);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 4;
+ const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x4(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_4x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output, width, height);
+ transpose_16bit_4x8(buf + 8, buf + 8);
+ store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 8 * i);
+ }
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+ 8);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X32];
+ const int txw_idx = get_txw_idx(TX_16X32);
+ const int txh_idx = get_txh_idx(TX_16X32);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type];
+
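+ // 32-point kernels are only implemented for DCT and identity, so the
+ // remaining tx types take the C fallback.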
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+ 8);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+ width, 8);
+ }
+ } else {
+ av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
+ const int txw_idx = get_txw_idx(TX_32X8);
+ const int txh_idx = get_txh_idx(TX_32X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 8;
+ const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 1; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+ height);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+ height);
+ transpose_16bit_8x8(buf + 16, buf + 16);
+ store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+ width, height);
+ transpose_16bit_8x8(buf + 24, buf + 24);
+ store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+ width, height);
+ }
+ } else {
+ av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width,
+ 8);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8,
+ width, 8);
+ transpose_16bit_8x8(buf + 16, buf + 16);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+ width, 8);
+ transpose_16bit_8x8(buf + 24, buf + 24);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+ width, 8);
+ }
+ } else {
+ av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[32], buf1[128];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X32];
+ const int txw_idx = get_txw_idx(TX_32X32);
+ const int txh_idx = get_txh_idx(TX_32X32);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 32;
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+ if (col_txfm != NULL && row_txfm != NULL) {
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 4; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i);
+ transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i);
+ }
+
+ for (int i = 0; i < 4; i++) {
+ __m128i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1 + width * i, buf, width);
+ } else {
+ buf = buf1 + width * i;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width,
+ 8);
+ transpose_16bit_8x8(buf + 16, buf + 16);
+ store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16,
+ width, 8);
+ transpose_16bit_8x8(buf + 24, buf + 24);
+ store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24,
+ width, 8);
+ }
+ } else {
+ av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m128i buf0[64], buf1[128];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x16_new_sse2;
+ const transform_1d_sse2 row_txfm = fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < height_div8; ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < height_div8; i++) {
+ __m128i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
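+ // Only the first 32 of the 64 row outputs are stored; AV1 keeps just 32
+ // coefficients along a 64-point dimension.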
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m128i *buf8 = buf + 8 * j;
+ transpose_16bit_8x8(buf8, buf8);
+ store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8);
+ }
+ }
+}
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m128i buf0[64], buf1[128];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+ const transform_1d_sse2 row_txfm = fdct8x16_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < height_div8; ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ int32_t *output8 = output + 8 * width * i;
+ for (int j = 0; j < width_div8; ++j) {
+ __m128i *buf8 = buf + 8 * j;
+ transpose_16bit_8x8(buf8, buf8);
+ store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8);
+ }
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
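+// TX_SIZE-indexed dispatch table; NULL entries have no lowbd SSE2
+// implementation and fall back to the C path.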
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ NULL, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+
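+ // The lossless 4x4 case uses the Walsh-Hadamard transform, which this SSE2
+ // path does not implement, so it also goes through the C fallback.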
+ if ((fwd_txfm2d_func == NULL) ||
+ (txfm_param->lossless && txfm_param->tx_size == TX_4X4))
+ av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+ else
+ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+ txfm_param->bd);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
new file mode 100644
index 000000000..99a6b9082
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+
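+// Identity (IDTX) kernels: the N-point identity scales by sqrt(2), 2,
+// 2*sqrt(2) or 4 for N = 4, 8, 16, 32 respectively; the irrational factors
+// use the fixed-point constant NewSqrt2 with rounding.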
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a = _mm_unpacklo_epi16(input[i], one);
+ const __m128i b = scale_round_sse2(a, NewSqrt2);
+ output[i] = _mm_packs_epi32(b, b);
+ }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+ __m128i *const output,
+ const int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ output[0] = _mm_adds_epi16(input[0], input[0]);
+ output[1] = _mm_adds_epi16(input[1], input[1]);
+ output[2] = _mm_adds_epi16(input[2], input[2]);
+ output[3] = _mm_adds_epi16(input[3], input[3]);
+ output[4] = _mm_adds_epi16(input[4], input[4]);
+ output[5] = _mm_adds_epi16(input[5], input[5]);
+ output[6] = _mm_adds_epi16(input[6], input[6]);
+ output[7] = _mm_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2);
+ const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2);
+ output[i] = _mm_packs_epi32(b_lo, b_hi);
+ }
+}
+
+static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; ++i) {
+ output[i] = _mm_slli_epi16(input[i], 2);
+ }
+}
+
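+// Shared 32-point column kernel table; only DCT and identity are available.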
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fdct8x32_new_sse2, // V_DCT
+ fidentity8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 000000000..b58911fcb
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+ const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+ qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+ qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+ qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ if (log_scale) {
+ const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
+ round = _mm_mulhrs_epi16(round, round_scale);
+ }
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+ init_one_qp(&dequant, &qp[2]);
+}
+
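+// Quantize 8 coefficients: q = (|c| + round) * quant >> (16 - log_scale) and
+// dq = q * dequant >> log_scale; values inside the dequant dead zone are
+// zeroed, and the scan positions of nonzero outputs update the running eob.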
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, int log_scale,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi32(*c);
+ __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
+
+ __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+ __m256i q_hi = _mm256_srli_epi64(q, 32);
+ const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+ q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+ q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+ q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+ q_hi = _mm256_slli_epi64(q_hi, 32);
+ q = _mm256_or_si256(q_lo, q_hi);
+ const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+ const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+ q = _mm256_andnot_si256(mask, q);
+
+ __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+ dq = _mm256_srai_epi32(dq, log_scale);
+ q = _mm256_sign_epi32(q, *c);
+ dq = _mm256_sign_epi32(dq, *c);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+ const __m128i zr = _mm_setzero_si128();
+ const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+ const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+ const __m256i iscan =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+ const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+ __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+ cur_eob = _mm256_and_si256(cur_eob, nz);
+ *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ (void)scan;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 8;
+ __m256i qp[3], coeff;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+ while (n_coeffs > 0) {
+ coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
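+ // Horizontally reduce the per-lane eob values to a single scalar.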
+ {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+ _mm256_extractf128_si256(eob, 1));
+ *eob_ptr = _mm_extract_epi16(final_eob, 0);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 000000000..40b3b460b
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0-2] : rounding/quan/dequan constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+ const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+ qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+ qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+ dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
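+// For every nonzero quantized coefficient, take iscan + 1 and keep the
+// running maximum as the end-of-block position.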
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+ __m128i *eob) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, iscanIdx;
+ const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+ const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+ __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+ __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+ nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+ nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+ mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+ iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+ iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+ iscanIdx = _mm_and_si128(iscanIdx, mask);
+ *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+
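+// Collapse the per-lane eob values to their overall maximum.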
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+ __m128i eob_shuffled;
+ uint16_t eobValue;
+ eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eobValue = _mm_extract_epi16(*eob, 0);
+ return eobValue;
+}
+
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4;
+ const int quan_stride = coeff_stride;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+
+ qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
+ qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
+ qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+ dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
+ qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
+ qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr + quan_stride, dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 000000000..df22aaba7
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
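+// Load 16 coefficients; when tran_low_t is 32-bit they are packed down to
+// int16, with the lane order restored by the 64-bit permute.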
+static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
+ if (sizeof(tran_low_t) == 4) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+ *c = _mm256_packs_epi32(x0, x1);
+ *c = _mm256_permute4x64_epi64(*c, 0xD8);
+ } else {
+ *c = _mm256_loadu_si256((const __m256i *)coeff);
+ }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+ const __m256i zero = _mm256_setzero_si256();
+ if (sizeof(tran_low_t) == 4) {
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+ } else {
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ }
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *thr, __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ if (log_scale > 0) {
+ const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+ round = _mm_add_epi16(round, rnd);
+ round = _mm_srai_epi16(round, log_scale);
+ }
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+
+ if (log_scale == 1) {
+ qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+ }
+
+ init_one_qp(&dequant, &qp[2]);
+ *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
+ qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+ qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+ qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+ *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
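+// store_quan sign-extends 16 quantized int16 values to 32-bit tran_low_t and
+// stores them as two 256-bit writes.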
+#define store_quan(q, addr) \
+ do { \
+ __m256i sign_bits = _mm256_srai_epi16(q, 15); \
+ __m256i y0 = _mm256_unpacklo_epi16(q, sign_bits); \
+ __m256i y1 = _mm256_unpackhi_epi16(q, sign_bits); \
+ __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20); \
+ __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31); \
+ _mm256_storeu_si256((__m256i *)addr, x0); \
+ _mm256_storeu_si256((__m256i *)addr + 1, x1); \
+ } while (0)
+
+#define store_two_quan(q, addr1, dq, addr2) \
+ do { \
+ if (sizeof(tran_low_t) == 4) { \
+ store_quan(q, addr1); \
+ store_quan(dq, addr2); \
+ } else { \
+ _mm256_storeu_si256((__m256i *)addr1, q); \
+ _mm256_storeu_si256((__m256i *)addr2, dq); \
+ } \
+ } while (0)
+
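+// Horizontal maximum of the eob lanes via the _mm_minpos_epu16 trick:
+// max(x) == INT16_MAX - minpos(INT16_MAX - x).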
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+ __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+ eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+ eob_s = _mm_minpos_epu16(eob_s);
+ return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
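+// Quantize 16 coefficients at a time; if every |coeff| is below thr
+// (dequant / 2 for log_scale 0), the whole group is written as zero.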
+static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
+ q = _mm256_mulhi_epi16(q, qp[1]);
+ q = _mm256_sign_epi16(q, *c);
+ const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 16;
+
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 0;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
+ quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
+
+static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
+ __m256i *c, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
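+ // For 32x32 (log_scale = 1) init_qp() pre-doubled the quant factor, so the
+ // mulhi below is effectively ((abs + round) * quant) >> 15, and the
+ // dequantized value is (q * dequant) >> 1 for this block size.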
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
+ q = _mm256_mulhi_epu16(q, qp[1]);
+
+ __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+ dq = _mm256_srli_epi16(dq, 1);
+
+ q = _mm256_sign_epi16(q, *c);
+ dq = _mm256_sign_epi16(dq, *c);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+void av1_quantize_fp_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 16;
+
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 1;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
+ quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
+
+static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
+ __m256i *c, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
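+ // For 64x64 (log_scale = 2) the quantized value is ((abs + round) * quant)
+ // >> 14, which a single mulhi cannot produce, so the high and low 16-bit
+ // halves of the 32-bit product are recombined below; dq is likewise
+ // (q * dequant) >> 2, rebuilt from the two halves of its product.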
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
+ __m256i qh = _mm256_mulhi_epi16(q, qp[1]);
+ __m256i ql = _mm256_mullo_epi16(q, qp[1]);
+ qh = _mm256_slli_epi16(qh, 2);
+ ql = _mm256_srli_epi16(ql, 14);
+ q = _mm256_or_si256(qh, ql);
+ const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14);
+ const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2);
+ __m256i dq = _mm256_or_si256(dqh, dql);
+
+ q = _mm256_sign_epi16(q, *c);
+ dq = _mm256_sign_epi16(dq, *c);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+void av1_quantize_fp_64x64_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ const unsigned int step = 16;
+
+ __m256i qp[3];
+ __m256i coeff, thr;
+ const int log_scale = 2;
+
+ init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+ read_coeff(coeff_ptr, &coeff);
+
+ __m256i eob = _mm256_setzero_si256();
+ quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+
+ update_qp(log_scale, &thr, qp);
+
+ while (n_coeffs > 0) {
+ read_coeff(coeff_ptr, &coeff);
+ quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan_ptr += step;
+ n_coeffs -= step;
+ }
+ *eob_ptr = quant_gather_eob(eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 000000000..b07e7717f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+ __m128i *c0, __m128i *c1) {
+ const tran_low_t *addr = coeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+ const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+ const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+ const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+ *c0 = _mm_packs_epi32(x0, x1);
+ *c1 = _mm_packs_epi32(x2, x3);
+ } else {
+ *c0 = _mm_load_si128((const __m128i *)addr);
+ *c1 = _mm_load_si128((const __m128i *)addr + 1);
+ }
+}
+
+static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
+ tran_low_t *qcoeff, intptr_t offset) {
+ tran_low_t *addr = qcoeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+ __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+ __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+ _mm_store_si128((__m128i *)addr, y0);
+ _mm_store_si128((__m128i *)addr + 1, y1);
+
+ sign_bits = _mm_cmplt_epi16(*qc1, zero);
+ y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+ y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+ _mm_store_si128((__m128i *)addr + 2, y0);
+ _mm_store_si128((__m128i *)addr + 3, y1);
+ } else {
+ _mm_store_si128((__m128i *)addr, *qc0);
+ _mm_store_si128((__m128i *)addr + 1, *qc1);
+ }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
+ const __m128i zero = _mm_setzero_si128();
+ tran_low_t *addr = qcoeff + offset;
+ if (sizeof(tran_low_t) == 4) {
+ _mm_store_si128((__m128i *)addr, zero);
+ _mm_store_si128((__m128i *)addr + 1, zero);
+ _mm_store_si128((__m128i *)addr + 2, zero);
+ _mm_store_si128((__m128i *)addr + 3, zero);
+ } else {
+ _mm_store_si128((__m128i *)addr, zero);
+ _mm_store_si128((__m128i *)addr + 1, zero);
+ }
+}
+
+static INLINE void quantize(const int16_t *iscan_ptr,
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const __m128i *round0, const __m128i *round1,
+ const __m128i *quant0, const __m128i *quant1,
+ const __m128i *dequant0, const __m128i *dequant1,
+ const __m128i *thr0, const __m128i *thr1,
+ __m128i *eob) {
+ __m128i coeff0, coeff1;
+ // Do DC and first 15 AC
+ read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+ // Poor man's sign extract
+ const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
+ _mm_cmpeq_epi16(qcoeff0, *thr0));
+ const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
+ _mm_cmpeq_epi16(qcoeff1, *thr1));
+ const int16_t nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+ const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+ const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+ coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+ write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+
+ const __m128i zero = _mm_setzero_si128();
+ // Scan for eob
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ const __m128i iscan0 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+ const __m128i iscan1 =
+ _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+ const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+ const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+ const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+ const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+ *eob = _mm_max_epi16(*eob, eob2);
+ } else {
+ write_zero(qcoeff_ptr, n_coeffs);
+ write_zero(dqcoeff_ptr, n_coeffs);
+ }
+}
+
+void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
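+ // Advance the pointers to the end of each buffer and index them with a
+ // negative count that steps up toward zero, so the loop test is a simple
+ // sign check on n_coeffs.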
+ coeff_ptr += n_coeffs;
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+ const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+ const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+ const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+ const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+ const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+ const __m128i thr0 = _mm_srai_epi16(dequant0, 1);
+ const __m128i thr1 = _mm_srai_epi16(dequant1, 1);
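+ // thr0/thr1 are half a quantization step: coefficients below them quantize
+ // to zero, so quantize() can skip whole groups of 16 without the multiplies.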
+ __m128i eob = _mm_setzero_si128();
+
+ quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+ &round1, &quant0, &quant1, &dequant0, &dequant1, &thr0, &thr1, &eob);
+
+ n_coeffs += 8 * 2;
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+ &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1,
+ &eob);
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
new file mode 100644
index 000000000..ad4ae274e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
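+ ; the 32x32 variant halves the rounding term (rounding up) and doubles the
+ ; quant factor, giving the >>15 scaling used for that block size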
+%ifidn %1, fp_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m1, m5
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [r2q] ; m3 = dequant
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, fp_32x32
+ psllw m2, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea r5q, [ r5q+ncoeffq*2]
+ lea r3q, [ r3q+ncoeffq*2]
+ lea r4q, [r4q+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m8
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; r4[i] = r3[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+ psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
+%endif
+ mova [r4q+ncoeffq*2+ 0], m8
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+
+ pcmpgtw m7, m6, m0
+ pcmpgtw m12, m11, m0
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+
+ or r6, r2
+ jz .skip_iter
+
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m14
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; r4[i] = r3[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+ mova [r4q+ncoeffq*2+ 0], m14
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+ jmp .accumulate_eob
+.skip_iter:
+ mova [r3q+ncoeffq*2+ 0], m5
+ mova [r3q+ncoeffq*2+16], m5
+ mova [r4q+ncoeffq*2+ 0], m5
+ mova [r4q+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+
+ lea r0q, [r0q+ncoeffq*2]
+ lea r2q, [r2q+ncoeffq*2]
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+ mova [r0q+ncoeffq*2+ 0], m7
+ mova [r0q+ncoeffq*2+16], m7
+ mova [r2q+ncoeffq*2+ 0], m7
+ mova [r2q+ncoeffq*2+16], m7
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [r3q], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
new file mode 100644
index 000000000..faa2a232a
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void av1_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; the pxors are probably not
+; needed (the calling app will initialize to 0). Everything could fit in SSE2
+; without too much hassle, and better estimates are probably possible with
+; psadbw or pavgb. At this point this is just meant to be a first pass at
+; calculating all the parms needed for 16x16 SSIM so we can play with DSSIM
+; as distortion in the mode selection code.
+global sym(av1_ssim_parms_16x16_sse2) PRIVATE
+sym(av1_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void av1_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; the pxors are probably not
+; needed (the calling app will initialize to 0). Everything could fit in SSE2
+; without too much hassle, and better estimates are probably possible with
+; psadbw or pavgb. At this point this is just meant to be a first pass at
+; calculating all the parms needed for 8x8 SSIM so we can play with DSSIM as
+; distortion in the mode selection code.
+global sym(av1_ssim_parms_8x8_sse2) PRIVATE
+sym(av1_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
new file mode 100644
index 000000000..6df2a8bdb
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit, const int instride,
+ const int outstride);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+ __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented as a grid of 4x4 blocks, and
+// each 4x4 block can be represented by 4 vertical __m128i registers.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;
+ int r, c;
+
+ // transpose each 4x4 block internally
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],
+ &output[c * 4 * col_size + r / 4]);
+ }
+ }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m128i ww0 = _mm_set1_epi32(w0); \
+ const __m128i ww1 = _mm_set1_epi32(w1); \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = av1_round_shift_32_sse4_1(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = av1_round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm_add_epi32(out0, r); \
+ out0 = _mm_srai_epi32(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm_add_epi32(out1, r); \
+ out1 = _mm_srai_epi32(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \
+ } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
new file mode 100644
index 000000000..93f37b71d
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/encoder/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = {
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0
+};
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+ correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+ of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1,
+ int y1, unsigned char *im2, int stride2,
+ int x2, int y2) {
+ int i;
+ // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0,
+ // 2)
+ __m128i sum1_vec = _mm_setzero_si128();
+ __m128i sum2_vec = _mm_setzero_si128();
+ // 4 32-bit partial sums of squares
+ __m128i sumsq2_vec = _mm_setzero_si128();
+ __m128i cross_vec = _mm_setzero_si128();
+
+ const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+ const __m128i zero = _mm_setzero_si128();
+
+ im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+ im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+ for (i = 0; i < MATCH_SZ; ++i) {
+ const __m128i v1 =
+ _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[i * stride1]), mask);
+ const __m128i v2 =
+ _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[i * stride2]), mask);
+
+ // Using the 'sad' intrinsic here is a bit faster than adding
+ // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit
+ // conversion step later, for a net speedup of ~10%
+ sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero));
+ sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero));
+
+ const __m128i v1_l = _mm_cvtepu8_epi16(v1);
+ const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8));
+ const __m128i v2_l = _mm_cvtepu8_epi16(v2);
+ const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8));
+
+ sumsq2_vec = _mm_add_epi32(
+ sumsq2_vec,
+ _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r)));
+ cross_vec = _mm_add_epi32(
+ cross_vec,
+ _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r)));
+ }
+
+ // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec,
+ // cross_vec)
+ // as holding 4 32-bit elements each, which we want to sum horizontally.
+ // We do this by transposing and then summing vertically.
+ __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec);
+ __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec);
+ __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec);
+ __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec);
+
+ __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2);
+ __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2);
+ __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3);
+ __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3);
+
+ __m128i res =
+ _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7));
+
+ int sum1 = _mm_extract_epi32(res, 0);
+ int sum2 = _mm_extract_epi32(res, 1);
+ int sumsq2 = _mm_extract_epi32(res, 2);
+ int cross = _mm_extract_epi32(res, 3);
+
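+ // var2 and cov are the variance of im2 and the covariance of (im1, im2),
+ // each scaled by MATCH_SZ_SQ * MATCH_SZ_SQ relative to the per-pixel values.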
+ int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+ int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+ return cov / sqrt((double)var2);
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
new file mode 100644
index 000000000..b18554818
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -0,0 +1,82 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
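+ ; a single column (butterfly) pass of the 4-point Walsh-Hadamard transform
+ ; over the four row registers m0..m3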
+ paddw m0, m1
+ movq m4, m0
+ psubw m3, m2
+ psubw m4, m3
+ psraw m4, 1
+ movq m5, m4
+ psubw m5, m1 ;b1
+ psubw m4, m2 ;c1
+ psubw m0, m4
+ paddw m3, m5
+ ; m0 a0
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+ lea r3q, [inputq + strideq*4]
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ SWAP 1, 2
+ psrldq m1, m0, 8
+ psrldq m3, m2, 8
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+
+ psllw m0, 2
+ psllw m1, 2
+
+ ; sign extension
+ mova m2, m0
+ mova m3, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ punpckhwd m2, m2
+ punpckhwd m3, m3
+ psrad m0, 16
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ mova [outputq], m0
+ mova [outputq + 16], m2
+ mova [outputq + 32], m1
+ mova [outputq + 48], m3
+
+ RET
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 000000000..7642f57d1
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+#include <immintrin.h> /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = width + TX_PAD_HOR;
+ const __m256i y_zeros = _mm256_setzero_si256();
+
+ const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+ uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+ uint8_t *pre_buf_end = pre_buf + pre_len;
+ do {
+ yy_storeu_256(pre_buf, y_zeros);
+ pre_buf += 32;
+ } while (pre_buf < pre_buf_end);
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
+ uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
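+ // bottom_len is rounded up to a multiple of 32 so the 32-byte stores below
+ // end exactly at the padded buffer end; any extra bytes at the start fall in
+ // the last level rows, which are written afterwards.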
+
+ do {
+ yy_storeu_256(bottom_buf, y_zeros);
+ bottom_buf += 32;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (width == 4) {
+ do {
+ const __m256i c0 = yy_loadu_256(cf);
+ const __m256i c1 = yy_loadu_256(cf + 8);
+ const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+ const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+ const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+ const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ ls += 32;
+ cf += 16;
+ i += 4;
+ } while (i < height);
+ } else if (width == 8) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ const __m128i res0 = _mm256_castsi256_si128(res);
+ const __m128i res1 = _mm256_extracti128_si256(res, 1);
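+ // Each 64-bit store below writes one 8-wide row of levels, and the int32_t
+ // stores clear the TX_PAD_HOR padding bytes to the right of that row.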
+ xx_storel_64(ls, res0);
+ *(int32_t *)(ls + width) = 0;
+ xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+ *(int32_t *)(ls + width + stride) = 0;
+ xx_storel_64(ls + stride * 2, res1);
+ *(int32_t *)(ls + width + stride * 2) = 0;
+ xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+ *(int32_t *)(ls + width + stride * 3) = 0;
+ cf += 32;
+ ls += stride << 2;
+ i += 4;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ xx_storeu_128(ls, _mm256_castsi256_si128(res));
+ xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+ cf += 32;
+ *(int32_t *)(ls + width) = 0;
+ *(int32_t *)(ls + stride + width) = 0;
+ ls += stride << 1;
+ i += 2;
+ } while (i < height);
+ } else {
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ cf += 32;
+ *(int32_t *)(ls + width) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < height);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 000000000..dedb4d02f
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+ level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+ level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+ const int stride,
+ const ptrdiff_t *const offsets,
+ __m128i *const level) {
+ level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+ level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+ level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+ level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+ level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+ const __m128i const_3 = _mm_set1_epi8(3);
+ const __m128i const_4 = _mm_set1_epi8(4);
+ __m128i count;
+
+ count = _mm_min_epu8(level[0], const_3);
+ level[1] = _mm_min_epu8(level[1], const_3);
+ level[2] = _mm_min_epu8(level[2], const_3);
+ level[3] = _mm_min_epu8(level[3], const_3);
+ level[4] = _mm_min_epu8(level[4], const_3);
+ count = _mm_add_epi8(count, level[1]);
+ count = _mm_add_epi8(count, level[2]);
+ count = _mm_add_epi8(count, level[3]);
+ count = _mm_add_epi8(count, level[4]);
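+ // Averaging with zero computes (count + 1) >> 1; together with the clamp to
+ // 4 below this mirrors the scalar context derivation.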
+ count = _mm_avg_epu8(count, _mm_setzero_si128());
+ count = _mm_min_epu8(count, const_4);
+ return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *const coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(21);
+ __m128i pos_to_offset =
+ (height == 4)
+ ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
+ : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21,
+ 21, 21);
+ __m128i count;
+ __m128i level[5];
+ int8_t *cc = coeff_contexts;
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ cc += 16;
+ row -= 4;
+ } while (row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ row -= 4;
+ } while (row);
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ row -= 4;
+ } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+ __m128i pos_to_offset[3];
+
+ assert(!(height % 2));
+
+ if (height == 8) {
+ pos_to_offset[0] =
+ _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ } else if (height < 8) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21);
+ } else {
+ pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ }
+ pos_to_offset[2] = _mm_set1_epi8(21);
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += 2 * stride;
+ cc += 16;
+ row -= 2;
+ } while (row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ row -= 2;
+ } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ row -= 2;
+ } while (row);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int row = height;
+ __m128i pos_to_offset[5];
+ __m128i pos_to_offset_large[3];
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(width % 16));
+
+ pos_to_offset_large[2] = _mm_set1_epi8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width > real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width < real_height
+ pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
+ }
+
+ do {
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ levels += 16;
+ cc += 16;
+ w -= 16;
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (w);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ levels += TX_PAD_HOR;
+ } while (--row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ const __m128i pos_to_offset_large =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(width % 16));
+
+ do {
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 16;
+ coeff_contexts += 16;
+ w -= 16;
+ } while (w);
+
+ levels += TX_PAD_HOR;
+ } while (--row);
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ __m128i pos_to_offset[3];
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(width % 16));
+
+ pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+ pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+ pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+ do {
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 16;
+ coeff_contexts += 16;
+ w -= 16;
+ } while (w);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += TX_PAD_HOR;
+ } while (--row);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int last_idx = eob - 1;
+ if (!last_idx) {
+ coeff_contexts[0] = 0;
+ return;
+ }
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = width + TX_PAD_HOR;
+ ptrdiff_t offsets[3];
+
+ /* coeff_contexts must be 16-byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;
+ offsets[1] = 1 * stride + 1;
+ offsets[2] = 2 * stride + 0;
+
+ if (width == 4) {
+ get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
+ } else {
+ // width >= 16: the same 16n kernel covers all remaining widths.
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coeff_contexts);
+ }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2;
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (width == 4) {
+ get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2 * stride;
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (width == 4) {
+ get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ }
+
+ const int bwl = get_txb_bwl(tx_size);
+ const int pos = scan[last_idx];
+ if (last_idx <= (height << bwl) / 8)
+ coeff_contexts[pos] = 1;
+ else if (last_idx <= (height << bwl) / 4)
+ coeff_contexts[pos] = 2;
+ else
+ coeff_contexts[pos] = 3;
+}
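The tail above gives the last nonzero coefficient its own context based on how deep it sits in the scan: taking (height << bwl) as the coefficient count of the (possibly clamped) transform block, a last scan index within the first eighth of that count maps to context 1, within the first quarter to context 2, and anything later to context 3. For a 16x16 transform, for example, those thresholds work out to scan indices 32 and 64 out of 256.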
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
new file mode 100644
index 000000000..5e0687cd3
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = width + TX_PAD_HOR;
+ const __m128i zeros = _mm_setzero_si128();
+
+ const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+ uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+ uint8_t *pre_buf_end = pre_buf + pre_len;
+ do {
+ _mm_storeu_si128((__m128i *)(pre_buf), zeros);
+ pre_buf += 16;
+ } while (pre_buf < pre_buf_end);
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf = levels + stride * height;
+ uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+ do {
+ _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+ bottom_buf += 16;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (width == 4) {
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+ xx_storeu_128(ls, lsAB);
+ ls += (stride << 1);
+ cf += (width << 1);
+ i += 2;
+ } while (i < height);
+ } else if (width == 8) {
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ xx_storeu_128(ls, absAB8);
+ ls += stride;
+ cf += width;
+ i += 1;
+ } while (i < height);
+ } else {
+ do {
+ int j = 0;
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffC = xx_loadu_128(cf + 8);
+ const __m128i coeffD = xx_loadu_128(cf + 12);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absCD = _mm_abs_epi16(coeffCD);
+ const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+ xx_storeu_128(ls + j, absABCD);
+ j += 16;
+ cf += 16;
+ } while (j < width);
+ *(int32_t *)(ls + width) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < height);
+ }
+}
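As a reading aid for the three width paths above, here is a minimal scalar sketch of the mapping they implement (an approximation inferred from the abs/packs sequence, not a drop-in replacement; the top, bottom and right padding regions are zeroed separately, as in the prologue and the *(int32_t *)(ls + width) = 0 store above):

    const int stride = width + TX_PAD_HOR;
    for (int r = 0; r < height; r++) {
      for (int c = 0; c < width; c++) {
        const int a = coeff[r * width + c];
        const int v = a < 0 ? -a : a;
        /* _mm_packs_epi16 saturates to int8, hence the clamp to 127. */
        levels[r * stride + c] = (uint8_t)(v > 127 ? 127 : v);
      }
    }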
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 000000000..7d4f69585
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+ __m256i *c) {
+ const tran_low_t *addr = coeff + offset;
+
+ if (sizeof(tran_low_t) == 4) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+ const __m256i y = _mm256_packs_epi32(x0, x1);
+ *c = _mm256_permute4x64_epi64(y, 0xD8);
+ } else {
+ *c = _mm256_loadu_si256((const __m256i *)addr);
+ }
+}
+
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+ __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+ __m256i sse_reg_64hi, ssz_reg_64hi;
+ __m128i sse_reg128, ssz_reg128;
+ int64_t sse;
+ int i;
+ const __m256i zero_reg = _mm256_setzero_si256();
+
+ // init the sse and ssz accumulators to zero
+ sse_reg = _mm256_setzero_si256();
+ ssz_reg = _mm256_setzero_si256();
+
+ for (i = 0; i < block_size; i += 16) {
+ // load 32 bytes from coeff and dqcoeff
+ read_coeff(coeff, i, &coeff_reg);
+ read_coeff(dqcoeff, i, &dqcoeff_reg);
+ // dqcoeff - coeff
+ dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+ // madd (dqcoeff - coeff)
+ dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+ // madd coeff
+ coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+ // expand each double word of madd (dqcoeff - coeff) to quad word
+ exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+ exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+ // expand each double word of madd (coeff) to quad word
+ exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+ exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+ // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+ sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+ }
+ // save the upper 64 bits of each 128-bit lane
+ sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+ ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+ // add the upper 64 bits to the lower 64 bits
+ sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+ ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+ // add the 64-bit partial sums from the two 128-bit lanes of each 256-bit register
+ sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+ _mm256_extractf128_si256(sse_reg, 1));
+
+ ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+ _mm256_extractf128_si256(ssz_reg, 1));
+
+ // store the results
+ _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
+
+ _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
+ _mm256_zeroupper();
+ return sse;
+}
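In scalar terms, the loop above accumulates two sums per block: the squared quantization error (sse) and the squared source energy returned through ssz; read_coeff merely packs the 32-bit tran_low_t values down to 16 bits before the multiplies. A minimal sketch of the same computation, useful for checking the intrinsics:

    int64_t sse = 0, sqc = 0;
    for (intptr_t i = 0; i < block_size; i++) {
      const int64_t d = (int64_t)dqcoeff[i] - coeff[i]; /* dqcoeff - coeff */
      sse += d * d;                                     /* -> sse_reg      */
      sqc += (int64_t)coeff[i] * coeff[i];              /* -> ssz_reg      */
    }
    *ssz = sqc;
    return sse;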
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
new file mode 100644
index 000000000..72e9e22b1
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -0,0 +1,79 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are at most 15 bits + sign, so squares are 30 bits, and
+ ; thus the sum of two should fit in a 31-bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
+ paddq m4, m1
+ punpckhdq m2, m5
+ paddq m6, m7
+ punpckldq m7, m3, m5
+ paddq m6, m2
+ punpckhdq m3, m5
+ paddq m6, m7
+ paddq m6, m3
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ movhlps m7, m6
+ paddq m4, m5
+ paddq m6, m7
+%if ARCH_X86_64
+ movq rax, m4
+ movq [sszq], m6
+%else
+ mov eax, sszm
+ pshufd m5, m4, 0x1
+ movq [eax], m6
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 000000000..65fa46311
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+/**
+ * Calculates the 32-bit CRC-32C of the input buffer, using the
+ * polynomial 0x11EDC6F41.
+ * @return A 32-bit unsigned integer representing the CRC.
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+ size_t len) {
+ (void)crc_calculator;
+ const uint8_t *buf = p;
+ uint32_t crc = 0xFFFFFFFF;
+
+ // Align the input to the word boundary
+ for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+ crc = _mm_crc32_u8(crc, *buf);
+ }
+
+#ifdef __x86_64__
+ uint64_t crc64 = crc;
+ CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+ crc = (uint32_t)crc64;
+#endif
+ CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+ CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+ CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+ return (crc ^= 0xFFFFFFFF);
+}
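The hardware path above computes CRC-32C (Castagnoli), which is what the _mm_crc32_* instructions implement. A minimal bitwise sketch of the same checksum, assuming the standard reflected form of the polynomial (0x82F63B78) and relying on the headers already included above; since the function ignores crc_calculator, av1_get_crc32c_value_sse4_2(NULL, p, len) should agree with this reference:

    static uint32_t crc32c_ref(const uint8_t *buf, size_t len) {
      uint32_t crc = 0xFFFFFFFF;
      for (size_t i = 0; i < len; i++) {
        crc ^= buf[i];
        for (int b = 0; b < 8; b++)
          crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1)));
      }
      return crc ^ 0xFFFFFFFF;
    }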
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 000000000..777304ace
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "av1/common/common.h"
+
+int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i, j, test;
+ uint32_t temp[4];
+ __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i += 8) {
+ // Load the data into xmm registers
+ __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
+ // Check if any values require more than 15 bits
+ max = _mm_set1_epi32(0x3fff);
+ min = _mm_set1_epi32(0xffffc000);
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+ _mm_cmplt_epi32(mm_coeff, min));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+ _mm_cmplt_epi32(mm_coeff2, min));
+ cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+ _mm_cmplt_epi32(mm_dqcoeff, min));
+ cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(
+ _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
+
+ if (!test) {
+ __m128i mm_diff, error_sse2, sqcoeff_sse2;
+ mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+ mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+ mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+ error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+ sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+ _mm_storeu_si128((__m128i *)temp, error_sse2);
+ error = error + temp[0] + temp[1] + temp[2] + temp[3];
+ _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
+ sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+ } else {
+ for (j = 0; j < 8; j++) {
+ const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+ }
+ }
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
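Two points help when reading the SSE2 routine above: the vector path is taken only when every loaded value fits in 15 bits (the movemask test); otherwise the scalar fallback handles that group of eight. The final rescaling then normalizes the high-bit-depth sums back to 8-bit scale via shift = 2 * (bps - 8): 8-bit input is left untouched, 10-bit input is rounded and divided by 16, and 12-bit input by 256.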
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 000000000..535485ae8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,1783 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ }
+
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[2]);
+ in[3] = _mm_cvtepi16_epi32(in[3]);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+}
+
+// We only use stage-2 bit;
+// shift[0] is used in load_buffer_4x4()
+// shift[1] is used in txfm_func_col()
+// shift[2] is used in txfm_func_row()
+static void fdct4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i s0, s1, s2, s3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ s0 = _mm_add_epi32(in[0], in[3]);
+ s1 = _mm_add_epi32(in[1], in[2]);
+ s2 = _mm_sub_epi32(in[1], in[2]);
+ s3 = _mm_sub_epi32(in[0], in[3]);
+
+ // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+ u0 = _mm_mullo_epi32(s0, cospi32);
+ u1 = _mm_mullo_epi32(s1, cospi32);
+ u2 = _mm_add_epi32(u0, u1);
+ v0 = _mm_sub_epi32(u0, u1);
+
+ u3 = _mm_add_epi32(u2, rnding);
+ v1 = _mm_add_epi32(v0, rnding);
+
+ u0 = _mm_srai_epi32(u3, bit);
+ u2 = _mm_srai_epi32(v1, bit);
+
+ // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+ v0 = _mm_mullo_epi32(s2, cospi48);
+ v1 = _mm_mullo_epi32(s3, cospi16);
+ v2 = _mm_add_epi32(v0, v1);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u1 = _mm_srai_epi32(v3, bit);
+
+ v0 = _mm_mullo_epi32(s2, cospi16);
+ v1 = _mm_mullo_epi32(s3, cospi48);
+ v2 = _mm_sub_epi32(v1, v0);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u3 = _mm_srai_epi32(v3, bit);
+
+ // Note: shift[1] and shift[2] are zeros
+
+ // Transpose 4x4 32-bit
+ v0 = _mm_unpacklo_epi32(u0, u1);
+ v1 = _mm_unpackhi_epi32(u0, u1);
+ v2 = _mm_unpacklo_epi32(u2, u3);
+ v3 = _mm_unpackhi_epi32(u2, u3);
+
+ in[0] = _mm_unpacklo_epi64(v0, v2);
+ in[1] = _mm_unpackhi_epi64(v0, v2);
+ in[2] = _mm_unpacklo_epi64(v1, v3);
+ in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
+static void fadst4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *sinpi = sinpi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ s0 = _mm_mullo_epi32(in[0], sinpi1);
+ s1 = _mm_mullo_epi32(in[0], sinpi4);
+ s2 = _mm_mullo_epi32(in[1], sinpi2);
+ s3 = _mm_mullo_epi32(in[1], sinpi1);
+ s4 = _mm_mullo_epi32(in[2], sinpi3);
+ s5 = _mm_mullo_epi32(in[3], sinpi4);
+ s6 = _mm_mullo_epi32(in[3], sinpi2);
+ t = _mm_add_epi32(in[0], in[1]);
+ s7 = _mm_sub_epi32(t, in[3]);
+
+ t = _mm_add_epi32(s0, s2);
+ x0 = _mm_add_epi32(t, s5);
+ x1 = _mm_mullo_epi32(s7, sinpi3);
+ t = _mm_sub_epi32(s1, s3);
+ x2 = _mm_add_epi32(t, s6);
+ x3 = s4;
+
+ s0 = _mm_add_epi32(x0, x3);
+ s1 = x1;
+ s2 = _mm_sub_epi32(x2, x3);
+ t = _mm_sub_epi32(x2, x0);
+ s3 = _mm_add_epi32(t, x3);
+
+ u0 = _mm_add_epi32(s0, rnding);
+ u0 = _mm_srai_epi32(u0, bit);
+
+ u1 = _mm_add_epi32(s1, rnding);
+ u1 = _mm_srai_epi32(u1, bit);
+
+ u2 = _mm_add_epi32(s2, rnding);
+ u2 = _mm_srai_epi32(u2, bit);
+
+ u3 = _mm_add_epi32(s3, rnding);
+ u3 = _mm_srai_epi32(u3, bit);
+
+ v0 = _mm_unpacklo_epi32(u0, u1);
+ v1 = _mm_unpackhi_epi32(u0, u1);
+ v2 = _mm_unpacklo_epi32(u2, u3);
+ v3 = _mm_unpackhi_epi32(u2, u3);
+
+ in[0] = _mm_unpacklo_epi64(v0, v2);
+ in[1] = _mm_unpackhi_epi64(v0, v2);
+ in[2] = _mm_unpacklo_epi64(v1, v3);
+ in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+ int input_stride, TX_TYPE tx_type, int bd) {
+ __m128i in[4];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+ const int txw_idx = get_txw_idx(TX_4X4);
+ const int txh_idx = get_txh_idx(TX_4X4);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ fdct4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_col[txw_idx][txh_idx]);
+ fadst4x4_sse4_1(in, fwd_cos_bit_row[txw_idx][txh_idx]);
+ write_buffer_4x4(in, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
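A hedged usage sketch for the 4x4 entry point above: the output buffer must be 16-byte aligned because write_buffer_4x4 uses _mm_store_si128, and DECLARE_ALIGNED comes from aom_ports/mem.h, which this file already includes (bd is unused by this kernel, as the (void)bd above shows):

    DECLARE_ALIGNED(16, int32_t, coeff[4 * 4]);
    int16_t src[4 * 4] = { 0 }; /* fill with the 4x4 residual, row-major */
    av1_fwd_txfm2d_4x4_sse4_1(src, coeff, /*input_stride=*/4, DCT_DCT, /*bd=*/10);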
+
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i u;
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
+
+ u = _mm_unpackhi_epi64(in[4], in[4]);
+ in[8] = _mm_cvtepi16_epi32(in[4]);
+ in[9] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[5], in[5]);
+ in[10] = _mm_cvtepi16_epi32(in[5]);
+ in[11] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[6], in[6]);
+ in[12] = _mm_cvtepi16_epi32(in[6]);
+ in[13] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[7], in[7]);
+ in[14] = _mm_cvtepi16_epi32(in[7]);
+ in[15] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[3], in[3]);
+ in[6] = _mm_cvtepi16_epi32(in[3]);
+ in[7] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[2], in[2]);
+ in[4] = _mm_cvtepi16_epi32(in[2]);
+ in[5] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[1], in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[1]);
+ in[3] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[0], in[0]);
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(u);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+ in[4] = _mm_slli_epi32(in[4], shift);
+ in[5] = _mm_slli_epi32(in[5], shift);
+ in[6] = _mm_slli_epi32(in[6], shift);
+ in[7] = _mm_slli_epi32(in[7], shift);
+
+ in[8] = _mm_slli_epi32(in[8], shift);
+ in[9] = _mm_slli_epi32(in[9], shift);
+ in[10] = _mm_slli_epi32(in[10], shift);
+ in[11] = _mm_slli_epi32(in[11], shift);
+ in[12] = _mm_slli_epi32(in[12], shift);
+ in[13] = _mm_slli_epi32(in[13], shift);
+ in[14] = _mm_slli_epi32(in[14], shift);
+ in[15] = _mm_slli_epi32(in[15], shift);
+}
+
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rounding);
+ in[1] = _mm_add_epi32(in[1], rounding);
+ in[2] = _mm_add_epi32(in[2], rounding);
+ in[3] = _mm_add_epi32(in[3], rounding);
+ in[4] = _mm_add_epi32(in[4], rounding);
+ in[5] = _mm_add_epi32(in[5], rounding);
+ in[6] = _mm_add_epi32(in[6], rounding);
+ in[7] = _mm_add_epi32(in[7], rounding);
+ in[8] = _mm_add_epi32(in[8], rounding);
+ in[9] = _mm_add_epi32(in[9], rounding);
+ in[10] = _mm_add_epi32(in[10], rounding);
+ in[11] = _mm_add_epi32(in[11], rounding);
+ in[12] = _mm_add_epi32(in[12], rounding);
+ in[13] = _mm_add_epi32(in[13], rounding);
+ in[14] = _mm_add_epi32(in[14], rounding);
+ in[15] = _mm_add_epi32(in[15], rounding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ in[4] = _mm_srai_epi32(in[4], shift);
+ in[5] = _mm_srai_epi32(in[5], shift);
+ in[6] = _mm_srai_epi32(in[6], shift);
+ in[7] = _mm_srai_epi32(in[7], shift);
+ in[8] = _mm_srai_epi32(in[8], shift);
+ in[9] = _mm_srai_epi32(in[9], shift);
+ in[10] = _mm_srai_epi32(in[10], shift);
+ in[11] = _mm_srai_epi32(in[11], shift);
+ in[12] = _mm_srai_epi32(in[12], shift);
+ in[13] = _mm_srai_epi32(in[13], shift);
+ in[14] = _mm_srai_epi32(in[14], shift);
+ in[15] = _mm_srai_epi32(in[15], shift);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+
+ _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
+
+ _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
+ _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
+ _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
+ _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
+
+ _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
+ _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
+ _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
+ _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
+}
+
+static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
+ const int stride) {
+ _mm_storeu_si128((__m128i *)(output), res[0]);
+ _mm_storeu_si128((__m128i *)(output + 4), res[1]);
+ _mm_storeu_si128((__m128i *)(output + stride), res[2]);
+ _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]);
+
+ _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]);
+ _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]);
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ (void)(col_num);
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[8], v[8];
+
+ // Even 8 points 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0], in[14]);
+ v[7] = _mm_sub_epi32(in[0], in[14]); // v[7]
+ u[1] = _mm_add_epi32(in[2], in[12]);
+ u[6] = _mm_sub_epi32(in[2], in[12]);
+ u[2] = _mm_add_epi32(in[4], in[10]);
+ u[5] = _mm_sub_epi32(in[4], in[10]);
+ u[3] = _mm_add_epi32(in[6], in[8]);
+ v[4] = _mm_sub_epi32(in[6], in[8]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[2] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[14] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[10] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[6] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0] = u[0]; // buf0[0]
+ out[8] = u[1]; // buf0[1]
+ out[4] = u[2]; // buf0[2]
+ out[12] = u[3]; // buf0[3]
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[1], in[15]);
+ v[7] = _mm_sub_epi32(in[1], in[15]); // v[7]
+ u[1] = _mm_add_epi32(in[3], in[13]);
+ u[6] = _mm_sub_epi32(in[3], in[13]);
+ u[2] = _mm_add_epi32(in[5], in[11]);
+ u[5] = _mm_sub_epi32(in[5], in[11]);
+ u[3] = _mm_add_epi32(in[7], in[9]);
+ v[4] = _mm_sub_epi32(in[7], in[9]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[3] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[15] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[11] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[7] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[1] = u[0]; // buf0[0]
+ out[9] = u[1]; // buf0[1]
+ out[5] = u[2]; // buf0[2]
+ out[13] = u[3]; // buf0[3]
+}
+
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ (void)(col_num);
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+ // Note:
+ // Even column: 0, 2, ..., 14
+ // Odd column: 1, 3, ..., 15
+ // one even column plus one odd column make up one row (8 coeffs);
+ // in total we have 8 rows (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ u0 = in[2 * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+ u3 = in[2 * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+ u5 = in[2 * 6 + col];
+ u6 = in[2 * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 7
+ out[2 * 0 + col] = v1;
+ out[2 * 1 + col] = v6;
+ out[2 * 2 + col] = v3;
+ out[2 * 3 + col] = v4;
+ out[2 * 4 + col] = v5;
+ out[2 * 5 + col] = v2;
+ out[2 * 6 + col] = v7;
+ out[2 * 7 + col] = v0;
+ }
+}
+
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m128i in[16], out[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+ const int txw_idx = get_txw_idx(TX_8X8);
+ const int txh_idx = get_txh_idx(TX_8X8);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 1, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+// Hybrid Transform 16x16
+
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+ int row_index = 0;
+ int dst_index = 0;
+ int src_index = 0;
+
+ // row 0, 1, .., 7
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 8);
+
+ // row 8, 9, ..., 15
+ src_index += 16;
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 16);
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i in[64];
+ // Load 4 8x8 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+ const int16_t *botL = input + 8 * stride;
+ const int16_t *botR = input + 8 * stride + 8;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap left columns
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ // Swap right columns
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap top rows
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ // Swap bottom rows
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 8 columns
+ load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+
+ // load second 8 columns
+ load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+
+ convert_8x8_to_16x16(in, out);
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+
+ load_buffer_8x8(topL, out, stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift);
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x;
+ int col;
+
+ // Process columns 0, 1, 2, 3
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm_mullo_epi32(u[10], cospim32);
+ x = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[13], cospim32);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospim32);
+ x = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi32);
+ x = _mm_mullo_epi32(u[12], cospim32);
+ v[12] = _mm_sub_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm_mullo_epi32(v[5], cospim32);
+ x = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi32);
+ x = _mm_mullo_epi32(v[6], cospim32);
+ u[6] = _mm_sub_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[15] = _mm_add_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm_mullo_epi32(u[0], cospi32);
+ u[1] = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(u[0], u[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(u[0], u[1]);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(u[2], cospi48);
+ x = _mm_mullo_epi32(u[3], cospi16);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(u[2], cospi16);
+ x = _mm_mullo_epi32(u[3], cospi48);
+ v[3] = _mm_sub_epi32(x, v[3]);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm_mullo_epi32(u[9], cospim16);
+ x = _mm_mullo_epi32(u[14], cospi48);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi48);
+ x = _mm_mullo_epi32(u[14], cospim16);
+ v[14] = _mm_sub_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospim48);
+ x = _mm_mullo_epi32(u[13], cospim16);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospim16);
+ x = _mm_mullo_epi32(u[13], cospim48);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi56);
+ x = _mm_mullo_epi32(v[7], cospi8);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[7] = _mm_mullo_epi32(v[4], cospi8);
+ x = _mm_mullo_epi32(v[7], cospi56);
+ u[7] = _mm_sub_epi32(x, u[7]);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[5] = _mm_mullo_epi32(v[5], cospi24);
+ x = _mm_mullo_epi32(v[6], cospi40);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi40);
+ x = _mm_mullo_epi32(v[6], cospi24);
+ u[6] = _mm_sub_epi32(x, u[6]);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[11], v[10]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi60);
+ x = _mm_mullo_epi32(u[15], cospi4);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[15] = _mm_mullo_epi32(u[8], cospi4);
+ x = _mm_mullo_epi32(u[15], cospi60);
+ v[15] = _mm_sub_epi32(x, v[15]);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ v[9] = _mm_mullo_epi32(u[9], cospi28);
+ x = _mm_mullo_epi32(u[14], cospi36);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi36);
+ x = _mm_mullo_epi32(u[14], cospi28);
+ v[14] = _mm_sub_epi32(x, v[14]);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi44);
+ x = _mm_mullo_epi32(u[13], cospi20);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi20);
+ x = _mm_mullo_epi32(u[13], cospi44);
+ v[13] = _mm_sub_epi32(x, v[13]);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospi12);
+ x = _mm_mullo_epi32(u[12], cospi52);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi52);
+ x = _mm_mullo_epi32(u[12], cospi12);
+ v[12] = _mm_sub_epi32(x, v[12]);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
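+    // The butterfly leaves the coefficients in v[] in bit-reversed index
+    // order; these stores write them back out in natural frequency order.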
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = v[8];
+ out[2 * col_num + col] = v[4];
+ out[3 * col_num + col] = v[12];
+ out[4 * col_num + col] = v[2];
+ out[5 * col_num + col] = v[10];
+ out[6 * col_num + col] = v[6];
+ out[7 * col_num + col] = v[14];
+ out[8 * col_num + col] = v[1];
+ out[9 * col_num + col] = v[9];
+ out[10 * col_num + col] = v[5];
+ out[11 * col_num + col] = v[13];
+ out[12 * col_num + col] = v[3];
+ out[13 * col_num + col] = v[11];
+ out[14 * col_num + col] = v[7];
+ out[15 * col_num + col] = v[15];
+ }
+}
+
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_cols) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9
+ out[0 * num_cols + col] = v[1];
+ out[1 * num_cols + col] = v[14];
+ out[2 * num_cols + col] = v[3];
+ out[3 * num_cols + col] = v[12];
+ out[4 * num_cols + col] = v[5];
+ out[5 * num_cols + col] = v[10];
+ out[6 * num_cols + col] = v[7];
+ out[7 * num_cols + col] = v[8];
+ out[8 * num_cols + col] = v[9];
+ out[9 * num_cols + col] = v[6];
+ out[10 * num_cols + col] = v[11];
+ out[11 * num_cols + col] = v[4];
+ out[12 * num_cols + col] = v[13];
+ out[13 * num_cols + col] = v[2];
+ out[14 * num_cols + col] = v[15];
+ out[15 * num_cols + col] = v[0];
+ }
+}
+
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+  // Note:
+  // We split the 16x16 rounding into four 8x8 roundings
+  // rather than four columns of registers.
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+ col_txfm_8x8_rounding(&in[32], shift);
+ col_txfm_8x8_rounding(&in[48], shift);
+}
+
+static void col_txfm_8x16_rounding(__m128i *in, int shift) {
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+}
+
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
+ const int size_8x8 = 16 * 4;
+ write_buffer_8x8(&in[0], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[16], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[32], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[48], output);
+}
+
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[64], out[64];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+ const int txw_idx = get_txw_idx(TX_16X16);
+ const int txh_idx = get_txh_idx(TX_16X16);
+ const int col_num = 4;
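+  // Each 16-sample row occupies four __m128i registers (4 x 32-bit lanes),
+  // so both 1-D passes work on col_num = 4 register columns.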
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, in, stride, 1, 1, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+ col_txfm_16x16_rounding(out, -shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
+
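+// Copies out[] into in[] with the order of the sixteen (even, odd) register
+// pairs reversed while each pair stays intact. The hard-coded 30 (== 32 - 2)
+// makes this helper valid only for size == 32, which is how it is used below
+// for the left/right-flipped 16x8 transforms.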
+static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; i += 2) in[30 - i] = out[i];
+ for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+
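+// Per-TX_TYPE lookup tables selecting the 1-D column/row kernels used by the
+// rectangular 16x8 and 8x16 transforms below; NULL entries mark transform
+// types this SSE4.1 file does not implement.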
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fdct16x16_sse4_1, // ADST_DCT
+ fadst16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fdct16x16_sse4_1, // FLIPADST_DCT
+ fadst16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fadst16x16_sse4_1, // ADST_DCT
+ fdct16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fadst16x16_sse4_1, // FLIPADST_DCT
+ fdct16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fdct8x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct8x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < 2; i++) {
+ load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]);
+ col_txfm(in, in, bit, 0);
+ col_txfm_8x8_rounding(in, -shift[1]);
+ transpose_8x8(in, out + i * 16);
+ }
+
+ if (lr_flip) {
+ flip_buf_sse4_1(in, out, 32);
+ row_txfm(in, out, bit, 2);
+ } else {
+ row_txfm(out, out, bit, 2);
+ }
+
+ for (int i = 0; i < 2; i++) {
+ transpose_8x8(out + i * 16, in);
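+    // Rectangular transforms get an extra NewSqrt2 (sqrt(2) in Q12 fixed
+    // point) scaling here, presumably to keep the overall 2-D gain in line
+    // with the square transform sizes.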
+ av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+ write_buffer_16x8(in, coeff + i * 8, 16);
+ }
+
+ (void)bd;
+}
+
+void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, TX_TYPE tx_type, int bd) {
+ __m128i in[32], out[32];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+ int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, in, bit, 2);
+ col_txfm_8x16_rounding(in, -shift[1]);
+ transpose_8x8(in, out);
+ transpose_8x8(in + 16, out + 16);
+
+ for (int i = 0; i < 2; i++) {
+ row_txfm(out + i * 16, out, bit, 0);
+ transpose_8x8(out, in);
+ av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+ write_buffer_8x8(in, coeff + i * 64);
+ }
+
+ (void)bd;
+}
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 000000000..06aaaa7ee
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
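+// acc_stat_avx2 adds, for eight consecutive H entries, the products of a
+// broadcast byte pair (kl holds D1, D2 in every 16-bit pair) with shuffled
+// source bytes: dst[i] += D1 * s[2*i] + D2 * s[2*i + 1], where the pairing of
+// source bytes is chosen by the shuffle mask (g_shuffle_stats_data, defined
+// elsewhere).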
+static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m256i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s));
+ const __m256i dst0 = yy_loadu_256(dst);
+ const __m256i r0 = _mm256_add_epi32(dst0, d0);
+ yy_storeu_256(dst, r0);
+}
+
+static INLINE void acc_stat_win7_one_line_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint8_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m256i kl =
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win7_opt_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
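+  // Accumulate into 32-bit lanes and flush into the 64-bit totals every 64
+  // rows; the chunking appears to be there to keep the int32 partial sums
+  // from overflowing.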
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win7_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_win5_one_line_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ int j, k, l;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ const uint8_t *dgd_ij = dgd + j;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m256i kl =
+ _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_avx2(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win5_one_line_avx2(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
+
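+// Packs two 16-bit coefficients into every 32-bit lane so that a single
+// _mm256_madd_epi16 against interleaved 16-bit operands computes
+// a * x + b * y per output lane.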
+static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+int64_t av1_lowbd_pixel_proj_error_avx2(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
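+  // Sum of squared errors between src and the self-guided projection of dat:
+  // per pixel, e = ROUND_POWER_OF_TWO(xq[0]*(flt0-u) + xq[1]*(flt1-u), shift)
+  //                + dat - src, with u = dat << SGRPROJ_RST_BITS (see the
+  // scalar tail loops). The branches below drop terms when r[0] or r[1] is
+  // unused.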
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+ __m256i sum64 = _mm256_setzero_si256();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
+ const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
+ const __m256i v0 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i v1 = _mm256_madd_epi16(
+ xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0) {
+ __m256i xq_coeff =
+ pair_set_epi16(xq[0], (-xq[0] * (1 << SGRPROJ_RST_BITS)));
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt0_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt0 + j),
+ yy_loadu_256(flt0 + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt0_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt0_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[1] > 0) {
+ __m256i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i flt1_16b = _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(yy_loadu_256(flt1 + j),
+ yy_loadu_256(flt1 + j + 8)),
+ 0xd8);
+ const __m256i v0 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt1_16b, d0));
+ const __m256i v1 =
+ _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt1_16b, d0));
+ const __m256i vr0 =
+ _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+ const __m256i vr1 =
+ _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+ const __m256i e0 = _mm256_sub_epi16(
+ _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+ const __m256i err0 = _mm256_madd_epi16(e0, e0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64, sum64_0);
+ sum64 = _mm256_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m256i sum32 = _mm256_setzero_si256();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j <= width - 16; j += 16) {
+ const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+ const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+ const __m256i diff0 = _mm256_sub_epi16(d0, s0);
+ const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
+ sum32 = _mm256_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m256i sum64_0 =
+ _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+ const __m256i sum64_1 =
+ _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+ sum64 = _mm256_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[4];
+ yy_storeu_256(sum, sum64);
+ err += sum[0] + sum[1] + sum[2] + sum[3];
+ return err;
+}
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 000000000..04e4d1afc
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
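+// SSE4.1 counterpart of acc_stat_avx2 in pickrst_avx2.c: the same byte-pair
+// multiply-accumulate into eight int32 H entries, done as two 4-lane halves.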
+static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+ const __m128i *shuffle, const __m128i *kl) {
+ const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+ const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s));
+ const __m128i d1 =
+ _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)));
+ const __m128i dst0 = xx_loadu_128(dst);
+ const __m128i dst1 = xx_loadu_128(dst + 4);
+ const __m128i r0 = _mm_add_epi32(dst0, d0);
+ const __m128i r1 = _mm_add_epi32(dst1, d1);
+ xx_storeu_128(dst, r0);
+ xx_storeu_128(dst + 4, r1);
+}
+
+static INLINE void acc_stat_win7_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+ int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+ const int wiener_win = 7;
+ int j, k, l;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win7_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win7_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN2; ++k) {
+ for (l = 0; l < WIENER_WIN * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+static INLINE void acc_stat_win5_one_line_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+ int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+ int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+ const int wiener_win = WIENER_WIN_CHROMA;
+ int j, k, l;
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t *dgd_ij = dgd + j;
+ const uint8_t X1 = src[j];
+ const uint8_t X2 = src[j + 1];
+ *sumX += X1 + X2;
+ for (k = 0; k < wiener_win; k++) {
+ const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+ for (l = 0; l < wiener_win; l++) {
+ int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijk[l];
+ const uint8_t D2 = dgd_ijk[l + 1];
+ sumY[k][l] += D1 + D2;
+ M_int[k][l] += D1 * X1 + D2 * X2;
+
+ const __m128i kl =
+ _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l))));
+ acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+ acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+ }
+ }
+ }
+}
+
+static INLINE void compute_stats_win5_opt_sse4_1(
+ const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l, m, n;
+ const int wiener_win = WIENER_WIN_CHROMA;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+ int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+ for (j = v_start; j < v_end; j += 64) {
+ const int vert_end = AOMMIN(64, v_end - j) + j;
+ for (i = j; i < vert_end; i++) {
+ acc_stat_win5_one_line_sse4_1(
+ dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
+ dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+ }
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l) {
+ M_int64[k][l] += M_int32[k][l];
+ M_int32[k][l] = 0;
+ }
+ }
+ for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) {
+ for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+ H_int64[k][l] += H_int32[k][l];
+ H_int32[k][l] = 0;
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const int32_t idx0 = l * wiener_win + k;
+ M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ double *H_ = H + idx0 * wiener_win2;
+ int64_t *H_int_ = &H_int64[idx0][0];
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN) {
+ compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else if (wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
+
+static INLINE __m128i pair_set_epi16(uint16_t a, uint16_t b) {
+ return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+int64_t av1_lowbd_pixel_proj_error_sse4_1(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+ int i, j, k;
+ const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+ __m128i sum64 = _mm_setzero_si128();
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ int64_t err = 0;
+ if (params->r[0] > 0 && params->r[1] > 0) {
+ __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt0_16b =
+ _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+ const __m128i flt1_16b =
+ _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+ const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+ const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+ const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+ const __m128i v0 = _mm_madd_epi16(
+ xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i v1 = _mm_madd_epi16(
+ xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ flt1 += flt1_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[0] > 0) {
+ __m128i xq_coeff = pair_set_epi16(xq[0], -(xq[0] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt0_16b =
+ _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+ const __m128i v0 =
+ _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt0_16b, d0));
+ const __m128i v1 =
+ _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt0_16b, d0));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[0] * (flt0[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt0 += flt0_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else if (params->r[1] > 0) {
+ __m128i xq_coeff = pair_set_epi16(xq[1], -(xq[1] << SGRPROJ_RST_BITS));
+ for (i = 0; i < height; ++i) {
+ __m128i sum32 = _mm_setzero_si128();
+ for (j = 0; j < width - 8; j += 8) {
+ const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+ const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+ const __m128i flt1_16b =
+ _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+ const __m128i v0 =
+ _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt1_16b, d0));
+ const __m128i v1 =
+ _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt1_16b, d0));
+ const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+ const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+ const __m128i e0 =
+ _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+ const __m128i err0 = _mm_madd_epi16(e0, e0);
+ sum32 = _mm_add_epi32(sum32, err0);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+ int32_t v = xq[1] * (flt1[k] - u);
+ const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ flt1 += flt1_stride;
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64, sum64_0);
+ sum64 = _mm_add_epi64(sum64, sum64_1);
+ }
+ } else {
+ __m128i sum32 = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width - 16; j += 16) {
+ const __m128i d = xx_loadu_128(dat + j);
+ const __m128i s = xx_loadu_128(src + j);
+ const __m128i d0 = _mm_cvtepu8_epi16(d);
+ const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+ const __m128i s0 = _mm_cvtepu8_epi16(s);
+ const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+ const __m128i diff0 = _mm_sub_epi16(d0, s0);
+ const __m128i diff1 = _mm_sub_epi16(d1, s1);
+ const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+ const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+ sum32 = _mm_add_epi32(sum32, err0);
+ sum32 = _mm_add_epi32(sum32, err1);
+ }
+ for (k = j; k < width; ++k) {
+ const int32_t e = (int32_t)(dat[k]) - src[k];
+ err += e * e;
+ }
+ dat += dat_stride;
+ src += src_stride;
+ }
+ const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32);
+ const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8));
+ sum64 = _mm_add_epi64(sum64_0, sum64_1);
+ }
+ int64_t sum[2];
+ xx_storeu_128(sum, sum64);
+ err += sum[0] + sum[1];
+ return err;
+}
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 000000000..30983d1c1
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,217 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+SECTION .text
+
+; void av1_temporal_filter_apply_sse2 | arg
+; (unsigned char *frame1, | 0
+; unsigned int stride, | 1
+; unsigned char *frame2, | 2
+; unsigned int block_width, | 3
+; unsigned int block_height, | 4
+; int strength, | 5
+; int filter_weight, | 6
+; unsigned int *accumulator, | 7
+; unsigned short *count) | 8
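+; In outline, for each pixel:
+;   diff     = frame1 - frame2
+;   modifier = 16 - min(16, (3 * diff^2 + rounding_bit) >> strength)
+;   count       += modifier * filter_weight
+;   accumulator += modifier * filter_weight * frame2
+; processing 16 pixels per iteration (two rows of 8 or one row of 16).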
+global sym(av1_temporal_filter_apply_sse2) PRIVATE
+sym(av1_temporal_filter_apply_sse2):
+
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ALIGN_STACK 16, rax
+ %define block_width 0
+ %define block_height 16
+ %define strength 32
+ %define filter_weight 48
+ %define rounding_bit 64
+ %define rbp_backup 80
+ %define stack_size 96
+ sub rsp, stack_size
+ mov [rsp + rbp_backup], rbp
+ ; end prolog
+
+ mov edx, arg(3)
+ mov [rsp + block_width], rdx
+ mov edx, arg(4)
+ mov [rsp + block_height], rdx
+ movd xmm6, arg(5)
+ movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+ ; calculate the rounding bit outside the loop
+ ; 0x8000 >> (16 - strength)
+ mov rdx, 16
+ sub rdx, arg(5) ; 16 - strength
+ movq xmm4, rdx ; can't use rdx w/ shift
+ movdqa xmm5, [GLOBAL(_const_top_bit)]
+ psrlw xmm5, xmm4
+ movdqa [rsp + rounding_bit], xmm5
+
+ mov rsi, arg(0) ; src/frame1
+ mov rdx, arg(2) ; predictor frame
+ mov rdi, arg(7) ; accumulator
+ mov rax, arg(8) ; count
+
+ ; dup the filter weight and store for later
+ movd xmm0, arg(6) ; filter_weight
+ pshuflw xmm0, xmm0, 0
+ punpcklwd xmm0, xmm0
+ movdqa [rsp + filter_weight], xmm0
+
+ mov rbp, arg(1) ; stride
+ pxor xmm7, xmm7 ; zero for extraction
+
+ mov rcx, [rsp + block_width]
+ imul rcx, [rsp + block_height]
+ add rcx, rdx
+ cmp dword ptr [rsp + block_width], 8
+ jne .temporal_filter_apply_load_16
+
+.temporal_filter_apply_load_8:
+ movq xmm0, [rsi] ; first row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ movq xmm1, [rsi] ; second row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm1, xmm7 ; src[ 8-15]
+ jmp .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+ movdqa xmm0, [rsi] ; src (frame1)
+ lea rsi, [rsi + rbp] ; += stride
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ punpckhbw xmm1, xmm7 ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+ movdqa xmm2, [rdx] ; predictor (frame2)
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm3, xmm7 ; pred[ 8-15]
+
+ ; modifier = src_byte - pixel_value
+ psubw xmm0, xmm2 ; src - pred[ 0- 7]
+ psubw xmm1, xmm3 ; src - pred[ 8-15]
+
+ ; modifier *= modifier
+        pmullw  xmm0, xmm0              ; modifier[ 0- 7]^2
+        pmullw  xmm1, xmm1              ; modifier[ 8-15]^2
+
+ ; modifier *= 3
+ pmullw xmm0, [GLOBAL(_const_3w)]
+ pmullw xmm1, [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+ paddw xmm0, [rsp + rounding_bit]
+ paddw xmm1, [rsp + rounding_bit]
+
+ ; modifier >>= strength
+ psrlw xmm0, [rsp + strength]
+ psrlw xmm1, [rsp + strength]
+
+ ; modifier = 16 - modifier
+ ; saturation takes care of modifier > 16
+ movdqa xmm3, [GLOBAL(_const_16w)]
+ movdqa xmm2, [GLOBAL(_const_16w)]
+ psubusw xmm3, xmm1
+ psubusw xmm2, xmm0
+
+ ; modifier *= filter_weight
+ pmullw xmm2, [rsp + filter_weight]
+ pmullw xmm3, [rsp + filter_weight]
+
+ ; count
+ movdqa xmm4, [rax]
+ movdqa xmm5, [rax+16]
+ ; += modifier
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ ; write back
+ movdqa [rax], xmm4
+ movdqa [rax+16], xmm5
+ lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
+
+ ; load and extract the predictor up to shorts
+ pxor xmm7, xmm7
+ movdqa xmm0, [rdx]
+ lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm1, xmm7 ; pred[ 8-15]
+
+ ; modifier *= pixel_value
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ ; expand to double words
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm7 ; [ 0- 3]
+ punpckhwd xmm2, xmm7 ; [ 4- 7]
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm7 ; [ 8-11]
+ punpckhwd xmm3, xmm7 ; [12-15]
+
+ ; accumulator
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rdi+16]
+ movdqa xmm6, [rdi+32]
+ movdqa xmm7, [rdi+48]
+ ; += modifier
+ paddd xmm4, xmm0
+ paddd xmm5, xmm2
+ paddd xmm6, xmm1
+ paddd xmm7, xmm3
+ ; write back
+ movdqa [rdi], xmm4
+ movdqa [rdi+16], xmm5
+ movdqa [rdi+32], xmm6
+ movdqa [rdi+48], xmm7
+ lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+ cmp rdx, rcx
+ je .temporal_filter_apply_epilog
+ pxor xmm7, xmm7 ; zero for extraction
+ cmp dword ptr [rsp + block_width], 16
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8
+
+.temporal_filter_apply_epilog:
+ ; begin epilog
+ mov rbp, [rsp + rbp_backup]
+ add rsp, stack_size
+ pop rsp
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+ times 8 dw 3
+align 16
+_const_top_bit:
+ times 8 dw 1<<15
+align 16
+_const_16w:
+ times 8 dw 16
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
new file mode 100644
index 000000000..2a792f14e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
+uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;
+
+ uint64_t csse;
+
+ const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+ const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);
+
+ __m256i v_acc0_q = _mm256_setzero_si256();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n));
+ const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n));
+ const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n));
+
+ const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b);
+
+ const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w);
+
+ const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w);
+
+ const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d);
+
+ const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w);
+
+ const __m256i v_sum0_q = _mm256_add_epi64(
+ _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32));
+
+ v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q);
+
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8));
+ __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q);
+ __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1);
+ v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+#if ARCH_X86_64
+ csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+ xx_storel_64(&csse, v_acc_q_0);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
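
In scalar terms, each element processed by the vectorized loop above is a mask-weighted residual that is saturated to 16 bits and then squared. A minimal C sketch of that computation (the function name is illustrative; MAX_MASK_VALUE is defined above and ROUND_POWER_OF_TWO comes from the included aom headers):

    static uint64_t wedge_sse_scalar_sketch(const int16_t *r1, const int16_t *d,
                                            const uint8_t *m, int N) {
      uint64_t csse = 0;
      for (int i = 0; i < N; ++i) {
        /* madd step: d[i]*m[i] + r1[i]*MAX_MASK_VALUE */
        int32_t t = (int32_t)d[i] * m[i] + (int32_t)r1[i] * MAX_MASK_VALUE;
        /* packs_epi32 step: saturate to int16 before squaring */
        if (t > INT16_MAX) t = INT16_MAX;
        if (t < INT16_MIN) t = INT16_MIN;
        csse += (uint64_t)((int64_t)t * t);
      }
      return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
    }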
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+ __m256i v_acc0_d = _mm256_setzero_si256();
+
+ // Input size limited to 8192 by the use of 32 bit accumulators and m
+ // being between [0, 64]. Overflow might happen at larger sizes,
+ // though it is practically impossible on real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m));
+ const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32));
+
+ const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds));
+ const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16));
+ const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32));
+ const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48));
+
+ const __m256i v_m0_w =
+ _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
+ const __m256i v_m1_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
+ const __m256i v_m2_w =
+ _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
+ const __m256i v_m3_w =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));
+
+ const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
+ const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
+ const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
+ const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);
+
+ const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
+ const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);
+
+ const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);
+
+ v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
+ v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));
+
+ __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
+ __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
+ v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+
+#if ARCH_X86_64
+ acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0);
+#else
+ xx_storel_64(&acc, v_acc_q_0);
+#endif
+
+ return acc > limit;
+}
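
The loop above is, in effect, a mask-weighted sum of the pre-computed differences ds compared against the limit. A scalar sketch of the same decision (illustrative name, not part of libaom):

    static int wedge_sign_scalar_sketch(const int16_t *ds, const uint8_t *m,
                                        int N, int64_t limit) {
      int64_t acc = 0;
      for (int i = 0; i < N; ++i) acc += (int64_t)ds[i] * m[i];  /* madd + widen */
      return acc > limit;
    }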
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ */
+void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a));
+ const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b));
+ const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16));
+ const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16));
+ const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32));
+ const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32));
+ const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48));
+ const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48));
+
+ const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w);
+ const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w);
+ const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w);
+ const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w);
+ const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w);
+ const __m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w);
+ const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w);
+ const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w);
+
+ const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w);
+
+ _mm256_store_si256((__m256i *)(d), v_r0_w);
+ _mm256_store_si256((__m256i *)(d + 16), v_r1_w);
+ _mm256_store_si256((__m256i *)(d + 32), v_r2_w);
+ _mm256_store_si256((__m256i *)(d + 48), v_r3_w);
+
+ a += 64;
+ b += 64;
+ d += 64;
+ N -= 64;
+ } while (N);
+}
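
Each output element produced above is the difference of squares a[i]^2 - b[i]^2, saturated to int16 by the packs step. A scalar sketch of the per-element computation (illustrative name, not part of libaom):

    static void wedge_delta_squares_scalar_sketch(int16_t *d, const int16_t *a,
                                                  const int16_t *b, int N) {
      for (int i = 0; i < N; ++i) {
        /* madd of the pair (a, b) with (a, -b) gives a*a - b*b */
        int32_t t = (int32_t)a[i] * a[i] - (int32_t)b[i] * b[i];
        if (t > INT16_MAX) t = INT16_MAX;   /* packs_epi32 saturation */
        if (t < INT16_MIN) t = INT16_MIN;
        d[i] = (int16_t)t;
      }
    }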
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 000000000..4d2e99f25
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;
+ int n8 = n + 8;
+
+ uint64_t csse;
+
+ const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+ const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);
+
+ __m128i v_acc0_q = _mm_setzero_si128();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ const __m128i v_r0_w = xx_load_128(r1 + n);
+ const __m128i v_r1_w = xx_load_128(r1 + n8);
+ const __m128i v_d0_w = xx_load_128(d + n);
+ const __m128i v_d1_w = xx_load_128(d + n8);
+ const __m128i v_m01_b = xx_load_128(m + n);
+
+ const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+ const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+ const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+ const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+ const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+ const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+ const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+ const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+ const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+ const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+ const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+ const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+ _mm_srli_epi64(v_sq0_d, 32));
+ const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+ _mm_srli_epi64(v_sq1_d, 32));
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+ n8 += 16;
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if ARCH_X86_64
+ csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ xx_storel_64(&csse, v_acc0_q);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
+int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+
+ __m128i v_sign_d;
+ __m128i v_acc0_d = _mm_setzero_si128();
+ __m128i v_acc1_d = _mm_setzero_si128();
+ __m128i v_acc_q;
+
+ // Input size limited to 8192 by the use of 32 bit accumulators and m
+ // being between [0, 64]. Overflow might happen at larger sizes,
+ // though it is practically impossible on real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_m01_b = xx_load_128(m);
+ const __m128i v_m23_b = xx_load_128(m + 16);
+ const __m128i v_m45_b = xx_load_128(m + 32);
+ const __m128i v_m67_b = xx_load_128(m + 48);
+
+ const __m128i v_d0_w = xx_load_128(ds);
+ const __m128i v_d1_w = xx_load_128(ds + 8);
+ const __m128i v_d2_w = xx_load_128(ds + 16);
+ const __m128i v_d3_w = xx_load_128(ds + 24);
+ const __m128i v_d4_w = xx_load_128(ds + 32);
+ const __m128i v_d5_w = xx_load_128(ds + 40);
+ const __m128i v_d6_w = xx_load_128(ds + 48);
+ const __m128i v_d7_w = xx_load_128(ds + 56);
+
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+ const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+ const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+ const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+ const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+ const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+ const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+ const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+ const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+ const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+ const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+ const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+ const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+ const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+ const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+ v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+ v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+ v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+ v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
+
+ v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+ acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+ xx_storel_64(&acc, v_acc_q);
+#endif
+
+ return acc > limit;
+}
+
+// Negate under mask
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+ return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
+}
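
The helper relies on the two's-complement identity (x ^ mask) - mask: with mask = 0xffff it yields ~x + 1 = -x (for example, (5 ^ 0xffff) - 0xffff = -6 - (-1) = -5), and with mask = 0 it leaves x unchanged, so each 16-bit lane is negated exactly where the corresponding mask word is all ones.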
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ */
+void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ const __m128i v_neg_w =
+ _mm_set_epi16(0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff, 0);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_a0_w = xx_load_128(a);
+ const __m128i v_b0_w = xx_load_128(b);
+ const __m128i v_a1_w = xx_load_128(a + 8);
+ const __m128i v_b1_w = xx_load_128(b + 8);
+ const __m128i v_a2_w = xx_load_128(a + 16);
+ const __m128i v_b2_w = xx_load_128(b + 16);
+ const __m128i v_a3_w = xx_load_128(a + 24);
+ const __m128i v_b3_w = xx_load_128(b + 24);
+
+ const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+ const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+ const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+ const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+ const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+ const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+ const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+ const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+ const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
+
+ xx_store_128(d, v_r0_w);
+ xx_store_128(d + 8, v_r1_w);
+ xx_store_128(d + 16, v_r2_w);
+ xx_store_128(d + 24, v_r3_w);
+
+ a += 32;
+ b += 32;
+ d += 32;
+ N -= 32;
+ } while (N);
+}
diff --git a/third_party/aom/av1/exports_com b/third_party/aom/av1/exports_com
new file mode 100644
index 000000000..5c8e0e09d
--- /dev/null
+++ b/third_party/aom/av1/exports_com
@@ -0,0 +1,2 @@
+text aom_read_obu_header_and_size
+text av1_resize_frame420
diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec
new file mode 100644
index 000000000..daabf6766
--- /dev/null
+++ b/third_party/aom/av1/exports_dec
@@ -0,0 +1,3 @@
+data aom_codec_av1_dx_algo
+text aom_codec_av1_dx
+text av1_add_film_grain
diff --git a/third_party/aom/av1/exports_enc b/third_party/aom/av1/exports_enc
new file mode 100644
index 000000000..dc4a9eae7
--- /dev/null
+++ b/third_party/aom/av1/exports_enc
@@ -0,0 +1,2 @@
+data aom_codec_av1_cx_algo
+text aom_codec_av1_cx
diff --git a/third_party/aom/av1/exports_test b/third_party/aom/av1/exports_test
new file mode 100644
index 000000000..dab377575
--- /dev/null
+++ b/third_party/aom/av1/exports_test
@@ -0,0 +1,2 @@
+text av1_get_fwd_txfm_cfg
+text av1_rtcd
diff --git a/third_party/aom/build/cmake/aom_config.c.template b/third_party/aom/build/cmake/aom_config.c.template
new file mode 100644
index 000000000..62f0a10ab
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_config.c.template
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_codec.h"
+static const char* const cfg = "${AOM_CMAKE_CONFIG}";
+const char *aom_codec_build_config(void) {return cfg;}
diff --git a/third_party/aom/build/cmake/aom_config_defaults.cmake b/third_party/aom/build/cmake/aom_config_defaults.cmake
new file mode 100644
index 000000000..a07438cfe
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_config_defaults.cmake
@@ -0,0 +1,196 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+# This file sets default values for libaom configuration variables. All libaom
+# config variables are added to the CMake variable cache via the macros provided
+# in util.cmake.
+
+#
+# The variables in this section of the file are detected at configuration time,
+# but can be overridden via the use of CONFIG_* and ENABLE_* values also defined
+# in this file.
+#
+
+set_aom_detect_var(INLINE "" STRING "Sets INLINE value for current target.")
+
+# CPUs.
+set_aom_detect_var(ARCH_ARM 0 NUMBER "Enables ARM architecture.")
+set_aom_detect_var(ARCH_MIPS 0 NUMBER "Enables MIPS architecture.")
+set_aom_detect_var(ARCH_PPC 0 NUMBER "Enables PPC architecture.")
+set_aom_detect_var(ARCH_X86 0 NUMBER "Enables X86 architecture.")
+set_aom_detect_var(ARCH_X86_64 0 NUMBER "Enables X86_64 architecture.")
+
+# ARM feature flags.
+set_aom_detect_var(HAVE_NEON 0 NUMBER "Enables NEON intrinsics optimizations.")
+
+# MIPS feature flags.
+set_aom_detect_var(HAVE_DSPR2 0 NUMBER "Enables DSPR2 optimizations.")
+set_aom_detect_var(HAVE_MIPS32 0 NUMBER "Enables MIPS32 optimizations.")
+set_aom_detect_var(HAVE_MIPS64 0 NUMBER "Enables MIPS64 optimizations.")
+set_aom_detect_var(HAVE_MSA 0 NUMBER "Enables MSA optimizations.")
+
+# PPC feature flags.
+set_aom_detect_var(HAVE_VSX 0 NUMBER "Enables VSX optimizations.")
+
+# x86/x86_64 feature flags.
+set_aom_detect_var(HAVE_AVX 0 NUMBER "Enables AVX optimizations.")
+set_aom_detect_var(HAVE_AVX2 0 NUMBER "Enables AVX2 optimizations.")
+set_aom_detect_var(HAVE_MMX 0 NUMBER "Enables MMX optimizations.")
+set_aom_detect_var(HAVE_SSE 0 NUMBER "Enables SSE optimizations.")
+set_aom_detect_var(HAVE_SSE2 0 NUMBER "Enables SSE2 optimizations.")
+set_aom_detect_var(HAVE_SSE3 0 NUMBER "Enables SSE3 optimizations.")
+set_aom_detect_var(HAVE_SSE4_1 0 NUMBER "Enables SSE 4.1 optimizations.")
+set_aom_detect_var(HAVE_SSE4_2 0 NUMBER "Enables SSE 4.2 optimizations.")
+set_aom_detect_var(HAVE_SSSE3 0 NUMBER "Enables SSSE3 optimizations.")
+
+# Flags describing the build environment.
+set_aom_detect_var(HAVE_FEXCEPT 0 NUMBER
+ "Internal flag, GNU fenv.h present for target.")
+set_aom_detect_var(HAVE_PTHREAD_H 0 NUMBER
+ "Internal flag, target pthread support.")
+set_aom_detect_var(HAVE_UNISTD_H 0 NUMBER
+ "Internal flag, unistd.h present for target.")
+set_aom_detect_var(HAVE_WXWIDGETS 0 NUMBER "WxWidgets present.")
+
+#
+# Variables in this section can be set from the CMake command line or from
+# within the CMake GUI. The variables control libaom features.
+#
+
+# Build configuration flags.
+set_aom_config_var(AOM_RTCD_FLAGS "" STRING
+ "Arguments to pass to rtcd.pl. Separate with ';'")
+set_aom_config_var(CONFIG_AV1_DECODER 1 NUMBER "Enable AV1 decoder.")
+set_aom_config_var(CONFIG_AV1_ENCODER 1 NUMBER "Enable AV1 encoder.")
+set_aom_config_var(CONFIG_BIG_ENDIAN 0 NUMBER "Internal flag.")
+set_aom_config_var(CONFIG_GCC 0 NUMBER "Building with GCC (detect).")
+set_aom_config_var(CONFIG_GCOV 0 NUMBER "Enable gcov support.")
+set_aom_config_var(CONFIG_GPROF 0 NUMBER "Enable gprof support.")
+set_aom_config_var(CONFIG_LIBYUV 1 NUMBER
+ "Enables libyuv scaling/conversion support.")
+
+set_aom_config_var(CONFIG_MULTITHREAD 1 NUMBER "Multithread support.")
+set_aom_config_var(CONFIG_OS_SUPPORT 0 NUMBER "Internal flag.")
+set_aom_config_var(CONFIG_PIC 0 NUMBER "Build with PIC enabled.")
+set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 NUMBER
+ "Runtime CPU detection support.")
+set_aom_config_var(CONFIG_SHARED 0 NUMBER "Build shared libs.")
+set_aom_config_var(CONFIG_STATIC 1 NUMBER "Build static libs.")
+set_aom_config_var(CONFIG_WEBM_IO 1 NUMBER "Enables WebM support.")
+
+# Debugging flags.
+set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 NUMBER "Bitstream debugging flag.")
+set_aom_config_var(CONFIG_DEBUG 0 NUMBER "Debug build flag.")
+set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 NUMBER "Mismatch debugging flag.")
+
+# AV1 feature flags.
+set_aom_config_var(CONFIG_ACCOUNTING 0 NUMBER "Enables bit accounting.")
+set_aom_config_var(CONFIG_ANALYZER 0 NUMBER "Enables bit stream analyzer.")
+set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0 NUMBER
+ "Coefficient range check.")
+set_aom_config_var(CONFIG_DENOISE 1 NUMBER
+ "Denoise/noise modeling support in encoder.")
+set_aom_config_var(CONFIG_FILEOPTIONS 1 NUMBER
+ "Enables encoder config file support.")
+set_aom_config_var(CONFIG_FIX_GF_LENGTH 1 NUMBER
+ "Fix the GF length if possible")
+set_aom_config_var(CONFIG_INSPECTION 0 NUMBER "Enables bitstream inspection.")
+set_aom_config_var(CONFIG_INTERNAL_STATS 0 NUMBER
+ "Enables internal encoder stats.")
+set_aom_config_var(CONFIG_LOWBITDEPTH 0 NUMBER
+ "Enables 8-bit optimized pipeline.")
+set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 NUMBER
+ "Max profile to support decoding.")
+set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 NUMBER
+ "Only enables normal tile mode.")
+set_aom_config_var(
+ CONFIG_REDUCED_ENCODER_BORDER 0 NUMBER
+ "Enable reduced border extention for encoder. \
+ Disables superres and resize support."
+ )
+set_aom_config_var(CONFIG_SIZE_LIMIT 0 NUMBER "Limit max decode width/height.")
+set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 NUMBER "Spatial resampling.")
+set_aom_config_var(DECODE_HEIGHT_LIMIT 0 NUMBER "Set limit for decode height.")
+set_aom_config_var(DECODE_WIDTH_LIMIT 0 NUMBER "Set limit for decode width.")
+set_aom_config_var(CONFIG_GLOBAL_MOTION_SEARCH 1 NUMBER
+ "Global motion search flag.")
+
+# AV1 experiment flags.
+set_aom_config_var(CONFIG_COLLECT_INTER_MODE_RD_STATS 1 NUMBER
+ "AV1 experiment flag.")
+set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_DIST_8X8 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_ENTROPY_STATS 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_FP_MB_STATS 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_RD_DEBUG 0 NUMBER "AV1 experiment flag.")
+set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL 1 NUMBER
+ "AV1 experiment flag.")
+set_aom_config_var(CONFIG_SHARP_SETTINGS 0 NUMBER
+ "Use sharper encoding settings")
+
+#
+# Variables in this section control optional features of the build system.
+#
+set_aom_option_var(ENABLE_CCACHE "Enable ccache support." OFF)
+set_aom_option_var(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests"
+ OFF)
+set_aom_option_var(ENABLE_DISTCC "Enable distcc support." OFF)
+set_aom_option_var(ENABLE_DOCS
+ "Enable documentation generation (doxygen required)." ON)
+set_aom_option_var(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests"
+ OFF)
+set_aom_option_var(ENABLE_EXAMPLES "Enables build of example code." ON)
+set_aom_option_var(ENABLE_GOMA "Enable goma support." OFF)
+set_aom_option_var(
+ ENABLE_IDE_TEST_HOSTING
+ "Enables running tests within IDEs like Visual Studio and Xcode." OFF)
+set_aom_option_var(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF)
+set_aom_option_var(ENABLE_TESTDATA "Enables unit test data download targets."
+ ON)
+set_aom_option_var(ENABLE_TESTS "Enables unit tests." ON)
+set_aom_option_var(ENABLE_TOOLS "Enable applications in tools sub directory."
+ ON)
+set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time."
+ OFF)
+
+# ARM assembly/intrinsics flags.
+set_aom_option_var(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON)
+
+# MIPS assembly/intrinsics flags.
+set_aom_option_var(ENABLE_DSPR2 "Enables DSPR2 optimizations on MIPS targets."
+ OFF)
+set_aom_option_var(ENABLE_MSA "Enables MSA optimizations on MIPS targets." OFF)
+
+# VSX intrinsics flags.
+set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets."
+ ON)
+
+# x86/x86_64 assembly/intrinsics flags.
+set_aom_option_var(ENABLE_MMX
+ "Enables MMX optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE
+ "Enables SSE optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE2
+ "Enables SSE2 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE3
+ "Enables SSE3 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSSE3
+ "Enables SSSE3 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE4_1
+ "Enables SSE4_1 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_SSE4_2
+ "Enables SSE4_2 optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_AVX
+ "Enables AVX optimizations on x86/x86_64 targets." ON)
+set_aom_option_var(ENABLE_AVX2
+ "Enables AVX2 optimizations on x86/x86_64 targets." ON)
diff --git a/third_party/aom/build/cmake/aom_configure.cmake b/third_party/aom/build/cmake/aom_configure.cmake
new file mode 100644
index 000000000..c0c7381e8
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_configure.cmake
@@ -0,0 +1,377 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_
+set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1)
+
+include(FindGit)
+include(FindPerl)
+include(FindThreads)
+
+set(AOM_SUPPORTED_CPU_TARGETS
+ "arm64 armv7 armv7s generic mips32 mips64 ppc x86 x86_64")
+
+include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
+include("${AOM_ROOT}/build/cmake/aom_experiment_deps.cmake")
+include("${AOM_ROOT}/build/cmake/aom_optimization.cmake")
+include("${AOM_ROOT}/build/cmake/compiler_flags.cmake")
+include("${AOM_ROOT}/build/cmake/compiler_tests.cmake")
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+# Generate the user config settings.
+list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS})
+foreach(cache_var ${aom_build_vars})
+ get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING)
+ if("${cache_var_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}")
+ endif()
+endforeach()
+string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
+
+# Detect target CPU.
+if(NOT AOM_TARGET_CPU)
+ if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64" OR
+ "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
+ if(${CMAKE_SIZEOF_VOID_P} EQUAL 4)
+ set(AOM_TARGET_CPU "x86")
+ elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8)
+ set(AOM_TARGET_CPU "x86_64")
+ else()
+ message(FATAL_ERROR
+ "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n"
+ " CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n"
+ " CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n"
+ " CMAKE_GENERATOR=${CMAKE_GENERATOR}\n")
+ endif()
+ elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "i386" OR
+ "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86")
+ set(AOM_TARGET_CPU "x86")
+ elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^arm" OR
+ "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^mips")
+ set(AOM_TARGET_CPU "${CMAKE_SYSTEM_PROCESSOR}")
+ elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64")
+ set(AOM_TARGET_CPU "arm64")
+ elseif("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^ppc")
+ set(AOM_TARGET_CPU "ppc")
+ else()
+ message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not "
+ "supported, falling back to the generic target")
+ set(AOM_TARGET_CPU "generic")
+ endif()
+endif()
+
+if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string.
+ file(RELATIVE_PATH toolchain_path "${AOM_CONFIG_DIR}"
+ "${CMAKE_TOOLCHAIN_FILE}")
+ set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${toolchain_path}\\\"")
+ set(AOM_CMAKE_CONFIG "${toolchain_string} ${AOM_CMAKE_CONFIG}")
+else()
+
+ # Add detected CPU to the config string.
+ set(AOM_CMAKE_CONFIG "-DAOM_TARGET_CPU=${AOM_TARGET_CPU} ${AOM_CMAKE_CONFIG}")
+endif()
+set(AOM_CMAKE_CONFIG "-G \\\"${CMAKE_GENERATOR}\\\" ${AOM_CMAKE_CONFIG}")
+file(RELATIVE_PATH source_path "${AOM_CONFIG_DIR}" "${AOM_ROOT}")
+set(AOM_CMAKE_CONFIG "cmake ${source_path} ${AOM_CMAKE_CONFIG}")
+string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG)
+
+message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}")
+set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME})
+
+if("${CMAKE_BUILD_TYPE}" MATCHES "Deb")
+ set(CONFIG_DEBUG 1)
+endif()
+
+if(BUILD_SHARED_LIBS)
+ set(CONFIG_PIC 1)
+ set(CONFIG_SHARED 1)
+ set(CONFIG_STATIC 0)
+endif()
+
+if(NOT MSVC)
+ if(CONFIG_PIC)
+
+ # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to
+ # work.
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Linux" AND "${AOM_TARGET_CPU}" MATCHES
+ "^armv7")
+ set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1)
+ else()
+ set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC)
+ endif()
+ endif()
+endif()
+
+if(NOT "${AOM_SUPPORTED_CPU_TARGETS}" MATCHES "${AOM_TARGET_CPU}")
+ message(FATAL_ERROR
+ "No RTCD support for ${AOM_TARGET_CPU}. Create it, or "
+ "add -DAOM_TARGET_CPU=generic to your cmake command line for a "
+ "generic build of libaom and tools.")
+endif()
+
+if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ find_program(AS_EXECUTABLE yasm $ENV{YASM_PATH})
+ if(NOT AS_EXECUTABLE OR ENABLE_NASM)
+ unset(AS_EXECUTABLE CACHE)
+ find_program(AS_EXECUTABLE nasm $ENV{NASM_PATH})
+ if(AS_EXECUTABLE)
+ test_nasm()
+ endif()
+ endif()
+
+ if(NOT AS_EXECUTABLE)
+ message(FATAL_ERROR
+ "Unable to find assembler. Install 'yasm' or 'nasm.' "
+ "To build without optimizations, add -DAOM_TARGET_CPU=generic to "
+ "your cmake command line.")
+ endif()
+ get_asm_obj_format("objformat")
+ set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS})
+ string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
+elseif("${AOM_TARGET_CPU}" MATCHES "arm")
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ set(AS_EXECUTABLE as)
+ set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT})
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+ if(NOT AS_EXECUTABLE)
+ set(AS_EXECUTABLE as)
+ endif()
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Windows")
+ if(NOT AS_EXECUTABLE)
+ set(AS_EXECUTABLE ${CMAKE_C_COMPILER} -c -mimplicit-it=always)
+ endif()
+ endif()
+ if(NOT AS_EXECUTABLE)
+ message(FATAL_ERROR
+ "Unknown assembler for: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
+ endif()
+
+ string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS)
+endif()
+
+if(CONFIG_ANALYZER)
+ include(FindwxWidgets)
+ find_package(wxWidgets REQUIRED adv base core)
+ include(${wxWidgets_USE_FILE})
+endif()
+
+if(NOT MSVC AND CMAKE_C_COMPILER_ID MATCHES "GNU\|Clang")
+ set(CONFIG_GCC 1)
+endif()
+
+if(CONFIG_GCOV)
+ message("--- Testing for CONFIG_GCOV support.")
+ require_linker_flag("-fprofile-arcs -ftest-coverage")
+ require_compiler_flag("-fprofile-arcs -ftest-coverage" YES)
+endif()
+
+if(CONFIG_GPROF)
+ message("--- Testing for CONFIG_GPROF support.")
+ require_compiler_flag("-pg" YES)
+endif()
+
+if("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows")
+ set(CONFIG_OS_SUPPORT 1)
+endif()
+
+#
+# Fix CONFIG_* dependencies. This must be done before including cpu.cmake to
+# ensure RTCD_CONFIG_* are properly set.
+fix_experiment_configs()
+
+# Test compiler support.
+aom_get_inline("INLINE")
+
+# Don't just check for pthread.h, but use the result of the full pthreads
+# check (including a linking check) done by FindThreads above.
+set(HAVE_PTHREAD_H ${CMAKE_USE_PTHREADS_INIT})
+aom_check_source_compiles("unistd_check" "#include <unistd.h>" HAVE_UNISTD_H)
+
+if(NOT MSVC)
+ aom_push_var(CMAKE_REQUIRED_LIBRARIES "m")
+ aom_check_c_compiles(
+ "fenv_check"
+ "#define _GNU_SOURCE
+ #include <fenv.h>
+ void unused(void) {
+ (void)unused;
+ (void)feenableexcept(FE_DIVBYZERO | FE_INVALID);
+ }"
+ HAVE_FEXCEPT)
+ aom_pop_var(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
+include("${AOM_ROOT}/build/cmake/cpu.cmake")
+
+if(ENABLE_CCACHE)
+ set_compiler_launcher(ENABLE_CCACHE ccache)
+endif()
+
+if(ENABLE_DISTCC)
+ set_compiler_launcher(ENABLE_DISTCC distcc)
+endif()
+
+if(ENABLE_GOMA)
+ set_compiler_launcher(ENABLE_GOMA gomacc)
+endif()
+
+if(NOT CONFIG_AV1_DECODER AND NOT CONFIG_AV1_ENCODER)
+ message(FATAL_ERROR "Decoder and encoder disabled, nothing to build.")
+endif()
+
+if(DECODE_HEIGHT_LIMIT OR DECODE_WIDTH_LIMIT)
+ change_config_and_warn(CONFIG_SIZE_LIMIT 1
+ "DECODE_HEIGHT_LIMIT and DECODE_WIDTH_LIMIT")
+endif()
+
+if(CONFIG_SIZE_LIMIT)
+ if(NOT DECODE_HEIGHT_LIMIT OR NOT DECODE_WIDTH_LIMIT)
+ message(FATAL_ERROR "When setting CONFIG_SIZE_LIMIT, DECODE_HEIGHT_LIMIT "
+ "and DECODE_WIDTH_LIMIT must be set.")
+ endif()
+endif()
+
+# Test compiler flags.
+if(MSVC)
+ add_compiler_flag_if_supported("/W3")
+
+ # Disable MSVC warnings that suggest making code non-portable.
+ add_compiler_flag_if_supported("/wd4996")
+ if(ENABLE_WERROR)
+ add_compiler_flag_if_supported("/WX")
+ endif()
+else()
+ require_c_flag("-std=c99" YES)
+ add_compiler_flag_if_supported("-Wall")
+ add_compiler_flag_if_supported("-Wdisabled-optimization")
+ add_compiler_flag_if_supported("-Wextra")
+ add_compiler_flag_if_supported("-Wfloat-conversion")
+ add_compiler_flag_if_supported("-Wimplicit-function-declaration")
+ add_compiler_flag_if_supported("-Wlogical-op")
+ add_compiler_flag_if_supported("-Wpointer-arith")
+ add_compiler_flag_if_supported("-Wsign-compare")
+ add_compiler_flag_if_supported("-Wstring-conversion")
+ add_compiler_flag_if_supported("-Wtype-limits")
+ add_compiler_flag_if_supported("-Wuninitialized")
+ add_compiler_flag_if_supported("-Wunused")
+ add_compiler_flag_if_supported("-Wvla")
+
+ add_c_flag_if_supported("-Wstack-usage=100000")
+ add_cxx_flag_if_supported("-Wstack-usage=360000")
+
+ # TODO(jzern): this could be added as a cxx flag for test/*.cc only, avoiding
+ # third_party.
+ add_c_flag_if_supported("-Wshorten-64-to-32")
+
+ # Add -Wshadow only for C files to avoid massive gtest warning spam.
+ add_c_flag_if_supported("-Wshadow")
+
+ # Add -Wundef only for C files to avoid massive gtest warning spam.
+ add_c_flag_if_supported("-Wundef")
+
+ # Quiet gcc 6 vs 7 abi warnings:
+ # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728
+ if("${AOM_TARGET_CPU}" MATCHES "arm")
+ add_cxx_flag_if_supported("-Wno-psabi")
+ endif()
+
+ if(ENABLE_WERROR)
+ add_compiler_flag_if_supported("-Werror")
+ endif()
+
+ if("${CMAKE_BUILD_TYPE}" MATCHES "Rel")
+ add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0")
+ endif()
+ add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE")
+ add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64")
+endif()
+
+set(AOM_LIB_LINK_TYPE PUBLIC)
+if(EMSCRIPTEN)
+
+ # Avoid CMake generation time errors resulting from collisions with the form
+ # of target_link_libraries() used by Emscripten.cmake.
+ unset(AOM_LIB_LINK_TYPE)
+endif()
+
+# Generate aom_config templates.
+set(aom_config_asm_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
+set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake")
+execute_process(COMMAND
+ ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT} -P
+ "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake")
+
+# Generate aom_config.{asm,h}.
+configure_file("${aom_config_asm_template}"
+ "${AOM_CONFIG_DIR}/config/aom_config.asm")
+configure_file("${aom_config_h_template}"
+ "${AOM_CONFIG_DIR}/config/aom_config.h")
+
+# Read the current git hash.
+find_package(Git)
+if(NOT GIT_FOUND)
+ message("--- Git missing, version will be read from CHANGELOG.")
+endif()
+
+configure_file("${AOM_ROOT}/build/cmake/aom_config.c.template"
+ "${AOM_CONFIG_DIR}/config/aom_config.c")
+
+# Find Perl and generate the RTCD sources.
+find_package(Perl)
+if(NOT PERL_FOUND)
+ message(FATAL_ERROR "Perl is required to build libaom.")
+endif()
+
+set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl"
+ "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl")
+set(AOM_RTCD_HEADER_FILE_LIST "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h"
+ "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h"
+ "${AOM_CONFIG_DIR}/config/av1_rtcd.h")
+set(AOM_RTCD_SOURCE_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+ "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+ "${AOM_ROOT}/av1/common/av1_rtcd.c")
+set(AOM_RTCD_SYMBOL_LIST aom_dsp_rtcd aom_scale_rtcd av1_rtcd)
+list(LENGTH AOM_RTCD_SYMBOL_LIST AOM_RTCD_CUSTOM_COMMAND_COUNT)
+math(EXPR AOM_RTCD_CUSTOM_COMMAND_COUNT "${AOM_RTCD_CUSTOM_COMMAND_COUNT} - 1")
+
+foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT})
+ list(GET AOM_RTCD_CONFIG_FILE_LIST ${NUM} AOM_RTCD_CONFIG_FILE)
+ list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE)
+ list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE)
+ list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL)
+ execute_process(COMMAND ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl"
+ --arch=${AOM_TARGET_CPU}
+ --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS}
+ --config=${AOM_CONFIG_DIR}/config/aom_config.h
+ ${AOM_RTCD_CONFIG_FILE}
+ OUTPUT_FILE ${AOM_RTCD_HEADER_FILE})
+endforeach()
+
+# Generate aom_version.h.
+execute_process(COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DGIT_EXECUTABLE=${GIT_EXECUTABLE}
+ -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P
+ "${AOM_ROOT}/build/cmake/version.cmake")
+
+if(NOT MSVC) # Generate aom.pc (pkg-config file).
+ execute_process(COMMAND ${CMAKE_COMMAND} -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR}
+ -DAOM_ROOT=${AOM_ROOT}
+ -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+ -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME}
+ -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD}
+ -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P
+ "${AOM_ROOT}/build/cmake/pkg_config.cmake")
+endif()
diff --git a/third_party/aom/build/cmake/aom_experiment_deps.cmake b/third_party/aom/build/cmake/aom_experiment_deps.cmake
new file mode 100644
index 000000000..0688704e5
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_experiment_deps.cmake
@@ -0,0 +1,32 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_
+set(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ 1)
+
+# Adjusts CONFIG_* CMake variables to address conflicts between active AV1
+# experiments.
+macro(fix_experiment_configs)
+
+ if(CONFIG_ANALYZER)
+ change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER)
+ endif()
+
+ if(CONFIG_RD_DEBUG)
+ change_config_and_warn(CONFIG_RD_DEBUG 0 CONFIG_JNT_COMP)
+ endif()
+
+ if(CONFIG_DIST_8X8 AND CONFIG_MULTITHREAD)
+ change_config_and_warn(CONFIG_DIST_8X8 0 CONFIG_MULTITHREAD)
+ endif()
+
+endmacro()
diff --git a/third_party/aom/build/cmake/aom_optimization.cmake b/third_party/aom/build/cmake/aom_optimization.cmake
new file mode 100644
index 000000000..be32a3212
--- /dev/null
+++ b/third_party/aom/build/cmake/aom_optimization.cmake
@@ -0,0 +1,212 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_
+set(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ 1)
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+# Translate $flag to one which MSVC understands, and write the new flag to the
+# variable named by $translated_flag (or unset it, when MSVC needs no flag).
+function(get_msvc_intrinsic_flag flag translated_flag)
+ if("${flag}" STREQUAL "-mavx")
+ set(${translated_flag} "/arch:AVX" PARENT_SCOPE)
+ elseif("${flag}" STREQUAL "-mavx2")
+ set(${translated_flag} "/arch:AVX2" PARENT_SCOPE)
+ else()
+
+ # MSVC does not need flags for intrinsics flavors other than AVX/AVX2.
+ unset(${translated_flag} PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Adds an object library target. Terminates generation if $flag is not supported
+# by the current compiler. $flag is the intrinsics flag required by the current
+# compiler, and is added to the compile flags for all sources in $sources.
+# $opt_name is used to name the target. $target_to_update is made dependent upon
+# the created target.
+#
+# Note: the libaom target is always updated because OBJECT libraries have rules
+# that disallow the direct addition of .o files to them as dependencies. Static
+# libraries do not have this limitation.
+function(add_intrinsics_object_library flag opt_name target_to_update sources
+ dependent_target)
+ if("${${sources}}" STREQUAL "")
+ return()
+ endif()
+ set(target_name ${target_to_update}_${opt_name}_intrinsics)
+ add_library(${target_name} OBJECT ${${sources}})
+
+ if(MSVC)
+ get_msvc_intrinsic_flag(${flag} "flag")
+ endif()
+
+ if(flag)
+ separate_arguments(flag)
+ target_compile_options(${target_name} PUBLIC ${flag})
+ endif()
+
+ target_sources(${dependent_target} PRIVATE $<TARGET_OBJECTS:${target_name}>)
+
+ # Add the new lib target to the global list of aom library targets.
+ list(APPEND AOM_LIB_TARGETS ${target_name})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
+
+# Adds sources in list named by $sources to $target and adds $flag to the
+# compile flags for each source file.
+function(add_intrinsics_source_to_target flag target sources)
+ target_sources(${target} PRIVATE ${${sources}})
+ if(MSVC)
+ get_msvc_intrinsic_flag(${flag} "flag")
+ endif()
+ if(flag)
+ foreach(source ${${sources}})
+ set_property(SOURCE ${source} APPEND PROPERTY COMPILE_FLAGS ${flag})
+ endforeach()
+ endif()
+endfunction()
+
+# Writes object format for the current target to the var named by $out_format,
+# or terminates the build when the object format for the current target is
+# unknown.
+function(get_asm_obj_format out_format)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ set(objformat "macho64")
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+ set(objformat "elf64")
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}"
+ STREQUAL "Windows")
+ set(objformat "win64")
+ else()
+ message(FATAL_ERROR "Unknown obj format: ${AOM_TARGET_SYSTEM}")
+ endif()
+ elseif("${AOM_TARGET_CPU}" STREQUAL "x86")
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ set(objformat "macho32")
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+ set(objformat "elf32")
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" OR "${AOM_TARGET_SYSTEM}"
+ STREQUAL "Windows")
+ set(objformat "win32")
+ else()
+ message(FATAL_ERROR "Unknown obj format: ${AOM_TARGET_SYSTEM}")
+ endif()
+ else()
+ message(FATAL_ERROR
+ "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}")
+ endif()
+
+ set(${out_format} ${objformat} PARENT_SCOPE)
+endfunction()
+
+# Adds library target named $lib_name for ASM files in variable named by
+# $asm_sources. Builds an output directory path from $lib_name. Links $lib_name
+# into $dependent_target. Generates a dummy C file with a dummy function to
+# ensure that all cmake generators can determine the linker language, and that
+# build tools don't complain that an object exposes no symbols.
+function(add_asm_library lib_name asm_sources dependent_target)
+ if("${${asm_sources}}" STREQUAL "")
+ return()
+ endif()
+ set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${lib_name}")
+ if(NOT EXISTS "${asm_lib_obj_dir}")
+ file(MAKE_DIRECTORY "${asm_lib_obj_dir}")
+ endif()
+
+ # TODO(tomfinegan): If cmake ever allows addition of .o files to OBJECT lib
+ # targets, make this OBJECT instead of STATIC to hide the target from
+ # consumers of the AOM cmake build.
+ add_library(${lib_name} STATIC ${${asm_sources}})
+
+ foreach(asm_source ${${asm_sources}})
+ get_filename_component(asm_source_name "${asm_source}" NAME)
+ set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o")
+ add_custom_command(OUTPUT "${asm_object}"
+ COMMAND ${AS_EXECUTABLE} ARGS ${AOM_AS_FLAGS}
+ -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o
+ "${asm_object}" "${asm_source}"
+ DEPENDS "${asm_source}"
+ COMMENT "Building ASM object ${asm_object}"
+ WORKING_DIRECTORY "${AOM_CONFIG_DIR}" VERBATIM)
+ target_sources(aom PRIVATE "${asm_object}")
+ endforeach()
+
+ # The above created a target containing only ASM sources. Cmake needs help
+ # here to determine the linker language. Add a dummy C file to force the
+ # linker language to C. We don't bother with setting the LINKER_LANGUAGE
+ # property on the library target because not all generators obey it (looking
+ # at you, xcode generator).
+ add_dummy_source_file_to_target("${lib_name}" "c")
+
+ # Add the new lib target to the global list of aom library targets.
+ list(APPEND AOM_LIB_TARGETS ${lib_name})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
+
+# Terminates generation if the nasm found in PATH does not meet requirements.
+# Currently checks only for presence of required object formats and support for
+# the -Ox argument (multipass optimization).
+function(test_nasm)
+ execute_process(COMMAND ${AS_EXECUTABLE} -hf OUTPUT_VARIABLE nasm_helptext)
+
+ if(NOT "${nasm_helptext}" MATCHES "-Ox")
+ message(FATAL_ERROR
+ "Unsupported nasm: multipass optimization not supported.")
+ endif()
+
+ if("${AOM_TARGET_CPU}" STREQUAL "x86")
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ if(NOT "${nasm_helptext}" MATCHES "macho32")
+ message(FATAL_ERROR
+ "Unsupported nasm: macho32 object format not supported.")
+ endif()
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+ if(NOT "${nasm_helptext}" MATCHES "elf32")
+ message(FATAL_ERROR
+ "Unsupported nasm: elf32 object format not supported.")
+ endif()
+ endif()
+ else()
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ if(NOT "${nasm_helptext}" MATCHES "macho64")
+ message(FATAL_ERROR
+ "Unsupported nasm: macho64 object format not supported.")
+ endif()
+ elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Linux")
+ if(NOT "${nasm_helptext}" MATCHES "elf64")
+ message(FATAL_ERROR
+ "Unsupported nasm: elf64 object format not supported.")
+ endif()
+ endif()
+ endif()
+endfunction()
+
+# Adds build command for generation of rtcd C source files using
+# build/cmake/rtcd.pl. $config is the input perl file, $output is the output C
+# include file, $source is the C source file, and $symbol is used for the symbol
+# argument passed to rtcd.pl.
+function(add_rtcd_build_step config output source symbol)
+ add_custom_command(OUTPUT ${output}
+ COMMAND ${PERL_EXECUTABLE} ARGS
+ "${AOM_ROOT}/build/cmake/rtcd.pl"
+ --arch=${AOM_TARGET_CPU}
+ --sym=${symbol} ${AOM_RTCD_FLAGS}
+ --config=${AOM_CONFIG_DIR}/config/aom_config.h
+ ${config} > ${output}
+ DEPENDS ${config}
+ COMMENT "Generating ${output}"
+ WORKING_DIRECTORY ${AOM_CONFIG_DIR} VERBATIM)
+ set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output})
+ set_property(SOURCE ${output} PROPERTY GENERATED)
+endfunction()
diff --git a/third_party/aom/build/cmake/compiler_flags.cmake b/third_party/aom/build/cmake/compiler_flags.cmake
new file mode 100644
index 000000000..79192c1fa
--- /dev/null
+++ b/third_party/aom/build/cmake/compiler_flags.cmake
@@ -0,0 +1,373 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_
+set(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ 1)
+
+include(CheckCCompilerFlag)
+include(CheckCXXCompilerFlag)
+include("${AOM_ROOT}/build/cmake/compiler_tests.cmake")
+
+# Strings used to cache flags.
+set(AOM_C_FLAGS)
+set(AOM_CXX_FLAGS)
+set(AOM_EXE_LINKER_FLAGS)
+set(AOM_FAILED_C_FLAGS)
+set(AOM_FAILED_CXX_FLAGS)
+
+# Sets variable named by $out_is_present to YES in the caller's scope when $flag
+# is found in the string variable named by $flag_cache. Sets the var to NO
+# otherwise.
+function(is_flag_present flag_cache flag out_is_present)
+ string(FIND "${${flag_cache}}" "${flag}" flag_pos)
+ if(${flag_pos} EQUAL -1)
+ set(${out_is_present} NO PARENT_SCOPE)
+ else()
+ set(${out_is_present} YES PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Appends $flag to $flags. Ignores scope via use of FORCE with set() call.
+function(append_flag flags flag)
+ string(FIND "${${flags}}" "${flag}" found)
+ if(${found} EQUAL -1)
+ set(${flags} "${${flags}} ${flag}" CACHE STRING "" FORCE)
+ endif()
+endfunction()
+
+# Checks C compiler for support of $c_flag. Adds $c_flag to all
+# $CMAKE_C_FLAGS_<CONFIG>s stored in AOM_C_CONFIGS when the compile test passes.
+# Caches $c_flag in $AOM_C_FLAGS or $AOM_FAILED_C_FLAGS depending on test
+# outcome.
+function(add_c_flag_if_supported c_flag)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok)
+ is_flag_present(AOM_FAILED_C_FLAGS "${c_flag}" flag_failed)
+ if(${flag_ok} OR ${flag_failed})
+ return()
+ endif()
+
+ unset(C_FLAG_SUPPORTED CACHE)
+ message("Checking C compiler flag support for: " ${c_flag})
+ check_c_compiler_flag("${c_flag}" C_FLAG_SUPPORTED)
+
+ if(${C_FLAG_SUPPORTED})
+ append_flag(AOM_C_FLAGS "${c_flag}")
+ foreach(config ${AOM_C_CONFIGS})
+ unset(C_FLAG_FOUND)
+ append_flag("${config}" "${c_flag}")
+ endforeach()
+ else()
+ append_flag(AOM_FAILED_C_FLAGS "${c_flag}")
+ endif()
+endfunction()
+
+# Checks C++ compiler for support of $cxx_flag. Adds $cxx_flag to all
+# $CMAKE_CXX_FLAGS_<CONFIG>s stored in AOM_CXX_CONFIGS when the compile test
+# passes. Caches $cxx_flag in $AOM_CXX_FLAGS or $AOM_FAILED_CXX_FLAGS depending
+# on test outcome.
+function(add_cxx_flag_if_supported cxx_flag)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok)
+ is_flag_present(AOM_FAILED_CXX_FLAGS "${cxx_flag}" flag_failed)
+ if(${flag_ok} OR ${flag_failed})
+ return()
+ endif()
+
+ unset(CXX_FLAG_SUPPORTED CACHE)
+ message("Checking C++ compiler flag support for: " ${cxx_flag})
+ check_cxx_compiler_flag("${cxx_flag}" CXX_FLAG_SUPPORTED)
+
+ if(${CXX_FLAG_SUPPORTED})
+ append_flag(AOM_CXX_FLAGS "${cxx_flag}")
+ foreach(config ${AOM_CXX_CONFIGS})
+ unset(CXX_FLAG_FOUND)
+ append_flag("${config}" "${cxx_flag}")
+ endforeach()
+ else()
+ append_flag(AOM_FAILED_CXX_FLAGS "${cxx_flag}")
+ endif()
+endfunction()
+
+# Convenience method for adding a flag to both the C and C++ compiler command
+# lines.
+function(add_compiler_flag_if_supported flag)
+ add_c_flag_if_supported(${flag})
+ add_cxx_flag_if_supported(${flag})
+endfunction()
+
+# Checks C compiler for support of $c_flag and terminates generation when
+# support is not present.
+function(require_c_flag c_flag update_c_flags)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok)
+ if(${flag_ok})
+ return()
+ endif()
+
+ if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
+ aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}")
+ endif()
+
+ unset(HAVE_C_FLAG CACHE)
+ message("Checking C compiler flag support for: " ${c_flag})
+ check_c_compiler_flag("${c_flag}" HAVE_C_FLAG)
+ if(NOT HAVE_C_FLAG)
+ message(FATAL_ERROR
+ "${PROJECT_NAME} requires support for C flag: ${c_flag}.")
+ endif()
+
+ if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
+ aom_pop_var(CMAKE_EXE_LINKER_FLAGS)
+ endif()
+
+ append_flag(AOM_C_FLAGS "${c_flag}")
+ if(update_c_flags)
+ foreach(config ${AOM_C_CONFIGS})
+ set(${config} "${${config}} ${c_flag}" CACHE STRING "" FORCE)
+ endforeach()
+ endif()
+endfunction()
+
+# Checks CXX compiler for support of $cxx_flag and terminates generation when
+# support is not present.
+function(require_cxx_flag cxx_flag update_cxx_flags)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok)
+ if(${flag_ok})
+ return()
+ endif()
+
+ if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
+ aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}")
+ endif()
+
+ unset(HAVE_CXX_FLAG CACHE)
+ message("Checking C compiler flag support for: " ${cxx_flag})
+ check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG)
+ if(NOT HAVE_CXX_FLAG)
+ message(FATAL_ERROR
+ "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.")
+ endif()
+
+ if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "")
+ aom_pop_var(CMAKE_EXE_LINKER_FLAGS)
+ endif()
+
+ append_flag(AOM_CXX_FLAGS "${cxx_flag}")
+ if(update_cxx_flags)
+ foreach(config ${AOM_CXX_CONFIGS})
+ set(${config} "${${config}} ${cxx_flag}" CACHE STRING "" FORCE)
+ endforeach()
+ endif()
+endfunction()
+
+# Checks for support of $flag by both the C and CXX compilers. Terminates
+# generation when support is not present in both compilers.
+function(require_compiler_flag flag update_cmake_flags)
+ require_c_flag(${flag} ${update_cmake_flags})
+ require_cxx_flag(${flag} ${update_cmake_flags})
+endfunction()
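+
+# Usage sketch (illustrative; the flag shown is an assumption, not taken from
+# this file):
+#
+#   require_compiler_flag("-fPIC" YES)
+#
+# The second argument controls whether the flag is also written back into the
+# per-config CMAKE_<LANG>_FLAGS_<CONFIG> cache entries.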
+
+# Checks only non-MSVC targets for support of $c_flag and terminates generation
+# when support is not present.
+function(require_c_flag_nomsvc c_flag update_c_flags)
+ if(NOT MSVC)
+ require_c_flag(${c_flag} ${update_c_flags})
+ endif()
+endfunction()
+
+# Checks only non-MSVC targets for support of $cxx_flag and terminates
+# generation when support is not present.
+function(require_cxx_flag_nomsvc cxx_flag update_cxx_flags)
+ if(NOT MSVC)
+ require_cxx_flag(${cxx_flag} ${update_cxx_flags})
+ endif()
+endfunction()
+
+# Checks only non-MSVC targets for support of $flag by both the C and CXX
+# compilers. Terminates generation when support is not present in both
+# compilers.
+function(require_compiler_flag_nomsvc flag update_cmake_flags)
+ require_c_flag_nomsvc(${flag} ${update_cmake_flags})
+ require_cxx_flag_nomsvc(${flag} ${update_cmake_flags})
+endfunction()
+
+# Adds $preproc_def to C compiler command line (as -D$preproc_def) if not
+# already present.
+function(add_c_preproc_definition preproc_def)
+ set(preproc_def "-D${preproc_def}")
+ is_flag_present(AOM_C_FLAGS "${preproc_def}" flag_cached)
+ if(${flag_cached})
+ return()
+ endif()
+
+ foreach(config ${AOM_C_CONFIGS})
+ set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE)
+ endforeach()
+endfunction()
+
+# Adds $preproc_def to CXX compiler command line (as -D$preproc_def) if not
+# already present.
+function(add_cxx_preproc_definition preproc_def)
+ set(preproc_def "-D${preproc_def}")
+ is_flag_present(AOM_CXX_FLAGS "${preproc_def}" flag_cached)
+ if(${flag_cached})
+ return()
+ endif()
+
+ foreach(config ${AOM_CXX_CONFIGS})
+ set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE)
+ endforeach()
+endfunction()
+
+# Adds $preproc_def to C and CXX compiler command line (as -D$preproc_def) if
+# not already present.
+function(add_preproc_definition preproc_def)
+ add_c_preproc_definition(${preproc_def})
+ add_cxx_preproc_definition(${preproc_def})
+endfunction()
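+
+# Usage sketch (illustrative; the symbol is an arbitrary example). Definitions
+# are passed without the leading -D, which the helpers above prepend:
+#
+#   add_preproc_definition(_LARGEFILE_SOURCE)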
+
+# Adds $flag to assembler command line.
+function(append_as_flag flag)
+ is_flag_present(AOM_AS_FLAGS "${flag}" flag_cached)
+ if(${flag_cached})
+ return()
+ endif()
+ append_flag(AOM_AS_FLAGS "${flag}")
+endfunction()
+
+# Adds $flag to the C compiler command line.
+function(append_c_flag flag)
+ is_flag_present(AOM_C_FLAGS "${flag}" flag_cached)
+ if(${flag_cached})
+ return()
+ endif()
+
+ foreach(config ${AOM_C_CONFIGS})
+ append_flag(${config} "${flag}")
+ endforeach()
+endfunction()
+
+# Adds $flag to the CXX compiler command line.
+function(append_cxx_flag flag)
+ is_flag_present(AOM_CXX_FLAGS "${flag}" flag_cached)
+ if(${flag_cached})
+ return()
+ endif()
+
+ foreach(config ${AOM_CXX_CONFIGS})
+ append_flag(${config} "${flag}")
+ endforeach()
+endfunction()
+
+# Adds $flag to the C and CXX compiler command lines.
+function(append_compiler_flag flag)
+ append_c_flag(${flag})
+ append_cxx_flag(${flag})
+endfunction()
+
+# Adds $flag to the executable linker command line when not present.
+function(append_exe_linker_flag flag)
+ is_flag_present(AOM_EXE_LINKER_FLAGS "${flag}" flag_cached)
+ if(${flag_cached})
+ return()
+ endif()
+
+ append_flag(AOM_EXE_LINKER_FLAGS "${flag}")
+ foreach(config ${AOM_EXE_LINKER_CONFIGS})
+ append_flag(${config} "${flag}")
+ endforeach()
+endfunction()
+
+# Adds $flag to the link flags for $target.
+function(append_link_flag_to_target target flag)
+ unset(target_link_flags)
+ get_target_property(target_link_flags ${target} LINK_FLAGS)
+
+ if(target_link_flags)
+ is_flag_present(target_link_flags "${flag}" flag_found)
+ if(${flag_found})
+ return()
+ endif()
+ set(target_link_flags "${target_link_flags} ${flag}")
+ else()
+ set(target_link_flags "${flag}")
+ endif()
+
+ set_target_properties(${target} PROPERTIES LINK_FLAGS ${target_link_flags})
+endfunction()
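+
+# Usage sketch (illustrative; target and flag are arbitrary examples):
+#
+#   append_link_flag_to_target("aomenc" "-static")
+#
+# The flag is merged into the target's LINK_FLAGS property and skipped when it
+# is already present.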
+
+# Adds $flag to executable linker flags, and makes sure C/CXX builds still work.
+function(require_linker_flag flag)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ append_exe_linker_flag(${flag})
+
+ unset(c_passed)
+ aom_check_c_compiles("LINKER_FLAG_C_TEST(${flag})" "" c_passed)
+ unset(cxx_passed)
+ aom_check_cxx_compiles("LINKER_FLAG_CXX_TEST(${flag})" "" cxx_passed)
+
+ if(NOT c_passed OR NOT cxx_passed)
+ message(FATAL_ERROR "Linker flag test for ${flag} failed.")
+ endif()
+endfunction()
+
+# Appends flags in $AOM_EXTRA_<TYPE>_FLAGS variables to the flags used at build
+# time.
+function(set_user_flags)
+
+ # Linker flags are handled first because some C/CXX flags require that a
+ # linker flag is present at link time.
+ if(AOM_EXTRA_EXE_LINKER_FLAGS)
+ is_flag_present(AOM_EXE_LINKER_FLAGS "${AOM_EXTRA_EXE_LINKER_FLAGS}"
+ extra_present)
+ if(NOT ${extra_present})
+ require_linker_flag("${AOM_EXTRA_EXE_LINKER_FLAGS}")
+ endif()
+ endif()
+ if(AOM_EXTRA_AS_FLAGS)
+
+ # TODO(tomfinegan): assembler flag testing would be a good thing to have.
+ is_flag_present(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}" extra_present)
+ if(NOT ${extra_present})
+ append_flag(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}")
+ endif()
+ endif()
+ if(AOM_EXTRA_C_FLAGS)
+ is_flag_present(AOM_C_FLAGS "${AOM_EXTRA_C_FLAGS}" extra_present)
+ if(NOT ${extra_present})
+ require_c_flag("${AOM_EXTRA_C_FLAGS}" YES)
+ endif()
+ endif()
+ if(AOM_EXTRA_CXX_FLAGS)
+ is_flag_present(AOM_CXX_FLAGS "${AOM_EXTRA_CXX_FLAGS}" extra_present)
+ if(NOT ${extra_present})
+ require_cxx_flag("${AOM_EXTRA_CXX_FLAGS}" YES)
+ endif()
+ endif()
+endfunction()
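+
+# Usage sketch (illustrative): the AOM_EXTRA_*_FLAGS variables consumed above
+# are expected to come from the user's configure invocation, e.g.
+#
+#   cmake path/to/aom -DAOM_EXTRA_C_FLAGS="-fno-strict-aliasing"
+#
+# Extra C/C++ flags are routed through require_*_flag(), so generation stops
+# when the compiler rejects them.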
diff --git a/third_party/aom/build/cmake/compiler_tests.cmake b/third_party/aom/build/cmake/compiler_tests.cmake
new file mode 100644
index 000000000..f115610ba
--- /dev/null
+++ b/third_party/aom/build/cmake/compiler_tests.cmake
@@ -0,0 +1,175 @@
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_
+set(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ 1)
+
+include(CheckCSourceCompiles)
+include(CheckCXXSourceCompiles)
+
+# CMake passes command line flags like this:
+#
+# * $compiler $lang_flags $lang_flags_config ...
+#
+# To ensure the flags tested here and elsewhere are obeyed a list of active
+# build configuration types is built, and flags are applied to the flag strings
+# for each configuration currently active for C and CXX builds as determined by
+# reading $CMAKE_CONFIGURATION_TYPES and $CMAKE_BUILD_TYPE. When
+# $CMAKE_CONFIGURATION_TYPES is non-empty a multi-configuration generator is in
+# use: currently this includes MSVC and Xcode. For other generators
+# $CMAKE_BUILD_TYPE is used. For both cases AOM_<LANG>_CONFIGS is populated with
+# CMake string variable names that contain flags for the currently available
+# configuration(s).
+unset(AOM_C_CONFIGS)
+unset(AOM_CXX_CONFIGS)
+list(LENGTH CMAKE_CONFIGURATION_TYPES num_configs)
+if(${num_configs} GREATER 0)
+ foreach(config ${CMAKE_CONFIGURATION_TYPES})
+ string(TOUPPER ${config} config)
+ list(APPEND AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}")
+ list(APPEND AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}")
+ list(APPEND AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}")
+ endforeach()
+else()
+ string(TOUPPER ${CMAKE_BUILD_TYPE} config)
+ set(AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}")
+ set(AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}")
+ set(AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}")
+endif()
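+
+# For example, with a single-configuration generator and
+# CMAKE_BUILD_TYPE=Release, AOM_C_CONFIGS holds just "CMAKE_C_FLAGS_RELEASE";
+# with a multi-configuration generator (MSVC, Xcode) it holds one
+# CMAKE_C_FLAGS_<CONFIG> entry per entry in CMAKE_CONFIGURATION_TYPES.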
+
+# The basic main() function used in all compile tests.
+set(AOM_C_MAIN "\nint main(void) { return 0; }")
+set(AOM_CXX_MAIN "\nint main() { return 0; }")
+
+# Strings containing the names of passed and failed tests.
+set(AOM_C_PASSED_TESTS)
+set(AOM_C_FAILED_TESTS)
+set(AOM_CXX_PASSED_TESTS)
+set(AOM_CXX_FAILED_TESTS)
+
+function(aom_push_var var new_value)
+ set(SAVED_${var} ${${var}} PARENT_SCOPE)
+ set(${var} "${${var}} ${new_value}" PARENT_SCOPE)
+endfunction()
+
+function(aom_pop_var var)
+  set(${var} ${SAVED_${var}} PARENT_SCOPE)
+ unset(SAVED_${var} PARENT_SCOPE)
+endfunction()
+
+# Confirms $test_source compiles and stores $test_name in one of
+# $AOM_C_PASSED_TESTS or $AOM_C_FAILED_TESTS depending on outcome. When the
+# test passes $result_var is set to 1. When it fails $result_var is unset. The
+# test is not run if the test name is found in either of the passed or failed
+# test variables.
+function(aom_check_c_compiles test_name test_source result_var)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ unset(C_TEST_PASSED CACHE)
+ unset(C_TEST_FAILED CACHE)
+ string(FIND "${AOM_C_PASSED_TESTS}" "${test_name}" C_TEST_PASSED)
+ string(FIND "${AOM_C_FAILED_TESTS}" "${test_name}" C_TEST_FAILED)
+ if(${C_TEST_PASSED} EQUAL -1 AND ${C_TEST_FAILED} EQUAL -1)
+ unset(C_TEST_COMPILED CACHE)
+ message("Running C compiler test: ${test_name}")
+ check_c_source_compiles("${test_source} ${AOM_C_MAIN}" C_TEST_COMPILED)
+ set(${result_var} ${C_TEST_COMPILED} PARENT_SCOPE)
+
+ if(C_TEST_COMPILED)
+ set(AOM_C_PASSED_TESTS "${AOM_C_PASSED_TESTS} ${test_name}"
+ CACHE STRING "" FORCE)
+ else()
+ set(AOM_C_FAILED_TESTS "${AOM_C_FAILED_TESTS} ${test_name}"
+ CACHE STRING "" FORCE)
+ message("C Compiler test ${test_name} failed.")
+ endif()
+ elseif(NOT ${C_TEST_PASSED} EQUAL -1)
+ set(${result_var} 1 PARENT_SCOPE)
+ else() # ${C_TEST_FAILED} NOT EQUAL -1
+ unset(${result_var} PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Confirms $test_source compiles and stores $test_name in one of
+# $AOM_CXX_PASSED_TESTS or $AOM_CXX_FAILED_TESTS depending on outcome. When the
+# test passes $result_var is set to 1. When it fails $result_var is unset. The
+# test is not run if the test name is found in either of the passed or failed
+# test variables.
+function(aom_check_cxx_compiles test_name test_source result_var)
+ if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS)
+ return()
+ endif()
+
+ unset(CXX_TEST_PASSED CACHE)
+ unset(CXX_TEST_FAILED CACHE)
+ string(FIND "${AOM_CXX_PASSED_TESTS}" "${test_name}" CXX_TEST_PASSED)
+ string(FIND "${AOM_CXX_FAILED_TESTS}" "${test_name}" CXX_TEST_FAILED)
+ if(${CXX_TEST_PASSED} EQUAL -1 AND ${CXX_TEST_FAILED} EQUAL -1)
+ unset(CXX_TEST_COMPILED CACHE)
+ message("Running CXX compiler test: ${test_name}")
+ check_cxx_source_compiles("${test_source} ${AOM_CXX_MAIN}"
+ CXX_TEST_COMPILED)
+ set(${result_var} ${CXX_TEST_COMPILED} PARENT_SCOPE)
+
+ if(CXX_TEST_COMPILED)
+ set(AOM_CXX_PASSED_TESTS "${AOM_CXX_PASSED_TESTS} ${test_name}"
+ CACHE STRING "" FORCE)
+ else()
+ set(AOM_CXX_FAILED_TESTS "${AOM_CXX_FAILED_TESTS} ${test_name}"
+ CACHE STRING "" FORCE)
+ message("CXX Compiler test ${test_name} failed.")
+ endif()
+ elseif(NOT ${CXX_TEST_PASSED} EQUAL -1)
+ set(${result_var} 1 PARENT_SCOPE)
+ else() # ${CXX_TEST_FAILED} NOT EQUAL -1
+ unset(${result_var} PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Convenience function that confirms $test_source compiles as C and C++.
+# $result_var is set to 1 when both tests are successful, and 0 when one or both
+# tests fail. Note: This function is intended to be used to write to result
+# variables that are expanded via configure_file(). $result_var is set to 1 or 0
+# to allow direct usage of the value in generated source files.
+function(aom_check_source_compiles test_name test_source result_var)
+ unset(C_PASSED)
+ unset(CXX_PASSED)
+ aom_check_c_compiles(${test_name} ${test_source} C_PASSED)
+ aom_check_cxx_compiles(${test_name} ${test_source} CXX_PASSED)
+ if(C_PASSED AND CXX_PASSED)
+ set(${result_var} 1 PARENT_SCOPE)
+ else()
+ set(${result_var} 0 PARENT_SCOPE)
+ endif()
+endfunction()
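+
+# Usage sketch (illustrative; the header check is an arbitrary example):
+#
+#   aom_check_source_compiles("unistd_check" "#include <unistd.h>"
+#                             HAVE_UNISTD_H)
+#
+# HAVE_UNISTD_H ends up as 1 when the snippet compiles as both C and C++, and
+# 0 otherwise.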
+
+# When inline support is detected for the current compiler the supported
+# inlining keyword is written to $result in caller scope.
+function(aom_get_inline result)
+ aom_check_source_compiles("inline_check_1"
+ "static inline void function(void) {}"
+ HAVE_INLINE_1)
+ if(HAVE_INLINE_1 EQUAL 1)
+ set(${result} "inline" PARENT_SCOPE)
+ return()
+ endif()
+
+ # Check __inline.
+ aom_check_source_compiles("inline_check_2"
+ "static __inline void function(void) {}"
+ HAVE_INLINE_2)
+ if(HAVE_INLINE_2 EQUAL 1)
+ set(${result} "__inline" PARENT_SCOPE)
+ endif()
+endfunction()
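+
+# Usage sketch (illustrative):
+#
+#   aom_get_inline(INLINE)
+#
+# ${INLINE} receives "inline" or "__inline"; it is left unset when neither
+# keyword compiles.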
diff --git a/third_party/aom/build/cmake/cpu.cmake b/third_party/aom/build/cmake/cpu.cmake
new file mode 100644
index 000000000..6e8089e63
--- /dev/null
+++ b/third_party/aom/build/cmake/cpu.cmake
@@ -0,0 +1,93 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+if("${AOM_TARGET_CPU}" STREQUAL "arm64")
+ set(ARCH_ARM 1)
+ set(RTCD_ARCH_ARM "yes")
+
+ if(ENABLE_NEON)
+ set(HAVE_NEON 1)
+ set(RTCD_HAVE_NEON "yes")
+ else()
+ set(HAVE_NEON 0)
+ set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
+ endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "^armv7")
+ set(ARCH_ARM 1)
+ set(RTCD_ARCH_ARM "yes")
+
+ if(ENABLE_NEON)
+ set(HAVE_NEON 1)
+ set(RTCD_HAVE_NEON "yes")
+ else()
+ set(HAVE_NEON 0)
+ set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
+ endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "^mips")
+ set(ARCH_MIPS 1)
+ set(RTCD_ARCH_MIPS "yes")
+
+ if("${AOM_TARGET_CPU}" STREQUAL "mips32")
+ set(HAVE_MIPS32 1)
+ set(RTCD_HAVE_MIPS32 "yes")
+ elseif("${AOM_TARGET_CPU}" STREQUAL "mips64")
+ set(HAVE_MIPS64 1)
+ set(RTCD_HAVE_MIPS64 "yes")
+ endif()
+
+ # HAVE_DSPR2 is set by mips toolchain files.
+ if(ENABLE_DSPR2 AND HAVE_DSPR2)
+ set(RTCD_HAVE_DSPR2 "yes")
+ else()
+ set(HAVE_DSPR2 0)
+ set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-dspr2)
+ endif()
+
+ # HAVE_MSA is set by mips toolchain files.
+ if(ENABLE_MSA AND HAVE_MSA)
+ set(RTCD_HAVE_MSA "yes")
+ else()
+ set(HAVE_MSA 0)
+ set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-msa)
+ endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
+ set(ARCH_PPC 1)
+ set(RTCD_ARCH_PPC "yes")
+
+ if(ENABLE_VSX)
+ set(HAVE_VSX 1)
+ set(RTCD_HAVE_VSX "yes")
+ else()
+ set(HAVE_VSX 0)
+ set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-vsx)
+ endif()
+elseif("${AOM_TARGET_CPU}" MATCHES "^x86")
+ if("${AOM_TARGET_CPU}" STREQUAL "x86")
+ set(ARCH_X86 1)
+ set(RTCD_ARCH_X86 "yes")
+ elseif("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ set(ARCH_X86_64 1)
+ set(RTCD_ARCH_X86_64 "yes")
+ endif()
+
+ set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2")
+ foreach(flavor ${X86_FLAVORS})
+ if(ENABLE_${flavor} AND NOT disable_remaining_flavors)
+ set(HAVE_${flavor} 1)
+ set(RTCD_HAVE_${flavor} "yes")
+ else()
+ set(disable_remaining_flavors 1)
+ set(HAVE_${flavor} 0)
+ string(TOLOWER ${flavor} flavor)
+ set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor})
+ endif()
+ endforeach()
+endif()
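+
+# Note on the flavor loop above: the flavors are ordered from oldest to newest,
+# and disable_remaining_flavors latches once a flavor is disabled. For example,
+# configuring with -DENABLE_SSE3=0 also disables SSSE3 through AVX2.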
diff --git a/third_party/aom/build/cmake/dist.cmake b/third_party/aom/build/cmake/dist.cmake
new file mode 100644
index 000000000..6f81736f0
--- /dev/null
+++ b/third_party/aom/build/cmake/dist.cmake
@@ -0,0 +1,64 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+# Converts spaces in $in_string to semicolons and writes the output to
+# $out_string. In CMake's eyes this converts the input string to a list.
+function(listify_string in_string out_string)
+ string(REPLACE " " ";" ${out_string} ${in_string})
+ set(${out_string} "${${out_string}}" PARENT_SCOPE)
+endfunction()
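+
+# For example (illustrative), listify_string("a.h b.h" headers) leaves
+# ${headers} holding the CMake list "a.h;b.h".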
+
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR"
+ "AOM_DIST_INCLUDES" "AOM_DIST_LIBS" "ENABLE_DOCS")
+
+foreach(arg ${REQUIRED_ARGS})
+ if("${${arg}}" STREQUAL "")
+ message(FATAL_ERROR "${arg} must not be empty.")
+ endif()
+endforeach()
+
+if(ENABLE_DOCS)
+ file(INSTALL "${AOM_CONFIG_DIR}/docs" DESTINATION "${AOM_DIST_DIR}")
+endif()
+
+if(AOM_DIST_EXAMPLES)
+ listify_string("${AOM_DIST_EXAMPLES}" "AOM_DIST_EXAMPLES")
+ foreach(example ${AOM_DIST_EXAMPLES})
+ if(NOT "${example}" MATCHES "aomdec\|aomenc")
+ file(INSTALL "${example}" DESTINATION "${AOM_DIST_DIR}/bin/examples")
+ endif()
+ endforeach()
+endif()
+
+if(AOM_DIST_TOOLS)
+ listify_string("${AOM_DIST_TOOLS}" "AOM_DIST_TOOLS")
+ foreach(tool ${AOM_DIST_TOOLS})
+ file(INSTALL "${tool}" DESTINATION "${AOM_DIST_DIR}/bin/tools")
+ endforeach()
+endif()
+
+if(AOM_DIST_APPS)
+ listify_string("${AOM_DIST_APPS}" "AOM_DIST_APPS")
+ foreach(app ${AOM_DIST_APPS})
+ file(INSTALL "${app}" DESTINATION "${AOM_DIST_DIR}/bin")
+ endforeach()
+endif()
+
+listify_string("${AOM_DIST_INCLUDES}" "AOM_DIST_INCLUDES")
+foreach(inc ${AOM_DIST_INCLUDES})
+ file(INSTALL "${inc}" DESTINATION "${AOM_DIST_DIR}/include/aom")
+endforeach()
+
+listify_string("${AOM_DIST_LIBS}" "AOM_DIST_LIBS")
+foreach(lib ${AOM_DIST_LIBS})
+ file(INSTALL "${lib}" DESTINATION "${AOM_DIST_DIR}/lib")
+endforeach()
diff --git a/third_party/aom/build/cmake/exports.cmake b/third_party/aom/build/cmake/exports.cmake
new file mode 100644
index 000000000..e0813dc0f
--- /dev/null
+++ b/third_party/aom/build/cmake/exports.cmake
@@ -0,0 +1,65 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_EXPORTS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_EXPORTS_CMAKE_
+set(AOM_BUILD_CMAKE_EXPORTS_CMAKE_ 1)
+
+include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
+
+# Creates the custom target which handles generation of the symbol export lists.
+function(setup_exports_target)
+ if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ set(symbol_file_ext "syms")
+ elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND MSVC)
+ set(symbol_file_ext "def")
+ else()
+ set(symbol_file_ext "ver")
+ endif()
+
+ set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}")
+
+ add_custom_target(generate_exports
+ COMMAND ${CMAKE_COMMAND} -DAOM_ROOT="${AOM_ROOT}"
+ -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
+ -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
+ -DAOM_SYM_FILE="${aom_sym_file}" -DAOM_MSVC=${MSVC}
+ -DAOM_XCODE=${XCODE} -DCONFIG_NAME=$<CONFIG>
+ -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
+ -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
+ -DENABLE_TESTS=${ENABLE_TESTS} -P
+ "${AOM_ROOT}/build/cmake/generate_exports.cmake"
+ SOURCES ${AOM_EXPORTS_SOURCES}
+ DEPENDS ${AOM_EXPORTS_SOURCES})
+
+ # Make libaom depend on the exports file, and set flags to pick it up when
+ # creating the dylib.
+ add_dependencies(aom generate_exports)
+
+ if(APPLE)
+ set_property(TARGET aom APPEND_STRING
+ PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}")
+ elseif(WIN32)
+ if(NOT MSVC)
+ set_property(TARGET aom APPEND_STRING
+ PROPERTY LINK_FLAGS "-Wl,--version-script ${aom_sym_file}")
+ else()
+ set_property(TARGET aom APPEND_STRING
+ PROPERTY LINK_FLAGS "/DEF:${aom_sym_file}")
+ endif()
+
+ # TODO(tomfinegan): Sort out the import lib situation and flags for MSVC.
+
+ else()
+ set_property(TARGET aom APPEND_STRING
+ PROPERTY LINK_FLAGS "-Wl,--version-script,${aom_sym_file}")
+ endif()
+endfunction()
diff --git a/third_party/aom/build/cmake/exports_sources.cmake b/third_party/aom/build/cmake/exports_sources.cmake
new file mode 100644
index 000000000..576920e36
--- /dev/null
+++ b/third_party/aom/build/cmake/exports_sources.cmake
@@ -0,0 +1,32 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_
+set(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ 1)
+
+list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com"
+ "${AOM_ROOT}/av1/exports_com")
+
+if(CONFIG_AV1_DECODER)
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_dec"
+ "${AOM_ROOT}/av1/exports_dec")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_enc"
+ "${AOM_ROOT}/av1/exports_enc")
+endif()
+
+if(ENABLE_TESTS)
+ list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_test"
+ "${AOM_ROOT}/av1/exports_test")
+endif()
diff --git a/third_party/aom/build/cmake/generate_aom_config_templates.cmake b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
new file mode 100644
index 000000000..b91c036de
--- /dev/null
+++ b/third_party/aom/build/cmake/generate_aom_config_templates.cmake
@@ -0,0 +1,101 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+string(TIMESTAMP year "%Y")
+set(
+ asm_file_header_block
+ "\;
+\; Copyright (c) ${year}, Alliance for Open Media. All rights reserved
+\;
+\; This source code is subject to the terms of the BSD 2 Clause License and
+\; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+\; was not distributed with this source code in the LICENSE file, you can
+\; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+\; Media Patent License 1.0 was not distributed with this source code in the
+\; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+\;
+"
+ )
+set(
+ h_file_header_block
+ "/*
+ * Copyright (c) ${year}, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+\#ifndef AOM_CONFIG_H_
+\#define AOM_CONFIG_H_
+"
+ )
+set(
+ cmake_file_header_block
+ "##
+## Copyright (c) ${year}, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"
+ )
+
+# Terminates cmake execution when $var_name is an empty string, or the variable
+# name it contains does not expand to an existing directory.
+function(check_directory_var var_name)
+ if("${var_name}" STREQUAL "")
+ message(FATAL_ERROR "The CMake variable ${var_name} must be defined.")
+ endif()
+
+ if(NOT EXISTS "${${var_name}}")
+ message(FATAL_ERROR "${${var_name}} (${var_name}) missing.")
+ endif()
+endfunction()
+
+check_directory_var(AOM_CONFIG_DIR)
+check_directory_var(AOM_ROOT)
+
+set(AOM_DEFAULTS "${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
+if(NOT EXISTS "${AOM_DEFAULTS}")
+ message(FATAL_ERROR
+ "Configuration default values file (${AOM_DEFAULTS}) missing.")
+endif()
+
+include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake")
+list(APPEND aom_build_vars ${AOM_DETECT_VARS} ${AOM_CONFIG_VARS})
+list(SORT aom_build_vars)
+
+set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake")
+file(WRITE "${aom_config_h_template}" ${h_file_header_block})
+foreach(aom_var ${aom_build_vars})
+ if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS")
+ file(APPEND "${aom_config_h_template}"
+ "\#define ${aom_var} \${${aom_var}}\n")
+ endif()
+endforeach()
+file(APPEND "${aom_config_h_template}" "\#endif // AOM_CONFIG_H_")
+
+set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake")
+file(WRITE "${aom_asm_config_template}" ${asm_file_header_block})
+foreach(aom_var ${aom_build_vars})
+ if(NOT "${aom_var}" STREQUAL "INLINE" AND NOT "${aom_var}" STREQUAL
+ "AOM_RTCD_FLAGS")
+ file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n")
+ endif()
+endforeach()
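+
+# Each generated header template line has the form "#define <VAR> ${<VAR>}"
+# (or "<VAR> equ ${<VAR>}" in the assembly template); the ${} placeholders are
+# presumably filled in later when the templates are expanded, e.g. via
+# configure_file().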
diff --git a/third_party/aom/build/cmake/generate_exports.cmake b/third_party/aom/build/cmake/generate_exports.cmake
new file mode 100644
index 000000000..7ab5aaef8
--- /dev/null
+++ b/third_party/aom/build/cmake/generate_exports.cmake
@@ -0,0 +1,66 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM"
+ "AOM_SYM_FILE" "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER")
+
+foreach(arg ${REQUIRED_ARGS})
+ if("${${arg}}" STREQUAL "")
+ message(FATAL_ERROR "${arg} must not be empty.")
+ endif()
+endforeach()
+
+include("${AOM_ROOT}/build/cmake/exports_sources.cmake")
+
+if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ set(symbol_prefix "_")
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
+ file(WRITE "${AOM_SYM_FILE}" "LIBRARY aom\n" "EXPORTS\n")
+else()
+ set(symbol_suffix ";")
+endif()
+
+set(aom_sym_file "${AOM_SYM_FILE}")
+
+if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+ file(REMOVE "${aom_sym_file}")
+elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
+ file(WRITE "${aom_sym_file}" "LIBRARY aom\n" "EXPORTS\n")
+else()
+ file(WRITE "${aom_sym_file}" "{\nglobal:\n")
+endif()
+
+foreach(export_file ${AOM_EXPORTS_SOURCES})
+ file(STRINGS "${export_file}" exported_file_data)
+ set(exported_symbols "${exported_symbols} ${exported_file_data};")
+ string(STRIP "${exported_symbols}" exported_symbols)
+endforeach()
+
+foreach(exported_symbol ${exported_symbols})
+ string(STRIP "${exported_symbol}" exported_symbol)
+ if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC)
+ string(SUBSTRING ${exported_symbol} 0 4 export_type)
+ string(COMPARE EQUAL "${export_type}" "data" is_data)
+ if(is_data)
+ set(symbol_suffix " DATA")
+ else()
+ set(symbol_suffix "")
+ endif()
+ endif()
+ string(REGEX REPLACE "text \|data " "" "exported_symbol" "${exported_symbol}")
+ set(exported_symbol " ${symbol_prefix}${exported_symbol}${symbol_suffix}")
+ file(APPEND "${aom_sym_file}" "${exported_symbol}\n")
+endforeach()
+
+if("${aom_sym_file}" MATCHES "ver$")
+ file(APPEND "${aom_sym_file}" " \nlocal:\n *;\n};")
+endif()
diff --git a/third_party/aom/build/cmake/ios-Info.plist b/third_party/aom/build/cmake/ios-Info.plist
new file mode 100644
index 000000000..300e3e310
--- /dev/null
+++ b/third_party/aom/build/cmake/ios-Info.plist
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+ <key>CFBundleDevelopmentRegion</key>
+ <string>en</string>
+ <key>CFBundleExecutable</key>
+ <string>AOM</string>
+ <key>CFBundleIdentifier</key>
+ <string>org.webmproject.AOM</string>
+ <key>CFBundleInfoDictionaryVersion</key>
+ <string>6.0</string>
+ <key>CFBundleName</key>
+ <string>AOM</string>
+ <key>CFBundlePackageType</key>
+ <string>FMWK</string>
+ <key>CFBundleShortVersionString</key>
+ <string>${VERSION}</string>
+ <key>CFBundleSignature</key>
+ <string>????</string>
+ <key>CFBundleSupportedPlatforms</key>
+ <array>
+ <string>iPhoneOS</string>
+ </array>
+ <key>CFBundleVersion</key>
+ <string>${VERSION}</string>
+ <key>MinimumOSVersion</key>
+ <string>${IOS_VERSION_MIN}</string>
+ <key>UIDeviceFamily</key>
+ <array>
+ <integer>1</integer>
+ <integer>2</integer>
+ </array>
+ <key>AOMFullVersion</key>
+ <string>${FULLVERSION}</string>
+</dict>
+</plist>
diff --git a/third_party/aom/build/cmake/iosbuild.sh b/third_party/aom/build/cmake/iosbuild.sh
new file mode 100755
index 000000000..167ece200
--- /dev/null
+++ b/third_party/aom/build/cmake/iosbuild.sh
@@ -0,0 +1,384 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This script generates 'AOM.framework'. An iOS app can encode and decode AVx
+## video by including 'AOM.framework'.
+##
+## Run iosbuild.sh to create 'AOM.framework' in the current directory.
+##
+set -e
+devnull='> /dev/null 2>&1'
+
+BUILD_ROOT="_iosbuild"
+CONFIGURE_ARGS="--disable-docs
+ --disable-examples
+ --disable-libyuv
+ --disable-unit-tests"
+DIST_DIR="_dist"
+FRAMEWORK_DIR="AOM.framework"
+FRAMEWORK_LIB="AOM.framework/AOM"
+HEADER_DIR="${FRAMEWORK_DIR}/Headers/aom"
+SCRIPT_DIR=$(dirname "$0")
+LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
+LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
+ORIG_PWD="$(pwd)"
+ARM_TARGETS="arm64-darwin-gcc
+ armv7-darwin-gcc
+ armv7s-darwin-gcc"
+SIM_TARGETS="x86-iphonesimulator-gcc
+ x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86-darwin16-gcc
+ x86_64-darwin16-gcc"
+TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
+
+# Configures for the target specified by $1, and invokes make with the dist
+# target using $DIST_DIR as the distribution output directory.
+build_target() {
+ local target="$1"
+ local old_pwd="$(pwd)"
+ local target_specific_flags=""
+
+ vlog "***Building target: ${target}***"
+
+ case "${target}" in
+ x86-*)
+ target_specific_flags="--enable-pic"
+ vlog "Enabled PIC for ${target}"
+ ;;
+ esac
+
+ mkdir "${target}"
+ cd "${target}"
+ # TODO(tomfinegan@google.com): switch to cmake.
+ eval "${LIBAOM_SOURCE_DIR}/configure" --target="${target}" \
+ ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${target_specific_flags} \
+ ${devnull}
+ export DIST_DIR
+ eval make dist ${devnull}
+ cd "${old_pwd}"
+
+ vlog "***Done building target: ${target}***"
+}
+
+# Returns the preprocessor symbol for the target specified by $1.
+target_to_preproc_symbol() {
+ target="$1"
+ case "${target}" in
+ arm64-*)
+ echo "__aarch64__"
+ ;;
+ armv7-*)
+ echo "__ARM_ARCH_7A__"
+ ;;
+ armv7s-*)
+ echo "__ARM_ARCH_7S__"
+ ;;
+ x86-*)
+ echo "__i386__"
+ ;;
+ x86_64-*)
+ echo "__x86_64__"
+ ;;
+ *)
+ echo "#error ${target} unknown/unsupported"
+ return 1
+ ;;
+ esac
+}
+
+# Create an aom_config.h shim that, based on preprocessor settings for the
+# current target CPU, includes the real aom_config.h for the current target.
+# $1 is the list of targets.
+create_aom_framework_config_shim() {
+ local targets="$1"
+ local config_file="${HEADER_DIR}/aom_config.h"
+ local preproc_symbol=""
+ local target=""
+ local include_guard="AOM_FRAMEWORK_HEADERS_AOM_AOM_CONFIG_H_"
+
+ local file_header="/*
+ * Copyright (c) $(date +%Y), Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* GENERATED FILE: DO NOT EDIT! */
+
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#if defined"
+
+ printf "%s" "${file_header}" > "${config_file}"
+ for target in ${targets}; do
+ preproc_symbol=$(target_to_preproc_symbol "${target}")
+ printf " ${preproc_symbol}\n" >> "${config_file}"
+ printf "#define AOM_FRAMEWORK_TARGET \"${target}\"\n" >> "${config_file}"
+ printf "#include \"AOM/aom/${target}/aom_config.h\"\n" >> "${config_file}"
+ printf "#elif defined" >> "${config_file}"
+ mkdir "${HEADER_DIR}/${target}"
+ cp -p "${BUILD_ROOT}/${target}/aom_config.h" "${HEADER_DIR}/${target}"
+ done
+
+ # Consume the last line of output from the loop: We don't want it.
+ sed -i '' -e '$d' "${config_file}"
+
+ printf "#endif\n\n" >> "${config_file}"
+ printf "#endif // ${include_guard}" >> "${config_file}"
+}
+
+# Verifies that $FRAMEWORK_LIB fat library contains requested builds.
+verify_framework_targets() {
+ local requested_cpus=""
+ local cpu=""
+
+ # Extract CPU from full target name.
+ for target; do
+ cpu="${target%%-*}"
+ if [ "${cpu}" = "x86" ]; then
+ # lipo -info outputs i386 for libaom x86 targets.
+ cpu="i386"
+ fi
+ requested_cpus="${requested_cpus}${cpu} "
+ done
+
+ # Get target CPUs present in framework library.
+ local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB})
+
+ # $LIPO -info outputs a string like the following:
+ # Architectures in the fat file: $FRAMEWORK_LIB <architectures>
+ # Capture only the architecture strings.
+ targets_built=${targets_built##*: }
+
+ # Sort CPU strings to make the next step a simple string compare.
+ local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ")
+ local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ")
+
+ vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}"
+ vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}"
+
+ if [ "${requested}" != "${actual}" ]; then
+ elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list."
+ elog " Requested target CPUs: ${requested}"
+ elog " Actual target CPUs: ${actual}"
+ return 1
+ fi
+}
+
+# Configures and builds each target specified by $1, and then builds
+# AOM.framework.
+build_framework() {
+ local lib_list=""
+ local targets="$1"
+ local target=""
+ local target_dist_dir=""
+
+ # Clean up from previous build(s).
+ rm -rf "${BUILD_ROOT}" "${FRAMEWORK_DIR}"
+
+ # Create output dirs.
+ mkdir -p "${BUILD_ROOT}"
+ mkdir -p "${HEADER_DIR}"
+
+ cd "${BUILD_ROOT}"
+
+ for target in ${targets}; do
+ build_target "${target}"
+ target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}"
+ if [ "${ENABLE_SHARED}" = "yes" ]; then
+ local suffix="dylib"
+ else
+ local suffix="a"
+ fi
+ lib_list="${lib_list} ${target_dist_dir}/lib/libaom.${suffix}"
+ done
+
+ cd "${ORIG_PWD}"
+
+ # The basic libaom API includes are all the same; just grab the most recent
+ # set.
+ cp -p "${target_dist_dir}"/include/aom/* "${HEADER_DIR}"
+
+ # Build the fat library.
+ ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/AOM
+
+ # Create the aom_config.h shim that allows usage of aom_config.h from
+ # within AOM.framework.
+ create_aom_framework_config_shim "${targets}"
+
+ # Copy in aom_version.h.
+ cp -p "${BUILD_ROOT}/${target}/aom_version.h" "${HEADER_DIR}"
+
+ if [ "${ENABLE_SHARED}" = "yes" ]; then
+ # Adjust the dylib's name so dynamic linking in apps works as expected.
+ install_name_tool -id '@rpath/AOM.framework/AOM' ${FRAMEWORK_DIR}/AOM
+
+ # Copy in Info.plist.
+ cat "${SCRIPT_DIR}/ios-Info.plist" \
+ | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \
+ | sed "s/\${VERSION}/${VERSION}/g" \
+ | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \
+ > "${FRAMEWORK_DIR}/Info.plist"
+ fi
+
+ # Confirm AOM.framework/AOM contains the targets requested.
+ verify_framework_targets ${targets}
+
+ vlog "Created fat library ${FRAMEWORK_LIB} containing:"
+ for lib in ${lib_list}; do
+ vlog " $(echo ${lib} | awk -F / '{print $2, $NF}')"
+ done
+}
+
+# Trap function. Cleans up the subtree used to build all targets contained in
+# $TARGETS.
+cleanup() {
+ local res=$?
+ cd "${ORIG_PWD}"
+
+ if [ $res -ne 0 ]; then
+ elog "build exited with error ($res)"
+ fi
+
+ if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
+ rm -rf "${BUILD_ROOT}"
+ fi
+}
+
+print_list() {
+ local indent="$1"
+ shift
+ local list="$@"
+ for entry in ${list}; do
+ echo "${indent}${entry}"
+ done
+}
+
+iosbuild_usage() {
+cat << EOF
+ Usage: ${0##*/} [arguments]
+ --help: Display this message and exit.
+ --enable-shared: Build a dynamic framework for use on iOS 8 or later.
+ --extra-configure-args <args>: Extra args to pass when configuring libaom.
+ --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86
+ and x86_64. Allows linking to framework when builds target MacOSX
+ instead of iOS.
+ --preserve-build-output: Do not delete the build directory.
+ --show-build-output: Show output from each library build.
+ --targets <targets>: Override default target list. Defaults:
+$(print_list " " ${TARGETS})
+ --test-link: Confirms all targets can be linked. Functionally identical to
+ passing --enable-examples via --extra-configure-args.
+ --verbose: Output information about the environment and each stage of the
+ build.
+EOF
+}
+
+elog() {
+ echo "${0##*/} failed because: $@" 1>&2
+}
+
+vlog() {
+ if [ "${VERBOSE}" = "yes" ]; then
+ echo "$@"
+ fi
+}
+
+trap cleanup EXIT
+
+# Parse the command line.
+while [ -n "$1" ]; do
+ case "$1" in
+ --extra-configure-args)
+ EXTRA_CONFIGURE_ARGS="$2"
+ shift
+ ;;
+ --help)
+ iosbuild_usage
+ exit
+ ;;
+ --enable-shared)
+ ENABLE_SHARED=yes
+ ;;
+ --preserve-build-output)
+ PRESERVE_BUILD_OUTPUT=yes
+ ;;
+ --show-build-output)
+ devnull=
+ ;;
+ --test-link)
+ EXTRA_CONFIGURE_ARGS="${EXTRA_CONFIGURE_ARGS} --enable-examples"
+ ;;
+ --targets)
+ TARGETS="$2"
+ shift
+ ;;
+ --macosx)
+ TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
+ ;;
+ --verbose)
+ VERBOSE=yes
+ ;;
+ *)
+ iosbuild_usage
+ exit 1
+ ;;
+ esac
+ shift
+done
+
+if [ "${ENABLE_SHARED}" = "yes" ]; then
+ CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}"
+fi
+
+FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBAOM_SOURCE_DIR}")
+VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/')
+
+if [ "$ENABLE_SHARED" = "yes" ]; then
+ IOS_VERSION_OPTIONS="--enable-shared"
+ IOS_VERSION_MIN="8.0"
+else
+ IOS_VERSION_OPTIONS=""
+ IOS_VERSION_MIN="6.0"
+fi
+
+if [ "${VERBOSE}" = "yes" ]; then
+cat << EOF
+ BUILD_ROOT=${BUILD_ROOT}
+ DIST_DIR=${DIST_DIR}
+ CONFIGURE_ARGS=${CONFIGURE_ARGS}
+ EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
+ FRAMEWORK_DIR=${FRAMEWORK_DIR}
+ FRAMEWORK_LIB=${FRAMEWORK_LIB}
+ HEADER_DIR=${HEADER_DIR}
+ LIBAOM_SOURCE_DIR=${LIBAOM_SOURCE_DIR}
+ LIPO=${LIPO}
+ MAKEFLAGS=${MAKEFLAGS}
+ ORIG_PWD=${ORIG_PWD}
+ PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
+ TARGETS="$(print_list "" ${TARGETS})"
+ ENABLE_SHARED=${ENABLE_SHARED}
+ OSX_TARGETS="${OSX_TARGETS}"
+ SIM_TARGETS="${SIM_TARGETS}"
+ SCRIPT_DIR="${SCRIPT_DIR}"
+ FULLVERSION="${FULLVERSION}"
+ VERSION="${VERSION}"
+ IOS_VERSION_MIN="${IOS_VERSION_MIN}"
+EOF
+fi
+
+build_framework "${TARGETS}"
+echo "Successfully built '${FRAMEWORK_DIR}' for:"
+print_list "" ${TARGETS}
diff --git a/third_party/aom/build/cmake/msvc_runtime.cmake b/third_party/aom/build/cmake/msvc_runtime.cmake
new file mode 100644
index 000000000..9e4cbea43
--- /dev/null
+++ b/third_party/aom/build/cmake/msvc_runtime.cmake
@@ -0,0 +1,37 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_
+set(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_ 1)
+
+if(MSVC)
+
+ # CMake defaults to producing code linked to the DLL MSVC runtime. That will
+ # not work with googletest, and isn't what we want anyway.
+ if(NOT "${MSVC_RUNTIME}" STREQUAL "dll")
+ foreach(flag_var
+ CMAKE_C_FLAGS
+ CMAKE_C_FLAGS_DEBUG
+ CMAKE_C_FLAGS_RELEASE
+ CMAKE_C_FLAGS_MINSIZEREL
+ CMAKE_C_FLAGS_RELWITHDEBINFO
+ CMAKE_CXX_FLAGS
+ CMAKE_CXX_FLAGS_DEBUG
+ CMAKE_CXX_FLAGS_RELEASE
+ CMAKE_CXX_FLAGS_MINSIZEREL
+ CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+ if(${flag_var} MATCHES "/MD")
+ string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+ endif(${flag_var} MATCHES "/MD")
+ endforeach(flag_var)
+ endif()
+endif()
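+
+# Usage sketch (illustrative): the static runtime (/MT) is the default; pass
+#
+#   cmake path/to/aom -DMSVC_RUNTIME=dll
+#
+# at configure time to keep the /MD flags untouched.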
diff --git a/third_party/aom/build/cmake/pkg_config.cmake b/third_party/aom/build/cmake/pkg_config.cmake
new file mode 100644
index 000000000..64e20214e
--- /dev/null
+++ b/third_party/aom/build/cmake/pkg_config.cmake
@@ -0,0 +1,58 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX"
+ "CMAKE_PROJECT_NAME" "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H")
+
+foreach(arg ${REQUIRED_ARGS})
+ if("${${arg}}" STREQUAL "")
+ message(FATAL_ERROR "${arg} must not be empty.")
+ endif()
+endforeach()
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" aom_version)
+
+# Create a version string suitable for comparison using the RPM version compare
+# algorithm: strip out everything after the number.
+string(FIND "${aom_version}" "-" dash_pos)
+if(${dash_pos} EQUAL -1)
+ set(package_version "${aom_version}")
+else()
+ string(SUBSTRING "${aom_version}" 0 ${dash_pos} package_version)
+endif()
+
+# Write pkg-config info.
+set(prefix "${CMAKE_INSTALL_PREFIX}")
+set(pkgconfig_file "${AOM_CONFIG_DIR}/aom.pc")
+string(TOLOWER ${CMAKE_PROJECT_NAME} pkg_name)
+file(WRITE "${pkgconfig_file}" "# libaom pkg-config.\n")
+file(APPEND "${pkgconfig_file}" "prefix=${prefix}\n")
+file(APPEND "${pkgconfig_file}" "exec_prefix=\${prefix}/bin\n")
+file(APPEND "${pkgconfig_file}" "libdir=\${prefix}/lib\n")
+file(APPEND "${pkgconfig_file}" "includedir=\${prefix}/include\n\n")
+file(APPEND "${pkgconfig_file}" "Name: ${pkg_name}\n")
+file(APPEND "${pkgconfig_file}"
+ "Description: AV1 codec library v${aom_version}.\n")
+file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n")
+file(APPEND "${pkgconfig_file}" "Requires:\n")
+file(APPEND "${pkgconfig_file}" "Conflicts:\n")
+if(CONFIG_MULTITHREAD AND HAVE_PTHREAD_H)
+ file(APPEND "${pkgconfig_file}"
+ "Libs: -L\${prefix}/lib -l${pkg_name} -lm -lpthread\n")
+ file(APPEND "${pkgconfig_file}" "Libs.private: -lm -lpthread\n")
+else()
+ file(APPEND "${pkgconfig_file}" "Libs: -L\${prefix}/lib -l${pkg_name} -lm\n")
+ file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n")
+endif()
+file(APPEND "${pkgconfig_file}" "Cflags: -I\${prefix}/include\n")
diff --git a/third_party/aom/build/cmake/rtcd.pl b/third_party/aom/build/cmake/rtcd.pl
new file mode 100755
index 000000000..46e06907c
--- /dev/null
+++ b/third_party/aom/build/cmake/rtcd.pl
@@ -0,0 +1,467 @@
+#!/usr/bin/env perl
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+no strict 'refs';
+use warnings;
+use Getopt::Long;
+Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32;
+
+my %ALL_FUNCS = ();
+my @ALL_ARCHS;
+my @ALL_FORWARD_DECLS;
+my @REQUIRES;
+
+my %opts = ();
+my %disabled = ();
+my %required = ();
+
+my @argv;
+foreach (@ARGV) {
+ $disabled{$1} = 1, next if /--disable-(.*)/;
+ $required{$1} = 1, next if /--require-(.*)/;
+ push @argv, $_;
+}
+
+# NB: use GetOptions() instead of GetOptionsFromArray() for compatibility.
+@ARGV = @argv;
+GetOptions(
+ \%opts,
+ 'arch=s',
+ 'sym=s',
+ 'config=s',
+);
+
+foreach my $opt (qw/arch config/) {
+ if (!defined($opts{$opt})) {
+ warn "--$opt is required!\n";
+ Getopt::Long::HelpMessage('-exit' => 1);
+ }
+}
+
+foreach my $defs_file (@ARGV) {
+ if (!-f $defs_file) {
+ warn "$defs_file: $!\n";
+ Getopt::Long::HelpMessage('-exit' => 1);
+ }
+}
+
+open CONFIG_FILE, $opts{config} or
+ die "Error opening config file '$opts{config}': $!\n";
+
+my %config = ();
+while (<CONFIG_FILE>) {
+ next if !/^#define\s+(?:CONFIG_|HAVE_)/;
+ chomp;
+ my @line_components = split /\s/;
+ scalar @line_components > 2 or
+ die "Invalid input passed to rtcd.pl via $opts{config}.";
+ # $line_components[0] = #define
+ # $line_components[1] = flag name (CONFIG_SOMETHING or HAVE_SOMETHING)
+ # $line_components[2] = flag value (0 or 1)
+ $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : "";
+}
+close CONFIG_FILE;
+
+#
+# Routines for the RTCD DSL to call
+#
+sub aom_config($) {
+ return (defined $config{$_[0]}) ? $config{$_[0]} : "";
+}
+
+sub specialize {
+ if (@_ <= 1) {
+ die "'specialize' must be called with a function name and at least one ",
+ "architecture ('C' is implied): \n@_\n";
+ }
+ my $fn=$_[0];
+ shift;
+ foreach my $opt (@_) {
+ eval "\$${fn}_${opt}=${fn}_${opt}";
+ }
+}
+
+sub add_proto {
+ my $fn = splice(@_, -2, 1);
+ $ALL_FUNCS{$fn} = \@_;
+ specialize $fn, "c";
+}
+
+sub require {
+ foreach my $fn (keys %ALL_FUNCS) {
+ foreach my $opt (@_) {
+ my $ofn = eval "\$${fn}_${opt}";
+ next if !$ofn;
+
+ # if we already have a default, then we can disable it, as we know
+ # we can do better.
+ my $best = eval "\$${fn}_default";
+ if ($best) {
+ my $best_ofn = eval "\$${best}";
+ if ($best_ofn && "$best_ofn" ne "$ofn") {
+ eval "\$${best}_link = 'false'";
+ }
+ }
+ eval "\$${fn}_default=${fn}_${opt}";
+ eval "\$${fn}_${opt}_link='true'";
+ }
+ }
+}
+
+sub forward_decls {
+ push @ALL_FORWARD_DECLS, @_;
+}
+
+#
+# Include the user's directives
+#
+foreach my $f (@ARGV) {
+ open FILE, "<", $f or die "cannot open $f: $!\n";
+ my $contents = join('', <FILE>);
+ close FILE;
+ eval $contents or warn "eval failed: $@\n";
+}
+
+#
+# Process the directives according to the command line
+#
+sub process_forward_decls() {
+ foreach (@ALL_FORWARD_DECLS) {
+ $_->();
+ }
+}
+
+sub determine_indirection {
+ aom_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS);
+ foreach my $fn (keys %ALL_FUNCS) {
+ my $n = "";
+ my @val = @{$ALL_FUNCS{$fn}};
+ my $args = pop @val;
+ my $rtyp = "@val";
+ my $dfn = eval "\$${fn}_default";
+ $dfn = eval "\$${dfn}";
+ foreach my $opt (@_) {
+ my $ofn = eval "\$${fn}_${opt}";
+ next if !$ofn;
+ my $link = eval "\$${fn}_${opt}_link";
+ next if $link && $link eq "false";
+ $n .= "x";
+ }
+ if ($n eq "x") {
+ eval "\$${fn}_indirect = 'false'";
+ } else {
+ eval "\$${fn}_indirect = 'true'";
+ }
+ }
+}
+
+sub declare_function_pointers {
+ foreach my $fn (sort keys %ALL_FUNCS) {
+ my @val = @{$ALL_FUNCS{$fn}};
+ my $args = pop @val;
+ my $rtyp = "@val";
+ my $dfn = eval "\$${fn}_default";
+ $dfn = eval "\$${dfn}";
+ foreach my $opt (@_) {
+ my $ofn = eval "\$${fn}_${opt}";
+ next if !$ofn;
+ print "$rtyp ${ofn}($args);\n";
+ }
+ if (eval "\$${fn}_indirect" eq "false") {
+ print "#define ${fn} ${dfn}\n";
+ } else {
+ print "RTCD_EXTERN $rtyp (*${fn})($args);\n";
+ }
+ print "\n";
+ }
+}
+
+sub set_function_pointers {
+ foreach my $fn (sort keys %ALL_FUNCS) {
+ my @val = @{$ALL_FUNCS{$fn}};
+ my $args = pop @val;
+ my $rtyp = "@val";
+ my $dfn = eval "\$${fn}_default";
+ $dfn = eval "\$${dfn}";
+ if (eval "\$${fn}_indirect" eq "true") {
+ print " $fn = $dfn;\n";
+ foreach my $opt (@_) {
+ my $ofn = eval "\$${fn}_${opt}";
+ next if !$ofn;
+ next if "$ofn" eq "$dfn";
+ my $link = eval "\$${fn}_${opt}_link";
+ next if $link && $link eq "false";
+ my $cond = eval "\$have_${opt}";
+ print " if (${cond}) $fn = $ofn;\n"
+ }
+ }
+ }
+}
+
+sub filter {
+ my @filtered;
+ foreach (@_) { push @filtered, $_ unless $disabled{$_}; }
+ return @filtered;
+}
+
+#
+# Helper functions for generating the arch specific RTCD files
+#
+sub common_top() {
+ my $include_guard = uc($opts{sym})."_H_";
+ print <<EOF;
+// This file is generated. Do not edit.
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+EOF
+
+process_forward_decls();
+print <<EOF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+declare_function_pointers("c", @ALL_ARCHS);
+
+print <<EOF;
+void $opts{sym}(void);
+
+EOF
+}
+
+sub common_bottom() {
+ print <<EOF;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
+EOF
+}
+
+sub x86() {
+ determine_indirection("c", @ALL_ARCHS);
+
+ # Assign the helper variable for each enabled extension
+ foreach my $opt (@ALL_ARCHS) {
+ my $opt_uc = uc $opt;
+ eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+ }
+
+ common_top;
+ print <<EOF;
+#ifdef RTCD_C
+#include "aom_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = x86_simd_caps();
+
+ (void)flags;
+
+EOF
+
+ set_function_pointers("c", @ALL_ARCHS);
+
+ print <<EOF;
+}
+#endif
+EOF
+ common_bottom;
+}
+
+sub arm() {
+ determine_indirection("c", @ALL_ARCHS);
+
+ # Assign the helper variable for each enabled extension
+ foreach my $opt (@ALL_ARCHS) {
+ my $opt_uc = uc $opt;
+ eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+ }
+
+ common_top;
+ print <<EOF;
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+#include "aom_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = aom_arm_cpu_caps();
+
+ (void)flags;
+
+EOF
+
+ set_function_pointers("c", @ALL_ARCHS);
+
+ print <<EOF;
+}
+#endif
+EOF
+ common_bottom;
+}
+
+sub mips() {
+ determine_indirection("c", @ALL_ARCHS);
+
+ # Assign the helper variable for each enabled extension
+ foreach my $opt (@ALL_ARCHS) {
+ my $opt_uc = uc $opt;
+ eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+ }
+
+ common_top;
+
+ print <<EOF;
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+EOF
+
+ set_function_pointers("c", @ALL_ARCHS);
+
+ print <<EOF;
+#if HAVE_DSPR2
+void aom_dsputil_static_init();
+aom_dsputil_static_init();
+#endif
+}
+#endif
+EOF
+ common_bottom;
+}
+
+sub ppc() {
+ determine_indirection("c", @ALL_ARCHS);
+
+ # Assign the helper variable for each enabled extension
+ foreach my $opt (@ALL_ARCHS) {
+ my $opt_uc = uc $opt;
+ eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+ }
+
+ common_top;
+
+ print <<EOF;
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+#include "aom_ports/ppc.h"
+static void setup_rtcd_internal(void)
+{
+ int flags = ppc_simd_caps();
+
+ (void)flags;
+
+EOF
+
+ set_function_pointers("c", @ALL_ARCHS);
+
+ print <<EOF;
+}
+#endif
+EOF
+ common_bottom;
+}
+
+sub unoptimized() {
+ determine_indirection "c";
+ common_top;
+ print <<EOF;
+#include "config/aom_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+EOF
+
+ set_function_pointers "c";
+
+ print <<EOF;
+}
+#endif
+EOF
+ common_bottom;
+}
+
+#
+# Main Driver
+#
+
+&require("c");
+&require(keys %required);
+if ($opts{arch} eq 'x86') {
+ @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
+ x86;
+} elsif ($opts{arch} eq 'x86_64') {
+ @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
+ @REQUIRES = filter(qw/mmx sse sse2/);
+ &require(@REQUIRES);
+ x86;
+} elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') {
+ @ALL_ARCHS = filter("$opts{arch}");
+ if (aom_config("HAVE_DSPR2") eq "yes") {
+ @ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
+ } elsif (aom_config("HAVE_MSA") eq "yes") {
+ @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
+ }
+ mips;
+} elsif ($opts{arch} =~ /armv7\w?/) {
+ @ALL_ARCHS = filter(qw/neon/);
+ arm;
+} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
+ @ALL_ARCHS = filter(qw/neon/);
+ &require("neon");
+ arm;
+} elsif ($opts{arch} eq 'ppc') {
+ @ALL_ARCHS = filter(qw/vsx/);
+ ppc;
+} else {
+ unoptimized;
+}
+
+__END__
+
+=head1 NAME
+
+rtcd -
+
+=head1 SYNOPSIS
+
+Usage: rtcd.pl [options] FILE
+
+See 'perldoc rtcd.pl' for more details.
+
+=head1 DESCRIPTION
+
+Reads the Run Time CPU Detections definitions from FILE and generates a
+C header file on stdout.
+
+=head1 OPTIONS
+
+Options:
+ --arch=ARCH Architecture to generate defs for (required)
+ --disable-EXT Disable support for EXT extensions
+ --require-EXT Require support for EXT extensions
+ --sym=SYMBOL Unique symbol to use for RTCD initialization function
+ --config=FILE Path to file containing C preprocessor directives to parse
diff --git a/third_party/aom/build/cmake/sanitizers.cmake b/third_party/aom/build/cmake/sanitizers.cmake
new file mode 100644
index 000000000..77708e101
--- /dev/null
+++ b/third_party/aom/build/cmake/sanitizers.cmake
@@ -0,0 +1,38 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_SANITIZERS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_SANITIZERS_CMAKE_
+set(AOM_BUILD_CMAKE_SANITIZERS_CMAKE_ 1)
+
+if(MSVC OR NOT SANITIZE)
+ return()
+endif()
+
+include("${AOM_ROOT}/build/cmake/compiler_flags.cmake")
+
+string(TOLOWER ${SANITIZE} SANITIZE)
+
+# Require the sanitizer requested.
+require_linker_flag("-fsanitize=${SANITIZE}")
+require_compiler_flag("-fsanitize=${SANITIZE}" YES)
+
+# Make callstacks accurate.
+require_compiler_flag("-fno-omit-frame-pointer -fno-optimize-sibling-calls" YES)
+
+# Fix link errors due to missing rt compiler lib in 32-bit builds.
+# http://llvm.org/bugs/show_bug.cgi?id=17693
+if(CMAKE_C_COMPILER_ID MATCHES "Clang")
+ if(${CMAKE_SIZEOF_VOID_P} EQUAL 4 AND "${SANITIZE}" MATCHES
+ "integer|undefined")
+ require_linker_flag("--rtlib=compiler-rt -lgcc_s")
+ endif()
+endif()
diff --git a/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake b/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake
new file mode 100644
index 000000000..8f4095145
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm-ios-common.cmake
@@ -0,0 +1,26 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM_IOS_COMMON_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_SYSROOT iphoneos)
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_C_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_CXX_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+
+# No runtime cpu detect for arm*-ios targets.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+
+# TODO(tomfinegan): Handle bit code embedding.
diff --git a/third_party/aom/build/cmake/toolchains/arm64-ios.cmake b/third_party/aom/build/cmake/toolchains/arm64-ios.cmake
new file mode 100644
index 000000000..6feb1090f
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-ios.cmake
@@ -0,0 +1,23 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_IOS_CMAKE_ 1)
+
+if(XCODE) # TODO(tomfinegan): Handle arm builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+set(CMAKE_OSX_ARCHITECTURES "arm64")
+
+include("${CMAKE_CURRENT_LIST_DIR}/arm-ios-common.cmake")
diff --git a/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
new file mode 100644
index 000000000..590a97a8e
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-linux-gcc.cmake
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to something known to work.
+ set(CROSS aarch64-linux-gnu-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(AS_EXECUTABLE ${CROSS}as)
+set(CMAKE_C_COMPILER_ARG1 "-march=armv8-a")
+set(CMAKE_CXX_COMPILER_ARG1 "-march=armv8-a")
+set(AOM_AS_FLAGS "-march=armv8-a")
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+
+# No intrinsics flag required for arm64-linux-gcc.
+set(AOM_NEON_INTRIN_FLAG "")
+
+# No runtime cpu detect for arm64-linux-gcc.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
diff --git a/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake
new file mode 100644
index 000000000..b5b2ff1cd
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -0,0 +1,32 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARM64_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "arm64")
+set(CMAKE_SYSTEM_NAME "Windows")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS aarch64-w64-mingw32-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+
+# No runtime cpu detect for arm64-mingw-gcc.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+
+# Disable the use of the gtest's CMake support.
+set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/third_party/aom/build/cmake/toolchains/armv7-ios.cmake b/third_party/aom/build/cmake/toolchains/armv7-ios.cmake
new file mode 100644
index 000000000..32a1b534a
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7-ios.cmake
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_IOS_CMAKE_ 1)
+
+if(XCODE)
+
+ # TODO(tomfinegan): Handle arm builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(CMAKE_OSX_ARCHITECTURES "armv7")
+
+include("${CMAKE_CURRENT_LIST_DIR}/arm-ios-common.cmake")
+
+# No intrinsics flag required for armv7-ios.
+set(AOM_NEON_INTRIN_FLAG "")
+
+# No runtime cpu detect for armv7-ios.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
diff --git a/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake
new file mode 100644
index 000000000..7d3d63085
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -0,0 +1,43 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to something known to work.
+ set(CROSS arm-linux-gnueabihf-)
+endif()
+
+if(NOT ${CROSS} MATCHES hf-$)
+ set(AOM_EXTRA_TOOLCHAIN_FLAGS "-mfloat-abi=softfp")
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(AS_EXECUTABLE ${CROSS}as)
+set(CMAKE_C_COMPILER_ARG1
+ "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(CMAKE_CXX_COMPILER_ARG1
+ "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
+ ${AOM_EXTRA_TOOLCHAIN_FLAGS})
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+
+# No intrinsics flag required for armv7-linux-gcc.
+set(AOM_NEON_INTRIN_FLAG "")
+
+# No runtime cpu detect for armv7-linux-gcc.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
diff --git a/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake
new file mode 100644
index 000000000..cf06a11b3
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7-mingw-gcc.cmake
@@ -0,0 +1,32 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "armv7")
+set(CMAKE_SYSTEM_NAME "Windows")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS armv7-w64-mingw32-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+
+# No runtime cpu detect for armv7-mingw-gcc.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
+
+# Disable the use of the gtest's CMake support.
+set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake b/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake
new file mode 100644
index 000000000..0940a6ee8
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/armv7s-ios.cmake
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_ARMV7S_IOS_CMAKE_ 1)
+
+if(XCODE)
+
+ # TODO(tomfinegan): Handle arm builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "armv7s")
+set(CMAKE_OSX_ARCHITECTURES "armv7s")
+
+include("${CMAKE_CURRENT_LIST_DIR}/arm-ios-common.cmake")
+
+# No intrinsics flag required for armv7s-ios.
+set(AOM_NEON_INTRIN_FLAG "")
+
+# No runtime cpu detect for armv7s-ios.
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
diff --git a/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake b/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake
new file mode 100644
index 000000000..76e0bd140
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/ios-simulator-common.cmake
@@ -0,0 +1,23 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_IOS_SIMULATOR_COMMON_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_SYSROOT iphonesimulator)
+set(CMAKE_C_COMPILER clang)
+set(CMAKE_C_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+set(CMAKE_CXX_COMPILER clang++)
+set(CMAKE_CXX_COMPILER_ARG1 "-arch ${CMAKE_SYSTEM_PROCESSOR}")
+
+# TODO(tomfinegan): Handle bit code embedding.
diff --git a/third_party/aom/build/cmake/toolchains/mips32-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/mips32-linux-gcc.cmake
new file mode 100644
index 000000000..0f93490b1
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/mips32-linux-gcc.cmake
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS32_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if(ENABLE_DSPR2 AND ENABLE_MSA)
+ message(FATAL_ERROR "ENABLE_DSPR2 and ENABLE_MSA cannot be combined.")
+endif()
+
+if(ENABLE_DSPR2)
+ set(HAVE_DSPR2 1 CACHE BOOL "" FORCE)
+
+ if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to something known to work.
+ set(CROSS mips-linux-gnu-)
+ endif()
+
+ set(MIPS_CFLAGS "-mdspr2")
+ set(MIPS_CXXFLAGS "-mdspr2")
+elseif(ENABLE_MSA)
+ set(HAVE_MSA 1 CACHE BOOL "" FORCE)
+
+ if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to something known to work.
+ set(CROSS mips-mti-linux-gnu-)
+ endif()
+
+ set(MIPS_CFLAGS "-mmsa")
+ set(MIPS_CXXFLAGS "-mmsa")
+endif()
+
+if("${CROSS}" STREQUAL "")
+
+ # TODO(tomfinegan): Make it possible to turn this off. The $CROSS prefix won't
+ # be desired on a mips host. Default cross compiler prefix to something that
+ # might work for an unoptimized build.
+ set(CROSS mips-linux-gnu-)
+endif()
+
+if("${MIPS_CPU}" STREQUAL "")
+ set(MIPS_CFLAGS "${MIPS_CFLAGS} -mips32r2")
+ set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} -mips32r2")
+elseif("${MIPS_CPU}" STREQUAL "p5600")
+ set(P56_FLAGS
+ "-mips32r5 -mload-store-pairs -msched-weight -mhard-float -mfp64")
+ set(MIPS_CFLAGS "${MIPS_CFLAGS} ${P56_FLAGS}")
+ set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} ${P56_FLAGS}")
+ set(CMAKE_EXE_LINKER_FLAGS "-mfp64 ${CMAKE_EXE_LINKER_FLAGS}")
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(AS_EXECUTABLE ${CROSS}as)
+set(CMAKE_C_COMPILER_ARG1 "-EL ${MIPS_CFLAGS}")
+set(CMAKE_CXX_COMPILER_ARG1 "-EL ${MIPS_CXXFLAGS}")
+set(CMAKE_SYSTEM_PROCESSOR "mips32")
+
+# No runtime cpu detect for mips32-linux-gcc.
+if(CONFIG_RUNTIME_CPU_DETECT)
+ message("--- CONFIG_RUNTIME_CPU_DETECT not supported for mips32 targets.")
+endif()
+
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "" FORCE)
diff --git a/third_party/aom/build/cmake/toolchains/mips64-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/mips64-linux-gcc.cmake
new file mode 100644
index 000000000..ad9aab09d
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/mips64-linux-gcc.cmake
@@ -0,0 +1,54 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_MIPS64_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # TODO(tomfinegan): Make it possible to turn this off. The $CROSS prefix won't
+ # be desired on a mips host.
+ #
+ # Default the cross compiler prefix to something known to work.
+ set(CROSS mips-img-linux-gnu-)
+endif()
+
+if(ENABLE_MSA)
+ set(HAVE_MSA 1 CACHE BOOL "" FORCE)
+ set(MIPS_CFLAGS "-mmsa")
+ set(MIPS_CXXFLAGS "-mmsa")
+endif()
+
+if("${MIPS_CPU}" STREQUAL "i6400" OR "${MIPS_CPU}" STREQUAL "p6600")
+ set(MIPS_CPU_FLAGS "-mips64r6 -mabi=64 -mload-store-pairs -msched-weight")
+ set(MIPS_CPU_FLAGS "${MIPS_CPU_FLAGS} -mhard-float -mfp64")
+ set(MIPS_CFLAGS "${MIPS_CFLAGS} ${MIPS_CPU_FLAGS}")
+ set(MIPS_CXXFLAGS "${MIPS_CXXFLAGS} ${MIPS_CPU_FLAGS}")
+ set(CMAKE_EXE_LINKER_FLAGS
+ "-mips64r6 -mabi64 -mfp64 ${CMAKE_EXE_LINKER_FLAGS}")
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(AS_EXECUTABLE ${CROSS}as)
+set(CMAKE_C_COMPILER_ARG1 "-EL ${MIPS_CFLAGS}")
+set(CMAKE_CXX_COMPILER_ARG1 "-EL ${MIPS_CXXFLAGS}")
+set(CMAKE_SYSTEM_PROCESSOR "mips64")
+
+# No runtime cpu detect for mips64-linux-gcc.
+if(CONFIG_RUNTIME_CPU_DETECT)
+ message("--- CONFIG_RUNTIME_CPU_DETECT not supported for mips64 targets.")
+endif()
+
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "" FORCE)
diff --git a/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake b/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake
new file mode 100644
index 000000000..c86cc27e3
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/ppc-linux-gcc.cmake
@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2018, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_PPC_LINUX_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+if("${CROSS}" STREQUAL "")
+
+ # Default the cross compiler prefix to something known to work.
+ set(CROSS powerpc64le-unknown-linux-gnu-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(AS_EXECUTABLE ${CROSS}as)
+set(CMAKE_SYSTEM_PROCESSOR "ppc")
+
+set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
diff --git a/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake b/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake
new file mode 100644
index 000000000..6b6f52cac
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86-ios-simulator.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_IOS_SIMULATOR_CMAKE_ 1)
+
+if(XCODE)
+
+ # TODO(tomfinegan): Handle ios sim builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "i386")
+set(CMAKE_OSX_ARCHITECTURES "i386")
+
+# Avoid noisy PIC/PIE warnings.
+set(CONFIG_PIC 1 CACHE NUMBER "")
+
+include("${CMAKE_CURRENT_LIST_DIR}/ios-simulator-common.cmake")
diff --git a/third_party/aom/build/cmake/toolchains/x86-linux.cmake b/third_party/aom/build/cmake/toolchains/x86-linux.cmake
new file mode 100644
index 000000000..c2a700bfe
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86-linux.cmake
@@ -0,0 +1,19 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_LINUX_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_C_COMPILER_ARG1 "-m32")
+set(CMAKE_CXX_COMPILER_ARG1 "-m32")
diff --git a/third_party/aom/build/cmake/toolchains/x86-macos.cmake b/third_party/aom/build/cmake/toolchains/x86-macos.cmake
new file mode 100644
index 000000000..7a46e06a9
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86-macos.cmake
@@ -0,0 +1,18 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+set(CMAKE_SYSTEM_NAME "Darwin")
+set(CMAKE_OSX_ARCHITECTURES "i386")
+set(CMAKE_C_COMPILER_ARG1 "-arch i386")
+set(CMAKE_CXX_COMPILER_ARG1 "-arch i386")
+
+# Apple tools always complain in 32 bit mode without PIC.
+set(CONFIG_PIC 1 CACHE NUMBER "")
diff --git a/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake
new file mode 100644
index 000000000..c986c4ee3
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86-mingw-gcc.cmake
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "x86")
+set(CMAKE_SYSTEM_NAME "Windows")
+set(CMAKE_C_COMPILER_ARG1 "-m32")
+set(CMAKE_CXX_COMPILER_ARG1 "-m32")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS i686-w64-mingw32-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+
+# Disable the use of the gtest's CMake support.
+set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake b/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake
new file mode 100644
index 000000000..d4b40ed09
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86_64-ios-simulator.cmake
@@ -0,0 +1,25 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_IOS_SIMULATOR_CMAKE_ 1)
+
+if(XCODE)
+
+ # TODO(tomfinegan): Handle ios sim builds in Xcode.
+ message(FATAL_ERROR "This toolchain does not support Xcode.")
+endif()
+
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+set(CMAKE_OSX_ARCHITECTURES "x86_64")
+
+include("${CMAKE_CURRENT_LIST_DIR}/ios-simulator-common.cmake")
diff --git a/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake b/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
new file mode 100644
index 000000000..00d94d5f1
--- /dev/null
+++ b/third_party/aom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_
+set(AOM_BUILD_CMAKE_TOOLCHAINS_X86_64_MINGW_GCC_CMAKE_ 1)
+
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+set(CMAKE_SYSTEM_NAME "Windows")
+
+if("${CROSS}" STREQUAL "")
+ set(CROSS x86_64-w64-mingw32-)
+endif()
+
+set(CMAKE_C_COMPILER ${CROSS}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS}g++)
+set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
+set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
+
+# Disable the use of the gtest's CMake support.
+set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/third_party/aom/build/cmake/util.cmake b/third_party/aom/build/cmake/util.cmake
new file mode 100644
index 000000000..b70ec4013
--- /dev/null
+++ b/third_party/aom/build/cmake/util.cmake
@@ -0,0 +1,171 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_UTIL_CMAKE_)
+ return()
+endif() # AOM_BUILD_CMAKE_UTIL_CMAKE_
+set(AOM_BUILD_CMAKE_UTIL_CMAKE_ 1)
+
+# Directory where generated sources will be written.
+set(AOM_GEN_SRC_DIR "${AOM_CONFIG_DIR}/gen_src")
+
+# Creates a dummy source file named ${basename}_dummy.${extension} in
+# $AOM_GEN_SRC_DIR and stores its full path in the $dummy_source_file variable
+# of the calling scope (the $out_file_path argument is currently unused).
+macro(create_dummy_source_file basename extension out_file_path)
+ set(dummy_source_file "${AOM_GEN_SRC_DIR}/${basename}_dummy.${extension}")
+ file(
+ WRITE
+ "${dummy_source_file}" "// Generated file. DO NOT EDIT!\n"
+ "// ${target_name} needs a ${extension} file to force link language, \n"
+ "// or to silence a harmless CMake warning: Ignore me.\n"
+ "void ${target_name}_dummy_function(void) {}\n")
+endmacro()
+
+# Convenience function for adding a dummy source file to $target_name using
+# $extension as the file extension. Wraps create_dummy_source_file().
+function(add_dummy_source_file_to_target target_name extension)
+ create_dummy_source_file("${target_name}" "${extension}" "dummy_source_file")
+ target_sources(${target_name} PRIVATE ${dummy_source_file})
+endfunction()
+
+# Sets the value of the variable referenced by $feature to $value, and reports
+# the change to the user via call to message(WARNING ...). $cause is expected to
+# be a configuration variable that conflicts with $feature in some way. This
+# function is a noop if $feature is already set to $value.
+function(change_config_and_warn feature value cause)
+ if(${feature} EQUAL ${value})
+ return()
+ endif()
+ set(${feature} ${value} PARENT_SCOPE)
+ if(${value} EQUAL 1)
+ set(verb "Enabled")
+ set(reason "required for")
+ else()
+ set(verb "Disabled")
+ set(reason "incompatible with")
+ endif()
+ set(warning_message "${verb} ${feature}, ${reason} ${cause}.")
+ message(WARNING "--- ${warning_message}")
+endfunction()
+
+# Extracts the version string from $version_file and returns it to the user via
+# $version_string_out_var. To achieve this VERSION_STRING_NOSP is located in
+# $version_file and then everything but the string literal assigned to the
+# variable is removed. Quotes and the leading 'v' are stripped from the returned
+# string.
+function(extract_version_string version_file version_string_out_var)
+ file(STRINGS "${version_file}" aom_version REGEX "VERSION_STRING_NOSP")
+ string(REPLACE "#define VERSION_STRING_NOSP " "" aom_version "${aom_version}")
+ string(REPLACE "\"" "" aom_version "${aom_version}")
+ string(REPLACE " " "" aom_version "${aom_version}")
+ string(FIND "${aom_version}" "v" v_pos)
+ if(${v_pos} EQUAL 0)
+ string(SUBSTRING "${aom_version}" 1 -1 aom_version)
+ endif()
+ set("${version_string_out_var}" "${aom_version}" PARENT_SCOPE)
+endfunction()
+
+# Sets CMake compiler launcher to $launcher_name when $launcher_name is found in
+# $PATH. Warns user about ignoring build flag $launcher_flag when $launcher_name
+# is not found in $PATH.
+function(set_compiler_launcher launcher_flag launcher_name)
+ find_program(launcher_path "${launcher_name}")
+ if(launcher_path)
+ set(CMAKE_C_COMPILER_LAUNCHER "${launcher_path}" PARENT_SCOPE)
+ set(CMAKE_CXX_COMPILER_LAUNCHER "${launcher_path}" PARENT_SCOPE)
+ message("--- Using ${launcher_name} as compiler launcher.")
+ else()
+ message(WARNING
+ "--- Cannot find ${launcher_name}, ${launcher_flag} ignored.")
+ endif()
+endfunction()
+
+# Sentinel value used to detect when a variable has been set via the -D argument
+# passed to CMake on the command line.
+set(cmake_cmdline_helpstring "No help, variable specified on the command line.")
+
+# Wrapper macro for set() that does some book keeping to help with storage of
+# build configuration information.
+#
+# Sets the default value for variable $name when the value of $name has not
+# already been set via the CMake command line.
+#
+# The names of variables defaulted through this macro are added to
+# $AOM_DETECT_VARS to facilitate build logging and diagnostics.
+macro(set_aom_detect_var name value type helpstring)
+ unset(list_index)
+ list(FIND AOM_DETECT_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_DETECT_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ set(${name} ${value} CACHE ${type} "${helpstring}")
+ mark_as_advanced(${name})
+ else()
+ message(
+ WARNING
+ "${name} has been set by CMake, but it may be overridden by the build "
+ "system during environment detection")
+ endif()
+endmacro()
+
+# Wrapper macro for set() that does some book keeping to help with storage of
+# build configuration information.
+#
+# Sets the default value for variable $name when the value of $name has not
+# already been set via the CMake command line.
+#
+# The names of variables defaulted through this macro are added to
+# $AOM_CONFIG_VARS to facilitate build logging and diagnostics.
+macro(set_aom_config_var name value type helpstring)
+ unset(list_index)
+ list(FIND AOM_CONFIG_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_CONFIG_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ set(${name} ${value} CACHE ${type} "${helpstring}")
+ endif()
+endmacro()
+
+# Wrapper macro for option() that does some book keeping to help with storage of
+# build configuration information.
+#
+# Sets the default value for variable $name when the value of $name has not
+# already been set via the CMake command line.
+#
+# The names of variables defaulted through this macro are added to
+# $AOM_OPTION_VARS to facilitate build logging and diagnostics.
+macro(set_aom_option_var name helpstring value)
+ unset(list_index)
+ list(FIND AOM_OPTION_VARS ${name} list_index)
+ if(${list_index} EQUAL -1)
+ list(APPEND AOM_OPTION_VARS ${name})
+ endif()
+
+ # Update the variable only when it does not carry the CMake assigned help
+ # string for variables specified via the command line.
+ unset(cache_helpstring)
+ get_property(cache_helpstring CACHE ${name} PROPERTY HELPSTRING)
+ if(NOT "${cache_helpstring}" STREQUAL "${cmake_cmdline_helpstring}")
+ option(${name} "${helpstring}" ${value})
+ endif()
+endmacro()
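For orientation, a minimal usage sketch of the three wrappers above; the
variable names are illustrative only, not real AOM settings:

  set_aom_detect_var(HAVE_EXAMPLE_EXT 0 NUMBER "Example ISA extension flag.")
  set_aom_config_var(CONFIG_EXAMPLE_FEATURE 1 NUMBER "Example codec feature flag.")
  set_aom_option_var(ENABLE_EXAMPLE_TOOLS "Build example command line tools." ON)

Each call records the variable name in the corresponding AOM_*_VARS list and
defers to any value already supplied with -D on the CMake command line.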
diff --git a/third_party/aom/build/cmake/version.cmake b/third_party/aom/build/cmake/version.cmake
new file mode 100644
index 000000000..d169b12ac
--- /dev/null
+++ b/third_party/aom/build/cmake/version.cmake
@@ -0,0 +1,57 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+cmake_minimum_required(VERSION 3.5)
+
+set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "GIT_EXECUTABLE"
+ "PERL_EXECUTABLE")
+
+foreach(arg ${REQUIRED_ARGS})
+ if("${${arg}}" STREQUAL "")
+ message(FATAL_ERROR "${arg} must not be empty.")
+ endif()
+endforeach()
+
+include("${AOM_ROOT}/build/cmake/util.cmake")
+
+# Generate the version string for this run.
+unset(aom_version)
+if(EXISTS "${GIT_EXECUTABLE}")
+ execute_process(COMMAND ${GIT_EXECUTABLE} --git-dir=${AOM_ROOT}/.git describe
+ OUTPUT_VARIABLE aom_version ERROR_QUIET)
+ string(STRIP "${aom_version}" aom_version)
+
+ # Remove the leading 'v' from the version string.
+ string(FIND "${aom_version}" "v" v_pos)
+ if(${v_pos} EQUAL 0)
+ string(SUBSTRING "${aom_version}" 1 -1 aom_version)
+ endif()
+endif()
+
+if("${aom_version}" STREQUAL "")
+ set(aom_version "${AOM_ROOT}/CHANGELOG")
+endif()
+
+unset(last_aom_version)
+if(EXISTS "${AOM_CONFIG_DIR}/config/aom_version.h")
+ extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h"
+ last_aom_version)
+endif()
+
+if(NOT "${aom_version}" STREQUAL "${last_aom_version}")
+
+ # TODO(tomfinegan): Perl dependency is unnecessary. CMake can do everything
+ # that is done by version.pl on its own (if a bit more verbose...).
+  execute_process(COMMAND
+                  ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/version.pl"
+                  --version_data=${aom_version}
+                  --version_filename=${AOM_CONFIG_DIR}/config/aom_version.h)
+endif()
diff --git a/third_party/aom/build/cmake/version.pl b/third_party/aom/build/cmake/version.pl
new file mode 100755
index 000000000..7d23f2b27
--- /dev/null
+++ b/third_party/aom/build/cmake/version.pl
@@ -0,0 +1,112 @@
+#!/usr/bin/env perl
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+use strict;
+use warnings;
+use 5.010;
+use Getopt::Long;
+
+my $git_desc = '';
+my $version_data;
+my $version_filename;
+GetOptions('version_data=s' => \$version_data,
+ 'version_filename=s' => \$version_filename) or
+ die("Invalid arg(s): $!");
+
+if (!defined $version_data || length($version_data) == 0 ||
+ !defined $version_filename || length($version_filename) == 0) {
+ die("--version_data and --version_filename are required.");
+}
+
+# Determine if $version_data is a filename or a git tag/description.
+my $version_string;
+chomp($version_data);
+if (-r $version_data) {
+ # $version_data is the path to the CHANGELOG. Parse the most recent version.
+ my $changelog_filename = $version_data;
+ open(my $changelog_file, '<', $changelog_filename) or
+ die("Unable to open CHANGELOG @ $changelog_filename: $!.");
+
+ while (my $line = <$changelog_file>) {
+ my @split_line = split(" ", $line, 3);
+ next if @split_line < 2;
+ $version_string = $split_line[1];
+ last if substr($version_string, 0, 1) eq "v";
+ }
+ close($changelog_file);
+} else {
+ # $version_data is either a tag name or a full git description, one of:
+ # tagName OR tagName-commitsSinceTag-shortCommitHash
+ # In either case we want the first element of the array returned by split.
+ $version_string = (split("-", $version_data))[0];
+ $git_desc = $version_data;
+}
+
+if (substr($version_string, 0, 1) eq "v") {
+ $version_string = substr($version_string, 1);
+}
+
+my @version_components = split('\.', $version_string, 4);
+my $version_major = $version_components[0];
+my $version_minor = $version_components[1];
+my $version_patch = $version_components[2];
+
+my $version_extra = "";
+if (length($git_desc) > 0) {
+ my @git_desc_components = split('-', $git_desc, 2);
+ $version_extra = $git_desc_components[1];
+}
+
+open(my $version_file, '>', $version_filename) or
+ die("Cannot open $version_filename: $!");
+
+my $version_packed = "((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))";
+my $year = (localtime)[5] + 1900;
+my $lic_block = << "EOF";
+/*
+ * Copyright (c) $year, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+EOF
+
+select $version_file;
+if (length($git_desc)) {
+ print << "EOF";
+$lic_block
+#define VERSION_MAJOR $version_major
+#define VERSION_MINOR $version_minor
+#define VERSION_PATCH $version_patch
+#define VERSION_EXTRA \"$version_extra\"
+#define VERSION_PACKED \\
+ $version_packed
+#define VERSION_STRING_NOSP \"$git_desc\"
+#define VERSION_STRING \" $git_desc\"
+EOF
+} else {
+ print << "EOF";
+$lic_block
+#define VERSION_MAJOR $version_major
+#define VERSION_MINOR $version_minor
+#define VERSION_PATCH $version_patch
+#define VERSION_EXTRA \"$version_extra\"
+#define VERSION_PACKED \\
+ $version_packed
+#define VERSION_STRING_NOSP \"v$version_string\"
+#define VERSION_STRING \" v$version_string\"
+EOF
+}
+close($version_file);
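As a worked illustration of the git-description branch above (the description
is hypothetical): for --version_data=v1.0.0-123-gabc1234 the script derives
major/minor/patch 1/0/0 and VERSION_EXTRA "123-gabc1234", so the generated
header ends with roughly:

  #define VERSION_MAJOR 1
  #define VERSION_MINOR 0
  #define VERSION_PATCH 0
  #define VERSION_EXTRA "123-gabc1234"
  #define VERSION_PACKED \
    ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))
  #define VERSION_STRING_NOSP "v1.0.0-123-gabc1234"
  #define VERSION_STRING " v1.0.0-123-gabc1234"

(The license banner emitted before these defines is omitted here.)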
diff --git a/third_party/aom/codereview.settings b/third_party/aom/codereview.settings
new file mode 100644
index 000000000..185e9344c
--- /dev/null
+++ b/third_party/aom/codereview.settings
@@ -0,0 +1,4 @@
+# This file is used by git cl to get repository specific information.
+GERRIT_HOST: True
+CODE_REVIEW_SERVER: aomedia-review.googlesource.com
+GERRIT_SQUASH_UPLOADS: False
diff --git a/third_party/aom/common/args.c b/third_party/aom/common/args.c
new file mode 100644
index 000000000..7131e24de
--- /dev/null
+++ b/third_party/aom/common/args.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/args.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/msvc.h"
+
+#if defined(__GNUC__) && __GNUC__
+extern void die(const char *fmt, ...) __attribute__((noreturn));
+#else
+extern void die(const char *fmt, ...);
+#endif
+
+struct arg arg_init(char **argv) {
+ struct arg a;
+
+ a.argv = argv;
+ a.argv_step = 1;
+ a.name = NULL;
+ a.val = NULL;
+ a.def = NULL;
+ return a;
+}
+
+char *ignore_front_spaces(const char *str) {
+ while (str[0] == ' ' || str[0] == '\t') ++str;
+ return (char *)str;
+}
+
+void ignore_end_spaces(char *str) {
+ char *end = str + strlen(str);
+ while (end > str && (end[0] == ' ' || end[0] == '\t' || end[0] == '\n' ||
+ end[0] == '\r' || end[0] == '\0'))
+ --end;
+ if (end >= str) end[1] = '\0';
+}
+
+int arg_cfg(int *argc, char ***argv, const char *file) {
+ char **argv_local = (char **)*argv;
+ char **argv_org = (char **)*argv;
+ char line[1024 * 10];
+ FILE *f = fopen(file, "r");
+ if (!f) return 1;
+
+ while (fgets(line, sizeof(line) - 1, f)) {
+ char *actual_line = ignore_front_spaces(line);
+ char *left, *right, *comment;
+ size_t length = strlen(actual_line);
+
+ if (length == 0 || actual_line[0] == '#') continue;
+ right = strchr(actual_line, ':');
+ if (right == NULL) continue;
+ right[0] = '\0';
+
+ left = ignore_front_spaces(actual_line);
+ right = ignore_front_spaces(right + 1);
+
+ comment = strchr(right, '#');
+ if (comment != NULL) comment[0] = '\0';
+
+ ignore_end_spaces(left);
+ ignore_end_spaces(right);
+
+ char **new_args = argv_dup(*argc, (const char **)argv_local);
+ char *new_line = (char *)malloc(sizeof(*new_line) * 128);
+
+ if (argv_local != argv_org) free(argv_local);
+
+ if (!strcmp(right, "ON"))
+ snprintf(new_line, sizeof(*new_line) * 128, "--%s", left);
+ else
+ snprintf(new_line, sizeof(*new_line) * 128, "--%s=%s", left, right);
+
+ new_args[(*argc) - 1] = new_args[(*argc) - 2];
+ new_args[(*argc) - 2] = new_line;
+ argv_local = new_args;
+ *argv = new_args;
+ (*argc)++;
+ }
+ fclose(f);
+ return 0;
+}
+
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) {
+ struct arg arg;
+
+ if (!argv[0] || argv[0][0] != '-') return 0;
+
+ arg = arg_init(argv);
+
+ if (def->short_name && strlen(arg.argv[0]) == strlen(def->short_name) + 1 &&
+ !strcmp(arg.argv[0] + 1, def->short_name)) {
+ arg.name = arg.argv[0] + 1;
+ arg.val = def->has_val ? arg.argv[1] : NULL;
+ arg.argv_step = def->has_val ? 2 : 1;
+ } else if (def->long_name) {
+ const size_t name_len = strlen(def->long_name);
+
+ if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' &&
+ !strncmp(arg.argv[0] + 2, def->long_name, name_len) &&
+ (arg.argv[0][name_len + 2] == '=' ||
+ arg.argv[0][name_len + 2] == '\0')) {
+ arg.name = arg.argv[0] + 2;
+ arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL;
+ arg.argv_step = 1;
+ }
+ }
+
+ if (arg.name && !arg.val && def->has_val)
+ die("Error: option %s requires argument.\n", arg.name);
+
+ if (arg.name && arg.val && !def->has_val)
+ die("Error: option %s requires no argument.\n", arg.name);
+
+ if (arg.name && (arg.val || !def->has_val)) {
+ arg.def = def;
+ *arg_ = arg;
+ return 1;
+ }
+
+ return 0;
+}
+
+const char *arg_next(struct arg *arg) {
+ if (arg->argv[0]) arg->argv += arg->argv_step;
+
+ return *arg->argv;
+}
+
+char **argv_dup(int argc, const char **argv) {
+ char **new_argv = malloc((argc + 1) * sizeof(*argv));
+
+ memcpy(new_argv, argv, argc * sizeof(*argv));
+ new_argv[argc] = NULL;
+ return new_argv;
+}
+
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs) {
+ char option_text[40] = { 0 };
+
+ for (; *defs; defs++) {
+ const struct arg_def *def = *defs;
+ char *short_val = def->has_val ? " <arg>" : "";
+ char *long_val = def->has_val ? "=<arg>" : "";
+
+ if (def->short_name && def->long_name) {
+ char *comma = def->has_val ? "," : ", ";
+
+ snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val,
+ comma, def->long_name, long_val);
+ } else if (def->short_name)
+ snprintf(option_text, 37, "-%s%s", def->short_name, short_val);
+ else if (def->long_name)
+ snprintf(option_text, 37, " --%s%s", def->long_name, long_val);
+
+ fprintf(fp, " %-37s\t%s\n", option_text, def->desc);
+
+ if (def->enums) {
+ const struct arg_enum_list *listptr;
+
+ fprintf(fp, " %-37s\t ", "");
+
+ for (listptr = def->enums; listptr->name; listptr++)
+ fprintf(fp, "%s%s", listptr->name, listptr[1].name ? ", " : "\n");
+ }
+ }
+}
+
+unsigned int arg_parse_uint(const struct arg *arg) {
+ char *endptr;
+ const unsigned long rawval = strtoul(arg->val, &endptr, 10); // NOLINT
+
+ if (arg->val[0] != '\0' && endptr[0] == '\0') {
+ if (rawval <= UINT_MAX) return (unsigned int)rawval;
+
+ die("Option %s: Value %lu out of range for unsigned int\n", arg->name,
+ rawval);
+ }
+
+ die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+ return 0;
+}
+
+int arg_parse_int(const struct arg *arg) {
+ char *endptr;
+ const long rawval = strtol(arg->val, &endptr, 10); // NOLINT
+
+ if (arg->val[0] != '\0' && endptr[0] == '\0') {
+ if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval;
+
+ die("Option %s: Value %ld out of range for signed int\n", arg->name,
+ rawval);
+ }
+
+ die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+ return 0;
+}
+
+struct aom_rational {
+ int num; /**< fraction numerator */
+ int den; /**< fraction denominator */
+};
+struct aom_rational arg_parse_rational(const struct arg *arg) {
+ long int rawval;
+ char *endptr;
+ struct aom_rational rat;
+
+ /* parse numerator */
+ rawval = strtol(arg->val, &endptr, 10);
+
+ if (arg->val[0] != '\0' && endptr[0] == '/') {
+ if (rawval >= INT_MIN && rawval <= INT_MAX)
+ rat.num = (int)rawval;
+ else
+ die("Option %s: Value %ld out of range for signed int\n", arg->name,
+ rawval);
+ } else
+ die("Option %s: Expected / at '%c'\n", arg->name, *endptr);
+
+ /* parse denominator */
+ rawval = strtol(endptr + 1, &endptr, 10);
+
+ if (arg->val[0] != '\0' && endptr[0] == '\0') {
+ if (rawval >= INT_MIN && rawval <= INT_MAX)
+ rat.den = (int)rawval;
+ else
+ die("Option %s: Value %ld out of range for signed int\n", arg->name,
+ rawval);
+ } else
+ die("Option %s: Invalid character '%c'\n", arg->name, *endptr);
+
+ return rat;
+}
+
+int arg_parse_enum(const struct arg *arg) {
+ const struct arg_enum_list *listptr;
+ long int rawval;
+ char *endptr;
+
+ /* First see if the value can be parsed as a raw value */
+ rawval = strtol(arg->val, &endptr, 10);
+ if (arg->val[0] != '\0' && endptr[0] == '\0') {
+ /* Got a raw value, make sure it's valid */
+ for (listptr = arg->def->enums; listptr->name; listptr++)
+ if (listptr->val == rawval) return (int)rawval;
+ }
+
+ /* Next see if it can be parsed as a string */
+ for (listptr = arg->def->enums; listptr->name; listptr++)
+ if (!strcmp(arg->val, listptr->name)) return listptr->val;
+
+ die("Option %s: Invalid value '%s'\n", arg->name, arg->val);
+ return 0;
+}
+
+int arg_parse_enum_or_int(const struct arg *arg) {
+ if (arg->def->enums) return arg_parse_enum(arg);
+ return arg_parse_int(arg);
+}
+
+// parse a comma separated list of at most n integers
+// return the number of elements in the list
+int arg_parse_list(const struct arg *arg, int *list, int n) {
+ const char *ptr = arg->val;
+ char *endptr;
+ int i = 0;
+
+ while (ptr[0] != '\0') {
+    const long rawval = strtol(ptr, &endptr, 10);
+ if (rawval < INT_MIN || rawval > INT_MAX) {
+ die("Option %s: Value %ld out of range for signed int\n", arg->name,
+ rawval);
+ } else if (i >= n) {
+ die("Option %s: List has more than %d entries\n", arg->name, n);
+ } else if (*endptr == ',') {
+ endptr++;
+ } else if (*endptr != '\0') {
+ die("Option %s: Bad list separator '%c'\n", arg->name, *endptr);
+ }
+ list[i++] = (int)rawval;
+ ptr = endptr;
+ }
+ return i;
+}
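A minimal sketch of driving arg_cfg(); the file name is hypothetical, and
linking also requires a die() implementation, which this file only declares:

  #include "common/args.h"

  /* Expand "name: value" lines ("name: ON" for bare flags, '#' for comments)
   * from a config file into synthetic "--name=value" argv entries.
   * Returns non-zero when the file cannot be opened. */
  static int apply_example_config(int *argc, char ***argv) {
    return arg_cfg(argc, argv, "example.cfg");
  }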
diff --git a/third_party/aom/common/args.h b/third_party/aom/common/args.h
new file mode 100644
index 000000000..6a2664269
--- /dev/null
+++ b/third_party/aom/common/args.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_ARGS_H_
+#define AOM_COMMON_ARGS_H_
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct arg {
+ char **argv;
+ const char *name;
+ const char *val;
+ unsigned int argv_step;
+ const struct arg_def *def;
+};
+
+struct arg_enum_list {
+ const char *name;
+ int val;
+};
+#define ARG_ENUM_LIST_END \
+ { 0 }
+
+typedef struct arg_def {
+ const char *short_name;
+ const char *long_name;
+ int has_val;
+ const char *desc;
+ const struct arg_enum_list *enums;
+} arg_def_t;
+#define ARG_DEF(s, l, v, d) \
+ { s, l, v, d, NULL }
+#define ARG_DEF_ENUM(s, l, v, d, e) \
+ { s, l, v, d, e }
+#define ARG_DEF_LIST_END \
+ { 0 }
+
+struct arg arg_init(char **argv);
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
+char *ignore_front_spaces(const char *str);
+void ignore_end_spaces(char *str);
+int arg_cfg(int *argc, char ***argv, const char *file);
+const char *arg_next(struct arg *arg);
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
+char **argv_dup(int argc, const char **argv);
+
+unsigned int arg_parse_uint(const struct arg *arg);
+int arg_parse_int(const struct arg *arg);
+struct aom_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum(const struct arg *arg);
+int arg_parse_enum_or_int(const struct arg *arg);
+int arg_parse_list(const struct arg *arg, int *list, int n);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_ARGS_H_
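For orientation, a hedged sketch of the usual pattern for this API: define
arg_def_t entries with ARG_DEF, then walk argv with arg_match() and the typed
parsers. The option below is illustrative, and a die() implementation must be
supplied at link time:

  #include "common/args.h"

  /* Illustrative option; not part of the real aom tools. */
  static const arg_def_t example_threads =
      ARG_DEF("t", "threads", 1, "Number of worker threads");

  /* Scan a NULL-terminated argv-style array for -t/--threads. */
  static unsigned int parse_example_threads(char **argv) {
    unsigned int threads = 1;
    struct arg arg;
    for (char **argi = argv; *argi; argi += arg.argv_step) {
      arg.argv_step = 1; /* arg_match() sets 2 for the "-t <value>" form. */
      if (arg_match(&arg, &example_threads, argi)) threads = arg_parse_uint(&arg);
    }
    return threads;
  }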
diff --git a/third_party/aom/common/av1_config.c b/third_party/aom/common/av1_config.c
new file mode 100644
index 000000000..e8decf76f
--- /dev/null
+++ b/third_party/aom/common/av1_config.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "av1/common/obu_util.h"
+#include "common/av1_config.h"
+#include "config/aom_config.h"
+
+// Helper macros to reduce verbosity required to check for read errors.
+//
+// Note that when using these macros, even single line if statements should use
+// curly braces to avoid unexpected behavior because all but the
+// AV1C_POP_ERROR_HANDLER_DATA() macro consist of multiple statements.
+#define AV1C_READ_BIT_OR_RETURN_ERROR(field) \
+ int field = 0; \
+ do { \
+ field = aom_rb_read_bit(reader); \
+ if (result == -1) { \
+ fprintf(stderr, \
+ "av1c: Error reading bit for " #field ", value=%d result=%d.\n", \
+ field, result); \
+ return -1; \
+ } \
+ } while (0)
+
+#define AV1C_READ_BITS_OR_RETURN_ERROR(field, length) \
+ int field = 0; \
+ do { \
+ field = aom_rb_read_literal(reader, (length)); \
+ if (result == -1) { \
+ fprintf(stderr, \
+ "av1c: Could not read bits for " #field \
+ ", value=%d result=%d.\n", \
+ field, result); \
+ return -1; \
+ } \
+ } while (0)
+
+// Helper macros for setting/restoring the error handler data in
+// aom_read_bit_buffer.
+#define AV1C_PUSH_ERROR_HANDLER_DATA(new_data) \
+ void *original_error_handler_data = NULL; \
+ do { \
+ original_error_handler_data = reader->error_handler_data; \
+ reader->error_handler_data = &new_data; \
+ } while (0)
+
+#define AV1C_POP_ERROR_HANDLER_DATA() \
+ do { \
+ reader->error_handler_data = original_error_handler_data; \
+ } while (0)
+
+static const size_t kAv1cSize = 4;
+
+static void bitreader_error_handler(void *data) {
+ int *error_val = (int *)data;
+ *error_val = -1;
+}
+
+// Parse the AV1 timing_info() structure:
+// timing_info( ) {
+// num_units_in_display_tick f(32)
+// time_scale f(32)
+// equal_picture_interval f(1)
+// if (equal_picture_interval)
+// num_ticks_per_picture_minus_1 uvlc()
+// }
+static int parse_timing_info(struct aom_read_bit_buffer *reader) {
+ int result = 0;
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_display_tick, 32);
+ AV1C_READ_BITS_OR_RETURN_ERROR(time_scale, 32);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(equal_picture_interval);
+ if (equal_picture_interval) {
+ uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(reader);
+ if (result == -1) {
+ fprintf(stderr,
+ "av1c: Could not read bits for "
+ "num_ticks_per_picture_minus_1, value=%u.\n",
+ num_ticks_per_picture_minus_1);
+ return result;
+ }
+ }
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
+
+// Parse the AV1 decoder_model_info() structure:
+// decoder_model_info( ) {
+// buffer_delay_length_minus_1 f(5)
+// num_units_in_decoding_tick f(32)
+// buffer_removal_time_length_minus_1 f(5)
+// frame_presentation_time_length_minus_1 f(5)
+// }
+//
+// Returns -1 upon failure, or the value of buffer_delay_length_minus_1 + 1.
+static int parse_decoder_model_info(struct aom_read_bit_buffer *reader) {
+ int result = 0;
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(buffer_delay_length_minus_1, 5);
+ AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_decoding_tick, 32);
+ AV1C_READ_BITS_OR_RETURN_ERROR(buffer_removal_time_length_minus_1, 5);
+ AV1C_READ_BITS_OR_RETURN_ERROR(frame_presentation_time_length_minus_1, 5);
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return buffer_delay_length_minus_1 + 1;
+}
+
+// Parse the AV1 operating_parameters_info() structure:
+// operating_parameters_info( op ) {
+// n = buffer_delay_length_minus_1 + 1
+// decoder_buffer_delay[ op ] f(n)
+// encoder_buffer_delay[ op ] f(n)
+// low_delay_mode_flag[ op ] f(1)
+// }
+static int parse_operating_parameters_info(struct aom_read_bit_buffer *reader,
+ int buffer_delay_length_minus_1) {
+ int result = 0;
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ const int buffer_delay_length = buffer_delay_length_minus_1 + 1;
+ AV1C_READ_BITS_OR_RETURN_ERROR(decoder_buffer_delay, buffer_delay_length);
+ AV1C_READ_BITS_OR_RETURN_ERROR(encoder_buffer_delay, buffer_delay_length);
+ AV1C_READ_BIT_OR_RETURN_ERROR(low_delay_mode_flag);
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
+
+// Parse the AV1 color_config() structure. See:
+// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=44
+static int parse_color_config(struct aom_read_bit_buffer *reader,
+ Av1Config *config) {
+ int result = 0;
+ AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth);
+ config->high_bitdepth = high_bitdepth;
+
+ int bit_depth = 0;
+ if (config->seq_profile == 2 && config->high_bitdepth) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit);
+ config->twelve_bit = twelve_bit;
+ bit_depth = config->twelve_bit ? 12 : 10;
+ } else {
+ bit_depth = config->high_bitdepth ? 10 : 8;
+ }
+
+ if (config->seq_profile != 1) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(mono_chrome);
+ config->monochrome = mono_chrome;
+ }
+
+ int color_primaries = AOM_CICP_CP_UNSPECIFIED;
+ int transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+ int matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_description_present_flag);
+ if (color_description_present_flag) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(color_primaries_val, 8);
+ color_primaries = color_primaries_val;
+ AV1C_READ_BITS_OR_RETURN_ERROR(transfer_characteristics_val, 8);
+ transfer_characteristics = transfer_characteristics_val;
+ AV1C_READ_BITS_OR_RETURN_ERROR(matrix_coefficients_val, 8);
+ matrix_coefficients = matrix_coefficients_val;
+ }
+
+ if (config->monochrome) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_range);
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 1;
+ } else if (color_primaries == AOM_CICP_CP_BT_709 &&
+ transfer_characteristics == AOM_CICP_TC_SRGB &&
+ matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ config->chroma_subsampling_x = 0;
+ config->chroma_subsampling_y = 0;
+ } else {
+ AV1C_READ_BIT_OR_RETURN_ERROR(color_range);
+ if (config->seq_profile == 0) {
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 1;
+ } else if (config->seq_profile == 1) {
+ config->chroma_subsampling_x = 0;
+ config->chroma_subsampling_y = 0;
+ } else {
+ if (bit_depth == 12) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_x);
+ config->chroma_subsampling_x = subsampling_x;
+ if (subsampling_x) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_y);
+ config->chroma_subsampling_y = subsampling_y;
+ } else {
+ config->chroma_subsampling_y = 0;
+ }
+ } else {
+ config->chroma_subsampling_x = 1;
+ config->chroma_subsampling_y = 0;
+ }
+ }
+
+ if (config->chroma_subsampling_x && config->chroma_subsampling_y) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2);
+ config->chroma_sample_position = chroma_sample_position;
+ }
+ }
+
+ if (!config->monochrome) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(separate_uv_delta_q);
+ }
+
+ AV1C_POP_ERROR_HANDLER_DATA();
+ return result;
+}
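+
+// Note (added for clarity): in the logic above, (chroma_subsampling_x,
+// chroma_subsampling_y) == (1, 1) corresponds to 4:2:0, (1, 0) to 4:2:2 and
+// (0, 0) to 4:4:4; monochrome streams are recorded as (1, 1).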
+
+// Parse AV1 Sequence Header OBU. See:
+// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=41
+static int parse_sequence_header(const uint8_t *const buffer, size_t length,
+ Av1Config *config) {
+ int result = 0;
+ // The reader instance is local to this function, but a pointer to the
+ // reader instance is used within this function and throughout this file to
+ // allow use of the helper macros that reduce parse error checking verbosity.
+ struct aom_read_bit_buffer reader_instance = {
+ buffer, buffer + length, 0, &result, bitreader_error_handler
+ };
+ struct aom_read_bit_buffer *reader = &reader_instance;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3);
+ config->seq_profile = seq_profile;
+ AV1C_READ_BIT_OR_RETURN_ERROR(still_picture);
+ AV1C_READ_BIT_OR_RETURN_ERROR(reduced_still_picture_header);
+ if (reduced_still_picture_header) {
+ config->initial_presentation_delay_present = 0;
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5);
+ config->seq_level_idx_0 = seq_level_idx_0;
+ config->seq_tier_0 = 0;
+ } else {
+ int has_decoder_model = 0;
+ int buffer_delay_length = 0;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(timing_info_present_flag);
+ if (timing_info_present_flag) {
+ if (parse_timing_info(reader) != 0) return -1;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_info_present_flag);
+ if (decoder_model_info_present_flag &&
+ (buffer_delay_length = parse_decoder_model_info(reader)) == -1) {
+ return -1;
+ }
+ has_decoder_model = 1;
+ }
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present);
+ config->initial_presentation_delay_present =
+ initial_presentation_delay_present;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(operating_points_cnt_minus_1, 5);
+ const int num_operating_points = operating_points_cnt_minus_1 + 1;
+
+ for (int op_index = 0; op_index < num_operating_points; ++op_index) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(operating_point_idc, 12);
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx, 5);
+
+ int seq_tier = 0;
+ if (seq_level_idx > 7) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_this_op);
+ seq_tier = seq_tier_this_op;
+ }
+
+ if (has_decoder_model) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_present_for_op);
+ if (decoder_model_present_for_op) {
+ if (parse_operating_parameters_info(reader, buffer_delay_length) ==
+ -1) {
+ return -1;
+ }
+ }
+ }
+
+ if (config->initial_presentation_delay_present) {
+ // Skip the initial presentation delay bits if present since this
+ // function has no access to the data required to properly set the
+ // field.
+ AV1C_READ_BIT_OR_RETURN_ERROR(
+ initial_presentation_delay_present_for_this_op);
+ if (initial_presentation_delay_present_for_this_op) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_1, 4);
+ }
+ }
+
+ if (op_index == 0) {
+ // Av1Config needs only the values from the first operating point.
+ config->seq_level_idx_0 = seq_level_idx;
+ config->seq_tier_0 = seq_tier;
+ config->initial_presentation_delay_present = 0;
+ config->initial_presentation_delay_minus_one = 0;
+ }
+ }
+ }
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(frame_width_bits_minus_1, 4);
+ AV1C_READ_BITS_OR_RETURN_ERROR(frame_height_bits_minus_1, 4);
+ AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_width_minus_1,
+ frame_width_bits_minus_1 + 1);
+ AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1,
+ frame_height_bits_minus_1 + 1);
+
+ int frame_id_numbers_present = 0;
+ if (!reduced_still_picture_header) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag);
+ frame_id_numbers_present = frame_id_numbers_present_flag;
+ }
+
+ if (frame_id_numbers_present) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(delta_frame_id_length_minus_2, 4);
+ AV1C_READ_BITS_OR_RETURN_ERROR(additional_frame_id_length_minus_1, 3);
+ }
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(use_128x128_superblock);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_filter_intra);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_intra_edge_filter);
+
+ if (!reduced_still_picture_header) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_interintra_compound);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_masked_compound);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_warped_motion);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_dual_filter);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint);
+ if (enable_order_hint) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_jnt_comp);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs);
+ }
+
+ const int SELECT_SCREEN_CONTENT_TOOLS = 2;
+ int seq_force_screen_content_tools = SELECT_SCREEN_CONTENT_TOOLS;
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_screen_content_tools);
+ if (!seq_choose_screen_content_tools) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_screen_content_tools_val);
+ seq_force_screen_content_tools = seq_force_screen_content_tools_val;
+ }
+
+ if (seq_force_screen_content_tools > 0) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_integer_mv);
+
+ if (!seq_choose_integer_mv) {
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_integer_mv);
+ }
+ }
+
+ if (enable_order_hint) {
+ AV1C_READ_BITS_OR_RETURN_ERROR(order_hint_bits_minus_1, 3);
+ }
+ }
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_superres);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_cdef);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_restoration);
+
+ if (parse_color_config(reader, config) != 0) {
+ fprintf(stderr, "av1c: color_config() parse failed.\n");
+ return -1;
+ }
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(film_grain_params_present);
+ return 0;
+}
+
+int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb,
+ Av1Config *config) {
+ if (!buffer || length == 0 || !config) {
+ return -1;
+ }
+
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+
+ size_t sequence_header_length = 0;
+ size_t obu_header_length = 0;
+ if (aom_read_obu_header_and_size(buffer, length, is_annexb, &obu_header,
+ &sequence_header_length,
+ &obu_header_length) != AOM_CODEC_OK ||
+ obu_header.type != OBU_SEQUENCE_HEADER ||
+ sequence_header_length + obu_header_length > length) {
+ return -1;
+ }
+
+ memset(config, 0, sizeof(*config));
+ config->marker = 1;
+ config->version = 1;
+ return parse_sequence_header(buffer + obu_header_length,
+ sequence_header_length, config);
+}
+
+int read_av1config(const uint8_t *buffer, size_t buffer_length,
+ size_t *bytes_read, Av1Config *config) {
+ if (!buffer || buffer_length < kAv1cSize || !bytes_read || !config) return -1;
+
+ *bytes_read = 0;
+
+ int result = 0;
+ struct aom_read_bit_buffer reader_instance = {
+ buffer, buffer + buffer_length, 0, &result, bitreader_error_handler
+ };
+ struct aom_read_bit_buffer *reader = &reader_instance;
+
+ memset(config, 0, sizeof(*config));
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(marker);
+ config->marker = marker;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(version, 7);
+ config->version = version;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3);
+ config->seq_profile = seq_profile;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5);
+ config->seq_level_idx_0 = seq_level_idx_0;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_0);
+ config->seq_tier_0 = seq_tier_0;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth);
+ config->high_bitdepth = high_bitdepth;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit);
+ config->twelve_bit = twelve_bit;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(monochrome);
+ config->monochrome = monochrome;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_x);
+ config->chroma_subsampling_x = chroma_subsampling_x;
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_y);
+ config->chroma_subsampling_y = chroma_subsampling_y;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2);
+ config->chroma_sample_position = chroma_sample_position;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(reserved, 3);
+
+ AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present);
+ config->initial_presentation_delay_present =
+ initial_presentation_delay_present;
+
+ AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_one, 4);
+ config->initial_presentation_delay_minus_one =
+ initial_presentation_delay_minus_one;
+
+ *bytes_read = aom_rb_bytes_read(reader);
+
+ return 0;
+}
+
+int write_av1config(const Av1Config *config, size_t capacity,
+ size_t *bytes_written, uint8_t *buffer) {
+ if (!config || !buffer || capacity < kAv1cSize || !bytes_written) return -1;
+
+ *bytes_written = 0;
+ memset(buffer, 0, kAv1cSize);
+
+ struct aom_write_bit_buffer writer = { buffer, 0 };
+
+ aom_wb_write_bit(&writer, config->marker);
+ aom_wb_write_literal(&writer, config->version, 7);
+ aom_wb_write_literal(&writer, config->seq_profile, 3);
+ aom_wb_write_literal(&writer, config->seq_level_idx_0, 5);
+ aom_wb_write_bit(&writer, config->seq_tier_0);
+ aom_wb_write_bit(&writer, config->high_bitdepth);
+ aom_wb_write_bit(&writer, config->twelve_bit);
+ aom_wb_write_bit(&writer, config->monochrome);
+ aom_wb_write_bit(&writer, config->chroma_subsampling_x);
+ aom_wb_write_bit(&writer, config->chroma_subsampling_y);
+ aom_wb_write_literal(&writer, config->chroma_sample_position, 2);
+ aom_wb_write_literal(&writer, 0, 3); // reserved
+ aom_wb_write_bit(&writer, config->initial_presentation_delay_present);
+
+ if (config->initial_presentation_delay_present) {
+ aom_wb_write_literal(&writer, config->initial_presentation_delay_minus_one,
+ 4);
+ } else {
+ aom_wb_write_literal(&writer, 0, 4); // reserved
+ }
+
+ *bytes_written = aom_wb_bytes_written(&writer);
+ return 0;
+}
+
+#undef AV1C_READ_BIT_OR_RETURN_ERROR
+#undef AV1C_READ_BITS_OR_RETURN_ERROR
+#undef AV1C_PUSH_ERROR_HANDLER_DATA
+#undef AV1C_POP_ERROR_HANDLER_DATA
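+
+// Illustrative sketch (added for clarity, not part of upstream libaom): a
+// minimal round trip through read_av1config()/write_av1config() above. The
+// 4-byte input is an example record describing an assumed Main-profile,
+// 8-bit, 4:2:0 stream; the byte values follow the fixed 32-bit layout parsed
+// by read_av1config() and should survive the round trip unchanged.
+#if 0
+static int av1c_round_trip_example(void) {
+  // 0x81 0x08 0x0c 0x00: marker=1, version=1, seq_profile=0,
+  // seq_level_idx_0=8, seq_tier_0=0, 8-bit, 4:2:0, no presentation delay.
+  const uint8_t av1c[4] = { 0x81, 0x08, 0x0c, 0x00 };
+  Av1Config config;
+  size_t bytes_read = 0;
+  if (read_av1config(av1c, sizeof(av1c), &bytes_read, &config) != 0) return -1;
+
+  uint8_t out[4] = { 0 };
+  size_t bytes_written = 0;
+  if (write_av1config(&config, sizeof(out), &bytes_written, out) != 0)
+    return -1;
+  return memcmp(av1c, out, sizeof(out));  // expected to compare equal (0)
+}
+#endif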
diff --git a/third_party/aom/common/av1_config.h b/third_party/aom/common/av1_config.h
new file mode 100644
index 000000000..a15bedb30
--- /dev/null
+++ b/third_party/aom/common/av1_config.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_AV1_CONFIG_H_
+#define AOM_COMMON_AV1_CONFIG_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Struct representing ISOBMFF/Matroska AV1 config. See:
+// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-syntax
+//
+// The AV1 config has the following format:
+//
+// unsigned int (1) marker = 1;
+// unsigned int (7) version = 1;
+// unsigned int (3) seq_profile;
+// unsigned int (5) seq_level_idx_0;
+// unsigned int (1) seq_tier_0;
+// unsigned int (1) high_bitdepth;
+// unsigned int (1) twelve_bit;
+// unsigned int (1) monochrome;
+// unsigned int (1) chroma_subsampling_x;
+// unsigned int (1) chroma_subsampling_y;
+// unsigned int (2) chroma_sample_position;
+// unsigned int (3) reserved = 0;
+//
+// unsigned int (1) initial_presentation_delay_present;
+// if (initial_presentation_delay_present) {
+// unsigned int (4) initial_presentation_delay_minus_one;
+// } else {
+// unsigned int (4) reserved = 0;
+// }
+//
+// unsigned int (8)[] configOBUs;
+//
+// Note: get_av1config_from_obu() does not currently store 'configOBUs' data, so
+// the field is omitted.
+typedef struct _Av1Config {
+ uint8_t marker;
+ uint8_t version;
+ uint8_t seq_profile;
+ uint8_t seq_level_idx_0;
+ uint8_t seq_tier_0;
+ uint8_t high_bitdepth;
+ uint8_t twelve_bit;
+ uint8_t monochrome;
+ uint8_t chroma_subsampling_x;
+ uint8_t chroma_subsampling_y;
+ uint8_t chroma_sample_position;
+ uint8_t initial_presentation_delay_present;
+ uint8_t initial_presentation_delay_minus_one;
+} Av1Config;
+
+// Attempts to parse a Sequence Header OBU and set the parameters of 'config'.
+// Returns 0 upon success, and -1 upon failure. 'buffer' can contain multiple
+// OBUs, but the Sequence Header OBU must be the first OBU within the buffer.
+int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb,
+ Av1Config *config);
+
+// Attempts to parse an AV1 config from 'buffer'. Returns 0 upon success.
+// Returns -1 when 'buffer_length' is less than 4, when passed NULL pointers, or
+// when parsing of 'buffer' fails.
+int read_av1config(const uint8_t *buffer, size_t buffer_length,
+ size_t *bytes_read, Av1Config *config);
+
+// Writes 'config' to 'buffer'. Returns 0 upon successful write to 'buffer'.
+// Returns -1 when passed NULL pointers or when 'capacity' is insufficient.
+int write_av1config(const Av1Config *config, size_t capacity,
+ size_t *bytes_written, uint8_t *buffer);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_AV1_CONFIG_H_
diff --git a/third_party/aom/common/ivfdec.c b/third_party/aom/common/ivfdec.c
new file mode 100644
index 000000000..80d73b04c
--- /dev/null
+++ b/third_party/aom/common/ivfdec.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/ivfdec.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/sanitizer.h"
+
+static const char *IVF_SIGNATURE = "DKIF";
+
+static void fix_framerate(int *num, int *den) {
+ if (*den <= 0 || *den >= 1000000000 || *num <= 0 || *num >= 1000) {
+    // Framerate seems to be invalid; default to 30 fps.
+ *num = 30;
+ *den = 1;
+ }
+}
+
+int file_is_ivf(struct AvxInputContext *input_ctx) {
+ char raw_hdr[32];
+ int is_ivf = 0;
+
+ if (fread(raw_hdr, 1, 32, input_ctx->file) == 32) {
+ if (memcmp(IVF_SIGNATURE, raw_hdr, 4) == 0) {
+ is_ivf = 1;
+
+ if (mem_get_le16(raw_hdr + 4) != 0) {
+ fprintf(stderr,
+ "Error: Unrecognized IVF version! This file may not"
+ " decode properly.");
+ }
+
+ input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
+ input_ctx->width = mem_get_le16(raw_hdr + 12);
+ input_ctx->height = mem_get_le16(raw_hdr + 14);
+ input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
+ input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
+ fix_framerate(&input_ctx->framerate.numerator,
+ &input_ctx->framerate.denominator);
+ }
+ }
+
+ if (!is_ivf) {
+ rewind(input_ctx->file);
+ input_ctx->detect.buf_read = 0;
+ } else {
+ input_ctx->detect.position = 4;
+ }
+ return is_ivf;
+}
+
+int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
+ size_t *buffer_size, aom_codec_pts_t *pts) {
+ char raw_header[IVF_FRAME_HDR_SZ] = { 0 };
+ size_t frame_size = 0;
+
+ if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
+ if (!feof(infile)) warn("Failed to read frame size");
+ } else {
+ frame_size = mem_get_le32(raw_header);
+
+ if (frame_size > 256 * 1024 * 1024) {
+ warn("Read invalid frame size (%u)", (unsigned int)frame_size);
+ frame_size = 0;
+ }
+
+ if (frame_size > *buffer_size) {
+ uint8_t *new_buffer = (uint8_t *)realloc(*buffer, 2 * frame_size);
+
+ if (new_buffer) {
+ *buffer = new_buffer;
+ *buffer_size = 2 * frame_size;
+ } else {
+ warn("Failed to allocate compressed data buffer");
+ frame_size = 0;
+ }
+ }
+
+ if (pts) {
+ *pts = mem_get_le32(&raw_header[4]);
+ *pts += ((aom_codec_pts_t)mem_get_le32(&raw_header[8]) << 32);
+ }
+ }
+
+ if (!feof(infile)) {
+ ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size);
+ if (fread(*buffer, 1, frame_size, infile) != frame_size) {
+ warn("Failed to read full frame");
+ return 1;
+ }
+
+ ASAN_POISON_MEMORY_REGION(*buffer + frame_size, *buffer_size - frame_size);
+ *bytes_read = frame_size;
+ return 0;
+ }
+
+ return 1;
+}
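+
+// Illustrative sketch (added for clarity, not part of upstream libaom): a
+// minimal IVF read loop built on file_is_ivf()/ivf_read_frame(). Decoder
+// hookup and detailed error handling are omitted; 'input' is assumed to wrap
+// an already-opened file.
+#if 0
+static void ivf_dump_frame_sizes(struct AvxInputContext *input) {
+  if (!file_is_ivf(input)) return;
+  uint8_t *frame = NULL;
+  size_t frame_size = 0;
+  size_t buffer_size = 0;
+  aom_codec_pts_t pts = 0;
+  while (ivf_read_frame(input->file, &frame, &frame_size, &buffer_size,
+                        &pts) == 0) {
+    printf("pts=%lld size=%u\n", (long long)pts, (unsigned int)frame_size);
+  }
+  free(frame);
+}
+#endif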
diff --git a/third_party/aom/common/ivfdec.h b/third_party/aom/common/ivfdec.h
new file mode 100644
index 000000000..ea294faa1
--- /dev/null
+++ b/third_party/aom/common/ivfdec.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_IVFDEC_H_
+#define AOM_COMMON_IVFDEC_H_
+
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int file_is_ivf(struct AvxInputContext *input);
+
+typedef int64_t aom_codec_pts_t;
+int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
+ size_t *buffer_size, aom_codec_pts_t *pts);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_IVFDEC_H_
diff --git a/third_party/aom/common/ivfenc.c b/third_party/aom/common/ivfenc.c
new file mode 100644
index 000000000..64715f4d7
--- /dev/null
+++ b/third_party/aom/common/ivfenc.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/ivfenc.h"
+
+#include "aom/aom_encoder.h"
+#include "aom_ports/mem_ops.h"
+
+void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg,
+ unsigned int fourcc, int frame_cnt) {
+ char header[32];
+
+ header[0] = 'D';
+ header[1] = 'K';
+ header[2] = 'I';
+ header[3] = 'F';
+ mem_put_le16(header + 4, 0); // version
+ mem_put_le16(header + 6, 32); // header size
+ mem_put_le32(header + 8, fourcc); // fourcc
+ mem_put_le16(header + 12, cfg->g_w); // width
+ mem_put_le16(header + 14, cfg->g_h); // height
+ mem_put_le32(header + 16, cfg->g_timebase.den); // rate
+ mem_put_le32(header + 20, cfg->g_timebase.num); // scale
+ mem_put_le32(header + 24, frame_cnt); // length
+ mem_put_le32(header + 28, 0); // unused
+
+ fwrite(header, 1, 32, outfile);
+}
+
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) {
+ char header[12];
+
+ mem_put_le32(header, (int)frame_size);
+ mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF));
+ mem_put_le32(header + 8, (int)(pts >> 32));
+ fwrite(header, 1, 12, outfile);
+}
+
+void ivf_write_frame_size(FILE *outfile, size_t frame_size) {
+ char header[4];
+
+ mem_put_le32(header, (int)frame_size);
+ fwrite(header, 1, 4, outfile);
+}
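+
+// Illustrative sketch (added for clarity, not part of upstream libaom):
+// writing one compressed frame to an IVF file with the helpers above. 'cfg',
+// 'fourcc', 'pts', 'data' and 'data_size' are placeholders for values that
+// would normally come from an encoder loop.
+#if 0
+static void ivf_write_one_frame_example(FILE *outfile,
+                                        const struct aom_codec_enc_cfg *cfg,
+                                        unsigned int fourcc, int64_t pts,
+                                        const uint8_t *data,
+                                        size_t data_size) {
+  ivf_write_file_header(outfile, cfg, fourcc, 1 /* frame count */);
+  ivf_write_frame_header(outfile, pts, data_size);
+  fwrite(data, 1, data_size, outfile);
+}
+#endif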
diff --git a/third_party/aom/common/ivfenc.h b/third_party/aom/common/ivfenc.h
new file mode 100644
index 000000000..8f6d947d4
--- /dev/null
+++ b/third_party/aom/common/ivfenc.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_IVFENC_H_
+#define AOM_COMMON_IVFENC_H_
+
+#include "common/tools_common.h"
+
+struct aom_codec_enc_cfg;
+struct aom_codec_cx_pkt;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg,
+ uint32_t fourcc, int frame_cnt);
+
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size);
+
+void ivf_write_frame_size(FILE *outfile, size_t frame_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_IVFENC_H_
diff --git a/third_party/aom/common/md5_utils.c b/third_party/aom/common/md5_utils.c
new file mode 100644
index 000000000..b69e1cc72
--- /dev/null
+++ b/third_party/aom/common/md5_utils.c
@@ -0,0 +1,249 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h' header
+ * definitions
+ * - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#include <string.h> /* for memcpy() */
+
+#include "common/md5_utils.h"
+
+static void byteSwap(UWORD32 *buf, unsigned words) {
+ md5byte *p;
+
+ /* Only swap bytes for big endian machines */
+ int i = 1;
+
+ if (*(char *)&i == 1) return;
+
+ p = (md5byte *)buf;
+
+ do {
+ *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 |
+ ((unsigned)p[1] << 8 | p[0]);
+ p += 4;
+ } while (--words);
+}
+
+/*
+ * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void MD5Init(struct MD5Context *ctx) {
+ ctx->buf[0] = 0x67452301;
+ ctx->buf[1] = 0xefcdab89;
+ ctx->buf[2] = 0x98badcfe;
+ ctx->buf[3] = 0x10325476;
+
+ ctx->bytes[0] = 0;
+ ctx->bytes[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) {
+ UWORD32 t;
+
+ /* Update byte count */
+
+ t = ctx->bytes[0];
+
+ if ((ctx->bytes[0] = t + len) < t)
+ ctx->bytes[1]++; /* Carry from low to high */
+
+ t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */
+
+ if (t > len) {
+ memcpy((md5byte *)ctx->in + 64 - t, buf, len);
+ return;
+ }
+
+ /* First chunk is an odd size */
+ memcpy((md5byte *)ctx->in + 64 - t, buf, t);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ buf += t;
+ len -= t;
+
+ /* Process data in 64-byte chunks */
+ while (len >= 64) {
+ memcpy(ctx->in, buf, 64);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ buf += 64;
+ len -= 64;
+ }
+
+ /* Handle any remaining bytes of data. */
+ memcpy(ctx->in, buf, len);
+}
+
+/*
+ * Final wrapup - pad to 64-byte boundary with the bit pattern
+ * 1 0* (64-bit count of bits processed, MSB-first)
+ */
+void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
+ int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */
+ md5byte *p = (md5byte *)ctx->in + count;
+
+ /* Set the first char of padding to 0x80. There is always room. */
+ *p++ = 0x80;
+
+ /* Bytes of padding needed to make 56 bytes (-8..55) */
+ count = 56 - 1 - count;
+
+ if (count < 0) { /* Padding forces an extra block */
+ memset(p, 0, count + 8);
+ byteSwap(ctx->in, 16);
+ MD5Transform(ctx->buf, ctx->in);
+ p = (md5byte *)ctx->in;
+ count = 56;
+ }
+
+ memset(p, 0, count);
+ byteSwap(ctx->in, 14);
+
+ /* Append length in bits and transform */
+ ctx->in[14] = ctx->bytes[0] << 3;
+ ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29;
+ MD5Transform(ctx->buf, ctx->in);
+
+ byteSwap(ctx->buf, 4);
+ memcpy(digest, ctx->buf, 16);
+ memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */
+}
+
+#ifndef ASM_MD5
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, in, s) \
+ (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x)
+
+#if defined(__clang__) && defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define AOM_NO_UNSIGNED_OVERFLOW_CHECK \
+ __attribute__((no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
+
+#ifndef AOM_NO_UNSIGNED_OVERFLOW_CHECK
+#define AOM_NO_UNSIGNED_OVERFLOW_CHECK
+#endif
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+AOM_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
+ UWORD32 const in[16]) {
+ register UWORD32 a, b, c, d;
+
+ a = buf[0];
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+
+ buf[0] += a;
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
+
+#undef AOM_NO_UNSIGNED_OVERFLOW_CHECK
+
+#endif
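+
+// Illustrative sketch (added for clarity, not part of the original sources):
+// the Init/Update/Final sequence described in the header comment, hashing a
+// single in-memory buffer.
+#if 0
+static void md5_of_buffer_example(const md5byte *buf, unsigned len,
+                                  md5byte digest[16]) {
+  struct MD5Context ctx;
+  MD5Init(&ctx);
+  MD5Update(&ctx, buf, len);  // may be called repeatedly for streamed input
+  MD5Final(digest, &ctx);
+}
+#endif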
diff --git a/third_party/aom/common/md5_utils.h b/third_party/aom/common/md5_utils.h
new file mode 100644
index 000000000..144fa3ad2
--- /dev/null
+++ b/third_party/aom/common/md5_utils.h
@@ -0,0 +1,49 @@
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ * - Ian Jackson <ian@chiark.greenend.org.uk>.
+ * Still in the public domain.
+ */
+
+#ifndef AOM_COMMON_MD5_UTILS_H_
+#define AOM_COMMON_MD5_UTILS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+ UWORD32 buf[4];
+ UWORD32 bytes[2];
+ UWORD32 in[16];
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_MD5_UTILS_H_
diff --git a/third_party/aom/common/obudec.c b/third_party/aom/common/obudec.c
new file mode 100644
index 000000000..acbd12e0c
--- /dev/null
+++ b/third_party/aom/common/obudec.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/obudec.h"
+
+#include "aom_ports/mem_ops.h"
+#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
+
+#define OBU_BUFFER_SIZE (500 * 1024)
+
+#define OBU_HEADER_SIZE 1
+#define OBU_EXTENSION_SIZE 1
+#define OBU_MAX_LENGTH_FIELD_SIZE 8
+#define OBU_DETECTION_SIZE \
+ (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 3 * OBU_MAX_LENGTH_FIELD_SIZE)
+
+// Reads unsigned LEB128 integer and returns 0 upon successful read and decode.
+// Stores raw bytes in 'value_buffer', length of the number in 'value_length',
+// and decoded value in 'value'.
+static int obudec_read_leb128(FILE *f, uint8_t *value_buffer,
+ size_t *value_length, uint64_t *value) {
+ if (!f || !value_buffer || !value_length || !value) return -1;
+ size_t len;
+ for (len = 0; len < OBU_MAX_LENGTH_FIELD_SIZE; ++len) {
+ const size_t num_read = fread(&value_buffer[len], 1, 1, f);
+ if (num_read == 0) {
+ if (len == 0 && feof(f)) {
+ *value_length = 0;
+ return 0;
+ }
+ // Ran out of data before completing read of value.
+ return -1;
+ }
+ if ((value_buffer[len] >> 7) == 0) {
+ ++len;
+ *value_length = len;
+ break;
+ }
+ }
+
+ return aom_uleb_decode(value_buffer, len, value, NULL);
+}
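+
+// Worked example (added for clarity): the LEB128 bytes 0xe5 0x8e 0x26 decode
+// to 0x65 + (0x0e << 7) + (0x26 << 14) = 624485; the low 7 bits of each byte
+// carry the value, and reading stops at the first byte whose top bit is clear.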
+
+// Reads OBU header from 'f'. The 'buffer_capacity' passed in must be large
+// enough to store an OBU header with extension (2 bytes). Raw OBU data is
+// written to 'obu_data', parsed OBU header values are written to 'obu_header',
+// and total bytes read from file are written to 'bytes_read'. Returns 0 for
+// success, and non-zero on failure. When end of file is reached, the return
+// value is 0 and the 'bytes_read' value is set to 0.
+static int obudec_read_obu_header(FILE *f, size_t buffer_capacity,
+ int is_annexb, uint8_t *obu_data,
+ ObuHeader *obu_header, size_t *bytes_read) {
+ if (!f || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) ||
+ !obu_data || !obu_header || !bytes_read) {
+ return -1;
+ }
+ *bytes_read = fread(obu_data, 1, 1, f);
+
+ if (feof(f) && *bytes_read == 0) {
+ return 0;
+ } else if (*bytes_read != 1) {
+ fprintf(stderr, "obudec: Failure reading OBU header.\n");
+ return -1;
+ }
+
+ const int has_extension = (obu_data[0] >> 2) & 0x1;
+ if (has_extension) {
+ if (fread(&obu_data[1], 1, 1, f) != 1) {
+ fprintf(stderr, "obudec: Failure reading OBU extension.");
+ return -1;
+ }
+ ++*bytes_read;
+ }
+
+ size_t obu_bytes_parsed = 0;
+ const aom_codec_err_t parse_result = aom_read_obu_header(
+ obu_data, *bytes_read, &obu_bytes_parsed, obu_header, is_annexb);
+ if (parse_result != AOM_CODEC_OK || *bytes_read != obu_bytes_parsed) {
+ fprintf(stderr, "obudec: Error parsing OBU header.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+// Reads OBU payload from 'f' and returns 0 for success when all payload bytes
+// are read from the file. Payload data is written to 'obu_data', and actual
+// bytes read added to 'bytes_read'.
+static int obudec_read_obu_payload(FILE *f, size_t payload_length,
+ uint8_t *obu_data, size_t *bytes_read) {
+ if (!f || payload_length == 0 || !obu_data || !bytes_read) return -1;
+
+ if (fread(obu_data, 1, payload_length, f) != payload_length) {
+ fprintf(stderr, "obudec: Failure reading OBU payload.\n");
+ return -1;
+ }
+
+ *bytes_read += payload_length;
+ return 0;
+}
+
+static int obudec_read_obu_header_and_size(FILE *f, size_t buffer_capacity,
+ int is_annexb, uint8_t *buffer,
+ size_t *bytes_read,
+ size_t *payload_length,
+ ObuHeader *obu_header) {
+ const size_t kMinimumBufferSize =
+ (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + OBU_MAX_LENGTH_FIELD_SIZE);
+ if (!f || !buffer || !bytes_read || !payload_length || !obu_header ||
+ buffer_capacity < kMinimumBufferSize) {
+ return -1;
+ }
+
+ size_t leb128_length = 0;
+ uint64_t obu_size = 0;
+ if (is_annexb) {
+ if (obudec_read_leb128(f, &buffer[0], &leb128_length, &obu_size) != 0) {
+ fprintf(stderr, "obudec: Failure reading OBU size length.\n");
+ return -1;
+ } else if (leb128_length == 0) {
+ *payload_length = 0;
+ return 0;
+ }
+ if (obu_size > UINT32_MAX) {
+ fprintf(stderr, "obudec: OBU payload length too large.\n");
+ return -1;
+ }
+ }
+
+ size_t header_size = 0;
+ if (obudec_read_obu_header(f, buffer_capacity - leb128_length, is_annexb,
+ buffer + leb128_length, obu_header,
+ &header_size) != 0) {
+ return -1;
+ } else if (header_size == 0) {
+ *payload_length = 0;
+ return 0;
+ }
+
+ if (is_annexb) {
+ if (obu_size < header_size) {
+ fprintf(stderr, "obudec: OBU size is too small.\n");
+ return -1;
+ }
+ *payload_length = (size_t)obu_size - header_size;
+ } else {
+ uint64_t u64_payload_length = 0;
+ if (obudec_read_leb128(f, &buffer[header_size], &leb128_length,
+ &u64_payload_length) != 0) {
+ fprintf(stderr, "obudec: Failure reading OBU payload length.\n");
+ return -1;
+ }
+ if (u64_payload_length > UINT32_MAX) {
+ fprintf(stderr, "obudec: OBU payload length too large.\n");
+ return -1;
+ }
+
+ *payload_length = (size_t)u64_payload_length;
+ }
+
+ *bytes_read = leb128_length + header_size;
+ return 0;
+}
+
+static int obudec_read_one_obu(FILE *f, uint8_t **obu_buffer,
+ size_t obu_bytes_buffered,
+ size_t *obu_buffer_capacity, size_t *obu_length,
+ ObuHeader *obu_header, int is_annexb) {
+ size_t available_buffer_capacity = *obu_buffer_capacity - obu_bytes_buffered;
+
+ if (!(*obu_buffer)) return -1;
+
+ size_t bytes_read = 0;
+ size_t obu_payload_length = 0;
+ const int status = obudec_read_obu_header_and_size(
+ f, available_buffer_capacity, is_annexb, *obu_buffer + obu_bytes_buffered,
+ &bytes_read, &obu_payload_length, obu_header);
+ if (status < 0) return status;
+
+ if (obu_payload_length > SIZE_MAX - bytes_read) return -1;
+
+ if (obu_payload_length > 256 * 1024 * 1024) {
+ fprintf(stderr, "obudec: Read invalid OBU size (%u)\n",
+ (unsigned int)obu_payload_length);
+ *obu_length = bytes_read + obu_payload_length;
+ return -1;
+ }
+
+ if (bytes_read + obu_payload_length > available_buffer_capacity) {
+ // TODO(tomfinegan): Add overflow check.
+ const size_t new_capacity =
+ obu_bytes_buffered + bytes_read + 2 * obu_payload_length;
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+ if (new_capacity > AOM_MAX_ALLOCABLE_MEMORY) {
+ fprintf(stderr, "obudec: OBU size exceeds max alloc size.\n");
+ return -1;
+ }
+#endif
+
+ uint8_t *new_buffer = (uint8_t *)realloc(*obu_buffer, new_capacity);
+
+ if (new_buffer) {
+ *obu_buffer = new_buffer;
+ *obu_buffer_capacity = new_capacity;
+ } else {
+ fprintf(stderr, "obudec: Failed to allocate compressed data buffer\n");
+ *obu_length = bytes_read + obu_payload_length;
+ return -1;
+ }
+ }
+
+ if (obu_payload_length > 0 &&
+ obudec_read_obu_payload(f, obu_payload_length,
+ *obu_buffer + obu_bytes_buffered + bytes_read,
+ &bytes_read) != 0) {
+ return -1;
+ }
+
+ *obu_length = bytes_read;
+ return 0;
+}
+
+int file_is_obu(struct ObuDecInputContext *obu_ctx) {
+ if (!obu_ctx || !obu_ctx->avx_ctx) return 0;
+
+ struct AvxInputContext *avx_ctx = obu_ctx->avx_ctx;
+ uint8_t detect_buf[OBU_DETECTION_SIZE] = { 0 };
+ const int is_annexb = obu_ctx->is_annexb;
+ FILE *f = avx_ctx->file;
+ size_t payload_length = 0;
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+ size_t length_of_unit_size = 0;
+ size_t annexb_header_length = 0;
+ uint64_t unit_size = 0;
+
+ if (is_annexb) {
+ // read the size of first temporal unit
+ if (obudec_read_leb128(f, &detect_buf[0], &length_of_unit_size,
+ &unit_size) != 0) {
+ fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+ return 0;
+ }
+
+ // read the size of first frame unit
+ if (obudec_read_leb128(f, &detect_buf[length_of_unit_size],
+ &annexb_header_length, &unit_size) != 0) {
+ fprintf(stderr, "obudec: Failure reading frame unit header\n");
+ return 0;
+ }
+ annexb_header_length += length_of_unit_size;
+ }
+
+ size_t bytes_read = 0;
+ if (obudec_read_obu_header_and_size(
+ f, OBU_DETECTION_SIZE - annexb_header_length, is_annexb,
+ &detect_buf[annexb_header_length], &bytes_read, &payload_length,
+ &obu_header) != 0) {
+ fprintf(stderr, "obudec: Failure reading first OBU.\n");
+ rewind(f);
+ return 0;
+ }
+
+ if (is_annexb) {
+ bytes_read += annexb_header_length;
+ }
+
+ if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
+ obu_header.type != OBU_SEQUENCE_HEADER) {
+ return 0;
+ }
+
+ if (obu_header.has_size_field) {
+ if (obu_header.type == OBU_TEMPORAL_DELIMITER && payload_length != 0) {
+ fprintf(
+ stderr,
+ "obudec: Invalid OBU_TEMPORAL_DELIMITER payload length (non-zero).");
+ rewind(f);
+ return 0;
+ }
+ } else if (!is_annexb) {
+ fprintf(stderr, "obudec: OBU size fields required, cannot decode input.\n");
+ rewind(f);
+ return 0;
+ }
+
+  // The input appears to be a valid Section 5 AV1 stream.
+ obu_ctx->buffer = (uint8_t *)malloc(OBU_BUFFER_SIZE);
+ if (!obu_ctx->buffer) {
+ fprintf(stderr, "Out of memory.\n");
+ rewind(f);
+ return 0;
+ }
+ obu_ctx->buffer_capacity = OBU_BUFFER_SIZE;
+
+ memcpy(obu_ctx->buffer, &detect_buf[0], bytes_read);
+ obu_ctx->bytes_buffered = bytes_read;
+ // If the first OBU is a SEQUENCE_HEADER, then it will have a payload.
+ // We need to read this in so that our buffer only contains complete OBUs.
+ if (payload_length > 0) {
+ if (payload_length > (obu_ctx->buffer_capacity - bytes_read)) {
+ fprintf(stderr, "obudec: First OBU's payload is too large\n");
+ rewind(f);
+ return 0;
+ }
+
+ size_t payload_bytes = 0;
+ const int status = obudec_read_obu_payload(
+ f, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes);
+ if (status < 0) {
+ rewind(f);
+ return 0;
+ }
+ obu_ctx->bytes_buffered += payload_bytes;
+ }
+ return 1;
+}
+
+int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
+ uint8_t **buffer, size_t *bytes_read,
+ size_t *buffer_size) {
+ FILE *f = obu_ctx->avx_ctx->file;
+ if (!f) return -1;
+
+ *buffer_size = 0;
+ *bytes_read = 0;
+
+ if (feof(f)) {
+ return 1;
+ }
+
+ size_t tu_size;
+ size_t obu_size = 0;
+ size_t length_of_temporal_unit_size = 0;
+ uint8_t tuheader[OBU_MAX_LENGTH_FIELD_SIZE] = { 0 };
+
+ if (obu_ctx->is_annexb) {
+ uint64_t size = 0;
+
+ if (obu_ctx->bytes_buffered == 0) {
+ if (obudec_read_leb128(f, &tuheader[0], &length_of_temporal_unit_size,
+ &size) != 0) {
+ fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+ return -1;
+ }
+ if (size == 0 && feof(f)) {
+ return 1;
+ }
+ } else {
+ // temporal unit size was already stored in buffer
+ if (aom_uleb_decode(obu_ctx->buffer, obu_ctx->bytes_buffered, &size,
+ &length_of_temporal_unit_size) != 0) {
+ fprintf(stderr, "obudec: Failure reading temporal unit header\n");
+ return -1;
+ }
+ }
+
+ if (size > UINT32_MAX || size + length_of_temporal_unit_size > UINT32_MAX) {
+ fprintf(stderr, "obudec: TU too large.\n");
+ return -1;
+ }
+
+ size += length_of_temporal_unit_size;
+ tu_size = (size_t)size;
+ } else {
+ while (1) {
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+
+ if (obudec_read_one_obu(f, &obu_ctx->buffer, obu_ctx->bytes_buffered,
+ &obu_ctx->buffer_capacity, &obu_size, &obu_header,
+ 0) != 0) {
+ fprintf(stderr, "obudec: read_one_obu failed in TU loop\n");
+ return -1;
+ }
+
+ if (obu_header.type == OBU_TEMPORAL_DELIMITER || obu_size == 0) {
+ tu_size = obu_ctx->bytes_buffered;
+ break;
+ } else {
+ obu_ctx->bytes_buffered += obu_size;
+ }
+ }
+ }
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+ if (tu_size > AOM_MAX_ALLOCABLE_MEMORY) {
+ fprintf(stderr, "obudec: Temporal Unit size exceeds max alloc size.\n");
+ return -1;
+ }
+#endif
+ uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size);
+ if (!new_buffer) {
+ free(*buffer);
+ fprintf(stderr, "obudec: Out of memory.\n");
+ return -1;
+ }
+ *buffer = new_buffer;
+ *bytes_read = tu_size;
+ *buffer_size = tu_size;
+
+ if (!obu_ctx->is_annexb) {
+ memcpy(*buffer, obu_ctx->buffer, tu_size);
+
+ // At this point, (obu_ctx->buffer + obu_ctx->bytes_buffered + obu_size)
+ // points to the end of the buffer.
+ memmove(obu_ctx->buffer, obu_ctx->buffer + obu_ctx->bytes_buffered,
+ obu_size);
+ obu_ctx->bytes_buffered = obu_size;
+ } else {
+ if (!feof(f)) {
+ size_t data_size;
+ size_t offset;
+ if (!obu_ctx->bytes_buffered) {
+ data_size = tu_size - length_of_temporal_unit_size;
+ memcpy(*buffer, &tuheader[0], length_of_temporal_unit_size);
+ offset = length_of_temporal_unit_size;
+ } else {
+ memcpy(*buffer, obu_ctx->buffer, obu_ctx->bytes_buffered);
+ offset = obu_ctx->bytes_buffered;
+ data_size = tu_size - obu_ctx->bytes_buffered;
+ obu_ctx->bytes_buffered = 0;
+ }
+
+ if (fread(*buffer + offset, 1, data_size, f) != data_size) {
+ fprintf(stderr, "obudec: Failed to read full temporal unit\n");
+ return -1;
+ }
+ }
+ }
+ return 0;
+}
+
+void obudec_free(struct ObuDecInputContext *obu_ctx) { free(obu_ctx->buffer); }
diff --git a/third_party/aom/common/obudec.h b/third_party/aom/common/obudec.h
new file mode 100644
index 000000000..b2adb1e3d
--- /dev/null
+++ b/third_party/aom/common/obudec.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_OBUDEC_H_
+#define AOM_COMMON_OBUDEC_H_
+
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ObuDecInputContext {
+ struct AvxInputContext *avx_ctx;
+ uint8_t *buffer;
+ size_t buffer_capacity;
+ size_t bytes_buffered;
+ int is_annexb;
+};
+
+// Returns 1 when the file data starts (for Annex B streams, after reading the
+// size of the OBU) with what appears to be a Temporal Delimiter OBU, as
+// defined by Section 5 of the AV1 bitstream specification.
+int file_is_obu(struct ObuDecInputContext *obu_ctx);
+
+// Reads one Temporal Unit from the input file. Returns 0 when a TU is
+// successfully read, 1 when end of file is reached, and less than 0 when an
+// error occurs. Stores TU data in 'buffer'. Reallocs buffer to match TU size,
+// returns buffer capacity via 'buffer_size', and returns size of buffered data
+// via 'bytes_read'.
+int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx,
+ uint8_t **buffer, size_t *bytes_read,
+ size_t *buffer_size);
+
+void obudec_free(struct ObuDecInputContext *obu_ctx);
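+
+// Illustrative sketch (added for clarity, not part of upstream libaom): a
+// read loop following the contract documented above, with decoder calls and
+// detailed error handling omitted.
+//
+//   uint8_t *buf = NULL;
+//   size_t bytes_read = 0, buf_size = 0;
+//   int status;
+//   while ((status = obudec_read_temporal_unit(obu_ctx, &buf, &bytes_read,
+//                                              &buf_size)) == 0) {
+//     // feed buf[0..bytes_read) to the decoder
+//   }
+//   if (status == 1) free(buf);  // clean end of file
+//   obudec_free(obu_ctx);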
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_OBUDEC_H_
diff --git a/third_party/aom/common/rawenc.c b/third_party/aom/common/rawenc.c
new file mode 100644
index 000000000..5a2731d3a
--- /dev/null
+++ b/third_party/aom/common/rawenc.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/rawenc.h"
+
+void raw_write_image_file(const aom_image_t *img, const int *planes,
+ const int num_planes, FILE *file) {
+ const int bytes_per_sample = ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ for (int i = 0; i < num_planes; ++i) {
+ const int plane = planes[i];
+ const unsigned char *buf = img->planes[plane];
+ const int stride = img->stride[plane];
+ const int w = aom_img_plane_width(img, plane);
+ const int h = aom_img_plane_height(img, plane);
+ for (int y = 0; y < h; ++y) {
+ fwrite(buf, bytes_per_sample, w, file);
+ buf += stride;
+ }
+ }
+}
+
+void raw_update_image_md5(const aom_image_t *img, const int *planes,
+ const int num_planes, MD5Context *md5) {
+ for (int i = 0; i < num_planes; ++i) {
+ const int plane = planes[i];
+ const unsigned char *buf = img->planes[plane];
+ const int stride = img->stride[plane];
+ const int w = aom_img_plane_width(img, plane) *
+ ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ const int h = aom_img_plane_height(img, plane);
+ for (int y = 0; y < h; ++y) {
+ MD5Update(md5, buf, w);
+ buf += stride;
+ }
+ }
+}
diff --git a/third_party/aom/common/rawenc.h b/third_party/aom/common/rawenc.h
new file mode 100644
index 000000000..cf5e00e6f
--- /dev/null
+++ b/third_party/aom/common/rawenc.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_RAWENC_H_
+#define AOM_COMMON_RAWENC_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void raw_write_image_file(const aom_image_t *img, const int *planes,
+ const int num_planes, FILE *file);
+void raw_update_image_md5(const aom_image_t *img, const int *planes,
+ const int num_planes, MD5Context *md5);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_RAWENC_H_
diff --git a/third_party/aom/common/tools_common.c b/third_party/aom/common/tools_common.c
new file mode 100644
index 000000000..21cd80026
--- /dev/null
+++ b/third_party/aom/common/tools_common.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/tools_common.h"
+
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if CONFIG_AV1_ENCODER
+#include "aom/aomcx.h"
+#endif
+
+#if CONFIG_AV1_DECODER
+#include "aom/aomdx.h"
+#endif
+
+#if defined(_WIN32) || defined(__OS2__)
+#include <io.h>
+#include <fcntl.h>
+
+#ifdef __OS2__
+#define _setmode setmode
+#define _fileno fileno
+#define _O_BINARY O_BINARY
+#endif
+#endif
+
+#define LOG_ERROR(label) \
+ do { \
+ const char *l = label; \
+ va_list ap; \
+ va_start(ap, fmt); \
+ if (l) fprintf(stderr, "%s: ", l); \
+ vfprintf(stderr, fmt, ap); \
+ fprintf(stderr, "\n"); \
+ va_end(ap); \
+ } while (0)
+
+FILE *set_binary_mode(FILE *stream) {
+ (void)stream;
+#if defined(_WIN32) || defined(__OS2__)
+ _setmode(_fileno(stream), _O_BINARY);
+#endif
+ return stream;
+}
+
+void die(const char *fmt, ...) {
+ LOG_ERROR(NULL);
+ usage_exit();
+}
+
+void fatal(const char *fmt, ...) {
+ LOG_ERROR("Fatal");
+ exit(EXIT_FAILURE);
+}
+
+void warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
+
+void die_codec(aom_codec_ctx_t *ctx, const char *s) {
+ const char *detail = aom_codec_error_detail(ctx);
+
+ printf("%s: %s\n", s, aom_codec_error(ctx));
+ if (detail) printf(" %s\n", detail);
+ exit(EXIT_FAILURE);
+}
+
+int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) {
+ FILE *f = input_ctx->file;
+ struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
+ int plane = 0;
+ int shortread = 0;
+ const int bytespp = (yuv_frame->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+
+ for (plane = 0; plane < 3; ++plane) {
+ uint8_t *ptr;
+ const int w = aom_img_plane_width(yuv_frame, plane);
+ const int h = aom_img_plane_height(yuv_frame, plane);
+ int r;
+
+ /* Determine the correct plane based on the image format. The for-loop
+ * always counts in Y,U,V order, but this may not match the order of
+ * the data on disk.
+ */
+ switch (plane) {
+ case 1:
+ ptr =
+ yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_V
+ : AOM_PLANE_U];
+ break;
+ case 2:
+ ptr =
+ yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_U
+ : AOM_PLANE_V];
+ break;
+ default: ptr = yuv_frame->planes[plane];
+ }
+
+ for (r = 0; r < h; ++r) {
+ size_t needed = w * bytespp;
+ size_t buf_position = 0;
+ const size_t left = detect->buf_read - detect->position;
+ if (left > 0) {
+ const size_t more = (left < needed) ? left : needed;
+ memcpy(ptr, detect->buf + detect->position, more);
+ buf_position = more;
+ needed -= more;
+ detect->position += more;
+ }
+ if (needed > 0) {
+ shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
+ }
+
+ ptr += yuv_frame->stride[plane];
+ }
+ }
+
+ return shortread;
+}
+
+#if CONFIG_AV1_ENCODER
+static const AvxInterface aom_encoders[] = {
+ { "av1", AV1_FOURCC, &aom_codec_av1_cx },
+};
+
+int get_aom_encoder_count(void) {
+ return sizeof(aom_encoders) / sizeof(aom_encoders[0]);
+}
+
+const AvxInterface *get_aom_encoder_by_index(int i) { return &aom_encoders[i]; }
+
+const AvxInterface *get_aom_encoder_by_name(const char *name) {
+ int i;
+
+ for (i = 0; i < get_aom_encoder_count(); ++i) {
+ const AvxInterface *encoder = get_aom_encoder_by_index(i);
+ if (strcmp(encoder->name, name) == 0) return encoder;
+ }
+
+ return NULL;
+}
+#endif // CONFIG_AV1_ENCODER
+
+#if CONFIG_AV1_DECODER
+static const AvxInterface aom_decoders[] = {
+ { "av1", AV1_FOURCC, &aom_codec_av1_dx },
+};
+
+int get_aom_decoder_count(void) {
+ return sizeof(aom_decoders) / sizeof(aom_decoders[0]);
+}
+
+const AvxInterface *get_aom_decoder_by_index(int i) { return &aom_decoders[i]; }
+
+const AvxInterface *get_aom_decoder_by_name(const char *name) {
+ int i;
+
+ for (i = 0; i < get_aom_decoder_count(); ++i) {
+ const AvxInterface *const decoder = get_aom_decoder_by_index(i);
+ if (strcmp(decoder->name, name) == 0) return decoder;
+ }
+
+ return NULL;
+}
+
+const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc) {
+ int i;
+
+ for (i = 0; i < get_aom_decoder_count(); ++i) {
+ const AvxInterface *const decoder = get_aom_decoder_by_index(i);
+ if (decoder->fourcc == fourcc) return decoder;
+ }
+
+ return NULL;
+}
+#endif // CONFIG_AV1_DECODER
+
+void aom_img_write(const aom_image_t *img, FILE *file) {
+ int plane;
+
+ for (plane = 0; plane < 3; ++plane) {
+ const unsigned char *buf = img->planes[plane];
+ const int stride = img->stride[plane];
+ const int w = aom_img_plane_width(img, plane) *
+ ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ const int h = aom_img_plane_height(img, plane);
+ int y;
+
+ for (y = 0; y < h; ++y) {
+ fwrite(buf, 1, w, file);
+ buf += stride;
+ }
+ }
+}
+
+int aom_img_read(aom_image_t *img, FILE *file) {
+ int plane;
+
+ for (plane = 0; plane < 3; ++plane) {
+ unsigned char *buf = img->planes[plane];
+ const int stride = img->stride[plane];
+ const int w = aom_img_plane_width(img, plane) *
+ ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ const int h = aom_img_plane_height(img, plane);
+ int y;
+
+ for (y = 0; y < h; ++y) {
+ if (fread(buf, 1, w, file) != (size_t)w) return 0;
+ buf += stride;
+ }
+ }
+
+ return 1;
+}
+
+// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t
+double sse_to_psnr(double samples, double peak, double sse) {
+ static const double kMaxPSNR = 100.0;
+
+ if (sse > 0.0) {
+ const double psnr = 10.0 * log10(samples * peak * peak / sse);
+ return psnr > kMaxPSNR ? kMaxPSNR : psnr;
+ } else {
+ return kMaxPSNR;
+ }
+}
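+
+// A quick sanity check of the formula above: for an 8-bit 640x480 frame
+// (samples = 307200, peak = 255), an SSE of 1997568 gives
+// 10 * log10(307200 * 255 * 255 / 1997568) = 10 * log10(10000) = 40.0 dB.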
+
+// TODO(debargha): Consolidate the functions below into a separate file.
+static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
+ int input_shift) {
+ // Note the offset is 1 less than half.
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt ||
+ input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case AOM_IMG_FMT_I42016:
+ case AOM_IMG_FMT_I42216:
+ case AOM_IMG_FMT_I44416: break;
+ default: fatal("Unsupported image conversion"); break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ const uint16_t *p_src =
+ (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+ uint16_t *p_dst =
+ (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+ for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+}
+
+static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src,
+ int input_shift) {
+ // Note the offset is 1 less than half.
+ const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0;
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ dst->fmt != src->fmt + AOM_IMG_FMT_HIGHBITDEPTH || input_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_I422:
+ case AOM_IMG_FMT_I444: break;
+ default: fatal("Unsupported image conversion"); break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ const uint8_t *p_src = src->planes[plane] + y * src->stride[plane];
+ uint16_t *p_dst =
+ (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+ for (x = 0; x < w; x++) {
+ *p_dst++ = (*p_src++ << input_shift) + offset;
+ }
+ }
+ }
+}
+
+void aom_img_upshift(aom_image_t *dst, const aom_image_t *src,
+ int input_shift) {
+ if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ highbd_img_upshift(dst, src, input_shift);
+ } else {
+ lowbd_img_upshift(dst, src, input_shift);
+ }
+}
+
+void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) {
+ int plane;
+ if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w ||
+ dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift) {
+ fatal("Unsupported image conversion");
+ }
+ switch (dst->fmt) {
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_I422:
+ case AOM_IMG_FMT_I444: break;
+ default: fatal("Unsupported image conversion"); break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ const uint16_t *p_src =
+ (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+ uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+ for (x = 0; x < w; x++) {
+ *p_dst++ = (uint8_t)(*p_src++);
+ }
+ }
+ }
+}
+
+static void highbd_img_downshift(aom_image_t *dst, const aom_image_t *src,
+ int down_shift) {
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt ||
+ down_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (src->fmt) {
+ case AOM_IMG_FMT_I42016:
+ case AOM_IMG_FMT_I42216:
+ case AOM_IMG_FMT_I44416: break;
+ default: fatal("Unsupported image conversion"); break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ const uint16_t *p_src =
+ (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+ uint16_t *p_dst =
+ (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]);
+ for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift;
+ }
+ }
+}
+
+static void lowbd_img_downshift(aom_image_t *dst, const aom_image_t *src,
+ int down_shift) {
+ int plane;
+ if (dst->d_w != src->d_w || dst->d_h != src->d_h ||
+ dst->x_chroma_shift != src->x_chroma_shift ||
+ dst->y_chroma_shift != src->y_chroma_shift ||
+ src->fmt != dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH || down_shift < 0) {
+ fatal("Unsupported image conversion");
+ }
+ switch (dst->fmt) {
+ case AOM_IMG_FMT_I420:
+ case AOM_IMG_FMT_I422:
+ case AOM_IMG_FMT_I444: break;
+ default: fatal("Unsupported image conversion"); break;
+ }
+ for (plane = 0; plane < 3; plane++) {
+ int w = src->d_w;
+ int h = src->d_h;
+ int x, y;
+ if (plane) {
+ w = (w + src->x_chroma_shift) >> src->x_chroma_shift;
+ h = (h + src->y_chroma_shift) >> src->y_chroma_shift;
+ }
+ for (y = 0; y < h; y++) {
+ const uint16_t *p_src =
+ (const uint16_t *)(src->planes[plane] + y * src->stride[plane]);
+ uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane];
+ for (x = 0; x < w; x++) {
+ *p_dst++ = *p_src++ >> down_shift;
+ }
+ }
+ }
+}
+
+void aom_img_downshift(aom_image_t *dst, const aom_image_t *src,
+ int down_shift) {
+ if (dst->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ highbd_img_downshift(dst, src, down_shift);
+ } else {
+ lowbd_img_downshift(dst, src, down_shift);
+ }
+}
diff --git a/third_party/aom/common/tools_common.h b/third_party/aom/common/tools_common.h
new file mode 100644
index 000000000..4e1d12f4a
--- /dev/null
+++ b/third_party/aom/common/tools_common.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_TOOLS_COMMON_H_
+#define AOM_COMMON_TOOLS_COMMON_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/msvc.h"
+
+#if CONFIG_AV1_ENCODER
+#include "common/y4minput.h"
+#endif
+
+#if defined(_MSC_VER)
+/* MSVS uses _f{seek,tell}i64. */
+#define fseeko _fseeki64
+#define ftello _ftelli64
+typedef int64_t FileOffset;
+#elif defined(_WIN32)
+#include <sys/types.h> /* NOLINT*/
+/* MinGW uses f{seek,tell}o64 for large files. */
+#define fseeko fseeko64
+#define ftello ftello64
+typedef off64_t FileOffset;
+#elif CONFIG_OS_SUPPORT
+#include <sys/types.h> /* NOLINT*/
+typedef off_t FileOffset;
+/* Use 32-bit file operations in WebM file format when building ARM
+ * executables (.axf) with RVCT. */
+#else
+#define fseeko fseek
+#define ftello ftell
+typedef long FileOffset; /* NOLINT */
+#endif /* CONFIG_OS_SUPPORT */
+
+#if CONFIG_OS_SUPPORT
+#if defined(_MSC_VER)
+#include <io.h> /* NOLINT */
+#define isatty _isatty
+#define fileno _fileno
+#else
+#include <unistd.h> /* NOLINT */
+#endif /* _MSC_VER */
+#endif /* CONFIG_OS_SUPPORT */
+
+#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo)
+
+#ifndef PATH_MAX
+#define PATH_MAX 512
+#endif
+
+#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+#define RAW_FRAME_HDR_SZ sizeof(uint32_t)
+
+#define AV1_FOURCC 0x31305641
+
+enum VideoFileType {
+ FILE_TYPE_OBU,
+ FILE_TYPE_RAW,
+ FILE_TYPE_IVF,
+ FILE_TYPE_Y4M,
+ FILE_TYPE_WEBM
+};
+
+struct FileTypeDetectionBuffer {
+ char buf[4];
+ size_t buf_read;
+ size_t position;
+};
+
+struct AvxRational {
+ int numerator;
+ int denominator;
+};
+
+struct AvxInputContext {
+ const char *filename;
+ FILE *file;
+ int64_t length;
+ struct FileTypeDetectionBuffer detect;
+ enum VideoFileType file_type;
+ uint32_t width;
+ uint32_t height;
+ struct AvxRational pixel_aspect_ratio;
+ aom_img_fmt_t fmt;
+ aom_bit_depth_t bit_depth;
+ int only_i420;
+ uint32_t fourcc;
+ struct AvxRational framerate;
+#if CONFIG_AV1_ENCODER
+ y4m_input y4m;
+#endif
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+#define AOM_NO_RETURN __attribute__((noreturn))
+#else
+#define AOM_NO_RETURN
+#endif
+
+/* Sets a stdio stream into binary mode */
+FILE *set_binary_mode(FILE *stream);
+
+void die(const char *fmt, ...) AOM_NO_RETURN;
+void fatal(const char *fmt, ...) AOM_NO_RETURN;
+void warn(const char *fmt, ...);
+
+void die_codec(aom_codec_ctx_t *ctx, const char *s) AOM_NO_RETURN;
+
+/* The tool including this file must define usage_exit() */
+void usage_exit(void) AOM_NO_RETURN;
+
+#undef AOM_NO_RETURN
+
+int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame);
+
+typedef struct AvxInterface {
+ const char *const name;
+ const uint32_t fourcc;
+ aom_codec_iface_t *(*const codec_interface)();
+} AvxInterface;
+
+int get_aom_encoder_count(void);
+const AvxInterface *get_aom_encoder_by_index(int i);
+const AvxInterface *get_aom_encoder_by_name(const char *name);
+
+int get_aom_decoder_count(void);
+const AvxInterface *get_aom_decoder_by_index(int i);
+const AvxInterface *get_aom_decoder_by_name(const char *name);
+const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc);
+
+void aom_img_write(const aom_image_t *img, FILE *file);
+int aom_img_read(aom_image_t *img, FILE *file);
+
+double sse_to_psnr(double samples, double peak, double sse);
+void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift);
+void aom_img_downshift(aom_image_t *dst, const aom_image_t *src,
+ int down_shift);
+void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // AOM_COMMON_TOOLS_COMMON_H_
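+
+/* A minimal, illustrative sketch (not taken from the aom tools themselves) of
+ * how the lookup helpers and error reporters above are typically combined by a
+ * decoder tool; the FOURCC value is assumed to come from the input file:
+ *
+ *   #include <stdlib.h>
+ *   #include "aom/aom_decoder.h"
+ *   #include "common/tools_common.h"
+ *
+ *   void usage_exit(void) { exit(EXIT_FAILURE); }  // required by this header
+ *
+ *   static aom_codec_ctx_t open_decoder(uint32_t fourcc) {
+ *     aom_codec_ctx_t ctx;
+ *     const AvxInterface *decoder = get_aom_decoder_by_fourcc(fourcc);
+ *     if (!decoder) die("Unknown input FOURCC 0x%08x", (unsigned int)fourcc);
+ *     if (aom_codec_dec_init(&ctx, decoder->codec_interface(), NULL, 0))
+ *       die_codec(&ctx, "Failed to initialize decoder.");
+ *     return ctx;
+ *   }
+ */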
diff --git a/third_party/aom/common/video_common.h b/third_party/aom/common/video_common.h
new file mode 100644
index 000000000..bf95031be
--- /dev/null
+++ b/third_party/aom/common/video_common.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_VIDEO_COMMON_H_
+#define AOM_COMMON_VIDEO_COMMON_H_
+
+#include "common/tools_common.h"
+
+typedef struct {
+ uint32_t codec_fourcc;
+ int frame_width;
+ int frame_height;
+ struct AvxRational time_base;
+ unsigned int is_annexb;
+} AvxVideoInfo;
+
+#endif // AOM_COMMON_VIDEO_COMMON_H_
diff --git a/third_party/aom/common/video_reader.c b/third_party/aom/common/video_reader.c
new file mode 100644
index 000000000..47ad6e189
--- /dev/null
+++ b/third_party/aom/common/video_reader.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "aom_ports/mem_ops.h"
+#include "common/ivfdec.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+#include "common/webmdec.h"
+
+struct AvxVideoReaderStruct {
+ AvxVideoInfo info;
+ struct AvxInputContext input_ctx;
+ struct ObuDecInputContext obu_ctx;
+ struct WebmInputContext webm_ctx;
+ uint8_t *buffer;
+ size_t buffer_size;
+ size_t frame_size;
+ aom_codec_pts_t pts;
+};
+
+AvxVideoReader *aom_video_reader_open(const char *filename) {
+ AvxVideoReader *reader = NULL;
+ FILE *const file = fopen(filename, "rb");
+ if (!file) return NULL; // Can't open file
+
+ reader = (AvxVideoReader *)calloc(1, sizeof(*reader));
+ if (!reader) {
+ fclose(file);
+ return NULL; // Can't allocate AvxVideoReader
+ }
+
+ reader->input_ctx.filename = filename;
+ reader->input_ctx.file = file;
+ reader->obu_ctx.avx_ctx = &reader->input_ctx;
+ reader->obu_ctx.is_annexb = 1;
+
+ if (file_is_ivf(&reader->input_ctx)) {
+ reader->input_ctx.file_type = FILE_TYPE_IVF;
+ reader->info.codec_fourcc = reader->input_ctx.fourcc;
+ reader->info.frame_width = reader->input_ctx.width;
+ reader->info.frame_height = reader->input_ctx.height;
+#if CONFIG_WEBM_IO
+ } else if (file_is_webm(&reader->webm_ctx, &reader->input_ctx)) {
+ reader->input_ctx.file_type = FILE_TYPE_WEBM;
+ reader->info.codec_fourcc = reader->input_ctx.fourcc;
+ reader->info.frame_width = reader->input_ctx.width;
+ reader->info.frame_height = reader->input_ctx.height;
+#endif
+ } else if (file_is_obu(&reader->obu_ctx)) {
+ reader->input_ctx.file_type = FILE_TYPE_OBU;
+ // assume AV1
+ reader->info.codec_fourcc = AV1_FOURCC;
+ reader->info.is_annexb = reader->obu_ctx.is_annexb;
+ } else {
+ fclose(file);
+ free(reader);
+ return NULL; // Unknown file type
+ }
+
+ return reader;
+}
+
+void aom_video_reader_close(AvxVideoReader *reader) {
+ if (reader) {
+ fclose(reader->input_ctx.file);
+ if (reader->input_ctx.file_type == FILE_TYPE_OBU) {
+ obudec_free(&reader->obu_ctx);
+ }
+ free(reader->buffer);
+ free(reader);
+ }
+}
+
+int aom_video_reader_read_frame(AvxVideoReader *reader) {
+ if (reader->input_ctx.file_type == FILE_TYPE_IVF) {
+ return !ivf_read_frame(reader->input_ctx.file, &reader->buffer,
+ &reader->frame_size, &reader->buffer_size,
+ &reader->pts);
+ } else if (reader->input_ctx.file_type == FILE_TYPE_OBU) {
+ return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer,
+ &reader->frame_size,
+ &reader->buffer_size);
+#if CONFIG_WEBM_IO
+ } else if (reader->input_ctx.file_type == FILE_TYPE_WEBM) {
+ return !webm_read_frame(&reader->webm_ctx, &reader->buffer,
+ &reader->frame_size, &reader->buffer_size);
+#endif
+ } else {
+ assert(0);
+ return 0;
+ }
+}
+
+const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader,
+ size_t *size) {
+ if (size) *size = reader->frame_size;
+
+ return reader->buffer;
+}
+
+int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) {
+ return (int64_t)reader->pts;
+}
+
+FILE *aom_video_reader_get_file(AvxVideoReader *reader) {
+ return reader->input_ctx.file;
+}
+
+const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) {
+ return &reader->info;
+}
diff --git a/third_party/aom/common/video_reader.h b/third_party/aom/common/video_reader.h
new file mode 100644
index 000000000..903deae84
--- /dev/null
+++ b/third_party/aom/common/video_reader.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_VIDEO_READER_H_
+#define AOM_COMMON_VIDEO_READER_H_
+
+#include "common/video_common.h"
+
+// The following code is work in progress. It is going to support transparent
+// reading of input files. Right now IVF, WebM (when WebM IO is enabled) and
+// raw OBU streams are recognized. The main goal of the API is to be simple and
+// easy to use in example code and in aomenc/aomdec later. All low-level
+// details like memory buffer management are hidden from API users.
+struct AvxVideoReaderStruct;
+typedef struct AvxVideoReaderStruct AvxVideoReader;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opens the input file for reading and inspects it to determine the file
+// type. Returns an opaque AvxVideoReader* upon success, or NULL upon failure.
+// IVF, WebM (when WebM IO is enabled) and raw OBU inputs are recognized.
+AvxVideoReader *aom_video_reader_open(const char *filename);
+
+// Frees all resources associated with the AvxVideoReader* returned from the
+// aom_video_reader_open() call.
+void aom_video_reader_close(AvxVideoReader *reader);
+
+// Reads a frame from the file and stores it in an internal buffer.
+int aom_video_reader_read_frame(AvxVideoReader *reader);
+
+// Returns a pointer to the memory buffer holding the frame data read by the
+// last call to aom_video_reader_read_frame().
+const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, size_t *size);
+
+// Returns the pts of the frame.
+int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader);
+
+// Returns the underlying input file handle of the reader.
+FILE *aom_video_reader_get_file(AvxVideoReader *reader);
+
+// Returns the AvxVideoInfo describing the opened video file.
+const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_VIDEO_READER_H_
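+
+/* A minimal, illustrative sketch (not part of the aom sources) of the read
+ * loop this API is designed for, using a hypothetical input.ivf and with
+ * error handling reduced to an early return:
+ *
+ *   AvxVideoReader *reader = aom_video_reader_open("input.ivf");
+ *   if (!reader) return EXIT_FAILURE;
+ *   while (aom_video_reader_read_frame(reader)) {
+ *     size_t frame_size = 0;
+ *     const uint8_t *frame = aom_video_reader_get_frame(reader, &frame_size);
+ *     // ... pass `frame` / `frame_size` to the decoder ...
+ *   }
+ *   aom_video_reader_close(reader);
+ */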
diff --git a/third_party/aom/common/video_writer.c b/third_party/aom/common/video_writer.c
new file mode 100644
index 000000000..a7ec309fc
--- /dev/null
+++ b/third_party/aom/common/video_writer.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "common/video_writer.h"
+
+#include <stdlib.h>
+
+#include "aom/aom_encoder.h"
+#include "common/ivfenc.h"
+
+struct AvxVideoWriterStruct {
+ AvxVideoInfo info;
+ FILE *file;
+ int frame_count;
+};
+
+static void write_header(FILE *file, const AvxVideoInfo *info,
+ int frame_count) {
+ struct aom_codec_enc_cfg cfg;
+ cfg.g_w = info->frame_width;
+ cfg.g_h = info->frame_height;
+ cfg.g_timebase.num = info->time_base.numerator;
+ cfg.g_timebase.den = info->time_base.denominator;
+
+ ivf_write_file_header(file, &cfg, info->codec_fourcc, frame_count);
+}
+
+AvxVideoWriter *aom_video_writer_open(const char *filename,
+ AvxContainer container,
+ const AvxVideoInfo *info) {
+ if (container == kContainerIVF) {
+ AvxVideoWriter *writer = NULL;
+ FILE *const file = fopen(filename, "wb");
+ if (!file) return NULL;
+
+ writer = malloc(sizeof(*writer));
+ if (!writer) {
+ fclose(file);
+ return NULL;
+ }
+
+ writer->frame_count = 0;
+ writer->info = *info;
+ writer->file = file;
+
+ write_header(writer->file, info, 0);
+
+ return writer;
+ }
+
+ return NULL;
+}
+
+void aom_video_writer_close(AvxVideoWriter *writer) {
+ if (writer) {
+ // Rewrite the IVF file header with the real frame count.
+ rewind(writer->file);
+ write_header(writer->file, &writer->info, writer->frame_count);
+
+ fclose(writer->file);
+ free(writer);
+ }
+}
+
+int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
+ size_t size, int64_t pts) {
+ ivf_write_frame_header(writer->file, pts, size);
+ if (fwrite(buffer, 1, size, writer->file) != size) return 0;
+
+ ++writer->frame_count;
+
+ return 1;
+}
diff --git a/third_party/aom/common/video_writer.h b/third_party/aom/common/video_writer.h
new file mode 100644
index 000000000..3e2b6554b
--- /dev/null
+++ b/third_party/aom/common/video_writer.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_VIDEO_WRITER_H_
+#define AOM_COMMON_VIDEO_WRITER_H_
+
+#include "common/video_common.h"
+
+typedef enum { kContainerIVF } AvxContainer;
+
+struct AvxVideoWriterStruct;
+typedef struct AvxVideoWriterStruct AvxVideoWriter;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Finds and opens writer for specified container format.
+// Returns an opaque AvxVideoWriter* upon success, or NULL upon failure.
+// Right now only IVF format is supported.
+AvxVideoWriter *aom_video_writer_open(const char *filename,
+ AvxContainer container,
+ const AvxVideoInfo *info);
+
+// Frees all resources associated with the AvxVideoWriter* returned from the
+// aom_video_writer_open() call.
+void aom_video_writer_close(AvxVideoWriter *writer);
+
+// Writes frame bytes to the file.
+int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
+ size_t size, int64_t pts);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_VIDEO_WRITER_H_
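+
+/* A minimal, illustrative sketch (not part of the aom sources) of writing an
+ * IVF file with this API; `frame_buf`, `frame_size` and `pts` are assumed to
+ * come from an encoder, and output.ivf is a hypothetical file name:
+ *
+ *   AvxVideoInfo info = { AV1_FOURCC, 640, 480, { 1, 30 }, 0 };
+ *   AvxVideoWriter *writer =
+ *       aom_video_writer_open("output.ivf", kContainerIVF, &info);
+ *   if (!writer) return EXIT_FAILURE;
+ *   if (!aom_video_writer_write_frame(writer, frame_buf, frame_size, pts))
+ *     return EXIT_FAILURE;
+ *   aom_video_writer_close(writer);
+ */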
diff --git a/third_party/aom/common/warnings.c b/third_party/aom/common/warnings.c
new file mode 100644
index 000000000..2facee252
--- /dev/null
+++ b/third_party/aom/common/warnings.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/warnings.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "apps/aomenc.h"
+#include "common/tools_common.h"
+
+static const char quantizer_warning_string[] =
+ "Bad quantizer values. Quantizer values should not be equal, and should "
+ "differ by at least 8.";
+
+struct WarningListNode {
+ const char *warning_string;
+ struct WarningListNode *next_warning;
+};
+
+struct WarningList {
+ struct WarningListNode *warning_node;
+};
+
+static void add_warning(const char *warning_string,
+ struct WarningList *warning_list) {
+ struct WarningListNode **node = &warning_list->warning_node;
+
+ struct WarningListNode *new_node = malloc(sizeof(*new_node));
+ if (new_node == NULL) {
+ fatal("Unable to allocate warning node.");
+ }
+
+ new_node->warning_string = warning_string;
+ new_node->next_warning = NULL;
+
+ while (*node != NULL) node = &(*node)->next_warning;
+
+ *node = new_node;
+}
+
+static void free_warning_list(struct WarningList *warning_list) {
+ while (warning_list->warning_node != NULL) {
+ struct WarningListNode *const node = warning_list->warning_node;
+ warning_list->warning_node = node->next_warning;
+ free(node);
+ }
+}
+
+static int continue_prompt(int num_warnings) {
+ int c;
+ fprintf(stderr,
+ "%d encoder configuration warning(s). Continue? (y to continue) ",
+ num_warnings);
+ c = getchar();
+ return c == 'y';
+}
+
+static void check_quantizer(int min_q, int max_q,
+ struct WarningList *warning_list) {
+ const int lossless = min_q == 0 && max_q == 0;
+ if (!lossless && (min_q == max_q || abs(max_q - min_q) < 8))
+ add_warning(quantizer_warning_string, warning_list);
+}
+
+void check_encoder_config(int disable_prompt,
+ const struct AvxEncoderConfig *global_config,
+ const struct aom_codec_enc_cfg *stream_config) {
+ int num_warnings = 0;
+ struct WarningListNode *warning = NULL;
+ struct WarningList warning_list = { 0 };
+ (void)global_config;
+ check_quantizer(stream_config->rc_min_quantizer,
+ stream_config->rc_max_quantizer, &warning_list);
+ /* Count and print warnings. */
+ for (warning = warning_list.warning_node; warning != NULL;
+ warning = warning->next_warning, ++num_warnings) {
+ warn("%s", warning->warning_string);
+ }
+
+ free_warning_list(&warning_list);
+
+ if (num_warnings) {
+ if (!disable_prompt && !continue_prompt(num_warnings)) exit(EXIT_FAILURE);
+ }
+}
diff --git a/third_party/aom/common/warnings.h b/third_party/aom/common/warnings.h
new file mode 100644
index 000000000..36f1fe070
--- /dev/null
+++ b/third_party/aom/common/warnings.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_WARNINGS_H_
+#define AOM_COMMON_WARNINGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_codec_enc_cfg;
+struct AvxEncoderConfig;
+
+/*
+ * Checks config for improperly used settings. Warns user upon encountering
+ * settings that will lead to poor output quality. Prompts user to continue
+ * when warnings are issued.
+ */
+void check_encoder_config(int disable_prompt,
+ const struct AvxEncoderConfig *global_config,
+ const struct aom_codec_enc_cfg *stream_config);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_WARNINGS_H_
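+
+/* A minimal, illustrative sketch (not part of the aom sources) of driving this
+ * check. The AvxEncoderConfig argument is currently ignored by the
+ * implementation, so NULL is passed here purely for illustration:
+ *
+ *   aom_codec_enc_cfg_t cfg;
+ *   aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 0);
+ *   cfg.rc_min_quantizer = 30;
+ *   cfg.rc_max_quantizer = 34;  // differs by fewer than 8 -> triggers warning
+ *   check_encoder_config(1, NULL, &cfg);  // 1 disables the interactive prompt
+ */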
diff --git a/third_party/aom/common/webmdec.cc b/third_party/aom/common/webmdec.cc
new file mode 100644
index 000000000..17ac53c93
--- /dev/null
+++ b/third_party/aom/common/webmdec.cc
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/webmdec.h"
+
+#include <cassert>
+#include <cstring>
+#include <cstdio>
+
+#include "third_party/libwebm/mkvparser/mkvparser.h"
+#include "third_party/libwebm/mkvparser/mkvreader.h"
+
+namespace {
+
+void reset(struct WebmInputContext *const webm_ctx) {
+ if (webm_ctx->reader != NULL) {
+ mkvparser::MkvReader *const reader =
+ reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
+ delete reader;
+ }
+ if (webm_ctx->segment != NULL) {
+ mkvparser::Segment *const segment =
+ reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+ delete segment;
+ }
+ if (webm_ctx->buffer != NULL) {
+ delete[] webm_ctx->buffer;
+ }
+ webm_ctx->reader = NULL;
+ webm_ctx->segment = NULL;
+ webm_ctx->buffer = NULL;
+ webm_ctx->cluster = NULL;
+ webm_ctx->block_entry = NULL;
+ webm_ctx->block = NULL;
+ webm_ctx->block_frame_index = 0;
+ webm_ctx->video_track_index = 0;
+ webm_ctx->timestamp_ns = 0;
+ webm_ctx->is_key_frame = false;
+}
+
+void get_first_cluster(struct WebmInputContext *const webm_ctx) {
+ mkvparser::Segment *const segment =
+ reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+ const mkvparser::Cluster *const cluster = segment->GetFirst();
+ webm_ctx->cluster = cluster;
+}
+
+void rewind_and_reset(struct WebmInputContext *const webm_ctx,
+ struct AvxInputContext *const aom_ctx) {
+ rewind(aom_ctx->file);
+ reset(webm_ctx);
+}
+
+} // namespace
+
+int file_is_webm(struct WebmInputContext *webm_ctx,
+ struct AvxInputContext *aom_ctx) {
+ mkvparser::MkvReader *const reader = new mkvparser::MkvReader(aom_ctx->file);
+ webm_ctx->reader = reader;
+ webm_ctx->reached_eos = 0;
+
+ mkvparser::EBMLHeader header;
+ long long pos = 0;
+ if (header.Parse(reader, pos) < 0) {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+
+ mkvparser::Segment *segment;
+ if (mkvparser::Segment::CreateInstance(reader, pos, segment)) {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+ webm_ctx->segment = segment;
+ if (segment->Load() < 0) {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+
+ const mkvparser::Tracks *const tracks = segment->GetTracks();
+ const mkvparser::VideoTrack *video_track = NULL;
+ for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
+ const mkvparser::Track *const track = tracks->GetTrackByIndex(i);
+ if (track->GetType() == mkvparser::Track::kVideo) {
+ video_track = static_cast<const mkvparser::VideoTrack *>(track);
+ webm_ctx->video_track_index = static_cast<int>(track->GetNumber());
+ break;
+ }
+ }
+
+ if (video_track == NULL || video_track->GetCodecId() == NULL) {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+
+ if (!strncmp(video_track->GetCodecId(), "V_AV1", 5)) {
+ aom_ctx->fourcc = AV1_FOURCC;
+ } else {
+ rewind_and_reset(webm_ctx, aom_ctx);
+ return 0;
+ }
+
+ aom_ctx->framerate.denominator = 0;
+ aom_ctx->framerate.numerator = 0;
+ aom_ctx->width = static_cast<uint32_t>(video_track->GetWidth());
+ aom_ctx->height = static_cast<uint32_t>(video_track->GetHeight());
+
+ get_first_cluster(webm_ctx);
+
+ return 1;
+}
+
+int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size) {
+ assert(webm_ctx->buffer == *buffer);
+ // This check is needed for frame parallel decoding, in which case this
+ // function could be called even after it has reached end of input stream.
+ if (webm_ctx->reached_eos) {
+ return 1;
+ }
+ mkvparser::Segment *const segment =
+ reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+ const mkvparser::Cluster *cluster =
+ reinterpret_cast<const mkvparser::Cluster *>(webm_ctx->cluster);
+ const mkvparser::Block *block =
+ reinterpret_cast<const mkvparser::Block *>(webm_ctx->block);
+ const mkvparser::BlockEntry *block_entry =
+ reinterpret_cast<const mkvparser::BlockEntry *>(webm_ctx->block_entry);
+ bool block_entry_eos = false;
+ do {
+ long status = 0;
+ bool get_new_block = false;
+ if (block_entry == NULL && !block_entry_eos) {
+ status = cluster->GetFirst(block_entry);
+ get_new_block = true;
+ } else if (block_entry_eos || block_entry->EOS()) {
+ cluster = segment->GetNext(cluster);
+ if (cluster == NULL || cluster->EOS()) {
+ *bytes_read = 0;
+ webm_ctx->reached_eos = 1;
+ return 1;
+ }
+ status = cluster->GetFirst(block_entry);
+ block_entry_eos = false;
+ get_new_block = true;
+ } else if (block == NULL ||
+ webm_ctx->block_frame_index == block->GetFrameCount() ||
+ block->GetTrackNumber() != webm_ctx->video_track_index) {
+ status = cluster->GetNext(block_entry, block_entry);
+ if (block_entry == NULL || block_entry->EOS()) {
+ block_entry_eos = true;
+ continue;
+ }
+ get_new_block = true;
+ }
+ if (status || block_entry == NULL) {
+ return -1;
+ }
+ if (get_new_block) {
+ block = block_entry->GetBlock();
+ if (block == NULL) return -1;
+ webm_ctx->block_frame_index = 0;
+ }
+ } while (block_entry_eos ||
+ block->GetTrackNumber() != webm_ctx->video_track_index);
+
+ webm_ctx->cluster = cluster;
+ webm_ctx->block_entry = block_entry;
+ webm_ctx->block = block;
+
+ const mkvparser::Block::Frame &frame =
+ block->GetFrame(webm_ctx->block_frame_index);
+ ++webm_ctx->block_frame_index;
+ if (frame.len > static_cast<long>(*buffer_size)) {
+ delete[] * buffer;
+ *buffer = new uint8_t[frame.len];
+ webm_ctx->buffer = *buffer;
+ if (*buffer == NULL) {
+ return -1;
+ }
+ *buffer_size = frame.len;
+ }
+ *bytes_read = frame.len;
+ webm_ctx->timestamp_ns = block->GetTime(cluster);
+ webm_ctx->is_key_frame = block->IsKey();
+
+ mkvparser::MkvReader *const reader =
+ reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
+ return frame.Read(reader, *buffer) ? -1 : 0;
+}
+
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+ struct AvxInputContext *aom_ctx) {
+ uint32_t i = 0;
+ uint8_t *buffer = NULL;
+ size_t buffer_size = 0;
+ size_t bytes_read = 0;
+ assert(webm_ctx->buffer == NULL);
+ while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
+ if (webm_read_frame(webm_ctx, &buffer, &bytes_read, &buffer_size)) {
+ break;
+ }
+ ++i;
+ }
+ aom_ctx->framerate.numerator = (i - 1) * 1000000;
+ aom_ctx->framerate.denominator =
+ static_cast<int>(webm_ctx->timestamp_ns / 1000);
+ delete[] buffer;
+ webm_ctx->buffer = NULL;
+
+ get_first_cluster(webm_ctx);
+ webm_ctx->block = NULL;
+ webm_ctx->block_entry = NULL;
+ webm_ctx->block_frame_index = 0;
+ webm_ctx->timestamp_ns = 0;
+ webm_ctx->reached_eos = 0;
+
+ return 0;
+}
+
+void webm_free(struct WebmInputContext *webm_ctx) { reset(webm_ctx); }
diff --git a/third_party/aom/common/webmdec.h b/third_party/aom/common/webmdec.h
new file mode 100644
index 000000000..5ac75cb30
--- /dev/null
+++ b/third_party/aom/common/webmdec.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_WEBMDEC_H_
+#define AOM_COMMON_WEBMDEC_H_
+
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AvxInputContext;
+
+struct WebmInputContext {
+ void *reader;
+ void *segment;
+ uint8_t *buffer;
+ const void *cluster;
+ const void *block_entry;
+ const void *block;
+ int block_frame_index;
+ int video_track_index;
+ uint64_t timestamp_ns;
+ int is_key_frame;
+ int reached_eos;
+};
+
+// Checks if the input is a WebM file. If so, initializes the WebmInputContext
+// so that webm_read_frame can be called to retrieve a video frame.
+// Returns 1 on success and 0 on failure or when the input is not a WebM file.
+// TODO(vigneshv): Refactor this function into two smaller functions specific
+// to their task.
+int file_is_webm(struct WebmInputContext *webm_ctx,
+ struct AvxInputContext *aom_ctx);
+
+// Reads a WebM video frame. Memory for the frame buffer is created, owned and
+// managed by this function. For the first call, |*buffer| should be NULL and
+// |*buffer_size| should be 0. Once all the frames are read and used,
+// webm_free() should be called, otherwise there will be a leak.
+// Parameters:
+// webm_ctx - WebmInputContext object
+// buffer - pointer where the frame data will be filled.
+// bytes_read - pointer to bytes read.
+// buffer_size - pointer to buffer size.
+// Return values:
+// 0 - Success
+// 1 - End of Stream
+// -1 - Error
+int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
+ size_t *bytes_read, size_t *buffer_size);
+
+// Guesses the frame rate of the input file based on the container timestamps.
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+ struct AvxInputContext *aom_ctx);
+
+// Resets the WebmInputContext.
+void webm_free(struct WebmInputContext *webm_ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_WEBMDEC_H_
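+
+/* A minimal, illustrative sketch (not part of the aom sources) of the calling
+ * sequence described above, using a hypothetical input.webm; both contexts are
+ * zero-initialized so that the internal buffer starts out NULL:
+ *
+ *   struct WebmInputContext webm_ctx = { 0 };
+ *   struct AvxInputContext aom_ctx = { 0 };
+ *   aom_ctx.file = fopen("input.webm", "rb");
+ *   if (aom_ctx.file && file_is_webm(&webm_ctx, &aom_ctx)) {
+ *     uint8_t *buffer = NULL;
+ *     size_t buffer_size = 0, bytes_read = 0;
+ *     while (webm_read_frame(&webm_ctx, &buffer, &bytes_read,
+ *                            &buffer_size) == 0) {
+ *       // ... decode `bytes_read` bytes from `buffer` ...
+ *     }
+ *     webm_free(&webm_ctx);
+ *   }
+ *   if (aom_ctx.file) fclose(aom_ctx.file);
+ */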
diff --git a/third_party/aom/common/webmenc.cc b/third_party/aom/common/webmenc.cc
new file mode 100644
index 000000000..58ab33670
--- /dev/null
+++ b/third_party/aom/common/webmenc.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/webmenc.h"
+
+#include <string>
+
+#include "third_party/libwebm/mkvmuxer/mkvmuxer.h"
+#include "third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
+#include "third_party/libwebm/mkvmuxer/mkvwriter.h"
+
+namespace {
+const uint64_t kDebugTrackUid = 0xDEADBEEF;
+const int kVideoTrackNumber = 1;
+} // namespace
+
+void write_webm_file_header(struct WebmOutputContext *webm_ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ stereo_format_t stereo_fmt, unsigned int fourcc,
+ const struct AvxRational *par) {
+ mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
+ mkvmuxer::Segment *const segment = new mkvmuxer::Segment();
+ segment->Init(writer);
+ segment->set_mode(mkvmuxer::Segment::kFile);
+ segment->OutputCues(true);
+
+ mkvmuxer::SegmentInfo *const info = segment->GetSegmentInfo();
+ const uint64_t kTimecodeScale = 1000000;
+ info->set_timecode_scale(kTimecodeScale);
+ std::string version = "aomenc";
+ if (!webm_ctx->debug) {
+ version.append(std::string(" ") + aom_codec_version_str());
+ }
+ info->set_writing_app(version.c_str());
+
+ const uint64_t video_track_id =
+ segment->AddVideoTrack(static_cast<int>(cfg->g_w),
+ static_cast<int>(cfg->g_h), kVideoTrackNumber);
+ mkvmuxer::VideoTrack *const video_track = static_cast<mkvmuxer::VideoTrack *>(
+ segment->GetTrackByNumber(video_track_id));
+ video_track->SetStereoMode(stereo_fmt);
+ const char *codec_id;
+ switch (fourcc) {
+ case AV1_FOURCC: codec_id = "V_AV1"; break;
+ default: codec_id = "V_AV1"; break;
+ }
+ video_track->set_codec_id(codec_id);
+ if (par->numerator > 1 || par->denominator > 1) {
+ // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type
+ // to WebM format.
+ const uint64_t display_width = static_cast<uint64_t>(
+ ((cfg->g_w * par->numerator * 1.0) / par->denominator) + .5);
+ video_track->set_display_width(display_width);
+ video_track->set_display_height(cfg->g_h);
+ }
+ if (webm_ctx->debug) {
+ video_track->set_uid(kDebugTrackUid);
+ }
+ webm_ctx->writer = writer;
+ webm_ctx->segment = segment;
+}
+
+void write_webm_block(struct WebmOutputContext *webm_ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ const aom_codec_cx_pkt_t *pkt) {
+ mkvmuxer::Segment *const segment =
+ reinterpret_cast<mkvmuxer::Segment *>(webm_ctx->segment);
+ int64_t pts_ns = pkt->data.frame.pts * 1000000000ll * cfg->g_timebase.num /
+ cfg->g_timebase.den;
+ if (pts_ns <= webm_ctx->last_pts_ns) pts_ns = webm_ctx->last_pts_ns + 1000000;
+ webm_ctx->last_pts_ns = pts_ns;
+
+ segment->AddFrame(static_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz, kVideoTrackNumber, pts_ns,
+ pkt->data.frame.flags & AOM_FRAME_IS_KEY);
+}
+
+void write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
+ mkvmuxer::MkvWriter *const writer =
+ reinterpret_cast<mkvmuxer::MkvWriter *>(webm_ctx->writer);
+ mkvmuxer::Segment *const segment =
+ reinterpret_cast<mkvmuxer::Segment *>(webm_ctx->segment);
+ segment->Finalize();
+ delete segment;
+ delete writer;
+ webm_ctx->writer = NULL;
+ webm_ctx->segment = NULL;
+}
diff --git a/third_party/aom/common/webmenc.h b/third_party/aom/common/webmenc.h
new file mode 100644
index 000000000..aa9832fba
--- /dev/null
+++ b/third_party/aom/common/webmenc.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_WEBMENC_H_
+#define AOM_COMMON_WEBMENC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tools_common.h"
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct WebmOutputContext {
+ int debug;
+ FILE *stream;
+ int64_t last_pts_ns;
+ void *writer;
+ void *segment;
+};
+
+/* Stereo 3D packed frame format */
+typedef enum stereo_format {
+ STEREO_FORMAT_MONO = 0,
+ STEREO_FORMAT_LEFT_RIGHT = 1,
+ STEREO_FORMAT_BOTTOM_TOP = 2,
+ STEREO_FORMAT_TOP_BOTTOM = 3,
+ STEREO_FORMAT_RIGHT_LEFT = 11
+} stereo_format_t;
+
+void write_webm_file_header(struct WebmOutputContext *webm_ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ stereo_format_t stereo_fmt, unsigned int fourcc,
+ const struct AvxRational *par);
+
+void write_webm_block(struct WebmOutputContext *webm_ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ const aom_codec_cx_pkt_t *pkt);
+
+void write_webm_file_footer(struct WebmOutputContext *webm_ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_WEBMENC_H_
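+
+/* A minimal, illustrative sketch (not part of the aom sources) of the write
+ * sequence inside an encode loop; `out_file`, `cfg` and `pkt` are assumed to
+ * come from the surrounding encoder code:
+ *
+ *   struct WebmOutputContext webm_ctx = { 0 };
+ *   const struct AvxRational par = { 1, 1 };
+ *   webm_ctx.stream = out_file;
+ *   webm_ctx.last_pts_ns = -1;
+ *   write_webm_file_header(&webm_ctx, &cfg, STEREO_FORMAT_MONO, AV1_FOURCC,
+ *                          &par);
+ *   // for every AOM_CODEC_CX_FRAME_PKT returned by aom_codec_get_cx_data():
+ *   write_webm_block(&webm_ctx, &cfg, pkt);
+ *   write_webm_file_footer(&webm_ctx);
+ */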
diff --git a/third_party/aom/common/y4menc.c b/third_party/aom/common/y4menc.c
new file mode 100644
index 000000000..585d22197
--- /dev/null
+++ b/third_party/aom/common/y4menc.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "common/rawenc.h"
+#include "common/y4menc.h"
+
+// Returns the Y4M name associated with the monochrome colorspace.
+const char *monochrome_colorspace(unsigned int bit_depth) {
+ switch (bit_depth) {
+ case 8: return "Cmono";
+ case 9: return "Cmono9";
+ case 10: return "Cmono10";
+ case 12: return "Cmono12";
+ case 16: return "Cmono16";
+ default: assert(0); return NULL;
+ }
+}
+
+// Return the Y4M name of the 8-bit colorspace, given the chroma position and
+// image format.
+const char *colorspace8(aom_chroma_sample_position_t csp, aom_img_fmt_t fmt) {
+ switch (fmt) {
+ case AOM_IMG_FMT_444A: return "C444alpha";
+ case AOM_IMG_FMT_I444: return "C444";
+ case AOM_IMG_FMT_I422: return "C422";
+ default:
+ if (csp == AOM_CSP_VERTICAL) {
+ return "C420mpeg2 XYSCSS=420MPEG2";
+ } else {
+ return "C420jpeg";
+ }
+ }
+}
+
+// Return the Y4M name of the colorspace, given the bit depth and image format.
+const char *colorspace(unsigned int bit_depth, aom_chroma_sample_position_t csp,
+ aom_img_fmt_t fmt) {
+ switch (bit_depth) {
+ case 8: return colorspace8(csp, fmt);
+ case 9:
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p9 XYSCSS=444P9"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9"
+ : "C420p9 XYSCSS=420P9";
+ case 10:
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p10 XYSCSS=444P10"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10"
+ : "C420p10 XYSCSS=420P10";
+ case 12:
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p12 XYSCSS=444P12"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12"
+ : "C420p12 XYSCSS=420P12";
+ case 14:
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p14 XYSCSS=444P14"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14"
+ : "C420p14 XYSCSS=420P14";
+ case 16:
+ return fmt == AOM_IMG_FMT_I44416
+ ? "C444p16 XYSCSS=444P16"
+ : fmt == AOM_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16"
+ : "C420p16 XYSCSS=420P16";
+ default: assert(0); return NULL;
+ }
+}
+
+int y4m_write_file_header(char *buf, size_t len, int width, int height,
+ const struct AvxRational *framerate, int monochrome,
+ aom_chroma_sample_position_t csp, aom_img_fmt_t fmt,
+ unsigned int bit_depth) {
+ const char *color = monochrome ? monochrome_colorspace(bit_depth)
+ : colorspace(bit_depth, csp, fmt);
+ return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s\n", width, height,
+ framerate->numerator, framerate->denominator, 'p', color);
+}
+
+int y4m_write_frame_header(char *buf, size_t len) {
+ return snprintf(buf, len, "FRAME\n");
+}
+
+void y4m_write_image_file(const aom_image_t *img, const int *planes,
+ FILE *file) {
+ int num_planes = img->monochrome ? 1 : 3;
+ raw_write_image_file(img, planes, num_planes, file);
+}
+
+void y4m_update_image_md5(const aom_image_t *img, const int *planes,
+ MD5Context *md5) {
+ int num_planes = img->monochrome ? 1 : 3;
+ raw_update_image_md5(img, planes, num_planes, md5);
+}
diff --git a/third_party/aom/common/y4menc.h b/third_party/aom/common/y4menc.h
new file mode 100644
index 000000000..f6d5fd86b
--- /dev/null
+++ b/third_party/aom/common/y4menc.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_Y4MENC_H_
+#define AOM_COMMON_Y4MENC_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define Y4M_BUFFER_SIZE 128
+
+int y4m_write_file_header(char *buf, size_t len, int width, int height,
+ const struct AvxRational *framerate, int monochrome,
+ aom_chroma_sample_position_t csp, aom_img_fmt_t fmt,
+ unsigned int bit_depth);
+int y4m_write_frame_header(char *buf, size_t len);
+void y4m_write_image_file(const aom_image_t *img, const int *planes,
+ FILE *file);
+void y4m_update_image_md5(const aom_image_t *img, const int *planes,
+ MD5Context *md5);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_Y4MENC_H_
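+
+/* A minimal, illustrative sketch (not part of the aom sources) of emitting a
+ * Y4M stream; `img` is an aom_image_t from the decoder, `out_file` is an open
+ * FILE *, and the plane-index array follows the convention assumed by
+ * rawenc.h (not shown here):
+ *
+ *   char header[Y4M_BUFFER_SIZE];
+ *   const int planes[3] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
+ *   const struct AvxRational fps = { 30, 1 };
+ *   int len = y4m_write_file_header(header, sizeof(header), img->d_w,
+ *                                   img->d_h, &fps, img->monochrome,
+ *                                   AOM_CSP_UNKNOWN, img->fmt, img->bit_depth);
+ *   fwrite(header, 1, len, out_file);
+ *   len = y4m_write_frame_header(header, sizeof(header));
+ *   fwrite(header, 1, len, out_file);
+ *   y4m_write_image_file(img, planes, out_file);
+ */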
diff --git a/third_party/aom/common/y4minput.c b/third_party/aom/common/y4minput.c
new file mode 100644
index 000000000..eca8b1bba
--- /dev/null
+++ b/third_party/aom/common/y4minput.c
@@ -0,0 +1,1142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * Based on code from the OggTheora software codec source code,
+ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "y4minput.h"
+
+// Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
+// Returns true on success.
+static int file_read(void *buf, size_t size, FILE *file) {
+ const int kMaxRetries = 5;
+ int retry_count = 0;
+ int file_error;
+ size_t len = 0;
+ do {
+ const size_t n = fread((uint8_t *)buf + len, 1, size - len, file);
+ len += n;
+ file_error = ferror(file);
+ if (file_error) {
+ if (errno == EINTR || errno == EAGAIN) {
+ clearerr(file);
+ continue;
+ } else {
+ fprintf(stderr, "Error reading file: %u of %u bytes read, %d: %s\n",
+ (uint32_t)len, (uint32_t)size, errno, strerror(errno));
+ return 0;
+ }
+ }
+ } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+
+ if (!feof(file) && len != size) {
+ fprintf(stderr,
+ "Error reading file: %u of %u bytes read,"
+ " error: %d, retries: %d, %d: %s\n",
+ (uint32_t)len, (uint32_t)size, file_error, retry_count, errno,
+ strerror(errno));
+ }
+ return len == size;
+}
+
+static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
+ int got_w;
+ int got_h;
+ int got_fps;
+ int got_interlace;
+ int got_par;
+ int got_chroma;
+ char *p;
+ char *q;
+ got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0;
+ for (p = _tags;; p = q) {
+ /*Skip any leading spaces.*/
+ while (*p == ' ') p++;
+ /*If that's all we have, stop.*/
+ if (p[0] == '\0') break;
+ /*Find the end of this tag.*/
+ for (q = p + 1; *q != '\0' && *q != ' '; q++) {
+ }
+ /*Process the tag.*/
+ switch (p[0]) {
+ case 'W': {
+ if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1;
+ got_w = 1;
+ } break;
+ case 'H': {
+ if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1;
+ got_h = 1;
+ } break;
+ case 'F': {
+ if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
+ return -1;
+ }
+ got_fps = 1;
+ } break;
+ case 'I': {
+ _y4m->interlace = p[1];
+ got_interlace = 1;
+ } break;
+ case 'A': {
+ if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
+ return -1;
+ }
+ got_par = 1;
+ } break;
+ case 'C': {
+ if (q - p > 16) return -1;
+ memcpy(_y4m->chroma_type, p + 1, q - p - 1);
+ _y4m->chroma_type[q - p - 1] = '\0';
+ got_chroma = 1;
+ } break;
+ /*Ignore unknown tags.*/
+ }
+ }
+ if (!got_w || !got_h || !got_fps) return -1;
+ if (!got_interlace) _y4m->interlace = '?';
+ if (!got_par) _y4m->par_n = _y4m->par_d = 0;
+ /*Chroma-type is not specified in older files, e.g., those generated by
+ mplayer.*/
+ if (!got_chroma) strcpy(_y4m->chroma_type, "420");
+ return 0;
+}
+
+/*All anti-aliasing filters in the following conversion functions are based on
+ one of two window functions:
+ The 6-tap Lanczos window (for down-sampling and shifts):
+ sinc(\pi*t)*sinc(\pi*t/3), |t|<3 (sinc(t)==sin(t)/t)
+ 0, |t|>=3
+ The 4-tap Mitchell window (for up-sampling):
+ 7|t|^3-12|t|^2+16/3, |t|<1
+ -(7/3)|t|^3+12|t|^2-20|t|+32/3, |t|<2
+ 0, |t|>=2
+ The number of taps is intentionally kept small to reduce computational
+ overhead and limit ringing.
+
+ The taps from these filters are scaled so that their sum is 1, and the
+ result is scaled by 128 and rounded to integers to create a filter whose
+ intermediate values fit inside 16 bits.
+ Coefficients are rounded in such a way as to ensure their sum is still 128,
+ which is usually equivalent to normal rounding.
+
+ Conversions which require both horizontal and vertical filtering could
+ have these steps pipelined, for less memory consumption and better cache
+ performance, but we do them separately for simplicity.*/
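+/*For example, the quarter-pel shift filter used below, [4 -17 114 35 -9 1],
+ sums to exactly 128, so the >>7 (together with the +64 rounding term) in the
+ helper functions renormalizes the filtered output.*/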
+#define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a))
+#define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a))
+#define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c)))
+
+/*420jpeg chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |   BR  |       |   BR  |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |   BR  |       |   BR  |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+
+ 420mpeg2 chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ BR      |       BR      |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ BR      |       BR      |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+
+ We use a resampling filter to shift the site locations one quarter pixel (at
+ the chroma plane's resolution) to the right.
+ The 4:2:2 modes look exactly the same, except there are twice as many chroma
+ lines, and they are vertically co-sited with the luma samples in both the
+ mpeg2 and jpeg cases (thus requiring no vertical resampling).*/
+static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst,
+ const unsigned char *_src, int _c_w,
+ int _c_h) {
+ int y;
+ int x;
+ for (y = 0; y < _c_h; y++) {
+ /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos
+ window.*/
+ for (x = 0; x < OC_MINI(_c_w, 2); x++) {
+ _dst[x] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] +
+ 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+ 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] +
+ 64) >>
+ 7,
+ 255);
+ }
+ for (; x < _c_w - 3; x++) {
+ _dst[x] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+ 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >>
+ 7,
+ 255);
+ }
+ for (; x < _c_w; x++) {
+ _dst[x] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] +
+ 35 * _src[OC_MINI(x + 1, _c_w - 1)] -
+ 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >>
+ 7,
+ 255);
+ }
+ _dst += _c_w;
+ _src += _c_w;
+ }
+}
+
+/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/
+static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ int c_w;
+ int c_h;
+ int c_sz;
+ int pli;
+ /*Skip past the luma data.*/
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ /*Compute the size of each chroma plane.*/
+ c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+ c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+ c_sz = c_w * c_h;
+ for (pli = 1; pli < 3; pli++) {
+ y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h);
+ _dst += c_sz;
+ _aux += c_sz;
+ }
+}
+
+/*This format is only used for interlaced content, but is included for
+ completeness.
+
+ 420jpeg chroma samples are sited like:
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |   BR  |       |   BR  |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |   BR  |       |   BR  |
+ |       |       |       |
+ Y-------Y-------Y-------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+
+ 420paldv chroma samples are sited like:
+ YR------Y-------YR------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+ YB------Y-------YB------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+ YR------Y-------YR------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+ YB------Y-------YB------Y-------
+ |       |       |       |
+ |       |       |       |
+ |       |       |       |
+
+ We use a resampling filter to shift the site locations one quarter pixel (at
+ the chroma plane's resolution) to the right.
+ Then we use another filter to move the C_r location down one quarter pixel,
+ and the C_b location up one quarter pixel.*/
+static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ unsigned char *tmp;
+ int c_w;
+ int c_h;
+ int c_sz;
+ int pli;
+ int y;
+ int x;
+ /*Skip past the luma data.*/
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ /*Compute the size of each chroma plane.*/
+ c_w = (_y4m->pic_w + 1) / 2;
+ c_h = (_y4m->pic_h + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+ c_sz = c_w * c_h;
+ tmp = _aux + 2 * c_sz;
+ for (pli = 1; pli < 3; pli++) {
+ /*First do the horizontal re-sampling.
+ This is the same as the mpeg2 case, except that after the horizontal
+ pass, we need to apply a second vertical filter.*/
+ y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h);
+ _aux += c_sz;
+ switch (pli) {
+ case 1: {
+ /*Slide C_b up a quarter-pel.
+ This is the same filter used above, but in the other order.*/
+ for (x = 0; x < c_w; x++) {
+ for (y = 0; y < OC_MINI(c_h, 3); y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] +
+ 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+ 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < c_h - 2; y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+ 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < c_h; y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] +
+ 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] -
+ 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] +
+ 4 * tmp[(c_h - 1) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ _dst++;
+ tmp++;
+ }
+ _dst += c_sz - c_w;
+ tmp -= c_w;
+ } break;
+ case 2: {
+ /*Slide C_r down a quarter-pel.
+ This is the same as the horizontal filter.*/
+ for (x = 0; x < c_w; x++) {
+ for (y = 0; y < OC_MINI(c_h, 2); y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] +
+ 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] -
+ 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] +
+ tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < c_h - 3; y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
+ 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] -
+ 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < c_h; y++) {
+ _dst[y * c_w] = (unsigned char)OC_CLAMPI(
+ 0,
+ (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] +
+ 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] -
+ 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] +
+ 64) >>
+ 7,
+ 255);
+ }
+ _dst++;
+ tmp++;
+ }
+ } break;
+ }
+ /*For actual interlaced material, this would have to be done separately on
+ each field, and the shift amounts would be different.
+ C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8,
+ C_b up 1/8 in the bottom field.
+ The corresponding filters would be:
+ Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128
+ Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/
+ }
+}
+
+/*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0.
+ This is used as a helper by several conversion routines.*/
+static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst,
+ const unsigned char *_src, int _c_w,
+ int _c_h) {
+ int y;
+ int x;
+ /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
+ for (x = 0; x < _c_w; x++) {
+ for (y = 0; y < OC_MINI(_c_h, 2); y += 2) {
+ _dst[(y >> 1) * _c_w] =
+ OC_CLAMPI(0,
+ (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] -
+ 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] +
+ 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < _c_h - 3; y += 2) {
+ _dst[(y >> 1) * _c_w] =
+ OC_CLAMPI(0,
+ (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) -
+ 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) +
+ 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >>
+ 7,
+ 255);
+ }
+ for (; y < _c_h; y += 2) {
+ _dst[(y >> 1) * _c_w] = OC_CLAMPI(
+ 0,
+ (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) -
+ 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) +
+ 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) +
+ 64) >>
+ 7,
+ 255);
+ }
+ _src++;
+ _dst++;
+ }
+}
+
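A related property of the [3 -17 78 78 -17 3]/128 kernel, shown by the standalone sketch below (not part of the patch): it is symmetric and its taps sum to 128, so a constant column is preserved and the decimated sample lands exactly halfway between the two source rows it replaces, which is the vertical siting 420jpeg expects.

#include <stdio.h>

int main(void) {
  const int taps[6] = { 3, -17, 78, 78, -17, 3 };
  int sum = 0;
  int symmetric = 1;
  int i;
  for (i = 0; i < 6; i++) {
    sum += taps[i];
    if (taps[i] != taps[5 - i]) symmetric = 0;
  }
  /* Expect sum = 128 (unit DC gain after >> 7) and symmetric = 1 (no phase shift). */
  printf("sum = %d, symmetric = %d\n", sum, symmetric);
  return 0;
}
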
+/*420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+ 422jpeg chroma samples are sited like:
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+ We use a resampling filter to decimate the chroma planes by two in the
+ vertical direction.*/
+static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ int c_w;
+ int c_h;
+ int c_sz;
+ int dst_c_w;
+ int dst_c_h;
+ int dst_c_sz;
+ int pli;
+ /*Skip past the luma data.*/
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ /*Compute the size of each chroma plane.*/
+ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+ c_h = _y4m->pic_h;
+ dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+ dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+ c_sz = c_w * c_h;
+ dst_c_sz = dst_c_w * dst_c_h;
+ for (pli = 1; pli < 3; pli++) {
+ y4m_422jpeg_420jpeg_helper(_dst, _aux, c_w, c_h);
+ _aux += c_sz;
+ _dst += dst_c_sz;
+ }
+}
+
+/*420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+ 422 chroma samples are sited like:
+  YBR-----Y-------YBR-----Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------YBR-----Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------YBR-----Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------YBR-----Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+ We use a resampling filter to shift the original site locations one quarter
+ pixel (at the original chroma resolution) to the right.
+ Then we use a second resampling filter to decimate the chroma planes by two
+ in the vertical direction.*/
+static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ unsigned char *tmp;
+ int c_w;
+ int c_h;
+ int c_sz;
+ int dst_c_h;
+ int dst_c_sz;
+ int pli;
+ /*Skip past the luma data.*/
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ /*Compute the size of each chroma plane.*/
+ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+ c_h = _y4m->pic_h;
+ dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+ c_sz = c_w * c_h;
+ dst_c_sz = c_w * dst_c_h;
+ tmp = _aux + 2 * c_sz;
+ for (pli = 1; pli < 3; pli++) {
+ /*In reality, the horizontal and vertical steps could be pipelined, for
+ less memory consumption and better cache performance, but we do them
+ separately for simplicity.*/
+ /*First do horizontal filtering (convert to 422jpeg)*/
+ y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h);
+ /*Now do the vertical filtering.*/
+ y4m_422jpeg_420jpeg_helper(_dst, tmp, c_w, c_h);
+ _aux += c_sz;
+ _dst += dst_c_sz;
+ }
+}
+
+/*420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+ 411 chroma samples are sited like:
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+ We use a filter to resample at site locations one eighth pixel (at the source
+ chroma plane's horizontal resolution) and five eighths of a pixel to the
+ right.
+ Then we use another filter to decimate the planes by 2 in the vertical
+ direction.*/
+static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ unsigned char *tmp;
+ int c_w;
+ int c_h;
+ int c_sz;
+ int dst_c_w;
+ int dst_c_h;
+ int dst_c_sz;
+ int tmp_sz;
+ int pli;
+ int y;
+ int x;
+ /*Skip past the luma data.*/
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ /*Compute the size of each chroma plane.*/
+ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+ c_h = _y4m->pic_h;
+ dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+ dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+ c_sz = c_w * c_h;
+ dst_c_sz = dst_c_w * dst_c_h;
+ tmp_sz = dst_c_w * c_h;
+ tmp = _aux + 2 * c_sz;
+ for (pli = 1; pli < 3; pli++) {
+ /*In reality, the horizontal and vertical steps could be pipelined, for
+ less memory consumption and better cache performance, but we do them
+ separately for simplicity.*/
+ /*First do horizontal filtering (convert to 422jpeg)*/
+ for (y = 0; y < c_h; y++) {
+ /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a
+ 4-tap Mitchell window.*/
+ for (x = 0; x < OC_MINI(c_w, 1); x++) {
+ tmp[x << 1] = (unsigned char)OC_CLAMPI(
+ 0,
+ (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] -
+ _aux[OC_MINI(2, c_w - 1)] + 64) >>
+ 7,
+ 255);
+ tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
+ 0,
+ (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] -
+ 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >>
+ 7,
+ 255);
+ }
+ for (; x < c_w - 2; x++) {
+ tmp[x << 1] =
+ (unsigned char)OC_CLAMPI(0,
+ (_aux[x - 1] + 110 * _aux[x] +
+ 18 * _aux[x + 1] - _aux[x + 2] + 64) >>
+ 7,
+ 255);
+ tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
+ 0,
+ (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] -
+ 5 * _aux[x + 2] + 64) >>
+ 7,
+ 255);
+ }
+ for (; x < c_w; x++) {
+ tmp[x << 1] = (unsigned char)OC_CLAMPI(
+ 0,
+ (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] -
+ _aux[c_w - 1] + 64) >>
+ 7,
+ 255);
+ if ((x << 1 | 1) < dst_c_w) {
+ tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI(
+ 0,
+ (-3 * _aux[x - 1] + 50 * _aux[x] +
+ 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >>
+ 7,
+ 255);
+ }
+ }
+ tmp += dst_c_w;
+ _aux += c_w;
+ }
+ tmp -= tmp_sz;
+ /*Now do the vertical filtering.*/
+ y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h);
+ _dst += dst_c_sz;
+ }
+}
+
+/*Convert 444 to 420jpeg.*/
+static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ unsigned char *tmp;
+ int c_w;
+ int c_h;
+ int c_sz;
+ int dst_c_w;
+ int dst_c_h;
+ int dst_c_sz;
+ int tmp_sz;
+ int pli;
+ int y;
+ int x;
+ /*Skip past the luma data.*/
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ /*Compute the size of each chroma plane.*/
+ c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h;
+ c_h = _y4m->pic_h;
+ dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+ dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+ c_sz = c_w * c_h;
+ dst_c_sz = dst_c_w * dst_c_h;
+ tmp_sz = dst_c_w * c_h;
+ tmp = _aux + 2 * c_sz;
+ for (pli = 1; pli < 3; pli++) {
+ /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
+ for (y = 0; y < c_h; y++) {
+ for (x = 0; x < OC_MINI(c_w, 2); x += 2) {
+ tmp[x >> 1] = OC_CLAMPI(0,
+ (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] -
+ 17 * _aux[OC_MINI(2, c_w - 1)] +
+ 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >>
+ 7,
+ 255);
+ }
+ for (; x < c_w - 3; x += 2) {
+ tmp[x >> 1] = OC_CLAMPI(0,
+ (3 * (_aux[x - 2] + _aux[x + 3]) -
+ 17 * (_aux[x - 1] + _aux[x + 2]) +
+ 78 * (_aux[x] + _aux[x + 1]) + 64) >>
+ 7,
+ 255);
+ }
+ for (; x < c_w; x += 2) {
+ tmp[x >> 1] =
+ OC_CLAMPI(0,
+ (3 * (_aux[x - 2] + _aux[c_w - 1]) -
+ 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) +
+ 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >>
+ 7,
+ 255);
+ }
+ tmp += dst_c_w;
+ _aux += c_w;
+ }
+ tmp -= tmp_sz;
+ /*Now do the vertical filtering.*/
+ y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h);
+ _dst += dst_c_sz;
+ }
+}
+
+/*The image is padded with empty chroma components at 4:2:0.*/
+static void y4m_convert_mono_420jpeg(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ int c_sz;
+ (void)_aux;
+ _dst += _y4m->pic_w * _y4m->pic_h;
+ c_sz = ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
+ ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v);
+ memset(_dst, 128, c_sz * 2);
+}
+
+/*No conversion function needed.*/
+static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_aux) {
+ (void)_y4m;
+ (void)_dst;
+ (void)_aux;
+}
+
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+ int only_420) {
+ char buffer[80] = { 0 };
+ int ret;
+ int i;
+ /*Read until newline, or 80 cols, whichever happens first.*/
+ for (i = 0; i < 79; i++) {
+ if (_nskip > 0) {
+ buffer[i] = *_skip++;
+ _nskip--;
+ } else {
+ if (!file_read(buffer + i, 1, _fin)) return -1;
+ }
+ if (buffer[i] == '\n') break;
+ }
+ /*We skipped too much header data.*/
+ if (_nskip > 0) return -1;
+ if (i == 79) {
+ fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n");
+ return -1;
+ }
+ buffer[i] = '\0';
+ if (memcmp(buffer, "YUV4MPEG", 8)) {
+ fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n");
+ return -1;
+ }
+ if (buffer[8] != '2') {
+ fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n");
+ }
+ ret = y4m_parse_tags(_y4m, buffer + 5);
+ if (ret < 0) {
+ fprintf(stderr, "Error parsing YUV4MPEG2 header.\n");
+ return ret;
+ }
+ if (_y4m->interlace == '?') {
+ fprintf(stderr,
+ "Warning: Input video interlacing format unknown; "
+ "assuming progressive scan.\n");
+ } else if (_y4m->interlace != 'p') {
+ fprintf(stderr,
+ "Input video is interlaced; "
+ "Only progressive scan handled.\n");
+ return -1;
+ }
+ _y4m->aom_fmt = AOM_IMG_FMT_I420;
+ _y4m->bps = 12;
+ _y4m->bit_depth = 8;
+ if (strcmp(_y4m->chroma_type, "420") == 0 ||
+ strcmp(_y4m->chroma_type, "420jpeg") == 0) {
+ _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz =
+ _y4m->pic_w * _y4m->pic_h +
+ 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+ /* Natively supported: no conversion required. */
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ } else if (strcmp(_y4m->chroma_type, "420p10") == 0) {
+ _y4m->src_c_dec_h = 2;
+ _y4m->dst_c_dec_h = 2;
+ _y4m->src_c_dec_v = 2;
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz =
+ 2 * (_y4m->pic_w * _y4m->pic_h +
+ 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+ /* Natively supported: no conversion required. */
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ _y4m->bit_depth = 10;
+ _y4m->bps = 15;
+ _y4m->aom_fmt = AOM_IMG_FMT_I42016;
+ if (only_420) {
+ fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n");
+ return -1;
+ }
+ } else if (strcmp(_y4m->chroma_type, "420p12") == 0) {
+ _y4m->src_c_dec_h = 2;
+ _y4m->dst_c_dec_h = 2;
+ _y4m->src_c_dec_v = 2;
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz =
+ 2 * (_y4m->pic_w * _y4m->pic_h +
+ 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2));
+ /* Natively supported: no conversion required. */
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ _y4m->bit_depth = 12;
+ _y4m->bps = 18;
+ _y4m->aom_fmt = AOM_IMG_FMT_I42016;
+ if (only_420) {
+ fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n");
+ return -1;
+ }
+ } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) {
+ _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+ /*Chroma filter required: read into the aux buf first.*/
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
+ 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+ _y4m->convert = y4m_convert_42xmpeg2_42xjpeg;
+ } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) {
+ _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v =
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+ /*Chroma filter required: read into the aux buf first.
+ We need to make two filter passes, so we need some extra space in the
+ aux buffer.*/
+ _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+ _y4m->aux_buf_read_sz =
+ 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2);
+ _y4m->convert = y4m_convert_42xpaldv_42xjpeg;
+ } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) {
+ _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
+ _y4m->src_c_dec_v = 1;
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+ /*Chroma filter required: read into the aux buf first.*/
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz =
+ 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+ _y4m->convert = y4m_convert_422jpeg_420jpeg;
+ } else if (strcmp(_y4m->chroma_type, "422") == 0) {
+ _y4m->src_c_dec_h = 2;
+ _y4m->src_c_dec_v = 1;
+ if (only_420) {
+ _y4m->dst_c_dec_h = 2;
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+ /*Chroma filter required: read into the aux buf first.
+ We need to make two filter passes, so we need some extra space in the
+ aux buffer.*/
+ _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+ _y4m->aux_buf_sz =
+ _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+ _y4m->convert = y4m_convert_422_420jpeg;
+ } else {
+ _y4m->aom_fmt = AOM_IMG_FMT_I422;
+ _y4m->bps = 16;
+ _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+ _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+ _y4m->dst_buf_read_sz =
+ _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+ /*Natively supported: no conversion required.*/
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ }
+ } else if (strcmp(_y4m->chroma_type, "422p10") == 0) {
+ _y4m->src_c_dec_h = 2;
+ _y4m->src_c_dec_v = 1;
+ _y4m->aom_fmt = AOM_IMG_FMT_I42216;
+ _y4m->bps = 20;
+ _y4m->bit_depth = 10;
+ _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+ _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+ _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+ 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ if (only_420) {
+ fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n");
+ return -1;
+ }
+ } else if (strcmp(_y4m->chroma_type, "422p12") == 0) {
+ _y4m->src_c_dec_h = 2;
+ _y4m->src_c_dec_v = 1;
+ _y4m->aom_fmt = AOM_IMG_FMT_I42216;
+ _y4m->bps = 24;
+ _y4m->bit_depth = 12;
+ _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+ _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+ _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h +
+ 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h);
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ if (only_420) {
+ fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n");
+ return -1;
+ }
+ } else if (strcmp(_y4m->chroma_type, "411") == 0) {
+ _y4m->src_c_dec_h = 4;
+ _y4m->dst_c_dec_h = 2;
+ _y4m->src_c_dec_v = 1;
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+ /*Chroma filter required: read into the aux buf first.
+ We need to make two filter passes, so we need some extra space in the
+ aux buffer.*/
+ _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h;
+ _y4m->aux_buf_sz =
+ _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+ _y4m->convert = y4m_convert_411_420jpeg;
+ } else if (strcmp(_y4m->chroma_type, "444") == 0) {
+ _y4m->src_c_dec_h = 1;
+ _y4m->src_c_dec_v = 1;
+ if (only_420) {
+ _y4m->dst_c_dec_h = 2;
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+ /*Chroma filter required: read into the aux buf first.
+ We need to make two filter passes, so we need some extra space in the
+ aux buffer.*/
+ _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
+ _y4m->aux_buf_sz =
+ _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+ _y4m->convert = y4m_convert_444_420jpeg;
+ } else {
+ _y4m->aom_fmt = AOM_IMG_FMT_I444;
+ _y4m->bps = 24;
+ _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+ _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+ _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+ /*Natively supported: no conversion required.*/
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ }
+ } else if (strcmp(_y4m->chroma_type, "444p10") == 0) {
+ _y4m->src_c_dec_h = 1;
+ _y4m->src_c_dec_v = 1;
+ _y4m->aom_fmt = AOM_IMG_FMT_I44416;
+ _y4m->bps = 30;
+ _y4m->bit_depth = 10;
+ _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+ _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+ _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ if (only_420) {
+ fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n");
+ return -1;
+ }
+ } else if (strcmp(_y4m->chroma_type, "444p12") == 0) {
+ _y4m->src_c_dec_h = 1;
+ _y4m->src_c_dec_v = 1;
+ _y4m->aom_fmt = AOM_IMG_FMT_I44416;
+ _y4m->bps = 36;
+ _y4m->bit_depth = 12;
+ _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+ _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+ _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h;
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ if (only_420) {
+ fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n");
+ return -1;
+ }
+ } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
+ _y4m->src_c_dec_h = 1;
+ _y4m->src_c_dec_v = 1;
+ if (only_420) {
+ _y4m->dst_c_dec_h = 2;
+ _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+ /*Chroma filter required: read into the aux buf first.
+ We need to make two filter passes, so we need some extra space in the
+ aux buffer.
+ The extra plane also gets read into the aux buf.
+ It will be discarded.*/
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+ _y4m->convert = y4m_convert_444_420jpeg;
+ } else {
+ _y4m->aom_fmt = AOM_IMG_FMT_444A;
+ _y4m->bps = 32;
+ _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+ _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+ _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
+ /*Natively supported: no conversion required.*/
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_null;
+ }
+ } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
+ _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
+ _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
+ _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+ /*No extra space required, but we need to clear the chroma planes.*/
+ _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+ _y4m->convert = y4m_convert_mono_420jpeg;
+ } else {
+ fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type);
+ return -1;
+ }
+ /*The size of the final frame buffers is always computed from the
+ destination chroma decimation type.*/
+ _y4m->dst_buf_sz =
+ _y4m->pic_w * _y4m->pic_h +
+ 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) *
+ ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v);
+ if (_y4m->bit_depth == 8)
+ _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz);
+ else
+ _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz);
+
+ if (_y4m->aux_buf_sz > 0)
+ _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz);
+ return 0;
+}
+
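To make the buffer bookkeeping above concrete, here is a standalone worked example (not part of the patch) that evaluates the dst_buf_sz formula for a hypothetical 352x288 4:2:0 stream; the chroma dimensions round up, so odd picture sizes are covered as well.

#include <stdio.h>

int main(void) {
  const int pic_w = 352, pic_h = 288;         /* hypothetical CIF input */
  const int dst_c_dec_h = 2, dst_c_dec_v = 2; /* 4:2:0 destination */
  const int c_w = (pic_w + dst_c_dec_h - 1) / dst_c_dec_h; /* 176 */
  const int c_h = (pic_h + dst_c_dec_v - 1) / dst_c_dec_v; /* 144 */
  const int dst_buf_sz = pic_w * pic_h + 2 * c_w * c_h;
  printf("%d bytes at 8 bits per sample\n", dst_buf_sz); /* 101376 + 50688 = 152064 */
  return 0;
}
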
+void y4m_input_close(y4m_input *_y4m) {
+ free(_y4m->dst_buf);
+ free(_y4m->aux_buf);
+}
+
+int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *_img) {
+ char frame[6];
+ int pic_sz;
+ int c_w;
+ int c_h;
+ int c_sz;
+ int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1;
+ /*Read and skip the frame header.*/
+ if (!file_read(frame, 6, _fin)) return 0;
+ if (memcmp(frame, "FRAME", 5)) {
+ fprintf(stderr, "Loss of framing in Y4M input data\n");
+ return -1;
+ }
+ if (frame[5] != '\n') {
+ char c;
+ int j;
+ for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) {
+ }
+ if (j == 79) {
+ fprintf(stderr, "Error parsing Y4M frame header\n");
+ return -1;
+ }
+ }
+ /*Read the frame data that needs no conversion.*/
+ if (!file_read(_y4m->dst_buf, _y4m->dst_buf_read_sz, _fin)) {
+ fprintf(stderr, "Error reading Y4M frame data.\n");
+ return -1;
+ }
+ /*Read the frame data that does need conversion.*/
+ if (!file_read(_y4m->aux_buf, _y4m->aux_buf_read_sz, _fin)) {
+ fprintf(stderr, "Error reading Y4M frame data.\n");
+ return -1;
+ }
+ /*Now convert the just read frame.*/
+ (*_y4m->convert)(_y4m, _y4m->dst_buf, _y4m->aux_buf);
+ /*Fill in the frame buffer pointers.
+ We don't use aom_img_wrap() because it forces padding for odd picture
+ sizes, which would require a separate fread call for every row.*/
+ memset(_img, 0, sizeof(*_img));
+ /*Y4M has the planes in Y'CbCr order, which libaom calls Y, U, and V.*/
+ _img->fmt = _y4m->aom_fmt;
+ _img->w = _img->d_w = _y4m->pic_w;
+ _img->h = _img->d_h = _y4m->pic_h;
+ _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+ _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+ _img->bps = _y4m->bps;
+
+ /*Set up the buffer pointers.*/
+ pic_sz = _y4m->pic_w * _y4m->pic_h * bytes_per_sample;
+ c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+ c_w *= bytes_per_sample;
+ c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+ c_sz = c_w * c_h;
+ _img->stride[AOM_PLANE_Y] = _img->stride[AOM_PLANE_ALPHA] =
+ _y4m->pic_w * bytes_per_sample;
+ _img->stride[AOM_PLANE_U] = _img->stride[AOM_PLANE_V] = c_w;
+ _img->planes[AOM_PLANE_Y] = _y4m->dst_buf;
+ _img->planes[AOM_PLANE_U] = _y4m->dst_buf + pic_sz;
+ _img->planes[AOM_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+ _img->planes[AOM_PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
+ return 1;
+}
diff --git a/third_party/aom/common/y4minput.h b/third_party/aom/common/y4minput.h
new file mode 100644
index 000000000..01b9ce972
--- /dev/null
+++ b/third_party/aom/common/y4minput.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * Based on code from the OggTheora software codec source code,
+ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+
+#ifndef AOM_COMMON_Y4MINPUT_H_
+#define AOM_COMMON_Y4MINPUT_H_
+
+#include <stdio.h>
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct y4m_input y4m_input;
+
+/*The function used to perform chroma conversion.*/
+typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst,
+ unsigned char *_src);
+
+struct y4m_input {
+ int pic_w;
+ int pic_h;
+ int fps_n;
+ int fps_d;
+ int par_n;
+ int par_d;
+ char interlace;
+ int src_c_dec_h;
+ int src_c_dec_v;
+ int dst_c_dec_h;
+ int dst_c_dec_v;
+ char chroma_type[16];
+ /*The size of each converted frame buffer.*/
+ size_t dst_buf_sz;
+ /*The amount to read directly into the converted frame buffer.*/
+ size_t dst_buf_read_sz;
+ /*The size of the auxiliary buffer.*/
+ size_t aux_buf_sz;
+ /*The amount to read into the auxiliary buffer.*/
+ size_t aux_buf_read_sz;
+ y4m_convert_func convert;
+ unsigned char *dst_buf;
+ unsigned char *aux_buf;
+ enum aom_img_fmt aom_fmt;
+ int bps;
+ unsigned int bit_depth;
+};
+
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+ int only_420);
+void y4m_input_close(y4m_input *_y4m);
+int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *img);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_COMMON_Y4MINPUT_H_
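The header above is the reader's whole public surface: open the stream, fetch frames until the call stops returning 1, then close. A minimal caller might look like the sketch below (not part of the patch; it assumes the "common/..." include path used elsewhere in this tree and does only token error handling). The _skip/_nskip arguments let a caller hand over header bytes it has already consumed, for example while detecting the container type, so parsing can resume mid-header.

#include <stdio.h>
#include "common/y4minput.h"

int main(int argc, char **argv) {
  y4m_input y4m;
  aom_image_t img;
  FILE *fin;
  int frames = 0;
  if (argc < 2) return 1;
  fin = fopen(argv[1], "rb");
  if (!fin) return 1;
  /* No bytes were pre-read, so the skip buffer is empty; only_420 = 0 accepts
     any chroma type the reader supports. */
  if (y4m_input_open(&y4m, fin, NULL, 0, 0) < 0) {
    fclose(fin);
    return 1;
  }
  /* y4m_input_fetch_frame() returns 1 on success, 0 at end of stream, -1 on error. */
  while (y4m_input_fetch_frame(&y4m, fin, &img) > 0) frames++;
  printf("%dx%d, %d frames\n", y4m.pic_w, y4m.pic_h, frames);
  y4m_input_close(&y4m);
  fclose(fin);
  return 0;
}
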
diff --git a/third_party/aom/docs.cmake b/third_party/aom/docs.cmake
new file mode 100644
index 000000000..b5bfa9b56
--- /dev/null
+++ b/third_party/aom/docs.cmake
@@ -0,0 +1,251 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_DOCS_CMAKE_)
+ return()
+endif() # AOM_DOCS_CMAKE_
+set(AOM_DOCS_CMAKE_ 1)
+
+cmake_minimum_required(VERSION 3.5)
+
+set(AOM_DOXYFILE "${AOM_CONFIG_DIR}/doxyfile")
+set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template")
+set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox")
+set(AOM_DOXYGEN_SECTIONS "av1")
+
+set(AOM_DOXYGEN_SOURCES
+ "${AOM_ROOT}/aom/aom.h"
+ "${AOM_ROOT}/aom/aom_codec.h"
+ "${AOM_ROOT}/aom/aom_decoder.h"
+ "${AOM_ROOT}/aom/aom_encoder.h"
+ "${AOM_ROOT}/aom/aom_frame_buffer.h"
+ "${AOM_ROOT}/aom/aom_image.h"
+ "${AOM_ROOT}/aom/aom_integer.h"
+ "${AOM_ROOT}/keywords.dox"
+ "${AOM_ROOT}/mainpage.dox"
+ "${AOM_ROOT}/usage.dox")
+
+if(CONFIG_AV1_DECODER)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/apps/aomdec.c" "${AOM_ROOT}/examples/decode_to_md5.c"
+ "${AOM_ROOT}/examples/decode_with_drops.c"
+ "${AOM_ROOT}/examples/simple_decoder.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Full featured decoder." "Frame by frame MD5 checksum."
+ "Drops frames while decoding." "Simplified decoder loop.")
+
+ set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder")
+
+ set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h"
+ "${AOM_ROOT}/usage_dx.dox")
+
+ if(CONFIG_ANALYZER)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/examples/analyzer.cc")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Bitstream analyzer.")
+ endif()
+
+ if(CONFIG_INSPECTION)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/examples/inspect.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Bitstream inspector.")
+ endif()
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/apps/aomenc.c" "${AOM_ROOT}/examples/lossless_encoder.c"
+ "${AOM_ROOT}/examples/set_maps.c" "${AOM_ROOT}/examples/simple_encoder.c"
+ "${AOM_ROOT}/examples/twopass_encoder.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Full featured encoder." "Simplified lossless encoder."
+ "Set active and ROI maps." "Simplified encoder loop."
+ "Two-pass encoder loop.")
+
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/examples/scalable_encoder.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Scalable encoder loop.")
+
+ set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_encoder encoder")
+
+ set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h"
+ "${AOM_ROOT}/usage_cx.dox")
+endif()
+
+if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/examples/aom_cx_set_ref.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Set encoder reference frame.")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/examples/lightfield_encoder.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Lightfield encoder example.")
+endif()
+
+if(CONFIG_AV1_DECODER)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Lightfield tile list decoder example.")
+endif()
+
+if(CONFIG_AV1_DECODER)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/examples/lightfield_decoder.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Lightfield decoder example.")
+endif()
+
+if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+ set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c")
+
+ set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS}
+ "Lightfield bitstream parsing example.")
+endif()
+
+# Iterates over list named by $list_name and appends each item to $AOM_DOXYFILE
+# as values assigned to $var_name with no line breaks between list items.
+# Appends a new line after the entire config variable is expanded.
+function(write_cmake_list_to_doxygen_config_var var_name list_name)
+ unset(output_string)
+ foreach(list_item ${${list_name}})
+ set(output_string "${output_string} ${list_item} ")
+ endforeach()
+ string(STRIP "${output_string}" output_string)
+ file(APPEND "${AOM_DOXYFILE}" "${var_name} += ${output_string}\n")
+endfunction()
+
+function(get_name file_path name_var)
+ get_filename_component(file_basename ${file_path} NAME)
+ get_filename_component(${name_var} ${file_basename} NAME_WE)
+ set(${name_var} ${${name_var}} PARENT_SCOPE)
+endfunction()
+
+function(setup_documentation_targets)
+
+ # Sanity check: the lengths of these lists must match.
+ list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources)
+ list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs)
+ if(NOT ${num_sources} EQUAL ${num_descs})
+ message(FATAL_ERROR "Unequal example and description totals.")
+ endif()
+
+ # Take the list of examples and produce example_basename.dox for each file in
+ # the list.
+ file(MAKE_DIRECTORY "${AOM_DOXYGEN_OUTPUT_DIR}")
+ foreach(example_file ${AOM_DOXYGEN_EXAMPLE_SOURCES})
+ unset(example_basename)
+ get_name("${example_file}" "example_name")
+ set(example_dox "${AOM_DOXYGEN_OUTPUT_DIR}/${example_name}.dox")
+ set(dox_string "/*!\\page example_${example_name} ${example_name}\n")
+ set(dox_string "${dox_string} \\includelineno ${example_file}\n*/\n")
+ file(WRITE "${example_dox}" ${dox_string})
+ set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${example_dox}")
+ endforeach()
+
+ # Generate samples.dox, an index page that refers to the example_basename.dox
+ # files that were just created.
+ set(
+ samples_header
+ "
+/*!\\page samples Sample Code
+This SDK includes a number of sample applications. Each sample documents a
+feature of the SDK in both prose and the associated C code. The following
+samples are included:
+"
+ )
+
+ set(
+ utils_desc
+ "
+In addition, the SDK contains a number of utilities. Since these utilities are
+built upon the concepts described in the sample code listed above, they are not
+documented in pieces like the samples are. Their source is included here for
+reference. The following utilities are included:
+"
+ )
+
+ # Write the description for the samples section.
+ set(samples_dox "${AOM_CONFIG_DIR}/samples.dox")
+ file(WRITE "${samples_dox}" "${samples_header}\n")
+
+ # Iterate over $AOM_DOXYGEN_EXAMPLE_SOURCES and
+ # $AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS and massage example names as required by
+ # AV1's doxygen setup.
+ math(EXPR max_example_index "${num_sources} - 1")
+ foreach(NUM RANGE ${max_example_index})
+ list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${NUM} ex_name)
+ get_name("${ex_name}" "ex_name")
+
+ # AV1's doxygen lists aomdec and aomenc as utils apart from the examples.
+ # Save the indexes for another pass.
+ if("${ex_name}" MATCHES "aomdec\|aomenc")
+ set(util_indexes "${util_indexes}" "${NUM}")
+ continue()
+ endif()
+ list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${NUM} ex_desc)
+ file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n")
+ endforeach()
+
+ # Write the description and index for the utils.
+ file(APPEND "${samples_dox}" "${utils_desc}\n")
+ foreach(util_index ${util_indexes})
+ list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${util_index} ex_name)
+ get_name("${ex_name}" "ex_name")
+ list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${util_index} ex_desc)
+ file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n")
+ endforeach()
+ file(APPEND "${samples_dox}" "*/")
+
+ # Add $samples_dox to the doxygen inputs.
+ get_filename_component(samples_dox ${samples_dox} NAME)
+ set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} ${samples_dox})
+
+ # Generate libaom's doxyfile.
+ file(WRITE "${AOM_DOXYFILE}" "##\n## GENERATED FILE. DO NOT EDIT\n##\n")
+ file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data)
+ file(APPEND "${AOM_DOXYFILE}" ${doxygen_template_data})
+ file(APPEND "${AOM_DOXYFILE}"
+ "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n")
+ file(APPEND "${AOM_DOXYFILE}"
+ "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n")
+ file(APPEND "${AOM_DOXYFILE}"
+ "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n")
+ write_cmake_list_to_doxygen_config_var("INPUT" "AOM_DOXYGEN_SOURCES")
+ write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS"
+ "AOM_DOXYGEN_SECTIONS")
+
+ # Add the doxygen generation rule.
+ add_custom_target(docs ALL
+ COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}"
+ DEPENDS "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
+ ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_DOXYGEN_CONFIG_TEMPLATE}"
+ SOURCES "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
+ ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+ "${AOM_DOXYGEN_CONFIG_TEMPLATE}")
+endfunction()
diff --git a/third_party/aom/examples/analyzer.cc b/third_party/aom/examples/analyzer.cc
new file mode 100644
index 000000000..6a42eca24
--- /dev/null
+++ b/third_party/aom/examples/analyzer.cc
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <wx/wx.h>
+#include <wx/aboutdlg.h>
+#include <wx/cmdline.h>
+#include <wx/dcbuffer.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/decoder/accounting.h"
+#include "av1/decoder/inspection.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+#define OD_SIGNMASK(a) (-((a) < 0))
+#define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+#define OD_DIV_ROUND(x, y) (((x) + OD_FLIPSIGNI((y) >> 1, x)) / (y))
+
+enum {
+ OD_LUMA_MASK = 1 << 0,
+ OD_CB_MASK = 1 << 1,
+ OD_CR_MASK = 1 << 2,
+ OD_ALL_MASK = OD_LUMA_MASK | OD_CB_MASK | OD_CR_MASK
+};
+
+class AV1Decoder {
+ private:
+ FILE *input;
+ wxString path;
+
+ AvxVideoReader *reader;
+ const AvxVideoInfo *info;
+ const AvxInterface *decoder;
+
+ insp_frame_data frame_data;
+
+ aom_codec_ctx_t codec;
+ bool show_padding;
+
+ public:
+ aom_image_t *image;
+ int frame;
+
+ int plane_mask;
+
+ AV1Decoder();
+ ~AV1Decoder();
+
+ bool open(const wxString &path);
+ void close();
+ bool step();
+
+ int getWidthPadding() const;
+ int getHeightPadding() const;
+ void togglePadding();
+ int getWidth() const;
+ int getHeight() const;
+
+ bool getAccountingStruct(Accounting **acct);
+ bool setInspectionCallback();
+
+ static void inspect(void *decoder, void *data);
+};
+
+AV1Decoder::AV1Decoder()
+ : reader(NULL), info(NULL), decoder(NULL), show_padding(false), image(NULL),
+ frame(0) {}
+
+AV1Decoder::~AV1Decoder() {}
+
+void AV1Decoder::togglePadding() { show_padding = !show_padding; }
+
+bool AV1Decoder::open(const wxString &path) {
+ reader = aom_video_reader_open(path.mb_str());
+ if (!reader) {
+ fprintf(stderr, "Failed to open %s for reading.", path.mb_str().data());
+ return false;
+ }
+ this->path = path;
+ info = aom_video_reader_get_info(reader);
+ decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) {
+ fprintf(stderr, "Unknown input codec.");
+ return false;
+ }
+ printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) {
+ fprintf(stderr, "Failed to initialize decoder.");
+ return false;
+ }
+ ifd_init(&frame_data, info->frame_width, info->frame_height);
+ setInspectionCallback();
+ return true;
+}
+
+void AV1Decoder::close() {}
+
+bool AV1Decoder::step() {
+ if (aom_video_reader_read_frame(reader)) {
+ size_t frame_size;
+ const unsigned char *frame_data;
+ frame_data = aom_video_reader_get_frame(reader, &frame_size);
+ if (aom_codec_decode(&codec, frame_data, frame_size, NULL)) {
+ fprintf(stderr, "Failed to decode frame.");
+ return false;
+ } else {
+ aom_codec_iter_t iter = NULL;
+ image = aom_codec_get_frame(&codec, &iter);
+ if (image != NULL) {
+ frame++;
+ return true;
+ }
+ return false;
+ }
+ }
+ return false;
+}
+
+int AV1Decoder::getWidth() const {
+ return info->frame_width + 2 * getWidthPadding();
+}
+
+int AV1Decoder::getWidthPadding() const {
+ return show_padding ? AOMMAX(info->frame_width + 16,
+ ALIGN_POWER_OF_TWO(info->frame_width, 6)) -
+ info->frame_width
+ : 0;
+}
+
+int AV1Decoder::getHeight() const {
+ return info->frame_height + 2 * getHeightPadding();
+}
+
+int AV1Decoder::getHeightPadding() const {
+ return show_padding ? AOMMAX(info->frame_height + 16,
+ ALIGN_POWER_OF_TWO(info->frame_height, 6)) -
+ info->frame_height
+ : 0;
+}
+
+bool AV1Decoder::getAccountingStruct(Accounting **accounting) {
+ return aom_codec_control(&codec, AV1_GET_ACCOUNTING, accounting) ==
+ AOM_CODEC_OK;
+}
+
+bool AV1Decoder::setInspectionCallback() {
+ aom_inspect_init ii;
+ ii.inspect_cb = AV1Decoder::inspect;
+ ii.inspect_ctx = (void *)this;
+ return aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii) ==
+ AOM_CODEC_OK;
+}
+
+void AV1Decoder::inspect(void *pbi, void *data) {
+ AV1Decoder *decoder = (AV1Decoder *)data;
+ ifd_inspect(&decoder->frame_data, pbi);
+}
+
+#define MIN_ZOOM (1)
+#define MAX_ZOOM (4)
+
+class AnalyzerPanel : public wxPanel {
+ DECLARE_EVENT_TABLE()
+
+ private:
+ AV1Decoder decoder;
+ const wxString path;
+
+ int zoom;
+ unsigned char *pixels;
+
+ const bool bit_accounting;
+ double *bpp_q3;
+
+ int plane_mask;
+
+ // The display size is the decode size, scaled by the zoom.
+ int getDisplayWidth() const;
+ int getDisplayHeight() const;
+
+ bool updateDisplaySize();
+
+ void computeBitsPerPixel();
+
+ public:
+ AnalyzerPanel(wxWindow *parent, const wxString &path,
+ const bool bit_accounting);
+ ~AnalyzerPanel();
+
+ bool open(const wxString &path);
+ void close();
+ void render();
+ void togglePadding();
+ bool nextFrame();
+ void refresh();
+
+ int getZoom() const;
+ bool setZoom(int zoom);
+
+ void setShowPlane(bool show_plane, int mask);
+
+ void onPaint(wxPaintEvent &event); // NOLINT
+};
+
+BEGIN_EVENT_TABLE(AnalyzerPanel, wxPanel)
+EVT_PAINT(AnalyzerPanel::onPaint)
+END_EVENT_TABLE()
+
+AnalyzerPanel::AnalyzerPanel(wxWindow *parent, const wxString &path,
+ const bool bit_accounting)
+ : wxPanel(parent), path(path), zoom(0), pixels(NULL),
+ bit_accounting(bit_accounting), bpp_q3(NULL), plane_mask(OD_ALL_MASK) {}
+
+AnalyzerPanel::~AnalyzerPanel() { close(); }
+
+void AnalyzerPanel::setShowPlane(bool show_plane, int mask) {
+ if (show_plane) {
+ plane_mask |= mask;
+ } else {
+ plane_mask &= ~mask;
+ }
+}
+
+void AnalyzerPanel::render() {
+ aom_image_t *img = decoder.image;
+ const int hbd = !!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
+ int y_stride = img->stride[0] >> hbd;
+ int cb_stride = img->stride[1] >> hbd;
+ int cr_stride = img->stride[2] >> hbd;
+ int p_stride = 3 * getDisplayWidth();
+ unsigned char *y_row = img->planes[0];
+ unsigned char *cb_row = img->planes[1];
+ unsigned char *cr_row = img->planes[2];
+ uint16_t *y_row16 = reinterpret_cast<uint16_t *>(y_row);
+ uint16_t *cb_row16 = reinterpret_cast<uint16_t *>(cb_row);
+ uint16_t *cr_row16 = reinterpret_cast<uint16_t *>(cr_row);
+ unsigned char *p_row = pixels;
+ int y_width_padding = decoder.getWidthPadding();
+ int cb_width_padding = y_width_padding >> 1;
+ int cr_width_padding = y_width_padding >> 1;
+ int y_height_padding = decoder.getHeightPadding();
+ int cb_height_padding = y_height_padding >> 1;
+ int cr_height_padding = y_height_padding >> 1;
+ for (int j = 0; j < decoder.getHeight(); j++) {
+ unsigned char *y = y_row - y_stride * y_height_padding;
+ unsigned char *cb = cb_row - cb_stride * cb_height_padding;
+ unsigned char *cr = cr_row - cr_stride * cr_height_padding;
+ uint16_t *y16 = y_row16 - y_stride * y_height_padding;
+ uint16_t *cb16 = cb_row16 - cb_stride * cb_height_padding;
+ uint16_t *cr16 = cr_row16 - cr_stride * cr_height_padding;
+ unsigned char *p = p_row;
+ for (int i = 0; i < decoder.getWidth(); i++) {
+ int64_t yval;
+ int64_t cbval;
+ int64_t crval;
+ int pmask;
+ unsigned rval;
+ unsigned gval;
+ unsigned bval;
+ if (hbd) {
+ yval = *(y16 - y_width_padding);
+ cbval = *(cb16 - cb_width_padding);
+ crval = *(cr16 - cr_width_padding);
+ } else {
+ yval = *(y - y_width_padding);
+ cbval = *(cb - cb_width_padding);
+ crval = *(cr - cr_width_padding);
+ }
+ pmask = plane_mask;
+ if (pmask & OD_LUMA_MASK) {
+ yval -= 16;
+ } else {
+ yval = 128;
+ }
+ cbval = ((pmask & OD_CB_MASK) >> 1) * (cbval - 128);
+ crval = ((pmask & OD_CR_MASK) >> 2) * (crval - 128);
+ /*This is intentionally slow and very accurate.*/
+ rval = OD_CLAMPI(
+ 0,
+ (int32_t)OD_DIV_ROUND(
+ 2916394880000LL * yval + 4490222169144LL * crval, 9745792000LL),
+ 65535);
+ gval = OD_CLAMPI(0,
+ (int32_t)OD_DIV_ROUND(2916394880000LL * yval -
+ 534117096223LL * cbval -
+ 1334761232047LL * crval,
+ 9745792000LL),
+ 65535);
+ bval = OD_CLAMPI(
+ 0,
+ (int32_t)OD_DIV_ROUND(
+ 2916394880000LL * yval + 5290866304968LL * cbval, 9745792000LL),
+ 65535);
+ unsigned char *px_row = p;
+ for (int v = 0; v < zoom; v++) {
+ unsigned char *px = px_row;
+ for (int u = 0; u < zoom; u++) {
+ *(px + 0) = (unsigned char)(rval >> 8);
+ *(px + 1) = (unsigned char)(gval >> 8);
+ *(px + 2) = (unsigned char)(bval >> 8);
+ px += 3;
+ }
+ px_row += p_stride;
+ }
+ if (hbd) {
+ int dc = ((y16 - y_row16) & 1) | (1 - img->x_chroma_shift);
+ y16++;
+ cb16 += dc;
+ cr16 += dc;
+ } else {
+ int dc = ((y - y_row) & 1) | (1 - img->x_chroma_shift);
+ y++;
+ cb += dc;
+ cr += dc;
+ }
+ p += zoom * 3;
+ }
+ int dc = -((j & 1) | (1 - img->y_chroma_shift));
+ if (hbd) {
+ y_row16 += y_stride;
+ cb_row16 += dc & cb_stride;
+ cr_row16 += dc & cr_stride;
+ } else {
+ y_row += y_stride;
+ cb_row += dc & cb_stride;
+ cr_row += dc & cr_stride;
+ }
+ p_row += zoom * p_stride;
+ }
+}
+
+void AnalyzerPanel::computeBitsPerPixel() {
+ Accounting *acct;
+ double bpp_total;
+ int totals_q3[MAX_SYMBOL_TYPES] = { 0 };
+ int sym_count[MAX_SYMBOL_TYPES] = { 0 };
+ decoder.getAccountingStruct(&acct);
+ for (int j = 0; j < decoder.getHeight(); j++) {
+ for (int i = 0; i < decoder.getWidth(); i++) {
+ bpp_q3[j * decoder.getWidth() + i] = 0.0;
+ }
+ }
+ bpp_total = 0;
+ for (int i = 0; i < acct->syms.num_syms; i++) {
+ AccountingSymbol *s;
+ s = &acct->syms.syms[i];
+ totals_q3[s->id] += s->bits;
+ sym_count[s->id] += s->samples;
+ }
+ printf("=== Frame: %-3i ===\n", decoder.frame - 1);
+ for (int i = 0; i < acct->syms.dictionary.num_strs; i++) {
+ if (totals_q3[i]) {
+ printf("%30s = %10.3f (%f bit/symbol)\n", acct->syms.dictionary.strs[i],
+ (float)totals_q3[i] / 8, (float)totals_q3[i] / 8 / sym_count[i]);
+ }
+ }
+ printf("\n");
+}
+
+void AnalyzerPanel::togglePadding() {
+ decoder.togglePadding();
+ updateDisplaySize();
+}
+
+bool AnalyzerPanel::nextFrame() {
+ if (decoder.step()) {
+ refresh();
+ return true;
+ }
+ return false;
+}
+
+void AnalyzerPanel::refresh() {
+ if (bit_accounting) {
+ computeBitsPerPixel();
+ }
+ render();
+}
+
+int AnalyzerPanel::getDisplayWidth() const { return zoom * decoder.getWidth(); }
+
+int AnalyzerPanel::getDisplayHeight() const {
+ return zoom * decoder.getHeight();
+}
+
+bool AnalyzerPanel::updateDisplaySize() {
+ unsigned char *p = (unsigned char *)malloc(
+ sizeof(*p) * 3 * getDisplayWidth() * getDisplayHeight());
+ if (p == NULL) {
+ return false;
+ }
+ free(pixels);
+ pixels = p;
+ SetSize(getDisplayWidth(), getDisplayHeight());
+ return true;
+}
+
+bool AnalyzerPanel::open(const wxString &path) {
+ if (!decoder.open(path)) {
+ return false;
+ }
+ if (!setZoom(MIN_ZOOM)) {
+ return false;
+ }
+ if (bit_accounting) {
+ bpp_q3 = (double *)malloc(sizeof(*bpp_q3) * decoder.getWidth() *
+ decoder.getHeight());
+ if (bpp_q3 == NULL) {
+ fprintf(stderr, "Could not allocate memory for bit accounting\n");
+ close();
+ return false;
+ }
+ }
+ if (!nextFrame()) {
+ close();
+ return false;
+ }
+ SetFocus();
+ return true;
+}
+
+void AnalyzerPanel::close() {
+ decoder.close();
+ free(pixels);
+ pixels = NULL;
+ free(bpp_q3);
+ bpp_q3 = NULL;
+}
+
+int AnalyzerPanel::getZoom() const { return zoom; }
+
+bool AnalyzerPanel::setZoom(int z) {
+ if (z <= MAX_ZOOM && z >= MIN_ZOOM && zoom != z) {
+ int old_zoom = zoom;
+ zoom = z;
+ if (!updateDisplaySize()) {
+ zoom = old_zoom;
+ return false;
+ }
+ return true;
+ }
+ return false;
+}
+
+void AnalyzerPanel::onPaint(wxPaintEvent &) {
+ wxBitmap bmp(wxImage(getDisplayWidth(), getDisplayHeight(), pixels, true));
+ wxBufferedPaintDC dc(this, bmp);
+}
+
+class AnalyzerFrame : public wxFrame {
+ DECLARE_EVENT_TABLE()
+
+ private:
+ AnalyzerPanel *panel;
+ const bool bit_accounting;
+
+ wxMenu *fileMenu;
+ wxMenu *viewMenu;
+ wxMenu *playbackMenu;
+
+ public:
+ AnalyzerFrame(const bool bit_accounting); // NOLINT
+
+ void onOpen(wxCommandEvent &event); // NOLINT
+ void onClose(wxCommandEvent &event); // NOLINT
+ void onQuit(wxCommandEvent &event); // NOLINT
+
+ void onTogglePadding(wxCommandEvent &event); // NOLINT
+ void onZoomIn(wxCommandEvent &event); // NOLINT
+ void onZoomOut(wxCommandEvent &event); // NOLINT
+ void onActualSize(wxCommandEvent &event); // NOLINT
+
+ void onToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT
+ void onResetAndToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT
+
+ void onNextFrame(wxCommandEvent &event); // NOLINT
+ void onGotoFrame(wxCommandEvent &event); // NOLINT
+ void onRestart(wxCommandEvent &event); // NOLINT
+
+ void onAbout(wxCommandEvent &event); // NOLINT
+
+ bool open(const wxString &path);
+ bool setZoom(int zoom);
+ void updateViewMenu();
+};
+
+enum {
+ wxID_NEXT_FRAME = 6000,
+ wxID_SHOW_Y,
+ wxID_SHOW_U,
+ wxID_SHOW_V,
+ wxID_GOTO_FRAME,
+ wxID_RESTART,
+ wxID_ACTUAL_SIZE,
+ wxID_PADDING
+};
+
+BEGIN_EVENT_TABLE(AnalyzerFrame, wxFrame)
+EVT_MENU(wxID_OPEN, AnalyzerFrame::onOpen)
+EVT_MENU(wxID_CLOSE, AnalyzerFrame::onClose)
+EVT_MENU(wxID_EXIT, AnalyzerFrame::onQuit)
+EVT_MENU(wxID_PADDING, AnalyzerFrame::onTogglePadding)
+EVT_MENU(wxID_ZOOM_IN, AnalyzerFrame::onZoomIn)
+EVT_MENU(wxID_ZOOM_OUT, AnalyzerFrame::onZoomOut)
+EVT_MENU(wxID_ACTUAL_SIZE, AnalyzerFrame::onActualSize)
+EVT_MENU(wxID_SHOW_Y, AnalyzerFrame::onResetAndToggleViewMenuCheckBox)
+EVT_MENU(wxID_SHOW_U, AnalyzerFrame::onResetAndToggleViewMenuCheckBox)
+EVT_MENU(wxID_SHOW_V, AnalyzerFrame::onResetAndToggleViewMenuCheckBox)
+EVT_MENU(wxID_NEXT_FRAME, AnalyzerFrame::onNextFrame)
+EVT_MENU(wxID_GOTO_FRAME, AnalyzerFrame::onGotoFrame)
+EVT_MENU(wxID_RESTART, AnalyzerFrame::onRestart)
+EVT_MENU(wxID_ABOUT, AnalyzerFrame::onAbout)
+END_EVENT_TABLE()
+
+AnalyzerFrame::AnalyzerFrame(const bool bit_accounting)
+ : wxFrame(NULL, wxID_ANY, _("AV1 Stream Analyzer"), wxDefaultPosition,
+ wxDefaultSize, wxDEFAULT_FRAME_STYLE),
+ panel(NULL), bit_accounting(bit_accounting) {
+ wxMenuBar *mb = new wxMenuBar();
+
+ fileMenu = new wxMenu();
+ fileMenu->Append(wxID_OPEN, _("&Open...\tCtrl-O"), _("Open daala file"));
+ fileMenu->Append(wxID_CLOSE, _("&Close\tCtrl-W"), _("Close daala file"));
+ fileMenu->Enable(wxID_CLOSE, false);
+ fileMenu->Append(wxID_EXIT, _("E&xit\tCtrl-Q"), _("Quit this program"));
+ mb->Append(fileMenu, _("&File"));
+
+ wxAcceleratorEntry entries[2];
+ entries[0].Set(wxACCEL_CTRL, (int)'=', wxID_ZOOM_IN);
+ entries[1].Set(wxACCEL_CTRL | wxACCEL_SHIFT, (int)'-', wxID_ZOOM_OUT);
+ wxAcceleratorTable accel(2, entries);
+ this->SetAcceleratorTable(accel);
+
+ viewMenu = new wxMenu();
+ viewMenu->Append(wxID_PADDING, _("Toggle padding\tCtrl-p"),
+ _("Show padding"));
+ viewMenu->Append(wxID_ZOOM_IN, _("Zoom-In\tCtrl-+"), _("Double image size"));
+ viewMenu->Append(wxID_ZOOM_OUT, _("Zoom-Out\tCtrl--"), _("Half image size"));
+ viewMenu->Append(wxID_ACTUAL_SIZE, _("Actual size\tCtrl-0"),
+ _("Actual size of the frame"));
+ viewMenu->AppendSeparator();
+ viewMenu->AppendCheckItem(wxID_SHOW_Y, _("&Y plane\tCtrl-Y"),
+ _("Show Y plane"));
+ viewMenu->AppendCheckItem(wxID_SHOW_U, _("&U plane\tCtrl-U"),
+ _("Show U plane"));
+ viewMenu->AppendCheckItem(wxID_SHOW_V, _("&V plane\tCtrl-V"),
+ _("Show V plane"));
+ mb->Append(viewMenu, _("&View"));
+
+ playbackMenu = new wxMenu();
+ playbackMenu->Append(wxID_NEXT_FRAME, _("Next frame\tCtrl-."),
+ _("Go to next frame"));
+ /*playbackMenu->Append(wxID_RESTART, _("&Restart\tCtrl-R"),
+ _("Set video to frame 0"));
+ playbackMenu->Append(wxID_GOTO_FRAME, _("Jump to Frame\tCtrl-J"),
+ _("Go to frame number"));*/
+ mb->Append(playbackMenu, _("&Playback"));
+
+ wxMenu *helpMenu = new wxMenu();
+ helpMenu->Append(wxID_ABOUT, _("&About...\tF1"), _("Show about dialog"));
+ mb->Append(helpMenu, _("&Help"));
+
+ SetMenuBar(mb);
+
+ CreateStatusBar(1);
+}
+
+void AnalyzerFrame::onOpen(wxCommandEvent &WXUNUSED(event)) {
+ wxFileDialog openFileDialog(this, _("Open file"), wxEmptyString,
+ wxEmptyString, _("AV1 files (*.ivf)|*.ivf"),
+ wxFD_OPEN | wxFD_FILE_MUST_EXIST);
+ if (openFileDialog.ShowModal() != wxID_CANCEL) {
+ open(openFileDialog.GetPath());
+ }
+}
+
+void AnalyzerFrame::onClose(wxCommandEvent &WXUNUSED(event)) {}
+
+void AnalyzerFrame::onQuit(wxCommandEvent &WXUNUSED(event)) { Close(true); }
+
+void AnalyzerFrame::onTogglePadding(wxCommandEvent &WXUNUSED(event)) {
+ panel->togglePadding();
+ SetClientSize(panel->GetSize());
+ panel->render();
+ panel->Refresh();
+}
+
+void AnalyzerFrame::onZoomIn(wxCommandEvent &WXUNUSED(event)) {
+ setZoom(panel->getZoom() + 1);
+}
+
+void AnalyzerFrame::onZoomOut(wxCommandEvent &WXUNUSED(event)) {
+ setZoom(panel->getZoom() - 1);
+}
+
+void AnalyzerFrame::onActualSize(wxCommandEvent &WXUNUSED(event)) {
+ setZoom(MIN_ZOOM);
+}
+
+void AnalyzerFrame::onToggleViewMenuCheckBox(wxCommandEvent &event) { // NOLINT
+ GetMenuBar()->Check(event.GetId(), event.IsChecked());
+ updateViewMenu();
+}
+
+void AnalyzerFrame::onResetAndToggleViewMenuCheckBox(
+ wxCommandEvent &event) { // NOLINT
+ int id = event.GetId();
+ if (id != wxID_SHOW_Y && id != wxID_SHOW_U && id != wxID_SHOW_V) {
+ GetMenuBar()->Check(wxID_SHOW_Y, true);
+ GetMenuBar()->Check(wxID_SHOW_U, true);
+ GetMenuBar()->Check(wxID_SHOW_V, true);
+ }
+ onToggleViewMenuCheckBox(event);
+}
+
+void AnalyzerFrame::onNextFrame(wxCommandEvent &WXUNUSED(event)) {
+ panel->nextFrame();
+ panel->Refresh(false);
+}
+
+void AnalyzerFrame::onGotoFrame(wxCommandEvent &WXUNUSED(event)) {}
+
+void AnalyzerFrame::onRestart(wxCommandEvent &WXUNUSED(event)) {}
+
+void AnalyzerFrame::onAbout(wxCommandEvent &WXUNUSED(event)) {
+ wxAboutDialogInfo info;
+ info.SetName(_("AV1 Bitstream Analyzer"));
+ info.SetVersion(_("0.1-beta"));
+ info.SetDescription(
+ _("This program implements a bitstream analyzer for AV1"));
+ info.SetCopyright(
+ wxT("(C) 2017 Alliance for Open Media <negge@mozilla.com>"));
+ wxAboutBox(info);
+}
+
+bool AnalyzerFrame::open(const wxString &path) {
+ panel = new AnalyzerPanel(this, path, bit_accounting);
+ if (panel->open(path)) {
+ SetClientSize(panel->GetSize());
+ return true;
+ } else {
+ delete panel;
+ return false;
+ }
+}
+
+bool AnalyzerFrame::setZoom(int zoom) {
+ if (panel->setZoom(zoom)) {
+ GetMenuBar()->Enable(wxID_ACTUAL_SIZE, zoom != MIN_ZOOM);
+ GetMenuBar()->Enable(wxID_ZOOM_IN, zoom != MAX_ZOOM);
+ GetMenuBar()->Enable(wxID_ZOOM_OUT, zoom != MIN_ZOOM);
+ SetClientSize(panel->GetSize());
+ panel->render();
+ panel->Refresh();
+ return true;
+ }
+ return false;
+}
+
+void AnalyzerFrame::updateViewMenu() {
+ panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_Y), OD_LUMA_MASK);
+ panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_U), OD_CB_MASK);
+ panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_V), OD_CR_MASK);
+ SetClientSize(panel->GetSize());
+ panel->render();
+ panel->Refresh(false);
+}
+
+class Analyzer : public wxApp {
+ private:
+ AnalyzerFrame *frame;
+
+ public:
+ void OnInitCmdLine(wxCmdLineParser &parser); // NOLINT
+ bool OnCmdLineParsed(wxCmdLineParser &parser); // NOLINT
+};
+
+static const wxCmdLineEntryDesc CMD_LINE_DESC[] = {
+ { wxCMD_LINE_SWITCH, _("h"), _("help"), _("Display this help and exit."),
+ wxCMD_LINE_VAL_NONE, wxCMD_LINE_OPTION_HELP },
+ { wxCMD_LINE_SWITCH, _("a"), _("bit-accounting"), _("Enable bit accounting"),
+ wxCMD_LINE_VAL_NONE, wxCMD_LINE_PARAM_OPTIONAL },
+ { wxCMD_LINE_PARAM, NULL, NULL, _("input.ivf"), wxCMD_LINE_VAL_STRING,
+ wxCMD_LINE_PARAM_OPTIONAL },
+ { wxCMD_LINE_NONE }
+};
+
+void Analyzer::OnInitCmdLine(wxCmdLineParser &parser) { // NOLINT
+ parser.SetDesc(CMD_LINE_DESC);
+ parser.SetSwitchChars(_("-"));
+}
+
+bool Analyzer::OnCmdLineParsed(wxCmdLineParser &parser) { // NOLINT
+ bool bit_accounting = parser.Found(_("a"));
+ if (bit_accounting && !CONFIG_ACCOUNTING) {
+ fprintf(stderr,
+ "Bit accounting support not found. "
+            "Recompile with:\n  cmake . -DCONFIG_ACCOUNTING=1\n");
+ return false;
+ }
+ frame = new AnalyzerFrame(parser.Found(_("a")));
+ frame->Show();
+ if (parser.GetParamCount() > 0) {
+ return frame->open(parser.GetParam(0));
+ }
+ return true;
+}
+
+void usage_exit(void) {
+  fprintf(stderr, "Usage: analyzer [-a] [-h] [input.ivf]\n");
+ exit(EXIT_FAILURE);
+}
+
+IMPLEMENT_APP(Analyzer)
diff --git a/third_party/aom/examples/aom_cx_set_ref.c b/third_party/aom/examples/aom_cx_set_ref.c
new file mode 100644
index 000000000..8e3d216fe
--- /dev/null
+++ b/third_party/aom/examples/aom_cx_set_ref.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// AV1 Set Reference Frame
+// ============================
+//
+// This is an example demonstrating how to overwrite the AV1 encoder's
+// internal reference frame. In the sample we set the last frame to the
+// current frame. This technique could be used to bounce between two cameras.
+//
+// The decoder would also have to set the reference frame to the same value
+// on the same frame, or the video will become corrupt. The 'test_decode'
+// variable is set to 1 in this example so that the encoder and decoder
+// results are checked for a match.
+//
+// Usage
+// -----
+// This example encodes a raw video. The sixth argument specifies the frame
+// number at which to overwrite the reference frame, and an optional seventh
+// argument limits how many frames are encoded. For example, run
+// examples/aom_cx_set_ref av1 352 288 in.yuv out.ivf 4 30
+// to encode a 352x288 video, updating the reference frame at frame 4 and
+// stopping after 30 frames.
+//
+// Extra Variables
+// ---------------
+// This example maintains the frame number passed on the command line
+// in the `update_frame_num` variable.
+//
+//
+// Configuration
+// -------------
+//
+// The reference frame is updated on the frame specified on the command
+// line.
+//
+// Observing The Effects
+// ---------------------
+// The encoder and decoder results should match when the same reference frame
+// setting operation is done in both the encoder and the decoder. Otherwise,
+// an encoder/decoder mismatch will be seen.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "aom_scale/yv12config.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+#include "examples/encoder_util.h"
+
+static const char *exec_name;
+
+void usage_exit() {
+ fprintf(stderr,
+ "Usage: %s <codec> <width> <height> <infile> <outfile> "
+ "<frame> <limit(optional)>\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
+ unsigned int frame_out, int *mismatch_seen) {
+ aom_image_t enc_img, dec_img;
+
+ if (*mismatch_seen) return;
+
+ /* Get the internal reference frame */
+ if (aom_codec_control(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img))
+ die_codec(encoder, "Failed to get encoder reference frame");
+ if (aom_codec_control(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img))
+ die_codec(decoder, "Failed to get decoder reference frame");
+
+ if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
+ (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+ if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_image_t enc_hbd_img;
+ aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+ enc_img.d_w, enc_img.d_h, 16);
+ aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img);
+ enc_img = enc_hbd_img;
+ }
+ if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_image_t dec_hbd_img;
+ aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+ dec_img.d_w, dec_img.d_h, 16);
+ aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img);
+ dec_img = dec_hbd_img;
+ }
+ }
+
+ if (!aom_compare_img(&enc_img, &dec_img)) {
+ int y[4], u[4], v[4];
+ if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_find_mismatch_high(&enc_img, &dec_img, y, u, v);
+ } else {
+ aom_find_mismatch(&enc_img, &dec_img, y, u, v);
+ }
+
+ printf(
+ "Encode/decode mismatch on frame %d at"
+ " Y[%d, %d] {%d/%d},"
+ " U[%d, %d] {%d/%d},"
+ " V[%d, %d] {%d/%d}",
+ frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1],
+ v[2], v[3]);
+ *mismatch_seen = 1;
+ }
+
+ aom_img_free(&enc_img);
+ aom_img_free(&dec_img);
+}
+
+static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img,
+ unsigned int frame_in, AvxVideoWriter *writer,
+ int test_decode, aom_codec_ctx_t *dcodec,
+ unsigned int *frame_out, int *mismatch_seen,
+ aom_image_t *ext_ref) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ int got_data;
+ const aom_codec_err_t res = aom_codec_encode(ecodec, img, frame_in, 1, 0);
+ if (res != AOM_CODEC_OK) die_codec(ecodec, "Failed to encode frame");
+
+ got_data = 0;
+
+ while ((pkt = aom_codec_get_cx_data(ecodec, &iter)) != NULL) {
+ got_pkts = 1;
+
+ if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+ if (!(pkt->data.frame.flags & AOM_FRAME_IS_FRAGMENT)) {
+ *frame_out += 1;
+ }
+
+ if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+ pkt->data.frame.sz,
+ pkt->data.frame.pts)) {
+ die_codec(ecodec, "Failed to write compressed frame");
+ }
+ printf(keyframe ? "K" : ".");
+ fflush(stdout);
+ got_data = 1;
+
+ // Decode 1 frame.
+ if (test_decode) {
+ if (aom_codec_decode(dcodec, pkt->data.frame.buf,
+ (unsigned int)pkt->data.frame.sz, NULL))
+ die_codec(dcodec, "Failed to decode frame.");
+
+ // Copy out first decoded frame, and use it as reference later.
+ if (*frame_out == 1 && ext_ref != NULL)
+ if (aom_codec_control(dcodec, AV1_COPY_NEW_FRAME_IMAGE, ext_ref))
+ die_codec(dcodec, "Failed to get decoder new frame");
+ }
+ }
+ }
+
+ // Mismatch checking
+ if (got_data && test_decode) {
+ testing_decode(ecodec, dcodec, *frame_out, mismatch_seen);
+ }
+
+ return got_pkts;
+}
+
+int main(int argc, char **argv) {
+ FILE *infile = NULL;
+ // Encoder
+ aom_codec_ctx_t ecodec;
+ aom_codec_enc_cfg_t cfg;
+ unsigned int frame_in = 0;
+ aom_image_t raw;
+ aom_image_t raw_shift;
+ aom_image_t ext_ref;
+ aom_codec_err_t res;
+ AvxVideoInfo info;
+ AvxVideoWriter *writer = NULL;
+ const AvxInterface *encoder = NULL;
+ int flags = 0;
+ int allocated_raw_shift = 0;
+ aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420;
+ aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
+
+ // Test encoder/decoder mismatch.
+ int test_decode = 1;
+ // Decoder
+ aom_codec_ctx_t dcodec;
+ unsigned int frame_out = 0;
+
+ // The frame number to set reference frame on
+ unsigned int update_frame_num = 0;
+ int mismatch_seen = 0;
+
+ const int fps = 30;
+ const int bitrate = 500;
+
+ const char *codec_arg = NULL;
+ const char *width_arg = NULL;
+ const char *height_arg = NULL;
+ const char *infile_arg = NULL;
+ const char *outfile_arg = NULL;
+ const char *update_frame_num_arg = NULL;
+ unsigned int limit = 0;
+ exec_name = argv[0];
+
+  // Clear explicitly, as simply assigning "{ 0 }" generates a
+  // "missing-field-initializers" warning in some compilers.
+ memset(&ecodec, 0, sizeof(ecodec));
+ memset(&cfg, 0, sizeof(cfg));
+ memset(&info, 0, sizeof(info));
+
+ if (argc < 7) die("Invalid number of arguments");
+
+ codec_arg = argv[1];
+ width_arg = argv[2];
+ height_arg = argv[3];
+ infile_arg = argv[4];
+ outfile_arg = argv[5];
+ update_frame_num_arg = argv[6];
+
+ encoder = get_aom_encoder_by_name(codec_arg);
+ if (!encoder) die("Unsupported codec.");
+
+ update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0);
+ // In AV1, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are
+ // allocated while calling aom_codec_encode(), thus, setting reference for
+ // 1st frame isn't supported.
+ if (update_frame_num <= 1) {
+    die("Frame number '%s' must parse to an integer greater than 1.\n",
+        update_frame_num_arg);
+ }
+
+ if (argc > 7) {
+ limit = (unsigned int)strtoul(argv[7], NULL, 0);
+ if (update_frame_num > limit)
+      die("Update frame number can't be larger than the limit.\n");
+ }
+
+ info.codec_fourcc = encoder->fourcc;
+ info.frame_width = (int)strtol(width_arg, NULL, 0);
+ info.frame_height = (int)strtol(height_arg, NULL, 0);
+ info.time_base.numerator = 1;
+ info.time_base.denominator = fps;
+
+ if (info.frame_width <= 0 || info.frame_height <= 0) {
+ die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+ }
+
+ // In this test, the bit depth of input video is 8-bit, and the input format
+ // is AOM_IMG_FMT_I420.
+ if (!aom_img_alloc(&raw, raw_fmt, info.frame_width, info.frame_height, 32)) {
+ die("Failed to allocate image.");
+ }
+
+ if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+ // Allocate memory with the border so that it can be used as a reference.
+ if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width,
+ info.frame_height, 32, 8,
+ AOM_BORDER_IN_PIXELS)) {
+ die("Failed to allocate image.");
+ }
+
+ printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+ res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+ if (res) die_codec(&ecodec, "Failed to get default codec config.");
+
+ cfg.g_w = info.frame_width;
+ cfg.g_h = info.frame_height;
+ cfg.g_timebase.num = info.time_base.numerator;
+ cfg.g_timebase.den = info.time_base.denominator;
+ cfg.rc_target_bitrate = bitrate;
+ cfg.g_lag_in_frames = 3;
+ cfg.g_bit_depth = AOM_BITS_8;
+
+ flags |= (cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH)
+ ? AOM_CODEC_USE_HIGHBITDEPTH
+ : 0;
+
+ writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info);
+ if (!writer) die("Failed to open %s for writing.", outfile_arg);
+
+ if (!(infile = fopen(infile_arg, "rb")))
+ die("Failed to open %s for reading.", infile_arg);
+
+ if (aom_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, flags))
+ die_codec(&ecodec, "Failed to initialize encoder");
+
+ // Disable alt_ref.
+ if (aom_codec_control(&ecodec, AOME_SET_ENABLEAUTOALTREF, 0))
+ die_codec(&ecodec, "Failed to set enable auto alt ref");
+
+ if (test_decode) {
+ const AvxInterface *decoder = get_aom_decoder_by_name(codec_arg);
+ if (aom_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0))
+ die_codec(&dcodec, "Failed to initialize decoder.");
+ }
+
+ // Encode frames.
+ while (aom_img_read(&raw, infile)) {
+ if (limit && frame_in >= limit) break;
+ aom_image_t *frame_to_encode;
+
+ if (!CONFIG_LOWBITDEPTH) {
+ // Need to allocate larger buffer to use hbd internal.
+ int input_shift = 0;
+ if (!allocated_raw_shift) {
+ aom_img_alloc(&raw_shift, raw_fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+ info.frame_width, info.frame_height, 32);
+ allocated_raw_shift = 1;
+ }
+ aom_img_upshift(&raw_shift, &raw, input_shift);
+ frame_to_encode = &raw_shift;
+ } else {
+ frame_to_encode = &raw;
+ }
+
+ if (update_frame_num > 1 && frame_out + 1 == update_frame_num) {
+ av1_ref_frame_t ref;
+ ref.idx = 0;
+ ref.use_external_ref = 0;
+ ref.img = ext_ref;
+ // Set reference frame in encoder.
+ if (aom_codec_control(&ecodec, AV1_SET_REFERENCE, &ref))
+ die_codec(&ecodec, "Failed to set encoder reference frame");
+ printf(" <SET_REF>");
+
+      // If the corresponding set_reference call in the decoder is skipped, an
+      // encoder/decoder mismatch will be seen.
+ if (test_decode) {
+ ref.use_external_ref = 1;
+ if (aom_codec_control(&dcodec, AV1_SET_REFERENCE, &ref))
+ die_codec(&dcodec, "Failed to set decoder reference frame");
+ }
+ }
+
+ encode_frame(&ecodec, frame_to_encode, frame_in, writer, test_decode,
+ &dcodec, &frame_out, &mismatch_seen, &ext_ref);
+ frame_in++;
+ if (mismatch_seen) break;
+ }
+
+ // Flush encoder.
+ if (!mismatch_seen)
+ while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec,
+ &frame_out, &mismatch_seen, NULL)) {
+ }
+
+ printf("\n");
+ fclose(infile);
+ printf("Processed %d frames.\n", frame_out);
+
+ if (test_decode) {
+ if (!mismatch_seen)
+ printf("Encoder/decoder results are matching.\n");
+ else
+ printf("Encoder/decoder results are NOT matching.\n");
+ }
+
+ if (test_decode)
+ if (aom_codec_destroy(&dcodec))
+ die_codec(&dcodec, "Failed to destroy decoder");
+
+ if (allocated_raw_shift) aom_img_free(&raw_shift);
+ aom_img_free(&ext_ref);
+ aom_img_free(&raw);
+ if (aom_codec_destroy(&ecodec))
+ die_codec(&ecodec, "Failed to destroy encoder.");
+
+ aom_video_writer_close(writer);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/decode_to_md5.c b/third_party/aom/examples/decode_to_md5.c
new file mode 100644
index 000000000..bc127b78d
--- /dev/null
+++ b/third_party/aom/examples/decode_to_md5.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Frame-by-frame MD5 Checksum
+// ===========================
+//
+// This example builds upon the simple decoder loop to show how checksums
+// of the decoded output can be generated. These are used for validating
+// decoder implementations against the reference implementation, for example.
+//
+// MD5 algorithm
+// -------------
+// The Message-Digest 5 (MD5) is a well-known hash function. We have provided
+// an implementation derived from the RSA Data Security, Inc. MD5 Message-Digest
+// Algorithm for your use. Our implementation only changes the interface of this
+// reference code. You must include the `md5_utils.h` header for access to these
+// functions.
+//
+// Processing The Decoded Data
+// ---------------------------
+// Each row of the image is passed to the MD5 accumulator. First the Y plane
+// is processed, then U, then V. It is important to honor the image's `stride`
+// values.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static void get_image_md5(const aom_image_t *img, unsigned char digest[16]) {
+ int plane, y;
+ MD5Context md5;
+
+ MD5Init(&md5);
+
+ for (plane = 0; plane < 3; ++plane) {
+ const unsigned char *buf = img->planes[plane];
+ const int stride = img->stride[plane];
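+    // For 4:2:0 (I420) input, the chroma planes are half the luma width and
+    // height, rounded up.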
+ const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+ const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;
+
+ for (y = 0; y < h; ++y) {
+ MD5Update(&md5, buf, w);
+ buf += stride;
+ }
+ }
+
+ MD5Final(digest, &md5);
+}
+
+static void print_md5(FILE *stream, unsigned char digest[16]) {
+ int i;
+
+ for (i = 0; i < 16; ++i) fprintf(stream, "%02x", digest[i]);
+}
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv) {
+ int frame_cnt = 0;
+ FILE *outfile = NULL;
+ aom_codec_ctx_t codec;
+ AvxVideoReader *reader = NULL;
+ const AvxVideoInfo *info = NULL;
+ const AvxInterface *decoder = NULL;
+
+ exec_name = argv[0];
+
+ if (argc != 3) die("Invalid number of arguments.");
+
+ reader = aom_video_reader_open(argv[1]);
+ if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+ if (!(outfile = fopen(argv[2], "wb")))
+ die("Failed to open %s for writing.", argv[2]);
+
+ info = aom_video_reader_get_info(reader);
+
+ decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) die("Unknown input codec.");
+
+ printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+ die_codec(&codec, "Failed to initialize decoder");
+
+ while (aom_video_reader_read_frame(reader)) {
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img = NULL;
+ size_t frame_size = 0;
+ const unsigned char *frame =
+ aom_video_reader_get_frame(reader, &frame_size);
+ if (aom_codec_decode(&codec, frame, frame_size, NULL))
+ die_codec(&codec, "Failed to decode frame");
+
+ while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+ unsigned char digest[16];
+
+ get_image_md5(img, digest);
+ print_md5(outfile, digest);
+ fprintf(outfile, " img-%dx%d-%04d.i420\n", img->d_w, img->d_h,
+ ++frame_cnt);
+ }
+ }
+
+ printf("Processed %d frames.\n", frame_cnt);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ aom_video_reader_close(reader);
+
+ fclose(outfile);
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/decode_with_drops.c b/third_party/aom/examples/decode_with_drops.c
new file mode 100644
index 000000000..214401958
--- /dev/null
+++ b/third_party/aom/examples/decode_with_drops.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Decode With Drops Example
+// =========================
+//
+// This is an example utility which drops a series of frames, as specified
+// on the command line. This is useful for observing the error recovery
+// features of the codec.
+//
+// Usage
+// -----
+// This example adds a single argument to the `simple_decoder` example,
+// which specifies the range or pattern of frames to drop. The parameter is
+// parsed as follows:
+//
+// Dropping A Range Of Frames
+// --------------------------
+// To drop a range of frames, specify the starting frame and the ending
+// frame to drop, separated by a dash. The following command will drop
+// frames 5 through 10 (base 1).
+//
+// $ ./decode_with_drops in.ivf out.i420 5-10
+//
+//
+// Dropping A Pattern Of Frames
+// ----------------------------
+// To drop a pattern of frames, specify the number of frames to drop and
+// the number of frames after which to repeat the pattern, separated by
+// a forward-slash. The following command will drop 3 of 7 frames.
+// Specifically, it will decode 4 frames, then drop 3 frames, and then
+// repeat.
+//
+// $ ./decode_with_drops in.ivf out.i420 3/7
+//
+//
+// Extra Variables
+// ---------------
+// This example maintains the pattern passed on the command line in the
+// `n`, `m`, and `is_range` variables.
+//
+//
+// Making The Drop Decision
+// ------------------------
+// The example decides whether to drop the frame based on the current
+// frame number, immediately before decoding the frame.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s <infile> <outfile> <N-M|N/M>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv) {
+ int frame_cnt = 0;
+ FILE *outfile = NULL;
+ aom_codec_ctx_t codec;
+ const AvxInterface *decoder = NULL;
+ AvxVideoReader *reader = NULL;
+ const AvxVideoInfo *info = NULL;
+ int n = 0;
+ int m = 0;
+ int is_range = 0;
+ char *nptr = NULL;
+
+ exec_name = argv[0];
+
+ if (argc != 4) die("Invalid number of arguments.");
+
+ reader = aom_video_reader_open(argv[1]);
+ if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+ if (!(outfile = fopen(argv[2], "wb")))
+ die("Failed to open %s for writing.", argv[2]);
+
+ n = (int)strtol(argv[3], &nptr, 0);
+ m = (int)strtol(nptr + 1, NULL, 0);
+ is_range = (*nptr == '-');
+ if (!n || !m || (*nptr != '-' && *nptr != '/'))
+ die("Couldn't parse pattern %s.\n", argv[3]);
+
+ info = aom_video_reader_get_info(reader);
+
+ decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) die("Unknown input codec.");
+
+ printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+ die_codec(&codec, "Failed to initialize decoder.");
+
+ while (aom_video_reader_read_frame(reader)) {
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img = NULL;
+ size_t frame_size = 0;
+ int skip;
+ const unsigned char *frame =
+ aom_video_reader_get_frame(reader, &frame_size);
+ ++frame_cnt;
+
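+    // Range "N-M": drop frames N through M inclusive. Pattern "N/M": drop the
+    // last N frames of every group of M; e.g. with "3/7", frames 5, 6 and 7 of
+    // each group of 7 are dropped.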
+ skip = (is_range && frame_cnt >= n && frame_cnt <= m) ||
+ (!is_range && m - (frame_cnt - 1) % m <= n);
+
+ if (!skip) {
+ putc('.', stdout);
+ if (aom_codec_decode(&codec, frame, frame_size, NULL))
+ die_codec(&codec, "Failed to decode frame.");
+
+ while ((img = aom_codec_get_frame(&codec, &iter)) != NULL)
+ aom_img_write(img, outfile);
+ } else {
+ putc('X', stdout);
+ }
+
+ fflush(stdout);
+ }
+
+ printf("Processed %d frames.\n", frame_cnt);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+ info->frame_width, info->frame_height, argv[2]);
+
+ aom_video_reader_close(reader);
+ fclose(outfile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/encoder_util.c b/third_party/aom/examples/encoder_util.c
new file mode 100644
index 000000000..e43b37250
--- /dev/null
+++ b/third_party/aom/examples/encoder_util.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Utility functions used by encoder binaries.
+
+#include "examples/encoder_util.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+
+#define mmin(a, b) ((a) < (b) ? (a) : (b))
+
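+// Scans the plane in 64x64 blocks (scaled down for subsampled chroma) and
+// records the first mismatching sample as {row, col, value1, value2} in loc[];
+// loc[] is left as all -1 when the planes match.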
+static void find_mismatch_plane(const aom_image_t *const img1,
+ const aom_image_t *const img2, int plane,
+ int use_highbitdepth, int loc[4]) {
+ const unsigned char *const p1 = img1->planes[plane];
+ const int p1_stride = img1->stride[plane] >> use_highbitdepth;
+ const unsigned char *const p2 = img2->planes[plane];
+ const int p2_stride = img2->stride[plane] >> use_highbitdepth;
+ const uint32_t bsize = 64;
+ const int is_y_plane = (plane == AOM_PLANE_Y);
+ const uint32_t bsizex = is_y_plane ? bsize : bsize >> img1->x_chroma_shift;
+ const uint32_t bsizey = is_y_plane ? bsize : bsize >> img1->y_chroma_shift;
+ const uint32_t c_w =
+ is_y_plane ? img1->d_w
+ : (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+ const uint32_t c_h =
+ is_y_plane ? img1->d_h
+ : (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+ assert(img1->d_w == img2->d_w && img1->d_h == img2->d_h);
+ assert(img1->x_chroma_shift == img2->x_chroma_shift &&
+ img1->y_chroma_shift == img2->y_chroma_shift);
+ loc[0] = loc[1] = loc[2] = loc[3] = -1;
+ if (img1->monochrome && img2->monochrome && plane) return;
+ int match = 1;
+ uint32_t i, j;
+ for (i = 0; match && i < c_h; i += bsizey) {
+ for (j = 0; match && j < c_w; j += bsizex) {
+ const int si =
+ is_y_plane ? mmin(i + bsizey, c_h) - i : mmin(i + bsizey, c_h - i);
+ const int sj =
+ is_y_plane ? mmin(j + bsizex, c_w) - j : mmin(j + bsizex, c_w - j);
+ int k, l;
+ for (k = 0; match && k < si; ++k) {
+ for (l = 0; match && l < sj; ++l) {
+ const int row = i + k;
+ const int col = j + l;
+ const int offset1 = row * p1_stride + col;
+ const int offset2 = row * p2_stride + col;
+ const int val1 = use_highbitdepth
+ ? p1[2 * offset1] | (p1[2 * offset1 + 1] << 8)
+ : p1[offset1];
+ const int val2 = use_highbitdepth
+ ? p2[2 * offset2] | (p2[2 * offset2 + 1] << 8)
+ : p2[offset2];
+ if (val1 != val2) {
+ loc[0] = row;
+ loc[1] = col;
+ loc[2] = val1;
+ loc[3] = val2;
+ match = 0;
+ break;
+ }
+ }
+ }
+ }
+ }
+}
+
+static void find_mismatch_helper(const aom_image_t *const img1,
+ const aom_image_t *const img2,
+ int use_highbitdepth, int yloc[4], int uloc[4],
+ int vloc[4]) {
+ find_mismatch_plane(img1, img2, AOM_PLANE_Y, use_highbitdepth, yloc);
+ find_mismatch_plane(img1, img2, AOM_PLANE_U, use_highbitdepth, uloc);
+ find_mismatch_plane(img1, img2, AOM_PLANE_V, use_highbitdepth, vloc);
+}
+
+void aom_find_mismatch_high(const aom_image_t *const img1,
+ const aom_image_t *const img2, int yloc[4],
+ int uloc[4], int vloc[4]) {
+ find_mismatch_helper(img1, img2, 1, yloc, uloc, vloc);
+}
+
+void aom_find_mismatch(const aom_image_t *const img1,
+ const aom_image_t *const img2, int yloc[4], int uloc[4],
+ int vloc[4]) {
+ find_mismatch_helper(img1, img2, 0, yloc, uloc, vloc);
+}
+
+int aom_compare_img(const aom_image_t *const img1,
+ const aom_image_t *const img2) {
+ assert(img1->cp == img2->cp);
+ assert(img1->tc == img2->tc);
+ assert(img1->mc == img2->mc);
+ assert(img1->monochrome == img2->monochrome);
+
+ int num_planes = img1->monochrome ? 1 : 3;
+
+ uint32_t l_w = img1->d_w;
+ uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+ const uint32_t c_h =
+ (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+ int match = 1;
+
+ match &= (img1->fmt == img2->fmt);
+ match &= (img1->d_w == img2->d_w);
+ match &= (img1->d_h == img2->d_h);
+ if (img1->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ l_w *= 2;
+ c_w *= 2;
+ }
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ uint32_t height = plane ? c_h : img1->d_h;
+ uint32_t width = plane ? c_w : l_w;
+
+ for (uint32_t i = 0; i < height; ++i) {
+ match &=
+ (memcmp(img1->planes[plane] + i * img1->stride[plane],
+ img2->planes[plane] + i * img2->stride[plane], width) == 0);
+ }
+ }
+
+ return match;
+}
diff --git a/third_party/aom/examples/encoder_util.h b/third_party/aom/examples/encoder_util.h
new file mode 100644
index 000000000..a6bb3fb48
--- /dev/null
+++ b/third_party/aom/examples/encoder_util.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Utility functions used by encoder binaries.
+
+#ifndef AOM_EXAMPLES_ENCODER_UTIL_H_
+#define AOM_EXAMPLES_ENCODER_UTIL_H_
+
+#include "aom/aom_image.h"
+
+// Returns mismatch location (?loc[0],?loc[1]) and the values at that location
+// in img1 (?loc[2]) and img2 (?loc[3]).
+void aom_find_mismatch_high(const aom_image_t *const img1,
+ const aom_image_t *const img2, int yloc[4],
+ int uloc[4], int vloc[4]);
+
+void aom_find_mismatch(const aom_image_t *const img1,
+ const aom_image_t *const img2, int yloc[4], int uloc[4],
+ int vloc[4]);
+
+// Returns 1 if the two images match.
+int aom_compare_img(const aom_image_t *const img1,
+ const aom_image_t *const img2);
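+
+// Typical usage (a sketch mirroring aom_cx_set_ref.c): compare the encoder's
+// and decoder's reconstructions and report the first differing sample in each
+// plane.
+//
+//   int y[4], u[4], v[4];
+//   if (!aom_compare_img(&enc_img, &dec_img))
+//     aom_find_mismatch(&enc_img, &dec_img, y, u, v);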
+
+#endif // AOM_EXAMPLES_ENCODER_UTIL_H_
diff --git a/third_party/aom/examples/inspect.c b/third_party/aom/examples/inspect.c
new file mode 100644
index 000000000..9d5f0dcfc
--- /dev/null
+++ b/third_party/aom/examples/inspect.c
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Inspect Decoder
+// ================
+//
+// This is a simple decoder loop that writes JSON stats to stdout. This tool
+// can also be compiled with Emscripten and used as a library.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten.h>
+#else
+#define EMSCRIPTEN_KEEPALIVE
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "av1/common/onyxc_int.h"
+
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+
+#include "av1/decoder/inspection.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_common.h"
+#include "common/video_reader.h"
+
+// Max JSON buffer size.
+const int MAX_BUFFER = 1024 * 1024 * 32;
+
+typedef enum {
+ ACCOUNTING_LAYER = 1,
+ BLOCK_SIZE_LAYER = 1 << 1,
+ TRANSFORM_SIZE_LAYER = 1 << 2,
+ TRANSFORM_TYPE_LAYER = 1 << 3,
+ MODE_LAYER = 1 << 4,
+ SKIP_LAYER = 1 << 5,
+ FILTER_LAYER = 1 << 6,
+ CDEF_LAYER = 1 << 7,
+ REFERENCE_FRAME_LAYER = 1 << 8,
+ MOTION_VECTORS_LAYER = 1 << 9,
+ UV_MODE_LAYER = 1 << 10,
+ CFL_LAYER = 1 << 11,
+ DUAL_FILTER_LAYER = 1 << 12,
+ Q_INDEX_LAYER = 1 << 13,
+ SEGMENT_ID_LAYER = 1 << 14,
+ ALL_LAYERS = (1 << 15) - 1
+} LayerType;
+
+static LayerType layers = 0;
+
+static int stop_after = 0;
+static int compress = 0;
+
+static const arg_def_t limit_arg =
+ ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t dump_all_arg = ARG_DEF("A", "all", 0, "Dump All");
+static const arg_def_t compress_arg =
+ ARG_DEF("x", "compress", 0, "Compress JSON using RLE");
+static const arg_def_t dump_accounting_arg =
+ ARG_DEF("a", "accounting", 0, "Dump Accounting");
+static const arg_def_t dump_block_size_arg =
+ ARG_DEF("bs", "blockSize", 0, "Dump Block Size");
+static const arg_def_t dump_motion_vectors_arg =
+ ARG_DEF("mv", "motionVectors", 0, "Dump Motion Vectors");
+static const arg_def_t dump_transform_size_arg =
+ ARG_DEF("ts", "transformSize", 0, "Dump Transform Size");
+static const arg_def_t dump_transform_type_arg =
+ ARG_DEF("tt", "transformType", 0, "Dump Transform Type");
+static const arg_def_t dump_mode_arg = ARG_DEF("m", "mode", 0, "Dump Mode");
+static const arg_def_t dump_uv_mode_arg =
+ ARG_DEF("uvm", "uv_mode", 0, "Dump UV Intra Prediction Modes");
+static const arg_def_t dump_skip_arg = ARG_DEF("s", "skip", 0, "Dump Skip");
+static const arg_def_t dump_filter_arg =
+ ARG_DEF("f", "filter", 0, "Dump Filter");
+static const arg_def_t dump_cdef_arg = ARG_DEF("c", "cdef", 0, "Dump CDEF");
+static const arg_def_t dump_cfl_arg =
+ ARG_DEF("cfl", "chroma_from_luma", 0, "Dump Chroma from Luma Alphas");
+static const arg_def_t dump_dual_filter_type_arg =
+ ARG_DEF("df", "dualFilterType", 0, "Dump Dual Filter Type");
+static const arg_def_t dump_reference_frame_arg =
+ ARG_DEF("r", "referenceFrame", 0, "Dump Reference Frame");
+static const arg_def_t dump_delta_q_arg =
+ ARG_DEF("dq", "delta_q", 0, "Dump QIndex");
+static const arg_def_t dump_seg_id_arg =
+ ARG_DEF("si", "seg_id", 0, "Dump Segment ID");
+static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help");
+
+static const arg_def_t *main_args[] = { &limit_arg,
+ &dump_all_arg,
+ &compress_arg,
+#if CONFIG_ACCOUNTING
+ &dump_accounting_arg,
+#endif
+ &dump_block_size_arg,
+ &dump_transform_size_arg,
+ &dump_transform_type_arg,
+ &dump_mode_arg,
+ &dump_uv_mode_arg,
+ &dump_skip_arg,
+ &dump_filter_arg,
+ &dump_cdef_arg,
+ &dump_dual_filter_type_arg,
+ &dump_cfl_arg,
+ &dump_reference_frame_arg,
+ &dump_motion_vectors_arg,
+ &dump_delta_q_arg,
+ &dump_seg_id_arg,
+ &usage_arg,
+ NULL };
+#define ENUM(name) \
+ { #name, name }
+#define LAST_ENUM \
+ { NULL, 0 }
+typedef struct map_entry {
+ const char *name;
+ int value;
+} map_entry;
+
+const map_entry refs_map[] = {
+ ENUM(INTRA_FRAME), ENUM(LAST_FRAME), ENUM(LAST2_FRAME),
+ ENUM(LAST3_FRAME), ENUM(GOLDEN_FRAME), ENUM(BWDREF_FRAME),
+ ENUM(ALTREF2_FRAME), ENUM(ALTREF_FRAME), LAST_ENUM
+};
+
+const map_entry block_size_map[] = {
+ ENUM(BLOCK_4X4), ENUM(BLOCK_4X8), ENUM(BLOCK_8X4),
+ ENUM(BLOCK_8X8), ENUM(BLOCK_8X16), ENUM(BLOCK_16X8),
+ ENUM(BLOCK_16X16), ENUM(BLOCK_16X32), ENUM(BLOCK_32X16),
+ ENUM(BLOCK_32X32), ENUM(BLOCK_32X64), ENUM(BLOCK_64X32),
+ ENUM(BLOCK_64X64), ENUM(BLOCK_64X128), ENUM(BLOCK_128X64),
+ ENUM(BLOCK_128X128), ENUM(BLOCK_4X16), ENUM(BLOCK_16X4),
+ ENUM(BLOCK_8X32), ENUM(BLOCK_32X8), ENUM(BLOCK_16X64),
+ ENUM(BLOCK_64X16), LAST_ENUM
+};
+
+const map_entry tx_size_map[] = {
+ ENUM(TX_4X4), ENUM(TX_8X8), ENUM(TX_16X16), ENUM(TX_32X32),
+ ENUM(TX_64X64), ENUM(TX_4X8), ENUM(TX_8X4), ENUM(TX_8X16),
+ ENUM(TX_16X8), ENUM(TX_16X32), ENUM(TX_32X16), ENUM(TX_32X64),
+ ENUM(TX_64X32), ENUM(TX_4X16), ENUM(TX_16X4), ENUM(TX_8X32),
+ ENUM(TX_32X8), LAST_ENUM
+};
+
+const map_entry tx_type_map[] = { ENUM(DCT_DCT),
+ ENUM(ADST_DCT),
+ ENUM(DCT_ADST),
+ ENUM(ADST_ADST),
+ ENUM(FLIPADST_DCT),
+ ENUM(DCT_FLIPADST),
+ ENUM(FLIPADST_FLIPADST),
+ ENUM(ADST_FLIPADST),
+ ENUM(FLIPADST_ADST),
+ ENUM(IDTX),
+ ENUM(V_DCT),
+ ENUM(H_DCT),
+ ENUM(V_ADST),
+ ENUM(H_ADST),
+ ENUM(V_FLIPADST),
+ ENUM(H_FLIPADST),
+ LAST_ENUM };
+const map_entry dual_filter_map[] = { ENUM(REG_REG), ENUM(REG_SMOOTH),
+ ENUM(REG_SHARP), ENUM(SMOOTH_REG),
+ ENUM(SMOOTH_SMOOTH), ENUM(SMOOTH_SHARP),
+ ENUM(SHARP_REG), ENUM(SHARP_SMOOTH),
+ ENUM(SHARP_SHARP), LAST_ENUM };
+
+const map_entry prediction_mode_map[] = {
+ ENUM(DC_PRED), ENUM(V_PRED), ENUM(H_PRED),
+ ENUM(D45_PRED), ENUM(D135_PRED), ENUM(D113_PRED),
+ ENUM(D157_PRED), ENUM(D203_PRED), ENUM(D67_PRED),
+ ENUM(SMOOTH_PRED), ENUM(SMOOTH_V_PRED), ENUM(SMOOTH_H_PRED),
+ ENUM(PAETH_PRED), ENUM(NEARESTMV), ENUM(NEARMV),
+ ENUM(GLOBALMV), ENUM(NEWMV), ENUM(NEAREST_NEARESTMV),
+ ENUM(NEAR_NEARMV), ENUM(NEAREST_NEWMV), ENUM(NEW_NEARESTMV),
+ ENUM(NEAR_NEWMV), ENUM(NEW_NEARMV), ENUM(GLOBAL_GLOBALMV),
+ ENUM(NEW_NEWMV), ENUM(INTRA_INVALID), LAST_ENUM
+};
+
+const map_entry uv_prediction_mode_map[] = {
+ ENUM(UV_DC_PRED), ENUM(UV_V_PRED),
+ ENUM(UV_H_PRED), ENUM(UV_D45_PRED),
+ ENUM(UV_D135_PRED), ENUM(UV_D113_PRED),
+ ENUM(UV_D157_PRED), ENUM(UV_D203_PRED),
+ ENUM(UV_D67_PRED), ENUM(UV_SMOOTH_PRED),
+ ENUM(UV_SMOOTH_V_PRED), ENUM(UV_SMOOTH_H_PRED),
+ ENUM(UV_PAETH_PRED), ENUM(UV_CFL_PRED),
+ ENUM(UV_MODE_INVALID), LAST_ENUM
+};
+#define NO_SKIP 0
+#define SKIP 1
+
+const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM };
+
+const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM };
+
+static const char *exec_name;
+
+insp_frame_data frame_data;
+int frame_count = 0;
+int decoded_frame_count = 0;
+aom_codec_ctx_t codec;
+AvxVideoReader *reader = NULL;
+const AvxVideoInfo *info = NULL;
+aom_image_t *img = NULL;
+
+void on_frame_decoded_dump(char *json) {
+#ifdef __EMSCRIPTEN__
+ EM_ASM_({ Module.on_frame_decoded_json($0); }, json);
+#else
+ printf("%s", json);
+#endif
+}
+
+// Writing out the JSON buffer using snprintf is very slow, especially when
+// compiled with Emscripten; these functions speed things up quite a bit.
+int put_str(char *buffer, const char *str) {
+ int i;
+ for (i = 0; str[i] != '\0'; i++) {
+ buffer[i] = str[i];
+ }
+ return i;
+}
+
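+// Copies str into buffer, dropping control characters and backslash-escaping
+// '"' and '\\'; returns the number of characters written (no terminating NUL).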
+int put_str_with_escape(char *buffer, const char *str) {
+ int i;
+ int j = 0;
+ for (i = 0; str[i] != '\0'; i++) {
+ if (str[i] < ' ') {
+ continue;
+ } else if (str[i] == '"' || str[i] == '\\') {
+ buffer[j++] = '\\';
+ }
+ buffer[j++] = str[i];
+ }
+ return j;
+}
+
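+// Writes an optional prefix character, the decimal digits of num and an
+// optional suffix character into buffer (no terminating NUL) and returns the
+// number of characters written. Digits are generated least-significant first
+// and then reversed in place.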
+int put_num(char *buffer, char prefix, int num, char suffix) {
+ int i = 0;
+ char *buf = buffer;
+ int is_neg = 0;
+ if (prefix) {
+ buf[i++] = prefix;
+ }
+ if (num == 0) {
+ buf[i++] = '0';
+ } else {
+ if (num < 0) {
+ num = -num;
+ is_neg = 1;
+ }
+ int s = i;
+ while (num != 0) {
+ buf[i++] = '0' + (num % 10);
+ num = num / 10;
+ }
+ if (is_neg) {
+ buf[i++] = '-';
+ }
+ int e = i - 1;
+ while (s < e) {
+ int t = buf[s];
+ buf[s] = buf[e];
+ buf[e] = t;
+ s++;
+ e--;
+ }
+ }
+ if (suffix) {
+ buf[i++] = suffix;
+ }
+ return i;
+}
+
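+// Emits the map as comma-separated "name":value pairs (the body of a JSON
+// object without the surrounding braces) and returns the number of characters
+// written.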
+int put_map(char *buffer, const map_entry *map) {
+ char *buf = buffer;
+ const map_entry *entry = map;
+ while (entry->name != NULL) {
+ *(buf++) = '"';
+ buf += put_str(buf, entry->name);
+ *(buf++) = '"';
+ buf += put_num(buf, ':', entry->value, 0);
+ entry++;
+ if (entry->name != NULL) {
+ *(buf++) = ',';
+ }
+ }
+ return (int)(buf - buffer);
+}
+
+int put_reference_frame(char *buffer) {
+ const int mi_rows = frame_data.mi_rows;
+ const int mi_cols = frame_data.mi_cols;
+ char *buf = buffer;
+ int r, c, t;
+ buf += put_str(buf, " \"referenceFrameMap\": {");
+ buf += put_map(buf, refs_map);
+ buf += put_str(buf, "},\n");
+ buf += put_str(buf, " \"referenceFrame\": [");
+ for (r = 0; r < mi_rows; ++r) {
+ *(buf++) = '[';
+ for (c = 0; c < mi_cols; ++c) {
+ insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+ buf += put_num(buf, '[', mi->ref_frame[0], 0);
+ buf += put_num(buf, ',', mi->ref_frame[1], ']');
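+      // RLE: a run of identical entries is written once, followed by
+      // "[count]" giving the number of additional repetitions; the column
+      // index then skips past the run. The same scheme is used below for
+      // motion vectors and the generic block info.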
+ if (compress) { // RLE
+ for (t = c + 1; t < mi_cols; ++t) {
+ insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t];
+ if (mi->ref_frame[0] != next_mi->ref_frame[0] ||
+ mi->ref_frame[1] != next_mi->ref_frame[1]) {
+ break;
+ }
+ }
+ if (t - c > 1) {
+ *(buf++) = ',';
+ buf += put_num(buf, '[', t - c - 1, ']');
+ c = t - 1;
+ }
+ }
+ if (c < mi_cols - 1) *(buf++) = ',';
+ }
+ *(buf++) = ']';
+ if (r < mi_rows - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+
+int put_motion_vectors(char *buffer) {
+ const int mi_rows = frame_data.mi_rows;
+ const int mi_cols = frame_data.mi_cols;
+ char *buf = buffer;
+ int r, c, t;
+ buf += put_str(buf, " \"motionVectors\": [");
+ for (r = 0; r < mi_rows; ++r) {
+ *(buf++) = '[';
+ for (c = 0; c < mi_cols; ++c) {
+ insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+ buf += put_num(buf, '[', mi->mv[0].col, 0);
+ buf += put_num(buf, ',', mi->mv[0].row, 0);
+ buf += put_num(buf, ',', mi->mv[1].col, 0);
+ buf += put_num(buf, ',', mi->mv[1].row, ']');
+ if (compress) { // RLE
+ for (t = c + 1; t < mi_cols; ++t) {
+ insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t];
+ if (mi->mv[0].col != next_mi->mv[0].col ||
+ mi->mv[0].row != next_mi->mv[0].row ||
+ mi->mv[1].col != next_mi->mv[1].col ||
+ mi->mv[1].row != next_mi->mv[1].row) {
+ break;
+ }
+ }
+ if (t - c > 1) {
+ *(buf++) = ',';
+ buf += put_num(buf, '[', t - c - 1, ']');
+ c = t - 1;
+ }
+ }
+ if (c < mi_cols - 1) *(buf++) = ',';
+ }
+ *(buf++) = ']';
+ if (r < mi_rows - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+
+int put_block_info(char *buffer, const map_entry *map, const char *name,
+ size_t offset, int len) {
+ const int mi_rows = frame_data.mi_rows;
+ const int mi_cols = frame_data.mi_cols;
+ char *buf = buffer;
+ int r, c, t, i;
+ if (compress && len == 1) {
+ die("Can't encode scalars as arrays when RLE compression is enabled.");
+ return -1;
+ }
+ if (map) {
+ buf += snprintf(buf, MAX_BUFFER, " \"%sMap\": {", name);
+ buf += put_map(buf, map);
+ buf += put_str(buf, "},\n");
+ }
+ buf += snprintf(buf, MAX_BUFFER, " \"%s\": [", name);
+ for (r = 0; r < mi_rows; ++r) {
+ *(buf++) = '[';
+ for (c = 0; c < mi_cols; ++c) {
+ insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+ int16_t *v = (int16_t *)(((int8_t *)mi) + offset);
+ if (len == 0) {
+ buf += put_num(buf, 0, v[0], 0);
+ } else {
+ buf += put_str(buf, "[");
+ for (i = 0; i < len; i++) {
+ buf += put_num(buf, 0, v[i], 0);
+ if (i < len - 1) {
+ buf += put_str(buf, ",");
+ }
+ }
+ buf += put_str(buf, "]");
+ }
+ if (compress) { // RLE
+ for (t = c + 1; t < mi_cols; ++t) {
+ insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t];
+ int16_t *nv = (int16_t *)(((int8_t *)next_mi) + offset);
+ int same = 0;
+ if (len == 0) {
+ same = v[0] == nv[0];
+ } else {
+ for (i = 0; i < len; i++) {
+ same = v[i] == nv[i];
+ if (!same) {
+ break;
+ }
+ }
+ }
+ if (!same) {
+ break;
+ }
+ }
+ if (t - c > 1) {
+ *(buf++) = ',';
+ buf += put_num(buf, '[', t - c - 1, ']');
+ c = t - 1;
+ }
+ }
+ if (c < mi_cols - 1) *(buf++) = ',';
+ }
+ *(buf++) = ']';
+ if (r < mi_rows - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+
+#if CONFIG_ACCOUNTING
+int put_accounting(char *buffer) {
+ char *buf = buffer;
+ int i;
+ const Accounting *accounting = frame_data.accounting;
+ if (accounting == NULL) {
+    fprintf(stderr, "No accounting data available for this frame.\n");
+ return 0;
+ }
+ const int num_syms = accounting->syms.num_syms;
+ const int num_strs = accounting->syms.dictionary.num_strs;
+ buf += put_str(buf, " \"symbolsMap\": [");
+ for (i = 0; i < num_strs; i++) {
+ buf += snprintf(buf, MAX_BUFFER, "\"%s\"",
+ accounting->syms.dictionary.strs[i]);
+ if (i < num_strs - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ buf += put_str(buf, " \"symbols\": [\n ");
+ AccountingSymbolContext context;
+ context.x = -2;
+ context.y = -2;
+ AccountingSymbol *sym;
+ for (i = 0; i < num_syms; i++) {
+ sym = &accounting->syms.syms[i];
+ if (memcmp(&context, &sym->context, sizeof(AccountingSymbolContext)) != 0) {
+ buf += put_num(buf, '[', sym->context.x, 0);
+ buf += put_num(buf, ',', sym->context.y, ']');
+ } else {
+ buf += put_num(buf, '[', sym->id, 0);
+ buf += put_num(buf, ',', sym->bits, 0);
+ buf += put_num(buf, ',', sym->samples, ']');
+ }
+ context = sym->context;
+ if (i < num_syms - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
+#endif
+
+void inspect(void *pbi, void *data) {
+ /* Fetch frame data. */
+ ifd_inspect(&frame_data, pbi);
+ (void)data;
+ // We allocate enough space and hope we don't write out of bounds. Totally
+  // unsafe, but this speeds things up, especially when compiled to JavaScript.
+ char *buffer = aom_malloc(MAX_BUFFER);
+ char *buf = buffer;
+ buf += put_str(buf, "{\n");
+ if (layers & BLOCK_SIZE_LAYER) {
+ buf += put_block_info(buf, block_size_map, "blockSize",
+ offsetof(insp_mi_data, sb_type), 0);
+ }
+ if (layers & TRANSFORM_SIZE_LAYER) {
+ buf += put_block_info(buf, tx_size_map, "transformSize",
+ offsetof(insp_mi_data, tx_size), 0);
+ }
+ if (layers & TRANSFORM_TYPE_LAYER) {
+ buf += put_block_info(buf, tx_type_map, "transformType",
+ offsetof(insp_mi_data, tx_type), 0);
+ }
+ if (layers & DUAL_FILTER_LAYER) {
+ buf += put_block_info(buf, dual_filter_map, "dualFilterType",
+ offsetof(insp_mi_data, dual_filter_type), 0);
+ }
+ if (layers & MODE_LAYER) {
+ buf += put_block_info(buf, prediction_mode_map, "mode",
+ offsetof(insp_mi_data, mode), 0);
+ }
+ if (layers & UV_MODE_LAYER) {
+ buf += put_block_info(buf, uv_prediction_mode_map, "uv_mode",
+ offsetof(insp_mi_data, uv_mode), 0);
+ }
+ if (layers & SKIP_LAYER) {
+ buf +=
+ put_block_info(buf, skip_map, "skip", offsetof(insp_mi_data, skip), 0);
+ }
+ if (layers & FILTER_LAYER) {
+ buf +=
+ put_block_info(buf, NULL, "filter", offsetof(insp_mi_data, filter), 2);
+ }
+ if (layers & CDEF_LAYER) {
+ buf += put_block_info(buf, NULL, "cdef_level",
+ offsetof(insp_mi_data, cdef_level), 0);
+ buf += put_block_info(buf, NULL, "cdef_strength",
+ offsetof(insp_mi_data, cdef_strength), 0);
+ }
+ if (layers & CFL_LAYER) {
+ buf += put_block_info(buf, NULL, "cfl_alpha_idx",
+ offsetof(insp_mi_data, cfl_alpha_idx), 0);
+ buf += put_block_info(buf, NULL, "cfl_alpha_sign",
+ offsetof(insp_mi_data, cfl_alpha_sign), 0);
+ }
+ if (layers & Q_INDEX_LAYER) {
+ buf += put_block_info(buf, NULL, "delta_q",
+ offsetof(insp_mi_data, current_qindex), 0);
+ }
+ if (layers & SEGMENT_ID_LAYER) {
+ buf += put_block_info(buf, NULL, "seg_id",
+ offsetof(insp_mi_data, segment_id), 0);
+ }
+ if (layers & MOTION_VECTORS_LAYER) {
+ buf += put_motion_vectors(buf);
+ }
+ if (layers & REFERENCE_FRAME_LAYER) {
+ buf += put_block_info(buf, refs_map, "referenceFrame",
+ offsetof(insp_mi_data, ref_frame), 2);
+ }
+#if CONFIG_ACCOUNTING
+ if (layers & ACCOUNTING_LAYER) {
+ buf += put_accounting(buf);
+ }
+#endif
+ buf += snprintf(buf, MAX_BUFFER, " \"frame\": %d,\n", decoded_frame_count);
+ buf += snprintf(buf, MAX_BUFFER, " \"showFrame\": %d,\n",
+ frame_data.show_frame);
+ buf += snprintf(buf, MAX_BUFFER, " \"frameType\": %d,\n",
+ frame_data.frame_type);
+ buf += snprintf(buf, MAX_BUFFER, " \"baseQIndex\": %d,\n",
+ frame_data.base_qindex);
+ buf += snprintf(buf, MAX_BUFFER, " \"tileCols\": %d,\n",
+ frame_data.tile_mi_cols);
+ buf += snprintf(buf, MAX_BUFFER, " \"tileRows\": %d,\n",
+ frame_data.tile_mi_rows);
+ buf += snprintf(buf, MAX_BUFFER, " \"deltaQPresentFlag\": %d,\n",
+ frame_data.delta_q_present_flag);
+ buf += snprintf(buf, MAX_BUFFER, " \"deltaQRes\": %d,\n",
+ frame_data.delta_q_res);
+ buf += put_str(buf, " \"config\": {");
+ buf += put_map(buf, config_map);
+ buf += put_str(buf, "},\n");
+ buf += put_str(buf, " \"configString\": \"");
+ buf += put_str_with_escape(buf, aom_codec_build_config());
+ buf += put_str(buf, "\"\n");
+ decoded_frame_count++;
+ buf += put_str(buf, "},\n");
+ *(buf++) = 0;
+ on_frame_decoded_dump(buffer);
+ aom_free(buffer);
+}
+
+void ifd_init_cb() {
+ aom_inspect_init ii;
+ ii.inspect_cb = inspect;
+ ii.inspect_ctx = NULL;
+ aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii);
+}
+
+EMSCRIPTEN_KEEPALIVE
+int open_file(char *file) {
+ if (file == NULL) {
+ // The JS analyzer puts the .ivf file at this location.
+ file = "/tmp/input.ivf";
+ }
+ reader = aom_video_reader_open(file);
+ if (!reader) die("Failed to open %s for reading.", file);
+ info = aom_video_reader_get_info(reader);
+ const AvxInterface *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) die("Unknown input codec.");
+ fprintf(stderr, "Using %s\n",
+ aom_codec_iface_name(decoder->codec_interface()));
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+ die_codec(&codec, "Failed to initialize decoder.");
+ ifd_init(&frame_data, info->frame_width, info->frame_height);
+ ifd_init_cb();
+ return EXIT_SUCCESS;
+}
+
+EMSCRIPTEN_KEEPALIVE
+int read_frame() {
+ if (!aom_video_reader_read_frame(reader)) return EXIT_FAILURE;
+ img = NULL;
+ aom_codec_iter_t iter = NULL;
+ size_t frame_size = 0;
+ const unsigned char *frame = aom_video_reader_get_frame(reader, &frame_size);
+ if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, NULL) !=
+ AOM_CODEC_OK) {
+ die_codec(&codec, "Failed to decode frame.");
+ }
+ int got_any_frames = 0;
+ aom_image_t *frame_img;
+ while ((frame_img = aom_codec_get_frame(&codec, &iter))) {
+ img = frame_img;
+ ++frame_count;
+ got_any_frames = 1;
+ }
+ if (!got_any_frames) {
+ return EXIT_FAILURE;
+ }
+ return EXIT_SUCCESS;
+}
+
+EMSCRIPTEN_KEEPALIVE
+const char *get_aom_codec_build_config() { return aom_codec_build_config(); }
+
+EMSCRIPTEN_KEEPALIVE
+int get_bit_depth() { return img->bit_depth; }
+
+EMSCRIPTEN_KEEPALIVE
+int get_bits_per_sample() { return img->bps; }
+
+EMSCRIPTEN_KEEPALIVE
+int get_image_format() { return img->fmt; }
+
+EMSCRIPTEN_KEEPALIVE
+unsigned char *get_plane(int plane) { return img->planes[plane]; }
+
+EMSCRIPTEN_KEEPALIVE
+int get_plane_stride(int plane) { return img->stride[plane]; }
+
+EMSCRIPTEN_KEEPALIVE
+int get_plane_width(int plane) { return aom_img_plane_width(img, plane); }
+
+EMSCRIPTEN_KEEPALIVE
+int get_plane_height(int plane) { return aom_img_plane_height(img, plane); }
+
+EMSCRIPTEN_KEEPALIVE
+int get_frame_width() { return info->frame_width; }
+
+EMSCRIPTEN_KEEPALIVE
+int get_frame_height() { return info->frame_height; }
+
+static void parse_args(char **argv) {
+ char **argi, **argj;
+ struct arg arg;
+ (void)dump_accounting_arg;
+ (void)dump_cdef_arg;
+ for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+ arg.argv_step = 1;
+ if (arg_match(&arg, &dump_block_size_arg, argi)) layers |= BLOCK_SIZE_LAYER;
+#if CONFIG_ACCOUNTING
+ else if (arg_match(&arg, &dump_accounting_arg, argi))
+ layers |= ACCOUNTING_LAYER;
+#endif
+ else if (arg_match(&arg, &dump_transform_size_arg, argi))
+ layers |= TRANSFORM_SIZE_LAYER;
+ else if (arg_match(&arg, &dump_transform_type_arg, argi))
+ layers |= TRANSFORM_TYPE_LAYER;
+ else if (arg_match(&arg, &dump_mode_arg, argi))
+ layers |= MODE_LAYER;
+ else if (arg_match(&arg, &dump_uv_mode_arg, argi))
+ layers |= UV_MODE_LAYER;
+ else if (arg_match(&arg, &dump_skip_arg, argi))
+ layers |= SKIP_LAYER;
+ else if (arg_match(&arg, &dump_filter_arg, argi))
+ layers |= FILTER_LAYER;
+ else if (arg_match(&arg, &dump_cdef_arg, argi))
+ layers |= CDEF_LAYER;
+ else if (arg_match(&arg, &dump_cfl_arg, argi))
+ layers |= CFL_LAYER;
+ else if (arg_match(&arg, &dump_reference_frame_arg, argi))
+ layers |= REFERENCE_FRAME_LAYER;
+ else if (arg_match(&arg, &dump_motion_vectors_arg, argi))
+ layers |= MOTION_VECTORS_LAYER;
+ else if (arg_match(&arg, &dump_dual_filter_type_arg, argi))
+ layers |= DUAL_FILTER_LAYER;
+ else if (arg_match(&arg, &dump_delta_q_arg, argi))
+ layers |= Q_INDEX_LAYER;
+ else if (arg_match(&arg, &dump_seg_id_arg, argi))
+ layers |= SEGMENT_ID_LAYER;
+ else if (arg_match(&arg, &dump_all_arg, argi))
+ layers |= ALL_LAYERS;
+ else if (arg_match(&arg, &compress_arg, argi))
+ compress = 1;
+ else if (arg_match(&arg, &usage_arg, argi))
+ usage_exit();
+ else if (arg_match(&arg, &limit_arg, argi))
+ stop_after = arg_parse_uint(&arg);
+ else
+ argj++;
+ }
+}
+
+
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s src_filename <options>\n", exec_name);
+ fprintf(stderr, "\nOptions:\n");
+ arg_show_usage(stderr, main_args);
+ exit(EXIT_FAILURE);
+}
+
+EMSCRIPTEN_KEEPALIVE
+int main(int argc, char **argv) {
+ exec_name = argv[0];
+ parse_args(argv);
+ if (argc >= 2) {
+ open_file(argv[1]);
+ printf("[\n");
+ while (1) {
+ if (stop_after && (decoded_frame_count >= stop_after)) break;
+ if (read_frame()) break;
+ }
+ printf("null\n");
+ printf("]");
+ } else {
+ usage_exit();
+  }
+  return EXIT_SUCCESS;
+}
+
+EMSCRIPTEN_KEEPALIVE
+void quit() {
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+ aom_video_reader_close(reader);
+}
+
+EMSCRIPTEN_KEEPALIVE
+void set_layers(LayerType v) { layers = v; }
+
+EMSCRIPTEN_KEEPALIVE
+void set_compress(int v) { compress = v; }
diff --git a/third_party/aom/examples/lightfield_bitstream_parsing.c b/third_party/aom/examples/lightfield_bitstream_parsing.c
new file mode 100644
index 000000000..159f1617a
--- /dev/null
+++ b/third_party/aom/examples/lightfield_bitstream_parsing.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Bitstream Parsing
+// ============================
+//
+// This is a lightfield bitstream parsing example. It takes an input file
+// containing the whole compressed lightfield bitstream (an .ivf file), parses
+// it, and constructs and outputs a new bitstream that can be decoded by an AV1
+// decoder. The output bitstream contains the reference frames (i.e. anchor
+// frames), the camera frame header, and tile list OBUs. num_references is the
+// number of anchor frames coded at the beginning of the light field file.
+// After running the lightfield encoder, run lightfield bitstream parsing:
+// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 4
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_integer.h"
+#include "aom/aomdx.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+#include "common/video_writer.h"
+
+#define MAX_TILES 512
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s <infile> <outfile> <num_references> \n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+#define ALIGN_POWER_OF_TWO(value, n) \
+ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
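+// e.g. ALIGN_POWER_OF_TWO(65, 5) rounds 65 up to the next multiple of
+// 2^5 = 32, giving 96; already-aligned values are returned unchanged.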
+
+// SB size: 64x64
+const uint8_t output_frame_width_in_tiles_minus_1 = 512 / 64 - 1;
+const uint8_t output_frame_height_in_tiles_minus_1 = 512 / 64 - 1;
+
+// Spec:
+// typedef struct {
+// uint8_t anchor_frame_idx;
+// uint8_t tile_row;
+// uint8_t tile_col;
+// uint16_t coded_tile_data_size_minus_1;
+// uint8_t *coded_tile_data;
+// } TILE_LIST_ENTRY;
+
+// Tile list entry provided by the application
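+// (image_idx, reference_idx, tile_col, tile_row): the camera image the tile
+// comes from, the anchor frame it references, and the tile's column and row
+// within that image.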
+typedef struct {
+ int image_idx;
+ int reference_idx;
+ int tile_col;
+ int tile_row;
+} TILE_LIST_INFO;
+
+// M references: 0 - M-1; N images (including references): 0 - N-1.
+// Note: order the image indices incrementally, so that we only go through the
+// bitstream once to construct the tile list.
+const int num_tile_lists = 2;
+const uint16_t tile_count_minus_1 = 9 - 1;
+const TILE_LIST_INFO tile_list[2][9] = {
+ { { 16, 0, 4, 5 },
+ { 83, 3, 13, 2 },
+ { 57, 2, 2, 6 },
+ { 31, 1, 11, 5 },
+ { 2, 0, 7, 4 },
+ { 77, 3, 9, 9 },
+ { 49, 1, 0, 1 },
+ { 6, 0, 3, 10 },
+ { 63, 2, 5, 8 } },
+ { { 65, 2, 11, 1 },
+ { 42, 1, 3, 7 },
+ { 88, 3, 8, 4 },
+ { 76, 3, 1, 15 },
+ { 1, 0, 2, 2 },
+ { 19, 0, 5, 6 },
+ { 60, 2, 4, 0 },
+ { 25, 1, 11, 15 },
+ { 50, 2, 5, 4 } },
+};
+
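+// Bits per pixel for each supported format: e.g. 4:2:0 stores 8 luma bits per
+// pixel plus two quarter-resolution chroma planes (8 + 2 + 2 = 12); the 16-bit
+// high bit depth variants take twice the storage.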
+static int get_image_bps(aom_img_fmt_t fmt) {
+ switch (fmt) {
+ case AOM_IMG_FMT_I420: return 12;
+ case AOM_IMG_FMT_I422: return 16;
+ case AOM_IMG_FMT_I444: return 24;
+ case AOM_IMG_FMT_I42016: return 24;
+ case AOM_IMG_FMT_I42216: return 32;
+ case AOM_IMG_FMT_I44416: return 48;
+ default: die("Invalid image format");
+ }
+ return 0;
+}
+
+int main(int argc, char **argv) {
+ aom_codec_ctx_t codec;
+ AvxVideoReader *reader = NULL;
+ AvxVideoWriter *writer = NULL;
+ const AvxInterface *decoder = NULL;
+ const AvxVideoInfo *info = NULL;
+ int num_references;
+ int n, i;
+ aom_codec_pts_t pts;
+
+ exec_name = argv[0];
+ if (argc != 4) die("Invalid number of arguments.");
+
+ reader = aom_video_reader_open(argv[1]);
+ if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+ num_references = (int)strtol(argv[3], NULL, 0);
+ info = aom_video_reader_get_info(reader);
+
+  // The writer writes out an ivf file with tile list OBUs, which can be decoded
+  // by an AV1 decoder.
+ writer = aom_video_writer_open(argv[2], kContainerIVF, info);
+ if (!writer) die("Failed to open %s for writing", argv[2]);
+
+ decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) die("Unknown input codec.");
+ printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+ die_codec(&codec, "Failed to initialize decoder.");
+
+ // Decode anchor frames.
+ aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+
+ for (i = 0; i < num_references; ++i) {
+ aom_video_reader_read_frame(reader);
+
+ size_t frame_size = 0;
+ const unsigned char *frame =
+ aom_video_reader_get_frame(reader, &frame_size);
+ pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader);
+
+    // Copy the reference frames' bitstream directly.
+ if (!aom_video_writer_write_frame(writer, frame, frame_size, pts))
+ die_codec(&codec, "Failed to copy compressed anchor frame.");
+
+ if (aom_codec_decode(&codec, frame, frame_size, NULL))
+ die_codec(&codec, "Failed to decode frame.");
+ }
+
+ // Decode camera frames.
+ aom_codec_control_(&codec, AV1_SET_TILE_MODE, 1);
+ aom_codec_control_(&codec, AV1D_EXT_TILE_DEBUG, 1);
+
+ FILE *infile = aom_video_reader_get_file(reader);
+ // Record the offset of the first camera image.
+ const FileOffset camera_frame_pos = ftello(infile);
+
+ // Read out the first camera frame.
+ aom_video_reader_read_frame(reader);
+
+  // Copy the first camera frame to obtain the camera frame header. This is
+  // done only once.
+ {
+ size_t frame_size = 0;
+ const unsigned char *frame =
+ aom_video_reader_get_frame(reader, &frame_size);
+ pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader);
+ aom_tile_data frame_header_info = { 0, NULL, 0 };
+
+    // Only the frame header is needed to obtain the camera frame header info,
+    // so decoding a single tile is enough here.
+ aom_codec_control_(&codec, AV1_SET_DECODE_TILE_ROW, 0);
+ aom_codec_control_(&codec, AV1_SET_DECODE_TILE_COL, 0);
+
+ aom_codec_err_t aom_status =
+ aom_codec_decode(&codec, frame, frame_size, NULL);
+ if (aom_status) die_codec(&codec, "Failed to decode tile.");
+
+ aom_codec_control_(&codec, AV1D_GET_FRAME_HEADER_INFO, &frame_header_info);
+
+ size_t obu_size_offset =
+ (uint8_t *)frame_header_info.coded_tile_data - frame;
+ size_t length_field_size = frame_header_info.coded_tile_data_size;
+ // Remove ext-tile tile info.
+ uint32_t frame_header_size = (uint32_t)frame_header_info.extra_size - 1;
+ size_t bytes_to_copy =
+ obu_size_offset + length_field_size + frame_header_size;
+
+ unsigned char *frame_hdr_buf = (unsigned char *)malloc(bytes_to_copy);
+ if (frame_hdr_buf == NULL)
+ die_codec(&codec, "Failed to allocate frame header buffer.");
+
+ memcpy(frame_hdr_buf, frame, bytes_to_copy);
+
+ // Update frame header OBU size.
+ size_t bytes_written = 0;
+ if (aom_uleb_encode_fixed_size(
+ frame_header_size, length_field_size, length_field_size,
+ frame_hdr_buf + obu_size_offset, &bytes_written))
+ die_codec(&codec, "Failed to encode the tile list obu size.");
+
+ // Copy camera frame header bitstream.
+ if (!aom_video_writer_write_frame(writer, frame_hdr_buf, bytes_to_copy,
+ pts))
+ die_codec(&codec, "Failed to copy compressed camera frame header.");
+ free(frame_hdr_buf);
+ }
+
+ // Read out the image format.
+ aom_img_fmt_t ref_fmt = 0;
+ if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+ die_codec(&codec, "Failed to get the image format");
+ const int bps = get_image_bps(ref_fmt);
+ if (!bps) die_codec(&codec, "Invalid image format.");
+  // Read out the tile size. The control packs the tile width in the upper 16
+  // bits and the tile height in the lower 16 bits.
+ unsigned int tile_size = 0;
+ if (aom_codec_control(&codec, AV1D_GET_TILE_SIZE, &tile_size))
+ die_codec(&codec, "Failed to get the tile size");
+ const unsigned int tile_width = tile_size >> 16;
+ const unsigned int tile_height = tile_size & 65535;
+ // Allocate a buffer to store tile list bitstream.
+ const size_t data_sz = MAX_TILES * ALIGN_POWER_OF_TWO(tile_width, 5) *
+ ALIGN_POWER_OF_TWO(tile_height, 5) * bps / 8;
+ unsigned char *tl_buf = (unsigned char *)malloc(data_sz);
+ if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer.");
+
+ aom_codec_pts_t tl_pts = pts;
+
+  // Process the tile lists one by one.
+ for (n = 0; n < num_tile_lists; n++) {
+ unsigned char *tl = tl_buf;
+ struct aom_write_bit_buffer wb = { tl, 0 };
+ unsigned char *saved_obu_size_loc = NULL;
+ uint32_t tile_list_obu_header_size = 0;
+ uint32_t tile_list_obu_size = 0;
+
+ // Write the tile list OBU header that is 1 byte long.
+ aom_wb_write_literal(&wb, 0, 1); // forbidden bit.
+ aom_wb_write_literal(&wb, 8, 4); // tile list OBU: "1000"
+ aom_wb_write_literal(&wb, 0, 1); // obu_extension = 0
+ aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field
+ aom_wb_write_literal(&wb, 0, 1); // reserved
+ tl++;
+ tile_list_obu_header_size++;
+
+ // Write the OBU size using a fixed length_field_size of 4 bytes.
+ saved_obu_size_loc = tl;
+ // aom_wb_write_unsigned_literal(&wb, data, bits) requires that bits <= 32.
+ aom_wb_write_unsigned_literal(&wb, 0, 32);
+ tl += 4;
+ tile_list_obu_header_size += 4;
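+    // The actual OBU payload size is not known yet; it is back-filled into
+    // this 4-byte length field once all tile data has been written.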
+
+ // write_tile_list_obu()
+ aom_wb_write_literal(&wb, output_frame_width_in_tiles_minus_1, 8);
+ aom_wb_write_literal(&wb, output_frame_height_in_tiles_minus_1, 8);
+ aom_wb_write_literal(&wb, tile_count_minus_1, 16);
+ tl += 4;
+ tile_list_obu_size += 4;
+
+ // Write each tile's data
+ for (i = 0; i <= tile_count_minus_1; i++) {
+ aom_tile_data tile_data = { 0, NULL, 0 };
+
+ int image_idx = tile_list[n][i].image_idx;
+ int ref_idx = tile_list[n][i].reference_idx;
+ int tc = tile_list[n][i].tile_col;
+ int tr = tile_list[n][i].tile_row;
+ int frame_cnt = -1;
+
+ // Reset bit writer to the right location.
+ wb.bit_buffer = tl;
+ wb.bit_offset = 0;
+
+ // Seek to the first camera image.
+ fseeko(infile, camera_frame_pos, SEEK_SET);
+
+ // Read out the camera image
+ while (frame_cnt != image_idx) {
+ aom_video_reader_read_frame(reader);
+ frame_cnt++;
+ }
+
+ size_t frame_size = 0;
+ const unsigned char *frame =
+ aom_video_reader_get_frame(reader, &frame_size);
+
+ aom_codec_control_(&codec, AV1_SET_DECODE_TILE_ROW, tr);
+ aom_codec_control_(&codec, AV1_SET_DECODE_TILE_COL, tc);
+
+ aom_codec_err_t aom_status =
+ aom_codec_decode(&codec, frame, frame_size, NULL);
+ if (aom_status) die_codec(&codec, "Failed to decode tile.");
+
+ aom_codec_control_(&codec, AV1D_GET_TILE_DATA, &tile_data);
+
+ // Copy over tile info.
+ // uint8_t anchor_frame_idx;
+ // uint8_t tile_row;
+ // uint8_t tile_col;
+ // uint16_t coded_tile_data_size_minus_1;
+ // uint8_t *coded_tile_data;
+ uint32_t tile_info_bytes = 5;
+ aom_wb_write_literal(&wb, ref_idx, 8);
+ aom_wb_write_literal(&wb, tr, 8);
+ aom_wb_write_literal(&wb, tc, 8);
+ aom_wb_write_literal(&wb, (int)tile_data.coded_tile_data_size - 1, 16);
+ tl += tile_info_bytes;
+
+ memcpy(tl, (uint8_t *)tile_data.coded_tile_data,
+ tile_data.coded_tile_data_size);
+ tl += tile_data.coded_tile_data_size;
+
+ tile_list_obu_size +=
+ tile_info_bytes + (uint32_t)tile_data.coded_tile_data_size;
+ }
+
+ // Write tile list OBU size.
+ size_t bytes_written = 0;
+ if (aom_uleb_encode_fixed_size(tile_list_obu_size, 4, 4, saved_obu_size_loc,
+ &bytes_written))
+ die_codec(&codec, "Failed to encode the tile list obu size.");
+
+ // Copy the tile list.
+ if (!aom_video_writer_write_frame(
+ writer, tl_buf, tile_list_obu_header_size + tile_list_obu_size,
+ tl_pts))
+ die_codec(&codec, "Failed to copy compressed tile list.");
+
+ tl_pts++;
+ }
+
+ free(tl_buf);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+ aom_video_writer_close(writer);
+ aom_video_reader_close(reader);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lightfield_decoder.c b/third_party/aom/examples/lightfield_decoder.c
new file mode 100644
index 000000000..f5e54db7f
--- /dev/null
+++ b/third_party/aom/examples/lightfield_decoder.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Decoder
+// ==================
+//
+// This is an example of a simple lightfield decoder. It builds upon the
+// simple_decoder.c example. It takes an input file containing the compressed
+// data (in ivf format), treating it as a lightfield instead of a video.
+// After running the lightfield encoder, run lightfield decoder to decode a
+// batch of tiles:
+// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 4
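+// For each entry in tile_list below, the decoder seeks back to the camera
+// frames, decodes only the requested tile using the corresponding anchor frame
+// as an external reference, and writes the resulting image to <outfile>.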
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s <infile> <outfile> <num_references>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+// Tile list entry provided by the application
+typedef struct {
+ int image_idx;
+ int reference_idx;
+ int tile_col;
+ int tile_row;
+} TILE_LIST_INFO;
+
+// M references: 0 - M-1; N images (including references): 0 - N-1;
+// Note: order the image indices incrementally, so that we only need to go
+// through the bitstream once to construct the tile list.
+const int num_tile_lists = 2;
+const uint16_t tile_count_minus_1 = 9 - 1;
+const TILE_LIST_INFO tile_list[2][9] = {
+ { { 16, 0, 4, 5 },
+ { 83, 3, 13, 2 },
+ { 57, 2, 2, 6 },
+ { 31, 1, 11, 5 },
+ { 2, 0, 7, 4 },
+ { 77, 3, 9, 9 },
+ { 49, 1, 0, 1 },
+ { 6, 0, 3, 10 },
+ { 63, 2, 5, 8 } },
+ { { 65, 2, 11, 1 },
+ { 42, 1, 3, 7 },
+ { 88, 3, 8, 4 },
+ { 76, 3, 1, 15 },
+ { 1, 0, 2, 2 },
+ { 19, 0, 5, 6 },
+ { 60, 2, 4, 0 },
+ { 25, 1, 11, 15 },
+ { 50, 2, 5, 4 } },
+};
+
+int main(int argc, char **argv) {
+ FILE *outfile = NULL;
+ aom_codec_ctx_t codec;
+ AvxVideoReader *reader = NULL;
+ const AvxInterface *decoder = NULL;
+ const AvxVideoInfo *info = NULL;
+ int num_references;
+ aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+ size_t frame_size = 0;
+ const unsigned char *frame = NULL;
+ int n, i, j;
+ exec_name = argv[0];
+
+ if (argc != 4) die("Invalid number of arguments.");
+
+ reader = aom_video_reader_open(argv[1]);
+ if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+ if (!(outfile = fopen(argv[2], "wb")))
+ die("Failed to open %s for writing.", argv[2]);
+
+ num_references = (int)strtol(argv[3], NULL, 0);
+
+ info = aom_video_reader_get_info(reader);
+
+ decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) die("Unknown input codec.");
+ printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+ die_codec(&codec, "Failed to initialize decoder.");
+
+ if (aom_codec_control(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) {
+ die("Failed to set annex b status");
+ }
+
+ // Decode anchor frames.
+ aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+ for (i = 0; i < num_references; ++i) {
+ aom_video_reader_read_frame(reader);
+ frame = aom_video_reader_get_frame(reader, &frame_size);
+ if (aom_codec_decode(&codec, frame, frame_size, NULL))
+ die_codec(&codec, "Failed to decode frame.");
+
+ if (i == 0) {
+ aom_img_fmt_t ref_fmt = 0;
+ if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+ die_codec(&codec, "Failed to get the image format");
+
+ int frame_res[2];
+ if (aom_codec_control(&codec, AV1D_GET_FRAME_SIZE, frame_res))
+ die_codec(&codec, "Failed to get the image frame size");
+
+ // Allocate memory to store decoded references. Allocate memory with the
+ // border so that it can be used as a reference.
+ for (j = 0; j < num_references; j++) {
+ unsigned int border = AOM_BORDER_IN_PIXELS;
+ if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
+ frame_res[0], frame_res[1], 32, 8,
+ border)) {
+ die("Failed to allocate references.");
+ }
+ }
+ }
+
+ if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+ &reference_images[i]))
+ die_codec(&codec, "Failed to copy decoded reference frame");
+
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img = NULL;
+ while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+ char name[1024];
+ snprintf(name, sizeof(name), "ref_%d.yuv", i);
+ printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
+ FILE *ref_file = fopen(name, "wb");
+ aom_img_write(img, ref_file);
+ fclose(ref_file);
+ }
+ }
+
+ FILE *infile = aom_video_reader_get_file(reader);
+ // Record the offset of the first camera image.
+ const FileOffset camera_frame_pos = ftello(infile);
+
+  // Process the tile lists, decoding the requested tiles one by one.
+ for (n = 0; n < num_tile_lists; n++) {
+ for (i = 0; i <= tile_count_minus_1; i++) {
+ int image_idx = tile_list[n][i].image_idx;
+ int ref_idx = tile_list[n][i].reference_idx;
+ int tc = tile_list[n][i].tile_col;
+ int tr = tile_list[n][i].tile_row;
+ int frame_cnt = -1;
+
+ // Seek to the first camera image.
+ fseeko(infile, camera_frame_pos, SEEK_SET);
+
+ // Read out the camera image
+ while (frame_cnt != image_idx) {
+ aom_video_reader_read_frame(reader);
+ frame_cnt++;
+ }
+
+ frame = aom_video_reader_get_frame(reader, &frame_size);
+
+ aom_codec_control_(&codec, AV1_SET_TILE_MODE, 1);
+ aom_codec_control_(&codec, AV1D_EXT_TILE_DEBUG, 1);
+ aom_codec_control_(&codec, AV1_SET_DECODE_TILE_ROW, tr);
+ aom_codec_control_(&codec, AV1_SET_DECODE_TILE_COL, tc);
+
+ av1_ref_frame_t ref;
+ ref.idx = 0;
+ ref.use_external_ref = 1;
+ ref.img = reference_images[ref_idx];
+ if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref)) {
+ die_codec(&codec, "Failed to set reference frame.");
+ }
+
+ aom_codec_err_t aom_status =
+ aom_codec_decode(&codec, frame, frame_size, NULL);
+ if (aom_status) die_codec(&codec, "Failed to decode tile.");
+
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img = aom_codec_get_frame(&codec, &iter);
+ aom_img_write(img, outfile);
+ }
+ }
+
+ for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+ aom_video_reader_close(reader);
+ fclose(outfile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lightfield_encoder.c b/third_party/aom/examples/lightfield_encoder.c
new file mode 100644
index 000000000..e55cd5ce3
--- /dev/null
+++ b/third_party/aom/examples/lightfield_encoder.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Encoder
+// ==================
+//
+// This is an example of a simple lightfield encoder. It builds upon the
+// twopass_encoder.c example. It takes an input file in YV12 format,
+// treating it as a planar lightfield instead of a video. The img_width
+// and img_height arguments are the dimensions of the lightfield images,
+// while the lf_width and lf_height arguments are the number of
+// lightfield images in each dimension. The lf_blocksize determines the
+// number of reference images used for MCP. For example, 5 means that there
+// is a reference image for every 5x5 lightfield image block. All images
+// within a block will use the center image in that block as the reference
+// image for MCP.
+// Run "make test" to download lightfield test data: vase10x10.yuv.
+// Run lightfield encoder to encode whole lightfield:
+// examples/lightfield_encoder 1024 1024 vase10x10.yuv vase10x10.ivf 10 10 5
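+// For the vase10x10 example above (lf_width = lf_height = 10, lf_blocksize =
+// 5), the lightfield is split into a 2x2 grid of 5x5 blocks, so 4 reference
+// images are coded in addition to the 100 camera images.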
+
+// Note: In bitstream.c and encoder.c, defining EXT_TILE_DEBUG as 1 will print
+// out the uncompressed header and the frame contexts, which can be used to
+// test the bit exactness of the headers and the frame contexts for large scale
+// tile coded frames.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr,
+ "Usage: %s <img_width> <img_height> <infile> <outfile> "
+ "<lf_width> <lf_height> <lf_blocksize>\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+static int aom_img_size_bytes(aom_image_t *img) {
+ int image_size_bytes = 0;
+ int plane;
+ for (plane = 0; plane < 3; ++plane) {
+ const int w = aom_img_plane_width(img, plane) *
+ ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+ const int h = aom_img_plane_height(img, plane);
+ image_size_bytes += w * h;
+ }
+ return image_size_bytes;
+}
+
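+// Runs the encoder in first-pass mode for one frame and appends any emitted
+// two-pass statistics packets to the stats buffer.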
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+ aom_codec_pts_t pts, unsigned int duration,
+ aom_enc_frame_flags_t flags,
+ aom_fixed_buf_t *stats) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+ if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
+
+ while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+ got_pkts = 1;
+
+ if (pkt->kind == AOM_CODEC_STATS_PKT) {
+ const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
+ const size_t pkt_size = pkt->data.twopass_stats.sz;
+ stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+ memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
+ stats->sz += pkt_size;
+ }
+ }
+
+ return got_pkts;
+}
+
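+// Encodes one frame in the final pass and writes any compressed frame packets
+// to the ivf writer; prints "K" for key frames and "." otherwise.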
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+ aom_codec_pts_t pts, unsigned int duration,
+ aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+ if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
+
+ while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+ got_pkts = 1;
+ if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+ if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+ pkt->data.frame.sz,
+ pkt->data.frame.pts))
+ die_codec(ctx, "Failed to write compressed frame.");
+ printf(keyframe ? "K" : ".");
+ fflush(stdout);
+ }
+ }
+
+ return got_pkts;
+}
+
+static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw,
+ aom_image_t *raw_shift) {
+ if (!CONFIG_LOWBITDEPTH) {
+    // Need to allocate a larger buffer to use the high bit depth internal path.
+ int input_shift = 0;
+ aom_img_upshift(raw_shift, raw, input_shift);
+ *frame_to_encode = raw_shift;
+ } else {
+ *frame_to_encode = raw;
+ }
+}
+
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+ const AvxInterface *encoder,
+ const aom_codec_enc_cfg_t *cfg, int lf_width,
+ int lf_height, int lf_blocksize, int flags,
+ aom_image_t *raw_shift) {
+ aom_codec_ctx_t codec;
+ int frame_count = 0;
+ int image_size_bytes = aom_img_size_bytes(raw);
+ int u_blocks, v_blocks;
+ int bu, bv;
+ aom_fixed_buf_t stats = { NULL, 0 };
+ aom_image_t *frame_to_encode;
+
+ if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
+ die_codec(&codec, "Failed to initialize encoder");
+ if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
+ die_codec(&codec, "Failed to turn off auto altref");
+ if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
+ die_codec(&codec, "Failed to set frame parallel decoding");
+
+ // How many reference images we need to encode.
+ u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
+ v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
+
+ printf("\n First pass: ");
+
+ for (bv = 0; bv < v_blocks; ++bv) {
+ for (bu = 0; bu < u_blocks; ++bu) {
+ const int block_u_min = bu * lf_blocksize;
+ const int block_v_min = bv * lf_blocksize;
+ int block_u_end = (bu + 1) * lf_blocksize;
+ int block_v_end = (bv + 1) * lf_blocksize;
+ int u_block_size, v_block_size;
+ int block_ref_u, block_ref_v;
+
+ block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+ block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+ u_block_size = block_u_end - block_u_min;
+ v_block_size = block_v_end - block_v_min;
+ block_ref_u = block_u_min + u_block_size / 2;
+ block_ref_v = block_v_min + v_block_size / 2;
+
+ printf("A%d, ", (block_ref_u + block_ref_v * lf_width));
+ fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes,
+ SEEK_SET);
+ aom_img_read(raw, infile);
+ get_raw_image(&frame_to_encode, raw, raw_shift);
+
+      // Reference frames can be encoded without tiles.
+ ++frame_count;
+ get_frame_stats(&codec, frame_to_encode, frame_count, 1,
+ AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+ AOM_EFLAG_NO_UPD_ARF,
+ &stats);
+ }
+ }
+
+ if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
+ die_codec(&codec, "Failed to set frame parallel decoding");
+
+ for (bv = 0; bv < v_blocks; ++bv) {
+ for (bu = 0; bu < u_blocks; ++bu) {
+ const int block_u_min = bu * lf_blocksize;
+ const int block_v_min = bv * lf_blocksize;
+ int block_u_end = (bu + 1) * lf_blocksize;
+ int block_v_end = (bv + 1) * lf_blocksize;
+ int u, v;
+ block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+ block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+ for (v = block_v_min; v < block_v_end; ++v) {
+ for (u = block_u_min; u < block_u_end; ++u) {
+ printf("C%d, ", (u + v * lf_width));
+ fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET);
+ aom_img_read(raw, infile);
+ get_raw_image(&frame_to_encode, raw, raw_shift);
+
+ ++frame_count;
+ get_frame_stats(&codec, frame_to_encode, frame_count, 1,
+ AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+ AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
+ &stats);
+ }
+ }
+ }
+ }
+  // Flush the encoder.
+  // Since no ARF frames are used, this should not be needed.
+ while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) {
+ }
+
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ printf("\nFirst pass complete. Processed %d frames.\n", frame_count);
+
+ return stats;
+}
+
+static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
+ const AvxInterface *encoder, aom_codec_enc_cfg_t *cfg,
+ int lf_width, int lf_height, int lf_blocksize, int flags,
+ aom_image_t *raw_shift) {
+ AvxVideoInfo info = { encoder->fourcc,
+ cfg->g_w,
+ cfg->g_h,
+ { cfg->g_timebase.num, cfg->g_timebase.den },
+ 0 };
+ AvxVideoWriter *writer = NULL;
+ aom_codec_ctx_t codec;
+ int frame_count = 0;
+ int image_size_bytes = aom_img_size_bytes(raw);
+ int bu, bv;
+ int u_blocks, v_blocks;
+ aom_image_t *frame_to_encode;
+ aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+ int reference_image_num = 0;
+ int i;
+
+ writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
+ if (!writer) die("Failed to open %s for writing", outfile_name);
+
+ if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
+ die_codec(&codec, "Failed to initialize encoder");
+ if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
+ die_codec(&codec, "Failed to turn off auto altref");
+ if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
+ die_codec(&codec, "Failed to set frame parallel decoding");
+  // Note: The superblock size is a sequence-level parameter and has to be the
+  // same for the whole sequence. In the lightfield application, the superblock
+  // size (either 64x64 or 128x128) must be chosen before encoding starts.
+  // Otherwise, the default is AOM_SUPERBLOCK_SIZE_DYNAMIC, and the superblock
+  // size will be set to 64x64 internally.
+ if (aom_codec_control(&codec, AV1E_SET_SUPERBLOCK_SIZE,
+ AOM_SUPERBLOCK_SIZE_64X64))
+ die_codec(&codec, "Failed to set SB size");
+
+ u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
+ v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
+
+ reference_image_num = u_blocks * v_blocks;
+ aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
+ if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+ // Allocate memory with the border so that it can be used as a reference.
+ for (i = 0; i < reference_image_num; i++) {
+ if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w,
+ cfg->g_h, 32, 8, AOM_BORDER_IN_PIXELS)) {
+ die("Failed to allocate image.");
+ }
+ }
+
+ printf("\n Second pass: ");
+
+ // Encode reference images first.
+ printf("Encoding Reference Images\n");
+ for (bv = 0; bv < v_blocks; ++bv) {
+ for (bu = 0; bu < u_blocks; ++bu) {
+ const int block_u_min = bu * lf_blocksize;
+ const int block_v_min = bv * lf_blocksize;
+ int block_u_end = (bu + 1) * lf_blocksize;
+ int block_v_end = (bv + 1) * lf_blocksize;
+ int u_block_size, v_block_size;
+ int block_ref_u, block_ref_v;
+
+ block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+ block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+ u_block_size = block_u_end - block_u_min;
+ v_block_size = block_v_end - block_v_min;
+ block_ref_u = block_u_min + u_block_size / 2;
+ block_ref_v = block_v_min + v_block_size / 2;
+
+ printf("A%d, ", (block_ref_u + block_ref_v * lf_width));
+ fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes,
+ SEEK_SET);
+ aom_img_read(raw, infile);
+
+ get_raw_image(&frame_to_encode, raw, raw_shift);
+
+ // Reference frames may be encoded without tiles.
+ ++frame_count;
+ printf("Encoding reference image %d of %d\n", bv * u_blocks + bu,
+ u_blocks * v_blocks);
+ encode_frame(&codec, frame_to_encode, frame_count, 1,
+ AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+ AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
+ writer);
+
+ if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+ &reference_images[frame_count - 1]))
+ die_codec(&codec, "Failed to copy decoder reference frame");
+ }
+ }
+
+ cfg->large_scale_tile = 1;
+ // Fixed q encoding for camera frames.
+ cfg->rc_end_usage = AOM_Q;
+ if (aom_codec_enc_config_set(&codec, cfg))
+ die_codec(&codec, "Failed to configure encoder");
+
+ // The fixed q value used in encoding.
+ if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 36))
+ die_codec(&codec, "Failed to set cq level");
+ if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
+ die_codec(&codec, "Failed to set frame parallel decoding");
+ if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1))
+ die_codec(&codec, "Failed to turn on single tile decoding");
+  // Set tile_columns and tile_rows to their MAX values, which guarantees a
+  // tile size of 64 x 64 pixels (i.e. 1 SB) for resolutions up to 4k.
+ if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 6))
+ die_codec(&codec, "Failed to set tile width");
+ if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 6))
+ die_codec(&codec, "Failed to set tile height");
+
+ for (bv = 0; bv < v_blocks; ++bv) {
+ for (bu = 0; bu < u_blocks; ++bu) {
+ const int block_u_min = bu * lf_blocksize;
+ const int block_v_min = bv * lf_blocksize;
+ int block_u_end = (bu + 1) * lf_blocksize;
+ int block_v_end = (bv + 1) * lf_blocksize;
+ int u, v;
+ block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+ block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+ for (v = block_v_min; v < block_v_end; ++v) {
+ for (u = block_u_min; u < block_u_end; ++u) {
+ av1_ref_frame_t ref;
+ ref.idx = 0;
+ ref.use_external_ref = 1;
+ ref.img = reference_images[bv * u_blocks + bu];
+ if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref))
+ die_codec(&codec, "Failed to set reference frame");
+
+ printf("C%d, ", (u + v * lf_width));
+ fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET);
+ aom_img_read(raw, infile);
+ get_raw_image(&frame_to_encode, raw, raw_shift);
+
+ ++frame_count;
+ printf("Encoding image %d of %d\n",
+ frame_count - (u_blocks * v_blocks), lf_width * lf_height);
+ encode_frame(&codec, frame_to_encode, frame_count, 1,
+ AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+ AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY,
+ writer);
+ }
+ }
+ }
+ }
+
+  // Flush the encoder.
+  // Since no ARF frames are used, this should not be needed.
+ while (encode_frame(&codec, NULL, -1, 1, 0, writer)) {
+ }
+
+ for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]);
+
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+ aom_video_writer_close(writer);
+
+ printf("\nSecond pass complete. Processed %d frames.\n", frame_count);
+}
+
+int main(int argc, char **argv) {
+ FILE *infile = NULL;
+ int w, h;
+ // The number of lightfield images in the u and v dimensions.
+ int lf_width, lf_height;
+ // Defines how many images refer to the same reference image for MCP.
+ // lf_blocksize X lf_blocksize images will all use the reference image
+ // in the middle of the block of images.
+ int lf_blocksize;
+ aom_codec_ctx_t codec;
+ aom_codec_enc_cfg_t cfg;
+ aom_image_t raw;
+ aom_image_t raw_shift;
+ aom_codec_err_t res;
+ aom_fixed_buf_t stats;
+ int flags = 0;
+
+ const AvxInterface *encoder = NULL;
+ const int fps = 30;
+ const int bitrate = 200; // kbit/s
+ const char *const width_arg = argv[1];
+ const char *const height_arg = argv[2];
+ const char *const infile_arg = argv[3];
+ const char *const outfile_arg = argv[4];
+ const char *const lf_width_arg = argv[5];
+ const char *const lf_height_arg = argv[6];
+ const char *lf_blocksize_arg = argv[7];
+ exec_name = argv[0];
+
+ if (argc < 8) die("Invalid number of arguments");
+
+ encoder = get_aom_encoder_by_name("av1");
+ if (!encoder) die("Unsupported codec.");
+
+ w = (int)strtol(width_arg, NULL, 0);
+ h = (int)strtol(height_arg, NULL, 0);
+ lf_width = (int)strtol(lf_width_arg, NULL, 0);
+ lf_height = (int)strtol(lf_height_arg, NULL, 0);
+ lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0);
+ lf_blocksize = lf_blocksize < lf_width ? lf_blocksize : lf_width;
+ lf_blocksize = lf_blocksize < lf_height ? lf_blocksize : lf_height;
+
+ if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+ die("Invalid frame size: %dx%d", w, h);
+ if (lf_width <= 0 || lf_height <= 0)
+ die("Invalid lf_width and/or lf_height: %dx%d", lf_width, lf_height);
+ if (lf_blocksize <= 0) die("Invalid lf_blocksize: %d", lf_blocksize);
+
+ if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 32)) {
+ die("Failed to allocate image.");
+ }
+ if (!CONFIG_LOWBITDEPTH) {
+    // Need to allocate a larger buffer to use the high bit depth internal path.
+ aom_img_alloc(&raw_shift, AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, w, h,
+ 32);
+ }
+
+ printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+ // Configuration
+ res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+ if (res) die_codec(&codec, "Failed to get default codec config.");
+
+ cfg.g_w = w;
+ cfg.g_h = h;
+ cfg.g_timebase.num = 1;
+ cfg.g_timebase.den = fps;
+ cfg.rc_target_bitrate = bitrate;
+ cfg.g_error_resilient = 0; // This is required.
+  cfg.g_lag_in_frames = 0;  // Need to set this since the default is 19.
+ cfg.kf_mode = AOM_KF_DISABLED;
+ cfg.large_scale_tile = 0; // Only set it to 1 for camera frame encoding.
+ cfg.g_bit_depth = AOM_BITS_8;
+ flags |= (cfg.g_bit_depth > AOM_BITS_8 || !CONFIG_LOWBITDEPTH)
+ ? AOM_CODEC_USE_HIGHBITDEPTH
+ : 0;
+
+ if (!(infile = fopen(infile_arg, "rb")))
+ die("Failed to open %s for reading", infile_arg);
+
+ // Pass 0
+ cfg.g_pass = AOM_RC_FIRST_PASS;
+ stats = pass0(&raw, infile, encoder, &cfg, lf_width, lf_height, lf_blocksize,
+ flags, &raw_shift);
+
+ // Pass 1
+ rewind(infile);
+ cfg.g_pass = AOM_RC_LAST_PASS;
+ cfg.rc_twopass_stats_in = stats;
+ pass1(&raw, infile, outfile_arg, encoder, &cfg, lf_width, lf_height,
+ lf_blocksize, flags, &raw_shift);
+ free(stats.buf);
+
+ if (!CONFIG_LOWBITDEPTH) aom_img_free(&raw_shift);
+ aom_img_free(&raw);
+ fclose(infile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lightfield_tile_list_decoder.c b/third_party/aom/examples/lightfield_tile_list_decoder.c
new file mode 100644
index 000000000..5556bf0e7
--- /dev/null
+++ b/third_party/aom/examples/lightfield_tile_list_decoder.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Tile List Decoder
+// ============================
+//
+// This is a lightfield tile list decoder example. It takes an input file that
+// contains the anchor frames used as references for the coded tiles, the
+// camera frame header, and tile list OBUs that carry the tile information and
+// the compressed tile data. This input file is reconstructed from the encoded
+// lightfield ivf file, and is decodable by an AV1 decoder. num_references is
+// the number of anchor frames coded at the beginning of the light field file.
+// num_tile_lists is the number of tile lists that need to be decoded.
+// Run lightfield tile list decoder to decode an AV1 tile list file:
+// examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv
+// 4 2
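+// Each decoded tile list is returned by the decoder as one output frame
+// assembled from the listed tiles; its raw data is appended to <outfile>.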
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr,
+ "Usage: %s <infile> <outfile> <num_references> <num_tile_lists>\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv) {
+ FILE *outfile = NULL;
+ aom_codec_ctx_t codec;
+ AvxVideoReader *reader = NULL;
+ const AvxInterface *decoder = NULL;
+ const AvxVideoInfo *info = NULL;
+ int num_references;
+ int num_tile_lists;
+ aom_image_t reference_images[MAX_EXTERNAL_REFERENCES];
+ size_t frame_size = 0;
+ const unsigned char *frame = NULL;
+ int i, j, n;
+
+ exec_name = argv[0];
+
+ if (argc != 5) die("Invalid number of arguments.");
+
+ reader = aom_video_reader_open(argv[1]);
+ if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+ if (!(outfile = fopen(argv[2], "wb")))
+ die("Failed to open %s for writing.", argv[2]);
+
+ num_references = (int)strtol(argv[3], NULL, 0);
+ num_tile_lists = (int)strtol(argv[4], NULL, 0);
+
+ info = aom_video_reader_get_info(reader);
+
+ decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) die("Unknown input codec.");
+ printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+ die_codec(&codec, "Failed to initialize decoder.");
+
+ if (aom_codec_control(&codec, AV1D_SET_IS_ANNEXB, info->is_annexb)) {
+ die("Failed to set annex b status");
+ }
+
+ // Decode anchor frames.
+ aom_codec_control_(&codec, AV1_SET_TILE_MODE, 0);
+ for (i = 0; i < num_references; ++i) {
+ aom_video_reader_read_frame(reader);
+ frame = aom_video_reader_get_frame(reader, &frame_size);
+ if (aom_codec_decode(&codec, frame, frame_size, NULL))
+ die_codec(&codec, "Failed to decode frame.");
+
+ if (i == 0) {
+ aom_img_fmt_t ref_fmt = 0;
+ if (aom_codec_control(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt))
+ die_codec(&codec, "Failed to get the image format");
+
+ int frame_res[2];
+ if (aom_codec_control(&codec, AV1D_GET_FRAME_SIZE, frame_res))
+ die_codec(&codec, "Failed to get the image frame size");
+
+ // Allocate memory to store decoded references. Allocate memory with the
+ // border so that it can be used as a reference.
+ for (j = 0; j < num_references; j++) {
+ unsigned int border = AOM_BORDER_IN_PIXELS;
+ if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
+ frame_res[0], frame_res[1], 32, 8,
+ border)) {
+ die("Failed to allocate references.");
+ }
+ }
+ }
+
+ if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE,
+ &reference_images[i]))
+ die_codec(&codec, "Failed to copy decoded reference frame");
+
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img = NULL;
+ while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+ char name[1024];
+ snprintf(name, sizeof(name), "ref_%d.yuv", i);
+ printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h);
+ FILE *ref_file = fopen(name, "wb");
+ aom_img_write(img, ref_file);
+ fclose(ref_file);
+ }
+ }
+
+ // Decode the lightfield.
+ aom_codec_control_(&codec, AV1_SET_TILE_MODE, 1);
+
+ // Set external references.
+ av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references };
+ aom_codec_control_(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref);
+ // Must decode the camera frame header first.
+ aom_video_reader_read_frame(reader);
+ frame = aom_video_reader_get_frame(reader, &frame_size);
+ if (aom_codec_decode(&codec, frame, frame_size, NULL))
+ die_codec(&codec, "Failed to decode the frame.");
+ // Decode tile lists one by one.
+ for (n = 0; n < num_tile_lists; n++) {
+ aom_video_reader_read_frame(reader);
+ frame = aom_video_reader_get_frame(reader, &frame_size);
+
+ if (aom_codec_decode(&codec, frame, frame_size, NULL))
+ die_codec(&codec, "Failed to decode the tile list.");
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img;
+ while ((img = aom_codec_get_frame(&codec, &iter)))
+ fwrite(img->img_data, 1, img->sz, outfile);
+ }
+
+ for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+ aom_video_reader_close(reader);
+ fclose(outfile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/lossless_encoder.c b/third_party/aom/examples/lossless_encoder.c
new file mode 100644
index 000000000..438ff21c6
--- /dev/null
+++ b/third_party/aom/examples/lossless_encoder.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr,
+ "lossless_encoder: Example demonstrating lossless "
+ "encoding feature. Supports raw input only.\n");
+ fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+ int frame_index, int flags, AvxVideoWriter *writer) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ const aom_codec_err_t res =
+ aom_codec_encode(codec, img, frame_index, 1, flags);
+ if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
+
+ while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
+ got_pkts = 1;
+
+ if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+ if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+ pkt->data.frame.sz,
+ pkt->data.frame.pts)) {
+ die_codec(codec, "Failed to write compressed frame");
+ }
+ printf(keyframe ? "K" : ".");
+ fflush(stdout);
+ }
+ }
+
+ return got_pkts;
+}
+
+int main(int argc, char **argv) {
+ FILE *infile = NULL;
+ aom_codec_ctx_t codec;
+ aom_codec_enc_cfg_t cfg;
+ int frame_count = 0;
+ aom_image_t raw;
+ aom_codec_err_t res;
+ AvxVideoInfo info;
+ AvxVideoWriter *writer = NULL;
+ const AvxInterface *encoder = NULL;
+ const int fps = 30;
+
+ exec_name = argv[0];
+
+  // Clear explicitly, as simply assigning "{ 0 }" generates a
+  // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info));
+
+ if (argc < 5) die("Invalid number of arguments");
+
+ encoder = get_aom_encoder_by_name("av1");
+ if (!encoder) die("Unsupported codec.");
+
+ info.codec_fourcc = encoder->fourcc;
+ info.frame_width = (int)strtol(argv[1], NULL, 0);
+ info.frame_height = (int)strtol(argv[2], NULL, 0);
+ info.time_base.numerator = 1;
+ info.time_base.denominator = fps;
+
+ if (info.frame_width <= 0 || info.frame_height <= 0 ||
+ (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+ die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+ }
+
+ if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width,
+ info.frame_height, 1)) {
+ die("Failed to allocate image.");
+ }
+
+ printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+ res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+ if (res) die_codec(&codec, "Failed to get default codec config.");
+
+ cfg.g_w = info.frame_width;
+ cfg.g_h = info.frame_height;
+ cfg.g_timebase.num = info.time_base.numerator;
+ cfg.g_timebase.den = info.time_base.denominator;
+
+ writer = aom_video_writer_open(argv[4], kContainerIVF, &info);
+ if (!writer) die("Failed to open %s for writing.", argv[4]);
+
+ if (!(infile = fopen(argv[3], "rb")))
+ die("Failed to open %s for reading.", argv[3]);
+
+ if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ if (aom_codec_control_(&codec, AV1E_SET_LOSSLESS, 1))
+ die_codec(&codec, "Failed to use lossless mode");
+
+ // Encode frames.
+ while (aom_img_read(&raw, infile)) {
+ encode_frame(&codec, &raw, frame_count++, 0, writer);
+ }
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, 0, writer)) {
+ }
+
+ printf("\n");
+ fclose(infile);
+ printf("Processed %d frames.\n", frame_count);
+
+ aom_img_free(&raw);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ aom_video_writer_close(writer);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/noise_model.c b/third_party/aom/examples/noise_model.c
new file mode 100644
index 000000000..5cc6003b6
--- /dev/null
+++ b/third_party/aom/examples/noise_model.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This is a sample binary to create noise params from an input video.
+ *
+ * To allow for external denoising applications, this sample binary illustrates
+ * how to create a film grain table (film grain params as a function of time)
+ * from an input video and its corresponding denoised source.
+ *
+ * The --output-grain-table file can be passed as input to the encoder (in
+ * aomenc this is done through the "--film-grain-table" parameter).
+ *
+ * As an example, where the input source is an 854x480 yuv420p 8-bit video
+ * named "input.854_480.yuv" you would use steps similar to the following:
+ *
+ *   # Run your denoiser (e.g., using the hqdn3d filter):
+ * ffmpeg -vcodec rawvideo -video_size 854x480 -i input.854_480.yuv \
+ * -vf hqdn3d=5:5:5:5 -vcodec rawvideo -an -f rawvideo \
+ * denoised.854_480.yuv
+ *
+ * # Model the noise between the denoised version and original source:
+ * ./examples/noise_model --fps=25/1 --width=854 --height=480 --i420 \
+ * --input-denoised=denoised.854_480.yuv --input=original.854_480.yuv \
+ * --output-grain-table=film_grain.tbl
+ *
+ * # Encode with your favorite settings (including the grain table):
+ * aomenc --limit=100 --cpu-used=4 --input-bit-depth=8 \
+ * --i420 -w 854 -h 480 --end-usage=q --cq-level=25 --lag-in-frames=25 \
+ * --auto-alt-ref=2 --bit-depth=8 --film-grain-table=film_grain.tbl \
+ * -o denoised_with_grain_params.ivf denoised.854_480.yuv
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#if CONFIG_AV1_DECODER
+#include "aom_dsp/grain_synthesis.h"
+#endif
+
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr,
+ "Usage: %s --input=<input> --input-denoised=<denoised> "
+          "--output-grain-table=<outfile>\n"
+ "See comments in noise_model.c for more information.\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+static const arg_def_t help =
+ ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t width_arg =
+ ARG_DEF("w", "width", 1, "Input width (if rawvideo)");
+static const arg_def_t height_arg =
+ ARG_DEF("h", "height", 1, "Input height (if rawvideo)");
+static const arg_def_t skip_frames_arg =
+ ARG_DEF("s", "skip-frames", 1, "Number of frames to skip (default = 1)");
+static const arg_def_t fps_arg = ARG_DEF(NULL, "fps", 1, "Frame rate");
+static const arg_def_t input_arg = ARG_DEF("-i", "input", 1, "Input filename");
+static const arg_def_t output_grain_table_arg =
+ ARG_DEF("n", "output-grain-table", 1, "Output noise file");
+static const arg_def_t input_denoised_arg =
+    ARG_DEF("d", "input-denoised", 1, "Input denoised filename (YUV only)");
+static const arg_def_t flat_block_finder_arg =
+ ARG_DEF("b", "flat-block-finder", 1, "Run the flat block finder");
+static const arg_def_t block_size_arg =
+ ARG_DEF("b", "block-size", 1, "Block size");
+static const arg_def_t bit_depth_arg =
+ ARG_DEF(NULL, "bit-depth", 1, "Bit depth of input");
+static const arg_def_t use_i420 =
+ ARG_DEF(NULL, "i420", 0, "Input file (and denoised) is I420 (default)");
+static const arg_def_t use_i422 =
+ ARG_DEF(NULL, "i422", 0, "Input file (and denoised) is I422");
+static const arg_def_t use_i444 =
+ ARG_DEF(NULL, "i444", 0, "Input file (and denoised) is I444");
+static const arg_def_t debug_file_arg =
+ ARG_DEF(NULL, "debug-file", 1, "File to output debug info");
+
+typedef struct {
+ int width;
+ int height;
+ struct aom_rational fps;
+ const char *input;
+ const char *input_denoised;
+ const char *output_grain_table;
+ int img_fmt;
+ int block_size;
+ int bit_depth;
+ int run_flat_block_finder;
+ int force_flat_psd;
+ int skip_frames;
+ const char *debug_file;
+} noise_model_args_t;
+
+void parse_args(noise_model_args_t *noise_args, int *argc, char **argv) {
+ struct arg arg;
+ static const arg_def_t *main_args[] = { &help,
+ &input_arg,
+ &fps_arg,
+ &width_arg,
+ &height_arg,
+ &block_size_arg,
+ &output_grain_table_arg,
+ &input_denoised_arg,
+ &use_i420,
+ &use_i422,
+ &use_i444,
+ &debug_file_arg,
+ NULL };
+ for (int argi = *argc + 1; *argv; argi++, argv++) {
+ if (arg_match(&arg, &help, argv)) {
+ fprintf(stdout, "\nOptions:\n");
+ arg_show_usage(stdout, main_args);
+ exit(0);
+ } else if (arg_match(&arg, &width_arg, argv)) {
+ noise_args->width = atoi(arg.val);
+ } else if (arg_match(&arg, &height_arg, argv)) {
+ noise_args->height = atoi(arg.val);
+ } else if (arg_match(&arg, &input_arg, argv)) {
+ noise_args->input = arg.val;
+ } else if (arg_match(&arg, &input_denoised_arg, argv)) {
+ noise_args->input_denoised = arg.val;
+ } else if (arg_match(&arg, &output_grain_table_arg, argv)) {
+ noise_args->output_grain_table = arg.val;
+ } else if (arg_match(&arg, &block_size_arg, argv)) {
+ noise_args->block_size = atoi(arg.val);
+ } else if (arg_match(&arg, &bit_depth_arg, argv)) {
+ noise_args->bit_depth = atoi(arg.val);
+ } else if (arg_match(&arg, &flat_block_finder_arg, argv)) {
+ noise_args->run_flat_block_finder = atoi(arg.val);
+ } else if (arg_match(&arg, &fps_arg, argv)) {
+ noise_args->fps = arg_parse_rational(&arg);
+ } else if (arg_match(&arg, &use_i420, argv)) {
+ noise_args->img_fmt = AOM_IMG_FMT_I420;
+ } else if (arg_match(&arg, &use_i422, argv)) {
+ noise_args->img_fmt = AOM_IMG_FMT_I422;
+ } else if (arg_match(&arg, &use_i444, argv)) {
+ noise_args->img_fmt = AOM_IMG_FMT_I444;
+ } else if (arg_match(&arg, &skip_frames_arg, argv)) {
+ noise_args->skip_frames = atoi(arg.val);
+ } else if (arg_match(&arg, &debug_file_arg, argv)) {
+ noise_args->debug_file = arg.val;
+ } else {
+ fprintf(stdout, "Unknown arg: %s\n\nUsage:\n", *argv);
+ arg_show_usage(stdout, main_args);
+ exit(0);
+ }
+ }
+ if (noise_args->bit_depth > 8) {
+ noise_args->img_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+ }
+}
+
+#if CONFIG_AV1_DECODER
+static void print_variance_y(FILE *debug_file, aom_image_t *raw,
+ aom_image_t *denoised, const uint8_t *flat_blocks,
+ int block_size, aom_film_grain_t *grain) {
+ aom_image_t renoised;
+ grain->apply_grain = 1;
+ grain->random_seed = 7391;
+ aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1);
+
+ if (av1_add_film_grain(grain, denoised, &renoised)) {
+ fprintf(stderr, "Internal failure in av1_add_film_grain().\n");
+ aom_img_free(&renoised);
+ return;
+ }
+
+ const int num_blocks_w = (raw->w + block_size - 1) / block_size;
+ const int num_blocks_h = (raw->h + block_size - 1) / block_size;
+ fprintf(debug_file, "x = [");
+ for (int by = 0; by < num_blocks_h; by++) {
+ for (int bx = 0; bx < num_blocks_w; bx++) {
+ double block_mean = 0;
+ double noise_std = 0, noise_mean = 0;
+ double renoise_std = 0, renoise_mean = 0;
+ for (int yi = 0; yi < block_size; ++yi) {
+ const int y = by * block_size + yi;
+ for (int xi = 0; xi < block_size; ++xi) {
+ const int x = bx * block_size + xi;
+ const double noise_v = (raw->planes[0][y * raw->stride[0] + x] -
+ denoised->planes[0][y * raw->stride[0] + x]);
+ noise_mean += noise_v;
+ noise_std += noise_v * noise_v;
+
+ block_mean += raw->planes[0][y * raw->stride[0] + x];
+
+ const double renoise_v =
+ (renoised.planes[0][y * raw->stride[0] + x] -
+ denoised->planes[0][y * raw->stride[0] + x]);
+ renoise_mean += renoise_v;
+ renoise_std += renoise_v * renoise_v;
+ }
+ }
+ int n = (block_size * block_size);
+ block_mean /= n;
+ noise_mean /= n;
+ renoise_mean /= n;
+ noise_std = sqrt(noise_std / n - noise_mean * noise_mean);
+ renoise_std = sqrt(renoise_std / n - renoise_mean * renoise_mean);
+ fprintf(debug_file, "%d %3.2lf %3.2lf %3.2lf ",
+ flat_blocks[by * num_blocks_w + bx], block_mean, noise_std,
+ renoise_std);
+ }
+ fprintf(debug_file, "\n");
+ }
+ fprintf(debug_file, "];\n");
+
+ if (raw->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ fprintf(stderr,
+ "Detailed debug info not supported for high bit"
+ "depth formats\n");
+ } else {
+ fprintf(debug_file, "figure(2); clf;\n");
+ fprintf(debug_file,
+ "scatter(x(:, 2:4:end), x(:, 3:4:end), 'r'); hold on;\n");
+ fprintf(debug_file, "scatter(x(:, 2:4:end), x(:, 4:4:end), 'b');\n");
+ fprintf(debug_file,
+ "plot(linspace(0, 255, length(noise_strength_0)), "
+ "noise_strength_0, 'b');\n");
+ fprintf(debug_file,
+ "title('Scatter plot of intensity vs noise strength');\n");
+ fprintf(debug_file,
+ "legend('Actual', 'Estimated', 'Estimated strength');\n");
+ fprintf(debug_file, "figure(3); clf;\n");
+ fprintf(debug_file, "scatter(x(:, 3:4:end), x(:, 4:4:end), 'k');\n");
+ fprintf(debug_file, "title('Actual vs Estimated');\n");
+ fprintf(debug_file, "pause(3);\n");
+ }
+ aom_img_free(&renoised);
+}
+#endif
+
+static void print_debug_info(FILE *debug_file, aom_image_t *raw,
+ aom_image_t *denoised, uint8_t *flat_blocks,
+ int block_size, aom_noise_model_t *noise_model) {
+ (void)raw;
+ (void)denoised;
+ (void)flat_blocks;
+ (void)block_size;
+ fprintf(debug_file, "figure(3); clf;\n");
+ fprintf(debug_file, "figure(2); clf;\n");
+ fprintf(debug_file, "figure(1); clf;\n");
+ for (int c = 0; c < 3; ++c) {
+ fprintf(debug_file, "noise_strength_%d = [\n", c);
+ const aom_equation_system_t *eqns =
+ &noise_model->combined_state[c].strength_solver.eqns;
+ for (int k = 0; k < eqns->n; ++k) {
+ fprintf(debug_file, "%lf ", eqns->x[k]);
+ }
+ fprintf(debug_file, "];\n");
+ fprintf(debug_file, "plot(noise_strength_%d); hold on;\n", c);
+ }
+ fprintf(debug_file, "legend('Y', 'cb', 'cr');\n");
+ fprintf(debug_file, "title('Noise strength function');\n");
+
+#if CONFIG_AV1_DECODER
+ aom_film_grain_t grain;
+ aom_noise_model_get_grain_parameters(noise_model, &grain);
+ print_variance_y(debug_file, raw, denoised, flat_blocks, block_size, &grain);
+#endif
+ fflush(debug_file);
+}
+
+int main(int argc, char *argv[]) {
+ noise_model_args_t args = { 0, 0, { 25, 1 }, 0, 0, 0, AOM_IMG_FMT_I420,
+ 32, 8, 1, 0, 1, NULL };
+ aom_image_t raw, denoised;
+ FILE *infile = NULL;
+ AvxVideoInfo info;
+
+ memset(&info, 0, sizeof(info));
+
+ exec_name = argv[0];
+ parse_args(&args, &argc, argv + 1);
+
+ info.frame_width = args.width;
+ info.frame_height = args.height;
+ info.time_base.numerator = args.fps.den;
+ info.time_base.denominator = args.fps.num;
+
+ if (info.frame_width <= 0 || info.frame_height <= 0 ||
+ (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+ die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+ }
+ if (!aom_img_alloc(&raw, args.img_fmt, info.frame_width, info.frame_height,
+ 1)) {
+ die("Failed to allocate image.");
+ }
+ if (!aom_img_alloc(&denoised, args.img_fmt, info.frame_width,
+ info.frame_height, 1)) {
+ die("Failed to allocate image.");
+ }
+  infile = fopen(args.input, "rb");
+  if (!infile) {
+    die("Failed to open input file: %s", args.input);
+  }
+ fprintf(stderr, "Bit depth: %d stride:%d\n", args.bit_depth, raw.stride[0]);
+
+ const int high_bd = args.bit_depth > 8;
+ const int block_size = args.block_size;
+ aom_flat_block_finder_t block_finder;
+ aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth,
+ high_bd);
+
+ const int num_blocks_w = (info.frame_width + block_size - 1) / block_size;
+ const int num_blocks_h = (info.frame_height + block_size - 1) / block_size;
+ uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h);
+ // Sets the random seed on the first entry in the output table
+ int16_t random_seed = 7391;
+ aom_noise_model_t noise_model;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth,
+ high_bd };
+ aom_noise_model_init(&noise_model, params);
+
+ FILE *denoised_file = 0;
+ if (args.input_denoised) {
+ denoised_file = fopen(args.input_denoised, "rb");
+ if (!denoised_file)
+ die("Unable to open input_denoised: %s", args.input_denoised);
+ } else {
+ die("--input-denoised file must be specified");
+ }
+ FILE *debug_file = 0;
+ if (args.debug_file) {
+ debug_file = fopen(args.debug_file, "w");
+ }
+ aom_film_grain_table_t grain_table = { 0, 0 };
+
+ int64_t prev_timestamp = 0;
+ int frame_count = 0;
+ while (aom_img_read(&raw, infile)) {
+ if (args.input_denoised) {
+ if (!aom_img_read(&denoised, denoised_file)) {
+ die("Unable to read input denoised file");
+ }
+ }
+ if (frame_count % args.skip_frames == 0) {
+ int num_flat_blocks = num_blocks_w * num_blocks_h;
+ memset(flat_blocks, 1, num_flat_blocks);
+ if (args.run_flat_block_finder) {
+ memset(flat_blocks, 0, num_flat_blocks);
+ num_flat_blocks = aom_flat_block_finder_run(
+ &block_finder, raw.planes[0], info.frame_width, info.frame_height,
+ info.frame_width, flat_blocks);
+ fprintf(stdout, "Num flat blocks %d\n", num_flat_blocks);
+ }
+
+ const uint8_t *planes[3] = { raw.planes[0], raw.planes[1],
+ raw.planes[2] };
+ uint8_t *denoised_planes[3] = { denoised.planes[0], denoised.planes[1],
+ denoised.planes[2] };
+ int strides[3] = { raw.stride[0] >> high_bd, raw.stride[1] >> high_bd,
+ raw.stride[2] >> high_bd };
+ int chroma_sub[3] = { raw.x_chroma_shift, raw.y_chroma_shift, 0 };
+
+ fprintf(stdout, "Updating noise model...\n");
+ aom_noise_status_t status = aom_noise_model_update(
+ &noise_model, (const uint8_t *const *)planes,
+ (const uint8_t *const *)denoised_planes, info.frame_width,
+ info.frame_height, strides, chroma_sub, flat_blocks, block_size);
+
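+      // Grain table timestamps are kept in 10 MHz ticks:
+      // (frame_count / fps) seconds * 10,000,000.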
+ int64_t cur_timestamp =
+ frame_count * 10000000ULL * args.fps.den / args.fps.num;
+ if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+ fprintf(stdout,
+ "Noise type is different, updating parameters for time "
+ "[ %" PRId64 ", %" PRId64 ")\n",
+ prev_timestamp, cur_timestamp);
+ aom_film_grain_t grain;
+ aom_noise_model_get_grain_parameters(&noise_model, &grain);
+ grain.random_seed = random_seed;
+ random_seed = 0;
+ aom_film_grain_table_append(&grain_table, prev_timestamp, cur_timestamp,
+ &grain);
+ aom_noise_model_save_latest(&noise_model);
+ prev_timestamp = cur_timestamp;
+ }
+ if (debug_file) {
+ print_debug_info(debug_file, &raw, &denoised, flat_blocks, block_size,
+ &noise_model);
+ }
+ fprintf(stdout, "Done noise model update, status = %d\n", status);
+ }
+ frame_count++;
+ }
+
+ aom_film_grain_t grain;
+ aom_noise_model_get_grain_parameters(&noise_model, &grain);
+ grain.random_seed = random_seed;
+ aom_film_grain_table_append(&grain_table, prev_timestamp, INT64_MAX, &grain);
+ if (args.output_grain_table) {
+ struct aom_internal_error_info error_info;
+ if (AOM_CODEC_OK != aom_film_grain_table_write(&grain_table,
+ args.output_grain_table,
+ &error_info)) {
+ die("Unable to write output film grain table");
+ }
+ }
+ aom_film_grain_table_free(&grain_table);
+
+ if (infile) fclose(infile);
+ if (denoised_file) fclose(denoised_file);
+ if (debug_file) fclose(debug_file);
+ aom_img_free(&raw);
+ aom_img_free(&denoised);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/resize_util.c b/third_party/aom/examples/resize_util.c
new file mode 100644
index 000000000..6a84d5740
--- /dev/null
+++ b/third_party/aom/examples/resize_util.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "av1/common/resize.h"
+#include "common/tools_common.h"
+
+static const char *exec_name = NULL;
+
+static void usage() {
+ printf("Usage:\n");
+ printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
+ exec_name);
+ printf("<output_yuv> [<frames>]\n");
+}
+
+void usage_exit(void) {
+ usage();
+ exit(EXIT_FAILURE);
+}
+
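+// Parses a "<width>x<height>" string such as "1920x1080" into its two
+// dimensions; returns 1 on success and 0 if the string is malformed.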
+static int parse_dim(char *v, int *width, int *height) {
+ char *x = strchr(v, 'x');
+ if (x == NULL) x = strchr(v, 'X');
+ if (x == NULL) return 0;
+ *width = atoi(v);
+ *height = atoi(&x[1]);
+ if (*width <= 0 || *height <= 0)
+ return 0;
+ else
+ return 1;
+}
+
+int main(int argc, char *argv[]) {
+ char *fin, *fout;
+ FILE *fpin, *fpout;
+ uint8_t *inbuf, *outbuf;
+ uint8_t *inbuf_u, *outbuf_u;
+ uint8_t *inbuf_v, *outbuf_v;
+ int f, frames;
+ int width, height, target_width, target_height;
+
+ exec_name = argv[0];
+
+ if (argc < 5) {
+ printf("Incorrect parameters:\n");
+ usage();
+ return 1;
+ }
+
+ fin = argv[1];
+ fout = argv[4];
+ if (!parse_dim(argv[2], &width, &height)) {
+ printf("Incorrect parameters: %s\n", argv[2]);
+ usage();
+ return 1;
+ }
+ if (!parse_dim(argv[3], &target_width, &target_height)) {
+ printf("Incorrect parameters: %s\n", argv[3]);
+ usage();
+ return 1;
+ }
+
+ fpin = fopen(fin, "rb");
+ if (fpin == NULL) {
+ printf("Can't open file %s to read\n", fin);
+ usage();
+ return 1;
+ }
+ fpout = fopen(fout, "wb");
+ if (fpout == NULL) {
+ printf("Can't open file %s to write\n", fout);
+ usage();
+ return 1;
+ }
+ if (argc >= 6)
+ frames = atoi(argv[5]);
+ else
+ frames = INT_MAX;
+
+ printf("Input size: %dx%d\n", width, height);
+ printf("Target size: %dx%d, Frames: ", target_width, target_height);
+ if (frames == INT_MAX)
+ printf("All\n");
+ else
+ printf("%d\n", frames);
+
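+  // I420 frame layout: a full-resolution Y plane followed by quarter-size U
+  // and V planes, hence the width * height * 3 / 2 buffer size and the plane
+  // offsets computed below.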
+ inbuf = (uint8_t *)malloc(width * height * 3 / 2);
+ outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
+ inbuf_u = inbuf + width * height;
+ inbuf_v = inbuf_u + width * height / 4;
+ outbuf_u = outbuf + target_width * target_height;
+ outbuf_v = outbuf_u + target_width * target_height / 4;
+ f = 0;
+ while (f < frames) {
+ if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break;
+ av1_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
+ width, outbuf, target_width, outbuf_u, outbuf_v,
+ target_width / 2, target_height, target_width);
+ fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout);
+ f++;
+ }
+ printf("%d frames processed\n", f);
+ fclose(fpin);
+ fclose(fpout);
+
+ free(inbuf);
+ free(outbuf);
+ return 0;
+}
diff --git a/third_party/aom/examples/scalable_decoder.c b/third_party/aom/examples/scalable_decoder.c
new file mode 100644
index 000000000..c22924223
--- /dev/null
+++ b/third_party/aom/examples/scalable_decoder.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Scalable Decoder
+// ================
+//
+// This is an example of a scalable decoder loop. It takes a 2-spatial-layer
+// input file containing the compressed data (in OBU format), passes it
+// through the decoder, and writes the decompressed frames to disk. The base
+// layer and enhancement layers are stored as separate files, out_lyr0.yuv
+// and out_lyr1.yuv, respectively.
+//
+// Standard Includes
+// -----------------
+// For decoders, you only have to include `aom_decoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// av1.
+//
+// Initializing The Codec
+// ----------------------
+// The libaom decoder is initialized by the call to aom_codec_dec_init().
+// Determining the codec interface to use is handled by AvxVideoReader and the
+// functions prefixed with aom_video_reader_. Discussion of those functions is
+// beyond the scope of this example, but the main gist is to open the input file
+// and parse just enough of it to determine if it's an AVx file and which AVx
+// codec is contained within the file.
+// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this
+// example because we want the algorithm to determine the stream configuration
+// (width/height) and allocate memory automatically.
+//
+// Decoding A Frame
+// ----------------
+// Once the frame has been read into memory, it is decoded using the
+// `aom_codec_decode` function. The call takes a pointer to the data
+// (`frame`) and the length of the data (`frame_size`). No application data
+// is associated with the frame in this example, so the `user_priv`
+// parameter is NULL. The `deadline` parameter is left at zero for this
+// example. This parameter is generally only used when doing adaptive post
+// processing.
+//
+// Codecs may produce a variable number of output frames for every call to
+// `aom_codec_decode`. These frames are retrieved by the
+// `aom_codec_get_frame` iterator function. The iterator variable `iter` is
+// initialized to NULL each time `aom_codec_decode` is called.
+// `aom_codec_get_frame` is called in a loop, returning a pointer to a
+// decoded image or NULL to indicate the end of list.
+//
+// Processing The Decoded Data
+// ---------------------------
+// In this example, we simply write the decoded data to disk. It is
+// important to honor the image's `stride` values.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+#define MAX_LAYERS 5
+
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s <infile>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv) {
+ int frame_cnt = 0;
+ FILE *outfile[MAX_LAYERS];
+ char filename[80];
+ aom_codec_ctx_t codec;
+ const AvxInterface *decoder = NULL;
+ FILE *inputfile = NULL;
+ uint8_t *buf = NULL;
+ size_t bytes_in_buffer = 0;
+ size_t buffer_size = 0;
+ struct AvxInputContext aom_input_ctx;
+ struct ObuDecInputContext obu_ctx = { &aom_input_ctx, NULL, 0, 0, 0 };
+ aom_codec_stream_info_t si;
+ uint8_t tmpbuf[32];
+ unsigned int i;
+
+ exec_name = argv[0];
+
+ if (argc != 2) die("Invalid number of arguments.");
+
+ if (!(inputfile = fopen(argv[1], "rb")))
+ die("Failed to open %s for read.", argv[1]);
+ obu_ctx.avx_ctx->file = inputfile;
+ obu_ctx.avx_ctx->filename = argv[1];
+
+ decoder = get_aom_decoder_by_index(0);
+ printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+ die_codec(&codec, "Failed to initialize decoder.");
+
+ if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) {
+ die_codec(&codec, "Failed to set output_all_layers control.");
+ }
+
+  // peek at the sequence header OBU to get the number of spatial layers
+ const size_t ret = fread(tmpbuf, 1, 32, inputfile);
+ if (ret != 32) die_codec(&codec, "Input is not a valid obu file");
+ si.is_annexb = 0;
+ if (aom_codec_peek_stream_info(decoder->codec_interface(), tmpbuf, 32, &si)) {
+ die_codec(&codec, "Input is not a valid obu file");
+ }
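+  // Rewind the 32 bytes peeked above so obudec parses the stream from the
+  // beginning.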
+ fseek(inputfile, -32, SEEK_CUR);
+
+ if (!file_is_obu(&obu_ctx))
+ die_codec(&codec, "Input is not a valid obu file");
+
+ // open base layer output yuv file
+ snprintf(filename, sizeof(filename), "out_lyr%d.yuv", 0);
+ if (!(outfile[0] = fopen(filename, "wb")))
+ die("Failed top open output for writing.");
+
+ // open any enhancement layer output yuv files
+ for (i = 1; i < si.number_spatial_layers; i++) {
+ snprintf(filename, sizeof(filename), "out_lyr%d.yuv", i);
+ if (!(outfile[i] = fopen(filename, "wb")))
+ die("Failed to open output for writing.");
+ }
+
+ while (!obudec_read_temporal_unit(&obu_ctx, &buf, &bytes_in_buffer,
+ &buffer_size)) {
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img = NULL;
+ if (aom_codec_decode(&codec, buf, bytes_in_buffer, NULL))
+ die_codec(&codec, "Failed to decode frame.");
+
+ while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
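+      // Convert each decoded frame to 8-bit I420 before writing:
+      // aom_img_downshift drops the (bit_depth - 8) least significant bits.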
+ aom_image_t *img_shifted =
+ aom_img_alloc(NULL, AOM_IMG_FMT_I420, img->d_w, img->d_h, 16);
+ img_shifted->bit_depth = 8;
+ aom_img_downshift(img_shifted, img,
+ img->bit_depth - img_shifted->bit_depth);
+ if (img->spatial_id == 0) {
+ printf("Writing base layer 0 %d\n", frame_cnt);
+ aom_img_write(img_shifted, outfile[0]);
+ } else if (img->spatial_id <= (int)(si.number_spatial_layers - 1)) {
+ printf("Writing enhancement layer %d %d\n", img->spatial_id, frame_cnt);
+ aom_img_write(img_shifted, outfile[img->spatial_id]);
+ } else {
+ die_codec(&codec, "Invalid bitstream. Layer id exceeds layer count");
+ }
+ if (img->spatial_id == (int)(si.number_spatial_layers - 1)) ++frame_cnt;
+ }
+ }
+
+ printf("Processed %d frames.\n", frame_cnt);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+ for (i = 0; i < si.number_spatial_layers; i++) fclose(outfile[i]);
+
+ fclose(inputfile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/scalable_encoder.c b/third_party/aom/examples/scalable_encoder.c
new file mode 100644
index 000000000..7af03e29f
--- /dev/null
+++ b/third_party/aom/examples/scalable_encoder.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Scalable Encoder
+// ================
+//
+// This is an example of a scalable encoder loop. It takes two input files in
+// YV12 format, passes them through the encoder, and writes the compressed
+// frames to disk in OBU format.
+//
+// Getting The Default Configuration
+// ---------------------------------
+// Encoders have the notion of "usage profiles." For example, an encoder
+// may want to publish default configurations for both a video
+// conferencing application and a best quality offline encoder. These
+// obviously have very different default settings. Consult the
+// documentation for your codec to see if it provides any default
+// configurations. All codecs provide a default configuration, number 0,
+// which is valid for material in the vicinity of QCIF/QVGA.
+//
+// Updating The Configuration
+// ---------------------------------
+// Almost all applications will want to update the default configuration
+// with settings specific to their usage. Here we set the width and height
+// of the video file to that specified on the command line. We also scale
+// the default bitrate based on the ratio between the default resolution
+// and the resolution specified on the command line.
+//
+// Encoding A Frame
+// ----------------
+// The frame is read as a continuous block (size = width * height * 3 / 2)
+// from the input file. If a frame was read (the input file has not hit
+// EOF) then the frame is passed to the encoder. Otherwise, a NULL
+// is passed, indicating the End-Of-Stream condition to the encoder. The
+// `frame_cnt` is reused as the presentation time stamp (PTS) and each
+// frame is shown for one frame-time in duration. The flags parameter is
+// unused in this example.
+
+// Forced Keyframes
+// ----------------
+// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the
+// flags passed to `aom_codec_encode()`. In this example, we force a
+// keyframe every <keyframe-interval> frames. Note, the output stream can
+// contain additional keyframes beyond those that have been forced using the
+// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the
+// encoder.
+//
+// Processing The Encoded Data
+// ---------------------------
+// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data
+// for this frame. We write the raw compressed data directly to the output file.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr,
+ "Usage: %s <codec> <width> <height> <infile0> <infile1> "
+ "<outfile> <frames to encode>\n"
+ "See comments in scalable_encoder.c for more information.\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+ int frame_index, int flags, FILE *outfile) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ const aom_codec_err_t res =
+ aom_codec_encode(codec, img, frame_index, 1, flags);
+ if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
+
+ while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
+ got_pkts = 1;
+
+ if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+ if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile) !=
+ pkt->data.frame.sz) {
+ die_codec(codec, "Failed to write compressed frame");
+ }
+ printf(keyframe ? "K" : ".");
+ printf(" %6d\n", (int)pkt->data.frame.sz);
+ fflush(stdout);
+ }
+ }
+
+ return got_pkts;
+}
+
+int main(int argc, char **argv) {
+ FILE *infile0 = NULL;
+ FILE *infile1 = NULL;
+ aom_codec_ctx_t codec;
+ aom_codec_enc_cfg_t cfg;
+ int frame_count = 0;
+ aom_image_t raw0, raw1;
+ aom_codec_err_t res;
+ AvxVideoInfo info;
+ const AvxInterface *encoder = NULL;
+ const int fps = 30;
+ const int bitrate = 200;
+ int keyframe_interval = 0;
+ int max_frames = 0;
+ int frames_encoded = 0;
+ const char *codec_arg = NULL;
+ const char *width_arg = NULL;
+ const char *height_arg = NULL;
+ const char *infile0_arg = NULL;
+ const char *infile1_arg = NULL;
+ const char *outfile_arg = NULL;
+ // const char *keyframe_interval_arg = NULL;
+ FILE *outfile = NULL;
+
+ exec_name = argv[0];
+
+ // Clear explicitly, as simply assigning "{ 0 }" generates
+ // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info));
+
+ if (argc != 8) die("Invalid number of arguments");
+
+ codec_arg = argv[1];
+ width_arg = argv[2];
+ height_arg = argv[3];
+ infile0_arg = argv[4];
+ infile1_arg = argv[5];
+ outfile_arg = argv[6];
+ max_frames = (int)strtol(argv[7], NULL, 0);
+
+ encoder = get_aom_encoder_by_name(codec_arg);
+ if (!encoder) die("Unsupported codec.");
+
+ info.codec_fourcc = encoder->fourcc;
+ info.frame_width = (int)strtol(width_arg, NULL, 0);
+ info.frame_height = (int)strtol(height_arg, NULL, 0);
+ info.time_base.numerator = 1;
+ info.time_base.denominator = fps;
+
+ if (info.frame_width <= 0 || info.frame_height <= 0 ||
+ (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+ die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+ }
+
+ if (!aom_img_alloc(&raw0, AOM_IMG_FMT_I420, info.frame_width,
+ info.frame_height, 1)) {
+ die("Failed to allocate image for layer 0.");
+ }
+ if (!aom_img_alloc(&raw1, AOM_IMG_FMT_I420, info.frame_width,
+ info.frame_height, 1)) {
+ die("Failed to allocate image for layer 1.");
+ }
+
+ // keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
+ keyframe_interval = 100;
+ if (keyframe_interval < 0) die("Invalid keyframe interval value.");
+
+ printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+ res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+ if (res) die_codec(&codec, "Failed to get default codec config.");
+
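+  // The configuration below favors a simple layered encode: no lagged
+  // frames, constant-quality rate control (AOM_Q, with per-layer cq-level
+  // set later), and raw OBUs rather than Annex B framing.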
+ cfg.g_w = info.frame_width;
+ cfg.g_h = info.frame_height;
+ cfg.g_timebase.num = info.time_base.numerator;
+ cfg.g_timebase.den = info.time_base.denominator;
+ cfg.rc_target_bitrate = bitrate;
+ cfg.g_error_resilient = 0;
+ cfg.g_lag_in_frames = 0;
+ cfg.rc_end_usage = AOM_Q;
+ cfg.save_as_annexb = 0;
+
+ outfile = fopen(outfile_arg, "wb");
+ if (!outfile) die("Failed to open %s for writing.", outfile_arg);
+
+ if (!(infile0 = fopen(infile0_arg, "rb")))
+ die("Failed to open %s for reading.", infile0_arg);
+ if (!(infile1 = fopen(infile1_arg, "rb")))
+ die("Failed to open %s for reading.", infile0_arg);
+
+ if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+ if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8))
+ die_codec(&codec, "Failed to set cpu to 8");
+
+ if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 2))
+ die_codec(&codec, "Failed to set tile columns to 2");
+ if (aom_codec_control(&codec, AV1E_SET_NUM_TG, 3))
+ die_codec(&codec, "Failed to set num of tile groups to 3");
+
+ if (aom_codec_control(&codec, AOME_SET_NUMBER_SPATIAL_LAYERS, 2))
+ die_codec(&codec, "Failed to set number of spatial layers to 2");
+
+ // Encode frames.
+ while (aom_img_read(&raw0, infile0)) {
+ int flags = 0;
+
+ // configure and encode base layer
+
+ if (keyframe_interval > 0 && frames_encoded % keyframe_interval == 0)
+ flags |= AOM_EFLAG_FORCE_KF;
+ else
+ // use previous base layer (LAST) as sole reference
+      // save this frame as LAST to be used as reference by enhancement layer
+ // and next base layer
+ flags |= AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+ AOM_EFLAG_NO_UPD_ENTROPY;
+ cfg.g_w = info.frame_width;
+ cfg.g_h = info.frame_height;
+ if (aom_codec_enc_config_set(&codec, &cfg))
+ die_codec(&codec, "Failed to set enc cfg for layer 0");
+ if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 0))
+ die_codec(&codec, "Failed to set layer id to 0");
+ if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 62))
+ die_codec(&codec, "Failed to set cq level");
+ encode_frame(&codec, &raw0, frame_count++, flags, outfile);
+
+ // configure and encode enhancement layer
+
+ // use LAST (base layer) as sole reference
+ flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD |
+ AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST |
+ AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+ AOM_EFLAG_NO_UPD_ENTROPY;
+ cfg.g_w = info.frame_width;
+ cfg.g_h = info.frame_height;
+ aom_img_read(&raw1, infile1);
+ if (aom_codec_enc_config_set(&codec, &cfg))
+ die_codec(&codec, "Failed to set enc cfg for layer 1");
+ if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 1))
+ die_codec(&codec, "Failed to set layer id to 1");
+ if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 10))
+ die_codec(&codec, "Failed to set cq level");
+ encode_frame(&codec, &raw1, frame_count++, flags, outfile);
+
+ frames_encoded++;
+
+ if (max_frames > 0 && frames_encoded >= max_frames) break;
+ }
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, 0, outfile)) continue;
+
+ printf("\n");
+ fclose(infile0);
+ fclose(infile1);
+ printf("Processed %d frames.\n", frame_count / 2);
+
+ aom_img_free(&raw0);
+ aom_img_free(&raw1);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ fclose(outfile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/set_maps.c b/third_party/aom/examples/set_maps.c
new file mode 100644
index 000000000..9aeb96e43
--- /dev/null
+++ b/third_party/aom/examples/set_maps.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// AOM Set Active and ROI Maps
+// ===========================
+//
+// This is an example demonstrating how to control the AOM encoder's
+// ROI and Active maps.
+//
+// ROI (Region of Interest) maps are a way for the application to assign
+// each macroblock in the image to a region, and then set quantizer and
+// filtering parameters on that image.
+//
+// Active maps are a way for the application to specify on a
+// macroblock-by-macroblock basis whether there is any activity in that
+// macroblock.
+//
+//
+// Configuration
+// -------------
+// This example does not actually set an ROI map; it only exercises the
+// active map interface.
+//
+// An active map is set on frame 5. The map marks alternating 16x16 blocks
+// as active and inactive, so if the width of the image in 16x16 blocks is
+// even, the output will appear to have distinct columns, where one column
+// will have motion and the next will not.
+//
+// The active map is cleared on frame 11.
+//
+// Observing The Effects
+// ---------------------
+// Use the `simple_decoder` example to decode this sample, and observe
+// the change in the image at frames 5 and 11.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+static void set_active_map(const aom_codec_enc_cfg_t *cfg,
+ aom_codec_ctx_t *codec) {
+ unsigned int i;
+ aom_active_map_t map = { 0, 0, 0 };
+
+ map.rows = (cfg->g_h + 15) / 16;
+ map.cols = (cfg->g_w + 15) / 16;
+
+ map.active_map = (uint8_t *)malloc(map.rows * map.cols);
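+  // Fill the map so active-map values alternate between 0 and 1 across the
+  // grid of 16x16 blocks.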
+ for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2;
+
+ if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map))
+ die_codec(codec, "Failed to set active map");
+
+ free(map.active_map);
+}
+
+static void unset_active_map(const aom_codec_enc_cfg_t *cfg,
+ aom_codec_ctx_t *codec) {
+ aom_active_map_t map = { 0, 0, 0 };
+
+ map.rows = (cfg->g_h + 15) / 16;
+ map.cols = (cfg->g_w + 15) / 16;
+ map.active_map = NULL;
+
+ if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map))
+ die_codec(codec, "Failed to set active map");
+}
+
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+ int frame_index, AvxVideoWriter *writer) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, 0);
+ if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
+
+ while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
+ got_pkts = 1;
+
+ if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+ if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+ pkt->data.frame.sz,
+ pkt->data.frame.pts)) {
+ die_codec(codec, "Failed to write compressed frame");
+ }
+
+ printf(keyframe ? "K" : ".");
+ fflush(stdout);
+ }
+ }
+
+ return got_pkts;
+}
+
+int main(int argc, char **argv) {
+ FILE *infile = NULL;
+ aom_codec_ctx_t codec;
+ aom_codec_enc_cfg_t cfg;
+ int frame_count = 0;
+ const int limit = 15;
+ aom_image_t raw;
+ aom_codec_err_t res;
+ AvxVideoInfo info;
+ AvxVideoWriter *writer = NULL;
+ const AvxInterface *encoder = NULL;
+ const int fps = 2; // TODO(dkovalev) add command line argument
+ const double bits_per_pixel_per_frame = 0.067;
+
+ exec_name = argv[0];
+ if (argc != 6) die("Invalid number of arguments");
+
+ memset(&info, 0, sizeof(info));
+
+ encoder = get_aom_encoder_by_name(argv[1]);
+ if (encoder == NULL) {
+ die("Unsupported codec.");
+ }
+ assert(encoder != NULL);
+ info.codec_fourcc = encoder->fourcc;
+ info.frame_width = (int)strtol(argv[2], NULL, 0);
+ info.frame_height = (int)strtol(argv[3], NULL, 0);
+ info.time_base.numerator = 1;
+ info.time_base.denominator = fps;
+
+ if (info.frame_width <= 0 || info.frame_height <= 0 ||
+ (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+ die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+ }
+
+ if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width,
+ info.frame_height, 1)) {
+ die("Failed to allocate image.");
+ }
+
+ printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+ res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+ if (res) die_codec(&codec, "Failed to get default codec config.");
+
+ cfg.g_w = info.frame_width;
+ cfg.g_h = info.frame_height;
+ cfg.g_timebase.num = info.time_base.numerator;
+ cfg.g_timebase.den = info.time_base.denominator;
+ cfg.rc_target_bitrate =
+ (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000);
+ cfg.g_lag_in_frames = 0;
+
+ writer = aom_video_writer_open(argv[5], kContainerIVF, &info);
+ if (!writer) die("Failed to open %s for writing.", argv[5]);
+
+ if (!(infile = fopen(argv[4], "rb")))
+ die("Failed to open %s for reading.", argv[4]);
+
+ if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ // Encode frames.
+ while (aom_img_read(&raw, infile) && frame_count < limit) {
+ ++frame_count;
+
+ if (frame_count == 5) {
+ set_active_map(&cfg, &codec);
+ } else if (frame_count == 11) {
+ unset_active_map(&cfg, &codec);
+ }
+
+ encode_frame(&codec, &raw, frame_count, writer);
+ }
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, writer)) {
+ }
+
+ printf("\n");
+ fclose(infile);
+ printf("Processed %d frames.\n", frame_count);
+
+ aom_img_free(&raw);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ aom_video_writer_close(writer);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/simple_decoder.c b/third_party/aom/examples/simple_decoder.c
new file mode 100644
index 000000000..d098d1e0b
--- /dev/null
+++ b/third_party/aom/examples/simple_decoder.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Simple Decoder
+// ==============
+//
+// This is an example of a simple decoder loop. It takes an input file
+// containing the compressed data (in IVF format), passes it through the
+// decoder, and writes the decompressed frames to disk. Other decoder
+// examples build upon this one.
+//
+// The details of the IVF format have been elided from this example for
+// simplicity of presentation, as IVF files will not generally be used by
+// your application. In general, an IVF file consists of a file header,
+// followed by a variable number of frames. Each frame consists of a frame
+// header followed by a variable length payload. The length of the payload
+// is specified in the first four bytes of the frame header. The payload is
+// the raw compressed data.
+//
+// Standard Includes
+// -----------------
+// For decoders, you only have to include `aom_decoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// aom.
+//
+// Initializing The Codec
+// ----------------------
+// The libaom decoder is initialized by the call to aom_codec_dec_init().
+// Determining the codec interface to use is handled by AvxVideoReader and the
+// functions prefixed with aom_video_reader_. Discussion of those functions is
+// beyond the scope of this example, but the main gist is to open the input file
+// and parse just enough of it to determine if it's an AVx file and which AVx
+// codec is contained within the file.
+// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this
+// example because we want the algorithm to determine the stream configuration
+// (width/height) and allocate memory automatically.
+//
+// Decoding A Frame
+// ----------------
+// Once the frame has been read into memory, it is decoded using the
+// `aom_codec_decode` function. The call takes a pointer to the data
+// (`frame`) and the length of the data (`frame_size`). No application data
+// is associated with the frame in this example, so the `user_priv`
+// parameter is NULL.
+//
+// Codecs may produce a variable number of output frames for every call to
+// `aom_codec_decode`. These frames are retrieved by the
+// `aom_codec_get_frame` iterator function. The iterator variable `iter` is
+// initialized to NULL each time `aom_codec_decode` is called.
+// `aom_codec_get_frame` is called in a loop, returning a pointer to a
+// decoded image or NULL to indicate the end of list.
+//
+// Processing The Decoded Data
+// ---------------------------
+// In this example, we simply write the decoded data to disk. It is
+// important to honor the image's `stride` values.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name);
+ exit(EXIT_FAILURE);
+}
+
+int main(int argc, char **argv) {
+ int frame_cnt = 0;
+ FILE *outfile = NULL;
+ aom_codec_ctx_t codec;
+ AvxVideoReader *reader = NULL;
+ const AvxInterface *decoder = NULL;
+ const AvxVideoInfo *info = NULL;
+
+ exec_name = argv[0];
+
+ if (argc != 3) die("Invalid number of arguments.");
+
+ reader = aom_video_reader_open(argv[1]);
+ if (!reader) die("Failed to open %s for reading.", argv[1]);
+
+ if (!(outfile = fopen(argv[2], "wb")))
+ die("Failed to open %s for writing.", argv[2]);
+
+ info = aom_video_reader_get_info(reader);
+
+ decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+ if (!decoder) die("Unknown input codec.");
+
+ printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
+ die_codec(&codec, "Failed to initialize decoder.");
+
+ while (aom_video_reader_read_frame(reader)) {
+ aom_codec_iter_t iter = NULL;
+ aom_image_t *img = NULL;
+ size_t frame_size = 0;
+ const unsigned char *frame =
+ aom_video_reader_get_frame(reader, &frame_size);
+ if (aom_codec_decode(&codec, frame, frame_size, NULL))
+ die_codec(&codec, "Failed to decode frame.");
+
+ while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+ aom_img_write(img, outfile);
+ ++frame_cnt;
+ }
+ }
+
+ printf("Processed %d frames.\n", frame_cnt);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+ printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n",
+ info->frame_width, info->frame_height, argv[2]);
+
+ aom_video_reader_close(reader);
+
+ fclose(outfile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/simple_encoder.c b/third_party/aom/examples/simple_encoder.c
new file mode 100644
index 000000000..01a37cf0c
--- /dev/null
+++ b/third_party/aom/examples/simple_encoder.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Simple Encoder
+// ==============
+//
+// This is an example of a simple encoder loop. It takes an input file in
+// YV12 format, passes it through the encoder, and writes the compressed
+// frames to disk in IVF format. Other encoder examples build upon this
+// one.
+//
+// The details of the IVF format have been elided from this example for
+// simplicity of presentation, as IVF files will not generally be used by
+// your application. In general, an IVF file consists of a file header,
+// followed by a variable number of frames. Each frame consists of a frame
+// header followed by a variable length payload. The length of the payload
+// is specified in the first four bytes of the frame header. The payload is
+// the raw compressed data.
+//
+// Standard Includes
+// -----------------
+// For encoders, you only have to include `aom_encoder.h` and then any
+// header files for the specific codecs you use. In this case, we're using
+// aom.
+//
+// Getting The Default Configuration
+// ---------------------------------
+// Encoders have the notion of "usage profiles." For example, an encoder
+// may want to publish default configurations for both a video
+// conferencing application and a best quality offline encoder. These
+// obviously have very different default settings. Consult the
+// documentation for your codec to see if it provides any default
+// configurations. All codecs provide a default configuration, number 0,
+// which is valid for material in the vicinity of QCIF/QVGA.
+//
+// Updating The Configuration
+// ---------------------------------
+// Almost all applications will want to update the default configuration
+// with settings specific to their usage. Here we set the width and height
+// of the video file to that specified on the command line. We also scale
+// the default bitrate based on the ratio between the default resolution
+// and the resolution specified on the command line.
+//
+// Initializing The Codec
+// ----------------------
+// The encoder is initialized by the following code.
+//
+// Encoding A Frame
+// ----------------
+// The frame is read as a continuous block (size width * height * 3 / 2)
+// from the input file. If a frame was read (the input file has not hit
+// EOF) then the frame is passed to the encoder. Otherwise, a NULL
+// is passed, indicating the End-Of-Stream condition to the encoder. The
+// `frame_cnt` is reused as the presentation time stamp (PTS) and each
+// frame is shown for one frame-time in duration. The flags parameter is
+// unused in this example.
+
+// Forced Keyframes
+// ----------------
+// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the
+// flags passed to `aom_codec_encode()`. In this example, we force a
+// keyframe every <keyframe-interval> frames. Note, the output stream can
+// contain additional keyframes beyond those that have been forced using the
+// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the
+// encoder.
+//
+// Processing The Encoded Data
+// ---------------------------
+// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data
+// for this frame. We write an IVF frame header, followed by the raw data.
+//
+// Cleanup
+// -------
+// The `aom_codec_destroy` call frees any memory allocated by the codec.
+//
+// Error Handling
+// --------------
+// This example does not special case any error return codes. If there was
+// an error, a descriptive message is printed and the program exits. With
+// few exceptions, aom_codec functions return an enumerated error status,
+// with the value `0` indicating success.
+//
+// Error Resiliency Features
+// -------------------------
+// Error resiliency is controlled by the g_error_resilient member of the
+// configuration structure. Use the `decode_with_drops` example to decode with
+// frames 5-10 dropped. Compare the output of a file encoded with error
+// resiliency enabled versus one encoded with it disabled.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr,
+ "Usage: %s <codec> <width> <height> <infile> <outfile> "
+ "<keyframe-interval> <error-resilient> <frames to encode>\n"
+ "See comments in simple_encoder.c for more information.\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
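+
+// Example invocation (file names are illustrative and "av1" is assumed to be
+// a registered encoder name in tools_common):
+//   simple_encoder av1 352 288 input.yuv output.ivf 30 0 150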
+
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+ int frame_index, int flags, AvxVideoWriter *writer) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ const aom_codec_err_t res =
+ aom_codec_encode(codec, img, frame_index, 1, flags);
+ if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
+
+ while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
+ got_pkts = 1;
+
+ if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+ if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+ pkt->data.frame.sz,
+ pkt->data.frame.pts)) {
+ die_codec(codec, "Failed to write compressed frame");
+ }
+ printf(keyframe ? "K" : ".");
+ fflush(stdout);
+ }
+ }
+
+ return got_pkts;
+}
+
+// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps.
+int main(int argc, char **argv) {
+ FILE *infile = NULL;
+ aom_codec_ctx_t codec;
+ aom_codec_enc_cfg_t cfg;
+ int frame_count = 0;
+ aom_image_t raw;
+ aom_codec_err_t res;
+ AvxVideoInfo info;
+ AvxVideoWriter *writer = NULL;
+ const AvxInterface *encoder = NULL;
+ const int fps = 30;
+ const int bitrate = 200;
+ int keyframe_interval = 0;
+ int max_frames = 0;
+ int frames_encoded = 0;
+ const char *codec_arg = NULL;
+ const char *width_arg = NULL;
+ const char *height_arg = NULL;
+ const char *infile_arg = NULL;
+ const char *outfile_arg = NULL;
+ const char *keyframe_interval_arg = NULL;
+
+ exec_name = argv[0];
+
+ // Clear explicitly, as simply assigning "{ 0 }" generates
+ // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info));
+
+ if (argc != 9) die("Invalid number of arguments");
+
+ codec_arg = argv[1];
+ width_arg = argv[2];
+ height_arg = argv[3];
+ infile_arg = argv[4];
+ outfile_arg = argv[5];
+ keyframe_interval_arg = argv[6];
+ max_frames = (int)strtol(argv[8], NULL, 0);
+
+ encoder = get_aom_encoder_by_name(codec_arg);
+ if (!encoder) die("Unsupported codec.");
+
+ info.codec_fourcc = encoder->fourcc;
+ info.frame_width = (int)strtol(width_arg, NULL, 0);
+ info.frame_height = (int)strtol(height_arg, NULL, 0);
+ info.time_base.numerator = 1;
+ info.time_base.denominator = fps;
+
+ if (info.frame_width <= 0 || info.frame_height <= 0 ||
+ (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) {
+ die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+ }
+
+ if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width,
+ info.frame_height, 1)) {
+ die("Failed to allocate image.");
+ }
+
+ keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0);
+ if (keyframe_interval < 0) die("Invalid keyframe interval value.");
+
+ printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+ res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+ if (res) die_codec(&codec, "Failed to get default codec config.");
+
+ cfg.g_w = info.frame_width;
+ cfg.g_h = info.frame_height;
+ cfg.g_timebase.num = info.time_base.numerator;
+ cfg.g_timebase.den = info.time_base.denominator;
+ cfg.rc_target_bitrate = bitrate;
+ cfg.g_error_resilient = (aom_codec_er_flags_t)strtoul(argv[7], NULL, 0);
+
+ writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info);
+ if (!writer) die("Failed to open %s for writing.", outfile_arg);
+
+ if (!(infile = fopen(infile_arg, "rb")))
+ die("Failed to open %s for reading.", infile_arg);
+
+ if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ // Encode frames.
+ while (aom_img_read(&raw, infile)) {
+ int flags = 0;
+ if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
+ flags |= AOM_EFLAG_FORCE_KF;
+ encode_frame(&codec, &raw, frame_count++, flags, writer);
+ frames_encoded++;
+ if (max_frames > 0 && frames_encoded >= max_frames) break;
+ }
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, 0, writer)) continue;
+
+ printf("\n");
+ fclose(infile);
+ printf("Processed %d frames.\n", frame_count);
+
+ aom_img_free(&raw);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ aom_video_writer_close(writer);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/examples/twopass_encoder.c b/third_party/aom/examples/twopass_encoder.c
new file mode 100644
index 000000000..a03bc6cc2
--- /dev/null
+++ b/third_party/aom/examples/twopass_encoder.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Two Pass Encoder
+// ================
+//
+// This is an example of a two pass encoder loop. It takes an input file in
+// YV12 format, passes it through the encoder twice, and writes the compressed
+// frames to disk in IVF format. It builds upon the simple_encoder example.
+//
+// Twopass Variables
+// -----------------
+// Twopass mode needs to track the current pass number and the buffer of
+// statistics packets.
+//
+// Updating The Configuration
+// ---------------------------------
+// In two pass mode, the configuration has to be updated on each pass. The
+// statistics buffer is passed on the last pass.
+//
+// Encoding A Frame
+// ----------------
+// Encoding a frame in two pass mode is identical to the simple encoder
+// example.
+//
+// Processing Statistics Packets
+// -----------------------------
+// Each packet of type `AOM_CODEC_STATS_PKT` contains the first-pass
+// statistics for a frame. These packets are concatenated into a single
+// buffer that is fed back to the encoder on the second pass.
+//
+//
+// Pass Progress Reporting
+// -----------------------------
+// It's sometimes helpful to see when each pass completes.
+//
+//
+// Clean-up
+// -----------------------------
+// Destruction of the encoder instance must be done on each pass. The
+// raw image should be destroyed at the end as usual.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+ fprintf(stderr,
+ "Usage: %s <codec> <width> <height> <infile> <outfile> "
+ "<limit(optional)>\n",
+ exec_name);
+ exit(EXIT_FAILURE);
+}
+
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+ aom_codec_pts_t pts, unsigned int duration,
+ aom_enc_frame_flags_t flags,
+ aom_fixed_buf_t *stats) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+ if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
+
+ while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+ got_pkts = 1;
+
+ if (pkt->kind == AOM_CODEC_STATS_PKT) {
+ const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
+ const size_t pkt_size = pkt->data.twopass_stats.sz;
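+      // Grow the accumulated stats buffer and append this packet's data; the
+      // whole buffer is handed to the second pass via rc_twopass_stats_in.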
+ stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+ memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
+ stats->sz += pkt_size;
+ }
+ }
+
+ return got_pkts;
+}
+
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+ aom_codec_pts_t pts, unsigned int duration,
+ aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+ int got_pkts = 0;
+ aom_codec_iter_t iter = NULL;
+ const aom_codec_cx_pkt_t *pkt = NULL;
+ const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+ if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
+
+ while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+ got_pkts = 1;
+ if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+ const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+ if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+ pkt->data.frame.sz,
+ pkt->data.frame.pts))
+ die_codec(ctx, "Failed to write compressed frame.");
+ printf(keyframe ? "K" : ".");
+ fflush(stdout);
+ }
+ }
+
+ return got_pkts;
+}
+
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+ const AvxInterface *encoder,
+ const aom_codec_enc_cfg_t *cfg, int limit) {
+ aom_codec_ctx_t codec;
+ int frame_count = 0;
+ aom_fixed_buf_t stats = { NULL, 0 };
+
+ if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ // Calculate frame statistics.
+ while (aom_img_read(raw, infile) && frame_count < limit) {
+ ++frame_count;
+ get_frame_stats(&codec, raw, frame_count, 1, 0, &stats);
+ }
+
+ // Flush encoder.
+ while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) {
+ }
+
+ printf("Pass 0 complete. Processed %d frames.\n", frame_count);
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ return stats;
+}
+
+static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
+ const AvxInterface *encoder, const aom_codec_enc_cfg_t *cfg,
+ int limit) {
+ AvxVideoInfo info = { encoder->fourcc,
+ cfg->g_w,
+ cfg->g_h,
+ { cfg->g_timebase.num, cfg->g_timebase.den },
+ 0 };
+ AvxVideoWriter *writer = NULL;
+ aom_codec_ctx_t codec;
+ int frame_count = 0;
+
+ writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
+ if (!writer) die("Failed to open %s for writing", outfile_name);
+
+ if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+ die_codec(&codec, "Failed to initialize encoder");
+
+ // Encode frames.
+ while (aom_img_read(raw, infile) && frame_count < limit) {
+ ++frame_count;
+ encode_frame(&codec, raw, frame_count, 1, 0, writer);
+ }
+
+ // Flush encoder.
+ while (encode_frame(&codec, NULL, -1, 1, 0, writer)) {
+ }
+
+ printf("\n");
+
+ if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+ aom_video_writer_close(writer);
+
+ printf("Pass 1 complete. Processed %d frames.\n", frame_count);
+}
+
+int main(int argc, char **argv) {
+ FILE *infile = NULL;
+ int w, h;
+ aom_codec_ctx_t codec;
+ aom_codec_enc_cfg_t cfg;
+ aom_image_t raw;
+ aom_codec_err_t res;
+ aom_fixed_buf_t stats;
+
+ const AvxInterface *encoder = NULL;
+ const int fps = 30; // TODO(dkovalev) add command line argument
+ const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument
+ const char *const codec_arg = argv[1];
+ const char *const width_arg = argv[2];
+ const char *const height_arg = argv[3];
+ const char *const infile_arg = argv[4];
+ const char *const outfile_arg = argv[5];
+ int limit = 0;
+ exec_name = argv[0];
+
+ if (argc < 6) die("Invalid number of arguments");
+
+ if (argc > 6) limit = (int)strtol(argv[6], NULL, 0);
+
+ if (limit == 0) limit = 100;
+
+ encoder = get_aom_encoder_by_name(codec_arg);
+ if (!encoder) die("Unsupported codec.");
+
+ w = (int)strtol(width_arg, NULL, 0);
+ h = (int)strtol(height_arg, NULL, 0);
+
+ if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+ die("Invalid frame size: %dx%d", w, h);
+
+ if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1))
+ die("Failed to allocate image", w, h);
+
+ printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+ // Configuration
+ res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+ if (res) die_codec(&codec, "Failed to get default codec config.");
+
+ cfg.g_w = w;
+ cfg.g_h = h;
+ cfg.g_timebase.num = 1;
+ cfg.g_timebase.den = fps;
+ cfg.rc_target_bitrate = bitrate;
+
+ if (!(infile = fopen(infile_arg, "rb")))
+ die("Failed to open %s for reading", infile_arg);
+
+ // Pass 0
+ cfg.g_pass = AOM_RC_FIRST_PASS;
+ stats = pass0(&raw, infile, encoder, &cfg, limit);
+
+ // Pass 1
+ rewind(infile);
+ cfg.g_pass = AOM_RC_LAST_PASS;
+ cfg.rc_twopass_stats_in = stats;
+ pass1(&raw, infile, outfile_arg, encoder, &cfg, limit);
+ free(stats.buf);
+
+ aom_img_free(&raw);
+ fclose(infile);
+
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/keywords.dox b/third_party/aom/keywords.dox
new file mode 100644
index 000000000..56f536890
--- /dev/null
+++ b/third_party/aom/keywords.dox
@@ -0,0 +1,51 @@
+/*!\page rfc2119 RFC2119 Keywords
+
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
+ NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
+ "OPTIONAL" in this document are to be interpreted as described in
+ <a href="http://www.ietf.org/rfc/rfc2119.txt">RFC 2119.</a>
+
+Specifically, the following definitions are used:
+
+\section MUST
+\anchor REQUIRED
+\anchor SHALL
+ This word, or the terms "REQUIRED" or "SHALL", mean that the
+ definition is an absolute requirement of the specification.
+
+\section MUSTNOT MUST NOT
+\anchor SHALLNOT
+ This phrase, or the phrase "SHALL NOT", mean that the
+ definition is an absolute prohibition of the specification.
+
+\section SHOULD
+\anchor RECOMMENDED
+ This word, or the adjective "RECOMMENDED", mean that there
+ may exist valid reasons in particular circumstances to ignore a
+ particular item, but the full implications must be understood and
+ carefully weighed before choosing a different course.
+
+\section SHOULDNOT SHOULD NOT
+\anchor NOTRECOMMENDED
+ This phrase, or the phrase "NOT RECOMMENDED" mean that
+ there may exist valid reasons in particular circumstances when the
+ particular behavior is acceptable or even useful, but the full
+ implications should be understood and the case carefully weighed
+ before implementing any behavior described with this label.
+
+\section MAY
+\anchor OPTIONAL
+ This word, or the adjective "OPTIONAL", mean that an item is
+ truly optional. One vendor may choose to include the item because a
+ particular marketplace requires it or because the vendor feels that
+ it enhances the product while another vendor may omit the same item.
+ An implementation which does not include a particular option \ref MUST be
+ prepared to interoperate with another implementation which does
+ include the option, though perhaps with reduced functionality. In the
+ same vein an implementation which does include a particular option
+ \ref MUST be prepared to interoperate with another implementation which
+ does not include the option (except, of course, for the feature the
+ option provides.)
+
+
+*/
diff --git a/third_party/aom/libs.doxy_template b/third_party/aom/libs.doxy_template
new file mode 100644
index 000000000..c522e21d3
--- /dev/null
+++ b/third_party/aom/libs.doxy_template
@@ -0,0 +1,1260 @@
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+
+# Doxyfile 1.5.4
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file that
+# follow. The default is UTF-8 which is also the encoding used for all text before
+# the first occurrence of this tag. Doxygen uses libiconv (or the iconv built into
+# libc) for the transcoding. See http://www.gnu.org/software/libiconv for the list of
+# possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = "AOMedia Codec SDK"
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = docs
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian,
+# Italian, Japanese, Japanese-en (Japanese with English messages), Korean,
+# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian,
+# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
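For illustration only (this template leaves ALIASES empty), a definition such as

    ALIASES += "sideeffect=\par Side Effects:\n"

would let a header comment use the new command directly, e.g. (hypothetical function):

    /*!\brief Reset the example counter.
     * \sideeffect Clears any previously accumulated state.
     */
    void example_reset(void);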
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for Java.
+# For instance, namespaces will be presented as packages, qualified scopes
+# will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to
+# include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct (or union) is
+# documented as struct with the name of the typedef. So
+# typedef struct type_s {} type_t, will appear in the documentation as a struct
+# with name type_t. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named type_s. This can typically
+# be useful for C code where the coding convention is that all structs are
+# typedef'ed and only the typedef is referenced never the struct's name.
+
+TYPEDEF_HIDES_STRUCT = NO
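A small sketch of the pattern described above, using hypothetical type names:

    /* With TYPEDEF_HIDES_STRUCT = YES this is documented as a struct named
     * example_cfg_t; with NO (as here), the struct keeps the name
     * example_cfg_s and the typedef is listed as a separate file member. */
    typedef struct example_cfg_s {
      int width;
      int height;
    } example_cfg_t;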
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be extracted
+# and appear in the documentation as a namespace called 'anonymous_namespace{file}',
+# where file will be replaced with the base name of the file that contains the anonymous
+# namespace. By default anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = NO
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from the
+# version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
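With the format above, a warning printed during a run would look roughly like the following (file, line and message are invented for illustration):

    aom/aom_decoder.h:123: warning: parameter 'ctx' of aom_codec_dec_init_ver is not documented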
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT =
+
+# This tag can be used to specify the character encoding of the source files that
+# doxygen parses. Internally doxygen uses the UTF-8 encoding, which is also the default
+# input encoding. Doxygen uses libiconv (or the iconv built into libc) for the transcoding.
+# See http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the output.
+# The symbol name can be a fully qualified name, a word, or if the wildcard * is used,
+# a substring. Examples: ANamespace, AClass, AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO. If you have enabled CALL_GRAPH or CALLER_GRAPH
+# then you must also enable this option. If you don't then doxygen will produce
+# a warning and turn it on anyway
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES (the default)
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES (the default)
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
+# generated containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
+# Netscape 6.0+, Internet Explorer 5.0+, or Konqueror). Windows users are
+# probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = YES
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = letter
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS = *.h
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
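For illustration (this template leaves the tag empty, and these macro names are invented), entries follow the name or name=definition form described above:

    PREDEFINED = EXAMPLE_FEATURE \
                 EXAMPLE_ABI=3 \
                 EXAMPLE_EXPORT:=

Here EXAMPLE_FEATURE is treated as defined to 1, EXAMPLE_ABI expands to 3, and the := form keeps EXAMPLE_EXPORT from being #undef'd or recursively expanded.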
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
+# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
+# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
+# be found in the default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force
+# the CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
+# generate a call dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
+# generate a caller dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will show a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the number
+# of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, which results in a white background.
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+
+DOT_TRANSPARENT = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE = NO
diff --git a/third_party/aom/mainpage.dox b/third_party/aom/mainpage.dox
new file mode 100644
index 000000000..03a299ae1
--- /dev/null
+++ b/third_party/aom/mainpage.dox
@@ -0,0 +1,52 @@
+/*!\mainpage AOMedia Codec SDK
+
+ \section main_contents Page Contents
+ - \ref main_intro
+ - \ref main_startpoints
+ - \ref main_support
+
+ \section main_intro Introduction
+ Welcome to the AOMedia Codec SDK. This SDK allows you to integrate your
+ applications with the AOM and AV1 video codecs.
+
+ This distribution of the AOMedia Codec SDK includes the following support:
+
+ \if aom_encoder
+ - \ref aom_encoder
+ \endif
+ \if aom_decoder
+ - \ref aom_decoder
+ \endif
+
+
+ \section main_startpoints Starting Points
+ - Consult the \ref changelog for a complete list of improvements in this
+ release.
+ - \ref readme contains instructions on compiling the sample applications.
+ - Read the \ref usage "usage" for a narrative on codec usage.
+ - Read the \ref samples "sample code" for examples of how to interact with the
+ codec.
+ - \ref codec reference
+ \if encoder
+ - \ref encoder reference
+ \endif
+ \if decoder
+ - \ref decoder reference
+ \endif
+
+ \section main_support Support Options & FAQ
+ The AOMedia project is an open source project supported by its community. For
+ questions about this SDK, please mail the apps-devel@webmproject.org list.
+ To contribute, see http://www.webmproject.org/code/contribute and mail
+ codec-devel@webmproject.org.
+*/
+
+/*!\page changelog CHANGELOG
+ \verbinclude CHANGELOG
+*/
+
+/*!\page readme README.md
+ \include README.md
+*/
+
+/*!\defgroup codecs Supported Codecs */
diff --git a/third_party/aom/stats/aomstats.c b/third_party/aom/stats/aomstats.c
new file mode 100644
index 000000000..4a15adf02
--- /dev/null
+++ b/third_party/aom/stats/aomstats.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "stats/aomstats.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/tools_common.h"
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
+ int res;
+ stats->pass = pass;
+
+ if (pass == 0) {
+ stats->file = fopen(fpf, "wb");
+ stats->buf.sz = 0;
+ stats->buf.buf = NULL;
+ res = (stats->file != NULL);
+ } else {
+ size_t nbytes;
+
+ stats->file = fopen(fpf, "rb");
+
+ if (stats->file == NULL) fatal("First-pass stats file does not exist!");
+
+ if (fseek(stats->file, 0, SEEK_END))
+ fatal("First-pass stats file must be seekable!");
+
+ stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
+ rewind(stats->file);
+
+ stats->buf.buf = malloc(stats->buf_alloc_sz);
+
+ if (!stats->buf.buf)
+      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+            (unsigned long)stats->buf_alloc_sz);
+
+ nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
+ res = (nbytes == stats->buf.sz);
+ }
+
+ return res;
+}
+
+int stats_open_mem(stats_io_t *stats, int pass) {
+ int res;
+ stats->pass = pass;
+
+ if (!pass) {
+ stats->buf.sz = 0;
+ stats->buf_alloc_sz = 64 * 1024;
+ stats->buf.buf = malloc(stats->buf_alloc_sz);
+ }
+
+ stats->buf_ptr = stats->buf.buf;
+ res = (stats->buf.buf != NULL);
+ return res;
+}
+
+void stats_close(stats_io_t *stats, int last_pass) {
+ if (stats->file) {
+ if (stats->pass == last_pass) {
+ free(stats->buf.buf);
+ }
+
+ fclose(stats->file);
+ stats->file = NULL;
+ } else {
+ if (stats->pass == last_pass) free(stats->buf.buf);
+ }
+}
+
+void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+ if (stats->file) {
+ (void)fwrite(pkt, 1, len, stats->file);
+ } else {
+ if (stats->buf.sz + len > stats->buf_alloc_sz) {
+ size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
+ char *new_ptr = realloc(stats->buf.buf, new_sz);
+
+ if (new_ptr) {
+ stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+ stats->buf.buf = new_ptr;
+ stats->buf_alloc_sz = new_sz;
+ } else {
+ fatal("Failed to realloc firstpass stats buffer.");
+ }
+ }
+
+ memcpy(stats->buf_ptr, pkt, len);
+ stats->buf.sz += len;
+ stats->buf_ptr += len;
+ }
+}
+
+aom_fixed_buf_t stats_get(stats_io_t *stats) { return stats->buf; }
diff --git a/third_party/aom/stats/aomstats.h b/third_party/aom/stats/aomstats.h
new file mode 100644
index 000000000..b9c71871a
--- /dev/null
+++ b/third_party/aom/stats/aomstats.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_STATS_AOMSTATS_H_
+#define AOM_STATS_AOMSTATS_H_
+
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This structure is used to abstract the different ways of handling
+ * first pass statistics
+ */
+typedef struct {
+ aom_fixed_buf_t buf;
+ int pass;
+ FILE *file;
+ char *buf_ptr;
+ size_t buf_alloc_sz;
+} stats_io_t;
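+
+/*
+ * Illustrative two-pass usage (a sketch based on the functions below, not a
+ * normative API reference):
+ *   Pass 0: stats_open_file(&stats, fpf, 0);
+ *           stats_write(&stats, pkt, len);    // once per first-pass packet
+ *           stats_close(&stats, 1);
+ *   Pass 1: stats_open_file(&stats, fpf, 1);  // reads the whole stats file
+ *           aom_fixed_buf_t buf = stats_get(&stats);
+ *           stats_close(&stats, 1);
+ */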
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass);
+int stats_open_mem(stats_io_t *stats, int pass);
+void stats_close(stats_io_t *stats, int last_pass);
+void stats_write(stats_io_t *stats, const void *pkt, size_t len);
+aom_fixed_buf_t stats_get(stats_io_t *stats);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_STATS_AOMSTATS_H_
diff --git a/third_party/aom/stats/rate_hist.c b/third_party/aom/stats/rate_hist.c
new file mode 100644
index 000000000..71eb78b72
--- /dev/null
+++ b/third_party/aom/stats/rate_hist.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "stats/rate_hist.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <stdio.h>
+#include <math.h>
+
+#define RATE_BINS 100
+#define HIST_BAR_MAX 40
+
+struct hist_bucket {
+ int low;
+ int high;
+ int count;
+};
+
+struct rate_hist {
+ int64_t *pts;
+ int *sz;
+ int samples;
+ int frames;
+ struct hist_bucket bucket[RATE_BINS];
+ int total;
+};
+
+struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg,
+ const aom_rational_t *fps) {
+ int i;
+ struct rate_hist *hist = malloc(sizeof(*hist));
+
+ // Determine the number of samples in the buffer. Use the file's framerate
+ // to determine the number of frames in rc_buf_sz milliseconds, with an
+ // adjustment (5/4) to account for alt-refs
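+  // (e.g. rc_buf_sz = 6000 ms at 30/1 fps:
+  //  6000 * 5 / 4 * 30 / 1000 = 225 samples).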
+ hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000;
+
+ // prevent division by zero
+ if (hist->samples == 0) hist->samples = 1;
+
+ hist->frames = 0;
+ hist->total = 0;
+
+ hist->pts = calloc(hist->samples, sizeof(*hist->pts));
+ hist->sz = calloc(hist->samples, sizeof(*hist->sz));
+ for (i = 0; i < RATE_BINS; i++) {
+ hist->bucket[i].low = INT_MAX;
+ hist->bucket[i].high = 0;
+ hist->bucket[i].count = 0;
+ }
+
+ return hist;
+}
+
+void destroy_rate_histogram(struct rate_hist *hist) {
+ if (hist) {
+ free(hist->pts);
+ free(hist->sz);
+ free(hist);
+ }
+}
+
+void update_rate_histogram(struct rate_hist *hist,
+ const aom_codec_enc_cfg_t *cfg,
+ const aom_codec_cx_pkt_t *pkt) {
+ int i;
+ int64_t then = 0;
+ int64_t avg_bitrate = 0;
+ int64_t sum_sz = 0;
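+  // Convert this packet's pts to milliseconds using the stream timebase,
+  // e.g. pts = 90 with a 1/30 s timebase: 90 * 1000 * 1 / 30 = 3000 ms.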
+ const int64_t now = pkt->data.frame.pts * 1000 *
+ (uint64_t)cfg->g_timebase.num /
+ (uint64_t)cfg->g_timebase.den;
+
+ int idx = hist->frames++ % hist->samples;
+ hist->pts[idx] = now;
+ hist->sz[idx] = (int)pkt->data.frame.sz;
+
+ if (now < cfg->rc_buf_initial_sz) return;
+
+ if (!cfg->rc_target_bitrate) return;
+
+ then = now;
+
+ /* Sum the size over the past rc_buf_sz ms */
+ for (i = hist->frames; i > 0 && hist->frames - i < hist->samples; i--) {
+ const int i_idx = (i - 1) % hist->samples;
+
+ then = hist->pts[i_idx];
+ if (now - then > cfg->rc_buf_sz) break;
+ sum_sz += hist->sz[i_idx];
+ }
+
+ if (now == then) return;
+
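+  // avg_bitrate is in bits/s; the scaling below places the target bitrate in
+  // the middle bucket (RATE_BINS / 2).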
+ avg_bitrate = sum_sz * 8 * 1000 / (now - then);
+ idx = (int)(avg_bitrate * (RATE_BINS / 2) / (cfg->rc_target_bitrate * 1000));
+ if (idx < 0) idx = 0;
+ if (idx > RATE_BINS - 1) idx = RATE_BINS - 1;
+ if (hist->bucket[idx].low > avg_bitrate)
+ hist->bucket[idx].low = (int)avg_bitrate;
+ if (hist->bucket[idx].high < avg_bitrate)
+ hist->bucket[idx].high = (int)avg_bitrate;
+ hist->bucket[idx].count++;
+ hist->total++;
+}
+
+static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets,
+ int *num_buckets) {
+ int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0;
+ int buckets = *num_buckets;
+ int i;
+
+ /* Find the extrema for this list of buckets */
+ big_bucket = small_bucket = 0;
+ for (i = 0; i < buckets; i++) {
+ if (bucket[i].count < bucket[small_bucket].count) small_bucket = i;
+ if (bucket[i].count > bucket[big_bucket].count) big_bucket = i;
+ }
+
+ /* If we have too many buckets, merge the smallest with an adjacent
+ * bucket.
+ */
+ while (buckets > max_buckets) {
+ int last_bucket = buckets - 1;
+
+ /* merge the small bucket with an adjacent one. */
+ if (small_bucket == 0)
+ merge_bucket = 1;
+ else if (small_bucket == last_bucket)
+ merge_bucket = last_bucket - 1;
+ else if (bucket[small_bucket - 1].count < bucket[small_bucket + 1].count)
+ merge_bucket = small_bucket - 1;
+ else
+ merge_bucket = small_bucket + 1;
+
+ assert(abs(merge_bucket - small_bucket) <= 1);
+ assert(small_bucket < buckets);
+ assert(big_bucket < buckets);
+ assert(merge_bucket < buckets);
+
+ if (merge_bucket < small_bucket) {
+ bucket[merge_bucket].high = bucket[small_bucket].high;
+ bucket[merge_bucket].count += bucket[small_bucket].count;
+ } else {
+ bucket[small_bucket].high = bucket[merge_bucket].high;
+ bucket[small_bucket].count += bucket[merge_bucket].count;
+ merge_bucket = small_bucket;
+ }
+
+ assert(bucket[merge_bucket].low != bucket[merge_bucket].high);
+
+ buckets--;
+
+ /* Remove the merge_bucket from the list, and find the new small
+ * and big buckets while we're at it
+ */
+ big_bucket = small_bucket = 0;
+ for (i = 0; i < buckets; i++) {
+ if (i > merge_bucket) bucket[i] = bucket[i + 1];
+
+ if (bucket[i].count < bucket[small_bucket].count) small_bucket = i;
+ if (bucket[i].count > bucket[big_bucket].count) big_bucket = i;
+ }
+ }
+
+ *num_buckets = buckets;
+ return bucket[big_bucket].count;
+}
+
+static void show_histogram(const struct hist_bucket *bucket, int buckets,
+ int total, int scale) {
+ const char *pat1, *pat2;
+ int i;
+
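+  /* Pick printf field widths from the number of decimal digits in the
+   * largest bucket boundary.
+   */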
+ switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) {
+ case 1:
+ case 2:
+ pat1 = "%4d %2s: ";
+ pat2 = "%4d-%2d: ";
+ break;
+ case 3:
+ pat1 = "%5d %3s: ";
+ pat2 = "%5d-%3d: ";
+ break;
+ case 4:
+ pat1 = "%6d %4s: ";
+ pat2 = "%6d-%4d: ";
+ break;
+ case 5:
+ pat1 = "%7d %5s: ";
+ pat2 = "%7d-%5d: ";
+ break;
+ case 6:
+ pat1 = "%8d %6s: ";
+ pat2 = "%8d-%6d: ";
+ break;
+ case 7:
+ pat1 = "%9d %7s: ";
+ pat2 = "%9d-%7d: ";
+ break;
+ default:
+ pat1 = "%12d %10s: ";
+ pat2 = "%12d-%10d: ";
+ break;
+ }
+
+ for (i = 0; i < buckets; i++) {
+ int len;
+ int j;
+ float pct;
+
+ pct = (float)(100.0 * bucket[i].count / total);
+ len = HIST_BAR_MAX * bucket[i].count / scale;
+ if (len < 1) len = 1;
+ assert(len <= HIST_BAR_MAX);
+
+ if (bucket[i].low == bucket[i].high)
+ fprintf(stderr, pat1, bucket[i].low, "");
+ else
+ fprintf(stderr, pat2, bucket[i].low, bucket[i].high);
+
+ for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " ");
+ fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
+ }
+}
+
+void show_q_histogram(const int counts[64], int max_buckets) {
+ struct hist_bucket bucket[64];
+ int buckets = 0;
+ int total = 0;
+ int scale;
+ int i;
+
+ for (i = 0; i < 64; i++) {
+ if (counts[i]) {
+ bucket[buckets].low = bucket[buckets].high = i;
+ bucket[buckets].count = counts[i];
+ buckets++;
+ total += counts[i];
+ }
+ }
+
+ fprintf(stderr, "\nQuantizer Selection:\n");
+ scale = merge_hist_buckets(bucket, max_buckets, &buckets);
+ show_histogram(bucket, buckets, total, scale);
+}
+
+void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg,
+ int max_buckets) {
+ int i, scale;
+ int buckets = 0;
+
+ for (i = 0; i < RATE_BINS; i++) {
+ if (hist->bucket[i].low == INT_MAX) continue;
+ hist->bucket[buckets++] = hist->bucket[i];
+ }
+
+ fprintf(stderr, "\nRate (over %dms window):\n", cfg->rc_buf_sz);
+ scale = merge_hist_buckets(hist->bucket, max_buckets, &buckets);
+ show_histogram(hist->bucket, buckets, hist->total, scale);
+}
diff --git a/third_party/aom/stats/rate_hist.h b/third_party/aom/stats/rate_hist.h
new file mode 100644
index 000000000..55b8c5d43
--- /dev/null
+++ b/third_party/aom/stats/rate_hist.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_STATS_RATE_HIST_H_
+#define AOM_STATS_RATE_HIST_H_
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rate_hist;
+
+struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg,
+ const aom_rational_t *fps);
+
+void destroy_rate_histogram(struct rate_hist *hist);
+
+void update_rate_histogram(struct rate_hist *hist,
+ const aom_codec_enc_cfg_t *cfg,
+ const aom_codec_cx_pkt_t *pkt);
+
+void show_q_histogram(const int counts[64], int max_buckets);
+
+void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg,
+ int max_buckets);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_STATS_RATE_HIST_H_
diff --git a/third_party/aom/test/accounting_test.cc b/third_party/aom/test/accounting_test.cc
new file mode 100644
index 000000000..8b5c8af13
--- /dev/null
+++ b/third_party/aom/test/accounting_test.cc
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+TEST(AV1, TestAccounting) {
+ const int kBufferSize = 10000;
+ const int kSymbols = 1024;
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ aom_start_encode(&bw, bw_buffer);
+ for (int i = 0; i < kSymbols; i++) {
+ aom_write(&bw, 0, 32);
+ aom_write(&bw, 0, 32);
+ aom_write(&bw, 0, 32);
+ }
+ aom_stop_encode(&bw);
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+
+ Accounting accounting;
+ aom_accounting_init(&accounting);
+ br.accounting = &accounting;
+ for (int i = 0; i < kSymbols; i++) {
+ aom_read(&br, 32, "A");
+ }
+ // Consecutive symbols that are the same are coalesced.
+ GTEST_ASSERT_EQ(accounting.syms.num_syms, 1);
+ GTEST_ASSERT_EQ(accounting.syms.syms[0].samples, (unsigned int)kSymbols);
+
+ aom_accounting_reset(&accounting);
+ GTEST_ASSERT_EQ(accounting.syms.num_syms, 0);
+
+ // Should record 2 * kSymbols accounting symbols.
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ br.accounting = &accounting;
+ for (int i = 0; i < kSymbols; i++) {
+ aom_read(&br, 32, "A");
+ aom_read(&br, 32, "B");
+ aom_read(&br, 32, "B");
+ }
+ GTEST_ASSERT_EQ(accounting.syms.num_syms, kSymbols * 2);
+ uint32_t tell_frac = aom_reader_tell_frac(&br);
+ for (int i = 0; i < accounting.syms.num_syms; i++) {
+ tell_frac -= accounting.syms.syms[i].bits;
+ }
+ GTEST_ASSERT_EQ(tell_frac, 0U);
+
+ GTEST_ASSERT_EQ(aom_accounting_dictionary_lookup(&accounting, "A"),
+ aom_accounting_dictionary_lookup(&accounting, "A"));
+
+ // Check for collisions. The current aom_accounting_hash function returns
+ // the same hash code for AB and BA.
+ GTEST_ASSERT_NE(aom_accounting_dictionary_lookup(&accounting, "AB"),
+ aom_accounting_dictionary_lookup(&accounting, "BA"));
+}
diff --git a/third_party/aom/test/acm_random.h b/third_party/aom/test/acm_random.h
new file mode 100644
index 000000000..0a8317fd5
--- /dev/null
+++ b/third_party/aom/test/acm_random.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_ACM_RANDOM_H_
+#define AOM_TEST_ACM_RANDOM_H_
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_integer.h"
+
+namespace libaom_test {
+
+class ACMRandom {
+ public:
+ ACMRandom() : random_(DeterministicSeed()) {}
+
+ explicit ACMRandom(int seed) : random_(seed) {}
+
+ void Reset(int seed) { random_.Reseed(seed); }
+
+ uint32_t Rand31(void) {
+ return random_.Generate(testing::internal::Random::kMaxRange);
+ }
+
+ uint16_t Rand16(void) {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ return (value >> 15) & 0xffff;
+ }
+
+ int16_t Rand15Signed(void) {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ return (value >> 17) & 0xffff;
+ }
+
+ uint16_t Rand12(void) {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 19) & 0xfff;
+ }
+
+ int16_t Rand9Signed(void) {
+    // Use 9 bits: values in the range [-256, 255].
+ const uint32_t value = random_.Generate(512);
+ return static_cast<int16_t>(value) - 256;
+ }
+
+ uint8_t Rand8(void) {
+ const uint32_t value =
+ random_.Generate(testing::internal::Random::kMaxRange);
+ // There's a bit more entropy in the upper bits of this implementation.
+ return (value >> 23) & 0xff;
+ }
+
+ uint8_t Rand8Extremes(void) {
+ // Returns a random value near 0 or near 255, to better exercise
+ // saturation behavior.
+ const uint8_t r = Rand8();
+ return r < 128 ? r << 4 : r >> 4;
+ }
+
+ int PseudoUniform(int range) { return random_.Generate(range); }
+
+ int operator()(int n) { return PseudoUniform(n); }
+
+ static int DeterministicSeed(void) { return 0xbaba; }
+
+ private:
+ testing::internal::Random random_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_ACM_RANDOM_H_
diff --git a/third_party/aom/test/active_map_test.cc b/third_party/aom/test/active_map_test.cc
new file mode 100644
index 000000000..a2b0546ed
--- /dev/null
+++ b/third_party/aom/test/active_map_test.cc
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class ActiveMapTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ static const int kWidth = 208;
+ static const int kHeight = 144;
+
+ ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
+ virtual ~ActiveMapTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ cpu_used_ = GET_PARAM(2);
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ } else if (video->frame() == 3) {
+ aom_active_map_t map = aom_active_map_t();
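+      // 13 x 9 map entries: one per 16x16 unit of the 208x144 frame, matching
+      // the map.cols / map.rows computation below.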
+ /* clang-format off */
+ uint8_t active_map[9 * 13] = {
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
+ 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
+ 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+ };
+ /* clang-format on */
+ map.cols = (kWidth + 15) / 16;
+ map.rows = (kHeight + 15) / 16;
+ ASSERT_EQ(map.cols, 13u);
+ ASSERT_EQ(map.rows, 9u);
+ map.active_map = active_map;
+ encoder->Control(AOME_SET_ACTIVEMAP, &map);
+ } else if (video->frame() == 15) {
+ aom_active_map_t map = aom_active_map_t();
+ map.cols = (kWidth + 15) / 16;
+ map.rows = (kHeight + 15) / 16;
+ map.active_map = NULL;
+ encoder->Control(AOME_SET_ACTIVEMAP, &map);
+ }
+ }
+
+ void DoTest() {
+    // Validate that this clip, whose width is not a multiple of 64, encodes.
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_resize_mode = 0;
+ cfg_.g_pass = AOM_RC_ONE_PASS;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.kf_max_dist = 90000;
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30,
+ 1, 0, 20);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ int cpu_used_;
+};
+
+TEST_P(ActiveMapTest, Test) { DoTest(); }
+
+class ActiveMapTestLarge : public ActiveMapTest {};
+
+TEST_P(ActiveMapTestLarge, Test) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(ActiveMapTestLarge,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(0, 5));
+
+AV1_INSTANTIATE_TEST_CASE(ActiveMapTest,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(5, 9));
+
+} // namespace
diff --git a/third_party/aom/test/altref_test.cc b/third_party/aom/test/altref_test.cc
new file mode 100644
index 000000000..dabb1475a
--- /dev/null
+++ b/third_party/aom/test/altref_test.cc
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+namespace {
+
+class AltRefForcedKeyTestLarge
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AltRefForcedKeyTestLarge()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
+ virtual ~AltRefForcedKeyTestLarge() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_threads = 0;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+#if CONFIG_AV1_ENCODER
+ // override test default for tile columns if necessary.
+ if (GET_PARAM(0) == &libaom_test::kAV1) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+ }
+#endif
+ }
+ frame_flags_ =
+ (video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (frame_num_ == forced_kf_frame_num_) {
+ ASSERT_TRUE(!!(pkt->data.frame.flags & AOM_FRAME_IS_KEY))
+ << "Frame #" << frame_num_ << " isn't a keyframe!";
+ }
+ ++frame_num_;
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ int cpu_used_;
+ unsigned int forced_kf_frame_num_;
+ unsigned int frame_num_;
+};
+
+TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
+ const aom_rational timebase = { 1, 30 };
+ const int lag_values[] = { 3, 15, 25, -1 };
+
+ forced_kf_frame_num_ = 1;
+ for (int i = 0; lag_values[i] != -1; ++i) {
+ frame_num_ = 0;
+ cfg_.g_lag_in_frames = lag_values[i];
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 30);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+}
+
+TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
+ const aom_rational timebase = { 1, 30 };
+ const int lag_values[] = { 3, 15, 25, -1 };
+
+ for (int i = 0; lag_values[i] != -1; ++i) {
+ frame_num_ = 0;
+ forced_kf_frame_num_ = lag_values[i] - 1;
+ cfg_.g_lag_in_frames = lag_values[i];
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 30);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+}
+
+AV1_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood),
+ ::testing::Values(2, 5));
+
+} // namespace
diff --git a/third_party/aom/test/aom_integer_test.cc b/third_party/aom/test/aom_integer_test.cc
new file mode 100644
index 000000000..fe88a54e9
--- /dev/null
+++ b/third_party/aom/test/aom_integer_test.cc
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+const uint64_t kMaximumLeb128CodedSize = 8;
+const uint8_t kLeb128PadByte = 0x80; // Binary: 10000000
+const uint64_t kMaximumLeb128Value = UINT32_MAX;
+const uint32_t kSizeTestNumValues = 6;
+const uint32_t kSizeTestExpectedSizes[kSizeTestNumValues] = {
+ 1, 1, 2, 3, 4, 5
+};
+const uint64_t kSizeTestInputs[kSizeTestNumValues] = {
+ 0, 0x7f, 0x3fff, 0x1fffff, 0xffffff, 0x10000000
+};
+
+const uint8_t kOutOfRangeLeb128Value[5] = { 0x80, 0x80, 0x80, 0x80,
+ 0x10 }; // UINT32_MAX + 1
+} // namespace
+
+TEST(AomLeb128, DecodeTest) {
+ const size_t num_leb128_bytes = 3;
+ const uint8_t leb128_bytes[num_leb128_bytes] = { 0xE5, 0x8E, 0x26 };
+ const uint64_t expected_value = 0x98765; // 624485
+ const size_t expected_length = 3;
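+  // LEB128: each byte contributes its low 7 bits, least-significant group
+  // first: 0x65 + (0x0E << 7) + (0x26 << 14) = 101 + 1792 + 622592 = 624485.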
+ uint64_t value = ~0ULL; // make sure value is cleared by the function
+ size_t length;
+ ASSERT_EQ(
+ aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes, &value, &length), 0);
+ ASSERT_EQ(expected_value, value);
+ ASSERT_EQ(expected_length, length);
+
+  // Make sure the decoder stops at the terminating byte (the byte without the
+  // continuation bit set), even when told more bytes are available.
+ aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes + 1, &value, &length);
+ ASSERT_EQ(expected_value, value);
+ ASSERT_EQ(expected_length, length);
+}
+
+TEST(AomLeb128, EncodeTest) {
+ const uint32_t test_value = 0x98765; // 624485
+ const uint8_t expected_bytes[3] = { 0xE5, 0x8E, 0x26 };
+ const size_t kWriteBufferSize = 4;
+ uint8_t write_buffer[kWriteBufferSize] = { 0 };
+ size_t bytes_written = 0;
+ ASSERT_EQ(aom_uleb_encode(test_value, kWriteBufferSize, &write_buffer[0],
+ &bytes_written),
+ 0);
+ ASSERT_EQ(bytes_written, 3u);
+ for (size_t i = 0; i < bytes_written; ++i) {
+ ASSERT_EQ(write_buffer[i], expected_bytes[i]);
+ }
+}
+
+TEST(AomLeb128, EncodeDecodeTest) {
+ const uint32_t value = 0x98765; // 624485
+ const size_t kWriteBufferSize = 4;
+ uint8_t write_buffer[kWriteBufferSize] = { 0 };
+ size_t bytes_written = 0;
+ ASSERT_EQ(aom_uleb_encode(value, kWriteBufferSize, &write_buffer[0],
+ &bytes_written),
+ 0);
+ ASSERT_EQ(bytes_written, 3u);
+ uint64_t decoded_value;
+ size_t decoded_length;
+ aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value,
+ &decoded_length);
+ ASSERT_EQ(value, decoded_value);
+ ASSERT_EQ(bytes_written, decoded_length);
+}
+
+TEST(AomLeb128, FixedSizeEncodeTest) {
+ const uint32_t test_value = 0x123;
+ const uint8_t expected_bytes[4] = { 0xa3, 0x82, 0x80, 0x00 };
+ const size_t kWriteBufferSize = 4;
+ uint8_t write_buffer[kWriteBufferSize] = { 0 };
+ size_t bytes_written = 0;
+ ASSERT_EQ(0, aom_uleb_encode_fixed_size(test_value, kWriteBufferSize,
+ kWriteBufferSize, &write_buffer[0],
+ &bytes_written));
+ ASSERT_EQ(kWriteBufferSize, bytes_written);
+ for (size_t i = 0; i < bytes_written; ++i) {
+ ASSERT_EQ(write_buffer[i], expected_bytes[i]);
+ }
+}
+
+TEST(AomLeb128, FixedSizeEncodeDecodeTest) {
+ const uint32_t value = 0x1;
+ const size_t kWriteBufferSize = 4;
+ uint8_t write_buffer[kWriteBufferSize] = { 0 };
+ size_t bytes_written = 0;
+ ASSERT_EQ(
+ aom_uleb_encode_fixed_size(value, kWriteBufferSize, kWriteBufferSize,
+ &write_buffer[0], &bytes_written),
+ 0);
+ ASSERT_EQ(bytes_written, 4u);
+ uint64_t decoded_value;
+ size_t decoded_length;
+ aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value,
+ &decoded_length);
+ ASSERT_EQ(value, decoded_value);
+ ASSERT_EQ(bytes_written, decoded_length);
+}
+
+TEST(AomLeb128, SizeTest) {
+ for (size_t i = 0; i < kSizeTestNumValues; ++i) {
+ ASSERT_EQ(kSizeTestExpectedSizes[i],
+ aom_uleb_size_in_bytes(kSizeTestInputs[i]));
+ }
+}
+
+TEST(AomLeb128, DecodeFailTest) {
+ // Input buffer containing what would be a valid 9 byte LEB128 encoded
+ // unsigned integer.
+ const uint8_t kAllPadBytesBuffer[kMaximumLeb128CodedSize + 1] = {
+ kLeb128PadByte, kLeb128PadByte, kLeb128PadByte,
+ kLeb128PadByte, kLeb128PadByte, kLeb128PadByte,
+ kLeb128PadByte, kLeb128PadByte, 0
+ };
+ uint64_t decoded_value;
+
+  // Test that decode fails when the result would be a valid 9-byte integer.
+ ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize + 1,
+ &decoded_value, NULL),
+ -1);
+
+ // Test that encoded value missing terminator byte within available buffer
+ // range causes decode error.
+ ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize,
+ &decoded_value, NULL),
+ -1);
+
+ // Test that LEB128 input that decodes to a value larger than 32-bits fails.
+ size_t value_size = 0;
+ ASSERT_EQ(aom_uleb_decode(&kOutOfRangeLeb128Value[0],
+ sizeof(kOutOfRangeLeb128Value), &decoded_value,
+ &value_size),
+ -1);
+}
+
+TEST(AomLeb128, EncodeFailTest) {
+ const size_t kWriteBufferSize = 4;
+ const uint32_t kValidTestValue = 1;
+ uint8_t write_buffer[kWriteBufferSize] = { 0 };
+ size_t coded_size = 0;
+ ASSERT_EQ(
+ aom_uleb_encode(kValidTestValue, kWriteBufferSize, NULL, &coded_size),
+ -1);
+ ASSERT_EQ(aom_uleb_encode(kValidTestValue, kWriteBufferSize, &write_buffer[0],
+ NULL),
+ -1);
+
+ const uint32_t kValueOutOfRangeForBuffer = 0xFFFFFFFF;
+ ASSERT_EQ(aom_uleb_encode(kValueOutOfRangeForBuffer, kWriteBufferSize,
+ &write_buffer[0], &coded_size),
+ -1);
+
+ const uint64_t kValueOutOfRange = kMaximumLeb128Value + 1;
+ ASSERT_EQ(aom_uleb_encode(kValueOutOfRange, kWriteBufferSize,
+ &write_buffer[0], &coded_size),
+ -1);
+
+ const size_t kPadSizeOutOfRange = 5;
+ ASSERT_EQ(aom_uleb_encode_fixed_size(kValidTestValue, kWriteBufferSize,
+ kPadSizeOutOfRange, &write_buffer[0],
+ &coded_size),
+ -1);
+}
diff --git a/third_party/aom/test/aomcx_set_ref.sh b/third_party/aom/test/aomcx_set_ref.sh
new file mode 100755
index 000000000..f51b73c58
--- /dev/null
+++ b/third_party/aom/test/aomcx_set_ref.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom aom_cx_set_ref example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to aom_cx_set_ref_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+aom_cx_set_ref_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+}
+
+# Runs aom_cx_set_ref and updates the reference frame before encoding the frame
+# selected by ref_frame_num below.
+# $1 is the codec name, which aom_cx_set_ref does not support at present: it is
+# currently used only to name the output file.
+# TODO(tomfinegan): Pass the codec param once the example is updated to support
+# AV1.
+aom_set_ref() {
+ local encoder="${LIBAOM_BIN_PATH}/aom_cx_set_ref${AOM_TEST_EXE_SUFFIX}"
+ local codec="$1"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/aom_cx_set_ref_${codec}.ivf"
+ local ref_frame_num=4
+ local limit=10
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+ "${ref_frame_num}" "${limit}" ${devnull}
+
+ [ -e "${output_file}" ] || return 1
+}
+
+aom_cx_set_ref_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ aom_set_ref av1 || return 1
+ fi
+}
+
+aom_cx_set_ref_tests="aom_cx_set_ref_av1"
+
+run_tests aom_cx_set_ref_verify_environment "${aom_cx_set_ref_tests}"
+
diff --git a/third_party/aom/test/aomdec.sh b/third_party/aom/test/aomdec.sh
new file mode 100755
index 000000000..927142287
--- /dev/null
+++ b/third_party/aom/test/aomdec.sh
@@ -0,0 +1,147 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests aomdec. To add new tests to this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to aomdec_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available.
+aomdec_verify_environment() {
+ if [ "$(av1_encode_available)" != "yes" ] ; then
+ if [ ! -e "${AV1_IVF_FILE}" ] || \
+ [ ! -e "${AV1_OBU_ANNEXB_FILE}" ] || \
+ [ ! -e "${AV1_OBU_SEC5_FILE}" ] || \
+ [ ! -e "${AV1_WEBM_FILE}" ]; then
+      elog "Libaom test data must exist before running this test script" \
+           "when encoding is disabled."
+ return 1
+ fi
+ fi
+ if [ -z "$(aom_tool_path aomdec)" ]; then
+ elog "aomdec not found. It must exist in LIBAOM_BIN_PATH or its parent."
+ return 1
+ fi
+}
+
+# Wrapper function for running aomdec with pipe input. Requires that
+# LIBAOM_BIN_PATH points to the directory containing aomdec. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to aomdec.
+aomdec_pipe() {
+ local input="$1"
+ shift
+ if [ ! -e "${input}" ]; then
+ elog "Input file ($input) missing in aomdec_pipe()"
+ return 1
+ fi
+  cat "${input}" | aomdec - "$@" ${devnull}
+}
+
+
+# Wrapper function for running aomdec. Requires that LIBAOM_BIN_PATH points to
+# the directory containing aomdec. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to aomdec.
+aomdec() {
+ local decoder="$(aom_tool_path aomdec)"
+ local input="$1"
+ shift
+ eval "${AOM_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
+}
+
+aomdec_can_decode_av1() {
+ if [ "$(av1_decode_available)" = "yes" ]; then
+ echo yes
+ fi
+}
+
+aomdec_av1_ivf() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local file="${AV1_IVF_FILE}"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}" --ivf
+ fi
+ aomdec "${AV1_IVF_FILE}" --summary --noblit
+ fi
+}
+
+aomdec_av1_ivf_error_resilient() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local file="av1.error-resilient.ivf"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1
+ fi
+ aomdec "${file}" --summary --noblit
+ fi
+}
+
+aomdec_av1_ivf_multithread() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local file="${AV1_IVF_FILE}"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}" --ivf
+ fi
+ for threads in 2 3 4 5 6 7 8; do
+ aomdec "${file}" --summary --noblit --threads=$threads
+ done
+ fi
+}
+
+aomdec_aom_ivf_pipe_input() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local file="${AV1_IVF_FILE}"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}" --ivf
+ fi
+ aomdec_pipe "${AV1_IVF_FILE}" --summary --noblit
+ fi
+}
+
+aomdec_av1_obu_annexb() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local file="${AV1_OBU_ANNEXB_FILE}"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}" --obu --annexb=1
+ fi
+ aomdec "${file}" --summary --noblit --annexb
+ fi
+}
+
+aomdec_av1_obu_section5() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+ local file="${AV1_OBU_SEC5_FILE}"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}" --obu
+ fi
+ aomdec "${file}" --summary --noblit
+ fi
+}
+
+aomdec_av1_webm() {
+ if [ "$(aomdec_can_decode_av1)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local file="${AV1_WEBM_FILE}"
+ if [ ! -e "${file}" ]; then
+ encode_yuv_raw_input_av1 "${file}"
+ fi
+ aomdec "${AV1_WEBM_FILE}" --summary --noblit
+ fi
+}
+
+aomdec_tests="aomdec_av1_ivf
+ aomdec_av1_ivf_error_resilient
+ aomdec_av1_ivf_multithread
+ aomdec_aom_ivf_pipe_input
+ aomdec_av1_obu_annexb
+ aomdec_av1_obu_section5
+ aomdec_av1_webm"
+
+run_tests aomdec_verify_environment "${aomdec_tests}"
diff --git a/third_party/aom/test/aomenc.sh b/third_party/aom/test/aomenc.sh
new file mode 100755
index 000000000..b030397a3
--- /dev/null
+++ b/third_party/aom/test/aomenc.sh
@@ -0,0 +1,269 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests aomenc using hantro_collage_w352h288.yuv as input. To add
+## new tests to this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to aomenc_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available.
+aomenc_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+ if [ ! -e "${Y4M_NOSQ_PAR_INPUT}" ]; then
+ elog "The file ${Y4M_NOSQ_PAR_INPUT##*/} must exist in"
+ elog "LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ fi
+ if [ -z "$(aom_tool_path aomenc)" ]; then
+ elog "aomenc not found. It must exist in LIBAOM_BIN_PATH or its parent."
+ return 1
+ fi
+}
+
+aomenc_can_encode_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ echo yes
+ fi
+}
+
+# Utilities that echo aomenc input file parameters.
+y4m_input_non_square_par() {
+  echo "${Y4M_NOSQ_PAR_INPUT}"
+}
+
+y4m_input_720p() {
+  echo "${Y4M_720P_INPUT}"
+}
+
+# Wrapper function for running aomenc with pipe input. Requires that
+# LIBAOM_BIN_PATH points to the directory containing aomenc. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to aomenc.
+aomenc_pipe() {
+ local encoder="$(aom_tool_path aomenc)"
+ local input="$1"
+ shift
+ cat "${input}" | eval "${AOM_TEST_PREFIX}" "${encoder}" - \
+ --test-decode=fatal \
+ "$@" ${devnull}
+}
+
+# Wrapper function for running aomenc. Requires that LIBAOM_BIN_PATH points to
+# the directory containing aomenc. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to aomenc.
+aomenc() {
+ local encoder="$(aom_tool_path aomenc)"
+ local input="$1"
+ shift
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${input}" \
+ --test-decode=fatal \
+ "$@" ${devnull}
+}
+
+aomenc_av1_ivf() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+ local output="${AV1_IVF_FILE}"
+ if [ -e "${AV1_IVF_FILE}" ]; then
+ output="${AOM_TEST_OUTPUT_DIR}/av1_test.ivf"
+ fi
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --ivf \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+aomenc_av1_obu_annexb() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+ local output="${AV1_OBU_ANNEXB_FILE}"
+ if [ -e "${AV1_OBU_ANNEXB_FILE}" ]; then
+ output="${AOM_TEST_OUTPUT_DIR}/av1_test.annexb.obu"
+ fi
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --obu \
+ --annexb=1 \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+aomenc_av1_obu_section5() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+ local output="${AV1_OBU_SEC5_FILE}"
+ if [ -e "${AV1_OBU_SEC5_FILE}" ]; then
+ output="${AOM_TEST_OUTPUT_DIR}/av1_test.section5.obu"
+ fi
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --obu \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+aomenc_av1_webm() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local output="${AV1_WEBM_FILE}"
+ if [ -e "${AV1_WEBM_FILE}" ]; then
+ output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm"
+ fi
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+aomenc_av1_webm_1pass() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm"
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --passes=1 \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+aomenc_av1_ivf_lossless() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+ local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf"
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --ivf \
+ --output="${output}" \
+ --lossless=1
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+aomenc_av1_ivf_minq0_maxq0() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+ local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf"
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --ivf \
+ --output="${output}" \
+ --min-q=0 \
+ --max-q=0
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+aomenc_av1_webm_lag5_frames10() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local lag_total_frames=10
+ local lag_frames=5
+ local output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm"
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --limit=${lag_total_frames} \
+ --lag-in-frames=${lag_frames} \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+# TODO(fgalligan): Test that DisplayWidth is different than video width.
+aomenc_av1_webm_non_square_par() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm"
+ aomenc $(y4m_input_non_square_par) \
+ $(aomenc_encode_test_fast_params) \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+aomenc_av1_webm_cdf_update_mode() {
+ if [ "$(aomenc_can_encode_av1)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ for mode in 0 1 2; do
+ local output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm"
+ aomenc $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --cdf-update-mode=${mode} \
+ --output="${output}"
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ done
+ fi
+}
+
+aomenc_tests="aomenc_av1_ivf
+ aomenc_av1_obu_annexb
+ aomenc_av1_obu_section5
+ aomenc_av1_webm
+ aomenc_av1_webm_1pass
+ aomenc_av1_ivf_lossless
+ aomenc_av1_ivf_minq0_maxq0
+ aomenc_av1_webm_lag5_frames10
+ aomenc_av1_webm_non_square_par
+ aomenc_av1_webm_cdf_update_mode"
+
+run_tests aomenc_verify_environment "${aomenc_tests}"
diff --git a/third_party/aom/test/aq_segment_test.cc b/third_party/aom/test/aq_segment_test.cc
new file mode 100644
index 000000000..bbb5027d4
--- /dev/null
+++ b/third_party/aom/test/aq_segment_test.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class AqSegmentTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
+ virtual ~AqSegmentTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ set_cpu_used_ = GET_PARAM(2);
+ aq_mode_ = 0;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+ encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_);
+ encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+ }
+ }
+
+ void DoTest(int aq_mode) {
+ aq_mode_ = aq_mode;
+ deltaq_mode_ = 0;
+ cfg_.kf_max_dist = 12;
+ cfg_.rc_min_quantizer = 8;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 6;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_target_bitrate = 300;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 15);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ int set_cpu_used_;
+ int aq_mode_;
+ int deltaq_mode_;
+};
+
+// Validate that this AQ segmentation mode (AQ=1, variance_aq)
+// encodes and decodes without a mismatch.
+TEST_P(AqSegmentTest, TestNoMisMatchAQ1) { DoTest(1); }
+
+// Validate that this AQ segmentation mode (AQ=2, complexity_aq)
+// encodes and decodes without a mismatch.
+TEST_P(AqSegmentTest, TestNoMisMatchAQ2) { DoTest(2); }
+
+// Validate that this AQ segmentation mode (AQ=3, cyclic_refresh_aq)
+// encodes and decodes without a mismatch.
+TEST_P(AqSegmentTest, TestNoMisMatchAQ3) { DoTest(3); }
+
+class AqSegmentTestLarge : public AqSegmentTest {};
+
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ1) { DoTest(1); }
+
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ2) { DoTest(2); }
+
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ3) { DoTest(3); }
+
+// Validate that this delta q mode
+// encodes and decodes without a mismatch.
+TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) {
+ cfg_.rc_end_usage = AOM_CQ;
+ aq_mode_ = 0;
+ deltaq_mode_ = 2;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 15);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_CASE(AqSegmentTest,
+ ::testing::Values(::libaom_test::kRealTime,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(5, 9));
+AV1_INSTANTIATE_TEST_CASE(AqSegmentTestLarge,
+ ::testing::Values(::libaom_test::kRealTime,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(3, 5));
+} // namespace
diff --git a/third_party/aom/test/arf_freq_test.cc b/third_party/aom/test/arf_freq_test.cc
new file mode 100644
index 000000000..083f4022f
--- /dev/null
+++ b/third_party/aom/test/arf_freq_test.cc
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "av1/encoder/ratectrl.h"
+
+namespace {
+
+const unsigned int kFrames = 100;
+const int kBitrate = 500;
+
+#define ARF_NOT_SEEN 1000001
+#define ARF_SEEN_ONCE 1000000
+
+typedef struct {
+ const char *filename;
+ unsigned int width;
+ unsigned int height;
+ unsigned int framerate_num;
+ unsigned int framerate_den;
+ unsigned int input_bit_depth;
+ aom_img_fmt fmt;
+ aom_bit_depth_t bit_depth;
+ unsigned int profile;
+} TestVideoParam;
+
+typedef struct {
+ libaom_test::TestMode mode;
+ int cpu_used;
+} TestEncodeParam;
+
+const TestVideoParam kTestVectors[] = {
+ // artificially increase framerate to trigger default check
+ { "hantro_collage_w352h288.yuv", 352, 288, 5000, 1, 8, AOM_IMG_FMT_I420,
+ AOM_BITS_8, 0 },
+ { "hantro_collage_w352h288.yuv", 352, 288, 30, 1, 8, AOM_IMG_FMT_I420,
+ AOM_BITS_8, 0 },
+ { "rush_hour_444.y4m", 352, 288, 30, 1, 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+ // Add list of profile 2/3 test videos here ...
+};
+
+const TestEncodeParam kEncodeVectors[] = {
+ { ::libaom_test::kOnePassGood, 2 }, { ::libaom_test::kOnePassGood, 5 },
+ { ::libaom_test::kTwoPassGood, 1 }, { ::libaom_test::kTwoPassGood, 2 },
+ { ::libaom_test::kTwoPassGood, 5 }, { ::libaom_test::kRealTime, 5 },
+};
+
+const int kMinArfVectors[] = {
+ // NOTE: 0 refers to the default built-in logic in:
+ // av1_rc_get_default_min_gf_interval(...)
+ 0, 4, 8, 12, 15
+};
+
+int is_extension_y4m(const char *filename) {
+ const char *dot = strrchr(filename, '.');
+ if (!dot || dot == filename)
+ return 0;
+ else
+ return !strcmp(dot, ".y4m");
+}
+
+class ArfFreqTestLarge
+ : public ::libaom_test::CodecTestWith3Params<TestVideoParam,
+ TestEncodeParam, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ArfFreqTestLarge()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {}
+
+ virtual ~ArfFreqTestLarge() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(test_encode_param_.mode);
+ if (test_encode_param_.mode != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_end_usage = AOM_VBR;
+ } else {
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ }
+ }
+
+ virtual void BeginPassHook(unsigned int) {
+ min_run_ = ARF_NOT_SEEN;
+ run_of_visible_frames_ = 0;
+ }
+
+ int GetNumFramesInPkt(const aom_codec_cx_pkt_t *pkt) {
+ const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf);
+ const uint8_t marker = buffer[pkt->data.frame.sz - 1];
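+    // Superframe index marker byte, as parsed here: 0b110mmfff, where mm + 1
+    // is the number of bytes per frame size field (mag) and fff + 1 is the
+    // frame count.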
+ const int mag = ((marker >> 3) & 3) + 1;
+ int frames = (marker & 0x7) + 1;
+ const unsigned int index_sz = 2 + mag * frames;
+ // Check for superframe or not.
+ // Assume superframe has only one visible frame, the rest being
+ // invisible. If superframe index is not found, then there is only
+ // one frame.
+ if (!((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz &&
+ buffer[pkt->data.frame.sz - index_sz] == marker)) {
+ frames = 1;
+ }
+ return frames;
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return;
+ const int frames = GetNumFramesInPkt(pkt);
+ if (frames == 1) {
+ run_of_visible_frames_++;
+ } else if (frames == 2) {
+ if (min_run_ == ARF_NOT_SEEN) {
+ min_run_ = ARF_SEEN_ONCE;
+ } else if (min_run_ == ARF_SEEN_ONCE ||
+ run_of_visible_frames_ < min_run_) {
+ min_run_ = run_of_visible_frames_;
+ }
+ run_of_visible_frames_ = 1;
+ } else {
+ min_run_ = 0;
+ run_of_visible_frames_ = 1;
+ }
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+ encoder->Control(AOME_SET_CPUUSED, test_encode_param_.cpu_used);
+ encoder->Control(AV1E_SET_MIN_GF_INTERVAL, min_arf_requested_);
+ if (test_encode_param_.mode != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+ }
+
+ int GetMinVisibleRun() const { return min_run_; }
+
+ int GetMinArfDistanceRequested() const {
+ if (min_arf_requested_)
+ return min_arf_requested_;
+ else
+ return av1_rc_get_default_min_gf_interval(
+ test_video_param_.width, test_video_param_.height,
+ (double)test_video_param_.framerate_num /
+ test_video_param_.framerate_den);
+ }
+
+ TestVideoParam test_video_param_;
+ TestEncodeParam test_encode_param_;
+
+ private:
+ int min_arf_requested_;
+ int min_run_;
+ int run_of_visible_frames_;
+};
+
+TEST_P(ArfFreqTestLarge, MinArfFreqTest) {
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ testing::internal::scoped_ptr<libaom_test::VideoSource> video;
+ if (is_extension_y4m(test_video_param_.filename)) {
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ kFrames));
+ } else {
+ video.reset(new libaom_test::YUVVideoSource(
+ test_video_param_.filename, test_video_param_.fmt,
+ test_video_param_.width, test_video_param_.height,
+ test_video_param_.framerate_num, test_video_param_.framerate_den, 0,
+ kFrames));
+ }
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const int min_run = GetMinVisibleRun();
+ const int min_arf_dist_requested = GetMinArfDistanceRequested();
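+  // The ARF distance spans the shortest run of visible frames plus the ARF
+  // itself, hence the +1 below.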
+ if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) {
+ const int min_arf_dist = min_run + 1;
+ EXPECT_GE(min_arf_dist, min_arf_dist_requested);
+ }
+}
+
+#if CONFIG_AV1_ENCODER
+// TODO(angiebird): 25-29 fail in high bitdepth mode.
+// TODO(zoeliu): This ArfFreqTest does not work with BWDREF_FRAME, as
+// BWDREF_FRAME is also a non-show frame, and the minimum run between two
+// consecutive BWDREF_FRAME's may vary between 1 and any arbitrary positive
+// number as long as it does not exceed the gf_group interval.
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_AV1, ArfFreqTestLarge,
+ ::testing::Combine(
+ ::testing::Values(
+ static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+ ::testing::ValuesIn(kTestVectors), ::testing::ValuesIn(kEncodeVectors),
+ ::testing::ValuesIn(kMinArfVectors)));
+#endif // CONFIG_AV1_ENCODER
+} // namespace
diff --git a/third_party/aom/test/av1_config_test.cc b/third_party/aom/test/av1_config_test.cc
new file mode 100644
index 000000000..e2f2c5390
--- /dev/null
+++ b/third_party/aom/test/av1_config_test.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <string.h>
+
+#include "common/av1_config.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+//
+// Input buffers containing exactly one Sequence Header OBU.
+//
+// Each buffer is named according to the OBU storage format (Annex-B vs Low
+// Overhead Bitstream Format) and the type of Sequence Header OBU ("Full"
+// Sequence Header OBUs vs Sequence Header OBUs with the
+// reduced_still_image_flag set).
+//
+const uint8_t kAnnexBFullSequenceHeaderObu[] = {
+ 0x0c, 0x08, 0x00, 0x00, 0x00, 0x04, 0x45, 0x7e, 0x3e, 0xff, 0xfc, 0xc0, 0x20
+};
+const uint8_t kAnnexBReducedStillImageSequenceHeaderObu[] = {
+ 0x08, 0x08, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20
+};
+
+const uint8_t kLobfFullSequenceHeaderObu[] = {
+ 0x0a, 0x0b, 0x00, 0x00, 0x00, 0x04, 0x45, 0x7e, 0x3e, 0xff, 0xfc, 0xc0, 0x20
+};
+
+const uint8_t kLobfReducedStillImageSequenceHeaderObu[] = {
+ 0x0a, 0x07, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20
+};
+
+const uint8_t kAv1cAllZero[] = { 0, 0, 0, 0 };
+
+// The size of AV1 config when no configOBUs are present at the end of the
+// configuration structure.
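+// (Assumed layout of those 4 fixed bytes: marker/version, profile/level,
+// bit-depth and chroma flags, and the initial_presentation_delay fields.)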
+const size_t kAv1cNoConfigObusSize = 4;
+
+bool VerifyAv1c(const uint8_t *const obu_buffer, size_t obu_buffer_length,
+ bool is_annexb) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ bool parse_ok = get_av1config_from_obu(obu_buffer, obu_buffer_length,
+ is_annexb, &av1_config) == 0;
+ if (parse_ok) {
+ EXPECT_EQ(1, av1_config.marker);
+ EXPECT_EQ(1, av1_config.version);
+ EXPECT_EQ(0, av1_config.seq_profile);
+ EXPECT_EQ(0, av1_config.seq_level_idx_0);
+ EXPECT_EQ(0, av1_config.seq_tier_0);
+ EXPECT_EQ(0, av1_config.high_bitdepth);
+ EXPECT_EQ(0, av1_config.twelve_bit);
+ EXPECT_EQ(0, av1_config.monochrome);
+ EXPECT_EQ(1, av1_config.chroma_subsampling_x);
+ EXPECT_EQ(1, av1_config.chroma_subsampling_y);
+ EXPECT_EQ(0, av1_config.chroma_sample_position);
+ EXPECT_EQ(0, av1_config.initial_presentation_delay_present);
+ EXPECT_EQ(0, av1_config.initial_presentation_delay_minus_one);
+ }
+ return parse_ok && ::testing::Test::HasFailure() == false;
+}
+
+TEST(Av1Config, ObuInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ ASSERT_EQ(-1, get_av1config_from_obu(NULL, 0, 0, NULL));
+ ASSERT_EQ(-1,
+ get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0, NULL));
+ ASSERT_EQ(
+ -1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0],
+ sizeof(kLobfFullSequenceHeaderObu), 0, NULL));
+ ASSERT_EQ(-1, get_av1config_from_obu(NULL, sizeof(kLobfFullSequenceHeaderObu),
+ 0, NULL));
+ ASSERT_EQ(-1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0,
+ &av1_config));
+}
+
+TEST(Av1Config, ReadInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ size_t bytes_read = 0;
+ ASSERT_EQ(-1, read_av1config(NULL, 0, NULL, NULL));
+ ASSERT_EQ(-1, read_av1config(NULL, 4, NULL, NULL));
+ ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 0, NULL, NULL));
+ ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 4, &bytes_read, NULL));
+ ASSERT_EQ(-1, read_av1config(NULL, 4, &bytes_read, &av1_config));
+}
+
+TEST(Av1Config, WriteInvalidInputs) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+ size_t bytes_written = 0;
+ uint8_t av1c_buffer[4] = { 0 };
+ ASSERT_EQ(-1, write_av1config(NULL, 0, NULL, NULL));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 0, NULL, NULL));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 0, &bytes_written, NULL));
+
+ ASSERT_EQ(-1,
+ write_av1config(&av1_config, 0, &bytes_written, &av1c_buffer[0]));
+ ASSERT_EQ(-1, write_av1config(&av1_config, 4, &bytes_written, NULL));
+}
+
+TEST(Av1Config, GetAv1ConfigFromLobfObu) {
+ // Test parsing of a Sequence Header OBU with the reduced_still_picture_header
+  // unset, i.e., a full Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kLobfFullSequenceHeaderObu,
+ sizeof(kLobfFullSequenceHeaderObu), false));
+
+ // Test parsing of a reduced still image Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kLobfReducedStillImageSequenceHeaderObu,
+ sizeof(kLobfReducedStillImageSequenceHeaderObu),
+ false));
+}
+
+TEST(Av1Config, GetAv1ConfigFromAnnexBObu) {
+ // Test parsing of a Sequence Header OBU with the reduced_still_picture_header
+  // unset, i.e., a full Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kAnnexBFullSequenceHeaderObu,
+ sizeof(kAnnexBFullSequenceHeaderObu), true));
+
+ // Test parsing of a reduced still image Sequence Header OBU.
+ ASSERT_TRUE(VerifyAv1c(kAnnexBReducedStillImageSequenceHeaderObu,
+ sizeof(kAnnexBReducedStillImageSequenceHeaderObu),
+ true));
+}
+
+TEST(Av1Config, ReadWriteConfig) {
+ Av1Config av1_config;
+ memset(&av1_config, 0, sizeof(av1_config));
+
+ // Test writing out the AV1 config.
+ size_t bytes_written = 0;
+ uint8_t av1c_buffer[4] = { 0 };
+ ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written,
+ &av1c_buffer[0]));
+ ASSERT_EQ(kAv1cNoConfigObusSize, bytes_written);
+ for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) {
+ ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i])
+ << "Mismatch in output Av1Config at offset=" << i;
+ }
+
+ // Test reading the AV1 config.
+ size_t bytes_read = 0;
+ ASSERT_EQ(0, read_av1config(&kAv1cAllZero[0], sizeof(kAv1cAllZero),
+ &bytes_read, &av1_config));
+ ASSERT_EQ(kAv1cNoConfigObusSize, bytes_read);
+ ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written,
+ &av1c_buffer[0]));
+ for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) {
+ ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i])
+ << "Mismatch in output Av1Config at offset=" << i;
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/av1_convolve_2d_test.cc b/third_party/aom/test/av1_convolve_2d_test.cc
new file mode 100644
index 000000000..03286260e
--- /dev/null
+++ b/third_party/aom/test/av1_convolve_2d_test.cc
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/av1_convolve_2d_test_util.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+using libaom_test::AV1Convolve2D::AV1Convolve2DSrTest;
+using libaom_test::AV1Convolve2D::AV1JntConvolve2DTest;
+using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest;
+using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest;
+namespace {
+
+TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+TEST_P(AV1Convolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+
+INSTANTIATE_TEST_CASE_P(
+ C_COPY, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_copy_sr_c, 0, 0));
+INSTANTIATE_TEST_CASE_P(
+ C_X, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_c, 1, 0));
+INSTANTIATE_TEST_CASE_P(
+ C_Y, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_c, 0, 1));
+INSTANTIATE_TEST_CASE_P(
+ C, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_c, 1, 1));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_convolve_2d_copy_sr_sse2, 0, 0));
+INSTANTIATE_TEST_CASE_P(
+ SSE2_X, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_sse2, 1, 0));
+INSTANTIATE_TEST_CASE_P(
+ SSE2_Y, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_sse2, 0, 1));
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_sse2, 1, 1));
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_convolve_2d_copy_sr_avx2, 0, 0));
+INSTANTIATE_TEST_CASE_P(
+ AVX2_X, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_avx2, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2_Y, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_avx2, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1));
+#endif // HAVE_AVX2
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON_X, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_neon, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON_Y, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_neon, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_neon, 1, 1));
+
+INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1Convolve2DSrTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_convolve_2d_copy_sr_neon, 0, 0));
+#endif // HAVE_NEON
+
+TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+INSTANTIATE_TEST_CASE_P(
+ C_COPY, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_copy_c, 0, 0));
+
+INSTANTIATE_TEST_CASE_P(
+ C_X, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_c, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+ C_Y, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_c, 0, 1));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_jnt_convolve_2d_copy_sse2, 0, 0));
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE2_X, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_sse2, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2_Y, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_sse2, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_ssse3, 1, 1));
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_jnt_convolve_2d_copy_avx2, 0, 0));
+INSTANTIATE_TEST_CASE_P(
+ AVX2_X, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_avx2, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2_Y, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_avx2, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_avx2, 1, 1));
+#endif // HAVE_AVX2
+#endif // HAVE_SSE4_1
+#endif // HAVE_SSE2
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_jnt_convolve_2d_copy_neon, 0, 0));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_neon, 1, 1));
+INSTANTIATE_TEST_CASE_P(
+ NEON_X, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_neon, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON_Y, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_neon, 0, 1));
+#endif // HAVE_NEON
+
+TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
+TEST_P(AV1HighbdConvolve2DSrTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(1));
+}
+
+INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_x_sr_c, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_y_sr_c, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_2d_copy_sr_c, 0, 0));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_2d_copy_sr_sse2, 0, 0));
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_2d_sr_ssse3, 1, 1));
+INSTANTIATE_TEST_CASE_P(SSSE3_X, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_x_sr_ssse3, 1, 0));
+INSTANTIATE_TEST_CASE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_y_sr_ssse3, 0, 1));
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_2d_sr_avx2, 1, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_x_sr_avx2, 1, 0));
+INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_y_sr_avx2, 0, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdConvolve2DSrTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_convolve_2d_copy_sr_avx2, 0, 0));
+#endif // HAVE_AVX2
+#endif // HAVE_SSSE3
+#endif // HAVE_SSE2
+TEST_P(AV1HighbdJntConvolve2DTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(1));
+}
+
+TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(1));
+}
+
+INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_x_c, 1, 0));
+
+INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_y_c, 0, 1));
+
+INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_2d_copy_c, 0, 0));
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_2d_copy_sse4_1, 0, 0));
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_2d_sse4_1, 1, 1));
+INSTANTIATE_TEST_CASE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_x_sse4_1, 1, 0));
+INSTANTIATE_TEST_CASE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_y_sse4_1, 0, 1));
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_2d_copy_avx2, 0, 0));
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_2d_avx2, 1, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_x_avx2, 1, 0));
+INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdJntConvolve2DTest,
+ libaom_test::AV1HighbdConvolve2D::BuildParams(
+ av1_highbd_jnt_convolve_y_avx2, 0, 1));
+#endif // HAVE_AVX2
+#endif // HAVE_SSE4_1
+} // namespace
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.cc b/third_party/aom/test/av1_convolve_2d_test_util.cc
new file mode 100644
index 000000000..409fd23e1
--- /dev/null
+++ b/third_party/aom/test/av1_convolve_2d_test_util.cc
@@ -0,0 +1,705 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/av1_convolve_2d_test_util.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/common_data.h"
+#include "av1/common/convolve.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+
+namespace libaom_test {
+
+const int kMaxSize = 128 + 32; // padding
+namespace AV1Convolve2D {
+
+::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
+ convolve_2d_func filter, int has_subx, int has_suby) {
+ return ::testing::Combine(::testing::Values(filter),
+ ::testing::Values(has_subx),
+ ::testing::Values(has_suby),
+ ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+AV1Convolve2DSrTest::~AV1Convolve2DSrTest() {}
+void AV1Convolve2DSrTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1Convolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int has_subx = GET_PARAM(1);
+ const int has_suby = GET_PARAM(2);
+ const int block_idx = GET_PARAM(3);
+ int hfilter, vfilter, subx, suby;
+ uint8_t input[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, output2[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+ for (int i = 0; i < MAX_SB_SQUARE; ++i)
+ output[i] = output2[i] = rnd_.Rand31();
+
+ // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+ const int num_sizes =
+ (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
+ : 1;
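+  // For example, a BLOCK_4X8 parameter is exercised at 4x8 (shift 0) and
+  // again at 2x4 (shift 1, the corresponding 4:2:0 chroma size).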
+ for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma
+ const int out_w = block_size_wide[block_idx] >> shift;
+ const int out_h = block_size_high[block_idx] >> shift;
+ for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
+ for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
+ ++vfilter) {
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+ out_w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+ out_h);
+ for (int do_average = 0; do_average < 1; ++do_average) {
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
+
+ const int subx_range = has_subx ? 16 : 1;
+ const int suby_range = has_suby ? 16 : 1;
+ for (subx = 0; subx < subx_range; ++subx) {
+ for (suby = 0; suby < suby_range; ++suby) {
+ // Choose random locations within the source block
+ const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+ av1_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output,
+ MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params1);
+ test_impl(input + offset_r * w + offset_c, w, output2,
+ MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params2);
+
+ if (memcmp(output, output2, sizeof(output))) {
+ for (int i = 0; i < MAX_SB_SIZE; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << out_w << "x" << out_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << suby << ", " << subx
+ << ")";
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int has_subx = GET_PARAM(1);
+ const int has_suby = GET_PARAM(2);
+ const int block_idx = GET_PARAM(3);
+
+ uint8_t input[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+
+ int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
+ int subx = 0, suby = 0;
+
+ const int do_average = 0;
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8);
+
+ // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+ const int num_sizes =
+ (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
+ : 1;
+ for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma
+ const int out_w = block_size_wide[block_idx] >> shift;
+ const int out_h = block_size_high[block_idx] >> shift;
+ const int num_loops = 1000000000 / (out_w + out_h);
+
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+ out_w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+ out_h);
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params2);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
+ out_h, 1000.0 * elapsed_time / num_loops);
+ }
+}
+
+AV1JntConvolve2DTest::~AV1JntConvolve2DTest() {}
+void AV1JntConvolve2DTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1JntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int has_subx = GET_PARAM(1);
+ const int has_suby = GET_PARAM(2);
+ const int block_idx = GET_PARAM(3);
+ int hfilter, vfilter, subx, suby;
+ uint8_t input[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, output8_1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, output8_2[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ output1[i] = output2[i] = rnd_.Rand16();
+ output8_1[i] = output8_2[i] = rnd_.Rand8();
+ }
+
+ const int out_w = block_size_wide[block_idx];
+ const int out_h = block_size_high[block_idx];
+ for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
+ for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+ out_w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+ out_h);
+ for (int do_average = 0; do_average <= 1; ++do_average) {
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(do_average, 0, output1, MAX_SB_SIZE, 1, 8);
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8);
+
+ // Test special case where jnt_comp_avg is not used
+ conv_params1.use_jnt_comp_avg = 0;
+ conv_params2.use_jnt_comp_avg = 0;
+
+ const int subx_range = has_subx ? 16 : 1;
+ const int suby_range = has_suby ? 16 : 1;
+ for (subx = 0; subx < subx_range; ++subx) {
+ for (suby = 0; suby < suby_range; ++suby) {
+ // Choose random locations within the source block
+ const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+ av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output8_1,
+ MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params1);
+ test_impl(input + offset_r * w + offset_c, w, output8_2,
+ MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params2);
+
+ for (int i = 0; i < out_h; ++i) {
+ for (int j = 0; j < out_w; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output1[idx], output2[idx])
+                  << "Mismatch in unit test for av1_jnt_convolve_2d\n"
+ << out_w << "x" << out_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << suby << ", " << subx << ")";
+ }
+ }
+
+ if (memcmp(output8_1, output8_2, sizeof(output8_1))) {
+ for (int i = 0; i < MAX_SB_SIZE; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output8_1[idx], output8_2[idx])
+ << out_w << "x" << out_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << suby << ", " << subx
+ << ")";
+ }
+ }
+ }
+ }
+ }
+
+ // Test different combination of fwd and bck offset weights
+ for (int k = 0; k < 2; ++k) {
+ for (int l = 0; l < 4; ++l) {
+ conv_params1.use_jnt_comp_avg = 1;
+ conv_params2.use_jnt_comp_avg = 1;
+ conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
+ conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
+ conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
+ conv_params2.bck_offset = quant_dist_lookup_table[k][l][1];
+
+ for (subx = 0; subx < subx_range; ++subx) {
+ for (suby = 0; suby < suby_range; ++suby) {
+ // Choose random locations within the source block
+ const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+ av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
+ output8_1, MAX_SB_SIZE, out_w, out_h,
+ filter_params_x, filter_params_y, subx,
+ suby, &conv_params1);
+ test_impl(input + offset_r * w + offset_c, w, output8_2,
+ MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params2);
+
+ for (int i = 0; i < out_h; ++i) {
+ for (int j = 0; j < out_w; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output1[idx], output2[idx])
+                            << "Mismatch in unit test for "
+                               "av1_jnt_convolve_2d\n"
+ << out_w << "x" << out_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << suby << ", " << subx
+ << ")";
+ }
+ }
+ if (memcmp(output8_1, output8_2, sizeof(output8_1))) {
+ for (int i = 0; i < MAX_SB_SIZE; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output8_1[idx], output8_2[idx])
+ << out_w << "x" << out_h
+ << " Pixel mismatch at index " << idx << " = (" << i
+ << ", " << j << "), sub pixel offset = (" << suby
+ << ", " << subx << ")";
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int has_subx = GET_PARAM(1);
+ const int has_suby = GET_PARAM(2);
+ const int block_idx = GET_PARAM(3);
+
+ int subx = 0, suby = 0;
+ uint8_t input[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, output8[MAX_SB_SQUARE]);
+ int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR;
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ output[i] = rnd_.Rand16();
+ output8[i] = rnd_.Rand8();
+ }
+
+ const int out_w = block_size_wide[block_idx];
+ const int out_h = block_size_high[block_idx];
+ const int num_loops = 1000000000 / (out_w + out_h);
+ const int do_average = 0;
+
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+ out_w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+ out_h);
+
+ ConvolveParams conv_params =
+ get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8);
+
+ conv_params.use_jnt_comp_avg = 0;
+
+ // Choose random locations within the source block
+ const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(input + offset_r * w + offset_c, w, output8, MAX_SB_SIZE, out_w,
+ out_h, filter_params_x, filter_params_y, subx, suby,
+ &conv_params);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
+ out_h, 1000.0 * elapsed_time / num_loops);
+}
+} // namespace AV1Convolve2D
+
+namespace AV1HighbdConvolve2D {
+::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
+ highbd_convolve_2d_func filter, int has_subx, int has_suby) {
+ return ::testing::Combine(
+ ::testing::Range(8, 13, 2), ::testing::Values(filter),
+ ::testing::Values(has_subx), ::testing::Values(has_suby),
+ ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+AV1HighbdConvolve2DSrTest::~AV1HighbdConvolve2DSrTest() {}
+void AV1HighbdConvolve2DSrTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HighbdConvolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1HighbdConvolve2DSrTest::RunSpeedTest(
+ highbd_convolve_2d_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int bd = GET_PARAM(0);
+ const int has_subx = GET_PARAM(2);
+ const int has_suby = GET_PARAM(3);
+ const int block_idx = GET_PARAM(4);
+ int hfilter, vfilter, subx, suby;
+ uint16_t input[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j)
+ input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+
+ hfilter = EIGHTTAP_REGULAR;
+ vfilter = EIGHTTAP_REGULAR;
+ int do_average = 0;
+
+ const int offset_r = 3;
+ const int offset_c = 3;
+ subx = 0;
+ suby = 0;
+
+ ConvolveParams conv_params =
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
+
+ // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+ const int num_sizes =
+ (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
+ : 1;
+
+ for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma
+ const int out_w = block_size_wide[block_idx] >> shift;
+ const int out_h = block_size_high[block_idx] >> shift;
+ const int num_loops = 1000000000 / (out_w + out_h);
+
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+ out_w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+ out_h);
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w,
+ out_h, filter_params_x, filter_params_y, subx, suby,
+ &conv_params, bd);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w,
+ out_h, 1000.0 * elapsed_time / num_loops);
+ }
+}
+
+void AV1HighbdConvolve2DSrTest::RunCheckOutput(
+ highbd_convolve_2d_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int bd = GET_PARAM(0);
+ const int has_subx = GET_PARAM(2);
+ const int has_suby = GET_PARAM(3);
+ const int block_idx = GET_PARAM(4);
+ int hfilter, vfilter, subx, suby;
+ uint16_t input[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, output2[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j)
+ input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ for (int i = 0; i < MAX_SB_SQUARE; ++i)
+ output[i] = output2[i] = rnd_.Rand31();
+
+ // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+ const int num_sizes =
+ (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2
+ : 1;
+ for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma
+ const int out_w = block_size_wide[block_idx] >> shift;
+ const int out_h = block_size_high[block_idx] >> shift;
+ for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
+ for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL;
+ ++vfilter) {
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+ out_w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+ out_h);
+ for (int do_average = 0; do_average < 1; ++do_average) {
+ ConvolveParams conv_params1 =
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
+ ConvolveParams conv_params2 =
+ get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd);
+
+ const int subx_range = has_subx ? 16 : 1;
+ const int suby_range = has_suby ? 16 : 1;
+ for (subx = 0; subx < subx_range; ++subx) {
+ for (suby = 0; suby < suby_range; ++suby) {
+ // Choose random locations within the source block
+ const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+ av1_highbd_convolve_2d_sr_c(input + offset_r * w + offset_c, w,
+ output, MAX_SB_SIZE, out_w, out_h,
+ filter_params_x, filter_params_y,
+ subx, suby, &conv_params1, bd);
+ test_impl(input + offset_r * w + offset_c, w, output2,
+ MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params2, bd);
+
+ if (memcmp(output, output2, sizeof(output))) {
+ for (int i = 0; i < MAX_SB_SIZE; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << out_w << "x" << out_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << suby << ", " << subx
+ << ")";
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+AV1HighbdJntConvolve2DTest::~AV1HighbdJntConvolve2DTest() {}
+void AV1HighbdJntConvolve2DTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HighbdJntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1HighbdJntConvolve2DTest::RunSpeedTest(
+ highbd_convolve_2d_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int bd = GET_PARAM(0);
+ const int block_idx = GET_PARAM(4);
+ int hfilter, vfilter, subx, suby;
+ uint16_t input[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, output16[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j)
+ input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand16();
+ hfilter = EIGHTTAP_REGULAR;
+ vfilter = EIGHTTAP_REGULAR;
+ int do_average = 0;
+ const int out_w = block_size_wide[block_idx];
+ const int out_h = block_size_high[block_idx];
+
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+ out_w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+ out_h);
+
+ ConvolveParams conv_params =
+ get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd);
+
+ // Test special case where jnt_comp_avg is not used
+ conv_params.use_jnt_comp_avg = 0;
+
+ subx = 0;
+ suby = 0;
+  // Use a fixed offset into the source block for the speed test.
+ const int offset_r = 3;
+ const int offset_c = 3;
+
+ const int num_loops = 1000000000 / (out_w + out_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(input + offset_r * w + offset_c, w, output16, MAX_SB_SIZE, out_w,
+ out_h, filter_params_x, filter_params_y, subx, suby, &conv_params,
+ bd);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("convolve %3dx%-3d: %7.2f us\n", out_w, out_h,
+ 1000.0 * elapsed_time / num_loops);
+}
+
+void AV1HighbdJntConvolve2DTest::RunCheckOutput(
+ highbd_convolve_2d_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int bd = GET_PARAM(0);
+ const int has_subx = GET_PARAM(2);
+ const int has_suby = GET_PARAM(3);
+ const int block_idx = GET_PARAM(4);
+ int hfilter, vfilter, subx, suby;
+ uint16_t input[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, output16_1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, output16_2[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j)
+ input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ output1[i] = output2[i] = rnd_.Rand16();
+ output16_1[i] = output16_2[i] = rnd_.Rand16();
+ }
+
+ const int out_w = block_size_wide[block_idx];
+ const int out_h = block_size_high[block_idx];
+ for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) {
+ for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) {
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter,
+ out_w);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter,
+ out_h);
+ for (int do_average = 0; do_average <= 1; ++do_average) {
+ ConvolveParams conv_params1 = get_conv_params_no_round(
+ do_average, 0, output1, MAX_SB_SIZE, 1, bd);
+ ConvolveParams conv_params2 = get_conv_params_no_round(
+ do_average, 0, output2, MAX_SB_SIZE, 1, bd);
+
+ // Test special case where jnt_comp_avg is not used
+ conv_params1.use_jnt_comp_avg = 0;
+ conv_params2.use_jnt_comp_avg = 0;
+
+ const int subx_range = has_subx ? 16 : 1;
+ const int suby_range = has_suby ? 16 : 1;
+ for (subx = 0; subx < subx_range; ++subx) {
+ for (suby = 0; suby < suby_range; ++suby) {
+ // Choose random locations within the source block
+ const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+ av1_highbd_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
+ output16_1, MAX_SB_SIZE, out_w, out_h,
+ filter_params_x, filter_params_y, subx,
+ suby, &conv_params1, bd);
+ test_impl(input + offset_r * w + offset_c, w, output16_2,
+ MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params2, bd);
+
+ for (int i = 0; i < out_h; ++i) {
+ for (int j = 0; j < out_w; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output1[idx], output2[idx])
+ << out_w << "x" << out_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << suby << ", " << subx << ")";
+ }
+ }
+
+ if (memcmp(output16_1, output16_2, sizeof(output16_1))) {
+ for (int i = 0; i < MAX_SB_SIZE; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output16_1[idx], output16_2[idx])
+ << out_w << "x" << out_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << suby << ", " << subx
+ << ")";
+ }
+ }
+ }
+ }
+ }
+
+ // Test different combination of fwd and bck offset weights
+ for (int k = 0; k < 2; ++k) {
+ for (int l = 0; l < 4; ++l) {
+ conv_params1.use_jnt_comp_avg = 1;
+ conv_params2.use_jnt_comp_avg = 1;
+ conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
+ conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
+ conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
+ conv_params2.bck_offset = quant_dist_lookup_table[k][l][1];
+
+ const int subx_range = has_subx ? 16 : 1;
+ const int suby_range = has_suby ? 16 : 1;
+ for (subx = 0; subx < subx_range; ++subx) {
+ for (suby = 0; suby < suby_range; ++suby) {
+ // Choose random locations within the source block
+ const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+ av1_highbd_jnt_convolve_2d_c(
+ input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
+ out_w, out_h, filter_params_x, filter_params_y, subx, suby,
+ &conv_params1, bd);
+ test_impl(input + offset_r * w + offset_c, w, output16_2,
+ MAX_SB_SIZE, out_w, out_h, filter_params_x,
+ filter_params_y, subx, suby, &conv_params2, bd);
+
+ for (int i = 0; i < out_h; ++i) {
+ for (int j = 0; j < out_w; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output1[idx], output2[idx])
+ << out_w << "x" << out_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << suby << ", " << subx
+ << ")";
+ }
+ }
+
+ if (memcmp(output16_1, output16_2, sizeof(output16_1))) {
+ for (int i = 0; i < MAX_SB_SIZE; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE; ++j) {
+ int idx = i * MAX_SB_SIZE + j;
+ ASSERT_EQ(output16_1[idx], output16_2[idx])
+ << out_w << "x" << out_h
+ << " Pixel mismatch at index " << idx << " = (" << i
+ << ", " << j << "), sub pixel offset = (" << suby
+ << ", " << subx << ")";
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+} // namespace AV1HighbdConvolve2D
+} // namespace libaom_test
diff --git a/third_party/aom/test/av1_convolve_2d_test_util.h b/third_party/aom/test/av1_convolve_2d_test_util.h
new file mode 100644
index 000000000..e0eb58410
--- /dev/null
+++ b/third_party/aom/test/av1_convolve_2d_test_util.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
+#define AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+
+namespace libaom_test {
+
+namespace AV1Convolve2D {
+
+typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params);
+
+typedef ::testing::tuple<convolve_2d_func, int, int, BLOCK_SIZE>
+ Convolve2DParam;
+
+::testing::internal::ParamGenerator<Convolve2DParam> BuildParams(
+ convolve_2d_func filter, int subx_exist, int suby_exist);
+
+class AV1Convolve2DSrTest : public ::testing::TestWithParam<Convolve2DParam> {
+ public:
+ virtual ~AV1Convolve2DSrTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput(convolve_2d_func test_impl);
+ void RunSpeedTest(convolve_2d_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> {
+ public:
+ virtual ~AV1JntConvolve2DTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput(convolve_2d_func test_impl);
+ void RunSpeedTest(convolve_2d_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+} // namespace AV1Convolve2D
+
+namespace AV1HighbdConvolve2D {
+typedef void (*highbd_convolve_2d_func)(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+typedef ::testing::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE>
+ HighbdConvolve2DParam;
+
+::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams(
+ highbd_convolve_2d_func filter, int subx_exist, int suby_exist);
+
+class AV1HighbdConvolve2DSrTest
+ : public ::testing::TestWithParam<HighbdConvolve2DParam> {
+ public:
+ virtual ~AV1HighbdConvolve2DSrTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput(highbd_convolve_2d_func test_impl);
+ void RunSpeedTest(highbd_convolve_2d_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+class AV1HighbdJntConvolve2DTest
+ : public ::testing::TestWithParam<HighbdConvolve2DParam> {
+ public:
+ virtual ~AV1HighbdJntConvolve2DTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput(highbd_convolve_2d_func test_impl);
+ void RunSpeedTest(highbd_convolve_2d_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+} // namespace AV1HighbdConvolve2D
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_
diff --git a/third_party/aom/test/av1_convolve_scale_test.cc b/third_party/aom/test/av1_convolve_scale_test.cc
new file mode 100644
index 000000000..3b1698eeb
--- /dev/null
+++ b/third_party/aom/test/av1_convolve_scale_test.cc
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "av1/common/common_data.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+const int kXStepQn = 16;
+const int kYStepQn = 20;
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+
+enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP };
+int NTapsToInt(NTaps ntaps) { return 8 + static_cast<int>(ntaps) * 2; }
+
+// A 16-bit filter with a configurable number of taps.
+class TestFilter {
+ public:
+ void set(NTaps ntaps, bool backwards);
+
+ InterpFilterParams params_;
+
+ private:
+ std::vector<int16_t> coeffs_;
+};
+
+void TestFilter::set(NTaps ntaps, bool backwards) {
+ const int n = NTapsToInt(ntaps);
+ assert(n >= 8 && n <= 12);
+
+ // The filter has n * SUBPEL_SHIFTS proper elements and an extra 8 bogus
+ // elements at the end so that convolutions can read off the end safely.
+ coeffs_.resize(n * SUBPEL_SHIFTS + 8);
+
+ // The coefficients are pretty much arbitrary, but convolutions shouldn't
+ // over or underflow. For the first filter (subpels = 0), we use an
+ // increasing or decreasing ramp (depending on the backwards parameter). We
+ // don't want any zero coefficients, so we make it have an x-intercept at -1
+ // or n. To ensure absence of under/overflow, we normalise the area under the
+ // ramp to be I = 1 << FILTER_BITS (so that convolving a constant function
+ // gives the identity).
+ //
+ // When increasing, the function has the form:
+ //
+ // f(x) = A * (x + 1)
+ //
+ // Summing and rearranging for A gives A = 2 * I / (n * (n + 1)). If the
+ // filter is reversed, we have the same A but with formula
+ //
+ // g(x) = A * (n - x)
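+  //
+  // (Worked step: the sum over x = 0..n-1 of A * (x + 1) is
+  // A * n * (n + 1) / 2; setting that equal to I gives
+  // A = 2 * I / (n * (n + 1)), as used below.)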
+ const int I = 1 << FILTER_BITS;
+ const float A = 2.f * I / (n * (n + 1.f));
+ for (int i = 0; i < n; ++i) {
+ coeffs_[i] = static_cast<int16_t>(A * (backwards ? (n - i) : (i + 1)));
+ }
+
+ // For the other filters, make them slightly different by swapping two
+  // columns. Filter k will have the columns (k % n) and ((7 * k) % n) swapped.
+  const size_t filter_size = sizeof(coeffs_[0]) * n;
+ int16_t *const filter0 = &coeffs_[0];
+ for (int k = 1; k < SUBPEL_SHIFTS; ++k) {
+ int16_t *filterk = &coeffs_[k * n];
+ memcpy(filterk, filter0, filter_size);
+
+ const int idx0 = k % n;
+ const int idx1 = (7 * k) % n;
+
+ const int16_t tmp = filterk[idx0];
+ filterk[idx0] = filterk[idx1];
+ filterk[idx1] = tmp;
+ }
+
+ // Finally, write some rubbish at the end to make sure we don't use it.
+ for (int i = 0; i < 8; ++i) coeffs_[n * SUBPEL_SHIFTS + i] = 123 + i;
+
+ // Fill in params
+ params_.filter_ptr = &coeffs_[0];
+ params_.taps = n;
+ // These are ignored by the functions being tested. Set them to whatever.
+ params_.subpel_shifts = SUBPEL_SHIFTS;
+ params_.interp_filter = EIGHTTAP_REGULAR;
+}
+
+template <typename SrcPixel>
+class TestImage {
+ public:
+ TestImage(int w, int h, int bd) : w_(w), h_(h), bd_(bd) {
+ assert(bd < 16);
+ assert(bd <= 8 * static_cast<int>(sizeof(SrcPixel)));
+
+ // Pad width by 2*kHPad and then round up to the next multiple of 16
+ // to get src_stride_. Add another 16 for dst_stride_ (to make sure
+ // something goes wrong if we use the wrong one)
+ src_stride_ = (w_ + 2 * kHPad + 15) & ~15;
+ dst_stride_ = src_stride_ + 16;
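+    // For example (illustrative values only): w_ = 2 with kHPad = 32 gives
+    // (2 + 64 + 15) & ~15 = 80 for src_stride_, and 96 for dst_stride_.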
+
+ // Allocate image data
+ src_data_.resize(2 * src_block_size());
+ dst_data_.resize(2 * dst_block_size());
+ dst_16_data_.resize(2 * dst_block_size());
+ }
+
+ void Initialize(ACMRandom *rnd);
+ void Check() const;
+
+ int src_stride() const { return src_stride_; }
+ int dst_stride() const { return dst_stride_; }
+
+ int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+ int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+ const SrcPixel *GetSrcData(bool ref, bool borders) const {
+ const SrcPixel *block = &src_data_[ref ? 0 : src_block_size()];
+ return borders ? block : block + kHPad + src_stride_ * kVPad;
+ }
+
+ SrcPixel *GetDstData(bool ref, bool borders) {
+ SrcPixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+ return borders ? block : block + kHPad + dst_stride_ * kVPad;
+ }
+
+ CONV_BUF_TYPE *GetDst16Data(bool ref, bool borders) {
+ CONV_BUF_TYPE *block = &dst_16_data_[ref ? 0 : dst_block_size()];
+ return borders ? block : block + kHPad + dst_stride_ * kVPad;
+ }
+
+ private:
+ int w_, h_, bd_;
+ int src_stride_, dst_stride_;
+
+ std::vector<SrcPixel> src_data_;
+ std::vector<SrcPixel> dst_data_;
+ std::vector<CONV_BUF_TYPE> dst_16_data_;
+};
+
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+ if (!trash) {
+ memset(data, 0, sizeof(*data) * num_pixels);
+ return;
+ }
+ const Pixel mask = (1 << bd) - 1;
+ for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+ bool trash_edges, Pixel *data) {
+ assert(rnd);
+ const Pixel mask = (1 << bd) - 1;
+
+ // Fill in the first buffer with random data
+ // Top border
+ FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+ for (int r = 0; r < h; ++r) {
+ Pixel *row_data = data + (kVPad + r) * stride;
+ // Left border, contents, right border
+ FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+ for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+ FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+ }
+ // Bottom border
+ FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+ const int bpp = sizeof(*data);
+ const int block_elts = stride * (h + 2 * kVPad);
+ const int block_size = bpp * block_elts;
+
+ // Now copy that to the second buffer
+ memcpy(data + block_elts, data, block_size);
+}
+
+template <typename SrcPixel>
+void TestImage<SrcPixel>::Initialize(ACMRandom *rnd) {
+ PrepBuffers(rnd, w_, h_, src_stride_, bd_, false, &src_data_[0]);
+ PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+ PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_16_data_[0]);
+}
+
+template <typename SrcPixel>
+void TestImage<SrcPixel>::Check() const {
+ // If memcmp returns 0, there's nothing to do.
+ const int num_pixels = dst_block_size();
+ const SrcPixel *ref_dst = &dst_data_[0];
+ const SrcPixel *tst_dst = &dst_data_[num_pixels];
+
+ const CONV_BUF_TYPE *ref_16_dst = &dst_16_data_[0];
+ const CONV_BUF_TYPE *tst_16_dst = &dst_16_data_[num_pixels];
+
+ if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) {
+ if (0 == memcmp(ref_16_dst, tst_16_dst, sizeof(*ref_16_dst) * num_pixels))
+ return;
+ }
+ // Otherwise, iterate through the buffer looking for differences (including
+ // the edges)
+ const int stride = dst_stride_;
+ for (int r = 0; r < h_ + 2 * kVPad; ++r) {
+ for (int c = 0; c < w_ + 2 * kHPad; ++c) {
+ const int32_t ref_value = ref_dst[r * stride + c];
+ const int32_t tst_value = tst_dst[r * stride + c];
+
+ EXPECT_EQ(tst_value, ref_value)
+ << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad);
+ }
+ }
+
+ for (int r = 0; r < h_ + 2 * kVPad; ++r) {
+ for (int c = 0; c < w_ + 2 * kHPad; ++c) {
+ const int32_t ref_value = ref_16_dst[r * stride + c];
+ const int32_t tst_value = tst_16_dst[r * stride + c];
+
+ EXPECT_EQ(tst_value, ref_value)
+          << "Error in 16 bit buffer at row: " << (r - kVPad)
+          << ", col: " << (c - kHPad);
+ }
+ }
+}
+
+typedef tuple<int, int> BlockDimension;
+
+struct BaseParams {
+ BaseParams(BlockDimension dims, NTaps ntaps_x, NTaps ntaps_y, bool avg)
+ : dims(dims), ntaps_x(ntaps_x), ntaps_y(ntaps_y), avg(avg) {}
+
+ BlockDimension dims;
+ NTaps ntaps_x, ntaps_y;
+ bool avg;
+};
+
+template <typename SrcPixel>
+class ConvolveScaleTestBase : public ::testing::Test {
+ public:
+ ConvolveScaleTestBase() : image_(NULL) {}
+ virtual ~ConvolveScaleTestBase() { delete image_; }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  // Implemented by subclasses: SetUp depends on the parameters passed in,
+  // and RunOne depends on the function being tested. These can't be
+  // templated for low/high bit depths because those functions take
+  // different numbers of parameters.
+ virtual void SetUp() = 0;
+ virtual void RunOne(bool ref) = 0;
+
+ protected:
+ void SetParams(const BaseParams &params, int bd) {
+ width_ = ::testing::get<0>(params.dims);
+ height_ = ::testing::get<1>(params.dims);
+ ntaps_x_ = params.ntaps_x;
+ ntaps_y_ = params.ntaps_y;
+ bd_ = bd;
+ avg_ = params.avg;
+
+ filter_x_.set(ntaps_x_, false);
+ filter_y_.set(ntaps_y_, true);
+ convolve_params_ =
+ get_conv_params_no_round(avg_ != false, 0, NULL, 0, 1, bd);
+
+ delete image_;
+ image_ = new TestImage<SrcPixel>(width_, height_, bd_);
+ }
+
+ void SetConvParamOffset(int i, int j, int is_compound, int do_average,
+ int use_jnt_comp_avg) {
+ if (i == -1 && j == -1) {
+ convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg;
+ convolve_params_.is_compound = is_compound;
+ convolve_params_.do_average = do_average;
+ } else {
+ convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg;
+ convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0];
+ convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1];
+ convolve_params_.is_compound = is_compound;
+ convolve_params_.do_average = do_average;
+ }
+ }
+
+ void Run() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int i = 0; i < kTestIters; ++i) {
+ int is_compound = 0;
+ SetConvParamOffset(-1, -1, is_compound, 0, 0);
+ Prep(&rnd);
+ RunOne(true);
+ RunOne(false);
+ image_->Check();
+
+ is_compound = 1;
+ for (int do_average = 0; do_average < 2; do_average++) {
+ for (int use_jnt_comp_avg = 0; use_jnt_comp_avg < 2;
+ use_jnt_comp_avg++) {
+ for (int j = 0; j < 2; ++j) {
+ for (int k = 0; k < 4; ++k) {
+ SetConvParamOffset(j, k, is_compound, do_average,
+ use_jnt_comp_avg);
+ Prep(&rnd);
+ RunOne(true);
+ RunOne(false);
+ image_->Check();
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void SpeedTest() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ Prep(&rnd);
+
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
+ for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+        << "Error: ConvolveScaleSpeedTest, SIMD slower than C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
+ }
+
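+  // A rough sketch of the distribution produced below: with the & 7 masking,
+  // about 1/8 of draws return 0, about 1/8 return SCALE_SUBPEL_SHIFTS - 1
+  // (the "interesting" extremes noted in Prep()), and the rest are uniform
+  // over the remaining offsets.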
+ static int RandomSubpel(ACMRandom *rnd) {
+ const uint8_t subpel_mode = rnd->Rand8();
+ if ((subpel_mode & 7) == 0) {
+ return 0;
+ } else if ((subpel_mode & 7) == 1) {
+ return SCALE_SUBPEL_SHIFTS - 1;
+ } else {
+ return 1 + rnd->PseudoUniform(SCALE_SUBPEL_SHIFTS - 2);
+ }
+ }
+
+ void Prep(ACMRandom *rnd) {
+ assert(rnd);
+
+ // Choose subpel_x_ and subpel_y_. They should be less than
+ // SCALE_SUBPEL_SHIFTS; we also want to add extra weight to "interesting"
+ // values: 0 and SCALE_SUBPEL_SHIFTS - 1
+ subpel_x_ = RandomSubpel(rnd);
+ subpel_y_ = RandomSubpel(rnd);
+
+ image_->Initialize(rnd);
+ }
+
+ int width_, height_, bd_;
+ NTaps ntaps_x_, ntaps_y_;
+ bool avg_;
+ int subpel_x_, subpel_y_;
+ TestFilter filter_x_, filter_y_;
+ TestImage<SrcPixel> *image_;
+ ConvolveParams convolve_params_;
+};
+
+typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params);
+
+// Test parameter list:
+// <tst_fun, dims, ntaps_x, ntaps_y, avg>
+typedef tuple<LowbdConvolveFunc, BlockDimension, NTaps, NTaps, bool>
+ LowBDParams;
+
+class LowBDConvolveScaleTest
+ : public ConvolveScaleTestBase<uint8_t>,
+ public ::testing::WithParamInterface<LowBDParams> {
+ public:
+ virtual ~LowBDConvolveScaleTest() {}
+
+ void SetUp() {
+ tst_fun_ = GET_PARAM(0);
+
+ const BlockDimension &block = GET_PARAM(1);
+ const NTaps ntaps_x = GET_PARAM(2);
+ const NTaps ntaps_y = GET_PARAM(3);
+ const int bd = 8;
+ const bool avg = GET_PARAM(4);
+
+ SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
+ }
+
+ void RunOne(bool ref) {
+ const uint8_t *src = image_->GetSrcData(ref, false);
+ uint8_t *dst = image_->GetDstData(ref, false);
+ convolve_params_.dst = image_->GetDst16Data(ref, false);
+ const int src_stride = image_->src_stride();
+ const int dst_stride = image_->dst_stride();
+ if (ref) {
+ av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, height_,
+ &filter_x_.params_, &filter_y_.params_, subpel_x_,
+ kXStepQn, subpel_y_, kYStepQn, &convolve_params_);
+ } else {
+ tst_fun_(src, src_stride, dst, dst_stride, width_, height_,
+ &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+ subpel_y_, kYStepQn, &convolve_params_);
+ }
+ }
+
+ private:
+ LowbdConvolveFunc tst_fun_;
+};
+
+const BlockDimension kBlockDim[] = {
+ make_tuple(2, 2), make_tuple(2, 4), make_tuple(4, 4),
+ make_tuple(4, 8), make_tuple(8, 4), make_tuple(8, 8),
+ make_tuple(8, 16), make_tuple(16, 8), make_tuple(16, 16),
+ make_tuple(16, 32), make_tuple(32, 16), make_tuple(32, 32),
+ make_tuple(32, 64), make_tuple(64, 32), make_tuple(64, 64),
+ make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128),
+};
+
+const NTaps kNTaps[] = { EIGHT_TAP };
+
+TEST_P(LowBDConvolveScaleTest, Check) { Run(); }
+TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, LowBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool()));
+
+typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd);
+
+// Test parameter list:
+// <tst_fun, dims, ntaps_x, ntaps_y, avg, bd>
+typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int>
+ HighBDParams;
+
+class HighBDConvolveScaleTest
+ : public ConvolveScaleTestBase<uint16_t>,
+ public ::testing::WithParamInterface<HighBDParams> {
+ public:
+ virtual ~HighBDConvolveScaleTest() {}
+
+ void SetUp() {
+ tst_fun_ = GET_PARAM(0);
+
+ const BlockDimension &block = GET_PARAM(1);
+ const NTaps ntaps_x = GET_PARAM(2);
+ const NTaps ntaps_y = GET_PARAM(3);
+ const bool avg = GET_PARAM(4);
+ const int bd = GET_PARAM(5);
+
+ SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd);
+ }
+
+ void RunOne(bool ref) {
+ const uint16_t *src = image_->GetSrcData(ref, false);
+ uint16_t *dst = image_->GetDstData(ref, false);
+ convolve_params_.dst = image_->GetDst16Data(ref, false);
+ const int src_stride = image_->src_stride();
+ const int dst_stride = image_->dst_stride();
+
+ if (ref) {
+ av1_highbd_convolve_2d_scale_c(
+ src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_,
+ &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn,
+ &convolve_params_, bd_);
+ } else {
+ tst_fun_(src, src_stride, dst, dst_stride, width_, height_,
+ &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn,
+ subpel_y_, kYStepQn, &convolve_params_, bd_);
+ }
+ }
+
+ private:
+ HighbdConvolveFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveScaleTest, Check) { Run(); }
+TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, HighBDConvolveScaleTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1),
+ ::testing::ValuesIn(kBlockDim),
+ ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps),
+ ::testing::Bool(), ::testing::ValuesIn(kBDs)));
+} // namespace
diff --git a/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc b/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc
new file mode 100644
index 000000000..e8470e5d5
--- /dev/null
+++ b/third_party/aom/test/av1_encoder_parms_get_to_decoder.cc
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+#include "aom/aom_decoder.h"
+#include "av1/decoder/decoder.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+
+struct ParamPassingTestVideo {
+ const char *name;
+ uint32_t width;
+ uint32_t height;
+ uint32_t bitrate;
+ int frames;
+};
+
+const ParamPassingTestVideo kAV1ParamPassingTestVector = {
+ "niklas_1280_720_30.y4m", 1280, 720, 600, 3
+};
+
+struct EncodeParameters {
+ int32_t lossless;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ aom_color_range_t color_range;
+ aom_chroma_sample_position_t chroma_sample_position;
+ int32_t render_size[2];
+};
+
+const EncodeParameters kAV1EncodeParameterSet[] = {
+ { 1,
+ AOM_CICP_CP_BT_709,
+ AOM_CICP_TC_BT_709,
+ AOM_CICP_MC_BT_709,
+ AOM_CR_STUDIO_RANGE,
+ AOM_CSP_UNKNOWN,
+ { 0, 0 } },
+ { 0,
+ AOM_CICP_CP_BT_470_M,
+ AOM_CICP_TC_BT_470_M,
+ AOM_CICP_MC_BT_470_B_G,
+ AOM_CR_FULL_RANGE,
+ AOM_CSP_VERTICAL,
+ { 0, 0 } },
+ { 1,
+ AOM_CICP_CP_BT_601,
+ AOM_CICP_TC_BT_601,
+ AOM_CICP_MC_BT_601,
+ AOM_CR_STUDIO_RANGE,
+ AOM_CSP_COLOCATED,
+ { 0, 0 } },
+ { 0,
+ AOM_CICP_CP_BT_2020,
+ AOM_CICP_TC_BT_2020_10_BIT,
+ AOM_CICP_MC_BT_2020_NCL,
+ AOM_CR_FULL_RANGE,
+ AOM_CSP_RESERVED,
+ { 640, 480 } },
+};
+
+class AVxEncoderParmsGetToDecoder
+ : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWithParam<EncodeParameters> {
+ protected:
+ AVxEncoderParmsGetToDecoder()
+ : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {}
+
+ virtual ~AVxEncoderParmsGetToDecoder() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libaom_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 25;
+ test_video_ = kAV1ParamPassingTestVector;
+ cfg_.rc_target_bitrate = test_video_.bitrate;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AV1E_SET_COLOR_PRIMARIES, encode_parms.color_primaries);
+ encoder->Control(AV1E_SET_TRANSFER_CHARACTERISTICS,
+ encode_parms.transfer_characteristics);
+ encoder->Control(AV1E_SET_MATRIX_COEFFICIENTS,
+ encode_parms.matrix_coefficients);
+ encoder->Control(AV1E_SET_COLOR_RANGE, encode_parms.color_range);
+ encoder->Control(AV1E_SET_CHROMA_SAMPLE_POSITION,
+ encode_parms.chroma_sample_position);
+ encoder->Control(AV1E_SET_LOSSLESS, encode_parms.lossless);
+ if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
+ encoder->Control(AV1E_SET_RENDER_SIZE, encode_parms.render_size);
+ }
+ }
+ }
+
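+ // Verify that the parameters set on the encoder round-tripped through the
+ // bitstream and are reported on the decoded image.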
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) {
+ (void)pts;
+ if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) {
+ EXPECT_EQ(encode_parms.render_size[0], (int)img.r_w);
+ EXPECT_EQ(encode_parms.render_size[1], (int)img.r_h);
+ }
+ EXPECT_EQ(encode_parms.color_primaries, img.cp);
+ EXPECT_EQ(encode_parms.transfer_characteristics, img.tc);
+ EXPECT_EQ(encode_parms.matrix_coefficients, img.mc);
+ EXPECT_EQ(encode_parms.color_range, img.range);
+ EXPECT_EQ(encode_parms.chroma_sample_position, img.csp);
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (encode_parms.lossless) {
+ EXPECT_EQ(kMaxPsnr, pkt->data.psnr.psnr[0]);
+ }
+ }
+
+ virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ ParamPassingTestVideo test_video_;
+
+ private:
+ EncodeParameters encode_parms;
+};
+
+TEST_P(AVxEncoderParmsGetToDecoder, BitstreamParms) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ testing::internal::scoped_ptr<libaom_test::VideoSource> video(
+ new libaom_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
+ ASSERT_TRUE(video.get() != NULL);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderParmsGetToDecoder,
+ ::testing::ValuesIn(kAV1EncodeParameterSet));
+} // namespace
diff --git a/third_party/aom/test/av1_ext_tile_test.cc b/third_party/aom/test/av1_ext_tile_test.cc
new file mode 100644
index 000000000..424d2f065
--- /dev/null
+++ b/third_party/aom/test/av1_ext_tile_test.cc
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+
+namespace {
+// The number of frames to be encoded/decoded
+const int kLimit = 8;
+// Skip 1 frame to check that frames can be decoded independently.
+const int kSkip = 5;
+const int kTileSize = 1;
+const int kTIleSizeInPixels = (kTileSize << 6);
+// Fake width and height, chosen so that they are multiples of the tile size.
+const int kImgWidth = 704;
+const int kImgHeight = 576;
+
+// This test covers the large-scale tile coding case. Non-large-scale tile
+// coding is tested by the tile_independence test.
+class AV1ExtTileTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1ExtTileTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ set_cpu_used_(GET_PARAM(2)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = kImgWidth;
+ cfg.h = kImgHeight;
+ cfg.allow_lowbitdepth = 1;
+
+ decoder_ = codec_->CreateDecoder(cfg, 0);
+ decoder_->Control(AV1_SET_TILE_MODE, 1);
+ decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+
+ // Allocate buffer to store tile image.
+ aom_img_alloc(&tile_img_, AOM_IMG_FMT_I420, kImgWidth, kImgHeight, 32);
+
+ md5_.clear();
+ tile_md5_.clear();
+ }
+
+ virtual ~AV1ExtTileTest() {
+ aom_img_free(&tile_img_);
+ delete decoder_;
+ }
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_error_resilient = 1;
+
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_min_quantizer = 0;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ // Encoder settings.
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+
+ // TODO(yunqingwang): test single_tile_decoding = 0.
+ encoder->Control(AV1E_SET_SINGLE_TILE_DECODING, 1);
+ // Always use 64x64 max partition.
+ encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64);
+ // Set tile_columns and tile_rows to their maximum values, which guarantees
+ // a tile size of 64 x 64 pixels (i.e. 1 SB) for resolutions up to 4k.
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+ encoder->Control(AV1E_SET_TILE_ROWS, 6);
+ }
+
+ if (video->frame() == 1) {
+ frame_flags_ =
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+ }
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) {
+ // Skip 1 already decoded frame to be consistent with the decoder in this
+ // test.
+ if (pts == (aom_codec_pts_t)kSkip) return;
+
+ // Calculate MD5 as the reference.
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(&img);
+ md5_.push_back(md5_res.Get());
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ // Skip decoding 1 frame.
+ if (pkt->data.frame.pts == (aom_codec_pts_t)kSkip) return;
+
+ bool IsLastFrame = (pkt->data.frame.pts == (aom_codec_pts_t)(kLimit - 1));
+
+ // Decode the first (kLimit - 1) frames as whole frames, and decode the last
+ // frame tile by tile.
+ for (int r = 0; r < kImgHeight / kTIleSizeInPixels; ++r) {
+ for (int c = 0; c < kImgWidth / kTIleSizeInPixels; ++c) {
+ if (!IsLastFrame) {
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ } else {
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, r);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, c);
+ }
+
+ const aom_codec_err_t res = decoder_->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ const aom_image_t *img = decoder_->GetDxData().Next();
+
+ if (!IsLastFrame) {
+ if (img) {
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(img);
+ tile_md5_.push_back(md5_res.Get());
+ }
+ break;
+ }
+
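+ // Copy the decoded tile into its position in the full-frame buffer so the
+ // reassembled image can be checksummed below.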
+ const int kMaxMBPlane = 3;
+ for (int plane = 0; plane < kMaxMBPlane; ++plane) {
+ const int shift = (plane == 0) ? 0 : 1;
+ int tile_height = kTIleSizeInPixels >> shift;
+ int tile_width = kTIleSizeInPixels >> shift;
+
+ for (int tr = 0; tr < tile_height; ++tr) {
+ memcpy(tile_img_.planes[plane] +
+ tile_img_.stride[plane] * (r * tile_height + tr) +
+ c * tile_width,
+ img->planes[plane] + img->stride[plane] * tr, tile_width);
+ }
+ }
+ }
+
+ if (!IsLastFrame) break;
+ }
+
+ if (IsLastFrame) {
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(&tile_img_);
+ tile_md5_.push_back(md5_res.Get());
+ }
+ }
+
+ void TestRoundTrip() {
+ ::libaom_test::I420VideoSource video(
+ "hantro_collage_w352h288.yuv", kImgWidth, kImgHeight, 30, 1, 0, kLimit);
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_error_resilient = AOM_ERROR_RESILIENT_DEFAULT;
+ cfg_.large_scale_tile = 1;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_threads = 1;
+
+ // Tile encoding
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Check that the whole-frame MD5s match those of the frames reassembled
+ // from individually decoded tiles.
+ ASSERT_EQ(md5_, tile_md5_);
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ int set_cpu_used_;
+ ::libaom_test::Decoder *decoder_;
+ aom_image_t tile_img_;
+ std::vector<std::string> md5_;
+ std::vector<std::string> tile_md5_;
+};
+
+TEST_P(AV1ExtTileTest, DecoderResultTest) { TestRoundTrip(); }
+
+AV1_INSTANTIATE_TEST_CASE(
+ // Now only test 2-pass mode.
+ AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(1, 4));
+
+class AV1ExtTileTestLarge : public AV1ExtTileTest {};
+
+TEST_P(AV1ExtTileTestLarge, DecoderResultTest) { TestRoundTrip(); }
+
+AV1_INSTANTIATE_TEST_CASE(
+ // Now only test 2-pass mode.
+ AV1ExtTileTestLarge, ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(0, 1));
+} // namespace
diff --git a/third_party/aom/test/av1_fwd_txfm1d_test.cc b/third_party/aom/test/av1_fwd_txfm1d_test.cc
new file mode 100644
index 000000000..49a666879
--- /dev/null
+++ b/third_party/aom/test/av1_fwd_txfm1d_test.cc
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "test/av1_txfm_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::TYPE_ADST;
+using libaom_test::TYPE_DCT;
+using libaom_test::TYPE_IDTX;
+using libaom_test::TYPE_TXFM;
+using libaom_test::input_base;
+using libaom_test::reference_hybrid_1d;
+
+namespace {
+const int txfm_type_num = 3;
+const TYPE_TXFM txfm_type_ls[txfm_type_num] = { TYPE_DCT, TYPE_ADST,
+ TYPE_IDTX };
+
+const int txfm_size_num = 5;
+
+const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
+
+const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
+ { av1_fdct4_new, av1_fadst4_new, av1_fidentity4_c },
+ { av1_fdct8_new, av1_fadst8_new, av1_fidentity8_c },
+ { av1_fdct16_new, av1_fadst16_new, av1_fidentity16_c },
+ { av1_fdct32_new, NULL, av1_fidentity32_c },
+ { av1_fdct64_new, NULL, NULL },
+};
+
+// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
+const int8_t cos_bit = 14;
+const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
+
+TEST(av1_fwd_txfm1d, round_shift) {
+ EXPECT_EQ(round_shift(7, 1), 4);
+ EXPECT_EQ(round_shift(-7, 1), -3);
+
+ EXPECT_EQ(round_shift(7, 2), 2);
+ EXPECT_EQ(round_shift(-7, 2), -2);
+
+ EXPECT_EQ(round_shift(8, 2), 2);
+ EXPECT_EQ(round_shift(-8, 2), -2);
+}
+
+TEST(av1_fwd_txfm1d, av1_cospi_arr_data) {
+ for (int i = 0; i < 7; i++) {
+ for (int j = 0; j < 64; j++) {
+ EXPECT_EQ(av1_cospi_arr_data[i][j],
+ (int32_t)round(cos(M_PI * j / 128) * (1 << (cos_bit_min + i))));
+ }
+ }
+}
+
+TEST(av1_fwd_txfm1d, accuracy) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int si = 0; si < txfm_size_num; ++si) {
+ int txfm_size = txfm_size_ls[si];
+ int32_t *input = new int32_t[txfm_size];
+ int32_t *output = new int32_t[txfm_size];
+ double *ref_input = new double[txfm_size];
+ double *ref_output = new double[txfm_size];
+
+ for (int ti = 0; ti < txfm_type_num; ++ti) {
+ TYPE_TXFM txfm_type = txfm_type_ls[ti];
+ TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti];
+ int max_error = 7;
+
+ const int count_test_block = 5000;
+ if (fwd_txfm_func != NULL) {
+ for (int ci = 0; ci < count_test_block; ++ci) {
+ for (int ni = 0; ni < txfm_size; ++ni) {
+ input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+ ref_input[ni] = static_cast<double>(input[ni]);
+ }
+
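+ // Run the integer transform and the double-precision reference on the
+ // same input; they are compared element by element below.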
+ fwd_txfm_func(input, output, cos_bit, range_bit);
+ reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
+
+ for (int ni = 0; ni < txfm_size; ++ni) {
+ ASSERT_LE(
+ abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
+ max_error)
+ << "tx size = " << txfm_size << ", tx type = " << txfm_type;
+ }
+ }
+ }
+ }
+
+ delete[] input;
+ delete[] output;
+ delete[] ref_input;
+ delete[] ref_output;
+ }
+}
+} // namespace
diff --git a/third_party/aom/test/av1_fwd_txfm2d_test.cc b/third_party/aom/test/av1_fwd_txfm2d_test.cc
new file mode 100644
index 000000000..75f20536b
--- /dev/null
+++ b/third_party/aom/test/av1_fwd_txfm2d_test.cc
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/av1_txfm_test.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::TYPE_TXFM;
+using libaom_test::bd;
+using libaom_test::compute_avg_abs_error;
+using libaom_test::input_base;
+
+using std::vector;
+
+namespace {
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef ::testing::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam;
+
+class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
+ public:
+ virtual void SetUp() {
+ tx_type_ = GET_PARAM(0);
+ tx_size_ = GET_PARAM(1);
+ max_error_ = GET_PARAM(2);
+ max_avg_error_ = GET_PARAM(3);
+ count_ = 500;
+ TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg;
+ av1_get_fwd_txfm_cfg(tx_type_, tx_size_, &fwd_txfm_flip_cfg);
+ amplify_factor_ = libaom_test::get_amplification_factor(tx_type_, tx_size_);
+ tx_width_ = tx_size_wide[fwd_txfm_flip_cfg.tx_size];
+ tx_height_ = tx_size_high[fwd_txfm_flip_cfg.tx_size];
+ ud_flip_ = fwd_txfm_flip_cfg.ud_flip;
+ lr_flip_ = fwd_txfm_flip_cfg.lr_flip;
+
+ fwd_txfm_ = libaom_test::fwd_txfm_func_ls[tx_size_];
+ txfm2d_size_ = tx_width_ * tx_height_;
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(input_[0]) * txfm2d_size_));
+ output_ = reinterpret_cast<int32_t *>(
+ aom_memalign(16, sizeof(output_[0]) * txfm2d_size_));
+ ref_input_ = reinterpret_cast<double *>(
+ aom_memalign(16, sizeof(ref_input_[0]) * txfm2d_size_));
+ ref_output_ = reinterpret_cast<double *>(
+ aom_memalign(16, sizeof(ref_output_[0]) * txfm2d_size_));
+ }
+
+ void RunFwdAccuracyCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ double avg_abs_error = 0;
+ for (int ci = 0; ci < count_; ci++) {
+ for (int ni = 0; ni < txfm2d_size_; ++ni) {
+ input_[ni] = rnd.Rand16() % input_base;
+ ref_input_[ni] = static_cast<double>(input_[ni]);
+ output_[ni] = 0;
+ ref_output_[ni] = 0;
+ }
+
+ fwd_txfm_(input_, output_, tx_width_, tx_type_, bd);
+
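+ // Apply the transform type's flip configuration to the floating-point
+ // reference input before computing the reference output.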
+ if (lr_flip_ && ud_flip_) {
+ libaom_test::fliplrud(ref_input_, tx_width_, tx_height_, tx_width_);
+ } else if (lr_flip_) {
+ libaom_test::fliplr(ref_input_, tx_width_, tx_height_, tx_width_);
+ } else if (ud_flip_) {
+ libaom_test::flipud(ref_input_, tx_width_, tx_height_, tx_width_);
+ }
+
+ libaom_test::reference_hybrid_2d(ref_input_, ref_output_, tx_type_,
+ tx_size_);
+
+ double actual_max_error = 0;
+ for (int ni = 0; ni < txfm2d_size_; ++ni) {
+ ref_output_[ni] = round(ref_output_[ni]);
+ const double this_error =
+ fabs(output_[ni] - ref_output_[ni]) / amplify_factor_;
+ actual_max_error = AOMMAX(actual_max_error, this_error);
+ }
+ EXPECT_GE(max_error_, actual_max_error)
+ << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_;
+ if (actual_max_error > max_error_) { // exit early.
+ break;
+ }
+
+ avg_abs_error += compute_avg_abs_error<int32_t, double>(
+ output_, ref_output_, txfm2d_size_);
+ }
+
+ avg_abs_error /= amplify_factor_;
+ avg_abs_error /= count_;
+ EXPECT_GE(max_avg_error_, avg_abs_error)
+ << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_;
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(ref_input_);
+ aom_free(ref_output_);
+ }
+
+ private:
+ double max_error_;
+ double max_avg_error_;
+ int count_;
+ double amplify_factor_;
+ TX_TYPE tx_type_;
+ TX_SIZE tx_size_;
+ int tx_width_;
+ int tx_height_;
+ int txfm2d_size_;
+ FwdTxfm2dFunc fwd_txfm_;
+ int16_t *input_;
+ int32_t *output_;
+ double *ref_input_;
+ double *ref_output_;
+ int ud_flip_; // flip upside down
+ int lr_flip_; // flip left to right
+};
+
+static double avg_error_ls[TX_SIZES_ALL] = {
+ 0.5, // 4x4 transform
+ 0.5, // 8x8 transform
+ 1.2, // 16x16 transform
+ 6.1, // 32x32 transform
+ 3.4, // 64x64 transform
+ 0.57, // 4x8 transform
+ 0.68, // 8x4 transform
+ 0.92, // 8x16 transform
+ 1.1, // 16x8 transform
+ 4.1, // 16x32 transform
+ 6, // 32x16 transform
+ 3.5, // 32x64 transform
+ 5.7, // 64x32 transform
+ 0.6, // 4x16 transform
+ 0.9, // 16x4 transform
+ 1.2, // 8x32 transform
+ 1.7, // 32x8 transform
+ 2.0, // 16x64 transform
+ 4.7, // 64x16 transform
+};
+
+static double max_error_ls[TX_SIZES_ALL] = {
+ 3, // 4x4 transform
+ 5, // 8x8 transform
+ 11, // 16x16 transform
+ 70, // 32x32 transform
+ 64, // 64x64 transform
+ 3.9, // 4x8 transform
+ 4.3, // 8x4 transform
+ 12, // 8x16 transform
+ 12, // 16x8 transform
+ 32, // 16x32 transform
+ 46, // 32x16 transform
+ 136, // 32x64 transform
+ 136, // 64x32 transform
+ 5, // 4x16 transform
+ 6, // 16x4 transform
+ 21, // 8x32 transform
+ 13, // 32x8 transform
+ 30, // 16x64 transform
+ 36, // 64x16 transform
+};
+
+vector<AV1FwdTxfm2dParam> GetTxfm2dParamList() {
+ vector<AV1FwdTxfm2dParam> param_list;
+ for (int s = 0; s < TX_SIZES; ++s) {
+ const double max_error = max_error_ls[s];
+ const double avg_error = avg_error_ls[s];
+ for (int t = 0; t < TX_TYPES; ++t) {
+ const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
+ const TX_SIZE tx_size = static_cast<TX_SIZE>(s);
+ if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
+ param_list.push_back(
+ AV1FwdTxfm2dParam(tx_type, tx_size, max_error, avg_error));
+ }
+ }
+ }
+ return param_list;
+}
+
+INSTANTIATE_TEST_CASE_P(C, AV1FwdTxfm2d,
+ ::testing::ValuesIn(GetTxfm2dParamList()));
+
+TEST_P(AV1FwdTxfm2d, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); }
+
+TEST(AV1FwdTxfm2d, CfgTest) {
+ for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
+ int bd = libaom_test::bd_arr[bd_idx];
+ int8_t low_range = libaom_test::low_range_arr[bd_idx];
+ int8_t high_range = libaom_test::high_range_arr[bd_idx];
+ for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size),
+ static_cast<TX_TYPE>(tx_type)) ==
+ false) {
+ continue;
+ }
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(static_cast<TX_TYPE>(tx_type),
+ static_cast<TX_SIZE>(tx_size), &cfg);
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ av1_gen_fwd_stage_range(stage_range_col, stage_range_row, &cfg, bd);
+ libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
+ cfg.cos_bit_col, low_range,
+ high_range);
+ libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
+ cfg.cos_bit_row, low_range,
+ high_range);
+ }
+ }
+ }
+}
+
+typedef void (*lowbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
+ const int bd = 8;
+ TxfmParam param;
+ memset(&param, 0, sizeof(param));
+ const int rows = tx_size_high[tx_size];
+ const int cols = tx_size_wide[tx_size];
+ // printf("%d x %d\n", cols, rows);
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(
+ tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+ continue;
+ }
+
+ FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+ if (ref_func != NULL) {
+ DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+ DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+ int input_stride = 64;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int cnt = 0; cnt < 500; ++cnt) {
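+ // The first iteration feeds the maximum pixel value everywhere to probe
+ // for overflow; later iterations use random input.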
+ if (cnt == 0) {
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = (1 << bd) - 1;
+ }
+ }
+ } else {
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+ }
+ }
+ }
+ param.tx_type = (TX_TYPE)tx_type;
+ param.tx_size = (TX_SIZE)tx_size;
+ param.tx_set_type = EXT_TX_SET_ALL16;
+ param.bd = bd;
+ ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+ target_func(input, output, input_stride, &param);
+ const int check_rows = AOMMIN(32, rows);
+ const int check_cols = AOMMIN(32, rows * cols / check_rows);
+ for (int r = 0; r < check_rows; ++r) {
+ for (int c = 0; c < check_cols; ++c) {
+ ASSERT_EQ(ref_output[r * check_cols + c],
+ output[r * check_cols + c])
+ << "[" << r << "," << c << "] cnt:" << cnt
+ << " tx_size: " << tx_size << " tx_type: " << tx_type;
+ }
+ }
+ }
+ }
+ }
+}
+
+typedef ::testing::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
+
+class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
+
+TEST_P(AV1FwdTxfm2dTest, match) {
+ AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if HAVE_SSE2
+static TX_SIZE fwd_txfm_for_sse2[] = {
+ TX_4X4,
+ TX_8X8,
+ TX_16X16,
+ TX_32X32,
+ // TX_64X64,
+ TX_4X8,
+ TX_8X4,
+ TX_8X16,
+ TX_16X8,
+ TX_16X32,
+ TX_32X16,
+ // TX_32X64,
+ // TX_64X32,
+ TX_4X16,
+ TX_16X4,
+ TX_8X32,
+ TX_32X8,
+ TX_16X64,
+ TX_64X16,
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, AV1FwdTxfm2dTest,
+ Combine(ValuesIn(fwd_txfm_for_sse2),
+ Values(av1_lowbd_fwd_txfm_sse2)));
+#endif // HAVE_SSE2
+
+#if HAVE_SSE4_1
+static TX_SIZE fwd_txfm_for_sse41[] = {
+ TX_4X4,
+ TX_64X64,
+ TX_32X64,
+ TX_64X32,
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1FwdTxfm2dTest,
+ Combine(ValuesIn(fwd_txfm_for_sse41),
+ Values(av1_lowbd_fwd_txfm_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+static TX_SIZE fwd_txfm_for_avx2[] = {
+ TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4,
+ TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
+ TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16,
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, AV1FwdTxfm2dTest,
+ Combine(ValuesIn(fwd_txfm_for_avx2),
+ Values(av1_lowbd_fwd_txfm_avx2)));
+#endif // HAVE_AVX2
+
+typedef void (*Highbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+void AV1HighbdFwdTxfm2dMatchTest(TX_SIZE tx_size,
+ Highbd_fwd_txfm_func target_func) {
+ const int bd_ar[2] = { 10, 12 };
+ TxfmParam param;
+ memset(&param, 0, sizeof(param));
+ const int rows = tx_size_high[tx_size];
+ const int cols = tx_size_wide[tx_size];
+ for (int i = 0; i < 2; ++i) {
+ const int bd = bd_ar[i];
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(
+ tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+ continue;
+ }
+
+ FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+ if (ref_func != NULL) {
+ DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+ DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+ int input_stride = 64;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int cnt = 0; cnt < 500; ++cnt) {
+ if (cnt == 0) {
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = (1 << bd) - 1;
+ }
+ }
+ } else {
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+ }
+ }
+ }
+ param.tx_type = (TX_TYPE)tx_type;
+ param.tx_size = (TX_SIZE)tx_size;
+ param.tx_set_type = EXT_TX_SET_ALL16;
+ param.bd = bd;
+
+ ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+ target_func(input, output, input_stride, &param);
+ const int check_rows = AOMMIN(32, rows);
+ const int check_cols = AOMMIN(32, rows * cols / check_rows);
+ for (int r = 0; r < check_rows; ++r) {
+ for (int c = 0; c < check_cols; ++c) {
+ ASSERT_EQ(ref_output[r * check_cols + c],
+ output[r * check_cols + c])
+ << "[" << r << "," << c << "] cnt:" << cnt
+ << " tx_size: " << tx_size << " tx_type: " << tx_type;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void AV1HighbdFwdTxfm2dSpeedTest(TX_SIZE tx_size,
+ Highbd_fwd_txfm_func target_func) {
+ const int bd_ar[2] = { 10, 12 };
+ TxfmParam param;
+ memset(&param, 0, sizeof(param));
+ const int rows = tx_size_high[tx_size];
+ const int cols = tx_size_wide[tx_size];
+ const int num_loops = 1000000 / (rows * cols);
+
+ for (int i = 0; i < 2; ++i) {
+ const int bd = bd_ar[i];
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(
+ tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+ continue;
+ }
+
+ FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+ if (ref_func != NULL) {
+ DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+ DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+ int input_stride = 64;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+ }
+ }
+
+ param.tx_type = (TX_TYPE)tx_type;
+ param.tx_size = (TX_SIZE)tx_size;
+ param.tx_set_type = EXT_TX_SET_ALL16;
+ param.bd = bd;
+
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_usec_timer_start(&ref_timer);
+ for (int n = 0; n < num_loops; ++n) {
+ ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int n = 0; n < num_loops; ++n) {
+ target_func(input, output, input_stride, &param);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+ "gain=%d \n",
+ tx_size, tx_type, elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+ }
+ }
+ }
+}
+
+typedef ::testing::tuple<TX_SIZE, Highbd_fwd_txfm_func> HighbdFwdTxfm2dParam;
+
+class AV1HighbdFwdTxfm2dTest
+ : public ::testing::TestWithParam<HighbdFwdTxfm2dParam> {};
+
+TEST_P(AV1HighbdFwdTxfm2dTest, match) {
+ AV1HighbdFwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+TEST_P(AV1HighbdFwdTxfm2dTest, DISABLED_Speed) {
+ AV1HighbdFwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+using ::testing::Combine;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if HAVE_SSE4_1
+static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = {
+ TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4,
+ TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
+ TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16,
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
+ Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1),
+ Values(av1_highbd_fwd_txfm)));
+#endif // HAVE_SSE4_1
+
+} // namespace
diff --git a/third_party/aom/test/av1_highbd_iht_test.cc b/third_party/aom/test/av1_highbd_iht_test.cc
new file mode 100644
index 000000000..2d6490c2a
--- /dev/null
+++ b/third_party/aom/test/av1_highbd_iht_test.cc
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/av1_txfm_test.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+#include "av1/common/scan.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+namespace {
+
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd);
+
+typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
+ TX_TYPE tx_type, int bd);
+
+// Test parameter argument list:
+// <transform reference function,
+// optimized inverse transform function,
+// inverse transform reference function,
+// num_coeffs,
+// tx_type,
+// bit_depth>
+typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, TX_TYPE, int> IHbdHtParam;
+
+class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
+ public:
+ virtual ~AV1HighbdInvHTNxN() {}
+
+ virtual void SetUp() {
+ txfm_ref_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ inv_txfm_ref_ = GET_PARAM(2);
+ num_coeffs_ = GET_PARAM(3);
+ tx_type_ = GET_PARAM(4);
+ bit_depth_ = GET_PARAM(5);
+
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(input_[0]) * num_coeffs_));
+
+ // Note: the inverse transform input buffer is 32-byte aligned.
+ // See alloc_mode_context() in <root>/av1/encoder/context_tree.c.
+ coeffs_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
+ output_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(32, sizeof(output_[0]) * num_coeffs_));
+ output_ref_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(32, sizeof(output_ref_[0]) * num_coeffs_));
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(coeffs_);
+ aom_free(output_);
+ aom_free(output_ref_);
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunBitexactCheck();
+
+ private:
+ int GetStride() const {
+ if (16 == num_coeffs_) {
+ return 4;
+ } else if (64 == num_coeffs_) {
+ return 8;
+ } else if (256 == num_coeffs_) {
+ return 16;
+ } else if (1024 == num_coeffs_) {
+ return 32;
+ } else if (4096 == num_coeffs_) {
+ return 64;
+ } else {
+ return 0;
+ }
+ }
+
+ HbdHtFunc txfm_ref_;
+ IHbdHtFunc inv_txfm_;
+ IHbdHtFunc inv_txfm_ref_;
+ int num_coeffs_;
+ TX_TYPE tx_type_;
+ int bit_depth_;
+
+ int16_t *input_;
+ int32_t *coeffs_;
+ uint16_t *output_;
+ uint16_t *output_ref_;
+};
+
+void AV1HighbdInvHTNxN::RunBitexactCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int stride = GetStride();
+ const int num_tests = 20000;
+ const uint16_t mask = (1 << bit_depth_) - 1;
+
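+ // For each random block, run the reference and optimized inverse transforms
+ // on identical coefficients and require bit-exact output.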
+ for (int i = 0; i < num_tests; ++i) {
+ for (int j = 0; j < num_coeffs_; ++j) {
+ input_[j] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+ output_ref_[j] = rnd.Rand16() & mask;
+ output_[j] = output_ref_[j];
+ }
+
+ txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_);
+ inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ inv_txfm_(coeffs_, output_, stride, tx_type_, bit_depth_));
+
+ for (int j = 0; j < num_coeffs_; ++j) {
+ EXPECT_EQ(output_ref_[j], output_[j])
+ << "Not bit-exact result at index: " << j << " At test block: " << i;
+ }
+ }
+}
+
+TEST_P(AV1HighbdInvHTNxN, InvTransResultCheck) { RunBitexactCheck(); }
+
+using ::testing::make_tuple;
+
+#if HAVE_SSE4_1
+#define PARAM_LIST_4X4 \
+ &av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \
+ &av1_inv_txfm2d_add_4x4_c, 16
+
+const IHbdHtParam kArrayIhtParam[] = {
+ // 4x4
+ make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, DCT_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_ADST, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_ADST, 12),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvHTNxN,
+ ::testing::ValuesIn(kArrayIhtParam));
+#endif // HAVE_SSE4_1
+
+typedef void (*HighbdInvTxfm2dFunc)(const int32_t *input, uint8_t *output,
+ int stride, const TxfmParam *txfm_param);
+
+typedef ::testing::tuple<const HighbdInvTxfm2dFunc> AV1HighbdInvTxfm2dParam;
+class AV1HighbdInvTxfm2d
+ : public ::testing::TestWithParam<AV1HighbdInvTxfm2dParam> {
+ public:
+ virtual void SetUp() { target_func_ = GET_PARAM(0); }
+ void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times,
+ int bit_depth);
+
+ private:
+ HighbdInvTxfm2dFunc target_func_;
+};
+
+void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_,
+ int run_times, int bit_depth_) {
+ FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_];
+ TxfmParam txfm_param;
+ const int BLK_WIDTH = 64;
+ const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
+ DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(32, uint16_t, output[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(32, uint16_t, ref_output[BLK_SIZE]) = { 0 };
+ int stride = BLK_WIDTH;
+ int rows = tx_size_high[tx_size_];
+ int cols = tx_size_wide[tx_size_];
+ const int rows_nonezero = AOMMIN(32, rows);
+ const int cols_nonezero = AOMMIN(32, cols);
+ const uint16_t mask = (1 << bit_depth_) - 1;
+ run_times /= (rows * cols);
+ run_times = AOMMAX(1, run_times);
+ const SCAN_ORDER *scan_order = get_default_scan(tx_size_, tx_type_);
+ const int16_t *scan = scan_order->scan;
+ const int16_t eobmax = rows_nonezero * cols_nonezero;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int randTimes = run_times == 1 ? (eobmax) : 1;
+
+ txfm_param.tx_type = tx_type_;
+ txfm_param.tx_size = tx_size_;
+ txfm_param.lossless = 0;
+ txfm_param.bd = bit_depth_;
+ txfm_param.is_hbd = 1;
+ txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
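+ // Each iteration builds a random block, forward-transforms it, truncates the
+ // coefficients to a target eob, and then checks (or times) the inverse.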
+ for (int cnt = 0; cnt < randTimes; ++cnt) {
+ for (int r = 0; r < BLK_WIDTH; ++r) {
+ for (int c = 0; c < BLK_WIDTH; ++c) {
+ input[r * cols + c] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+ output[r * stride + c] = rnd.Rand16() & mask;
+
+ ref_output[r * stride + c] = output[r * stride + c];
+ }
+ }
+ fwd_func_(input, inv_input, stride, tx_type_, bit_depth_);
+
+ // produce eob input by setting high freq coeffs to zero
+ const int eob = AOMMIN(cnt + 1, eobmax);
+ for (int i = eob; i < eobmax; i++) {
+ inv_input[scan[i]] = 0;
+ }
+ txfm_param.eob = eob;
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_highbd_inv_txfm_add_c(inv_input, CONVERT_TO_BYTEPTR(ref_output),
+ stride, &txfm_param);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(inv_input, CONVERT_TO_BYTEPTR(output), stride, &txfm_param);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+ if (run_times > 10) {
+ printf(
+ "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+ "gain=%d \n",
+ tx_size_, tx_type_, elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+ } else {
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ ASSERT_EQ(ref_output[r * stride + c], output[r * stride + c])
+ << "[" << r << "," << c << "] " << cnt
+ << " tx_size: " << static_cast<int>(tx_size_)
+ << " tx_type: " << tx_type_ << " eob " << eob;
+ }
+ }
+ }
+ }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, match) {
+ int bitdepth_ar[2] = { 10, 12 };
+ for (int k = 0; k < 2; ++k) {
+ int bd = bitdepth_ar[k];
+ for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+ for (int i = 0; i < (int)TX_TYPES; ++i) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+ static_cast<TX_TYPE>(i))) {
+ RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+ 1, bd);
+ }
+ }
+ }
+ }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, DISABLED_Speed) {
+ int bitdepth_ar[2] = { 10, 12 };
+ for (int k = 0; k < 2; ++k) {
+ int bd = bitdepth_ar[k];
+ for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+ for (int i = 0; i < (int)TX_TYPES; ++i) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+ static_cast<TX_TYPE>(i))) {
+ RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+ 1000000, bd);
+ }
+ }
+ }
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvTxfm2d,
+ ::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvTxfm2d,
+ ::testing::Values(av1_highbd_inv_txfm_add_avx2));
+#endif
+} // namespace
diff --git a/third_party/aom/test/av1_horz_only_frame_superres_test.cc b/third_party/aom/test/av1_horz_only_frame_superres_test.cc
new file mode 100644
index 000000000..fd77ef35d
--- /dev/null
+++ b/third_party/aom/test/av1_horz_only_frame_superres_test.cc
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+
+template <typename Pixel>
+class TestImage {
+ public:
+ TestImage(int w_src, int h, int superres_denom, int x0, int bd)
+ : w_src_(w_src), h_(h), superres_denom_(superres_denom), x0_(x0),
+ bd_(bd) {
+ assert(bd < 16);
+ assert(bd <= 8 * static_cast<int>(sizeof(Pixel)));
+ assert(9 <= superres_denom && superres_denom <= 16);
+ assert(SCALE_NUMERATOR == 8);
+ assert(0 <= x0_ && x0_ <= RS_SCALE_SUBPEL_MASK);
+
+ w_dst_ = w_src_;
+ av1_calculate_unscaled_superres_size(&w_dst_, NULL, superres_denom);
+
+ src_stride_ = ALIGN_POWER_OF_TWO(w_src_ + 2 * kHPad, 4);
+ dst_stride_ = ALIGN_POWER_OF_TWO(w_dst_ + 2 * kHPad, 4);
+
+ // Allocate image data
+ src_data_.resize(2 * src_block_size());
+ dst_data_.resize(2 * dst_block_size());
+ }
+
+ void Initialize(ACMRandom *rnd);
+ void Check() const;
+
+ int src_stride() const { return src_stride_; }
+ int dst_stride() const { return dst_stride_; }
+
+ int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+ int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+ int src_width() const { return w_src_; }
+ int dst_width() const { return w_dst_; }
+ int height() const { return h_; }
+ int x0() const { return x0_; }
+
+ const Pixel *GetSrcData(bool ref, bool borders) const {
+ const Pixel *block = &src_data_[ref ? 0 : src_block_size()];
+ return borders ? block : block + kHPad + src_stride_ * kVPad;
+ }
+
+ Pixel *GetDstData(bool ref, bool borders) {
+ Pixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+ return borders ? block : block + kHPad + dst_stride_ * kVPad;
+ }
+
+ private:
+ int w_src_, w_dst_, h_, superres_denom_, x0_, bd_;
+ int src_stride_, dst_stride_;
+
+ std::vector<Pixel> src_data_;
+ std::vector<Pixel> dst_data_;
+};
+
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+ if (!trash) {
+ memset(data, 0, sizeof(*data) * num_pixels);
+ return;
+ }
+ const Pixel mask = (1 << bd) - 1;
+ for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+ bool trash_edges, Pixel *data) {
+ assert(rnd);
+ const Pixel mask = (1 << bd) - 1;
+
+ // Fill in the first buffer with random data
+ // Top border
+ FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+ for (int r = 0; r < h; ++r) {
+ Pixel *row_data = data + (kVPad + r) * stride;
+ // Left border, contents, right border
+ FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+ for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+ FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+ }
+ // Bottom border
+ FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+ const int bpp = sizeof(*data);
+ const int block_elts = stride * (h + 2 * kVPad);
+ const int block_size = bpp * block_elts;
+
+ // Now copy that to the second buffer
+ memcpy(data + block_elts, data, block_size);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Initialize(ACMRandom *rnd) {
+ PrepBuffers(rnd, w_src_, h_, src_stride_, bd_, false, &src_data_[0]);
+ PrepBuffers(rnd, w_dst_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Check() const {
+ const int num_pixels = dst_block_size();
+ const Pixel *ref_dst = &dst_data_[0];
+ const Pixel *tst_dst = &dst_data_[num_pixels];
+
+ // If memcmp returns 0, there's nothing to do.
+ if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+
+ // Otherwise, iterate through the buffer looking for differences, *ignoring
+ // the edges*
+ const int stride = dst_stride_;
+ for (int r = kVPad; r < h_ + kVPad; ++r) {
+ for (int c = kHPad; c < w_dst_ + kHPad; ++c) {
+ const int32_t ref_value = ref_dst[r * stride + c];
+ const int32_t tst_value = tst_dst[r * stride + c];
+
+ EXPECT_EQ(tst_value, ref_value)
+ << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad)
+ << ", superres_denom: " << superres_denom_ << ", height: " << h_
+ << ", src_width: " << w_src_ << ", dst_width: " << w_dst_
+ << ", x0: " << x0_;
+ }
+ }
+}
+
+template <typename Pixel>
+class ConvolveHorizRSTestBase : public ::testing::Test {
+ public:
+ ConvolveHorizRSTestBase() : image_(NULL) {}
+ virtual ~ConvolveHorizRSTestBase() {}
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ // Implemented by subclasses. SetUp depends on the parameters passed in, and
+ // RunOne depends on the function under test; they cannot be templated over
+ // low/high bit depth because the functions take different numbers of
+ // parameters.
+ virtual void SetUp() = 0;
+ virtual void RunOne(bool ref) = 0;
+
+ protected:
+ void SetBitDepth(int bd) { bd_ = bd; }
+
+ void CorrectnessTest() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int i = 0; i < kTestIters; ++i) {
+ for (int superres_denom = 9; superres_denom <= 16; superres_denom++) {
+ // Get a random height between 512 and 767
+ int height = rnd.Rand8() + 512;
+
+ // Get a random src width between 128 and 383
+ int width_src = rnd.Rand8() + 128;
+
+ // x0 is normally calculated by get_upscale_convolve_x0 in
+ // av1/common/resize.c. However, this test should work for
+ // any value of x0 between 0 and RS_SCALE_SUBPEL_MASK
+ // (inclusive), so we choose one at random.
+ int x0 = rnd.Rand16() % (RS_SCALE_SUBPEL_MASK + 1);
+
+ image_ =
+ new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+ Prep(&rnd);
+ RunOne(true);
+ RunOne(false);
+ image_->Check();
+
+ delete image_;
+ }
+ }
+ }
+
+ void SpeedTest() {
+ // Pick some specific parameters to test
+ int height = 767;
+ int width_src = 129;
+ int superres_denom = 13;
+ int x0 = RS_SCALE_SUBPEL_MASK >> 1;
+
+ image_ = new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ Prep(&rnd);
+
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
+ for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+ << "Error: ConvolveHorizRSTest (Speed Test), SIMD slower than C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
+ }
+
+ void Prep(ACMRandom *rnd) {
+ assert(rnd);
+ image_->Initialize(rnd);
+ }
+
+ int bd_;
+ TestImage<Pixel> *image_;
+};
+
+typedef void (*LowBDConvolveHorizRsFunc)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ const int x0_qn, const int x_step_qn);
+
+// Test parameter list:
+// <tst_fun_>
+typedef tuple<LowBDConvolveHorizRsFunc> LowBDParams;
+
+class LowBDConvolveHorizRSTest
+ : public ConvolveHorizRSTestBase<uint8_t>,
+ public ::testing::WithParamInterface<LowBDParams> {
+ public:
+ virtual ~LowBDConvolveHorizRSTest() {}
+
+ void SetUp() {
+ tst_fun_ = GET_PARAM(0);
+ const int bd = 8;
+ SetBitDepth(bd);
+ }
+
+ void RunOne(bool ref) {
+ const uint8_t *src = image_->GetSrcData(ref, false);
+ uint8_t *dst = image_->GetDstData(ref, false);
+ const int src_stride = image_->src_stride();
+ const int dst_stride = image_->dst_stride();
+ const int width_src = image_->src_width();
+ const int width_dst = image_->dst_width();
+ const int height = image_->height();
+ const int x0_qn = image_->x0();
+
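+ // Derive the horizontal upscale step from the source and destination
+ // widths with the library's own helper.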
+ const int32_t x_step_qn =
+ av1_get_upscale_convolve_step(width_src, width_dst);
+
+ if (ref) {
+ av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, width_dst,
+ height, &av1_resize_filter_normative[0][0], x0_qn,
+ x_step_qn);
+ } else {
+ tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+ &av1_resize_filter_normative[0][0], x0_qn, x_step_qn);
+ }
+ }
+
+ private:
+ LowBDConvolveHorizRsFunc tst_fun_;
+};
+
+TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, LowBDConvolveHorizRSTest,
+ ::testing::Values(av1_convolve_horiz_rs_sse4_1));
+
+typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ const int x0_qn, const int x_step_qn,
+ int bd);
+
+// Test parameter list:
+// <tst_fun_, bd_>
+typedef tuple<HighBDConvolveHorizRsFunc, int> HighBDParams;
+
+class HighBDConvolveHorizRSTest
+ : public ConvolveHorizRSTestBase<uint16_t>,
+ public ::testing::WithParamInterface<HighBDParams> {
+ public:
+ virtual ~HighBDConvolveHorizRSTest() {}
+
+ void SetUp() {
+ tst_fun_ = GET_PARAM(0);
+ const int bd = GET_PARAM(1);
+ SetBitDepth(bd);
+ }
+
+ void RunOne(bool ref) {
+ const uint16_t *src = image_->GetSrcData(ref, false);
+ uint16_t *dst = image_->GetDstData(ref, false);
+ const int src_stride = image_->src_stride();
+ const int dst_stride = image_->dst_stride();
+ const int width_src = image_->src_width();
+ const int width_dst = image_->dst_width();
+ const int height = image_->height();
+ const int x0_qn = image_->x0();
+
+ const int32_t x_step_qn =
+ av1_get_upscale_convolve_step(width_src, width_dst);
+
+ if (ref) {
+ av1_highbd_convolve_horiz_rs_c(
+ src, src_stride, dst, dst_stride, width_dst, height,
+ &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+ } else {
+ tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+ &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+ }
+ }
+
+ private:
+ HighBDConvolveHorizRsFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(HighBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, HighBDConvolveHorizRSTest,
+ ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_sse4_1),
+ ::testing::ValuesIn(kBDs)));
+
+} // namespace
diff --git a/third_party/aom/test/av1_inv_txfm1d_test.cc b/third_party/aom/test/av1_inv_txfm1d_test.cc
new file mode 100644
index 000000000..bf3a44ed1
--- /dev/null
+++ b/third_party/aom/test/av1_inv_txfm1d_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "test/av1_txfm_test.h"
+#include "test/util.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+
+namespace {
+const int txfm_type_num = 2;
+const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
+
+const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
+ { av1_fdct4_new, av1_fadst4_new },
+ { av1_fdct8_new, av1_fadst8_new },
+ { av1_fdct16_new, av1_fadst16_new },
+ { av1_fdct32_new, NULL },
+ { av1_fdct64_new, NULL },
+};
+
+const TxfmFunc inv_txfm_func_ls[][txfm_type_num] = {
+ { av1_idct4_new, av1_iadst4_new },
+ { av1_idct8_new, av1_iadst8_new },
+ { av1_idct16_new, av1_iadst16_new },
+ { av1_idct32_new, NULL },
+ { av1_idct64_new, NULL },
+};
+
+// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
+const int8_t cos_bit = 13;
+const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
+
+void reference_idct_1d_int(const int32_t *in, int32_t *out, int size) {
+ double input[64];
+ for (int i = 0; i < size; ++i) input[i] = in[i];
+
+ double output[64];
+ libaom_test::reference_idct_1d(input, output, size);
+
+ for (int i = 0; i < size; ++i) {
+ ASSERT_GE(output[i], INT32_MIN);
+ ASSERT_LE(output[i], INT32_MAX);
+ out[i] = static_cast<int32_t>(round(output[i]));
+ }
+}
+
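+// Fill |dst| with random 16-bit values; roughly one entry in ten is forced to
+// the extreme minimum or maximum to stress the transform's dynamic range.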
+void random_matrix(int32_t *dst, int len, ACMRandom *rnd) {
+ const int bits = 16;
+ const int maxVal = (1 << (bits - 1)) - 1;
+ const int minVal = -(1 << (bits - 1));
+ for (int i = 0; i < len; ++i) {
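+    // ~90% of the values are uniform in [minVal, maxVal]; the rest are
+    // forced to the extremes.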
+ if (rnd->Rand8() % 10)
+ dst[i] = minVal + rnd->Rand16() % (1 << bits);
+ else
+ dst[i] = rnd->Rand8() % 2 ? minVal : maxVal;
+ }
+}
+
+TEST(av1_inv_txfm1d, InvAccuracyCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 20000;
+ const int max_error[] = { 6, 10, 19, 31, 40 };
+ ASSERT_EQ(NELEMENTS(max_error), TX_SIZES);
+ ASSERT_EQ(NELEMENTS(inv_txfm_func_ls), TX_SIZES);
+ for (int k = 0; k < count_test_block; ++k) {
+ // choose a random transform to test
+ const TX_SIZE tx_size = static_cast<TX_SIZE>(rnd.Rand8() % TX_SIZES);
+ const int tx_size_pix = txfm_size_ls[tx_size];
+ const TxfmFunc inv_txfm_func = inv_txfm_func_ls[tx_size][0];
+
+ int32_t input[64];
+ random_matrix(input, tx_size_pix, &rnd);
+
+    // The 64-point inverse transform assumes the last 32 input values are zero.
+ memset(input + 32, 0, 32 * sizeof(input[0]));
+
+ int32_t ref_output[64];
+ reference_idct_1d_int(input, ref_output, tx_size_pix);
+
+ int32_t output[64];
+ inv_txfm_func(input, output, cos_bit, range_bit);
+
+ for (int i = 0; i < tx_size_pix; ++i) {
+ EXPECT_LE(abs(output[i] - ref_output[i]), max_error[tx_size])
+ << "tx_size = " << tx_size << ", i = " << i
+ << ", output[i] = " << output[i]
+ << ", ref_output[i] = " << ref_output[i];
+ }
+ }
+}
+
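+// Returns the index of the highest set bit, i.e. floor(log2(x)) for x > 0
+// (and -1 for x == 0).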
+static INLINE int get_max_bit(int x) {
+ int max_bit = -1;
+ while (x) {
+ x = x >> 1;
+ max_bit++;
+ }
+ return max_bit;
+}
+
+TEST(av1_inv_txfm1d, get_max_bit) {
+ int max_bit = get_max_bit(8);
+ EXPECT_EQ(max_bit, 3);
+}
+
+TEST(av1_inv_txfm1d, round_trip) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int si = 0; si < NELEMENTS(fwd_txfm_func_ls); ++si) {
+ int txfm_size = txfm_size_ls[si];
+
+ for (int ti = 0; ti < txfm_type_num; ++ti) {
+ TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti];
+ TxfmFunc inv_txfm_func = inv_txfm_func_ls[si][ti];
+ int max_error = 2;
+
+ if (!fwd_txfm_func) continue;
+
+ const int count_test_block = 5000;
+ for (int ci = 0; ci < count_test_block; ++ci) {
+ int32_t input[64];
+ int32_t output[64];
+ int32_t round_trip_output[64];
+
+ ASSERT_LE(txfm_size, NELEMENTS(input));
+
+ for (int ni = 0; ni < txfm_size; ++ni) {
+ input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+ }
+
+ fwd_txfm_func(input, output, cos_bit, range_bit);
+ inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
+
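+      // The forward+inverse pair is expected to amplify the input by
+      // txfm_size / 2, so shift the round-trip output back before comparing.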
+ for (int ni = 0; ni < txfm_size; ++ni) {
+ int node_err =
+ abs(input[ni] - round_shift(round_trip_output[ni],
+ get_max_bit(txfm_size) - 1));
+ EXPECT_LE(node_err, max_error);
+ }
+ }
+ }
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/av1_inv_txfm2d_test.cc b/third_party/aom/test/av1_inv_txfm2d_test.cc
new file mode 100644
index 000000000..11e231ba6
--- /dev/null
+++ b/third_party/aom/test/av1_inv_txfm2d_test.cc
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/scan.h"
+#include "test/acm_random.h"
+#include "test/av1_txfm_test.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::InvTxfm2dFunc;
+using libaom_test::LbdInvTxfm2dFunc;
+using libaom_test::bd;
+using libaom_test::compute_avg_abs_error;
+using libaom_test::input_base;
+
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+
+using std::vector;
+
+namespace {
+
+// AV1InvTxfm2dParam argument list:
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef ::testing::tuple<TX_TYPE, TX_SIZE, int, double> AV1InvTxfm2dParam;
+
+class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
+ public:
+ virtual void SetUp() {
+ tx_type_ = GET_PARAM(0);
+ tx_size_ = GET_PARAM(1);
+ max_error_ = GET_PARAM(2);
+ max_avg_error_ = GET_PARAM(3);
+ }
+
+ void RunRoundtripCheck() {
+ int tx_w = tx_size_wide[tx_size_];
+ int tx_h = tx_size_high[tx_size_];
+ int txfm2d_size = tx_w * tx_h;
+ const FwdTxfm2dFunc fwd_txfm_func = libaom_test::fwd_txfm_func_ls[tx_size_];
+ const InvTxfm2dFunc inv_txfm_func = libaom_test::inv_txfm_func_ls[tx_size_];
+ double avg_abs_error = 0;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ const int count = 500;
+
+ for (int ci = 0; ci < count; ci++) {
+ DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(input));
+
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ if (ci == 0) {
+ int extreme_input = input_base - 1;
+ input[ni] = extreme_input; // extreme case
+ } else {
+ input[ni] = rnd.Rand16() % input_base;
+ }
+ }
+
+ DECLARE_ALIGNED(16, uint16_t, expected[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(expected));
+ if (TxfmUsesApproximation()) {
+      // Compare (reference forward HT + inverse HT) against
+      // (forward HT + inverse HT).
+ double ref_input[64 * 64];
+ ASSERT_LE(txfm2d_size, NELEMENTS(ref_input));
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ ref_input[ni] = input[ni];
+ }
+ double ref_coeffs[64 * 64] = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs));
+ ASSERT_EQ(tx_type_, DCT_DCT);
+ libaom_test::reference_hybrid_2d(ref_input, ref_coeffs, tx_type_,
+ tx_size_);
+ DECLARE_ALIGNED(16, int32_t, ref_coeffs_int[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs_int));
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ ref_coeffs_int[ni] = (int32_t)round(ref_coeffs[ni]);
+ }
+ inv_txfm_func(ref_coeffs_int, expected, tx_w, tx_type_, bd);
+ } else {
+ // Compare original input vs forward HT + inverse HT.
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ expected[ni] = input[ni];
+ }
+ }
+
+ DECLARE_ALIGNED(16, int32_t, coeffs[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(coeffs));
+ fwd_txfm_func(input, coeffs, tx_w, tx_type_, bd);
+
+ DECLARE_ALIGNED(16, uint16_t, actual[64 * 64]) = { 0 };
+ ASSERT_LE(txfm2d_size, NELEMENTS(actual));
+ inv_txfm_func(coeffs, actual, tx_w, tx_type_, bd);
+
+ double actual_max_error = 0;
+ for (int ni = 0; ni < txfm2d_size; ++ni) {
+ const double this_error = abs(expected[ni] - actual[ni]);
+ actual_max_error = AOMMAX(actual_max_error, this_error);
+ }
+ EXPECT_GE(max_error_, actual_max_error)
+ << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
+ if (actual_max_error > max_error_) { // exit early.
+ break;
+ }
+ avg_abs_error += compute_avg_abs_error<uint16_t, uint16_t>(
+ expected, actual, txfm2d_size);
+ }
+
+ avg_abs_error /= count;
+ EXPECT_GE(max_avg_error_, avg_abs_error)
+ << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
+ }
+
+ private:
+ bool TxfmUsesApproximation() {
+ if (tx_size_wide[tx_size_] == 64 || tx_size_high[tx_size_] == 64) {
+ return true;
+ }
+ return false;
+ }
+
+ int max_error_;
+ double max_avg_error_;
+ TX_TYPE tx_type_;
+ TX_SIZE tx_size_;
+};
+
+static int max_error_ls[TX_SIZES_ALL] = {
+ 2, // 4x4 transform
+ 2, // 8x8 transform
+ 2, // 16x16 transform
+ 4, // 32x32 transform
+ 3, // 64x64 transform
+ 2, // 4x8 transform
+ 2, // 8x4 transform
+ 2, // 8x16 transform
+ 2, // 16x8 transform
+ 3, // 16x32 transform
+ 3, // 32x16 transform
+ 5, // 32x64 transform
+ 5, // 64x32 transform
+ 2, // 4x16 transform
+ 2, // 16x4 transform
+ 2, // 8x32 transform
+ 2, // 32x8 transform
+ 3, // 16x64 transform
+ 3, // 64x16 transform
+};
+
+static double avg_error_ls[TX_SIZES_ALL] = {
+ 0.002, // 4x4 transform
+ 0.05, // 8x8 transform
+ 0.07, // 16x16 transform
+ 0.4, // 32x32 transform
+ 0.3, // 64x64 transform
+ 0.02, // 4x8 transform
+ 0.02, // 8x4 transform
+ 0.04, // 8x16 transform
+ 0.07, // 16x8 transform
+ 0.4, // 16x32 transform
+ 0.5, // 32x16 transform
+ 0.38, // 32x64 transform
+ 0.39, // 64x32 transform
+ 0.2, // 4x16 transform
+ 0.2, // 16x4 transform
+ 0.2, // 8x32 transform
+ 0.2, // 32x8 transform
+ 0.38, // 16x64 transform
+ 0.38, // 64x16 transform
+};
+
+vector<AV1InvTxfm2dParam> GetInvTxfm2dParamList() {
+ vector<AV1InvTxfm2dParam> param_list;
+ for (int s = 0; s < TX_SIZES; ++s) {
+ const int max_error = max_error_ls[s];
+ const double avg_error = avg_error_ls[s];
+ for (int t = 0; t < TX_TYPES; ++t) {
+ const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
+ const TX_SIZE tx_size = static_cast<TX_SIZE>(s);
+ if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
+ param_list.push_back(
+ AV1InvTxfm2dParam(tx_type, tx_size, max_error, avg_error));
+ }
+ }
+ }
+ return param_list;
+}
+
+INSTANTIATE_TEST_CASE_P(C, AV1InvTxfm2d,
+ ::testing::ValuesIn(GetInvTxfm2dParamList()));
+
+TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
+
+TEST(AV1InvTxfm2d, CfgTest) {
+ for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
+ int bd = libaom_test::bd_arr[bd_idx];
+ int8_t low_range = libaom_test::low_range_arr[bd_idx];
+ int8_t high_range = libaom_test::high_range_arr[bd_idx];
+ for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size),
+ static_cast<TX_TYPE>(tx_type)) ==
+ false) {
+ continue;
+ }
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_inv_txfm_cfg(static_cast<TX_TYPE>(tx_type),
+ static_cast<TX_SIZE>(tx_size), &cfg);
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg,
+ (TX_SIZE)tx_size, bd);
+ libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
+ cfg.cos_bit_col, low_range,
+ high_range);
+ libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
+ cfg.cos_bit_row, low_range,
+ high_range);
+ }
+ }
+ }
+}
+
+typedef ::testing::tuple<const LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
+class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
+ public:
+ virtual void SetUp() { target_func_ = GET_PARAM(0); }
+ void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times);
+
+ private:
+ LbdInvTxfm2dFunc target_func_;
+};
+
+void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size,
+ int run_times) {
+ FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size];
+ InvTxfm2dFunc ref_func_ = libaom_test::inv_txfm_func_ls[tx_size];
+ if (fwd_func_ == NULL || ref_func_ == NULL || target_func_ == NULL) {
+ return;
+ }
+ const int bd = 8;
+ const int BLK_WIDTH = 64;
+ const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
+ DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(16, uint8_t, output[BLK_SIZE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, ref_output[BLK_SIZE]) = { 0 };
+ int stride = BLK_WIDTH;
+ int rows = tx_size_high[tx_size];
+ int cols = tx_size_wide[tx_size];
+ const int rows_nonezero = AOMMIN(32, rows);
+ const int cols_nonezero = AOMMIN(32, cols);
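+  // Normalize the iteration count by block area so the speed test spends
+  // roughly the same time on each transform size.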
+ run_times /= (rows * cols);
+ run_times = AOMMAX(1, run_times);
+ const SCAN_ORDER *scan_order = get_default_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ const int16_t eobmax = rows_nonezero * cols_nonezero;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
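+  // For the correctness run (run_times == 1) sweep many random inputs and
+  // eob positions; the speed run measures a single input.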
+ int randTimes = run_times == 1 ? (eobmax + 500) : 1;
+ for (int cnt = 0; cnt < randTimes; ++cnt) {
+ const int16_t max_in = (1 << (bd)) - 1;
+ for (int r = 0; r < BLK_WIDTH; ++r) {
+ for (int c = 0; c < BLK_WIDTH; ++c) {
+ input[r * cols + c] = (cnt == 0) ? max_in : rnd.Rand8Extremes();
+ output[r * stride + c] = (cnt == 0) ? 128 : rnd.Rand8();
+ ref_output[r * stride + c] = output[r * stride + c];
+ }
+ }
+ fwd_func_(input, inv_input, stride, tx_type, bd);
+
+    // Produce an input with the desired eob by zeroing the high-frequency
+    // coefficients.
+ const int eob = AOMMIN(cnt + 1, eobmax);
+ for (int i = eob; i < eobmax; i++) {
+ inv_input[scan[i]] = 0;
+ }
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ ref_func_(inv_input, ref_output, stride, tx_type, bd);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(inv_input, output, stride, tx_type, tx_size, eob);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("txfm[%d] %3dx%-3d:%7.2f/%7.2fns", tx_type, cols, rows, time1,
+ time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ uint8_t ref_value = static_cast<uint8_t>(ref_output[r * stride + c]);
+ ASSERT_EQ(ref_value, output[r * stride + c])
+ << "[" << r << "," << c << "] " << cnt
+ << " tx_size: " << static_cast<int>(tx_size)
+ << " tx_type: " << tx_type << " eob " << eob;
+ }
+ }
+ }
+}
+
+TEST_P(AV1LbdInvTxfm2d, match) {
+ for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+ for (int i = 0; i < (int)TX_TYPES; ++i) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+ static_cast<TX_TYPE>(i))) {
+ RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+ 1);
+ }
+ }
+ }
+}
+
+TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) {
+ for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+ for (int i = 0; i < (int)TX_TYPES; ++i) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+ static_cast<TX_TYPE>(i))) {
+ RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+ 10000000);
+ }
+ }
+ }
+}
+
+#if HAVE_SSSE3
+#if defined(_MSC_VER) || defined(__SSSE3__)
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1LbdInvTxfm2d,
+ ::testing::Values(av1_lowbd_inv_txfm2d_add_ssse3));
+#endif // _MSC_VER || __SSSE3__
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+extern "C" void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+
+INSTANTIATE_TEST_CASE_P(AVX2, AV1LbdInvTxfm2d,
+ ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+
+extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+
+INSTANTIATE_TEST_CASE_P(NEON, AV1LbdInvTxfm2d,
+ ::testing::Values(av1_lowbd_inv_txfm2d_add_neon));
+#endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/av1_quantize_test.cc b/third_party/aom/test/av1_quantize_test.cc
new file mode 100644
index 000000000..aaf093918
--- /dev/null
+++ b/third_party/aom/test/av1_quantize_test.cc
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "av1/common/scan.h"
+
+namespace {
+
+typedef void (*QuantizeFpFunc)(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale);
+
+struct QuantizeFuncParams {
+ QuantizeFuncParams(QuantizeFpFunc qF = NULL, QuantizeFpFunc qRefF = NULL,
+ int count = 16)
+ : qFunc(qF), qFuncRef(qRefF), coeffCount(count) {}
+ QuantizeFpFunc qFunc;
+ QuantizeFpFunc qFuncRef;
+ int coeffCount;
+};
+
+using libaom_test::ACMRandom;
+
+const int numTests = 1000;
+const int maxSize = 1024;
+const int roundFactorRange = 127;
+const int dequantRange = 32768;
+const int coeffRange = (1 << 20) - 1;
+
+class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
+ public:
+ void RunQuantizeTest() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
+ DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+ uint16_t eob;
+ uint16_t ref_eob;
+ int err_count_total = 0;
+ int first_failure = -1;
+ int count = params_.coeffCount;
+ const TX_SIZE txSize = getTxSize(count);
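+    // Only the 32x32 case runs with log_scale = 1.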
+ int log_scale = (txSize == TX_32X32);
+ QuantizeFpFunc quanFunc = params_.qFunc;
+ QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+
+ const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+ for (int i = 0; i < numTests; i++) {
+ int err_count = 0;
+ ref_eob = eob = -1;
+ for (int j = 0; j < count; j++) {
+ coeff_ptr[j] = rnd(coeffRange);
+ }
+
+ for (int j = 0; j < 2; j++) {
+ zbin_ptr[j] = rnd.Rand16();
+ quant_shift_ptr[j] = rnd.Rand16();
+        // Keep dequant positive so it fits in int16_t.
+ dequant_ptr[j] = abs(rnd(dequantRange));
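+        // quant is chosen so that quant * dequant ~= 2^16; round stays below
+        // dequant.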
+ quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+ round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+ }
+ for (int j = 2; j < 8; ++j) {
+ zbin_ptr[j] = zbin_ptr[1];
+ quant_shift_ptr[j] = quant_shift_ptr[1];
+ dequant_ptr[j] = dequant_ptr[1];
+ quant_ptr[j] = quant_ptr[1];
+ round_ptr[j] = round_ptr[1];
+ }
+ quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+ &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+ ASM_REGISTER_STATE_CHECK(
+ quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+ scanOrder.scan, scanOrder.iscan, log_scale));
+
+ for (int j = 0; j < count; ++j) {
+ err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+ (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+ ASSERT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+ << "qcoeff error: i = " << i << " j = " << j << "\n";
+ EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
+ << "dqcoeff error: i = " << i << " j = " << j << "\n";
+ }
+ EXPECT_EQ(ref_eob, eob) << "eob error: "
+ << "i = " << i << "\n";
+ err_count += (ref_eob != eob);
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Quantization Test, C output doesn't match SSE2 output. "
+ << "First failed at test case " << first_failure;
+ }
+
+ void RunEobTest() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+ DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
+ DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+ uint16_t eob;
+ uint16_t ref_eob;
+ int count = params_.coeffCount;
+ const TX_SIZE txSize = getTxSize(count);
+ int log_scale = (txSize == TX_32X32);
+ QuantizeFpFunc quanFunc = params_.qFunc;
+ QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+ const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+
+ for (int i = 0; i < numTests; i++) {
+ ref_eob = eob = -1;
+ for (int j = 0; j < count; j++) {
+ coeff_ptr[j] = 0;
+ }
+
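+      // Scatter a few non-zero coefficients at random positions so the eob
+      // varies between iterations.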
+ coeff_ptr[rnd(count)] = rnd(coeffRange);
+ coeff_ptr[rnd(count)] = rnd(coeffRange);
+ coeff_ptr[rnd(count)] = rnd(coeffRange);
+
+ for (int j = 0; j < 2; j++) {
+ zbin_ptr[j] = rnd.Rand16();
+ quant_shift_ptr[j] = rnd.Rand16();
+        // Keep dequant positive so it fits in int16_t.
+ dequant_ptr[j] = abs(rnd(dequantRange));
+ quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+ round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+ }
+ for (int j = 2; j < 8; ++j) {
+ zbin_ptr[j] = zbin_ptr[1];
+ quant_shift_ptr[j] = quant_shift_ptr[1];
+ dequant_ptr[j] = dequant_ptr[1];
+ quant_ptr[j] = quant_ptr[1];
+ round_ptr[j] = round_ptr[1];
+ }
+
+ quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+ &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+ ASM_REGISTER_STATE_CHECK(
+ quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+ scanOrder.scan, scanOrder.iscan, log_scale));
+ EXPECT_EQ(ref_eob, eob) << "eob error: "
+ << "i = " << i << "\n";
+ }
+ }
+
+ virtual void SetUp() { params_ = GetParam(); }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ virtual ~AV1QuantizeTest() {}
+
+ private:
+ TX_SIZE getTxSize(int count) {
+ switch (count) {
+ case 16: return TX_4X4;
+ case 64: return TX_8X8;
+ case 256: return TX_16X16;
+ case 1024: return TX_32X32;
+ default: return TX_4X4;
+ }
+ }
+
+ QuantizeFuncParams params_;
+};
+
+TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
+TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
+
+#if HAVE_SSE4_1
+const QuantizeFuncParams qfps[4] = {
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+ 16),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+ 64),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+ 256),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+ 1024),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const QuantizeFuncParams qfps_avx2[4] = {
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 16),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 64),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 256),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+ 1024),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/av1_round_shift_array_test.cc b/third_party/aom/test/av1_round_shift_array_test.cc
new file mode 100644
index 000000000..181a39460
--- /dev/null
+++ b/third_party/aom/test/av1_round_shift_array_test.cc
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompRoundShift {
+
+typedef void (*comp_round_shift_array_func)(int32_t *arr, int size, int bit);
+
+#if HAVE_SSE4_1 || HAVE_NEON
+const int kValidBitCheck[] = {
+ -4, -3, -2, -1, 0, 1, 2, 3, 4,
+};
+#endif // HAVE_SSE4_1 || HAVE_NEON
+
+typedef ::testing::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
+ CompRoundShiftParam;
+
+class AV1CompRoundShiftTest
+ : public ::testing::TestWithParam<CompRoundShiftParam> {
+ public:
+ ~AV1CompRoundShiftTest();
+
+ void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
+ void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+ int bit);
+ void RunSpeedTest(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+ int bit);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+AV1CompRoundShiftTest::~AV1CompRoundShiftTest() { ; }
+
+void AV1CompRoundShiftTest::RunCheckOutput(
+ comp_round_shift_array_func test_impl, BLOCK_SIZE bsize, int bit) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int blk_wd = 64;
+ DECLARE_ALIGNED(32, int32_t, pred_[blk_wd]);
+ DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+ for (int i = 0; i < (blk_wd); ++i) {
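+    // Presumably divided by 16 so that a left shift by up to 4 (the negative
+    // 'bit' cases) cannot overflow int32.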
+ ref_buffer_[i] = pred_[i] = rnd_.Rand31() / 16;
+ }
+ av1_round_shift_array_c(ref_buffer_, w, bit);
+ test_impl(pred_, w, bit);
+ for (int x = 0; x < w; ++x) {
+    ASSERT_EQ(ref_buffer_[x], pred_[x])
+        << w << "x" << h << " mismatch @(" << x << ")";
+ }
+}
+
+void AV1CompRoundShiftTest::RunSpeedTest(comp_round_shift_array_func test_impl,
+ BLOCK_SIZE bsize, int bit) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int blk_wd = 64;
+ DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+ for (int i = 0; i < (blk_wd); ++i) {
+ ref_buffer_[i] = rnd_.Rand31();
+ }
+
+ const int num_loops = 1000000000 / (w + h);
+ comp_round_shift_array_func funcs[2] = { av1_round_shift_array_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ comp_round_shift_array_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(ref_buffer_, w, bit);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("av1_round_shift_array %3dx%-3d: bit : %d %7.2f/%7.2fns", w, h, bit,
+ elapsed_time[0], elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompRoundShiftTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1CompRoundShiftTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1CompRoundShiftTest,
+ ::testing::Combine(::testing::Values(&av1_round_shift_array_sse4_1),
+ ::testing::ValuesIn(txsize_to_bsize),
+ ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, AV1CompRoundShiftTest,
+ ::testing::Combine(::testing::Values(&av1_round_shift_array_neon),
+ ::testing::ValuesIn(txsize_to_bsize),
+ ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+} // namespace AV1CompRoundShift
diff --git a/third_party/aom/test/av1_txfm_test.cc b/third_party/aom/test/av1_txfm_test.cc
new file mode 100644
index 000000000..d5b0ce325
--- /dev/null
+++ b/third_party/aom/test/av1_txfm_test.cc
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include "test/av1_txfm_test.h"
+
+namespace libaom_test {
+
+int get_txfm1d_size(TX_SIZE tx_size) { return tx_size_wide[tx_size]; }
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) {
+ switch (txfm2d_type) {
+ case DCT_DCT:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_DCT;
+ break;
+ case ADST_DCT:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_DCT;
+ break;
+ case DCT_ADST:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_ADST;
+ break;
+ case ADST_ADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_ADST;
+ break;
+ case FLIPADST_DCT:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_DCT;
+ break;
+ case DCT_FLIPADST:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_ADST;
+ break;
+ case FLIPADST_FLIPADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_ADST;
+ break;
+ case ADST_FLIPADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_ADST;
+ break;
+ case FLIPADST_ADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_ADST;
+ break;
+ case IDTX:
+ *type0 = TYPE_IDTX;
+ *type1 = TYPE_IDTX;
+ break;
+ case H_DCT:
+ *type0 = TYPE_IDTX;
+ *type1 = TYPE_DCT;
+ break;
+ case V_DCT:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_IDTX;
+ break;
+ case H_ADST:
+ *type0 = TYPE_IDTX;
+ *type1 = TYPE_ADST;
+ break;
+ case V_ADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_IDTX;
+ break;
+ case H_FLIPADST:
+ *type0 = TYPE_IDTX;
+ *type1 = TYPE_ADST;
+ break;
+ case V_FLIPADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_IDTX;
+ break;
+ default:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_DCT;
+ assert(0);
+ break;
+ }
+}
+
+double Sqrt2 = pow(2, 0.5);
+double invSqrt2 = 1 / pow(2, 0.5);
+
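+// Type-II DCT basis function: cos(pi * (2n + 1) * k / (2 * size)).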
+double dct_matrix(double n, double k, int size) {
+ return cos(M_PI * (2 * n + 1) * k / (2 * size));
+}
+
+void reference_dct_1d(const double *in, double *out, int size) {
+ for (int k = 0; k < size; ++k) {
+ out[k] = 0;
+ for (int n = 0; n < size; ++n) {
+ out[k] += in[n] * dct_matrix(n, k, size);
+ }
+ if (k == 0) out[k] = out[k] * invSqrt2;
+ }
+}
+
+void reference_idct_1d(const double *in, double *out, int size) {
+ for (int k = 0; k < size; ++k) {
+ out[k] = 0;
+ for (int n = 0; n < size; ++n) {
+ if (n == 0)
+ out[k] += invSqrt2 * in[n] * dct_matrix(k, n, size);
+ else
+ out[k] += in[n] * dct_matrix(k, n, size);
+ }
+ }
+}
+
+// TODO(any): Copied from the old 'fadst4' (same as the new 'av1_fadst4_new'
+// function). Should be replaced by a proper reference function that takes
+// 'double' input & output.
+static void fadst4_new(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_4_9 * x0;
+ s2 = sinpi_2_9 * x1;
+ s3 = sinpi_1_9 * x1;
+ s4 = sinpi_3_9 * x2;
+ s5 = sinpi_4_9 * x3;
+ s6 = sinpi_2_9 * x3;
+ s7 = x0 + x1 - x3;
+
+ x0 = s0 + s2 + s5;
+ x1 = sinpi_3_9 * s7;
+ x2 = s1 - s3 + s6;
+ x3 = s4;
+
+ s0 = x0 + x3;
+ s1 = x1;
+ s2 = x2 - x3;
+ s3 = x2 - x0 + x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ output[0] = (tran_low_t)fdct_round_shift(s0);
+ output[1] = (tran_low_t)fdct_round_shift(s1);
+ output[2] = (tran_low_t)fdct_round_shift(s2);
+ output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+
+void reference_adst_1d(const double *in, double *out, int size) {
+ if (size == 4) { // Special case.
+ tran_low_t int_input[4];
+ for (int i = 0; i < 4; ++i) {
+ int_input[i] = static_cast<tran_low_t>(round(in[i]));
+ }
+ tran_low_t int_output[4];
+ fadst4_new(int_input, int_output);
+ for (int i = 0; i < 4; ++i) {
+ out[i] = int_output[i];
+ }
+ return;
+ }
+
+ for (int k = 0; k < size; ++k) {
+ out[k] = 0;
+ for (int n = 0; n < size; ++n) {
+ out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
+ }
+ }
+}
+
+void reference_idtx_1d(const double *in, double *out, int size) {
+ double scale = 0;
+ if (size == 4)
+ scale = Sqrt2;
+ else if (size == 8)
+ scale = 2;
+ else if (size == 16)
+ scale = 2 * Sqrt2;
+ else if (size == 32)
+ scale = 4;
+ else if (size == 64)
+ scale = 4 * Sqrt2;
+ for (int k = 0; k < size; ++k) {
+ out[k] = in[k] * scale;
+ }
+}
+
+void reference_hybrid_1d(double *in, double *out, int size, int type) {
+ if (type == TYPE_DCT)
+ reference_dct_1d(in, out, size);
+ else if (type == TYPE_ADST)
+ reference_adst_1d(in, out, size);
+ else
+ reference_idtx_1d(in, out, size);
+}
+
+double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size) {
+ TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg;
+ av1_get_fwd_txfm_cfg(tx_type, tx_size, &fwd_txfm_flip_cfg);
+ const int tx_width = tx_size_wide[fwd_txfm_flip_cfg.tx_size];
+ const int tx_height = tx_size_high[fwd_txfm_flip_cfg.tx_size];
+ const int8_t *shift = fwd_txfm_flip_cfg.shift;
+ const int amplify_bit = shift[0] + shift[1] + shift[2];
+ double amplify_factor =
+ amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
+
+  // Rectangular transforms whose sides differ by a factor of 2 apply an
+  // extra sqrt(2) scaling.
+ const int rect_type = get_rect_tx_log_ratio(tx_width, tx_height);
+ if (abs(rect_type) == 1) {
+ amplify_factor *= pow(2, 0.5);
+ }
+ return amplify_factor;
+}
+
+void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
+ TX_SIZE tx_size) {
+ // Get transform type and size of each dimension.
+ TYPE_TXFM type0;
+ TYPE_TXFM type1;
+ get_txfm1d_type(tx_type, &type0, &type1);
+ const int tx_width = tx_size_wide[tx_size];
+ const int tx_height = tx_size_high[tx_size];
+
+ double *const temp_in = new double[AOMMAX(tx_width, tx_height)];
+ double *const temp_out = new double[AOMMAX(tx_width, tx_height)];
+ double *const out_interm = new double[tx_width * tx_height];
+ const int stride = tx_width;
+
+ // Transform columns.
+ for (int c = 0; c < tx_width; ++c) {
+ for (int r = 0; r < tx_height; ++r) {
+ temp_in[r] = in[r * stride + c];
+ }
+ reference_hybrid_1d(temp_in, temp_out, tx_height, type0);
+ for (int r = 0; r < tx_height; ++r) {
+ out_interm[r * stride + c] = temp_out[r];
+ }
+ }
+
+ // Transform rows.
+ for (int r = 0; r < tx_height; ++r) {
+ reference_hybrid_1d(out_interm + r * stride, out + r * stride, tx_width,
+ type1);
+ }
+
+ delete[] temp_in;
+ delete[] temp_out;
+ delete[] out_interm;
+
+  // These sizes use an approximate 2-D DCT: coefficients beyond the first 32
+  // rows/columns are dropped and the rest are repacked into the leading
+  // indices.
+ // TODO(urvang): Refactor this code.
+ if (tx_width == 64 && tx_height == 64) { // tx_size == TX_64X64
+ // Zero out top-right 32x32 area.
+ for (int row = 0; row < 32; ++row) {
+ memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+ }
+ // Zero out the bottom 64x32 area.
+ memset(out + 32 * 64, 0, 32 * 64 * sizeof(*out));
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+ }
+ } else if (tx_width == 32 && tx_height == 64) { // tx_size == TX_32X64
+ // Zero out the bottom 32x32 area.
+ memset(out + 32 * 32, 0, 32 * 32 * sizeof(*out));
+ // Note: no repacking needed here.
+ } else if (tx_width == 64 && tx_height == 32) { // tx_size == TX_64X32
+ // Zero out right 32x32 area.
+ for (int row = 0; row < 32; ++row) {
+ memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+ }
+ // Re-pack non-zero coeffs in the first 32x32 indices.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+ }
+ } else if (tx_width == 16 && tx_height == 64) { // tx_size == TX_16X64
+ // Zero out the bottom 16x32 area.
+ memset(out + 16 * 32, 0, 16 * 32 * sizeof(*out));
+ // Note: no repacking needed here.
+ } else if (tx_width == 64 && tx_height == 16) { // tx_size == TX_64X16
+ // Zero out right 32x16 area.
+ for (int row = 0; row < 16; ++row) {
+ memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+ }
+ // Re-pack non-zero coeffs in the first 32x16 indices.
+ for (int row = 1; row < 16; ++row) {
+ memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+ }
+ }
+
+ // Apply appropriate scale.
+ const double amplify_factor = get_amplification_factor(tx_type, tx_size);
+ for (int c = 0; c < tx_width; ++c) {
+ for (int r = 0; r < tx_height; ++r) {
+ out[r * stride + c] *= amplify_factor;
+ }
+ }
+}
+
+template <typename Type>
+void fliplr(Type *dest, int width, int height, int stride) {
+ for (int r = 0; r < height; ++r) {
+ for (int c = 0; c < width / 2; ++c) {
+ const Type tmp = dest[r * stride + c];
+ dest[r * stride + c] = dest[r * stride + width - 1 - c];
+ dest[r * stride + width - 1 - c] = tmp;
+ }
+ }
+}
+
+template <typename Type>
+void flipud(Type *dest, int width, int height, int stride) {
+ for (int c = 0; c < width; ++c) {
+ for (int r = 0; r < height / 2; ++r) {
+ const Type tmp = dest[r * stride + c];
+ dest[r * stride + c] = dest[(height - 1 - r) * stride + c];
+ dest[(height - 1 - r) * stride + c] = tmp;
+ }
+ }
+}
+
+template <typename Type>
+void fliplrud(Type *dest, int width, int height, int stride) {
+ for (int r = 0; r < height / 2; ++r) {
+ for (int c = 0; c < width; ++c) {
+ const Type tmp = dest[r * stride + c];
+ dest[r * stride + c] = dest[(height - 1 - r) * stride + width - 1 - c];
+ dest[(height - 1 - r) * stride + width - 1 - c] = tmp;
+ }
+ }
+}
+
+template void fliplr<double>(double *dest, int width, int height, int stride);
+template void flipud<double>(double *dest, int width, int height, int stride);
+template void fliplrud<double>(double *dest, int width, int height, int stride);
+
+int bd_arr[BD_NUM] = { 8, 10, 12 };
+
+int8_t low_range_arr[BD_NUM] = { 18, 32, 32 };
+int8_t high_range_arr[BD_NUM] = { 32, 32, 32 };
+
+void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
+ int8_t cos_bit, int low_range, int high_range) {
+ for (int i = 0; i < stage_num; ++i) {
+ EXPECT_LE(stage_range[i], low_range);
+ ASSERT_LE(stage_range[i] + cos_bit, high_range) << "stage = " << i;
+ }
+ for (int i = 0; i < stage_num - 1; ++i) {
+ // make sure there is no overflow while doing half_btf()
+ ASSERT_LE(stage_range[i + 1] + cos_bit, high_range) << "stage = " << i;
+ }
+}
+} // namespace libaom_test
diff --git a/third_party/aom/test/av1_txfm_test.h b/third_party/aom/test/av1_txfm_test.h
new file mode 100644
index 000000000..a18164741
--- /dev/null
+++ b/third_party/aom/test/av1_txfm_test.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_AV1_TXFM_TEST_H_
+#define AOM_TEST_AV1_TXFM_TEST_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+namespace libaom_test {
+typedef enum {
+ TYPE_DCT = 0,
+ TYPE_ADST,
+ TYPE_IDTX,
+ TYPE_IDCT,
+ TYPE_IADST,
+ TYPE_LAST
+} TYPE_TXFM;
+
+int get_txfm1d_size(TX_SIZE tx_size);
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1);
+
+void reference_dct_1d(const double *in, double *out, int size);
+void reference_idct_1d(const double *in, double *out, int size);
+
+void reference_adst_1d(const double *in, double *out, int size);
+
+void reference_hybrid_1d(double *in, double *out, int size, int type);
+
+double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size);
+
+void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
+ TX_SIZE tx_size);
+template <typename Type1, typename Type2>
+static double compute_avg_abs_error(const Type1 *a, const Type2 *b,
+ const int size) {
+ double error = 0;
+ for (int i = 0; i < size; i++) {
+ error += fabs(static_cast<double>(a[i]) - static_cast<double>(b[i]));
+ }
+ error = error / size;
+ return error;
+}
+
+template <typename Type>
+void fliplr(Type *dest, int width, int height, int stride);
+
+template <typename Type>
+void flipud(Type *dest, int width, int height, int stride);
+
+template <typename Type>
+void fliplrud(Type *dest, int width, int height, int stride);
+
+typedef void (*TxfmFunc)(const int32_t *in, int32_t *out, const int8_t cos_bit,
+ const int8_t *range_bit);
+
+typedef void (*InvTxfm2dFunc)(const int32_t *, uint16_t *, int, TX_TYPE, int);
+typedef void (*LbdInvTxfm2dFunc)(const int32_t *, uint8_t *, int, TX_TYPE,
+ TX_SIZE, int);
+
+static const int bd = 10;
+static const int input_base = (1 << bd);
+
+static INLINE bool IsTxSizeTypeValid(TX_SIZE tx_size, TX_TYPE tx_type) {
+ const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+ TxSetType tx_set_type;
+ if (tx_size_sqr_up > TX_32X32) {
+ tx_set_type = EXT_TX_SET_DCTONLY;
+ } else if (tx_size_sqr_up == TX_32X32) {
+ tx_set_type = EXT_TX_SET_DCT_IDTX;
+ } else {
+ tx_set_type = EXT_TX_SET_ALL16;
+ }
+ return av1_ext_tx_used[tx_set_type][tx_type] != 0;
+}
+
+#if CONFIG_AV1_ENCODER
+
+static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
+ av1_fwd_txfm2d_4x4_c, av1_fwd_txfm2d_8x8_c, av1_fwd_txfm2d_16x16_c,
+ av1_fwd_txfm2d_32x32_c, av1_fwd_txfm2d_64x64_c, av1_fwd_txfm2d_4x8_c,
+ av1_fwd_txfm2d_8x4_c, av1_fwd_txfm2d_8x16_c, av1_fwd_txfm2d_16x8_c,
+ av1_fwd_txfm2d_16x32_c, av1_fwd_txfm2d_32x16_c, av1_fwd_txfm2d_32x64_c,
+ av1_fwd_txfm2d_64x32_c, av1_fwd_txfm2d_4x16_c, av1_fwd_txfm2d_16x4_c,
+ av1_fwd_txfm2d_8x32_c, av1_fwd_txfm2d_32x8_c, av1_fwd_txfm2d_16x64_c,
+ av1_fwd_txfm2d_64x16_c,
+};
+#endif
+
+static const InvTxfm2dFunc inv_txfm_func_ls[TX_SIZES_ALL] = {
+ av1_inv_txfm2d_add_4x4_c, av1_inv_txfm2d_add_8x8_c,
+ av1_inv_txfm2d_add_16x16_c, av1_inv_txfm2d_add_32x32_c,
+ av1_inv_txfm2d_add_64x64_c, av1_inv_txfm2d_add_4x8_c,
+ av1_inv_txfm2d_add_8x4_c, av1_inv_txfm2d_add_8x16_c,
+ av1_inv_txfm2d_add_16x8_c, av1_inv_txfm2d_add_16x32_c,
+ av1_inv_txfm2d_add_32x16_c, av1_inv_txfm2d_add_32x64_c,
+ av1_inv_txfm2d_add_64x32_c, av1_inv_txfm2d_add_4x16_c,
+ av1_inv_txfm2d_add_16x4_c, av1_inv_txfm2d_add_8x32_c,
+ av1_inv_txfm2d_add_32x8_c, av1_inv_txfm2d_add_16x64_c,
+ av1_inv_txfm2d_add_64x16_c,
+};
+
+#define BD_NUM 3
+
+extern int bd_arr[];
+extern int8_t low_range_arr[];
+extern int8_t high_range_arr[];
+
+void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
+ const int8_t cos_bit, int low_range,
+ int high_range);
+} // namespace libaom_test
+#endif // AOM_TEST_AV1_TXFM_TEST_H_
diff --git a/third_party/aom/test/av1_wedge_utils_test.cc b/third_party/aom/test/av1_wedge_utils_test.cc
new file mode 100644
index 000000000..e8fbe69a4
--- /dev/null
+++ b/third_party/aom/test/av1_wedge_utils_test.cc
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/enums.h"
+
+#include "test/acm_random.h"
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#define WEDGE_WEIGHT_BITS 6
+#define MAX_MASK_VALUE (1 << (WEDGE_WEIGHT_BITS))
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int16_t kInt13Max = (1 << 12) - 1;
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sse_from_residuals - functionality
+//////////////////////////////////////////////////////////////////////////////
+
+class WedgeUtilsSSEFuncTest : public testing::Test {
+ protected:
+ WedgeUtilsSSEFuncTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+ static const int kIterations = 1000;
+
+ ACMRandom rng_;
+};
+
+static void equiv_blend_residuals(int16_t *r, const int16_t *r0,
+ const int16_t *r1, const uint8_t *m, int N) {
+ for (int i = 0; i < N; i++) {
+ const int32_t m0 = m[i];
+ const int32_t m1 = MAX_MASK_VALUE - m0;
+ const int16_t R = m0 * r0[i] + m1 * r1[i];
+ // Note that this rounding is designed to match the result
+ // you would get when actually blending the 2 predictors and computing
+ // the residuals.
+ r[i] = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
+ }
+}
+
+static uint64_t equiv_sse_from_residuals(const int16_t *r0, const int16_t *r1,
+ const uint8_t *m, int N) {
+ uint64_t acc = 0;
+ for (int i = 0; i < N; i++) {
+ const int32_t m0 = m[i];
+ const int32_t m1 = MAX_MASK_VALUE - m0;
+ const int16_t R = m0 * r0[i] + m1 * r1[i];
+ const int32_t r = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
+ acc += r * r;
+ }
+ return acc;
+}
+
+TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) {
+ DECLARE_ALIGNED(32, uint8_t, s[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p[MAX_SB_SQUARE]);
+
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r_ref[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r_tst[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ s[i] = rng_.Rand8();
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ const int w = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
+ const int h = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
+ const int N = w * h;
+
+ for (int j = 0; j < N; j++) {
+ p0[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
+ p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
+ }
+
+ aom_blend_a64_mask(p, w, p0, w, p1, w, m, w, w, h, 0, 0);
+
+ aom_subtract_block(h, w, r0, w, s, w, p0, w);
+ aom_subtract_block(h, w, r1, w, s, w, p1, w);
+
+ aom_subtract_block(h, w, r_ref, w, s, w, p, w);
+ equiv_blend_residuals(r_tst, r0, r1, m, N);
+
+ for (int i = 0; i < N; ++i) ASSERT_EQ(r_ref[i], r_tst[i]);
+
+ uint64_t ref_sse = aom_sum_squares_i16(r_ref, N);
+ uint64_t tst_sse = equiv_sse_from_residuals(r0, r1, m, N);
+
+ ASSERT_EQ(ref_sse, tst_sse);
+ }
+}
+
+static uint64_t sse_from_residuals(const int16_t *r0, const int16_t *r1,
+ const uint8_t *m, int N) {
+ uint64_t acc = 0;
+ for (int i = 0; i < N; i++) {
+ const int32_t m0 = m[i];
+ const int32_t m1 = MAX_MASK_VALUE - m0;
+ const int32_t r = m0 * r0[i] + m1 * r1[i];
+ acc += r * r;
+ }
+ return ROUND_POWER_OF_TWO(acc, 2 * WEDGE_WEIGHT_BITS);
+}
+
+TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingMethod) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r1[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN;
+ d[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ for (int i = 0; i < N; i++) r0[i] = r1[i] + d[i];
+
+ const uint64_t ref_res = sse_from_residuals(r0, r1, m, N);
+ const uint64_t tst_res = av1_wedge_sse_from_residuals(r1, d, m, N);
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sse_from_residuals - optimizations
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*FSSE)(const int16_t *r1, const int16_t *d, const uint8_t *m,
+ int N);
+typedef libaom_test::FuncParam<FSSE> TestFuncsFSSE;
+
+class WedgeUtilsSSEOptTest : public FunctionEquivalenceTest<FSSE> {
+ protected:
+ static const int kIterations = 10000;
+};
+
+TEST_P(WedgeUtilsSSEOptTest, RandomValues) {
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ d[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ const uint64_t ref_res = params_.ref_func(r1, d, m, N);
+ uint64_t tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ if (rng_(2)) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = -kInt13Max;
+ }
+
+ if (rng_(2)) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = -kInt13Max;
+ }
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE;
+
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ const uint64_t ref_res = params_.ref_func(r1, d, m, N);
+ uint64_t tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sign_from_residuals
+//////////////////////////////////////////////////////////////////////////////
+
+typedef int (*FSign)(const int16_t *ds, const uint8_t *m, int N, int64_t limit);
+typedef libaom_test::FuncParam<FSign> TestFuncsFSign;
+
+class WedgeUtilsSignOptTest : public FunctionEquivalenceTest<FSign> {
+ protected:
+ static const int kIterations = 10000;
+ static const int kMaxSize = 8196; // Size limited by SIMD implementation.
+};
+
+TEST_P(WedgeUtilsSignOptTest, RandomValues) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE);
+ const int N = 64 * (rng_(maxN / 64 - 1) + 1);
+
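+    // Threshold passed as 'limit': (SSE(r0) - SSE(r1)) scaled by half the
+    // maximum mask value.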
+ int64_t limit;
+ limit = (int64_t)aom_sum_squares_i16(r0, N);
+ limit -= (int64_t)aom_sum_squares_i16(r1, N);
+ limit *= (1 << WEDGE_WEIGHT_BITS) / 2;
+
+ for (int i = 0; i < N; i++)
+ ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX);
+
+ const int ref_res = params_.ref_func(ds, m, N, limit);
+ int tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(WedgeUtilsSignOptTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(4)) {
+ case 0:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = 0;
+ r1[i] = kInt13Max;
+ }
+ break;
+ case 1:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = kInt13Max;
+ r1[i] = 0;
+ }
+ break;
+ case 2:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = 0;
+ r1[i] = -kInt13Max;
+ }
+ break;
+ default:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = -kInt13Max;
+ r1[i] = 0;
+ }
+ break;
+ }
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE;
+
+ const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE);
+ const int N = 64 * (rng_(maxN / 64 - 1) + 1);
+
+ int64_t limit;
+ limit = (int64_t)aom_sum_squares_i16(r0, N);
+ limit -= (int64_t)aom_sum_squares_i16(r1, N);
+ limit *= (1 << WEDGE_WEIGHT_BITS) / 2;
+
+ for (int i = 0; i < N; i++)
+ ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX);
+
+ const int ref_res = params_.ref_func(ds, m, N, limit);
+ int tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_compute_delta_squares
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FDS)(int16_t *d, const int16_t *a, const int16_t *b, int N);
+typedef libaom_test::FuncParam<FDS> TestFuncsFDS;
+
+class WedgeUtilsDeltaSquaresOptTest : public FunctionEquivalenceTest<FDS> {
+ protected:
+ static const int kIterations = 10000;
+};
+
+TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
+ DECLARE_ALIGNED(32, int16_t, a[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, b[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d_ref[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d_tst[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ a[i] = rng_.Rand16();
+ b[i] = rng_(2 * INT16_MAX + 1) - INT16_MAX;
+ }
+
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ memset(&d_ref, INT16_MAX, sizeof(d_ref));
+ memset(&d_tst, INT16_MAX, sizeof(d_tst));
+
+ params_.ref_func(d_ref, a, b, N);
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N));
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) ASSERT_EQ(d_ref[i], d_tst[i]);
+ }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, WedgeUtilsSSEOptTest,
+ ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
+ av1_wedge_sse_from_residuals_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, WedgeUtilsSignOptTest,
+ ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
+ av1_wedge_sign_from_residuals_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, WedgeUtilsDeltaSquaresOptTest,
+ ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c,
+ av1_wedge_compute_delta_squares_sse2)));
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, WedgeUtilsSSEOptTest,
+ ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_sse2,
+ av1_wedge_sse_from_residuals_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, WedgeUtilsSignOptTest,
+ ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_sse2,
+ av1_wedge_sign_from_residuals_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, WedgeUtilsDeltaSquaresOptTest,
+ ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_sse2,
+ av1_wedge_compute_delta_squares_avx2)));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/best_encode.sh b/third_party/aom/test/best_encode.sh
new file mode 100755
index 000000000..fe31a01cb
--- /dev/null
+++ b/third_party/aom/test/best_encode.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+# Author: jimbankoski@google.com (Jim Bankoski)
+
+if [[ $# -ne 2 ]]; then
+ echo "Encodes a file using best known settings (slow!)"
+ echo " Usage: be [FILE] [BITRATE]"
+ echo " Example: be akiyo_cif.y4m 200"
+ exit
+fi
+
+f=$1 # file is first parameter
+b=$2 # bitrate is second parameter
+
+if [[ -e $f.fpf ]]; then
+ # First-pass file found, do second pass only
+ aomenc \
+ $f \
+ -o $f-$b.av1.webm \
+ -p 2 \
+ --pass=2 \
+ --fpf=$f.fpf \
+ --best \
+ --cpu-used=0 \
+ --target-bitrate=$b \
+ --auto-alt-ref=1 \
+ -v \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --lag-in-frames=25 \
+ --kf-min-dist=0 \
+ --kf-max-dist=99999 \
+ --static-thresh=0 \
+ --min-q=0 \
+ --max-q=63 \
+ --drop-frame=0 \
+ --bias-pct=50 \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --psnr \
+ --arnr-maxframes=7 \
+ --arnr-strength=3 \
+ --arnr-type=3
+else
+ # No first-pass file found, do 2-pass encode
+ aomenc \
+ $f \
+ -o $f-$b.av1.webm \
+ -p 2 \
+ --pass=1 \
+ --fpf=$f.fpf \
+ --best \
+ --cpu-used=0 \
+ --target-bitrate=$b \
+ --auto-alt-ref=1 \
+ -v \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --lag-in-frames=25 \
+ --kf-min-dist=0 \
+ --kf-max-dist=99999 \
+ --static-thresh=0 \
+ --min-q=0 \
+ --max-q=63 \
+ --drop-frame=0
+
+ aomenc \
+ $f \
+ -o $f-$b.av1.webm \
+ -p 2 \
+ --pass=2 \
+ --fpf=$f.fpf \
+ --best \
+ --cpu-used=0 \
+ --target-bitrate=$b \
+ --auto-alt-ref=1 \
+ -v \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --lag-in-frames=25 \
+ --kf-min-dist=0 \
+ --kf-max-dist=99999 \
+ --static-thresh=0 \
+ --min-q=0 \
+ --max-q=63 \
+ --drop-frame=0 \
+ --bias-pct=50 \
+ --minsection-pct=0 \
+ --maxsection-pct=800 \
+ --psnr \
+ --arnr-maxframes=7 \
+ --arnr-strength=3 \
+ --arnr-type=3
+fi
diff --git a/third_party/aom/test/binary_codes_test.cc b/third_party/aom/test/binary_codes_test.cc
new file mode 100644
index 000000000..45660cf85
--- /dev/null
+++ b/third_party/aom/test/binary_codes_test.cc
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/binary_codes_reader.h"
+#include "aom_dsp/binary_codes_writer.h"
+
+#define ACCT_STR __func__
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+// Test for Finite subexponential code with reference
+TEST(AV1, TestPrimitiveRefsubexpfin) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int kBufferSize = 65536;
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ const uint16_t kRanges = 8;
+ const uint16_t kSubexpParams = 6;
+ const uint16_t kReferences = 8;
+ const uint16_t kValues = 16;
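+ // Each enc_values entry records { range, subexponential parameter k,
+ // reference, value } so the decode loop below can verify the round trip.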
+ uint16_t enc_values[kRanges][kSubexpParams][kReferences][kValues][4];
+ const uint16_t range_vals[kRanges] = { 1, 13, 64, 120, 230, 420, 1100, 8000 };
+ aom_start_encode(&bw, bw_buffer);
+ for (int n = 0; n < kRanges; ++n) {
+ const uint16_t range = range_vals[n];
+ for (int k = 0; k < kSubexpParams; ++k) {
+ for (int r = 0; r < kReferences; ++r) {
+ const uint16_t ref = rnd(range);
+ for (int v = 0; v < kValues; ++v) {
+ const uint16_t value = rnd(range);
+ enc_values[n][k][r][v][0] = range;
+ enc_values[n][k][r][v][1] = k;
+ enc_values[n][k][r][v][2] = ref;
+ enc_values[n][k][r][v][3] = value;
+ aom_write_primitive_refsubexpfin(&bw, range, k, ref, value);
+ }
+ }
+ }
+ }
+ aom_stop_encode(&bw);
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
+ GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
+ for (int n = 0; n < kRanges; ++n) {
+ for (int k = 0; k < kSubexpParams; ++k) {
+ for (int r = 0; r < kReferences; ++r) {
+ for (int v = 0; v < kValues; ++v) {
+ const uint16_t range = enc_values[n][k][r][v][0];
+ assert(k == enc_values[n][k][r][v][1]);
+ const uint16_t ref = enc_values[n][k][r][v][2];
+ const uint16_t value =
+ aom_read_primitive_refsubexpfin(&br, range, k, ref, ACCT_STR);
+ GTEST_ASSERT_EQ(value, enc_values[n][k][r][v][3]);
+ }
+ }
+ }
+ }
+}
+// TODO(debargha): Add tests for other primitives
+} // namespace
diff --git a/third_party/aom/test/blend_a64_mask_1d_test.cc b/third_party/aom/test/blend_a64_mask_1d_test.cc
new file mode 100644
index 000000000..f8844eef8
--- /dev/null
+++ b/third_party/aom/test/blend_a64_mask_1d_test.cc
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/enums.h"
+
+#include "aom_dsp/blend.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+template <typename F, typename T>
+class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
+ public:
+ static const int kIterations = 10000;
+ static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides
+ static const int kMaxHeight = MAX_SB_SIZE;
+ static const int kBufSize = kMaxWidth * kMaxHeight;
+ static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+ static const int kMaxMaskSize = kMaxMaskWidth;
+
+ virtual ~BlendA64Mask1DTest() {}
+
+ virtual void Execute(const T *p_src0, const T *p_src1) = 0;
+
+ void Common() {
+ w_ = 2 << this->rng_(MAX_SB_SIZE_LOG2);
+ h_ = 2 << this->rng_(MAX_SB_SIZE_LOG2);
+
+ dst_offset_ = this->rng_(33);
+ dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+ src0_offset_ = this->rng_(33);
+ src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+ src1_offset_ = this->rng_(33);
+ src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+ T *p_src0;
+ T *p_src1;
+
+ switch (this->rng_(3)) {
+ case 0: // Separate sources
+ p_src0 = src0_;
+ p_src1 = src1_;
+ break;
+ case 1: // src0 == dst
+ p_src0 = dst_tst_;
+ src0_stride_ = dst_stride_;
+ src0_offset_ = dst_offset_;
+ p_src1 = src1_;
+ break;
+ case 2: // src1 == dst
+ p_src0 = src0_;
+ p_src1 = dst_tst_;
+ src1_stride_ = dst_stride_;
+ src1_offset_ = dst_offset_;
+ break;
+ default: FAIL();
+ }
+
+ Execute(p_src0, p_src1);
+
+ for (int r = 0; r < h_; ++r) {
+ for (int c = 0; c < w_; ++c) {
+ ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+ dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+ }
+ }
+ }
+
+ T dst_ref_[kBufSize];
+ T dst_tst_[kBufSize];
+ uint32_t dst_stride_;
+ uint32_t dst_offset_;
+
+ T src0_[kBufSize];
+ uint32_t src0_stride_;
+ uint32_t src0_offset_;
+
+ T src1_[kBufSize];
+ uint32_t src1_stride_;
+ uint32_t src1_offset_;
+
+ uint8_t mask_[kMaxMaskSize];
+
+ int w_;
+ int h_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1,
+ uint32_t src1_stride, const uint8_t *mask, int w, int h);
+typedef libaom_test::FuncParam<F8B> TestFuncs;
+
+class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
+ protected:
+ void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
+ params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
+ src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
+ w_, h_);
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(
+ dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
+ src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, w_, h_));
+ }
+};
+
+TEST_P(BlendA64Mask1DTest8B, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand8();
+ src1_[i] = rng_.Rand8();
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ Common();
+ }
+}
+
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(2) + 254;
+ dst_tst_[i] = rng_(2) + 254;
+ src0_[i] = rng_(2) + 254;
+ src1_[i] = rng_(2) + 254;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+
+ Common();
+ }
+}
+
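+ // Reference for the horizontal 1D blend: replicate the 1D mask across every
+ // row into a 2D mask and run the generic 2D C kernel.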
+static void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+ [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+ for (int row = 0; row < h; ++row)
+ for (int col = 0; col < w; ++col) mask2d[row][col] = mask[col];
+
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h,
+ 0, 0);
+}
+
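+ // Reference for the vertical 1D blend: fill every row r of the 2D mask with
+ // mask[r] and run the generic 2D C kernel.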
+static void blend_a64_vmask_ref(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+ [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+ for (int row = 0; row < h; ++row)
+ for (int col = 0; col < w; ++col) mask2d[row][col] = mask[row];
+
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h,
+ 0, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, BlendA64Mask1DTest8B,
+ ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_c),
+ TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64Mask1DTest8B,
+ ::testing::Values(
+ TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_sse4_1),
+ TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, BlendA64Mask1DTest8B,
+ ::testing::Values(TestFuncs(blend_a64_hmask_ref,
+ aom_blend_a64_hmask_neon),
+ TestFuncs(blend_a64_vmask_ref,
+ aom_blend_a64_vmask_neon)));
+#endif // HAVE_NEON
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1,
+ uint32_t src1_stride, const uint8_t *mask, int w, int h,
+ int bd);
+typedef libaom_test::FuncParam<FHBD> TestFuncsHBD;
+
+class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
+ protected:
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+ params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, w_, h_, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(
+ CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_, w_, h_,
+ bit_depth_));
+ }
+
+ int bit_depth_;
+};
+
+TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+
+ const int hi = 1 << bit_depth_;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(hi);
+ dst_tst_[i] = rng_(hi);
+ src0_[i] = rng_(hi);
+ src1_[i] = rng_(hi);
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ Common();
+ }
+}
+
+TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
+ for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+
+ const int hi = 1 << bit_depth_;
+ const int lo = hi - 2;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(hi - lo) + lo;
+ dst_tst_[i] = rng_(hi - lo) + lo;
+ src0_[i] = rng_(hi - lo) + lo;
+ src1_[i] = rng_(hi - lo) + lo;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+
+ Common();
+ }
+}
+
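+ // The high bit-depth references mirror the 8-bit ones above, expanding the
+ // 1D mask to 2D and calling the generic high bit-depth C kernel.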
+static void highbd_blend_a64_hmask_ref(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+ [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+ for (int row = 0; row < h; ++row)
+ for (int col = 0; col < w; ++col) mask2d[row][col] = mask[col];
+
+ aom_highbd_blend_a64_mask_c(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0],
+ BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd);
+}
+
+static void highbd_blend_a64_vmask_ref(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+ [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+ for (int row = 0; row < h; ++row)
+ for (int col = 0; col < w; ++col) mask2d[row][col] = mask[row];
+
+ aom_highbd_blend_a64_mask_c(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0],
+ BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, BlendA64Mask1DTestHBD,
+ ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
+ aom_highbd_blend_a64_hmask_c),
+ TestFuncsHBD(highbd_blend_a64_vmask_ref,
+ aom_highbd_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64Mask1DTestHBD,
+ ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref,
+ aom_highbd_blend_a64_hmask_sse4_1),
+ TestFuncsHBD(highbd_blend_a64_vmask_ref,
+ aom_highbd_blend_a64_vmask_sse4_1)));
+#endif // HAVE_SSE4_1
+} // namespace
diff --git a/third_party/aom/test/blend_a64_mask_test.cc b/third_party/aom/test/blend_a64_mask_test.cc
new file mode 100644
index 000000000..66ca6fc5f
--- /dev/null
+++ b/third_party/aom/test/blend_a64_mask_test.cc
@@ -0,0 +1,583 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/enums.h"
+
+#include "aom_dsp/blend.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+template <typename BlendA64Func, typename SrcPixel, typename DstPixel>
+class BlendA64MaskTest : public FunctionEquivalenceTest<BlendA64Func> {
+ protected:
+ static const int kIterations = 10000;
+ static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides
+ static const int kMaxHeight = MAX_SB_SIZE;
+ static const int kBufSize = kMaxWidth * kMaxHeight;
+ static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+ static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
+
+ virtual ~BlendA64MaskTest() {}
+
+ virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1,
+ int run_times) = 0;
+
+ template <typename Pixel>
+ void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/, int run_times) {
+ if (run_times > 1) {
+ *src0 = src0_;
+ *src1 = src1_;
+ return;
+ }
+ switch (this->rng_(3)) {
+ case 0: // Separate sources
+ *src0 = src0_;
+ *src1 = src1_;
+ break;
+ case 1: // src0 == dst
+ *src0 = dst_tst_;
+ src0_stride_ = dst_stride_;
+ src0_offset_ = dst_offset_;
+ *src1 = src1_;
+ break;
+ case 2: // src1 == dst
+ *src0 = src0_;
+ *src1 = dst_tst_;
+ src1_stride_ = dst_stride_;
+ src1_offset_ = dst_offset_;
+ break;
+ default: FAIL();
+ }
+ }
+
+ void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/,
+ int /*run_times*/) {
+ *src0 = src0_;
+ *src1 = src1_;
+ }
+
+ uint8_t Rand1() { return this->rng_.Rand8() & 1; }
+
+ void RunOneTest(int block_size, int subx, int suby, int run_times) {
+ w_ = block_size_wide[block_size];
+ h_ = block_size_high[block_size];
+ run_times = run_times > 1 ? run_times / w_ : 1;
+ subx_ = subx;
+ suby_ = suby;
+
+ dst_offset_ = this->rng_(33);
+ dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+ src0_offset_ = this->rng_(33);
+ src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+ src1_offset_ = this->rng_(33);
+ src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
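+ // With horizontal subsampling (subx_) the mask spans 2 * w_ columns, so the
+ // random stride must be at least that wide.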
+ mask_stride_ =
+ this->rng_(kMaxWidth + 1 - w_ * (subx_ ? 2 : 1)) + w_ * (subx_ ? 2 : 1);
+
+ SrcPixel *p_src0;
+ SrcPixel *p_src1;
+
+ p_src0 = src0_;
+ p_src1 = src1_;
+
+ GetSources(&p_src0, &p_src1, &dst_ref_[0], run_times);
+
+ Execute(p_src0, p_src1, run_times);
+
+ for (int r = 0; r < h_; ++r) {
+ for (int c = 0; c < w_; ++c) {
+ ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+ dst_tst_[dst_offset_ + r * dst_stride_ + c])
+ << w_ << "x" << h_ << " subx " << subx_ << " suby " << suby_
+ << " r: " << r << " c: " << c;
+ }
+ }
+ }
+
+ void RunTest(int block_size, int run_times) {
+ subx_ = Rand1();
+ suby_ = Rand1();
+ RunOneTest(block_size, subx_, suby_, run_times);
+ }
+
+ DstPixel dst_ref_[kBufSize];
+ DstPixel dst_tst_[kBufSize];
+ uint32_t dst_stride_;
+ uint32_t dst_offset_;
+
+ SrcPixel src0_[kBufSize];
+ uint32_t src0_stride_;
+ uint32_t src0_offset_;
+
+ SrcPixel src1_[kBufSize];
+ uint32_t src1_stride_;
+ uint32_t src1_offset_;
+
+ uint8_t mask_[kMaxMaskSize];
+ size_t mask_stride_;
+
+ int w_;
+ int h_;
+
+ int suby_;
+ int subx_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1,
+ uint32_t src1_stride, const uint8_t *mask,
+ uint32_t mask_stride, int w, int h, int subx, int suby);
+typedef libaom_test::FuncParam<F8B> TestFuncs;
+
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t, uint8_t> {
+ protected:
+ void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 1) {
+ printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fus", w_, h_, subx_, suby_,
+ time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ }
+};
+
+TEST_P(BlendA64MaskTest8B, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand8();
+ src1_[i] = rng_.Rand8();
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ RunTest(bsize, 1);
+ }
+}
+
+TEST_P(BlendA64MaskTest8B, ExtremeValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(2) + 254;
+ dst_tst_[i] = rng_(2) + 254;
+ src0_[i] = rng_(2) + 254;
+ src1_[i] = rng_(2) + 254;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+
+ RunTest(bsize, 1);
+ }
+}
+TEST_P(BlendA64MaskTest8B, DISABLED_Speed) {
+ const int kRunTimes = 10000000;
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand8();
+ src1_[i] = rng_.Rand8();
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ RunOneTest(bsize, 1, 1, kRunTimes);
+ RunOneTest(bsize, 1, 0, kRunTimes);
+ RunOneTest(bsize, 0, 1, kRunTimes);
+ RunOneTest(bsize, 0, 0, kRunTimes);
+ }
+}
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, BlendA64MaskTest8B,
+ ::testing::Values(TestFuncs(
+ aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, BlendA64MaskTest8B,
+ ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1,
+ aom_blend_a64_mask_avx2)));
+#endif // HAVE_AVX2
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit _d16 version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B_D16)(uint8_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1,
+ uint32_t src1_stride, const uint8_t *mask,
+ uint32_t mask_stride, int w, int h, int subx, int suby,
+ ConvolveParams *conv_params);
+typedef libaom_test::FuncParam<F8B_D16> TestFuncs_d16;
+
+class BlendA64MaskTest8B_d16
+ : public BlendA64MaskTest<F8B_D16, uint16_t, uint8_t> {
+ protected:
+ // Mask limiting source samples to their maximum bit width (14 bits).
+ static const int kSrcMaxBitsMask = 0x3fff;
+
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+ ConvolveParams conv_params;
+ conv_params.round_0 = ROUND0_BITS;
+ conv_params.round_1 = COMPOUND_ROUND1_BITS;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 1) {
+ printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fus", w_, h_, subx_, suby_,
+ time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ }
+};
+
+TEST_P(BlendA64MaskTest8B_d16, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand16() & kSrcMaxBitsMask;
+ src1_[i] = rng_.Rand16() & kSrcMaxBitsMask;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ RunTest(bsize, 1);
+ }
+}
+
+TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = 255;
+ dst_tst_[i] = 255;
+
+ src0_[i] = kSrcMaxBitsMask;
+ src1_[i] = kSrcMaxBitsMask;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1;
+
+ RunTest(bsize, 1);
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64MaskTest8B_d16,
+ ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+ aom_lowbd_blend_a64_d16_mask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, BlendA64MaskTest8B_d16,
+ ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+ aom_lowbd_blend_a64_d16_mask_avx2)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, BlendA64MaskTest8B_d16,
+ ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c,
+ aom_lowbd_blend_a64_d16_mask_neon)));
+#endif // HAVE_NEON
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1,
+ uint32_t src1_stride, const uint8_t *mask,
+ uint32_t mask_stride, int w, int h, int subx, int suby,
+ int bd);
+typedef libaom_test::FuncParam<FHBD> TestFuncsHBD;
+
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t, uint16_t> {
+ protected:
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 1) {
+ printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fus", w_, h_, subx_, suby_,
+ time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ }
+
+ int bit_depth_;
+};
+
+TEST_P(BlendA64MaskTestHBD, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ switch (rng_(3)) {
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+
+ const int hi = 1 << bit_depth_;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(hi);
+ dst_tst_[i] = rng_(hi);
+ src0_[i] = rng_(hi);
+ src1_[i] = rng_(hi);
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ RunTest(bsize, 1);
+ }
+}
+
+TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
+ for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ switch (rng_(3)) {
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+
+ const int hi = 1 << bit_depth_;
+ const int lo = hi - 2;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_(hi - lo) + lo;
+ dst_tst_[i] = rng_(hi - lo) + lo;
+ src0_[i] = rng_(hi - lo) + lo;
+ src1_[i] = rng_(hi - lo) + lo;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1;
+
+ RunTest(bsize, 1);
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64MaskTestHBD,
+ ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
+ aom_highbd_blend_a64_mask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// HBD _d16 version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD_D16)(uint8_t *dst, uint32_t dst_stride,
+ const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+ const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subx, int suby, ConvolveParams *conv_params,
+ const int bd);
+typedef libaom_test::FuncParam<FHBD_D16> TestFuncsHBD_d16;
+
+class BlendA64MaskTestHBD_d16
+ : public BlendA64MaskTest<FHBD_D16, uint16_t, uint16_t> {
+ protected:
+ // Masks limiting source samples to their maximum bit width (14 bits for
+ // 8-bit input, 16 bits for high bit depth).
+ static const int kSrcMaxBitsMask = (1 << 14) - 1;
+ static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1;
+
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) {
+ ConvolveParams conv_params;
+ conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS;
+ conv_params.round_1 = COMPOUND_ROUND1_BITS;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params,
+ bit_depth_);
+ }
+ if (params_.tst_func) {
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_),
+ dst_stride_, p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_, mask_,
+ kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params,
+ bit_depth_);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 1) {
+ printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fus", w_, h_, subx_, suby_,
+ time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ }
+ }
+
+ int bit_depth_;
+ int src_max_bits_mask_;
+};
+
+TEST_P(BlendA64MaskTestHBD_d16, RandomValues) {
+ if (params_.tst_func == NULL) return;
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ int bsize = rng_.Rand8() % BLOCK_SIZES_ALL;
+ switch (rng_(3)) {
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+ src_max_bits_mask_ =
+ (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand16() & src_max_bits_mask_;
+ src1_[i] = rng_.Rand16() & src_max_bits_mask_;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i)
+ mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1);
+
+ RunTest(bsize, 1);
+ }
+}
+// TODO(Scott LaVarnway): Fix this test.
+TEST_P(BlendA64MaskTestHBD_d16, DISABLED_SaturatedValues) {
+ for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) {
+ src_max_bits_mask_ =
+ (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD;
+
+ for (int i = 0; i < kBufSize; ++i) {
+ dst_ref_[i] = 0;
+ dst_tst_[i] = (1 << bit_depth_) - 1;
+
+ src0_[i] = src_max_bits_mask_;
+ src1_[i] = src_max_bits_mask_;
+ }
+
+ for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+ RunTest(bsize, 1);
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, BlendA64MaskTestHBD_d16,
+ ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, NULL)));
+
+// TODO(slavarnway): Enable the following in the avx2 commit. (56501)
+#if 0
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, BlendA64MaskTestHBD,
+ ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c,
+ aom_highbd_blend_a64_mask_avx2)));
+#endif // HAVE_AVX2
+#endif
+} // namespace
diff --git a/third_party/aom/test/blockd_test.cc b/third_party/aom/test/blockd_test.cc
new file mode 100644
index 000000000..ab624007c
--- /dev/null
+++ b/third_party/aom/test/blockd_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/blockd.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+// Verify the optimized implementation of get_partition_subsize() produces the
+// same results as the Partition_Subsize lookup table in the spec.
+TEST(BlockdTest, GetPartitionSubsize) {
+ // The Partition_Subsize table in the spec (Section 9.3. Conversion tables).
+ /* clang-format off */
+ static const BLOCK_SIZE kPartitionSubsize[10][BLOCK_SIZES_ALL] = {
+ {
+ BLOCK_4X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X8,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }, {
+ BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X32,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X64,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID
+ }
+ };
+ /* clang-format on */
+
+ for (int partition = 0; partition < 10; partition++) {
+ for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+ EXPECT_EQ(kPartitionSubsize[partition][bsize],
+ get_partition_subsize(static_cast<BLOCK_SIZE>(bsize),
+ static_cast<PARTITION_TYPE>(partition)));
+ }
+ }
+}
diff --git a/third_party/aom/test/boolcoder_test.cc b/third_party/aom/test/boolcoder_test.cc
new file mode 100644
index 000000000..680ec1877
--- /dev/null
+++ b/third_party/aom/test/boolcoder_test.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int num_tests = 10;
+} // namespace
+
+TEST(AV1, TestBitIO) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int n = 0; n < num_tests; ++n) {
+ for (int method = 0; method <= 7; ++method) { // we generate various probabilities
+ const int kBitsToTest = 1000;
+ uint8_t probas[kBitsToTest];
+
+ for (int i = 0; i < kBitsToTest; ++i) {
+ const int parity = i & 1;
+ /* clang-format off */
+ probas[i] =
+ (method == 0) ? 0 : (method == 1) ? 255 :
+ (method == 2) ? 128 :
+ (method == 3) ? rnd.Rand8() :
+ (method == 4) ? (parity ? 0 : 255) :
+ // alternate between low and high probabilities:
+ (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) :
+ (method == 6) ?
+ (parity ? rnd(64) : 255 - rnd(64)) :
+ (parity ? rnd(32) : 255 - rnd(32));
+ /* clang-format on */
+ }
+ for (int bit_method = 0; bit_method <= 3; ++bit_method) {
+ const int random_seed = 6432;
+ const int kBufferSize = 10000;
+ ACMRandom bit_rnd(random_seed);
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ aom_start_encode(&bw, bw_buffer);
+
+ int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
+ for (int i = 0; i < kBitsToTest; ++i) {
+ if (bit_method == 2) {
+ bit = (i & 1);
+ } else if (bit_method == 3) {
+ bit = bit_rnd(2);
+ }
+ aom_write(&bw, bit, static_cast<int>(probas[i]));
+ }
+
+ aom_stop_encode(&bw);
+
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ bit_rnd.Reset(random_seed);
+ for (int i = 0; i < kBitsToTest; ++i) {
+ if (bit_method == 2) {
+ bit = (i & 1);
+ } else if (bit_method == 3) {
+ bit = bit_rnd(2);
+ }
+ GTEST_ASSERT_EQ(aom_read(&br, probas[i], NULL), bit)
+ << "pos: " << i << " / " << kBitsToTest
+ << " bit_method: " << bit_method << " method: " << method;
+ }
+ }
+ }
+ }
+}
+
+#define FRAC_DIFF_TOTAL_ERROR 0.18
+
+TEST(AV1, TestTell) {
+ const int kBufferSize = 10000;
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ const int kSymbols = 1024;
+ // Coders are noisier at low probabilities, so we start at p = 4.
+ for (int p = 4; p < 256; p++) {
+ double probability = p / 256.;
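+ // Each zero written below is coded with probability p/256, so it should cost
+ // about -log2(p/256) bits; tell_frac tracks the cost in 1/8-bit units.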
+ aom_start_encode(&bw, bw_buffer);
+ for (int i = 0; i < kSymbols; i++) {
+ aom_write(&bw, 0, p);
+ }
+ aom_stop_encode(&bw);
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ uint32_t last_tell = aom_reader_tell(&br);
+ uint32_t last_tell_frac = aom_reader_tell_frac(&br);
+ double frac_diff_total = 0;
+ GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
+ GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ for (int i = 0; i < kSymbols; i++) {
+ aom_read(&br, p, NULL);
+ uint32_t tell = aom_reader_tell(&br);
+ uint32_t tell_frac = aom_reader_tell_frac(&br);
+ GTEST_ASSERT_GE(tell, last_tell)
+ << "tell: " << tell << ", last_tell: " << last_tell;
+ GTEST_ASSERT_GE(tell_frac, last_tell_frac)
+ << "tell_frac: " << tell_frac
+ << ", last_tell_frac: " << last_tell_frac;
+ // Frac tell should round up to tell.
+ GTEST_ASSERT_EQ(tell, (tell_frac + 7) >> 3);
+ last_tell = tell;
+ frac_diff_total +=
+ fabs(((tell_frac - last_tell_frac) / 8.0) + log2(probability));
+ last_tell_frac = tell_frac;
+ }
+ const uint32_t expected = (uint32_t)(-kSymbols * log2(probability));
+ // Last tell should be close to the expected value.
+ GTEST_ASSERT_LE(last_tell, expected + 20) << " last_tell: " << last_tell;
+ // The average frac_diff error should be pretty small.
+ GTEST_ASSERT_LE(frac_diff_total / kSymbols, FRAC_DIFF_TOTAL_ERROR)
+ << " frac_diff_total: " << frac_diff_total;
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ }
+}
+
+TEST(AV1, TestHasOverflowed) {
+ const int kBufferSize = 10000;
+ aom_writer bw;
+ uint8_t bw_buffer[kBufferSize];
+ const int kSymbols = 1024;
+ // Coders are noisier at low probabilities, so we start at p = 4.
+ for (int p = 4; p < 256; p++) {
+ aom_start_encode(&bw, bw_buffer);
+ for (int i = 0; i < kSymbols; i++) {
+ aom_write(&bw, 1, p);
+ }
+ aom_stop_encode(&bw);
+ aom_reader br;
+ aom_reader_init(&br, bw_buffer, bw.pos);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ for (int i = 0; i < kSymbols; i++) {
+ GTEST_ASSERT_EQ(aom_read(&br, p, NULL), 1);
+ ASSERT_FALSE(aom_reader_has_overflowed(&br));
+ }
+ // In the worst case, the encoder uses just a tiny fraction of the last
+ // byte in the buffer. So to guarantee that aom_reader_has_overflowed()
+ // returns true, we have to consume very nearly 8 additional bits of data.
+ // In the worst case, one of the bits in that byte will be 1, and the rest
+ // will be zero. Once we are past that 1 bit, when the probability of
+ // reading zero symbol from aom_read() is high, each additional symbol read
+ // will consume very little additional data (in the case that p == 255,
+ // approximately -log_2(255/256) ~= 0.0056 bits). In that case it would
+ // take around 178 calls to consume more than 8 bits. That is only an upper
+ // bound. In practice we are not guaranteed to hit the worst case and can
+ // get away with 174 calls.
+ for (int i = 0; i < 174; i++) {
+ aom_read(&br, p, NULL);
+ }
+ ASSERT_TRUE(aom_reader_has_overflowed(&br));
+ }
+}
diff --git a/third_party/aom/test/borders_test.cc b/third_party/aom/test/borders_test.cc
new file mode 100644
index 000000000..893237ef3
--- /dev/null
+++ b/third_party/aom/test/borders_test.cc
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class BordersTestLarge
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ BordersTestLarge() : EncoderTest(GET_PARAM(0)) {}
+ virtual ~BordersTestLarge() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_CPUUSED, 1);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (pkt->data.frame.flags & AOM_FRAME_IS_KEY) {
+ }
+ }
+};
+
+TEST_P(BordersTestLarge, TestEncodeHighBitrate) {
+ // Validate that this clip, whose width is not a multiple of 64, encodes and
+ // decodes without a mismatch when passing in a very low max q. This pushes
+ // the encoder into producing lots of big partitions, which will likely
+ // extend into the border and test the border condition.
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.rc_max_quantizer = 10;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(BordersTestLarge, TestLowBitrate) {
+ // Validate that this clip encodes and decodes without a mismatch when
+ // passing in a very high min q. This pushes the encoder into producing lots
+ // of small partitions, which will likely test the other border condition.
+
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.rc_min_quantizer = 40;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_CASE(BordersTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood));
+} // namespace
diff --git a/third_party/aom/test/cdef_test.cc b/third_party/aom/test/cdef_test.cc
new file mode 100644
index 000000000..becc07291
--- /dev/null
+++ b/third_party/aom/test/cdef_test.cc
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/cdef_block.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+typedef ::testing::tuple<cdef_filter_block_func, cdef_filter_block_func,
+ BLOCK_SIZE, int, int>
+ cdef_dir_param_t;
+
+class CDEFBlockTest : public ::testing::TestWithParam<cdef_dir_param_t> {
+ public:
+ virtual ~CDEFBlockTest() {}
+ virtual void SetUp() {
+ cdef = GET_PARAM(0);
+ ref_cdef = GET_PARAM(1);
+ bsize = GET_PARAM(2);
+ boundary = GET_PARAM(3);
+ depth = GET_PARAM(4);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ int bsize;
+ int boundary;
+ int depth;
+ cdef_filter_block_func cdef;
+ cdef_filter_block_func ref_cdef;
+};
+
+typedef CDEFBlockTest CDEFSpeedTest;
+
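+ // Sweeps damping, input level, bit span, direction and primary/secondary
+ // strengths, comparing the SIMD kernel's output against the C reference for
+ // each combination. When cdef == ref_cdef only the reference runs, which the
+ // speed test below uses for timing.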
+void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef,
+ cdef_filter_block_func ref_cdef, int boundary, int depth) {
+ const int size = 8;
+ const int ysize = size + 2 * CDEF_VBORDER;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]);
+ DECLARE_ALIGNED(16, static uint16_t, d[size * size]);
+ DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]);
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+
+ int error = 0, pristrength = 0, secstrength, dir;
+ int pridamping, secdamping, bits, level, count,
+ errdepth = 0, errpristrength = 0, errsecstrength = 0, errboundary = 0,
+ errpridamping = 0, errsecdamping = 0;
+ unsigned int pos = 0;
+
+ const unsigned int max_pos = size * size >> static_cast<int>(depth == 8);
+ for (pridamping = 3 + depth - 8; pridamping < 7 - 3 * !!boundary + depth - 8;
+ pridamping++) {
+ for (secdamping = 3 + depth - 8;
+ secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) {
+ for (count = 0; count < iterations; count++) {
+ for (level = 0; level < (1 << depth) && !error;
+ level += (2 + 6 * !!boundary) << (depth - 8)) {
+ for (bits = 1; bits <= depth && !error; bits += 1 + 3 * !!boundary) {
+ for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+ s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << depth) - 1);
+ if (boundary) {
+ if (boundary & 1) { // Left
+ for (int i = 0; i < ysize; i++)
+ for (int j = 0; j < CDEF_HBORDER; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 2) { // Right
+ for (int i = 0; i < ysize; i++)
+ for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 4) { // Above
+ for (int i = 0; i < CDEF_VBORDER; i++)
+ for (int j = 0; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ if (boundary & 8) { // Below
+ for (int i = CDEF_VBORDER + size; i < ysize; i++)
+ for (int j = 0; j < CDEF_BSTRIDE; j++)
+ s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE;
+ }
+ }
+ for (dir = 0; dir < 8; dir++) {
+ for (pristrength = 0; pristrength <= 19 << (depth - 8) && !error;
+ pristrength += (1 + 4 * !!boundary) << (depth - 8)) {
+ if (pristrength == 16) pristrength = 19;
+ for (secstrength = 0; secstrength <= 4 << (depth - 8) && !error;
+ secstrength += 1 << (depth - 8)) {
+ if (secstrength == 3 << (depth - 8)) continue;
+ ref_cdef(depth == 8 ? (uint8_t *)ref_d : 0, ref_d, size,
+ s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+ pristrength, secstrength, dir, pridamping,
+ secdamping, bsize, (1 << depth) - 1, depth - 8);
+ // If cdef and ref_cdef are the same, we're just testing
+ // speed
+ if (cdef != ref_cdef)
+ ASM_REGISTER_STATE_CHECK(
+ cdef(depth == 8 ? (uint8_t *)d : 0, d, size,
+ s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
+ pristrength, secstrength, dir, pridamping,
+ secdamping, bsize, (1 << depth) - 1, depth - 8));
+ if (ref_cdef != cdef) {
+ for (pos = 0; pos < max_pos && !error; pos++) {
+ error = ref_d[pos] != d[pos];
+ errdepth = depth;
+ errpristrength = pristrength;
+ errsecstrength = secstrength;
+ errboundary = boundary;
+ errpridamping = pridamping;
+ errsecdamping = secdamping;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ pos--;
+ EXPECT_EQ(0, error) << "Error: CDEFBlockTest, SIMD and C mismatch."
+ << std::endl
+ << "First error at " << pos % size << "," << pos / size
+ << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos]
+ << ") " << std::endl
+ << "pristrength: " << errpristrength << std::endl
+ << "pridamping: " << errpridamping << std::endl
+ << "secstrength: " << errsecstrength << std::endl
+ << "secdamping: " << errsecdamping << std::endl
+ << "depth: " << errdepth << std::endl
+ << "size: " << bsize << std::endl
+ << "boundary: " << errboundary << std::endl
+ << std::endl;
+}
+
+void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef,
+ cdef_filter_block_func ref_cdef, int boundary, int depth) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&ref_timer);
+ test_cdef(bsize, iterations, ref_cdef, ref_cdef, boundary, depth);
+ aom_usec_timer_mark(&ref_timer);
+ int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer_start(&timer);
+ test_cdef(bsize, iterations, cdef, cdef, boundary, depth);
+ aom_usec_timer_mark(&timer);
+ int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+
+ EXPECT_GT(ref_elapsed_time, elapsed_time)
+ << "Error: CDEFSpeedTest, SIMD slower than C." << std::endl
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift);
+
+typedef ::testing::tuple<find_dir_t, find_dir_t> find_dir_param_t;
+
+class CDEFFindDirTest : public ::testing::TestWithParam<find_dir_param_t> {
+ public:
+ virtual ~CDEFFindDirTest() {}
+ virtual void SetUp() {
+ finddir = GET_PARAM(0);
+ ref_finddir = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ find_dir_t finddir;
+ find_dir_t ref_finddir;
+};
+
+typedef CDEFFindDirTest CDEFFindDirSpeedTest;
+
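+ // Fills an 8x8 block with random data at 8-, 10- and 12-bit depths and
+ // checks that the SIMD direction search returns the same direction and
+ // variance as the C reference; identical function pointers are used for
+ // timing only.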
+void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift),
+ int (*ref_finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift)) {
+ const int size = 8;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, s[size * size]);
+
+ int error = 0;
+ int depth, bits, level, count, errdepth = 0;
+ int ref_res = 0, res = 0;
+ int32_t ref_var = 0, var = 0;
+
+ for (depth = 8; depth <= 12 && !error; depth += 2) {
+ for (count = 0; count < 512 && !error; count++) {
+ for (level = 0; level < (1 << depth) && !error;
+ level += 1 << (depth - 8)) {
+ for (bits = 1; bits <= depth && !error; bits++) {
+ for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++)
+ s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0,
+ (1 << depth) - 1);
+ for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
+ ref_res = ref_finddir(s, size, &ref_var, depth - 8);
+ if (finddir != ref_finddir)
+ ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
+ if (ref_finddir != finddir) {
+ if (res != ref_res || var != ref_var) error = 1;
+ errdepth = depth;
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, error) << "Error: CDEFFindDirTest, SIMD and C mismatch."
+ << std::endl
+ << "return: " << res << " : " << ref_res << std::endl
+ << "var: " << var << " : " << ref_var << std::endl
+ << "depth: " << errdepth << std::endl
+ << std::endl;
+}
+
+void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift),
+ int (*ref_finddir)(const uint16_t *img, int stride,
+ int32_t *var, int coeff_shift)) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&ref_timer);
+ test_finddir(ref_finddir, ref_finddir);
+ aom_usec_timer_mark(&ref_timer);
+ int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer_start(&timer);
+ test_finddir(finddir, finddir);
+ aom_usec_timer_mark(&timer);
+ int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+
+ EXPECT_GT(ref_elapsed_time, elapsed_time)
+ << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+TEST_P(CDEFBlockTest, TestSIMDNoMismatch) {
+ test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth);
+}
+
+TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) {
+ test_cdef_speed(bsize, 4, cdef, ref_cdef, boundary, depth);
+}
+
+TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) {
+ test_finddir(finddir, ref_finddir);
+}
+
+TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) {
+ test_finddir_speed(finddir, ref_finddir);
+}
+
+using ::testing::make_tuple;
+
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, CDEFBlockTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_sse2),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+ &cdef_find_dir_c)));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, CDEFBlockTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, CDEFBlockTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, CDEFBlockTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_avx2),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, CDEFBlockTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_neon),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_neon,
+ &cdef_find_dir_c)));
+#endif
+
+// Test speed for all supported architectures
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, CDEFSpeedTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_sse2),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(SSE2, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse2,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, CDEFSpeedTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(SSSE3, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_ssse3,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, CDEFSpeedTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(SSE4_1, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_sse4_1,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, CDEFSpeedTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_avx2),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(AVX2, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_avx2,
+ &cdef_find_dir_c)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, CDEFSpeedTest,
+ ::testing::Combine(::testing::Values(&cdef_filter_block_neon),
+ ::testing::Values(&cdef_filter_block_c),
+ ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8),
+ ::testing::Range(0, 16), ::testing::Range(8, 13, 2)));
+INSTANTIATE_TEST_CASE_P(NEON, CDEFFindDirSpeedTest,
+ ::testing::Values(make_tuple(&cdef_find_dir_neon,
+ &cdef_find_dir_c)));
+#endif
+
+#endif  // defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)
+} // namespace
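The CDEFBlockTest/CDEFSpeedTest grids above are built with gtest's end-exclusive ::testing::Range, so Range(0, 16) enumerates the 16 boundary conditions and Range(8, 13, 2) enumerates bit depths 8, 10 and 12; ::testing::Combine then takes the cross product of those ranges with the filter pair and the block sizes. A minimal, self-contained sketch of that expansion, using the same gtest idioms but with a hypothetical test name that is not part of this commit:

    // Illustrative only: shows how Combine/Values/Range expand into a grid.
    #include "third_party/googletest/src/googletest/include/gtest/gtest.h"

    namespace {

    typedef ::testing::tuple<int, int> GridParam;  // (boundary, bit depth)

    class GridDemoTest : public ::testing::TestWithParam<GridParam> {};

    TEST_P(GridDemoTest, ParamsLieOnTheExpectedGrid) {
      const int boundary = ::testing::get<0>(GetParam());
      const int depth = ::testing::get<1>(GetParam());
      EXPECT_GE(boundary, 0);
      EXPECT_LT(boundary, 16);  // Range(0, 16) is end-exclusive.
      EXPECT_TRUE(depth == 8 || depth == 10 || depth == 12);  // Range(8, 13, 2)
    }

    // 16 boundary values x 3 bit depths = 48 instantiated tests.
    INSTANTIATE_TEST_CASE_P(Demo, GridDemoTest,
                            ::testing::Combine(::testing::Range(0, 16),
                                               ::testing::Range(8, 13, 2)));

    }  // namespace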
diff --git a/third_party/aom/test/cfl_test.cc b/third_party/aom/test/cfl_test.cc
new file mode 100644
index 000000000..e4d438d6a
--- /dev/null
+++ b/third_party/aom/test/cfl_test.cc
@@ -0,0 +1,567 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "test/util.h"
+#include "test/acm_random.h"
+
+using ::testing::make_tuple;
+
+using libaom_test::ACMRandom;
+
+#define NUM_ITERATIONS (100)
+#define NUM_ITERATIONS_SPEED (INT16_MAX)
+
+#define ALL_CFL_TX_SIZES(function) \
+ make_tuple(TX_4X4, &function), make_tuple(TX_4X8, &function), \
+ make_tuple(TX_4X16, &function), make_tuple(TX_8X4, &function), \
+ make_tuple(TX_8X8, &function), make_tuple(TX_8X16, &function), \
+ make_tuple(TX_8X32, &function), make_tuple(TX_16X4, &function), \
+ make_tuple(TX_16X8, &function), make_tuple(TX_16X16, &function), \
+ make_tuple(TX_16X32, &function), make_tuple(TX_32X8, &function), \
+ make_tuple(TX_32X16, &function), make_tuple(TX_32X32, &function)
+
+#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422, fun444) \
+ make_tuple(TX_4X4, &fun420, &fun422, &fun444), \
+ make_tuple(TX_4X8, &fun420, &fun422, &fun444), \
+ make_tuple(TX_4X16, &fun420, &fun422, &fun444), \
+ make_tuple(TX_8X4, &fun420, &fun422, &fun444), \
+ make_tuple(TX_8X8, &fun420, &fun422, &fun444), \
+ make_tuple(TX_8X16, &fun420, &fun422, &fun444), \
+ make_tuple(TX_8X32, &fun420, &fun422, &fun444), \
+ make_tuple(TX_16X4, &fun420, &fun422, &fun444), \
+ make_tuple(TX_16X8, &fun420, &fun422, &fun444), \
+ make_tuple(TX_16X16, &fun420, &fun422, &fun444), \
+ make_tuple(TX_16X32, &fun420, &fun422, &fun444), \
+ make_tuple(TX_32X8, &fun420, &fun422, &fun444), \
+ make_tuple(TX_32X16, &fun420, &fun422, &fun444), \
+ make_tuple(TX_32X32, &fun420, &fun422, &fun444)
+
+namespace {
+
+template <typename A>
+static void assert_eq(const A *a, const A *b, int width, int height) {
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ ASSERT_EQ(a[j * CFL_BUF_LINE + i], b[j * CFL_BUF_LINE + i]);
+ }
+ }
+}
+
+static void assertFaster(int ref_elapsed_time, int elapsed_time) {
+ EXPECT_GT(ref_elapsed_time, elapsed_time)
+ << "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl
+ << "C time: " << ref_elapsed_time << " us" << std::endl
+ << "SIMD time: " << elapsed_time << " us" << std::endl;
+}
+
+static void printSpeed(int ref_elapsed_time, int elapsed_time, int width,
+ int height) {
+ std::cout.precision(2);
+ std::cout << "[ ] " << width << "x" << height
+ << ": C time = " << ref_elapsed_time
+ << " us, SIMD time = " << elapsed_time << " us"
+ << " (~" << ref_elapsed_time / (double)elapsed_time << "x) "
+ << std::endl;
+}
+
+class CFLTest {
+ public:
+ virtual ~CFLTest() {}
+ void init(TX_SIZE tx) {
+ tx_size = tx;
+ width = tx_size_wide[tx_size];
+ height = tx_size_high[tx_size];
+ rnd(ACMRandom::DeterministicSeed());
+ }
+
+ protected:
+ TX_SIZE tx_size;
+ int width;
+ int height;
+ ACMRandom rnd;
+};
+
+template <typename I>
+class CFLTestWithData : public CFLTest {
+ public:
+ virtual ~CFLTestWithData() {}
+
+ protected:
+ I data[CFL_BUF_SQUARE];
+ I data_ref[CFL_BUF_SQUARE];
+ void randData(I (ACMRandom::*random)()) {
+ for (int j = 0; j < this->height; j++) {
+ for (int i = 0; i < this->width; i++) {
+ const I d = (this->rnd.*random)();
+ data[j * CFL_BUF_LINE + i] = d;
+ data_ref[j * CFL_BUF_LINE + i] = d;
+ }
+ }
+ }
+};
+
+template <typename I>
+class CFLTestWithAlignedData : public CFLTest {
+ public:
+ CFLTestWithAlignedData() {
+ chroma_pels_ref =
+ reinterpret_cast<I *>(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE));
+ chroma_pels =
+ reinterpret_cast<I *>(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE));
+ sub_luma_pels_ref = reinterpret_cast<int16_t *>(
+ aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE));
+ sub_luma_pels = reinterpret_cast<int16_t *>(
+ aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE));
+ memset(chroma_pels_ref, 0, sizeof(I) * CFL_BUF_SQUARE);
+ memset(chroma_pels, 0, sizeof(I) * CFL_BUF_SQUARE);
+ memset(sub_luma_pels_ref, 0, sizeof(int16_t) * CFL_BUF_SQUARE);
+ memset(sub_luma_pels, 0, sizeof(int16_t) * CFL_BUF_SQUARE);
+ }
+ ~CFLTestWithAlignedData() {
+ aom_free(chroma_pels_ref);
+ aom_free(sub_luma_pels_ref);
+ aom_free(chroma_pels);
+ aom_free(sub_luma_pels);
+ }
+
+ protected:
+ I *chroma_pels_ref;
+ I *chroma_pels;
+ int16_t *sub_luma_pels_ref;
+ int16_t *sub_luma_pels;
+ int alpha_q3;
+ I dc;
+ void randData(int bd) {
+ alpha_q3 = this->rnd(33) - 16;
+ dc = this->rnd(1 << bd);
+ for (int j = 0; j < this->height; j++) {
+ for (int i = 0; i < this->width; i++) {
+ chroma_pels[j * CFL_BUF_LINE + i] = dc;
+ chroma_pels_ref[j * CFL_BUF_LINE + i] = dc;
+ sub_luma_pels_ref[j * CFL_BUF_LINE + i] =
+ sub_luma_pels[j * CFL_BUF_LINE + i] = this->rnd(1 << (bd + 3));
+ }
+ }
+ }
+};
+
+typedef cfl_subtract_average_fn (*sub_avg_fn)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, sub_avg_fn> sub_avg_param;
+class CFLSubAvgTest : public ::testing::TestWithParam<sub_avg_param>,
+ public CFLTestWithData<int16_t> {
+ public:
+ virtual void SetUp() {
+ CFLTest::init(::testing::get<0>(this->GetParam()));
+ sub_avg = ::testing::get<1>(this->GetParam())(tx_size);
+ sub_avg_ref = get_subtract_average_fn_c(tx_size);
+ }
+ virtual ~CFLSubAvgTest() {}
+
+ protected:
+ cfl_subtract_average_fn sub_avg;
+ cfl_subtract_average_fn sub_avg_ref;
+};
+
+TEST_P(CFLSubAvgTest, SubAvgTest) {
+ for (int it = 0; it < NUM_ITERATIONS; it++) {
+ randData(&ACMRandom::Rand15Signed);
+ sub_avg((uint16_t *)data, data);
+ sub_avg_ref((uint16_t *)data_ref, data_ref);
+ assert_eq<int16_t>(data, data_ref, width, height);
+ }
+}
+
+TEST_P(CFLSubAvgTest, DISABLED_SubAvgSpeedTest) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+ randData(&ACMRandom::Rand15Signed);
+ aom_usec_timer_start(&ref_timer);
+ for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+ sub_avg_ref((uint16_t *)data_ref, data_ref);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+ aom_usec_timer_start(&timer);
+ for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+ sub_avg((uint16_t *)data, data);
+ }
+ aom_usec_timer_mark(&timer);
+ int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+ printSpeed(ref_elapsed_time, elapsed_time, width, height);
+ assertFaster(ref_elapsed_time, elapsed_time);
+}
+
+template <typename S, typename T, typename I>
+class CFLSubsampleTest : public ::testing::TestWithParam<S>,
+ public CFLTestWithData<I> {
+ public:
+ virtual void SetUp() {
+ CFLTest::init(::testing::get<0>(this->GetParam()));
+ fun_420 = ::testing::get<1>(this->GetParam())(this->tx_size);
+ fun_422 = ::testing::get<2>(this->GetParam())(this->tx_size);
+ fun_444 = ::testing::get<3>(this->GetParam())(this->tx_size);
+ }
+
+ protected:
+ T fun_420;
+ T fun_422;
+ T fun_444;
+ T fun_420_ref;
+ T fun_422_ref;
+ T fun_444_ref;
+
+ void subsampleTest(T fun, T fun_ref, int sub_width, int sub_height,
+ I (ACMRandom::*random)()) {
+ uint16_t sub_luma_pels[CFL_BUF_SQUARE];
+ uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
+
+ for (int it = 0; it < NUM_ITERATIONS; it++) {
+ CFLTestWithData<I>::randData(random);
+ fun(this->data, CFL_BUF_LINE, sub_luma_pels);
+ fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels_ref);
+ assert_eq<uint16_t>(sub_luma_pels, sub_luma_pels_ref, sub_width,
+ sub_height);
+ }
+ }
+
+ void subsampleSpeedTest(T fun, T fun_ref, I (ACMRandom::*random)()) {
+ uint16_t sub_luma_pels[CFL_BUF_SQUARE];
+ uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE];
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+
+ CFLTestWithData<I>::randData(random);
+ aom_usec_timer_start(&ref_timer);
+ for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+ fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+ aom_usec_timer_start(&timer);
+ for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+ fun(this->data, CFL_BUF_LINE, sub_luma_pels_ref);
+ }
+ aom_usec_timer_mark(&timer);
+ int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+ printSpeed(ref_elapsed_time, elapsed_time, this->width, this->height);
+ assertFaster(ref_elapsed_time, elapsed_time);
+ }
+};
+
+typedef cfl_subsample_lbd_fn (*get_subsample_lbd_fn)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, get_subsample_lbd_fn, get_subsample_lbd_fn,
+ get_subsample_lbd_fn>
+ subsample_lbd_param;
+class CFLSubsampleLBDTest
+ : public CFLSubsampleTest<subsample_lbd_param, cfl_subsample_lbd_fn,
+ uint8_t> {
+ public:
+ virtual ~CFLSubsampleLBDTest() {}
+ virtual void SetUp() {
+ CFLSubsampleTest::SetUp();
+ fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size);
+ fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size);
+ fun_444_ref = cfl_get_luma_subsampling_444_lbd_c(tx_size);
+ }
+};
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD420Test) {
+ subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
+ &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD420SpeedTest) {
+ subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD422Test) {
+ subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD422SpeedTest) {
+ subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, SubsampleLBD444Test) {
+ subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand8);
+}
+
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD444SpeedTest) {
+ subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand8);
+}
+
+typedef cfl_subsample_hbd_fn (*get_subsample_hbd_fn)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, get_subsample_hbd_fn, get_subsample_hbd_fn,
+ get_subsample_hbd_fn>
+ subsample_hbd_param;
+class CFLSubsampleHBDTest
+ : public CFLSubsampleTest<subsample_hbd_param, cfl_subsample_hbd_fn,
+ uint16_t> {
+ public:
+ virtual ~CFLSubsampleHBDTest() {}
+ virtual void SetUp() {
+ CFLSubsampleTest::SetUp();
+ fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size);
+ fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(tx_size);
+ fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(tx_size);
+ }
+};
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD420Test) {
+ subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1,
+ &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD420SpeedTest) {
+ subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD422Test) {
+ subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD422SpeedTest) {
+ subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD444Test) {
+ subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD444SpeedTest) {
+ subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12);
+}
+
+typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, get_predict_fn> predict_param;
+class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
+ public CFLTestWithAlignedData<uint8_t> {
+ public:
+ virtual void SetUp() {
+ CFLTest::init(::testing::get<0>(this->GetParam()));
+ predict = ::testing::get<1>(this->GetParam())(tx_size);
+ predict_ref = get_predict_lbd_fn_c(tx_size);
+ }
+ virtual ~CFLPredictTest() {}
+
+ protected:
+ cfl_predict_lbd_fn predict;
+ cfl_predict_lbd_fn predict_ref;
+};
+
+TEST_P(CFLPredictTest, PredictTest) {
+ for (int it = 0; it < NUM_ITERATIONS; it++) {
+ randData(8);
+ predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3);
+ predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3);
+ assert_eq<uint8_t>(chroma_pels, chroma_pels_ref, width, height);
+ }
+}
+TEST_P(CFLPredictTest, DISABLED_PredictSpeedTest) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+ randData(8);
+ aom_usec_timer_start(&ref_timer);
+ for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+ predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer_start(&timer);
+ for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+ predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3);
+ }
+ aom_usec_timer_mark(&timer);
+ int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+ printSpeed(ref_elapsed_time, elapsed_time, width, height);
+ assertFaster(ref_elapsed_time, elapsed_time);
+}
+
+typedef cfl_predict_hbd_fn (*get_predict_fn_hbd)(TX_SIZE tx_size);
+typedef ::testing::tuple<TX_SIZE, get_predict_fn_hbd> predict_param_hbd;
+class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd>,
+ public CFLTestWithAlignedData<uint16_t> {
+ public:
+ virtual void SetUp() {
+ CFLTest::init(::testing::get<0>(this->GetParam()));
+ predict = ::testing::get<1>(this->GetParam())(tx_size);
+ predict_ref = get_predict_hbd_fn_c(tx_size);
+ }
+ virtual ~CFLPredictHBDTest() {}
+
+ protected:
+ cfl_predict_hbd_fn predict;
+ cfl_predict_hbd_fn predict_ref;
+};
+
+TEST_P(CFLPredictHBDTest, PredictHBDTest) {
+ int bd = 12;
+ for (int it = 0; it < NUM_ITERATIONS; it++) {
+ randData(bd);
+ predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd);
+ predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd);
+ assert_eq<uint16_t>(chroma_pels, chroma_pels_ref, width, height);
+ }
+}
+TEST_P(CFLPredictHBDTest, DISABLED_PredictHBDSpeedTest) {
+ aom_usec_timer ref_timer;
+ aom_usec_timer timer;
+ const int bd = 12;
+ randData(bd);
+ aom_usec_timer_start(&ref_timer);
+ for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+ predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer_start(&timer);
+ for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) {
+ predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd);
+ }
+ aom_usec_timer_mark(&timer);
+ int elapsed_time = (int)aom_usec_timer_elapsed(&timer);
+ printSpeed(ref_elapsed_time, elapsed_time, width, height);
+ assertFaster(ref_elapsed_time, elapsed_time);
+}
+
+#if HAVE_SSE2
+const sub_avg_param sub_avg_sizes_sse2[] = { ALL_CFL_TX_SIZES(
+ get_subtract_average_fn_sse2) };
+
+INSTANTIATE_TEST_CASE_P(SSE2, CFLSubAvgTest,
+ ::testing::ValuesIn(sub_avg_sizes_sse2));
+
+#endif
+
+#if HAVE_SSSE3
+const subsample_lbd_param subsample_lbd_sizes_ssse3[] = {
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_ssse3,
+ cfl_get_luma_subsampling_422_lbd_ssse3,
+ cfl_get_luma_subsampling_444_lbd_ssse3)
+};
+
+const subsample_hbd_param subsample_hbd_sizes_ssse3[] = {
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3,
+ cfl_get_luma_subsampling_422_hbd_ssse3,
+ cfl_get_luma_subsampling_444_hbd_ssse3)
+};
+
+const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
+ get_predict_lbd_fn_ssse3) };
+
+const predict_param_hbd predict_sizes_hbd_ssse3[] = { ALL_CFL_TX_SIZES(
+ get_predict_hbd_fn_ssse3) };
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleLBDTest,
+ ::testing::ValuesIn(subsample_lbd_sizes_ssse3));
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleHBDTest,
+ ::testing::ValuesIn(subsample_hbd_sizes_ssse3));
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictTest,
+ ::testing::ValuesIn(predict_sizes_ssse3));
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictHBDTest,
+ ::testing::ValuesIn(predict_sizes_hbd_ssse3));
+#endif
+
+#if HAVE_AVX2
+const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES(
+ get_subtract_average_fn_avx2) };
+
+const subsample_lbd_param subsample_lbd_sizes_avx2[] = {
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_avx2,
+ cfl_get_luma_subsampling_422_lbd_avx2,
+ cfl_get_luma_subsampling_444_lbd_avx2)
+};
+
+const subsample_hbd_param subsample_hbd_sizes_avx2[] = {
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_avx2,
+ cfl_get_luma_subsampling_422_hbd_avx2,
+ cfl_get_luma_subsampling_444_hbd_avx2)
+};
+
+const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
+ get_predict_lbd_fn_avx2) };
+
+const predict_param_hbd predict_sizes_hbd_avx2[] = { ALL_CFL_TX_SIZES(
+ get_predict_hbd_fn_avx2) };
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLSubAvgTest,
+ ::testing::ValuesIn(sub_avg_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleLBDTest,
+ ::testing::ValuesIn(subsample_lbd_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLSubsampleHBDTest,
+ ::testing::ValuesIn(subsample_hbd_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictTest,
+ ::testing::ValuesIn(predict_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictHBDTest,
+ ::testing::ValuesIn(predict_sizes_hbd_avx2));
+#endif
+
+#if HAVE_NEON
+
+const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES(
+ get_subtract_average_fn_neon) };
+
+const subsample_lbd_param subsample_lbd_sizes_neon[] = {
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_neon,
+ cfl_get_luma_subsampling_422_lbd_neon,
+ cfl_get_luma_subsampling_444_lbd_neon)
+};
+
+const subsample_hbd_param subsample_hbd_sizes_neon[] = {
+ ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon,
+ cfl_get_luma_subsampling_422_hbd_neon,
+ cfl_get_luma_subsampling_444_hbd_neon)
+};
+
+const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(
+ get_predict_lbd_fn_neon) };
+
+const predict_param_hbd predict_sizes_hbd_neon[] = { ALL_CFL_TX_SIZES(
+ get_predict_hbd_fn_neon) };
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLSubAvgTest,
+ ::testing::ValuesIn(sub_avg_sizes_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLSubsampleLBDTest,
+ ::testing::ValuesIn(subsample_lbd_sizes_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLSubsampleHBDTest,
+ ::testing::ValuesIn(subsample_hbd_sizes_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLPredictTest,
+ ::testing::ValuesIn(predict_sizes_neon));
+
+INSTANTIATE_TEST_CASE_P(NEON, CFLPredictHBDTest,
+ ::testing::ValuesIn(predict_sizes_hbd_neon));
+#endif
+
+#if HAVE_VSX
+const sub_avg_param sub_avg_sizes_vsx[] = { ALL_CFL_TX_SIZES(
+ get_subtract_average_fn_vsx) };
+
+INSTANTIATE_TEST_CASE_P(VSX, CFLSubAvgTest,
+ ::testing::ValuesIn(sub_avg_sizes_vsx));
+#endif
+} // namespace
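The speed checks in this file (and in cdef_test.cc above) carry the DISABLED_ prefix, so gtest skips them on a normal run; they only execute when disabled tests are explicitly enabled, usually with --gtest_also_run_disabled_tests plus a --gtest_filter. A sketch of a runner that opts in programmatically, assuming a plain gtest entry point rather than libaom's own test driver:

    // Hypothetical runner, not part of this commit: enables the DISABLED_
    // speed tests and restricts the run to them.
    #include "third_party/googletest/src/googletest/include/gtest/gtest.h"

    int main(int argc, char **argv) {
      ::testing::InitGoogleTest(&argc, argv);
      ::testing::GTEST_FLAG(also_run_disabled_tests) = true;  // run DISABLED_*
      ::testing::GTEST_FLAG(filter) = "*SpeedTest*";          // speed tests only
      return RUN_ALL_TESTS();
    }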
diff --git a/third_party/aom/test/clear_system_state.h b/third_party/aom/test/clear_system_state.h
new file mode 100644
index 000000000..d38ff5dd5
--- /dev/null
+++ b/third_party/aom/test/clear_system_state.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_CLEAR_SYSTEM_STATE_H_
+#define AOM_TEST_CLEAR_SYSTEM_STATE_H_
+
+#include "config/aom_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+#include "aom_ports/x86.h"
+#endif
+
+namespace libaom_test {
+
+// Reset system to a known state. This function should be used for all non-API
+// test cases.
+inline void ClearSystemState() {
+#if ARCH_X86 || ARCH_X86_64
+ aom_reset_mmx_state();
+#endif
+}
+
+} // namespace libaom_test
+#endif // AOM_TEST_CLEAR_SYSTEM_STATE_H_
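ClearSystemState() is invoked from TearDown() throughout this patch (see CDEFFindDirTest and the comp_avg_pred fixtures) so that MMX/x87 state left behind by an x86 SIMD kernel cannot leak from one test into the next. A minimal fixture following that convention; the fixture name is hypothetical:

    // Sketch of the TearDown() pattern used by the fixtures in this commit.
    #include "test/clear_system_state.h"
    #include "third_party/googletest/src/googletest/include/gtest/gtest.h"

    class ExampleSimdTest : public ::testing::Test {
     protected:
      virtual void TearDown() { libaom_test::ClearSystemState(); }
    };

    TEST_F(ExampleSimdTest, LeavesSystemStateClean) {
      // ... exercise an optimized kernel here ...
      SUCCEED();
    }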
diff --git a/third_party/aom/test/codec_factory.h b/third_party/aom/test/codec_factory.h
new file mode 100644
index 000000000..dd99110ee
--- /dev/null
+++ b/third_party/aom/test/codec_factory.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_CODEC_FACTORY_H_
+#define AOM_TEST_CODEC_FACTORY_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#if CONFIG_AV1_ENCODER
+#include "aom/aomcx.h"
+#endif
+#if CONFIG_AV1_DECODER
+#include "aom/aomdx.h"
+#endif
+
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+namespace libaom_test {
+
+const int kCodecFactoryParam = 0;
+
+class CodecFactory {
+ public:
+ CodecFactory() {}
+
+ virtual ~CodecFactory() {}
+
+ virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const = 0;
+
+ virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+ const aom_codec_flags_t flags) const = 0;
+
+ virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+ const unsigned long init_flags,
+ TwopassStatsStore *stats) const = 0;
+
+ virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+ int usage) const = 0;
+};
+
+/* Provide CodecTestWith<n>Params classes for a variable number of parameters
+ * to avoid having to include a pointer to the CodecFactory in every test
+ * definition.
+ */
+template <class T1>
+class CodecTestWithParam
+ : public ::testing::TestWithParam<
+ ::testing::tuple<const libaom_test::CodecFactory *, T1> > {};
+
+template <class T1, class T2>
+class CodecTestWith2Params
+ : public ::testing::TestWithParam<
+ ::testing::tuple<const libaom_test::CodecFactory *, T1, T2> > {};
+
+template <class T1, class T2, class T3>
+class CodecTestWith3Params
+ : public ::testing::TestWithParam<
+ ::testing::tuple<const libaom_test::CodecFactory *, T1, T2, T3> > {};
+
+template <class T1, class T2, class T3, class T4>
+class CodecTestWith4Params
+ : public ::testing::TestWithParam< ::testing::tuple<
+ const libaom_test::CodecFactory *, T1, T2, T3, T4> > {};
+
+template <class T1, class T2, class T3, class T4, class T5>
+class CodecTestWith5Params
+ : public ::testing::TestWithParam< ::testing::tuple<
+ const libaom_test::CodecFactory *, T1, T2, T3, T4, T5> > {};
+
+/*
+ * AV1 Codec Definitions
+ */
+class AV1Decoder : public Decoder {
+ public:
+ explicit AV1Decoder(aom_codec_dec_cfg_t cfg) : Decoder(cfg) {}
+
+ AV1Decoder(aom_codec_dec_cfg_t cfg, const aom_codec_flags_t flag)
+ : Decoder(cfg, flag) {}
+
+ protected:
+ virtual aom_codec_iface_t *CodecInterface() const {
+#if CONFIG_AV1_DECODER
+ return aom_codec_av1_dx();
+#else
+ return NULL;
+#endif
+ }
+};
+
+class AV1Encoder : public Encoder {
+ public:
+ AV1Encoder(aom_codec_enc_cfg_t cfg, const uint32_t init_flags,
+ TwopassStatsStore *stats)
+ : Encoder(cfg, init_flags, stats) {}
+
+ protected:
+ virtual aom_codec_iface_t *CodecInterface() const {
+#if CONFIG_AV1_ENCODER
+ return aom_codec_av1_cx();
+#else
+ return NULL;
+#endif
+ }
+};
+
+class AV1CodecFactory : public CodecFactory {
+ public:
+ AV1CodecFactory() : CodecFactory() {}
+
+ virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const {
+ return CreateDecoder(cfg, 0);
+ }
+
+ virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+ const aom_codec_flags_t flags) const {
+#if CONFIG_AV1_DECODER
+ return new AV1Decoder(cfg, flags);
+#else
+ (void)cfg;
+ (void)flags;
+ return NULL;
+#endif
+ }
+
+ virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+ const unsigned long init_flags,
+ TwopassStatsStore *stats) const {
+#if CONFIG_AV1_ENCODER
+ return new AV1Encoder(cfg, init_flags, stats);
+#else
+ (void)cfg;
+ (void)init_flags;
+ (void)stats;
+ return NULL;
+#endif
+ }
+
+ virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+ int usage) const {
+#if CONFIG_AV1_ENCODER
+ return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, usage);
+#else
+ (void)cfg;
+ (void)usage;
+ return AOM_CODEC_INCAPABLE;
+#endif
+ }
+};
+
+const libaom_test::AV1CodecFactory kAV1;
+
+#define AV1_INSTANTIATE_TEST_CASE(test, ...) \
+ INSTANTIATE_TEST_CASE_P( \
+ AV1, test, \
+ ::testing::Combine( \
+ ::testing::Values(static_cast<const libaom_test::CodecFactory *>( \
+ &libaom_test::kAV1)), \
+ __VA_ARGS__))
+
+} // namespace libaom_test
+#endif // AOM_TEST_CODEC_FACTORY_H_
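The CodecTestWith<n>Params helpers and AV1_INSTANTIATE_TEST_CASE above are consumed by the encoder/decoder tests elsewhere in the tree: a fixture derives from CodecTestWithNParams<...>, and the macro prepends &libaom_test::kAV1 to the caller's generators before invoking INSTANTIATE_TEST_CASE_P, so element 0 of every parameter tuple is the codec factory. A sketch with a hypothetical fixture, shown only to make that parameter layout concrete:

    // Hypothetical fixture, not part of this commit.
    #include "test/codec_factory.h"
    #include "third_party/googletest/src/googletest/include/gtest/gtest.h"

    class ExampleCodecTest
        : public ::libaom_test::CodecTestWith2Params<int, int> {};

    TEST_P(ExampleCodecTest, FactoryIsInjected) {
      const libaom_test::CodecFactory *factory = ::testing::get<0>(GetParam());
      const int first = ::testing::get<1>(GetParam());   // from Values(0, 1)
      const int second = ::testing::get<2>(GetParam());  // from Range(0, 4)
      ASSERT_TRUE(factory != NULL);
      EXPECT_GE(first, 0);
      EXPECT_GE(second, 0);
    }

    // Expands to INSTANTIATE_TEST_CASE_P(AV1, ExampleCodecTest, Combine(...)).
    AV1_INSTANTIATE_TEST_CASE(ExampleCodecTest, ::testing::Values(0, 1),
                              ::testing::Range(0, 4));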
diff --git a/third_party/aom/test/coding_path_sync.cc b/third_party/aom/test/coding_path_sync.cc
new file mode 100644
index 000000000..6735236cc
--- /dev/null
+++ b/third_party/aom/test/coding_path_sync.cc
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h" // ROUND_POWER_OF_TWO
+#include "aom/aomcx.h"
+#include "aom/aomdx.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_decoder.h"
+
+using libaom_test::ACMRandom;
+namespace {
+
+class CompressedSource {
+ public:
+ explicit CompressedSource(int seed) : rnd_(seed), frame_count_(0) {
+ aom_codec_iface_t *algo = aom_codec_av1_cx();
+
+ aom_codec_enc_cfg_t cfg;
+ aom_codec_enc_config_default(algo, &cfg, 0);
+
+    // Force the quantizer to reduce sensitivity to encoding choices;
+    // e.g., we don't want this test to break when the rate control is modified.
+ {
+ const int max_q = cfg.rc_max_quantizer;
+ const int min_q = cfg.rc_min_quantizer;
+ const int q = rnd_.PseudoUniform(max_q - min_q + 1) + min_q;
+
+ cfg.rc_end_usage = AOM_Q;
+ cfg.rc_max_quantizer = q;
+ cfg.rc_min_quantizer = q;
+ }
+
+ // choose the picture size
+ {
+ width_ = rnd_.PseudoUniform(kWidth - 8) + 8;
+ height_ = rnd_.PseudoUniform(kHeight - 8) + 8;
+ }
+
+ // choose the chroma subsampling
+ {
+ const aom_img_fmt_t fmts[] = {
+ AOM_IMG_FMT_I420,
+ AOM_IMG_FMT_I422,
+ AOM_IMG_FMT_I444,
+ };
+
+ format_ = fmts[rnd_.PseudoUniform(NELEMENTS(fmts))];
+ }
+
+ cfg.g_w = width_;
+ cfg.g_h = height_;
+ cfg.g_lag_in_frames = 0;
+ if (format_ == AOM_IMG_FMT_I420)
+ cfg.g_profile = 0;
+ else if (format_ == AOM_IMG_FMT_I444)
+ cfg.g_profile = 1;
+ else if (format_ == AOM_IMG_FMT_I422)
+ cfg.g_profile = 2;
+
+ aom_codec_enc_init(&enc_, algo, &cfg, 0);
+ }
+
+ ~CompressedSource() { aom_codec_destroy(&enc_); }
+
+ const aom_codec_cx_pkt_t *ReadFrame() {
+ uint8_t buf[kWidth * kHeight * 3] = { 0 };
+
+ // render regular pattern
+ const int period = rnd_.Rand8() % 32 + 1;
+ const int phase = rnd_.Rand8() % period;
+
+ const int val_a = rnd_.Rand8();
+ const int val_b = rnd_.Rand8();
+
+ for (int i = 0; i < (int)sizeof buf; ++i)
+ buf[i] = (i + phase) % period < period / 2 ? val_a : val_b;
+
+ aom_image_t img;
+ aom_img_wrap(&img, format_, width_, height_, 0, buf);
+ aom_codec_encode(&enc_, &img, frame_count_++, 1, 0);
+
+ aom_codec_iter_t iter = NULL;
+
+ const aom_codec_cx_pkt_t *pkt = NULL;
+
+ do {
+ pkt = aom_codec_get_cx_data(&enc_, &iter);
+ } while (pkt && pkt->kind != AOM_CODEC_CX_FRAME_PKT);
+
+ return pkt;
+ }
+
+ private:
+ static const int kWidth = 128;
+ static const int kHeight = 128;
+
+ ACMRandom rnd_;
+ aom_img_fmt_t format_;
+ aom_codec_ctx_t enc_;
+ int frame_count_;
+ int width_, height_;
+};
+
+// lowers an aom_image_t to an easily comparable/printable form
+std::vector<int16_t> Serialize(const aom_image_t *img) {
+ std::vector<int16_t> bytes;
+ bytes.reserve(img->d_w * img->d_h * 3);
+ for (int plane = 0; plane < 3; ++plane) {
+ const int w = aom_img_plane_width(img, plane);
+ const int h = aom_img_plane_height(img, plane);
+
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < w; ++c) {
+ unsigned char *row = img->planes[plane] + r * img->stride[plane];
+ if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH)
+ bytes.push_back(row[c * 2]);
+ else
+ bytes.push_back(row[c]);
+ }
+ }
+ }
+
+ return bytes;
+}
+
+class Decoder {
+ public:
+ explicit Decoder(int allowLowbitdepth) {
+ aom_codec_iface_t *algo = aom_codec_av1_dx();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.allow_lowbitdepth = allowLowbitdepth;
+
+ aom_codec_dec_init(&dec_, algo, &cfg, 0);
+ }
+
+ ~Decoder() { aom_codec_destroy(&dec_); }
+
+ std::vector<int16_t> decode(const aom_codec_cx_pkt_t *pkt) {
+ aom_codec_decode(&dec_, static_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz, NULL);
+
+ aom_codec_iter_t iter = NULL;
+ return Serialize(aom_codec_get_frame(&dec_, &iter));
+ }
+
+ private:
+ aom_codec_ctx_t dec_;
+};
+
+// Try to reveal a mismatch between LBD and HBD coding paths.
+TEST(CodingPathSync, SearchForHbdLbdMismatch) {
+ const int count_tests = 10;
+ for (int i = 0; i < count_tests; ++i) {
+ Decoder dec_hbd(0);
+ Decoder dec_lbd(1);
+
+ CompressedSource enc(i);
+
+ for (int k = 0; k < 3; ++k) {
+ const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
+
+ std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
+ std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+
+ ASSERT_EQ(lbd_yuv, hbd_yuv);
+ }
+ }
+}
+
+TEST(CodingPathSyncLarge, SearchForHbdLbdMismatchLarge) {
+ const int count_tests = 100;
+ const int seed = 1234;
+ for (int i = 0; i < count_tests; ++i) {
+ Decoder dec_hbd(0);
+ Decoder dec_lbd(1);
+
+ CompressedSource enc(seed + i);
+
+ for (int k = 0; k < 5; ++k) {
+ const aom_codec_cx_pkt_t *frame = enc.ReadFrame();
+
+ std::vector<int16_t> lbd_yuv = dec_lbd.decode(frame);
+ std::vector<int16_t> hbd_yuv = dec_hbd.decode(frame);
+
+ ASSERT_EQ(lbd_yuv, hbd_yuv);
+ }
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/comp_avg_pred_test.cc b/third_party/aom/test/comp_avg_pred_test.cc
new file mode 100644
index 000000000..9ad8973f0
--- /dev/null
+++ b/third_party/aom/test/comp_avg_pred_test.cc
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/comp_avg_pred_test.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGTest;
+using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGUPSAMPLEDTest;
+using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGTest;
+using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGUPSAMPLEDTest;
+
+namespace {
+
+TEST_P(AV1JNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+TEST_P(AV1JNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, AV1JNTCOMPAVGTest,
+ libaom_test::AV1JNTCOMPAVG::BuildParams(aom_jnt_comp_avg_pred_ssse3));
+#endif
+
+TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0));
+}
+
+TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0));
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1JNTCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1JNTCOMPAVG::BuildParams(
+ aom_jnt_comp_avg_upsampled_pred_ssse3));
+#endif
+
+TEST_P(AV1HighBDJNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); }
+
+TEST_P(AV1HighBDJNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGTest,
+ libaom_test::AV1JNTCOMPAVG::BuildParams(
+ aom_highbd_jnt_comp_avg_pred_sse2, 1));
+#endif
+
+TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(1));
+}
+
+TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(1));
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1JNTCOMPAVG::BuildParams(
+ aom_highbd_jnt_comp_avg_upsampled_pred_sse2));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/comp_avg_pred_test.h b/third_party/aom/test/comp_avg_pred_test.h
new file mode 100644
index 000000000..9661dd9f5
--- /dev/null
+++ b/third_party/aom/test/comp_avg_pred_test.h
@@ -0,0 +1,555 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_COMP_AVG_PRED_TEST_H_
+#define AOM_TEST_COMP_AVG_PRED_TEST_H_
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "av1/common/common_data.h"
+#include "aom_ports/aom_timer.h"
+
+namespace libaom_test {
+const int kMaxSize = 128 + 32; // padding
+
+namespace AV1JNTCOMPAVG {
+
+typedef void (*jntcompavg_func)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const JNT_COMP_PARAMS *jcp_param);
+
+typedef void (*jntcompavgupsampled_func)(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+ int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search);
+
+typedef void (*highbdjntcompavgupsampled_func)(
+ MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+ int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+ int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int subpel_search);
+
+typedef ::testing::tuple<jntcompavg_func, BLOCK_SIZE> JNTCOMPAVGParam;
+
+typedef ::testing::tuple<jntcompavgupsampled_func, BLOCK_SIZE>
+ JNTCOMPAVGUPSAMPLEDParam;
+
+typedef ::testing::tuple<int, jntcompavg_func, BLOCK_SIZE>
+ HighbdJNTCOMPAVGParam;
+
+typedef ::testing::tuple<int, highbdjntcompavgupsampled_func, BLOCK_SIZE>
+ HighbdJNTCOMPAVGUPSAMPLEDParam;
+
+::testing::internal::ParamGenerator<JNTCOMPAVGParam> BuildParams(
+ jntcompavg_func filter) {
+ return ::testing::Combine(::testing::Values(filter),
+ ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+::testing::internal::ParamGenerator<JNTCOMPAVGUPSAMPLEDParam> BuildParams(
+ jntcompavgupsampled_func filter) {
+ return ::testing::Combine(::testing::Values(filter),
+ ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+::testing::internal::ParamGenerator<HighbdJNTCOMPAVGParam> BuildParams(
+ jntcompavg_func filter, int is_hbd) {
+ (void)is_hbd;
+ return ::testing::Combine(::testing::Range(8, 13, 2),
+ ::testing::Values(filter),
+ ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+::testing::internal::ParamGenerator<HighbdJNTCOMPAVGUPSAMPLEDParam> BuildParams(
+ highbdjntcompavgupsampled_func filter) {
+ return ::testing::Combine(::testing::Range(8, 13, 2),
+ ::testing::Values(filter),
+ ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+
+class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
+ public:
+ ~AV1JNTCOMPAVGTest() {}
+ void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunCheckOutput(jntcompavg_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(1);
+
+ uint8_t pred8[kMaxSize * kMaxSize];
+ uint8_t ref8[kMaxSize * kMaxSize];
+ uint8_t output[kMaxSize * kMaxSize];
+ uint8_t output2[kMaxSize * kMaxSize];
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand8();
+ ref8[i * w + j] = rnd_.Rand8();
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ JNT_COMP_PARAMS jnt_comp_params;
+ jnt_comp_params.use_jnt_comp_avg = 1;
+
+ for (int ii = 0; ii < 2; ii++) {
+ for (int jj = 0; jj < 4; jj++) {
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+ const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+ aom_jnt_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, in_w,
+ in_h, ref8 + offset_r * w + offset_c, in_w,
+ &jnt_comp_params);
+ test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h,
+ ref8 + offset_r * w + offset_c, in_w, &jnt_comp_params);
+
+ for (int i = 0; i < in_h; ++i) {
+ for (int j = 0; j < in_w; ++j) {
+ int idx = i * in_w + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << "Mismatch at unit tests for AV1JNTCOMPAVGTest\n"
+ << in_w << "x" << in_h << " Pixel mismatch at index " << idx
+ << " = (" << i << ", " << j << ")";
+ }
+ }
+ }
+ }
+ }
+ void RunSpeedTest(jntcompavg_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(1);
+
+ uint8_t pred8[kMaxSize * kMaxSize];
+ uint8_t ref8[kMaxSize * kMaxSize];
+ uint8_t output[kMaxSize * kMaxSize];
+ uint8_t output2[kMaxSize * kMaxSize];
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand8();
+ ref8[i * w + j] = rnd_.Rand8();
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ JNT_COMP_PARAMS jnt_comp_params;
+ jnt_comp_params.use_jnt_comp_avg = 1;
+
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+
+ const int num_loops = 1000000000 / (in_w + in_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ aom_jnt_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w,
+ &jnt_comp_params);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("jntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(output2, pred8, in_w, in_h, ref8, in_w, &jnt_comp_params);
+
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("jntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time1 / num_loops);
+ }
+
+ libaom_test::ACMRandom rnd_;
+}; // class AV1JNTCOMPAVGTest
+
+class AV1JNTCOMPAVGUPSAMPLEDTest
+ : public ::testing::TestWithParam<JNTCOMPAVGUPSAMPLEDParam> {
+ public:
+ ~AV1JNTCOMPAVGUPSAMPLEDTest() {}
+ void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunCheckOutput(jntcompavgupsampled_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(1);
+
+ uint8_t pred8[kMaxSize * kMaxSize];
+ uint8_t ref8[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand8();
+ ref8[i * w + j] = rnd_.Rand8();
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ JNT_COMP_PARAMS jnt_comp_params;
+ jnt_comp_params.use_jnt_comp_avg = 1;
+ int sub_x_q3, sub_y_q3;
+ int subpel_search;
+ for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+ for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+ for (int ii = 0; ii < 2; ii++) {
+ for (int jj = 0; jj < 4; jj++) {
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+ const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+
+ aom_jnt_comp_avg_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, output,
+ pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
+ sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
+ &jnt_comp_params, subpel_search);
+ test_impl(NULL, NULL, 0, 0, NULL, output2,
+ pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
+ sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
+ &jnt_comp_params, subpel_search);
+
+ for (int i = 0; i < in_h; ++i) {
+ for (int j = 0; j < in_w; ++j) {
+ int idx = i * in_w + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << "Mismatch at unit tests for "
+ "AV1JNTCOMPAVGUPSAMPLEDTest\n"
+ << in_w << "x" << in_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << sub_y_q3 << ", "
+ << sub_x_q3 << ")";
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ void RunSpeedTest(jntcompavgupsampled_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(1);
+
+ uint8_t pred8[kMaxSize * kMaxSize];
+ uint8_t ref8[kMaxSize * kMaxSize];
+ DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]);
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand8();
+ ref8[i * w + j] = rnd_.Rand8();
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ JNT_COMP_PARAMS jnt_comp_params;
+ jnt_comp_params.use_jnt_comp_avg = 1;
+
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+
+ int sub_x_q3 = 0;
+ int sub_y_q3 = 0;
+
+ const int num_loops = 1000000000 / (in_w + in_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
+
+ for (int i = 0; i < num_loops; ++i)
+ aom_jnt_comp_avg_upsampled_pred_c(NULL, NULL, 0, 0, NULL, output, pred8,
+ in_w, in_h, sub_x_q3, sub_y_q3, ref8,
+ in_w, &jnt_comp_params, subpel_search);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("jntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3,
+ sub_y_q3, ref8, in_w, &jnt_comp_params, subpel_search);
+
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("jntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time1 / num_loops);
+ }
+
+ libaom_test::ACMRandom rnd_;
+}; // class AV1JNTCOMPAVGUPSAMPLEDTest
+
+class AV1HighBDJNTCOMPAVGTest
+ : public ::testing::TestWithParam<HighbdJNTCOMPAVGParam> {
+ public:
+ ~AV1HighBDJNTCOMPAVGTest() {}
+ void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+ void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunCheckOutput(jntcompavg_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ uint16_t pred8[kMaxSize * kMaxSize];
+ uint16_t ref8[kMaxSize * kMaxSize];
+ uint16_t output[kMaxSize * kMaxSize];
+ uint16_t output2[kMaxSize * kMaxSize];
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ JNT_COMP_PARAMS jnt_comp_params;
+ jnt_comp_params.use_jnt_comp_avg = 1;
+
+ for (int ii = 0; ii < 2; ii++) {
+ for (int jj = 0; jj < 4; jj++) {
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+ const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+ aom_highbd_jnt_comp_avg_pred_c(
+ CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
+ &jnt_comp_params);
+ test_impl(CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+ in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
+ in_w, &jnt_comp_params);
+
+ for (int i = 0; i < in_h; ++i) {
+ for (int j = 0; j < in_w; ++j) {
+ int idx = i * in_w + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << "Mismatch at unit tests for AV1HighBDJNTCOMPAVGTest\n"
+ << in_w << "x" << in_h << " Pixel mismatch at index " << idx
+ << " = (" << i << ", " << j << ")";
+ }
+ }
+ }
+ }
+ }
+ void RunSpeedTest(jntcompavg_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ uint16_t pred8[kMaxSize * kMaxSize];
+ uint16_t ref8[kMaxSize * kMaxSize];
+ uint16_t output[kMaxSize * kMaxSize];
+ uint16_t output2[kMaxSize * kMaxSize];
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ JNT_COMP_PARAMS jnt_comp_params;
+ jnt_comp_params.use_jnt_comp_avg = 1;
+
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+
+ const int num_loops = 1000000000 / (in_w + in_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ aom_highbd_jnt_comp_avg_pred_c(
+ CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
+ CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("highbdjntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w,
+ in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
+
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("highbdjntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time1 / num_loops);
+ }
+
+ libaom_test::ACMRandom rnd_;
+}; // class AV1HighBDJNTCOMPAVGTest
+
+class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
+ : public ::testing::TestWithParam<HighbdJNTCOMPAVGUPSAMPLEDParam> {
+ public:
+ ~AV1HighBDJNTCOMPAVGUPSAMPLEDTest() {}
+ void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+ void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunCheckOutput(highbdjntcompavgupsampled_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ uint16_t pred8[kMaxSize * kMaxSize];
+ uint16_t ref8[kMaxSize * kMaxSize];
+ uint16_t output[kMaxSize * kMaxSize];
+ uint16_t output2[kMaxSize * kMaxSize];
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ JNT_COMP_PARAMS jnt_comp_params;
+ jnt_comp_params.use_jnt_comp_avg = 1;
+ int sub_x_q3, sub_y_q3;
+ int subpel_search;
+ for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) {
+ for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
+ for (int ii = 0; ii < 2; ii++) {
+ for (int jj = 0; jj < 4; jj++) {
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+
+ const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
+ const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
+
+ aom_highbd_jnt_comp_avg_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
+ in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
+ &jnt_comp_params, subpel_search);
+ test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c,
+ in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
+ in_w, bd, &jnt_comp_params, subpel_search);
+
+ for (int i = 0; i < in_h; ++i) {
+ for (int j = 0; j < in_w; ++j) {
+ int idx = i * in_w + j;
+ ASSERT_EQ(output[idx], output2[idx])
+ << "Mismatch at unit tests for "
+ "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n"
+ << in_w << "x" << in_h << " Pixel mismatch at index "
+ << idx << " = (" << i << ", " << j
+ << "), sub pixel offset = (" << sub_y_q3 << ", "
+ << sub_x_q3 << ")";
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ void RunSpeedTest(highbdjntcompavgupsampled_func test_impl) {
+ const int w = kMaxSize, h = kMaxSize;
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ uint16_t pred8[kMaxSize * kMaxSize];
+ uint16_t ref8[kMaxSize * kMaxSize];
+ uint16_t output[kMaxSize * kMaxSize];
+ uint16_t output2[kMaxSize * kMaxSize];
+
+ for (int i = 0; i < h; ++i)
+ for (int j = 0; j < w; ++j) {
+ pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+ }
+ const int in_w = block_size_wide[block_idx];
+ const int in_h = block_size_high[block_idx];
+
+ JNT_COMP_PARAMS jnt_comp_params;
+ jnt_comp_params.use_jnt_comp_avg = 1;
+
+ jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+ jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+ int sub_x_q3 = 0;
+ int sub_y_q3 = 0;
+ const int num_loops = 1000000000 / (in_w + in_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
+ for (int i = 0; i < num_loops; ++i)
+ aom_highbd_jnt_comp_avg_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),
+ CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params, subpel_search);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("highbdjntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2),
+ CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
+ CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params,
+ subpel_search);
+
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("highbdjntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w,
+ in_h, 1000.0 * elapsed_time1 / num_loops);
+ }
+
+ libaom_test::ACMRandom rnd_;
+}; // class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
+
+} // namespace AV1JNTCOMPAVG
+} // namespace libaom_test
+
+#endif // AOM_TEST_COMP_AVG_PRED_TEST_H_
diff --git a/third_party/aom/test/comp_mask_variance_test.cc b/third_party/aom/test/comp_mask_variance_test.cc
new file mode 100644
index 000000000..34be2aa6d
--- /dev/null
+++ b/third_party/aom/test/comp_mask_variance_test.cc
@@ -0,0 +1,574 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/reconinter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompMaskVariance {
+typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask);
+
+#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AVX2
+const BLOCK_SIZE kValidBlockSize[] = {
+ BLOCK_8X8, BLOCK_8X16, BLOCK_8X32, BLOCK_16X8, BLOCK_16X16,
+ BLOCK_16X32, BLOCK_32X8, BLOCK_32X16, BLOCK_32X32,
+};
+#endif
+typedef ::testing::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
+
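+// Checks that an optimized aom_comp_mask_pred implementation matches the C
+// reference for every wedge mask of each tested block size, with and without
+// mask inversion.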
+class AV1CompMaskVarianceTest
+ : public ::testing::TestWithParam<CompMaskPredParam> {
+ public:
+ ~AV1CompMaskVarianceTest();
+ void SetUp();
+
+ void TearDown();
+
+ protected:
+ void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
+ void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+ bool CheckResult(int width, int height) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const int idx = y * width + x;
+ if (comp_pred1_[idx] != comp_pred2_[idx]) {
+ printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
+ printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ libaom_test::ACMRandom rnd_;
+ uint8_t *comp_pred1_;
+ uint8_t *comp_pred2_;
+ uint8_t *pred_;
+ uint8_t *ref_buffer_;
+ uint8_t *ref_;
+};
+
+AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() { ; }
+
+void AV1CompMaskVarianceTest::SetUp() {
+ rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+ av1_init_wedge_masks();
+ comp_pred1_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+ comp_pred2_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+ pred_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+ ref_buffer_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE + (8 * MAX_SB_SIZE));
+ ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand8();
+ }
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand8();
+ }
+}
+
+void AV1CompMaskVarianceTest::TearDown() {
+ aom_free(comp_pred1_);
+ aom_free(comp_pred2_);
+ aom_free(pred_);
+ aom_free(ref_buffer_);
+ libaom_test::ClearSystemState();
+}
+
+void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
+ BLOCK_SIZE bsize, int inv) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ aom_comp_mask_pred_c(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w,
+ inv);
+ test_impl(comp_pred2_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, inv);
+
+ ASSERT_EQ(CheckResult(w, h), true)
+ << " wedge " << wedge_index << " inv " << inv;
+ }
+}
+
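+// Times the C reference and the optimized function on the same mask and block
+// size, then prints both per-call times and their ratio.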
+void AV1CompMaskVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
+ BLOCK_SIZE bsize) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ int wedge_index = wedge_types / 2;
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+ const int num_loops = 1000000000 / (w + h);
+
+ comp_mask_pred_func funcs[2] = { aom_comp_mask_pred_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ comp_mask_pred_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, 0);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompMaskVarianceTest, CheckOutput) {
+ // inv = 0, 1
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+TEST_P(AV1CompMaskVarianceTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, AV1CompMaskVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1CompMaskVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#ifndef aom_comp_mask_pred
+// can't run this test if aom_comp_mask_pred is defined to aom_comp_mask_pred_c
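+// These tests drive aom_comp_mask_upsampled_pred(), redirecting the
+// aom_comp_mask_pred function pointer to the implementation under test.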
+class AV1CompMaskUpVarianceTest : public AV1CompMaskVarianceTest {
+ public:
+ ~AV1CompMaskUpVarianceTest();
+
+ protected:
+ void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
+ void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+ int havSub);
+};
+
+AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() { ; }
+
+void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
+ BLOCK_SIZE bsize, int inv) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ int subpel_search;
+ for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ // loop through subx and suby
+ for (int sub = 0; sub < 8 * 8; ++sub) {
+ int subx = sub & 0x7;
+ int suby = (sub >> 3);
+ for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ // ref
+ aom_comp_mask_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, w, h, subx, suby, ref_,
+ MAX_SB_SIZE, mask, w, inv, subpel_search);
+
+ aom_comp_mask_pred = test_impl; // test
+ aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_,
+ w, h, subx, suby, ref_, MAX_SB_SIZE, mask,
+ w, inv, subpel_search);
+ ASSERT_EQ(CheckResult(w, h), true)
+            << " wedge " << wedge_index << " inv " << inv << " sub (" << subx
+ << "," << suby << ")";
+ }
+ }
+ }
+}
+
+void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl,
+ BLOCK_SIZE bsize, int havSub) {
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int subx = havSub ? 3 : 0;
+ const int suby = havSub ? 4 : 0;
+
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ int wedge_index = wedge_types / 2;
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ const int num_loops = 1000000000 / (w + h);
+ comp_mask_pred_func funcs[2] = { &aom_comp_mask_pred_c, test_impl };
+ double elapsed_time[2] = { 0 };
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ aom_comp_mask_pred = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_,
+ w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w,
+ 0, subpel_search);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompMaskUpVarianceTest, CheckOutput) {
+ // inv mask = 0, 1
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+TEST_P(AV1CompMaskUpVarianceTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, AV1CompMaskUpVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1CompMaskUpVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+#endif // ifndef aom_comp_mask_pred
+
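+// High bit-depth variants: buffers hold 16-bit samples passed through
+// CONVERT_TO_BYTEPTR(), and the bit depth is an additional test parameter.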
+typedef void (*highbd_comp_mask_pred_func)(uint8_t *comp_pred8,
+ const uint8_t *pred8, int width,
+ int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask);
+
+typedef ::testing::tuple<highbd_comp_mask_pred_func, BLOCK_SIZE, int>
+ HighbdCompMaskPredParam;
+
+class AV1HighbdCompMaskVarianceTest
+ : public ::testing::TestWithParam<HighbdCompMaskPredParam> {
+ public:
+ ~AV1HighbdCompMaskVarianceTest();
+ void SetUp();
+
+ void TearDown();
+
+ protected:
+ void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+ int inv);
+ void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+ bool CheckResult(int width, int height) {
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ const int idx = y * width + x;
+ if (comp_pred1_[idx] != comp_pred2_[idx]) {
+ printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
+ printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ libaom_test::ACMRandom rnd_;
+ uint16_t *comp_pred1_;
+ uint16_t *comp_pred2_;
+ uint16_t *pred_;
+ uint16_t *ref_buffer_;
+ uint16_t *ref_;
+};
+
+AV1HighbdCompMaskVarianceTest::~AV1HighbdCompMaskVarianceTest() { ; }
+
+void AV1HighbdCompMaskVarianceTest::SetUp() {
+ rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+ av1_init_wedge_masks();
+
+ comp_pred1_ =
+ (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_));
+ comp_pred2_ =
+ (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_));
+ pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_));
+ ref_buffer_ = (uint16_t *)aom_memalign(
+ 16, (MAX_SB_SQUARE + (8 * MAX_SB_SIZE)) * sizeof(*ref_buffer_));
+ ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
+}
+
+void AV1HighbdCompMaskVarianceTest::TearDown() {
+ aom_free(comp_pred1_);
+ aom_free(comp_pred2_);
+ aom_free(pred_);
+ aom_free(ref_buffer_);
+ libaom_test::ClearSystemState();
+}
+
+void AV1HighbdCompMaskVarianceTest::RunCheckOutput(
+ highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
+ int bd_ = GET_PARAM(2);
+
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+
+ for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ aom_highbd_comp_mask_pred_c(
+ CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
+
+ test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv);
+
+ ASSERT_EQ(CheckResult(w, h), true)
+ << " wedge " << wedge_index << " inv " << inv;
+ }
+}
+
+void AV1HighbdCompMaskVarianceTest::RunSpeedTest(
+ highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize) {
+ int bd_ = GET_PARAM(2);
+
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ int wedge_index = wedge_types / 2;
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+ const int num_loops = 1000000000 / (w + h);
+
+ highbd_comp_mask_pred_func funcs[2] = { aom_highbd_comp_mask_pred_c,
+ test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ highbd_comp_mask_pred_func func = funcs[i];
+ for (int j = 0; j < num_loops; ++j) {
+ func(CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1HighbdCompMaskVarianceTest, CheckOutput) {
+ // inv = 0, 1
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+TEST_P(AV1HighbdCompMaskVarianceTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1HighbdCompMaskVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AV1HighbdCompMaskVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+#ifndef aom_highbd_comp_mask_pred
+// can't run this test if aom_highbd_comp_mask_pred is defined to
+// aom_highbd_comp_mask_pred_c
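+// As above, the aom_highbd_comp_mask_pred function pointer is swapped between
+// the C reference and the implementation under test.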
+class AV1HighbdCompMaskUpVarianceTest : public AV1HighbdCompMaskVarianceTest {
+ public:
+ ~AV1HighbdCompMaskUpVarianceTest();
+
+ protected:
+ void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+ int inv);
+ void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize,
+ int havSub);
+};
+
+AV1HighbdCompMaskUpVarianceTest::~AV1HighbdCompMaskUpVarianceTest() { ; }
+
+void AV1HighbdCompMaskUpVarianceTest::RunCheckOutput(
+ highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) {
+ int bd_ = GET_PARAM(2);
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+
+ int subpel_search;
+ for (subpel_search = 1; subpel_search <= 2; ++subpel_search) {
+ // loop through subx and suby
+ for (int sub = 0; sub < 8 * 8; ++sub) {
+ int subx = sub & 0x7;
+ int suby = (sub >> 3);
+ for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ aom_highbd_comp_mask_pred = aom_highbd_comp_mask_pred_c; // ref
+ aom_highbd_comp_mask_upsampled_pred(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_),
+ CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_,
+ subpel_search);
+
+ aom_highbd_comp_mask_pred = test_impl; // test
+ aom_highbd_comp_mask_upsampled_pred(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred2_),
+ CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby,
+ CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv, bd_,
+ subpel_search);
+ ASSERT_EQ(CheckResult(w, h), true)
+              << " wedge " << wedge_index << " inv " << inv << " sub (" << subx
+ << "," << suby << ")";
+ }
+ }
+ }
+}
+
+void AV1HighbdCompMaskUpVarianceTest::RunSpeedTest(
+ highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int havSub) {
+ int bd_ = GET_PARAM(2);
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int subx = havSub ? 3 : 0;
+ const int suby = havSub ? 4 : 0;
+
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ int wedge_index = wedge_types / 2;
+ const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+ for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+ ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1);
+ }
+
+ const int num_loops = 1000000000 / (w + h);
+ highbd_comp_mask_pred_func funcs[2] = { &aom_highbd_comp_mask_pred_c,
+ test_impl };
+ double elapsed_time[2] = { 0 };
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ aom_highbd_comp_mask_pred = funcs[i];
+ int subpel_search = 2; // set to 1 to test 4-tap filter.
+ for (int j = 0; j < num_loops; ++j) {
+ aom_highbd_comp_mask_upsampled_pred(
+ NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_),
+ CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby, CONVERT_TO_BYTEPTR(ref_),
+ MAX_SB_SIZE, mask, w, 0, bd_, subpel_search);
+ }
+ aom_usec_timer_mark(&timer);
+ double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ elapsed_time[i] = 1000.0 * time / num_loops;
+ }
+ printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0],
+ elapsed_time[1]);
+ printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1HighbdCompMaskUpVarianceTest, CheckOutput) {
+ // inv mask = 0, 1
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0);
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+TEST_P(AV1HighbdCompMaskUpVarianceTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1HighbdCompMaskUpVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AV1HighbdCompMaskUpVarianceTest,
+ ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2),
+ ::testing::ValuesIn(kValidBlockSize),
+ ::testing::Range(8, 13, 2)));
+#endif
+
+#endif // ifndef aom_highbd_comp_mask_pred
+} // namespace AV1CompMaskVariance
diff --git a/third_party/aom/test/convolve_round_test.cc b/third_party/aom/test/convolve_round_test.cc
new file mode 100644
index 000000000..2f801e7d4
--- /dev/null
+++ b/third_party/aom/test/convolve_round_test.cc
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
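+// The low and high bit-depth rounding functions share this parameter list; the
+// high bit-depth versions take an extra bit depth, which the templates below
+// bind to 8, 10 or 12 so both fit the ConvolveRoundFunc signature.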
+#define CONVOLVE_ROUNDING_PARAM \
+ const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, \
+ int h, int bits
+
+typedef void (*ConvolveRoundFunc)(CONVOLVE_ROUNDING_PARAM);
+
+typedef void (*ConvolveRoundFuncHbd)(CONVOLVE_ROUNDING_PARAM, int bd);
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_8(CONVOLVE_ROUNDING_PARAM) {
+ const int bd = 8;
+ fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_10(CONVOLVE_ROUNDING_PARAM) {
+ const int bd = 10;
+ fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_12(CONVOLVE_ROUNDING_PARAM) {
+ const int bd = 12;
+ fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
+
+using ::testing::tuple;
+
+typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
+ ConvolveRoundParam;
+
+const int kTestNum = 5000;
+
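+// Checks that the optimized rounding function is bit-exact with the C
+// reference on random inputs over all block sizes and varying rounding shifts.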
+class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
+ protected:
+ ConvolveRoundTest()
+ : func_ref_(GET_PARAM(0)), func_(GET_PARAM(1)), data_path_(GET_PARAM(2)) {
+ }
+ virtual ~ConvolveRoundTest() {}
+
+ virtual void SetUp() {
+ const size_t block_size = 128 * 128;
+ src_ = reinterpret_cast<int32_t *>(
+ aom_memalign(16, block_size * sizeof(*src_)));
+ dst_ref_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, block_size * sizeof(*dst_ref_)));
+ dst_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, block_size * sizeof(*dst_)));
+ }
+
+ virtual void TearDown() {
+ aom_free(src_);
+ aom_free(dst_ref_);
+ aom_free(dst_);
+ }
+
+ void ConvolveRoundingRun() {
+ int test_num = 0;
+ const int src_stride = 128;
+ const int dst_stride = 128;
+ int bits = 13;
+ uint8_t *dst = 0;
+ uint8_t *dst_ref = 0;
+
+ if (data_path_ == LOWBITDEPTH_TEST) {
+ dst = reinterpret_cast<uint8_t *>(dst_);
+ dst_ref = reinterpret_cast<uint8_t *>(dst_ref_);
+ } else if (data_path_ == HIGHBITDEPTH_TEST) {
+ dst = CONVERT_TO_BYTEPTR(dst_);
+ dst_ref = CONVERT_TO_BYTEPTR(dst_ref_);
+ } else {
+ assert(0);
+ }
+
+ while (test_num < kTestNum) {
+ int block_size = test_num % BLOCK_SIZES_ALL;
+ int w = block_size_wide[block_size];
+ int h = block_size_high[block_size];
+
+ if (test_num % 2 == 0)
+ bits -= 1;
+ else
+ bits += 1;
+
+ GenerateBufferWithRandom(src_, src_stride, bits, w, h);
+
+ func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
+ ASM_REGISTER_STATE_CHECK(
+ func_(src_, src_stride, dst, dst_stride, w, h, bits));
+
+ if (data_path_ == LOWBITDEPTH_TEST) {
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < w; ++c) {
+ ASSERT_EQ(dst_ref[r * dst_stride + c], dst[r * dst_stride + c])
+ << "Mismatch at r: " << r << " c: " << c << " w: " << w
+ << " h: " << h << " test: " << test_num;
+ }
+ }
+ } else {
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < w; ++c) {
+ ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
+ << "Mismatch at r: " << r << " c: " << c << " w: " << w
+ << " h: " << h << " test: " << test_num;
+ }
+ }
+ }
+
+ test_num++;
+ }
+ }
+
+ void GenerateBufferWithRandom(int32_t *src, int src_stride, int bits, int w,
+ int h) {
+ int32_t number;
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < w; ++c) {
+ number = static_cast<int32_t>(rand_.Rand31());
+ number %= 1 << (bits + 9);
+ src[r * src_stride + c] = number;
+ }
+ }
+ }
+
+ ACMRandom rand_;
+ int32_t *src_;
+ uint16_t *dst_ref_;
+ uint16_t *dst_;
+
+ ConvolveRoundFunc func_ref_;
+ ConvolveRoundFunc func_;
+ DataPathType data_path_;
+};
+
+TEST_P(ConvolveRoundTest, BitExactCheck) { ConvolveRoundingRun(); }
+
+using ::testing::make_tuple;
+#if HAVE_AVX2
+const ConvolveRoundParam kConvRndParamArray[] = {
+ make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,
+ LOWBITDEPTH_TEST),
+ make_tuple(&highbd_convolve_rounding_8<av1_highbd_convolve_rounding_c>,
+ &highbd_convolve_rounding_8<av1_highbd_convolve_rounding_avx2>,
+ HIGHBITDEPTH_TEST),
+ make_tuple(&highbd_convolve_rounding_10<av1_highbd_convolve_rounding_c>,
+ &highbd_convolve_rounding_10<av1_highbd_convolve_rounding_avx2>,
+ HIGHBITDEPTH_TEST),
+ make_tuple(&highbd_convolve_rounding_12<av1_highbd_convolve_rounding_c>,
+ &highbd_convolve_rounding_12<av1_highbd_convolve_rounding_avx2>,
+ HIGHBITDEPTH_TEST)
+};
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveRoundTest,
+ ::testing::ValuesIn(kConvRndParamArray));
+#endif // HAVE_AVX2
+} // namespace
diff --git a/third_party/aom/test/convolve_test.cc b/third_party/aom/test/convolve_test.cc
new file mode 100644
index 000000000..de3f47628
--- /dev/null
+++ b/third_party/aom/test/convolve_test.cc
@@ -0,0 +1,856 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/filter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+static const unsigned int kMaxDimension = MAX_SB_SIZE;
+
+typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h);
+
+struct ConvolveFunctions {
+ ConvolveFunctions(ConvolveFunc copy, ConvolveFunc h8, ConvolveFunc v8, int bd)
+ : copy_(copy), h8_(h8), v8_(v8), use_highbd_(bd) {}
+
+ ConvolveFunc copy_;
+ ConvolveFunc h8_;
+ ConvolveFunc v8_;
+ int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth.
+};
+
+typedef ::testing::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+
+#define ALL_SIZES_64(convolve_fn) \
+ make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn), \
+ make_tuple(4, 8, &convolve_fn), make_tuple(8, 8, &convolve_fn), \
+ make_tuple(16, 8, &convolve_fn), make_tuple(8, 16, &convolve_fn), \
+ make_tuple(16, 16, &convolve_fn), make_tuple(32, 16, &convolve_fn), \
+ make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \
+ make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \
+ make_tuple(64, 64, &convolve_fn)
+
+#define ALL_SIZES(convolve_fn) \
+ make_tuple(128, 64, &convolve_fn), make_tuple(64, 128, &convolve_fn), \
+ make_tuple(128, 128, &convolve_fn), ALL_SIZES_64(convolve_fn)
+
+// Reference 8-tap subpixel filter, slightly modified to fit into this test.
+#define AV1_FILTER_WEIGHT 128
+#define AV1_FILTER_SHIFT 7
+uint8_t clip_pixel(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }
+
+void filter_block2d_8_c(const uint8_t *src_ptr, unsigned int src_stride,
+ const int16_t *HFilter, const int16_t *VFilter,
+ uint8_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width, unsigned int output_height) {
+ // Between passes, we use an intermediate buffer whose height is extended to
+ // have enough horizontally filtered values as input for the vertical pass.
+ // This buffer is allocated to be big enough for the largest block type we
+ // support.
+ const int kInterp_Extend = 4;
+ const unsigned int intermediate_height =
+ (kInterp_Extend - 1) + output_height + kInterp_Extend;
+ unsigned int i, j;
+
+ assert(intermediate_height > 7);
+
+  // intermediate_buffer must hold up to output_width columns of
+  // intermediate_height values, where intermediate_height is at most
+  // (kInterp_Extend - 1) + kMaxDimension + kInterp_Extend = kMaxDimension + 7,
+  // so (kMaxDimension + 8) * kMaxDimension elements are sufficient.
+ uint8_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
+ const int intermediate_next_stride =
+ 1 - static_cast<int>(intermediate_height * output_width);
+
+ // Horizontal pass (src -> transposed intermediate).
+ uint8_t *output_ptr = intermediate_buffer;
+ const int src_next_row_stride = src_stride - output_width;
+ src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ // Apply filter...
+ const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) +
+ (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) +
+ (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) +
+ (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) +
+ (AV1_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize back to 0-255...
+ *output_ptr = clip_pixel(temp >> AV1_FILTER_SHIFT);
+ ++src_ptr;
+ output_ptr += intermediate_height;
+ }
+ src_ptr += src_next_row_stride;
+ output_ptr += intermediate_next_stride;
+ }
+
+ // Vertical pass (transposed intermediate -> dst).
+ src_ptr = intermediate_buffer;
+ const int dst_next_row_stride = dst_stride - output_width;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ // Apply filter...
+ const int temp = (src_ptr[0] * VFilter[0]) + (src_ptr[1] * VFilter[1]) +
+ (src_ptr[2] * VFilter[2]) + (src_ptr[3] * VFilter[3]) +
+ (src_ptr[4] * VFilter[4]) + (src_ptr[5] * VFilter[5]) +
+ (src_ptr[6] * VFilter[6]) + (src_ptr[7] * VFilter[7]) +
+ (AV1_FILTER_WEIGHT >> 1); // Rounding
+
+ // Normalize back to 0-255...
+ *dst_ptr++ = clip_pixel(temp >> AV1_FILTER_SHIFT);
+ src_ptr += intermediate_height;
+ }
+ src_ptr += intermediate_next_stride;
+ dst_ptr += dst_next_row_stride;
+ }
+}
+
+void block2d_average_c(uint8_t *src, unsigned int src_stride,
+ uint8_t *output_ptr, unsigned int output_stride,
+ unsigned int output_width, unsigned int output_height) {
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+ }
+ output_ptr += output_stride;
+ }
+}
+
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+ const unsigned int src_stride,
+ const int16_t *HFilter, const int16_t *VFilter,
+ uint8_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width,
+ unsigned int output_height) {
+ uint8_t tmp[kMaxDimension * kMaxDimension];
+
+ assert(output_width <= kMaxDimension);
+ assert(output_height <= kMaxDimension);
+ filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
+ output_width, output_height);
+ block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride, output_width,
+ output_height);
+}
+
+void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
+ const unsigned int src_stride,
+ const int16_t *HFilter, const int16_t *VFilter,
+ uint16_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width,
+ unsigned int output_height, int bd) {
+ // Between passes, we use an intermediate buffer whose height is extended to
+ // have enough horizontally filtered values as input for the vertical pass.
+ // This buffer is allocated to be big enough for the largest block type we
+ // support.
+ const int kInterp_Extend = 4;
+ const unsigned int intermediate_height =
+ (kInterp_Extend - 1) + output_height + kInterp_Extend;
+
+  /* intermediate_buffer must hold up to output_width columns of
+   * intermediate_height values, where intermediate_height is at most
+   * (kInterp_Extend - 1) + kMaxDimension + kInterp_Extend = kMaxDimension + 7,
+   * so (kMaxDimension + 8) * kMaxDimension elements are sufficient.
+   */
+ uint16_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension] = { 0 };
+ const int intermediate_next_stride =
+ 1 - static_cast<int>(intermediate_height * output_width);
+
+ // Horizontal pass (src -> transposed intermediate).
+ {
+ uint16_t *output_ptr = intermediate_buffer;
+ const int src_next_row_stride = src_stride - output_width;
+ unsigned int i, j;
+ src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ // Apply filter...
+ const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) +
+ (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) +
+ (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) +
+ (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) +
+ (AV1_FILTER_WEIGHT >> 1); // Rounding
+
+        // Clamp to the range allowed by bit depth bd...
+ *output_ptr = clip_pixel_highbd(temp >> AV1_FILTER_SHIFT, bd);
+ ++src_ptr;
+ output_ptr += intermediate_height;
+ }
+ src_ptr += src_next_row_stride;
+ output_ptr += intermediate_next_stride;
+ }
+ }
+
+ // Vertical pass (transposed intermediate -> dst).
+ {
+ const uint16_t *interm_ptr = intermediate_buffer;
+ const int dst_next_row_stride = dst_stride - output_width;
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ // Apply filter...
+ const int temp =
+ (interm_ptr[0] * VFilter[0]) + (interm_ptr[1] * VFilter[1]) +
+ (interm_ptr[2] * VFilter[2]) + (interm_ptr[3] * VFilter[3]) +
+ (interm_ptr[4] * VFilter[4]) + (interm_ptr[5] * VFilter[5]) +
+ (interm_ptr[6] * VFilter[6]) + (interm_ptr[7] * VFilter[7]) +
+ (AV1_FILTER_WEIGHT >> 1); // Rounding
+
+        // Clamp to the range allowed by bit depth bd...
+ *dst_ptr++ = clip_pixel_highbd(temp >> AV1_FILTER_SHIFT, bd);
+ interm_ptr += intermediate_height;
+ }
+ interm_ptr += intermediate_next_stride;
+ dst_ptr += dst_next_row_stride;
+ }
+ }
+}
+
+void highbd_block2d_average_c(uint16_t *src, unsigned int src_stride,
+ uint16_t *output_ptr, unsigned int output_stride,
+ unsigned int output_width,
+ unsigned int output_height) {
+ unsigned int i, j;
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+ }
+ output_ptr += output_stride;
+ }
+}
+
+void highbd_filter_average_block2d_8_c(
+ const uint16_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+ const int16_t *VFilter, uint16_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width, unsigned int output_height, int bd) {
+ uint16_t tmp[kMaxDimension * kMaxDimension];
+
+ assert(output_width <= kMaxDimension);
+ assert(output_height <= kMaxDimension);
+ highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp,
+ kMaxDimension, output_width, output_height, bd);
+ highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
+ output_width, output_height);
+}
+
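+// Test fixture that places the block under test inside a larger bordered
+// buffer so that writes outside the block can be detected via guard values.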
+class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
+ public:
+ static void SetUpTestCase() {
+ // Force input_ to be unaligned, output to be 16 byte aligned.
+ input_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kInputBufferSize + 1)) +
+ 1;
+ output_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kOutputBufferSize));
+ output_ref_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kOutputBufferSize));
+ input16_ = reinterpret_cast<uint16_t *>(aom_memalign(
+ kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t))) +
+ 1;
+ output16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+ output16_ref_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ static void TearDownTestCase() {
+ aom_free(input_ - 1);
+ input_ = NULL;
+ aom_free(output_);
+ output_ = NULL;
+ aom_free(output_ref_);
+ output_ref_ = NULL;
+ aom_free(input16_ - 1);
+ input16_ = NULL;
+ aom_free(output16_);
+ output16_ = NULL;
+ aom_free(output16_ref_);
+ output16_ref_ = NULL;
+ }
+
+ protected:
+ static const int kDataAlignment = 16;
+ static const int kOuterBlockSize = 4 * kMaxDimension;
+ static const int kInputStride = kOuterBlockSize;
+ static const int kOutputStride = kOuterBlockSize;
+ static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+ static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
+
+ int Width() const { return GET_PARAM(0); }
+ int Height() const { return GET_PARAM(1); }
+ int BorderLeft() const {
+ const int center = (kOuterBlockSize - Width()) / 2;
+ return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+ }
+ int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+
+ bool IsIndexInBorder(int i) {
+ return (i < BorderTop() * kOuterBlockSize ||
+ i >= (BorderTop() + Height()) * kOuterBlockSize ||
+ i % kOuterBlockSize < BorderLeft() ||
+ i % kOuterBlockSize >= (BorderLeft() + Width()));
+ }
+
+ virtual void SetUp() {
+ UUT_ = GET_PARAM(2);
+ if (UUT_->use_highbd_ != 0)
+ mask_ = (1 << UUT_->use_highbd_) - 1;
+ else
+ mask_ = 255;
+ /* Set up guard blocks for an inner block centered in the outer block */
+ for (int i = 0; i < kOutputBufferSize; ++i) {
+ if (IsIndexInBorder(i)) {
+ output_[i] = 255;
+ output16_[i] = mask_;
+ } else {
+ output_[i] = 0;
+ output16_[i] = 0;
+ }
+ }
+
+ ::libaom_test::ACMRandom prng;
+ for (int i = 0; i < kInputBufferSize; ++i) {
+ if (i & 1) {
+ input_[i] = 255;
+ input16_[i] = mask_;
+ } else {
+ input_[i] = prng.Rand8Extremes();
+ input16_[i] = prng.Rand16() & mask_;
+ }
+ }
+ }
+
+ void SetConstantInput(int value) {
+ memset(input_, value, kInputBufferSize);
+ aom_memset16(input16_, value, kInputBufferSize);
+ }
+
+ void CopyOutputToRef() {
+ memcpy(output_ref_, output_, kOutputBufferSize);
+    // Copy 16-bit pixel values. The effective number of bytes is double.
+ memcpy(output16_ref_, output16_, sizeof(output16_[0]) * kOutputBufferSize);
+ }
+
+ void CheckGuardBlocks() {
+ for (int i = 0; i < kOutputBufferSize; ++i) {
+ if (IsIndexInBorder(i)) {
+ EXPECT_EQ(255, output_[i]);
+ }
+ }
+ }
+
+ uint8_t *input() const {
+ const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+ if (UUT_->use_highbd_ == 0) {
+ return input_ + offset;
+ } else {
+ return CONVERT_TO_BYTEPTR(input16_) + offset;
+ }
+ }
+
+ uint8_t *output() const {
+ const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+ if (UUT_->use_highbd_ == 0) {
+ return output_ + offset;
+ } else {
+ return CONVERT_TO_BYTEPTR(output16_) + offset;
+ }
+ }
+
+ uint8_t *output_ref() const {
+ const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+ if (UUT_->use_highbd_ == 0) {
+ return output_ref_ + offset;
+ } else {
+ return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
+ }
+ }
+
+ uint16_t lookup(uint8_t *list, int index) const {
+ if (UUT_->use_highbd_ == 0) {
+ return list[index];
+ } else {
+ return CONVERT_TO_SHORTPTR(list)[index];
+ }
+ }
+
+ void assign_val(uint8_t *list, int index, uint16_t val) const {
+ if (UUT_->use_highbd_ == 0) {
+ list[index] = (uint8_t)val;
+ } else {
+ CONVERT_TO_SHORTPTR(list)[index] = val;
+ }
+ }
+
+ void wrapper_filter_average_block2d_8_c(
+ const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+ const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width, unsigned int output_height) {
+ if (UUT_->use_highbd_ == 0) {
+ filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
+ dst_stride, output_width, output_height);
+ } else {
+ highbd_filter_average_block2d_8_c(
+ CONVERT_TO_SHORTPTR(src_ptr), src_stride, HFilter, VFilter,
+ CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height,
+ UUT_->use_highbd_);
+ }
+ }
+
+ void wrapper_filter_block2d_8_c(
+ const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+ const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
+ unsigned int output_width, unsigned int output_height) {
+ if (UUT_->use_highbd_ == 0) {
+ filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
+ dst_stride, output_width, output_height);
+ } else {
+ highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+ HFilter, VFilter, CONVERT_TO_SHORTPTR(dst_ptr),
+ dst_stride, output_width, output_height,
+ UUT_->use_highbd_);
+ }
+ }
+
+ const ConvolveFunctions *UUT_;
+ static uint8_t *input_;
+ static uint8_t *output_;
+ static uint8_t *output_ref_;
+ static uint16_t *input16_;
+ static uint16_t *output16_;
+ static uint16_t *output16_ref_;
+ int mask_;
+};
+
+uint8_t *ConvolveTest::input_ = NULL;
+uint8_t *ConvolveTest::output_ = NULL;
+uint8_t *ConvolveTest::output_ref_ = NULL;
+uint16_t *ConvolveTest::input16_ = NULL;
+uint16_t *ConvolveTest::output16_ = NULL;
+uint16_t *ConvolveTest::output16_ref_ = NULL;
+
+TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
+
+TEST_P(ConvolveTest, Copy) {
+ uint8_t *const in = input();
+ uint8_t *const out = output();
+
+ ASM_REGISTER_STATE_CHECK(UUT_->copy_(in, kInputStride, out, kOutputStride,
+ NULL, 0, NULL, 0, Width(), Height()));
+
+ CheckGuardBlocks();
+
+ for (int y = 0; y < Height(); ++y)
+ for (int x = 0; x < Width(); ++x)
+ ASSERT_EQ(lookup(out, y * kOutputStride + x),
+ lookup(in, y * kInputStride + x))
+ << "(" << x << "," << y << ")";
+}
+
+const int kNumFilterBanks = SWITCHABLE_FILTERS;
+const int kNumFilters = 16;
+
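+// Every 8-tap kernel must sum to AV1_FILTER_WEIGHT (128), and no pairwise
+// partial sum may exceed it, so pairwise additions cannot saturate.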
+TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
+ for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+ const InterpFilter filter = (InterpFilter)filter_bank;
+ const InterpKernel *filters =
+ (const InterpKernel *)av1_get_interp_filter_kernel(filter);
+ const InterpFilterParams *filter_params =
+ av1_get_interp_filter_params_with_block_size(filter, 8);
+ if (filter_params->taps != SUBPEL_TAPS) continue;
+ for (int i = 0; i < kNumFilters; i++) {
+ const int p0 = filters[i][0] + filters[i][1];
+ const int p1 = filters[i][2] + filters[i][3];
+ const int p2 = filters[i][4] + filters[i][5];
+ const int p3 = filters[i][6] + filters[i][7];
+ EXPECT_LE(p0, 128);
+ EXPECT_LE(p1, 128);
+ EXPECT_LE(p2, 128);
+ EXPECT_LE(p3, 128);
+ EXPECT_LE(p0 + p3, 128);
+ EXPECT_LE(p0 + p3 + p1, 128);
+ EXPECT_LE(p0 + p3 + p1 + p2, 128);
+ EXPECT_EQ(p0 + p1 + p2 + p3, 128);
+ }
+ }
+}
+
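+// All-zero kernel passed in place of any filter argument that the call under
+// test is expected to ignore.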
+const int16_t kInvalidFilter[8] = { 0 };
+
+TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
+ uint8_t *const in = input();
+ uint8_t *const out = output();
+ uint8_t ref8[kOutputStride * kMaxDimension];
+ uint16_t ref16[kOutputStride * kMaxDimension];
+ uint8_t *ref;
+ if (UUT_->use_highbd_ == 0) {
+ ref = ref8;
+ } else {
+ ref = CONVERT_TO_BYTEPTR(ref16);
+ }
+
+ for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+ const InterpFilter filter = (InterpFilter)filter_bank;
+ const InterpKernel *filters =
+ (const InterpKernel *)av1_get_interp_filter_kernel(filter);
+ const InterpFilterParams *filter_params =
+ av1_get_interp_filter_params_with_block_size(filter, 8);
+ if (filter_params->taps != SUBPEL_TAPS) continue;
+
+ for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+ for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+ wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
+ filters[filter_y], ref, kOutputStride,
+ Width(), Height());
+
+ if (filter_x && filter_y)
+ continue;
+ else if (filter_y)
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
+ 16, filters[filter_y], 16, Width(), Height()));
+ else if (filter_x)
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->h8_(in, kInputStride, out, kOutputStride, filters[filter_x],
+ 16, kInvalidFilter, 16, Width(), Height()));
+ else
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->copy_(in, kInputStride, out, kOutputStride, kInvalidFilter,
+ 0, kInvalidFilter, 0, Width(), Height()));
+
+ CheckGuardBlocks();
+
+ for (int y = 0; y < Height(); ++y)
+ for (int x = 0; x < Width(); ++x)
+ ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+ lookup(out, y * kOutputStride + x))
+ << "mismatch at (" << x << "," << y << "), "
+ << "filters (" << filter_bank << "," << filter_x << ","
+ << filter_y << ")";
+ }
+ }
+ }
+}
+
+TEST_P(ConvolveTest, FilterExtremes) {
+ uint8_t *const in = input();
+ uint8_t *const out = output();
+ uint8_t ref8[kOutputStride * kMaxDimension];
+ uint16_t ref16[kOutputStride * kMaxDimension];
+ uint8_t *ref;
+ if (UUT_->use_highbd_ == 0) {
+ ref = ref8;
+ } else {
+ ref = CONVERT_TO_BYTEPTR(ref16);
+ }
+
+ // Populate ref and out with some random data
+ ::libaom_test::ACMRandom prng;
+ for (int y = 0; y < Height(); ++y) {
+ for (int x = 0; x < Width(); ++x) {
+ uint16_t r;
+ if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+ r = prng.Rand8Extremes();
+ } else {
+ r = prng.Rand16() & mask_;
+ }
+ assign_val(out, y * kOutputStride + x, r);
+ assign_val(ref, y * kOutputStride + x, r);
+ }
+ }
+
+ for (int axis = 0; axis < 2; axis++) {
+ int seed_val = 0;
+ while (seed_val < 256) {
+ for (int y = 0; y < 8; ++y) {
+ for (int x = 0; x < 8; ++x) {
+ assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
+ ((seed_val >> (axis ? y : x)) & 1) * mask_);
+ if (axis) seed_val++;
+ }
+ if (axis)
+ seed_val -= 8;
+ else
+ seed_val++;
+ }
+ if (axis) seed_val += 8;
+
+ for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+ const InterpFilter filter = (InterpFilter)filter_bank;
+ const InterpKernel *filters =
+ (const InterpKernel *)av1_get_interp_filter_kernel(filter);
+ const InterpFilterParams *filter_params =
+ av1_get_interp_filter_params_with_block_size(filter, 8);
+ if (filter_params->taps != SUBPEL_TAPS) continue;
+ for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+ for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+ wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
+ filters[filter_y], ref, kOutputStride,
+ Width(), Height());
+ if (filter_x && filter_y)
+ continue;
+ else if (filter_y)
+ ASM_REGISTER_STATE_CHECK(UUT_->v8_(
+ in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
+ filters[filter_y], 16, Width(), Height()));
+ else if (filter_x)
+ ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+ in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+ kInvalidFilter, 16, Width(), Height()));
+ else
+ ASM_REGISTER_STATE_CHECK(UUT_->copy_(
+ in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
+ kInvalidFilter, 0, Width(), Height()));
+
+ for (int y = 0; y < Height(); ++y)
+ for (int x = 0; x < Width(); ++x)
+ ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+ lookup(out, y * kOutputStride + x))
+ << "mismatch at (" << x << "," << y << "), "
+ << "filters (" << filter_bank << "," << filter_x << ","
+ << filter_y << ")";
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
+ const uint8_t *const in = input();
+ uint8_t *const out = output();
+ const int kNumTests = 5000000;
+ const int width = Width();
+ const int height = Height();
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, width,
+ height);
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
+ UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Speed) {
+ uint8_t *const in = input();
+ uint8_t *const out = output();
+ uint8_t ref8[kOutputStride * kMaxDimension];
+ uint16_t ref16[kOutputStride * kMaxDimension];
+ uint8_t *ref;
+ if (UUT_->use_highbd_ == 0) {
+ ref = ref8;
+ } else {
+ ref = CONVERT_TO_BYTEPTR(ref16);
+ }
+
+ // Populate ref and out with some random data
+ ::libaom_test::ACMRandom prng;
+ for (int y = 0; y < Height(); ++y) {
+ for (int x = 0; x < Width(); ++x) {
+ uint16_t r;
+ if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+ r = prng.Rand8Extremes();
+ } else {
+ r = prng.Rand16() & mask_;
+ }
+ assign_val(out, y * kOutputStride + x, r);
+ assign_val(ref, y * kOutputStride + x, r);
+ }
+ }
+
+ const InterpFilter filter = (InterpFilter)1;
+ const InterpKernel *filters =
+ (const InterpKernel *)av1_get_interp_filter_kernel(filter);
+ wrapper_filter_average_block2d_8_c(in, kInputStride, filters[1], filters[1],
+ out, kOutputStride, Width(), Height());
+
+ aom_usec_timer timer;
+ int tests_num = 1000;
+
+ aom_usec_timer_start(&timer);
+ while (tests_num > 0) {
+ for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+ const InterpFilter filter = (InterpFilter)filter_bank;
+ const InterpKernel *filters =
+ (const InterpKernel *)av1_get_interp_filter_kernel(filter);
+ const InterpFilterParams *filter_params =
+ av1_get_interp_filter_params_with_block_size(filter, 8);
+ if (filter_params->taps != SUBPEL_TAPS) continue;
+
+ for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+ for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+ if (filter_x && filter_y) continue;
+ if (filter_y)
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
+ 16, filters[filter_y], 16, Width(), Height()));
+ else if (filter_x)
+ ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+ in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+ kInvalidFilter, 16, Width(), Height()));
+ }
+ }
+ }
+ tests_num--;
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+ printf("%dx%d (bitdepth %d) time: %5d ms\n", Width(), Height(),
+ UUT_->use_highbd_, elapsed_time);
+}
+
+using ::testing::make_tuple;
+
+#define WRAP(func, bd) \
+ static void wrap_##func##_##bd( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \
+ const int16_t *filter_y, int filter_y_stride, int w, int h) { \
+ aom_highbd_##func(src, src_stride, dst, dst_stride, filter_x, \
+ filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
+ }
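+// WRAP binds a fixed bit depth so the high bit-depth convolve functions can be
+// exercised through the 8-bit ConvolveFunc signature.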
+#if HAVE_SSE2 && ARCH_X86_64
+WRAP(convolve_copy_sse2, 8)
+WRAP(convolve_copy_sse2, 10)
+WRAP(convolve_copy_sse2, 12)
+WRAP(convolve8_horiz_sse2, 8)
+WRAP(convolve8_vert_sse2, 8)
+WRAP(convolve8_horiz_sse2, 10)
+WRAP(convolve8_vert_sse2, 10)
+WRAP(convolve8_horiz_sse2, 12)
+WRAP(convolve8_vert_sse2, 12)
+#endif // HAVE_SSE2 && ARCH_X86_64
+
+WRAP(convolve_copy_c, 8)
+WRAP(convolve8_horiz_c, 8)
+WRAP(convolve8_vert_c, 8)
+WRAP(convolve_copy_c, 10)
+WRAP(convolve8_horiz_c, 10)
+WRAP(convolve8_vert_c, 10)
+WRAP(convolve_copy_c, 12)
+WRAP(convolve8_horiz_c, 12)
+WRAP(convolve8_vert_c, 12)
+
+#if HAVE_AVX2
+WRAP(convolve_copy_avx2, 8)
+WRAP(convolve8_horiz_avx2, 8)
+WRAP(convolve8_vert_avx2, 8)
+
+WRAP(convolve_copy_avx2, 10)
+WRAP(convolve8_horiz_avx2, 10)
+WRAP(convolve8_vert_avx2, 10)
+
+WRAP(convolve_copy_avx2, 12)
+WRAP(convolve8_horiz_avx2, 12)
+WRAP(convolve8_vert_avx2, 12)
+#endif // HAVE_AVX2
+
+#undef WRAP
+
+const ConvolveFunctions convolve8_c(wrap_convolve_copy_c_8,
+ wrap_convolve8_horiz_c_8,
+ wrap_convolve8_vert_c_8, 8);
+const ConvolveFunctions convolve10_c(wrap_convolve_copy_c_10,
+ wrap_convolve8_horiz_c_10,
+ wrap_convolve8_vert_c_10, 10);
+const ConvolveFunctions convolve12_c(wrap_convolve_copy_c_12,
+ wrap_convolve8_horiz_c_12,
+ wrap_convolve8_vert_c_12, 12);
+const ConvolveParam kArrayConvolve_c[] = {
+ ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c)
+};
+
+INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_c));
+
+#if HAVE_SSE2 && ARCH_X86_64
+const ConvolveFunctions convolve8_sse2(wrap_convolve_copy_sse2_8,
+ wrap_convolve8_horiz_sse2_8,
+ wrap_convolve8_vert_sse2_8, 8);
+const ConvolveFunctions convolve10_sse2(wrap_convolve_copy_sse2_10,
+ wrap_convolve8_horiz_sse2_10,
+ wrap_convolve8_vert_sse2_10, 10);
+const ConvolveFunctions convolve12_sse2(wrap_convolve_copy_sse2_12,
+ wrap_convolve8_horiz_sse2_12,
+ wrap_convolve8_vert_sse2_12, 12);
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2),
+ ALL_SIZES(convolve10_sse2),
+ ALL_SIZES(convolve12_sse2) };
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sse2));
+#endif
+
+#if HAVE_SSSE3
+const ConvolveFunctions convolve8_ssse3(aom_convolve_copy_c,
+ aom_convolve8_horiz_ssse3,
+ aom_convolve8_vert_ssse3, 0);
+
+const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
+INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve8_ssse3));
+#endif
+
+#if HAVE_AVX2
+const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c,
+ aom_convolve8_horiz_avx2,
+ aom_convolve8_vert_avx2, 0);
+
+const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve_copy_avx2_8,
+ wrap_convolve8_horiz_avx2_8,
+ wrap_convolve8_vert_avx2_8, 8);
+const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve_copy_avx2_10,
+ wrap_convolve8_horiz_avx2_10,
+ wrap_convolve8_vert_avx2_10, 10);
+const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve_copy_avx2_12,
+ wrap_convolve8_horiz_avx2_12,
+ wrap_convolve8_vert_avx2_12, 12);
+const ConvolveParam kArray_Convolve8_avx2[] = {
+ ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2),
+ ALL_SIZES_64(wrap_convolve12_avx2), ALL_SIZES(convolve8_avx2)
+};
+INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
+ ::testing::ValuesIn(kArray_Convolve8_avx2));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/corner_match_test.cc b/third_party/aom/test/corner_match_test.cc
new file mode 100644
index 000000000..58e3139c5
--- /dev/null
+++ b/third_party/aom/test/corner_match_test.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/av1_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+
+#include "av1/encoder/corner_match.h"
+
+namespace test_libaom {
+
+namespace AV1CornerMatch {
+
+using libaom_test::ACMRandom;
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+typedef tuple<int> CornerMatchParam;
+
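+// Compares compute_cross_correlation_sse4_1 against the C reference on both
+// uncorrelated and strongly correlated inputs at random match positions.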
+class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
+ public:
+ virtual ~AV1CornerMatchTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput();
+
+ libaom_test::ACMRandom rnd_;
+};
+
+AV1CornerMatchTest::~AV1CornerMatchTest() {}
+void AV1CornerMatchTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+void AV1CornerMatchTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1CornerMatchTest::RunCheckOutput() {
+ const int w = 128, h = 128;
+ const int num_iters = 10000;
+ int i, j;
+
+ uint8_t *input1 = new uint8_t[w * h];
+ uint8_t *input2 = new uint8_t[w * h];
+
+ // Test the two extreme cases:
+ // i) Random data, should have correlation close to 0
+ // ii) Linearly related data + noise, should have correlation close to 1
+ int mode = GET_PARAM(0);
+ if (mode == 0) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ input1[i * w + j] = rnd_.Rand8();
+ input2[i * w + j] = rnd_.Rand8();
+ }
+ } else if (mode == 1) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ int v = rnd_.Rand8();
+ input1[i * w + j] = v;
+ input2[i * w + j] = (v / 2) + (rnd_.Rand8() & 15);
+ }
+ }
+
+ for (i = 0; i < num_iters; ++i) {
+ int x1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
+ int y1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
+ int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
+ int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
+
+ double res_c =
+ compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+ double res_sse4 =
+ compute_cross_correlation_sse4_1(input1, w, x1, y1, input2, w, x2, y2);
+
+ ASSERT_EQ(res_sse4, res_c);
+ }
+
+ delete[] input1;
+ delete[] input2;
+}
+
+TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(); }
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1CornerMatchTest,
+ ::testing::Values(make_tuple(0), make_tuple(1)));
+
+} // namespace AV1CornerMatch
+
+} // namespace test_libaom
diff --git a/third_party/aom/test/cpu_speed_test.cc b/third_party/aom/test/cpu_speed_test.cc
new file mode 100644
index 000000000..8ea3e6965
--- /dev/null
+++ b/third_party/aom/test/cpu_speed_test.cc
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPSNR = 100;
+
+class CpuSpeedTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ CpuSpeedTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
+ tune_content_(AOM_CONTENT_DEFAULT) {}
+ virtual ~CpuSpeedTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_end_usage = AOM_VBR;
+ } else {
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_CBR;
+ }
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
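+    // The speed and tuning controls only need to be applied once, early in
+    // the encode.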
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
+ }
+
+ void TestQ0();
+ void TestScreencastQ0();
+ void TestTuneScreen();
+ void TestEncodeHighBitrate();
+ void TestLowBitrate();
+
+ ::libaom_test::TestMode encoding_mode_;
+ int set_cpu_used_;
+ double min_psnr_;
+ int tune_content_;
+};
+
+void CpuSpeedTest::TestQ0() {
+  // Validate that this non-multiple-of-64-wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q. This pushes
+  // the encoder to produce lots of big partitions, which will likely
+  // extend into the border and exercise the border condition.
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_max_quantizer = 0;
+ cfg_.rc_min_quantizer = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ EXPECT_GE(min_psnr_, kMaxPSNR);
+}
+
+void CpuSpeedTest::TestScreencastQ0() {
+ ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3);
+ cfg_.g_timebase = video.timebase();
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_max_quantizer = 0;
+ cfg_.rc_min_quantizer = 0;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ EXPECT_GE(min_psnr_, kMaxPSNR);
+}
+
+void CpuSpeedTest::TestTuneScreen() {
+ ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3);
+ cfg_.g_timebase = video.timebase();
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ tune_content_ = AOM_CONTENT_SCREEN;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestEncodeHighBitrate() {
+  // Validate that this non-multiple-of-64-wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q. This pushes
+  // the encoder to produce lots of big partitions, which will likely
+  // extend into the border and exercise the border condition.
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 12000;
+ cfg_.rc_max_quantizer = 10;
+ cfg_.rc_min_quantizer = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestLowBitrate() {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q. This pushes the encoder to produce
+  // lots of small partitions, which will exercise the other condition.
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.rc_min_quantizer = 40;
+
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 10);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(CpuSpeedTest, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTest, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTest, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTest, TestLowBitrate) { TestLowBitrate(); }
+
+class CpuSpeedTestLarge : public CpuSpeedTest {};
+
+TEST_P(CpuSpeedTestLarge, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTestLarge, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTestLarge, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTestLarge, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTestLarge, TestLowBitrate) { TestLowBitrate(); }
+
+AV1_INSTANTIATE_TEST_CASE(CpuSpeedTest,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(1, 3));
+AV1_INSTANTIATE_TEST_CASE(CpuSpeedTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(0, 1));
+} // namespace
diff --git a/third_party/aom/test/datarate_test.cc b/third_party/aom/test/datarate_test.cc
new file mode 100644
index 000000000..1588d3cc1
--- /dev/null
+++ b/third_party/aom/test/datarate_test.cc
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+
+namespace {
+
+class DatarateTestLarge
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ public:
+ DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
+
+ protected:
+ virtual ~DatarateTestLarge() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ set_cpu_used_ = GET_PARAM(2);
+ ResetModel();
+ }
+
+ virtual void ResetModel() {
+ last_pts_ = 0;
+ bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+ frame_number_ = 0;
+ tot_frame_number_ = 0;
+ first_drop_ = 0;
+ num_drops_ = 0;
+ // Denoiser is off by default.
+ denoiser_on_ = 0;
+ bits_total_ = 0;
+ denoiser_offon_test_ = 0;
+ denoiser_offon_period_ = -1;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+
+ if (denoiser_offon_test_) {
+ ASSERT_GT(denoiser_offon_period_, 0)
+ << "denoiser_offon_period_ is not positive.";
+ if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+ // Flip denoiser_on_ periodically
+ denoiser_on_ ^= 1;
+ }
+ }
+
+ encoder->Control(AV1E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+ const aom_rational_t tb = video->timebase();
+ timebase_ = static_cast<double>(tb.num) / tb.den;
+ duration_ = 0;
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ // Time since last timestamp = duration.
+ aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+ if (duration > 1) {
+    // If the first drop is not set yet and we have a drop, record it now.
+ if (!first_drop_) first_drop_ = last_pts_ + 1;
+ // Update the number of frame drops.
+ num_drops_ += static_cast<int>(duration - 1);
+    // Update the counter for the total number of frames input to the
+    // encoder, including dropped frames.
+ tot_frame_number_ += static_cast<int>(duration - 1);
+ }
+
+ // Add to the buffer the bits we'd expect from a constant bitrate server.
+ bits_in_buffer_model_ += static_cast<int64_t>(
+ duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+ // Buffer should not go negative.
+ ASSERT_GE(bits_in_buffer_model_, 0)
+ << "Buffer Underrun at frame " << pkt->data.frame.pts;
+
+ const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+ // Update the total encoded bits.
+ bits_total_ += frame_size_in_bits;
+
+ // Update the most recent pts.
+ last_pts_ = pkt->data.frame.pts;
+ ++frame_number_;
+ ++tot_frame_number_;
+ }
+
+ virtual void EndPassHook(void) {
+ duration_ = (last_pts_ + 1) * timebase_;
+ // Effective file datarate:
+ effective_datarate_ = (bits_total_ / 1000.0) / duration_;
+ }
+
+ aom_codec_pts_t last_pts_;
+ double timebase_;
+ int frame_number_; // Counter for number of non-dropped/encoded frames.
+ int tot_frame_number_; // Counter for total number of input frames.
+ int64_t bits_total_;
+ double duration_;
+ double effective_datarate_;
+ int set_cpu_used_;
+ int64_t bits_in_buffer_model_;
+ aom_codec_pts_t first_drop_;
+ int num_drops_;
+ int denoiser_on_;
+ int denoiser_offon_test_;
+ int denoiser_offon_period_;
+};
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestLarge, BasicRateTargetingVBR) {
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_error_resilient = 0;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 140);
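+  // Sweep target bitrates of 400 and 800 kbps and require the measured
+  // datarate to stay within 25% of the target.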
+ for (int i = 400; i <= 800; i += 400) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.75)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.25)
+ << " The datarate for the file is greater than target by too much!";
+ }
+}
+
+// Check basic rate targeting for CBR mode.
+TEST_P(DatarateTestLarge, BasicRateTargeting) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 140);
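+  // Sweep target bitrates of 150 and 550 kbps and require the measured
+  // datarate to stay within 15% of the target.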
+ for (int i = 150; i < 800; i += 400) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+ << " The datarate for the file is greater than target by too much!";
+ }
+}
+
+// Check basic rate targeting for CBR mode with 4:4:4 input.
+TEST_P(DatarateTestLarge, BasicRateTargeting444) {
+ ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+ cfg_.g_profile = 1;
+ cfg_.g_timebase = video.timebase();
+
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 1;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = AOM_CBR;
+
+ for (int i = 250; i < 900; i += 400) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 0.85)
+ << " The datarate for the file exceeds the target by too much!";
+ ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 1.15)
+ << " The datarate for the file missed the target!"
+ << cfg_.rc_target_bitrate << " " << effective_datarate_;
+ }
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+  cfg_.rc_undershoot_pct = 20;
+ cfg_.rc_dropframe_thresh = 10;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 50;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_error_resilient = 1;
+ // TODO(marpan): Investigate datarate target failures with a smaller keyframe
+ // interval (128).
+ cfg_.kf_max_dist = 9999;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 100);
+
+ const int kDropFrameThreshTestStep = 30;
+ aom_codec_pts_t last_drop = 140;
+ int last_num_drops = 0;
+ for (int i = 40; i < 100; i += kDropFrameThreshTestStep) {
+ cfg_.rc_dropframe_thresh = i;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+ << " The datarate for the file is greater than target by too much!";
+ ASSERT_LE(first_drop_, last_drop)
+ << " The first dropped frame for drop_thresh " << i
+ << " > first dropped frame for drop_thresh "
+ << i - kDropFrameThreshTestStep;
+ ASSERT_GE(num_drops_, last_num_drops * 0.85)
+ << " The number of dropped frames for drop_thresh " << i
+ << " < number of dropped frames for drop_thresh "
+ << i - kDropFrameThreshTestStep;
+ last_drop = first_drop_;
+ last_num_drops = num_drops_;
+ }
+}
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kRealTime),
+ ::testing::Values(2, 5));
+} // namespace
diff --git a/third_party/aom/test/decode_api_test.cc b/third_party/aom/test/decode_api_test.cc
new file mode 100644
index 000000000..c1beacee1
--- /dev/null
+++ b/third_party/aom/test/decode_api_test.cc
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "test/util.h"
+#include "aom/aomdx.h"
+#include "aom/aom_decoder.h"
+
+namespace {
+
+TEST(DecodeAPI, InvalidParams) {
+ static const aom_codec_iface_t *kCodecs[] = {
+#if CONFIG_AV1_DECODER
+ aom_codec_av1_dx(),
+#endif
+ };
+ uint8_t buf[1] = { 0 };
+ aom_codec_ctx_t dec;
+
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(NULL, NULL, NULL, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(&dec, NULL, NULL, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, NULL, 0, NULL));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, buf, 0, NULL));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_decode(NULL, buf, NELEMENTS(buf), NULL));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_decode(NULL, NULL, NELEMENTS(buf), NULL));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL));
+ EXPECT_TRUE(aom_codec_error(NULL) != NULL);
+
+ for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_dec_init(NULL, kCodecs[i], NULL, 0));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, kCodecs[i], NULL, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_decode(&dec, NULL, NELEMENTS(buf), NULL));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec));
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/decode_multithreaded_test.cc b/third_party/aom/test/decode_multithreaded_test.cc
new file mode 100644
index 000000000..cea1d144f
--- /dev/null
+++ b/third_party/aom/test/decode_multithreaded_test.cc
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "aom_mem/aom_mem.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+static const int kNumMultiThreadDecoders = 3;
+
+class AV1DecodeMultiThreadedTest
+ : public ::libaom_test::CodecTestWith5Params<int, int, int, int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1DecodeMultiThreadedTest()
+ : EncoderTest(GET_PARAM(0)), md5_single_thread_(), md5_multi_thread_(),
+ n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
+ n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)),
+ row_mt_(GET_PARAM(5)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = 704;
+ cfg.h = 576;
+ cfg.threads = 1;
+ cfg.allow_lowbitdepth = 1;
+ single_thread_dec_ = codec_->CreateDecoder(cfg, 0);
+
+ // Test cfg.threads == powers of 2.
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+ cfg.threads <<= 1;
+ multi_thread_dec_[i] = codec_->CreateDecoder(cfg, 0);
+ multi_thread_dec_[i]->Control(AV1D_SET_ROW_MT, row_mt_);
+ }
+
+ if (single_thread_dec_->IsAV1()) {
+ single_thread_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ single_thread_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ single_thread_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+ if (multi_thread_dec_[i]->IsAV1()) {
+ multi_thread_dec_[i]->Control(AV1D_EXT_TILE_DEBUG, 1);
+ multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+ }
+ }
+
+ virtual ~AV1DecodeMultiThreadedTest() {
+ delete single_thread_dec_;
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ delete multi_thread_dec_[i];
+ }
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(libaom_test::kTwoPassGood);
+ }
+
+ virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+ encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_);
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ }
+ }
+
+ void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt,
+ ::libaom_test::MD5 *md5) {
+ const aom_codec_err_t res = dec->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ const aom_image_t *img = dec->GetDxData().Next();
+ md5->Add(img);
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ UpdateMD5(single_thread_dec_, pkt, &md5_single_thread_);
+
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ UpdateMD5(multi_thread_dec_[i], pkt, &md5_multi_thread_[i]);
+ }
+
+ void DoTest() {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = 12;
+ cfg_.rc_end_usage = AOM_VBR;
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+ timebase.den, timebase.num, 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ const char *md5_single_thread_str = md5_single_thread_.Get();
+
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i) {
+ const char *md5_multi_thread_str = md5_multi_thread_[i].Get();
+ ASSERT_STREQ(md5_single_thread_str, md5_multi_thread_str);
+ }
+ }
+
+ ::libaom_test::MD5 md5_single_thread_;
+ ::libaom_test::MD5 md5_multi_thread_[kNumMultiThreadDecoders];
+ ::libaom_test::Decoder *single_thread_dec_;
+ ::libaom_test::Decoder *multi_thread_dec_[kNumMultiThreadDecoders];
+
+ private:
+ int n_tile_cols_;
+ int n_tile_rows_;
+ int n_tile_groups_;
+ int set_cpu_used_;
+ int row_mt_;
+};
+
+// Run an encode, then decode the result both single-threaded and
+// multi-threaded. Ensure that the MD5 of the output is identical in both
+// cases; if so, the test passes.
+TEST_P(AV1DecodeMultiThreadedTest, MD5Match) {
+ cfg_.large_scale_tile = 0;
+ single_thread_dec_->Control(AV1_SET_TILE_MODE, 0);
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+class AV1DecodeMultiThreadedTestLarge : public AV1DecodeMultiThreadedTest {};
+
+TEST_P(AV1DecodeMultiThreadedTestLarge, MD5Match) {
+ cfg_.large_scale_tile = 0;
+ single_thread_dec_->Control(AV1_SET_TILE_MODE, 0);
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+// TODO(ranjit): More tests have to be added using pre-generated MD5.
+AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2),
+ ::testing::Values(1, 2), ::testing::Values(1),
+ ::testing::Values(3), ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTestLarge,
+ ::testing::Values(0, 1, 2, 6),
+ ::testing::Values(0, 1, 2, 6),
+ ::testing::Values(1, 4), ::testing::Values(0),
+ ::testing::Values(0, 1));
+
+class AV1DecodeMultiThreadedLSTestLarge
+ : public AV1DecodeMultiThreadedTestLarge {};
+
+TEST_P(AV1DecodeMultiThreadedLSTestLarge, MD5Match) {
+ cfg_.large_scale_tile = 1;
+ single_thread_dec_->Control(AV1_SET_TILE_MODE, 1);
+ for (int i = 0; i < kNumMultiThreadDecoders; ++i)
+ multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 1);
+ DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedLSTestLarge,
+ ::testing::Values(6), ::testing::Values(6),
+ ::testing::Values(1), ::testing::Values(0, 3),
+ ::testing::Values(0, 1));
+
+} // namespace
diff --git a/third_party/aom/test/decode_perf_test.cc b/third_party/aom/test/decode_perf_test.cc
new file mode 100644
index 000000000..bb7b00032
--- /dev/null
+++ b/third_party/aom/test/decode_perf_test.cc
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+
+#include "config/aom_version.h"
+
+#include "aom_ports/aom_timer.h"
+#include "common/ivfenc.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+using ::testing::make_tuple;
+
+namespace {
+
+#define VIDEO_NAME 0
+#define THREADS 1
+
+const double kUsecsInSec = 1000000.0;
+const char kNewEncodeOutputFile[] = "new_encode.ivf";
+
+/*
+ DecodePerfTest takes a tuple of filename + number of threads to decode with
+ */
+typedef ::testing::tuple<const char *, unsigned> DecodePerfParam;
+
+// TODO(jimbankoski): Add actual test vectors here when available.
+// const DecodePerfParam kAV1DecodePerfVectors[] = {};
+
+/*
+ In order to reflect real-world performance as much as possible, perf tests
+ *DO NOT* do any correctness checks. Please run them alongside correctness
+ tests to ensure proper codec integrity. Furthermore, in this test we
+ deliberately limit the number of system calls we make to avoid OS
+ preemption.
+
+ TODO(joshualitt): create a more detailed perf measurement test to collect
+ power/temp/min-max frame decode times/etc.
+ */
+
+class DecodePerfTest : public ::testing::TestWithParam<DecodePerfParam> {};
+
+TEST_P(DecodePerfTest, PerfTest) {
+ const char *const video_name = GET_PARAM(VIDEO_NAME);
+ const unsigned threads = GET_PARAM(THREADS);
+
+ libaom_test::WebMVideoSource video(video_name);
+ video.Init();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.threads = threads;
+ cfg.allow_lowbitdepth = 1;
+ libaom_test::AV1Decoder decoder(cfg, 0);
+
+ aom_usec_timer t;
+ aom_usec_timer_start(&t);
+
+ for (video.Begin(); video.cxdata() != NULL; video.Next()) {
+ decoder.DecodeFrame(video.cxdata(), video.frame_size());
+ }
+
+ aom_usec_timer_mark(&t);
+  const double elapsed_secs =
+      static_cast<double>(aom_usec_timer_elapsed(&t)) / kUsecsInSec;
+  const unsigned frames = video.frame_number();
+  const double fps = static_cast<double>(frames) / elapsed_secs;
+
+ printf("{\n");
+ printf("\t\"type\" : \"decode_perf_test\",\n");
+ printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+ printf("\t\"videoName\" : \"%s\",\n", video_name);
+ printf("\t\"threadCount\" : %u,\n", threads);
+ printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+ printf("\t\"totalFrames\" : %u,\n", frames);
+ printf("\t\"framesPerSecond\" : %f\n", fps);
+ printf("}\n");
+}
+
+// TODO(jimbankoski): Enable when we have actual AV1 decode vectors.
+// INSTANTIATE_TEST_CASE_P(AV1, DecodePerfTest,
+// ::testing::ValuesIn(kAV1DecodePerfVectors));
+
+class AV1NewEncodeDecodePerfTest
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1NewEncodeDecodePerfTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0),
+ outfile_(0), out_frames_(0) {}
+
+ virtual ~AV1NewEncodeDecodePerfTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_end_usage = AOM_VBR;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_CPUUSED, speed_);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 2);
+ }
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+ const char *const env = getenv("LIBAOM_TEST_DATA_PATH");
+ const std::string data_path(env ? env : ".");
+ const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile;
+ outfile_ = fopen(path_to_source.c_str(), "wb");
+ ASSERT_TRUE(outfile_ != NULL);
+ }
+
+ virtual void EndPassHook() {
+ if (outfile_ != NULL) {
+ if (!fseek(outfile_, 0, SEEK_SET))
+ ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_);
+ fclose(outfile_);
+ outfile_ = NULL;
+ }
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ ++out_frames_;
+
+ // Write initial file header if first frame.
+ if (pkt->data.frame.pts == 0)
+ ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_);
+
+ // Write frame header and data.
+ ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz);
+ ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_),
+ pkt->data.frame.sz);
+ }
+
+ virtual bool DoDecode() const { return false; }
+
+ void set_speed(unsigned int speed) { speed_ = speed; }
+
+ private:
+ libaom_test::TestMode encoding_mode_;
+ uint32_t speed_;
+ FILE *outfile_;
+ uint32_t out_frames_;
+};
+
+struct EncodePerfTestVideo {
+ EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+ uint32_t bitrate_, int frames_)
+ : name(name_), width(width_), height(height_), bitrate(bitrate_),
+ frames(frames_) {}
+ const char *name;
+ uint32_t width;
+ uint32_t height;
+ uint32_t bitrate;
+ int frames;
+};
+
+const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = {
+ EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+TEST_P(AV1NewEncodeDecodePerfTest, PerfTest) {
+ SetUp();
+
+ // TODO(JBB): Make this work by going through the set of given files.
+ const int i = 0;
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = kAV1EncodePerfTestVectors[i].bitrate;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ const char *video_name = kAV1EncodePerfTestVectors[i].name;
+ libaom_test::I420VideoSource video(
+ video_name, kAV1EncodePerfTestVectors[i].width,
+ kAV1EncodePerfTestVectors[i].height, timebase.den, timebase.num, 0,
+ kAV1EncodePerfTestVectors[i].frames);
+ set_speed(2);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ const uint32_t threads = 4;
+
+ libaom_test::IVFVideoSource decode_video(kNewEncodeOutputFile);
+ decode_video.Init();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.threads = threads;
+ cfg.allow_lowbitdepth = 1;
+ libaom_test::AV1Decoder decoder(cfg, 0);
+
+ aom_usec_timer t;
+ aom_usec_timer_start(&t);
+
+ for (decode_video.Begin(); decode_video.cxdata() != NULL;
+ decode_video.Next()) {
+ decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size());
+ }
+
+ aom_usec_timer_mark(&t);
+ const double elapsed_secs =
+ static_cast<double>(aom_usec_timer_elapsed(&t)) / kUsecsInSec;
+ const unsigned decode_frames = decode_video.frame_number();
+ const double fps = static_cast<double>(decode_frames) / elapsed_secs;
+
+ printf("{\n");
+ printf("\t\"type\" : \"decode_perf_test\",\n");
+ printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+ printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile);
+ printf("\t\"threadCount\" : %u,\n", threads);
+ printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+ printf("\t\"totalFrames\" : %u,\n", decode_frames);
+ printf("\t\"framesPerSecond\" : %f\n", fps);
+ printf("}\n");
+}
+
+AV1_INSTANTIATE_TEST_CASE(AV1NewEncodeDecodePerfTest,
+ ::testing::Values(::libaom_test::kTwoPassGood));
+} // namespace
diff --git a/third_party/aom/test/decode_test_driver.cc b/third_party/aom/test/decode_test_driver.cc
new file mode 100644
index 000000000..70de0cff6
--- /dev/null
+++ b/third_party/aom/test/decode_test_driver.cc
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/register_state_check.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+const char kAV1Name[] = "AOMedia Project AV1 Decoder";
+
+aom_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
+ aom_codec_stream_info_t *stream_info) {
+ return aom_codec_peek_stream_info(CodecInterface(), cxdata, size,
+ stream_info);
+}
+
+aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) {
+ return DecodeFrame(cxdata, size, NULL);
+}
+
+aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size,
+ void *user_priv) {
+ aom_codec_err_t res_dec;
+ InitOnce();
+ API_REGISTER_STATE_CHECK(
+ res_dec = aom_codec_decode(&decoder_, cxdata, size, user_priv));
+ return res_dec;
+}
+
+bool Decoder::IsAV1() const {
+ const char *codec_name = GetDecoderName();
+ return strncmp(kAV1Name, codec_name, sizeof(kAV1Name) - 1) == 0;
+}
+
+void DecoderTest::HandlePeekResult(Decoder *const /*decoder*/,
+ CompressedVideoSource * /*video*/,
+ const aom_codec_err_t res_peek) {
+  /* The AV1 implementation of PeekStream returns an error only if the
+   * data passed to it isn't a valid AV1 chunk. */
+ ASSERT_EQ(AOM_CODEC_OK, res_peek)
+ << "Peek return failed: " << aom_codec_err_to_string(res_peek);
+}
+
+void DecoderTest::RunLoop(CompressedVideoSource *video,
+ const aom_codec_dec_cfg_t &dec_cfg) {
+ Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_);
+ ASSERT_TRUE(decoder != NULL);
+ bool end_of_file = false;
+ bool peeked_stream = false;
+
+ // Decode frames.
+ for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file;
+ video->Next()) {
+ PreDecodeFrameHook(*video, decoder);
+
+ aom_codec_stream_info_t stream_info;
+ stream_info.is_annexb = 0;
+
+ if (video->cxdata() != NULL) {
+ if (!peeked_stream) {
+        // TODO(yaowu): PeekStream returns an error for non-sequence-header
+        // OBUs, so it should only be tried once per sequence. This shall be
+        // fixed once PeekStream is updated to operate properly on other OBUs.
+ const aom_codec_err_t res_peek = decoder->PeekStream(
+ video->cxdata(), video->frame_size(), &stream_info);
+ HandlePeekResult(decoder, video, res_peek);
+ ASSERT_FALSE(::testing::Test::HasFailure());
+ peeked_stream = true;
+ }
+
+ aom_codec_err_t res_dec =
+ decoder->DecodeFrame(video->cxdata(), video->frame_size());
+ if (!HandleDecodeResult(res_dec, *video, decoder)) break;
+ } else {
+ // Signal end of the file to the decoder.
+ const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
+ ASSERT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ end_of_file = true;
+ }
+
+ DxDataIterator dec_iter = decoder->GetDxData();
+ const aom_image_t *img = NULL;
+
+ // Get decompressed data
+ while (!::testing::Test::HasFailure() && (img = dec_iter.Next()))
+ DecompressedFrameHook(*img, video->frame_number());
+ }
+ delete decoder;
+}
+
+void DecoderTest::RunLoop(CompressedVideoSource *video) {
+ aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
+ RunLoop(video, dec_cfg);
+}
+
+void DecoderTest::set_cfg(const aom_codec_dec_cfg_t &dec_cfg) {
+ memcpy(&cfg_, &dec_cfg, sizeof(cfg_));
+}
+
+void DecoderTest::set_flags(const aom_codec_flags_t flags) { flags_ = flags; }
+
+} // namespace libaom_test
diff --git a/third_party/aom/test/decode_test_driver.h b/third_party/aom/test/decode_test_driver.h
new file mode 100644
index 000000000..d13e13ea1
--- /dev/null
+++ b/third_party/aom/test/decode_test_driver.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_DECODE_TEST_DRIVER_H_
+#define AOM_TEST_DECODE_TEST_DRIVER_H_
+#include <cstring>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aom_decoder.h"
+
+namespace libaom_test {
+
+class CodecFactory;
+class CompressedVideoSource;
+
+// Provides an object to handle decoding output
+class DxDataIterator {
+ public:
+ explicit DxDataIterator(aom_codec_ctx_t *decoder)
+ : decoder_(decoder), iter_(NULL) {}
+
+ const aom_image_t *Next() { return aom_codec_get_frame(decoder_, &iter_); }
+
+ private:
+ aom_codec_ctx_t *decoder_;
+ aom_codec_iter_t iter_;
+};
+
+// Provides a simplified interface to manage one video decoding.
+// As with the Encoder class, the exact services should be added
+// as more tests are added.
+class Decoder {
+ public:
+ explicit Decoder(aom_codec_dec_cfg_t cfg)
+ : cfg_(cfg), flags_(0), init_done_(false) {
+ memset(&decoder_, 0, sizeof(decoder_));
+ }
+
+ Decoder(aom_codec_dec_cfg_t cfg, const aom_codec_flags_t flag)
+ : cfg_(cfg), flags_(flag), init_done_(false) {
+ memset(&decoder_, 0, sizeof(decoder_));
+ }
+
+ virtual ~Decoder() { aom_codec_destroy(&decoder_); }
+
+ aom_codec_err_t PeekStream(const uint8_t *cxdata, size_t size,
+ aom_codec_stream_info_t *stream_info);
+
+ aom_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size);
+
+ aom_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size,
+ void *user_priv);
+
+ DxDataIterator GetDxData() { return DxDataIterator(&decoder_); }
+
+ void Control(int ctrl_id, int arg) { Control(ctrl_id, arg, AOM_CODEC_OK); }
+
+ void Control(int ctrl_id, const void *arg) {
+ InitOnce();
+ const aom_codec_err_t res = aom_codec_control_(&decoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError();
+ }
+
+ void Control(int ctrl_id, int arg, aom_codec_err_t expected_value) {
+ InitOnce();
+ const aom_codec_err_t res = aom_codec_control_(&decoder_, ctrl_id, arg);
+ ASSERT_EQ(expected_value, res) << DecodeError();
+ }
+
+ const char *DecodeError() {
+ const char *detail = aom_codec_error_detail(&decoder_);
+ return detail ? detail : aom_codec_error(&decoder_);
+ }
+
+ // Passes the external frame buffer information to libaom.
+ aom_codec_err_t SetFrameBufferFunctions(
+ aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release, void *user_priv) {
+ InitOnce();
+ return aom_codec_set_frame_buffer_functions(&decoder_, cb_get, cb_release,
+ user_priv);
+ }
+
+ const char *GetDecoderName() const {
+ return aom_codec_iface_name(CodecInterface());
+ }
+
+ bool IsAV1() const;
+
+ aom_codec_ctx_t *GetDecoder() { return &decoder_; }
+
+ protected:
+ virtual aom_codec_iface_t *CodecInterface() const = 0;
+
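+  // Lazily initializes the decoder context the first time it is needed.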
+ void InitOnce() {
+ if (!init_done_) {
+ const aom_codec_err_t res =
+ aom_codec_dec_init(&decoder_, CodecInterface(), &cfg_, flags_);
+ ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError();
+ init_done_ = true;
+ }
+ }
+
+ aom_codec_ctx_t decoder_;
+ aom_codec_dec_cfg_t cfg_;
+ aom_codec_flags_t flags_;
+ bool init_done_;
+};
+
+// Common test functionality for all Decoder tests.
+class DecoderTest {
+ public:
+ // Main decoding loop
+ virtual void RunLoop(CompressedVideoSource *video);
+ virtual void RunLoop(CompressedVideoSource *video,
+ const aom_codec_dec_cfg_t &dec_cfg);
+
+ virtual void set_cfg(const aom_codec_dec_cfg_t &dec_cfg);
+ virtual void set_flags(const aom_codec_flags_t flags);
+
+ // Hook to be called before decompressing every frame.
+ virtual void PreDecodeFrameHook(const CompressedVideoSource & /*video*/,
+ Decoder * /*decoder*/) {}
+
+ // Hook to be called to handle decode result. Return true to continue.
+ virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ const CompressedVideoSource & /*video*/,
+ Decoder *decoder) {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ // Hook to be called on every decompressed frame.
+ virtual void DecompressedFrameHook(const aom_image_t & /*img*/,
+ const unsigned int /*frame_number*/) {}
+
+ // Hook to be called on peek result
+ virtual void HandlePeekResult(Decoder *const decoder,
+ CompressedVideoSource *video,
+ const aom_codec_err_t res_peek);
+
+ protected:
+ explicit DecoderTest(const CodecFactory *codec)
+ : codec_(codec), cfg_(), flags_(0) {}
+
+ virtual ~DecoderTest() {}
+
+ const CodecFactory *codec_;
+ aom_codec_dec_cfg_t cfg_;
+ aom_codec_flags_t flags_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_DECODE_TEST_DRIVER_H_
diff --git a/third_party/aom/test/decode_to_md5.sh b/third_party/aom/test/decode_to_md5.sh
new file mode 100755
index 000000000..2edd1cb52
--- /dev/null
+++ b/third_party/aom/test/decode_to_md5.sh
@@ -0,0 +1,77 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom decode_to_md5 example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to decode_to_md5_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available:
+# $AV1_IVF_FILE is required.
+decode_to_md5_verify_environment() {
+ if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then
+ return 1
+ fi
+}
+
+# Runs decode_to_md5 on $1 and captures the md5 sum for the final frame. $2 is
+# interpreted as codec name and used solely to name the output file. $3 is the
+# expected md5 sum: It must match that of the final frame.
+decode_to_md5() {
+ local decoder="$(aom_tool_path decode_to_md5)"
+ local input_file="$1"
+ local codec="$2"
+ local expected_md5="$3"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/decode_to_md5_${codec}"
+
+ if [ ! -x "${decoder}" ]; then
+ elog "${decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+ ${devnull}
+
+ [ -e "${output_file}" ] || return 1
+
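+  # decode_to_md5 prints one line per frame; the MD5 sum is the first field
+  # of the final line.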
+ local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')"
+ local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')"
+ if [ "${actual_md5}" = "${expected_md5}" ]; then
+ return 0
+ else
+ elog "MD5 mismatch:"
+ elog "Expected: ${expected_md5}"
+ elog "Actual: ${actual_md5}"
+ return 1
+ fi
+}
+
+DISABLED_decode_to_md5_av1() {
+ # expected MD5 sum for the last frame.
+ local expected_md5="567dd6d4b7a7170edddbf58bbcc3aff1"
+ local file="${AV1_IVF_FILE}"
+
+ # TODO(urvang): Check in the encoded file (like libvpx does) to avoid
+ # encoding every time.
+ if [ "$(av1_decode_available)" = "yes" ]; then
+ if [ ! -e "${AV1_IVF_FILE}" ]; then
+ file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
+ encode_yuv_raw_input_av1 "${file}" --ivf
+ fi
+ decode_to_md5 "${file}" "av1" "${expected_md5}"
+ fi
+}
+
+# TODO(tomfinegan): Enable when the bitstream stabilizes.
+decode_to_md5_tests="DISABLED_decode_to_md5_av1"
+
+run_tests decode_to_md5_verify_environment "${decode_to_md5_tests}"
diff --git a/third_party/aom/test/decode_with_drops.sh b/third_party/aom/test/decode_with_drops.sh
new file mode 100755
index 000000000..155ee9207
--- /dev/null
+++ b/third_party/aom/test/decode_with_drops.sh
@@ -0,0 +1,68 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom decode_with_drops example. To add new tests to
+## this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to decode_with_drops_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available:
+# $AV1_IVF_FILE is required.
+decode_with_drops_verify_environment() {
+ if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then
+ return 1
+ fi
+}
+
+# Runs decode_with_drops on $1, $2 is interpreted as codec name and used solely
+# to name the output file. $3 is the drop mode, and is passed directly to
+# decode_with_drops.
+decode_with_drops() {
+ local decoder="$(aom_tool_path decode_with_drops)"
+ local input_file="$1"
+ local codec="$2"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/decode_with_drops_${codec}"
+ local drop_mode="$3"
+
+ if [ ! -x "${decoder}" ]; then
+ elog "${decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+ "${drop_mode}" ${devnull}
+
+ [ -e "${output_file}" ] || return 1
+}
+
+
+# Decodes $AV1_IVF_FILE while dropping frames, twice: once in sequence mode,
+# and once in pattern mode.
+DISABLED_decode_with_drops_av1() {
+ if [ "$(av1_decode_available)" = "yes" ]; then
+ local file="${AV1_IVF_FILE}"
+ if [ ! -e "${AV1_IVF_FILE}" ]; then
+ file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
+ encode_yuv_raw_input_av1 "${file}" --ivf
+ fi
+ # Drop frames 3 and 4.
+ decode_with_drops "${file}" "av1" "3-4"
+
+ # Test pattern mode: Drop 3 of every 4 frames.
+ decode_with_drops "${file}" "av1" "3/4"
+ fi
+}
+
+# TODO(yaowu): This test is disabled as the trailing_bit check is expected to fail.
+decode_with_drops_tests="DISABLED_decode_with_drops_av1"
+
+run_tests decode_with_drops_verify_environment "${decode_with_drops_tests}"
diff --git a/third_party/aom/test/divu_small_test.cc b/third_party/aom/test/divu_small_test.cc
new file mode 100644
index 000000000..064f8ee45
--- /dev/null
+++ b/third_party/aom/test/divu_small_test.cc
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "av1/common/odintrin.h"
+
+using libaom_test::ACMRandom;
+
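+// Checks that the OD_DIVU_SMALL fast-division macro matches ordinary integer
+// division for every divisor up to OD_DIVU_DMAX over a range of numerators.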
+TEST(Daala, TestDIVUuptoMAX) {
+ for (int d = 1; d <= OD_DIVU_DMAX; d++) {
+ for (uint32_t x = 1; x <= 1000000; x++) {
+ GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d))
+ << "x=" << x << " d=" << d << " x/d=" << (x / d)
+ << " != " << OD_DIVU_SMALL(x, d);
+ }
+ }
+}
+
+TEST(Daala, TestDIVUrandI31) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int d = 1; d < OD_DIVU_DMAX; d++) {
+ for (int i = 0; i < 1000000; i++) {
+ uint32_t x = rnd.Rand31();
+ GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d))
+ << "x=" << x << " d=" << d << " x/d=" << (x / d)
+ << " != " << OD_DIVU_SMALL(x, d);
+ }
+ }
+}
diff --git a/third_party/aom/test/dr_prediction_test.cc b/third_party/aom/test/dr_prediction_test.cc
new file mode 100644
index 000000000..ff2c1de4e
--- /dev/null
+++ b/third_party/aom/test/dr_prediction_test.cc
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconintra.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+const int kZ1Start = 0;
+const int kZ2Start = 90;
+const int kZ3Start = 180;
+
+const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64,
+ TX_4X8, TX_8X4, TX_8X16, TX_16X8, TX_16X32,
+ TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4,
+ TX_8X32, TX_32X8, TX_16X64, TX_64X16 };
+
+const char *const kTxSizeStrings[] = {
+ "TX_4X4", "TX_8X8", "TX_16X16", "TX_32X32", "TX_64X64",
+ "TX_4X8", "TX_8X4", "TX_8X16", "TX_16X8", "TX_16X32",
+ "TX_32X16", "TX_32X64", "TX_64X32", "TX_4X16", "TX_16X4",
+ "TX_8X32", "TX_32X8", "TX_16X64", "TX_64X16"
+};
+
+using libaom_test::ACMRandom;
+
+typedef void (*DrPred_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint16_t *above, const uint16_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy, int bd);
+
+typedef void (*DrPred)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx, int dy,
+ int bd);
+
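+// The z1/z2/z3 predictors take slightly different argument lists; these
+// wrappers adapt them to the common DrPred/DrPred_Hbd signature so a single
+// test harness can drive all three directional zones.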
+typedef void (*Z1_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy);
+template <Z1_Lbd fn>
+void z1_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left, int upsample_above,
+ int /*upsample_left*/, int dx, int dy, int /*bd*/) {
+ fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy);
+}
+
+typedef void (*Z2_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx, int dy);
+template <Z2_Lbd fn>
+void z2_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy, int /*bd*/) {
+ fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy);
+}
+
+typedef void (*Z3_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy);
+template <Z3_Lbd fn>
+void z3_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int /*upsample_above*/, int upsample_left, int dx, int dy,
+ int /*bd*/) {
+ fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy);
+}
+
+typedef void (*Z1_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint16_t *above, const uint16_t *left,
+ int upsample_above, int dx, int dy, int bd);
+template <Z1_Hbd fn>
+void z1_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint16_t *above, const uint16_t *left,
+ int upsample_above, int /*upsample_left*/, int dx, int dy,
+ int bd) {
+ fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd);
+}
+
+typedef void (*Z2_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint16_t *above, const uint16_t *left,
+ int upsample_above, int upsample_left, int dx, int dy,
+ int bd);
+template <Z2_Hbd fn>
+void z2_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint16_t *above, const uint16_t *left,
+ int upsample_above, int upsample_left, int dx, int dy,
+ int bd) {
+ fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy,
+ bd);
+}
+
+typedef void (*Z3_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint16_t *above, const uint16_t *left,
+ int upsample_left, int dx, int dy, int bd);
+template <Z3_Hbd fn>
+void z3_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint16_t *above, const uint16_t *left,
+ int /*upsample_above*/, int upsample_left, int dx, int dy,
+ int bd) {
+ fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd);
+}
+
+template <typename FuncType>
+struct DrPredFunc {
+ DrPredFunc(FuncType pred = NULL, FuncType tst = NULL, int bit_depth_value = 0,
+ int start_angle_value = 0)
+ : ref_fn(pred), tst_fn(tst), bit_depth(bit_depth_value),
+ start_angle(start_angle_value) {}
+
+ FuncType ref_fn;
+ FuncType tst_fn;
+ int bit_depth;
+ int start_angle;
+};
+
+template <typename Pixel, typename FuncType>
+class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
+ protected:
+ static const int kMaxNumTests = 100000;
+ static const int kIterations = 10;
+ static const int kDstStride = 64;
+ static const int kDstSize = kDstStride * kDstStride;
+ static const int kOffset = 16;
+ static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16;
+
+ DrPredTest()
+ : enable_upsample_(0), upsample_above_(0), upsample_left_(0), bw_(0),
+ bh_(0), dx_(1), dy_(1), bd_(8), txsize_(TX_4X4) {
+ params_ = this->GetParam();
+ start_angle_ = params_.start_angle;
+ stop_angle_ = start_angle_ + 90;
+
+ dst_ref_ = &dst_ref_data_[0];
+ dst_tst_ = &dst_tst_data_[0];
+ dst_stride_ = kDstStride;
+ above_ = &above_data_[kOffset];
+ left_ = &left_data_[kOffset];
+
+ for (int i = 0; i < kBufSize; ++i) {
+ above_data_[i] = rng_.Rand8();
+ left_data_[i] = rng_.Rand8();
+ }
+
+ for (int i = 0; i < kDstSize; ++i) {
+ dst_ref_[i] = 0;
+ }
+ }
+
+ virtual ~DrPredTest() {}
+
+ void Predict(bool speedtest, int tx) {
+ const int kNumTests = speedtest ? kMaxNumTests : 1;
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&timer);
+ for (int k = 0; k < kNumTests; ++k) {
+ params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_,
+ upsample_above_, upsample_left_, dx_, dy_, bd_);
+ }
+ aom_usec_timer_mark(&timer);
+ const int ref_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ aom_usec_timer_start(&timer);
+ if (params_.tst_fn) {
+ for (int k = 0; k < kNumTests; ++k) {
+ ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
+ above_, left_, upsample_above_,
+ upsample_left_, dx_, dy_, bd_));
+ }
+ }
+ aom_usec_timer_mark(&timer);
+ const int tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ OutputTimes(kNumTests, ref_time, tst_time, tx);
+ }
+
+ void RunTest(bool speedtest, int p_angle) {
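+    // Saturate the reference rows at the maximum pixel value for this bit
+    // depth so any overflow in the prediction arithmetic shows up as a
+    // mismatch.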
+ for (int i = 0; i < kBufSize; ++i) {
+ above_data_[i] = left_data_[i] = (1 << bd_) - 1;
+ }
+
+ for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+ if (params_.tst_fn == NULL) {
+ for (int i = 0; i < kDstSize; ++i) {
+ dst_tst_[i] = (1 << bd_) - 1;
+ }
+ } else {
+ for (int i = 0; i < kDstSize; ++i) {
+ dst_tst_[i] = 0;
+ }
+ }
+
+ bw_ = tx_size_wide[kTxSize[tx]];
+ bh_ = tx_size_high[kTxSize[tx]];
+
+ if (enable_upsample_) {
+ upsample_above_ =
+ av1_use_intra_edge_upsample(bw_, bh_, p_angle - 90, 0);
+ upsample_left_ =
+ av1_use_intra_edge_upsample(bw_, bh_, p_angle - 180, 0);
+ } else {
+ upsample_above_ = upsample_left_ = 0;
+ }
+
+ Predict(speedtest, tx);
+
+ for (int r = 0; r < bh_; ++r) {
+ for (int c = 0; c < bw_; ++c) {
+ ASSERT_EQ(dst_ref_[r * dst_stride_ + c],
+ dst_tst_[r * dst_stride_ + c])
+ << bw_ << "x" << bh_ << " r: " << r << " c: " << c
+ << " dx: " << dx_ << " dy: " << dy_
+ << " upsample_above: " << upsample_above_
+ << " upsample_left: " << upsample_left_;
+ }
+ }
+ }
+ }
+
+ void OutputTimes(int num_tests, int ref_time, int tst_time, int tx) {
+ if (num_tests > 1) {
+ if (params_.tst_fn) {
+ const float x = static_cast<float>(ref_time) / tst_time;
+ printf("\t[%8s] :: ref time %6d, tst time %6d %3.2f\n",
+ kTxSizeStrings[tx], ref_time, tst_time, x);
+ } else {
+ printf("\t[%8s] :: ref time %6d\n", kTxSizeStrings[tx], ref_time);
+ }
+ }
+ }
+
+ Pixel dst_ref_data_[kDstSize];
+ Pixel dst_tst_data_[kDstSize];
+
+ Pixel left_data_[kBufSize];
+ Pixel dummy_data_[kBufSize];
+ Pixel above_data_[kBufSize];
+
+ Pixel *dst_ref_;
+ Pixel *dst_tst_;
+ Pixel *above_;
+ Pixel *left_;
+ int dst_stride_;
+
+ int enable_upsample_;
+ int upsample_above_;
+ int upsample_left_;
+ int bw_;
+ int bh_;
+ int dx_;
+ int dy_;
+ int bd_;
+ TX_SIZE txsize_;
+
+ int start_angle_;
+ int stop_angle_;
+
+ ACMRandom rng_;
+
+ DrPredFunc<FuncType> params_;
+};
+
+class LowbdDrPredTest : public DrPredTest<uint8_t, DrPred> {};
+
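+// SaturatedValues fills the above/left reference buffers with the maximum
+// pixel value, (1 << bd) - 1, to exercise potential overflow paths in the
+// predictors across every supported prediction angle.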
+TEST_P(LowbdDrPredTest, SaturatedValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ enable_upsample_ = iter & 1;
+ for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ if (dx_ && dy_) RunTest(false, angle);
+ }
+ }
+}
+
+TEST_P(LowbdDrPredTest, DISABLED_Speed) {
+ const int angles[] = { 3, 45, 87 };
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+ for (int i = 0; i < 3; ++i) {
+ const int angle = angles[i] + start_angle_;
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+ enable_upsample_, angle);
+ if (dx_ && dy_) RunTest(true, angle);
+ }
+ }
+}
+
+using ::testing::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+ C, LowbdDrPredTest,
+ ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+ NULL, AOM_BITS_8, kZ1Start),
+ DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+ NULL, AOM_BITS_8, kZ2Start),
+ DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+ NULL, AOM_BITS_8, kZ3Start)));
+
+class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {};
+
+TEST_P(HighbdDrPredTest, SaturatedValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ enable_upsample_ = iter & 1;
+ for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ if (dx_ && dy_) RunTest(false, angle);
+ }
+ }
+}
+
+TEST_P(HighbdDrPredTest, DISABLED_Speed) {
+ const int angles[] = { 3, 45, 87 };
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+ for (int i = 0; i < 3; ++i) {
+ const int angle = angles[i] + start_angle_;
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+ enable_upsample_, angle);
+ if (dx_ && dy_) RunTest(true, angle);
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, HighbdDrPredTest,
+ ::testing::Values(
+ DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ NULL, AOM_BITS_8, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ NULL, AOM_BITS_10, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
+ NULL, AOM_BITS_12, kZ1Start),
+ DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ NULL, AOM_BITS_8, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ NULL, AOM_BITS_10, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
+ NULL, AOM_BITS_12, kZ2Start),
+ DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ NULL, AOM_BITS_8, kZ3Start),
+ DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ NULL, AOM_BITS_10, kZ3Start),
+ DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>,
+ NULL, AOM_BITS_12, kZ3Start)));
+
+} // namespace
diff --git a/third_party/aom/test/dump_obu.sh b/third_party/aom/test/dump_obu.sh
new file mode 100755
index 000000000..da44dd7e6
--- /dev/null
+++ b/third_party/aom/test/dump_obu.sh
@@ -0,0 +1,70 @@
+#!/bin/sh
+## Copyright (c) 2018, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom dump_obu tool. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to dump_obu_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
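+## Test flow: encode_test_file() encodes a short IVF stream with aomenc, and
+## dump_obu() then runs the dump_obu tool over that stream; run_tests() at the
+## bottom of this file drives the whole sequence.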
+readonly dump_obu_test_file="${AOM_TEST_OUTPUT_DIR}/av1_obu_test.ivf"
+
+dump_obu_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ "$(dump_obu_available)" = "yes" ]; then
+ if [ -z "$(aom_tool_path dump_obu)" ]; then
+ elog "dump_obu not found in LIBAOM_BIN_PATH, its parent, or child tools/."
+ fi
+ fi
+}
+
+dump_obu_available() {
+ if [ "$(av1_decode_available)" = "yes" ] && \
+ [ "$(av1_encode_available)" = "yes" ]; then
+ echo yes
+ fi
+}
+
+aomenc_available() {
+ if [ -x "$(aom_tool_path aomenc)" ]; then
+ echo yes
+ fi
+}
+
+encode_test_file() {
+ if [ "$(aomenc_available)" = "yes" ]; then
+ local encoder="$(aom_tool_path aomenc)"
+
+ eval "${encoder}" \
+ $(aomenc_encode_test_fast_params) \
+ $(yuv_raw_input) \
+ --ivf \
+ --output=${dump_obu_test_file} \
+ ${devnull}
+
+ if [ ! -e "${dump_obu_test_file}" ]; then
+ elog "dump_obu test input encode failed."
+ return 1
+ fi
+ fi
+}
+
+dump_obu() {
+ encode_test_file
+ eval $(aom_tool_path dump_obu) "${dump_obu_test_file}" ${devnull}
+}
+
+dump_obu_tests="dump_obu"
+
+run_tests dump_obu_verify_environment "${dump_obu_tests}"
diff --git a/third_party/aom/test/ec_test.cc b/third_party/aom/test/ec_test.cc
new file mode 100644
index 000000000..e6a5ea63b
--- /dev/null
+++ b/third_party/aom/test/ec_test.cc
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include <cstdlib>
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/entdec.h"
+
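+// The random seed can be overridden through the EC_TEST_SEED environment
+// variable; otherwise a fixed default seed is used so that failures are
+// reproducible.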
+TEST(EC_TEST, random_ec_test) {
+ od_ec_enc enc;
+ od_ec_dec dec;
+ int sz;
+ int i;
+ int ret;
+ unsigned int sym;
+ unsigned int seed;
+ unsigned char *ptr;
+ uint32_t ptr_sz;
+ char *seed_str;
+ ret = 0;
+ seed_str = getenv("EC_TEST_SEED");
+ if (seed_str) {
+ seed = atoi(seed_str);
+ } else {
+ seed = 0xdaa1a;
+ }
+ srand(seed);
+ od_ec_enc_init(&enc, 1);
+ /* Test compatibility between multiple encode/decode routines. */
+ for (i = 0; i < 409600; i++) {
+ unsigned *fz;
+ unsigned *fts;
+ unsigned *data;
+ unsigned *tell;
+ unsigned *enc_method;
+ int j;
+ sz = rand() / ((RAND_MAX >> (rand() % 9U)) + 1U);
+ fz = (unsigned *)malloc(sz * sizeof(*fz));
+ fts = (unsigned *)malloc(sz * sizeof(*fts));
+ data = (unsigned *)malloc(sz * sizeof(*data));
+ tell = (unsigned *)malloc((sz + 1) * sizeof(*tell));
+ enc_method = (unsigned *)malloc(sz * sizeof(*enc_method));
+ od_ec_enc_reset(&enc);
+ tell[0] = od_ec_enc_tell_frac(&enc);
+ for (j = 0; j < sz; j++) {
+ data[j] = rand() / ((RAND_MAX >> 1) + 1);
+
+ fts[j] = CDF_PROB_BITS;
+ fz[j] = (rand() % (CDF_PROB_TOP - 2)) >> (CDF_PROB_BITS - fts[j]);
+ fz[j] = OD_MAXI(fz[j], 1);
+ enc_method[j] = 3 + (rand() & 1);
+ switch (enc_method[j]) {
+ case 3: {
+ od_ec_encode_bool_q15(&enc, data[j],
+ OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j])));
+ break;
+ }
+ case 4: {
+ uint16_t cdf[2];
+ cdf[0] = OD_ICDF(fz[j]);
+ cdf[1] = OD_ICDF(1U << fts[j]);
+ od_ec_encode_cdf_q15(&enc, data[j], cdf, 2);
+ break;
+ }
+ }
+
+ tell[j + 1] = od_ec_enc_tell_frac(&enc);
+ }
+ ptr = od_ec_enc_done(&enc, &ptr_sz);
+ EXPECT_GE(((od_ec_enc_tell(&enc) + 7U) >> 3), ptr_sz)
+ << "od_ec_enc_tell() lied: "
+ "there's "
+ << ptr_sz << " bytes instead of " << ((od_ec_enc_tell(&enc) + 7) >> 3)
+ << " (Random seed: " << seed << ")\n";
+ od_ec_dec_init(&dec, ptr, ptr_sz);
+ EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[0])
+ << "od_ec_dec_tell() mismatch between encoder and decoder "
+ "at symbol 0: "
+ << (unsigned)od_ec_dec_tell_frac(&dec) << " instead of " << tell[0]
+ << " (Random seed: " << seed << ").\n";
+ for (j = 0; j < sz; j++) {
+ int dec_method;
+ if (CDF_SHIFT == 0) {
+ dec_method = 3 + (rand() & 1);
+ } else {
+ dec_method = enc_method[j];
+ }
+ switch (dec_method) {
+ case 3: {
+ sym = od_ec_decode_bool_q15(
+ &dec, OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j])));
+ break;
+ }
+ case 4: {
+ uint16_t cdf[2];
+ cdf[0] = OD_ICDF(fz[j]);
+ cdf[1] = OD_ICDF(1U << fts[j]);
+ sym = od_ec_decode_cdf_q15(&dec, cdf, 2);
+ break;
+ }
+ }
+
+ EXPECT_EQ(sym, data[j])
+ << "Decoded " << sym << " instead of " << data[j]
+ << " with fz=" << fz[j] << " and ftb=" << fts[j] << "at position "
+ << j << " of " << sz << " (Random seed: " << seed << ").\n"
+ << "Encoding method: " << enc_method[j]
+ << " decoding method: " << dec_method << "\n";
+ EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[j + 1])
+ << "od_ec_dec_tell() mismatch between encoder and "
+ "decoder at symbol "
+ << j + 1 << ": " << (unsigned)od_ec_dec_tell_frac(&dec)
+ << " instead of " << tell[j + 1] << " (Random seed: " << seed
+ << ").\n";
+ }
+ free(enc_method);
+ free(tell);
+ free(data);
+ free(fts);
+ free(fz);
+ }
+ od_ec_enc_reset(&enc);
+ if (CDF_SHIFT == 0) {
+ od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+ od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+ od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+ od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+ od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576));
+ od_ec_enc_patch_initial_bits(&enc, 3, 2);
+ EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n";
+ od_ec_enc_patch_initial_bits(&enc, 0, 5);
+ EXPECT_TRUE(enc.error)
+ << "od_ec_enc_patch_initial_bits() didn't fail when it should have.\n";
+ od_ec_enc_reset(&enc);
+ od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+ od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384));
+ od_ec_encode_bool_q15(&enc, 1, OD_ICDF(32256));
+ od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576));
+ od_ec_enc_patch_initial_bits(&enc, 0, 2);
+ EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n";
+ ptr = od_ec_enc_done(&enc, &ptr_sz);
+ EXPECT_EQ(ptr_sz, 2u);
+ EXPECT_EQ(ptr[0], 63)
+ << "Got " << ptr[0]
+ << " when expecting 63 for od_ec_enc_patch_initial_bits().\n";
+ }
+ od_ec_enc_clear(&enc);
+ EXPECT_EQ(ret, 0);
+}
diff --git a/third_party/aom/test/encode_api_test.cc b/third_party/aom/test/encode_api_test.cc
new file mode 100644
index 000000000..c26f5720f
--- /dev/null
+++ b/third_party/aom/test/encode_api_test.cc
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "test/util.h"
+#include "aom/aomcx.h"
+#include "aom/aom_encoder.h"
+
+namespace {
+
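+// InvalidParams feeds NULL and otherwise invalid arguments to the public
+// encoder API and expects AOM_CODEC_INVALID_PARAM, then performs one valid
+// init/encode/destroy cycle per available codec interface.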
+TEST(EncodeAPI, InvalidParams) {
+ static const aom_codec_iface_t *kCodecs[] = {
+#if CONFIG_AV1_ENCODER
+ aom_codec_av1_cx(),
+#endif
+ };
+ uint8_t buf[1] = { 0 };
+ aom_image_t img;
+ aom_codec_ctx_t enc;
+ aom_codec_enc_cfg_t cfg;
+
+ EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, 1, 1, 1, buf));
+
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(NULL, NULL, NULL, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, NULL, NULL, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, NULL, 0, 0, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, &img, 0, 0, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_enc_config_default(NULL, NULL, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_enc_config_default(NULL, &cfg, 0));
+ EXPECT_TRUE(aom_codec_error(NULL) != NULL);
+
+ for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
+ SCOPED_TRACE(aom_codec_iface_name(kCodecs[i]));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_enc_init(NULL, kCodecs[i], NULL, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_enc_init(&enc, kCodecs[i], NULL, 0));
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_enc_config_default(kCodecs[i], &cfg, 1));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(kCodecs[i], &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
+
+ EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL));
+
+ aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc);
+ EXPECT_TRUE(glob_headers != NULL);
+ if (glob_headers) {
+ EXPECT_TRUE(glob_headers->buf != NULL);
+ free(glob_headers->buf);
+ free(glob_headers);
+ }
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
+
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/encode_perf_test.cc b/third_party/aom/test/encode_perf_test.cc
new file mode 100644
index 000000000..fe649b153
--- /dev/null
+++ b/third_party/aom/test/encode_perf_test.cc
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom_ports/aom_timer.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+const double kUsecsInSec = 1000000.0;
+
+struct EncodePerfTestVideo {
+ EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_,
+ uint32_t bitrate_, int frames_)
+ : name(name_), width(width_), height(height_), bitrate(bitrate_),
+ frames(frames_) {}
+ const char *name;
+ uint32_t width;
+ uint32_t height;
+ uint32_t bitrate;
+ int frames;
+};
+
+const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = {
+ EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484),
+ EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300),
+ EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987),
+ EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718),
+ EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471),
+ EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300),
+ EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv", 640, 480, 200,
+ 300),
+ EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300),
+ EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470),
+};
+
+const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 };
+const int kEncodePerfTestThreads[] = { 1, 2, 4 };
+
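+// The perf test encodes each vector for every speed/thread combination
+// (skipping thread counts that are too high for small resolutions), disables
+// decoding for speed, and prints a JSON-style result block with encode time,
+// frames per second, and minimum PSNR.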
+class AV1EncodePerfTest
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1EncodePerfTest()
+ : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0),
+ encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {}
+
+ virtual ~AV1EncodePerfTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_error_resilient = 1;
+ cfg_.g_threads = threads_;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ const int log2_tile_columns = 3;
+ encoder->Control(AOME_SET_CPUUSED, speed_);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, log2_tile_columns);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+ }
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+ min_psnr_ = kMaxPsnr;
+ nframes_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (pkt->data.psnr.psnr[0] < min_psnr_) {
+ min_psnr_ = pkt->data.psnr.psnr[0];
+ }
+ }
+
+ // For performance reasons, don't decode the encoded output.
+ virtual bool DoDecode() const { return false; }
+
+ double min_psnr() const { return min_psnr_; }
+
+ void set_speed(unsigned int speed) { speed_ = speed; }
+
+ void set_threads(unsigned int threads) { threads_ = threads; }
+
+ private:
+ double min_psnr_;
+ unsigned int nframes_;
+ libaom_test::TestMode encoding_mode_;
+ unsigned speed_;
+ unsigned int threads_;
+};
+
+TEST_P(AV1EncodePerfTest, PerfTest) {
+ for (size_t i = 0; i < NELEMENTS(kAV1EncodePerfTestVectors); ++i) {
+ for (size_t j = 0; j < NELEMENTS(kEncodePerfTestSpeeds); ++j) {
+ for (size_t k = 0; k < NELEMENTS(kEncodePerfTestThreads); ++k) {
+ if (kAV1EncodePerfTestVectors[i].width < 512 &&
+ kEncodePerfTestThreads[k] > 1)
+ continue;
+ else if (kAV1EncodePerfTestVectors[i].width < 1024 &&
+ kEncodePerfTestThreads[k] > 2)
+ continue;
+
+ set_threads(kEncodePerfTestThreads[k]);
+ SetUp();
+
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = kAV1EncodePerfTestVectors[i].bitrate;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ const unsigned frames = kAV1EncodePerfTestVectors[i].frames;
+ const char *video_name = kAV1EncodePerfTestVectors[i].name;
+ libaom_test::I420VideoSource video(
+ video_name, kAV1EncodePerfTestVectors[i].width,
+ kAV1EncodePerfTestVectors[i].height, timebase.den, timebase.num, 0,
+ kAV1EncodePerfTestVectors[i].frames);
+ set_speed(kEncodePerfTestSpeeds[j]);
+
+ aom_usec_timer t;
+ aom_usec_timer_start(&t);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ aom_usec_timer_mark(&t);
+ const double elapsed_secs = aom_usec_timer_elapsed(&t) / kUsecsInSec;
+ const double fps = frames / elapsed_secs;
+ const double minimum_psnr = min_psnr();
+ std::string display_name(video_name);
+ if (kEncodePerfTestThreads[k] > 1) {
+ char thread_count[32];
+ snprintf(thread_count, sizeof(thread_count), "_t-%d",
+ kEncodePerfTestThreads[k]);
+ display_name += thread_count;
+ }
+
+ printf("{\n");
+ printf("\t\"type\" : \"encode_perf_test\",\n");
+ printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+ printf("\t\"videoName\" : \"%s\",\n", display_name.c_str());
+ printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs);
+ printf("\t\"totalFrames\" : %u,\n", frames);
+ printf("\t\"framesPerSecond\" : %f,\n", fps);
+ printf("\t\"minPsnr\" : %f,\n", minimum_psnr);
+ printf("\t\"speed\" : %d,\n", kEncodePerfTestSpeeds[j]);
+ printf("\t\"threads\" : %d\n", kEncodePerfTestThreads[k]);
+ printf("}\n");
+ }
+ }
+ }
+}
+
+AV1_INSTANTIATE_TEST_CASE(AV1EncodePerfTest,
+ ::testing::Values(::libaom_test::kRealTime));
+} // namespace
diff --git a/third_party/aom/test/encode_test_driver.cc b/third_party/aom/test/encode_test_driver.cc
new file mode 100644
index 000000000..f3d61dc36
--- /dev/null
+++ b/third_party/aom/test/encode_test_driver.cc
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+#include "test/register_state_check.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+void Encoder::InitEncoder(VideoSource *video) {
+ aom_codec_err_t res;
+ const aom_image_t *img = video->img();
+
+ if (video->img() && !encoder_.priv) {
+ cfg_.g_w = img->d_w;
+ cfg_.g_h = img->d_h;
+ cfg_.g_timebase = video->timebase();
+ cfg_.rc_twopass_stats_in = stats_->buf();
+
+ res = aom_codec_enc_init(&encoder_, CodecInterface(), &cfg_, init_flags_);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+}
+
+void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
+ if (video->img())
+ EncodeFrameInternal(*video, frame_flags);
+ else
+ Flush();
+
+ // Handle twopass stats
+ CxDataIterator iter = GetCxData();
+
+ while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
+ if (pkt->kind != AOM_CODEC_STATS_PKT) continue;
+
+ stats_->Append(*pkt);
+ }
+}
+
+void Encoder::EncodeFrameInternal(const VideoSource &video,
+ const unsigned long frame_flags) {
+ aom_codec_err_t res;
+ const aom_image_t *img = video.img();
+
+ // Handle frame resizing
+ if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) {
+ cfg_.g_w = img->d_w;
+ cfg_.g_h = img->d_h;
+ res = aom_codec_enc_config_set(&encoder_, &cfg_);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ // Encode the frame
+ API_REGISTER_STATE_CHECK(res =
+ aom_codec_encode(&encoder_, img, video.pts(),
+ video.duration(), frame_flags));
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+}
+
+void Encoder::Flush() {
+ const aom_codec_err_t res = aom_codec_encode(&encoder_, NULL, 0, 0, 0);
+ if (!encoder_.priv)
+ ASSERT_EQ(AOM_CODEC_ERROR, res) << EncoderError();
+ else
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+}
+
+void EncoderTest::InitializeConfig() {
+ const aom_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0);
+ ASSERT_EQ(AOM_CODEC_OK, res);
+}
+
+void EncoderTest::SetMode(TestMode mode) {
+ switch (mode) {
+ case kOnePassGood:
+ case kTwoPassGood: break;
+ case kRealTime: cfg_.g_lag_in_frames = 0; break;
+ default: ASSERT_TRUE(false) << "Unexpected mode " << mode;
+ }
+ mode_ = mode;
+ if (mode == kTwoPassGood)
+ passes_ = 2;
+ else
+ passes_ = 1;
+}
+
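+// Returns true if the two planes are identical; on the first mismatch it
+// records the row, column, and the two differing pixel values through the
+// optional out-parameters and returns false.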
+static bool compare_plane(const uint8_t *const buf1, int stride1,
+ const uint8_t *const buf2, int stride2, int w, int h,
+ int *const mismatch_row, int *const mismatch_col,
+ int *const mismatch_pix1, int *const mismatch_pix2) {
+ int r, c;
+
+ for (r = 0; r < h; ++r) {
+ for (c = 0; c < w; ++c) {
+ const int pix1 = buf1[r * stride1 + c];
+ const int pix2 = buf2[r * stride2 + c];
+
+ if (pix1 != pix2) {
+ if (mismatch_row != NULL) *mismatch_row = r;
+ if (mismatch_col != NULL) *mismatch_col = c;
+ if (mismatch_pix1 != NULL) *mismatch_pix1 = pix1;
+ if (mismatch_pix2 != NULL) *mismatch_pix2 = pix2;
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+// The function should return "true" most of the time, therefore no early
+// break-out is implemented within the match checking process.
+static bool compare_img(const aom_image_t *img1, const aom_image_t *img2,
+ int *const mismatch_row, int *const mismatch_col,
+ int *const mismatch_plane, int *const mismatch_pix1,
+ int *const mismatch_pix2) {
+ if (img1->fmt != img2->fmt || img1->cp != img2->cp || img1->tc != img2->tc ||
+ img1->mc != img2->mc || img1->d_w != img2->d_w ||
+ img1->d_h != img2->d_h || img1->monochrome != img2->monochrome) {
+ if (mismatch_row != NULL) *mismatch_row = -1;
+ if (mismatch_col != NULL) *mismatch_col = -1;
+ return false;
+ }
+
+ const int num_planes = img1->monochrome ? 1 : 3;
+ for (int plane = 0; plane < num_planes; plane++) {
+ if (!compare_plane(img1->planes[plane], img1->stride[plane],
+ img2->planes[plane], img2->stride[plane],
+ aom_img_plane_width(img1, plane),
+ aom_img_plane_height(img1, plane), mismatch_row,
+ mismatch_col, mismatch_pix1, mismatch_pix2)) {
+ if (mismatch_plane != NULL) *mismatch_plane = plane;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void EncoderTest::MismatchHook(const aom_image_t *img_enc,
+ const aom_image_t *img_dec) {
+ int mismatch_row = 0;
+ int mismatch_col = 0;
+ int mismatch_plane = 0;
+ int mismatch_pix_enc = 0;
+ int mismatch_pix_dec = 0;
+
+ ASSERT_FALSE(compare_img(img_enc, img_dec, &mismatch_row, &mismatch_col,
+ &mismatch_plane, &mismatch_pix_enc,
+ &mismatch_pix_dec));
+
+ GTEST_FAIL() << "Encode/Decode mismatch found:" << std::endl
+ << " pixel value enc/dec: " << mismatch_pix_enc << "/"
+ << mismatch_pix_dec << std::endl
+ << " plane: " << mismatch_plane << std::endl
+ << " row/col: " << mismatch_row << "/"
+ << mismatch_col << std::endl;
+}
+
+void EncoderTest::RunLoop(VideoSource *video) {
+ aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t();
+ dec_cfg.allow_lowbitdepth = 1;
+
+ stats_.Reset();
+
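+  // Pass handling: a single pass runs as AOM_RC_ONE_PASS; two-pass encoding
+  // runs AOM_RC_FIRST_PASS followed by AOM_RC_LAST_PASS, with the stats
+  // gathered in the first pass handed to the encoder via rc_twopass_stats_in.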
+ ASSERT_TRUE(passes_ == 1 || passes_ == 2);
+ for (unsigned int pass = 0; pass < passes_; pass++) {
+ last_pts_ = 0;
+
+ if (passes_ == 1)
+ cfg_.g_pass = AOM_RC_ONE_PASS;
+ else if (pass == 0)
+ cfg_.g_pass = AOM_RC_FIRST_PASS;
+ else
+ cfg_.g_pass = AOM_RC_LAST_PASS;
+
+ BeginPassHook(pass);
+ testing::internal::scoped_ptr<Encoder> encoder(
+ codec_->CreateEncoder(cfg_, init_flags_, &stats_));
+ ASSERT_TRUE(encoder.get() != NULL);
+
+ ASSERT_NO_FATAL_FAILURE(video->Begin());
+ encoder->InitEncoder(video);
+
+ if (mode_ == kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+ }
+
+ ASSERT_FALSE(::testing::Test::HasFatalFailure());
+
+ testing::internal::scoped_ptr<Decoder> decoder(
+ codec_->CreateDecoder(dec_cfg, 0 /* flags */));
+#if CONFIG_AV1_DECODER
+ if (decoder->IsAV1()) {
+ // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole
+ // frame is decoded.
+ decoder->Control(AV1_SET_TILE_MODE, cfg_.large_scale_tile);
+ decoder->Control(AV1D_EXT_TILE_DEBUG, 1);
+ decoder->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+#endif
+
+ bool again;
+ for (again = true; again; video->Next()) {
+ again = (video->img() != NULL);
+
+ PreEncodeFrameHook(video);
+ PreEncodeFrameHook(video, encoder.get());
+ encoder->EncodeFrame(video, frame_flags_);
+
+ CxDataIterator iter = encoder->GetCxData();
+
+ bool has_cxdata = false;
+ bool has_dxdata = false;
+ while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
+ pkt = MutateEncoderOutputHook(pkt);
+ again = true;
+ switch (pkt->kind) {
+ case AOM_CODEC_CX_FRAME_PKT:
+ has_cxdata = true;
+ if (decoder.get() != NULL && DoDecode()) {
+ aom_codec_err_t res_dec;
+ if (DoDecodeInvisible()) {
+ res_dec = decoder->DecodeFrame(
+ (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
+ } else {
+ res_dec = decoder->DecodeFrame(
+ (const uint8_t *)pkt->data.frame.buf +
+ (pkt->data.frame.sz - pkt->data.frame.vis_frame_size),
+ pkt->data.frame.vis_frame_size);
+ }
+
+ if (!HandleDecodeResult(res_dec, decoder.get())) break;
+
+ has_dxdata = true;
+ }
+ ASSERT_GE(pkt->data.frame.pts, last_pts_);
+ last_pts_ = pkt->data.frame.pts;
+ FramePktHook(pkt);
+ break;
+
+ case AOM_CODEC_PSNR_PKT: PSNRPktHook(pkt); break;
+
+ default: break;
+ }
+ }
+
+ if (has_dxdata && has_cxdata) {
+ const aom_image_t *img_enc = encoder->GetPreviewFrame();
+ DxDataIterator dec_iter = decoder->GetDxData();
+ const aom_image_t *img_dec = dec_iter.Next();
+ if (img_enc && img_dec) {
+ const bool res =
+ compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL);
+ if (!res) { // Mismatch
+ MismatchHook(img_enc, img_dec);
+ }
+ }
+ if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
+ }
+ if (!Continue()) break;
+ }
+
+ EndPassHook();
+
+ if (!Continue()) break;
+ }
+}
+
+} // namespace libaom_test
diff --git a/third_party/aom/test/encode_test_driver.h b/third_party/aom/test/encode_test_driver.h
new file mode 100644
index 000000000..4f3f855cf
--- /dev/null
+++ b/third_party/aom/test/encode_test_driver.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_ENCODE_TEST_DRIVER_H_
+#define AOM_TEST_ENCODE_TEST_DRIVER_H_
+
+#include <string>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#if CONFIG_AV1_ENCODER
+#include "aom/aomcx.h"
+#endif
+#include "aom/aom_encoder.h"
+
+namespace libaom_test {
+
+class CodecFactory;
+class VideoSource;
+
+enum TestMode { kRealTime, kOnePassGood, kTwoPassGood };
+#define ALL_TEST_MODES \
+ ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood, \
+ ::libaom_test::kTwoPassGood)
+
+#define ONE_PASS_TEST_MODES \
+ ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood)
+
+#define TWO_PASS_TEST_MODES ::testing::Values(::libaom_test::kTwoPassGood)
+
+#define NONREALTIME_TEST_MODES \
+ ::testing::Values(::libaom_test::kOnePassGood, ::libaom_test::kTwoPassGood)
+
+// Provides an object to handle the libaom get_cx_data() iteration pattern
+class CxDataIterator {
+ public:
+ explicit CxDataIterator(aom_codec_ctx_t *encoder)
+ : encoder_(encoder), iter_(NULL) {}
+
+ const aom_codec_cx_pkt_t *Next() {
+ return aom_codec_get_cx_data(encoder_, &iter_);
+ }
+
+ private:
+ aom_codec_ctx_t *encoder_;
+ aom_codec_iter_t iter_;
+};
+
+// Implements an in-memory store for libaom twopass statistics
+class TwopassStatsStore {
+ public:
+ void Append(const aom_codec_cx_pkt_t &pkt) {
+ buffer_.append(reinterpret_cast<char *>(pkt.data.twopass_stats.buf),
+ pkt.data.twopass_stats.sz);
+ }
+
+ aom_fixed_buf_t buf() {
+ const aom_fixed_buf_t buf = { &buffer_[0], buffer_.size() };
+ return buf;
+ }
+
+ void Reset() { buffer_.clear(); }
+
+ protected:
+ std::string buffer_;
+};
+
+// Provides a simplified interface to manage one video encoding pass, given
+// a configuration and video source.
+//
+// TODO(jkoleszar): The exact services it provides and the appropriate
+// level of abstraction will be fleshed out as more tests are written.
+class Encoder {
+ public:
+ Encoder(aom_codec_enc_cfg_t cfg, const uint32_t init_flags,
+ TwopassStatsStore *stats)
+ : cfg_(cfg), init_flags_(init_flags), stats_(stats) {
+ memset(&encoder_, 0, sizeof(encoder_));
+ }
+
+ virtual ~Encoder() { aom_codec_destroy(&encoder_); }
+
+ CxDataIterator GetCxData() { return CxDataIterator(&encoder_); }
+
+ void InitEncoder(VideoSource *video);
+
+ const aom_image_t *GetPreviewFrame() {
+ return aom_codec_get_preview_frame(&encoder_);
+ }
+ // This is a thin wrapper around aom_codec_encode(), so refer to
+ // aom_encoder.h for its semantics.
+ void EncodeFrame(VideoSource *video, const unsigned long frame_flags);
+
+ // Convenience wrapper for EncodeFrame()
+ void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); }
+
+ void Control(int ctrl_id, int arg) {
+ const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, int *arg) {
+ const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+ void Control(int ctrl_id, struct aom_scaling_mode *arg) {
+ const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+
+#if CONFIG_AV1_ENCODER
+ void Control(int ctrl_id, aom_active_map_t *arg) {
+ const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ }
+#endif
+
+ void Config(const aom_codec_enc_cfg_t *cfg) {
+ const aom_codec_err_t res = aom_codec_enc_config_set(&encoder_, cfg);
+ ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+ cfg_ = *cfg;
+ }
+
+ protected:
+ virtual aom_codec_iface_t *CodecInterface() const = 0;
+
+ const char *EncoderError() {
+ const char *detail = aom_codec_error_detail(&encoder_);
+ return detail ? detail : aom_codec_error(&encoder_);
+ }
+
+ // Encode an image
+ void EncodeFrameInternal(const VideoSource &video,
+ const unsigned long frame_flags);
+
+ // Flush the encoder on EOS
+ void Flush();
+
+ aom_codec_ctx_t encoder_;
+ aom_codec_enc_cfg_t cfg_;
+ unsigned long init_flags_;
+ TwopassStatsStore *stats_;
+};
+
+// Common test functionality for all Encoder tests.
+//
+// This class is a mixin which provides the main loop common to all
+// encoder tests. It provides hooks which can be overridden by subclasses
+// to implement each test's specific behavior, while centralizing the bulk
+// of the boilerplate. Note that it doesn't inherit the gtest testing
+// classes directly, so that tests can be parameterized differently.
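+//
+// A minimal usage sketch (names here are illustrative, not part of the API):
+//
+//   class MyEncoderTest : public ::libaom_test::CodecTestWithParam<TestMode>,
+//                         public ::libaom_test::EncoderTest {
+//    protected:
+//     MyEncoderTest() : EncoderTest(GET_PARAM(0)) {}
+//     virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {
+//       if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, 2);
+//     }
+//   };
+//   // In the test body: InitializeConfig(); SetMode(GET_PARAM(1));
+//   // then construct a VideoSource and call RunLoop(&video).
+//
+// See encode_perf_test.cc and end_to_end_test.cc for complete examples.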
+class EncoderTest {
+ protected:
+ explicit EncoderTest(const CodecFactory *codec)
+ : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
+ last_pts_(0), mode_(kRealTime) {
+ // Default to 1 thread.
+ cfg_.g_threads = 1;
+ }
+
+ virtual ~EncoderTest() {}
+
+ // Initialize the cfg_ member with the default configuration.
+ void InitializeConfig();
+
+ // Map the TestMode enum to the passes_ variables.
+ void SetMode(TestMode mode);
+
+ // Set encoder flag.
+ void set_init_flags(unsigned long flag) { // NOLINT(runtime/int)
+ init_flags_ = flag;
+ }
+
+ // Main loop
+ virtual void RunLoop(VideoSource *video);
+
+ // Hook to be called at the beginning of a pass.
+ virtual void BeginPassHook(unsigned int /*pass*/) {}
+
+ // Hook to be called at the end of a pass.
+ virtual void EndPassHook() {}
+
+ // Hook to be called before encoding a frame.
+ virtual void PreEncodeFrameHook(VideoSource * /*video*/) {}
+ virtual void PreEncodeFrameHook(VideoSource * /*video*/,
+ Encoder * /*encoder*/) {}
+
+ // Hook to be called on every compressed data packet.
+ virtual void FramePktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
+
+ // Hook to be called on every PSNR packet.
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
+
+ // Hook to determine whether the encode loop should continue.
+ virtual bool Continue() const {
+ return !(::testing::Test::HasFatalFailure() || abort_);
+ }
+
+ // Hook to determine whether to decode frame after encoding
+ virtual bool DoDecode() const { return true; }
+
+ // Hook to determine whether to decode invisible frames after encoding
+ virtual bool DoDecodeInvisible() const { return true; }
+
+ // Hook to handle encode/decode mismatch
+ virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2);
+
+ // Hook to be called on every decompressed frame.
+ virtual void DecompressedFrameHook(const aom_image_t & /*img*/,
+ aom_codec_pts_t /*pts*/) {}
+
+ // Hook to be called to handle decode result. Return true to continue.
+ virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ Decoder *decoder) {
+ EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
+ return AOM_CODEC_OK == res_dec;
+ }
+
+ // Hook that can modify the encoder's output data
+ virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook(
+ const aom_codec_cx_pkt_t *pkt) {
+ return pkt;
+ }
+
+ const CodecFactory *codec_;
+ bool abort_;
+ aom_codec_enc_cfg_t cfg_;
+ unsigned int passes_;
+ TwopassStatsStore stats_;
+ unsigned long init_flags_;
+ unsigned long frame_flags_;
+ aom_codec_pts_t last_pts_;
+ TestMode mode_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_ENCODE_TEST_DRIVER_H_
diff --git a/third_party/aom/test/encodetxb_test.cc b/third_party/aom/test/encodetxb_test.cc
new file mode 100644
index 000000000..11cc07368
--- /dev/null
+++ b/third_party/aom/test/encodetxb_test.cc
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/idct.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+typedef void (*GetNzMapContextsFunc)(const uint8_t *const levels,
+ const int16_t *const scan,
+ const uint16_t eob, const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts);
+
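+// Compares an optimized av1_get_nz_map_contexts implementation against the C
+// reference over every transform class, transform size, and eob value, and
+// provides a (disabled by default) speed test for the largest-eob case.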
+class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> {
+ public:
+ EncodeTxbTest() : get_nz_map_contexts_func_(GetParam()) {}
+
+ virtual ~EncodeTxbTest() {}
+
+ virtual void SetUp() {
+ coeff_contexts_ref_ = reinterpret_cast<int8_t *>(
+ aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE));
+ ASSERT_TRUE(coeff_contexts_ref_ != NULL);
+ coeff_contexts_ = reinterpret_cast<int8_t *>(
+ aom_memalign(16, sizeof(*coeff_contexts_) * MAX_TX_SQUARE));
+ ASSERT_TRUE(coeff_contexts_ != NULL);
+ }
+
+ virtual void TearDown() {
+ aom_free(coeff_contexts_ref_);
+ aom_free(coeff_contexts_);
+ libaom_test::ClearSystemState();
+ }
+
+ void GetNzMapContextsRun() {
+ const int kNumTests = 10;
+ int result = 0;
+
+ for (int is_inter = 0; is_inter < 2; ++is_inter) {
+ for (int tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+ const int bwl = get_txb_bwl((TX_SIZE)tx_size);
+ const int width = get_txb_wide((TX_SIZE)tx_size);
+ const int height = get_txb_high((TX_SIZE)tx_size);
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan;
+
+ levels_ = set_levels(levels_buf_, width);
+ for (int i = 0; i < kNumTests && !result; ++i) {
+ for (int eob = 1; eob <= width * height && !result; ++eob) {
+ InitDataWithEob(scan, bwl, eob);
+
+ av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size,
+ tx_class, coeff_contexts_ref_);
+ get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+ tx_class, coeff_contexts_);
+
+ result = Compare(scan, eob);
+
+ EXPECT_EQ(result, 0)
+ << " tx_class " << tx_class << " width " << real_width
+ << " height " << real_height << " eob " << eob;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void SpeedTestGetNzMapContextsRun() {
+ const int kNumTests = 2000000000;
+ aom_usec_timer timer;
+
+ printf("Note: Only test the largest possible eob case!\n");
+ for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+ const int bwl = get_txb_bwl((TX_SIZE)tx_size);
+ const int width = get_txb_wide((TX_SIZE)tx_size);
+ const int height = get_txb_high((TX_SIZE)tx_size);
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const TX_TYPE tx_type = DCT_DCT;
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan;
+ const int eob = width * height;
+ const int numTests = kNumTests / (width * height);
+
+ levels_ = set_levels(levels_buf_, width);
+ InitDataWithEob(scan, bwl, eob);
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < numTests; ++i) {
+ get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+ tx_class, coeff_contexts_);
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("get_nz_map_contexts_%2dx%2d: %7.1f ms\n", real_width, real_height,
+ elapsed_time / 1000.0);
+ }
+ }
+
+ private:
+ void InitDataWithEob(const int16_t *const scan, const int bwl,
+ const int eob) {
+ memset(levels_buf_, 0, sizeof(levels_buf_));
+ memset(coeff_contexts_, 0, sizeof(*coeff_contexts_) * MAX_TX_SQUARE);
+
+ for (int c = 0; c < eob; ++c) {
+ levels_[get_padded_idx(scan[c], bwl)] =
+ static_cast<uint8_t>(clamp(rnd_.Rand8(), 0, INT8_MAX));
+ coeff_contexts_[scan[c]] = rnd_.Rand16() >> 1;
+ }
+
+ memcpy(coeff_contexts_ref_, coeff_contexts_,
+ sizeof(*coeff_contexts_) * MAX_TX_SQUARE);
+ }
+
+ bool Compare(const int16_t *const scan, const int eob) const {
+ bool result = false;
+ if (memcmp(coeff_contexts_, coeff_contexts_ref_,
+ sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE)) {
+ for (int i = 0; i < eob; i++) {
+ const int pos = scan[i];
+ if (coeff_contexts_ref_[pos] != coeff_contexts_[pos]) {
+ printf("coeff_contexts_[%d] diff:%6d (ref),%6d (opt)\n", pos,
+ coeff_contexts_ref_[pos], coeff_contexts_[pos]);
+ result = true;
+ break;
+ }
+ }
+ }
+ return result;
+ }
+
+ GetNzMapContextsFunc get_nz_map_contexts_func_;
+ ACMRandom rnd_;
+ uint8_t levels_buf_[TX_PAD_2D];
+ uint8_t *levels_;
+ int8_t *coeff_contexts_ref_;
+ int8_t *coeff_contexts_;
+};
+
+TEST_P(EncodeTxbTest, GetNzMapContexts) { GetNzMapContextsRun(); }
+
+TEST_P(EncodeTxbTest, DISABLED_SpeedTestGetNzMapContexts) {
+ SpeedTestGetNzMapContextsRun();
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, EncodeTxbTest,
+ ::testing::Values(av1_get_nz_map_contexts_sse2));
+#endif
+
+typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff,
+ const int width, const int height,
+ uint8_t *const levels);
+
+typedef ::testing::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam;
+
+class EncodeTxbInitLevelTest
+ : public ::testing::TestWithParam<TxbInitLevelParam> {
+ public:
+ virtual ~EncodeTxbInitLevelTest() {}
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+ void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
+};
+
+void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func,
+ int tx_size, int is_speed) {
+ const int width = get_txb_wide((TX_SIZE)tx_size);
+ const int height = get_txb_high((TX_SIZE)tx_size);
+ tran_low_t coeff[MAX_TX_SQUARE];
+
+ uint8_t levels_buf[2][TX_PAD_2D];
+ uint8_t *const levels0 = set_levels(levels_buf[0], width);
+ uint8_t *const levels1 = set_levels(levels_buf[1], width);
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int i = 0; i < width * height; i++) {
+ coeff[i] = rnd.Rand15Signed() + rnd.Rand15Signed();
+ }
+ for (int i = 0; i < TX_PAD_2D; i++) {
+ levels_buf[0][i] = rnd.Rand8();
+ levels_buf[1][i] = rnd.Rand8();
+ }
+ const int run_times = is_speed ? (width * height) * 10000 : 1;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_txb_init_levels_c(coeff, width, height, levels0);
+ }
+ const double t1 = get_time_mark(&timer);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ test_func(coeff, width, height, levels1);
+ }
+ const double t2 = get_time_mark(&timer);
+ if (is_speed) {
+ printf("init %3dx%-3d:%7.2f/%7.2fns", width, height, t1, t2);
+ printf("(%3.2f)\n", t1 / t2);
+ }
+ const int stride = width + TX_PAD_HOR;
+ for (int r = 0; r < height + TX_PAD_VER; ++r) {
+ for (int c = 0; c < stride; ++c) {
+ ASSERT_EQ(levels_buf[0][c + r * stride], levels_buf[1][c + r * stride])
+ << "[" << r << "," << c << "] " << run_times << width << "x"
+ << height;
+ }
+ }
+}
+
+TEST_P(EncodeTxbInitLevelTest, match) {
+ RunTest(GET_PARAM(0), GET_PARAM(1), 0);
+}
+
+TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) {
+ RunTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, EncodeTxbInitLevelTest,
+ ::testing::Combine(::testing::Values(&av1_txb_init_levels_sse4_1),
+ ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, EncodeTxbInitLevelTest,
+ ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
+ ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)));
+#endif
+} // namespace
diff --git a/third_party/aom/test/end_to_end_test.cc b/third_party/aom/test/end_to_end_test.cc
new file mode 100644
index 000000000..1ac0ae931
--- /dev/null
+++ b/third_party/aom/test/end_to_end_test.cc
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kWidth = 160;
+const unsigned int kHeight = 90;
+const unsigned int kFramerate = 50;
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+// List of PSNR thresholds for speed settings 0-7 and 5 encoding modes.
+const double kPsnrThreshold[][5] = {
+// Note:
+// The AV1 HBD average PSNR is slightly lower than the AV1 LBD average PSNR.
+// Two sets of thresholds are provided here to keep the tests enabled while
+// still guarding picture quality.
+#if CONFIG_AV1_ENCODER
+ { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 31.0, 36.0, 36.0, 36.0, 36.0 },
+ { 31.0, 35.0, 35.0, 35.0, 35.0 }, { 31.0, 34.0, 34.0, 34.0, 34.0 },
+ { 31.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
+ { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#else
+ { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
+ { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
+ { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
+ { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#endif // CONFIG_AV1_ENCODER
+};
+
+typedef struct {
+ const char *filename;
+ unsigned int input_bit_depth;
+ aom_img_fmt fmt;
+ aom_bit_depth_t bit_depth;
+ unsigned int profile;
+} TestVideoParam;
+
+const TestVideoParam kTestVectors[] = {
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+ { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+ { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+ { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+ { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+ { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+ { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+ { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+ { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+};
+
+// Encoding modes tested
+const libaom_test::TestMode kEncodingModeVectors[] = {
+ ::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood,
+ ::libaom_test::kRealTime,
+};
+
+// Speed settings tested
+const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 };
+
+int is_extension_y4m(const char *filename) {
+ const char *dot = strrchr(filename, '.');
+ if (!dot || dot == filename)
+ return 0;
+ else
+ return !strcmp(dot, ".y4m");
+}
+
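+// EndToEndTest encodes each test vector at the configured speed setting and
+// encoding mode, decodes the result in the driver loop, and checks that the
+// average PSNR stays above the matching entry in kPsnrThreshold.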
+class EndToEndTest
+ : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+ TestVideoParam, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ EndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(2)),
+ cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0),
+ encoding_mode_(GET_PARAM(1)) {}
+
+ virtual ~EndToEndTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_VBR;
+ } else {
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ }
+ }
+
+ virtual void BeginPassHook(unsigned int) {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ // Test screen content coding tools when cpu_used == 1 and the encoding mode is two-pass.
+ if (cpu_used_ == 1 && encoding_mode_ == ::libaom_test::kTwoPassGood)
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ else
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ double GetPsnrThreshold() {
+ return kPsnrThreshold[cpu_used_][encoding_mode_];
+ }
+
+ void DoTest() {
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ testing::internal::scoped_ptr<libaom_test::VideoSource> video;
+ if (is_extension_y4m(test_video_param_.filename)) {
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ kFrames));
+ } else {
+ video.reset(new libaom_test::YUVVideoSource(
+ test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
+ kFramerate, 1, 0, kFrames));
+ }
+ ASSERT_TRUE(video.get() != NULL);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, GetPsnrThreshold())
+ << "cpu used = " << cpu_used_ << ", encoding mode = " << encoding_mode_;
+ }
+
+ TestVideoParam test_video_param_;
+ int cpu_used_;
+
+ private:
+ double psnr_;
+ unsigned int nframes_;
+ libaom_test::TestMode encoding_mode_;
+};
+
+class EndToEndTestLarge : public EndToEndTest {};
+
+TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
+ ::testing::ValuesIn(kEncodingModeVectors),
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::ValuesIn(kCpuUsedVectors));
+
+AV1_INSTANTIATE_TEST_CASE(EndToEndTest,
+ ::testing::Values(kEncodingModeVectors[0]),
+ ::testing::Values(kTestVectors[2]), // 444
+ ::testing::Values(kCpuUsedVectors[2]));
+} // namespace
diff --git a/third_party/aom/test/error_block_test.cc b/third_party/aom/test/error_block_test.cc
new file mode 100644
index 000000000..353947c3d
--- /dev/null
+++ b/third_party/aom/test/error_block_test.cc
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int kNumIterations = 1000;
+
+typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bps);
+
+typedef ::testing::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
+ ErrorBlockParam;
+
+class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
+ public:
+ virtual ~ErrorBlockTest() {}
+ virtual void SetUp() {
+ error_block_op_ = GET_PARAM(0);
+ ref_error_block_op_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ aom_bit_depth_t bit_depth_;
+ ErrorBlockFunc error_block_op_;
+ ErrorBlockFunc ref_error_block_op_;
+};
+
+TEST_P(ErrorBlockTest, OperationCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+ int err_count_total = 0;
+ int first_failure = -1;
+ intptr_t block_size;
+ int64_t ssz;
+ int64_t ret;
+ int64_t ref_ssz;
+ int64_t ref_ret;
+ const int msb = bit_depth_ + 8 - 1;
+ for (int i = 0; i < kNumIterations; ++i) {
+ int err_count = 0;
+ block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4, ..., 64x64
+ for (int j = 0; j < block_size; j++) {
+ // coeff and dqcoeff will always have at least the same sign, and this
+ // can be used for optimization, so generate test input precisely.
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << msb);
+ dqcoeff[j] = rnd(1 << msb);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << msb);
+ dqcoeff[j] = -rnd(1 << msb);
+ }
+ }
+ ref_ret =
+ ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+ err_count += (ref_ret != ret) | (ref_ssz != ssz);
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Error Block Test, C output doesn't match optimized output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(ErrorBlockTest, ExtremeValues) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+ int err_count_total = 0;
+ int first_failure = -1;
+ intptr_t block_size;
+ int64_t ssz;
+ int64_t ret;
+ int64_t ref_ssz;
+ int64_t ref_ret;
+ const int msb = bit_depth_ + 8 - 1;
+ int max_val = ((1 << msb) - 1);
+ for (int i = 0; i < kNumIterations; ++i) {
+ int err_count = 0;
+ int k = (i / 9) % 9;
+
+ // Change the maximum coeff value, to test different bit boundaries
+ if (k == 8 && (i % 9) == 0) {
+ max_val >>= 1;
+ }
+ block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4, ..., 64x64
+ for (int j = 0; j < block_size; j++) {
+ if (k < 4) {
+ // Test at positive maximum values
+ coeff[j] = k % 2 ? max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+ } else if (k < 8) {
+ // Test at negative maximum values
+ coeff[j] = k % 2 ? -max_val : 0;
+ dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
+ } else {
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << 14);
+ dqcoeff[j] = rnd(1 << 14);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << 14);
+ dqcoeff[j] = -rnd(1 << 14);
+ }
+ }
+ }
+ ref_ret =
+ ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+ err_count += (ref_ret != ret) | (ref_ssz != ssz);
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+ << "Error: Error Block Test, C output doesn't match optimized output. "
+ << "First failed at test case " << first_failure;
+}
+
+#if (HAVE_SSE2 || HAVE_AVX)
+using ::testing::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, ErrorBlockTest,
+ ::testing::Values(make_tuple(&av1_highbd_block_error_sse2,
+ &av1_highbd_block_error_c, AOM_BITS_10),
+ make_tuple(&av1_highbd_block_error_sse2,
+ &av1_highbd_block_error_c, AOM_BITS_12),
+ make_tuple(&av1_highbd_block_error_sse2,
+ &av1_highbd_block_error_c, AOM_BITS_8)));
+#endif  // (HAVE_SSE2 || HAVE_AVX)
+} // namespace
diff --git a/third_party/aom/test/error_resilience_test.cc b/third_party/aom/test/error_resilience_test.cc
new file mode 100644
index 000000000..13ac0bf93
--- /dev/null
+++ b/third_party/aom/test/error_resilience_test.cc
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kMaxErrorFrames = 12;
+const int kMaxInvisibleErrorFrames = 12;
+const int kMaxDroppableFrames = 12;
+const int kMaxErrorResilientFrames = 12;
+const int kMaxNoMFMVFrames = 12;
+const int kMaxPrimRefNoneFrames = 12;
+const int kMaxSFrames = 12;
+const int kCpuUsed = 1;
+
+class ErrorResilienceTestLarge
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ErrorResilienceTestLarge()
+ : EncoderTest(GET_PARAM(0)), psnr_(0.0), nframes_(0), mismatch_psnr_(0.0),
+ mismatch_nframes_(0), encoding_mode_(GET_PARAM(1)), allow_mismatch_(0) {
+ Reset();
+ }
+
+ virtual ~ErrorResilienceTestLarge() {}
+
+ void Reset() {
+ error_nframes_ = 0;
+ invisible_error_nframes_ = 0;
+ droppable_nframes_ = 0;
+ error_resilient_nframes_ = 0;
+ nomfmv_nframes_ = 0;
+ prim_ref_none_nframes_ = 0;
+ s_nframes_ = 0;
+ }
+
+ void SetupEncoder(int bitrate, int lag) {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = bitrate;
+ cfg_.kf_mode = AOM_KF_DISABLED;
+ cfg_.g_lag_in_frames = lag;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ }
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ decoded_nframes_ = 0;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
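+  // Applies the per-frame encoder flags configured through the Set*Frames()
+  // helpers: frames in the droppable list are encoded without updating any
+  // reference buffer, and frames in the error-resilient, no-MFMV,
+  // PRIMARY_REF_NONE and S-frame lists get the corresponding AOM_EFLAG_* bit
+  // set before encoding.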
+ virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+ frame_flags_ &=
+ ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+ AOM_EFLAG_NO_REF_FRAME_MVS | AOM_EFLAG_ERROR_RESILIENT |
+ AOM_EFLAG_SET_S_FRAME | AOM_EFLAG_SET_PRIMARY_REF_NONE);
+ if (droppable_nframes_ > 0 &&
+ (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+ for (unsigned int i = 0; i < droppable_nframes_; ++i) {
+ if (droppable_frames_[i] == video->frame()) {
+ std::cout << " Encoding droppable frame: "
+ << droppable_frames_[i] << "\n";
+ frame_flags_ |= (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+ AOM_EFLAG_NO_UPD_ARF);
+ break;
+ }
+ }
+ }
+
+ if (error_resilient_nframes_ > 0 &&
+ (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+ for (unsigned int i = 0; i < error_resilient_nframes_; ++i) {
+ if (error_resilient_frames_[i] == video->frame()) {
+ std::cout << " Encoding error_resilient frame: "
+ << error_resilient_frames_[i] << "\n";
+ frame_flags_ |= AOM_EFLAG_ERROR_RESILIENT;
+ break;
+ }
+ }
+ }
+
+ if (nomfmv_nframes_ > 0 &&
+ (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+ for (unsigned int i = 0; i < nomfmv_nframes_; ++i) {
+ if (nomfmv_frames_[i] == video->frame()) {
+ std::cout << " Encoding no mfmv frame: "
+ << nomfmv_frames_[i] << "\n";
+ frame_flags_ |= AOM_EFLAG_NO_REF_FRAME_MVS;
+ break;
+ }
+ }
+ }
+
+ if (prim_ref_none_nframes_ > 0 &&
+ (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+ for (unsigned int i = 0; i < prim_ref_none_nframes_; ++i) {
+ if (prim_ref_none_frames_[i] == video->frame()) {
+ std::cout << " Encoding no PRIMARY_REF_NONE frame: "
+ << prim_ref_none_frames_[i] << "\n";
+ frame_flags_ |= AOM_EFLAG_SET_PRIMARY_REF_NONE;
+ break;
+ }
+ }
+ }
+
+ encoder->Control(AV1E_SET_S_FRAME_MODE, 0);
+ if (s_nframes_ > 0 &&
+ (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+ for (unsigned int i = 0; i < s_nframes_; ++i) {
+ if (s_frames_[i] == video->frame()) {
+ std::cout << " Encoding S frame: " << s_frames_[i]
+ << "\n";
+ frame_flags_ |= AOM_EFLAG_SET_S_FRAME;
+ break;
+ }
+ }
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ double GetAverageMismatchPsnr() const {
+ if (mismatch_nframes_) return mismatch_psnr_ / mismatch_nframes_;
+ return 0.0;
+ }
+
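+  // Frames listed via SetErrorFrames() are skipped by the decoder to
+  // simulate transmission losses; everything else is decoded normally.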
+ virtual bool DoDecode() const {
+ if (error_nframes_ > 0 &&
+ (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+ for (unsigned int i = 0; i < error_nframes_; ++i) {
+ if (error_frames_[i] == nframes_ - 1) {
+ std::cout << " Skipping decoding frame: "
+ << error_frames_[i] << "\n";
+ return 0;
+ }
+ }
+ }
+ return 1;
+ }
+
+ virtual bool DoDecodeInvisible() const {
+ if (invisible_error_nframes_ > 0 &&
+ (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+ for (unsigned int i = 0; i < invisible_error_nframes_; ++i) {
+ if (invisible_error_frames_[i] == nframes_ - 1) {
+ std::cout << " Skipping decoding all invisible frames in "
+ "frame pkt: "
+ << invisible_error_frames_[i] << "\n";
+ return 0;
+ }
+ }
+ }
+ return 1;
+ }
+
+ virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+ if (allow_mismatch_) {
+ double mismatch_psnr = compute_psnr(img1, img2);
+ mismatch_psnr_ += mismatch_psnr;
+ ++mismatch_nframes_;
+ // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
+ } else {
+ ::libaom_test::EncoderTest::MismatchHook(img1, img2);
+ }
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) {
+ (void)img;
+ (void)pts;
+ ++decoded_nframes_;
+ }
+
+ void SetErrorFrames(int num, unsigned int *list) {
+ if (num > kMaxErrorFrames)
+ num = kMaxErrorFrames;
+ else if (num < 0)
+ num = 0;
+ error_nframes_ = num;
+ for (unsigned int i = 0; i < error_nframes_; ++i)
+ error_frames_[i] = list[i];
+ }
+
+ void SetInvisibleErrorFrames(int num, unsigned int *list) {
+ if (num > kMaxInvisibleErrorFrames)
+ num = kMaxInvisibleErrorFrames;
+ else if (num < 0)
+ num = 0;
+ invisible_error_nframes_ = num;
+ for (unsigned int i = 0; i < invisible_error_nframes_; ++i)
+ invisible_error_frames_[i] = list[i];
+ }
+
+ void SetDroppableFrames(int num, unsigned int *list) {
+ if (num > kMaxDroppableFrames)
+ num = kMaxDroppableFrames;
+ else if (num < 0)
+ num = 0;
+ droppable_nframes_ = num;
+ for (unsigned int i = 0; i < droppable_nframes_; ++i)
+ droppable_frames_[i] = list[i];
+ }
+
+ void SetErrorResilientFrames(int num, unsigned int *list) {
+ if (num > kMaxErrorResilientFrames)
+ num = kMaxErrorResilientFrames;
+ else if (num < 0)
+ num = 0;
+ error_resilient_nframes_ = num;
+ for (unsigned int i = 0; i < error_resilient_nframes_; ++i)
+ error_resilient_frames_[i] = list[i];
+ }
+
+ void SetNoMFMVFrames(int num, unsigned int *list) {
+ if (num > kMaxNoMFMVFrames)
+ num = kMaxNoMFMVFrames;
+ else if (num < 0)
+ num = 0;
+ nomfmv_nframes_ = num;
+ for (unsigned int i = 0; i < nomfmv_nframes_; ++i)
+ nomfmv_frames_[i] = list[i];
+ }
+
+ void SetPrimaryRefNoneFrames(int num, unsigned int *list) {
+ if (num > kMaxPrimRefNoneFrames)
+ num = kMaxPrimRefNoneFrames;
+ else if (num < 0)
+ num = 0;
+ prim_ref_none_nframes_ = num;
+ for (unsigned int i = 0; i < prim_ref_none_nframes_; ++i)
+ prim_ref_none_frames_[i] = list[i];
+ }
+
+ void SetSFrames(int num, unsigned int *list) {
+ if (num > kMaxSFrames)
+ num = kMaxSFrames;
+ else if (num < 0)
+ num = 0;
+ s_nframes_ = num;
+ for (unsigned int i = 0; i < s_nframes_; ++i) s_frames_[i] = list[i];
+ }
+
+ unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+ unsigned int GetEncodedFrames() { return nframes_; }
+ unsigned int GetDecodedFrames() { return decoded_nframes_; }
+
+ void SetAllowMismatch(int allow) { allow_mismatch_ = allow; }
+
+ private:
+ double psnr_;
+ unsigned int nframes_;
+ unsigned int decoded_nframes_;
+ unsigned int error_nframes_;
+ unsigned int invisible_error_nframes_;
+ unsigned int droppable_nframes_;
+ unsigned int error_resilient_nframes_;
+ unsigned int nomfmv_nframes_;
+ unsigned int prim_ref_none_nframes_;
+ unsigned int s_nframes_;
+ double mismatch_psnr_;
+ unsigned int mismatch_nframes_;
+ unsigned int error_frames_[kMaxErrorFrames];
+ unsigned int invisible_error_frames_[kMaxInvisibleErrorFrames];
+ unsigned int droppable_frames_[kMaxDroppableFrames];
+ unsigned int error_resilient_frames_[kMaxErrorResilientFrames];
+ unsigned int nomfmv_frames_[kMaxNoMFMVFrames];
+ unsigned int prim_ref_none_frames_[kMaxPrimRefNoneFrames];
+ unsigned int s_frames_[kMaxSFrames];
+ libaom_test::TestMode encoding_mode_;
+ int allow_mismatch_;
+};
+
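+// Encode the clip twice, once with error resilience off everywhere and once
+// with a handful of error-resilient frames, and check that the PSNR cost of
+// turning the mode on for those frames stays within 10%.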
+TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
+ SetupEncoder(2000, 10);
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 12);
+
+ // Global error resilient mode OFF.
+ cfg_.g_error_resilient = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_resilience_off = GetAveragePsnr();
+ EXPECT_GT(psnr_resilience_off, 25.0);
+
+ Reset();
+ // Error resilient mode ON for certain frames
+ unsigned int num_error_resilient_frames = 5;
+ unsigned int error_resilient_frame_list[] = { 3, 5, 6, 9, 11 };
+ SetErrorResilientFrames(num_error_resilient_frames,
+ error_resilient_frame_list);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_resilience_on = GetAveragePsnr();
+ EXPECT_GT(psnr_resilience_on, 25.0);
+
+ // Test that turning on error resilient mode hurts by 10% at most.
+ if (psnr_resilience_off > 0.0) {
+ const double psnr_ratio = psnr_resilience_on / psnr_resilience_off;
+ EXPECT_GE(psnr_ratio, 0.9);
+ EXPECT_LE(psnr_ratio, 1.1);
+ }
+}
+
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) a set of droppable
+// frames (i.e., frames that don't update any reference buffers).
+TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
+ SetupEncoder(500, 10);
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 20);
+
+ // Set an arbitrary set of error frames same as droppable frames.
+ unsigned int num_droppable_frames = 3;
+ unsigned int droppable_frame_list[] = { 5, 10, 13 };
+ SetDroppableFrames(num_droppable_frames, droppable_frame_list);
+ SetErrorFrames(num_droppable_frames, droppable_frame_list);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Test that no mismatches have been found
+ std::cout << " Encoded frames: " << GetEncodedFrames() << "\n";
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_droppable_frames);
+}
+
+// Check the ParseAbility property of an error-resilient frame.
+// Encode a frame in error-resilient mode (E-frame), and disallow all
+// subsequent frames from using MFMV. If frames are dropped before the
+// E-frame, all frames starting from the E-frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, ParseAbilityTest) {
+ SetupEncoder(500, 10);
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 15);
+
+ SetAllowMismatch(1);
+
+ // Note that an E-frame cannot be forced on a frame that is a
+ // show_existing_frame, or a frame that comes directly after an invisible
+ // frame. Currently, this will cause an assertion failure.
+ // Set an arbitrary error resilient (E) frame
+ unsigned int num_error_resilient_frames = 1;
+ unsigned int error_resilient_frame_list[] = { 8 };
+ SetErrorResilientFrames(num_error_resilient_frames,
+ error_resilient_frame_list);
+ // Ensure that any invisible frames before the E frame are dropped
+ SetInvisibleErrorFrames(num_error_resilient_frames,
+ error_resilient_frame_list);
+ // Set all frames after the error resilient frame to not allow MFMV
+ unsigned int num_post_error_resilient_frames = 6;
+ unsigned int post_error_resilient_frame_list[] = { 9, 10, 11, 12, 13, 14 };
+ SetNoMFMVFrames(num_post_error_resilient_frames,
+ post_error_resilient_frame_list);
+
+ // Set a few frames before the E frame that are lost (not decoded)
+ unsigned int num_error_frames = 5;
+ unsigned int error_frame_list[] = { 3, 4, 5, 6, 7 };
+ SetErrorFrames(num_error_frames, error_frame_list);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::cout << " Encoded frames: " << GetEncodedFrames() << "\n";
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+ // All frames following the E-frame and the E-frame are expected to have
+ // mismatches, but still be parse-able.
+ EXPECT_LE(GetMismatchFrames(), num_post_error_resilient_frames + 1);
+}
+
+// Check the ParseAbility property of an S-frame.
+// Encode an S-frame. If frames are dropped before the S-frame, all frames
+// starting from the S-frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, SFrameTest) {
+ SetupEncoder(500, 10);
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 15);
+
+ SetAllowMismatch(1);
+
+ // Note that an S-frame cannot be forced on a frame that is a
+ // show_existing_frame. This issue still needs to be addressed.
+ // Set an arbitrary S-frame
+ unsigned int num_s_frames = 1;
+ unsigned int s_frame_list[] = { 6 };
+ SetSFrames(num_s_frames, s_frame_list);
+ // Ensure that any invisible frames before the S frame are dropped
+ SetInvisibleErrorFrames(num_s_frames, s_frame_list);
+
+ // Set a few frames before the S frame that are lost (not decoded)
+ unsigned int num_error_frames = 4;
+ unsigned int error_frame_list[] = { 2, 3, 4, 5 };
+ SetErrorFrames(num_error_frames, error_frame_list);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::cout << " Encoded frames: " << GetEncodedFrames() << "\n";
+ std::cout << " Decoded frames: " << GetDecodedFrames() << "\n";
+ std::cout << " Mismatch frames: " << GetMismatchFrames() << "\n";
+ EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+ // All frames following the S-frame and the S-frame are expected to have
+ // mismatches, but still be parse-able.
+ EXPECT_LE(GetMismatchFrames(), GetEncodedFrames() - s_frame_list[0]);
+}
+
+AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES);
+} // namespace
diff --git a/third_party/aom/test/ethread_test.cc b/third_party/aom/test/ethread_test.cc
new file mode 100644
index 000000000..d9ac78282
--- /dev/null
+++ b/third_party/aom/test/ethread_test.cc
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+class AVxEncoderThreadTest
+ : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+ int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AVxEncoderThreadTest()
+ : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+ encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)),
+ tile_cols_(GET_PARAM(3)), tile_rows_(GET_PARAM(4)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = 1280;
+ cfg.h = 720;
+ cfg.allow_lowbitdepth = 1;
+ decoder_ = codec_->CreateDecoder(cfg, 0);
+ if (decoder_->IsAV1()) {
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+
+ size_enc_.clear();
+ md5_dec_.clear();
+ md5_enc_.clear();
+ }
+ virtual ~AVxEncoderThreadTest() { delete decoder_; }
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ } else {
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_error_resilient = 1;
+ }
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_min_quantizer = 0;
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+ encoder_initialized_ = false;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+ ::libaom_test::Encoder *encoder) {
+ if (!encoder_initialized_) {
+ SetTileSize(encoder);
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+ } else {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+ encoder->Control(AV1E_SET_AQ_MODE, 3);
+ }
+ encoder_initialized_ = true;
+ }
+ }
+
+ virtual void SetTileSize(libaom_test::Encoder *encoder) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ size_enc_.push_back(pkt->data.frame.sz);
+
+ ::libaom_test::MD5 md5_enc;
+ md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz);
+ md5_enc_.push_back(md5_enc.Get());
+
+ const aom_codec_err_t res = decoder_->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ const aom_image_t *img = decoder_->GetDxData().Next();
+
+ if (img) {
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(img);
+ md5_dec_.push_back(md5_res.Get());
+ }
+ }
+
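+  // Encodes the clip with a single thread and then with multiple threads
+  // (with and without row-based multi-threading) and requires the compressed
+  // frame sizes and the encoder/decoder MD5s to match bit-exactly.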
+ void DoTest() {
+ ::libaom_test::YUVVideoSource video(
+ "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 15, 21);
+ cfg_.rc_target_bitrate = 1000;
+
+ // Encode using single thread.
+ row_mt_ = 0;
+ cfg_.g_threads = 1;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> single_thr_size_enc;
+ std::vector<std::string> single_thr_md5_enc;
+ std::vector<std::string> single_thr_md5_dec;
+ single_thr_size_enc = size_enc_;
+ single_thr_md5_enc = md5_enc_;
+ single_thr_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Encode using multiple threads.
+ cfg_.g_threads = 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr_size_enc;
+ std::vector<std::string> multi_thr_md5_enc;
+ std::vector<std::string> multi_thr_md5_dec;
+ multi_thr_size_enc = size_enc_;
+ multi_thr_md5_enc = md5_enc_;
+ multi_thr_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Check that the vectors are equal.
+ ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
+ ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
+ ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
+
+ // Encode using multiple threads row-mt enabled.
+ row_mt_ = 1;
+ cfg_.g_threads = 2;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr2_row_mt_size_enc;
+ std::vector<std::string> multi_thr2_row_mt_md5_enc;
+ std::vector<std::string> multi_thr2_row_mt_md5_dec;
+ multi_thr2_row_mt_size_enc = size_enc_;
+ multi_thr2_row_mt_md5_enc = md5_enc_;
+ multi_thr2_row_mt_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Disable threads=3 test for now to reduce the time so that the nightly
+ // test would not time out.
+ // cfg_.g_threads = 3;
+ // ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // std::vector<size_t> multi_thr3_row_mt_size_enc;
+ // std::vector<std::string> multi_thr3_row_mt_md5_enc;
+ // std::vector<std::string> multi_thr3_row_mt_md5_dec;
+ // multi_thr3_row_mt_size_enc = size_enc_;
+ // multi_thr3_row_mt_md5_enc = md5_enc_;
+ // multi_thr3_row_mt_md5_dec = md5_dec_;
+ // size_enc_.clear();
+ // md5_enc_.clear();
+ // md5_dec_.clear();
+ // Check that the vectors are equal.
+ // ASSERT_EQ(multi_thr3_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+ // ASSERT_EQ(multi_thr3_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+ // ASSERT_EQ(multi_thr3_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+
+ cfg_.g_threads = 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr4_row_mt_size_enc;
+ std::vector<std::string> multi_thr4_row_mt_md5_enc;
+ std::vector<std::string> multi_thr4_row_mt_md5_dec;
+ multi_thr4_row_mt_size_enc = size_enc_;
+ multi_thr4_row_mt_md5_enc = md5_enc_;
+ multi_thr4_row_mt_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Check that the vectors are equal.
+ ASSERT_EQ(multi_thr4_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+ ASSERT_EQ(multi_thr4_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+ ASSERT_EQ(multi_thr4_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+ }
+
+ bool encoder_initialized_;
+ ::libaom_test::TestMode encoding_mode_;
+ int set_cpu_used_;
+ int tile_cols_;
+ int tile_rows_;
+ int row_mt_;
+ ::libaom_test::Decoder *decoder_;
+ std::vector<size_t> size_enc_;
+ std::vector<std::string> md5_enc_;
+ std::vector<std::string> md5_dec_;
+};
+
+TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
+ cfg_.large_scale_tile = 0;
+ decoder_->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {};
+
+TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
+ cfg_.large_scale_tile = 0;
+ decoder_->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+// For AV1, only test speed 0 to 3.
+// Here test cpu_used 2 and 3
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(2, 4), ::testing::Values(0, 2),
+ ::testing::Values(0, 1));
+
+// Test cpu_used 0 and 1.
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(0, 2), ::testing::Values(0, 1, 2, 6),
+ ::testing::Values(0, 1, 2, 6));
+
+class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
+ virtual void SetTileSize(libaom_test::Encoder *encoder) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+ }
+};
+
+TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
+ cfg_.large_scale_tile = 1;
+ decoder_->Control(AV1_SET_TILE_MODE, 1);
+ decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ DoTest();
+}
+
+class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {};
+
+TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
+ cfg_.large_scale_tile = 1;
+ decoder_->Control(AV1_SET_TILE_MODE, 1);
+ decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(0, 4), ::testing::Values(0, 6),
+ ::testing::Values(0, 6));
+} // namespace
diff --git a/third_party/aom/test/examples.sh b/third_party/aom/test/examples.sh
new file mode 100755
index 000000000..2cdb89dd0
--- /dev/null
+++ b/third_party/aom/test/examples.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file runs all of the tests for the libaom examples.
+##
+. $(dirname $0)/tools_common.sh
+
+example_tests=$(ls -r $(dirname $0)/*.sh)
+
+# List of script names to exclude.
+exclude_list="best_encode examples run_encodes tools_common"
+
+# Filter out the scripts in $exclude_list.
+for word in ${exclude_list}; do
+ example_tests=$(filter_strings "${example_tests}" "${word}" exclude)
+done
+
+for test in ${example_tests}; do
+ # Source each test script so that exporting variables can be avoided.
+ AOM_TEST_NAME="$(basename ${test%.*})"
+ . "${test}"
+done
diff --git a/third_party/aom/test/external_frame_buffer_test.cc b/third_party/aom/test/external_frame_buffer_test.cc
new file mode 100644
index 000000000..c2af059a4
--- /dev/null
+++ b/third_party/aom/test/external_frame_buffer_test.cc
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kVideoNameParam = 1;
+
+struct ExternalFrameBuffer {
+ uint8_t *data;
+ size_t size;
+ int in_use;
+};
+
+// Class to manipulate a list of external frame buffers.
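+// CreateBufferList() allocates the bookkeeping entries, GetFreeFrameBuffer()
+// lazily (re)allocates an entry to at least |min_size| bytes and marks it in
+// use, and ReturnFrameBuffer() marks it free again.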
+class ExternalFrameBufferList {
+ public:
+ ExternalFrameBufferList()
+ : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {}
+
+ virtual ~ExternalFrameBufferList() {
+ for (int i = 0; i < num_buffers_; ++i) {
+ delete[] ext_fb_list_[i].data;
+ }
+ delete[] ext_fb_list_;
+ }
+
+ // Creates the list to hold the external buffers. Returns true on success.
+ bool CreateBufferList(int num_buffers) {
+ if (num_buffers < 0) return false;
+
+ num_buffers_ = num_buffers;
+ ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
+ EXPECT_TRUE(ext_fb_list_ != NULL);
+ memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
+ return true;
+ }
+
+ // Searches the frame buffer list for a free frame buffer. Makes sure
+ // that the frame buffer is at least |min_size| in bytes. Marks that the
+  // frame buffer is in use by libaom. Finally sets |fb| to point to the
+ // external frame buffer. Returns < 0 on an error.
+ int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+ EXPECT_TRUE(fb != NULL);
+ const int idx = FindFreeBufferIndex();
+ if (idx == num_buffers_) return -1;
+
+ if (ext_fb_list_[idx].size < min_size) {
+ delete[] ext_fb_list_[idx].data;
+ ext_fb_list_[idx].data = new uint8_t[min_size];
+ memset(ext_fb_list_[idx].data, 0, min_size);
+ ext_fb_list_[idx].size = min_size;
+ }
+
+ SetFrameBuffer(idx, fb);
+
+ num_used_buffers_++;
+ return 0;
+ }
+
+ // Test function that will not allocate any data for the frame buffer.
+ // Returns < 0 on an error.
+ int GetZeroFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+ EXPECT_TRUE(fb != NULL);
+ const int idx = FindFreeBufferIndex();
+ if (idx == num_buffers_) return -1;
+
+ if (ext_fb_list_[idx].size < min_size) {
+ delete[] ext_fb_list_[idx].data;
+ ext_fb_list_[idx].data = NULL;
+ ext_fb_list_[idx].size = min_size;
+ }
+
+ SetFrameBuffer(idx, fb);
+ return 0;
+ }
+
+ // Marks the external frame buffer that |fb| is pointing to as free.
+ // Returns < 0 on an error.
+ int ReturnFrameBuffer(aom_codec_frame_buffer_t *fb) {
+ if (fb == NULL) {
+ EXPECT_TRUE(fb != NULL);
+ return -1;
+ }
+ ExternalFrameBuffer *const ext_fb =
+ reinterpret_cast<ExternalFrameBuffer *>(fb->priv);
+ if (ext_fb == NULL) {
+ EXPECT_TRUE(ext_fb != NULL);
+ return -1;
+ }
+ EXPECT_EQ(1, ext_fb->in_use);
+ ext_fb->in_use = 0;
+ num_used_buffers_--;
+ return 0;
+ }
+
+ // Checks that the ximage data is contained within the external frame buffer
+ // private data passed back in the ximage.
+ void CheckXImageFrameBuffer(const aom_image_t *img) {
+ if (img->fb_priv != NULL) {
+ const struct ExternalFrameBuffer *const ext_fb =
+ reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
+
+ ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+ img->planes[0] < (ext_fb->data + ext_fb->size));
+ }
+ }
+
+ int num_used_buffers() const { return num_used_buffers_; }
+
+ private:
+ // Returns the index of the first free frame buffer. Returns |num_buffers_|
+ // if there are no free frame buffers.
+ int FindFreeBufferIndex() {
+ int i;
+ // Find a free frame buffer.
+ for (i = 0; i < num_buffers_; ++i) {
+ if (!ext_fb_list_[i].in_use) break;
+ }
+ return i;
+ }
+
+ // Sets |fb| to an external frame buffer. idx is the index into the frame
+ // buffer list.
+ void SetFrameBuffer(int idx, aom_codec_frame_buffer_t *fb) {
+ ASSERT_TRUE(fb != NULL);
+ fb->data = ext_fb_list_[idx].data;
+ fb->size = ext_fb_list_[idx].size;
+ ASSERT_EQ(0, ext_fb_list_[idx].in_use);
+ ext_fb_list_[idx].in_use = 1;
+ fb->priv = &ext_fb_list_[idx];
+ }
+
+ int num_buffers_;
+ int num_used_buffers_;
+ ExternalFrameBuffer *ext_fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+
+// Callback used by libaom to request the application to return a frame
+// buffer of at least |min_size| in bytes.
+int get_aom_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferList *const fb_list =
+ reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+ return fb_list->GetFreeFrameBuffer(min_size, fb);
+}
+
+// Callback used by libaom to tell the application that |fb| is not needed
+// anymore.
+int release_aom_frame_buffer(void *user_priv, aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferList *const fb_list =
+ reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+ return fb_list->ReturnFrameBuffer(fb);
+}
+
+// Callback will not allocate data for frame buffer.
+int get_aom_zero_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferList *const fb_list =
+ reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+ return fb_list->GetZeroFrameBuffer(min_size, fb);
+}
+
+// Callback will allocate one less byte than |min_size|.
+int get_aom_one_less_byte_frame_buffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferList *const fb_list =
+ reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+ return fb_list->GetFreeFrameBuffer(min_size - 1, fb);
+}
+
+// Callback will not release the external frame buffer.
+int do_not_release_aom_frame_buffer(void *user_priv,
+ aom_codec_frame_buffer_t *fb) {
+ (void)user_priv;
+ (void)fb;
+ return 0;
+}
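+
+// A minimal sketch of the wiring used in the tests below (assuming a
+// libaom_test::Decoder named |decoder|):
+//   ExternalFrameBufferList fb_list;
+//   fb_list.CreateBufferList(num_buffers);
+//   decoder->SetFrameBufferFunctions(get_aom_frame_buffer,
+//                                    release_aom_frame_buffer, &fb_list);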
+
+#endif // CONFIG_WEBM_IO
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferMD5Test
+ : public ::libaom_test::DecoderTest,
+ public ::libaom_test::CodecTestWithParam<const char *> {
+ protected:
+ ExternalFrameBufferMD5Test()
+ : DecoderTest(GET_PARAM(::libaom_test::kCodecFactoryParam)),
+ md5_file_(NULL), num_buffers_(0) {}
+
+ virtual ~ExternalFrameBufferMD5Test() {
+ if (md5_file_ != NULL) fclose(md5_file_);
+ }
+
+ virtual void PreDecodeFrameHook(
+ const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) {
+ if (num_buffers_ > 0 && video.frame_number() == 0) {
+      // Have libaom use frame buffers we create.
+ ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+ ASSERT_EQ(AOM_CODEC_OK,
+ decoder->SetFrameBufferFunctions(GetAV1FrameBuffer,
+ ReleaseAV1FrameBuffer, this));
+ }
+ }
+
+ void OpenMD5File(const std::string &md5_file_name_) {
+ md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+ ASSERT_TRUE(md5_file_ != NULL)
+ << "Md5 file open failed. Filename: " << md5_file_name_;
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ const unsigned int frame_number) {
+ ASSERT_TRUE(md5_file_ != NULL);
+ char expected_md5[33];
+ char junk[128];
+
+ // Read correct md5 checksums.
+ const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+ ASSERT_NE(EOF, res) << "Read md5 data failed";
+ expected_md5[32] = '\0';
+
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(&img);
+ const char *const actual_md5 = md5_res.Get();
+
+ // Check md5 match.
+ ASSERT_STREQ(expected_md5, actual_md5)
+ << "Md5 checksums don't match: frame number = " << frame_number;
+ }
+
+ // Callback to get a free external frame buffer. Return value < 0 is an
+ // error.
+ static int GetAV1FrameBuffer(void *user_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferMD5Test *const md5Test =
+ reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+ return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb);
+ }
+
+ // Callback to release an external frame buffer. Return value < 0 is an
+ // error.
+ static int ReleaseAV1FrameBuffer(void *user_priv,
+ aom_codec_frame_buffer_t *fb) {
+ ExternalFrameBufferMD5Test *const md5Test =
+ reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+ return md5Test->fb_list_.ReturnFrameBuffer(fb);
+ }
+
+ void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+ int num_buffers() const { return num_buffers_; }
+
+ private:
+ FILE *md5_file_;
+ int num_buffers_;
+ ExternalFrameBufferList fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+const char kAV1TestFile[] = "av1-1-b8-01-size-226x226.ivf";
+const char kAV1NonRefTestFile[] = "av1-1-b8-01-size-226x226.ivf";
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+ ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {}
+
+ virtual void SetUp() {
+ video_ = new libaom_test::IVFVideoSource(kAV1TestFile);
+ ASSERT_TRUE(video_ != NULL);
+ video_->Init();
+ video_->Begin();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+ ASSERT_TRUE(decoder_ != NULL);
+ }
+
+ virtual void TearDown() {
+ delete decoder_;
+ decoder_ = NULL;
+ delete video_;
+ video_ = NULL;
+ }
+
+  // Passes the external frame buffer information to libaom.
+ aom_codec_err_t SetFrameBufferFunctions(
+ int num_buffers, aom_get_frame_buffer_cb_fn_t cb_get,
+ aom_release_frame_buffer_cb_fn_t cb_release) {
+ if (num_buffers > 0) {
+ num_buffers_ = num_buffers;
+ EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+ }
+
+ return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_);
+ }
+
+ aom_codec_err_t DecodeOneFrame() {
+ const aom_codec_err_t res =
+ decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+ CheckDecodedFrames();
+ if (res == AOM_CODEC_OK) video_->Next();
+ return res;
+ }
+
+ aom_codec_err_t DecodeRemainingFrames() {
+ for (; video_->cxdata() != NULL; video_->Next()) {
+ const aom_codec_err_t res =
+ decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+ if (res != AOM_CODEC_OK) return res;
+ CheckDecodedFrames();
+ }
+ return AOM_CODEC_OK;
+ }
+
+ protected:
+ void CheckDecodedFrames() {
+ libaom_test::DxDataIterator dec_iter = decoder_->GetDxData();
+ const aom_image_t *img = NULL;
+
+ // Get decompressed data
+ while ((img = dec_iter.Next()) != NULL) {
+ fb_list_.CheckXImageFrameBuffer(img);
+ }
+ }
+
+ libaom_test::IVFVideoSource *video_;
+ libaom_test::AV1Decoder *decoder_;
+ int num_buffers_;
+ ExternalFrameBufferList fb_list_;
+};
+
+class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
+ protected:
+ virtual void SetUp() {
+ video_ = new libaom_test::IVFVideoSource(kAV1NonRefTestFile);
+ ASSERT_TRUE(video_ != NULL);
+ video_->Init();
+ video_->Begin();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+ ASSERT_TRUE(decoder_ != NULL);
+ }
+
+ virtual void CheckFrameBufferRelease() {
+ TearDown();
+ ASSERT_EQ(0, fb_list_.num_used_buffers());
+ }
+};
+#endif // CONFIG_WEBM_IO
+
+// This test runs through the set of test vectors and decodes them. Libaom
+// will call into the application to allocate a frame buffer when needed. An
+// MD5 checksum is computed for each decoded frame and compared against the
+// expected checksum from the .md5 file; the test passes only if they all
+// match.
+TEST_P(ExternalFrameBufferMD5Test, DISABLED_ExtFBMD5Match) {
+ const std::string filename = GET_PARAM(kVideoNameParam);
+
+ // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS +
+ // #AOM_MAXIMUM_WORK_BUFFERS + four jitter buffers.
+ const int jitter_buffers = 4;
+ const int num_buffers =
+ AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+ set_num_buffers(num_buffers);
+
+ // Open compressed video file.
+ testing::internal::scoped_ptr<libaom_test::CompressedVideoSource> video;
+ if (filename.substr(filename.length() - 3, 3) == "ivf") {
+ video.reset(new libaom_test::IVFVideoSource(filename));
+ } else {
+#if CONFIG_WEBM_IO
+ video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+ fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+ filename.c_str());
+ return;
+#endif
+ }
+ ASSERT_TRUE(video.get() != NULL);
+ video->Init();
+
+ // Construct md5 file name.
+ const std::string md5_filename = filename + ".md5";
+ OpenMD5File(md5_filename);
+
+ // Decode frame, and check the md5 matching.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+#if CONFIG_WEBM_IO
+TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
+ // Minimum number of external frame buffers for AV1 is
+ // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS.
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, EightJitterBuffers) {
+ // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS +
+ // #AOM_MAXIMUM_WORK_BUFFERS + eight jitter buffers.
+ const int jitter_buffers = 8;
+ const int num_buffers =
+ AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, DISABLED_NotEnoughBuffers) {
+ // Minimum number of external frame buffers for AV1 is
+ // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS. Most files will
+ // only use 5 frame buffers at one time.
+ const int num_buffers = 2;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
+ // Only run this on long clips. Decoding a very short clip will return
+ // AOM_CODEC_OK even with only 2 buffers.
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, DISABLED_NoRelease) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ do_not_release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NullRealloc) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_zero_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions(
+ num_buffers, get_aom_one_less_byte_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, NullGetFunction) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(
+ AOM_CODEC_INVALID_PARAM,
+ SetFrameBufferFunctions(num_buffers, NULL, release_aom_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferTest, NullReleaseFunction) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_INVALID_PARAM,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, NULL));
+}
+
+TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
+ ASSERT_EQ(AOM_CODEC_ERROR,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) {
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+ ASSERT_EQ(AOM_CODEC_OK,
+ SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
+ release_aom_frame_buffer));
+ ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+ CheckFrameBufferRelease();
+}
+#endif // CONFIG_WEBM_IO
+
+AV1_INSTANTIATE_TEST_CASE(
+ ExternalFrameBufferMD5Test,
+ ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+ libaom_test::kAV1TestVectors +
+ libaom_test::kNumAV1TestVectors));
+} // namespace
diff --git a/third_party/aom/test/fft_test.cc b/third_party/aom/test/fft_test.cc
new file mode 100644
index 000000000..e24e451a3
--- /dev/null
+++ b/third_party/aom/test/fft_test.cc
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <algorithm>
+#include <complex>
+#include <vector>
+
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef void (*tform_fun_t)(const float *input, float *temp, float *output);
+
+// Simple 1D FFT implementation
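+// (radix-2 decimation in time): the input is split into even- and odd-indexed
+// samples, each half is transformed recursively, and the halves are combined
+// with the twiddle factors w = exp(-2*pi*i*k/n).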
+template <typename InputType>
+void fft(const InputType *data, std::complex<float> *result, int n) {
+ if (n == 1) {
+ result[0] = data[0];
+ return;
+ }
+ std::vector<InputType> temp(n);
+ for (int k = 0; k < n / 2; ++k) {
+ temp[k] = data[2 * k];
+ temp[n / 2 + k] = data[2 * k + 1];
+ }
+ fft(&temp[0], result, n / 2);
+ fft(&temp[n / 2], result + n / 2, n / 2);
+ for (int k = 0; k < n / 2; ++k) {
+ std::complex<float> w = std::complex<float>((float)cos(2. * PI * k / n),
+ (float)-sin(2. * PI * k / n));
+ std::complex<float> a = result[k];
+ std::complex<float> b = result[n / 2 + k];
+ result[k] = a + w * b;
+ result[n / 2 + k] = a - w * b;
+ }
+}
+
+void transpose(std::vector<std::complex<float> > *data, int n) {
+ for (int y = 0; y < n; ++y) {
+ for (int x = y + 1; x < n; ++x) {
+ std::swap((*data)[y * n + x], (*data)[x * n + y]);
+ }
+ }
+}
+
+// Simple 2D FFT implementation
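+// using row-column decomposition: 1D-transform every row, transpose,
+// 1D-transform again, and transpose back.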
+template <class InputType>
+std::vector<std::complex<float> > fft2d(const InputType *input, int n) {
+ std::vector<std::complex<float> > rowfft(n * n);
+ std::vector<std::complex<float> > result(n * n);
+ for (int y = 0; y < n; ++y) {
+ fft(input + y * n, &rowfft[y * n], n);
+ }
+ transpose(&rowfft, n);
+ for (int y = 0; y < n; ++y) {
+ fft(&rowfft[y * n], &result[y * n], n);
+ }
+ transpose(&result, n);
+ return result;
+}
+
+struct FFTTestArg {
+ int n;
+ void (*fft)(const float *input, float *temp, float *output);
+ FFTTestArg(int n_in, tform_fun_t fft_in) : n(n_in), fft(fft_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const FFTTestArg &test_arg) {
+ return os << "fft_arg { n:" << test_arg.n << " fft:" << test_arg.fft << " }";
+}
+
+class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
+ protected:
+ void SetUp() {
+ int n = GetParam().n;
+ input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
+ temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
+ output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n * 2);
+ memset(input_, 0, sizeof(*input_) * n * n);
+ memset(temp_, 0, sizeof(*temp_) * n * n);
+ memset(output_, 0, sizeof(*output_) * n * n * 2);
+ }
+ void TearDown() {
+ aom_free(input_);
+ aom_free(temp_);
+ aom_free(output_);
+ }
+ float *input_;
+ float *temp_;
+ float *output_;
+};
+
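+// Feeds a unit impulse at each input position in turn and compares the
+// interleaved real/imaginary output of the function under test against the
+// reference fft2d() for the first n/2 + 1 columns of each row (the
+// non-redundant part of the spectrum of a real input).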
+TEST_P(FFT2DTest, Correct) {
+ int n = GetParam().n;
+ for (int i = 0; i < n * n; ++i) {
+ input_[i] = 1;
+ std::vector<std::complex<float> > expected = fft2d<float>(&input_[0], n);
+ GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < (n / 2) + 1; ++x) {
+ EXPECT_NEAR(expected[y * n + x].real(), output_[2 * (y * n + x)], 1e-5);
+ EXPECT_NEAR(expected[y * n + x].imag(), output_[2 * (y * n + x) + 1],
+ 1e-5);
+ }
+ }
+ input_[i] = 0;
+ }
+}
+
+TEST_P(FFT2DTest, Benchmark) {
+ int n = GetParam().n;
+ float sum = 0;
+ for (int i = 0; i < 1000 * (64 - n); ++i) {
+ input_[i % (n * n)] = 1;
+ GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+ sum += output_[0];
+ input_[i % (n * n)] = 0;
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(C, FFT2DTest,
+ ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c),
+ FFTTestArg(4, aom_fft4x4_float_c),
+ FFTTestArg(8, aom_fft8x8_float_c),
+ FFTTestArg(16, aom_fft16x16_float_c),
+ FFTTestArg(32,
+ aom_fft32x32_float_c)));
+#if ARCH_X86 || ARCH_X86_64
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, FFT2DTest,
+ ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2),
+ FFTTestArg(8, aom_fft8x8_float_sse2),
+ FFTTestArg(16, aom_fft16x16_float_sse2),
+ FFTTestArg(32, aom_fft32x32_float_sse2)));
+#endif // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, FFT2DTest,
+ ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2),
+ FFTTestArg(16, aom_fft16x16_float_avx2),
+ FFTTestArg(32, aom_fft32x32_float_avx2)));
+#endif // HAVE_AVX2
+#endif // ARCH_X86 || ARCH_X86_64
+
+struct IFFTTestArg {
+ int n;
+ tform_fun_t ifft;
+ IFFTTestArg(int n_in, tform_fun_t ifft_in) : n(n_in), ifft(ifft_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const IFFTTestArg &test_arg) {
+ return os << "ifft_arg { n:" << test_arg.n << " fft:" << test_arg.ifft
+ << " }";
+}
+
+class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
+ protected:
+ void SetUp() {
+ int n = GetParam().n;
+ input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
+ temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
+ output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n);
+ memset(input_, 0, sizeof(*input_) * n * n * 2);
+ memset(temp_, 0, sizeof(*temp_) * n * n * 2);
+ memset(output_, 0, sizeof(*output_) * n * n);
+ }
+ void TearDown() {
+ aom_free(input_);
+ aom_free(temp_);
+ aom_free(output_);
+ }
+ float *input_;
+ float *temp_;
+ float *output_;
+};
+
+TEST_P(IFFT2DTest, Correctness) {
+ int n = GetParam().n;
+ ASSERT_GE(n, 2);
+ std::vector<float> expected(n * n);
+ std::vector<float> actual(n * n);
+ // Do forward transform then invert to make sure we get back expected
+ for (int y = 0; y < n; ++y) {
+ for (int x = 0; x < n; ++x) {
+ expected[y * n + x] = 1;
+ std::vector<std::complex<float> > input_c = fft2d(&expected[0], n);
+ for (int i = 0; i < n * n; ++i) {
+ input_[2 * i + 0] = input_c[i].real();
+ input_[2 * i + 1] = input_c[i].imag();
+ }
+ GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+
+ for (int yy = 0; yy < n; ++yy) {
+ for (int xx = 0; xx < n; ++xx) {
+ EXPECT_NEAR(expected[yy * n + xx], output_[yy * n + xx] / (n * n),
+ 1e-5);
+ }
+ }
+ expected[y * n + x] = 0;
+ }
+ }
+}
+
+TEST_P(IFFT2DTest, Benchmark) {
+ int n = GetParam().n;
+ float sum = 0;
+ for (int i = 0; i < 1000 * (64 - n); ++i) {
+ input_[i % (n * n)] = 1;
+ GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+ sum += output_[0];
+ input_[i % (n * n)] = 0;
+ }
+}
+INSTANTIATE_TEST_CASE_P(
+ C, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c),
+ IFFTTestArg(4, aom_ifft4x4_float_c),
+ IFFTTestArg(8, aom_ifft8x8_float_c),
+ IFFTTestArg(16, aom_ifft16x16_float_c),
+ IFFTTestArg(32, aom_ifft32x32_float_c)));
+#if ARCH_X86 || ARCH_X86_64
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2),
+ IFFTTestArg(8, aom_ifft8x8_float_sse2),
+ IFFTTestArg(16, aom_ifft16x16_float_sse2),
+ IFFTTestArg(32, aom_ifft32x32_float_sse2)));
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, IFFT2DTest,
+ ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2),
+ IFFTTestArg(16, aom_ifft16x16_float_avx2),
+ IFFTTestArg(32, aom_ifft32x32_float_avx2)));
+#endif // HAVE_AVX2
+#endif // ARCH_X86 || ARCH_X86_64
+
+} // namespace
diff --git a/third_party/aom/test/film_grain_table_test.cc b/third_party/aom/test/film_grain_table_test.cc
new file mode 100644
index 000000000..524d67d7b
--- /dev/null
+++ b/third_party/aom/test/film_grain_table_test.cc
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "aom_dsp/grain_table.h"
+#include "aom/internal/aom_codec_internal.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "test/video_source.h"
+
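+// Field-by-field comparison of two film grain parameter sets; a few fields
+// are intentionally skipped, as noted inline below.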
+void grain_equal(const aom_film_grain_t *expected,
+ const aom_film_grain_t *actual) {
+ EXPECT_EQ(expected->apply_grain, actual->apply_grain);
+ EXPECT_EQ(expected->update_parameters, actual->update_parameters);
+ if (!expected->update_parameters) return;
+ EXPECT_EQ(expected->num_y_points, actual->num_y_points);
+ EXPECT_EQ(expected->num_cb_points, actual->num_cb_points);
+ EXPECT_EQ(expected->num_cr_points, actual->num_cr_points);
+ EXPECT_EQ(0, memcmp(expected->scaling_points_y, actual->scaling_points_y,
+ expected->num_y_points *
+ sizeof(expected->scaling_points_y[0])));
+ EXPECT_EQ(0, memcmp(expected->scaling_points_cb, actual->scaling_points_cb,
+ expected->num_cb_points *
+ sizeof(expected->scaling_points_cb[0])));
+ EXPECT_EQ(0, memcmp(expected->scaling_points_cr, actual->scaling_points_cr,
+ expected->num_cr_points *
+ sizeof(expected->scaling_points_cr[0])));
+ EXPECT_EQ(expected->scaling_shift, actual->scaling_shift);
+ EXPECT_EQ(expected->ar_coeff_lag, actual->ar_coeff_lag);
+ EXPECT_EQ(expected->ar_coeff_shift, actual->ar_coeff_shift);
+
+ const int num_pos_luma =
+ 2 * expected->ar_coeff_lag * (expected->ar_coeff_lag + 1);
+ const int num_pos_chroma = num_pos_luma;
+ EXPECT_EQ(0, memcmp(expected->ar_coeffs_y, actual->ar_coeffs_y,
+ sizeof(expected->ar_coeffs_y[0]) * num_pos_luma));
+ if (actual->num_cb_points || actual->chroma_scaling_from_luma) {
+ EXPECT_EQ(0, memcmp(expected->ar_coeffs_cb, actual->ar_coeffs_cb,
+ sizeof(expected->ar_coeffs_cb[0]) * num_pos_chroma));
+ }
+ if (actual->num_cr_points || actual->chroma_scaling_from_luma) {
+ EXPECT_EQ(0, memcmp(expected->ar_coeffs_cr, actual->ar_coeffs_cr,
+ sizeof(expected->ar_coeffs_cr[0]) * num_pos_chroma));
+ }
+ EXPECT_EQ(expected->overlap_flag, actual->overlap_flag);
+ EXPECT_EQ(expected->chroma_scaling_from_luma,
+ actual->chroma_scaling_from_luma);
+ EXPECT_EQ(expected->grain_scale_shift, actual->grain_scale_shift);
+ // EXPECT_EQ(expected->random_seed, actual->random_seed);
+
+ // clip_to_restricted and bit_depth aren't written
+ if (expected->num_cb_points) {
+ EXPECT_EQ(expected->cb_mult, actual->cb_mult);
+ EXPECT_EQ(expected->cb_luma_mult, actual->cb_luma_mult);
+ EXPECT_EQ(expected->cb_offset, actual->cb_offset);
+ }
+ if (expected->num_cr_points) {
+ EXPECT_EQ(expected->cr_mult, actual->cr_mult);
+ EXPECT_EQ(expected->cr_luma_mult, actual->cr_luma_mult);
+ EXPECT_EQ(expected->cr_offset, actual->cr_offset);
+ }
+}
+
+TEST(FilmGrainTableTest, AddAndLookupSingleSegment) {
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_t grain;
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+
+ aom_film_grain_table_append(&table, 1000, 2000, film_grain_test_vectors + 0);
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+
+ grain.bit_depth = film_grain_test_vectors[0].bit_depth;
+ EXPECT_EQ(0, memcmp(&grain, film_grain_test_vectors + 0, sizeof(table)));
+
+ // Extend the existing segment
+ aom_film_grain_table_append(&table, 2000, 3000, film_grain_test_vectors + 0);
+ EXPECT_EQ(0, table.head->next);
+
+ // Lookup and remove and check that the entry is no longer there
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain));
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, true, &grain));
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+
+ EXPECT_EQ(0, table.head);
+ EXPECT_EQ(0, table.tail);
+ aom_film_grain_table_free(&table);
+}
+
+TEST(FilmGrainTableTest, SplitSingleSegment) {
+ aom_film_grain_table_t table;
+ aom_film_grain_t grain;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0);
+
+ // Test lookup and remove that adjusts start time
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 100, true, &grain));
+ EXPECT_EQ(NULL, table.head->next);
+ EXPECT_EQ(100, table.head->start_time);
+
+ // Test lookup and remove that adjusts end time
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 900, 1000, true, &grain));
+ EXPECT_EQ(NULL, table.head->next);
+ EXPECT_EQ(100, table.head->start_time);
+ EXPECT_EQ(900, table.head->end_time);
+
+ // Test lookup and remove that splits the first entry
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, 400, 600, true, &grain));
+ EXPECT_EQ(100, table.head->start_time);
+ EXPECT_EQ(400, table.head->end_time);
+
+ ASSERT_NE((void *)NULL, table.head->next);
+ EXPECT_EQ(table.tail, table.head->next);
+ EXPECT_EQ(600, table.head->next->start_time);
+ EXPECT_EQ(900, table.head->next->end_time);
+
+ aom_film_grain_table_free(&table);
+}
+
+TEST(FilmGrainTableTest, AddAndLookupMultipleSegments) {
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_t grain;
+ const int kNumTestVectors =
+ sizeof(film_grain_test_vectors) / sizeof(film_grain_test_vectors[0]);
+ for (int i = 0; i < kNumTestVectors; ++i) {
+ aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000,
+ film_grain_test_vectors + i);
+ }
+
+ for (int i = kNumTestVectors - 1; i >= 0; --i) {
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+ true, &grain));
+ grain_equal(film_grain_test_vectors + i, &grain);
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+ true, &grain));
+ }
+
+ // Verify that all the data has been removed
+ for (int i = 0; i < kNumTestVectors; ++i) {
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+ true, &grain));
+ }
+ aom_film_grain_table_free(&table);
+}
+
+class FilmGrainTableIOTest : public ::testing::Test {
+ protected:
+ void SetUp() { memset(&error_, 0, sizeof(error_)); }
+ struct aom_internal_error_info error_;
+};
+
+TEST_F(FilmGrainTableIOTest, ReadMissingFile) {
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+ ASSERT_EQ(AOM_CODEC_ERROR, aom_film_grain_table_read(
+ &table, "/path/to/missing/file", &error_));
+}
+
+TEST_F(FilmGrainTableIOTest, ReadTruncatedFile) {
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ std::string grain_file;
+ FILE *file = libaom_test::GetTempOutFile(&grain_file);
+ fwrite("deadbeef", 8, 1, file);
+ fclose(file);
+ ASSERT_EQ(AOM_CODEC_ERROR,
+ aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+ EXPECT_EQ(0, remove(grain_file.c_str()));
+}
+
+TEST_F(FilmGrainTableIOTest, RoundTripReadWrite) {
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_t expected_grain[16];
+ const int kNumTestVectors =
+ sizeof(film_grain_test_vectors) / sizeof(film_grain_test_vectors[0]);
+ for (int i = 0; i < kNumTestVectors; ++i) {
+ expected_grain[i] = film_grain_test_vectors[i];
+ expected_grain[i].random_seed = i;
+ expected_grain[i].update_parameters = i % 2;
+ expected_grain[i].apply_grain = (i + 1) % 2;
+ expected_grain[i].bit_depth = 0;
+ aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000,
+ expected_grain + i);
+ }
+ std::string grain_file;
+ fclose(libaom_test::GetTempOutFile(&grain_file));
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_film_grain_table_write(&table, grain_file.c_str(), &error_));
+ aom_film_grain_table_free(&table);
+
+ memset(&table, 0, sizeof(table));
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+ for (int i = 0; i < kNumTestVectors; ++i) {
+ aom_film_grain_t grain;
+ EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000,
+ true, &grain));
+ grain_equal(expected_grain + i, &grain);
+ }
+ aom_film_grain_table_free(&table);
+ EXPECT_EQ(0, remove(grain_file.c_str()));
+}
+
+TEST_F(FilmGrainTableIOTest, RoundTripSplit) {
+ std::string grain_file;
+ fclose(libaom_test::GetTempOutFile(&grain_file));
+
+ aom_film_grain_table_t table;
+ memset(&table, 0, sizeof(table));
+
+ aom_film_grain_t grain = film_grain_test_vectors[0];
+ aom_film_grain_table_append(&table, 0, 3000, &grain);
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain));
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+ EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_film_grain_table_write(&table, grain_file.c_str(), &error_));
+ aom_film_grain_table_free(&table);
+
+ memset(&table, 0, sizeof(table));
+ ASSERT_EQ(AOM_CODEC_OK,
+ aom_film_grain_table_read(&table, grain_file.c_str(), &error_));
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain));
+ ASSERT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain));
+ ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain));
+ aom_film_grain_table_free(&table);
+
+ EXPECT_EQ(0, remove(grain_file.c_str()));
+}
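
For orientation, the tests above exercise a small segment API: the table is a chain of [start_time, end_time) entries, aom_film_grain_table_append() adds or extends a segment, and aom_film_grain_table_lookup() with its boolean argument set to true removes (and, when necessary, splits) the matched range. A minimal round-trip sketch, using only the functions already called in these tests and a placeholder output path:

  aom_film_grain_table_t table;
  struct aom_internal_error_info error;
  memset(&table, 0, sizeof(table));
  memset(&error, 0, sizeof(error));

  // Cover [0, 3000) with one parameter set and persist it.
  aom_film_grain_table_append(&table, 0, 3000, film_grain_test_vectors + 0);
  aom_film_grain_table_write(&table, "grain.tbl", &error);  // placeholder path
  aom_film_grain_table_free(&table);

  // Read it back and consume the middle of the segment, splitting it in two.
  aom_film_grain_t grain;
  memset(&table, 0, sizeof(table));
  aom_film_grain_table_read(&table, "grain.tbl", &error);
  aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain);
  aom_film_grain_table_free(&table);
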
diff --git a/third_party/aom/test/filterintra_test.cc b/third_party/aom/test/filterintra_test.cc
new file mode 100644
index 000000000..597134940
--- /dev/null
+++ b/third_party/aom/test/filterintra_test.cc
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+
+namespace {
+
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+
+typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+ const uint8_t *above, const uint8_t *left, int mode);
+
+// Note:
+// Test parameter list:
+// Reference predictor, optimized predictor, prediction mode, tx size
+//
+typedef tuple<Predictor, Predictor, int> PredFuncMode;
+typedef tuple<PredFuncMode, TX_SIZE> PredParams;
+
+const int MaxTxSize = 32;
+
+const int MaxTestNum = 100;
+
+class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
+ public:
+ virtual ~AV1FilterIntraPredTest() {}
+ virtual void SetUp() {
+ PredFuncMode funcMode = GET_PARAM(0);
+ predFuncRef_ = ::testing::get<0>(funcMode);
+ predFunc_ = ::testing::get<1>(funcMode);
+ mode_ = ::testing::get<2>(funcMode);
+ txSize_ = GET_PARAM(1);
+
+ alloc_ = new uint8_t[2 * MaxTxSize + 1];
+ predRef_ = new uint8_t[MaxTxSize * MaxTxSize];
+ pred_ = new uint8_t[MaxTxSize * MaxTxSize];
+ }
+
+ virtual void TearDown() {
+ delete[] alloc_;
+ delete[] predRef_;
+ delete[] pred_;
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunTest() const {
+ int tstIndex = 0;
+ int stride = tx_size_wide[txSize_];
+ uint8_t *left = alloc_;
+ uint8_t *above = alloc_ + MaxTxSize;
+ while (tstIndex < MaxTestNum) {
+ PrepareBuffer();
+ predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
+ ASM_REGISTER_STATE_CHECK(
+ predFunc_(pred_, stride, txSize_, &above[1], left, mode_));
+ DiffPred(tstIndex);
+ tstIndex += 1;
+ }
+ }
+
+ private:
+ void PrepareBuffer() const {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
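+    // Note: a new generator seeded with DeterministicSeed() is constructed on
+    // every call, so each test iteration fills alloc_ with the same values.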
+ int i = 0;
+ while (i < (2 * MaxTxSize + 1)) {
+ alloc_[i] = rnd.Rand8();
+ i++;
+ }
+ }
+
+ void DiffPred(int testNum) const {
+ int i = 0;
+ while (i < tx_size_wide[txSize_] * tx_size_high[txSize_]) {
+ EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
+ << "Tx size: " << tx_size_wide[txSize_]
+ << "x" << tx_size_high[txSize_] << " "
+ << "Test number: " << testNum;
+ i++;
+ }
+ }
+
+ Predictor predFunc_;
+ Predictor predFuncRef_;
+ int mode_;
+ TX_SIZE txSize_;
+ uint8_t *alloc_;
+ uint8_t *pred_;
+ uint8_t *predRef_;
+};
+
+TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); }
+
+using ::testing::make_tuple;
+
+const PredFuncMode kPredFuncMdArray[] = {
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_DC_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_V_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_H_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_D157_PRED),
+ make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1,
+ FILTER_PAETH_PRED),
+};
+
+const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_4X8,
+ TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16,
+ TX_4X16, TX_16X4, TX_8X32, TX_32X8 };
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1FilterIntraPredTest,
+ ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
+ ::testing::ValuesIn(kTxSize)));
+} // namespace
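
For reference, GET_PARAM(0) in SetUp() is the (reference, optimized, mode) triple and GET_PARAM(1) the transform size, so the Combine() call above instantiates the full cross product of kPredFuncMdArray and kTxSize. A single element of that product, written out explicitly (kDc4x4 is an illustrative name, not part of the test):

  const PredParams kDc4x4 = make_tuple(
      make_tuple(&av1_filter_intra_predictor_c,
                 &av1_filter_intra_predictor_sse4_1, FILTER_DC_PRED),
      TX_4X4);
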
diff --git a/third_party/aom/test/frame_size_tests.cc b/third_party/aom/test/frame_size_tests.cc
new file mode 100644
index 000000000..eaf0b8370
--- /dev/null
+++ b/third_party/aom/test/frame_size_tests.cc
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/video_source.h"
+
+namespace {
+
+class AV1FrameSizeTests : public ::testing::Test,
+ public ::libaom_test::EncoderTest {
+ protected:
+ AV1FrameSizeTests()
+ : EncoderTest(&::libaom_test::kAV1), expected_res_(AOM_CODEC_OK) {}
+ virtual ~AV1FrameSizeTests() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libaom_test::kRealTime);
+ }
+
+ virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
+ libaom_test::Decoder *decoder) {
+ EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
+ return !::testing::Test::HasFailure();
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_CPUUSED, 7);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ int expected_res_;
+};
+
+#if CONFIG_SIZE_LIMIT
+TEST_F(AV1FrameSizeTests, TestInvalidSizes) {
+ ::libaom_test::RandomVideoSource video;
+
+ video.SetSize(DECODE_WIDTH_LIMIT + 16, DECODE_HEIGHT_LIMIT + 16);
+ video.set_limit(2);
+ expected_res_ = AOM_CODEC_CORRUPT_FRAME;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_F(AV1FrameSizeTests, LargeValidSizes) {
+ ::libaom_test::RandomVideoSource video;
+
+ video.SetSize(DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+ video.set_limit(2);
+ expected_res_ = AOM_CODEC_OK;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+#endif
+
+TEST_F(AV1FrameSizeTests, OneByOneVideo) {
+ ::libaom_test::RandomVideoSource video;
+
+ video.SetSize(1, 1);
+ video.set_limit(2);
+ expected_res_ = AOM_CODEC_OK;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+#undef ONE_BY_ONE_VIDEO_NAME
+} // namespace
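
TestInvalidSizes and LargeValidSizes above are compiled only when the build defines CONFIG_SIZE_LIMIT together with the decode limits; those macros come from the aom build configuration, not from this file. A hypothetical configuration under which the guarded block is active (the limit values below are illustrative only):

  // Hypothetical build-time definitions; real values come from the build config.
  #define CONFIG_SIZE_LIMIT 1
  #define DECODE_WIDTH_LIMIT 16384
  #define DECODE_HEIGHT_LIMIT 16384
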
diff --git a/third_party/aom/test/function_equivalence_test.h b/third_party/aom/test/function_equivalence_test.h
new file mode 100644
index 000000000..f27068902
--- /dev/null
+++ b/third_party/aom/test/function_equivalence_test.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace libaom_test {
+// Base class for tests that compare 2 implementations of the same function
+// for equivalence. The template parameter should be pointer to a function
+// that is being tested.
+//
+// The test takes a 3-parameters encapsulating struct 'FuncParam', containing:
+// - Pointer to reference function
+// - Pointer to tested function
+// - Integer bit depth (default to 0).
+//
+// These values are then accessible in the tests as member of params_:
+// params_.ref_func, params_.tst_func, and params_.bit_depth.
+//
+
+template <typename T>
+struct FuncParam {
+ FuncParam(T ref = NULL, T tst = NULL, int bit_depth = 0)
+ : ref_func(ref), tst_func(tst), bit_depth(bit_depth) {}
+ T ref_func;
+ T tst_func;
+ int bit_depth;
+};
+
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const FuncParam<T> &p) {
+ return os << "bit_depth:" << p.bit_depth
+ << " function:" << reinterpret_cast<const void *>(p.ref_func)
+ << " function:" << reinterpret_cast<const void *>(p.tst_func);
+}
+
+template <typename T>
+class FunctionEquivalenceTest : public ::testing::TestWithParam<FuncParam<T> > {
+ public:
+ FunctionEquivalenceTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+ virtual ~FunctionEquivalenceTest() {}
+
+ virtual void SetUp() { params_ = this->GetParam(); }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ ACMRandom rng_;
+ FuncParam<T> params_;
+};
+
+} // namespace libaom_test
+#endif // AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_
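
As a usage sketch, a concrete test derives from FunctionEquivalenceTest with the function-pointer type under test and reads rng_ and params_ from the base class. The SumFunc type, the SumEquivalenceTest fixture, and the instantiation shown in the trailing comment are hypothetical, not part of this header:

  typedef unsigned int (*SumFunc)(const uint8_t *src, int len);

  class SumEquivalenceTest
      : public libaom_test::FunctionEquivalenceTest<SumFunc> {};

  TEST_P(SumEquivalenceTest, RandomBuffers) {
    uint8_t buf[64];
    for (int i = 0; i < 64; ++i) buf[i] = rng_.Rand8();
    // The reference and optimized implementations must agree bit-exactly.
    EXPECT_EQ(params_.ref_func(buf, 64), params_.tst_func(buf, 64));
  }

  // Instantiated elsewhere, e.g.:
  // INSTANTIATE_TEST_CASE_P(C, SumEquivalenceTest,
  //     ::testing::Values(libaom_test::FuncParam<SumFunc>(&sum_c, &sum_opt)));
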
diff --git a/third_party/aom/test/fwht4x4_test.cc b/third_party/aom/test/fwht4x4_test.cc
new file mode 100644
index 000000000..c8d98c519
--- /dev/null
+++ b/third_party/aom/test/fwht4x4_test.cc
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+
+using libaom_test::FhtFunc;
+
+typedef ::testing::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
+ Dct4x4Param;
+
+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+ TxfmParam * /*txfm_param*/) {
+ av1_fwht4x4_c(in, out, stride);
+}
+
+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+ av1_highbd_iwht4x4_16_add_c(in, out, stride, 10);
+}
+
+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+ av1_highbd_iwht4x4_16_add_c(in, out, stride, 12);
+}
+
+class Trans4x4WHT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Dct4x4Param> {
+ public:
+ virtual ~Trans4x4WHT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ pitch_ = 4;
+ height_ = 4;
+ fwd_txfm_ref = fwht4x4_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride);
+ }
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride);
+ }
+
+ FdctFunc fwd_txfm_;
+ IdctFunc inv_txfm_;
+};
+
+TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
+
+TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); }
+
+TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
+
+TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+using ::testing::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+ C, Trans4x4WHT,
+ ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
+ AOM_BITS_10, 16),
+ make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT,
+ AOM_BITS_12, 16)));
+} // namespace
diff --git a/third_party/aom/test/gviz_api.py b/third_party/aom/test/gviz_api.py
new file mode 100755
index 000000000..d3a443dab
--- /dev/null
+++ b/third_party/aom/test/gviz_api.py
@@ -0,0 +1,1087 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+"""Converts Python data into data for Google Visualization API clients.
+
+This library can be used to create a google.visualization.DataTable usable by
+visualizations built on the Google Visualization API. Output formats are raw
+JSON, JSON response, JavaScript, CSV, and HTML table.
+
+See http://code.google.com/apis/visualization/ for documentation on the
+Google Visualization API.
+"""
+
+__author__ = "Amit Weinstein, Misha Seltzer, Jacob Baskin"
+
+import cgi
+import cStringIO
+import csv
+import datetime
+try:
+ import json
+except ImportError:
+ import simplejson as json
+import types
+
+
+class DataTableException(Exception):
+ """The general exception object thrown by DataTable."""
+ pass
+
+
+class DataTableJSONEncoder(json.JSONEncoder):
+ """JSON encoder that handles date/time/datetime objects correctly."""
+
+ def __init__(self):
+ json.JSONEncoder.__init__(self,
+ separators=(",", ":"),
+ ensure_ascii=False)
+
+ def default(self, o):
+ if isinstance(o, datetime.datetime):
+ if o.microsecond == 0:
+ # If the time doesn't have ms-resolution, leave it out to keep
+ # things smaller.
+ return "Date(%d,%d,%d,%d,%d,%d)" % (
+ o.year, o.month - 1, o.day, o.hour, o.minute, o.second)
+ else:
+ return "Date(%d,%d,%d,%d,%d,%d,%d)" % (
+ o.year, o.month - 1, o.day, o.hour, o.minute, o.second,
+ o.microsecond / 1000)
+ elif isinstance(o, datetime.date):
+ return "Date(%d,%d,%d)" % (o.year, o.month - 1, o.day)
+ elif isinstance(o, datetime.time):
+ return [o.hour, o.minute, o.second]
+ else:
+ return super(DataTableJSONEncoder, self).default(o)
+
+
+class DataTable(object):
+ """Wraps the data to convert to a Google Visualization API DataTable.
+
+ Create this object, populate it with data, then call one of the ToJS...
+ methods to return a string representation of the data in the format described.
+
+ You can clear all data from the object to reuse it, but you cannot clear
+ individual cells, rows, or columns. You also cannot modify the table schema
+ specified in the class constructor.
+
+ You can add new data one or more rows at a time. All data added to an
+ instantiated DataTable must conform to the schema passed in to __init__().
+
+ You can reorder the columns in the output table, and also specify row sorting
+ order by column. The default column order is according to the original
+ table_description parameter. Default row sort order is ascending, by column
+ 1 values. For a dictionary, we sort the keys for order.
+
+ The data and the table_description are closely tied, as described here:
+
+ The table schema is defined in the class constructor's table_description
+ parameter. The user defines each column using a tuple of
+ (id[, type[, label[, custom_properties]]]). The default value for type is
+ string, label is the same as ID if not specified, and custom properties is
+ an empty dictionary if not specified.
+
+ table_description is a dictionary or list, containing one or more column
+ descriptor tuples, nested dictionaries, and lists. Each dictionary key, list
+ element, or dictionary element must eventually be defined as
+ a column description tuple. Here's an example of a dictionary where the key
+ is a tuple, and the value is a list of two tuples:
+ {('a', 'number'): [('b', 'number'), ('c', 'string')]}
+
+ This flexibility in data entry enables you to build and manipulate your data
+ in a Python structure that makes sense for your program.
+
+ Add data to the table using the same nested design as the table's
+ table_description, replacing column descriptor tuples with cell data, and
+ each row is an element in the top level collection. This will be a bit
+ clearer after you look at the following examples showing the
+ table_description, matching data, and the resulting table:
+
+ Columns as list of tuples [col1, col2, col3]
+ table_description: [('a', 'number'), ('b', 'string')]
+ AppendData( [[1, 'z'], [2, 'w'], [4, 'o'], [5, 'k']] )
+ Table:
+ a b <--- these are column ids/labels
+ 1 z
+ 2 w
+ 4 o
+ 5 k
+
+ Dictionary of columns, where key is a column, and value is a list of
+ columns {col1: [col2, col3]}
+ table_description: {('a', 'number'): [('b', 'number'), ('c', 'string')]}
+  AppendData( {1: [2, 'z'], 3: [4, 'w']} )
+ Table:
+ a b c
+ 1 2 z
+ 3 4 w
+
+ Dictionary where key is a column, and the value is itself a dictionary of
+ columns {col1: {col2, col3}}
+ table_description: {('a', 'number'): {'b': 'number', 'c': 'string'}}
+  AppendData( {1: {'b': 2, 'c': 'z'}, 3: {'b': 4, 'c': 'w'}} )
+ Table:
+ a b c
+ 1 2 z
+ 3 4 w
+ """
+
+ def __init__(self, table_description, data=None, custom_properties=None):
+ """Initialize the data table from a table schema and (optionally) data.
+
+ See the class documentation for more information on table schema and data
+ values.
+
+ Args:
+ table_description: A table schema, following one of the formats described
+ in TableDescriptionParser(). Schemas describe the
+ column names, data types, and labels. See
+ TableDescriptionParser() for acceptable formats.
+ data: Optional. If given, fills the table with the given data. The data
+ structure must be consistent with schema in table_description. See
+ the class documentation for more information on acceptable data. You
+ can add data later by calling AppendData().
+ custom_properties: Optional. A dictionary from string to string that
+ goes into the table's custom properties. This can be
+ later changed by changing self.custom_properties.
+
+ Raises:
+ DataTableException: Raised if the data and the description did not match,
+ or did not use the supported formats.
+ """
+ self.__columns = self.TableDescriptionParser(table_description)
+ self.__data = []
+ self.custom_properties = {}
+ if custom_properties is not None:
+ self.custom_properties = custom_properties
+ if data:
+ self.LoadData(data)
+
+ @staticmethod
+ def CoerceValue(value, value_type):
+ """Coerces a single value into the type expected for its column.
+
+ Internal helper method.
+
+ Args:
+ value: The value which should be converted
+ value_type: One of "string", "number", "boolean", "date", "datetime" or
+ "timeofday".
+
+ Returns:
+ An item of the Python type appropriate to the given value_type. Strings
+ are also converted to Unicode using UTF-8 encoding if necessary.
+ If a tuple is given, it should be in one of the following forms:
+ - (value, formatted value)
+ - (value, formatted value, custom properties)
+ where the formatted value is a string, and custom properties is a
+ dictionary of the custom properties for this cell.
+ To specify custom properties without specifying formatted value, one can
+ pass None as the formatted value.
+ One can also have a null-valued cell with formatted value and/or custom
+ properties by specifying None for the value.
+ This method ignores the custom properties except for checking that it is a
+ dictionary. The custom properties are handled in the ToJSon and ToJSCode
+ methods.
+ The real type of the given value is not strictly checked. For example,
+      any type can be used for string (we simply take its str()) and for a
+ boolean value we just check "if value".
+ Examples:
+ CoerceValue(None, "string") returns None
+ CoerceValue((5, "5$"), "number") returns (5, "5$")
+ CoerceValue(100, "string") returns "100"
+ CoerceValue(0, "boolean") returns False
+
+ Raises:
+ DataTableException: The value and type did not match in a not-recoverable
+ way, for example given value 'abc' for type 'number'.
+ """
+ if isinstance(value, tuple):
+ # In case of a tuple, we run the same function on the value itself and
+ # add the formatted value.
+ if (len(value) not in [2, 3] or
+ (len(value) == 3 and not isinstance(value[2], dict))):
+ raise DataTableException("Wrong format for value and formatting - %s." %
+ str(value))
+ if not isinstance(value[1], types.StringTypes + (types.NoneType,)):
+ raise DataTableException("Formatted value is not string, given %s." %
+ type(value[1]))
+ js_value = DataTable.CoerceValue(value[0], value_type)
+ return (js_value,) + value[1:]
+
+ t_value = type(value)
+ if value is None:
+ return value
+ if value_type == "boolean":
+ return bool(value)
+
+ elif value_type == "number":
+ if isinstance(value, (int, long, float)):
+ return value
+ raise DataTableException("Wrong type %s when expected number" % t_value)
+
+ elif value_type == "string":
+ if isinstance(value, unicode):
+ return value
+ else:
+ return str(value).decode("utf-8")
+
+ elif value_type == "date":
+ if isinstance(value, datetime.datetime):
+ return datetime.date(value.year, value.month, value.day)
+ elif isinstance(value, datetime.date):
+ return value
+ else:
+ raise DataTableException("Wrong type %s when expected date" % t_value)
+
+ elif value_type == "timeofday":
+ if isinstance(value, datetime.datetime):
+ return datetime.time(value.hour, value.minute, value.second)
+ elif isinstance(value, datetime.time):
+ return value
+ else:
+ raise DataTableException("Wrong type %s when expected time" % t_value)
+
+ elif value_type == "datetime":
+ if isinstance(value, datetime.datetime):
+ return value
+ else:
+ raise DataTableException("Wrong type %s when expected datetime" %
+ t_value)
+ # If we got here, it means the given value_type was not one of the
+ # supported types.
+ raise DataTableException("Unsupported type %s" % value_type)
+
+ @staticmethod
+ def EscapeForJSCode(encoder, value):
+ if value is None:
+ return "null"
+ elif isinstance(value, datetime.datetime):
+ if value.microsecond == 0:
+ # If it's not ms-resolution, leave that out to save space.
+ return "new Date(%d,%d,%d,%d,%d,%d)" % (value.year,
+ value.month - 1, # To match JS
+ value.day,
+ value.hour,
+ value.minute,
+ value.second)
+ else:
+ return "new Date(%d,%d,%d,%d,%d,%d,%d)" % (value.year,
+ value.month - 1, # match JS
+ value.day,
+ value.hour,
+ value.minute,
+ value.second,
+ value.microsecond / 1000)
+ elif isinstance(value, datetime.date):
+ return "new Date(%d,%d,%d)" % (value.year, value.month - 1, value.day)
+ else:
+ return encoder.encode(value)
+
+ @staticmethod
+ def ToString(value):
+ if value is None:
+ return "(empty)"
+ elif isinstance(value, (datetime.datetime,
+ datetime.date,
+ datetime.time)):
+ return str(value)
+ elif isinstance(value, unicode):
+ return value
+ elif isinstance(value, bool):
+ return str(value).lower()
+ else:
+ return str(value).decode("utf-8")
+
+ @staticmethod
+ def ColumnTypeParser(description):
+ """Parses a single column description. Internal helper method.
+
+ Args:
+ description: a column description in the possible formats:
+ 'id'
+ ('id',)
+ ('id', 'type')
+ ('id', 'type', 'label')
+ ('id', 'type', 'label', {'custom_prop1': 'custom_val1'})
+ Returns:
+ Dictionary with the following keys: id, label, type, and
+ custom_properties where:
+ - If label not given, it equals the id.
+ - If type not given, string is used by default.
+ - If custom properties are not given, an empty dictionary is used by
+ default.
+
+ Raises:
+ DataTableException: The column description did not match the RE, or
+ unsupported type was passed.
+ """
+ if not description:
+ raise DataTableException("Description error: empty description given")
+
+ if not isinstance(description, (types.StringTypes, tuple)):
+ raise DataTableException("Description error: expected either string or "
+ "tuple, got %s." % type(description))
+
+ if isinstance(description, types.StringTypes):
+ description = (description,)
+
+ # According to the tuple's length, we fill the keys
+ # We verify everything is of type string
+ for elem in description[:3]:
+ if not isinstance(elem, types.StringTypes):
+ raise DataTableException("Description error: expected tuple of "
+ "strings, current element of type %s." %
+ type(elem))
+ desc_dict = {"id": description[0],
+ "label": description[0],
+ "type": "string",
+ "custom_properties": {}}
+ if len(description) > 1:
+ desc_dict["type"] = description[1].lower()
+ if len(description) > 2:
+ desc_dict["label"] = description[2]
+ if len(description) > 3:
+ if not isinstance(description[3], dict):
+ raise DataTableException("Description error: expected custom "
+ "properties of type dict, current element "
+ "of type %s." % type(description[3]))
+ desc_dict["custom_properties"] = description[3]
+ if len(description) > 4:
+ raise DataTableException("Description error: tuple of length > 4")
+ if desc_dict["type"] not in ["string", "number", "boolean",
+ "date", "datetime", "timeofday"]:
+ raise DataTableException(
+ "Description error: unsupported type '%s'" % desc_dict["type"])
+ return desc_dict
+
+ @staticmethod
+ def TableDescriptionParser(table_description, depth=0):
+ """Parses the table_description object for internal use.
+
+ Parses the user-submitted table description into an internal format used
+ by the Python DataTable class. Returns the flat list of parsed columns.
+
+ Args:
+ table_description: A description of the table which should comply
+ with one of the formats described below.
+ depth: Optional. The depth of the first level in the current description.
+ Used by recursive calls to this function.
+
+ Returns:
+ List of columns, where each column represented by a dictionary with the
+ keys: id, label, type, depth, container which means the following:
+ - id: the id of the column
+ - name: The name of the column
+ - type: The datatype of the elements in this column. Allowed types are
+ described in ColumnTypeParser().
+ - depth: The depth of this column in the table description
+ - container: 'dict', 'iter' or 'scalar' for parsing the format easily.
+ - custom_properties: The custom properties for this column.
+ The returned description is flattened regardless of how it was given.
+
+ Raises:
+ DataTableException: Error in a column description or in the description
+ structure.
+
+ Examples:
+ A column description can be of the following forms:
+ 'id'
+ ('id',)
+ ('id', 'type')
+ ('id', 'type', 'label')
+ ('id', 'type', 'label', {'custom_prop1': 'custom_val1'})
+ or as a dictionary:
+ 'id': 'type'
+ 'id': ('type',)
+ 'id': ('type', 'label')
+ 'id': ('type', 'label', {'custom_prop1': 'custom_val1'})
+ If the type is not specified, we treat it as string.
+ If no specific label is given, the label is simply the id.
+ If no custom properties are given, we use an empty dictionary.
+
+ input: [('a', 'date'), ('b', 'timeofday', 'b', {'foo': 'bar'})]
+ output: [{'id': 'a', 'label': 'a', 'type': 'date',
+ 'depth': 0, 'container': 'iter', 'custom_properties': {}},
+ {'id': 'b', 'label': 'b', 'type': 'timeofday',
+ 'depth': 0, 'container': 'iter',
+ 'custom_properties': {'foo': 'bar'}}]
+
+ input: {'a': [('b', 'number'), ('c', 'string', 'column c')]}
+ output: [{'id': 'a', 'label': 'a', 'type': 'string',
+ 'depth': 0, 'container': 'dict', 'custom_properties': {}},
+ {'id': 'b', 'label': 'b', 'type': 'number',
+ 'depth': 1, 'container': 'iter', 'custom_properties': {}},
+ {'id': 'c', 'label': 'column c', 'type': 'string',
+ 'depth': 1, 'container': 'iter', 'custom_properties': {}}]
+
+ input: {('a', 'number', 'column a'): { 'b': 'number', 'c': 'string'}}
+ output: [{'id': 'a', 'label': 'column a', 'type': 'number',
+ 'depth': 0, 'container': 'dict', 'custom_properties': {}},
+ {'id': 'b', 'label': 'b', 'type': 'number',
+ 'depth': 1, 'container': 'dict', 'custom_properties': {}},
+ {'id': 'c', 'label': 'c', 'type': 'string',
+ 'depth': 1, 'container': 'dict', 'custom_properties': {}}]
+
+ input: { ('w', 'string', 'word'): ('c', 'number', 'count') }
+ output: [{'id': 'w', 'label': 'word', 'type': 'string',
+ 'depth': 0, 'container': 'dict', 'custom_properties': {}},
+ {'id': 'c', 'label': 'count', 'type': 'number',
+ 'depth': 1, 'container': 'scalar', 'custom_properties': {}}]
+
+ input: {'a': ('number', 'column a'), 'b': ('string', 'column b')}
+ output: [{'id': 'a', 'label': 'column a', 'type': 'number', 'depth': 0,
+ 'container': 'dict', 'custom_properties': {}},
+ {'id': 'b', 'label': 'column b', 'type': 'string', 'depth': 0,
+ 'container': 'dict', 'custom_properties': {}}
+
+ NOTE: there might be ambiguity in the case of a dictionary representation
+ of a single column. For example, the following description can be parsed
+    in 2 different ways: {'a': ('b', 'c')} can be thought of as a single column
+ with the id 'a', of type 'b' and the label 'c', or as 2 columns: one named
+ 'a', and the other named 'b' of type 'c'. We choose the first option by
+ default, and in case the second option is the right one, it is possible to
+ make the key into a tuple (i.e. {('a',): ('b', 'c')}) or add more info
+ into the tuple, thus making it look like this: {'a': ('b', 'c', 'b', {})}
+ -- second 'b' is the label, and {} is the custom properties field.
+ """
+ # For the recursion step, we check for a scalar object (string or tuple)
+ if isinstance(table_description, (types.StringTypes, tuple)):
+ parsed_col = DataTable.ColumnTypeParser(table_description)
+ parsed_col["depth"] = depth
+ parsed_col["container"] = "scalar"
+ return [parsed_col]
+
+ # Since it is not scalar, table_description must be iterable.
+ if not hasattr(table_description, "__iter__"):
+ raise DataTableException("Expected an iterable object, got %s" %
+ type(table_description))
+ if not isinstance(table_description, dict):
+      # We expect a non-dictionary iterable item.
+ columns = []
+ for desc in table_description:
+ parsed_col = DataTable.ColumnTypeParser(desc)
+ parsed_col["depth"] = depth
+ parsed_col["container"] = "iter"
+ columns.append(parsed_col)
+ if not columns:
+ raise DataTableException("Description iterable objects should not"
+ " be empty.")
+ return columns
+ # The other case is a dictionary
+ if not table_description:
+ raise DataTableException("Empty dictionaries are not allowed inside"
+ " description")
+
+    # To differentiate between the two cases (more levels below, or this being
+    # the innermost dictionary) we consider the number of keys (more than one
+    # key indicates the innermost dictionary) and the type of the key and
+    # value in case of only 1 key (if the key is a string and the value is a
+    # tuple of 0-3 items, we assume this is the innermost dictionary).
+ # NOTE: this way of differentiating might create ambiguity. See docs.
+ if (len(table_description) != 1 or
+ (isinstance(table_description.keys()[0], types.StringTypes) and
+ isinstance(table_description.values()[0], tuple) and
+ len(table_description.values()[0]) < 4)):
+      # This is the innermost dictionary. Parse the column types.
+ columns = []
+      # We sort the items (equivalent to sorting the keys, since they are unique)
+ for key, value in sorted(table_description.items()):
+ # We parse the column type as (key, type) or (key, type, label) using
+ # ColumnTypeParser.
+ if isinstance(value, tuple):
+ parsed_col = DataTable.ColumnTypeParser((key,) + value)
+ else:
+ parsed_col = DataTable.ColumnTypeParser((key, value))
+ parsed_col["depth"] = depth
+ parsed_col["container"] = "dict"
+ columns.append(parsed_col)
+ return columns
+ # This is an outer dictionary, must have at most one key.
+ parsed_col = DataTable.ColumnTypeParser(table_description.keys()[0])
+ parsed_col["depth"] = depth
+ parsed_col["container"] = "dict"
+ return ([parsed_col] +
+ DataTable.TableDescriptionParser(table_description.values()[0],
+ depth=depth + 1))
+
+ @property
+ def columns(self):
+ """Returns the parsed table description."""
+ return self.__columns
+
+ def NumberOfRows(self):
+ """Returns the number of rows in the current data stored in the table."""
+ return len(self.__data)
+
+ def SetRowsCustomProperties(self, rows, custom_properties):
+ """Sets the custom properties for given row(s).
+
+ Can accept a single row or an iterable of rows.
+ Sets the given custom properties for all specified rows.
+
+ Args:
+ rows: The row, or rows, to set the custom properties for.
+ custom_properties: A string to string dictionary of custom properties to
+ set for all rows.
+ """
+ if not hasattr(rows, "__iter__"):
+ rows = [rows]
+ for row in rows:
+ self.__data[row] = (self.__data[row][0], custom_properties)
+
+ def LoadData(self, data, custom_properties=None):
+ """Loads new rows to the data table, clearing existing rows.
+
+ May also set the custom_properties for the added rows. The given custom
+ properties dictionary specifies the dictionary that will be used for *all*
+ given rows.
+
+ Args:
+ data: The rows that the table will contain.
+ custom_properties: A dictionary of string to string to set as the custom
+ properties for all rows.
+ """
+ self.__data = []
+ self.AppendData(data, custom_properties)
+
+ def AppendData(self, data, custom_properties=None):
+ """Appends new data to the table.
+
+ Data is appended in rows. Data must comply with
+ the table schema passed in to __init__(). See CoerceValue() for a list
+ of acceptable data types. See the class documentation for more information
+ and examples of schema and data values.
+
+ Args:
+ data: The row to add to the table. The data must conform to the table
+ description format.
+ custom_properties: A dictionary of string to string, representing the
+ custom properties to add to all the rows.
+
+ Raises:
+ DataTableException: The data structure does not match the description.
+ """
+ # If the maximal depth is 0, we simply iterate over the data table
+ # lines and insert them using _InnerAppendData. Otherwise, we simply
+ # let the _InnerAppendData handle all the levels.
+ if not self.__columns[-1]["depth"]:
+ for row in data:
+ self._InnerAppendData(({}, custom_properties), row, 0)
+ else:
+ self._InnerAppendData(({}, custom_properties), data, 0)
+
+ def _InnerAppendData(self, prev_col_values, data, col_index):
+ """Inner function to assist LoadData."""
+ # We first check that col_index has not exceeded the columns size
+ if col_index >= len(self.__columns):
+ raise DataTableException("The data does not match description, too deep")
+
+ # Dealing with the scalar case, the data is the last value.
+ if self.__columns[col_index]["container"] == "scalar":
+ prev_col_values[0][self.__columns[col_index]["id"]] = data
+ self.__data.append(prev_col_values)
+ return
+
+ if self.__columns[col_index]["container"] == "iter":
+ if not hasattr(data, "__iter__") or isinstance(data, dict):
+ raise DataTableException("Expected iterable object, got %s" %
+ type(data))
+ # We only need to insert the rest of the columns
+      # If there are fewer items than expected, we only add what is there.
+ for value in data:
+ if col_index >= len(self.__columns):
+ raise DataTableException("Too many elements given in data")
+ prev_col_values[0][self.__columns[col_index]["id"]] = value
+ col_index += 1
+ self.__data.append(prev_col_values)
+ return
+
+ # We know the current level is a dictionary, we verify the type.
+ if not isinstance(data, dict):
+ raise DataTableException("Expected dictionary at current level, got %s" %
+ type(data))
+ # We check if this is the last level
+ if self.__columns[col_index]["depth"] == self.__columns[-1]["depth"]:
+ # We need to add the keys in the dictionary as they are
+ for col in self.__columns[col_index:]:
+ if col["id"] in data:
+ prev_col_values[0][col["id"]] = data[col["id"]]
+ self.__data.append(prev_col_values)
+ return
+
+ # We have a dictionary in an inner depth level.
+ if not data.keys():
+ # In case this is an empty dictionary, we add a record with the columns
+ # filled only until this point.
+ self.__data.append(prev_col_values)
+ else:
+ for key in sorted(data):
+ col_values = dict(prev_col_values[0])
+ col_values[self.__columns[col_index]["id"]] = key
+ self._InnerAppendData((col_values, prev_col_values[1]),
+ data[key], col_index + 1)
+
+ def _PreparedData(self, order_by=()):
+ """Prepares the data for enumeration - sorting it by order_by.
+
+ Args:
+ order_by: Optional. Specifies the name of the column(s) to sort by, and
+ (optionally) which direction to sort in. Default sort direction
+ is asc. Following formats are accepted:
+ "string_col_name" -- For a single key in default (asc) order.
+ ("string_col_name", "asc|desc") -- For a single key.
+ [("col_1","asc|desc"), ("col_2","asc|desc")] -- For more than
+ one column, an array of tuples of (col_name, "asc|desc").
+
+ Returns:
+ The data sorted by the keys given.
+
+ Raises:
+ DataTableException: Sort direction not in 'asc' or 'desc'
+ """
+ if not order_by:
+ return self.__data
+
+ proper_sort_keys = []
+ if isinstance(order_by, types.StringTypes) or (
+ isinstance(order_by, tuple) and len(order_by) == 2 and
+ order_by[1].lower() in ["asc", "desc"]):
+ order_by = (order_by,)
+ for key in order_by:
+ if isinstance(key, types.StringTypes):
+ proper_sort_keys.append((key, 1))
+ elif (isinstance(key, (list, tuple)) and len(key) == 2 and
+ key[1].lower() in ("asc", "desc")):
+ proper_sort_keys.append((key[0], key[1].lower() == "asc" and 1 or -1))
+ else:
+ raise DataTableException("Expected tuple with second value: "
+ "'asc' or 'desc'")
+
+ def SortCmpFunc(row1, row2):
+ """cmp function for sorted. Compares by keys and 'asc'/'desc' keywords."""
+ for key, asc_mult in proper_sort_keys:
+ cmp_result = asc_mult * cmp(row1[0].get(key), row2[0].get(key))
+ if cmp_result:
+ return cmp_result
+ return 0
+
+ return sorted(self.__data, cmp=SortCmpFunc)
+
+ def ToJSCode(self, name, columns_order=None, order_by=()):
+ """Writes the data table as a JS code string.
+
+ This method writes a string of JS code that can be run to
+ generate a DataTable with the specified data. Typically used for debugging
+ only.
+
+ Args:
+ name: The name of the table. The name would be used as the DataTable's
+ variable name in the created JS code.
+ columns_order: Optional. Specifies the order of columns in the
+ output table. Specify a list of all column IDs in the order
+ in which you want the table created.
+ Note that you must list all column IDs in this parameter,
+ if you use it.
+ order_by: Optional. Specifies the name of the column(s) to sort by.
+ Passed as is to _PreparedData.
+
+ Returns:
+ A string of JS code that, when run, generates a DataTable with the given
+ name and the data stored in the DataTable object.
+ Example result:
+ "var tab1 = new google.visualization.DataTable();
+ tab1.addColumn("string", "a", "a");
+ tab1.addColumn("number", "b", "b");
+ tab1.addColumn("boolean", "c", "c");
+ tab1.addRows(10);
+ tab1.setCell(0, 0, "a");
+ tab1.setCell(0, 1, 1, null, {"foo": "bar"});
+ tab1.setCell(0, 2, true);
+ ...
+ tab1.setCell(9, 0, "c");
+ tab1.setCell(9, 1, 3, "3$");
+ tab1.setCell(9, 2, false);"
+
+ Raises:
+ DataTableException: The data does not match the type.
+ """
+
+ encoder = DataTableJSONEncoder()
+
+ if columns_order is None:
+ columns_order = [col["id"] for col in self.__columns]
+ col_dict = dict([(col["id"], col) for col in self.__columns])
+
+ # We first create the table with the given name
+ jscode = "var %s = new google.visualization.DataTable();\n" % name
+ if self.custom_properties:
+ jscode += "%s.setTableProperties(%s);\n" % (
+ name, encoder.encode(self.custom_properties))
+
+ # We add the columns to the table
+ for i, col in enumerate(columns_order):
+ jscode += "%s.addColumn(%s, %s, %s);\n" % (
+ name,
+ encoder.encode(col_dict[col]["type"]),
+ encoder.encode(col_dict[col]["label"]),
+ encoder.encode(col_dict[col]["id"]))
+ if col_dict[col]["custom_properties"]:
+ jscode += "%s.setColumnProperties(%d, %s);\n" % (
+ name, i, encoder.encode(col_dict[col]["custom_properties"]))
+ jscode += "%s.addRows(%d);\n" % (name, len(self.__data))
+
+ # We now go over the data and add each row
+ for (i, (row, cp)) in enumerate(self._PreparedData(order_by)):
+ # We add all the elements of this row by their order
+ for (j, col) in enumerate(columns_order):
+ if col not in row or row[col] is None:
+ continue
+ value = self.CoerceValue(row[col], col_dict[col]["type"])
+ if isinstance(value, tuple):
+ cell_cp = ""
+ if len(value) == 3:
+ cell_cp = ", %s" % encoder.encode(row[col][2])
+ # We have a formatted value or custom property as well
+ jscode += ("%s.setCell(%d, %d, %s, %s%s);\n" %
+ (name, i, j,
+ self.EscapeForJSCode(encoder, value[0]),
+ self.EscapeForJSCode(encoder, value[1]), cell_cp))
+ else:
+ jscode += "%s.setCell(%d, %d, %s);\n" % (
+ name, i, j, self.EscapeForJSCode(encoder, value))
+ if cp:
+ jscode += "%s.setRowProperties(%d, %s);\n" % (
+ name, i, encoder.encode(cp))
+ return jscode
+
+ def ToHtml(self, columns_order=None, order_by=()):
+ """Writes the data table as an HTML table code string.
+
+ Args:
+ columns_order: Optional. Specifies the order of columns in the
+ output table. Specify a list of all column IDs in the order
+ in which you want the table created.
+ Note that you must list all column IDs in this parameter,
+ if you use it.
+ order_by: Optional. Specifies the name of the column(s) to sort by.
+ Passed as is to _PreparedData.
+
+ Returns:
+ An HTML table code string.
+ Example result (the result is without the newlines):
+ <html><body><table border="1">
+ <thead><tr><th>a</th><th>b</th><th>c</th></tr></thead>
+ <tbody>
+ <tr><td>1</td><td>"z"</td><td>2</td></tr>
+ <tr><td>"3$"</td><td>"w"</td><td></td></tr>
+ </tbody>
+ </table></body></html>
+
+ Raises:
+ DataTableException: The data does not match the type.
+ """
+ table_template = "<html><body><table border=\"1\">%s</table></body></html>"
+ columns_template = "<thead><tr>%s</tr></thead>"
+ rows_template = "<tbody>%s</tbody>"
+ row_template = "<tr>%s</tr>"
+ header_cell_template = "<th>%s</th>"
+ cell_template = "<td>%s</td>"
+
+ if columns_order is None:
+ columns_order = [col["id"] for col in self.__columns]
+ col_dict = dict([(col["id"], col) for col in self.__columns])
+
+ columns_list = []
+ for col in columns_order:
+ columns_list.append(header_cell_template %
+ cgi.escape(col_dict[col]["label"]))
+ columns_html = columns_template % "".join(columns_list)
+
+ rows_list = []
+ # We now go over the data and add each row
+ for row, unused_cp in self._PreparedData(order_by):
+ cells_list = []
+ # We add all the elements of this row by their order
+ for col in columns_order:
+ # For empty string we want empty quotes ("").
+ value = ""
+ if col in row and row[col] is not None:
+ value = self.CoerceValue(row[col], col_dict[col]["type"])
+ if isinstance(value, tuple):
+ # We have a formatted value and we're going to use it
+ cells_list.append(cell_template % cgi.escape(self.ToString(value[1])))
+ else:
+ cells_list.append(cell_template % cgi.escape(self.ToString(value)))
+ rows_list.append(row_template % "".join(cells_list))
+ rows_html = rows_template % "".join(rows_list)
+
+ return table_template % (columns_html + rows_html)
+
+ def ToCsv(self, columns_order=None, order_by=(), separator=","):
+ """Writes the data table as a CSV string.
+
+ Output is encoded in UTF-8 because the Python "csv" module can't handle
+ Unicode properly according to its documentation.
+
+ Args:
+ columns_order: Optional. Specifies the order of columns in the
+ output table. Specify a list of all column IDs in the order
+ in which you want the table created.
+ Note that you must list all column IDs in this parameter,
+ if you use it.
+ order_by: Optional. Specifies the name of the column(s) to sort by.
+ Passed as is to _PreparedData.
+ separator: Optional. The separator to use between the values.
+
+ Returns:
+ A CSV string representing the table.
+ Example result:
+ 'a','b','c'
+ 1,'z',2
+ 3,'w',''
+
+ Raises:
+ DataTableException: The data does not match the type.
+ """
+
+ csv_buffer = cStringIO.StringIO()
+ writer = csv.writer(csv_buffer, delimiter=separator)
+
+ if columns_order is None:
+ columns_order = [col["id"] for col in self.__columns]
+ col_dict = dict([(col["id"], col) for col in self.__columns])
+
+ writer.writerow([col_dict[col]["label"].encode("utf-8")
+ for col in columns_order])
+
+ # We now go over the data and add each row
+ for row, unused_cp in self._PreparedData(order_by):
+ cells_list = []
+ # We add all the elements of this row by their order
+ for col in columns_order:
+ value = ""
+ if col in row and row[col] is not None:
+ value = self.CoerceValue(row[col], col_dict[col]["type"])
+ if isinstance(value, tuple):
+ # We have a formatted value. Using it only for date/time types.
+ if col_dict[col]["type"] in ["date", "datetime", "timeofday"]:
+ cells_list.append(self.ToString(value[1]).encode("utf-8"))
+ else:
+ cells_list.append(self.ToString(value[0]).encode("utf-8"))
+ else:
+ cells_list.append(self.ToString(value).encode("utf-8"))
+ writer.writerow(cells_list)
+ return csv_buffer.getvalue()
+
+ def ToTsvExcel(self, columns_order=None, order_by=()):
+ """Returns a file in tab-separated-format readable by MS Excel.
+
+ Returns a file in UTF-16 little endian encoding, with tabs separating the
+ values.
+
+ Args:
+ columns_order: Delegated to ToCsv.
+ order_by: Delegated to ToCsv.
+
+ Returns:
+ A tab-separated little endian UTF16 file representing the table.
+ """
+ return (self.ToCsv(columns_order, order_by, separator="\t")
+ .decode("utf-8").encode("UTF-16LE"))
+
+ def _ToJSonObj(self, columns_order=None, order_by=()):
+ """Returns an object suitable to be converted to JSON.
+
+ Args:
+ columns_order: Optional. A list of all column IDs in the order in which
+ you want them created in the output table. If specified,
+ all column IDs must be present.
+ order_by: Optional. Specifies the name of the column(s) to sort by.
+ Passed as is to _PreparedData().
+
+ Returns:
+ A dictionary object for use by ToJSon or ToJSonResponse.
+ """
+ if columns_order is None:
+ columns_order = [col["id"] for col in self.__columns]
+ col_dict = dict([(col["id"], col) for col in self.__columns])
+
+ # Creating the column JSON objects
+ col_objs = []
+ for col_id in columns_order:
+ col_obj = {"id": col_dict[col_id]["id"],
+ "label": col_dict[col_id]["label"],
+ "type": col_dict[col_id]["type"]}
+ if col_dict[col_id]["custom_properties"]:
+ col_obj["p"] = col_dict[col_id]["custom_properties"]
+ col_objs.append(col_obj)
+
+ # Creating the rows jsons
+ row_objs = []
+ for row, cp in self._PreparedData(order_by):
+ cell_objs = []
+ for col in columns_order:
+ value = self.CoerceValue(row.get(col, None), col_dict[col]["type"])
+ if value is None:
+ cell_obj = None
+ elif isinstance(value, tuple):
+ cell_obj = {"v": value[0]}
+ if len(value) > 1 and value[1] is not None:
+ cell_obj["f"] = value[1]
+ if len(value) == 3:
+ cell_obj["p"] = value[2]
+ else:
+ cell_obj = {"v": value}
+ cell_objs.append(cell_obj)
+ row_obj = {"c": cell_objs}
+ if cp:
+ row_obj["p"] = cp
+ row_objs.append(row_obj)
+
+ json_obj = {"cols": col_objs, "rows": row_objs}
+ if self.custom_properties:
+ json_obj["p"] = self.custom_properties
+
+ return json_obj
+
+ def ToJSon(self, columns_order=None, order_by=()):
+ """Returns a string that can be used in a JS DataTable constructor.
+
+ This method writes a JSON string that can be passed directly into a Google
+ Visualization API DataTable constructor. Use this output if you are
+ hosting the visualization HTML on your site, and want to code the data
+ table in Python. Pass this string into the
+ google.visualization.DataTable constructor, e.g,:
+ ... on my page that hosts my visualization ...
+ google.setOnLoadCallback(drawTable);
+ function drawTable() {
+ var data = new google.visualization.DataTable(_my_JSon_string, 0.6);
+ myTable.draw(data);
+ }
+
+ Args:
+ columns_order: Optional. Specifies the order of columns in the
+ output table. Specify a list of all column IDs in the order
+ in which you want the table created.
+ Note that you must list all column IDs in this parameter,
+ if you use it.
+ order_by: Optional. Specifies the name of the column(s) to sort by.
+ Passed as is to _PreparedData().
+
+ Returns:
+ A JSon constructor string to generate a JS DataTable with the data
+ stored in the DataTable object.
+ Example result (the result is without the newlines):
+ {cols: [{id:"a",label:"a",type:"number"},
+ {id:"b",label:"b",type:"string"},
+ {id:"c",label:"c",type:"number"}],
+      rows: [{c:[{v:1},{v:"z"},{v:2}]}, {c:[{v:3,f:"3$"},{v:"w"},{v:null}]}],
+ p: {'foo': 'bar'}}
+
+ Raises:
+ DataTableException: The data does not match the type.
+ """
+
+ encoder = DataTableJSONEncoder()
+ return encoder.encode(
+ self._ToJSonObj(columns_order, order_by)).encode("utf-8")
+
+ def ToJSonResponse(self, columns_order=None, order_by=(), req_id=0,
+ response_handler="google.visualization.Query.setResponse"):
+ """Writes a table as a JSON response that can be returned as-is to a client.
+
+ This method writes a JSON response to return to a client in response to a
+ Google Visualization API query. This string can be processed by the calling
+ page, and is used to deliver a data table to a visualization hosted on
+ a different page.
+
+ Args:
+ columns_order: Optional. Passed straight to self.ToJSon().
+ order_by: Optional. Passed straight to self.ToJSon().
+ req_id: Optional. The response id, as retrieved by the request.
+ response_handler: Optional. The response handler, as retrieved by the
+ request.
+
+ Returns:
+      A JSON response string to be received by the JS visualization Query
+ object. This response would be translated into a DataTable on the
+ client side.
+ Example result (newlines added for readability):
+ google.visualization.Query.setResponse({
+ 'version':'0.6', 'reqId':'0', 'status':'OK',
+ 'table': {cols: [...], rows: [...]}});
+
+ Note: The URL returning this string can be used as a data source by Google
+ Visualization Gadgets or from JS code.
+ """
+
+ response_obj = {
+ "version": "0.6",
+ "reqId": str(req_id),
+ "table": self._ToJSonObj(columns_order, order_by),
+ "status": "ok"
+ }
+ encoder = DataTableJSONEncoder()
+ return "%s(%s);" % (response_handler,
+ encoder.encode(response_obj).encode("utf-8"))
+
+ def ToResponse(self, columns_order=None, order_by=(), tqx=""):
+ """Writes the right response according to the request string passed in tqx.
+
+ This method parses the tqx request string (format of which is defined in
+ the documentation for implementing a data source of Google Visualization),
+ and returns the right response according to the request.
+ It parses out the "out" parameter of tqx, calls the relevant response
+ (ToJSonResponse() for "json", ToCsv() for "csv", ToHtml() for "html",
+ ToTsvExcel() for "tsv-excel") and passes the response function the rest of
+ the relevant request keys.
+
+ Args:
+ columns_order: Optional. Passed as is to the relevant response function.
+ order_by: Optional. Passed as is to the relevant response function.
+ tqx: Optional. The request string as received by HTTP GET. Should be in
+ the format "key1:value1;key2:value2...". All keys have a default
+ value, so an empty string will just do the default (which is calling
+ ToJSonResponse() with no extra parameters).
+
+ Returns:
+ A response string, as returned by the relevant response function.
+
+ Raises:
+ DataTableException: One of the parameters passed in tqx is not supported.
+ """
+ tqx_dict = {}
+ if tqx:
+ tqx_dict = dict(opt.split(":") for opt in tqx.split(";"))
+ if tqx_dict.get("version", "0.6") != "0.6":
+ raise DataTableException(
+ "Version (%s) passed by request is not supported."
+ % tqx_dict["version"])
+
+ if tqx_dict.get("out", "json") == "json":
+ response_handler = tqx_dict.get("responseHandler",
+ "google.visualization.Query.setResponse")
+ return self.ToJSonResponse(columns_order, order_by,
+ req_id=tqx_dict.get("reqId", 0),
+ response_handler=response_handler)
+ elif tqx_dict["out"] == "html":
+ return self.ToHtml(columns_order, order_by)
+ elif tqx_dict["out"] == "csv":
+ return self.ToCsv(columns_order, order_by)
+ elif tqx_dict["out"] == "tsv-excel":
+ return self.ToTsvExcel(columns_order, order_by)
+ else:
+ raise DataTableException(
+ "'out' parameter: '%s' is not supported" % tqx_dict["out"])
diff --git a/third_party/aom/test/hash_test.cc b/third_party/aom/test/hash_test.cc
new file mode 100644
index 000000000..e9f7f63c9
--- /dev/null
+++ b/third_party/aom/test/hash_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/hash.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef uint32_t (*get_crc32c_value_func)(void *calculator, uint8_t *p,
+ int length);
+
+typedef ::testing::tuple<get_crc32c_value_func, int> HashParam;
+
+class AV1Crc32cHashTest : public ::testing::TestWithParam<HashParam> {
+ public:
+ ~AV1Crc32cHashTest();
+ void SetUp();
+
+ void TearDown();
+
+ protected:
+ void RunCheckOutput(get_crc32c_value_func test_impl);
+ void RunSpeedTest(get_crc32c_value_func test_impl);
+
+ void RunZeroTest(get_crc32c_value_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+ CRC32C calc_;
+ uint8_t *buffer_;
+ int bsize_;
+ int length_;
+};
+
+AV1Crc32cHashTest::~AV1Crc32cHashTest() {}
+
+void AV1Crc32cHashTest::SetUp() {
+ rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+ av1_crc32c_calculator_init(&calc_);
+
+ bsize_ = GET_PARAM(1);
+ length_ = bsize_ * bsize_ * sizeof(uint16_t);
+ buffer_ = new uint8_t[length_];
+ ASSERT_TRUE(buffer_ != NULL);
+ for (int i = 0; i < length_; ++i) {
+ buffer_[i] = rnd_.Rand8();
+ }
+}
+
+void AV1Crc32cHashTest::TearDown() { delete[] buffer_; }
+
+void AV1Crc32cHashTest::RunCheckOutput(get_crc32c_value_func test_impl) {
+ get_crc32c_value_func ref_impl = av1_get_crc32c_value_c;
+ // For the same buffer, the CRC should be the same.
+ uint32_t crc0 = test_impl(&calc_, buffer_, length_);
+ uint32_t crc1 = test_impl(&calc_, buffer_, length_);
+ uint32_t crc2 = ref_impl(&calc_, buffer_, length_);
+ ASSERT_EQ(crc0, crc1);
+ ASSERT_EQ(crc0, crc2); // should match the C reference implementation
+ // modify buffer
+ buffer_[0] += 1;
+ uint32_t crc3 = test_impl(&calc_, buffer_, length_);
+ uint32_t crc4 = ref_impl(&calc_, buffer_, length_);
+ ASSERT_NE(crc0, crc3); // CRC should not equal the previous one
+ ASSERT_EQ(crc3, crc4);
+}
+
+void AV1Crc32cHashTest::RunSpeedTest(get_crc32c_value_func test_impl) {
+ get_crc32c_value_func impls[] = { av1_get_crc32c_value_c, test_impl };
+ const int repeat = 10000000 / (bsize_ + bsize_);
+
+ aom_usec_timer timer;
+ double time[2];
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer_start(&timer);
+ for (int j = 0; j < repeat; ++j) {
+ impls[i](&calc_, buffer_, length_);
+ }
+ aom_usec_timer_mark(&timer);
+ time[i] = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ }
+ printf("hash %3dx%-3d:%7.2f/%7.2fus", bsize_, bsize_, time[0], time[1]);
+ printf("(%3.2f)\n", time[0] / time[1]);
+}
+
+void AV1Crc32cHashTest::RunZeroTest(get_crc32c_value_func test_impl) {
+ uint8_t buffer0[1024] = { 0 };
+ // For buffers of different sizes, the CRCs should not be the same.
+ const uint32_t crc0 = test_impl(&calc_, buffer0, 32);
+ const uint32_t crc1 = test_impl(&calc_, buffer0, 128);
+ const uint32_t crc2 = test_impl(&calc_, buffer0, 1024);
+ ASSERT_NE(crc0, crc1);
+ ASSERT_NE(crc0, crc2);
+ ASSERT_NE(crc1, crc2);
+}
+
+TEST_P(AV1Crc32cHashTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+
+TEST_P(AV1Crc32cHashTest, CheckZero) { RunZeroTest(GET_PARAM(0)); }
+
+TEST_P(AV1Crc32cHashTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+const int kValidBlockSize[] = { 64, 32, 8, 4 };
+
+INSTANTIATE_TEST_CASE_P(
+ C, AV1Crc32cHashTest,
+ ::testing::Combine(::testing::Values(&av1_get_crc32c_value_c),
+ ::testing::ValuesIn(kValidBlockSize)));
+
+#if HAVE_SSE4_2
+INSTANTIATE_TEST_CASE_P(
+ SSE4_2, AV1Crc32cHashTest,
+ ::testing::Combine(::testing::Values(&av1_get_crc32c_value_sse4_2),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/hbd_metrics_test.cc b/third_party/aom/test/hbd_metrics_test.cc
new file mode 100644
index 000000000..09df9bde4
--- /dev/null
+++ b/third_party/aom/test/hbd_metrics_test.cc
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <new>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/ssim.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/msvc.h"
+#include "aom_scale/yv12config.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+typedef double (*LBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest);
+typedef double (*HBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+ uint32_t bd);
+
+double compute_hbd_psnr(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+ uint32_t bd) {
+ PSNR_STATS psnr;
+ aom_calc_highbd_psnr(source, dest, &psnr, bd, in_bd);
+ return psnr.psnr[0];
+}
+
+double compute_psnr(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest) {
+ PSNR_STATS psnr;
+ aom_calc_psnr(source, dest, &psnr);
+ return psnr.psnr[0];
+}
+
+double compute_hbd_psnrhvs(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+ uint32_t bd) {
+ double tempy, tempu, tempv;
+ return aom_psnrhvs(source, dest, &tempy, &tempu, &tempv, bd, in_bd);
+}
+
+double compute_psnrhvs(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest) {
+ double tempy, tempu, tempv;
+ return aom_psnrhvs(source, dest, &tempy, &tempu, &tempv, 8, 8);
+}
+
+double compute_hbd_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+ uint32_t bd) {
+ double tempy, tempu, tempv;
+ return aom_calc_fastssim(source, dest, &tempy, &tempu, &tempv, bd, in_bd);
+}
+
+double compute_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest) {
+ double tempy, tempu, tempv;
+ return aom_calc_fastssim(source, dest, &tempy, &tempu, &tempv, 8, 8);
+}
+
+double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+ uint32_t bd) {
+ double ssim, weight;
+ ssim = aom_highbd_calc_ssim(source, dest, &weight, bd, in_bd);
+ return 100 * pow(ssim / weight, 8.0);
+}
+
+double compute_aomssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest) {
+ double ssim, weight;
+ ssim = aom_calc_ssim(source, dest, &weight);
+ return 100 * pow(ssim / weight, 8.0);
+}
+
+class HBDMetricsTestBase {
+ public:
+ virtual ~HBDMetricsTestBase() {}
+
+ protected:
+ void RunAccuracyCheck() {
+ const int width = 1920;
+ const int height = 1080;
+ size_t i = 0;
+ const uint8_t kPixFiller = 128;
+ YV12_BUFFER_CONFIG lbd_src, lbd_dst;
+ YV12_BUFFER_CONFIG hbd_src, hbd_dst;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ double lbd_db, hbd_db;
+
+ memset(&lbd_src, 0, sizeof(lbd_src));
+ memset(&lbd_dst, 0, sizeof(lbd_dst));
+ memset(&hbd_src, 0, sizeof(hbd_src));
+ memset(&hbd_dst, 0, sizeof(hbd_dst));
+
+ aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16);
+ aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16);
+ aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16);
+ aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16);
+
+ memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz);
+ while (i < lbd_src.buffer_alloc_sz) {
+ uint16_t spel, dpel;
+ spel = lbd_src.buffer_alloc[i];
+ // Create some distortion for dst buffer.
+ dpel = rnd.Rand8();
+ lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+ ((uint16_t *)(hbd_src.buffer_alloc))[i] = spel << (bit_depth_ - 8);
+ ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+ i++;
+ }
+
+ lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+ hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+ EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+ i = 0;
+ while (i < lbd_src.buffer_alloc_sz) {
+ uint16_t dpel;
+ // Create some small distortion for dst buffer.
+ dpel = 120 + (rnd.Rand8() >> 4);
+ lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+ ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+ i++;
+ }
+
+ lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+ hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+ EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+ i = 0;
+ while (i < lbd_src.buffer_alloc_sz) {
+ uint16_t dpel;
+ // Create some small distortion for dst buffer.
+ dpel = 126 + (rnd.Rand8() >> 6);
+ lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+ ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+ i++;
+ }
+
+ lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+ hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+ EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+ aom_free_frame_buffer(&lbd_src);
+ aom_free_frame_buffer(&lbd_dst);
+ aom_free_frame_buffer(&hbd_src);
+ aom_free_frame_buffer(&hbd_dst);
+ }
+
+ int input_bit_depth_;
+ int bit_depth_;
+ double threshold_;
+ LBDMetricFunc lbd_metric_;
+ HBDMetricFunc hbd_metric_;
+};
+
+typedef ::testing::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
+ MetricTestTParam;
+class HBDMetricsTest : public HBDMetricsTestBase,
+ public ::testing::TestWithParam<MetricTestTParam> {
+ public:
+ virtual void SetUp() {
+ lbd_metric_ = GET_PARAM(0);
+ hbd_metric_ = GET_PARAM(1);
+ input_bit_depth_ = GET_PARAM(2);
+ bit_depth_ = GET_PARAM(3);
+ threshold_ = GET_PARAM(4);
+ }
+ virtual void TearDown() {}
+};
+
+TEST_P(HBDMetricsTest, RunAccuracyCheck) { RunAccuracyCheck(); }
+
+// Allow small variation due to floating point operations.
+static const double kSsim_thresh = 0.001;
+// Allow some additional errors accumulated in floating point operations.
+static const double kFSsim_thresh = 0.03;
+// Allow some extra variation due to rounding error accumulated in dct.
+static const double kPhvs_thresh = 0.3;
+
+INSTANTIATE_TEST_CASE_P(
+ AOMSSIM, HBDMetricsTest,
+ ::testing::Values(MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
+ 8, 10, kSsim_thresh),
+ MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
+ 10, 10, kPhvs_thresh),
+ MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
+ 8, 12, kSsim_thresh),
+ MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim,
+ 12, 12, kPhvs_thresh)));
+INSTANTIATE_TEST_CASE_P(
+ FASTSSIM, HBDMetricsTest,
+ ::testing::Values(MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
+ 8, 10, kFSsim_thresh),
+ MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
+ 10, 10, kFSsim_thresh),
+ MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
+ 8, 12, kFSsim_thresh),
+ MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim,
+ 12, 12, kFSsim_thresh)));
+INSTANTIATE_TEST_CASE_P(
+ PSNRHVS, HBDMetricsTest,
+ ::testing::Values(MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
+ 8, 10, kPhvs_thresh),
+ MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
+ 10, 10, kPhvs_thresh),
+ MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
+ 8, 12, kPhvs_thresh),
+ MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs,
+ 12, 12, kPhvs_thresh)));
+INSTANTIATE_TEST_CASE_P(
+ PSNR, HBDMetricsTest,
+ ::testing::Values(
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 10, kPhvs_thresh),
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 10, 10,
+ kPhvs_thresh),
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 12, kPhvs_thresh),
+ MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 12, 12,
+ kPhvs_thresh)));
+} // namespace
diff --git a/third_party/aom/test/hiprec_convolve_test.cc b/third_party/aom/test/hiprec_convolve_test.cc
new file mode 100644
index 000000000..f94a0730c
--- /dev/null
+++ b/third_party/aom/test/hiprec_convolve_test.cc
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/hiprec_convolve_test_util.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest;
+using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest;
+
+namespace {
+
+TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); }
+TEST_P(AV1HiprecConvolveTest, DISABLED_SpeedTest) {
+ RunSpeedTest(GET_PARAM(3));
+}
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HiprecConvolveTest,
+ libaom_test::AV1HiprecConvolve::BuildParams(
+ av1_wiener_convolve_add_src_sse2));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HiprecConvolveTest,
+ libaom_test::AV1HiprecConvolve::BuildParams(
+ av1_wiener_convolve_add_src_avx2));
+#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AV1HiprecConvolveTest,
+ libaom_test::AV1HiprecConvolve::BuildParams(
+ av1_wiener_convolve_add_src_neon));
+#endif
+
+#if HAVE_SSSE3 || HAVE_AVX2
+TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(4));
+}
+TEST_P(AV1HighbdHiprecConvolveTest, DISABLED_SpeedTest) {
+ RunSpeedTest(GET_PARAM(4));
+}
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdHiprecConvolveTest,
+ libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+ av1_highbd_wiener_convolve_add_src_ssse3));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdHiprecConvolveTest,
+ libaom_test::AV1HighbdHiprecConvolve::BuildParams(
+ av1_highbd_wiener_convolve_add_src_avx2));
+#endif
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/hiprec_convolve_test_util.cc b/third_party/aom/test/hiprec_convolve_test_util.cc
new file mode 100644
index 000000000..2672bcec3
--- /dev/null
+++ b/third_party/aom/test/hiprec_convolve_test_util.cc
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/hiprec_convolve_test_util.h"
+
+#include "av1/common/restoration.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+
+namespace libaom_test {
+
+// Generate a random pair of filter kernels, using the ranges
+// of possible values from the loop-restoration experiment
+static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
+ InterpKernel vkernel) {
+ hkernel[0] = hkernel[6] =
+ WIENER_FILT_TAP0_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
+ hkernel[1] = hkernel[5] =
+ WIENER_FILT_TAP1_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
+ hkernel[2] = hkernel[4] =
+ WIENER_FILT_TAP2_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
+ hkernel[3] = -(hkernel[0] + hkernel[1] + hkernel[2]);
+ hkernel[7] = 0;
+
+ vkernel[0] = vkernel[6] =
+ WIENER_FILT_TAP0_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
+ vkernel[1] = vkernel[5] =
+ WIENER_FILT_TAP1_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
+ vkernel[2] = vkernel[4] =
+ WIENER_FILT_TAP2_MINV +
+ rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
+ vkernel[3] = -(vkernel[0] + vkernel[1] + vkernel[2]);
+ vkernel[7] = 0;
+}
+
+namespace AV1HiprecConvolve {
+
+::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
+ hiprec_convolve_func filter) {
+ const HiprecConvolveParam params[] = {
+ make_tuple(8, 8, 50000, filter), make_tuple(8, 4, 50000, filter),
+ make_tuple(64, 24, 1000, filter), make_tuple(64, 64, 1000, filter),
+ make_tuple(64, 56, 1000, filter), make_tuple(32, 8, 10000, filter),
+ make_tuple(32, 28, 10000, filter), make_tuple(32, 32, 10000, filter),
+ make_tuple(16, 34, 10000, filter), make_tuple(32, 34, 10000, filter),
+ make_tuple(64, 34, 1000, filter), make_tuple(8, 17, 10000, filter),
+ make_tuple(16, 17, 10000, filter), make_tuple(32, 17, 10000, filter)
+ };
+ return ::testing::ValuesIn(params);
+}
+
+AV1HiprecConvolveTest::~AV1HiprecConvolveTest() {}
+void AV1HiprecConvolveTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HiprecConvolveTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
+ const int w = 128, h = 128;
+ const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+ const int num_iters = GET_PARAM(2);
+ int i, j;
+ const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+ uint8_t *input_ = new uint8_t[h * w];
+ uint8_t *input = input_;
+
+ // The AVX2 convolve functions always write rows with widths that are
+ // multiples of 16. So to avoid a buffer overflow, we may need to pad
+ // rows to a multiple of 16.
+ int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+ uint8_t *output = new uint8_t[output_n];
+ uint8_t *output2 = new uint8_t[output_n];
+
+ // Generate random filter kernels
+ DECLARE_ALIGNED(16, InterpKernel, hkernel);
+ DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+ generate_kernels(&rnd_, hkernel, vkernel);
+
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+
+ for (i = 0; i < num_iters; ++i) {
+ // Choose random locations within the source block
+ int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+ av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w, output,
+ out_w, hkernel, 16, vkernel, 16, out_w, out_h,
+ &conv_params);
+ test_impl(input + offset_r * w + offset_c, w, output2, out_w, hkernel, 16,
+ vkernel, 16, out_w, out_h, &conv_params);
+
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+ << (j / out_w) << ") on iteration " << i;
+ }
+ delete[] input_;
+ delete[] output;
+ delete[] output2;
+}
+
+void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
+ const int w = 128, h = 128;
+ const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+ const int num_iters = GET_PARAM(2) / 500;
+ int i, j, k;
+ const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+ uint8_t *input_ = new uint8_t[h * w];
+ uint8_t *input = input_;
+
+ // The AVX2 convolve functions always write rows with widths that are
+ // multiples of 16. So to avoid a buffer overflow, we may need to pad
+ // rows to a multiple of 16.
+ int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+ uint8_t *output = new uint8_t[output_n];
+ uint8_t *output2 = new uint8_t[output_n];
+
+ // Generate random filter kernels
+ DECLARE_ALIGNED(16, InterpKernel, hkernel);
+ DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+ generate_kernels(&rnd_, hkernel, vkernel);
+
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (i = 0; i < num_iters; ++i) {
+ for (j = 3; j < h - out_h - 4; j++) {
+ for (k = 3; k < w - out_w - 4; k++) {
+ av1_wiener_convolve_add_src_c(input + j * w + k, w, output, out_w,
+ hkernel, 16, vkernel, 16, out_w, out_h,
+ &conv_params);
+ }
+ }
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
+ for (i = 0; i < num_iters; ++i) {
+ for (j = 3; j < h - out_h - 4; j++) {
+ for (k = 3; k < w - out_w - 4; k++) {
+ test_impl(input + j * w + k, w, output2, out_w, hkernel, 16, vkernel,
+ 16, out_w, out_h, &conv_params);
+ }
+ }
+ }
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+ << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
+
+ delete[] input_;
+ delete[] output;
+ delete[] output2;
+}
+} // namespace AV1HiprecConvolve
+
+namespace AV1HighbdHiprecConvolve {
+
+::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
+ highbd_hiprec_convolve_func filter) {
+ const HighbdHiprecConvolveParam params[] = {
+ make_tuple(8, 8, 50000, 8, filter), make_tuple(64, 64, 1000, 8, filter),
+ make_tuple(32, 8, 10000, 8, filter), make_tuple(8, 8, 50000, 10, filter),
+ make_tuple(64, 64, 1000, 10, filter), make_tuple(32, 8, 10000, 10, filter),
+ make_tuple(8, 8, 50000, 12, filter), make_tuple(64, 64, 1000, 12, filter),
+ make_tuple(32, 8, 10000, 12, filter),
+ };
+ return ::testing::ValuesIn(params);
+}
+
+AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() {}
+void AV1HighbdHiprecConvolveTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HighbdHiprecConvolveTest::TearDown() {
+ libaom_test::ClearSystemState();
+}
+
+void AV1HighbdHiprecConvolveTest::RunCheckOutput(
+ highbd_hiprec_convolve_func test_impl) {
+ const int w = 128, h = 128;
+ const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+ const int num_iters = GET_PARAM(2);
+ const int bd = GET_PARAM(3);
+ int i, j;
+ const ConvolveParams conv_params = get_conv_params_wiener(bd);
+
+ uint16_t *input = new uint16_t[h * w];
+
+ // The AVX2 convolve functions always write rows with widths that are
+ // multiples of 16. So to avoid a buffer overflow, we may need to pad
+ // rows to a multiple of 16.
+ int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+ uint16_t *output = new uint16_t[output_n];
+ uint16_t *output2 = new uint16_t[output_n];
+
+ // Generate random filter kernels
+ DECLARE_ALIGNED(16, InterpKernel, hkernel);
+ DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+ generate_kernels(&rnd_, hkernel, vkernel);
+
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+
+ uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input);
+ uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output);
+ uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2);
+
+ for (i = 0; i < num_iters; ++i) {
+ // Choose random locations within the source block
+ int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+ int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+ av1_highbd_wiener_convolve_add_src_c(
+ input_ptr + offset_r * w + offset_c, w, output_ptr, out_w, hkernel, 16,
+ vkernel, 16, out_w, out_h, &conv_params, bd);
+ test_impl(input_ptr + offset_r * w + offset_c, w, output2_ptr, out_w,
+ hkernel, 16, vkernel, 16, out_w, out_h, &conv_params, bd);
+
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+ << (j / out_w) << ") on iteration " << i;
+ }
+ delete[] input;
+ delete[] output;
+ delete[] output2;
+}
+
+void AV1HighbdHiprecConvolveTest::RunSpeedTest(
+ highbd_hiprec_convolve_func test_impl) {
+ const int w = 128, h = 128;
+ const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+ const int num_iters = GET_PARAM(2) / 500;
+ const int bd = GET_PARAM(3);
+ int i, j, k;
+ const ConvolveParams conv_params = get_conv_params_wiener(bd);
+
+ uint16_t *input = new uint16_t[h * w];
+
+ // The AVX2 convolve functions always write rows with widths that are
+ // multiples of 16. So to avoid a buffer overflow, we may need to pad
+ // rows to a multiple of 16.
+ int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+ uint16_t *output = new uint16_t[output_n];
+ uint16_t *output2 = new uint16_t[output_n];
+
+ // Generate random filter kernels
+ DECLARE_ALIGNED(16, InterpKernel, hkernel);
+ DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+ generate_kernels(&rnd_, hkernel, vkernel);
+
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+
+ uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input);
+ uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output);
+ uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2);
+
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (i = 0; i < num_iters; ++i) {
+ for (j = 3; j < h - out_h - 4; j++) {
+ for (k = 3; k < w - out_w - 4; k++) {
+ av1_highbd_wiener_convolve_add_src_c(
+ input_ptr + j * w + k, w, output_ptr, out_w, hkernel, 16, vkernel,
+ 16, out_w, out_h, &conv_params, bd);
+ }
+ }
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
+ for (i = 0; i < num_iters; ++i) {
+ for (j = 3; j < h - out_h - 4; j++) {
+ for (k = 3; k < w - out_w - 4; k++) {
+ test_impl(input_ptr + j * w + k, w, output2_ptr, out_w, hkernel, 16,
+ vkernel, 16, out_w, out_h, &conv_params, bd);
+ }
+ }
+ }
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+ << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
+
+ delete[] input;
+ delete[] output;
+ delete[] output2;
+}
+} // namespace AV1HighbdHiprecConvolve
+} // namespace libaom_test
diff --git a/third_party/aom/test/hiprec_convolve_test_util.h b/third_party/aom/test/hiprec_convolve_test_util.h
new file mode 100644
index 000000000..2abe24b57
--- /dev/null
+++ b/third_party/aom/test/hiprec_convolve_test_util.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#define AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/mv.h"
+
+namespace libaom_test {
+
+namespace AV1HiprecConvolve {
+
+typedef void (*hiprec_convolve_func)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h,
+ const ConvolveParams *conv_params);
+
+typedef ::testing::tuple<int, int, int, hiprec_convolve_func>
+ HiprecConvolveParam;
+
+::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
+ hiprec_convolve_func filter);
+
+class AV1HiprecConvolveTest
+ : public ::testing::TestWithParam<HiprecConvolveParam> {
+ public:
+ virtual ~AV1HiprecConvolveTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput(hiprec_convolve_func test_impl);
+ void RunSpeedTest(hiprec_convolve_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+} // namespace AV1HiprecConvolve
+
+namespace AV1HighbdHiprecConvolve {
+typedef void (*highbd_hiprec_convolve_func)(
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bps);
+
+typedef ::testing::tuple<int, int, int, int, highbd_hiprec_convolve_func>
+ HighbdHiprecConvolveParam;
+
+::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
+ highbd_hiprec_convolve_func filter);
+
+class AV1HighbdHiprecConvolveTest
+ : public ::testing::TestWithParam<HighbdHiprecConvolveParam> {
+ public:
+ virtual ~AV1HighbdHiprecConvolveTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput(highbd_hiprec_convolve_func test_impl);
+ void RunSpeedTest(highbd_hiprec_convolve_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+} // namespace AV1HighbdHiprecConvolve
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
diff --git a/third_party/aom/test/horz_superres_test.cc b/third_party/aom/test/horz_superres_test.cc
new file mode 100644
index 000000000..973f55b66
--- /dev/null
+++ b/third_party/aom/test/horz_superres_test.cc
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/encoder/encoder.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+
+/* TESTING PARAMETERS */
+
+#define NUM_TEST_VIDEOS 3
+
+const int kBitrate = 40;
+
+// PSNR thresholds found by experiment
+const double kPSNRThresholds[] = { 26.0, 28.0, 20.0 };
+
+typedef struct {
+ const char *filename;
+ aom_img_fmt fmt;
+ aom_bit_depth_t bit_depth;
+ unsigned int profile;
+ unsigned int limit;
+ unsigned int screen_content;
+} TestVideoParam;
+
+const TestVideoParam kTestVideoVectors[] = {
+ { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0 },
+ { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0 },
+ { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1 },
+};
+
+// Superres modes tested
+// SUPERRES_QTHRESH is not included, as it has its own test
+const SUPERRES_MODE kSuperresModesNotQThresh[] = { SUPERRES_FIXED,
+ SUPERRES_RANDOM };
+
+// Superres denominators and superres kf denominators to be tested
+typedef tuple<int, int> SuperresDenominatorPair;
+const SuperresDenominatorPair kSuperresDenominators[] = {
+ make_tuple(16, 9), make_tuple(13, 11), make_tuple(9, 9),
+ make_tuple(13, 13), make_tuple(11, 16), make_tuple(8, 16),
+ make_tuple(16, 8), make_tuple(8, 8), make_tuple(9, 14),
+};
+
+// Superres q thresholds and superres kf q thresholds to be tested
+typedef tuple<int, int> SuperresQThresholdPair;
+const SuperresQThresholdPair kSuperresQThresholds[] = {
+ make_tuple(63, 63), make_tuple(63, 41), make_tuple(17, 63),
+ make_tuple(41, 11), make_tuple(1, 37), make_tuple(11, 11),
+ make_tuple(1, 1), make_tuple(17, 29), make_tuple(29, 11),
+};
+
+/* END (TESTING PARAMETERS) */
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_idx_, superres_mode_,
+// tuple(superres_denom_, superres_kf_denom_)>
+typedef tuple<const libaom_test::CodecFactory *, int, SUPERRES_MODE,
+ SuperresDenominatorPair>
+ HorzSuperresTestParam;
+
+class HorzSuperresEndToEndTest
+ : public ::testing::TestWithParam<HorzSuperresTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ HorzSuperresEndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)),
+ superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {
+ test_video_param_ = kTestVideoVectors[test_video_idx_];
+
+ SuperresDenominatorPair denoms = GET_PARAM(3);
+ superres_denom_ = ::testing::get<0>(denoms);
+ superres_kf_denom_ = ::testing::get<1>(denoms);
+ }
+
+ virtual ~HorzSuperresEndToEndTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libaom_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ // Set superres parameters
+ cfg_.rc_superres_mode = superres_mode_;
+ cfg_.rc_superres_denominator = superres_denom_;
+ cfg_.rc_superres_kf_denominator = superres_kf_denom_;
+ }
+
+ virtual void BeginPassHook(unsigned int) {
+ psnr_ = 0.0;
+ frame_count_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ psnr_ += pkt->data.psnr.psnr[0];
+ frame_count_++;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+ // Set cpu-used = 8 for speed
+ encoder->Control(AOME_SET_CPUUSED, 8);
+
+ // Test screen coding tools
+ if (test_video_param_.screen_content)
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ else
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (frame_count_) return psnr_ / frame_count_;
+ return 0.0;
+ }
+
+ double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; }
+
+ void DoTest() {
+ testing::internal::scoped_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ test_video_param_.limit));
+ ASSERT_TRUE(video.get() != NULL);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, GetPsnrThreshold())
+ << "superres_mode_ = " << superres_mode_
+ << ", superres_denom_ = " << superres_denom_
+ << ", superres_kf_denom_ = " << superres_kf_denom_;
+
+ EXPECT_EQ(test_video_param_.limit, frame_count_)
+ << "superres_mode_ = " << superres_mode_
+ << ", superres_denom_ = " << superres_denom_
+ << ", superres_kf_denom_ = " << superres_kf_denom_;
+ }
+
+ int test_video_idx_;
+ TestVideoParam test_video_param_;
+ SUPERRES_MODE superres_mode_;
+ int superres_denom_;
+ int superres_kf_denom_;
+
+ private:
+ double psnr_;
+ unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
+ ::testing::Range(0, NUM_TEST_VIDEOS),
+ ::testing::ValuesIn(kSuperresModesNotQThresh),
+ ::testing::ValuesIn(kSuperresDenominators));
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_idx_, tuple(superres_denom_,
+// superres_kf_denom_), tuple(superres_qthresh_,superres_kf_qthresh_)>
+typedef tuple<const libaom_test::CodecFactory *, int, SuperresDenominatorPair,
+ SuperresQThresholdPair>
+ HorzSuperresQThreshTestParam;
+
+class HorzSuperresQThreshEndToEndTest
+ : public ::testing::TestWithParam<HorzSuperresQThreshTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ HorzSuperresQThreshEndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)),
+ superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
+ test_video_param_ = kTestVideoVectors[test_video_idx_];
+
+ SuperresDenominatorPair denoms = GET_PARAM(2);
+ superres_denom_ = ::testing::get<0>(denoms);
+ superres_kf_denom_ = ::testing::get<1>(denoms);
+
+ SuperresQThresholdPair qthresholds = GET_PARAM(3);
+ superres_qthresh_ = ::testing::get<0>(qthresholds);
+ superres_kf_qthresh_ = ::testing::get<1>(qthresholds);
+ }
+
+ virtual ~HorzSuperresQThreshEndToEndTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libaom_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ // Set superres parameters
+ cfg_.rc_superres_mode = superres_mode_;
+ cfg_.rc_superres_denominator = superres_denom_;
+ cfg_.rc_superres_kf_denominator = superres_kf_denom_;
+ cfg_.rc_superres_qthresh = superres_qthresh_;
+ cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
+ }
+
+ virtual void BeginPassHook(unsigned int) {
+ psnr_ = 0.0;
+ frame_count_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ psnr_ += pkt->data.psnr.psnr[0];
+ frame_count_++;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
+
+ // Set cpu-used = 8 for speed
+ encoder->Control(AOME_SET_CPUUSED, 8);
+
+ // Test screen coding tools
+ if (test_video_param_.screen_content)
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ else
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (frame_count_) return psnr_ / frame_count_;
+ return 0.0;
+ }
+
+ double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; }
+
+ void DoTest() {
+ testing::internal::scoped_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ test_video_param_.limit));
+ ASSERT_TRUE(video.get() != NULL);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, GetPsnrThreshold())
+ << "superres_mode_ = " << superres_mode_
+ << ", superres_denom_ = " << superres_denom_
+ << ", superres_kf_denom_ = " << superres_kf_denom_
+ << ", superres_qthresh_ = " << superres_qthresh_
+ << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+
+ EXPECT_EQ(test_video_param_.limit, frame_count_)
+ << "superres_mode_ = " << superres_mode_
+ << ", superres_denom_ = " << superres_denom_
+ << ", superres_kf_denom_ = " << superres_kf_denom_
+ << ", superres_qthresh_ = " << superres_qthresh_
+ << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+ }
+
+ int test_video_idx_;
+ TestVideoParam test_video_param_;
+ SUPERRES_MODE superres_mode_;
+ int superres_denom_;
+ int superres_kf_denom_;
+ int superres_qthresh_;
+ int superres_kf_qthresh_;
+
+ private:
+ double psnr_;
+ unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) {
+ DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest,
+ ::testing::Range(0, NUM_TEST_VIDEOS),
+ ::testing::ValuesIn(kSuperresDenominators),
+ ::testing::ValuesIn(kSuperresQThresholds));
+
+} // namespace
diff --git a/third_party/aom/test/i420_video_source.h b/third_party/aom/test/i420_video_source.h
new file mode 100644
index 000000000..233e7152b
--- /dev/null
+++ b/third_party/aom/test/i420_video_source.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_I420_VIDEO_SOURCE_H_
+#define AOM_TEST_I420_VIDEO_SOURCE_H_
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/yuv_video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of raw yv12
+// so that we can do actual file encodes.
+class I420VideoSource : public YUVVideoSource {
+ public:
+ I420VideoSource(const std::string &file_name, unsigned int width,
+ unsigned int height, int rate_numerator, int rate_denominator,
+ unsigned int start, int limit)
+ : YUVVideoSource(file_name, AOM_IMG_FMT_I420, width, height,
+ rate_numerator, rate_denominator, start, limit) {}
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_I420_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/intra_edge_test.cc b/third_party/aom/test/intra_edge_test.cc
new file mode 100644
index 000000000..ce61402ac
--- /dev/null
+++ b/third_party/aom/test/intra_edge_test.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+template <typename F, typename T>
+class UpsampleTest : public FunctionEquivalenceTest<F> {
+ protected:
+ static const int kIterations = 1000000;
+ static const int kMinEdge = 4;
+ static const int kMaxEdge = 24;
+ static const int kBufSize = 2 * 64 + 32;
+ static const int kOffset = 16;
+
+ virtual ~UpsampleTest() {}
+
+ virtual void Execute(T *edge_tst) = 0;
+
+ void Common() {
+ edge_ref_ = &edge_ref_data_[kOffset];
+ edge_tst_ = &edge_tst_data_[kOffset];
+
+ Execute(edge_tst_);
+
+ const int max_idx = (size_ - 1) * 2;
+ for (int r = -2; r <= max_idx; ++r) {
+ ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+ }
+ }
+
+ T edge_ref_data_[kBufSize];
+ T edge_tst_data_[kBufSize];
+
+ T *edge_ref_;
+ T *edge_tst_;
+
+ int size_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*UP8B)(uint8_t *p, int size);
+typedef libaom_test::FuncParam<UP8B> TestFuncs;
+
+class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {
+ protected:
+ void Execute(uint8_t *edge_tst) {
+ params_.ref_func(edge_ref_, size_);
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
+ }
+};
+
+TEST_P(UpsampleTest8B, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ size_ = 4 * (this->rng_(4) + 1);
+
+ int i, pix = 0;
+ for (i = 0; i < kOffset + size_; ++i) {
+ pix = rng_.Rand8();
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = edge_ref_data_[i];
+ }
+
+ // Extend final sample
+ while (i < kBufSize) {
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ i++;
+ }
+
+ Common();
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, UpsampleTest8B,
+ ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
+ av1_upsample_intra_edge_sse4_1)));
+#endif // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*UPHB)(uint16_t *p, int size, int bd);
+typedef libaom_test::FuncParam<UPHB> TestFuncsHBD;
+
+class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {
+ protected:
+ void Execute(uint16_t *edge_tst) {
+ params_.ref_func(edge_ref_, size_, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
+ }
+ int bit_depth_;
+};
+
+TEST_P(UpsampleTestHB, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+ const int hi = 1 << bit_depth_;
+
+ size_ = 4 * (this->rng_(4) + 1);
+
+ int i, pix = 0;
+ for (i = 0; i < kOffset + size_; ++i) {
+ pix = rng_(hi);
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ }
+
+ // Extend final sample
+ while (i < kBufSize) {
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ i++;
+ }
+
+ Common();
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, UpsampleTestHB,
+ ::testing::Values(TestFuncsHBD(av1_upsample_intra_edge_high_c,
+ av1_upsample_intra_edge_high_sse4_1)));
+#endif // HAVE_SSE4_1
+
+template <typename F, typename T>
+class FilterEdgeTest : public FunctionEquivalenceTest<F> {
+ protected:
+ static const int kIterations = 1000000;
+ static const int kMaxEdge = 2 * 64;
+ static const int kBufSize = kMaxEdge + 32;
+ static const int kOffset = 15;
+
+ virtual ~FilterEdgeTest() {}
+
+ virtual void Execute(T *edge_tst) = 0;
+
+ void Common() {
+ edge_ref_ = &edge_ref_data_[kOffset];
+ edge_tst_ = &edge_tst_data_[kOffset];
+
+ Execute(edge_tst_);
+
+ for (int r = 0; r < size_; ++r) {
+ ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+ }
+ }
+
+ T edge_ref_data_[kBufSize];
+ T edge_tst_data_[kBufSize];
+
+ T *edge_ref_;
+ T *edge_tst_;
+
+ int size_;
+ int strength_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FE8B)(uint8_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
+
+class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
+ protected:
+ void Execute(uint8_t *edge_tst) {
+ params_.ref_func(edge_ref_, size_, strength_);
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+ }
+};
+
+TEST_P(FilterEdgeTest8B, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ strength_ = this->rng_(4);
+ size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+ int i, pix = 0;
+ for (i = 0; i < kOffset + size_; ++i) {
+ pix = rng_.Rand8();
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ }
+
+ Common();
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, FilterEdgeTest8B,
+ ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+ av1_filter_intra_edge_sse4_1)));
+#endif // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FEHB)(uint16_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FEHB> FilterEdgeTestFuncsHBD;
+
+class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {
+ protected:
+ void Execute(uint16_t *edge_tst) {
+ params_.ref_func(edge_ref_, size_, strength_);
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+ }
+ int bit_depth_;
+};
+
+TEST_P(FilterEdgeTestHB, RandomValues) {
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0: bit_depth_ = 8; break;
+ case 1: bit_depth_ = 10; break;
+ default: bit_depth_ = 12; break;
+ }
+ const int hi = 1 << bit_depth_;
+ strength_ = this->rng_(4);
+ size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+ int i, pix = 0;
+ for (i = 0; i < kOffset + size_; ++i) {
+ pix = rng_(hi);
+ edge_ref_data_[i] = pix;
+ edge_tst_data_[i] = pix;
+ }
+
+ Common();
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, FilterEdgeTestHB,
+ ::testing::Values(FilterEdgeTestFuncsHBD(
+ av1_filter_intra_edge_high_c,
+ av1_filter_intra_edge_high_sse4_1)));
+#endif // HAVE_SSE4_1
+
+// Speed tests
+
+TEST_P(UpsampleTest8B, DISABLED_Speed) {
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_.Rand8();
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+ }
+}
+
+TEST_P(UpsampleTestHB, DISABLED_Speed) {
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ bit_depth_ = 12;
+ const int hi = 1 << bit_depth_;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_(hi);
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
+ }
+}
+
+TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ strength_ = 1;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_.Rand8();
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+ // iterate over filter strengths (1,2,3)
+ strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+ }
+}
+
+TEST_P(FilterEdgeTestHB, DISABLED_Speed) {
+ const int test_count = 10000000;
+ size_ = kMaxEdge;
+ strength_ = 1;
+ bit_depth_ = 12;
+ const int hi = 1 << bit_depth_;
+ for (int i = 0; i < kOffset + size_; ++i) {
+ edge_tst_data_[i] = rng_(hi);
+ }
+ edge_tst_ = &edge_tst_data_[kOffset];
+ for (int iter = 0; iter < test_count; ++iter) {
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+ // iterate over filter strengths (1,2,3)
+ strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/intrabc_test.cc b/third_party/aom/test/intrabc_test.cc
new file mode 100644
index 000000000..3ea421708
--- /dev/null
+++ b/third_party/aom/test/intrabc_test.cc
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/tile_common.h"
+
+namespace {
+TEST(IntrabcTest, DvValidation) {
+ struct DvTestCase {
+ MV dv;
+ int mi_row_offset;
+ int mi_col_offset;
+ BLOCK_SIZE bsize;
+ bool valid;
+ };
+ const int kSubPelScale = 8;
+ const int kTileMaxMibWidth = 8;
+ const DvTestCase kDvCases[] = {
+ { { 0, 0 }, 0, 0, BLOCK_128X128, false },
+ { { 0, 0 }, 0, 0, BLOCK_64X64, false },
+ { { 0, 0 }, 0, 0, BLOCK_32X32, false },
+ { { 0, 0 }, 0, 0, BLOCK_16X16, false },
+ { { 0, 0 }, 0, 0, BLOCK_8X8, false },
+ { { 0, 0 }, 0, 0, BLOCK_4X4, false },
+ { { -MAX_SB_SIZE * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ true },
+ { { 0, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale, 0 },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ true },
+ { { MAX_SB_SIZE * kSubPelScale, 0 },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ false },
+ { { 0, MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_16X16,
+ false },
+ { { -32 * kSubPelScale, -32 * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ true },
+ { { -32 * kSubPelScale, -32 * kSubPelScale },
+ 32 / MI_SIZE,
+ 32 / MI_SIZE,
+ BLOCK_32X32,
+ false },
+ { { -32 * kSubPelScale - kSubPelScale / 2, -32 * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ false },
+ { { -33 * kSubPelScale, -32 * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ true },
+ { { -32 * kSubPelScale, -32 * kSubPelScale - kSubPelScale / 2 },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ false },
+ { { -32 * kSubPelScale, -33 * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_32X32,
+ true },
+ { { -MAX_SB_SIZE * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ true },
+ { { -(MAX_SB_SIZE + 1) * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE + 1) * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -(MAX_SB_SIZE - 1) * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ true },
+ { { -(MAX_SB_SIZE - 1) * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale, MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale,
+ (kTileMaxMibWidth - 2) * MAX_SB_SIZE * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ { { -MAX_SB_SIZE * kSubPelScale,
+ ((kTileMaxMibWidth - 2) * MAX_SB_SIZE + 1) * kSubPelScale },
+ MAX_SB_SIZE / MI_SIZE,
+ MAX_SB_SIZE / MI_SIZE,
+ BLOCK_LARGEST,
+ false },
+ };
+
+ MACROBLOCKD xd;
+ memset(&xd, 0, sizeof(xd));
+ xd.tile.mi_row_start = 8 * MAX_MIB_SIZE;
+ xd.tile.mi_row_end = 16 * MAX_MIB_SIZE;
+ xd.tile.mi_col_start = 24 * MAX_MIB_SIZE;
+ xd.tile.mi_col_end = xd.tile.mi_col_start + kTileMaxMibWidth * MAX_MIB_SIZE;
+ xd.plane[1].subsampling_x = 1;
+ xd.plane[1].subsampling_y = 1;
+ xd.plane[2].subsampling_x = 1;
+ xd.plane[2].subsampling_y = 1;
+
+ AV1_COMMON cm;
+ memset(&cm, 0, sizeof(cm));
+
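+ // Each case places the block at the given offset from the tile origin and
+ // checks that av1_is_dv_valid() matches the expected validity.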
+ for (int i = 0; i < static_cast<int>(GTEST_ARRAY_SIZE_(kDvCases)); ++i) {
+ EXPECT_EQ(static_cast<int>(kDvCases[i].valid),
+ av1_is_dv_valid(kDvCases[i].dv, &cm, &xd,
+ xd.tile.mi_row_start + kDvCases[i].mi_row_offset,
+ xd.tile.mi_col_start + kDvCases[i].mi_col_offset,
+ kDvCases[i].bsize, MAX_MIB_SIZE_LOG2))
+ << "DvCases[" << i << "]";
+ }
+}
+} // namespace
diff --git a/third_party/aom/test/intrapred_test.cc b/third_party/aom/test/intrapred_test.cc
new file mode 100644
index 000000000..1a1c0fc42
--- /dev/null
+++ b/third_party/aom/test/intrapred_test.cc
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/pred_common.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+const int count_test_block = 100000;
+
+typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bps);
+typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left);
+
+} // namespace
+
+// NOTE: Under gcc version 7.3.0 (Debian 7.3.0-5), if this template is in the
+// anonymous namespace, then we get a strange compiler warning in
+// the begin() and end() methods of the ParamGenerator template class in
+// gtest/internal/gtest-param-util.h:
+// warning: ‘<anonymous>’ is used uninitialized in this function
+// As a workaround, put this template outside the anonymous namespace.
+// See bug aomedia:2003.
+template <typename FuncType>
+struct IntraPredFunc {
+ IntraPredFunc(FuncType pred = NULL, FuncType ref = NULL,
+ int block_width_value = 0, int block_height_value = 0,
+ int bit_depth_value = 0)
+ : pred_fn(pred), ref_fn(ref), block_width(block_width_value),
+ block_height(block_height_value), bit_depth(bit_depth_value) {}
+
+ FuncType pred_fn;
+ FuncType ref_fn;
+ int block_width;
+ int block_height;
+ int bit_depth;
+};
+
+namespace {
+
+template <typename FuncType, typename Pixel>
+class AV1IntraPredTest
+ : public ::testing::TestWithParam<IntraPredFunc<FuncType> > {
+ public:
+ void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int block_width = params_.block_width;
+ const int block_height = params_.block_height;
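+ // Offset into above_data so the edge fill below can write above_row_[-1].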
+ above_row_ = above_data + 16;
+ left_col_ = left_col;
+ dst_ = dst;
+ ref_dst_ = ref_dst;
+ int error_count = 0;
+ for (int i = 0; i < count_test_block; ++i) {
+ // Fill edges with random data; the first iteration uses saturated values.
+ for (int x = -1; x <= block_width * 2; x++) {
+ if (i == 0) {
+ above_row_[x] = mask_;
+ } else {
+ above_row_[x] = rnd.Rand16() & mask_;
+ }
+ }
+ for (int y = 0; y < block_height; y++) {
+ if (i == 0) {
+ left_col_[y] = mask_;
+ } else {
+ left_col_[y] = rnd.Rand16() & mask_;
+ }
+ }
+ Predict();
+ CheckPrediction(i, &error_count);
+ }
+ ASSERT_EQ(0, error_count);
+ }
+
+ protected:
+ virtual void SetUp() {
+ params_ = this->GetParam();
+ stride_ = params_.block_width * 3;
+ mask_ = (1 << params_.bit_depth) - 1;
+ }
+
+ virtual void Predict() = 0;
+
+ void CheckPrediction(int test_case_number, int *error_count) const {
+ // For each pixel, ensure that the calculated value matches the reference.
+ const int block_width = params_.block_width;
+ const int block_height = params_.block_height;
+ for (int y = 0; y < block_height; y++) {
+ for (int x = 0; x < block_width; x++) {
+ *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
+ if (*error_count == 1) {
+ ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
+ << " Failed on Test Case Number " << test_case_number
+ << " location: x = " << x << " y = " << y;
+ }
+ }
+ }
+ }
+
+ Pixel *above_row_;
+ Pixel *left_col_;
+ Pixel *dst_;
+ Pixel *ref_dst_;
+ ptrdiff_t stride_;
+ int mask_;
+
+ IntraPredFunc<FuncType> params_;
+};
+
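+// Each Predict() implementation runs the C reference first and then the
+// optimized predictor under ASM_REGISTER_STATE_CHECK; CheckPrediction()
+// compares the two outputs element by element.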
+class HighbdIntraPredTest : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
+ protected:
+ void Predict() {
+ const int bit_depth = params_.bit_depth;
+ params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+ ASM_REGISTER_STATE_CHECK(
+ params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
+ }
+};
+
+class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
+ protected:
+ void Predict() {
+ params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+ ASM_REGISTER_STATE_CHECK(
+ params_.pred_fn(dst_, stride_, above_row_, left_col_));
+ }
+};
+
+// Suppress an uninitialized warning. Once there are implementations to test,
+// this can be restored.
+TEST_P(HighbdIntraPredTest, Bitexact) {
+ // max block size is 64
+ DECLARE_ALIGNED(16, uint16_t, left_col[2 * 64]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[2 * 64 + 64]);
+ DECLARE_ALIGNED(16, uint16_t, dst[3 * 64 * 64]);
+ DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 64 * 64]);
+ av1_zero(left_col);
+ av1_zero(above_data);
+ RunTest(left_col, above_data, dst, ref_dst);
+}
+
+// Same issue as above, but for ARM.
+#if !HAVE_NEON
+TEST_P(LowbdIntraPredTest, Bitexact) {
+ // max block size is 32
+ DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
+ DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+ av1_zero(left_col);
+ av1_zero(above_data);
+ RunTest(left_col, above_data, dst, ref_dst);
+}
+#endif // !HAVE_NEON
+
+// -----------------------------------------------------------------------------
+// High Bit Depth Tests
+#define highbd_entry(type, width, height, opt, bd) \
+ IntraPredFunc<HighbdIntraPred>( \
+ &aom_highbd_##type##_predictor_##width##x##height##_##opt, \
+ &aom_highbd_##type##_predictor_##width##x##height##_c, width, height, \
+ bd)
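+// highbd_entry(type, w, h, opt, bd) pairs the optimized
+// aom_highbd_<type>_predictor_<w>x<h>_<opt> function with its C reference at
+// the given bit depth.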
+
+#if 0
+#define highbd_intrapred(type, opt, bd) \
+ highbd_entry(type, 4, 4, opt, bd), highbd_entry(type, 4, 8, opt, bd), \
+ highbd_entry(type, 8, 4, opt, bd), highbd_entry(type, 8, 8, opt, bd), \
+ highbd_entry(type, 8, 16, opt, bd), highbd_entry(type, 16, 8, opt, bd), \
+ highbd_entry(type, 16, 16, opt, bd), \
+ highbd_entry(type, 16, 32, opt, bd), \
+ highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
+#endif
+
+// ---------------------------------------------------------------------------
+// Low Bit Depth Tests
+
+#define lowbd_entry(type, width, height, opt) \
+ IntraPredFunc<IntraPred>(&aom_##type##_predictor_##width##x##height##_##opt, \
+ &aom_##type##_predictor_##width##x##height##_c, \
+ width, height, 8)
+
+#define lowbd_intrapred(type, opt) \
+ lowbd_entry(type, 4, 4, opt), lowbd_entry(type, 4, 8, opt), \
+ lowbd_entry(type, 8, 4, opt), lowbd_entry(type, 8, 8, opt), \
+ lowbd_entry(type, 8, 16, opt), lowbd_entry(type, 16, 8, opt), \
+ lowbd_entry(type, 16, 16, opt), lowbd_entry(type, 16, 32, opt), \
+ lowbd_entry(type, 32, 16, opt), lowbd_entry(type, 32, 32, opt)
+
+#if HAVE_SSE2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVector[] = {
+ lowbd_intrapred(dc, sse2), lowbd_intrapred(dc_top, sse2),
+ lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2),
+ lowbd_intrapred(v, sse2), lowbd_intrapred(h, sse2),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, LowbdIntraPredTest,
+ ::testing::ValuesIn(LowbdIntraPredTestVector));
+
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
+ lowbd_intrapred(paeth, ssse3),
+ lowbd_intrapred(smooth, ssse3),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, LowbdIntraPredTest,
+ ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
+
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
+ lowbd_entry(dc, 32, 32, avx2), lowbd_entry(dc_top, 32, 32, avx2),
+ lowbd_entry(dc_left, 32, 32, avx2), lowbd_entry(dc_128, 32, 32, avx2),
+ lowbd_entry(v, 32, 32, avx2), lowbd_entry(h, 32, 32, avx2),
+ lowbd_entry(dc, 32, 16, avx2), lowbd_entry(dc_top, 32, 16, avx2),
+ lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2),
+ lowbd_entry(v, 32, 16, avx2), lowbd_entry(paeth, 16, 8, avx2),
+ lowbd_entry(paeth, 16, 16, avx2), lowbd_entry(paeth, 16, 32, avx2),
+ lowbd_entry(paeth, 32, 16, avx2), lowbd_entry(paeth, 32, 32, avx2),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, LowbdIntraPredTest,
+ ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
+
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorNeon[] = {
+ highbd_entry(dc, 4, 4, neon, 8), highbd_entry(dc, 8, 8, neon, 8),
+ highbd_entry(dc, 16, 16, neon, 8), highbd_entry(dc, 32, 32, neon, 8),
+ highbd_entry(dc, 64, 64, neon, 8),
+};
+
+INSTANTIATE_TEST_CASE_P(NEON, HighbdIntraPredTest,
+ ::testing::ValuesIn(HighbdIntraPredTestVectorNeon));
+
+#endif // HAVE_NEON
+} // namespace
diff --git a/third_party/aom/test/invalid_file_test.cc b/third_party/aom/test/invalid_file_test.cc
new file mode 100644
index 000000000..5b4f5a6c3
--- /dev/null
+++ b/third_party/aom/test/invalid_file_test.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+
+struct DecodeParam {
+ int threads;
+ const char *filename;
+};
+
+std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
+ return os << "threads: " << dp.threads << " file: " << dp.filename;
+}
+
+class InvalidFileTest : public ::libaom_test::DecoderTest,
+ public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+ InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+
+ virtual ~InvalidFileTest() {
+ if (res_file_ != NULL) fclose(res_file_);
+ }
+
+ void OpenResFile(const std::string &res_file_name) {
+ res_file_ = libaom_test::OpenTestDataFile(res_file_name);
+ ASSERT_TRUE(res_file_ != NULL)
+ << "Result file open failed. Filename: " << res_file_name;
+ }
+
+ virtual bool HandleDecodeResult(
+ const aom_codec_err_t res_dec,
+ const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) {
+ EXPECT_TRUE(res_file_ != NULL);
+ int expected_res_dec = -1;
+
+ // Read integer result.
+ const int res = fscanf(res_file_, "%d", &expected_res_dec);
+ EXPECT_NE(res, EOF) << "Read result data failed";
+
+ if (expected_res_dec != -1) {
+ // Check results match.
+ const DecodeParam input = GET_PARAM(1);
+ if (input.threads > 1) {
+ // The serial decode check is too strict for tile-threaded decoding, as
+ // there is no guarantee on the decode order or on which specific error
+ // will take precedence. Currently a tile-level error is not forwarded,
+ // so the frame will simply be marked corrupt.
+ EXPECT_TRUE(res_dec == expected_res_dec ||
+ res_dec == AOM_CODEC_CORRUPT_FRAME)
+ << "Results don't match: frame number = " << video.frame_number()
+ << ". (" << decoder->DecodeError()
+ << "). Expected: " << expected_res_dec << " or "
+ << AOM_CODEC_CORRUPT_FRAME;
+ } else {
+ EXPECT_EQ(expected_res_dec, res_dec)
+ << "Results don't match: frame number = " << video.frame_number()
+ << ". (" << decoder->DecodeError() << ")";
+ }
+ }
+
+ return !HasFailure();
+ }
+
+ virtual void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
+ libaom_test::CompressedVideoSource * /*video*/,
+ const aom_codec_err_t /*res_peek*/) {}
+
+ void RunTest() {
+ const DecodeParam input = GET_PARAM(1);
+ aom_codec_dec_cfg_t cfg = { 0, 0, 0, CONFIG_LOWBITDEPTH, { 1 } };
+ cfg.threads = input.threads;
+ const std::string filename = input.filename;
+ libaom_test::IVFVideoSource decode_video(filename);
+ decode_video.Init();
+
+ // Construct the result file name. The file holds a list of expected
+ // integer results, one for each decoded frame. Any result that doesn't
+ // match the file's list will cause a test failure.
+ const std::string res_filename = filename + ".res";
+ OpenResFile(res_filename);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
+ }
+
+ private:
+ FILE *res_file_;
+};
+
+TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
+
+const DecodeParam kAV1InvalidFileTests[] = {
+ { 1, "invalid-bug-1814.ivf" },
+ { 4, "invalid-oss-fuzz-9463.ivf" },
+ { 1, "invalid-oss-fuzz-9482.ivf" },
+ { 1, "invalid-oss-fuzz-9720.ivf" },
+ { 1, "invalid-oss-fuzz-10061.ivf" },
+ { 1, "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf" },
+ { 1, "invalid-oss-fuzz-10227.ivf" },
+};
+
+AV1_INSTANTIATE_TEST_CASE(InvalidFileTest,
+ ::testing::ValuesIn(kAV1InvalidFileTests));
+
+} // namespace
diff --git a/third_party/aom/test/ivf_video_source.h b/third_party/aom/test/ivf_video_source.h
new file mode 100644
index 000000000..ff2841445
--- /dev/null
+++ b/third_party/aom/test/ivf_video_source.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_IVF_VIDEO_SOURCE_H_
+#define AOM_TEST_IVF_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+
+#include "aom_ports/sanitizer.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
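+// IVF layout: a 32-byte file header followed by frames, each preceded by a
+// 12-byte frame header (a 4-byte payload size and an 8-byte timestamp).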
+const unsigned int kCodeBufferSize = 256 * 1024 * 1024;
+const unsigned int kIvfFileHdrSize = 32;
+const unsigned int kIvfFrameHdrSize = 12;
+
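+// Read a 32-bit little-endian value; IVF headers are little-endian.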
+static unsigned int MemGetLe32(const uint8_t *mem) {
+ return (mem[3] << 24) | (mem[2] << 16) | (mem[1] << 8) | (mem[0]);
+}
+
+// This class extends CompressedVideoSource to allow parsing of IVF files,
+// so that we can do actual file decodes.
+class IVFVideoSource : public CompressedVideoSource {
+ public:
+ explicit IVFVideoSource(const std::string &file_name)
+ : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL),
+ frame_sz_(0), frame_(0), end_of_file_(false) {}
+
+ virtual ~IVFVideoSource() {
+ delete[] compressed_frame_buf_;
+
+ if (input_file_) fclose(input_file_);
+ }
+
+ virtual void Init() {
+ // Allocate a buffer for reading in the compressed video frame.
+ compressed_frame_buf_ = new uint8_t[kCodeBufferSize];
+ ASSERT_TRUE(compressed_frame_buf_ != NULL)
+ << "Allocate frame buffer failed";
+ ASAN_POISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
+ }
+
+ virtual void Begin() {
+ input_file_ = OpenTestDataFile(file_name_);
+ ASSERT_TRUE(input_file_ != NULL)
+ << "Input file open failed. Filename: " << file_name_;
+
+ // Read file header
+ uint8_t file_hdr[kIvfFileHdrSize];
+ ASSERT_EQ(kIvfFileHdrSize, fread(file_hdr, 1, kIvfFileHdrSize, input_file_))
+ << "File header read failed.";
+ // Check file header
+ ASSERT_TRUE(file_hdr[0] == 'D' && file_hdr[1] == 'K' &&
+ file_hdr[2] == 'I' && file_hdr[3] == 'F')
+ << "Input is not an IVF file.";
+
+ FillFrame();
+ }
+
+ virtual void Next() {
+ ++frame_;
+ FillFrame();
+ }
+
+ void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
+ uint8_t frame_hdr[kIvfFrameHdrSize];
+ // Check frame header and read a frame from input_file.
+ if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) !=
+ kIvfFrameHdrSize) {
+ end_of_file_ = true;
+ } else {
+ end_of_file_ = false;
+
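+ // The first four bytes of the IVF frame header hold the frame payload size.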
+ frame_sz_ = MemGetLe32(frame_hdr);
+ ASSERT_LE(frame_sz_, kCodeBufferSize)
+ << "Frame is too big for allocated code buffer";
+ ASAN_UNPOISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
+ ASSERT_EQ(frame_sz_,
+ fread(compressed_frame_buf_, 1, frame_sz_, input_file_))
+ << "Failed to read complete frame";
+ ASAN_POISON_MEMORY_REGION(compressed_frame_buf_ + frame_sz_,
+ kCodeBufferSize - frame_sz_);
+ }
+ }
+
+ virtual const uint8_t *cxdata() const {
+ return end_of_file_ ? NULL : compressed_frame_buf_;
+ }
+ virtual size_t frame_size() const { return frame_sz_; }
+ virtual unsigned int frame_number() const { return frame_; }
+
+ protected:
+ std::string file_name_;
+ FILE *input_file_;
+ uint8_t *compressed_frame_buf_;
+ size_t frame_sz_;
+ unsigned int frame_;
+ bool end_of_file_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_IVF_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/lightfield_test.sh b/third_party/aom/test/lightfield_test.sh
new file mode 100755
index 000000000..b957a6b79
--- /dev/null
+++ b/third_party/aom/test/lightfield_test.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+## Copyright (c) 2018, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the lightfield example.
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $infile is required.
+lightfield_test_verify_environment() {
+ local infile="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv"
+ if [ ! -e "${infile}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+}
+
+# Run the lightfield example
+lightfield_test() {
+ local img_width=1024
+ local img_height=1024
+ local lf_width=10
+ local lf_height=10
+ local lf_blocksize=5
+ local num_references=4
+ local num_tile_lists=2
+
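+ # Pipeline: encode the light field, extract a tile-list bitstream, decode it
+ # with the tile-list decoder and the reference decoder, then compare outputs.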
+ # Encode the lightfield.
+ local encoder="${LIBAOM_BIN_PATH}/lightfield_encoder${AOM_TEST_EXE_SUFFIX}"
+ local yuv_file="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv"
+ local lf_file="${AOM_TEST_OUTPUT_DIR}/vase10x10.ivf"
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${img_width}" "${img_height}" \
+ "${yuv_file}" "${lf_file}" "${lf_width}" \
+ "${lf_height}" "${lf_blocksize}" ${devnull}
+
+ [ -e "${lf_file}" ] || return 1
+
+ # Parse lightfield bitstream to construct and output a new bitstream that can
+ # be decoded by an AV1 decoder.
+ local bs_decoder="${LIBAOM_BIN_PATH}/lightfield_bitstream_parsing${AOM_TEST_EXE_SUFFIX}"
+ local tl_file="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.ivf"
+ if [ ! -x "${bs_decoder}" ]; then
+ elog "${bs_decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${bs_decoder}" "${lf_file}" "${tl_file}" \
+ "${num_references}" ${devnull}
+
+ [ -e "${tl_file}" ] || return 1
+
+ # Run lightfield tile list decoder
+ local tl_decoder="${LIBAOM_BIN_PATH}/lightfield_tile_list_decoder${AOM_TEST_EXE_SUFFIX}"
+ local tl_outfile="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.yuv"
+ if [ ! -x "${tl_decoder}" ]; then
+ elog "${tl_decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${tl_decoder}" "${tl_file}" "${tl_outfile}" \
+ "${num_references}" "${num_tile_lists}" ${devnull}
+
+ [ -e "${tl_outfile}" ] || return 1
+
+ # Run reference lightfield decoder
+ local ref_decoder="${LIBAOM_BIN_PATH}/lightfield_decoder${AOM_TEST_EXE_SUFFIX}"
+ local tl_reffile="${AOM_TEST_OUTPUT_DIR}/vase_reference.yuv"
+ if [ ! -x "${ref_decoder}" ]; then
+ elog "${ref_decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${ref_decoder}" "${lf_file}" "${tl_reffile}" \
+ "${num_references}" ${devnull}
+
+ [ -e "${tl_reffile}" ] || return 1
+
+ # Check whether tl_outfile and tl_reffile are identical; if they differ, this test fails.
+ diff ${tl_outfile} ${tl_reffile} > /dev/null
+ if [ $? -eq 1 ]; then
+ return 1
+ fi
+}
+
+lightfield_test_tests="lightfield_test"
+
+run_tests lightfield_test_verify_environment "${lightfield_test_tests}"
diff --git a/third_party/aom/test/log2_test.cc b/third_party/aom/test/log2_test.cc
new file mode 100644
index 000000000..d7840c68b
--- /dev/null
+++ b/third_party/aom/test/log2_test.cc
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/bitops.h"
+#include "av1/common/entropymode.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
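+// get_msb(n) is expected to return floor(log2(n)) for any n >= 1.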
+TEST(Log2Test, GetMsb) {
+ // Test small numbers exhaustively.
+ for (unsigned int n = 1; n < 10000; n++) {
+ EXPECT_EQ(get_msb(n), static_cast<int>(floor(log2(n))));
+ }
+
+ // Test every power of 2 and the two adjacent numbers.
+ for (int exponent = 2; exponent < 32; exponent++) {
+ const unsigned int power_of_2 = 1U << exponent;
+ EXPECT_EQ(get_msb(power_of_2 - 1), exponent - 1);
+ EXPECT_EQ(get_msb(power_of_2), exponent);
+ EXPECT_EQ(get_msb(power_of_2 + 1), exponent);
+ }
+}
+
+TEST(Log2Test, Av1CeilLog2) {
+ // Test small numbers exhaustively.
+ EXPECT_EQ(av1_ceil_log2(0), 0);
+ for (int n = 1; n < 10000; n++) {
+ EXPECT_EQ(av1_ceil_log2(n), static_cast<int>(ceil(log2(n))));
+ }
+
+ // Test every power of 2 and the two adjacent numbers.
+ for (int exponent = 2; exponent < 31; exponent++) {
+ const int power_of_2 = 1 << exponent;
+ EXPECT_EQ(av1_ceil_log2(power_of_2 - 1), exponent);
+ EXPECT_EQ(av1_ceil_log2(power_of_2), exponent);
+ // The current implementation of av1_ceil_log2 only works up to 2^30.
+ if (exponent < 30) {
+ EXPECT_EQ(av1_ceil_log2(power_of_2 + 1), exponent + 1);
+ }
+ }
+}
diff --git a/third_party/aom/test/lossless_test.cc b/third_party/aom/test/lossless_test.cc
new file mode 100644
index 000000000..3f8e89c81
--- /dev/null
+++ b/third_party/aom/test/lossless_test.cc
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
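+// PSNR is reported as a capped 100 dB for bit-exact frames, so reaching
+// kMaxPsnr implies lossless reconstruction.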
+const int kMaxPsnr = 100;
+
+class LosslessTestLarge
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ LosslessTestLarge()
+ : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
+ encoding_mode_(GET_PARAM(1)) {}
+
+ virtual ~LosslessTestLarge() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ // Only call the lossless control when the quantizer is > 0; when both
+ // quantizers are 0, this verifies that a zero quantizer alone activates
+ // lossless coding.
+ if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
+ encoder->Control(AV1E_SET_LOSSLESS, 1);
+ }
+ }
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+ psnr_ = kMaxPsnr;
+ nframes_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0];
+ }
+
+ double GetMinPsnr() const { return psnr_; }
+
+ private:
+ double psnr_;
+ unsigned int nframes_;
+ libaom_test::TestMode encoding_mode_;
+};
+
+TEST_P(LosslessTestLarge, TestLossLessEncoding) {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 0;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ // The dimensions are intentionally changed for better test coverage.
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr();
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
+ libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 5);
+
+ cfg_.g_profile = 1;
+ cfg_.g_timebase = video.timebase();
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 0;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr();
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.g_lag_in_frames = 25;
+ // Intentionally set Q > 0 to make sure the control can be used to activate
+ // lossless coding.
+ cfg_.rc_min_quantizer = 10;
+ cfg_.rc_max_quantizer = 20;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ const double psnr_lossless = GetMinPsnr();
+ EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+AV1_INSTANTIATE_TEST_CASE(LosslessTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood,
+ ::libaom_test::kTwoPassGood));
+} // namespace
diff --git a/third_party/aom/test/lpf_test.cc b/third_party/aom/test/lpf_test.cc
new file mode 100644
index 000000000..451bffd2a
--- /dev/null
+++ b/third_party/aom/test/lpf_test.cc
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+// Horizontally and vertically we need 32x32: 8 coeffs preceding the filtered
+// section, 16 coeffs within the filtered section, and 8 coeffs following it.
+const int kNumCoeffs = 1024;
+
+const int number_of_iterations = 10000;
+
+const int kSpeedTestNum = 500000;
+
+#define LOOP_PARAM \
+ int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh
+#define DUAL_LOOP_PARAM \
+ int p, const uint8_t *blimit0, const uint8_t *limit0, \
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, \
+ const uint8_t *thresh1
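+// These macros expand to the shared loop-filter argument lists so the function
+// pointer typedefs and the call_filter()/call_dualfilter() helpers below stay
+// in sync.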
+
+typedef void (*loop_op_t)(uint8_t *s, LOOP_PARAM);
+typedef void (*dual_loop_op_t)(uint8_t *s, DUAL_LOOP_PARAM);
+typedef void (*hbdloop_op_t)(uint16_t *s, LOOP_PARAM, int bd);
+typedef void (*hbddual_loop_op_t)(uint16_t *s, DUAL_LOOP_PARAM, int bd);
+
+typedef ::testing::tuple<hbdloop_op_t, hbdloop_op_t, int> hbdloop_param_t;
+typedef ::testing::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>
+ hbddual_loop_param_t;
+typedef ::testing::tuple<loop_op_t, loop_op_t, int> loop_param_t;
+typedef ::testing::tuple<dual_loop_op_t, dual_loop_op_t, int> dual_loop_param_t;
+
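+// Fill the test block with runs of values whose neighbors differ by less than
+// the filter limit, first along rows and then along columns, so the loop
+// filter has edges it will actually modify.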
+template <typename Pixel_t, int PIXEL_WIDTH_t>
+void InitInput(Pixel_t *s, Pixel_t *ref_s, ACMRandom *rnd, const uint8_t limit,
+ const int mask, const int32_t p, const int i) {
+ uint16_t tmp_s[kNumCoeffs];
+
+ for (int j = 0; j < kNumCoeffs;) {
+ const uint8_t val = rnd->Rand8();
+ if (val & 0x80) { // 50% chance to choose a new value.
+ tmp_s[j] = rnd->Rand16();
+ j++;
+ } else { // 50% chance to repeat previous value in row X times.
+ int k = 0;
+ while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+ if (j < 1) {
+ tmp_s[j] = rnd->Rand16();
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp_s[j] = tmp_s[j - 1] + (limit - 1);
+ } else { // Decrement by a value within the limit.
+ tmp_s[j] = tmp_s[j - 1] - (limit - 1);
+ }
+ j++;
+ }
+ }
+ }
+
+ for (int j = 0; j < kNumCoeffs;) {
+ const uint8_t val = rnd->Rand8();
+ if (val & 0x80) {
+ j++;
+ } else { // 50% chance to repeat previous value in column X times.
+ int k = 0;
+ while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+ if (j < 1) {
+ tmp_s[j] = rnd->Rand16();
+ } else if (val & 0x20) { // Increment by a value within the limit.
+ tmp_s[(j % 32) * 32 + j / 32] =
+ tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1);
+ } else { // Decrement by a value within the limit.
+ tmp_s[(j % 32) * 32 + j / 32] =
+ tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1);
+ }
+ j++;
+ }
+ }
+ }
+
+ for (int j = 0; j < kNumCoeffs; j++) {
+ if (i % 2) {
+ s[j] = tmp_s[j] & mask;
+ } else {
+ s[j] = tmp_s[p * (j % p) + j / p] & mask;
+ }
+ ref_s[j] = s[j];
+ }
+}
+
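+// Draw random blimit/limit/thresh values from the ranges the loop-filter
+// parameters are allowed to take.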
+uint8_t GetOuterThresh(ACMRandom *rnd) {
+ return static_cast<uint8_t>(rnd->PseudoUniform(3 * MAX_LOOP_FILTER + 5));
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+ return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1));
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+ return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1) >> 4);
+}
+
+template <typename func_type_t, typename params_t>
+class LoopTestParam : public ::testing::TestWithParam<params_t> {
+ public:
+ virtual ~LoopTestParam() {}
+ virtual void SetUp() {
+ loopfilter_op_ = ::testing::get<0>(this->GetParam());
+ ref_loopfilter_op_ = ::testing::get<1>(this->GetParam());
+ bit_depth_ = ::testing::get<2>(this->GetParam());
+ mask_ = (1 << bit_depth_) - 1;
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ int bit_depth_;
+ int mask_;
+ func_type_t loopfilter_op_;
+ func_type_t ref_loopfilter_op_;
+};
+
+void call_filter(uint16_t *s, LOOP_PARAM, int bd, hbdloop_op_t op) {
+ op(s, p, blimit, limit, thresh, bd);
+}
+void call_filter(uint8_t *s, LOOP_PARAM, int bd, loop_op_t op) {
+ (void)bd;
+ op(s, p, blimit, limit, thresh);
+}
+void call_dualfilter(uint16_t *s, DUAL_LOOP_PARAM, int bd,
+ hbddual_loop_op_t op) {
+ op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
+}
+void call_dualfilter(uint8_t *s, DUAL_LOOP_PARAM, int bd, dual_loop_op_t op) {
+ (void)bd;
+ op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+}
+
+typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
+typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
+typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
+ Loop8Test9Param_hbd;
+typedef LoopTestParam<dual_loop_op_t, dual_loop_param_t> Loop8Test9Param_lbd;
+
+#define OPCHECK(a, b) \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = number_of_iterations; \
+ const int32_t p = kNumCoeffs / 32; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
+ int err_count_total = 0; \
+ int first_failure = -1; \
+ for (int i = 0; i < count_test_block; ++i) { \
+ int err_count = 0; \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ InitInput<a, b>(s, ref_s, &rnd, *limit, mask_, p, i); \
+ call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \
+ ref_loopfilter_op_); \
+ ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \
+ thresh, bit_depth_, loopfilter_op_)); \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ err_count += ref_s[j] != s[j]; \
+ } \
+ if (err_count && !err_count_total) { \
+ first_failure = i; \
+ } \
+ err_count_total += err_count; \
+ } \
+ EXPECT_EQ(0, err_count_total) \
+ << "Error: Loop8Test6Param, C output doesn't match SIMD " \
+ "loopfilter output. " \
+ << "First failed at test case " << first_failure;
+
+TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); }
+TEST_P(Loop8Test6Param_lbd, OperationCheck) { OPCHECK(uint8_t, 8); }
+
+#define VALCHECK(a, b) \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = number_of_iterations; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
+ int err_count_total = 0; \
+ int first_failure = -1; \
+ for (int i = 0; i < count_test_block; ++i) { \
+ int err_count = 0; \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ s[j] = rnd.Rand16() & mask_; \
+ ref_s[j] = s[j]; \
+ } \
+ call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \
+ ref_loopfilter_op_); \
+ ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \
+ thresh, bit_depth_, loopfilter_op_)); \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ err_count += ref_s[j] != s[j]; \
+ } \
+ if (err_count && !err_count_total) { \
+ first_failure = i; \
+ } \
+ err_count_total += err_count; \
+ } \
+ EXPECT_EQ(0, err_count_total) \
+ << "Error: Loop8Test6Param, C output doesn't match SIMD " \
+ "loopfilter output. " \
+ << "First failed at test case " << first_failure;
+
+TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); }
+TEST_P(Loop8Test6Param_lbd, ValueCheck) { VALCHECK(uint8_t, 8); }
+
+#define SPEEDCHECK(a, b) \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = kSpeedTestNum; \
+ const int32_t bd = bit_depth_; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ s[j] = rnd.Rand16() & mask_; \
+ } \
+ for (int i = 0; i < count_test_block; ++i) { \
+ call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, loopfilter_op_); \
+ }
+
+TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); }
+TEST_P(Loop8Test6Param_lbd, DISABLED_Speed) { SPEEDCHECK(uint8_t, 8); }
+
+#define OPCHECKd(a, b) \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = number_of_iterations; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
+ int err_count_total = 0; \
+ int first_failure = -1; \
+ for (int i = 0; i < count_test_block; ++i) { \
+ int err_count = 0; \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ const uint8_t limit = *limit0 < *limit1 ? *limit0 : *limit1; \
+ InitInput<a, b>(s, ref_s, &rnd, limit, mask_, p, i); \
+ call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, ref_loopfilter_op_); \
+ ASM_REGISTER_STATE_CHECK( \
+ call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, loopfilter_op_)); \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ err_count += ref_s[j] != s[j]; \
+ } \
+ if (err_count && !err_count_total) { \
+ first_failure = i; \
+ } \
+ err_count_total += err_count; \
+ } \
+ EXPECT_EQ(0, err_count_total) \
+ << "Error: Loop8Test9Param, C output doesn't match SIMD " \
+ "loopfilter output. " \
+ << "First failed at test case " << first_failure;
+
+TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); }
+TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); }
+
+#define VALCHECKd(a, b) \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = number_of_iterations; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \
+ int err_count_total = 0; \
+ int first_failure = -1; \
+ for (int i = 0; i < count_test_block; ++i) { \
+ int err_count = 0; \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ s[j] = rnd.Rand16() & mask_; \
+ ref_s[j] = s[j]; \
+ } \
+ call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, ref_loopfilter_op_); \
+ ASM_REGISTER_STATE_CHECK( \
+ call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, loopfilter_op_)); \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ err_count += ref_s[j] != s[j]; \
+ } \
+ if (err_count && !err_count_total) { \
+ first_failure = i; \
+ } \
+ err_count_total += err_count; \
+ } \
+ EXPECT_EQ(0, err_count_total) \
+ << "Error: Loop8Test9Param, C output doesn't match SIMD " \
+ "loopfilter output. " \
+ << "First failed at test case " << first_failure;
+
+TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); }
+TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); }
+
+#define SPEEDCHECKd(a, b) \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ const int count_test_block = kSpeedTestNum; \
+ DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \
+ uint8_t tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetOuterThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetInnerThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ tmp = GetHevThresh(&rnd); \
+ DECLARE_ALIGNED(16, const uint8_t, \
+ thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+ tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+ int32_t p = kNumCoeffs / 32; \
+ for (int j = 0; j < kNumCoeffs; ++j) { \
+ s[j] = rnd.Rand16() & mask_; \
+ } \
+ for (int i = 0; i < count_test_block; ++i) { \
+ call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \
+ limit1, thresh1, bit_depth_, loopfilter_op_); \
+ }
+
+TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); }
+TEST_P(Loop8Test9Param_lbd, DISABLED_Speed) { SPEEDCHECKd(uint8_t, 8); }
+
+using ::testing::make_tuple;
+
+#if HAVE_SSE2
+
+const hbdloop_param_t kHbdLoop8Test6[] = {
+ make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+ &aom_highbd_lpf_horizontal_14_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8),
+
+ make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+ 8),
+ make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+ &aom_highbd_lpf_horizontal_14_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+ 10),
+ make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_horizontal_14_sse2,
+ &aom_highbd_lpf_horizontal_14_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c,
+ 12),
+ make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_hbd,
+ ::testing::ValuesIn(kHbdLoop8Test6));
+
+const loop_param_t kLoop8Test6[] = {
+ make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8),
+ make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8),
+ make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_sse2, &aom_lpf_horizontal_14_c, 8),
+ make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8),
+ make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8),
+ make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test6Param_lbd,
+ ::testing::ValuesIn(kLoop8Test6));
+
+const dual_loop_param_t kLoop8Test9[] = {
+ make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_6_dual_sse2, &aom_lpf_horizontal_6_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_6_dual_sse2, &aom_lpf_vertical_6_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c,
+ 8),
+ make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_lbd,
+ ::testing::ValuesIn(kLoop8Test9));
+
+#endif // HAVE_SSE2
+
+#if HAVE_SSE2
+const hbddual_loop_param_t kHbdLoop8Test9[] = {
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+ &aom_highbd_lpf_horizontal_6_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+ &aom_highbd_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+ &aom_highbd_lpf_vertical_6_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+ &aom_highbd_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+ &aom_highbd_lpf_horizontal_6_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+ &aom_highbd_lpf_vertical_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+ &aom_highbd_lpf_vertical_6_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+ &aom_highbd_lpf_vertical_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2,
+ &aom_highbd_lpf_horizontal_6_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2,
+ &aom_highbd_lpf_horizontal_14_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2,
+ &aom_highbd_lpf_vertical_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2,
+ &aom_highbd_lpf_vertical_6_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2,
+ &aom_highbd_lpf_vertical_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2,
+ &aom_highbd_lpf_vertical_14_dual_c, 12),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, Loop8Test9Param_hbd,
+ ::testing::ValuesIn(kHbdLoop8Test9));
+
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+const loop_param_t kLoop8Test6[] = {
+ make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8),
+ make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8),
+ make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8),
+ make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8),
+ make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
+ make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8),
+ make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8)
+};
+
+INSTANTIATE_TEST_CASE_P(NEON, Loop8Test6Param_lbd,
+ ::testing::ValuesIn(kLoop8Test6));
+#endif // HAVE_NEON
+
+#if HAVE_AVX2
+const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = {
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2,
+ &aom_highbd_lpf_horizontal_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2,
+ &aom_highbd_lpf_horizontal_8_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+ &aom_highbd_lpf_vertical_4_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+ &aom_highbd_lpf_vertical_4_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2,
+ &aom_highbd_lpf_vertical_4_dual_c, 12),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+ &aom_highbd_lpf_vertical_8_dual_c, 8),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+ &aom_highbd_lpf_vertical_8_dual_c, 10),
+ make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2,
+ &aom_highbd_lpf_vertical_8_dual_c, 12),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, Loop8Test9Param_hbd,
+ ::testing::ValuesIn(kHbdLoop8Test9Avx2));
+#endif
+} // namespace
diff --git a/third_party/aom/test/masked_sad_test.cc b/third_party/aom/test/masked_sad_test.cc
new file mode 100644
index 000000000..311f1877d
--- /dev/null
+++ b/third_party/aom/test/masked_sad_test.cc
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 200;
+
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
+typedef ::testing::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
+
+class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
+ public:
+ virtual ~MaskedSADTest() {}
+ virtual void SetUp() {
+ maskedSAD_op_ = GET_PARAM(0);
+ ref_maskedSAD_op_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+ void runMaskedSADTest(int run_times);
+
+ protected:
+ MaskedSADFunc maskedSAD_op_;
+ MaskedSADFunc ref_maskedSAD_op_;
+};
+void MaskedSADTest::runMaskedSADTest(int run_times) {
+ unsigned int ref_ret = 0, ret = 1;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+ const int iters = run_times == 1 ? number_of_iterations : 1;
+ for (int i = 0; i < iters; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ src_ptr[j] = rnd.Rand8();
+ ref_ptr[j] = rnd.Rand8();
+ second_pred_ptr[j] = rnd.Rand8();
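+ // Mask values must stay in [0, 64]: pick 64 about half the time, otherwise
+ // a random value in [0, 63].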
+ msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
+ assert(msk_ptr[j] <= 64);
+ }
+
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int repeat = 0; repeat < run_times; ++repeat) {
+ ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+ second_pred_ptr, msk_ptr, msk_stride,
+ invert_mask);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ if (run_times == 1) {
+ ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ second_pred_ptr, msk_ptr,
+ msk_stride, invert_mask));
+ } else {
+ for (int repeat = 0; repeat < run_times; ++repeat) {
+ ret =
+ maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+ second_pred_ptr, msk_ptr, msk_stride, invert_mask);
+ }
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("%7.2f/%7.2fns", time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ if (ret != ref_ret) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+ }
+ EXPECT_EQ(0, err_count) << "Error: Masked SAD Test, output doesn't match. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(MaskedSADTest, OperationCheck) { runMaskedSADTest(1); }
+
+TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); }
+
+typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
+typedef ::testing::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
+ HighbdMaskedSADParam;
+
+class HighbdMaskedSADTest
+ : public ::testing::TestWithParam<HighbdMaskedSADParam> {
+ public:
+ virtual ~HighbdMaskedSADTest() {}
+ virtual void SetUp() {
+ maskedSAD_op_ = GET_PARAM(0);
+ ref_maskedSAD_op_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+ void runHighbdMaskedSADTest(int run_times);
+
+ protected:
+ HighbdMaskedSADFunc maskedSAD_op_;
+ HighbdMaskedSADFunc ref_maskedSAD_op_;
+};
+void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) {
+ unsigned int ref_ret = 0, ret = 1;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
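+ // The high bit-depth functions take uint8_t pointers, so wrap the
+ // uint16_t buffers with CONVERT_TO_BYTEPTR().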
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+ const int iters = run_times == 1 ? number_of_iterations : 1;
+ for (int i = 0; i < iters; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
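+ // Restrict samples to 12 bits, the highest bit depth supported by AV1.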
+ src_ptr[j] = rnd.Rand16() & 0xfff;
+ ref_ptr[j] = rnd.Rand16() & 0xfff;
+ second_pred_ptr[j] = rnd.Rand16() & 0xfff;
+ msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
+ }
+
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int repeat = 0; repeat < run_times; ++repeat) {
+ ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+ second_pred8_ptr, msk_ptr, msk_stride,
+ invert_mask);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ if (run_times == 1) {
+ ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+ ref8_ptr, ref_stride,
+ second_pred8_ptr, msk_ptr,
+ msk_stride, invert_mask));
+ } else {
+ for (int repeat = 0; repeat < run_times; ++repeat) {
+ ret =
+ maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+ second_pred8_ptr, msk_ptr, msk_stride, invert_mask);
+ }
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("%7.2f/%7.2fns", time1, time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ if (ret != ref_ret) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+ }
+ EXPECT_EQ(0, err_count)
+ << "Error: High BD Masked SAD Test, output doesn't match. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(HighbdMaskedSADTest, OperationCheck) { runHighbdMaskedSADTest(1); }
+
+TEST_P(HighbdMaskedSADTest, DISABLED_Speed) { runHighbdMaskedSADTest(1000000); }
+
+using ::testing::make_tuple;
+
+#if HAVE_SSSE3
+const MaskedSADParam msad_test[] = {
+ make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c),
+ make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
+ make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
+ make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
+ make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
+ make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
+ make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
+ make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
+ make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
+ make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
+ make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
+ make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
+ make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
+ make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
+ make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
+ make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
+ make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c),
+ make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c),
+ make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c),
+ make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c),
+ make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c),
+ make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
+
+const HighbdMaskedSADParam hbd_msad_test[] = {
+ make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c),
+ make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c),
+ make_tuple(&aom_highbd_masked_sad8x4_ssse3, &aom_highbd_masked_sad8x4_c),
+ make_tuple(&aom_highbd_masked_sad8x8_ssse3, &aom_highbd_masked_sad8x8_c),
+ make_tuple(&aom_highbd_masked_sad8x16_ssse3, &aom_highbd_masked_sad8x16_c),
+ make_tuple(&aom_highbd_masked_sad16x8_ssse3, &aom_highbd_masked_sad16x8_c),
+ make_tuple(&aom_highbd_masked_sad16x16_ssse3, &aom_highbd_masked_sad16x16_c),
+ make_tuple(&aom_highbd_masked_sad16x32_ssse3, &aom_highbd_masked_sad16x32_c),
+ make_tuple(&aom_highbd_masked_sad32x16_ssse3, &aom_highbd_masked_sad32x16_c),
+ make_tuple(&aom_highbd_masked_sad32x32_ssse3, &aom_highbd_masked_sad32x32_c),
+ make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c),
+ make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c),
+ make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c),
+ make_tuple(&aom_highbd_masked_sad64x128_ssse3,
+ &aom_highbd_masked_sad64x128_c),
+ make_tuple(&aom_highbd_masked_sad128x64_ssse3,
+ &aom_highbd_masked_sad128x64_c),
+ make_tuple(&aom_highbd_masked_sad128x128_ssse3,
+ &aom_highbd_masked_sad128x128_c),
+ make_tuple(&aom_highbd_masked_sad4x16_ssse3, &aom_highbd_masked_sad4x16_c),
+ make_tuple(&aom_highbd_masked_sad16x4_ssse3, &aom_highbd_masked_sad16x4_c),
+ make_tuple(&aom_highbd_masked_sad8x32_ssse3, &aom_highbd_masked_sad8x32_c),
+ make_tuple(&aom_highbd_masked_sad32x8_ssse3, &aom_highbd_masked_sad32x8_c),
+ make_tuple(&aom_highbd_masked_sad16x64_ssse3, &aom_highbd_masked_sad16x64_c),
+ make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, HighbdMaskedSADTest,
+ ::testing::ValuesIn(hbd_msad_test));
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+const MaskedSADParam msad_avx2_test[] = {
+ make_tuple(&aom_masked_sad4x4_avx2, &aom_masked_sad4x4_ssse3),
+ make_tuple(&aom_masked_sad4x8_avx2, &aom_masked_sad4x8_ssse3),
+ make_tuple(&aom_masked_sad8x4_avx2, &aom_masked_sad8x4_ssse3),
+ make_tuple(&aom_masked_sad8x8_avx2, &aom_masked_sad8x8_ssse3),
+ make_tuple(&aom_masked_sad8x16_avx2, &aom_masked_sad8x16_ssse3),
+ make_tuple(&aom_masked_sad16x8_avx2, &aom_masked_sad16x8_ssse3),
+ make_tuple(&aom_masked_sad16x16_avx2, &aom_masked_sad16x16_ssse3),
+ make_tuple(&aom_masked_sad16x32_avx2, &aom_masked_sad16x32_ssse3),
+ make_tuple(&aom_masked_sad32x16_avx2, &aom_masked_sad32x16_ssse3),
+ make_tuple(&aom_masked_sad32x32_avx2, &aom_masked_sad32x32_ssse3),
+ make_tuple(&aom_masked_sad32x64_avx2, &aom_masked_sad32x64_ssse3),
+ make_tuple(&aom_masked_sad64x32_avx2, &aom_masked_sad64x32_ssse3),
+ make_tuple(&aom_masked_sad64x64_avx2, &aom_masked_sad64x64_ssse3),
+ make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3),
+ make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3),
+ make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3),
+ make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3),
+ make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3),
+ make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3),
+ make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3),
+ make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3),
+ make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, MaskedSADTest,
+ ::testing::ValuesIn(msad_avx2_test));
+
+const HighbdMaskedSADParam hbd_msad_avx2_test[] = {
+ make_tuple(&aom_highbd_masked_sad4x4_avx2, &aom_highbd_masked_sad4x4_ssse3),
+ make_tuple(&aom_highbd_masked_sad4x8_avx2, &aom_highbd_masked_sad4x8_ssse3),
+ make_tuple(&aom_highbd_masked_sad8x4_avx2, &aom_highbd_masked_sad8x4_ssse3),
+ make_tuple(&aom_highbd_masked_sad8x8_avx2, &aom_highbd_masked_sad8x8_ssse3),
+ make_tuple(&aom_highbd_masked_sad8x16_avx2, &aom_highbd_masked_sad8x16_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x8_avx2, &aom_highbd_masked_sad16x8_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x16_avx2,
+ &aom_highbd_masked_sad16x16_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x32_avx2,
+ &aom_highbd_masked_sad16x32_ssse3),
+ make_tuple(&aom_highbd_masked_sad32x16_avx2,
+ &aom_highbd_masked_sad32x16_ssse3),
+ make_tuple(&aom_highbd_masked_sad32x32_avx2,
+ &aom_highbd_masked_sad32x32_ssse3),
+ make_tuple(&aom_highbd_masked_sad32x64_avx2,
+ &aom_highbd_masked_sad32x64_ssse3),
+ make_tuple(&aom_highbd_masked_sad64x32_avx2,
+ &aom_highbd_masked_sad64x32_ssse3),
+ make_tuple(&aom_highbd_masked_sad64x64_avx2,
+ &aom_highbd_masked_sad64x64_ssse3),
+ make_tuple(&aom_highbd_masked_sad64x128_avx2,
+ &aom_highbd_masked_sad64x128_ssse3),
+ make_tuple(&aom_highbd_masked_sad128x64_avx2,
+ &aom_highbd_masked_sad128x64_ssse3),
+ make_tuple(&aom_highbd_masked_sad128x128_avx2,
+ &aom_highbd_masked_sad128x128_ssse3),
+ make_tuple(&aom_highbd_masked_sad4x16_avx2, &aom_highbd_masked_sad4x16_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x4_avx2, &aom_highbd_masked_sad16x4_ssse3),
+ make_tuple(&aom_highbd_masked_sad8x32_avx2, &aom_highbd_masked_sad8x32_ssse3),
+ make_tuple(&aom_highbd_masked_sad32x8_avx2, &aom_highbd_masked_sad32x8_ssse3),
+ make_tuple(&aom_highbd_masked_sad16x64_avx2,
+ &aom_highbd_masked_sad16x64_ssse3),
+ make_tuple(&aom_highbd_masked_sad64x16_avx2,
+ &aom_highbd_masked_sad64x16_ssse3)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, HighbdMaskedSADTest,
+ ::testing::ValuesIn(hbd_msad_avx2_test));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/masked_variance_test.cc b/third_party/aom/test/masked_variance_test.cc
new file mode 100644
index 000000000..275b9feb6
--- /dev/null
+++ b/third_party/aom/test/masked_variance_test.cc
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 200;
+
+typedef unsigned int (*MaskedSubPixelVarianceFunc)(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+
+typedef ::testing::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
+ MaskedSubPixelVarianceParam;
+
+class MaskedSubPixelVarianceTest
+ : public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
+ public:
+ virtual ~MaskedSubPixelVarianceTest() {}
+ virtual void SetUp() {
+ opt_func_ = GET_PARAM(0);
+ ref_func_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ MaskedSubPixelVarianceFunc opt_func_;
+ MaskedSubPixelVarianceFunc ref_func_;
+};
+
+TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ // Note: We pad out the input array to a multiple of 16 bytes wide, so that
+ // consecutive rows keep the 16-byte alignment.
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+ DECLARE_ALIGNED(16, uint8_t,
+ second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = (MAX_SB_SIZE + 16);
+ int ref_stride = (MAX_SB_SIZE + 16);
+ int msk_stride = (MAX_SB_SIZE + 16);
+ int xoffset;
+ int yoffset;
+
+ for (int i = 0; i < number_of_iterations; ++i) {
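+ // Exercise a zero offset, the half-pel offset (4) and a random sub-pel
+ // offset in each dimension.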
+ int xoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
+ int yoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
+ for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16); j++) {
+ src_ptr[j] = rnd.Rand8();
+ ref_ptr[j] = rnd.Rand8();
+ second_pred_ptr[j] = rnd.Rand8();
+ msk_ptr[j] = rnd(65);
+ }
+ for (int k = 0; k < 3; k++) {
+ for (int l = 0; l < 3; l++) {
+ xoffset = xoffsets[k];
+ yoffset = yoffsets[l];
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+ ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+ invert_mask, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+ ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
+ msk_stride, invert_mask, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, err_count)
+ << "Error: Masked Sub Pixel Variance Test OperationCheck,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+ DECLARE_ALIGNED(16, uint8_t,
+ second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]);
+ int first_failure_x = -1;
+ int first_failure_y = -1;
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = (MAX_SB_SIZE + 16);
+ int ref_stride = (MAX_SB_SIZE + 16);
+ int msk_stride = (MAX_SB_SIZE + 16);
+
+ for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+ for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+ for (int i = 0; i < 16; ++i) {
+ memset(src_ptr, (i & 0x1) ? 255 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+ memset(ref_ptr, (i & 0x2) ? 255 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+ memset(second_pred_ptr, (i & 0x4) ? 255 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+ memset(msk_ptr, (i & 0x8) ? 64 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+ ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+ invert_mask, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+ ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
+ msk_stride, invert_mask, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure
+ << " x_offset = " << first_failure_x
+ << " y_offset = " << first_failure_y;
+}
+
+typedef ::testing::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
+ aom_bit_depth_t>
+ HighbdMaskedSubPixelVarianceParam;
+
+class HighbdMaskedSubPixelVarianceTest
+ : public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
+ public:
+ virtual ~HighbdMaskedSubPixelVarianceTest() {}
+ virtual void SetUp() {
+ opt_func_ = GET_PARAM(0);
+ ref_func_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ MaskedSubPixelVarianceFunc opt_func_;
+ MaskedSubPixelVarianceFunc ref_func_;
+ aom_bit_depth_t bit_depth_;
+};
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+ DECLARE_ALIGNED(16, uint16_t,
+ second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
+ int err_count = 0;
+ int first_failure = -1;
+ int first_failure_x = -1;
+ int first_failure_y = -1;
+ int src_stride = (MAX_SB_SIZE + 8);
+ int ref_stride = (MAX_SB_SIZE + 8);
+ int msk_stride = (MAX_SB_SIZE + 8);
+ int xoffset, yoffset;
+
+ for (int i = 0; i < number_of_iterations; ++i) {
+ for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8); j++) {
+ src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+ ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+ second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+ msk_ptr[j] = rnd(65);
+ }
+ for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+ for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+ ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+ invert_mask, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+ ref8_ptr, ref_stride, second_pred8_ptr,
+ msk_ptr, msk_stride, invert_mask, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, err_count)
+ << "Error: Masked Sub Pixel Variance Test OperationCheck,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure
+ << " x_offset = " << first_failure_x << " y_offset = " << first_failure_y;
+}
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+ DECLARE_ALIGNED(16, uint16_t,
+ second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
+ int first_failure_x = -1;
+ int first_failure_y = -1;
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = (MAX_SB_SIZE + 8);
+ int ref_stride = (MAX_SB_SIZE + 8);
+ int msk_stride = (MAX_SB_SIZE + 8);
+
+ for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+ for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+ for (int i = 0; i < 16; ++i) {
+ aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
+ aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
+ aom_memset16(second_pred_ptr, (i & 0x4) ? ((1 << bit_depth_) - 1) : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
+ memset(msk_ptr, (i & 0x8) ? 64 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8));
+
+ for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+ ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+ ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+ invert_mask, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+ ref8_ptr, ref_stride, second_pred8_ptr,
+ msk_ptr, msk_stride, invert_mask, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure
+ << " x_offset = " << first_failure_x
+ << " y_offset = " << first_failure_y;
+}
+
+using ::testing::make_tuple;
+
+#if HAVE_SSSE3
+
+const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
+ make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3,
+ &aom_masked_sub_pixel_variance128x128_c),
+ make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3,
+ &aom_masked_sub_pixel_variance128x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3,
+ &aom_masked_sub_pixel_variance64x128_c),
+ make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3,
+ &aom_masked_sub_pixel_variance64x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3,
+ &aom_masked_sub_pixel_variance64x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x64_ssse3,
+ &aom_masked_sub_pixel_variance32x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x32_ssse3,
+ &aom_masked_sub_pixel_variance32x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x16_ssse3,
+ &aom_masked_sub_pixel_variance32x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x32_ssse3,
+ &aom_masked_sub_pixel_variance16x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x16_ssse3,
+ &aom_masked_sub_pixel_variance16x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x8_ssse3,
+ &aom_masked_sub_pixel_variance16x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x16_ssse3,
+ &aom_masked_sub_pixel_variance8x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x8_ssse3,
+ &aom_masked_sub_pixel_variance8x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x4_ssse3,
+ &aom_masked_sub_pixel_variance8x4_c),
+ make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3,
+ &aom_masked_sub_pixel_variance4x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
+ &aom_masked_sub_pixel_variance4x4_c)
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
+ ::testing::ValuesIn(sub_pel_var_test));
+
+const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = {
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance64x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance32x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance32x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance32x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance16x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance16x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance16x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance8x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance8x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance8x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_8_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance128x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance128x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance128x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance128x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12)
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+ ::testing::ValuesIn(hbd_sub_pel_var_test));
+#endif // HAVE_SSSE3
+} // namespace
diff --git a/third_party/aom/test/md5_helper.h b/third_party/aom/test/md5_helper.h
new file mode 100644
index 000000000..9443cb262
--- /dev/null
+++ b/third_party/aom/test/md5_helper.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_MD5_HELPER_H_
+#define AOM_TEST_MD5_HELPER_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+
+namespace libaom_test {
+class MD5 {
+ public:
+ MD5() { MD5Init(&md5_); }
+
+ void Add(const aom_image_t *img) {
+ for (int plane = 0; plane < 3; ++plane) {
+ const uint8_t *buf = img->planes[plane];
+ // Calculate the width and height to do the md5 check. For the chroma
+ // plane, we never want to round down and thus skip a pixel so if
+ // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
+ // This works only for chroma_shift of 0 and 1.
+ const int bytes_per_sample =
+ (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+ const int h =
+ plane ? (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift
+ : img->d_h;
+ const int w =
+ (plane ? (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift
+ : img->d_w) *
+ bytes_per_sample;
+
+ for (int y = 0; y < h; ++y) {
+ MD5Update(&md5_, buf, w);
+ buf += img->stride[plane];
+ }
+ }
+ }
+
+ void Add(const uint8_t *data, size_t size) {
+ MD5Update(&md5_, data, static_cast<uint32_t>(size));
+ }
+
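+ // Finalizes a copy of the running context, so the digest can be read
+ // without preventing further calls to Add().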
+ const char *Get(void) {
+ static const char hex[16] = {
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+ };
+ uint8_t tmp[16];
+ MD5Context ctx_tmp = md5_;
+
+ MD5Final(tmp, &ctx_tmp);
+ for (int i = 0; i < 16; i++) {
+ res_[i * 2 + 0] = hex[tmp[i] >> 4];
+ res_[i * 2 + 1] = hex[tmp[i] & 0xf];
+ }
+ res_[32] = 0;
+
+ return res_;
+ }
+
+ protected:
+ char res_[33];
+ MD5Context md5_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_MD5_HELPER_H_
diff --git a/third_party/aom/test/metrics_template.html b/third_party/aom/test/metrics_template.html
new file mode 100644
index 000000000..b57c62314
--- /dev/null
+++ b/third_party/aom/test/metrics_template.html
@@ -0,0 +1,422 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Video Codec Test Results</title>
+<style type="text/css">
+<!-- Begin 960 reset -->
+a,abbr,acronym,address,applet,article,aside,audio,b,big,blockquote,body,canvas,caption,center,cite,c
+ode,dd,del,details,dfn,dialog,div,dl,dt,em,embed,fieldset,figcaption,figure,font,footer,form,h1,h2,h
+3,h4,h5,h6,header,hgroup,hr,html,i,iframe,img,ins,kbd,label,legend,li,mark,menu,meter,nav,object,ol,
+output,p,pre,progress,q,rp,rt,ruby,s,samp,section,small,span,strike,strong,sub,summary,sup,table,tbo
+dy,td,tfoot,th,thead,time,tr,tt,u,ul,var,video,xmp{border:0;margin:0;padding:0;font-size:100%}html,b
+ody{height:100%}article,aside,details,figcaption,figure,footer,header,hgroup,menu,nav,section{displa
+y:block}b,strong{font-weight:bold}img{color:transparent;font-size:0;vertical-align:middle;-ms-interp
+olation-mode:bicubic}ol,ul{list-style:none}li{display:list-item}table{border-collapse:collapse;borde
+r-spacing:0}th,td,caption{font-weight:normal;vertical-align:top;text-align:left}q{quotes:none}q:befo
+re,q:after{content:'';content:none}sub,sup,small{font-size:75%}sub,sup{line-height:0;position:relati
+ve;vertical-align:baseline}sub{bottom:-0.25em}sup{top:-0.5em}svg{overflow:hidden}
+<!-- End 960 reset -->
+<!-- Begin 960 text -->
+body{font:13px/1.5 'Helvetica Neue',Arial,'Liberation Sans',FreeSans,sans-serif}pre,code{font-family
+:'DejaVu Sans Mono',Menlo,Consolas,monospace}hr{border:0 #ccc solid;border-top-width:1px;clear:both;
+height:0}h1{font-size:25px}h2{font-size:23px}h3{font-size:21px}h4{font-size:19px}h5{font-size:17px}h
+6{font-size:15px}ol{list-style:decimal}ul{list-style:disc}li{margin-left:30px}p,dl,hr,h1,h2,h3,h4,h5
+,h6,ol,ul,pre,table,address,fieldset,figure{margin-bottom:20px}
+<!-- End 960 text -->
+<!-- Begin 960 grid (fluid variant)
+ 12 columns, 1152px total width
+ http://960.gs/ | http://grids.heroku.com/ -->
+.container_12{width:92%;margin-left:4%;margin-right:4%}.grid_1,.grid_2,.grid_3,.grid_4,.grid_5,.grid
+_6,.grid_7,.grid_8,.grid_9,.grid_10,.grid_11,.grid_12{display:inline;float:left;position:relative;ma
+rgin-left:1%;margin-right:1%}.alpha{margin-left:0}.omega{margin-right:0}.container_12 .grid_1{width:
+6.333%}.container_12 .grid_2{width:14.667%}.container_12 .grid_3{width:23.0%}.container_12 .grid_4{w
+idth:31.333%}.container_12 .grid_5{width:39.667%}.container_12 .grid_6{width:48.0%}.container_12 .gr
+id_7{width:56.333%}.container_12 .grid_8{width:64.667%}.container_12 .grid_9{width:73.0%}.container_
+12 .grid_10{width:81.333%}.container_12 .grid_11{width:89.667%}.container_12 .grid_12{width:98.0%}.c
+ontainer_12 .prefix_1{padding-left:8.333%}.container_12 .prefix_2{padding-left:16.667%}.container_12
+ .prefix_3{padding-left:25.0%}.container_12 .prefix_4{padding-left:33.333%}.container_12 .prefix_5{p
+adding-left:41.667%}.container_12 .prefix_6{padding-left:50.0%}.container_12 .prefix_7{padding-left:
+58.333%}.container_12 .prefix_8{padding-left:66.667%}.container_12 .prefix_9{padding-left:75.0%}.con
+tainer_12 .prefix_10{padding-left:83.333%}.container_12 .prefix_11{padding-left:91.667%}.container_1
+2 .suffix_1{padding-right:8.333%}.container_12 .suffix_2{padding-right:16.667%}.container_12 .suffix
+_3{padding-right:25.0%}.container_12 .suffix_4{padding-right:33.333%}.container_12 .suffix_5{padding
+-right:41.667%}.container_12 .suffix_6{padding-right:50.0%}.container_12 .suffix_7{padding-right:58.
+333%}.container_12 .suffix_8{padding-right:66.667%}.container_12 .suffix_9{padding-right:75.0%}.cont
+ainer_12 .suffix_10{padding-right:83.333%}.container_12 .suffix_11{padding-right:91.667%}.container_
+12 .push_1{left:8.333%}.container_12 .push_2{left:16.667%}.container_12 .push_3{left:25.0%}.containe
+r_12 .push_4{left:33.333%}.container_12 .push_5{left:41.667%}.container_12 .push_6{left:50.0%}.conta
+iner_12 .push_7{left:58.333%}.container_12 .push_8{left:66.667%}.container_12 .push_9{left:75.0%}.co
+ntainer_12 .push_10{left:83.333%}.container_12 .push_11{left:91.667%}.container_12 .pull_1{left:-8.3
+33%}.container_12 .pull_2{left:-16.667%}.container_12 .pull_3{left:-25.0%}.container_12 .pull_4{left
+:-33.333%}.container_12 .pull_5{left:-41.667%}.container_12 .pull_6{left:-50.0%}.container_12 .pull_
+7{left:-58.333%}.container_12 .pull_8{left:-66.667%}.container_12 .pull_9{left:-75.0%}.container_12
+.pull_10{left:-83.333%}.container_12 .pull_11{left:-91.667%}.clear{clear:both;display:block;overflow
+:hidden;visibility:hidden;width:0;height:0}.clearfix:after{clear:both;content:' ';display:block;font
+-size:0;line-height:0;visibility:hidden;width:0;height:0}.clearfix{display:inline-block}* html .clea
+rfix{height:1%}.clearfix{display:block}
+<!-- End 960 grid -->
+
+div.metricgraph {
+
+}
+
+body {
+
+}
+
+div.header {
+ font-family: Arial, sans-serif;
+}
+
+div.header h2 {
+ margin: .5em auto;
+}
+
+div.radio {
+ font-family: Arial, sans-serif;
+ margin-bottom: 1em;
+}
+
+div.main {
+
+}
+
+div.cliplist {
+ font-family: Arial, sans-serif;
+ margin-top: 6px;
+}
+
+div.chartarea {
+ font-family: Arial, sans-serif;
+}
+
+div.indicators {
+ font-family: Arial, sans-serif;
+ font-size: 13px;
+ margin-top: 6px;
+ min-height: 600px;
+ background-color: #f7f7f7;
+}
+
+div.indicators div.content {
+ margin: 1em;
+}
+
+div.indicators div.content h5 {
+ font-size: 13px;
+ text-align: center;
+ margin: 0;
+}
+
+div.indicators div.content ul {
+ margin-left: 0;
+ padding-left: 0;
+ margin-top: 0;
+}
+
+div.indicators div.content ul li {
+ margin-left: 1.5em;
+}
+
+div.indicators div.content p:first-child {
+ margin-bottom: .5em;
+}
+
+span.google-visualization-table-sortind {
+ color: #000;
+}
+.header-style {
+ font-weight: bold;
+ border: 1px solid #fff;
+ background-color: #ccc;
+}
+
+td.header-style+td {
+
+}
+
+.orange-background {
+ background-color: orange;
+}
+
+.light-gray-background {
+ background-color: #f0f0f0;
+}
+</style>
+<script type="text/javascript" src="https://www.google.com/jsapi"></script>
+<script type="text/javascript">
+var chart_left = 40;
+var chart_top = 6;
+var chart_height = document.documentElement.clientHeight-100;
+var chart_width = "100%";
+ftable='filestable_avg'
+var snrs = [];
+var filestable_dsnr = [];
+var filestable_drate = [];
+var filestable_avg = [];
+
+// Python template code replaces the following placeholder lines.
+//%%metrics_js%%//
+//%%filestable_dpsnr%%//
+//%%filestable_avg%%//
+//%%filestable_drate%%//
+//%%snrs%%//
+
+var selected = 0
+var imagestr = '';
+var bettertable=0;
+var chart=0;
+var better=0;
+var metricdata=0;
+var metricView=0;
+var column=1;
+var formatter=0;
+
+function changeColumn(col) {
+ column = col;
+ console.log(col)
+ draw_files();
+}
+
+function changeMetric(m) {
+ ftable=m
+ draw_files()
+}
+
+function setup_vis() {
+ chart = new google.visualization.ScatterChart(
+ document.getElementById("metricgraph"));
+
+ bettertable = new google.visualization.Table(
+ document.getElementById("bettertable"));
+
+ draw_files();
+ build_metrics_radio();
+}
+
+function build_metrics_radio() {
+ for (metric=1; metric < metrics.length; metric++) {
+ var rb = document.createElement('input');
+ var l = document.createElement('label');
+ rb.setAttribute('type','radio');
+ rb.setAttribute('name','metric');
+ rb.setAttribute('onClick', "changeColumn('"+metric.toString()+"')");
+ l.innerHTML = metrics[metric];
+ document.getElementById('metrics').appendChild(rb);
+ document.getElementById('metrics').appendChild(l);
+ }
+}
+
+function draw_files() {
+ var options = {'allowHtml': true, 'width': "100%", 'height': "50%"};
+ if (better != 0) delete better;
+
+ col=eval(ftable+'[column]')
+ better = new google.visualization.DataTable(col)
+
+ // Python Template code replaces the following line with a list of
+ // formatters.
+ if (ftable == 'filestable_dsnr')
+ formatter = new google.visualization.NumberFormat(
+ {fractionDigits: 4, suffix:" db"});
+ else
+ formatter = new google.visualization.NumberFormat(
+ {fractionDigits: 4, suffix:"%"});
+
+ //%%formatters%%//
+
+ bettertable.draw(better,options);
+ google.visualization.events.addListener(bettertable, 'select',
+ selectBetterHandler);
+ query_file()
+}
+
+function query_file() {
+ imagestr = better.getFormattedValue(selected, 0)
+ var metricjson = eval('(' + snrs[column][selected] + ')');
+ metricdata = new google.visualization.DataTable(metricjson, 0.6);
+ if( metricView != 0 ) delete metricView;
+ metricView = new google.visualization.DataView(metricdata);
+
+ chart.draw(metricView, {curveType:'function',
+ explorer: {},
+ chartArea:{left:chart_left, top:chart_top, width:chart_width,
+ height:chart_height-90},
+ hAxis:{title:"Datarate in kbps"},
+ vAxis:{title:"Quality in decibels", format: '##.0', textPosition: 'in'},
+ legend:{position:"in"}, title:imagestr, pointSize:2, lineWidth:1,
+ width:chart_width, height:chart_height-50 });
+
+ google.visualization.events.addListener(chart, 'select', chartSelect);
+ google.visualization.events.addListener(chart, 'onmouseover', chartMouseOver);
+ google.visualization.events.addListener(chart, 'onmouseout', chartMouseOut);
+}
+
+function chartMouseOut(e) {
+ statusbar = document.getElementById('status');
+ statusbar.style.display = 'none';
+}
+
+function chartMouseOver(e) {
+ pointDifference(e.row, e.column)
+}
+
+function pointDifference(row, col) {
+ if(!row || !col)
+ return;
+
+ var cols = metricdata.getNumberOfColumns();
+ var rows = metricdata.getNumberOfRows();
+
+ var sel_bitrate = metricView.getValue(row, 0 );
+ var sel_metric = metricView.getValue(row, col);
+
+ var message = '<ul>' + metricView.getColumnLabel(col) +
+ ' (' + sel_bitrate.toFixed(0) + ' kbps, ' + sel_metric.toFixed(2) + ')' + ' is ';
+
+
+ // col 0 is datarate
+ for( var i=1;i<cols;++i) {
+
+ var metric_greatest_thats_less = 0;
+ var rate_greatest_thats_less = 0;
+ var metric_smallest_thats_greater = 999;
+ var rate_smallest_thats_greater = 0;
+
+ if(i==col)
+ continue;
+
+ // Find the lowest metric for the column that's greater than sel_metric and
+ // the highest metric for this column that's less than the metric.
+ for(var line_count = 0; line_count < rows; ++line_count) {
+ this_metric = metricdata.getValue(line_count, i)
+ this_rate = metricdata.getValue(line_count, 0)
+ if(!this_metric)
+ continue;
+
+ if(this_metric > metric_greatest_thats_less &&
+ this_metric <= sel_metric) {
+ metric_greatest_thats_less = this_metric;
+ rate_greatest_thats_less = this_rate;
+ }
+ if(this_metric < metric_smallest_thats_greater &&
+ this_metric > sel_metric) {
+ metric_smallest_thats_greater = this_metric;
+ rate_smallest_thats_greater = this_rate;
+ }
+ }
+
+ if(rate_smallest_thats_greater == 0 || rate_greatest_thats_less == 0) {
+ message = message + " <li> Couldn't find a point on both sides.</li>"
+ } else {
+ metric_slope = ( rate_smallest_thats_greater - rate_greatest_thats_less) /
+ ( metric_smallest_thats_greater - metric_greatest_thats_less);
+
+ projected_rate = ( sel_metric - metric_greatest_thats_less) *
+ metric_slope + rate_greatest_thats_less;
+
+ difference = 100 * (projected_rate / sel_bitrate - 1);
+
+
+ if (difference > 0)
+ message = message + "<li> " + difference.toFixed(2) +
+ "% smaller than <em>" +
+ metricdata.getColumnLabel(i) + "</em></li> "
+ else
+ message = message + "<li> " + -difference.toFixed(2) +
+ "% bigger than <em>" +
+ metricdata.getColumnLabel(i) + "</em></li> "
+ }
+
+ }
+ message = message + "</ul>"
+ statusbar = document.getElementById('status');
+ statusbar.innerHTML = "<p>" + message + "</p>";
+ statusbar.style.display = 'block';
+}
+
+function chartSelect() {
+ var selection = chart.getSelection();
+ var message = '';
+ var min = metricView.getFormattedValue(selection[0].row, 0);
+ var max = metricView.getFormattedValue(selection[selection.length-1].row, 0);
+ var val = metricView.getFormattedValue(selection[0].row,selection[0].column);
+
+ pointDifference(selection[0].row, selection[0].column)
+ min = min / 3
+ max = max * 3
+ metricView.setRows(metricdata.getFilteredRows(
+ [{column: 0,minValue: min, maxValue:max}]));
+
+ chart.draw(metricView, {curveType:'function',
+ chartArea:{left:40, top:10, width:chart_width, height:chart_height - 110},
+ hAxis:{title:"datarate in kbps"}, vAxis:{title:"quality in decibels"},
+ legend:{position:"in"}, title:imagestr, pointSize:2, lineWidth:1,
+ width:chart_width, height:chart_height - 50});
+}
+
+function selectBetterHandler() {
+ var selection = bettertable.getSelection();
+ for (var i = 0; i < selection.length; i++) {
+ item = selection[i];
+ }
+ selected = item.row
+ query_file()
+}
+
+
+google.load('visualization', '1', {'packages' : ['corechart','table']});
+google.setOnLoadCallback(setup_vis);
+</script>
+</head>
+
+<body>
+
+ <div class="container_12">
+
+ <div class="grid_12 header">
+ <h2>Codec Comparison Results</h2>
+ </div>
+
+ <div class="grid_12 radio">
+
+ <form name="myform">
+ Method For Combining Points
+ <input type="radio" checked name="column" value="1"
+ onClick="changeMetric('filestable_avg')" />Average of bitrates difference
+ <input type="radio" name="column" value="2"
+ onClick="changeMetric('filestable_dsnr')" />BDSNR
+ <input type="radio" name="column" value="3"
+ onClick="changeMetric('filestable_drate')" />BDRATE
+ </form>
+
+ <form id="metrics" name="myform">
+ </form>
+
+ </div>
+
+ <div class="grid_12 main">
+
+ <div class="grid_5 alpha cliplist">
+ <div id="bettertable"></div>
+ </div>
+
+ <div class="grid_5 chartarea">
+ <div id="metricgraph"></div>
+ </div>
+
+ <div class="grid_2 omega indicators">
+ <div class="content">
+ <h5>Indicators</h5>
+ <hr>
+ <div id="status"></div>
+ </div>
+ </div>
+
+ </div>
+
+ </div>
+
+</body>
+</html>
diff --git a/third_party/aom/test/monochrome_test.cc b/third_party/aom/test/monochrome_test.cc
new file mode 100644
index 000000000..ebccba584
--- /dev/null
+++ b/third_party/aom/test/monochrome_test.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
+#include "test/util.h"
+
+namespace {
+
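+// Encodes a clip with monochrome enabled and checks that every decoded frame
+// has constant chroma planes, the monochrome flag set, and stable Y PSNR.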
+class MonochromeTest
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ MonochromeTest() : EncoderTest(GET_PARAM(0)), frame0_psnr_y_(0.) {}
+
+ virtual ~MonochromeTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) {
+ (void)pts;
+
+ // Get value of top-left corner pixel of U plane
+ int chroma_value = img.planes[AOM_PLANE_U][0];
+
+ bool is_chroma_constant =
+ ComparePlaneToValue(img, AOM_PLANE_U, chroma_value) &&
+ ComparePlaneToValue(img, AOM_PLANE_V, chroma_value);
+
+ // Chroma planes should be constant
+ EXPECT_TRUE(is_chroma_constant);
+
+ // Monochrome flag on image should be set
+ EXPECT_EQ(img.monochrome, 1);
+
+ chroma_value_list_.push_back(chroma_value);
+ }
+
+ // Returns true if all pixels on the plane are equal to value, and returns
+ // false otherwise.
+ bool ComparePlaneToValue(const aom_image_t &img, const int plane,
+ const int value) {
+ const int w = aom_img_plane_width(&img, plane);
+ const int h = aom_img_plane_height(&img, plane);
+ const uint8_t *const buf = img.planes[plane];
+ const int stride = img.stride[plane];
+
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < w; ++c) {
+ if (buf[r * stride + c] != value) return false;
+ }
+ }
+ return true;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ // Check that the initial Y PSNR value is 'high enough', and check that
+ // subsequent Y PSNR values are 'close' to this initial value.
+ if (frame0_psnr_y_ == 0.) {
+ frame0_psnr_y_ = pkt->data.psnr.psnr[1];
+ EXPECT_GT(frame0_psnr_y_, 29.);
+ }
+ EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, 2.5);
+ }
+
+ std::vector<int> chroma_value_list_;
+ double frame0_psnr_y_;
+};
+
+TEST_P(MonochromeTest, TestMonochromeEncoding) {
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 5);
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ cfg_.g_w = 352;
+ cfg_.g_h = 288;
+
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.kf_mode = AOM_KF_AUTO;
+ cfg_.g_lag_in_frames = 1;
+ cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+ // Enable dropped frames.
+ cfg_.rc_dropframe_thresh = 1;
+ // Disable error_resilience mode.
+ cfg_.g_error_resilient = 0;
+ // Run at low bitrate.
+ cfg_.rc_target_bitrate = 40;
+ // Set monochrome encoding flag
+ cfg_.monochrome = 1;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Check that the chroma planes are equal across all frames
+ std::vector<int>::const_iterator iter = chroma_value_list_.begin();
+ int initial_chroma_value = *iter;
+ for (; iter != chroma_value_list_.end(); ++iter) {
+ // Check that all decoded frames have the same constant chroma planes.
+ EXPECT_EQ(*iter, initial_chroma_value);
+ }
+}
+
+AV1_INSTANTIATE_TEST_CASE(MonochromeTest,
+ ::testing::Values(::libaom_test::kTwoPassGood));
+
+} // namespace
diff --git a/third_party/aom/test/motion_vector_test.cc b/third_party/aom/test/motion_vector_test.cc
new file mode 100644
index 000000000..27eb93893
--- /dev/null
+++ b/third_party/aom/test/motion_vector_test.cc
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+#define MAX_EXTREME_MV 1
+#define MIN_EXTREME_MV 2
+
+// Encoding modes
+const libaom_test::TestMode kEncodingModeVectors[] = {
+ ::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood,
+};
+
+// Encoding speeds
+const int kCpuUsedVectors[] = { 1, 5 };
+
+// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV.
+const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV };
+
+class MotionVectorTestLarge
+ : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+ int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ MotionVectorTestLarge()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
+
+ virtual ~MotionVectorTestLarge() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 3;
+ cfg_.rc_end_usage = AOM_VBR;
+ } else {
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ }
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+ }
+
+ libaom_test::TestMode encoding_mode_;
+ int cpu_used_;
+ int mv_test_mode_;
+};
+
+TEST_P(MotionVectorTestLarge, OverallTest) {
+ int width = 3840;
+ int height = 2160;
+
+ // Reduce the test clip's resolution when testing on a 32-bit system.
+ if (sizeof(void *) == 4) {
+ width = 2048;
+ height = 360;
+ }
+
+ cfg_.rc_target_bitrate = 24000;
+ cfg_.g_profile = 0;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ testing::internal::scoped_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::YUVVideoSource(
+ "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, width, height, 30, 1, 0, 3));
+
+ ASSERT_TRUE(video.get() != NULL);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+AV1_INSTANTIATE_TEST_CASE(MotionVectorTestLarge,
+ ::testing::ValuesIn(kEncodingModeVectors),
+ ::testing::ValuesIn(kCpuUsedVectors),
+ ::testing::ValuesIn(kMVTestModes));
+} // namespace
diff --git a/third_party/aom/test/noise_model_test.cc b/third_party/aom/test/noise_model_test.cc
new file mode 100644
index 000000000..b5b387e31
--- /dev/null
+++ b/third_party/aom/test/noise_model_test.cc
@@ -0,0 +1,1343 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <algorithm>
+#include <vector>
+
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Return normally distributed values with standard deviation of sigma.
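+// Uses the Marsaglia polar method: sample (u, v) uniformly in the unit disc
+// and transform.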
+double randn(libaom_test::ACMRandom *random, double sigma) {
+ while (1) {
+ const double u = 2.0 * ((double)random->Rand31() /
+ testing::internal::Random::kMaxRange) -
+ 1.0;
+ const double v = 2.0 * ((double)random->Rand31() /
+ testing::internal::Random::kMaxRange) -
+ 1.0;
+ const double s = u * u + v * v;
+ if (s > 0 && s < 1) {
+ return sigma * (u * sqrt(-2.0 * log(s) / s));
+ }
+ }
+ return 0;
+}
+
+// Synthesizes noise using the auto-regressive filter of the given lag,
+// with the provided n coefficients sampled at the given coords.
+void noise_synth(libaom_test::ACMRandom *random, int lag, int n,
+ const int (*coords)[2], const double *coeffs, double *data,
+ int w, int h) {
+ const int pad_size = 3 * lag;
+ const int padded_w = w + pad_size;
+ const int padded_h = h + pad_size;
+ int x = 0, y = 0;
+ std::vector<double> padded(padded_w * padded_h);
+
+ for (y = 0; y < padded_h; ++y) {
+ for (x = 0; x < padded_w; ++x) {
+ padded[y * padded_w + x] = randn(random, 1.0);
+ }
+ }
+ for (y = lag; y < padded_h; ++y) {
+ for (x = lag; x < padded_w; ++x) {
+ double sum = 0;
+ int i = 0;
+ for (i = 0; i < n; ++i) {
+ const int dx = coords[i][0];
+ const int dy = coords[i][1];
+ sum += padded[(y + dy) * padded_w + (x + dx)] * coeffs[i];
+ }
+ padded[y * padded_w + x] += sum;
+ }
+ }
+ // Copy over the padded rows to the output
+ for (y = 0; y < h; ++y) {
+ memcpy(data + y * w, &padded[0] + y * padded_w, sizeof(*data) * w);
+ }
+}
+
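+// Estimates the noise power spectral density by averaging transform energy
+// over 50%-overlapping blocks, then mirroring the result to fill in the
+// symmetric half of the spectrum.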
+std::vector<float> get_noise_psd(double *noise, int width, int height,
+ int block_size) {
+ float *block =
+ (float *)aom_memalign(32, block_size * block_size * sizeof(float));
+ std::vector<float> psd(block_size * block_size);
+ int num_blocks = 0;
+ struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+ for (int y = 0; y <= height - block_size; y += block_size / 2) {
+ for (int x = 0; x <= width - block_size; x += block_size / 2) {
+ for (int yy = 0; yy < block_size; ++yy) {
+ for (int xx = 0; xx < block_size; ++xx) {
+ block[yy * block_size + xx] = (float)noise[(y + yy) * width + x + xx];
+ }
+ }
+ aom_noise_tx_forward(tx, &block[0]);
+ aom_noise_tx_add_energy(tx, &psd[0]);
+ num_blocks++;
+ }
+ }
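+  // Average the accumulated transform energy over all blocks to get the mean
+  // power spectral density.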
+ for (int yy = 0; yy < block_size; ++yy) {
+ for (int xx = 0; xx <= block_size / 2; ++xx) {
+ psd[yy * block_size + xx] /= num_blocks;
+ }
+ }
+ // Fill in the data that is missing due to symmetries
+ for (int xx = 1; xx < block_size / 2; ++xx) {
+ psd[(block_size - xx)] = psd[xx];
+ }
+ for (int yy = 1; yy < block_size; ++yy) {
+ for (int xx = 1; xx < block_size / 2; ++xx) {
+ psd[(block_size - yy) * block_size + (block_size - xx)] =
+ psd[yy * block_size + xx];
+ }
+ }
+ aom_noise_tx_free(tx);
+ aom_free(block);
+ return psd;
+}
+
+} // namespace
+
+TEST(NoiseStrengthSolver, GetCentersTwoBins) {
+ aom_noise_strength_solver_t solver;
+ aom_noise_strength_solver_init(&solver, 2, 8);
+ EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+ EXPECT_NEAR(255, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+ aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, GetCentersTwoBins10bit) {
+ aom_noise_strength_solver_t solver;
+ aom_noise_strength_solver_init(&solver, 2, 10);
+ EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+ EXPECT_NEAR(1023, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+ aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, GetCenters256Bins) {
+ const int num_bins = 256;
+ aom_noise_strength_solver_t solver;
+ aom_noise_strength_solver_init(&solver, num_bins, 8);
+
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_NEAR(i, aom_noise_strength_solver_get_center(&solver, i), 1e-5);
+ }
+ aom_noise_strength_solver_free(&solver);
+}
+
+// Tests that the noise strength solver returns the identity transform when
+// given identity-like constraints.
+TEST(NoiseStrengthSolver, ObserveIdentity) {
+ const int num_bins = 256;
+ aom_noise_strength_solver_t solver;
+ EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // We have to add a bit more strength to the constraints at the boundary to
+  // overcome any regularization.
+ for (int j = 0; j < 5; ++j) {
+ aom_noise_strength_solver_add_measurement(&solver, 0, 0);
+ aom_noise_strength_solver_add_measurement(&solver, 255, 255);
+ }
+ for (int i = 0; i < 256; ++i) {
+ aom_noise_strength_solver_add_measurement(&solver, i, i);
+ }
+ EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
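+  // Interior bins should closely track the identity mapping; the outermost
+  // bins are skipped since they are the most affected by regularization.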
+ for (int i = 2; i < num_bins - 2; ++i) {
+ EXPECT_NEAR(i, solver.eqns.x[i], 0.1);
+ }
+
+ aom_noise_strength_lut_t lut;
+ EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, 2, &lut));
+
+ ASSERT_EQ(2, lut.num_points);
+ EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+ EXPECT_NEAR(0.0, lut.points[0][1], 0.5);
+ EXPECT_NEAR(255.0, lut.points[1][0], 1e-5);
+ EXPECT_NEAR(255.0, lut.points[1][1], 0.5);
+
+ aom_noise_strength_lut_free(&lut);
+ aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, SimplifiesCurve) {
+ const int num_bins = 256;
+ aom_noise_strength_solver_t solver;
+ EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+ // Create a parabolic input
+ for (int i = 0; i < 256; ++i) {
+ const double x = (i - 127.5) / 63.5;
+ aom_noise_strength_solver_add_measurement(&solver, i, x * x);
+ }
+ EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+
+ // First try to fit an unconstrained lut
+ aom_noise_strength_lut_t lut;
+ EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, -1, &lut));
+ ASSERT_LE(20, lut.num_points);
+ aom_noise_strength_lut_free(&lut);
+
+ // Now constrain the maximum number of points
+ const int kMaxPoints = 9;
+ EXPECT_EQ(1,
+ aom_noise_strength_solver_fit_piecewise(&solver, kMaxPoints, &lut));
+ ASSERT_EQ(kMaxPoints, lut.num_points);
+
+ // Check that the input parabola is still well represented
+ EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+ EXPECT_NEAR(4.0, lut.points[0][1], 0.1);
+ for (int i = 1; i < lut.num_points - 1; ++i) {
+ const double x = (lut.points[i][0] - 128.) / 64.;
+ EXPECT_NEAR(x * x, lut.points[i][1], 0.1);
+ }
+ EXPECT_NEAR(255.0, lut.points[kMaxPoints - 1][0], 1e-5);
+
+ EXPECT_NEAR(4.0, lut.points[kMaxPoints - 1][1], 0.1);
+ aom_noise_strength_lut_free(&lut);
+ aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthLut, LutEvalSinglePoint) {
+ aom_noise_strength_lut_t lut;
+ ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 1));
+ ASSERT_EQ(1, lut.num_points);
+ lut.points[0][0] = 0;
+ lut.points[0][1] = 1;
+ EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, -1));
+ EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 0));
+ EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 1));
+ aom_noise_strength_lut_free(&lut);
+}
+
+TEST(NoiseStrengthLut, LutEvalMultiPointInterp) {
+ const double kEps = 1e-5;
+ aom_noise_strength_lut_t lut;
+ ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 4));
+ ASSERT_EQ(4, lut.num_points);
+
+ lut.points[0][0] = 0;
+ lut.points[0][1] = 0;
+
+ lut.points[1][0] = 1;
+ lut.points[1][1] = 1;
+
+ lut.points[2][0] = 2;
+ lut.points[2][1] = 1;
+
+ lut.points[3][0] = 100;
+ lut.points[3][1] = 1001;
+
+ // Test lower boundary
+ EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, -1));
+ EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, 0));
+
+ // Test first part that should be identity
+ EXPECT_NEAR(0.25, aom_noise_strength_lut_eval(&lut, 0.25), kEps);
+ EXPECT_NEAR(0.75, aom_noise_strength_lut_eval(&lut, 0.75), kEps);
+
+ // This is a constant section (should evaluate to 1)
+ EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.25), kEps);
+ EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.75), kEps);
+
+  // Test interpolation between two non-zero y coords.
+ EXPECT_NEAR(1, aom_noise_strength_lut_eval(&lut, 2), kEps);
+ EXPECT_NEAR(251, aom_noise_strength_lut_eval(&lut, 26.5), kEps);
+ EXPECT_NEAR(751, aom_noise_strength_lut_eval(&lut, 75.5), kEps);
+
+ // Test upper boundary
+ EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 100));
+ EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 101));
+
+ aom_noise_strength_lut_free(&lut);
+}
+
+TEST(NoiseModel, InitSuccessWithValidSquareShape) {
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8, 0 };
+ aom_noise_model_t model;
+
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+ const int kNumCoords = 12;
+ const int kCoords[][2] = { { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 },
+ { 2, -2 }, { -2, -1 }, { -1, -1 }, { 0, -1 },
+ { 1, -1 }, { 2, -1 }, { -2, 0 }, { -1, 0 } };
+ EXPECT_EQ(kNumCoords, model.n);
+ for (int i = 0; i < kNumCoords; ++i) {
+ const int *coord = kCoords[i];
+ EXPECT_EQ(coord[0], model.coords[i][0]);
+ EXPECT_EQ(coord[1], model.coords[i][1]);
+ }
+ aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitSuccessWithValidDiamondShape) {
+ aom_noise_model_t model;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_DIAMOND, 2, 8, 0 };
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+ EXPECT_EQ(6, model.n);
+ const int kNumCoords = 6;
+ const int kCoords[][2] = { { 0, -2 }, { -1, -1 }, { 0, -1 },
+ { 1, -1 }, { -2, 0 }, { -1, 0 } };
+ EXPECT_EQ(kNumCoords, model.n);
+ for (int i = 0; i < kNumCoords; ++i) {
+ const int *coord = kCoords[i];
+ EXPECT_EQ(coord[0], model.coords[i][0]);
+ EXPECT_EQ(coord[1], model.coords[i][1]);
+ }
+ aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithTooLargeLag) {
+ aom_noise_model_t model;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 10, 8, 0 };
+ EXPECT_FALSE(aom_noise_model_init(&model, params));
+ aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithTooSmallLag) {
+ aom_noise_model_t model;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 0, 8, 0 };
+ EXPECT_FALSE(aom_noise_model_init(&model, params));
+ aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithInvalidShape) {
+ aom_noise_model_t model;
+ aom_noise_model_params_t params = { aom_noise_shape(100), 3, 8, 0 };
+ EXPECT_FALSE(aom_noise_model_init(&model, params));
+ aom_noise_model_free(&model);
+}
+
+// A container template class to hold a data type and extra arguments.
+// All of these args are bundled into one struct so that we can use
+// parameterized tests on combinations of supported data types
+// (uint8_t and uint16_t) and bit depths (8, 10, 12).
+template <typename T, int bit_depth, bool use_highbd>
+struct BitDepthParams {
+ typedef T data_type_t;
+ static const int kBitDepth = bit_depth;
+ static const bool kUseHighBD = use_highbd;
+};
+
+template <typename T>
+class FlatBlockEstimatorTest : public ::testing::Test, public T {
+ public:
+ virtual void SetUp() { random_.Reset(171); }
+ typedef std::vector<typename T::data_type_t> VecType;
+ VecType data_;
+ libaom_test::ACMRandom random_;
+};
+
+TYPED_TEST_CASE_P(FlatBlockEstimatorTest);
+
+TYPED_TEST_P(FlatBlockEstimatorTest, ExtractBlock) {
+ const int kBlockSize = 16;
+ aom_flat_block_finder_t flat_block_finder;
+ ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+ this->kBitDepth, this->kUseHighBD));
+ const double normalization = flat_block_finder.normalization;
+
+ // Test with an image of more than one block.
+ const int h = 2 * kBlockSize;
+ const int w = 2 * kBlockSize;
+ const int stride = 2 * kBlockSize;
+ this->data_.resize(h * stride, 128);
+
+ // Set up the (0,0) block to be a plane and the (0,1) block to be a
+ // checkerboard
+ const int shift = this->kBitDepth - 8;
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize; ++x) {
+ this->data_[y * stride + x] = (-y + x + 128) << shift;
+ this->data_[y * stride + x + kBlockSize] =
+ ((x % 2 + y % 2) % 2 ? 128 - 20 : 128 + 20) << shift;
+ }
+ }
+ std::vector<double> block(kBlockSize * kBlockSize, 1);
+ std::vector<double> plane(kBlockSize * kBlockSize, 1);
+
+  // The block data should be constant (zero), with the planar trend captured
+  // in the plane data.
+ aom_flat_block_finder_extract_block(&flat_block_finder,
+ (uint8_t *)&this->data_[0], w, h, stride,
+ 0, 0, &plane[0], &block[0]);
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize; ++x) {
+ EXPECT_NEAR(0, block[y * kBlockSize + x], 1e-5);
+ EXPECT_NEAR((double)(this->data_[y * stride + x]) / normalization,
+ plane[y * kBlockSize + x], 1e-5);
+ }
+ }
+
+ // The plane trend is a constant, and the block is a zero mean checkerboard.
+ aom_flat_block_finder_extract_block(&flat_block_finder,
+ (uint8_t *)&this->data_[0], w, h, stride,
+ kBlockSize, 0, &plane[0], &block[0]);
+ const int mid = 128 << shift;
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize; ++x) {
+ EXPECT_NEAR(((double)this->data_[y * stride + x + kBlockSize] - mid) /
+ normalization,
+ block[y * kBlockSize + x], 1e-5);
+ EXPECT_NEAR(mid / normalization, plane[y * kBlockSize + x], 1e-5);
+ }
+ }
+ aom_flat_block_finder_free(&flat_block_finder);
+}
+
+TYPED_TEST_P(FlatBlockEstimatorTest, FindFlatBlocks) {
+ const int kBlockSize = 32;
+ aom_flat_block_finder_t flat_block_finder;
+ ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+ this->kBitDepth, this->kUseHighBD));
+
+ const int num_blocks_w = 8;
+ const int h = kBlockSize;
+ const int w = kBlockSize * num_blocks_w;
+ const int stride = w;
+ this->data_.resize(h * stride, 128);
+ std::vector<uint8_t> flat_blocks(num_blocks_w, 0);
+
+ const int shift = this->kBitDepth - 8;
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize; ++x) {
+ // Block 0 (not flat): constant doesn't have enough variance to qualify
+ this->data_[y * stride + x + 0 * kBlockSize] = 128 << shift;
+
+      // Block 1 (not flat): variance is too high to be validated as flat
+ this->data_[y * stride + x + 1 * kBlockSize] =
+ ((uint8_t)(128 + randn(&this->random_, 5))) << shift;
+
+ // Block 2 (flat): slight checkerboard added to constant
+ const int check = (x % 2 + y % 2) % 2 ? -2 : 2;
+ this->data_[y * stride + x + 2 * kBlockSize] = (128 + check) << shift;
+
+ // Block 3 (flat): planar block with checkerboard pattern is also flat
+ this->data_[y * stride + x + 3 * kBlockSize] =
+ (y * 2 - x / 2 + 128 + check) << shift;
+
+ // Block 4 (flat): gaussian random with standard deviation 1.
+ this->data_[y * stride + x + 4 * kBlockSize] =
+ ((uint8_t)(randn(&this->random_, 1) + x + 128.0)) << shift;
+
+ // Block 5 (flat): gaussian random with standard deviation 2.
+ this->data_[y * stride + x + 5 * kBlockSize] =
+ ((uint8_t)(randn(&this->random_, 2) + y + 128.0)) << shift;
+
+      // Block 6 (not flat): the directional gradient is too strong.
+ const int strong_edge = x > kBlockSize / 2 ? 64 : 0;
+ this->data_[y * stride + x + 6 * kBlockSize] =
+ ((uint8_t)(randn(&this->random_, 1) + strong_edge + 128.0)) << shift;
+
+      // Block 7 (not flat): the gradient is too strong.
+ const int big_check = ((x >> 2) % 2 + (y >> 2) % 2) % 2 ? -16 : 16;
+ this->data_[y * stride + x + 7 * kBlockSize] =
+ ((uint8_t)(randn(&this->random_, 1) + big_check + 128.0)) << shift;
+ }
+ }
+
+ EXPECT_EQ(4, aom_flat_block_finder_run(&flat_block_finder,
+ (uint8_t *)&this->data_[0], w, h,
+ stride, &flat_blocks[0]));
+
+ // First two blocks are not flat
+ EXPECT_EQ(0, flat_blocks[0]);
+ EXPECT_EQ(0, flat_blocks[1]);
+
+ // Next 4 blocks are flat.
+ EXPECT_EQ(255, flat_blocks[2]);
+ EXPECT_EQ(255, flat_blocks[3]);
+ EXPECT_EQ(255, flat_blocks[4]);
+ EXPECT_EQ(255, flat_blocks[5]);
+
+ // Last 2 are not flat by threshold
+ EXPECT_EQ(0, flat_blocks[6]);
+ EXPECT_EQ(0, flat_blocks[7]);
+
+ // Add the noise from non-flat block 1 to every block.
+ for (int y = 0; y < kBlockSize; ++y) {
+ for (int x = 0; x < kBlockSize * num_blocks_w; ++x) {
+ this->data_[y * stride + x] +=
+ (this->data_[y * stride + x % kBlockSize + kBlockSize] -
+ (128 << shift));
+ }
+ }
+ // Now the scored selection will pick the one that is most likely flat (block
+ // 0)
+ EXPECT_EQ(1, aom_flat_block_finder_run(&flat_block_finder,
+ (uint8_t *)&this->data_[0], w, h,
+ stride, &flat_blocks[0]));
+ EXPECT_EQ(1, flat_blocks[0]);
+ EXPECT_EQ(0, flat_blocks[1]);
+ EXPECT_EQ(0, flat_blocks[2]);
+ EXPECT_EQ(0, flat_blocks[3]);
+ EXPECT_EQ(0, flat_blocks[4]);
+ EXPECT_EQ(0, flat_blocks[5]);
+ EXPECT_EQ(0, flat_blocks[6]);
+ EXPECT_EQ(0, flat_blocks[7]);
+
+ aom_flat_block_finder_free(&flat_block_finder);
+}
+
+REGISTER_TYPED_TEST_CASE_P(FlatBlockEstimatorTest, ExtractBlock,
+ FindFlatBlocks);
+
+typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>, // lowbd
+ BitDepthParams<uint16_t, 8, true>, // lowbd in 16-bit
+ BitDepthParams<uint16_t, 10, true>, // highbd data
+ BitDepthParams<uint16_t, 12, true> >
+ AllBitDepthParams;
+INSTANTIATE_TYPED_TEST_CASE_P(FlatBlockInstatiation, FlatBlockEstimatorTest,
+ AllBitDepthParams);
+
+template <typename T>
+class NoiseModelUpdateTest : public ::testing::Test, public T {
+ public:
+ static const int kWidth = 128;
+ static const int kHeight = 128;
+ static const int kBlockSize = 16;
+ static const int kNumBlocksX = kWidth / kBlockSize;
+ static const int kNumBlocksY = kHeight / kBlockSize;
+
+ virtual void SetUp() {
+ const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+ T::kBitDepth, T::kUseHighBD };
+ ASSERT_TRUE(aom_noise_model_init(&model_, params));
+
+ random_.Reset(100171);
+
+ data_.resize(kWidth * kHeight * 3);
+ denoised_.resize(kWidth * kHeight * 3);
+ noise_.resize(kWidth * kHeight * 3);
+ renoise_.resize(kWidth * kHeight);
+ flat_blocks_.resize(kNumBlocksX * kNumBlocksY);
+
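+    // Carve the contiguous buffers into per-plane (Y, U, V) pointers that
+    // share a common stride; uint8_t aliases are kept for the C API calls.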
+ for (int c = 0, offset = 0; c < 3; ++c, offset += kWidth * kHeight) {
+ data_ptr_[c] = &data_[offset];
+ noise_ptr_[c] = &noise_[offset];
+ denoised_ptr_[c] = &denoised_[offset];
+ strides_[c] = kWidth;
+
+ data_ptr_raw_[c] = (uint8_t *)&data_[offset];
+ denoised_ptr_raw_[c] = (uint8_t *)&denoised_[offset];
+ }
+ chroma_sub_[0] = 0;
+ chroma_sub_[1] = 0;
+ }
+
+ int NoiseModelUpdate(int block_size = kBlockSize) {
+ return aom_noise_model_update(&model_, data_ptr_raw_, denoised_ptr_raw_,
+ kWidth, kHeight, strides_, chroma_sub_,
+ &flat_blocks_[0], block_size);
+ }
+
+ void TearDown() { aom_noise_model_free(&model_); }
+
+ protected:
+ aom_noise_model_t model_;
+ std::vector<typename T::data_type_t> data_;
+ std::vector<typename T::data_type_t> denoised_;
+
+ std::vector<double> noise_;
+ std::vector<double> renoise_;
+ std::vector<uint8_t> flat_blocks_;
+
+ typename T::data_type_t *data_ptr_[3];
+ typename T::data_type_t *denoised_ptr_[3];
+
+ double *noise_ptr_[3];
+ int strides_[3];
+ int chroma_sub_[2];
+ libaom_test::ACMRandom random_;
+
+ private:
+ uint8_t *data_ptr_raw_[3];
+ uint8_t *denoised_ptr_raw_[3];
+};
+
+TYPED_TEST_CASE_P(NoiseModelUpdateTest);
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks) {
+ EXPECT_EQ(AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+ this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForZeroNoiseAllFlat) {
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ this->denoised_.assign(this->denoised_.size(), 128);
+ this->data_.assign(this->denoised_.size(), 128);
+ EXPECT_EQ(AOM_NOISE_STATUS_INTERNAL_ERROR, this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsBlockSizeTooSmall) {
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ this->denoised_.assign(this->denoised_.size(), 128);
+ this->data_.assign(this->denoised_.size(), 128);
+ EXPECT_EQ(AOM_NOISE_STATUS_INVALID_ARGUMENT,
+ this->NoiseModelUpdate(6 /* block_size=6 is too small*/));
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForWhiteRandomNoise) {
+ aom_noise_model_t &model = this->model_;
+ const int kWidth = this->kWidth;
+ const int kHeight = this->kHeight;
+
+ const int shift = this->kBitDepth - 8;
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kWidth; ++x) {
+ this->data_ptr_[0][y * kWidth + x] =
+ int(64 + y + randn(&this->random_, 1)) << shift;
+ this->denoised_ptr_[0][y * kWidth + x] = (64 + y) << shift;
+ // Make the chroma planes completely correlated with the Y plane
+ for (int c = 1; c < 3; ++c) {
+ this->data_ptr_[c][y * kWidth + x] = this->data_ptr_[0][y * kWidth + x];
+ this->denoised_ptr_[c][y * kWidth + x] =
+ this->denoised_ptr_[0][y * kWidth + x];
+ }
+ }
+ }
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ const double kCoeffEps = 0.075;
+ const int n = model.n;
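+  // The injected noise is spatially white, so every fitted AR coefficient
+  // should be close to zero.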
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < n; ++i) {
+ EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+ EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+ }
+ // The second and third channels are highly correlated with the first.
+ if (c > 0) {
+ ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+ ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+ EXPECT_NEAR(1, model.latest_state[c].eqns.x[n], kCoeffEps);
+ EXPECT_NEAR(1, model.combined_state[c].eqns.x[n], kCoeffEps);
+ }
+ }
+
+ // The fitted noise strength should be close to the standard deviation
+ // for all intensity bins.
+ const double kStdEps = 0.1;
+ const double normalize = 1 << shift;
+
+ for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+ EXPECT_NEAR(1.0,
+ model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ EXPECT_NEAR(1.0,
+ model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ }
+
+ aom_noise_strength_lut_t lut;
+ aom_noise_strength_solver_fit_piecewise(
+ &model.latest_state[0].strength_solver, -1, &lut);
+ ASSERT_EQ(2, lut.num_points);
+ EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+ EXPECT_NEAR(1.0, lut.points[0][1] / normalize, kStdEps);
+ EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+ EXPECT_NEAR(1.0, lut.points[1][1] / normalize, kStdEps);
+ aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForScaledWhiteNoise) {
+ aom_noise_model_t &model = this->model_;
+ const int kWidth = this->kWidth;
+ const int kHeight = this->kHeight;
+
+ const double kCoeffEps = 0.055;
+ const double kLowStd = 1;
+ const double kHighStd = 4;
+ const int shift = this->kBitDepth - 8;
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kWidth; ++x) {
+ for (int c = 0; c < 3; ++c) {
+ // The image data is bimodal:
+ // Bottom half has low intensity and low noise strength
+ // Top half has high intensity and high noise strength
+ const int avg = (y < kHeight / 2) ? 4 : 245;
+ const double std = (y < kHeight / 2) ? kLowStd : kHighStd;
+ this->data_ptr_[c][y * kWidth + x] =
+ ((uint8_t)std::min((int)255,
+ (int)(2 + avg + randn(&this->random_, std))))
+ << shift;
+ this->denoised_ptr_[c][y * kWidth + x] = (2 + avg) << shift;
+ }
+ }
+ }
+ // Label all blocks as flat for the update
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ const int n = model.n;
+ // The noise is uncorrelated spatially and with the y channel.
+ // All coefficients should be reasonably close to zero.
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < n; ++i) {
+ EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+ EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+ }
+ if (c > 0) {
+ ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+ ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+ // The correlation to the y channel should be low (near zero)
+ EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+ EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+ }
+ }
+
+ // Noise strength should vary between kLowStd and kHighStd.
+ const double kStdEps = 0.15;
+ // We have to normalize fitted standard deviation based on bit depth.
+ const double normalize = (1 << shift);
+
+ ASSERT_EQ(20, model.latest_state[0].strength_solver.eqns.n);
+ for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+ const double a = i / 19.0;
+ const double expected = (kLowStd * (1.0 - a) + kHighStd * a);
+ EXPECT_NEAR(expected,
+ model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ EXPECT_NEAR(expected,
+ model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ }
+
+ // If we fit a piecewise linear model, there should be two points:
+  // one near kLowStd at 0, and the other near kHighStd at 255.
+ aom_noise_strength_lut_t lut;
+ aom_noise_strength_solver_fit_piecewise(
+ &model.latest_state[0].strength_solver, 2, &lut);
+ ASSERT_EQ(2, lut.num_points);
+ EXPECT_NEAR(0, lut.points[0][0], 1e-4);
+ EXPECT_NEAR(kLowStd, lut.points[0][1] / normalize, kStdEps);
+ EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+ EXPECT_NEAR(kHighStd, lut.points[1][1] / normalize, kStdEps);
+ aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForCorrelatedNoise) {
+ aom_noise_model_t &model = this->model_;
+ const int kWidth = this->kWidth;
+ const int kHeight = this->kHeight;
+ const int kNumCoeffs = 24;
+ const double kStd = 4;
+ const double kStdEps = 0.3;
+ const double kCoeffEps = 0.065;
+ // Use different coefficients for each channel
+ const double kCoeffs[3][24] = {
+ { 0.02884, -0.03356, 0.00633, 0.01757, 0.02849, -0.04620,
+ 0.02833, -0.07178, 0.07076, -0.11603, -0.10413, -0.16571,
+ 0.05158, -0.07969, 0.02640, -0.07191, 0.02530, 0.41968,
+ 0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+ { 0.00269, -0.01291, -0.01513, 0.07234, 0.03208, 0.00477,
+ 0.00226, -0.00254, 0.03533, 0.12841, -0.25970, -0.06336,
+ 0.05238, -0.00845, -0.03118, 0.09043, -0.36558, 0.48903,
+ 0.00595, -0.11938, 0.02106, 0.095956, -0.350139, 0.59305 },
+ { -0.00643, -0.01080, -0.01466, 0.06951, 0.03707, -0.00482,
+ 0.00817, -0.00909, 0.02949, 0.12181, -0.25210, -0.07886,
+ 0.06083, -0.01210, -0.03108, 0.08944, -0.35875, 0.49150,
+ 0.00415, -0.12905, 0.02870, 0.09740, -0.34610, 0.58824 },
+ };
+
+ ASSERT_EQ(model.n, kNumCoeffs);
+ this->chroma_sub_[0] = this->chroma_sub_[1] = 1;
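+  // Use subsampled (4:2:0-style) chroma planes for this test.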
+
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+
+ // Add different noise onto each plane
+ const int shift = this->kBitDepth - 8;
+ for (int c = 0; c < 3; ++c) {
+ noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+ kCoeffs[c], this->noise_ptr_[c], kWidth, kHeight);
+ const int x_shift = c > 0 ? this->chroma_sub_[0] : 0;
+ const int y_shift = c > 0 ? this->chroma_sub_[1] : 0;
+ for (int y = 0; y < (kHeight >> y_shift); ++y) {
+ for (int x = 0; x < (kWidth >> x_shift); ++x) {
+ const uint8_t value = 64 + x / 2 + y / 4;
+ this->data_ptr_[c][y * kWidth + x] =
+ (uint8_t(value + this->noise_ptr_[c][y * kWidth + x] * kStd))
+ << shift;
+ this->denoised_ptr_[c][y * kWidth + x] = value << shift;
+ }
+ }
+ }
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ // For the Y plane, the solved coefficients should be close to the original
+ const int n = model.n;
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < n; ++i) {
+ EXPECT_NEAR(kCoeffs[c][i], model.latest_state[c].eqns.x[i], kCoeffEps);
+ EXPECT_NEAR(kCoeffs[c][i], model.combined_state[c].eqns.x[i], kCoeffEps);
+ }
+ // The chroma planes should be uncorrelated with the luma plane
+ if (c > 0) {
+ EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+ EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+ }
+ // Correlation between the coefficient vector and the fitted coefficients
+ // should be close to 1.
+ EXPECT_LT(0.98, aom_normalized_cross_correlation(
+ model.latest_state[c].eqns.x, kCoeffs[c], kNumCoeffs));
+
+ noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+ model.latest_state[c].eqns.x, &this->renoise_[0], kWidth,
+ kHeight);
+
+ EXPECT_TRUE(aom_noise_data_validate(&this->renoise_[0], kWidth, kHeight));
+ }
+
+ // Check fitted noise strength
+ const double normalize = 1 << shift;
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < model.latest_state[c].strength_solver.eqns.n; ++i) {
+ EXPECT_NEAR(kStd,
+ model.latest_state[c].strength_solver.eqns.x[i] / normalize,
+ kStdEps);
+ }
+ }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest,
+ NoiseStrengthChangeSignalsDifferentNoiseType) {
+ aom_noise_model_t &model = this->model_;
+ const int kWidth = this->kWidth;
+ const int kHeight = this->kHeight;
+ const int kBlockSize = this->kBlockSize;
+  // Create a two-level image (left half 64, right half 192) with std = 2
+  // uncorrelated noise
+ const double kStd = 2;
+ const int shift = this->kBitDepth - 8;
+
+ for (int i = 0; i < kWidth * kHeight; ++i) {
+ const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 192;
+ for (int c = 0; c < 3; ++c) {
+ this->noise_ptr_[c][i] = randn(&this->random_, 1);
+ this->data_ptr_[c][i] = ((uint8_t)(this->noise_ptr_[c][i] * kStd + val))
+ << shift;
+ this->denoised_ptr_[c][i] = val << shift;
+ }
+ }
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ const int kNumBlocks = kWidth * kHeight / kBlockSize / kBlockSize;
+ EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.latest_state[1].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.latest_state[2].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.combined_state[0].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.combined_state[1].strength_solver.num_equations);
+ EXPECT_EQ(kNumBlocks, model.combined_state[2].strength_solver.num_equations);
+
+ // Bump up noise by an insignificant amount
+ for (int i = 0; i < kWidth * kHeight; ++i) {
+ const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 192;
+ this->data_ptr_[0][i] =
+ ((uint8_t)(this->noise_ptr_[0][i] * (kStd + 0.085) + val)) << shift;
+ }
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ const double kARGainTolerance = 0.02;
+ for (int c = 0; c < 3; ++c) {
+ EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+ EXPECT_EQ(15250, model.latest_state[c].num_observations);
+ EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+ EXPECT_EQ(2 * kNumBlocks,
+ model.combined_state[c].strength_solver.num_equations);
+ EXPECT_EQ(2 * 15250, model.combined_state[c].num_observations);
+ EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+ }
+
+ // Bump up the noise strength on half the image for one channel by a
+ // significant amount.
+ for (int i = 0; i < kWidth * kHeight; ++i) {
+ const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 128;
+ if (i % kWidth < kWidth / 2) {
+ this->data_ptr_[0][i] =
+ ((uint8_t)(randn(&this->random_, kStd + 0.5) + val)) << shift;
+ }
+ }
+ EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+
+ // Since we didn't update the combined state, it should still be at 2 *
+ // num_blocks
+ EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+ EXPECT_EQ(2 * kNumBlocks,
+ model.combined_state[0].strength_solver.num_equations);
+
+ // In normal operation, the "latest" estimate can be saved to the "combined"
+ // state for continued updates.
+ aom_noise_model_save_latest(&model);
+ for (int c = 0; c < 3; ++c) {
+ EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+ EXPECT_EQ(15250, model.latest_state[c].num_observations);
+ EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+ EXPECT_EQ(kNumBlocks,
+ model.combined_state[c].strength_solver.num_equations);
+ EXPECT_EQ(15250, model.combined_state[c].num_observations);
+ EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+ }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, NoiseCoeffsSignalsDifferentNoiseType) {
+ aom_noise_model_t &model = this->model_;
+ const int kWidth = this->kWidth;
+ const int kHeight = this->kHeight;
+ const double kCoeffs[2][24] = {
+ { 0.02884, -0.03356, 0.00633, 0.01757, 0.02849, -0.04620,
+ 0.02833, -0.07178, 0.07076, -0.11603, -0.10413, -0.16571,
+ 0.05158, -0.07969, 0.02640, -0.07191, 0.02530, 0.41968,
+ 0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+ { 0.00269, -0.01291, -0.01513, 0.07234, 0.03208, 0.00477,
+ 0.00226, -0.00254, 0.03533, 0.12841, -0.25970, -0.06336,
+ 0.05238, -0.00845, -0.03118, 0.09043, -0.36558, 0.48903,
+ 0.00595, -0.11938, 0.02106, 0.095956, -0.350139, 0.59305 }
+ };
+
+ noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+ kCoeffs[0], this->noise_ptr_[0], kWidth, kHeight);
+ for (int i = 0; i < kWidth * kHeight; ++i) {
+ this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+ }
+ this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+ EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+ // Now try with the second set of AR coefficients
+ noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+ kCoeffs[1], this->noise_ptr_[0], kWidth, kHeight);
+ for (int i = 0; i < kWidth * kHeight; ++i) {
+ this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+ }
+ EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+}
+REGISTER_TYPED_TEST_CASE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks,
+ UpdateSuccessForZeroNoiseAllFlat,
+ UpdateFailsBlockSizeTooSmall,
+ UpdateSuccessForWhiteRandomNoise,
+ UpdateSuccessForScaledWhiteNoise,
+ UpdateSuccessForCorrelatedNoise,
+ NoiseStrengthChangeSignalsDifferentNoiseType,
+ NoiseCoeffsSignalsDifferentNoiseType);
+
+INSTANTIATE_TYPED_TEST_CASE_P(NoiseModelUpdateTestInstatiation,
+ NoiseModelUpdateTest, AllBitDepthParams);
+
+TEST(NoiseModelGetGrainParameters, TestLagSize) {
+ aom_film_grain_t film_grain;
+ for (int lag = 1; lag <= 3; ++lag) {
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+ EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+ aom_noise_model_free(&model);
+ }
+
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 4, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+ EXPECT_FALSE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestARCoeffShiftBounds) {
+ struct TestCase {
+ double max_input_value;
+ int expected_ar_coeff_shift;
+ int expected_value;
+ };
+ const int lag = 1;
+ const int kNumTestCases = 19;
+ const TestCase test_cases[] = {
+ // Test cases for ar_coeff_shift = 9
+ { 0, 9, 0 },
+ { 0.125, 9, 64 },
+ { -0.125, 9, -64 },
+ { 0.2499, 9, 127 },
+ { -0.25, 9, -128 },
+ // Test cases for ar_coeff_shift = 8
+ { 0.25, 8, 64 },
+ { -0.2501, 8, -64 },
+ { 0.499, 8, 127 },
+ { -0.5, 8, -128 },
+ // Test cases for ar_coeff_shift = 7
+ { 0.5, 7, 64 },
+ { -0.5001, 7, -64 },
+ { 0.999, 7, 127 },
+ { -1, 7, -128 },
+ // Test cases for ar_coeff_shift = 6
+ { 1.0, 6, 64 },
+ { -1.0001, 6, -64 },
+ { 2.0, 6, 127 },
+ { -2.0, 6, -128 },
+ { 4, 6, 127 },
+ { -4, 6, -128 },
+ };
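+  // Each case drives a single AR coefficient to max_input_value and checks
+  // the fixed-point shift and quantized value chosen for the grain parameters.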
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+ for (int i = 0; i < kNumTestCases; ++i) {
+ const TestCase &test_case = test_cases[i];
+ model.combined_state[0].eqns.x[0] = test_case.max_input_value;
+
+ aom_film_grain_t film_grain;
+ EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ EXPECT_EQ(1, film_grain.ar_coeff_lag);
+ EXPECT_EQ(test_case.expected_ar_coeff_shift, film_grain.ar_coeff_shift);
+ EXPECT_EQ(test_case.expected_value, film_grain.ar_coeffs_y[0]);
+ }
+ aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestNoiseStrengthShiftBounds) {
+ struct TestCase {
+ double max_input_value;
+ int expected_scaling_shift;
+ int expected_value;
+ };
+ const int kNumTestCases = 10;
+ const TestCase test_cases[] = {
+ { 0, 11, 0 }, { 1, 11, 64 }, { 2, 11, 128 }, { 3.99, 11, 255 },
+ { 4, 10, 128 }, { 7.99, 10, 255 }, { 8, 9, 128 }, { 16, 8, 128 },
+ { 31.99, 8, 255 }, { 64, 8, 255 }, // clipped
+ };
+ const int lag = 1;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+ for (int i = 0; i < kNumTestCases; ++i) {
+ const TestCase &test_case = test_cases[i];
+ aom_equation_system_t &eqns = model.combined_state[0].strength_solver.eqns;
+ // Set the fitted scale parameters to be a constant value.
+ for (int j = 0; j < eqns.n; ++j) {
+ eqns.x[j] = test_case.max_input_value;
+ }
+ aom_film_grain_t film_grain;
+ EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    // We expect a single constant segment.
+ EXPECT_EQ(test_case.expected_scaling_shift, film_grain.scaling_shift);
+ EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[0][1]);
+ EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[1][1]);
+ }
+ aom_noise_model_free(&model);
+}
+
+// The AR coefficients are the same inputs used to generate "Test 2" in the test
+// vectors
+TEST(NoiseModelGetGrainParameters, GetGrainParametersReal) {
+ const double kInputCoeffsY[] = { 0.0315, 0.0073, 0.0218, 0.00235, 0.00511,
+ -0.0222, 0.0627, -0.022, 0.05575, -0.1816,
+ 0.0107, -0.1966, 0.00065, -0.0809, 0.04934,
+ -0.1349, -0.0352, 0.41772, 0.27973, 0.04207,
+ -0.0429, -0.1372, 0.06193, 0.52032 };
+ const double kInputCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5 };
+ const double kInputCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5 };
+ const int kExpectedARCoeffsY[] = { 4, 1, 3, 0, 1, -3, 8, -3,
+ 7, -23, 1, -25, 0, -10, 6, -17,
+ -5, 53, 36, 5, -5, -18, 8, 67 };
+ const int kExpectedARCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84 };
+ const int kExpectedARCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -126 };
+ // Scaling function is initialized analytically with a sqrt function.
+ const int kNumScalingPointsY = 12;
+ const int kExpectedScalingPointsY[][2] = {
+ { 0, 0 }, { 13, 44 }, { 27, 62 }, { 40, 76 },
+ { 54, 88 }, { 67, 98 }, { 94, 117 }, { 121, 132 },
+ { 148, 146 }, { 174, 159 }, { 201, 171 }, { 255, 192 },
+ };
+
+ const int lag = 3;
+ aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+ aom_noise_model_t model;
+ EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  // Set up the AR coeffs.
+ memcpy(model.combined_state[0].eqns.x, kInputCoeffsY, sizeof(kInputCoeffsY));
+ memcpy(model.combined_state[1].eqns.x, kInputCoeffsCB,
+ sizeof(kInputCoeffsCB));
+ memcpy(model.combined_state[2].eqns.x, kInputCoeffsCR,
+ sizeof(kInputCoeffsCR));
+ for (int i = 0; i < model.combined_state[0].strength_solver.num_bins; ++i) {
+ const double x =
+ ((double)i) / (model.combined_state[0].strength_solver.num_bins - 1.0);
+ model.combined_state[0].strength_solver.eqns.x[i] = 6 * sqrt(x);
+ model.combined_state[1].strength_solver.eqns.x[i] = 3;
+ model.combined_state[2].strength_solver.eqns.x[i] = 2;
+
+ // Inject some observations into the strength solver, as during film grain
+ // parameter extraction an estimate of the average strength will be used to
+ // adjust correlation.
+ const int n = model.combined_state[0].strength_solver.num_bins;
+ for (int j = 0; j < model.combined_state[0].strength_solver.num_bins; ++j) {
+ model.combined_state[0].strength_solver.eqns.A[i * n + j] = 1;
+ model.combined_state[1].strength_solver.eqns.A[i * n + j] = 1;
+ model.combined_state[2].strength_solver.eqns.A[i * n + j] = 1;
+ }
+ }
+
+ aom_film_grain_t film_grain;
+ EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+ EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+ EXPECT_EQ(3, film_grain.ar_coeff_lag);
+ EXPECT_EQ(7, film_grain.ar_coeff_shift);
+ EXPECT_EQ(10, film_grain.scaling_shift);
+ EXPECT_EQ(kNumScalingPointsY, film_grain.num_y_points);
+ EXPECT_EQ(1, film_grain.update_parameters);
+ EXPECT_EQ(1, film_grain.apply_grain);
+
+ const int kNumARCoeffs = 24;
+ for (int i = 0; i < kNumARCoeffs; ++i) {
+ EXPECT_EQ(kExpectedARCoeffsY[i], film_grain.ar_coeffs_y[i]);
+ }
+ for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+ EXPECT_EQ(kExpectedARCoeffsCB[i], film_grain.ar_coeffs_cb[i]);
+ }
+ for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+ EXPECT_EQ(kExpectedARCoeffsCR[i], film_grain.ar_coeffs_cr[i]);
+ }
+ for (int i = 0; i < kNumScalingPointsY; ++i) {
+ EXPECT_EQ(kExpectedScalingPointsY[i][0], film_grain.scaling_points_y[i][0]);
+ EXPECT_EQ(kExpectedScalingPointsY[i][1], film_grain.scaling_points_y[i][1]);
+ }
+
+  // CB strength should just be a single constant segment
+ EXPECT_EQ(2, film_grain.num_cb_points);
+ EXPECT_EQ(0, film_grain.scaling_points_cb[0][0]);
+ EXPECT_EQ(255, film_grain.scaling_points_cb[1][0]);
+ EXPECT_EQ(96, film_grain.scaling_points_cb[0][1]);
+ EXPECT_EQ(96, film_grain.scaling_points_cb[1][1]);
+
+  // CR strength should just be a single constant segment
+ EXPECT_EQ(2, film_grain.num_cr_points);
+ EXPECT_EQ(0, film_grain.scaling_points_cr[0][0]);
+ EXPECT_EQ(255, film_grain.scaling_points_cr[1][0]);
+ EXPECT_EQ(64, film_grain.scaling_points_cr[0][1]);
+ EXPECT_EQ(64, film_grain.scaling_points_cr[1][1]);
+
+ EXPECT_EQ(128, film_grain.cb_mult);
+ EXPECT_EQ(192, film_grain.cb_luma_mult);
+ EXPECT_EQ(256, film_grain.cb_offset);
+ EXPECT_EQ(128, film_grain.cr_mult);
+ EXPECT_EQ(192, film_grain.cr_luma_mult);
+ EXPECT_EQ(256, film_grain.cr_offset);
+ EXPECT_EQ(0, film_grain.chroma_scaling_from_luma);
+ EXPECT_EQ(0, film_grain.grain_scale_shift);
+
+ aom_noise_model_free(&model);
+}
+
+template <typename T>
+class WienerDenoiseTest : public ::testing::Test, public T {
+ public:
+ static void SetUpTestCase() { aom_dsp_rtcd(); }
+
+ protected:
+ void SetUp() {
+ static const float kNoiseLevel = 5.f;
+ static const float kStd = 4.0;
+ static const double kMaxValue = (1 << T::kBitDepth) - 1;
+
+ chroma_sub_[0] = 1;
+ chroma_sub_[1] = 1;
+ stride_[0] = kWidth;
+ stride_[1] = kWidth / 2;
+ stride_[2] = kWidth / 2;
+ for (int k = 0; k < 3; ++k) {
+ data_[k].resize(kWidth * kHeight);
+ denoised_[k].resize(kWidth * kHeight);
+ noise_psd_[k].resize(kBlockSize * kBlockSize);
+ }
+
+ const double kCoeffsY[] = { 0.0406, -0.116, -0.078, -0.152, 0.0033, -0.093,
+ 0.048, 0.404, 0.2353, -0.035, -0.093, 0.441 };
+ const int kCoords[12][2] = {
+ { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 }, { 2, -2 }, { -2, -1 },
+ { -1, -1 }, { 0, -1 }, { 1, -1 }, { 2, -1 }, { -2, 0 }, { -1, 0 }
+ };
+ const int kLag = 2;
+ const int kLength = 12;
+ libaom_test::ACMRandom random;
+ std::vector<double> noise(kWidth * kHeight);
+ noise_synth(&random, kLag, kLength, kCoords, kCoeffsY, &noise[0], kWidth,
+ kHeight);
+ noise_psd_[0] = get_noise_psd(&noise[0], kWidth, kHeight, kBlockSize);
+ for (int i = 0; i < kBlockSize * kBlockSize; ++i) {
+ noise_psd_[0][i] = (float)(noise_psd_[0][i] * kStd * kStd * kScaleNoise *
+ kScaleNoise / (kMaxValue * kMaxValue));
+ }
+
+ float psd_value =
+ aom_noise_psd_get_default_value(kBlockSizeChroma, kNoiseLevel);
+ for (int i = 0; i < kBlockSizeChroma * kBlockSizeChroma; ++i) {
+ noise_psd_[1][i] = psd_value;
+ noise_psd_[2][i] = psd_value;
+ }
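+    // The luma plane is a horizontal ramp plus the synthesized correlated
+    // noise; the chroma planes are a ramp plus independent white noise.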
+ for (int y = 0; y < kHeight; ++y) {
+ for (int x = 0; x < kWidth; ++x) {
+ data_[0][y * stride_[0] + x] = (typename T::data_type_t)fclamp(
+ (x + noise[y * stride_[0] + x] * kStd) * kScaleNoise, 0, kMaxValue);
+ }
+ }
+
+ for (int c = 1; c < 3; ++c) {
+ for (int y = 0; y < (kHeight >> 1); ++y) {
+ for (int x = 0; x < (kWidth >> 1); ++x) {
+ data_[c][y * stride_[c] + x] = (typename T::data_type_t)fclamp(
+ (x + randn(&random, kStd)) * kScaleNoise, 0, kMaxValue);
+ }
+ }
+ }
+ for (int k = 0; k < 3; ++k) {
+ noise_psd_ptrs_[k] = &noise_psd_[k][0];
+ }
+ }
+ static const int kBlockSize = 32;
+ static const int kBlockSizeChroma = 16;
+ static const int kWidth = 256;
+ static const int kHeight = 256;
+ static const int kScaleNoise = 1 << (T::kBitDepth - 8);
+
+ std::vector<typename T::data_type_t> data_[3];
+ std::vector<typename T::data_type_t> denoised_[3];
+ std::vector<float> noise_psd_[3];
+ int chroma_sub_[2];
+ float *noise_psd_ptrs_[3];
+ int stride_[3];
+};
+
+TYPED_TEST_CASE_P(WienerDenoiseTest);
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidBlockSize) {
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 18, this->kBitDepth, this->kUseHighBD));
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 48, this->kBitDepth, this->kUseHighBD));
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_,
+ 64, this->kBitDepth, this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidChromaSubsampling) {
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ int chroma_sub[2] = { 1, 0 };
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_, chroma_sub,
+ this->noise_psd_ptrs_, 32, this->kBitDepth,
+ this->kUseHighBD));
+
+ chroma_sub[0] = 0;
+ chroma_sub[1] = 1;
+ EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+ this->kHeight, this->stride_, chroma_sub,
+ this->noise_psd_ptrs_, 32, this->kBitDepth,
+ this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, GradientTest) {
+ const int kWidth = this->kWidth;
+ const int kHeight = this->kHeight;
+ const int kBlockSize = this->kBlockSize;
+ const uint8_t *const data_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->data_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->data_[2][0]),
+ };
+ uint8_t *denoised_ptrs[3] = {
+ reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+ reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+ };
+ const int ret = aom_wiener_denoise_2d(
+ data_ptrs, denoised_ptrs, kWidth, kHeight, this->stride_,
+ this->chroma_sub_, this->noise_psd_ptrs_, this->kBlockSize,
+ this->kBitDepth, this->kUseHighBD);
+ EXPECT_EQ(1, ret);
+
+ // Check the noise on the denoised image (from the analytical gradient)
+ // and make sure that it is less than what we added.
+ for (int c = 0; c < 3; ++c) {
+ std::vector<double> measured_noise(kWidth * kHeight);
+
+ double var = 0;
+ const int shift = (c > 0);
+ for (int x = 0; x < (kWidth >> shift); ++x) {
+ for (int y = 0; y < (kHeight >> shift); ++y) {
+ const double diff = this->denoised_[c][y * this->stride_[c] + x] -
+ x * this->kScaleNoise;
+ var += diff * diff;
+ measured_noise[y * kWidth + x] = diff;
+ }
+ }
+ var /= (kWidth * kHeight);
+ const double std = sqrt(std::max(0.0, var));
+ EXPECT_LE(std, 1.25f * this->kScaleNoise);
+ if (c == 0) {
+ std::vector<float> measured_psd =
+ get_noise_psd(&measured_noise[0], kWidth, kHeight, kBlockSize);
+ std::vector<double> measured_psd_d(kBlockSize * kBlockSize);
+ std::vector<double> noise_psd_d(kBlockSize * kBlockSize);
+ std::copy(measured_psd.begin(), measured_psd.end(),
+ measured_psd_d.begin());
+ std::copy(this->noise_psd_[0].begin(), this->noise_psd_[0].end(),
+ noise_psd_d.begin());
+ EXPECT_LT(
+ aom_normalized_cross_correlation(&measured_psd_d[0], &noise_psd_d[0],
+ (int)(noise_psd_d.size())),
+ 0.35);
+ }
+ }
+}
+
+REGISTER_TYPED_TEST_CASE_P(WienerDenoiseTest, InvalidBlockSize,
+ InvalidChromaSubsampling, GradientTest);
+
+INSTANTIATE_TYPED_TEST_CASE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest,
+ AllBitDepthParams);
diff --git a/third_party/aom/test/obmc_sad_test.cc b/third_party/aom/test/obmc_sad_test.cc
new file mode 100644
index 000000000..6cef86961
--- /dev/null
+++ b/third_party/aom/test/obmc_sad_test.cc
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int kIterations = 1000;
+static const int kMaskMax = 64;
+
+typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask);
+typedef libaom_test::FuncParam<ObmcSadF> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
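+// Each test parameter pairs the C reference function with an optimized
+// implementation; the two must produce identical SAD values.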
+class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
+
+TEST_P(ObmcSadTest, RandomValues) {
+ DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = rng_.Rand8();
+ wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
+ mask[i] = rng_(kMaskMax * kMaskMax + 1);
+ }
+
+ const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+ unsigned int tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res =
+ params_.tst_func(pre, pre_stride, wsrc, mask));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(ObmcSadTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+ const int pre_stride = iter;
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = UINT8_MAX;
+ wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+ mask[i] = kMaskMax * kMaskMax;
+ }
+
+ const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+ unsigned int tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res =
+ params_.tst_func(pre, pre_stride, wsrc, mask));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+#if HAVE_SSE4_1
+const ObmcSadTest::ParamType sse4_functions[] = {
+ TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_sse4_1),
+ TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_sse4_1),
+ TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_sse4_1),
+ TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_sse4_1),
+ TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_sse4_1),
+ TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_sse4_1),
+ TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_sse4_1),
+ TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_sse4_1),
+ TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_sse4_1),
+ TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_sse4_1),
+ TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_sse4_1),
+ TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_sse4_1),
+ TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_sse4_1),
+ TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_sse4_1),
+ TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_sse4_1),
+ TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_sse4_1)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadTest,
+ ::testing::ValuesIn(sse4_functions));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const ObmcSadTest::ParamType avx2_functions[] = {
+ TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_avx2),
+ TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_avx2),
+ TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_avx2),
+ TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_avx2),
+ TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_avx2),
+ TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_avx2),
+ TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_avx2),
+ TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_avx2),
+ TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_avx2),
+ TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_avx2),
+ TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_avx2),
+ TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_avx2),
+ TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_avx2),
+ TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_avx2),
+ TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_avx2),
+ TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, ObmcSadTest, ::testing::ValuesIn(avx2_functions));
+#endif // HAVE_AVX2
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
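+// The high bit-depth tests use 12-bit sample values and pass their buffers
+// through CONVERT_TO_BYTEPTR, as the highbd functions expect.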
+class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
+
+TEST_P(ObmcSadHBDTest, RandomValues) {
+ DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = rng_(1 << 12);
+ wsrc[i] = rng_(1 << 12) * rng_(kMaskMax * kMaskMax + 1);
+ mask[i] = rng_(kMaskMax * kMaskMax + 1);
+ }
+
+ const unsigned int ref_res =
+ params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+ unsigned int tst_res;
+ ASM_REGISTER_STATE_CHECK(
+ tst_res =
+ params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(ObmcSadHBDTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+ const int pre_stride = iter;
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = (1 << 12) - 1;
+ wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
+ mask[i] = kMaskMax * kMaskMax;
+ }
+
+ const unsigned int ref_res =
+ params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+ unsigned int tst_res;
+ ASM_REGISTER_STATE_CHECK(
+ tst_res =
+ params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+#if HAVE_SSE4_1
+ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
+ TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_sse4_1)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcSadHBDTest,
+ ::testing::ValuesIn(sse4_functions_hbd));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+ObmcSadHBDTest::ParamType avx2_functions_hbd[] = {
+ TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_avx2),
+ TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_avx2),
+ TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_avx2),
+ TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_avx2),
+ TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_avx2),
+ TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_avx2),
+ TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_avx2),
+ TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_avx2),
+ TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_avx2),
+ TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_avx2),
+ TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_avx2),
+ TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_avx2),
+ TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_avx2),
+ TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_avx2),
+ TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_avx2),
+ TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_avx2)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, ObmcSadHBDTest,
+ ::testing::ValuesIn(avx2_functions_hbd));
+#endif // HAVE_AVX2
+} // namespace
diff --git a/third_party/aom/test/obmc_variance_test.cc b/third_party/aom/test/obmc_variance_test.cc
new file mode 100644
index 000000000..4563b964a
--- /dev/null
+++ b/third_party/aom/test/obmc_variance_test.cc
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int kIterations = 1000;
+static const int kMaskMax = 64;
+
+typedef unsigned int (*ObmcVarF)(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *sse);
+typedef libaom_test::FuncParam<ObmcVarF> TestFuncs;
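+// Each ObmcVarF candidate takes a predictor block (pre), the OBMC
+// weighted-source (wsrc) and mask planes, writes the sum of squared error
+// through *sse and returns the variance; the tests below only require the
+// SIMD and C reference outputs to match bit-exactly.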
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcVarianceTest : public FunctionEquivalenceTest<ObmcVarF> {};
+
+TEST_P(ObmcVarianceTest, RandomValues) {
+ DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = this->rng_.Rand8();
+ wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1);
+ mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+ }
+
+ unsigned int ref_sse, tst_sse;
+ const unsigned int ref_res =
+ params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+ unsigned int tst_res;
+ ASM_REGISTER_STATE_CHECK(
+ tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+ ASSERT_EQ(ref_res, tst_res);
+ ASSERT_EQ(ref_sse, tst_sse);
+ }
+}
+
+TEST_P(ObmcVarianceTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+ const int pre_stride = iter;
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = UINT8_MAX;
+ wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+ mask[i] = kMaskMax * kMaskMax;
+ }
+
+ unsigned int ref_sse, tst_sse;
+ const unsigned int ref_res =
+ params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+ unsigned int tst_res;
+ ASM_REGISTER_STATE_CHECK(
+ tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+ ASSERT_EQ(ref_res, tst_res);
+ ASSERT_EQ(ref_sse, tst_sse);
+ }
+}
+
+TEST_P(ObmcVarianceTest, DISABLED_Speed) {
+ DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = this->rng_.Rand8();
+ wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1);
+ mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+ }
+
+ const int num_loops = 1000000;
+ unsigned int ref_sse, tst_sse;
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < num_loops; ++i) {
+ params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < num_loops; ++i) {
+ params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf("c_time=%d \t simd_time=%d \t gain=%d \n", elapsed_time_c,
+ elapsed_time_simd, (elapsed_time_c / elapsed_time_simd));
+}
+
+#if HAVE_SSE4_1
+const ObmcVarianceTest::ParamType sse4_functions[] = {
+ TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_sse4_1),
+ TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_sse4_1),
+ TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_sse4_1),
+ TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_sse4_1),
+ TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_sse4_1),
+ TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_sse4_1),
+ TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_sse4_1),
+ TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_sse4_1),
+ TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_sse4_1),
+ TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_sse4_1),
+ TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_sse4_1),
+ TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_sse4_1),
+ TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_sse4_1),
+ TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_sse4_1),
+ TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1),
+ TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcVarianceTest,
+ ::testing::ValuesIn(sse4_functions));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const ObmcVarianceTest::ParamType avx2_functions[] = {
+ TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_avx2),
+ TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_avx2),
+ TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_avx2),
+ TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_avx2),
+ TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_avx2),
+ TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_avx2),
+ TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_avx2),
+ TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_avx2),
+ TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_avx2),
+ TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_avx2),
+ TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_avx2),
+ TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_avx2),
+ TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_avx2),
+ TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_avx2),
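+  // Note: the 4x8 and 4x4 entries below pair the C reference with the SSE4.1
+  // kernels, presumably because dedicated AVX2 versions are not provided for
+  // the smallest block sizes.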
+ TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1),
+ TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, ObmcVarianceTest,
+ ::testing::ValuesIn(avx2_functions));
+#endif // HAVE_AVX2
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
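+// The high bit-depth tests keep samples in uint16_t buffers and pass them to
+// the functions through CONVERT_TO_BYTEPTR, following libaom's convention for
+// high bit-depth pixel pointers.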
+
+TEST_P(ObmcVarianceHBDTest, RandomValues) {
+ DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = this->rng_(1 << params_.bit_depth);
+ wsrc[i] = this->rng_(1 << params_.bit_depth) *
+ this->rng_(kMaskMax * kMaskMax + 1);
+ mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+ }
+
+ unsigned int ref_sse, tst_sse;
+ const unsigned int ref_res = params_.ref_func(
+ CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse);
+ unsigned int tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
+ pre_stride, wsrc, mask,
+ &tst_sse));
+
+ ASSERT_EQ(ref_res, tst_res);
+ ASSERT_EQ(ref_sse, tst_sse);
+ }
+}
+
+TEST_P(ObmcVarianceHBDTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+ const int pre_stride = iter;
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ pre[i] = (1 << params_.bit_depth) - 1;
+ wsrc[i] = ((1 << params_.bit_depth) - 1) * kMaskMax * kMaskMax;
+ mask[i] = kMaskMax * kMaskMax;
+ }
+
+ unsigned int ref_sse, tst_sse;
+ const unsigned int ref_res = params_.ref_func(
+ CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse);
+ unsigned int tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
+ pre_stride, wsrc, mask,
+ &tst_sse));
+
+ ASSERT_EQ(ref_res, tst_res);
+ ASSERT_EQ(ref_sse, tst_sse);
+ }
+}
+
+#if HAVE_SSE4_1
+ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
+ TestFuncs(aom_highbd_obmc_variance128x128_c,
+ aom_highbd_obmc_variance128x128_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance128x64_c,
+ aom_highbd_obmc_variance128x64_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance64x128_c,
+ aom_highbd_obmc_variance64x128_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance64x64_c,
+ aom_highbd_obmc_variance64x64_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance64x32_c,
+ aom_highbd_obmc_variance64x32_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance32x64_c,
+ aom_highbd_obmc_variance32x64_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance32x32_c,
+ aom_highbd_obmc_variance32x32_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance32x16_c,
+ aom_highbd_obmc_variance32x16_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance16x32_c,
+ aom_highbd_obmc_variance16x32_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance16x16_c,
+ aom_highbd_obmc_variance16x16_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance16x8_c, aom_highbd_obmc_variance16x8_sse4_1,
+ 8),
+ TestFuncs(aom_highbd_obmc_variance8x16_c, aom_highbd_obmc_variance8x16_sse4_1,
+ 8),
+ TestFuncs(aom_highbd_obmc_variance8x8_c, aom_highbd_obmc_variance8x8_sse4_1,
+ 8),
+ TestFuncs(aom_highbd_obmc_variance8x4_c, aom_highbd_obmc_variance8x4_sse4_1,
+ 8),
+ TestFuncs(aom_highbd_obmc_variance4x8_c, aom_highbd_obmc_variance4x8_sse4_1,
+ 8),
+ TestFuncs(aom_highbd_obmc_variance4x4_c, aom_highbd_obmc_variance4x4_sse4_1,
+ 8),
+ TestFuncs(aom_highbd_10_obmc_variance128x128_c,
+ aom_highbd_10_obmc_variance128x128_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance128x64_c,
+ aom_highbd_10_obmc_variance128x64_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x128_c,
+ aom_highbd_10_obmc_variance64x128_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x64_c,
+ aom_highbd_10_obmc_variance64x64_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x32_c,
+ aom_highbd_10_obmc_variance64x32_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x64_c,
+ aom_highbd_10_obmc_variance32x64_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x32_c,
+ aom_highbd_10_obmc_variance32x32_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance32x16_c,
+ aom_highbd_10_obmc_variance32x16_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x32_c,
+ aom_highbd_10_obmc_variance16x32_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x16_c,
+ aom_highbd_10_obmc_variance16x16_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance16x8_c,
+ aom_highbd_10_obmc_variance16x8_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x16_c,
+ aom_highbd_10_obmc_variance8x16_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x8_c,
+ aom_highbd_10_obmc_variance8x8_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance8x4_c,
+ aom_highbd_10_obmc_variance8x4_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x8_c,
+ aom_highbd_10_obmc_variance4x8_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance4x4_c,
+ aom_highbd_10_obmc_variance4x4_sse4_1, 10),
+ TestFuncs(aom_highbd_12_obmc_variance128x128_c,
+ aom_highbd_12_obmc_variance128x128_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance128x64_c,
+ aom_highbd_12_obmc_variance128x64_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x128_c,
+ aom_highbd_12_obmc_variance64x128_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x64_c,
+ aom_highbd_12_obmc_variance64x64_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x32_c,
+ aom_highbd_12_obmc_variance64x32_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x64_c,
+ aom_highbd_12_obmc_variance32x64_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x32_c,
+ aom_highbd_12_obmc_variance32x32_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance32x16_c,
+ aom_highbd_12_obmc_variance32x16_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x32_c,
+ aom_highbd_12_obmc_variance16x32_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x16_c,
+ aom_highbd_12_obmc_variance16x16_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance16x8_c,
+ aom_highbd_12_obmc_variance16x8_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x16_c,
+ aom_highbd_12_obmc_variance8x16_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x8_c,
+ aom_highbd_12_obmc_variance8x8_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance8x4_c,
+ aom_highbd_12_obmc_variance8x4_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x8_c,
+ aom_highbd_12_obmc_variance4x8_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance4x4_c,
+ aom_highbd_12_obmc_variance4x4_sse4_1, 12)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, ObmcVarianceHBDTest,
+ ::testing::ValuesIn(sse4_functions_hbd));
+#endif // HAVE_SSE4_1
+} // namespace
diff --git a/third_party/aom/test/onyxc_int_test.cc b/third_party/aom/test/onyxc_int_test.cc
new file mode 100644
index 000000000..388959518
--- /dev/null
+++ b/third_party/aom/test/onyxc_int_test.cc
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/common/onyxc_int.h"
+
+TEST(OnyxcInt, TestGetTxSize) {
+ for (int t = TX_4X4; t < TX_SIZES_ALL; t++) {
+ TX_SIZE t2 = get_tx_size(tx_size_wide[t], tx_size_high[t]);
+ GTEST_ASSERT_EQ(tx_size_wide[t], tx_size_wide[t2]);
+ GTEST_ASSERT_EQ(tx_size_high[t], tx_size_high[t2]);
+ }
+}
diff --git a/third_party/aom/test/pickrst_test.cc b/third_party/aom/test/pickrst_test.cc
new file mode 100644
index 000000000..040e8e8b7
--- /dev/null
+++ b/third_party/aom/test/pickrst_test.cc
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/pickrst.h"
+using libaom_test::FunctionEquivalenceTest;
+
+#define MAX_DATA_BLOCK 384
+
+namespace {
+static const int kIterations = 100;
+
+typedef int64_t (*lowbd_pixel_proj_error_func)(
+ const uint8_t *src8, int width, int height, int src_stride,
+ const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+ int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params);
+
+typedef libaom_test::FuncParam<lowbd_pixel_proj_error_func> TestFuncs;
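+// Roughly speaking, av1_lowbd_pixel_proj_error() returns the sum of squared
+// error between the source and the degraded block after the self-guided
+// restoration projection (coefficients xq[0]/xq[1] applied to the filtered
+// planes flt0/flt1, with params selecting which filters are active). The
+// tests only require bit-exact agreement between the C and SIMD versions.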
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+typedef ::testing::tuple<const lowbd_pixel_proj_error_func>
+ PixelProjErrorTestParam;
+
+class PixelProjErrorTest
+ : public ::testing::TestWithParam<PixelProjErrorTestParam> {
+ public:
+ virtual void SetUp() {
+ target_func_ = GET_PARAM(0);
+ src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(uint8_t)));
+ dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(uint8_t)));
+ flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(int32_t)));
+ flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK *
+ sizeof(int32_t)));
+ }
+ virtual void TearDown() {
+ aom_free(src_);
+ aom_free(dgd_);
+ aom_free(flt0_);
+ aom_free(flt1_);
+ }
+ void runPixelProjErrorTest(int32_t run_times);
+ void runPixelProjErrorTest_ExtremeValues();
+
+ private:
+ lowbd_pixel_proj_error_func target_func_;
+ ACMRandom rng_;
+ uint8_t *src_;
+ uint8_t *dgd_;
+ int32_t *flt0_;
+ int32_t *flt1_;
+};
+
+void PixelProjErrorTest::runPixelProjErrorTest(int32_t run_times) {
+ int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
+ int v_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1;
+ const int dgd_stride = MAX_DATA_BLOCK;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int flt0_stride = MAX_DATA_BLOCK;
+ const int flt1_stride = MAX_DATA_BLOCK;
+ sgr_params_type params;
+ int xq[2];
+ const int iters = run_times == 1 ? kIterations : 4;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ int64_t err_ref = 0, err_test = 1;
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_[i] = rng_.Rand8();
+ src_[i] = rng_.Rand8();
+ flt0_[i] = rng_.Rand15Signed();
+ flt1_[i] = rng_.Rand15Signed();
+ }
+ xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+ xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+ params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+ params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+ params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2);
+ params.s[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2);
+ uint8_t *dgd = dgd_;
+ uint8_t *src = src_;
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ err_ref = av1_lowbd_pixel_proj_error_c(src, h_end, v_end, src_stride, dgd,
+ dgd_stride, flt0_, flt0_stride,
+ flt1_, flt1_stride, xq, &params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ err_test =
+ target_func_(src, h_end, v_end, src_stride, dgd, dgd_stride, flt0_,
+ flt0_stride, flt1_, flt1_stride, xq, &params);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0],
+ params.r[1], h_end, v_end, time1, time2, time1 / time2);
+ }
+ ASSERT_EQ(err_ref, err_test);
+ }
+}
+
+void PixelProjErrorTest::runPixelProjErrorTest_ExtremeValues() {
+ const int h_start = 0;
+ int h_end = 192;
+ const int v_start = 0;
+ int v_end = 192;
+ const int dgd_stride = MAX_DATA_BLOCK;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int flt0_stride = MAX_DATA_BLOCK;
+ const int flt1_stride = MAX_DATA_BLOCK;
+ sgr_params_type params;
+ int xq[2];
+ const int iters = kIterations;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ int64_t err_ref = 0, err_test = 1;
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_[i] = 0;
+ src_[i] = 255;
+ flt0_[i] = rng_.Rand15Signed();
+ flt1_[i] = rng_.Rand15Signed();
+ }
+ xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+ xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS);
+ params.r[0] = rng_.Rand8() % MAX_RADIUS;
+ params.r[1] = rng_.Rand8() % MAX_RADIUS;
+ params.s[0] = rng_.Rand8() % MAX_RADIUS;
+ params.s[1] = rng_.Rand8() % MAX_RADIUS;
+ uint8_t *dgd = dgd_;
+ uint8_t *src = src_;
+
+ err_ref = av1_lowbd_pixel_proj_error_c(
+ src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride,
+ flt0_, flt0_stride, flt1_, flt1_stride, xq, &params);
+
+ err_test = target_func_(src, h_end - h_start, v_end - v_start, src_stride,
+ dgd, dgd_stride, flt0_, flt0_stride, flt1_,
+ flt1_stride, xq, &params);
+
+ ASSERT_EQ(err_ref, err_test);
+ }
+}
+
+TEST_P(PixelProjErrorTest, RandomValues) { runPixelProjErrorTest(1); }
+
+TEST_P(PixelProjErrorTest, ExtremeValues) {
+ runPixelProjErrorTest_ExtremeValues();
+}
+
+TEST_P(PixelProjErrorTest, DISABLED_Speed) { runPixelProjErrorTest(200000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, PixelProjErrorTest,
+ ::testing::Values(av1_lowbd_pixel_proj_error_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_CASE_P(AVX2, PixelProjErrorTest,
+ ::testing::Values(av1_lowbd_pixel_proj_error_avx2));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/qm_test.cc b/third_party/aom/test/qm_test.cc
new file mode 100644
index 000000000..c87506b41
--- /dev/null
+++ b/third_party/aom/test/qm_test.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class QMTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ QMTest() : EncoderTest(GET_PARAM(0)) {}
+ virtual ~QMTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ set_cpu_used_ = GET_PARAM(2);
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_ENABLE_QM, 1);
+ encoder->Control(AV1E_SET_QM_MIN, qm_min_);
+ encoder->Control(AV1E_SET_QM_MAX, qm_max_);
+
+ encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+ }
+ }
+
+ void DoTest(int qm_min, int qm_max) {
+ qm_min_ = qm_min;
+ qm_max_ = qm_max;
+ cfg_.kf_max_dist = 12;
+ cfg_.rc_min_quantizer = 8;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 6;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_target_bitrate = 300;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 15);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ int set_cpu_used_;
+ int qm_min_;
+ int qm_max_;
+};
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); }
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); }
+
+// encodes and decodes without a mismatch.
+TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); }
+
+AV1_INSTANTIATE_TEST_CASE(QMTest,
+ ::testing::Values(::libaom_test::kRealTime,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(5, 9));
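+// Note that ::testing::Range(5, 9) is half-open, so cpu-used values 5 through
+// 8 are exercised in both the realtime and one-pass good modes.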
+} // namespace
diff --git a/third_party/aom/test/quantize_func_test.cc b/third_party/aom/test/quantize_func_test.cc
new file mode 100644
index 000000000..554d0c721
--- /dev/null
+++ b/third_party/aom/test/quantize_func_test.cc
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/encoder.h"
+#include "av1/common/scan.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+#define QUAN_PARAM_LIST \
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, \
+ const int16_t *round_ptr, const int16_t *quant_ptr, \
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, \
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, \
+ const int16_t *scan, const int16_t *iscan
+
+typedef void (*QuantizeFunc)(QUAN_PARAM_LIST);
+typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST, int log_scale);
+
+#define HBD_QUAN_FUNC \
+ fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, log_scale)
+
+#define LBD_QUAN_FUNC \
+ fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan)
+
+template <QuantizeFuncHbd fn>
+void highbd_quan16x16_wrapper(QUAN_PARAM_LIST) {
+ const int log_scale = 0;
+ HBD_QUAN_FUNC;
+}
+
+template <QuantizeFuncHbd fn>
+void highbd_quan32x32_wrapper(QUAN_PARAM_LIST) {
+ const int log_scale = 1;
+ HBD_QUAN_FUNC;
+}
+
+template <QuantizeFuncHbd fn>
+void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) {
+ const int log_scale = 2;
+ HBD_QUAN_FUNC;
+}
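+// The wrappers above bind log_scale (0, 1 and 2 for 16x16, 32x32 and 64x64
+// respectively) so that the high bit-depth quantizers, which take an extra
+// log_scale argument, can be driven through the same QuantizeFunc interface
+// as the low bit-depth ones.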
+
+typedef enum { TYPE_B, TYPE_DC, TYPE_FP } QuantType;
+
+using ::testing::tuple;
+typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t>
+ QuantizeParam;
+
+typedef struct {
+ QUANTS quant;
+ Dequants dequant;
+} QuanTable;
+
+const int kTestNum = 1000;
+
+class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
+ protected:
+ QuantizeTest()
+ : quant_ref_(GET_PARAM(0)), quant_(GET_PARAM(1)), tx_size_(GET_PARAM(2)),
+ type_(GET_PARAM(3)), bd_(GET_PARAM(4)) {}
+
+ virtual ~QuantizeTest() {}
+
+ virtual void SetUp() {
+ qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_)));
+ const int n_coeffs = coeff_num();
+ coeff_ = reinterpret_cast<tran_low_t *>(
+ aom_memalign(32, 6 * n_coeffs * sizeof(tran_low_t)));
+ InitQuantizer();
+ }
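+  // coeff_ is a single 6 * n_coeffs allocation that QuantizeRun() and the
+  // speed test slice into: input coefficients, reference qcoeff/dqcoeff,
+  // test qcoeff/dqcoeff and, in the final slot, the eob values.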
+
+ virtual void TearDown() {
+ aom_free(qtab_);
+ qtab_ = NULL;
+ aom_free(coeff_);
+ coeff_ = NULL;
+ libaom_test::ClearSystemState();
+ }
+
+ void InitQuantizer() {
+ av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant);
+ }
+
+ void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) {
+ tran_low_t *coeff_ptr = coeff_;
+ const intptr_t n_coeffs = coeff_num();
+
+ tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs;
+ tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs;
+
+ tran_low_t *qcoeff = dqcoeff_ref + n_coeffs;
+ tran_low_t *dqcoeff = qcoeff + n_coeffs;
+ uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
+
+ // Testing uses 2-D DCT scan order table
+ const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
+
+ // Testing uses luminance quantization table
+ const int16_t *zbin = qtab_->quant.y_zbin[q];
+
+ const int16_t *round = 0;
+ const int16_t *quant = 0;
+ if (type_ == TYPE_B) {
+ round = qtab_->quant.y_round[q];
+ quant = qtab_->quant.y_quant[q];
+ } else if (type_ == TYPE_FP) {
+ round = qtab_->quant.y_round_fp[q];
+ quant = qtab_->quant.y_quant_fp[q];
+ }
+
+ const int16_t *quant_shift = qtab_->quant.y_quant_shift[q];
+ const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
+
+ for (int i = 0; i < test_num; ++i) {
+ if (is_loop) FillCoeffRandom();
+
+ memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref));
+
+ quant_ref_(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift,
+ qcoeff_ref, dqcoeff_ref, dequant, &eob[0], sc->scan,
+ sc->iscan);
+
+ ASM_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, zbin, round, quant,
+ quant_shift, qcoeff, dqcoeff, dequant,
+ &eob[1], sc->scan, sc->iscan));
+
+ for (int j = 0; j < n_coeffs; ++j) {
+ ASSERT_EQ(qcoeff_ref[j], qcoeff[j])
+ << "Q mismatch on test: " << i << " at position: " << j
+ << " Q: " << q << " coeff: " << coeff_ptr[j];
+ }
+
+ for (int j = 0; j < n_coeffs; ++j) {
+ ASSERT_EQ(dqcoeff_ref[j], dqcoeff[j])
+ << "Dq mismatch on test: " << i << " at position: " << j
+ << " Q: " << q << " coeff: " << coeff_ptr[j];
+ }
+
+ ASSERT_EQ(eob[0], eob[1])
+ << "eobs mismatch on test: " << i << " Q: " << q;
+ }
+ }
+
+ void CompareResults(const tran_low_t *buf_ref, const tran_low_t *buf,
+ int size, const char *text, int q, int number) {
+ int i;
+ for (i = 0; i < size; ++i) {
+ ASSERT_EQ(buf_ref[i], buf[i]) << text << " mismatch on test: " << number
+ << " at position: " << i << " Q: " << q;
+ }
+ }
+
+ int coeff_num() const { return av1_get_max_eob(tx_size_); }
+
+ void FillCoeff(tran_low_t c) {
+ const int n_coeffs = coeff_num();
+ for (int i = 0; i < n_coeffs; ++i) {
+ coeff_[i] = c;
+ }
+ }
+
+ void FillCoeffRandom() {
+ const int n_coeffs = coeff_num();
+ FillCoeffZero();
+ int num = rnd_.Rand16() % n_coeffs;
+ for (int i = 0; i < num; ++i) {
+ coeff_[i] = GetRandomCoeff();
+ }
+ }
+
+ void FillCoeffZero() { FillCoeff(0); }
+
+ void FillCoeffConstant() {
+ tran_low_t c = GetRandomCoeff();
+ FillCoeff(c);
+ }
+
+ void FillDcOnly() {
+ FillCoeffZero();
+ coeff_[0] = GetRandomCoeff();
+ }
+
+ void FillDcLargeNegative() {
+ FillCoeffZero();
+ // Generate a qcoeff which contains 512/-512 (0x0100/0xFE00) to catch issues
+ // like BUG=883 where the constant being compared was incorrectly
+ // initialized.
+ coeff_[0] = -8191;
+ }
+
+ tran_low_t GetRandomCoeff() {
+ tran_low_t coeff;
+ if (bd_ == AOM_BITS_8) {
+ coeff =
+ clamp(static_cast<int16_t>(rnd_.Rand16()), INT16_MIN + 1, INT16_MAX);
+ } else {
+ tran_low_t min = -(1 << (7 + bd_));
+ tran_low_t max = -min - 1;
+ coeff = clamp(static_cast<tran_low_t>(rnd_.Rand31()), min, max);
+ }
+ return coeff;
+ }
+
+ ACMRandom rnd_;
+ QuanTable *qtab_;
+ tran_low_t *coeff_;
+ QuantizeFunc quant_ref_;
+ QuantizeFunc quant_;
+ TX_SIZE tx_size_;
+ QuantType type_;
+ aom_bit_depth_t bd_;
+};
+
+TEST_P(QuantizeTest, ZeroInput) {
+ FillCoeffZero();
+ QuantizeRun(false);
+}
+
+TEST_P(QuantizeTest, LargeNegativeInput) {
+ FillDcLargeNegative();
+ QuantizeRun(false, 0, 1);
+}
+
+TEST_P(QuantizeTest, DcOnlyInput) {
+ FillDcOnly();
+ QuantizeRun(false, 0, 1);
+}
+
+TEST_P(QuantizeTest, RandomInput) { QuantizeRun(true, 0, kTestNum); }
+
+TEST_P(QuantizeTest, MultipleQ) {
+ for (int q = 0; q < QINDEX_RANGE; ++q) {
+ QuantizeRun(true, q, kTestNum);
+ }
+}
+
+// Force the coeff to be half the value of the dequant. This exposes a
+// mismatch found in av1_quantize_fp_sse2().
+TEST_P(QuantizeTest, CoeffHalfDequant) {
+ FillCoeff(16);
+ QuantizeRun(false, 25, 1);
+}
+
+TEST_P(QuantizeTest, DISABLED_Speed) {
+ tran_low_t *coeff_ptr = coeff_;
+ const intptr_t n_coeffs = coeff_num();
+
+ tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs;
+ tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs;
+
+ tran_low_t *qcoeff = dqcoeff_ref + n_coeffs;
+ tran_low_t *dqcoeff = qcoeff + n_coeffs;
+ uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
+
+ // Testing uses 2-D DCT scan order table
+ const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
+
+ // Testing uses luminance quantization table
+ const int q = 22;
+ const int16_t *zbin = qtab_->quant.y_zbin[q];
+ const int16_t *round_fp = qtab_->quant.y_round_fp[q];
+ const int16_t *quant_fp = qtab_->quant.y_quant_fp[q];
+ const int16_t *quant_shift = qtab_->quant.y_quant_shift[q];
+ const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
+ const int kNumTests = 5000000;
+ aom_usec_timer timer;
+
+ FillCoeffRandom();
+
+ aom_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
+ dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("Elapsed time: %d us\n", elapsed_time);
+}
+
+using ::testing::make_tuple;
+
+#if HAVE_AVX2
+const QuantizeParam kQParamArrayAvx2[] = {
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_16X16, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_4X16, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_16X4, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_32X8, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, TX_8X32, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_32X32,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_16X64,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, TX_64X16,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2, TX_64X64,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
+ TYPE_FP, AOM_BITS_10),
+ make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, TX_16X16,
+ TYPE_FP, AOM_BITS_12),
+ make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, TX_32X32,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, TX_32X32,
+ TYPE_FP, AOM_BITS_10),
+ make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, TX_32X32,
+ TYPE_FP, AOM_BITS_12),
+ make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, TX_64X64,
+ TYPE_FP, AOM_BITS_8),
+ make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, TX_64X64,
+ TYPE_FP, AOM_BITS_10),
+ make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>,
+ &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, TX_64X64,
+ TYPE_FP, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
+ TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
+ TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, TX_16X16,
+ TYPE_B, AOM_BITS_12),
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, QuantizeTest,
+ ::testing::ValuesIn(kQParamArrayAvx2));
+#endif // HAVE_AVX2
+
+#if HAVE_SSE2
+const QuantizeParam kQParamArraySSE2[] = {
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_16X16, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_4X16, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_16X4, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_8X32, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, TX_32X8, TYPE_FP,
+ AOM_BITS_8),
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_sse2, TX_16X16, TYPE_B,
+ AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
+ TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
+ TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, TX_16X16,
+ TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
+ TX_32X32, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
+ TX_32X32, TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
+ TX_32X32, TYPE_B, AOM_BITS_12),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest,
+ ::testing::ValuesIn(kQParamArraySSE2));
+#endif
+
+#if HAVE_SSSE3 && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, QuantizeTest,
+ ::testing::Values(make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3,
+ TX_16X16, TYPE_B, AOM_BITS_8)));
+
+// Like libvpx, the 32x32 ssse3 and avx quantize tests do not pass against the
+// C reference.
+// https://bugs.chromium.org/p/webm/issues/detail?id=1448
+INSTANTIATE_TEST_CASE_P(
+ DISABLED_SSSE3_32x32, QuantizeTest,
+ ::testing::Values(make_tuple(&aom_quantize_b_32x32_c,
+ &aom_quantize_b_32x32_ssse3, TX_16X16, TYPE_B,
+ AOM_BITS_8)));
+
+#endif // HAVE_SSSE3 && ARCH_X86_64
+
+#if HAVE_AVX && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(
+ AVX, QuantizeTest,
+ ::testing::Values(
+ make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx, TX_16X16, TYPE_B,
+ AOM_BITS_8),
+ // Although these tests will not pass against _c, test them against each
+ // other so there is some minor checking.
+ make_tuple(&aom_quantize_b_32x32_ssse3, &aom_quantize_b_32x32_avx,
+ TX_32X32, TYPE_B, AOM_BITS_8)));
+
+#endif // HAVE_AVX && ARCH_X86_64
+} // namespace
diff --git a/third_party/aom/test/reconinter_test.cc b/third_party/aom/test/reconinter_test.cc
new file mode 100644
index 000000000..a8536e517
--- /dev/null
+++ b/third_party/aom/test/reconinter_test.cc
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+typedef void (*buildcompdiffwtdmaskd_func)(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int h, int w);
+
+typedef ::testing::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func>
+ BuildCompDiffwtdMaskDParam;
+
+#if HAVE_SSE4_1
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams(
+ buildcompdiffwtdmaskd_func filter) {
+ return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL),
+ ::testing::Values(filter));
+}
+#endif
+
+class BuildCompDiffwtdMaskTest
+ : public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> {
+ public:
+ virtual ~BuildCompDiffwtdMaskTest() {}
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+ void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed,
+ const DIFFWTD_MASK_TYPE type);
+
+ private:
+ ACMRandom rnd_;
+};
+
+typedef void (*buildcompdiffwtdmaskd16_func)(
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd);
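+// The d16 variant builds the DIFFWTD mask from the intermediate 16-bit
+// compound-convolve buffers (CONV_BUF_TYPE) rather than from final 8-bit
+// pixels, which is why it also takes the ConvolveParams rounding state and
+// the bit depth.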
+
+typedef ::testing::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
+ BuildCompDiffwtdMaskD16Param;
+
+#if HAVE_SSE4_1 || HAVE_NEON
+::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
+ buildcompdiffwtdmaskd16_func filter) {
+ return ::testing::Combine(::testing::Range(8, 13, 2),
+ ::testing::Values(filter),
+ ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
+}
+#endif
+class BuildCompDiffwtdMaskD16Test
+ : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
+ public:
+ ~BuildCompDiffwtdMaskD16Test() {}
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+ void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+ protected:
+ void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl);
+ void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl,
+ DIFFWTD_MASK_TYPE mask_type);
+ libaom_test::ACMRandom rnd_;
+}; // class BuildCompDiffwtdMaskD16Test
+
+void BuildCompDiffwtdMaskD16Test::RunCheckOutput(
+ buildcompdiffwtdmaskd16_func test_impl) {
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ const int width = block_size_wide[block_idx];
+ const int height = block_size_high[block_idx];
+ DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+ ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd);
+
+ int in_precision =
+ bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+
+ for (int i = 0; i < MAX_SB_SQUARE; i++) {
+ src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+ src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+ }
+
+ for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) {
+ av1_build_compound_diffwtd_mask_d16_c(
+ mask_ref, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
+ height, width, &conv_params, bd);
+
+ test_impl(mask_test, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width,
+ height, width, &conv_params, bd);
+
+ for (int r = 0; r < height; ++r) {
+ for (int c = 0; c < width; ++c) {
+ ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+ << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n"
+ << " Pixel mismatch at index "
+ << "[" << r << "," << c << "] "
+ << " @ " << width << "x" << height << " inv " << mask_type;
+ }
+ }
+ }
+}
+
+void BuildCompDiffwtdMaskD16Test::RunSpeedTest(
+ buildcompdiffwtdmaskd16_func test_impl, DIFFWTD_MASK_TYPE mask_type) {
+ const int block_idx = GET_PARAM(2);
+ const int bd = GET_PARAM(0);
+ const int width = block_size_wide[block_idx];
+ const int height = block_size_high[block_idx];
+ DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]);
+
+ ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd);
+
+ int in_precision =
+ bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2;
+
+ for (int i = 0; i < MAX_SB_SQUARE; i++) {
+ src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+ src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1);
+ }
+
+ const int num_loops = 10000000 / (width + height);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1,
+ width, height, width, &conv_params,
+ bd);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(mask, mask_type, src0, width, src1, width, height, width,
+ &conv_params, bd);
+
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("av1_build_compound_diffwtd_mask_d16 %3dx%-3d: %7.2f \n", width,
+ height, elapsed_time / double(elapsed_time1));
+}
+#if HAVE_SSE4_1
+void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl,
+ const int is_speed,
+ const DIFFWTD_MASK_TYPE type) {
+ const int sb_type = GET_PARAM(0);
+ const int width = block_size_wide[sb_type];
+ const int height = block_size_high[sb_type];
+ DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]);
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int i = 0; i < width * height; i++) {
+ src0[i] = rnd.Rand8();
+ src1[i] = rnd.Rand8();
+ }
+ const int run_times = is_speed ? (10000000 / (width + height)) : 1;
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1, width,
+ height, width);
+ }
+ const double t1 = get_time_mark(&timer);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ test_impl(mask_test, type, src0, width, src1, width, height, width);
+ }
+ const double t2 = get_time_mark(&timer);
+ if (is_speed) {
+ printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2);
+ printf("(%3.2f)\n", t1 / t2);
+ }
+ for (int r = 0; r < height; ++r) {
+ for (int c = 0; c < width; ++c) {
+ ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width])
+ << "[" << r << "," << c << "] " << run_times << " @ " << width << "x"
+ << height << " inv " << type;
+ }
+ }
+}
+
+TEST_P(BuildCompDiffwtdMaskTest, match) {
+ RunTest(GET_PARAM(1), 0, DIFFWTD_38);
+ RunTest(GET_PARAM(1), 0, DIFFWTD_38_INV);
+}
+TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) {
+ RunTest(GET_PARAM(1), 1, DIFFWTD_38);
+ RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV);
+}
+#endif
+TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
+ RunCheckOutput(GET_PARAM(1));
+}
+
+TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(1), DIFFWTD_38);
+ RunSpeedTest(GET_PARAM(1), DIFFWTD_38_INV);
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, BuildCompDiffwtdMaskTest,
+ BuildParams(av1_build_compound_diffwtd_mask_sse4_1));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BuildCompDiffwtdMaskD16Test,
+ BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, BuildCompDiffwtdMaskTest,
+ BuildParams(av1_build_compound_diffwtd_mask_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, BuildCompDiffwtdMaskD16Test,
+ BuildParams(av1_build_compound_diffwtd_mask_d16_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, BuildCompDiffwtdMaskD16Test,
+ BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
+#endif
+
+} // namespace
diff --git a/third_party/aom/test/register_state_check.h b/third_party/aom/test/register_state_check.h
new file mode 100644
index 000000000..d404621dd
--- /dev/null
+++ b/third_party/aom/test/register_state_check.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_REGISTER_STATE_CHECK_H_
+#define AOM_TEST_REGISTER_STATE_CHECK_H_
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// ASM_REGISTER_STATE_CHECK(asm_function)
+// Minimally validates the environment pre & post function execution. This
+// variant should be used with assembly functions which are not expected to
+// fully restore the system state. See platform implementations of
+// RegisterStateCheck for details.
+//
+// API_REGISTER_STATE_CHECK(api_function)
+// Performs all the checks done by ASM_REGISTER_STATE_CHECK() and any
+// additional checks to ensure the environment is in a consistent state pre &
+// post function execution. This variant should be used with API functions.
+// See platform implementations of RegisterStateCheckXXX for details.
+//
+
+#if defined(_WIN64) && ARCH_X86_64
+
+#undef NOMINMAX
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <winnt.h>
+
+inline bool operator==(const M128A &lhs, const M128A &rhs) {
+ return (lhs.Low == rhs.Low && lhs.High == rhs.High);
+}
+
+namespace libaom_test {
+
+// Compares the state of xmm[6-15] at construction with their state at
+// destruction. These registers should be preserved by the callee on
+// Windows x64.
+class RegisterStateCheck {
+ public:
+ RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); }
+ ~RegisterStateCheck() { Check(); }
+
+ private:
+ static bool StoreRegisters(CONTEXT *const context) {
+ const HANDLE this_thread = GetCurrentThread();
+ EXPECT_TRUE(this_thread != NULL);
+ context->ContextFlags = CONTEXT_FLOATING_POINT;
+ const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
+ EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
+ return context_saved;
+ }
+
+  // Compares the register state captured at construction with the current
+  // state, flagging any modified xmm6-xmm15 register via gtest assertions.
+ void Check() const {
+ ASSERT_TRUE(initialized_);
+ CONTEXT post_context;
+ ASSERT_TRUE(StoreRegisters(&post_context));
+
+ const M128A *xmm_pre = &pre_context_.Xmm6;
+ const M128A *xmm_post = &post_context.Xmm6;
+ for (int i = 6; i <= 15; ++i) {
+ EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!";
+ ++xmm_pre;
+ ++xmm_post;
+ }
+ }
+
+ bool initialized_;
+ CONTEXT pre_context_;
+};
+
+#define ASM_REGISTER_STATE_CHECK(statement) \
+ do { \
+ libaom_test::RegisterStateCheck reg_check; \
+ statement; \
+ } while (false)
+
+} // namespace libaom_test
+
+#else
+
+namespace libaom_test {
+
+class RegisterStateCheck {};
+#define ASM_REGISTER_STATE_CHECK(statement) statement
+
+} // namespace libaom_test
+
+#endif // _WIN64 && ARCH_X86_64
+
+#if ARCH_X86 || ARCH_X86_64
+#if defined(__GNUC__)
+
+namespace libaom_test {
+
+// Checks the FPU tag word pre/post execution to ensure emms has been called.
+class RegisterStateCheckMMX {
+ public:
+ RegisterStateCheckMMX() {
+ __asm__ volatile("fstenv %0" : "=rm"(pre_fpu_env_));
+ }
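+  // fstenv stores the 28-byte x87 environment; element 4 of the uint16_t view
+  // is the tag word checked in Check(), and a value of 0xffff means all
+  // x87/MMX registers are tagged empty.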
+ ~RegisterStateCheckMMX() { Check(); }
+
+ private:
+  // Checks the FPU tag word pre/post execution, failing the test if it is not
+  // cleared to 0xffff (i.e. if emms was not called).
+ void Check() const {
+ EXPECT_EQ(0xffff, pre_fpu_env_[4])
+ << "FPU was in an inconsistent state prior to call";
+
+ uint16_t post_fpu_env[14];
+ __asm__ volatile("fstenv %0" : "=rm"(post_fpu_env));
+ EXPECT_EQ(0xffff, post_fpu_env[4])
+ << "FPU was left in an inconsistent state after call";
+ }
+
+ uint16_t pre_fpu_env_[14];
+};
+
+#define API_REGISTER_STATE_CHECK(statement) \
+ do { \
+ libaom_test::RegisterStateCheckMMX reg_check; \
+ ASM_REGISTER_STATE_CHECK(statement); \
+ } while (false)
+
+} // namespace libaom_test
+
+#endif // __GNUC__
+#endif // ARCH_X86 || ARCH_X86_64
+
+#ifndef API_REGISTER_STATE_CHECK
+#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK
+#endif
+
+#endif // AOM_TEST_REGISTER_STATE_CHECK_H_
diff --git a/third_party/aom/test/resize_test.cc b/third_party/aom/test/resize_test.cc
new file mode 100644
index 000000000..b270b8362
--- /dev/null
+++ b/third_party/aom/test/resize_test.cc
@@ -0,0 +1,642 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
+#include "test/util.h"
+
+// Enable(1) or Disable(0) writing of the compressed bitstream.
+#define WRITE_COMPRESSED_STREAM 0
+
+namespace {
+
+#if WRITE_COMPRESSED_STREAM
+static void mem_put_le16(char *const mem, unsigned int val) {
+ mem[0] = val;
+ mem[1] = val >> 8;
+}
+
+static void mem_put_le32(char *const mem, unsigned int val) {
+ mem[0] = val;
+ mem[1] = val >> 8;
+ mem[2] = val >> 16;
+ mem[3] = val >> 24;
+}
+
+static void write_ivf_file_header(const aom_codec_enc_cfg_t *const cfg,
+ int frame_cnt, FILE *const outfile) {
+ char header[32];
+
+ header[0] = 'D';
+ header[1] = 'K';
+ header[2] = 'I';
+ header[3] = 'F';
+ mem_put_le16(header + 4, 0); /* version */
+ mem_put_le16(header + 6, 32); /* headersize */
+ mem_put_le32(header + 8, 0x30395056); /* fourcc (av1) */
+ mem_put_le16(header + 12, cfg->g_w); /* width */
+ mem_put_le16(header + 14, cfg->g_h); /* height */
+ mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
+ mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
+ mem_put_le32(header + 24, frame_cnt); /* length */
+ mem_put_le32(header + 28, 0); /* unused */
+
+ (void)fwrite(header, 1, 32, outfile);
+}
+
+static void write_ivf_frame_size(FILE *const outfile, const size_t size) {
+ char header[4];
+ mem_put_le32(header, static_cast<unsigned int>(size));
+ (void)fwrite(header, 1, 4, outfile);
+}
+
+static void write_ivf_frame_header(const aom_codec_cx_pkt_t *const pkt,
+ FILE *const outfile) {
+ char header[12];
+ aom_codec_pts_t pts;
+
+ if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return;
+
+ pts = pkt->data.frame.pts;
+ mem_put_le32(header, static_cast<unsigned int>(pkt->data.frame.sz));
+ mem_put_le32(header + 4, pts & 0xFFFFFFFF);
+ mem_put_le32(header + 8, pts >> 32);
+
+ (void)fwrite(header, 1, 12, outfile);
+}
+#endif // WRITE_COMPRESSED_STREAM
+
+const unsigned int kInitialWidth = 320;
+const unsigned int kInitialHeight = 240;
+
+struct FrameInfo {
+ FrameInfo(aom_codec_pts_t _pts, unsigned int _w, unsigned int _h)
+ : pts(_pts), w(_w), h(_h) {}
+
+ aom_codec_pts_t pts;
+ unsigned int w;
+ unsigned int h;
+};
+
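+// Drives the external-resize schedule: the resolution steps between full, 3/4
+// and 1/2 size every ten frames, drops to 1/4 size for frames 110-119 and,
+// when flag_codec is 1 (AV1), swaps width and height for frames 120-139
+// before returning to full size.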
+void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
+ unsigned int initial_h, unsigned int *w,
+ unsigned int *h, int flag_codec) {
+ if (frame < 10) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 20) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 30) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 40) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 50) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 60) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 70) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 80) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 90) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 100) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 110) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ // Go down very low
+ if (frame < 120) {
+ *w = initial_w / 4;
+ *h = initial_h / 4;
+ return;
+ }
+ if (flag_codec == 1) {
+ // Cases that only works for AV1.
+ // For AV1: Swap width and height of original.
+ if (frame < 140) {
+ *w = initial_h;
+ *h = initial_w;
+ return;
+ }
+ }
+ *w = initial_w;
+ *h = initial_h;
+}
+
+class ResizingVideoSource : public ::libaom_test::DummyVideoSource {
+ public:
+ ResizingVideoSource() {
+ SetSize(kInitialWidth, kInitialHeight);
+ limit_ = 150;
+ }
+ int flag_codec_;
+ virtual ~ResizingVideoSource() {}
+
+ protected:
+ virtual void Next() {
+ ++frame_;
+ unsigned int width;
+ unsigned int height;
+ ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height,
+ flag_codec_);
+ SetSize(width, height);
+ FillFrame();
+ }
+};
+
+class ResizeTest
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ResizeTest() : EncoderTest(GET_PARAM(0)) {}
+
+ virtual ~ResizeTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) {
+ frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+ }
+
+ std::vector<FrameInfo> frame_info_list_;
+};
+
+TEST_P(ResizeTest, TestExternalResizeWorks) {
+ ResizingVideoSource video;
+ video.flag_codec_ = 0;
+ cfg_.g_lag_in_frames = 0;
+  // We use max(kInitialWidth, kInitialHeight) because the width and height of
+  // the frame may be swapped during the test.
+ cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
+ AOMMAX(kInitialWidth, kInitialHeight);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Check we decoded the same number of frames as we attempted to encode
+ ASSERT_EQ(frame_info_list_.size(), video.limit());
+
+ for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+ info != frame_info_list_.end(); ++info) {
+ const unsigned int frame = static_cast<unsigned>(info->pts);
+ unsigned int expected_w;
+ unsigned int expected_h;
+ ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
+ &expected_h, 0);
+ EXPECT_EQ(expected_w, info->w)
+ << "Frame " << frame << " had unexpected width";
+ EXPECT_EQ(expected_h, info->h)
+ << "Frame " << frame << " had unexpected height";
+ }
+}
+
+const unsigned int kStepDownFrame = 3;
+const unsigned int kStepUpFrame = 6;
+
+class ResizeInternalTestLarge : public ResizeTest {
+ protected:
+#if WRITE_COMPRESSED_STREAM
+ ResizeInternalTestLarge()
+ : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {}
+#else
+ ResizeInternalTestLarge() : ResizeTest(), frame0_psnr_(0.0) {}
+#endif
+
+ virtual ~ResizeInternalTestLarge() {}
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+#if WRITE_COMPRESSED_STREAM
+ outfile_ = fopen("av10-2-05-resize.ivf", "wb");
+#endif
+ }
+
+ virtual void EndPassHook() {
+#if WRITE_COMPRESSED_STREAM
+ if (outfile_) {
+ if (!fseek(outfile_, 0, SEEK_SET))
+ write_ivf_file_header(&cfg_, out_frames_, outfile_);
+ fclose(outfile_);
+ outfile_ = NULL;
+ }
+#endif
+ }
+
+ virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) {
+ if (change_config_) {
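+ // Frame 0: request 1/2 x 1/2 scaling; frame 1: restore normal scaling and
+ // pin the quantizer to new_q via a config update.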
+ int new_q = 60;
+ if (video->frame() == 0) {
+ struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO };
+ encoder->Control(AOME_SET_SCALEMODE, &mode);
+ }
+ if (video->frame() == 1) {
+ struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
+ encoder->Control(AOME_SET_SCALEMODE, &mode);
+ cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = new_q;
+ encoder->Config(&cfg_);
+ }
+ } else {
+ if (video->frame() >= kStepDownFrame && video->frame() < kStepUpFrame) {
+ struct aom_scaling_mode mode = { AOME_FOURFIVE, AOME_THREEFIVE };
+ encoder->Control(AOME_SET_SCALEMODE, &mode);
+ }
+ if (video->frame() >= kStepUpFrame) {
+ struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL };
+ encoder->Control(AOME_SET_SCALEMODE, &mode);
+ }
+ }
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
+ EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.5);
+ }
+
+#if WRITE_COMPRESSED_STREAM
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ ++out_frames_;
+
+ // Write initial file header if first frame.
+ if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_);
+
+ // Write frame header and data.
+ write_ivf_frame_header(pkt, outfile_);
+ (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+ }
+#endif
+
+ double frame0_psnr_;
+ bool change_config_;
+#if WRITE_COMPRESSED_STREAM
+ FILE *outfile_;
+ unsigned int out_frames_;
+#endif
+};
+
+TEST_P(ResizeInternalTestLarge, TestInternalResizeWorks) {
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 10);
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ change_config_ = false;
+
+ // q picked such that initial keyframe on this clip is ~30dB PSNR
+ cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+
+ // If the number of frames being encoded is smaller than g_lag_in_frames,
+ // the encoded frame is unavailable through the current API and comparing
+ // frames to detect a mismatch would not be possible. Set
+ // g_lag_in_frames = 0 to work around this.
+ cfg_.g_lag_in_frames = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+ info != frame_info_list_.end(); ++info) {
+ const aom_codec_pts_t pts = info->pts;
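+ // For frames in [kStepDownFrame, kStepUpFrame) the encoder was asked to
+ // scale by 4/5 horizontally and 3/5 vertically (AOME_FOURFIVE x
+ // AOME_THREEFIVE): 352 * 4 / 5 = 281.6 and 288 * 3 / 5 = 172.8, hence the
+ // 282x173 asserted below.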
+ if (pts >= kStepDownFrame && pts < kStepUpFrame) {
+ ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";
+ ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";
+ } else {
+ EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width";
+ EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height";
+ }
+ }
+}
+
+TEST_P(ResizeInternalTestLarge, TestInternalResizeChangeConfig) {
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 10);
+ cfg_.g_w = 352;
+ cfg_.g_h = 288;
+ change_config_ = true;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+class ResizeRealtimeTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {}
+ virtual ~ResizeRealtimeTest() {}
+
+ virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
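+ // On the first frame, enable AQ mode 3 (cyclic refresh) and apply the
+ // cpu-used speed setting taken from the test parameter.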
+ encoder->Control(AV1E_SET_AQ_MODE, 3);
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ }
+
+ if (change_bitrate_ && video->frame() == 120) {
+ change_bitrate_ = false;
+ cfg_.rc_target_bitrate = 500;
+ encoder->Config(&cfg_);
+ }
+ }
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ set_cpu_used_ = GET_PARAM(2);
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) {
+ frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
+ }
+
+ virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+ double mismatch_psnr = compute_psnr(img1, img2);
+ mismatch_psnr_ += mismatch_psnr;
+ ++mismatch_nframes_;
+ }
+
+ unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+
+ void DefaultConfig() {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_undershoot_pct = 50;
+ cfg_.rc_overshoot_pct = 50;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.kf_mode = AOM_KF_AUTO;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+ // Enable dropped frames.
+ cfg_.rc_dropframe_thresh = 1;
+ // Disable error_resilience mode.
+ cfg_.g_error_resilient = 0;
+ // Run at low bitrate.
+ cfg_.rc_target_bitrate = 200;
+ // We use max(kInitialWidth, kInitialHeight) because during the test the
+ // width and height of the frame are swapped.
+ cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height =
+ AOMMAX(kInitialWidth, kInitialHeight);
+ }
+
+ std::vector<FrameInfo> frame_info_list_;
+ int set_cpu_used_;
+ bool change_bitrate_;
+ double mismatch_psnr_;
+ int mismatch_nframes_;
+};
+
+TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
+ ResizingVideoSource video;
+ video.flag_codec_ = 1;
+ DefaultConfig();
+ change_bitrate_ = false;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Check we decoded the same number of frames as we attempted to encode
+ ASSERT_EQ(frame_info_list_.size(), video.limit());
+
+ for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+ info != frame_info_list_.end(); ++info) {
+ const unsigned int frame = static_cast<unsigned>(info->pts);
+ unsigned int expected_w;
+ unsigned int expected_h;
+ ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
+ &expected_h, 1);
+ EXPECT_EQ(expected_w, info->w)
+ << "Frame " << frame << " had unexpected width";
+ EXPECT_EQ(expected_h, info->h)
+ << "Frame " << frame << " had unexpected height";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+ }
+}
+
+// Verify the dynamic resizer behavior for real-time, 1-pass CBR mode.
+// Run at a low bitrate, with resize_allowed = 1, and verify that we get
+// one resize-down event.
+TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDown) {
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 299);
+ DefaultConfig();
+ cfg_.g_w = 352;
+ cfg_.g_h = 288;
+ change_bitrate_ = false;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ unsigned int last_w = cfg_.g_w;
+ unsigned int last_h = cfg_.g_h;
+ int resize_count = 0;
+ for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+ info != frame_info_list_.end(); ++info) {
+ if (info->w != last_w || info->h != last_h) {
+ // Verify that resize down occurs.
+ ASSERT_LT(info->w, last_w);
+ ASSERT_LT(info->h, last_h);
+ last_w = info->w;
+ last_h = info->h;
+ resize_count++;
+ }
+ }
+
+#if CONFIG_AV1_DECODER
+ // Verify that we get 1 resize down event in this test.
+ ASSERT_EQ(1, resize_count) << "Resizing should occur.";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+ printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
+#endif
+}
+
+// Verify the dynamic resizer behavior for real-time, 1-pass CBR mode.
+// Start at a low target bitrate, raise the bitrate in the middle of the clip,
+// and verify that scaling up occurs after the bitrate change.
+TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDownUpChangeBitRate) {
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 359);
+ DefaultConfig();
+ cfg_.g_w = 352;
+ cfg_.g_h = 288;
+ change_bitrate_ = true;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
+ // Disable dropped frames.
+ cfg_.rc_dropframe_thresh = 0;
+ // Starting bitrate low.
+ cfg_.rc_target_bitrate = 80;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ unsigned int last_w = cfg_.g_w;
+ unsigned int last_h = cfg_.g_h;
+ int resize_count = 0;
+ for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
+ info != frame_info_list_.end(); ++info) {
+ if (info->w != last_w || info->h != last_h) {
+ resize_count++;
+ if (resize_count == 1) {
+ // Verify that resize down occurs.
+ ASSERT_LT(info->w, last_w);
+ ASSERT_LT(info->h, last_h);
+ } else if (resize_count == 2) {
+ // Verify that resize up occurs.
+ ASSERT_GT(info->w, last_w);
+ ASSERT_GT(info->h, last_h);
+ }
+ last_w = info->w;
+ last_h = info->h;
+ }
+ }
+
+#if CONFIG_AV1_DECODER
+ // Verify that we get 2 resize events in this test.
+ ASSERT_EQ(2, resize_count) << "Resizing should occur twice.";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+ printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
+#endif
+}
+
+class ResizeCspTest : public ResizeTest {
+ protected:
+#if WRITE_COMPRESSED_STREAM
+ ResizeCspTest()
+ : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {}
+#else
+ ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {}
+#endif
+
+ virtual ~ResizeCspTest() {}
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+#if WRITE_COMPRESSED_STREAM
+ outfile_ = fopen("av11-2-05-cspchape.ivf", "wb");
+#endif
+ }
+
+ virtual void EndPassHook() {
+#if WRITE_COMPRESSED_STREAM
+ if (outfile_) {
+ if (!fseek(outfile_, 0, SEEK_SET))
+ write_ivf_file_header(&cfg_, out_frames_, outfile_);
+ fclose(outfile_);
+ outfile_ = NULL;
+ }
+#endif
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
+ EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
+ }
+
+#if WRITE_COMPRESSED_STREAM
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ ++out_frames_;
+
+ // Write initial file header if first frame.
+ if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_);
+
+ // Write frame header and data.
+ write_ivf_frame_header(pkt, outfile_);
+ (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_);
+ }
+#endif
+
+ double frame0_psnr_;
+#if WRITE_COMPRESSED_STREAM
+ FILE *outfile_;
+ unsigned int out_frames_;
+#endif
+};
+
+class ResizingCspVideoSource : public ::libaom_test::DummyVideoSource {
+ public:
+ explicit ResizingCspVideoSource(aom_img_fmt_t image_format) {
+ SetSize(kInitialWidth, kInitialHeight);
+ SetImageFormat(image_format);
+ limit_ = 30;
+ }
+
+ virtual ~ResizingCspVideoSource() {}
+};
+
+#if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH)
+TEST_P(ResizeCspTest, DISABLED_TestResizeCspWorks) {
+#else
+TEST_P(ResizeCspTest, TestResizeCspWorks) {
+#endif
+ const aom_img_fmt_t image_formats[] = { AOM_IMG_FMT_I420, AOM_IMG_FMT_I444 };
+ for (size_t i = 0; i < GTEST_ARRAY_SIZE_(image_formats); ++i) {
+ ResizingCspVideoSource video(image_formats[i]);
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_profile = (image_formats[i] == AOM_IMG_FMT_I420) ? 0 : 1;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Check we decoded the same number of frames as we attempted to encode
+ ASSERT_EQ(frame_info_list_.size(), video.limit());
+ frame_info_list_.clear();
+ }
+}
+
+AV1_INSTANTIATE_TEST_CASE(ResizeTest,
+ ::testing::Values(::libaom_test::kRealTime));
+AV1_INSTANTIATE_TEST_CASE(ResizeInternalTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood));
+AV1_INSTANTIATE_TEST_CASE(ResizeRealtimeTest,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(5, 9));
+AV1_INSTANTIATE_TEST_CASE(ResizeCspTest,
+ ::testing::Values(::libaom_test::kRealTime));
+} // namespace
diff --git a/third_party/aom/test/run_encodes.sh b/third_party/aom/test/run_encodes.sh
new file mode 100755
index 000000000..2096d8b15
--- /dev/null
+++ b/third_party/aom/test/run_encodes.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved.
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+# Author: jimbankoski@google.com (Jim Bankoski)
+
+if [[ $# -ne 4 ]]; then
+ echo Encodes all the y4m files in the directory at the bitrates specified by
+ echo the first 3 parameters and stores the results in a subdirectory named by
+ echo the 4th parameter:
+ echo
+ echo Usage: run_encodes.sh start-kbps end-kbps step-kbps output-directory
+ echo Example: run_encodes.sh 200 500 50 baseline
+ exit 1
+fi
+
+s=$1
+e=$2
+step=$3
+newdir=$4
+
+for i in ./*.y4m; do
+ for (( b = $s; b <= $e; b += $step )); do
+ best_encode.sh "$i" "$b"
+ done
+ mv opsnr.stt "$i".stt
+done
+
+mkdir -p "$newdir"
+mv ./*.stt "$newdir"
+mv ./*.webm "$newdir"
diff --git a/third_party/aom/test/sad_test.cc b/third_party/aom/test/sad_test.cc
new file mode 100644
index 000000000..845fe79da
--- /dev/null
+++ b/third_party/aom/test/sad_test.cc
@@ -0,0 +1,1528 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "aom/aom_codec.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
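+// Each test parameter below is a (width, height, function, bit depth) tuple.
+// A bit depth of -1 selects the regular 8-bit code path; 8, 10 and 12 select
+// the high-bit-depth path (see SADTestBase::SetUp).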
+typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride);
+typedef ::testing::tuple<int, int, SadMxNFunc, int> SadMxNParam;
+
+typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred);
+typedef ::testing::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
+
+typedef void (*JntCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const JNT_COMP_PARAMS *jcp_param);
+typedef ::testing::tuple<int, int, JntCompAvgFunc, int> JntCompAvgParam;
+
+typedef unsigned int (*JntSadMxhFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int width, int height);
+typedef ::testing::tuple<int, int, JntSadMxhFunc, int> JntSadMxhParam;
+
+typedef uint32_t (*JntSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const JNT_COMP_PARAMS *jcp_param);
+typedef ::testing::tuple<int, int, JntSadMxNAvgFunc, int> JntSadMxNAvgParam;
+
+typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_ptr[], int ref_stride,
+ uint32_t *sad_array);
+typedef ::testing::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
+
+using libaom_test::ACMRandom;
+
+namespace {
+class SADTestBase : public ::testing::Test {
+ public:
+ SADTestBase(int width, int height, int bit_depth)
+ : width_(width), height_(height), bd_(bit_depth) {}
+
+ static void SetUpTestCase() {
+ source_data8_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kDataBlockSize));
+ reference_data8_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(kDataAlignment, kDataBufferSize));
+ second_pred8_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ comp_pred8_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ comp_pred8_test_ =
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+ source_data16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, kDataBlockSize * sizeof(uint16_t)));
+ reference_data16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
+ second_pred16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ comp_pred16_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ comp_pred16_test_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+ }
+
+ static void TearDownTestCase() {
+ aom_free(source_data8_);
+ source_data8_ = NULL;
+ aom_free(reference_data8_);
+ reference_data8_ = NULL;
+ aom_free(second_pred8_);
+ second_pred8_ = NULL;
+ aom_free(comp_pred8_);
+ comp_pred8_ = NULL;
+ aom_free(comp_pred8_test_);
+ comp_pred8_test_ = NULL;
+ aom_free(source_data16_);
+ source_data16_ = NULL;
+ aom_free(reference_data16_);
+ reference_data16_ = NULL;
+ aom_free(second_pred16_);
+ second_pred16_ = NULL;
+ aom_free(comp_pred16_);
+ comp_pred16_ = NULL;
+ aom_free(comp_pred16_test_);
+ comp_pred16_test_ = NULL;
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ // Handle up to 4 128x128 blocks, with stride up to 256
+ static const int kDataAlignment = 16;
+ static const int kDataBlockSize = 128 * 256;
+ static const int kDataBufferSize = 4 * kDataBlockSize;
+
+ virtual void SetUp() {
+ if (bd_ == -1) {
+ use_high_bit_depth_ = false;
+ bit_depth_ = AOM_BITS_8;
+ source_data_ = source_data8_;
+ reference_data_ = reference_data8_;
+ second_pred_ = second_pred8_;
+ comp_pred_ = comp_pred8_;
+ comp_pred_test_ = comp_pred8_test_;
+ } else {
+ use_high_bit_depth_ = true;
+ bit_depth_ = static_cast<aom_bit_depth_t>(bd_);
+ source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+ reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+ second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+ comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_);
+ comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_);
+ }
+ mask_ = (1 << bit_depth_) - 1;
+ source_stride_ = (width_ + 31) & ~31;
+ reference_stride_ = width_ * 2;
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ virtual uint8_t *GetReference(int block_idx) {
+ if (use_high_bit_depth_)
+ return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+ block_idx * kDataBlockSize);
+ return reference_data_ + block_idx * kDataBlockSize;
+ }
+
+ // Sum of Absolute Differences. Given two blocks, calculate the absolute
+ // difference between the pixels at each corresponding location; accumulate.
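+ // Worked example (editorial illustration): for the 2x2 blocks
+ // src = {1, 2, 3, 4} and ref = {2, 2, 5, 1},
+ // SAD = |1-2| + |2-2| + |3-5| + |4-1| = 6.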
+ unsigned int ReferenceSAD(int block_idx) {
+ unsigned int sad = 0;
+ const uint8_t *const reference8 = GetReference(block_idx);
+ const uint8_t *const source8 = source_data_;
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReference(block_idx));
+ const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ sad += abs(source8[h * source_stride_ + w] -
+ reference8[h * reference_stride_ + w]);
+ } else {
+ sad += abs(source16[h * source_stride_ + w] -
+ reference16[h * reference_stride_ + w]);
+ }
+ }
+ }
+ return sad;
+ }
+
+ // Sum of Absolute Differences Average. Given a source block, a reference
+ // block, and a second prediction, calculate the absolute difference between
+ // each source pixel and the average of the corresponding reference and
+ // predicted pixels; accumulate.
+ unsigned int ReferenceSADavg(int block_idx) {
+ unsigned int sad = 0;
+ const uint8_t *const reference8 = GetReference(block_idx);
+ const uint8_t *const source8 = source_data_;
+ const uint8_t *const second_pred8 = second_pred_;
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReference(block_idx));
+ const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+ const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ const int tmp = second_pred8[h * width_ + w] +
+ reference8[h * reference_stride_ + w];
+ const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+ sad += abs(source8[h * source_stride_ + w] - comp_pred);
+ } else {
+ const int tmp = second_pred16[h * width_ + w] +
+ reference16[h * reference_stride_ + w];
+ const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1);
+ sad += abs(source16[h * source_stride_ + w] - comp_pred);
+ }
+ }
+ }
+ return sad;
+ }
+
+ void ReferenceJntCompAvg(int block_idx) {
+ const uint8_t *const reference8 = GetReference(block_idx);
+ const uint8_t *const second_pred8 = second_pred_;
+ uint8_t *const comp_pred8 = comp_pred_;
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReference(block_idx));
+ const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+ uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ const int tmp =
+ second_pred8[h * width_ + w] * jcp_param_.bck_offset +
+ reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+ comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+ } else {
+ const int tmp =
+ second_pred16[h * width_ + w] * jcp_param_.bck_offset +
+ reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+ comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4);
+ }
+ }
+ }
+ }
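+ // Worked example (editorial illustration): with fwd_offset = 9 and
+ // bck_offset = 7 (each weight pair sums to 16, matching the 4-bit shift), a
+ // reference pixel of 100 and a second-prediction pixel of 60 combine to
+ // ROUND_POWER_OF_TWO(9 * 100 + 7 * 60, 4) = (1320 + 8) >> 4 = 83.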
+
+ unsigned int ReferenceJntSADavg(int block_idx) {
+ unsigned int sad = 0;
+ const uint8_t *const reference8 = GetReference(block_idx);
+ const uint8_t *const source8 = source_data_;
+ const uint8_t *const second_pred8 = second_pred_;
+ const uint16_t *const reference16 =
+ CONVERT_TO_SHORTPTR(GetReference(block_idx));
+ const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_);
+ const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ const int tmp =
+ second_pred8[h * width_ + w] * jcp_param_.bck_offset +
+ reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+ const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 4);
+ sad += abs(source8[h * source_stride_ + w] - comp_pred);
+ } else {
+ const int tmp =
+ second_pred16[h * width_ + w] * jcp_param_.bck_offset +
+ reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset;
+ const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 4);
+ sad += abs(source16[h * source_stride_ + w] - comp_pred);
+ }
+ }
+ }
+ return sad;
+ }
+
+ void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) {
+ uint8_t *data8 = data;
+ uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ data8[h * stride + w] = static_cast<uint8_t>(fill_constant);
+ } else {
+ data16[h * stride + w] = fill_constant;
+ }
+ }
+ }
+ }
+
+ void FillRandom(uint8_t *data, int stride) {
+ uint8_t *data8 = data;
+ uint16_t *data16 = CONVERT_TO_SHORTPTR(data);
+ for (int h = 0; h < height_; ++h) {
+ for (int w = 0; w < width_; ++w) {
+ if (!use_high_bit_depth_) {
+ data8[h * stride + w] = rnd_.Rand8();
+ } else {
+ data16[h * stride + w] = rnd_.Rand16() & mask_;
+ }
+ }
+ }
+ }
+
+ int width_, height_, mask_, bd_;
+ aom_bit_depth_t bit_depth_;
+ static uint8_t *source_data_;
+ static uint8_t *reference_data_;
+ static uint8_t *second_pred_;
+ int source_stride_;
+ bool use_high_bit_depth_;
+ static uint8_t *source_data8_;
+ static uint8_t *reference_data8_;
+ static uint8_t *second_pred8_;
+ static uint16_t *source_data16_;
+ static uint16_t *reference_data16_;
+ static uint16_t *second_pred16_;
+ int reference_stride_;
+ static uint8_t *comp_pred_;
+ static uint8_t *comp_pred8_;
+ static uint16_t *comp_pred16_;
+ static uint8_t *comp_pred_test_;
+ static uint8_t *comp_pred8_test_;
+ static uint16_t *comp_pred16_test_;
+ JNT_COMP_PARAMS jcp_param_;
+
+ ACMRandom rnd_;
+};
+
+class SADx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
+ public SADTestBase {
+ public:
+ SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ void SADs(unsigned int *results) {
+ const uint8_t *references[] = { GetReference(0), GetReference(1),
+ GetReference(2), GetReference(3) };
+
+ ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(
+ source_data_, source_stride_, references, reference_stride_, results));
+ }
+
+ void CheckSADs() {
+ unsigned int reference_sad, exp_sad[4];
+
+ SADs(exp_sad);
+ for (int block = 0; block < 4; ++block) {
+ reference_sad = ReferenceSAD(block);
+
+ EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
+ }
+ }
+};
+
+class SADTest : public ::testing::WithParamInterface<SadMxNParam>,
+ public SADTestBase {
+ public:
+ SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ unsigned int SAD(int block_idx) {
+ unsigned int ret;
+ const uint8_t *const reference = GetReference(block_idx);
+
+ ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+ reference, reference_stride_));
+ return ret;
+ }
+
+ void CheckSAD() {
+ const unsigned int reference_sad = ReferenceSAD(0);
+ const unsigned int exp_sad = SAD(0);
+
+ ASSERT_EQ(reference_sad, exp_sad);
+ }
+
+ void SpeedSAD() {
+ int test_count = 20000000;
+ while (test_count > 0) {
+ SAD(0);
+ test_count -= 1;
+ }
+ }
+};
+
+class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
+ public SADTestBase {
+ public:
+ SADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ unsigned int SAD_avg(int block_idx) {
+ unsigned int ret;
+ const uint8_t *const reference = GetReference(block_idx);
+
+ ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+ reference, reference_stride_,
+ second_pred_));
+ return ret;
+ }
+
+ void CheckSAD() {
+ const unsigned int reference_sad = ReferenceSADavg(0);
+ const unsigned int exp_sad = SAD_avg(0);
+
+ ASSERT_EQ(reference_sad, exp_sad);
+ }
+};
+
+class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>,
+ public SADTestBase {
+ public:
+ JntCompAvgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ void jnt_comp_avg(int block_idx) {
+ const uint8_t *const reference = GetReference(block_idx);
+
+ ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
+ height_, reference, reference_stride_,
+ &jcp_param_));
+ }
+
+ void CheckCompAvg() {
+ for (int j = 0; j < 2; ++j) {
+ for (int i = 0; i < 4; ++i) {
+ jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
+ jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
+
+ ReferenceJntCompAvg(0);
+ jnt_comp_avg(0);
+
+ for (int y = 0; y < height_; ++y)
+ for (int x = 0; x < width_; ++x)
+ ASSERT_EQ(comp_pred_[y * width_ + x],
+ comp_pred_test_[y * width_ + x]);
+ }
+ }
+ }
+};
+
+class JntSADTest : public ::testing::WithParamInterface<JntSadMxhParam>,
+ public SADTestBase {
+ public:
+ JntSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ unsigned int SAD(int block_idx) {
+ unsigned int ret;
+ const uint8_t *const reference = GetReference(block_idx);
+
+ ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+ reference, reference_stride_,
+ GET_PARAM(0), GET_PARAM(1)));
+ return ret;
+ }
+
+ void CheckSAD() {
+ const unsigned int reference_sad = ReferenceSAD(0);
+ const unsigned int exp_sad = SAD(0);
+
+ ASSERT_EQ(reference_sad, exp_sad);
+ }
+
+ void SpeedSAD() {
+ int test_count = 20000000;
+ while (test_count > 0) {
+ SAD(0);
+ test_count -= 1;
+ }
+ }
+};
+
+class JntSADavgTest : public ::testing::WithParamInterface<JntSadMxNAvgParam>,
+ public SADTestBase {
+ public:
+ JntSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+
+ protected:
+ unsigned int jnt_SAD_avg(int block_idx) {
+ unsigned int ret;
+ const uint8_t *const reference = GetReference(block_idx);
+
+ ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+ reference, reference_stride_,
+ second_pred_, &jcp_param_));
+ return ret;
+ }
+
+ void CheckSAD() {
+ for (int j = 0; j < 2; ++j) {
+ for (int i = 0; i < 4; ++i) {
+ jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
+ jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
+
+ const unsigned int reference_sad = ReferenceJntSADavg(0);
+ const unsigned int exp_sad = jnt_SAD_avg(0);
+
+ ASSERT_EQ(reference_sad, exp_sad);
+ }
+ }
+ }
+};
+
+uint8_t *SADTestBase::source_data_ = NULL;
+uint8_t *SADTestBase::reference_data_ = NULL;
+uint8_t *SADTestBase::second_pred_ = NULL;
+uint8_t *SADTestBase::comp_pred_ = NULL;
+uint8_t *SADTestBase::comp_pred_test_ = NULL;
+uint8_t *SADTestBase::source_data8_ = NULL;
+uint8_t *SADTestBase::reference_data8_ = NULL;
+uint8_t *SADTestBase::second_pred8_ = NULL;
+uint8_t *SADTestBase::comp_pred8_ = NULL;
+uint8_t *SADTestBase::comp_pred8_test_ = NULL;
+uint16_t *SADTestBase::source_data16_ = NULL;
+uint16_t *SADTestBase::reference_data16_ = NULL;
+uint16_t *SADTestBase::second_pred16_ = NULL;
+uint16_t *SADTestBase::comp_pred16_ = NULL;
+uint16_t *SADTestBase::comp_pred16_test_ = NULL;
+
+TEST_P(SADTest, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(reference_data_, reference_stride_, mask_);
+ CheckSAD();
+}
+
+TEST_P(SADTest, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(reference_data_, reference_stride_, 0);
+ CheckSAD();
+}
+
+TEST_P(SADTest, ShortRef) {
+ const int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, ShortSrc) {
+ const int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ int test_count = 2000;
+ while (test_count > 0) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ test_count -= 1;
+ }
+ source_stride_ = tmp_stride;
+}
+
+#define SPEED_TEST (0)
+#if SPEED_TEST
+TEST_P(SADTest, Speed) {
+ const int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ SpeedSAD();
+ source_stride_ = tmp_stride;
+}
+#endif
+
+TEST_P(SADavgTest, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(reference_data_, reference_stride_, mask_);
+ FillConstant(second_pred_, width_, 0);
+ CheckSAD();
+}
+TEST_P(SADavgTest, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(reference_data_, reference_stride_, 0);
+ FillConstant(second_pred_, width_, 0);
+ CheckSAD();
+}
+
+TEST_P(SADavgTest, ShortRef) {
+ const int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgTest, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADavgTest, ShortSrc) {
+ const int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ int test_count = 2000;
+ while (test_count > 0) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ test_count -= 1;
+ }
+ source_stride_ = tmp_stride;
+}
+
+TEST_P(JntCompAvgTest, MaxRef) {
+ FillConstant(reference_data_, reference_stride_, mask_);
+ FillConstant(second_pred_, width_, 0);
+ CheckCompAvg();
+}
+
+TEST_P(JntCompAvgTest, MaxSecondPred) {
+ FillConstant(reference_data_, reference_stride_, 0);
+ FillConstant(second_pred_, width_, mask_);
+ CheckCompAvg();
+}
+
+TEST_P(JntCompAvgTest, ShortRef) {
+ const int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckCompAvg();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntCompAvgTest, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckCompAvg();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADTest, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(reference_data_, reference_stride_, mask_);
+ CheckSAD();
+}
+
+TEST_P(JntSADTest, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(reference_data_, reference_stride_, 0);
+ CheckSAD();
+}
+
+TEST_P(JntSADTest, ShortRef) {
+ const int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADTest, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADTest, ShortSrc) {
+ const int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ int test_count = 2000;
+ while (test_count > 0) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ CheckSAD();
+ test_count -= 1;
+ }
+ source_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADavgTest, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(reference_data_, reference_stride_, mask_);
+ FillConstant(second_pred_, width_, 0);
+ CheckSAD();
+}
+TEST_P(JntSADavgTest, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(reference_data_, reference_stride_, 0);
+ FillConstant(second_pred_, width_, 0);
+ CheckSAD();
+}
+
+TEST_P(JntSADavgTest, ShortRef) {
+ const int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADavgTest, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ const int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(JntSADavgTest, ShortSrc) {
+ const int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ int test_count = 2000;
+ while (test_count > 0) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, width_);
+ CheckSAD();
+ test_count -= 1;
+ }
+ source_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4Test, MaxRef) {
+ FillConstant(source_data_, source_stride_, 0);
+ FillConstant(GetReference(0), reference_stride_, mask_);
+ FillConstant(GetReference(1), reference_stride_, mask_);
+ FillConstant(GetReference(2), reference_stride_, mask_);
+ FillConstant(GetReference(3), reference_stride_, mask_);
+ CheckSADs();
+}
+
+TEST_P(SADx4Test, MaxSrc) {
+ FillConstant(source_data_, source_stride_, mask_);
+ FillConstant(GetReference(0), reference_stride_, 0);
+ FillConstant(GetReference(1), reference_stride_, 0);
+ FillConstant(GetReference(2), reference_stride_, 0);
+ FillConstant(GetReference(3), reference_stride_, 0);
+ CheckSADs();
+}
+
+TEST_P(SADx4Test, ShortRef) {
+ int tmp_stride = reference_stride_;
+ reference_stride_ >>= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4Test, UnalignedRef) {
+ // The reference frame, but not the source frame, may be unaligned for
+ // certain types of searches.
+ int tmp_stride = reference_stride_;
+ reference_stride_ -= 1;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4Test, ShortSrc) {
+ int tmp_stride = source_stride_;
+ source_stride_ >>= 1;
+ int test_count = 1000;
+ while (test_count > 0) {
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ test_count -= 1;
+ }
+ source_stride_ = tmp_stride;
+}
+
+TEST_P(SADx4Test, SrcAlignedByWidth) {
+ uint8_t *tmp_source_data = source_data_;
+ source_data_ += width_;
+ FillRandom(source_data_, source_stride_);
+ FillRandom(GetReference(0), reference_stride_);
+ FillRandom(GetReference(1), reference_stride_);
+ FillRandom(GetReference(2), reference_stride_);
+ FillRandom(GetReference(3), reference_stride_);
+ CheckSADs();
+ source_data_ = tmp_source_data;
+}
+
+using ::testing::make_tuple;
+
+//------------------------------------------------------------------------------
+// C functions
+const SadMxNParam c_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128_c, -1),
+ make_tuple(128, 64, &aom_sad128x64_c, -1),
+ make_tuple(64, 128, &aom_sad64x128_c, -1),
+ make_tuple(64, 64, &aom_sad64x64_c, -1),
+ make_tuple(64, 32, &aom_sad64x32_c, -1),
+ make_tuple(32, 64, &aom_sad32x64_c, -1),
+ make_tuple(32, 32, &aom_sad32x32_c, -1),
+ make_tuple(32, 16, &aom_sad32x16_c, -1),
+ make_tuple(16, 32, &aom_sad16x32_c, -1),
+ make_tuple(16, 16, &aom_sad16x16_c, -1),
+ make_tuple(16, 8, &aom_sad16x8_c, -1),
+ make_tuple(8, 16, &aom_sad8x16_c, -1),
+ make_tuple(8, 8, &aom_sad8x8_c, -1),
+ make_tuple(8, 4, &aom_sad8x4_c, -1),
+ make_tuple(4, 8, &aom_sad4x8_c, -1),
+ make_tuple(4, 4, &aom_sad4x4_c, -1),
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_c, 12),
+};
+INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
+
+const SadMxNAvgParam avg_c_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_c, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_c, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_c, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_c, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_c, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_c, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_c, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_c, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_c, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_c, -1),
+ make_tuple(8, 16, &aom_sad8x16_avg_c, -1),
+ make_tuple(8, 8, &aom_sad8x8_avg_c, -1),
+ make_tuple(8, 4, &aom_sad8x4_avg_c, -1),
+ make_tuple(4, 8, &aom_sad4x8_avg_c, -1),
+ make_tuple(4, 4, &aom_sad4x4_avg_c, -1),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12),
+};
+INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
+
+// TODO(chengchen): add highbd tests
+const JntCompAvgParam jnt_comp_avg_c_tests[] = {
+ make_tuple(128, 128, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(128, 64, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(64, 128, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(64, 64, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(64, 32, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(32, 64, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(32, 32, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(32, 16, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(16, 32, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(16, 16, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(16, 8, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(8, 16, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(8, 8, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(8, 4, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(4, 8, &aom_jnt_comp_avg_pred_c, -1),
+ make_tuple(4, 4, &aom_jnt_comp_avg_pred_c, -1),
+};
+
+INSTANTIATE_TEST_CASE_P(C, JntCompAvgTest,
+ ::testing::ValuesIn(jnt_comp_avg_c_tests));
+
+const JntSadMxNAvgParam jnt_avg_c_tests[] = {
+ make_tuple(128, 128, &aom_jnt_sad128x128_avg_c, -1),
+ make_tuple(128, 64, &aom_jnt_sad128x64_avg_c, -1),
+ make_tuple(64, 128, &aom_jnt_sad64x128_avg_c, -1),
+ make_tuple(64, 64, &aom_jnt_sad64x64_avg_c, -1),
+ make_tuple(64, 32, &aom_jnt_sad64x32_avg_c, -1),
+ make_tuple(32, 64, &aom_jnt_sad32x64_avg_c, -1),
+ make_tuple(32, 32, &aom_jnt_sad32x32_avg_c, -1),
+ make_tuple(32, 16, &aom_jnt_sad32x16_avg_c, -1),
+ make_tuple(16, 32, &aom_jnt_sad16x32_avg_c, -1),
+ make_tuple(16, 16, &aom_jnt_sad16x16_avg_c, -1),
+ make_tuple(16, 8, &aom_jnt_sad16x8_avg_c, -1),
+ make_tuple(8, 16, &aom_jnt_sad8x16_avg_c, -1),
+ make_tuple(8, 8, &aom_jnt_sad8x8_avg_c, -1),
+ make_tuple(8, 4, &aom_jnt_sad8x4_avg_c, -1),
+ make_tuple(4, 8, &aom_jnt_sad4x8_avg_c, -1),
+ make_tuple(4, 4, &aom_jnt_sad4x4_avg_c, -1),
+};
+INSTANTIATE_TEST_CASE_P(C, JntSADavgTest, ::testing::ValuesIn(jnt_avg_c_tests));
+
+const SadMxNx4Param x4d_c_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128x4d_c, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_c, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_c, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_c, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_c, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_c, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_c, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_c, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_c, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_c, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_c, -1),
+ make_tuple(8, 16, &aom_sad8x16x4d_c, -1),
+ make_tuple(8, 8, &aom_sad8x8x4d_c, -1),
+ make_tuple(8, 4, &aom_sad8x4x4d_c, -1),
+ make_tuple(4, 8, &aom_sad4x8x4d_c, -1),
+ make_tuple(4, 4, &aom_sad4x4x4d_c, -1),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12),
+};
+INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
+
+//------------------------------------------------------------------------------
+// ARM functions
+#if HAVE_NEON
+const SadMxNParam neon_tests[] = {
+ make_tuple(64, 64, &aom_sad64x64_neon, -1),
+ make_tuple(32, 32, &aom_sad32x32_neon, -1),
+ make_tuple(16, 16, &aom_sad16x16_neon, -1),
+ make_tuple(16, 8, &aom_sad16x8_neon, -1),
+ make_tuple(8, 16, &aom_sad8x16_neon, -1),
+ make_tuple(8, 8, &aom_sad8x8_neon, -1),
+ make_tuple(4, 4, &aom_sad4x4_neon, -1),
+};
+INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
+
+const SadMxNx4Param x4d_neon_tests[] = {
+ make_tuple(64, 64, &aom_sad64x64x4d_neon, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_neon, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_neon, -1),
+};
+INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
+#endif // HAVE_NEON
+
+//------------------------------------------------------------------------------
+// x86 functions
+#if HAVE_SSE2
+const SadMxNParam sse2_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128_sse2, -1),
+ make_tuple(64, 64, &aom_sad64x64_sse2, -1),
+ make_tuple(64, 32, &aom_sad64x32_sse2, -1),
+ make_tuple(32, 64, &aom_sad32x64_sse2, -1),
+ make_tuple(32, 32, &aom_sad32x32_sse2, -1),
+ make_tuple(32, 16, &aom_sad32x16_sse2, -1),
+ make_tuple(16, 32, &aom_sad16x32_sse2, -1),
+ make_tuple(16, 16, &aom_sad16x16_sse2, -1),
+ make_tuple(16, 8, &aom_sad16x8_sse2, -1),
+ make_tuple(8, 16, &aom_sad8x16_sse2, -1),
+ make_tuple(8, 8, &aom_sad8x8_sse2, -1),
+ make_tuple(8, 4, &aom_sad8x4_sse2, -1),
+ make_tuple(4, 8, &aom_sad4x8_sse2, -1),
+ make_tuple(4, 4, &aom_sad4x4_sse2, -1),
+ make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 12),
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
+
+const SadMxNAvgParam avg_sse2_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_sse2, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_sse2, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_sse2, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_sse2, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_sse2, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_sse2, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_sse2, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_sse2, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_sse2, -1),
+ make_tuple(8, 16, &aom_sad8x16_avg_sse2, -1),
+ make_tuple(8, 8, &aom_sad8x8_avg_sse2, -1),
+ make_tuple(8, 4, &aom_sad8x4_avg_sse2, -1),
+ make_tuple(4, 8, &aom_sad4x8_avg_sse2, -1),
+ make_tuple(4, 4, &aom_sad4x4_avg_sse2, -1),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 12),
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
+
+const SadMxNx4Param x4d_sse2_tests[] = {
+ make_tuple(128, 128, &aom_sad128x128x4d_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_sse2, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_sse2, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_sse2, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_sse2, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_sse2, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_sse2, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_sse2, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_sse2, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_sse2, -1),
+ make_tuple(8, 16, &aom_sad8x16x4d_sse2, -1),
+ make_tuple(8, 8, &aom_sad8x8x4d_sse2, -1),
+ make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1),
+ make_tuple(4, 8, &aom_sad4x8x4d_sse2, -1),
+ make_tuple(4, 4, &aom_sad4x4x4d_sse2, -1),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 8),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 8),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 8),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 8),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 8),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 10),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 10),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 10),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 10),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 10),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 12),
+ make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 12),
+ make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 12),
+ make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 12),
+ make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12),
+ make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12),
+};
+INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+// Note: These functions are named sse2, but they live in the ssse3 file and
+// are only built and linked when ssse3 is enabled.
+const JntSadMxhParam jnt_sad_sse2_tests[] = {
+ make_tuple(4, 4, &aom_sad4xh_sse2, -1),
+ make_tuple(4, 8, &aom_sad4xh_sse2, -1),
+ make_tuple(8, 4, &aom_sad8xh_sse2, -1),
+ make_tuple(8, 8, &aom_sad8xh_sse2, -1),
+ make_tuple(8, 16, &aom_sad8xh_sse2, -1),
+ make_tuple(16, 8, &aom_sad16xh_sse2, -1),
+ make_tuple(16, 16, &aom_sad16xh_sse2, -1),
+ make_tuple(16, 32, &aom_sad16xh_sse2, -1),
+ make_tuple(32, 16, &aom_sad32xh_sse2, -1),
+ make_tuple(32, 32, &aom_sad32xh_sse2, -1),
+ make_tuple(32, 64, &aom_sad32xh_sse2, -1),
+ make_tuple(64, 32, &aom_sad64xh_sse2, -1),
+ make_tuple(64, 64, &aom_sad64xh_sse2, -1),
+ make_tuple(128, 128, &aom_sad128xh_sse2, -1),
+ make_tuple(128, 64, &aom_sad128xh_sse2, -1),
+ make_tuple(64, 128, &aom_sad64xh_sse2, -1),
+ make_tuple(4, 16, &aom_sad4xh_sse2, -1),
+ make_tuple(16, 4, &aom_sad16xh_sse2, -1),
+ make_tuple(8, 32, &aom_sad8xh_sse2, -1),
+ make_tuple(32, 8, &aom_sad32xh_sse2, -1),
+ make_tuple(16, 64, &aom_sad16xh_sse2, -1),
+ make_tuple(64, 16, &aom_sad64xh_sse2, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSE2, JntSADTest,
+ ::testing::ValuesIn(jnt_sad_sse2_tests));
+
+#endif // HAVE_SSSE3
+
+#if HAVE_SSE3
+// The only SSE3 functions are the x3 variants, which do not have tests.
+#endif // HAVE_SSE3
+
+#if HAVE_SSSE3
+const JntCompAvgParam jnt_comp_avg_ssse3_tests[] = {
+ make_tuple(128, 128, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(128, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 128, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 4, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(4, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(4, 4, &aom_jnt_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+};
+
+INSTANTIATE_TEST_CASE_P(SSSE3, JntCompAvgTest,
+ ::testing::ValuesIn(jnt_comp_avg_ssse3_tests));
+
+const JntSadMxNAvgParam jnt_avg_ssse3_tests[] = {
+ make_tuple(128, 128, &aom_jnt_sad128x128_avg_ssse3, -1),
+ make_tuple(128, 64, &aom_jnt_sad128x64_avg_ssse3, -1),
+ make_tuple(64, 128, &aom_jnt_sad64x128_avg_ssse3, -1),
+ make_tuple(64, 64, &aom_jnt_sad64x64_avg_ssse3, -1),
+ make_tuple(64, 32, &aom_jnt_sad64x32_avg_ssse3, -1),
+ make_tuple(32, 64, &aom_jnt_sad32x64_avg_ssse3, -1),
+ make_tuple(32, 32, &aom_jnt_sad32x32_avg_ssse3, -1),
+ make_tuple(32, 16, &aom_jnt_sad32x16_avg_ssse3, -1),
+ make_tuple(16, 32, &aom_jnt_sad16x32_avg_ssse3, -1),
+ make_tuple(16, 16, &aom_jnt_sad16x16_avg_ssse3, -1),
+ make_tuple(16, 8, &aom_jnt_sad16x8_avg_ssse3, -1),
+ make_tuple(8, 16, &aom_jnt_sad8x16_avg_ssse3, -1),
+ make_tuple(8, 8, &aom_jnt_sad8x8_avg_ssse3, -1),
+ make_tuple(8, 4, &aom_jnt_sad8x4_avg_ssse3, -1),
+ make_tuple(4, 8, &aom_jnt_sad4x8_avg_ssse3, -1),
+ make_tuple(4, 4, &aom_jnt_sad4x4_avg_ssse3, -1),
+};
+INSTANTIATE_TEST_CASE_P(SSSE3, JntSADavgTest,
+ ::testing::ValuesIn(jnt_avg_ssse3_tests));
+#endif // HAVE_SSSE3
+
+#if HAVE_SSE4_1
+// The only SSE4.1 functions are the x8 variants, which do not have tests.
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const SadMxNParam avx2_tests[] = {
+ make_tuple(64, 128, &aom_sad64x128_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32_avx2, -1),
+ make_tuple(32, 64, &aom_sad32x64_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32_avx2, -1),
+ make_tuple(32, 16, &aom_sad32x16_avx2, -1),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
+
+const SadMxNAvgParam avg_avx2_tests[] = {
+ make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128_avg_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64_avg_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_avx2, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_avx2, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_avx2, -1),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
+
+const SadMxNx4Param x4d_avx2_tests[] = {
+ make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
+ make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1),
+ make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10),
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 12),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 8),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 10),
+ make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 12),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 8),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 10),
+ make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 12),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 8),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 10),
+ make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 12),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 8),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 10),
+ make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 12),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 8),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 10),
+ make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 12),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 8),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 10),
+ make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 12),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 8),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 10),
+ make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 12),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 8),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10),
+ make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12),
+};
+INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
+#endif // HAVE_AVX2
+
+//------------------------------------------------------------------------------
+// MIPS functions
+#if HAVE_MSA
+const SadMxNParam msa_tests[] = {
+ make_tuple(64, 64, &aom_sad64x64_msa, -1),
+ make_tuple(64, 32, &aom_sad64x32_msa, -1),
+ make_tuple(32, 64, &aom_sad32x64_msa, -1),
+ make_tuple(32, 32, &aom_sad32x32_msa, -1),
+ make_tuple(32, 16, &aom_sad32x16_msa, -1),
+ make_tuple(16, 32, &aom_sad16x32_msa, -1),
+ make_tuple(16, 16, &aom_sad16x16_msa, -1),
+ make_tuple(16, 8, &aom_sad16x8_msa, -1),
+ make_tuple(8, 16, &aom_sad8x16_msa, -1),
+ make_tuple(8, 8, &aom_sad8x8_msa, -1),
+ make_tuple(8, 4, &aom_sad8x4_msa, -1),
+ make_tuple(4, 8, &aom_sad4x8_msa, -1),
+ make_tuple(4, 4, &aom_sad4x4_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests));
+
+const SadMxNAvgParam avg_msa_tests[] = {
+ make_tuple(64, 64, &aom_sad64x64_avg_msa, -1),
+ make_tuple(64, 32, &aom_sad64x32_avg_msa, -1),
+ make_tuple(32, 64, &aom_sad32x64_avg_msa, -1),
+ make_tuple(32, 32, &aom_sad32x32_avg_msa, -1),
+ make_tuple(32, 16, &aom_sad32x16_avg_msa, -1),
+ make_tuple(16, 32, &aom_sad16x32_avg_msa, -1),
+ make_tuple(16, 16, &aom_sad16x16_avg_msa, -1),
+ make_tuple(16, 8, &aom_sad16x8_avg_msa, -1),
+ make_tuple(8, 16, &aom_sad8x16_avg_msa, -1),
+ make_tuple(8, 8, &aom_sad8x8_avg_msa, -1),
+ make_tuple(8, 4, &aom_sad8x4_avg_msa, -1),
+ make_tuple(4, 8, &aom_sad4x8_avg_msa, -1),
+ make_tuple(4, 4, &aom_sad4x4_avg_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
+
+const SadMxNx4Param x4d_msa_tests[] = {
+ make_tuple(64, 64, &aom_sad64x64x4d_msa, -1),
+ make_tuple(64, 32, &aom_sad64x32x4d_msa, -1),
+ make_tuple(32, 64, &aom_sad32x64x4d_msa, -1),
+ make_tuple(32, 32, &aom_sad32x32x4d_msa, -1),
+ make_tuple(32, 16, &aom_sad32x16x4d_msa, -1),
+ make_tuple(16, 32, &aom_sad16x32x4d_msa, -1),
+ make_tuple(16, 16, &aom_sad16x16x4d_msa, -1),
+ make_tuple(16, 8, &aom_sad16x8x4d_msa, -1),
+ make_tuple(8, 16, &aom_sad8x16x4d_msa, -1),
+ make_tuple(8, 8, &aom_sad8x8x4d_msa, -1),
+ make_tuple(8, 4, &aom_sad8x4x4d_msa, -1),
+ make_tuple(4, 8, &aom_sad4x8x4d_msa, -1),
+ make_tuple(4, 4, &aom_sad4x4x4d_msa, -1),
+};
+INSTANTIATE_TEST_CASE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
+#endif // HAVE_MSA
+
+} // namespace
diff --git a/third_party/aom/test/scalability_test.cc b/third_party/aom/test/scalability_test.cc
new file mode 100644
index 000000000..b39918861
--- /dev/null
+++ b/third_party/aom/test/scalability_test.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kCpuUsed = 8;
+const int kBaseLayerQp = 55;
+const int kEnhancementLayerQp = 20;
+
+class ScalabilityTest
+ : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ScalabilityTest() : EncoderTest(GET_PARAM(0)) {}
+ virtual ~ScalabilityTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(GET_PARAM(1));
+ num_spatial_layers_ = 2;
+ }
+
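+  // Frame 0 only sets cpu-used and the number of spatial layers. After that,
+  // frames whose index is not a multiple of num_spatial_layers_ are coded as
+  // the enhancement layer (spatial layer 1, kEnhancementLayerQp, updating no
+  // references), and the remaining frames as the base layer (spatial layer 0,
+  // kBaseLayerQp, updating only LAST).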
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+ encoder->Control(AOME_SET_NUMBER_SPATIAL_LAYERS, num_spatial_layers_);
+ } else if (video->frame() % num_spatial_layers_) {
+ frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+ AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY;
+ encoder->Control(AOME_SET_SPATIAL_LAYER_ID, 1);
+ encoder->Control(AOME_SET_CQ_LEVEL, kEnhancementLayerQp);
+ } else {
+ frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+ AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+ AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+ AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+ AOM_EFLAG_NO_UPD_ENTROPY;
+ encoder->Control(AOME_SET_SPATIAL_LAYER_ID, 0);
+ encoder->Control(AOME_SET_CQ_LEVEL, kBaseLayerQp);
+ }
+ }
+
+ void DoTest(int num_spatial_layers) {
+ num_spatial_layers_ = num_spatial_layers;
+ cfg_.rc_end_usage = AOM_Q;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 18);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ int num_spatial_layers_;
+};
+
+TEST_P(ScalabilityTest, TestNoMismatch2SpatialLayers) { DoTest(2); }
+
+TEST_P(ScalabilityTest, TestNoMismatch3SpatialLayers) { DoTest(3); }
+
+AV1_INSTANTIATE_TEST_CASE(ScalabilityTest,
+ ::testing::Values(::libaom_test::kRealTime));
+
+} // namespace
diff --git a/third_party/aom/test/scan_test.cc b/third_party/aom/test/scan_test.cc
new file mode 100644
index 000000000..dee2ab5a6
--- /dev/null
+++ b/third_party/aom/test/scan_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "test/av1_txfm_test.h"
+
+static int scan_test(const int16_t *scan, const int16_t *iscan, int si, int r,
+ int c, int w) {
+ if (iscan[r * w + c] != si || scan[si] != r * w + c) {
+ printf("r %d c %d ref_iscan %d iscan %d ref_scan %d scan %d\n", r, c, si,
+ iscan[r * w + c], r * w + c, scan[si]);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+int scan_order_test(const SCAN_ORDER *scan_order, int w, int h,
+ SCAN_MODE mode) {
+ const int16_t *scan = scan_order->scan;
+ const int16_t *iscan = scan_order->iscan;
+ int dim = w + h - 1;
+ if (mode == SCAN_MODE_ZIG_ZAG) {
+ int si = 0;
+ for (int i = 0; i < dim; ++i) {
+ if (i % 2 == 0) {
+ for (int c = 0; c < w; ++c) {
+ int r = i - c;
+ if (r >= 0 && r < h) {
+ if (scan_test(scan, iscan, si, r, c, w)) return 1;
+ ++si;
+ }
+ }
+ } else {
+ for (int r = 0; r < h; ++r) {
+ int c = i - r;
+ if (c >= 0 && c < w) {
+ if (scan_test(scan, iscan, si, r, c, w)) return 1;
+ ++si;
+ }
+ }
+ }
+ }
+ } else if (mode == SCAN_MODE_COL_DIAG) {
+ int si = 0;
+ for (int i = 0; i < dim; ++i) {
+ for (int c = 0; c < w; ++c) {
+ int r = i - c;
+ if (r >= 0 && r < h) {
+ if (scan_test(scan, iscan, si, r, c, w)) return 1;
+ ++si;
+ }
+ }
+ }
+ } else if (mode == SCAN_MODE_ROW_DIAG) {
+ int si = 0;
+ for (int i = 0; i < dim; ++i) {
+ for (int r = 0; r < h; ++r) {
+ int c = i - r;
+ if (c >= 0 && c < w) {
+ if (scan_test(scan, iscan, si, r, c, w)) return 1;
+ ++si;
+ }
+ }
+ }
+ } else if (mode == SCAN_MODE_ROW_1D) {
+ int si = 0;
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < w; ++c) {
+ if (scan_test(scan, iscan, si, r, c, w)) return 1;
+ ++si;
+ }
+ }
+ } else {
+ assert(mode == SCAN_MODE_COL_1D);
+ int si = 0;
+ for (int c = 0; c < w; ++c) {
+ for (int r = 0; r < h; ++r) {
+ if (scan_test(scan, iscan, si, r, c, w)) return 1;
+ ++si;
+ }
+ }
+ }
+ return 0;
+}
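+
+// For reference, the SCAN_MODE_ZIG_ZAG branch above visits a 4x4 block in the
+// following order (each entry is the scan index of that (row, col) position,
+// worked out by hand from the loop, so illustrative only):
+//    0  1  5  6
+//    2  4  7 12
+//    3  8 11 13
+//    9 10 14 15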
+
+TEST(Av1ScanTest, Dependency) {
+ for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) {
+ const int org_rows = tx_size_high[(TX_SIZE)tx_size];
+ const int org_cols = tx_size_wide[(TX_SIZE)tx_size];
+ const int rows = get_txb_high((TX_SIZE)tx_size);
+ const int cols = get_txb_wide((TX_SIZE)tx_size);
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size),
+ static_cast<TX_TYPE>(tx_type)) ==
+ false) {
+ continue;
+ }
+ SCAN_MODE scan_mode;
+ TX_CLASS tx_class = tx_type_to_class[(TX_TYPE)tx_type];
+ if (tx_class == TX_CLASS_2D) {
+ if (rows == cols) {
+ scan_mode = SCAN_MODE_ZIG_ZAG;
+ } else if (rows > cols) {
+ scan_mode = SCAN_MODE_ROW_DIAG;
+ } else {
+ scan_mode = SCAN_MODE_COL_DIAG;
+ }
+ } else if (tx_class == TX_CLASS_VERT) {
+ scan_mode = SCAN_MODE_ROW_1D;
+ } else {
+ assert(tx_class == TX_CLASS_HORIZ);
+ scan_mode = SCAN_MODE_COL_1D;
+ }
+ const SCAN_ORDER *scan_order =
+ get_default_scan((TX_SIZE)tx_size, (TX_TYPE)tx_type);
+ ASSERT_EQ(scan_order_test(scan_order, cols, rows, scan_mode), 0)
+ << "scan mismatch tx_class " << tx_class << " tx_type " << tx_type
+ << " tx_w " << org_cols << " tx_h " << org_rows << " scan_mode "
+ << scan_mode << "\n";
+ }
+ }
+}
diff --git a/third_party/aom/test/segment_binarization_sync.cc b/third_party/aom/test/segment_binarization_sync.cc
new file mode 100644
index 000000000..bd8cf1141
--- /dev/null
+++ b/third_party/aom/test/segment_binarization_sync.cc
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+using libaom_test::ACMRandom;
+
+extern "C" {
+int av1_neg_interleave(int x, int ref, int max);
+int av1_neg_deinterleave(int diff, int ref, int max);
+}
+
+namespace {
+
+struct Segment {
+ int id;
+ int pred;
+ int last_id;
+};
+
+Segment GenerateSegment(int seed) {
+ static const int MAX_SEGMENTS = 8;
+
+ ACMRandom rnd_(seed);
+
+ Segment segment;
+ const int last_segid = rnd_.PseudoUniform(MAX_SEGMENTS);
+ segment.last_id = last_segid;
+ segment.pred = rnd_.PseudoUniform(MAX_SEGMENTS);
+ segment.id = rnd_.PseudoUniform(last_segid + 1);
+
+ return segment;
+}
+
+// Try to reveal a mismatch between segment binarization and debinarization
+TEST(SegmentBinarizationSync, SearchForBinarizationMismatch) {
+ const int count_tests = 1000;
+ const int seed_init = 4321;
+
+ for (int i = 0; i < count_tests; ++i) {
+ const Segment seg = GenerateSegment(seed_init + i);
+
+ const int max_segid = seg.last_id + 1;
+ const int seg_diff = av1_neg_interleave(seg.id, seg.pred, max_segid);
+ const int decoded_segid =
+ av1_neg_deinterleave(seg_diff, seg.pred, max_segid);
+
+ ASSERT_EQ(decoded_segid, seg.id);
+ }
+}
+
+} // namespace
diff --git a/third_party/aom/test/selfguided_filter_test.cc b/third_party/aom/test/selfguided_filter_test.cc
new file mode 100644
index 000000000..d2d5c6105
--- /dev/null
+++ b/third_party/aom/test/selfguided_filter_test.cc
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ctime>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/mv.h"
+#include "av1/common/restoration.h"
+
+namespace {
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+
+typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride,
+ int eps, const int *xqd, uint8_t *dst8, int dst_stride,
+ int32_t *tmpbuf, int bit_depth, int highbd);
+
+// Test parameter list:
+// <tst_fun_>
+typedef tuple<SgrFunc> FilterTestParam;
+
+class AV1SelfguidedFilterTest
+ : public ::testing::TestWithParam<FilterTestParam> {
+ public:
+ virtual ~AV1SelfguidedFilterTest() {}
+ virtual void SetUp() {}
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunSpeedTest() {
+ tst_fun_ = GET_PARAM(0);
+ const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+ const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+ const int width = 256, height = 256, stride = 288, out_stride = 288;
+ const int NUM_ITERS = 2000;
+ int i, j, k;
+
+ uint8_t *input_ =
+ (uint8_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint8_t));
+ uint8_t *output_ = (uint8_t *)aom_memalign(
+ 32, out_stride * (height + 32) * sizeof(uint8_t));
+ int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+ uint8_t *input = input_ + stride * 16 + 16;
+ uint8_t *output = output_ + out_stride * 16 + 16;
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ for (i = -16; i < height + 16; ++i)
+ for (j = -16; j < width + 16; ++j)
+ input[i * stride + j] = rnd.Rand16() & 0xFF;
+
+ int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+ SGRPROJ_PRJ_MIN0),
+ SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+ SGRPROJ_PRJ_MIN1) };
+ // Fix a parameter set, since the speed depends slightly on r.
+ // Change this to test different combinations of values of r.
+ int eps = 15;
+
+ av1_loop_restoration_precal();
+
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (i = 0; i < NUM_ITERS; ++i) {
+ for (k = 0; k < height; k += pu_height)
+ for (j = 0; j < width; j += pu_width) {
+ int w = AOMMIN(pu_width, width - j);
+ int h = AOMMIN(pu_height, height - k);
+ uint8_t *input_p = input + k * stride + j;
+ uint8_t *output_p = output + k * out_stride + j;
+ apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+ output_p, out_stride, tmpbuf, 8, 0);
+ }
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
+ for (i = 0; i < NUM_ITERS; ++i) {
+ for (k = 0; k < height; k += pu_height)
+ for (j = 0; j < width; j += pu_width) {
+ int w = AOMMIN(pu_width, width - j);
+ int h = AOMMIN(pu_height, height - k);
+ uint8_t *input_p = input + k * stride + j;
+ uint8_t *output_p = output + k * out_stride + j;
+ tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
+ tmpbuf, 8, 0);
+ }
+ }
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+ << "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
+
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(tmpbuf);
+ }
+
+ void RunCorrectnessTest() {
+ tst_fun_ = GET_PARAM(0);
+ const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+ const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+ // Set the maximum width/height to test here. We actually test a small
+    // range of sizes *up to* this size, so that we can check, e.g.,
+ // the behaviour on tiles which are not a multiple of 4 wide.
+ const int max_w = 260, max_h = 260, stride = 672, out_stride = 672;
+ const int NUM_ITERS = 81;
+ int i, j, k;
+
+ uint8_t *input_ =
+ (uint8_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint8_t));
+ uint8_t *output_ = (uint8_t *)aom_memalign(
+ 32, out_stride * (max_h + 32) * sizeof(uint8_t));
+ uint8_t *output2_ = (uint8_t *)aom_memalign(
+ 32, out_stride * (max_h + 32) * sizeof(uint8_t));
+ int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+
+ uint8_t *input = input_ + stride * 16 + 16;
+ uint8_t *output = output_ + out_stride * 16 + 16;
+ uint8_t *output2 = output2_ + out_stride * 16 + 16;
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ av1_loop_restoration_precal();
+
+ for (i = 0; i < NUM_ITERS; ++i) {
+ for (j = -16; j < max_h + 16; ++j)
+ for (k = -16; k < max_w + 16; ++k)
+ input[j * stride + k] = rnd.Rand16() & 0xFF;
+
+ int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+ SGRPROJ_PRJ_MIN0),
+ SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+ SGRPROJ_PRJ_MIN1) };
+ int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);
+
+ // Test various tile sizes around 256x256
+ int test_w = max_w - (i / 9);
+ int test_h = max_h - (i % 9);
+
+ for (k = 0; k < test_h; k += pu_height)
+ for (j = 0; j < test_w; j += pu_width) {
+ int w = AOMMIN(pu_width, test_w - j);
+ int h = AOMMIN(pu_height, test_h - k);
+ uint8_t *input_p = input + k * stride + j;
+ uint8_t *output_p = output + k * out_stride + j;
+ uint8_t *output2_p = output2 + k * out_stride + j;
+ tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride,
+ tmpbuf, 8, 0);
+ apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd,
+ output2_p, out_stride, tmpbuf, 8, 0);
+ }
+
+ for (j = 0; j < test_h; ++j)
+ for (k = 0; k < test_w; ++k) {
+ ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
+ }
+ }
+
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output2_);
+ aom_free(tmpbuf);
+ }
+
+ private:
+ SgrFunc tst_fun_;
+};
+
+TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
+TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1SelfguidedFilterTest,
+ ::testing::Values(apply_selfguided_restoration_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AV1SelfguidedFilterTest,
+ ::testing::Values(apply_selfguided_restoration_avx2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AV1SelfguidedFilterTest,
+ ::testing::Values(apply_selfguided_restoration_neon));
+#endif
+
+// Test parameter list:
+// <tst_fun_, bit_depth>
+typedef tuple<SgrFunc, int> HighbdFilterTestParam;
+
+class AV1HighbdSelfguidedFilterTest
+ : public ::testing::TestWithParam<HighbdFilterTestParam> {
+ public:
+ virtual ~AV1HighbdSelfguidedFilterTest() {}
+ virtual void SetUp() {}
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunSpeedTest() {
+ tst_fun_ = GET_PARAM(0);
+ const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+ const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+ const int width = 256, height = 256, stride = 288, out_stride = 288;
+ const int NUM_ITERS = 2000;
+ int i, j, k;
+ int bit_depth = GET_PARAM(1);
+ int mask = (1 << bit_depth) - 1;
+
+ uint16_t *input_ =
+ (uint16_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint16_t));
+ uint16_t *output_ = (uint16_t *)aom_memalign(
+ 32, out_stride * (height + 32) * sizeof(uint16_t));
+ int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+ uint16_t *input = input_ + stride * 16 + 16;
+ uint16_t *output = output_ + out_stride * 16 + 16;
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ for (i = -16; i < height + 16; ++i)
+ for (j = -16; j < width + 16; ++j)
+ input[i * stride + j] = rnd.Rand16() & mask;
+
+ int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+ SGRPROJ_PRJ_MIN0),
+ SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+ SGRPROJ_PRJ_MIN1) };
+ // Fix a parameter set, since the speed depends slightly on r.
+ // Change this to test different combinations of values of r.
+ int eps = 15;
+
+ av1_loop_restoration_precal();
+
+ aom_usec_timer ref_timer;
+ aom_usec_timer_start(&ref_timer);
+ for (i = 0; i < NUM_ITERS; ++i) {
+ for (k = 0; k < height; k += pu_height)
+ for (j = 0; j < width; j += pu_width) {
+ int w = AOMMIN(pu_width, width - j);
+ int h = AOMMIN(pu_height, height - k);
+ uint16_t *input_p = input + k * stride + j;
+ uint16_t *output_p = output + k * out_stride + j;
+ apply_selfguided_restoration_c(
+ CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+ CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1);
+ }
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+ aom_usec_timer tst_timer;
+ aom_usec_timer_start(&tst_timer);
+ for (i = 0; i < NUM_ITERS; ++i) {
+ for (k = 0; k < height; k += pu_height)
+ for (j = 0; j < width; j += pu_width) {
+ int w = AOMMIN(pu_width, width - j);
+ int h = AOMMIN(pu_height, height - k);
+ uint16_t *input_p = input + k * stride + j;
+ uint16_t *output_p = output + k * out_stride + j;
+ tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+ CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
+ 1);
+ }
+ }
+ aom_usec_timer_mark(&tst_timer);
+ const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+ std::cout << "[ ] C time = " << ref_time / 1000
+ << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+ EXPECT_GT(ref_time, tst_time)
+ << "Error: AV1HighbdSelfguidedFilterTest.SpeedTest, SIMD slower than "
+ "C.\n"
+ << "C time: " << ref_time << " us\n"
+ << "SIMD time: " << tst_time << " us\n";
+
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(tmpbuf);
+ }
+
+ void RunCorrectnessTest() {
+ tst_fun_ = GET_PARAM(0);
+ const int pu_width = RESTORATION_PROC_UNIT_SIZE;
+ const int pu_height = RESTORATION_PROC_UNIT_SIZE;
+ // Set the maximum width/height to test here. We actually test a small
+    // range of sizes *up to* this size, so that we can check, e.g.,
+ // the behaviour on tiles which are not a multiple of 4 wide.
+ const int max_w = 260, max_h = 260, stride = 672, out_stride = 672;
+ const int NUM_ITERS = 81;
+ int i, j, k;
+ int bit_depth = GET_PARAM(1);
+ int mask = (1 << bit_depth) - 1;
+
+ uint16_t *input_ =
+ (uint16_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint16_t));
+ uint16_t *output_ = (uint16_t *)aom_memalign(
+ 32, out_stride * (max_h + 32) * sizeof(uint16_t));
+ uint16_t *output2_ = (uint16_t *)aom_memalign(
+ 32, out_stride * (max_h + 32) * sizeof(uint16_t));
+ int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE);
+
+ uint16_t *input = input_ + stride * 16 + 16;
+ uint16_t *output = output_ + out_stride * 16 + 16;
+ uint16_t *output2 = output2_ + out_stride * 16 + 16;
+
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ av1_loop_restoration_precal();
+
+ for (i = 0; i < NUM_ITERS; ++i) {
+ for (j = -16; j < max_h + 16; ++j)
+ for (k = -16; k < max_w + 16; ++k)
+ input[j * stride + k] = rnd.Rand16() & mask;
+
+ int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 -
+ SGRPROJ_PRJ_MIN0),
+ SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 -
+ SGRPROJ_PRJ_MIN1) };
+ int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS);
+
+ // Test various tile sizes around 256x256
+ int test_w = max_w - (i / 9);
+ int test_h = max_h - (i % 9);
+
+ for (k = 0; k < test_h; k += pu_height)
+ for (j = 0; j < test_w; j += pu_width) {
+ int w = AOMMIN(pu_width, test_w - j);
+ int h = AOMMIN(pu_height, test_h - k);
+ uint16_t *input_p = input + k * stride + j;
+ uint16_t *output_p = output + k * out_stride + j;
+ uint16_t *output2_p = output2 + k * out_stride + j;
+ tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+ CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth,
+ 1);
+ apply_selfguided_restoration_c(
+ CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd,
+ CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1);
+ }
+
+ for (j = 0; j < test_h; ++j)
+ for (k = 0; k < test_w; ++k)
+ ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]);
+ }
+
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output2_);
+ aom_free(tmpbuf);
+ }
+
+ private:
+ SgrFunc tst_fun_;
+};
+
+TEST_P(AV1HighbdSelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); }
+TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); }
+
+#if HAVE_SSE4_1
+const int highbd_params_sse4_1[] = { 8, 10, 12 };
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1HighbdSelfguidedFilterTest,
+ ::testing::Combine(::testing::Values(apply_selfguided_restoration_sse4_1),
+ ::testing::ValuesIn(highbd_params_sse4_1)));
+#endif
+
+#if HAVE_AVX2
+const int highbd_params_avx2[] = { 8, 10, 12 };
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1HighbdSelfguidedFilterTest,
+ ::testing::Combine(::testing::Values(apply_selfguided_restoration_avx2),
+ ::testing::ValuesIn(highbd_params_avx2)));
+#endif
+#if HAVE_NEON
+const int highbd_params_neon[] = { 8, 10, 12 };
+INSTANTIATE_TEST_CASE_P(
+ NEON, AV1HighbdSelfguidedFilterTest,
+ ::testing::Combine(::testing::Values(apply_selfguided_restoration_neon),
+ ::testing::ValuesIn(highbd_params_neon)));
+#endif
+} // namespace
diff --git a/third_party/aom/test/set_maps.sh b/third_party/aom/test/set_maps.sh
new file mode 100755
index 000000000..4f59b06d6
--- /dev/null
+++ b/third_party/aom/test/set_maps.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom set_maps example. To add new tests to this file,
+## do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to set_maps_tests (on a new line).
+##
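+## For example, a second hypothetical test (the name below is illustrative
+## only) would follow the same pattern as set_maps_av1 further down:
+##   set_maps_av1_again() {
+##     if [ "$(av1_encode_available)" = "yes" ]; then
+##       set_maps av1 || return 1
+##     fi
+##   }
+## and would be registered by appending its name to set_maps_tests.
+##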
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in
+# $LIBAOM_BIN_PATH.
+set_maps_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+ if [ -z "$(aom_tool_path set_maps)" ]; then
+ elog "set_maps not found. It must exist in LIBAOM_BIN_PATH or its parent."
+ return 1
+ fi
+}
+
+# Runs set_maps using the codec specified by $1.
+set_maps() {
+ local encoder="$(aom_tool_path set_maps)"
+ local codec="$1"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf"
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+ ${devnull}
+
+ [ -e "${output_file}" ] || return 1
+}
+
+set_maps_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ set_maps av1 || return 1
+ fi
+}
+
+set_maps_tests="set_maps_av1"
+
+run_tests set_maps_verify_environment "${set_maps_tests}"
diff --git a/third_party/aom/test/simd_avx2_test.cc b/third_party/aom/test/simd_avx2_test.cc
new file mode 100644
index 000000000..8a012bff8
--- /dev/null
+++ b/third_party/aom/test/simd_avx2_test.cc
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define ARCH AVX2
+#define ARCH_POSTFIX(name) name##_avx2
+#define SIMD_NAMESPACE simd_test_avx2
+#include "test/simd_impl.h"
diff --git a/third_party/aom/test/simd_cmp_avx2.cc b/third_party/aom/test/simd_cmp_avx2.cc
new file mode 100644
index 000000000..cda632bcd
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_avx2.cc
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define ARCH AVX2
+#define ARCH_POSTFIX(name) name##_avx2
+#define SIMD_NAMESPACE simd_test_avx2
+#include "test/simd_cmp_impl.h"
diff --git a/third_party/aom/test/simd_cmp_impl.h b/third_party/aom/test/simd_cmp_impl.h
new file mode 100644
index 000000000..b98af9aad
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_impl.h
@@ -0,0 +1,2171 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "aom_dsp/aom_simd.h"
+#undef SIMD_INLINE
+#define SIMD_INLINE static // Don't enforce inlining
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+
+// Machine tuned code goes into this file. This file is included from
+// simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc which define the macros
+// ARCH (=neon, sse2, ssse3, etc), SIMD_NAMESPACE and ARCH_POSTFIX().
+
+#ifdef _MSC_VER
+// Disable "value of intrinsic immediate argument 'value' is out of range
+// 'lowerbound - upperbound'" warning. Visual Studio emits this warning though
+// the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a
+// mask doesn't always appear to be sufficient.
+#pragma warning(disable : 4556)
+#endif
+
+using libaom_test::ACMRandom;
+
+namespace SIMD_NAMESPACE {
+
+// Wrap templates around intrinsics using immediate values
+template <int shift>
+v64 imm_v64_shl_n_byte(v64 a) {
+ return v64_shl_n_byte(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_byte(v64 a) {
+ return v64_shr_n_byte(a, shift);
+}
+template <int shift>
+v64 imm_v64_shl_n_8(v64 a) {
+ return v64_shl_n_8(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_u8(v64 a) {
+ return v64_shr_n_u8(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_s8(v64 a) {
+ return v64_shr_n_s8(a, shift);
+}
+template <int shift>
+v64 imm_v64_shl_n_16(v64 a) {
+ return v64_shl_n_16(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_u16(v64 a) {
+ return v64_shr_n_u16(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_s16(v64 a) {
+ return v64_shr_n_s16(a, shift);
+}
+template <int shift>
+v64 imm_v64_shl_n_32(v64 a) {
+ return v64_shl_n_32(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_u32(v64 a) {
+ return v64_shr_n_u32(a, shift);
+}
+template <int shift>
+v64 imm_v64_shr_n_s32(v64 a) {
+ return v64_shr_n_s32(a, shift);
+}
+template <int shift>
+v64 imm_v64_align(v64 a, v64 b) {
+ return v64_align(a, b, shift);
+}
+
+// Wrap templates around corresponding C implementations of the above
+template <int shift>
+c_v64 c_imm_v64_shl_n_byte(c_v64 a) {
+ return c_v64_shl_n_byte(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_byte(c_v64 a) {
+ return c_v64_shr_n_byte(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shl_n_8(c_v64 a) {
+ return c_v64_shl_n_8(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_u8(c_v64 a) {
+ return c_v64_shr_n_u8(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_s8(c_v64 a) {
+ return c_v64_shr_n_s8(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shl_n_16(c_v64 a) {
+ return c_v64_shl_n_16(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_u16(c_v64 a) {
+ return c_v64_shr_n_u16(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_s16(c_v64 a) {
+ return c_v64_shr_n_s16(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shl_n_32(c_v64 a) {
+ return c_v64_shl_n_32(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_u32(c_v64 a) {
+ return c_v64_shr_n_u32(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_shr_n_s32(c_v64 a) {
+ return c_v64_shr_n_s32(a, shift);
+}
+template <int shift>
+c_v64 c_imm_v64_align(c_v64 a, c_v64 b) {
+ return c_v64_align(a, b, shift);
+}
+
+template <int shift>
+v128 imm_v128_shl_n_byte(v128 a) {
+ return v128_shl_n_byte(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_byte(v128 a) {
+ return v128_shr_n_byte(a, shift);
+}
+template <int shift>
+v128 imm_v128_shl_n_8(v128 a) {
+ return v128_shl_n_8(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u8(v128 a) {
+ return v128_shr_n_u8(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s8(v128 a) {
+ return v128_shr_n_s8(a, shift);
+}
+template <int shift>
+v128 imm_v128_shl_n_16(v128 a) {
+ return v128_shl_n_16(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u16(v128 a) {
+ return v128_shr_n_u16(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s16(v128 a) {
+ return v128_shr_n_s16(a, shift);
+}
+template <int shift>
+v128 imm_v128_shl_n_32(v128 a) {
+ return v128_shl_n_32(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u32(v128 a) {
+ return v128_shr_n_u32(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s32(v128 a) {
+ return v128_shr_n_s32(a, shift);
+}
+template <int shift>
+v128 imm_v128_shl_n_64(v128 a) {
+ return v128_shl_n_64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u64(v128 a) {
+ return v128_shr_n_u64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s64(v128 a) {
+ return v128_shr_n_s64(a, shift);
+}
+template <int shift>
+v128 imm_v128_align(v128 a, v128 b) {
+ return v128_align(a, b, shift);
+}
+
+template <int shift>
+c_v128 c_imm_v128_shl_n_byte(c_v128 a) {
+ return c_v128_shl_n_byte(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_byte(c_v128 a) {
+ return c_v128_shr_n_byte(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shl_n_8(c_v128 a) {
+ return c_v128_shl_n_8(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u8(c_v128 a) {
+ return c_v128_shr_n_u8(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s8(c_v128 a) {
+ return c_v128_shr_n_s8(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shl_n_16(c_v128 a) {
+ return c_v128_shl_n_16(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u16(c_v128 a) {
+ return c_v128_shr_n_u16(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s16(c_v128 a) {
+ return c_v128_shr_n_s16(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shl_n_32(c_v128 a) {
+ return c_v128_shl_n_32(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u32(c_v128 a) {
+ return c_v128_shr_n_u32(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s32(c_v128 a) {
+ return c_v128_shr_n_s32(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shl_n_64(c_v128 a) {
+ return c_v128_shl_n_64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
+ return c_v128_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
+ return c_v128_shr_n_s64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
+ return c_v128_align(a, b, shift);
+}
+
+template <int shift>
+v256 imm_v256_shl_n_word(v256 a) {
+ return v256_shl_n_word(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_word(v256 a) {
+ return v256_shr_n_word(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_byte(v256 a) {
+ return v256_shl_n_byte(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_byte(v256 a) {
+ return v256_shr_n_byte(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_8(v256 a) {
+ return v256_shl_n_8(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u8(v256 a) {
+ return v256_shr_n_u8(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s8(v256 a) {
+ return v256_shr_n_s8(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_16(v256 a) {
+ return v256_shl_n_16(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u16(v256 a) {
+ return v256_shr_n_u16(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s16(v256 a) {
+ return v256_shr_n_s16(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_32(v256 a) {
+ return v256_shl_n_32(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u32(v256 a) {
+ return v256_shr_n_u32(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s32(v256 a) {
+ return v256_shr_n_s32(a, shift);
+}
+template <int shift>
+v256 imm_v256_shl_n_64(v256 a) {
+ return v256_shl_n_64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u64(v256 a) {
+ return v256_shr_n_u64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s64(v256 a) {
+ return v256_shr_n_s64(a, shift);
+}
+template <int shift>
+v256 imm_v256_align(v256 a, v256 b) {
+ return v256_align(a, b, shift);
+}
+
+template <int shift>
+c_v256 c_imm_v256_shl_n_word(c_v256 a) {
+ return c_v256_shl_n_word(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_word(c_v256 a) {
+ return c_v256_shr_n_word(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_byte(c_v256 a) {
+ return c_v256_shl_n_byte(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_byte(c_v256 a) {
+ return c_v256_shr_n_byte(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_8(c_v256 a) {
+ return c_v256_shl_n_8(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u8(c_v256 a) {
+ return c_v256_shr_n_u8(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s8(c_v256 a) {
+ return c_v256_shr_n_s8(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_16(c_v256 a) {
+ return c_v256_shl_n_16(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u16(c_v256 a) {
+ return c_v256_shr_n_u16(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s16(c_v256 a) {
+ return c_v256_shr_n_s16(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_32(c_v256 a) {
+ return c_v256_shl_n_32(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u32(c_v256 a) {
+ return c_v256_shr_n_u32(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s32(c_v256 a) {
+ return c_v256_shr_n_s32(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shl_n_64(c_v256 a) {
+ return c_v256_shl_n_64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
+ return c_v256_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
+ return c_v256_shr_n_s64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
+ return c_v256_align(a, b, shift);
+}
+
+// Wrappers around the SAD and SSD functions
+uint32_t v64_sad_u8(v64 a, v64 b) {
+ return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b));
+}
+uint32_t v64_ssd_u8(v64 a, v64 b) {
+ return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b));
+}
+
+uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) {
+ return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b));
+}
+uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) {
+ return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b));
+}
+uint32_t v128_sad_u8(v128 a, v128 b) {
+ return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b));
+}
+uint32_t v128_ssd_u8(v128 a, v128 b) {
+ return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b));
+}
+uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) {
+ return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b));
+}
+uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
+ return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
+}
+uint32_t v128_sad_u16(v128 a, v128 b) {
+ return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
+}
+uint64_t v128_ssd_s16(v128 a, v128 b) {
+ return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
+}
+uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
+ return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
+}
+uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
+ return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
+}
+uint32_t v256_sad_u8(v256 a, v256 b) {
+ return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
+}
+uint32_t v256_ssd_u8(v256 a, v256 b) {
+ return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b));
+}
+uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) {
+ return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b));
+}
+uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
+ return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
+}
+uint32_t v256_sad_u16(v256 a, v256 b) {
+ return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
+}
+uint64_t v256_ssd_s16(v256 a, v256 b) {
+ return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
+}
+uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
+ return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
+}
+uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
+ return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
+}
+
+namespace {
+
+typedef void (*fptr)();
+
+typedef struct {
+ const char *name;
+ fptr ref;
+ fptr simd;
+} mapping;
+
+#define MAP(name) \
+ { \
+ #name, reinterpret_cast<fptr>(c_##name), \
+ reinterpret_cast<fptr>(name) \
+ }
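+// For illustration, MAP(v64_sad_u8) expands to roughly
+//   { "v64_sad_u8", reinterpret_cast<fptr>(c_v64_sad_u8),
+//     reinterpret_cast<fptr>(v64_sad_u8) }
+// i.e. each table entry pairs a C reference function with its machine-tuned
+// counterpart under a common lookup name.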
+
+const mapping m[] = { MAP(v64_sad_u8),
+ MAP(v64_ssd_u8),
+ MAP(v64_add_8),
+ MAP(v64_add_16),
+ MAP(v64_sadd_s8),
+ MAP(v64_sadd_u8),
+ MAP(v64_sadd_s16),
+ MAP(v64_add_32),
+ MAP(v64_sub_8),
+ MAP(v64_ssub_u8),
+ MAP(v64_ssub_s8),
+ MAP(v64_sub_16),
+ MAP(v64_ssub_s16),
+ MAP(v64_ssub_u16),
+ MAP(v64_sub_32),
+ MAP(v64_ziplo_8),
+ MAP(v64_ziphi_8),
+ MAP(v64_ziplo_16),
+ MAP(v64_ziphi_16),
+ MAP(v64_ziplo_32),
+ MAP(v64_ziphi_32),
+ MAP(v64_pack_s32_u16),
+ MAP(v64_pack_s32_s16),
+ MAP(v64_pack_s16_u8),
+ MAP(v64_pack_s16_s8),
+ MAP(v64_unziphi_8),
+ MAP(v64_unziplo_8),
+ MAP(v64_unziphi_16),
+ MAP(v64_unziplo_16),
+ MAP(v64_or),
+ MAP(v64_xor),
+ MAP(v64_and),
+ MAP(v64_andn),
+ MAP(v64_mullo_s16),
+ MAP(v64_mulhi_s16),
+ MAP(v64_mullo_s32),
+ MAP(v64_madd_s16),
+ MAP(v64_madd_us8),
+ MAP(v64_avg_u8),
+ MAP(v64_rdavg_u8),
+ MAP(v64_rdavg_u16),
+ MAP(v64_avg_u16),
+ MAP(v64_min_u8),
+ MAP(v64_max_u8),
+ MAP(v64_min_s8),
+ MAP(v64_max_s8),
+ MAP(v64_min_s16),
+ MAP(v64_max_s16),
+ MAP(v64_cmpgt_s8),
+ MAP(v64_cmplt_s8),
+ MAP(v64_cmpeq_8),
+ MAP(v64_cmpgt_s16),
+ MAP(v64_cmplt_s16),
+ MAP(v64_cmpeq_16),
+ MAP(v64_shuffle_8),
+ MAP(imm_v64_align<1>),
+ MAP(imm_v64_align<2>),
+ MAP(imm_v64_align<3>),
+ MAP(imm_v64_align<4>),
+ MAP(imm_v64_align<5>),
+ MAP(imm_v64_align<6>),
+ MAP(imm_v64_align<7>),
+ MAP(v64_abs_s8),
+ MAP(v64_abs_s16),
+ MAP(v64_unpacklo_u8_s16),
+ MAP(v64_unpackhi_u8_s16),
+ MAP(v64_unpacklo_s8_s16),
+ MAP(v64_unpackhi_s8_s16),
+ MAP(v64_unpacklo_u16_s32),
+ MAP(v64_unpacklo_s16_s32),
+ MAP(v64_unpackhi_u16_s32),
+ MAP(v64_unpackhi_s16_s32),
+ MAP(imm_v64_shr_n_byte<1>),
+ MAP(imm_v64_shr_n_byte<2>),
+ MAP(imm_v64_shr_n_byte<3>),
+ MAP(imm_v64_shr_n_byte<4>),
+ MAP(imm_v64_shr_n_byte<5>),
+ MAP(imm_v64_shr_n_byte<6>),
+ MAP(imm_v64_shr_n_byte<7>),
+ MAP(imm_v64_shl_n_byte<1>),
+ MAP(imm_v64_shl_n_byte<2>),
+ MAP(imm_v64_shl_n_byte<3>),
+ MAP(imm_v64_shl_n_byte<4>),
+ MAP(imm_v64_shl_n_byte<5>),
+ MAP(imm_v64_shl_n_byte<6>),
+ MAP(imm_v64_shl_n_byte<7>),
+ MAP(imm_v64_shl_n_8<1>),
+ MAP(imm_v64_shl_n_8<2>),
+ MAP(imm_v64_shl_n_8<3>),
+ MAP(imm_v64_shl_n_8<4>),
+ MAP(imm_v64_shl_n_8<5>),
+ MAP(imm_v64_shl_n_8<6>),
+ MAP(imm_v64_shl_n_8<7>),
+ MAP(imm_v64_shr_n_u8<1>),
+ MAP(imm_v64_shr_n_u8<2>),
+ MAP(imm_v64_shr_n_u8<3>),
+ MAP(imm_v64_shr_n_u8<4>),
+ MAP(imm_v64_shr_n_u8<5>),
+ MAP(imm_v64_shr_n_u8<6>),
+ MAP(imm_v64_shr_n_u8<7>),
+ MAP(imm_v64_shr_n_s8<1>),
+ MAP(imm_v64_shr_n_s8<2>),
+ MAP(imm_v64_shr_n_s8<3>),
+ MAP(imm_v64_shr_n_s8<4>),
+ MAP(imm_v64_shr_n_s8<5>),
+ MAP(imm_v64_shr_n_s8<6>),
+ MAP(imm_v64_shr_n_s8<7>),
+ MAP(imm_v64_shl_n_16<1>),
+ MAP(imm_v64_shl_n_16<2>),
+ MAP(imm_v64_shl_n_16<4>),
+ MAP(imm_v64_shl_n_16<6>),
+ MAP(imm_v64_shl_n_16<8>),
+ MAP(imm_v64_shl_n_16<10>),
+ MAP(imm_v64_shl_n_16<12>),
+ MAP(imm_v64_shl_n_16<14>),
+ MAP(imm_v64_shr_n_u16<1>),
+ MAP(imm_v64_shr_n_u16<2>),
+ MAP(imm_v64_shr_n_u16<4>),
+ MAP(imm_v64_shr_n_u16<6>),
+ MAP(imm_v64_shr_n_u16<8>),
+ MAP(imm_v64_shr_n_u16<10>),
+ MAP(imm_v64_shr_n_u16<12>),
+ MAP(imm_v64_shr_n_u16<14>),
+ MAP(imm_v64_shr_n_s16<1>),
+ MAP(imm_v64_shr_n_s16<2>),
+ MAP(imm_v64_shr_n_s16<4>),
+ MAP(imm_v64_shr_n_s16<6>),
+ MAP(imm_v64_shr_n_s16<8>),
+ MAP(imm_v64_shr_n_s16<10>),
+ MAP(imm_v64_shr_n_s16<12>),
+ MAP(imm_v64_shr_n_s16<14>),
+ MAP(imm_v64_shl_n_32<1>),
+ MAP(imm_v64_shl_n_32<4>),
+ MAP(imm_v64_shl_n_32<8>),
+ MAP(imm_v64_shl_n_32<12>),
+ MAP(imm_v64_shl_n_32<16>),
+ MAP(imm_v64_shl_n_32<20>),
+ MAP(imm_v64_shl_n_32<24>),
+ MAP(imm_v64_shl_n_32<28>),
+ MAP(imm_v64_shr_n_u32<1>),
+ MAP(imm_v64_shr_n_u32<4>),
+ MAP(imm_v64_shr_n_u32<8>),
+ MAP(imm_v64_shr_n_u32<12>),
+ MAP(imm_v64_shr_n_u32<16>),
+ MAP(imm_v64_shr_n_u32<20>),
+ MAP(imm_v64_shr_n_u32<24>),
+ MAP(imm_v64_shr_n_u32<28>),
+ MAP(imm_v64_shr_n_s32<1>),
+ MAP(imm_v64_shr_n_s32<4>),
+ MAP(imm_v64_shr_n_s32<8>),
+ MAP(imm_v64_shr_n_s32<12>),
+ MAP(imm_v64_shr_n_s32<16>),
+ MAP(imm_v64_shr_n_s32<20>),
+ MAP(imm_v64_shr_n_s32<24>),
+ MAP(imm_v64_shr_n_s32<28>),
+ MAP(v64_shl_8),
+ MAP(v64_shr_u8),
+ MAP(v64_shr_s8),
+ MAP(v64_shl_16),
+ MAP(v64_shr_u16),
+ MAP(v64_shr_s16),
+ MAP(v64_shl_32),
+ MAP(v64_shr_u32),
+ MAP(v64_shr_s32),
+ MAP(v64_hadd_u8),
+ MAP(v64_hadd_s16),
+ MAP(v64_dotp_s16),
+ MAP(v64_dotp_su8),
+ MAP(v64_u64),
+ MAP(v64_low_u32),
+ MAP(v64_high_u32),
+ MAP(v64_low_s32),
+ MAP(v64_high_s32),
+ MAP(v64_dup_8),
+ MAP(v64_dup_16),
+ MAP(v64_dup_32),
+ MAP(v64_from_32),
+ MAP(v64_zero),
+ MAP(v64_from_16),
+ MAP(v128_sad_u8),
+ MAP(v128_ssd_u8),
+ MAP(v128_sad_u16),
+ MAP(v128_ssd_s16),
+ MAP(v128_add_8),
+ MAP(v128_add_16),
+ MAP(v128_sadd_s8),
+ MAP(v128_sadd_u8),
+ MAP(v128_sadd_s16),
+ MAP(v128_add_32),
+ MAP(v128_add_64),
+ MAP(v128_sub_8),
+ MAP(v128_ssub_u8),
+ MAP(v128_ssub_s8),
+ MAP(v128_sub_16),
+ MAP(v128_ssub_s16),
+ MAP(v128_ssub_u16),
+ MAP(v128_sub_32),
+ MAP(v128_sub_64),
+ MAP(v128_ziplo_8),
+ MAP(v128_ziphi_8),
+ MAP(v128_ziplo_16),
+ MAP(v128_ziphi_16),
+ MAP(v128_ziplo_32),
+ MAP(v128_ziphi_32),
+ MAP(v128_ziplo_64),
+ MAP(v128_ziphi_64),
+ MAP(v128_unziphi_8),
+ MAP(v128_unziplo_8),
+ MAP(v128_unziphi_16),
+ MAP(v128_unziplo_16),
+ MAP(v128_unziphi_32),
+ MAP(v128_unziplo_32),
+ MAP(v128_pack_s32_u16),
+ MAP(v128_pack_s32_s16),
+ MAP(v128_pack_s16_u8),
+ MAP(v128_pack_s16_s8),
+ MAP(v128_or),
+ MAP(v128_xor),
+ MAP(v128_and),
+ MAP(v128_andn),
+ MAP(v128_mullo_s16),
+ MAP(v128_mulhi_s16),
+ MAP(v128_mullo_s32),
+ MAP(v128_madd_s16),
+ MAP(v128_madd_us8),
+ MAP(v128_avg_u8),
+ MAP(v128_rdavg_u8),
+ MAP(v128_rdavg_u16),
+ MAP(v128_avg_u16),
+ MAP(v128_min_u8),
+ MAP(v128_max_u8),
+ MAP(v128_min_s8),
+ MAP(v128_max_s8),
+ MAP(v128_min_s16),
+ MAP(v128_max_s16),
+ MAP(v128_min_s32),
+ MAP(v128_max_s32),
+ MAP(v128_cmpgt_s8),
+ MAP(v128_cmplt_s8),
+ MAP(v128_cmpeq_8),
+ MAP(v128_cmpgt_s16),
+ MAP(v128_cmpeq_16),
+ MAP(v128_cmplt_s16),
+ MAP(v128_cmpgt_s32),
+ MAP(v128_cmpeq_32),
+ MAP(v128_cmplt_s32),
+ MAP(v128_shuffle_8),
+ MAP(imm_v128_align<1>),
+ MAP(imm_v128_align<2>),
+ MAP(imm_v128_align<3>),
+ MAP(imm_v128_align<4>),
+ MAP(imm_v128_align<5>),
+ MAP(imm_v128_align<6>),
+ MAP(imm_v128_align<7>),
+ MAP(imm_v128_align<8>),
+ MAP(imm_v128_align<9>),
+ MAP(imm_v128_align<10>),
+ MAP(imm_v128_align<11>),
+ MAP(imm_v128_align<12>),
+ MAP(imm_v128_align<13>),
+ MAP(imm_v128_align<14>),
+ MAP(imm_v128_align<15>),
+ MAP(v128_abs_s8),
+ MAP(v128_abs_s16),
+ MAP(v128_padd_u8),
+ MAP(v128_padd_s16),
+ MAP(v128_unpacklo_u16_s32),
+ MAP(v128_unpacklo_s16_s32),
+ MAP(v128_unpackhi_u16_s32),
+ MAP(v128_unpackhi_s16_s32),
+ MAP(imm_v128_shr_n_byte<1>),
+ MAP(imm_v128_shr_n_byte<2>),
+ MAP(imm_v128_shr_n_byte<3>),
+ MAP(imm_v128_shr_n_byte<4>),
+ MAP(imm_v128_shr_n_byte<5>),
+ MAP(imm_v128_shr_n_byte<6>),
+ MAP(imm_v128_shr_n_byte<7>),
+ MAP(imm_v128_shr_n_byte<8>),
+ MAP(imm_v128_shr_n_byte<9>),
+ MAP(imm_v128_shr_n_byte<10>),
+ MAP(imm_v128_shr_n_byte<11>),
+ MAP(imm_v128_shr_n_byte<12>),
+ MAP(imm_v128_shr_n_byte<13>),
+ MAP(imm_v128_shr_n_byte<14>),
+ MAP(imm_v128_shr_n_byte<15>),
+ MAP(imm_v128_shl_n_byte<1>),
+ MAP(imm_v128_shl_n_byte<2>),
+ MAP(imm_v128_shl_n_byte<3>),
+ MAP(imm_v128_shl_n_byte<4>),
+ MAP(imm_v128_shl_n_byte<5>),
+ MAP(imm_v128_shl_n_byte<6>),
+ MAP(imm_v128_shl_n_byte<7>),
+ MAP(imm_v128_shl_n_byte<8>),
+ MAP(imm_v128_shl_n_byte<9>),
+ MAP(imm_v128_shl_n_byte<10>),
+ MAP(imm_v128_shl_n_byte<11>),
+ MAP(imm_v128_shl_n_byte<12>),
+ MAP(imm_v128_shl_n_byte<13>),
+ MAP(imm_v128_shl_n_byte<14>),
+ MAP(imm_v128_shl_n_byte<15>),
+ MAP(imm_v128_shl_n_8<1>),
+ MAP(imm_v128_shl_n_8<2>),
+ MAP(imm_v128_shl_n_8<3>),
+ MAP(imm_v128_shl_n_8<4>),
+ MAP(imm_v128_shl_n_8<5>),
+ MAP(imm_v128_shl_n_8<6>),
+ MAP(imm_v128_shl_n_8<7>),
+ MAP(imm_v128_shr_n_u8<1>),
+ MAP(imm_v128_shr_n_u8<2>),
+ MAP(imm_v128_shr_n_u8<3>),
+ MAP(imm_v128_shr_n_u8<4>),
+ MAP(imm_v128_shr_n_u8<5>),
+ MAP(imm_v128_shr_n_u8<6>),
+ MAP(imm_v128_shr_n_u8<7>),
+ MAP(imm_v128_shr_n_s8<1>),
+ MAP(imm_v128_shr_n_s8<2>),
+ MAP(imm_v128_shr_n_s8<3>),
+ MAP(imm_v128_shr_n_s8<4>),
+ MAP(imm_v128_shr_n_s8<5>),
+ MAP(imm_v128_shr_n_s8<6>),
+ MAP(imm_v128_shr_n_s8<7>),
+ MAP(imm_v128_shl_n_16<1>),
+ MAP(imm_v128_shl_n_16<2>),
+ MAP(imm_v128_shl_n_16<4>),
+ MAP(imm_v128_shl_n_16<6>),
+ MAP(imm_v128_shl_n_16<8>),
+ MAP(imm_v128_shl_n_16<10>),
+ MAP(imm_v128_shl_n_16<12>),
+ MAP(imm_v128_shl_n_16<14>),
+ MAP(imm_v128_shr_n_u16<1>),
+ MAP(imm_v128_shr_n_u16<2>),
+ MAP(imm_v128_shr_n_u16<4>),
+ MAP(imm_v128_shr_n_u16<6>),
+ MAP(imm_v128_shr_n_u16<8>),
+ MAP(imm_v128_shr_n_u16<10>),
+ MAP(imm_v128_shr_n_u16<12>),
+ MAP(imm_v128_shr_n_u16<14>),
+ MAP(imm_v128_shr_n_s16<1>),
+ MAP(imm_v128_shr_n_s16<2>),
+ MAP(imm_v128_shr_n_s16<4>),
+ MAP(imm_v128_shr_n_s16<6>),
+ MAP(imm_v128_shr_n_s16<8>),
+ MAP(imm_v128_shr_n_s16<10>),
+ MAP(imm_v128_shr_n_s16<12>),
+ MAP(imm_v128_shr_n_s16<14>),
+ MAP(imm_v128_shl_n_32<1>),
+ MAP(imm_v128_shl_n_32<4>),
+ MAP(imm_v128_shl_n_32<8>),
+ MAP(imm_v128_shl_n_32<12>),
+ MAP(imm_v128_shl_n_32<16>),
+ MAP(imm_v128_shl_n_32<20>),
+ MAP(imm_v128_shl_n_32<24>),
+ MAP(imm_v128_shl_n_32<28>),
+ MAP(imm_v128_shr_n_u32<1>),
+ MAP(imm_v128_shr_n_u32<4>),
+ MAP(imm_v128_shr_n_u32<8>),
+ MAP(imm_v128_shr_n_u32<12>),
+ MAP(imm_v128_shr_n_u32<16>),
+ MAP(imm_v128_shr_n_u32<20>),
+ MAP(imm_v128_shr_n_u32<24>),
+ MAP(imm_v128_shr_n_u32<28>),
+ MAP(imm_v128_shr_n_s32<1>),
+ MAP(imm_v128_shr_n_s32<4>),
+ MAP(imm_v128_shr_n_s32<8>),
+ MAP(imm_v128_shr_n_s32<12>),
+ MAP(imm_v128_shr_n_s32<16>),
+ MAP(imm_v128_shr_n_s32<20>),
+ MAP(imm_v128_shr_n_s32<24>),
+ MAP(imm_v128_shr_n_s32<28>),
+ MAP(imm_v128_shl_n_64<1>),
+ MAP(imm_v128_shl_n_64<4>),
+ MAP(imm_v128_shl_n_64<8>),
+ MAP(imm_v128_shl_n_64<12>),
+ MAP(imm_v128_shl_n_64<16>),
+ MAP(imm_v128_shl_n_64<20>),
+ MAP(imm_v128_shl_n_64<24>),
+ MAP(imm_v128_shl_n_64<28>),
+ MAP(imm_v128_shl_n_64<32>),
+ MAP(imm_v128_shl_n_64<36>),
+ MAP(imm_v128_shl_n_64<40>),
+ MAP(imm_v128_shl_n_64<44>),
+ MAP(imm_v128_shl_n_64<48>),
+ MAP(imm_v128_shl_n_64<52>),
+ MAP(imm_v128_shl_n_64<56>),
+ MAP(imm_v128_shl_n_64<60>),
+ MAP(imm_v128_shr_n_u64<1>),
+ MAP(imm_v128_shr_n_u64<4>),
+ MAP(imm_v128_shr_n_u64<8>),
+ MAP(imm_v128_shr_n_u64<12>),
+ MAP(imm_v128_shr_n_u64<16>),
+ MAP(imm_v128_shr_n_u64<20>),
+ MAP(imm_v128_shr_n_u64<24>),
+ MAP(imm_v128_shr_n_u64<28>),
+ MAP(imm_v128_shr_n_u64<32>),
+ MAP(imm_v128_shr_n_u64<36>),
+ MAP(imm_v128_shr_n_u64<40>),
+ MAP(imm_v128_shr_n_u64<44>),
+ MAP(imm_v128_shr_n_u64<48>),
+ MAP(imm_v128_shr_n_u64<52>),
+ MAP(imm_v128_shr_n_u64<56>),
+ MAP(imm_v128_shr_n_u64<60>),
+ MAP(imm_v128_shr_n_s64<1>),
+ MAP(imm_v128_shr_n_s64<4>),
+ MAP(imm_v128_shr_n_s64<8>),
+ MAP(imm_v128_shr_n_s64<12>),
+ MAP(imm_v128_shr_n_s64<16>),
+ MAP(imm_v128_shr_n_s64<20>),
+ MAP(imm_v128_shr_n_s64<24>),
+ MAP(imm_v128_shr_n_s64<28>),
+ MAP(imm_v128_shr_n_s64<32>),
+ MAP(imm_v128_shr_n_s64<36>),
+ MAP(imm_v128_shr_n_s64<40>),
+ MAP(imm_v128_shr_n_s64<44>),
+ MAP(imm_v128_shr_n_s64<48>),
+ MAP(imm_v128_shr_n_s64<52>),
+ MAP(imm_v128_shr_n_s64<56>),
+ MAP(imm_v128_shr_n_s64<60>),
+ MAP(v128_from_v64),
+ MAP(v128_zip_8),
+ MAP(v128_zip_16),
+ MAP(v128_zip_32),
+ MAP(v128_mul_s16),
+ MAP(v128_unpack_u8_s16),
+ MAP(v128_unpack_s8_s16),
+ MAP(v128_unpack_u16_s32),
+ MAP(v128_unpack_s16_s32),
+ MAP(v128_shl_8),
+ MAP(v128_shr_u8),
+ MAP(v128_shr_s8),
+ MAP(v128_shl_16),
+ MAP(v128_shr_u16),
+ MAP(v128_shr_s16),
+ MAP(v128_shl_32),
+ MAP(v128_shr_u32),
+ MAP(v128_shr_s32),
+ MAP(v128_shl_64),
+ MAP(v128_shr_u64),
+ MAP(v128_shr_s64),
+ MAP(v128_hadd_u8),
+ MAP(v128_dotp_su8),
+ MAP(v128_dotp_s16),
+ MAP(v128_dotp_s32),
+ MAP(v128_low_u32),
+ MAP(v128_low_v64),
+ MAP(v128_high_v64),
+ MAP(v128_from_64),
+ MAP(v128_from_32),
+ MAP(v128_movemask_8),
+ MAP(v128_zero),
+ MAP(v128_dup_8),
+ MAP(v128_dup_16),
+ MAP(v128_dup_32),
+ MAP(v128_dup_64),
+ MAP(v128_unpacklo_u8_s16),
+ MAP(v128_unpackhi_u8_s16),
+ MAP(v128_unpacklo_s8_s16),
+ MAP(v128_unpackhi_s8_s16),
+ MAP(v128_blend_8),
+ MAP(u32_load_unaligned),
+ MAP(u32_store_unaligned),
+ MAP(v64_load_unaligned),
+ MAP(v64_store_unaligned),
+ MAP(v128_load_unaligned),
+ MAP(v128_store_unaligned),
+ MAP(v256_sad_u8),
+ MAP(v256_ssd_u8),
+ MAP(v256_sad_u16),
+ MAP(v256_ssd_s16),
+ MAP(v256_hadd_u8),
+ MAP(v256_low_u64),
+ MAP(v256_dotp_su8),
+ MAP(v256_dotp_s16),
+ MAP(v256_dotp_s32),
+ MAP(v256_add_8),
+ MAP(v256_add_16),
+ MAP(v256_sadd_s8),
+ MAP(v256_sadd_u8),
+ MAP(v256_sadd_s16),
+ MAP(v256_add_32),
+ MAP(v256_add_64),
+ MAP(v256_sub_8),
+ MAP(v256_ssub_u8),
+ MAP(v256_ssub_s8),
+ MAP(v256_sub_16),
+ MAP(v256_ssub_u16),
+ MAP(v256_ssub_s16),
+ MAP(v256_sub_32),
+ MAP(v256_sub_64),
+ MAP(v256_ziplo_8),
+ MAP(v256_ziphi_8),
+ MAP(v256_ziplo_16),
+ MAP(v256_ziphi_16),
+ MAP(v256_ziplo_32),
+ MAP(v256_ziphi_32),
+ MAP(v256_ziplo_64),
+ MAP(v256_ziphi_64),
+ MAP(v256_unziphi_8),
+ MAP(v256_unziplo_8),
+ MAP(v256_unziphi_16),
+ MAP(v256_unziplo_16),
+ MAP(v256_unziphi_32),
+ MAP(v256_unziplo_32),
+ MAP(v256_unziphi_64),
+ MAP(v256_unziplo_64),
+ MAP(v256_pack_s32_u16),
+ MAP(v256_pack_s32_s16),
+ MAP(v256_pack_s16_u8),
+ MAP(v256_pack_s16_s8),
+ MAP(v256_or),
+ MAP(v256_xor),
+ MAP(v256_and),
+ MAP(v256_andn),
+ MAP(v256_mullo_s16),
+ MAP(v256_mulhi_s16),
+ MAP(v256_mullo_s32),
+ MAP(v256_madd_s16),
+ MAP(v256_madd_us8),
+ MAP(v256_avg_u8),
+ MAP(v256_rdavg_u8),
+ MAP(v256_rdavg_u16),
+ MAP(v256_avg_u16),
+ MAP(v256_min_u8),
+ MAP(v256_max_u8),
+ MAP(v256_min_s8),
+ MAP(v256_max_s8),
+ MAP(v256_min_s16),
+ MAP(v256_max_s16),
+ MAP(v256_min_s32),
+ MAP(v256_max_s32),
+ MAP(v256_cmpgt_s8),
+ MAP(v256_cmplt_s8),
+ MAP(v256_cmpeq_8),
+ MAP(v256_cmpgt_s16),
+ MAP(v256_cmplt_s16),
+ MAP(v256_cmpeq_16),
+ MAP(v256_cmpgt_s32),
+ MAP(v256_cmplt_s32),
+ MAP(v256_cmpeq_32),
+ MAP(v256_shuffle_8),
+ MAP(v256_pshuffle_8),
+ MAP(v256_wideshuffle_8),
+ MAP(imm_v256_align<1>),
+ MAP(imm_v256_align<2>),
+ MAP(imm_v256_align<3>),
+ MAP(imm_v256_align<4>),
+ MAP(imm_v256_align<5>),
+ MAP(imm_v256_align<6>),
+ MAP(imm_v256_align<7>),
+ MAP(imm_v256_align<8>),
+ MAP(imm_v256_align<9>),
+ MAP(imm_v256_align<10>),
+ MAP(imm_v256_align<11>),
+ MAP(imm_v256_align<12>),
+ MAP(imm_v256_align<13>),
+ MAP(imm_v256_align<14>),
+ MAP(imm_v256_align<15>),
+ MAP(imm_v256_align<16>),
+ MAP(imm_v256_align<17>),
+ MAP(imm_v256_align<18>),
+ MAP(imm_v256_align<19>),
+ MAP(imm_v256_align<20>),
+ MAP(imm_v256_align<21>),
+ MAP(imm_v256_align<22>),
+ MAP(imm_v256_align<23>),
+ MAP(imm_v256_align<24>),
+ MAP(imm_v256_align<25>),
+ MAP(imm_v256_align<26>),
+ MAP(imm_v256_align<27>),
+ MAP(imm_v256_align<28>),
+ MAP(imm_v256_align<29>),
+ MAP(imm_v256_align<30>),
+ MAP(imm_v256_align<31>),
+ MAP(v256_from_v128),
+ MAP(v256_zip_8),
+ MAP(v256_zip_16),
+ MAP(v256_zip_32),
+ MAP(v256_mul_s16),
+ MAP(v256_unpack_u8_s16),
+ MAP(v256_unpack_s8_s16),
+ MAP(v256_unpack_u16_s32),
+ MAP(v256_unpack_s16_s32),
+ MAP(v256_shl_8),
+ MAP(v256_shr_u8),
+ MAP(v256_shr_s8),
+ MAP(v256_shl_16),
+ MAP(v256_shr_u16),
+ MAP(v256_shr_s16),
+ MAP(v256_shl_32),
+ MAP(v256_shr_u32),
+ MAP(v256_shr_s32),
+ MAP(v256_shl_64),
+ MAP(v256_shr_u64),
+ MAP(v256_shr_s64),
+ MAP(v256_abs_s8),
+ MAP(v256_abs_s16),
+ MAP(v256_padd_u8),
+ MAP(v256_padd_s16),
+ MAP(v256_unpacklo_u16_s32),
+ MAP(v256_unpacklo_s16_s32),
+ MAP(v256_unpackhi_u16_s32),
+ MAP(v256_unpackhi_s16_s32),
+ MAP(imm_v256_shr_n_word<1>),
+ MAP(imm_v256_shr_n_word<2>),
+ MAP(imm_v256_shr_n_word<3>),
+ MAP(imm_v256_shr_n_word<4>),
+ MAP(imm_v256_shr_n_word<5>),
+ MAP(imm_v256_shr_n_word<6>),
+ MAP(imm_v256_shr_n_word<7>),
+ MAP(imm_v256_shr_n_word<8>),
+ MAP(imm_v256_shr_n_word<9>),
+ MAP(imm_v256_shr_n_word<10>),
+ MAP(imm_v256_shr_n_word<11>),
+ MAP(imm_v256_shr_n_word<12>),
+ MAP(imm_v256_shr_n_word<13>),
+ MAP(imm_v256_shr_n_word<14>),
+ MAP(imm_v256_shr_n_word<15>),
+ MAP(imm_v256_shl_n_word<1>),
+ MAP(imm_v256_shl_n_word<2>),
+ MAP(imm_v256_shl_n_word<3>),
+ MAP(imm_v256_shl_n_word<4>),
+ MAP(imm_v256_shl_n_word<5>),
+ MAP(imm_v256_shl_n_word<6>),
+ MAP(imm_v256_shl_n_word<7>),
+ MAP(imm_v256_shl_n_word<8>),
+ MAP(imm_v256_shl_n_word<9>),
+ MAP(imm_v256_shl_n_word<10>),
+ MAP(imm_v256_shl_n_word<11>),
+ MAP(imm_v256_shl_n_word<12>),
+ MAP(imm_v256_shl_n_word<13>),
+ MAP(imm_v256_shl_n_word<14>),
+ MAP(imm_v256_shl_n_word<15>),
+ MAP(imm_v256_shr_n_byte<1>),
+ MAP(imm_v256_shr_n_byte<2>),
+ MAP(imm_v256_shr_n_byte<3>),
+ MAP(imm_v256_shr_n_byte<4>),
+ MAP(imm_v256_shr_n_byte<5>),
+ MAP(imm_v256_shr_n_byte<6>),
+ MAP(imm_v256_shr_n_byte<7>),
+ MAP(imm_v256_shr_n_byte<8>),
+ MAP(imm_v256_shr_n_byte<9>),
+ MAP(imm_v256_shr_n_byte<10>),
+ MAP(imm_v256_shr_n_byte<11>),
+ MAP(imm_v256_shr_n_byte<12>),
+ MAP(imm_v256_shr_n_byte<13>),
+ MAP(imm_v256_shr_n_byte<14>),
+ MAP(imm_v256_shr_n_byte<15>),
+ MAP(imm_v256_shr_n_byte<16>),
+ MAP(imm_v256_shr_n_byte<17>),
+ MAP(imm_v256_shr_n_byte<18>),
+ MAP(imm_v256_shr_n_byte<19>),
+ MAP(imm_v256_shr_n_byte<20>),
+ MAP(imm_v256_shr_n_byte<21>),
+ MAP(imm_v256_shr_n_byte<22>),
+ MAP(imm_v256_shr_n_byte<23>),
+ MAP(imm_v256_shr_n_byte<24>),
+ MAP(imm_v256_shr_n_byte<25>),
+ MAP(imm_v256_shr_n_byte<26>),
+ MAP(imm_v256_shr_n_byte<27>),
+ MAP(imm_v256_shr_n_byte<28>),
+ MAP(imm_v256_shr_n_byte<29>),
+ MAP(imm_v256_shr_n_byte<30>),
+ MAP(imm_v256_shr_n_byte<31>),
+ MAP(imm_v256_shl_n_byte<1>),
+ MAP(imm_v256_shl_n_byte<2>),
+ MAP(imm_v256_shl_n_byte<3>),
+ MAP(imm_v256_shl_n_byte<4>),
+ MAP(imm_v256_shl_n_byte<5>),
+ MAP(imm_v256_shl_n_byte<6>),
+ MAP(imm_v256_shl_n_byte<7>),
+ MAP(imm_v256_shl_n_byte<8>),
+ MAP(imm_v256_shl_n_byte<9>),
+ MAP(imm_v256_shl_n_byte<10>),
+ MAP(imm_v256_shl_n_byte<11>),
+ MAP(imm_v256_shl_n_byte<12>),
+ MAP(imm_v256_shl_n_byte<13>),
+ MAP(imm_v256_shl_n_byte<14>),
+ MAP(imm_v256_shl_n_byte<15>),
+ MAP(imm_v256_shl_n_byte<16>),
+ MAP(imm_v256_shl_n_byte<17>),
+ MAP(imm_v256_shl_n_byte<18>),
+ MAP(imm_v256_shl_n_byte<19>),
+ MAP(imm_v256_shl_n_byte<20>),
+ MAP(imm_v256_shl_n_byte<21>),
+ MAP(imm_v256_shl_n_byte<22>),
+ MAP(imm_v256_shl_n_byte<23>),
+ MAP(imm_v256_shl_n_byte<24>),
+ MAP(imm_v256_shl_n_byte<25>),
+ MAP(imm_v256_shl_n_byte<26>),
+ MAP(imm_v256_shl_n_byte<27>),
+ MAP(imm_v256_shl_n_byte<28>),
+ MAP(imm_v256_shl_n_byte<29>),
+ MAP(imm_v256_shl_n_byte<30>),
+ MAP(imm_v256_shl_n_byte<31>),
+ MAP(imm_v256_shl_n_8<1>),
+ MAP(imm_v256_shl_n_8<2>),
+ MAP(imm_v256_shl_n_8<3>),
+ MAP(imm_v256_shl_n_8<4>),
+ MAP(imm_v256_shl_n_8<5>),
+ MAP(imm_v256_shl_n_8<6>),
+ MAP(imm_v256_shl_n_8<7>),
+ MAP(imm_v256_shr_n_u8<1>),
+ MAP(imm_v256_shr_n_u8<2>),
+ MAP(imm_v256_shr_n_u8<3>),
+ MAP(imm_v256_shr_n_u8<4>),
+ MAP(imm_v256_shr_n_u8<5>),
+ MAP(imm_v256_shr_n_u8<6>),
+ MAP(imm_v256_shr_n_u8<7>),
+ MAP(imm_v256_shr_n_s8<1>),
+ MAP(imm_v256_shr_n_s8<2>),
+ MAP(imm_v256_shr_n_s8<3>),
+ MAP(imm_v256_shr_n_s8<4>),
+ MAP(imm_v256_shr_n_s8<5>),
+ MAP(imm_v256_shr_n_s8<6>),
+ MAP(imm_v256_shr_n_s8<7>),
+ MAP(imm_v256_shl_n_16<1>),
+ MAP(imm_v256_shl_n_16<2>),
+ MAP(imm_v256_shl_n_16<4>),
+ MAP(imm_v256_shl_n_16<6>),
+ MAP(imm_v256_shl_n_16<8>),
+ MAP(imm_v256_shl_n_16<10>),
+ MAP(imm_v256_shl_n_16<12>),
+ MAP(imm_v256_shl_n_16<14>),
+ MAP(imm_v256_shr_n_u16<1>),
+ MAP(imm_v256_shr_n_u16<2>),
+ MAP(imm_v256_shr_n_u16<4>),
+ MAP(imm_v256_shr_n_u16<6>),
+ MAP(imm_v256_shr_n_u16<8>),
+ MAP(imm_v256_shr_n_u16<10>),
+ MAP(imm_v256_shr_n_u16<12>),
+ MAP(imm_v256_shr_n_u16<14>),
+ MAP(imm_v256_shr_n_s16<1>),
+ MAP(imm_v256_shr_n_s16<2>),
+ MAP(imm_v256_shr_n_s16<4>),
+ MAP(imm_v256_shr_n_s16<6>),
+ MAP(imm_v256_shr_n_s16<8>),
+ MAP(imm_v256_shr_n_s16<10>),
+ MAP(imm_v256_shr_n_s16<12>),
+ MAP(imm_v256_shr_n_s16<14>),
+ MAP(imm_v256_shl_n_32<1>),
+ MAP(imm_v256_shl_n_32<4>),
+ MAP(imm_v256_shl_n_32<8>),
+ MAP(imm_v256_shl_n_32<12>),
+ MAP(imm_v256_shl_n_32<16>),
+ MAP(imm_v256_shl_n_32<20>),
+ MAP(imm_v256_shl_n_32<24>),
+ MAP(imm_v256_shl_n_32<28>),
+ MAP(imm_v256_shr_n_u32<1>),
+ MAP(imm_v256_shr_n_u32<4>),
+ MAP(imm_v256_shr_n_u32<8>),
+ MAP(imm_v256_shr_n_u32<12>),
+ MAP(imm_v256_shr_n_u32<16>),
+ MAP(imm_v256_shr_n_u32<20>),
+ MAP(imm_v256_shr_n_u32<24>),
+ MAP(imm_v256_shr_n_u32<28>),
+ MAP(imm_v256_shr_n_s32<1>),
+ MAP(imm_v256_shr_n_s32<4>),
+ MAP(imm_v256_shr_n_s32<8>),
+ MAP(imm_v256_shr_n_s32<12>),
+ MAP(imm_v256_shr_n_s32<16>),
+ MAP(imm_v256_shr_n_s32<20>),
+ MAP(imm_v256_shr_n_s32<24>),
+ MAP(imm_v256_shr_n_s32<28>),
+ MAP(imm_v256_shl_n_64<1>),
+ MAP(imm_v256_shl_n_64<4>),
+ MAP(imm_v256_shl_n_64<8>),
+ MAP(imm_v256_shl_n_64<12>),
+ MAP(imm_v256_shl_n_64<16>),
+ MAP(imm_v256_shl_n_64<20>),
+ MAP(imm_v256_shl_n_64<24>),
+ MAP(imm_v256_shl_n_64<28>),
+ MAP(imm_v256_shl_n_64<32>),
+ MAP(imm_v256_shl_n_64<36>),
+ MAP(imm_v256_shl_n_64<40>),
+ MAP(imm_v256_shl_n_64<44>),
+ MAP(imm_v256_shl_n_64<48>),
+ MAP(imm_v256_shl_n_64<52>),
+ MAP(imm_v256_shl_n_64<56>),
+ MAP(imm_v256_shl_n_64<60>),
+ MAP(imm_v256_shr_n_u64<1>),
+ MAP(imm_v256_shr_n_u64<4>),
+ MAP(imm_v256_shr_n_u64<8>),
+ MAP(imm_v256_shr_n_u64<12>),
+ MAP(imm_v256_shr_n_u64<16>),
+ MAP(imm_v256_shr_n_u64<20>),
+ MAP(imm_v256_shr_n_u64<24>),
+ MAP(imm_v256_shr_n_u64<28>),
+ MAP(imm_v256_shr_n_u64<32>),
+ MAP(imm_v256_shr_n_u64<36>),
+ MAP(imm_v256_shr_n_u64<40>),
+ MAP(imm_v256_shr_n_u64<44>),
+ MAP(imm_v256_shr_n_u64<48>),
+ MAP(imm_v256_shr_n_u64<52>),
+ MAP(imm_v256_shr_n_u64<56>),
+ MAP(imm_v256_shr_n_u64<60>),
+ MAP(imm_v256_shr_n_s64<1>),
+ MAP(imm_v256_shr_n_s64<4>),
+ MAP(imm_v256_shr_n_s64<8>),
+ MAP(imm_v256_shr_n_s64<12>),
+ MAP(imm_v256_shr_n_s64<16>),
+ MAP(imm_v256_shr_n_s64<20>),
+ MAP(imm_v256_shr_n_s64<24>),
+ MAP(imm_v256_shr_n_s64<28>),
+ MAP(imm_v256_shr_n_s64<32>),
+ MAP(imm_v256_shr_n_s64<36>),
+ MAP(imm_v256_shr_n_s64<40>),
+ MAP(imm_v256_shr_n_s64<44>),
+ MAP(imm_v256_shr_n_s64<48>),
+ MAP(imm_v256_shr_n_s64<52>),
+ MAP(imm_v256_shr_n_s64<56>),
+ MAP(imm_v256_shr_n_s64<60>),
+ MAP(v256_movemask_8),
+ MAP(v256_zero),
+ MAP(v256_dup_8),
+ MAP(v256_dup_16),
+ MAP(v256_dup_32),
+ MAP(v256_dup_64),
+ MAP(v256_low_u32),
+ MAP(v256_low_v64),
+ MAP(v256_from_64),
+ MAP(v256_from_v64),
+ MAP(v256_ziplo_128),
+ MAP(v256_ziphi_128),
+ MAP(v256_unpacklo_u8_s16),
+ MAP(v256_unpackhi_u8_s16),
+ MAP(v256_unpacklo_s8_s16),
+ MAP(v256_unpackhi_s8_s16),
+ MAP(v256_blend_8),
+ { NULL, NULL, NULL } };
+#undef MAP
+
+// Map reference functions to machine-tuned functions. Since the
+// functions depend on machine-tuned types, the non-machine-tuned
+// instantiations of the test can't refer to these functions directly,
+// so we refer to them by name and do the mapping here.
+void Map(const char *name, fptr *ref, fptr *simd) {
+ unsigned int i;
+ for (i = 0; m[i].name && strcmp(name, m[i].name); i++) {
+ }
+
+ *ref = m[i].ref;
+ *simd = m[i].simd;
+}
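+// If the name is not present in the table, the loop stops at the
+// { NULL, NULL, NULL } sentinel and both pointers are returned as NULL;
+// the TestSimd*Arg callers treat that as an internal error.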
+
+// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
+std::string Print(const uint8_t *a, int size) {
+ std::string text = "0x";
+ for (int i = 0; i < size; i++) {
+ const uint8_t c = a[!CONFIG_BIG_ENDIAN ? size - 1 - i : i];
+ // Same as snprintf(..., ..., "%02x", c)
+ text += (c >> 4) + '0' + ((c >> 4) > 9) * ('a' - '0' - 10);
+ text += (c & 15) + '0' + ((c & 15) > 9) * ('a' - '0' - 10);
+ }
+
+ return text;
+}
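+// For example, a byte c = 0x4a prints as "4a": the high nibble 4 is not
+// greater than 9, so it becomes 4 + '0' = '4'; the low nibble 10 is, so it
+// becomes 10 + '0' + ('a' - '0' - 10) = 'a'.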
+
+// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
+// ranges
+void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
+ switch (maskwidth) {
+ case 0: {
+ break;
+ }
+ case 8: {
+ for (int i = 0; i < size; i++) s[i] &= mask;
+ break;
+ }
+ case 16: {
+ uint16_t *t = reinterpret_cast<uint16_t *>(s);
+ assert(!(reinterpret_cast<uintptr_t>(s) & 1));
+ for (int i = 0; i < size / 2; i++) t[i] &= mask;
+ break;
+ }
+ case 32: {
+ uint32_t *t = reinterpret_cast<uint32_t *>(s);
+ assert(!(reinterpret_cast<uintptr_t>(s) & 3));
+ for (int i = 0; i < size / 4; i++) t[i] &= mask;
+ break;
+ }
+ case 64: {
+ uint64_t *t = reinterpret_cast<uint64_t *>(s);
+ assert(!(reinterpret_cast<uintptr_t>(s) & 7));
+ for (int i = 0; i < size / 8; i++) t[i] &= mask;
+ break;
+ }
+ default: {
+ FAIL() << "Unsupported mask width";
+ break;
+ }
+ }
+}
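+// For example (hypothetical call), SetMask(s, 4, 31, 32) would AND the
+// 32-bit value in s with 31, which is how a shift count can be kept within
+// the 0..31 range that a 32-bit shift accepts.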
+
+// We need some extra load/store functions
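+// (scalar wrappers with the same (pointer, value) store and (pointer) load
+// shape as the vector versions, so that they can be cast to fptr and passed
+// through the generic comparison templates below)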
+void u64_store_aligned(void *p, uint64_t a) {
+ v64_store_aligned(p, v64_from_64(a));
+}
+void s32_store_aligned(void *p, int32_t a) {
+ u32_store_aligned(p, static_cast<uint32_t>(a));
+}
+void s64_store_aligned(void *p, int64_t a) {
+ v64_store_aligned(p, v64_from_64(static_cast<uint64_t>(a)));
+}
+
+void c_u64_store_aligned(void *p, uint64_t a) {
+ c_v64_store_aligned(p, c_v64_from_64(a));
+}
+
+void c_s32_store_aligned(void *p, int32_t a) {
+ c_u32_store_aligned(p, static_cast<uint32_t>(a));
+}
+
+void c_s64_store_aligned(void *p, int64_t a) {
+ c_v64_store_aligned(p, c_v64_from_64(static_cast<uint64_t>(a)));
+}
+
+uint64_t u64_load_aligned(const void *p) {
+ return v64_u64(v64_load_aligned(p));
+}
+uint16_t u16_load_aligned(const void *p) {
+ return *(reinterpret_cast<const uint16_t *>(p));
+}
+uint8_t u8_load_aligned(const void *p) {
+ return *(reinterpret_cast<const uint8_t *>(p));
+}
+
+uint64_t c_u64_load_aligned(const void *p) {
+ return c_v64_u64(c_v64_load_aligned(p));
+}
+uint16_t c_u16_load_aligned(const void *p) {
+ return *(reinterpret_cast<const uint16_t *>(p));
+}
+uint8_t c_u8_load_aligned(const void *p) {
+ return *(reinterpret_cast<const uint8_t *>(p));
+}
+
+// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
+// intrinsics taking 1, 2 or 3 arguments respectively with their
+// corresponding C reference. Ideally, the loads and stores would have
+// gone into the template parameter list, but v64 and v128 may be
+// typedef'ed to the same type (as is the case on x86), in which case
+// v64 and v128 variants cannot both be instantiated, so the function
+// return and argument types, including the always-differing types of
+// the C equivalents, are used instead. The function arguments must be
+// void pointers and then go through a cast, to avoid type-matching
+// errors in the branches eliminated by the typeid tests in the calling
+// function.
+template <typename Ret, typename Arg, typename CRet, typename CArg>
+int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
+ fptr c_load, fptr c_simd, void *ref_d, const void *a) {
+ void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+ Arg (*const my_load)(const void *) = (Arg(*const)(const void *))load;
+ Ret (*const my_simd)(Arg) = (Ret(*const)(Arg))simd;
+ void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+ CArg (*const my_c_load)(const void *) = (CArg(*const)(const void *))c_load;
+ CRet (*const my_c_simd)(CArg) = (CRet(*const)(CArg))c_simd;
+
+ // Call reference and intrinsic
+ my_c_store(ref_d, my_c_simd(my_c_load(a)));
+ my_store(d, my_simd(my_load(a)));
+
+ // Compare results
+ return memcmp(ref_d, d, sizeof(CRet));
+}
+
+template <typename Ret, typename Arg1, typename Arg2, typename CRet,
+ typename CArg1, typename CArg2>
+int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
+ fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
+ void *ref_d, const void *a, const void *b) {
+ void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+ Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+ Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+ Ret (*const my_simd)(Arg1, Arg2) = (Ret(*const)(Arg1, Arg2))simd;
+ void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+ CArg1 (*const my_c_load1)(const void *) =
+ (CArg1(*const)(const void *))c_load1;
+ CArg2 (*const my_c_load2)(const void *) =
+ (CArg2(*const)(const void *))c_load2;
+ CRet (*const my_c_simd)(CArg1, CArg2) = (CRet(*const)(CArg1, CArg2))c_simd;
+
+ // Call reference and intrinsic
+ my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b)));
+ my_store(d, my_simd(my_load1(a), my_load2(b)));
+
+ // Compare results
+ return memcmp(ref_d, d, sizeof(CRet));
+}
+
+template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
+ typename CRet, typename CArg1, typename CArg2, typename CArg3>
+int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
+ void *d, fptr c_store, fptr c_load1, fptr c_load2,
+ fptr c_load3, fptr c_simd, void *ref_d, const void *a,
+ const void *b, const void *c) {
+ void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+ Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+ Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+ Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
+ Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
+ void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+ CArg1 (*const my_c_load1)(const void *) =
+ (CArg1(*const)(const void *))c_load1;
+ CArg2 (*const my_c_load2)(const void *) =
+ (CArg2(*const)(const void *))c_load2;
+ CArg3 (*const my_c_load3)(const void *) =
+ (CArg3(*const)(const void *))c_load3;
+ CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
+ (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
+
+ // Call reference and intrinsic
+ my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
+ my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
+
+ // Compare results
+ return memcmp(ref_d, d, sizeof(CRet));
+}
+
+} // namespace
+
+template <typename CRet, typename CArg>
+void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ fptr ref_simd;
+ fptr simd;
+ int error = 0;
+ DECLARE_ALIGNED(32, uint8_t, s[32]);
+ DECLARE_ALIGNED(32, uint8_t, d[32]);
+ DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+ assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+
+ Map(name, &ref_simd, &simd);
+ if (simd == NULL || ref_simd == NULL) {
+ FAIL() << "Internal error: Unknown intrinsic function " << name;
+ }
+ for (unsigned int count = 0;
+ count < iterations && !error && !testing::Test::HasFailure(); count++) {
+ for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();
+
+ if (maskwidth) {
+ SetMask(s, sizeof(CArg), mask, maskwidth);
+ }
+
+ if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
+ // V64_V64
+ error = CompareSimd1Arg<v64, v64, CRet, CArg>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(uint8_t)) {
+ // V64_U8
+ error = CompareSimd1Arg<v64, uint8_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(uint16_t)) {
+ // V64_U16
+ error = CompareSimd1Arg<v64, uint16_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(uint32_t)) {
+ // V64_U32
+ error = CompareSimd1Arg<v64, uint32_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // U64_V64
+ error = CompareSimd1Arg<uint64_t, v64, CRet, CArg>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(int64_t) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // S64_V64
+ error = CompareSimd1Arg<int64_t, v64, CRet, CArg>(
+ reinterpret_cast<fptr>(s64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // U32_V64
+ error = CompareSimd1Arg<uint32_t, v64, CRet, CArg>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(int32_t) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // S32_V64
+ error = CompareSimd1Arg<int32_t, v64, CRet, CArg>(
+ reinterpret_cast<fptr>(s32_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s32_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // U32_V128
+ error = CompareSimd1Arg<uint32_t, v128, CRet, CArg>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // U64_V128
+ error = CompareSimd1Arg<uint64_t, v128, CRet, CArg>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg) == typeid(c_v256)) {
+ // U64_V256
+ error = CompareSimd1Arg<uint64_t, v256, CRet, CArg>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // V64_V128
+ error = CompareSimd1Arg<v64, v128, CRet, CArg>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // V128_V128
+ error = CompareSimd1Arg<v128, v128, CRet, CArg>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(c_v64)) {
+ // V128_V64
+ error = CompareSimd1Arg<v128, v64, CRet, CArg>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(uint8_t)) {
+ // V128_U8
+ error = CompareSimd1Arg<v128, uint8_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(uint16_t)) {
+ // V128_U16
+ error = CompareSimd1Arg<v128, uint16_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(uint32_t)) {
+ // V128_U32
+ error = CompareSimd1Arg<v128, uint32_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg) == typeid(uint64_t)) {
+ // V128_U64
+ error = CompareSimd1Arg<v128, uint64_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(c_v256)) {
+ // V256_V256
+ error = CompareSimd1Arg<v256, v256, CRet, CArg>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(c_v128)) {
+ // V256_V128
+ error = CompareSimd1Arg<v256, v128, CRet, CArg>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(uint8_t)) {
+ // V256_U8
+ error = CompareSimd1Arg<v256, uint8_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(uint16_t)) {
+ // V256_U16
+ error = CompareSimd1Arg<v256, uint16_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(uint32_t)) {
+ // V256_U32
+ error = CompareSimd1Arg<v256, uint32_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg) == typeid(uint64_t)) {
+ // V256_U64
+ error = CompareSimd1Arg<v256, uint64_t, CRet, CArg>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg) == typeid(c_v256)) {
+ // U32_V256
+ error = CompareSimd1Arg<uint32_t, v256, CRet, CArg>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg) == typeid(c_v256)) {
+ // V64_V256
+ error = CompareSimd1Arg<v64, v256, CRet, CArg>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+ } else {
+ FAIL() << "Internal error: Unknown intrinsic function "
+ << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
+ << ")";
+ }
+ }
+
+ EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+ << Print(s, sizeof(s)) << ") -> " << Print(d, sizeof(d))
+ << " (simd), " << Print(ref_d, sizeof(ref_d)) << " (ref)";
+}
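+// Hypothetical usage (the real invocations are generated elsewhere): a call
+// such as TestSimd1Arg<c_v64, c_v64>(1000, 0, 0, "v64_abs_s8") would run
+// 1000 iterations comparing the SIMD v64_abs_s8 against the C reference
+// c_v64_abs_s8 on random inputs, with no masking of the argument.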
+
+template <typename CRet, typename CArg1, typename CArg2>
+void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ fptr ref_simd;
+ fptr simd;
+ int error = 0;
+ DECLARE_ALIGNED(32, uint8_t, s1[32]);
+ DECLARE_ALIGNED(32, uint8_t, s2[32]);
+ DECLARE_ALIGNED(32, uint8_t, d[32]);
+ DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+ assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+
+ Map(name, &ref_simd, &simd);
+ if (simd == NULL || ref_simd == NULL) {
+ FAIL() << "Internal error: Unknown intrinsic function " << name;
+ }
+
+ for (unsigned int count = 0;
+ count < iterations && !error && !testing::Test::HasFailure(); count++) {
+ for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+ for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+ if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);
+
+ if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(c_v64)) {
+ // V64_V64V64
+ error = CompareSimd2Args<v64, v64, v64, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg1) == typeid(uint32_t) &&
+ typeid(CArg2) == typeid(uint32_t)) {
+ // V64_U32U32
+ error = CompareSimd2Args<v64, uint32_t, uint32_t, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(c_v64)) {
+ // U32_V64V64
+ error = CompareSimd2Args<uint32_t, v64, v64, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(int64_t) &&
+ typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(c_v64)) {
+ // S64_V64V64
+ error = CompareSimd2Args<int64_t, v64, v64, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(s64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v64) &&
+ typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(uint32_t)) {
+ // V64_V64U32
+ error = CompareSimd2Args<v64, v64, uint32_t, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v64_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v64_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // V128_V128V128
+ error = CompareSimd2Args<v128, v128, v128, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // U32_V128V128
+ error = CompareSimd2Args<uint32_t, v128, v128, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // U64_V128V128
+ error = CompareSimd2Args<uint64_t, v128, v128, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(int64_t) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // S64_V128V128
+ error = CompareSimd2Args<int64_t, v128, v128, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(s64_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s64_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg1) == typeid(uint64_t) &&
+ typeid(CArg2) == typeid(uint64_t)) {
+ // V128_U64U64
+ error = CompareSimd2Args<v128, uint64_t, uint64_t, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(u64_load_aligned),
+ reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_u64_load_aligned),
+ reinterpret_cast<fptr>(c_u64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg1) == typeid(c_v64) &&
+ typeid(CArg2) == typeid(c_v64)) {
+ // V128_V64V64
+ error = CompareSimd2Args<v128, v64, v64, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned),
+ reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(c_v64_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v128) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(uint32_t)) {
+ // V128_V128U32
+ error = CompareSimd2Args<v128, v128, uint32_t, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256)) {
+ // V256_V256V256
+ error = CompareSimd2Args<v256, v256, v256, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint64_t) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256)) {
+ // U64_V256V256
+ error = CompareSimd2Args<uint64_t, v256, v256, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(u64_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u64_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(int64_t) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256)) {
+ // S64_V256V256
+ error = CompareSimd2Args<int64_t, v256, v256, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(s64_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_s64_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(uint32_t) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256)) {
+ // U32_V256V256
+ error = CompareSimd2Args<uint32_t, v256, v256, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(u32_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_u32_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128)) {
+ // V256_V128V128
+ error = CompareSimd2Args<v256, v128, v128, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(uint32_t)) {
+ // V256_V256U32
+ error = CompareSimd2Args<v256, v256, uint32_t, CRet, CArg1, CArg2>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_u32_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+
+ } else {
+ FAIL() << "Internal error: Unknown intrinsic function "
+ << typeid(CRet).name() << " " << name << "("
+ << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
+ }
+ }
+
+ EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+ << Print(s1, sizeof(s1)) << ", " << Print(s2, sizeof(s2))
+ << ") -> " << Print(d, sizeof(d)) << " (simd), "
+ << Print(ref_d, sizeof(ref_d)) << " (ref)";
+}
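+// Hypothetical usage: TestSimd2Args<c_v64, c_v64, uint32_t>(1000, 7, 32,
+// "v64_shl_8") would mask the second (shift count) argument down to 0..7 so
+// that the per-byte shift stays within the lane width.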
+
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ fptr ref_simd;
+ fptr simd;
+ int error = 0;
+ DECLARE_ALIGNED(32, uint8_t, s1[32]);
+ DECLARE_ALIGNED(32, uint8_t, s2[32]);
+ DECLARE_ALIGNED(32, uint8_t, s3[32]);
+ DECLARE_ALIGNED(32, uint8_t, d[32]);
+ DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+ assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
+ sizeof(CRet) <= 32);
+ memset(ref_d, 0, sizeof(ref_d));
+ memset(d, 0, sizeof(d));
+
+ Map(name, &ref_simd, &simd);
+ if (simd == NULL || ref_simd == NULL) {
+ FAIL() << "Internal error: Unknown intrinsic function " << name;
+ }
+
+ for (unsigned int count = 0;
+ count < iterations && !error && !testing::Test::HasFailure(); count++) {
+ for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+ for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+ for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
+
+ if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
+
+ if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
+ typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
+ // V128_V128V128V128
+ error =
+ CompareSimd3Args<v128, v128, v128, v128, CRet, CArg1, CArg2, CArg3>(
+ reinterpret_cast<fptr>(v128_store_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned),
+ reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v128_store_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(c_v128_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+ } else if (typeid(CRet) == typeid(c_v256) &&
+ typeid(CArg1) == typeid(c_v256) &&
+ typeid(CArg2) == typeid(c_v256) &&
+ typeid(CArg3) == typeid(c_v256)) {
+ // V256_V256V256V256
+ error =
+ CompareSimd3Args<v256, v256, v256, v256, CRet, CArg1, CArg2, CArg3>(
+ reinterpret_cast<fptr>(v256_store_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned),
+ reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+ reinterpret_cast<fptr>(c_v256_store_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(c_v256_load_aligned),
+ reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+ } else {
+ FAIL() << "Internal error: Unknown intrinsic function "
+ << typeid(CRet).name() << " " << name << "("
+ << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
+ << typeid(CArg3).name() << ")";
+ }
+ }
+
+ EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+ << Print(s1, sizeof(s1)) << ", " << Print(s2, sizeof(s2))
+ << ", " << Print(s3, sizeof(s3)) << ") -> "
+ << Print(d, sizeof(d)) << " (simd), "
+ << Print(ref_d, sizeof(ref_d)) << " (ref)";
+}
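+// Hypothetical usage: TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(1000, 0,
+// 0, "v128_blend_8") would exercise the three-argument blend against its C
+// reference.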
+
+// Instantiations to make the functions callable from other files
+template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
+ const char *);
+template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t, const char *);
+template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
+ uint32_t,
+ const char *);
+
+} // namespace SIMD_NAMESPACE
diff --git a/third_party/aom/test/simd_cmp_neon.cc b/third_party/aom/test/simd_cmp_neon.cc
new file mode 100644
index 000000000..53c1e2a07
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_neon.cc
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if defined(__OPTIMIZE__) && __OPTIMIZE__
+#define ARCH NEON
+#define ARCH_POSTFIX(name) name##_neon
+#define SIMD_NAMESPACE simd_test_neon
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_cmp_sse2.cc b/third_party/aom/test/simd_cmp_sse2.cc
new file mode 100644
index 000000000..f7827a7fa
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_sse2.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSE2
+#define ARCH_POSTFIX(name) name##_sse2
+#define SIMD_NAMESPACE simd_test_sse2
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_cmp_sse4.cc b/third_party/aom/test/simd_cmp_sse4.cc
new file mode 100644
index 000000000..3566764b6
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_sse4.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSE4_1
+#define ARCH_POSTFIX(name) name##_sse4_1
+#define SIMD_NAMESPACE simd_test_sse4_1
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_cmp_ssse3.cc b/third_party/aom/test/simd_cmp_ssse3.cc
new file mode 100644
index 000000000..57bf135dd
--- /dev/null
+++ b/third_party/aom/test/simd_cmp_ssse3.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSSE3
+#define ARCH_POSTFIX(name) name##_ssse3
+#define SIMD_NAMESPACE simd_test_ssse3
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_impl.h b/third_party/aom/test/simd_impl.h
new file mode 100644
index 000000000..fd06f67fd
--- /dev/null
+++ b/third_party/aom/test/simd_impl.h
@@ -0,0 +1,1141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define SIMD_CHECK 1
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "aom_dsp/aom_simd_inline.h"
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+
+namespace SIMD_NAMESPACE {
+
+template <typename param_signature>
+class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
+ public:
+ virtual ~TestIntrinsic() {}
+ virtual void SetUp() {
+ mask = ::testing::get<0>(this->GetParam());
+ maskwidth = ::testing::get<1>(this->GetParam());
+ name = ::testing::get<2>(this->GetParam());
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ uint32_t mask, maskwidth;
+ const char *name;
+};
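+
+// Note on how the pieces fit together: each INSTANTIATE(...) block below
+// supplies (mask, maskwidth, name) tuples, SetUp() unpacks them, and every
+// MY_TEST_P body forwards them to the TestSimd1Arg/TestSimd2Args/TestSimd3Args
+// helpers declared below and defined in the arch-specific simd_cmp_*.cc files
+// (which include simd_cmp_impl.h).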
+
+// Create one typedef for each function signature
+#define TYPEDEF_SIMD(name) \
+ typedef TestIntrinsic< ::testing::tuple<uint32_t, uint32_t, const char *> > \
+ ARCH_POSTFIX(name)
+
+TYPEDEF_SIMD(V64_U8);
+TYPEDEF_SIMD(V64_U16);
+TYPEDEF_SIMD(V64_U32);
+TYPEDEF_SIMD(V64_V64);
+TYPEDEF_SIMD(U32_V64);
+TYPEDEF_SIMD(S32_V64);
+TYPEDEF_SIMD(U64_V64);
+TYPEDEF_SIMD(S64_V64);
+TYPEDEF_SIMD(V64_U32U32);
+TYPEDEF_SIMD(V64_V64V64);
+TYPEDEF_SIMD(S64_V64V64);
+TYPEDEF_SIMD(V64_V64U32);
+TYPEDEF_SIMD(U32_V64V64);
+TYPEDEF_SIMD(V128_V64);
+TYPEDEF_SIMD(V128_V128);
+TYPEDEF_SIMD(U32_V128);
+TYPEDEF_SIMD(U64_V128);
+TYPEDEF_SIMD(V64_V128);
+TYPEDEF_SIMD(V128_U8);
+TYPEDEF_SIMD(V128_U16);
+TYPEDEF_SIMD(V128_U32);
+TYPEDEF_SIMD(V128_U64);
+TYPEDEF_SIMD(V128_U64U64);
+TYPEDEF_SIMD(V128_V64V64);
+TYPEDEF_SIMD(V128_V128V128);
+TYPEDEF_SIMD(V128_V128V128V128);
+TYPEDEF_SIMD(S64_V128V128);
+TYPEDEF_SIMD(V128_V128U32);
+TYPEDEF_SIMD(U32_V128V128);
+TYPEDEF_SIMD(U64_V128V128);
+TYPEDEF_SIMD(V256_V128);
+TYPEDEF_SIMD(V256_V256);
+TYPEDEF_SIMD(U64_V256);
+TYPEDEF_SIMD(V256_V128V128);
+TYPEDEF_SIMD(V256_V256V256);
+TYPEDEF_SIMD(V256_V256V256V256);
+TYPEDEF_SIMD(U64_V256V256);
+TYPEDEF_SIMD(S64_V256V256);
+TYPEDEF_SIMD(V256_V256U32);
+TYPEDEF_SIMD(U32_V256V256);
+TYPEDEF_SIMD(V256_U8);
+TYPEDEF_SIMD(V256_U16);
+TYPEDEF_SIMD(V256_U32);
+TYPEDEF_SIMD(V256_U64);
+TYPEDEF_SIMD(U32_V256);
+TYPEDEF_SIMD(V64_V256);
+
+// Google Test allows up to 50 tests per case, so split the largest
+typedef ARCH_POSTFIX(V64_V64) ARCH_POSTFIX(V64_V64_Part2);
+typedef ARCH_POSTFIX(V64_V64V64) ARCH_POSTFIX(V64_V64V64_Part2);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part2);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part3);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part4);
+typedef ARCH_POSTFIX(V128_V128V128) ARCH_POSTFIX(V128_V128V128_Part2);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part2);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part3);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part4);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part5);
+typedef ARCH_POSTFIX(V256_V256V256) ARCH_POSTFIX(V256_V256V256_Part2);
+
+// These functions are machine tuned and located elsewhere
+template <typename c_ret, typename c_arg>
+void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name);
+
+template <typename c_ret, typename c_arg1, typename c_arg2>
+void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name);
+
+template <typename c_ret, typename c_arg1, typename c_arg2, typename c_arg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+ const char *name);
+
+const int kIterations = 65536;
+
+// Add a macro layer since TEST_P will quote the name, so we need to
+// expand it first with the prefix.
+#define MY_TEST_P(name, test) TEST_P(name, test)
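+// For example (taking the SSE2 build in this patch, where ARCH_POSTFIX(name)
+// expands to name##_sse2): MY_TEST_P(ARCH_POSTFIX(V64_U8), TestIntrinsics)
+// expands its first argument to V64_U8_sse2 before it reaches TEST_P, so the
+// test case name becomes V64_U8_sse2. Passing ARCH_POSTFIX(V64_U8) to TEST_P
+// directly would stringize the unexpanded macro invocation instead.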
+
+MY_TEST_P(ARCH_POSTFIX(V64_U8), TestIntrinsics) {
+ TestSimd1Arg<c_v64, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U16), TestIntrinsics) {
+ TestSimd1Arg<c_v64, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U32), TestIntrinsics) {
+ TestSimd1Arg<c_v64, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64), TestIntrinsics) {
+ TestSimd1Arg<c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V64), TestIntrinsics) {
+ TestSimd1Arg<uint64_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V64), TestIntrinsics) {
+ TestSimd1Arg<int64_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V64), TestIntrinsics) {
+ TestSimd1Arg<uint32_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S32_V64), TestIntrinsics) {
+ TestSimd1Arg<int32_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U32U32), TestIntrinsics) {
+ TestSimd2Args<c_v64, uint32_t, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64V64), TestIntrinsics) {
+ TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V64V64), TestIntrinsics) {
+ TestSimd2Args<int64_t, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V64V64), TestIntrinsics) {
+ TestSimd2Args<uint32_t, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64U32), TestIntrinsics) {
+ TestSimd2Args<c_v64, c_v64, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+// Google Test allows up to 50 tests per case, so split the largest
+MY_TEST_P(ARCH_POSTFIX(V64_V64_Part2), TestIntrinsics) {
+ TestSimd1Arg<c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64V64_Part2), TestIntrinsics) {
+ TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V128), TestIntrinsics) {
+ TestSimd1Arg<uint32_t, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V128), TestIntrinsics) {
+ TestSimd1Arg<uint64_t, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V128), TestIntrinsics) {
+ TestSimd1Arg<c_v64, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U8), TestIntrinsics) {
+ TestSimd1Arg<c_v128, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U16), TestIntrinsics) {
+ TestSimd1Arg<c_v128, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U32), TestIntrinsics) {
+ TestSimd1Arg<c_v128, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U64), TestIntrinsics) {
+ TestSimd1Arg<c_v128, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V64), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128), TestIntrinsics) {
+ TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128V128), TestIntrinsics) {
+ TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(kIterations, mask, maskwidth,
+ name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V128V128), TestIntrinsics) {
+ TestSimd2Args<uint32_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V128V128), TestIntrinsics) {
+ TestSimd2Args<uint64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V128V128), TestIntrinsics) {
+ TestSimd2Args<int64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U64U64), TestIntrinsics) {
+ TestSimd2Args<c_v128, uint64_t, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V64V64), TestIntrinsics) {
+ TestSimd2Args<c_v128, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128U32), TestIntrinsics) {
+ TestSimd2Args<c_v128, c_v128, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128_Part2), TestIntrinsics) {
+ TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part2), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part3), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part4), TestIntrinsics) {
+ TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V256), TestIntrinsics) {
+ TestSimd1Arg<uint64_t, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V128), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256), TestIntrinsics) {
+ TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256V256), TestIntrinsics) {
+ TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(kIterations, mask, maskwidth,
+ name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V128V128), TestIntrinsics) {
+ TestSimd2Args<c_v256, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V256V256), TestIntrinsics) {
+ TestSimd2Args<uint32_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V256V256), TestIntrinsics) {
+ TestSimd2Args<uint64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V256V256), TestIntrinsics) {
+ TestSimd2Args<int64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256_Part2), TestIntrinsics) {
+ TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256U32), TestIntrinsics) {
+ TestSimd2Args<c_v256, c_v256, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part2), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part3), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part4), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part5), TestIntrinsics) {
+ TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U8), TestIntrinsics) {
+ TestSimd1Arg<c_v256, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U16), TestIntrinsics) {
+ TestSimd1Arg<c_v256, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U32), TestIntrinsics) {
+ TestSimd1Arg<c_v256, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U64), TestIntrinsics) {
+ TestSimd1Arg<c_v256, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V256), TestIntrinsics) {
+ TestSimd1Arg<uint32_t, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V256), TestIntrinsics) {
+ TestSimd1Arg<c_v64, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+// Add a macro layer since INSTANTIATE_TEST_CASE_P will quote the name,
+// so we need to expand it first with the prefix.
+#define INSTANTIATE(name, type, ...) \
+ INSTANTIATE_TEST_CASE_P(name, type, ::testing::Values(__VA_ARGS__))
+
+#define SIMD_TUPLE(name, mask, maskwidth) \
+ ::testing::make_tuple(mask, maskwidth, static_cast<const char *>(#name))
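+
+// Each SIMD_TUPLE packs (mask, maskwidth, "name") in the order that SetUp()
+// reads them back. Judging from the instantiations below, a non-zero
+// mask/maskwidth pair constrains the randomly generated operand (e.g.
+// SIMD_TUPLE(v64_shl_8, 7U, 32U) limits the shift count to 0..7), while
+// (0U, 0U) leaves the inputs unconstrained.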
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64V64),
+ (SIMD_TUPLE(v64_sad_u8, 0U, 0U), SIMD_TUPLE(v64_ssd_u8, 0U, 0U)));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V64_V64V64), SIMD_TUPLE(v64_add_8, 0U, 0U),
+ SIMD_TUPLE(v64_add_16, 0U, 0U), SIMD_TUPLE(v64_sadd_s16, 0U, 0U),
+ SIMD_TUPLE(v64_add_32, 0U, 0U), SIMD_TUPLE(v64_sub_8, 0U, 0U),
+ SIMD_TUPLE(v64_ssub_u8, 0U, 0U), SIMD_TUPLE(v64_ssub_s8, 0U, 0U),
+ SIMD_TUPLE(v64_sub_16, 0U, 0U), SIMD_TUPLE(v64_ssub_s16, 0U, 0U),
+ SIMD_TUPLE(v64_ssub_u16, 0U, 0U), SIMD_TUPLE(v64_sub_32, 0U, 0U),
+ SIMD_TUPLE(v64_ziplo_8, 0U, 0U), SIMD_TUPLE(v64_ziphi_8, 0U, 0U),
+ SIMD_TUPLE(v64_ziplo_16, 0U, 0U), SIMD_TUPLE(v64_ziphi_16, 0U, 0U),
+ SIMD_TUPLE(v64_ziplo_32, 0U, 0U), SIMD_TUPLE(v64_ziphi_32, 0U, 0U),
+ SIMD_TUPLE(v64_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v64_pack_s16_u8, 0U, 0U),
+ SIMD_TUPLE(v64_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v64_unziphi_8, 0U, 0U),
+ SIMD_TUPLE(v64_unziplo_8, 0U, 0U), SIMD_TUPLE(v64_unziphi_16, 0U, 0U),
+ SIMD_TUPLE(v64_unziplo_16, 0U, 0U), SIMD_TUPLE(v64_or, 0U, 0U),
+ SIMD_TUPLE(v64_xor, 0U, 0U), SIMD_TUPLE(v64_and, 0U, 0U),
+ SIMD_TUPLE(v64_andn, 0U, 0U), SIMD_TUPLE(v64_mullo_s16, 0U, 0U),
+ SIMD_TUPLE(v64_mulhi_s16, 0U, 0U), SIMD_TUPLE(v64_mullo_s32, 0U, 0U),
+ SIMD_TUPLE(v64_madd_s16, 0U, 0U), SIMD_TUPLE(v64_madd_us8, 0U, 0U),
+ SIMD_TUPLE(v64_avg_u8, 0U, 0U), SIMD_TUPLE(v64_rdavg_u8, 0U, 0U),
+ SIMD_TUPLE(v64_avg_u16, 0U, 0U), SIMD_TUPLE(v64_min_u8, 0U, 0U),
+ SIMD_TUPLE(v64_max_u8, 0U, 0U), SIMD_TUPLE(v64_min_s8, 0U, 0U),
+ SIMD_TUPLE(v64_max_s8, 0U, 0U), SIMD_TUPLE(v64_min_s16, 0U, 0U),
+ SIMD_TUPLE(v64_max_s16, 0U, 0U), SIMD_TUPLE(v64_cmpgt_s8, 0U, 0U),
+ SIMD_TUPLE(v64_cmplt_s8, 0U, 0U), SIMD_TUPLE(v64_cmpeq_8, 0U, 0U),
+ SIMD_TUPLE(v64_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v64_cmplt_s16, 0U, 0U),
+ SIMD_TUPLE(v64_cmpeq_16, 0U, 0U));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V64_V64V64_Part2), SIMD_TUPLE(v64_shuffle_8, 7U, 8U),
+ SIMD_TUPLE(v64_pack_s32_u16, 0U, 0U), SIMD_TUPLE(v64_rdavg_u16, 0U, 0U),
+ SIMD_TUPLE(v64_sadd_s8, 0U, 0U), SIMD_TUPLE(v64_sadd_u8, 0U, 0U),
+ SIMD_TUPLE(imm_v64_align<1>, 0U, 0U), SIMD_TUPLE(imm_v64_align<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_align<3>, 0U, 0U), SIMD_TUPLE(imm_v64_align<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_align<5>, 0U, 0U), SIMD_TUPLE(imm_v64_align<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_align<7>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s8, 0U, 0U),
+ SIMD_TUPLE(v64_abs_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpacklo_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpackhi_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpacklo_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpackhi_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v64_unpacklo_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v64_unpacklo_s16_s32, 0U, 0U),
+ SIMD_TUPLE(v64_unpackhi_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v64_unpackhi_s16_s32, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<8>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64_Part2),
+ SIMD_TUPLE(imm_v64_shl_n_16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shl_n_32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_u32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v64_shr_n_s32<28>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64U32), SIMD_TUPLE(v64_shl_8, 7U, 32U),
+ SIMD_TUPLE(v64_shr_u8, 7U, 32U), SIMD_TUPLE(v64_shr_s8, 7U, 32U),
+ SIMD_TUPLE(v64_shl_16, 15U, 32U), SIMD_TUPLE(v64_shr_u16, 15U, 32U),
+ SIMD_TUPLE(v64_shr_s16, 15U, 32U), SIMD_TUPLE(v64_shl_32, 31U, 32U),
+ SIMD_TUPLE(v64_shr_u32, 31U, 32U),
+ SIMD_TUPLE(v64_shr_s32, 31U, 32U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V64), SIMD_TUPLE(v64_hadd_u8, 0U, 0U),
+ SIMD_TUPLE(v64_u64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V64), SIMD_TUPLE(v64_hadd_s16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64), SIMD_TUPLE(v64_low_u32, 0U, 0U),
+ SIMD_TUPLE(v64_high_u32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S32_V64), SIMD_TUPLE(v64_low_s32, 0U, 0U),
+ SIMD_TUPLE(v64_high_s32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V64V64), SIMD_TUPLE(v64_dotp_s16, 0U, 0U),
+ SIMD_TUPLE(v64_dotp_su8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U8), SIMD_TUPLE(v64_dup_8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U16), SIMD_TUPLE(v64_dup_16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32), SIMD_TUPLE(v64_dup_32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32U32), SIMD_TUPLE(v64_from_32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128V128), SIMD_TUPLE(v128_sad_u8, 0U, 0U),
+ SIMD_TUPLE(v128_ssd_u8, 0U, 0U), SIMD_TUPLE(v128_sad_u16, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128V128), SIMD_TUPLE(v128_ssd_s16, 0U, 0U));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V128_V128V128), SIMD_TUPLE(v128_add_8, 0U, 0U),
+ SIMD_TUPLE(v128_add_16, 0U, 0U), SIMD_TUPLE(v128_sadd_s16, 0U, 0U),
+ SIMD_TUPLE(v128_add_32, 0U, 0U), SIMD_TUPLE(v128_sub_8, 0U, 0U),
+ SIMD_TUPLE(v128_ssub_u8, 0U, 0U), SIMD_TUPLE(v128_ssub_s8, 0U, 0U),
+ SIMD_TUPLE(v128_sub_16, 0U, 0U), SIMD_TUPLE(v128_ssub_s16, 0U, 0U),
+ SIMD_TUPLE(v128_ssub_u16, 0U, 0U), SIMD_TUPLE(v128_sub_32, 0U, 0U),
+ SIMD_TUPLE(v128_ziplo_8, 0U, 0U), SIMD_TUPLE(v128_ziphi_8, 0U, 0U),
+ SIMD_TUPLE(v128_ziplo_16, 0U, 0U), SIMD_TUPLE(v128_ziphi_16, 0U, 0U),
+ SIMD_TUPLE(v128_ziplo_32, 0U, 0U), SIMD_TUPLE(v128_ziphi_32, 0U, 0U),
+ SIMD_TUPLE(v128_ziplo_64, 0U, 0U), SIMD_TUPLE(v128_ziphi_64, 0U, 0U),
+ SIMD_TUPLE(v128_unziphi_8, 0U, 0U), SIMD_TUPLE(v128_unziplo_8, 0U, 0U),
+ SIMD_TUPLE(v128_unziphi_16, 0U, 0U), SIMD_TUPLE(v128_unziplo_16, 0U, 0U),
+ SIMD_TUPLE(v128_unziphi_32, 0U, 0U), SIMD_TUPLE(v128_unziplo_32, 0U, 0U),
+ SIMD_TUPLE(v128_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v128_pack_s16_u8, 0U, 0U),
+ SIMD_TUPLE(v128_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v128_or, 0U, 0U),
+ SIMD_TUPLE(v128_xor, 0U, 0U), SIMD_TUPLE(v128_and, 0U, 0U),
+ SIMD_TUPLE(v128_andn, 0U, 0U), SIMD_TUPLE(v128_mullo_s16, 0U, 0U),
+ SIMD_TUPLE(v128_mulhi_s16, 0U, 0U), SIMD_TUPLE(v128_mullo_s32, 0U, 0U),
+ SIMD_TUPLE(v128_madd_s16, 0U, 0U), SIMD_TUPLE(v128_madd_us8, 0U, 0U),
+ SIMD_TUPLE(v128_avg_u8, 0U, 0U), SIMD_TUPLE(v128_rdavg_u8, 0U, 0U),
+ SIMD_TUPLE(v128_avg_u16, 0U, 0U), SIMD_TUPLE(v128_min_u8, 0U, 0U),
+ SIMD_TUPLE(v128_max_u8, 0U, 0U), SIMD_TUPLE(v128_min_s8, 0U, 0U),
+ SIMD_TUPLE(v128_max_s8, 0U, 0U), SIMD_TUPLE(v128_min_s16, 0U, 0U),
+ SIMD_TUPLE(v128_max_s16, 0U, 0U), SIMD_TUPLE(v128_cmpgt_s8, 0U, 0U),
+ SIMD_TUPLE(v128_cmplt_s8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_8, 0U, 0U),
+ SIMD_TUPLE(v128_cmpgt_s16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2),
+ SIMD_TUPLE(v128_pack_s32_u16, 0U, 0U),
+ SIMD_TUPLE(v128_rdavg_u16, 0U, 0U), SIMD_TUPLE(v128_add_64, 0U, 0U),
+ SIMD_TUPLE(v128_sub_64, 0U, 0U), SIMD_TUPLE(v128_sadd_s8, 0U, 0U),
+ SIMD_TUPLE(v128_sadd_u8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_16, 0U, 0U),
+ SIMD_TUPLE(v128_cmplt_s16, 0U, 0U),
+ SIMD_TUPLE(v128_cmplt_s32, 0U, 0U),
+ SIMD_TUPLE(v128_cmpeq_32, 0U, 0U),
+ SIMD_TUPLE(v128_cmpgt_s32, 0U, 0U),
+ SIMD_TUPLE(v128_shuffle_8, 15U, 8U),
+ SIMD_TUPLE(v128_min_s32, 0U, 0U), SIMD_TUPLE(v128_max_s32, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_align<15>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128V128),
+ SIMD_TUPLE(v128_blend_8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U),
+ SIMD_TUPLE(v128_abs_s16, 0U, 0U), SIMD_TUPLE(v128_padd_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpacklo_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpacklo_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpacklo_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v128_unpacklo_s16_s32, 0U, 0U),
+ SIMD_TUPLE(v128_unpackhi_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpackhi_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpackhi_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v128_unpackhi_s16_s32, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_byte<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_byte<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<1>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part2),
+ SIMD_TUPLE(imm_v128_shr_n_u8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<4>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part3),
+ SIMD_TUPLE(imm_v128_shr_n_u32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s32<28>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part4),
+ SIMD_TUPLE(imm_v128_shl_n_64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shl_n_64<60>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_u64<60>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v128_shr_n_s64<60>, 0U, 0U),
+ SIMD_TUPLE(v128_padd_u8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64V64), SIMD_TUPLE(v128_from_v64, 0U, 0U),
+ SIMD_TUPLE(v128_zip_8, 0U, 0U), SIMD_TUPLE(v128_zip_16, 0U, 0U),
+ SIMD_TUPLE(v128_zip_32, 0U, 0U), SIMD_TUPLE(v128_mul_s16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64U64), SIMD_TUPLE(v128_from_64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64),
+ SIMD_TUPLE(v128_unpack_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpack_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v128_unpack_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v128_unpack_s16_s32, 0U, 0U));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V128_V128U32), SIMD_TUPLE(v128_shl_8, 7U, 32U),
+ SIMD_TUPLE(v128_shr_u8, 7U, 32U), SIMD_TUPLE(v128_shr_s8, 7U, 32U),
+ SIMD_TUPLE(v128_shl_16, 15U, 32U), SIMD_TUPLE(v128_shr_u16, 15U, 32U),
+ SIMD_TUPLE(v128_shr_s16, 15U, 32U), SIMD_TUPLE(v128_shl_32, 31U, 32U),
+ SIMD_TUPLE(v128_shr_u32, 31U, 32U), SIMD_TUPLE(v128_shr_s32, 31U, 32U),
+ SIMD_TUPLE(v128_shl_64, 63U, 32U), SIMD_TUPLE(v128_shr_u64, 63U, 32U),
+ SIMD_TUPLE(v128_shr_s64, 63U, 32U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128), SIMD_TUPLE(v128_low_u32, 0U, 0U),
+ SIMD_TUPLE(v128_movemask_8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128), SIMD_TUPLE(v128_hadd_u8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V128), SIMD_TUPLE(v128_low_v64, 0U, 0U),
+ SIMD_TUPLE(v128_high_v64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U8), SIMD_TUPLE(v128_dup_8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U16), SIMD_TUPLE(v128_dup_16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U32), SIMD_TUPLE(v128_dup_32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64), SIMD_TUPLE(v128_dup_64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V128V128), SIMD_TUPLE(v128_dotp_s16, 0U, 0U),
+ SIMD_TUPLE(v128_dotp_s32, 0U, 0U),
+ SIMD_TUPLE(v128_dotp_su8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256V256), SIMD_TUPLE(v256_sad_u8, 0U, 0U),
+ SIMD_TUPLE(v256_ssd_u8, 0U, 0U), SIMD_TUPLE(v256_sad_u16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256), SIMD_TUPLE(v256_hadd_u8, 0U, 0U),
+ SIMD_TUPLE(v256_low_u64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V256V256), SIMD_TUPLE(v256_dotp_s16, 0U, 0U),
+ SIMD_TUPLE(v256_dotp_s32, 0U, 0U),
+ SIMD_TUPLE(v256_dotp_su8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256V256), SIMD_TUPLE(v256_ssd_s16, 0U, 0U));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V256_V256V256), SIMD_TUPLE(v256_add_8, 0U, 0U),
+ SIMD_TUPLE(v256_add_16, 0U, 0U), SIMD_TUPLE(v256_sadd_s16, 0U, 0U),
+ SIMD_TUPLE(v256_add_32, 0U, 0U), SIMD_TUPLE(v256_sub_8, 0U, 0U),
+ SIMD_TUPLE(v256_ssub_u8, 0U, 0U), SIMD_TUPLE(v256_ssub_s8, 0U, 0U),
+ SIMD_TUPLE(v256_sub_16, 0U, 0U), SIMD_TUPLE(v256_ssub_s16, 0U, 0U),
+ SIMD_TUPLE(v256_ssub_u16, 0U, 0U), SIMD_TUPLE(v256_sub_32, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_8, 0U, 0U), SIMD_TUPLE(v256_ziphi_8, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_16, 0U, 0U), SIMD_TUPLE(v256_ziphi_16, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_32, 0U, 0U), SIMD_TUPLE(v256_ziphi_32, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_64, 0U, 0U), SIMD_TUPLE(v256_ziphi_64, 0U, 0U),
+ SIMD_TUPLE(v256_ziplo_128, 0U, 0U), SIMD_TUPLE(v256_ziphi_128, 0U, 0U),
+ SIMD_TUPLE(v256_unziphi_8, 0U, 0U), SIMD_TUPLE(v256_unziplo_8, 0U, 0U),
+ SIMD_TUPLE(v256_unziphi_16, 0U, 0U), SIMD_TUPLE(v256_unziplo_16, 0U, 0U),
+ SIMD_TUPLE(v256_unziphi_32, 0U, 0U), SIMD_TUPLE(v256_unziplo_32, 0U, 0U),
+ SIMD_TUPLE(v256_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v256_pack_s16_u8, 0U, 0U),
+ SIMD_TUPLE(v256_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v256_or, 0U, 0U),
+ SIMD_TUPLE(v256_xor, 0U, 0U), SIMD_TUPLE(v256_and, 0U, 0U),
+ SIMD_TUPLE(v256_andn, 0U, 0U), SIMD_TUPLE(v256_mullo_s16, 0U, 0U),
+ SIMD_TUPLE(v256_mulhi_s16, 0U, 0U), SIMD_TUPLE(v256_mullo_s32, 0U, 0U),
+ SIMD_TUPLE(v256_madd_s16, 0U, 0U), SIMD_TUPLE(v256_madd_us8, 0U, 0U),
+ SIMD_TUPLE(v256_avg_u8, 0U, 0U), SIMD_TUPLE(v256_rdavg_u8, 0U, 0U),
+ SIMD_TUPLE(v256_avg_u16, 0U, 0U), SIMD_TUPLE(v256_min_u8, 0U, 0U),
+ SIMD_TUPLE(v256_max_u8, 0U, 0U), SIMD_TUPLE(v256_min_s8, 0U, 0U),
+ SIMD_TUPLE(v256_max_s8, 0U, 0U), SIMD_TUPLE(v256_min_s16, 0U, 0U),
+ SIMD_TUPLE(v256_max_s16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s8, 0U, 0U),
+ SIMD_TUPLE(v256_cmplt_s8, 0U, 0U));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V256_V256V256_Part2), SIMD_TUPLE(v256_cmpeq_8, 0U, 0U),
+ SIMD_TUPLE(v256_min_s32, 0U, 0U), SIMD_TUPLE(v256_max_s32, 0U, 0U),
+ SIMD_TUPLE(v256_add_64, 0U, 0U), SIMD_TUPLE(v256_sub_64, 0U, 0U),
+ SIMD_TUPLE(v256_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v256_cmplt_s16, 0U, 0U),
+ SIMD_TUPLE(v256_cmpeq_16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s32, 0U, 0U),
+ SIMD_TUPLE(v256_cmplt_s32, 0U, 0U), SIMD_TUPLE(v256_cmpeq_32, 0U, 0U),
+ SIMD_TUPLE(v256_shuffle_8, 31U, 8U), SIMD_TUPLE(v256_pshuffle_8, 15U, 8U),
+ SIMD_TUPLE(imm_v256_align<1>, 0U, 0U), SIMD_TUPLE(v256_sadd_s8, 0U, 0U),
+ SIMD_TUPLE(v256_sadd_u8, 0U, 0U), SIMD_TUPLE(v256_pack_s32_u16, 0U, 0U),
+ SIMD_TUPLE(v256_rdavg_u16, 0U, 0U), SIMD_TUPLE(imm_v256_align<2>, 0U, 0U),
+ SIMD_TUPLE(v256_unziphi_64, 0U, 0U), SIMD_TUPLE(v256_unziplo_64, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<17>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<18>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<19>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<21>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<22>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<23>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<25>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<26>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<27>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<29>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<30>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_align<31>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128V128),
+ SIMD_TUPLE(v256_from_v128, 0U, 0U), SIMD_TUPLE(v256_zip_8, 0U, 0U),
+ SIMD_TUPLE(v256_zip_16, 0U, 0U), SIMD_TUPLE(v256_zip_32, 0U, 0U),
+ SIMD_TUPLE(v256_mul_s16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128),
+ SIMD_TUPLE(v256_unpack_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpack_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpack_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v256_unpack_s16_s32, 0U, 0U));
+
+INSTANTIATE(
+ ARCH, ARCH_POSTFIX(V256_V256U32), SIMD_TUPLE(v256_shl_8, 7U, 32U),
+ SIMD_TUPLE(v256_shr_u8, 7U, 32U), SIMD_TUPLE(v256_shr_s8, 7U, 32U),
+ SIMD_TUPLE(v256_shl_16, 15U, 32U), SIMD_TUPLE(v256_shr_u16, 15U, 32U),
+ SIMD_TUPLE(v256_shr_s16, 15U, 32U), SIMD_TUPLE(v256_shl_32, 31U, 32U),
+ SIMD_TUPLE(v256_shr_u32, 31U, 32U), SIMD_TUPLE(v256_shr_s32, 31U, 32U),
+ SIMD_TUPLE(v256_shl_64, 63U, 32U), SIMD_TUPLE(v256_shr_u64, 63U, 32U),
+ SIMD_TUPLE(v256_shr_s64, 63U, 32U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256), SIMD_TUPLE(v256_abs_s8, 0U, 0U),
+ SIMD_TUPLE(v256_abs_s16, 0U, 0U), SIMD_TUPLE(v256_padd_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpacklo_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpacklo_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpacklo_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v256_unpacklo_s16_s32, 0U, 0U),
+ SIMD_TUPLE(v256_unpackhi_u8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpackhi_s8_s16, 0U, 0U),
+ SIMD_TUPLE(v256_unpackhi_u16_s32, 0U, 0U),
+ SIMD_TUPLE(v256_unpackhi_s16_s32, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<17>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<18>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<19>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<21>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<22>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<23>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<25>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<26>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<27>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<29>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<30>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_byte<31>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<8>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part2),
+ SIMD_TUPLE(imm_v256_shl_n_byte<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<17>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<18>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<19>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<21>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<22>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<23>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<25>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<26>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<27>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<29>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<30>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_byte<31>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s8<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<10>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part3),
+ SIMD_TUPLE(imm_v256_shl_n_16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s16<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u32<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s32<28>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part4),
+ SIMD_TUPLE(imm_v256_shl_n_64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_64<60>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_u64<60>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<16>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<20>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<24>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<28>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<32>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<36>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<40>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<44>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<48>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<52>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<56>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_s64<60>, 0U, 0U),
+ SIMD_TUPLE(v256_padd_u8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part5),
+ SIMD_TUPLE(imm_v256_shr_n_word<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shr_n_word<15>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<1>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<2>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<3>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<4>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<5>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<6>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<7>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<8>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<9>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<10>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<11>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<12>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<13>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<14>, 0U, 0U),
+ SIMD_TUPLE(imm_v256_shl_n_word<15>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256V256V256),
+ SIMD_TUPLE(v256_blend_8, 0U, 0U),
+ SIMD_TUPLE(v256_wideshuffle_8, 63U, 8U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U8), SIMD_TUPLE(v256_dup_8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U16), SIMD_TUPLE(v256_dup_16, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U32), SIMD_TUPLE(v256_dup_32, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U64), SIMD_TUPLE(v256_dup_64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256), SIMD_TUPLE(v256_low_u32, 0U, 0U),
+ SIMD_TUPLE(v256_movemask_8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V256), SIMD_TUPLE(v256_low_v64, 0U, 0U));
+
+} // namespace SIMD_NAMESPACE
diff --git a/third_party/aom/test/simd_neon_test.cc b/third_party/aom/test/simd_neon_test.cc
new file mode 100644
index 000000000..b67b18895
--- /dev/null
+++ b/third_party/aom/test/simd_neon_test.cc
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if defined(__OPTIMIZE__) && __OPTIMIZE__
+#define ARCH NEON
+#define ARCH_POSTFIX(name) name##_neon
+#define SIMD_NAMESPACE simd_test_neon
+#include "test/simd_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_sse2_test.cc b/third_party/aom/test/simd_sse2_test.cc
new file mode 100644
index 000000000..b37a931b3
--- /dev/null
+++ b/third_party/aom/test/simd_sse2_test.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSE2
+#define ARCH_POSTFIX(name) name##_sse2
+#define SIMD_NAMESPACE simd_test_sse2
+#include "test/simd_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_sse4_test.cc b/third_party/aom/test/simd_sse4_test.cc
new file mode 100644
index 000000000..b1c9d5cd8
--- /dev/null
+++ b/third_party/aom/test/simd_sse4_test.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSE4_1
+#define ARCH_POSTFIX(name) name##_sse4_1
+#define SIMD_NAMESPACE simd_test_sse4_1
+#include "test/simd_impl.h"
+#endif
diff --git a/third_party/aom/test/simd_ssse3_test.cc b/third_party/aom/test/simd_ssse3_test.cc
new file mode 100644
index 000000000..d95c26fb5
--- /dev/null
+++ b/third_party/aom/test/simd_ssse3_test.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+ (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSSE3
+#define ARCH_POSTFIX(name) name##_ssse3
+#define SIMD_NAMESPACE simd_test_ssse3
+#include "test/simd_impl.h"
+#endif
diff --git a/third_party/aom/test/simple_decoder.sh b/third_party/aom/test/simple_decoder.sh
new file mode 100755
index 000000000..5f39ad206
--- /dev/null
+++ b/third_party/aom/test/simple_decoder.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom simple_decoder example code. To add new tests to
+## this file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to simple_decoder_tests (on a new line).
+##
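+## As a rough, commented-out sketch only (the test name below is hypothetical
+## and not part of this suite), a new test added via steps 1 and 2 could look
+## like:
+##
+##   my_new_simple_decoder_test() {
+##     if [ "$(av1_decode_available)" = "yes" ]; then
+##       simple_decoder "${AV1_IVF_FILE}" av1 || return 1
+##     fi
+##   }
+##
+##   simple_decoder_tests="simple_decoder_av1
+##                         my_new_simple_decoder_test"
+##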
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available:
+simple_decoder_verify_environment() {
+ if [ ! "$(av1_encode_available)" = "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then
+ return 1
+ fi
+}
+
+# Runs simple_decoder using $1 as the input file. $2 is the codec name, and is
+# used solely to name the output file.
+simple_decoder() {
+ local decoder="$(aom_tool_path simple_decoder)"
+ local input_file="$1"
+ local codec="$2"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/simple_decoder_${codec}.raw"
+
+ if [ ! -x "${decoder}" ]; then
+ elog "${decoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \
+ ${devnull}
+
+ [ -e "${output_file}" ] || return 1
+}
+
+simple_decoder_av1() {
+ if [ "$(av1_decode_available)" = "yes" ]; then
+ if [ ! -e "${AV1_IVF_FILE}" ]; then
+ local file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf"
+ encode_yuv_raw_input_av1 "${file}" --ivf
+ simple_decoder "${file}" av1 || return 1
+ else
+ simple_decoder "${AV1_IVF_FILE}" av1 || return 1
+ fi
+ fi
+}
+
+simple_decoder_tests="simple_decoder_av1"
+
+run_tests simple_decoder_verify_environment "${simple_decoder_tests}"
diff --git a/third_party/aom/test/simple_encoder.sh b/third_party/aom/test/simple_encoder.sh
new file mode 100755
index 000000000..5cd6b46a1
--- /dev/null
+++ b/third_party/aom/test/simple_encoder.sh
@@ -0,0 +1,53 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom simple_encoder example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to simple_encoder_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+simple_encoder_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+}
+
+# Runs simple_encoder using the codec specified by $1 with a frame limit of 5.
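+# The trailing "9999 0 5" arguments in the eval below are passed straight
+# through to simple_encoder; they appear to correspond to the keyframe
+# interval, the error-resilient flag and the frame limit expected by
+# examples/simple_encoder.c, though that mapping is an assumption of this
+# comment rather than something this wrapper checks.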
+simple_encoder() {
+ local encoder="${LIBAOM_BIN_PATH}/simple_encoder${AOM_TEST_EXE_SUFFIX}"
+ local codec="$1"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/simple_encoder_${codec}.ivf"
+
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 5 \
+ ${devnull}
+
+ [ -e "${output_file}" ] || return 1
+}
+
+simple_encoder_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ simple_encoder av1 || return 1
+ fi
+}
+
+simple_encoder_tests="simple_encoder_av1"
+
+run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
diff --git a/third_party/aom/test/subtract_test.cc b/third_party/aom/test/subtract_test.cc
new file mode 100644
index 000000000..7dcedf56d
--- /dev/null
+++ b/third_party/aom/test/subtract_test.cc
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/blockd.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride);
+
+namespace {
+
+class AV1SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
+ public:
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+};
+
+using libaom_test::ACMRandom;
+
+TEST_P(AV1SubtractBlockTest, SimpleSubtract) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  // FIXME(rbultje) split into its own file
+ for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+ bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
+ const int block_width = block_size_wide[bsize];
+ const int block_height = block_size_high[bsize];
+ int16_t *diff = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(*diff) * block_width * block_height * 2));
+ uint8_t *pred = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, block_width * block_height * 2));
+ uint8_t *src = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, block_width * block_height * 2));
+
+ for (int n = 0; n < 100; n++) {
+ for (int r = 0; r < block_height; ++r) {
+ for (int c = 0; c < block_width * 2; ++c) {
+ src[r * block_width * 2 + c] = rnd.Rand8();
+ pred[r * block_width * 2 + c] = rnd.Rand8();
+ }
+ }
+
+ GetParam()(block_height, block_width, diff, block_width, src, block_width,
+ pred, block_width);
+
+ for (int r = 0; r < block_height; ++r) {
+ for (int c = 0; c < block_width; ++c) {
+ EXPECT_EQ(diff[r * block_width + c],
+ (src[r * block_width + c] - pred[r * block_width + c]))
+ << "r = " << r << ", c = " << c << ", bs = " << bsize;
+ }
+ }
+
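+      // Run the subtraction again with every stride doubled; the buffers above
+      // were allocated at twice the block area so that this padded layout is
+      // valid, which checks that the stride arguments are honoured.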
+ GetParam()(block_height, block_width, diff, block_width * 2, src,
+ block_width * 2, pred, block_width * 2);
+
+ for (int r = 0; r < block_height; ++r) {
+ for (int c = 0; c < block_width; ++c) {
+ EXPECT_EQ(
+ diff[r * block_width * 2 + c],
+ (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c]))
+ << "r = " << r << ", c = " << c << ", bs = " << bsize;
+ }
+ }
+ }
+ aom_free(diff);
+ aom_free(pred);
+ aom_free(src);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(C, AV1SubtractBlockTest,
+ ::testing::Values(aom_subtract_block_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AV1SubtractBlockTest,
+ ::testing::Values(aom_subtract_block_sse2));
+#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AV1SubtractBlockTest,
+ ::testing::Values(aom_subtract_block_neon));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, AV1SubtractBlockTest,
+ ::testing::Values(aom_subtract_block_msa));
+#endif
+
+typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride, int bd);
+
+using ::testing::get;
+using ::testing::make_tuple;
+using ::testing::tuple;
+
+// <width, height, bit_depth, subtract>
+typedef tuple<int, int, int, HBDSubtractFunc> Params;
+
+class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+ virtual void SetUp() {
+ block_width_ = GET_PARAM(0);
+ block_height_ = GET_PARAM(1);
+ bit_depth_ = static_cast<aom_bit_depth_t>(GET_PARAM(2));
+ func_ = GET_PARAM(3);
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+
+ const size_t max_width = 128;
+ const size_t max_block_size = max_width * max_width;
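+    // High bit-depth pixels live in uint16_t buffers; CONVERT_TO_BYTEPTR hands
+    // them to the subtract functions through their uint8_t* interface, and
+    // TearDown() converts back with CONVERT_TO_SHORTPTR before freeing.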
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint16_t))));
+ pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint16_t))));
+ diff_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, max_block_size * sizeof(int16_t)));
+ }
+
+ virtual void TearDown() {
+ aom_free(CONVERT_TO_SHORTPTR(src_));
+ aom_free(CONVERT_TO_SHORTPTR(pred_));
+ aom_free(diff_);
+ }
+
+ protected:
+ void CheckResult();
+ void RunForSpeed();
+
+ private:
+ ACMRandom rnd_;
+ int block_height_;
+ int block_width_;
+ aom_bit_depth_t bit_depth_;
+ HBDSubtractFunc func_;
+ uint8_t *src_;
+ uint8_t *pred_;
+ int16_t *diff_;
+};
+
+void AV1HBDSubtractBlockTest::CheckResult() {
+ const int test_num = 100;
+ const size_t max_width = 128;
+ const int max_block_size = max_width * max_width;
+ const int mask = (1 << bit_depth_) - 1;
+ int i, j;
+
+ for (i = 0; i < test_num; ++i) {
+ for (j = 0; j < max_block_size; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_, bit_depth_);
+
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+ CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
+ }
+ }
+}
+
+TEST_P(AV1HBDSubtractBlockTest, CheckResult) { CheckResult(); }
+
+void AV1HBDSubtractBlockTest::RunForSpeed() {
+ const int test_num = 200000;
+ const size_t max_width = 128;
+ const int max_block_size = max_width * max_width;
+ const int mask = (1 << bit_depth_) - 1;
+ int i, j;
+
+ for (j = 0; j < max_block_size; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ for (i = 0; i < test_num; ++i) {
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_, bit_depth_);
+ }
+}
+
+TEST_P(AV1HBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
+
+#if HAVE_SSE2
+
+const Params kAV1HBDSubtractBlock_sse2[] = {
+ make_tuple(4, 4, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(4, 4, 12, &aom_highbd_subtract_block_c),
+ make_tuple(4, 8, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(4, 8, 12, &aom_highbd_subtract_block_c),
+ make_tuple(8, 4, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(8, 4, 12, &aom_highbd_subtract_block_c),
+ make_tuple(8, 8, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(8, 8, 12, &aom_highbd_subtract_block_c),
+ make_tuple(8, 16, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(8, 16, 12, &aom_highbd_subtract_block_c),
+ make_tuple(16, 8, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(16, 8, 12, &aom_highbd_subtract_block_c),
+ make_tuple(16, 16, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(16, 16, 12, &aom_highbd_subtract_block_c),
+ make_tuple(16, 32, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(16, 32, 12, &aom_highbd_subtract_block_c),
+ make_tuple(32, 16, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(32, 16, 12, &aom_highbd_subtract_block_c),
+ make_tuple(32, 32, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(32, 32, 12, &aom_highbd_subtract_block_c),
+ make_tuple(32, 64, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(32, 64, 12, &aom_highbd_subtract_block_c),
+ make_tuple(64, 32, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(64, 32, 12, &aom_highbd_subtract_block_c),
+ make_tuple(64, 64, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(64, 64, 12, &aom_highbd_subtract_block_c),
+ make_tuple(64, 128, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(64, 128, 12, &aom_highbd_subtract_block_c),
+ make_tuple(128, 64, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(128, 64, 12, &aom_highbd_subtract_block_c),
+ make_tuple(128, 128, 12, &aom_highbd_subtract_block_sse2),
+ make_tuple(128, 128, 12, &aom_highbd_subtract_block_c)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HBDSubtractBlockTest,
+ ::testing::ValuesIn(kAV1HBDSubtractBlock_sse2));
+#endif // HAVE_SSE2
+} // namespace
diff --git a/third_party/aom/test/sum_squares_test.cc b/third_party/aom/test/sum_squares_test.cc
new file mode 100644
index 000000000..f10998498
--- /dev/null
+++ b/third_party/aom/test/sum_squares_test.cc
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+const int kNumIterations = 10000;
+
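+// Largest magnitude fed to the 1D tests: (1 << 12) - 1, the maximum value of a
+// signed 13-bit integer.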
+static const int16_t kInt13Max = (1 << 12) - 1;
+
+typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int width,
+ int height);
+typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
+
+class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
+ public:
+ virtual ~SumSquaresTest() {}
+ virtual void SetUp() {
+ params_ = this->GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2));
+ ASSERT_TRUE(src_ != NULL);
+ }
+
+ virtual void TearDown() {
+ libaom_test::ClearSystemState();
+ aom_free(src_);
+ }
+ void RunTest(int isRandom);
+ void RunSpeedTest();
+
+ void GenRandomData(int width, int height, int stride) {
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit);
+ }
+ }
+ }
+
+ void GenExtremeData(int width, int height, int stride) {
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+ const int val = rnd_(2) ? limit - 1 : -(limit - 1);
+ for (int ii = 0; ii < height; ii++) {
+ for (int jj = 0; jj < width; jj++) {
+ src_[ii * stride + jj] = val;
+ }
+ }
+ }
+
+ protected:
+ TestFuncs params_;
+ int16_t *src_;
+ ACMRandom rnd_;
+};
+
+void SumSquaresTest::RunTest(int isRandom) {
+ int failed = 0;
+ for (int k = 0; k < kNumIterations; k++) {
+ const int width = 4 * (rnd_(31) + 1); // Up to 128x128
+ const int height = 4 * (rnd_(31) + 1); // Up to 128x128
+ int stride = 4 << rnd_(7); // Up to 256 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(7);
+ }
+ if (isRandom) {
+ GenRandomData(width, height, stride);
+ } else {
+ GenExtremeData(width, height, stride);
+ }
+ const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
+ uint64_t res_tst;
+ ASM_REGISTER_STATE_CHECK(res_tst =
+ params_.tst_func(src_, stride, width, height));
+
+ if (!failed) {
+ failed = res_ref != res_tst;
+ EXPECT_EQ(res_ref, res_tst)
+ << "Error: Sum Squares Test [" << width << "x" << height
+ << "] C output does not match optimized output.";
+ }
+ }
+}
+
+void SumSquaresTest::RunSpeedTest() {
+ for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) {
+ const int width = block_size_wide[block]; // Up to 128x128
+ const int height = block_size_high[block]; // Up to 128x128
+ int stride = 4 << rnd_(7); // Up to 256 stride
+ while (stride < width) { // Make sure it's valid
+ stride = 4 << rnd_(7);
+ }
+ GenExtremeData(width, height, stride);
+ const int num_loops = 1000000000 / (width + height);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ params_.ref_func(src_, stride, width, height);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("SumSquaresTest C %3dx%-3d: %7.2f ns\n", width, height,
+ 1000.0 * elapsed_time / num_loops);
+
+ aom_usec_timer timer1;
+ aom_usec_timer_start(&timer1);
+ for (int i = 0; i < num_loops; ++i)
+ params_.tst_func(src_, stride, width, height);
+ aom_usec_timer_mark(&timer1);
+ const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
+ printf("SumSquaresTest Test %3dx%-3d: %7.2f ns\n", width, height,
+ 1000.0 * elapsed_time1 / num_loops);
+ }
+}
+
+TEST_P(SumSquaresTest, OperationCheck) {
+ RunTest(1); // GenRandomData
+}
+
+TEST_P(SumSquaresTest, ExtremeValues) {
+ RunTest(0); // GenExtremeData
+}
+
+TEST_P(SumSquaresTest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, SumSquaresTest,
+ ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+ &aom_sum_squares_2d_i16_sse2)));
+
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, SumSquaresTest,
+ ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+ &aom_sum_squares_2d_i16_avx2)));
+#endif // HAVE_AVX2
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*F1D)(const int16_t *src, uint32_t N);
+typedef libaom_test::FuncParam<F1D> TestFuncs1D;
+
+class SumSquares1DTest : public FunctionEquivalenceTest<F1D> {
+ protected:
+ static const int kIterations = 1000;
+ static const int kMaxSize = 256;
+};
+
+TEST_P(SumSquares1DTest, RandomValues) {
+ DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i)
+ src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max;
+
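+    // Pick N either from [kMaxSize, kMaxSize * kMaxSize] or from [1, kMaxSize]
+    // so that both short and long input runs are exercised.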
+ const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+ : rng_(kMaxSize) + 1;
+
+ const uint64_t ref_res = params_.ref_func(src, N);
+ uint64_t tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(SumSquares1DTest, ExtremeValues) {
+ DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ if (rng_(2)) {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max;
+ }
+
+ const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+ : rng_(kMaxSize) + 1;
+
+ const uint64_t ref_res = params_.ref_func(src, N);
+ uint64_t tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, SumSquares1DTest,
+ ::testing::Values(TestFuncs1D(
+ aom_sum_squares_i16_c, aom_sum_squares_i16_sse2)));
+
+#endif // HAVE_SSE2
+} // namespace
diff --git a/third_party/aom/test/superframe_test.cc b/third_party/aom/test/superframe_test.cc
new file mode 100644
index 000000000..7be18f72a
--- /dev/null
+++ b/third_party/aom/test/superframe_test.cc
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kTestMode = 0;
+const int kTileCols = 1;
+const int kTileRows = 2;
+
+typedef ::testing::tuple<libaom_test::TestMode, int, int> SuperframeTestParam;
+
+class SuperframeTest
+ : public ::libaom_test::CodecTestWithParam<SuperframeTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ SuperframeTest() : EncoderTest(GET_PARAM(0)), last_sf_pts_(0) {}
+ virtual ~SuperframeTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ const SuperframeTestParam input = GET_PARAM(1);
+ const libaom_test::TestMode mode = ::testing::get<kTestMode>(input);
+ SetMode(mode);
+ sf_count_ = 0;
+ sf_count_max_ = INT_MAX;
+ n_tile_cols_ = ::testing::get<kTileCols>(input);
+ n_tile_rows_ = ::testing::get<kTileRows>(input);
+ }
+
+ virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_CPUUSED, 2);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+ }
+ }
+
+ virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook(
+ const aom_codec_cx_pkt_t *pkt) {
+ if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return pkt;
+
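+    // A superframe index starts and ends with the same marker byte whose top
+    // three bits are 0b110; its low three bits hold the frame count minus one
+    // and bits 3-4 hold the per-frame size-field magnitude minus one, so the
+    // index occupies 2 + mag * (frames - 1) bytes. If such an index is found at
+    // the start of the packet it is stripped below, since this test checks that
+    // the index is optional.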
+ const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf);
+ const uint8_t marker = buffer[0];
+ const int frames = (marker & 0x7) + 1;
+ const int mag = ((marker >> 3) & 3) + 1;
+ const unsigned int index_sz = 2 + mag * (frames - 1);
+ if ((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz &&
+ buffer[index_sz - 1] == marker) {
+      // The frame is a superframe; strip off the index.
+ modified_buf_.resize(pkt->data.frame.sz - index_sz);
+ memcpy(&modified_buf_[0], (uint8_t *)pkt->data.frame.buf + index_sz,
+ pkt->data.frame.sz - index_sz);
+ modified_pkt_ = *pkt;
+ modified_pkt_.data.frame.buf = &modified_buf_[0];
+ modified_pkt_.data.frame.sz -= index_sz;
+
+ sf_count_++;
+ last_sf_pts_ = pkt->data.frame.pts;
+ return &modified_pkt_;
+ }
+
+ // Make sure we do a few frames after the last SF
+ abort_ |=
+ sf_count_ > sf_count_max_ && pkt->data.frame.pts - last_sf_pts_ >= 5;
+ return pkt;
+ }
+
+ int sf_count_;
+ int sf_count_max_;
+ aom_codec_cx_pkt_t modified_pkt_;
+ std::vector<uint8_t> modified_buf_;
+ aom_codec_pts_t last_sf_pts_;
+
+ private:
+ int n_tile_cols_;
+ int n_tile_rows_;
+};
+
+TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
+ sf_count_max_ = 0; // early exit on successful test.
+ cfg_.g_lag_in_frames = 25;
+ cfg_.large_scale_tile = 1;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 40);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // NOTE: The use of BWDREF_FRAME will enable the coding of more non-show
+ // frames besides ALTREF_FRAME.
+ EXPECT_GE(sf_count_, 1);
+}
+
+} // namespace
diff --git a/third_party/aom/test/test-data.sha1 b/third_party/aom/test/test-data.sha1
new file mode 100644
index 000000000..b6ee34701
--- /dev/null
+++ b/third_party/aom/test/test-data.sha1
@@ -0,0 +1,507 @@
+d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
+b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
+26b7f64399b84db4b4c9c915d743ec5c2619d4b9 *invalid-bug-1814.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-1814.ivf.res
+fa06784f23751d8c37be94160fb821e855199af4 *invalid-oss-fuzz-10061.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10061.ivf.res
+c9e06c4c7fb7d69fd635a1f606a5e478d60e99cf *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf
+88e18e61bd2b7457b4c71ebefbdff0029c41cc04 *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res
+91a5bedeb4832c1c2900736cc0f644bb63971bbc *invalid-oss-fuzz-10227.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10227.ivf.res
+c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9463.ivf.res
+f448caf378e250b7eea4fa2d1c3cd7ef4a3211ce *invalid-oss-fuzz-9482.ivf
+b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-9482.ivf.res
+a686989de79af89136f631fd630df639c7861851 *invalid-oss-fuzz-9720.ivf
+d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9720.ivf.res
+a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m
+0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m
+ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m
+c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv
+614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m
+c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m
+b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m
+82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv
+b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m
+4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m
+7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947 *park_joy_90p_8_422.y4m
+bdb7856e6bc93599bdda05c2e773a9f22b6c6d03 *park_joy_90p_8_444.y4m
+81e1f3843748438b8f2e71db484eb22daf72e939 *park_joy_90p_8_440.yuv
+b1f1c3ec79114b9a0651af24ce634afb44a9a419 *rush_hour_444.y4m
+eb438c6540eb429f74404eedfa3228d409c57874 *desktop_640_360_30.yuv
+89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab *kirland_640_480_30.yuv
+33c533192759e5bb4f07abfbac389dc259db4686 *macmarcomoving_640_480_30.yuv
+8bfaab121080821b8f03b23467911e59ec59b8fe *macmarcostationary_640_480_30.yuv
+70894878d916a599842d9ad0dcd24e10c13e5467 *niklas_640_480_30.yuv
+8784b6df2d8cc946195a90ac00540500d2e522e4 *tacomanarrows_640_480_30.yuv
+edd86a1f5e62fd9da9a9d46078247759c2638009 *tacomasmallcameramovement_640_480_30.yuv
+9a70e8b7d14fba9234d0e51dce876635413ce444 *thaloundeskmtg_640_480_30.yuv
+e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv
+717da707afcaa1f692ff1946f291054eb75a4f06 *screendata.y4m
+9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m
+5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m
+36ddab9b99eb7545aa0bf362d6f498212d596516 *vase10x10.yuv
+c2e1ec9936b95254187a359e94aa32a9f3dad1b7 *av1-1-b8-00-quantizer-00.ivf
+26cd2a0321d01d9db5f6dace8b43a40cd5b9d58d *av1-1-b8-00-quantizer-00.ivf.md5
+a56dd02c0258d4afea1ee358a22b54e99e39d5e1 *av1-1-b8-00-quantizer-01.ivf
+b3d24124d81f1fbb26f5eb0036accb54f3ec69b2 *av1-1-b8-00-quantizer-01.ivf.md5
+3466327cb842a91d69839b11ef930a74f086f4c6 *av1-1-b8-00-quantizer-02.ivf
+c111dce946100efeaad34203080eee1d55464df6 *av1-1-b8-00-quantizer-02.ivf.md5
+d3f1f32de5e2c0c19a58bb8ef096108388c6a820 *av1-1-b8-00-quantizer-03.ivf
+6265321b31130545b4454982ca93e412a56845b8 *av1-1-b8-00-quantizer-03.ivf.md5
+f37c393ebe73266a5ec8508a2ca33c586ff28e64 *av1-1-b8-00-quantizer-04.ivf
+c6e979da71aecc593c0abb40135dd304152b00dd *av1-1-b8-00-quantizer-04.ivf.md5
+ac9c5e93cb19942a9be259d0567ec96c54dcdc7c *av1-1-b8-00-quantizer-05.ivf
+49e35a7399568a0e4f015ce323d5a45ea780ca87 *av1-1-b8-00-quantizer-05.ivf.md5
+461142b1b50ae74c6b698d23f5ed3b764eadfb89 *av1-1-b8-00-quantizer-06.ivf
+6477ff260624e0f76c94ac872d1e7d5576af4177 *av1-1-b8-00-quantizer-06.ivf.md5
+7f8113cd13d8faaa06fdbaaa50dc328daf037e6d *av1-1-b8-00-quantizer-07.ivf
+b26795c6cb408487c20737977cd6b77311772bf7 *av1-1-b8-00-quantizer-07.ivf.md5
+4218f7945a172e1fe4f9e77ec35085a394eda9f4 *av1-1-b8-00-quantizer-08.ivf
+ea5d7d501e9a69d805251e4871515d28468d8676 *av1-1-b8-00-quantizer-08.ivf.md5
+837f3bcadfe56cf302db2ebaf9a990446fb35801 *av1-1-b8-00-quantizer-09.ivf
+eede995cdac5fd01a411da2e74e86e8394138be1 *av1-1-b8-00-quantizer-09.ivf.md5
+adc229b3780a4968c18ded1bcbe72e3f04643833 *av1-1-b8-00-quantizer-10.ivf
+0799b7e54e54ee97bf0e8aad2b75509ce59c7097 *av1-1-b8-00-quantizer-10.ivf.md5
+44bac8247160a8d9a0ab19f890fc89cc9298de1d *av1-1-b8-00-quantizer-11.ivf
+cc6b2bf167e114599b242aba574e8c6f1fa2f047 *av1-1-b8-00-quantizer-11.ivf.md5
+ebb3af7dfc15567188bcb617021cdc95ebc560e3 *av1-1-b8-00-quantizer-12.ivf
+b716ae29d56cd0c052dbfa1b5dcf850cd0fa8ca7 *av1-1-b8-00-quantizer-12.ivf.md5
+46159641f981a26fb9c374a5ca41e44f0ce0a9f0 *av1-1-b8-00-quantizer-13.ivf
+c6db1b8b4a74f83e4a0647e053cea0fc00f6abab *av1-1-b8-00-quantizer-13.ivf.md5
+fadc909d18eb640760fbb075f922fb050e715470 *av1-1-b8-00-quantizer-14.ivf
+e36bb6b23273633ba3ef7d28160a7258840a1476 *av1-1-b8-00-quantizer-14.ivf.md5
+8befbd9cc1601dcd36ec6911613855f68e6fd40e *av1-1-b8-00-quantizer-15.ivf
+cfc2334b76fb5e7aa9d8607e89d37cbc7716d62e *av1-1-b8-00-quantizer-15.ivf.md5
+ca42e00ae27c6b7f684fe3d2a787d50d2827cb3f *av1-1-b8-00-quantizer-16.ivf
+f11278218a7c3c73cfaab2332bab55f06cedcc81 *av1-1-b8-00-quantizer-16.ivf.md5
+05270d365bdc067f9446eda3029a6f41571a5229 *av1-1-b8-00-quantizer-17.ivf
+fb6482f35e7ad04bf231ea1806226760abcb3c26 *av1-1-b8-00-quantizer-17.ivf.md5
+617bc72037165efbff478d5a0d342b3c20ffcafd *av1-1-b8-00-quantizer-18.ivf
+1ff68d5424f91322123fe0d58f436b8e49cfa99d *av1-1-b8-00-quantizer-18.ivf.md5
+821c3b1ae6054c7a91b2f64428806e57f1157ca6 *av1-1-b8-00-quantizer-19.ivf
+f2fd118e786697553d6987f786660a2bb9f00680 *av1-1-b8-00-quantizer-19.ivf.md5
+48bcf17c27d9a4eb73632a68c09f42eff9f9af99 *av1-1-b8-00-quantizer-20.ivf
+64d55e4c858414bc2837c9c3e2d5fb6d2208c4b8 *av1-1-b8-00-quantizer-20.ivf.md5
+d61ecdd4f0950bc5c8bae1270b22e711bdd22763 *av1-1-b8-00-quantizer-21.ivf
+9d447938596096704fd5f4d41bcdf6fabf9cdfb9 *av1-1-b8-00-quantizer-21.ivf.md5
+59b4b65d8e56ccdd1bddff26a03e991a63409334 *av1-1-b8-00-quantizer-22.ivf
+aa1be0c7c7622d612af85f9bf96a212f6fe5ab56 *av1-1-b8-00-quantizer-22.ivf.md5
+95ed96988eb9916cad956db9b929718769de49f1 *av1-1-b8-00-quantizer-23.ivf
+596b8a3aea468996d609624367465c412751f52b *av1-1-b8-00-quantizer-23.ivf.md5
+e6c2dc4ce725003152797b3d7b34d7eb34da50c8 *av1-1-b8-00-quantizer-24.ivf
+1cd3d7e8b3813a9e5591b94eaeb72d471780e64a *av1-1-b8-00-quantizer-24.ivf.md5
+6734e353008824e523939d1a18daa3f2ab2d8ec6 *av1-1-b8-00-quantizer-25.ivf
+c45cf440a05802c1f9e29472175ed397d130d988 *av1-1-b8-00-quantizer-25.ivf.md5
+3372b1c69fb39811156adcea4f6dba802c0918c2 *av1-1-b8-00-quantizer-26.ivf
+b1751d55bb3fb788751fe28fb7434bee153bda68 *av1-1-b8-00-quantizer-26.ivf.md5
+e7ddb19a6e2a798d6a4e7dfdfc10b4df777b60e3 *av1-1-b8-00-quantizer-27.ivf
+0e19d6b79cd71de69d03e0455349568af979b170 *av1-1-b8-00-quantizer-27.ivf.md5
+7f1c90a35543d6b673e353b3702baf3aa1caeaa7 *av1-1-b8-00-quantizer-28.ivf
+d9a4f9cb88103249a05a7e6aa616bf0c16bf9c95 *av1-1-b8-00-quantizer-28.ivf.md5
+28d741b923011c7fcc50a7318256a638d3110a07 *av1-1-b8-00-quantizer-29.ivf
+c68cacf2b2ff2694945a99ad836dcf1ee3961c09 *av1-1-b8-00-quantizer-29.ivf.md5
+9a5d9ea4bc76dd40d04e92f33f45e9c2e120e85d *av1-1-b8-00-quantizer-30.ivf
+eb02bb8c16c4c0368ddff83e05e516e84ec9eaf3 *av1-1-b8-00-quantizer-30.ivf.md5
+20193c372f44f522e094c2c05fc7e4aaa0717fa8 *av1-1-b8-00-quantizer-31.ivf
+a4c1a4ac332f4911f0d5abbd826ebecfb8432d6c *av1-1-b8-00-quantizer-31.ivf.md5
+9617bbd691f093d259dbc8a642a57a153c1fc00c *av1-1-b8-00-quantizer-32.ivf
+73d60a348454b126ea6368ea604954bc23f210ae *av1-1-b8-00-quantizer-32.ivf.md5
+d9aea9d72a686c59b60584d827f60ca1ee8eee26 *av1-1-b8-00-quantizer-33.ivf
+fbf64de376a63d2d3051da83b0e4e56579b55c0a *av1-1-b8-00-quantizer-33.ivf.md5
+791aaf067f125e5cf4a247cf06a2e29ab071ec90 *av1-1-b8-00-quantizer-34.ivf
+8e2e6efe4c069e54844da19125c4280b95990c69 *av1-1-b8-00-quantizer-34.ivf.md5
+01ba67bba5cbf7c94c65da8f4c9bd6e7db24cf3a *av1-1-b8-00-quantizer-35.ivf
+0c5e60704a4a6bd27e67b6fd72ca7d2cf7fff50f *av1-1-b8-00-quantizer-35.ivf.md5
+3e255b4a320c9522dcec539fef770b6920b9a102 *av1-1-b8-00-quantizer-36.ivf
+1241aab865fd7b4bae73736cbeec1866ea9c90ec *av1-1-b8-00-quantizer-36.ivf.md5
+44fa6fca109747d8f43f6c6aa46d782e5d476d54 *av1-1-b8-00-quantizer-37.ivf
+947f0f887c5ac9149cf85e8114a709d6f410fc32 *av1-1-b8-00-quantizer-37.ivf.md5
+8319ac1ddd6ce3279da5780175dff7a3a5fa1054 *av1-1-b8-00-quantizer-38.ivf
+5f571b7f88678eab9e54f162cc9898f14e437770 *av1-1-b8-00-quantizer-38.ivf.md5
+5975e7056e17608593a8c40619b68e6576d373d9 *av1-1-b8-00-quantizer-39.ivf
+7c870192d6eb70ce5367147a3d2c6a52e11f7bec *av1-1-b8-00-quantizer-39.ivf.md5
+47da942f1e455f1422fc65f06dd57304541d16ac *av1-1-b8-00-quantizer-40.ivf
+6ea7116c9ce3a1641c7060bab2f5e06fd0910d61 *av1-1-b8-00-quantizer-40.ivf.md5
+ab35c15dfde21c2572b14e04dbfd5fac1adae449 *av1-1-b8-00-quantizer-41.ivf
+19596f9849653b913186b9d6b7072984ede96177 *av1-1-b8-00-quantizer-41.ivf.md5
+23a5fa6c3d0eaffaf13f6402465f5dd33d8ea7f1 *av1-1-b8-00-quantizer-42.ivf
+5a2726f0d1b1799d4f70883f1bfe5c9d976c6cf5 *av1-1-b8-00-quantizer-42.ivf.md5
+86cddfc463d2b186ec5a1aa25c4562c05201e3c3 *av1-1-b8-00-quantizer-43.ivf
+674c64ec8487ee774ad09350380fa6ac43815807 *av1-1-b8-00-quantizer-43.ivf.md5
+6894c154eb56c4f3fe44d54fc4f9af468b03d175 *av1-1-b8-00-quantizer-44.ivf
+eca679a2781eb894d18b3d578e3aaf4f48019a15 *av1-1-b8-00-quantizer-44.ivf.md5
+0960bf018ada4224b8344519cf091850d50a57bd *av1-1-b8-00-quantizer-45.ivf
+291bb43b9e1ab167040b51019daf1ccf94fd1e50 *av1-1-b8-00-quantizer-45.ivf.md5
+ea644a4732f1a2534332802c2fa5073344f3c356 *av1-1-b8-00-quantizer-46.ivf
+4c7915382b1d6d08709c95525b04ab8830f20ca1 *av1-1-b8-00-quantizer-46.ivf.md5
+d1f8832d33234e2c74a2280090850153ea24ea82 *av1-1-b8-00-quantizer-47.ivf
+90eb9959e612602934dcc512fe6f54abf0c88d9c *av1-1-b8-00-quantizer-47.ivf.md5
+69c93f760e8b666eb5b98f510e09d90f9230ac9b *av1-1-b8-00-quantizer-48.ivf
+931f869e14bd455de9dac2101b383c29e7d6f04c *av1-1-b8-00-quantizer-48.ivf.md5
+8b660c577d95c031d6711c1134b8d115097f8d7e *av1-1-b8-00-quantizer-49.ivf
+0e3fe8b49d497050dc1a0eac5f3ad60f5fe068fe *av1-1-b8-00-quantizer-49.ivf.md5
+d40bb21448a6da0fc9b88cbcf76d2f4226573acb *av1-1-b8-00-quantizer-50.ivf
+bcd2a9c9a021ba44fc5dc74ae02194fe49ca76a4 *av1-1-b8-00-quantizer-50.ivf.md5
+3b5a1d464aa89b0f1a6ad4f5a03602292b826172 *av1-1-b8-00-quantizer-51.ivf
+49bcde0c56cf8b7fbe429336981be22d39025b74 *av1-1-b8-00-quantizer-51.ivf.md5
+38970a02fb38ddb4954fe4240164cb75de5fc744 *av1-1-b8-00-quantizer-52.ivf
+fd02b034d79d4be150efb02bd4349edfd0e41311 *av1-1-b8-00-quantizer-52.ivf.md5
+2fde7a7cf3014d5196d011c47de4a144227ed122 *av1-1-b8-00-quantizer-53.ivf
+0cb66e6d8fbb29962a69ae1703e22da50db2c92b *av1-1-b8-00-quantizer-53.ivf.md5
+89a69e9b9a601e40cb491ac3a1d32491f2468ac8 *av1-1-b8-00-quantizer-54.ivf
+2f8af51acc73c99b5af81db2bdd1883b611ad311 *av1-1-b8-00-quantizer-54.ivf.md5
+31ee4f56fcb0043e95fff7af49e4ef82aafa5543 *av1-1-b8-00-quantizer-55.ivf
+04a7104e02bdd0fa38c118202dbbecdbd11ace02 *av1-1-b8-00-quantizer-55.ivf.md5
+f262f0b234006a2652fceb77b1a8711aa53abb54 *av1-1-b8-00-quantizer-56.ivf
+bdd54dc25bc5a147c76163af0bced45c56435d79 *av1-1-b8-00-quantizer-56.ivf.md5
+1ef00617091db4b2b839de623bd6b4fb0b2f5f83 *av1-1-b8-00-quantizer-57.ivf
+714c65363a87ed5e6e4ad75c79ddb6af57d41fd9 *av1-1-b8-00-quantizer-57.ivf.md5
+43c9b02feccbb3c709d96015f126b7e3d4c24c64 *av1-1-b8-00-quantizer-58.ivf
+bae22b8d6377862bff8219470c0d87205d186a68 *av1-1-b8-00-quantizer-58.ivf.md5
+ca5f780abe4c02e48cceb9c804f3625723c359bf *av1-1-b8-00-quantizer-59.ivf
+c60a20bbf60b0b0a442ef3f7b682979053909d6e *av1-1-b8-00-quantizer-59.ivf.md5
+1f6f047e9f0e1da22fb514370d92c3c7c66dcf89 *av1-1-b8-00-quantizer-60.ivf
+86dc7fa59d363cf1ae4b027a57b119bda893c1c1 *av1-1-b8-00-quantizer-60.ivf.md5
+bcf0c3353568c47a043f2dc34c9abd3fc04eebd4 *av1-1-b8-00-quantizer-61.ivf
+66fc4f729c5915aa19939d1b6e28e5b398e747bb *av1-1-b8-00-quantizer-61.ivf.md5
+ac8d3c54451b52cf557ef435d33e7638088d66df *av1-1-b8-00-quantizer-62.ivf
+b57f4e1276ead626a3662339a86111ae6fda49d2 *av1-1-b8-00-quantizer-62.ivf.md5
+2a8aa33513d8e01ae9410c4bf5fe1e471b775482 *av1-1-b8-00-quantizer-63.ivf
+9f646ec35a168f495e144c64ba7ce9aeb41cd0a2 *av1-1-b8-00-quantizer-63.ivf.md5
+838388fbda4a1d91be81ff62694c3bf13c460d38 *av1-1-b8-01-size-16x16.ivf
+4229c1caf8e25eb3073456fb90ceed206753901e *av1-1-b8-01-size-16x16.ivf.md5
+23f4253bf71e02b2e8ead66da4b3de875e879ef2 *av1-1-b8-01-size-18x16.ivf
+af125644436d4b6897dade68336cedad663b6610 *av1-1-b8-01-size-18x16.ivf.md5
+94e4a75bd93052f79998e9e08e6b5dd73dc27e50 *av1-1-b8-01-size-32x16.ivf
+e7b3fbc5e4b2469838e7ae36512bd3ce0a81040c *av1-1-b8-01-size-32x16.ivf.md5
+f297bde01c05ec5c07ff8118a0280bd36c52b246 *av1-1-b8-01-size-34x16.ivf
+f6bbd94d6063c689de3c7cf94afa2c68b969d12c *av1-1-b8-01-size-34x16.ivf.md5
+1e18bdf68bab7e7282aacc77e423bc7d93d04a8e *av1-1-b8-01-size-64x16.ivf
+de75732fccfb385294b23c17f0f1a57b455edcf7 *av1-1-b8-01-size-64x16.ivf.md5
+26b1f6ae80b161e971468085778cc1ece502b330 *av1-1-b8-01-size-66x16.ivf
+48bd99813557c314d398e6952da78da07c79d416 *av1-1-b8-01-size-66x16.ivf.md5
+ff213ecf31b982a3a7f009c9739f64e066e1ffe9 *av1-1-b8-01-size-16x18.ivf
+86b20a13b1939dc5f678e80491f190d376233d58 *av1-1-b8-01-size-16x18.ivf.md5
+c90bd878c59263a15c6a6f515d1c7e071f141559 *av1-1-b8-01-size-18x18.ivf
+6f659036ffcd3dd380cf970cf1a06f7755e0b2de *av1-1-b8-01-size-18x18.ivf.md5
+e16a1411381b34817a4c0d8e5eeaeb8cddcc9c46 *av1-1-b8-01-size-32x18.ivf
+fdb1c4ec56f5aa690eadbe897340fee86a06ae2f *av1-1-b8-01-size-32x18.ivf.md5
+fac7052b39bd2d0ae107e0e94050226712c770c2 *av1-1-b8-01-size-34x18.ivf
+adb0d5a99228027eaa3b016963df447c9818c447 *av1-1-b8-01-size-34x18.ivf.md5
+b8be5e55d9be42746c2b547d0e26e80b21c9802a *av1-1-b8-01-size-64x18.ivf
+8f8f6da34cdf78c5a6551c637e1afe279cc3884e *av1-1-b8-01-size-64x18.ivf.md5
+9e066bdcc2cd789cdf551bd4c9c85c178887b880 *av1-1-b8-01-size-66x18.ivf
+e8ec6effa936423ae2eec2b60a3160720d2de912 *av1-1-b8-01-size-66x18.ivf.md5
+6ebe45085cdeebc2acd6da5abd542a59312c0ff4 *av1-1-b8-01-size-16x32.ivf
+044695669103dbf158591dce9c649317a177d5f6 *av1-1-b8-01-size-16x32.ivf.md5
+9fabb4f60641b8c7995d1dc451419165d41258ff *av1-1-b8-01-size-18x32.ivf
+7263764680dfec864c3fad5df824ab1973489a14 *av1-1-b8-01-size-18x32.ivf.md5
+3f72841a24a13e601d79cf029aa1fdb02970ce0b *av1-1-b8-01-size-32x32.ivf
+bbe1ae2888d291ec6bc98cd0784937580c554103 *av1-1-b8-01-size-32x32.ivf.md5
+392131a7c7609acd0dba88fee14f1ed042d23ab1 *av1-1-b8-01-size-34x32.ivf
+eea68165ebe9acd28693374bf2266374b9c77786 *av1-1-b8-01-size-34x32.ivf.md5
+78afdd96265811ab9466e906347b57161e5c010d *av1-1-b8-01-size-64x32.ivf
+47b317af582700b67f6e77659db1dfaa26c8cde6 *av1-1-b8-01-size-64x32.ivf.md5
+2b4d01f2c9f23044c0d886482c7073bd4d5d37d1 *av1-1-b8-01-size-66x32.ivf
+3ad5a58a0ee5086af370b22ab2b5b7592a4f33e7 *av1-1-b8-01-size-66x32.ivf.md5
+78ddae04eb8277ae605bd7017ad7ad27bfc82d39 *av1-1-b8-01-size-16x34.ivf
+d0c18e679f1fc51e4f7409831321eed9c4858f6f *av1-1-b8-01-size-16x34.ivf.md5
+38d8ed885f46aead6ec1271d8a5d4aee79b8eb68 *av1-1-b8-01-size-18x34.ivf
+097ddbd69b8f54826a35efeb0b8b07ec198bba6b *av1-1-b8-01-size-18x34.ivf.md5
+91a42720bc2e7ba701f4d97b463a098b6707cdbd *av1-1-b8-01-size-32x34.ivf
+c590d43d37095bd2e8f8d12c9278477419b72d1a *av1-1-b8-01-size-32x34.ivf.md5
+4cc2a437dba56e8878113d9b390b980522542028 *av1-1-b8-01-size-34x34.ivf
+57eeb971f00e64abde25be69dbcb4e3ce5065a57 *av1-1-b8-01-size-34x34.ivf.md5
+b36fee1b6ad69d1206466615d69c05e0a4407939 *av1-1-b8-01-size-64x34.ivf
+a78aea0250d0b32657dc0eaf2d8394bc766c0e35 *av1-1-b8-01-size-64x34.ivf.md5
+10e441209262e082e31fef8c15b51579c9e81509 *av1-1-b8-01-size-66x34.ivf
+558b46f6ef1662c208012d0b66d1857eeff3244e *av1-1-b8-01-size-66x34.ivf.md5
+dd44aad500c7ca0fc97e3d8f0abed3c83b24c79c *av1-1-b8-01-size-16x64.ivf
+a5b64e8063abcf3e4872dc4baf1c32384dc5cf83 *av1-1-b8-01-size-16x64.ivf.md5
+aa849f0d09bcb2ead44719d63043536932d5c9f2 *av1-1-b8-01-size-18x64.ivf
+bcdf2dea3590c7031158ffe7b907d9ee35e2fe57 *av1-1-b8-01-size-18x64.ivf.md5
+36e856d30e160ba2fbb00510296202f61afaae49 *av1-1-b8-01-size-32x64.ivf
+99299f75b82c40c13f168adf2d124f57044a39a2 *av1-1-b8-01-size-32x64.ivf.md5
+e3e03ec5d38eb25e97e4ec3adc6ed40ecdebd278 *av1-1-b8-01-size-34x64.ivf
+84625abf8a200a7d20dd3dd3b277b50b3d62ce32 *av1-1-b8-01-size-34x64.ivf.md5
+7d017daebef2d39ed42a505a8e6103ab0c0988c1 *av1-1-b8-01-size-64x64.ivf
+1ff38d5ecba82fb2e6ac3b09c29c9fe74885ac29 *av1-1-b8-01-size-64x64.ivf.md5
+e1b58ba0b462508593399a2ed84db5f1c59ffcd2 *av1-1-b8-01-size-66x64.ivf
+a6b2c84c94fe79ab0373d157d1203f8d66de0706 *av1-1-b8-01-size-66x64.ivf.md5
+7b4faa7eb7b73392b62de6613282a98dddc13bb6 *av1-1-b8-01-size-16x66.ivf
+a2dacf2bae3c4ab352af66a9600946d29ab9a6ee *av1-1-b8-01-size-16x66.ivf.md5
+0f97805fa30497d4cf39665150f00dfdea52d862 *av1-1-b8-01-size-18x66.ivf
+33d8ea0765953250f998da3fe161f2a8cfca2353 *av1-1-b8-01-size-18x66.ivf.md5
+c8bb00256de973e3b3ee31b924f554336d310cdb *av1-1-b8-01-size-32x66.ivf
+6a6588e6edc68ff7739968a9e7cc6d9eaaeed356 *av1-1-b8-01-size-32x66.ivf.md5
+75ec54fec5c36eecde6d0a16e0389a5f7ad8ec22 *av1-1-b8-01-size-34x66.ivf
+36101dfa9495c18696c0d7d61f25e748f4de7425 *av1-1-b8-01-size-34x66.ivf.md5
+7e5491716e70f8199156b8843513c935667b281e *av1-1-b8-01-size-64x66.ivf
+da38755bb0c9ef56b81617835ddf1340242c6dce *av1-1-b8-01-size-64x66.ivf.md5
+68b47b386f61d67cb5b824a7e6bf87c8b9c2bf7b *av1-1-b8-01-size-66x66.ivf
+25974893956ebd92df474325946130c34f880ea7 *av1-1-b8-01-size-66x66.ivf.md5
+9f386d19c87dbfd6ac84a06d2393dd88863ac003 *av1-1-b8-01-size-196x196.ivf
+788f77f655f55de3db94dd69870316134c149116 *av1-1-b8-01-size-196x196.ivf.md5
+ed3bb2bb52a9d1786e233ef38142b15b85097875 *av1-1-b8-01-size-198x196.ivf
+3bb6b6721ad9b2838b2d07e47b29d6c0117526b1 *av1-1-b8-01-size-198x196.ivf.md5
+49461772caaaa7b824d48f4e9c77a906b0dc02d5 *av1-1-b8-01-size-200x196.ivf
+f1cba00c36909c56097c8785df476d42bc91f259 *av1-1-b8-01-size-200x196.ivf.md5
+44a656a22958e26ed169a69deb8f373117224f06 *av1-1-b8-01-size-202x196.ivf
+69be876b52fe42811bba52d36d0bcc88d6c25b3f *av1-1-b8-01-size-202x196.ivf.md5
+0a6fe9b478363faedbfd465a75790b4c2661b9ba *av1-1-b8-01-size-208x196.ivf
+fc8e95a6860a8a37ccdf1dfe49828502fcf96a08 *av1-1-b8-01-size-208x196.ivf.md5
+8e05b5a20ec95afd92bb615a7daa2e17a7ef55a8 *av1-1-b8-01-size-210x196.ivf
+0add512bffbda3300d8f684a53b13b996fe2e46d *av1-1-b8-01-size-210x196.ivf.md5
+a15f12652c6b4d0c30f13a439c941bfc4a431d1a *av1-1-b8-01-size-224x196.ivf
+b904b93252175f79e0e2b28896131ce93d5fc925 *av1-1-b8-01-size-224x196.ivf.md5
+1a57b913443b267f4a31a6925c39f5b58022f550 *av1-1-b8-01-size-226x196.ivf
+7cf3087de5804763a82d2a798243a66459664772 *av1-1-b8-01-size-226x196.ivf.md5
+2cc28541a2a72e8b45a368f71e70fc294e2de3ab *av1-1-b8-01-size-196x198.ivf
+bb736eedb4bd1e39bf9d60435b4b27a12842e112 *av1-1-b8-01-size-196x198.ivf.md5
+c4ebf93fbf3ae52108fd7b39ddef3afae48188ea *av1-1-b8-01-size-198x198.ivf
+fa4de6881511728bafa15b5f441a0cfdf683cc75 *av1-1-b8-01-size-198x198.ivf.md5
+55fce983186d454b0eb15527393bb2465ba41c6b *av1-1-b8-01-size-200x198.ivf
+1ac8fb1ee622cbc4aa1b83cb46b4731c85efae62 *av1-1-b8-01-size-200x198.ivf.md5
+67d276c67886f0a91a7ee06751a64f95eeb7bc1f *av1-1-b8-01-size-202x198.ivf
+1633b62d9e4ea41737c42f70cbde9a5671da0cef *av1-1-b8-01-size-202x198.ivf.md5
+081cb3f29d3956d4d858d9661fd3d62c94b68867 *av1-1-b8-01-size-208x198.ivf
+871d1c99167408dd32fa7603a7296c9b99ccda15 *av1-1-b8-01-size-208x198.ivf.md5
+b2d80b42468d5f296ae240cfb1fc0b3dd3d96bbc *av1-1-b8-01-size-210x198.ivf
+6a3382656cb17b532a97b1061697f9a878fc58d1 *av1-1-b8-01-size-210x198.ivf.md5
+84d7994fa20fcf6c1d8dbd4c2060c988a6fce831 *av1-1-b8-01-size-224x198.ivf
+42ea12e15de81f2e8617b6de7bae76de2da4d648 *av1-1-b8-01-size-224x198.ivf.md5
+c74a9281cf98c597121df6bff0ac5312b887f969 *av1-1-b8-01-size-226x198.ivf
+4133aae0001804e2bbc7928fc065517a6dd8b288 *av1-1-b8-01-size-226x198.ivf.md5
+27adbf148c63f807bd617cfd78aeaedb8b0f2304 *av1-1-b8-01-size-196x200.ivf
+9253e525e6207ef1ce0839b8f88ea781e9abe41e *av1-1-b8-01-size-196x200.ivf.md5
+21c9ea4d882e48353d3df66fcde0e4746168163f *av1-1-b8-01-size-198x200.ivf
+3d5ee59fde9194f0eaff736051cfd1d7b7daeff1 *av1-1-b8-01-size-198x200.ivf.md5
+c27b0b57667910847122a0309c703315e444110f *av1-1-b8-01-size-200x200.ivf
+7b2a15a17b421ef07e285ca4e8a224f0512c434d *av1-1-b8-01-size-200x200.ivf.md5
+780de549e4163a52590f7c0f488e027a8a4aa053 *av1-1-b8-01-size-202x200.ivf
+cb0ec0969522ca60d79a639e9b9509363468ffd0 *av1-1-b8-01-size-202x200.ivf.md5
+2c59821904863e264ae61401cbd494a79bc04f13 *av1-1-b8-01-size-208x200.ivf
+9963955966a52b65cdd13465c9fb2ba3b5356755 *av1-1-b8-01-size-208x200.ivf.md5
+ff63121611ea9c0628c7e5af13de5e7786611ca6 *av1-1-b8-01-size-210x200.ivf
+2a5993be234e3af2af6d185b2a6f3aaf1979b83a *av1-1-b8-01-size-210x200.ivf.md5
+b8485ada95440d78b51153227231b1aced1a8273 *av1-1-b8-01-size-224x200.ivf
+9c3cd32ea6c006a91eb37d69dbeccf878de5d214 *av1-1-b8-01-size-224x200.ivf.md5
+1aa0ce3e3a74f9b600a146e98b05547a0b454c48 *av1-1-b8-01-size-226x200.ivf
+e045be96c3af16a9ddc10a9933e8ddfb3319d716 *av1-1-b8-01-size-226x200.ivf.md5
+e92b76480f4339855d998b97182f36b28deadcfa *av1-1-b8-01-size-196x202.ivf
+480c707abcd2a650e2160ec397f8348cecb45770 *av1-1-b8-01-size-196x202.ivf.md5
+137b9c0d10a3bdbdf6f97b3e6331f3e8acaf8f91 *av1-1-b8-01-size-198x202.ivf
+7429642146d0da55161ab13024a261094ee2ce87 *av1-1-b8-01-size-198x202.ivf.md5
+9cea71c44ad015ac702d675bacca17876e65cb1a *av1-1-b8-01-size-200x202.ivf
+76b1ec6c42da55f47e389a561590d1a7c713e495 *av1-1-b8-01-size-200x202.ivf.md5
+26dffdcd0dac9becf68d12e31fcd91eddf1f7154 *av1-1-b8-01-size-202x202.ivf
+ddb75e99123fed4ef05d9b85200cefd8985bc84c *av1-1-b8-01-size-202x202.ivf.md5
+04007e83bb66ba547d09f8926ea5bfc7fd9e4b2a *av1-1-b8-01-size-208x202.ivf
+5b72eb58db22087ad416c499119f41e718395b52 *av1-1-b8-01-size-208x202.ivf.md5
+721ff7c0ae0e2ed896b5acac230113f1404e769c *av1-1-b8-01-size-210x202.ivf
+187d2ef939fc26e1a1c7de65abe8e058d8aae17a *av1-1-b8-01-size-210x202.ivf.md5
+dba41421cc938bcf0234254f96be0325ab66186e *av1-1-b8-01-size-224x202.ivf
+58856038c1eb13a7bf0353a30b1affe844cd31b1 *av1-1-b8-01-size-224x202.ivf.md5
+55eba14878d25dcc351ee5e92fa06e559035b409 *av1-1-b8-01-size-226x202.ivf
+e295b3d791d40d7c1fff2c40a260078dccaef24a *av1-1-b8-01-size-226x202.ivf.md5
+6c777223990ddfd92040a8526646ed0f39299b0d *av1-1-b8-01-size-196x208.ivf
+5210daff766cddaf3945610ee05ff242aef8175a *av1-1-b8-01-size-196x208.ivf.md5
+252831abfb9f4a9a8556c21cc3bf60adfe88210f *av1-1-b8-01-size-198x208.ivf
+35ed9601e608a829980cec81e41b7bd3e5f4c2ce *av1-1-b8-01-size-198x208.ivf.md5
+e800ed893a88704a4576d4984957f3664560daa9 *av1-1-b8-01-size-200x208.ivf
+82c038f9072a2fcf8d55fb4a474fdd791ba9a290 *av1-1-b8-01-size-200x208.ivf.md5
+9ce7bb932dd99f86da8ff2ab89fa4d3089a78da8 *av1-1-b8-01-size-202x208.ivf
+0611bf0179abe3c820a447a2bd3a04c3790f3a87 *av1-1-b8-01-size-202x208.ivf.md5
+e5900d9150c8bebc49776227afd3b0a21f5a6ac6 *av1-1-b8-01-size-208x208.ivf
+86d6b9a3840aa0a77938547c905bd6f45d069681 *av1-1-b8-01-size-208x208.ivf.md5
+2758ba5dad16f4a91334f2ed07a4a037201bb873 *av1-1-b8-01-size-210x208.ivf
+78453b1fda2ccc6f35e0d762567807757bcddb16 *av1-1-b8-01-size-210x208.ivf.md5
+fff88fb8e833f6b4ad64cb591b219c7cceb7f2d2 *av1-1-b8-01-size-224x208.ivf
+87266fc34aaed82cdb98cbc309b221ad52eccd81 *av1-1-b8-01-size-224x208.ivf.md5
+dec839fe64046461015b56cda191835284f42a52 *av1-1-b8-01-size-226x208.ivf
+d7a15264fc3fd55d3aec0ccfaa7c434c6d90969f *av1-1-b8-01-size-226x208.ivf.md5
+584782e93ed1cb7797a90fece44becdd1e23bf0d *av1-1-b8-01-size-196x210.ivf
+ed76ec841b18a457853e368576967c4768fc2730 *av1-1-b8-01-size-196x210.ivf.md5
+dab625599b9f01398b593e865d9a4a95a029d60f *av1-1-b8-01-size-198x210.ivf
+b90e8d96a1f5b329b088b467a11fed2d055d74ca *av1-1-b8-01-size-198x210.ivf.md5
+6774bee17b9e50d2d8630e2e1afc30ded67e662d *av1-1-b8-01-size-200x210.ivf
+343a86bd54eb3dd5e9902eb62a3d776dcff2f4f3 *av1-1-b8-01-size-200x210.ivf.md5
+0456c3b8e242eeee019ca97d155f81124de62c90 *av1-1-b8-01-size-202x210.ivf
+5a6a6428c9858a0d3561db42ceaf981c143fe479 *av1-1-b8-01-size-202x210.ivf.md5
+6a3a8f65bf806b1be7726b983427880f772c9986 *av1-1-b8-01-size-208x210.ivf
+5563ea6d8c65887553ff3000addc6418913f1650 *av1-1-b8-01-size-208x210.ivf.md5
+5a8b69489f8e9b917ea7718ad2645101cdbe5644 *av1-1-b8-01-size-210x210.ivf
+f4b01604036fa23000d44fbf42097ae1181bcd62 *av1-1-b8-01-size-210x210.ivf.md5
+fb6f5b08a048698cfe324557ee8cd840c4a3f6ce *av1-1-b8-01-size-224x210.ivf
+3ce5c404e3ca09c8e994b3043bad42cd555b00c0 *av1-1-b8-01-size-224x210.ivf.md5
+2e9fc8510d2131b2f3c9a93bececac985e4426d2 *av1-1-b8-01-size-226x210.ivf
+897c537e259331ca86cdd6e4d2bd343f8538402e *av1-1-b8-01-size-226x210.ivf.md5
+8300512106fce3424eb74b5d4bc0f4f19f7c9af8 *av1-1-b8-01-size-196x224.ivf
+43662ea025ea79afe4964fd4d12a77f4aa4e565e *av1-1-b8-01-size-196x224.ivf.md5
+640f8fda7ade8f2850e2275a9f5e233e33a0ba8d *av1-1-b8-01-size-198x224.ivf
+9ac690bdbbce47d7b169128b568f955e70076f8c *av1-1-b8-01-size-198x224.ivf.md5
+ce2e9379c72fc924e364d5727605394a1438a211 *av1-1-b8-01-size-200x224.ivf
+1ec35a53d88072b96b255202f678178bc7e5bb20 *av1-1-b8-01-size-200x224.ivf.md5
+5d3af7921623deccb578115c8ce207c019f97f50 *av1-1-b8-01-size-202x224.ivf
+14eafd55b0cda3a3476cae7ad500dbd5ee899dd5 *av1-1-b8-01-size-202x224.ivf.md5
+6b6d78e466cf94a5ef8dfe252caa0948dd2ec175 *av1-1-b8-01-size-208x224.ivf
+e178b0c272dfcfe614c6b49cb28dad11781af0b6 *av1-1-b8-01-size-208x224.ivf.md5
+dd2232b9e18971d7e19650a1e3218aef1010247f *av1-1-b8-01-size-210x224.ivf
+40a66198c47820f5fa2d2e389ec0c1191ea4ffcc *av1-1-b8-01-size-210x224.ivf.md5
+9ec028b81a5ea311683328d856f436e6d0b0e6a0 *av1-1-b8-01-size-224x224.ivf
+143b9530ce722385db2c2d883daa649ed42b8d40 *av1-1-b8-01-size-224x224.ivf.md5
+bf833947e62935c54e1e727ccb36157f7c1e9e5d *av1-1-b8-01-size-226x224.ivf
+ca4f3b44463106e4f0bb54e490c3bd457d7d780b *av1-1-b8-01-size-226x224.ivf.md5
+5525f7e312ec073f480ed5a2be5bdc4f0ce51a09 *av1-1-b8-01-size-196x226.ivf
+062d4b240741184458d2d2abd243ed7877631de8 *av1-1-b8-01-size-196x226.ivf.md5
+e6b911142394b94c23191eaa63c9eb41a00f80b0 *av1-1-b8-01-size-198x226.ivf
+3b580d903dddf47082f5e055bfb01a4f05c09b7d *av1-1-b8-01-size-198x226.ivf.md5
+70feb5efeb28df25f7d1a661c73bf013c5ada9b4 *av1-1-b8-01-size-200x226.ivf
+f0b894e7f787e62f1492be62f3dedeb065062160 *av1-1-b8-01-size-200x226.ivf.md5
+7f9a10831e2389b31497fad50080b4d5452d6e91 *av1-1-b8-01-size-202x226.ivf
+45b7194eba9367c8059403c23ca4ae49e988dfaf *av1-1-b8-01-size-202x226.ivf.md5
+967837a2cfbf9aa3131f73aec6a52dcdd82926c7 *av1-1-b8-01-size-208x226.ivf
+c8baedb48fd5d4c956aa8d73fd957370f718f047 *av1-1-b8-01-size-208x226.ivf.md5
+9c926226b9f6b015501d8ac1e3f95e8570283a05 *av1-1-b8-01-size-210x226.ivf
+57d4837667fd4c5a7aeb908626d701b632852c60 *av1-1-b8-01-size-210x226.ivf.md5
+25a4940922761239809d82c45c2be1c5e4f48785 *av1-1-b8-01-size-224x226.ivf
+87ae7e7558241bf3575a333f56fbad4dfdade8ff *av1-1-b8-01-size-224x226.ivf.md5
+40dd208eb525cd90d7c0674cf787097fb909afae *av1-1-b8-01-size-226x226.ivf
+34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5
+9bbe8499796aa588ff02e313fb0d4349940d2fea *av1-1-b10-00-quantizer-00.ivf
+36b402eedad2bacee8ac09acce44e2fc356dd80b *av1-1-b10-00-quantizer-00.ivf.md5
+1d5e1d2827624f328020bf123df213bb175577e0 *av1-1-b10-00-quantizer-01.ivf
+16c529be5502369e43ce9c6fe99a9709968e3daf *av1-1-b10-00-quantizer-01.ivf.md5
+39abc20739242a8f05efd4b35d7603c8ad7ff45d *av1-1-b10-00-quantizer-02.ivf
+81faa72c3d43b003966fe09ffaae51b07b1059be *av1-1-b10-00-quantizer-02.ivf.md5
+92ebf349b803333a43824a83d997b8cf76f656f9 *av1-1-b10-00-quantizer-03.ivf
+5e7556dc998cb8b506a43cc078e30802d7e600e6 *av1-1-b10-00-quantizer-03.ivf.md5
+1c496177c66e49f2e3556af87ec67afb5060170b *av1-1-b10-00-quantizer-04.ivf
+560fea4800a44fe19ed8d3e74f425bdbf1fb8abd *av1-1-b10-00-quantizer-04.ivf.md5
+7de864b8475ce0acd0ecb01827f2c9add815352b *av1-1-b10-00-quantizer-05.ivf
+1c1aea3db3f54a91866d89fd3b1a0d285ca10310 *av1-1-b10-00-quantizer-05.ivf.md5
+b6501c165619b036d0f7864fd4739973d2d18970 *av1-1-b10-00-quantizer-06.ivf
+d758c8eff275651006c41e7dd447cac13b489ad7 *av1-1-b10-00-quantizer-06.ivf.md5
+e4df6f588f156dffaafd9517b64f753cfc9ccf05 *av1-1-b10-00-quantizer-07.ivf
+3c577f67dade4537de642fd457ea2b367424f336 *av1-1-b10-00-quantizer-07.ivf.md5
+07e9c4c18abb36c8699c1c12bebcc727f090b525 *av1-1-b10-00-quantizer-08.ivf
+4981568ade3170f311cb114fa2689edc4bc35e67 *av1-1-b10-00-quantizer-08.ivf.md5
+2268ecd2899f1b41ae9898925b1d62cfefa30282 *av1-1-b10-00-quantizer-09.ivf
+029b03029b65b7c4c208961f0820467ad42fd3d6 *av1-1-b10-00-quantizer-09.ivf.md5
+3d2adaf6441cfa9585dcbf7d19d65bf6992a29a3 *av1-1-b10-00-quantizer-10.ivf
+017b7fb4c3ba0747c2d5688d493da33ef993d110 *av1-1-b10-00-quantizer-10.ivf.md5
+006535760bd7dc1cfc95e648b05215954a2e76c2 *av1-1-b10-00-quantizer-11.ivf
+c0ae083deb8e820aa49034af4d100944dd977018 *av1-1-b10-00-quantizer-11.ivf.md5
+840e0cbfe1acc8a7a45c823dc55ab44a0b6b553e *av1-1-b10-00-quantizer-12.ivf
+49232ea38bdef650c94808f53834f1137cd4bf39 *av1-1-b10-00-quantizer-12.ivf.md5
+04b0e5a7387e07474f51be4b2c3e05211b40f0d0 *av1-1-b10-00-quantizer-13.ivf
+a51b5ec4b890df3a64f9f0d866b8c41296c9e081 *av1-1-b10-00-quantizer-13.ivf.md5
+5dc47a140fbcbf08bf91481ee3585e9e067561ab *av1-1-b10-00-quantizer-14.ivf
+2625319eef69d6225e6ab6e5ce7790491406cb5d *av1-1-b10-00-quantizer-14.ivf.md5
+f866be86d8d8aa08ded30e42988b0936c1a16064 *av1-1-b10-00-quantizer-15.ivf
+03b7c1eefb54d99e30051c7123c0453f04a6579d *av1-1-b10-00-quantizer-15.ivf.md5
+548df2371dfb485419ed9baf28e3f495c64f364a *av1-1-b10-00-quantizer-16.ivf
+8a0d6bf1626b05b65c77331305414fe9be54e8c6 *av1-1-b10-00-quantizer-16.ivf.md5
+0077c82f96a2e095a3cb8de9bfa63715e3c9f438 *av1-1-b10-00-quantizer-17.ivf
+5d85f77f3087f4b206930722a945c60039262be4 *av1-1-b10-00-quantizer-17.ivf.md5
+1e0f1245ecb4c903b5dc7072d959fc43a7bba381 *av1-1-b10-00-quantizer-18.ivf
+06316ae2b45f2359a70cc3855ffd6ab81048b41a *av1-1-b10-00-quantizer-18.ivf.md5
+f197198f7ec058110185fda5297a1a43993654df *av1-1-b10-00-quantizer-19.ivf
+bac522c7f234d506c75b5495d74b3fa57c83a4df *av1-1-b10-00-quantizer-19.ivf.md5
+c2f57324d000b349323f37d5ebebde8c2b861f30 *av1-1-b10-00-quantizer-20.ivf
+999c6110786cbc25e67792234a5a02f2cb4553c7 *av1-1-b10-00-quantizer-20.ivf.md5
+2ffad9adfd19286fe2166ba877289d201c9a634f *av1-1-b10-00-quantizer-21.ivf
+d55713eaa791cfd7bf69b6c26d5032029d9a0f06 *av1-1-b10-00-quantizer-21.ivf.md5
+382528db53328c1a38976f5d9b579eef35d839f4 *av1-1-b10-00-quantizer-22.ivf
+cb5bd459e1a90126da9264cff4281515f95755b2 *av1-1-b10-00-quantizer-22.ivf.md5
+b52cc6160fc66f72ad66c198d275a1c73f925022 *av1-1-b10-00-quantizer-23.ivf
+c0f9d6659e1f283e9356fd7b4ac9f7cc5544cdc2 *av1-1-b10-00-quantizer-23.ivf.md5
+e11f15e3b63e7606b1122bb3670ee77c09c04840 *av1-1-b10-00-quantizer-24.ivf
+e9f141b924440e044270c81a68458fe498599a8e *av1-1-b10-00-quantizer-24.ivf.md5
+fb91793b69824c99b0218788dcea0a74ebd7e84e *av1-1-b10-00-quantizer-25.ivf
+434e33d609b2683c3cfbcc3a2cdfc26339590fb6 *av1-1-b10-00-quantizer-25.ivf.md5
+d82e38f31cdcf8b43479e6ddaa83373de38f70a2 *av1-1-b10-00-quantizer-26.ivf
+183943b851ba383a536f13c83b93f61ac8961ad5 *av1-1-b10-00-quantizer-26.ivf.md5
+6bf5e4e8e0aca699e493b9eb3672d2117494d74d *av1-1-b10-00-quantizer-27.ivf
+f0fb7e0a99180828b0e38b2cfe0622eecc2d26b8 *av1-1-b10-00-quantizer-27.ivf.md5
+d5adee2567544c3ae4223b3f3528a770377878d2 *av1-1-b10-00-quantizer-28.ivf
+14edf588efc67570e529b0ff8aeb8e7a0c69238b *av1-1-b10-00-quantizer-28.ivf.md5
+e6dcdc106847956035e3f00aabf4470f97e1887e *av1-1-b10-00-quantizer-29.ivf
+413c5cb778611c7c1a810b53861b9ab1fb391f17 *av1-1-b10-00-quantizer-29.ivf.md5
+b5e98b3f6b1db04d46bf43064c6ac64f797aff00 *av1-1-b10-00-quantizer-30.ivf
+d1a603661d76c28658c7cd2892b408e91d77893e *av1-1-b10-00-quantizer-30.ivf.md5
+80168371d1150e82e3f46bcbbcabba458b835b19 *av1-1-b10-00-quantizer-31.ivf
+904ecd033d4af5239c4d5b3f86e51ed5c3c2e3fb *av1-1-b10-00-quantizer-31.ivf.md5
+96291f6ace85980892d135a5b74188cd629c325f *av1-1-b10-00-quantizer-32.ivf
+a5ceace390d4a75d48281fe29060c21557e4f5ae *av1-1-b10-00-quantizer-32.ivf.md5
+0f80495de34eae07c4905b72573a315a879390ec *av1-1-b10-00-quantizer-33.ivf
+72b8f662973a660412946687dff878b276ae518e *av1-1-b10-00-quantizer-33.ivf.md5
+24905e3be7db320994b7fb8311dfd50a7c9e54da *av1-1-b10-00-quantizer-34.ivf
+cea514bb1b7b064c4d31914a2cb266611c278577 *av1-1-b10-00-quantizer-34.ivf.md5
+083012960dd7c17d3b00fa0e807759c98faded8f *av1-1-b10-00-quantizer-35.ivf
+de5fdb9e1e581484af1cc7d2dd3c3e84c90cebb2 *av1-1-b10-00-quantizer-35.ivf.md5
+f725f179aeee5b413620c0dd81b007b245c2a7ed *av1-1-b10-00-quantizer-36.ivf
+246b1931c04c02df1f168090e2650827cd5dbabd *av1-1-b10-00-quantizer-36.ivf.md5
+f6aa824156e9848f237481889a8103eb6130f31d *av1-1-b10-00-quantizer-37.ivf
+a8f78dd15fc2994369a08c2ddddcd0760c62ea5b *av1-1-b10-00-quantizer-37.ivf.md5
+a8dd662338c493aea266b99203e70af25982633f *av1-1-b10-00-quantizer-38.ivf
+09f36d998e85d0450060f540e50b075ae1432fc6 *av1-1-b10-00-quantizer-38.ivf.md5
+d97428871720ed658da6ed0e3f7c15da83387e4c *av1-1-b10-00-quantizer-39.ivf
+8c5230048909ee8f86f87c116f153cd910d0141f *av1-1-b10-00-quantizer-39.ivf.md5
+86e754e55e9b63c6e0a4fef01761414f8a6b61ca *av1-1-b10-00-quantizer-40.ivf
+99a71accf6457264e45ca80d3b1f082ee5acdecc *av1-1-b10-00-quantizer-40.ivf.md5
+9d18b7236506ab7e107c062620b64096ec0cf423 *av1-1-b10-00-quantizer-41.ivf
+5771159a9a7c7b66c9e13bb13ec3d53b37860208 *av1-1-b10-00-quantizer-41.ivf.md5
+54b72bc879a80e66613f421e67db62bba1c0041b *av1-1-b10-00-quantizer-42.ivf
+bf958236883ee7209ef4cb0b7503b430634a291e *av1-1-b10-00-quantizer-42.ivf.md5
+a06d5321a51d90404dd7085ae511d7df5d5e1e05 *av1-1-b10-00-quantizer-43.ivf
+ddb25723d976043d863634b9dc3b5fb84a245803 *av1-1-b10-00-quantizer-43.ivf.md5
+2ea0b64c170d7299dae1c14a8a49349aee8e0d08 *av1-1-b10-00-quantizer-44.ivf
+d18bde1b4893792173fa2014665e9364395ad5e9 *av1-1-b10-00-quantizer-44.ivf.md5
+73e506a32d3518e23424f231c7b5323d7a34a3d6 *av1-1-b10-00-quantizer-45.ivf
+be6224ebc77a3e5fb9c1645b876007e584a09d89 *av1-1-b10-00-quantizer-45.ivf.md5
+841223871374464194edc739c48dc7cefd1ff255 *av1-1-b10-00-quantizer-46.ivf
+4766d616f923496a8dc113c9b7f875f0c0735f9a *av1-1-b10-00-quantizer-46.ivf.md5
+8bbbbea130aaea453f7b826956a5520d10a0eccf *av1-1-b10-00-quantizer-47.ivf
+3ea21fac0c492b03d8ec25e4ee0971cd57e5f71a *av1-1-b10-00-quantizer-47.ivf.md5
+3ce83e0f1e1835b9a6c10fe502a16fd3650839e0 *av1-1-b10-00-quantizer-48.ivf
+b468de2c09fca5a6b2bb7a20bab4afd8d192c31d *av1-1-b10-00-quantizer-48.ivf.md5
+f3a757c678aa00f9a9c4c4658d37733fd935925a *av1-1-b10-00-quantizer-49.ivf
+f888dc88db576122695d4eb41c486aacd28a2d1d *av1-1-b10-00-quantizer-49.ivf.md5
+a9d78aaef105cc5a95b7ebb54783f37e75673123 *av1-1-b10-00-quantizer-50.ivf
+06d0c5e79cc794030c4be022089b1d12c1383f71 *av1-1-b10-00-quantizer-50.ivf.md5
+165c20ee372f83682d094541097e375227353239 *av1-1-b10-00-quantizer-51.ivf
+b3d90214b8c6e6f6d9357bb5784d10081325c356 *av1-1-b10-00-quantizer-51.ivf.md5
+5b3ea7a18654d943065f5c176974c3960b56664e *av1-1-b10-00-quantizer-52.ivf
+dc61a6e4e2549074130023b14b137fb4fe442ce3 *av1-1-b10-00-quantizer-52.ivf.md5
+74c3b5851b6a94d33b575a689eb8d34592e95d5f *av1-1-b10-00-quantizer-53.ivf
+a80e43a0fb2b852426bd941b8d4b8f56690e9bc9 *av1-1-b10-00-quantizer-53.ivf.md5
+d05b8dea2cddd4f0d9e792f42f71afbd29f7811c *av1-1-b10-00-quantizer-54.ivf
+432937893321f4bd25fa400b8988c5788cb06ecf *av1-1-b10-00-quantizer-54.ivf.md5
+4eaee0f1970426be0bbeb7d4fccdc7e804e9bea4 *av1-1-b10-00-quantizer-55.ivf
+710ab95ce1dcd2540db4477ff4ee6ab771fe0759 *av1-1-b10-00-quantizer-55.ivf.md5
+fe637930c9faa8744cba37effc4cb5510315d1c0 *av1-1-b10-00-quantizer-56.ivf
+2f9431b30523fb6a3e4122f22c6c3ff7b96a7987 *av1-1-b10-00-quantizer-56.ivf.md5
+ed54fc7fcec194eef1f50adbbe12a6a36ab6836b *av1-1-b10-00-quantizer-57.ivf
+43bccac7800b399210cf15520a83739c23a5d9c7 *av1-1-b10-00-quantizer-57.ivf.md5
+a7b8d628ba3e4c5f37aa6a3d7b82afda73ac89dc *av1-1-b10-00-quantizer-58.ivf
+b26638272b787df54f45a46629b852acbcb73e3d *av1-1-b10-00-quantizer-58.ivf.md5
+c077f22ff547fb5ffd020e8dac91d05942fb52df *av1-1-b10-00-quantizer-59.ivf
+4efd99cc0891bf345b8cd2ae8e21709d61be497b *av1-1-b10-00-quantizer-59.ivf.md5
+301ab53039d75e1ffa8cc6a0874d9ea94e4a6a0d *av1-1-b10-00-quantizer-60.ivf
+4729bd734a6edd2d8d0432a3f66b3d91d565050e *av1-1-b10-00-quantizer-60.ivf.md5
+c78640d3211034df9fcb273bdfc18625819652f2 *av1-1-b10-00-quantizer-61.ivf
+3d823eb2b33ccfea68db506626bcbecf49b0f167 *av1-1-b10-00-quantizer-61.ivf.md5
+bf241a449a28773b93e6e529a06dfc28109577e4 *av1-1-b10-00-quantizer-62.ivf
+75457d8476f1927f737d089dcf3d0f7f99f3c4fb *av1-1-b10-00-quantizer-62.ivf.md5
+8b6eb3fff2e0db7eac775b08c745250ca591e2d9 *av1-1-b10-00-quantizer-63.ivf
+63ea689d025593e5d91760785b8e446d04d4671e *av1-1-b10-00-quantizer-63.ivf.md5
+a9f7ea6312a533cc6426a6145edd190d45813c37 *av1-1-b8-02-allintra.ivf
+8fd8f789cfee1069d20f3e2c241f5cad7292239e *av1-1-b8-02-allintra.ivf.md5
+e69e41fee40b408b6eebcc79f266a95f2ee24f9e *av1-1-b8-03-sizedown.mkv
+8c528fb3ccda959a29721566e132f730935ca32b *av1-1-b8-03-sizedown.mkv.md5
+1889da5ee1708007e47bb887470ac477e1d7ba01 *av1-1-b8-03-sizeup.mkv
+8de81b170635d456602dc8923a8b39c534d01fa8 *av1-1-b8-03-sizeup.mkv.md5
+d3ed7de0aa8c155fe35e0f5f4203240710d31383 *park_joy_90p_8_420_monochrome.y4m
+5b3f0907407b809aa66b62cb080feda8c92454ca *park_joy_90p_8_420_vertical_csp.y4m
diff --git a/third_party/aom/test/test.cmake b/third_party/aom/test/test.cmake
new file mode 100644
index 000000000..b16ae14c3
--- /dev/null
+++ b/third_party/aom/test/test.cmake
@@ -0,0 +1,438 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_TEST_TEST_CMAKE_)
+ return()
+endif() # AOM_TEST_TEST_CMAKE_
+set(AOM_TEST_TEST_CMAKE_ 1)
+
+include(FindPythonInterp)
+include(ProcessorCount)
+
+include("${AOM_ROOT}/test/test_data_util.cmake")
+
+set(AOM_UNIT_TEST_DATA_LIST_FILE "${AOM_ROOT}/test/test-data.sha1")
+
+list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
+ "${AOM_ROOT}/test/test_libaom.cc")
+
+list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/acm_random.h"
+ "${AOM_ROOT}/test/aom_integer_test.cc"
+ "${AOM_ROOT}/test/av1_config_test.cc"
+ "${AOM_ROOT}/test/blockd_test.cc"
+ "${AOM_ROOT}/test/clear_system_state.h"
+ "${AOM_ROOT}/test/codec_factory.h"
+ "${AOM_ROOT}/test/decode_test_driver.cc"
+ "${AOM_ROOT}/test/decode_test_driver.h"
+ "${AOM_ROOT}/test/function_equivalence_test.h"
+ "${AOM_ROOT}/test/log2_test.cc"
+ "${AOM_ROOT}/test/md5_helper.h"
+ "${AOM_ROOT}/test/register_state_check.h"
+ "${AOM_ROOT}/test/test_vectors.cc"
+ "${AOM_ROOT}/test/test_vectors.h"
+ "${AOM_ROOT}/test/transform_test_base.h"
+ "${AOM_ROOT}/test/util.h"
+ "${AOM_ROOT}/test/video_source.h")
+
+if(CONFIG_INTERNAL_STATS)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/hbd_metrics_test.cc")
+endif()
+
+list(APPEND AOM_UNIT_TEST_DECODER_SOURCES "${AOM_ROOT}/test/decode_api_test.cc"
+ "${AOM_ROOT}/test/external_frame_buffer_test.cc"
+ "${AOM_ROOT}/test/invalid_file_test.cc"
+ "${AOM_ROOT}/test/test_vector_test.cc"
+ "${AOM_ROOT}/test/ivf_video_source.h")
+
+list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/active_map_test.cc"
+ "${AOM_ROOT}/test/altref_test.cc"
+ "${AOM_ROOT}/test/aq_segment_test.cc"
+ "${AOM_ROOT}/test/borders_test.cc"
+ "${AOM_ROOT}/test/cpu_speed_test.cc"
+ "${AOM_ROOT}/test/datarate_test.cc"
+ "${AOM_ROOT}/test/encode_api_test.cc"
+ "${AOM_ROOT}/test/encode_test_driver.cc"
+ "${AOM_ROOT}/test/encode_test_driver.h"
+ "${AOM_ROOT}/test/end_to_end_test.cc"
+ "${AOM_ROOT}/test/error_resilience_test.cc"
+ "${AOM_ROOT}/test/frame_size_tests.cc"
+ "${AOM_ROOT}/test/horz_superres_test.cc"
+ "${AOM_ROOT}/test/i420_video_source.h"
+ "${AOM_ROOT}/test/lossless_test.cc"
+ "${AOM_ROOT}/test/monochrome_test.cc"
+ "${AOM_ROOT}/test/qm_test.cc"
+ "${AOM_ROOT}/test/resize_test.cc"
+ "${AOM_ROOT}/test/scalability_test.cc"
+ "${AOM_ROOT}/test/y4m_test.cc"
+ "${AOM_ROOT}/test/y4m_video_source.h"
+ "${AOM_ROOT}/test/yuv_video_source.h")
+
+list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
+list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc")
+list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h")
+list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
+ "${AOM_ROOT}/test/test_intra_pred_speed.cc")
+
+if(NOT BUILD_SHARED_LIBS)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/cdef_test.cc"
+ "${AOM_ROOT}/test/cfl_test.cc"
+ "${AOM_ROOT}/test/convolve_test.cc"
+ "${AOM_ROOT}/test/hiprec_convolve_test.cc"
+ "${AOM_ROOT}/test/hiprec_convolve_test_util.cc"
+ "${AOM_ROOT}/test/hiprec_convolve_test_util.h"
+ "${AOM_ROOT}/test/intrabc_test.cc"
+ "${AOM_ROOT}/test/intrapred_test.cc"
+ "${AOM_ROOT}/test/lpf_test.cc"
+ "${AOM_ROOT}/test/onyxc_int_test.cc"
+ "${AOM_ROOT}/test/scan_test.cc"
+ "${AOM_ROOT}/test/selfguided_filter_test.cc"
+ "${AOM_ROOT}/test/simd_cmp_impl.h"
+ "${AOM_ROOT}/test/simd_impl.h")
+
+ if(CONFIG_ACCOUNTING)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/accounting_test.cc")
+ endif()
+
+ if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
+ "${AOM_ROOT}/test/av1_ext_tile_test.cc"
+ "${AOM_ROOT}/test/binary_codes_test.cc"
+ "${AOM_ROOT}/test/boolcoder_test.cc"
+ "${AOM_ROOT}/test/coding_path_sync.cc"
+ "${AOM_ROOT}/test/decode_multithreaded_test.cc"
+ "${AOM_ROOT}/test/divu_small_test.cc"
+ "${AOM_ROOT}/test/dr_prediction_test.cc"
+ "${AOM_ROOT}/test/ec_test.cc"
+ "${AOM_ROOT}/test/ethread_test.cc"
+ "${AOM_ROOT}/test/film_grain_table_test.cc"
+ "${AOM_ROOT}/test/segment_binarization_sync.cc"
+ "${AOM_ROOT}/test/superframe_test.cc"
+ "${AOM_ROOT}/test/tile_independence_test.cc")
+ endif()
+
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/test/simd_cmp_neon.cc")
+ if(HAVE_NEON)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_neon_test.cc")
+ endif()
+
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/test/simd_cmp_sse2.cc")
+ if(HAVE_SSE2)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_sse2_test.cc")
+ endif()
+
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/test/simd_cmp_ssse3.cc")
+ if(HAVE_SSSE3)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_ssse3_test.cc")
+ endif()
+
+ if(HAVE_SSE4)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_sse4_test.cc")
+ endif()
+
+ if(HAVE_SSE4_1)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/filterintra_test.cc")
+ endif()
+
+ list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/test/simd_cmp_avx2.cc")
+ if(HAVE_AVX2)
+ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
+ "${AOM_ROOT}/test/simd_avx2_test.cc")
+ endif()
+
+ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/arf_freq_test.cc"
+ "${AOM_ROOT}/test/av1_convolve_2d_test.cc"
+ "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc"
+ "${AOM_ROOT}/test/av1_convolve_2d_test_util.h"
+ "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc"
+ "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc"
+ "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc"
+ "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
+ "${AOM_ROOT}/test/av1_round_shift_array_test.cc"
+ "${AOM_ROOT}/test/av1_txfm_test.cc"
+ "${AOM_ROOT}/test/av1_txfm_test.h"
+ "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
+ "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc"
+ "${AOM_ROOT}/test/blend_a64_mask_test.cc"
+ "${AOM_ROOT}/test/comp_avg_pred_test.cc"
+ "${AOM_ROOT}/test/comp_avg_pred_test.h"
+ "${AOM_ROOT}/test/comp_mask_variance_test.cc"
+ "${AOM_ROOT}/test/encodetxb_test.cc"
+ "${AOM_ROOT}/test/error_block_test.cc"
+ "${AOM_ROOT}/test/fft_test.cc"
+ "${AOM_ROOT}/test/fwht4x4_test.cc"
+ "${AOM_ROOT}/test/masked_sad_test.cc"
+ "${AOM_ROOT}/test/masked_variance_test.cc"
+ "${AOM_ROOT}/test/motion_vector_test.cc"
+ "${AOM_ROOT}/test/noise_model_test.cc"
+ "${AOM_ROOT}/test/obmc_sad_test.cc"
+ "${AOM_ROOT}/test/obmc_variance_test.cc"
+ "${AOM_ROOT}/test/pickrst_test.cc"
+ "${AOM_ROOT}/test/sad_test.cc"
+ "${AOM_ROOT}/test/subtract_test.cc"
+ "${AOM_ROOT}/test/reconinter_test.cc"
+ "${AOM_ROOT}/test/sum_squares_test.cc"
+ "${AOM_ROOT}/test/variance_test.cc"
+ "${AOM_ROOT}/test/wiener_test.cc"
+ "${AOM_ROOT}/test/warp_filter_test.cc"
+ "${AOM_ROOT}/test/warp_filter_test_util.cc"
+ "${AOM_ROOT}/test/warp_filter_test_util.h")
+
+ list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/test/av1_highbd_iht_test.cc"
+ "${AOM_ROOT}/test/av1_quantize_test.cc"
+ "${AOM_ROOT}/test/corner_match_test.cc"
+ "${AOM_ROOT}/test/quantize_func_test.cc"
+ "${AOM_ROOT}/test/simd_cmp_sse4.cc")
+
+ if(HAVE_SSE4_1)
+ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
+ "${AOM_ROOT}/test/av1_convolve_scale_test.cc"
+ "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc"
+ "${AOM_ROOT}/test/intra_edge_test.cc")
+
+ endif()
+
+ if(HAVE_SSE4_2)
+ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/hash_test.cc")
+ endif()
+
+endif()
+
+if(ENABLE_TESTS)
+ find_package(PythonInterp)
+ if(NOT PYTHONINTERP_FOUND)
+ message(FATAL_ERROR
+ "--- Unit tests require Python, rerun cmake with "
+ "-DENABLE_TESTS=0 to avoid this error, or install Python and "
+ "make sure it's in your PATH.")
+ endif()
+
+  if(MSVC) # Force a static runtime to avoid collisions with googletest.
+ include("${AOM_ROOT}/build/cmake/msvc_runtime.cmake")
+ if(BUILD_SHARED_LIBS)
+ set(AOM_DISABLE_GTEST_CMAKE 1)
+ endif()
+ endif()
+
+ if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning.
+ set(CMAKE_MACOSX_RPATH 1)
+ endif()
+
+ include_directories(
+ "${AOM_ROOT}/third_party/googletest/src/googletest/include")
+
+ if(AOM_DISABLE_GTEST_CMAKE)
+ include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
+ add_library(
+ gtest
+ STATIC
+ "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+ else()
+ add_subdirectory("${AOM_ROOT}/third_party/googletest/src/googletest"
+ EXCLUDE_FROM_ALL)
+ endif()
+endif()
+
+# Set up testdata download targets, test build targets, and test run targets. The
+# libaom and app util targets must exist before this function is called.
+function(setup_aom_test_targets)
+
+ # TODO(tomfinegan): Build speed optimization. $AOM_UNIT_TEST_COMMON_SOURCES
+ # and $AOM_UNIT_TEST_ENCODER_SOURCES are very large. The build of test targets
+ # could be sped up (on multicore build machines) by compiling sources in each
+ # list into separate object library targets, and then linking them into
+ # test_libaom.
+ add_library(test_aom_common OBJECT ${AOM_UNIT_TEST_COMMON_SOURCES})
+ add_dependencies(test_aom_common aom)
+
+ if(CONFIG_AV1_DECODER)
+ add_library(test_aom_decoder OBJECT ${AOM_UNIT_TEST_DECODER_SOURCES})
+ add_dependencies(test_aom_decoder aom)
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ add_library(test_aom_encoder OBJECT ${AOM_UNIT_TEST_ENCODER_SOURCES})
+ add_dependencies(test_aom_encoder aom)
+ endif()
+
+ add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
+ $<TARGET_OBJECTS:aom_common_app_util>
+ $<TARGET_OBJECTS:test_aom_common>)
+ list(APPEND AOM_APP_TARGETS test_libaom)
+
+ if(CONFIG_AV1_DECODER)
+ target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:aom_decoder_app_util>
+ $<TARGET_OBJECTS:test_aom_decoder>)
+
+ if(ENABLE_DECODE_PERF_TESTS AND CONFIG_WEBM_IO)
+ target_sources(test_libaom PRIVATE ${AOM_DECODE_PERF_TEST_SOURCES})
+ endif()
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:test_aom_encoder>
+ $<TARGET_OBJECTS:aom_encoder_app_util>)
+
+ if(ENABLE_ENCODE_PERF_TESTS)
+ target_sources(test_libaom PRIVATE ${AOM_ENCODE_PERF_TEST_SOURCES})
+ endif()
+
+ if(NOT BUILD_SHARED_LIBS)
+ add_executable(test_intra_pred_speed ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
+ $<TARGET_OBJECTS:aom_common_app_util>)
+ target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
+ gtest)
+ list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
+ endif()
+ endif()
+
+ target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom gtest)
+
+ if(CONFIG_LIBYUV)
+ target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>)
+ endif()
+ if(CONFIG_WEBM_IO)
+ target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:webm>)
+ endif()
+ if(HAVE_SSE2)
+ add_intrinsics_source_to_target("-msse2" "test_libaom"
+ "AOM_UNIT_TEST_COMMON_INTRIN_SSE2")
+ endif()
+ if(HAVE_SSSE3)
+ add_intrinsics_source_to_target("-mssse3" "test_libaom"
+ "AOM_UNIT_TEST_COMMON_INTRIN_SSSE3")
+ endif()
+ if(HAVE_SSE4_1)
+ add_intrinsics_source_to_target("-msse4.1" "test_libaom"
+ "AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1")
+ if(CONFIG_AV1_ENCODER)
+ if(AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1)
+ add_intrinsics_source_to_target("-msse4.1" "test_libaom"
+ "AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1")
+ endif()
+ endif()
+ endif()
+ if(HAVE_AVX2)
+ add_intrinsics_source_to_target("-mavx2" "test_libaom"
+ "AOM_UNIT_TEST_COMMON_INTRIN_AVX2")
+ endif()
+ if(HAVE_NEON)
+ add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
+ "AOM_UNIT_TEST_COMMON_INTRIN_NEON")
+ endif()
+
+ if(ENABLE_TESTDATA)
+ make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files
+ test_file_checksums)
+ list(LENGTH test_files num_test_files)
+ list(LENGTH test_file_checksums num_test_file_checksums)
+
+ math(EXPR max_file_index "${num_test_files} - 1")
+ foreach(test_index RANGE ${max_file_index})
+ list(GET test_files ${test_index} test_file)
+ list(GET test_file_checksums ${test_index} test_file_checksum)
+ add_custom_target(testdata_${test_index}
+ COMMAND
+ ${CMAKE_COMMAND} -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
+ -DAOM_ROOT="${AOM_ROOT}"
+ -DAOM_TEST_FILE="${test_file}"
+ -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
+ "${AOM_ROOT}/test/test_data_download_worker.cmake")
+ list(APPEND testdata_targets testdata_${test_index})
+ endforeach()
+
+ # Create a custom build target for running each test data download target.
+ add_custom_target(testdata)
+ add_dependencies(testdata ${testdata_targets})
+
+  # Skip creation of test run targets when generating for Visual Studio and
+  # Xcode unless the user explicitly requests IDE test hosting. This keeps
+  # build cycles in the IDE tolerable when its build-project command is used
+  # to build AOM: the default IDE behavior is to build all targets, and the
+  # test run takes hours.
+ if(((NOT MSVC) AND (NOT XCODE)) OR ENABLE_IDE_TEST_HOSTING)
+
+ # Pick a reasonable number of targets (this controls parallelization).
+ processorcount(num_test_targets)
+ if(num_test_targets EQUAL 0) # Just default to 10 targets when there's no
+ # processor count available.
+ set(num_test_targets 10)
+ endif()
+
+ math(EXPR max_shard_index "${num_test_targets} - 1")
+ foreach(shard_index RANGE ${max_shard_index})
+ set(test_name "test_${shard_index}")
+ add_custom_target(${test_name}
+ COMMAND ${CMAKE_COMMAND}
+ -DGTEST_SHARD_INDEX=${shard_index}
+ -DGTEST_TOTAL_SHARDS=${num_test_targets}
+ -DTEST_LIBAOM=$<TARGET_FILE:test_libaom> -P
+ "${AOM_ROOT}/test/test_runner.cmake"
+ DEPENDS testdata test_libaom)
+ list(APPEND test_targets ${test_name})
+ endforeach()
+ add_custom_target(runtests)
+ add_dependencies(runtests ${test_targets})
+ endif()
+ endif()
+
+ # Collect all variables containing libaom test source files.
+ get_cmake_property(all_cmake_vars VARIABLES)
+ foreach(var ${all_cmake_vars})
+
+ # https://github.com/cheshirekow/cmake_format/issues/34
+# cmake-format: off
+ if (("${var}" MATCHES "_TEST_" AND NOT
+ "${var}" MATCHES
+ "_DATA_\|_CMAKE_\|INTRA_PRED\|_COMPILED\|_HOSTING\|_PERF_\|CODER_")
+ OR (CONFIG_AV1_ENCODER AND ENABLE_ENCODE_PERF_TESTS AND
+ "${var}" MATCHES "_ENCODE_PERF_TEST_")
+ OR (CONFIG_AV1_DECODER AND ENABLE_DECODE_PERF_TESTS AND
+ "${var}" MATCHES "_DECODE_PERF_TEST_")
+ OR (CONFIG_AV1_ENCODER AND "${var}" MATCHES "_TEST_ENCODER_")
+ OR (CONFIG_AV1_DECODER AND "${var}" MATCHES "_TEST_DECODER_"))
+ list(APPEND aom_test_source_vars ${var})
+ endif()
+ # cmake-format: on
+ endforeach()
+
+ # Libaom_test_srcs.txt generation.
+ set(libaom_test_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_test_srcs.txt")
+ file(WRITE "${libaom_test_srcs_txt_file}"
+ "# This file is generated. DO NOT EDIT.\n")
+
+ # Static source file list first.
+ foreach(aom_test_source_var ${aom_test_source_vars})
+ foreach(file ${${aom_test_source_var}})
+ if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}")
+ string(REPLACE "${AOM_ROOT}/" "" file "${file}")
+ file(APPEND "${libaom_test_srcs_txt_file}" "${file}\n")
+ endif()
+ endforeach()
+ endforeach()
+
+ set(AOM_APP_TARGETS ${AOM_APP_TARGETS} PARENT_SCOPE)
+endfunction()
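
The run targets created above lean on googletest's built-in sharding, driven through the GTEST_SHARD_INDEX and GTEST_TOTAL_SHARDS environment variables; test_runner.cmake itself is not part of this hunk. A minimal sketch of what such a runner has to do, assuming a hypothetical script name run_one_shard.cmake and purely illustrative error handling:

    # Invoked as:
    #   cmake -DGTEST_SHARD_INDEX=0 -DGTEST_TOTAL_SHARDS=10 \
    #         -DTEST_LIBAOM=/path/to/test_libaom -P run_one_shard.cmake
    set(ENV{GTEST_SHARD_INDEX} "${GTEST_SHARD_INDEX}")
    set(ENV{GTEST_TOTAL_SHARDS} "${GTEST_TOTAL_SHARDS}")
    execute_process(COMMAND "${TEST_LIBAOM}" RESULT_VARIABLE test_result)
    if(NOT test_result EQUAL 0)
      message(FATAL_ERROR "Shard ${GTEST_SHARD_INDEX} failed: ${test_result}")
    endif()

Each test_<N> custom target maps to one such invocation, so building the runtests target fans the suite out across num_test_targets shards.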
diff --git a/third_party/aom/test/test_data_download_worker.cmake b/third_party/aom/test/test_data_download_worker.cmake
new file mode 100644
index 000000000..dc803497d
--- /dev/null
+++ b/third_party/aom/test/test_data_download_worker.cmake
@@ -0,0 +1,46 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+include("${AOM_ROOT}/test/test_data_util.cmake")
+
+# https://github.com/cheshirekow/cmake_format/issues/34
+# cmake-format: off
+if (NOT AOM_ROOT OR NOT AOM_CONFIG_DIR OR NOT AOM_TEST_FILE
+ OR NOT AOM_TEST_CHECKSUM)
+ message(FATAL_ERROR
+ "AOM_ROOT, AOM_CONFIG_DIR, AOM_TEST_FILE and AOM_TEST_CHECKSUM must be
+ defined.")
+endif ()
+# cmake-format: on
+
+set(AOM_TEST_DATA_URL "http://storage.googleapis.com/aom-test-data")
+
+if(NOT AOM_TEST_DATA_PATH)
+ set(AOM_TEST_DATA_PATH "$ENV{LIBAOM_TEST_DATA_PATH}")
+endif()
+
+if("${AOM_TEST_DATA_PATH}" STREQUAL "")
+ message(WARNING
+ "Writing test data to ${AOM_CONFIG_DIR}, set "
+ "$LIBAOM_TEST_DATA_PATH in your environment to avoid this warning.")
+ set(AOM_TEST_DATA_PATH "${AOM_CONFIG_DIR}")
+endif()
+
+if(NOT EXISTS "${AOM_TEST_DATA_PATH}")
+ file(MAKE_DIRECTORY "${AOM_TEST_DATA_PATH}")
+endif()
+
+expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_PATH}" "filepath")
+expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_URL}" "url")
+
+check_file("${filepath}" "${AOM_TEST_CHECKSUM}" "needs_download")
+if(needs_download)
+ download_test_file("${url}" "${AOM_TEST_CHECKSUM}" "${filepath}")
+endif()
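
The worker above is meant to be run once per file via `cmake -P`, with AOM_ROOT, AOM_CONFIG_DIR, AOM_TEST_FILE and AOM_TEST_CHECKSUM passed as -D defines (see the testdata_<N> custom targets in test.cmake). The same check-then-download flow can be exercised in isolation; a minimal sketch, assuming a hypothetical fetch_one.cmake saved next to test_data_util.cmake and reusing the park_joy_90p_8_420_monochrome.y4m entry from test-data.sha1:

    # fetch_one.cmake (illustrative) -- run with: cmake -P fetch_one.cmake
    include("${CMAKE_CURRENT_LIST_DIR}/test_data_util.cmake")
    set(url_base "http://storage.googleapis.com/aom-test-data")
    set(file "park_joy_90p_8_420_monochrome.y4m")
    set(sum "d3ed7de0aa8c155fe35e0f5f4203240710d31383")
    set(dest "${CMAKE_CURRENT_LIST_DIR}/${file}")
    check_file("${dest}" "${sum}" "needs_download")
    if(needs_download)
      download_test_file("${url_base}/${file}" "${sum}" "${dest}")
    endif()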
diff --git a/third_party/aom/test/test_data_util.cmake b/third_party/aom/test/test_data_util.cmake
new file mode 100644
index 000000000..45c951478
--- /dev/null
+++ b/third_party/aom/test/test_data_util.cmake
@@ -0,0 +1,598 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+
+list(APPEND AOM_TEST_DATA_FILE_NAMES
+ "hantro_collage_w352h288.yuv"
+ "hantro_odd.yuv"
+ "invalid-bug-1814.ivf"
+ "invalid-bug-1814.ivf.res"
+ "invalid-oss-fuzz-10061.ivf"
+ "invalid-oss-fuzz-10061.ivf.res"
+ "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf"
+ "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res"
+ "invalid-oss-fuzz-10227.ivf"
+ "invalid-oss-fuzz-10227.ivf.res"
+ "invalid-oss-fuzz-9463.ivf"
+ "invalid-oss-fuzz-9463.ivf.res"
+ "invalid-oss-fuzz-9482.ivf"
+ "invalid-oss-fuzz-9482.ivf.res"
+ "invalid-oss-fuzz-9720.ivf"
+ "invalid-oss-fuzz-9720.ivf.res"
+ "park_joy_90p_10_420.y4m"
+ "park_joy_90p_10_422.y4m"
+ "park_joy_90p_10_444.y4m"
+ "park_joy_90p_12_420.y4m"
+ "park_joy_90p_12_422.y4m"
+ "park_joy_90p_12_444.y4m"
+ "park_joy_90p_8_420_a10-1.y4m"
+ "park_joy_90p_8_420.y4m"
+ "park_joy_90p_8_420_monochrome.y4m"
+ "park_joy_90p_8_420_vertical_csp.y4m"
+ "park_joy_90p_8_422.y4m"
+ "park_joy_90p_8_444.y4m"
+ "desktop_credits.y4m"
+ "niklas_1280_720_30.y4m"
+ "rush_hour_444.y4m"
+ "screendata.y4m"
+ "niklas_640_480_30.yuv"
+ "vase10x10.yuv")
+
+if(ENABLE_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
+ list(APPEND AOM_TEST_DATA_FILE_NAMES "niklas_1280_720_30.yuv")
+endif()
+
+if(CONFIG_AV1_DECODER)
+ list(APPEND AOM_TEST_DATA_FILE_NAMES
+ "av1-1-b8-00-quantizer-00.ivf"
+ "av1-1-b8-00-quantizer-00.ivf.md5"
+ "av1-1-b8-00-quantizer-01.ivf"
+ "av1-1-b8-00-quantizer-01.ivf.md5"
+ "av1-1-b8-00-quantizer-02.ivf"
+ "av1-1-b8-00-quantizer-02.ivf.md5"
+ "av1-1-b8-00-quantizer-03.ivf"
+ "av1-1-b8-00-quantizer-03.ivf.md5"
+ "av1-1-b8-00-quantizer-04.ivf"
+ "av1-1-b8-00-quantizer-04.ivf.md5"
+ "av1-1-b8-00-quantizer-05.ivf"
+ "av1-1-b8-00-quantizer-05.ivf.md5"
+ "av1-1-b8-00-quantizer-06.ivf"
+ "av1-1-b8-00-quantizer-06.ivf.md5"
+ "av1-1-b8-00-quantizer-07.ivf"
+ "av1-1-b8-00-quantizer-07.ivf.md5"
+ "av1-1-b8-00-quantizer-08.ivf"
+ "av1-1-b8-00-quantizer-08.ivf.md5"
+ "av1-1-b8-00-quantizer-09.ivf"
+ "av1-1-b8-00-quantizer-09.ivf.md5"
+ "av1-1-b8-00-quantizer-10.ivf"
+ "av1-1-b8-00-quantizer-10.ivf.md5"
+ "av1-1-b8-00-quantizer-11.ivf"
+ "av1-1-b8-00-quantizer-11.ivf.md5"
+ "av1-1-b8-00-quantizer-12.ivf"
+ "av1-1-b8-00-quantizer-12.ivf.md5"
+ "av1-1-b8-00-quantizer-13.ivf"
+ "av1-1-b8-00-quantizer-13.ivf.md5"
+ "av1-1-b8-00-quantizer-14.ivf"
+ "av1-1-b8-00-quantizer-14.ivf.md5"
+ "av1-1-b8-00-quantizer-15.ivf"
+ "av1-1-b8-00-quantizer-15.ivf.md5"
+ "av1-1-b8-00-quantizer-16.ivf"
+ "av1-1-b8-00-quantizer-16.ivf.md5"
+ "av1-1-b8-00-quantizer-17.ivf"
+ "av1-1-b8-00-quantizer-17.ivf.md5"
+ "av1-1-b8-00-quantizer-18.ivf"
+ "av1-1-b8-00-quantizer-18.ivf.md5"
+ "av1-1-b8-00-quantizer-19.ivf"
+ "av1-1-b8-00-quantizer-19.ivf.md5"
+ "av1-1-b8-00-quantizer-20.ivf"
+ "av1-1-b8-00-quantizer-20.ivf.md5"
+ "av1-1-b8-00-quantizer-21.ivf"
+ "av1-1-b8-00-quantizer-21.ivf.md5"
+ "av1-1-b8-00-quantizer-22.ivf"
+ "av1-1-b8-00-quantizer-22.ivf.md5"
+ "av1-1-b8-00-quantizer-23.ivf"
+ "av1-1-b8-00-quantizer-23.ivf.md5"
+ "av1-1-b8-00-quantizer-24.ivf"
+ "av1-1-b8-00-quantizer-24.ivf.md5"
+ "av1-1-b8-00-quantizer-25.ivf"
+ "av1-1-b8-00-quantizer-25.ivf.md5"
+ "av1-1-b8-00-quantizer-26.ivf"
+ "av1-1-b8-00-quantizer-26.ivf.md5"
+ "av1-1-b8-00-quantizer-27.ivf"
+ "av1-1-b8-00-quantizer-27.ivf.md5"
+ "av1-1-b8-00-quantizer-28.ivf"
+ "av1-1-b8-00-quantizer-28.ivf.md5"
+ "av1-1-b8-00-quantizer-29.ivf"
+ "av1-1-b8-00-quantizer-29.ivf.md5"
+ "av1-1-b8-00-quantizer-30.ivf"
+ "av1-1-b8-00-quantizer-30.ivf.md5"
+ "av1-1-b8-00-quantizer-31.ivf"
+ "av1-1-b8-00-quantizer-31.ivf.md5"
+ "av1-1-b8-00-quantizer-32.ivf"
+ "av1-1-b8-00-quantizer-32.ivf.md5"
+ "av1-1-b8-00-quantizer-33.ivf"
+ "av1-1-b8-00-quantizer-33.ivf.md5"
+ "av1-1-b8-00-quantizer-34.ivf"
+ "av1-1-b8-00-quantizer-34.ivf.md5"
+ "av1-1-b8-00-quantizer-35.ivf"
+ "av1-1-b8-00-quantizer-35.ivf.md5"
+ "av1-1-b8-00-quantizer-36.ivf"
+ "av1-1-b8-00-quantizer-36.ivf.md5"
+ "av1-1-b8-00-quantizer-37.ivf"
+ "av1-1-b8-00-quantizer-37.ivf.md5"
+ "av1-1-b8-00-quantizer-38.ivf"
+ "av1-1-b8-00-quantizer-38.ivf.md5"
+ "av1-1-b8-00-quantizer-39.ivf"
+ "av1-1-b8-00-quantizer-39.ivf.md5"
+ "av1-1-b8-00-quantizer-40.ivf"
+ "av1-1-b8-00-quantizer-40.ivf.md5"
+ "av1-1-b8-00-quantizer-41.ivf"
+ "av1-1-b8-00-quantizer-41.ivf.md5"
+ "av1-1-b8-00-quantizer-42.ivf"
+ "av1-1-b8-00-quantizer-42.ivf.md5"
+ "av1-1-b8-00-quantizer-43.ivf"
+ "av1-1-b8-00-quantizer-43.ivf.md5"
+ "av1-1-b8-00-quantizer-44.ivf"
+ "av1-1-b8-00-quantizer-44.ivf.md5"
+ "av1-1-b8-00-quantizer-45.ivf"
+ "av1-1-b8-00-quantizer-45.ivf.md5"
+ "av1-1-b8-00-quantizer-46.ivf"
+ "av1-1-b8-00-quantizer-46.ivf.md5"
+ "av1-1-b8-00-quantizer-47.ivf"
+ "av1-1-b8-00-quantizer-47.ivf.md5"
+ "av1-1-b8-00-quantizer-48.ivf"
+ "av1-1-b8-00-quantizer-48.ivf.md5"
+ "av1-1-b8-00-quantizer-49.ivf"
+ "av1-1-b8-00-quantizer-49.ivf.md5"
+ "av1-1-b8-00-quantizer-50.ivf"
+ "av1-1-b8-00-quantizer-50.ivf.md5"
+ "av1-1-b8-00-quantizer-51.ivf"
+ "av1-1-b8-00-quantizer-51.ivf.md5"
+ "av1-1-b8-00-quantizer-52.ivf"
+ "av1-1-b8-00-quantizer-52.ivf.md5"
+ "av1-1-b8-00-quantizer-53.ivf"
+ "av1-1-b8-00-quantizer-53.ivf.md5"
+ "av1-1-b8-00-quantizer-54.ivf"
+ "av1-1-b8-00-quantizer-54.ivf.md5"
+ "av1-1-b8-00-quantizer-55.ivf"
+ "av1-1-b8-00-quantizer-55.ivf.md5"
+ "av1-1-b8-00-quantizer-56.ivf"
+ "av1-1-b8-00-quantizer-56.ivf.md5"
+ "av1-1-b8-00-quantizer-57.ivf"
+ "av1-1-b8-00-quantizer-57.ivf.md5"
+ "av1-1-b8-00-quantizer-58.ivf"
+ "av1-1-b8-00-quantizer-58.ivf.md5"
+ "av1-1-b8-00-quantizer-59.ivf"
+ "av1-1-b8-00-quantizer-59.ivf.md5"
+ "av1-1-b8-00-quantizer-60.ivf"
+ "av1-1-b8-00-quantizer-60.ivf.md5"
+ "av1-1-b8-00-quantizer-61.ivf"
+ "av1-1-b8-00-quantizer-61.ivf.md5"
+ "av1-1-b8-00-quantizer-62.ivf"
+ "av1-1-b8-00-quantizer-62.ivf.md5"
+ "av1-1-b8-00-quantizer-63.ivf"
+ "av1-1-b8-00-quantizer-63.ivf.md5"
+ "av1-1-b10-00-quantizer-00.ivf"
+ "av1-1-b10-00-quantizer-00.ivf.md5"
+ "av1-1-b10-00-quantizer-01.ivf"
+ "av1-1-b10-00-quantizer-01.ivf.md5"
+ "av1-1-b10-00-quantizer-02.ivf"
+ "av1-1-b10-00-quantizer-02.ivf.md5"
+ "av1-1-b10-00-quantizer-03.ivf"
+ "av1-1-b10-00-quantizer-03.ivf.md5"
+ "av1-1-b10-00-quantizer-04.ivf"
+ "av1-1-b10-00-quantizer-04.ivf.md5"
+ "av1-1-b10-00-quantizer-05.ivf"
+ "av1-1-b10-00-quantizer-05.ivf.md5"
+ "av1-1-b10-00-quantizer-06.ivf"
+ "av1-1-b10-00-quantizer-06.ivf.md5"
+ "av1-1-b10-00-quantizer-07.ivf"
+ "av1-1-b10-00-quantizer-07.ivf.md5"
+ "av1-1-b10-00-quantizer-08.ivf"
+ "av1-1-b10-00-quantizer-08.ivf.md5"
+ "av1-1-b10-00-quantizer-09.ivf"
+ "av1-1-b10-00-quantizer-09.ivf.md5"
+ "av1-1-b10-00-quantizer-10.ivf"
+ "av1-1-b10-00-quantizer-10.ivf.md5"
+ "av1-1-b10-00-quantizer-11.ivf"
+ "av1-1-b10-00-quantizer-11.ivf.md5"
+ "av1-1-b10-00-quantizer-12.ivf"
+ "av1-1-b10-00-quantizer-12.ivf.md5"
+ "av1-1-b10-00-quantizer-13.ivf"
+ "av1-1-b10-00-quantizer-13.ivf.md5"
+ "av1-1-b10-00-quantizer-14.ivf"
+ "av1-1-b10-00-quantizer-14.ivf.md5"
+ "av1-1-b10-00-quantizer-15.ivf"
+ "av1-1-b10-00-quantizer-15.ivf.md5"
+ "av1-1-b10-00-quantizer-16.ivf"
+ "av1-1-b10-00-quantizer-16.ivf.md5"
+ "av1-1-b10-00-quantizer-17.ivf"
+ "av1-1-b10-00-quantizer-17.ivf.md5"
+ "av1-1-b10-00-quantizer-18.ivf"
+ "av1-1-b10-00-quantizer-18.ivf.md5"
+ "av1-1-b10-00-quantizer-19.ivf"
+ "av1-1-b10-00-quantizer-19.ivf.md5"
+ "av1-1-b10-00-quantizer-20.ivf"
+ "av1-1-b10-00-quantizer-20.ivf.md5"
+ "av1-1-b10-00-quantizer-21.ivf"
+ "av1-1-b10-00-quantizer-21.ivf.md5"
+ "av1-1-b10-00-quantizer-22.ivf"
+ "av1-1-b10-00-quantizer-22.ivf.md5"
+ "av1-1-b10-00-quantizer-23.ivf"
+ "av1-1-b10-00-quantizer-23.ivf.md5"
+ "av1-1-b10-00-quantizer-24.ivf"
+ "av1-1-b10-00-quantizer-24.ivf.md5"
+ "av1-1-b10-00-quantizer-25.ivf"
+ "av1-1-b10-00-quantizer-25.ivf.md5"
+ "av1-1-b10-00-quantizer-26.ivf"
+ "av1-1-b10-00-quantizer-26.ivf.md5"
+ "av1-1-b10-00-quantizer-27.ivf"
+ "av1-1-b10-00-quantizer-27.ivf.md5"
+ "av1-1-b10-00-quantizer-28.ivf"
+ "av1-1-b10-00-quantizer-28.ivf.md5"
+ "av1-1-b10-00-quantizer-29.ivf"
+ "av1-1-b10-00-quantizer-29.ivf.md5"
+ "av1-1-b10-00-quantizer-30.ivf"
+ "av1-1-b10-00-quantizer-30.ivf.md5"
+ "av1-1-b10-00-quantizer-31.ivf"
+ "av1-1-b10-00-quantizer-31.ivf.md5"
+ "av1-1-b10-00-quantizer-32.ivf"
+ "av1-1-b10-00-quantizer-32.ivf.md5"
+ "av1-1-b10-00-quantizer-33.ivf"
+ "av1-1-b10-00-quantizer-33.ivf.md5"
+ "av1-1-b10-00-quantizer-34.ivf"
+ "av1-1-b10-00-quantizer-34.ivf.md5"
+ "av1-1-b10-00-quantizer-35.ivf"
+ "av1-1-b10-00-quantizer-35.ivf.md5"
+ "av1-1-b10-00-quantizer-36.ivf"
+ "av1-1-b10-00-quantizer-36.ivf.md5"
+ "av1-1-b10-00-quantizer-37.ivf"
+ "av1-1-b10-00-quantizer-37.ivf.md5"
+ "av1-1-b10-00-quantizer-38.ivf"
+ "av1-1-b10-00-quantizer-38.ivf.md5"
+ "av1-1-b10-00-quantizer-39.ivf"
+ "av1-1-b10-00-quantizer-39.ivf.md5"
+ "av1-1-b10-00-quantizer-40.ivf"
+ "av1-1-b10-00-quantizer-40.ivf.md5"
+ "av1-1-b10-00-quantizer-41.ivf"
+ "av1-1-b10-00-quantizer-41.ivf.md5"
+ "av1-1-b10-00-quantizer-42.ivf"
+ "av1-1-b10-00-quantizer-42.ivf.md5"
+ "av1-1-b10-00-quantizer-43.ivf"
+ "av1-1-b10-00-quantizer-43.ivf.md5"
+ "av1-1-b10-00-quantizer-44.ivf"
+ "av1-1-b10-00-quantizer-44.ivf.md5"
+ "av1-1-b10-00-quantizer-45.ivf"
+ "av1-1-b10-00-quantizer-45.ivf.md5"
+ "av1-1-b10-00-quantizer-46.ivf"
+ "av1-1-b10-00-quantizer-46.ivf.md5"
+ "av1-1-b10-00-quantizer-47.ivf"
+ "av1-1-b10-00-quantizer-47.ivf.md5"
+ "av1-1-b10-00-quantizer-48.ivf"
+ "av1-1-b10-00-quantizer-48.ivf.md5"
+ "av1-1-b10-00-quantizer-49.ivf"
+ "av1-1-b10-00-quantizer-49.ivf.md5"
+ "av1-1-b10-00-quantizer-50.ivf"
+ "av1-1-b10-00-quantizer-50.ivf.md5"
+ "av1-1-b10-00-quantizer-51.ivf"
+ "av1-1-b10-00-quantizer-51.ivf.md5"
+ "av1-1-b10-00-quantizer-52.ivf"
+ "av1-1-b10-00-quantizer-52.ivf.md5"
+ "av1-1-b10-00-quantizer-53.ivf"
+ "av1-1-b10-00-quantizer-53.ivf.md5"
+ "av1-1-b10-00-quantizer-54.ivf"
+ "av1-1-b10-00-quantizer-54.ivf.md5"
+ "av1-1-b10-00-quantizer-55.ivf"
+ "av1-1-b10-00-quantizer-55.ivf.md5"
+ "av1-1-b10-00-quantizer-56.ivf"
+ "av1-1-b10-00-quantizer-56.ivf.md5"
+ "av1-1-b10-00-quantizer-57.ivf"
+ "av1-1-b10-00-quantizer-57.ivf.md5"
+ "av1-1-b10-00-quantizer-58.ivf"
+ "av1-1-b10-00-quantizer-58.ivf.md5"
+ "av1-1-b10-00-quantizer-59.ivf"
+ "av1-1-b10-00-quantizer-59.ivf.md5"
+ "av1-1-b10-00-quantizer-60.ivf"
+ "av1-1-b10-00-quantizer-60.ivf.md5"
+ "av1-1-b10-00-quantizer-61.ivf"
+ "av1-1-b10-00-quantizer-61.ivf.md5"
+ "av1-1-b10-00-quantizer-62.ivf"
+ "av1-1-b10-00-quantizer-62.ivf.md5"
+ "av1-1-b10-00-quantizer-63.ivf"
+ "av1-1-b10-00-quantizer-63.ivf.md5"
+ "av1-1-b8-01-size-16x16.ivf"
+ "av1-1-b8-01-size-16x16.ivf.md5"
+ "av1-1-b8-01-size-16x18.ivf"
+ "av1-1-b8-01-size-16x18.ivf.md5"
+ "av1-1-b8-01-size-16x32.ivf"
+ "av1-1-b8-01-size-16x32.ivf.md5"
+ "av1-1-b8-01-size-16x34.ivf"
+ "av1-1-b8-01-size-16x34.ivf.md5"
+ "av1-1-b8-01-size-16x64.ivf"
+ "av1-1-b8-01-size-16x64.ivf.md5"
+ "av1-1-b8-01-size-16x66.ivf"
+ "av1-1-b8-01-size-16x66.ivf.md5"
+ "av1-1-b8-01-size-18x16.ivf"
+ "av1-1-b8-01-size-18x16.ivf.md5"
+ "av1-1-b8-01-size-18x18.ivf"
+ "av1-1-b8-01-size-18x18.ivf.md5"
+ "av1-1-b8-01-size-18x32.ivf"
+ "av1-1-b8-01-size-18x32.ivf.md5"
+ "av1-1-b8-01-size-18x34.ivf"
+ "av1-1-b8-01-size-18x34.ivf.md5"
+ "av1-1-b8-01-size-18x64.ivf"
+ "av1-1-b8-01-size-18x64.ivf.md5"
+ "av1-1-b8-01-size-18x66.ivf"
+ "av1-1-b8-01-size-18x66.ivf.md5"
+ "av1-1-b8-01-size-196x196.ivf"
+ "av1-1-b8-01-size-196x196.ivf.md5"
+ "av1-1-b8-01-size-196x198.ivf"
+ "av1-1-b8-01-size-196x198.ivf.md5"
+ "av1-1-b8-01-size-196x200.ivf"
+ "av1-1-b8-01-size-196x200.ivf.md5"
+ "av1-1-b8-01-size-196x202.ivf"
+ "av1-1-b8-01-size-196x202.ivf.md5"
+ "av1-1-b8-01-size-196x208.ivf"
+ "av1-1-b8-01-size-196x208.ivf.md5"
+ "av1-1-b8-01-size-196x210.ivf"
+ "av1-1-b8-01-size-196x210.ivf.md5"
+ "av1-1-b8-01-size-196x224.ivf"
+ "av1-1-b8-01-size-196x224.ivf.md5"
+ "av1-1-b8-01-size-196x226.ivf"
+ "av1-1-b8-01-size-196x226.ivf.md5"
+ "av1-1-b8-01-size-198x196.ivf"
+ "av1-1-b8-01-size-198x196.ivf.md5"
+ "av1-1-b8-01-size-198x198.ivf"
+ "av1-1-b8-01-size-198x198.ivf.md5"
+ "av1-1-b8-01-size-198x200.ivf"
+ "av1-1-b8-01-size-198x200.ivf.md5"
+ "av1-1-b8-01-size-198x202.ivf"
+ "av1-1-b8-01-size-198x202.ivf.md5"
+ "av1-1-b8-01-size-198x208.ivf"
+ "av1-1-b8-01-size-198x208.ivf.md5"
+ "av1-1-b8-01-size-198x210.ivf"
+ "av1-1-b8-01-size-198x210.ivf.md5"
+ "av1-1-b8-01-size-198x224.ivf"
+ "av1-1-b8-01-size-198x224.ivf.md5"
+ "av1-1-b8-01-size-198x226.ivf"
+ "av1-1-b8-01-size-198x226.ivf.md5"
+ "av1-1-b8-01-size-200x196.ivf"
+ "av1-1-b8-01-size-200x196.ivf.md5"
+ "av1-1-b8-01-size-200x198.ivf"
+ "av1-1-b8-01-size-200x198.ivf.md5"
+ "av1-1-b8-01-size-200x200.ivf"
+ "av1-1-b8-01-size-200x200.ivf.md5"
+ "av1-1-b8-01-size-200x202.ivf"
+ "av1-1-b8-01-size-200x202.ivf.md5"
+ "av1-1-b8-01-size-200x208.ivf"
+ "av1-1-b8-01-size-200x208.ivf.md5"
+ "av1-1-b8-01-size-200x210.ivf"
+ "av1-1-b8-01-size-200x210.ivf.md5"
+ "av1-1-b8-01-size-200x224.ivf"
+ "av1-1-b8-01-size-200x224.ivf.md5"
+ "av1-1-b8-01-size-200x226.ivf"
+ "av1-1-b8-01-size-200x226.ivf.md5"
+ "av1-1-b8-01-size-202x196.ivf"
+ "av1-1-b8-01-size-202x196.ivf.md5"
+ "av1-1-b8-01-size-202x198.ivf"
+ "av1-1-b8-01-size-202x198.ivf.md5"
+ "av1-1-b8-01-size-202x200.ivf"
+ "av1-1-b8-01-size-202x200.ivf.md5"
+ "av1-1-b8-01-size-202x202.ivf"
+ "av1-1-b8-01-size-202x202.ivf.md5"
+ "av1-1-b8-01-size-202x208.ivf"
+ "av1-1-b8-01-size-202x208.ivf.md5"
+ "av1-1-b8-01-size-202x210.ivf"
+ "av1-1-b8-01-size-202x210.ivf.md5"
+ "av1-1-b8-01-size-202x224.ivf"
+ "av1-1-b8-01-size-202x224.ivf.md5"
+ "av1-1-b8-01-size-202x226.ivf"
+ "av1-1-b8-01-size-202x226.ivf.md5"
+ "av1-1-b8-01-size-208x196.ivf"
+ "av1-1-b8-01-size-208x196.ivf.md5"
+ "av1-1-b8-01-size-208x198.ivf"
+ "av1-1-b8-01-size-208x198.ivf.md5"
+ "av1-1-b8-01-size-208x200.ivf"
+ "av1-1-b8-01-size-208x200.ivf.md5"
+ "av1-1-b8-01-size-208x202.ivf"
+ "av1-1-b8-01-size-208x202.ivf.md5"
+ "av1-1-b8-01-size-208x208.ivf"
+ "av1-1-b8-01-size-208x208.ivf.md5"
+ "av1-1-b8-01-size-208x210.ivf"
+ "av1-1-b8-01-size-208x210.ivf.md5"
+ "av1-1-b8-01-size-208x224.ivf"
+ "av1-1-b8-01-size-208x224.ivf.md5"
+ "av1-1-b8-01-size-208x226.ivf"
+ "av1-1-b8-01-size-208x226.ivf.md5"
+ "av1-1-b8-01-size-210x196.ivf"
+ "av1-1-b8-01-size-210x196.ivf.md5"
+ "av1-1-b8-01-size-210x198.ivf"
+ "av1-1-b8-01-size-210x198.ivf.md5"
+ "av1-1-b8-01-size-210x200.ivf"
+ "av1-1-b8-01-size-210x200.ivf.md5"
+ "av1-1-b8-01-size-210x202.ivf"
+ "av1-1-b8-01-size-210x202.ivf.md5"
+ "av1-1-b8-01-size-210x208.ivf"
+ "av1-1-b8-01-size-210x208.ivf.md5"
+ "av1-1-b8-01-size-210x210.ivf"
+ "av1-1-b8-01-size-210x210.ivf.md5"
+ "av1-1-b8-01-size-210x224.ivf"
+ "av1-1-b8-01-size-210x224.ivf.md5"
+ "av1-1-b8-01-size-210x226.ivf"
+ "av1-1-b8-01-size-210x226.ivf.md5"
+ "av1-1-b8-01-size-224x196.ivf"
+ "av1-1-b8-01-size-224x196.ivf.md5"
+ "av1-1-b8-01-size-224x198.ivf"
+ "av1-1-b8-01-size-224x198.ivf.md5"
+ "av1-1-b8-01-size-224x200.ivf"
+ "av1-1-b8-01-size-224x200.ivf.md5"
+ "av1-1-b8-01-size-224x202.ivf"
+ "av1-1-b8-01-size-224x202.ivf.md5"
+ "av1-1-b8-01-size-224x208.ivf"
+ "av1-1-b8-01-size-224x208.ivf.md5"
+ "av1-1-b8-01-size-224x210.ivf"
+ "av1-1-b8-01-size-224x210.ivf.md5"
+ "av1-1-b8-01-size-224x224.ivf"
+ "av1-1-b8-01-size-224x224.ivf.md5"
+ "av1-1-b8-01-size-224x226.ivf"
+ "av1-1-b8-01-size-224x226.ivf.md5"
+ "av1-1-b8-01-size-226x196.ivf"
+ "av1-1-b8-01-size-226x196.ivf.md5"
+ "av1-1-b8-01-size-226x198.ivf"
+ "av1-1-b8-01-size-226x198.ivf.md5"
+ "av1-1-b8-01-size-226x200.ivf"
+ "av1-1-b8-01-size-226x200.ivf.md5"
+ "av1-1-b8-01-size-226x202.ivf"
+ "av1-1-b8-01-size-226x202.ivf.md5"
+ "av1-1-b8-01-size-226x208.ivf"
+ "av1-1-b8-01-size-226x208.ivf.md5"
+ "av1-1-b8-01-size-226x210.ivf"
+ "av1-1-b8-01-size-226x210.ivf.md5"
+ "av1-1-b8-01-size-226x224.ivf"
+ "av1-1-b8-01-size-226x224.ivf.md5"
+ "av1-1-b8-01-size-226x226.ivf"
+ "av1-1-b8-01-size-226x226.ivf.md5"
+ "av1-1-b8-01-size-32x16.ivf"
+ "av1-1-b8-01-size-32x16.ivf.md5"
+ "av1-1-b8-01-size-32x18.ivf"
+ "av1-1-b8-01-size-32x18.ivf.md5"
+ "av1-1-b8-01-size-32x32.ivf"
+ "av1-1-b8-01-size-32x32.ivf.md5"
+ "av1-1-b8-01-size-32x34.ivf"
+ "av1-1-b8-01-size-32x34.ivf.md5"
+ "av1-1-b8-01-size-32x64.ivf"
+ "av1-1-b8-01-size-32x64.ivf.md5"
+ "av1-1-b8-01-size-32x66.ivf"
+ "av1-1-b8-01-size-32x66.ivf.md5"
+ "av1-1-b8-01-size-34x16.ivf"
+ "av1-1-b8-01-size-34x16.ivf.md5"
+ "av1-1-b8-01-size-34x18.ivf"
+ "av1-1-b8-01-size-34x18.ivf.md5"
+ "av1-1-b8-01-size-34x32.ivf"
+ "av1-1-b8-01-size-34x32.ivf.md5"
+ "av1-1-b8-01-size-34x34.ivf"
+ "av1-1-b8-01-size-34x34.ivf.md5"
+ "av1-1-b8-01-size-34x64.ivf"
+ "av1-1-b8-01-size-34x64.ivf.md5"
+ "av1-1-b8-01-size-34x66.ivf"
+ "av1-1-b8-01-size-34x66.ivf.md5"
+ "av1-1-b8-01-size-64x16.ivf"
+ "av1-1-b8-01-size-64x16.ivf.md5"
+ "av1-1-b8-01-size-64x18.ivf"
+ "av1-1-b8-01-size-64x18.ivf.md5"
+ "av1-1-b8-01-size-64x32.ivf"
+ "av1-1-b8-01-size-64x32.ivf.md5"
+ "av1-1-b8-01-size-64x34.ivf"
+ "av1-1-b8-01-size-64x34.ivf.md5"
+ "av1-1-b8-01-size-64x64.ivf"
+ "av1-1-b8-01-size-64x64.ivf.md5"
+ "av1-1-b8-01-size-64x66.ivf"
+ "av1-1-b8-01-size-64x66.ivf.md5"
+ "av1-1-b8-01-size-66x16.ivf"
+ "av1-1-b8-01-size-66x16.ivf.md5"
+ "av1-1-b8-01-size-66x18.ivf"
+ "av1-1-b8-01-size-66x18.ivf.md5"
+ "av1-1-b8-01-size-66x32.ivf"
+ "av1-1-b8-01-size-66x32.ivf.md5"
+ "av1-1-b8-01-size-66x34.ivf"
+ "av1-1-b8-01-size-66x34.ivf.md5"
+ "av1-1-b8-01-size-66x64.ivf"
+ "av1-1-b8-01-size-66x64.ivf.md5"
+ "av1-1-b8-01-size-66x66.ivf"
+ "av1-1-b8-01-size-66x66.ivf.md5"
+ "av1-1-b8-02-allintra.ivf"
+ "av1-1-b8-02-allintra.ivf.md5"
+ "av1-1-b8-03-sizeup.mkv"
+ "av1-1-b8-03-sizeup.mkv.md5"
+ "av1-1-b8-03-sizedown.mkv"
+ "av1-1-b8-03-sizedown.mkv.md5")
+endif()
+
+if(ENABLE_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER)
+ list(APPEND AOM_TEST_DATA_FILE_NAMES "desktop_640_360_30.yuv"
+ "kirland_640_480_30.yuv" "macmarcomoving_640_480_30.yuv"
+ "macmarcostationary_640_480_30.yuv" "niklas_1280_720_30.yuv"
+ "tacomanarrows_640_480_30.yuv"
+ "tacomasmallcameramovement_640_480_30.yuv"
+ "thaloundeskmtg_640_480_30.yuv")
+endif()
+
+# Parses test/test-data.sha1 and writes captured file names and checksums to
+# $out_files and $out_checksums as lists.
+function(make_test_data_lists test_data_file out_files out_checksums)
+ if(NOT test_data_file OR NOT EXISTS "${test_data_file}")
+ message(FATAL_ERROR "Test info file missing or empty (${test_data_file})")
+ endif()
+
+ # Read $test_data_file into $files_and_checksums. $files_and_checksums becomes
+ # a list with an entry for each line from $test_data_file.
+ file(STRINGS "${test_data_file}" files_and_checksums)
+
+ # Iterate over the list of lines and split it into $checksums and $filenames.
+ foreach(line ${files_and_checksums})
+ string(FIND "${line}" " *" delim_pos)
+
+ math(EXPR filename_pos "${delim_pos} + 2")
+ string(SUBSTRING "${line}" 0 ${delim_pos} checksum)
+ string(SUBSTRING "${line}" ${filename_pos} -1 filename)
+
+ list(FIND AOM_TEST_DATA_FILE_NAMES ${filename} list_index)
+ if(NOT ${list_index} EQUAL -1)
+
+ # Include the name and checksum in output only when the file is needed.
+ set(checksums ${checksums} ${checksum})
+ set(filenames ${filenames} ${filename})
+ endif()
+ endforeach()
+
+ list(LENGTH filenames num_files)
+ list(LENGTH checksums num_checksums)
+ if(NOT checksums OR NOT filenames OR NOT num_files EQUAL num_checksums)
+ message(FATAL_ERROR "Parsing of ${test_data_file} failed.")
+ endif()
+
+ set(${out_checksums} ${checksums} PARENT_SCOPE)
+ set(${out_files} ${filenames} PARENT_SCOPE)
+endfunction()
+
+# Appends each file name in $test_files to $test_dir and adds the result path to
+# $out_path_list.
+function(expand_test_file_paths test_files test_dir out_path_list)
+ foreach(filename ${${test_files}})
+ set(path_list ${path_list} "${test_dir}/${filename}")
+ endforeach()
+ set(${out_path_list} ${path_list} PARENT_SCOPE)
+endfunction()
+
+function(check_file local_path expected_checksum out_needs_update)
+ if(EXISTS "${local_path}")
+ file(SHA1 "${local_path}" file_checksum)
+ else()
+ set(${out_needs_update} 1 PARENT_SCOPE)
+ return()
+ endif()
+
+ if("${file_checksum}" STREQUAL "${expected_checksum}")
+ unset(${out_needs_update} PARENT_SCOPE)
+ else()
+ set(${out_needs_update} 1 PARENT_SCOPE)
+ return()
+ endif()
+ message("${local_path} up to date.")
+endfunction()
+
+# Downloads data from $file_url, confirms that $file_checksum matches, and
+# writes it to $local_path.
+function(download_test_file file_url file_checksum local_path)
+ message("Downloading ${file_url} ...")
+ file(DOWNLOAD "${file_url}" "${local_path}" SHOW_PROGRESS
+ EXPECTED_HASH SHA1=${file_checksum})
+ message("Download of ${file_url} complete.")
+endfunction()
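
make_test_data_lists above splits each `<sha1> *<filename>` line of test-data.sha1 at the " *" delimiter and keeps only entries listed in AOM_TEST_DATA_FILE_NAMES. The string arithmetic is easy to sanity-check on a single line; a small standalone sketch using one entry from the checksum list earlier in this diff:

    set(line "d3ed7de0aa8c155fe35e0f5f4203240710d31383 *park_joy_90p_8_420_monochrome.y4m")
    string(FIND "${line}" " *" delim_pos)       # 40: a SHA-1 hex digest is 40 chars
    math(EXPR filename_pos "${delim_pos} + 2")  # skip the " *" delimiter
    string(SUBSTRING "${line}" 0 ${delim_pos} checksum)
    string(SUBSTRING "${line}" ${filename_pos} -1 filename)
    message("checksum=${checksum} filename=${filename}")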
diff --git a/third_party/aom/test/test_intra_pred_speed.cc b/third_party/aom/test/test_intra_pred_speed.cc
new file mode 100644
index 000000000..b72ac1167
--- /dev/null
+++ b/third_party/aom/test/test_intra_pred_speed.cc
@@ -0,0 +1,1464 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Test and time AOM intra-predictor functions
+
+#include <stdio.h>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/md5_helper.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/common_data.h"
+
+// -----------------------------------------------------------------------------
+
+namespace {
+
+// Note:
+// APPLY_UNIT_TESTS
+// 1: Do unit tests
+// 0: Generate MD5 array as required
+#define APPLY_UNIT_TESTS 1
+
+typedef void (*AvxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left);
+
+const int kBPS = 64;
+const int kTotalPixels = kBPS * kBPS;
+// 4 DC variants, V, H, PAETH, SMOOTH, SMOOTH_V, SMOOTH_H
+const int kNumAv1IntraFuncs = 10;
+
+#if APPLY_UNIT_TESTS
+const char *kAv1IntraPredNames[kNumAv1IntraFuncs] = {
+ "DC_PRED", "DC_LEFT_PRED", "DC_TOP_PRED", "DC_128_PRED", "V_PRED",
+ "H_PRED", "PAETH_PRED", "SMOOTH_PRED", "SMOOTH_V_PRED", "SMOOTH_H_PRED",
+};
+#endif // APPLY_UNIT_TESTS
+
+template <typename Pixel>
+struct IntraPredTestMem {
+ void Init(int block_width, int block_height, int bd) {
+ ASSERT_LE(block_width, kBPS);
+ ASSERT_LE(block_height, kBPS);
+    // Note: for blocks with width <= 32 and height <= 32, we generate 32x32
+    // random pixels as before to avoid having to recalculate all the hashes.
+ const int block_size_upto_32 = (block_width <= 32) && (block_height <= 32);
+ stride = block_size_upto_32 ? 32 : kBPS;
+ num_pixels = stride * stride;
+ libaom_test::ACMRandom rnd(libaom_test::ACMRandom::DeterministicSeed());
+ above = above_mem + 16;
+ const int mask = (1 << bd) - 1;
+ for (int i = 0; i < num_pixels; ++i) ref_src[i] = rnd.Rand16() & mask;
+ for (int i = 0; i < stride; ++i) left[i] = rnd.Rand16() & mask;
+ for (int i = -1; i < stride; ++i) above[i] = rnd.Rand16() & mask;
+
+ for (int i = stride; i < 2 * stride; ++i) {
+ left[i] = rnd.Rand16() & mask;
+ above[i] = rnd.Rand16() & mask;
+ }
+ }
+
+ DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
+ DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
+ DECLARE_ALIGNED(16, Pixel, left[2 * kBPS]);
+ Pixel *above;
+ int stride;
+ int num_pixels;
+
+ private:
+ DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
+};
+
+// -----------------------------------------------------------------------------
+// Low Bitdepth
+
+typedef IntraPredTestMem<uint8_t> Av1IntraPredTestMem;
+
+static const char *const kTxSizeStrings[TX_SIZES_ALL] = {
+ "4X4", "8X8", "16X16", "32X32", "64X64", "4X8", "8X4",
+ "8X16", "16X8", "16X32", "32X16", "32X64", "64X32", "4X16",
+ "16X4", "8X32", "32X8", "16X64", "64X16",
+};
+
+void CheckMd5Signature(TX_SIZE tx_size, bool is_hbd,
+ const char *const signatures[], const void *data,
+ size_t data_size, int elapsed_time, int idx) {
+ const std::string hbd_str = is_hbd ? "Hbd " : "";
+ const std::string name_str = hbd_str + "Intra" + kTxSizeStrings[tx_size];
+ libaom_test::MD5 md5;
+ md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
+#if APPLY_UNIT_TESTS
+ printf("Mode %s[%13s]: %5d ms MD5: %s\n", name_str.c_str(),
+ kAv1IntraPredNames[idx], elapsed_time, md5.Get());
+ EXPECT_STREQ(signatures[idx], md5.Get());
+#else
+ (void)signatures;
+ (void)elapsed_time;
+ (void)idx;
+ printf("\"%s\",\n", md5.Get());
+#endif
+}
+
+void TestIntraPred(TX_SIZE tx_size, AvxPredFunc const *pred_funcs,
+ const char *const signatures[]) {
+ const int block_width = tx_size_wide[tx_size];
+ const int block_height = tx_size_high[tx_size];
+ const int num_pixels_per_test =
+ block_width * block_height * kNumAv1IntraFuncs;
+ const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+ Av1IntraPredTestMem intra_pred_test_mem;
+ intra_pred_test_mem.Init(block_width, block_height, 8);
+
+ for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
+ if (pred_funcs[k] == NULL) continue;
+ memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+ sizeof(intra_pred_test_mem.src));
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+ pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
+ intra_pred_test_mem.above, intra_pred_test_mem.left);
+ }
+ libaom_test::ClearSystemState();
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+ CheckMd5Signature(
+ tx_size, false, signatures, intra_pred_test_mem.src,
+ intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
+ elapsed_time, k);
+ }
+}
+
+static const char *const kSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+ {
+ // 4X4
+ "e7ed7353c3383fff942e500e9bfe82fe",
+ "2a4a26fcc6ce005eadc08354d196c8a9",
+ "269d92eff86f315d9c38fe7640d85b15",
+ "ae2960eea9f71ee3dabe08b282ec1773",
+ "6c1abcc44e90148998b51acd11144e9c",
+ "f7bb3186e1ef8a2b326037ff898cad8e",
+ "59fc0e923a08cfac0a493fb38988e2bb",
+ "9ff8bb37d9c830e6ab8ecb0c435d3c91",
+ "de6937fca02354f2874dbc5dbec5d5b3",
+ "723cf948137f7d8c7860d814e55ae67d",
+ },
+ {
+ // 8X8
+ "d8bbae5d6547cfc17e4f5f44c8730e88",
+ "373bab6d931868d41a601d9d88ce9ac3",
+ "6fdd5ff4ff79656c14747598ca9e3706",
+ "d9661c2811d6a73674f40ffb2b841847",
+ "7c722d10b19ccff0b8c171868e747385",
+ "f81dd986eb2b50f750d3a7da716b7e27",
+ "064404361748dd111a890a1470d7f0ea",
+ "dc29b7e1f78cc8e7525d5ea4c0ab9b78",
+ "97111eb1bc26bade6272015df829f1ae",
+ "d19a8a73cc46b807f2c5e817576cc1e1",
+ },
+ {
+ // 16X16
+ "50971c07ce26977d30298538fffec619",
+ "527a6b9e0dc5b21b98cf276305432bef",
+ "7eff2868f80ebc2c43a4f367281d80f7",
+ "67cd60512b54964ef6aff1bd4816d922",
+ "48371c87dc95c08a33b2048f89cf6468",
+ "b0acf2872ee411d7530af6d2625a7084",
+ "93d6b5352b571805ab16a55e1bbed86a",
+ "03764e4c0aebbc180e4e2c68fb06df2b",
+ "bb6c74c9076c9f266ab11fb57060d8e6",
+ "0c5162bc28489756ddb847b5678e6f07",
+ },
+ {
+ // 32X32
+ "a0a618c900e65ae521ccc8af789729f2",
+ "985aaa7c72b4a6c2fb431d32100cf13a",
+ "10662d09febc3ca13ee4e700120daeb5",
+ "b3b01379ba08916ef6b1b35f7d9ad51c",
+ "9f4261755795af97e34679c333ec7004",
+ "bc2c9da91ad97ef0d1610fb0a9041657",
+ "ef1653982b69e1f64bee3759f3e1ec45",
+ "1a51a675deba2c83282142eb48d3dc3d",
+ "866c224746dc260cda861a7b1b383fb3",
+ "cea23799fc3526e1b6a6ff02b42b82af",
+ },
+ {
+ // 64X64
+ "6e1094fa7b50bc813aa2ba29f5df8755",
+ "afe020786b83b793c2bbd9468097ff6e",
+ "be91585259bc37bf4dc1651936e90b3e",
+ "a1650dbcd56e10288c3e269eca37967d",
+ "9e5c34f3797e0cdd3cd9d4c05b0d8950",
+ "bc87be7ac899cc6a28f399d7516c49fe",
+ "9811fd0d2dd515f06122f5d1bd18b784",
+ "3c140e466f2c2c0d9cb7d2157ab8dc27",
+ "9543de76c925a8f6adc884cc7f98dc91",
+ "df1df0376cc944afe7e74e94f53e575a",
+ },
+ {
+ // 4X8
+ "d9fbebdc85f71ab1e18461b2db4a2adc",
+ "5ccb2a68284bc9714d94b8a06ccadbb2",
+ "735d059abc2744f3ff3f9590f7191b37",
+ "d9fbebdc85f71ab1e18461b2db4a2adc",
+ "6819497c44cd0ace120add83672996ee",
+ "7e3244f5a2d3edf81c7e962a842b97f9",
+ "809350f164cd4d1650850bb0f59c3260",
+ "1b60a394331eeab6927a6f8aaff57040",
+ "5307de1bd7329ba6b281d2c1b0b457f9",
+ "24c58a8138339846d95568efb91751db",
+ },
+ {
+ // 8X4
+ "23f9fc11344426c9bee2e06d57dfd628",
+ "2d71a26d1bae1fb34734de7b42fc5eb7",
+ "5af9c1b2fd9d5721fad67b67b3f7c816",
+ "00d71b17be662753813d515f197d145e",
+ "bef10ec984427e28f4390f43809d10af",
+ "77773cdfb7ed6bc882ab202a64b0a470",
+ "2cc48bd66d6b0121b5221d52ccd732af",
+ "b302155e1c9eeeafe2ba2bf68e807a46",
+ "561bc8d0e76d5041ebd5168fc6a115e1",
+ "81d0113fb1d0a9a24ffd6f1987b77948",
+ },
+ {
+ // 8X16
+ "c849de88b24f773dfcdd1d48d1209796",
+ "6cb807c1897b94866a0f3d3c56ed8695",
+ "d56db05a8ac7981762f5b877f486c4ef",
+ "b4bc01eb6e59a40922ad17715cafb04b",
+ "09d178439534f4062ae687c351f66d64",
+ "644501399cf73080ac606e5cef7ca09b",
+ "278076495180e17c065a95ab7278539a",
+ "9dd7f324816f242be408ffeb0c673732",
+ "f520c4a20acfa0bea1d253c6f0f040fd",
+ "85f38df809df2c2d7c8b4a157a65cd44",
+ },
+ {
+ // 16X8
+ "b4cbdbdf10ce13300b4063a3daf99e04",
+ "3731e1e6202064a9d0604d7c293ecee4",
+ "6c856188c4256a06452f0d5d70cac436",
+ "1f2192b4c8c497589484ea7bf9c944e8",
+ "84011bd4b7f565119d06787840e333a0",
+ "0e48949f7a6aa36f0d76b5d01f91124a",
+ "60eff8064634b6c73b10681356baeee9",
+ "1559aeb081a9c0c71111d6093c2ff9fd",
+ "c15479b739713773e5cabb748451987b",
+ "72e33ec12c9b67aea26d8d005fb82de2",
+ },
+ {
+ // 16X32
+ "abe5233d189cdbf79424721571bbaa7b",
+ "282759f81e3cfb2e2d396fe406b72a8b",
+ "e2224926c264f6f174cbc3167a233168",
+ "6814e85c2b33f8c9415d62e80394b47b",
+ "99cbbb60459c08a3061d72c4e4f6276a",
+ "1d1567d40b8e816f8c1f71e576fe0f87",
+ "36fdd371b624a075814d497c4832ec85",
+ "8ab8da61b727442b6ff692b40d0df018",
+ "e35a10ad7fdf2327e821504a90f6a6eb",
+ "1f7211e727dc1de7d6a55d082fbdd821",
+ },
+ {
+ // 32X16
+ "d1aeb8d5fdcfd3307922af01a798a4dc",
+ "b0bcb514ebfbee065faea9d34c12ae75",
+ "d6a18c63b4e909871c0137ca652fad23",
+ "fd047f2fc1b8ffb95d0eeef3e8796a45",
+ "645ab60779ea348fd93c81561c31bab9",
+ "4409633c9db8dff41ade4292a3a56e7f",
+ "5e36a11e069b31c2a739f3a9c7b37c24",
+ "e83b9483d702cfae496991c3c7fa92c0",
+ "12f6ddf98c7f30a277307f1ea935b030",
+ "354321d6c32bbdb0739e4fa2acbf41e1",
+ },
+ {
+ // 32X64
+ "0ce332b343934b34cd4417725faa85cb",
+ "4e2a2cfd8f56f15939bdfc753145b303",
+ "0f46d124ba9f48cdd5d5290acf786d6d",
+ "e1e8ed803236367821981500a3d9eebe",
+ "1d2f8e48e3adb7c448be05d9f66f4954",
+ "9fb2e176636a5689b26f73ca73fcc512",
+ "e720ebccae7e25e36f23da53ae5b5d6a",
+ "86fe4364734169aaa4520d799890d530",
+ "b1870290764bb1b100d1974e2bd70f1d",
+ "ce5b238e19d85ef69d85badfab4e63ae",
+ },
+ {
+ // 64X32
+ "a6c5aeb722615089efbca80b02951ceb",
+ "538424b24bd0830f21788e7238ca762f",
+ "80c15b303235f9bc2259027bb92dfdc4",
+ "e48e1ac15e97191a8fda08d62fff343e",
+ "12604b37875533665078405ef4582e35",
+ "0048afa17bd3e1632d68b96048836530",
+ "07a0cfcb56a5eed50c4bd6c26814336b",
+ "529d8a070de5bc6531fa3ee8f450c233",
+ "33c50a11c7d78f72434064f634305e95",
+ "e0ef7f0559c1a50ec5a8c12011b962f7",
+ },
+ {
+ // 4X16
+ "750491056568eb8fe15387b86bdf06b8",
+ "3a52dae9f599f08cfb3bd1b910dc0e11",
+ "af79f71e3e03dbeca44e2e13561f70c7",
+ "ca7dfd7624afc0c06fb5552f44398535",
+ "b591af115444bf43140c29c269f68fb2",
+ "483d942ae36e69e62f31eb215331416f",
+ "f14b58525e81870bc5d95c7ac71a347f",
+ "371208bb4027d9badb04095d1590bbc4",
+ "c7049c21b2924d70c7c12784d6b6b796",
+ "7d87233f4b5b0f12086045e5d7b2d4c2",
+ },
+ {
+ // 16X4
+ "7c6e325a65e77e732b3adbe237e045e4",
+ "24478f93ffcec47852e004d0fe948464",
+ "258d042c67d4ba3ecfa667f0adc9aebf",
+ "b2cd21d06959f159a1f3c4d9768ee7fb",
+ "b4e1f38157bf8410e7c3da02f687a343",
+ "869e703729eb0fc0711c254944ff5d5a",
+ "9638dd77105a640b146a8201ea7a0801",
+ "919d932c6af8a1cc7486e8ce996dd487",
+ "e1c9be493b6714c7ae48f30044c43140",
+ "bf0fe3889d654b2f6eb98c8fc751f9e4",
+ },
+ {
+ // 8X32
+ "8dfac4319fe0bd40013ffb3102da8c72",
+ "feb46b6dc4e2ca0a09533bfc51d4dcb0",
+ "850837ec714c37262216527aaf4cbbe9",
+ "4603c7800fb08361f163daca876e8bda",
+ "1ff95e7d2debc27b05806fb25abfd624",
+ "d81b9a51a062b23ca7823804cb7bec22",
+ "f1d8978158766f46335203608cb807e7",
+ "f3527096256258c0878d644a9d7d53ca",
+ "cbde98ac8b009953eb112807ad2ea29e",
+ "654fb1153415747feae599f538122af5",
+ },
+ {
+ // 32X8
+ "3d4ee16fab374357474f60b845327bc7",
+ "bc17c5059473a476df4e85f56395ad55",
+ "3d4ee16fab374357474f60b845327bc7",
+ "c14b8db34dc2355b84e3735c9ba16c7f",
+ "a71d25b5d47a92a8b9223c98f18458ee",
+ "6c1cfe2b1893f4576a80675687cb6426",
+ "92d11bbef8b85bb48d799bb055de3514",
+ "bcf81d1db8ae5cc03360467f44f498ec",
+ "79f8c564163555592e808e145eaf5c60",
+ "46fff139cef2ef773938bcc8b0e5abb8",
+ },
+ {
+ // 16X64
+ "3b2a053ee8b05a8ac35ad23b0422a151",
+ "12b0c69595328c465e0b25e0c9e3e9fc",
+ "f77c544ac8035e01920deae40cee7b07",
+ "727797ef15ccd8d325476fe8f12006a3",
+ "f3be77c0fe67eb5d9d515e92bec21eb7",
+ "f1ece6409e01e9dd98b800d49628247d",
+ "efd2ec9bfbbd4fd1f6604ea369df1894",
+ "ec703de918422b9e03197ba0ed60a199",
+ "739418efb89c07f700895deaa5d0b3e3",
+ "9943ae1bbeeebfe1d3a92dc39e049d63",
+ },
+ {
+ // 64X16
+ "821b76b1494d4f84d20817840f719a1a",
+ "69e462c3338a9aaf993c3f7cfbc15649",
+ "516d8f6eb054d74d150e7b444185b6b9",
+ "de1b736e9d99129609d6ef3a491507a0",
+ "fd9b4276e7affe1e0e4ce4f428058994",
+ "cd82fd361a4767ac29a9f406b480b8f3",
+ "2792c2f810157a4a6cb13c28529ff779",
+ "1220442d90c4255ba0969d28b91e93a6",
+ "c7253e10b45f7f67dfee3256c9b94825",
+ "879792198071c7e0b50b9b5010d8c18f",
+ },
+};
+
+} // namespace
+
+// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors
+// to TestIntraPred. The test name is 'arch.DISABLED_TestIntraPred_tx_size',
+// e.g., C_1.DISABLED_TestIntraPred_TX_4X4
+#define INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, h, \
+ paeth, smooth, smooth_v, smooth_h) \
+ TEST(arch, DISABLED_##TestIntraPred_##tx_size) { \
+ static const AvxPredFunc aom_intra_pred[] = { \
+ dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \
+ }; \
+ TestIntraPred(tx_size, aom_intra_pred, kSignatures[tx_size]); \
+ }
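+
+// For reference, a hypothetical expansion of the macro above (illustration
+// only, not part of the original patch): INTRA_PRED_TEST(C_1, TX_4X4, ...)
+// produces roughly
+//   TEST(C_1, DISABLED_TestIntraPred_TX_4X4) {
+//     static const AvxPredFunc aom_intra_pred[] = { /* the 10 predictors */ };
+//     TestIntraPred(TX_4X4, aom_intra_pred, kSignatures[TX_4X4]);
+//   }
+// The DISABLED_ prefix means these speed tests run only when gtest is invoked
+// with --gtest_also_run_disabled_tests.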
+
+// -----------------------------------------------------------------------------
+// 4x4, 4x8, 4x16
+
+INTRA_PRED_TEST(C_1, TX_4X4, aom_dc_predictor_4x4_c,
+ aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c,
+ aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c,
+ aom_h_predictor_4x4_c, aom_paeth_predictor_4x4_c,
+ aom_smooth_predictor_4x4_c, aom_smooth_v_predictor_4x4_c,
+ aom_smooth_h_predictor_4x4_c)
+
+INTRA_PRED_TEST(C_2, TX_4X8, aom_dc_predictor_4x8_c,
+ aom_dc_left_predictor_4x8_c, aom_dc_top_predictor_4x8_c,
+ aom_dc_128_predictor_4x8_c, aom_v_predictor_4x8_c,
+ aom_h_predictor_4x8_c, aom_paeth_predictor_4x8_c,
+ aom_smooth_predictor_4x8_c, aom_smooth_v_predictor_4x8_c,
+ aom_smooth_h_predictor_4x8_c)
+
+INTRA_PRED_TEST(C_3, TX_4X16, aom_dc_predictor_4x16_c,
+ aom_dc_left_predictor_4x16_c, aom_dc_top_predictor_4x16_c,
+ aom_dc_128_predictor_4x16_c, aom_v_predictor_4x16_c,
+ aom_h_predictor_4x16_c, aom_paeth_predictor_4x16_c,
+ aom_smooth_predictor_4x16_c, aom_smooth_v_predictor_4x16_c,
+ aom_smooth_h_predictor_4x16_c)
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_dc_predictor_4x4_sse2,
+ aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2,
+ aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2,
+ aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_dc_predictor_4x8_sse2,
+ aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2,
+ aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2,
+ aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_4X16, aom_dc_predictor_4x16_sse2,
+ aom_dc_left_predictor_4x16_sse2, aom_dc_top_predictor_4x16_sse2,
+ aom_dc_128_predictor_4x16_sse2, aom_v_predictor_4x16_sse2,
+ aom_h_predictor_4x16_sse2, NULL, NULL, NULL, NULL)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3_1, TX_4X4, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_4x4_ssse3, aom_smooth_predictor_4x4_ssse3,
+ aom_smooth_v_predictor_4x4_ssse3,
+ aom_smooth_h_predictor_4x4_ssse3)
+INTRA_PRED_TEST(SSSE3_2, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3,
+ aom_smooth_v_predictor_4x8_ssse3,
+ aom_smooth_h_predictor_4x8_ssse3)
+INTRA_PRED_TEST(SSSE3_3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_4x16_ssse3, aom_smooth_predictor_4x16_ssse3,
+ aom_smooth_v_predictor_4x16_ssse3,
+ aom_smooth_h_predictor_4x16_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TX_4X4, aom_dc_predictor_4x4_dspr2, NULL, NULL, NULL,
+ NULL, aom_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL)
+#endif // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TX_4X4, aom_dc_predictor_4x4_neon,
+ aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
+ aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
+ aom_h_predictor_4x4_neon, NULL, NULL, NULL, NULL)
+#endif // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TX_4X4, aom_dc_predictor_4x4_msa,
+ aom_dc_left_predictor_4x4_msa, aom_dc_top_predictor_4x4_msa,
+ aom_dc_128_predictor_4x4_msa, aom_v_predictor_4x4_msa,
+ aom_h_predictor_4x4_msa, NULL, NULL, NULL, NULL)
+#endif // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 8x8, 8x4, 8x16, 8x32
+
+INTRA_PRED_TEST(C_1, TX_8X8, aom_dc_predictor_8x8_c,
+ aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c,
+ aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c,
+ aom_h_predictor_8x8_c, aom_paeth_predictor_8x8_c,
+ aom_smooth_predictor_8x8_c, aom_smooth_v_predictor_8x8_c,
+ aom_smooth_h_predictor_8x8_c)
+
+INTRA_PRED_TEST(C_2, TX_8X4, aom_dc_predictor_8x4_c,
+ aom_dc_left_predictor_8x4_c, aom_dc_top_predictor_8x4_c,
+ aom_dc_128_predictor_8x4_c, aom_v_predictor_8x4_c,
+ aom_h_predictor_8x4_c, aom_paeth_predictor_8x4_c,
+ aom_smooth_predictor_8x4_c, aom_smooth_v_predictor_8x4_c,
+ aom_smooth_h_predictor_8x4_c)
+
+INTRA_PRED_TEST(C_3, TX_8X16, aom_dc_predictor_8x16_c,
+ aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c,
+ aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c,
+ aom_h_predictor_8x16_c, aom_paeth_predictor_8x16_c,
+ aom_smooth_predictor_8x16_c, aom_smooth_v_predictor_8x16_c,
+ aom_smooth_h_predictor_8x16_c)
+
+INTRA_PRED_TEST(C_4, TX_8X32, aom_dc_predictor_8x32_c,
+ aom_dc_left_predictor_8x32_c, aom_dc_top_predictor_8x32_c,
+ aom_dc_128_predictor_8x32_c, aom_v_predictor_8x32_c,
+ aom_h_predictor_8x32_c, aom_paeth_predictor_8x32_c,
+ aom_smooth_predictor_8x32_c, aom_smooth_v_predictor_8x32_c,
+ aom_smooth_h_predictor_8x32_c)
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_dc_predictor_8x8_sse2,
+ aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2,
+ aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2,
+ aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_dc_predictor_8x4_sse2,
+ aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2,
+ aom_dc_128_predictor_8x4_sse2, aom_v_predictor_8x4_sse2,
+ aom_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_dc_predictor_8x16_sse2,
+ aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
+ aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2,
+ aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_4, TX_8X32, aom_dc_predictor_8x32_sse2,
+ aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2,
+ aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2,
+ aom_h_predictor_8x32_sse2, NULL, NULL, NULL, NULL)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3_1, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_8x8_ssse3, aom_smooth_predictor_8x8_ssse3,
+ aom_smooth_v_predictor_8x8_ssse3,
+ aom_smooth_h_predictor_8x8_ssse3)
+INTRA_PRED_TEST(SSSE3_2, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_8x4_ssse3, aom_smooth_predictor_8x4_ssse3,
+ aom_smooth_v_predictor_8x4_ssse3,
+ aom_smooth_h_predictor_8x4_ssse3)
+INTRA_PRED_TEST(SSSE3_3, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3,
+ aom_smooth_v_predictor_8x16_ssse3,
+ aom_smooth_h_predictor_8x16_ssse3)
+INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_8x32_ssse3, aom_smooth_predictor_8x32_ssse3,
+ aom_smooth_v_predictor_8x32_ssse3,
+ aom_smooth_h_predictor_8x32_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TX_8X8, aom_dc_predictor_8x8_dspr2, NULL, NULL, NULL,
+ NULL, aom_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL)
+#endif // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TX_8X8, aom_dc_predictor_8x8_neon,
+ aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
+ aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
+ aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL)
+#endif // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TX_8X8, aom_dc_predictor_8x8_msa,
+ aom_dc_left_predictor_8x8_msa, aom_dc_top_predictor_8x8_msa,
+ aom_dc_128_predictor_8x8_msa, aom_v_predictor_8x8_msa,
+ aom_h_predictor_8x8_msa, NULL, NULL, NULL, NULL)
+#endif // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 16x16, 16x8, 16x32, 16x4, 16x64
+
+INTRA_PRED_TEST(C_1, TX_16X16, aom_dc_predictor_16x16_c,
+ aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c,
+ aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c,
+ aom_h_predictor_16x16_c, aom_paeth_predictor_16x16_c,
+ aom_smooth_predictor_16x16_c, aom_smooth_v_predictor_16x16_c,
+ aom_smooth_h_predictor_16x16_c)
+
+INTRA_PRED_TEST(C_2, TX_16X8, aom_dc_predictor_16x8_c,
+ aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c,
+ aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c,
+ aom_h_predictor_16x8_c, aom_paeth_predictor_16x8_c,
+ aom_smooth_predictor_16x8_c, aom_smooth_v_predictor_16x8_c,
+ aom_smooth_h_predictor_16x8_c)
+
+INTRA_PRED_TEST(C_3, TX_16X32, aom_dc_predictor_16x32_c,
+ aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c,
+ aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c,
+ aom_h_predictor_16x32_c, aom_paeth_predictor_16x32_c,
+ aom_smooth_predictor_16x32_c, aom_smooth_v_predictor_16x32_c,
+ aom_smooth_h_predictor_16x32_c)
+
+INTRA_PRED_TEST(C_4, TX_16X4, aom_dc_predictor_16x4_c,
+ aom_dc_left_predictor_16x4_c, aom_dc_top_predictor_16x4_c,
+ aom_dc_128_predictor_16x4_c, aom_v_predictor_16x4_c,
+ aom_h_predictor_16x4_c, aom_paeth_predictor_16x4_c,
+ aom_smooth_predictor_16x4_c, aom_smooth_v_predictor_16x4_c,
+ aom_smooth_h_predictor_16x4_c)
+
+INTRA_PRED_TEST(C_5, TX_16X64, aom_dc_predictor_16x64_c,
+ aom_dc_left_predictor_16x64_c, aom_dc_top_predictor_16x64_c,
+ aom_dc_128_predictor_16x64_c, aom_v_predictor_16x64_c,
+ aom_h_predictor_16x64_c, aom_paeth_predictor_16x64_c,
+ aom_smooth_predictor_16x64_c, aom_smooth_v_predictor_16x64_c,
+ aom_smooth_h_predictor_16x64_c)
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_dc_predictor_16x16_sse2,
+ aom_dc_left_predictor_16x16_sse2,
+ aom_dc_top_predictor_16x16_sse2,
+ aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2,
+ aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_dc_predictor_16x8_sse2,
+ aom_dc_left_predictor_16x8_sse2, aom_dc_top_predictor_16x8_sse2,
+ aom_dc_128_predictor_16x8_sse2, aom_v_predictor_16x8_sse2,
+ aom_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_dc_predictor_16x32_sse2,
+ aom_dc_left_predictor_16x32_sse2,
+ aom_dc_top_predictor_16x32_sse2,
+ aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
+ aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2,
+ aom_dc_left_predictor_16x64_sse2,
+ aom_dc_top_predictor_16x64_sse2,
+ aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2,
+ aom_h_predictor_16x64_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_5, TX_16X4, aom_dc_predictor_16x4_sse2,
+ aom_dc_left_predictor_16x4_sse2, aom_dc_top_predictor_16x4_sse2,
+ aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2,
+ aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x16_ssse3,
+ aom_smooth_predictor_16x16_ssse3,
+ aom_smooth_v_predictor_16x16_ssse3,
+ aom_smooth_h_predictor_16x16_ssse3)
+INTRA_PRED_TEST(SSSE3_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x8_ssse3, aom_smooth_predictor_16x8_ssse3,
+ aom_smooth_v_predictor_16x8_ssse3,
+ aom_smooth_h_predictor_16x8_ssse3)
+INTRA_PRED_TEST(SSSE3_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x32_ssse3,
+ aom_smooth_predictor_16x32_ssse3,
+ aom_smooth_v_predictor_16x32_ssse3,
+ aom_smooth_h_predictor_16x32_ssse3)
+INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x64_ssse3,
+ aom_smooth_predictor_16x64_ssse3,
+ aom_smooth_v_predictor_16x64_ssse3,
+ aom_smooth_h_predictor_16x64_ssse3)
+INTRA_PRED_TEST(SSSE3_5, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x4_ssse3, aom_smooth_predictor_16x4_ssse3,
+ aom_smooth_v_predictor_16x4_ssse3,
+ aom_smooth_h_predictor_16x4_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x16_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_16x64_avx2, NULL, NULL, NULL)
+#endif // HAVE_AVX2
+
+#if HAVE_DSPR2
+INTRA_PRED_TEST(DSPR2, TX_16X16, aom_dc_predictor_16x16_dspr2, NULL, NULL, NULL,
+ NULL, aom_h_predictor_16x16_dspr2, NULL, NULL, NULL, NULL)
+#endif // HAVE_DSPR2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TX_16X16, aom_dc_predictor_16x16_neon,
+ aom_dc_left_predictor_16x16_neon,
+ aom_dc_top_predictor_16x16_neon,
+ aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
+ aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL)
+#endif // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TX_16X16, aom_dc_predictor_16x16_msa,
+ aom_dc_left_predictor_16x16_msa, aom_dc_top_predictor_16x16_msa,
+ aom_dc_128_predictor_16x16_msa, aom_v_predictor_16x16_msa,
+ aom_h_predictor_16x16_msa, NULL, NULL, NULL, NULL)
+#endif // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 32x32, 32x16, 32x64, 32x8
+
+INTRA_PRED_TEST(C_1, TX_32X32, aom_dc_predictor_32x32_c,
+ aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c,
+ aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c,
+ aom_h_predictor_32x32_c, aom_paeth_predictor_32x32_c,
+ aom_smooth_predictor_32x32_c, aom_smooth_v_predictor_32x32_c,
+ aom_smooth_h_predictor_32x32_c)
+
+INTRA_PRED_TEST(C_2, TX_32X16, aom_dc_predictor_32x16_c,
+ aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c,
+ aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c,
+ aom_h_predictor_32x16_c, aom_paeth_predictor_32x16_c,
+ aom_smooth_predictor_32x16_c, aom_smooth_v_predictor_32x16_c,
+ aom_smooth_h_predictor_32x16_c)
+
+INTRA_PRED_TEST(C_3, TX_32X64, aom_dc_predictor_32x64_c,
+ aom_dc_left_predictor_32x64_c, aom_dc_top_predictor_32x64_c,
+ aom_dc_128_predictor_32x64_c, aom_v_predictor_32x64_c,
+ aom_h_predictor_32x64_c, aom_paeth_predictor_32x64_c,
+ aom_smooth_predictor_32x64_c, aom_smooth_v_predictor_32x64_c,
+ aom_smooth_h_predictor_32x64_c)
+
+INTRA_PRED_TEST(C_4, TX_32X8, aom_dc_predictor_32x8_c,
+ aom_dc_left_predictor_32x8_c, aom_dc_top_predictor_32x8_c,
+ aom_dc_128_predictor_32x8_c, aom_v_predictor_32x8_c,
+ aom_h_predictor_32x8_c, aom_paeth_predictor_32x8_c,
+ aom_smooth_predictor_32x8_c, aom_smooth_v_predictor_32x8_c,
+ aom_smooth_h_predictor_32x8_c)
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_dc_predictor_32x32_sse2,
+ aom_dc_left_predictor_32x32_sse2,
+ aom_dc_top_predictor_32x32_sse2,
+ aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2,
+ aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_dc_predictor_32x16_sse2,
+ aom_dc_left_predictor_32x16_sse2,
+ aom_dc_top_predictor_32x16_sse2,
+ aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
+ aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_32X64, aom_dc_predictor_32x64_sse2,
+ aom_dc_left_predictor_32x64_sse2,
+ aom_dc_top_predictor_32x64_sse2,
+ aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2,
+ aom_h_predictor_32x64_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_4, TX_32X8, aom_dc_predictor_32x8_sse2,
+ aom_dc_left_predictor_32x8_sse2, aom_dc_top_predictor_32x8_sse2,
+ aom_dc_128_predictor_32x8_sse2, aom_v_predictor_32x8_sse2,
+ aom_h_predictor_32x8_sse2, NULL, NULL, NULL, NULL)
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_32x32_ssse3,
+ aom_smooth_predictor_32x32_ssse3,
+ aom_smooth_v_predictor_32x32_ssse3,
+ aom_smooth_h_predictor_32x32_ssse3)
+INTRA_PRED_TEST(SSSE3_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_32x16_ssse3,
+ aom_smooth_predictor_32x16_ssse3,
+ aom_smooth_v_predictor_32x16_ssse3,
+ aom_smooth_h_predictor_32x16_ssse3)
+INTRA_PRED_TEST(SSSE3_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_32x64_ssse3,
+ aom_smooth_predictor_32x64_ssse3,
+ aom_smooth_v_predictor_32x64_ssse3,
+ aom_smooth_h_predictor_32x64_ssse3)
+INTRA_PRED_TEST(SSSE3_4, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_32x8_ssse3, aom_smooth_predictor_32x8_ssse3,
+ aom_smooth_v_predictor_32x8_ssse3,
+ aom_smooth_h_predictor_32x8_ssse3)
+#endif // HAVE_SSSE3
+
+#if HAVE_AVX2
+INTRA_PRED_TEST(AVX2_1, TX_32X32, aom_dc_predictor_32x32_avx2,
+ aom_dc_left_predictor_32x32_avx2,
+ aom_dc_top_predictor_32x32_avx2,
+ aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2,
+ aom_h_predictor_32x32_avx2, aom_paeth_predictor_32x32_avx2,
+ NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_2, TX_32X16, aom_dc_predictor_32x16_avx2,
+ aom_dc_left_predictor_32x16_avx2,
+ aom_dc_top_predictor_32x16_avx2,
+ aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
+ NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_3, TX_32X64, aom_dc_predictor_32x64_avx2,
+ aom_dc_left_predictor_32x64_avx2,
+ aom_dc_top_predictor_32x64_avx2,
+ aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2,
+ NULL, aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INTRA_PRED_TEST(NEON, TX_32X32, aom_dc_predictor_32x32_neon,
+ aom_dc_left_predictor_32x32_neon,
+ aom_dc_top_predictor_32x32_neon,
+ aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
+ aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL)
+#endif // HAVE_NEON
+
+#if HAVE_MSA
+INTRA_PRED_TEST(MSA, TX_32X32, aom_dc_predictor_32x32_msa,
+ aom_dc_left_predictor_32x32_msa, aom_dc_top_predictor_32x32_msa,
+ aom_dc_128_predictor_32x32_msa, aom_v_predictor_32x32_msa,
+ aom_h_predictor_32x32_msa, NULL, NULL, NULL, NULL)
+#endif // HAVE_MSA
+
+// -----------------------------------------------------------------------------
+// 64x64, 64x32, 64x16
+
+INTRA_PRED_TEST(C_1, TX_64X64, aom_dc_predictor_64x64_c,
+ aom_dc_left_predictor_64x64_c, aom_dc_top_predictor_64x64_c,
+ aom_dc_128_predictor_64x64_c, aom_v_predictor_64x64_c,
+ aom_h_predictor_64x64_c, aom_paeth_predictor_64x64_c,
+ aom_smooth_predictor_64x64_c, aom_smooth_v_predictor_64x64_c,
+ aom_smooth_h_predictor_64x64_c)
+
+INTRA_PRED_TEST(C_2, TX_64X32, aom_dc_predictor_64x32_c,
+ aom_dc_left_predictor_64x32_c, aom_dc_top_predictor_64x32_c,
+ aom_dc_128_predictor_64x32_c, aom_v_predictor_64x32_c,
+ aom_h_predictor_64x32_c, aom_paeth_predictor_64x32_c,
+ aom_smooth_predictor_64x32_c, aom_smooth_v_predictor_64x32_c,
+ aom_smooth_h_predictor_64x32_c)
+
+INTRA_PRED_TEST(C_3, TX_64X16, aom_dc_predictor_64x16_c,
+ aom_dc_left_predictor_64x16_c, aom_dc_top_predictor_64x16_c,
+ aom_dc_128_predictor_64x16_c, aom_v_predictor_64x16_c,
+ aom_h_predictor_64x16_c, aom_paeth_predictor_64x16_c,
+ aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c,
+ aom_smooth_h_predictor_64x16_c)
+
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2,
+ aom_dc_left_predictor_64x64_sse2,
+ aom_dc_top_predictor_64x64_sse2,
+ aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2,
+ aom_h_predictor_64x64_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_5, TX_64X32, aom_dc_predictor_64x32_sse2,
+ aom_dc_left_predictor_64x32_sse2,
+ aom_dc_top_predictor_64x32_sse2,
+ aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2,
+ aom_h_predictor_64x32_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2,
+ aom_dc_left_predictor_64x16_sse2,
+ aom_dc_top_predictor_64x16_sse2,
+ aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2,
+ aom_h_predictor_64x16_sse2, NULL, NULL, NULL, NULL)
+#endif
+
+#if HAVE_SSSE3
+INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_64x64_ssse3,
+ aom_smooth_predictor_64x64_ssse3,
+ aom_smooth_v_predictor_64x64_ssse3,
+ aom_smooth_h_predictor_64x64_ssse3)
+INTRA_PRED_TEST(SSSE3_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_64x32_ssse3,
+ aom_smooth_predictor_64x32_ssse3,
+ aom_smooth_v_predictor_64x32_ssse3,
+ aom_smooth_h_predictor_64x32_ssse3)
+INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ aom_paeth_predictor_64x16_ssse3,
+ aom_smooth_predictor_64x16_ssse3,
+ aom_smooth_v_predictor_64x16_ssse3,
+ aom_smooth_h_predictor_64x16_ssse3)
+#endif
+
+#if HAVE_AVX2
+INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2,
+ aom_dc_left_predictor_64x64_avx2,
+ aom_dc_top_predictor_64x64_avx2,
+ aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2,
+ NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_5, TX_64X32, aom_dc_predictor_64x32_avx2,
+ aom_dc_left_predictor_64x32_avx2,
+ aom_dc_top_predictor_64x32_avx2,
+ aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2,
+ NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2,
+ aom_dc_left_predictor_64x16_avx2,
+ aom_dc_top_predictor_64x16_avx2,
+ aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
+ NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
+#endif
+// -----------------------------------------------------------------------------
+// High Bitdepth
+namespace {
+
+typedef void (*AvxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd);
+
+typedef IntraPredTestMem<uint16_t> Av1HighbdIntraPredTestMem;
+
+void TestHighbdIntraPred(TX_SIZE tx_size, AvxHighbdPredFunc const *pred_funcs,
+ const char *const signatures[]) {
+ const int block_width = tx_size_wide[tx_size];
+ const int block_height = tx_size_high[tx_size];
+ const int num_pixels_per_test =
+ block_width * block_height * kNumAv1IntraFuncs;
+ const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+ Av1HighbdIntraPredTestMem intra_pred_test_mem;
+ const int bd = 12;
+ intra_pred_test_mem.Init(block_width, block_height, bd);
+
+ for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
+ if (pred_funcs[k] == NULL) continue;
+ memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+ sizeof(intra_pred_test_mem.src));
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+ pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
+ intra_pred_test_mem.above, intra_pred_test_mem.left, bd);
+ }
+ libaom_test::ClearSystemState();
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+ CheckMd5Signature(
+ tx_size, true, signatures, intra_pred_test_mem.src,
+ intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
+ elapsed_time, k);
+ }
+}
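+
+// Worked example for the test-count arithmetic above (illustration only, not
+// part of the original patch): for TX_4X4, num_pixels_per_test is
+// 4 * 4 * 10 = 160, so kNumTests = 2e10 / 160 = 125000000 timed calls per
+// predictor function.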
+
+static const char *const kHighbdSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+ {
+ // 4X4
+ "11f74af6c5737df472f3275cbde062fa",
+ "51bea056b6447c93f6eb8f6b7e8f6f71",
+ "27e97f946766331795886f4de04c5594",
+ "53ab15974b049111fb596c5168ec7e3f",
+ "f0b640bb176fbe4584cf3d32a9b0320a",
+ "729783ca909e03afd4b47111c80d967b",
+ "6e30009c45474a22032678b1bd579c8f",
+ "e57cba016d808aa8a35619df2a65f049",
+ "55a6c37f39afcbbf5abca4a985b96459",
+ "a623d45b37dafec1f8a75c4c5218913d",
+ },
+ {
+ // 8X8
+ "03da8829fe94663047fd108c5fcaa71d",
+ "ecdb37b8120a2d3a4c706b016bd1bfd7",
+ "1d4543ed8d2b9368cb96898095fe8a75",
+ "f791c9a67b913cbd82d9da8ecede30e2",
+ "065c70646f4dbaff913282f55a45a441",
+ "51f87123616662ef7c35691497dfd0ba",
+ "85c01ba03df68f9ece7bd3fa0f8980e6",
+ "ad19b7dac092f56df6d054e1f67f21e7",
+ "0edc415b5dd7299f7a34fb9f71d31d78",
+ "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
+ },
+ {
+ // 16X16
+ "e33cb3f56a878e2fddb1b2fc51cdd275",
+ "c7bff6f04b6052c8ab335d726dbbd52d",
+ "d0b0b47b654a9bcc5c6008110a44589b",
+ "78f5da7b10b2b9ab39f114a33b6254e9",
+ "c78e31d23831abb40d6271a318fdd6f3",
+ "90d1347f4ec9198a0320daecb6ff90b8",
+ "e63ded54ab3d0e8728b6f24d4f01e53f",
+ "35ce21fbe0ea114c089fc3489a78155d",
+ "f277f6ef8e4d717f1f0dfe2706ac197d",
+ "e8014d3f41256976c02e0f1e622ba2b9",
+ },
+ {
+ // 32X32
+ "a3e8056ba7e36628cce4917cd956fedd",
+ "cc7d3024fe8748b512407edee045377e",
+ "2aab0a0f330a1d3e19b8ecb8f06387a3",
+ "a547bc3fb7b06910bf3973122a426661",
+ "26f712514da95042f93d6e8dc8e431dc",
+ "bb08c6e16177081daa3d936538dbc2e3",
+ "84bf83f94a51b33654ca940c6f8bc057",
+ "7168b03fc31bf29596a344d6a35d007c",
+ "b073a70d3672f1282236994f5d12e94b",
+ "c51607aebad5dcb3c1e3b58ef9e5b84e",
+ },
+ {
+ // 64X64
+ "a6baa0d4bfb2269a94c7a38f86a4bccf",
+ "3f1ef5f473a49eba743f17a3324adf9d",
+ "12ac11889ae5f55b7781454efd706a6a",
+ "d9a906c0e692b22e1b4414e71a704b7e",
+ "47d4cadd56f70c11ff8f3e5d8df81161",
+ "de997744cf24c16c5ac2a36b02b351cc",
+ "23781211ae178ddeb6c4bb97a6bd7d83",
+ "a79d2e28340ca34b9e37daabbf030f63",
+ "0372bd3ddfc258750a6ac106b70587f4",
+ "228ef625d9460cbf6fa253a16a730976",
+ },
+ {
+ // 4X8
+ "22d519b796d59644043466320e4ccd14",
+ "09513a738c49b3f9542d27f34abbe1d5",
+ "807ae5e8813443ff01e71be6efacfb69",
+ "cbfa18d0293430b6e9708b0be1fd2394",
+ "346c354c34ec7fa780b576db355dab88",
+ "f97dae85c35359632380b09ca98d611e",
+ "698ae351d8896d89ed9e4e67b6e53eda",
+ "dcc197034a9c45a3d8238bf085835f4e",
+ "7a35e2c42ffdc2efc2d6d1d75a100fc7",
+ "41ab6cebd4516c87a91b2a593e2c2506",
+ },
+ {
+ // 8X4
+ "d58cd4c4bf3b7bbaa5db5e1a5622ec78",
+ "6e572c35aa782d00cafcb99e9ea047ea",
+ "e8c22a3702b416dc9ab974505afbed09",
+ "aaa4e4762a795aad7ad74de0c662c4e4",
+ "a19f9101967383c3dcbd516dc317a291",
+ "9ab8cb91f1a595b9ebe3fe8de58031aa",
+ "2cf9021d5f1169268699807ee118b65f",
+ "ee9605fcbd6fb871f1c5cd81a6989327",
+ "b4871af8316089e3e23522175df7e93f",
+ "d33301e1c2cb173be46792a22d19881a",
+ },
+ {
+ // 8X16
+ "4562de1d0336610880fdd5685498a9ec",
+ "16310fa7076394f16fc85c4b149d89c9",
+ "0e94af88e1dc573b6f0f499cddd1f530",
+ "dfd245ee20d091c67809160340365aa9",
+ "d3562504327f70c096c5be23fd8a3747",
+ "601b853558502acbb5135eadd2da117a",
+ "3c624345a723a1b2b1bea05a6a08bc99",
+ "2a9c781de609e0184cc7ab442050f4e5",
+ "0ddc5035c22252747126b61fc238c74d",
+ "e43f5d83bab759af69c7b6773fc8f9b2",
+ },
+ {
+ // 16X8
+ "a57d6b5a9bfd30c29591d8717ace9c51",
+ "f5907ba97ee6c53e339e953fc8d845ee",
+ "ea3aa727913ce45af06f89dd1808db5f",
+ "408af4f23e48d14b48ee35ae094fcd18",
+ "85c41cbcb5d744f7961e8950026fbffe",
+ "8a4e588a837638887ba671f8d4910485",
+ "b792d8826b67a21757ea7097cff9e05b",
+ "f94ce7101bb87fd3bb9312112527dbf4",
+ "688c6660a6dc6fa61fa1aa38e708c209",
+ "0cdf641b4f81d69509c92ae0b93ef5ff",
+ },
+ {
+ // 16X32
+ "aee4b3b0e3cc02d48e2c40d77f807927",
+ "8baef2b2e789f79c8df9d90ad10f34a4",
+ "038c38ee3c4f090bb8d736eab136aafc",
+ "1a3de2aaeaffd68a9fd6c7f6557b83f3",
+ "385c6e0ea29421dd81011a2934641e26",
+ "6cf96c285d1a2d4787f955dad715b08c",
+ "2d7f75dcd73b9528c8396279ff09ff3a",
+ "5a63cd1841e4ed470e4ca5ef845f2281",
+ "610d899ca945fbead33287d4335a8b32",
+ "6bafaad81fce37be46730187e78d8b11",
+ },
+ {
+ // 32X16
+ "290b23c9f5a1de7905bfa71a942da29b",
+ "701e7b82593c66da5052fc4b6afd79ce",
+ "4da828c5455cd246735a663fbb204989",
+ "e3fbeaf234efece8dbd752b77226200c",
+ "4d1d8c969f05155a7e7e84cf7aad021b",
+ "c22e4877c2c946d5bdc0d542e29e70cf",
+ "8ac1ce815e7780500f842b0beb0bb980",
+ "9fee2e2502b507f25bfad30a55b0b610",
+ "4ced9c212ec6f9956e27f68a91b59fef",
+ "4a7a0b93f138bb0863e4e465b01ec0b1",
+ },
+ {
+ // 32X64
+ "ad9cfc395a5c5644a21d958c7274ac14",
+ "f29d6d03c143ddf96fef04c19f2c8333",
+ "a8bdc852ef704dd4975c61893e8fbc3f",
+ "7d0bd7dea26226741dbca9a97f27fa74",
+ "45c27c5cca9a91b6ae8379feb0881c9f",
+ "8a0b78df1e001b85c874d686eac4aa1b",
+ "ce9fa75fac54a3f6c0cc3f2083b938f1",
+ "c0dca10d88762c954af18dc9e3791a39",
+ "61df229eddfccab913b8fda4bb02f9ac",
+ "4f4df6bc8d50a5600b573f0e44d70e66",
+ },
+ {
+ // 64X32
+ "db9d82921fd88b24fdff6f849f2f9c87",
+ "5ecc7fdc52d2f575ad4f2d0e9e6b1e11",
+ "b4581311a0a73d95dfac7f8f44591032",
+ "68bd283cfd1a125f6b2ee47cee874d36",
+ "804179f05c032908a5e36077bb87c994",
+ "fc5fd041a8ee779015394d0c066ee43c",
+ "68f5579ccadfe9a1baafb158334a3db2",
+ "fe237e45e215ab06d79046da9ad71e84",
+ "9a8a938a6824551bf7d21b8fd1d70ea1",
+ "eb7332f2017cd96882c76e7136aeaf53",
+ },
+ {
+ // 4X16
+ "7bafa307d507747b8132e7735b7f1c73",
+ "e58bc2d8213a97d1fea9cfb73d7a9633",
+ "435f8a8e8bbf14dbf2fe16b2be9e97aa",
+ "1d0e767b68d84acbfb50b7a04e633836",
+ "5f713bd7b324fe73bb7063e35ee14e5e",
+ "0dac4e1fa3d59814202715468c01ed56",
+ "47709d1db4a330c7a8900f450e6fddd1",
+ "258e0b930bb27db28f05da9cf7d1ee7c",
+ "36cf030fbae767912593efea045bfff5",
+ "248d7aceabb7499febae663fae41a920",
+ },
+ {
+ // 16X4
+ "04dde98e632670e393704742c89f9067",
+ "8c72543f1664651ae1fa08e2ac0adb9b",
+ "2354a2cdc2773aa2df8ab4010db1be39",
+ "6300ad3221c26da39b10e0e6d87ee3be",
+ "8ea30b661c6ba60b28d3167f19e449b8",
+ "fb6c1e4ff101a371cede63c2955cdb7e",
+ "a517c06433d6d7927b16a72184a23e92",
+ "393828be5d62ab6c48668bea5e2f801a",
+ "b1e510c542013eb9d6fb188dea2ce90a",
+ "569a8f2fe01679ca216535ecbcdccb62",
+ },
+ {
+ // 8X32
+ "9d541865c185ca7607852852613ac1fc",
+ "b96be67f08c6b5fa5ebd3411299c2f7c",
+ "75a2dcf50004b9d188849b048239767e",
+ "429492ff415c9fd9b050d73b2ad500f8",
+ "64b3606c1ccd036bd766bd5711392cf4",
+ "cb59844a0f01660ac955bae3511f1100",
+ "3e076155b7a70e8828618e3f33b51e3d",
+ "ed2d1f597ab7c50beff690f737cf9726",
+ "7909c6a26aaf20c59d996d3e5b5f9c29",
+ "965798807240c98c6f7cc9b457ed0773",
+ },
+ {
+ // 32X8
+ "36f391aa31619eec1f4d9ee95ea454cc",
+ "b82648f14eeba2527357cb50bc3223cb",
+ "7a7b2adf429125e8bee9d1d00a66e13f",
+ "4198e4d6ba503b7cc2d7e96bb845f661",
+ "96c160d2ec1be9fe0cdea9682f14d257",
+ "19a450bcebaa75afb4fc6bd1fd6434af",
+ "2bd2e35967d43d0ec1c6587a36f204d5",
+ "49799a99aa4ccfbd989bee92a99422f1",
+ "955530e99813812a74659edeac3f5475",
+ "f0316b84e378a19cd11b19a6e40b2914",
+ },
+ {
+ // 16X64
+ "8cba1b70a0bde29e8ef235cedc5faa7d",
+ "96d00ddc7537bf7f196006591b733b4e",
+ "cbf69d5d157c9f3355a4757b1d6e3414",
+ "3ac1f642019493dec1b737d7a3a1b4e5",
+ "35f9ee300d7fa3c97338e81a6f21dcd4",
+ "aae335442e77c8ebc280f16ea50ba9c7",
+ "a6140fdac2278644328be094d88731db",
+ "2df93621b6ff100f7008432d509f4161",
+ "c77bf5aee39e7ed4a3dd715f816f452a",
+ "02109bd63557d90225c32a8f1338258e",
+ },
+ {
+ // 64X16
+ "a5e2f9fb685d5f4a048e9a96affd25a4",
+ "1348f249690d9eefe09d9ad7ead2c801",
+ "525da4b187acd81b1ff1116b60461141",
+ "e99d072de858094c98b01bd4a6772634",
+ "873bfa9dc24693f19721f7c8d527f7d3",
+ "0acfc6507bd3468e9679efc127d6e4b9",
+ "57d03f8d079c7264854e22ac1157cfae",
+ "6c2c4036f70c7d957a9399b5436c0774",
+ "42b8e4a97b7f8416c72a5148c031c0b1",
+ "a38a2c5f79993dfae8530e9e25800893",
+ },
+};
+
+} // namespace
+
+#define HIGHBD_INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, \
+ h, paeth, smooth, smooth_v, smooth_h) \
+ TEST(arch, DISABLED_##TestHighbdIntraPred_##tx_size) { \
+ static const AvxHighbdPredFunc aom_intra_pred[] = { \
+ dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \
+ }; \
+ TestHighbdIntraPred(tx_size, aom_intra_pred, kHighbdSignatures[tx_size]); \
+ }
+
+// -----------------------------------------------------------------------------
+// 4x4, 4x8, 4x16
+
+HIGHBD_INTRA_PRED_TEST(
+ C_1, TX_4X4, aom_highbd_dc_predictor_4x4_c,
+ aom_highbd_dc_left_predictor_4x4_c, aom_highbd_dc_top_predictor_4x4_c,
+ aom_highbd_dc_128_predictor_4x4_c, aom_highbd_v_predictor_4x4_c,
+ aom_highbd_h_predictor_4x4_c, aom_highbd_paeth_predictor_4x4_c,
+ aom_highbd_smooth_predictor_4x4_c, aom_highbd_smooth_v_predictor_4x4_c,
+ aom_highbd_smooth_h_predictor_4x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_2, TX_4X8, aom_highbd_dc_predictor_4x8_c,
+ aom_highbd_dc_left_predictor_4x8_c, aom_highbd_dc_top_predictor_4x8_c,
+ aom_highbd_dc_128_predictor_4x8_c, aom_highbd_v_predictor_4x8_c,
+ aom_highbd_h_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c,
+ aom_highbd_smooth_predictor_4x8_c, aom_highbd_smooth_v_predictor_4x8_c,
+ aom_highbd_smooth_h_predictor_4x8_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_3, TX_4X16, aom_highbd_dc_predictor_4x16_c,
+ aom_highbd_dc_left_predictor_4x16_c, aom_highbd_dc_top_predictor_4x16_c,
+ aom_highbd_dc_128_predictor_4x16_c, aom_highbd_v_predictor_4x16_c,
+ aom_highbd_h_predictor_4x16_c, aom_highbd_paeth_predictor_4x16_c,
+ aom_highbd_smooth_predictor_4x16_c, aom_highbd_smooth_v_predictor_4x16_c,
+ aom_highbd_smooth_h_predictor_4x16_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_highbd_dc_predictor_4x4_sse2,
+ aom_highbd_dc_left_predictor_4x4_sse2,
+ aom_highbd_dc_top_predictor_4x4_sse2,
+ aom_highbd_dc_128_predictor_4x4_sse2,
+ aom_highbd_v_predictor_4x4_sse2,
+ aom_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_highbd_dc_predictor_4x8_sse2,
+ aom_highbd_dc_left_predictor_4x8_sse2,
+ aom_highbd_dc_top_predictor_4x8_sse2,
+ aom_highbd_dc_128_predictor_4x8_sse2,
+ aom_highbd_v_predictor_4x8_sse2,
+ aom_highbd_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL)
+#endif
+
+// -----------------------------------------------------------------------------
+// 8x8, 8x4, 8x16, 8x32
+
+HIGHBD_INTRA_PRED_TEST(
+ C_1, TX_8X8, aom_highbd_dc_predictor_8x8_c,
+ aom_highbd_dc_left_predictor_8x8_c, aom_highbd_dc_top_predictor_8x8_c,
+ aom_highbd_dc_128_predictor_8x8_c, aom_highbd_v_predictor_8x8_c,
+ aom_highbd_h_predictor_8x8_c, aom_highbd_paeth_predictor_8x8_c,
+ aom_highbd_smooth_predictor_8x8_c, aom_highbd_smooth_v_predictor_8x8_c,
+ aom_highbd_smooth_h_predictor_8x8_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_2, TX_8X4, aom_highbd_dc_predictor_8x4_c,
+ aom_highbd_dc_left_predictor_8x4_c, aom_highbd_dc_top_predictor_8x4_c,
+ aom_highbd_dc_128_predictor_8x4_c, aom_highbd_v_predictor_8x4_c,
+ aom_highbd_h_predictor_8x4_c, aom_highbd_paeth_predictor_8x4_c,
+ aom_highbd_smooth_predictor_8x4_c, aom_highbd_smooth_v_predictor_8x4_c,
+ aom_highbd_smooth_h_predictor_8x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_3, TX_8X16, aom_highbd_dc_predictor_8x16_c,
+ aom_highbd_dc_left_predictor_8x16_c, aom_highbd_dc_top_predictor_8x16_c,
+ aom_highbd_dc_128_predictor_8x16_c, aom_highbd_v_predictor_8x16_c,
+ aom_highbd_h_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c,
+ aom_highbd_smooth_predictor_8x16_c, aom_highbd_smooth_v_predictor_8x16_c,
+ aom_highbd_smooth_h_predictor_8x16_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_4, TX_8X32, aom_highbd_dc_predictor_8x32_c,
+ aom_highbd_dc_left_predictor_8x32_c, aom_highbd_dc_top_predictor_8x32_c,
+ aom_highbd_dc_128_predictor_8x32_c, aom_highbd_v_predictor_8x32_c,
+ aom_highbd_h_predictor_8x32_c, aom_highbd_paeth_predictor_8x32_c,
+ aom_highbd_smooth_predictor_8x32_c, aom_highbd_smooth_v_predictor_8x32_c,
+ aom_highbd_smooth_h_predictor_8x32_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_highbd_dc_predictor_8x8_sse2,
+ aom_highbd_dc_left_predictor_8x8_sse2,
+ aom_highbd_dc_top_predictor_8x8_sse2,
+ aom_highbd_dc_128_predictor_8x8_sse2,
+ aom_highbd_v_predictor_8x8_sse2,
+ aom_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_highbd_dc_predictor_8x4_sse2,
+ aom_highbd_dc_left_predictor_8x4_sse2,
+ aom_highbd_dc_top_predictor_8x4_sse2,
+ aom_highbd_dc_128_predictor_8x4_sse2,
+ aom_highbd_v_predictor_8x4_sse2,
+ aom_highbd_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_highbd_dc_predictor_8x16_sse2,
+ aom_highbd_dc_left_predictor_8x16_sse2,
+ aom_highbd_dc_top_predictor_8x16_sse2,
+ aom_highbd_dc_128_predictor_8x16_sse2,
+ aom_highbd_v_predictor_8x16_sse2,
+ aom_highbd_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL)
+#endif
+
+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL)
+#endif
+
+// -----------------------------------------------------------------------------
+// 16x16, 16x8, 16x32, 16x4, 16x64
+
+HIGHBD_INTRA_PRED_TEST(
+ C_1, TX_16X16, aom_highbd_dc_predictor_16x16_c,
+ aom_highbd_dc_left_predictor_16x16_c, aom_highbd_dc_top_predictor_16x16_c,
+ aom_highbd_dc_128_predictor_16x16_c, aom_highbd_v_predictor_16x16_c,
+ aom_highbd_h_predictor_16x16_c, aom_highbd_paeth_predictor_16x16_c,
+ aom_highbd_smooth_predictor_16x16_c, aom_highbd_smooth_v_predictor_16x16_c,
+ aom_highbd_smooth_h_predictor_16x16_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_2, TX_16X8, aom_highbd_dc_predictor_16x8_c,
+ aom_highbd_dc_left_predictor_16x8_c, aom_highbd_dc_top_predictor_16x8_c,
+ aom_highbd_dc_128_predictor_16x8_c, aom_highbd_v_predictor_16x8_c,
+ aom_highbd_h_predictor_16x8_c, aom_highbd_paeth_predictor_16x8_c,
+ aom_highbd_smooth_predictor_16x8_c, aom_highbd_smooth_v_predictor_16x8_c,
+ aom_highbd_smooth_h_predictor_16x8_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_3, TX_16X32, aom_highbd_dc_predictor_16x32_c,
+ aom_highbd_dc_left_predictor_16x32_c, aom_highbd_dc_top_predictor_16x32_c,
+ aom_highbd_dc_128_predictor_16x32_c, aom_highbd_v_predictor_16x32_c,
+ aom_highbd_h_predictor_16x32_c, aom_highbd_paeth_predictor_16x32_c,
+ aom_highbd_smooth_predictor_16x32_c, aom_highbd_smooth_v_predictor_16x32_c,
+ aom_highbd_smooth_h_predictor_16x32_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_4, TX_16X4, aom_highbd_dc_predictor_16x4_c,
+ aom_highbd_dc_left_predictor_16x4_c, aom_highbd_dc_top_predictor_16x4_c,
+ aom_highbd_dc_128_predictor_16x4_c, aom_highbd_v_predictor_16x4_c,
+ aom_highbd_h_predictor_16x4_c, aom_highbd_paeth_predictor_16x4_c,
+ aom_highbd_smooth_predictor_16x4_c, aom_highbd_smooth_v_predictor_16x4_c,
+ aom_highbd_smooth_h_predictor_16x4_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_5, TX_16X64, aom_highbd_dc_predictor_16x64_c,
+ aom_highbd_dc_left_predictor_16x64_c, aom_highbd_dc_top_predictor_16x64_c,
+ aom_highbd_dc_128_predictor_16x64_c, aom_highbd_v_predictor_16x64_c,
+ aom_highbd_h_predictor_16x64_c, aom_highbd_paeth_predictor_16x64_c,
+ aom_highbd_smooth_predictor_16x64_c, aom_highbd_smooth_v_predictor_16x64_c,
+ aom_highbd_smooth_h_predictor_16x64_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_highbd_dc_predictor_16x16_sse2,
+ aom_highbd_dc_left_predictor_16x16_sse2,
+ aom_highbd_dc_top_predictor_16x16_sse2,
+ aom_highbd_dc_128_predictor_16x16_sse2,
+ aom_highbd_v_predictor_16x16_sse2,
+ aom_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL,
+ NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_highbd_dc_predictor_16x8_sse2,
+ aom_highbd_dc_left_predictor_16x8_sse2,
+ aom_highbd_dc_top_predictor_16x8_sse2,
+ aom_highbd_dc_128_predictor_16x8_sse2,
+ aom_highbd_v_predictor_16x8_sse2,
+ aom_highbd_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_highbd_dc_predictor_16x32_sse2,
+ aom_highbd_dc_left_predictor_16x32_sse2,
+ aom_highbd_dc_top_predictor_16x32_sse2,
+ aom_highbd_dc_128_predictor_16x32_sse2,
+ aom_highbd_v_predictor_16x32_sse2,
+ aom_highbd_h_predictor_16x32_sse2, NULL, NULL, NULL,
+ NULL)
+#endif
+
+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL)
+#endif
+
+#if HAVE_AVX2
+HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL)
+#endif
+
+// -----------------------------------------------------------------------------
+// 32x32, 32x16, 32x64, 32x8
+
+HIGHBD_INTRA_PRED_TEST(
+ C_1, TX_32X32, aom_highbd_dc_predictor_32x32_c,
+ aom_highbd_dc_left_predictor_32x32_c, aom_highbd_dc_top_predictor_32x32_c,
+ aom_highbd_dc_128_predictor_32x32_c, aom_highbd_v_predictor_32x32_c,
+ aom_highbd_h_predictor_32x32_c, aom_highbd_paeth_predictor_32x32_c,
+ aom_highbd_smooth_predictor_32x32_c, aom_highbd_smooth_v_predictor_32x32_c,
+ aom_highbd_smooth_h_predictor_32x32_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_2, TX_32X16, aom_highbd_dc_predictor_32x16_c,
+ aom_highbd_dc_left_predictor_32x16_c, aom_highbd_dc_top_predictor_32x16_c,
+ aom_highbd_dc_128_predictor_32x16_c, aom_highbd_v_predictor_32x16_c,
+ aom_highbd_h_predictor_32x16_c, aom_highbd_paeth_predictor_32x16_c,
+ aom_highbd_smooth_predictor_32x16_c, aom_highbd_smooth_v_predictor_32x16_c,
+ aom_highbd_smooth_h_predictor_32x16_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_3, TX_32X64, aom_highbd_dc_predictor_32x64_c,
+ aom_highbd_dc_left_predictor_32x64_c, aom_highbd_dc_top_predictor_32x64_c,
+ aom_highbd_dc_128_predictor_32x64_c, aom_highbd_v_predictor_32x64_c,
+ aom_highbd_h_predictor_32x64_c, aom_highbd_paeth_predictor_32x64_c,
+ aom_highbd_smooth_predictor_32x64_c, aom_highbd_smooth_v_predictor_32x64_c,
+ aom_highbd_smooth_h_predictor_32x64_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_4, TX_32X8, aom_highbd_dc_predictor_32x8_c,
+ aom_highbd_dc_left_predictor_32x8_c, aom_highbd_dc_top_predictor_32x8_c,
+ aom_highbd_dc_128_predictor_32x8_c, aom_highbd_v_predictor_32x8_c,
+ aom_highbd_h_predictor_32x8_c, aom_highbd_paeth_predictor_32x8_c,
+ aom_highbd_smooth_predictor_32x8_c, aom_highbd_smooth_v_predictor_32x8_c,
+ aom_highbd_smooth_h_predictor_32x8_c)
+
+#if HAVE_SSE2
+HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_highbd_dc_predictor_32x32_sse2,
+ aom_highbd_dc_left_predictor_32x32_sse2,
+ aom_highbd_dc_top_predictor_32x32_sse2,
+ aom_highbd_dc_128_predictor_32x32_sse2,
+ aom_highbd_v_predictor_32x32_sse2,
+ aom_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL,
+ NULL)
+HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_highbd_dc_predictor_32x16_sse2,
+ aom_highbd_dc_left_predictor_32x16_sse2,
+ aom_highbd_dc_top_predictor_32x16_sse2,
+ aom_highbd_dc_128_predictor_32x16_sse2,
+ aom_highbd_v_predictor_32x16_sse2,
+ aom_highbd_h_predictor_32x16_sse2, NULL, NULL, NULL,
+ NULL)
+#endif
+
+#if HAVE_SSSE3
+HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL)
+#endif
+
+#if HAVE_AVX2
+HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL)
+
+HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL)
+#endif
+
+// -----------------------------------------------------------------------------
+// 64x64, 64x32, 64x16
+
+HIGHBD_INTRA_PRED_TEST(
+ C_1, TX_64X64, aom_highbd_dc_predictor_64x64_c,
+ aom_highbd_dc_left_predictor_64x64_c, aom_highbd_dc_top_predictor_64x64_c,
+ aom_highbd_dc_128_predictor_64x64_c, aom_highbd_v_predictor_64x64_c,
+ aom_highbd_h_predictor_64x64_c, aom_highbd_paeth_predictor_64x64_c,
+ aom_highbd_smooth_predictor_64x64_c, aom_highbd_smooth_v_predictor_64x64_c,
+ aom_highbd_smooth_h_predictor_64x64_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_2, TX_64X32, aom_highbd_dc_predictor_64x32_c,
+ aom_highbd_dc_left_predictor_64x32_c, aom_highbd_dc_top_predictor_64x32_c,
+ aom_highbd_dc_128_predictor_64x32_c, aom_highbd_v_predictor_64x32_c,
+ aom_highbd_h_predictor_64x32_c, aom_highbd_paeth_predictor_64x32_c,
+ aom_highbd_smooth_predictor_64x32_c, aom_highbd_smooth_v_predictor_64x32_c,
+ aom_highbd_smooth_h_predictor_64x32_c)
+
+HIGHBD_INTRA_PRED_TEST(
+ C_3, TX_64X16, aom_highbd_dc_predictor_64x16_c,
+ aom_highbd_dc_left_predictor_64x16_c, aom_highbd_dc_top_predictor_64x16_c,
+ aom_highbd_dc_128_predictor_64x16_c, aom_highbd_v_predictor_64x16_c,
+ aom_highbd_h_predictor_64x16_c, aom_highbd_paeth_predictor_64x16_c,
+ aom_highbd_smooth_predictor_64x16_c, aom_highbd_smooth_v_predictor_64x16_c,
+ aom_highbd_smooth_h_predictor_64x16_c)
+
+// -----------------------------------------------------------------------------
+
+#include "test/test_libaom.cc"
diff --git a/third_party/aom/test/test_libaom.cc b/third_party/aom/test/test_libaom.cc
new file mode 100644
index 000000000..b55d76237
--- /dev/null
+++ b/third_party/aom/test/test_libaom.cc
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+#include "aom_ports/x86.h"
+#endif
+extern "C" {
+extern void av1_rtcd();
+extern void aom_dsp_rtcd();
+extern void aom_scale_rtcd();
+}
+
+#if ARCH_X86 || ARCH_X86_64
+static void append_negative_gtest_filter(const char *str) {
+ std::string filter = ::testing::FLAGS_gtest_filter;
+ // Negative patterns begin with one '-' followed by a ':' separated list.
+ if (filter.find('-') == std::string::npos) filter += '-';
+ // OPT.* matches TEST() functions
+ // OPT/* matches TEST_P() functions
+ // OPT_* matches tests which have been manually sharded.
+ // We do not match OPT* because of SSE/SSE2 collisions.
+ const char *search_terminators = "./_";
+ for (size_t pos = 0; pos < strlen(search_terminators); ++pos) {
+ filter += ":";
+ filter += str;
+ filter += search_terminators[pos];
+ filter += "*";
+ }
+ ::testing::FLAGS_gtest_filter = filter;
+}
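+// Example (illustration only, not part of the original patch): starting from
+// the default filter "*", append_negative_gtest_filter("AVX2") produces
+// "*-:AVX2.*:AVX2/*:AVX2_*", which excludes every AVX2 test on machines
+// without AVX2 support.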
+#endif // ARCH_X86 || ARCH_X86_64
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+
+#if ARCH_X86 || ARCH_X86_64
+ const int simd_caps = x86_simd_caps();
+ if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter("MMX");
+ if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter("SSE");
+ if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter("SSE2");
+ if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter("SSE3");
+ if (!(simd_caps & HAS_SSSE3)) append_negative_gtest_filter("SSSE3");
+ if (!(simd_caps & HAS_SSE4_1)) append_negative_gtest_filter("SSE4_1");
+ if (!(simd_caps & HAS_SSE4_2)) append_negative_gtest_filter("SSE4_2");
+ if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter("AVX");
+ if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter("AVX2");
+#endif // ARCH_X86 || ARCH_X86_64
+
+// Shared library builds don't support whitebox tests that exercise internal
+// symbols.
+#if !CONFIG_SHARED
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+#endif // !CONFIG_SHARED
+
+ return RUN_ALL_TESTS();
+}
diff --git a/third_party/aom/test/test_runner.cmake b/third_party/aom/test/test_runner.cmake
new file mode 100644
index 000000000..d3747b1e3
--- /dev/null
+++ b/third_party/aom/test/test_runner.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(NOT GTEST_TOTAL_SHARDS OR "${GTEST_SHARD_INDEX}" STREQUAL "" OR NOT
+ TEST_LIBAOM)
+ message(
+ FATAL_ERROR
+ "The variables GTEST_SHARD_INDEX, GTEST_TOTAL_SHARDS and TEST_LIBAOM
+ must be defined."
+ )
+endif()
+
+# Export the shard configuration to the environment of the test process.
+set(ENV{GTEST_SHARD_INDEX} ${GTEST_SHARD_INDEX})
+set(ENV{GTEST_TOTAL_SHARDS} ${GTEST_TOTAL_SHARDS})
+execute_process(COMMAND ${TEST_LIBAOM} RESULT_VARIABLE test_result)
+set(test_message "Test shard ${GTEST_SHARD_INDEX}/${GTEST_TOTAL_SHARDS} result")
+message("${test_message}: ${test_result}")
+
+if(NOT "${test_result}" STREQUAL "0")
+ message(FATAL_ERROR "${test_message}: FAILED, non-zero exit code.")
+endif()
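+
+# Typical invocation (illustration only; the values below are assumptions, not
+# part of the original patch):
+#   cmake -DGTEST_SHARD_INDEX=0 -DGTEST_TOTAL_SHARDS=10 \
+#     -DTEST_LIBAOM=./test_libaom -P test_runner.cmake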
diff --git a/third_party/aom/test/test_vector_test.cc b/third_party/aom/test/test_vector_test.cc
new file mode 100644
index 000000000..286988b17
--- /dev/null
+++ b/third_party/aom/test/test_vector_test.cc
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <set>
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kThreads = 0;
+const int kFileName = 1;
+const int kRowMT = 2;
+
+typedef ::testing::tuple<int, const char *, int> DecodeParam;
+
+class TestVectorTest : public ::libaom_test::DecoderTest,
+ public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+ TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {}
+
+ virtual ~TestVectorTest() {
+ if (md5_file_) fclose(md5_file_);
+ }
+
+ void OpenMD5File(const std::string &md5_file_name_) {
+ md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+ ASSERT_TRUE(md5_file_ != NULL)
+ << "Md5 file open failed. Filename: " << md5_file_name_;
+ }
+
+ virtual void PreDecodeFrameHook(
+ const libaom_test::CompressedVideoSource &video,
+ libaom_test::Decoder *decoder) {
+ if (video.frame_number() == 0) decoder->Control(AV1D_SET_ROW_MT, row_mt_);
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ const unsigned int frame_number) {
+ ASSERT_TRUE(md5_file_ != NULL);
+ char expected_md5[33];
+ char junk[128];
+
+ // Read correct md5 checksums.
+ const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+ ASSERT_NE(res, EOF) << "Read md5 data failed";
+ expected_md5[32] = '\0';
+
+ ::libaom_test::MD5 md5_res;
+#if !CONFIG_LOWBITDEPTH
+ const aom_img_fmt_t shifted_fmt =
+ (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
+ if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
+ aom_image_t *img_shifted =
+ aom_img_alloc(NULL, shifted_fmt, img.d_w, img.d_h, 16);
+ img_shifted->bit_depth = img.bit_depth;
+ img_shifted->monochrome = img.monochrome;
+ aom_img_downshift(img_shifted, &img, 0);
+ md5_res.Add(img_shifted);
+ aom_img_free(img_shifted);
+ } else {
+#endif
+ md5_res.Add(&img);
+#if !CONFIG_LOWBITDEPTH
+ }
+#endif
+
+ const char *actual_md5 = md5_res.Get();
+ // Check md5 match.
+ ASSERT_STREQ(expected_md5, actual_md5)
+ << "Md5 checksums don't match: frame number = " << frame_number;
+ }
+
+ unsigned int row_mt_;
+
+ private:
+ FILE *md5_file_;
+};
+
+// This test runs through the whole set of test vectors and decodes them.
+// An md5 checksum is computed for each frame of the video file and compared
+// against the expected md5 data; the test passes only if every frame's
+// checksum matches.
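+// Each line of the .md5 file is assumed to contain a 32-character md5 digest
+// followed by a second token (e.g. a frame or file label) that the reader
+// below discards.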
+TEST_P(TestVectorTest, MD5Match) {
+ const DecodeParam input = GET_PARAM(1);
+ const std::string filename = ::testing::get<kFileName>(input);
+ aom_codec_flags_t flags = 0;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ char str[256];
+
+ cfg.threads = ::testing::get<kThreads>(input);
+ row_mt_ = ::testing::get<kRowMT>(input);
+
+ snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
+ filename.c_str(), cfg.threads);
+ SCOPED_TRACE(str);
+
+ // Open compressed video file.
+ testing::internal::scoped_ptr<libaom_test::CompressedVideoSource> video;
+ if (filename.substr(filename.length() - 3, 3) == "ivf") {
+ video.reset(new libaom_test::IVFVideoSource(filename));
+ } else if (filename.substr(filename.length() - 4, 4) == "webm" ||
+ filename.substr(filename.length() - 3, 3) == "mkv") {
+#if CONFIG_WEBM_IO
+ video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+ fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+ filename.c_str());
+ return;
+#endif
+ }
+ ASSERT_TRUE(video.get() != NULL);
+ video->Init();
+
+ // Construct md5 file name.
+ const std::string md5_filename = filename + ".md5";
+ OpenMD5File(md5_filename);
+
+ // Set decode config and flags.
+ cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH;
+ set_cfg(cfg);
+ set_flags(flags);
+
+ // Decode frame, and check the md5 matching.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
+}
+
+#if CONFIG_AV1_DECODER
+AV1_INSTANTIATE_TEST_CASE(
+ TestVectorTest,
+ ::testing::Combine(::testing::Values(1), // Single thread.
+ ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+ libaom_test::kAV1TestVectors +
+ libaom_test::kNumAV1TestVectors),
+ ::testing::Values(0)));
+
+// Test AV1 decoding with different numbers of threads.
+INSTANTIATE_TEST_CASE_P(
+ AV1MultiThreaded, TestVectorTest,
+ ::testing::Combine(
+ ::testing::Values(
+ static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+ ::testing::Combine(
+ ::testing::Range(2, 9), // With 2 ~ 8 threads.
+ ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+ libaom_test::kAV1TestVectors +
+ libaom_test::kNumAV1TestVectors),
+ ::testing::Range(0, 2))));
+
+#endif // CONFIG_AV1_DECODER
+
+} // namespace
diff --git a/third_party/aom/test/test_vectors.cc b/third_party/aom/test/test_vectors.cc
new file mode 100644
index 000000000..71e431e18
--- /dev/null
+++ b/third_party/aom/test/test_vectors.cc
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/test_vectors.h"
+
+namespace libaom_test {
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+#if CONFIG_AV1_DECODER
+const char *const kAV1TestVectors[] = {
+ "av1-1-b8-00-quantizer-00.ivf", "av1-1-b8-00-quantizer-01.ivf",
+ "av1-1-b8-00-quantizer-02.ivf", "av1-1-b8-00-quantizer-03.ivf",
+ "av1-1-b8-00-quantizer-04.ivf", "av1-1-b8-00-quantizer-05.ivf",
+ "av1-1-b8-00-quantizer-06.ivf", "av1-1-b8-00-quantizer-07.ivf",
+ "av1-1-b8-00-quantizer-08.ivf", "av1-1-b8-00-quantizer-09.ivf",
+ "av1-1-b8-00-quantizer-10.ivf", "av1-1-b8-00-quantizer-11.ivf",
+ "av1-1-b8-00-quantizer-12.ivf", "av1-1-b8-00-quantizer-13.ivf",
+ "av1-1-b8-00-quantizer-14.ivf", "av1-1-b8-00-quantizer-15.ivf",
+ "av1-1-b8-00-quantizer-16.ivf", "av1-1-b8-00-quantizer-17.ivf",
+ "av1-1-b8-00-quantizer-18.ivf", "av1-1-b8-00-quantizer-19.ivf",
+ "av1-1-b8-00-quantizer-20.ivf", "av1-1-b8-00-quantizer-21.ivf",
+ "av1-1-b8-00-quantizer-22.ivf", "av1-1-b8-00-quantizer-23.ivf",
+ "av1-1-b8-00-quantizer-24.ivf", "av1-1-b8-00-quantizer-25.ivf",
+ "av1-1-b8-00-quantizer-26.ivf", "av1-1-b8-00-quantizer-27.ivf",
+ "av1-1-b8-00-quantizer-28.ivf", "av1-1-b8-00-quantizer-29.ivf",
+ "av1-1-b8-00-quantizer-30.ivf", "av1-1-b8-00-quantizer-31.ivf",
+ "av1-1-b8-00-quantizer-32.ivf", "av1-1-b8-00-quantizer-33.ivf",
+ "av1-1-b8-00-quantizer-34.ivf", "av1-1-b8-00-quantizer-35.ivf",
+ "av1-1-b8-00-quantizer-36.ivf", "av1-1-b8-00-quantizer-37.ivf",
+ "av1-1-b8-00-quantizer-38.ivf", "av1-1-b8-00-quantizer-39.ivf",
+ "av1-1-b8-00-quantizer-40.ivf", "av1-1-b8-00-quantizer-41.ivf",
+ "av1-1-b8-00-quantizer-42.ivf", "av1-1-b8-00-quantizer-43.ivf",
+ "av1-1-b8-00-quantizer-44.ivf", "av1-1-b8-00-quantizer-45.ivf",
+ "av1-1-b8-00-quantizer-46.ivf", "av1-1-b8-00-quantizer-47.ivf",
+ "av1-1-b8-00-quantizer-48.ivf", "av1-1-b8-00-quantizer-49.ivf",
+ "av1-1-b8-00-quantizer-50.ivf", "av1-1-b8-00-quantizer-51.ivf",
+ "av1-1-b8-00-quantizer-52.ivf", "av1-1-b8-00-quantizer-53.ivf",
+ "av1-1-b8-00-quantizer-54.ivf", "av1-1-b8-00-quantizer-55.ivf",
+ "av1-1-b8-00-quantizer-56.ivf", "av1-1-b8-00-quantizer-57.ivf",
+ "av1-1-b8-00-quantizer-58.ivf", "av1-1-b8-00-quantizer-59.ivf",
+ "av1-1-b8-00-quantizer-60.ivf", "av1-1-b8-00-quantizer-61.ivf",
+ "av1-1-b8-00-quantizer-62.ivf", "av1-1-b8-00-quantizer-63.ivf",
+ "av1-1-b10-00-quantizer-00.ivf", "av1-1-b10-00-quantizer-01.ivf",
+ "av1-1-b10-00-quantizer-02.ivf", "av1-1-b10-00-quantizer-03.ivf",
+ "av1-1-b10-00-quantizer-04.ivf", "av1-1-b10-00-quantizer-05.ivf",
+ "av1-1-b10-00-quantizer-06.ivf", "av1-1-b10-00-quantizer-07.ivf",
+ "av1-1-b10-00-quantizer-08.ivf", "av1-1-b10-00-quantizer-09.ivf",
+ "av1-1-b10-00-quantizer-10.ivf", "av1-1-b10-00-quantizer-11.ivf",
+ "av1-1-b10-00-quantizer-12.ivf", "av1-1-b10-00-quantizer-13.ivf",
+ "av1-1-b10-00-quantizer-14.ivf", "av1-1-b10-00-quantizer-15.ivf",
+ "av1-1-b10-00-quantizer-16.ivf", "av1-1-b10-00-quantizer-17.ivf",
+ "av1-1-b10-00-quantizer-18.ivf", "av1-1-b10-00-quantizer-19.ivf",
+ "av1-1-b10-00-quantizer-20.ivf", "av1-1-b10-00-quantizer-21.ivf",
+ "av1-1-b10-00-quantizer-22.ivf", "av1-1-b10-00-quantizer-23.ivf",
+ "av1-1-b10-00-quantizer-24.ivf", "av1-1-b10-00-quantizer-25.ivf",
+ "av1-1-b10-00-quantizer-26.ivf", "av1-1-b10-00-quantizer-27.ivf",
+ "av1-1-b10-00-quantizer-28.ivf", "av1-1-b10-00-quantizer-29.ivf",
+ "av1-1-b10-00-quantizer-30.ivf", "av1-1-b10-00-quantizer-31.ivf",
+ "av1-1-b10-00-quantizer-32.ivf", "av1-1-b10-00-quantizer-33.ivf",
+ "av1-1-b10-00-quantizer-34.ivf", "av1-1-b10-00-quantizer-35.ivf",
+ "av1-1-b10-00-quantizer-36.ivf", "av1-1-b10-00-quantizer-37.ivf",
+ "av1-1-b10-00-quantizer-38.ivf", "av1-1-b10-00-quantizer-39.ivf",
+ "av1-1-b10-00-quantizer-40.ivf", "av1-1-b10-00-quantizer-41.ivf",
+ "av1-1-b10-00-quantizer-42.ivf", "av1-1-b10-00-quantizer-43.ivf",
+ "av1-1-b10-00-quantizer-44.ivf", "av1-1-b10-00-quantizer-45.ivf",
+ "av1-1-b10-00-quantizer-46.ivf", "av1-1-b10-00-quantizer-47.ivf",
+ "av1-1-b10-00-quantizer-48.ivf", "av1-1-b10-00-quantizer-49.ivf",
+ "av1-1-b10-00-quantizer-50.ivf", "av1-1-b10-00-quantizer-51.ivf",
+ "av1-1-b10-00-quantizer-52.ivf", "av1-1-b10-00-quantizer-53.ivf",
+ "av1-1-b10-00-quantizer-54.ivf", "av1-1-b10-00-quantizer-55.ivf",
+ "av1-1-b10-00-quantizer-56.ivf", "av1-1-b10-00-quantizer-57.ivf",
+ "av1-1-b10-00-quantizer-58.ivf", "av1-1-b10-00-quantizer-59.ivf",
+ "av1-1-b10-00-quantizer-60.ivf", "av1-1-b10-00-quantizer-61.ivf",
+ "av1-1-b10-00-quantizer-62.ivf", "av1-1-b10-00-quantizer-63.ivf",
+ "av1-1-b8-01-size-16x16.ivf", "av1-1-b8-01-size-16x18.ivf",
+ "av1-1-b8-01-size-16x32.ivf", "av1-1-b8-01-size-16x34.ivf",
+ "av1-1-b8-01-size-16x64.ivf", "av1-1-b8-01-size-16x66.ivf",
+ "av1-1-b8-01-size-18x16.ivf", "av1-1-b8-01-size-18x18.ivf",
+ "av1-1-b8-01-size-18x32.ivf", "av1-1-b8-01-size-18x34.ivf",
+ "av1-1-b8-01-size-18x64.ivf", "av1-1-b8-01-size-18x66.ivf",
+ "av1-1-b8-01-size-196x196.ivf", "av1-1-b8-01-size-196x198.ivf",
+ "av1-1-b8-01-size-196x200.ivf", "av1-1-b8-01-size-196x202.ivf",
+ "av1-1-b8-01-size-196x208.ivf", "av1-1-b8-01-size-196x210.ivf",
+ "av1-1-b8-01-size-196x224.ivf", "av1-1-b8-01-size-196x226.ivf",
+ "av1-1-b8-01-size-198x196.ivf", "av1-1-b8-01-size-198x198.ivf",
+ "av1-1-b8-01-size-198x200.ivf", "av1-1-b8-01-size-198x202.ivf",
+ "av1-1-b8-01-size-198x208.ivf", "av1-1-b8-01-size-198x210.ivf",
+ "av1-1-b8-01-size-198x224.ivf", "av1-1-b8-01-size-198x226.ivf",
+ "av1-1-b8-01-size-200x196.ivf", "av1-1-b8-01-size-200x198.ivf",
+ "av1-1-b8-01-size-200x200.ivf", "av1-1-b8-01-size-200x202.ivf",
+ "av1-1-b8-01-size-200x208.ivf", "av1-1-b8-01-size-200x210.ivf",
+ "av1-1-b8-01-size-200x224.ivf", "av1-1-b8-01-size-200x226.ivf",
+ "av1-1-b8-01-size-202x196.ivf", "av1-1-b8-01-size-202x198.ivf",
+ "av1-1-b8-01-size-202x200.ivf", "av1-1-b8-01-size-202x202.ivf",
+ "av1-1-b8-01-size-202x208.ivf", "av1-1-b8-01-size-202x210.ivf",
+ "av1-1-b8-01-size-202x224.ivf", "av1-1-b8-01-size-202x226.ivf",
+ "av1-1-b8-01-size-208x196.ivf", "av1-1-b8-01-size-208x198.ivf",
+ "av1-1-b8-01-size-208x200.ivf", "av1-1-b8-01-size-208x202.ivf",
+ "av1-1-b8-01-size-208x208.ivf", "av1-1-b8-01-size-208x210.ivf",
+ "av1-1-b8-01-size-208x224.ivf", "av1-1-b8-01-size-208x226.ivf",
+ "av1-1-b8-01-size-210x196.ivf", "av1-1-b8-01-size-210x198.ivf",
+ "av1-1-b8-01-size-210x200.ivf", "av1-1-b8-01-size-210x202.ivf",
+ "av1-1-b8-01-size-210x208.ivf", "av1-1-b8-01-size-210x210.ivf",
+ "av1-1-b8-01-size-210x224.ivf", "av1-1-b8-01-size-210x226.ivf",
+ "av1-1-b8-01-size-224x196.ivf", "av1-1-b8-01-size-224x198.ivf",
+ "av1-1-b8-01-size-224x200.ivf", "av1-1-b8-01-size-224x202.ivf",
+ "av1-1-b8-01-size-224x208.ivf", "av1-1-b8-01-size-224x210.ivf",
+ "av1-1-b8-01-size-224x224.ivf", "av1-1-b8-01-size-224x226.ivf",
+ "av1-1-b8-01-size-226x196.ivf", "av1-1-b8-01-size-226x198.ivf",
+ "av1-1-b8-01-size-226x200.ivf", "av1-1-b8-01-size-226x202.ivf",
+ "av1-1-b8-01-size-226x208.ivf", "av1-1-b8-01-size-226x210.ivf",
+ "av1-1-b8-01-size-226x224.ivf", "av1-1-b8-01-size-226x226.ivf",
+ "av1-1-b8-01-size-32x16.ivf", "av1-1-b8-01-size-32x18.ivf",
+ "av1-1-b8-01-size-32x32.ivf", "av1-1-b8-01-size-32x34.ivf",
+ "av1-1-b8-01-size-32x64.ivf", "av1-1-b8-01-size-32x66.ivf",
+ "av1-1-b8-01-size-34x16.ivf", "av1-1-b8-01-size-34x18.ivf",
+ "av1-1-b8-01-size-34x32.ivf", "av1-1-b8-01-size-34x34.ivf",
+ "av1-1-b8-01-size-34x64.ivf", "av1-1-b8-01-size-34x66.ivf",
+ "av1-1-b8-01-size-64x16.ivf", "av1-1-b8-01-size-64x18.ivf",
+ "av1-1-b8-01-size-64x32.ivf", "av1-1-b8-01-size-64x34.ivf",
+ "av1-1-b8-01-size-64x64.ivf", "av1-1-b8-01-size-64x66.ivf",
+ "av1-1-b8-01-size-66x16.ivf", "av1-1-b8-01-size-66x18.ivf",
+ "av1-1-b8-01-size-66x32.ivf", "av1-1-b8-01-size-66x34.ivf",
+ "av1-1-b8-01-size-66x64.ivf", "av1-1-b8-01-size-66x66.ivf",
+ "av1-1-b8-02-allintra.ivf", "av1-1-b8-03-sizedown.mkv",
+ "av1-1-b8-03-sizeup.mkv"
+};
+const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
+#endif // CONFIG_AV1_DECODER
+
+} // namespace libaom_test
diff --git a/third_party/aom/test/test_vectors.h b/third_party/aom/test/test_vectors.h
new file mode 100644
index 000000000..be37f6e37
--- /dev/null
+++ b/third_party/aom/test/test_vectors.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_TEST_VECTORS_H_
+#define AOM_TEST_TEST_VECTORS_H_
+
+#include "config/aom_config.h"
+
+namespace libaom_test {
+
+#if CONFIG_AV1_DECODER
+extern const int kNumAV1TestVectors;
+extern const char *const kAV1TestVectors[];
+#endif
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_TEST_VECTORS_H_
diff --git a/third_party/aom/test/tile_independence_test.cc b/third_party/aom/test/tile_independence_test.cc
new file mode 100644
index 000000000..cf534c0c5
--- /dev/null
+++ b/third_party/aom/test/tile_independence_test.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/md5_helper.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+class TileIndependenceTest
+ : public ::libaom_test::CodecTestWith3Params<int, int, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ TileIndependenceTest()
+ : EncoderTest(GET_PARAM(0)), md5_fw_order_(), md5_inv_order_(),
+ n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
+ n_tile_groups_(GET_PARAM(3)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = 704;
+ cfg.h = 576;
+ cfg.threads = 1;
+ cfg.allow_lowbitdepth = 1;
+ fw_dec_ = codec_->CreateDecoder(cfg, 0);
+ inv_dec_ = codec_->CreateDecoder(cfg, 0);
+ inv_dec_->Control(AV1_INVERT_TILE_DECODE_ORDER, 1);
+
+ if (fw_dec_->IsAV1() && inv_dec_->IsAV1()) {
+ fw_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ fw_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ inv_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ inv_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+ }
+
+ virtual ~TileIndependenceTest() {
+ delete fw_dec_;
+ delete inv_dec_;
+ }
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(libaom_test::kTwoPassGood);
+ }
+
+ virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+ libaom_test::Encoder *encoder) {
+ if (video->frame() == 1) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+ SetCpuUsed(encoder);
+ } else if (video->frame() == 3) {
+ encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_);
+ }
+ }
+
+ virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+ static const int kCpuUsed = 3;
+ encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+ }
+
+ void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt,
+ ::libaom_test::MD5 *md5) {
+ const aom_codec_err_t res = dec->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ const aom_image_t *img = dec->GetDxData().Next();
+ md5->Add(img);
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
+ UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
+ }
+
+ void DoTest() {
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = 12;
+ cfg_.rc_end_usage = AOM_VBR;
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+ timebase.den, timebase.num, 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ const char *md5_fw_str = md5_fw_order_.Get();
+ const char *md5_inv_str = md5_inv_order_.Get();
+ ASSERT_STREQ(md5_fw_str, md5_inv_str);
+ }
+
+ ::libaom_test::MD5 md5_fw_order_, md5_inv_order_;
+ ::libaom_test::Decoder *fw_dec_, *inv_dec_;
+
+ private:
+ int n_tile_cols_;
+ int n_tile_rows_;
+ int n_tile_groups_;
+};
+
+// Run an encode with 2 or 4 tiles, then decode in both normal and inverted
+// tile ordering. Ensure that the MD5 of the output is identical in both
+// cases. If so, the tiles are considered independent and the test passes.
+TEST_P(TileIndependenceTest, MD5Match) {
+ cfg_.large_scale_tile = 0;
+ fw_dec_->Control(AV1_SET_TILE_MODE, 0);
+ inv_dec_->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+class TileIndependenceTestLarge : public TileIndependenceTest {
+ virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+ static const int kCpuUsed = 0;
+ encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+ }
+};
+
+TEST_P(TileIndependenceTestLarge, MD5Match) {
+ cfg_.large_scale_tile = 0;
+ fw_dec_->Control(AV1_SET_TILE_MODE, 0);
+ inv_dec_->Control(AV1_SET_TILE_MODE, 0);
+ DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(1, 2, 4));
+
+class TileIndependenceLSTest : public TileIndependenceTest {};
+
+TEST_P(TileIndependenceLSTest, MD5Match) {
+ cfg_.large_scale_tile = 1;
+ fw_dec_->Control(AV1_SET_TILE_MODE, 1);
+ fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ inv_dec_->Control(AV1_SET_TILE_MODE, 1);
+ inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ DoTest();
+}
+
+class TileIndependenceLSTestLarge : public TileIndependenceTestLarge {};
+
+TEST_P(TileIndependenceLSTestLarge, MD5Match) {
+ cfg_.large_scale_tile = 1;
+ fw_dec_->Control(AV1_SET_TILE_MODE, 1);
+ fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ inv_dec_->Control(AV1_SET_TILE_MODE, 1);
+ inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1);
+ DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(6),
+ ::testing::Values(6), ::testing::Values(1));
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge, ::testing::Values(6),
+ ::testing::Values(6), ::testing::Values(1));
+} // namespace
diff --git a/third_party/aom/test/tools_common.sh b/third_party/aom/test/tools_common.sh
new file mode 100755
index 000000000..c08710606
--- /dev/null
+++ b/third_party/aom/test/tools_common.sh
@@ -0,0 +1,477 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file contains shell code shared by test scripts for libaom tools.
+
+# Use $AOM_TEST_TOOLS_COMMON_SH as a pseudo include guard.
+if [ -z "${AOM_TEST_TOOLS_COMMON_SH}" ]; then
+AOM_TEST_TOOLS_COMMON_SH=included
+
+set -e
+devnull='> /dev/null 2>&1'
+AOM_TEST_PREFIX=""
+
+elog() {
+ echo "$@" 1>&2
+}
+
+vlog() {
+ if [ "${AOM_TEST_VERBOSE_OUTPUT}" = "yes" ]; then
+ echo "$@"
+ fi
+}
+
+# Sets $AOM_TOOL_TEST to the name specified by positional parameter one.
+test_begin() {
+ AOM_TOOL_TEST="${1}"
+}
+
+# Clears the AOM_TOOL_TEST variable after confirming that $AOM_TOOL_TEST matches
+# positional parameter one.
+test_end() {
+ if [ "$1" != "${AOM_TOOL_TEST}" ]; then
+ echo "FAIL: completed test mismatch."
+ echo " completed test: ${1}"
+ echo " active test: ${AOM_TOOL_TEST}."
+ return 1
+ fi
+ AOM_TOOL_TEST='<unset>'
+}
+
+# Echoes the target configuration being tested.
+test_configuration_target() {
+ aom_config_c="${LIBAOM_CONFIG_PATH}/config/aom_config.c"
+ # Clean up the cfg pointer line from aom_config.c for easier re-use by
+ # someone examining a failure in the example tests.
+ # 1. Use awk to find the line that assigns the cfg pointer, split it on
+ #    ' = ', and print the right-hand side (first match only).
+ # 2. Use sed to strip the leading " and the trailing "; from that value.
+ cmake_config=$(awk -F ' = ' '/cfg/ { print $NF; exit }' "${aom_config_c}" \
+ | sed -e s/\"// -e s/\"\;//)
+ echo cmake generated via command: cmake path/to/aom ${cmake_config}
+}
+
+# Trap function used for failure reports and tool output directory removal.
+# When the contents of $AOM_TOOL_TEST do not match the string '<unset>', reports
+# failure of test stored in $AOM_TOOL_TEST.
+cleanup() {
+ if [ -n "${AOM_TOOL_TEST}" ] && [ "${AOM_TOOL_TEST}" != '<unset>' ]; then
+ echo "FAIL: $AOM_TOOL_TEST"
+ fi
+ if [ "${AOM_TEST_PRESERVE_OUTPUT}" = "yes" ]; then
+ return
+ fi
+ if [ -n "${AOM_TEST_OUTPUT_DIR}" ] && [ -d "${AOM_TEST_OUTPUT_DIR}" ]; then
+ rm -rf "${AOM_TEST_OUTPUT_DIR}"
+ fi
+}
+
+# Echoes the version string assigned to the VERSION_STRING_NOSP variable defined
+# in $LIBAOM_CONFIG_PATH/config/aom_version.h to stdout.
+cmake_version() {
+ aom_version_h="${LIBAOM_CONFIG_PATH}/config/aom_version.h"
+
+ # Find VERSION_STRING_NOSP line, split it with '"' and print the next to last
+ # field to output the version string to stdout.
+ aom_version=$(awk -F \" '/VERSION_STRING_NOSP/ {print $(NF-1)}' \
+ "${aom_version_h}")
+ echo "v${aom_version}"
+}
+
+# Echoes current git version as reported by running 'git describe', or the
+# version used by the cmake build when git is unavailable.
+source_version() {
+ if git --version > /dev/null 2>&1; then
+ (cd "$(dirname "${0}")"
+ git describe)
+ else
+ cmake_version
+ fi
+}
+
+# Echoes warnings to stdout when source version and CMake build generated
+# version are out of sync.
+check_version_strings() {
+ cmake_version=$(cmake_version)
+ source_version=$(source_version)
+
+ if [ "${cmake_version}" != "${source_version}" ]; then
+ echo "Warning: version has changed since last cmake run."
+ vlog " cmake version: ${cmake_version} version now: ${source_version}"
+ fi
+}
+
+# $1 is the name of an environment variable containing a directory name to
+# test.
+test_env_var_dir() {
+ local dir=$(eval echo "\${$1}")
+ if [ ! -d "${dir}" ]; then
+ elog "'${dir}': No such directory"
+ elog "The $1 environment variable must be set to a valid directory."
+ return 1
+ fi
+}
+
+# This script requires that the LIBAOM_BIN_PATH, LIBAOM_CONFIG_PATH, and
+# LIBAOM_TEST_DATA_PATH variables are in the environment: Confirm that
+# the variables are set and that they all evaluate to directory paths.
+verify_aom_test_environment() {
+ test_env_var_dir "LIBAOM_BIN_PATH" \
+ && test_env_var_dir "LIBAOM_CONFIG_PATH" \
+ && test_env_var_dir "LIBAOM_TEST_DATA_PATH"
+}
+
+# Greps aom_config.h in LIBAOM_CONFIG_PATH for positional parameter one, which
+# should be a LIBAOM preprocessor flag. Echoes yes to stdout when the feature
+# is available.
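+# For example, "$(aom_config_option_enabled CONFIG_AV1_DECODER)" expands to
+# "yes" when aom_config.h defines that option to 1, and to an empty string
+# otherwise.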
+aom_config_option_enabled() {
+ aom_config_option="${1}"
+ aom_config_file="${LIBAOM_CONFIG_PATH}/config/aom_config.h"
+ config_line=$(grep "${aom_config_option}" "${aom_config_file}")
+ if echo "${config_line}" | egrep -q '1$'; then
+ echo yes
+ fi
+}
+
+# Echoes yes when output of test_configuration_target() contains win32 or win64.
+is_windows_target() {
+ if test_configuration_target \
+ | grep -q -e win32 -e win64 > /dev/null 2>&1; then
+ echo yes
+ fi
+}
+
+# Echoes the path to $1 when it is executable and exists in one of the
+# directories listed in tool_paths; otherwise echoes nothing and returns 1.
+# The caller is responsible for testing the result.
+aom_tool_path() {
+ local tool_name="$1"
+ local root_path="${LIBAOM_BIN_PATH}"
+ local suffix="${AOM_TEST_EXE_SUFFIX}"
+ local tool_paths="\
+ ${root_path}/${tool_name}${suffix} \
+ ${root_path}/../${tool_name}${suffix} \
+ ${root_path}/tools/${tool_name}${suffix} \
+ ${root_path}/../tools/${tool_name}${suffix}"
+
+ local toolpath=""
+
+ for tool_path in ${tool_paths}; do
+ if [ -x "${tool_path}" ] && [ -f "${tool_path}" ]; then
+ echo "${tool_path}"
+ return 0
+ fi
+ done
+
+ return 1
+}
+
+# Echoes yes to stdout when the file named by positional parameter one exists
+# in LIBAOM_BIN_PATH, and is executable.
+aom_tool_available() {
+ local tool_name="$1"
+ local tool="${LIBAOM_BIN_PATH}/${tool_name}${AOM_TEST_EXE_SUFFIX}"
+ [ -x "${tool}" ] && echo yes
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_AV1_DECODER.
+av1_decode_available() {
+ [ "$(aom_config_option_enabled CONFIG_AV1_DECODER)" = "yes" ] && echo yes
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_AV1_ENCODER.
+av1_encode_available() {
+ [ "$(aom_config_option_enabled CONFIG_AV1_ENCODER)" = "yes" ] && echo yes
+}
+
+# Echoes "fast" encode params for use with aomenc.
+aomenc_encode_test_fast_params() {
+ echo "--cpu-used=1
+ --limit=${AV1_ENCODE_TEST_FRAME_LIMIT}
+ --lag-in-frames=0
+ --test-decode=fatal"
+}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_WEBM_IO.
+webm_io_available() {
+ [ "$(aom_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes
+}
+
+# Filters strings from $1 using the filter specified by $2. Filter behavior
+# depends on the presence of $3. When $3 is present, strings that match the
+# filter are excluded. When $3 is omitted, strings matching the filter are
+# included.
+# The filtered result is echoed to stdout.
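+# Illustrative example: filter_strings "foo_test DISABLED_bar_test" "^DISABLED"
+# echoes only DISABLED_bar_test, while passing any third argument (e.g.
+# "exclude") inverts the match and echoes only foo_test.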
+filter_strings() {
+ strings=${1}
+ filter=${2}
+ exclude=${3}
+
+ if [ -n "${exclude}" ]; then
+ # When positional parameter three exists the caller wants to remove strings.
+ # Tell grep to invert matches using the -v argument.
+ exclude='-v'
+ else
+ unset exclude
+ fi
+
+ if [ -n "${filter}" ]; then
+ for s in ${strings}; do
+ if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then
+ filtered_strings="${filtered_strings} ${s}"
+ fi
+ done
+ else
+ filtered_strings="${strings}"
+ fi
+ echo "${filtered_strings}"
+}
+
+# Runs user test functions passed via positional parameters one and two.
+# Functions in positional parameter one are treated as environment verification
+# functions and are run unconditionally. Functions in positional parameter two
+# are run according to the rules specified in aom_test_usage().
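+# A typical caller (see twopass_encoder.sh in this change) ends with a line of
+# the form:
+#   run_tests some_verify_environment_fn "${some_tests_list}"
+# where the function and list names are placeholders for the script's own.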
+run_tests() {
+ local env_tests="verify_aom_test_environment $1"
+ local tests_to_filter="$2"
+ local test_name="${AOM_TEST_NAME}"
+
+ if [ -z "${test_name}" ]; then
+ test_name="$(basename "${0%.*}")"
+ fi
+
+ if [ "${AOM_TEST_RUN_DISABLED_TESTS}" != "yes" ]; then
+ # Filter out DISABLED tests.
+ tests_to_filter=$(filter_strings "${tests_to_filter}" ^DISABLED exclude)
+ fi
+
+ if [ -n "${AOM_TEST_FILTER}" ]; then
+ # Remove tests not matching the user's filter.
+ tests_to_filter=$(filter_strings "${tests_to_filter}" ${AOM_TEST_FILTER})
+ fi
+
+ # User requested test listing: Dump test names and return.
+ if [ "${AOM_TEST_LIST_TESTS}" = "yes" ]; then
+ for test_name in $tests_to_filter; do
+ echo ${test_name}
+ done
+ return
+ fi
+
+ # Don't bother with the environment tests if everything else was disabled.
+ [ -z "${tests_to_filter}" ] && return
+
+ # Combine environment and actual tests.
+ local tests_to_run="${env_tests} ${tests_to_filter}"
+
+ check_version_strings
+
+ # Run tests.
+ for test in ${tests_to_run}; do
+ test_begin "${test}"
+ vlog " RUN ${test}"
+ "${test}"
+ vlog " PASS ${test}"
+ test_end "${test}"
+ done
+
+ local tested_config="$(test_configuration_target) @ $(source_version)"
+ echo "${test_name}: Done, all tests pass for ${tested_config}."
+}
+
+aom_test_usage() {
+cat << EOF
+ Usage: ${0##*/} [arguments]
+ --bin-path <path to libaom binaries directory>
+ --config-path <path to libaom config directory>
+ --filter <filter>: User test filter. Only tests matching filter are run.
+ --run-disabled-tests: Run disabled tests.
+ --help: Display this message and exit.
+ --test-data-path <path to libaom test data directory>
+ --show-program-output: Shows output from all programs being tested.
+ --prefix: Allows for a user specified prefix to be inserted before all test
+ programs. Grants the ability, for example, to run test programs
+ within valgrind.
+ --list-tests: List all test names and exit without actually running tests.
+ --verbose: Verbose output.
+
+ When the --bin-path option is not specified the script attempts to use
+ \$LIBAOM_BIN_PATH and then the current directory.
+
+ When the --config-path option is not specified the script attempts to use
+ \$LIBAOM_CONFIG_PATH and then the current directory.
+
+ When the --test-data-path option is not specified the script attempts to use
+ \$LIBAOM_TEST_DATA_PATH and then the current directory.
+EOF
+}
+
+# Returns non-zero (failure) when required environment variables are empty
+# strings.
+aom_test_check_environment() {
+ if [ -z "${LIBAOM_BIN_PATH}" ] || \
+ [ -z "${LIBAOM_CONFIG_PATH}" ] || \
+ [ -z "${LIBAOM_TEST_DATA_PATH}" ]; then
+ return 1
+ fi
+}
+
+# Echo aomenc command line parameters allowing use of a raw yuv file as
+# input to aomenc.
+yuv_raw_input() {
+ echo ""${YUV_RAW_INPUT}"
+ --width="${YUV_RAW_INPUT_WIDTH}"
+ --height="${YUV_RAW_INPUT_HEIGHT}""
+}
+
+# Do a small encode for testing decoders.
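+# Illustrative call (assuming aomenc's --ivf output flag):
+#   encode_yuv_raw_input_av1 "${AOM_TEST_OUTPUT_DIR}/av1.ivf" --ivf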
+encode_yuv_raw_input_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ local output="$1"
+ local encoder="$(aom_tool_path aomenc)"
+ shift
+ eval "${encoder}" $(yuv_raw_input) \
+ $(aomenc_encode_test_fast_params) \
+ --output="${output}" \
+ $@ \
+ ${devnull}
+
+ if [ ! -e "${output}" ]; then
+ elog "Output file does not exist."
+ return 1
+ fi
+ fi
+}
+
+# Parse the command line.
+while [ -n "$1" ]; do
+ case "$1" in
+ --bin-path)
+ LIBAOM_BIN_PATH="$2"
+ shift
+ ;;
+ --config-path)
+ LIBAOM_CONFIG_PATH="$2"
+ shift
+ ;;
+ --filter)
+ AOM_TEST_FILTER="$2"
+ shift
+ ;;
+ --run-disabled-tests)
+ AOM_TEST_RUN_DISABLED_TESTS=yes
+ ;;
+ --help)
+ aom_test_usage
+ exit
+ ;;
+ --test-data-path)
+ LIBAOM_TEST_DATA_PATH="$2"
+ shift
+ ;;
+ --prefix)
+ AOM_TEST_PREFIX="$2"
+ shift
+ ;;
+ --verbose)
+ AOM_TEST_VERBOSE_OUTPUT=yes
+ ;;
+ --show-program-output)
+ devnull=
+ ;;
+ --list-tests)
+ AOM_TEST_LIST_TESTS=yes
+ ;;
+ *)
+ aom_test_usage
+ exit 1
+ ;;
+ esac
+ shift
+done
+
+# Handle running the tests from a build directory without arguments on
+# *nix/macOS.
+LIBAOM_BIN_PATH="${LIBAOM_BIN_PATH:-.}"
+LIBAOM_CONFIG_PATH="${LIBAOM_CONFIG_PATH:-.}"
+LIBAOM_TEST_DATA_PATH="${LIBAOM_TEST_DATA_PATH:-.}"
+
+# Create a temporary directory for output files, and a trap to clean it up.
+if [ -n "${TMPDIR}" ]; then
+ AOM_TEST_TEMP_ROOT="${TMPDIR}"
+elif [ -n "${TEMPDIR}" ]; then
+ AOM_TEST_TEMP_ROOT="${TEMPDIR}"
+else
+ AOM_TEST_TEMP_ROOT=/tmp
+fi
+
+AOM_TEST_OUTPUT_DIR="${AOM_TEST_OUTPUT_DIR:-${AOM_TEST_TEMP_ROOT}/aom_test_$$}"
+
+if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \
+ [ ! -d "${AOM_TEST_OUTPUT_DIR}" ]; then
+ echo "${0##*/}: Cannot create output directory, giving up."
+ echo "${0##*/}: AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}"
+ exit 1
+fi
+
+AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT:-no}
+
+if [ "$(is_windows_target)" = "yes" ]; then
+ AOM_TEST_EXE_SUFFIX=".exe"
+fi
+
+# Variables shared by tests.
+AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED:-1}
+AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT:-5}
+AV1_IVF_FILE="${AV1_IVF_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.ivf}"
+AV1_OBU_ANNEXB_FILE="${AV1_OBU_ANNEXB_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.annexb.obu}"
+AV1_OBU_SEC5_FILE="${AV1_OBU_SEC5_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.section5.obu}"
+AV1_WEBM_FILE="${AV1_WEBM_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.webm}"
+
+YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
+YUV_RAW_INPUT_WIDTH=352
+YUV_RAW_INPUT_HEIGHT=288
+
+Y4M_NOSQ_PAR_INPUT="${LIBAOM_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m"
+Y4M_720P_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_1280_720_30.y4m"
+
+# Setup a trap function to clean up after tests complete.
+trap cleanup EXIT
+
+vlog "$(basename "${0%.*}") test configuration:
+ LIBAOM_BIN_PATH=${LIBAOM_BIN_PATH}
+ LIBAOM_CONFIG_PATH=${LIBAOM_CONFIG_PATH}
+ LIBAOM_TEST_DATA_PATH=${LIBAOM_TEST_DATA_PATH}
+ AOM_TEST_EXE_SUFFIX=${AOM_TEST_EXE_SUFFIX}
+ AOM_TEST_FILTER=${AOM_TEST_FILTER}
+ AOM_TEST_LIST_TESTS=${AOM_TEST_LIST_TESTS}
+ AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}
+ AOM_TEST_PREFIX=${AOM_TEST_PREFIX}
+ AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT}
+ AOM_TEST_RUN_DISABLED_TESTS=${AOM_TEST_RUN_DISABLED_TESTS}
+ AOM_TEST_SHOW_PROGRAM_OUTPUT=${AOM_TEST_SHOW_PROGRAM_OUTPUT}
+ AOM_TEST_TEMP_ROOT=${AOM_TEST_TEMP_ROOT}
+ AOM_TEST_VERBOSE_OUTPUT=${AOM_TEST_VERBOSE_OUTPUT}
+ AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED}
+ AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT}
+ AV1_IVF_FILE=${AV1_IVF_FILE}
+ AV1_OBU_ANNEXB_FILE=${AV1_OBU_ANNEXB_FILE}
+ AV1_OBU_SEC5_FILE=${AV1_OBU_SEC5_FILE}
+ AV1_WEBM_FILE=${AV1_WEBM_FILE}
+ YUV_RAW_INPUT=${YUV_RAW_INPUT}
+ YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH}
+ YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT}
+ Y4M_NOSQ_PAR_INPUT=${Y4M_NOSQ_PAR_INPUT}"
+
+fi # End $AOM_TEST_TOOLS_COMMON_SH pseudo include guard.
diff --git a/third_party/aom/test/transform_test_base.h b/third_party/aom/test/transform_test_base.h
new file mode 100644
index 000000000..8ebcf5ff7
--- /dev/null
+++ b/third_party/aom/test/transform_test_base.h
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_TRANSFORM_TEST_BASE_H_
+#define AOM_TEST_TRANSFORM_TEST_BASE_H_
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom/aom_codec.h"
+#include "aom_dsp/txfm_common.h"
+
+namespace libaom_test {
+
+// Note:
+// The same constant is defined in av1/common/av1_entropy.h and
+// av1/common/entropy.h. The goal is to keep this base class reusable for
+// future codec transform testing, but including either of those headers
+// would cause a compile error when unit testing another codec. Consider
+// moving the definition to a common aom header file.
+const int kDctMaxValue = 16384;
+
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+ TxfmParam *txfm_param);
+
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ const TxfmParam *txfm_param);
+
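+// Concrete transform tests are expected to derive from this class, fill in
+// pitch_, height_, num_coeffs_, bit_depth_, mask_ and fwd_txfm_ref (plus
+// inv_txfm_ref where used), and implement RunFwdTxfm()/RunInvTxfm() for the
+// function under test; the Run*Check() helpers below then compare the result
+// against the reference implementation.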
+class TransformTestBase {
+ public:
+ virtual ~TransformTestBase() {}
+
+ protected:
+ virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
+
+ virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
+
+ void RunAccuracyCheck(uint32_t ref_max_error, double ref_avg_error) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ uint32_t max_error = 0;
+ int64_t total_error = 0;
+ const int count_test_block = 10000;
+
+ int16_t *test_input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ tran_low_t *test_temp_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ uint8_t *dst = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ uint8_t *src = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ uint16_t *dst16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ uint16_t *src16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < num_coeffs_; ++j) {
+ if (bit_depth_ == AOM_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ test_input_block[j] = src16[j] - dst16[j];
+ }
+ }
+
+ ASM_REGISTER_STATE_CHECK(
+ RunFwdTxfm(test_input_block, test_temp_block, pitch_));
+ if (bit_depth_ == AOM_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+ }
+
+ for (int j = 0; j < num_coeffs_; ++j) {
+ const int diff =
+ bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+ const uint32_t error = diff * diff;
+ if (max_error < error) max_error = error;
+ total_error += error;
+ }
+ }
+
+ double avg_error = total_error * 1. / count_test_block / num_coeffs_;
+
+ EXPECT_GE(ref_max_error, max_error)
+ << "Error: FHT/IHT has an individual round trip error > "
+ << ref_max_error;
+
+ EXPECT_GE(ref_avg_error, avg_error)
+ << "Error: FHT/IHT has average round trip error > " << ref_avg_error
+ << " per block";
+
+ aom_free(test_input_block);
+ aom_free(test_temp_block);
+ aom_free(dst);
+ aom_free(src);
+ aom_free(dst16);
+ aom_free(src16);
+ }
+
+ void RunCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ // Use a stride value which is not the width of any transform, to catch
+ // cases where the transforms use the stride incorrectly.
+ int stride = 96;
+
+ int16_t *input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * stride * height_));
+ tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ int j, k;
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int in_idx = j * stride + k;
+ int out_idx = j * pitch_ + k;
+ input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ if (bit_depth_ == AOM_BITS_8) {
+ output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8();
+ } else {
+ output_block[out_idx] = output_ref_block[out_idx] =
+ rnd.Rand16() & mask_;
+ }
+ }
+ }
+
+ fwd_txfm_ref(input_block, output_ref_block, stride, &txfm_param_);
+ ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride));
+
+ // The minimum quant value is 4.
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int out_idx = j * pitch_ + k;
+ ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+ << "Error: not bit-exact result at index: " << out_idx
+ << " at test block: " << i;
+ }
+ }
+ }
+ aom_free(input_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ void RunInvCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ // Use a stride value which is not the width of any transform, to catch
+ // cases where the transforms use the stride incorrectly.
+ int stride = 96;
+
+ int16_t *input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ tran_low_t *trans_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ uint8_t *output_block = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * stride * height_));
+ uint8_t *output_ref_block = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * stride * height_));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ int j, k;
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int in_idx = j * pitch_ + k;
+ int out_idx = j * stride + k;
+ input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ output_ref_block[out_idx] = rnd.Rand16() & mask_;
+ output_block[out_idx] = output_ref_block[out_idx];
+ }
+ }
+
+ fwd_txfm_ref(input_block, trans_block, pitch_, &txfm_param_);
+
+ inv_txfm_ref(trans_block, output_ref_block, stride, &txfm_param_);
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride));
+
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int out_idx = j * stride + k;
+ ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+ << "Error: not bit-exact result at index: " << out_idx
+ << " j = " << j << " k = " << k << " at test block: " << i;
+ }
+ }
+ }
+ aom_free(input_block);
+ aom_free(trans_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ void RunMemCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ int16_t *input_extreme_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ for (int j = 0; j < num_coeffs_; ++j) {
+ input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
+ }
+ if (i == 0) {
+ for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = mask_;
+ } else if (i == 1) {
+ for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = -mask_;
+ }
+
+ fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, &txfm_param_);
+ ASM_REGISTER_STATE_CHECK(
+ RunFwdTxfm(input_extreme_block, output_block, pitch_));
+
+ int row_length = FindRowLength();
+ // The minimum quant value is 4.
+ for (int j = 0; j < num_coeffs_; ++j) {
+ ASSERT_EQ(output_block[j], output_ref_block[j])
+ << "Not bit-exact at test index: " << i << ", "
+ << "j = " << j << std::endl;
+ EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8),
+ abs(output_block[j]))
+ << "Error: NxN FDCT has coefficient larger than N*DCT_MAX_VALUE";
+ }
+ }
+ aom_free(input_extreme_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ void RunInvAccuracyCheck(int limit) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+
+ int16_t *in = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ tran_low_t *coeff = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ uint8_t *dst = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ uint8_t *src = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+
+ uint16_t *dst16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ uint16_t *src16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ for (int j = 0; j < num_coeffs_; ++j) {
+ if (bit_depth_ == AOM_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ in[j] = src[j] - dst[j];
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ in[j] = src16[j] - dst16[j];
+ }
+ }
+
+ fwd_txfm_ref(in, coeff, pitch_, &txfm_param_);
+
+ if (bit_depth_ == AOM_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+ }
+
+ for (int j = 0; j < num_coeffs_; ++j) {
+ const int diff =
+ bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+ const uint32_t error = diff * diff;
+ ASSERT_GE(static_cast<uint32_t>(limit), error)
+ << "Error: 4x4 IDCT has error " << error << " at index " << j;
+ }
+ }
+ aom_free(in);
+ aom_free(coeff);
+ aom_free(dst);
+ aom_free(src);
+ aom_free(src16);
+ aom_free(dst16);
+ }
+
+ int pitch_;
+ int height_;
+ FhtFunc fwd_txfm_ref;
+ IhtFunc inv_txfm_ref;
+ aom_bit_depth_t bit_depth_;
+ int mask_;
+ int num_coeffs_;
+ TxfmParam txfm_param_;
+
+ private:
+ // Assume transform size is 4x4, 8x8, 16x16,...
+ int FindRowLength() const {
+ int row = 4;
+ if (16 == num_coeffs_) {
+ row = 4;
+ } else if (64 == num_coeffs_) {
+ row = 8;
+ } else if (256 == num_coeffs_) {
+ row = 16;
+ } else if (1024 == num_coeffs_) {
+ row = 32;
+ }
+ return row;
+ }
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_TRANSFORM_TEST_BASE_H_
diff --git a/third_party/aom/test/twopass_encoder.sh b/third_party/aom/test/twopass_encoder.sh
new file mode 100755
index 000000000..cca44ced8
--- /dev/null
+++ b/third_party/aom/test/twopass_encoder.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests the libaom twopass_encoder example. To add new tests to this
+## file, do the following:
+## 1. Write a shell function (this is your test).
+## 2. Add the function to twopass_encoder_tests (on a new line).
+##
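+## For example, a new test would follow the same pattern as twopass_encoder_av1
+## below and then be added to the twopass_encoder_tests list at the end of this
+## file.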
+. $(dirname $0)/tools_common.sh
+
+# Environment check: $YUV_RAW_INPUT is required.
+twopass_encoder_verify_environment() {
+ if [ ! -e "${YUV_RAW_INPUT}" ]; then
+ echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH."
+ return 1
+ fi
+}
+
+# Runs twopass_encoder using the codec specified by $1 with a frame limit of
+# 100.
+twopass_encoder() {
+ local encoder="$(aom_tool_path twopass_encoder)"
+ local codec="$1"
+ local output_file="${AOM_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf"
+ local limit=7
+
+ if [ ! -x "${encoder}" ]; then
+ elog "${encoder} does not exist or is not executable."
+ return 1
+ fi
+
+ eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" "${limit}" \
+ ${devnull}
+
+ [ -e "${output_file}" ] || return 1
+}
+
+twopass_encoder_av1() {
+ if [ "$(av1_encode_available)" = "yes" ]; then
+ twopass_encoder av1 || return 1
+ fi
+}
+
+twopass_encoder_tests="twopass_encoder_av1"
+
+run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
diff --git a/third_party/aom/test/util.h b/third_party/aom/test/util.h
new file mode 100644
index 000000000..c3f4e4442
--- /dev/null
+++ b/third_party/aom/test/util.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_UTIL_H_
+#define AOM_TEST_UTIL_H_
+
+#include <stdio.h>
+#include <math.h>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "aom/aom_integer.h"
+#include "aom/aom_image.h"
+#include "aom_ports/aom_timer.h"
+
+// Macros
+#define GET_PARAM(k) ::testing::get<k>(GetParam())
+
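+// Computes PSNR over the Y planes of two images of matching format and size,
+// returning a cap of 100.0 dB when the planes are identical (mse == 0).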
+inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) {
+ assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) &&
+ (img1->d_h == img2->d_h));
+
+ const unsigned int width_y = img1->d_w;
+ const unsigned int height_y = img1->d_h;
+ unsigned int i, j;
+
+ int64_t sqrerr = 0;
+ for (i = 0; i < height_y; ++i)
+ for (j = 0; j < width_y; ++j) {
+ int64_t d = img1->planes[AOM_PLANE_Y][i * img1->stride[AOM_PLANE_Y] + j] -
+ img2->planes[AOM_PLANE_Y][i * img2->stride[AOM_PLANE_Y] + j];
+ sqrerr += d * d;
+ }
+ double mse = static_cast<double>(sqrerr) / (width_y * height_y);
+ double psnr = 100.0;
+ if (mse > 0.0) {
+ psnr = 10 * log10(255.0 * 255.0 / mse);
+ }
+ return psnr;
+}
+
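+// Marks the timer and returns the elapsed time, in microseconds, as a double.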
+static INLINE double get_time_mark(aom_usec_timer *t) {
+ aom_usec_timer_mark(t);
+ return static_cast<double>(aom_usec_timer_elapsed(t));
+}
+
+#endif // AOM_TEST_UTIL_H_
diff --git a/third_party/aom/test/variance_test.cc b/third_party/aom/test/variance_test.cc
new file mode 100644
index 000000000..0df314b0f
--- /dev/null
+++ b/third_party/aom/test/variance_test.cc
@@ -0,0 +1,2064 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+
+namespace {
+
+typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred);
+typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride);
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+typedef unsigned int (*JntSubpixAvgVarMxNFunc)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, uint32_t *sse, const uint8_t *second_pred,
+ const JNT_COMP_PARAMS *jcp_param);
+typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride,
+ int xoffset, int yoffset,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *sse);
+
+using libaom_test::ACMRandom;
+
+// Truncate high bit depth results by downshifting (with rounding) by:
+// 2 * (bit_depth - 8) for sse
+// (bit_depth - 8) for se
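+// For example, at AOM_BITS_10 an sse of 40 becomes (40 + 8) >> 4 = 3 and an
+// se of 10 becomes (10 + 2) >> 2 = 3.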
+static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) {
+ switch (bit_depth) {
+ case AOM_BITS_12:
+ *sse = (*sse + 128) >> 8;
+ *se = (*se + 8) >> 4;
+ break;
+ case AOM_BITS_10:
+ *sse = (*sse + 8) >> 4;
+ *se = (*se + 2) >> 2;
+ break;
+ case AOM_BITS_8:
+ default: break;
+ }
+}
+
+static unsigned int mb_ss_ref(const int16_t *src) {
+ unsigned int res = 0;
+ for (int i = 0; i < 256; ++i) {
+ res += src[i] * src[i];
+ }
+ return res;
+}
+
+/* Note:
+ * Our codebase calculates the "diff" value in the variance algorithm by
+ * (src - ref).
+ */
+static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
+ int l2h, int src_stride, int ref_stride,
+ uint32_t *sse_ptr, bool use_high_bit_depth_,
+ aom_bit_depth_t bit_depth) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ int diff;
+ if (!use_high_bit_depth_) {
+ diff = src[y * src_stride + x] - ref[y * ref_stride + x];
+ se += diff;
+ sse += diff * diff;
+ } else {
+ diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] -
+ CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
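+ // The codec's variance definition: SSE - (sum of diffs)^2 / N, with N = w * h
+ // = 2^(l2w + l2h).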
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+/* The subpel reference functions differ from the codec version in one aspect:
+ * they calculate the bilinear factors directly instead of using a lookup table
+ * and therefore upshift xoff and yoff by 1. Only every other calculated value
+ * is used so the codec version shrinks the table to save space and maintain
+ * compatibility with vp8.
+ */
+static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
+ int l2w, int l2h, int xoff, int yoff,
+ uint32_t *sse_ptr, bool use_high_bit_depth_,
+ aom_bit_depth_t bit_depth) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // Bilinear interpolation at a 16th pel step.
+ if (!use_high_bit_depth_) {
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = r - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ } else {
+ uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = r - src16[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
+ const uint8_t *second_pred, int l2w,
+ int l2h, int xoff, int yoff,
+ uint32_t *sse_ptr,
+ bool use_high_bit_depth,
+ aom_bit_depth_t bit_depth) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step
+ if (!use_high_bit_depth) {
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff =
+ ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ } else {
+ const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+ const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+static uint32_t jnt_subpel_avg_variance_ref(
+ const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w,
+ int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth,
+ aom_bit_depth_t bit_depth, JNT_COMP_PARAMS *jcp_param) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step
+ if (!use_high_bit_depth) {
+ const int a1 = ref[(w + 0) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 0) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 0) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 0) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int avg = ROUND_POWER_OF_TWO(
+ r * jcp_param->fwd_offset +
+ second_pred[w * y + x] * jcp_param->bck_offset,
+ DIST_PRECISION_BITS);
+ const int diff = avg - src[w * y + x];
+
+ se += diff;
+ sse += diff * diff;
+ } else {
+ const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+ const int a1 = ref16[(w + 0) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 0) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 0) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 0) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int avg =
+ ROUND_POWER_OF_TWO(r * jcp_param->fwd_offset +
+ sec16[w * y + x] * jcp_param->bck_offset,
+ DIST_PRECISION_BITS);
+ const int diff = avg - src16[w * y + x];
+
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h,
+ int xoff, int yoff,
+ const int32_t *wsrc,
+ const int32_t *mask, uint32_t *sse_ptr,
+ bool use_high_bit_depth_,
+ aom_bit_depth_t bit_depth) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // Bilinear interpolation at a 16th pel step.
+ if (!use_high_bit_depth_) {
+ const int a1 = pre[(w + 1) * (y + 0) + x + 0];
+ const int a2 = pre[(w + 1) * (y + 0) + x + 1];
+ const int b1 = pre[(w + 1) * (y + 1) + x + 0];
+ const int b2 = pre[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ROUND_POWER_OF_TWO_SIGNED(
+ wsrc[w * y + x] - r * mask[w * y + x], 12);
+ se += diff;
+ sse += diff * diff;
+ } else {
+ uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre);
+ const int a1 = pre16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = pre16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = pre16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = pre16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ROUND_POWER_OF_TWO_SIGNED(
+ wsrc[w * y + x] - r * mask[w * y + x], 12);
+ se += diff;
+ sse += diff * diff;
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
+ public:
+ SumOfSquaresTest() : func_(GetParam()) {}
+
+ virtual ~SumOfSquaresTest() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void ConstTest();
+ void RefTest();
+
+ SumOfSquaresFunction func_;
+ ACMRandom rnd_;
+};
+
+void SumOfSquaresTest::ConstTest() {
+ int16_t mem[256];
+ unsigned int res;
+ for (int v = 0; v < 256; ++v) {
+ for (int i = 0; i < 256; ++i) {
+ mem[i] = v;
+ }
+ ASM_REGISTER_STATE_CHECK(res = func_(mem));
+ EXPECT_EQ(256u * (v * v), res);
+ }
+}
+
+void SumOfSquaresTest::RefTest() {
+ int16_t mem[256];
+ for (int i = 0; i < 100; ++i) {
+ for (int j = 0; j < 256; ++j) {
+ mem[j] = rnd_.Rand8() - rnd_.Rand8();
+ }
+
+ const unsigned int expected = mb_ss_ref(mem);
+ unsigned int res;
+ ASM_REGISTER_STATE_CHECK(res = func_(mem));
+ EXPECT_EQ(expected, res);
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Encapsulating struct to store the function to test along with
+// some testing context.
+// Can be used for MSE, SSE, Variance, etc.
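+// For instance, TestParams<Func>(4, 4, some_16x16_func, 10) (where
+// some_16x16_func is a placeholder for any matching function pointer)
+// describes a 16x16 block tested at 10-bit depth, while a bit_depth_value of 0
+// selects the default 8-bit path.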
+
+template <typename Func>
+struct TestParams {
+ TestParams(int log2w = 0, int log2h = 0, Func function = NULL,
+ int bit_depth_value = 0)
+ : log2width(log2w), log2height(log2h), func(function) {
+ use_high_bit_depth = (bit_depth_value > 0);
+ if (use_high_bit_depth) {
+ bit_depth = static_cast<aom_bit_depth_t>(bit_depth_value);
+ } else {
+ bit_depth = AOM_BITS_8;
+ }
+ width = 1 << log2width;
+ height = 1 << log2height;
+ block_size = width * height;
+ mask = (1u << bit_depth) - 1;
+ }
+
+ int log2width, log2height;
+ int width, height;
+ int block_size;
+ Func func;
+ aom_bit_depth_t bit_depth;
+ bool use_high_bit_depth;
+ uint32_t mask;
+};
+
+template <typename Func>
+std::ostream &operator<<(std::ostream &os, const TestParams<Func> &p) {
+ return os << "width/height:" << p.width << "/" << p.height
+ << " function:" << reinterpret_cast<const void *>(p.func)
+ << " bit-depth:" << p.bit_depth;
+}
+
+// Main class for testing a function type
+template <typename FunctionType>
+class MainTestClass
+ : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+ virtual void SetUp() {
+ params_ = this->GetParam();
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ const size_t unit =
+ use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t);
+ src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size() * unit));
+ ref_ = new uint8_t[block_size() * unit];
+ ASSERT_TRUE(src_ != NULL);
+ ASSERT_TRUE(ref_ != NULL);
+ if (use_high_bit_depth()) {
+ // TODO(skal): remove!
+ src_ = CONVERT_TO_BYTEPTR(src_);
+ ref_ = CONVERT_TO_BYTEPTR(ref_);
+ }
+ }
+
+ virtual void TearDown() {
+ if (use_high_bit_depth()) {
+ // TODO(skal): remove!
+ src_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(src_));
+ ref_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(ref_));
+ }
+
+ aom_free(src_);
+ delete[] ref_;
+ src_ = NULL;
+ ref_ = NULL;
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+  // We could sub-class MainTestClass into dedicated classes for Variance
+  // and MSE/SSE, but that would involve a lot of 'this->xxx' dereferencing
+  // to access the base-class fields. That's cumbersome, so for now we just
+  // implement the testing methods here:
+
+ // Variance tests
+ void ZeroTest();
+ void RefTest();
+ void RefStrideTest();
+ void OneQuarterTest();
+ void SpeedTest();
+
+ // MSE/SSE tests
+ void RefTestMse();
+ void RefTestSse();
+ void MaxTestMse();
+ void MaxTestSse();
+
+ protected:
+ ACMRandom rnd_;
+ uint8_t *src_;
+ uint8_t *ref_;
+ TestParams<FunctionType> params_;
+
+ // some relay helpers
+ bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+ int byte_shift() const { return params_.bit_depth - 8; }
+ int block_size() const { return params_.block_size; }
+ int width() const { return params_.width; }
+ int height() const { return params_.height; }
+ uint32_t mask() const { return params_.mask; }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests related to variance.
+
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::ZeroTest() {
+ for (int i = 0; i <= 255; ++i) {
+ if (!use_high_bit_depth()) {
+ memset(src_, i, block_size());
+ } else {
+ uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_);
+ for (int k = 0; k < block_size(); ++k) src16[k] = i << byte_shift();
+ }
+ for (int j = 0; j <= 255; ++j) {
+ if (!use_high_bit_depth()) {
+ memset(ref_, j, block_size());
+ } else {
+ uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_);
+ for (int k = 0; k < block_size(); ++k) ref16[k] = j << byte_shift();
+ }
+ unsigned int sse, var;
+ ASM_REGISTER_STATE_CHECK(
+ var = params_.func(src_, width(), ref_, width(), &sse));
+ EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
+ }
+ }
+}
+
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::RefTest() {
+ for (int i = 0; i < 10; ++i) {
+ for (int j = 0; j < block_size(); j++) {
+ if (!use_high_bit_depth()) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+ }
+ }
+ unsigned int sse1, sse2, var1, var2;
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(
+ var1 = params_.func(src_, stride, ref_, stride, &sse1));
+ var2 =
+ variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
+ stride, &sse2, use_high_bit_depth(), params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+ EXPECT_EQ(var1, var2) << "Error at test index: " << i;
+ }
+}
+
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::RefStrideTest() {
+ for (int i = 0; i < 10; ++i) {
+ const int ref_stride = (i & 1) * width();
+ const int src_stride = ((i >> 1) & 1) * width();
+ for (int j = 0; j < block_size(); j++) {
+ const int ref_ind = (j / width()) * ref_stride + j % width();
+ const int src_ind = (j / width()) * src_stride + j % width();
+ if (!use_high_bit_depth()) {
+ src_[src_ind] = rnd_.Rand8();
+ ref_[ref_ind] = rnd_.Rand8();
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask();
+ }
+ }
+ unsigned int sse1, sse2;
+ unsigned int var1, var2;
+
+ ASM_REGISTER_STATE_CHECK(
+ var1 = params_.func(src_, src_stride, ref_, ref_stride, &sse1));
+ var2 = variance_ref(src_, ref_, params_.log2width, params_.log2height,
+ src_stride, ref_stride, &sse2, use_high_bit_depth(),
+ params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+ EXPECT_EQ(var1, var2) << "Error at test index: " << i;
+ }
+}
+
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
+ const int half = block_size() / 2;
+ if (!use_high_bit_depth()) {
+ memset(src_, 255, block_size());
+ memset(ref_, 255, half);
+ memset(ref_ + half, 0, half);
+ } else {
+ aom_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+ aom_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half);
+ aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+ }
+ unsigned int sse, var, expected;
+ ASM_REGISTER_STATE_CHECK(
+ var = params_.func(src_, width(), ref_, width(), &sse));
+ expected = block_size() * 255 * 255 / 4;
+ EXPECT_EQ(expected, var);
+}
+
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+ for (int j = 0; j < block_size(); j++) {
+ if (!use_high_bit_depth()) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+ }
+ }
+ unsigned int sse;
+ const int stride = width();
+ int run_time = 1000000000 / block_size();
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_time; ++i) {
+ params_.func(src_, stride, ref_, stride, &sse);
+ }
+
+ aom_usec_timer_mark(&timer);
+ const double elapsed_time =
+ static_cast<double>(aom_usec_timer_elapsed(&timer));
+ printf("Variance %dx%d : %7.2fns\n", width(), height(), elapsed_time);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests related to MSE / SSE.
+
+template <typename FunctionType>
+void MainTestClass<FunctionType>::RefTestMse() {
+ for (int i = 0; i < 10; ++i) {
+ for (int j = 0; j < block_size(); ++j) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+ }
+ unsigned int sse1, sse2;
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
+ variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
+ stride, &sse2, false, AOM_BITS_8);
+ EXPECT_EQ(sse1, sse2);
+ }
+}
+
+template <typename FunctionType>
+void MainTestClass<FunctionType>::RefTestSse() {
+ for (int i = 0; i < 10; ++i) {
+ for (int j = 0; j < block_size(); ++j) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+ }
+ unsigned int sse2;
+ unsigned int var1;
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride));
+ variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
+ stride, &sse2, false, AOM_BITS_8);
+ EXPECT_EQ(var1, sse2);
+ }
+}
+
+template <typename FunctionType>
+void MainTestClass<FunctionType>::MaxTestMse() {
+ memset(src_, 255, block_size());
+ memset(ref_, 0, block_size());
+ unsigned int sse;
+ ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
+ const unsigned int expected = block_size() * 255 * 255;
+ EXPECT_EQ(expected, sse);
+}
+
+template <typename FunctionType>
+void MainTestClass<FunctionType>::MaxTestSse() {
+ memset(src_, 255, block_size());
+ memset(ref_, 0, block_size());
+ unsigned int var;
+ ASM_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width()));
+ const unsigned int expected = block_size() * 255 * 255;
+ EXPECT_EQ(expected, var);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+using ::testing::get;
+using ::testing::make_tuple;
+using ::testing::tuple;
+
+template <typename FunctionType>
+class SubpelVarianceTest
+ : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+ virtual void SetUp() {
+ params_ = this->GetParam();
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
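+    // ref_ is over-allocated by width() + height() + 1 samples: the
+    // sub-pixel functions are exercised with a stride of width() + 1 and the
+    // bilinear filter reads a (width() + 1) x (height() + 1) region.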
+ if (!use_high_bit_depth()) {
+ src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
+ sec_ = reinterpret_cast<uint8_t *>(aom_memalign(32, block_size()));
+ ref_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(32, block_size() + width() + height() + 1));
+ } else {
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(32, block_size() * sizeof(uint16_t))));
+ sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(32, block_size() * sizeof(uint16_t))));
+ ref_ = CONVERT_TO_BYTEPTR(aom_memalign(
+ 32, (block_size() + width() + height() + 1) * sizeof(uint16_t)));
+ }
+ ASSERT_TRUE(src_ != NULL);
+ ASSERT_TRUE(sec_ != NULL);
+ ASSERT_TRUE(ref_ != NULL);
+ }
+
+ virtual void TearDown() {
+ if (!use_high_bit_depth()) {
+ aom_free(src_);
+ aom_free(ref_);
+ aom_free(sec_);
+ } else {
+ aom_free(CONVERT_TO_SHORTPTR(src_));
+ aom_free(CONVERT_TO_SHORTPTR(ref_));
+ aom_free(CONVERT_TO_SHORTPTR(sec_));
+ }
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RefTest();
+ void ExtremeRefTest();
+
+ ACMRandom rnd_;
+ uint8_t *src_;
+ uint8_t *ref_;
+ uint8_t *sec_;
+ TestParams<FunctionType> params_;
+ JNT_COMP_PARAMS jcp_param_;
+
+ // some relay helpers
+ bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+ int byte_shift() const { return params_.bit_depth - 8; }
+ int block_size() const { return params_.block_size; }
+ int width() const { return params_.width; }
+ int height() const { return params_.height; }
+ uint32_t mask() const { return params_.mask; }
+};
+
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
+ for (int x = 0; x < 8; ++x) {
+ for (int y = 0; y < 8; ++y) {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
+ src_[j] = rnd_.Rand8();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ ref_[j] = rnd_.Rand8();
+ }
+ } else {
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+ }
+ }
+ unsigned int sse1, sse2;
+ unsigned int var1;
+ ASM_REGISTER_STATE_CHECK(
+ var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+ const unsigned int var2 = subpel_variance_ref(
+ ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+ EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+ }
+ }
+}
+
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
+  // Compare against the reference implementation using extreme inputs.
+  // Src: Set the first half of values to 0, the second half to the maximum.
+  // Ref: Set the first half of values to the maximum, the second half to 0.
+  // (The high-bit-depth branch fills src/ref with the mirrored pattern,
+  // which is equally extreme.)
+ for (int x = 0; x < 8; ++x) {
+ for (int y = 0; y < 8; ++y) {
+ const int half = block_size() / 2;
+ if (!use_high_bit_depth()) {
+ memset(src_, 0, half);
+ memset(src_ + half, 255, half);
+ memset(ref_, 255, half);
+ memset(ref_ + half, 0, half + width() + height() + 1);
+ } else {
+ aom_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half);
+ aom_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
+ aom_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
+ aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(),
+ half + width() + height() + 1);
+ }
+ unsigned int sse1, sse2;
+ unsigned int var1;
+ ASM_REGISTER_STATE_CHECK(
+ var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+ const unsigned int var2 = subpel_variance_ref(
+ ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+ EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+ }
+ }
+}
+
+template <>
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+ for (int x = 0; x < 8; ++x) {
+ for (int y = 0; y < 8; ++y) {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
+ src_[j] = rnd_.Rand8();
+ sec_[j] = rnd_.Rand8();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ ref_[j] = rnd_.Rand8();
+ }
+ } else {
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+ }
+ }
+ uint32_t sse1, sse2;
+ uint32_t var1, var2;
+ ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+ src_, width(), &sse1, sec_));
+ var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
+ params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+ EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+ }
+ }
+}
+
+template <>
+void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() {
+ for (int x = 0; x < 8; ++x) {
+ for (int y = 0; y < 8; ++y) {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
+ src_[j] = rnd_.Rand8();
+ sec_[j] = rnd_.Rand8();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ ref_[j] = rnd_.Rand8();
+ }
+ } else {
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+ }
+ }
+ for (int x0 = 0; x0 < 2; ++x0) {
+ for (int y0 = 0; y0 < 4; ++y0) {
+ uint32_t sse1, sse2;
+ uint32_t var1, var2;
+ jcp_param_.fwd_offset = quant_dist_lookup_table[x0][y0][0];
+ jcp_param_.bck_offset = quant_dist_lookup_table[x0][y0][1];
+ ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
+ src_, width(), &sse1,
+ sec_, &jcp_param_));
+ var2 = jnt_subpel_avg_variance_ref(
+ ref_, src_, sec_, params_.log2width, params_.log2height, x, y,
+ &sse2, use_high_bit_depth(), params_.bit_depth, &jcp_param_);
+ EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+ EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+ }
+ }
+ }
+ }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+static const int kMaskMax = 64;
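+// The tests below draw mask and weighted-source scale values in the range
+// [0, kMaskMax * kMaskMax]; see rnd_(kMaskMax * kMaskMax + 1) in RefTest.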
+
+typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;
+
+template <typename FunctionType>
+class ObmcVarianceTest
+ : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+ virtual void SetUp() {
+ params_ = this->GetParam();
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ if (!use_high_bit_depth()) {
+ pre_ = reinterpret_cast<uint8_t *>(
+ aom_memalign(32, block_size() + width() + height() + 1));
+ } else {
+ pre_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(aom_memalign(
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t))));
+ }
+ wsrc_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, block_size() * sizeof(uint32_t)));
+ mask_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, block_size() * sizeof(uint32_t)));
+ ASSERT_TRUE(pre_ != NULL);
+ ASSERT_TRUE(wsrc_ != NULL);
+ ASSERT_TRUE(mask_ != NULL);
+ }
+
+ virtual void TearDown() {
+ if (!use_high_bit_depth()) {
+ aom_free(pre_);
+ } else {
+ aom_free(CONVERT_TO_SHORTPTR(pre_));
+ }
+ aom_free(wsrc_);
+ aom_free(mask_);
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RefTest();
+ void ExtremeRefTest();
+ void SpeedTest();
+
+ ACMRandom rnd_;
+ uint8_t *pre_;
+ int32_t *wsrc_;
+ int32_t *mask_;
+ TestParams<FunctionType> params_;
+
+ // some relay helpers
+ bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+ int byte_shift() const { return params_.bit_depth - 8; }
+ int block_size() const { return params_.block_size; }
+ int width() const { return params_.width; }
+ int height() const { return params_.height; }
+ uint32_t bd_mask() const { return params_.mask; }
+};
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::RefTest() {
+ for (int x = 0; x < 8; ++x) {
+ for (int y = 0; y < 8; ++y) {
+ if (!use_high_bit_depth())
+ for (int j = 0; j < block_size() + width() + height() + 1; j++)
+ pre_[j] = rnd_.Rand8();
+ else
+ for (int j = 0; j < block_size() + width() + height() + 1; j++)
+ CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
+ for (int j = 0; j < block_size(); j++) {
+ wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+ mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
+ }
+
+ uint32_t sse1, sse2;
+ uint32_t var1, var2;
+ ASM_REGISTER_STATE_CHECK(
+ var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
+ var2 = obmc_subpel_variance_ref(
+ pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
+ &sse2, use_high_bit_depth(), params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+ EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+ }
+ }
+}
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() {
+  // Pre: Set the first half of values to the maximum, the second half to 0.
+  // WSrc: Set the first half of values to the maximum, the second half to 0.
+  // Mask: Set the first half of values to 0, the second half to the maximum.
+ for (int x = 0; x < 8; ++x) {
+ for (int y = 0; y < 8; ++y) {
+ const int half = block_size() / 2;
+ if (!use_high_bit_depth()) {
+ memset(pre_, 255, half);
+ memset(pre_ + half, 0, half + width() + height() + 1);
+ } else {
+ aom_memset16(CONVERT_TO_SHORTPTR(pre_), bd_mask(), half);
+ aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0, half);
+ }
+ for (int j = 0; j < half; j++) {
+ wsrc_[j] = bd_mask() * kMaskMax * kMaskMax;
+ mask_[j] = 0;
+ }
+ for (int j = half; j < block_size(); j++) {
+ wsrc_[j] = 0;
+ mask_[j] = kMaskMax * kMaskMax;
+ }
+
+ uint32_t sse1, sse2;
+ uint32_t var1, var2;
+ ASM_REGISTER_STATE_CHECK(
+ var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
+ var2 = obmc_subpel_variance_ref(
+ pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
+ &sse2, use_high_bit_depth(), params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+ EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+ }
+ }
+}
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
+ if (!use_high_bit_depth())
+ for (int j = 0; j < block_size() + width() + height() + 1; j++)
+ pre_[j] = rnd_.Rand8();
+ else
+ for (int j = 0; j < block_size() + width() + height() + 1; j++)
+ CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
+ for (int j = 0; j < block_size(); j++) {
+ wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+ mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
+ }
+ unsigned int sse1;
+ const int stride = width() + 1;
+ int run_time = 1000000000 / block_size();
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_time; ++i) {
+ int x = rnd_(8);
+ int y = rnd_(8);
+ ASM_REGISTER_STATE_CHECK(
+ params_.func(pre_, stride, x, y, wsrc_, mask_, &sse1));
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("obmc_sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
+ params_.bit_depth, elapsed_time);
+}
+
+typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<JntSubpixAvgVarMxNFunc> AvxJntSubpelAvgVarianceTest;
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
+
+TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
+TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
+TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
+TEST_P(AvxVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxJntSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
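+// With googletest's value-parameterized tests, the cases above appear under
+// names such as "C/AvxVarianceTest.Ref/0" (instantiation prefix, fixture,
+// test, parameter index), so a single group can be selected with the
+// standard --gtest_filter flag, e.g. --gtest_filter='C/AvxVarianceTest.*'.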
+
+INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
+ ::testing::Values(aom_get_mb_ss_c));
+
+typedef TestParams<Get4x4SseFunc> SseParams;
+INSTANTIATE_TEST_CASE_P(C, AvxSseTest,
+ ::testing::Values(SseParams(2, 2,
+ &aom_get4x4sse_cs_c)));
+
+typedef TestParams<VarianceMxNFunc> MseParams;
+INSTANTIATE_TEST_CASE_P(C, AvxMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_c),
+ MseParams(4, 3, &aom_mse16x8_c),
+ MseParams(3, 4, &aom_mse8x16_c),
+ MseParams(3, 3, &aom_mse8x8_c)));
+
+typedef TestParams<VarianceMxNFunc> VarianceParams;
+INSTANTIATE_TEST_CASE_P(
+ C, AvxVarianceTest,
+ ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
+ VarianceParams(7, 6, &aom_variance128x64_c),
+ VarianceParams(6, 7, &aom_variance64x128_c),
+ VarianceParams(6, 6, &aom_variance64x64_c),
+ VarianceParams(6, 5, &aom_variance64x32_c),
+ VarianceParams(5, 6, &aom_variance32x64_c),
+ VarianceParams(5, 5, &aom_variance32x32_c),
+ VarianceParams(5, 4, &aom_variance32x16_c),
+ VarianceParams(4, 5, &aom_variance16x32_c),
+ VarianceParams(4, 4, &aom_variance16x16_c),
+ VarianceParams(4, 3, &aom_variance16x8_c),
+ VarianceParams(3, 4, &aom_variance8x16_c),
+ VarianceParams(3, 3, &aom_variance8x8_c),
+ VarianceParams(3, 2, &aom_variance8x4_c),
+ VarianceParams(2, 3, &aom_variance4x8_c),
+ VarianceParams(2, 2, &aom_variance4x4_c)));
+
+typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
+INSTANTIATE_TEST_CASE_P(
+ C, AvxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
+ SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
+ SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
+ SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
+ SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0)));
+
+typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
+INSTANTIATE_TEST_CASE_P(
+ C, AvxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0)));
+
+typedef TestParams<JntSubpixAvgVarMxNFunc> JntSubpelAvgVarianceParams;
+INSTANTIATE_TEST_CASE_P(
+ C, AvxJntSubpelAvgVarianceTest,
+ ::testing::Values(
+ JntSubpelAvgVarianceParams(6, 6, &aom_jnt_sub_pixel_avg_variance64x64_c,
+ 0),
+ JntSubpelAvgVarianceParams(6, 5, &aom_jnt_sub_pixel_avg_variance64x32_c,
+ 0),
+ JntSubpelAvgVarianceParams(5, 6, &aom_jnt_sub_pixel_avg_variance32x64_c,
+ 0),
+ JntSubpelAvgVarianceParams(5, 5, &aom_jnt_sub_pixel_avg_variance32x32_c,
+ 0),
+ JntSubpelAvgVarianceParams(5, 4, &aom_jnt_sub_pixel_avg_variance32x16_c,
+ 0),
+ JntSubpelAvgVarianceParams(4, 5, &aom_jnt_sub_pixel_avg_variance16x32_c,
+ 0),
+ JntSubpelAvgVarianceParams(4, 4, &aom_jnt_sub_pixel_avg_variance16x16_c,
+ 0),
+ JntSubpelAvgVarianceParams(4, 3, &aom_jnt_sub_pixel_avg_variance16x8_c,
+ 0),
+ JntSubpelAvgVarianceParams(3, 4, &aom_jnt_sub_pixel_avg_variance8x16_c,
+ 0),
+ JntSubpelAvgVarianceParams(3, 3, &aom_jnt_sub_pixel_avg_variance8x8_c,
+ 0),
+ JntSubpelAvgVarianceParams(3, 2, &aom_jnt_sub_pixel_avg_variance8x4_c,
+ 0),
+ JntSubpelAvgVarianceParams(2, 3, &aom_jnt_sub_pixel_avg_variance4x8_c,
+ 0),
+ JntSubpelAvgVarianceParams(2, 2, &aom_jnt_sub_pixel_avg_variance4x4_c,
+ 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ C, AvxObmcSubpelVarianceTest,
+ ::testing::Values(
+ ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_c,
+ 0),
+ ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_c, 0),
+ ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_c, 0),
+ ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_c, 0),
+ ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_c, 0),
+ ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_c, 0),
+ ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_c, 0),
+ ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_c, 0),
+ ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_c, 0),
+ ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_c, 0),
+ ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_c, 0),
+ ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_c, 0),
+ ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_c, 0),
+ ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_c, 0),
+ ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_c, 0),
+ ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0)));
+
+typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
+
+TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(AvxHBDVarianceTest, Zero) { ZeroTest(); }
+TEST_P(AvxHBDVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(AvxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
+
+/* TODO(debargha): This test does not support the highbd version
+INSTANTIATE_TEST_CASE_P(
+ C, AvxHBDMseTest,
+ ::testing::Values(make_tuple(4, 4, &aom_highbd_12_mse16x16_c),
+ make_tuple(4, 4, &aom_highbd_12_mse16x8_c),
+ make_tuple(4, 4, &aom_highbd_12_mse8x16_c),
+ make_tuple(4, 4, &aom_highbd_12_mse8x8_c),
+ make_tuple(4, 4, &aom_highbd_10_mse16x16_c),
+ make_tuple(4, 4, &aom_highbd_10_mse16x8_c),
+ make_tuple(4, 4, &aom_highbd_10_mse8x16_c),
+ make_tuple(4, 4, &aom_highbd_10_mse8x8_c),
+ make_tuple(4, 4, &aom_highbd_8_mse16x16_c),
+ make_tuple(4, 4, &aom_highbd_8_mse16x8_c),
+ make_tuple(4, 4, &aom_highbd_8_mse8x16_c),
+ make_tuple(4, 4, &aom_highbd_8_mse8x8_c)));
+*/
+
+const VarianceParams kArrayHBDVariance_c[] = {
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_c, 12),
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_c, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_c, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_c, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_c, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_c, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_c, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_c, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_c, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_c, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_c, 12),
+ VarianceParams(3, 2, &aom_highbd_12_variance8x4_c, 12),
+ VarianceParams(2, 3, &aom_highbd_12_variance4x8_c, 12),
+ VarianceParams(2, 2, &aom_highbd_12_variance4x4_c, 12),
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_c, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_c, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_c, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_c, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_c, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_c, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_c, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_c, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_c, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_c, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_c, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_c, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_c, 10),
+ VarianceParams(3, 2, &aom_highbd_10_variance8x4_c, 10),
+ VarianceParams(2, 3, &aom_highbd_10_variance4x8_c, 10),
+ VarianceParams(2, 2, &aom_highbd_10_variance4x4_c, 10),
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_c, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_c, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_c, 8),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_c, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_c, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_c, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_c, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_c, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_c, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_c, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_c, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_c, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_c, 8),
+ VarianceParams(3, 2, &aom_highbd_8_variance8x4_c, 8),
+ VarianceParams(2, 3, &aom_highbd_8_variance4x8_c, 8),
+ VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8)
+};
+INSTANTIATE_TEST_CASE_P(C, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AvxHBDVarianceTest,
+ ::testing::Values(
+ VarianceParams(2, 2, &aom_highbd_8_variance4x4_sse4_1, 8),
+ VarianceParams(2, 2, &aom_highbd_10_variance4x4_sse4_1, 10),
+ VarianceParams(2, 2, &aom_highbd_12_variance4x4_sse4_1, 12)));
+#endif // HAVE_SSE4_1
+
+const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = {
+ SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_c, 8),
+ SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_c, 8),
+ SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_c, 8),
+ SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_c, 8),
+ SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_c, 8),
+ SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_c, 8),
+ SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_c, 8),
+ SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_c, 8),
+ SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_c, 8),
+ SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_c, 8),
+ SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_c, 8),
+ SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_c, 8),
+ SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_c, 8),
+ SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_c, 8),
+ SubpelVarianceParams(2, 3, &aom_highbd_8_sub_pixel_variance4x8_c, 8),
+ SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_c, 8),
+ SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_c, 10),
+ SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_c, 10),
+ SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_c, 10),
+ SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_c, 10),
+ SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_c, 10),
+ SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_c, 10),
+ SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_c, 10),
+ SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_c, 10),
+ SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_c, 10),
+ SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_c, 10),
+ SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_c, 10),
+ SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_c, 10),
+ SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_c, 10),
+ SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_c, 10),
+ SubpelVarianceParams(2, 3, &aom_highbd_10_sub_pixel_variance4x8_c, 10),
+ SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_c, 10),
+ SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_c, 12),
+ SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_c, 12),
+ SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_c, 12),
+ SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_c, 12),
+ SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_c, 12),
+ SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_c, 12),
+ SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_c, 12),
+ SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_c, 12),
+ SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_c, 12),
+ SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_c, 12),
+ SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_c, 12),
+ SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_c, 12),
+ SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_c, 12),
+ SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12),
+ SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12),
+ SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12),
+};
+INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
+
+const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = {
+ SubpelAvgVarianceParams(7, 7, &aom_highbd_8_sub_pixel_avg_variance128x128_c,
+ 8),
+ SubpelAvgVarianceParams(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_c,
+ 8),
+ SubpelAvgVarianceParams(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_c,
+ 8),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_c, 8),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_c, 8),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_c, 8),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_c, 8),
+ SubpelAvgVarianceParams(7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_c,
+ 10),
+ SubpelAvgVarianceParams(7, 6, &aom_highbd_10_sub_pixel_avg_variance128x64_c,
+ 10),
+ SubpelAvgVarianceParams(6, 7, &aom_highbd_10_sub_pixel_avg_variance64x128_c,
+ 10),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_c,
+ 10),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_c,
+ 10),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_c,
+ 10),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_c,
+ 10),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_c,
+ 10),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_c,
+ 10),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_c,
+ 10),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+ SubpelAvgVarianceParams(7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_c,
+ 12),
+ SubpelAvgVarianceParams(7, 6, &aom_highbd_12_sub_pixel_avg_variance128x64_c,
+ 12),
+ SubpelAvgVarianceParams(6, 7, &aom_highbd_12_sub_pixel_avg_variance64x128_c,
+ 12),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_c,
+ 12),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_c,
+ 12),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_c,
+ 12),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_c,
+ 12),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_c,
+ 12),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_c,
+ 12),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_c,
+ 12),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+ SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+ SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12)
+};
+INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
+
+const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = {
+ ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c,
+ 8),
+ ObmcSubpelVarianceParams(7, 6, &aom_highbd_obmc_sub_pixel_variance128x64_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 7, &aom_highbd_obmc_sub_pixel_variance64x128_c,
+ 8),
+ ObmcSubpelVarianceParams(6, 6, &aom_highbd_obmc_sub_pixel_variance64x64_c, 8),
+ ObmcSubpelVarianceParams(6, 5, &aom_highbd_obmc_sub_pixel_variance64x32_c, 8),
+ ObmcSubpelVarianceParams(5, 6, &aom_highbd_obmc_sub_pixel_variance32x64_c, 8),
+ ObmcSubpelVarianceParams(5, 5, &aom_highbd_obmc_sub_pixel_variance32x32_c, 8),
+ ObmcSubpelVarianceParams(5, 4, &aom_highbd_obmc_sub_pixel_variance32x16_c, 8),
+ ObmcSubpelVarianceParams(4, 5, &aom_highbd_obmc_sub_pixel_variance16x32_c, 8),
+ ObmcSubpelVarianceParams(4, 4, &aom_highbd_obmc_sub_pixel_variance16x16_c, 8),
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_obmc_sub_pixel_variance16x8_c, 8),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_obmc_sub_pixel_variance8x16_c, 8),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_obmc_sub_pixel_variance8x8_c, 8),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_obmc_sub_pixel_variance8x4_c, 8),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_obmc_sub_pixel_variance4x8_c, 8),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_obmc_sub_pixel_variance4x4_c, 8),
+ ObmcSubpelVarianceParams(7, 7,
+ &aom_highbd_10_obmc_sub_pixel_variance128x128_c, 10),
+ ObmcSubpelVarianceParams(7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_c,
+ 10),
+ ObmcSubpelVarianceParams(6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_c,
+ 10),
+ ObmcSubpelVarianceParams(6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_c,
+ 10),
+ ObmcSubpelVarianceParams(6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_c,
+ 10),
+ ObmcSubpelVarianceParams(5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_c,
+ 10),
+ ObmcSubpelVarianceParams(5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_c,
+ 10),
+ ObmcSubpelVarianceParams(5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_c,
+ 10),
+ ObmcSubpelVarianceParams(4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_c,
+ 10),
+ ObmcSubpelVarianceParams(4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_c,
+ 10),
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_10_obmc_sub_pixel_variance16x8_c,
+ 10),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_10_obmc_sub_pixel_variance8x16_c,
+ 10),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_c,
+ 10),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_c,
+ 10),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_c,
+ 10),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_c,
+ 10),
+ ObmcSubpelVarianceParams(7, 7,
+ &aom_highbd_12_obmc_sub_pixel_variance128x128_c, 12),
+ ObmcSubpelVarianceParams(7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_c,
+ 12),
+ ObmcSubpelVarianceParams(6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_c,
+ 12),
+ ObmcSubpelVarianceParams(6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_c,
+ 12),
+ ObmcSubpelVarianceParams(6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_c,
+ 12),
+ ObmcSubpelVarianceParams(5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_c,
+ 12),
+ ObmcSubpelVarianceParams(5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_c,
+ 12),
+ ObmcSubpelVarianceParams(5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_c,
+ 12),
+ ObmcSubpelVarianceParams(4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_c,
+ 12),
+ ObmcSubpelVarianceParams(4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_c,
+ 12),
+ ObmcSubpelVarianceParams(4, 3, &aom_highbd_12_obmc_sub_pixel_variance16x8_c,
+ 12),
+ ObmcSubpelVarianceParams(3, 4, &aom_highbd_12_obmc_sub_pixel_variance8x16_c,
+ 12),
+ ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_c,
+ 12),
+ ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_c,
+ 12),
+ ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_c,
+ 12),
+ ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c,
+ 12)
+};
+INSTANTIATE_TEST_CASE_P(C, AvxHBDObmcSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
+ ::testing::Values(aom_get_mb_ss_sse2));
+
+INSTANTIATE_TEST_CASE_P(SSE2, AvxMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_sse2),
+ MseParams(4, 3, &aom_mse16x8_sse2),
+ MseParams(3, 4, &aom_mse8x16_sse2),
+ MseParams(3, 3, &aom_mse8x8_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AvxVarianceTest,
+ ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2),
+ VarianceParams(7, 6, &aom_variance128x64_sse2),
+ VarianceParams(6, 7, &aom_variance64x128_sse2),
+ VarianceParams(6, 6, &aom_variance64x64_sse2),
+ VarianceParams(6, 5, &aom_variance64x32_sse2),
+ VarianceParams(6, 4, &aom_variance64x16_sse2),
+ VarianceParams(5, 6, &aom_variance32x64_sse2),
+ VarianceParams(5, 5, &aom_variance32x32_sse2),
+ VarianceParams(5, 4, &aom_variance32x16_sse2),
+ VarianceParams(5, 3, &aom_variance32x8_sse2),
+ VarianceParams(4, 6, &aom_variance16x64_sse2),
+ VarianceParams(4, 5, &aom_variance16x32_sse2),
+ VarianceParams(4, 4, &aom_variance16x16_sse2),
+ VarianceParams(4, 3, &aom_variance16x8_sse2),
+ VarianceParams(4, 2, &aom_variance16x4_sse2),
+ VarianceParams(3, 5, &aom_variance8x32_sse2),
+ VarianceParams(3, 4, &aom_variance8x16_sse2),
+ VarianceParams(3, 3, &aom_variance8x8_sse2),
+ VarianceParams(3, 2, &aom_variance8x4_sse2),
+ VarianceParams(2, 4, &aom_variance4x16_sse2),
+ VarianceParams(2, 3, &aom_variance4x8_sse2),
+ VarianceParams(2, 2, &aom_variance4x4_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AvxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0),
+ SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0),
+ SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0),
+ SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0),
+ SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AvxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2,
+ 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2,
+ 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2,
+ 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AvxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_sse4_1,
+ 8),
+ SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_sse4_1,
+ 10),
+ SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_sse4_1,
+ 12)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AvxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(2, 2,
+ &aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1,
+ 8),
+ SubpelAvgVarianceParams(2, 2,
+ &aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1,
+ 10),
+ SubpelAvgVarianceParams(2, 2,
+ &aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1,
+ 12)));
+#endif // HAVE_SSE4_1
+
+/* TODO(debargha): This test does not support the highbd version
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AvxHBDMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2),
+ MseParams(4, 3, &aom_highbd_12_mse16x8_sse2),
+ MseParams(3, 4, &aom_highbd_12_mse8x16_sse2),
+ MseParams(3, 3, &aom_highbd_12_mse8x8_sse2),
+ MseParams(4, 4, &aom_highbd_10_mse16x16_sse2),
+ MseParams(4, 3, &aom_highbd_10_mse16x8_sse2),
+ MseParams(3, 4, &aom_highbd_10_mse8x16_sse2),
+ MseParams(3, 3, &aom_highbd_10_mse8x8_sse2),
+ MseParams(4, 4, &aom_highbd_8_mse16x16_sse2),
+ MseParams(4, 3, &aom_highbd_8_mse16x8_sse2),
+ MseParams(3, 4, &aom_highbd_8_mse8x16_sse2),
+ MseParams(3, 3, &aom_highbd_8_mse8x8_sse2)));
+*/
+
+const VarianceParams kArrayHBDVariance_sse2[] = {
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_sse2, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_sse2, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_sse2, 12),
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_sse2, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_sse2, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_sse2, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_sse2, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_sse2, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_sse2, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_sse2, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_sse2, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_sse2, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_sse2, 12),
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_sse2, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_sse2, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_sse2, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_sse2, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_sse2, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_sse2, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_sse2, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_sse2, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_sse2, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_sse2, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_sse2, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_sse2, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_sse2, 10),
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_sse2, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_sse2, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_sse2, 8),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_sse2, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_sse2, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_sse2, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_sse2, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_sse2, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_sse2, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_sse2, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8)
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_sse2));
+
+#if HAVE_AVX2
+
+const VarianceParams kArrayHBDVariance_avx2[] = {
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_avx2, 10),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_avx2, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_avx2, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_avx2, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_avx2, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_avx2, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_avx2, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_avx2, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10)
+};
+
+INSTANTIATE_TEST_CASE_P(AVX2, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_avx2));
+#endif // HAVE_AVX2
+
+const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
+ SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12),
+ SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_sse2, 12),
+ SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_sse2, 12),
+ SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_sse2, 12),
+ SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_sse2, 12),
+ SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_sse2, 12),
+ SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_sse2, 12),
+ SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_sse2, 12),
+ SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_sse2, 12),
+ SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_sse2, 12),
+ SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_sse2, 12),
+ SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_sse2, 10),
+ SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_sse2, 10),
+ SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_sse2, 10),
+ SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_sse2, 10),
+ SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_sse2, 10),
+ SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_sse2, 10),
+ SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_sse2, 10),
+ SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_sse2, 10),
+ SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_sse2, 10),
+ SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_sse2, 10),
+ SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_sse2, 10),
+ SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_sse2, 8),
+ SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_sse2, 8),
+ SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_sse2, 8),
+ SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_sse2, 8),
+ SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_sse2, 8),
+ SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_sse2, 8),
+ SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_sse2, 8),
+ SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_sse2, 8),
+ SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_sse2, 8),
+ SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8),
+ SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));
+
+const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_sse2[] = {
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_sse2,
+ 12),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_sse2,
+ 10),
+ SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_sse2));
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, AvxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0),
+ SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0),
+ SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0),
+ SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0),
+ SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, AvxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3,
+ 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3,
+ 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3,
+ 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3,
+ 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3,
+ 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSSE3, AvxJntSubpelAvgVarianceTest,
+ ::testing::Values(
+ JntSubpelAvgVarianceParams(6, 6,
+ &aom_jnt_sub_pixel_avg_variance64x64_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(6, 5,
+ &aom_jnt_sub_pixel_avg_variance64x32_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(5, 6,
+ &aom_jnt_sub_pixel_avg_variance32x64_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(5, 5,
+ &aom_jnt_sub_pixel_avg_variance32x32_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(5, 4,
+ &aom_jnt_sub_pixel_avg_variance32x16_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(4, 5,
+ &aom_jnt_sub_pixel_avg_variance16x32_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(4, 4,
+ &aom_jnt_sub_pixel_avg_variance16x16_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(4, 3,
+ &aom_jnt_sub_pixel_avg_variance16x8_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(3, 4,
+ &aom_jnt_sub_pixel_avg_variance8x16_ssse3,
+ 0),
+ JntSubpelAvgVarianceParams(3, 3,
+ &aom_jnt_sub_pixel_avg_variance8x8_ssse3, 0),
+ JntSubpelAvgVarianceParams(3, 2,
+ &aom_jnt_sub_pixel_avg_variance8x4_ssse3, 0),
+ JntSubpelAvgVarianceParams(2, 3,
+ &aom_jnt_sub_pixel_avg_variance4x8_ssse3, 0),
+ JntSubpelAvgVarianceParams(2, 2,
+ &aom_jnt_sub_pixel_avg_variance4x4_ssse3,
+ 0)));
+#endif // HAVE_SSSE3
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AvxObmcSubpelVarianceTest,
+ ::testing::Values(
+ ObmcSubpelVarianceParams(7, 7,
+ &aom_obmc_sub_pixel_variance128x128_sse4_1, 0),
+ ObmcSubpelVarianceParams(7, 6,
+ &aom_obmc_sub_pixel_variance128x64_sse4_1, 0),
+ ObmcSubpelVarianceParams(6, 7,
+ &aom_obmc_sub_pixel_variance64x128_sse4_1, 0),
+ ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_sse4_1,
+ 0),
+ ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1,
+ 0)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, AvxMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AvxVarianceTest,
+ ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2),
+ VarianceParams(7, 6, &aom_variance128x64_avx2),
+ VarianceParams(6, 7, &aom_variance64x128_avx2),
+ VarianceParams(6, 6, &aom_variance64x64_avx2),
+ VarianceParams(6, 5, &aom_variance64x32_avx2),
+ VarianceParams(6, 4, &aom_variance64x16_avx2),
+ VarianceParams(5, 6, &aom_variance32x64_avx2),
+ VarianceParams(5, 5, &aom_variance32x32_avx2),
+ VarianceParams(5, 4, &aom_variance32x16_avx2),
+ VarianceParams(5, 3, &aom_variance32x8_avx2),
+ VarianceParams(4, 6, &aom_variance16x64_avx2),
+ VarianceParams(4, 5, &aom_variance16x32_avx2),
+ VarianceParams(4, 4, &aom_variance16x16_avx2),
+ VarianceParams(4, 3, &aom_variance16x8_avx2),
+ VarianceParams(4, 2, &aom_variance16x4_avx2)));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AvxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0),
+ SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0),
+ SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AvxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_avx2,
+ 0),
+ SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_avx2,
+ 0),
+ SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_avx2,
+ 0),
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_avx2, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_avx2, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_avx2, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_avx2,
+ 0)));
+#endif // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AvxSseTest,
+ ::testing::Values(SseParams(2, 2,
+ &aom_get4x4sse_cs_neon)));
+
+INSTANTIATE_TEST_CASE_P(NEON, AvxMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_neon)));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON, AvxVarianceTest,
+ ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_neon),
+ VarianceParams(6, 5, &aom_variance64x32_neon),
+ VarianceParams(5, 6, &aom_variance32x64_neon),
+ VarianceParams(5, 5, &aom_variance32x32_neon),
+ VarianceParams(4, 4, &aom_variance16x16_neon),
+ VarianceParams(4, 3, &aom_variance16x8_neon),
+ VarianceParams(3, 4, &aom_variance8x16_neon),
+ VarianceParams(3, 3, &aom_variance8x8_neon)));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON, AvxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0)));
+#endif // HAVE_NEON
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
+ ::testing::Values(aom_get_mb_ss_msa));
+
+INSTANTIATE_TEST_CASE_P(MSA, AvxSseTest,
+ ::testing::Values(SseParams(2, 2,
+ &aom_get4x4sse_cs_msa)));
+
+INSTANTIATE_TEST_CASE_P(MSA, AvxMseTest,
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_msa),
+ MseParams(4, 3, &aom_mse16x8_msa),
+ MseParams(3, 4, &aom_mse8x16_msa),
+ MseParams(3, 3, &aom_mse8x8_msa)));
+
+INSTANTIATE_TEST_CASE_P(
+ MSA, AvxVarianceTest,
+ ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_msa),
+ VarianceParams(6, 5, &aom_variance64x32_msa),
+ VarianceParams(5, 6, &aom_variance32x64_msa),
+ VarianceParams(5, 5, &aom_variance32x32_msa),
+ VarianceParams(5, 4, &aom_variance32x16_msa),
+ VarianceParams(4, 5, &aom_variance16x32_msa),
+ VarianceParams(4, 4, &aom_variance16x16_msa),
+ VarianceParams(4, 3, &aom_variance16x8_msa),
+ VarianceParams(3, 4, &aom_variance8x16_msa),
+ VarianceParams(3, 3, &aom_variance8x8_msa),
+ VarianceParams(3, 2, &aom_variance8x4_msa),
+ VarianceParams(2, 3, &aom_variance4x8_msa),
+ VarianceParams(2, 2, &aom_variance4x4_msa)));
+
+INSTANTIATE_TEST_CASE_P(
+ MSA, AvxSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_msa, 0),
+ SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_msa, 0),
+ SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_msa, 0),
+ SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_msa, 0),
+ SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_msa, 0),
+ SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_msa, 0),
+ SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_msa, 0),
+ SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_msa, 0),
+ SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_msa, 0),
+ SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_msa, 0),
+ SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_msa, 0),
+ SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_msa, 0),
+ SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_msa, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ MSA, AvxSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_msa, 0),
+ SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_msa, 0),
+ SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_msa, 0),
+ SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_msa, 0),
+ SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_msa, 0),
+ SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_msa, 0),
+ SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_msa, 0),
+ SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_msa, 0),
+ SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_msa, 0),
+ SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_msa, 0),
+ SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_msa, 0),
+ SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_msa, 0),
+ SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_msa, 0)));
+#endif // HAVE_MSA
+} // namespace
diff --git a/third_party/aom/test/video_source.h b/third_party/aom/test/video_source.h
new file mode 100644
index 000000000..3c1c5e559
--- /dev/null
+++ b/third_party/aom/test/video_source.h
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_VIDEO_SOURCE_H_
+#define AOM_TEST_VIDEO_SOURCE_H_
+
+#if defined(_WIN32)
+#undef NOMINMAX
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "test/acm_random.h"
+#include "aom/aom_encoder.h"
+
+namespace libaom_test {
+
+// Helper macros to ensure LIBAOM_TEST_DATA_PATH is a quoted string.
+// These are undefined right below GetDataPath
+// NOTE: LIBAOM_TEST_DATA_PATH MUST NOT be a quoted string before
+// Stringification, or GetDataPath will fail at runtime.
+#define TO_STRING(S) #S
+#define STRINGIFY(S) TO_STRING(S)
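+// For example, with a hypothetical build setting of
+// LIBAOM_TEST_DATA_PATH=/opt/aom-test-data (unquoted), the expression
+// STRINGIFY(LIBAOM_TEST_DATA_PATH) expands to the string literal
+// "/opt/aom-test-data".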
+
+// A simple function to encapsulate cross platform retrieval of test data path
+static std::string GetDataPath() {
+ const char *const data_path = getenv("LIBAOM_TEST_DATA_PATH");
+ if (data_path == NULL) {
+#ifdef LIBAOM_TEST_DATA_PATH
+ // In some environments, we cannot set environment variables
+ // Instead, we set the data path by using a preprocessor symbol
+ // which can be set from make files
+ return STRINGIFY(LIBAOM_TEST_DATA_PATH);
+#else
+ return ".";
+#endif
+ }
+ return data_path;
+}
+
+// Undefining stringification macros because they are not used elsewhere
+#undef TO_STRING
+#undef STRINGIFY
+
+inline FILE *OpenTestDataFile(const std::string &file_name) {
+ const std::string path_to_source = GetDataPath() + "/" + file_name;
+ return fopen(path_to_source.c_str(), "rb");
+}
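+// For example, OpenTestDataFile("foo.y4m") (a hypothetical file name) opens
+// <data path>/foo.y4m for reading in binary mode, where <data path> comes
+// from GetDataPath() above.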
+
+static FILE *GetTempOutFile(std::string *file_name) {
+ file_name->clear();
+#if defined(_WIN32)
+ char fname[MAX_PATH];
+ char tmppath[MAX_PATH];
+ if (GetTempPathA(MAX_PATH, tmppath)) {
+ // Assume for now that the filename generated is unique per process
+ if (GetTempFileNameA(tmppath, "lvx", 0, fname)) {
+ file_name->assign(fname);
+ return fopen(fname, "wb+");
+ }
+ }
+ return NULL;
+#else
+ char name_template[] = "/tmp/libaomtest.XXXXXX";
+ const int fd = mkstemp(name_template);
+ *file_name = name_template;
+ return fdopen(fd, "wb+");
+#endif
+}
+
+class TempOutFile {
+ public:
+ TempOutFile() { file_ = GetTempOutFile(&file_name_); }
+ ~TempOutFile() {
+ CloseFile();
+ if (!file_name_.empty()) {
+ EXPECT_EQ(0, remove(file_name_.c_str()));
+ }
+ }
+ FILE *file() { return file_; }
+ const std::string &file_name() { return file_name_; }
+
+ protected:
+ void CloseFile() {
+ if (file_) {
+ fclose(file_);
+ file_ = NULL;
+ }
+ }
+ FILE *file_;
+ std::string file_name_;
+};
+
+// Abstract base class for test video sources, which provide a stream of
+// aom_image_t images with associated timestamps and duration.
+class VideoSource {
+ public:
+ virtual ~VideoSource() {}
+
+ // Prepare the stream for reading, rewind/open as necessary.
+ virtual void Begin() = 0;
+
+ // Advance the cursor to the next frame
+ virtual void Next() = 0;
+
+ // Get the current video frame, or NULL on End-Of-Stream.
+ virtual aom_image_t *img() const = 0;
+
+ // Get the presentation timestamp of the current frame.
+ virtual aom_codec_pts_t pts() const = 0;
+
+ // Get the current frame's duration
+ virtual unsigned long duration() const = 0;
+
+ // Get the timebase for the stream
+ virtual aom_rational_t timebase() const = 0;
+
+ // Get the current frame counter, starting at 0.
+ virtual unsigned int frame() const = 0;
+
+ // Get the current file limit.
+ virtual unsigned int limit() const = 0;
+};
+
+class DummyVideoSource : public VideoSource {
+ public:
+ DummyVideoSource()
+ : img_(NULL), limit_(100), width_(80), height_(64),
+ format_(AOM_IMG_FMT_I420) {
+ ReallocImage();
+ }
+
+ virtual ~DummyVideoSource() { aom_img_free(img_); }
+
+ virtual void Begin() {
+ frame_ = 0;
+ FillFrame();
+ }
+
+ virtual void Next() {
+ ++frame_;
+ FillFrame();
+ }
+
+ virtual aom_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+
+ // Models a stream where Timebase = 1/FPS, so pts == frame.
+ virtual aom_codec_pts_t pts() const { return frame_; }
+
+ virtual unsigned long duration() const { return 1; }
+
+ virtual aom_rational_t timebase() const {
+ const aom_rational_t t = { 1, 30 };
+ return t;
+ }
+
+ virtual unsigned int frame() const { return frame_; }
+
+ virtual unsigned int limit() const { return limit_; }
+
+ void set_limit(unsigned int limit) { limit_ = limit; }
+
+ void SetSize(unsigned int width, unsigned int height) {
+ if (width != width_ || height != height_) {
+ width_ = width;
+ height_ = height;
+ ReallocImage();
+ }
+ }
+
+ void SetImageFormat(aom_img_fmt_t format) {
+ if (format_ != format) {
+ format_ = format;
+ ReallocImage();
+ }
+ }
+
+ protected:
+ virtual void FillFrame() {
+ if (img_) memset(img_->img_data, 0, raw_sz_);
+ }
+
+ void ReallocImage() {
+ aom_img_free(img_);
+ img_ = aom_img_alloc(NULL, format_, width_, height_, 32);
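+    // Size of the raw data buffer: width rounded up to the 32-pixel
+    // alignment requested from aom_img_alloc above, times height, times
+    // img_->bps / 8 bytes per pixel.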
+ raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8;
+ }
+
+ aom_image_t *img_;
+ size_t raw_sz_;
+ unsigned int limit_;
+ unsigned int frame_;
+ unsigned int width_;
+ unsigned int height_;
+ aom_img_fmt_t format_;
+};
+
+class RandomVideoSource : public DummyVideoSource {
+ public:
+ RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
+ : rnd_(seed), seed_(seed) {}
+
+ protected:
+ // Reset the RNG to get a matching stream for the second pass
+ virtual void Begin() {
+ frame_ = 0;
+ rnd_.Reset(seed_);
+ FillFrame();
+ }
+
+ // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
+ // than holding previous frames to encourage keyframes to be thrown.
+ virtual void FillFrame() {
+ if (img_) {
+ if (frame_ % 30 < 15)
+ for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8();
+ else
+ memset(img_->img_data, 0, raw_sz_);
+ }
+ }
+
+ ACMRandom rnd_;
+ int seed_;
+};
+
+// Abstract base class for test video sources, which provide a stream of
+// compressed data to the decoder.
+class CompressedVideoSource {
+ public:
+ virtual ~CompressedVideoSource() {}
+
+ virtual void Init() = 0;
+
+ // Prepare the stream for reading, rewind/open as necessary.
+ virtual void Begin() = 0;
+
+ // Advance the cursor to the next frame
+ virtual void Next() = 0;
+
+ virtual const uint8_t *cxdata() const = 0;
+
+ virtual size_t frame_size() const = 0;
+
+ virtual unsigned int frame_number() const = 0;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/visual_metrics.py b/third_party/aom/test/visual_metrics.py
new file mode 100755
index 000000000..9055feb33
--- /dev/null
+++ b/third_party/aom/test/visual_metrics.py
@@ -0,0 +1,466 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+
+"""Converts video encoding result data from text files to visualization
+data source."""
+
+__author__ = "jzern@google.com (James Zern),"
+__author__ += "jimbankoski@google.com (Jim Bankoski)"
+
+import fnmatch
+import numpy as np
+import scipy as sp
+import scipy.interpolate
+import os
+import re
+import string
+import sys
+import math
+import warnings
+
+import gviz_api
+
+from os.path import basename
+from os.path import splitext
+
+warnings.simplefilter('ignore', np.RankWarning)
+warnings.simplefilter('ignore', RuntimeWarning)
+
+def bdsnr2(metric_set1, metric_set2):
+ """
+ BJONTEGAARD Bjontegaard metric calculation adapted
+  Bjontegaard's SNR metric computes the average gain in decibels (dB)
+  between two rate-distortion curves [1]. This is an adaptation of that
+  method that avoids inconsistencies when the curve-fit operation goes awry:
+  the polynomial curve fit is replaced by a Piecewise Cubic Hermite
+  Interpolating Polynomial, which is then integrated by evaluating it at
+  small intervals and applying the trapezoid method.
+
+ metric_set1 - list of tuples ( bitrate, metric ) for first graph
+ metric_set2 - list of tuples ( bitrate, metric ) for second graph
+ """
+
+ if not metric_set1 or not metric_set2:
+ return 0.0
+
+ try:
+
+    # pchip_interpolate requires its x-axis keys to be sorted. Here the
+    # x-axis is the (log) bitrate, i.e. the first tuple element, so the
+    # default tuple sort is sufficient.
+ metric_set1.sort()
+ metric_set2.sort()
+
+ # Pull the log of the rate and clamped psnr from metric_sets.
+ log_rate1 = [math.log(x[0]) for x in metric_set1]
+ metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1]
+ log_rate2 = [math.log(x[0]) for x in metric_set2]
+ metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2]
+
+    # Integration interval. This metric only works on the overlapping
+    # region; extrapolating these curves is unreliable, so we avoid it.
+ min_int = max([min(log_rate1), min(log_rate2)])
+ max_int = min([max(log_rate1), max(log_rate2)])
+
+ # No overlap means no sensible metric possible.
+ if max_int <= min_int:
+ return 0.0
+
+    # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to
+    # create 100 new sample points separated by `interval`.
+ lin = np.linspace(min_int, max_int, num=100, retstep=True)
+ interval = lin[1]
+ samples = lin[0]
+ v1 = scipy.interpolate.pchip_interpolate(log_rate1, metric1, samples)
+ v2 = scipy.interpolate.pchip_interpolate(log_rate2, metric2, samples)
+
+ # Calculate the integral using the trapezoid method on the samples.
+ int_v1 = np.trapz(v1, dx=interval)
+ int_v2 = np.trapz(v2, dx=interval)
+
+ # Calculate the average improvement.
+ avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int)
+
+ except (TypeError, ZeroDivisionError, ValueError, np.RankWarning) as e:
+ return 0
+
+ return avg_exp_diff
+
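+# A hypothetical example: if the second curve is exactly 1 dB better than the
+# first at every overlapping rate, e.g.
+#   bdsnr2([(100, 38.0), (200, 41.0)], [(100, 39.0), (200, 42.0)])
+# then the value returned is ~1.0 (dB).
+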
+def bdrate2(metric_set1, metric_set2):
+ """
+ BJONTEGAARD Bjontegaard metric calculation adapted
+ Bjontegaard's metric allows to compute the average % saving in bitrate
+  Bjontegaard's metric computes the average % saving in bitrate
+  between two rate-distortion curves [1]. This is an adaptation of that
+  method that avoids inconsistencies when the curve-fit operation goes awry:
+  the polynomial curve fit is replaced by a Piecewise Cubic Hermite
+  Interpolating Polynomial, which is then integrated by evaluating it at
+  small intervals and applying the trapezoid method.
+
+ metric_set1 - list of tuples ( bitrate, metric ) for first graph
+ metric_set2 - list of tuples ( bitrate, metric ) for second graph
+ """
+
+ if not metric_set1 or not metric_set2:
+ return 0.0
+
+ try:
+
+    # pchip_interpolate requires its x-axis keys to be sorted. Here the
+    # x-axis is the metric, not the bitrate, so sort by the metric value.
+ metric_set1.sort(key=lambda tup: tup[1])
+ metric_set2.sort(key=lambda tup: tup[1])
+
+ # Pull the log of the rate and clamped psnr from metric_sets.
+ log_rate1 = [math.log(x[0]) for x in metric_set1]
+ metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1]
+ log_rate2 = [math.log(x[0]) for x in metric_set2]
+ metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2]
+
+    # Integration interval. This metric only works on the overlapping
+    # region; extrapolating these curves is unreliable, so we avoid it.
+ min_int = max([min(metric1), min(metric2)])
+ max_int = min([max(metric1), max(metric2)])
+
+ # No overlap means no sensible metric possible.
+ if max_int <= min_int:
+ return 0.0
+
+    # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to
+    # create 100 new sample points separated by `interval`.
+ lin = np.linspace(min_int, max_int, num=100, retstep=True)
+ interval = lin[1]
+ samples = lin[0]
+ v1 = scipy.interpolate.pchip_interpolate(metric1, log_rate1, samples)
+ v2 = scipy.interpolate.pchip_interpolate(metric2, log_rate2, samples)
+
+ # Calculate the integral using the trapezoid method on the samples.
+ int_v1 = np.trapz(v1, dx=interval)
+ int_v2 = np.trapz(v2, dx=interval)
+
+ # Calculate the average improvement.
+ avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int)
+
+ except (TypeError, ZeroDivisionError, ValueError, np.RankWarning) as e:
+ return 0
+
+ # Convert to a percentage.
+ avg_diff = (math.exp(avg_exp_diff) - 1) * 100
+
+ return avg_diff
+
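+# A hypothetical example: if the second curve reaches the same quality at half
+# the bitrate everywhere, e.g.
+#   bdrate2([(100, 38.0), (200, 41.0)], [(50, 38.0), (100, 41.0)])
+# then the value returned is roughly -50, i.e. an average bitrate saving of
+# about 50%.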
+
+
+def FillForm(string_for_substitution, dictionary_of_vars):
+ """
+  Substitutes every //%% name %%// placeholder in string_for_substitution
+  with the value of the corresponding "name" entry in dictionary_of_vars.
+ """
+ return_string = string_for_substitution
+ for i in re.findall("//%%(.*)%%//", string_for_substitution):
+ return_string = re.sub("//%%" + i + "%%//", dictionary_of_vars[i],
+ return_string)
+ return return_string
+
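+# For example, FillForm("snrs = //%%snrs%%//;", {"snrs": "[1, 2]"}) returns
+# "snrs = [1, 2];".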
+
+def HasMetrics(line):
+ """
+  Header lines in the metrics files produced by aomenc start with a letter
+  (e.g. "Bitrate"); data lines start with a digit.
+ """
+ # If the first char of the first word on the line is a digit
+ if len(line) == 0:
+ return False
+ if len(line.split()) == 0:
+ return False
+ if line.split()[0][0:1].isdigit():
+ return True
+ return False
+
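+# For example, HasMetrics("25.911 38.242 38.104") is True, while
+# HasMetrics("Bitrate AVGPsnr GLBPsnr") is False.
+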
+def GetMetrics(file_name):
+ metric_file = open(file_name, "r")
+  return metric_file.readline().split()
+
+def ParseMetricFile(file_name, metric_column):
+ metric_set1 = set([])
+ metric_file = open(file_name, "r")
+ for line in metric_file:
+ metrics = string.split(line)
+ if HasMetrics(line):
+ if metric_column < len(metrics):
+ try:
+ tuple = float(metrics[0]), float(metrics[metric_column])
+ except:
+ tuple = float(metrics[0]), 0
+ else:
+ tuple = float(metrics[0]), 0
+ metric_set1.add(tuple)
+ metric_set1_sorted = sorted(metric_set1)
+ return metric_set1_sorted
+
+
+def FileBetter(file_name_1, file_name_2, metric_column, method):
+ """
+  Compares two data files and determines which is better and by how much.
+  metric_column selects which metric column is compared; method selects the
+  comparison ('avg', 'dsnr' or 'drate').
+ """
+ # Store and parse our two files into lists of unique tuples.
+
+ # Read the two files, parsing out lines starting with bitrate.
+ metric_set1_sorted = ParseMetricFile(file_name_1, metric_column)
+ metric_set2_sorted = ParseMetricFile(file_name_2, metric_column)
+
+
+ def GraphBetter(metric_set1_sorted, metric_set2_sorted, base_is_set_2):
+ """
+ Search through the sorted metric file for metrics on either side of
+ the metric from file 1. Since both lists are sorted we really
+ should not have to search through the entire range, but these
+ are small files."""
+ total_bitrate_difference_ratio = 0.0
+ count = 0
+ for bitrate, metric in metric_set1_sorted:
+ if bitrate == 0:
+ continue
+ for i in range(len(metric_set2_sorted) - 1):
+ s2_bitrate_0, s2_metric_0 = metric_set2_sorted[i]
+ s2_bitrate_1, s2_metric_1 = metric_set2_sorted[i + 1]
+ # We have a point on either side of our metric range.
+ if metric > s2_metric_0 and metric <= s2_metric_1:
+
+ # Calculate a slope.
+ if s2_metric_1 - s2_metric_0 != 0:
+ metric_slope = ((s2_bitrate_1 - s2_bitrate_0) /
+ (s2_metric_1 - s2_metric_0))
+ else:
+ metric_slope = 0
+
+ estimated_s2_bitrate = (s2_bitrate_0 + (metric - s2_metric_0) *
+ metric_slope)
+
+ if estimated_s2_bitrate == 0:
+ continue
+ # Calculate percentage difference as given by base.
+ if base_is_set_2 == 0:
+ bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) /
+ bitrate)
+ else:
+ bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) /
+ estimated_s2_bitrate)
+
+ total_bitrate_difference_ratio += bitrate_difference_ratio
+ count += 1
+ break
+
+ # Calculate the average improvement between graphs.
+ if count != 0:
+ avg = total_bitrate_difference_ratio / count
+
+ else:
+ avg = 0.0
+
+ return avg
+
+ # Be fair to both graphs by testing all the points in each.
+ if method == 'avg':
+ avg_improvement = 50 * (
+ GraphBetter(metric_set1_sorted, metric_set2_sorted, 1) -
+ GraphBetter(metric_set2_sorted, metric_set1_sorted, 0))
+ elif method == 'dsnr':
+ avg_improvement = bdsnr2(metric_set1_sorted, metric_set2_sorted)
+ else:
+ avg_improvement = bdrate2(metric_set2_sorted, metric_set1_sorted)
+
+ return avg_improvement
+
+
+def HandleFiles(variables):
+ """
+ This script creates html for displaying metric data produced from data
+ in a video stats file, as created by the AOM project when enable_psnr
+ is turned on:
+
+ Usage: visual_metrics.py template.html pattern base_dir sub_dir [ sub_dir2 ..]
+
+ The script parses each metrics file [see below] that matches the
+ statfile_pattern in the baseline directory and looks for the file that
+ matches that same file in each of the sub_dirs, and compares the resultant
+  metrics: bitrate, avg psnr, glb psnr, and ssim.
+
+  It provides a table in which each row is a file in the baseline directory,
+ and a column for each subdir, with the cells representing how that clip
+ compares to baseline for that subdir. A graph is given for each which
+ compares filesize to that metric. If you click on a point in the graph it
+ zooms in on that point.
+
+ a SAMPLE metrics file:
+
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 25.911 38.242 38.104 38.258 38.121 75.790 14103
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 49.982 41.264 41.129 41.255 41.122 83.993 19817
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 74.967 42.911 42.767 42.899 42.756 87.928 17332
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 100.012 43.983 43.838 43.881 43.738 89.695 25389
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 149.980 45.338 45.203 45.184 45.043 91.591 25438
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 199.852 46.225 46.123 46.113 45.999 92.679 28302
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 249.922 46.864 46.773 46.777 46.673 93.334 27244
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 299.998 47.366 47.281 47.317 47.220 93.844 27137
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 349.769 47.746 47.677 47.722 47.648 94.178 32226
+ Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us)
+ 399.773 48.032 47.971 48.013 47.946 94.362 36203
+
+ sample use:
+ visual_metrics.py template.html "*stt" aom aom_b aom_c > metrics.html
+ """
+
+ # The template file is the html file into which we will write the
+ # data from the stats file, formatted correctly for the gviz_api.
+ template_file = open(variables[1], "r")
+ page_template = template_file.read()
+ template_file.close()
+
+  # This is the path-match pattern used to find the stats files amongst all
+  # the other files in the directory, e.g. *.stt.
+ file_pattern = variables[2]
+
+ # This is the directory with files that we will use to do the comparison
+ # against.
+ baseline_dir = variables[3]
+ snrs = ''
+ filestable = {}
+
+ filestable['dsnr'] = ''
+ filestable['drate'] = ''
+ filestable['avg'] = ''
+
+  # dirs holds the directories, listed after the baseline, to compare against it.
+ dirs = variables[4:len(variables)]
+
+ # Find the metric files in the baseline directory.
+ dir_list = sorted(fnmatch.filter(os.listdir(baseline_dir), file_pattern))
+
+ metrics = GetMetrics(baseline_dir + "/" + dir_list[0])
+
+ metrics_js = 'metrics = ["' + '", "'.join(metrics) + '"];'
+
+ for column in range(1, len(metrics)):
+
+ for metric in ['avg','dsnr','drate']:
+ description = {"file": ("string", "File")}
+
+ # Go through each directory and add a column header to our description.
+ countoverall = {}
+ sumoverall = {}
+
+ for directory in dirs:
+ description[directory] = ("number", directory)
+ countoverall[directory] = 0
+ sumoverall[directory] = 0
+
+ # Data holds the data for the visualization, name given comes from
+ # gviz_api sample code.
+ data = []
+ for filename in dir_list:
+ row = {'file': splitext(basename(filename))[0] }
+ baseline_file_name = baseline_dir + "/" + filename
+
+ # Read the metric file from each of the directories in our list.
+ for directory in dirs:
+ metric_file_name = directory + "/" + filename
+
+ # If there is a metric file in the current directory, open it
+ # and calculate its overall difference between it and the baseline
+ # directory's metric file.
+ if os.path.isfile(metric_file_name):
+ overall = FileBetter(baseline_file_name, metric_file_name,
+ column, metric)
+ row[directory] = overall
+
+ sumoverall[directory] += overall
+ countoverall[directory] += 1
+
+ data.append(row)
+
+ # Add the overall numbers.
+ row = {"file": "OVERALL" }
+ for directory in dirs:
+ row[directory] = sumoverall[directory] / countoverall[directory]
+ data.append(row)
+
+ # write the tables out
+ data_table = gviz_api.DataTable(description)
+ data_table.LoadData(data)
+
+ filestable[metric] = ( filestable[metric] + "filestable_" + metric +
+ "[" + str(column) + "]=" +
+ data_table.ToJSon(columns_order=["file"]+dirs) + "\n" )
+
+ filestable_avg = filestable['avg']
+ filestable_dpsnr = filestable['dsnr']
+ filestable_drate = filestable['drate']
+
+ # Now we collect all the data for all the graphs. First the column
+ # headers which will be Datarate and then each directory.
+ columns = ("datarate",baseline_dir)
+ description = {"datarate":("number", "Datarate")}
+ for directory in dirs:
+ description[directory] = ("number", directory)
+
+ description[baseline_dir] = ("number", baseline_dir)
+
+ snrs = snrs + "snrs[" + str(column) + "] = ["
+
+ # Now collect the data for the graphs, file by file.
+ for filename in dir_list:
+
+ data = []
+
+ # Collect the file in each directory and store all of its metrics
+ # in the associated gviz metrics table.
+ all_dirs = dirs + [baseline_dir]
+ for directory in all_dirs:
+
+ metric_file_name = directory + "/" + filename
+ if not os.path.isfile(metric_file_name):
+ continue
+
+ # Read and parse the metrics file storing it to the data we'll
+ # use for the gviz_api.Datatable.
+ metrics = ParseMetricFile(metric_file_name, column)
+ for bitrate, metric in metrics:
+ data.append({"datarate": bitrate, directory: metric})
+
+ data_table = gviz_api.DataTable(description)
+ data_table.LoadData(data)
+ snrs = snrs + "'" + data_table.ToJSon(
+ columns_order=tuple(["datarate",baseline_dir]+dirs)) + "',"
+
+ snrs = snrs + "]\n"
+
+ formatters = ""
+ for i in range(len(dirs)):
+ formatters = "%s formatter.format(better, %d);" % (formatters, i+1)
+
+ print FillForm(page_template, vars())
+ return
+
+if len(sys.argv) < 3:
+ print HandleFiles.__doc__
+else:
+ HandleFiles(sys.argv)
diff --git a/third_party/aom/test/warp_filter_test.cc b/third_party/aom/test/warp_filter_test.cc
new file mode 100644
index 000000000..19a4e8b6a
--- /dev/null
+++ b/third_party/aom/test/warp_filter_test.cc
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/warp_filter_test_util.h"
+using ::testing::make_tuple;
+using ::testing::tuple;
+using libaom_test::ACMRandom;
+using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest;
+using libaom_test::AV1WarpFilter::AV1WarpFilterTest;
+
+namespace {
+
+TEST_P(AV1WarpFilterTest, CheckOutput) {
+ RunCheckOutput(::testing::get<3>(GET_PARAM(0)));
+}
+TEST_P(AV1WarpFilterTest, DISABLED_Speed) {
+ RunSpeedTest(::testing::get<3>(GET_PARAM(0)));
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1));
+
+TEST_P(AV1HighbdWarpFilterTest, CheckOutput) {
+ RunCheckOutput(::testing::get<4>(GET_PARAM(0)));
+}
+TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) {
+ RunSpeedTest(::testing::get<4>(GET_PARAM(0)));
+}
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdWarpFilterTest,
+ libaom_test::AV1HighbdWarpFilter::BuildParams(
+ av1_highbd_warp_affine_sse4_1));
+
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, AV1WarpFilterTest,
+ libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon));
+#endif // HAVE_NEON
+
+} // namespace
diff --git a/third_party/aom/test/warp_filter_test_util.cc b/third_party/aom/test/warp_filter_test_util.cc
new file mode 100644
index 000000000..69b2ed4af
--- /dev/null
+++ b/third_party/aom/test/warp_filter_test_util.cc
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom_ports/aom_timer.h"
+#include "test/warp_filter_test_util.h"
+
+using ::testing::make_tuple;
+using ::testing::tuple;
+
+namespace libaom_test {
+
+int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) {
+ // 1 in 8 chance of generating zero (arbitrarily chosen)
+ if (((rnd->Rand8()) & 7) == 0) return 0;
+  // Otherwise, generate uniform values in the range
+  // [-(1 << bits), -1] U [1, 1 << bits].
+ int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1));
+ if ((rnd->Rand8()) & 1) return -v;
+ return v;
+}
+
+void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
+ int16_t *alpha, int16_t *beta, int16_t *gamma,
+ int16_t *delta, const int is_alpha_zero,
+ const int is_beta_zero, const int is_gamma_zero,
+ const int is_delta_zero) {
+ while (1) {
+ int rnd8 = rnd->Rand8() & 3;
+ mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
+ mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6);
+ mat[2] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
+
+ if (rnd8 <= 1) {
+ // AFFINE
+ mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
+ mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ } else if (rnd8 == 2) {
+ mat[4] = -mat[3];
+ mat[5] = mat[2];
+ } else {
+ mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3);
+ mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS;
+ if (is_beta_zero == 1) mat[3] = 0;
+ if (is_gamma_zero == 1) mat[4] = 0;
+ if (is_delta_zero == 1)
+ mat[5] = (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ }
+
+ // Calculate the derived parameters and check that they are suitable
+ // for the warp filter.
+ assert(mat[2] != 0);
+
+ *alpha = clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
+ *beta = clamp(mat[3], INT16_MIN, INT16_MAX);
+ *gamma = clamp(((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) / mat[2],
+ INT16_MIN, INT16_MAX);
+ *delta =
+ clamp(mat[5] - (((int64_t)mat[3] * mat[4] + (mat[2] / 2)) / mat[2]) -
+ (1 << WARPEDMODEL_PREC_BITS),
+ INT16_MIN, INT16_MAX);
+
+ if ((4 * abs(*alpha) + 7 * abs(*beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
+ (4 * abs(*gamma) + 4 * abs(*delta) >= (1 << WARPEDMODEL_PREC_BITS)))
+ continue;
+
+ *alpha = ROUND_POWER_OF_TWO_SIGNED(*alpha, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ *beta = ROUND_POWER_OF_TWO_SIGNED(*beta, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ *gamma = ROUND_POWER_OF_TWO_SIGNED(*gamma, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+ *delta = ROUND_POWER_OF_TWO_SIGNED(*delta, WARP_PARAM_REDUCE_BITS) *
+ (1 << WARP_PARAM_REDUCE_BITS);
+
+ // We have a valid model, so finish
+ return;
+ }
+}
+
+namespace AV1WarpFilter {
+::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
+ warp_affine_func filter) {
+ WarpTestParam params[] = {
+ make_tuple(4, 4, 50000, filter), make_tuple(8, 8, 50000, filter),
+ make_tuple(64, 64, 1000, filter), make_tuple(4, 16, 20000, filter),
+ make_tuple(32, 8, 10000, filter),
+ };
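+  // The four Values(0, 1) axes below are the is_alpha_zero, is_beta_zero,
+  // is_gamma_zero and is_delta_zero flags passed to generate_warped_model(),
+  // giving 16 flag combinations for each entry in params[].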
+ return ::testing::Combine(::testing::ValuesIn(params),
+ ::testing::Values(0, 1), ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(0, 1));
+}
+
+AV1WarpFilterTest::~AV1WarpFilterTest() {}
+void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
+void AV1WarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
+ const int w = 128, h = 128;
+ const int border = 16;
+ const int stride = w + 2 * border;
+ WarpTestParam params = GET_PARAM(0);
+ const int out_w = ::testing::get<0>(params),
+ out_h = ::testing::get<1>(params);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ int sub_x, sub_y;
+ const int bd = 8;
+
+ uint8_t *input_ = new uint8_t[h * stride];
+ uint8_t *input = input_ + border;
+
+ // The warp functions always write rows with widths that are multiples of 8.
+ // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h;
+ uint8_t *output = new uint8_t[output_n];
+ int32_t mat[8];
+ int16_t alpha, beta, gamma, delta;
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero);
+
+ for (int r = 0; r < h; ++r)
+ for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
+ for (int r = 0; r < h; ++r) {
+ memset(input + r * stride - border, input[r * stride], border);
+ memset(input + r * stride + w, input[r * stride + (w - 1)], border);
+ }
+
+ sub_x = 0;
+ sub_y = 0;
+ int do_average = 0;
+
+ conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+ conv_params.use_jnt_comp_avg = 0;
+
+ const int num_loops = 1000000000 / (out_w + out_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w,
+ sub_x, sub_y, &conv_params, alpha, beta, gamma, delta);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
+ 1000.0 * elapsed_time / num_loops);
+
+ delete[] input_;
+ delete[] output;
+ delete[] dsta;
+}
+
+void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
+ const int w = 128, h = 128;
+ const int border = 16;
+ const int stride = w + 2 * border;
+ WarpTestParam params = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = ::testing::get<0>(params),
+ out_h = ::testing::get<1>(params);
+ const int num_iters = ::testing::get<2>(params);
+ int i, j, sub_x, sub_y;
+ const int bd = 8;
+
+ // The warp functions always write rows with widths that are multiples of 8.
+ // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h;
+ uint8_t *input_ = new uint8_t[h * stride];
+ uint8_t *input = input_ + border;
+ uint8_t *output = new uint8_t[output_n];
+ uint8_t *output2 = new uint8_t[output_n];
+ int32_t mat[8];
+ int16_t alpha, beta, gamma, delta;
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+ CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
+ for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand8();
+
+ for (i = 0; i < num_iters; ++i) {
+ // Generate an input block and extend its borders horizontally
+ for (int r = 0; r < h; ++r)
+ for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8();
+ for (int r = 0; r < h; ++r) {
+ memset(input + r * stride - border, input[r * stride], border);
+ memset(input + r * stride + w, input[r * stride + (w - 1)], border);
+ }
+ const int use_no_round = rnd_.Rand8() & 1;
+ for (sub_x = 0; sub_x < 2; ++sub_x)
+ for (sub_y = 0; sub_y < 2; ++sub_y) {
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero);
+
+ for (int ii = 0; ii < 2; ++ii) {
+ for (int jj = 0; jj < 5; ++jj) {
+ for (int do_average = 0; do_average <= 1; ++do_average) {
+ if (use_no_round) {
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+ } else {
+ conv_params = get_conv_params(0, 0, bd);
+ }
+ if (jj >= 4) {
+ conv_params.use_jnt_comp_avg = 0;
+ } else {
+ conv_params.use_jnt_comp_avg = 1;
+ conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+ }
+ av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w,
+ out_h, out_w, sub_x, sub_y, &conv_params, alpha,
+ beta, gamma, delta);
+ if (use_no_round) {
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
+ }
+ if (jj >= 4) {
+ conv_params.use_jnt_comp_avg = 0;
+ } else {
+ conv_params.use_jnt_comp_avg = 1;
+ conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+ }
+ test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
+ out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma,
+ delta);
+ if (use_no_round) {
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(dsta[j], dstb[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ } else {
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ }
+ }
+ }
+ }
+ }
+ }
+ delete[] input_;
+ delete[] output;
+ delete[] output2;
+ delete[] dsta;
+ delete[] dstb;
+}
+} // namespace AV1WarpFilter
+
+namespace AV1HighbdWarpFilter {
+::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
+ highbd_warp_affine_func filter) {
+ const HighbdWarpTestParam params[] = {
+ make_tuple(4, 4, 100, 8, filter), make_tuple(8, 8, 100, 8, filter),
+ make_tuple(64, 64, 100, 8, filter), make_tuple(4, 16, 100, 8, filter),
+ make_tuple(32, 8, 100, 8, filter), make_tuple(4, 4, 100, 10, filter),
+ make_tuple(8, 8, 100, 10, filter), make_tuple(64, 64, 100, 10, filter),
+ make_tuple(4, 16, 100, 10, filter), make_tuple(32, 8, 100, 10, filter),
+ make_tuple(4, 4, 100, 12, filter), make_tuple(8, 8, 100, 12, filter),
+ make_tuple(64, 64, 100, 12, filter), make_tuple(4, 16, 100, 12, filter),
+ make_tuple(32, 8, 100, 12, filter),
+ };
+ return ::testing::Combine(::testing::ValuesIn(params),
+ ::testing::Values(0, 1), ::testing::Values(0, 1),
+ ::testing::Values(0, 1), ::testing::Values(0, 1));
+}
+
+AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() {}
+void AV1HighbdWarpFilterTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HighbdWarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
+ const int w = 128, h = 128;
+ const int border = 16;
+ const int stride = w + 2 * border;
+ HighbdWarpTestParam param = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = ::testing::get<0>(param), out_h = ::testing::get<1>(param);
+ const int bd = ::testing::get<3>(param);
+ const int mask = (1 << bd) - 1;
+ int sub_x, sub_y;
+
+ // The warp functions always write rows with widths that are multiples of 8.
+ // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h;
+ uint16_t *input_ = new uint16_t[h * stride];
+ uint16_t *input = input_ + border;
+ uint16_t *output = new uint16_t[output_n];
+ int32_t mat[8];
+ int16_t alpha, beta, gamma, delta;
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero);
+ // Generate an input block and extend its borders horizontally
+ for (int r = 0; r < h; ++r)
+ for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < border; ++c) {
+ input[r * stride - border + c] = input[r * stride];
+ input[r * stride + w + c] = input[r * stride + (w - 1)];
+ }
+ }
+
+ sub_x = 0;
+ sub_y = 0;
+ int do_average = 0;
+ conv_params.use_jnt_comp_avg = 0;
+ conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+
+ const int num_loops = 1000000000 / (out_w + out_h);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ for (int i = 0; i < num_loops; ++i)
+ test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w,
+ sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta);
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h,
+ 1000.0 * elapsed_time / num_loops);
+
+ delete[] input_;
+ delete[] output;
+ delete[] dsta;
+}
+
+void AV1HighbdWarpFilterTest::RunCheckOutput(
+ highbd_warp_affine_func test_impl) {
+ const int w = 128, h = 128;
+ const int border = 16;
+ const int stride = w + 2 * border;
+ HighbdWarpTestParam param = GET_PARAM(0);
+ const int is_alpha_zero = GET_PARAM(1);
+ const int is_beta_zero = GET_PARAM(2);
+ const int is_gamma_zero = GET_PARAM(3);
+ const int is_delta_zero = GET_PARAM(4);
+ const int out_w = ::testing::get<0>(param), out_h = ::testing::get<1>(param);
+ const int bd = ::testing::get<3>(param);
+ const int num_iters = ::testing::get<2>(param);
+ const int mask = (1 << bd) - 1;
+ int i, j, sub_x, sub_y;
+
+ // The warp functions always write rows with widths that are multiples of 8.
+ // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h;
+ uint16_t *input_ = new uint16_t[h * stride];
+ uint16_t *input = input_ + border;
+ uint16_t *output = new uint16_t[output_n];
+ uint16_t *output2 = new uint16_t[output_n];
+ int32_t mat[8];
+ int16_t alpha, beta, gamma, delta;
+ ConvolveParams conv_params = get_conv_params(0, 0, bd);
+ CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n];
+ CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n];
+ for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16();
+
+ for (i = 0; i < num_iters; ++i) {
+ // Generate an input block and extend its borders horizontally
+ for (int r = 0; r < h; ++r)
+ for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask;
+ for (int r = 0; r < h; ++r) {
+ for (int c = 0; c < border; ++c) {
+ input[r * stride - border + c] = input[r * stride];
+ input[r * stride + w + c] = input[r * stride + (w - 1)];
+ }
+ }
+ const int use_no_round = rnd_.Rand8() & 1;
+ for (sub_x = 0; sub_x < 2; ++sub_x)
+ for (sub_y = 0; sub_y < 2; ++sub_y) {
+ generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta,
+ is_alpha_zero, is_beta_zero, is_gamma_zero,
+ is_delta_zero);
+ for (int ii = 0; ii < 2; ++ii) {
+ for (int jj = 0; jj < 5; ++jj) {
+ for (int do_average = 0; do_average <= 1; ++do_average) {
+ if (use_no_round) {
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+ } else {
+ conv_params = get_conv_params(0, 0, bd);
+ }
+ if (jj >= 4) {
+ conv_params.use_jnt_comp_avg = 0;
+ } else {
+ conv_params.use_jnt_comp_avg = 1;
+ conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+ }
+
+ av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
+ out_w, out_h, out_w, sub_x, sub_y, bd,
+ &conv_params, alpha, beta, gamma, delta);
+ if (use_no_round) {
+ // TODO(angiebird): Change this to test_impl once we have SIMD
+ // implementation
+ conv_params =
+ get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
+ }
+ if (jj >= 4) {
+ conv_params.use_jnt_comp_avg = 0;
+ } else {
+ conv_params.use_jnt_comp_avg = 1;
+ conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+ }
+ test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
+ out_w, sub_x, sub_y, bd, &conv_params, alpha, beta,
+ gamma, delta);
+
+ if (use_no_round) {
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(dsta[j], dstb[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ } else {
+ for (j = 0; j < out_w * out_h; ++j)
+ ASSERT_EQ(output[j], output2[j])
+ << "Pixel mismatch at index " << j << " = ("
+ << (j % out_w) << ", " << (j / out_w) << ") on iteration "
+ << i;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ delete[] input_;
+ delete[] output;
+ delete[] output2;
+ delete[] dsta;
+ delete[] dstb;
+}
+} // namespace AV1HighbdWarpFilter
+} // namespace libaom_test
diff --git a/third_party/aom/test/warp_filter_test_util.h b/third_party/aom/test/warp_filter_test_util.h
new file mode 100644
index 000000000..b8998e5c8
--- /dev/null
+++ b/third_party/aom/test/warp_filter_test_util.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+#define AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/common_data.h"
+
+namespace libaom_test {
+
+void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
+ int16_t *alpha, int16_t *beta, int16_t *gamma,
+ int16_t *delta, int is_alpha_zero, int is_beta_zero,
+ int is_gamma_zero, int is_delta_zero);
+
+namespace AV1WarpFilter {
+
+typedef void (*warp_affine_func)(const int32_t *mat, const uint8_t *ref,
+ int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta);
+
+typedef ::testing::tuple<int, int, int, warp_affine_func> WarpTestParam;
+typedef ::testing::tuple<WarpTestParam, int, int, int, int> WarpTestParams;
+
+::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
+ warp_affine_func filter);
+
+class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParams> {
+ public:
+ virtual ~AV1WarpFilterTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput(warp_affine_func test_impl);
+ void RunSpeedTest(warp_affine_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+} // namespace AV1WarpFilter
+
+namespace AV1HighbdWarpFilter {
+typedef void (*highbd_warp_affine_func)(const int32_t *mat, const uint16_t *ref,
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ int bd, ConvolveParams *conv_params,
+ int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta);
+
+typedef ::testing::tuple<int, int, int, int, highbd_warp_affine_func>
+ HighbdWarpTestParam;
+typedef ::testing::tuple<HighbdWarpTestParam, int, int, int, int>
+ HighbdWarpTestParams;
+
+::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
+ highbd_warp_affine_func filter);
+
+class AV1HighbdWarpFilterTest
+ : public ::testing::TestWithParam<HighbdWarpTestParams> {
+ public:
+ virtual ~AV1HighbdWarpFilterTest();
+ virtual void SetUp();
+
+ virtual void TearDown();
+
+ protected:
+ void RunCheckOutput(highbd_warp_affine_func test_impl);
+ void RunSpeedTest(highbd_warp_affine_func test_impl);
+
+ libaom_test::ACMRandom rnd_;
+};
+
+} // namespace AV1HighbdWarpFilter
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_WARP_FILTER_TEST_UTIL_H_
diff --git a/third_party/aom/test/webm_video_source.h b/third_party/aom/test/webm_video_source.h
new file mode 100644
index 000000000..bb3d11735
--- /dev/null
+++ b/third_party/aom/test/webm_video_source.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#define AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of WebM files,
+// so that we can do actual file decodes.
+class WebMVideoSource : public CompressedVideoSource {
+ public:
+ explicit WebMVideoSource(const std::string &file_name)
+ : file_name_(file_name), aom_ctx_(new AvxInputContext()),
+ webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_sz_(0),
+ frame_number_(0), end_of_file_(false) {}
+
+ virtual ~WebMVideoSource() {
+ if (aom_ctx_->file != NULL) fclose(aom_ctx_->file);
+ webm_free(webm_ctx_);
+ delete aom_ctx_;
+ delete webm_ctx_;
+ }
+
+ virtual void Init() {}
+
+ virtual void Begin() {
+ aom_ctx_->file = OpenTestDataFile(file_name_);
+ ASSERT_TRUE(aom_ctx_->file != NULL)
+ << "Input file open failed. Filename: " << file_name_;
+
+ ASSERT_EQ(file_is_webm(webm_ctx_, aom_ctx_), 1) << "file is not WebM";
+
+ FillFrame();
+ }
+
+ virtual void Next() {
+ ++frame_number_;
+ FillFrame();
+ }
+
+ void FillFrame() {
+ ASSERT_TRUE(aom_ctx_->file != NULL);
+ const int status = webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
+ ASSERT_GE(status, 0) << "webm_read_frame failed";
+ if (status == 1) {
+ end_of_file_ = true;
+ }
+ }
+
+ void SeekToNextKeyFrame() {
+ ASSERT_TRUE(aom_ctx_->file != NULL);
+ do {
+ const int status =
+ webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
+ ASSERT_GE(status, 0) << "webm_read_frame failed";
+ ++frame_number_;
+ if (status == 1) {
+ end_of_file_ = true;
+ }
+ } while (!webm_ctx_->is_key_frame && !end_of_file_);
+ }
+
+ virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; }
+ virtual size_t frame_size() const { return frame_sz_; }
+ virtual unsigned int frame_number() const { return frame_number_; }
+
+ protected:
+ std::string file_name_;
+ AvxInputContext *aom_ctx_;
+ WebmInputContext *webm_ctx_;
+ uint8_t *buf_; // Owned by webm_ctx_ and freed when webm_ctx_ is freed.
+ size_t buf_sz_;
+ size_t frame_sz_;
+ unsigned int frame_number_;
+ bool end_of_file_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_WEBM_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/wiener_test.cc b/third_party/aom/test/wiener_test.cc
new file mode 100644
index 000000000..dfec09119
--- /dev/null
+++ b/third_party/aom/test/wiener_test.cc
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/pickrst.h"
+
+#define MAX_WIENER_BLOCK 384
+#define MAX_DATA_BLOCK (MAX_WIENER_BLOCK + WIENER_WIN)
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+ int i, j, k, l, m, n;
+ const int pixel_count = (h_end - h_start) * (v_end - v_start);
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ std::vector<std::vector<int64_t> > M_int(wiener_win,
+ std::vector<int64_t>(wiener_win, 0));
+ std::vector<std::vector<int64_t> > H_int(
+ wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
+ std::vector<std::vector<int32_t> > sumY(wiener_win,
+ std::vector<int32_t>(wiener_win, 0));
+ int32_t sumX = 0;
+ const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j += 2) {
+ const uint8_t X1 = src[i * src_stride + j];
+ const uint8_t X2 = src[i * src_stride + j + 1];
+ sumX += X1 + X2;
+
+ const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+ int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+ const uint8_t D1 = dgd_ijkl[0];
+ const uint8_t D2 = dgd_ijkl[1];
+ sumY[k][l] += D1 + D2;
+ M_int[l][k] += D1 * X1 + D2 * X2;
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+ D2 * dgd_ij[n + dgd_stride * m + 1];
+ }
+ }
+ }
+ }
+ }
+ }
+
+ const double avg_square_sum = avg * avg * pixel_count;
+ for (k = 0; k < wiener_win; k++) {
+ for (l = 0; l < wiener_win; l++) {
+ M[l * wiener_win + k] =
+ M_int[l][k] + avg_square_sum - avg * (sumX + sumY[k][l]);
+ for (m = 0; m < wiener_win; m++) {
+ for (n = 0; n < wiener_win; n++) {
+ H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+ H_int[(l * wiener_win + k)][n * 8 + m] + avg_square_sum -
+ avg * (sumY[k][l] + sumY[n][m]);
+ }
+ }
+ }
+ }
+}
+
+void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
+ if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) {
+ compute_stats_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+ } else {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M, H);
+ }
+}
+
+static const int kIterations = 100;
+static const double min_error = (double)(0.01);
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+ const uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H);
+
+typedef libaom_test::FuncParam<compute_stats_Func> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+typedef ::testing::tuple<const compute_stats_Func> WienerTestParam;
+
+class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+ virtual void SetUp() { target_func_ = GET_PARAM(0); }
+ void runWienerTest(const int32_t wiener_win, int32_t run_times);
+ void runWienerTest_ExtremeValues(const int32_t wiener_win);
+
+ private:
+ compute_stats_Func target_func_;
+ ACMRandom rng_;
+};
+
+void WienerTest::runWienerTest(const int32_t wiener_win, int32_t run_times) {
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+ DECLARE_ALIGNED(32, uint8_t, dgd_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, uint8_t, src_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, double, M_ref[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, M_test[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_test[WIENER_WIN2 * WIENER_WIN2]);
+ const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+ int h_end =
+ run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+ const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+ int v_end =
+ run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+ const int dgd_stride = h_end;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int iters = run_times == 1 ? kIterations : 2;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_buf[i] = rng_.Rand8();
+ src_buf[i] = rng_.Rand8();
+ }
+ uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+ uint8_t *src = src_buf;
+
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_ref, H_ref);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_times; ++i) {
+ target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_test, H_test);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ if (run_times > 10) {
+ printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
+ time2);
+ printf("(%3.2f)\n", time1 / time2);
+ }
+ int failed = 0;
+ for (int i = 0; i < wiener_win2; ++i) {
+ if (fabs(M_ref[i] - M_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d M iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, M_ref[i], M_test[i]);
+ break;
+ }
+ }
+ // ASSERT_EQ(failed, 0);
+ for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+ if (fabs(H_ref[i] - H_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d H iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, H_ref[i], H_test[i]);
+ break;
+ }
+ }
+ ASSERT_EQ(failed, 0);
+ }
+}
+
+void WienerTest::runWienerTest_ExtremeValues(const int32_t wiener_win) {
+ const int32_t wiener_halfwin = wiener_win >> 1;
+ const int32_t wiener_win2 = wiener_win * wiener_win;
+ DECLARE_ALIGNED(32, uint8_t, dgd_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, uint8_t, src_buf[MAX_DATA_BLOCK * MAX_DATA_BLOCK]);
+ DECLARE_ALIGNED(32, double, M_ref[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, M_test[WIENER_WIN2]);
+ DECLARE_ALIGNED(32, double, H_test[WIENER_WIN2 * WIENER_WIN2]);
+ const int h_start = 16;
+ const int h_end = MAX_WIENER_BLOCK;
+ const int v_start = 16;
+ const int v_end = MAX_WIENER_BLOCK;
+ const int dgd_stride = h_end;
+ const int src_stride = MAX_DATA_BLOCK;
+ const int iters = 1;
+ for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+ dgd_buf[i] = 255;
+ src_buf[i] = 255;
+ }
+ uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+ uint8_t *src = src_buf;
+
+ av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_ref, H_ref);
+
+ target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+ dgd_stride, src_stride, M_test, H_test);
+
+ int failed = 0;
+ for (int i = 0; i < wiener_win2; ++i) {
+ if (fabs(M_ref[i] - M_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d M iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, M_ref[i], M_test[i]);
+ break;
+ }
+ }
+ // ASSERT_EQ(failed, 0);
+ for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+ if (fabs(H_ref[i] - H_test[i]) > min_error) {
+ failed = 1;
+ printf("win %d H iter %d [%4d] ref %6.0f test %6.0f \n", wiener_win,
+ iter, i, H_ref[i], H_test[i]);
+ break;
+ }
+ }
+ ASSERT_EQ(failed, 0);
+ }
+}
+
+TEST_P(WienerTest, RandomValues) {
+ runWienerTest(WIENER_WIN, 1);
+ runWienerTest(WIENER_WIN_CHROMA, 1);
+}
+
+TEST_P(WienerTest, ExtremeValues) {
+ runWienerTest_ExtremeValues(WIENER_WIN);
+ runWienerTest_ExtremeValues(WIENER_WIN_CHROMA);
+}
+
+TEST_P(WienerTest, DISABLED_Speed) {
+ runWienerTest(WIENER_WIN, 200);
+ runWienerTest(WIENER_WIN_CHROMA, 200);
+}
+
+INSTANTIATE_TEST_CASE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, WienerTest,
+ ::testing::Values(av1_compute_stats_sse4_1));
+#endif // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_CASE_P(AVX2, WienerTest,
+ ::testing::Values(av1_compute_stats_avx2));
+#endif // HAVE_AVX2
+
+} // namespace
diff --git a/third_party/aom/test/y4m_test.cc b/third_party/aom/test/y4m_test.cc
new file mode 100644
index 000000000..6cc75ef5b
--- /dev/null
+++ b/third_party/aom/test/y4m_test.cc
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/y4menc.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+using std::string;
+
+static const unsigned int kWidth = 160;
+static const unsigned int kHeight = 90;
+static const unsigned int kFrames = 10;
+
+struct Y4mTestParam {
+ const char *filename;
+ unsigned int bit_depth;
+ aom_img_fmt format;
+ const char *md5raw;
+};
+
+const Y4mTestParam kY4mTestVectors[] = {
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420,
+ "e5406275b9fc6bb3436c31d4a05c1cab" },
+ { "park_joy_90p_8_420_monochrome.y4m", 8, AOM_IMG_FMT_I420,
+ "95ef5bf6218580588be24a5271bb6a7f" },
+ { "park_joy_90p_8_420_vertical_csp.y4m", 8, AOM_IMG_FMT_I420,
+ "f53a40fec15254ac312527339d9c686b" },
+ { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422,
+ "284a47a47133b12884ec3a14e959a0b6" },
+ { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444,
+ "90517ff33843d85de712fd4fe60dbed0" },
+ { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016,
+ "63f21f9f717d8b8631bd2288ee87137b" },
+ { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216,
+ "48ab51fb540aed07f7ff5af130c9b605" },
+ { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416,
+ "067bfd75aa85ff9bae91fa3e0edd1e3e" },
+ { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016,
+ "9e6d8f6508c6e55625f6b697bc461cef" },
+ { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216,
+ "b239c6b301c0b835485be349ca83a7e3" },
+ { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416,
+ "5a6481a550821dab6d0192f5c63845e9" },
+};
+
+static const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
+
+class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
+ public ::libaom_test::Y4mVideoSource {
+ protected:
+ Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
+
+ virtual ~Y4mVideoSourceTest() { CloseSource(); }
+
+ virtual void Init(const std::string &file_name, int limit) {
+ file_name_ = file_name;
+ start_ = 0;
+ limit_ = limit;
+ frame_ = 0;
+ Begin();
+ }
+
+ // Checks y4m header information
+ void HeaderChecks(unsigned int bit_depth, aom_img_fmt_t fmt) {
+ ASSERT_TRUE(input_file_ != NULL);
+ ASSERT_EQ(y4m_.pic_w, (int)kWidth);
+ ASSERT_EQ(y4m_.pic_h, (int)kHeight);
+ ASSERT_EQ(img()->d_w, kWidth);
+ ASSERT_EQ(img()->d_h, kHeight);
+ ASSERT_EQ(y4m_.bit_depth, bit_depth);
+ ASSERT_EQ(y4m_.aom_fmt, fmt);
+ if (fmt == AOM_IMG_FMT_I420 || fmt == AOM_IMG_FMT_I42016) {
+ ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2);
+ ASSERT_EQ(img()->x_chroma_shift, 1U);
+ ASSERT_EQ(img()->y_chroma_shift, 1U);
+ }
+ if (fmt == AOM_IMG_FMT_I422 || fmt == AOM_IMG_FMT_I42216) {
+ ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2);
+ ASSERT_EQ(img()->x_chroma_shift, 1U);
+ ASSERT_EQ(img()->y_chroma_shift, 0U);
+ }
+ if (fmt == AOM_IMG_FMT_I444 || fmt == AOM_IMG_FMT_I44416) {
+ ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3);
+ ASSERT_EQ(img()->x_chroma_shift, 0U);
+ ASSERT_EQ(img()->y_chroma_shift, 0U);
+ }
+ }
+
+ // Checks MD5 of the raw frame data
+ void Md5Check(const string &expected_md5) {
+ ASSERT_TRUE(input_file_ != NULL);
+ libaom_test::MD5 md5;
+ for (unsigned int i = start_; i < limit_; i++) {
+ md5.Add(img());
+ Next();
+ }
+ ASSERT_EQ(string(md5.Get()), expected_md5);
+ }
+};
+
+TEST_P(Y4mVideoSourceTest, SourceTest) {
+ const Y4mTestParam t = GetParam();
+ Init(t.filename, kFrames);
+ HeaderChecks(t.bit_depth, t.format);
+ Md5Check(t.md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoSourceTest,
+ ::testing::ValuesIn(kY4mTestVectors));
+
+class Y4mVideoWriteTest : public Y4mVideoSourceTest {
+ protected:
+ Y4mVideoWriteTest() : tmpfile_(NULL) {}
+
+ virtual ~Y4mVideoWriteTest() {
+ delete tmpfile_;
+ input_file_ = NULL;
+ }
+
+ void ReplaceInputFile(FILE *input_file) {
+ CloseSource();
+ frame_ = 0;
+ input_file_ = input_file;
+ rewind(input_file_);
+ ReadSourceToStart();
+ }
+
+ // Writes out a y4m file and then reads it back
+ void WriteY4mAndReadBack() {
+ ASSERT_TRUE(input_file_ != NULL);
+ char buf[Y4M_BUFFER_SIZE] = { 0 };
+ const struct AvxRational framerate = { y4m_.fps_n, y4m_.fps_d };
+ tmpfile_ = new libaom_test::TempOutFile;
+ ASSERT_TRUE(tmpfile_->file() != NULL);
+ y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate,
+ img()->monochrome, img()->csp, y4m_.aom_fmt,
+ y4m_.bit_depth);
+ fputs(buf, tmpfile_->file());
+ for (unsigned int i = start_; i < limit_; i++) {
+ y4m_write_frame_header(buf, sizeof(buf));
+ fputs(buf, tmpfile_->file());
+ y4m_write_image_file(img(), PLANES_YUV, tmpfile_->file());
+ Next();
+ }
+ ReplaceInputFile(tmpfile_->file());
+ }
+
+ virtual void Init(const std::string &file_name, int limit) {
+ Y4mVideoSourceTest::Init(file_name, limit);
+ WriteY4mAndReadBack();
+ }
+ libaom_test::TempOutFile *tmpfile_;
+};
+
+TEST_P(Y4mVideoWriteTest, WriteTest) {
+ const Y4mTestParam t = GetParam();
+ Init(t.filename, kFrames);
+ HeaderChecks(t.bit_depth, t.format);
+ Md5Check(t.md5raw);
+}
+
+INSTANTIATE_TEST_CASE_P(C, Y4mVideoWriteTest,
+ ::testing::ValuesIn(kY4mTestVectors));
+} // namespace
diff --git a/third_party/aom/test/y4m_video_source.h b/third_party/aom/test/y4m_video_source.h
new file mode 100644
index 000000000..3dea901e6
--- /dev/null
+++ b/third_party/aom/test/y4m_video_source.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#define AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#include <algorithm>
+#include <string>
+
+#include "common/y4minput.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of raw yv12
+// so that we can do actual file encodes.
+class Y4mVideoSource : public VideoSource {
+ public:
+ Y4mVideoSource(const std::string &file_name, unsigned int start, int limit)
+ : file_name_(file_name), input_file_(NULL), img_(new aom_image_t()),
+ start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
+ framerate_denominator_(0), y4m_() {}
+
+ virtual ~Y4mVideoSource() {
+ aom_img_free(img_.get());
+ CloseSource();
+ }
+
+ virtual void OpenSource() {
+ CloseSource();
+ input_file_ = OpenTestDataFile(file_name_);
+ ASSERT_TRUE(input_file_ != NULL)
+ << "Input file open failed. Filename: " << file_name_;
+ }
+
+ virtual void ReadSourceToStart() {
+ ASSERT_TRUE(input_file_ != NULL);
+ ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
+ framerate_numerator_ = y4m_.fps_n;
+ framerate_denominator_ = y4m_.fps_d;
+ frame_ = 0;
+ for (unsigned int i = 0; i < start_; i++) {
+ Next();
+ }
+ FillFrame();
+ }
+
+ virtual void Begin() {
+ OpenSource();
+ ReadSourceToStart();
+ }
+
+ virtual void Next() {
+ ++frame_;
+ FillFrame();
+ }
+
+ virtual aom_image_t *img() const {
+ return (frame_ < limit_) ? img_.get() : NULL;
+ }
+
+ // Models a stream where Timebase = 1/FPS, so pts == frame.
+ virtual aom_codec_pts_t pts() const { return frame_; }
+
+ virtual unsigned long duration() const { return 1; }
+
+ virtual aom_rational_t timebase() const {
+ const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
+ return t;
+ }
+
+ virtual unsigned int frame() const { return frame_; }
+
+ virtual unsigned int limit() const { return limit_; }
+
+ virtual void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
+ // Read a frame from input_file.
+ y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
+ }
+
+ // Swap buffers with another y4m source. This allows reading a new frame
+  // while keeping the old frame around. A whole Y4mVideoSource is required and
+  // not just an aom_image_t because of how the y4m reader manipulates
+  // aom_image_t internals.
+ void SwapBuffers(Y4mVideoSource *other) {
+ std::swap(other->y4m_.dst_buf, y4m_.dst_buf);
+ aom_image_t *tmp;
+ tmp = other->img_.release();
+ other->img_.reset(img_.release());
+ img_.reset(tmp);
+ }
+
+ protected:
+ void CloseSource() {
+ y4m_input_close(&y4m_);
+ y4m_ = y4m_input();
+ if (input_file_ != NULL) {
+ fclose(input_file_);
+ input_file_ = NULL;
+ }
+ }
+
+ std::string file_name_;
+ FILE *input_file_;
+ testing::internal::scoped_ptr<aom_image_t> img_;
+ unsigned int start_;
+ unsigned int limit_;
+ unsigned int frame_;
+ int framerate_numerator_;
+ int framerate_denominator_;
+ y4m_input y4m_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/third_party/aom/test/yuv_video_source.h b/third_party/aom/test/yuv_video_source.h
new file mode 100644
index 000000000..774ecc008
--- /dev/null
+++ b/third_party/aom/test/yuv_video_source.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_YUV_VIDEO_SOURCE_H_
+#define AOM_TEST_YUV_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/video_source.h"
+#include "aom/aom_image.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of raw YUV
+// formats of various color sampling and bit-depths so that we can
+// do actual file encodes.
+class YUVVideoSource : public VideoSource {
+ public:
+ YUVVideoSource(const std::string &file_name, aom_img_fmt format,
+ unsigned int width, unsigned int height, int rate_numerator,
+ int rate_denominator, unsigned int start, int limit)
+ : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start),
+ limit_(limit), frame_(0), width_(0), height_(0),
+ format_(AOM_IMG_FMT_NONE), framerate_numerator_(rate_numerator),
+ framerate_denominator_(rate_denominator) {
+ // This initializes format_, raw_size_, width_, height_ and allocates img.
+ SetSize(width, height, format);
+ }
+
+ virtual ~YUVVideoSource() {
+ aom_img_free(img_);
+ if (input_file_) fclose(input_file_);
+ }
+
+ virtual void Begin() {
+ if (input_file_) fclose(input_file_);
+ input_file_ = OpenTestDataFile(file_name_);
+ ASSERT_TRUE(input_file_ != NULL)
+ << "Input file open failed. Filename: " << file_name_;
+ if (start_)
+ fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
+
+ frame_ = start_;
+ FillFrame();
+ }
+
+ virtual void Next() {
+ ++frame_;
+ FillFrame();
+ }
+
+ virtual aom_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+
+ // Models a stream where Timebase = 1/FPS, so pts == frame.
+ virtual aom_codec_pts_t pts() const { return frame_; }
+
+ virtual unsigned long duration() const { return 1; }
+
+ virtual aom_rational_t timebase() const {
+ const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
+ return t;
+ }
+
+ virtual unsigned int frame() const { return frame_; }
+
+ virtual unsigned int limit() const { return limit_; }
+
+ virtual void SetSize(unsigned int width, unsigned int height,
+ aom_img_fmt format) {
+ if (width != width_ || height != height_ || format != format_) {
+ aom_img_free(img_);
+ img_ = aom_img_alloc(NULL, format, width, height, 1);
+ ASSERT_TRUE(img_ != NULL);
+ width_ = width;
+ height_ = height;
+ format_ = format;
+ switch (format) {
+ case AOM_IMG_FMT_I420: raw_size_ = width * height * 3 / 2; break;
+ case AOM_IMG_FMT_I422: raw_size_ = width * height * 2; break;
+ case AOM_IMG_FMT_I444: raw_size_ = width * height * 3; break;
+ case AOM_IMG_FMT_I42016: raw_size_ = width * height * 3; break;
+ case AOM_IMG_FMT_I42216: raw_size_ = width * height * 4; break;
+ case AOM_IMG_FMT_I44416: raw_size_ = width * height * 6; break;
+ default: ASSERT_TRUE(0);
+ }
+ }
+ }
+
+ virtual void FillFrame() {
+ ASSERT_TRUE(input_file_ != NULL);
+ // Read a frame from input_file.
+ if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
+ limit_ = frame_;
+ }
+ }
+
+ protected:
+ std::string file_name_;
+ FILE *input_file_;
+ aom_image_t *img_;
+ size_t raw_size_;
+ unsigned int start_;
+ unsigned int limit_;
+ unsigned int frame_;
+ unsigned int width_;
+ unsigned int height_;
+ aom_img_fmt format_;
+ int framerate_numerator_;
+ int framerate_denominator_;
+};
+
+} // namespace libaom_test
+
+#endif // AOM_TEST_YUV_VIDEO_SOURCE_H_
diff --git a/third_party/aom/third_party/fastfeat/LICENSE b/third_party/aom/third_party/fastfeat/LICENSE
new file mode 100644
index 000000000..f347008d6
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2006, 2008 Edward Rosten
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+
+ *Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ *Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ *Neither the name of the University of Cambridge nor the names of
+ its contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/aom/third_party/fastfeat/README.libvpx b/third_party/aom/third_party/fastfeat/README.libvpx
new file mode 100644
index 000000000..1e58a303b
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/README.libvpx
@@ -0,0 +1,39 @@
+URL: https://github.com/edrosten/fast-C-src
+Version: 391d5e939eb1545d24c10533d7de424db8d9c191
+License: BSD
+License File: LICENSE
+
+Description:
+Library to compute FAST features with non-maximum suppression.
+
+The files are valid C and C++ code with no special compilation requirements,
+and they do not depend on any libraries. Just compile them along with the rest
+of your project.
+
+To use the functions, #include "fast.h"
+
+The corner detectors have the following prototype (where X is 9, 10, 11 or 12):
+
+xy* fastX_detect_nonmax(const unsigned char * data, int xsize, int ysize, int stride, int threshold, int* numcorners)
+
+Where xy is the following simple struct typedef:
+
+typedef struct
+{
+ int x, y;
+} xy;
+
+The image is passed in as a block of data and dimensions, and the list of
+corners is returned as an array of xy structs, and an integer (numcorners)
+with the number of corners returned. The data can be deallocated with free().
+Nonmaximal suppression is performed on the corners. Note that the stride
+is the number of bytes between rows. If your image has no padding, then this
+is the same as xsize.
+
+The detection, scoring and nonmaximal suppression are available as individual
+functions. To see how to use the individual functions, see fast.c
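+
+A minimal usage sketch (not part of the upstream README; the function name,
+buffer arguments and the threshold value 20 below are illustrative assumptions,
+not values prescribed by the library):
+
+/* image: xsize*ysize grayscale pixels, `stride` bytes per row. */
+#include <stdlib.h>
+#include "fast.h"
+
+void find_corners(const unsigned char* image, int xsize, int ysize, int stride)
+{
+  int numcorners = 0;
+  /* Detect FAST-9 corners with nonmax suppression; 20 is an example threshold. */
+  xy* corners = fast9_detect_nonmax(image, xsize, ysize, stride, 20, &numcorners);
+  /* corners[0..numcorners-1] now hold the (x, y) corner positions. */
+  free(corners);
+}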
+
+Local Modifications:
+Add lines to turn off clang formatting for these files
+Remove Fast 10, 11 and 12
+Convert tabs to spaces
diff --git a/third_party/aom/third_party/fastfeat/fast.c b/third_party/aom/third_party/fastfeat/fast.c
new file mode 100644
index 000000000..0d7efc154
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/fast.c
@@ -0,0 +1,22 @@
+// clang-format off
+#include <stdlib.h>
+#include "fast.h"
+
+
+xy* fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+ xy* corners;
+ int num_corners;
+ int* scores;
+ xy* nonmax;
+
+ corners = fast9_detect(im, xsize, ysize, stride, b, &num_corners);
+ scores = fast9_score(im, stride, corners, num_corners, b);
+ nonmax = nonmax_suppression(corners, scores, num_corners, ret_num_corners);
+
+ free(corners);
+ free(scores);
+
+ return nonmax;
+}
+// clang-format on
diff --git a/third_party/aom/third_party/fastfeat/fast.h b/third_party/aom/third_party/fastfeat/fast.h
new file mode 100644
index 000000000..a00730e3d
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/fast.h
@@ -0,0 +1,20 @@
+// clang-format off
+#ifndef FAST_H
+#define FAST_H
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int fast9_corner_score(const byte* p, const int pixel[], int bstart);
+
+xy* fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+int* fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b);
+
+xy* fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+xy* nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax);
+
+
+#endif
+// clang-format on
diff --git a/third_party/aom/third_party/fastfeat/fast_9.c b/third_party/aom/third_party/fastfeat/fast_9.c
new file mode 100644
index 000000000..ec167a953
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/fast_9.c
@@ -0,0 +1,5911 @@
+// clang-format off
+/*This is mechanically generated code*/
+#include <stdlib.h>
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int fast9_corner_score(const byte* p, const int pixel[], int bstart)
+{
+ int bmin = bstart;
+ int bmax = 255;
+ int b = (bmax + bmin)/2;
+
+ /*Compute the score using binary search*/
+ for(;;)
+ {
+ int cb = *p + b;
+ int c_b= *p - b;
+
+
+ if( p[pixel[0]] > cb)
+ if( p[pixel[1]] > cb)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[6]] < c_b)
+ if( p[pixel[15]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[5]] < c_b)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[6]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[6]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[4]] < c_b)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[3]] < c_b)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[2]] < c_b)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[1]] < c_b)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[0]] < c_b)
+ if( p[pixel[1]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[1]] < c_b)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[6]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[15]] < c_b)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[6]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[9]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[8]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[1]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[1]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+
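+/* The generated tree above tightens the bisection bounds on the threshold:
+   a passing segment test raises bmin, a failing one lowers bmax, and the
+   loop returns bmin (the corner score) once the interval has closed. */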
+is_a_corner:
+ bmin=b;
+ goto end_if;
+
+is_not_a_corner:
+ bmax=b;
+ goto end_if;
+
+end_if:
+
+ if(bmin == bmax - 1 || bmin == bmax)
+ return bmin;
+ b = (bmin + bmax) / 2;
+ }
+}
+
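+/* Fill pixel[] with the byte offsets of the 16 samples on a Bresenham circle
+   of radius 3 around the candidate pixel, for an image with the given
+   row_stride. */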
+static void make_offsets(int pixel[], int row_stride)
+{
+ pixel[0] = 0 + row_stride * 3;
+ pixel[1] = 1 + row_stride * 3;
+ pixel[2] = 2 + row_stride * 2;
+ pixel[3] = 3 + row_stride * 1;
+ pixel[4] = 3 + row_stride * 0;
+ pixel[5] = 3 + row_stride * -1;
+ pixel[6] = 2 + row_stride * -2;
+ pixel[7] = 1 + row_stride * -3;
+ pixel[8] = 0 + row_stride * -3;
+ pixel[9] = -1 + row_stride * -3;
+ pixel[10] = -2 + row_stride * -2;
+ pixel[11] = -3 + row_stride * -1;
+ pixel[12] = -3 + row_stride * 0;
+ pixel[13] = -3 + row_stride * 1;
+ pixel[14] = -2 + row_stride * 2;
+ pixel[15] = -1 + row_stride * 3;
+}
+
+
+
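+/* For each previously detected corner, compute its FAST score: the largest
+   threshold at which the segment test still passes, found by the bisection
+   in fast9_corner_score(). The caller owns the returned malloc'd array. */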
+int* fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b)
+{
+ int* scores = (int*)malloc(sizeof(int)* num_corners);
+ int n;
+
+ int pixel[16];
+ make_offsets(pixel, stride);
+
+ for(n=0; n < num_corners; n++)
+ scores[n] = fast9_corner_score(i + corners[n].y*stride + corners[n].x, pixel, b);
+
+ return scores;
+}
+
+
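+/* Scan every pixel at least 3 px from the image border and run the generated
+   FAST-9 segment test on its 16-pixel ring; accepted candidates are returned
+   in a malloc'd xy array (initial capacity rsize) with the count written to
+   ret_num_corners. */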
+xy* fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+ int num_corners=0;
+ xy* ret_corners;
+ int rsize=512;
+ int pixel[16];
+ int x, y;
+
+ ret_corners = (xy*)malloc(sizeof(xy)*rsize);
+ make_offsets(pixel, stride);
+
+ for(y=3; y < ysize - 3; y++)
+ for(x=3; x < xsize - 3; x++)
+ {
+ const byte* p = im + y*stride + x;
+
+ int cb = *p + b;
+ int c_b= *p - b;
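+      /* Machine-generated FAST-9 decision tree: the empty {} branches accept
+         the candidate (execution continues past the tree), while `continue`
+         rejects it and moves on to the next pixel. */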
+ if(p[pixel[0]] > cb)
+ if(p[pixel[1]] > cb)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[6]] < c_b)
+ if(p[pixel[15]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[5]] < c_b)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[6]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[6]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[4]] < c_b)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ {}
+ else
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ {}
+ else
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[3]] < c_b)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ {}
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ {}
+ else
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[2]] < c_b)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ {}
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ {}
+ else
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[1]] < c_b)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ {}
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ {}
+ else
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[0]] < c_b)
+ if(p[pixel[1]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ {}
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[1]] < c_b)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ {}
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ {}
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ {}
+ else
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[6]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[15]] < c_b)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[6]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ {}
+ else
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ {}
+ else
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[9]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ {}
+ else
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[8]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ {}
+ else
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[1]] > cb)
+ {}
+ else
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[1]] < c_b)
+ {}
+ else
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ if(num_corners == rsize)
+ {
+ rsize*=2;
+ ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize);
+ }
+ ret_corners[num_corners].x = x;
+ ret_corners[num_corners].y = y;
+ num_corners++;
+
+ }
+
+ *ret_num_corners = num_corners;
+ return ret_corners;
+
+}
+
+// clang-format on
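The wall of nested if/else branches that ends here is the machine-generated unrolling of the FAST-9 segment test: a pixel passes when at least 9 contiguous pixels on its 16-pixel Bresenham circle are all brighter than the centre value plus the barrier (cb) or all darker than the centre value minus the barrier (c_b). A compact, unoptimised reference formulation of the same decision is sketched below for orientation only; it assumes the same p, pixel[16], cb and c_b as the surrounding code, and the function name is illustrative, not part of the library.

    /* Reference FAST-9 segment test (slow but readable): returns nonzero when
       some arc of at least 9 contiguous circle pixels is entirely brighter
       than cb or entirely darker than c_b.  p points at the centre pixel and
       pixel[] holds the 16 precomputed circle offsets. */
    static int is_fast9_corner_ref(const unsigned char* p, const int pixel[16],
                                   int cb, int c_b)
    {
      int k, run_bright = 0, run_dark = 0;
      /* Walk the circle twice so arcs that wrap past index 15 are still seen
         as one contiguous run. */
      for (k = 0; k < 32; k++)
      {
        int v = p[pixel[k & 15]];
        run_bright = (v > cb)  ? run_bright + 1 : 0;
        run_dark   = (v < c_b) ? run_dark  + 1 : 0;
        if (run_bright >= 9 || run_dark >= 9)
          return 1;
      }
      return 0;
    }

The generated decision tree above encodes the same predicate with hand-ordered early exits over all sixteen starting positions, which is why it is so much larger and so much faster than this sketch.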
diff --git a/third_party/aom/third_party/fastfeat/nonmax.c b/third_party/aom/third_party/fastfeat/nonmax.c
new file mode 100644
index 000000000..0438c4dc1
--- /dev/null
+++ b/third_party/aom/third_party/fastfeat/nonmax.c
@@ -0,0 +1,121 @@
+// clang-format off
+#include <stdlib.h>
+#include "fast.h"
+
+
+#define Compare(X, Y) ((X)>=(Y))
+
+xy* nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax)
+{
+ int num_nonmax=0;
+ int last_row;
+ int* row_start;
+ int i, j;
+ xy* ret_nonmax;
+ const int sz = (int)num_corners;
+
+ /*Point above points (roughly) to the pixel above the one of interest, if there
+ is a feature there.*/
+ int point_above = 0;
+ int point_below = 0;
+
+
+ if(num_corners < 1)
+ {
+ *ret_num_nonmax = 0;
+ return 0;
+ }
+
+ ret_nonmax = (xy*)malloc(num_corners * sizeof(xy));
+
+ /* Find where each row begins
+ (the corners are output in raster scan order). A beginning of -1 signifies
+ that there are no corners on that row. */
+ last_row = corners[num_corners-1].y;
+ row_start = (int*)malloc((last_row+1)*sizeof(int));
+
+ for(i=0; i < last_row+1; i++)
+ row_start[i] = -1;
+
+ {
+ int prev_row = -1;
+ for(i=0; i< num_corners; i++)
+ if(corners[i].y != prev_row)
+ {
+ row_start[corners[i].y] = i;
+ prev_row = corners[i].y;
+ }
+ }
+
+
+
+ for(i=0; i < sz; i++)
+ {
+ int score = scores[i];
+ xy pos = corners[i];
+
+ /*Check left */
+ if(i > 0)
+ if(corners[i-1].x == pos.x-1 && corners[i-1].y == pos.y && Compare(scores[i-1], score))
+ continue;
+
+ /*Check right*/
+ if(i < (sz - 1))
+ if(corners[i+1].x == pos.x+1 && corners[i+1].y == pos.y && Compare(scores[i+1], score))
+ continue;
+
+ /*Check above (if there is a valid row above)*/
+ if(pos.y > 0)
+ if (row_start[pos.y - 1] != -1)
+ {
+ /*Make sure that current point_above is one
+ row above.*/
+ if(corners[point_above].y < pos.y - 1)
+ point_above = row_start[pos.y-1];
+
+ /*Make point_above point to the first of the pixels above the current point,
+ if it exists.*/
+ for(; corners[point_above].y < pos.y && corners[point_above].x < pos.x - 1; point_above++)
+ {}
+
+
+ for(j=point_above; corners[j].y < pos.y && corners[j].x <= pos.x + 1; j++)
+ {
+ int x = corners[j].x;
+ if( (x == pos.x - 1 || x ==pos.x || x == pos.x+1) && Compare(scores[j], score))
+ goto cont;
+ }
+
+ }
+
+ /*Check below (if there is anything below)*/
+ if(pos.y >= 0)
+ if (pos.y != last_row && row_start[pos.y + 1] != -1 && point_below < sz) /*Nothing below*/
+ {
+ if(corners[point_below].y < pos.y + 1)
+ point_below = row_start[pos.y+1];
+
+      /* Make point_below point to one of the pixels below the current point, if it
+ exists.*/
+ for(; point_below < sz && corners[point_below].y == pos.y+1 && corners[point_below].x < pos.x - 1; point_below++)
+ {}
+
+ for(j=point_below; j < sz && corners[j].y == pos.y+1 && corners[j].x <= pos.x + 1; j++)
+ {
+ int x = corners[j].x;
+ if( (x == pos.x - 1 || x ==pos.x || x == pos.x+1) && Compare(scores[j],score))
+ goto cont;
+ }
+ }
+
+ ret_nonmax[num_nonmax++] = corners[i];
+cont:
+ ;
+ }
+
+ free(row_start);
+ *ret_num_nonmax = num_nonmax;
+ return ret_nonmax;
+}
+
+// clang-format on
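nonmax_suppression() keeps a detected corner only when no 8-neighbouring corner has a score that Compare()s as greater-or-equal, so equal-scoring neighbours suppress each other; it relies on the corners arriving in raster-scan order, as the detector emits them. A minimal usage sketch follows; the coordinates and scores are made-up values, and it assumes xy is the plain { int x, y; } pair declared in fast.h (only the nonmax_suppression() signature is taken verbatim from this file).

    /* nonmax_demo.c -- illustrative only; compile and link together with nonmax.c. */
    #include <stdio.h>
    #include <stdlib.h>
    #include "fast.h"   /* assumed to declare xy and nonmax_suppression() */

    int main(void)
    {
      /* Three corners on one row, in raster-scan order; only the middle one
         has the locally maximal score, so only it should survive. */
      xy  corners[3] = { {10, 5}, {11, 5}, {12, 5} };
      int scores[3]  = { 30, 60, 20 };
      int num_nonmax = 0;

      xy* kept = nonmax_suppression(corners, scores, 3, &num_nonmax);

      for (int i = 0; i < num_nonmax; i++)
        printf("kept corner (%d, %d)\n", kept[i].x, kept[i].y);   /* prints (11, 5) */

      free(kept);
      return 0;
    }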
diff --git a/third_party/aom/third_party/googletest/README.libaom b/third_party/aom/third_party/googletest/README.libaom
new file mode 100644
index 000000000..9784dd51b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/README.libaom
@@ -0,0 +1,26 @@
+URL: https://github.com/google/googletest
+Version: 1.8.0
+License: BSD
+License File: LICENSE
+
+Description:
+Google's framework for writing C++ tests on a variety of platforms
+(Linux, Mac OS X, Windows, Windows CE, Symbian, etc). Based on the
+xUnit architecture. Supports automatic test discovery, a rich set of
+assertions, user-defined assertions, death tests, fatal and non-fatal
+failures, various options for running the tests, and XML test report
+generation.
+
+Local Modifications:
+- Remove everything but:
+ googletest-release-1.8.0/googletest/
+ cmake/
+ include/
+ src/
+ CHANGES
+    CMakeLists.txt
+ CONTRIBUTORS
+ LICENSE
+ README.md
+- Suppress unsigned overflow instrumentation in the LCG
+ https://github.com/google/googletest/pull/1066
diff --git a/third_party/aom/third_party/googletest/gtest.mk b/third_party/aom/third_party/googletest/gtest.mk
new file mode 100644
index 000000000..fc4dbdc24
--- /dev/null
+++ b/third_party/aom/third_party/googletest/gtest.mk
@@ -0,0 +1 @@
+GTEST_SRCS-yes += googletest/src/googletest/src/gtest-all.cc
diff --git a/third_party/aom/third_party/googletest/src/googletest/CHANGES b/third_party/aom/third_party/googletest/src/googletest/CHANGES
new file mode 100644
index 000000000..055213242
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/CHANGES
@@ -0,0 +1,157 @@
+Changes for 1.7.0:
+
+* New feature: death tests are supported on OpenBSD and in iOS
+ simulator now.
+* New feature: Google Test now implements a protocol to allow
+ a test runner to detect that a test program has exited
+ prematurely and report it as a failure (before it would be
+ falsely reported as a success if the exit code is 0).
+* New feature: Test::RecordProperty() can now be used outside of the
+ lifespan of a test method, in which case it will be attributed to
+ the current test case or the test program in the XML report.
+* New feature (potentially breaking): --gtest_list_tests now prints
+ the type parameters and value parameters for each test.
+* Improvement: char pointers and char arrays are now escaped properly
+ in failure messages.
+* Improvement: failure summary in XML reports now includes file and
+ line information.
+* Improvement: the <testsuites> XML element now has a timestamp attribute.
+* Improvement: When --gtest_filter is specified, XML report now doesn't
+ contain information about tests that are filtered out.
+* Fixed the bug where long --gtest_filter flag values are truncated in
+ death tests.
+* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a
+ function instead of a macro in order to work better with Clang.
+* Compatibility fixes with C++ 11 and various platforms.
+* Bug/warning fixes.
+
+Changes for 1.6.0:
+
+* New feature: ADD_FAILURE_AT() for reporting a test failure at the
+ given source location -- useful for writing testing utilities.
+* New feature: the universal value printer is moved from Google Mock
+ to Google Test.
+* New feature: type parameters and value parameters are reported in
+ the XML report now.
+* A gtest_disable_pthreads CMake option.
+* Colored output works in GNU Screen sessions now.
+* Parameters of value-parameterized tests are now printed in the
+ textual output.
+* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are
+ now correctly reported.
+* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to
+ ostream.
+* More complete handling of exceptions.
+* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter
+ name is already used by another library.
+* --gtest_catch_exceptions is now true by default, allowing a test
+ program to continue after an exception is thrown.
+* Value-parameterized test fixtures can now derive from Test and
+ WithParamInterface<T> separately, easing conversion of legacy tests.
+* Death test messages are clearly marked to make them more
+ distinguishable from other messages.
+* Compatibility fixes for Android, Google Native Client, MinGW, HP UX,
+ PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear),
+ IBM XL C++ (Visual Age C++), and C++0x.
+* Bug fixes and implementation clean-ups.
+* Potentially incompatible changes: disables the harmful 'make install'
+ command in autotools.
+
+Changes for 1.5.0:
+
+ * New feature: assertions can be safely called in multiple threads
+ where the pthreads library is available.
+ * New feature: predicates used inside EXPECT_TRUE() and friends
+ can now generate custom failure messages.
+ * New feature: Google Test can now be compiled as a DLL.
+ * New feature: fused source files are included.
+ * New feature: prints help when encountering unrecognized Google Test flags.
+ * Experimental feature: CMake build script (requires CMake 2.6.4+).
+ * Experimental feature: the Pump script for meta programming.
+ * double values streamed to an assertion are printed with enough precision
+ to differentiate any two different values.
+ * Google Test now works on Solaris and AIX.
+ * Build and test script improvements.
+ * Bug fixes and implementation clean-ups.
+
+ Potentially breaking changes:
+
+ * Stopped supporting VC++ 7.1 with exceptions disabled.
+ * Dropped support for 'make install'.
+
+Changes for 1.4.0:
+
+ * New feature: the event listener API
+ * New feature: test shuffling
+ * New feature: the XML report format is closer to junitreport and can
+ be parsed by Hudson now.
+ * New feature: when a test runs under Visual Studio, its failures are
+ integrated in the IDE.
+ * New feature: /MD(d) versions of VC++ projects.
+ * New feature: elapsed time for the tests is printed by default.
+ * New feature: comes with a TR1 tuple implementation such that Boost
+ is no longer needed for Combine().
+ * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends.
+ * New feature: the Xcode project can now produce static gtest
+ libraries in addition to a framework.
+ * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile,
+ Symbian, gcc, and C++Builder.
+ * Bug fixes and implementation clean-ups.
+
+Changes for 1.3.0:
+
+ * New feature: death tests on Windows, Cygwin, and Mac.
+ * New feature: ability to use Google Test assertions in other testing
+ frameworks.
+ * New feature: ability to run disabled test via
+ --gtest_also_run_disabled_tests.
+ * New feature: the --help flag for printing the usage.
+ * New feature: access to Google Test flag values in user code.
+ * New feature: a script that packs Google Test into one .h and one
+ .cc file for easy deployment.
+ * New feature: support for distributing test functions to multiple
+ machines (requires support from the test runner).
+ * Bug fixes and implementation clean-ups.
+
+Changes for 1.2.1:
+
+ * Compatibility fixes for Linux IA-64 and IBM z/OS.
+ * Added support for using Boost and other TR1 implementations.
+ * Changes to the build scripts to support upcoming release of Google C++
+ Mocking Framework.
+ * Added Makefile to the distribution package.
+ * Improved build instructions in README.
+
+Changes for 1.2.0:
+
+ * New feature: value-parameterized tests.
+ * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS)
+ macros.
+ * Changed the XML report format to match JUnit/Ant's.
+ * Added tests to the Xcode project.
+ * Added scons/SConscript for building with SCons.
+ * Added src/gtest-all.cc for building Google Test from a single file.
+ * Fixed compatibility with Solaris and z/OS.
+ * Enabled running Python tests on systems with python 2.3 installed,
+ e.g. Mac OS X 10.4.
+ * Bug fixes.
+
+Changes for 1.1.0:
+
+ * New feature: type-parameterized tests.
+ * New feature: exception assertions.
+ * New feature: printing elapsed time of tests.
+ * Improved the robustness of death tests.
+ * Added an Xcode project and samples.
+ * Adjusted the output format on Windows to be understandable by Visual Studio.
+ * Minor bug fixes.
+
+Changes for 1.0.1:
+
+ * Added project files for Visual Studio 7.1.
+ * Fixed issues with compiling on Mac OS X.
+ * Fixed issues with compiling on Cygwin.
+
+Changes for 1.0.0:
+
+ * Initial Open Source release of Google Test
diff --git a/third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt b/third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt
new file mode 100644
index 000000000..621d0f042
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/CMakeLists.txt
@@ -0,0 +1,286 @@
+########################################################################
+# CMake build script for Google Test.
+#
+# To run the tests for Google Test itself on Linux, use 'make test' or
+# ctest. You can select which tests to run using 'ctest -R regex'.
+# For more options, run 'ctest --help'.
+
+# BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to
+# make it prominent in the GUI.
+option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF)
+
+# When other libraries are using a shared version of runtime libraries,
+# Google Test also has to use one.
+option(
+ gtest_force_shared_crt
+ "Use shared (DLL) run-time lib even when Google Test is built as static lib."
+ OFF)
+
+option(gtest_build_tests "Build all of gtest's own tests." OFF)
+
+option(gtest_build_samples "Build gtest's sample programs." OFF)
+
+option(gtest_disable_pthreads "Disable uses of pthreads in gtest." OFF)
+
+option(
+ gtest_hide_internal_symbols
+ "Build gtest with internal symbols hidden in shared libraries."
+ OFF)
+
+# Defines pre_project_set_up_hermetic_build() and set_up_hermetic_build().
+include(cmake/hermetic_build.cmake OPTIONAL)
+
+if (COMMAND pre_project_set_up_hermetic_build)
+ pre_project_set_up_hermetic_build()
+endif()
+
+########################################################################
+#
+# Project-wide settings
+
+# Name of the project.
+#
+# CMake files in this project can refer to the root source directory
+# as ${gtest_SOURCE_DIR} and to the root binary directory as
+# ${gtest_BINARY_DIR}.
+# Language "C" is required for find_package(Threads).
+project(gtest CXX C)
+cmake_minimum_required(VERSION 2.6.2)
+
+if (COMMAND set_up_hermetic_build)
+ set_up_hermetic_build()
+endif()
+
+if (gtest_hide_internal_symbols)
+ set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+ set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
+endif()
+
+# Define helper functions and macros used by Google Test.
+include(cmake/internal_utils.cmake)
+
+config_compiler_and_linker() # Defined in internal_utils.cmake.
+
+# Where Google Test's .h files can be found.
+include_directories(
+ ${gtest_SOURCE_DIR}/include
+ ${gtest_SOURCE_DIR})
+
+# Where Google Test's libraries can be found.
+link_directories(${gtest_BINARY_DIR}/src)
+
+# Summary of tuple support for Microsoft Visual Studio:
+# Compiler version(MS) version(cmake) Support
+# ---------- ----------- -------------- -----------------------------
+# <= VS 2010 <= 10 <= 1600 Use Google Tests's own tuple.
+# VS 2012 11 1700 std::tr1::tuple + _VARIADIC_MAX=10
+# VS 2013 12 1800 std::tr1::tuple
+if (MSVC AND MSVC_VERSION EQUAL 1700)
+ add_definitions(/D _VARIADIC_MAX=10)
+endif()
+
+########################################################################
+#
+# Defines the gtest & gtest_main libraries. User tests should link
+# with one of them.
+
+# Google Test libraries. We build them using more strict warnings than what
+# are used for other targets, to ensure that gtest can be compiled by a user
+# aggressive about warnings.
+cxx_library(gtest "${cxx_strict}" src/gtest-all.cc)
+cxx_library(gtest_main "${cxx_strict}" src/gtest_main.cc)
+target_link_libraries(gtest_main gtest)
+
+# If the CMake version supports it, attach header directory information
+# to the targets for when we are part of a parent build (ie being pulled
+# in via add_subdirectory() rather than being a standalone build).
+if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+ target_include_directories(gtest INTERFACE "${gtest_SOURCE_DIR}/include")
+ target_include_directories(gtest_main INTERFACE "${gtest_SOURCE_DIR}/include")
+endif()
+
+########################################################################
+#
+# Install rules
+install(TARGETS gtest gtest_main
+ DESTINATION lib)
+install(DIRECTORY ${gtest_SOURCE_DIR}/include/gtest
+ DESTINATION include)
+
+########################################################################
+#
+# Samples on how to link user tests with gtest or gtest_main.
+#
+# They are not built by default. To build them, set the
+# gtest_build_samples option to ON. You can do it by running ccmake
+# or specifying the -Dgtest_build_samples=ON flag when running cmake.
+
+if (gtest_build_samples)
+ cxx_executable(sample1_unittest samples gtest_main samples/sample1.cc)
+ cxx_executable(sample2_unittest samples gtest_main samples/sample2.cc)
+ cxx_executable(sample3_unittest samples gtest_main)
+ cxx_executable(sample4_unittest samples gtest_main samples/sample4.cc)
+ cxx_executable(sample5_unittest samples gtest_main samples/sample1.cc)
+ cxx_executable(sample6_unittest samples gtest_main)
+ cxx_executable(sample7_unittest samples gtest_main)
+ cxx_executable(sample8_unittest samples gtest_main)
+ cxx_executable(sample9_unittest samples gtest)
+ cxx_executable(sample10_unittest samples gtest)
+endif()
+
+########################################################################
+#
+# Google Test's own tests.
+#
+# You can skip this section if you aren't interested in testing
+# Google Test itself.
+#
+# The tests are not built by default. To build them, set the
+# gtest_build_tests option to ON. You can do it by running ccmake
+# or specifying the -Dgtest_build_tests=ON flag when running cmake.
+
+if (gtest_build_tests)
+ # This must be set in the root directory for the tests to be run by
+ # 'make test' or ctest.
+ enable_testing()
+
+ ############################################################
+ # C++ tests built with standard compiler flags.
+
+ cxx_test(gtest-death-test_test gtest_main)
+ cxx_test(gtest_environment_test gtest)
+ cxx_test(gtest-filepath_test gtest_main)
+ cxx_test(gtest-linked_ptr_test gtest_main)
+ cxx_test(gtest-listener_test gtest_main)
+ cxx_test(gtest_main_unittest gtest_main)
+ cxx_test(gtest-message_test gtest_main)
+ cxx_test(gtest_no_test_unittest gtest)
+ cxx_test(gtest-options_test gtest_main)
+ cxx_test(gtest-param-test_test gtest
+ test/gtest-param-test2_test.cc)
+ cxx_test(gtest-port_test gtest_main)
+ cxx_test(gtest_pred_impl_unittest gtest_main)
+ cxx_test(gtest_premature_exit_test gtest
+ test/gtest_premature_exit_test.cc)
+ cxx_test(gtest-printers_test gtest_main)
+ cxx_test(gtest_prod_test gtest_main
+ test/production.cc)
+ cxx_test(gtest_repeat_test gtest)
+ cxx_test(gtest_sole_header_test gtest_main)
+ cxx_test(gtest_stress_test gtest)
+ cxx_test(gtest-test-part_test gtest_main)
+ cxx_test(gtest_throw_on_failure_ex_test gtest)
+ cxx_test(gtest-typed-test_test gtest_main
+ test/gtest-typed-test2_test.cc)
+ cxx_test(gtest_unittest gtest_main)
+ cxx_test(gtest-unittest-api_test gtest)
+
+ ############################################################
+ # C++ tests built with non-standard compiler flags.
+
+ # MSVC 7.1 does not support STL with exceptions disabled.
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310)
+ cxx_library(gtest_no_exception "${cxx_no_exception}"
+ src/gtest-all.cc)
+ cxx_library(gtest_main_no_exception "${cxx_no_exception}"
+ src/gtest-all.cc src/gtest_main.cc)
+ endif()
+ cxx_library(gtest_main_no_rtti "${cxx_no_rtti}"
+ src/gtest-all.cc src/gtest_main.cc)
+
+ cxx_test_with_flags(gtest-death-test_ex_nocatch_test
+ "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=0"
+ gtest test/gtest-death-test_ex_test.cc)
+ cxx_test_with_flags(gtest-death-test_ex_catch_test
+ "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=1"
+ gtest test/gtest-death-test_ex_test.cc)
+
+ cxx_test_with_flags(gtest_no_rtti_unittest "${cxx_no_rtti}"
+ gtest_main_no_rtti test/gtest_unittest.cc)
+
+ cxx_shared_library(gtest_dll "${cxx_default}"
+ src/gtest-all.cc src/gtest_main.cc)
+
+ cxx_executable_with_flags(gtest_dll_test_ "${cxx_default}"
+ gtest_dll test/gtest_all_test.cc)
+ set_target_properties(gtest_dll_test_
+ PROPERTIES
+ COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
+
+ if (NOT MSVC OR MSVC_VERSION LESS 1600) # 1600 is Visual Studio 2010.
+ # Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that
+ # conflict with our own definitions. Therefore using our own tuple does not
+ # work on those compilers.
+ cxx_library(gtest_main_use_own_tuple "${cxx_use_own_tuple}"
+ src/gtest-all.cc src/gtest_main.cc)
+
+ cxx_test_with_flags(gtest-tuple_test "${cxx_use_own_tuple}"
+ gtest_main_use_own_tuple test/gtest-tuple_test.cc)
+
+ cxx_test_with_flags(gtest_use_own_tuple_test "${cxx_use_own_tuple}"
+ gtest_main_use_own_tuple
+ test/gtest-param-test_test.cc test/gtest-param-test2_test.cc)
+ endif()
+
+ ############################################################
+ # Python tests.
+
+ cxx_executable(gtest_break_on_failure_unittest_ test gtest)
+ py_test(gtest_break_on_failure_unittest)
+
+ # Visual Studio .NET 2003 does not support STL with exceptions disabled.
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310) # 1310 is Visual Studio .NET 2003
+ cxx_executable_with_flags(
+ gtest_catch_exceptions_no_ex_test_
+ "${cxx_no_exception}"
+ gtest_main_no_exception
+ test/gtest_catch_exceptions_test_.cc)
+ endif()
+
+ cxx_executable_with_flags(
+ gtest_catch_exceptions_ex_test_
+ "${cxx_exception}"
+ gtest_main
+ test/gtest_catch_exceptions_test_.cc)
+ py_test(gtest_catch_exceptions_test)
+
+ cxx_executable(gtest_color_test_ test gtest)
+ py_test(gtest_color_test)
+
+ cxx_executable(gtest_env_var_test_ test gtest)
+ py_test(gtest_env_var_test)
+
+ cxx_executable(gtest_filter_unittest_ test gtest)
+ py_test(gtest_filter_unittest)
+
+ cxx_executable(gtest_help_test_ test gtest_main)
+ py_test(gtest_help_test)
+
+ cxx_executable(gtest_list_tests_unittest_ test gtest)
+ py_test(gtest_list_tests_unittest)
+
+ cxx_executable(gtest_output_test_ test gtest)
+ py_test(gtest_output_test)
+
+ cxx_executable(gtest_shuffle_test_ test gtest)
+ py_test(gtest_shuffle_test)
+
+ # MSVC 7.1 does not support STL with exceptions disabled.
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310)
+ cxx_executable(gtest_throw_on_failure_test_ test gtest_no_exception)
+ set_target_properties(gtest_throw_on_failure_test_
+ PROPERTIES
+ COMPILE_FLAGS "${cxx_no_exception}")
+ py_test(gtest_throw_on_failure_test)
+ endif()
+
+ cxx_executable(gtest_uninitialized_test_ test gtest)
+ py_test(gtest_uninitialized_test)
+
+ cxx_executable(gtest_xml_outfile1_test_ test gtest_main)
+ cxx_executable(gtest_xml_outfile2_test_ test gtest_main)
+ py_test(gtest_xml_outfiles_test)
+
+ cxx_executable(gtest_xml_output_unittest_ test gtest)
+ py_test(gtest_xml_output_unittest)
+endif()
diff --git a/third_party/aom/third_party/googletest/src/googletest/CONTRIBUTORS b/third_party/aom/third_party/googletest/src/googletest/CONTRIBUTORS
new file mode 100644
index 000000000..feae2fc04
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/CONTRIBUTORS
@@ -0,0 +1,37 @@
+# This file contains a list of people who've made non-trivial
+# contribution to the Google C++ Testing Framework project. People
+# who commit code to the project are encouraged to add their names
+# here. Please keep the list sorted by first names.
+
+Ajay Joshi <jaj@google.com>
+Balázs Dán <balazs.dan@gmail.com>
+Bharat Mediratta <bharat@menalto.com>
+Chandler Carruth <chandlerc@google.com>
+Chris Prince <cprince@google.com>
+Chris Taylor <taylorc@google.com>
+Dan Egnor <egnor@google.com>
+Eric Roman <eroman@chromium.org>
+Hady Zalek <hady.zalek@gmail.com>
+Jeffrey Yasskin <jyasskin@google.com>
+Jói Sigurðsson <joi@google.com>
+Keir Mierle <mierle@gmail.com>
+Keith Ray <keith.ray@gmail.com>
+Kenton Varda <kenton@google.com>
+Manuel Klimek <klimek@google.com>
+Markus Heule <markus.heule@gmail.com>
+Mika Raento <mikie@iki.fi>
+Miklós Fazekas <mfazekas@szemafor.com>
+Pasi Valminen <pasi.valminen@gmail.com>
+Patrick Hanna <phanna@google.com>
+Patrick Riley <pfr@google.com>
+Peter Kaminski <piotrk@google.com>
+Preston Jackson <preston.a.jackson@gmail.com>
+Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
+Russ Cox <rsc@google.com>
+Russ Rufer <russ@pentad.com>
+Sean Mcafee <eefacm@gmail.com>
+Sigurður Ásgeirsson <siggi@google.com>
+Tracy Bialik <tracy@pentad.com>
+Vadim Berman <vadimb@google.com>
+Vlad Losev <vladl@google.com>
+Zhanyong Wan <wan@google.com>
diff --git a/third_party/aom/third_party/googletest/src/googletest/LICENSE b/third_party/aom/third_party/googletest/src/googletest/LICENSE
new file mode 100644
index 000000000..1941a11f8
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/LICENSE
@@ -0,0 +1,28 @@
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/aom/third_party/googletest/src/googletest/README.md b/third_party/aom/third_party/googletest/src/googletest/README.md
new file mode 100644
index 000000000..edd440805
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/README.md
@@ -0,0 +1,280 @@
+
+### Generic Build Instructions ###
+
+#### Setup ####
+
+To build Google Test and your tests that use it, you need to tell your
+build system where to find its headers and source files. The exact
+way to do it depends on which build system you use, and is usually
+straightforward.
+
+#### Build ####
+
+Suppose you put Google Test in directory `${GTEST_DIR}`. To build it,
+create a library build target (or a project as called by Visual Studio
+and Xcode) to compile
+
+ ${GTEST_DIR}/src/gtest-all.cc
+
+with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}`
+in the normal header search path. Assuming a Linux-like system and gcc,
+something like the following will do:
+
+ g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \
+ -pthread -c ${GTEST_DIR}/src/gtest-all.cc
+ ar -rv libgtest.a gtest-all.o
+
+(We need `-pthread` as Google Test uses threads.)
+
+Next, you should compile your test source file with
+`${GTEST_DIR}/include` in the system header search path, and link it
+with gtest and any other necessary libraries:
+
+ g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \
+ -o your_test
+
+As an example, the make/ directory contains a Makefile that you can
+use to build Google Test on systems where GNU make is available
+(e.g. Linux, Mac OS X, and Cygwin). It doesn't try to build Google
+Test's own tests. Instead, it just builds the Google Test library and
+a sample test. You can use it as a starting point for your own build
+script.
+
+If the default settings are correct for your environment, the
+following commands should succeed:
+
+ cd ${GTEST_DIR}/make
+ make
+ ./sample1_unittest
+
+If you see errors, try to tweak the contents of `make/Makefile` to make
+them go away. There are instructions in `make/Makefile` on how to do
+it.
+
+### Using CMake ###
+
+Google Test comes with a CMake build script (
+[CMakeLists.txt](CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for
+cross-platform.). If you don't have CMake installed already, you can
+download it for free from <http://www.cmake.org/>.
+
+CMake works by generating native makefiles or build projects that can
+be used in the compiler environment of your choice. The typical
+workflow starts with:
+
+ mkdir mybuild # Create a directory to hold the build output.
+ cd mybuild
+ cmake ${GTEST_DIR} # Generate native build scripts.
+
+If you want to build Google Test's samples, you should replace the
+last command with
+
+ cmake -Dgtest_build_samples=ON ${GTEST_DIR}
+
+If you are on a \*nix system, you should now see a Makefile in the
+current directory. Just type 'make' to build gtest.
+
+If you use Windows and have Visual Studio installed, a `gtest.sln` file
+and several `.vcproj` files will be created. You can then build them
+using Visual Studio.
+
+On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated.
+
+### Legacy Build Scripts ###
+
+Before settling on CMake, we have been providing hand-maintained build
+projects/scripts for Visual Studio, Xcode, and Autotools. While we
+continue to provide them for convenience, they are not actively
+maintained any more. We highly recommend that you follow the
+instructions in the previous two sections to integrate Google Test
+with your existing build system.
+
+If you still need to use the legacy build scripts, here's how:
+
+The msvc\ folder contains two solutions with Visual C++ projects.
+Open the `gtest.sln` or `gtest-md.sln` file using Visual Studio, and you
+are ready to build Google Test the same way you build any Visual
+Studio project. Files that have names ending with -md use DLL
+versions of Microsoft runtime libraries (the /MD or the /MDd compiler
+option). Files without that suffix use static versions of the runtime
+libraries (the /MT or the /MTd option). Please note that one must use
+the same option to compile both gtest and the test code. If you use
+Visual Studio 2005 or above, we recommend the -md version as /MD is
+the default for new projects in these versions of Visual Studio.
+
+On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using
+Xcode. Build the "gtest" target. The universal binary framework will
+end up in your selected build directory (selected in the Xcode
+"Preferences..." -> "Building" pane and defaults to xcode/build).
+Alternatively, at the command line, enter:
+
+ xcodebuild
+
+This will build the "Release" configuration of gtest.framework in your
+default build location. See the "xcodebuild" man page for more
+information about building different configurations and building in
+different locations.
+
+If you wish to use the Google Test Xcode project with Xcode 4.x and
+above, you need to either:
+
+ * update the SDK configuration options in xcode/Config/General.xconfig.
+ Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If
+ you choose this route you lose the ability to target earlier versions
+ of MacOS X.
+ * Install an SDK for an earlier version. This doesn't appear to be
+ supported by Apple, but has been reported to work
+ (http://stackoverflow.com/questions/5378518).
+
+### Tweaking Google Test ###
+
+Google Test can be used in diverse environments. The default
+configuration may not work (or may not work well) out of the box in
+some environments. However, you can easily tweak Google Test by
+defining control macros on the compiler command line. Generally,
+these macros are named like `GTEST_XYZ` and you define them to either 1
+or 0 to enable or disable a certain feature.
+
+We list the most frequently used macros below. For a complete list,
+see file [include/gtest/internal/gtest-port.h](include/gtest/internal/gtest-port.h).
+
+### Choosing a TR1 Tuple Library ###
+
+Some Google Test features require the C++ Technical Report 1 (TR1)
+tuple library, which is not yet available with all compilers. The
+good news is that Google Test implements a subset of TR1 tuple that's
+enough for its own need, and will automatically use this when the
+compiler doesn't provide TR1 tuple.
+
+Usually you don't need to care about which tuple library Google Test
+uses. However, if your project already uses TR1 tuple, you need to
+tell Google Test to use the same TR1 tuple library the rest of your
+project uses, or the two tuple implementations will clash. To do
+that, add
+
+ -DGTEST_USE_OWN_TR1_TUPLE=0
+
+to the compiler flags while compiling Google Test and your tests. If
+you want to force Google Test to use its own tuple library, just add
+
+ -DGTEST_USE_OWN_TR1_TUPLE=1
+
+to the compiler flags instead.
+
+If you don't want Google Test to use tuple at all, add
+
+ -DGTEST_HAS_TR1_TUPLE=0
+
+and all features using tuple will be disabled.
+
+### Multi-threaded Tests ###
+
+Google Test is thread-safe where the pthread library is available.
+After `#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE`
+macro to see whether this is the case (yes if the macro is `#defined` to
+1, no if it's undefined.).
+
+If Google Test doesn't correctly detect whether pthread is available
+in your environment, you can force it with
+
+ -DGTEST_HAS_PTHREAD=1
+
+or
+
+ -DGTEST_HAS_PTHREAD=0
+
+When Google Test uses pthread, you may need to add flags to your
+compiler and/or linker to select the pthread library, or you'll get
+link errors. If you use the CMake script or the deprecated Autotools
+script, this is taken care of for you. If you use your own build
+script, you'll need to read your compiler and linker's manual to
+figure out what flags to add.
+
+### As a Shared Library (DLL) ###
+
+Google Test is compact, so most users can build and link it as a
+static library for the simplicity. You can choose to use Google Test
+as a shared library (known as a DLL on Windows) if you prefer.
+
+To compile *gtest* as a shared library, add
+
+ -DGTEST_CREATE_SHARED_LIBRARY=1
+
+to the compiler flags. You'll also need to tell the linker to produce
+a shared library instead - consult your linker's manual for how to do
+it.
+
+To compile your *tests* that use the gtest shared library, add
+
+ -DGTEST_LINKED_AS_SHARED_LIBRARY=1
+
+to the compiler flags.
+
+Note: while the above steps aren't technically necessary today when
+using some compilers (e.g. GCC), they may become necessary in the
+future, if we decide to improve the speed of loading the library (see
+<http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are
+recommended to always add the above flags when using Google Test as a
+shared library. Otherwise a future release of Google Test may break
+your build script.
+
+### Avoiding Macro Name Clashes ###
+
+In C++, macros don't obey namespaces. Therefore two libraries that
+both define a macro of the same name will clash if you `#include` both
+definitions. In case a Google Test macro clashes with another
+library, you can force Google Test to rename its macro to avoid the
+conflict.
+
+Specifically, if both Google Test and some other code define macro
+FOO, you can add
+
+ -DGTEST_DONT_DEFINE_FOO=1
+
+to the compiler flags to tell Google Test to change the macro's name
+from `FOO` to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`,
+or `TEST`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll
+need to write
+
+ GTEST_TEST(SomeTest, DoesThis) { ... }
+
+instead of
+
+ TEST(SomeTest, DoesThis) { ... }
+
+in order to define a test.
+
+## Developing Google Test ##
+
+This section discusses how to make your own changes to Google Test.
+
+### Testing Google Test Itself ###
+
+To make sure your changes work as intended and don't break existing
+functionality, you'll want to compile and run Google Test's own tests.
+For that you can use CMake:
+
+ mkdir mybuild
+ cd mybuild
+ cmake -Dgtest_build_tests=ON ${GTEST_DIR}
+
+Make sure you have Python installed, as some of Google Test's tests
+are written in Python. If the cmake command complains about not being
+able to find Python (`Could NOT find PythonInterp (missing:
+PYTHON_EXECUTABLE)`), try telling it explicitly where your Python
+executable can be found:
+
+ cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR}
+
+Next, you can build Google Test and all of its own tests. On \*nix,
+this is usually done by 'make'. To run the tests, do
+
+ make test
+
+All tests should pass.
+
+Normally you don't need to worry about regenerating the source files,
+unless you need to modify them. In that case, you should modify the
+corresponding .pump files instead and run the pump.py Python script to
+regenerate them. You can find pump.py in the [scripts/](scripts/) directory.
+Read the [Pump manual](docs/PumpManual.md) for how to use it.
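As a concrete companion to the Build section above (compile gtest-all.cc into libgtest.a, then link it with your test), a minimal test file might look like the sketch below. The file name and the checked expression are illustrative; TEST, EXPECT_EQ, ::testing::InitGoogleTest and RUN_ALL_TESTS are the standard Google Test entry points, and linking gtest_main instead supplies the main() for you.

    // your_test.cc -- minimal illustrative test, linked against libgtest.a.
    #include "gtest/gtest.h"

    // One trivial test case; EXPECT_EQ records a non-fatal failure on mismatch.
    TEST(SanityTest, BasicArithmetic) {
      EXPECT_EQ(4, 2 + 2);
    }

    int main(int argc, char **argv) {
      ::testing::InitGoogleTest(&argc, argv);   // parses --gtest_* flags
      return RUN_ALL_TESTS();
    }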
diff --git a/third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake b/third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake
new file mode 100644
index 000000000..777b91ed4
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/cmake/internal_utils.cmake
@@ -0,0 +1,254 @@
+# Defines functions and macros useful for building Google Test and
+# Google Mock.
+#
+# Note:
+#
+# - This file will be run twice when building Google Mock (once via
+# Google Test's CMakeLists.txt, and once via Google Mock's).
+# Therefore it shouldn't have any side effects other than defining
+# the functions and macros.
+#
+# - The functions/macros defined in this file may depend on Google
+# Test and Google Mock's option() definitions, and thus must be
+# called *after* the options have been defined.
+
+# Tweaks CMake's default compiler/linker settings to suit Google Test's needs.
+#
+# This must be a macro(), as inside a function string() can only
+# update variables in the function scope.
+macro(fix_default_compiler_settings_)
+ if (MSVC)
+ # For MSVC, CMake sets certain flags to defaults we want to override.
+ # This replacement code is taken from sample in the CMake Wiki at
+ # http://www.cmake.org/Wiki/CMake_FAQ#Dynamic_Replace.
+ foreach (flag_var
+ CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+ CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+ if (NOT BUILD_SHARED_LIBS AND NOT gtest_force_shared_crt)
+ # When Google Test is built as a shared library, it should also use
+ # shared runtime libraries. Otherwise, it may end up with multiple
+ # copies of runtime library data in different modules, resulting in
+ # hard-to-find crashes. When it is built as a static library, it is
+ # preferable to use CRT as static libraries, as we don't have to rely
+ # on CRT DLLs being available. CMake always defaults to using shared
+ # CRT libraries, so we override that default here.
+ string(REPLACE "/MD" "-MT" ${flag_var} "${${flag_var}}")
+ endif()
+
+ # We prefer more strict warning checking for building Google Test.
+ # Replaces /W3 with /W4 in defaults.
+ string(REPLACE "/W3" "/W4" ${flag_var} "${${flag_var}}")
+ endforeach()
+ endif()
+endmacro()
+
+# Defines the compiler/linker flags used to build Google Test and
+# Google Mock. You can tweak these definitions to suit your needs. A
+# variable's value is empty before it's explicitly assigned to.
+macro(config_compiler_and_linker)
+ if (NOT gtest_disable_pthreads)
+ # Defines CMAKE_USE_PTHREADS_INIT and CMAKE_THREAD_LIBS_INIT.
+ find_package(Threads)
+ endif()
+
+ fix_default_compiler_settings_()
+ if (MSVC)
+ # Newlines inside flags variables break CMake's NMake generator.
+ # TODO(vladl@google.com): Add -RTCs and -RTCu to debug builds.
+ set(cxx_base_flags "-GS -W4 -WX -wd4251 -wd4275 -nologo -J -Zi")
+ if (MSVC_VERSION LESS 1400) # 1400 is Visual Studio 2005
+ # Suppress spurious warnings MSVC 7.1 sometimes issues.
+ # Forcing value to bool.
+ set(cxx_base_flags "${cxx_base_flags} -wd4800")
+ # Copy constructor and assignment operator could not be generated.
+ set(cxx_base_flags "${cxx_base_flags} -wd4511 -wd4512")
+ # Compatibility warnings not applicable to Google Test.
+ # Resolved overload was found by argument-dependent lookup.
+ set(cxx_base_flags "${cxx_base_flags} -wd4675")
+ endif()
+ if (MSVC_VERSION LESS 1500) # 1500 is Visual Studio 2008
+ # Conditional expression is constant.
+ # When compiling with /W4, we get several instances of C4127
+ # (Conditional expression is constant). In our code, we disable that
+ # warning on a case-by-case basis. However, on Visual Studio 2005,
+ # the warning fires on std::list. Therefore on that compiler and earlier,
+ # we disable the warning project-wide.
+ set(cxx_base_flags "${cxx_base_flags} -wd4127")
+ endif()
+ if (NOT (MSVC_VERSION LESS 1700)) # 1700 is Visual Studio 2012.
+ # Suppress "unreachable code" warning on VS 2012 and later.
+ # http://stackoverflow.com/questions/3232669 explains the issue.
+ set(cxx_base_flags "${cxx_base_flags} -wd4702")
+ endif()
+ if (NOT (MSVC_VERSION GREATER 1900)) # 1900 is Visual Studio 2015
+ # BigObj required for tests.
+ set(cxx_base_flags "${cxx_base_flags} -bigobj")
+ endif()
+
+ set(cxx_base_flags "${cxx_base_flags} -D_UNICODE -DUNICODE -DWIN32 -D_WIN32")
+ set(cxx_base_flags "${cxx_base_flags} -DSTRICT -DWIN32_LEAN_AND_MEAN")
+ set(cxx_exception_flags "-EHsc -D_HAS_EXCEPTIONS=1")
+ set(cxx_no_exception_flags "-D_HAS_EXCEPTIONS=0")
+ set(cxx_no_rtti_flags "-GR-")
+ elseif (CMAKE_COMPILER_IS_GNUCXX)
+ set(cxx_base_flags "-Wall -Wshadow")
+ set(cxx_exception_flags "-fexceptions")
+ set(cxx_no_exception_flags "-fno-exceptions")
+ # Until version 4.3.2, GCC doesn't define a macro to indicate
+ # whether RTTI is enabled. Therefore we define GTEST_HAS_RTTI
+ # explicitly.
+ set(cxx_no_rtti_flags "-fno-rtti -DGTEST_HAS_RTTI=0")
+ set(cxx_strict_flags
+ "-Wextra -Wno-unused-parameter -Wno-missing-field-initializers")
+ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "SunPro")
+ set(cxx_exception_flags "-features=except")
+ # Sun Pro doesn't provide macros to indicate whether exceptions and
+ # RTTI are enabled, so we define GTEST_HAS_* explicitly.
+ set(cxx_no_exception_flags "-features=no%except -DGTEST_HAS_EXCEPTIONS=0")
+ set(cxx_no_rtti_flags "-features=no%rtti -DGTEST_HAS_RTTI=0")
+ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "VisualAge" OR
+ CMAKE_CXX_COMPILER_ID STREQUAL "XL")
+ # CMake 2.8 changes Visual Age's compiler ID to "XL".
+ set(cxx_exception_flags "-qeh")
+ set(cxx_no_exception_flags "-qnoeh")
+ # Until version 9.0, Visual Age doesn't define a macro to indicate
+ # whether RTTI is enabled. Therefore we define GTEST_HAS_RTTI
+ # explicitly.
+ set(cxx_no_rtti_flags "-qnortti -DGTEST_HAS_RTTI=0")
+ elseif (CMAKE_CXX_COMPILER_ID STREQUAL "HP")
+ set(cxx_base_flags "-AA -mt")
+ set(cxx_exception_flags "-DGTEST_HAS_EXCEPTIONS=1")
+ set(cxx_no_exception_flags "+noeh -DGTEST_HAS_EXCEPTIONS=0")
+    # RTTI cannot be disabled in the HP aCC compiler.
+ set(cxx_no_rtti_flags "")
+ endif()
+
+ if (CMAKE_USE_PTHREADS_INIT) # The pthreads library is available and allowed.
+ set(cxx_base_flags "${cxx_base_flags} -DGTEST_HAS_PTHREAD=1")
+ else()
+ set(cxx_base_flags "${cxx_base_flags} -DGTEST_HAS_PTHREAD=0")
+ endif()
+
+ # For building gtest's own tests and samples.
+ set(cxx_exception "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_exception_flags}")
+ set(cxx_no_exception
+ "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_no_exception_flags}")
+ set(cxx_default "${cxx_exception}")
+ set(cxx_no_rtti "${cxx_default} ${cxx_no_rtti_flags}")
+ set(cxx_use_own_tuple "${cxx_default} -DGTEST_USE_OWN_TR1_TUPLE=1")
+
+ # For building the gtest libraries.
+ set(cxx_strict "${cxx_default} ${cxx_strict_flags}")
+endmacro()
+
+# Defines the gtest & gtest_main libraries. User tests should link
+# with one of them.
+function(cxx_library_with_type name type cxx_flags)
+ # type can be either STATIC or SHARED to denote a static or shared library.
+ # ARGN refers to additional arguments after 'cxx_flags'.
+ add_library(${name} ${type} ${ARGN})
+ set_target_properties(${name}
+ PROPERTIES
+ COMPILE_FLAGS "${cxx_flags}")
+ if (BUILD_SHARED_LIBS OR type STREQUAL "SHARED")
+ set_target_properties(${name}
+ PROPERTIES
+ COMPILE_DEFINITIONS "GTEST_CREATE_SHARED_LIBRARY=1")
+ endif()
+ if (CMAKE_USE_PTHREADS_INIT)
+ target_link_libraries(${name} ${CMAKE_THREAD_LIBS_INIT})
+ endif()
+endfunction()
+
+########################################################################
+#
+# Helper functions for creating build targets.
+
+function(cxx_shared_library name cxx_flags)
+ cxx_library_with_type(${name} SHARED "${cxx_flags}" ${ARGN})
+endfunction()
+
+function(cxx_library name cxx_flags)
+ cxx_library_with_type(${name} "" "${cxx_flags}" ${ARGN})
+endfunction()
+
+# cxx_executable_with_flags(name cxx_flags libs srcs...)
+#
+# creates a named C++ executable that depends on the given libraries and
+# is built from the given source files with the given compiler flags.
+function(cxx_executable_with_flags name cxx_flags libs)
+ add_executable(${name} ${ARGN})
+ if (cxx_flags)
+ set_target_properties(${name}
+ PROPERTIES
+ COMPILE_FLAGS "${cxx_flags}")
+ endif()
+ if (BUILD_SHARED_LIBS)
+ set_target_properties(${name}
+ PROPERTIES
+ COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
+ endif()
+ # To support mixing linking in static and dynamic libraries, link each
+ # library in with an extra call to target_link_libraries.
+ foreach (lib "${libs}")
+ target_link_libraries(${name} ${lib})
+ endforeach()
+endfunction()
+
+# cxx_executable(name dir lib srcs...)
+#
+# creates a named target that depends on the given libs and is built
+# from the given source files. dir/name.cc is implicitly included in
+# the source file list.
+function(cxx_executable name dir libs)
+ cxx_executable_with_flags(
+ ${name} "${cxx_default}" "${libs}" "${dir}/${name}.cc" ${ARGN})
+endfunction()
+
+# Sets PYTHONINTERP_FOUND and PYTHON_EXECUTABLE.
+find_package(PythonInterp)
+
+# cxx_test_with_flags(name cxx_flags libs srcs...)
+#
+# creates a named C++ test that depends on the given libs and is built
+# from the given source files with the given compiler flags.
+function(cxx_test_with_flags name cxx_flags libs)
+ cxx_executable_with_flags(${name} "${cxx_flags}" "${libs}" ${ARGN})
+ add_test(${name} ${name})
+endfunction()
+
+# cxx_test(name libs srcs...)
+#
+# creates a named test target that depends on the given libs and is
+# built from the given source files. Unlike cxx_test_with_flags,
+# test/name.cc is already implicitly included in the source file list.
+function(cxx_test name libs)
+ cxx_test_with_flags("${name}" "${cxx_default}" "${libs}"
+ "test/${name}.cc" ${ARGN})
+endfunction()
+
+# py_test(name)
+#
+# creates a Python test with the given name whose main module is in
+# test/name.py. It does nothing if Python is not installed.
+function(py_test name)
+ # We are not supporting Python tests on Linux yet as they consider
+ # all Linux environments to be google3 and try to use google3 features.
+ if (PYTHONINTERP_FOUND)
+ # ${CMAKE_BINARY_DIR} is known at configuration time, so we can
+ # directly bind it from cmake. ${CTEST_CONFIGURATION_TYPE} is known
+ # only at ctest runtime (by calling ctest -c <Configuration>), so
+ # we have to escape $ to delay variable substitution here.
+ if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+ add_test(
+ NAME ${name}
+ COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+ --build_dir=${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>)
+ else (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+ add_test(
+ ${name}
+ ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+ --build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE})
+ endif (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+ endif()
+endfunction()
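+
+# A minimal usage sketch of the helpers defined above, roughly as a consuming
+# CMakeLists.txt might call them once config_compiler_and_linker() has set up
+# the cxx_* flag variables. The target and source names below are illustrative
+# assumptions, not something this file defines:
+#
+#   config_compiler_and_linker()
+#   cxx_library(gtest "${cxx_strict}" src/gtest-all.cc)
+#   cxx_library(gtest_main "${cxx_strict}" src/gtest_main.cc)
+#   target_link_libraries(gtest_main gtest)
+#   cxx_test(gtest_unittest gtest_main)   # builds and registers test/gtest_unittest.cc
+#   py_test(gtest_output_test)            # registers test/gtest_output_test.py if Python is found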
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
new file mode 100644
index 000000000..957a69c6a
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
@@ -0,0 +1,294 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for death tests. It is
+// #included by gtest.h so a user doesn't need to include this
+// directly.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+
+#include "gtest/internal/gtest-death-test-internal.h"
+
+namespace testing {
+
+// This flag controls the style of death tests. Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process. Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests. IMPORTANT: This is an internal utility. Using it may break the
+// implementation of death tests. User code MUST NOT use it.
+GTEST_API_ bool InDeathTestChild();
+
+} // namespace internal
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+// 1. It generates a warning if there is more than one active
+// thread. This is because it's safe to fork() or clone() only
+// when there is a single thread.
+//
+// 2. The parent process clone()s a sub-process and runs the death
+// test in it; the sub-process exits with code 0 at the end of the
+// death test, if it hasn't exited already.
+//
+// 3. The parent process waits for the sub-process to terminate.
+//
+// 4. The parent process checks the exit code and error message of
+// the sub-process.
+//
+// Examples:
+//
+// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+// for (int i = 0; i < 5; i++) {
+// EXPECT_DEATH(server.ProcessRequest(i),
+// "Invalid request .* in ProcessRequest()")
+// << "Failed to die on request " << i;
+// }
+//
+// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+// bool KilledBySIGHUP(int exit_code) {
+// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+// }
+//
+// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+// On POSIX-compliant systems (*nix), we use the <regex.h> library,
+// which uses the POSIX extended regex syntax.
+//
+// On other platforms (e.g. Windows), we only support a simple regex
+// syntax implemented as part of Google Test. This limited
+// implementation should be enough most of the time when writing
+// death tests, though it lacks many features you can find in PCRE
+// or POSIX extended regex syntax. For example, we don't support
+// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), or
+// repetition count ("x{5,7}"), among others.
+//
+// Below is the syntax that we do support. We chose it to be a
+// subset of both PCRE and POSIX extended regex, so it's easy to
+// learn wherever you come from. In the following: 'A' denotes a
+// literal character, period (.), or a single \\ escape sequence;
+// 'x' and 'y' denote regular expressions; 'm' and 'n' are for
+// natural numbers.
+//
+// c matches any literal character c
+// \\d matches any decimal digit
+// \\D matches any character that's not a decimal digit
+// \\f matches \f
+// \\n matches \n
+// \\r matches \r
+// \\s matches any ASCII whitespace, including \n
+// \\S matches any character that's not a whitespace
+// \\t matches \t
+// \\v matches \v
+// \\w matches any letter, _, or decimal digit
+// \\W matches any character that \\w doesn't match
+//   \\c matches any literal character c, which must be a punctuation character
+// . matches any single character except \n
+// A? matches 0 or 1 occurrences of A
+//   A* matches 0 or more occurrences of A
+//   A+ matches 1 or more occurrences of A
+// ^ matches the beginning of a string (not that of each line)
+// $ matches the end of a string (not that of each line)
+// xy matches x followed by y
+//
+// If you accidentally use PCRE or POSIX extended regex features
+// not implemented by us, you will get a run-time failure. In that
+// case, please try to rewrite your regular expression within the
+// above syntax.
+//
+// This implementation is *not* meant to be as highly tuned or robust
+// as a compiled regex library, but should perform well enough for a
+// death test, which already incurs significant overhead by launching
+// a child process.
+//
+// Known caveats:
+//
+// A "threadsafe" style death test obtains the path to the test
+// program from argv[0] and re-executes it in the sub-process. For
+// simplicity, the current implementation doesn't search the PATH
+// when launching the sub-process. This means that the user must
+// invoke the test program via a path that contains at least one
+// path separator (e.g. path/to/foo_test and
+// /absolute/path/to/bar_test are fine, but foo_test is not). This
+// is rarely a problem as people usually don't put the test binary
+// directory in PATH.
+//
+// TODO(wan@google.com): make thread-safe death tests search the PATH.
+
+// Asserts that a given statement causes the program to exit, with an
+// integer exit status that satisfies predicate, and emitting error output
+// that matches regex.
+# define ASSERT_EXIT(statement, predicate, regex) \
+ GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+
+// Like ASSERT_EXIT, but continues on to successive tests in the
+// test case, if any:
+# define EXPECT_EXIT(statement, predicate, regex) \
+ GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+
+// Asserts that a given statement causes the program to exit, either by
+// explicitly exiting with a nonzero exit code or being killed by a
+// signal, and emitting error output that matches regex.
+# define ASSERT_DEATH(statement, regex) \
+ ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Like ASSERT_DEATH, but continues on to successive tests in the
+// test case, if any:
+# define EXPECT_DEATH(statement, regex) \
+ EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+
+// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
+
+// Tests that an exit code describes a normal exit with a given exit code.
+class GTEST_API_ ExitedWithCode {
+ public:
+ explicit ExitedWithCode(int exit_code);
+ bool operator()(int exit_status) const;
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ExitedWithCode& other);
+
+ const int exit_code_;
+};
+
+# if !GTEST_OS_WINDOWS
+// Tests that an exit code describes an exit due to termination by a
+// given signal.
+class GTEST_API_ KilledBySignal {
+ public:
+ explicit KilledBySignal(int signum);
+ bool operator()(int exit_status) const;
+ private:
+ const int signum_;
+};
+# endif // !GTEST_OS_WINDOWS
+
+// EXPECT_DEBUG_DEATH asserts that the given statement dies in debug mode.
+// The death testing framework causes this to have interesting semantics,
+// since the side effects of the call are only visible in opt mode, and not
+// in debug mode.
+//
+// In practice, this can be used to test functions that utilize the
+// LOG(DFATAL) macro using the following style:
+//
+// int DieInDebugOr12(int* sideeffect) {
+// if (sideeffect) {
+// *sideeffect = 12;
+// }
+// LOG(DFATAL) << "death";
+// return 12;
+// }
+//
+// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) {
+// int sideeffect = 0;
+// // Only asserts in dbg.
+// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death");
+//
+// #ifdef NDEBUG
+// // opt-mode has sideeffect visible.
+// EXPECT_EQ(12, sideeffect);
+// #else
+// // dbg-mode no visible sideeffect.
+// EXPECT_EQ(0, sideeffect);
+// #endif
+// }
+//
+// This will assert that DieInDebugOr12() crashes in debug
+// mode, usually due to a DCHECK or LOG(DFATAL), but returns the
+// appropriate fallback value (12 in this case) in opt mode. If you
+// need to test that a function has appropriate side-effects in opt
+// mode, include assertions against the side-effects. A general
+// pattern for this is:
+//
+// EXPECT_DEBUG_DEATH({
+// // Side-effects here will have an effect after this statement in
+// // opt mode, but none in debug mode.
+// EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
+// }, "death");
+//
+# ifdef NDEBUG
+
+# define EXPECT_DEBUG_DEATH(statement, regex) \
+ GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+# define ASSERT_DEBUG_DEATH(statement, regex) \
+ GTEST_EXECUTE_STATEMENT_(statement, regex)
+
+# else
+
+# define EXPECT_DEBUG_DEATH(statement, regex) \
+ EXPECT_DEATH(statement, regex)
+
+# define ASSERT_DEBUG_DEATH(statement, regex) \
+ ASSERT_DEATH(statement, regex)
+
+# endif // NDEBUG for EXPECT_DEBUG_DEATH
+#endif // GTEST_HAS_DEATH_TEST
+
+// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
+// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
+// death tests are supported; otherwise they just issue a warning. This is
+// useful when you are combining death test assertions with normal test
+// assertions in one test.
+#if GTEST_HAS_DEATH_TEST
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ EXPECT_DEATH(statement, regex)
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ ASSERT_DEATH(statement, regex)
+#else
+# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
+# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
+#endif
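+
+// A minimal, self-contained usage sketch of the *_IF_SUPPORTED macros above.
+// The helper function, its message, and the test names are illustrative
+// assumptions, not part of Google Test:
+//
+//   #include <cstdio>
+//   #include <cstdlib>
+//   #include "gtest/gtest.h"
+//
+//   static void CrashOnNegative(int n) {
+//     if (n < 0) {
+//       std::fprintf(stderr, "negative input\n");
+//       std::abort();
+//     }
+//   }
+//
+//   TEST(CrashTest, DiesOnNegativeInput) {
+//     // Runs as a real death test where death tests are supported;
+//     // otherwise it only issues a warning and succeeds.
+//     EXPECT_DEATH_IF_SUPPORTED(CrashOnNegative(-1), "negative input");
+//   }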
+
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h
new file mode 100644
index 000000000..fe879bca7
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-message.h
@@ -0,0 +1,250 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the Message class.
+//
+// IMPORTANT NOTE: Due to a limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
+// program!
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+
+#include <limits>
+
+#include "gtest/internal/gtest-port.h"
+
+// Ensures that there is at least one operator<< in the global namespace.
+// See Message& operator<<(...) below for why.
+void operator<<(const testing::internal::Secret&, int);
+
+namespace testing {
+
+// The Message class works like an ostream repeater.
+//
+// Typical usage:
+//
+// 1. You stream a bunch of values to a Message object.
+// It will remember the text in a stringstream.
+// 2. Then you stream the Message object to an ostream.
+// This causes the text in the Message to be streamed
+// to the ostream.
+//
+// For example,
+//
+// testing::Message foo;
+// foo << 1 << " != " << 2;
+// std::cout << foo;
+//
+// will print "1 != 2".
+//
+// Message is not intended to be inherited from. In particular, its
+// destructor is not virtual.
+//
+// Note that stringstream behaves differently in gcc and in MSVC. You
+// can stream a NULL char pointer to it in the former, but not in the
+// latter (it causes an access violation if you do). The Message
+// class hides this difference by treating a NULL char pointer as
+// "(null)".
+class GTEST_API_ Message {
+ private:
+ // The type of basic IO manipulators (endl, ends, and flush) for
+ // narrow streams.
+ typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&);
+
+ public:
+ // Constructs an empty Message.
+ Message();
+
+ // Copy constructor.
+ Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT
+ *ss_ << msg.GetString();
+ }
+
+ // Constructs a Message from a C-string.
+ explicit Message(const char* str) : ss_(new ::std::stringstream) {
+ *ss_ << str;
+ }
+
+#if GTEST_OS_SYMBIAN
+ // Streams a value (either a pointer or not) to this object.
+ template <typename T>
+ inline Message& operator <<(const T& value) {
+ StreamHelper(typename internal::is_pointer<T>::type(), value);
+ return *this;
+ }
+#else
+ // Streams a non-pointer value to this object.
+ template <typename T>
+ inline Message& operator <<(const T& val) {
+ // Some libraries overload << for STL containers. These
+ // overloads are defined in the global namespace instead of ::std.
+ //
+ // C++'s symbol lookup rule (i.e. Koenig lookup) says that these
+ // overloads are visible in either the std namespace or the global
+ // namespace, but not other namespaces, including the testing
+ // namespace which Google Test's Message class is in.
+ //
+    // To allow STL containers (and other types that have a << operator
+ // defined in the global namespace) to be used in Google Test
+ // assertions, testing::Message must access the custom << operator
+ // from the global namespace. With this using declaration,
+ // overloads of << defined in the global namespace and those
+ // visible via Koenig lookup are both exposed in this function.
+ using ::operator <<;
+ *ss_ << val;
+ return *this;
+ }
+
+ // Streams a pointer value to this object.
+ //
+ // This function is an overload of the previous one. When you
+ // stream a pointer to a Message, this definition will be used as it
+ // is more specialized. (The C++ Standard, section
+ // [temp.func.order].) If you stream a non-pointer, then the
+ // previous definition will be used.
+ //
+ // The reason for this overload is that streaming a NULL pointer to
+ // ostream is undefined behavior. Depending on the compiler, you
+ // may get "0", "(nil)", "(null)", or an access violation. To
+ // ensure consistent result across compilers, we always treat NULL
+ // as "(null)".
+ template <typename T>
+ inline Message& operator <<(T* const& pointer) { // NOLINT
+ if (pointer == NULL) {
+ *ss_ << "(null)";
+ } else {
+ *ss_ << pointer;
+ }
+ return *this;
+ }
+#endif // GTEST_OS_SYMBIAN
+
+ // Since the basic IO manipulators are overloaded for both narrow
+ // and wide streams, we have to provide this specialized definition
+ // of operator <<, even though its body is the same as the
+ // templatized version above. Without this definition, streaming
+ // endl or other basic IO manipulators to Message will confuse the
+ // compiler.
+ Message& operator <<(BasicNarrowIoManip val) {
+ *ss_ << val;
+ return *this;
+ }
+
+ // Instead of 1/0, we want to see true/false for bool values.
+ Message& operator <<(bool b) {
+ return *this << (b ? "true" : "false");
+ }
+
+ // These two overloads allow streaming a wide C string to a Message
+ // using the UTF-8 encoding.
+ Message& operator <<(const wchar_t* wide_c_str);
+ Message& operator <<(wchar_t* wide_c_str);
+
+#if GTEST_HAS_STD_WSTRING
+ // Converts the given wide string to a narrow string using the UTF-8
+ // encoding, and streams the result to this Message object.
+ Message& operator <<(const ::std::wstring& wstr);
+#endif // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+ // Converts the given wide string to a narrow string using the UTF-8
+ // encoding, and streams the result to this Message object.
+ Message& operator <<(const ::wstring& wstr);
+#endif // GTEST_HAS_GLOBAL_WSTRING
+
+ // Gets the text streamed to this object so far as an std::string.
+ // Each '\0' character in the buffer is replaced with "\\0".
+ //
+ // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+ std::string GetString() const;
+
+ private:
+
+#if GTEST_OS_SYMBIAN
+ // These are needed as the Nokia Symbian Compiler cannot decide between
+ // const T& and const T* in a function template. The Nokia compiler _can_
+ // decide between class template specializations for T and T*, so a
+ // tr1::type_traits-like is_pointer works, and we can overload on that.
+ template <typename T>
+ inline void StreamHelper(internal::true_type /*is_pointer*/, T* pointer) {
+ if (pointer == NULL) {
+ *ss_ << "(null)";
+ } else {
+ *ss_ << pointer;
+ }
+ }
+ template <typename T>
+ inline void StreamHelper(internal::false_type /*is_pointer*/,
+ const T& value) {
+ // See the comments in Message& operator <<(const T&) above for why
+ // we need this using statement.
+ using ::operator <<;
+ *ss_ << value;
+ }
+#endif // GTEST_OS_SYMBIAN
+
+ // We'll hold the text streamed to this object here.
+ const internal::scoped_ptr< ::std::stringstream> ss_;
+
+ // We declare (but don't implement) this to prevent the compiler
+ // from implementing the assignment operator.
+ void operator=(const Message&);
+};
+
+// Streams a Message to an ostream.
+inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+ return os << sb.GetString();
+}
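+
+// A short usage sketch of Message and the streaming operator above,
+// assuming <iostream>, <string>, and "gtest/gtest.h" are included;
+// the values and variable names are illustrative only:
+//
+//   testing::Message msg;
+//   msg << "value is " << 42 << ", flag is " << true;
+//   std::cout << msg;                    // prints "value is 42, flag is true"
+//   std::string text = msg.GetString();  // same text as a std::string
+//   const char* p = NULL;
+//   msg << p;                            // a NULL pointer is streamed as "(null)"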
+
+namespace internal {
+
+// Converts a streamable value to an std::string. A NULL pointer is
+// converted to "(null)". When the input value is a ::string,
+// ::std::string, ::wstring, or ::std::wstring object, each NUL
+// character in it is replaced with "\\0".
+template <typename T>
+std::string StreamableToString(const T& streamable) {
+ return (Message() << streamable).GetString();
+}
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
new file mode 100644
index 000000000..038f9ba79
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h
@@ -0,0 +1,1444 @@
+// This file was GENERATED by command:
+// pump.py gtest-param-test.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: vladl@google.com (Vlad Losev)
+//
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing Framework (Google Test)
+//
+// This file is generated by a SCRIPT. DO NOT EDIT BY HAND!
+//
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where T is the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+ // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+ // Inside a test, access the test parameter with the GetParam() method
+ // of the TestWithParam<T> class:
+ EXPECT_TRUE(foo.Blah(GetParam()));
+ ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+ ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a summary of them, which
+// are all in the testing namespace:
+//
+//
+//  Range(begin, end [, step]) - Yields values {begin, begin+step,
+//                               begin+step+step, ...}. The values do not
+//                               include end. step defaults to 1.
+//  Values(v1, v2, ..., vN)    - Yields values {v1, v2, ..., vN}.
+//  ValuesIn(container)        - Yields values from a C-style array, an STL
+//  ValuesIn(begin,end)          container, or an iterator range [begin, end).
+//  Bool()                     - Yields sequence {false, true}.
+//  Combine(g1, g2, ..., gN)   - Yields all combinations (the Cartesian product
+//                               for the math savvy) of the values generated
+//                               by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test case
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_CASE_P(InstantiationName,
+ FooTest,
+ Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern (yes, you
+// can instantiate it more than once), the first argument to the
+// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
+// actual test case name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+// * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+// * InstantiationName/FooTest.DoesBlah/1 for "miny"
+// * InstantiationName/FooTest.DoesBlah/2 for "moe"
+// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+// * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+// * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
+// in the given test case, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_CASE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user, on the one hand, to adjust generator parameters in
+// order to dynamically determine a set of tests to run and, on the other
+// hand, to inspect the generated tests with the Google Test reflection API
+// before RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+ // You can inherit all the usual members for a non-parameterized test
+ // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+ // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+ // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+ // GetParam works just the same here as if you inherit from TestWithParam.
+ EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif // 0
+
+#include "gtest/internal/gtest-port.h"
+
+#if !GTEST_OS_SYMBIAN
+# include <utility>
+#endif
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*. Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-param-util.h"
+#include "gtest/internal/gtest-param-util-generated.h"
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test case is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, tests from test case FooTest are each
+// instantiated three times with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+// - returns a generator producing a sequence of values {start, start+1,
+// start+2, ..., }.
+// Range(start, end, step)
+// - returns a generator producing a sequence of values {start, start+step,
+// start+step+step, ..., }.
+// Notes:
+// * The generated sequences never include end. For example, Range(1, 5)
+// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+// returns a generator producing {1, 3, 5, 7}.
+// * start and end must have the same type. That type may be any integral or
+// floating-point type or a user defined type satisfying these conditions:
+// * It must be assignable (have operator=() defined).
+// * It must have operator+() (operator+(int-compatible type) for
+// two-operand version).
+// * It must have operator<() defined.
+// Elements in the resulting sequences will also have that type.
+// * Condition start < end must be satisfied in order for resulting sequences
+// to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+ return internal::ParamGenerator<T>(
+ new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+ return Range(start, end, 1);
+}
+
+// The ValuesIn() function allows generation of tests with parameters coming
+// from a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+// - returns a generator producing sequences with elements from
+// a C-style array.
+// ValuesIn(const Container& container)
+// - returns a generator producing sequences with elements from
+// an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+// - returns a generator producing sequences with elements from
+// a range [begin, end) defined by a pair of STL-style iterators. These
+// iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test case StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test case StlStringTest
+// each with STL strings with values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+// ::std::vector< ::std::string> v;
+// v.push_back("a");
+// v.push_back("b");
+// return v;
+// }
+//
+// INSTANTIATE_TEST_CASE_P(CharSequence,
+// StlStringTest,
+// ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+// ::std::list<char> list;
+// list.push_back('a');
+// list.push_back('b');
+// return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_CASE_P(CharSequence2,
+// CharTest,
+// ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+ typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+ typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
+ ::value_type ParamType;
+ return internal::ParamGenerator<ParamType>(
+ new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+ return ValuesIn(array, array + N);
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+ const Container& container) {
+ return ValuesIn(container.begin(), container.end());
+}
+
+// Values() allows generating tests from an explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+// - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test case BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+//
+// This instantiates tests from test case BazTest each with values 1, 2, 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+// Currently, Values() supports from 1 to 50 parameters.
+//
+template <typename T1>
+internal::ValueArray1<T1> Values(T1 v1) {
+ return internal::ValueArray1<T1>(v1);
+}
+
+template <typename T1, typename T2>
+internal::ValueArray2<T1, T2> Values(T1 v1, T2 v2) {
+ return internal::ValueArray2<T1, T2>(v1, v2);
+}
+
+template <typename T1, typename T2, typename T3>
+internal::ValueArray3<T1, T2, T3> Values(T1 v1, T2 v2, T3 v3) {
+ return internal::ValueArray3<T1, T2, T3>(v1, v2, v3);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+internal::ValueArray4<T1, T2, T3, T4> Values(T1 v1, T2 v2, T3 v3, T4 v4) {
+ return internal::ValueArray4<T1, T2, T3, T4>(v1, v2, v3, v4);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+internal::ValueArray5<T1, T2, T3, T4, T5> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+ T5 v5) {
+ return internal::ValueArray5<T1, T2, T3, T4, T5>(v1, v2, v3, v4, v5);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+internal::ValueArray6<T1, T2, T3, T4, T5, T6> Values(T1 v1, T2 v2, T3 v3,
+ T4 v4, T5 v5, T6 v6) {
+ return internal::ValueArray6<T1, T2, T3, T4, T5, T6>(v1, v2, v3, v4, v5, v6);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7>
+internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7> Values(T1 v1, T2 v2, T3 v3,
+ T4 v4, T5 v5, T6 v6, T7 v7) {
+ return internal::ValueArray7<T1, T2, T3, T4, T5, T6, T7>(v1, v2, v3, v4, v5,
+ v6, v7);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8>
+internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8> Values(T1 v1, T2 v2,
+ T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) {
+ return internal::ValueArray8<T1, T2, T3, T4, T5, T6, T7, T8>(v1, v2, v3, v4,
+ v5, v6, v7, v8);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9>
+internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9> Values(T1 v1, T2 v2,
+ T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) {
+ return internal::ValueArray9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(v1, v2, v3,
+ v4, v5, v6, v7, v8, v9);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10>
+internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> Values(T1 v1,
+ T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) {
+ return internal::ValueArray10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>(v1,
+ v2, v3, v4, v5, v6, v7, v8, v9, v10);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11>
+internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+ T11> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11) {
+ return internal::ValueArray11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10,
+ T11>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12>
+internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12) {
+ return internal::ValueArray12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13>
+internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13) {
+ return internal::ValueArray13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14>
+internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) {
+ return internal::ValueArray14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+ v14);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15>
+internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+ T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) {
+ return internal::ValueArray15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16>
+internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+ T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16) {
+ return internal::ValueArray16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15, v16);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17>
+internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+ T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16, T17 v17) {
+ return internal::ValueArray17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+ v11, v12, v13, v14, v15, v16, v17);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18>
+internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+ T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16, T17 v17, T18 v18) {
+ return internal::ValueArray18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15, v16, v17, v18);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19>
+internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+ T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+ T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) {
+ return internal::ValueArray19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19>(v1, v2, v3, v4, v5, v6, v7, v8,
+ v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20>
+internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+ T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+ T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) {
+ return internal::ValueArray20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20>(v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21>
+internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+ T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+ T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) {
+ return internal::ValueArray21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21>(v1, v2, v3, v4, v5, v6,
+ v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22>
+internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22> Values(T1 v1, T2 v2, T3 v3,
+ T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+ T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+ T21 v21, T22 v22) {
+ return internal::ValueArray22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22>(v1, v2, v3, v4,
+ v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+ v20, v21, v22);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23>
+internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> Values(T1 v1, T2 v2,
+ T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+ T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+ T21 v21, T22 v22, T23 v23) {
+ return internal::ValueArray23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23>(v1, v2, v3,
+ v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+ v20, v21, v22, v23);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24>
+internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Values(T1 v1, T2 v2,
+ T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+ T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+ T21 v21, T22 v22, T23 v23, T24 v24) {
+ return internal::ValueArray24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24>(v1, v2,
+ v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+ v19, v20, v21, v22, v23, v24);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25>
+internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Values(T1 v1,
+ T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+ T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+ T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) {
+ return internal::ValueArray25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25>(v1,
+ v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+ v18, v19, v20, v21, v22, v23, v24, v25);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26>
+internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26) {
+ return internal::ValueArray26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27>
+internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27) {
+ return internal::ValueArray27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+ v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28>
+internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28) {
+ return internal::ValueArray28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+ v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+ v28);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29>
+internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29) {
+ return internal::ValueArray29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+ v27, v28, v29);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30>
+internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+ T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+ T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+ T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) {
+ return internal::ValueArray30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+ v26, v27, v28, v29, v30);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31>
+internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+ T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+ T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) {
+ return internal::ValueArray31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+ v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+ v25, v26, v27, v28, v29, v30, v31);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32>
+internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+ T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+ T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+ T32 v32) {
+ return internal::ValueArray32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+ v24, v25, v26, v27, v28, v29, v30, v31, v32);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33>
+internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+ T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+ T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+ T32 v32, T33 v33) {
+ return internal::ValueArray33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33>(v1, v2, v3, v4, v5, v6, v7, v8,
+ v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+ v24, v25, v26, v27, v28, v29, v30, v31, v32, v33);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34>
+internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+ T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+ T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+ T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+ T31 v31, T32 v32, T33 v33, T34 v34) {
+ return internal::ValueArray34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34>(v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+ v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35>
+internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+ T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+ T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+ T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+ T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) {
+ return internal::ValueArray35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35>(v1, v2, v3, v4, v5, v6,
+ v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+ v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36>
+internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+ T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+ T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+ T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+ T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) {
+ return internal::ValueArray36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36>(v1, v2, v3, v4,
+ v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+ v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+ v34, v35, v36);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37>
+internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37> Values(T1 v1, T2 v2, T3 v3,
+ T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+ T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+ T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+ T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+ T37 v37) {
+ return internal::ValueArray37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37>(v1, v2, v3,
+ v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+ v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+ v34, v35, v36, v37);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38>
+internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Values(T1 v1, T2 v2,
+ T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+ T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+ T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+ T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+ T37 v37, T38 v38) {
+ return internal::ValueArray38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38>(v1, v2,
+ v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18,
+ v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32,
+ v33, v34, v35, v36, v37, v38);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39>
+internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Values(T1 v1, T2 v2,
+ T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12,
+ T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20,
+ T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28,
+ T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36,
+ T37 v37, T38 v38, T39 v39) {
+ return internal::ValueArray39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39>(v1,
+ v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17,
+ v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+ v32, v33, v34, v35, v36, v37, v38, v39);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40>
+internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Values(T1 v1,
+ T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11,
+ T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19,
+ T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27,
+ T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35,
+ T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) {
+ return internal::ValueArray40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29,
+ v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41>
+internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) {
+ return internal::ValueArray41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14,
+ v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28,
+ v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42>
+internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42) {
+ return internal::ValueArray42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+ v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27,
+ v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41,
+ v42);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43>
+internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43) {
+ return internal::ValueArray43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42, T43>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+ v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26,
+ v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40,
+ v41, v42, v43);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44>
+internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43, T44 v44) {
+ return internal::ValueArray44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42, T43, T44>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+ v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25,
+ v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39,
+ v40, v41, v42, v43, v44);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45>
+internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+ T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+ T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+ T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+ T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+ T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) {
+ return internal::ValueArray45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42, T43, T44, T45>(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10,
+ v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24,
+ v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38,
+ v39, v40, v41, v42, v43, v44, v45);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46>
+internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+ T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+ T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+ T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+ T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) {
+ return internal::ValueArray46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42, T43, T44, T45, T46>(v1, v2, v3, v4, v5, v6, v7, v8, v9,
+ v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+ v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+ v38, v39, v40, v41, v42, v43, v44, v45, v46);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47>
+internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+ T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+ T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+ T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+ T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) {
+ return internal::ValueArray47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42, T43, T44, T45, T46, T47>(v1, v2, v3, v4, v5, v6, v7, v8,
+ v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23,
+ v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37,
+ v38, v39, v40, v41, v42, v43, v44, v45, v46, v47);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48>
+internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47, T48> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6,
+ T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15,
+ T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23,
+ T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31,
+ T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39,
+ T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47,
+ T48 v48) {
+ return internal::ValueArray48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42, T43, T44, T45, T46, T47, T48>(v1, v2, v3, v4, v5, v6, v7,
+ v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22,
+ v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36,
+ v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48, typename T49>
+internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47, T48, T49> Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5,
+ T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14,
+ T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22,
+ T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30,
+ T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38,
+ T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46,
+ T47 v47, T48 v48, T49 v49) {
+ return internal::ValueArray49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42, T43, T44, T45, T46, T47, T48, T49>(v1, v2, v3, v4, v5, v6,
+ v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21,
+ v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35,
+ v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48, typename T49, typename T50>
+internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47, T48, T49, T50> Values(T1 v1, T2 v2, T3 v3, T4 v4,
+ T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13,
+ T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21,
+ T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29,
+ T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37,
+ T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45,
+ T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) {
+ return internal::ValueArray50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26, T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40, T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>(v1, v2, v3, v4,
+ v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19,
+ v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33,
+ v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+ v48, v49, v50);
+}
+
+// Bool() allows generating tests with parameters in a set of (false, true).
+//
+// Synopsis:
+// Bool()
+// - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// the Combine() function.
+//
+// In the following example all tests in the test case FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+// virtual void SetUp() {
+// external_flag = GetParam();
+// }
+// };
+// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() {
+ return Values(false, true);
+}
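+
+// A minimal usage sketch (illustrative only, not part of the original
+// header; ExternalFlagTest and CodeUnderTest are hypothetical names): each
+// TEST_P in the fixture runs once with GetParam() == false and once with
+// GetParam() == true.
+//
+//   class ExternalFlagTest : public testing::TestWithParam<bool> {};
+//
+//   TEST_P(ExternalFlagTest, HandlesFlag) {
+//     EXPECT_NO_FATAL_FAILURE(CodeUnderTest(GetParam()));
+//   }
+//
+//   INSTANTIATE_TEST_CASE_P(FlagValues, ExternalFlagTest, Bool());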
+
+# if GTEST_HAS_COMBINE
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+// - returns a generator producing sequences with elements coming from
+// the Cartesian product of elements from the sequences generated by
+// gen1, gen2, ..., genN. The sequence elements will have a type of
+// tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+//       of elements from the sequences produced by gen1, gen2, ..., genN.
+//
+// Combine can have up to 10 arguments. This number is currently limited
+// by the maximum number of elements in the tuple implementation used by Google
+// Test.
+//
+// Example:
+//
+// This will instantiate the tests in test case AnimalTest, once for each of
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+// : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
+// Combine(Values("cat", "dog"),
+// Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+// : public testing::TestWithParam<tuple<bool, bool> > {
+// virtual void SetUp() {
+// // Assigns external_flag_1 and external_flag_2 values from the tuple.
+// tie(external_flag_1, external_flag_2) = GetParam();
+// }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+// // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
+// Combine(Bool(), Bool()));
+//
+template <typename Generator1, typename Generator2>
+internal::CartesianProductHolder2<Generator1, Generator2> Combine(
+ const Generator1& g1, const Generator2& g2) {
+ return internal::CartesianProductHolder2<Generator1, Generator2>(
+ g1, g2);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3>
+internal::CartesianProductHolder3<Generator1, Generator2, Generator3> Combine(
+ const Generator1& g1, const Generator2& g2, const Generator3& g3) {
+ return internal::CartesianProductHolder3<Generator1, Generator2, Generator3>(
+ g1, g2, g3);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+ typename Generator4>
+internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+ Generator4> Combine(
+ const Generator1& g1, const Generator2& g2, const Generator3& g3,
+ const Generator4& g4) {
+ return internal::CartesianProductHolder4<Generator1, Generator2, Generator3,
+ Generator4>(
+ g1, g2, g3, g4);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+ typename Generator4, typename Generator5>
+internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+ Generator4, Generator5> Combine(
+ const Generator1& g1, const Generator2& g2, const Generator3& g3,
+ const Generator4& g4, const Generator5& g5) {
+ return internal::CartesianProductHolder5<Generator1, Generator2, Generator3,
+ Generator4, Generator5>(
+ g1, g2, g3, g4, g5);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+ typename Generator4, typename Generator5, typename Generator6>
+internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6> Combine(
+ const Generator1& g1, const Generator2& g2, const Generator3& g3,
+ const Generator4& g4, const Generator5& g5, const Generator6& g6) {
+ return internal::CartesianProductHolder6<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6>(
+ g1, g2, g3, g4, g5, g6);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+ typename Generator4, typename Generator5, typename Generator6,
+ typename Generator7>
+internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6, Generator7> Combine(
+ const Generator1& g1, const Generator2& g2, const Generator3& g3,
+ const Generator4& g4, const Generator5& g5, const Generator6& g6,
+ const Generator7& g7) {
+ return internal::CartesianProductHolder7<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6, Generator7>(
+ g1, g2, g3, g4, g5, g6, g7);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+ typename Generator4, typename Generator5, typename Generator6,
+ typename Generator7, typename Generator8>
+internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6, Generator7, Generator8> Combine(
+ const Generator1& g1, const Generator2& g2, const Generator3& g3,
+ const Generator4& g4, const Generator5& g5, const Generator6& g6,
+ const Generator7& g7, const Generator8& g8) {
+ return internal::CartesianProductHolder8<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6, Generator7, Generator8>(
+ g1, g2, g3, g4, g5, g6, g7, g8);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+ typename Generator4, typename Generator5, typename Generator6,
+ typename Generator7, typename Generator8, typename Generator9>
+internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6, Generator7, Generator8,
+ Generator9> Combine(
+ const Generator1& g1, const Generator2& g2, const Generator3& g3,
+ const Generator4& g4, const Generator5& g5, const Generator6& g6,
+ const Generator7& g7, const Generator8& g8, const Generator9& g9) {
+ return internal::CartesianProductHolder9<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6, Generator7, Generator8, Generator9>(
+ g1, g2, g3, g4, g5, g6, g7, g8, g9);
+}
+
+template <typename Generator1, typename Generator2, typename Generator3,
+ typename Generator4, typename Generator5, typename Generator6,
+ typename Generator7, typename Generator8, typename Generator9,
+ typename Generator10>
+internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+ Generator10> Combine(
+ const Generator1& g1, const Generator2& g2, const Generator3& g3,
+ const Generator4& g4, const Generator5& g5, const Generator6& g6,
+ const Generator7& g7, const Generator8& g8, const Generator9& g9,
+ const Generator10& g10) {
+ return internal::CartesianProductHolder10<Generator1, Generator2, Generator3,
+ Generator4, Generator5, Generator6, Generator7, Generator8, Generator9,
+ Generator10>(
+ g1, g2, g3, g4, g5, g6, g7, g8, g9, g10);
+}
+# endif // GTEST_HAS_COMBINE
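+
+// Illustrative sketch (not part of the original header; SizeFlagTest is a
+// hypothetical fixture name): a TEST_P body can unpack the tuple produced
+// by Combine() with tie(), as in the FlagDependentTest example above, or
+// read GetParam() directly.
+//
+//   class SizeFlagTest
+//       : public testing::TestWithParam<tuple<int, bool> > {};
+//
+//   TEST_P(SizeFlagTest, Works) {
+//     int size;
+//     bool flag;
+//     tie(size, flag) = GetParam();
+//     // ... exercise the code under test with size and flag ...
+//   }
+//
+//   INSTANTIATE_TEST_CASE_P(SizesAndFlags, SizeFlagTest,
+//                           Combine(Values(1, 2, 4), Bool()));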
+
+
+
+# define TEST_P(test_case_name, test_name) \
+ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+ : public test_case_name { \
+ public: \
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
+ virtual void TestBody(); \
+ private: \
+ static int AddToRegistry() { \
+ ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+ GetTestCasePatternHolder<test_case_name>(\
+ #test_case_name, \
+ ::testing::internal::CodeLocation(\
+ __FILE__, __LINE__))->AddTestPattern(\
+ #test_case_name, \
+ #test_name, \
+ new ::testing::internal::TestMetaFactory< \
+ GTEST_TEST_CLASS_NAME_(\
+ test_case_name, test_name)>()); \
+ return 0; \
+ } \
+ static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
+ }; \
+ int GTEST_TEST_CLASS_NAME_(test_case_name, \
+ test_name)::gtest_registering_dummy_ = \
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
+ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
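+
+// Explanatory note (not in the original source): TEST_P above defines a
+// class derived from the fixture test_case_name and, through the static
+// gtest_registering_dummy_ initializer, registers a TestMetaFactory for it
+// in the parameterized-test registry. No tests exist at that point; they
+// are generated later, when INSTANTIATE_TEST_CASE_P (below) supplies a
+// generator and an instantiation prefix for the same test case.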
+
+// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user
+// to specify a function or functor that generates custom test name suffixes
+// based on the test parameters. The function should accept one argument of
+// type testing::TestParamInfo<class ParamType>, and return std::string.
+//
+// testing::PrintToStringParamName is a builtin test suffix generator that
+// returns the value of testing::PrintToString(GetParam()). It does not work
+// for std::string or C strings.
+//
+// Note: test names must be non-empty, unique, and may only contain ASCII
+// alphanumeric characters or underscores.
+
+# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \
+ ::testing::internal::ParamGenerator<test_case_name::ParamType> \
+ gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
+ ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \
+ const ::testing::TestParamInfo<test_case_name::ParamType>& info) { \
+ return ::testing::internal::GetParamNameGen<test_case_name::ParamType> \
+ (__VA_ARGS__)(info); \
+ } \
+ int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+ GetTestCasePatternHolder<test_case_name>(\
+ #test_case_name, \
+ ::testing::internal::CodeLocation(\
+ __FILE__, __LINE__))->AddTestCaseInstantiation(\
+ #prefix, \
+ &gtest_##prefix##test_case_name##_EvalGenerator_, \
+ &gtest_##prefix##test_case_name##_EvalGenerateName_, \
+ __FILE__, __LINE__)
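+
+// Illustrative sketch of the optional last argument described above (not in
+// the original header; MyIntTest and IntParamName are hypothetical names):
+//
+//   class MyIntTest : public testing::TestWithParam<int> {};
+//
+//   struct IntParamName {
+//     std::string operator()(
+//         const ::testing::TestParamInfo<int>& info) const {
+//       return "Value" + ::testing::PrintToString(info.param);
+//     }
+//   };
+//
+//   INSTANTIATE_TEST_CASE_P(Named, MyIntTest, ::testing::Values(1, 2, 3),
+//                           IntParamName());
+//
+// The resulting tests are then named Named/MyIntTest.<TestName>/Value1,
+// .../Value2, and .../Value3 instead of ending in a numeric index.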
+
+} // namespace testing
+
+#endif // GTEST_HAS_PARAM_TEST
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h.pump b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h.pump
new file mode 100644
index 000000000..3078d6d2a
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h.pump
@@ -0,0 +1,510 @@
+$$ -*- mode: c++; -*-
+$var n = 50 $$ Maximum length of Values arguments we want to support.
+$var maxtuple = 10 $$ Maximum number of Combine arguments we want to support.
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: vladl@google.com (Vlad Losev)
+//
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing Framework (Google Test)
+//
+// This file is generated by a SCRIPT. DO NOT EDIT BY HAND!
+//
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where T is the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+ // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+ // Inside a test, access the test parameter with the GetParam() method
+ // of the TestWithParam<T> class:
+ EXPECT_TRUE(foo.Blah(GetParam()));
+ ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+ ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a summary of them, which
+// are all in the testing namespace:
+//
+//
+// Range(begin, end [, step]) - Yields values {begin, begin+step,
+// begin+step+step, ...}. The values do not
+// include end. step defaults to 1.
+// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}.
+// ValuesIn(container) - Yields values from a C-style array, an STL
+// ValuesIn(begin,end) container, or an iterator range [begin, end).
+// Bool() - Yields sequence {false, true}.
+// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product
+// for the math savvy) of the values generated
+// by the N generators.
+//
+// For more details, see comments at the definitions of these functions below
+// in this file.
+//
+// The following statement will instantiate tests from the FooTest test case
+// each with parameter values "meeny", "miny", and "moe".
+
+INSTANTIATE_TEST_CASE_P(InstantiationName,
+ FooTest,
+ Values("meeny", "miny", "moe"));
+
+// To distinguish different instances of the pattern (yes, you
+// can instantiate it more than once), the first argument to the
+// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the
+// actual test case name. Remember to pick unique prefixes for different
+// instantiations. The tests from the instantiation above will have
+// these names:
+//
+// * InstantiationName/FooTest.DoesBlah/0 for "meeny"
+// * InstantiationName/FooTest.DoesBlah/1 for "miny"
+// * InstantiationName/FooTest.DoesBlah/2 for "moe"
+// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny"
+// * InstantiationName/FooTest.HasBlahBlah/1 for "miny"
+// * InstantiationName/FooTest.HasBlahBlah/2 for "moe"
+//
+// You can use these names in --gtest_filter.
+//
+// This statement will instantiate all tests from FooTest again, each
+// with parameter values "cat" and "dog":
+
+const char* pets[] = {"cat", "dog"};
+INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets));
+
+// The tests from the instantiation above will have these names:
+//
+// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat"
+// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog"
+// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat"
+// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog"
+//
+// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests
+// in the given test case, whether their definitions come before or
+// AFTER the INSTANTIATE_TEST_CASE_P statement.
+//
+// Please also note that generator expressions (including parameters to the
+// generators) are evaluated in InitGoogleTest(), after main() has started.
+// This allows the user, on one hand, to adjust generator parameters in order
+// to dynamically determine a set of tests to run and, on the other hand, to
+// inspect the generated tests with the Google Test reflection API before
+// RUN_ALL_TESTS() is executed.
+//
+// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc
+// for more examples.
+//
+// In the future, we plan to publish the API for defining new parameter
+// generators. But for now this interface remains part of the internal
+// implementation and is subject to change.
+//
+//
+// A parameterized test fixture must be derived from testing::Test and from
+// testing::WithParamInterface<T>, where T is the type of the parameter
+// values. Inheriting from TestWithParam<T> satisfies that requirement because
+// TestWithParam<T> inherits from both Test and WithParamInterface. In more
+// complicated hierarchies, however, it is occasionally useful to inherit
+// separately from Test and WithParamInterface. For example:
+
+class BaseTest : public ::testing::Test {
+ // You can inherit all the usual members for a non-parameterized test
+ // fixture here.
+};
+
+class DerivedTest : public BaseTest, public ::testing::WithParamInterface<int> {
+ // The usual test fixture members go here too.
+};
+
+TEST_F(BaseTest, HasFoo) {
+ // This is an ordinary non-parameterized test.
+}
+
+TEST_P(DerivedTest, DoesBlah) {
+ // GetParam works just the same here as if you inherit from TestWithParam.
+ EXPECT_TRUE(foo.Blah(GetParam()));
+}
+
+#endif // 0
+
+#include "gtest/internal/gtest-port.h"
+
+#if !GTEST_OS_SYMBIAN
+# include <utility>
+#endif
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*. Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-param-util.h"
+#include "gtest/internal/gtest-param-util-generated.h"
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Functions producing parameter generators.
+//
+// Google Test uses these generators to produce parameters for value-
+// parameterized tests. When a parameterized test case is instantiated
+// with a particular generator, Google Test creates and runs tests
+// for each element in the sequence produced by the generator.
+//
+// In the following sample, each test from test case FooTest is instantiated
+// three times, with parameter values 3, 5, and 8:
+//
+// class FooTest : public TestWithParam<int> { ... };
+//
+// TEST_P(FooTest, TestThis) {
+// }
+// TEST_P(FooTest, TestThat) {
+// }
+// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8));
+//
+
+// Range() returns generators providing sequences of values in a range.
+//
+// Synopsis:
+// Range(start, end)
+// - returns a generator producing a sequence of values {start, start+1,
+// start+2, ...}.
+// Range(start, end, step)
+// - returns a generator producing a sequence of values {start, start+step,
+// start+step+step, ...}.
+// Notes:
+// * The generated sequences never include end. For example, Range(1, 5)
+// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2)
+// returns a generator producing {1, 3, 5, 7}.
+// * start and end must have the same type. That type may be any integral or
+// floating-point type or a user-defined type satisfying these conditions:
+// * It must be assignable (have operator=() defined).
+// * It must have operator+() (operator+(int-compatible type) for
+// the two-operand version).
+// * It must have operator<() defined.
+// Elements in the resulting sequences will also have that type.
+// * Condition start < end must be satisfied in order for resulting sequences
+// to contain any elements.
+//
+template <typename T, typename IncrementT>
+internal::ParamGenerator<T> Range(T start, T end, IncrementT step) {
+ return internal::ParamGenerator<T>(
+ new internal::RangeGenerator<T, IncrementT>(start, end, step));
+}
+
+template <typename T>
+internal::ParamGenerator<T> Range(T start, T end) {
+ return Range(start, end, 1);
+}
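+
+// As an illustrative sketch (EvenNumberTest below is a hypothetical fixture,
+// not declared in this header), Range() is typically passed straight to
+// INSTANTIATE_TEST_CASE_P:
+//
+//   class EvenNumberTest : public ::testing::TestWithParam<int> {};
+//
+//   TEST_P(EvenNumberTest, IsEven) {
+//     EXPECT_EQ(0, GetParam() % 2);
+//   }
+//
+//   // Instantiates IsEven with the values 0, 2, 4, 6, 8 (10 is excluded).
+//   INSTANTIATE_TEST_CASE_P(FirstEvens, EvenNumberTest,
+//                           ::testing::Range(0, 10, 2));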
+
+// ValuesIn() function allows generation of tests with parameters coming from
+// a container.
+//
+// Synopsis:
+// ValuesIn(const T (&array)[N])
+// - returns a generator producing sequences with elements from
+// a C-style array.
+// ValuesIn(const Container& container)
+// - returns a generator producing sequences with elements from
+// an STL-style container.
+// ValuesIn(Iterator begin, Iterator end)
+// - returns a generator producing sequences with elements from
+// a range [begin, end) defined by a pair of STL-style iterators. These
+// iterators can also be plain C pointers.
+//
+// Please note that ValuesIn copies the values from the containers
+// passed in and keeps them to generate tests in RUN_ALL_TESTS().
+//
+// Examples:
+//
+// This instantiates tests from test case StringTest
+// each with C-string values of "foo", "bar", and "baz":
+//
+// const char* strings[] = {"foo", "bar", "baz"};
+// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings));
+//
+// This instantiates tests from test case StlStringTest
+// each with STL strings with values "a" and "b":
+//
+// ::std::vector< ::std::string> GetParameterStrings() {
+// ::std::vector< ::std::string> v;
+// v.push_back("a");
+// v.push_back("b");
+// return v;
+// }
+//
+// INSTANTIATE_TEST_CASE_P(CharSequence,
+// StlStringTest,
+// ValuesIn(GetParameterStrings()));
+//
+//
+// This will also instantiate tests from CharTest
+// each with parameter values 'a' and 'b':
+//
+// ::std::list<char> GetParameterChars() {
+// ::std::list<char> list;
+// list.push_back('a');
+// list.push_back('b');
+// return list;
+// }
+// ::std::list<char> l = GetParameterChars();
+// INSTANTIATE_TEST_CASE_P(CharSequence2,
+// CharTest,
+// ValuesIn(l.begin(), l.end()));
+//
+template <typename ForwardIterator>
+internal::ParamGenerator<
+ typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end) {
+ typedef typename ::testing::internal::IteratorTraits<ForwardIterator>
+ ::value_type ParamType;
+ return internal::ParamGenerator<ParamType>(
+ new internal::ValuesInIteratorRangeGenerator<ParamType>(begin, end));
+}
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]) {
+ return ValuesIn(array, array + N);
+}
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+ const Container& container) {
+ return ValuesIn(container.begin(), container.end());
+}
+
+// Values() allows generating tests from an explicitly specified list of
+// parameters.
+//
+// Synopsis:
+// Values(T v1, T v2, ..., T vN)
+// - returns a generator producing sequences with elements v1, v2, ..., vN.
+//
+// For example, this instantiates tests from test case BarTest each
+// with values "one", "two", and "three":
+//
+// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three"));
+//
+// This instantiates tests from test case BazTest each with values 1, 2, and 3.5.
+// The exact type of values will depend on the type of parameter in BazTest.
+//
+// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5));
+//
+// Currently, Values() supports from 1 to $n parameters.
+//
+$range i 1..n
+$for i [[
+$range j 1..i
+
+template <$for j, [[typename T$j]]>
+internal::ValueArray$i<$for j, [[T$j]]> Values($for j, [[T$j v$j]]) {
+ return internal::ValueArray$i<$for j, [[T$j]]>($for j, [[v$j]]);
+}
+
+]]
+
+// Bool() allows generating tests with parameters from the set {false, true}.
+//
+// Synopsis:
+// Bool()
+// - returns a generator producing sequences with elements {false, true}.
+//
+// It is useful when testing code that depends on Boolean flags. Combinations
+// of multiple flags can be tested when several Bool()'s are combined using
+// the Combine() function.
+//
+// In the following example all tests in the test case FlagDependentTest
+// will be instantiated twice with parameters false and true.
+//
+// class FlagDependentTest : public testing::TestWithParam<bool> {
+// virtual void SetUp() {
+// external_flag = GetParam();
+// }
+// }
+// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool());
+//
+inline internal::ParamGenerator<bool> Bool() {
+ return Values(false, true);
+}
+
+# if GTEST_HAS_COMBINE
+// Combine() allows the user to combine two or more sequences to produce
+// values of a Cartesian product of those sequences' elements.
+//
+// Synopsis:
+// Combine(gen1, gen2, ..., genN)
+// - returns a generator producing sequences with elements coming from
+// the Cartesian product of elements from the sequences generated by
+// gen1, gen2, ..., genN. The sequence elements will have a type of
+// tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
+// of elements from sequences produced by gen1, gen2, ..., genN.
+//
+// Combine can have up to $maxtuple arguments. This number is currently limited
+// by the maximum number of elements in the tuple implementation used by Google
+// Test.
+//
+// Example:
+//
+// This will instantiate tests in test case AnimalTest, each with
+// the parameter values tuple("cat", BLACK), tuple("cat", WHITE),
+// tuple("dog", BLACK), and tuple("dog", WHITE):
+//
+// enum Color { BLACK, GRAY, WHITE };
+// class AnimalTest
+// : public testing::TestWithParam<tuple<const char*, Color> > {...};
+//
+// TEST_P(AnimalTest, AnimalLooksNice) {...}
+//
+// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest,
+// Combine(Values("cat", "dog"),
+// Values(BLACK, WHITE)));
+//
+// This will instantiate tests in FlagDependentTest with all variations of two
+// Boolean flags:
+//
+// class FlagDependentTest
+// : public testing::TestWithParam<tuple<bool, bool> > {
+// virtual void SetUp() {
+// // Assigns external_flag_1 and external_flag_2 values from the tuple.
+// tie(external_flag_1, external_flag_2) = GetParam();
+// }
+// };
+//
+// TEST_P(FlagDependentTest, TestFeature1) {
+// // Test your code using external_flag_1 and external_flag_2 here.
+// }
+// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest,
+// Combine(Bool(), Bool()));
+//
+$range i 2..maxtuple
+$for i [[
+$range j 1..i
+
+template <$for j, [[typename Generator$j]]>
+internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine(
+ $for j, [[const Generator$j& g$j]]) {
+ return internal::CartesianProductHolder$i<$for j, [[Generator$j]]>(
+ $for j, [[g$j]]);
+}
+
+]]
+# endif // GTEST_HAS_COMBINE
+
+
+
+# define TEST_P(test_case_name, test_name) \
+ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+ : public test_case_name { \
+ public: \
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \
+ virtual void TestBody(); \
+ private: \
+ static int AddToRegistry() { \
+ ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+ GetTestCasePatternHolder<test_case_name>(\
+ #test_case_name, \
+ ::testing::internal::CodeLocation(\
+ __FILE__, __LINE__))->AddTestPattern(\
+ #test_case_name, \
+ #test_name, \
+ new ::testing::internal::TestMetaFactory< \
+ GTEST_TEST_CLASS_NAME_(\
+ test_case_name, test_name)>()); \
+ return 0; \
+ } \
+ static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \
+ }; \
+ int GTEST_TEST_CLASS_NAME_(test_case_name, \
+ test_name)::gtest_registering_dummy_ = \
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \
+ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
+// The optional last argument to INSTANTIATE_TEST_CASE_P allows the user
+// to specify a function or functor that generates custom test name suffixes
+// based on the test parameters. The function should accept one argument of
+// type testing::TestParamInfo<class ParamType>, and return std::string.
+//
+// testing::PrintToStringParamName is a builtin test suffix generator that
+// returns the value of testing::PrintToString(GetParam()).
+//
+// Note: test names must be non-empty, unique, and may only contain ASCII
+// alphanumeric characters or underscore. Because PrintToString adds quotes
+// to std::string and C strings, it won't work for these types.
+
+# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \
+ ::testing::internal::ParamGenerator<test_case_name::ParamType> \
+ gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \
+ ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \
+ const ::testing::TestParamInfo<test_case_name::ParamType>& info) { \
+ return ::testing::internal::GetParamNameGen<test_case_name::ParamType> \
+ (__VA_ARGS__)(info); \
+ } \
+ int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \
+ GetTestCasePatternHolder<test_case_name>(\
+ #test_case_name, \
+ ::testing::internal::CodeLocation(\
+ __FILE__, __LINE__))->AddTestCaseInstantiation(\
+ #prefix, \
+ &gtest_##prefix##test_case_name##_EvalGenerator_, \
+ &gtest_##prefix##test_case_name##_EvalGenerateName_, \
+ __FILE__, __LINE__)
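+
+// A hedged usage sketch of the optional name-generator argument (WidthTest
+// and WidthName below are hypothetical, not part of this header):
+//
+//   class WidthTest : public ::testing::TestWithParam<int> {};
+//
+//   TEST_P(WidthTest, DoesFit) { EXPECT_LE(GetParam(), 32); }
+//
+//   std::string WidthName(const ::testing::TestParamInfo<int>& info) {
+//     return "Width" + ::testing::PrintToString(info.param);
+//   }
+//
+//   // Produces test names such as Sizes/WidthTest.DoesFit/Width8.
+//   INSTANTIATE_TEST_CASE_P(Sizes, WidthTest,
+//                           ::testing::Values(8, 16, 32), WidthName);
+//
+//   // Or use the built-in generator, which names tests by their value:
+//   INSTANTIATE_TEST_CASE_P(SizesByValue, WidthTest,
+//                           ::testing::Values(8, 16, 32),
+//                           ::testing::PrintToStringParamName());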
+
+} // namespace testing
+
+#endif // GTEST_HAS_PARAM_TEST
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
new file mode 100644
index 000000000..8a33164cb
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-printers.h
@@ -0,0 +1,993 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+// void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// A user can teach this function how to print a class type T by
+// defining either operator<<() or PrintTo() in the namespace that
+// defines T. More specifically, the FIRST defined function in the
+// following list will be used (assuming T is defined in namespace
+// foo):
+//
+// 1. foo::PrintTo(const T&, ostream*)
+// 2. operator<<(ostream&, const T&) defined in either foo or the
+// global namespace.
+//
+// If none of the above is defined, it will print the debug string of
+// the value if it is a protocol buffer, or print the raw bytes in the
+// value otherwise.
+//
+// To aid debugging: when T is a reference type, the address of the
+// value is also printed; when T is a (const) char pointer, both the
+// pointer value and the NUL-terminated string it points to are
+// printed.
+//
+// We also provide some convenient wrappers:
+//
+// // Prints a value to a string. For a (const or not) char
+// // pointer, the NUL-terminated string (but not the pointer) is
+// // printed.
+// std::string ::testing::PrintToString(const T& value);
+//
+// // Prints a value tersely: for a reference type, the referenced
+// // value (but not the address) is printed; for a (const or not) char
+// // pointer, the NUL-terminated string (but not the pointer) is
+// // printed.
+// void ::testing::internal::UniversalTersePrint(const T& value, ostream*);
+//
+// // Prints value using the type inferred by the compiler. The difference
+// // from UniversalTersePrint() is that this function prints both the
+// // pointer and the NUL-terminated string for a (const or not) char pointer.
+// void ::testing::internal::UniversalPrint(const T& value, ostream*);
+//
+// // Prints the fields of a tuple tersely to a string vector, one
+// // element for each field. Tuple support must be enabled in
+// // gtest-port.h.
+// std::vector<string> UniversalTersePrintTupleFieldsToStrings(
+// const Tuple& value);
+//
+// Known limitation:
+//
+// The print primitives print the elements of an STL-style container
+// using the compiler-inferred type of *iter where iter is a
+// const_iterator of the container. When const_iterator is an input
+// iterator but not a forward iterator, this inferred type may not
+// match value_type, and the print output may be incorrect. In
+// practice, this is rarely a problem as for most containers
+// const_iterator is a forward iterator. We'll fix this if there's an
+// actual need for it. Note that this fix cannot rely on value_type
+// being defined as many user-defined container types don't have
+// value_type.
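+//
+// As an illustrative sketch (namespace foo and struct Point are hypothetical,
+// not part of Google Test), teaching the printer about a user type could look
+// like this:
+//
+//   namespace foo {
+//
+//   struct Point { int x; int y; };
+//
+//   // Picked up by assertion failure messages and by PrintToString().
+//   void PrintTo(const Point& p, ::std::ostream* os) {
+//     *os << "(" << p.x << ", " << p.y << ")";
+//   }
+//
+//   }  // namespace foo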
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+
+#include <ostream> // NOLINT
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/gtest-internal.h"
+
+#if GTEST_HAS_STD_TUPLE_
+# include <tuple>
+#endif
+
+namespace testing {
+
+// Definitions in the 'internal' and 'internal2' name spaces are
+// subject to change without notice. DO NOT USE THEM IN USER CODE!
+namespace internal2 {
+
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+ size_t count,
+ ::std::ostream* os);
+
+// For selecting which printer to use when a given type has neither <<
+// nor PrintTo().
+enum TypeKind {
+ kProtobuf, // a protobuf type
+ kConvertibleToInteger, // a type implicitly convertible to BiggestInt
+ // (e.g. a named or unnamed enum type)
+ kOtherType // anything else
+};
+
+// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
+// by the universal printer to print a value of type T when neither
+// operator<< nor PrintTo() is defined for T, where kTypeKind is the
+// "kind" of T as defined by enum TypeKind.
+template <typename T, TypeKind kTypeKind>
+class TypeWithoutFormatter {
+ public:
+ // This default version is called when kTypeKind is kOtherType.
+ static void PrintValue(const T& value, ::std::ostream* os) {
+ PrintBytesInObjectTo(reinterpret_cast<const unsigned char*>(&value),
+ sizeof(value), os);
+ }
+};
+
+// We print a protobuf using its ShortDebugString() when the string
+// doesn't exceed this many characters; otherwise we print it using
+// DebugString() for better readability.
+const size_t kProtobufOneLinerMaxLength = 50;
+
+template <typename T>
+class TypeWithoutFormatter<T, kProtobuf> {
+ public:
+ static void PrintValue(const T& value, ::std::ostream* os) {
+ const ::testing::internal::string short_str = value.ShortDebugString();
+ const ::testing::internal::string pretty_str =
+ short_str.length() <= kProtobufOneLinerMaxLength ?
+ short_str : ("\n" + value.DebugString());
+ *os << ("<" + pretty_str + ">");
+ }
+};
+
+template <typename T>
+class TypeWithoutFormatter<T, kConvertibleToInteger> {
+ public:
+ // Since T has no << operator or PrintTo() but can be implicitly
+ // converted to BiggestInt, we print it as a BiggestInt.
+ //
+ // Most likely T is an enum type (either named or unnamed), in which
+ // case printing it as an integer is the desired behavior. In case
+ // T is not an enum, printing it as an integer is the best we can do
+ // given that it has no user-defined printer.
+ static void PrintValue(const T& value, ::std::ostream* os) {
+ const internal::BiggestInt kBigInt = value;
+ *os << kBigInt;
+ }
+};
+
+// Prints the given value to the given ostream. If the value is a
+// protocol message, its debug string is printed; if it's an enum or
+// of a type implicitly convertible to BiggestInt, it's printed as an
+// integer; otherwise the bytes in the value are printed. This is
+// what UniversalPrinter<T>::Print() does when it knows nothing about
+// type T and T has neither << operator nor PrintTo().
+//
+// A user can override this behavior for a class type Foo by defining
+// a << operator in the namespace where Foo is defined.
+//
+// We put this operator in namespace 'internal2' instead of 'internal'
+// to simplify the implementation, as much code in 'internal' needs to
+// use << in STL, which would conflict with our own << were it defined
+// in 'internal'.
+//
+// Note that this operator<< takes a generic std::basic_ostream<Char,
+// CharTraits> type instead of the more restricted std::ostream. If
+// we define it to take an std::ostream instead, we'll get an
+// "ambiguous overloads" compiler error when trying to print a type
+// Foo that supports streaming to std::basic_ostream<Char,
+// CharTraits>, as the compiler cannot tell whether
+// operator<<(std::ostream&, const T&) or
+// operator<<(std::basic_ostream<Char, CharTraits>&, const Foo&) is more
+// specific.
+template <typename Char, typename CharTraits, typename T>
+::std::basic_ostream<Char, CharTraits>& operator<<(
+ ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
+ TypeWithoutFormatter<T,
+ (internal::IsAProtocolMessage<T>::value ? kProtobuf :
+ internal::ImplicitlyConvertible<const T&, internal::BiggestInt>::value ?
+ kConvertibleToInteger : kOtherType)>::PrintValue(x, &os);
+ return os;
+}
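+
+// As a hedged illustration (bar::Color is hypothetical): a user-supplied
+// operator<< declared in the type's own namespace is more specific than the
+// fallback above and will be picked instead of it.
+//
+//   namespace bar {
+//   enum Color { kRed, kBlue };
+//   inline ::std::ostream& operator<<(::std::ostream& os, Color c) {
+//     return os << (c == kRed ? "kRed" : "kBlue");
+//   }
+//   }  // namespace bar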
+
+} // namespace internal2
+} // namespace testing
+
+// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
+// magic needed for implementing UniversalPrinter won't work.
+namespace testing_internal {
+
+// Used to print a value that is not an STL-style container when the
+// user doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
+ // With the following statement, during unqualified name lookup,
+ // testing::internal2::operator<< appears as if it was declared in
+ // the nearest enclosing namespace that contains both
+ // ::testing_internal and ::testing::internal2, i.e. the global
+ // namespace. For more details, refer to the C++ Standard section
+ // 7.3.4-1 [namespace.udir]. This allows us to fall back onto
+ // testing::internal2::operator<< in case T doesn't come with a <<
+ // operator.
+ //
+ // We cannot write 'using ::testing::internal2::operator<<;', which
+ // gcc 3.3 fails to compile due to a compiler bug.
+ using namespace ::testing::internal2; // NOLINT
+
+ // Assuming T is defined in namespace foo, in the next statement,
+ // the compiler will consider all of:
+ //
+ // 1. foo::operator<< (thanks to Koenig look-up),
+ // 2. ::operator<< (as the current namespace is enclosed in ::),
+ // 3. testing::internal2::operator<< (thanks to the using statement above).
+ //
+ // The operator<< whose type matches T best will be picked.
+ //
+ // We deliberately allow #2 to be a candidate, as sometimes it's
+ // impossible to define #1 (e.g. when foo is ::std, defining
+ // anything in it is undefined behavior unless you are a compiler
+ // vendor).
+ *os << value;
+}
+
+} // namespace testing_internal
+
+namespace testing {
+namespace internal {
+
+// FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
+// value of type ToPrint that is an operand of a comparison assertion
+// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in
+// the comparison, and is used to help determine the best way to
+// format the value. In particular, when the value is a C string
+// (char pointer) and the other operand is an STL string object, we
+// want to format the C string as a string, since we know it is
+// compared by value with the string object. If the value is a char
+// pointer but the other operand is not an STL string object, we don't
+// know whether the pointer is supposed to point to a NUL-terminated
+// string, and thus want to print it as a pointer to be safe.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// The default case.
+template <typename ToPrint, typename OtherOperand>
+class FormatForComparison {
+ public:
+ static ::std::string Format(const ToPrint& value) {
+ return ::testing::PrintToString(value);
+ }
+};
+
+// Array.
+template <typename ToPrint, size_t N, typename OtherOperand>
+class FormatForComparison<ToPrint[N], OtherOperand> {
+ public:
+ static ::std::string Format(const ToPrint* value) {
+ return FormatForComparison<const ToPrint*, OtherOperand>::Format(value);
+ }
+};
+
+// By default, print C strings as pointers to be safe, as we don't know
+// whether they actually point to a NUL-terminated string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \
+ template <typename OtherOperand> \
+ class FormatForComparison<CharType*, OtherOperand> { \
+ public: \
+ static ::std::string Format(CharType* value) { \
+ return ::testing::PrintToString(static_cast<const void*>(value)); \
+ } \
+ }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
+
+// If a C string is compared with an STL string object, we know it's meant
+// to point to a NUL-terminated string, and thus can print it as a string.
+
+#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
+ template <> \
+ class FormatForComparison<CharType*, OtherStringType> { \
+ public: \
+ static ::std::string Format(CharType* value) { \
+ return ::testing::PrintToString(value); \
+ } \
+ }
+
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::string);
+#endif
+
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::wstring);
+#endif
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
+#endif
+
+#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_
+
+// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, etc.)
+// operand to be used in a failure message. The type (but not value)
+// of the other operand may affect the format. This allows us to
+// print a char* as a raw pointer when it is compared against another
+// char* or void*, and print it as a C string when it is compared
+// against an std::string object, for example.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename T1, typename T2>
+std::string FormatForComparisonFailureMessage(
+ const T1& value, const T2& /* other_operand */) {
+ return FormatForComparison<T1, T2>::Format(value);
+}
+
+// UniversalPrinter<T>::Print(value, ostream_ptr) prints the given
+// value to the given ostream. The caller must ensure that
+// 'ostream_ptr' is not NULL, or the behavior is undefined.
+//
+// We define UniversalPrinter as a class template (as opposed to a
+// function template), as we need to partially specialize it for
+// reference types, which cannot be done with function templates.
+template <typename T>
+class UniversalPrinter;
+
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
+
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+template <typename C>
+void DefaultPrintTo(IsContainer /* dummy */,
+ false_type /* is not a pointer */,
+ const C& container, ::std::ostream* os) {
+ const size_t kMaxCount = 32; // The maximum number of elements to print.
+ *os << '{';
+ size_t count = 0;
+ for (typename C::const_iterator it = container.begin();
+ it != container.end(); ++it, ++count) {
+ if (count > 0) {
+ *os << ',';
+ if (count == kMaxCount) { // Enough has been printed.
+ *os << " ...";
+ break;
+ }
+ }
+ *os << ' ';
+ // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
+ // handle *it being a native array.
+ internal::UniversalPrint(*it, os);
+ }
+
+ if (count > 0) {
+ *os << ' ';
+ }
+ *os << '}';
+}
+
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it. (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space; its representation is
+// implementation-defined, so it will be printed as raw
+// bytes.)
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+ true_type /* is a pointer */,
+ T* p, ::std::ostream* os) {
+ if (p == NULL) {
+ *os << "NULL";
+ } else {
+ // C++ doesn't allow casting from a function pointer to any object
+ // pointer.
+ //
+ // IsTrue() silences warnings: "Condition is always true",
+ // "unreachable code".
+ if (IsTrue(ImplicitlyConvertible<T*, const void*>::value)) {
+ // T is not a function type. We just call << to print p,
+ // relying on ADL to pick up user-defined << for their pointer
+ // types, if any.
+ *os << p;
+ } else {
+ // T is a function type, so '*os << p' doesn't do what we want
+ // (it just prints p as bool). We want to print p as a const
+ // void*. However, we cannot cast it to const void* directly,
+ // even using reinterpret_cast, as earlier versions of gcc
+ // (e.g. 3.4.5) cannot compile the cast when p is a function
+ // pointer. Casting to UInt64 first solves the problem.
+ *os << reinterpret_cast<const void*>(
+ reinterpret_cast<internal::UInt64>(p));
+ }
+ }
+}
+
+// Used to print a non-container, non-pointer value when the user
+// doesn't define PrintTo() for it.
+template <typename T>
+void DefaultPrintTo(IsNotContainer /* dummy */,
+ false_type /* is not a pointer */,
+ const T& value, ::std::ostream* os) {
+ ::testing_internal::DefaultPrintNonContainerTo(value, os);
+}
+
+// Prints the given value using the << operator if it has one;
+// otherwise prints the bytes in it. This is what
+// UniversalPrinter<T>::Print() does when PrintTo() is not specialized
+// or overloaded for type T.
+//
+// A user can override this behavior for a class type Foo by defining
+// an overload of PrintTo() in the namespace where Foo is defined. We
+// give the user this option as sometimes defining a << operator for
+// Foo is not desirable (e.g. the coding style may prevent doing it,
+// or there is already a << operator but it doesn't do what the user
+// wants).
+template <typename T>
+void PrintTo(const T& value, ::std::ostream* os) {
+ // DefaultPrintTo() is overloaded. The types of its first two
+ // arguments determine which version will be picked. If T is an
+ // STL-style container, the version for container will be called; if
+ // T is a pointer, the pointer version will be called; otherwise the
+ // generic version will be called.
+ //
+ // Note that we check for container types here, before we check
+ // for protocol message types in our operator<<. The rationale is:
+ //
+ // For protocol messages, we want to give people a chance to
+ // override Google Mock's format by defining a PrintTo() or
+ // operator<<. For STL containers, other formats can be
+ // incompatible with Google Mock's format for the container
+ // elements; therefore we check for container types here to ensure
+ // that our format is used.
+ //
+ // The second argument of DefaultPrintTo() is needed to bypass a bug
+ // in Symbian's C++ compiler that prevents it from picking the right
+ // overload between:
+ //
+ // PrintTo(const T& x, ...);
+ // PrintTo(T* x, ...);
+ DefaultPrintTo(IsContainerTest<T>(0), is_pointer<T>(), value, os);
+}
+
+// The following list of PrintTo() overloads tells
+// UniversalPrinter<T>::Print() how to print standard types (built-in
+// types, strings, plain arrays, and pointers).
+
+// Overloads for various char types.
+GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os);
+GTEST_API_ void PrintTo(signed char c, ::std::ostream* os);
+inline void PrintTo(char c, ::std::ostream* os) {
+ // When printing a plain char, we always treat it as unsigned. This
+ // way, the output won't be affected by whether the compiler thinks
+ // char is signed or not.
+ PrintTo(static_cast<unsigned char>(c), os);
+}
+
+// Overloads for other simple built-in types.
+inline void PrintTo(bool x, ::std::ostream* os) {
+ *os << (x ? "true" : "false");
+}
+
+// Overload for wchar_t type.
+// Prints a wchar_t as a symbol if it is printable, or as its internal
+// code otherwise, and also prints its decimal code (except for L'\0').
+// The L'\0' char is printed as "L'\\0'". The decimal code is printed
+// as signed integer when wchar_t is implemented by the compiler
+// as a signed type and is printed as an unsigned integer when wchar_t
+// is implemented as an unsigned type.
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
+
+// Overloads for C strings.
+GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
+inline void PrintTo(char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const char*>(s), os);
+}
+
+// signed/unsigned char is often used for representing binary data, so
+// we print pointers to it as void* to be safe.
+inline void PrintTo(const signed char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(signed char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(const unsigned char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const void*>(s), os);
+}
+inline void PrintTo(unsigned char* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const void*>(s), os);
+}
+
+// MSVC can be configured to define wchar_t as a typedef of unsigned
+// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
+// type. When wchar_t is a typedef, defining an overload for const
+// wchar_t* would cause unsigned short* to be printed as a wide string,
+// possibly causing invalid memory accesses.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Overloads for wide C strings
+GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os);
+inline void PrintTo(wchar_t* s, ::std::ostream* os) {
+ PrintTo(ImplicitCast_<const wchar_t*>(s), os);
+}
+#endif
+
+// Overload for C arrays. Multi-dimensional arrays are printed
+// properly.
+
+// Prints the given number of elements in an array, without printing
+// the curly braces.
+template <typename T>
+void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
+ UniversalPrint(a[0], os);
+ for (size_t i = 1; i != count; i++) {
+ *os << ", ";
+ UniversalPrint(a[i], os);
+ }
+}
+
+// Overloads for ::string and ::std::string.
+#if GTEST_HAS_GLOBAL_STRING
+GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os);
+inline void PrintTo(const ::string& s, ::std::ostream* os) {
+ PrintStringTo(s, os);
+}
+#endif // GTEST_HAS_GLOBAL_STRING
+
+GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
+ PrintStringTo(s, os);
+}
+
+// Overloads for ::wstring and ::std::wstring.
+#if GTEST_HAS_GLOBAL_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::wstring& s, ::std::ostream* os) {
+ PrintWideStringTo(s, os);
+}
+#endif // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
+ PrintWideStringTo(s, os);
+}
+#endif // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+// Helper function for printing a tuple. T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T& t, ::std::ostream* os);
+#endif // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+
+#if GTEST_HAS_TR1_TUPLE
+// Overload for ::std::tr1::tuple. Needed for printing function arguments,
+// which are packed as tuples.
+
+// Overloaded PrintTo() for tuples of various arities. We support
+// tuples of up to 10 fields. The following implementation works
+// regardless of whether tr1::tuple is implemented using the
+// non-standard variadic template feature or not.
+
+inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1>
+void PrintTo(const ::std::tr1::tuple<T1>& t, ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2>
+void PrintTo(const ::std::tr1::tuple<T1, T2>& t, ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3>& t, ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4>& t, ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5>& t,
+ ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6>& t,
+ ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7>& t,
+ ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8>& t,
+ ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9>
+void PrintTo(const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9>& t,
+ ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10>
+void PrintTo(
+ const ::std::tr1::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10>& t,
+ ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+#endif // GTEST_HAS_TR1_TUPLE
+
+#if GTEST_HAS_STD_TUPLE_
+template <typename... Types>
+void PrintTo(const ::std::tuple<Types...>& t, ::std::ostream* os) {
+ PrintTupleTo(t, os);
+}
+#endif // GTEST_HAS_STD_TUPLE_
+
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2>& value, ::std::ostream* os) {
+ *os << '(';
+ // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+ // a reference type. The same for printing value.second.
+ UniversalPrinter<T1>::Print(value.first, os);
+ *os << ", ";
+ UniversalPrinter<T2>::Print(value.second, os);
+ *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+ // MSVC warns about adding const to a function type, so we want to
+ // disable the warning.
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+ // Note: we deliberately don't call this PrintTo(), as that name
+ // conflicts with ::testing::internal::PrintTo in the body of the
+ // function.
+ static void Print(const T& value, ::std::ostream* os) {
+ // By default, ::testing::internal::PrintTo() is used for printing
+ // the value.
+ //
+ // Thanks to Koenig look-up, if T is a class and has its own
+ // PrintTo() function defined in its namespace, that function will
+ // be visible here. Since it is more specific than the generic ones
+ // in ::testing::internal, it will be picked by the compiler in the
+ // following statement - exactly what we want.
+ PrintTo(value, os);
+ }
+
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
+ if (len == 0) {
+ *os << "{}";
+ } else {
+ *os << "{ ";
+ const size_t kThreshold = 18;
+ const size_t kChunkSize = 8;
+ // If the array has more than kThreshold elements, we'll have to
+ // omit some details by printing only the first and the last
+ // kChunkSize elements.
+ // TODO(wan@google.com): let the user control the threshold using a flag.
+ if (len <= kThreshold) {
+ PrintRawArrayTo(begin, len, os);
+ } else {
+ PrintRawArrayTo(begin, kChunkSize, os);
+ *os << ", ..., ";
+ PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+ }
+ *os << " }";
+ }
+}
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(
+ const char* begin, size_t len, ::std::ostream* os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(
+ const wchar_t* begin, size_t len, ::std::ostream* os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+ // Prints the given array, omitting some elements when there are too
+ // many.
+ static void Print(const T (&a)[N], ::std::ostream* os) {
+ UniversalPrintArray(a, N, os);
+ }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T&> {
+ public:
+ // MSVC warns about adding const to a function type, so we want to
+ // disable the warning.
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+ static void Print(const T& value, ::std::ostream* os) {
+ // Prints the address of the value. We use reinterpret_cast here
+ // as static_cast doesn't compile when T is a function type.
+ *os << "@" << reinterpret_cast<const void*>(&value) << " ";
+
+ // Then prints the value itself.
+ UniversalPrint(value, os);
+ }
+
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+ static void Print(const T& value, ::std::ostream* os) {
+ UniversalPrint(value, os);
+ }
+};
+template <typename T>
+class UniversalTersePrinter<T&> {
+ public:
+ static void Print(const T& value, ::std::ostream* os) {
+ UniversalPrint(value, os);
+ }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+ static void Print(const T (&value)[N], ::std::ostream* os) {
+ UniversalPrinter<T[N]>::Print(value, os);
+ }
+};
+template <>
+class UniversalTersePrinter<const char*> {
+ public:
+ static void Print(const char* str, ::std::ostream* os) {
+ if (str == NULL) {
+ *os << "NULL";
+ } else {
+ UniversalPrint(string(str), os);
+ }
+ }
+};
+template <>
+class UniversalTersePrinter<char*> {
+ public:
+ static void Print(char* str, ::std::ostream* os) {
+ UniversalTersePrinter<const char*>::Print(str, os);
+ }
+};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t*> {
+ public:
+ static void Print(const wchar_t* str, ::std::ostream* os) {
+ if (str == NULL) {
+ *os << "NULL";
+ } else {
+ UniversalPrint(::std::wstring(str), os);
+ }
+ }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t*> {
+ public:
+ static void Print(wchar_t* str, ::std::ostream* os) {
+ UniversalTersePrinter<const wchar_t*>::Print(str, os);
+ }
+};
+
+template <typename T>
+void UniversalTersePrint(const T& value, ::std::ostream* os) {
+ UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler. The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os) {
+ // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
+ // UniversalPrinter with T directly.
+ typedef T T1;
+ UniversalPrinter<T1>::Print(value, os);
+}
+
+typedef ::std::vector<string> Strings;
+
+// TuplePolicy<TupleT> must provide:
+// - tuple_size
+// size of tuple TupleT.
+// - get<size_t I>(const TupleT& t)
+// static function extracting element I of tuple TupleT.
+// - tuple_element<size_t I>::type
+// type of element I of tuple TupleT.
+template <typename TupleT>
+struct TuplePolicy;
+
+#if GTEST_HAS_TR1_TUPLE
+template <typename TupleT>
+struct TuplePolicy {
+ typedef TupleT Tuple;
+ static const size_t tuple_size = ::std::tr1::tuple_size<Tuple>::value;
+
+ template <size_t I>
+ struct tuple_element : ::std::tr1::tuple_element<I, Tuple> {};
+
+ template <size_t I>
+ static typename AddReference<
+ const typename ::std::tr1::tuple_element<I, Tuple>::type>::type get(
+ const Tuple& tuple) {
+ return ::std::tr1::get<I>(tuple);
+ }
+};
+template <typename TupleT>
+const size_t TuplePolicy<TupleT>::tuple_size;
+#endif // GTEST_HAS_TR1_TUPLE
+
+#if GTEST_HAS_STD_TUPLE_
+template <typename... Types>
+struct TuplePolicy< ::std::tuple<Types...> > {
+ typedef ::std::tuple<Types...> Tuple;
+ static const size_t tuple_size = ::std::tuple_size<Tuple>::value;
+
+ template <size_t I>
+ struct tuple_element : ::std::tuple_element<I, Tuple> {};
+
+ template <size_t I>
+ static const typename ::std::tuple_element<I, Tuple>::type& get(
+ const Tuple& tuple) {
+ return ::std::get<I>(tuple);
+ }
+};
+template <typename... Types>
+const size_t TuplePolicy< ::std::tuple<Types...> >::tuple_size;
+#endif // GTEST_HAS_STD_TUPLE_
+
+#if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+// This helper template allows PrintTo() for tuples and
+// UniversalTersePrintTupleFieldsToStrings() to be defined by
+// induction on the number of tuple fields. The idea is that
+// TuplePrefixPrinter<N>::PrintPrefixTo(t, os) prints the first N
+// fields in tuple t, and can be defined in terms of
+// TuplePrefixPrinter<N - 1>.
+//
+// The inductive case.
+template <size_t N>
+struct TuplePrefixPrinter {
+ // Prints the first N fields of a tuple.
+ template <typename Tuple>
+ static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) {
+ TuplePrefixPrinter<N - 1>::PrintPrefixTo(t, os);
+ GTEST_INTENTIONAL_CONST_COND_PUSH_()
+ if (N > 1) {
+ GTEST_INTENTIONAL_CONST_COND_POP_()
+ *os << ", ";
+ }
+ UniversalPrinter<
+ typename TuplePolicy<Tuple>::template tuple_element<N - 1>::type>
+ ::Print(TuplePolicy<Tuple>::template get<N - 1>(t), os);
+ }
+
+ // Tersely prints the first N fields of a tuple to a string vector,
+ // one element for each field.
+ template <typename Tuple>
+ static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) {
+ TuplePrefixPrinter<N - 1>::TersePrintPrefixToStrings(t, strings);
+ ::std::stringstream ss;
+ UniversalTersePrint(TuplePolicy<Tuple>::template get<N - 1>(t), &ss);
+ strings->push_back(ss.str());
+ }
+};
+
+// Base case.
+template <>
+struct TuplePrefixPrinter<0> {
+ template <typename Tuple>
+ static void PrintPrefixTo(const Tuple&, ::std::ostream*) {}
+
+ template <typename Tuple>
+ static void TersePrintPrefixToStrings(const Tuple&, Strings*) {}
+};
+
+// Helper function for printing a tuple.
+// Tuple must be either a std::tr1::tuple or a std::tuple type.
+template <typename Tuple>
+void PrintTupleTo(const Tuple& t, ::std::ostream* os) {
+ *os << "(";
+ TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::PrintPrefixTo(t, os);
+ *os << ")";
+}
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field. See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) {
+ Strings result;
+ TuplePrefixPrinter<TuplePolicy<Tuple>::tuple_size>::
+ TersePrintPrefixToStrings(value, &result);
+ return result;
+}
+#endif // GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_
+
+} // namespace internal
+
+template <typename T>
+::std::string PrintToString(const T& value) {
+ ::std::stringstream ss;
+ internal::UniversalTersePrinter<T>::Print(value, &ss);
+ return ss.str();
+}
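+
+// For illustration (reusing the hypothetical foo::Point sketched near the top
+// of this header), PrintToString() can be called directly from user code:
+//
+//   foo::Point p = {1, 2};
+//   std::string s = ::testing::PrintToString(p);   // "(1, 2)"
+//   std::string n = ::testing::PrintToString(42);  // "42"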
+
+} // namespace testing
+
+// Include any custom printer added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+#include "gtest/internal/custom/gtest-printers.h"
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
new file mode 100644
index 000000000..f63fa9a1b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
@@ -0,0 +1,232 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+
+#include "gtest/gtest.h"
+
+namespace testing {
+
+// This helper class can be used to mock out Google Test failure reporting
+// so that we can test Google Test or code that builds on Google Test.
+//
+// An object of this class appends a TestPartResult object to the
+// TestPartResultArray object given in the constructor whenever a Google Test
+// failure is reported. It can either intercept only failures that are
+// generated in the same thread that created this object or it can intercept
+// all generated failures. The scope of this mock object can be controlled with
+// the second argument of the two-argument constructor.
+class GTEST_API_ ScopedFakeTestPartResultReporter
+ : public TestPartResultReporterInterface {
+ public:
+ // The two possible mocking modes of this object.
+ enum InterceptMode {
+ INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures.
+ INTERCEPT_ALL_THREADS // Intercepts all failures.
+ };
+
+ // The c'tor sets this object as the test part result reporter used
+ // by Google Test. The 'result' parameter specifies where to report the
+ // results. This reporter will only catch failures generated in the current
+ // thread. DEPRECATED
+ explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result);
+
+ // Same as above, but you can choose the interception scope of this object.
+ ScopedFakeTestPartResultReporter(InterceptMode intercept_mode,
+ TestPartResultArray* result);
+
+ // The d'tor restores the previous test part result reporter.
+ virtual ~ScopedFakeTestPartResultReporter();
+
+ // Appends the TestPartResult object to the TestPartResultArray
+ // received in the constructor.
+ //
+ // This method is from the TestPartResultReporterInterface
+ // interface.
+ virtual void ReportTestPartResult(const TestPartResult& result);
+ private:
+ void Init();
+
+ const InterceptMode intercept_mode_;
+ TestPartResultReporterInterface* old_reporter_;
+ TestPartResultArray* const result_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+};
+
+namespace internal {
+
+// A helper class for implementing EXPECT_FATAL_FAILURE() and
+// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring. If that's not the case, a
+// non-fatal failure will be generated.
+class GTEST_API_ SingleFailureChecker {
+ public:
+ // The constructor remembers the arguments.
+ SingleFailureChecker(const TestPartResultArray* results,
+ TestPartResult::Type type,
+ const string& substr);
+ ~SingleFailureChecker();
+ private:
+ const TestPartResultArray* const results_;
+ const TestPartResult::Type type_;
+ const string substr_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+};
+
+} // namespace internal
+
+} // namespace testing
+
+// A set of macros for testing Google Test assertions or code that's expected
+// to generate Google Test fatal failures. It verifies that the given
+// statement will cause exactly one fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread, while
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+// - 'statement' cannot reference local non-static variables or
+// non-static members of the current object.
+// - 'statement' cannot return a value.
+// - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works. The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper {\
+ public:\
+ static void Execute() { statement; }\
+ };\
+ ::testing::TestPartResultArray gtest_failures;\
+ ::testing::internal::SingleFailureChecker gtest_checker(\
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+ {\
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+ GTestExpectFatalFailureHelper::Execute();\
+ }\
+ } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper {\
+ public:\
+ static void Execute() { statement; }\
+ };\
+ ::testing::TestPartResultArray gtest_failures;\
+ ::testing::internal::SingleFailureChecker gtest_checker(\
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
+ {\
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ALL_THREADS, &gtest_failures);\
+ GTestExpectFatalFailureHelper::Execute();\
+ }\
+ } while (::testing::internal::AlwaysFalse())
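+
+// A minimal usage sketch (Helper() and the test name are hypothetical): the
+// wrapped statement is expected to trigger exactly one fatal failure whose
+// message contains the given substring.
+//
+//   void Helper() {
+//     ASSERT_EQ(1, 2) << "helper failed";
+//   }
+//
+//   TEST(FailureSpiTest, CatchesFatalFailure) {
+//     EXPECT_FATAL_FAILURE(Helper(), "helper failed");
+//   }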
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures. It asserts that the given
+// statement will cause exactly one non-fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread, while
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+// - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works. If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma. The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
+//
+// For the same reason, we have to write
+// if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+ do {\
+ ::testing::TestPartResultArray gtest_failures;\
+ ::testing::internal::SingleFailureChecker gtest_checker(\
+ &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+ (substr));\
+ {\
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
+ if (::testing::internal::AlwaysTrue()) { statement; }\
+ }\
+ } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do {\
+ ::testing::TestPartResultArray gtest_failures;\
+ ::testing::internal::SingleFailureChecker gtest_checker(\
+ &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+ (substr));\
+ {\
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+ ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+ &gtest_failures);\
+ if (::testing::internal::AlwaysTrue()) { statement; }\
+ }\
+ } while (::testing::internal::AlwaysFalse())
+
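+// --------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream gtest-spi.h): a
+// minimal example of exercising the two macro families above from a user
+// test file.  CheckPositive() and its messages are hypothetical; the
+// assertion macros used inside it come from gtest.h.
+#if 0
+
+#include "gtest/gtest.h"
+#include "gtest/gtest-spi.h"
+
+// A helper whose assertions we want to exercise.  The statement passed to
+// EXPECT_FATAL_FAILURE may not reference local non-static variables, so we
+// use a free function with literal arguments.
+void CheckPositive(int n) {
+  ASSERT_GT(n, 0) << "value must be positive";  // fatal on failure
+  EXPECT_NE(n, 42) << "value must not be 42";   // non-fatal on failure
+}
+
+TEST(FailureCheckingTest, ReportsExpectedFailures) {
+  // Exactly one fatal failure whose message contains the substring.
+  EXPECT_FATAL_FAILURE(CheckPositive(-1), "must be positive");
+  // Exactly one non-fatal failure whose message contains the substring.
+  EXPECT_NONFATAL_FAILURE(CheckPositive(42), "must not be 42");
+}
+
+#endif  // 0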
+#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
new file mode 100644
index 000000000..77eb84483
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
@@ -0,0 +1,179 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+
+#include <iosfwd>
+#include <vector>
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+namespace testing {
+
+// A copyable object representing the result of a test part (i.e. an
+// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCEED()).
+//
+// Don't inherit from TestPartResult as its destructor is not virtual.
+class GTEST_API_ TestPartResult {
+ public:
+ // The possible outcomes of a test part (i.e. an assertion or an
+ // explicit SUCCEED(), FAIL(), or ADD_FAILURE()).
+ enum Type {
+ kSuccess, // Succeeded.
+ kNonFatalFailure, // Failed but the test can continue.
+ kFatalFailure // Failed and the test should be terminated.
+ };
+
+ // C'tor. TestPartResult does NOT have a default constructor.
+ // Always use this constructor (with parameters) to create a
+ // TestPartResult object.
+ TestPartResult(Type a_type,
+ const char* a_file_name,
+ int a_line_number,
+ const char* a_message)
+ : type_(a_type),
+ file_name_(a_file_name == NULL ? "" : a_file_name),
+ line_number_(a_line_number),
+ summary_(ExtractSummary(a_message)),
+ message_(a_message) {
+ }
+
+ // Gets the outcome of the test part.
+ Type type() const { return type_; }
+
+ // Gets the name of the source file where the test part took place, or
+ // NULL if it's unknown.
+ const char* file_name() const {
+ return file_name_.empty() ? NULL : file_name_.c_str();
+ }
+
+ // Gets the line in the source file where the test part took place,
+ // or -1 if it's unknown.
+ int line_number() const { return line_number_; }
+
+ // Gets the summary of the failure message.
+ const char* summary() const { return summary_.c_str(); }
+
+ // Gets the message associated with the test part.
+ const char* message() const { return message_.c_str(); }
+
+ // Returns true iff the test part passed.
+ bool passed() const { return type_ == kSuccess; }
+
+ // Returns true iff the test part failed.
+ bool failed() const { return type_ != kSuccess; }
+
+ // Returns true iff the test part non-fatally failed.
+ bool nonfatally_failed() const { return type_ == kNonFatalFailure; }
+
+ // Returns true iff the test part fatally failed.
+ bool fatally_failed() const { return type_ == kFatalFailure; }
+
+ private:
+ Type type_;
+
+ // Gets the summary of the failure message by omitting the stack
+ // trace in it.
+ static std::string ExtractSummary(const char* message);
+
+ // The name of the source file where the test part took place, or
+ // "" if the source file is unknown.
+ std::string file_name_;
+ // The line in the source file where the test part took place, or -1
+ // if the line number is unknown.
+ int line_number_;
+ std::string summary_; // The test failure summary.
+ std::string message_; // The test failure message.
+};
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result);
+
+// An array of TestPartResult objects.
+//
+// Don't inherit from TestPartResultArray as its destructor is not
+// virtual.
+class GTEST_API_ TestPartResultArray {
+ public:
+ TestPartResultArray() {}
+
+ // Appends the given TestPartResult to the array.
+ void Append(const TestPartResult& result);
+
+ // Returns the TestPartResult at the given index (0-based).
+ const TestPartResult& GetTestPartResult(int index) const;
+
+ // Returns the number of TestPartResult objects in the array.
+ int size() const;
+
+ private:
+ std::vector<TestPartResult> array_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+};
+
+// This interface knows how to report a test part result.
+class TestPartResultReporterInterface {
+ public:
+ virtual ~TestPartResultReporterInterface() {}
+
+ virtual void ReportTestPartResult(const TestPartResult& result) = 0;
+};
+
+namespace internal {
+
+// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a
+// statement generates new fatal failures. To do so it registers itself as the
+// current test part result reporter. Besides checking if fatal failures were
+// reported, it only delegates the reporting to the former result reporter.
+// The original result reporter is restored in the destructor.
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+class GTEST_API_ HasNewFatalFailureHelper
+ : public TestPartResultReporterInterface {
+ public:
+ HasNewFatalFailureHelper();
+ virtual ~HasNewFatalFailureHelper();
+ virtual void ReportTestPartResult(const TestPartResult& result);
+ bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+ private:
+ bool has_new_fatal_failure_;
+ TestPartResultReporterInterface* original_reporter_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+};
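+
+// --------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream header): the
+// user-facing counterpart of HasNewFatalFailureHelper is the
+// (ASSERT|EXPECT)_NO_FATAL_FAILURE macro defined in gtest.h.  The helper
+// function and names below are hypothetical.
+#if 0
+
+#include "gtest/gtest.h"
+
+void ValidatePath(const char* path) {
+  ASSERT_TRUE(path != NULL);   // fatal failure aborts this helper only
+  EXPECT_STRNE("", path);      // non-fatal failure is recorded and continues
+}
+
+TEST(NoFatalFailureTest, HelperDoesNotFailFatally) {
+  // If ValidatePath() produces a new fatal failure, the enclosing test is
+  // aborted at this point instead of silently continuing.
+  ASSERT_NO_FATAL_FAILURE(ValidatePath("config.txt"));
+}
+
+#endif  // 0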
+
+} // namespace internal
+
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
new file mode 100644
index 000000000..5f69d5678
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h
@@ -0,0 +1,263 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+
+// This header implements typed tests and type-parameterized tests.
+
+// Typed (aka type-driven) tests repeat the same test for types in a
+// list. You must know which types you want to test with when writing
+// typed tests. Here's how you do it:
+
+#if 0
+
+// First, define a fixture class template. It should be parameterized
+// by a type. Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ public:
+ ...
+ typedef std::list<T> List;
+ static T shared_;
+ T value_;
+};
+
+// Next, associate a list of types with the test case, which will be
+// repeated for each type in the list. The typedef is necessary for
+// the macro to parse correctly.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+TYPED_TEST_CASE(FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+// TYPED_TEST_CASE(FooTest, int);
+
+// Then, use TYPED_TEST() instead of TEST_F() to define as many typed
+// tests for this test case as you want.
+TYPED_TEST(FooTest, DoesBlah) {
+ // Inside a test, refer to TypeParam to get the type parameter.
+  // Since we are inside a derived class template, C++ requires us to
+ // visit the members of FooTest via 'this'.
+ TypeParam n = this->value_;
+
+ // To visit static members of the fixture, add the TestFixture::
+ // prefix.
+ n += TestFixture::shared_;
+
+ // To refer to typedefs in the fixture, add the "typename
+ // TestFixture::" prefix.
+ typename TestFixture::List values;
+ values.push_back(n);
+ ...
+}
+
+TYPED_TEST(FooTest, HasPropertyA) { ... }
+
+#endif // 0
+
+// Type-parameterized tests are abstract test patterns parameterized
+// by a type. Compared with typed tests, type-parameterized tests
+// allow you to define the test pattern without knowing what the type
+// parameters are. The defined pattern can be instantiated with
+// different types any number of times, in any number of translation
+// units.
+//
+// If you are designing an interface or concept, you can define a
+// suite of type-parameterized tests to verify properties that any
+// valid implementation of the interface/concept should have. Then,
+// each implementation can easily instantiate the test suite to verify
+// that it conforms to the requirements, without having to write
+// similar tests repeatedly. Here's an example:
+
+#if 0
+
+// First, define a fixture class template. It should be parameterized
+// by a type. Remember to derive it from testing::Test.
+template <typename T>
+class FooTest : public testing::Test {
+ ...
+};
+
+// Next, declare that you will define a type-parameterized test case
+// (the _P suffix is for "parameterized" or "pattern", whichever you
+// prefer):
+TYPED_TEST_CASE_P(FooTest);
+
+// Then, use TYPED_TEST_P() to define as many type-parameterized tests
+// for this type-parameterized test case as you want.
+TYPED_TEST_P(FooTest, DoesBlah) {
+ // Inside a test, refer to TypeParam to get the type parameter.
+ TypeParam n = 0;
+ ...
+}
+
+TYPED_TEST_P(FooTest, HasPropertyA) { ... }
+
+// Now the tricky part: you need to register all test patterns before
+// you can instantiate them. The first argument of the macro is the
+// test case name; the rest are the names of the tests in this test
+// case.
+REGISTER_TYPED_TEST_CASE_P(FooTest,
+ DoesBlah, HasPropertyA);
+
+// Finally, you are free to instantiate the pattern with the types you
+// want. If you put the above code in a header file, you can #include
+// it in multiple C++ source files and instantiate it multiple times.
+//
+// To distinguish different instances of the pattern, the first
+// argument to the INSTANTIATE_* macro is a prefix that will be added
+// to the actual test case name. Remember to pick unique prefixes for
+// different instances.
+typedef testing::Types<char, int, unsigned int> MyTypes;
+INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes);
+
+// If the type list contains only one type, you can write that type
+// directly without Types<...>:
+// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int);
+
+#endif // 0
+
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/gtest-type-util.h"
+
+// Implements typed tests.
+
+#if GTEST_HAS_TYPED_TEST
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the typedef for the type parameters of the
+// given test case.
+# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define TYPED_TEST_CASE(CaseName, Types) \
+ typedef ::testing::internal::TypeList< Types >::type \
+ GTEST_TYPE_PARAMS_(CaseName)
+
+# define TYPED_TEST(CaseName, TestName) \
+ template <typename gtest_TypeParam_> \
+ class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \
+ : public CaseName<gtest_TypeParam_> { \
+ private: \
+ typedef CaseName<gtest_TypeParam_> TestFixture; \
+ typedef gtest_TypeParam_ TypeParam; \
+ virtual void TestBody(); \
+ }; \
+ bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::internal::TypeParameterizedTest< \
+ CaseName, \
+ ::testing::internal::TemplateSel< \
+ GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \
+ GTEST_TYPE_PARAMS_(CaseName)>::Register(\
+ "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+ #CaseName, #TestName, 0); \
+ template <typename gtest_TypeParam_> \
+ void GTEST_TEST_CLASS_NAME_(CaseName, TestName)<gtest_TypeParam_>::TestBody()
+
+#endif // GTEST_HAS_TYPED_TEST
+
+// Implements type-parameterized tests.
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the namespace name that the type-parameterized tests for
+// the given type-parameterized test case are defined in. The exact
+// name of the namespace is subject to change without notice.
+# define GTEST_CASE_NAMESPACE_(TestCaseName) \
+ gtest_case_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable used to remember the names of
+// the defined tests in the given test case.
+# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \
+ gtest_typed_test_case_p_state_##TestCaseName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test case.
+# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \
+ gtest_registered_test_names_##TestCaseName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
+# define TYPED_TEST_CASE_P(CaseName) \
+ static ::testing::internal::TypedTestCasePState \
+ GTEST_TYPED_TEST_CASE_P_STATE_(CaseName)
+
+# define TYPED_TEST_P(CaseName, TestName) \
+ namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+ template <typename gtest_TypeParam_> \
+ class TestName : public CaseName<gtest_TypeParam_> { \
+ private: \
+ typedef CaseName<gtest_TypeParam_> TestFixture; \
+ typedef gtest_TypeParam_ TypeParam; \
+ virtual void TestBody(); \
+ }; \
+ static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+ GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\
+ __FILE__, __LINE__, #CaseName, #TestName); \
+ } \
+ template <typename gtest_TypeParam_> \
+ void GTEST_CASE_NAMESPACE_(CaseName)::TestName<gtest_TypeParam_>::TestBody()
+
+# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \
+ namespace GTEST_CASE_NAMESPACE_(CaseName) { \
+ typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \
+ } \
+ static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \
+ GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\
+ __FILE__, __LINE__, #__VA_ARGS__)
+
+// The 'Types' template argument below must have spaces around it
+// since some compilers may choke on '>>' when passing a template
+// instance (e.g. Types<int>)
+# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \
+ bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::internal::TypeParameterizedTestCase<CaseName, \
+ GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \
+ ::testing::internal::TypeList< Types >::type>::Register(\
+ #Prefix, \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+ &GTEST_TYPED_TEST_CASE_P_STATE_(CaseName), \
+ #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName))
+
+#endif // GTEST_HAS_TYPED_TEST_P
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h
new file mode 100644
index 000000000..f846c5bd6
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest.h
@@ -0,0 +1,2236 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the public API for Google Test. It should be
+// included by any test program that uses Google Test.
+//
+// IMPORTANT NOTE: Due to a limitation of the C++ language, we have to
+// leave some internal implementation details in this header file.
+// They are clearly marked by comments like this:
+//
+// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+//
+// Such code is NOT meant to be used by a user directly, and is subject
+// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
+// program!
+//
+// Acknowledgment: Google Test borrowed the idea of automatic test
+// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
+// easyUnit framework.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_H_
+
+#include <limits>
+#include <ostream>
+#include <vector>
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+#include "gtest/gtest-death-test.h"
+#include "gtest/gtest-message.h"
+#include "gtest/gtest-param-test.h"
+#include "gtest/gtest-printers.h"
+#include "gtest/gtest_prod.h"
+#include "gtest/gtest-test-part.h"
+#include "gtest/gtest-typed-test.h"
+
+// Depending on the platform, different string classes are available.
+// On Linux, in addition to ::std::string, Google also makes use of
+// class ::string, which has the same interface as ::std::string, but
+// has a different implementation.
+//
+// You can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
+// ::string is available AND is a distinct type from ::std::string, or
+// define it to 0 to indicate otherwise.
+//
+// If ::std::string and ::string are the same class on your platform
+// due to aliasing, you should define GTEST_HAS_GLOBAL_STRING to 0.
+//
+// If you do not define GTEST_HAS_GLOBAL_STRING, it is defined
+// heuristically.
+
+namespace testing {
+
+// Declares the flags.
+
+// This flag temporarily enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests);
+
+// This flag breaks into the debugger on an assertion failure.
+GTEST_DECLARE_bool_(break_on_failure);
+
+// This flag controls whether Google Test catches all test-thrown exceptions
+// and logs them as failures.
+GTEST_DECLARE_bool_(catch_exceptions);
+
+// This flag enables using colors in terminal output. Available values are
+// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
+// to let Google Test decide.
+GTEST_DECLARE_string_(color);
+
+// This flag sets the filter that selects, by name and using a glob pattern,
+// the tests to run. If the filter is not given, all tests are executed.
+GTEST_DECLARE_string_(filter);
+
+// This flag causes Google Test to list the available tests. None of the
+// listed tests are actually run if the flag is provided.
+GTEST_DECLARE_bool_(list_tests);
+
+// This flag controls whether Google Test emits a detailed XML report to a file
+// in addition to its normal textual output.
+GTEST_DECLARE_string_(output);
+
+// This flag controls whether Google Test prints the elapsed time for each
+// test.
+GTEST_DECLARE_bool_(print_time);
+
+// This flag specifies the random number seed.
+GTEST_DECLARE_int32_(random_seed);
+
+// This flag sets how many times the tests are repeated. The default value
+// is 1. If the value is -1, the tests are repeated forever.
+GTEST_DECLARE_int32_(repeat);
+
+// This flag controls whether Google Test includes Google Test internal
+// stack frames in failure stack traces.
+GTEST_DECLARE_bool_(show_internal_stack_frames);
+
+// When this flag is specified, tests' order is randomized on every iteration.
+GTEST_DECLARE_bool_(shuffle);
+
+// This flag specifies the maximum number of stack frames to be
+// printed in a failure message.
+GTEST_DECLARE_int32_(stack_trace_depth);
+
+// When this flag is specified, a failed assertion will throw an
+// exception if exceptions are enabled, or exit the program with a
+// non-zero code otherwise.
+GTEST_DECLARE_bool_(throw_on_failure);
+
+// When this flag is set with a "host:port" string, on supported
+// platforms test results are streamed to the specified port on
+// the specified host machine.
+GTEST_DECLARE_string_(stream_result_to);
+
+// The upper limit for valid stack trace depths.
+const int kMaxStackTraceDepth = 100;
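+
+// --------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream header): the flags
+// above map to --gtest_* command-line options and can also be set in code
+// through the GTEST_FLAG() accessor.  The filter value "Foo.*" is just an
+// example; InitGoogleTest() and RUN_ALL_TESTS() are declared later in this
+// header.
+#if 0
+
+#include "gtest/gtest.h"
+
+int main(int argc, char** argv) {
+  // Consumes --gtest_filter=..., --gtest_repeat=..., --gtest_shuffle, etc.
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Flags may also be overridden programmatically before RUN_ALL_TESTS():
+  // run only tests whose full name starts with "Foo." and randomize order.
+  ::testing::GTEST_FLAG(filter) = "Foo.*";
+  ::testing::GTEST_FLAG(shuffle) = true;
+
+  return RUN_ALL_TESTS();
+}
+
+#endif  // 0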
+
+namespace internal {
+
+class AssertHelper;
+class DefaultGlobalTestPartResultReporter;
+class ExecDeathTest;
+class NoExecDeathTest;
+class FinalSuccessChecker;
+class GTestFlagSaver;
+class StreamingListenerTest;
+class TestResultAccessor;
+class TestEventListenersAccessor;
+class TestEventRepeater;
+class UnitTestRecordPropertyTestHelper;
+class WindowsDeathTest;
+class UnitTestImpl* GetUnitTestImpl();
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+ const std::string& message);
+
+} // namespace internal
+
+// The friend relationship of some of these classes is cyclic.
+// If we don't forward-declare them, the compiler might confuse the classes
+// in friendship clauses with same-named classes in the enclosing scope.
+class Test;
+class TestCase;
+class TestInfo;
+class UnitTest;
+
+// A class for indicating whether an assertion was successful. When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+// 1. Defining predicate functions to be used with Boolean test assertions
+// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+// 2. Defining predicate-format functions to be
+// used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+// Value of: IsEven(Fib(5))
+// Actual: false (5 is odd)
+// Expected: true
+//
+// instead of a more opaque
+//
+// Value of: IsEven(Fib(5))
+// Actual: false
+// Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess() << n << " is even";
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+// Value of: IsEven(Fib(6))
+// Actual: true (8 is even)
+// Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive assertions, so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+// // Verifies that Foo() returns an even number.
+// EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+// testing::AssertionResult IsEven(const char* expr, int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure()
+// << "Expected: " << expr << " is even\n Actual: it's " << n;
+// }
+//
+// If Foo() returns 5, you will see the following message:
+//
+// Expected: Foo() is even
+// Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+ // Copy constructor.
+ // Used in EXPECT_TRUE/FALSE(assertion_result).
+ AssertionResult(const AssertionResult& other);
+
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+
+ // Used in the EXPECT_TRUE/FALSE(bool_expression).
+ //
+ // T must be contextually convertible to bool.
+ //
+ // The second parameter prevents this overload from being considered if
+ // the argument is implicitly convertible to AssertionResult. In that case
+ // we want AssertionResult's copy constructor to be used.
+ template <typename T>
+ explicit AssertionResult(
+ const T& success,
+ typename internal::EnableIf<
+ !internal::ImplicitlyConvertible<T, AssertionResult>::value>::type*
+ /*enabler*/ = NULL)
+ : success_(success) {}
+
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+ // Assignment operator.
+ AssertionResult& operator=(AssertionResult other) {
+ swap(other);
+ return *this;
+ }
+
+ // Returns true iff the assertion succeeded.
+ operator bool() const { return success_; } // NOLINT
+
+ // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+ AssertionResult operator!() const;
+
+ // Returns the text streamed into this AssertionResult. Test assertions
+ // use it when they fail (i.e., the predicate's outcome doesn't match the
+ // assertion's expectation). When nothing has been streamed into the
+ // object, returns an empty string.
+ const char* message() const {
+ return message_.get() != NULL ? message_->c_str() : "";
+ }
+ // TODO(vladl@google.com): Remove this after making sure no clients use it.
+ // Deprecated; please use message() instead.
+ const char* failure_message() const { return message(); }
+
+ // Streams a custom failure message into this object.
+ template <typename T> AssertionResult& operator<<(const T& value) {
+ AppendMessage(Message() << value);
+ return *this;
+ }
+
+ // Allows streaming basic output manipulators such as endl or flush into
+ // this object.
+ AssertionResult& operator<<(
+ ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+ AppendMessage(Message() << basic_manipulator);
+ return *this;
+ }
+
+ private:
+ // Appends the contents of message to message_.
+ void AppendMessage(const Message& a_message) {
+ if (message_.get() == NULL)
+ message_.reset(new ::std::string);
+ message_->append(a_message.GetString().c_str());
+ }
+
+ // Swap the contents of this AssertionResult with other.
+ void swap(AssertionResult& other);
+
+ // Stores result of the assertion predicate.
+ bool success_;
+ // Stores the message describing the condition in case the expectation
+ // construct is not satisfied with the predicate's outcome.
+ // Referenced via a pointer to avoid taking too much stack frame space
+ // with test assertions.
+ internal::scoped_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
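+
+// --------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream header): the IsEven
+// predicate-formatter from the comment above, turned into a compilable test
+// that uses AssertionSuccess()/AssertionFailure() with EXPECT_PRED_FORMAT1.
+#if 0
+
+#include "gtest/gtest.h"
+
+// 'expr' is the text of the argument expression; 'n' is its value.
+::testing::AssertionResult IsEven(const char* expr, int n) {
+  if ((n % 2) == 0)
+    return ::testing::AssertionSuccess();
+  return ::testing::AssertionFailure()
+      << "Expected: " << expr << " is even\n  Actual: it's " << n;
+}
+
+TEST(AssertionResultTest, EvenNumbers) {
+  EXPECT_PRED_FORMAT1(IsEven, 2 + 2);  // passes
+  EXPECT_PRED_FORMAT1(IsEven, 3);      // fails with the message built above
+}
+
+#endif  // 0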
+
+// The abstract class that all tests inherit from.
+//
+// In Google Test, a unit test program contains one or many TestCases, and
+// each TestCase contains one or many Tests.
+//
+// When you define a test using the TEST macro, you don't need to
+// explicitly derive from Test - the TEST macro automatically does
+// this for you.
+//
+// The only time you derive from Test is when defining a test fixture
+// to be used in TEST_F. For example:
+//
+// class FooTest : public testing::Test {
+// protected:
+// void SetUp() override { ... }
+// void TearDown() override { ... }
+// ...
+// };
+//
+// TEST_F(FooTest, Bar) { ... }
+// TEST_F(FooTest, Baz) { ... }
+//
+// Test is not copyable.
+class GTEST_API_ Test {
+ public:
+ friend class TestInfo;
+
+ // Defines types for pointers to functions that set up and tear down
+ // a test case.
+ typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc;
+ typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc;
+
+ // The d'tor is virtual as we intend to inherit from Test.
+ virtual ~Test();
+
+ // Sets up the stuff shared by all tests in this test case.
+ //
+ // Google Test will call Foo::SetUpTestCase() before running the first
+ // test in test case Foo. Hence a sub-class can define its own
+ // SetUpTestCase() method to shadow the one defined in the super
+ // class.
+ static void SetUpTestCase() {}
+
+ // Tears down the stuff shared by all tests in this test case.
+ //
+ // Google Test will call Foo::TearDownTestCase() after running the last
+ // test in test case Foo. Hence a sub-class can define its own
+ // TearDownTestCase() method to shadow the one defined in the super
+ // class.
+ static void TearDownTestCase() {}
+
+ // Returns true iff the current test has a fatal failure.
+ static bool HasFatalFailure();
+
+ // Returns true iff the current test has a non-fatal failure.
+ static bool HasNonfatalFailure();
+
+ // Returns true iff the current test has a (either fatal or
+ // non-fatal) failure.
+ static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); }
+
+ // Logs a property for the current test, test case, or for the entire
+ // invocation of the test program when used outside of the context of a
+ // test case. Only the last value for a given key is remembered. These
+ // are public static so they can be called from utility functions that are
+ // not members of the test fixture. Calls to RecordProperty made during
+ // lifespan of the test (from the moment its constructor starts to the
+ // moment its destructor finishes) will be output in XML as attributes of
+ // the <testcase> element. Properties recorded from fixture's
+ // SetUpTestCase or TearDownTestCase are logged as attributes of the
+ // corresponding <testsuite> element. Calls to RecordProperty made in the
+ // global context (before or after invocation of RUN_ALL_TESTS and from
+ // SetUp/TearDown method of Environment objects registered with Google
+ // Test) will be output as attributes of the <testsuites> element.
+ static void RecordProperty(const std::string& key, const std::string& value);
+ static void RecordProperty(const std::string& key, int value);
+
+ protected:
+ // Creates a Test object.
+ Test();
+
+ // Sets up the test fixture.
+ virtual void SetUp();
+
+ // Tears down the test fixture.
+ virtual void TearDown();
+
+ private:
+ // Returns true iff the current test has the same fixture class as
+ // the first test in the current test case.
+ static bool HasSameFixtureClass();
+
+ // Runs the test after the test fixture has been set up.
+ //
+ // A sub-class must implement this to define the test logic.
+ //
+ // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM.
+ // Instead, use the TEST or TEST_F macro.
+ virtual void TestBody() = 0;
+
+ // Sets up, executes, and tears down the test.
+ void Run();
+
+ // Deletes self. We deliberately pick an unusual name for this
+ // internal method to avoid clashing with names used in user TESTs.
+ void DeleteSelf_() { delete this; }
+
+ const internal::scoped_ptr< GTEST_FLAG_SAVER_ > gtest_flag_saver_;
+
+ // Often a user misspells SetUp() as Setup() and spends a long time
+ // wondering why it is never called by Google Test. The declaration of
+ // the following method is solely for catching such an error at
+ // compile time:
+ //
+ // - The return type is deliberately chosen to be not void, so it
+ // will be a conflict if void Setup() is declared in the user's
+ // test fixture.
+ //
+ // - This method is private, so it will be another compiler error
+ // if the method is called from the user's test fixture.
+ //
+ // DO NOT OVERRIDE THIS FUNCTION.
+ //
+ // If you see an error about overriding the following function or
+ // about it being private, you have mis-spelled SetUp() as Setup().
+ struct Setup_should_be_spelled_SetUp {};
+ virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+
+ // We disallow copying Tests.
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+};
+
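+// --------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream header): a fixture
+// that exercises SetUp()/TearDown(), the static SetUpTestCase() hook,
+// RecordProperty() and HasFatalFailure() described above.  QueueTest,
+// shared_capacity_ and q_ are hypothetical names.
+#if 0
+
+#include <vector>
+#include "gtest/gtest.h"
+
+class QueueTest : public ::testing::Test {
+ protected:
+  // Called once before the first test in this test case.
+  static void SetUpTestCase() { shared_capacity_ = 16; }
+  // Called once after the last test in this test case.
+  static void TearDownTestCase() { shared_capacity_ = 0; }
+
+  // Called before and after each individual test, respectively.
+  virtual void SetUp() { q_.push_back(1); }
+  virtual void TearDown() { q_.clear(); }
+
+  static int shared_capacity_;
+  std::vector<int> q_;
+};
+
+int QueueTest::shared_capacity_ = 0;
+
+TEST_F(QueueTest, StartsWithOneElement) {
+  ASSERT_EQ(1u, q_.size());
+  EXPECT_LE(q_.size(), static_cast<size_t>(shared_capacity_));
+  // Attaches a key/value pair to this test's <testcase> element in the
+  // XML report.
+  RecordProperty("capacity", shared_capacity_);
+}
+
+TEST_F(QueueTest, HasNoFatalFailureSoFar) {
+  EXPECT_FALSE(::testing::Test::HasFatalFailure());
+}
+
+#endif  // 0
+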
+typedef internal::TimeInMillis TimeInMillis;
+
+// A copyable object representing a user specified test property which can be
+// output as a key/value string pair.
+//
+// Don't inherit from TestProperty as its destructor is not virtual.
+class TestProperty {
+ public:
+ // C'tor. TestProperty does NOT have a default constructor.
+ // Always use this constructor (with parameters) to create a
+ // TestProperty object.
+ TestProperty(const std::string& a_key, const std::string& a_value) :
+ key_(a_key), value_(a_value) {
+ }
+
+ // Gets the user supplied key.
+ const char* key() const {
+ return key_.c_str();
+ }
+
+ // Gets the user supplied value.
+ const char* value() const {
+ return value_.c_str();
+ }
+
+ // Sets a new value, overriding the one supplied in the constructor.
+ void SetValue(const std::string& new_value) {
+ value_ = new_value;
+ }
+
+ private:
+ // The key supplied by the user.
+ std::string key_;
+ // The value supplied by the user.
+ std::string value_;
+};
+
+// The result of a single Test. This includes a list of
+// TestPartResults, a list of TestProperties, a count of how many
+// death tests there are in the Test, and how much time it took to run
+// the Test.
+//
+// TestResult is not copyable.
+class GTEST_API_ TestResult {
+ public:
+ // Creates an empty TestResult.
+ TestResult();
+
+ // D'tor. Do not inherit from TestResult.
+ ~TestResult();
+
+ // Gets the number of all test parts. This is the sum of the number
+ // of successful test parts and the number of failed test parts.
+ int total_part_count() const;
+
+ // Returns the number of the test properties.
+ int test_property_count() const;
+
+ // Returns true iff the test passed (i.e. no test part failed).
+ bool Passed() const { return !Failed(); }
+
+ // Returns true iff the test failed.
+ bool Failed() const;
+
+ // Returns true iff the test fatally failed.
+ bool HasFatalFailure() const;
+
+ // Returns true iff the test has a non-fatal failure.
+ bool HasNonfatalFailure() const;
+
+ // Returns the elapsed time, in milliseconds.
+ TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+ // Returns the i-th test part result among all the results. i can range
+ // from 0 to test_property_count() - 1. If i is not in that range, aborts
+ // the program.
+ const TestPartResult& GetTestPartResult(int i) const;
+
+ // Returns the i-th test property. i can range from 0 to
+ // test_property_count() - 1. If i is not in that range, aborts the
+ // program.
+ const TestProperty& GetTestProperty(int i) const;
+
+ private:
+ friend class TestInfo;
+ friend class TestCase;
+ friend class UnitTest;
+ friend class internal::DefaultGlobalTestPartResultReporter;
+ friend class internal::ExecDeathTest;
+ friend class internal::TestResultAccessor;
+ friend class internal::UnitTestImpl;
+ friend class internal::WindowsDeathTest;
+
+ // Gets the vector of TestPartResults.
+ const std::vector<TestPartResult>& test_part_results() const {
+ return test_part_results_;
+ }
+
+ // Gets the vector of TestProperties.
+ const std::vector<TestProperty>& test_properties() const {
+ return test_properties_;
+ }
+
+ // Sets the elapsed time.
+ void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+ // Adds a test property to the list. The property is validated and may add
+ // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+ // key names). If a property is already recorded for the same key, the
+ // value will be updated, rather than storing multiple values for the same
+ // key. xml_element specifies the element for which the property is being
+ // recorded and is used for validation.
+ void RecordProperty(const std::string& xml_element,
+ const TestProperty& test_property);
+
+ // Adds a failure if the key is a reserved attribute of Google Test
+ // testcase tags. Returns true if the property is valid.
+ // TODO(russr): Validate attribute names are legal and human readable.
+ static bool ValidateTestProperty(const std::string& xml_element,
+ const TestProperty& test_property);
+
+ // Adds a test part result to the list.
+ void AddTestPartResult(const TestPartResult& test_part_result);
+
+ // Returns the death test count.
+ int death_test_count() const { return death_test_count_; }
+
+ // Increments the death test count, returning the new count.
+ int increment_death_test_count() { return ++death_test_count_; }
+
+ // Clears the test part results.
+ void ClearTestPartResults();
+
+ // Clears the object.
+ void Clear();
+
+ // Protects mutable state of the property vector and of owned
+ // properties, whose values may be updated.
+ internal::Mutex test_properites_mutex_;
+
+ // The vector of TestPartResults
+ std::vector<TestPartResult> test_part_results_;
+ // The vector of TestProperties
+ std::vector<TestProperty> test_properties_;
+ // Running count of death tests.
+ int death_test_count_;
+ // The elapsed time, in milliseconds.
+ TimeInMillis elapsed_time_;
+
+ // We disallow copying TestResult.
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+}; // class TestResult
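+
+// --------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream header): reading
+// the TestResult accumulated so far for the currently running test.  The
+// UnitTest singleton and its current_test_info() accessor are declared
+// further down in gtest.h, outside this excerpt.
+#if 0
+
+#include "gtest/gtest.h"
+
+TEST(ResultInspectionTest, InspectsItsOwnResultSoFar) {
+  EXPECT_TRUE(true);
+
+  const ::testing::TestInfo* info =
+      ::testing::UnitTest::GetInstance()->current_test_info();
+  const ::testing::TestResult* result = info->result();
+  EXPECT_FALSE(result->HasFatalFailure());
+  EXPECT_EQ(0, result->test_property_count());  // no RecordProperty() calls
+}
+
+#endif  // 0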
+
+// A TestInfo object stores the following information about a test:
+//
+// Test case name
+// Test name
+// Whether the test should be run
+// A function pointer that creates the test object when invoked
+// Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+ // Destructs a TestInfo object. This function is not virtual, so
+ // don't inherit from TestInfo.
+ ~TestInfo();
+
+ // Returns the test case name.
+ const char* test_case_name() const { return test_case_name_.c_str(); }
+
+ // Returns the test name.
+ const char* name() const { return name_.c_str(); }
+
+ // Returns the name of the parameter type, or NULL if this is not a typed
+ // or a type-parameterized test.
+ const char* type_param() const {
+ if (type_param_.get() != NULL)
+ return type_param_->c_str();
+ return NULL;
+ }
+
+ // Returns the text representation of the value parameter, or NULL if this
+ // is not a value-parameterized test.
+ const char* value_param() const {
+ if (value_param_.get() != NULL)
+ return value_param_->c_str();
+ return NULL;
+ }
+
+ // Returns the file name where this test is defined.
+ const char* file() const { return location_.file.c_str(); }
+
+ // Returns the line where this test is defined.
+ int line() const { return location_.line; }
+
+ // Returns true if this test should run, that is if the test is not
+ // disabled (or it is disabled but the also_run_disabled_tests flag has
+ // been specified) and its full name matches the user-specified filter.
+ //
+ // Google Test allows the user to filter the tests by their full names.
+ // The full name of a test Bar in test case Foo is defined as
+ // "Foo.Bar". Only the tests that match the filter will run.
+ //
+ // A filter is a colon-separated list of glob (not regex) patterns,
+ // optionally followed by a '-' and a colon-separated list of
+ // negative patterns (tests to exclude). A test is run if it
+ // matches one of the positive patterns and does not match any of
+ // the negative patterns.
+ //
+ // For example, *A*:Foo.* is a filter that matches any string that
+ // contains the character 'A' or starts with "Foo.".
+ bool should_run() const { return should_run_; }
+
+ // Returns true iff this test will appear in the XML report.
+ bool is_reportable() const {
+ // For now, the XML report includes all tests matching the filter.
+ // In the future, we may trim tests that are excluded because of
+ // sharding.
+ return matches_filter_;
+ }
+
+ // Returns the result of the test.
+ const TestResult* result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+ friend class internal::DefaultDeathTestFactory;
+#endif // GTEST_HAS_DEATH_TEST
+ friend class Test;
+ friend class TestCase;
+ friend class internal::UnitTestImpl;
+ friend class internal::StreamingListenerTest;
+ friend TestInfo* internal::MakeAndRegisterTestInfo(
+ const char* test_case_name,
+ const char* name,
+ const char* type_param,
+ const char* value_param,
+ internal::CodeLocation code_location,
+ internal::TypeId fixture_class_id,
+ Test::SetUpTestCaseFunc set_up_tc,
+ Test::TearDownTestCaseFunc tear_down_tc,
+ internal::TestFactoryBase* factory);
+
+ // Constructs a TestInfo object. The newly constructed instance assumes
+ // ownership of the factory object.
+ TestInfo(const std::string& test_case_name,
+ const std::string& name,
+ const char* a_type_param, // NULL if not a type-parameterized test
+ const char* a_value_param, // NULL if not a value-parameterized test
+ internal::CodeLocation a_code_location,
+ internal::TypeId fixture_class_id,
+ internal::TestFactoryBase* factory);
+
+ // Increments the number of death tests encountered in this test so
+ // far.
+ int increment_death_test_count() {
+ return result_.increment_death_test_count();
+ }
+
+ // Creates the test object, runs it, records its result, and then
+ // deletes it.
+ void Run();
+
+ static void ClearTestResult(TestInfo* test_info) {
+ test_info->result_.Clear();
+ }
+
+ // These fields are immutable properties of the test.
+ const std::string test_case_name_; // Test case name
+ const std::string name_; // Test name
+ // Name of the parameter type, or NULL if this is not a typed or a
+ // type-parameterized test.
+ const internal::scoped_ptr<const ::std::string> type_param_;
+ // Text representation of the value parameter, or NULL if this is not a
+ // value-parameterized test.
+ const internal::scoped_ptr<const ::std::string> value_param_;
+ internal::CodeLocation location_;
+ const internal::TypeId fixture_class_id_; // ID of the test fixture class
+ bool should_run_; // True iff this test should run
+ bool is_disabled_; // True iff this test is disabled
+ bool matches_filter_; // True if this test matches the
+ // user-specified filter.
+ internal::TestFactoryBase* const factory_; // The factory that creates
+ // the test object
+
+ // This field is mutable and needs to be reset before running the
+ // test for the second time.
+ TestResult result_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
+
+// A test case, which consists of a vector of TestInfos.
+//
+// TestCase is not copyable.
+class GTEST_API_ TestCase {
+ public:
+ // Creates a TestCase with the given name.
+ //
+ // TestCase does NOT have a default constructor. Always use this
+ // constructor to create a TestCase object.
+ //
+ // Arguments:
+ //
+ // name: name of the test case
+ // a_type_param: the name of the test's type parameter, or NULL if
+ // this is not a type-parameterized test.
+ // set_up_tc: pointer to the function that sets up the test case
+ // tear_down_tc: pointer to the function that tears down the test case
+ TestCase(const char* name, const char* a_type_param,
+ Test::SetUpTestCaseFunc set_up_tc,
+ Test::TearDownTestCaseFunc tear_down_tc);
+
+ // Destructor of TestCase.
+ virtual ~TestCase();
+
+ // Gets the name of the TestCase.
+ const char* name() const { return name_.c_str(); }
+
+ // Returns the name of the parameter type, or NULL if this is not a
+ // type-parameterized test case.
+ const char* type_param() const {
+ if (type_param_.get() != NULL)
+ return type_param_->c_str();
+ return NULL;
+ }
+
+ // Returns true if any test in this test case should run.
+ bool should_run() const { return should_run_; }
+
+ // Gets the number of successful tests in this test case.
+ int successful_test_count() const;
+
+ // Gets the number of failed tests in this test case.
+ int failed_test_count() const;
+
+ // Gets the number of disabled tests that will be reported in the XML report.
+ int reportable_disabled_test_count() const;
+
+ // Gets the number of disabled tests in this test case.
+ int disabled_test_count() const;
+
+ // Gets the number of tests to be printed in the XML report.
+ int reportable_test_count() const;
+
+ // Get the number of tests in this test case that should run.
+ int test_to_run_count() const;
+
+ // Gets the number of all tests in this test case.
+ int total_test_count() const;
+
+ // Returns true iff the test case passed.
+ bool Passed() const { return !Failed(); }
+
+ // Returns true iff the test case failed.
+ bool Failed() const { return failed_test_count() > 0; }
+
+ // Returns the elapsed time, in milliseconds.
+ TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+ // Returns the i-th test among all the tests. i can range from 0 to
+ // total_test_count() - 1. If i is not in that range, returns NULL.
+ const TestInfo* GetTestInfo(int i) const;
+
+ // Returns the TestResult that holds test properties recorded during
+ // execution of SetUpTestCase and TearDownTestCase.
+ const TestResult& ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+ friend class Test;
+ friend class internal::UnitTestImpl;
+
+ // Gets the (mutable) vector of TestInfos in this TestCase.
+ std::vector<TestInfo*>& test_info_list() { return test_info_list_; }
+
+ // Gets the (immutable) vector of TestInfos in this TestCase.
+ const std::vector<TestInfo*>& test_info_list() const {
+ return test_info_list_;
+ }
+
+ // Returns the i-th test among all the tests. i can range from 0 to
+ // total_test_count() - 1. If i is not in that range, returns NULL.
+ TestInfo* GetMutableTestInfo(int i);
+
+ // Sets the should_run member.
+ void set_should_run(bool should) { should_run_ = should; }
+
+ // Adds a TestInfo to this test case. Will delete the TestInfo upon
+ // destruction of the TestCase object.
+ void AddTestInfo(TestInfo * test_info);
+
+ // Clears the results of all tests in this test case.
+ void ClearResult();
+
+ // Clears the results of all tests in the given test case.
+ static void ClearTestCaseResult(TestCase* test_case) {
+ test_case->ClearResult();
+ }
+
+ // Runs every test in this TestCase.
+ void Run();
+
+ // Runs SetUpTestCase() for this TestCase. This wrapper is needed
+ // for catching exceptions thrown from SetUpTestCase().
+ void RunSetUpTestCase() { (*set_up_tc_)(); }
+
+ // Runs TearDownTestCase() for this TestCase. This wrapper is
+ // needed for catching exceptions thrown from TearDownTestCase().
+ void RunTearDownTestCase() { (*tear_down_tc_)(); }
+
+ // Returns true iff test passed.
+ static bool TestPassed(const TestInfo* test_info) {
+ return test_info->should_run() && test_info->result()->Passed();
+ }
+
+ // Returns true iff test failed.
+ static bool TestFailed(const TestInfo* test_info) {
+ return test_info->should_run() && test_info->result()->Failed();
+ }
+
+ // Returns true iff the test is disabled and will be reported in the XML
+ // report.
+ static bool TestReportableDisabled(const TestInfo* test_info) {
+ return test_info->is_reportable() && test_info->is_disabled_;
+ }
+
+ // Returns true iff test is disabled.
+ static bool TestDisabled(const TestInfo* test_info) {
+ return test_info->is_disabled_;
+ }
+
+ // Returns true iff this test will appear in the XML report.
+ static bool TestReportable(const TestInfo* test_info) {
+ return test_info->is_reportable();
+ }
+
+ // Returns true if the given test should run.
+ static bool ShouldRunTest(const TestInfo* test_info) {
+ return test_info->should_run();
+ }
+
+ // Shuffles the tests in this test case.
+ void ShuffleTests(internal::Random* random);
+
+ // Restores the test order to before the first shuffle.
+ void UnshuffleTests();
+
+ // Name of the test case.
+ std::string name_;
+ // Name of the parameter type, or NULL if this is not a typed or a
+ // type-parameterized test.
+ const internal::scoped_ptr<const ::std::string> type_param_;
+ // The vector of TestInfos in their original order. It owns the
+ // elements in the vector.
+ std::vector<TestInfo*> test_info_list_;
+ // Provides a level of indirection for the test list to allow easy
+ // shuffling and restoring the test order. The i-th element in this
+ // vector is the index of the i-th test in the shuffled test list.
+ std::vector<int> test_indices_;
+ // Pointer to the function that sets up the test case.
+ Test::SetUpTestCaseFunc set_up_tc_;
+ // Pointer to the function that tears down the test case.
+ Test::TearDownTestCaseFunc tear_down_tc_;
+ // True iff any test in this test case should run.
+ bool should_run_;
+ // Elapsed time, in milliseconds.
+ TimeInMillis elapsed_time_;
+ // Holds test properties recorded during execution of SetUpTestCase and
+ // TearDownTestCase.
+ TestResult ad_hoc_test_result_;
+
+ // We disallow copying TestCases.
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase);
+};
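+
+// --------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream header): printing a
+// one-line summary per registered test case.  UnitTest, together with its
+// total_test_case_count() and GetTestCase() accessors, is declared further
+// down in gtest.h, outside this excerpt.
+#if 0
+
+#include <cstdio>
+#include "gtest/gtest.h"
+
+void PrintTestCaseSummary() {
+  const ::testing::UnitTest& unit_test = *::testing::UnitTest::GetInstance();
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    const ::testing::TestCase* test_case = unit_test.GetTestCase(i);
+    std::printf("%s: %d test(s), %d to run\n",
+                test_case->name(),
+                test_case->total_test_count(),
+                test_case->test_to_run_count());
+  }
+}
+
+#endif  // 0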
+
+// An Environment object is capable of setting up and tearing down an
+// environment. You should subclass this to define your own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+// 1. You cannot safely throw from a destructor. This is a problem
+// as in some cases Google Test is used where exceptions are enabled, and
+// we may want to implement ASSERT_* using exceptions where they are
+// available.
+// 2. You cannot use ASSERT_* directly in a constructor or
+// destructor.
+class Environment {
+ public:
+ // The d'tor is virtual as we need to subclass Environment.
+ virtual ~Environment() {}
+
+ // Override this to define how to set up the environment.
+ virtual void SetUp() {}
+
+ // Override this to define how to tear down the environment.
+ virtual void TearDown() {}
+ private:
+ // If you see an error about overriding the following function or
+ // about it being private, you have mis-spelled SetUp() as Setup().
+ struct Setup_should_be_spelled_SetUp {};
+ virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; }
+};
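+
+// --------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the upstream header): a global
+// environment registered with AddGlobalTestEnvironment(), which is declared
+// further down in gtest.h.  DatabaseEnvironment is a hypothetical name.
+#if 0
+
+#include "gtest/gtest.h"
+
+class DatabaseEnvironment : public ::testing::Environment {
+ public:
+  virtual ~DatabaseEnvironment() {}
+  virtual void SetUp() { /* e.g. open a shared connection */ }
+  virtual void TearDown() { /* e.g. close it again */ }
+};
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  // Google Test takes ownership of the environment and calls its
+  // SetUp()/TearDown() around each full iteration of the tests.
+  ::testing::AddGlobalTestEnvironment(new DatabaseEnvironment);
+  return RUN_ALL_TESTS();
+}
+
+#endif  // 0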
+
+// The interface for tracing execution of tests. The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener {
+ public:
+ virtual ~TestEventListener() {}
+
+ // Fired before any test activity starts.
+ virtual void OnTestProgramStart(const UnitTest& unit_test) = 0;
+
+ // Fired before each iteration of tests starts. There may be more than
+ // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration
+ // index, starting from 0.
+ virtual void OnTestIterationStart(const UnitTest& unit_test,
+ int iteration) = 0;
+
+ // Fired before environment set-up for each iteration of tests starts.
+ virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0;
+
+ // Fired after environment set-up for each iteration of tests ends.
+ virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0;
+
+ // Fired before the test case starts.
+ virtual void OnTestCaseStart(const TestCase& test_case) = 0;
+
+ // Fired before the test starts.
+ virtual void OnTestStart(const TestInfo& test_info) = 0;
+
+ // Fired after a failed assertion or a SUCCEED() invocation.
+ virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0;
+
+ // Fired after the test ends.
+ virtual void OnTestEnd(const TestInfo& test_info) = 0;
+
+ // Fired after the test case ends.
+ virtual void OnTestCaseEnd(const TestCase& test_case) = 0;
+
+ // Fired before environment tear-down for each iteration of tests starts.
+ virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0;
+
+ // Fired after environment tear-down for each iteration of tests ends.
+ virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
+
+ // Fired after each iteration of tests finishes.
+ virtual void OnTestIterationEnd(const UnitTest& unit_test,
+ int iteration) = 0;
+
+ // Fired after all test activities have ended.
+ virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
+};
+
+// The convenience class for users who need to override just one or two
+// methods and are not concerned that a possible change to the signature of
+// the methods they override would go uncaught during the build. For
+// comments about each method, please see the definition of TestEventListener
+// above.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+ virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+ virtual void OnTestIterationStart(const UnitTest& /*unit_test*/,
+ int /*iteration*/) {}
+ virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {}
+ virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+ virtual void OnTestCaseStart(const TestCase& /*test_case*/) {}
+ virtual void OnTestStart(const TestInfo& /*test_info*/) {}
+ virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {}
+ virtual void OnTestEnd(const TestInfo& /*test_info*/) {}
+ virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {}
+ virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {}
+ virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+ virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/,
+ int /*iteration*/) {}
+ virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+};
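+
+// For example, a sketch of a listener that overrides just OnTestEnd() to
+// print a line for every finished test (MinimalistPrinter is an
+// illustrative name):
+//
+//   class MinimalistPrinter : public ::testing::EmptyTestEventListener {
+//     virtual void OnTestEnd(const ::testing::TestInfo& test_info) {
+//       printf("*** Test %s.%s finished.\n",
+//              test_info.test_case_name(), test_info.name());
+//     }
+//   };
+//
+// Installing such a listener is shown after TestEventListeners below.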
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+ TestEventListeners();
+ ~TestEventListeners();
+
+ // Appends an event listener to the end of the list. Google Test assumes
+ // the ownership of the listener (i.e. it will delete the listener when
+ // the test program finishes).
+ void Append(TestEventListener* listener);
+
+ // Removes the given event listener from the list and returns it. It then
+ // becomes the caller's responsibility to delete the listener. Returns
+ // NULL if the listener is not found in the list.
+ TestEventListener* Release(TestEventListener* listener);
+
+ // Returns the standard listener responsible for the default console
+ // output. Can be removed from the listeners list to shut down default
+ // console output. Note that removing this object from the listener list
+ // with Release transfers its ownership to the caller and makes this
+ // function return NULL the next time.
+ TestEventListener* default_result_printer() const {
+ return default_result_printer_;
+ }
+
+ // Returns the standard listener responsible for the default XML output
+ // controlled by the --gtest_output=xml flag. Can be removed from the
+ // listeners list by users who want to shut down the default XML output
+ // controlled by this flag and substitute it with a custom one. Note that
+ // removing this object from the listener list with Release transfers its
+ // ownership to the caller and makes this function return NULL the next
+ // time.
+ TestEventListener* default_xml_generator() const {
+ return default_xml_generator_;
+ }
+
+ private:
+ friend class TestCase;
+ friend class TestInfo;
+ friend class internal::DefaultGlobalTestPartResultReporter;
+ friend class internal::NoExecDeathTest;
+ friend class internal::TestEventListenersAccessor;
+ friend class internal::UnitTestImpl;
+
+ // Returns the repeater that broadcasts the TestEventListener events to all
+ // subscribers.
+ TestEventListener* repeater();
+
+ // Sets the default_result_printer attribute to the provided listener.
+ // The listener is also added to the listener list, and the previous
+ // default_result_printer is removed from it and deleted. The listener can
+ // also be NULL, in which case it will not be added to the list. Does
+ // nothing if the previous and the current listener objects are the same.
+ void SetDefaultResultPrinter(TestEventListener* listener);
+
+ // Sets the default_xml_generator attribute to the provided listener. The
+ // listener is also added to the listener list, and the previous
+ // default_xml_generator is removed from it and deleted. The listener can
+ // also be NULL, in which case it will not be added to the list. Does
+ // nothing if the previous and the current listener objects are the same.
+ void SetDefaultXmlGenerator(TestEventListener* listener);
+
+ // Controls whether events will be forwarded by the repeater to the
+ // listeners in the list.
+ bool EventForwardingEnabled() const;
+ void SuppressEventForwarding();
+
+ // The actual list of listeners.
+ internal::TestEventRepeater* repeater_;
+ // Listener responsible for the standard result output.
+ TestEventListener* default_result_printer_;
+ // Listener responsible for the creation of the XML output file.
+ TestEventListener* default_xml_generator_;
+
+ // We disallow copying TestEventListeners.
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+};
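+
+// For example, a sketch of installing a custom listener and removing the
+// default console printer, typically done in main() before RUN_ALL_TESTS()
+// (MinimalistPrinter is the illustrative listener sketched above):
+//
+//   ::testing::TestEventListeners& listeners =
+//       ::testing::UnitTest::GetInstance()->listeners();
+//   delete listeners.Release(listeners.default_result_printer());
+//   listeners.Append(new MinimalistPrinter);
+//
+// Releasing the default printer is optional; without that line both outputs
+// are produced.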
+
+// A UnitTest consists of a vector of TestCases.
+//
+// This is a singleton class. The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called. This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+ // Gets the singleton UnitTest object. The first time this method
+ // is called, a UnitTest object is constructed and returned.
+ // Consecutive calls will return the same object.
+ static UnitTest* GetInstance();
+
+ // Runs all tests in this UnitTest object and prints the result.
+ // Returns 0 if successful, or 1 otherwise.
+ //
+ // This method can only be called from the main thread.
+ //
+ // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+ int Run() GTEST_MUST_USE_RESULT_;
+
+ // Returns the working directory when the first TEST() or TEST_F()
+ // was executed. The UnitTest object owns the string.
+ const char* original_working_dir() const;
+
+ // Returns the TestCase object for the test that's currently running,
+ // or NULL if no test is running.
+ const TestCase* current_test_case() const
+ GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Returns the TestInfo object for the test that's currently running,
+ // or NULL if no test is running.
+ const TestInfo* current_test_info() const
+ GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Returns the random seed used at the start of the current test run.
+ int random_seed() const;
+
+#if GTEST_HAS_PARAM_TEST
+ // Returns the ParameterizedTestCaseRegistry object used to keep track of
+ // value-parameterized tests and instantiate and register them.
+ //
+ // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+ internal::ParameterizedTestCaseRegistry& parameterized_test_registry()
+ GTEST_LOCK_EXCLUDED_(mutex_);
+#endif // GTEST_HAS_PARAM_TEST
+
+ // Gets the number of successful test cases.
+ int successful_test_case_count() const;
+
+ // Gets the number of failed test cases.
+ int failed_test_case_count() const;
+
+ // Gets the number of all test cases.
+ int total_test_case_count() const;
+
+ // Gets the number of all test cases that contain at least one test
+ // that should run.
+ int test_case_to_run_count() const;
+
+ // Gets the number of successful tests.
+ int successful_test_count() const;
+
+ // Gets the number of failed tests.
+ int failed_test_count() const;
+
+ // Gets the number of disabled tests that will be reported in the XML report.
+ int reportable_disabled_test_count() const;
+
+ // Gets the number of disabled tests.
+ int disabled_test_count() const;
+
+ // Gets the number of tests to be printed in the XML report.
+ int reportable_test_count() const;
+
+ // Gets the number of all tests.
+ int total_test_count() const;
+
+ // Gets the number of tests that should run.
+ int test_to_run_count() const;
+
+ // Gets the time of the test program start, in ms from the start of the
+ // UNIX epoch.
+ TimeInMillis start_timestamp() const;
+
+ // Gets the elapsed time, in milliseconds.
+ TimeInMillis elapsed_time() const;
+
+ // Returns true iff the unit test passed (i.e. all test cases passed).
+ bool Passed() const;
+
+ // Returns true iff the unit test failed (i.e. some test case failed
+ // or something outside of all tests failed).
+ bool Failed() const;
+
+ // Gets the i-th test case among all the test cases. i can range from 0 to
+ // total_test_case_count() - 1. If i is not in that range, returns NULL.
+ const TestCase* GetTestCase(int i) const;
+
+ // Returns the TestResult containing information on test failures and
+ // properties logged outside of individual test cases.
+ const TestResult& ad_hoc_test_result() const;
+
+ // Returns the list of event listeners that can be used to track events
+ // inside Google Test.
+ TestEventListeners& listeners();
+
+ private:
+ // Registers and returns a global test environment. When a test
+ // program is run, all global test environments will be set up in
+ // the order they were registered. After all tests in the program
+ // have finished, all global test environments will be torn down in
+ // the *reverse* order they were registered.
+ //
+ // The UnitTest object takes ownership of the given environment.
+ //
+ // This method can only be called from the main thread.
+ Environment* AddEnvironment(Environment* env);
+
+ // Adds a TestPartResult to the current TestResult object. All
+ // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+ // eventually call this to report their results. The user code
+ // should use the assertion macros instead of calling this directly.
+ void AddTestPartResult(TestPartResult::Type result_type,
+ const char* file_name,
+ int line_number,
+ const std::string& message,
+ const std::string& os_stack_trace)
+ GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Adds a TestProperty to the current TestResult object when invoked from
+ // inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+ // from SetUpTestCase or TearDownTestCase, or to the global property set
+ // when invoked elsewhere. If the result already contains a property with
+ // the same key, the value will be updated.
+ void RecordProperty(const std::string& key, const std::string& value);
+
+ // Gets the i-th test case among all the test cases. i can range from 0 to
+ // total_test_case_count() - 1. If i is not in that range, returns NULL.
+ TestCase* GetMutableTestCase(int i);
+
+ // Accessors for the implementation object.
+ internal::UnitTestImpl* impl() { return impl_; }
+ const internal::UnitTestImpl* impl() const { return impl_; }
+
+ // These classes and functions are friends as they need to access private
+ // members of UnitTest.
+ friend class Test;
+ friend class internal::AssertHelper;
+ friend class internal::ScopedTrace;
+ friend class internal::StreamingListenerTest;
+ friend class internal::UnitTestRecordPropertyTestHelper;
+ friend Environment* AddGlobalTestEnvironment(Environment* env);
+ friend internal::UnitTestImpl* internal::GetUnitTestImpl();
+ friend void internal::ReportFailureInUnknownLocation(
+ TestPartResult::Type result_type,
+ const std::string& message);
+
+ // Creates an empty UnitTest.
+ UnitTest();
+
+ // D'tor
+ virtual ~UnitTest();
+
+ // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+ // Google Test trace stack.
+ void PushGTestTrace(const internal::TraceInfo& trace)
+ GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Pops a trace from the per-thread Google Test trace stack.
+ void PopGTestTrace()
+ GTEST_LOCK_EXCLUDED_(mutex_);
+
+ // Protects mutable state in *impl_. This is mutable as some const
+ // methods need to lock it too.
+ mutable internal::Mutex mutex_;
+
+ // Opaque implementation object. This field is never changed once
+ // the object is constructed. We don't mark it as const here, as
+ // doing so will cause a warning in the constructor of UnitTest.
+ // Mutable state in *impl_ is protected by mutex_.
+ internal::UnitTestImpl* impl_;
+
+ // We disallow copying UnitTest.
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+};
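+
+// For example, a sketch of querying the singleton for the currently running
+// test, e.g. from a fixture's SetUp() (the printf is illustrative):
+//
+//   const ::testing::TestInfo* const test_info =
+//       ::testing::UnitTest::GetInstance()->current_test_info();
+//   printf("Running %s.%s\n",
+//          test_info->test_case_name(), test_info->name());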
+
+// A convenient wrapper for adding an environment for the test
+// program.
+//
+// You should call this before RUN_ALL_TESTS() is called, probably in
+// main(). If you use gtest_main, you need to call this before main()
+// starts for it to take effect. For example, you can define a global
+// variable like this:
+//
+// testing::Environment* const foo_env =
+// testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend that you write your own main() and
+// call AddGlobalTestEnvironment() there, as relying on initialization
+// of global variables makes the code harder to read and may cause
+// problems when you register multiple environments from different
+// translation units and the environments have dependencies among them
+// (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment* AddGlobalTestEnvironment(Environment* env) {
+ return UnitTest::GetInstance()->AddEnvironment(env);
+}
+
+// Initializes Google Test. This must be called before calling
+// RUN_ALL_TESTS(). In particular, it parses a command line for the
+// flags that Google Test recognizes. Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned. Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int* argc, char** argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv);
+
+namespace internal {
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperEQ. This helps reduce the overhead of some sanitizers
+// when calling EXPECT_* in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQFailure(const char* lhs_expression,
+ const char* rhs_expression,
+ const T1& lhs, const T2& rhs) {
+ return EqFailure(lhs_expression,
+ rhs_expression,
+ FormatForComparisonFailureMessage(lhs, rhs),
+ FormatForComparisonFailureMessage(rhs, lhs),
+ false);
+}
+
+// The helper function for {ASSERT|EXPECT}_EQ.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char* lhs_expression,
+ const char* rhs_expression,
+ const T1& lhs,
+ const T2& rhs) {
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4389 /* signed/unsigned mismatch */)
+ if (lhs == rhs) {
+ return AssertionSuccess();
+ }
+GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+ return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
+}
+
+// With this overloaded version, we allow anonymous enums to be used
+// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
+// can be implicitly cast to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression,
+ const char* rhs_expression,
+ BiggestInt lhs,
+ BiggestInt rhs);
+
+// The helper class for {ASSERT|EXPECT}_EQ. The template argument
+// lhs_is_null_literal is true iff the first argument to ASSERT_EQ()
+// is a null pointer literal. The following default implementation is
+// for lhs_is_null_literal being false.
+template <bool lhs_is_null_literal>
+class EqHelper {
+ public:
+ // This templatized version is for the general case.
+ template <typename T1, typename T2>
+ static AssertionResult Compare(const char* lhs_expression,
+ const char* rhs_expression,
+ const T1& lhs,
+ const T2& rhs) {
+ return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+ }
+
+ // With this overloaded version, we allow anonymous enums to be used
+ // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+ // enums can be implicitly cast to BiggestInt.
+ //
+ // Even though its body looks the same as the above version, we
+ // cannot merge the two, as it will make anonymous enums unhappy.
+ static AssertionResult Compare(const char* lhs_expression,
+ const char* rhs_expression,
+ BiggestInt lhs,
+ BiggestInt rhs) {
+ return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+ }
+};
+
+// This specialization is used when the first argument to ASSERT_EQ()
+// is a null pointer literal, like NULL, false, or 0.
+template <>
+class EqHelper<true> {
+ public:
+ // We define two overloaded versions of Compare(). The first
+ // version will be picked when the second argument to ASSERT_EQ() is
+ // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or
+ // EXPECT_EQ(false, a_bool).
+ template <typename T1, typename T2>
+ static AssertionResult Compare(
+ const char* lhs_expression,
+ const char* rhs_expression,
+ const T1& lhs,
+ const T2& rhs,
+ // The following line prevents this overload from being considered if T2
+ // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr)
+ // expands to Compare("", "", NULL, my_ptr), which requires a conversion
+ // to match the Secret* in the other overload, which would otherwise make
+ // this template match better.
+ typename EnableIf<!is_pointer<T2>::value>::type* = 0) {
+ return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+ }
+
+ // This version will be picked when the second argument to ASSERT_EQ() is a
+ // pointer, e.g. ASSERT_EQ(NULL, a_pointer).
+ template <typename T>
+ static AssertionResult Compare(
+ const char* lhs_expression,
+ const char* rhs_expression,
+ // We used to have a second template parameter instead of Secret*. That
+ // template parameter would deduce to 'long', making this a better match
+ // than the first overload even without the first overload's EnableIf.
+ // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to
+ // non-pointer argument" (even a deduced integral argument), so the old
+ // implementation caused warnings in user code.
+ Secret* /* lhs (NULL) */,
+ T* rhs) {
+ // We already know that 'lhs' is a null pointer.
+ return CmpHelperEQ(lhs_expression, rhs_expression,
+ static_cast<T*>(NULL), rhs);
+ }
+};
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperOP. This helps reduce the overhead of some sanitizers
+// when calling EXPECT_OP in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
+ const T1& val1, const T2& val2,
+ const char* op) {
+ return AssertionFailure()
+ << "Expected: (" << expr1 << ") " << op << " (" << expr2
+ << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
+ << " vs " << FormatForComparisonFailureMessage(val2, val1);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste
+// of similar code.
+//
+// For each templatized helper function, we also define an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+template <typename T1, typename T2>\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+ const T1& val1, const T2& val2) {\
+ if (val1 op val2) {\
+ return AssertionSuccess();\
+ } else {\
+ return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
+ }\
+}\
+GTEST_API_ AssertionResult CmpHelper##op_name(\
+ const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements the helper function for {ASSERT|EXPECT}_NE
+GTEST_IMPL_CMP_HELPER_(NE, !=);
+// Implements the helper function for {ASSERT|EXPECT}_LE
+GTEST_IMPL_CMP_HELPER_(LE, <=);
+// Implements the helper function for {ASSERT|EXPECT}_LT
+GTEST_IMPL_CMP_HELPER_(LT, <);
+// Implements the helper function for {ASSERT|EXPECT}_GE
+GTEST_IMPL_CMP_HELPER_(GE, >=);
+// Implements the helper function for {ASSERT|EXPECT}_GT
+GTEST_IMPL_CMP_HELPER_(GT, >);
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1,
+ const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1,
+ const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1,
+ const char* s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1,
+ const char* s2);
+
+
+// Helper function for *_STREQ on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
+ const char* s2_expression,
+ const wchar_t* s1,
+ const wchar_t* s2);
+
+// Helper function for *_STRNE on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
+ const char* s2_expression,
+ const wchar_t* s1,
+ const wchar_t* s2);
+
+} // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves. They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.
+//
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const char* needle, const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const wchar_t* needle, const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const ::std::string& needle, const ::std::string& haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const ::std::wstring& needle, const ::std::wstring& haystack);
+#endif // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-points.
+//
+// Template parameter:
+//
+// RawType: the raw floating-point type (either float or double)
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
+ const char* rhs_expression,
+ RawType lhs_value,
+ RawType rhs_value) {
+ const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
+
+ if (lhs.AlmostEquals(rhs)) {
+ return AssertionSuccess();
+ }
+
+ ::std::stringstream lhs_ss;
+ lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+ << lhs_value;
+
+ ::std::stringstream rhs_ss;
+ rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+ << rhs_value;
+
+ return EqFailure(lhs_expression,
+ rhs_expression,
+ StringStreamToString(&lhs_ss),
+ StringStreamToString(&rhs_ss),
+ false);
+}
+
+// Helper function for implementing ASSERT_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
+ const char* expr2,
+ const char* abs_error_expr,
+ double val1,
+ double val2,
+ double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros
+class GTEST_API_ AssertHelper {
+ public:
+ // Constructor.
+ AssertHelper(TestPartResult::Type type,
+ const char* file,
+ int line,
+ const char* message);
+ ~AssertHelper();
+
+ // Message assignment is a semantic trick to enable assertion
+ // streaming; see the GTEST_MESSAGE_ macro below.
+ void operator=(const Message& message) const;
+
+ private:
+ // We put our data in a struct so that the size of the AssertHelper class can
+ // be as small as possible. This is important because gcc is incapable of
+ // re-using stack space even for temporary variables, so every EXPECT_EQ
+ // reserves stack space for another AssertHelper.
+ struct AssertHelperData {
+ AssertHelperData(TestPartResult::Type t,
+ const char* srcfile,
+ int line_num,
+ const char* msg)
+ : type(t), file(srcfile), line(line_num), message(msg) { }
+
+ TestPartResult::Type const type;
+ const char* const file;
+ int const line;
+ std::string const message;
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+ };
+
+ AssertHelperData* const data_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+} // namespace internal
+
+#if GTEST_HAS_PARAM_TEST
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface. In most cases that just means inheriting
+// from ::testing::TestWithParam, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+// protected:
+// FooTest() {
+// // Can use GetParam() here.
+// }
+// virtual ~FooTest() {
+// // Can use GetParam() here.
+// }
+// virtual void SetUp() {
+// // Can use GetParam() here.
+// }
+// virtual void TearDown() {
+// // Can use GetParam() here.
+// }
+// };
+// TEST_P(FooTest, DoesBar) {
+// // Can use GetParam() method here.
+// Foo foo;
+// ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+ typedef T ParamType;
+ virtual ~WithParamInterface() {}
+
+ // The current parameter value. It is also available in the test fixture's
+ // constructor. This member function is non-static, even though it only
+ // references static data, to reduce the opportunity for incorrect uses
+ // like writing 'WithParamInterface<bool>::GetParam()' for a test that
+ // uses a fixture whose parameter type is int.
+ const ParamType& GetParam() const {
+ GTEST_CHECK_(parameter_ != NULL)
+ << "GetParam() can only be called inside a value-parameterized test "
+ << "-- did you intend to write TEST_P instead of TEST_F?";
+ return *parameter_;
+ }
+
+ private:
+ // Sets parameter value. The caller is responsible for making sure the value
+ // remains alive and unchanged throughout the current test.
+ static void SetParam(const ParamType* parameter) {
+ parameter_ = parameter;
+ }
+
+ // Static value used for accessing parameter during a test lifetime.
+ static const ParamType* parameter_;
+
+ // TestClass must be a subclass of WithParamInterface<T> and Test.
+ template <class TestClass> friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T* WithParamInterface<T>::parameter_ = NULL;
+
+// Most value-parameterized classes can ignore the existence of
+// WithParamInterface, and can just inherit from ::testing::TestWithParam.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {
+};
+
+#endif // GTEST_HAS_PARAM_TEST
+
+// Macros for indicating success/failure in test code.
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied. If not,
+// it behaves like ADD_FAILURE. In particular:
+//
+// EXPECT_TRUE verifies that a Boolean condition is true.
+// EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure. People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line) \
+ GTEST_MESSAGE_AT_(file, line, "Failed", \
+ ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Define this macro to 1 to omit the definition of FAIL(), which is a
+// generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+# define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define this macro to 1 to omit the definition of SUCCEED(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+# define SUCCEED() GTEST_SUCCEED()
+#endif
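+
+// For example, a sketch of a data-driven check that records every mismatch
+// without aborting the test (VerifyOutput and its arguments are
+// illustrative):
+//
+//   void VerifyOutput(int input, int expected, int actual) {
+//     if (actual != expected) {
+//       ADD_FAILURE() << "For input " << input << ", expected " << expected
+//                     << " but got " << actual;
+//     }
+//   }
+//
+// FAIL(), in contrast, aborts the enclosing function, so it is typically
+// reserved for branches that should be unreachable.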
+
+// Macros for testing exceptions.
+//
+// * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+// Tests that the statement throws the expected exception.
+// * {ASSERT|EXPECT}_NO_THROW(statement):
+// Tests that the statement doesn't throw any exception.
+// * {ASSERT|EXPECT}_ANY_THROW(statement):
+// Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+ GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+ GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+ GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+ GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+ GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+ GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
+
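+// For example (Foo, Bar() and MyException are illustrative names):
+//
+//   EXPECT_THROW(Foo().Bar(-1), MyException);
+//   EXPECT_NO_THROW(Foo().Bar(1));
+//   ASSERT_ANY_THROW(Foo().Bar(-1));
+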
+// Boolean assertions. Condition can be either a Boolean expression or an
+// AssertionResult. For more information on how to use AssertionResult with
+// these macros see comments on that class.
+#define EXPECT_TRUE(condition) \
+ GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \
+ GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition) \
+ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+ GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+ GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \
+ GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition) \
+ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+ GTEST_FATAL_FAILURE_)
+
+// Includes the auto-generated header that implements a family of
+// generic predicate assertion macros.
+#include "gtest/gtest_pred_impl.h"
+
+// Macros for testing equalities and inequalities.
+//
+// * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
+// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2
+// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2
+// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2
+// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2
+// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values. The values must be compatible built-in types,
+// or you will get a compiler error. By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+// 1. It is possible to make a user-defined type work with
+// {ASSERT|EXPECT}_??(), but that requires overloading the
+// comparison operators and is thus discouraged by the Google C++
+// Usage Guide. Therefore, you are advised to use the
+// {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+// equal.
+//
+// 2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+// pointers (in particular, C strings). Therefore, if you use them
+// with two C strings, you are testing how their locations in memory
+// are related, not how their content is related. To compare two C
+// strings by content, use {ASSERT|EXPECT}_STR*().
+//
+// 3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to
+// {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you
+// what the actual value is when it fails, and similarly for the
+// other comparisons.
+//
+// 4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+// evaluate their arguments, which is undefined.
+//
+// 5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+// EXPECT_NE(5, Foo());
+// EXPECT_EQ(NULL, a_pointer);
+// ASSERT_LT(i, array_size);
+// ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal:: \
+ EqHelper<GTEST_IS_NULL_LITERAL_(val1)>::Compare, \
+ val1, val2)
+#define EXPECT_NE(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define EXPECT_LE(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal:: \
+ EqHelper<GTEST_IS_NULL_LITERAL_(val1)>::Compare, \
+ val1, val2)
+#define GTEST_ASSERT_NE(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
+// ASSERT_XY(), which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
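+
+// For example, a sketch of avoiding a clash when another library in the same
+// translation unit also defines ASSERT_EQ (the include order shown is the
+// assumption here):
+//
+//   #define GTEST_DONT_DEFINE_ASSERT_EQ 1
+//   #include "gtest/gtest.h"
+//
+//   TEST(FooTest, AddsCorrectly) {
+//     GTEST_ASSERT_EQ(4, 2 + 2);  // The GTEST_-prefixed form stays available.
+//   }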
+
+// C-string Comparisons. All tests treat NULL and any non-NULL string
+// as different. Two NULLs are equal.
+//
+// * {ASSERT|EXPECT}_STREQ(s1, s2): Tests that s1 == s2
+// * {ASSERT|EXPECT}_STRNE(s1, s2): Tests that s1 != s2
+// * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+// * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(s1, s2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define EXPECT_STRNE(s1, s2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(s1, s2) \
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define EXPECT_STRCASENE(s1, s2)\
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(s1, s2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define ASSERT_STRNE(s1, s2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(s1, s2) \
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define ASSERT_STRCASENE(s1, s2)\
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
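+// For example (the two buffers are illustrative):
+//
+//   const char kExpected[] = "hello";
+//   char actual[] = "hello";
+//   EXPECT_STREQ(kExpected, actual);       // Passes: compares the contents.
+//   EXPECT_NE(kExpected, actual);          // Compares the pointers, not the contents.
+//   EXPECT_STRCASEEQ("hello", "HELLO");    // Passes: case is ignored.
+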
+// Macros for comparing floating-point numbers.
+//
+// * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2):
+// Tests that two float values are almost equal.
+// * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2):
+// Tests that two double values are almost equal.
+// * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+// Tests that v1 and v2 are within the given distance to each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands. See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(val1, val2)\
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+ val1, val2)
+
+#define EXPECT_DOUBLE_EQ(val1, val2)\
+ EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+ val1, val2)
+
+#define ASSERT_FLOAT_EQ(val1, val2)\
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+ val1, val2)
+
+#define ASSERT_DOUBLE_EQ(val1, val2)\
+ ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+ val1, val2)
+
+#define EXPECT_NEAR(val1, val2, abs_error)\
+ EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+ val1, val2, abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)\
+ ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
+ val1, val2, abs_error)
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+// EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
+ float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
+ double val1, double val2);
+
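+// For example (ComputeRatio is an illustrative function):
+//
+//   EXPECT_DOUBLE_EQ(0.5, ComputeRatio(1, 2));        // ULP-based comparison.
+//   EXPECT_NEAR(3.14159, ComputeRatio(22, 7), 1e-2);  // Explicit error bound.
+//   EXPECT_PRED_FORMAT2(::testing::DoubleLE, ComputeRatio(1, 3), 0.34);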
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success. These are only useful
+// on Windows and rely on Windows SDK macros and APIs to compile.
+//
+// * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result with both a human-readable
+// string representation of the error, if available, and the
+// hex result code.
+# define EXPECT_HRESULT_SUCCEEDED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define ASSERT_HRESULT_SUCCEEDED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+# define EXPECT_HRESULT_FAILED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+# define ASSERT_HRESULT_FAILED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif // GTEST_OS_WINDOWS
+
+// Macros that execute statement and check that it doesn't generate new fatal
+// failures in the current thread.
+//
+// * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+// EXPECT_NO_FATAL_FAILURE(Process());
+// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope. The effect is
+// undone when the control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+#define SCOPED_TRACE(message) \
+ ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
+ __FILE__, __LINE__, ::testing::Message() << (message))
+
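+// For example, a sketch of tracing a helper called from a loop so that a
+// failure reports which iteration produced it (CheckValue and values are
+// illustrative):
+//
+//   for (int i = 0; i < kNumValues; i++) {
+//     SCOPED_TRACE(i);       // Annotates any failure below with the index.
+//     CheckValue(values[i]);
+//   }
+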
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles iff type1 and type2 are
+// the same type. The value it returns is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template. This
+// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated. For example, given:
+//
+// template <typename T> class Foo {
+// public:
+// void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+// };
+//
+// the code:
+//
+// void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated. Instead, you need:
+//
+// void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+bool StaticAssertTypeEq() {
+ (void)internal::StaticAssertTypeEqHelper<T1, T2>();
+ return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test case, and the second
+// parameter is the name of the test within the test case.
+//
+// The convention is to end the test case name with "Test". For
+// example, a test case for the Foo class can be named FooTest.
+//
+// Test code should appear between braces after an invocation of
+// this macro. Example:
+//
+// TEST(FooTest, InitializesCorrectly) {
+// Foo foo;
+// EXPECT_TRUE(foo.StatusIsOK());
+// }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test. This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X. The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code. GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_case_name, test_name)\
+ GTEST_TEST_(test_case_name, test_name, \
+ ::testing::Test, ::testing::internal::GetTestTypeId())
+
+// Define this macro to 1 to omit the definition of TEST(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test case name. The second parameter is the
+// name of the test within the test case.
+//
+// A test fixture class must be declared earlier. The user should put
+// the test code between braces after using this macro. Example:
+//
+// class FooTest : public testing::Test {
+// protected:
+// virtual void SetUp() { b_.AddElement(3); }
+//
+// Foo a_;
+// Foo b_;
+// };
+//
+// TEST_F(FooTest, InitializesCorrectly) {
+// EXPECT_TRUE(a_.StatusIsOK());
+// }
+//
+// TEST_F(FooTest, ReturnsElementCountCorrectly) {
+// EXPECT_EQ(0, a_.size());
+// EXPECT_EQ(1, b_.size());
+// }
+
+#define TEST_F(test_fixture, test_name)\
+ GTEST_TEST_(test_fixture, test_name, test_fixture, \
+ ::testing::internal::GetTypeId<test_fixture>())
+
+} // namespace testing
+
+// Use this function in main() to run all tests. It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
+
+inline int RUN_ALL_TESTS() {
+ return ::testing::UnitTest::GetInstance()->Run();
+}
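+
+// For example, a typical main() (FooEnvironment is the illustrative
+// environment sketched earlier; omit that line if you register none):
+//
+//   int main(int argc, char** argv) {
+//     ::testing::InitGoogleTest(&argc, argv);
+//     ::testing::AddGlobalTestEnvironment(new FooEnvironment);
+//     return RUN_ALL_TESTS();
+//   }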
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
new file mode 100644
index 000000000..30ae712f5
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h
@@ -0,0 +1,358 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command
+// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND!
+//
+// Implements a family of generic predicate assertion macros.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+
+// Makes sure this header is not included before gtest.h.
+#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
+# error Do not include gtest_pred_impl.h directly. Include gtest.h instead.
+#endif // GTEST_INCLUDE_GTEST_GTEST_H_
+
+// This header implements a family of generic predicate assertion
+// macros:
+//
+// ASSERT_PRED_FORMAT1(pred_format, v1)
+// ASSERT_PRED_FORMAT2(pred_format, v1, v2)
+// ...
+//
+// where pred_format is a function or functor that takes n (in the
+// case of ASSERT_PRED_FORMATn) values and their source expression
+// text, and returns a testing::AssertionResult. See the definition
+// of ASSERT_EQ in gtest.h for an example.
+//
+// If you don't care about formatting, you can use the more
+// restrictive version:
+//
+// ASSERT_PRED1(pred, v1)
+// ASSERT_PRED2(pred, v1, v2)
+// ...
+//
+// where pred is an n-ary function or functor that returns bool,
+// and the values v1, v2, ... must support the << operator for
+// streaming to std::ostream.
+//
+// We also define the EXPECT_* variations.
+//
+// For now we only support predicates whose arity is at most 5.
+// Please email googletestframework@googlegroups.com if you need
+// support for higher arities.
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce. Don't use this in your code.
+
+#define GTEST_ASSERT_(expression, on_failure) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (const ::testing::AssertionResult gtest_ar = (expression)) \
+ ; \
+ else \
+ on_failure(gtest_ar.failure_message())
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use
+// this in your code.
+template <typename Pred,
+ typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text,
+ const char* e1,
+ Pred pred,
+ const T1& v1) {
+ if (pred(v1)) return AssertionSuccess();
+
+ return AssertionFailure() << pred_text << "("
+ << e1 << ") evaluates to false, where"
+ << "\n" << e1 << " evaluates to " << v1;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
+ GTEST_ASSERT_(pred_format(#v1, v1), \
+ on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use
+// this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure)\
+ GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
+ #v1, \
+ pred, \
+ v1), on_failure)
+
+// Unary predicate assertion macros.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+ GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) \
+ GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+ GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) \
+ GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use
+// this in your code.
+template <typename Pred,
+ typename T1,
+ typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text,
+ const char* e1,
+ const char* e2,
+ Pred pred,
+ const T1& v1,
+ const T2& v2) {
+ if (pred(v1, v2)) return AssertionSuccess();
+
+ return AssertionFailure() << pred_text << "("
+ << e1 << ", "
+ << e2 << ") evaluates to false, where"
+ << "\n" << e1 << " evaluates to " << v1
+ << "\n" << e2 << " evaluates to " << v2;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
+ GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
+ on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use
+// this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)\
+ GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
+ #v1, \
+ #v2, \
+ pred, \
+ v1, \
+ v2), on_failure)
+
+// Binary predicate assertion macros.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+ GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+ GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+ GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+ GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
+
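+// For example (MutuallyPrime and Gcd are illustrative helpers):
+//
+//   bool MutuallyPrime(int m, int n) { return Gcd(m, n) == 1; }
+//
+//   EXPECT_PRED2(MutuallyPrime, 3, 4);
+//
+// On failure this prints the predicate expression together with the values
+// of both arguments.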
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use
+// this in your code.
+template <typename Pred,
+ typename T1,
+ typename T2,
+ typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text,
+ const char* e1,
+ const char* e2,
+ const char* e3,
+ Pred pred,
+ const T1& v1,
+ const T2& v2,
+ const T3& v3) {
+ if (pred(v1, v2, v3)) return AssertionSuccess();
+
+ return AssertionFailure() << pred_text << "("
+ << e1 << ", "
+ << e2 << ", "
+ << e3 << ") evaluates to false, where"
+ << "\n" << e1 << " evaluates to " << v1
+ << "\n" << e2 << " evaluates to " << v2
+ << "\n" << e3 << " evaluates to " << v3;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
+ on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use
+// this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
+ GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
+ #v1, \
+ #v2, \
+ #v3, \
+ pred, \
+ v1, \
+ v2, \
+ v3), on_failure)
+
+// Ternary predicate assertion macros.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+ GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+ GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+ GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+ GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use
+// this in your code.
+template <typename Pred,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text,
+ const char* e1,
+ const char* e2,
+ const char* e3,
+ const char* e4,
+ Pred pred,
+ const T1& v1,
+ const T2& v2,
+ const T3& v3,
+ const T4& v4) {
+ if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+ return AssertionFailure() << pred_text << "("
+ << e1 << ", "
+ << e2 << ", "
+ << e3 << ", "
+ << e4 << ") evaluates to false, where"
+ << "\n" << e1 << " evaluates to " << v1
+ << "\n" << e2 << " evaluates to " << v2
+ << "\n" << e3 << " evaluates to " << v3
+ << "\n" << e4 << " evaluates to " << v4;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
+ on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use
+// this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
+ GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
+ #v1, \
+ #v2, \
+ #v3, \
+ #v4, \
+ pred, \
+ v1, \
+ v2, \
+ v3, \
+ v4), on_failure)
+
+// 4-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+ GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+ GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+ GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+ GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use
+// this in your code.
+template <typename Pred,
+ typename T1,
+ typename T2,
+ typename T3,
+ typename T4,
+ typename T5>
+AssertionResult AssertPred5Helper(const char* pred_text,
+ const char* e1,
+ const char* e2,
+ const char* e3,
+ const char* e4,
+ const char* e5,
+ Pred pred,
+ const T1& v1,
+ const T2& v2,
+ const T3& v3,
+ const T4& v4,
+ const T5& v5) {
+ if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+ return AssertionFailure() << pred_text << "("
+ << e1 << ", "
+ << e2 << ", "
+ << e3 << ", "
+ << e4 << ", "
+ << e5 << ") evaluates to false, where"
+ << "\n" << e1 << " evaluates to " << v1
+ << "\n" << e2 << " evaluates to " << v2
+ << "\n" << e3 << " evaluates to " << v3
+ << "\n" << e4 << " evaluates to " << v4
+ << "\n" << e5 << " evaluates to " << v5;
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+ on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use
+// this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
+ GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
+ #v1, \
+ #v2, \
+ #v3, \
+ #v4, \
+ #v5, \
+ pred, \
+ v1, \
+ v2, \
+ v3, \
+ v4, \
+ v5), on_failure)
+
+// 5-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+ GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+ GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+ GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+ GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
new file mode 100644
index 000000000..da80ddc6c
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
@@ -0,0 +1,58 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// Google C++ Testing Framework definitions useful in production code.
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class. For example:
+//
+// class MyClass {
+// private:
+// void MyMethod();
+// FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+// // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+// // Can call MyClass::MyMethod() here.
+// }
+
+#define FRIEND_TEST(test_case_name, test_name)\
+friend class test_case_name##_##test_name##_Test
+
+#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
new file mode 100644
index 000000000..7e744bd3b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h
@@ -0,0 +1,69 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations.
+// The following macros can be defined:
+//
+// Flag related macros:
+// GTEST_FLAG(flag_name)
+// GTEST_USE_OWN_FLAGFILE_FLAG_ - Define to 0 when the system provides its
+// own flagfile flag parsing.
+// GTEST_DECLARE_bool_(name)
+// GTEST_DECLARE_int32_(name)
+// GTEST_DECLARE_string_(name)
+// GTEST_DEFINE_bool_(name, default_val, doc)
+// GTEST_DEFINE_int32_(name, default_val, doc)
+// GTEST_DEFINE_string_(name, default_val, doc)
+//
+// Test filtering:
+// GTEST_TEST_FILTER_ENV_VAR_ - The name of an environment variable that
+// will be used if --GTEST_FLAG(test_filter)
+// is not provided.
+//
+// Logging:
+// GTEST_LOG_(severity)
+// GTEST_CHECK_(condition)
+// Functions LogToStderr() and FlushInfoLog() have to be provided too.
+//
+// Threading:
+// GTEST_HAS_NOTIFICATION_ - Enabled if Notification is already provided.
+// GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ - Enabled if Mutex and ThreadLocal are
+// already provided.
+// Must also provide GTEST_DECLARE_STATIC_MUTEX_(mutex) and
+// GTEST_DEFINE_STATIC_MUTEX_(mutex)
+//
+// GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+// GTEST_LOCK_EXCLUDED_(locks)
+//
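+//
+// For example (illustrative only, not required by this header), a project
+// that wants a different environment variable consulted for the default
+// test filter could define, before building Google Test:
+//
+//   #define GTEST_TEST_FILTER_ENV_VAR_ "MY_PROJECT_TEST_FILTER"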
+// ** Custom implementation starts here **
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
new file mode 100644
index 000000000..60c1ea050
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h
@@ -0,0 +1,42 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// This file provides an injection point for custom printers in a local
+// installation of gTest.
+// It will be included from gtest-printers.h and the overrides in this file
+// will be visible to everyone.
+// See documentation at gtest/gtest-printers.h for details on how to define a
+// custom printer.
+//
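+// As an illustrative sketch (MyType is a hypothetical user type), a custom
+// printer is usually declared in the same namespace as the printed type:
+//
+//   void PrintTo(const MyType& value, ::std::ostream* os);
+//
+// Overrides placed in this file become visible wherever gtest-printers.h is
+// included.
+//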
+// ** Custom implementation starts here **
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
new file mode 100644
index 000000000..c27412a89
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h
@@ -0,0 +1,41 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Injection point for custom user configurations.
+// The following macros can be defined:
+//
+// GTEST_OS_STACK_TRACE_GETTER_ - The name of an implementation of
+// OsStackTraceGetterInterface.
+//
+// ** Custom implementation starts here **
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
new file mode 100644
index 000000000..2b3a78f5b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
@@ -0,0 +1,319 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests. They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+#include "gtest/internal/gtest-internal.h"
+
+#include <stdio.h>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status: The integer exit information in the format specified
+// by wait(2)
+// exit code: The integer code passed to exit(3), _exit(2), or
+// returned from main()
+class GTEST_API_ DeathTest {
+ public:
+ // Create returns false if there was an error determining the
+ // appropriate action to take for the current death test; for example,
+ // if the gtest_death_test_style flag is set to an invalid value.
+ // The LastMessage method will return a more detailed message in that
+ // case. Otherwise, the DeathTest pointer pointed to by the "test"
+ // argument is set. If the death test should be skipped, the pointer
+ // is set to NULL; otherwise, it is set to the address of a new concrete
+ // DeathTest object that controls the execution of the current test.
+ static bool Create(const char* statement, const RE* regex,
+ const char* file, int line, DeathTest** test);
+ DeathTest();
+ virtual ~DeathTest() { }
+
+ // A helper class that aborts a death test when it's deleted.
+ class ReturnSentinel {
+ public:
+ explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+ ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+ private:
+ DeathTest* const test_;
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+ } GTEST_ATTRIBUTE_UNUSED_;
+
+ // An enumeration of possible roles that may be taken when a death
+ // test is encountered. EXECUTE means that the death test logic should
+ // be executed immediately. OVERSEE means that the program should prepare
+ // the appropriate environment for a child process to execute the death
+ // test, then wait for it to complete.
+ enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+ // An enumeration of the three reasons that a test might be aborted.
+ enum AbortReason {
+ TEST_ENCOUNTERED_RETURN_STATEMENT,
+ TEST_THREW_EXCEPTION,
+ TEST_DID_NOT_DIE
+ };
+
+ // Assumes one of the above roles.
+ virtual TestRole AssumeRole() = 0;
+
+ // Waits for the death test to finish and returns its status.
+ virtual int Wait() = 0;
+
+ // Returns true if the death test passed; that is, the test process
+ // exited during the test, its exit status matches a user-supplied
+ // predicate, and its stderr output matches a user-supplied regular
+ // expression.
+ // The user-supplied predicate may be a macro expression rather
+ // than a function pointer or functor, or else Wait and Passed could
+ // be combined.
+ virtual bool Passed(bool exit_status_ok) = 0;
+
+ // Signals that the death test did not die as expected.
+ virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable message describing the outcome of the last
+  // death test.
+ static const char* LastMessage();
+
+ static void set_last_death_test_message(const std::string& message);
+
+ private:
+ // A string containing a description of the outcome of the last death test.
+ static std::string last_death_test_message_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+};
+
+// Factory interface for death tests. May be mocked out for testing.
+class DeathTestFactory {
+ public:
+ virtual ~DeathTestFactory() { }
+ virtual bool Create(const char* statement, const RE* regex,
+ const char* file, int line, DeathTest** test) = 0;
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+ virtual bool Create(const char* statement, const RE* regex,
+ const char* file, int line, DeathTest** test);
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+# if GTEST_HAS_EXCEPTIONS
+# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (const ::std::exception& gtest_exception) { \
+ fprintf(\
+ stderr, \
+ "\n%s: Caught std::exception-derived exception escaping the " \
+ "death test statement. Exception message: %s\n", \
+ ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+ gtest_exception.what()); \
+ fflush(stderr); \
+ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+ } catch (...) { \
+ death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+ }
+
+# else
+# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+# endif
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ const ::testing::internal::RE& gtest_regex = (regex); \
+ ::testing::internal::DeathTest* gtest_dt; \
+ if (!::testing::internal::DeathTest::Create(#statement, &gtest_regex, \
+ __FILE__, __LINE__, &gtest_dt)) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+ } \
+ if (gtest_dt != NULL) { \
+ ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \
+ gtest_dt_ptr(gtest_dt); \
+ switch (gtest_dt->AssumeRole()) { \
+ case ::testing::internal::DeathTest::OVERSEE_TEST: \
+ if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \
+ } \
+ break; \
+ case ::testing::internal::DeathTest::EXECUTE_TEST: { \
+ ::testing::internal::DeathTest::ReturnSentinel \
+ gtest_sentinel(gtest_dt); \
+ GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \
+ gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
+ break; \
+ } \
+ default: \
+ break; \
+ } \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \
+ fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
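+//
+// Illustrative user-level usage of the death test macros that are built on
+// top of this one (Foo() and the regexes are hypothetical):
+//
+//   EXPECT_DEATH(Foo(), "detected fatal error");
+//   ASSERT_EXIT(Foo(), ::testing::ExitedWithCode(1), "shutting down");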
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode. In this case we need the statements to be executed, the regex is
+// ignored, and the macro must accept a streamed message even though the message
+// is never printed.
+# define GTEST_EXECUTE_STATEMENT_(statement, regex) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } else \
+ ::testing::Message()
+
+// A class representing the parsed contents of the
+// --gtest_internal_run_death_test flag, as it existed when
+// RUN_ALL_TESTS was called.
+class InternalRunDeathTestFlag {
+ public:
+ InternalRunDeathTestFlag(const std::string& a_file,
+ int a_line,
+ int an_index,
+ int a_write_fd)
+ : file_(a_file), line_(a_line), index_(an_index),
+ write_fd_(a_write_fd) {}
+
+ ~InternalRunDeathTestFlag() {
+ if (write_fd_ >= 0)
+ posix::Close(write_fd_);
+ }
+
+ const std::string& file() const { return file_; }
+ int line() const { return line_; }
+ int index() const { return index_; }
+ int write_fd() const { return write_fd_; }
+
+ private:
+ std::string file_;
+ int line_;
+ int index_;
+ int write_fd_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+};
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
+
+#else // GTEST_HAS_DEATH_TEST
+
+// This macro is used for implementing macros such as
+// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
+// death tests are not supported. Those macros must compile on such systems
+// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
+// systems that support death tests. This allows one to write such a macro
+// on a system that does not support death tests and be sure that it will
+// compile on a death-test supporting system.
+//
+// Parameters:
+// statement - A statement that a macro such as EXPECT_DEATH would test
+// for program termination. This macro has to make sure this
+// statement is compiled but not executed, to ensure that
+// EXPECT_DEATH_IF_SUPPORTED compiles with a certain
+// parameter iff EXPECT_DEATH compiles with it.
+// regex - A regex that a macro such as EXPECT_DEATH would use to test
+// the output of statement. This parameter has to be
+// compiled but not evaluated by this macro, to ensure that
+// this macro only accepts expressions that a macro such as
+// EXPECT_DEATH would accept.
+// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
+// and a return statement for ASSERT_DEATH_IF_SUPPORTED.
+// This ensures that ASSERT_DEATH_IF_SUPPORTED will not
+// compile inside functions where ASSERT_DEATH doesn't
+// compile.
+//
+// The branch that has an always false condition is used to ensure that
+// statement and regex are compiled (and thus syntactically correct) but
+// never executed. The unreachable code macro protects the terminator
+// statement from generating an 'unreachable code' warning in case
+// statement unconditionally returns or throws. The Message constructor at
+// the end allows the syntax of streaming additional messages into the
+// macro, for compile-time compatibility with EXPECT_DEATH/ASSERT_DEATH.
+# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ GTEST_LOG_(WARNING) \
+ << "Death tests are not supported on this platform.\n" \
+ << "Statement '" #statement "' cannot be verified."; \
+ } else if (::testing::internal::AlwaysFalse()) { \
+ ::testing::internal::RE::PartialMatch(".*", (regex)); \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ terminator; \
+ } else \
+ ::testing::Message()
+
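+// For reference, the public gtest-death-test.h header builds the
+// *_IF_SUPPORTED macros on top of this one roughly as follows (paraphrased
+// here for illustration; see that header for the exact definitions):
+//
+//   #define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+//       GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, )
+//   #define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+//       GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return)
+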
+#endif // GTEST_HAS_DEATH_TEST
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
new file mode 100644
index 000000000..7a13b4b0d
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h
@@ -0,0 +1,206 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: keith.ray@gmail.com (Keith Ray)
+//
+// Google Test filepath utilities
+//
+// This header file declares classes and functions used internally by
+// Google Test. They are subject to change without notice.
+//
+// This file is #included in <gtest/internal/gtest-internal.h>.
+// Do not include this header file separately!
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+
+#include "gtest/internal/gtest-string.h"
+
+namespace testing {
+namespace internal {
+
+// FilePath - a class for file and directory pathname manipulation which
+// handles platform-specific conventions (like the pathname separator).
+// Used for helper functions for naming files in a directory for xml output.
+// Except for Set methods, all methods are const or static, which provides an
+// "immutable value object" -- useful for peace of mind.
+// A FilePath with a value ending in a path separator ("like/this/") represents
+// a directory, otherwise it is assumed to represent a file. In either case,
+// it may or may not represent an actual file or directory in the file system.
+// Names are NOT checked for syntax correctness -- no checking for illegal
+// characters, malformed paths, etc.
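+//
+// Illustrative example (paths are hypothetical):
+//
+//   FilePath dir("output/");   // trailing separator => treated as a directory
+//   FilePath xml = FilePath::MakeFileName(dir, FilePath("test"), 12, "xml");
+//   // xml.string() is "output/test_12.xml" on POSIX-style platforms.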
+
+class GTEST_API_ FilePath {
+ public:
+ FilePath() : pathname_("") { }
+ FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+
+ explicit FilePath(const std::string& pathname) : pathname_(pathname) {
+ Normalize();
+ }
+
+ FilePath& operator=(const FilePath& rhs) {
+ Set(rhs);
+ return *this;
+ }
+
+ void Set(const FilePath& rhs) {
+ pathname_ = rhs.pathname_;
+ }
+
+ const std::string& string() const { return pathname_; }
+ const char* c_str() const { return pathname_.c_str(); }
+
+ // Returns the current working directory, or "" if unsuccessful.
+ static FilePath GetCurrentDir();
+
+ // Given directory = "dir", base_name = "test", number = 0,
+ // extension = "xml", returns "dir/test.xml". If number is greater
+ // than zero (e.g., 12), returns "dir/test_12.xml".
+ // On Windows platform, uses \ as the separator rather than /.
+ static FilePath MakeFileName(const FilePath& directory,
+ const FilePath& base_name,
+ int number,
+ const char* extension);
+
+ // Given directory = "dir", relative_path = "test.xml",
+ // returns "dir/test.xml".
+ // On Windows, uses \ as the separator rather than /.
+ static FilePath ConcatPaths(const FilePath& directory,
+ const FilePath& relative_path);
+
+ // Returns a pathname for a file that does not currently exist. The pathname
+ // will be directory/base_name.extension or
+ // directory/base_name_<number>.extension if directory/base_name.extension
+ // already exists. The number will be incremented until a pathname is found
+ // that does not already exist.
+ // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+ // There could be a race condition if two or more processes are calling this
+ // function at the same time -- they could both pick the same filename.
+ static FilePath GenerateUniqueFileName(const FilePath& directory,
+ const FilePath& base_name,
+ const char* extension);
+
+ // Returns true iff the path is "".
+ bool IsEmpty() const { return pathname_.empty(); }
+
+ // If input name has a trailing separator character, removes it and returns
+  // the name; otherwise returns the name string unmodified.
+ // On Windows platform, uses \ as the separator, other platforms use /.
+ FilePath RemoveTrailingPathSeparator() const;
+
+ // Returns a copy of the FilePath with the directory part removed.
+ // Example: FilePath("path/to/file").RemoveDirectoryName() returns
+ // FilePath("file"). If there is no directory part ("just_a_file"), it returns
+ // the FilePath unmodified. If there is no file part ("just_a_dir/") it
+ // returns an empty FilePath ("").
+ // On Windows platform, '\' is the path separator, otherwise it is '/'.
+ FilePath RemoveDirectoryName() const;
+
+ // RemoveFileName returns the directory path with the filename removed.
+ // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+ // If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+ // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+ // not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+ // On Windows platform, '\' is the path separator, otherwise it is '/'.
+ FilePath RemoveFileName() const;
+
+ // Returns a copy of the FilePath with the case-insensitive extension removed.
+ // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+ // FilePath("dir/file"). If a case-insensitive extension is not
+ // found, returns a copy of the original FilePath.
+ FilePath RemoveExtension(const char* extension) const;
+
+ // Creates directories so that path exists. Returns true if successful or if
+ // the directories already exist; returns false if unable to create
+ // directories for any reason. Will also return false if the FilePath does
+ // not represent a directory (that is, it doesn't end with a path separator).
+ bool CreateDirectoriesRecursively() const;
+
+ // Create the directory so that path exists. Returns true if successful or
+ // if the directory already exists; returns false if unable to create the
+ // directory for any reason, including if the parent directory does not
+ // exist. Not named "CreateDirectory" because that's a macro on Windows.
+ bool CreateFolder() const;
+
+ // Returns true if FilePath describes something in the file-system,
+ // either a file, directory, or whatever, and that something exists.
+ bool FileOrDirectoryExists() const;
+
+ // Returns true if pathname describes a directory in the file-system
+ // that exists.
+ bool DirectoryExists() const;
+
+ // Returns true if FilePath ends with a path separator, which indicates that
+ // it is intended to represent a directory. Returns false otherwise.
+ // This does NOT check that a directory (or file) actually exists.
+ bool IsDirectory() const;
+
+ // Returns true if pathname describes a root directory. (Windows has one
+ // root directory per disk drive.)
+ bool IsRootDirectory() const;
+
+ // Returns true if pathname describes an absolute path.
+ bool IsAbsolutePath() const;
+
+ private:
+ // Replaces multiple consecutive separators with a single separator.
+ // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+ // redundancies that might be in a pathname involving "." or "..".
+ //
+ // A pathname with multiple consecutive separators may occur either through
+ // user error or as a result of some scripts or APIs that generate a pathname
+ // with a trailing separator. On other platforms the same API or script
+ // may NOT generate a pathname with a trailing "/". Then elsewhere that
+ // pathname may have another "/" and pathname components added to it,
+ // without checking for the separator already being there.
+ // The script language and operating system may allow paths like "foo//bar"
+ // but some of the functions in FilePath will not handle that correctly. In
+ // particular, RemoveTrailingPathSeparator() only removes one separator, and
+ // it is called in CreateDirectoriesRecursively() assuming that it will change
+ // a pathname from directory syntax (trailing separator) to filename syntax.
+ //
+ // On Windows this method also replaces the alternate path separator '/' with
+ // the primary path separator '\\', so that for example "bar\\/\\foo" becomes
+ // "bar\\foo".
+
+ void Normalize();
+
+  // Returns a pointer to the last occurrence of a valid path separator in
+ // the FilePath. On Windows, for example, both '/' and '\' are valid path
+ // separators. Returns NULL if no path separator was found.
+ const char* FindLastPathSeparator() const;
+
+ std::string pathname_;
+}; // class FilePath
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
new file mode 100644
index 000000000..ebd1cf615
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
@@ -0,0 +1,1238 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test. They are subject to change without notice.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_LINUX
+# include <stdlib.h>
+# include <sys/types.h>
+# include <sys/wait.h>
+# include <unistd.h>
+#endif // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+#include "gtest/internal/gtest-filepath.h"
+#include "gtest/internal/gtest-type-util.h"
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__. Writing
+//
+// foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number. For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
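+//
+// For example, GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__) used on line 42
+// expands to the token gtest_label_42, whereas writing
+// gtest_label_ ## __LINE__ directly would produce gtest_label___LINE__.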
+
+class ProtocolMessage;
+namespace proto2 { class Message; }
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult; // Result of an assertion.
+class Message; // Represents a failure message.
+class Test; // Represents a test.
+class TestInfo; // Information about a test.
+class TestPartResult; // Result of a test part.
+class UnitTest; // A collection of test cases.
+
+template <typename T>
+::std::string PrintToString(const T& value);
+
+namespace internal {
+
+struct TraceInfo; // Information about a trace point.
+class ScopedTrace; // Implements scoped trace.
+class TestInfoImpl; // Opaque implementation of TestInfo
+class UnitTestImpl; // Opaque implementation of UnitTest
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
+
+// Two overloaded helpers for checking at compile time whether an
+// expression is a null pointer literal (i.e. NULL or any 0-valued
+// compile-time integral constant). Their return values have
+// different sizes, so we can use sizeof() to test which version is
+// picked by the compiler. These helpers have no implementations, as
+// we only need their signatures.
+//
+// Given IsNullLiteralHelper(x), the compiler will pick the first
+// version if x can be implicitly converted to Secret*, and pick the
+// second version otherwise. Since Secret is a secret and incomplete
+// type, the only expression a user can write that has type Secret* is
+// a null pointer literal. Therefore, we know that x is a null
+// pointer literal if and only if the first version is picked by the
+// compiler.
+char IsNullLiteralHelper(Secret* p);
+char (&IsNullLiteralHelper(...))[2]; // NOLINT
+
+// A compile-time bool constant that is true if and only if x is a
+// null pointer literal (i.e. NULL or any 0-valued compile-time
+// integral constant).
+#ifdef GTEST_ELLIPSIS_NEEDS_POD_
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_IS_NULL_LITERAL_(x) false
+#else
+# define GTEST_IS_NULL_LITERAL_(x) \
+ (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1)
+#endif // GTEST_ELLIPSIS_NEEDS_POD_
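+
+// Illustration (assuming GTEST_ELLIPSIS_NEEDS_POD_ is not defined):
+// GTEST_IS_NULL_LITERAL_(NULL) and GTEST_IS_NULL_LITERAL_(0) evaluate to
+// true, while GTEST_IS_NULL_LITERAL_("hello") or any non-null pointer
+// expression evaluates to false.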
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(
+ const std::string& gtest_msg, const Message& user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled). We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time. Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+ explicit GoogleTestFailureException(const TestPartResult& failure);
+};
+
+#endif // GTEST_HAS_EXCEPTIONS
+
+// A helper class for creating scoped traces in user programs.
+class GTEST_API_ ScopedTrace {
+ public:
+ // The c'tor pushes the given source file location and message onto
+ // a trace stack maintained by Google Test.
+ ScopedTrace(const char* file, int line, const Message& message);
+
+ // The d'tor pops the info pushed by the c'tor.
+ //
+ // Note that the d'tor is not virtual in order to be efficient.
+ // Don't inherit from ScopedTrace!
+ ~ScopedTrace();
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its
+ // c'tor and d'tor. Therefore it doesn't
+ // need to be used otherwise.
+
+namespace edit_distance {
+// Returns the optimal edits to go from 'left' to 'right'.
+// All edits cost the same, with replace having lower priority than
+// add/remove.
+// Simple implementation of the Wagner-Fischer algorithm.
+// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
+enum EditType { kMatch, kAdd, kRemove, kReplace };
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+ const std::vector<size_t>& left, const std::vector<size_t>& right);
+
+// Same as above, but the input is represented as strings.
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+ const std::vector<std::string>& left,
+ const std::vector<std::string>& right);
+
+// Create a diff of the input strings in Unified diff format.
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+ const std::vector<std::string>& right,
+ size_t context = 2);
+
+} // namespace edit_distance
+
+// Calculate the diff between 'left' and 'right' and return it in unified diff
+// format.
+// If not null, stores in 'total_line_count' the total number of lines found
+// in left + right.
+GTEST_API_ std::string DiffStrings(const std::string& left,
+ const std::string& right,
+ size_t* total_line_count);
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings. For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+// expected_expression: "foo"
+// actual_expression: "bar"
+// expected_value: "5"
+// actual_value: "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
+ const char* actual_expression,
+ const std::string& expected_value,
+ const std::string& actual_value,
+ bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+ const AssertionResult& assertion_result,
+ const char* expression_text,
+ const char* actual_predicate_value,
+ const char* expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
+//
+// The purpose of this class is to do more sophisticated number
+// comparison.  (Due to round-off error, etc., it's very unlikely that
+// two floating-points will be equal exactly. Hence a naive
+// comparison by the == operation often doesn't work.)
+//
+// Format of IEEE floating-point:
+//
+// The most-significant bit being the leftmost, an IEEE
+// floating-point looks like
+//
+// sign_bit exponent_bits fraction_bits
+//
+// Here, sign_bit is a single bit that designates the sign of the
+// number.
+//
+// For float, there are 8 exponent bits and 23 fraction bits.
+//
+// For double, there are 11 exponent bits and 52 fraction bits.
+//
+// More details can be found at
+// http://en.wikipedia.org/wiki/IEEE_floating-point_standard.
+//
+// Template parameter:
+//
+// RawType: the raw floating-point type (either float or double)
+template <typename RawType>
+class FloatingPoint {
+ public:
+ // Defines the unsigned integer type that has the same size as the
+ // floating point number.
+ typedef typename TypeWithSize<sizeof(RawType)>::UInt Bits;
+
+ // Constants.
+
+ // # of bits in a number.
+ static const size_t kBitCount = 8*sizeof(RawType);
+
+ // # of fraction bits in a number.
+ static const size_t kFractionBitCount =
+ std::numeric_limits<RawType>::digits - 1;
+
+ // # of exponent bits in a number.
+ static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
+
+ // The mask for the sign bit.
+ static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
+
+ // The mask for the fraction bits.
+ static const Bits kFractionBitMask =
+ ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+
+ // The mask for the exponent bits.
+ static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
+
+ // How many ULP's (Units in the Last Place) we want to tolerate when
+ // comparing two numbers. The larger the value, the more error we
+ // allow. A 0 value means that two numbers must be exactly the same
+ // to be considered equal.
+ //
+ // The maximum error of a single floating-point operation is 0.5
+ // units in the last place. On Intel CPU's, all floating-point
+ // calculations are done with 80-bit precision, while double has 64
+ // bits. Therefore, 4 should be enough for ordinary use.
+ //
+ // See the following article for more details on ULP:
+ // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
+ static const size_t kMaxUlps = 4;
+
+ // Constructs a FloatingPoint from a raw floating-point number.
+ //
+ // On an Intel CPU, passing a non-normalized NAN (Not a Number)
+ // around may change its bits, although the new value is guaranteed
+ // to be also a NAN. Therefore, don't expect this constructor to
+ // preserve the bits in x when x is a NAN.
+ explicit FloatingPoint(const RawType& x) { u_.value_ = x; }
+
+ // Static methods
+
+ // Reinterprets a bit pattern as a floating-point number.
+ //
+ // This function is needed to test the AlmostEquals() method.
+ static RawType ReinterpretBits(const Bits bits) {
+ FloatingPoint fp(0);
+ fp.u_.bits_ = bits;
+ return fp.u_.value_;
+ }
+
+  // Returns the floating-point number that represents positive infinity.
+ static RawType Infinity() {
+ return ReinterpretBits(kExponentBitMask);
+ }
+
+ // Returns the maximum representable finite floating-point number.
+ static RawType Max();
+
+ // Non-static methods
+
+  // Returns the bits that represent this number.
+ const Bits &bits() const { return u_.bits_; }
+
+ // Returns the exponent bits of this number.
+ Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
+
+ // Returns the fraction bits of this number.
+ Bits fraction_bits() const { return kFractionBitMask & u_.bits_; }
+
+ // Returns the sign bit of this number.
+ Bits sign_bit() const { return kSignBitMask & u_.bits_; }
+
+ // Returns true iff this is NAN (not a number).
+ bool is_nan() const {
+ // It's a NAN if the exponent bits are all ones and the fraction
+ // bits are not entirely zeros.
+ return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0);
+ }
+
+ // Returns true iff this number is at most kMaxUlps ULP's away from
+ // rhs. In particular, this function:
+ //
+ // - returns false if either number is (or both are) NAN.
+ // - treats really large numbers as almost equal to infinity.
+  //   - thinks +0.0 and -0.0 are 0 ULP's apart.
+ bool AlmostEquals(const FloatingPoint& rhs) const {
+ // The IEEE standard says that any comparison operation involving
+ // a NAN must return false.
+ if (is_nan() || rhs.is_nan()) return false;
+
+ return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
+ <= kMaxUlps;
+ }
+
+ private:
+ // The data type used to store the actual floating-point number.
+ union FloatingPointUnion {
+ RawType value_; // The raw floating-point number.
+ Bits bits_; // The bits that represent the number.
+ };
+
+ // Converts an integer from the sign-and-magnitude representation to
+ // the biased representation. More precisely, let N be 2 to the
+  // power of (kBitCount - 1); then an integer x is represented by the
+ // unsigned number x + N.
+ //
+ // For instance,
+ //
+ // -N + 1 (the most negative number representable using
+ // sign-and-magnitude) is represented by 1;
+ // 0 is represented by N; and
+ // N - 1 (the biggest number representable using
+ // sign-and-magnitude) is represented by 2N - 1.
+ //
+ // Read http://en.wikipedia.org/wiki/Signed_number_representations
+ // for more details on signed number representations.
+ static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+ if (kSignBitMask & sam) {
+ // sam represents a negative number.
+ return ~sam + 1;
+ } else {
+ // sam represents a positive number.
+ return kSignBitMask | sam;
+ }
+ }
+
+ // Given two numbers in the sign-and-magnitude representation,
+ // returns the distance between them as an unsigned number.
+ static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
+ const Bits &sam2) {
+ const Bits biased1 = SignAndMagnitudeToBiased(sam1);
+ const Bits biased2 = SignAndMagnitudeToBiased(sam2);
+ return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
+ }
+
+ FloatingPointUnion u_;
+};
+
+// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
+// macro defined by <windows.h>.
+template <>
+inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+template <>
+inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+
+// Typedefs the instances of the FloatingPoint template class that we
+// care to use.
+typedef FloatingPoint<float> Float;
+typedef FloatingPoint<double> Double;
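+
+// Illustrative use (hypothetical values, not part of this header):
+//
+//   const Float lhs(1.0f);
+//   const Float rhs(1.0f + FLT_EPSILON);  // the next representable float,
+//                                         // i.e. 1 ULP above 1.0f
+//   lhs.AlmostEquals(rhs);                // true: the ULP distance is 1 <= kMaxUlps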
+
+// In order to catch the mistake of putting tests that use different
+// test fixture classes in the same test case, we need to assign
+// unique IDs to fixture classes and compare them. The TypeId type is
+// used to hold such IDs. The user should treat TypeId as an opaque
+// type: the only operation allowed on TypeId values is to compare
+// them for equality using the == operator.
+typedef const void* TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+ // dummy_ must not have a const type. Otherwise an overly eager
+ // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+ // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+ static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T. Different values will be
+// returned for different types. Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+ // The compiler is required to allocate a different
+ // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+ // the template. Therefore, the address of dummy_ is guaranteed to
+ // be unique.
+ return &(TypeIdHelper<T>::dummy_);
+}
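+
+// For instance, GetTypeId<int>() always returns the same value within a
+// program, and that value differs from GetTypeId<double>(), because each
+// instantiation of TypeIdHelper owns a distinct dummy_ variable.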
+
+// Returns the type ID of ::testing::Test. Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+ virtual ~TestFactoryBase() {}
+
+ // Creates a test instance to run. The instance is both created and destroyed
+ // within TestInfoImpl::Run()
+ virtual Test* CreateTest() = 0;
+
+ protected:
+ TestFactoryBase() {}
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides an implementation of the TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+ virtual Test* CreateTest() { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr,
+ long hr); // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr,
+ long hr); // NOLINT
+
+#endif // GTEST_OS_WINDOWS
+
+// Types of SetUpTestCase() and TearDownTestCase() functions.
+typedef void (*SetUpTestCaseFunc)();
+typedef void (*TearDownTestCaseFunc)();
+
+struct CodeLocation {
+ CodeLocation(const string& a_file, int a_line) : file(a_file), line(a_line) {}
+
+ string file;
+ int line;
+};
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+// test_case_name: name of the test case
+// name: name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a type-parameterized test.
+// code_location: code location where the test is defined
+// fixture_class_id: ID of the test fixture class
+// set_up_tc: pointer to the function that sets up the test case
+// tear_down_tc: pointer to the function that tears down the test case
+// factory: pointer to the factory that creates a test object.
+// The newly created TestInfo instance will assume
+// ownership of the factory object.
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo(
+ const char* test_case_name,
+ const char* name,
+ const char* type_param,
+ const char* value_param,
+ CodeLocation code_location,
+ TypeId fixture_class_id,
+ SetUpTestCaseFunc set_up_tc,
+ TearDownTestCaseFunc tear_down_tc,
+ TestFactoryBase* factory);
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false. None of pstr, *pstr, and prefix can be NULL.
+GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
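+
+// Example of the contract above (hypothetical buffer): given
+//   const char* p = "abc.def";
+// SkipPrefix("abc.", &p) returns true and leaves p pointing at "def", while
+// SkipPrefix("xyz", &p) returns false and leaves p unchanged.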
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// State of the definition of a type-parameterized test case.
+class GTEST_API_ TypedTestCasePState {
+ public:
+ TypedTestCasePState() : registered_(false) {}
+
+  // Adds the given test name to defined_test_names_ and returns true
+ // if the test case hasn't been registered; otherwise aborts the
+ // program.
+ bool AddTestName(const char* file, int line, const char* case_name,
+ const char* test_name) {
+ if (registered_) {
+ fprintf(stderr, "%s Test %s must be defined before "
+ "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n",
+ FormatFileLocation(file, line).c_str(), test_name, case_name);
+ fflush(stderr);
+ posix::Abort();
+ }
+ registered_tests_.insert(
+ ::std::make_pair(test_name, CodeLocation(file, line)));
+ return true;
+ }
+
+ bool TestExists(const std::string& test_name) const {
+ return registered_tests_.count(test_name) > 0;
+ }
+
+ const CodeLocation& GetCodeLocation(const std::string& test_name) const {
+ RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name);
+ GTEST_CHECK_(it != registered_tests_.end());
+ return it->second;
+ }
+
+ // Verifies that registered_tests match the test names in
+ // defined_test_names_; returns registered_tests if successful, or
+ // aborts the program otherwise.
+ const char* VerifyRegisteredTestNames(
+ const char* file, int line, const char* registered_tests);
+
+ private:
+ typedef ::std::map<std::string, CodeLocation> RegisteredTestsMap;
+
+ bool registered_;
+ RegisteredTestsMap registered_tests_;
+};
+
+// Skips to the first non-space char after the first comma in 'str';
+// returns NULL if no comma is found in 'str'.
+inline const char* SkipComma(const char* str) {
+ const char* comma = strchr(str, ',');
+ if (comma == NULL) {
+ return NULL;
+ }
+ while (IsSpace(*(++comma))) {}
+ return comma;
+}
+
+// Returns the prefix of 'str' before the first comma in it; returns
+// the entire string if it contains no comma.
+inline std::string GetPrefixUntilComma(const char* str) {
+ const char* comma = strchr(str, ',');
+ return comma == NULL ? str : std::string(str, comma);
+}
+
+// Splits a given string on a given delimiter, populating a given
+// vector with the fields.
+void SplitString(const ::std::string& str, char delimiter,
+ ::std::vector< ::std::string>* dest);
+
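These helpers, together with SplitString, parse the comma-separated name list that REGISTER_TYPED_TEST_CASE_P stringizes. A minimal standalone sketch of the same parsing idea (re-implemented so it compiles without the gtest internals; std::isspace stands in for the internal IsSpace, and all names here are hypothetical):

#include <cctype>
#include <cstring>
#include <iostream>
#include <string>

// Returns the name before the first comma, or the whole string if none.
static std::string PrefixUntilComma(const char* str) {
  const char* comma = std::strchr(str, ',');
  return comma == NULL ? std::string(str) : std::string(str, comma);
}

// Skips past the first comma and any following spaces; NULL if no comma.
static const char* AfterComma(const char* str) {
  const char* comma = std::strchr(str, ',');
  if (comma == NULL) return NULL;
  while (std::isspace(static_cast<unsigned char>(*(++comma)))) {}
  return comma;
}

int main() {
  const char* names = "DoesFoo, DoesBar, DoesBaz";
  for (const char* p = names; p != NULL; p = AfterComma(p))
    std::cout << PrefixUntilComma(p) << "\n";  // DoesFoo, DoesBar, DoesBaz
  return 0;
}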
+// TypeParameterizedTest<Fixture, TestSel, Types>::Register()
+// registers a list of type-parameterized tests with Google Test. The
+// return value is insignificant - we just need to return something
+// such that we can call this function in a namespace scope.
+//
+// Implementation note: The GTEST_TEMPLATE_ macro declares a template
+// template parameter. It's defined in gtest-type-util.h.
+template <GTEST_TEMPLATE_ Fixture, class TestSel, typename Types>
+class TypeParameterizedTest {
+ public:
+ // 'index' is the index of the test in the type list 'Types'
+ // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
+ // Types). Valid values for 'index' are [0, N - 1] where N is the
+ // length of Types.
+ static bool Register(const char* prefix,
+ CodeLocation code_location,
+ const char* case_name, const char* test_names,
+ int index) {
+ typedef typename Types::Head Type;
+ typedef Fixture<Type> FixtureClass;
+ typedef typename GTEST_BIND_(TestSel, Type) TestClass;
+
+ // First, registers the first type-parameterized test in the type
+ // list.
+ MakeAndRegisterTestInfo(
+ (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
+ + StreamableToString(index)).c_str(),
+ StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
+ GetTypeName<Type>().c_str(),
+ NULL, // No value parameter.
+ code_location,
+ GetTypeId<FixtureClass>(),
+ TestClass::SetUpTestCase,
+ TestClass::TearDownTestCase,
+ new TestFactoryImpl<TestClass>);
+
+ // Next, recurses (at compile time) with the tail of the type list.
+ return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
+ ::Register(prefix, code_location, case_name, test_names, index + 1);
+ }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, class TestSel>
+class TypeParameterizedTest<Fixture, TestSel, Types0> {
+ public:
+ static bool Register(const char* /*prefix*/, CodeLocation,
+ const char* /*case_name*/, const char* /*test_names*/,
+ int /*index*/) {
+ return true;
+ }
+};
+
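Register() above walks the type list at compile time: each instantiation handles Types::Head and then instantiates itself for Types::Tail, with the empty-list specialization ending the recursion. A self-contained sketch of that pattern (hypothetical Cons/Nil type list, printing type sizes instead of registering tests):

#include <iostream>

struct Nil {};                                   // empty type list
template <typename H, typename T> struct Cons {  // head plus tail
  typedef H Head;
  typedef T Tail;
};

template <typename List>
struct ForEach {
  static void Run(int index) {
    std::cout << "slot " << index << ": sizeof = "
              << sizeof(typename List::Head) << "\n";
    ForEach<typename List::Tail>::Run(index + 1);  // compile-time recursion
  }
};

template <>
struct ForEach<Nil> {  // base case, analogous to the Types0 specialization
  static void Run(int) {}
};

int main() {
  ForEach<Cons<char, Cons<int, Cons<double, Nil> > > >::Run(0);
  return 0;
}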
+// TypeParameterizedTestCase<Fixture, Tests, Types>::Register()
+// registers *all combinations* of 'Tests' and 'Types' with Google
+// Test. The return value is insignificant - we just need to return
+// something such that we can call this function in a namespace scope.
+template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
+class TypeParameterizedTestCase {
+ public:
+ static bool Register(const char* prefix, CodeLocation code_location,
+ const TypedTestCasePState* state,
+ const char* case_name, const char* test_names) {
+ std::string test_name = StripTrailingSpaces(
+ GetPrefixUntilComma(test_names));
+ if (!state->TestExists(test_name)) {
+ fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
+ case_name, test_name.c_str(),
+ FormatFileLocation(code_location.file.c_str(),
+ code_location.line).c_str());
+ fflush(stderr);
+ posix::Abort();
+ }
+ const CodeLocation& test_location = state->GetCodeLocation(test_name);
+
+ typedef typename Tests::Head Head;
+
+ // First, registers the first test in 'Tests' for each type in 'Types'.
+ TypeParameterizedTest<Fixture, Head, Types>::Register(
+ prefix, test_location, case_name, test_names, 0);
+
+ // Next, recurses (at compile time) with the tail of the test list.
+ return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
+ ::Register(prefix, code_location, state,
+ case_name, SkipComma(test_names));
+ }
+};
+
+// The base case for the compile time recursion.
+template <GTEST_TEMPLATE_ Fixture, typename Types>
+class TypeParameterizedTestCase<Fixture, Templates0, Types> {
+ public:
+ static bool Register(const char* /*prefix*/, CodeLocation,
+ const TypedTestCasePState* /*state*/,
+ const char* /*case_name*/, const char* /*test_names*/) {
+ return true;
+ }
+};
+
+#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag. The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
+ UnitTest* unit_test, int skip_count);
+
+// Helpers for suppressing warnings on unreachable code or constant
+// condition.
+
+// Always returns true.
+GTEST_API_ bool AlwaysTrue();
+
+// Always returns false.
+inline bool AlwaysFalse() { return !AlwaysTrue(); }
+
+// Helper for suppressing false warning from Clang on a const char*
+// variable declared in a conditional expression always being NULL in
+// the else branch.
+struct GTEST_API_ ConstCharPtr {
+ ConstCharPtr(const char* str) : value(str) {}
+ operator bool() const { return true; }
+ const char* value;
+};
+
+// A simple Linear Congruential Generator for generating random
+// numbers with a uniform distribution. Unlike rand() and srand(), it
+// doesn't use global state (and therefore can't interfere with user
+// code). Unlike rand_r(), it's portable. An LCG isn't very random,
+// but it's good enough for our purposes.
+class GTEST_API_ Random {
+ public:
+ static const UInt32 kMaxRange = 1u << 31;
+
+ explicit Random(UInt32 seed) : state_(seed) {}
+
+ void Reseed(UInt32 seed) { state_ = seed; }
+
+ // Generates a random number from [0, range). Crashes if 'range' is
+ // 0 or greater than kMaxRange.
+ UInt32 Generate(UInt32 range);
+
+ private:
+ UInt32 state_;
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+};
+
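A standalone sketch of the same kind of generator; the multiplier and increment below are the classic C-library LCG constants, chosen only for illustration and not necessarily the ones gtest uses internally:

#include <iostream>

class TinyLcg {
 public:
  explicit TinyLcg(unsigned seed) : state_(seed) {}

  // Returns a pseudo-random value in [0, range); 'range' must be positive.
  unsigned Generate(unsigned range) {
    // One LCG step: state = (a * state + c) mod 2^32, via unsigned wraparound.
    state_ = 1103515245u * state_ + 12345u;
    return state_ % range;
  }

 private:
  unsigned state_;
};

int main() {
  TinyLcg rng(42);
  for (int i = 0; i < 5; ++i)
    std::cout << rng.Generate(100) << " ";  // five values in [0, 100)
  std::cout << "\n";
  return 0;
}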
+// Defining a variable of type CompileAssertTypesEqual<T1, T2> will cause a
+// compiler error iff T1 and T2 are different types.
+template <typename T1, typename T2>
+struct CompileAssertTypesEqual;
+
+template <typename T>
+struct CompileAssertTypesEqual<T, T> {
+};
+
+// Removes the reference from a type if it is a reference type,
+// otherwise leaves it unchanged. This is the same as
+// tr1::remove_reference, which is not widely available yet.
+template <typename T>
+struct RemoveReference { typedef T type; }; // NOLINT
+template <typename T>
+struct RemoveReference<T&> { typedef T type; }; // NOLINT
+
+// A handy wrapper around RemoveReference that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_REFERENCE_(T) \
+ typename ::testing::internal::RemoveReference<T>::type
+
+// Removes const from a type if it is a const type, otherwise leaves
+// it unchanged. This is the same as tr1::remove_const, which is not
+// widely available yet.
+template <typename T>
+struct RemoveConst { typedef T type; }; // NOLINT
+template <typename T>
+struct RemoveConst<const T> { typedef T type; }; // NOLINT
+
+// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above
+// definition to fail to remove the const in 'const int[3]' and 'const
+// char[3][4]'. The following specialization works around the bug.
+template <typename T, size_t N>
+struct RemoveConst<const T[N]> {
+ typedef typename RemoveConst<T>::type type[N];
+};
+
+#if defined(_MSC_VER) && _MSC_VER < 1400
+// This is the only specialization that allows VC++ 7.1 to remove const in
+// 'const int[3]' and 'const int[3][4]'. However, it causes trouble with GCC
+// and thus needs to be conditionally compiled.
+template <typename T, size_t N>
+struct RemoveConst<T[N]> {
+ typedef typename RemoveConst<T>::type type[N];
+};
+#endif
+
+// A handy wrapper around RemoveConst that works when the argument
+// T depends on template parameters.
+#define GTEST_REMOVE_CONST_(T) \
+ typename ::testing::internal::RemoveConst<T>::type
+
+// Turns const U&, U&, const U, and U all into U.
+#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
+ GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
+
+// Adds a reference to a type if it is not a reference type,
+// otherwise leaves it unchanged. This is the same as
+// tr1::add_reference, which is not widely available yet.
+template <typename T>
+struct AddReference { typedef T& type; }; // NOLINT
+template <typename T>
+struct AddReference<T&> { typedef T& type; }; // NOLINT
+
+// A handy wrapper around AddReference that works when the argument T
+// depends on template parameters.
+#define GTEST_ADD_REFERENCE_(T) \
+ typename ::testing::internal::AddReference<T>::type
+
+// Adds a reference to const on top of T as necessary. For example,
+// it transforms
+//
+// char ==> const char&
+// const char ==> const char&
+// char& ==> const char&
+// const char& ==> const char&
+//
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+ GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
+
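These transformations behave like std::remove_reference, std::remove_const and an add-reference-to-const step from <type_traits>; the macros exist because gtest targets compilers that predate those traits. A compile-time check of my reading of the table above (standalone sketch, using C++11 static_assert only for brevity):

#include <type_traits>

// RefToConst mirrors GTEST_REFERENCE_TO_CONST_: strip the reference,
// then form a reference to const.
template <typename T>
struct RefToConst {
  typedef const typename std::remove_reference<T>::type& type;
};

static_assert(std::is_same<std::remove_reference<int&>::type, int>::value,
              "remove reference");
static_assert(std::is_same<
                  std::remove_const<
                      std::remove_reference<const int&>::type>::type,
                  int>::value,
              "remove reference and const");
static_assert(std::is_same<RefToConst<char>::type, const char&>::value,
              "char becomes const char&");
static_assert(std::is_same<RefToConst<const char&>::type, const char&>::value,
              "const char& stays const char&");

int main() { return 0; }  // all checks happen at compile time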
+// ImplicitlyConvertible<From, To>::value is a compile-time bool
+// constant that's true iff type From can be implicitly converted to
+// type To.
+template <typename From, typename To>
+class ImplicitlyConvertible {
+ private:
+ // We need the following helper functions only for their types.
+ // They have no implementations.
+
+ // MakeFrom() is an expression whose type is From. We cannot simply
+ // use From(), as the type From may not have a public default
+ // constructor.
+ static typename AddReference<From>::type MakeFrom();
+
+ // These two functions are overloaded. Given an expression
+ // Helper(x), the compiler will pick the first version if x can be
+ // implicitly converted to type To; otherwise it will pick the
+ // second version.
+ //
+ // The first version returns a value of size 1, and the second
+ // version returns a value of size 2. Therefore, by checking the
+ // size of Helper(x), which can be done at compile time, we can tell
+ // which version of Helper() is used, and hence whether x can be
+ // implicitly converted to type To.
+ static char Helper(To);
+ static char (&Helper(...))[2]; // NOLINT
+
+ // We have to put the 'public' section after the 'private' section,
+ // or MSVC refuses to compile the code.
+ public:
+#if defined(__BORLANDC__)
+ // C++Builder cannot use member overload resolution during template
+ // instantiation. The simplest workaround is to use its C++0x type traits
+ // functions (C++Builder 2009 and above only).
+ static const bool value = __is_convertible(From, To);
+#else
+ // MSVC warns about implicitly converting from double to int for
+ // possible loss of data, so we need to temporarily disable the
+ // warning.
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4244)
+ static const bool value =
+ sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1;
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif // __BORLANDC__
+};
+template <typename From, typename To>
+const bool ImplicitlyConvertible<From, To>::value;
+
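A standalone sketch of the overload-plus-sizeof probe used above, without the Borland and MSVC workarounds (the names here are hypothetical):

#include <iostream>

template <typename From, typename To>
class Convertible {
 private:
  static From MakeFrom();         // never defined; used only for its type
  static char Probe(To);          // chosen if From converts to To
  static char (&Probe(...))[2];   // fallback otherwise

 public:
  // sizeof only inspects the chosen overload's return type, so nothing is
  // actually called; the result is known at compile time.
  static const bool value = sizeof(Probe(MakeFrom())) == 1;
};
template <typename From, typename To>
const bool Convertible<From, To>::value;

struct Base {};
struct Derived : Base {};

int main() {
  std::cout << Convertible<Derived*, Base*>::value << "\n";  // 1
  std::cout << Convertible<Base*, Derived*>::value << "\n";  // 0
  std::cout << Convertible<double, int>::value << "\n";      // 1 (narrowing)
  return 0;
}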
+// IsAProtocolMessage<T>::value is a compile-time bool constant that's
+// true iff T is type ProtocolMessage, proto2::Message, or a subclass
+// of those.
+template <typename T>
+struct IsAProtocolMessage
+ : public bool_constant<
+ ImplicitlyConvertible<const T*, const ::ProtocolMessage*>::value ||
+ ImplicitlyConvertible<const T*, const ::proto2::Message*>::value> {
+};
+
+// When the compiler sees expression IsContainerTest<C>(0), if C is an
+// STL-style container class, the first overload of IsContainerTest
+// will be viable (since both C::iterator* and C::const_iterator* are
+// valid types and NULL can be implicitly converted to them). It will
+// be picked over the second overload as 'int' is a perfect match for
+// the type of argument 0. If C::iterator or C::const_iterator is not
+// a valid type, the first overload is not viable, and the second
+// overload will be picked. Therefore, we can determine whether C is
+// a container class by checking the type of IsContainerTest<C>(0).
+// The value of the expression is insignificant.
+//
+// Note that we look for both C::iterator and C::const_iterator. The
+// reason is that C++ injects the name of a class as a member of the
+// class itself (e.g. you can refer to class iterator as either
+// 'iterator' or 'iterator::iterator'). If we look for C::iterator
+// only, for example, we would mistakenly think that a class named
+// iterator is an STL container.
+//
+// Also note that the simpler approach of overloading
+// IsContainerTest(typename C::const_iterator*) and
+// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
+typedef int IsContainer;
+template <class C>
+IsContainer IsContainerTest(int /* dummy */,
+ typename C::iterator* /* it */ = NULL,
+ typename C::const_iterator* /* const_it */ = NULL) {
+ return 0;
+}
+
+typedef char IsNotContainer;
+template <class C>
+IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+
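The same two-overload probe exercised standalone: the size of the chosen overload's return type tells containers apart from everything else (hypothetical names mirroring the logic above):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

typedef int  YesContainer;  // returned by the container overload
typedef char NoContainer;   // returned by the fallback

template <class C>
YesContainer Probe(int,
                   typename C::iterator* = NULL,
                   typename C::const_iterator* = NULL) { return 0; }

template <class C>
NoContainer Probe(long) { return '\0'; }

template <class C>
struct IsStlContainer {
  // Overload resolution happens inside sizeof, entirely at compile time.
  static const bool value = sizeof(Probe<C>(0)) == sizeof(YesContainer);
};
template <class C>
const bool IsStlContainer<C>::value;

int main() {
  std::cout << IsStlContainer<std::vector<int> >::value << "\n";  // 1
  std::cout << IsStlContainer<std::string>::value << "\n";        // 1
  std::cout << IsStlContainer<double>::value << "\n";             // 0
  return 0;
}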
+// EnableIf<condition>::type is void when 'Cond' is true, and
+// undefined when 'Cond' is false. To use SFINAE to make a function
+// overload only apply when a particular expression is true, add
+// "typename EnableIf<expression>::type* = 0" as the last parameter.
+template<bool> struct EnableIf;
+template<> struct EnableIf<true> { typedef void type; }; // NOLINT
+
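A short usage sketch of the idiom that comment describes, re-implemented standalone (inside gtest one would use ::testing::internal::EnableIf itself):

#include <iostream>

template <bool> struct EnableIf;                       // undefined when false
template <>     struct EnableIf<true> { typedef void type; };

// Only one of these overloads survives substitution for any given N.
template <int N>
typename EnableIf<N % 2 == 0>::type Describe() {
  std::cout << N << " is even\n";
}

template <int N>
typename EnableIf<N % 2 != 0>::type Describe() {
  std::cout << N << " is odd\n";
}

int main() {
  Describe<4>();  // the odd overload fails to substitute, so this picks even
  Describe<7>();  // and vice versa
  return 0;
}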
+// Utilities for native arrays.
+
+// ArrayEq() compares two k-dimensional native arrays using the
+// elements' operator==, where k can be any integer >= 0. When k is
+// 0, ArrayEq() degenerates into comparing a single pair of values.
+
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+ return internal::ArrayEq(lhs, N, rhs);
+}
+
+// This helper reduces code bloat. If we instead put its logic inside
+// the previous ArrayEq() function, arrays with different sizes would
+// lead to different copies of the template code.
+template <typename T, typename U>
+bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
+ for (size_t i = 0; i != size; i++) {
+ if (!internal::ArrayEq(lhs[i], rhs[i]))
+ return false;
+ }
+ return true;
+}
+
+// Finds the first element in the iterator range [begin, end) that
+// equals elem. Element may be a native array type itself.
+template <typename Iter, typename Element>
+Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
+ for (Iter it = begin; it != end; ++it) {
+ if (internal::ArrayEq(*it, elem))
+ return it;
+ }
+ return end;
+}
+
+// CopyArray() copies a k-dimensional native array using the elements'
+// operator=, where k can be any integer >= 0. When k is 0,
+// CopyArray() degenerates into copying a single value.
+
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to);
+
+// This generic version is used when k is 0.
+template <typename T, typename U>
+inline void CopyArray(const T& from, U* to) { *to = from; }
+
+// This overload is used when k >= 1.
+template <typename T, typename U, size_t N>
+inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+ internal::CopyArray(from, N, *to);
+}
+
+// This helper reduces code bloat. If we instead put its logic inside
+// the previous CopyArray() function, arrays with different sizes
+// would lead to different copies of the template code.
+template <typename T, typename U>
+void CopyArray(const T* from, size_t size, U* to) {
+ for (size_t i = 0; i != size; i++) {
+ internal::CopyArray(from[i], to + i);
+ }
+}
+
+// The relation between a NativeArray object (see below) and the
+// native array it represents.
+// We use 2 different structs to allow non-copyable types to be used, as long
+// as RelationToSourceReference() is passed.
+struct RelationToSourceReference {};
+struct RelationToSourceCopy {};
+
+// Adapts a native array to a read-only STL-style container. Instead
+// of the complete STL container concept, this adaptor only implements
+// members useful for Google Mock's container matchers. New members
+// should be added as needed. To simplify the implementation, we only
+// support Element being a raw type (i.e. having no top-level const or
+// reference modifier). It's the client's responsibility to satisfy
+// this requirement. Element can be an array type itself (hence
+// multi-dimensional arrays are supported).
+template <typename Element>
+class NativeArray {
+ public:
+ // STL-style container typedefs.
+ typedef Element value_type;
+ typedef Element* iterator;
+ typedef const Element* const_iterator;
+
+ // Constructs from a native array. References the source.
+ NativeArray(const Element* array, size_t count, RelationToSourceReference) {
+ InitRef(array, count);
+ }
+
+ // Constructs from a native array. Copies the source.
+ NativeArray(const Element* array, size_t count, RelationToSourceCopy) {
+ InitCopy(array, count);
+ }
+
+ // Copy constructor.
+ NativeArray(const NativeArray& rhs) {
+ (this->*rhs.clone_)(rhs.array_, rhs.size_);
+ }
+
+ ~NativeArray() {
+ if (clone_ != &NativeArray::InitRef)
+ delete[] array_;
+ }
+
+ // STL-style container methods.
+ size_t size() const { return size_; }
+ const_iterator begin() const { return array_; }
+ const_iterator end() const { return array_ + size_; }
+ bool operator==(const NativeArray& rhs) const {
+ return size() == rhs.size() &&
+ ArrayEq(begin(), size(), rhs.begin());
+ }
+
+ private:
+ enum {
+ kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper<
+ Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value,
+ };
+
+ // Initializes this object with a copy of the input.
+ void InitCopy(const Element* array, size_t a_size) {
+ Element* const copy = new Element[a_size];
+ CopyArray(array, a_size, copy);
+ array_ = copy;
+ size_ = a_size;
+ clone_ = &NativeArray::InitCopy;
+ }
+
+ // Initializes this object with a reference to the input.
+ void InitRef(const Element* array, size_t a_size) {
+ array_ = array;
+ size_ = a_size;
+ clone_ = &NativeArray::InitRef;
+ }
+
+ const Element* array_;
+ size_t size_;
+ void (NativeArray::*clone_)(const Element*, size_t);
+
+ GTEST_DISALLOW_ASSIGN_(NativeArray);
+};
+
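A usage sketch of the adaptor (NativeArray is internal, so normal client code never names it directly; this assumes the gtest headers are on the include path and gtest is linked):

#include <iostream>
#include "gtest/gtest.h"

int main() {
  const int raw[] = {1, 2, 3};

  // Copies the three elements; the copy is freed in the destructor.
  ::testing::internal::NativeArray<int> copy(
      raw, 3, ::testing::internal::RelationToSourceCopy());

  // References the caller's storage; nothing is freed on destruction.
  ::testing::internal::NativeArray<int> view(
      raw, 3, ::testing::internal::RelationToSourceReference());

  std::cout << (copy == view) << "\n";  // 1: sizes and elements match
  for (::testing::internal::NativeArray<int>::const_iterator it = view.begin();
       it != view.end(); ++it) {
    std::cout << *it << " ";            // 1 2 3
  }
  std::cout << "\n";
  return 0;
}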
+} // namespace internal
+} // namespace testing
+
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+ ::testing::internal::AssertHelper(result_type, file, line, message) \
+ = ::testing::Message()
+
+#define GTEST_MESSAGE_(message, result_type) \
+ GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
+
+#define GTEST_FATAL_FAILURE_(message) \
+ return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure)
+
+#define GTEST_NONFATAL_FAILURE_(message) \
+ GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure)
+
+#define GTEST_SUCCESS_(message) \
+ GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
+
+// Suppresses MSVC warning 4702 (unreachable code) for the code following
+// statement if it returns or throws (or doesn't return or throw in some
+// situations).
+#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
+ if (::testing::internal::AlwaysTrue()) { statement; }
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::ConstCharPtr gtest_msg = "") { \
+ bool gtest_caught_expected = false; \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } \
+ catch (expected_exception const&) { \
+ gtest_caught_expected = true; \
+ } \
+ catch (...) { \
+ gtest_msg.value = \
+ "Expected: " #statement " throws an exception of type " \
+ #expected_exception ".\n Actual: it throws a different type."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+ } \
+ if (!gtest_caught_expected) { \
+ gtest_msg.value = \
+ "Expected: " #statement " throws an exception of type " \
+ #expected_exception ".\n Actual: it throws nothing."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
+ fail(gtest_msg.value)
+
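Two tricks carry this macro: the whole body is an if/else, so a macro use followed by a semicolon behaves as a single statement (the GTEST_AMBIGUOUS_ELSE_BLOCKER_ in front guards against dangling-else surprises), and failure paths jump to a label placed on the else branch, where the fail() call sees the message. A stripped-down standalone sketch of that shape (the CHECK_THROWS_INT macro is hypothetical; the real macros report through the fail() parameter and GTEST_MESSAGE_ rather than printing, and wrap the message pointer in ConstCharPtr to quiet a Clang warning):

#include <iostream>

#define MY_CONCAT_IMPL_(a, b) a##b
#define MY_CONCAT_(a, b) MY_CONCAT_IMPL_(a, b)

#define CHECK_THROWS_INT(statement)                                      \
  if (const char* my_msg = "") {                                         \
    bool caught = false;                                                 \
    try { statement; }                                                   \
    catch (int) { caught = true; }                                       \
    catch (...) {                                                        \
      my_msg = "it threw a different type";                              \
      goto MY_CONCAT_(my_label_, __LINE__);                              \
    }                                                                    \
    if (!caught) {                                                       \
      my_msg = "it threw nothing";                                       \
      goto MY_CONCAT_(my_label_, __LINE__);                              \
    }                                                                    \
  } else                                                                 \
    MY_CONCAT_(my_label_, __LINE__):                                     \
      std::cout << "Expected " #statement " to throw an int; "           \
                << my_msg << "\n"

void ThrowsInt()  { throw 42; }
void ThrowsChar() { throw 'x'; }
void Quiet()      {}

int main() {
  CHECK_THROWS_INT(ThrowsInt());   // passes silently
  CHECK_THROWS_INT(ThrowsChar());  // reports: it threw a different type
  CHECK_THROWS_INT(Quiet());       // reports: it threw nothing
  return 0;
}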
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } \
+ catch (...) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
+ fail("Expected: " #statement " doesn't throw an exception.\n" \
+ " Actual: it throws.")
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ bool gtest_caught_any = false; \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } \
+ catch (...) { \
+ gtest_caught_any = true; \
+ } \
+ if (!gtest_caught_any) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
+ fail("Expected: " #statement " throws an exception.\n" \
+ " Actual: it doesn't.")
+
+
+// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
+// either a boolean expression or an AssertionResult. text is a textual
+// representation of expression as it was passed into the EXPECT_TRUE.
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (const ::testing::AssertionResult gtest_ar_ = \
+ ::testing::AssertionResult(expression)) \
+ ; \
+ else \
+ fail(::testing::internal::GetBoolAssertionFailureMessage(\
+ gtest_ar_, text, #actual, #expected).c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
+ fail("Expected: " #statement " doesn't generate new fatal " \
+ "failures in the current thread.\n" \
+ " Actual: it does.")
+
+// Expands to the name of the class that implements the given test.
+#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \
+ test_case_name##_##test_name##_Test
+
+// Helper macro for defining tests.
+#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\
+class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\
+ public:\
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\
+ private:\
+ virtual void TestBody();\
+ static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(\
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\
+};\
+\
+::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\
+ ::test_info_ =\
+ ::testing::internal::MakeAndRegisterTestInfo(\
+ #test_case_name, #test_name, NULL, NULL, \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+ (parent_id), \
+ parent_class::SetUpTestCase, \
+ parent_class::TearDownTestCase, \
+ new ::testing::internal::TestFactoryImpl<\
+ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\
+void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
+
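GTEST_TEST_ is the expansion target of TEST() and TEST_F(): it defines the test class plus a static test_info_ member whose initializer calls MakeAndRegisterTestInfo() during static initialization, which is how a test becomes known to the runner before main(). Typical user-level code that exercises this path (assumes gtest, and gtest_main or an explicit main, are linked):

#include "gtest/gtest.h"

// Expands via GTEST_TEST_ into a class named FooTest_HandlesZero_Test derived
// from ::testing::Test, plus the registration of its TestInfo.
TEST(FooTest, HandlesZero) {
  EXPECT_EQ(0, 0);
}

// Without gtest_main, a main() like this drives the registered tests:
// int main(int argc, char** argv) {
//   ::testing::InitGoogleTest(&argc, &argv);
//   return RUN_ALL_TESTS();
// }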
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-linked_ptr.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-linked_ptr.h
new file mode 100644
index 000000000..360294221
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-linked_ptr.h
@@ -0,0 +1,243 @@
+// Copyright 2003 Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: Dan Egnor (egnor@google.com)
+//
+// A "smart" pointer type with reference tracking. Every pointer to a
+// particular object is kept on a circular linked list. When the last pointer
+// to an object is destroyed or reassigned, the object is deleted.
+//
+// Used properly, this deletes the object when the last reference goes away.
+// There are several caveats:
+// - Like all reference counting schemes, cycles lead to leaks.
+// - Each smart pointer is actually two pointers (8 bytes instead of 4).
+// - Every time a pointer is assigned, the entire list of pointers to that
+// object is traversed. This class is therefore NOT SUITABLE when there
+// will often be more than two or three pointers to a particular object.
+// - References are only tracked as long as linked_ptr<> objects are copied.
+// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS
+// will happen (double deletion).
+//
+// A good use of this class is storing object references in STL containers.
+// You can safely put linked_ptr<> in a vector<>.
+// Other uses may not be as good.
+//
+// Note: If you use an incomplete type with linked_ptr<>, the class
+// *containing* linked_ptr<> must have a constructor and destructor (even
+// if they do nothing!).
+//
+// Bill Gibbons suggested we use something like this.
+//
+// Thread Safety:
+// Unlike other linked_ptr implementations, in this implementation
+// a linked_ptr object is thread-safe in the sense that:
+// - it's safe to copy linked_ptr objects concurrently,
+// - it's safe to copy *from* a linked_ptr and read its underlying
+// raw pointer (e.g. via get()) concurrently, and
+// - it's safe to write to two linked_ptrs that point to the same
+// shared object concurrently.
+// TODO(wan@google.com): rename this to safe_linked_ptr to avoid
+// confusion with normal linked_ptr.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
+
+#include <stdlib.h>
+#include <assert.h>
+
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+namespace internal {
+
+// Protects copying of all linked_ptr objects.
+GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// This is used internally by all instances of linked_ptr<>. It needs to be
+// a non-template class because different types of linked_ptr<> can refer to
+// the same object (linked_ptr<Superclass>(obj) vs linked_ptr<Subclass>(obj)).
+// So, it needs to be possible for different types of linked_ptr to participate
+// in the same circular linked list, so we need a single class type here.
+//
+// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr<T>.
+class linked_ptr_internal {
+ public:
+ // Create a new circle that includes only this instance.
+ void join_new() {
+ next_ = this;
+ }
+
+ // Many linked_ptr operations may change p.link_ for some linked_ptr
+ // variable p in the same circle as this object. Therefore we need
+ // to prevent two such operations from occurring concurrently.
+ //
+ // Note that different types of linked_ptr objects can coexist in a
+ // circle (e.g. linked_ptr<Base>, linked_ptr<Derived1>, and
+ // linked_ptr<Derived2>). Therefore we must use a single mutex to
+ // protect all linked_ptr objects. This can create serious
+ // contention in production code, but is acceptable in a testing
+ // framework.
+
+ // Join an existing circle.
+ void join(linked_ptr_internal const* ptr)
+ GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+ MutexLock lock(&g_linked_ptr_mutex);
+
+ linked_ptr_internal const* p = ptr;
+ while (p->next_ != ptr) {
+ assert(p->next_ != this &&
+ "Trying to join() a linked ring we are already in. "
+ "Is GMock thread safety enabled?");
+ p = p->next_;
+ }
+ p->next_ = this;
+ next_ = ptr;
+ }
+
+ // Leave whatever circle we're part of. Returns true if we were the
+ // last member of the circle. Once this is done, you can join() another.
+ bool depart()
+ GTEST_LOCK_EXCLUDED_(g_linked_ptr_mutex) {
+ MutexLock lock(&g_linked_ptr_mutex);
+
+ if (next_ == this) return true;
+ linked_ptr_internal const* p = next_;
+ while (p->next_ != this) {
+ assert(p->next_ != next_ &&
+ "Trying to depart() a linked ring we are not in. "
+ "Is GMock thread safety enabled?");
+ p = p->next_;
+ }
+ p->next_ = next_;
+ return false;
+ }
+
+ private:
+ mutable linked_ptr_internal const* next_;
+};
+
+template <typename T>
+class linked_ptr {
+ public:
+ typedef T element_type;
+
+ // Take over ownership of a raw pointer. This should happen as soon as
+ // possible after the object is created.
+ explicit linked_ptr(T* ptr = NULL) { capture(ptr); }
+ ~linked_ptr() { depart(); }
+
+ // Copy an existing linked_ptr<>, adding ourselves to the list of references.
+ template <typename U> linked_ptr(linked_ptr<U> const& ptr) { copy(&ptr); }
+ linked_ptr(linked_ptr const& ptr) { // NOLINT
+ assert(&ptr != this);
+ copy(&ptr);
+ }
+
+ // Assignment releases the old value and acquires the new.
+ template <typename U> linked_ptr& operator=(linked_ptr<U> const& ptr) {
+ depart();
+ copy(&ptr);
+ return *this;
+ }
+
+ linked_ptr& operator=(linked_ptr const& ptr) {
+ if (&ptr != this) {
+ depart();
+ copy(&ptr);
+ }
+ return *this;
+ }
+
+ // Smart pointer members.
+ void reset(T* ptr = NULL) {
+ depart();
+ capture(ptr);
+ }
+ T* get() const { return value_; }
+ T* operator->() const { return value_; }
+ T& operator*() const { return *value_; }
+
+ bool operator==(T* p) const { return value_ == p; }
+ bool operator!=(T* p) const { return value_ != p; }
+ template <typename U>
+ bool operator==(linked_ptr<U> const& ptr) const {
+ return value_ == ptr.get();
+ }
+ template <typename U>
+ bool operator!=(linked_ptr<U> const& ptr) const {
+ return value_ != ptr.get();
+ }
+
+ private:
+ template <typename U>
+ friend class linked_ptr;
+
+ T* value_;
+ linked_ptr_internal link_;
+
+ void depart() {
+ if (link_.depart()) delete value_;
+ }
+
+ void capture(T* ptr) {
+ value_ = ptr;
+ link_.join_new();
+ }
+
+ template <typename U> void copy(linked_ptr<U> const* ptr) {
+ value_ = ptr->get();
+ if (value_)
+ link_.join(&ptr->link_);
+ else
+ link_.join_new();
+ }
+};
+
+template<typename T> inline
+bool operator==(T* ptr, const linked_ptr<T>& x) {
+ return ptr == x.get();
+}
+
+template<typename T> inline
+bool operator!=(T* ptr, const linked_ptr<T>& x) {
+ return ptr != x.get();
+}
+
+// A function to convert T* into linked_ptr<T>
+// Doing e.g. make_linked_ptr(new FooBarBaz<type>(arg)) is a shorter notation
+// for linked_ptr<FooBarBaz<type> >(new FooBarBaz<type>(arg))
+template <typename T>
+linked_ptr<T> make_linked_ptr(T* ptr) {
+ return linked_ptr<T>(ptr);
+}
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_
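A usage sketch of this class (linked_ptr is internal to gtest/gmock, so the internal namespace is spelled out; assumes the gtest headers are on the include path and the gtest library is linked for the mutex definition):

#include <iostream>
#include <vector>
#include "gtest/internal/gtest-linked_ptr.h"

using ::testing::internal::linked_ptr;
using ::testing::internal::make_linked_ptr;

struct Widget {
  explicit Widget(int an_id) : id(an_id) {}
  ~Widget() { std::cout << "deleting widget " << id << "\n"; }
  int id;
};

int main() {
  // Both copies join the same reference ring; the Widget is deleted once,
  // when the last linked_ptr pointing at it goes away.
  linked_ptr<Widget> a = make_linked_ptr(new Widget(1));
  linked_ptr<Widget> b = a;
  std::cout << (a == b.get()) << "\n";  // 1: same underlying pointer

  // Safe to store in STL containers, the intended use described above.
  std::vector<linked_ptr<Widget> > widgets;
  widgets.push_back(a);
  widgets.push_back(make_linked_ptr(new Widget(2)));
  return 0;
}  // each Widget is deleted exactly once as its last reference disappears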
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h
new file mode 100644
index 000000000..4d1d81d20
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h
@@ -0,0 +1,5146 @@
+// This file was GENERATED by command:
+// pump.py gtest-param-util-generated.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+// This file is generated by a SCRIPT. DO NOT EDIT BY HAND!
+//
+// Currently Google Test supports at most 50 arguments in Values,
+// and at most 10 arguments in Combine. Please contact
+// googletestframework@googlegroups.com if you need more.
+// Please note that the number of arguments to Combine is limited
+// by the maximum arity of the implementation of tuple which is
+// currently set at 10.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*. Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+#include "gtest/internal/gtest-param-util.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <typename ForwardIterator>
+internal::ParamGenerator<
+ typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end);
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+ const Container& container);
+
+namespace internal {
+
+// Used in the Values() function to provide polymorphic capabilities.
+template <typename T1>
+class ValueArray1 {
+ public:
+ explicit ValueArray1(T1 v1) : v1_(v1) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray1& other);
+
+ const T1 v1_;
+};
+
+template <typename T1, typename T2>
+class ValueArray2 {
+ public:
+ ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray2& other);
+
+ const T1 v1_;
+ const T2 v2_;
+};
+
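Each ValueArrayN here is the object returned by Values(v1, ..., vN); the templated conversion operator is what lets a single Values() call feed parameterized tests with different parameter types, static_casting each stored value when the ParamGenerator<T> is finally built. Typical public-API use (assumes GTEST_HAS_PARAM_TEST and a linked gtest):

#include "gtest/gtest.h"

class WidthTest : public ::testing::TestWithParam<int> {};

TEST_P(WidthTest, IsPositive) {
  EXPECT_GT(GetParam(), 0);
}

// Values(8, 16, 32) returns a ValueArray3<int, int, int>; it converts to
// ParamGenerator<int> because the fixture's parameter type is int.
INSTANTIATE_TEST_CASE_P(CommonWidths, WidthTest,
                        ::testing::Values(8, 16, 32));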
+template <typename T1, typename T2, typename T3>
+class ValueArray3 {
+ public:
+ ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray3& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+class ValueArray4 {
+ public:
+ ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3),
+ v4_(v4) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray4& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class ValueArray5 {
+ public:
+ ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3),
+ v4_(v4), v5_(v5) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray5& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+class ValueArray6 {
+ public:
+ ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2),
+ v3_(v3), v4_(v4), v5_(v5), v6_(v6) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray6& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7>
+class ValueArray7 {
+ public:
+ ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1),
+ v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray7& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8>
+class ValueArray8 {
+ public:
+ ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7,
+ T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray8& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9>
+class ValueArray9 {
+ public:
+ ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8,
+ T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray9& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10>
+class ValueArray10 {
+ public:
+ ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray10& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11>
+class ValueArray11 {
+ public:
+ ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+ v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray11& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12>
+class ValueArray12 {
+ public:
+ ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+ v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray12& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13>
+class ValueArray13 {
+ public:
+ ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+ v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+ v12_(v12), v13_(v13) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray13& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14>
+class ValueArray14 {
+ public:
+ ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3),
+ v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray14& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15>
+class ValueArray15 {
+ public:
+ ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2),
+ v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray15& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16>
+class ValueArray16 {
+ public:
+ ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1),
+ v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+ v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+ v16_(v16) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray16& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17>
+class ValueArray17 {
+ public:
+ ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16,
+ T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray17& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18>
+class ValueArray18 {
+ public:
+ ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray18& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19>
+class ValueArray19 {
+ public:
+ ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+ v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+ v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray19& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20>
+class ValueArray20 {
+ public:
+ ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+ v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+ v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+ v19_(v19), v20_(v20) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray20& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21>
+class ValueArray21 {
+ public:
+ ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+ v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+ v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+ v18_(v18), v19_(v19), v20_(v20), v21_(v21) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray21& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22>
+class ValueArray22 {
+ public:
+ ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3),
+ v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+ v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray22& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23>
+class ValueArray23 {
+ public:
+ ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2),
+ v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+ v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+ v23_(v23) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray23& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24>
+class ValueArray24 {
+ public:
+ ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1),
+ v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+ v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+ v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+ v22_(v22), v23_(v23), v24_(v24) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray24& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25>
+class ValueArray25 {
+ public:
+ ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24,
+ T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+ v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray25& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26>
+class ValueArray26 {
+ public:
+ ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+ v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray26& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27>
+class ValueArray27 {
+ public:
+ ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+ v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+ v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+ v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+ v26_(v26), v27_(v27) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray27& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28>
+class ValueArray28 {
+ public:
+ ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+ v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+ v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+ v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+ v25_(v25), v26_(v26), v27_(v27), v28_(v28) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray28& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29>
+class ValueArray29 {
+ public:
+ ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+ v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+ v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+ v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+ v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray29& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30>
+class ValueArray30 {
+ public:
+ ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3),
+ v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+ v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+ v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+ v29_(v29), v30_(v30) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray30& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31>
+class ValueArray31 {
+ public:
+ ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2),
+ v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+ v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+ v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+ v29_(v29), v30_(v30), v31_(v31) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray31& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32>
+class ValueArray32 {
+ public:
+ ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1),
+ v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+ v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+ v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+ v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+ v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray32& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33>
+class ValueArray33 {
+ public:
+ ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32,
+ T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+ v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+ v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+ v33_(v33) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray33& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34>
+class ValueArray34 {
+ public:
+ ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+ v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+ v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+ v33_(v33), v34_(v34) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray34& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35>
+class ValueArray35 {
+ public:
+ ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+ v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+ v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+ v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+ v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+ v32_(v32), v33_(v33), v34_(v34), v35_(v35) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray35& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36>
+class ValueArray36 {
+ public:
+ ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+ v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+ v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+ v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+ v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+ v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray36& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37>
+class ValueArray37 {
+ public:
+ ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+ v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+ v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+ v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+ v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+ v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+ v36_(v36), v37_(v37) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray37& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38>
+class ValueArray38 {
+ public:
+ ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3),
+ v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+ v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+ v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+ v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+ v35_(v35), v36_(v36), v37_(v37), v38_(v38) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray38& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39>
+class ValueArray39 {
+ public:
+ ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2),
+ v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+ v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+ v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+ v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+ v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray39& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40>
+class ValueArray40 {
+ public:
+ ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1),
+ v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+ v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+ v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+ v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+ v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+ v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+ v40_(v40) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray40& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41>
+class ValueArray41 {
+ public:
+ ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40,
+ T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+ v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+ v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+ v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+ v39_(v39), v40_(v40), v41_(v41) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray41& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42>
+class ValueArray42 {
+ public:
+ ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+ v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+ v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+ v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+ v39_(v39), v40_(v40), v41_(v41), v42_(v42) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray42& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43>
+class ValueArray43 {
+ public:
+ ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6),
+ v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13),
+ v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19),
+ v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25),
+ v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31),
+ v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37),
+ v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_), static_cast<T>(v43_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray43& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+ const T43 v43_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44>
+class ValueArray44 {
+ public:
+ ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5),
+ v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12),
+ v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18),
+ v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24),
+ v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30),
+ v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36),
+ v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42),
+ v43_(v43), v44_(v44) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray44& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+ const T43 v43_;
+ const T44 v44_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45>
+class ValueArray45 {
+ public:
+ ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4),
+ v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11),
+ v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17),
+ v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23),
+ v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29),
+ v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35),
+ v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41),
+ v42_(v42), v43_(v43), v44_(v44), v45_(v45) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+ static_cast<T>(v45_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray45& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+ const T43 v43_;
+ const T44 v44_;
+ const T45 v45_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46>
+class ValueArray46 {
+ public:
+ ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3),
+ v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+ v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+ v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+ v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+ v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+ v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+ static_cast<T>(v45_), static_cast<T>(v46_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray46& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+ const T43 v43_;
+ const T44 v44_;
+ const T45 v45_;
+ const T46 v46_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47>
+class ValueArray47 {
+ public:
+ ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2),
+ v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10),
+ v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16),
+ v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22),
+ v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28),
+ v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34),
+ v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40),
+ v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46),
+ v47_(v47) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+ static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray47& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+ const T43 v43_;
+ const T44 v44_;
+ const T45 v45_;
+ const T46 v46_;
+ const T47 v47_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48>
+class ValueArray48 {
+ public:
+ ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1),
+ v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9),
+ v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15),
+ v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21),
+ v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27),
+ v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33),
+ v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39),
+ v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45),
+ v46_(v46), v47_(v47), v48_(v48) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+ static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+ static_cast<T>(v48_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray48& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+ const T43 v43_;
+ const T44 v44_;
+ const T45 v45_;
+ const T46 v46_;
+ const T47 v47_;
+ const T48 v48_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48, typename T49>
+class ValueArray49 {
+ public:
+ ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48,
+ T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+ v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+ v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+ v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+ v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+ v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+ static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+ static_cast<T>(v48_), static_cast<T>(v49_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray49& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+ const T43 v43_;
+ const T44 v44_;
+ const T45 v45_;
+ const T46 v46_;
+ const T47 v47_;
+ const T48 v48_;
+ const T49 v49_;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48, typename T49, typename T50>
+class ValueArray50 {
+ public:
+ ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9,
+ T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17,
+ T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25,
+ T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33,
+ T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41,
+ T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49,
+ T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7),
+ v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14),
+ v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20),
+ v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26),
+ v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32),
+ v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38),
+ v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44),
+ v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_),
+ static_cast<T>(v3_), static_cast<T>(v4_), static_cast<T>(v5_),
+ static_cast<T>(v6_), static_cast<T>(v7_), static_cast<T>(v8_),
+ static_cast<T>(v9_), static_cast<T>(v10_), static_cast<T>(v11_),
+ static_cast<T>(v12_), static_cast<T>(v13_), static_cast<T>(v14_),
+ static_cast<T>(v15_), static_cast<T>(v16_), static_cast<T>(v17_),
+ static_cast<T>(v18_), static_cast<T>(v19_), static_cast<T>(v20_),
+ static_cast<T>(v21_), static_cast<T>(v22_), static_cast<T>(v23_),
+ static_cast<T>(v24_), static_cast<T>(v25_), static_cast<T>(v26_),
+ static_cast<T>(v27_), static_cast<T>(v28_), static_cast<T>(v29_),
+ static_cast<T>(v30_), static_cast<T>(v31_), static_cast<T>(v32_),
+ static_cast<T>(v33_), static_cast<T>(v34_), static_cast<T>(v35_),
+ static_cast<T>(v36_), static_cast<T>(v37_), static_cast<T>(v38_),
+ static_cast<T>(v39_), static_cast<T>(v40_), static_cast<T>(v41_),
+ static_cast<T>(v42_), static_cast<T>(v43_), static_cast<T>(v44_),
+ static_cast<T>(v45_), static_cast<T>(v46_), static_cast<T>(v47_),
+ static_cast<T>(v48_), static_cast<T>(v49_), static_cast<T>(v50_)};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray50& other);
+
+ const T1 v1_;
+ const T2 v2_;
+ const T3 v3_;
+ const T4 v4_;
+ const T5 v5_;
+ const T6 v6_;
+ const T7 v7_;
+ const T8 v8_;
+ const T9 v9_;
+ const T10 v10_;
+ const T11 v11_;
+ const T12 v12_;
+ const T13 v13_;
+ const T14 v14_;
+ const T15 v15_;
+ const T16 v16_;
+ const T17 v17_;
+ const T18 v18_;
+ const T19 v19_;
+ const T20 v20_;
+ const T21 v21_;
+ const T22 v22_;
+ const T23 v23_;
+ const T24 v24_;
+ const T25 v25_;
+ const T26 v26_;
+ const T27 v27_;
+ const T28 v28_;
+ const T29 v29_;
+ const T30 v30_;
+ const T31 v31_;
+ const T32 v32_;
+ const T33 v33_;
+ const T34 v34_;
+ const T35 v35_;
+ const T36 v36_;
+ const T37 v37_;
+ const T38 v38_;
+ const T39 v39_;
+ const T40 v40_;
+ const T41 v41_;
+ const T42 v42_;
+ const T43 v43_;
+ const T44 v44_;
+ const T45 v45_;
+ const T46 v46_;
+ const T47 v47_;
+ const T48 v48_;
+ const T49 v49_;
+ const T50 v50_;
+};
+
+# if GTEST_HAS_COMBINE
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Generates values from the Cartesian product of values produced
+// by the argument generators.
+//
+template <typename T1, typename T2>
+class CartesianProductGenerator2
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2> > {
+ public:
+ typedef ::testing::tuple<T1, T2> ParamType;
+
+ CartesianProductGenerator2(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2)
+ : g1_(g1), g2_(g2) {}
+ virtual ~CartesianProductGenerator2() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance should not be called on an iterator that is past the end of its
+    // range, so no component iterator may be past the end of its range either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current2_;
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range as soon as any
+      // of the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator2::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator2& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+}; // class CartesianProductGenerator2
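// --------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the vendored gtest patch above):
// how the CartesianProductGenerator classes defined here are normally reached
// through the public ::testing::Combine() API when GTEST_HAS_COMBINE is
// defined. The fixture name, parameter types, and values below are
// hypothetical, and the use of ::testing::get assumes the tuple accessors
// this gtest version exposes in the ::testing namespace.
#include "gtest/gtest.h"

class ScalingTest
    : public ::testing::TestWithParam< ::testing::tuple<int, bool> > {};

TEST_P(ScalingTest, HandlesEveryCombination) {
  const int size = ::testing::get<0>(GetParam());         // from the first generator
  const bool compressed = ::testing::get<1>(GetParam());  // from the second generator
  (void)size;
  (void)compressed;
  // ... exercise the code under test with (size, compressed) ...
}

// Combine() yields a holder that converts to a ParamGenerator<tuple<int, bool>>
// backed by a CartesianProductGenerator2<int, bool>, so all six combinations
// (3 sizes x 2 flags) are enumerated by the Iterator defined above.
INSTANTIATE_TEST_CASE_P(AllSizes, ScalingTest,
                        ::testing::Combine(::testing::Values(1, 8, 64),
                                           ::testing::Bool()));
// --------------------------------------------------------------------------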
+
+
+template <typename T1, typename T2, typename T3>
+class CartesianProductGenerator3
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3> > {
+ public:
+ typedef ::testing::tuple<T1, T2, T3> ParamType;
+
+ CartesianProductGenerator3(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3)
+ : g1_(g1), g2_(g2), g3_(g3) {}
+ virtual ~CartesianProductGenerator3() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+ g3_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2,
+ const ParamGenerator<T3>& g3,
+ const typename ParamGenerator<T3>::iterator& current3)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+ begin3_(g3.begin()), end3_(g3.end()), current3_(current3) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance should not be called on an iterator that is past the end of its
+    // range, so no component iterator may be past the end of its range either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current3_;
+ if (current3_ == end3_) {
+ current3_ = begin3_;
+ ++current2_;
+ }
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_ &&
+ current3_ == typed_other->current3_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_),
+ begin3_(other.begin3_),
+ end3_(other.end3_),
+ current3_(other.current3_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_, *current3_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range as soon as any
+      // of the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_ ||
+ current3_ == end3_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ const typename ParamGenerator<T3>::iterator begin3_;
+ const typename ParamGenerator<T3>::iterator end3_;
+ typename ParamGenerator<T3>::iterator current3_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator3::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator3& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+ const ParamGenerator<T3> g3_;
+}; // class CartesianProductGenerator3
+
+
+template <typename T1, typename T2, typename T3, typename T4>
+class CartesianProductGenerator4
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4> > {
+ public:
+ typedef ::testing::tuple<T1, T2, T3, T4> ParamType;
+
+ CartesianProductGenerator4(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+ const ParamGenerator<T4>& g4)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+ virtual ~CartesianProductGenerator4() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+ g3_.begin(), g4_, g4_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+ g4_, g4_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2,
+ const ParamGenerator<T3>& g3,
+ const typename ParamGenerator<T3>::iterator& current3,
+ const ParamGenerator<T4>& g4,
+ const typename ParamGenerator<T4>::iterator& current4)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+ begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+ begin4_(g4.begin()), end4_(g4.end()), current4_(current4) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance should not be called on an iterator that is past the end of its
+    // range, so no component iterator may be past the end of its range either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current4_;
+ if (current4_ == end4_) {
+ current4_ = begin4_;
+ ++current3_;
+ }
+ if (current3_ == end3_) {
+ current3_ = begin3_;
+ ++current2_;
+ }
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_ &&
+ current3_ == typed_other->current3_ &&
+ current4_ == typed_other->current4_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_),
+ begin3_(other.begin3_),
+ end3_(other.end3_),
+ current3_(other.current3_),
+ begin4_(other.begin4_),
+ end4_(other.end4_),
+ current4_(other.current4_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_, *current3_,
+ *current4_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range as soon as any
+      // of the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_ ||
+ current3_ == end3_ ||
+ current4_ == end4_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ const typename ParamGenerator<T3>::iterator begin3_;
+ const typename ParamGenerator<T3>::iterator end3_;
+ typename ParamGenerator<T3>::iterator current3_;
+ const typename ParamGenerator<T4>::iterator begin4_;
+ const typename ParamGenerator<T4>::iterator end4_;
+ typename ParamGenerator<T4>::iterator current4_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator4::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator4& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+ const ParamGenerator<T3> g3_;
+ const ParamGenerator<T4> g4_;
+}; // class CartesianProductGenerator4
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class CartesianProductGenerator5
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5> > {
+ public:
+ typedef ::testing::tuple<T1, T2, T3, T4, T5> ParamType;
+
+ CartesianProductGenerator5(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+ const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+ virtual ~CartesianProductGenerator5() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+ g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+ g4_, g4_.end(), g5_, g5_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2,
+ const ParamGenerator<T3>& g3,
+ const typename ParamGenerator<T3>::iterator& current3,
+ const ParamGenerator<T4>& g4,
+ const typename ParamGenerator<T4>::iterator& current4,
+ const ParamGenerator<T5>& g5,
+ const typename ParamGenerator<T5>::iterator& current5)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+ begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+ begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+ begin5_(g5.begin()), end5_(g5.end()), current5_(current5) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance should not be called on an iterator that is past the end of its
+    // range, so no component iterator may be past the end of its range either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current5_;
+ if (current5_ == end5_) {
+ current5_ = begin5_;
+ ++current4_;
+ }
+ if (current4_ == end4_) {
+ current4_ = begin4_;
+ ++current3_;
+ }
+ if (current3_ == end3_) {
+ current3_ = begin3_;
+ ++current2_;
+ }
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_ &&
+ current3_ == typed_other->current3_ &&
+ current4_ == typed_other->current4_ &&
+ current5_ == typed_other->current5_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_),
+ begin3_(other.begin3_),
+ end3_(other.end3_),
+ current3_(other.current3_),
+ begin4_(other.begin4_),
+ end4_(other.end4_),
+ current4_(other.current4_),
+ begin5_(other.begin5_),
+ end5_(other.end5_),
+ current5_(other.current5_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_, *current3_,
+ *current4_, *current5_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range as soon as any
+      // of the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_ ||
+ current3_ == end3_ ||
+ current4_ == end4_ ||
+ current5_ == end5_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ const typename ParamGenerator<T3>::iterator begin3_;
+ const typename ParamGenerator<T3>::iterator end3_;
+ typename ParamGenerator<T3>::iterator current3_;
+ const typename ParamGenerator<T4>::iterator begin4_;
+ const typename ParamGenerator<T4>::iterator end4_;
+ typename ParamGenerator<T4>::iterator current4_;
+ const typename ParamGenerator<T5>::iterator begin5_;
+ const typename ParamGenerator<T5>::iterator end5_;
+ typename ParamGenerator<T5>::iterator current5_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator5::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator5& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+ const ParamGenerator<T3> g3_;
+ const ParamGenerator<T4> g4_;
+ const ParamGenerator<T5> g5_;
+}; // class CartesianProductGenerator5
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+class CartesianProductGenerator6
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5,
+ T6> > {
+ public:
+ typedef ::testing::tuple<T1, T2, T3, T4, T5, T6> ParamType;
+
+ CartesianProductGenerator6(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+ const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+ const ParamGenerator<T6>& g6)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+ virtual ~CartesianProductGenerator6() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+ g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+ g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2,
+ const ParamGenerator<T3>& g3,
+ const typename ParamGenerator<T3>::iterator& current3,
+ const ParamGenerator<T4>& g4,
+ const typename ParamGenerator<T4>::iterator& current4,
+ const ParamGenerator<T5>& g5,
+ const typename ParamGenerator<T5>::iterator& current5,
+ const ParamGenerator<T6>& g6,
+ const typename ParamGenerator<T6>::iterator& current6)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+ begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+ begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+ begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+ begin6_(g6.begin()), end6_(g6.end()), current6_(current6) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance should not be called on an iterator that is past the end of its
+    // range, so no component iterator may be past the end of its range either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current6_;
+ if (current6_ == end6_) {
+ current6_ = begin6_;
+ ++current5_;
+ }
+ if (current5_ == end5_) {
+ current5_ = begin5_;
+ ++current4_;
+ }
+ if (current4_ == end4_) {
+ current4_ = begin4_;
+ ++current3_;
+ }
+ if (current3_ == end3_) {
+ current3_ = begin3_;
+ ++current2_;
+ }
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_ &&
+ current3_ == typed_other->current3_ &&
+ current4_ == typed_other->current4_ &&
+ current5_ == typed_other->current5_ &&
+ current6_ == typed_other->current6_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_),
+ begin3_(other.begin3_),
+ end3_(other.end3_),
+ current3_(other.current3_),
+ begin4_(other.begin4_),
+ end4_(other.end4_),
+ current4_(other.current4_),
+ begin5_(other.begin5_),
+ end5_(other.end5_),
+ current5_(other.current5_),
+ begin6_(other.begin6_),
+ end6_(other.end6_),
+ current6_(other.current6_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_, *current3_,
+ *current4_, *current5_, *current6_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range as soon as any
+      // of the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_ ||
+ current3_ == end3_ ||
+ current4_ == end4_ ||
+ current5_ == end5_ ||
+ current6_ == end6_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ const typename ParamGenerator<T3>::iterator begin3_;
+ const typename ParamGenerator<T3>::iterator end3_;
+ typename ParamGenerator<T3>::iterator current3_;
+ const typename ParamGenerator<T4>::iterator begin4_;
+ const typename ParamGenerator<T4>::iterator end4_;
+ typename ParamGenerator<T4>::iterator current4_;
+ const typename ParamGenerator<T5>::iterator begin5_;
+ const typename ParamGenerator<T5>::iterator end5_;
+ typename ParamGenerator<T5>::iterator current5_;
+ const typename ParamGenerator<T6>::iterator begin6_;
+ const typename ParamGenerator<T6>::iterator end6_;
+ typename ParamGenerator<T6>::iterator current6_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator6::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator6& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+ const ParamGenerator<T3> g3_;
+ const ParamGenerator<T4> g4_;
+ const ParamGenerator<T5> g5_;
+ const ParamGenerator<T6> g6_;
+}; // class CartesianProductGenerator6
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7>
+class CartesianProductGenerator7
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+ T7> > {
+ public:
+ typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7> ParamType;
+
+ CartesianProductGenerator7(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+ const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+ const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+ virtual ~CartesianProductGenerator7() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+ g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+ g7_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+ g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2,
+ const ParamGenerator<T3>& g3,
+ const typename ParamGenerator<T3>::iterator& current3,
+ const ParamGenerator<T4>& g4,
+ const typename ParamGenerator<T4>::iterator& current4,
+ const ParamGenerator<T5>& g5,
+ const typename ParamGenerator<T5>::iterator& current5,
+ const ParamGenerator<T6>& g6,
+ const typename ParamGenerator<T6>::iterator& current6,
+ const ParamGenerator<T7>& g7,
+ const typename ParamGenerator<T7>::iterator& current7)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+ begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+ begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+ begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+ begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+ begin7_(g7.begin()), end7_(g7.end()), current7_(current7) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance() should not be called on an iterator that is past the end of
+    // the range, so no component iterator may be past its end either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current7_;
+ if (current7_ == end7_) {
+ current7_ = begin7_;
+ ++current6_;
+ }
+ if (current6_ == end6_) {
+ current6_ = begin6_;
+ ++current5_;
+ }
+ if (current5_ == end5_) {
+ current5_ = begin5_;
+ ++current4_;
+ }
+ if (current4_ == end4_) {
+ current4_ = begin4_;
+ ++current3_;
+ }
+ if (current3_ == end3_) {
+ current3_ = begin3_;
+ ++current2_;
+ }
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_ &&
+ current3_ == typed_other->current3_ &&
+ current4_ == typed_other->current4_ &&
+ current5_ == typed_other->current5_ &&
+ current6_ == typed_other->current6_ &&
+ current7_ == typed_other->current7_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_),
+ begin3_(other.begin3_),
+ end3_(other.end3_),
+ current3_(other.current3_),
+ begin4_(other.begin4_),
+ end4_(other.end4_),
+ current4_(other.current4_),
+ begin5_(other.begin5_),
+ end5_(other.end5_),
+ current5_(other.current5_),
+ begin6_(other.begin6_),
+ end6_(other.end6_),
+ current6_(other.current6_),
+ begin7_(other.begin7_),
+ end7_(other.end7_),
+ current7_(other.current7_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_, *current3_,
+ *current4_, *current5_, *current6_, *current7_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_ ||
+ current3_ == end3_ ||
+ current4_ == end4_ ||
+ current5_ == end5_ ||
+ current6_ == end6_ ||
+ current7_ == end7_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ const typename ParamGenerator<T3>::iterator begin3_;
+ const typename ParamGenerator<T3>::iterator end3_;
+ typename ParamGenerator<T3>::iterator current3_;
+ const typename ParamGenerator<T4>::iterator begin4_;
+ const typename ParamGenerator<T4>::iterator end4_;
+ typename ParamGenerator<T4>::iterator current4_;
+ const typename ParamGenerator<T5>::iterator begin5_;
+ const typename ParamGenerator<T5>::iterator end5_;
+ typename ParamGenerator<T5>::iterator current5_;
+ const typename ParamGenerator<T6>::iterator begin6_;
+ const typename ParamGenerator<T6>::iterator end6_;
+ typename ParamGenerator<T6>::iterator current6_;
+ const typename ParamGenerator<T7>::iterator begin7_;
+ const typename ParamGenerator<T7>::iterator end7_;
+ typename ParamGenerator<T7>::iterator current7_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator7::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator7& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+ const ParamGenerator<T3> g3_;
+ const ParamGenerator<T4> g4_;
+ const ParamGenerator<T5> g5_;
+ const ParamGenerator<T6> g6_;
+ const ParamGenerator<T7> g7_;
+}; // class CartesianProductGenerator7
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8>
+class CartesianProductGenerator8
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+ T7, T8> > {
+ public:
+ typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8> ParamType;
+
+ CartesianProductGenerator8(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+ const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+ const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+ const ParamGenerator<T8>& g8)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+ g8_(g8) {}
+ virtual ~CartesianProductGenerator8() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+ g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+ g7_.begin(), g8_, g8_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+ g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+ g8_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2,
+ const ParamGenerator<T3>& g3,
+ const typename ParamGenerator<T3>::iterator& current3,
+ const ParamGenerator<T4>& g4,
+ const typename ParamGenerator<T4>::iterator& current4,
+ const ParamGenerator<T5>& g5,
+ const typename ParamGenerator<T5>::iterator& current5,
+ const ParamGenerator<T6>& g6,
+ const typename ParamGenerator<T6>::iterator& current6,
+ const ParamGenerator<T7>& g7,
+ const typename ParamGenerator<T7>::iterator& current7,
+ const ParamGenerator<T8>& g8,
+ const typename ParamGenerator<T8>::iterator& current8)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+ begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+ begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+ begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+ begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+ begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+ begin8_(g8.begin()), end8_(g8.end()), current8_(current8) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance() should not be called on an iterator that is past the end of
+    // the range, so no component iterator may be past its end either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current8_;
+ if (current8_ == end8_) {
+ current8_ = begin8_;
+ ++current7_;
+ }
+ if (current7_ == end7_) {
+ current7_ = begin7_;
+ ++current6_;
+ }
+ if (current6_ == end6_) {
+ current6_ = begin6_;
+ ++current5_;
+ }
+ if (current5_ == end5_) {
+ current5_ = begin5_;
+ ++current4_;
+ }
+ if (current4_ == end4_) {
+ current4_ = begin4_;
+ ++current3_;
+ }
+ if (current3_ == end3_) {
+ current3_ = begin3_;
+ ++current2_;
+ }
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_ &&
+ current3_ == typed_other->current3_ &&
+ current4_ == typed_other->current4_ &&
+ current5_ == typed_other->current5_ &&
+ current6_ == typed_other->current6_ &&
+ current7_ == typed_other->current7_ &&
+ current8_ == typed_other->current8_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_),
+ begin3_(other.begin3_),
+ end3_(other.end3_),
+ current3_(other.current3_),
+ begin4_(other.begin4_),
+ end4_(other.end4_),
+ current4_(other.current4_),
+ begin5_(other.begin5_),
+ end5_(other.end5_),
+ current5_(other.current5_),
+ begin6_(other.begin6_),
+ end6_(other.end6_),
+ current6_(other.current6_),
+ begin7_(other.begin7_),
+ end7_(other.end7_),
+ current7_(other.current7_),
+ begin8_(other.begin8_),
+ end8_(other.end8_),
+ current8_(other.current8_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_, *current3_,
+ *current4_, *current5_, *current6_, *current7_, *current8_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_ ||
+ current3_ == end3_ ||
+ current4_ == end4_ ||
+ current5_ == end5_ ||
+ current6_ == end6_ ||
+ current7_ == end7_ ||
+ current8_ == end8_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ const typename ParamGenerator<T3>::iterator begin3_;
+ const typename ParamGenerator<T3>::iterator end3_;
+ typename ParamGenerator<T3>::iterator current3_;
+ const typename ParamGenerator<T4>::iterator begin4_;
+ const typename ParamGenerator<T4>::iterator end4_;
+ typename ParamGenerator<T4>::iterator current4_;
+ const typename ParamGenerator<T5>::iterator begin5_;
+ const typename ParamGenerator<T5>::iterator end5_;
+ typename ParamGenerator<T5>::iterator current5_;
+ const typename ParamGenerator<T6>::iterator begin6_;
+ const typename ParamGenerator<T6>::iterator end6_;
+ typename ParamGenerator<T6>::iterator current6_;
+ const typename ParamGenerator<T7>::iterator begin7_;
+ const typename ParamGenerator<T7>::iterator end7_;
+ typename ParamGenerator<T7>::iterator current7_;
+ const typename ParamGenerator<T8>::iterator begin8_;
+ const typename ParamGenerator<T8>::iterator end8_;
+ typename ParamGenerator<T8>::iterator current8_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator8::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator8& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+ const ParamGenerator<T3> g3_;
+ const ParamGenerator<T4> g4_;
+ const ParamGenerator<T5> g5_;
+ const ParamGenerator<T6> g6_;
+ const ParamGenerator<T7> g7_;
+ const ParamGenerator<T8> g8_;
+}; // class CartesianProductGenerator8
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9>
+class CartesianProductGenerator9
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+ T7, T8, T9> > {
+ public:
+ typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9> ParamType;
+
+ CartesianProductGenerator9(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+ const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+ const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+ const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+ g9_(g9) {}
+ virtual ~CartesianProductGenerator9() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+ g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+ g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+ g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+ g8_.end(), g9_, g9_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2,
+ const ParamGenerator<T3>& g3,
+ const typename ParamGenerator<T3>::iterator& current3,
+ const ParamGenerator<T4>& g4,
+ const typename ParamGenerator<T4>::iterator& current4,
+ const ParamGenerator<T5>& g5,
+ const typename ParamGenerator<T5>::iterator& current5,
+ const ParamGenerator<T6>& g6,
+ const typename ParamGenerator<T6>::iterator& current6,
+ const ParamGenerator<T7>& g7,
+ const typename ParamGenerator<T7>::iterator& current7,
+ const ParamGenerator<T8>& g8,
+ const typename ParamGenerator<T8>::iterator& current8,
+ const ParamGenerator<T9>& g9,
+ const typename ParamGenerator<T9>::iterator& current9)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+ begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+ begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+ begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+ begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+ begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+ begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+ begin9_(g9.begin()), end9_(g9.end()), current9_(current9) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance() should not be called on an iterator that is past the end of
+    // the range, so no component iterator may be past its end either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current9_;
+ if (current9_ == end9_) {
+ current9_ = begin9_;
+ ++current8_;
+ }
+ if (current8_ == end8_) {
+ current8_ = begin8_;
+ ++current7_;
+ }
+ if (current7_ == end7_) {
+ current7_ = begin7_;
+ ++current6_;
+ }
+ if (current6_ == end6_) {
+ current6_ = begin6_;
+ ++current5_;
+ }
+ if (current5_ == end5_) {
+ current5_ = begin5_;
+ ++current4_;
+ }
+ if (current4_ == end4_) {
+ current4_ = begin4_;
+ ++current3_;
+ }
+ if (current3_ == end3_) {
+ current3_ = begin3_;
+ ++current2_;
+ }
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_ &&
+ current3_ == typed_other->current3_ &&
+ current4_ == typed_other->current4_ &&
+ current5_ == typed_other->current5_ &&
+ current6_ == typed_other->current6_ &&
+ current7_ == typed_other->current7_ &&
+ current8_ == typed_other->current8_ &&
+ current9_ == typed_other->current9_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_),
+ begin3_(other.begin3_),
+ end3_(other.end3_),
+ current3_(other.current3_),
+ begin4_(other.begin4_),
+ end4_(other.end4_),
+ current4_(other.current4_),
+ begin5_(other.begin5_),
+ end5_(other.end5_),
+ current5_(other.current5_),
+ begin6_(other.begin6_),
+ end6_(other.end6_),
+ current6_(other.current6_),
+ begin7_(other.begin7_),
+ end7_(other.end7_),
+ current7_(other.current7_),
+ begin8_(other.begin8_),
+ end8_(other.end8_),
+ current8_(other.current8_),
+ begin9_(other.begin9_),
+ end9_(other.end9_),
+ current9_(other.current9_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_, *current3_,
+ *current4_, *current5_, *current6_, *current7_, *current8_,
+ *current9_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_ ||
+ current3_ == end3_ ||
+ current4_ == end4_ ||
+ current5_ == end5_ ||
+ current6_ == end6_ ||
+ current7_ == end7_ ||
+ current8_ == end8_ ||
+ current9_ == end9_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ const typename ParamGenerator<T3>::iterator begin3_;
+ const typename ParamGenerator<T3>::iterator end3_;
+ typename ParamGenerator<T3>::iterator current3_;
+ const typename ParamGenerator<T4>::iterator begin4_;
+ const typename ParamGenerator<T4>::iterator end4_;
+ typename ParamGenerator<T4>::iterator current4_;
+ const typename ParamGenerator<T5>::iterator begin5_;
+ const typename ParamGenerator<T5>::iterator end5_;
+ typename ParamGenerator<T5>::iterator current5_;
+ const typename ParamGenerator<T6>::iterator begin6_;
+ const typename ParamGenerator<T6>::iterator end6_;
+ typename ParamGenerator<T6>::iterator current6_;
+ const typename ParamGenerator<T7>::iterator begin7_;
+ const typename ParamGenerator<T7>::iterator end7_;
+ typename ParamGenerator<T7>::iterator current7_;
+ const typename ParamGenerator<T8>::iterator begin8_;
+ const typename ParamGenerator<T8>::iterator end8_;
+ typename ParamGenerator<T8>::iterator current8_;
+ const typename ParamGenerator<T9>::iterator begin9_;
+ const typename ParamGenerator<T9>::iterator end9_;
+ typename ParamGenerator<T9>::iterator current9_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator9::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator9& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+ const ParamGenerator<T3> g3_;
+ const ParamGenerator<T4> g4_;
+ const ParamGenerator<T5> g5_;
+ const ParamGenerator<T6> g6_;
+ const ParamGenerator<T7> g7_;
+ const ParamGenerator<T8> g8_;
+ const ParamGenerator<T9> g9_;
+}; // class CartesianProductGenerator9
+
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10>
+class CartesianProductGenerator10
+ : public ParamGeneratorInterface< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+ T7, T8, T9, T10> > {
+ public:
+ typedef ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> ParamType;
+
+ CartesianProductGenerator10(const ParamGenerator<T1>& g1,
+ const ParamGenerator<T2>& g2, const ParamGenerator<T3>& g3,
+ const ParamGenerator<T4>& g4, const ParamGenerator<T5>& g5,
+ const ParamGenerator<T6>& g6, const ParamGenerator<T7>& g7,
+ const ParamGenerator<T8>& g8, const ParamGenerator<T9>& g9,
+ const ParamGenerator<T10>& g10)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+ g9_(g9), g10_(g10) {}
+ virtual ~CartesianProductGenerator10() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_,
+ g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_,
+ g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin());
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(),
+ g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_,
+ g8_.end(), g9_, g9_.end(), g10_, g10_.end());
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base,
+ const ParamGenerator<T1>& g1,
+ const typename ParamGenerator<T1>::iterator& current1,
+ const ParamGenerator<T2>& g2,
+ const typename ParamGenerator<T2>::iterator& current2,
+ const ParamGenerator<T3>& g3,
+ const typename ParamGenerator<T3>::iterator& current3,
+ const ParamGenerator<T4>& g4,
+ const typename ParamGenerator<T4>::iterator& current4,
+ const ParamGenerator<T5>& g5,
+ const typename ParamGenerator<T5>::iterator& current5,
+ const ParamGenerator<T6>& g6,
+ const typename ParamGenerator<T6>::iterator& current6,
+ const ParamGenerator<T7>& g7,
+ const typename ParamGenerator<T7>::iterator& current7,
+ const ParamGenerator<T8>& g8,
+ const typename ParamGenerator<T8>::iterator& current8,
+ const ParamGenerator<T9>& g9,
+ const typename ParamGenerator<T9>::iterator& current9,
+ const ParamGenerator<T10>& g10,
+ const typename ParamGenerator<T10>::iterator& current10)
+ : base_(base),
+ begin1_(g1.begin()), end1_(g1.end()), current1_(current1),
+ begin2_(g2.begin()), end2_(g2.end()), current2_(current2),
+ begin3_(g3.begin()), end3_(g3.end()), current3_(current3),
+ begin4_(g4.begin()), end4_(g4.end()), current4_(current4),
+ begin5_(g5.begin()), end5_(g5.end()), current5_(current5),
+ begin6_(g6.begin()), end6_(g6.end()), current6_(current6),
+ begin7_(g7.begin()), end7_(g7.end()), current7_(current7),
+ begin8_(g8.begin()), end8_(g8.end()), current8_(current8),
+ begin9_(g9.begin()), end9_(g9.end()), current9_(current9),
+ begin10_(g10.begin()), end10_(g10.end()), current10_(current10) {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance() should not be called on an iterator that is past the end of
+    // the range, so no component iterator may be past its end either.
+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current10_;
+ if (current10_ == end10_) {
+ current10_ = begin10_;
+ ++current9_;
+ }
+ if (current9_ == end9_) {
+ current9_ = begin9_;
+ ++current8_;
+ }
+ if (current8_ == end8_) {
+ current8_ = begin8_;
+ ++current7_;
+ }
+ if (current7_ == end7_) {
+ current7_ = begin7_;
+ ++current6_;
+ }
+ if (current6_ == end6_) {
+ current6_ = begin6_;
+ ++current5_;
+ }
+ if (current5_ == end5_) {
+ current5_ = begin5_;
+ ++current4_;
+ }
+ if (current4_ == end4_) {
+ current4_ = begin4_;
+ ++current3_;
+ }
+ if (current3_ == end3_) {
+ current3_ = begin3_;
+ ++current2_;
+ }
+ if (current2_ == end2_) {
+ current2_ = begin2_;
+ ++current1_;
+ }
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ (
+ current1_ == typed_other->current1_ &&
+ current2_ == typed_other->current2_ &&
+ current3_ == typed_other->current3_ &&
+ current4_ == typed_other->current4_ &&
+ current5_ == typed_other->current5_ &&
+ current6_ == typed_other->current6_ &&
+ current7_ == typed_other->current7_ &&
+ current8_ == typed_other->current8_ &&
+ current9_ == typed_other->current9_ &&
+ current10_ == typed_other->current10_);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_),
+ begin1_(other.begin1_),
+ end1_(other.end1_),
+ current1_(other.current1_),
+ begin2_(other.begin2_),
+ end2_(other.end2_),
+ current2_(other.current2_),
+ begin3_(other.begin3_),
+ end3_(other.end3_),
+ current3_(other.current3_),
+ begin4_(other.begin4_),
+ end4_(other.end4_),
+ current4_(other.current4_),
+ begin5_(other.begin5_),
+ end5_(other.end5_),
+ current5_(other.current5_),
+ begin6_(other.begin6_),
+ end6_(other.end6_),
+ current6_(other.current6_),
+ begin7_(other.begin7_),
+ end7_(other.end7_),
+ current7_(other.current7_),
+ begin8_(other.begin8_),
+ end8_(other.end8_),
+ current8_(other.current8_),
+ begin9_(other.begin9_),
+ end9_(other.end9_),
+ current9_(other.current9_),
+ begin10_(other.begin10_),
+ end10_(other.end10_),
+ current10_(other.current10_) {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType(*current1_, *current2_, *current3_,
+ *current4_, *current5_, *current6_, *current7_, *current8_,
+ *current9_, *current10_);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+ return
+ current1_ == end1_ ||
+ current2_ == end2_ ||
+ current3_ == end3_ ||
+ current4_ == end4_ ||
+ current5_ == end5_ ||
+ current6_ == end6_ ||
+ current7_ == end7_ ||
+ current8_ == end8_ ||
+ current9_ == end9_ ||
+ current10_ == end10_;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+ const typename ParamGenerator<T1>::iterator begin1_;
+ const typename ParamGenerator<T1>::iterator end1_;
+ typename ParamGenerator<T1>::iterator current1_;
+ const typename ParamGenerator<T2>::iterator begin2_;
+ const typename ParamGenerator<T2>::iterator end2_;
+ typename ParamGenerator<T2>::iterator current2_;
+ const typename ParamGenerator<T3>::iterator begin3_;
+ const typename ParamGenerator<T3>::iterator end3_;
+ typename ParamGenerator<T3>::iterator current3_;
+ const typename ParamGenerator<T4>::iterator begin4_;
+ const typename ParamGenerator<T4>::iterator end4_;
+ typename ParamGenerator<T4>::iterator current4_;
+ const typename ParamGenerator<T5>::iterator begin5_;
+ const typename ParamGenerator<T5>::iterator end5_;
+ typename ParamGenerator<T5>::iterator current5_;
+ const typename ParamGenerator<T6>::iterator begin6_;
+ const typename ParamGenerator<T6>::iterator end6_;
+ typename ParamGenerator<T6>::iterator current6_;
+ const typename ParamGenerator<T7>::iterator begin7_;
+ const typename ParamGenerator<T7>::iterator end7_;
+ typename ParamGenerator<T7>::iterator current7_;
+ const typename ParamGenerator<T8>::iterator begin8_;
+ const typename ParamGenerator<T8>::iterator end8_;
+ typename ParamGenerator<T8>::iterator current8_;
+ const typename ParamGenerator<T9>::iterator begin9_;
+ const typename ParamGenerator<T9>::iterator end9_;
+ typename ParamGenerator<T9>::iterator current9_;
+ const typename ParamGenerator<T10>::iterator begin10_;
+ const typename ParamGenerator<T10>::iterator end10_;
+ typename ParamGenerator<T10>::iterator current10_;
+ ParamType current_value_;
+ }; // class CartesianProductGenerator10::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator10& other);
+
+ const ParamGenerator<T1> g1_;
+ const ParamGenerator<T2> g2_;
+ const ParamGenerator<T3> g3_;
+ const ParamGenerator<T4> g4_;
+ const ParamGenerator<T5> g5_;
+ const ParamGenerator<T6> g6_;
+ const ParamGenerator<T7> g7_;
+ const ParamGenerator<T8> g8_;
+ const ParamGenerator<T9> g9_;
+ const ParamGenerator<T10> g10_;
+}; // class CartesianProductGenerator10
+
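[Editorial note, not part of this diff.] Each CartesianProductGeneratorN above iterates its tuple space the same way: Advance() increments the rightmost component iterator and, whenever a component wraps back to its begin, carries into the component on its left, while AtEnd() reports exhaustion as soon as any component reaches its end. The following self-contained sketch is illustrative only (all names here are invented, nothing is googletest code) and shows the same odometer-style carry over plain index vectors.

// Odometer-style enumeration of a Cartesian product, mirroring the carry
// chain in CartesianProductGeneratorN::Iterator::Advance() above.
#include <cstddef>
#include <iostream>
#include <vector>

// Advances a multi-index over ranges of the given sizes; returns false once
// every combination has been produced.
static bool NextCombination(std::vector<size_t>* idx,
                            const std::vector<size_t>& sizes) {
  for (size_t i = idx->size(); i-- > 0;) {
    if (++(*idx)[i] < sizes[i]) return true;  // no overflow in this position
    (*idx)[i] = 0;                            // overflow: reset, carry left
  }
  return false;  // carried out of the leftmost position: enumeration is done
}

int main() {
  const std::vector<size_t> sizes(2, 3);      // two generators of size 3 each
  std::vector<size_t> idx(sizes.size(), 0);
  do {
    std::cout << "(" << idx[0] << ", " << idx[1] << ")\n";
  } while (NextCombination(&idx, sizes));     // prints all 3 x 3 = 9 tuples
  return 0;
}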
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Helper classes providing Combine() with polymorphic features. They allow
+// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
+// convertible to U.
+//
+template <class Generator1, class Generator2>
+class CartesianProductHolder2 {
+ public:
+CartesianProductHolder2(const Generator1& g1, const Generator2& g2)
+ : g1_(g1), g2_(g2) {}
+ template <typename T1, typename T2>
+ operator ParamGenerator< ::testing::tuple<T1, T2> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2> >(
+ new CartesianProductGenerator2<T1, T2>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder2& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+}; // class CartesianProductHolder2
+
+template <class Generator1, class Generator2, class Generator3>
+class CartesianProductHolder3 {
+ public:
+CartesianProductHolder3(const Generator1& g1, const Generator2& g2,
+ const Generator3& g3)
+ : g1_(g1), g2_(g2), g3_(g3) {}
+ template <typename T1, typename T2, typename T3>
+ operator ParamGenerator< ::testing::tuple<T1, T2, T3> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2, T3> >(
+ new CartesianProductGenerator3<T1, T2, T3>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_),
+ static_cast<ParamGenerator<T3> >(g3_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder3& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+ const Generator3 g3_;
+}; // class CartesianProductHolder3
+
+template <class Generator1, class Generator2, class Generator3,
+ class Generator4>
+class CartesianProductHolder4 {
+ public:
+CartesianProductHolder4(const Generator1& g1, const Generator2& g2,
+ const Generator3& g3, const Generator4& g4)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {}
+ template <typename T1, typename T2, typename T3, typename T4>
+ operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2, T3, T4> >(
+ new CartesianProductGenerator4<T1, T2, T3, T4>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_),
+ static_cast<ParamGenerator<T3> >(g3_),
+ static_cast<ParamGenerator<T4> >(g4_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder4& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+ const Generator3 g3_;
+ const Generator4 g4_;
+}; // class CartesianProductHolder4
+
+template <class Generator1, class Generator2, class Generator3,
+ class Generator4, class Generator5>
+class CartesianProductHolder5 {
+ public:
+CartesianProductHolder5(const Generator1& g1, const Generator2& g2,
+ const Generator3& g3, const Generator4& g4, const Generator5& g5)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {}
+ template <typename T1, typename T2, typename T3, typename T4, typename T5>
+ operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5> >(
+ new CartesianProductGenerator5<T1, T2, T3, T4, T5>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_),
+ static_cast<ParamGenerator<T3> >(g3_),
+ static_cast<ParamGenerator<T4> >(g4_),
+ static_cast<ParamGenerator<T5> >(g5_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder5& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+ const Generator3 g3_;
+ const Generator4 g4_;
+ const Generator5 g5_;
+}; // class CartesianProductHolder5
+
+template <class Generator1, class Generator2, class Generator3,
+ class Generator4, class Generator5, class Generator6>
+class CartesianProductHolder6 {
+ public:
+CartesianProductHolder6(const Generator1& g1, const Generator2& g2,
+ const Generator3& g3, const Generator4& g4, const Generator5& g5,
+ const Generator6& g6)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {}
+ template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+ operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6> >(
+ new CartesianProductGenerator6<T1, T2, T3, T4, T5, T6>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_),
+ static_cast<ParamGenerator<T3> >(g3_),
+ static_cast<ParamGenerator<T4> >(g4_),
+ static_cast<ParamGenerator<T5> >(g5_),
+ static_cast<ParamGenerator<T6> >(g6_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder6& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+ const Generator3 g3_;
+ const Generator4 g4_;
+ const Generator5 g5_;
+ const Generator6 g6_;
+}; // class CartesianProductHolder6
+
+template <class Generator1, class Generator2, class Generator3,
+ class Generator4, class Generator5, class Generator6, class Generator7>
+class CartesianProductHolder7 {
+ public:
+CartesianProductHolder7(const Generator1& g1, const Generator2& g2,
+ const Generator3& g3, const Generator4& g4, const Generator5& g5,
+ const Generator6& g6, const Generator7& g7)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {}
+ template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7>
+ operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6,
+ T7> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7> >(
+ new CartesianProductGenerator7<T1, T2, T3, T4, T5, T6, T7>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_),
+ static_cast<ParamGenerator<T3> >(g3_),
+ static_cast<ParamGenerator<T4> >(g4_),
+ static_cast<ParamGenerator<T5> >(g5_),
+ static_cast<ParamGenerator<T6> >(g6_),
+ static_cast<ParamGenerator<T7> >(g7_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder7& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+ const Generator3 g3_;
+ const Generator4 g4_;
+ const Generator5 g5_;
+ const Generator6 g6_;
+ const Generator7 g7_;
+}; // class CartesianProductHolder7
+
+template <class Generator1, class Generator2, class Generator3,
+ class Generator4, class Generator5, class Generator6, class Generator7,
+ class Generator8>
+class CartesianProductHolder8 {
+ public:
+CartesianProductHolder8(const Generator1& g1, const Generator2& g2,
+ const Generator3& g3, const Generator4& g4, const Generator5& g5,
+ const Generator6& g6, const Generator7& g7, const Generator8& g8)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7),
+ g8_(g8) {}
+ template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8>
+ operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7,
+ T8> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8> >(
+ new CartesianProductGenerator8<T1, T2, T3, T4, T5, T6, T7, T8>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_),
+ static_cast<ParamGenerator<T3> >(g3_),
+ static_cast<ParamGenerator<T4> >(g4_),
+ static_cast<ParamGenerator<T5> >(g5_),
+ static_cast<ParamGenerator<T6> >(g6_),
+ static_cast<ParamGenerator<T7> >(g7_),
+ static_cast<ParamGenerator<T8> >(g8_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder8& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+ const Generator3 g3_;
+ const Generator4 g4_;
+ const Generator5 g5_;
+ const Generator6 g6_;
+ const Generator7 g7_;
+ const Generator8 g8_;
+}; // class CartesianProductHolder8
+
+template <class Generator1, class Generator2, class Generator3,
+ class Generator4, class Generator5, class Generator6, class Generator7,
+ class Generator8, class Generator9>
+class CartesianProductHolder9 {
+ public:
+CartesianProductHolder9(const Generator1& g1, const Generator2& g2,
+ const Generator3& g3, const Generator4& g4, const Generator5& g5,
+ const Generator6& g6, const Generator7& g7, const Generator8& g8,
+ const Generator9& g9)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+ g9_(g9) {}
+ template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9>
+ operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+ T9> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8,
+ T9> >(
+ new CartesianProductGenerator9<T1, T2, T3, T4, T5, T6, T7, T8, T9>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_),
+ static_cast<ParamGenerator<T3> >(g3_),
+ static_cast<ParamGenerator<T4> >(g4_),
+ static_cast<ParamGenerator<T5> >(g5_),
+ static_cast<ParamGenerator<T6> >(g6_),
+ static_cast<ParamGenerator<T7> >(g7_),
+ static_cast<ParamGenerator<T8> >(g8_),
+ static_cast<ParamGenerator<T9> >(g9_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder9& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+ const Generator3 g3_;
+ const Generator4 g4_;
+ const Generator5 g5_;
+ const Generator6 g6_;
+ const Generator7 g7_;
+ const Generator8 g8_;
+ const Generator9 g9_;
+}; // class CartesianProductHolder9
+
+template <class Generator1, class Generator2, class Generator3,
+ class Generator4, class Generator5, class Generator6, class Generator7,
+ class Generator8, class Generator9, class Generator10>
+class CartesianProductHolder10 {
+ public:
+CartesianProductHolder10(const Generator1& g1, const Generator2& g2,
+ const Generator3& g3, const Generator4& g4, const Generator5& g5,
+ const Generator6& g6, const Generator7& g7, const Generator8& g8,
+ const Generator9& g9, const Generator10& g10)
+ : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8),
+ g9_(g9), g10_(g10) {}
+ template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10>
+ operator ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+ T10> >() const {
+ return ParamGenerator< ::testing::tuple<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+ T10> >(
+ new CartesianProductGenerator10<T1, T2, T3, T4, T5, T6, T7, T8, T9,
+ T10>(
+ static_cast<ParamGenerator<T1> >(g1_),
+ static_cast<ParamGenerator<T2> >(g2_),
+ static_cast<ParamGenerator<T3> >(g3_),
+ static_cast<ParamGenerator<T4> >(g4_),
+ static_cast<ParamGenerator<T5> >(g5_),
+ static_cast<ParamGenerator<T6> >(g6_),
+ static_cast<ParamGenerator<T7> >(g7_),
+ static_cast<ParamGenerator<T8> >(g8_),
+ static_cast<ParamGenerator<T9> >(g9_),
+ static_cast<ParamGenerator<T10> >(g10_)));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder10& other);
+
+ const Generator1 g1_;
+ const Generator2 g2_;
+ const Generator3 g3_;
+ const Generator4 g4_;
+ const Generator5 g5_;
+ const Generator6 g6_;
+ const Generator7 g7_;
+ const Generator8 g8_;
+ const Generator9 g9_;
+ const Generator10 g10_;
+}; // class CartesianProductHolder10
+
+# endif // GTEST_HAS_COMBINE
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_HAS_PARAM_TEST
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
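[Editorial note, not part of this diff.] The CartesianProductHolderN/CartesianProductGeneratorN machinery defined above is what backs ::testing::Combine(). A hedged usage sketch follows; the fixture name, test name, and parameter values are invented for illustration, while the macros and factories (TEST_P, INSTANTIATE_TEST_CASE_P, Combine, Values, Bool) are existing gtest API in this version.

#include "gtest/gtest.h"

class WidthFastPathTest
    : public ::testing::TestWithParam< ::testing::tuple<int, bool> > {};

TEST_P(WidthFastPathTest, AcceptsEveryCombination) {
  const int width = ::testing::get<0>(GetParam());
  const bool use_fast_path = ::testing::get<1>(GetParam());
  EXPECT_GT(width, 0) << "fast path: " << use_fast_path;
}

// Combine() returns a CartesianProductHolder2, which converts implicitly to
// ParamGenerator<tuple<int, bool> > via CartesianProductGenerator2, yielding
// the 3 x 2 = 6 parameter combinations.
INSTANTIATE_TEST_CASE_P(Sizes, WidthFastPathTest,
                        ::testing::Combine(::testing::Values(8, 16, 32),
                                           ::testing::Bool()));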
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h.pump b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h.pump
new file mode 100644
index 000000000..5c7c47af0
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util-generated.h.pump
@@ -0,0 +1,286 @@
+$$ -*- mode: c++; -*-
+$var n = 50 $$ Maximum length of Values arguments we want to support.
+$var maxtuple = 10 $$ Maximum number of Combine arguments we want to support.
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+// This file is generated by a SCRIPT. DO NOT EDIT BY HAND!
+//
+// Currently Google Test supports at most $n arguments in Values,
+// and at most $maxtuple arguments in Combine. Please contact
+// googletestframework@googlegroups.com if you need more.
+// Please note that the number of arguments to Combine is limited
+// by the maximum arity of the implementation of tuple which is
+// currently set at $maxtuple.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*. Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+#include "gtest/internal/gtest-param-util.h"
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Forward declarations of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <typename ForwardIterator>
+internal::ParamGenerator<
+ typename ::testing::internal::IteratorTraits<ForwardIterator>::value_type>
+ValuesIn(ForwardIterator begin, ForwardIterator end);
+
+template <typename T, size_t N>
+internal::ParamGenerator<T> ValuesIn(const T (&array)[N]);
+
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+ const Container& container);
+
+namespace internal {
+
+// Used in the Values() function to provide polymorphic capabilities.
+$range i 1..n
+$for i [[
+$range j 1..i
+
+template <$for j, [[typename T$j]]>
+class ValueArray$i {
+ public:
+ $if i==1 [[explicit ]]ValueArray$i($for j, [[T$j v$j]]) : $for j, [[v$(j)_(v$j)]] {}
+
+ template <typename T>
+ operator ParamGenerator<T>() const {
+ const T array[] = {$for j, [[static_cast<T>(v$(j)_)]]};
+ return ValuesIn(array);
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const ValueArray$i& other);
+
+$for j [[
+
+ const T$j v$(j)_;
+]]
+
+};
+
+]]
+
+# if GTEST_HAS_COMBINE
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Generates values from the Cartesian product of values produced
+// by the argument generators.
+//
+$range i 2..maxtuple
+$for i [[
+$range j 1..i
+$range k 2..i
+
+template <$for j, [[typename T$j]]>
+class CartesianProductGenerator$i
+ : public ParamGeneratorInterface< ::testing::tuple<$for j, [[T$j]]> > {
+ public:
+ typedef ::testing::tuple<$for j, [[T$j]]> ParamType;
+
+ CartesianProductGenerator$i($for j, [[const ParamGenerator<T$j>& g$j]])
+ : $for j, [[g$(j)_(g$j)]] {}
+ virtual ~CartesianProductGenerator$i() {}
+
+ virtual ParamIteratorInterface<ParamType>* Begin() const {
+ return new Iterator(this, $for j, [[g$(j)_, g$(j)_.begin()]]);
+ }
+ virtual ParamIteratorInterface<ParamType>* End() const {
+ return new Iterator(this, $for j, [[g$(j)_, g$(j)_.end()]]);
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<ParamType> {
+ public:
+ Iterator(const ParamGeneratorInterface<ParamType>* base, $for j, [[
+
+ const ParamGenerator<T$j>& g$j,
+ const typename ParamGenerator<T$j>::iterator& current$(j)]])
+ : base_(base),
+$for j, [[
+
+ begin$(j)_(g$j.begin()), end$(j)_(g$j.end()), current$(j)_(current$j)
+]] {
+ ComputeCurrentValue();
+ }
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<ParamType>* BaseGenerator() const {
+ return base_;
+ }
+    // Advance() should not be called on an iterator that is past the end of
+    // the range, so no component iterator may be past its end either.

+ virtual void Advance() {
+ assert(!AtEnd());
+ ++current$(i)_;
+
+$for k [[
+ if (current$(i+2-k)_ == end$(i+2-k)_) {
+ current$(i+2-k)_ = begin$(i+2-k)_;
+ ++current$(i+2-k-1)_;
+ }
+
+]]
+ ComputeCurrentValue();
+ }
+ virtual ParamIteratorInterface<ParamType>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const ParamType* Current() const { return &current_value_; }
+ virtual bool Equals(const ParamIteratorInterface<ParamType>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const Iterator* typed_other =
+ CheckedDowncastToActualType<const Iterator>(&other);
+ // We must report iterators equal if they both point beyond their
+ // respective ranges. That can happen in a variety of fashions,
+ // so we have to consult AtEnd().
+ return (AtEnd() && typed_other->AtEnd()) ||
+ ($for j && [[
+
+ current$(j)_ == typed_other->current$(j)_
+]]);
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : base_(other.base_), $for j, [[
+
+ begin$(j)_(other.begin$(j)_),
+ end$(j)_(other.end$(j)_),
+ current$(j)_(other.current$(j)_)
+]] {
+ ComputeCurrentValue();
+ }
+
+ void ComputeCurrentValue() {
+ if (!AtEnd())
+ current_value_ = ParamType($for j, [[*current$(j)_]]);
+ }
+ bool AtEnd() const {
+      // We must report the iterator as past the end of the range when any of
+      // the component iterators has reached the end of its range.
+ return
+$for j || [[
+
+ current$(j)_ == end$(j)_
+]];
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<ParamType>* const base_;
+ // begin[i]_ and end[i]_ define the i-th range that Iterator traverses.
+ // current[i]_ is the actual traversing iterator.
+$for j [[
+
+ const typename ParamGenerator<T$j>::iterator begin$(j)_;
+ const typename ParamGenerator<T$j>::iterator end$(j)_;
+ typename ParamGenerator<T$j>::iterator current$(j)_;
+]]
+
+ ParamType current_value_;
+ }; // class CartesianProductGenerator$i::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductGenerator$i& other);
+
+
+$for j [[
+ const ParamGenerator<T$j> g$(j)_;
+
+]]
+}; // class CartesianProductGenerator$i
+
+
+]]
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Helper classes providing Combine() with polymorphic features. They allow
+// casting CartesianProductGeneratorN<T> to ParamGenerator<U> if T is
+// convertible to U.
+//
+$range i 2..maxtuple
+$for i [[
+$range j 1..i
+
+template <$for j, [[class Generator$j]]>
+class CartesianProductHolder$i {
+ public:
+CartesianProductHolder$i($for j, [[const Generator$j& g$j]])
+ : $for j, [[g$(j)_(g$j)]] {}
+ template <$for j, [[typename T$j]]>
+ operator ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >() const {
+ return ParamGenerator< ::testing::tuple<$for j, [[T$j]]> >(
+ new CartesianProductGenerator$i<$for j, [[T$j]]>(
+$for j,[[
+
+ static_cast<ParamGenerator<T$j> >(g$(j)_)
+]]));
+ }
+
+ private:
+ // No implementation - assignment is unsupported.
+ void operator=(const CartesianProductHolder$i& other);
+
+
+$for j [[
+ const Generator$j g$(j)_;
+
+]]
+}; // class CartesianProductHolder$i
+
+]]
+
+# endif // GTEST_HAS_COMBINE
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_HAS_PARAM_TEST
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_
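[Editorial note, not part of this diff.] In the pump meta-language used above, $var defines a constant, $range declares an iteration variable, $for expands its [[ ]]-delimited body once per value, and $(j) splices the loop variable into identifiers. As a concrete illustration, the ValueArray block earlier in this template expands for i == 2 to roughly the following C++ in the generated header:

template <typename T1, typename T2>
class ValueArray2 {
 public:
  ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}

  template <typename T>
  operator ParamGenerator<T>() const {
    const T array[] = {static_cast<T>(v1_), static_cast<T>(v2_)};
    return ValuesIn(array);
  }

 private:
  // No implementation - assignment is unsupported.
  void operator=(const ValueArray2& other);

  const T1 v1_;
  const T2 v2_;
};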
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
new file mode 100644
index 000000000..82cab9b02
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h
@@ -0,0 +1,731 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: vladl@google.com (Vlad Losev)
+
+// Type and function utilities for implementing parameterized tests.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+
+#include <ctype.h>
+
+#include <iterator>
+#include <set>
+#include <utility>
+#include <vector>
+
+// scripts/fuse_gtest.py depends on gtest's own header being #included
+// *unconditionally*. Therefore these #includes cannot be moved
+// inside #if GTEST_HAS_PARAM_TEST.
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-linked_ptr.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/gtest-printers.h"
+
+#if GTEST_HAS_PARAM_TEST
+
+namespace testing {
+
+// Input to a parameterized test name generator, describing a test parameter.
+// Consists of the parameter value and the integer parameter index.
+template <class ParamType>
+struct TestParamInfo {
+ TestParamInfo(const ParamType& a_param, size_t an_index) :
+ param(a_param),
+ index(an_index) {}
+ ParamType param;
+ size_t index;
+};
+
+// A builtin parameterized test name generator which returns the result of
+// testing::PrintToString.
+struct PrintToStringParamName {
+ template <class ParamType>
+ std::string operator()(const TestParamInfo<ParamType>& info) const {
+ return PrintToString(info.param);
+ }
+};
+
+namespace internal {
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Outputs a message explaining invalid registration of different
+// fixture class for the same test case. This may happen when
+// TEST_P macro is used to define two tests with the same name
+// but in different namespaces.
+GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name,
+ CodeLocation code_location);
+
+template <typename> class ParamGeneratorInterface;
+template <typename> class ParamGenerator;
+
+// Interface for iterating over elements provided by an implementation
+// of ParamGeneratorInterface<T>.
+template <typename T>
+class ParamIteratorInterface {
+ public:
+ virtual ~ParamIteratorInterface() {}
+ // A pointer to the base generator instance.
+ // Used only for the purposes of iterator comparison
+ // to make sure that two iterators belong to the same generator.
+ virtual const ParamGeneratorInterface<T>* BaseGenerator() const = 0;
+ // Advances iterator to point to the next element
+ // provided by the generator. The caller is responsible
+ // for not calling Advance() on an iterator equal to
+ // BaseGenerator()->End().
+ virtual void Advance() = 0;
+ // Clones the iterator object. Used for implementing copy semantics
+ // of ParamIterator<T>.
+ virtual ParamIteratorInterface* Clone() const = 0;
+ // Dereferences the current iterator and provides (read-only) access
+ // to the pointed value. It is the caller's responsibility not to call
+ // Current() on an iterator equal to BaseGenerator()->End().
+ // Used for implementing ParamGenerator<T>::operator*().
+ virtual const T* Current() const = 0;
+ // Determines whether the given iterator and other point to the same
+ // element in the sequence generated by the generator.
+ // Used for implementing ParamGenerator<T>::operator==().
+ virtual bool Equals(const ParamIteratorInterface& other) const = 0;
+};
+
+// Class iterating over elements provided by an implementation of
+// ParamGeneratorInterface<T>. It wraps ParamIteratorInterface<T>
+// and implements the const forward iterator concept.
+template <typename T>
+class ParamIterator {
+ public:
+ typedef T value_type;
+ typedef const T& reference;
+ typedef ptrdiff_t difference_type;
+
+ // ParamIterator assumes ownership of the impl_ pointer.
+ ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
+ ParamIterator& operator=(const ParamIterator& other) {
+ if (this != &other)
+ impl_.reset(other.impl_->Clone());
+ return *this;
+ }
+
+ const T& operator*() const { return *impl_->Current(); }
+ const T* operator->() const { return impl_->Current(); }
+ // Prefix version of operator++.
+ ParamIterator& operator++() {
+ impl_->Advance();
+ return *this;
+ }
+ // Postfix version of operator++.
+ ParamIterator operator++(int /*unused*/) {
+ ParamIteratorInterface<T>* clone = impl_->Clone();
+ impl_->Advance();
+ return ParamIterator(clone);
+ }
+ bool operator==(const ParamIterator& other) const {
+ return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_);
+ }
+ bool operator!=(const ParamIterator& other) const {
+ return !(*this == other);
+ }
+
+ private:
+ friend class ParamGenerator<T>;
+ explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
+ scoped_ptr<ParamIteratorInterface<T> > impl_;
+};
+
+// ParamGeneratorInterface<T> is the binary interface to access generators
+// defined in other translation units.
+template <typename T>
+class ParamGeneratorInterface {
+ public:
+ typedef T ParamType;
+
+ virtual ~ParamGeneratorInterface() {}
+
+ // Generator interface definition
+ virtual ParamIteratorInterface<T>* Begin() const = 0;
+ virtual ParamIteratorInterface<T>* End() const = 0;
+};
+
+// Wraps ParamGeneratorInterface<T> and provides general generator syntax
+// compatible with the STL Container concept.
+// This class implements copy initialization semantics and the contained
+// ParamGeneratorInterface<T> instance is shared among all copies
+// of the original object. This is possible because that instance is immutable.
+template<typename T>
+class ParamGenerator {
+ public:
+ typedef ParamIterator<T> iterator;
+
+ explicit ParamGenerator(ParamGeneratorInterface<T>* impl) : impl_(impl) {}
+ ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {}
+
+ ParamGenerator& operator=(const ParamGenerator& other) {
+ impl_ = other.impl_;
+ return *this;
+ }
+
+ iterator begin() const { return iterator(impl_->Begin()); }
+ iterator end() const { return iterator(impl_->End()); }
+
+ private:
+ linked_ptr<const ParamGeneratorInterface<T> > impl_;
+};
+
+// Generates values from a range of two comparable values. Can be used to
+// generate sequences of user-defined types that implement operator+() and
+// operator<().
+// This class is used in the Range() function.
+template <typename T, typename IncrementT>
+class RangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+ RangeGenerator(T begin, T end, IncrementT step)
+ : begin_(begin), end_(end),
+ step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+ virtual ~RangeGenerator() {}
+
+ virtual ParamIteratorInterface<T>* Begin() const {
+ return new Iterator(this, begin_, 0, step_);
+ }
+ virtual ParamIteratorInterface<T>* End() const {
+ return new Iterator(this, end_, end_index_, step_);
+ }
+
+ private:
+ class Iterator : public ParamIteratorInterface<T> {
+ public:
+ Iterator(const ParamGeneratorInterface<T>* base, T value, int index,
+ IncrementT step)
+ : base_(base), value_(value), index_(index), step_(step) {}
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+ return base_;
+ }
+ virtual void Advance() {
+ value_ = static_cast<T>(value_ + step_);
+ index_++;
+ }
+ virtual ParamIteratorInterface<T>* Clone() const {
+ return new Iterator(*this);
+ }
+ virtual const T* Current() const { return &value_; }
+ virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ const int other_index =
+ CheckedDowncastToActualType<const Iterator>(&other)->index_;
+ return index_ == other_index;
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ : ParamIteratorInterface<T>(),
+ base_(other.base_), value_(other.value_), index_(other.index_),
+ step_(other.step_) {}
+
+ // No implementation - assignment is unsupported.
+ void operator=(const Iterator& other);
+
+ const ParamGeneratorInterface<T>* const base_;
+ T value_;
+ int index_;
+ const IncrementT step_;
+ }; // class RangeGenerator::Iterator
+
+ static int CalculateEndIndex(const T& begin,
+ const T& end,
+ const IncrementT& step) {
+ int end_index = 0;
+ for (T i = begin; i < end; i = static_cast<T>(i + step))
+ end_index++;
+ return end_index;
+ }
+
+ // No implementation - assignment is unsupported.
+ void operator=(const RangeGenerator& other);
+
+ const T begin_;
+ const T end_;
+ const IncrementT step_;
+ // The index for the end() iterator. All the elements in the generated
+ // sequence are indexed (0-based) to aid iterator comparison.
+ const int end_index_;
+}; // class RangeGenerator
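Editorial aside, not part of the header: RangeGenerator is what testing::Range() returns under the hood. A hedged sketch of the user-facing pattern (fixture name and bounds are invented); note the end value is never generated, matching CalculateEndIndex() above.

#include "gtest/gtest.h"

// Hypothetical fixture parameterized over an int produced by Range().
class PortNumberTest : public ::testing::TestWithParam<int> {};

TEST_P(PortNumberTest, IsWithinExpectedWindow) {
  EXPECT_GE(GetParam(), 8000);
  EXPECT_LT(GetParam(), 8010);
}

// Range(8000, 8010, 2) yields 8000, 8002, 8004, 8006, 8008; the end value
// itself is never produced (see CalculateEndIndex() above).
INSTANTIATE_TEST_CASE_P(EvenPorts, PortNumberTest,
                        ::testing::Range(8000, 8010, 2));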
+
+
+// Generates values from a pair of STL-style iterators. Used in the
+// ValuesIn() function. The elements are copied from the source range
+// since the source can be located on the stack, and the generator
+// is likely to persist beyond that stack frame.
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+ template <typename ForwardIterator>
+ ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+ : container_(begin, end) {}
+ virtual ~ValuesInIteratorRangeGenerator() {}
+
+ virtual ParamIteratorInterface<T>* Begin() const {
+ return new Iterator(this, container_.begin());
+ }
+ virtual ParamIteratorInterface<T>* End() const {
+ return new Iterator(this, container_.end());
+ }
+
+ private:
+ typedef typename ::std::vector<T> ContainerType;
+
+ class Iterator : public ParamIteratorInterface<T> {
+ public:
+ Iterator(const ParamGeneratorInterface<T>* base,
+ typename ContainerType::const_iterator iterator)
+ : base_(base), iterator_(iterator) {}
+ virtual ~Iterator() {}
+
+ virtual const ParamGeneratorInterface<T>* BaseGenerator() const {
+ return base_;
+ }
+ virtual void Advance() {
+ ++iterator_;
+ value_.reset();
+ }
+ virtual ParamIteratorInterface<T>* Clone() const {
+ return new Iterator(*this);
+ }
+ // We need to use cached value referenced by iterator_ because *iterator_
+ // can return a temporary object (and of type other than T), so just
+ // having "return &*iterator_;" doesn't work.
+ // value_ is updated here and not in Advance() because Advance()
+ // can advance iterator_ beyond the end of the range, and we cannot
+ // detect that fact. The client code, on the other hand, is
+ // responsible for not calling Current() on an out-of-range iterator.
+ virtual const T* Current() const {
+ if (value_.get() == NULL)
+ value_.reset(new T(*iterator_));
+ return value_.get();
+ }
+ virtual bool Equals(const ParamIteratorInterface<T>& other) const {
+ // Having the same base generator guarantees that the other
+ // iterator is of the same type and we can downcast.
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+ << "The program attempted to compare iterators "
+ << "from different generators." << std::endl;
+ return iterator_ ==
+ CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+ }
+
+ private:
+ Iterator(const Iterator& other)
+ // The explicit constructor call suppresses a false warning
+ // emitted by gcc when supplied with the -Wextra option.
+ : ParamIteratorInterface<T>(),
+ base_(other.base_),
+ iterator_(other.iterator_) {}
+
+ const ParamGeneratorInterface<T>* const base_;
+ typename ContainerType::const_iterator iterator_;
+ // A cached value of *iterator_. We keep it here to allow access by
+ // pointer in the wrapping iterator's operator->().
+ // value_ needs to be mutable to be accessed in Current().
+ // Use of scoped_ptr helps manage cached value's lifetime,
+ // which is bound by the lifespan of the iterator itself.
+ mutable scoped_ptr<const T> value_;
+ }; // class ValuesInIteratorRangeGenerator::Iterator
+
+ // No implementation - assignment is unsupported.
+ void operator=(const ValuesInIteratorRangeGenerator& other);
+
+ const ContainerType container_;
+}; // class ValuesInIteratorRangeGenerator
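Editorial aside, not part of the header: ValuesInIteratorRangeGenerator copies the source range, which is why the common ValuesIn() pattern sketched below is safe even though the container is a temporary. The fixture and values are illustrative only.

#include <string>
#include <vector>

#include "gtest/gtest.h"

// Hypothetical fixture parameterized over a std::string.
class CodecNameTest : public ::testing::TestWithParam<std::string> {};

TEST_P(CodecNameTest, NameIsNotEmpty) {
  EXPECT_FALSE(GetParam().empty());
}

// The vector is a temporary, but ValuesIn() copies its elements into the
// generator (the container_(begin, end) member above), so the generator
// safely outlives it.
static std::vector<std::string> MakeCodecNames() {
  std::vector<std::string> names;
  names.push_back("av1");
  names.push_back("vp9");
  return names;
}

INSTANTIATE_TEST_CASE_P(KnownCodecs, CodecNameTest,
                        ::testing::ValuesIn(MakeCodecNames()));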
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Default parameterized test name generator, returns a string containing the
+// integer test parameter index.
+template <class ParamType>
+std::string DefaultParamName(const TestParamInfo<ParamType>& info) {
+ Message name_stream;
+ name_stream << info.index;
+ return name_stream.GetString();
+}
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Parameterized test name overload helpers, which help the
+// INSTANTIATE_TEST_CASE_P macro choose between the default parameterized
+// test name generator and user param name generator.
+template <class ParamType, class ParamNameGenFunctor>
+ParamNameGenFunctor GetParamNameGen(ParamNameGenFunctor func) {
+ return func;
+}
+
+template <class ParamType>
+struct ParamNameGenFunc {
+ typedef std::string Type(const TestParamInfo<ParamType>&);
+};
+
+template <class ParamType>
+typename ParamNameGenFunc<ParamType>::Type *GetParamNameGen() {
+ return DefaultParamName;
+}
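Editorial aside, not part of the header: these GetParamNameGen() overloads back the optional fourth argument of INSTANTIATE_TEST_CASE_P. A hedged sketch with an invented fixture and naming function; testing::PrintToStringParamName, defined earlier in this header, could be passed instead.

#include <string>

#include "gtest/gtest.h"

// Hypothetical fixture parameterized over a bit depth.
class BitDepthTest : public ::testing::TestWithParam<int> {};

TEST_P(BitDepthTest, DepthIsSupported) {
  EXPECT_TRUE(GetParam() == 8 || GetParam() == 10 || GetParam() == 12);
}

// Invented name generator: produces Depth8/Depth10/Depth12 instead of the
// default indices 0/1/2. Names must stay alphanumeric (see IsValidParamName
// later in this header).
static std::string DepthName(const ::testing::TestParamInfo<int>& info) {
  ::testing::Message name;
  name << "Depth" << info.param;
  return name.GetString();
}

// The optional fourth argument is routed through GetParamNameGen() above.
INSTANTIATE_TEST_CASE_P(HighBitDepth, BitDepthTest,
                        ::testing::Values(8, 10, 12), DepthName);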
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a parameter value and later creates tests parameterized with that
+// value.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+ typedef typename TestClass::ParamType ParamType;
+ explicit ParameterizedTestFactory(ParamType parameter) :
+ parameter_(parameter) {}
+ virtual Test* CreateTest() {
+ TestClass::SetParam(&parameter_);
+ return new TestClass();
+ }
+
+ private:
+ const ParamType parameter_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+ virtual ~TestMetaFactoryBase() {}
+
+ virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories for passing into
+// the MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
+// ownership of the test factory pointer, the same factory object cannot be
+// passed into that method twice. But ParameterizedTestCaseInfo is going to
+// call it for each Test/Parameter value combination. Thus it needs a meta
+// factory creator class.
+template <class TestCase>
+class TestMetaFactory
+ : public TestMetaFactoryBase<typename TestCase::ParamType> {
+ public:
+ typedef typename TestCase::ParamType ParamType;
+
+ TestMetaFactory() {}
+
+ virtual TestFactoryBase* CreateTestFactory(ParamType parameter) {
+ return new ParameterizedTestFactory<TestCase>(parameter);
+ }
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfoBase is a generic interface
+// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations
+// and uses that information to register all resulting test instances
+// in RegisterTests method. The ParameterizeTestCaseRegistry class holds
+// a collection of pointers to the ParameterizedTestCaseInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestCaseInfoBase {
+ public:
+ virtual ~ParameterizedTestCaseInfoBase() {}
+
+ // Base part of test case name for display purposes.
+ virtual const string& GetTestCaseName() const = 0;
+ // Test case id to verify identity.
+ virtual TypeId GetTestCaseTypeId() const = 0;
+ // UnitTest class invokes this method to register tests in this
+ // test case right before running them in the RUN_ALL_TESTS macro.
+ // This method should not be called more than once on any single
+ // instance of a ParameterizedTestCaseInfoBase derived class.
+ virtual void RegisterTests() = 0;
+
+ protected:
+ ParameterizedTestCaseInfoBase() {}
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test case and generators
+// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that
+// test case. It registers tests with all values generated by all
+// generators when asked.
+template <class TestCase>
+class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase {
+ public:
+ // ParamType and GeneratorCreationFunc are private types but are required
+ // for declarations of public methods AddTestPattern() and
+ // AddTestCaseInstantiation().
+ typedef typename TestCase::ParamType ParamType;
+ // A function that returns an instance of appropriate generator type.
+ typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+ typedef typename ParamNameGenFunc<ParamType>::Type ParamNameGeneratorFunc;
+
+ explicit ParameterizedTestCaseInfo(
+ const char* name, CodeLocation code_location)
+ : test_case_name_(name), code_location_(code_location) {}
+
+ // Test case base name for display purposes.
+ virtual const string& GetTestCaseName() const { return test_case_name_; }
+ // Test case id to verify identity.
+ virtual TypeId GetTestCaseTypeId() const { return GetTypeId<TestCase>(); }
+ // TEST_P macro uses AddTestPattern() to record information
+ // about a single test in a LocalTestInfo structure.
+ // test_case_name is the base name of the test case (without invocation
+ // prefix). test_base_name is the name of an individual test without
+ // parameter index. For the test SequenceA/FooTest.DoBar/1, FooTest is the
+ // test case base name and DoBar is the test base name.
+ void AddTestPattern(const char* test_case_name,
+ const char* test_base_name,
+ TestMetaFactoryBase<ParamType>* meta_factory) {
+ tests_.push_back(linked_ptr<TestInfo>(new TestInfo(test_case_name,
+ test_base_name,
+ meta_factory)));
+ }
+ // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information
+ // about a generator.
+ int AddTestCaseInstantiation(const string& instantiation_name,
+ GeneratorCreationFunc* func,
+ ParamNameGeneratorFunc* name_func,
+ const char* file,
+ int line) {
+ instantiations_.push_back(
+ InstantiationInfo(instantiation_name, func, name_func, file, line));
+ return 0; // Return value used only to run this method in namespace scope.
+ }
+ // UnitTest class invokes this method to register tests in this test case
+ // right before running tests in the RUN_ALL_TESTS macro.
+ // This method should not be called more than once on any single
+ // instance of a ParameterizedTestCaseInfoBase derived class.
+ // UnitTest has a guard to prevent calling this method more than once.
+ virtual void RegisterTests() {
+ for (typename TestInfoContainer::iterator test_it = tests_.begin();
+ test_it != tests_.end(); ++test_it) {
+ linked_ptr<TestInfo> test_info = *test_it;
+ for (typename InstantiationContainer::iterator gen_it =
+ instantiations_.begin(); gen_it != instantiations_.end();
+ ++gen_it) {
+ const string& instantiation_name = gen_it->name;
+ ParamGenerator<ParamType> generator((*gen_it->generator)());
+ ParamNameGeneratorFunc* name_func = gen_it->name_func;
+ const char* file = gen_it->file;
+ int line = gen_it->line;
+
+ string test_case_name;
+ if ( !instantiation_name.empty() )
+ test_case_name = instantiation_name + "/";
+ test_case_name += test_info->test_case_base_name;
+
+ size_t i = 0;
+ std::set<std::string> test_param_names;
+ for (typename ParamGenerator<ParamType>::iterator param_it =
+ generator.begin();
+ param_it != generator.end(); ++param_it, ++i) {
+ Message test_name_stream;
+
+ std::string param_name = name_func(
+ TestParamInfo<ParamType>(*param_it, i));
+
+ GTEST_CHECK_(IsValidParamName(param_name))
+ << "Parameterized test name '" << param_name
+ << "' is invalid, in " << file
+ << " line " << line << std::endl;
+
+ GTEST_CHECK_(test_param_names.count(param_name) == 0)
+ << "Duplicate parameterized test name '" << param_name
+ << "', in " << file << " line " << line << std::endl;
+
+ test_param_names.insert(param_name);
+
+ test_name_stream << test_info->test_base_name << "/" << param_name;
+ MakeAndRegisterTestInfo(
+ test_case_name.c_str(),
+ test_name_stream.GetString().c_str(),
+ NULL, // No type parameter.
+ PrintToString(*param_it).c_str(),
+ code_location_,
+ GetTestCaseTypeId(),
+ TestCase::SetUpTestCase,
+ TestCase::TearDownTestCase,
+ test_info->test_meta_factory->CreateTestFactory(*param_it));
+ } // for param_it
+ } // for gen_it
+ } // for test_it
+ } // RegisterTests
+
+ private:
+ // LocalTestInfo structure keeps information about a single test registered
+ // with TEST_P macro.
+ struct TestInfo {
+ TestInfo(const char* a_test_case_base_name,
+ const char* a_test_base_name,
+ TestMetaFactoryBase<ParamType>* a_test_meta_factory) :
+ test_case_base_name(a_test_case_base_name),
+ test_base_name(a_test_base_name),
+ test_meta_factory(a_test_meta_factory) {}
+
+ const string test_case_base_name;
+ const string test_base_name;
+ const scoped_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+ };
+ typedef ::std::vector<linked_ptr<TestInfo> > TestInfoContainer;
+ // Records data received from INSTANTIATE_TEST_CASE_P macros:
+ // <Instantiation name, Sequence generator creation function,
+ // Name generator function, Source file, Source line>
+ struct InstantiationInfo {
+ InstantiationInfo(const std::string &name_in,
+ GeneratorCreationFunc* generator_in,
+ ParamNameGeneratorFunc* name_func_in,
+ const char* file_in,
+ int line_in)
+ : name(name_in),
+ generator(generator_in),
+ name_func(name_func_in),
+ file(file_in),
+ line(line_in) {}
+
+ std::string name;
+ GeneratorCreationFunc* generator;
+ ParamNameGeneratorFunc* name_func;
+ const char* file;
+ int line;
+ };
+ typedef ::std::vector<InstantiationInfo> InstantiationContainer;
+
+ static bool IsValidParamName(const std::string& name) {
+ // Check for empty string
+ if (name.empty())
+ return false;
+
+ // Check for invalid characters
+ for (std::string::size_type index = 0; index < name.size(); ++index) {
+ if (!isalnum(name[index]) && name[index] != '_')
+ return false;
+ }
+
+ return true;
+ }
+
+ const string test_case_name_;
+ CodeLocation code_location_;
+ TestInfoContainer tests_;
+ InstantiationContainer instantiations_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo);
+}; // class ParameterizedTestCaseInfo
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase
+// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P
+// macros use it to locate their corresponding ParameterizedTestCaseInfo
+// descriptors.
+class ParameterizedTestCaseRegistry {
+ public:
+ ParameterizedTestCaseRegistry() {}
+ ~ParameterizedTestCaseRegistry() {
+ for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+ it != test_case_infos_.end(); ++it) {
+ delete *it;
+ }
+ }
+
+ // Looks up or creates and returns a structure containing information about
+ // tests and instantiations of a particular test case.
+ template <class TestCase>
+ ParameterizedTestCaseInfo<TestCase>* GetTestCasePatternHolder(
+ const char* test_case_name,
+ CodeLocation code_location) {
+ ParameterizedTestCaseInfo<TestCase>* typed_test_info = NULL;
+ for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+ it != test_case_infos_.end(); ++it) {
+ if ((*it)->GetTestCaseName() == test_case_name) {
+ if ((*it)->GetTestCaseTypeId() != GetTypeId<TestCase>()) {
+ // Complain about incorrect usage of Google Test facilities
+ // and terminate the program since we cannot guarantee correct
+ // test case setup and tear-down in this case.
+ ReportInvalidTestCaseType(test_case_name, code_location);
+ posix::Abort();
+ } else {
+ // At this point we are sure that the object we found is of the same
+ // type we are looking for, so we downcast it to that type
+ // without further checks.
+ typed_test_info = CheckedDowncastToActualType<
+ ParameterizedTestCaseInfo<TestCase> >(*it);
+ }
+ break;
+ }
+ }
+ if (typed_test_info == NULL) {
+ typed_test_info = new ParameterizedTestCaseInfo<TestCase>(
+ test_case_name, code_location);
+ test_case_infos_.push_back(typed_test_info);
+ }
+ return typed_test_info;
+ }
+ void RegisterTests() {
+ for (TestCaseInfoContainer::iterator it = test_case_infos_.begin();
+ it != test_case_infos_.end(); ++it) {
+ (*it)->RegisterTests();
+ }
+ }
+
+ private:
+ typedef ::std::vector<ParameterizedTestCaseInfoBase*> TestCaseInfoContainer;
+
+ TestCaseInfoContainer test_case_infos_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry);
+};
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_HAS_PARAM_TEST
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
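Editorial aside, not part of the patch: a self-contained sketch of how the registry above is exercised in practice. TEST_P records the pattern via AddTestPattern(), INSTANTIATE_TEST_CASE_P records a generator via AddTestCaseInstantiation(), and RegisterTests() crosses the two before tests run. The fixture and values are invented.

#include "gtest/gtest.h"

// Hypothetical fixture; ParamType is const char* here.
class GreetingTest : public ::testing::TestWithParam<const char*> {};

// TEST_P records this pattern through
// ParameterizedTestCaseRegistry::GetTestCasePatternHolder<GreetingTest>().
TEST_P(GreetingTest, StartsWithUppercase) {
  const char* word = GetParam();
  ASSERT_TRUE(word != NULL);
  EXPECT_TRUE(word[0] >= 'A' && word[0] <= 'Z');
}

// INSTANTIATE_TEST_CASE_P records a generator; RegisterTests() later expands
// it into Words/GreetingTest.StartsWithUppercase/0 and .../1.
INSTANTIATE_TEST_CASE_P(Words, GreetingTest,
                        ::testing::Values("Hello", "Goodbye"));

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}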
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
new file mode 100644
index 000000000..74ab94905
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h
@@ -0,0 +1,93 @@
+// Copyright 2015, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file defines the GTEST_OS_* macro.
+// It is separate from gtest-port.h so that custom/gtest-port.h can include it.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+
+// Determines the platform on which Google Test is compiled.
+#ifdef __CYGWIN__
+# define GTEST_OS_CYGWIN 1
+#elif defined __SYMBIAN32__
+# define GTEST_OS_SYMBIAN 1
+#elif defined _WIN32
+# define GTEST_OS_WINDOWS 1
+# ifdef _WIN32_WCE
+# define GTEST_OS_WINDOWS_MOBILE 1
+# elif defined(__MINGW__) || defined(__MINGW32__)
+# define GTEST_OS_WINDOWS_MINGW 1
+# elif defined(WINAPI_FAMILY)
+# include <winapifamily.h>
+# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+# define GTEST_OS_WINDOWS_DESKTOP 1
+# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+# define GTEST_OS_WINDOWS_PHONE 1
+# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+# define GTEST_OS_WINDOWS_RT 1
+# else
+ // WINAPI_FAMILY defined but no known partition matched.
+ // Default to desktop.
+# define GTEST_OS_WINDOWS_DESKTOP 1
+# endif
+# else
+# define GTEST_OS_WINDOWS_DESKTOP 1
+# endif // _WIN32_WCE
+#elif defined __APPLE__
+# define GTEST_OS_MAC 1
+# if TARGET_OS_IPHONE
+# define GTEST_OS_IOS 1
+# endif
+#elif defined __FreeBSD__
+# define GTEST_OS_FREEBSD 1
+#elif defined __linux__
+# define GTEST_OS_LINUX 1
+# if defined __ANDROID__
+# define GTEST_OS_LINUX_ANDROID 1
+# endif
+#elif defined __MVS__
+# define GTEST_OS_ZOS 1
+#elif defined(__sun) && defined(__SVR4)
+# define GTEST_OS_SOLARIS 1
+#elif defined(_AIX)
+# define GTEST_OS_AIX 1
+#elif defined(__hpux)
+# define GTEST_OS_HPUX 1
+#elif defined __native_client__
+# define GTEST_OS_NACL 1
+#elif defined __OpenBSD__
+# define GTEST_OS_OPENBSD 1
+#elif defined __QNX__
+# define GTEST_OS_QNX 1
+#endif // __CYGWIN__
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
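Editorial aside, not part of the patch: test code consumes these GTEST_OS_* macros with plain #if guards, since each is either defined to 1 or left undefined. A small invented example:

#include "gtest/gtest.h"

TEST(PortabilityTest, PicksPlatformSpecificSeparator) {
  // GTEST_OS_* macros are defined to 1 on their platform and left undefined
  // elsewhere, so a plain #if works in both states.
#if GTEST_OS_WINDOWS
  const char kSep = '\\';
#else
  const char kSep = '/';
#endif
  EXPECT_TRUE(kSep == '/' || kSep == '\\');
}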
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
new file mode 100644
index 000000000..da57e65d3
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h
@@ -0,0 +1,2567 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan)
+//
+// Low-level types and utilities for porting Google Test to various
+// platforms. All macros ending with _ and symbols defined in an
+// internal namespace are subject to change without notice. Code
+// outside Google Test MUST NOT USE THEM DIRECTLY. Macros that don't
+// end with _ are part of Google Test's public API and can be used by
+// code outside Google Test.
+//
+// This file is fundamental to Google Test. All other Google Test source
+// files are expected to #include this. Therefore, it cannot #include
+// any other Google Test header.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+
+// Environment-describing macros
+// -----------------------------
+//
+// Google Test can be used in many different environments. Macros in
+// this section tell Google Test what kind of environment it is being
+// used in, such that Google Test can provide environment-specific
+// features and implementations.
+//
+// Google Test tries to automatically detect the properties of its
+// environment, so users usually don't need to worry about these
+// macros. However, the automatic detection is not perfect.
+// Sometimes it's necessary for a user to define some of the following
+// macros in the build script to override Google Test's decisions.
+//
+// If the user doesn't define a macro in the list, Google Test will
+// provide a default definition. After this header is #included, all
+// macros in this list will be defined to either 1 or 0.
+//
+// Notes to maintainers:
+// - Each macro here is a user-tweakable knob; do not grow the list
+// lightly.
+// - Use #if to key off these macros. Don't use #ifdef or "#if
+// defined(...)", which will not work as these macros are ALWAYS
+// defined.
+//
+// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2)
+// is/isn't available.
+// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions
+// are enabled.
+// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string
+// is/isn't available (some systems define
+// ::string, which is different to std::string).
+// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring
+// is/isn't available (some systems define
+// ::wstring, which is different to std::wstring).
+// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular
+// expressions are/aren't available.
+// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that <pthread.h>
+// is/isn't available.
+// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't
+// enabled.
+// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that
+// std::wstring does/doesn't work (Google Test can
+// be used where std::wstring is unavailable).
+// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple
+// is/isn't available.
+// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the
+// compiler supports Microsoft's "Structured
+// Exception Handling".
+// GTEST_HAS_STREAM_REDIRECTION
+// - Define it to 1/0 to indicate whether the
+// platform supports I/O stream redirection using
+// dup() and dup2().
+// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google
+// Test's own tr1 tuple implementation should be
+// used. Unused when the user sets
+// GTEST_HAS_TR1_TUPLE to 0.
+// GTEST_LANG_CXX11 - Define it to 1/0 to indicate that Google Test
+// is building in C++11/C++98 mode.
+// GTEST_LINKED_AS_SHARED_LIBRARY
+// - Define to 1 when compiling tests that use
+// Google Test as a shared library (known as
+// DLL on Windows).
+// GTEST_CREATE_SHARED_LIBRARY
+// - Define to 1 when compiling Google Test itself
+// as a shared library.
+
+// Platform-indicating macros
+// --------------------------
+//
+// Macros indicating the platform on which Google Test is being used
+// (a macro is defined to 1 if compiled on the given platform;
+// otherwise UNDEFINED -- it's never defined to 0.). Google Test
+// defines these macros automatically. Code outside Google Test MUST
+// NOT define them.
+//
+// GTEST_OS_AIX - IBM AIX
+// GTEST_OS_CYGWIN - Cygwin
+// GTEST_OS_FREEBSD - FreeBSD
+// GTEST_OS_HPUX - HP-UX
+// GTEST_OS_LINUX - Linux
+// GTEST_OS_LINUX_ANDROID - Google Android
+// GTEST_OS_MAC - Mac OS X
+// GTEST_OS_IOS - iOS
+// GTEST_OS_NACL - Google Native Client (NaCl)
+// GTEST_OS_OPENBSD - OpenBSD
+// GTEST_OS_QNX - QNX
+// GTEST_OS_SOLARIS - Sun Solaris
+// GTEST_OS_SYMBIAN - Symbian
+// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile)
+// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop
+// GTEST_OS_WINDOWS_MINGW - MinGW
+// GTEST_OS_WINDOWS_MOBILE - Windows Mobile
+// GTEST_OS_WINDOWS_PHONE - Windows Phone
+// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT
+// GTEST_OS_ZOS - z/OS
+//
+// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the
+// most stable support. Since core members of the Google Test project
+// don't have access to other platforms, support for them may be less
+// stable. If you notice any problems on your platform, please notify
+// googletestframework@googlegroups.com (patches for fixing them are
+// even more welcome!).
+//
+// It is possible that none of the GTEST_OS_* macros are defined.
+
+// Feature-indicating macros
+// -------------------------
+//
+// Macros indicating which Google Test features are available (a macro
+// is defined to 1 if the corresponding feature is supported;
+// otherwise UNDEFINED -- it's never defined to 0.). Google Test
+// defines these macros automatically. Code outside Google Test MUST
+// NOT define them.
+//
+// These macros are public so that portable tests can be written.
+// Such tests typically surround code using a feature with an #if
+// which controls that code. For example:
+//
+// #if GTEST_HAS_DEATH_TEST
+// EXPECT_DEATH(DoSomethingDeadly());
+// #endif
+//
+// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized
+// tests)
+// GTEST_HAS_DEATH_TEST - death tests
+// GTEST_HAS_PARAM_TEST - value-parameterized tests
+// GTEST_HAS_TYPED_TEST - typed tests
+// GTEST_HAS_TYPED_TEST_P - type-parameterized tests
+// GTEST_IS_THREADSAFE - Google Test is thread-safe.
+// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with
+// GTEST_HAS_POSIX_RE (see above) which users can
+// define themselves.
+// GTEST_USES_SIMPLE_RE - our own simple regex is used;
+// the above two are mutually exclusive.
+// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ().
+
+// Misc public macros
+// ------------------
+//
+// GTEST_FLAG(flag_name) - references the variable corresponding to
+// the given Google Test flag.
+
+// Internal utilities
+// ------------------
+//
+// The following macros and utilities are for Google Test's INTERNAL
+// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY.
+//
+// Macros for basic C++ coding:
+// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
+// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a
+// variable don't have to be used.
+// GTEST_DISALLOW_ASSIGN_ - disables operator=.
+// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
+// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used.
+// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
+// suppressed (constant conditional).
+// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127
+// is suppressed.
+//
+// C++11 feature wrappers:
+//
+// testing::internal::move - portability wrapper for std::move.
+//
+// Synchronization:
+// Mutex, MutexLock, ThreadLocal, GetThreadCount()
+// - synchronization primitives.
+//
+// Template meta programming:
+// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only.
+// IteratorTraits - partial implementation of std::iterator_traits, which
+// is not available in libCstd when compiled with Sun C++.
+//
+// Smart pointers:
+// scoped_ptr - as in TR2.
+//
+// Regular expressions:
+// RE - a simple regular expression class using the POSIX
+// Extended Regular Expression syntax on UNIX-like
+// platforms, or a reduced regular expression syntax on
+// other platforms, including Windows.
+//
+// Logging:
+// GTEST_LOG_() - logs messages at the specified severity level.
+// LogToStderr() - directs all log messages to stderr.
+// FlushInfoLog() - flushes informational log messages.
+//
+// Stdout and stderr capturing:
+// CaptureStdout() - starts capturing stdout.
+// GetCapturedStdout() - stops capturing stdout and returns the captured
+// string.
+// CaptureStderr() - starts capturing stderr.
+// GetCapturedStderr() - stops capturing stderr and returns the captured
+// string.
+//
+// Integer types:
+// TypeWithSize - maps an integer to an int type.
+// Int32, UInt32, Int64, UInt64, TimeInMillis
+// - integers of known sizes.
+// BiggestInt - the biggest signed integer type.
+//
+// Command-line utilities:
+// GTEST_DECLARE_*() - declares a flag.
+// GTEST_DEFINE_*() - defines a flag.
+// GetInjectableArgvs() - returns the command line as a vector of strings.
+//
+// Environment variable utilities:
+// GetEnv() - gets the value of an environment variable.
+// BoolFromGTestEnv() - parses a bool environment variable.
+// Int32FromGTestEnv() - parses an Int32 environment variable.
+// StringFromGTestEnv() - parses a string environment variable.
+
+#include <ctype.h> // for isspace, etc
+#include <stddef.h> // for ptrdiff_t
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#ifndef _WIN32_WCE
+# include <sys/types.h>
+# include <sys/stat.h>
+#endif // !_WIN32_WCE
+
+#if defined __APPLE__
+# include <AvailabilityMacros.h>
+# include <TargetConditionals.h>
+#endif
+
+#include <algorithm> // NOLINT
+#include <iostream> // NOLINT
+#include <sstream> // NOLINT
+#include <string> // NOLINT
+#include <utility>
+#include <vector> // NOLINT
+
+#include "gtest/internal/gtest-port-arch.h"
+#include "gtest/internal/custom/gtest-port.h"
+
+#if !defined(GTEST_DEV_EMAIL_)
+# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+# define GTEST_FLAG_PREFIX_ "gtest_"
+# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+# define GTEST_NAME_ "Google Test"
+# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#endif // !defined(GTEST_DEV_EMAIL_)
+
+#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
+# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
+
+// Determines the version of gcc that is used to compile this.
+#ifdef __GNUC__
+// 40302 means version 4.3.2.
+# define GTEST_GCC_VER_ \
+ (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#endif // __GNUC__
+
+// Macros for disabling Microsoft Visual C++ warnings.
+//
+// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385)
+// /* code that triggers warnings C4800 and C4385 */
+// GTEST_DISABLE_MSC_WARNINGS_POP_()
+#if _MSC_VER >= 1500
+# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+ __pragma(warning(push)) \
+ __pragma(warning(disable: warnings))
+# define GTEST_DISABLE_MSC_WARNINGS_POP_() \
+ __pragma(warning(pop))
+#else
+// Older versions of MSVC don't have __pragma.
+# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+# define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+#ifndef GTEST_LANG_CXX11
+// gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when
+// -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a
+// value for __cplusplus, and recent versions of clang, gcc, and
+// probably other compilers set that too in C++11 mode.
+# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L
+// Compiling in at least C++11 mode.
+# define GTEST_LANG_CXX11 1
+# else
+# define GTEST_LANG_CXX11 0
+# endif
+#endif
+
+// Distinct from C++11 language support, some environments don't provide
+// proper C++11 library support. Notably, it's possible to build in
+// C++11 mode when targeting Mac OS X 10.6, which has an old libstdc++
+// with no C++11 support.
+//
+// libstdc++ has sufficient C++11 support as of GCC 4.6.0, __GLIBCXX__
+// 20110325, but maintenance releases in the 4.4 and 4.5 series followed
+// this date, so check for those versions by their date stamps.
+// https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html#abi.versioning
+#if GTEST_LANG_CXX11 && \
+ (!defined(__GLIBCXX__) || ( \
+ __GLIBCXX__ >= 20110325ul && /* GCC >= 4.6.0 */ \
+ /* Blacklist of patch releases of older branches: */ \
+ __GLIBCXX__ != 20110416ul && /* GCC 4.4.6 */ \
+ __GLIBCXX__ != 20120313ul && /* GCC 4.4.7 */ \
+ __GLIBCXX__ != 20110428ul && /* GCC 4.5.3 */ \
+ __GLIBCXX__ != 20120702ul)) /* GCC 4.5.4 */
+# define GTEST_STDLIB_CXX11 1
+#endif
+
+// Only use C++11 library features if the library provides them.
+#if GTEST_STDLIB_CXX11
+# define GTEST_HAS_STD_BEGIN_AND_END_ 1
+# define GTEST_HAS_STD_FORWARD_LIST_ 1
+# define GTEST_HAS_STD_FUNCTION_ 1
+# define GTEST_HAS_STD_INITIALIZER_LIST_ 1
+# define GTEST_HAS_STD_MOVE_ 1
+# define GTEST_HAS_STD_SHARED_PTR_ 1
+# define GTEST_HAS_STD_TYPE_TRAITS_ 1
+# define GTEST_HAS_STD_UNIQUE_PTR_ 1
+#endif
+
+// C++11 specifies that <tuple> provides std::tuple.
+// Some platforms still might not have it, however.
+#if GTEST_LANG_CXX11
+# define GTEST_HAS_STD_TUPLE_ 1
+# if defined(__clang__)
+// Inspired by http://clang.llvm.org/docs/LanguageExtensions.html#__has_include
+# if defined(__has_include) && !__has_include(<tuple>)
+# undef GTEST_HAS_STD_TUPLE_
+# endif
+# elif defined(_MSC_VER)
+// Inspired by boost/config/stdlib/dinkumware.hpp
+# if defined(_CPPLIB_VER) && _CPPLIB_VER < 520
+# undef GTEST_HAS_STD_TUPLE_
+# endif
+# elif defined(__GLIBCXX__)
+// Inspired by boost/config/stdlib/libstdcpp3.hpp,
+// http://gcc.gnu.org/gcc-4.2/changes.html and
+// http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x
+# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
+# undef GTEST_HAS_STD_TUPLE_
+# endif
+# endif
+#endif
+
+// Brings in definitions for functions used in the testing::internal::posix
+// namespace (read, write, close, chdir, isatty, stat). We do not currently
+// use them on Windows Mobile.
+#if GTEST_OS_WINDOWS
+# if !GTEST_OS_WINDOWS_MOBILE
+# include <direct.h>
+# include <io.h>
+# endif
+// In order to avoid having to include <windows.h>, use forward declaration
+// assuming CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION.
+// This assumption is verified by
+// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
+struct _RTL_CRITICAL_SECTION;
+#else
+// This assumes that non-Windows OSes provide unistd.h. For OSes where this
+// is not the case, we need to include headers that provide the functions
+// mentioned above.
+# include <unistd.h>
+# include <strings.h>
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_LINUX_ANDROID
+// Used to define __ANDROID_API__ matching the target NDK API level.
+# include <android/api-level.h> // NOLINT
+#endif
+
+// Defines this to true iff Google Test can use POSIX regular expressions.
+#ifndef GTEST_HAS_POSIX_RE
+# if GTEST_OS_LINUX_ANDROID
+// On Android, <regex.h> is only available starting with Gingerbread.
+# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+# else
+# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+# endif
+#endif
+
+#if GTEST_USES_PCRE
+// The appropriate headers have already been included.
+
+#elif GTEST_HAS_POSIX_RE
+
+// On some platforms, <regex.h> needs someone to define size_t, and
+// won't compile otherwise. We can #include it here as we already
+// included <stdlib.h>, which is guaranteed to define size_t through
+// <stddef.h>.
+# include <regex.h> // NOLINT
+
+# define GTEST_USES_POSIX_RE 1
+
+#elif GTEST_OS_WINDOWS
+
+// <regex.h> is not available on Windows. Use our own simple regex
+// implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#else
+
+// <regex.h> may not be available on this platform. Use our own
+// simple regex implementation instead.
+# define GTEST_USES_SIMPLE_RE 1
+
+#endif // GTEST_USES_PCRE
+
+#ifndef GTEST_HAS_EXCEPTIONS
+// The user didn't tell us whether exceptions are enabled, so we need
+// to figure it out.
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+# ifndef _HAS_EXCEPTIONS
+# define _HAS_EXCEPTIONS 1
+# endif // _HAS_EXCEPTIONS
+# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+# elif defined(__clang__)
+// clang defines __EXCEPTIONS iff exceptions are enabled before clang 220714,
+// but iff cleanups are enabled after that. In Obj-C++ files, there can be
+// cleanups for ObjC exceptions which also need cleanups, even if C++ exceptions
+// are disabled. clang has __has_feature(cxx_exceptions) which checks for C++
+// exceptions starting at clang r206352, but which checked for cleanups prior to
+// that. To reliably check for C++ exception availability with clang, check for
+// __EXCEPTIONS && __has_feature(cxx_exceptions).
+# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+# elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled.
+# define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions. However, there is no compile-time way of
+// detecting whether they are enabled or not. Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+# define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled.
+# define GTEST_HAS_EXCEPTIONS 1
+# elif defined(__HP_aCC)
+// Exception handling is in effect by default in HP aCC compiler. It has to
+// be turned off by the +noeh compiler option if desired.
+# define GTEST_HAS_EXCEPTIONS 1
+# else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+# define GTEST_HAS_EXCEPTIONS 0
+# endif // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif // GTEST_HAS_EXCEPTIONS
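Editorial aside, not part of the patch: GTEST_HAS_EXCEPTIONS also gates gtest's exception assertions (EXPECT_THROW and friends), so portable tests guard them the same way. A hedged sketch with an invented function under test:

#include <stdexcept>

#include "gtest/gtest.h"

#if GTEST_HAS_EXCEPTIONS
// Invented function under test; only meaningful when exceptions are enabled.
static int ParsePositive(int value) {
  if (value <= 0)
    throw std::runtime_error("not positive");
  return value;
}
#endif

TEST(ParseTest, RejectsNonPositiveInput) {
#if GTEST_HAS_EXCEPTIONS
  EXPECT_THROW(ParsePositive(-1), std::runtime_error);
  EXPECT_EQ(7, ParsePositive(7));
#else
  SUCCEED() << "Exceptions are disabled; EXPECT_THROW is unavailable here.";
#endif
}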
+
+#if !defined(GTEST_HAS_STD_STRING)
+// Even though we don't use this macro any longer, we keep it in case
+// some clients still depend on it.
+# define GTEST_HAS_STD_STRING 1
+#elif !GTEST_HAS_STD_STRING
+// The user told us that ::std::string isn't available.
+# error "Google Test cannot be used where ::std::string isn't available."
+#endif // !defined(GTEST_HAS_STD_STRING)
+
+#ifndef GTEST_HAS_GLOBAL_STRING
+// The user didn't tell us whether ::string is available, so we need
+// to figure it out.
+
+# define GTEST_HAS_GLOBAL_STRING 0
+
+#endif // GTEST_HAS_GLOBAL_STRING
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// TODO(wan@google.com): use autoconf to detect whether ::std::wstring
+// is available.
+
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either. Android has
+// no support for it at least as recent as Froyo (2.2).
+# define GTEST_HAS_STD_WSTRING \
+ (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS))
+
+#endif // GTEST_HAS_STD_WSTRING
+
+#ifndef GTEST_HAS_GLOBAL_WSTRING
+// The user didn't tell us whether ::wstring is available, so we need
+// to figure it out.
+# define GTEST_HAS_GLOBAL_WSTRING \
+ (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING)
+#endif // GTEST_HAS_GLOBAL_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+# ifdef _MSC_VER
+
+# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled.
+# define GTEST_HAS_RTTI 1
+# else
+# define GTEST_HAS_RTTI 0
+# endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled.
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302)
+
+# ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure if this is an STL or toolchain bug,
+// so disable RTTI when detected.
+# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
+ !defined(__EXCEPTIONS)
+# define GTEST_HAS_RTTI 0
+# else
+# define GTEST_HAS_RTTI 1
+# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+# else
+# define GTEST_HAS_RTTI 0
+# endif // __GXX_RTTI
+
+// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
+// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
+// first version with C++ support.
+# elif defined(__clang__)
+
+# define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+
+// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
+// both the typeid and dynamic_cast features are present.
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+
+# ifdef __RTTI_ALL__
+# define GTEST_HAS_RTTI 1
+# else
+# define GTEST_HAS_RTTI 0
+# endif
+
+# else
+
+// For all other compilers, we assume RTTI is enabled.
+# define GTEST_HAS_RTTI 1
+
+# endif // _MSC_VER
+
+#endif // GTEST_HAS_RTTI
+
+// It's this header's responsibility to #include <typeinfo> when RTTI
+// is enabled.
+#if GTEST_HAS_RTTI
+# include <typeinfo>
+#endif
+
+// Determines whether Google Test can use the pthreads library.
+#ifndef GTEST_HAS_PTHREAD
+// The user didn't tell us explicitly, so we make reasonable assumptions about
+// which platforms have pthreads support.
+//
+// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0
+// to your compiler flags.
+# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \
+ || GTEST_OS_QNX || GTEST_OS_FREEBSD || GTEST_OS_NACL)
+#endif // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_PTHREAD
+// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
+// true.
+# include <pthread.h> // NOLINT
+
+// For timespec and nanosleep, used below.
+# include <time.h> // NOLINT
+#endif
+
+// Determines if hash_map/hash_set are available.
+// Only used for testing against those containers.
+#if !defined(GTEST_HAS_HASH_MAP_)
+# if _MSC_VER
+# define GTEST_HAS_HASH_MAP_ 1 // Indicates that hash_map is available.
+# define GTEST_HAS_HASH_SET_ 1 // Indicates that hash_set is available.
+# endif // _MSC_VER
+#endif // !defined(GTEST_HAS_HASH_MAP_)
+
+// Determines whether Google Test can use tr1/tuple. You can define
+// this macro to 0 to prevent Google Test from using tuple (any
+// feature depending on tuple will be disabled in this mode).
+#ifndef GTEST_HAS_TR1_TUPLE
+# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR)
+// STLport, provided with the Android NDK, has neither <tr1/tuple> nor <tuple>.
+# define GTEST_HAS_TR1_TUPLE 0
+# else
+// The user didn't tell us not to do it, so we assume it's OK.
+# define GTEST_HAS_TR1_TUPLE 1
+# endif
+#endif // GTEST_HAS_TR1_TUPLE
+
+// Determines whether Google Test's own tr1 tuple implementation
+// should be used.
+#ifndef GTEST_USE_OWN_TR1_TUPLE
+// The user didn't tell us, so we need to figure it out.
+
+// We use our own TR1 tuple if we aren't sure the user has an
+// implementation of it already. At this time, libstdc++ 4.0.0+ and
+// MSVC 2010 are the only mainstream standard libraries that come
+// with a TR1 tuple implementation. NVIDIA's CUDA NVCC compiler
+// pretends to be GCC by defining __GNUC__ and friends, but cannot
+// compile GCC's tuple implementation. MSVC 2008 (9.0) provides TR1
+// tuple in a 323 MB Feature Pack download, which we cannot assume the
+// user has. QNX's QCC compiler is a modified GCC but it doesn't
+// support TR1 tuple. libc++ only provides std::tuple, in C++11 mode,
+// and it can be used with some compilers that define __GNUC__.
+# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \
+ && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600
+# define GTEST_ENV_HAS_TR1_TUPLE_ 1
+# endif
+
+// C++11 specifies that <tuple> provides std::tuple. Use that if gtest is used
+// in C++11 mode and libstdc++ isn't very old (binaries targeting OS X 10.6
+// can build with clang but need to use gcc4.2's libstdc++).
+# if GTEST_LANG_CXX11 && (!defined(__GLIBCXX__) || __GLIBCXX__ > 20110325)
+# define GTEST_ENV_HAS_STD_TUPLE_ 1
+# endif
+
+# if GTEST_ENV_HAS_TR1_TUPLE_ || GTEST_ENV_HAS_STD_TUPLE_
+# define GTEST_USE_OWN_TR1_TUPLE 0
+# else
+# define GTEST_USE_OWN_TR1_TUPLE 1
+# endif
+
+#endif // GTEST_USE_OWN_TR1_TUPLE
+
+// To avoid conditional compilation everywhere, we make it
+// gtest-port.h's responsibility to #include the header implementing
+// tuple.
+#if GTEST_HAS_STD_TUPLE_
+# include <tuple> // IWYU pragma: export
+# define GTEST_TUPLE_NAMESPACE_ ::std
+#endif // GTEST_HAS_STD_TUPLE_
+
+// We include tr1::tuple even if std::tuple is available to define printers for
+// them.
+#if GTEST_HAS_TR1_TUPLE
+# ifndef GTEST_TUPLE_NAMESPACE_
+# define GTEST_TUPLE_NAMESPACE_ ::std::tr1
+# endif // GTEST_TUPLE_NAMESPACE_
+
+# if GTEST_USE_OWN_TR1_TUPLE
+# include "gtest/internal/gtest-tuple.h" // IWYU pragma: export // NOLINT
+# elif GTEST_ENV_HAS_STD_TUPLE_
+# include <tuple>
+// C++11 puts its tuple into the ::std namespace rather than
+// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there.
+// This causes undefined behavior, but supported compilers react in
+// the way we intend.
+namespace std {
+namespace tr1 {
+using ::std::get;
+using ::std::make_tuple;
+using ::std::tuple;
+using ::std::tuple_element;
+using ::std::tuple_size;
+}
+}
+
+# elif GTEST_OS_SYMBIAN
+
+// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to
+// use STLport's tuple implementation, which unfortunately doesn't
+// work as the copy of STLport distributed with Symbian is incomplete.
+// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to
+// use its own tuple implementation.
+# ifdef BOOST_HAS_TR1_TUPLE
+# undef BOOST_HAS_TR1_TUPLE
+# endif // BOOST_HAS_TR1_TUPLE
+
+// This prevents <boost/tr1/detail/config.hpp>, which defines
+// BOOST_HAS_TR1_TUPLE, from being #included by Boost's <tuple>.
+# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED
+# include <tuple> // IWYU pragma: export // NOLINT
+
+# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000)
+// GCC 4.0+ implements tr1/tuple in the <tr1/tuple> header. This does
+// not conform to the TR1 spec, which requires the header to be <tuple>.
+
+# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+// Until version 4.3.2, gcc has a bug that causes <tr1/functional>,
+// which is #included by <tr1/tuple>, to not compile when RTTI is
+// disabled. _TR1_FUNCTIONAL is the header guard for
+// <tr1/functional>. Hence the following #define is a hack to prevent
+// <tr1/functional> from being included.
+# define _TR1_FUNCTIONAL 1
+# include <tr1/tuple>
+# undef _TR1_FUNCTIONAL // Allows the user to #include
+ // <tr1/functional> if he chooses to.
+# else
+# include <tr1/tuple> // NOLINT
+# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302
+
+# else
+// If the compiler is not GCC 4.0+, we assume the user is using a
+// spec-conforming TR1 implementation.
+# include <tuple> // IWYU pragma: export // NOLINT
+# endif // GTEST_USE_OWN_TR1_TUPLE
+
+#endif // GTEST_HAS_TR1_TUPLE
+
+// Determines whether clone(2) is supported.
+// Usually it will only be available on Linux, excluding
+// Linux on the Itanium architecture.
+// Also see http://linux.die.net/man/2/clone.
+#ifndef GTEST_HAS_CLONE
+// The user didn't tell us, so we need to figure it out.
+
+# if GTEST_OS_LINUX && !defined(__ia64__)
+# if GTEST_OS_LINUX_ANDROID
+// On Android, clone() is only available on ARM starting with Gingerbread.
+# if defined(__arm__) && __ANDROID_API__ >= 9
+# define GTEST_HAS_CLONE 1
+# else
+# define GTEST_HAS_CLONE 0
+# endif
+# else
+# define GTEST_HAS_CLONE 1
+# endif
+# else
+# define GTEST_HAS_CLONE 0
+# endif // GTEST_OS_LINUX && !defined(__ia64__)
+
+#endif // GTEST_HAS_CLONE
+
+// Determines whether to support stream redirection. This is used to test
+// output correctness and to implement death tests.
+#ifndef GTEST_HAS_STREAM_REDIRECTION
+// By default, we assume that stream redirection is supported on all
+// platforms except known mobile ones.
+# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || \
+ GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+# define GTEST_HAS_STREAM_REDIRECTION 0
+# else
+# define GTEST_HAS_STREAM_REDIRECTION 1
+# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN
+#endif // GTEST_HAS_STREAM_REDIRECTION
+
+// Determines whether to support death tests.
+// Google Test does not support death tests for VC 7.1 and earlier as
+// abort() in a VC 7.1 application compiled as GUI in debug config
+// pops up a dialog window that cannot be suppressed programmatically.
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+ (GTEST_OS_MAC && !GTEST_OS_IOS) || \
+ (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \
+ GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \
+ GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD)
+# define GTEST_HAS_DEATH_TEST 1
+#endif
+
+// We don't support MSVC 7.1 with exceptions disabled now. Therefore
+// all the compilers we care about are adequate for supporting
+// value-parameterized tests.
+#define GTEST_HAS_PARAM_TEST 1
+
+// Determines whether to support type-driven tests.
+
+// Typed tests need <typeinfo> and variadic macros, which GCC, VC++ 8.0,
+// Sun Pro CC, IBM Visual Age, and HP aCC support.
+#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \
+ defined(__IBMCPP__) || defined(__HP_aCC)
+# define GTEST_HAS_TYPED_TEST 1
+# define GTEST_HAS_TYPED_TEST_P 1
+#endif
+
+// Determines whether to support Combine(). This only makes sense when
+// value-parameterized tests are enabled. The implementation doesn't
+// work on Sun Studio since it doesn't understand templated conversion
+// operators.
+#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC)
+# define GTEST_HAS_COMBINE 1
+#endif
+
+// Determines whether the system compiler uses UTF-16 for encoding wide strings.
+#define GTEST_WIDE_STRING_USES_UTF16_ \
+ (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX)
+
+// Determines whether test results can be streamed to a socket.
+#if GTEST_OS_LINUX
+# define GTEST_CAN_STREAM_RESULTS_ 1
+#endif
+
+// Defines some utility macros.
+
+// The GNU compiler emits a warning if nested "if" statements are followed by
+// an "else" statement and braces are not used to explicitly disambiguate the
+// "else" binding. This leads to problems with code like:
+//
+// if (gate)
+// ASSERT_*(condition) << "Some message";
+//
+// The "switch (0) case 0:" idiom is used to suppress this.
+#ifdef __INTEL_COMPILER
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#else
+# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT
+#endif
+
+// Use this annotation at the end of a struct/class definition to
+// prevent the compiler from optimizing away instances that are never
+// used. This is useful when all interesting logic happens inside the
+// c'tor and / or d'tor. Example:
+//
+// struct Foo {
+// Foo() { ... }
+// } GTEST_ATTRIBUTE_UNUSED_;
+//
+// Also use it after a variable or parameter declaration to tell the
+// compiler the variable/parameter does not have to be used.
+#if defined(__GNUC__) && !defined(COMPILER_ICC)
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#elif defined(__clang__)
+# if __has_attribute(unused)
+# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+# endif
+#endif
+#ifndef GTEST_ATTRIBUTE_UNUSED_
+# define GTEST_ATTRIBUTE_UNUSED_
+#endif
+
+// A macro to disallow operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_ASSIGN_(type)\
+ void operator=(type const &)
+
+// A macro to disallow copy constructor and operator=
+// This should be used in the private: declarations for a class.
+#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\
+ type(type const &);\
+ GTEST_DISALLOW_ASSIGN_(type)
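+
+// Example (illustrative sketch, not part of the documented API; Widget is a
+// hypothetical class that should be neither copyable nor assignable):
+//
+// class Widget {
+// public:
+// Widget() {}
+// private:
+// GTEST_DISALLOW_COPY_AND_ASSIGN_(Widget);
+// };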
+
+// Tell the compiler to warn about unused return values for functions declared
+// with this macro. The macro should be used on function declarations
+// following the argument list:
+//
+// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
+#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC)
+# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#else
+# define GTEST_MUST_USE_RESULT_
+#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC
+
+// The MS C++ compiler emits a warning when a conditional expression is a
+// compile-time constant. In some contexts this warning is a false positive and
+// needs to be suppressed. Use the following two macros in such cases:
+//
+// GTEST_INTENTIONAL_CONST_COND_PUSH_()
+// while (true) {
+// GTEST_INTENTIONAL_CONST_COND_POP_()
+// }
+# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+# define GTEST_INTENTIONAL_CONST_COND_POP_() \
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+// Determine whether the compiler supports Microsoft's Structured Exception
+// Handling. This is supported by several Windows compilers but generally
+// does not exist on any other system.
+#ifndef GTEST_HAS_SEH
+// The user didn't tell us, so we need to figure it out.
+
+# if defined(_MSC_VER) || defined(__BORLANDC__)
+// These two compilers are known to support SEH.
+# define GTEST_HAS_SEH 1
+# else
+// Assume no SEH.
+# define GTEST_HAS_SEH 0
+# endif
+
+#define GTEST_IS_THREADSAFE \
+ (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ \
+ || (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) \
+ || GTEST_HAS_PTHREAD)
+
+#endif // GTEST_HAS_SEH
+
+#ifdef _MSC_VER
+# if GTEST_LINKED_AS_SHARED_LIBRARY
+# define GTEST_API_ __declspec(dllimport)
+# elif GTEST_CREATE_SHARED_LIBRARY
+# define GTEST_API_ __declspec(dllexport)
+# endif
+#elif __GNUC__ >= 4 || defined(__clang__)
+# define GTEST_API_ __attribute__((visibility ("default")))
+#endif // _MSC_VER
+
+#ifndef GTEST_API_
+# define GTEST_API_
+#endif
+
+#ifdef __GNUC__
+// Ask the compiler to never inline a given function.
+# define GTEST_NO_INLINE_ __attribute__((noinline))
+#else
+# define GTEST_NO_INLINE_
+#endif
+
+// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
+#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION)
+# define GTEST_HAS_CXXABI_H_ 1
+#else
+# define GTEST_HAS_CXXABI_H_ 0
+#endif
+
+// A function level attribute to disable checking for use of uninitialized
+// memory when built with MemorySanitizer.
+#if defined(__clang__)
+# if __has_feature(memory_sanitizer)
+# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
+ __attribute__((no_sanitize_memory))
+# else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+# endif // __has_feature(memory_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif // __clang__
+
+// A function level attribute to disable AddressSanitizer instrumentation.
+#if defined(__clang__)
+# if __has_feature(address_sanitizer)
+# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+ __attribute__((no_sanitize_address))
+# else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+# endif // __has_feature(address_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif // __clang__
+
+// A function level attribute to disable ThreadSanitizer instrumentation.
+#if defined(__clang__)
+# if __has_feature(thread_sanitizer)
+# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
+ __attribute__((no_sanitize_thread))
+# else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+# endif // __has_feature(thread_sanitizer)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif // __clang__
+
+// A function level attribute to disable UndefinedBehaviorSanitizer's (defined)
+// unsigned integer overflow instrumentation.
+#if defined(__clang__)
+# if defined(__has_attribute) && __has_attribute(no_sanitize)
+# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \
+ __attribute__((no_sanitize("unsigned-integer-overflow")))
+# else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
+# endif // defined(__has_attribute) && __has_attribute(no_sanitize)
+#else
+# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
+#endif // __clang__
+
+namespace testing {
+
+class Message;
+
+#if defined(GTEST_TUPLE_NAMESPACE_)
+// Import tuple and friends into the ::testing namespace.
+// It is part of our interface, having them in ::testing allows us to change
+// their types as needed.
+using GTEST_TUPLE_NAMESPACE_::get;
+using GTEST_TUPLE_NAMESPACE_::make_tuple;
+using GTEST_TUPLE_NAMESPACE_::tuple;
+using GTEST_TUPLE_NAMESPACE_::tuple_size;
+using GTEST_TUPLE_NAMESPACE_::tuple_element;
+#endif // defined(GTEST_TUPLE_NAMESPACE_)
+
+namespace internal {
+
+// A secret type that Google Test users don't know about. It has no
+// definition on purpose. Therefore it's impossible to create a
+// Secret object, which is what we want.
+class Secret;
+
+// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time
+// expression is true. For example, you could use it to verify the
+// size of a static array:
+//
+// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
+// names_incorrect_size);
+//
+// or to make sure a struct is smaller than a certain size:
+//
+// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large);
+//
+// The second argument to the macro is the name of the variable. If
+// the expression is false, most compilers will issue a warning/error
+// containing the name of the variable.
+
+#if GTEST_LANG_CXX11
+# define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
+#else // !GTEST_LANG_CXX11
+template <bool>
+ struct CompileAssert {
+};
+
+# define GTEST_COMPILE_ASSERT_(expr, msg) \
+ typedef ::testing::internal::CompileAssert<(static_cast<bool>(expr))> \
+ msg[static_cast<bool>(expr) ? 1 : -1] GTEST_ATTRIBUTE_UNUSED_
+#endif // !GTEST_LANG_CXX11
+
+// Implementation details of GTEST_COMPILE_ASSERT_:
+//
+// (In C++11, we simply use static_assert instead of the following)
+//
+// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1
+// elements (and thus is invalid) when the expression is false.
+//
+// - The simpler definition
+//
+// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1]
+//
+// does not work, as gcc supports variable-length arrays whose sizes
+// are determined at run-time (this is gcc's extension and not part
+// of the C++ standard). As a result, gcc fails to reject the
+// following code with the simple definition:
+//
+// int foo;
+// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is
+// // not a compile-time constant.
+//
+// - By using the type CompileAssert<(bool(expr))>, we ensure that
+// expr is a compile-time constant. (Template arguments must be
+// determined at compile-time.)
+//
+// - The outer parentheses in CompileAssert<(bool(expr))> are necessary
+// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written
+//
+// CompileAssert<bool(expr)>
+//
+// instead, these compilers would refuse to compile
+//
+// GTEST_COMPILE_ASSERT_(5 > 0, some_message);
+//
+// (They seem to think the ">" in "5 > 0" marks the end of the
+// template argument list.)
+//
+// - The array size is (bool(expr) ? 1 : -1), instead of simply
+//
+// ((expr) ? 1 : -1).
+//
+// This is to avoid running into a bug in MS VC 7.1, which
+// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1.
+
+// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h.
+//
+// This template is declared, but intentionally undefined.
+template <typename T1, typename T2>
+struct StaticAssertTypeEqHelper;
+
+template <typename T>
+struct StaticAssertTypeEqHelper<T, T> {
+ enum { value = true };
+};
+
+// Evaluates to the number of elements in 'array'.
+#define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0]))
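+
+// Example (illustrative; kPrimes is a hypothetical array):
+//
+// const int kPrimes[] = { 2, 3, 5, 7 };
+// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(kPrimes) == 4, kPrimes_has_4_elements);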
+
+#if GTEST_HAS_GLOBAL_STRING
+typedef ::string string;
+#else
+typedef ::std::string string;
+#endif // GTEST_HAS_GLOBAL_STRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+typedef ::wstring wstring;
+#elif GTEST_HAS_STD_WSTRING
+typedef ::std::wstring wstring;
+#endif // GTEST_HAS_GLOBAL_WSTRING
+
+// A helper for suppressing warnings on constant condition. It just
+// returns 'condition'.
+GTEST_API_ bool IsTrue(bool condition);
+
+// Defines scoped_ptr.
+
+// This implementation of scoped_ptr is PARTIAL - it only contains
+// enough stuff to satisfy Google Test's need.
+template <typename T>
+class scoped_ptr {
+ public:
+ typedef T element_type;
+
+ explicit scoped_ptr(T* p = NULL) : ptr_(p) {}
+ ~scoped_ptr() { reset(); }
+
+ T& operator*() const { return *ptr_; }
+ T* operator->() const { return ptr_; }
+ T* get() const { return ptr_; }
+
+ T* release() {
+ T* const ptr = ptr_;
+ ptr_ = NULL;
+ return ptr;
+ }
+
+ void reset(T* p = NULL) {
+ if (p != ptr_) {
+ if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type.
+ delete ptr_;
+ }
+ ptr_ = p;
+ }
+ }
+
+ friend void swap(scoped_ptr& a, scoped_ptr& b) {
+ using std::swap;
+ swap(a.ptr_, b.ptr_);
+ }
+
+ private:
+ T* ptr_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr);
+};
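+
+// Example (illustrative; Foo is a hypothetical default-constructible class
+// with a DoSomething() method):
+//
+// {
+// scoped_ptr<Foo> foo(new Foo);
+// foo->DoSomething();
+// } // *foo is deleted automatically when the scope ends.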
+
+// Defines RE.
+
+// A simple C++ wrapper for <regex.h>. It uses the POSIX Extended
+// Regular Expression syntax.
+class GTEST_API_ RE {
+ public:
+ // A copy constructor is required by the Standard to initialize object
+ // references from r-values.
+ RE(const RE& other) { Init(other.pattern()); }
+
+ // Constructs an RE from a string.
+ RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT
+
+#if GTEST_HAS_GLOBAL_STRING
+
+ RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT
+
+#endif // GTEST_HAS_GLOBAL_STRING
+
+ RE(const char* regex) { Init(regex); } // NOLINT
+ ~RE();
+
+ // Returns the string representation of the regex.
+ const char* pattern() const { return pattern_; }
+
+ // FullMatch(str, re) returns true iff regular expression re matches
+ // the entire str.
+ // PartialMatch(str, re) returns true iff regular expression re
+ // matches a substring of str (including str itself).
+ //
+ // TODO(wan@google.com): make FullMatch() and PartialMatch() work
+ // when str contains NUL characters.
+ static bool FullMatch(const ::std::string& str, const RE& re) {
+ return FullMatch(str.c_str(), re);
+ }
+ static bool PartialMatch(const ::std::string& str, const RE& re) {
+ return PartialMatch(str.c_str(), re);
+ }
+
+#if GTEST_HAS_GLOBAL_STRING
+
+ static bool FullMatch(const ::string& str, const RE& re) {
+ return FullMatch(str.c_str(), re);
+ }
+ static bool PartialMatch(const ::string& str, const RE& re) {
+ return PartialMatch(str.c_str(), re);
+ }
+
+#endif // GTEST_HAS_GLOBAL_STRING
+
+ static bool FullMatch(const char* str, const RE& re);
+ static bool PartialMatch(const char* str, const RE& re);
+
+ private:
+ void Init(const char* regex);
+
+ // We use a const char* instead of an std::string, as Google Test used to be
+ // used where std::string is not available. TODO(wan@google.com): change to
+ // std::string.
+ const char* pattern_;
+ bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+ regex_t full_regex_; // For FullMatch().
+ regex_t partial_regex_; // For PartialMatch().
+
+#else // GTEST_USES_SIMPLE_RE
+
+ const char* full_pattern_; // For FullMatch();
+
+#endif
+
+ GTEST_DISALLOW_ASSIGN_(RE);
+};
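+
+// Example (illustrative):
+//
+// const RE re("a.*z");
+// RE::FullMatch("abcz", re); // true - the whole string matches.
+// RE::PartialMatch("xx_abcz_xx", re); // true - a substring matches.
+// RE::FullMatch("abc", re); // false.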
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+ int line);
+
+// Defines logging utilities:
+// GTEST_LOG_(severity) - logs messages at the specified severity level. The
+// message itself is streamed into the macro.
+// LogToStderr() - directs all log messages to stderr.
+// FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity {
+ GTEST_INFO,
+ GTEST_WARNING,
+ GTEST_ERROR,
+ GTEST_FATAL
+};
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+ GTestLog(GTestLogSeverity severity, const char* file, int line);
+
+ // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+ ~GTestLog();
+
+ ::std::ostream& GetStream() { return ::std::cerr; }
+
+ private:
+ const GTestLogSeverity severity_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#if !defined(GTEST_LOG_)
+
+# define GTEST_LOG_(severity) \
+ ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+ __FILE__, __LINE__).GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(NULL); }
+
+#endif // !defined(GTEST_LOG_)
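+
+// Example (illustrative):
+//
+// GTEST_LOG_(WARNING) << "Temp directory is not writable; using cwd instead.";
+//
+// GTEST_LOG_(FATAL) additionally aborts the program once the message has been
+// flushed.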
+
+#if !defined(GTEST_CHECK_)
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+// Synopsis:
+// GTEST_CHECK_(boolean_condition);
+// or
+// GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+// This checks the condition and, if the condition is not satisfied, prints a
+// message about the condition violation, including the condition itself plus
+// any additional message streamed into it, and then aborts the program. It
+// aborts regardless of whether the program is built in debug mode or not.
+# define GTEST_CHECK_(condition) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::IsTrue(condition)) \
+ ; \
+ else \
+ GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#endif // !defined(GTEST_CHECK_)
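+
+// Example (illustrative; 'fd' is a hypothetical file descriptor):
+//
+// GTEST_CHECK_(fd >= 0) << "Failed to open the input file.";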
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success). Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+ if (const int gtest_error = (posix_call)) \
+ GTEST_LOG_(FATAL) << #posix_call << " failed with error " \
+ << gtest_error
+
+#if GTEST_HAS_STD_MOVE_
+using std::move;
+#else // GTEST_HAS_STD_MOVE_
+template <typename T>
+const T& move(const T& t) {
+ return t;
+}
+#endif // GTEST_HAS_STD_MOVE_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*). When you use ImplicitCast_, the compiler checks that
+// the cast is safe. Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+// ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late. It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To>
+inline To ImplicitCast_(To x) { return x; }
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed. When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo? It
+// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus,
+// when you downcast, you should use this macro. In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not). In normal mode, we do the efficient static_cast<>
+// instead. Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+// This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (e.g., code like this:
+// if (dynamic_cast<Subclass1>(foo)) HandleASubclass1Object(foo);
+// if (dynamic_cast<Subclass2>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template<typename To, typename From> // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) { // so we only accept pointers
+ // Ensures that To is a sub-type of From *. This test is here only
+ // for compile-time type checking, and has no overhead in an
+ // optimized build at run-time, as it will be optimized away
+ // completely.
+ GTEST_INTENTIONAL_CONST_COND_PUSH_()
+ if (false) {
+ GTEST_INTENTIONAL_CONST_COND_POP_()
+ const To to = NULL;
+ ::testing::internal::ImplicitCast_<From*>(to);
+ }
+
+#if GTEST_HAS_RTTI
+ // RTTI: debug mode only!
+ GTEST_CHECK_(f == NULL || dynamic_cast<To>(f) != NULL);
+#endif
+ return static_cast<To>(f);
+}
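+
+// Example (illustrative; Base and Derived are hypothetical types where
+// Derived publicly inherits from Base):
+//
+// Derived d;
+// Base* b = ImplicitCast_<Base*>(&d); // Upcast - always safe.
+// Derived* d2 = DownCast_<Derived*>(b); // Downcast - verified with
+// // dynamic_cast when RTTI is on.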
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived* CheckedDowncastToActualType(Base* base) {
+#if GTEST_HAS_RTTI
+ GTEST_CHECK_(typeid(*base) == typeid(Derived));
+#endif
+
+#if GTEST_HAS_DOWNCAST_
+ return ::down_cast<Derived*>(base);
+#elif GTEST_HAS_RTTI
+ return dynamic_cast<Derived*>(base); // NOLINT
+#else
+ return static_cast<Derived*>(base); // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stderr capturer:
+// CaptureStdout - starts capturing stdout.
+// GetCapturedStdout - stops capturing stdout and returns the captured string.
+// CaptureStderr - starts capturing stderr.
+// GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif // GTEST_HAS_STREAM_REDIRECTION
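+
+// Example (illustrative):
+//
+// CaptureStdout();
+// printf("hello\n");
+// const std::string output = GetCapturedStdout(); // output == "hello\n"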
+
+// Returns a path to temporary directory.
+GTEST_API_ std::string TempDir();
+
+// Returns the size (in bytes) of a file.
+GTEST_API_ size_t GetFileSize(FILE* file);
+
+// Reads the entire content of a file as a string.
+GTEST_API_ std::string ReadEntireFile(FILE* file);
+
+// All command line arguments.
+GTEST_API_ const ::std::vector<testing::internal::string>& GetArgvs();
+
+#if GTEST_HAS_DEATH_TEST
+
+const ::std::vector<testing::internal::string>& GetInjectableArgvs();
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>*
+ new_argvs);
+
+
+#endif // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+#if GTEST_IS_THREADSAFE
+# if GTEST_HAS_PTHREAD
+// Sleeps for (roughly) n milliseconds. This function is only for testing
+// Google Test's own constructs. Don't use it in user tests, either
+// directly or indirectly.
+inline void SleepMilliseconds(int n) {
+ const timespec time = {
+ 0, // 0 seconds.
+ n * 1000L * 1000L, // And n ms.
+ };
+ nanosleep(&time, NULL);
+}
+# endif // GTEST_HAS_PTHREAD
+
+# if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+# elif GTEST_HAS_PTHREAD
+// Allows a controller thread to pause execution of newly created
+// threads until notified. Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class Notification {
+ public:
+ Notification() : notified_(false) {
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+ }
+ ~Notification() {
+ pthread_mutex_destroy(&mutex_);
+ }
+
+ // Notifies all threads created with this notification to start. Must
+ // be called from the controller thread.
+ void Notify() {
+ pthread_mutex_lock(&mutex_);
+ notified_ = true;
+ pthread_mutex_unlock(&mutex_);
+ }
+
+ // Blocks until the controller thread notifies. Must be called from a test
+ // thread.
+ void WaitForNotification() {
+ for (;;) {
+ pthread_mutex_lock(&mutex_);
+ const bool notified = notified_;
+ pthread_mutex_unlock(&mutex_);
+ if (notified)
+ break;
+ SleepMilliseconds(10);
+ }
+ }
+
+ private:
+ pthread_mutex_t mutex_;
+ bool notified_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+GTEST_API_ void SleepMilliseconds(int n);
+
+// Provides leak-safe Windows kernel handle ownership.
+// Used in death tests and in threading support.
+class GTEST_API_ AutoHandle {
+ public:
+ // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to
+ // avoid including <windows.h> in this header file. Including <windows.h> is
+ // undesirable because it defines a lot of symbols and macros that tend to
+ // conflict with client code. This assumption is verified by
+ // WindowsTypesTest.HANDLEIsVoidStar.
+ typedef void* Handle;
+ AutoHandle();
+ explicit AutoHandle(Handle handle);
+
+ ~AutoHandle();
+
+ Handle Get() const;
+ void Reset();
+ void Reset(Handle handle);
+
+ private:
+ // Returns true iff the handle is a valid handle object that can be closed.
+ bool IsCloseable() const;
+
+ Handle handle_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+
+// Allows a controller thread to pause execution of newly created
+// threads until notified. Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
+class GTEST_API_ Notification {
+ public:
+ Notification();
+ void Notify();
+ void WaitForNotification();
+
+ private:
+ AutoHandle event_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+};
+# endif // GTEST_HAS_NOTIFICATION_
+
+// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
+// defined, but we don't want to use MinGW's pthreads implementation, which
+// has conformance problems with some versions of the POSIX standard.
+# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
+// Consequently, it cannot select a correct instantiation of ThreadWithParam
+// in order to call its Run(). Introducing ThreadWithParamBase as a
+// non-templated base class for ThreadWithParam allows us to bypass this
+// problem.
+class ThreadWithParamBase {
+ public:
+ virtual ~ThreadWithParamBase() {}
+ virtual void Run() = 0;
+};
+
+// pthread_create() accepts a pointer to a function type with the C linkage.
+// According to the Standard (7.5/1), function types with different linkages
+// are different even if they are otherwise identical. Some compilers (for
+// example, SunStudio) treat them as different types. Since class methods
+// cannot be defined with C-linkage we need to define a free C-function to
+// pass into pthread_create().
+extern "C" inline void* ThreadFuncWithCLinkage(void* thread) {
+ static_cast<ThreadWithParamBase*>(thread)->Run();
+ return NULL;
+}
+
+// Helper class for testing Google Test's multi-threading constructs.
+// To use it, write:
+//
+// void ThreadFunc(int param) { /* Do things with param */ }
+// Notification thread_can_start;
+// ...
+// // The thread_can_start parameter is optional; you can supply NULL.
+// ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);
+// thread_can_start.Notify();
+//
+// These classes are only for testing Google Test's own constructs. Do
+// not use them in user tests, either directly or indirectly.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+ typedef void UserThreadFunc(T);
+
+ ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+ : func_(func),
+ param_(param),
+ thread_can_start_(thread_can_start),
+ finished_(false) {
+ ThreadWithParamBase* const base = this;
+ // The thread can be created only after all fields except thread_
+ // have been initialized.
+ GTEST_CHECK_POSIX_SUCCESS_(
+ pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base));
+ }
+ ~ThreadWithParam() { Join(); }
+
+ void Join() {
+ if (!finished_) {
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0));
+ finished_ = true;
+ }
+ }
+
+ virtual void Run() {
+ if (thread_can_start_ != NULL)
+ thread_can_start_->WaitForNotification();
+ func_(param_);
+ }
+
+ private:
+ UserThreadFunc* const func_; // User-supplied thread function.
+ const T param_; // User-supplied parameter to the thread function.
+ // When non-NULL, used to block execution until the controller thread
+ // notifies.
+ Notification* const thread_can_start_;
+ bool finished_; // true iff we know that the thread function has finished.
+ pthread_t thread_; // The native thread object.
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+# endif // GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+
+# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+// Mutex and ThreadLocal have already been imported into the namespace.
+// Nothing to do here.
+
+# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Mutex implements mutex on Windows platforms. It is used in conjunction
+// with class MutexLock:
+//
+// Mutex mutex;
+// ...
+// MutexLock lock(&mutex); // Acquires the mutex and releases it at the
+// // end of the current scope.
+//
+// A static Mutex *must* be defined or declared using one of the following
+// macros:
+// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex);
+//
+// (A non-static Mutex is defined/declared in the usual way).
+class GTEST_API_ Mutex {
+ public:
+ enum MutexType { kStatic = 0, kDynamic = 1 };
+ // We rely on kStaticMutex being 0, as that is the value the linker
+ // initializes type_ to in static mutexes. critical_section_ will be
+ // initialized lazily in ThreadSafeLazyInit().
+ enum StaticConstructorSelector { kStaticMutex = 0 };
+
+ // This constructor intentionally does nothing. It relies on type_ being
+ // statically initialized to 0 (effectively setting it to kStatic) and on
+ // ThreadSafeLazyInit() to lazily initialize the rest of the members.
+ explicit Mutex(StaticConstructorSelector /*dummy*/) {}
+
+ Mutex();
+ ~Mutex();
+
+ void Lock();
+
+ void Unlock();
+
+ // Does nothing if the current thread holds the mutex. Otherwise, crashes
+ // with high probability.
+ void AssertHeld();
+
+ private:
+ // Initializes owner_thread_id_ and critical_section_ in static mutexes.
+ void ThreadSafeLazyInit();
+
+ // Per http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx,
+ // we assume that 0 is an invalid value for thread IDs.
+ unsigned int owner_thread_id_;
+
+ // For static mutexes, we rely on these members being initialized to zeros
+ // by the linker.
+ MutexType type_;
+ long critical_section_init_phase_; // NOLINT
+ _RTL_CRITICAL_SECTION* critical_section_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+ ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)". Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+ explicit GTestMutexLock(Mutex* mutex)
+ : mutex_(mutex) { mutex_->Lock(); }
+
+ ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+ Mutex* const mutex_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
+
+// Base class for ValueHolder<T>. Allows a caller to hold and delete a value
+// without knowing its type.
+class ThreadLocalValueHolderBase {
+ public:
+ virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Provides a way for a thread to send notifications to a ThreadLocal
+// regardless of its parameter type.
+class ThreadLocalBase {
+ public:
+ // Creates a new ValueHolder<T> object holding a default value passed to
+ // this ThreadLocal<T>'s constructor and returns it. It is the caller's
+ // responsibility not to call this when the ThreadLocal<T> instance already
+ // has a value on the current thread.
+ virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const = 0;
+
+ protected:
+ ThreadLocalBase() {}
+ virtual ~ThreadLocalBase() {}
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase);
+};
+
+// Maps a thread to a set of ThreadLocals that have values instantiated on that
+// thread and notifies them when the thread exits. A ThreadLocal instance is
+// expected to persist until all threads it has values on have terminated.
+class GTEST_API_ ThreadLocalRegistry {
+ public:
+ // Registers thread_local_instance as having value on the current thread.
+ // Returns a value that can be used to identify the thread from other threads.
+ static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+ const ThreadLocalBase* thread_local_instance);
+
+ // Invoked when a ThreadLocal instance is destroyed.
+ static void OnThreadLocalDestroyed(
+ const ThreadLocalBase* thread_local_instance);
+};
+
+class GTEST_API_ ThreadWithParamBase {
+ public:
+ void Join();
+
+ protected:
+ class Runnable {
+ public:
+ virtual ~Runnable() {}
+ virtual void Run() = 0;
+ };
+
+ ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
+ virtual ~ThreadWithParamBase();
+
+ private:
+ AutoHandle thread_;
+};
+
+// Helper class for testing Google Test's multi-threading constructs.
+template <typename T>
+class ThreadWithParam : public ThreadWithParamBase {
+ public:
+ typedef void UserThreadFunc(T);
+
+ ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
+ : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
+ }
+ virtual ~ThreadWithParam() {}
+
+ private:
+ class RunnableImpl : public Runnable {
+ public:
+ RunnableImpl(UserThreadFunc* func, T param)
+ : func_(func),
+ param_(param) {
+ }
+ virtual ~RunnableImpl() {}
+ virtual void Run() {
+ func_(param_);
+ }
+
+ private:
+ UserThreadFunc* const func_;
+ const T param_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
+ };
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+};
+
+// Implements thread-local storage on Windows systems.
+//
+// // Thread 1
+// ThreadLocal<int> tl(100); // 100 is the default value for each thread.
+//
+// // Thread 2
+// tl.set(150); // Changes the value for thread 2 only.
+// EXPECT_EQ(150, tl.get());
+//
+// // Thread 1
+// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value.
+// tl.set(200);
+// EXPECT_EQ(200, tl.get());
+//
+// The template type argument T must have a public copy constructor.
+// In addition, the default ThreadLocal constructor requires T to have
+// a public default constructor.
+//
+// The users of a ThreadLocal instance have to make sure that all but one
+// threads (including the main one) using that instance have exited before
+// destroying it. Otherwise, the per-thread objects managed for them by the
+// ThreadLocal instance are not guaranteed to be destroyed on all platforms.
+//
+// Google Test only uses global ThreadLocal objects. That means they
+// will die after main() has returned. Therefore, no per-thread
+// object managed by Google Test will be leaked as long as all threads
+// using Google Test have exited when main() returns.
+template <typename T>
+class ThreadLocal : public ThreadLocalBase {
+ public:
+ ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {}
+ explicit ThreadLocal(const T& value)
+ : default_factory_(new InstanceValueHolderFactory(value)) {}
+
+ ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+
+ T* pointer() { return GetOrCreateValue(); }
+ const T* pointer() const { return GetOrCreateValue(); }
+ const T& get() const { return *pointer(); }
+ void set(const T& value) { *pointer() = value; }
+
+ private:
+ // Holds a value of T. Can be deleted via its base class without the caller
+ // knowing the type of T.
+ class ValueHolder : public ThreadLocalValueHolderBase {
+ public:
+ ValueHolder() : value_() {}
+ explicit ValueHolder(const T& value) : value_(value) {}
+
+ T* pointer() { return &value_; }
+
+ private:
+ T value_;
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+ };
+
+
+ T* GetOrCreateValue() const {
+ return static_cast<ValueHolder*>(
+ ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
+ }
+
+ virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
+ return default_factory_->MakeNewHolder();
+ }
+
+ class ValueHolderFactory {
+ public:
+ ValueHolderFactory() {}
+ virtual ~ValueHolderFactory() {}
+ virtual ValueHolder* MakeNewHolder() const = 0;
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+ };
+
+ class DefaultValueHolderFactory : public ValueHolderFactory {
+ public:
+ DefaultValueHolderFactory() {}
+ virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+ };
+
+ class InstanceValueHolderFactory : public ValueHolderFactory {
+ public:
+ explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+ virtual ValueHolder* MakeNewHolder() const {
+ return new ValueHolder(value_);
+ }
+
+ private:
+ const T value_; // The value for each thread.
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+ };
+
+ scoped_ptr<ValueHolderFactory> default_factory_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# elif GTEST_HAS_PTHREAD
+
+// MutexBase and Mutex implement mutex on pthreads-based platforms.
+class MutexBase {
+ public:
+ // Acquires this mutex.
+ void Lock() {
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_));
+ owner_ = pthread_self();
+ has_owner_ = true;
+ }
+
+ // Releases this mutex.
+ void Unlock() {
+ // Since the lock is being released the owner_ field should no longer be
+ // considered valid. We don't protect writing to has_owner_ here, as it's
+ // the caller's responsibility to ensure that the current thread holds the
+ // mutex when this is called.
+ has_owner_ = false;
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_));
+ }
+
+ // Does nothing if the current thread holds the mutex. Otherwise, crashes
+ // with high probability.
+ void AssertHeld() const {
+ GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self()))
+ << "The current thread is not holding the mutex @" << this;
+ }
+
+ // A static mutex may be used before main() is entered. It may even
+ // be used before the dynamic initialization stage. Therefore we
+ // must be able to initialize a static mutex object at link time.
+ // This means MutexBase has to be a POD and its member variables
+ // have to be public.
+ public:
+ pthread_mutex_t mutex_; // The underlying pthread mutex.
+ // has_owner_ indicates whether the owner_ field below contains a valid thread
+ // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All
+ // accesses to the owner_ field should be protected by a check of this field.
+ // An alternative might be to memset() owner_ to all zeros, but there's no
+ // guarantee that a zero'd pthread_t is necessarily invalid or even different
+ // from pthread_self().
+ bool has_owner_;
+ pthread_t owner_; // The thread holding the mutex.
+};
+
+// Forward-declares a static mutex.
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::MutexBase mutex
+
+// Defines and statically (i.e. at link time) initializes a static mutex.
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+ ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, pthread_t() }
+
+// The Mutex class can only be used for mutexes created at runtime. It
+// shares its API with MutexBase otherwise.
+class Mutex : public MutexBase {
+ public:
+ Mutex() {
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL));
+ has_owner_ = false;
+ }
+ ~Mutex() {
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
+ }
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+};
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)". Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+ explicit GTestMutexLock(MutexBase* mutex)
+ : mutex_(mutex) { mutex_->Lock(); }
+
+ ~GTestMutexLock() { mutex_->Unlock(); }
+
+ private:
+ MutexBase* const mutex_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+};
+
+typedef GTestMutexLock MutexLock;
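+
+// Example (illustrative; g_count_mutex and g_count are hypothetical globals):
+//
+// GTEST_DEFINE_STATIC_MUTEX_(g_count_mutex);
+// int g_count = 0;
+//
+// void Increment() {
+// MutexLock lock(&g_count_mutex); // Released when 'lock' goes out of scope.
+// ++g_count;
+// }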
+
+// Helpers for ThreadLocal.
+
+// pthread_key_create() requires DeleteThreadLocalValue() to have
+// C-linkage. Therefore it cannot be templatized to access
+// ThreadLocal<T>. Hence the need for class
+// ThreadLocalValueHolderBase.
+class ThreadLocalValueHolderBase {
+ public:
+ virtual ~ThreadLocalValueHolderBase() {}
+};
+
+// Called by pthread to delete thread-local data stored by
+// pthread_setspecific().
+extern "C" inline void DeleteThreadLocalValue(void* value_holder) {
+ delete static_cast<ThreadLocalValueHolderBase*>(value_holder);
+}
+
+// Implements thread-local storage on pthreads-based systems.
+template <typename T>
+class ThreadLocal {
+ public:
+ ThreadLocal()
+ : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {}
+ explicit ThreadLocal(const T& value)
+ : key_(CreateKey()),
+ default_factory_(new InstanceValueHolderFactory(value)) {}
+
+ ~ThreadLocal() {
+ // Destroys the managed object for the current thread, if any.
+ DeleteThreadLocalValue(pthread_getspecific(key_));
+
+ // Releases resources associated with the key. This will *not*
+ // delete managed objects for other threads.
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_));
+ }
+
+ T* pointer() { return GetOrCreateValue(); }
+ const T* pointer() const { return GetOrCreateValue(); }
+ const T& get() const { return *pointer(); }
+ void set(const T& value) { *pointer() = value; }
+
+ private:
+ // Holds a value of type T.
+ class ValueHolder : public ThreadLocalValueHolderBase {
+ public:
+ ValueHolder() : value_() {}
+ explicit ValueHolder(const T& value) : value_(value) {}
+
+ T* pointer() { return &value_; }
+
+ private:
+ T value_;
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+ };
+
+ static pthread_key_t CreateKey() {
+ pthread_key_t key;
+ // When a thread exits, DeleteThreadLocalValue() will be called on
+ // the object managed for that thread.
+ GTEST_CHECK_POSIX_SUCCESS_(
+ pthread_key_create(&key, &DeleteThreadLocalValue));
+ return key;
+ }
+
+ T* GetOrCreateValue() const {
+ ThreadLocalValueHolderBase* const holder =
+ static_cast<ThreadLocalValueHolderBase*>(pthread_getspecific(key_));
+ if (holder != NULL) {
+ return CheckedDowncastToActualType<ValueHolder>(holder)->pointer();
+ }
+
+ ValueHolder* const new_holder = default_factory_->MakeNewHolder();
+ ThreadLocalValueHolderBase* const holder_base = new_holder;
+ GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base));
+ return new_holder->pointer();
+ }
+
+ class ValueHolderFactory {
+ public:
+ ValueHolderFactory() {}
+ virtual ~ValueHolderFactory() {}
+ virtual ValueHolder* MakeNewHolder() const = 0;
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+ };
+
+ class DefaultValueHolderFactory : public ValueHolderFactory {
+ public:
+ DefaultValueHolderFactory() {}
+ virtual ValueHolder* MakeNewHolder() const { return new ValueHolder(); }
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+ };
+
+ class InstanceValueHolderFactory : public ValueHolderFactory {
+ public:
+ explicit InstanceValueHolderFactory(const T& value) : value_(value) {}
+ virtual ValueHolder* MakeNewHolder() const {
+ return new ValueHolder(value_);
+ }
+
+ private:
+ const T value_; // The value for each thread.
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+ };
+
+ // A key pthreads uses for looking up per-thread values.
+ const pthread_key_t key_;
+ scoped_ptr<ValueHolderFactory> default_factory_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+};
+
+# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+
+#else // GTEST_IS_THREADSAFE
+
+// A dummy implementation of synchronization primitives (mutex, lock,
+// and thread-local variable). Necessary for compiling Google Test where
+// mutex is not supported - using Google Test in multiple threads is not
+// supported on such platforms.
+
+class Mutex {
+ public:
+ Mutex() {}
+ void Lock() {}
+ void Unlock() {}
+ void AssertHeld() const {}
+};
+
+# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::Mutex mutex
+
+# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+
+// We cannot name this class MutexLock because the ctor declaration would
+// conflict with a macro named MutexLock, which is defined on some
+// platforms. That macro is used as a defensive measure to prevent against
+// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than
+// "MutexLock l(&mu)". Hence the typedef trick below.
+class GTestMutexLock {
+ public:
+ explicit GTestMutexLock(Mutex*) {} // NOLINT
+};
+
+typedef GTestMutexLock MutexLock;
+
+template <typename T>
+class ThreadLocal {
+ public:
+ ThreadLocal() : value_() {}
+ explicit ThreadLocal(const T& value) : value_(value) {}
+ T* pointer() { return &value_; }
+ const T* pointer() const { return &value_; }
+ const T& get() const { return value_; }
+ void set(const T& value) { value_ = value; }
+ private:
+ T value_;
+};
+
+#endif // GTEST_IS_THREADSAFE
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+GTEST_API_ size_t GetThreadCount();
+
+// Passing non-POD classes through ellipsis (...) crashes the ARM
+// compiler and generates a warning in Sun Studio. The Nokia Symbian
+// and the IBM XL C/C++ compiler try to instantiate a copy constructor
+// for objects passed through ellipsis (...), failing for uncopyable
+// objects. We define this to ensure that only POD is passed through
+// ellipsis on these systems.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC)
+// We lose support for NULL detection where the compiler doesn't like
+// passing non-POD classes through ellipsis (...).
+# define GTEST_ELLIPSIS_NEEDS_POD_ 1
+#else
+# define GTEST_CAN_COMPARE_NULL 1
+#endif
+
+// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between
+// const T& and const T* in a function template. These compilers
+// _can_ decide between class template specializations for T and T*,
+// so a tr1::type_traits-like is_pointer works.
+#if defined(__SYMBIAN32__) || defined(__IBMCPP__)
+# define GTEST_NEEDS_IS_POINTER_ 1
+#endif
+
+template <bool bool_value>
+struct bool_constant {
+ typedef bool_constant<bool_value> type;
+ static const bool value = bool_value;
+};
+template <bool bool_value> const bool bool_constant<bool_value>::value;
+
+typedef bool_constant<false> false_type;
+typedef bool_constant<true> true_type;
+
+template <typename T>
+struct is_pointer : public false_type {};
+
+template <typename T>
+struct is_pointer<T*> : public true_type {};
+
+template <typename Iterator>
+struct IteratorTraits {
+ typedef typename Iterator::value_type value_type;
+};
+
+template <typename T>
+struct IteratorTraits<T*> {
+ typedef T value_type;
+};
+
+template <typename T>
+struct IteratorTraits<const T*> {
+ typedef T value_type;
+};
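+
+// Example (illustrative; assumes <vector> is included):
+//
+// is_pointer<int>::value; // false
+// is_pointer<int*>::value; // true
+// IteratorTraits<std::vector<int>::iterator>::value_type i = 0; // int
+// IteratorTraits<const char*>::value_type c = 'a'; // char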
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_SEP_ "\\"
+# define GTEST_HAS_ALT_PATH_SEP_ 1
+// The biggest signed integer type the compiler supports.
+typedef __int64 BiggestInt;
+#else
+# define GTEST_PATH_SEP_ "/"
+# define GTEST_HAS_ALT_PATH_SEP_ 0
+typedef long long BiggestInt; // NOLINT
+#endif // GTEST_OS_WINDOWS
+
+// Utilities for char.
+
+// isspace(int ch) and friends accept an unsigned char or EOF. char
+// may be signed, depending on the compiler (or compiler flags).
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+ return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+ return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+ return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+ return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+ return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+ return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+ return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+ const unsigned char low_byte = static_cast<unsigned char>(ch);
+ return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+ return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+ return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+inline std::string StripTrailingSpaces(std::string str) {
+ std::string::iterator it = str.end();
+ while (it != str.begin() && IsSpace(*--it))
+ it = str.erase(it);
+ return str;
+}
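+
+// Example (illustrative):
+//
+// IsDigit('7'); // true
+// ToUpper('a'); // 'A'
+// StripTrailingSpaces("abc \t"); // "abc"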
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions. These wrappers hide the differences between
+// Windows/MSVC and POSIX systems. Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+# ifdef __BORLANDC__
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+ return stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+# else // !__BORLANDC__
+# if GTEST_OS_WINDOWS_MOBILE
+inline int IsATTY(int /* fd */) { return 0; }
+# else
+inline int IsATTY(int fd) { return _isatty(fd); }
+# endif // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char* s1, const char* s2) {
+ return _stricmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return _strdup(src); }
+# endif // __BORLANDC__
+
+# if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+# else
+inline int FileNo(FILE* file) { return _fileno(file); }
+inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
+inline int RmDir(const char* dir) { return _rmdir(dir); }
+inline bool IsDir(const StatStruct& st) {
+ return (_S_IFDIR & st.st_mode) != 0;
+}
+# endif // GTEST_OS_WINDOWS_MOBILE
+
+#else
+
+typedef struct stat StatStruct;
+
+inline int FileNo(FILE* file) { return fileno(file); }
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
+inline int StrCaseCmp(const char* s1, const char* s2) {
+ return strcasecmp(s1, s2);
+}
+inline char* StrDup(const char* src) { return strdup(src); }
+inline int RmDir(const char* dir) { return rmdir(dir); }
+inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); }
+
+#endif // GTEST_OS_WINDOWS
+
+// Functions deprecated by MSVC 8.0.
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996 /* deprecated function */)
+
+inline const char* StrNCpy(char* dest, const char* src, size_t n) {
+ return strncpy(dest, src, n);
+}
+
+// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and
+// StrError() aren't needed on Windows CE at this time and thus not
+// defined there.
+
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+inline int ChDir(const char* dir) { return chdir(dir); }
+#endif
+inline FILE* FOpen(const char* path, const char* mode) {
+ return fopen(path, mode);
+}
+#if !GTEST_OS_WINDOWS_MOBILE
+inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+ return freopen(path, mode, stream);
+}
+inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
+#endif
+inline int FClose(FILE* fp) { return fclose(fp); }
+#if !GTEST_OS_WINDOWS_MOBILE
+inline int Read(int fd, void* buf, unsigned int count) {
+ return static_cast<int>(read(fd, buf, count));
+}
+inline int Write(int fd, const void* buf, unsigned int count) {
+ return static_cast<int>(write(fd, buf, count));
+}
+inline int Close(int fd) { return close(fd); }
+inline const char* StrError(int errnum) { return strerror(errnum); }
+#endif
+inline const char* GetEnv(const char* name) {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+ // We are on Windows CE, which has no environment variables.
+ static_cast<void>(name); // To prevent 'unused argument' warning.
+ return NULL;
+#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9)
+ // Environment variables which we programmatically clear will be set to the
+ // empty string rather than unset (NULL). Handle that case.
+ const char* const env = getenv(name);
+ return (env != NULL && env[0] != '\0') ? env : NULL;
+#else
+ return getenv(name);
+#endif
+}
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE has no C library. The abort() function is used in
+// several places in Google Test. This implementation provides a reasonable
+// imitation of standard behaviour.
+void Abort();
+#else
+inline void Abort() { abort(); }
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+} // namespace posix
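+
+// Example (illustrative; "notes.txt" is a hypothetical path):
+//
+// FILE* f = posix::FOpen("notes.txt", "r");
+// if (f != NULL) {
+// const std::string contents = ReadEntireFile(f);
+// posix::FClose(f);
+// }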
+
+// MSVC "deprecates" snprintf and issues warnings wherever it is used. In
+// order to avoid these warnings, we need to use _snprintf or _snprintf_s on
+// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate
+// function in order to achieve that. We use macro definition here because
+// snprintf is a variadic function.
+#if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+// MSVC 2005 and above support variadic macros.
+# define GTEST_SNPRINTF_(buffer, size, format, ...) \
+ _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#elif defined(_MSC_VER)
+// Windows CE does not define _snprintf_s and MSVC prior to 2005 doesn't
+// complain about _snprintf.
+# define GTEST_SNPRINTF_ _snprintf
+#else
+# define GTEST_SNPRINTF_ snprintf
+#endif
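+
+// Example (illustrative):
+//
+// char buf[32];
+// GTEST_SNPRINTF_(buf, sizeof(buf), "value = %d", 42);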
+
+// The maximum number a BiggestInt can represent. This definition
+// works no matter whether BiggestInt is represented in one's complement or
+// two's complement.
+//
+// We cannot rely on numeric_limits in STL, as __int64 and long long
+// are not part of standard C++ and numeric_limits doesn't need to be
+// defined for them.
+const BiggestInt kMaxBiggestInt =
+ ~(static_cast<BiggestInt>(1) << (8*sizeof(BiggestInt) - 1));
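+
+// Editorial note: for a 64-bit two's-complement BiggestInt the expression
+// above shifts 1 into the sign bit and inverts the result, yielding
+// 0x7FFFFFFFFFFFFFFF, i.e. 2^63 - 1.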
+
+// This template class serves as a compile-time function from size to
+// type. It maps a size in bytes to a primitive type with that
+// size. e.g.
+//
+// TypeWithSize<4>::UInt
+//
+// is typedef-ed to be unsigned int (unsigned integer made up of 4
+// bytes).
+//
+// Such functionality should belong to STL, but I cannot find it
+// there.
+//
+// Google Test uses this class in the implementation of floating-point
+// comparison.
+//
+// For now it only handles UInt (unsigned int) as that's all Google Test
+// needs. Other types can be easily added in the future if need
+// arises.
+template <size_t size>
+class TypeWithSize {
+ public:
+ // This prevents the user from using TypeWithSize<N> with incorrect
+ // values of N.
+ typedef void UInt;
+};
+
+// The specialization for size 4.
+template <>
+class TypeWithSize<4> {
+ public:
+ // unsigned int has size 4 in both gcc and MSVC.
+ //
+ // As base/basictypes.h doesn't compile on Windows, we cannot use
+  // uint32, uint64, etc. here.
+ typedef int Int;
+ typedef unsigned int UInt;
+};
+
+// The specialization for size 8.
+template <>
+class TypeWithSize<8> {
+ public:
+#if GTEST_OS_WINDOWS
+ typedef __int64 Int;
+ typedef unsigned __int64 UInt;
+#else
+ typedef long long Int; // NOLINT
+ typedef unsigned long long UInt; // NOLINT
+#endif // GTEST_OS_WINDOWS
+};
+
+// Integer types of known sizes.
+typedef TypeWithSize<4>::Int Int32;
+typedef TypeWithSize<4>::UInt UInt32;
+typedef TypeWithSize<8>::Int Int64;
+typedef TypeWithSize<8>::UInt UInt64;
+typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds.
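+
+// Editorial illustration (not upstream text): these aliases provide the
+// usual fixed widths, e.g.
+//
+//   UInt32 mask = 0xFFFFFFFFu;   // exactly 32 bits on both gcc and MSVC
+//   TimeInMillis start_ms = 0;   // a signed 64-bit millisecond count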
+
+// Utilities for command line flags and environment variables.
+
+// Macro for referencing flags.
+#if !defined(GTEST_FLAG)
+# define GTEST_FLAG(name) FLAGS_gtest_##name
+#endif // !defined(GTEST_FLAG)
+
+#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+
+#if !defined(GTEST_DECLARE_bool_)
+# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+// Macros for declaring flags.
+# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
+# define GTEST_DECLARE_int32_(name) \
+ GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name)
+# define GTEST_DECLARE_string_(name) \
+ GTEST_API_ extern ::std::string GTEST_FLAG(name)
+
+// Macros for defining flags.
+# define GTEST_DEFINE_bool_(name, default_val, doc) \
+ GTEST_API_ bool GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_int32_(name, default_val, doc) \
+ GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val)
+# define GTEST_DEFINE_string_(name, default_val, doc) \
+ GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+
+#endif // !defined(GTEST_DECLARE_bool_)
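+
+// Editorial illustration (hypothetical flag name, not upstream code): a flag
+// called "foo" would be wired up as
+//
+//   GTEST_DECLARE_bool_(foo);                     // in a header
+//   GTEST_DEFINE_bool_(foo, true, "doc string");  // in a .cc file
+//   if (GTEST_FLAG(foo)) { ... }                  // expands to FLAGS_gtest_foo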
+
+// Thread annotations
+#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+# define GTEST_LOCK_EXCLUDED_(locks)
+#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
+
+// Parses 'str' for a 32-bit signed integer. If successful, writes the result
+// to *value and returns true; otherwise leaves *value unchanged and returns
+// false.
+// TODO(chandlerc): Find a better way to refactor flag and environment parsing
+// out of both gtest-port.cc and gtest.cc to avoid exporting this utility
+// function.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value);
+
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given Google Test flag.
+bool BoolFromGTestEnv(const char* flag, bool default_val);
+GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val);
+std::string StringFromGTestEnv(const char* flag, const char* default_val);
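+
+// Editorial note: for a flag named "foo" (hypothetical), these helpers are
+// expected to consult the GTEST_FOO environment variable (the flag name
+// upper-cased and prefixed with "GTEST_") and to fall back to default_val
+// when it is unset, so BoolFromGTestEnv("foo", false) with GTEST_FOO=1 in
+// the environment should return true.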
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
new file mode 100644
index 000000000..97f1a7fdd
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h
@@ -0,0 +1,167 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee)
+//
+// The Google C++ Testing Framework (Google Test)
+//
+// This header file declares the String class and functions used internally by
+// Google Test. They are subject to change without notice. They should not be
+// used by code external to Google Test.
+//
+// This header file is #included by <gtest/internal/gtest-internal.h>.
+// It should not be #included by other files.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+
+#ifdef __BORLANDC__
+// string.h is not guaranteed to provide strcpy on C++ Builder.
+# include <mem.h>
+#endif
+
+#include <string.h>
+#include <string>
+
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+namespace internal {
+
+// String - an abstract class holding static string utilities.
+class GTEST_API_ String {
+ public:
+ // Static utility methods
+
+ // Clones a 0-terminated C string, allocating memory using new. The
+ // caller is responsible for deleting the return value using
+ // delete[]. Returns the cloned string, or NULL if the input is
+ // NULL.
+ //
+ // This is different from strdup() in string.h, which allocates
+ // memory using malloc().
+ static const char* CloneCString(const char* c_str);
+
+#if GTEST_OS_WINDOWS_MOBILE
+ // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be
+ // able to pass strings to Win32 APIs on CE we need to convert them
+ // to 'Unicode', UTF-16.
+
+ // Creates a UTF-16 wide string from the given ANSI string, allocating
+ // memory using new. The caller is responsible for deleting the return
+ // value using delete[]. Returns the wide string, or NULL if the
+ // input is NULL.
+ //
+ // The wide string is created using the ANSI codepage (CP_ACP) to
+ // match the behaviour of the ANSI versions of Win32 calls and the
+ // C runtime.
+ static LPCWSTR AnsiToUtf16(const char* c_str);
+
+ // Creates an ANSI string from the given wide string, allocating
+ // memory using new. The caller is responsible for deleting the return
+ // value using delete[]. Returns the ANSI string, or NULL if the
+ // input is NULL.
+ //
+ // The returned string is created using the ANSI codepage (CP_ACP) to
+ // match the behaviour of the ANSI versions of Win32 calls and the
+ // C runtime.
+ static const char* Utf16ToAnsi(LPCWSTR utf16_str);
+#endif
+
+ // Compares two C strings. Returns true iff they have the same content.
+ //
+ // Unlike strcmp(), this function can handle NULL argument(s). A
+ // NULL C string is considered different to any non-NULL C string,
+ // including the empty string.
+ static bool CStringEquals(const char* lhs, const char* rhs);
+
+ // Converts a wide C string to a String using the UTF-8 encoding.
+ // NULL will be converted to "(null)". If an error occurred during
+ // the conversion, "(failed to convert from wide string)" is
+ // returned.
+ static std::string ShowWideCString(const wchar_t* wide_c_str);
+
+ // Compares two wide C strings. Returns true iff they have the same
+ // content.
+ //
+ // Unlike wcscmp(), this function can handle NULL argument(s). A
+ // NULL C string is considered different to any non-NULL C string,
+ // including the empty string.
+ static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs);
+
+ // Compares two C strings, ignoring case. Returns true iff they
+ // have the same content.
+ //
+ // Unlike strcasecmp(), this function can handle NULL argument(s).
+ // A NULL C string is considered different to any non-NULL C string,
+ // including the empty string.
+ static bool CaseInsensitiveCStringEquals(const char* lhs,
+ const char* rhs);
+
+ // Compares two wide C strings, ignoring case. Returns true iff they
+ // have the same content.
+ //
+ // Unlike wcscasecmp(), this function can handle NULL argument(s).
+ // A NULL C string is considered different to any non-NULL wide C string,
+ // including the empty string.
+  // NB: The implementations on different platforms differ slightly.
+  // On Windows, this method uses _wcsicmp, which compares according to the
+  // LC_CTYPE environment variable. On GNU platforms it uses wcscasecmp,
+  // which compares according to the LC_CTYPE category of the current
+  // locale. On Mac OS X, it uses towlower, which also uses the LC_CTYPE
+  // category of the current locale.
+ static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+ const wchar_t* rhs);
+
+ // Returns true iff the given string ends with the given suffix, ignoring
+ // case. Any string is considered to end with an empty suffix.
+ static bool EndsWithCaseInsensitive(
+ const std::string& str, const std::string& suffix);
+
+ // Formats an int value as "%02d".
+ static std::string FormatIntWidth2(int value); // "%02d" for width == 2
+
+ // Formats an int value as "%X".
+ static std::string FormatHexInt(int value);
+
+ // Formats a byte as "%02X".
+ static std::string FormatByte(unsigned char value);
+
+ private:
+ String(); // Not meant to be instantiated.
+}; // class String
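+
+// Editorial illustration (not upstream text): the NULL-handling contract
+// documented above means, for example,
+//
+//   String::CStringEquals(NULL, NULL)  // true
+//   String::CStringEquals(NULL, "")    // false - NULL differs from ""
+//   String::CStringEquals("ab", "ab")  // true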
+
+// Gets the content of the stringstream's buffer as an std::string. Each '\0'
+// character in the buffer is replaced with "\\0".
+GTEST_API_ std::string StringStreamToString(::std::stringstream* stream);
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h
new file mode 100644
index 000000000..e9b405340
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h
@@ -0,0 +1,1020 @@
+// This file was GENERATED by command:
+// pump.py gtest-tuple.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2009 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+
+#include <utility> // For ::std::pair.
+
+// The compiler used in Symbian has a bug that prevents us from declaring the
+// tuple template as a friend (it complains that tuple is redefined). This
+// hack bypasses the bug by declaring the members that should otherwise be
+// private as public.
+// Sun Studio versions < 12 also have the above bug.
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+ template <GTEST_10_TYPENAMES_(U)> friend class tuple; \
+ private:
+#endif
+
+// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict
+// with our own definitions. Therefore using our own tuple does not work on
+// those compilers.
+#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */
+# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \
+GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers."
+#endif
+
+// GTEST_n_TUPLE_(T) is the type of an n-tuple.
+#define GTEST_0_TUPLE_(T) tuple<>
+#define GTEST_1_TUPLE_(T) tuple<T##0, void, void, void, void, void, void, \
+ void, void, void>
+#define GTEST_2_TUPLE_(T) tuple<T##0, T##1, void, void, void, void, void, \
+ void, void, void>
+#define GTEST_3_TUPLE_(T) tuple<T##0, T##1, T##2, void, void, void, void, \
+ void, void, void>
+#define GTEST_4_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, void, void, void, \
+ void, void, void>
+#define GTEST_5_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, void, void, \
+ void, void, void>
+#define GTEST_6_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, void, \
+ void, void, void>
+#define GTEST_7_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+ void, void, void>
+#define GTEST_8_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+ T##7, void, void>
+#define GTEST_9_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+ T##7, T##8, void>
+#define GTEST_10_TUPLE_(T) tuple<T##0, T##1, T##2, T##3, T##4, T##5, T##6, \
+ T##7, T##8, T##9>
+
+// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
+#define GTEST_0_TYPENAMES_(T)
+#define GTEST_1_TYPENAMES_(T) typename T##0
+#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1
+#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2
+#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+ typename T##3
+#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+ typename T##3, typename T##4
+#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+ typename T##3, typename T##4, typename T##5
+#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+ typename T##3, typename T##4, typename T##5, typename T##6
+#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+ typename T##3, typename T##4, typename T##5, typename T##6, typename T##7
+#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+ typename T##3, typename T##4, typename T##5, typename T##6, \
+ typename T##7, typename T##8
+#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \
+ typename T##3, typename T##4, typename T##5, typename T##6, \
+ typename T##7, typename T##8, typename T##9
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior. We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+ typename T3 = void, typename T4 = void, typename T5 = void,
+ typename T6 = void, typename T7 = void, typename T8 = void,
+ typename T9 = void>
+class tuple;
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
+template <typename T>
+struct ByRef { typedef const T& type; }; // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; }; // NOLINT
+
+// A handy wrapper for ByRef.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T if T is a reference; otherwise it's T&. This
+// is the same as tr1::add_reference<T>::type.
+template <typename T>
+struct AddRef { typedef T& type; }; // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; }; // NOLINT
+
+// A handy wrapper for AddRef.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
+
+// A helper for implementing get<k>().
+template <int k> class Get;
+
+// A helper for implementing tuple_element<k, T>. kIndexValid is true
+// iff k < the number of fields in tuple type T.
+template <bool kIndexValid, int kIndex, class Tuple>
+struct TupleElement;
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 0, GTEST_10_TUPLE_(T) > {
+ typedef T0 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 1, GTEST_10_TUPLE_(T) > {
+ typedef T1 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 2, GTEST_10_TUPLE_(T) > {
+ typedef T2 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 3, GTEST_10_TUPLE_(T) > {
+ typedef T3 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 4, GTEST_10_TUPLE_(T) > {
+ typedef T4 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 5, GTEST_10_TUPLE_(T) > {
+ typedef T5 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 6, GTEST_10_TUPLE_(T) > {
+ typedef T6 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 7, GTEST_10_TUPLE_(T) > {
+ typedef T7 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 8, GTEST_10_TUPLE_(T) > {
+ typedef T8 type;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct TupleElement<true, 9, GTEST_10_TUPLE_(T) > {
+ typedef T9 type;
+};
+
+} // namespace gtest_internal
+
+template <>
+class tuple<> {
+ public:
+ tuple() {}
+ tuple(const tuple& /* t */) {}
+ tuple& operator=(const tuple& /* t */) { return *this; }
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+class GTEST_1_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {}
+
+ tuple(const tuple& t) : f0_(t.f0_) {}
+
+ template <GTEST_1_TYPENAMES_(U)>
+ tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_1_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_1_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_1_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ return *this;
+ }
+
+ T0 f0_;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+class GTEST_2_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0),
+ f1_(f1) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {}
+
+ template <GTEST_2_TYPENAMES_(U)>
+ tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {}
+ template <typename U0, typename U1>
+ tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_2_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_2_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+ template <typename U0, typename U1>
+ tuple& operator=(const ::std::pair<U0, U1>& p) {
+ f0_ = p.first;
+ f1_ = p.second;
+ return *this;
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_2_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+class GTEST_3_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_(), f2_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+ GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+ template <GTEST_3_TYPENAMES_(U)>
+ tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_3_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_3_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_3_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ f2_ = t.f2_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+ T2 f2_;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+class GTEST_4_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_(), f2_(), f3_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+ GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2),
+ f3_(f3) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {}
+
+ template <GTEST_4_TYPENAMES_(U)>
+ tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+ f3_(t.f3_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_4_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_4_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_4_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ f2_ = t.f2_;
+ f3_ = t.f3_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+ T2 f2_;
+ T3 f3_;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+class GTEST_5_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+ GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3,
+ GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+ f4_(t.f4_) {}
+
+ template <GTEST_5_TYPENAMES_(U)>
+ tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+ f3_(t.f3_), f4_(t.f4_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_5_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_5_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_5_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ f2_ = t.f2_;
+ f3_ = t.f3_;
+ f4_ = t.f4_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+ T2 f2_;
+ T3 f3_;
+ T4 f4_;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+class GTEST_6_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+ GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+ GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+ f5_(f5) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+ f4_(t.f4_), f5_(t.f5_) {}
+
+ template <GTEST_6_TYPENAMES_(U)>
+ tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+ f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_6_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_6_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_6_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ f2_ = t.f2_;
+ f3_ = t.f3_;
+ f4_ = t.f4_;
+ f5_ = t.f5_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+ T2 f2_;
+ T3 f3_;
+ T4 f4_;
+ T5 f5_;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+class GTEST_7_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+ GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+ GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2),
+ f3_(f3), f4_(f4), f5_(f5), f6_(f6) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+ f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+ template <GTEST_7_TYPENAMES_(U)>
+ tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+ f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_7_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_7_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_7_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ f2_ = t.f2_;
+ f3_ = t.f3_;
+ f4_ = t.f4_;
+ f5_ = t.f5_;
+ f6_ = t.f6_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+ T2 f2_;
+ T3 f3_;
+ T4 f4_;
+ T5 f5_;
+ T6 f6_;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+class GTEST_8_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+ GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+ GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6,
+ GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+ f5_(f5), f6_(f6), f7_(f7) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+ f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+ template <GTEST_8_TYPENAMES_(U)>
+ tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+ f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_8_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_8_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_8_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ f2_ = t.f2_;
+ f3_ = t.f3_;
+ f4_ = t.f4_;
+ f5_ = t.f5_;
+ f6_ = t.f6_;
+ f7_ = t.f7_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+ T2 f2_;
+ T3 f3_;
+ T4 f4_;
+ T5 f5_;
+ T6 f6_;
+ T7 f7_;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+class GTEST_9_TUPLE_(T) {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+ GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+ GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+ GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4),
+ f5_(f5), f6_(f6), f7_(f7), f8_(f8) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+ f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+ template <GTEST_9_TYPENAMES_(U)>
+ tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+ f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_9_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_9_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_9_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ f2_ = t.f2_;
+ f3_ = t.f3_;
+ f4_ = t.f4_;
+ f5_ = t.f5_;
+ f6_ = t.f6_;
+ f7_ = t.f7_;
+ f8_ = t.f8_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+ T2 f2_;
+ T3 f3_;
+ T4 f4_;
+ T5 f5_;
+ T6 f6_;
+ T7 f7_;
+ T8 f8_;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+class tuple {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(),
+ f9_() {}
+
+ explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1,
+ GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4,
+ GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7,
+ GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2),
+ f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {}
+
+ tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_),
+ f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {}
+
+ template <GTEST_10_TYPENAMES_(U)>
+ tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_),
+ f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_),
+ f9_(t.f9_) {}
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_10_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_10_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_10_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) {
+ f0_ = t.f0_;
+ f1_ = t.f1_;
+ f2_ = t.f2_;
+ f3_ = t.f3_;
+ f4_ = t.f4_;
+ f5_ = t.f5_;
+ f6_ = t.f6_;
+ f7_ = t.f7_;
+ f8_ = t.f8_;
+ f9_ = t.f9_;
+ return *this;
+ }
+
+ T0 f0_;
+ T1 f1_;
+ T2 f2_;
+ T3 f3_;
+ T4 f4_;
+ T5 f5_;
+ T6 f6_;
+ T7 f7_;
+ T8 f8_;
+ T9 f9_;
+};
+
+// 6.1.3.2 Tuple creation functions.
+
+// Known limitations: we don't support passing an
+// std::tr1::reference_wrapper<T> to make_tuple(). And we don't
+// implement tie().
+
+inline tuple<> make_tuple() { return tuple<>(); }
+
+template <GTEST_1_TYPENAMES_(T)>
+inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) {
+ return GTEST_1_TUPLE_(T)(f0);
+}
+
+template <GTEST_2_TYPENAMES_(T)>
+inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) {
+ return GTEST_2_TUPLE_(T)(f0, f1);
+}
+
+template <GTEST_3_TYPENAMES_(T)>
+inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) {
+ return GTEST_3_TUPLE_(T)(f0, f1, f2);
+}
+
+template <GTEST_4_TYPENAMES_(T)>
+inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+ const T3& f3) {
+ return GTEST_4_TUPLE_(T)(f0, f1, f2, f3);
+}
+
+template <GTEST_5_TYPENAMES_(T)>
+inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+ const T3& f3, const T4& f4) {
+ return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4);
+}
+
+template <GTEST_6_TYPENAMES_(T)>
+inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+ const T3& f3, const T4& f4, const T5& f5) {
+ return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5);
+}
+
+template <GTEST_7_TYPENAMES_(T)>
+inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+ const T3& f3, const T4& f4, const T5& f5, const T6& f6) {
+ return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6);
+}
+
+template <GTEST_8_TYPENAMES_(T)>
+inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+ const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) {
+ return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7);
+}
+
+template <GTEST_9_TYPENAMES_(T)>
+inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+ const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+ const T8& f8) {
+ return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8);
+}
+
+template <GTEST_10_TYPENAMES_(T)>
+inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2,
+ const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7,
+ const T8& f8, const T9& f9) {
+ return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9);
+}
+
+// 6.1.3.3 Tuple helper classes.
+
+template <typename Tuple> struct tuple_size;
+
+template <GTEST_0_TYPENAMES_(T)>
+struct tuple_size<GTEST_0_TUPLE_(T) > {
+ static const int value = 0;
+};
+
+template <GTEST_1_TYPENAMES_(T)>
+struct tuple_size<GTEST_1_TUPLE_(T) > {
+ static const int value = 1;
+};
+
+template <GTEST_2_TYPENAMES_(T)>
+struct tuple_size<GTEST_2_TUPLE_(T) > {
+ static const int value = 2;
+};
+
+template <GTEST_3_TYPENAMES_(T)>
+struct tuple_size<GTEST_3_TUPLE_(T) > {
+ static const int value = 3;
+};
+
+template <GTEST_4_TYPENAMES_(T)>
+struct tuple_size<GTEST_4_TUPLE_(T) > {
+ static const int value = 4;
+};
+
+template <GTEST_5_TYPENAMES_(T)>
+struct tuple_size<GTEST_5_TUPLE_(T) > {
+ static const int value = 5;
+};
+
+template <GTEST_6_TYPENAMES_(T)>
+struct tuple_size<GTEST_6_TUPLE_(T) > {
+ static const int value = 6;
+};
+
+template <GTEST_7_TYPENAMES_(T)>
+struct tuple_size<GTEST_7_TUPLE_(T) > {
+ static const int value = 7;
+};
+
+template <GTEST_8_TYPENAMES_(T)>
+struct tuple_size<GTEST_8_TUPLE_(T) > {
+ static const int value = 8;
+};
+
+template <GTEST_9_TYPENAMES_(T)>
+struct tuple_size<GTEST_9_TUPLE_(T) > {
+ static const int value = 9;
+};
+
+template <GTEST_10_TYPENAMES_(T)>
+struct tuple_size<GTEST_10_TUPLE_(T) > {
+ static const int value = 10;
+};
+
+template <int k, class Tuple>
+struct tuple_element {
+ typedef typename gtest_internal::TupleElement<
+ k < (tuple_size<Tuple>::value), k, Tuple>::type type;
+};
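+
+// Editorial illustration (not part of the generated header): with the
+// helpers above,
+//
+//   tuple_size<tuple<int, char> >::value       // 2
+//   tuple_element<1, tuple<int, char> >::type  // char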
+
+#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
+
+// 6.1.3.4 Element access.
+
+namespace gtest_internal {
+
+template <>
+class Get<0> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+ Field(Tuple& t) { return t.f0_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple))
+ ConstField(const Tuple& t) { return t.f0_; }
+};
+
+template <>
+class Get<1> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+ Field(Tuple& t) { return t.f1_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple))
+ ConstField(const Tuple& t) { return t.f1_; }
+};
+
+template <>
+class Get<2> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+ Field(Tuple& t) { return t.f2_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple))
+ ConstField(const Tuple& t) { return t.f2_; }
+};
+
+template <>
+class Get<3> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+ Field(Tuple& t) { return t.f3_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple))
+ ConstField(const Tuple& t) { return t.f3_; }
+};
+
+template <>
+class Get<4> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+ Field(Tuple& t) { return t.f4_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple))
+ ConstField(const Tuple& t) { return t.f4_; }
+};
+
+template <>
+class Get<5> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+ Field(Tuple& t) { return t.f5_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple))
+ ConstField(const Tuple& t) { return t.f5_; }
+};
+
+template <>
+class Get<6> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+ Field(Tuple& t) { return t.f6_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple))
+ ConstField(const Tuple& t) { return t.f6_; }
+};
+
+template <>
+class Get<7> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+ Field(Tuple& t) { return t.f7_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple))
+ ConstField(const Tuple& t) { return t.f7_; }
+};
+
+template <>
+class Get<8> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+ Field(Tuple& t) { return t.f8_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple))
+ ConstField(const Tuple& t) { return t.f8_; }
+};
+
+template <>
+class Get<9> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+ Field(Tuple& t) { return t.f9_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple))
+ ConstField(const Tuple& t) { return t.f9_; }
+};
+
+} // namespace gtest_internal
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
+get(GTEST_10_TUPLE_(T)& t) {
+ return gtest_internal::Get<k>::Field(t);
+}
+
+template <int k, GTEST_10_TYPENAMES_(T)>
+GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T)))
+get(const GTEST_10_TUPLE_(T)& t) {
+ return gtest_internal::Get<k>::ConstField(t);
+}
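+
+// Editorial illustration (not part of the generated header): this subset is
+// used like the TR1 original, e.g.
+//
+//   ::std::tr1::tuple<int, char> t = ::std::tr1::make_tuple(1, 'a');
+//   int i = ::std::tr1::get<0>(t);   // 1
+//   char c = ::std::tr1::get<1>(t);  // 'a'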
+
+// 6.1.3.5 Relational operators
+
+// We only implement == and !=, as we don't have a need for the rest yet.
+
+namespace gtest_internal {
+
+// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
+// first k fields of t1 equal the first k fields of t2.
+// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if
+// k1 != k2.
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;
+
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {
+ template <class Tuple1, class Tuple2>
+ static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
+ return true;
+ }
+};
+
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {
+ template <class Tuple1, class Tuple2>
+ static bool Eq(const Tuple1& t1, const Tuple2& t2) {
+ return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
+ ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
+ }
+};
+
+} // namespace gtest_internal
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator==(const GTEST_10_TUPLE_(T)& t,
+ const GTEST_10_TUPLE_(U)& u) {
+ return gtest_internal::SameSizeTuplePrefixComparator<
+ tuple_size<GTEST_10_TUPLE_(T) >::value,
+ tuple_size<GTEST_10_TUPLE_(U) >::value>::Eq(t, u);
+}
+
+template <GTEST_10_TYPENAMES_(T), GTEST_10_TYPENAMES_(U)>
+inline bool operator!=(const GTEST_10_TUPLE_(T)& t,
+ const GTEST_10_TUPLE_(U)& u) { return !(t == u); }
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+} // namespace tr1
+} // namespace std
+
+#undef GTEST_0_TUPLE_
+#undef GTEST_1_TUPLE_
+#undef GTEST_2_TUPLE_
+#undef GTEST_3_TUPLE_
+#undef GTEST_4_TUPLE_
+#undef GTEST_5_TUPLE_
+#undef GTEST_6_TUPLE_
+#undef GTEST_7_TUPLE_
+#undef GTEST_8_TUPLE_
+#undef GTEST_9_TUPLE_
+#undef GTEST_10_TUPLE_
+
+#undef GTEST_0_TYPENAMES_
+#undef GTEST_1_TYPENAMES_
+#undef GTEST_2_TYPENAMES_
+#undef GTEST_3_TYPENAMES_
+#undef GTEST_4_TYPENAMES_
+#undef GTEST_5_TYPENAMES_
+#undef GTEST_6_TYPENAMES_
+#undef GTEST_7_TYPENAMES_
+#undef GTEST_8_TYPENAMES_
+#undef GTEST_9_TYPENAMES_
+#undef GTEST_10_TYPENAMES_
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h.pump b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h.pump
new file mode 100644
index 000000000..429ddfeec
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-tuple.h.pump
@@ -0,0 +1,347 @@
+$$ -*- mode: c++; -*-
+$var n = 10 $$ Maximum number of tuple fields we want to support.
+$$ This meta comment fixes auto-indentation in Emacs. }}
+// Copyright 2009 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Implements a subset of TR1 tuple needed by Google Test and Google Mock.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
+
+#include <utility> // For ::std::pair.
+
+// The compiler used in Symbian has a bug that prevents us from declaring the
+// tuple template as a friend (it complains that tuple is redefined). This
+// hack bypasses the bug by declaring the members that should otherwise be
+// private as public.
+// Sun Studio versions < 12 also have the above bug.
+#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590)
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public:
+#else
+# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \
+ template <GTEST_$(n)_TYPENAMES_(U)> friend class tuple; \
+ private:
+#endif
+
+// Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that conflict
+// with our own definitions. Therefore using our own tuple does not work on
+// those compilers.
+#if defined(_MSC_VER) && _MSC_VER >= 1600 /* 1600 is Visual Studio 2010 */
+# error "gtest's tuple doesn't compile on Visual Studio 2010 or later. \
+GTEST_USE_OWN_TR1_TUPLE must be set to 0 on those compilers."
+#endif
+
+
+$range i 0..n-1
+$range j 0..n
+$range k 1..n
+// GTEST_n_TUPLE_(T) is the type of an n-tuple.
+#define GTEST_0_TUPLE_(T) tuple<>
+
+$for k [[
+$range m 0..k-1
+$range m2 k..n-1
+#define GTEST_$(k)_TUPLE_(T) tuple<$for m, [[T##$m]]$for m2 [[, void]]>
+
+]]
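+
+$$ Editorial note (not part of the upstream template): for k == 2 the loop
+$$ above expands to the GTEST_2_TUPLE_ definition in the checked-in
+$$ gtest-tuple.h, i.e. tuple<T##0, T##1> padded with eight trailing voids.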
+
+// GTEST_n_TYPENAMES_(T) declares a list of n typenames.
+
+$for j [[
+$range m 0..j-1
+#define GTEST_$(j)_TYPENAMES_(T) $for m, [[typename T##$m]]
+
+
+]]
+
+// In theory, defining stuff in the ::std namespace is undefined
+// behavior. We can do this as we are playing the role of a standard
+// library vendor.
+namespace std {
+namespace tr1 {
+
+template <$for i, [[typename T$i = void]]>
+class tuple;
+
+// Anything in namespace gtest_internal is Google Test's INTERNAL
+// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code.
+namespace gtest_internal {
+
+// ByRef<T>::type is T if T is a reference; otherwise it's const T&.
+template <typename T>
+struct ByRef { typedef const T& type; }; // NOLINT
+template <typename T>
+struct ByRef<T&> { typedef T& type; }; // NOLINT
+
+// A handy wrapper for ByRef.
+#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef<T>::type
+
+// AddRef<T>::type is T if T is a reference; otherwise it's T&. This
+// is the same as tr1::add_reference<T>::type.
+template <typename T>
+struct AddRef { typedef T& type; }; // NOLINT
+template <typename T>
+struct AddRef<T&> { typedef T& type; }; // NOLINT
+
+// A handy wrapper for AddRef.
+#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef<T>::type
+
+// A helper for implementing get<k>().
+template <int k> class Get;
+
+// A helper for implementing tuple_element<k, T>. kIndexValid is true
+// iff k < the number of fields in tuple type T.
+template <bool kIndexValid, int kIndex, class Tuple>
+struct TupleElement;
+
+
+$for i [[
+template <GTEST_$(n)_TYPENAMES_(T)>
+struct TupleElement<true, $i, GTEST_$(n)_TUPLE_(T) > {
+ typedef T$i type;
+};
+
+
+]]
+} // namespace gtest_internal
+
+template <>
+class tuple<> {
+ public:
+ tuple() {}
+ tuple(const tuple& /* t */) {}
+ tuple& operator=(const tuple& /* t */) { return *this; }
+};
+
+
+$for k [[
+$range m 0..k-1
+template <GTEST_$(k)_TYPENAMES_(T)>
+class $if k < n [[GTEST_$(k)_TUPLE_(T)]] $else [[tuple]] {
+ public:
+ template <int k> friend class gtest_internal::Get;
+
+ tuple() : $for m, [[f$(m)_()]] {}
+
+ explicit tuple($for m, [[GTEST_BY_REF_(T$m) f$m]]) : [[]]
+$for m, [[f$(m)_(f$m)]] {}
+
+ tuple(const tuple& t) : $for m, [[f$(m)_(t.f$(m)_)]] {}
+
+ template <GTEST_$(k)_TYPENAMES_(U)>
+ tuple(const GTEST_$(k)_TUPLE_(U)& t) : $for m, [[f$(m)_(t.f$(m)_)]] {}
+
+$if k == 2 [[
+ template <typename U0, typename U1>
+ tuple(const ::std::pair<U0, U1>& p) : f0_(p.first), f1_(p.second) {}
+
+]]
+
+ tuple& operator=(const tuple& t) { return CopyFrom(t); }
+
+ template <GTEST_$(k)_TYPENAMES_(U)>
+ tuple& operator=(const GTEST_$(k)_TUPLE_(U)& t) {
+ return CopyFrom(t);
+ }
+
+$if k == 2 [[
+ template <typename U0, typename U1>
+ tuple& operator=(const ::std::pair<U0, U1>& p) {
+ f0_ = p.first;
+ f1_ = p.second;
+ return *this;
+ }
+
+]]
+
+ GTEST_DECLARE_TUPLE_AS_FRIEND_
+
+ template <GTEST_$(k)_TYPENAMES_(U)>
+ tuple& CopyFrom(const GTEST_$(k)_TUPLE_(U)& t) {
+
+$for m [[
+ f$(m)_ = t.f$(m)_;
+
+]]
+ return *this;
+ }
+
+
+$for m [[
+ T$m f$(m)_;
+
+]]
+};
+
+
+]]
+// 6.1.3.2 Tuple creation functions.
+
+// Known limitations: we don't support passing an
+// std::tr1::reference_wrapper<T> to make_tuple(). And we don't
+// implement tie().
+
+inline tuple<> make_tuple() { return tuple<>(); }
+
+$for k [[
+$range m 0..k-1
+
+template <GTEST_$(k)_TYPENAMES_(T)>
+inline GTEST_$(k)_TUPLE_(T) make_tuple($for m, [[const T$m& f$m]]) {
+ return GTEST_$(k)_TUPLE_(T)($for m, [[f$m]]);
+}
+
+]]
+
+// 6.1.3.3 Tuple helper classes.
+
+template <typename Tuple> struct tuple_size;
+
+
+$for j [[
+template <GTEST_$(j)_TYPENAMES_(T)>
+struct tuple_size<GTEST_$(j)_TUPLE_(T) > {
+ static const int value = $j;
+};
+
+
+]]
+template <int k, class Tuple>
+struct tuple_element {
+ typedef typename gtest_internal::TupleElement<
+ k < (tuple_size<Tuple>::value), k, Tuple>::type type;
+};
+
+#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element<k, Tuple >::type
+
+// 6.1.3.4 Element access.
+
+namespace gtest_internal {
+
+
+$for i [[
+template <>
+class Get<$i> {
+ public:
+ template <class Tuple>
+ static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple))
+ Field(Tuple& t) { return t.f$(i)_; } // NOLINT
+
+ template <class Tuple>
+ static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_($i, Tuple))
+ ConstField(const Tuple& t) { return t.f$(i)_; }
+};
+
+
+]]
+} // namespace gtest_internal
+
+template <int k, GTEST_$(n)_TYPENAMES_(T)>
+GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T)))
+get(GTEST_$(n)_TUPLE_(T)& t) {
+ return gtest_internal::Get<k>::Field(t);
+}
+
+template <int k, GTEST_$(n)_TYPENAMES_(T)>
+GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_$(n)_TUPLE_(T)))
+get(const GTEST_$(n)_TUPLE_(T)& t) {
+ return gtest_internal::Get<k>::ConstField(t);
+}
+
+// 6.1.3.5 Relational operators
+
+// We only implement == and !=, as we don't have a need for the rest yet.
+
+namespace gtest_internal {
+
+// SameSizeTuplePrefixComparator<k, k>::Eq(t1, t2) returns true if the
+// first k fields of t1 equal the first k fields of t2.
+// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if
+// k1 != k2.
+template <int kSize1, int kSize2>
+struct SameSizeTuplePrefixComparator;
+
+template <>
+struct SameSizeTuplePrefixComparator<0, 0> {
+ template <class Tuple1, class Tuple2>
+ static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) {
+ return true;
+ }
+};
+
+template <int k>
+struct SameSizeTuplePrefixComparator<k, k> {
+ template <class Tuple1, class Tuple2>
+ static bool Eq(const Tuple1& t1, const Tuple2& t2) {
+ return SameSizeTuplePrefixComparator<k - 1, k - 1>::Eq(t1, t2) &&
+ ::std::tr1::get<k - 1>(t1) == ::std::tr1::get<k - 1>(t2);
+ }
+};
+
+} // namespace gtest_internal
+
+template <GTEST_$(n)_TYPENAMES_(T), GTEST_$(n)_TYPENAMES_(U)>
+inline bool operator==(const GTEST_$(n)_TUPLE_(T)& t,
+ const GTEST_$(n)_TUPLE_(U)& u) {
+ return gtest_internal::SameSizeTuplePrefixComparator<
+ tuple_size<GTEST_$(n)_TUPLE_(T) >::value,
+ tuple_size<GTEST_$(n)_TUPLE_(U) >::value>::Eq(t, u);
+}
+
+template <GTEST_$(n)_TYPENAMES_(T), GTEST_$(n)_TYPENAMES_(U)>
+inline bool operator!=(const GTEST_$(n)_TUPLE_(T)& t,
+ const GTEST_$(n)_TUPLE_(U)& u) { return !(t == u); }
+
+// 6.1.4 Pairs.
+// Unimplemented.
+
+} // namespace tr1
+} // namespace std
+
+
+$for j [[
+#undef GTEST_$(j)_TUPLE_
+
+]]
+
+
+$for j [[
+#undef GTEST_$(j)_TYPENAMES_
+
+]]
+
+#undef GTEST_DECLARE_TUPLE_AS_FRIEND_
+#undef GTEST_BY_REF_
+#undef GTEST_ADD_REF_
+#undef GTEST_TUPLE_ELEMENT_
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
new file mode 100644
index 000000000..e46f7cfcb
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h
@@ -0,0 +1,3331 @@
+// This file was GENERATED by command:
+// pump.py gtest-type-util.h.pump
+// DO NOT EDIT BY HAND!!!
+
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND!
+//
+// Currently we support at most 50 types in a list, and at most 50
+// type-parameterized tests in one type-parameterized test case.
+// Please contact googletestframework@googlegroups.com if you need
+// more.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+// #ifdef __GNUC__ is too general here. It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+# if GTEST_HAS_CXXABI_H_
+# include <cxxabi.h>
+# elif defined(__HP_aCC)
+# include <acxx_demangle.h>
+# endif  // GTEST_HAS_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// GetTypeName<T>() returns a human-readable name of type T.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+# if GTEST_HAS_RTTI
+
+ const char* const name = typeid(T).name();
+# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+ int status = 0;
+ // gcc's implementation of typeid(T).name() mangles the type name,
+ // so we have to demangle it.
+# if GTEST_HAS_CXXABI_H_
+ using abi::__cxa_demangle;
+# endif // GTEST_HAS_CXXABI_H_
+ char* const readable_name = __cxa_demangle(name, 0, 0, &status);
+ const std::string name_str(status == 0 ? readable_name : name);
+ free(readable_name);
+ return name_str;
+# else
+ return name;
+# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+# else
+
+ return "<type>";
+
+# endif // GTEST_HAS_RTTI
+}
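+
+// Editorial note: with RTTI enabled, GetTypeName<int>() typically yields
+// "int" after demangling (or the raw mangled name if demangling fails);
+// with RTTI disabled every instantiation returns the placeholder "<type>".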
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// AssertTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
+// type. This can be used as a compile-time assertion to ensure that
+// two types are equal.
+
+template <typename T1, typename T2>
+struct AssertTypeEq;
+
+template <typename T>
+struct AssertTypeEq<T, T> {
+ typedef bool type;
+};
+
+// A unique type used as the default value for the arguments of class
+// template Types. This allows us to simulate variadic templates
+// (e.g. Types<int>, Types<int, double>, etc.), which C++ doesn't
+// support directly.
+struct None {};
+
+// The following family of struct and struct templates are used to
+// represent type lists. In particular, TypesN<T1, T2, ..., TN>
+// represents a type list with N types (T1, T2, ..., and TN) in it.
+// Except for Types0, every struct in the family has two member types:
+// Head for the first type in the list, and Tail for the rest of the
+// list.
+
+// The empty type list.
+struct Types0 {};
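+
+// Editorial illustration (not upstream text): Types3<int, char, double> has
+// Head == int and Tail == Types2<char, double>; walking Tail recursively
+// ends at Types0, which marks the end of the list.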
+
+// Type lists of length 1, 2, 3, and so on.
+
+template <typename T1>
+struct Types1 {
+ typedef T1 Head;
+ typedef Types0 Tail;
+};
+template <typename T1, typename T2>
+struct Types2 {
+ typedef T1 Head;
+ typedef Types1<T2> Tail;
+};
+
+template <typename T1, typename T2, typename T3>
+struct Types3 {
+ typedef T1 Head;
+ typedef Types2<T2, T3> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types4 {
+ typedef T1 Head;
+ typedef Types3<T2, T3, T4> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types5 {
+ typedef T1 Head;
+ typedef Types4<T2, T3, T4, T5> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+struct Types6 {
+ typedef T1 Head;
+ typedef Types5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7>
+struct Types7 {
+ typedef T1 Head;
+ typedef Types6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8>
+struct Types8 {
+ typedef T1 Head;
+ typedef Types7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9>
+struct Types9 {
+ typedef T1 Head;
+ typedef Types8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types10 {
+ typedef T1 Head;
+ typedef Types9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11>
+struct Types11 {
+ typedef T1 Head;
+ typedef Types10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12>
+struct Types12 {
+ typedef T1 Head;
+ typedef Types11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13>
+struct Types13 {
+ typedef T1 Head;
+ typedef Types12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14>
+struct Types14 {
+ typedef T1 Head;
+ typedef Types13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types15 {
+ typedef T1 Head;
+ typedef Types14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16>
+struct Types16 {
+ typedef T1 Head;
+ typedef Types15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17>
+struct Types17 {
+ typedef T1 Head;
+ typedef Types16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18>
+struct Types18 {
+ typedef T1 Head;
+ typedef Types17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19>
+struct Types19 {
+ typedef T1 Head;
+ typedef Types18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types20 {
+ typedef T1 Head;
+ typedef Types19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21>
+struct Types21 {
+ typedef T1 Head;
+ typedef Types20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22>
+struct Types22 {
+ typedef T1 Head;
+ typedef Types21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23>
+struct Types23 {
+ typedef T1 Head;
+ typedef Types22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24>
+struct Types24 {
+ typedef T1 Head;
+ typedef Types23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types25 {
+ typedef T1 Head;
+ typedef Types24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26>
+struct Types26 {
+ typedef T1 Head;
+ typedef Types25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27>
+struct Types27 {
+ typedef T1 Head;
+ typedef Types26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28>
+struct Types28 {
+ typedef T1 Head;
+ typedef Types27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29>
+struct Types29 {
+ typedef T1 Head;
+ typedef Types28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types30 {
+ typedef T1 Head;
+ typedef Types29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31>
+struct Types31 {
+ typedef T1 Head;
+ typedef Types30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32>
+struct Types32 {
+ typedef T1 Head;
+ typedef Types31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33>
+struct Types33 {
+ typedef T1 Head;
+ typedef Types32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34>
+struct Types34 {
+ typedef T1 Head;
+ typedef Types33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types35 {
+ typedef T1 Head;
+ typedef Types34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36>
+struct Types36 {
+ typedef T1 Head;
+ typedef Types35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37>
+struct Types37 {
+ typedef T1 Head;
+ typedef Types36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38>
+struct Types38 {
+ typedef T1 Head;
+ typedef Types37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39>
+struct Types39 {
+ typedef T1 Head;
+ typedef Types38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types40 {
+ typedef T1 Head;
+ typedef Types39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41>
+struct Types41 {
+ typedef T1 Head;
+ typedef Types40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42>
+struct Types42 {
+ typedef T1 Head;
+ typedef Types41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43>
+struct Types43 {
+ typedef T1 Head;
+ typedef Types42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44>
+struct Types44 {
+ typedef T1 Head;
+ typedef Types43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types45 {
+ typedef T1 Head;
+ typedef Types44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46>
+struct Types46 {
+ typedef T1 Head;
+ typedef Types45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47>
+struct Types47 {
+ typedef T1 Head;
+ typedef Types46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48>
+struct Types48 {
+ typedef T1 Head;
+ typedef Types47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47, T48> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48, typename T49>
+struct Types49 {
+ typedef T1 Head;
+ typedef Types48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47, T48, T49> Tail;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48, typename T49, typename T50>
+struct Types50 {
+ typedef T1 Head;
+ typedef Types49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47, T48, T49, T50> Tail;
+};
+
+
+} // namespace internal
+
+// We don't want to require the users to write TypesN<...> directly,
+// as that would require them to count the length. Types<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Types<int>
+// will appear as Types<int, None, None, ..., None> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Types<T1, ..., TN>, and Google Test will translate
+// that to TypesN<T1, ..., TN> internally to make error messages
+// readable. The translation is done by the 'type' member of the
+// Types template.
+template <typename T1 = internal::None, typename T2 = internal::None,
+ typename T3 = internal::None, typename T4 = internal::None,
+ typename T5 = internal::None, typename T6 = internal::None,
+ typename T7 = internal::None, typename T8 = internal::None,
+ typename T9 = internal::None, typename T10 = internal::None,
+ typename T11 = internal::None, typename T12 = internal::None,
+ typename T13 = internal::None, typename T14 = internal::None,
+ typename T15 = internal::None, typename T16 = internal::None,
+ typename T17 = internal::None, typename T18 = internal::None,
+ typename T19 = internal::None, typename T20 = internal::None,
+ typename T21 = internal::None, typename T22 = internal::None,
+ typename T23 = internal::None, typename T24 = internal::None,
+ typename T25 = internal::None, typename T26 = internal::None,
+ typename T27 = internal::None, typename T28 = internal::None,
+ typename T29 = internal::None, typename T30 = internal::None,
+ typename T31 = internal::None, typename T32 = internal::None,
+ typename T33 = internal::None, typename T34 = internal::None,
+ typename T35 = internal::None, typename T36 = internal::None,
+ typename T37 = internal::None, typename T38 = internal::None,
+ typename T39 = internal::None, typename T40 = internal::None,
+ typename T41 = internal::None, typename T42 = internal::None,
+ typename T43 = internal::None, typename T44 = internal::None,
+ typename T45 = internal::None, typename T46 = internal::None,
+ typename T47 = internal::None, typename T48 = internal::None,
+ typename T49 = internal::None, typename T50 = internal::None>
+struct Types {
+ typedef internal::Types50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};
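+// Illustrative example (editorial addition, mirroring typical typed-test
+// usage): a user-written list such as
+//   typedef ::testing::Types<char, int, unsigned int> MyTypes;
+// resolves MyTypes::type to internal::Types3<char, int, unsigned int> via the
+// partial specializations below, keeping compiler diagnostics readable.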
+
+template <>
+struct Types<internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types0 type;
+};
+template <typename T1>
+struct Types<T1, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types1<T1> type;
+};
+template <typename T1, typename T2>
+struct Types<T1, T2, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types2<T1, T2> type;
+};
+template <typename T1, typename T2, typename T3>
+struct Types<T1, T2, T3, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types3<T1, T2, T3> type;
+};
+template <typename T1, typename T2, typename T3, typename T4>
+struct Types<T1, T2, T3, T4, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types4<T1, T2, T3, T4> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+struct Types<T1, T2, T3, T4, T5, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types5<T1, T2, T3, T4, T5> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6>
+struct Types<T1, T2, T3, T4, T5, T6, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types6<T1, T2, T3, T4, T5, T6> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7>
+struct Types<T1, T2, T3, T4, T5, T6, T7, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11,
+ T12> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25,
+ T26> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39,
+ T40> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, internal::None,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None, internal::None> {
+ typedef internal::Types43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None, internal::None> {
+ typedef internal::Types44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43, T44> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+ internal::None, internal::None, internal::None, internal::None,
+ internal::None> {
+ typedef internal::Types45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43, T44, T45> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+ T46, internal::None, internal::None, internal::None, internal::None> {
+ typedef internal::Types46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43, T44, T45, T46> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+ T46, T47, internal::None, internal::None, internal::None> {
+ typedef internal::Types47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43, T44, T45, T46, T47> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+ T46, T47, T48, internal::None, internal::None> {
+ typedef internal::Types48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43, T44, T45, T46, T47, T48> type;
+};
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48, typename T49>
+struct Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15,
+ T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29, T30,
+ T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44, T45,
+ T46, T47, T48, T49, internal::None> {
+ typedef internal::Types49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43, T44, T45, T46, T47, T48, T49> type;
+};
+
+namespace internal {
+
+# define GTEST_TEMPLATE_ template <typename T> class
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type. TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>. This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+ template <typename T>
+ struct Bind {
+ typedef Tmpl<T> type;
+ };
+};
+
+# define GTEST_BIND_(TmplSel, T) \
+ TmplSel::template Bind<T>::type
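
The comment above describes the selector trick only in the abstract; as a quick orientation, here is an editorial sketch (not part of this patch) of how TemplateSel and GTEST_BIND_ are typically exercised. MyFixture is a hypothetical single-parameter class template and the gtest header defining TemplateSel is assumed to be included.

    // Hypothetical usage sketch, not part of the patch.
    template <typename T> class MyFixture {};  // example fixture template

    typedef testing::internal::TemplateSel<MyFixture> Sel;
    typedef Sel::Bind<int>::type FixtureOfInt;  // the same type as MyFixture<int>
    // In dependent contexts gtest spells the same thing through the macro:
    //   GTEST_BIND_(TmplSel, T)  ==>  TmplSel::template Bind<T>::type
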
+
+// A unique struct template used as the default value for the
+// arguments of class template Templates. This allows us to simulate
+// variadic templates (e.g. Templates<int>, Templates<int, double>,
+// etc.), which C++ doesn't support directly.
+template <typename T>
+struct NoneT {};
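
This is the same default-argument trick used with internal::None in the Types<...> specializations earlier in this hunk. A hedged illustration (not part of the patch), shown with Types because its specializations are fully visible here:

    // With the trailing parameters defaulted to internal::None, the public
    // Types<...> template collapses to the exact-length TypesN list:
    typedef testing::Types<int, double, char> ThreeTypes;
    typedef ThreeTypes::type Canonical;  // testing::internal::Types3<int, double, char>
    // Templates<...>, declared further down in this header, plays the same
    // game with NoneT as the defaulted template-template argument.
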
+
+// The following family of struct and struct templates are used to
+// represent template lists. In particular, TemplatesN<T1, T2, ...,
+// TN> represents a list of N templates (T1, T2, ..., and TN). Except
+// for Templates0, every struct in the family has two member types:
+// Head for the selector of the first template in the list, and Tail
+// for the rest of the list.
+
+// The empty template list.
+struct Templates0 {};
+
+// Template lists of length 1, 2, 3, and so on.
+
+template <GTEST_TEMPLATE_ T1>
+struct Templates1 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates0 Tail;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates2 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates1<T2> Tail;
+};
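
To make the Head/Tail scheme described above concrete, a small editorial sketch (not part of the patch) that walks a two-element template list; FixtureA and FixtureB are hypothetical single-parameter templates.

    // Walking a length-2 template list via Head and Tail.
    template <typename T> class FixtureA {};
    template <typename T> class FixtureB {};

    typedef testing::internal::Templates2<FixtureA, FixtureB> List;
    typedef List::Head::Bind<int>::type A_of_int;        // FixtureA<int>
    typedef List::Tail::Head::Bind<int>::type B_of_int;  // FixtureB<int>
    // List::Tail::Tail is Templates0, the empty-list terminator.
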
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates3 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates2<T2, T3> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4>
+struct Templates4 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates3<T2, T3, T4> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates5 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates4<T2, T3, T4, T5> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates6 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates5<T2, T3, T4, T5, T6> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7>
+struct Templates7 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates6<T2, T3, T4, T5, T6, T7> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates8 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates7<T2, T3, T4, T5, T6, T7, T8> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates9 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates8<T2, T3, T4, T5, T6, T7, T8, T9> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10>
+struct Templates10 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates9<T2, T3, T4, T5, T6, T7, T8, T9, T10> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates11 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates10<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates12 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates11<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13>
+struct Templates13 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates12<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates14 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates13<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates15 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates14<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16>
+struct Templates16 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates15<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates17 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates16<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates18 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates17<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19>
+struct Templates19 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates18<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates20 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates19<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates21 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates20<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22>
+struct Templates22 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates21<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates23 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates22<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates24 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates23<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25>
+struct Templates25 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates24<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates26 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates25<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates27 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates26<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28>
+struct Templates28 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates27<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates29 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates28<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates30 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates29<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31>
+struct Templates31 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates30<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates32 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates31<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates33 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates32<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34>
+struct Templates34 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates33<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates35 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates34<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates36 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates35<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37>
+struct Templates37 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates36<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates38 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates37<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates39 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates38<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40>
+struct Templates40 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates39<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates41 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates40<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates42 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates41<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43>
+struct Templates43 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates42<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates44 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates43<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43, T44> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates45 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates44<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43, T44, T45> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46>
+struct Templates46 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates45<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43, T44, T45, T46> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates47 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates46<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43, T44, T45, T46, T47> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates48 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates47<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43, T44, T45, T46, T47, T48> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+ GTEST_TEMPLATE_ T49>
+struct Templates49 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates48<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43, T44, T45, T46, T47, T48, T49> Tail;
+};
+
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+ GTEST_TEMPLATE_ T49, GTEST_TEMPLATE_ T50>
+struct Templates50 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates49<T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42,
+ T43, T44, T45, T46, T47, T48, T49, T50> Tail;
+};
+
+
+// We don't want to require users to write TemplatesN<...> directly,
+// as that would force them to count the length of the list. Templates<...>
+// is much easier to write, but it produces unwieldy compiler error
+// messages, as gcc insists on printing out each template argument, even
+// when it has the default value (this means Templates<list> will appear
+// as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler errors).
+//
+// Our solution combines the best of both approaches: the user writes
+// Templates<T1, ..., TN>, and Google Test translates that to
+// TemplatesN<T1, ..., TN> internally to keep error messages readable.
+// The translation is done by the 'type' member of the Templates template.
+template <GTEST_TEMPLATE_ T1 = NoneT, GTEST_TEMPLATE_ T2 = NoneT,
+ GTEST_TEMPLATE_ T3 = NoneT, GTEST_TEMPLATE_ T4 = NoneT,
+ GTEST_TEMPLATE_ T5 = NoneT, GTEST_TEMPLATE_ T6 = NoneT,
+ GTEST_TEMPLATE_ T7 = NoneT, GTEST_TEMPLATE_ T8 = NoneT,
+ GTEST_TEMPLATE_ T9 = NoneT, GTEST_TEMPLATE_ T10 = NoneT,
+ GTEST_TEMPLATE_ T11 = NoneT, GTEST_TEMPLATE_ T12 = NoneT,
+ GTEST_TEMPLATE_ T13 = NoneT, GTEST_TEMPLATE_ T14 = NoneT,
+ GTEST_TEMPLATE_ T15 = NoneT, GTEST_TEMPLATE_ T16 = NoneT,
+ GTEST_TEMPLATE_ T17 = NoneT, GTEST_TEMPLATE_ T18 = NoneT,
+ GTEST_TEMPLATE_ T19 = NoneT, GTEST_TEMPLATE_ T20 = NoneT,
+ GTEST_TEMPLATE_ T21 = NoneT, GTEST_TEMPLATE_ T22 = NoneT,
+ GTEST_TEMPLATE_ T23 = NoneT, GTEST_TEMPLATE_ T24 = NoneT,
+ GTEST_TEMPLATE_ T25 = NoneT, GTEST_TEMPLATE_ T26 = NoneT,
+ GTEST_TEMPLATE_ T27 = NoneT, GTEST_TEMPLATE_ T28 = NoneT,
+ GTEST_TEMPLATE_ T29 = NoneT, GTEST_TEMPLATE_ T30 = NoneT,
+ GTEST_TEMPLATE_ T31 = NoneT, GTEST_TEMPLATE_ T32 = NoneT,
+ GTEST_TEMPLATE_ T33 = NoneT, GTEST_TEMPLATE_ T34 = NoneT,
+ GTEST_TEMPLATE_ T35 = NoneT, GTEST_TEMPLATE_ T36 = NoneT,
+ GTEST_TEMPLATE_ T37 = NoneT, GTEST_TEMPLATE_ T38 = NoneT,
+ GTEST_TEMPLATE_ T39 = NoneT, GTEST_TEMPLATE_ T40 = NoneT,
+ GTEST_TEMPLATE_ T41 = NoneT, GTEST_TEMPLATE_ T42 = NoneT,
+ GTEST_TEMPLATE_ T43 = NoneT, GTEST_TEMPLATE_ T44 = NoneT,
+ GTEST_TEMPLATE_ T45 = NoneT, GTEST_TEMPLATE_ T46 = NoneT,
+ GTEST_TEMPLATE_ T47 = NoneT, GTEST_TEMPLATE_ T48 = NoneT,
+ GTEST_TEMPLATE_ T49 = NoneT, GTEST_TEMPLATE_ T50 = NoneT>
+struct Templates {
+ typedef Templates50<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42, T43, T44, T45, T46, T47, T48, T49, T50> type;
+};
+
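+// Editorial sketch (not from the upstream generator): assuming two
+// hypothetical placeholder class templates FixtureA and FixtureB, a
+// user-side list such as
+//
+//   typedef Templates<FixtureA, FixtureB>::type MyTemplateList;
+//
+// resolves to Templates2<FixtureA, FixtureB> via the partial
+// specializations below, so diagnostics only mention the arguments the
+// user actually supplied.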
+template <>
+struct Templates<NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT> {
+ typedef Templates0 type;
+};
+template <GTEST_TEMPLATE_ T1>
+struct Templates<T1, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT> {
+ typedef Templates1<T1> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2>
+struct Templates<T1, T2, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT> {
+ typedef Templates2<T1, T2> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3>
+struct Templates<T1, T2, T3, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates3<T1, T2, T3> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4>
+struct Templates<T1, T2, T3, T4, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates4<T1, T2, T3, T4> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5>
+struct Templates<T1, T2, T3, T4, T5, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates5<T1, T2, T3, T4, T5> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6>
+struct Templates<T1, T2, T3, T4, T5, T6, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates6<T1, T2, T3, T4, T5, T6> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates7<T1, T2, T3, T4, T5, T6, T7> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates8<T1, T2, T3, T4, T5, T6, T7, T8> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates9<T1, T2, T3, T4, T5, T6, T7, T8, T9> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates10<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates11<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates12<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates13<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates14<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates15<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates16<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates17<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates18<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates19<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates20<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates21<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT> {
+ typedef Templates22<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT> {
+ typedef Templates23<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT> {
+ typedef Templates24<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT> {
+ typedef Templates25<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT> {
+ typedef Templates26<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT> {
+ typedef Templates27<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT> {
+ typedef Templates28<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT> {
+ typedef Templates29<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates30<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates31<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates32<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates33<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates34<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates35<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates36<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, NoneT, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates37<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, NoneT, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates38<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates39<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, NoneT, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates40<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, NoneT, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates41<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, NoneT,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates42<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates43<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42, T43> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+ NoneT, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates44<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42, T43, T44> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+ T45, NoneT, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates45<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42, T43, T44, T45> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+ T45, T46, NoneT, NoneT, NoneT, NoneT> {
+ typedef Templates46<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42, T43, T44, T45, T46> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+ T45, T46, T47, NoneT, NoneT, NoneT> {
+ typedef Templates47<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42, T43, T44, T45, T46, T47> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+ T45, T46, T47, T48, NoneT, NoneT> {
+ typedef Templates48<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42, T43, T44, T45, T46, T47, T48> type;
+};
+template <GTEST_TEMPLATE_ T1, GTEST_TEMPLATE_ T2, GTEST_TEMPLATE_ T3,
+ GTEST_TEMPLATE_ T4, GTEST_TEMPLATE_ T5, GTEST_TEMPLATE_ T6,
+ GTEST_TEMPLATE_ T7, GTEST_TEMPLATE_ T8, GTEST_TEMPLATE_ T9,
+ GTEST_TEMPLATE_ T10, GTEST_TEMPLATE_ T11, GTEST_TEMPLATE_ T12,
+ GTEST_TEMPLATE_ T13, GTEST_TEMPLATE_ T14, GTEST_TEMPLATE_ T15,
+ GTEST_TEMPLATE_ T16, GTEST_TEMPLATE_ T17, GTEST_TEMPLATE_ T18,
+ GTEST_TEMPLATE_ T19, GTEST_TEMPLATE_ T20, GTEST_TEMPLATE_ T21,
+ GTEST_TEMPLATE_ T22, GTEST_TEMPLATE_ T23, GTEST_TEMPLATE_ T24,
+ GTEST_TEMPLATE_ T25, GTEST_TEMPLATE_ T26, GTEST_TEMPLATE_ T27,
+ GTEST_TEMPLATE_ T28, GTEST_TEMPLATE_ T29, GTEST_TEMPLATE_ T30,
+ GTEST_TEMPLATE_ T31, GTEST_TEMPLATE_ T32, GTEST_TEMPLATE_ T33,
+ GTEST_TEMPLATE_ T34, GTEST_TEMPLATE_ T35, GTEST_TEMPLATE_ T36,
+ GTEST_TEMPLATE_ T37, GTEST_TEMPLATE_ T38, GTEST_TEMPLATE_ T39,
+ GTEST_TEMPLATE_ T40, GTEST_TEMPLATE_ T41, GTEST_TEMPLATE_ T42,
+ GTEST_TEMPLATE_ T43, GTEST_TEMPLATE_ T44, GTEST_TEMPLATE_ T45,
+ GTEST_TEMPLATE_ T46, GTEST_TEMPLATE_ T47, GTEST_TEMPLATE_ T48,
+ GTEST_TEMPLATE_ T49>
+struct Templates<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14,
+ T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28, T29,
+ T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43, T44,
+ T45, T46, T47, T48, T49, NoneT> {
+ typedef Templates49<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27,
+ T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41,
+ T42, T43, T44, T45, T46, T47, T48, T49> type;
+};
+
+// The TypeList template makes it possible to use either a single type
+// or a Types<...> list in TYPED_TEST_CASE() and
+// INSTANTIATE_TYPED_TEST_CASE_P().
+
+template <typename T>
+struct TypeList {
+ typedef Types1<T> type;
+};
+
+template <typename T1, typename T2, typename T3, typename T4, typename T5,
+ typename T6, typename T7, typename T8, typename T9, typename T10,
+ typename T11, typename T12, typename T13, typename T14, typename T15,
+ typename T16, typename T17, typename T18, typename T19, typename T20,
+ typename T21, typename T22, typename T23, typename T24, typename T25,
+ typename T26, typename T27, typename T28, typename T29, typename T30,
+ typename T31, typename T32, typename T33, typename T34, typename T35,
+ typename T36, typename T37, typename T38, typename T39, typename T40,
+ typename T41, typename T42, typename T43, typename T44, typename T45,
+ typename T46, typename T47, typename T48, typename T49, typename T50>
+struct TypeList<Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13,
+ T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26, T27, T28,
+ T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40, T41, T42, T43,
+ T44, T45, T46, T47, T48, T49, T50> > {
+ typedef typename Types<T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12,
+ T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, T23, T24, T25, T26,
+ T27, T28, T29, T30, T31, T32, T33, T34, T35, T36, T37, T38, T39, T40,
+ T41, T42, T43, T44, T45, T46, T47, T48, T49, T50>::type type;
+};
+
+#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
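For context, the TypeList/Types machinery generated above is what lets the typed-test macros accept either a single type or a Types<...> list. A minimal usage sketch, not part of the diff and with a hypothetical fixture and test name, could look like this:

#include "gtest/gtest.h"

template <typename T>
class WidgetTest : public ::testing::Test {};  // hypothetical fixture

// Types<char, int, double> is reduced to internal::Types3<...> through
// TypeList, which keeps compiler error messages readable.
typedef ::testing::Types<char, int, double> WidgetTypes;
TYPED_TEST_CASE(WidgetTest, WidgetTypes);

TYPED_TEST(WidgetTest, IsDefaultConstructible) {
  TypeParam value = TypeParam();  // TypeParam is char, int and double in turn
  (void)value;
}

TYPED_TEST_CASE also accepts a bare type (e.g. TYPED_TEST_CASE(WidgetTest, int)), which is exactly what the unspecialized TypeList<T> above exists for.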
diff --git a/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h.pump b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h.pump
new file mode 100644
index 000000000..251fdf025
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h.pump
@@ -0,0 +1,297 @@
+$$ -*- mode: c++; -*-
+$var n = 50 $$ Maximum length of type lists we want to support.
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Type utilities needed for implementing typed and type-parameterized
+// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND!
+//
+// Currently we support at most $n types in a list, and at most $n
+// type-parameterized tests in one type-parameterized test case.
+// Please contact googletestframework@googlegroups.com if you need
+// more.
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+// #ifdef __GNUC__ is too general here. It is possible to use gcc without using
+// libstdc++ (which is where cxxabi.h comes from).
+# if GTEST_HAS_CXXABI_H_
+# include <cxxabi.h>
+# elif defined(__HP_aCC)
+# include <acxx_demangle.h>
+# endif // GTEST_HAS_CXXABI_H_
+
+namespace testing {
+namespace internal {
+
+// GetTypeName<T>() returns a human-readable name of type T.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+# if GTEST_HAS_RTTI
+
+ const char* const name = typeid(T).name();
+# if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+ int status = 0;
+ // gcc's implementation of typeid(T).name() mangles the type name,
+ // so we have to demangle it.
+# if GTEST_HAS_CXXABI_H_
+ using abi::__cxa_demangle;
+# endif // GTEST_HAS_CXXABI_H_
+ char* const readable_name = __cxa_demangle(name, 0, 0, &status);
+ const std::string name_str(status == 0 ? readable_name : name);
+ free(readable_name);
+ return name_str;
+# else
+ return name;
+# endif // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+# else
+
+ return "<type>";
+
+# endif // GTEST_HAS_RTTI
+}
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// AssertTypeEq<T1, T2>::type is defined iff T1 and T2 are the same
+// type. This can be used as a compile-time assertion to ensure that
+// two types are equal.
+
+template <typename T1, typename T2>
+struct AssertTypeEq;
+
+template <typename T>
+struct AssertTypeEq<T, T> {
+ typedef bool type;
+};
+
+// A unique type used as the default value for the arguments of class
+// template Types. This allows us to simulate variadic templates
+// (e.g. Types<int>, Types<int, double>, etc.), which C++ doesn't
+// support directly.
+struct None {};
+
+// The following family of struct and struct templates are used to
+// represent type lists. In particular, TypesN<T1, T2, ..., TN>
+// represents a type list with N types (T1, T2, ..., and TN) in it.
+// Except for Types0, every struct in the family has two member types:
+// Head for the first type in the list, and Tail for the rest of the
+// list.
+
+// The empty type list.
+struct Types0 {};
+
+// Type lists of length 1, 2, 3, and so on.
+
+template <typename T1>
+struct Types1 {
+ typedef T1 Head;
+ typedef Types0 Tail;
+};
+
+$range i 2..n
+
+$for i [[
+$range j 1..i
+$range k 2..i
+template <$for j, [[typename T$j]]>
+struct Types$i {
+ typedef T1 Head;
+ typedef Types$(i-1)<$for k, [[T$k]]> Tail;
+};
+
+
+]]
+
+} // namespace internal
+
+// We don't want to require the users to write TypesN<...> directly,
+// as that would require them to count the length. Types<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Types<int>
+// will appear as Types<int, None, None, ..., None> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Types<T1, ..., TN>, and Google Test will translate
+// that to TypesN<T1, ..., TN> internally to make error messages
+// readable. The translation is done by the 'type' member of the
+// Types template.
+
+$range i 1..n
+template <$for i, [[typename T$i = internal::None]]>
+struct Types {
+ typedef internal::Types$n<$for i, [[T$i]]> type;
+};
+
+template <>
+struct Types<$for i, [[internal::None]]> {
+ typedef internal::Types0 type;
+};
+
+$range i 1..n-1
+$for i [[
+$range j 1..i
+$range k i+1..n
+template <$for j, [[typename T$j]]>
+struct Types<$for j, [[T$j]]$for k[[, internal::None]]> {
+ typedef internal::Types$i<$for j, [[T$j]]> type;
+};
+
+]]
+
+namespace internal {
+
+# define GTEST_TEMPLATE_ template <typename T> class
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type. TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>. This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+ template <typename T>
+ struct Bind {
+ typedef Tmpl<T> type;
+ };
+};
+
+# define GTEST_BIND_(TmplSel, T) \
+ TmplSel::template Bind<T>::type
+
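To make the TemplateSel/GTEST_BIND_ trick above concrete, a small sketch that is not part of the diff; MyVector is a hypothetical class template:

template <typename T> class MyVector {};  // hypothetical class template

// TemplateSel "typedefs" the template itself; GTEST_BIND_ re-applies it to a type.
typedef testing::internal::TemplateSel<MyVector> VectorSel;
typedef GTEST_BIND_(VectorSel, int) VectorOfInt;  // VectorOfInt is MyVector<int>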
+// A unique struct template used as the default value for the
+// arguments of class template Templates. This allows us to simulate
+// variadic templates (e.g. Templates<int>, Templates<int, double>,
+// etc.), which C++ doesn't support directly.
+template <typename T>
+struct NoneT {};
+
+// The following family of struct and struct templates are used to
+// represent template lists. In particular, TemplatesN<T1, T2, ...,
+// TN> represents a list of N templates (T1, T2, ..., and TN). Except
+// for Templates0, every struct in the family has two member types:
+// Head for the selector of the first template in the list, and Tail
+// for the rest of the list.
+
+// The empty template list.
+struct Templates0 {};
+
+// Template lists of length 1, 2, 3, and so on.
+
+template <GTEST_TEMPLATE_ T1>
+struct Templates1 {
+ typedef TemplateSel<T1> Head;
+ typedef Templates0 Tail;
+};
+
+$range i 2..n
+
+$for i [[
+$range j 1..i
+$range k 2..i
+template <$for j, [[GTEST_TEMPLATE_ T$j]]>
+struct Templates$i {
+ typedef TemplateSel<T1> Head;
+ typedef Templates$(i-1)<$for k, [[T$k]]> Tail;
+};
+
+
+]]
+
+// We don't want to require the users to write TemplatesN<...> directly,
+// as that would require them to count the length. Templates<...> is much
+// easier to write, but generates horrible messages when there is a
+// compiler error, as gcc insists on printing out each template
+// argument, even if it has the default value (this means Templates<list>
+// will appear as Templates<list, NoneT, NoneT, ..., NoneT> in the compiler
+// errors).
+//
+// Our solution is to combine the best part of the two approaches: a
+// user would write Templates<T1, ..., TN>, and Google Test will translate
+// that to TemplatesN<T1, ..., TN> internally to make error messages
+// readable. The translation is done by the 'type' member of the
+// Templates template.
+
+$range i 1..n
+template <$for i, [[GTEST_TEMPLATE_ T$i = NoneT]]>
+struct Templates {
+ typedef Templates$n<$for i, [[T$i]]> type;
+};
+
+template <>
+struct Templates<$for i, [[NoneT]]> {
+ typedef Templates0 type;
+};
+
+$range i 1..n-1
+$for i [[
+$range j 1..i
+$range k i+1..n
+template <$for j, [[GTEST_TEMPLATE_ T$j]]>
+struct Templates<$for j, [[T$j]]$for k[[, NoneT]]> {
+ typedef Templates$i<$for j, [[T$j]]> type;
+};
+
+]]
+
+// The TypeList template makes it possible to use either a single type
+// or a Types<...> list in TYPED_TEST_CASE() and
+// INSTANTIATE_TYPED_TEST_CASE_P().
+
+template <typename T>
+struct TypeList {
+ typedef Types1<T> type;
+};
+
+
+$range i 1..n
+template <$for i, [[typename T$i]]>
+struct TypeList<Types<$for i, [[T$i]]> > {
+ typedef typename Types<$for i, [[T$i]]>::type type;
+};
+
+#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
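For readers unfamiliar with the pump templating directives used above ($var, $range, $for and [[...]] blocks), each $for block is stamped out once per value of its loop variable to produce the repetitive structs seen in the generated header. As a rough illustration, the Types$i block expands for i = 3 to:

template <typename T1, typename T2, typename T3>
struct Types3 {
  typedef T1 Head;
  typedef Types2<T2, T3> Tail;
};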
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc
new file mode 100644
index 000000000..0a9cee522
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-all.cc
@@ -0,0 +1,48 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// Google C++ Testing Framework (Google Test)
+//
+// Sometimes it's desirable to build Google Test by compiling a single file.
+// This file serves this purpose.
+
+// This line ensures that gtest.h can be compiled on its own, even
+// when it's fused.
+#include "gtest/gtest.h"
+
+// The following lines pull in the real gtest *.cc files.
+#include "src/gtest.cc"
+#include "src/gtest-death-test.cc"
+#include "src/gtest-filepath.cc"
+#include "src/gtest-port.cc"
+#include "src/gtest-printers.cc"
+#include "src/gtest-test-part.cc"
+#include "src/gtest-typed-test.cc"
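As a usage sketch that is not part of the diff: a fused build typically compiles gtest-all.cc together with the test sources and a main() like the one below. InitGoogleTest() and RUN_ALL_TESTS() are the standard gtest entry points; the smoke test itself is hypothetical.

#include "gtest/gtest.h"

// Hypothetical smoke test; any TEST() linked into the binary is discovered
// and run by RUN_ALL_TESTS().
TEST(FusedBuildTest, ArithmeticStillWorks) {
  EXPECT_EQ(4, 2 + 2);
}

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);  // parses and removes --gtest_* flags
  return RUN_ALL_TESTS();
}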
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc
new file mode 100644
index 000000000..a01a36983
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-death-test.cc
@@ -0,0 +1,1342 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev)
+//
+// This file implements death tests.
+
+#include "gtest/gtest-death-test.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/custom/gtest.h"
+
+#if GTEST_HAS_DEATH_TEST
+
+# if GTEST_OS_MAC
+# include <crt_externs.h>
+# endif // GTEST_OS_MAC
+
+# include <errno.h>
+# include <fcntl.h>
+# include <limits.h>
+
+# if GTEST_OS_LINUX
+# include <signal.h>
+# endif // GTEST_OS_LINUX
+
+# include <stdarg.h>
+
+# if GTEST_OS_WINDOWS
+# include <windows.h>
+# else
+# include <sys/mman.h>
+# include <sys/wait.h>
+# endif // GTEST_OS_WINDOWS
+
+# if GTEST_OS_QNX
+# include <spawn.h>
+# endif // GTEST_OS_QNX
+
+#endif // GTEST_HAS_DEATH_TEST
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation. It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error. This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+static const char kDefaultDeathTestStyle[] = "fast";
+
+GTEST_DEFINE_string_(
+ death_test_style,
+ internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+ "Indicates how to run a death test in a forked child process: "
+ "\"threadsafe\" (child process re-executes the test binary "
+ "from the beginning, running only the specific death test) or "
+ "\"fast\" (child process runs the death test immediately "
+ "after forking).");
+
+GTEST_DEFINE_bool_(
+ death_test_use_fork,
+ internal::BoolFromGTestEnv("death_test_use_fork", false),
+ "Instructs to use fork()/_exit() instead of clone() in death tests. "
+ "Ignored and always uses fork() on POSIX systems where clone() is not "
+ "implemented. Useful when running under valgrind or similar tools if "
+ "those do not support clone(). Valgrind 3.3.1 will just fail if "
+ "it sees an unsupported combination of clone() flags. "
+ "It is not recommended to use this flag w/o valgrind though it will "
+ "work in 99% of the cases. Once valgrind is fixed, this flag will "
+ "most likely be removed.");
+
+namespace internal {
+GTEST_DEFINE_string_(
+ internal_run_death_test, "",
+ "Indicates the file, line number, temporal index of "
+ "the single death test to run, and a file descriptor to "
+ "which a success code may be sent, all separated by "
+ "the '|' characters. This flag is specified if and only if the current "
+ "process is a sub-process launched for running a thread-safe "
+ "death test. FOR INTERNAL USE ONLY.");
+} // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+# if !GTEST_OS_WINDOWS
+static bool g_in_fast_death_test_child = false;
+# endif
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process. Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests. IMPORTANT: This is an internal utility. Using it may break the
+// implementation of death tests. User code MUST NOT use it.
+bool InDeathTestChild() {
+# if GTEST_OS_WINDOWS
+
+ // On Windows, death tests are thread-safe regardless of the value of the
+ // death_test_style flag.
+ return !GTEST_FLAG(internal_run_death_test).empty();
+
+# else
+
+ if (GTEST_FLAG(death_test_style) == "threadsafe")
+ return !GTEST_FLAG(internal_run_death_test).empty();
+ else
+ return g_in_fast_death_test_child;
+#endif
+}
+
+} // namespace internal
+
+// ExitedWithCode constructor.
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
+}
+
+// ExitedWithCode function-call operator.
+bool ExitedWithCode::operator()(int exit_status) const {
+# if GTEST_OS_WINDOWS
+
+ return exit_status == exit_code_;
+
+# else
+
+ return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
+
+# endif // GTEST_OS_WINDOWS
+}
+
+# if !GTEST_OS_WINDOWS
+// KilledBySignal constructor.
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
+}
+
+// KilledBySignal function-call operator.
+bool KilledBySignal::operator()(int exit_status) const {
+# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+ {
+ bool result;
+ if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
+ return result;
+ }
+ }
+# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+ return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
+}
+# endif // !GTEST_OS_WINDOWS
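A minimal sketch, not part of the diff, of how ExitedWithCode and KilledBySignal serve as the predicate argument of EXPECT_EXIT; the function under test is hypothetical:

#include <csignal>
#include <cstdio>
#include <cstdlib>

#include "gtest/gtest.h"

// Hypothetical function under test.
void CrashOnNegative(int x) {
  if (x < 0) {
    std::fprintf(stderr, "negative input\n");
    std::exit(3);
  }
}

#if GTEST_HAS_DEATH_TEST
TEST(CrashDeathTest, ExitsWithCode3) {
  EXPECT_EXIT(CrashOnNegative(-1), ::testing::ExitedWithCode(3),
              "negative input");
}

# if !GTEST_OS_WINDOWS
TEST(CrashDeathTest, DiesBySignal) {
  EXPECT_EXIT(std::raise(SIGKILL), ::testing::KilledBySignal(SIGKILL), "");
}
# endif
#endif  // GTEST_HAS_DEATH_TEST

Naming the test case with the DeathTest suffix follows the gtest convention that causes death tests to run before other tests.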
+
+namespace internal {
+
+// Utilities needed for death tests.
+
+// Generates a textual description of a given exit code, in the format
+// specified by wait(2).
+static std::string ExitSummary(int exit_code) {
+ Message m;
+
+# if GTEST_OS_WINDOWS
+
+ m << "Exited with exit status " << exit_code;
+
+# else
+
+ if (WIFEXITED(exit_code)) {
+ m << "Exited with exit status " << WEXITSTATUS(exit_code);
+ } else if (WIFSIGNALED(exit_code)) {
+ m << "Terminated by signal " << WTERMSIG(exit_code);
+ }
+# ifdef WCOREDUMP
+ if (WCOREDUMP(exit_code)) {
+ m << " (core dumped)";
+ }
+# endif
+# endif // GTEST_OS_WINDOWS
+
+ return m.GetString();
+}
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+bool ExitedUnsuccessfully(int exit_status) {
+ return !ExitedWithCode(0)(exit_status);
+}
+
+# if !GTEST_OS_WINDOWS
+// Generates a textual failure message when a death test finds more than
+// one thread running, or cannot determine the number of threads, prior
+// to executing the given statement. It is the responsibility of the
+// caller not to pass a thread_count of 1.
+static std::string DeathTestThreadWarning(size_t thread_count) {
+ Message msg;
+ msg << "Death tests use fork(), which is unsafe particularly"
+ << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
+ if (thread_count == 0)
+ msg << "couldn't detect the number of threads.";
+ else
+ msg << "detected " << thread_count << " threads.";
+ return msg.GetString();
+}
+# endif // !GTEST_OS_WINDOWS
+
+// Flag characters for reporting a death test that did not die.
+static const char kDeathTestLived = 'L';
+static const char kDeathTestReturned = 'R';
+static const char kDeathTestThrew = 'T';
+static const char kDeathTestInternalError = 'I';
+
+// An enumeration describing all of the possible ways that a death test can
+// conclude. DIED means that the process died while executing the test
+// code; LIVED means that process lived beyond the end of the test code;
+// RETURNED means that the test statement attempted to execute a return
+// statement, which is not allowed; THREW means that the test statement
+// returned control by throwing an exception. IN_PROGRESS means the test
+// has not yet concluded.
+// TODO(vladl@google.com): Unify names and possibly values for
+// AbortReason, DeathTestOutcome, and flag characters above.
+enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };
+
+// Routine for aborting the program which is safe to call from an
+// exec-style death test child process, in which case the error
+// message is propagated back to the parent process. Otherwise, the
+// message is simply printed to stderr. In either case, the program
+// then exits with status 1.
+void DeathTestAbort(const std::string& message) {
+ // On a POSIX system, this function may be called from a threadsafe-style
+ // death test child process, which operates on a very small stack. Use
+ // the heap for any additional non-minuscule memory requirements.
+ const InternalRunDeathTestFlag* const flag =
+ GetUnitTestImpl()->internal_run_death_test_flag();
+ if (flag != NULL) {
+ FILE* parent = posix::FDOpen(flag->write_fd(), "w");
+ fputc(kDeathTestInternalError, parent);
+ fprintf(parent, "%s", message.c_str());
+ fflush(parent);
+ _exit(1);
+ } else {
+ fprintf(stderr, "%s", message.c_str());
+ fflush(stderr);
+ posix::Abort();
+ }
+}
+
+// A replacement for CHECK that calls DeathTestAbort if the assertion
+// fails.
+# define GTEST_DEATH_TEST_CHECK_(expression) \
+ do { \
+ if (!::testing::internal::IsTrue(expression)) { \
+ DeathTestAbort( \
+ ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
+ + ::testing::internal::StreamableToString(__LINE__) + ": " \
+ + #expression); \
+ } \
+ } while (::testing::internal::AlwaysFalse())
+
+// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
+// evaluating any system call that fulfills two conditions: it must return
+// -1 on failure, and set errno to EINTR when it is interrupted and
+// should be tried again. The macro expands to a loop that repeatedly
+// evaluates the expression as long as it evaluates to -1 and sets
+// errno to EINTR. If the expression evaluates to -1 but errno is
+// something other than EINTR, DeathTestAbort is called.
+# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+ do { \
+ int gtest_retval; \
+ do { \
+ gtest_retval = (expression); \
+ } while (gtest_retval == -1 && errno == EINTR); \
+ if (gtest_retval == -1) { \
+ DeathTestAbort( \
+ ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
+ + ::testing::internal::StreamableToString(__LINE__) + ": " \
+ + #expression + " != -1"); \
+ } \
+ } while (::testing::internal::AlwaysFalse())
+
+// Returns the message describing the last system error in errno.
+std::string GetLastErrnoDescription() {
+ return errno == 0 ? "" : posix::StrError(errno);
+}
+
+// This is called from a death test parent process to read a failure
+// message from the death test child process and log it with the FATAL
+// severity. On Windows, the message is read from a pipe handle. On other
+// platforms, it is read from a file descriptor.
+static void FailFromInternalError(int fd) {
+ Message error;
+ char buffer[256];
+ int num_read;
+
+ do {
+ while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+ buffer[num_read] = '\0';
+ error << buffer;
+ }
+ } while (num_read == -1 && errno == EINTR);
+
+ if (num_read == 0) {
+ GTEST_LOG_(FATAL) << error.GetString();
+ } else {
+ const int last_error = errno;
+ GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+ << GetLastErrnoDescription() << " [" << last_error << "]";
+ }
+}
+
+// Death test constructor. Increments the running death test count
+// for the current test.
+DeathTest::DeathTest() {
+ TestInfo* const info = GetUnitTestImpl()->current_test_info();
+ if (info == NULL) {
+ DeathTestAbort("Cannot run a death test outside of a TEST or "
+ "TEST_F construct");
+ }
+}
+
+// Creates and returns a death test by dispatching to the current
+// death test factory.
+bool DeathTest::Create(const char* statement, const RE* regex,
+ const char* file, int line, DeathTest** test) {
+ return GetUnitTestImpl()->death_test_factory()->Create(
+ statement, regex, file, line, test);
+}
+
+const char* DeathTest::LastMessage() {
+ return last_death_test_message_.c_str();
+}
+
+void DeathTest::set_last_death_test_message(const std::string& message) {
+ last_death_test_message_ = message;
+}
+
+std::string DeathTest::last_death_test_message_;
+
+// Provides cross platform implementation for some death functionality.
+class DeathTestImpl : public DeathTest {
+ protected:
+ DeathTestImpl(const char* a_statement, const RE* a_regex)
+ : statement_(a_statement),
+ regex_(a_regex),
+ spawned_(false),
+ status_(-1),
+ outcome_(IN_PROGRESS),
+ read_fd_(-1),
+ write_fd_(-1) {}
+
+ // read_fd_ is expected to be closed and cleared by a derived class.
+ ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+ void Abort(AbortReason reason);
+ virtual bool Passed(bool status_ok);
+
+ const char* statement() const { return statement_; }
+ const RE* regex() const { return regex_; }
+ bool spawned() const { return spawned_; }
+ void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+ int status() const { return status_; }
+ void set_status(int a_status) { status_ = a_status; }
+ DeathTestOutcome outcome() const { return outcome_; }
+ void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+ int read_fd() const { return read_fd_; }
+ void set_read_fd(int fd) { read_fd_ = fd; }
+ int write_fd() const { return write_fd_; }
+ void set_write_fd(int fd) { write_fd_ = fd; }
+
+ // Called in the parent process only. Reads the result code of the death
+ // test child process via a pipe, interprets it to set the outcome_
+ // member, and closes read_fd_. Outputs diagnostics and terminates in
+ // case of unexpected codes.
+ void ReadAndInterpretStatusByte();
+
+ private:
+ // The textual content of the code this object is testing. This class
+ // doesn't own this string and should not attempt to delete it.
+ const char* const statement_;
+ // The regular expression which test output must match. DeathTestImpl
+ // doesn't own this object and should not attempt to delete it.
+ const RE* const regex_;
+ // True if the death test child process has been successfully spawned.
+ bool spawned_;
+ // The exit status of the child process.
+ int status_;
+ // How the death test concluded.
+ DeathTestOutcome outcome_;
+ // Descriptor to the read end of the pipe to the child process. It is
+ // always -1 in the child process. The child keeps its write end of the
+ // pipe in write_fd_.
+ int read_fd_;
+ // Descriptor to the child's write end of the pipe to the parent process.
+ // It is always -1 in the parent process. The parent keeps its end of the
+ // pipe in read_fd_.
+ int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_. Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+ char flag;
+ int bytes_read;
+
+ // The read() here blocks until data is available (signifying the
+ // failure of the death test) or until the pipe is closed (signifying
+ // its success), so it's okay to call this in the parent before
+ // the child process has exited.
+ do {
+ bytes_read = posix::Read(read_fd(), &flag, 1);
+ } while (bytes_read == -1 && errno == EINTR);
+
+ if (bytes_read == 0) {
+ set_outcome(DIED);
+ } else if (bytes_read == 1) {
+ switch (flag) {
+ case kDeathTestReturned:
+ set_outcome(RETURNED);
+ break;
+ case kDeathTestThrew:
+ set_outcome(THREW);
+ break;
+ case kDeathTestLived:
+ set_outcome(LIVED);
+ break;
+ case kDeathTestInternalError:
+ FailFromInternalError(read_fd()); // Does not return.
+ break;
+ default:
+ GTEST_LOG_(FATAL) << "Death test child process reported "
+ << "unexpected status byte ("
+ << static_cast<unsigned int>(flag) << ")";
+ }
+ } else {
+ GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+ << GetLastErrnoDescription();
+ }
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+ set_read_fd(-1);
+}
+
+// Signals that the death test code which should have exited, didn't.
+// Should be called only in a death test child process.
+// Writes a status byte to the child's status file descriptor, then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+ // The parent process considers the death test to be a failure if
+ // it finds any data in our pipe. So, here we write a single flag byte
+ // to the pipe, then exit.
+ const char status_ch =
+ reason == TEST_DID_NOT_DIE ? kDeathTestLived :
+ reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+ // We are leaking the descriptor here because on some platforms (i.e.,
+ // when built as Windows DLL), destructors of global objects will still
+ // run after calling _exit(). On such systems, write_fd_ will be
+ // indirectly closed from the destructor of UnitTestImpl, causing double
+ // close if it is also closed here. On debug configurations, double close
+ // may assert. As there are no in-process buffers to flush here, we are
+ // relying on the OS to close the descriptor after the process terminates
+ // when the destructors are not run.
+ _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Returns an indented copy of stderr output for a death test.
+// This makes distinguishing death test output lines from regular log lines
+// much easier.
+static ::std::string FormatDeathTestOutput(const ::std::string& output) {
+ ::std::string ret;
+ for (size_t at = 0; ; ) {
+ const size_t line_end = output.find('\n', at);
+ ret += "[ DEATH ] ";
+ if (line_end == ::std::string::npos) {
+ ret += output.substr(at);
+ break;
+ }
+ ret += output.substr(at, line_end + 1 - at);
+ at = line_end + 1;
+ }
+ return ret;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+// outcome: An enumeration describing how the death test
+// concluded: DIED, LIVED, THREW, or RETURNED. The death test
+// fails in the latter three cases.
+// status: The exit status of the child process. On *nix, it is in the
+// in the format specified by wait(2). On Windows, this is the
+// value supplied to the ExitProcess() API or a numeric code
+// of the exception that terminated the program.
+// regex: A regular expression object to be applied to
+// the test's captured standard error output; the death test
+// fails if it does not match.
+//
+// Argument:
+// status_ok: true if exit_status is acceptable in the context of
+// this particular death test, which fails if it is false
+//
+// Returns true iff all of the above conditions are met. Otherwise, the
+// first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+ if (!spawned())
+ return false;
+
+ const std::string error_message = GetCapturedStderr();
+
+ bool success = false;
+ Message buffer;
+
+ buffer << "Death test: " << statement() << "\n";
+ switch (outcome()) {
+ case LIVED:
+ buffer << " Result: failed to die.\n"
+ << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ break;
+ case THREW:
+ buffer << " Result: threw an exception.\n"
+ << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ break;
+ case RETURNED:
+ buffer << " Result: illegal return in test statement.\n"
+ << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ break;
+ case DIED:
+ if (status_ok) {
+ const bool matched = RE::PartialMatch(error_message.c_str(), *regex());
+ if (matched) {
+ success = true;
+ } else {
+ buffer << " Result: died but not with expected error.\n"
+ << " Expected: " << regex()->pattern() << "\n"
+ << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+ }
+ } else {
+ buffer << " Result: died but not with expected exit code:\n"
+ << " " << ExitSummary(status()) << "\n"
+ << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+ }
+ break;
+ case IN_PROGRESS:
+ default:
+ GTEST_LOG_(FATAL)
+ << "DeathTest::Passed somehow called before conclusion of test";
+ }
+
+ DeathTest::set_last_death_test_message(buffer.GetString());
+ return success;
+}
+
+# if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes: Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+// ends of it.
+// 2. The parent starts the child and provides it with the information
+// necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+// using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+// this is done before step 3, the object's reference count goes down to
+// 0 and it is destroyed, preventing the child from acquiring it. The
+// parent now has to release it, or read operations on the read end of
+// the pipe will not return when the child terminates.
+// 5. The parent reads the child's output (outcome code and any possible
+// error messages) from the pipe and the child's stderr, and then
+// determines whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+ WindowsDeathTest(const char* a_statement,
+ const RE* a_regex,
+ const char* file,
+ int line)
+ : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {}
+
+ // All of these virtual functions are inherited from DeathTest.
+ virtual int Wait();
+ virtual TestRole AssumeRole();
+
+ private:
+ // The name of the file in which the death test is located.
+ const char* const file_;
+ // The line number on which the death test is located.
+ const int line_;
+ // Handle to the write end of the pipe to the child process.
+ AutoHandle write_handle_;
+ // Child process handle.
+ AutoHandle child_handle_;
+ // Event the child process uses to signal the parent that it has
+ // acquired the handle to the write end of the pipe. After seeing this
+ // event the parent can release its own handles to make sure its
+ // ReadFile() calls return when the child terminates.
+ AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+ if (!spawned())
+ return 0;
+
+ // Wait until the child either signals that it has acquired the write end
+ // of the pipe or it dies.
+ const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+ switch (::WaitForMultipleObjects(2,
+ wait_handles,
+ FALSE, // Waits for any of the handles.
+ INFINITE)) {
+ case WAIT_OBJECT_0:
+ case WAIT_OBJECT_0 + 1:
+ break;
+ default:
+ GTEST_DEATH_TEST_CHECK_(false); // Should not get here.
+ }
+
+ // The child has acquired the write end of the pipe or exited.
+ // We release the handle on our side and continue.
+ write_handle_.Reset();
+ event_handle_.Reset();
+
+ ReadAndInterpretStatusByte();
+
+ // Waits for the child process to exit if it hasn't already. This
+ // returns immediately if the child has already exited, regardless of
+ // whether previous calls to WaitForMultipleObjects synchronized on this
+ // handle or not.
+ GTEST_DEATH_TEST_CHECK_(
+ WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
+ INFINITE));
+ DWORD status_code;
+ GTEST_DEATH_TEST_CHECK_(
+ ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+ child_handle_.Reset();
+ set_status(static_cast<int>(status_code));
+ return status();
+}
+
+// The AssumeRole process for a Windows death test. It creates a child
+// process with the same executable as the current process to run the
+// death test. The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+ const UnitTestImpl* const impl = GetUnitTestImpl();
+ const InternalRunDeathTestFlag* const flag =
+ impl->internal_run_death_test_flag();
+ const TestInfo* const info = impl->current_test_info();
+ const int death_test_index = info->result()->death_test_count();
+
+ if (flag != NULL) {
+ // ParseInternalRunDeathTestFlag() has performed all the necessary
+ // processing.
+ set_write_fd(flag->write_fd());
+ return EXECUTE_TEST;
+ }
+
+ // WindowsDeathTest uses an anonymous pipe to communicate results of
+ // a death test.
+ SECURITY_ATTRIBUTES handles_are_inheritable = {
+ sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
+ HANDLE read_handle, write_handle;
+ GTEST_DEATH_TEST_CHECK_(
+ ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
+ 0) // Default buffer size.
+ != FALSE);
+ set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
+ O_RDONLY));
+ write_handle_.Reset(write_handle);
+ event_handle_.Reset(::CreateEvent(
+ &handles_are_inheritable,
+ TRUE, // The event is created as a manual-reset event.
+ FALSE, // The initial state is non-signaled.
+ NULL)); // The event is unnamed.
+ GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL);
+ const std::string filter_flag =
+ std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" +
+ info->test_case_name() + "." + info->name();
+ const std::string internal_flag =
+ std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
+ "=" + file_ + "|" + StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index) + "|" +
+ StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+ // size_t has the same width as pointers on both 32-bit and 64-bit
+ // Windows platforms.
+ // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+ "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
+ "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+ char executable_path[_MAX_PATH + 1]; // NOLINT
+ GTEST_DEATH_TEST_CHECK_(
+ _MAX_PATH + 1 != ::GetModuleFileNameA(NULL,
+ executable_path,
+ _MAX_PATH));
+
+ std::string command_line =
+ std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
+ internal_flag + "\"";
+
+ DeathTest::set_last_death_test_message("");
+
+ CaptureStderr();
+ // Flush the log buffers since the log streams are shared with the child.
+ FlushInfoLog();
+
+ // The child process will share the standard handles with the parent.
+ STARTUPINFOA startup_info;
+ memset(&startup_info, 0, sizeof(STARTUPINFO));
+ startup_info.dwFlags = STARTF_USESTDHANDLES;
+ startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+ startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+ startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+ PROCESS_INFORMATION process_info;
+ GTEST_DEATH_TEST_CHECK_(::CreateProcessA(
+ executable_path,
+ const_cast<char*>(command_line.c_str()),
+ NULL, // Returned process handle is not inheritable.
+ NULL, // Returned thread handle is not inheritable.
+ TRUE, // Child inherits all inheritable handles (for write_handle_).
+ 0x0, // Default creation flags.
+ NULL, // Inherit the parent's environment.
+ UnitTest::GetInstance()->original_working_dir(),
+ &startup_info,
+ &process_info) != FALSE);
+ child_handle_.Reset(process_info.hProcess);
+ ::CloseHandle(process_info.hThread);
+ set_spawned(true);
+ return OVERSEE_TEST;
+}
+# else // We are not on Windows.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface. Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+ ForkingDeathTest(const char* statement, const RE* regex);
+
+ // All of these virtual functions are inherited from DeathTest.
+ virtual int Wait();
+
+ protected:
+ void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+ // PID of child process during death test; 0 in the child process itself.
+ pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest.
+ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex)
+ : DeathTestImpl(a_statement, a_regex),
+ child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int ForkingDeathTest::Wait() {
+ if (!spawned())
+ return 0;
+
+ ReadAndInterpretStatusByte();
+
+ int status_value;
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+ set_status(status_value);
+ return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+ NoExecDeathTest(const char* a_statement, const RE* a_regex) :
+ ForkingDeathTest(a_statement, a_regex) { }
+ virtual TestRole AssumeRole();
+};
+
+// The AssumeRole process for a fork-and-run death test. It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+ const size_t thread_count = GetThreadCount();
+ if (thread_count != 1) {
+ GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+ }
+
+ int pipe_fd[2];
+ GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+ DeathTest::set_last_death_test_message("");
+ CaptureStderr();
+ // When we fork the process below, the log file buffers are copied, but the
+ // file descriptors are shared. We flush all log files here so that closing
+ // the file descriptors in the child process doesn't throw off the
+ // synchronization between descriptors and buffers in the parent process.
+ // This is as close to the fork as possible to avoid a race condition in case
+ // there are multiple threads running before the death test, and another
+ // thread writes to the log file.
+ FlushInfoLog();
+
+ const pid_t child_pid = fork();
+ GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+ set_child_pid(child_pid);
+ if (child_pid == 0) {
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+ set_write_fd(pipe_fd[1]);
+ // Redirects all logging to stderr in the child process to prevent
+ // concurrent writes to the log files. We capture stderr in the parent
+ // process and append the child process' output to a log.
+ LogToStderr();
+ // Event forwarding to the listeners of the event listener API must be
+ // shut down in death test subprocesses.
+ GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+ g_in_fast_death_test_child = true;
+ return EXECUTE_TEST;
+ } else {
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+ set_read_fd(pipe_fd[0]);
+ set_spawned(true);
+ return OVERSEE_TEST;
+ }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+ ExecDeathTest(const char* a_statement, const RE* a_regex,
+ const char* file, int line) :
+ ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { }
+ virtual TestRole AssumeRole();
+ private:
+ static ::std::vector<testing::internal::string>
+ GetArgvsForDeathTestChildProcess() {
+ ::std::vector<testing::internal::string> args = GetInjectableArgvs();
+# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+ ::std::vector<testing::internal::string> extra_args =
+ GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
+ args.insert(args.end(), extra_args.begin(), extra_args.end());
+# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+ return args;
+ }
+ // The name of the file in which the death test is located.
+ const char* const file_;
+ // The line number on which the death test is located.
+ const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+ Arguments() {
+ args_.push_back(NULL);
+ }
+
+ ~Arguments() {
+ for (std::vector<char*>::iterator i = args_.begin(); i != args_.end();
+ ++i) {
+ free(*i);
+ }
+ }
+ void AddArgument(const char* argument) {
+ args_.insert(args_.end() - 1, posix::StrDup(argument));
+ }
+
+ template <typename Str>
+ void AddArguments(const ::std::vector<Str>& arguments) {
+ for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+ i != arguments.end();
+ ++i) {
+ args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+ }
+ }
+ char* const* Argv() {
+ return &args_[0];
+ }
+
+ private:
+ std::vector<char*> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.
+struct ExecDeathTestArgs {
+ char* const* argv; // Command-line arguments for the child's call to exec
+ int close_fd; // File descriptor to close; the read end of a pipe
+};
+
+# if GTEST_OS_MAC
+inline char** GetEnviron() {
+ // When Google Test is built as a framework on MacOS X, the environ variable
+ // is unavailable. Apple's documentation (man environ) recommends using
+ // _NSGetEnviron() instead.
+ return *_NSGetEnviron();
+}
+# else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char** environ;
+inline char** GetEnviron() { return environ; }
+# endif // GTEST_OS_MAC
+
+# if !GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void* child_arg) {
+ ExecDeathTestArgs* const args = static_cast<ExecDeathTestArgs*>(child_arg);
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+ // We need to execute the test program in the same environment where
+ // it was originally invoked. Therefore we change to the original
+ // working directory first.
+ const char* const original_dir =
+ UnitTest::GetInstance()->original_working_dir();
+ // We can safely call chdir() as it's a direct system call.
+ if (chdir(original_dir) != 0) {
+ DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+ GetLastErrnoDescription());
+ return EXIT_FAILURE;
+ }
+
+ // We can safely call execve() as it's a direct system call. We
+ // cannot use execvp() as it's a libc function and thus potentially
+ // unsafe. Since execve() doesn't search the PATH, the user must
+ // invoke the test program via a valid path that contains at least
+ // one path separator.
+ execve(args->argv[0], args->argv, GetEnviron());
+ DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+ original_dir + " failed: " +
+ GetLastErrnoDescription());
+ return EXIT_FAILURE;
+}
+# endif // !GTEST_OS_QNX
+
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give the
+// correct answer.
+void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_;
+void StackLowerThanAddress(const void* ptr, bool* result) {
+ int dummy;
+ *result = (&dummy < ptr);
+}
+
+// Make sure AddressSanitizer does not tamper with the stack here.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+bool StackGrowsDown() {
+ int dummy;
+ bool result;
+ StackLowerThanAddress(&dummy, &result);
+ return result;
+}
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test. The
+// implementation uses fork(2) + exec. On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe. On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead. The function dies with an error message if
+// anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
+ ExecDeathTestArgs args = { argv, close_fd };
+ pid_t child_pid = -1;
+
+# if GTEST_OS_QNX
+ // Obtains the current directory and sets it to be closed in the child
+ // process.
+ const int cwd_fd = open(".", O_RDONLY);
+ GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+ // We need to execute the test program in the same environment where
+ // it was originally invoked. Therefore we change to the original
+ // working directory first.
+ const char* const original_dir =
+ UnitTest::GetInstance()->original_working_dir();
+ // We can safely call chdir() as it's a direct system call.
+ if (chdir(original_dir) != 0) {
+ DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
+ GetLastErrnoDescription());
+ return EXIT_FAILURE;
+ }
+
+ int fd_flags;
+ // Set close_fd to be closed after spawn.
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
+ fd_flags | FD_CLOEXEC));
+ struct inheritance inherit = {0};
+ // spawn is a system call.
+ child_pid = spawn(args.argv[0], 0, NULL, &inherit, args.argv, GetEnviron());
+ // Restores the current working directory.
+ GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+# else // GTEST_OS_QNX
+# if GTEST_OS_LINUX
+ // When a SIGPROF signal is received while fork() or clone() are executing,
+ // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+ // it after the call to fork()/clone() is complete.
+ struct sigaction saved_sigprof_action;
+ struct sigaction ignore_sigprof_action;
+ memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+ sigemptyset(&ignore_sigprof_action.sa_mask);
+ ignore_sigprof_action.sa_handler = SIG_IGN;
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
+ SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+# endif // GTEST_OS_LINUX
+
+# if GTEST_HAS_CLONE
+ const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+ if (!use_fork) {
+ static const bool stack_grows_down = StackGrowsDown();
+ const size_t stack_size = getpagesize();
+ // MAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+ void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_PRIVATE, -1, 0);
+ GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+ // Maximum stack alignment in bytes: For a downward-growing stack, this
+ // amount is subtracted from the size of the stack space to get an address
+ // that is within the stack space and is aligned on all systems we care
+ // about. As far as I know there is no ABI with stack alignment greater
+ // than 64. We assume stack and stack_size already have alignment of
+ // kMaxStackAlignment.
+ const size_t kMaxStackAlignment = 64;
+ void* const stack_top =
+ static_cast<char*>(stack) +
+ (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+ GTEST_DEATH_TEST_CHECK_(stack_size > kMaxStackAlignment &&
+ reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+ child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+ GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+ }
+# else
+ const bool use_fork = true;
+# endif // GTEST_HAS_CLONE
+
+ if (use_fork && (child_pid = fork()) == 0) {
+ ExecDeathTestChildMain(&args);
+ _exit(0);
+ }
+# endif // GTEST_OS_QNX
+# if GTEST_OS_LINUX
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ sigaction(SIGPROF, &saved_sigprof_action, NULL));
+# endif // GTEST_OS_LINUX
+
+ GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+ return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test. It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+ const UnitTestImpl* const impl = GetUnitTestImpl();
+ const InternalRunDeathTestFlag* const flag =
+ impl->internal_run_death_test_flag();
+ const TestInfo* const info = impl->current_test_info();
+ const int death_test_index = info->result()->death_test_count();
+
+ if (flag != NULL) {
+ set_write_fd(flag->write_fd());
+ return EXECUTE_TEST;
+ }
+
+ int pipe_fd[2];
+ GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+ // Clear the close-on-exec flag on the write end of the pipe, lest
+ // it be closed when the child process does an exec:
+ GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+ const std::string filter_flag =
+ std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "="
+ + info->test_case_name() + "." + info->name();
+ const std::string internal_flag =
+ std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
+ + file_ + "|" + StreamableToString(line_) + "|"
+ + StreamableToString(death_test_index) + "|"
+ + StreamableToString(pipe_fd[1]);
+ Arguments args;
+ args.AddArguments(GetArgvsForDeathTestChildProcess());
+ args.AddArgument(filter_flag.c_str());
+ args.AddArgument(internal_flag.c_str());
+
+ DeathTest::set_last_death_test_message("");
+
+ CaptureStderr();
+ // See the comment in NoExecDeathTest::AssumeRole for why the next line
+ // is necessary.
+ FlushInfoLog();
+
+ const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+ set_child_pid(child_pid);
+ set_read_fd(pipe_fd[0]);
+ set_spawned(true);
+ return OVERSEE_TEST;
+}
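+// For illustration (hypothetical file name, line, index and fd values): a
+// death test MyDeathTest.Bar defined at foo_test.cc:123 with death test
+// index 1 and the pipe's write end on fd 7 re-executes the test binary
+// roughly as:
+//   <test-binary> --gtest_filter=MyDeathTest.Bar
+//       --gtest_internal_run_death_test=foo_test.cc|123|1|7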
+
+# endif // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address. If the test should be
+// skipped, sets that pointer to NULL. Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex,
+ const char* file, int line,
+ DeathTest** test) {
+ UnitTestImpl* const impl = GetUnitTestImpl();
+ const InternalRunDeathTestFlag* const flag =
+ impl->internal_run_death_test_flag();
+ const int death_test_index = impl->current_test_info()
+ ->increment_death_test_count();
+
+ if (flag != NULL) {
+ if (death_test_index > flag->index()) {
+ DeathTest::set_last_death_test_message(
+ "Death test count (" + StreamableToString(death_test_index)
+ + ") somehow exceeded expected maximum ("
+ + StreamableToString(flag->index()) + ")");
+ return false;
+ }
+
+ if (!(flag->file() == file && flag->line() == line &&
+ flag->index() == death_test_index)) {
+ *test = NULL;
+ return true;
+ }
+ }
+
+# if GTEST_OS_WINDOWS
+
+ if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+ GTEST_FLAG(death_test_style) == "fast") {
+ *test = new WindowsDeathTest(statement, regex, file, line);
+ }
+
+# else
+
+ if (GTEST_FLAG(death_test_style) == "threadsafe") {
+ *test = new ExecDeathTest(statement, regex, file, line);
+ } else if (GTEST_FLAG(death_test_style) == "fast") {
+ *test = new NoExecDeathTest(statement, regex);
+ }
+
+# endif // GTEST_OS_WINDOWS
+
+ else { // NOLINT - this is more readable than unbalanced brackets inside #if.
+ DeathTest::set_last_death_test_message(
+ "Unknown death test style \"" + GTEST_FLAG(death_test_style)
+ + "\" encountered");
+ return false;
+ }
+
+ return true;
+}
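+// Usage sketch: the style is normally chosen by the user, either on the
+// command line of the (hypothetical) test binary,
+//   ./foo_test --gtest_death_test_style=threadsafe
+// or in code before RUN_ALL_TESTS(),
+//   testing::FLAGS_gtest_death_test_style = "threadsafe";
+// Any value other than "fast" or "threadsafe" makes Create() report an error.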
+
+# if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+int GetStatusFileDescriptor(unsigned int parent_process_id,
+ size_t write_handle_as_size_t,
+ size_t event_handle_as_size_t) {
+ AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+ FALSE, // Non-inheritable.
+ parent_process_id));
+ if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+ DeathTestAbort("Unable to open parent process " +
+ StreamableToString(parent_process_id));
+ }
+
+ // TODO(vladl@google.com): Replace the following check with a
+ // compile-time assertion when available.
+ GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+ const HANDLE write_handle =
+ reinterpret_cast<HANDLE>(write_handle_as_size_t);
+ HANDLE dup_write_handle;
+
+ // The newly initialized handle is accessible only in the parent
+ // process. To obtain one accessible within the child, we need to use
+ // DuplicateHandle.
+ if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+ ::GetCurrentProcess(), &dup_write_handle,
+ 0x0, // Requested privileges ignored since
+ // DUPLICATE_SAME_ACCESS is used.
+ FALSE, // Request a non-inheritable handle.
+ DUPLICATE_SAME_ACCESS)) {
+ DeathTestAbort("Unable to duplicate the pipe handle " +
+ StreamableToString(write_handle_as_size_t) +
+ " from the parent process " +
+ StreamableToString(parent_process_id));
+ }
+
+ const HANDLE event_handle = reinterpret_cast<HANDLE>(event_handle_as_size_t);
+ HANDLE dup_event_handle;
+
+ if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+ ::GetCurrentProcess(), &dup_event_handle,
+ 0x0,
+ FALSE,
+ DUPLICATE_SAME_ACCESS)) {
+ DeathTestAbort("Unable to duplicate the event handle " +
+ StreamableToString(event_handle_as_size_t) +
+ " from the parent process " +
+ StreamableToString(parent_process_id));
+ }
+
+ const int write_fd =
+ ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+ if (write_fd == -1) {
+ DeathTestAbort("Unable to convert pipe handle " +
+ StreamableToString(write_handle_as_size_t) +
+ " to a file descriptor");
+ }
+
+ // Signals the parent that the write end of the pipe has been acquired
+ // so the parent can release its own write end.
+ ::SetEvent(dup_event_handle);
+
+ return write_fd;
+}
+# endif // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
+ if (GTEST_FLAG(internal_run_death_test) == "") return NULL;
+
+ // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+ // can use it here.
+ int line = -1;
+ int index = -1;
+ ::std::vector< ::std::string> fields;
+ SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+ int write_fd = -1;
+
+# if GTEST_OS_WINDOWS
+
+ unsigned int parent_process_id = 0;
+ size_t write_handle_as_size_t = 0;
+ size_t event_handle_as_size_t = 0;
+
+ if (fields.size() != 6
+ || !ParseNaturalNumber(fields[1], &line)
+ || !ParseNaturalNumber(fields[2], &index)
+ || !ParseNaturalNumber(fields[3], &parent_process_id)
+ || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
+ || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG(internal_run_death_test));
+ }
+ write_fd = GetStatusFileDescriptor(parent_process_id,
+ write_handle_as_size_t,
+ event_handle_as_size_t);
+# else
+
+ if (fields.size() != 4
+ || !ParseNaturalNumber(fields[1], &line)
+ || !ParseNaturalNumber(fields[2], &index)
+ || !ParseNaturalNumber(fields[3], &write_fd)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
+ + GTEST_FLAG(internal_run_death_test));
+ }
+
+# endif // GTEST_OS_WINDOWS
+
+ return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
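+// Flag layout for reference (values hypothetical):
+//   POSIX:   --gtest_internal_run_death_test=foo_test.cc|123|1|7
+//            i.e. file|line|death-test-index|write-fd
+//   Windows: --gtest_internal_run_death_test=foo_test.cc|123|1|2040|328|332
+//            i.e. file|line|index|parent-process-id|write-handle|event-handle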
+
+} // namespace internal
+
+#endif // GTEST_HAS_DEATH_TEST
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc
new file mode 100644
index 000000000..0292dc119
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-filepath.cc
@@ -0,0 +1,387 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Authors: keith.ray@gmail.com (Keith Ray)
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-filepath.h"
+#include "gtest/internal/gtest-port.h"
+
+#include <stdlib.h>
+
+#if GTEST_OS_WINDOWS_MOBILE
+# include <windows.h>
+#elif GTEST_OS_WINDOWS
+# include <direct.h>
+# include <io.h>
+#elif GTEST_OS_SYMBIAN
+// Symbian OpenC has PATH_MAX in sys/syslimits.h
+# include <sys/syslimits.h>
+#else
+# include <limits.h>
+# include <climits> // Some Linux distributions define PATH_MAX here.
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+#if GTEST_OS_WINDOWS
+# define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+# define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif // GTEST_OS_WINDOWS
+
+#include "gtest/internal/gtest-string.h"
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator. Unless otherwise
+// noted, a file path can contain either kind of path separators, or a mixture
+// of them.
+const char kPathSeparator = '\\';
+const char kAlternatePathSeparator = '/';
+const char kAlternatePathSeparatorString[] = "/";
+# if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory. You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+# else
+const char kCurrentDirectoryString[] = ".\\";
+# endif // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';
+const char kCurrentDirectoryString[] = "./";
+#endif // GTEST_OS_WINDOWS
+
+// Returns whether the given character is a valid path separator.
+static bool IsPathSeparator(char c) {
+#if GTEST_HAS_ALT_PATH_SEP_
+ return (c == kPathSeparator) || (c == kAlternatePathSeparator);
+#else
+ return c == kPathSeparator;
+#endif
+}
+
+// Returns the current working directory, or "" if unsuccessful.
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+ // Windows CE doesn't have a current directory, so we just return
+ // something reasonable.
+ return FilePath(kCurrentDirectoryString);
+#elif GTEST_OS_WINDOWS
+ char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+ return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd);
+#else
+ char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+ char* result = getcwd(cwd, sizeof(cwd));
+# if GTEST_OS_NACL
+ // getcwd will likely fail in NaCl due to the sandbox, so return something
+ // reasonable. The user may have provided a shim implementation for getcwd,
+ // however, so fall back only when failure is detected.
+ return FilePath(result == NULL ? kCurrentDirectoryString : cwd);
+# endif // GTEST_OS_NACL
+ return FilePath(result == NULL ? "" : cwd);
+#endif // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns a copy of the FilePath with the case-insensitive extension removed.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+// FilePath("dir/file"). If a case-insensitive extension is not
+// found, returns a copy of the original FilePath.
+FilePath FilePath::RemoveExtension(const char* extension) const {
+ const std::string dot_extension = std::string(".") + extension;
+ if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
+ return FilePath(pathname_.substr(
+ 0, pathname_.length() - dot_extension.length()));
+ }
+ return *this;
+}
+
+// Returns a pointer to the last occurrence of a valid path separator in
+// the FilePath. On Windows, for example, both '/' and '\' are valid path
+// separators. Returns NULL if no path separator was found.
+const char* FilePath::FindLastPathSeparator() const {
+ const char* const last_sep = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+ const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
+ // Comparing two pointers of which only one is NULL is undefined.
+ if (last_alt_sep != NULL &&
+ (last_sep == NULL || last_alt_sep > last_sep)) {
+ return last_alt_sep;
+ }
+#endif
+ return last_sep;
+}
+
+// Returns a copy of the FilePath with the directory part removed.
+// Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file"). If there is no directory part ("just_a_file"), it returns
+// the FilePath unmodified. If there is no file part ("just_a_dir/") it
+// returns an empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+ const char* const last_sep = FindLastPathSeparator();
+ return last_sep ? FilePath(last_sep + 1) : *this;
+}
+
+// RemoveFileName returns the directory path with the filename removed.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does
+// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+ const char* const last_sep = FindLastPathSeparator();
+ std::string dir;
+ if (last_sep) {
+ dir = std::string(c_str(), last_sep + 1 - c_str());
+ } else {
+ dir = kCurrentDirectoryString;
+ }
+ return FilePath(dir);
+}
+
+// Helper functions for naming files in a directory for xml output.
+
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml". If number is greater
+// than zero (e.g., 12), returns "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath& directory,
+ const FilePath& base_name,
+ int number,
+ const char* extension) {
+ std::string file;
+ if (number == 0) {
+ file = base_name.string() + "." + extension;
+ } else {
+ file = base_name.string() + "_" + StreamableToString(number)
+ + "." + extension;
+ }
+ return ConcatPaths(directory, FilePath(file));
+}
+
+// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath& directory,
+ const FilePath& relative_path) {
+ if (directory.IsEmpty())
+ return relative_path;
+ const FilePath dir(directory.RemoveTrailingPathSeparator());
+ return FilePath(dir.string() + kPathSeparator + relative_path.string());
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+ LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+ const DWORD attributes = GetFileAttributes(unicode);
+ delete [] unicode;
+ return attributes != kInvalidFileAttributes;
+#else
+ posix::StatStruct file_stat;
+ return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+ bool result = false;
+#if GTEST_OS_WINDOWS
+ // Don't strip off trailing separator if path is a root directory on
+ // Windows (like "C:\\").
+ const FilePath& path(IsRootDirectory() ? *this :
+ RemoveTrailingPathSeparator());
+#else
+ const FilePath& path(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+ LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+ const DWORD attributes = GetFileAttributes(unicode);
+ delete [] unicode;
+ if ((attributes != kInvalidFileAttributes) &&
+ (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ result = true;
+ }
+#else
+ posix::StatStruct file_stat;
+ result = posix::Stat(path.c_str(), &file_stat) == 0 &&
+ posix::IsDir(file_stat);
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+ return result;
+}
+
+// Returns true if pathname describes a root directory. (Windows has one
+// root directory per disk drive.)
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+ // TODO(wan@google.com): on Windows a network share like
+ // \\server\share can be a root directory, although it cannot be the
+ // current directory. Handle this properly.
+ return pathname_.length() == 3 && IsAbsolutePath();
+#else
+ return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Returns true if pathname describes an absolute path.
+bool FilePath::IsAbsolutePath() const {
+ const char* const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+ return pathname_.length() >= 3 &&
+ ((name[0] >= 'a' && name[0] <= 'z') ||
+ (name[0] >= 'A' && name[0] <= 'Z')) &&
+ name[1] == ':' &&
+ IsPathSeparator(name[2]);
+#else
+ return IsPathSeparator(name[0]);
+#endif
+}
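+// For example, on Windows "C:\\log" and "C:/log" are treated as absolute
+// while "log\\today" is not; on other platforms "/var/log" is absolute and
+// "var/log" is not.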
+
+// Returns a pathname for a file that does not currently exist. The pathname
+// will be directory/base_name.extension or
+// directory/base_name_<number>.extension if directory/base_name.extension
+// already exists. The number will be incremented until a pathname is found
+// that does not already exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath& directory,
+ const FilePath& base_name,
+ const char* extension) {
+ FilePath full_pathname;
+ int number = 0;
+ do {
+ full_pathname.Set(MakeFileName(directory, base_name, number++, extension));
+ } while (full_pathname.FileOrDirectoryExists());
+ return full_pathname;
+}
+
+// Returns true if FilePath ends with a path separator, which indicates that
+// it is intended to represent a directory. Returns false otherwise.
+// This does NOT check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+ return !pathname_.empty() &&
+ IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]);
+}
+
+// Create directories so that path exists. Returns true if successful or if
+// the directories already exist; returns false if unable to create directories
+// for any reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+ if (!this->IsDirectory()) {
+ return false;
+ }
+
+ if (pathname_.length() == 0 || this->DirectoryExists()) {
+ return true;
+ }
+
+ const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName());
+ return parent.CreateDirectoriesRecursively() && this->CreateFolder();
+}
+
+// Create the directory so that path exists. Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist. Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+ FilePath removed_sep(this->RemoveTrailingPathSeparator());
+ LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+ int result = CreateDirectory(unicode, NULL) ? 0 : -1;
+ delete [] unicode;
+#elif GTEST_OS_WINDOWS
+ int result = _mkdir(pathname_.c_str());
+#else
+ int result = mkdir(pathname_.c_str(), 0777);
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+ if (result == -1) {
+ return this->DirectoryExists(); // An error is OK if the directory exists.
+ }
+ return true; // No error.
+}
+
+// If input name has a trailing separator character, remove it and return the
+// name, otherwise return the name string unmodified.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+ return IsDirectory()
+ ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+ : *this;
+}
+
+// Removes any redundant separators that might be in the pathname.
+// For example, "bar///foo" becomes "bar/foo". Does not eliminate other
+// redundancies that might be in a pathname involving "." or "..".
+// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share).
+void FilePath::Normalize() {
+ if (pathname_.c_str() == NULL) {
+ pathname_ = "";
+ return;
+ }
+ const char* src = pathname_.c_str();
+ char* const dest = new char[pathname_.length() + 1];
+ char* dest_ptr = dest;
+ memset(dest_ptr, 0, pathname_.length() + 1);
+
+ while (*src != '\0') {
+ *dest_ptr = *src;
+ if (!IsPathSeparator(*src)) {
+ src++;
+ } else {
+#if GTEST_HAS_ALT_PATH_SEP_
+ if (*dest_ptr == kAlternatePathSeparator) {
+ *dest_ptr = kPathSeparator;
+ }
+#endif
+ while (IsPathSeparator(*src))
+ src++;
+ }
+ dest_ptr++;
+ }
+ *dest_ptr = '\0';
+ pathname_ = dest;
+ delete[] dest;
+}
+
+} // namespace internal
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h b/third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h
new file mode 100644
index 000000000..ed8a682a9
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-internal-inl.h
@@ -0,0 +1,1183 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// This file contains purely Google Test's internal implementation. Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is
+// part of Google Test's implementation; otherwise it's undefined.
+#if !GTEST_IMPLEMENTATION_
+// If this file is included from the user's code, just say no.
+# error "gtest-internal-inl.h is part of Google Test's internal implementation."
+# error "It must not be included except by Google Test itself."
+#endif // GTEST_IMPLEMENTATION_
+
+#ifndef _WIN32_WCE
+# include <errno.h>
+#endif // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h> // For strtoll/_strtoul64/malloc/free.
+#include <string.h> // For memmove.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h> // NOLINT
+# include <netdb.h> // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+# include <windows.h> // NOLINT
+#endif // GTEST_OS_WINDOWS
+
+#include "gtest/gtest.h" // NOLINT
+#include "gtest/gtest-spi.h"
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it. Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library. This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+const char kFlagfileFlag[] = "flagfile";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true iff Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information. N.B.: due to the use of the
+// non-reentrant localtime() function, this function is not thread-safe. Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(
+ const char* str, const char* flag, Int32* value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(Int32 random_seed_flag) {
+ const unsigned int raw_seed = (random_seed_flag == 0) ?
+ static_cast<unsigned int>(GetTimeInMillis()) :
+ static_cast<unsigned int>(random_seed_flag);
+
+ // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+ // it's easy to type.
+ const int normalized_seed =
+ static_cast<int>((raw_seed - 1U) %
+ static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+ return normalized_seed;
+}
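+// For example, a flag value of 0 produces a time-based seed (normalized the
+// same way), while explicit values are folded into [1, kMaxRandomSeed]:
+// 1 stays 1, 99999 stays 99999, and 100000 wraps around to 1.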
+
+// Returns the first valid random seed after 'seed'. The behavior is
+// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+ GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+ << "Invalid random seed " << seed << " - must be in [1, "
+ << kMaxRandomSeed << "].";
+ const int next_seed = seed + 1;
+ return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+ // The c'tor.
+ GTestFlagSaver() {
+ also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
+ break_on_failure_ = GTEST_FLAG(break_on_failure);
+ catch_exceptions_ = GTEST_FLAG(catch_exceptions);
+ color_ = GTEST_FLAG(color);
+ death_test_style_ = GTEST_FLAG(death_test_style);
+ death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+ filter_ = GTEST_FLAG(filter);
+ internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
+ list_tests_ = GTEST_FLAG(list_tests);
+ output_ = GTEST_FLAG(output);
+ print_time_ = GTEST_FLAG(print_time);
+ random_seed_ = GTEST_FLAG(random_seed);
+ repeat_ = GTEST_FLAG(repeat);
+ shuffle_ = GTEST_FLAG(shuffle);
+ stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
+ stream_result_to_ = GTEST_FLAG(stream_result_to);
+ throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+ }
+
+ // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS.
+ ~GTestFlagSaver() {
+ GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+ GTEST_FLAG(break_on_failure) = break_on_failure_;
+ GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+ GTEST_FLAG(color) = color_;
+ GTEST_FLAG(death_test_style) = death_test_style_;
+ GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+ GTEST_FLAG(filter) = filter_;
+ GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+ GTEST_FLAG(list_tests) = list_tests_;
+ GTEST_FLAG(output) = output_;
+ GTEST_FLAG(print_time) = print_time_;
+ GTEST_FLAG(random_seed) = random_seed_;
+ GTEST_FLAG(repeat) = repeat_;
+ GTEST_FLAG(shuffle) = shuffle_;
+ GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+ GTEST_FLAG(stream_result_to) = stream_result_to_;
+ GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+ }
+
+ private:
+ // Fields for saving the original values of flags.
+ bool also_run_disabled_tests_;
+ bool break_on_failure_;
+ bool catch_exceptions_;
+ std::string color_;
+ std::string death_test_style_;
+ bool death_test_use_fork_;
+ std::string filter_;
+ std::string internal_run_death_test_;
+ bool list_tests_;
+ std::string output_;
+ bool print_time_;
+ internal::Int32 random_seed_;
+ internal::Int32 repeat_;
+ bool shuffle_;
+ internal::Int32 stack_trace_depth_;
+ std::string stream_result_to_;
+ bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
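+// Typical use is as a stack-allocated RAII guard (sketch):
+//   {
+//     GTestFlagSaver saver;
+//     GTEST_FLAG(repeat) = 10;   // temporarily override a flag
+//   }                            // original flag values restored here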
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(UInt32 code_point);
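+// For example, CodePointToUtf8(0x41) returns "A", and CodePointToUtf8(0x20AC)
+// returns "\xE2\x82\xAC" (U+20AC, the Euro sign).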
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+// UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from the Basic Multilingual Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+ const char* shard_index_str,
+ bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error and
+// aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+ int total_shards, int shard_index, int test_id);
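+// Illustration, assuming the modulo-based assignment used by the
+// implementation in gtest.cc (not shown in this header): with 3 total shards,
+// shard 1 runs the tests whose ids are 1, 4, 7, ...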
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+ // Implemented as an explicit loop since std::count_if() in libCstd on
+ // Solaris has a non-standard signature.
+ int count = 0;
+ for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+ if (predicate(*it))
+ ++count;
+ }
+ return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+ std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+ return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+ std::vector<E>* v) {
+ const int size = static_cast<int>(v->size());
+ GTEST_CHECK_(0 <= begin && begin <= size)
+ << "Invalid shuffle range start " << begin << ": must be in range [0, "
+ << size << "].";
+ GTEST_CHECK_(begin <= end && end <= size)
+ << "Invalid shuffle range finish " << end << ": must be in range ["
+ << begin << ", " << size << "].";
+
+ // Fisher-Yates shuffle, from
+ // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+ for (int range_width = end - begin; range_width >= 2; range_width--) {
+ const int last_in_range = begin + range_width - 1;
+ const int selected = begin + random->Generate(range_width);
+ std::swap((*v)[selected], (*v)[last_in_range]);
+ }
+}
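+// For example, ShuffleRange(random, 1, 4, &v) on a 5-element vector permutes
+// only v[1], v[2] and v[3]; v[0] and v[4] stay in place.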
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+ ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object. Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T* x) {
+ delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+ // Constructor.
+ //
+ // TestPropertyKeyIs has NO default constructor.
+ explicit TestPropertyKeyIs(const std::string& key) : key_(key) {}
+
+ // Returns true iff the key of the test property matches key_.
+ bool operator()(const TestProperty& test_property) const {
+ return test_property.key() == key_;
+ }
+
+ private:
+ std::string key_;
+};
+
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests. It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag. E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter. If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+ // Functions for processing the gtest_output flag.
+
+ // Returns the output format, or "" for normal printed output.
+ static std::string GetOutputFormat();
+
+ // Returns the absolute path of the requested output file, or the
+ // default (test_detail.xml in the original working directory) if
+ // none was explicitly specified.
+ static std::string GetAbsolutePathToOutputFile();
+
+ // Functions for processing the gtest_filter flag.
+
+ // Returns true iff the wildcard pattern matches the string. The
+ // first ':' or '\0' character in pattern marks the end of it.
+ //
+ // This recursive algorithm isn't very efficient, but is clear and
+ // works well enough for matching test names, which are short.
+ static bool PatternMatchesString(const char *pattern, const char *str);
+
+ // Returns true iff the user-specified filter matches the test case
+ // name and the test name.
+ static bool FilterMatchesTest(const std::string &test_case_name,
+ const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+ // Function for supporting the gtest_catch_exception flag.
+
+ // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+ // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+ // This function is useful as an __except condition.
+ static int GTestShouldProcessSEH(DWORD exception_code);
+#endif // GTEST_OS_WINDOWS
+
+ // Returns true if "name" matches the ':' separated list of glob-style
+ // filters in "filter".
+ static bool MatchesFilter(const std::string& name, const char* filter);
+};
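+// For illustration (matching is implemented in gtest.cc, not in this header):
+// MatchesFilter("FooTest.Bar", "FooTest.*") and
+// MatchesFilter("FooTest.Bar", "*.Bar:BazTest.*") return true, while
+// MatchesFilter("FooTest.Bar", "BazTest.*") returns false.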
+
+// Returns the current application's name, removing directory path if that
+// is present. Used by UnitTestOptions::GetAbsolutePathToOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+ OsStackTraceGetterInterface() {}
+ virtual ~OsStackTraceGetterInterface() {}
+
+ // Returns the current OS stack trace as an std::string. Parameters:
+ //
+ // max_depth - the maximum number of stack frames to be included
+ // in the trace.
+ // skip_count - the number of top frames to be skipped; doesn't count
+ // against max_depth.
+ virtual string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+ // UponLeavingGTest() should be called immediately before Google Test calls
+ // user code. It saves some information about the current stack that
+ // CurrentStackTrace() will use to find and hide Google Test stack frames.
+ virtual void UponLeavingGTest() = 0;
+
+ // This string is inserted in place of stack frames that are part of
+ // Google Test's implementation.
+ static const char* const kElidedFramesMarker;
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+ OsStackTraceGetter() {}
+
+ virtual string CurrentStackTrace(int max_depth, int skip_count);
+ virtual void UponLeavingGTest();
+
+ private:
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+ const char* file;
+ int line;
+ std::string message;
+};
+
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+ : public TestPartResultReporterInterface {
+ public:
+ explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
+ // Implements the TestPartResultReporterInterface. Reports the test part
+ // result in the current test.
+ virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+ UnitTestImpl* const unit_test_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl. This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+ : public TestPartResultReporterInterface {
+ public:
+ explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test);
+ // Implements the TestPartResultReporterInterface. The implementation just
+ // delegates to the current global test part result reporter of *unit_test_.
+ virtual void ReportTestPartResult(const TestPartResult& result);
+
+ private:
+ UnitTestImpl* const unit_test_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class. We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+ explicit UnitTestImpl(UnitTest* parent);
+ virtual ~UnitTestImpl();
+
+ // There are two different ways to register your own TestPartResultReporter.
+ // You can register your own reporter to listen either only for test results
+ // from the current thread or for results from all threads.
+ // By default, each per-thread test result reporter just passes a new
+ // TestPartResult to the global test result reporter, which registers the
+ // test part result for the currently running test.
+
+ // Returns the global test part result reporter.
+ TestPartResultReporterInterface* GetGlobalTestPartResultReporter();
+
+ // Sets the global test part result reporter.
+ void SetGlobalTestPartResultReporter(
+ TestPartResultReporterInterface* reporter);
+
+ // Returns the test part result reporter for the current thread.
+ TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread();
+
+ // Sets the test part result reporter for the current thread.
+ void SetTestPartResultReporterForCurrentThread(
+ TestPartResultReporterInterface* reporter);
+
+ // Gets the number of successful test cases.
+ int successful_test_case_count() const;
+
+ // Gets the number of failed test cases.
+ int failed_test_case_count() const;
+
+ // Gets the number of all test cases.
+ int total_test_case_count() const;
+
+ // Gets the number of all test cases that contain at least one test
+ // that should run.
+ int test_case_to_run_count() const;
+
+ // Gets the number of successful tests.
+ int successful_test_count() const;
+
+ // Gets the number of failed tests.
+ int failed_test_count() const;
+
+ // Gets the number of disabled tests that will be reported in the XML report.
+ int reportable_disabled_test_count() const;
+
+ // Gets the number of disabled tests.
+ int disabled_test_count() const;
+
+ // Gets the number of tests to be printed in the XML report.
+ int reportable_test_count() const;
+
+ // Gets the number of all tests.
+ int total_test_count() const;
+
+ // Gets the number of tests that should run.
+ int test_to_run_count() const;
+
+ // Gets the time of the test program start, in ms from the start of the
+ // UNIX epoch.
+ TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+ // Gets the elapsed time, in milliseconds.
+ TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+ // Returns true iff the unit test passed (i.e. all test cases passed).
+ bool Passed() const { return !Failed(); }
+
+ // Returns true iff the unit test failed (i.e. some test case failed
+ // or something outside of all tests failed).
+ bool Failed() const {
+ return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed();
+ }
+
+ // Gets the i-th test case among all the test cases. i can range from 0 to
+ // total_test_case_count() - 1. If i is not in that range, returns NULL.
+ const TestCase* GetTestCase(int i) const {
+ const int index = GetElementOr(test_case_indices_, i, -1);
+ return index < 0 ? NULL : test_cases_[index];
+ }
+
+ // Gets the i-th test case among all the test cases. i can range from 0 to
+ // total_test_case_count() - 1. If i is not in that range, returns NULL.
+ TestCase* GetMutableTestCase(int i) {
+ const int index = GetElementOr(test_case_indices_, i, -1);
+ return index < 0 ? NULL : test_cases_[index];
+ }
+
+ // Provides access to the event listener list.
+ TestEventListeners* listeners() { return &listeners_; }
+
+ // Returns the TestResult for the test that's currently running, or
+ // the TestResult for the ad hoc test if no test is running.
+ TestResult* current_test_result();
+
+ // Returns the TestResult for the ad hoc test.
+ const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+ // Sets the OS stack trace getter.
+ //
+ // Does nothing if the input and the current OS stack trace getter
+ // are the same; otherwise, deletes the old getter and makes the
+ // input the current getter.
+ void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter);
+
+ // Returns the current OS stack trace getter if it is not NULL;
+ // otherwise, creates an OsStackTraceGetter, makes it the current
+ // getter, and returns it.
+ OsStackTraceGetterInterface* os_stack_trace_getter();
+
+ // Returns the current OS stack trace as an std::string.
+ //
+ // The maximum number of stack frames to be included is specified by
+ // the gtest_stack_trace_depth flag. The skip_count parameter
+ // specifies the number of top frames to be skipped, which doesn't
+ // count against the number of frames to be included.
+ //
+ // For example, if Foo() calls Bar(), which in turn calls
+ // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+ // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+ std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+ // Finds and returns a TestCase with the given name. If one doesn't
+ // exist, creates one and returns it.
+ //
+ // Arguments:
+ //
+ // test_case_name: name of the test case
+ // type_param: the name of the test's type parameter, or NULL if
+ // this is not a typed or a type-parameterized test.
+ // set_up_tc: pointer to the function that sets up the test case
+ // tear_down_tc: pointer to the function that tears down the test case
+ TestCase* GetTestCase(const char* test_case_name,
+ const char* type_param,
+ Test::SetUpTestCaseFunc set_up_tc,
+ Test::TearDownTestCaseFunc tear_down_tc);
+
+ // Adds a TestInfo to the unit test.
+ //
+ // Arguments:
+ //
+ // set_up_tc: pointer to the function that sets up the test case
+ // tear_down_tc: pointer to the function that tears down the test case
+ // test_info: the TestInfo object
+ void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc,
+ Test::TearDownTestCaseFunc tear_down_tc,
+ TestInfo* test_info) {
+ // In order to support thread-safe death tests, we need to
+ // remember the original working directory when the test program
+ // was first invoked. We cannot do this in RUN_ALL_TESTS(), as
+ // the user may have changed the current directory before calling
+ // RUN_ALL_TESTS(). Therefore we capture the current directory in
+ // AddTestInfo(), which is called to register a TEST or TEST_F
+ // before main() is reached.
+ if (original_working_dir_.IsEmpty()) {
+ original_working_dir_.Set(FilePath::GetCurrentDir());
+ GTEST_CHECK_(!original_working_dir_.IsEmpty())
+ << "Failed to get the current working directory.";
+ }
+
+ GetTestCase(test_info->test_case_name(),
+ test_info->type_param(),
+ set_up_tc,
+ tear_down_tc)->AddTestInfo(test_info);
+ }
+
+#if GTEST_HAS_PARAM_TEST
+ // Returns ParameterizedTestCaseRegistry object used to keep track of
+ // value-parameterized tests and instantiate and register them.
+ internal::ParameterizedTestCaseRegistry& parameterized_test_registry() {
+ return parameterized_test_registry_;
+ }
+#endif // GTEST_HAS_PARAM_TEST
+
+ // Sets the TestCase object for the test that's currently running.
+ void set_current_test_case(TestCase* a_current_test_case) {
+ current_test_case_ = a_current_test_case;
+ }
+
+ // Sets the TestInfo object for the test that's currently running. If
+ // current_test_info is NULL, the assertion results will be stored in
+ // ad_hoc_test_result_.
+ void set_current_test_info(TestInfo* a_current_test_info) {
+ current_test_info_ = a_current_test_info;
+ }
+
+ // Registers all parameterized tests defined using TEST_P and
+ // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter
+ // combination. This method can be called more than once; it has guards
+ // protecting against registering the tests more than once. If
+ // value-parameterized tests are disabled, RegisterParameterizedTests is
+ // present but does nothing.
+ void RegisterParameterizedTests();
+
+ // Runs all tests in this UnitTest object, prints the result, and
+ // returns true if all tests are successful. If any exception is
+ // thrown during a test, this test is considered to be failed, but
+ // the rest of the tests will still be run.
+ bool RunAllTests();
+
+ // Clears the results of all tests, except the ad hoc tests.
+ void ClearNonAdHocTestResult() {
+ ForEach(test_cases_, TestCase::ClearTestCaseResult);
+ }
+
+ // Clears the results of ad-hoc test assertions.
+ void ClearAdHocTestResult() {
+ ad_hoc_test_result_.Clear();
+ }
+
+ // Adds a TestProperty to the current TestResult object when invoked in a
+ // context of a test or a test case, or to the global property set. If the
+ // result already contains a property with the same key, the value will be
+ // updated.
+ void RecordProperty(const TestProperty& test_property);
+
+ enum ReactionToSharding {
+ HONOR_SHARDING_PROTOCOL,
+ IGNORE_SHARDING_PROTOCOL
+ };
+
+ // Matches the full name of each test against the user-specified
+ // filter to decide whether the test should run, then records the
+ // result in each TestCase and TestInfo object.
+ // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+ // based on sharding variables in the environment.
+ // Returns the number of tests that should run.
+ int FilterTests(ReactionToSharding shard_tests);
+
+ // Prints the names of the tests matching the user-specified filter flag.
+ void ListTestsMatchingFilter();
+
+ const TestCase* current_test_case() const { return current_test_case_; }
+ TestInfo* current_test_info() { return current_test_info_; }
+ const TestInfo* current_test_info() const { return current_test_info_; }
+
+ // Returns the vector of environments that need to be set-up/torn-down
+ // before/after the tests are run.
+ std::vector<Environment*>& environments() { return environments_; }
+
+ // Getters for the per-thread Google Test trace stack.
+ std::vector<TraceInfo>& gtest_trace_stack() {
+ return *(gtest_trace_stack_.pointer());
+ }
+ const std::vector<TraceInfo>& gtest_trace_stack() const {
+ return gtest_trace_stack_.get();
+ }
+
+#if GTEST_HAS_DEATH_TEST
+ void InitDeathTestSubprocessControlInfo() {
+ internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+ }
+ // Returns a pointer to the parsed --gtest_internal_run_death_test
+ // flag, or NULL if that flag was not specified.
+ // This information is useful only in a death test child process.
+ // Must not be called before a call to InitGoogleTest.
+ const InternalRunDeathTestFlag* internal_run_death_test_flag() const {
+ return internal_run_death_test_flag_.get();
+ }
+
+ // Returns a pointer to the current death test factory.
+ internal::DeathTestFactory* death_test_factory() {
+ return death_test_factory_.get();
+ }
+
+ void SuppressTestEventsIfInSubprocess();
+
+ friend class ReplaceDeathTestFactory;
+#endif // GTEST_HAS_DEATH_TEST
+
+ // Initializes the event listener performing XML output as specified by
+ // UnitTestOptions. Must not be called before InitGoogleTest.
+ void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+ // Initializes the event listener for streaming test results to a socket.
+ // Must not be called before InitGoogleTest.
+ void ConfigureStreamingOutput();
+#endif
+
+ // Performs initialization dependent upon flag values obtained in
+ // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to
+ // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest,
+ // this function is also called from RunAllTests. Since this function can be
+ // called more than once, it has to be idempotent.
+ void PostFlagParsingInit();
+
+ // Gets the random seed used at the start of the current test iteration.
+ int random_seed() const { return random_seed_; }
+
+ // Gets the random number generator.
+ internal::Random* random() { return &random_; }
+
+ // Shuffles all test cases, and the tests within each test case,
+ // making sure that death tests are still run first.
+ void ShuffleTests();
+
+ // Restores the test cases and tests to their order before the first shuffle.
+ void UnshuffleTests();
+
+ // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+ // UnitTest::Run() starts.
+ bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+ friend class ::testing::UnitTest;
+
+ // Used by UnitTest::Run() to capture the state of
+ // GTEST_FLAG(catch_exceptions) at the moment it starts.
+ void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+ // The UnitTest object that owns this implementation object.
+ UnitTest* const parent_;
+
+ // The working directory when the first TEST() or TEST_F() was
+ // executed.
+ internal::FilePath original_working_dir_;
+
+ // The default test part result reporters.
+ DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+ DefaultPerThreadTestPartResultReporter
+ default_per_thread_test_part_result_reporter_;
+
+ // Points to (but doesn't own) the global test part result reporter.
+ TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+ // Protects read and write access to global_test_part_result_reporter_.
+ internal::Mutex global_test_part_result_reporter_mutex_;
+
+ // Points to (but doesn't own) the per-thread test part result reporter.
+ internal::ThreadLocal<TestPartResultReporterInterface*>
+ per_thread_test_part_result_reporter_;
+
+ // The vector of environments that need to be set-up/torn-down
+ // before/after the tests are run.
+ std::vector<Environment*> environments_;
+
+ // The vector of TestCases in their original order. It owns the
+ // elements in the vector.
+ std::vector<TestCase*> test_cases_;
+
+ // Provides a level of indirection for the test case list to allow
+ // easy shuffling and restoring the test case order. The i-th
+ // element of this vector is the index of the i-th test case in the
+ // shuffled order.
+ std::vector<int> test_case_indices_;
+
+#if GTEST_HAS_PARAM_TEST
+ // ParameterizedTestRegistry object used to register value-parameterized
+ // tests.
+ internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+ // Indicates whether RegisterParameterizedTests() has been called already.
+ bool parameterized_tests_registered_;
+#endif // GTEST_HAS_PARAM_TEST
+
+ // Index of the last death test case registered. Initially -1.
+ int last_death_test_case_;
+
+ // This points to the TestCase for the currently running test. It
+ // changes as Google Test goes through one test case after another.
+ // When no test is running, this is set to NULL and Google Test
+ // stores assertion results in ad_hoc_test_result_. Initially NULL.
+ TestCase* current_test_case_;
+
+ // This points to the TestInfo for the currently running test. It
+ // changes as Google Test goes through one test after another. When
+ // no test is running, this is set to NULL and Google Test stores
+ // assertion results in ad_hoc_test_result_. Initially NULL.
+ TestInfo* current_test_info_;
+
+ // Normally, a user only writes assertions inside a TEST or TEST_F,
+ // or inside a function called by a TEST or TEST_F. Since Google
+  // Test keeps track of which test is currently running, it can
+ // associate such an assertion with the test it belongs to.
+ //
+ // If an assertion is encountered when no TEST or TEST_F is running,
+ // Google Test attributes the assertion result to an imaginary "ad hoc"
+ // test, and records the result in ad_hoc_test_result_.
+ TestResult ad_hoc_test_result_;
+
+ // The list of event listeners that can be used to track events inside
+ // Google Test.
+ TestEventListeners listeners_;
+
+ // The OS stack trace getter. Will be deleted when the UnitTest
+ // object is destructed. By default, an OsStackTraceGetter is used,
+ // but the user can set this field to use a custom getter if that is
+ // desired.
+ OsStackTraceGetterInterface* os_stack_trace_getter_;
+
+ // True iff PostFlagParsingInit() has been called.
+ bool post_flag_parse_init_performed_;
+
+ // The random number seed used at the beginning of the test run.
+ int random_seed_;
+
+ // Our random number generator.
+ internal::Random random_;
+
+ // The time of the test program start, in ms from the start of the
+ // UNIX epoch.
+ TimeInMillis start_timestamp_;
+
+ // How long the test took to run, in milliseconds.
+ TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+ // The decomposed components of the gtest_internal_run_death_test flag,
+ // parsed when RUN_ALL_TESTS is called.
+ internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+ internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif // GTEST_HAS_DEATH_TEST
+
+ // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+ internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+ // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+ // starts.
+ bool catch_exceptions_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+}; // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+ return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+ bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter. Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+ // Fail fast if the given string does not begin with a digit;
+ // this bypasses strtoXXX's "optional leading whitespace and plus
+ // or minus sign" semantics, which are undesirable here.
+ if (str.empty() || !IsDigit(str[0])) {
+ return false;
+ }
+ errno = 0;
+
+ char* end;
+ // BiggestConvertible is the largest integer type that system-provided
+ // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+ // MSVC and C++ Builder define __int64 instead of the standard long long.
+ typedef unsigned __int64 BiggestConvertible;
+ const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+ typedef unsigned long long BiggestConvertible; // NOLINT
+ const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+ const bool parse_success = *end == '\0' && errno == 0;
+
+ // TODO(vladl@google.com): Convert this to compile time assertion when it is
+ // available.
+ GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+ const Integer result = static_cast<Integer>(parsed);
+ if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+ *number = result;
+ return true;
+ }
+ return false;
+}
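+
+// Illustrative usage of ParseNaturalNumber (not part of the upstream
+// sources; the names below are only an example): parsing a numeric flag
+// component into an int.
+//   int port = 0;
+//   if (ParseNaturalNumber(::std::string("8080"), &port)) {
+//     // port == 8080
+//   }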
+#endif // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test users but are required for testing. This class allows our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs. Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+ static void RecordProperty(TestResult* test_result,
+ const std::string& xml_element,
+ const TestProperty& property) {
+ test_result->RecordProperty(xml_element, property);
+ }
+
+ static void ClearTestPartResults(TestResult* test_result) {
+ test_result->ClearTestPartResults();
+ }
+
+ static const std::vector<testing::TestPartResult>& test_part_results(
+ const TestResult& test_result) {
+ return test_result.test_part_results();
+ }
+};
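+
+// Illustrative sketch (only for testing Google Test itself, as noted
+// above; the values are arbitrary examples):
+//   TestResult result;
+//   TestResultAccessor::RecordProperty(
+//       &result, "testsuites", TestProperty("key", "value"));
+//   TestResultAccessor::ClearTestPartResults(&result);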
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class GTEST_API_ StreamingListener : public EmptyTestEventListener {
+ public:
+ // Abstract base class for writing strings to a socket.
+ class AbstractSocketWriter {
+ public:
+ virtual ~AbstractSocketWriter() {}
+
+ // Sends a string to the socket.
+ virtual void Send(const string& message) = 0;
+
+ // Closes the socket.
+ virtual void CloseConnection() {}
+
+ // Sends a string and a newline to the socket.
+ void SendLn(const string& message) {
+ Send(message + "\n");
+ }
+ };
+
+ // Concrete class for actually writing strings to a socket.
+ class SocketWriter : public AbstractSocketWriter {
+ public:
+ SocketWriter(const string& host, const string& port)
+ : sockfd_(-1), host_name_(host), port_num_(port) {
+ MakeConnection();
+ }
+
+ virtual ~SocketWriter() {
+ if (sockfd_ != -1)
+ CloseConnection();
+ }
+
+ // Sends a string to the socket.
+ virtual void Send(const string& message) {
+ GTEST_CHECK_(sockfd_ != -1)
+ << "Send() can be called only when there is a connection.";
+
+ const int len = static_cast<int>(message.length());
+ if (write(sockfd_, message.c_str(), len) != len) {
+ GTEST_LOG_(WARNING)
+ << "stream_result_to: failed to stream to "
+ << host_name_ << ":" << port_num_;
+ }
+ }
+
+ private:
+ // Creates a client socket and connects to the server.
+ void MakeConnection();
+
+ // Closes the socket.
+ void CloseConnection() {
+ GTEST_CHECK_(sockfd_ != -1)
+ << "CloseConnection() can be called only when there is a connection.";
+
+ close(sockfd_);
+ sockfd_ = -1;
+ }
+
+ int sockfd_; // socket file descriptor
+ const string host_name_;
+ const string port_num_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+ }; // class SocketWriter
+
+ // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
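+  // For example (illustrative), UrlEncode("a=b") yields "a%3Db" and a
+  // newline is encoded as "%0A".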
+ static string UrlEncode(const char* str);
+
+ StreamingListener(const string& host, const string& port)
+ : socket_writer_(new SocketWriter(host, port)) { Start(); }
+
+ explicit StreamingListener(AbstractSocketWriter* socket_writer)
+ : socket_writer_(socket_writer) { Start(); }
+
+ void OnTestProgramStart(const UnitTest& /* unit_test */) {
+ SendLn("event=TestProgramStart");
+ }
+
+ void OnTestProgramEnd(const UnitTest& unit_test) {
+    // Note that Google Test currently only reports elapsed time for each
+ // test iteration, not for the entire test program.
+ SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+ // Notify the streaming server to stop.
+ socket_writer_->CloseConnection();
+ }
+
+ void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) {
+ SendLn("event=TestIterationStart&iteration=" +
+ StreamableToString(iteration));
+ }
+
+ void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) {
+ SendLn("event=TestIterationEnd&passed=" +
+ FormatBool(unit_test.Passed()) + "&elapsed_time=" +
+ StreamableToString(unit_test.elapsed_time()) + "ms");
+ }
+
+ void OnTestCaseStart(const TestCase& test_case) {
+ SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+ }
+
+ void OnTestCaseEnd(const TestCase& test_case) {
+ SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed())
+ + "&elapsed_time=" + StreamableToString(test_case.elapsed_time())
+ + "ms");
+ }
+
+ void OnTestStart(const TestInfo& test_info) {
+ SendLn(std::string("event=TestStart&name=") + test_info.name());
+ }
+
+ void OnTestEnd(const TestInfo& test_info) {
+ SendLn("event=TestEnd&passed=" +
+ FormatBool((test_info.result())->Passed()) +
+ "&elapsed_time=" +
+ StreamableToString((test_info.result())->elapsed_time()) + "ms");
+ }
+
+ void OnTestPartResult(const TestPartResult& test_part_result) {
+ const char* file_name = test_part_result.file_name();
+ if (file_name == NULL)
+ file_name = "";
+ SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+ "&line=" + StreamableToString(test_part_result.line_number()) +
+ "&message=" + UrlEncode(test_part_result.message()));
+ }
+
+ private:
+ // Sends the given message and a newline to the socket.
+ void SendLn(const string& message) { socket_writer_->SendLn(message); }
+
+ // Called at the start of streaming to notify the receiver what
+ // protocol we are using.
+ void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+ string FormatBool(bool value) { return value ? "1" : "0"; }
+
+ const scoped_ptr<AbstractSocketWriter> socket_writer_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+}; // class StreamingListener
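+
+// In practice this listener is installed by Google Test itself when the
+// --gtest_stream_result_to=HOST:PORT flag is set, e.g. (illustrative):
+//   ./my_test --gtest_stream_result_to=localhost:9090
+// Any TCP server listening on that port then receives the key=value event
+// lines produced by the On*() handlers above, one event per line.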
+
+#endif // GTEST_CAN_STREAM_RESULTS_
+
+} // namespace internal
+} // namespace testing
+
+#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc
new file mode 100644
index 000000000..e5bf3dd2b
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-port.cc
@@ -0,0 +1,1259 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#include "gtest/internal/gtest-port.h"
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <fstream>
+
+#if GTEST_OS_WINDOWS
+# include <windows.h>
+# include <io.h>
+# include <sys/stat.h>
+# include <map> // Used in ThreadLocal.
+#else
+# include <unistd.h>
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+# include <mach/mach_init.h>
+# include <mach/task.h>
+# include <mach/vm_map.h>
+#endif // GTEST_OS_MAC
+
+#if GTEST_OS_QNX
+# include <devctl.h>
+# include <fcntl.h>
+# include <sys/procfs.h>
+#endif // GTEST_OS_QNX
+
+#if GTEST_OS_AIX
+# include <procinfo.h>
+# include <sys/types.h>
+#endif // GTEST_OS_AIX
+
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation. It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error. This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif // _MSC_VER
+
+#if GTEST_OS_LINUX
+
+namespace {
+template <typename T>
+T ReadProcFileField(const string& filename, int field) {
+ std::string dummy;
+ std::ifstream file(filename.c_str());
+ while (field-- > 0) {
+ file >> dummy;
+ }
+ T output = 0;
+ file >> output;
+ return output;
+}
+} // namespace
+
+// Returns the number of active threads, or 0 when there is an error.
+size_t GetThreadCount() {
+ const string filename =
+ (Message() << "/proc/" << getpid() << "/stat").GetString();
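+  // Field 20 (1-based) of /proc/<pid>/stat is num_threads; ReadProcFileField
+  // skips the first 19 whitespace-separated fields and reads that one.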
+ return ReadProcFileField<int>(filename, 19);
+}
+
+#elif GTEST_OS_MAC
+
+size_t GetThreadCount() {
+ const task_t task = mach_task_self();
+ mach_msg_type_number_t thread_count;
+ thread_act_array_t thread_list;
+ const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+ if (status == KERN_SUCCESS) {
+ // task_threads allocates resources in thread_list and we need to free them
+ // to avoid leaks.
+ vm_deallocate(task,
+ reinterpret_cast<vm_address_t>(thread_list),
+ sizeof(thread_t) * thread_count);
+ return static_cast<size_t>(thread_count);
+ } else {
+ return 0;
+ }
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+ const int fd = open("/proc/self/as", O_RDONLY);
+ if (fd < 0) {
+ return 0;
+ }
+ procfs_info process_info;
+ const int status =
+ devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), NULL);
+ close(fd);
+ if (status == EOK) {
+ return static_cast<size_t>(process_info.num_threads);
+ } else {
+ return 0;
+ }
+}
+
+#elif GTEST_OS_AIX
+
+size_t GetThreadCount() {
+ struct procentry64 entry;
+ pid_t pid = getpid();
+ int status = getprocs64(&entry, sizeof(entry), NULL, 0, &pid, 1);
+ if (status == 1) {
+ return entry.pi_thcount;
+ } else {
+ return 0;
+ }
+}
+
+#else
+
+size_t GetThreadCount() {
+ // There's no portable way to detect the number of threads, so we just
+ // return 0 to indicate that we cannot detect it.
+ return 0;
+}
+
+#endif // GTEST_OS_LINUX
+
+#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+void SleepMilliseconds(int n) {
+ ::Sleep(n);
+}
+
+AutoHandle::AutoHandle()
+ : handle_(INVALID_HANDLE_VALUE) {}
+
+AutoHandle::AutoHandle(Handle handle)
+ : handle_(handle) {}
+
+AutoHandle::~AutoHandle() {
+ Reset();
+}
+
+AutoHandle::Handle AutoHandle::Get() const {
+ return handle_;
+}
+
+void AutoHandle::Reset() {
+ Reset(INVALID_HANDLE_VALUE);
+}
+
+void AutoHandle::Reset(HANDLE handle) {
+ // Resetting with the same handle we already own is invalid.
+ if (handle_ != handle) {
+ if (IsCloseable()) {
+ ::CloseHandle(handle_);
+ }
+ handle_ = handle;
+ } else {
+ GTEST_CHECK_(!IsCloseable())
+ << "Resetting a valid handle to itself is likely a programmer error "
+ "and thus not allowed.";
+ }
+}
+
+bool AutoHandle::IsCloseable() const {
+ // Different Windows APIs may use either of these values to represent an
+ // invalid handle.
+ return handle_ != NULL && handle_ != INVALID_HANDLE_VALUE;
+}
+
+Notification::Notification()
+ : event_(::CreateEvent(NULL, // Default security attributes.
+ TRUE, // Do not reset automatically.
+ FALSE, // Initially unset.
+ NULL)) { // Anonymous event.
+ GTEST_CHECK_(event_.Get() != NULL);
+}
+
+void Notification::Notify() {
+ GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
+}
+
+void Notification::WaitForNotification() {
+ GTEST_CHECK_(
+ ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
+}
+
+Mutex::Mutex()
+ : owner_thread_id_(0),
+ type_(kDynamic),
+ critical_section_init_phase_(0),
+ critical_section_(new CRITICAL_SECTION) {
+ ::InitializeCriticalSection(critical_section_);
+}
+
+Mutex::~Mutex() {
+ // Static mutexes are leaked intentionally. It is not thread-safe to try
+ // to clean them up.
+ // TODO(yukawa): Switch to Slim Reader/Writer (SRW) Locks, which requires
+ // nothing to clean it up but is available only on Vista and later.
+ // http://msdn.microsoft.com/en-us/library/windows/desktop/aa904937.aspx
+ if (type_ == kDynamic) {
+ ::DeleteCriticalSection(critical_section_);
+ delete critical_section_;
+ critical_section_ = NULL;
+ }
+}
+
+void Mutex::Lock() {
+ ThreadSafeLazyInit();
+ ::EnterCriticalSection(critical_section_);
+ owner_thread_id_ = ::GetCurrentThreadId();
+}
+
+void Mutex::Unlock() {
+ ThreadSafeLazyInit();
+ // We don't protect writing to owner_thread_id_ here, as it's the
+ // caller's responsibility to ensure that the current thread holds the
+ // mutex when this is called.
+ owner_thread_id_ = 0;
+ ::LeaveCriticalSection(critical_section_);
+}
+
+// Does nothing if the current thread holds the mutex. Otherwise, crashes
+// with high probability.
+void Mutex::AssertHeld() {
+ ThreadSafeLazyInit();
+ GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId())
+ << "The current thread is not holding the mutex @" << this;
+}
+
+// Initializes owner_thread_id_ and critical_section_ in static mutexes.
+void Mutex::ThreadSafeLazyInit() {
+ // Dynamic mutexes are initialized in the constructor.
+ if (type_ == kStatic) {
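+    // critical_section_init_phase_ acts as a tri-state guard:
+    // 0 = not initialized, 1 = initialization in progress, 2 = ready.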
+ switch (
+ ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) {
+ case 0:
+ // If critical_section_init_phase_ was 0 before the exchange, we
+ // are the first to test it and need to perform the initialization.
+ owner_thread_id_ = 0;
+ critical_section_ = new CRITICAL_SECTION;
+ ::InitializeCriticalSection(critical_section_);
+ // Updates the critical_section_init_phase_ to 2 to signal
+ // initialization complete.
+ GTEST_CHECK_(::InterlockedCompareExchange(
+ &critical_section_init_phase_, 2L, 1L) ==
+ 1L);
+ break;
+ case 1:
+ // Somebody else is already initializing the mutex; spin until they
+ // are done.
+ while (::InterlockedCompareExchange(&critical_section_init_phase_,
+ 2L,
+ 2L) != 2L) {
+ // Possibly yields the rest of the thread's time slice to other
+ // threads.
+ ::Sleep(0);
+ }
+ break;
+
+ case 2:
+ break; // The mutex is already initialized and ready for use.
+
+ default:
+ GTEST_CHECK_(false)
+ << "Unexpected value of critical_section_init_phase_ "
+ << "while initializing a static mutex.";
+ }
+ }
+}
+
+namespace {
+
+class ThreadWithParamSupport : public ThreadWithParamBase {
+ public:
+ static HANDLE CreateThread(Runnable* runnable,
+ Notification* thread_can_start) {
+ ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start);
+ DWORD thread_id;
+    // TODO(yukawa): Consider using _beginthreadex instead.
+ HANDLE thread_handle = ::CreateThread(
+ NULL, // Default security.
+ 0, // Default stack size.
+ &ThreadWithParamSupport::ThreadMain,
+        param,   // Parameter to ThreadMain.
+ 0x0, // Default creation flags.
+ &thread_id); // Need a valid pointer for the call to work under Win98.
+ GTEST_CHECK_(thread_handle != NULL) << "CreateThread failed with error "
+ << ::GetLastError() << ".";
+ if (thread_handle == NULL) {
+ delete param;
+ }
+ return thread_handle;
+ }
+
+ private:
+ struct ThreadMainParam {
+ ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
+ : runnable_(runnable),
+ thread_can_start_(thread_can_start) {
+ }
+ scoped_ptr<Runnable> runnable_;
+ // Does not own.
+ Notification* thread_can_start_;
+ };
+
+ static DWORD WINAPI ThreadMain(void* ptr) {
+ // Transfers ownership.
+ scoped_ptr<ThreadMainParam> param(static_cast<ThreadMainParam*>(ptr));
+ if (param->thread_can_start_ != NULL)
+ param->thread_can_start_->WaitForNotification();
+ param->runnable_->Run();
+ return 0;
+ }
+
+ // Prohibit instantiation.
+ ThreadWithParamSupport();
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+};
+
+} // namespace
+
+ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+ Notification* thread_can_start)
+ : thread_(ThreadWithParamSupport::CreateThread(runnable,
+ thread_can_start)) {
+}
+
+ThreadWithParamBase::~ThreadWithParamBase() {
+ Join();
+}
+
+void ThreadWithParamBase::Join() {
+ GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
+ << "Failed to join the thread with error " << ::GetLastError() << ".";
+}
+
+// Maps a thread to a set of ThreadIdToThreadLocals that have values
+// instantiated on that thread and notifies them when the thread exits. A
+// ThreadLocal instance is expected to persist until all threads it has
+// values on have terminated.
+class ThreadLocalRegistryImpl {
+ public:
+ // Registers thread_local_instance as having value on the current thread.
+ // Returns a value that can be used to identify the thread from other threads.
+ static ThreadLocalValueHolderBase* GetValueOnCurrentThread(
+ const ThreadLocalBase* thread_local_instance) {
+ DWORD current_thread = ::GetCurrentThreadId();
+ MutexLock lock(&mutex_);
+ ThreadIdToThreadLocals* const thread_to_thread_locals =
+ GetThreadLocalsMapLocked();
+ ThreadIdToThreadLocals::iterator thread_local_pos =
+ thread_to_thread_locals->find(current_thread);
+ if (thread_local_pos == thread_to_thread_locals->end()) {
+ thread_local_pos = thread_to_thread_locals->insert(
+ std::make_pair(current_thread, ThreadLocalValues())).first;
+ StartWatcherThreadFor(current_thread);
+ }
+ ThreadLocalValues& thread_local_values = thread_local_pos->second;
+ ThreadLocalValues::iterator value_pos =
+ thread_local_values.find(thread_local_instance);
+ if (value_pos == thread_local_values.end()) {
+ value_pos =
+ thread_local_values
+ .insert(std::make_pair(
+ thread_local_instance,
+ linked_ptr<ThreadLocalValueHolderBase>(
+ thread_local_instance->NewValueForCurrentThread())))
+ .first;
+ }
+ return value_pos->second.get();
+ }
+
+ static void OnThreadLocalDestroyed(
+ const ThreadLocalBase* thread_local_instance) {
+ std::vector<linked_ptr<ThreadLocalValueHolderBase> > value_holders;
+ // Clean up the ThreadLocalValues data structure while holding the lock, but
+ // defer the destruction of the ThreadLocalValueHolderBases.
+ {
+ MutexLock lock(&mutex_);
+ ThreadIdToThreadLocals* const thread_to_thread_locals =
+ GetThreadLocalsMapLocked();
+ for (ThreadIdToThreadLocals::iterator it =
+ thread_to_thread_locals->begin();
+ it != thread_to_thread_locals->end();
+ ++it) {
+ ThreadLocalValues& thread_local_values = it->second;
+ ThreadLocalValues::iterator value_pos =
+ thread_local_values.find(thread_local_instance);
+ if (value_pos != thread_local_values.end()) {
+ value_holders.push_back(value_pos->second);
+ thread_local_values.erase(value_pos);
+ // This 'if' can only be successful at most once, so theoretically we
+ // could break out of the loop here, but we don't bother doing so.
+ }
+ }
+ }
+ // Outside the lock, let the destructor for 'value_holders' deallocate the
+ // ThreadLocalValueHolderBases.
+ }
+
+ static void OnThreadExit(DWORD thread_id) {
+ GTEST_CHECK_(thread_id != 0) << ::GetLastError();
+ std::vector<linked_ptr<ThreadLocalValueHolderBase> > value_holders;
+ // Clean up the ThreadIdToThreadLocals data structure while holding the
+ // lock, but defer the destruction of the ThreadLocalValueHolderBases.
+ {
+ MutexLock lock(&mutex_);
+ ThreadIdToThreadLocals* const thread_to_thread_locals =
+ GetThreadLocalsMapLocked();
+ ThreadIdToThreadLocals::iterator thread_local_pos =
+ thread_to_thread_locals->find(thread_id);
+ if (thread_local_pos != thread_to_thread_locals->end()) {
+ ThreadLocalValues& thread_local_values = thread_local_pos->second;
+ for (ThreadLocalValues::iterator value_pos =
+ thread_local_values.begin();
+ value_pos != thread_local_values.end();
+ ++value_pos) {
+ value_holders.push_back(value_pos->second);
+ }
+ thread_to_thread_locals->erase(thread_local_pos);
+ }
+ }
+ // Outside the lock, let the destructor for 'value_holders' deallocate the
+ // ThreadLocalValueHolderBases.
+ }
+
+ private:
+ // In a particular thread, maps a ThreadLocal object to its value.
+ typedef std::map<const ThreadLocalBase*,
+ linked_ptr<ThreadLocalValueHolderBase> > ThreadLocalValues;
+ // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
+ // thread's ID.
+ typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
+
+ // Holds the thread id and thread handle that we pass from
+ // StartWatcherThreadFor to WatcherThreadFunc.
+ typedef std::pair<DWORD, HANDLE> ThreadIdAndHandle;
+
+ static void StartWatcherThreadFor(DWORD thread_id) {
+ // The returned handle will be kept in thread_map and closed by
+ // watcher_thread in WatcherThreadFunc.
+ HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
+ FALSE,
+ thread_id);
+ GTEST_CHECK_(thread != NULL);
+    // We need to pass a valid thread ID pointer into CreateThread for it
+ // to work correctly under Win98.
+ DWORD watcher_thread_id;
+ HANDLE watcher_thread = ::CreateThread(
+ NULL, // Default security.
+ 0, // Default stack size
+ &ThreadLocalRegistryImpl::WatcherThreadFunc,
+ reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
+ CREATE_SUSPENDED,
+ &watcher_thread_id);
+ GTEST_CHECK_(watcher_thread != NULL);
+ // Give the watcher thread the same priority as ours to avoid being
+ // blocked by it.
+ ::SetThreadPriority(watcher_thread,
+ ::GetThreadPriority(::GetCurrentThread()));
+ ::ResumeThread(watcher_thread);
+ ::CloseHandle(watcher_thread);
+ }
+
+ // Monitors exit from a given thread and notifies those
+ // ThreadIdToThreadLocals about thread termination.
+ static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
+ const ThreadIdAndHandle* tah =
+ reinterpret_cast<const ThreadIdAndHandle*>(param);
+ GTEST_CHECK_(
+ ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+ OnThreadExit(tah->first);
+ ::CloseHandle(tah->second);
+ delete tah;
+ return 0;
+ }
+
+ // Returns map of thread local instances.
+ static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() {
+ mutex_.AssertHeld();
+ static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals;
+ return map;
+ }
+
+ // Protects access to GetThreadLocalsMapLocked() and its return value.
+ static Mutex mutex_;
+ // Protects access to GetThreadMapLocked() and its return value.
+ static Mutex thread_map_mutex_;
+};
+
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+
+ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
+ const ThreadLocalBase* thread_local_instance) {
+ return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
+ thread_local_instance);
+}
+
+void ThreadLocalRegistry::OnThreadLocalDestroyed(
+ const ThreadLocalBase* thread_local_instance) {
+ ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
+}
+
+#endif // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE. Currently only needed for death tests.
+
+RE::~RE() {
+ if (is_valid_) {
+ // regfree'ing an invalid regex might crash because the content
+ // of the regex is undefined. Since the regex's are essentially
+ // the same, one cannot be valid (or invalid) without the other
+ // being so too.
+ regfree(&partial_regex_);
+ regfree(&full_regex_);
+ }
+ free(const_cast<char*>(pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+ if (!re.is_valid_) return false;
+
+ regmatch_t match;
+ return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+ if (!re.is_valid_) return false;
+
+ regmatch_t match;
+ return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+ pattern_ = posix::StrDup(regex);
+
+ // Reserves enough bytes to hold the regular expression used for a
+ // full match.
+ const size_t full_regex_len = strlen(regex) + 10;
+ char* const full_pattern = new char[full_regex_len];
+
+ snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
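+  // For example, the regex "a.*b" yields the full-match pattern "^(a.*b)$".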
+ is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+ // We want to call regcomp(&partial_regex_, ...) even if the
+ // previous expression returns false. Otherwise partial_regex_ may
+  // not be properly initialized and may cause trouble when it's
+ // freed.
+ //
+  // Some implementations of POSIX regex (e.g. on at least some
+ // versions of Cygwin) doesn't accept the empty string as a valid
+ // regex. We change it to an equivalent form "()" to be safe.
+ if (is_valid_) {
+ const char* const partial_regex = (*regex == '\0') ? "()" : regex;
+ is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+ }
+ EXPECT_TRUE(is_valid_)
+ << "Regular expression \"" << regex
+ << "\" is not a valid POSIX Extended regular expression.";
+
+ delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true iff ch appears anywhere in str (excluding the
+// terminating '\0' character).
+bool IsInSet(char ch, const char* str) {
+ return ch != '\0' && strchr(str, ch) != NULL;
+}
+
+// Returns true iff ch belongs to the given classification. Unlike
+// similar functions in <ctype.h>, these aren't affected by the
+// current locale.
+bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+ return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+ return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
+ ('0' <= ch && ch <= '9') || ch == '_';
+}
+
+// Returns true iff "\\c" is a supported escape sequence.
+bool IsValidEscape(char c) {
+ return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
+}
+
+// Returns true iff the given atom (specified by escaped and pattern)
+// matches ch. The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+ if (escaped) { // "\\p" where p is pattern_char.
+ switch (pattern_char) {
+ case 'd': return IsAsciiDigit(ch);
+ case 'D': return !IsAsciiDigit(ch);
+ case 'f': return ch == '\f';
+ case 'n': return ch == '\n';
+ case 'r': return ch == '\r';
+ case 's': return IsAsciiWhiteSpace(ch);
+ case 'S': return !IsAsciiWhiteSpace(ch);
+ case 't': return ch == '\t';
+ case 'v': return ch == '\v';
+ case 'w': return IsAsciiWordChar(ch);
+ case 'W': return !IsAsciiWordChar(ch);
+ }
+ return IsAsciiPunct(pattern_char) && pattern_char == ch;
+ }
+
+ return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+std::string FormatRegexSyntaxError(const char* regex, int index) {
+ return (Message() << "Syntax error at index " << index
+ << " in simple regular expression \"" << regex << "\": ").GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.
+bool ValidateRegex(const char* regex) {
+ if (regex == NULL) {
+ // TODO(wan@google.com): fix the source file location in the
+ // assertion failures to match where the regex is used in user
+ // code.
+ ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+ return false;
+ }
+
+ bool is_valid = true;
+
+ // True iff ?, *, or + can follow the previous atom.
+ bool prev_repeatable = false;
+ for (int i = 0; regex[i]; i++) {
+ if (regex[i] == '\\') { // An escape sequence
+ i++;
+ if (regex[i] == '\0') {
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+ << "'\\' cannot appear at the end.";
+ return false;
+ }
+
+ if (!IsValidEscape(regex[i])) {
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+ << "invalid escape sequence \"\\" << regex[i] << "\".";
+ is_valid = false;
+ }
+ prev_repeatable = true;
+ } else { // Not an escape sequence.
+ const char ch = regex[i];
+
+ if (ch == '^' && i > 0) {
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+ << "'^' can only appear at the beginning.";
+ is_valid = false;
+ } else if (ch == '$' && regex[i + 1] != '\0') {
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+ << "'$' can only appear at the end.";
+ is_valid = false;
+ } else if (IsInSet(ch, "()[]{}|")) {
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+ << "'" << ch << "' is unsupported.";
+ is_valid = false;
+ } else if (IsRepeat(ch) && !prev_repeatable) {
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+ << "'" << ch << "' can only follow a repeatable token.";
+ is_valid = false;
+ }
+
+ prev_repeatable = !IsInSet(ch, "^$?*+");
+ }
+ }
+
+ return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression. The regex atom is defined as c if escaped is false,
+// or \c otherwise. repeat is the repetition meta character (?, *,
+// or +). The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway. We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(
+ bool escaped, char c, char repeat, const char* regex,
+ const char* str) {
+ const size_t min_count = (repeat == '+') ? 1 : 0;
+ const size_t max_count = (repeat == '?') ? 1 :
+ static_cast<size_t>(-1) - 1;
+ // We cannot call numeric_limits::max() as it conflicts with the
+ // max() macro on Windows.
+
+ for (size_t i = 0; i <= max_count; ++i) {
+ // We know that the atom matches each of the first i characters in str.
+ if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+ // We have enough matches at the head, and the tail matches too.
+ // Since we only care about *whether* the pattern matches str
+ // (as opposed to *how* it matches), there is no need to find a
+ // greedy match.
+ return true;
+ }
+ if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
+ return false;
+ }
+ return false;
+}
+
+// Returns true iff regex matches a prefix of str. regex must be a
+// valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char* regex, const char* str) {
+ if (*regex == '\0') // An empty regex matches a prefix of anything.
+ return true;
+
+ // "$" only matches the end of a string. Note that regex being
+ // valid guarantees that there's nothing after "$" in it.
+ if (*regex == '$')
+ return *str == '\0';
+
+ // Is the first thing in regex an escape sequence?
+ const bool escaped = *regex == '\\';
+ if (escaped)
+ ++regex;
+ if (IsRepeat(regex[1])) {
+ // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+ // here's an indirect recursion. It terminates as the regex gets
+ // shorter in each recursion.
+ return MatchRepetitionAndRegexAtHead(
+ escaped, regex[0], regex[1], regex + 2, str);
+ } else {
+ // regex isn't empty, isn't "$", and doesn't start with a
+ // repetition. We match the first atom of regex with the first
+ // character of str and recurse.
+ return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+ MatchRegexAtHead(regex + 1, str + 1);
+ }
+}
+
+// Returns true iff regex matches any substring of str. regex must be
+// a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally. In rare cases the time complexity can be
+// exponential with respect to the regex length + the string length,
+// but usually it's much faster (often close to linear).
+bool MatchRegexAnywhere(const char* regex, const char* str) {
+ if (regex == NULL || str == NULL)
+ return false;
+
+ if (*regex == '^')
+ return MatchRegexAtHead(regex + 1, str);
+
+ // A successful match can be anywhere in str.
+ do {
+ if (MatchRegexAtHead(regex, str))
+ return true;
+ } while (*str++ != '\0');
+ return false;
+}
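+
+// For example (illustrative): MatchRegexAnywhere("a.c$", "xxabc") returns
+// true because the suffix "abc" matches "a.c" at the end of the string,
+// while MatchRegexAnywhere("^b", "abc") returns false.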
+
+// Implements the RE class.
+
+RE::~RE() {
+ free(const_cast<char*>(pattern_));
+ free(const_cast<char*>(full_pattern_));
+}
+
+// Returns true iff regular expression re matches the entire str.
+bool RE::FullMatch(const char* str, const RE& re) {
+ return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true iff regular expression re matches a substring of str
+// (including str itself).
+bool RE::PartialMatch(const char* str, const RE& re) {
+ return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char* regex) {
+ pattern_ = full_pattern_ = NULL;
+ if (regex != NULL) {
+ pattern_ = posix::StrDup(regex);
+ }
+
+ is_valid_ = ValidateRegex(regex);
+ if (!is_valid_) {
+ // No need to calculate the full pattern when the regex is invalid.
+ return;
+ }
+
+ const size_t len = strlen(regex);
+ // Reserves enough bytes to hold the regular expression used for a
+ // full match: we need space to prepend a '^', append a '$', and
+ // terminate the string with '\0'.
+ char* buffer = static_cast<char*>(malloc(len + 3));
+ full_pattern_ = buffer;
+
+ if (*regex != '^')
+ *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'.
+
+ // We don't use snprintf or strncpy, as they trigger a warning when
+ // compiled with VC++ 8.0.
+ memcpy(buffer, regex, len);
+ buffer += len;
+
+ if (len == 0 || regex[len - 1] != '$')
+ *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'.
+
+ *buffer = '\0';
+}
+
+#endif // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
+ const std::string file_name(file == NULL ? kUnknownFile : file);
+
+ if (line < 0) {
+ return file_name + ":";
+ }
+#ifdef _MSC_VER
+ return file_name + "(" + StreamableToString(line) + "):";
+#else
+ return file_name + ":" + StreamableToString(line) + ":";
+#endif // _MSC_VER
+}
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append a colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
+ const char* file, int line) {
+ const std::string file_name(file == NULL ? kUnknownFile : file);
+
+ if (line < 0)
+ return file_name;
+ else
+ return file_name + ":" + StreamableToString(line);
+}
+
+GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
+ : severity_(severity) {
+ const char* const marker =
+ severity == GTEST_INFO ? "[ INFO ]" :
+ severity == GTEST_WARNING ? "[WARNING]" :
+ severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]";
+ GetStream() << ::std::endl << marker << " "
+ << FormatFileLocation(file, line).c_str() << ": ";
+}
+
+// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+GTestLog::~GTestLog() {
+ GetStream() << ::std::endl;
+ if (severity_ == GTEST_FATAL) {
+ fflush(stderr);
+ posix::Abort();
+ }
+}
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+ // The ctor redirects the stream to a temporary file.
+ explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+# if GTEST_OS_WINDOWS
+ char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT
+ char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT
+
+ ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+ const UINT success = ::GetTempFileNameA(temp_dir_path,
+ "gtest_redir",
+ 0, // Generate unique file name.
+ temp_file_path);
+ GTEST_CHECK_(success != 0)
+ << "Unable to create a temporary file in " << temp_dir_path;
+ const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+ GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
+ << temp_file_path;
+ filename_ = temp_file_path;
+# else
+ // There's no guarantee that a test has write access to the current
+ // directory, so we create the temporary file in the /tmp directory
+ // instead. We use /tmp on most systems, and /sdcard on Android.
+ // That's because Android doesn't have /tmp.
+# if GTEST_OS_LINUX_ANDROID
+ // Note: Android applications are expected to call the framework's
+ // Context.getExternalStorageDirectory() method through JNI to get
+ // the location of the world-writable SD Card directory. However,
+ // this requires a Context handle, which cannot be retrieved
+ // globally from native code. Doing so also precludes running the
+ // code as part of a regular standalone executable, which doesn't
+ // run in a Dalvik process (e.g. when running it through 'adb shell').
+ //
+ // The location /sdcard is directly accessible from native code
+ // and is the only location (unofficially) supported by the Android
+ // team. It's generally a symlink to the real SD Card mount point
+ // which can be /mnt/sdcard, /mnt/sdcard0, /system/media/sdcard, or
+ // other OEM-customized locations. Never rely on these, and always
+ // use /sdcard.
+ char name_template[] = "/sdcard/gtest_captured_stream.XXXXXX";
+# else
+ char name_template[] = "/tmp/captured_stream.XXXXXX";
+# endif // GTEST_OS_LINUX_ANDROID
+ const int captured_fd = mkstemp(name_template);
+ filename_ = name_template;
+# endif // GTEST_OS_WINDOWS
+ fflush(NULL);
+ dup2(captured_fd, fd_);
+ close(captured_fd);
+ }
+
+ ~CapturedStream() {
+ remove(filename_.c_str());
+ }
+
+ std::string GetCapturedString() {
+ if (uncaptured_fd_ != -1) {
+ // Restores the original stream.
+ fflush(NULL);
+ dup2(uncaptured_fd_, fd_);
+ close(uncaptured_fd_);
+ uncaptured_fd_ = -1;
+ }
+
+ FILE* const file = posix::FOpen(filename_.c_str(), "r");
+ const std::string content = ReadEntireFile(file);
+ posix::FClose(file);
+ return content;
+ }
+
+ private:
+ const int fd_; // A stream to capture.
+ int uncaptured_fd_;
+ // Name of the temporary file holding the stderr output.
+ ::std::string filename_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+static CapturedStream* g_captured_stderr = NULL;
+static CapturedStream* g_captured_stdout = NULL;
+
+// Starts capturing an output stream (stdout/stderr).
+void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) {
+ if (*stream != NULL) {
+ GTEST_LOG_(FATAL) << "Only one " << stream_name
+ << " capturer can exist at a time.";
+ }
+ *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+std::string GetCapturedStream(CapturedStream** captured_stream) {
+ const std::string content = (*captured_stream)->GetCapturedString();
+
+ delete *captured_stream;
+ *captured_stream = NULL;
+
+ return content;
+}
+
+// Starts capturing stdout.
+void CaptureStdout() {
+ CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr.
+void CaptureStderr() {
+ CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+std::string GetCapturedStdout() {
+ return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr and returns the captured string.
+std::string GetCapturedStderr() {
+ return GetCapturedStream(&g_captured_stderr);
+}
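+
+// Illustrative use of the capture helpers above (they live in
+// testing::internal and are intended primarily for Google Test's own tests):
+//   CaptureStdout();
+//   printf("hello");
+//   const std::string out = GetCapturedStdout();  // out == "hello"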
+
+#endif // GTEST_HAS_STREAM_REDIRECTION
+
+std::string TempDir() {
+#if GTEST_OS_WINDOWS_MOBILE
+ return "\\temp\\";
+#elif GTEST_OS_WINDOWS
+ const char* temp_dir = posix::GetEnv("TEMP");
+ if (temp_dir == NULL || temp_dir[0] == '\0')
+ return "\\temp\\";
+ else if (temp_dir[strlen(temp_dir) - 1] == '\\')
+ return temp_dir;
+ else
+ return std::string(temp_dir) + "\\";
+#elif GTEST_OS_LINUX_ANDROID
+ return "/sdcard/";
+#else
+ return "/tmp/";
+#endif // GTEST_OS_WINDOWS_MOBILE
+}
+
+size_t GetFileSize(FILE* file) {
+ fseek(file, 0, SEEK_END);
+ return static_cast<size_t>(ftell(file));
+}
+
+std::string ReadEntireFile(FILE* file) {
+ const size_t file_size = GetFileSize(file);
+ char* const buffer = new char[file_size];
+
+ size_t bytes_last_read = 0; // # of bytes read in the last fread()
+ size_t bytes_read = 0; // # of bytes read so far
+
+ fseek(file, 0, SEEK_SET);
+
+ // Keeps reading the file until we cannot read further or the
+ // pre-determined file size is reached.
+ do {
+ bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+ bytes_read += bytes_last_read;
+ } while (bytes_last_read > 0 && bytes_read < file_size);
+
+ const std::string content(buffer, bytes_read);
+ delete[] buffer;
+
+ return content;
+}
+
+#if GTEST_HAS_DEATH_TEST
+
+static const ::std::vector<testing::internal::string>* g_injected_test_argvs =
+ NULL; // Owned.
+
+void SetInjectableArgvs(const ::std::vector<testing::internal::string>* argvs) {
+ if (g_injected_test_argvs != argvs)
+ delete g_injected_test_argvs;
+ g_injected_test_argvs = argvs;
+}
+
+const ::std::vector<testing::internal::string>& GetInjectableArgvs() {
+ if (g_injected_test_argvs != NULL) {
+ return *g_injected_test_argvs;
+ }
+ return GetArgvs();
+}
+#endif // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {
+ DebugBreak();
+ TerminateProcess(GetCurrentProcess(), 1);
+}
+} // namespace posix
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag. For example, FlagToEnvVar("foo") will return
+// "GTEST_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+ const std::string full_flag =
+ (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+ Message env_var;
+ for (size_t i = 0; i != full_flag.length(); i++) {
+ env_var << ToUpper(full_flag.c_str()[i]);
+ }
+
+ return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer. If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const Message& src_text, const char* str, Int32* value) {
+ // Parses the environment variable as a decimal integer.
+ char* end = NULL;
+ const long long_value = strtol(str, &end, 10); // NOLINT
+
+ // Has strtol() consumed all characters in the string?
+ if (*end != '\0') {
+ // No - an invalid character was encountered.
+ Message msg;
+ msg << "WARNING: " << src_text
+ << " is expected to be a 32-bit integer, but actually"
+ << " has value \"" << str << "\".\n";
+ printf("%s", msg.GetString().c_str());
+ fflush(stdout);
+ return false;
+ }
+
+ // Is the parsed value in the range of an Int32?
+ const Int32 result = static_cast<Int32>(long_value);
+ if (long_value == LONG_MAX || long_value == LONG_MIN ||
+ // The parsed value overflows as a long. (strtol() returns
+ // LONG_MAX or LONG_MIN when the input overflows.)
+ result != long_value
+ // The parsed value overflows as an Int32.
+ ) {
+ Message msg;
+ msg << "WARNING: " << src_text
+ << " is expected to be a 32-bit integer, but actually"
+ << " has value " << str << ", which overflows.\n";
+ printf("%s", msg.GetString().c_str());
+ fflush(stdout);
+ return false;
+ }
+
+ *value = result;
+ return true;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true iff it's not "0".
+bool BoolFromGTestEnv(const char* flag, bool default_value) {
+#if defined(GTEST_GET_BOOL_FROM_ENV_)
+ return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
+#endif // defined(GTEST_GET_BOOL_FROM_ENV_)
+ const std::string env_var = FlagToEnvVar(flag);
+ const char* const string_value = posix::GetEnv(env_var.c_str());
+ return string_value == NULL ?
+ default_value : strcmp(string_value, "0") != 0;
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) {
+#if defined(GTEST_GET_INT32_FROM_ENV_)
+ return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
+#endif // defined(GTEST_GET_INT32_FROM_ENV_)
+ const std::string env_var = FlagToEnvVar(flag);
+ const char* const string_value = posix::GetEnv(env_var.c_str());
+ if (string_value == NULL) {
+ // The environment variable is not set.
+ return default_value;
+ }
+
+ Int32 result = default_value;
+ if (!ParseInt32(Message() << "Environment variable " << env_var,
+ string_value, &result)) {
+ printf("The default value %s is used.\n",
+ (Message() << default_value).GetString().c_str());
+ fflush(stdout);
+ return default_value;
+ }
+
+ return result;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+std::string StringFromGTestEnv(const char* flag, const char* default_value) {
+#if defined(GTEST_GET_STRING_FROM_ENV_)
+ return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
+#endif // defined(GTEST_GET_STRING_FROM_ENV_)
+ const std::string env_var = FlagToEnvVar(flag);
+ const char* value = posix::GetEnv(env_var.c_str());
+ if (value != NULL) {
+ return value;
+ }
+
+ // As a special case for the 'output' flag, if GTEST_OUTPUT is not
+ // set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
+ // system. The value of XML_OUTPUT_FILE is a filename without the
+ // "xml:" prefix of GTEST_OUTPUT.
+ //
+ // The net priority order after flag processing is thus:
+ // --gtest_output command line flag
+ // GTEST_OUTPUT environment variable
+ // XML_OUTPUT_FILE environment variable
+ // 'default_value'
+ if (strcmp(flag, "output") == 0) {
+ value = posix::GetEnv("XML_OUTPUT_FILE");
+ if (value != NULL) {
+ return std::string("xml:") + value;
+ }
+ }
+ return default_value;
+}
+
+} // namespace internal
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc
new file mode 100644
index 000000000..a2df412f8
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-printers.cc
@@ -0,0 +1,373 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+// Google Test - The Google C++ Testing Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+// void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise. A user can override its behavior for a class
+// type Foo by defining either operator<<(::std::ostream&, const Foo&)
+// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include "gtest/gtest-printers.h"
+#include <ctype.h>
+#include <stdio.h>
+#include <cwchar>
+#include <ostream> // NOLINT
+#include <string>
+#include "gtest/internal/gtest-port.h"
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints a segment of bytes in the given object.
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start,
+ size_t count, ostream* os) {
+ char text[5] = "";
+ for (size_t i = 0; i != count; i++) {
+ const size_t j = start + i;
+ if (i != 0) {
+ // Organizes the bytes into groups of 2 for easy parsing by
+ // humans.
+ if ((j % 2) == 0)
+ *os << ' ';
+ else
+ *os << '-';
+ }
+ GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
+ *os << text;
+ }
+}
+
+// Prints the bytes in the given value to the given ostream.
+void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
+ ostream* os) {
+ // Tells the user how big the object is.
+ *os << count << "-byte object <";
+
+ const size_t kThreshold = 132;
+ const size_t kChunkSize = 64;
+ // If the object size is bigger than kThreshold, we'll have to omit
+ // some details by printing only the first and the last kChunkSize
+ // bytes.
+ // TODO(wan): let the user control the threshold using a flag.
+ if (count < kThreshold) {
+ PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+ } else {
+ PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+ *os << " ... ";
+ // Rounds up to 2-byte boundary.
+ const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+ PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
+ }
+ *os << ">";
+}
+
+} // namespace
+
+namespace internal2 {
+
+// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
+// given object. The delegation simplifies the implementation, which
+// uses the << operator and thus is easier done outside of the
+// ::testing::internal namespace, which contains a << operator that
+// sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
+ ostream* os) {
+ PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+} // namespace internal2
+
+namespace internal {
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+// - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+// - as a hexadecimal escape sequence (e.g. '\x7F'), or
+// - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat {
+ kAsIs,
+ kHexEscape,
+ kSpecialEscape
+};
+
+// Returns true if c is a printable ASCII character. We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) {
+ return 0x20 <= c && c <= 0x7E;
+}
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+ switch (static_cast<wchar_t>(c)) {
+ case L'\0':
+ *os << "\\0";
+ break;
+ case L'\'':
+ *os << "\\'";
+ break;
+ case L'\\':
+ *os << "\\\\";
+ break;
+ case L'\a':
+ *os << "\\a";
+ break;
+ case L'\b':
+ *os << "\\b";
+ break;
+ case L'\f':
+ *os << "\\f";
+ break;
+ case L'\n':
+ *os << "\\n";
+ break;
+ case L'\r':
+ *os << "\\r";
+ break;
+ case L'\t':
+ *os << "\\t";
+ break;
+ case L'\v':
+ *os << "\\v";
+ break;
+ default:
+ if (IsPrintableAscii(c)) {
+ *os << static_cast<char>(c);
+ return kAsIs;
+ } else {
+ *os << "\\x" + String::FormatHexInt(static_cast<UnsignedChar>(c));
+ return kHexEscape;
+ }
+ }
+ return kSpecialEscape;
+}
+
+// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+ switch (c) {
+ case L'\'':
+ *os << "'";
+ return kAsIs;
+ case L'"':
+ *os << "\\\"";
+ return kSpecialEscape;
+ default:
+ return PrintAsCharLiteralTo<wchar_t>(c, os);
+ }
+}
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
+ return PrintAsStringLiteralTo(
+ static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+}
+
+// Prints a wide or narrow character c and its code. '\0' is printed
+// as "'\\0'", other unprintable characters are also properly escaped
+// using the standard C++ escape sequence. The template argument
+// UnsignedChar is the unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+ // First, print c as a literal in the most readable form we can find.
+ *os << ((sizeof(c) > 1) ? "L'" : "'");
+ const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+ *os << "'";
+
+ // To aid user debugging, we also print c's code in decimal, unless
+ // it's 0 (in which case c was printed as '\\0', making the code
+ // obvious).
+ if (c == 0)
+ return;
+ *os << " (" << static_cast<int>(c);
+
+ // For more convenience, we print c's code again in hexadecimal,
+ // unless c was already printed in the form '\x##' or the code is in
+ // [1, 9].
+ if (format == kHexEscape || (1 <= c && c <= 9)) {
+ // Do nothing.
+ } else {
+ *os << ", 0x" << String::FormatHexInt(static_cast<UnsignedChar>(c));
+ }
+ *os << ")";
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) {
+ PrintCharAndCodeTo<unsigned char>(c, os);
+}
+void PrintTo(signed char c, ::std::ostream* os) {
+ PrintCharAndCodeTo<unsigned char>(c, os);
+}
+
+// Prints a wchar_t as a character literal if it is printable, escaping it
+// when necessary, and also prints its numeric code. L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+ PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream. CharType must be either
+// char or wchar_t.
+// The array starts at begin, its length is len, it may include '\0' characters,
+// and it need not be NUL-terminated.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+static void PrintCharsAsStringTo(
+ const CharType* begin, size_t len, ostream* os) {
+ const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
+ *os << kQuoteBegin;
+ bool is_previous_hex = false;
+ for (size_t index = 0; index < len; ++index) {
+ const CharType cur = begin[index];
+ if (is_previous_hex && IsXDigit(cur)) {
+ // Previous character is of '\x..' form and this character can be
+ // interpreted as another hexadecimal digit in its number. Break string to
+ // disambiguate.
+ *os << "\" " << kQuoteBegin;
+ }
+ is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
+ }
+ *os << "\"";
+}
+
+// Prints a (const) char/wchar_t array of 'len' elements, starting at address
+// 'begin'. CharType must be either char or wchar_t.
+template <typename CharType>
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+static void UniversalPrintCharArray(
+ const CharType* begin, size_t len, ostream* os) {
+ // The code
+ // const char kFoo[] = "foo";
+ // generates an array of 4, not 3, elements, with the last one being '\0'.
+ //
+ // Therefore when printing a char array, we don't print the last element if
+ // it's '\0', such that the output matches the string literal as it's
+ // written in the source code.
+ if (len > 0 && begin[len - 1] == '\0') {
+ PrintCharsAsStringTo(begin, len - 1, os);
+ return;
+ }
+
+ // If, however, the last element in the array is not '\0', e.g.
+ // const char kFoo[] = { 'f', 'o', 'o' };
+ // we must print the entire array. We also print a message to indicate
+ // that the array is not NUL-terminated.
+ PrintCharsAsStringTo(begin, len, os);
+ *os << " (no terminating NUL)";
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+ UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) wchar_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
+ UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints the given C string to the ostream.
+void PrintTo(const char* s, ostream* os) {
+ if (s == NULL) {
+ *os << "NULL";
+ } else {
+ *os << ImplicitCast_<const void*>(s) << " pointing to ";
+ PrintCharsAsStringTo(s, strlen(s), os);
+ }
+}
+
+// The MSVC compiler can be configured to define wchar_t as a typedef
+// of unsigned short. Defining an overload for const wchar_t* in that case
+// would cause pointers to unsigned shorts to be printed as wide strings,
+// possibly accessing more memory than intended and causing invalid
+// memory accesses. MSVC defines the _NATIVE_WCHAR_T_DEFINED symbol when
+// wchar_t is implemented as a native type.
+#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
+// Prints the given wide C string to the ostream.
+void PrintTo(const wchar_t* s, ostream* os) {
+ if (s == NULL) {
+ *os << "NULL";
+ } else {
+ *os << ImplicitCast_<const void*>(s) << " pointing to ";
+ PrintCharsAsStringTo(s, std::wcslen(s), os);
+ }
+}
+#endif // wchar_t is native
+
+// Prints a ::string object.
+#if GTEST_HAS_GLOBAL_STRING
+void PrintStringTo(const ::string& s, ostream* os) {
+ PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif // GTEST_HAS_GLOBAL_STRING
+
+void PrintStringTo(const ::std::string& s, ostream* os) {
+ PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+// Prints a ::wstring object.
+#if GTEST_HAS_GLOBAL_WSTRING
+void PrintWideStringTo(const ::wstring& s, ostream* os) {
+ PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif // GTEST_HAS_GLOBAL_WSTRING
+
+#if GTEST_HAS_STD_WSTRING
+void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
+ PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif // GTEST_HAS_STD_WSTRING
+
+} // namespace internal
+
+} // namespace testing
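
As the header comment of gtest-printers.cc notes, a user can make a custom type print readably by defining operator<< or PrintTo in the namespace that defines the type. A short example of the PrintTo route, using a hypothetical Point type and test name:

#include <ostream>
#include "gtest/gtest.h"

struct Point {
  int x;
  int y;
};

bool operator==(const Point& a, const Point& b) {
  return a.x == b.x && a.y == b.y;
}

// Found via argument-dependent lookup; Google Test uses it instead of
// dumping the raw bytes of Point when an assertion on Points fails.
void PrintTo(const Point& p, ::std::ostream* os) {
  *os << "(" << p.x << ", " << p.y << ")";
}

TEST(PointTest, ComparesAndPrintsReadably) {
  Point a = {1, 2};
  Point b = {1, 2};
  EXPECT_EQ(a, b);  // On failure, the values would print as "(x, y)".
}
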
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc
new file mode 100644
index 000000000..fb0e35425
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-test-part.cc
@@ -0,0 +1,110 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: mheule@google.com (Markus Heule)
+//
+// The Google C++ Testing Framework (Google Test)
+
+#include "gtest/gtest-test-part.h"
+
+// Indicates that this translation unit is part of Google Test's
+// implementation. It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error. This trick exists to
+// prevent the accidental inclusion of gtest-internal-inl.h in the
+// user's code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+namespace testing {
+
+using internal::GetUnitTestImpl;
+
+// Gets the summary of the failure message by omitting the stack trace
+// in it.
+std::string TestPartResult::ExtractSummary(const char* message) {
+ const char* const stack_trace = strstr(message, internal::kStackTraceMarker);
+ return stack_trace == NULL ? message :
+ std::string(message, stack_trace);
+}
+
+// Prints a TestPartResult object.
+std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
+ return os
+ << result.file_name() << ":" << result.line_number() << ": "
+ << (result.type() == TestPartResult::kSuccess ? "Success" :
+ result.type() == TestPartResult::kFatalFailure ? "Fatal failure" :
+ "Non-fatal failure") << ":\n"
+ << result.message() << std::endl;
+}
+
+// Appends a TestPartResult to the array.
+void TestPartResultArray::Append(const TestPartResult& result) {
+ array_.push_back(result);
+}
+
+// Returns the TestPartResult at the given index (0-based).
+const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const {
+ if (index < 0 || index >= size()) {
+ printf("\nInvalid index (%d) into TestPartResultArray.\n", index);
+ internal::posix::Abort();
+ }
+
+ return array_[index];
+}
+
+// Returns the number of TestPartResult objects in the array.
+int TestPartResultArray::size() const {
+ return static_cast<int>(array_.size());
+}
+
+namespace internal {
+
+HasNewFatalFailureHelper::HasNewFatalFailureHelper()
+ : has_new_fatal_failure_(false),
+ original_reporter_(GetUnitTestImpl()->
+ GetTestPartResultReporterForCurrentThread()) {
+ GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
+}
+
+HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
+ GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(
+ original_reporter_);
+}
+
+void HasNewFatalFailureHelper::ReportTestPartResult(
+ const TestPartResult& result) {
+ if (result.fatally_failed())
+ has_new_fatal_failure_ = true;
+ original_reporter_->ReportTestPartResult(result);
+}
+
+} // namespace internal
+
+} // namespace testing
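
HasNewFatalFailureHelper above installs itself as the per-thread result reporter so that a new fatal failure raised inside a called subroutine can be detected; in user code this surfaces through assertions such as EXPECT_NO_FATAL_FAILURE. A small illustrative use, with hypothetical function and test names:

#include "gtest/gtest.h"

static void CheckPositive(int value) {
  ASSERT_GT(value, 0) << "value must be positive";  // Fatal on failure.
}

TEST(SubroutineTest, FatalFailuresAreDetected) {
  // Relies on the helper above: the enclosing test fails if CheckPositive()
  // records a new fatal failure while it runs.
  EXPECT_NO_FATAL_FAILURE(CheckPositive(42));
}
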
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc
new file mode 100644
index 000000000..df1eef475
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest-typed-test.cc
@@ -0,0 +1,118 @@
+// Copyright 2008 Google Inc.
+// All Rights Reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+
+#include "gtest/gtest-typed-test.h"
+#include "gtest/gtest.h"
+
+namespace testing {
+namespace internal {
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// Skips to the first non-space char in str. Returns an empty string if str
+// contains only whitespace characters.
+static const char* SkipSpaces(const char* str) {
+ while (IsSpace(*str))
+ str++;
+ return str;
+}
+
+static std::vector<std::string> SplitIntoTestNames(const char* src) {
+ std::vector<std::string> name_vec;
+ src = SkipSpaces(src);
+ for (; src != NULL; src = SkipComma(src)) {
+ name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src)));
+ }
+ return name_vec;
+}
+
+// Verifies that registered_tests match the test names in
+// registered_tests_; returns registered_tests if successful, or
+// aborts the program otherwise.
+const char* TypedTestCasePState::VerifyRegisteredTestNames(
+ const char* file, int line, const char* registered_tests) {
+ typedef RegisteredTestsMap::const_iterator RegisteredTestIter;
+ registered_ = true;
+
+ std::vector<std::string> name_vec = SplitIntoTestNames(registered_tests);
+
+ Message errors;
+
+ std::set<std::string> tests;
+ for (std::vector<std::string>::const_iterator name_it = name_vec.begin();
+ name_it != name_vec.end(); ++name_it) {
+ const std::string& name = *name_it;
+ if (tests.count(name) != 0) {
+ errors << "Test " << name << " is listed more than once.\n";
+ continue;
+ }
+
+ bool found = false;
+ for (RegisteredTestIter it = registered_tests_.begin();
+ it != registered_tests_.end();
+ ++it) {
+ if (name == it->first) {
+ found = true;
+ break;
+ }
+ }
+
+ if (found) {
+ tests.insert(name);
+ } else {
+ errors << "No test named " << name
+ << " can be found in this test case.\n";
+ }
+ }
+
+ for (RegisteredTestIter it = registered_tests_.begin();
+ it != registered_tests_.end();
+ ++it) {
+ if (tests.count(it->first) == 0) {
+ errors << "You forgot to list test " << it->first << ".\n";
+ }
+ }
+
+ const std::string& errors_str = errors.GetString();
+ if (errors_str != "") {
+ fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(),
+ errors_str.c_str());
+ fflush(stderr);
+ posix::Abort();
+ }
+
+ return registered_tests;
+}
+
+#endif // GTEST_HAS_TYPED_TEST_P
+
+} // namespace internal
+} // namespace testing
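
VerifyRegisteredTestNames() above cross-checks the names passed to REGISTER_TYPED_TEST_CASE_P against the tests actually defined with TYPED_TEST_P, reporting duplicates, unknown names, and omissions. A sketch of the registration pattern it validates, with hypothetical fixture and test names:

#include <list>
#include <vector>
#include "gtest/gtest.h"

template <typename T>
class ContainerTest : public ::testing::Test {};

TYPED_TEST_CASE_P(ContainerTest);

TYPED_TEST_P(ContainerTest, StartsEmpty) {
  TypeParam c;
  EXPECT_TRUE(c.empty());
}

// The name list here must match the TYPED_TEST_P definitions above;
// mismatches trigger the "No test named ..." / "You forgot to list ..."
// diagnostics produced by VerifyRegisteredTestNames().
REGISTER_TYPED_TEST_CASE_P(ContainerTest, StartsEmpty);

typedef ::testing::Types<std::vector<int>, std::list<int> > ContainerTypes;
INSTANTIATE_TYPED_TEST_CASE_P(My, ContainerTest, ContainerTypes);
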
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc
new file mode 100644
index 000000000..5a8932c73
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest.cc
@@ -0,0 +1,5389 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Author: wan@google.com (Zhanyong Wan)
+//
+// The Google C++ Testing Framework (Google Test)
+
+#include "gtest/gtest.h"
+#include "gtest/internal/custom/gtest.h"
+#include "gtest/gtest-spi.h"
+
+#include <ctype.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+#include <list>
+#include <map>
+#include <ostream> // NOLINT
+#include <sstream>
+#include <vector>
+
+#if GTEST_OS_LINUX
+
+// TODO(kenton@google.com): Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+# include <fcntl.h> // NOLINT
+# include <limits.h> // NOLINT
+# include <sched.h> // NOLINT
+// Declares vsnprintf(). This header is not available on Windows.
+# include <strings.h> // NOLINT
+# include <sys/mman.h> // NOLINT
+# include <sys/time.h> // NOLINT
+# include <unistd.h> // NOLINT
+# include <string>
+
+#elif GTEST_OS_SYMBIAN
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h> // NOLINT
+
+#elif GTEST_OS_ZOS
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h> // NOLINT
+
+// On z/OS we additionally need strings.h for strcasecmp.
+# include <strings.h> // NOLINT
+
+#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE.
+
+# include <windows.h> // NOLINT
+# undef min
+
+#elif GTEST_OS_WINDOWS // We are on Windows proper.
+
+# include <io.h> // NOLINT
+# include <sys/timeb.h> // NOLINT
+# include <sys/types.h> // NOLINT
+# include <sys/stat.h> // NOLINT
+
+# if GTEST_OS_WINDOWS_MINGW
+// MinGW has gettimeofday() but not _ftime64().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+// gettimeofday().
+// TODO(kenton@google.com): There are other ways to get the time on
+// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW
+// supports these; consider using them instead.
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+# include <sys/time.h> // NOLINT
+# endif // GTEST_OS_WINDOWS_MINGW
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <windows.h> // NOLINT
+# undef min
+
+#else
+
+// Assume other platforms have gettimeofday().
+// TODO(kenton@google.com): Use autoconf to detect availability of
+// gettimeofday().
+# define GTEST_HAS_GETTIMEOFDAY_ 1
+
+// cpplint thinks that the header is already included, so we want to
+// silence it.
+# include <sys/time.h> // NOLINT
+# include <unistd.h> // NOLINT
+
+#endif // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+# include <stdexcept>
+#endif
+
+#if GTEST_CAN_STREAM_RESULTS_
+# include <arpa/inet.h> // NOLINT
+# include <netdb.h> // NOLINT
+# include <sys/socket.h> // NOLINT
+# include <sys/types.h> // NOLINT
+#endif
+
+// Indicates that this translation unit is part of Google Test's
+// implementation. It must come before gtest-internal-inl.h is
+// included, or there will be a compiler error. This trick exists to
+// prevent a user from accidentally including gtest-internal-inl.h in
+// their code.
+#define GTEST_IMPLEMENTATION_ 1
+#include "src/gtest-internal-inl.h"
+#undef GTEST_IMPLEMENTATION_
+
+#if GTEST_OS_WINDOWS
+# define vsnprintf _vsnprintf
+#endif // GTEST_OS_WINDOWS
+
+namespace testing {
+
+using internal::CountIf;
+using internal::ForEach;
+using internal::GetElementOr;
+using internal::Shuffle;
+
+// Constants.
+
+// A test whose test case name or test name matches this filter is
+// disabled and not run.
+static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*";
+
+// A test case whose name matches this filter is considered a death
+// test case and will be run before test cases whose name doesn't
+// match this filter.
+static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*";
+
+// A test filter that matches everything.
+static const char kUniversalFilter[] = "*";
+
+// The default output file for XML output.
+static const char kDefaultOutputFile[] = "test_detail.xml";
+
+// The environment variable name for the test shard index.
+static const char kTestShardIndex[] = "GTEST_SHARD_INDEX";
+// The environment variable name for the total number of test shards.
+static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS";
+// The environment variable name for the test shard status file.
+static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE";
+
+namespace internal {
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+const char kStackTraceMarker[] = "\nStack trace:\n";
+
+// g_help_flag is true iff the --help flag or an equivalent form is
+// specified on the command line.
+bool g_help_flag = false;
+
+} // namespace internal
+
+static const char* GetDefaultFilter() {
+#ifdef GTEST_TEST_FILTER_ENV_VAR_
+ const char* const testbridge_test_only = getenv(GTEST_TEST_FILTER_ENV_VAR_);
+ if (testbridge_test_only != NULL) {
+ return testbridge_test_only;
+ }
+#endif // GTEST_TEST_FILTER_ENV_VAR_
+ return kUniversalFilter;
+}
+
+GTEST_DEFINE_bool_(
+ also_run_disabled_tests,
+ internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+ "Run disabled tests too, in addition to the tests normally being run.");
+
+GTEST_DEFINE_bool_(
+ break_on_failure,
+ internal::BoolFromGTestEnv("break_on_failure", false),
+ "True iff a failed assertion should be a debugger break-point.");
+
+GTEST_DEFINE_bool_(
+ catch_exceptions,
+ internal::BoolFromGTestEnv("catch_exceptions", true),
+ "True iff " GTEST_NAME_
+ " should catch exceptions and treat them as test failures.");
+
+GTEST_DEFINE_string_(
+ color,
+ internal::StringFromGTestEnv("color", "auto"),
+ "Whether to use colors in the output. Valid values: yes, no, "
+ "and auto. 'auto' means to use colors if the output is "
+ "being sent to a terminal and the TERM environment variable "
+ "is set to a terminal type that supports colors.");
+
+GTEST_DEFINE_string_(
+ filter,
+ internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+ "A colon-separated list of glob (not regex) patterns "
+ "for filtering the tests to run, optionally followed by a "
+ "'-' and a : separated list of negative patterns (tests to "
+ "exclude). A test is run if it matches one of the positive "
+ "patterns and does not match any of the negative patterns.");
+
+GTEST_DEFINE_bool_(list_tests, false,
+ "List all tests without running them.");
+
+GTEST_DEFINE_string_(
+ output,
+ internal::StringFromGTestEnv("output", ""),
+ "A format (currently must be \"xml\"), optionally followed "
+ "by a colon and an output file name or directory. A directory "
+ "is indicated by a trailing pathname separator. "
+ "Examples: \"xml:filename.xml\", \"xml::directoryname/\". "
+ "If a directory is specified, output files will be created "
+ "within that directory, with file-names based on the test "
+ "executable's name and, if necessary, made unique by adding "
+ "digits.");
+
+GTEST_DEFINE_bool_(
+ print_time,
+ internal::BoolFromGTestEnv("print_time", true),
+ "True iff " GTEST_NAME_
+ " should display elapsed time in text output.");
+
+GTEST_DEFINE_int32_(
+ random_seed,
+ internal::Int32FromGTestEnv("random_seed", 0),
+ "Random number seed to use when shuffling test orders. Must be in range "
+ "[1, 99999], or 0 to use a seed based on the current time.");
+
+GTEST_DEFINE_int32_(
+ repeat,
+ internal::Int32FromGTestEnv("repeat", 1),
+ "How many times to repeat each test. Specify a negative number "
+ "for repeating forever. Useful for shaking out flaky tests.");
+
+GTEST_DEFINE_bool_(
+ show_internal_stack_frames, false,
+ "True iff " GTEST_NAME_ " should include internal stack frames when "
+ "printing test failure stack traces.");
+
+GTEST_DEFINE_bool_(
+ shuffle,
+ internal::BoolFromGTestEnv("shuffle", false),
+ "True iff " GTEST_NAME_
+ " should randomize tests' order on every run.");
+
+GTEST_DEFINE_int32_(
+ stack_trace_depth,
+ internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+ "The maximum number of stack frames to print when an "
+ "assertion fails. The valid range is 0 through 100, inclusive.");
+
+GTEST_DEFINE_string_(
+ stream_result_to,
+ internal::StringFromGTestEnv("stream_result_to", ""),
+ "This flag specifies the host name and the port number on which to stream "
+ "test results. Example: \"localhost:555\". The flag is effective only on "
+ "Linux.");
+
+GTEST_DEFINE_bool_(
+ throw_on_failure,
+ internal::BoolFromGTestEnv("throw_on_failure", false),
+ "When this flag is specified, a failed assertion will throw an exception "
+ "if exceptions are enabled or exit the program with a non-zero code "
+ "otherwise.");
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+GTEST_DEFINE_string_(
+ flagfile,
+ internal::StringFromGTestEnv("flagfile", ""),
+ "This flag specifies the flagfile to read command-line flags from.");
+#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+namespace internal {
+
+// Generates a random number from [0, range), using a Linear
+// Congruential Generator (LCG). Crashes if 'range' is 0 or greater
+// than kMaxRange.
+GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_
+UInt32 Random::Generate(UInt32 range) {
+ // These constants are the same as are used in glibc's rand(3).
+ state_ = (1103515245U*state_ + 12345U) % kMaxRange;
+
+ GTEST_CHECK_(range > 0)
+ << "Cannot generate a number in the range [0, 0).";
+ GTEST_CHECK_(range <= kMaxRange)
+ << "Generation of a number in [0, " << range << ") was requested, "
+ << "but this can only generate numbers in [0, " << kMaxRange << ").";
+
+ // Converting via modulus introduces a bit of downward bias, but
+ // it's simple, and a linear congruential generator isn't too good
+ // to begin with.
+ return state_ % range;
+}
+
+// GTestIsInitialized() returns true iff the user has initialized
+// Google Test. Useful for catching the user mistake of not initializing
+// Google Test before calling RUN_ALL_TESTS().
+static bool GTestIsInitialized() { return GetArgvs().size() > 0; }
+
+// Iterates over a vector of TestCases, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+ int (TestCase::*method)() const) {
+ int sum = 0;
+ for (size_t i = 0; i < case_list.size(); i++) {
+ sum += (case_list[i]->*method)();
+ }
+ return sum;
+}
+
+// Returns true iff the test case passed.
+static bool TestCasePassed(const TestCase* test_case) {
+ return test_case->should_run() && test_case->Passed();
+}
+
+// Returns true iff the test case failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+ return test_case->should_run() && test_case->Failed();
+}
+
+// Returns true iff test_case contains at least one test that should
+// run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+ return test_case->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+ const char* file,
+ int line,
+ const char* message)
+ : data_(new AssertHelperData(type, file, line, message)) {
+}
+
+AssertHelper::~AssertHelper() {
+ delete data_;
+}
+
+// Message assignment, for assertion streaming support.
+void AssertHelper::operator=(const Message& message) const {
+ UnitTest::GetInstance()->
+ AddTestPartResult(data_->type, data_->file, data_->line,
+ AppendUserMessage(data_->message, message),
+ UnitTest::GetInstance()->impl()
+ ->CurrentOsStackTraceExceptTop(1)
+ // Skips the stack frame for this function itself.
+ ); // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// A copy of all command line arguments. Set by InitGoogleTest().
+::std::vector<testing::internal::string> g_argvs;
+
+const ::std::vector<testing::internal::string>& GetArgvs() {
+#if defined(GTEST_CUSTOM_GET_ARGVS_)
+ return GTEST_CUSTOM_GET_ARGVS_();
+#else // defined(GTEST_CUSTOM_GET_ARGVS_)
+ return g_argvs;
+#endif // defined(GTEST_CUSTOM_GET_ARGVS_)
+}
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() {
+ FilePath result;
+
+#if GTEST_OS_WINDOWS
+ result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe"));
+#else
+ result.Set(FilePath(GetArgvs()[0]));
+#endif // GTEST_OS_WINDOWS
+
+ return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+ const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+ if (gtest_output_flag == NULL) return std::string("");
+
+ const char* const colon = strchr(gtest_output_flag, ':');
+ return (colon == NULL) ?
+ std::string(gtest_output_flag) :
+ std::string(gtest_output_flag, colon - gtest_output_flag);
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+ const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+ if (gtest_output_flag == NULL)
+ return "";
+
+ const char* const colon = strchr(gtest_output_flag, ':');
+ if (colon == NULL)
+ return internal::FilePath::ConcatPaths(
+ internal::FilePath(
+ UnitTest::GetInstance()->original_working_dir()),
+ internal::FilePath(kDefaultOutputFile)).string();
+
+ internal::FilePath output_name(colon + 1);
+ if (!output_name.IsAbsolutePath())
+ // TODO(wan@google.com): on Windows \some\path is not an absolute
+ // path (as its meaning depends on the current drive), yet the
+ // following logic for turning it into an absolute path is wrong.
+ // Fix it.
+ output_name = internal::FilePath::ConcatPaths(
+ internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+ internal::FilePath(colon + 1));
+
+ if (!output_name.IsDirectory())
+ return output_name.string();
+
+ internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+ output_name, internal::GetCurrentExecutableName(),
+ GetOutputFormat().c_str()));
+ return result.string();
+}
+
+// Returns true iff the wildcard pattern matches the string. The
+// first ':' or '\0' character in pattern marks the end of it.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+ const char *str) {
+ switch (*pattern) {
+ case '\0':
+ case ':': // Either ':' or '\0' marks the end of the pattern.
+ return *str == '\0';
+ case '?': // Matches any single character.
+ return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+ case '*': // Matches any string (possibly empty) of characters.
+ return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
+ PatternMatchesString(pattern + 1, str);
+ default: // Non-special character. Matches itself.
+ return *pattern == *str &&
+ PatternMatchesString(pattern + 1, str + 1);
+ }
+}
+
+bool UnitTestOptions::MatchesFilter(
+ const std::string& name, const char* filter) {
+ const char *cur_pattern = filter;
+ for (;;) {
+ if (PatternMatchesString(cur_pattern, name.c_str())) {
+ return true;
+ }
+
+ // Finds the next pattern in the filter.
+ cur_pattern = strchr(cur_pattern, ':');
+
+ // Returns if no more pattern can be found.
+ if (cur_pattern == NULL) {
+ return false;
+ }
+
+ // Skips the pattern separator (the ':' character).
+ cur_pattern++;
+ }
+}
+
+// Returns true iff the user-specified filter matches the test case
+// name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_case_name,
+ const std::string &test_name) {
+ const std::string& full_name = test_case_name + "." + test_name.c_str();
+
+ // Split --gtest_filter at '-', if there is one, to separate into
+ // positive filter and negative filter portions
+ const char* const p = GTEST_FLAG(filter).c_str();
+ const char* const dash = strchr(p, '-');
+ std::string positive;
+ std::string negative;
+ if (dash == NULL) {
+ positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter
+ negative = "";
+ } else {
+ positive = std::string(p, dash); // Everything up to the dash
+ negative = std::string(dash + 1); // Everything after the dash
+ if (positive.empty()) {
+ // Treat '-test1' as the same as '*-test1'
+ positive = kUniversalFilter;
+ }
+ }
+
+ // A filter is a colon-separated list of patterns. It matches a
+ // test if any pattern in it matches the test.
+ return (MatchesFilter(full_name, positive.c_str()) &&
+ !MatchesFilter(full_name, negative.c_str()));
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+ // Google Test should handle a SEH exception if:
+ // 1. the user wants it to, AND
+ // 2. this is not a breakpoint exception, AND
+ // 3. this is not a C++ exception (VC++ implements them via SEH,
+ // apparently).
+ //
+ // SEH exception code for C++ exceptions.
+ // (see http://support.microsoft.com/kb/185294 for more information).
+ const DWORD kCxxExceptionCode = 0xe06d7363;
+
+ bool should_handle = true;
+
+ if (!GTEST_FLAG(catch_exceptions))
+ should_handle = false;
+ else if (exception_code == EXCEPTION_BREAKPOINT)
+ should_handle = false;
+ else if (exception_code == kCxxExceptionCode)
+ should_handle = false;
+
+ return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
+}
+#endif // GTEST_HAS_SEH
+
+} // namespace internal
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test. The 'result' parameter specifies where to report the
+// results. Intercepts only failures from the current thread.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+ TestPartResultArray* result)
+ : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
+ result_(result) {
+ Init();
+}
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test. The 'result' parameter specifies where to report the
+// results.
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
+ InterceptMode intercept_mode, TestPartResultArray* result)
+ : intercept_mode_(intercept_mode),
+ result_(result) {
+ Init();
+}
+
+void ScopedFakeTestPartResultReporter::Init() {
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+ old_reporter_ = impl->GetGlobalTestPartResultReporter();
+ impl->SetGlobalTestPartResultReporter(this);
+ } else {
+ old_reporter_ = impl->GetTestPartResultReporterForCurrentThread();
+ impl->SetTestPartResultReporterForCurrentThread(this);
+ }
+}
+
+// The d'tor restores the test part result reporter used by Google Test
+// before.
+ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() {
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ if (intercept_mode_ == INTERCEPT_ALL_THREADS) {
+ impl->SetGlobalTestPartResultReporter(old_reporter_);
+ } else {
+ impl->SetTestPartResultReporterForCurrentThread(old_reporter_);
+ }
+}
+
+// Increments the test part result count and remembers the result.
+// This method is from the TestPartResultReporterInterface interface.
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+ const TestPartResult& result) {
+ result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test. We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test. This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X. The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code. GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from the
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() {
+ return GetTypeId<Test>();
+}
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library. This is solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// This predicate-formatter checks that 'results' contains a test part
+// failure of the given type and that the failure message contains the
+// given substring.
+AssertionResult HasOneFailure(const char* /* results_expr */,
+ const char* /* type_expr */,
+ const char* /* substr_expr */,
+ const TestPartResultArray& results,
+ TestPartResult::Type type,
+ const string& substr) {
+ const std::string expected(type == TestPartResult::kFatalFailure ?
+ "1 fatal failure" :
+ "1 non-fatal failure");
+ Message msg;
+ if (results.size() != 1) {
+ msg << "Expected: " << expected << "\n"
+ << " Actual: " << results.size() << " failures";
+ for (int i = 0; i < results.size(); i++) {
+ msg << "\n" << results.GetTestPartResult(i);
+ }
+ return AssertionFailure() << msg;
+ }
+
+ const TestPartResult& r = results.GetTestPartResult(0);
+ if (r.type() != type) {
+ return AssertionFailure() << "Expected: " << expected << "\n"
+ << " Actual:\n"
+ << r;
+ }
+
+ if (strstr(r.message(), substr.c_str()) == NULL) {
+ return AssertionFailure() << "Expected: " << expected << " containing \""
+ << substr << "\"\n"
+ << " Actual:\n"
+ << r;
+ }
+
+ return AssertionSuccess();
+}
+
+// The constructor of SingleFailureChecker remembers where to look up
+// test part results, what type of failure we expect, and what
+// substring the failure message should contain.
+SingleFailureChecker::SingleFailureChecker(
+ const TestPartResultArray* results,
+ TestPartResult::Type type,
+ const string& substr)
+ : results_(results),
+ type_(type),
+ substr_(substr) {}
+
+// The destructor of SingleFailureChecker verifies that the given
+// TestPartResultArray contains exactly one failure that has the given
+// type and contains the given substring. If that's not the case, a
+// non-fatal failure will be generated.
+SingleFailureChecker::~SingleFailureChecker() {
+ EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_);
+}
+
+DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
+ UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
+ const TestPartResult& result) {
+ unit_test_->current_test_result()->AddTestPartResult(result);
+ unit_test_->listeners()->repeater()->OnTestPartResult(result);
+}
+
+DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
+ UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+
+void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
+ const TestPartResult& result) {
+ unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result);
+}
+
+// Returns the global test part result reporter.
+TestPartResultReporterInterface*
+UnitTestImpl::GetGlobalTestPartResultReporter() {
+ internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+ return global_test_part_result_repoter_;
+}
+
+// Sets the global test part result reporter.
+void UnitTestImpl::SetGlobalTestPartResultReporter(
+ TestPartResultReporterInterface* reporter) {
+ internal::MutexLock lock(&global_test_part_result_reporter_mutex_);
+ global_test_part_result_repoter_ = reporter;
+}
+
+// Returns the test part result reporter for the current thread.
+TestPartResultReporterInterface*
+UnitTestImpl::GetTestPartResultReporterForCurrentThread() {
+ return per_thread_test_part_result_reporter_.get();
+}
+
+// Sets the test part result reporter for the current thread.
+void UnitTestImpl::SetTestPartResultReporterForCurrentThread(
+ TestPartResultReporterInterface* reporter) {
+ per_thread_test_part_result_reporter_.set(reporter);
+}
+
+// Gets the number of successful test cases.
+int UnitTestImpl::successful_test_case_count() const {
+ return CountIf(test_cases_, TestCasePassed);
+}
+
+// Gets the number of failed test cases.
+int UnitTestImpl::failed_test_case_count() const {
+ return CountIf(test_cases_, TestCaseFailed);
+}
+
+// Gets the number of all test cases.
+int UnitTestImpl::total_test_case_count() const {
+ return static_cast<int>(test_cases_.size());
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTestImpl::test_case_to_run_count() const {
+ return CountIf(test_cases_, ShouldRunTestCase);
+}
+
+// Gets the number of successful tests.
+int UnitTestImpl::successful_test_count() const {
+ return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+}
+
+// Gets the number of failed tests.
+int UnitTestImpl::failed_test_count() const {
+ return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTestImpl::reportable_disabled_test_count() const {
+ return SumOverTestCaseList(test_cases_,
+ &TestCase::reportable_disabled_test_count);
+}
+
+// Gets the number of disabled tests.
+int UnitTestImpl::disabled_test_count() const {
+ return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTestImpl::reportable_test_count() const {
+ return SumOverTestCaseList(test_cases_, &TestCase::reportable_test_count);
+}
+
+// Gets the number of all tests.
+int UnitTestImpl::total_test_count() const {
+ return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+}
+
+// Gets the number of tests that should run.
+int UnitTestImpl::test_to_run_count() const {
+ return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag. The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+ return os_stack_trace_getter()->CurrentStackTrace(
+ static_cast<int>(GTEST_FLAG(stack_trace_depth)),
+ skip_count + 1
+ // Skips the user-specified number of frames plus this function
+ // itself.
+ ); // NOLINT
+}
+
+// Returns the current time in milliseconds.
+TimeInMillis GetTimeInMillis() {
+#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
+ // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
+ // http://analogous.blogspot.com/2005/04/epoch.html
+ const TimeInMillis kJavaEpochToWinFileTimeDelta =
+ static_cast<TimeInMillis>(116444736UL) * 100000UL;
+ const DWORD kTenthMicrosInMilliSecond = 10000;
+
+ SYSTEMTIME now_systime;
+ FILETIME now_filetime;
+ ULARGE_INTEGER now_int64;
+ // TODO(kenton@google.com): Shouldn't this just use
+ // GetSystemTimeAsFileTime()?
+ GetSystemTime(&now_systime);
+ if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
+ now_int64.LowPart = now_filetime.dwLowDateTime;
+ now_int64.HighPart = now_filetime.dwHighDateTime;
+ now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
+ kJavaEpochToWinFileTimeDelta;
+ return now_int64.QuadPart;
+ }
+ return 0;
+#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
+ __timeb64 now;
+
+ // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
+ // (deprecated function) there.
+ // TODO(kenton@google.com): Use GetTickCount()? Or use
+ // SystemTimeToFileTime()
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+ _ftime64(&now);
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+
+ return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
+#elif GTEST_HAS_GETTIMEOFDAY_
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
+#else
+# error "Don't know how to get the current time on your system."
+#endif
+}
+
+// Utilities
+
+// class String.
+
+#if GTEST_OS_WINDOWS_MOBILE
+// Creates a UTF-16 wide string from the given ANSI string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the wide string, or NULL if the
+// input is NULL.
+LPCWSTR String::AnsiToUtf16(const char* ansi) {
+ if (!ansi) return NULL;
+ const int length = strlen(ansi);
+ const int unicode_length =
+ MultiByteToWideChar(CP_ACP, 0, ansi, length,
+ NULL, 0);
+ WCHAR* unicode = new WCHAR[unicode_length + 1];
+ MultiByteToWideChar(CP_ACP, 0, ansi, length,
+ unicode, unicode_length);
+ unicode[unicode_length] = 0;
+ return unicode;
+}
+
+// Creates an ANSI string from the given wide string, allocating
+// memory using new. The caller is responsible for deleting the return
+// value using delete[]. Returns the ANSI string, or NULL if the
+// input is NULL.
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
+ if (!utf16_str) return NULL;
+ const int ansi_length =
+ WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+ NULL, 0, NULL, NULL);
+ char* ansi = new char[ansi_length + 1];
+ WideCharToMultiByte(CP_ACP, 0, utf16_str, -1,
+ ansi, ansi_length, NULL, NULL);
+ ansi[ansi_length] = 0;
+ return ansi;
+}
+
+#endif // GTEST_OS_WINDOWS_MOBILE
+
+// Compares two C strings. Returns true iff they have the same content.
+//
+// Unlike strcmp(), this function can handle NULL argument(s). A NULL
+// C string is considered different from any non-NULL C string,
+// including the empty string.
+bool String::CStringEquals(const char * lhs, const char * rhs) {
+ if ( lhs == NULL ) return rhs == NULL;
+
+ if ( rhs == NULL ) return false;
+
+ return strcmp(lhs, rhs) == 0;
+}
+
+#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+// Converts an array of wide chars to a narrow string using the UTF-8
+// encoding, and streams the result to the given Message object.
+static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
+ Message* msg) {
+ for (size_t i = 0; i != length; ) { // NOLINT
+ if (wstr[i] != L'\0') {
+ *msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
+ while (i != length && wstr[i] != L'\0')
+ i++;
+ } else {
+ *msg << '\0';
+ i++;
+ }
+ }
+}
+
+#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING
+
+void SplitString(const ::std::string& str, char delimiter,
+ ::std::vector< ::std::string>* dest) {
+ ::std::vector< ::std::string> parsed;
+ ::std::string::size_type pos = 0;
+ while (::testing::internal::AlwaysTrue()) {
+ const ::std::string::size_type colon = str.find(delimiter, pos);
+ if (colon == ::std::string::npos) {
+ parsed.push_back(str.substr(pos));
+ break;
+ } else {
+ parsed.push_back(str.substr(pos, colon - pos));
+ pos = colon + 1;
+ }
+ }
+ dest->swap(parsed);
+}
+
+} // namespace internal
+
+// Constructs an empty Message.
+// We allocate the stringstream separately because otherwise each use of
+// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's
+// stack frame, leading to huge stack frames in some cases; gcc does not reuse
+// the stack space.
+Message::Message() : ss_(new ::std::stringstream) {
+ // By default, we want there to be enough precision when printing
+ // a double to a Message.
+ *ss_ << std::setprecision(std::numeric_limits<double>::digits10 + 2);
+}
+
+// These two overloads allow streaming a wide C string to a Message
+// using the UTF-8 encoding.
+Message& Message::operator <<(const wchar_t* wide_c_str) {
+ return *this << internal::String::ShowWideCString(wide_c_str);
+}
+Message& Message::operator <<(wchar_t* wide_c_str) {
+ return *this << internal::String::ShowWideCString(wide_c_str);
+}
+
+#if GTEST_HAS_STD_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::std::wstring& wstr) {
+ internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+ return *this;
+}
+#endif // GTEST_HAS_STD_WSTRING
+
+#if GTEST_HAS_GLOBAL_WSTRING
+// Converts the given wide string to a narrow string using the UTF-8
+// encoding, and streams the result to this Message object.
+Message& Message::operator <<(const ::wstring& wstr) {
+ internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
+ return *this;
+}
+#endif // GTEST_HAS_GLOBAL_WSTRING
+
+// Gets the text streamed to this object so far as an std::string.
+// Each '\0' character in the buffer is replaced with "\\0".
+std::string Message::GetString() const {
+ return internal::StringStreamToString(ss_.get());
+}
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+ : success_(other.success_),
+ message_(other.message_.get() != NULL ?
+ new ::std::string(*other.message_) :
+ static_cast< ::std::string*>(NULL)) {
+}
+
+// Swaps two AssertionResults.
+void AssertionResult::swap(AssertionResult& other) {
+ using std::swap;
+ swap(success_, other.success_);
+ swap(message_, other.message_);
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+ AssertionResult negation(!success_);
+ if (message_.get() != NULL)
+ negation << *message_;
+ return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() {
+ return AssertionResult(true);
+}
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() {
+ return AssertionResult(false);
+}
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+ return AssertionFailure() << message;
+}
+
+namespace internal {
+
+namespace edit_distance {
+std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
+ const std::vector<size_t>& right) {
+ std::vector<std::vector<double> > costs(
+ left.size() + 1, std::vector<double>(right.size() + 1));
+ std::vector<std::vector<EditType> > best_move(
+ left.size() + 1, std::vector<EditType>(right.size() + 1));
+
+ // Populate for empty right.
+ for (size_t l_i = 0; l_i < costs.size(); ++l_i) {
+ costs[l_i][0] = static_cast<double>(l_i);
+ best_move[l_i][0] = kRemove;
+ }
+ // Populate for empty left.
+ for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) {
+ costs[0][r_i] = static_cast<double>(r_i);
+ best_move[0][r_i] = kAdd;
+ }
+
+ for (size_t l_i = 0; l_i < left.size(); ++l_i) {
+ for (size_t r_i = 0; r_i < right.size(); ++r_i) {
+ if (left[l_i] == right[r_i]) {
+ // Found a match. Consume it.
+ costs[l_i + 1][r_i + 1] = costs[l_i][r_i];
+ best_move[l_i + 1][r_i + 1] = kMatch;
+ continue;
+ }
+
+ const double add = costs[l_i + 1][r_i];
+ const double remove = costs[l_i][r_i + 1];
+ const double replace = costs[l_i][r_i];
+ if (add < remove && add < replace) {
+ costs[l_i + 1][r_i + 1] = add + 1;
+ best_move[l_i + 1][r_i + 1] = kAdd;
+ } else if (remove < add && remove < replace) {
+ costs[l_i + 1][r_i + 1] = remove + 1;
+ best_move[l_i + 1][r_i + 1] = kRemove;
+ } else {
+ // We make replace a little more expensive than add/remove to lower
+ // its priority.
+ costs[l_i + 1][r_i + 1] = replace + 1.00001;
+ best_move[l_i + 1][r_i + 1] = kReplace;
+ }
+ }
+ }
+
+ // Reconstruct the best path. We do it in reverse order.
+ std::vector<EditType> best_path;
+ for (size_t l_i = left.size(), r_i = right.size(); l_i > 0 || r_i > 0;) {
+ EditType move = best_move[l_i][r_i];
+ best_path.push_back(move);
+ l_i -= move != kAdd;
+ r_i -= move != kRemove;
+ }
+ std::reverse(best_path.begin(), best_path.end());
+ return best_path;
+}
+
+namespace {
+
+// Helper class to convert strings into ids, with deduplication.
+class InternalStrings {
+ public:
+ size_t GetId(const std::string& str) {
+ IdMap::iterator it = ids_.find(str);
+ if (it != ids_.end()) return it->second;
+ size_t id = ids_.size();
+ return ids_[str] = id;
+ }
+
+ private:
+ typedef std::map<std::string, size_t> IdMap;
+ IdMap ids_;
+};
+
+} // namespace
+
+std::vector<EditType> CalculateOptimalEdits(
+ const std::vector<std::string>& left,
+ const std::vector<std::string>& right) {
+ std::vector<size_t> left_ids, right_ids;
+ {
+ InternalStrings intern_table;
+ for (size_t i = 0; i < left.size(); ++i) {
+ left_ids.push_back(intern_table.GetId(left[i]));
+ }
+ for (size_t i = 0; i < right.size(); ++i) {
+ right_ids.push_back(intern_table.GetId(right[i]));
+ }
+ }
+ return CalculateOptimalEdits(left_ids, right_ids);
+}
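+
+// Illustrative sketch of how the overload above can be used, assuming the
+// inputs have already been split into lines:
+//   std::vector<std::string> left, right;
+//   left.push_back("a");  left.push_back("b");
+//   right.push_back("a"); right.push_back("c");
+//   // edits == {kMatch, kReplace}: "a" matches, "b" is replaced by "c".
+//   std::vector<EditType> edits = CalculateOptimalEdits(left, right);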
+
+namespace {
+
+// Helper class that holds the state for one hunk and prints it out to the
+// stream.
+// It reorders adds/removes when possible to group all removes before all
+// adds. It also adds the hunk header before printing into the stream.
+class Hunk {
+ public:
+ Hunk(size_t left_start, size_t right_start)
+ : left_start_(left_start),
+ right_start_(right_start),
+ adds_(),
+ removes_(),
+ common_() {}
+
+ void PushLine(char edit, const char* line) {
+ switch (edit) {
+ case ' ':
+ ++common_;
+ FlushEdits();
+ hunk_.push_back(std::make_pair(' ', line));
+ break;
+ case '-':
+ ++removes_;
+ hunk_removes_.push_back(std::make_pair('-', line));
+ break;
+ case '+':
+ ++adds_;
+ hunk_adds_.push_back(std::make_pair('+', line));
+ break;
+ }
+ }
+
+ void PrintTo(std::ostream* os) {
+ PrintHeader(os);
+ FlushEdits();
+ for (std::list<std::pair<char, const char*> >::const_iterator it =
+ hunk_.begin();
+ it != hunk_.end(); ++it) {
+ *os << it->first << it->second << "\n";
+ }
+ }
+
+ bool has_edits() const { return adds_ || removes_; }
+
+ private:
+ void FlushEdits() {
+ hunk_.splice(hunk_.end(), hunk_removes_);
+ hunk_.splice(hunk_.end(), hunk_adds_);
+ }
+
+ // Print a unified diff header for one hunk.
+ // The format is
+ // "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
+// where the left/right parts are omitted if unnecessary.
+ void PrintHeader(std::ostream* ss) const {
+ *ss << "@@ ";
+ if (removes_) {
+ *ss << "-" << left_start_ << "," << (removes_ + common_);
+ }
+ if (removes_ && adds_) {
+ *ss << " ";
+ }
+ if (adds_) {
+ *ss << "+" << right_start_ << "," << (adds_ + common_);
+ }
+ *ss << " @@\n";
+ }
+
+ size_t left_start_, right_start_;
+ size_t adds_, removes_, common_;
+ std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
+};
+
+} // namespace
+
+// Create a list of diff hunks in Unified diff format.
+// Each hunk has a header generated by PrintHeader above plus a body with
+// lines prefixed with ' ' for no change, '-' for deletion and '+' for
+// addition.
+// 'context' represents the desired unchanged prefix/suffix around the diff.
+// If two hunks are close enough that their contexts overlap, then they are
+// joined into one hunk.
+std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+ const std::vector<std::string>& right,
+ size_t context) {
+ const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
+
+ size_t l_i = 0, r_i = 0, edit_i = 0;
+ std::stringstream ss;
+ while (edit_i < edits.size()) {
+ // Find first edit.
+ while (edit_i < edits.size() && edits[edit_i] == kMatch) {
+ ++l_i;
+ ++r_i;
+ ++edit_i;
+ }
+
+ // Find the first line to include in the hunk.
+ const size_t prefix_context = std::min(l_i, context);
+ Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
+ for (size_t i = prefix_context; i > 0; --i) {
+ hunk.PushLine(' ', left[l_i - i].c_str());
+ }
+
+ // Iterate over the edits until we have found enough suffix for the hunk or
+ // the input is exhausted.
+ size_t n_suffix = 0;
+ for (; edit_i < edits.size(); ++edit_i) {
+ if (n_suffix >= context) {
+ // Continue only if the next hunk is very close.
+ std::vector<EditType>::const_iterator it = edits.begin() + edit_i;
+ while (it != edits.end() && *it == kMatch) ++it;
+ if (it == edits.end() || (it - edits.begin()) - edit_i >= context) {
+ // There is no next edit or it is too far away.
+ break;
+ }
+ }
+
+ EditType edit = edits[edit_i];
+ // Reset count when a non match is found.
+ n_suffix = edit == kMatch ? n_suffix + 1 : 0;
+
+ if (edit == kMatch || edit == kRemove || edit == kReplace) {
+ hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str());
+ }
+ if (edit == kAdd || edit == kReplace) {
+ hunk.PushLine('+', right[r_i].c_str());
+ }
+
+ // Advance indices, depending on edit type.
+ l_i += edit != kAdd;
+ r_i += edit != kRemove;
+ }
+
+ if (!hunk.has_edits()) {
+ // We are done. We don't want this hunk.
+ break;
+ }
+
+ hunk.PrintTo(&ss);
+ }
+ return ss.str();
+}
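+
+// Illustrative sketch: for left == {"a", "b"} and right == {"a", "c"},
+// CreateUnifiedDiff(left, right, 2) yields a single hunk:
+//   @@ -1,2 +1,2 @@
+//    a
+//   -b
+//   +c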
+
+} // namespace edit_distance
+
+namespace {
+
+// The string representations of the values received in EqFailure() are
+// already escaped. Split them on escaped '\n' boundaries. Leave all other
+// escaped characters the same.
+std::vector<std::string> SplitEscapedString(const std::string& str) {
+ std::vector<std::string> lines;
+ size_t start = 0, end = str.size();
+ if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
+ ++start;
+ --end;
+ }
+ bool escaped = false;
+ for (size_t i = start; i + 1 < end; ++i) {
+ if (escaped) {
+ escaped = false;
+ if (str[i] == 'n') {
+ lines.push_back(str.substr(start, i - start - 1));
+ start = i + 1;
+ }
+ } else {
+ escaped = str[i] == '\\';
+ }
+ }
+ lines.push_back(str.substr(start, end - start));
+ return lines;
+}
+
+} // namespace
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings. For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+// lhs_expression: "foo"
+// rhs_expression: "bar"
+// lhs_value: "5"
+// rhs_value: "6"
+//
+// The ignoring_case parameter is true iff the assertion is a
+// *_STRCASEEQ*. When it's true, the string "Ignoring case" will
+// be inserted into the message.
+AssertionResult EqFailure(const char* lhs_expression,
+ const char* rhs_expression,
+ const std::string& lhs_value,
+ const std::string& rhs_value,
+ bool ignoring_case) {
+ Message msg;
+ msg << " Expected: " << lhs_expression;
+ if (lhs_value != lhs_expression) {
+ msg << "\n Which is: " << lhs_value;
+ }
+ msg << "\nTo be equal to: " << rhs_expression;
+ if (rhs_value != rhs_expression) {
+ msg << "\n Which is: " << rhs_value;
+ }
+
+ if (ignoring_case) {
+ msg << "\nIgnoring case";
+ }
+
+ if (!lhs_value.empty() && !rhs_value.empty()) {
+ const std::vector<std::string> lhs_lines =
+ SplitEscapedString(lhs_value);
+ const std::vector<std::string> rhs_lines =
+ SplitEscapedString(rhs_value);
+ if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
+ msg << "\nWith diff:\n"
+ << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
+ }
+ }
+
+ return AssertionFailure() << msg;
+}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+std::string GetBoolAssertionFailureMessage(
+ const AssertionResult& assertion_result,
+ const char* expression_text,
+ const char* actual_predicate_value,
+ const char* expected_predicate_value) {
+ const char* actual_message = assertion_result.message();
+ Message msg;
+ msg << "Value of: " << expression_text
+ << "\n Actual: " << actual_predicate_value;
+ if (actual_message[0] != '\0')
+ msg << " (" << actual_message << ")";
+ msg << "\nExpected: " << expected_predicate_value;
+ return msg.GetString();
+}
+
+// Helper function for implementing ASSERT_NEAR.
+AssertionResult DoubleNearPredFormat(const char* expr1,
+ const char* expr2,
+ const char* abs_error_expr,
+ double val1,
+ double val2,
+ double abs_error) {
+ const double diff = fabs(val1 - val2);
+ if (diff <= abs_error) return AssertionSuccess();
+
+ // TODO(wan): do not print the value of an expression if it's
+ // already a literal.
+ return AssertionFailure()
+ << "The difference between " << expr1 << " and " << expr2
+ << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
+ << expr1 << " evaluates to " << val1 << ",\n"
+ << expr2 << " evaluates to " << val2 << ", and\n"
+ << abs_error_expr << " evaluates to " << abs_error << ".";
+}
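+
+// Illustrative usage sketch of the public macros built on this helper:
+//   EXPECT_NEAR(2.0, 2.1, 0.5);  // passes: |2.0 - 2.1| == 0.1 <= 0.5
+//   ASSERT_NEAR(2.0, 3.0, 0.5);  // fails with the message generated above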
+
+
+// Helper template for implementing FloatLE() and DoubleLE().
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1,
+ const char* expr2,
+ RawType val1,
+ RawType val2) {
+ // Returns success if val1 is less than val2,
+ if (val1 < val2) {
+ return AssertionSuccess();
+ }
+
+ // or if val1 is almost equal to val2.
+ const FloatingPoint<RawType> lhs(val1), rhs(val2);
+ if (lhs.AlmostEquals(rhs)) {
+ return AssertionSuccess();
+ }
+
+ // Note that the above two checks will both fail if either val1 or
+ // val2 is NaN, as the IEEE floating-point standard requires that
+ // any predicate involving a NaN must return false.
+
+ ::std::stringstream val1_ss;
+ val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+ << val1;
+
+ ::std::stringstream val2_ss;
+ val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+ << val2;
+
+ return AssertionFailure()
+ << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+ << " Actual: " << StringStreamToString(&val1_ss) << " vs "
+ << StringStreamToString(&val2_ss);
+}
+
+} // namespace internal
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN.
+AssertionResult FloatLE(const char* expr1, const char* expr2,
+ float val1, float val2) {
+ return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN.
+AssertionResult DoubleLE(const char* expr1, const char* expr2,
+ double val1, double val2) {
+ return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
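+
+// Illustrative usage sketch: these predicate-formatters are meant to be used
+// with the EXPECT/ASSERT_PRED_FORMAT2 macros, e.g.
+//   EXPECT_PRED_FORMAT2(::testing::FloatLE, 1.0f, 2.0f);
+//   EXPECT_PRED_FORMAT2(::testing::DoubleLE, 1.0, 1.0);  // "almost equal" passes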
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_EQ with int or enum
+// arguments.
+AssertionResult CmpHelperEQ(const char* lhs_expression,
+ const char* rhs_expression,
+ BiggestInt lhs,
+ BiggestInt rhs) {
+ if (lhs == rhs) {
+ return AssertionSuccess();
+ }
+
+ return EqFailure(lhs_expression,
+ rhs_expression,
+ FormatForComparisonFailureMessage(lhs, rhs),
+ FormatForComparisonFailureMessage(rhs, lhs),
+ false);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here
+// just to avoid copy-and-paste of similar code.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+ BiggestInt val1, BiggestInt val2) {\
+ if (val1 op val2) {\
+ return AssertionSuccess();\
+ } else {\
+ return AssertionFailure() \
+ << "Expected: (" << expr1 << ") " #op " (" << expr2\
+ << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+ << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+ }\
+}
+
+// Implements the helper function for {ASSERT|EXPECT}_NE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Implements the helper function for {ASSERT|EXPECT}_LE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Implements the helper function for {ASSERT|EXPECT}_LT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, < )
+// Implements the helper function for {ASSERT|EXPECT}_GE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Implements the helper function for {ASSERT|EXPECT}_GT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, > )
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+ const char* rhs_expression,
+ const char* lhs,
+ const char* rhs) {
+ if (String::CStringEquals(lhs, rhs)) {
+ return AssertionSuccess();
+ }
+
+ return EqFailure(lhs_expression,
+ rhs_expression,
+ PrintToString(lhs),
+ PrintToString(rhs),
+ false);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
+ const char* rhs_expression,
+ const char* lhs,
+ const char* rhs) {
+ if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
+ return AssertionSuccess();
+ }
+
+ return EqFailure(lhs_expression,
+ rhs_expression,
+ PrintToString(lhs),
+ PrintToString(rhs),
+ true);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1,
+ const char* s2) {
+ if (!String::CStringEquals(s1, s2)) {
+ return AssertionSuccess();
+ } else {
+ return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+ << s2_expression << "), actual: \""
+ << s1 << "\" vs \"" << s2 << "\"";
+ }
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
+ const char* s2_expression,
+ const char* s1,
+ const char* s2) {
+ if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
+ return AssertionSuccess();
+ } else {
+ return AssertionFailure()
+ << "Expected: (" << s1_expression << ") != ("
+ << s2_expression << ") (ignoring case), actual: \""
+ << s1 << "\" vs \"" << s2 << "\"";
+ }
+}
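+
+// Illustrative usage sketch of the public macros backed by the helpers above:
+//   EXPECT_STREQ("hello", "hello");      // exact, case-sensitive equality
+//   EXPECT_STRCASEEQ("hello", "HELLO");  // equality ignoring case
+//   EXPECT_STRNE("hello", "world");      // the strings must differ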
+
+} // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubstring() and IsNotSubstring().
+
+// This group of overloaded functions returns true iff needle is a
+// substring of haystack. NULL is considered a substring of itself
+// only.
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+ if (needle == NULL || haystack == NULL)
+ return needle == haystack;
+
+ return strstr(haystack, needle) != NULL;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+ if (needle == NULL || haystack == NULL)
+ return needle == haystack;
+
+ return wcsstr(haystack, needle) != NULL;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle,
+ const StringType& haystack) {
+ return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(
+ bool expected_to_be_substring,
+ const char* needle_expr, const char* haystack_expr,
+ const StringType& needle, const StringType& haystack) {
+ if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+ return AssertionSuccess();
+
+ const bool is_wide_string = sizeof(needle[0]) > 1;
+ const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
+ return AssertionFailure()
+ << "Value of: " << needle_expr << "\n"
+ << " Actual: " << begin_string_quote << needle << "\"\n"
+ << "Expected: " << (expected_to_be_substring ? "" : "not ")
+ << "a substring of " << haystack_expr << "\n"
+ << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+} // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const char* needle, const char* haystack) {
+ return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const wchar_t* needle, const wchar_t* haystack) {
+ return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const char* needle, const char* haystack) {
+ return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const wchar_t* needle, const wchar_t* haystack) {
+ return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const ::std::string& needle, const ::std::string& haystack) {
+ return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const ::std::string& needle, const ::std::string& haystack) {
+ return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const ::std::wstring& needle, const ::std::wstring& haystack) {
+ return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(
+ const char* needle_expr, const char* haystack_expr,
+ const ::std::wstring& needle, const ::std::wstring& haystack) {
+ return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif // GTEST_HAS_STD_WSTRING
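+
+// Illustrative usage sketch: IsSubstring() and IsNotSubstring() are
+// predicate-formatters and are normally used through EXPECT_PRED_FORMAT2, e.g.
+//   EXPECT_PRED_FORMAT2(::testing::IsSubstring, "needle", "two needles");
+//   EXPECT_PRED_FORMAT2(::testing::IsNotSubstring, "needle", "haystack");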
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for IsHRESULT{SuccessFailure} predicates
+AssertionResult HRESULTFailureHelper(const char* expr,
+ const char* expected,
+ long hr) { // NOLINT
+# if GTEST_OS_WINDOWS_MOBILE
+
+ // Windows CE doesn't support FormatMessage.
+ const char error_text[] = "";
+
+# else
+
+ // Looks up the human-readable system message for the HRESULT code.
+ // Since we're not passing any params to FormatMessage, we don't want
+ // inserts expanded.
+ const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
+ FORMAT_MESSAGE_IGNORE_INSERTS;
+ const DWORD kBufSize = 4096;
+ // Gets the system's human readable message string for this HRESULT.
+ char error_text[kBufSize] = { '\0' };
+ DWORD message_length = ::FormatMessageA(kFlags,
+ 0, // no source, we're asking system
+ hr, // the error
+ 0, // no line width restrictions
+ error_text, // output buffer
+ kBufSize, // buf size
+ NULL); // no arguments for inserts
+ // Trims trailing white space (FormatMessage leaves a trailing CR-LF).
+ for (; message_length && IsSpace(error_text[message_length - 1]);
+ --message_length) {
+ error_text[message_length - 1] = '\0';
+ }
+
+# endif // GTEST_OS_WINDOWS_MOBILE
+
+ const std::string error_hex("0x" + String::FormatHexInt(hr));
+ return ::testing::AssertionFailure()
+ << "Expected: " << expr << " " << expected << ".\n"
+ << " Actual: " << error_hex << " " << error_text << "\n";
+}
+
+} // namespace
+
+AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT
+ if (SUCCEEDED(hr)) {
+ return AssertionSuccess();
+ }
+ return HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT
+ if (FAILED(hr)) {
+ return AssertionSuccess();
+ }
+ return HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif // GTEST_OS_WINDOWS
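+
+// Illustrative usage sketch (Windows only): these helpers back the
+// EXPECT_HRESULT_SUCCEEDED / EXPECT_HRESULT_FAILED macros, e.g.
+//   HRESULT hr = SomeComCall();          // hypothetical COM call
+//   EXPECT_HRESULT_SUCCEEDED(hr);
+//   EXPECT_HRESULT_FAILED(E_FAIL);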
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length Encoding
+// 0 - 7 bits 0xxxxxxx
+// 8 - 11 bits 110xxxxx 10xxxxxx
+// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx
+// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) << 7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern. Returns the n
+// lowest bits. As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline UInt32 ChopLowBits(UInt32* bits, int n) {
+ const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
+ *bits >>= n;
+ return low_bits;
+}
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+std::string CodePointToUtf8(UInt32 code_point) {
+ if (code_point > kMaxCodePoint4) {
+ return "(Invalid Unicode 0x" + String::FormatHexInt(code_point) + ")";
+ }
+
+ char str[5]; // Big enough for the largest valid code point.
+ if (code_point <= kMaxCodePoint1) {
+ str[1] = '\0';
+ str[0] = static_cast<char>(code_point); // 0xxxxxxx
+ } else if (code_point <= kMaxCodePoint2) {
+ str[2] = '\0';
+ str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[0] = static_cast<char>(0xC0 | code_point); // 110xxxxx
+ } else if (code_point <= kMaxCodePoint3) {
+ str[3] = '\0';
+ str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[0] = static_cast<char>(0xE0 | code_point); // 1110xxxx
+ } else { // code_point <= kMaxCodePoint4
+ str[4] = '\0';
+ str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[0] = static_cast<char>(0xF0 | code_point); // 11110xxx
+ }
+ return str;
+}
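+
+// Illustrative sketch of the encoding performed above:
+//   CodePointToUtf8(0x41)     == "A"             (one-byte sequence)
+//   CodePointToUtf8(0x20AC)   == "\xE2\x82\xAC"  (EURO SIGN, three bytes)
+//   CodePointToUtf8(0x110000) == "(Invalid Unicode 0x110000)"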
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with a 16-bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+ return sizeof(wchar_t) == 2 &&
+ (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from a UTF-16 surrogate pair.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+ wchar_t second) {
+ const UInt32 mask = (1 << 10) - 1;
+ return (sizeof(wchar_t) == 2) ?
+ (((first & mask) << 10) | (second & mask)) + 0x10000 :
+ // This function should not be called when the condition is
+ // false, but we provide a sensible default in case it is.
+ static_cast<UInt32>(first);
+}
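+
+// Illustrative sketch (on platforms where sizeof(wchar_t) == 2):
+//   IsUtf16SurrogatePair(0xD834, 0xDD1E)                   // true
+//   CreateCodePointFromUtf16SurrogatePair(0xD834, 0xDD1E)  // 0x1D11E,
+//                                                          // MUSICAL SYMBOL G CLEF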
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+// UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF-16 encoding
+// and contains invalid UTF-16 surrogate pairs, the values in those pairs
+// will be encoded as individual Unicode characters from the Basic
+// Multilingual Plane.
+std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
+ if (num_chars == -1)
+ num_chars = static_cast<int>(wcslen(str));
+
+ ::std::stringstream stream;
+ for (int i = 0; i < num_chars; ++i) {
+ UInt32 unicode_code_point;
+
+ if (str[i] == L'\0') {
+ break;
+ } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+ unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
+ str[i + 1]);
+ i++;
+ } else {
+ unicode_code_point = static_cast<UInt32>(str[i]);
+ }
+
+ stream << CodePointToUtf8(unicode_code_point);
+ }
+ return StringStreamToString(&stream);
+}
+
+// Converts a wide C string to an std::string using the UTF-8 encoding.
+// NULL will be converted to "(null)".
+std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+ if (wide_c_str == NULL) return "(null)";
+
+ return internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Compares two wide C strings. Returns true iff they have the same
+// content.
+//
+// Unlike wcscmp(), this function can handle NULL argument(s). A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+ if (lhs == NULL) return rhs == NULL;
+
+ if (rhs == NULL) return false;
+
+ return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+AssertionResult CmpHelperSTREQ(const char* lhs_expression,
+ const char* rhs_expression,
+ const wchar_t* lhs,
+ const wchar_t* rhs) {
+ if (String::WideCStringEquals(lhs, rhs)) {
+ return AssertionSuccess();
+ }
+
+ return EqFailure(lhs_expression,
+ rhs_expression,
+ PrintToString(lhs),
+ PrintToString(rhs),
+ false);
+}
+
+// Helper function for *_STRNE on wide strings.
+AssertionResult CmpHelperSTRNE(const char* s1_expression,
+ const char* s2_expression,
+ const wchar_t* s1,
+ const wchar_t* s2) {
+ if (!String::WideCStringEquals(s1, s2)) {
+ return AssertionSuccess();
+ }
+
+ return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
+ << s2_expression << "), actual: "
+ << PrintToString(s1)
+ << " vs " << PrintToString(s2);
+}
+
+// Compares two C strings, ignoring case. Returns true iff they have
+// the same content.
+//
+// Unlike strcasecmp(), this function can handle NULL argument(s). A
+// NULL C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+ if (lhs == NULL)
+ return rhs == NULL;
+ if (rhs == NULL)
+ return false;
+ return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+// Compares two wide C strings, ignoring case. Returns true iff they
+// have the same content.
+//
+// Unlike wcscasecmp(), this function can handle NULL argument(s).
+// A NULL C string is considered different to any non-NULL wide C string,
+// including the empty string.
+// NB: The implementations on different platforms differ slightly.
+// On Windows, this method uses _wcsicmp, which compares according to the
+// LC_CTYPE environment variable. On GNU platforms this method uses
+// wcscasecmp, which compares according to the LC_CTYPE category of the
+// current locale. On Mac OS X, it uses towlower, which also uses the
+// LC_CTYPE category of the current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
+ const wchar_t* rhs) {
+ if (lhs == NULL) return rhs == NULL;
+
+ if (rhs == NULL) return false;
+
+#if GTEST_OS_WINDOWS
+ return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+ return wcscasecmp(lhs, rhs) == 0;
+#else
+ // Android, Mac OS X and Cygwin don't define wcscasecmp.
+ // Other unknown OSes may not define it either.
+ wint_t left, right;
+ do {
+ left = towlower(*lhs++);
+ right = towlower(*rhs++);
+ } while (left && left == right);
+ return left == right;
+#endif // OS selector
+}
+
+// Returns true iff str ends with the given suffix, ignoring case.
+// Any string is considered to end with an empty suffix.
+bool String::EndsWithCaseInsensitive(
+ const std::string& str, const std::string& suffix) {
+ const size_t str_len = str.length();
+ const size_t suffix_len = suffix.length();
+ return (str_len >= suffix_len) &&
+ CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
+ suffix.c_str());
+}
+
+// Formats an int value as "%02d".
+std::string String::FormatIntWidth2(int value) {
+ std::stringstream ss;
+ ss << std::setfill('0') << std::setw(2) << value;
+ return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexInt(int value) {
+ std::stringstream ss;
+ ss << std::hex << std::uppercase << value;
+ return ss.str();
+}
+
+// Formats a byte as "%02X".
+std::string String::FormatByte(unsigned char value) {
+ std::stringstream ss;
+ ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
+ << static_cast<unsigned int>(value);
+ return ss.str();
+}
+
+// Converts the buffer in a stringstream to an std::string, converting NUL
+// bytes to "\\0" along the way.
+std::string StringStreamToString(::std::stringstream* ss) {
+ const ::std::string& str = ss->str();
+ const char* const start = str.c_str();
+ const char* const end = start + str.length();
+
+ std::string result;
+ result.reserve(2 * (end - start));
+ for (const char* ch = start; ch != end; ++ch) {
+ if (*ch == '\0') {
+ result += "\\0"; // Replaces NUL with "\\0";
+ } else {
+ result += *ch;
+ }
+ }
+
+ return result;
+}
+
+// Appends the user-supplied message to the Google-Test-generated message.
+std::string AppendUserMessage(const std::string& gtest_msg,
+ const Message& user_msg) {
+ // Appends the user message if it's non-empty.
+ const std::string user_msg_string = user_msg.GetString();
+ if (user_msg_string.empty()) {
+ return gtest_msg;
+ }
+
+ return gtest_msg + "\n" + user_msg_string;
+}
+
+} // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult.
+TestResult::TestResult()
+ : death_test_count_(0),
+ elapsed_time_(0) {
+}
+
+// D'tor.
+TestResult::~TestResult() {
+}
+
+// Returns the i-th test part result among all the results. i can
+// range from 0 to total_part_count() - 1. If i is not in that range,
+// aborts the program.
+const TestPartResult& TestResult::GetTestPartResult(int i) const {
+ if (i < 0 || i >= total_part_count())
+ internal::posix::Abort();
+ return test_part_results_.at(i);
+}
+
+// Returns the i-th test property. i can range from 0 to
+// test_property_count() - 1. If i is not in that range, aborts the
+// program.
+const TestProperty& TestResult::GetTestProperty(int i) const {
+ if (i < 0 || i >= test_property_count())
+ internal::posix::Abort();
+ return test_properties_.at(i);
+}
+
+// Clears the test part results.
+void TestResult::ClearTestPartResults() {
+ test_part_results_.clear();
+}
+
+// Adds a test part result to the list.
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
+ test_part_results_.push_back(test_part_result);
+}
+
+// Adds a test property to the list. If a property with the same key as the
+// supplied property is already represented, the value of this test_property
+// replaces the old value for that key.
+void TestResult::RecordProperty(const std::string& xml_element,
+ const TestProperty& test_property) {
+ if (!ValidateTestProperty(xml_element, test_property)) {
+ return;
+ }
+ internal::MutexLock lock(&test_properites_mutex_);
+ const std::vector<TestProperty>::iterator property_with_matching_key =
+ std::find_if(test_properties_.begin(), test_properties_.end(),
+ internal::TestPropertyKeyIs(test_property.key()));
+ if (property_with_matching_key == test_properties_.end()) {
+ test_properties_.push_back(test_property);
+ return;
+ }
+ property_with_matching_key->SetValue(test_property.value());
+}
+
+// The list of reserved attributes used in the <testsuites> element of XML
+// output.
+static const char* const kReservedTestSuitesAttributes[] = {
+ "disabled",
+ "errors",
+ "failures",
+ "name",
+ "random_seed",
+ "tests",
+ "time",
+ "timestamp"
+};
+
+// The list of reserved attributes used in the <testsuite> element of XML
+// output.
+static const char* const kReservedTestSuiteAttributes[] = {
+ "disabled",
+ "errors",
+ "failures",
+ "name",
+ "tests",
+ "time"
+};
+
+// The list of reserved attributes used in the <testcase> element of XML output.
+static const char* const kReservedTestCaseAttributes[] = {
+ "classname",
+ "name",
+ "status",
+ "time",
+ "type_param",
+ "value_param"
+};
+
+template <int kSize>
+std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
+ return std::vector<std::string>(array, array + kSize);
+}
+
+static std::vector<std::string> GetReservedAttributesForElement(
+ const std::string& xml_element) {
+ if (xml_element == "testsuites") {
+ return ArrayAsVector(kReservedTestSuitesAttributes);
+ } else if (xml_element == "testsuite") {
+ return ArrayAsVector(kReservedTestSuiteAttributes);
+ } else if (xml_element == "testcase") {
+ return ArrayAsVector(kReservedTestCaseAttributes);
+ } else {
+ GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+ }
+ // This code is unreachable, but some compilers may not realize that.
+ return std::vector<std::string>();
+}
+
+static std::string FormatWordList(const std::vector<std::string>& words) {
+ Message word_list;
+ for (size_t i = 0; i < words.size(); ++i) {
+ if (i > 0 && words.size() > 2) {
+ word_list << ", ";
+ }
+ if (i == words.size() - 1) {
+ word_list << "and ";
+ }
+ word_list << "'" << words[i] << "'";
+ }
+ return word_list.GetString();
+}
+
+bool ValidateTestPropertyName(const std::string& property_name,
+ const std::vector<std::string>& reserved_names) {
+ if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
+ reserved_names.end()) {
+ ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+ << " (" << FormatWordList(reserved_names)
+ << " are reserved by " << GTEST_NAME_ << ")";
+ return false;
+ }
+ return true;
+}
+
+// Adds a failure if the key is a reserved attribute of the element named
+// xml_element. Returns true if the property is valid.
+bool TestResult::ValidateTestProperty(const std::string& xml_element,
+ const TestProperty& test_property) {
+ return ValidateTestPropertyName(test_property.key(),
+ GetReservedAttributesForElement(xml_element));
+}
+
+// Clears the object.
+void TestResult::Clear() {
+ test_part_results_.clear();
+ test_properties_.clear();
+ death_test_count_ = 0;
+ elapsed_time_ = 0;
+}
+
+// Returns true iff the test failed.
+bool TestResult::Failed() const {
+ for (int i = 0; i < total_part_count(); ++i) {
+ if (GetTestPartResult(i).failed())
+ return true;
+ }
+ return false;
+}
+
+// Returns true iff the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult& result) {
+ return result.fatally_failed();
+}
+
+// Returns true iff the test fatally failed.
+bool TestResult::HasFatalFailure() const {
+ return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Returns true iff the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult& result) {
+ return result.nonfatally_failed();
+}
+
+// Returns true iff the test has a non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+ return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all test parts. This is the sum of the number
+// of successful test parts and the number of failed test parts.
+int TestResult::total_part_count() const {
+ return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of the test properties.
+int TestResult::test_property_count() const {
+ return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves the states of all flags.
+Test::Test()
+ : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
+}
+
+// The d'tor restores the states of all flags. The actual work is
+// done by the d'tor of the gtest_flag_saver_ field, and thus not
+// visible here.
+Test::~Test() {
+}
+
+// Sets up the test fixture.
+//
+// A sub-class may override this.
+void Test::SetUp() {
+}
+
+// Tears down the test fixture.
+//
+// A sub-class may override this.
+void Test::TearDown() {
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, const std::string& value) {
+ UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string& key, int value) {
+ Message value_message;
+ value_message << value;
+ RecordProperty(key, value_message.GetString().c_str());
+}
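+
+// Illustrative usage sketch: inside a test body, extra key/value pairs can be
+// attached to the XML report, e.g.
+//   TEST(ReportTest, RecordsProperties) {  // hypothetical test
+//     RecordProperty("MaximumWidgets", 12);
+//     RecordProperty("MinimumWidgets", "9");
+//   }
+// Keys that collide with reserved attributes are rejected by
+// ValidateTestPropertyName() above.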
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+ const std::string& message) {
+ // This function is a friend of UnitTest and as such has access to
+ // AddTestPartResult.
+ UnitTest::GetInstance()->AddTestPartResult(
+ result_type,
+ NULL, // No info about the source file where the exception occurred.
+ -1, // We have no info on which line caused the exception.
+ message,
+ ""); // No stack trace, either.
+}
+
+} // namespace internal
+
+// Google Test requires all tests in the same test case to use the same test
+// fixture class. This function checks if the current test has the
+// same fixture class as the first test in the current test case. If
+// yes, it returns true; otherwise it generates a Google Test failure and
+// returns false.
+bool Test::HasSameFixtureClass() {
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ const TestCase* const test_case = impl->current_test_case();
+
+ // Info about the first test in the current test case.
+ const TestInfo* const first_test_info = test_case->test_info_list()[0];
+ const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+ const char* const first_test_name = first_test_info->name();
+
+ // Info about the current test.
+ const TestInfo* const this_test_info = impl->current_test_info();
+ const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+ const char* const this_test_name = this_test_info->name();
+
+ if (this_fixture_id != first_fixture_id) {
+ // Is the first test defined using TEST?
+ const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+ // Is this test defined using TEST?
+ const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+ if (first_is_TEST || this_is_TEST) {
+ // Both TEST and TEST_F appear in the same test case, which is incorrect.
+ // Tell the user how to fix this.
+
+ // Gets the name of the TEST and the name of the TEST_F. Note
+ // that first_is_TEST and this_is_TEST cannot both be true, as
+ // the fixture IDs are different for the two tests.
+ const char* const TEST_name =
+ first_is_TEST ? first_test_name : this_test_name;
+ const char* const TEST_F_name =
+ first_is_TEST ? this_test_name : first_test_name;
+
+ ADD_FAILURE()
+ << "All tests in the same test case must use the same test fixture\n"
+ << "class, so mixing TEST_F and TEST in the same test case is\n"
+ << "illegal. In test case " << this_test_info->test_case_name()
+ << ",\n"
+ << "test " << TEST_F_name << " is defined using TEST_F but\n"
+ << "test " << TEST_name << " is defined using TEST. You probably\n"
+ << "want to change the TEST to TEST_F or move it to another test\n"
+ << "case.";
+ } else {
+ // Two fixture classes with the same name appear in two different
+ // namespaces, which is not allowed. Tell the user how to fix this.
+ ADD_FAILURE()
+ << "All tests in the same test case must use the same test fixture\n"
+ << "class. However, in test case "
+ << this_test_info->test_case_name() << ",\n"
+ << "you defined test " << first_test_name
+ << " and test " << this_test_name << "\n"
+ << "using two different test fixture classes. This can happen if\n"
+ << "the two classes are from different namespaces or translation\n"
+ << "units and have the same name. You should probably rename one\n"
+ << "of the classes to put the tests into different test cases.";
+ }
+ return false;
+ }
+
+ return true;
+}
+
+#if GTEST_HAS_SEH
+
+// Adds an "exception thrown" fatal failure to the current test. This
+// function returns its result via an output parameter pointer because VC++
+// prohibits creation of objects with destructors on stack in functions
+// using __try (see error C2712).
+static std::string* FormatSehExceptionMessage(DWORD exception_code,
+ const char* location) {
+ Message message;
+ message << "SEH exception with code 0x" << std::setbase(16) <<
+ exception_code << std::setbase(10) << " thrown in " << location << ".";
+
+ return new std::string(message.GetString());
+}
+
+#endif // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Adds an "exception thrown" fatal failure to the current test.
+static std::string FormatCxxExceptionMessage(const char* description,
+ const char* location) {
+ Message message;
+ if (description != NULL) {
+ message << "C++ exception with description \"" << description << "\"";
+ } else {
+ message << "Unknown C++ exception";
+ }
+ message << " thrown in " << location << ".";
+
+ return message.GetString();
+}
+
+static std::string PrintTestPartResultToString(
+ const TestPartResult& test_part_result);
+
+GoogleTestFailureException::GoogleTestFailureException(
+ const TestPartResult& failure)
+ : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace, as IBM's xlC
+// compiler rejects the code if they are declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception. (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function. Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(
+ T* object, Result (T::*method)(), const char* location) {
+#if GTEST_HAS_SEH
+ __try {
+ return (object->*method)();
+ } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT
+ GetExceptionCode())) {
+ // We create the exception message on the heap because VC++ prohibits
+ // creation of objects with destructors on stack in functions using __try
+ // (see error C2712).
+ std::string* exception_message = FormatSehExceptionMessage(
+ GetExceptionCode(), location);
+ internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+ *exception_message);
+ delete exception_message;
+ return static_cast<Result>(0);
+ }
+#else
+ (void)location;
+ return (object->*method)();
+#endif // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(
+ T* object, Result (T::*method)(), const char* location) {
+ // NOTE: The user code can affect the way in which Google Test handles
+ // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+ // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+ // after the exception is caught and either report or re-throw the
+ // exception based on the flag's value:
+ //
+ // try {
+ // // Perform the test method.
+ // } catch (...) {
+ // if (GTEST_FLAG(catch_exceptions))
+ // // Report the exception as failure.
+ // else
+ // throw; // Re-throws the original exception.
+ // }
+ //
+ // However, the purpose of this flag is to allow the program to drop into
+ // the debugger when the exception is thrown. On most platforms, once the
+ // control enters the catch block, the exception origin information is
+ // lost and the debugger will stop the program at the point of the
+ // re-throw in this function -- instead of at the point of the original
+ // throw statement in the code under test. For this reason, we perform
+ // the check early, sacrificing the ability to affect Google Test's
+ // exception handling in the method where the exception is thrown.
+ if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+ try {
+ return HandleSehExceptionsInMethodIfSupported(object, method, location);
+ } catch (const internal::GoogleTestFailureException&) { // NOLINT
+ // This exception type can only be thrown by a failed Google
+ // Test assertion with the intention of letting another testing
+ // framework catch it. Therefore we just re-throw it.
+ throw;
+ } catch (const std::exception& e) { // NOLINT
+ internal::ReportFailureInUnknownLocation(
+ TestPartResult::kFatalFailure,
+ FormatCxxExceptionMessage(e.what(), location));
+ } catch (...) { // NOLINT
+ internal::ReportFailureInUnknownLocation(
+ TestPartResult::kFatalFailure,
+ FormatCxxExceptionMessage(NULL, location));
+ }
+ return static_cast<Result>(0);
+#else
+ return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif // GTEST_HAS_EXCEPTIONS
+ } else {
+ return (object->*method)();
+ }
+}
+
+} // namespace internal
+
+// Runs the test and updates the test result.
+void Test::Run() {
+ if (!HasSameFixtureClass()) return;
+
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+ // We will run the test only if SetUp() was successful.
+ if (!HasFatalFailure()) {
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(
+ this, &Test::TestBody, "the test body");
+ }
+
+ // However, we want to clean up as much as possible. Hence we will
+ // always call TearDown(), even if SetUp() or the test body has
+ // failed.
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(
+ this, &Test::TearDown, "TearDown()");
+}
+
+// Returns true iff the current test has a fatal failure.
+bool Test::HasFatalFailure() {
+ return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
+
+// Returns true iff the current test has a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+ return internal::GetUnitTestImpl()->current_test_result()->
+ HasNonfatalFailure();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object.
+TestInfo::TestInfo(const std::string& a_test_case_name,
+ const std::string& a_name,
+ const char* a_type_param,
+ const char* a_value_param,
+ internal::CodeLocation a_code_location,
+ internal::TypeId fixture_class_id,
+ internal::TestFactoryBase* factory)
+ : test_case_name_(a_test_case_name),
+ name_(a_name),
+ type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+ value_param_(a_value_param ? new std::string(a_value_param) : NULL),
+ location_(a_code_location),
+ fixture_class_id_(fixture_class_id),
+ should_run_(false),
+ is_disabled_(false),
+ matches_filter_(false),
+ factory_(factory),
+ result_() {}
+
+// Destructs a TestInfo object.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+// test_case_name: name of the test case
+// name: name of the test
+// type_param: the name of the test's type parameter, or NULL if
+// this is not a typed or a type-parameterized test.
+// value_param: text representation of the test's value parameter,
+// or NULL if this is not a value-parameterized test.
+// code_location: code location where the test is defined
+// fixture_class_id: ID of the test fixture class
+// set_up_tc: pointer to the function that sets up the test case
+// tear_down_tc: pointer to the function that tears down the test case
+// factory: pointer to the factory that creates a test object.
+// The newly created TestInfo instance will assume
+// ownership of the factory object.
+TestInfo* MakeAndRegisterTestInfo(
+ const char* test_case_name,
+ const char* name,
+ const char* type_param,
+ const char* value_param,
+ CodeLocation code_location,
+ TypeId fixture_class_id,
+ SetUpTestCaseFunc set_up_tc,
+ TearDownTestCaseFunc tear_down_tc,
+ TestFactoryBase* factory) {
+ TestInfo* const test_info =
+ new TestInfo(test_case_name, name, type_param, value_param,
+ code_location, fixture_class_id, factory);
+ GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
+ return test_info;
+}
+
+#if GTEST_HAS_PARAM_TEST
+void ReportInvalidTestCaseType(const char* test_case_name,
+ CodeLocation code_location) {
+ Message errors;
+ errors
+ << "Attempted redefinition of test case " << test_case_name << ".\n"
+ << "All tests in the same test case must use the same test fixture\n"
+ << "class. However, in test case " << test_case_name << ", you tried\n"
+ << "to define a test using a fixture class different from the one\n"
+ << "used earlier. This can happen if the two fixture classes are\n"
+ << "from different namespaces and have the same name. You should\n"
+ << "probably rename one of the classes to put the tests into different\n"
+ << "test cases.";
+
+ fprintf(stderr, "%s %s",
+ FormatFileLocation(code_location.file.c_str(),
+ code_location.line).c_str(),
+ errors.GetString().c_str());
+}
+#endif // GTEST_HAS_PARAM_TEST
+
+} // namespace internal
+
+namespace {
+
+// A predicate that checks the test name of a TestInfo against a known
+// value.
+//
+// This is used for implementation of the TestCase class only. We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+class TestNameIs {
+ public:
+ // Constructor.
+ //
+ // TestNameIs has NO default constructor.
+ explicit TestNameIs(const char* name)
+ : name_(name) {}
+
+ // Returns true iff the test name of test_info matches name_.
+ bool operator()(const TestInfo * test_info) const {
+ return test_info && test_info->name() == name_;
+ }
+
+ private:
+ std::string name_;
+};
+
+} // namespace
+
+namespace internal {
+
+// This method expands all parameterized tests registered with macros TEST_P
+// and INSTANTIATE_TEST_CASE_P into regular tests and registers those.
+// This will be done just once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+#if GTEST_HAS_PARAM_TEST
+ if (!parameterized_tests_registered_) {
+ parameterized_test_registry_.RegisterTests();
+ parameterized_tests_registered_ = true;
+ }
+#endif
+}
+
+} // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+ if (!should_run_) return;
+
+ // Tells UnitTest where to store test result.
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ impl->set_current_test_info(this);
+
+ TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+ // Notifies the unit test event listeners that a test is about to start.
+ repeater->OnTestStart(*this);
+
+ const TimeInMillis start = internal::GetTimeInMillis();
+
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+
+ // Creates the test object.
+ Test* const test = internal::HandleExceptionsInMethodIfSupported(
+ factory_, &internal::TestFactoryBase::CreateTest,
+ "the test fixture's constructor");
+
+ // Runs the test only if the test object was created and its
+ // constructor didn't generate a fatal failure.
+ if ((test != NULL) && !Test::HasFatalFailure()) {
+ // This doesn't throw as all user code that can throw are wrapped into
+ // exception handling code.
+ test->Run();
+ }
+
+ // Deletes the test object.
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(
+ test, &Test::DeleteSelf_, "the test fixture's destructor");
+
+ result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+ // Notifies the unit test event listener that a test has just finished.
+ repeater->OnTestEnd(*this);
+
+ // Tells UnitTest to stop associating assertion results to this
+ // test.
+ impl->set_current_test_info(NULL);
+}
+
+// class TestCase
+
+// Gets the number of successful tests in this test case.
+int TestCase::successful_test_count() const {
+ return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of failed tests in this test case.
+int TestCase::failed_test_count() const {
+ return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int TestCase::reportable_disabled_test_count() const {
+ return CountIf(test_info_list_, TestReportableDisabled);
+}
+
+// Gets the number of disabled tests in this test case.
+int TestCase::disabled_test_count() const {
+ return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int TestCase::reportable_test_count() const {
+ return CountIf(test_info_list_, TestReportable);
+}
+
+// Get the number of tests in this test case that should run.
+int TestCase::test_to_run_count() const {
+ return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests.
+int TestCase::total_test_count() const {
+ return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestCase with the given name.
+//
+// Arguments:
+//
+// name: name of the test case
+// a_type_param: the name of the test case's type parameter, or NULL if
+// this is not a typed or a type-parameterized test case.
+// set_up_tc: pointer to the function that sets up the test case
+// tear_down_tc: pointer to the function that tears down the test case
+TestCase::TestCase(const char* a_name, const char* a_type_param,
+ Test::SetUpTestCaseFunc set_up_tc,
+ Test::TearDownTestCaseFunc tear_down_tc)
+ : name_(a_name),
+ type_param_(a_type_param ? new std::string(a_type_param) : NULL),
+ set_up_tc_(set_up_tc),
+ tear_down_tc_(tear_down_tc),
+ should_run_(false),
+ elapsed_time_(0) {
+}
+
+// Destructor of TestCase.
+TestCase::~TestCase() {
+ // Deletes every Test in the collection.
+ ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+const TestInfo* TestCase::GetTestInfo(int i) const {
+ const int index = GetElementOr(test_indices_, i, -1);
+ return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+TestInfo* TestCase::GetMutableTestInfo(int i) {
+ const int index = GetElementOr(test_indices_, i, -1);
+ return index < 0 ? NULL : test_info_list_[index];
+}
+
+// Adds a test to this test case. Will delete the test upon
+// destruction of the TestCase object.
+void TestCase::AddTestInfo(TestInfo * test_info) {
+ test_info_list_.push_back(test_info);
+ test_indices_.push_back(static_cast<int>(test_indices_.size()));
+}
+
+// Runs every test in this TestCase.
+void TestCase::Run() {
+ if (!should_run_) return;
+
+ internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+ impl->set_current_test_case(this);
+
+ TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+ repeater->OnTestCaseStart(*this);
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(
+ this, &TestCase::RunSetUpTestCase, "SetUpTestCase()");
+
+ const internal::TimeInMillis start = internal::GetTimeInMillis();
+ for (int i = 0; i < total_test_count(); i++) {
+ GetMutableTestInfo(i)->Run();
+ }
+ elapsed_time_ = internal::GetTimeInMillis() - start;
+
+ impl->os_stack_trace_getter()->UponLeavingGTest();
+ internal::HandleExceptionsInMethodIfSupported(
+ this, &TestCase::RunTearDownTestCase, "TearDownTestCase()");
+
+ repeater->OnTestCaseEnd(*this);
+ impl->set_current_test_case(NULL);
+}
+
+// Clears the results of all tests in this test case.
+void TestCase::ClearResult() {
+ ad_hoc_test_result_.Clear();
+ ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test case.
+void TestCase::ShuffleTests(internal::Random* random) {
+ Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle.
+void TestCase::UnshuffleTests() {
+ for (size_t i = 0; i < test_indices_.size(); i++) {
+ test_indices_[i] = static_cast<int>(i);
+ }
+}
+
+// Formats a countable noun. Depending on its quantity, either the
+// singular form or the plural form is used. e.g.
+//
+// FormatCountableNoun(1, "formula", "formulae") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count,
+ const char * singular_form,
+ const char * plural_form) {
+ return internal::StreamableToString(count) + " " +
+ (count == 1 ? singular_form : plural_form);
+}
+
+// Formats the count of tests.
+static std::string FormatTestCount(int test_count) {
+ return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test cases.
+static std::string FormatTestCaseCount(int test_case_count) {
+ return FormatCountableNoun(test_case_count, "test case", "test cases");
+}
+
+// Converts a TestPartResult::Type enum to human-friendly string
+// representation. Both kNonFatalFailure and kFatalFailure are translated
+// to "Failure", as the user usually doesn't care about the difference
+// between the two when viewing the test result.
+static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+ switch (type) {
+ case TestPartResult::kSuccess:
+ return "Success";
+
+ case TestPartResult::kNonFatalFailure:
+ case TestPartResult::kFatalFailure:
+#ifdef _MSC_VER
+ return "error: ";
+#else
+ return "Failure\n";
+#endif
+ default:
+ return "Unknown result type";
+ }
+}
+
+namespace internal {
+
+// Prints a TestPartResult to an std::string.
+static std::string PrintTestPartResultToString(
+ const TestPartResult& test_part_result) {
+ return (Message()
+ << internal::FormatFileLocation(test_part_result.file_name(),
+ test_part_result.line_number())
+ << " " << TestPartResultTypeToString(test_part_result.type())
+ << test_part_result.message()).GetString();
+}
+
+// Prints a TestPartResult.
+static void PrintTestPartResult(const TestPartResult& test_part_result) {
+ const std::string& result =
+ PrintTestPartResultToString(test_part_result);
+ printf("%s\n", result.c_str());
+ fflush(stdout);
+ // If the test program runs in Visual Studio or a debugger, the
+ // following statements add the test part result message to the Output
+ // window such that the user can double-click on it to jump to the
+ // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+ // We don't call OutputDebugString*() on Windows Mobile, as printing
+ // to stdout is done by OutputDebugString() there already - we don't
+ // want the same message printed twice.
+ ::OutputDebugStringA(result.c_str());
+ ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+
+enum GTestColor {
+ COLOR_DEFAULT,
+ COLOR_RED,
+ COLOR_GREEN,
+ COLOR_YELLOW
+};
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
+ !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Returns the character attribute for the given color.
+WORD GetColorAttribute(GTestColor color) {
+ switch (color) {
+ case COLOR_RED: return FOREGROUND_RED;
+ case COLOR_GREEN: return FOREGROUND_GREEN;
+ case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+ default: return 0;
+ }
+}
+
+#else
+
+// Returns the ANSI color code for the given color. COLOR_DEFAULT is
+// an invalid input.
+const char* GetAnsiColorCode(GTestColor color) {
+ switch (color) {
+ case COLOR_RED: return "1";
+ case COLOR_GREEN: return "2";
+ case COLOR_YELLOW: return "3";
+ default: return NULL;
+  }
+}
+
+#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+
+// Returns true iff Google Test should use colors in the output.
+bool ShouldUseColor(bool stdout_is_tty) {
+ const char* const gtest_color = GTEST_FLAG(color).c_str();
+
+ if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+#if GTEST_OS_WINDOWS
+ // On Windows the TERM variable is usually not set, but the
+ // console there does support colors.
+ return stdout_is_tty;
+#else
+ // On non-Windows platforms, we rely on the TERM variable.
+ const char* const term = posix::GetEnv("TERM");
+ const bool term_supports_color =
+ String::CStringEquals(term, "xterm") ||
+ String::CStringEquals(term, "xterm-color") ||
+ String::CStringEquals(term, "xterm-256color") ||
+ String::CStringEquals(term, "screen") ||
+ String::CStringEquals(term, "screen-256color") ||
+ String::CStringEquals(term, "tmux") ||
+ String::CStringEquals(term, "tmux-256color") ||
+ String::CStringEquals(term, "rxvt-unicode") ||
+ String::CStringEquals(term, "rxvt-unicode-256color") ||
+ String::CStringEquals(term, "linux") ||
+ String::CStringEquals(term, "cygwin");
+ return stdout_is_tty && term_supports_color;
+#endif // GTEST_OS_WINDOWS
+ }
+
+ return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+ String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+ String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+ String::CStringEquals(gtest_color, "1");
+ // We take "yes", "true", "t", and "1" as meaning "yes". If the
+ // value is neither one of these nor "auto", we treat it as "no" to
+ // be conservative.
+}
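+// For example, passing --gtest_color=yes (or setting the GTEST_COLOR
+// environment variable to "yes") forces colored output even when stdout is
+// not a TTY, while --gtest_color=auto defers to the checks above.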
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS || \
+ GTEST_OS_IOS || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT
+ const bool use_color = AlwaysFalse();
+#else
+ static const bool in_color_mode =
+ ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+ const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS
+ // The '!= 0' comparison is necessary to satisfy MSVC 7.1.
+
+ if (!use_color) {
+ vprintf(fmt, args);
+ va_end(args);
+ return;
+ }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
+ !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+ const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+ // Gets the current text color.
+ CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+ GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+ const WORD old_color_attrs = buffer_info.wAttributes;
+
+ // We need to flush the stream buffers into the console before each
+ // SetConsoleTextAttribute call lest it affect the text that is already
+ // printed but has not yet reached the console.
+ fflush(stdout);
+ SetConsoleTextAttribute(stdout_handle,
+ GetColorAttribute(color) | FOREGROUND_INTENSITY);
+ vprintf(fmt, args);
+
+ fflush(stdout);
+ // Restores the text color.
+ SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+ printf("\033[0;3%sm", GetAnsiColorCode(color));
+ vprintf(fmt, args);
+ printf("\033[m"); // Resets the terminal to default.
+#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+ va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+void PrintFullTestCommentIfPresent(const TestInfo& test_info) {
+ const char* const type_param = test_info.type_param();
+ const char* const value_param = test_info.value_param();
+
+ if (type_param != NULL || value_param != NULL) {
+ printf(", where ");
+ if (type_param != NULL) {
+ printf("%s = %s", kTypeParamLabel, type_param);
+ if (value_param != NULL)
+ printf(" and ");
+ }
+ if (value_param != NULL) {
+ printf("%s = %s", kValueParamLabel, value_param);
+ }
+ }
+}
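+// For example, for a test instantiated with TypeParam = int and
+// GetParam() = 5, the function above prints
+// ", where TypeParam = int and GetParam() = 5".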
+
+// This class implements the TestEventListener interface.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+ PrettyUnitTestResultPrinter() {}
+ static void PrintTestName(const char * test_case, const char * test) {
+ printf("%s.%s", test_case, test);
+ }
+
+ // The following methods override what's in the TestEventListener class.
+ virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {}
+ virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+ virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+ virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {}
+ virtual void OnTestCaseStart(const TestCase& test_case);
+ virtual void OnTestStart(const TestInfo& test_info);
+ virtual void OnTestPartResult(const TestPartResult& result);
+ virtual void OnTestEnd(const TestInfo& test_info);
+ virtual void OnTestCaseEnd(const TestCase& test_case);
+ virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+ virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {}
+ virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+ virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {}
+
+ private:
+ static void PrintFailedTests(const UnitTest& unit_test);
+};
+
+ // Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+ const UnitTest& unit_test, int iteration) {
+ if (GTEST_FLAG(repeat) != 1)
+ printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+ const char* const filter = GTEST_FLAG(filter).c_str();
+
+ // Prints the filter if it's not *. This reminds the user that some
+ // tests may be skipped.
+ if (!String::CStringEquals(filter, kUniversalFilter)) {
+ ColoredPrintf(COLOR_YELLOW,
+ "Note: %s filter = %s\n", GTEST_NAME_, filter);
+ }
+
+ if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+ const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+ ColoredPrintf(COLOR_YELLOW,
+ "Note: This is test shard %d of %s.\n",
+ static_cast<int>(shard_index) + 1,
+ internal::posix::GetEnv(kTestTotalShards));
+ }
+
+ if (GTEST_FLAG(shuffle)) {
+ ColoredPrintf(COLOR_YELLOW,
+ "Note: Randomizing tests' orders with a seed of %d .\n",
+ unit_test.random_seed());
+ }
+
+ ColoredPrintf(COLOR_GREEN, "[==========] ");
+ printf("Running %s from %s.\n",
+ FormatTestCount(unit_test.test_to_run_count()).c_str(),
+ FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+ fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+ const UnitTest& /*unit_test*/) {
+ ColoredPrintf(COLOR_GREEN, "[----------] ");
+ printf("Global test environment set-up.\n");
+ fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
+ const std::string counts =
+ FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+ ColoredPrintf(COLOR_GREEN, "[----------] ");
+ printf("%s from %s", counts.c_str(), test_case.name());
+ if (test_case.type_param() == NULL) {
+ printf("\n");
+ } else {
+ printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+ }
+ fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
+ ColoredPrintf(COLOR_GREEN, "[ RUN ] ");
+ PrintTestName(test_info.test_case_name(), test_info.name());
+ printf("\n");
+ fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+ const TestPartResult& result) {
+ // If the test part succeeded, we don't need to do anything.
+ if (result.type() == TestPartResult::kSuccess)
+ return;
+
+ // Print failure message from the assertion (e.g. expected this and got that).
+ PrintTestPartResult(result);
+ fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+ if (test_info.result()->Passed()) {
+ ColoredPrintf(COLOR_GREEN, "[ OK ] ");
+ } else {
+ ColoredPrintf(COLOR_RED, "[ FAILED ] ");
+ }
+ PrintTestName(test_info.test_case_name(), test_info.name());
+ if (test_info.result()->Failed())
+ PrintFullTestCommentIfPresent(test_info);
+
+ if (GTEST_FLAG(print_time)) {
+ printf(" (%s ms)\n", internal::StreamableToString(
+ test_info.result()->elapsed_time()).c_str());
+ } else {
+ printf("\n");
+ }
+ fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
+ if (!GTEST_FLAG(print_time)) return;
+
+ const std::string counts =
+ FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+ ColoredPrintf(COLOR_GREEN, "[----------] ");
+ printf("%s from %s (%s ms total)\n\n",
+ counts.c_str(), test_case.name(),
+ internal::StreamableToString(test_case.elapsed_time()).c_str());
+ fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+ const UnitTest& /*unit_test*/) {
+ ColoredPrintf(COLOR_GREEN, "[----------] ");
+ printf("Global test environment tear-down\n");
+ fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
+ const int failed_test_count = unit_test.failed_test_count();
+ if (failed_test_count == 0) {
+ return;
+ }
+
+ for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+ const TestCase& test_case = *unit_test.GetTestCase(i);
+ if (!test_case.should_run() || (test_case.failed_test_count() == 0)) {
+ continue;
+ }
+ for (int j = 0; j < test_case.total_test_count(); ++j) {
+ const TestInfo& test_info = *test_case.GetTestInfo(j);
+ if (!test_info.should_run() || test_info.result()->Passed()) {
+ continue;
+ }
+ ColoredPrintf(COLOR_RED, "[ FAILED ] ");
+ printf("%s.%s", test_case.name(), test_info.name());
+ PrintFullTestCommentIfPresent(test_info);
+ printf("\n");
+ }
+ }
+}
+
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+ int /*iteration*/) {
+ ColoredPrintf(COLOR_GREEN, "[==========] ");
+ printf("%s from %s ran.",
+ FormatTestCount(unit_test.test_to_run_count()).c_str(),
+ FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str());
+ if (GTEST_FLAG(print_time)) {
+ printf(" (%s ms total)",
+ internal::StreamableToString(unit_test.elapsed_time()).c_str());
+ }
+ printf("\n");
+ ColoredPrintf(COLOR_GREEN, "[ PASSED ] ");
+ printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+ int num_failures = unit_test.failed_test_count();
+ if (!unit_test.Passed()) {
+ const int failed_test_count = unit_test.failed_test_count();
+ ColoredPrintf(COLOR_RED, "[ FAILED ] ");
+ printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
+ PrintFailedTests(unit_test);
+ printf("\n%2d FAILED %s\n", num_failures,
+ num_failures == 1 ? "TEST" : "TESTS");
+ }
+
+ int num_disabled = unit_test.reportable_disabled_test_count();
+ if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+ if (!num_failures) {
+ printf("\n"); // Add a spacer if no FAILURE banner is displayed.
+ }
+ ColoredPrintf(COLOR_YELLOW,
+ " YOU HAVE %d DISABLED %s\n\n",
+ num_disabled,
+ num_disabled == 1 ? "TEST" : "TESTS");
+ }
+ // Ensure that Google Test output is printed before, e.g., heapchecker output.
+ fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+ TestEventRepeater() : forwarding_enabled_(true) {}
+ virtual ~TestEventRepeater();
+ void Append(TestEventListener *listener);
+ TestEventListener* Release(TestEventListener* listener);
+
+ // Controls whether events will be forwarded to listeners_. Set to false
+ // in death test child processes.
+ bool forwarding_enabled() const { return forwarding_enabled_; }
+ void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+ virtual void OnTestProgramStart(const UnitTest& unit_test);
+ virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration);
+ virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test);
+ virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test);
+ virtual void OnTestCaseStart(const TestCase& test_case);
+ virtual void OnTestStart(const TestInfo& test_info);
+ virtual void OnTestPartResult(const TestPartResult& result);
+ virtual void OnTestEnd(const TestInfo& test_info);
+ virtual void OnTestCaseEnd(const TestCase& test_case);
+ virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test);
+ virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test);
+ virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+ virtual void OnTestProgramEnd(const UnitTest& unit_test);
+
+ private:
+ // Controls whether events will be forwarded to listeners_. Set to false
+ // in death test child processes.
+ bool forwarding_enabled_;
+ // The list of listeners that receive events.
+ std::vector<TestEventListener*> listeners_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+ ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+ listeners_.push_back(listener);
+}
+
+// TODO(vladl@google.com): Factor the search functionality into Vector::Find.
+TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+ for (size_t i = 0; i < listeners_.size(); ++i) {
+ if (listeners_[i] == listener) {
+ listeners_.erase(listeners_.begin() + i);
+ return listener;
+ }
+ }
+
+ return NULL;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+ if (forwarding_enabled_) { \
+ for (size_t i = 0; i < listeners_.size(); i++) { \
+ listeners_[i]->Name(parameter); \
+ } \
+ } \
+}
+// This defines a member that forwards the call to all listeners in reverse
+// order.
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
+void TestEventRepeater::Name(const Type& parameter) { \
+ if (forwarding_enabled_) { \
+ for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) { \
+ listeners_[i]->Name(parameter); \
+ } \
+ } \
+}
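+// For example, GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) below defines
+// TestEventRepeater::OnTestStart(const TestInfo&), which forwards the event
+// to every registered listener in registration order; the REVERSE variant
+// iterates over the listeners backwards.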
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test,
+ int iteration) {
+ if (forwarding_enabled_) {
+ for (size_t i = 0; i < listeners_.size(); i++) {
+ listeners_[i]->OnTestIterationStart(unit_test, iteration);
+ }
+ }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test,
+ int iteration) {
+ if (forwarding_enabled_) {
+ for (int i = static_cast<int>(listeners_.size()) - 1; i >= 0; i--) {
+ listeners_[i]->OnTestIterationEnd(unit_test, iteration);
+ }
+ }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+ explicit XmlUnitTestResultPrinter(const char* output_file);
+
+ virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration);
+
+ private:
+ // Is c a whitespace character that is normalized to a space character
+ // when it appears in an XML attribute value?
+ static bool IsNormalizableWhitespace(char c) {
+ return c == 0x9 || c == 0xA || c == 0xD;
+ }
+
+ // May c appear in a well-formed XML document?
+ static bool IsValidXmlCharacter(char c) {
+ return IsNormalizableWhitespace(c) || c >= 0x20;
+ }
+
+ // Returns an XML-escaped copy of the input string str. If
+ // is_attribute is true, the text is meant to appear as an attribute
+ // value, and normalizable whitespace is preserved by replacing it
+ // with character references.
+ static std::string EscapeXml(const std::string& str, bool is_attribute);
+
+ // Returns the given string with all characters invalid in XML removed.
+ static std::string RemoveInvalidXmlCharacters(const std::string& str);
+
+ // Convenience wrapper around EscapeXml when str is an attribute value.
+ static std::string EscapeXmlAttribute(const std::string& str) {
+ return EscapeXml(str, true);
+ }
+
+ // Convenience wrapper around EscapeXml when str is not an attribute value.
+ static std::string EscapeXmlText(const char* str) {
+ return EscapeXml(str, false);
+ }
+
+ // Verifies that the given attribute belongs to the given element and
+ // streams the attribute as XML.
+ static void OutputXmlAttribute(std::ostream* stream,
+ const std::string& element_name,
+ const std::string& name,
+ const std::string& value);
+
+ // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+ static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
+
+ // Streams an XML representation of a TestInfo object.
+ static void OutputXmlTestInfo(::std::ostream* stream,
+ const char* test_case_name,
+ const TestInfo& test_info);
+
+ // Prints an XML representation of a TestCase object
+ static void PrintXmlTestCase(::std::ostream* stream,
+ const TestCase& test_case);
+
+ // Prints an XML summary of unit_test to output stream out.
+ static void PrintXmlUnitTest(::std::ostream* stream,
+ const UnitTest& unit_test);
+
+ // Produces a string representing the test properties in a result as space
+ // delimited XML attributes based on the property key="value" pairs.
+ // When the std::string is not empty, it includes a space at the beginning,
+ // to delimit this attribute from prior attributes.
+ static std::string TestPropertiesAsXmlAttributes(const TestResult& result);
+
+ // The output file.
+ const std::string output_file_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file)
+ : output_file_(output_file) {
+  if (output_file_.empty()) {
+    fprintf(stderr, "XML output file may not be empty\n");
+ fflush(stderr);
+ exit(EXIT_FAILURE);
+ }
+}
+
+// Called after the unit test ends.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+ int /*iteration*/) {
+ FILE* xmlout = NULL;
+ FilePath output_file(output_file_);
+ FilePath output_dir(output_file.RemoveFileName());
+
+ if (output_dir.CreateDirectoriesRecursively()) {
+ xmlout = posix::FOpen(output_file_.c_str(), "w");
+ }
+ if (xmlout == NULL) {
+ // TODO(wan): report the reason of the failure.
+ //
+ // We don't do it for now as:
+ //
+ // 1. There is no urgent need for it.
+ // 2. It's a bit involved to make the errno variable thread-safe on
+ // all three operating systems (Linux, Windows, and Mac OS).
+ // 3. To interpret the meaning of errno in a thread-safe way,
+ // we need the strerror_r() function, which is not available on
+ // Windows.
+ fprintf(stderr,
+ "Unable to open file \"%s\"\n",
+ output_file_.c_str());
+ fflush(stderr);
+ exit(EXIT_FAILURE);
+ }
+ std::stringstream stream;
+ PrintXmlUnitTest(&stream, unit_test);
+ fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+ fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str. If is_attribute
+// is true, the text is meant to appear as an attribute value, and
+// normalizable whitespace is preserved by replacing it with character
+// references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
+// TODO(wan): It might be nice to have a minimally invasive, human-readable
+// escaping scheme for invalid characters, rather than dropping them.
+std::string XmlUnitTestResultPrinter::EscapeXml(
+ const std::string& str, bool is_attribute) {
+ Message m;
+
+ for (size_t i = 0; i < str.size(); ++i) {
+ const char ch = str[i];
+ switch (ch) {
+ case '<':
+ m << "&lt;";
+ break;
+ case '>':
+ m << "&gt;";
+ break;
+ case '&':
+ m << "&amp;";
+ break;
+ case '\'':
+ if (is_attribute)
+ m << "&apos;";
+ else
+ m << '\'';
+ break;
+ case '"':
+ if (is_attribute)
+ m << "&quot;";
+ else
+ m << '"';
+ break;
+ default:
+ if (IsValidXmlCharacter(ch)) {
+ if (is_attribute && IsNormalizableWhitespace(ch))
+ m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+ << ";";
+ else
+ m << ch;
+ }
+ break;
+ }
+ }
+
+ return m.GetString();
+}
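+// For example, EscapeXml("<\"1\">", true) returns "&lt;&quot;1&quot;&gt;",
+// while EscapeXml("<\"1\">", false) keeps the quote characters and returns
+// "&lt;\"1\"&gt;".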
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+ const std::string& str) {
+ std::string output;
+ output.reserve(str.size());
+ for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
+ if (IsValidXmlCharacter(*it))
+ output.push_back(*it);
+
+ return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests"> <-- corresponds to a UnitTest object
+// <testsuite name="testcase-name"> <-- corresponds to a TestCase object
+// <testcase name="test-name"> <-- corresponds to a TestInfo object
+// <failure message="...">...</failure>
+// <failure message="...">...</failure>
+// <failure message="...">...</failure>
+// <-- individual assertion failures
+// </testcase>
+// </testsuite>
+// </testsuites>
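+//
+// A minimal illustration of the emitted file (attribute values elided):
+//
+//   <?xml version="1.0" encoding="UTF-8"?>
+//   <testsuites tests="..." failures="..." ... name="AllTests">
+//     <testsuite name="FooTest" tests="..." ...>
+//       <testcase name="Bar" status="run" ... classname="FooTest" />
+//     </testsuite>
+//   </testsuites>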
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+ ::std::stringstream ss;
+ ss << (static_cast<double>(ms) * 1e-3);
+ return ss.str();
+}
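+// For example, FormatTimeInMillisAsSeconds(1500) returns "1.5" and
+// FormatTimeInMillisAsSeconds(3) returns "0.003".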
+
+static bool PortableLocaltime(time_t seconds, struct tm* out) {
+#if defined(_MSC_VER)
+ return localtime_s(out, &seconds) == 0;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+ // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+ // Windows' localtime(), which has a thread-local tm buffer.
+ struct tm* tm_ptr = localtime(&seconds); // NOLINT
+ if (tm_ptr == NULL)
+ return false;
+ *out = *tm_ptr;
+ return true;
+#else
+ return localtime_r(&seconds, out) != NULL;
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+ struct tm time_struct;
+ if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+ return "";
+ // YYYY-MM-DDThh:mm:ss
+ return StreamableToString(time_struct.tm_year + 1900) + "-" +
+ String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+ String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+ String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+ String::FormatIntWidth2(time_struct.tm_min) + ":" +
+ String::FormatIntWidth2(time_struct.tm_sec);
+}
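+// For example, an input of 0 ms yields "1970-01-01T00:00:00" when the local
+// timezone is UTC; other timezones shift the result accordingly, because
+// PortableLocaltime() works in local time.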
+
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+ const char* data) {
+ const char* segment = data;
+ *stream << "<![CDATA[";
+ for (;;) {
+ const char* const next_segment = strstr(segment, "]]>");
+ if (next_segment != NULL) {
+ stream->write(
+ segment, static_cast<std::streamsize>(next_segment - segment));
+ *stream << "]]>]]&gt;<![CDATA[";
+ segment = next_segment + strlen("]]>");
+ } else {
+ *stream << segment;
+ break;
+ }
+ }
+ *stream << "]]>";
+}
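+// For example, the data "a]]>b" is written as
+// "<![CDATA[a]]>]]&gt;<![CDATA[b]]>", splitting the section so that the
+// embedded "]]>" cannot terminate it prematurely.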
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+ std::ostream* stream,
+ const std::string& element_name,
+ const std::string& name,
+ const std::string& value) {
+ const std::vector<std::string>& allowed_names =
+ GetReservedAttributesForElement(element_name);
+
+ GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+ allowed_names.end())
+ << "Attribute " << name << " is not allowed for element <" << element_name
+ << ">.";
+
+ *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+ const char* test_case_name,
+ const TestInfo& test_info) {
+ const TestResult& result = *test_info.result();
+ const std::string kTestcase = "testcase";
+
+ *stream << " <testcase";
+ OutputXmlAttribute(stream, kTestcase, "name", test_info.name());
+
+ if (test_info.value_param() != NULL) {
+ OutputXmlAttribute(stream, kTestcase, "value_param",
+ test_info.value_param());
+ }
+ if (test_info.type_param() != NULL) {
+ OutputXmlAttribute(stream, kTestcase, "type_param", test_info.type_param());
+ }
+
+ OutputXmlAttribute(stream, kTestcase, "status",
+ test_info.should_run() ? "run" : "notrun");
+ OutputXmlAttribute(stream, kTestcase, "time",
+ FormatTimeInMillisAsSeconds(result.elapsed_time()));
+ OutputXmlAttribute(stream, kTestcase, "classname", test_case_name);
+ *stream << TestPropertiesAsXmlAttributes(result);
+
+ int failures = 0;
+ for (int i = 0; i < result.total_part_count(); ++i) {
+ const TestPartResult& part = result.GetTestPartResult(i);
+ if (part.failed()) {
+ if (++failures == 1) {
+ *stream << ">\n";
+ }
+ const string location = internal::FormatCompilerIndependentFileLocation(
+ part.file_name(), part.line_number());
+ const string summary = location + "\n" + part.summary();
+ *stream << " <failure message=\""
+ << EscapeXmlAttribute(summary.c_str())
+ << "\" type=\"\">";
+ const string detail = location + "\n" + part.message();
+ OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+ *stream << "</failure>\n";
+ }
+ }
+
+ if (failures == 0)
+ *stream << " />\n";
+ else
+ *stream << " </testcase>\n";
+}
+
+// Prints an XML representation of a TestCase object
+void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream,
+ const TestCase& test_case) {
+ const std::string kTestsuite = "testsuite";
+ *stream << " <" << kTestsuite;
+ OutputXmlAttribute(stream, kTestsuite, "name", test_case.name());
+ OutputXmlAttribute(stream, kTestsuite, "tests",
+ StreamableToString(test_case.reportable_test_count()));
+ OutputXmlAttribute(stream, kTestsuite, "failures",
+ StreamableToString(test_case.failed_test_count()));
+ OutputXmlAttribute(
+ stream, kTestsuite, "disabled",
+ StreamableToString(test_case.reportable_disabled_test_count()));
+ OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+ OutputXmlAttribute(stream, kTestsuite, "time",
+ FormatTimeInMillisAsSeconds(test_case.elapsed_time()));
+ *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result())
+ << ">\n";
+
+ for (int i = 0; i < test_case.total_test_count(); ++i) {
+ if (test_case.GetTestInfo(i)->is_reportable())
+ OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i));
+ }
+ *stream << " </" << kTestsuite << ">\n";
+}
+
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
+ const UnitTest& unit_test) {
+ const std::string kTestsuites = "testsuites";
+
+ *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+ *stream << "<" << kTestsuites;
+
+ OutputXmlAttribute(stream, kTestsuites, "tests",
+ StreamableToString(unit_test.reportable_test_count()));
+ OutputXmlAttribute(stream, kTestsuites, "failures",
+ StreamableToString(unit_test.failed_test_count()));
+ OutputXmlAttribute(
+ stream, kTestsuites, "disabled",
+ StreamableToString(unit_test.reportable_disabled_test_count()));
+ OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+ OutputXmlAttribute(
+ stream, kTestsuites, "timestamp",
+ FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+ OutputXmlAttribute(stream, kTestsuites, "time",
+ FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+
+ if (GTEST_FLAG(shuffle)) {
+ OutputXmlAttribute(stream, kTestsuites, "random_seed",
+ StreamableToString(unit_test.random_seed()));
+ }
+
+ *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+ OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+ *stream << ">\n";
+
+ for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+ if (unit_test.GetTestCase(i)->reportable_test_count() > 0)
+ PrintXmlTestCase(stream, *unit_test.GetTestCase(i));
+ }
+ *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+ const TestResult& result) {
+ Message attributes;
+ for (int i = 0; i < result.test_property_count(); ++i) {
+ const TestProperty& property = result.GetTestProperty(i);
+ attributes << " " << property.key() << "="
+ << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+ }
+ return attributes.GetString();
+}
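+// For example, a TestResult holding the properties {"key_1": "1",
+// "key_2": "2"} produces the string " key_1=\"1\" key_2=\"2\"" (note the
+// leading space).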
+
+// End XmlUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D". This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
+string StreamingListener::UrlEncode(const char* str) {
+ string result;
+ result.reserve(strlen(str) + 1);
+ for (char ch = *str; ch != '\0'; ch = *++str) {
+ switch (ch) {
+ case '%':
+ case '=':
+ case '&':
+ case '\n':
+ result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+ break;
+ default:
+ result.push_back(ch);
+ break;
+ }
+ }
+ return result;
+}
+
+void StreamingListener::SocketWriter::MakeConnection() {
+ GTEST_CHECK_(sockfd_ == -1)
+ << "MakeConnection() can't be called when there is already a connection.";
+
+ addrinfo hints;
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses.
+ hints.ai_socktype = SOCK_STREAM;
+ addrinfo* servinfo = NULL;
+
+ // Use the getaddrinfo() to get a linked list of IP addresses for
+ // the given host name.
+ const int error_num = getaddrinfo(
+ host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+ if (error_num != 0) {
+ GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+ << gai_strerror(error_num);
+ }
+
+ // Loop through all the results and connect to the first we can.
+ for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL;
+ cur_addr = cur_addr->ai_next) {
+ sockfd_ = socket(
+ cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+ if (sockfd_ != -1) {
+ // Connect the client socket to the server socket.
+ if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+ close(sockfd_);
+ sockfd_ = -1;
+ }
+ }
+ }
+
+ freeaddrinfo(servinfo); // all done with this structure
+
+ if (sockfd_ == -1) {
+ GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+ << host_name_ << ":" << port_num_;
+ }
+}
+
+// End of class StreamingListener
+#endif // GTEST_CAN_STREAM_RESULTS_
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+ScopedTrace::ScopedTrace(const char* file, int line, const Message& message)
+ GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+ TraceInfo trace;
+ trace.file = file;
+ trace.line = line;
+ trace.message = message.GetString();
+
+ UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace()
+ GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+ UnitTest::GetInstance()->PopGTestTrace();
+}
+
+
+// class OsStackTraceGetter
+
+const char* const OsStackTraceGetterInterface::kElidedFramesMarker =
+ "... " GTEST_NAME_ " internal frames ...";
+
+string OsStackTraceGetter::CurrentStackTrace(int /*max_depth*/,
+ int /*skip_count*/) {
+ return "";
+}
+
+void OsStackTraceGetter::UponLeavingGTest() {}
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+ explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
+ : premature_exit_filepath_(premature_exit_filepath) {
+ // If a path to the premature-exit file is specified...
+ if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') {
+ // create the file with a single "0" character in it. I/O
+ // errors are ignored as there's nothing better we can do and we
+ // don't want to fail the test because of this.
+      FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+      if (pfile != NULL) {
+        fwrite("0", 1, 1, pfile);
+        fclose(pfile);
+      }
+ }
+ }
+
+ ~ScopedPrematureExitFile() {
+ if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') {
+ remove(premature_exit_filepath_);
+ }
+ }
+
+ private:
+ const char* const premature_exit_filepath_;
+
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+};
+
+} // namespace internal
+
+// class TestEventListeners
+
+TestEventListeners::TestEventListeners()
+ : repeater_(new internal::TestEventRepeater()),
+ default_result_printer_(NULL),
+ default_xml_generator_(NULL) {
+}
+
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Appends an event listener to the end of the list. Google Test takes
+// the ownership of the listener (i.e. it will delete the listener when
+// the test program finishes).
+void TestEventListeners::Append(TestEventListener* listener) {
+ repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it. It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener* TestEventListeners::Release(TestEventListener* listener) {
+ if (listener == default_result_printer_)
+ default_result_printer_ = NULL;
+ else if (listener == default_xml_generator_)
+ default_xml_generator_ = NULL;
+ return repeater_->Release(listener);
+}
+
+// Returns repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener* TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) {
+ if (default_result_printer_ != listener) {
+ // It is an error to pass this method a listener that is already in the
+ // list.
+ delete Release(default_result_printer_);
+ default_result_printer_ = listener;
+ if (listener != NULL)
+ Append(listener);
+ }
+}
+
+// Sets the default_xml_generator attribute to the provided listener. The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) {
+ if (default_xml_generator_ != listener) {
+ // It is an error to pass this method a listener that is already in the
+ // list.
+ delete Release(default_xml_generator_);
+ default_xml_generator_ = listener;
+ if (listener != NULL)
+ Append(listener);
+ }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+ return repeater_->forwarding_enabled();
+}
+
+void TestEventListeners::SuppressEventForwarding() {
+ repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object. The first time this method is
+// called, a UnitTest object is constructed and returned. Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest* UnitTest::GetInstance() {
+ // When compiled with MSVC 7.1 in optimized mode, destroying the
+ // UnitTest object upon exiting the program messes up the exit code,
+ // causing successful tests to appear failed. We have to use a
+ // different implementation in this case to bypass the compiler bug.
+ // This implementation makes the compiler happy, at the cost of
+ // leaking the UnitTest object.
+
+ // CodeGear C++Builder insists on a public destructor for the
+ // default implementation. Use this implementation to keep good OO
+ // design with private destructor.
+
+#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+ static UnitTest* const instance = new UnitTest;
+ return instance;
+#else
+ static UnitTest instance;
+ return &instance;
+#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__)
+}
+
+// Gets the number of successful test cases.
+int UnitTest::successful_test_case_count() const {
+ return impl()->successful_test_case_count();
+}
+
+// Gets the number of failed test cases.
+int UnitTest::failed_test_case_count() const {
+ return impl()->failed_test_case_count();
+}
+
+// Gets the number of all test cases.
+int UnitTest::total_test_case_count() const {
+ return impl()->total_test_case_count();
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTest::test_case_to_run_count() const {
+ return impl()->test_case_to_run_count();
+}
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+ return impl()->successful_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+ return impl()->reportable_disabled_test_count();
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+ return impl()->disabled_test_count();
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+ return impl()->reportable_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+ return impl()->start_timestamp();
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+ return impl()->elapsed_time();
+}
+
+// Returns true iff the unit test passed (i.e. all test cases passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true iff the unit test failed (i.e. some test case failed
+// or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+const TestCase* UnitTest::GetTestCase(int i) const {
+ return impl()->GetTestCase(i);
+}
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test cases.
+const TestResult& UnitTest::ad_hoc_test_result() const {
+ return *impl()->ad_hoc_test_result();
+}
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+TestCase* UnitTest::GetMutableTestCase(int i) {
+ return impl()->GetMutableTestCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners& UnitTest::listeners() {
+ return *impl()->listeners();
+}
+
+// Registers and returns a global test environment. When a test
+// program is run, all global test environments will be set up in the
+// order they were registered. After all tests in the program have
+// finished, all global test environments will be torn down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+ if (env == NULL) {
+ return NULL;
+ }
+
+ impl_->environments().push_back(env);
+ return env;
+}
+
+// Adds a TestPartResult to the current TestResult object. All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results. The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(
+ TestPartResult::Type result_type,
+ const char* file_name,
+ int line_number,
+ const std::string& message,
+ const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+ Message msg;
+ msg << message;
+
+ internal::MutexLock lock(&mutex_);
+ if (impl_->gtest_trace_stack().size() > 0) {
+ msg << "\n" << GTEST_NAME_ << " trace:";
+
+ for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+ i > 0; --i) {
+ const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+ msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+ << " " << trace.message;
+ }
+ }
+
+ if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+ msg << internal::kStackTraceMarker << os_stack_trace;
+ }
+
+ const TestPartResult result =
+ TestPartResult(result_type, file_name, line_number,
+ msg.GetString().c_str());
+ impl_->GetTestPartResultReporterForCurrentThread()->
+ ReportTestPartResult(result);
+
+ if (result_type != TestPartResult::kSuccess) {
+ // gtest_break_on_failure takes precedence over
+ // gtest_throw_on_failure. This allows a user to set the latter
+ // in the code (perhaps in order to use Google Test assertions
+ // with another testing framework) and specify the former on the
+ // command line for debugging.
+ if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+ // Using DebugBreak on Windows allows gtest to still break into a debugger
+ // when a failure happens and both the --gtest_break_on_failure and
+ // the --gtest_catch_exceptions flags are specified.
+ DebugBreak();
+#else
+ // Dereference NULL through a volatile pointer to prevent the compiler
+ // from removing. We use this rather than abort() or __builtin_trap() for
+ // portability: Symbian doesn't implement abort() well, and some debuggers
+ // don't correctly trap abort().
+ *static_cast<volatile int*>(NULL) = 1;
+#endif // GTEST_OS_WINDOWS
+ } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+ throw internal::GoogleTestFailureException(result);
+#else
+ // We cannot call abort() as it generates a pop-up in debug mode
+ // that cannot be suppressed in VC 7.1 or below.
+ exit(1);
+#endif
+ }
+ }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestCase's ad_hoc_test_result_ when invoked
+// from SetUpTestCase or TearDownTestCase, or to the global property set
+// when invoked elsewhere. If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string& key,
+ const std::string& value) {
+ impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+ const bool in_death_test_child_process =
+ internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+ // Google Test implements this protocol for catching that a test
+ // program exits before returning control to Google Test:
+ //
+ // 1. Upon start, Google Test creates a file whose absolute path
+ // is specified by the environment variable
+ // TEST_PREMATURE_EXIT_FILE.
+ // 2. When Google Test has finished its work, it deletes the file.
+ //
+ // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+ // running a Google-Test-based test program and check the existence
+ // of the file at the end of the test execution to see if it has
+ // exited prematurely.
+
+ // If we are in the child process of a death test, don't
+ // create/delete the premature exit file, as doing so is unnecessary
+ // and will confuse the parent process. Otherwise, create/delete
+ // the file upon entering/leaving this function. If the program
+ // somehow exits before this function has a chance to return, the
+ // premature-exit file will be left undeleted, causing a test runner
+ // that understands the premature-exit-file protocol to report the
+ // test as having failed.
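+  // A test runner could use the protocol roughly as follows (illustrative
+  // shell commands, not part of Google Test itself):
+  //   TEST_PREMATURE_EXIT_FILE=/tmp/premature_exit ./my_test
+  //   test -f /tmp/premature_exit && echo "test program exited prematurely"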
+ const internal::ScopedPrematureExitFile premature_exit_file(
+ in_death_test_child_process ?
+ NULL : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+ // Captures the value of GTEST_FLAG(catch_exceptions). This value will be
+ // used for the duration of the program.
+ impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_HAS_SEH
+ // Either the user wants Google Test to catch exceptions thrown by the
+ // tests or this is executing in the context of death test child
+ // process. In either case the user does not want to see pop-up dialogs
+ // about crashes - they are expected.
+ if (impl()->catch_exceptions() || in_death_test_child_process) {
+# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+ // SetErrorMode doesn't exist on CE.
+ SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+ SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+# endif // !GTEST_OS_WINDOWS_MOBILE
+
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with abort(). On Windows,
+    // abort() can show a dialog with a warning message. This forces the
+ // abort message to go to stderr instead.
+ _set_error_mode(_OUT_TO_STDERR);
+# endif
+
+# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+ // In the debug version, Visual Studio pops up a separate dialog
+ // offering a choice to debug the aborted program. We need to suppress
+ // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+ // executed. Google Test will notify the user of any unexpected
+ // failure via stderr.
+ //
+ // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
+ // Users of prior VC versions shall suffer the agony and pain of
+ // clicking through the countless debug dialogs.
+ // TODO(vladl@google.com): find a way to suppress the abort dialog() in the
+ // debug mode when compiled with VC 7.1 or lower.
+ if (!GTEST_FLAG(break_on_failure))
+ _set_abort_behavior(
+ 0x0, // Clear the following flags:
+ _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump.
+# endif
+ }
+#endif // GTEST_HAS_SEH
+
+ return internal::HandleExceptionsInMethodIfSupported(
+ impl(),
+ &internal::UnitTestImpl::RunAllTests,
+ "auxiliary test code (environments or event listeners)") ? 0 : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char* UnitTest::original_working_dir() const {
+ return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestCase object for the test that's currently running,
+// or NULL if no test is running.
+const TestCase* UnitTest::current_test_case() const
+ GTEST_LOCK_EXCLUDED_(mutex_) {
+ internal::MutexLock lock(&mutex_);
+ return impl_->current_test_case();
+}
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+const TestInfo* UnitTest::current_test_info() const
+ GTEST_LOCK_EXCLUDED_(mutex_) {
+ internal::MutexLock lock(&mutex_);
+ return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+#if GTEST_HAS_PARAM_TEST
+// Returns ParameterizedTestCaseRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+internal::ParameterizedTestCaseRegistry&
+ UnitTest::parameterized_test_registry()
+ GTEST_LOCK_EXCLUDED_(mutex_) {
+ return impl_->parameterized_test_registry();
+}
+#endif // GTEST_HAS_PARAM_TEST
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() {
+ impl_ = new internal::UnitTestImpl(this);
+}
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() {
+ delete impl_;
+}
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
+ GTEST_LOCK_EXCLUDED_(mutex_) {
+ internal::MutexLock lock(&mutex_);
+ impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+void UnitTest::PopGTestTrace()
+ GTEST_LOCK_EXCLUDED_(mutex_) {
+ internal::MutexLock lock(&mutex_);
+ impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+ : parent_(parent),
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
+ default_global_test_part_result_reporter_(this),
+ default_per_thread_test_part_result_reporter_(this),
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+ global_test_part_result_repoter_(
+ &default_global_test_part_result_reporter_),
+ per_thread_test_part_result_reporter_(
+ &default_per_thread_test_part_result_reporter_),
+#if GTEST_HAS_PARAM_TEST
+ parameterized_test_registry_(),
+ parameterized_tests_registered_(false),
+#endif // GTEST_HAS_PARAM_TEST
+ last_death_test_case_(-1),
+ current_test_case_(NULL),
+ current_test_info_(NULL),
+ ad_hoc_test_result_(),
+ os_stack_trace_getter_(NULL),
+ post_flag_parse_init_performed_(false),
+ random_seed_(0), // Will be overridden by the flag before first use.
+ random_(0), // Will be reseeded before first use.
+ start_timestamp_(0),
+ elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+ death_test_factory_(new DefaultDeathTestFactory),
+#endif
+ // Will be overridden by the flag before first use.
+ catch_exceptions_(false) {
+ listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+ // Deletes every TestCase.
+ ForEach(test_cases_, internal::Delete<TestCase>);
+
+ // Deletes every Environment.
+ ForEach(environments_, internal::Delete<Environment>);
+
+ delete os_stack_trace_getter_;
+}
+
+// Adds a TestProperty to the current TestResult object when invoked in the
+// context of a test, to the current test case's ad_hoc_test_result when
+// invoked from SetUpTestCase/TearDownTestCase, or to the global property set
+// otherwise. If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty& test_property) {
+ std::string xml_element;
+ TestResult* test_result; // TestResult appropriate for property recording.
+
+ if (current_test_info_ != NULL) {
+ xml_element = "testcase";
+ test_result = &(current_test_info_->result_);
+ } else if (current_test_case_ != NULL) {
+ xml_element = "testsuite";
+ test_result = &(current_test_case_->ad_hoc_test_result_);
+ } else {
+ xml_element = "testsuites";
+ test_result = &ad_hoc_test_result_;
+ }
+ test_result->RecordProperty(xml_element, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+ if (internal_run_death_test_flag_.get() != NULL)
+ listeners()->SuppressEventForwarding();
+}
+#endif // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+ const std::string& output_format = UnitTestOptions::GetOutputFormat();
+ if (output_format == "xml") {
+ listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+ UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+ } else if (output_format != "") {
+ printf("WARNING: unrecognized output format \"%s\" ignored.\n",
+ output_format.c_str());
+ fflush(stdout);
+ }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in string form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+ const std::string& target = GTEST_FLAG(stream_result_to);
+ if (!target.empty()) {
+ const size_t pos = target.find(':');
+ if (pos != std::string::npos) {
+ listeners()->Append(new StreamingListener(target.substr(0, pos),
+ target.substr(pos+1)));
+ } else {
+ printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
+ target.c_str());
+ fflush(stdout);
+ }
+ }
+}
+#endif // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly. It is called from InitGoogleTest after the call
+// to ParseGoogleTestFlagsOnly. If a user neglects to call InitGoogleTest,
+// this function is also called from RunAllTests. Since this function can be
+// called more than once, it has to be idempotent.
+void UnitTestImpl::PostFlagParsingInit() {
+ // Ensures that this function does not execute more than once.
+ if (!post_flag_parse_init_performed_) {
+ post_flag_parse_init_performed_ = true;
+
+#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+ // Register to send notifications about key process state changes.
+ listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_());
+#endif // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+
+#if GTEST_HAS_DEATH_TEST
+ InitDeathTestSubprocessControlInfo();
+ SuppressTestEventsIfInSubprocess();
+#endif // GTEST_HAS_DEATH_TEST
+
+ // Registers parameterized tests. This makes parameterized tests
+ // available to the UnitTest reflection API without running
+ // RUN_ALL_TESTS.
+ RegisterParameterizedTests();
+
+ // Configures listeners for XML output. This makes it possible for users
+ // to shut down the default XML output before invoking RUN_ALL_TESTS.
+ ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+ // Configures listeners for streaming test results to the specified server.
+ ConfigureStreamingOutput();
+#endif // GTEST_CAN_STREAM_RESULTS_
+ }
+}
+
+// A predicate that checks the name of a TestCase against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only. We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestCaseNameIs is copyable.
+class TestCaseNameIs {
+ public:
+ // Constructor.
+ explicit TestCaseNameIs(const std::string& name)
+ : name_(name) {}
+
+ // Returns true iff the name of test_case matches name_.
+ bool operator()(const TestCase* test_case) const {
+ return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0;
+ }
+
+ private:
+ std::string name_;
+};
+
+// Finds and returns a TestCase with the given name. If one doesn't
+// exist, creates one and returns it. It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+// test_case_name: name of the test case
+// type_param: the name of the test case's type parameter, or NULL if
+// this is not a typed or a type-parameterized test case.
+// set_up_tc: pointer to the function that sets up the test case
+// tear_down_tc: pointer to the function that tears down the test case
+TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
+ const char* type_param,
+ Test::SetUpTestCaseFunc set_up_tc,
+ Test::TearDownTestCaseFunc tear_down_tc) {
+ // Can we find a TestCase with the given name?
+ const std::vector<TestCase*>::const_iterator test_case =
+ std::find_if(test_cases_.begin(), test_cases_.end(),
+ TestCaseNameIs(test_case_name));
+
+ if (test_case != test_cases_.end())
+ return *test_case;
+
+ // No. Let's create one.
+ TestCase* const new_test_case =
+ new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc);
+
+ // Is this a death test case?
+ if (internal::UnitTestOptions::MatchesFilter(test_case_name,
+ kDeathTestCaseFilter)) {
+ // Yes. Inserts the test case after the last death test case
+ // defined so far. This only works when the test cases haven't
+ // been shuffled. Otherwise we may end up running a death test
+ // after a non-death test.
+ ++last_death_test_case_;
+ test_cases_.insert(test_cases_.begin() + last_death_test_case_,
+ new_test_case);
+ } else {
+ // No. Appends to the end of the list.
+ test_cases_.push_back(new_test_case);
+ }
+
+ test_case_indices_.push_back(static_cast<int>(test_case_indices_.size()));
+ return new_test_case;
+}
+
+// Helpers for setting up / tearing down the given environment. They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment* env) { env->SetUp(); }
+static void TearDownEnvironment(Environment* env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful. If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+ // Makes sure InitGoogleTest() was called.
+ if (!GTestIsInitialized()) {
+ printf("%s",
+ "\nThis test program did NOT call ::testing::InitGoogleTest "
+ "before calling RUN_ALL_TESTS(). Please fix it.\n");
+ return false;
+ }
+
+ // Do not run any test if the --help flag was specified.
+ if (g_help_flag)
+ return true;
+
+ // Repeats the call to the post-flag parsing initialization in case the
+ // user didn't call InitGoogleTest.
+ PostFlagParsingInit();
+
+ // Even if sharding is not on, test runners may want to use the
+ // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+ // protocol.
+ internal::WriteToShardStatusFileIfNeeded();
+
+ // True iff we are in a subprocess for running a thread-safe-style
+ // death test.
+ bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+ in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL);
+# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+ if (in_subprocess_for_death_test) {
+ GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
+ }
+# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif // GTEST_HAS_DEATH_TEST
+
+ const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+ in_subprocess_for_death_test);
+
+ // Compares the full test names with the filter to decide which
+ // tests to run.
+ const bool has_tests_to_run = FilterTests(should_shard
+ ? HONOR_SHARDING_PROTOCOL
+ : IGNORE_SHARDING_PROTOCOL) > 0;
+
+ // Lists the tests and exits if the --gtest_list_tests flag was specified.
+ if (GTEST_FLAG(list_tests)) {
+ // This must be called *after* FilterTests() has been called.
+ ListTestsMatchingFilter();
+ return true;
+ }
+
+ random_seed_ = GTEST_FLAG(shuffle) ?
+ GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+ // True iff at least one test has failed.
+ bool failed = false;
+
+ TestEventListener* repeater = listeners()->repeater();
+
+ start_timestamp_ = GetTimeInMillis();
+ repeater->OnTestProgramStart(*parent_);
+
+ // How many times to repeat the tests? We don't want to repeat them
+ // when we are inside the subprocess of a death test.
+ const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+ // Repeats forever if the repeat count is negative.
+ const bool forever = repeat < 0;
+ for (int i = 0; forever || i != repeat; i++) {
+ // We want to preserve failures generated by ad-hoc test
+ // assertions executed before RUN_ALL_TESTS().
+ ClearNonAdHocTestResult();
+
+ const TimeInMillis start = GetTimeInMillis();
+
+ // Shuffles test cases and tests if requested.
+ if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+ random()->Reseed(random_seed_);
+ // This should be done before calling OnTestIterationStart(),
+ // such that a test event listener can see the actual test order
+ // in the event.
+ ShuffleTests();
+ }
+
+ // Tells the unit test event listeners that the tests are about to start.
+ repeater->OnTestIterationStart(*parent_, i);
+
+ // Runs each test case if there is at least one test to run.
+ if (has_tests_to_run) {
+ // Sets up all environments beforehand.
+ repeater->OnEnvironmentsSetUpStart(*parent_);
+ ForEach(environments_, SetUpEnvironment);
+ repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+ // Runs the tests only if there was no fatal failure during global
+ // set-up.
+ if (!Test::HasFatalFailure()) {
+ for (int test_index = 0; test_index < total_test_case_count();
+ test_index++) {
+ GetMutableTestCase(test_index)->Run();
+ }
+ }
+
+ // Tears down all environments in reverse order afterwards.
+ repeater->OnEnvironmentsTearDownStart(*parent_);
+ std::for_each(environments_.rbegin(), environments_.rend(),
+ TearDownEnvironment);
+ repeater->OnEnvironmentsTearDownEnd(*parent_);
+ }
+
+ elapsed_time_ = GetTimeInMillis() - start;
+
+ // Tells the unit test event listener that the tests have just finished.
+ repeater->OnTestIterationEnd(*parent_, i);
+
+ // Gets the result and clears it.
+ if (!Passed()) {
+ failed = true;
+ }
+
+ // Restores the original test order after the iteration. This
+ // allows the user to quickly repro a failure that happens in the
+ // N-th iteration without repeating the first (N - 1) iterations.
+ // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+ // case the user somehow changes the value of the flag somewhere
+ // (it's always safe to unshuffle the tests).
+ UnshuffleTests();
+
+ if (GTEST_FLAG(shuffle)) {
+ // Picks a new random seed for each iteration.
+ random_seed_ = GetNextRandomSeed(random_seed_);
+ }
+ }
+
+ repeater->OnTestProgramEnd(*parent_);
+
+ return !failed;
+}
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded() {
+ const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+ if (test_shard_file != NULL) {
+ FILE* const file = posix::FOpen(test_shard_file, "w");
+ if (file == NULL) {
+ ColoredPrintf(COLOR_RED,
+ "Could not write to the test shard status file \"%s\" "
+ "specified by the %s environment variable.\n",
+ test_shard_file, kTestShardStatusFile);
+ fflush(stdout);
+ exit(EXIT_FAILURE);
+ }
+ fclose(file);
+ }
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
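+//
+// For example, setting GTEST_TOTAL_SHARDS=3 and GTEST_SHARD_INDEX=1 enables
+// sharding and runs this process's share of the tests, while setting only one
+// of the two variables is reported as an error and aborts the program.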
+bool ShouldShard(const char* total_shards_env,
+ const char* shard_index_env,
+ bool in_subprocess_for_death_test) {
+ if (in_subprocess_for_death_test) {
+ return false;
+ }
+
+ const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+ const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+ if (total_shards == -1 && shard_index == -1) {
+ return false;
+ } else if (total_shards == -1 && shard_index != -1) {
+ const Message msg = Message()
+ << "Invalid environment variables: you have "
+ << kTestShardIndex << " = " << shard_index
+ << ", but have left " << kTestTotalShards << " unset.\n";
+ ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+ fflush(stdout);
+ exit(EXIT_FAILURE);
+ } else if (total_shards != -1 && shard_index == -1) {
+ const Message msg = Message()
+ << "Invalid environment variables: you have "
+ << kTestTotalShards << " = " << total_shards
+ << ", but have left " << kTestShardIndex << " unset.\n";
+ ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+ fflush(stdout);
+ exit(EXIT_FAILURE);
+ } else if (shard_index < 0 || shard_index >= total_shards) {
+ const Message msg = Message()
+ << "Invalid environment variables: we require 0 <= "
+ << kTestShardIndex << " < " << kTestTotalShards
+ << ", but you have " << kTestShardIndex << "=" << shard_index
+ << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+ ColoredPrintf(COLOR_RED, msg.GetString().c_str());
+ fflush(stdout);
+ exit(EXIT_FAILURE);
+ }
+
+ return total_shards > 1;
+}
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) {
+ const char* str_val = posix::GetEnv(var);
+ if (str_val == NULL) {
+ return default_val;
+ }
+
+ Int32 result;
+ if (!ParseInt32(Message() << "The value of environment variable " << var,
+ str_val, &result)) {
+ exit(EXIT_FAILURE);
+ }
+ return result;
+}
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
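+//
+// For example, with total_shards = 3 and shard_index = 1, the tests whose ids
+// are 1, 4, 7, ... are run on this shard.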
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+ return (test_id % total_shards) == shard_index;
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestCase and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide.
+// Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+ const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
+ Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
+ const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
+ Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
+
+ // num_runnable_tests is the number of tests that will
+ // run across all shards (i.e., match the filter and are not disabled).
+ // num_selected_tests is the number of tests to be run on
+ // this shard.
+ int num_runnable_tests = 0;
+ int num_selected_tests = 0;
+ for (size_t i = 0; i < test_cases_.size(); i++) {
+ TestCase* const test_case = test_cases_[i];
+ const std::string &test_case_name = test_case->name();
+ test_case->set_should_run(false);
+
+ for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+ TestInfo* const test_info = test_case->test_info_list()[j];
+ const std::string test_name(test_info->name());
+ // A test is disabled if test case name or test name matches
+ // kDisableTestFilter.
+ const bool is_disabled =
+ internal::UnitTestOptions::MatchesFilter(test_case_name,
+ kDisableTestFilter) ||
+ internal::UnitTestOptions::MatchesFilter(test_name,
+ kDisableTestFilter);
+ test_info->is_disabled_ = is_disabled;
+
+ const bool matches_filter =
+ internal::UnitTestOptions::FilterMatchesTest(test_case_name,
+ test_name);
+ test_info->matches_filter_ = matches_filter;
+
+ const bool is_runnable =
+ (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+ matches_filter;
+
+ const bool is_selected = is_runnable &&
+ (shard_tests == IGNORE_SHARDING_PROTOCOL ||
+ ShouldRunTestOnShard(total_shards, shard_index,
+ num_runnable_tests));
+
+ num_runnable_tests += is_runnable;
+ num_selected_tests += is_selected;
+
+ test_info->should_run_ = is_selected;
+ test_case->set_should_run(test_case->should_run() || is_selected);
+ }
+ }
+ return num_selected_tests;
+}
+
+// Prints the given C-string on a single line by replacing all '\n'
+// characters with string "\\n". If the output takes more than
+// max_length characters, only prints the first max_length characters
+// and "...".
+static void PrintOnOneLine(const char* str, int max_length) {
+ if (str != NULL) {
+ for (int i = 0; *str != '\0'; ++str) {
+ if (i >= max_length) {
+ printf("...");
+ break;
+ }
+ if (*str == '\n') {
+ printf("\\n");
+ i += 2;
+ } else {
+ printf("%c", *str);
+ ++i;
+ }
+ }
+ }
+}
+
+// Prints the names of the tests matching the user-specified filter flag.
+void UnitTestImpl::ListTestsMatchingFilter() {
+ // Print at most this many characters for each type/value parameter.
+ const int kMaxParamLength = 250;
+
+ for (size_t i = 0; i < test_cases_.size(); i++) {
+ const TestCase* const test_case = test_cases_[i];
+ bool printed_test_case_name = false;
+
+ for (size_t j = 0; j < test_case->test_info_list().size(); j++) {
+ const TestInfo* const test_info =
+ test_case->test_info_list()[j];
+ if (test_info->matches_filter_) {
+ if (!printed_test_case_name) {
+ printed_test_case_name = true;
+ printf("%s.", test_case->name());
+ if (test_case->type_param() != NULL) {
+ printf(" # %s = ", kTypeParamLabel);
+ // We print the type parameter on a single line to make
+ // the output easy to parse by a program.
+ PrintOnOneLine(test_case->type_param(), kMaxParamLength);
+ }
+ printf("\n");
+ }
+ printf(" %s", test_info->name());
+ if (test_info->value_param() != NULL) {
+ printf(" # %s = ", kValueParamLabel);
+ // We print the value parameter on a single line to make the
+ // output easy to parse by a program.
+ PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+ }
+ printf("\n");
+ }
+ }
+ }
+ fflush(stdout);
+}
+
+// Sets the OS stack trace getter.
+//
+// Does nothing if the input and the current OS stack trace getter are
+// the same; otherwise, deletes the old getter and makes the input the
+// current getter.
+void UnitTestImpl::set_os_stack_trace_getter(
+ OsStackTraceGetterInterface* getter) {
+ if (os_stack_trace_getter_ != getter) {
+ delete os_stack_trace_getter_;
+ os_stack_trace_getter_ = getter;
+ }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() {
+ if (os_stack_trace_getter_ == NULL) {
+#ifdef GTEST_OS_STACK_TRACE_GETTER_
+ os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
+#else
+ os_stack_trace_getter_ = new OsStackTraceGetter;
+#endif // GTEST_OS_STACK_TRACE_GETTER_
+ }
+
+ return os_stack_trace_getter_;
+}
+
+// Returns the TestResult for the test that's currently running, or
+// the TestResult for the ad hoc test if no test is running.
+TestResult* UnitTestImpl::current_test_result() {
+ return current_test_info_ ?
+ &(current_test_info_->result_) : &ad_hoc_test_result_;
+}
+
+// Shuffles all test cases, and the tests within each test case,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+ // Shuffles the death test cases.
+ ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_);
+
+ // Shuffles the non-death test cases.
+ ShuffleRange(random(), last_death_test_case_ + 1,
+ static_cast<int>(test_cases_.size()), &test_case_indices_);
+
+ // Shuffles the tests inside each test case.
+ for (size_t i = 0; i < test_cases_.size(); i++) {
+ test_cases_[i]->ShuffleTests(random());
+ }
+}
+
+// Restores the test cases and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+ for (size_t i = 0; i < test_cases_.size(); i++) {
+ // Unshuffles the tests in each test case.
+ test_cases_[i]->UnshuffleTests();
+ // Resets the index of each test case.
+ test_case_indices_[i] = static_cast<int>(i);
+ }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag. The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
+ int skip_count) {
+ // We pass skip_count + 1 to skip this wrapper function in addition
+ // to what the user really wants to skip.
+ return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+ // This condition is always false so AlwaysTrue() never actually throws,
+ // but it makes the compiler think that it may throw.
+ if (IsTrue(false))
+ throw ClassUniqueToAlwaysTrue();
+#endif // GTEST_HAS_EXCEPTIONS
+ return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false. None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char* prefix, const char** pstr) {
+ const size_t prefix_len = strlen(prefix);
+ if (strncmp(*pstr, prefix, prefix_len) == 0) {
+ *pstr += prefix_len;
+ return true;
+ }
+ return false;
+}
+
+// Parses a string as a command line flag. The string should have
+// the format "--flag=value". When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
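+//
+// For example, assuming the default GTEST_FLAG_PREFIX_ of "gtest_",
+// ParseFlagValue("--gtest_filter=Foo.*", "filter", false) returns a pointer
+// to "Foo.*", while ParseFlagValue("--gtest_filter", "filter", false) returns
+// NULL because the "=value" part is required when def_optional is false.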
+const char* ParseFlagValue(const char* str,
+ const char* flag,
+ bool def_optional) {
+ // str and flag must not be NULL.
+ if (str == NULL || flag == NULL) return NULL;
+
+ // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
+ const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+ const size_t flag_len = flag_str.length();
+ if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL;
+
+ // Skips the flag name.
+ const char* flag_end = str + flag_len;
+
+ // When def_optional is true, it's OK to not have a "=value" part.
+ if (def_optional && (flag_end[0] == '\0')) {
+ return flag_end;
+ }
+
+ // If def_optional is true and there are more characters after the
+ // flag name, or if def_optional is false, there must be a '=' after
+ // the flag name.
+ if (flag_end[0] != '=') return NULL;
+
+ // Returns the string after "=".
+ return flag_end + 1;
+}
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
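+//
+// For example, assuming the default GTEST_FLAG_PREFIX_ of "gtest_", both
+// "--gtest_shuffle" and "--gtest_shuffle=1" set *value to true, while
+// "--gtest_shuffle=0" sets it to false.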
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+ // Gets the value of the flag as a string.
+ const char* const value_str = ParseFlagValue(str, flag, true);
+
+ // Aborts if the parsing failed.
+ if (value_str == NULL) return false;
+
+ // Converts the string value to a bool.
+ *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+ return true;
+}
+
+// Parses a string for an Int32 flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
+ // Gets the value of the flag as a string.
+ const char* const value_str = ParseFlagValue(str, flag, false);
+
+ // Aborts if the parsing failed.
+ if (value_str == NULL) return false;
+
+ // Sets *value to the value of the flag.
+ return ParseInt32(Message() << "The value of flag --" << flag,
+ value_str, value);
+}
+
+// Parses a string for a string flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+ // Gets the value of the flag as a string.
+ const char* const value_str = ParseFlagValue(str, flag, false);
+
+ // Aborts if the parsing failed.
+ if (value_str == NULL) return false;
+
+ // Sets *value to the value of the flag.
+ *value = value_str;
+ return true;
+}
+
+// Determines whether a string has a prefix that Google Test uses for its
+// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message. Flags starting with
+// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags and do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char* str) {
+ return (SkipPrefix("--", &str) ||
+ SkipPrefix("-", &str) ||
+ SkipPrefix("/", &str)) &&
+ !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
+ (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+ SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
+}
+
+// Prints a string containing code-encoded text. The following escape
+// sequences can be used in the string to control the text color:
+//
+// @@ prints a single '@' character.
+// @R changes the color to red.
+// @G changes the color to green.
+// @Y changes the color to yellow.
+// @D changes to the default terminal text color.
+//
+// TODO(wan@google.com): Write tests for this once we add stdout
+// capturing to Google Test.
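+//
+// For example, PrintColorEncoded("@GOK@D - 3 tests ran\n") prints "OK" in
+// green followed by " - 3 tests ran" in the default terminal color.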
+static void PrintColorEncoded(const char* str) {
+ GTestColor color = COLOR_DEFAULT; // The current color.
+
+ // Conceptually, we split the string into segments divided by escape
+ // sequences. Then we print one segment at a time. At the end of
+ // each iteration, the str pointer advances to the beginning of the
+ // next segment.
+ for (;;) {
+ const char* p = strchr(str, '@');
+ if (p == NULL) {
+ ColoredPrintf(color, "%s", str);
+ return;
+ }
+
+ ColoredPrintf(color, "%s", std::string(str, p).c_str());
+
+ const char ch = p[1];
+ str = p + 2;
+ if (ch == '@') {
+ ColoredPrintf(color, "@");
+ } else if (ch == 'D') {
+ color = COLOR_DEFAULT;
+ } else if (ch == 'R') {
+ color = COLOR_RED;
+ } else if (ch == 'G') {
+ color = COLOR_GREEN;
+ } else if (ch == 'Y') {
+ color = COLOR_YELLOW;
+ } else {
+ --str;
+ }
+ }
+}
+
+static const char kColorEncodedHelpMessage[] =
+"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
+"following command line flags to control its behavior:\n"
+"\n"
+"Test Selection:\n"
+" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
+" List the names of all tests instead of running them. The name of\n"
+" TEST(Foo, Bar) is \"Foo.Bar\".\n"
+" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
+ "[@G-@YNEGATIVE_PATTERNS]@D\n"
+" Run only the tests whose name matches one of the positive patterns but\n"
+" none of the negative patterns. '?' matches any single character; '*'\n"
+" matches any substring; ':' separates two patterns.\n"
+" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
+" Run all disabled tests too.\n"
+"\n"
+"Test Execution:\n"
+" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
+" Run the tests repeatedly; use a negative count to repeat forever.\n"
+" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
+" Randomize tests' orders on every iteration.\n"
+" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
+" Random number seed to use for shuffling test orders (between 1 and\n"
+" 99999, or 0 to use a seed based on the current time).\n"
+"\n"
+"Test Output:\n"
+" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+" Enable/disable colored output. The default is @Gauto@D.\n"
+" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
+" Don't print the elapsed time of each test.\n"
+" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
+ GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
+" Generate an XML report in the given directory or with the given file\n"
+" name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
+" Stream test results to the given server.\n"
+#endif // GTEST_CAN_STREAM_RESULTS_
+"\n"
+"Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+" Set the default death test style.\n"
+#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
+" Turn assertion failures into debugger break-points.\n"
+" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
+" Turn assertion failures into C++ exceptions.\n"
+" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
+" Do not report exceptions as test failures. Instead, allow them\n"
+" to crash the program or throw a pop-up (on Windows).\n"
+"\n"
+"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+ "the corresponding\n"
+"environment variable of a flag (all letters in upper-case). For example, to\n"
+"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+ "color=no@D or set\n"
+"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
+"\n"
+"For more information, please read the " GTEST_NAME_ " documentation at\n"
+"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+"(not one in your own code or tests), please report it to\n"
+"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+bool ParseGoogleTestFlag(const char* const arg) {
+ return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+ &GTEST_FLAG(also_run_disabled_tests)) ||
+ ParseBoolFlag(arg, kBreakOnFailureFlag,
+ &GTEST_FLAG(break_on_failure)) ||
+ ParseBoolFlag(arg, kCatchExceptionsFlag,
+ &GTEST_FLAG(catch_exceptions)) ||
+ ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+ ParseStringFlag(arg, kDeathTestStyleFlag,
+ &GTEST_FLAG(death_test_style)) ||
+ ParseBoolFlag(arg, kDeathTestUseFork,
+ &GTEST_FLAG(death_test_use_fork)) ||
+ ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+ ParseStringFlag(arg, kInternalRunDeathTestFlag,
+ &GTEST_FLAG(internal_run_death_test)) ||
+ ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+ ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+ ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+ ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+ ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+ ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+ ParseInt32Flag(arg, kStackTraceDepthFlag,
+ &GTEST_FLAG(stack_trace_depth)) ||
+ ParseStringFlag(arg, kStreamResultToFlag,
+ &GTEST_FLAG(stream_result_to)) ||
+ ParseBoolFlag(arg, kThrowOnFailureFlag,
+ &GTEST_FLAG(throw_on_failure));
+}
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+void LoadFlagsFromFile(const std::string& path) {
+ FILE* flagfile = posix::FOpen(path.c_str(), "r");
+ if (!flagfile) {
+ fprintf(stderr,
+ "Unable to open file \"%s\"\n",
+ GTEST_FLAG(flagfile).c_str());
+ fflush(stderr);
+ exit(EXIT_FAILURE);
+ }
+ std::string contents(ReadEntireFile(flagfile));
+ posix::FClose(flagfile);
+ std::vector<std::string> lines;
+ SplitString(contents, '\n', &lines);
+ for (size_t i = 0; i < lines.size(); ++i) {
+ if (lines[i].empty())
+ continue;
+ if (!ParseGoogleTestFlag(lines[i].c_str()))
+ g_help_flag = true;
+ }
+}
+#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test. The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+ for (int i = 1; i < *argc; i++) {
+ const std::string arg_string = StreamableToString(argv[i]);
+ const char* const arg = arg_string.c_str();
+
+ using internal::ParseBoolFlag;
+ using internal::ParseInt32Flag;
+ using internal::ParseStringFlag;
+
+ bool remove_flag = false;
+ if (ParseGoogleTestFlag(arg)) {
+ remove_flag = true;
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+ } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
+ LoadFlagsFromFile(GTEST_FLAG(flagfile));
+ remove_flag = true;
+#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+ } else if (arg_string == "--help" || arg_string == "-h" ||
+ arg_string == "-?" || arg_string == "/?" ||
+ HasGoogleTestFlagPrefix(arg)) {
+ // Both the help flag and unrecognized Google Test flags (excluding
+ // internal ones) trigger the help display.
+ g_help_flag = true;
+ }
+
+ if (remove_flag) {
+ // Shift the remainder of the argv list left by one. Note
+ // that argv has (*argc + 1) elements, the last one always being
+ // NULL. The following loop moves the trailing NULL element as
+ // well.
+ for (int j = i; j != *argc; j++) {
+ argv[j] = argv[j + 1];
+ }
+
+ // Decrements the argument count.
+ (*argc)--;
+
+ // We also need to decrement the iterator as we just removed
+ // an element.
+ i--;
+ }
+ }
+
+ if (g_help_flag) {
+ // We print the help here instead of in RUN_ALL_TESTS(), as the
+ // latter may not be called at all if the user is using Google
+ // Test with another testing framework.
+ PrintColorEncoded(kColorEncodedHelpMessage);
+ }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+ ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
+ ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int* argc, CharType** argv) {
+ // We don't want to run the initialization code twice.
+ if (GTestIsInitialized()) return;
+
+ if (*argc <= 0) return;
+
+ g_argvs.clear();
+ for (int i = 0; i != *argc; i++) {
+ g_argvs.push_back(StreamableToString(argv[i]));
+ }
+
+ ParseGoogleTestFlagsOnly(argc, argv);
+ GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+} // namespace internal
+
+// Initializes Google Test. This must be called before calling
+// RUN_ALL_TESTS(). In particular, it parses a command line for the
+// flags that Google Test recognizes. Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned. Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int* argc, char** argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+ GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+ internal::InitGoogleTestImpl(argc, argv);
+#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int* argc, wchar_t** argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+ GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+ internal::InitGoogleTestImpl(argc, argv);
+#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+} // namespace testing
diff --git a/third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc b/third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc
new file mode 100644
index 000000000..f30282255
--- /dev/null
+++ b/third_party/aom/third_party/googletest/src/googletest/src/gtest_main.cc
@@ -0,0 +1,38 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+GTEST_API_ int main(int argc, char **argv) {
+ printf("Running main() from gtest_main.cc\n");
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/third_party/aom/third_party/libwebm/AUTHORS.TXT b/third_party/aom/third_party/libwebm/AUTHORS.TXT
new file mode 100644
index 000000000..9686ac13e
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/AUTHORS.TXT
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/third_party/aom/third_party/libwebm/Android.mk b/third_party/aom/third_party/libwebm/Android.mk
new file mode 100644
index 000000000..b46ba101d
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/Android.mk
@@ -0,0 +1,17 @@
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE:= libwebm
+LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11
+LOCAL_C_INCLUDES:= $(LOCAL_PATH)
+LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
+
+LOCAL_SRC_FILES:= common/file_util.cc \
+ common/hdr_util.cc \
+ mkvparser/mkvparser.cc \
+ mkvparser/mkvreader.cc \
+ mkvmuxer/mkvmuxer.cc \
+ mkvmuxer/mkvmuxerutil.cc \
+ mkvmuxer/mkvwriter.cc
+include $(BUILD_STATIC_LIBRARY)
diff --git a/third_party/aom/third_party/libwebm/LICENSE.TXT b/third_party/aom/third_party/libwebm/LICENSE.TXT
new file mode 100644
index 000000000..7a6f99547
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/LICENSE.TXT
@@ -0,0 +1,30 @@
+Copyright (c) 2010, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ * Neither the name of Google nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/third_party/aom/third_party/libwebm/PATENTS.TXT b/third_party/aom/third_party/libwebm/PATENTS.TXT
new file mode 100644
index 000000000..caedf607e
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/PATENTS.TXT
@@ -0,0 +1,23 @@
+Additional IP Rights Grant (Patents)
+------------------------------------
+
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/third_party/aom/third_party/libwebm/README.libaom b/third_party/aom/third_party/libwebm/README.libaom
new file mode 100644
index 000000000..bd288d201
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/README.libaom
@@ -0,0 +1,22 @@
+URL: https://chromium.googlesource.com/webm/libwebm
+Version: af81f26025b7435fa9a14ad07c58b44cf9280430
+License: BSD
+License File: LICENSE.txt
+
+Description:
+libwebm is used to handle WebM container I/O.
+
+Local Changes:
+Add av1 codec as an eligible codec for webm:
+ https://aomedia-review.googlesource.com/c/aom/+/15103
+Only keep:
+ - Android.mk
+ - AUTHORS.TXT
+ - common/
+ file_util.cc/h
+ hdr_util.cc/h
+ webmids.h
+ - LICENSE.TXT
+ - mkvmuxer/
+ - mkvparser/
+ - PATENTS.TXT
diff --git a/third_party/aom/third_party/libwebm/common/file_util.cc b/third_party/aom/third_party/libwebm/common/file_util.cc
new file mode 100644
index 000000000..618ffc087
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/file_util.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "common/file_util.h"
+
+#include <sys/stat.h>
+#ifndef _MSC_VER
+#include <unistd.h> // close()
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <ios>
+#include <string>
+
+namespace libwebm {
+
+std::string GetTempFileName() {
+#if !defined _MSC_VER && !defined __MINGW32__
+ std::string temp_file_name_template_str =
+ std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") :
+ ".") +
+ "/libwebm_temp.XXXXXX";
+ char* temp_file_name_template =
+ new char[temp_file_name_template_str.length() + 1];
+ memset(temp_file_name_template, 0, temp_file_name_template_str.length() + 1);
+ temp_file_name_template_str.copy(temp_file_name_template,
+ temp_file_name_template_str.length(), 0);
+ int fd = mkstemp(temp_file_name_template);
+ std::string temp_file_name =
+ (fd != -1) ? std::string(temp_file_name_template) : std::string();
+ delete[] temp_file_name_template;
+ if (fd != -1) {
+ close(fd);
+ }
+ return temp_file_name;
+#else
+ char tmp_file_name[_MAX_PATH];
+#if defined _MSC_VER || defined MINGW_HAS_SECURE_API
+ errno_t err = tmpnam_s(tmp_file_name);
+#else
+ char* fname_pointer = tmpnam(tmp_file_name);
+ errno_t err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1;
+#endif
+ if (err == 0) {
+ return std::string(tmp_file_name);
+ }
+ return std::string();
+#endif
+}
+
+uint64_t GetFileSize(const std::string& file_name) {
+ uint64_t file_size = 0;
+#ifndef _MSC_VER
+ struct stat st;
+ st.st_size = 0;
+ if (stat(file_name.c_str(), &st) == 0) {
+#else
+ struct _stat st;
+ st.st_size = 0;
+ if (_stat(file_name.c_str(), &st) == 0) {
+#endif
+ file_size = st.st_size;
+ }
+ return file_size;
+}
+
+bool GetFileContents(const std::string& file_name, std::string* contents) {
+ std::ifstream file(file_name.c_str());
+ *contents = std::string(static_cast<size_t>(GetFileSize(file_name)), 0);
+ if (file.good() && contents->size()) {
+ file.read(&(*contents)[0], contents->size());
+ }
+ return !file.fail();
+}
+
+TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); }
+
+TempFileDeleter::~TempFileDeleter() {
+ std::ifstream file(file_name_.c_str());
+ if (file.good()) {
+ file.close();
+ std::remove(file_name_.c_str());
+ }
+}
+
+} // namespace libwebm
diff --git a/third_party/aom/third_party/libwebm/common/file_util.h b/third_party/aom/third_party/libwebm/common/file_util.h
new file mode 100644
index 000000000..a87373464
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/file_util.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef LIBWEBM_COMMON_FILE_UTIL_H_
+#define LIBWEBM_COMMON_FILE_UTIL_H_
+
+#include <stdint.h>
+
+#include <string>
+
+#include "mkvmuxer/mkvmuxertypes.h" // LIBWEBM_DISALLOW_COPY_AND_ASSIGN()
+
+namespace libwebm {
+
+// Returns a temporary file name.
+std::string GetTempFileName();
+
+// Returns size of file specified by |file_name|, or 0 upon failure.
+uint64_t GetFileSize(const std::string& file_name);
+
+// Gets the contents of |file_name| as a string. Returns false on error.
+bool GetFileContents(const std::string& file_name, std::string* contents);
+
+// Manages the life of the temporary file specified at construction time and
+// deletes the file upon destruction.
+class TempFileDeleter {
+ public:
+ TempFileDeleter();
+ explicit TempFileDeleter(std::string file_name) : file_name_(file_name) {}
+ ~TempFileDeleter();
+ const std::string& name() const { return file_name_; }
+
+ private:
+ std::string file_name_;
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TempFileDeleter);
+};
+
+} // namespace libwebm
+
+#endif // LIBWEBM_COMMON_FILE_UTIL_H_
diff --git a/third_party/aom/third_party/libwebm/common/hdr_util.cc b/third_party/aom/third_party/libwebm/common/hdr_util.cc
new file mode 100644
index 000000000..916f7170b
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/hdr_util.cc
@@ -0,0 +1,220 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "hdr_util.h"
+
+#include <climits>
+#include <cstddef>
+#include <new>
+
+#include "mkvparser/mkvparser.h"
+
+namespace libwebm {
+const int Vp9CodecFeatures::kValueNotPresent = INT_MAX;
+
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+ PrimaryChromaticityPtr* muxer_pc) {
+ muxer_pc->reset(new (std::nothrow)
+ mkvmuxer::PrimaryChromaticity(parser_pc.x, parser_pc.y));
+ if (!muxer_pc->get())
+ return false;
+ return true;
+}
+
+bool MasteringMetadataValuePresent(double value) {
+ return value != mkvparser::MasteringMetadata::kValueNotPresent;
+}
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+ mkvmuxer::MasteringMetadata* muxer_mm) {
+ if (MasteringMetadataValuePresent(parser_mm.luminance_max))
+ muxer_mm->set_luminance_max(parser_mm.luminance_max);
+ if (MasteringMetadataValuePresent(parser_mm.luminance_min))
+ muxer_mm->set_luminance_min(parser_mm.luminance_min);
+
+ PrimaryChromaticityPtr r_ptr(nullptr);
+ PrimaryChromaticityPtr g_ptr(nullptr);
+ PrimaryChromaticityPtr b_ptr(nullptr);
+ PrimaryChromaticityPtr wp_ptr(nullptr);
+
+ if (parser_mm.r) {
+ if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr))
+ return false;
+ }
+ if (parser_mm.g) {
+ if (!CopyPrimaryChromaticity(*parser_mm.g, &g_ptr))
+ return false;
+ }
+ if (parser_mm.b) {
+ if (!CopyPrimaryChromaticity(*parser_mm.b, &b_ptr))
+ return false;
+ }
+ if (parser_mm.white_point) {
+ if (!CopyPrimaryChromaticity(*parser_mm.white_point, &wp_ptr))
+ return false;
+ }
+
+ if (!muxer_mm->SetChromaticity(r_ptr.get(), g_ptr.get(), b_ptr.get(),
+ wp_ptr.get())) {
+ return false;
+ }
+
+ return true;
+}
+
+bool ColourValuePresent(long long value) {
+ return value != mkvparser::Colour::kValueNotPresent;
+}
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+ mkvmuxer::Colour* muxer_colour) {
+ if (!muxer_colour)
+ return false;
+
+ if (ColourValuePresent(parser_colour.matrix_coefficients))
+ muxer_colour->set_matrix_coefficients(parser_colour.matrix_coefficients);
+ if (ColourValuePresent(parser_colour.bits_per_channel))
+ muxer_colour->set_bits_per_channel(parser_colour.bits_per_channel);
+ if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) {
+ muxer_colour->set_chroma_subsampling_horz(
+ parser_colour.chroma_subsampling_horz);
+ }
+ if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) {
+ muxer_colour->set_chroma_subsampling_vert(
+ parser_colour.chroma_subsampling_vert);
+ }
+ if (ColourValuePresent(parser_colour.cb_subsampling_horz))
+ muxer_colour->set_cb_subsampling_horz(parser_colour.cb_subsampling_horz);
+ if (ColourValuePresent(parser_colour.cb_subsampling_vert))
+ muxer_colour->set_cb_subsampling_vert(parser_colour.cb_subsampling_vert);
+ if (ColourValuePresent(parser_colour.chroma_siting_horz))
+ muxer_colour->set_chroma_siting_horz(parser_colour.chroma_siting_horz);
+ if (ColourValuePresent(parser_colour.chroma_siting_vert))
+ muxer_colour->set_chroma_siting_vert(parser_colour.chroma_siting_vert);
+ if (ColourValuePresent(parser_colour.range))
+ muxer_colour->set_range(parser_colour.range);
+ if (ColourValuePresent(parser_colour.transfer_characteristics)) {
+ muxer_colour->set_transfer_characteristics(
+ parser_colour.transfer_characteristics);
+ }
+ if (ColourValuePresent(parser_colour.primaries))
+ muxer_colour->set_primaries(parser_colour.primaries);
+ if (ColourValuePresent(parser_colour.max_cll))
+ muxer_colour->set_max_cll(parser_colour.max_cll);
+ if (ColourValuePresent(parser_colour.max_fall))
+ muxer_colour->set_max_fall(parser_colour.max_fall);
+
+ if (parser_colour.mastering_metadata) {
+ mkvmuxer::MasteringMetadata muxer_mm;
+ if (!CopyMasteringMetadata(*parser_colour.mastering_metadata, &muxer_mm))
+ return false;
+ if (!muxer_colour->SetMasteringMetadata(muxer_mm))
+ return false;
+ }
+ return true;
+}
+
+// Format of VPx private data:
+//
+// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+// | ID Byte | Length | |
+// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+// | |
+// : Bytes 1..Length of Codec Feature :
+// | |
+// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//
+// ID Byte Format
+// ID byte is an unsigned byte.
+// 0 1 2 3 4 5 6 7
+// +-+-+-+-+-+-+-+-+
+// |X| ID |
+// +-+-+-+-+-+-+-+-+
+//
+// The X bit is reserved.
+//
+// See the following link for more information:
+// http://www.webmproject.org/vp9/profiles/
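+//
+// For example, the illustrative private data {0x01, 0x01, 0x00, 0x03, 0x01,
+// 0x0A} yields profile = 0 and bit_depth = 10, leaving level and
+// chroma_subsampling set to kValueNotPresent.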
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+ Vp9CodecFeatures* features) {
+ const int kVpxCodecPrivateMinLength = 3;
+ if (!private_data || !features || length < kVpxCodecPrivateMinLength)
+ return false;
+
+ const uint8_t kVp9ProfileId = 1;
+ const uint8_t kVp9LevelId = 2;
+ const uint8_t kVp9BitDepthId = 3;
+ const uint8_t kVp9ChromaSubsamplingId = 4;
+ const int kVpxFeatureLength = 1;
+ int offset = 0;
+
+ // Set features to not set.
+ features->profile = Vp9CodecFeatures::kValueNotPresent;
+ features->level = Vp9CodecFeatures::kValueNotPresent;
+ features->bit_depth = Vp9CodecFeatures::kValueNotPresent;
+ features->chroma_subsampling = Vp9CodecFeatures::kValueNotPresent;
+ do {
+ const uint8_t id_byte = private_data[offset++];
+ const uint8_t length_byte = private_data[offset++];
+ if (length_byte != kVpxFeatureLength)
+ return false;
+ if (id_byte == kVp9ProfileId) {
+ const int priv_profile = static_cast<int>(private_data[offset++]);
+ if (priv_profile < 0 || priv_profile > 3)
+ return false;
+ if (features->profile != Vp9CodecFeatures::kValueNotPresent &&
+ features->profile != priv_profile) {
+ return false;
+ }
+ features->profile = priv_profile;
+ } else if (id_byte == kVp9LevelId) {
+ const int priv_level = static_cast<int>(private_data[offset++]);
+
+ const int kNumLevels = 14;
+ const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40,
+ 41, 50, 51, 52, 60, 61, 62};
+
+ for (int i = 0; i < kNumLevels; ++i) {
+ if (priv_level == levels[i]) {
+ if (features->level != Vp9CodecFeatures::kValueNotPresent &&
+ features->level != priv_level) {
+ return false;
+ }
+ features->level = priv_level;
+ break;
+ }
+ }
+ if (features->level == Vp9CodecFeatures::kValueNotPresent)
+ return false;
+ } else if (id_byte == kVp9BitDepthId) {
+ const int priv_profile = static_cast<int>(private_data[offset++]);
+ if (priv_profile != 8 && priv_profile != 10 && priv_profile != 12)
+ return false;
+ if (features->bit_depth != Vp9CodecFeatures::kValueNotPresent &&
+ features->bit_depth != priv_profile) {
+ return false;
+ }
+ features->bit_depth = priv_profile;
+ } else if (id_byte == kVp9ChromaSubsamplingId) {
+ const int priv_profile = static_cast<int>(private_data[offset++]);
+ if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3)
+ return false;
+ if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent &&
+ features->chroma_subsampling != priv_profile) {
+ return false;
+ }
+ features->chroma_subsampling = priv_profile;
+ } else {
+ // Invalid ID.
+ return false;
+ }
+ } while (offset + kVpxCodecPrivateMinLength <= length);
+
+ return true;
+}
+} // namespace libwebm
diff --git a/third_party/aom/third_party/libwebm/common/hdr_util.h b/third_party/aom/third_party/libwebm/common/hdr_util.h
new file mode 100644
index 000000000..78e2eeb70
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/hdr_util.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef LIBWEBM_COMMON_HDR_UTIL_H_
+#define LIBWEBM_COMMON_HDR_UTIL_H_
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "mkvmuxer/mkvmuxer.h"
+
+namespace mkvparser {
+struct Colour;
+struct MasteringMetadata;
+struct PrimaryChromaticity;
+} // namespace mkvparser
+
+namespace libwebm {
+// Utility types and functions for working with the Colour element and its
+// children. Copiers return true upon success. Presence functions return true
+// when the specified element is present.
+
+// TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is
+// required by libwebm.
+
+// Features of the VP9 codec that may be set in the CodecPrivate of a VP9 video
+// stream. A value of kValueNotPresent represents that the value was not set in
+// the CodecPrivate.
+struct Vp9CodecFeatures {
+ static const int kValueNotPresent;
+
+ Vp9CodecFeatures()
+ : profile(kValueNotPresent),
+ level(kValueNotPresent),
+ bit_depth(kValueNotPresent),
+ chroma_subsampling(kValueNotPresent) {}
+ ~Vp9CodecFeatures() {}
+
+ int profile;
+ int level;
+ int bit_depth;
+ int chroma_subsampling;
+};
+
+typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+ PrimaryChromaticityPtr* muxer_pc);
+
+bool MasteringMetadataValuePresent(double value);
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+ mkvmuxer::MasteringMetadata* muxer_mm);
+
+bool ColourValuePresent(long long value);
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+ mkvmuxer::Colour* muxer_colour);
+
+// Returns true if |features| is set to one or more valid values.
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+ Vp9CodecFeatures* features);
+
+} // namespace libwebm
+
+#endif // LIBWEBM_COMMON_HDR_UTIL_H_
diff --git a/third_party/aom/third_party/libwebm/common/webmids.h b/third_party/aom/third_party/libwebm/common/webmids.h
new file mode 100644
index 000000000..89d722a71
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/common/webmids.h
@@ -0,0 +1,192 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef COMMON_WEBMIDS_H_
+#define COMMON_WEBMIDS_H_
+
+namespace libwebm {
+
+enum MkvId {
+ kMkvEBML = 0x1A45DFA3,
+ kMkvEBMLVersion = 0x4286,
+ kMkvEBMLReadVersion = 0x42F7,
+ kMkvEBMLMaxIDLength = 0x42F2,
+ kMkvEBMLMaxSizeLength = 0x42F3,
+ kMkvDocType = 0x4282,
+ kMkvDocTypeVersion = 0x4287,
+ kMkvDocTypeReadVersion = 0x4285,
+ kMkvVoid = 0xEC,
+ kMkvSignatureSlot = 0x1B538667,
+ kMkvSignatureAlgo = 0x7E8A,
+ kMkvSignatureHash = 0x7E9A,
+ kMkvSignaturePublicKey = 0x7EA5,
+ kMkvSignature = 0x7EB5,
+ kMkvSignatureElements = 0x7E5B,
+ kMkvSignatureElementList = 0x7E7B,
+ kMkvSignedElement = 0x6532,
+ // segment
+ kMkvSegment = 0x18538067,
+ // Meta Seek Information
+ kMkvSeekHead = 0x114D9B74,
+ kMkvSeek = 0x4DBB,
+ kMkvSeekID = 0x53AB,
+ kMkvSeekPosition = 0x53AC,
+ // Segment Information
+ kMkvInfo = 0x1549A966,
+ kMkvTimecodeScale = 0x2AD7B1,
+ kMkvDuration = 0x4489,
+ kMkvDateUTC = 0x4461,
+ kMkvTitle = 0x7BA9,
+ kMkvMuxingApp = 0x4D80,
+ kMkvWritingApp = 0x5741,
+ // Cluster
+ kMkvCluster = 0x1F43B675,
+ kMkvTimecode = 0xE7,
+ kMkvPrevSize = 0xAB,
+ kMkvBlockGroup = 0xA0,
+ kMkvBlock = 0xA1,
+ kMkvBlockDuration = 0x9B,
+ kMkvReferenceBlock = 0xFB,
+ kMkvLaceNumber = 0xCC,
+ kMkvSimpleBlock = 0xA3,
+ kMkvBlockAdditions = 0x75A1,
+ kMkvBlockMore = 0xA6,
+ kMkvBlockAddID = 0xEE,
+ kMkvBlockAdditional = 0xA5,
+ kMkvDiscardPadding = 0x75A2,
+ // Track
+ kMkvTracks = 0x1654AE6B,
+ kMkvTrackEntry = 0xAE,
+ kMkvTrackNumber = 0xD7,
+ kMkvTrackUID = 0x73C5,
+ kMkvTrackType = 0x83,
+ kMkvFlagEnabled = 0xB9,
+ kMkvFlagDefault = 0x88,
+ kMkvFlagForced = 0x55AA,
+ kMkvFlagLacing = 0x9C,
+ kMkvDefaultDuration = 0x23E383,
+ kMkvMaxBlockAdditionID = 0x55EE,
+ kMkvName = 0x536E,
+ kMkvLanguage = 0x22B59C,
+ kMkvCodecID = 0x86,
+ kMkvCodecPrivate = 0x63A2,
+ kMkvCodecName = 0x258688,
+ kMkvCodecDelay = 0x56AA,
+ kMkvSeekPreRoll = 0x56BB,
+ // video
+ kMkvVideo = 0xE0,
+ kMkvFlagInterlaced = 0x9A,
+ kMkvStereoMode = 0x53B8,
+ kMkvAlphaMode = 0x53C0,
+ kMkvPixelWidth = 0xB0,
+ kMkvPixelHeight = 0xBA,
+ kMkvPixelCropBottom = 0x54AA,
+ kMkvPixelCropTop = 0x54BB,
+ kMkvPixelCropLeft = 0x54CC,
+ kMkvPixelCropRight = 0x54DD,
+ kMkvDisplayWidth = 0x54B0,
+ kMkvDisplayHeight = 0x54BA,
+ kMkvDisplayUnit = 0x54B2,
+ kMkvAspectRatioType = 0x54B3,
+ kMkvFrameRate = 0x2383E3,
+ // end video
+ // colour
+ kMkvColour = 0x55B0,
+ kMkvMatrixCoefficients = 0x55B1,
+ kMkvBitsPerChannel = 0x55B2,
+ kMkvChromaSubsamplingHorz = 0x55B3,
+ kMkvChromaSubsamplingVert = 0x55B4,
+ kMkvCbSubsamplingHorz = 0x55B5,
+ kMkvCbSubsamplingVert = 0x55B6,
+ kMkvChromaSitingHorz = 0x55B7,
+ kMkvChromaSitingVert = 0x55B8,
+ kMkvRange = 0x55B9,
+ kMkvTransferCharacteristics = 0x55BA,
+ kMkvPrimaries = 0x55BB,
+ kMkvMaxCLL = 0x55BC,
+ kMkvMaxFALL = 0x55BD,
+ // mastering metadata
+ kMkvMasteringMetadata = 0x55D0,
+ kMkvPrimaryRChromaticityX = 0x55D1,
+ kMkvPrimaryRChromaticityY = 0x55D2,
+ kMkvPrimaryGChromaticityX = 0x55D3,
+ kMkvPrimaryGChromaticityY = 0x55D4,
+ kMkvPrimaryBChromaticityX = 0x55D5,
+ kMkvPrimaryBChromaticityY = 0x55D6,
+ kMkvWhitePointChromaticityX = 0x55D7,
+ kMkvWhitePointChromaticityY = 0x55D8,
+ kMkvLuminanceMax = 0x55D9,
+ kMkvLuminanceMin = 0x55DA,
+ // end mastering metadata
+ // end colour
+ // projection
+ kMkvProjection = 0x7670,
+ kMkvProjectionType = 0x7671,
+ kMkvProjectionPrivate = 0x7672,
+ kMkvProjectionPoseYaw = 0x7673,
+ kMkvProjectionPosePitch = 0x7674,
+ kMkvProjectionPoseRoll = 0x7675,
+ // end projection
+ // audio
+ kMkvAudio = 0xE1,
+ kMkvSamplingFrequency = 0xB5,
+ kMkvOutputSamplingFrequency = 0x78B5,
+ kMkvChannels = 0x9F,
+ kMkvBitDepth = 0x6264,
+ // end audio
+ // ContentEncodings
+ kMkvContentEncodings = 0x6D80,
+ kMkvContentEncoding = 0x6240,
+ kMkvContentEncodingOrder = 0x5031,
+ kMkvContentEncodingScope = 0x5032,
+ kMkvContentEncodingType = 0x5033,
+ kMkvContentCompression = 0x5034,
+ kMkvContentCompAlgo = 0x4254,
+ kMkvContentCompSettings = 0x4255,
+ kMkvContentEncryption = 0x5035,
+ kMkvContentEncAlgo = 0x47E1,
+ kMkvContentEncKeyID = 0x47E2,
+ kMkvContentSignature = 0x47E3,
+ kMkvContentSigKeyID = 0x47E4,
+ kMkvContentSigAlgo = 0x47E5,
+ kMkvContentSigHashAlgo = 0x47E6,
+ kMkvContentEncAESSettings = 0x47E7,
+ kMkvAESSettingsCipherMode = 0x47E8,
+ kMkvAESSettingsCipherInitData = 0x47E9,
+ // end ContentEncodings
+ // Cueing Data
+ kMkvCues = 0x1C53BB6B,
+ kMkvCuePoint = 0xBB,
+ kMkvCueTime = 0xB3,
+ kMkvCueTrackPositions = 0xB7,
+ kMkvCueTrack = 0xF7,
+ kMkvCueClusterPosition = 0xF1,
+ kMkvCueBlockNumber = 0x5378,
+ // Chapters
+ kMkvChapters = 0x1043A770,
+ kMkvEditionEntry = 0x45B9,
+ kMkvChapterAtom = 0xB6,
+ kMkvChapterUID = 0x73C4,
+ kMkvChapterStringUID = 0x5654,
+ kMkvChapterTimeStart = 0x91,
+ kMkvChapterTimeEnd = 0x92,
+ kMkvChapterDisplay = 0x80,
+ kMkvChapString = 0x85,
+ kMkvChapLanguage = 0x437C,
+ kMkvChapCountry = 0x437E,
+ // Tags
+ kMkvTags = 0x1254C367,
+ kMkvTag = 0x7373,
+ kMkvSimpleTag = 0x67C8,
+ kMkvTagName = 0x45A3,
+ kMkvTagString = 0x4487
+};
+
+} // namespace libwebm
+
+#endif // COMMON_WEBMIDS_H_
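Each MkvId value is the element ID exactly as it appears in the EBML byte stream, length-descriptor bits included, so writing a value's bytes most-significant first reproduces the on-disk ID. A small illustration (sketch only, not part of the diff):

#include <cstdint>
#include <cstdio>

#include "common/webmids.h"

int main() {
  const uint32_t id = libwebm::kMkvSegment;  // 0x18538067
  for (int shift = 24; shift >= 0; shift -= 8)
    std::printf("%02X ", static_cast<unsigned>((id >> shift) & 0xFF));
  std::printf("\n");  // prints: 18 53 80 67
  return 0;
}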
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
new file mode 100644
index 000000000..bae2c99b8
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -0,0 +1,4194 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvmuxer.h"
+
+#include <stdint.h>
+
+#include <cfloat>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxerutil.h"
+#include "mkvmuxer/mkvwriter.h"
+#include "mkvparser/mkvparser.h"
+
+namespace mkvmuxer {
+
+const float PrimaryChromaticity::kChromaticityMin = 0.0f;
+const float PrimaryChromaticity::kChromaticityMax = 1.0f;
+const float MasteringMetadata::kMinLuminance = 0.0f;
+const float MasteringMetadata::kMinLuminanceMax = 999.99f;
+const float MasteringMetadata::kMaxLuminanceMax = 9999.99f;
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const uint64_t Colour::kValueNotPresent = UINT64_MAX;
+
+namespace {
+
+const char kDocTypeWebm[] = "webm";
+const char kDocTypeMatroska[] = "matroska";
+
+// Deallocate the string designated by |dst|, and then copy the |src|
+// string to |dst|. The caller owns both the |src| string and the
+// |dst| copy (hence the caller is responsible for eventually
+// deallocating the strings, either directly, or indirectly via
+// StrCpy). Returns true if the source string was successfully copied
+// to the destination.
+bool StrCpy(const char* src, char** dst_ptr) {
+ if (dst_ptr == NULL)
+ return false;
+
+ char*& dst = *dst_ptr;
+
+ delete[] dst;
+ dst = NULL;
+
+ if (src == NULL)
+ return true;
+
+ const size_t size = strlen(src) + 1;
+
+ dst = new (std::nothrow) char[size]; // NOLINT
+ if (dst == NULL)
+ return false;
+
+ strcpy(dst, src); // NOLINT
+ return true;
+}
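As the comment above spells out, each call releases any previous |dst| copy before installing a new one. An illustrative calling pattern (StrCpy lives in this file's anonymous namespace, so the snippet belongs to the same translation unit; it is a sketch, not part of the diff):

  char* label = NULL;
  if (StrCpy("A_OPUS", &label)) {
    // ... use |label| ...
  }
  StrCpy(NULL, &label);  // releases the copy and leaves |label| NULL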
+
+typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
+bool CopyChromaticity(const PrimaryChromaticity* src,
+ PrimaryChromaticityPtr* dst) {
+ if (!dst)
+ return false;
+
+ dst->reset(new (std::nothrow) PrimaryChromaticity(src->x(), src->y()));
+ if (!dst->get())
+ return false;
+
+ return true;
+}
+
+} // namespace
+
+///////////////////////////////////////////////////////////////
+//
+// IMkvWriter Class
+
+IMkvWriter::IMkvWriter() {}
+
+IMkvWriter::~IMkvWriter() {}
+
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version,
+ const char* const doc_type) {
+ // Level 0
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvEBMLVersion, static_cast<uint64>(1));
+ size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, static_cast<uint64>(1));
+ size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, static_cast<uint64>(4));
+ size +=
+ EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, static_cast<uint64>(8));
+ size += EbmlElementSize(libwebm::kMkvDocType, doc_type);
+ size += EbmlElementSize(libwebm::kMkvDocTypeVersion,
+ static_cast<uint64>(doc_type_version));
+ size +=
+ EbmlElementSize(libwebm::kMkvDocTypeReadVersion, static_cast<uint64>(2));
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvEBML, size))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion,
+ static_cast<uint64>(1))) {
+ return false;
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion,
+ static_cast<uint64>(1))) {
+ return false;
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength,
+ static_cast<uint64>(4))) {
+ return false;
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength,
+ static_cast<uint64>(8))) {
+ return false;
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvDocType, doc_type))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion,
+ static_cast<uint64>(doc_type_version))) {
+ return false;
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion,
+ static_cast<uint64>(2))) {
+ return false;
+ }
+
+ return true;
+}
+
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) {
+ return WriteEbmlHeader(writer, doc_type_version, kDocTypeWebm);
+}
+
+bool WriteEbmlHeader(IMkvWriter* writer) {
+ return WriteEbmlHeader(writer, mkvmuxer::Segment::kDefaultDocTypeVersion);
+}
+
+bool ChunkedCopy(mkvparser::IMkvReader* source, mkvmuxer::IMkvWriter* dst,
+ int64_t start, int64_t size) {
+ // TODO(vigneshv): Check if this is a reasonable value.
+ const uint32_t kBufSize = 2048;
+ uint8_t* buf = new uint8_t[kBufSize];
+ int64_t offset = start;
+ while (size > 0) {
+ const int64_t read_len = (size > kBufSize) ? kBufSize : size;
+    if (source->Read(offset, static_cast<long>(read_len), buf)) {
+      delete[] buf;  // don't leak the copy buffer on a failed read
+      return false;
+    }
+ dst->Write(buf, static_cast<uint32_t>(read_len));
+ offset += read_len;
+ size -= read_len;
+ }
+ delete[] buf;
+ return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Frame Class
+
+Frame::Frame()
+ : add_id_(0),
+ additional_(NULL),
+ additional_length_(0),
+ duration_(0),
+ duration_set_(false),
+ frame_(NULL),
+ is_key_(false),
+ length_(0),
+ track_number_(0),
+ timestamp_(0),
+ discard_padding_(0),
+ reference_block_timestamp_(0),
+ reference_block_timestamp_set_(false) {}
+
+Frame::~Frame() {
+ delete[] frame_;
+ delete[] additional_;
+}
+
+bool Frame::CopyFrom(const Frame& frame) {
+ delete[] frame_;
+ frame_ = NULL;
+ length_ = 0;
+ if (frame.length() > 0 && frame.frame() != NULL &&
+ !Init(frame.frame(), frame.length())) {
+ return false;
+ }
+ add_id_ = 0;
+ delete[] additional_;
+ additional_ = NULL;
+ additional_length_ = 0;
+ if (frame.additional_length() > 0 && frame.additional() != NULL &&
+ !AddAdditionalData(frame.additional(), frame.additional_length(),
+ frame.add_id())) {
+ return false;
+ }
+ duration_ = frame.duration();
+ duration_set_ = frame.duration_set();
+ is_key_ = frame.is_key();
+ track_number_ = frame.track_number();
+ timestamp_ = frame.timestamp();
+ discard_padding_ = frame.discard_padding();
+ reference_block_timestamp_ = frame.reference_block_timestamp();
+ reference_block_timestamp_set_ = frame.reference_block_timestamp_set();
+ return true;
+}
+
+bool Frame::Init(const uint8_t* frame, uint64_t length) {
+ uint8_t* const data =
+ new (std::nothrow) uint8_t[static_cast<size_t>(length)]; // NOLINT
+ if (!data)
+ return false;
+
+ delete[] frame_;
+ frame_ = data;
+ length_ = length;
+
+ memcpy(frame_, frame, static_cast<size_t>(length_));
+ return true;
+}
+
+bool Frame::AddAdditionalData(const uint8_t* additional, uint64_t length,
+ uint64_t add_id) {
+ uint8_t* const data =
+ new (std::nothrow) uint8_t[static_cast<size_t>(length)]; // NOLINT
+ if (!data)
+ return false;
+
+ delete[] additional_;
+ additional_ = data;
+ additional_length_ = length;
+ add_id_ = add_id;
+
+ memcpy(additional_, additional, static_cast<size_t>(additional_length_));
+ return true;
+}
+
+bool Frame::IsValid() const {
+ if (length_ == 0 || !frame_) {
+ return false;
+ }
+ if ((additional_length_ != 0 && !additional_) ||
+ (additional_ != NULL && additional_length_ == 0)) {
+ return false;
+ }
+ if (track_number_ == 0 || track_number_ > kMaxTrackNumber) {
+ return false;
+ }
+ if (!CanBeSimpleBlock() && !is_key_ && !reference_block_timestamp_set_) {
+ return false;
+ }
+ return true;
+}
+
+bool Frame::CanBeSimpleBlock() const {
+ return additional_ == NULL && discard_padding_ == 0 && duration_ == 0;
+}
+
+void Frame::set_duration(uint64_t duration) {
+ duration_ = duration;
+ duration_set_ = true;
+}
+
+void Frame::set_reference_block_timestamp(int64_t reference_block_timestamp) {
+ reference_block_timestamp_ = reference_block_timestamp;
+ reference_block_timestamp_set_ = true;
+}
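A minimal sketch of how a Frame is typically prepared for muxing. Init(), IsValid() and the key-frame logic appear above; set_track_number(), set_timestamp(), set_is_key() and Segment::AddGenericFrame() are assumed to be declared in mkvmuxer/mkvmuxer.h, which this hunk does not show:

#include <stdint.h>

#include "mkvmuxer/mkvmuxer.h"

bool AddOneKeyFrame(mkvmuxer::Segment* segment, const uint8_t* data,
                    uint64_t length) {
  mkvmuxer::Frame frame;
  if (!frame.Init(data, length))  // copies |data| into the frame
    return false;
  frame.set_track_number(1);  // must be in [1, kMaxTrackNumber]
  frame.set_timestamp(0);     // nanoseconds
  frame.set_is_key(true);     // key frames need no ReferenceBlock timestamp
  return frame.IsValid() && segment->AddGenericFrame(&frame);
}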
+
+///////////////////////////////////////////////////////////////
+//
+// CuePoint Class
+
+CuePoint::CuePoint()
+ : time_(0),
+ track_(0),
+ cluster_pos_(0),
+ block_number_(1),
+ output_block_number_(true) {}
+
+CuePoint::~CuePoint() {}
+
+bool CuePoint::Write(IMkvWriter* writer) const {
+ if (!writer || track_ < 1 || cluster_pos_ < 1)
+ return false;
+
+ uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition,
+ static_cast<uint64>(cluster_pos_));
+ size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
+ if (output_block_number_ && block_number_ > 1)
+ size += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+ static_cast<uint64>(block_number_));
+ const uint64_t track_pos_size =
+ EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
+ const uint64_t payload_size =
+ EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+ track_pos_size;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size))
+ return false;
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvCueTime,
+ static_cast<uint64>(time_))) {
+ return false;
+ }
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions, size))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack,
+ static_cast<uint64>(track_))) {
+ return false;
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition,
+ static_cast<uint64>(cluster_pos_))) {
+ return false;
+ }
+ if (output_block_number_ && block_number_ > 1) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber,
+ static_cast<uint64>(block_number_))) {
+ return false;
+ }
+ }
+
+ const int64_t stop_position = writer->Position();
+ if (stop_position < 0)
+ return false;
+
+ if (stop_position - payload_position != static_cast<int64_t>(payload_size))
+ return false;
+
+ return true;
+}
+
+uint64_t CuePoint::PayloadSize() const {
+ uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition,
+ static_cast<uint64>(cluster_pos_));
+ size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
+ if (output_block_number_ && block_number_ > 1)
+ size += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+ static_cast<uint64>(block_number_));
+ const uint64_t track_pos_size =
+ EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
+ const uint64_t payload_size =
+ EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+ track_pos_size;
+
+ return payload_size;
+}
+
+uint64_t CuePoint::Size() const {
+ const uint64_t payload_size = PayloadSize();
+ return EbmlMasterElementSize(libwebm::kMkvCuePoint, payload_size) +
+ payload_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Cues Class
+
+Cues::Cues()
+ : cue_entries_capacity_(0),
+ cue_entries_size_(0),
+ cue_entries_(NULL),
+ output_block_number_(true) {}
+
+Cues::~Cues() {
+ if (cue_entries_) {
+ for (int32_t i = 0; i < cue_entries_size_; ++i) {
+ CuePoint* const cue = cue_entries_[i];
+ delete cue;
+ }
+ delete[] cue_entries_;
+ }
+}
+
+bool Cues::AddCue(CuePoint* cue) {
+ if (!cue)
+ return false;
+
+ if ((cue_entries_size_ + 1) > cue_entries_capacity_) {
+ // Add more CuePoints.
+ const int32_t new_capacity =
+ (!cue_entries_capacity_) ? 2 : cue_entries_capacity_ * 2;
+
+ if (new_capacity < 1)
+ return false;
+
+ CuePoint** const cues =
+ new (std::nothrow) CuePoint*[new_capacity]; // NOLINT
+ if (!cues)
+ return false;
+
+ for (int32_t i = 0; i < cue_entries_size_; ++i) {
+ cues[i] = cue_entries_[i];
+ }
+
+ delete[] cue_entries_;
+
+ cue_entries_ = cues;
+ cue_entries_capacity_ = new_capacity;
+ }
+
+ cue->set_output_block_number(output_block_number_);
+ cue_entries_[cue_entries_size_++] = cue;
+ return true;
+}
+
+CuePoint* Cues::GetCueByIndex(int32_t index) const {
+ if (cue_entries_ == NULL)
+ return NULL;
+
+ if (index >= cue_entries_size_)
+ return NULL;
+
+ return cue_entries_[index];
+}
+
+uint64_t Cues::Size() {
+ uint64_t size = 0;
+ for (int32_t i = 0; i < cue_entries_size_; ++i)
+ size += GetCueByIndex(i)->Size();
+ size += EbmlMasterElementSize(libwebm::kMkvCues, size);
+ return size;
+}
+
+bool Cues::Write(IMkvWriter* writer) const {
+ if (!writer)
+ return false;
+
+ uint64_t size = 0;
+ for (int32_t i = 0; i < cue_entries_size_; ++i) {
+ const CuePoint* const cue = GetCueByIndex(i);
+
+ if (!cue)
+ return false;
+
+ size += cue->Size();
+ }
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvCues, size))
+ return false;
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ for (int32_t i = 0; i < cue_entries_size_; ++i) {
+ const CuePoint* const cue = GetCueByIndex(i);
+
+ if (!cue->Write(writer))
+ return false;
+ }
+
+ const int64_t stop_position = writer->Position();
+ if (stop_position < 0)
+ return false;
+
+ if (stop_position - payload_position != static_cast<int64_t>(size))
+ return false;
+
+ return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncAESSettings Class
+
+ContentEncAESSettings::ContentEncAESSettings() : cipher_mode_(kCTR) {}
+
+uint64_t ContentEncAESSettings::Size() const {
+ const uint64_t payload = PayloadSize();
+ const uint64_t size =
+ EbmlMasterElementSize(libwebm::kMkvContentEncAESSettings, payload) +
+ payload;
+ return size;
+}
+
+bool ContentEncAESSettings::Write(IMkvWriter* writer) const {
+ const uint64_t payload = PayloadSize();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncAESSettings,
+ payload))
+ return false;
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode,
+ static_cast<uint64>(cipher_mode_))) {
+ return false;
+ }
+
+ const int64_t stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64_t>(payload))
+ return false;
+
+ return true;
+}
+
+uint64_t ContentEncAESSettings::PayloadSize() const {
+ uint64_t size = EbmlElementSize(libwebm::kMkvAESSettingsCipherMode,
+ static_cast<uint64>(cipher_mode_));
+ return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncoding Class
+
+ContentEncoding::ContentEncoding()
+ : enc_algo_(5),
+ enc_key_id_(NULL),
+ encoding_order_(0),
+ encoding_scope_(1),
+ encoding_type_(1),
+ enc_key_id_length_(0) {}
+
+ContentEncoding::~ContentEncoding() { delete[] enc_key_id_; }
+
+bool ContentEncoding::SetEncryptionID(const uint8_t* id, uint64_t length) {
+ if (!id || length < 1)
+ return false;
+
+ delete[] enc_key_id_;
+
+ enc_key_id_ =
+ new (std::nothrow) uint8_t[static_cast<size_t>(length)]; // NOLINT
+ if (!enc_key_id_)
+ return false;
+
+ memcpy(enc_key_id_, id, static_cast<size_t>(length));
+ enc_key_id_length_ = length;
+
+ return true;
+}
+
+uint64_t ContentEncoding::Size() const {
+ const uint64_t encryption_size = EncryptionSize();
+ const uint64_t encoding_size = EncodingSize(0, encryption_size);
+ const uint64_t encodings_size =
+ EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+ encoding_size;
+
+ return encodings_size;
+}
+
+bool ContentEncoding::Write(IMkvWriter* writer) const {
+ const uint64_t encryption_size = EncryptionSize();
+ const uint64_t encoding_size = EncodingSize(0, encryption_size);
+ const uint64_t size =
+ EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+ encoding_size;
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncoding,
+ encoding_size))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder,
+ static_cast<uint64>(encoding_order_)))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope,
+ static_cast<uint64>(encoding_scope_)))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType,
+ static_cast<uint64>(encoding_type_)))
+ return false;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption,
+ encryption_size))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo,
+ static_cast<uint64>(enc_algo_))) {
+ return false;
+ }
+ if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_,
+ enc_key_id_length_))
+ return false;
+
+ if (!enc_aes_settings_.Write(writer))
+ return false;
+
+ const int64_t stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64_t>(size))
+ return false;
+
+ return true;
+}
+
+uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size,
+ uint64_t encryption_size) const {
+ // TODO(fgalligan): Add support for compression settings.
+ if (compresion_size != 0)
+ return 0;
+
+ uint64_t encoding_size = 0;
+
+ if (encryption_size > 0) {
+ encoding_size +=
+ EbmlMasterElementSize(libwebm::kMkvContentEncryption, encryption_size) +
+ encryption_size;
+ }
+ encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingType,
+ static_cast<uint64>(encoding_type_));
+ encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingScope,
+ static_cast<uint64>(encoding_scope_));
+ encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingOrder,
+ static_cast<uint64>(encoding_order_));
+
+ return encoding_size;
+}
+
+uint64_t ContentEncoding::EncryptionSize() const {
+ const uint64_t aes_size = enc_aes_settings_.Size();
+
+ uint64_t encryption_size = EbmlElementSize(libwebm::kMkvContentEncKeyID,
+ enc_key_id_, enc_key_id_length_);
+ encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo,
+ static_cast<uint64>(enc_algo_));
+
+ return encryption_size + aes_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Track Class
+
+Track::Track(unsigned int* seed)
+ : codec_id_(NULL),
+ codec_private_(NULL),
+ language_(NULL),
+ max_block_additional_id_(0),
+ name_(NULL),
+ number_(0),
+ type_(0),
+ uid_(MakeUID(seed)),
+ codec_delay_(0),
+ seek_pre_roll_(0),
+ default_duration_(0),
+ codec_private_length_(0),
+ content_encoding_entries_(NULL),
+ content_encoding_entries_size_(0) {}
+
+Track::~Track() {
+ delete[] codec_id_;
+ delete[] codec_private_;
+ delete[] language_;
+ delete[] name_;
+
+ if (content_encoding_entries_) {
+ for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+ ContentEncoding* const encoding = content_encoding_entries_[i];
+ delete encoding;
+ }
+ delete[] content_encoding_entries_;
+ }
+}
+
+bool Track::AddContentEncoding() {
+ const uint32_t count = content_encoding_entries_size_ + 1;
+
+ ContentEncoding** const content_encoding_entries =
+ new (std::nothrow) ContentEncoding*[count]; // NOLINT
+ if (!content_encoding_entries)
+ return false;
+
+ ContentEncoding* const content_encoding =
+ new (std::nothrow) ContentEncoding(); // NOLINT
+ if (!content_encoding) {
+ delete[] content_encoding_entries;
+ return false;
+ }
+
+ for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+ content_encoding_entries[i] = content_encoding_entries_[i];
+ }
+
+ delete[] content_encoding_entries_;
+
+ content_encoding_entries_ = content_encoding_entries;
+ content_encoding_entries_[content_encoding_entries_size_] = content_encoding;
+ content_encoding_entries_size_ = count;
+ return true;
+}
+
+ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const {
+ if (content_encoding_entries_ == NULL)
+ return NULL;
+
+ if (index >= content_encoding_entries_size_)
+ return NULL;
+
+ return content_encoding_entries_[index];
+}
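A minimal sketch combining the calls above: give a track a single ContentEncoding entry and set a placeholder encryption key ID on it (the key bytes are illustrative, not part of the patch):

#include <stdint.h>

#include "mkvmuxer/mkvmuxer.h"

bool EnableEncryption(mkvmuxer::Track* track) {
  if (!track->AddContentEncoding())
    return false;
  mkvmuxer::ContentEncoding* const encoding =
      track->GetContentEncodingByIndex(0);
  if (encoding == NULL)
    return false;
  const uint8_t key_id[16] = {0};  // placeholder; use a real key ID in practice
  return encoding->SetEncryptionID(key_id, sizeof(key_id));
}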
+
+uint64_t Track::PayloadSize() const {
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+ size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+ size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
+ if (codec_id_)
+ size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
+ if (codec_private_)
+ size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
+ codec_private_length_);
+ if (language_)
+ size += EbmlElementSize(libwebm::kMkvLanguage, language_);
+ if (name_)
+ size += EbmlElementSize(libwebm::kMkvName, name_);
+ if (max_block_additional_id_) {
+ size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+ static_cast<uint64>(max_block_additional_id_));
+ }
+ if (codec_delay_) {
+ size += EbmlElementSize(libwebm::kMkvCodecDelay,
+ static_cast<uint64>(codec_delay_));
+ }
+ if (seek_pre_roll_) {
+ size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+ static_cast<uint64>(seek_pre_roll_));
+ }
+ if (default_duration_) {
+ size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+ static_cast<uint64>(default_duration_));
+ }
+
+ if (content_encoding_entries_size_ > 0) {
+ uint64_t content_encodings_size = 0;
+ for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+ ContentEncoding* const encoding = content_encoding_entries_[i];
+ content_encodings_size += encoding->Size();
+ }
+
+ size += EbmlMasterElementSize(libwebm::kMkvContentEncodings,
+ content_encodings_size) +
+ content_encodings_size;
+ }
+
+ return size;
+}
+
+uint64_t Track::Size() const {
+ uint64_t size = PayloadSize();
+ size += EbmlMasterElementSize(libwebm::kMkvTrackEntry, size);
+ return size;
+}
+
+bool Track::Write(IMkvWriter* writer) const {
+ if (!writer)
+ return false;
+
+ // mandatory elements without a default value.
+ if (!type_ || !codec_id_)
+ return false;
+
+ // |size| may be bigger than what is written out in this function because
+ // derived classes may write out more data in the Track element.
+ const uint64_t payload_size = PayloadSize();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size))
+ return false;
+
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+ size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+ size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
+ if (codec_id_)
+ size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
+ if (codec_private_)
+ size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
+ static_cast<uint64>(codec_private_length_));
+ if (language_)
+ size += EbmlElementSize(libwebm::kMkvLanguage, language_);
+ if (name_)
+ size += EbmlElementSize(libwebm::kMkvName, name_);
+ if (max_block_additional_id_)
+ size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+ static_cast<uint64>(max_block_additional_id_));
+ if (codec_delay_)
+ size += EbmlElementSize(libwebm::kMkvCodecDelay,
+ static_cast<uint64>(codec_delay_));
+ if (seek_pre_roll_)
+ size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+ static_cast<uint64>(seek_pre_roll_));
+ if (default_duration_)
+ size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+ static_cast<uint64>(default_duration_));
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber,
+ static_cast<uint64>(number_)))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID,
+ static_cast<uint64>(uid_)))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvTrackType,
+ static_cast<uint64>(type_)))
+ return false;
+ if (max_block_additional_id_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID,
+ static_cast<uint64>(max_block_additional_id_))) {
+ return false;
+ }
+ }
+ if (codec_delay_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay,
+ static_cast<uint64>(codec_delay_)))
+ return false;
+ }
+ if (seek_pre_roll_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll,
+ static_cast<uint64>(seek_pre_roll_)))
+ return false;
+ }
+ if (default_duration_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration,
+ static_cast<uint64>(default_duration_)))
+ return false;
+ }
+ if (codec_id_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvCodecID, codec_id_))
+ return false;
+ }
+ if (codec_private_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, codec_private_,
+ static_cast<uint64>(codec_private_length_)))
+ return false;
+ }
+ if (language_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvLanguage, language_))
+ return false;
+ }
+ if (name_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvName, name_))
+ return false;
+ }
+
+ int64_t stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64_t>(size))
+ return false;
+
+ if (content_encoding_entries_size_ > 0) {
+ uint64_t content_encodings_size = 0;
+ for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+ ContentEncoding* const encoding = content_encoding_entries_[i];
+ content_encodings_size += encoding->Size();
+ }
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncodings,
+ content_encodings_size))
+ return false;
+
+ for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+ ContentEncoding* const encoding = content_encoding_entries_[i];
+ if (!encoding->Write(writer))
+ return false;
+ }
+ }
+
+ stop_position = writer->Position();
+ if (stop_position < 0)
+ return false;
+ return true;
+}
+
+bool Track::SetCodecPrivate(const uint8_t* codec_private, uint64_t length) {
+ if (!codec_private || length < 1)
+ return false;
+
+ delete[] codec_private_;
+
+ codec_private_ =
+ new (std::nothrow) uint8_t[static_cast<size_t>(length)]; // NOLINT
+ if (!codec_private_)
+ return false;
+
+ memcpy(codec_private_, codec_private, static_cast<size_t>(length));
+ codec_private_length_ = length;
+
+ return true;
+}
+
+void Track::set_codec_id(const char* codec_id) {
+ if (codec_id) {
+ delete[] codec_id_;
+
+ const size_t length = strlen(codec_id) + 1;
+ codec_id_ = new (std::nothrow) char[length]; // NOLINT
+ if (codec_id_) {
+#ifdef _MSC_VER
+ strcpy_s(codec_id_, length, codec_id);
+#else
+ strcpy(codec_id_, codec_id);
+#endif
+ }
+ }
+}
+
+// TODO(fgalligan): Vet the language parameter.
+void Track::set_language(const char* language) {
+ if (language) {
+ delete[] language_;
+
+ const size_t length = strlen(language) + 1;
+ language_ = new (std::nothrow) char[length]; // NOLINT
+ if (language_) {
+#ifdef _MSC_VER
+ strcpy_s(language_, length, language);
+#else
+ strcpy(language_, language);
+#endif
+ }
+ }
+}
+
+void Track::set_name(const char* name) {
+ if (name) {
+ delete[] name_;
+
+ const size_t length = strlen(name) + 1;
+ name_ = new (std::nothrow) char[length]; // NOLINT
+ if (name_) {
+#ifdef _MSC_VER
+ strcpy_s(name_, length, name);
+#else
+ strcpy(name_, name);
+#endif
+ }
+ }
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Colour and its child elements
+
+uint64_t PrimaryChromaticity::PrimaryChromaticitySize(
+ libwebm::MkvId x_id, libwebm::MkvId y_id) const {
+ return EbmlElementSize(x_id, x_) + EbmlElementSize(y_id, y_);
+}
+
+bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id,
+ libwebm::MkvId y_id) const {
+ if (!Valid()) {
+ return false;
+ }
+ return WriteEbmlElement(writer, x_id, x_) &&
+ WriteEbmlElement(writer, y_id, y_);
+}
+
+bool PrimaryChromaticity::Valid() const {
+ return (x_ >= kChromaticityMin && x_ <= kChromaticityMax &&
+ y_ >= kChromaticityMin && y_ <= kChromaticityMax);
+}
+
+uint64_t MasteringMetadata::MasteringMetadataSize() const {
+ uint64_t size = PayloadSize();
+
+ if (size > 0)
+ size += EbmlMasterElementSize(libwebm::kMkvMasteringMetadata, size);
+
+ return size;
+}
+
+bool MasteringMetadata::Valid() const {
+ if (luminance_min_ != kValueNotPresent) {
+ if (luminance_min_ < kMinLuminance || luminance_min_ > kMinLuminanceMax ||
+ luminance_min_ > luminance_max_) {
+ return false;
+ }
+ }
+ if (luminance_max_ != kValueNotPresent) {
+ if (luminance_max_ < kMinLuminance || luminance_max_ > kMaxLuminanceMax ||
+ luminance_max_ < luminance_min_) {
+ return false;
+ }
+ }
+ if (r_ && !r_->Valid())
+ return false;
+ if (g_ && !g_->Valid())
+ return false;
+ if (b_ && !b_->Valid())
+ return false;
+ if (white_point_ && !white_point_->Valid())
+ return false;
+
+ return true;
+}
+
+bool MasteringMetadata::Write(IMkvWriter* writer) const {
+ const uint64_t size = PayloadSize();
+
+ // Don't write an empty element.
+ if (size == 0)
+ return true;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, size))
+ return false;
+ if (luminance_max_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max_)) {
+ return false;
+ }
+ if (luminance_min_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) {
+ return false;
+ }
+ if (r_ &&
+ !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
+ libwebm::kMkvPrimaryRChromaticityY)) {
+ return false;
+ }
+ if (g_ &&
+ !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
+ libwebm::kMkvPrimaryGChromaticityY)) {
+ return false;
+ }
+ if (b_ &&
+ !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
+ libwebm::kMkvPrimaryBChromaticityY)) {
+ return false;
+ }
+ if (white_point_ &&
+ !white_point_->Write(writer, libwebm::kMkvWhitePointChromaticityX,
+ libwebm::kMkvWhitePointChromaticityY)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool MasteringMetadata::SetChromaticity(
+ const PrimaryChromaticity* r, const PrimaryChromaticity* g,
+ const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) {
+ PrimaryChromaticityPtr r_ptr(nullptr);
+ if (r) {
+ if (!CopyChromaticity(r, &r_ptr))
+ return false;
+ }
+ PrimaryChromaticityPtr g_ptr(nullptr);
+ if (g) {
+ if (!CopyChromaticity(g, &g_ptr))
+ return false;
+ }
+ PrimaryChromaticityPtr b_ptr(nullptr);
+ if (b) {
+ if (!CopyChromaticity(b, &b_ptr))
+ return false;
+ }
+ PrimaryChromaticityPtr wp_ptr(nullptr);
+ if (white_point) {
+ if (!CopyChromaticity(white_point, &wp_ptr))
+ return false;
+ }
+
+ r_ = r_ptr.release();
+ g_ = g_ptr.release();
+ b_ = b_ptr.release();
+ white_point_ = wp_ptr.release();
+ return true;
+}
+
+uint64_t MasteringMetadata::PayloadSize() const {
+ uint64_t size = 0;
+
+ if (luminance_max_ != kValueNotPresent)
+ size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max_);
+ if (luminance_min_ != kValueNotPresent)
+ size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min_);
+
+ if (r_) {
+ size += r_->PrimaryChromaticitySize(libwebm::kMkvPrimaryRChromaticityX,
+ libwebm::kMkvPrimaryRChromaticityY);
+ }
+ if (g_) {
+ size += g_->PrimaryChromaticitySize(libwebm::kMkvPrimaryGChromaticityX,
+ libwebm::kMkvPrimaryGChromaticityY);
+ }
+ if (b_) {
+ size += b_->PrimaryChromaticitySize(libwebm::kMkvPrimaryBChromaticityX,
+ libwebm::kMkvPrimaryBChromaticityY);
+ }
+ if (white_point_) {
+ size += white_point_->PrimaryChromaticitySize(
+ libwebm::kMkvWhitePointChromaticityX,
+ libwebm::kMkvWhitePointChromaticityY);
+ }
+
+ return size;
+}
+
+uint64_t Colour::ColourSize() const {
+ uint64_t size = PayloadSize();
+
+ if (size > 0)
+ size += EbmlMasterElementSize(libwebm::kMkvColour, size);
+
+ return size;
+}
+
+bool Colour::Valid() const {
+ if (mastering_metadata_ && !mastering_metadata_->Valid())
+ return false;
+ if (matrix_coefficients_ != kValueNotPresent &&
+ !IsMatrixCoefficientsValueValid(matrix_coefficients_)) {
+ return false;
+ }
+ if (chroma_siting_horz_ != kValueNotPresent &&
+ !IsChromaSitingHorzValueValid(chroma_siting_horz_)) {
+ return false;
+ }
+ if (chroma_siting_vert_ != kValueNotPresent &&
+ !IsChromaSitingVertValueValid(chroma_siting_vert_)) {
+ return false;
+ }
+ if (range_ != kValueNotPresent && !IsColourRangeValueValid(range_))
+ return false;
+ if (transfer_characteristics_ != kValueNotPresent &&
+ !IsTransferCharacteristicsValueValid(transfer_characteristics_)) {
+ return false;
+ }
+ if (primaries_ != kValueNotPresent && !IsPrimariesValueValid(primaries_))
+ return false;
+
+ return true;
+}
+
+bool Colour::Write(IMkvWriter* writer) const {
+ const uint64_t size = PayloadSize();
+
+ // Don't write an empty element.
+ if (size == 0)
+ return true;
+
+ // Don't write an invalid element.
+ if (!Valid())
+ return false;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, size))
+ return false;
+
+ if (matrix_coefficients_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvMatrixCoefficients,
+ static_cast<uint64>(matrix_coefficients_))) {
+ return false;
+ }
+ if (bits_per_channel_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvBitsPerChannel,
+ static_cast<uint64>(bits_per_channel_))) {
+ return false;
+ }
+ if (chroma_subsampling_horz_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingHorz,
+ static_cast<uint64>(chroma_subsampling_horz_))) {
+ return false;
+ }
+ if (chroma_subsampling_vert_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingVert,
+ static_cast<uint64>(chroma_subsampling_vert_))) {
+ return false;
+ }
+
+ if (cb_subsampling_horz_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingHorz,
+ static_cast<uint64>(cb_subsampling_horz_))) {
+ return false;
+ }
+ if (cb_subsampling_vert_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingVert,
+ static_cast<uint64>(cb_subsampling_vert_))) {
+ return false;
+ }
+ if (chroma_siting_horz_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvChromaSitingHorz,
+ static_cast<uint64>(chroma_siting_horz_))) {
+ return false;
+ }
+ if (chroma_siting_vert_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvChromaSitingVert,
+ static_cast<uint64>(chroma_siting_vert_))) {
+ return false;
+ }
+ if (range_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvRange,
+ static_cast<uint64>(range_))) {
+ return false;
+ }
+ if (transfer_characteristics_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvTransferCharacteristics,
+ static_cast<uint64>(transfer_characteristics_))) {
+ return false;
+ }
+ if (primaries_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvPrimaries,
+ static_cast<uint64>(primaries_))) {
+ return false;
+ }
+ if (max_cll_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvMaxCLL,
+ static_cast<uint64>(max_cll_))) {
+ return false;
+ }
+ if (max_fall_ != kValueNotPresent &&
+ !WriteEbmlElement(writer, libwebm::kMkvMaxFALL,
+ static_cast<uint64>(max_fall_))) {
+ return false;
+ }
+
+ if (mastering_metadata_ && !mastering_metadata_->Write(writer))
+ return false;
+
+ return true;
+}
+
+bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
+ std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+ if (!mm_ptr.get())
+ return false;
+
+ mm_ptr->set_luminance_max(mastering_metadata.luminance_max());
+ mm_ptr->set_luminance_min(mastering_metadata.luminance_min());
+
+ if (!mm_ptr->SetChromaticity(mastering_metadata.r(), mastering_metadata.g(),
+ mastering_metadata.b(),
+ mastering_metadata.white_point())) {
+ return false;
+ }
+
+ delete mastering_metadata_;
+ mastering_metadata_ = mm_ptr.release();
+ return true;
+}
+
+uint64_t Colour::PayloadSize() const {
+ uint64_t size = 0;
+
+ if (matrix_coefficients_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvMatrixCoefficients,
+ static_cast<uint64>(matrix_coefficients_));
+ }
+ if (bits_per_channel_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvBitsPerChannel,
+ static_cast<uint64>(bits_per_channel_));
+ }
+ if (chroma_subsampling_horz_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvChromaSubsamplingHorz,
+ static_cast<uint64>(chroma_subsampling_horz_));
+ }
+ if (chroma_subsampling_vert_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvChromaSubsamplingVert,
+ static_cast<uint64>(chroma_subsampling_vert_));
+ }
+ if (cb_subsampling_horz_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvCbSubsamplingHorz,
+ static_cast<uint64>(cb_subsampling_horz_));
+ }
+ if (cb_subsampling_vert_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvCbSubsamplingVert,
+ static_cast<uint64>(cb_subsampling_vert_));
+ }
+ if (chroma_siting_horz_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvChromaSitingHorz,
+ static_cast<uint64>(chroma_siting_horz_));
+ }
+ if (chroma_siting_vert_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvChromaSitingVert,
+ static_cast<uint64>(chroma_siting_vert_));
+ }
+ if (range_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvRange, static_cast<uint64>(range_));
+ }
+ if (transfer_characteristics_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvTransferCharacteristics,
+ static_cast<uint64>(transfer_characteristics_));
+ }
+ if (primaries_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvPrimaries,
+ static_cast<uint64>(primaries_));
+ }
+ if (max_cll_ != kValueNotPresent) {
+ size += EbmlElementSize(libwebm::kMkvMaxCLL, static_cast<uint64>(max_cll_));
+ }
+ if (max_fall_ != kValueNotPresent) {
+ size +=
+ EbmlElementSize(libwebm::kMkvMaxFALL, static_cast<uint64>(max_fall_));
+ }
+
+ if (mastering_metadata_)
+ size += mastering_metadata_->MasteringMetadataSize();
+
+ return size;
+}
+
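A minimal sketch of populating these elements end to end, using only constructors and setters visible in this file; the chromaticity and luminance values are illustrative (BT.709-style primaries), not prescribed by the code:

#include "mkvmuxer/mkvmuxer.h"

bool AttachHdrMetadata(mkvmuxer::VideoTrack* video_track) {
  using mkvmuxer::PrimaryChromaticity;
  const PrimaryChromaticity r(0.640f, 0.330f);
  const PrimaryChromaticity g(0.300f, 0.600f);
  const PrimaryChromaticity b(0.150f, 0.060f);
  const PrimaryChromaticity wp(0.3127f, 0.3290f);

  mkvmuxer::MasteringMetadata mm;
  mm.set_luminance_min(0.01f);
  mm.set_luminance_max(1000.0f);
  if (!mm.SetChromaticity(&r, &g, &b, &wp))
    return false;

  mkvmuxer::Colour colour;
  if (!colour.SetMasteringMetadata(mm))
    return false;
  return video_track->SetColour(colour);
}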
+///////////////////////////////////////////////////////////////
+//
+// Projection element
+
+uint64_t Projection::ProjectionSize() const {
+ uint64_t size = PayloadSize();
+
+ if (size > 0)
+ size += EbmlMasterElementSize(libwebm::kMkvProjection, size);
+
+ return size;
+}
+
+bool Projection::Write(IMkvWriter* writer) const {
+ const uint64_t size = PayloadSize();
+
+ // Don't write an empty element.
+ if (size == 0)
+ return true;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvProjection, size))
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvProjectionType,
+ static_cast<uint64>(type_))) {
+ return false;
+ }
+
+ if (private_data_length_ > 0 && private_data_ != NULL &&
+ !WriteEbmlElement(writer, libwebm::kMkvProjectionPrivate, private_data_,
+ private_data_length_)) {
+ return false;
+ }
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseYaw, pose_yaw_))
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPosePitch,
+ pose_pitch_)) {
+ return false;
+ }
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseRoll, pose_roll_)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool Projection::SetProjectionPrivate(const uint8_t* data,
+ uint64_t data_length) {
+ if (data == NULL || data_length == 0) {
+ return false;
+ }
+
+ if (data_length != static_cast<size_t>(data_length)) {
+ return false;
+ }
+
+ uint8_t* new_private_data =
+ new (std::nothrow) uint8_t[static_cast<size_t>(data_length)];
+ if (new_private_data == NULL) {
+ return false;
+ }
+
+ delete[] private_data_;
+ private_data_ = new_private_data;
+ private_data_length_ = data_length;
+ memcpy(private_data_, data, static_cast<size_t>(data_length));
+
+ return true;
+}
+
+uint64_t Projection::PayloadSize() const {
+ uint64_t size =
+ EbmlElementSize(libwebm::kMkvProjection, static_cast<uint64>(type_));
+
+ if (private_data_length_ > 0 && private_data_ != NULL) {
+ size += EbmlElementSize(libwebm::kMkvProjectionPrivate, private_data_,
+ private_data_length_);
+ }
+
+ size += EbmlElementSize(libwebm::kMkvProjectionPoseYaw, pose_yaw_);
+ size += EbmlElementSize(libwebm::kMkvProjectionPosePitch, pose_pitch_);
+ size += EbmlElementSize(libwebm::kMkvProjectionPoseRoll, pose_roll_);
+
+ return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// VideoTrack Class
+
+VideoTrack::VideoTrack(unsigned int* seed)
+ : Track(seed),
+ display_height_(0),
+ display_width_(0),
+ pixel_height_(0),
+ pixel_width_(0),
+ crop_left_(0),
+ crop_right_(0),
+ crop_top_(0),
+ crop_bottom_(0),
+ frame_rate_(0.0),
+ height_(0),
+ stereo_mode_(0),
+ alpha_mode_(0),
+ width_(0),
+ colour_(NULL),
+ projection_(NULL) {}
+
+VideoTrack::~VideoTrack() {
+ delete colour_;
+ delete projection_;
+}
+
+bool VideoTrack::SetStereoMode(uint64_t stereo_mode) {
+ if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst &&
+ stereo_mode != kTopBottomRightIsFirst &&
+ stereo_mode != kTopBottomLeftIsFirst &&
+ stereo_mode != kSideBySideRightIsFirst)
+ return false;
+
+ stereo_mode_ = stereo_mode;
+ return true;
+}
+
+bool VideoTrack::SetAlphaMode(uint64_t alpha_mode) {
+ if (alpha_mode != kNoAlpha && alpha_mode != kAlpha)
+ return false;
+
+ alpha_mode_ = alpha_mode;
+ return true;
+}
+
+uint64_t VideoTrack::PayloadSize() const {
+ const uint64_t parent_size = Track::PayloadSize();
+
+ uint64_t size = VideoPayloadSize();
+ size += EbmlMasterElementSize(libwebm::kMkvVideo, size);
+
+ return parent_size + size;
+}
+
+bool VideoTrack::Write(IMkvWriter* writer) const {
+ if (!Track::Write(writer))
+ return false;
+
+ const uint64_t size = VideoPayloadSize();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvVideo, size))
+ return false;
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ if (!WriteEbmlElement(
+ writer, libwebm::kMkvPixelWidth,
+ static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_)))
+ return false;
+ if (!WriteEbmlElement(
+ writer, libwebm::kMkvPixelHeight,
+ static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_)))
+ return false;
+ if (display_width_ > 0) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth,
+ static_cast<uint64>(display_width_)))
+ return false;
+ }
+ if (display_height_ > 0) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight,
+ static_cast<uint64>(display_height_)))
+ return false;
+ }
+ if (crop_left_ > 0) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft,
+ static_cast<uint64>(crop_left_)))
+ return false;
+ }
+ if (crop_right_ > 0) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight,
+ static_cast<uint64>(crop_right_)))
+ return false;
+ }
+ if (crop_top_ > 0) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop,
+ static_cast<uint64>(crop_top_)))
+ return false;
+ }
+ if (crop_bottom_ > 0) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom,
+ static_cast<uint64>(crop_bottom_)))
+ return false;
+ }
+ if (stereo_mode_ > kMono) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode,
+ static_cast<uint64>(stereo_mode_)))
+ return false;
+ }
+ if (alpha_mode_ > kNoAlpha) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode,
+ static_cast<uint64>(alpha_mode_)))
+ return false;
+ }
+ if (frame_rate_ > 0.0) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate,
+ static_cast<float>(frame_rate_))) {
+ return false;
+ }
+ }
+ if (colour_) {
+ if (!colour_->Write(writer))
+ return false;
+ }
+ if (projection_) {
+ if (!projection_->Write(writer))
+ return false;
+ }
+
+ const int64_t stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64_t>(size)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool VideoTrack::SetColour(const Colour& colour) {
+ std::unique_ptr<Colour> colour_ptr(new Colour());
+ if (!colour_ptr.get())
+ return false;
+
+ if (colour.mastering_metadata()) {
+ if (!colour_ptr->SetMasteringMetadata(*colour.mastering_metadata()))
+ return false;
+ }
+
+ colour_ptr->set_matrix_coefficients(colour.matrix_coefficients());
+ colour_ptr->set_bits_per_channel(colour.bits_per_channel());
+ colour_ptr->set_chroma_subsampling_horz(colour.chroma_subsampling_horz());
+ colour_ptr->set_chroma_subsampling_vert(colour.chroma_subsampling_vert());
+ colour_ptr->set_cb_subsampling_horz(colour.cb_subsampling_horz());
+ colour_ptr->set_cb_subsampling_vert(colour.cb_subsampling_vert());
+ colour_ptr->set_chroma_siting_horz(colour.chroma_siting_horz());
+ colour_ptr->set_chroma_siting_vert(colour.chroma_siting_vert());
+ colour_ptr->set_range(colour.range());
+ colour_ptr->set_transfer_characteristics(colour.transfer_characteristics());
+ colour_ptr->set_primaries(colour.primaries());
+ colour_ptr->set_max_cll(colour.max_cll());
+ colour_ptr->set_max_fall(colour.max_fall());
+ delete colour_;
+ colour_ = colour_ptr.release();
+ return true;
+}
+
+bool VideoTrack::SetProjection(const Projection& projection) {
+ std::unique_ptr<Projection> projection_ptr(new Projection());
+ if (!projection_ptr.get())
+ return false;
+
+ if (projection.private_data()) {
+ if (!projection_ptr->SetProjectionPrivate(
+ projection.private_data(), projection.private_data_length())) {
+ return false;
+ }
+ }
+
+ projection_ptr->set_type(projection.type());
+ projection_ptr->set_pose_yaw(projection.pose_yaw());
+ projection_ptr->set_pose_pitch(projection.pose_pitch());
+ projection_ptr->set_pose_roll(projection.pose_roll());
+ delete projection_;
+ projection_ = projection_ptr.release();
+ return true;
+}
+
+uint64_t VideoTrack::VideoPayloadSize() const {
+ uint64_t size = EbmlElementSize(
+ libwebm::kMkvPixelWidth,
+ static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_));
+ size += EbmlElementSize(
+ libwebm::kMkvPixelHeight,
+ static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_));
+ if (display_width_ > 0)
+ size += EbmlElementSize(libwebm::kMkvDisplayWidth,
+ static_cast<uint64>(display_width_));
+ if (display_height_ > 0)
+ size += EbmlElementSize(libwebm::kMkvDisplayHeight,
+ static_cast<uint64>(display_height_));
+ if (crop_left_ > 0)
+ size += EbmlElementSize(libwebm::kMkvPixelCropLeft,
+ static_cast<uint64>(crop_left_));
+ if (crop_right_ > 0)
+ size += EbmlElementSize(libwebm::kMkvPixelCropRight,
+ static_cast<uint64>(crop_right_));
+ if (crop_top_ > 0)
+ size += EbmlElementSize(libwebm::kMkvPixelCropTop,
+ static_cast<uint64>(crop_top_));
+ if (crop_bottom_ > 0)
+ size += EbmlElementSize(libwebm::kMkvPixelCropBottom,
+ static_cast<uint64>(crop_bottom_));
+ if (stereo_mode_ > kMono)
+ size += EbmlElementSize(libwebm::kMkvStereoMode,
+ static_cast<uint64>(stereo_mode_));
+ if (alpha_mode_ > kNoAlpha)
+ size += EbmlElementSize(libwebm::kMkvAlphaMode,
+ static_cast<uint64>(alpha_mode_));
+ if (frame_rate_ > 0.0)
+ size += EbmlElementSize(libwebm::kMkvFrameRate,
+ static_cast<float>(frame_rate_));
+ if (colour_)
+ size += colour_->ColourSize();
+ if (projection_)
+ size += projection_->ProjectionSize();
+
+ return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// AudioTrack Class
+
+AudioTrack::AudioTrack(unsigned int* seed)
+ : Track(seed), bit_depth_(0), channels_(1), sample_rate_(0.0) {}
+
+AudioTrack::~AudioTrack() {}
+
+uint64_t AudioTrack::PayloadSize() const {
+ const uint64_t parent_size = Track::PayloadSize();
+
+ uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+ static_cast<float>(sample_rate_));
+ size +=
+ EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
+ if (bit_depth_ > 0)
+ size +=
+ EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
+ size += EbmlMasterElementSize(libwebm::kMkvAudio, size);
+
+ return parent_size + size;
+}
+
+bool AudioTrack::Write(IMkvWriter* writer) const {
+ if (!Track::Write(writer))
+ return false;
+
+ // Calculate AudioSettings size.
+ uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+ static_cast<float>(sample_rate_));
+ size +=
+ EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
+ if (bit_depth_ > 0)
+ size +=
+ EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, size))
+ return false;
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency,
+ static_cast<float>(sample_rate_)))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvChannels,
+ static_cast<uint64>(channels_)))
+ return false;
+ if (bit_depth_ > 0)
+ if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth,
+ static_cast<uint64>(bit_depth_)))
+ return false;
+
+ const int64_t stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64_t>(size))
+ return false;
+
+ return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Tracks Class
+
+const char Tracks::kOpusCodecId[] = "A_OPUS";
+const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+const char Tracks::kVp8CodecId[] = "V_VP8";
+const char Tracks::kVp9CodecId[] = "V_VP9";
+const char Tracks::kVp10CodecId[] = "V_VP10";
+const char Tracks::kAV1CodecId[] = "V_AV1";
+const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
+const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
+const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
+const char Tracks::kWebVttSubtitlesId[] = "D_WEBVTT/SUBTITLES";
+
+Tracks::Tracks()
+ : track_entries_(NULL), track_entries_size_(0), wrote_tracks_(false) {}
+
+Tracks::~Tracks() {
+ if (track_entries_) {
+ for (uint32_t i = 0; i < track_entries_size_; ++i) {
+ Track* const track = track_entries_[i];
+ delete track;
+ }
+ delete[] track_entries_;
+ }
+}
+
+bool Tracks::AddTrack(Track* track, int32_t number) {
+ if (number < 0 || wrote_tracks_)
+ return false;
+
+  // This muxer only supports track numbers in the range [1, 126]: a track
+  // number in that range fits in a single Matroska-coded integer, which
+  // allows the block header (of which the track number is a part) to be
+  // serialized for a frame using exactly 4 bytes.
+
+ if (number > 0x7E)
+ return false;
+
+ uint32_t track_num = number;
+
+ if (track_num > 0) {
+ // Check to make sure a track does not already have |track_num|.
+ for (uint32_t i = 0; i < track_entries_size_; ++i) {
+ if (track_entries_[i]->number() == track_num)
+ return false;
+ }
+ }
+
+ const uint32_t count = track_entries_size_ + 1;
+
+ Track** const track_entries = new (std::nothrow) Track*[count]; // NOLINT
+ if (!track_entries)
+ return false;
+
+ for (uint32_t i = 0; i < track_entries_size_; ++i) {
+ track_entries[i] = track_entries_[i];
+ }
+
+ delete[] track_entries_;
+
+  // Find the lowest available track number > 0.
+ if (track_num == 0) {
+ track_num = count;
+
+ // Check to make sure a track does not already have |track_num|.
+ bool exit = false;
+ do {
+ exit = true;
+ for (uint32_t i = 0; i < track_entries_size_; ++i) {
+ if (track_entries[i]->number() == track_num) {
+ track_num++;
+ exit = false;
+ break;
+ }
+ }
+ } while (!exit);
+ }
+ track->set_number(track_num);
+
+ track_entries_ = track_entries;
+ track_entries_[track_entries_size_] = track;
+ track_entries_size_ = count;
+ return true;
+}
+
+const Track* Tracks::GetTrackByIndex(uint32_t index) const {
+ if (track_entries_ == NULL)
+ return NULL;
+
+ if (index >= track_entries_size_)
+ return NULL;
+
+ return track_entries_[index];
+}
+
+Track* Tracks::GetTrackByNumber(uint64_t track_number) const {
+ const int32_t count = track_entries_size();
+ for (int32_t i = 0; i < count; ++i) {
+ if (track_entries_[i]->number() == track_number)
+ return track_entries_[i];
+ }
+
+ return NULL;
+}
+
+bool Tracks::TrackIsAudio(uint64_t track_number) const {
+ const Track* const track = GetTrackByNumber(track_number);
+
+ if (track && track->type() == kAudio)
+ return true;
+
+ return false;
+}
+
+bool Tracks::TrackIsVideo(uint64_t track_number) const {
+ const Track* const track = GetTrackByNumber(track_number);
+
+ if (track && track->type() == kVideo)
+ return true;
+
+ return false;
+}
+
+bool Tracks::Write(IMkvWriter* writer) const {
+ uint64_t size = 0;
+ const int32_t count = track_entries_size();
+ for (int32_t i = 0; i < count; ++i) {
+ const Track* const track = GetTrackByIndex(i);
+
+ if (!track)
+ return false;
+
+ size += track->Size();
+ }
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvTracks, size))
+ return false;
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ for (int32_t i = 0; i < count; ++i) {
+ const Track* const track = GetTrackByIndex(i);
+ if (!track->Write(writer))
+ return false;
+ }
+
+ const int64_t stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64_t>(size))
+ return false;
+
+ wrote_tracks_ = true;
+ return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Chapter Class
+
+bool Chapter::set_id(const char* id) { return StrCpy(id, &id_); }
+
+void Chapter::set_time(const Segment& segment, uint64_t start_ns,
+ uint64_t end_ns) {
+ const SegmentInfo* const info = segment.GetSegmentInfo();
+ const uint64_t timecode_scale = info->timecode_scale();
+ start_timecode_ = start_ns / timecode_scale;
+ end_timecode_ = end_ns / timecode_scale;
+}
+
+bool Chapter::add_string(const char* title, const char* language,
+ const char* country) {
+ if (!ExpandDisplaysArray())
+ return false;
+
+ Display& d = displays_[displays_count_++];
+ d.Init();
+
+ if (!d.set_title(title))
+ return false;
+
+ if (!d.set_language(language))
+ return false;
+
+ if (!d.set_country(country))
+ return false;
+
+ return true;
+}
+
+Chapter::Chapter() {
+ // This ctor only constructs the object. Proper initialization is
+ // done in Init() (called in Chapters::AddChapter()). The only
+ // reason we bother implementing this ctor is because we had to
+ // declare it as private (along with the dtor), in order to prevent
+ // clients from creating Chapter instances (a privilege we grant
+ // only to the Chapters class). Doing no initialization here also
+ // means that creating arrays of chapter objects is more efficient,
+ // because we only initialize each new chapter object as it becomes
+ // active on the array.
+}
+
+Chapter::~Chapter() {}
+
+void Chapter::Init(unsigned int* seed) {
+ id_ = NULL;
+ start_timecode_ = 0;
+ end_timecode_ = 0;
+ displays_ = NULL;
+ displays_size_ = 0;
+ displays_count_ = 0;
+ uid_ = MakeUID(seed);
+}
+
+void Chapter::ShallowCopy(Chapter* dst) const {
+ dst->id_ = id_;
+ dst->start_timecode_ = start_timecode_;
+ dst->end_timecode_ = end_timecode_;
+ dst->uid_ = uid_;
+ dst->displays_ = displays_;
+ dst->displays_size_ = displays_size_;
+ dst->displays_count_ = displays_count_;
+}
+
+void Chapter::Clear() {
+ StrCpy(NULL, &id_);
+
+ while (displays_count_ > 0) {
+ Display& d = displays_[--displays_count_];
+ d.Clear();
+ }
+
+ delete[] displays_;
+ displays_ = NULL;
+
+ displays_size_ = 0;
+}
+
+bool Chapter::ExpandDisplaysArray() {
+ if (displays_size_ > displays_count_)
+ return true; // nothing to do yet
+
+ const int size = (displays_size_ == 0) ? 1 : 2 * displays_size_;
+
+ Display* const displays = new (std::nothrow) Display[size]; // NOLINT
+ if (displays == NULL)
+ return false;
+
+ for (int idx = 0; idx < displays_count_; ++idx) {
+ displays[idx] = displays_[idx]; // shallow copy
+ }
+
+ delete[] displays_;
+
+ displays_ = displays;
+ displays_size_ = size;
+
+ return true;
+}
+
+uint64_t Chapter::WriteAtom(IMkvWriter* writer) const {
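+ // Note: when |writer| is NULL this only computes and returns the size of
+ // the atom; otherwise the atom is written and its size is returned, with
+ // 0 indicating an error.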
+ uint64_t payload_size =
+ EbmlElementSize(libwebm::kMkvChapterStringUID, id_) +
+ EbmlElementSize(libwebm::kMkvChapterUID, static_cast<uint64>(uid_)) +
+ EbmlElementSize(libwebm::kMkvChapterTimeStart,
+ static_cast<uint64>(start_timecode_)) +
+ EbmlElementSize(libwebm::kMkvChapterTimeEnd,
+ static_cast<uint64>(end_timecode_));
+
+ for (int idx = 0; idx < displays_count_; ++idx) {
+ const Display& d = displays_[idx];
+ payload_size += d.WriteDisplay(NULL);
+ }
+
+ const uint64_t atom_size =
+ EbmlMasterElementSize(libwebm::kMkvChapterAtom, payload_size) +
+ payload_size;
+
+ if (writer == NULL)
+ return atom_size;
+
+ const int64_t start = writer->Position();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterAtom, payload_size))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID,
+ static_cast<uint64>(uid_)))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart,
+ static_cast<uint64>(start_timecode_)))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd,
+ static_cast<uint64>(end_timecode_)))
+ return 0;
+
+ for (int idx = 0; idx < displays_count_; ++idx) {
+ const Display& d = displays_[idx];
+
+ if (!d.WriteDisplay(writer))
+ return 0;
+ }
+
+ const int64_t stop = writer->Position();
+
+ if (stop >= start && uint64_t(stop - start) != atom_size)
+ return 0;
+
+ return atom_size;
+}
+
+void Chapter::Display::Init() {
+ title_ = NULL;
+ language_ = NULL;
+ country_ = NULL;
+}
+
+void Chapter::Display::Clear() {
+ StrCpy(NULL, &title_);
+ StrCpy(NULL, &language_);
+ StrCpy(NULL, &country_);
+}
+
+bool Chapter::Display::set_title(const char* title) {
+ return StrCpy(title, &title_);
+}
+
+bool Chapter::Display::set_language(const char* language) {
+ return StrCpy(language, &language_);
+}
+
+bool Chapter::Display::set_country(const char* country) {
+ return StrCpy(country, &country_);
+}
+
+uint64_t Chapter::Display::WriteDisplay(IMkvWriter* writer) const {
+ uint64_t payload_size = EbmlElementSize(libwebm::kMkvChapString, title_);
+
+ if (language_)
+ payload_size += EbmlElementSize(libwebm::kMkvChapLanguage, language_);
+
+ if (country_)
+ payload_size += EbmlElementSize(libwebm::kMkvChapCountry, country_);
+
+ const uint64_t display_size =
+ EbmlMasterElementSize(libwebm::kMkvChapterDisplay, payload_size) +
+ payload_size;
+
+ if (writer == NULL)
+ return display_size;
+
+ const int64_t start = writer->Position();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterDisplay,
+ payload_size))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapString, title_))
+ return 0;
+
+ if (language_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapLanguage, language_))
+ return 0;
+ }
+
+ if (country_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvChapCountry, country_))
+ return 0;
+ }
+
+ const int64_t stop = writer->Position();
+
+ if (stop >= start && uint64_t(stop - start) != display_size)
+ return 0;
+
+ return display_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Chapters Class
+
+Chapters::Chapters() : chapters_size_(0), chapters_count_(0), chapters_(NULL) {}
+
+Chapters::~Chapters() {
+ while (chapters_count_ > 0) {
+ Chapter& chapter = chapters_[--chapters_count_];
+ chapter.Clear();
+ }
+
+ delete[] chapters_;
+ chapters_ = NULL;
+}
+
+int Chapters::Count() const { return chapters_count_; }
+
+Chapter* Chapters::AddChapter(unsigned int* seed) {
+ if (!ExpandChaptersArray())
+ return NULL;
+
+ Chapter& chapter = chapters_[chapters_count_++];
+ chapter.Init(seed);
+
+ return &chapter;
+}
+
+bool Chapters::Write(IMkvWriter* writer) const {
+ if (writer == NULL)
+ return false;
+
+ const uint64_t payload_size = WriteEdition(NULL); // return size only
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapters, payload_size))
+ return false;
+
+ const int64_t start = writer->Position();
+
+ if (WriteEdition(writer) == 0) // error
+ return false;
+
+ const int64_t stop = writer->Position();
+
+ if (stop >= start && uint64_t(stop - start) != payload_size)
+ return false;
+
+ return true;
+}
+
+bool Chapters::ExpandChaptersArray() {
+ if (chapters_size_ > chapters_count_)
+ return true; // nothing to do yet
+
+ const int size = (chapters_size_ == 0) ? 1 : 2 * chapters_size_;
+
+ Chapter* const chapters = new (std::nothrow) Chapter[size]; // NOLINT
+ if (chapters == NULL)
+ return false;
+
+ for (int idx = 0; idx < chapters_count_; ++idx) {
+ const Chapter& src = chapters_[idx];
+ Chapter* const dst = chapters + idx;
+ src.ShallowCopy(dst);
+ }
+
+ delete[] chapters_;
+
+ chapters_ = chapters;
+ chapters_size_ = size;
+
+ return true;
+}
+
+uint64_t Chapters::WriteEdition(IMkvWriter* writer) const {
+ uint64_t payload_size = 0;
+
+ for (int idx = 0; idx < chapters_count_; ++idx) {
+ const Chapter& chapter = chapters_[idx];
+ payload_size += chapter.WriteAtom(NULL);
+ }
+
+ const uint64_t edition_size =
+ EbmlMasterElementSize(libwebm::kMkvEditionEntry, payload_size) +
+ payload_size;
+
+ if (writer == NULL) // return size only
+ return edition_size;
+
+ const int64_t start = writer->Position();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvEditionEntry, payload_size))
+ return 0; // error
+
+ for (int idx = 0; idx < chapters_count_; ++idx) {
+ const Chapter& chapter = chapters_[idx];
+
+ const uint64_t chapter_size = chapter.WriteAtom(writer);
+ if (chapter_size == 0) // error
+ return 0;
+ }
+
+ const int64_t stop = writer->Position();
+
+ if (stop >= start && uint64_t(stop - start) != edition_size)
+ return 0;
+
+ return edition_size;
+}
+
+// Tag Class
+
+bool Tag::add_simple_tag(const char* tag_name, const char* tag_string) {
+ if (!ExpandSimpleTagsArray())
+ return false;
+
+ SimpleTag& st = simple_tags_[simple_tags_count_++];
+ st.Init();
+
+ if (!st.set_tag_name(tag_name))
+ return false;
+
+ if (!st.set_tag_string(tag_string))
+ return false;
+
+ return true;
+}
+
+Tag::Tag() {
+ simple_tags_ = NULL;
+ simple_tags_size_ = 0;
+ simple_tags_count_ = 0;
+}
+
+Tag::~Tag() {}
+
+void Tag::ShallowCopy(Tag* dst) const {
+ dst->simple_tags_ = simple_tags_;
+ dst->simple_tags_size_ = simple_tags_size_;
+ dst->simple_tags_count_ = simple_tags_count_;
+}
+
+void Tag::Clear() {
+ while (simple_tags_count_ > 0) {
+ SimpleTag& st = simple_tags_[--simple_tags_count_];
+ st.Clear();
+ }
+
+ delete[] simple_tags_;
+ simple_tags_ = NULL;
+
+ simple_tags_size_ = 0;
+}
+
+bool Tag::ExpandSimpleTagsArray() {
+ if (simple_tags_size_ > simple_tags_count_)
+ return true; // nothing to do yet
+
+ const int size = (simple_tags_size_ == 0) ? 1 : 2 * simple_tags_size_;
+
+ SimpleTag* const simple_tags = new (std::nothrow) SimpleTag[size]; // NOLINT
+ if (simple_tags == NULL)
+ return false;
+
+ for (int idx = 0; idx < simple_tags_count_; ++idx) {
+ simple_tags[idx] = simple_tags_[idx]; // shallow copy
+ }
+
+ delete[] simple_tags_;
+
+ simple_tags_ = simple_tags;
+ simple_tags_size_ = size;
+
+ return true;
+}
+
+uint64_t Tag::Write(IMkvWriter* writer) const {
+ uint64_t payload_size = 0;
+
+ for (int idx = 0; idx < simple_tags_count_; ++idx) {
+ const SimpleTag& st = simple_tags_[idx];
+ payload_size += st.Write(NULL);
+ }
+
+ const uint64_t tag_size =
+ EbmlMasterElementSize(libwebm::kMkvTag, payload_size) + payload_size;
+
+ if (writer == NULL)
+ return tag_size;
+
+ const int64_t start = writer->Position();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvTag, payload_size))
+ return 0;
+
+ for (int idx = 0; idx < simple_tags_count_; ++idx) {
+ const SimpleTag& st = simple_tags_[idx];
+
+ if (!st.Write(writer))
+ return 0;
+ }
+
+ const int64_t stop = writer->Position();
+
+ if (stop >= start && uint64_t(stop - start) != tag_size)
+ return 0;
+
+ return tag_size;
+}
+
+// Tag::SimpleTag
+
+void Tag::SimpleTag::Init() {
+ tag_name_ = NULL;
+ tag_string_ = NULL;
+}
+
+void Tag::SimpleTag::Clear() {
+ StrCpy(NULL, &tag_name_);
+ StrCpy(NULL, &tag_string_);
+}
+
+bool Tag::SimpleTag::set_tag_name(const char* tag_name) {
+ return StrCpy(tag_name, &tag_name_);
+}
+
+bool Tag::SimpleTag::set_tag_string(const char* tag_string) {
+ return StrCpy(tag_string, &tag_string_);
+}
+
+uint64_t Tag::SimpleTag::Write(IMkvWriter* writer) const {
+ uint64_t payload_size = EbmlElementSize(libwebm::kMkvTagName, tag_name_);
+
+ payload_size += EbmlElementSize(libwebm::kMkvTagString, tag_string_);
+
+ const uint64_t simple_tag_size =
+ EbmlMasterElementSize(libwebm::kMkvSimpleTag, payload_size) +
+ payload_size;
+
+ if (writer == NULL)
+ return simple_tag_size;
+
+ const int64_t start = writer->Position();
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvSimpleTag, payload_size))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvTagName, tag_name_))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvTagString, tag_string_))
+ return 0;
+
+ const int64_t stop = writer->Position();
+
+ if (stop >= start && uint64_t(stop - start) != simple_tag_size)
+ return 0;
+
+ return simple_tag_size;
+}
+
+// Tags Class
+
+Tags::Tags() : tags_size_(0), tags_count_(0), tags_(NULL) {}
+
+Tags::~Tags() {
+ while (tags_count_ > 0) {
+ Tag& tag = tags_[--tags_count_];
+ tag.Clear();
+ }
+
+ delete[] tags_;
+ tags_ = NULL;
+}
+
+int Tags::Count() const { return tags_count_; }
+
+Tag* Tags::AddTag() {
+ if (!ExpandTagsArray())
+ return NULL;
+
+ Tag& tag = tags_[tags_count_++];
+
+ return &tag;
+}
+
+bool Tags::Write(IMkvWriter* writer) const {
+ if (writer == NULL)
+ return false;
+
+ uint64_t payload_size = 0;
+
+ for (int idx = 0; idx < tags_count_; ++idx) {
+ const Tag& tag = tags_[idx];
+ payload_size += tag.Write(NULL);
+ }
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvTags, payload_size))
+ return false;
+
+ const int64_t start = writer->Position();
+
+ for (int idx = 0; idx < tags_count_; ++idx) {
+ const Tag& tag = tags_[idx];
+
+ const uint64_t tag_size = tag.Write(writer);
+ if (tag_size == 0) // error
+ return false;
+ }
+
+ const int64_t stop = writer->Position();
+
+ if (stop >= start && uint64_t(stop - start) != payload_size)
+ return false;
+
+ return true;
+}
+
+bool Tags::ExpandTagsArray() {
+ if (tags_size_ > tags_count_)
+ return true; // nothing to do yet
+
+ const int size = (tags_size_ == 0) ? 1 : 2 * tags_size_;
+
+ Tag* const tags = new (std::nothrow) Tag[size]; // NOLINT
+ if (tags == NULL)
+ return false;
+
+ for (int idx = 0; idx < tags_count_; ++idx) {
+ const Tag& src = tags_[idx];
+ Tag* const dst = tags + idx;
+ src.ShallowCopy(dst);
+ }
+
+ delete[] tags_;
+
+ tags_ = tags;
+ tags_size_ = size;
+
+ return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Cluster class
+
+Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,
+ bool write_last_frame_with_duration, bool fixed_size_timecode)
+ : blocks_added_(0),
+ finalized_(false),
+ fixed_size_timecode_(fixed_size_timecode),
+ header_written_(false),
+ payload_size_(0),
+ position_for_cues_(cues_pos),
+ size_position_(-1),
+ timecode_(timecode),
+ timecode_scale_(timecode_scale),
+ write_last_frame_with_duration_(write_last_frame_with_duration),
+ writer_(NULL) {}
+
+Cluster::~Cluster() {
+ // Delete any stored frames that are left behind. This will happen if the
+ // Cluster was not Finalized for whatever reason.
+ while (!stored_frames_.empty()) {
+ while (!stored_frames_.begin()->second.empty()) {
+ delete stored_frames_.begin()->second.front();
+ stored_frames_.begin()->second.pop_front();
+ }
+ stored_frames_.erase(stored_frames_.begin()->first);
+ }
+}
+
+bool Cluster::Init(IMkvWriter* ptr_writer) {
+ if (!ptr_writer) {
+ return false;
+ }
+ writer_ = ptr_writer;
+ return true;
+}
+
+bool Cluster::AddFrame(const Frame* const frame) {
+ return QueueOrWriteFrame(frame);
+}
+
+bool Cluster::AddFrame(const uint8_t* data, uint64_t length,
+ uint64_t track_number, uint64_t abs_timecode,
+ bool is_key) {
+ Frame frame;
+ if (!frame.Init(data, length))
+ return false;
+ frame.set_track_number(track_number);
+ frame.set_timestamp(abs_timecode);
+ frame.set_is_key(is_key);
+ return QueueOrWriteFrame(&frame);
+}
+
+bool Cluster::AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+ const uint8_t* additional,
+ uint64_t additional_length,
+ uint64_t add_id, uint64_t track_number,
+ uint64_t abs_timecode, bool is_key) {
+ if (!additional || additional_length == 0) {
+ return false;
+ }
+ Frame frame;
+ if (!frame.Init(data, length) ||
+ !frame.AddAdditionalData(additional, additional_length, add_id)) {
+ return false;
+ }
+ frame.set_track_number(track_number);
+ frame.set_timestamp(abs_timecode);
+ frame.set_is_key(is_key);
+ return QueueOrWriteFrame(&frame);
+}
+
+bool Cluster::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+ int64_t discard_padding,
+ uint64_t track_number,
+ uint64_t abs_timecode, bool is_key) {
+ Frame frame;
+ if (!frame.Init(data, length))
+ return false;
+ frame.set_discard_padding(discard_padding);
+ frame.set_track_number(track_number);
+ frame.set_timestamp(abs_timecode);
+ frame.set_is_key(is_key);
+ return QueueOrWriteFrame(&frame);
+}
+
+bool Cluster::AddMetadata(const uint8_t* data, uint64_t length,
+ uint64_t track_number, uint64_t abs_timecode,
+ uint64_t duration_timecode) {
+ Frame frame;
+ if (!frame.Init(data, length))
+ return false;
+ frame.set_track_number(track_number);
+ frame.set_timestamp(abs_timecode);
+ frame.set_duration(duration_timecode);
+ frame.set_is_key(true); // All metadata blocks are keyframes.
+ return QueueOrWriteFrame(&frame);
+}
+
+void Cluster::AddPayloadSize(uint64_t size) { payload_size_ += size; }
+
+bool Cluster::Finalize() {
+ return !write_last_frame_with_duration_ && Finalize(false, 0);
+}
+
+bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) {
+ if (!writer_ || finalized_)
+ return false;
+
+ if (write_last_frame_with_duration_) {
+ // Write out held back Frames. This essentially performs a k-way merge
+ // across all tracks in the increasing order of timestamps.
+ while (!stored_frames_.empty()) {
+ Frame* frame = stored_frames_.begin()->second.front();
+
+ // Get the next frame to write (frame with least timestamp across all
+ // tracks).
+ for (FrameMapIterator frames_iterator = ++stored_frames_.begin();
+ frames_iterator != stored_frames_.end(); ++frames_iterator) {
+ if (frames_iterator->second.front()->timestamp() < frame->timestamp()) {
+ frame = frames_iterator->second.front();
+ }
+ }
+
+ // Set the duration if it's the last frame for the track.
+ if (set_last_frame_duration &&
+ stored_frames_[frame->track_number()].size() == 1 &&
+ !frame->duration_set()) {
+ frame->set_duration(duration - frame->timestamp());
+ if (!frame->is_key() && !frame->reference_block_timestamp_set()) {
+ frame->set_reference_block_timestamp(
+ last_block_timestamp_[frame->track_number()]);
+ }
+ }
+
+ // Write the frame and remove it from |stored_frames_|.
+ const bool wrote_frame = DoWriteFrame(frame);
+ stored_frames_[frame->track_number()].pop_front();
+ if (stored_frames_[frame->track_number()].empty()) {
+ stored_frames_.erase(frame->track_number());
+ }
+ delete frame;
+ if (!wrote_frame)
+ return false;
+ }
+ }
+
+ if (size_position_ == -1)
+ return false;
+
+ if (writer_->Seekable()) {
+ const int64_t pos = writer_->Position();
+
+ if (writer_->Position(size_position_))
+ return false;
+
+ if (WriteUIntSize(writer_, payload_size(), 8))
+ return false;
+
+ if (writer_->Position(pos))
+ return false;
+ }
+
+ finalized_ = true;
+
+ return true;
+}
+
+uint64_t Cluster::Size() const {
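+ // The cluster payload size is always serialized as an 8-byte "unknown"
+ // value (see WriteClusterHeader()), so compute the master-element overhead
+ // for the largest possible payload.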
+ const uint64_t element_size =
+ EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) +
+ payload_size_;
+ return element_size;
+}
+
+bool Cluster::PreWriteBlock() {
+ if (finalized_)
+ return false;
+
+ if (!header_written_) {
+ if (!WriteClusterHeader())
+ return false;
+ }
+
+ return true;
+}
+
+void Cluster::PostWriteBlock(uint64_t element_size) {
+ AddPayloadSize(element_size);
+ ++blocks_added_;
+}
+
+int64_t Cluster::GetRelativeTimecode(int64_t abs_timecode) const {
+ const int64_t cluster_timecode = this->Cluster::timecode();
+ const int64_t rel_timecode =
+ static_cast<int64_t>(abs_timecode) - cluster_timecode;
+
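+ // The relative timecode must fit in the signed 16-bit timecode field of a
+ // block header (see kMaxBlockTimecode).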
+ if (rel_timecode < 0 || rel_timecode > kMaxBlockTimecode)
+ return -1;
+
+ return rel_timecode;
+}
+
+bool Cluster::DoWriteFrame(const Frame* const frame) {
+ if (!frame || !frame->IsValid())
+ return false;
+
+ if (!PreWriteBlock())
+ return false;
+
+ const uint64_t element_size = WriteFrame(writer_, frame, this);
+ if (element_size == 0)
+ return false;
+
+ PostWriteBlock(element_size);
+ last_block_timestamp_[frame->track_number()] = frame->timestamp();
+ return true;
+}
+
+bool Cluster::QueueOrWriteFrame(const Frame* const frame) {
+ if (!frame || !frame->IsValid())
+ return false;
+
+ // If |write_last_frame_with_duration_| is not set, then write the frame right
+ // away.
+ if (!write_last_frame_with_duration_) {
+ return DoWriteFrame(frame);
+ }
+
+ // Queue the current frame.
+ uint64_t track_number = frame->track_number();
+ Frame* const frame_to_store = new Frame();
+ frame_to_store->CopyFrom(*frame);
+ stored_frames_[track_number].push_back(frame_to_store);
+
+ // Iterate through all queued frames in the current track except the last
+ // one, and write each of them if it is okay to do so, i.e. if no other
+ // track has a held-back frame with a timestamp <= the timestamp of the
+ // frame in question.
+ std::vector<std::list<Frame*>::iterator> frames_to_erase;
+ for (std::list<Frame*>::iterator
+ current_track_iterator = stored_frames_[track_number].begin(),
+ end = --stored_frames_[track_number].end();
+ current_track_iterator != end; ++current_track_iterator) {
+ const Frame* const frame_to_write = *current_track_iterator;
+ bool okay_to_write = true;
+ for (FrameMapIterator track_iterator = stored_frames_.begin();
+ track_iterator != stored_frames_.end(); ++track_iterator) {
+ if (track_iterator->first == track_number) {
+ continue;
+ }
+ if (track_iterator->second.front()->timestamp() <
+ frame_to_write->timestamp()) {
+ okay_to_write = false;
+ break;
+ }
+ }
+ if (okay_to_write) {
+ const bool wrote_frame = DoWriteFrame(frame_to_write);
+ delete frame_to_write;
+ if (!wrote_frame)
+ return false;
+ frames_to_erase.push_back(current_track_iterator);
+ } else {
+ break;
+ }
+ }
+ for (std::vector<std::list<Frame*>::iterator>::iterator iterator =
+ frames_to_erase.begin();
+ iterator != frames_to_erase.end(); ++iterator) {
+ stored_frames_[track_number].erase(*iterator);
+ }
+ return true;
+}
+
+bool Cluster::WriteClusterHeader() {
+ if (finalized_)
+ return false;
+
+ if (WriteID(writer_, libwebm::kMkvCluster))
+ return false;
+
+ // Save for later.
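+ // Finalize() seeks back to |size_position_| (when the writer is seekable)
+ // and overwrites the unknown size written below with the real payload
+ // size.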
+ size_position_ = writer_->Position();
+
+ // Write "unknown" (EBML coded -1) as cluster size value. We need to write 8
+ // bytes because we do not know how big our cluster will be.
+ if (SerializeInt(writer_, kEbmlUnknownValue, 8))
+ return false;
+
+ if (!WriteEbmlElement(writer_, libwebm::kMkvTimecode, timecode(),
+ fixed_size_timecode_ ? 8 : 0)) {
+ return false;
+ }
+ AddPayloadSize(EbmlElementSize(libwebm::kMkvTimecode, timecode(),
+ fixed_size_timecode_ ? 8 : 0));
+ header_written_ = true;
+
+ return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// SeekHead Class
+
+SeekHead::SeekHead() : start_pos_(0ULL) {
+ for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+ seek_entry_id_[i] = 0;
+ seek_entry_pos_[i] = 0;
+ }
+}
+
+SeekHead::~SeekHead() {}
+
+bool SeekHead::Finalize(IMkvWriter* writer) const {
+ if (writer->Seekable()) {
+ if (start_pos_ == -1)
+ return false;
+
+ uint64_t payload_size = 0;
+ uint64_t entry_size[kSeekEntryCount];
+
+ for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+ if (seek_entry_id_[i] != 0) {
+ entry_size[i] = EbmlElementSize(libwebm::kMkvSeekID,
+ static_cast<uint64>(seek_entry_id_[i]));
+ entry_size[i] += EbmlElementSize(
+ libwebm::kMkvSeekPosition, static_cast<uint64>(seek_entry_pos_[i]));
+
+ payload_size +=
+ EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) +
+ entry_size[i];
+ }
+ }
+
+ // No SeekHead elements
+ if (payload_size == 0)
+ return true;
+
+ const int64_t pos = writer->Position();
+ if (writer->Position(start_pos_))
+ return false;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeekHead, payload_size))
+ return false;
+
+ for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+ if (seek_entry_id_[i] != 0) {
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeek, entry_size[i]))
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvSeekID,
+ static_cast<uint64>(seek_entry_id_[i])))
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvSeekPosition,
+ static_cast<uint64>(seek_entry_pos_[i])))
+ return false;
+ }
+ }
+
+ const uint64_t total_entry_size = kSeekEntryCount * MaxEntrySize();
+ const uint64_t total_size =
+ EbmlMasterElementSize(libwebm::kMkvSeekHead, total_entry_size) +
+ total_entry_size;
+ const int64_t size_left = total_size - (writer->Position() - start_pos_);
+
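+ // Pad the rest of the space reserved by Write() with a Void element so
+ // that the byte layout of the file is unchanged.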
+ const uint64_t bytes_written = WriteVoidElement(writer, size_left);
+ if (!bytes_written)
+ return false;
+
+ if (writer->Position(pos))
+ return false;
+ }
+
+ return true;
+}
+
+bool SeekHead::Write(IMkvWriter* writer) {
+ const uint64_t entry_size = kSeekEntryCount * MaxEntrySize();
+ const uint64_t size =
+ EbmlMasterElementSize(libwebm::kMkvSeekHead, entry_size);
+
+ start_pos_ = writer->Position();
+
+ const uint64_t bytes_written = WriteVoidElement(writer, size + entry_size);
+ if (!bytes_written)
+ return false;
+
+ return true;
+}
+
+bool SeekHead::AddSeekEntry(uint32_t id, uint64_t pos) {
+ for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+ if (seek_entry_id_[i] == 0) {
+ seek_entry_id_[i] = id;
+ seek_entry_pos_[i] = pos;
+ return true;
+ }
+ }
+ return false;
+}
+
+uint32_t SeekHead::GetId(int index) const {
+ if (index < 0 || index >= kSeekEntryCount)
+ return UINT_MAX;
+ return seek_entry_id_[index];
+}
+
+uint64_t SeekHead::GetPosition(int index) const {
+ if (index < 0 || index >= kSeekEntryCount)
+ return ULLONG_MAX;
+ return seek_entry_pos_[index];
+}
+
+bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) {
+ if (index < 0 || index >= kSeekEntryCount)
+ return false;
+ seek_entry_id_[index] = id;
+ seek_entry_pos_[index] = position;
+ return true;
+}
+
+uint64_t SeekHead::MaxEntrySize() const {
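+ // Worst case per entry: a 4-byte SeekID plus an 8-byte SeekPosition,
+ // wrapped in a Seek master element.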
+ const uint64_t max_entry_payload_size =
+ EbmlElementSize(libwebm::kMkvSeekID,
+ static_cast<uint64>(UINT64_C(0xffffffff))) +
+ EbmlElementSize(libwebm::kMkvSeekPosition,
+ static_cast<uint64>(UINT64_C(0xffffffffffffffff)));
+ const uint64_t max_entry_size =
+ EbmlMasterElementSize(libwebm::kMkvSeek, max_entry_payload_size) +
+ max_entry_payload_size;
+
+ return max_entry_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// SegmentInfo Class
+
+SegmentInfo::SegmentInfo()
+ : duration_(-1.0),
+ muxing_app_(NULL),
+ timecode_scale_(1000000ULL),
+ writing_app_(NULL),
+ date_utc_(LLONG_MIN),
+ duration_pos_(-1) {}
+
+SegmentInfo::~SegmentInfo() {
+ delete[] muxing_app_;
+ delete[] writing_app_;
+}
+
+bool SegmentInfo::Init() {
+ int32_t major;
+ int32_t minor;
+ int32_t build;
+ int32_t revision;
+ GetVersion(&major, &minor, &build, &revision);
+ char temp[256];
+#ifdef _MSC_VER
+ sprintf_s(temp, sizeof(temp) / sizeof(temp[0]), "libwebm-%d.%d.%d.%d", major,
+ minor, build, revision);
+#else
+ snprintf(temp, sizeof(temp) / sizeof(temp[0]), "libwebm-%d.%d.%d.%d", major,
+ minor, build, revision);
+#endif
+
+ const size_t app_len = strlen(temp) + 1;
+
+ delete[] muxing_app_;
+
+ muxing_app_ = new (std::nothrow) char[app_len]; // NOLINT
+ if (!muxing_app_)
+ return false;
+
+#ifdef _MSC_VER
+ strcpy_s(muxing_app_, app_len, temp);
+#else
+ strcpy(muxing_app_, temp);
+#endif
+
+ set_writing_app(temp);
+ if (!writing_app_)
+ return false;
+ return true;
+}
+
+bool SegmentInfo::Finalize(IMkvWriter* writer) const {
+ if (!writer)
+ return false;
+
+ if (duration_ > 0.0) {
+ if (writer->Seekable()) {
+ if (duration_pos_ == -1)
+ return false;
+
+ const int64_t pos = writer->Position();
+
+ if (writer->Position(duration_pos_))
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+ static_cast<float>(duration_)))
+ return false;
+
+ if (writer->Position(pos))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool SegmentInfo::Write(IMkvWriter* writer) {
+ if (!writer || !muxing_app_ || !writing_app_)
+ return false;
+
+ uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale,
+ static_cast<uint64>(timecode_scale_));
+ if (duration_ > 0.0)
+ size +=
+ EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_));
+ if (date_utc_ != LLONG_MIN)
+ size += EbmlDateElementSize(libwebm::kMkvDateUTC);
+ size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_);
+ size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_);
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvInfo, size))
+ return false;
+
+ const int64_t payload_position = writer->Position();
+ if (payload_position < 0)
+ return false;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale,
+ static_cast<uint64>(timecode_scale_)))
+ return false;
+
+ if (duration_ > 0.0) {
+ // Save for later
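+ // Finalize() seeks back to |duration_pos_| (when the writer is seekable)
+ // and rewrites this element once the real duration is known.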
+ duration_pos_ = writer->Position();
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+ static_cast<float>(duration_)))
+ return false;
+ }
+
+ if (date_utc_ != LLONG_MIN)
+ WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_);
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_))
+ return false;
+ if (!WriteEbmlElement(writer, libwebm::kMkvWritingApp, writing_app_))
+ return false;
+
+ const int64_t stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64_t>(size))
+ return false;
+
+ return true;
+}
+
+void SegmentInfo::set_muxing_app(const char* app) {
+ if (app) {
+ const size_t length = strlen(app) + 1;
+ char* temp_str = new (std::nothrow) char[length]; // NOLINT
+ if (!temp_str)
+ return;
+
+#ifdef _MSC_VER
+ strcpy_s(temp_str, length, app);
+#else
+ strcpy(temp_str, app);
+#endif
+
+ delete[] muxing_app_;
+ muxing_app_ = temp_str;
+ }
+}
+
+void SegmentInfo::set_writing_app(const char* app) {
+ if (app) {
+ const size_t length = strlen(app) + 1;
+ char* temp_str = new (std::nothrow) char[length]; // NOLINT
+ if (!temp_str)
+ return;
+
+#ifdef _MSC_VER
+ strcpy_s(temp_str, length, app);
+#else
+ strcpy(temp_str, app);
+#endif
+
+ delete[] writing_app_;
+ writing_app_ = temp_str;
+ }
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Segment Class
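+//
+// A minimal usage sketch (illustrative; the writer, file name and frame
+// values here are assumptions, not taken from this file):
+//
+//   MkvWriter writer;
+//   if (writer.Open("out.webm")) {
+//     Segment segment;
+//     segment.Init(&writer);
+//     const uint64_t video_track = segment.AddVideoTrack(640, 480, 0);
+//     segment.AddFrame(frame_data, frame_length, video_track, timestamp_ns,
+//                      true /* is_key */);
+//     segment.Finalize();
+//     writer.Close();
+//   }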
+
+Segment::Segment()
+ : chunk_count_(0),
+ chunk_name_(NULL),
+ chunk_writer_cluster_(NULL),
+ chunk_writer_cues_(NULL),
+ chunk_writer_header_(NULL),
+ chunking_(false),
+ chunking_base_name_(NULL),
+ cluster_list_(NULL),
+ cluster_list_capacity_(0),
+ cluster_list_size_(0),
+ cues_position_(kAfterClusters),
+ cues_track_(0),
+ force_new_cluster_(false),
+ frames_(NULL),
+ frames_capacity_(0),
+ frames_size_(0),
+ has_video_(false),
+ header_written_(false),
+ last_block_duration_(0),
+ last_timestamp_(0),
+ max_cluster_duration_(kDefaultMaxClusterDuration),
+ max_cluster_size_(0),
+ mode_(kFile),
+ new_cuepoint_(false),
+ output_cues_(true),
+ accurate_cluster_duration_(false),
+ fixed_size_cluster_timecode_(false),
+ estimate_file_duration_(false),
+ payload_pos_(0),
+ size_position_(0),
+ doc_type_version_(kDefaultDocTypeVersion),
+ doc_type_version_written_(0),
+ duration_(0.0),
+ writer_cluster_(NULL),
+ writer_cues_(NULL),
+ writer_header_(NULL) {
+ const time_t curr_time = time(NULL);
+ seed_ = static_cast<unsigned int>(curr_time);
+#ifdef _WIN32
+ srand(seed_);
+#endif
+}
+
+Segment::~Segment() {
+ if (cluster_list_) {
+ for (int32_t i = 0; i < cluster_list_size_; ++i) {
+ Cluster* const cluster = cluster_list_[i];
+ delete cluster;
+ }
+ delete[] cluster_list_;
+ }
+
+ if (frames_) {
+ for (int32_t i = 0; i < frames_size_; ++i) {
+ Frame* const frame = frames_[i];
+ delete frame;
+ }
+ delete[] frames_;
+ }
+
+ delete[] chunk_name_;
+ delete[] chunking_base_name_;
+
+ if (chunk_writer_cluster_) {
+ chunk_writer_cluster_->Close();
+ delete chunk_writer_cluster_;
+ }
+ if (chunk_writer_cues_) {
+ chunk_writer_cues_->Close();
+ delete chunk_writer_cues_;
+ }
+ if (chunk_writer_header_) {
+ chunk_writer_header_->Close();
+ delete chunk_writer_header_;
+ }
+}
+
+void Segment::MoveCuesBeforeClustersHelper(uint64_t diff, int32_t index,
+ uint64_t* cues_size) {
+ CuePoint* const cue_point = cues_.GetCueByIndex(index);
+ if (cue_point == NULL)
+ return;
+ const uint64_t old_cue_point_size = cue_point->Size();
+ const uint64_t cluster_pos = cue_point->cluster_pos() + diff;
+ cue_point->set_cluster_pos(cluster_pos); // update the new cluster position
+ // New size of the cue is computed as follows
+ // Let a = current sum of size of all CuePoints
+ // Let b = Increase in Cue Point's size due to this iteration
+ // Let c = Increase in size of Cues Element's length due to this iteration
+ // (This is computed as CodedSize(a + b) - CodedSize(a))
+ // Let d = b + c. Now d is the |diff| passed to the next recursive call.
+ // Let e = a + b. Now e is the |cues_size| passed to the next recursive
+ // call.
+ const uint64_t cue_point_size_diff = cue_point->Size() - old_cue_point_size;
+ const uint64_t cue_size_diff =
+ GetCodedUIntSize(*cues_size + cue_point_size_diff) -
+ GetCodedUIntSize(*cues_size);
+ *cues_size += cue_point_size_diff;
+ diff = cue_size_diff + cue_point_size_diff;
+ if (diff > 0) {
+ for (int32_t i = 0; i < cues_.cue_entries_size(); ++i) {
+ MoveCuesBeforeClustersHelper(diff, i, cues_size);
+ }
+ }
+}
+
+void Segment::MoveCuesBeforeClusters() {
+ const uint64_t current_cue_size = cues_.Size();
+ uint64_t cue_size = 0;
+ for (int32_t i = 0; i < cues_.cue_entries_size(); ++i)
+ cue_size += cues_.GetCueByIndex(i)->Size();
+ for (int32_t i = 0; i < cues_.cue_entries_size(); ++i)
+ MoveCuesBeforeClustersHelper(current_cue_size, i, &cue_size);
+
+ // Adjust the Seek Entry to reflect the change in position
+ // of Cluster and Cues
+ int32_t cluster_index = 0;
+ int32_t cues_index = 0;
+ for (int32_t i = 0; i < SeekHead::kSeekEntryCount; ++i) {
+ if (seek_head_.GetId(i) == libwebm::kMkvCluster)
+ cluster_index = i;
+ if (seek_head_.GetId(i) == libwebm::kMkvCues)
+ cues_index = i;
+ }
+ seek_head_.SetSeekEntry(cues_index, libwebm::kMkvCues,
+ seek_head_.GetPosition(cluster_index));
+ seek_head_.SetSeekEntry(cluster_index, libwebm::kMkvCluster,
+ cues_.Size() + seek_head_.GetPosition(cues_index));
+}
+
+bool Segment::Init(IMkvWriter* ptr_writer) {
+ if (!ptr_writer) {
+ return false;
+ }
+ writer_cluster_ = ptr_writer;
+ writer_cues_ = ptr_writer;
+ writer_header_ = ptr_writer;
+ memset(&track_frames_written_, 0,
+ sizeof(track_frames_written_[0]) * kMaxTrackNumber);
+ memset(&last_track_timestamp_, 0,
+ sizeof(last_track_timestamp_[0]) * kMaxTrackNumber);
+ return segment_info_.Init();
+}
+
+bool Segment::CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
+ IMkvWriter* writer) {
+ if (!writer->Seekable() || chunking_)
+ return false;
+ const int64_t cluster_offset =
+ cluster_list_[0]->size_position() - GetUIntSize(libwebm::kMkvCluster);
+
+ // Copy the headers.
+ if (!ChunkedCopy(reader, writer, 0, cluster_offset))
+ return false;
+
+ // Recompute cue positions and seek entries.
+ MoveCuesBeforeClusters();
+
+ // Write cues and seek entries.
+ // TODO(vigneshv): As of now, it's safe to call seek_head_.Finalize() for the
+ // second time with a different writer object. But the name Finalize() doesn't
+ // indicate something we want to call more than once. So consider renaming it
+ // to write() or some such.
+ if (!cues_.Write(writer) || !seek_head_.Finalize(writer))
+ return false;
+
+ // Copy the Clusters.
+ if (!ChunkedCopy(reader, writer, cluster_offset,
+ cluster_end_offset_ - cluster_offset))
+ return false;
+
+ // Update the Segment size in case the Cues size has changed.
+ const int64_t pos = writer->Position();
+ const int64_t segment_size = writer->Position() - payload_pos_;
+ if (writer->Position(size_position_) ||
+ WriteUIntSize(writer, segment_size, 8) || writer->Position(pos))
+ return false;
+ return true;
+}
+
+bool Segment::Finalize() {
+ if (WriteFramesAll() < 0)
+ return false;
+
+ // In kLive mode, call Cluster::Finalize only if |accurate_cluster_duration_|
+ // is set. In all other modes, always call Cluster::Finalize.
+ if ((mode_ == kLive ? accurate_cluster_duration_ : true) &&
+ cluster_list_size_ > 0) {
+ // Update last cluster's size
+ Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
+
+ // The last frame of the last Cluster is not written as a BlockGroup with
+ // a Duration unless the frame itself has its duration set explicitly.
+ if (!old_cluster || !old_cluster->Finalize(false, 0))
+ return false;
+ }
+
+ if (mode_ == kFile) {
+ if (chunking_ && chunk_writer_cluster_) {
+ chunk_writer_cluster_->Close();
+ chunk_count_++;
+ }
+
+ double duration =
+ (static_cast<double>(last_timestamp_) + last_block_duration_) /
+ segment_info_.timecode_scale();
+ if (duration_ > 0.0) {
+ duration = duration_;
+ } else {
+ if (last_block_duration_ == 0 && estimate_file_duration_) {
+ const int num_tracks = static_cast<int>(tracks_.track_entries_size());
+ for (int i = 0; i < num_tracks; ++i) {
+ if (track_frames_written_[i] < 2)
+ continue;
+
+ // Estimate the duration for the last block of a Track.
+ const double nano_per_frame =
+ static_cast<double>(last_track_timestamp_[i]) /
+ (track_frames_written_[i] - 1);
+ const double track_duration =
+ (last_track_timestamp_[i] + nano_per_frame) /
+ segment_info_.timecode_scale();
+ if (track_duration > duration)
+ duration = track_duration;
+ }
+ }
+ }
+ segment_info_.set_duration(duration);
+ if (!segment_info_.Finalize(writer_header_))
+ return false;
+
+ if (output_cues_)
+ if (!seek_head_.AddSeekEntry(libwebm::kMkvCues, MaxOffset()))
+ return false;
+
+ if (chunking_) {
+ if (!chunk_writer_cues_)
+ return false;
+
+ char* name = NULL;
+ if (!UpdateChunkName("cues", &name))
+ return false;
+
+ const bool cues_open = chunk_writer_cues_->Open(name);
+ delete[] name;
+ if (!cues_open)
+ return false;
+ }
+
+ cluster_end_offset_ = writer_cluster_->Position();
+
+ // Write the seek headers and cues
+ if (output_cues_)
+ if (!cues_.Write(writer_cues_))
+ return false;
+
+ if (!seek_head_.Finalize(writer_header_))
+ return false;
+
+ if (writer_header_->Seekable()) {
+ if (size_position_ == -1)
+ return false;
+
+ const int64_t segment_size = MaxOffset();
+ if (segment_size < 1)
+ return false;
+
+ const int64_t pos = writer_header_->Position();
+ UpdateDocTypeVersion();
+ if (doc_type_version_ != doc_type_version_written_) {
+ if (writer_header_->Position(0))
+ return false;
+
+ const char* const doc_type =
+ DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska;
+ if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type))
+ return false;
+ if (writer_header_->Position() != ebml_header_size_)
+ return false;
+
+ doc_type_version_written_ = doc_type_version_;
+ }
+
+ if (writer_header_->Position(size_position_))
+ return false;
+
+ if (WriteUIntSize(writer_header_, segment_size, 8))
+ return false;
+
+ if (writer_header_->Position(pos))
+ return false;
+ }
+
+ if (chunking_) {
+ // Do not close any writers until the segment size has been written,
+ // otherwise the size may be off.
+ if (!chunk_writer_cues_ || !chunk_writer_header_)
+ return false;
+
+ chunk_writer_cues_->Close();
+ chunk_writer_header_->Close();
+ }
+ }
+
+ return true;
+}
+
+Track* Segment::AddTrack(int32_t number) {
+ Track* const track = new (std::nothrow) Track(&seed_); // NOLINT
+
+ if (!track)
+ return NULL;
+
+ if (!tracks_.AddTrack(track, number)) {
+ delete track;
+ return NULL;
+ }
+
+ return track;
+}
+
+Chapter* Segment::AddChapter() { return chapters_.AddChapter(&seed_); }
+
+Tag* Segment::AddTag() { return tags_.AddTag(); }
+
+uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) {
+ VideoTrack* const track = new (std::nothrow) VideoTrack(&seed_); // NOLINT
+ if (!track)
+ return 0;
+
+ track->set_type(Tracks::kVideo);
+ track->set_codec_id(Tracks::kVp8CodecId);
+ track->set_width(width);
+ track->set_height(height);
+
+ if (!tracks_.AddTrack(track, number)) {
+ delete track;
+ return 0;
+ }
+ has_video_ = true;
+
+ return track->number();
+}
+
+bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) {
+ if (cluster_list_size_ < 1)
+ return false;
+
+ const Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+ if (!cluster)
+ return false;
+
+ CuePoint* const cue = new (std::nothrow) CuePoint(); // NOLINT
+ if (!cue)
+ return false;
+
+ cue->set_time(timestamp / segment_info_.timecode_scale());
+ cue->set_block_number(cluster->blocks_added());
+ cue->set_cluster_pos(cluster->position_for_cues());
+ cue->set_track(track);
+ if (!cues_.AddCue(cue)) {
+ delete cue;
+ return false;
+ }
+
+ new_cuepoint_ = false;
+ return true;
+}
+
+uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels,
+ int32_t number) {
+ AudioTrack* const track = new (std::nothrow) AudioTrack(&seed_); // NOLINT
+ if (!track)
+ return 0;
+
+ track->set_type(Tracks::kAudio);
+ track->set_codec_id(Tracks::kVorbisCodecId);
+ track->set_sample_rate(sample_rate);
+ track->set_channels(channels);
+
+ if (!tracks_.AddTrack(track, number)) {
+ delete track;
+ return 0;
+ }
+
+ return track->number();
+}
+
+bool Segment::AddFrame(const uint8_t* data, uint64_t length,
+ uint64_t track_number, uint64_t timestamp, bool is_key) {
+ if (!data)
+ return false;
+
+ Frame frame;
+ if (!frame.Init(data, length))
+ return false;
+ frame.set_track_number(track_number);
+ frame.set_timestamp(timestamp);
+ frame.set_is_key(is_key);
+ return AddGenericFrame(&frame);
+}
+
+bool Segment::AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+ const uint8_t* additional,
+ uint64_t additional_length,
+ uint64_t add_id, uint64_t track_number,
+ uint64_t timestamp, bool is_key) {
+ if (!data || !additional)
+ return false;
+
+ Frame frame;
+ if (!frame.Init(data, length) ||
+ !frame.AddAdditionalData(additional, additional_length, add_id)) {
+ return false;
+ }
+ frame.set_track_number(track_number);
+ frame.set_timestamp(timestamp);
+ frame.set_is_key(is_key);
+ return AddGenericFrame(&frame);
+}
+
+bool Segment::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+ int64_t discard_padding,
+ uint64_t track_number,
+ uint64_t timestamp, bool is_key) {
+ if (!data)
+ return false;
+
+ Frame frame;
+ if (!frame.Init(data, length))
+ return false;
+ frame.set_discard_padding(discard_padding);
+ frame.set_track_number(track_number);
+ frame.set_timestamp(timestamp);
+ frame.set_is_key(is_key);
+ return AddGenericFrame(&frame);
+}
+
+bool Segment::AddMetadata(const uint8_t* data, uint64_t length,
+ uint64_t track_number, uint64_t timestamp_ns,
+ uint64_t duration_ns) {
+ if (!data)
+ return false;
+
+ Frame frame;
+ if (!frame.Init(data, length))
+ return false;
+ frame.set_track_number(track_number);
+ frame.set_timestamp(timestamp_ns);
+ frame.set_duration(duration_ns);
+ frame.set_is_key(true); // All metadata blocks are keyframes.
+ return AddGenericFrame(&frame);
+}
+
+bool Segment::AddGenericFrame(const Frame* frame) {
+ if (!frame)
+ return false;
+
+ if (!CheckHeaderInfo())
+ return false;
+
+ // Check for non-monotonically increasing timestamps.
+ if (frame->timestamp() < last_timestamp_)
+ return false;
+
+ // Check if the track number is valid.
+ if (!tracks_.GetTrackByNumber(frame->track_number()))
+ return false;
+
+ if (frame->discard_padding() != 0)
+ doc_type_version_ = 4;
+
+ if (cluster_list_size_ > 0) {
+ const uint64_t timecode_scale = segment_info_.timecode_scale();
+ const uint64_t frame_timecode = frame->timestamp() / timecode_scale;
+
+ const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+ const uint64_t last_cluster_timecode = last_cluster->timecode();
+
+ const uint64_t rel_timecode = frame_timecode - last_cluster_timecode;
+ if (rel_timecode > kMaxBlockTimecode) {
+ force_new_cluster_ = true;
+ }
+ }
+
+ // If the segment has a video track hold onto audio frames to make sure the
+ // audio that is associated with the start time of a video key-frame is
+ // muxed into the same cluster.
+ if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) &&
+ !force_new_cluster_) {
+ Frame* const new_frame = new (std::nothrow) Frame();
+ if (!new_frame || !new_frame->CopyFrom(*frame)) {
+ delete new_frame;
+ return false;
+ }
+ if (!QueueFrame(new_frame)) {
+ delete new_frame;
+ return false;
+ }
+ track_frames_written_[frame->track_number() - 1]++;
+ return true;
+ }
+
+ if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(),
+ frame->is_key())) {
+ return false;
+ }
+
+ if (cluster_list_size_ < 1)
+ return false;
+
+ Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+ if (!cluster)
+ return false;
+
+ // If the Frame is not a SimpleBlock, then set the reference_block_timestamp
+ // if it is not set already.
+ bool frame_created = false;
+ if (!frame->CanBeSimpleBlock() && !frame->is_key() &&
+ !frame->reference_block_timestamp_set()) {
+ Frame* const new_frame = new (std::nothrow) Frame();
+ if (!new_frame || !new_frame->CopyFrom(*frame)) {
+ delete new_frame;
+ return false;
+ }
+ new_frame->set_reference_block_timestamp(
+ last_track_timestamp_[frame->track_number() - 1]);
+ frame = new_frame;
+ frame_created = true;
+ }
+
+ if (!cluster->AddFrame(frame))
+ return false;
+
+ if (new_cuepoint_ && cues_track_ == frame->track_number()) {
+ if (!AddCuePoint(frame->timestamp(), cues_track_))
+ return false;
+ }
+
+ last_timestamp_ = frame->timestamp();
+ last_track_timestamp_[frame->track_number() - 1] = frame->timestamp();
+ last_block_duration_ = frame->duration();
+ track_frames_written_[frame->track_number() - 1]++;
+
+ if (frame_created)
+ delete frame;
+ return true;
+}
+
+void Segment::OutputCues(bool output_cues) { output_cues_ = output_cues; }
+
+void Segment::AccurateClusterDuration(bool accurate_cluster_duration) {
+ accurate_cluster_duration_ = accurate_cluster_duration;
+}
+
+void Segment::UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode) {
+ fixed_size_cluster_timecode_ = fixed_size_cluster_timecode;
+}
+
+bool Segment::SetChunking(bool chunking, const char* filename) {
+ if (chunk_count_ > 0)
+ return false;
+
+ if (chunking) {
+ if (!filename)
+ return false;
+
+ // Check if we are being set to what is already set.
+ if (chunking_ && !strcmp(filename, chunking_base_name_))
+ return true;
+
+ const size_t name_length = strlen(filename) + 1;
+ char* const temp = new (std::nothrow) char[name_length]; // NOLINT
+ if (!temp)
+ return false;
+
+#ifdef _MSC_VER
+ strcpy_s(temp, name_length, filename);
+#else
+ strcpy(temp, filename);
+#endif
+
+ delete[] chunking_base_name_;
+ chunking_base_name_ = temp;
+
+ if (!UpdateChunkName("chk", &chunk_name_))
+ return false;
+
+ if (!chunk_writer_cluster_) {
+ chunk_writer_cluster_ = new (std::nothrow) MkvWriter(); // NOLINT
+ if (!chunk_writer_cluster_)
+ return false;
+ }
+
+ if (!chunk_writer_cues_) {
+ chunk_writer_cues_ = new (std::nothrow) MkvWriter(); // NOLINT
+ if (!chunk_writer_cues_)
+ return false;
+ }
+
+ if (!chunk_writer_header_) {
+ chunk_writer_header_ = new (std::nothrow) MkvWriter(); // NOLINT
+ if (!chunk_writer_header_)
+ return false;
+ }
+
+ if (!chunk_writer_cluster_->Open(chunk_name_))
+ return false;
+
+ const size_t header_length = strlen(filename) + strlen(".hdr") + 1;
+ char* const header = new (std::nothrow) char[header_length]; // NOLINT
+ if (!header)
+ return false;
+
+#ifdef _MSC_VER
+ strcpy_s(header, header_length - strlen(".hdr"), chunking_base_name_);
+ strcat_s(header, header_length, ".hdr");
+#else
+ strcpy(header, chunking_base_name_);
+ strcat(header, ".hdr");
+#endif
+ if (!chunk_writer_header_->Open(header)) {
+ delete[] header;
+ return false;
+ }
+
+ writer_cluster_ = chunk_writer_cluster_;
+ writer_cues_ = chunk_writer_cues_;
+ writer_header_ = chunk_writer_header_;
+
+ delete[] header;
+ }
+
+ chunking_ = chunking;
+
+ return true;
+}
+
+bool Segment::CuesTrack(uint64_t track_number) {
+ const Track* const track = GetTrackByNumber(track_number);
+ if (!track)
+ return false;
+
+ cues_track_ = track_number;
+ return true;
+}
+
+void Segment::ForceNewClusterOnNextFrame() { force_new_cluster_ = true; }
+
+Track* Segment::GetTrackByNumber(uint64_t track_number) const {
+ return tracks_.GetTrackByNumber(track_number);
+}
+
+bool Segment::WriteSegmentHeader() {
+ UpdateDocTypeVersion();
+
+ const char* const doc_type =
+ DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska;
+ if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type))
+ return false;
+ doc_type_version_written_ = doc_type_version_;
+ ebml_header_size_ = static_cast<int32_t>(writer_header_->Position());
+
+ // Write "unknown" (-1) as segment size value. If mode is kFile, Segment
+ // will write over duration when the file is finalized.
+ if (WriteID(writer_header_, libwebm::kMkvSegment))
+ return false;
+
+ // Save for later.
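+ // Finalize() seeks back to |size_position_| (when the writer is seekable)
+ // and replaces the unknown value written below with the real segment size.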
+ size_position_ = writer_header_->Position();
+
+ // Write "unknown" (EBML coded -1) as segment size value. We need to write 8
+ // bytes because if we are going to overwrite the segment size later we do
+ // not know how big our segment will be.
+ if (SerializeInt(writer_header_, kEbmlUnknownValue, 8))
+ return false;
+
+ payload_pos_ = writer_header_->Position();
+
+ if (mode_ == kFile && writer_header_->Seekable()) {
+ // Set the duration > 0.0 so SegmentInfo will write out the duration. When
+ // the muxer is done writing we will set the correct duration and have
+ // SegmentInfo update it.
+ segment_info_.set_duration(1.0);
+
+ if (!seek_head_.Write(writer_header_))
+ return false;
+ }
+
+ if (!seek_head_.AddSeekEntry(libwebm::kMkvInfo, MaxOffset()))
+ return false;
+ if (!segment_info_.Write(writer_header_))
+ return false;
+
+ if (!seek_head_.AddSeekEntry(libwebm::kMkvTracks, MaxOffset()))
+ return false;
+ if (!tracks_.Write(writer_header_))
+ return false;
+
+ if (chapters_.Count() > 0) {
+ if (!seek_head_.AddSeekEntry(libwebm::kMkvChapters, MaxOffset()))
+ return false;
+ if (!chapters_.Write(writer_header_))
+ return false;
+ }
+
+ if (tags_.Count() > 0) {
+ if (!seek_head_.AddSeekEntry(libwebm::kMkvTags, MaxOffset()))
+ return false;
+ if (!tags_.Write(writer_header_))
+ return false;
+ }
+
+ if (chunking_ && (mode_ == kLive || !writer_header_->Seekable())) {
+ if (!chunk_writer_header_)
+ return false;
+
+ chunk_writer_header_->Close();
+ }
+
+ header_written_ = true;
+
+ return true;
+}
+
+// Here we are testing whether to create a new cluster, given a frame
+// having time frame_timestamp_ns.
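+// Returns -1 on error, 0 when the frame should be written into the current
+// cluster, 1 when a new cluster should be created and the frame written
+// there, and 2 when the frame is too far ahead of the current cluster (the
+// caller creates a new cluster and re-runs this test).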
+//
+int Segment::TestFrame(uint64_t track_number, uint64_t frame_timestamp_ns,
+ bool is_key) const {
+ if (force_new_cluster_)
+ return 1;
+
+ // If no clusters have been created yet, then create a new cluster
+ // and write this frame immediately, in the new cluster. This path
+ // should only be followed once, the first time we attempt to write
+ // a frame.
+
+ if (cluster_list_size_ <= 0)
+ return 1;
+
+ // There exists at least one cluster. We must compare the frame to
+ // the last cluster, in order to determine whether the frame is
+ // written to the existing cluster, or that a new cluster should be
+ // created.
+
+ const uint64_t timecode_scale = segment_info_.timecode_scale();
+ const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale;
+
+ const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+ const uint64_t last_cluster_timecode = last_cluster->timecode();
+
+ // For completeness we test for the case when the frame's timecode
+ // is less than the cluster's timecode. Although in principle that
+ // is allowed, this muxer doesn't actually write clusters like that,
+ // so this indicates a bug somewhere in our algorithm.
+
+ if (frame_timecode < last_cluster_timecode) // should never happen
+ return -1;
+
+ // If the frame has a timestamp significantly larger than the last
+ // cluster (in Matroska, cluster-relative timestamps are serialized
+ // using a 16-bit signed integer), then we cannot write this frame
+ // to that cluster, and so we must create a new cluster.
+
+ const int64_t delta_timecode = frame_timecode - last_cluster_timecode;
+
+ if (delta_timecode > kMaxBlockTimecode)
+ return 2;
+
+ // We decide to create a new cluster when we have a video keyframe.
+ // This will flush queued (audio) frames, and write the keyframe
+ // immediately, in the newly-created cluster.
+
+ if (is_key && tracks_.TrackIsVideo(track_number))
+ return 1;
+
+ // Create a new cluster if we have accumulated too many frames
+ // already, where "too many" is defined as "the total time of frames
+ // in the cluster exceeds a threshold".
+
+ const uint64_t delta_ns = delta_timecode * timecode_scale;
+
+ if (max_cluster_duration_ > 0 && delta_ns >= max_cluster_duration_)
+ return 1;
+
+ // This is similar to the case above, with the difference that a new
+ // cluster is created when the size of the current cluster exceeds a
+ // threshold.
+
+ const uint64_t cluster_size = last_cluster->payload_size();
+
+ if (max_cluster_size_ > 0 && cluster_size >= max_cluster_size_)
+ return 1;
+
+ // There's no need to create a new cluster, so emit this frame now.
+
+ return 0;
+}
+
+bool Segment::MakeNewCluster(uint64_t frame_timestamp_ns) {
+ const int32_t new_size = cluster_list_size_ + 1;
+
+ if (new_size > cluster_list_capacity_) {
+ // Add more clusters.
+ const int32_t new_capacity =
+ (cluster_list_capacity_ <= 0) ? 1 : cluster_list_capacity_ * 2;
+ Cluster** const clusters =
+ new (std::nothrow) Cluster*[new_capacity]; // NOLINT
+ if (!clusters)
+ return false;
+
+ for (int32_t i = 0; i < cluster_list_size_; ++i) {
+ clusters[i] = cluster_list_[i];
+ }
+
+ delete[] cluster_list_;
+
+ cluster_list_ = clusters;
+ cluster_list_capacity_ = new_capacity;
+ }
+
+ if (!WriteFramesLessThan(frame_timestamp_ns))
+ return false;
+
+ if (cluster_list_size_ > 0) {
+ // Update old cluster's size
+ Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1];
+
+ if (!old_cluster || !old_cluster->Finalize(true, frame_timestamp_ns))
+ return false;
+ }
+
+ if (output_cues_)
+ new_cuepoint_ = true;
+
+ if (chunking_ && cluster_list_size_ > 0) {
+ chunk_writer_cluster_->Close();
+ chunk_count_++;
+
+ if (!UpdateChunkName("chk", &chunk_name_))
+ return false;
+ if (!chunk_writer_cluster_->Open(chunk_name_))
+ return false;
+ }
+
+ const uint64_t timecode_scale = segment_info_.timecode_scale();
+ const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale;
+
+ uint64_t cluster_timecode = frame_timecode;
+
+ if (frames_size_ > 0) {
+ const Frame* const f = frames_[0]; // earliest queued frame
+ const uint64_t ns = f->timestamp();
+ const uint64_t tc = ns / timecode_scale;
+
+ if (tc < cluster_timecode)
+ cluster_timecode = tc;
+ }
+
+ Cluster*& cluster = cluster_list_[cluster_list_size_];
+ const int64_t offset = MaxOffset();
+ cluster = new (std::nothrow)
+ Cluster(cluster_timecode, offset, segment_info_.timecode_scale(),
+ accurate_cluster_duration_, fixed_size_cluster_timecode_);
+ if (!cluster)
+ return false;
+
+ if (!cluster->Init(writer_cluster_))
+ return false;
+
+ cluster_list_size_ = new_size;
+ return true;
+}
+
+bool Segment::DoNewClusterProcessing(uint64_t track_number,
+ uint64_t frame_timestamp_ns, bool is_key) {
+ for (;;) {
+ // Based on the characteristics of the current frame and current
+ // cluster, decide whether to create a new cluster.
+ const int result = TestFrame(track_number, frame_timestamp_ns, is_key);
+ if (result < 0) // error
+ return false;
+
+ // Always set force_new_cluster_ to false after TestFrame.
+ force_new_cluster_ = false;
+
+ // A non-zero result means create a new cluster.
+ if (result > 0 && !MakeNewCluster(frame_timestamp_ns))
+ return false;
+
+ // Write queued (audio) frames.
+ const int frame_count = WriteFramesAll();
+ if (frame_count < 0) // error
+ return false;
+
+ // Write the current frame to the current cluster (if TestFrame
+ // returns 0) or to a newly created cluster (TestFrame returns 1).
+ if (result <= 1)
+ return true;
+
+ // TestFrame returned 2, which means there was a large time
+ // difference between the cluster and the frame itself. Do the
+ // test again, comparing the frame to the new cluster.
+ }
+}
+
+bool Segment::CheckHeaderInfo() {
+ if (!header_written_) {
+ if (!WriteSegmentHeader())
+ return false;
+
+ if (!seek_head_.AddSeekEntry(libwebm::kMkvCluster, MaxOffset()))
+ return false;
+
+ if (output_cues_ && cues_track_ == 0) {
+ // Check for a video track
+ for (uint32_t i = 0; i < tracks_.track_entries_size(); ++i) {
+ const Track* const track = tracks_.GetTrackByIndex(i);
+ if (!track)
+ return false;
+
+ if (tracks_.TrackIsVideo(track->number())) {
+ cues_track_ = track->number();
+ break;
+ }
+ }
+
+ // Otherwise, set the cues track to the first track found.
+ if (cues_track_ == 0) {
+ const Track* const track = tracks_.GetTrackByIndex(0);
+ if (!track)
+ return false;
+
+ cues_track_ = track->number();
+ }
+ }
+ }
+ return true;
+}
+
+void Segment::UpdateDocTypeVersion() {
+ for (uint32_t index = 0; index < tracks_.track_entries_size(); ++index) {
+ const Track* track = tracks_.GetTrackByIndex(index);
+ if (track == NULL)
+ break;
+ if ((track->codec_delay() || track->seek_pre_roll()) &&
+ doc_type_version_ < 4) {
+ doc_type_version_ = 4;
+ break;
+ }
+ }
+}
+
+bool Segment::UpdateChunkName(const char* ext, char** name) const {
+ if (!name || !ext)
+ return false;
+
+ char ext_chk[64];
+#ifdef _MSC_VER
+ sprintf_s(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext);
+#else
+ snprintf(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext);
+#endif
+
+ const size_t length = strlen(chunking_base_name_) + strlen(ext_chk) + 1;
+ char* const str = new (std::nothrow) char[length]; // NOLINT
+ if (!str)
+ return false;
+
+#ifdef _MSC_VER
+ strcpy_s(str, length - strlen(ext_chk), chunking_base_name_);
+ strcat_s(str, length, ext_chk);
+#else
+ strcpy(str, chunking_base_name_);
+ strcat(str, ext_chk);
+#endif
+
+ delete[] * name;
+ *name = str;
+
+ return true;
+}
+
+int64_t Segment::MaxOffset() {
+ if (!writer_header_)
+ return -1;
+
+ int64_t offset = writer_header_->Position() - payload_pos_;
+
+ if (chunking_) {
+ for (int32_t i = 0; i < cluster_list_size_; ++i) {
+ Cluster* const cluster = cluster_list_[i];
+ offset += cluster->Size();
+ }
+
+ if (writer_cues_)
+ offset += writer_cues_->Position();
+ }
+
+ return offset;
+}
+
+bool Segment::QueueFrame(Frame* frame) {
+ const int32_t new_size = frames_size_ + 1;
+
+ if (new_size > frames_capacity_) {
+ // Add more frames.
+ const int32_t new_capacity = (!frames_capacity_) ? 2 : frames_capacity_ * 2;
+
+ if (new_capacity < 1)
+ return false;
+
+ Frame** const frames = new (std::nothrow) Frame*[new_capacity]; // NOLINT
+ if (!frames)
+ return false;
+
+ for (int32_t i = 0; i < frames_size_; ++i) {
+ frames[i] = frames_[i];
+ }
+
+ delete[] frames_;
+ frames_ = frames;
+ frames_capacity_ = new_capacity;
+ }
+
+ frames_[frames_size_++] = frame;
+
+ return true;
+}
+
+int Segment::WriteFramesAll() {
+ if (frames_ == NULL)
+ return 0;
+
+ if (cluster_list_size_ < 1)
+ return -1;
+
+ Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+
+ if (!cluster)
+ return -1;
+
+ for (int32_t i = 0; i < frames_size_; ++i) {
+ Frame*& frame = frames_[i];
+ // TODO(jzern/vigneshv): using Segment::AddGenericFrame here would limit the
+ // places where |doc_type_version_| needs to be updated.
+ if (frame->discard_padding() != 0)
+ doc_type_version_ = 4;
+ if (!cluster->AddFrame(frame))
+ return -1;
+
+ if (new_cuepoint_ && cues_track_ == frame->track_number()) {
+ if (!AddCuePoint(frame->timestamp(), cues_track_))
+ return -1;
+ }
+
+ if (frame->timestamp() > last_timestamp_) {
+ last_timestamp_ = frame->timestamp();
+ last_track_timestamp_[frame->track_number() - 1] = frame->timestamp();
+ }
+
+ delete frame;
+ frame = NULL;
+ }
+
+ const int result = frames_size_;
+ frames_size_ = 0;
+
+ return result;
+}
+
+bool Segment::WriteFramesLessThan(uint64_t timestamp) {
+ // Check |cluster_list_size_| to see if this is the first cluster. If it is
+ // the first cluster, audio frames with timestamps earlier than the first
+ // video timestamp will be written in a later step.
+ if (frames_size_ > 0 && cluster_list_size_ > 0) {
+ if (!frames_)
+ return false;
+
+ Cluster* const cluster = cluster_list_[cluster_list_size_ - 1];
+ if (!cluster)
+ return false;
+
+ int32_t shift_left = 0;
+
+ // TODO(fgalligan): Change this to use the durations of frames instead of
+ // the next frame's start time if the duration is accurate.
+ for (int32_t i = 1; i < frames_size_; ++i) {
+ const Frame* const frame_curr = frames_[i];
+
+ if (frame_curr->timestamp() > timestamp)
+ break;
+
+ const Frame* const frame_prev = frames_[i - 1];
+ if (frame_prev->discard_padding() != 0)
+ doc_type_version_ = 4;
+ if (!cluster->AddFrame(frame_prev))
+ return false;
+
+ if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) {
+ if (!AddCuePoint(frame_prev->timestamp(), cues_track_))
+ return false;
+ }
+
+ ++shift_left;
+ if (frame_prev->timestamp() > last_timestamp_) {
+ last_timestamp_ = frame_prev->timestamp();
+ last_track_timestamp_[frame_prev->track_number() - 1] =
+ frame_prev->timestamp();
+ }
+
+ delete frame_prev;
+ }
+
+ if (shift_left > 0) {
+ if (shift_left >= frames_size_)
+ return false;
+
+ const int32_t new_frames_size = frames_size_ - shift_left;
+ for (int32_t i = 0; i < new_frames_size; ++i) {
+ frames_[i] = frames_[i + shift_left];
+ }
+
+ frames_size_ = new_frames_size;
+ }
+ }
+
+ return true;
+}
+
+bool Segment::DocTypeIsWebm() const {
+ const int kNumCodecIds = 10;
+
+ // TODO(vigneshv): Tweak .clang-format.
+ const char* kWebmCodecIds[kNumCodecIds] = {
+ Tracks::kOpusCodecId, Tracks::kVorbisCodecId,
+ Tracks::kVp8CodecId, Tracks::kVp9CodecId,
+ Tracks::kVp10CodecId, Tracks::kAV1CodecId,
+ Tracks::kWebVttCaptionsId, Tracks::kWebVttDescriptionsId,
+ Tracks::kWebVttMetadataId, Tracks::kWebVttSubtitlesId};
+
+ const int num_tracks = static_cast<int>(tracks_.track_entries_size());
+ for (int track_index = 0; track_index < num_tracks; ++track_index) {
+ const Track* const track = tracks_.GetTrackByIndex(track_index);
+ const std::string codec_id = track->codec_id();
+
+ bool id_is_webm = false;
+ for (int id_index = 0; id_index < kNumCodecIds; ++id_index) {
+ if (codec_id == kWebmCodecIds[id_index]) {
+ id_is_webm = true;
+ break;
+ }
+ }
+
+ if (!id_is_webm)
+ return false;
+ }
+
+ return true;
+}
+
+} // namespace mkvmuxer
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h
new file mode 100644
index 000000000..9e817bced
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -0,0 +1,1922 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVMUXER_H_
+#define MKVMUXER_MKVMUXER_H_
+
+#include <stdint.h>
+
+#include <cstddef>
+#include <list>
+#include <map>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxertypes.h"
+
+// For a description of the WebM elements see
+// http://www.webmproject.org/code/specs/container/.
+
+namespace mkvparser {
+class IMkvReader;
+} // namespace mkvparser
+
+namespace mkvmuxer {
+
+class MkvWriter;
+class Segment;
+
+const uint64_t kMaxTrackNumber = 126;
+
+///////////////////////////////////////////////////////////////
+// Interface used by the mkvmuxer to write out the Mkv data.
+class IMkvWriter {
+ public:
+ // Writes out |len| bytes of |buf|. Returns 0 on success.
+ virtual int32 Write(const void* buf, uint32 len) = 0;
+
+ // Returns the offset of the output position from the beginning of the
+ // output.
+ virtual int64 Position() const = 0;
+
+ // Sets the current file position. Returns 0 on success.
+ virtual int32 Position(int64 position) = 0;
+
+ // Returns true if the writer is seekable.
+ virtual bool Seekable() const = 0;
+
+ // Element start notification. Called whenever an element identifier is about
+ // to be written to the stream. |element_id| is the element identifier, and
+ // |position| is the location in the WebM stream where the first octet of the
+ // element identifier will be written.
+ // Note: the |MkvId| enumeration in common/webmids.h defines element values.
+ virtual void ElementStartNotify(uint64 element_id, int64 position) = 0;
+
+ protected:
+ IMkvWriter();
+ virtual ~IMkvWriter();
+
+ private:
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter);
+};
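+
+// Example (illustrative sketch, not part of libwebm): a minimal append-only
+// IMkvWriter backed by a std::vector<uint8_t> (requires <vector>). It is not
+// seekable, so it only suits output paths that never rewrite earlier bytes.
+//
+//   class BufferWriter : public IMkvWriter {
+//    public:
+//     virtual int32 Write(const void* buf, uint32 len) {
+//       const uint8_t* const p = static_cast<const uint8_t*>(buf);
+//       buf_.insert(buf_.end(), p, p + len);
+//       return 0;  // 0 indicates success
+//     }
+//     virtual int64 Position() const { return static_cast<int64>(buf_.size()); }
+//     virtual int32 Position(int64) { return -1; }  // seeking not supported
+//     virtual bool Seekable() const { return false; }
+//     virtual void ElementStartNotify(uint64, int64) {}
+//
+//    private:
+//     std::vector<uint8_t> buf_;
+//   };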
+
+// Writes out the EBML header for a WebM file, but allows caller to specify
+// DocType. This function must be called before any other libwebm writing
+// functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version,
+ const char* const doc_type);
+
+// Writes out the EBML header for a WebM file. This function must be called
+// before any other libwebm writing functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version);
+
+// Deprecated. Writes out EBML header with doc_type_version as
+// kDefaultDocTypeVersion. Exists for backward compatibility.
+bool WriteEbmlHeader(IMkvWriter* writer);
+
+// Copies |size| bytes of data, starting at byte position |start|, from
+// |source| to |dst| in chunks.
+bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst, int64_t start,
+ int64_t size);
+
+///////////////////////////////////////////////////////////////
+// Class to hold data that will be written to a block.
+class Frame {
+ public:
+ Frame();
+ ~Frame();
+
+ // Sets this frame's contents based on |frame|. Returns true on success. On
+ // failure, this frame's existing contents may be lost.
+ bool CopyFrom(const Frame& frame);
+
+ // Copies |frame| data into |frame_|. Returns true on success.
+ bool Init(const uint8_t* frame, uint64_t length);
+
+ // Copies |additional| data into |additional_|. Returns true on success.
+ bool AddAdditionalData(const uint8_t* additional, uint64_t length,
+ uint64_t add_id);
+
+ // Returns true if the frame has valid parameters.
+ bool IsValid() const;
+
+ // Returns true if the frame can be written as a SimpleBlock based on current
+ // parameters.
+ bool CanBeSimpleBlock() const;
+
+ uint64_t add_id() const { return add_id_; }
+ const uint8_t* additional() const { return additional_; }
+ uint64_t additional_length() const { return additional_length_; }
+ void set_duration(uint64_t duration);
+ uint64_t duration() const { return duration_; }
+ bool duration_set() const { return duration_set_; }
+ const uint8_t* frame() const { return frame_; }
+ void set_is_key(bool key) { is_key_ = key; }
+ bool is_key() const { return is_key_; }
+ uint64_t length() const { return length_; }
+ void set_track_number(uint64_t track_number) { track_number_ = track_number; }
+ uint64_t track_number() const { return track_number_; }
+ void set_timestamp(uint64_t timestamp) { timestamp_ = timestamp; }
+ uint64_t timestamp() const { return timestamp_; }
+ void set_discard_padding(int64_t discard_padding) {
+ discard_padding_ = discard_padding;
+ }
+ int64_t discard_padding() const { return discard_padding_; }
+ void set_reference_block_timestamp(int64_t reference_block_timestamp);
+ int64_t reference_block_timestamp() const {
+ return reference_block_timestamp_;
+ }
+ bool reference_block_timestamp_set() const {
+ return reference_block_timestamp_set_;
+ }
+
+ private:
+ // Id of the Additional data.
+ uint64_t add_id_;
+
+ // Pointer to additional data. Owned by this class.
+ uint8_t* additional_;
+
+ // Length of the additional data.
+ uint64_t additional_length_;
+
+ // Duration of the frame in nanoseconds.
+ uint64_t duration_;
+
+ // Flag indicating that |duration_| has been set. Setting duration causes the
+ // frame to be written out as a Block with BlockDuration instead of as a
+ // SimpleBlock.
+ bool duration_set_;
+
+ // Pointer to the data. Owned by this class.
+ uint8_t* frame_;
+
+ // Flag telling if the data should set the key flag of a block.
+ bool is_key_;
+
+ // Length of the data.
+ uint64_t length_;
+
+ // Mkv track number the data is associated with.
+ uint64_t track_number_;
+
+ // Timestamp of the data in nanoseconds.
+ uint64_t timestamp_;
+
+ // Discard padding for the frame.
+ int64_t discard_padding_;
+
+ // Reference block timestamp.
+ int64_t reference_block_timestamp_;
+
+ // Flag indicating if |reference_block_timestamp_| has been set.
+ bool reference_block_timestamp_set_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Frame);
+};
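+
+// Example (illustrative sketch): filling in a Frame before handing it to the
+// muxer. |buf| and |buf_size| stand for hypothetical encoded-frame data.
+//
+//   Frame frame;
+//   if (!frame.Init(buf, buf_size))      // copies |buf_size| bytes of |buf|
+//     return false;
+//   frame.set_track_number(1);
+//   frame.set_timestamp(33000000);       // nanoseconds
+//   frame.set_is_key(true);
+//   // The frame is now valid (IsValid()), and since no duration, additional
+//   // data or discard padding was set, CanBeSimpleBlock() returns true.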
+
+///////////////////////////////////////////////////////////////
+// Class to hold one cue point in a Cues element.
+class CuePoint {
+ public:
+ CuePoint();
+ ~CuePoint();
+
+ // Returns the size in bytes for the entire CuePoint element.
+ uint64_t Size() const;
+
+ // Output the CuePoint element to the writer. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ void set_time(uint64_t time) { time_ = time; }
+ uint64_t time() const { return time_; }
+ void set_track(uint64_t track) { track_ = track; }
+ uint64_t track() const { return track_; }
+ void set_cluster_pos(uint64_t cluster_pos) { cluster_pos_ = cluster_pos; }
+ uint64_t cluster_pos() const { return cluster_pos_; }
+ void set_block_number(uint64_t block_number) { block_number_ = block_number; }
+ uint64_t block_number() const { return block_number_; }
+ void set_output_block_number(bool output_block_number) {
+ output_block_number_ = output_block_number;
+ }
+ bool output_block_number() const { return output_block_number_; }
+
+ private:
+ // Returns the size in bytes for the payload of the CuePoint element.
+ uint64_t PayloadSize() const;
+
+ // Absolute timecode according to the segment time base.
+ uint64_t time_;
+
+ // The Track element associated with the CuePoint.
+ uint64_t track_;
+
+ // The position of the Cluster containing the Block.
+ uint64_t cluster_pos_;
+
+ // Number of the Block within the Cluster, starting from 1.
+ uint64_t block_number_;
+
+ // If true the muxer will write out the block number for the cue if the
+ // block number is different than the default of 1. Default is set to true.
+ bool output_block_number_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(CuePoint);
+};
+
+///////////////////////////////////////////////////////////////
+// Cues element.
+class Cues {
+ public:
+ Cues();
+ ~Cues();
+
+ // Adds a cue point to the Cues element. Returns true on success.
+ bool AddCue(CuePoint* cue);
+
+ // Returns the cue point by index. Returns NULL if there is no cue point
+ // match.
+ CuePoint* GetCueByIndex(int32_t index) const;
+
+ // Returns the total size of the Cues element
+ uint64_t Size();
+
+ // Output the Cues element to the writer. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ int32_t cue_entries_size() const { return cue_entries_size_; }
+ void set_output_block_number(bool output_block_number) {
+ output_block_number_ = output_block_number;
+ }
+ bool output_block_number() const { return output_block_number_; }
+
+ private:
+ // Number of allocated elements in |cue_entries_|.
+ int32_t cue_entries_capacity_;
+
+ // Number of CuePoints in |cue_entries_|.
+ int32_t cue_entries_size_;
+
+ // CuePoint list.
+ CuePoint** cue_entries_;
+
+ // If true the muxer will write out the block number for the cue if the
+ // block number is different than the default of 1. Default is set to true.
+ bool output_block_number_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cues);
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncAESSettings element
+class ContentEncAESSettings {
+ public:
+ enum { kCTR = 1 };
+
+ ContentEncAESSettings();
+ ~ContentEncAESSettings() {}
+
+ // Returns the size in bytes for the ContentEncAESSettings element.
+ uint64_t Size() const;
+
+ // Writes out the ContentEncAESSettings element to |writer|. Returns true on
+ // success.
+ bool Write(IMkvWriter* writer) const;
+
+ uint64_t cipher_mode() const { return cipher_mode_; }
+
+ private:
+ // Returns the size in bytes for the payload of the ContentEncAESSettings
+ // element.
+ uint64_t PayloadSize() const;
+
+ // Sub elements
+ uint64_t cipher_mode_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncAESSettings);
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+// Elements used to describe if the track data has been encrypted or
+// compressed with zlib or header stripping.
+// Currently only whole frames can be encrypted with AES. This dictates that
+// ContentEncodingOrder will be 0, ContentEncodingScope will be 1,
+// ContentEncodingType will be 1, and ContentEncAlgo will be 5.
+class ContentEncoding {
+ public:
+ ContentEncoding();
+ ~ContentEncoding();
+
+ // Sets the content encryption id. Copies |length| bytes from |id| to
+ // |enc_key_id_|. Returns true on success.
+ bool SetEncryptionID(const uint8_t* id, uint64_t length);
+
+ // Returns the size in bytes for the ContentEncoding element.
+ uint64_t Size() const;
+
+ // Writes out the ContentEncoding element to |writer|. Returns true on
+ // success.
+ bool Write(IMkvWriter* writer) const;
+
+ uint64_t enc_algo() const { return enc_algo_; }
+ uint64_t encoding_order() const { return encoding_order_; }
+ uint64_t encoding_scope() const { return encoding_scope_; }
+ uint64_t encoding_type() const { return encoding_type_; }
+ ContentEncAESSettings* enc_aes_settings() { return &enc_aes_settings_; }
+
+ private:
+ // Returns the size in bytes for the encoding elements.
+ uint64_t EncodingSize(uint64_t compresion_size,
+ uint64_t encryption_size) const;
+
+ // Returns the size in bytes for the encryption elements.
+ uint64_t EncryptionSize() const;
+
+ // Track element names
+ uint64_t enc_algo_;
+ uint8_t* enc_key_id_;
+ uint64_t encoding_order_;
+ uint64_t encoding_scope_;
+ uint64_t encoding_type_;
+
+ // ContentEncAESSettings element.
+ ContentEncAESSettings enc_aes_settings_;
+
+ // Size of the ContentEncKeyID data in bytes.
+ uint64_t enc_key_id_length_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
+};
+
+///////////////////////////////////////////////////////////////
+// Colour element.
+class PrimaryChromaticity {
+ public:
+ static const float kChromaticityMin;
+ static const float kChromaticityMax;
+
+ PrimaryChromaticity(float x_val, float y_val) : x_(x_val), y_(y_val) {}
+ PrimaryChromaticity() : x_(0), y_(0) {}
+ ~PrimaryChromaticity() {}
+
+ // Returns sum of |x_id| and |y_id| element id sizes and payload sizes.
+ uint64_t PrimaryChromaticitySize(libwebm::MkvId x_id,
+ libwebm::MkvId y_id) const;
+ bool Valid() const;
+ bool Write(IMkvWriter* writer, libwebm::MkvId x_id,
+ libwebm::MkvId y_id) const;
+
+ float x() const { return x_; }
+ void set_x(float new_x) { x_ = new_x; }
+ float y() const { return y_; }
+ void set_y(float new_y) { y_ = new_y; }
+
+ private:
+ float x_;
+ float y_;
+};
+
+class MasteringMetadata {
+ public:
+ static const float kValueNotPresent;
+ static const float kMinLuminance;
+ static const float kMinLuminanceMax;
+ static const float kMaxLuminanceMax;
+
+ MasteringMetadata()
+ : luminance_max_(kValueNotPresent),
+ luminance_min_(kValueNotPresent),
+ r_(NULL),
+ g_(NULL),
+ b_(NULL),
+ white_point_(NULL) {}
+ ~MasteringMetadata() {
+ delete r_;
+ delete g_;
+ delete b_;
+ delete white_point_;
+ }
+
+ // Returns total size of the MasteringMetadata element.
+ uint64_t MasteringMetadataSize() const;
+ bool Valid() const;
+ bool Write(IMkvWriter* writer) const;
+
+ // Copies non-null chromaticity.
+ bool SetChromaticity(const PrimaryChromaticity* r,
+ const PrimaryChromaticity* g,
+ const PrimaryChromaticity* b,
+ const PrimaryChromaticity* white_point);
+ const PrimaryChromaticity* r() const { return r_; }
+ const PrimaryChromaticity* g() const { return g_; }
+ const PrimaryChromaticity* b() const { return b_; }
+ const PrimaryChromaticity* white_point() const { return white_point_; }
+
+ float luminance_max() const { return luminance_max_; }
+ void set_luminance_max(float luminance_max) {
+ luminance_max_ = luminance_max;
+ }
+ float luminance_min() const { return luminance_min_; }
+ void set_luminance_min(float luminance_min) {
+ luminance_min_ = luminance_min;
+ }
+
+ private:
+ // Returns size of MasteringMetadata child elements.
+ uint64_t PayloadSize() const;
+
+ float luminance_max_;
+ float luminance_min_;
+ PrimaryChromaticity* r_;
+ PrimaryChromaticity* g_;
+ PrimaryChromaticity* b_;
+ PrimaryChromaticity* white_point_;
+};
+
+class Colour {
+ public:
+ enum MatrixCoefficients {
+ kGbr = 0,
+ kBt709 = 1,
+ kUnspecifiedMc = 2,
+ kReserved = 3,
+ kFcc = 4,
+ kBt470bg = 5,
+ kSmpte170MMc = 6,
+ kSmpte240MMc = 7,
+ kYcocg = 8,
+ kBt2020NonConstantLuminance = 9,
+ kBt2020ConstantLuminance = 10,
+ };
+ enum ChromaSitingHorz {
+ kUnspecifiedCsh = 0,
+ kLeftCollocated = 1,
+ kHalfCsh = 2,
+ };
+ enum ChromaSitingVert {
+ kUnspecifiedCsv = 0,
+ kTopCollocated = 1,
+ kHalfCsv = 2,
+ };
+ enum Range {
+ kUnspecifiedCr = 0,
+ kBroadcastRange = 1,
+ kFullRange = 2,
+ kMcTcDefined = 3, // Defined by MatrixCoefficients/TransferCharacteristics.
+ };
+ enum TransferCharacteristics {
+ kIturBt709Tc = 1,
+ kUnspecifiedTc = 2,
+ kReservedTc = 3,
+ kGamma22Curve = 4,
+ kGamma28Curve = 5,
+ kSmpte170MTc = 6,
+ kSmpte240MTc = 7,
+ kLinear = 8,
+ kLog = 9,
+ kLogSqrt = 10,
+ kIec6196624 = 11,
+ kIturBt1361ExtendedColourGamut = 12,
+ kIec6196621 = 13,
+ kIturBt202010bit = 14,
+ kIturBt202012bit = 15,
+ kSmpteSt2084 = 16,
+ kSmpteSt4281Tc = 17,
+ kAribStdB67Hlg = 18,
+ };
+ enum Primaries {
+ kReservedP0 = 0,
+ kIturBt709P = 1,
+ kUnspecifiedP = 2,
+ kReservedP3 = 3,
+ kIturBt470M = 4,
+ kIturBt470Bg = 5,
+ kSmpte170MP = 6,
+ kSmpte240MP = 7,
+ kFilm = 8,
+ kIturBt2020 = 9,
+ kSmpteSt4281P = 10,
+ kJedecP22Phosphors = 22,
+ };
+ static const uint64_t kValueNotPresent;
+ Colour()
+ : matrix_coefficients_(kValueNotPresent),
+ bits_per_channel_(kValueNotPresent),
+ chroma_subsampling_horz_(kValueNotPresent),
+ chroma_subsampling_vert_(kValueNotPresent),
+ cb_subsampling_horz_(kValueNotPresent),
+ cb_subsampling_vert_(kValueNotPresent),
+ chroma_siting_horz_(kValueNotPresent),
+ chroma_siting_vert_(kValueNotPresent),
+ range_(kValueNotPresent),
+ transfer_characteristics_(kValueNotPresent),
+ primaries_(kValueNotPresent),
+ max_cll_(kValueNotPresent),
+ max_fall_(kValueNotPresent),
+ mastering_metadata_(NULL) {}
+ ~Colour() { delete mastering_metadata_; }
+
+ // Returns total size of the Colour element.
+ uint64_t ColourSize() const;
+ bool Valid() const;
+ bool Write(IMkvWriter* writer) const;
+
+ // Deep copies |mastering_metadata|.
+ bool SetMasteringMetadata(const MasteringMetadata& mastering_metadata);
+
+ const MasteringMetadata* mastering_metadata() const {
+ return mastering_metadata_;
+ }
+
+ uint64_t matrix_coefficients() const { return matrix_coefficients_; }
+ void set_matrix_coefficients(uint64_t matrix_coefficients) {
+ matrix_coefficients_ = matrix_coefficients;
+ }
+ uint64_t bits_per_channel() const { return bits_per_channel_; }
+ void set_bits_per_channel(uint64_t bits_per_channel) {
+ bits_per_channel_ = bits_per_channel;
+ }
+ uint64_t chroma_subsampling_horz() const { return chroma_subsampling_horz_; }
+ void set_chroma_subsampling_horz(uint64_t chroma_subsampling_horz) {
+ chroma_subsampling_horz_ = chroma_subsampling_horz;
+ }
+ uint64_t chroma_subsampling_vert() const { return chroma_subsampling_vert_; }
+ void set_chroma_subsampling_vert(uint64_t chroma_subsampling_vert) {
+ chroma_subsampling_vert_ = chroma_subsampling_vert;
+ }
+ uint64_t cb_subsampling_horz() const { return cb_subsampling_horz_; }
+ void set_cb_subsampling_horz(uint64_t cb_subsampling_horz) {
+ cb_subsampling_horz_ = cb_subsampling_horz;
+ }
+ uint64_t cb_subsampling_vert() const { return cb_subsampling_vert_; }
+ void set_cb_subsampling_vert(uint64_t cb_subsampling_vert) {
+ cb_subsampling_vert_ = cb_subsampling_vert;
+ }
+ uint64_t chroma_siting_horz() const { return chroma_siting_horz_; }
+ void set_chroma_siting_horz(uint64_t chroma_siting_horz) {
+ chroma_siting_horz_ = chroma_siting_horz;
+ }
+ uint64_t chroma_siting_vert() const { return chroma_siting_vert_; }
+ void set_chroma_siting_vert(uint64_t chroma_siting_vert) {
+ chroma_siting_vert_ = chroma_siting_vert;
+ }
+ uint64_t range() const { return range_; }
+ void set_range(uint64_t range) { range_ = range; }
+ uint64_t transfer_characteristics() const {
+ return transfer_characteristics_;
+ }
+ void set_transfer_characteristics(uint64_t transfer_characteristics) {
+ transfer_characteristics_ = transfer_characteristics;
+ }
+ uint64_t primaries() const { return primaries_; }
+ void set_primaries(uint64_t primaries) { primaries_ = primaries; }
+ uint64_t max_cll() const { return max_cll_; }
+ void set_max_cll(uint64_t max_cll) { max_cll_ = max_cll; }
+ uint64_t max_fall() const { return max_fall_; }
+ void set_max_fall(uint64_t max_fall) { max_fall_ = max_fall; }
+
+ private:
+ // Returns size of Colour child elements.
+ uint64_t PayloadSize() const;
+
+ uint64_t matrix_coefficients_;
+ uint64_t bits_per_channel_;
+ uint64_t chroma_subsampling_horz_;
+ uint64_t chroma_subsampling_vert_;
+ uint64_t cb_subsampling_horz_;
+ uint64_t cb_subsampling_vert_;
+ uint64_t chroma_siting_horz_;
+ uint64_t chroma_siting_vert_;
+ uint64_t range_;
+ uint64_t transfer_characteristics_;
+ uint64_t primaries_;
+ uint64_t max_cll_;
+ uint64_t max_fall_;
+
+ MasteringMetadata* mastering_metadata_;
+};
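+
+// Example (illustrative sketch): describing BT.709, broadcast-range video.
+// |video_track| stands for a hypothetical VideoTrack* obtained from the
+// muxer (the VideoTrack class is declared further below).
+//
+//   Colour colour;
+//   colour.set_matrix_coefficients(Colour::kBt709);
+//   colour.set_transfer_characteristics(Colour::kIturBt709Tc);
+//   colour.set_primaries(Colour::kIturBt709P);
+//   colour.set_range(Colour::kBroadcastRange);
+//   if (!video_track->SetColour(colour))  // SetColour() deep-copies |colour|
+//     return false;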
+
+///////////////////////////////////////////////////////////////
+// Projection element.
+class Projection {
+ public:
+ enum ProjectionType {
+ kTypeNotPresent = -1,
+ kRectangular = 0,
+ kEquirectangular = 1,
+ kCubeMap = 2,
+ kMesh = 3,
+ };
+ static const uint64_t kValueNotPresent;
+ Projection()
+ : type_(kRectangular),
+ pose_yaw_(0.0),
+ pose_pitch_(0.0),
+ pose_roll_(0.0),
+ private_data_(NULL),
+ private_data_length_(0) {}
+ ~Projection() { delete[] private_data_; }
+
+ uint64_t ProjectionSize() const;
+ bool Write(IMkvWriter* writer) const;
+
+ bool SetProjectionPrivate(const uint8_t* private_data,
+ uint64_t private_data_length);
+
+ ProjectionType type() const { return type_; }
+ void set_type(ProjectionType type) { type_ = type; }
+ float pose_yaw() const { return pose_yaw_; }
+ void set_pose_yaw(float pose_yaw) { pose_yaw_ = pose_yaw; }
+ float pose_pitch() const { return pose_pitch_; }
+ void set_pose_pitch(float pose_pitch) { pose_pitch_ = pose_pitch; }
+ float pose_roll() const { return pose_roll_; }
+ void set_pose_roll(float pose_roll) { pose_roll_ = pose_roll; }
+ uint8_t* private_data() const { return private_data_; }
+ uint64_t private_data_length() const { return private_data_length_; }
+
+ private:
+ // Returns size of VideoProjection child elements.
+ uint64_t PayloadSize() const;
+
+ ProjectionType type_;
+ float pose_yaw_;
+ float pose_pitch_;
+ float pose_roll_;
+ uint8_t* private_data_;
+ uint64_t private_data_length_;
+};
+
+///////////////////////////////////////////////////////////////
+// Track element.
+class Track {
+ public:
+ // The |seed| parameter is used to synthesize a UID for the track.
+ explicit Track(unsigned int* seed);
+ virtual ~Track();
+
+ // Adds a ContentEncoding element to the Track. Returns true on success.
+ virtual bool AddContentEncoding();
+
+ // Returns the ContentEncoding by index. Returns NULL if there is no
+ // ContentEncoding match.
+ ContentEncoding* GetContentEncodingByIndex(uint32_t index) const;
+
+ // Returns the size in bytes for the payload of the Track element.
+ virtual uint64_t PayloadSize() const;
+
+ // Returns the size in bytes of the Track element.
+ virtual uint64_t Size() const;
+
+ // Output the Track element to the writer. Returns true on success.
+ virtual bool Write(IMkvWriter* writer) const;
+
+ // Sets the CodecPrivate element of the Track element. Copies |length|
+ // bytes from |codec_private| to |codec_private_|. Returns true on success.
+ bool SetCodecPrivate(const uint8_t* codec_private, uint64_t length);
+
+ void set_codec_id(const char* codec_id);
+ const char* codec_id() const { return codec_id_; }
+ const uint8_t* codec_private() const { return codec_private_; }
+ void set_language(const char* language);
+ const char* language() const { return language_; }
+ void set_max_block_additional_id(uint64_t max_block_additional_id) {
+ max_block_additional_id_ = max_block_additional_id;
+ }
+ uint64_t max_block_additional_id() const { return max_block_additional_id_; }
+ void set_name(const char* name);
+ const char* name() const { return name_; }
+ void set_number(uint64_t number) { number_ = number; }
+ uint64_t number() const { return number_; }
+ void set_type(uint64_t type) { type_ = type; }
+ uint64_t type() const { return type_; }
+ void set_uid(uint64_t uid) { uid_ = uid; }
+ uint64_t uid() const { return uid_; }
+ void set_codec_delay(uint64_t codec_delay) { codec_delay_ = codec_delay; }
+ uint64_t codec_delay() const { return codec_delay_; }
+ void set_seek_pre_roll(uint64_t seek_pre_roll) {
+ seek_pre_roll_ = seek_pre_roll;
+ }
+ uint64_t seek_pre_roll() const { return seek_pre_roll_; }
+ void set_default_duration(uint64_t default_duration) {
+ default_duration_ = default_duration;
+ }
+ uint64_t default_duration() const { return default_duration_; }
+
+ uint64_t codec_private_length() const { return codec_private_length_; }
+ uint32_t content_encoding_entries_size() const {
+ return content_encoding_entries_size_;
+ }
+
+ private:
+ // Track element names.
+ char* codec_id_;
+ uint8_t* codec_private_;
+ char* language_;
+ uint64_t max_block_additional_id_;
+ char* name_;
+ uint64_t number_;
+ uint64_t type_;
+ uint64_t uid_;
+ uint64_t codec_delay_;
+ uint64_t seek_pre_roll_;
+ uint64_t default_duration_;
+
+ // Size of the CodecPrivate data in bytes.
+ uint64_t codec_private_length_;
+
+ // ContentEncoding element list.
+ ContentEncoding** content_encoding_entries_;
+
+ // Number of ContentEncoding elements added.
+ uint32_t content_encoding_entries_size_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Track);
+};
+
+///////////////////////////////////////////////////////////////
+// Track that has video specific elements.
+class VideoTrack : public Track {
+ public:
+ // Supported modes for stereo 3D.
+ enum StereoMode {
+ kMono = 0,
+ kSideBySideLeftIsFirst = 1,
+ kTopBottomRightIsFirst = 2,
+ kTopBottomLeftIsFirst = 3,
+ kSideBySideRightIsFirst = 11
+ };
+
+ enum AlphaMode { kNoAlpha = 0, kAlpha = 1 };
+
+ // The |seed| parameter is used to synthesize a UID for the track.
+ explicit VideoTrack(unsigned int* seed);
+ virtual ~VideoTrack();
+
+ // Returns the size in bytes for the payload of the Track element plus the
+ // video specific elements.
+ virtual uint64_t PayloadSize() const;
+
+ // Output the VideoTrack element to the writer. Returns true on success.
+ virtual bool Write(IMkvWriter* writer) const;
+
+ // Sets the video's stereo mode. Returns true on success.
+ bool SetStereoMode(uint64_t stereo_mode);
+
+ // Sets the video's alpha mode. Returns true on success.
+ bool SetAlphaMode(uint64_t alpha_mode);
+
+ void set_display_height(uint64_t height) { display_height_ = height; }
+ uint64_t display_height() const { return display_height_; }
+ void set_display_width(uint64_t width) { display_width_ = width; }
+ uint64_t display_width() const { return display_width_; }
+ void set_pixel_height(uint64_t height) { pixel_height_ = height; }
+ uint64_t pixel_height() const { return pixel_height_; }
+ void set_pixel_width(uint64_t width) { pixel_width_ = width; }
+ uint64_t pixel_width() const { return pixel_width_; }
+
+ void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; }
+ uint64_t crop_left() const { return crop_left_; }
+ void set_crop_right(uint64_t crop_right) { crop_right_ = crop_right; }
+ uint64_t crop_right() const { return crop_right_; }
+ void set_crop_top(uint64_t crop_top) { crop_top_ = crop_top; }
+ uint64_t crop_top() const { return crop_top_; }
+ void set_crop_bottom(uint64_t crop_bottom) { crop_bottom_ = crop_bottom; }
+ uint64_t crop_bottom() const { return crop_bottom_; }
+
+ void set_frame_rate(double frame_rate) { frame_rate_ = frame_rate; }
+ double frame_rate() const { return frame_rate_; }
+ void set_height(uint64_t height) { height_ = height; }
+ uint64_t height() const { return height_; }
+ uint64_t stereo_mode() { return stereo_mode_; }
+ uint64_t alpha_mode() { return alpha_mode_; }
+ void set_width(uint64_t width) { width_ = width; }
+ uint64_t width() const { return width_; }
+
+ Colour* colour() { return colour_; }
+
+ // Deep copies |colour|.
+ bool SetColour(const Colour& colour);
+
+ Projection* projection() { return projection_; }
+
+ // Deep copies |projection|.
+ bool SetProjection(const Projection& projection);
+
+ private:
+ // Returns the size in bytes of the Video element.
+ uint64_t VideoPayloadSize() const;
+
+ // Video track element names.
+ uint64_t display_height_;
+ uint64_t display_width_;
+ uint64_t pixel_height_;
+ uint64_t pixel_width_;
+ uint64_t crop_left_;
+ uint64_t crop_right_;
+ uint64_t crop_top_;
+ uint64_t crop_bottom_;
+ double frame_rate_;
+ uint64_t height_;
+ uint64_t stereo_mode_;
+ uint64_t alpha_mode_;
+ uint64_t width_;
+
+ Colour* colour_;
+ Projection* projection_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack);
+};
+
+///////////////////////////////////////////////////////////////
+// Track that has audio specific elements.
+class AudioTrack : public Track {
+ public:
+ // The |seed| parameter is used to synthesize a UID for the track.
+ explicit AudioTrack(unsigned int* seed);
+ virtual ~AudioTrack();
+
+ // Returns the size in bytes for the payload of the Track element plus the
+ // audio specific elements.
+ virtual uint64_t PayloadSize() const;
+
+ // Output the AudioTrack element to the writer. Returns true on success.
+ virtual bool Write(IMkvWriter* writer) const;
+
+ void set_bit_depth(uint64_t bit_depth) { bit_depth_ = bit_depth; }
+ uint64_t bit_depth() const { return bit_depth_; }
+ void set_channels(uint64_t channels) { channels_ = channels; }
+ uint64_t channels() const { return channels_; }
+ void set_sample_rate(double sample_rate) { sample_rate_ = sample_rate; }
+ double sample_rate() const { return sample_rate_; }
+
+ private:
+ // Audio track element names.
+ uint64_t bit_depth_;
+ uint64_t channels_;
+ double sample_rate_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(AudioTrack);
+};
+
+///////////////////////////////////////////////////////////////
+// Tracks element
+class Tracks {
+ public:
+ // Audio and video type defined by the Matroska specs.
+ enum { kVideo = 0x1, kAudio = 0x2 };
+
+ static const char kOpusCodecId[];
+ static const char kVorbisCodecId[];
+ static const char kVp8CodecId[];
+ static const char kVp9CodecId[];
+ static const char kVp10CodecId[];
+ static const char kAV1CodecId[];
+ static const char kWebVttCaptionsId[];
+ static const char kWebVttDescriptionsId[];
+ static const char kWebVttMetadataId[];
+ static const char kWebVttSubtitlesId[];
+
+ Tracks();
+ ~Tracks();
+
+ // Adds a Track element to the Tracks object. |track| will be owned and
+ // deleted by the Tracks object. Returns true on success. |number| is the
+ // number to use for the track. |number| must be >= 0. If |number| == 0
+ // then the muxer will decide on the track number.
+ bool AddTrack(Track* track, int32_t number);
+
+ // Returns the track by index. Returns NULL if there is no track match.
+ const Track* GetTrackByIndex(uint32_t idx) const;
+
+ // Search the Tracks and return the track that matches |track_number|.
+ // Returns NULL if there is no track match.
+ Track* GetTrackByNumber(uint64_t track_number) const;
+
+ // Returns true if the track number is an audio track.
+ bool TrackIsAudio(uint64_t track_number) const;
+
+ // Returns true if the track number is a video track.
+ bool TrackIsVideo(uint64_t track_number) const;
+
+ // Output the Tracks element to the writer. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ uint32_t track_entries_size() const { return track_entries_size_; }
+
+ private:
+ // Track element list.
+ Track** track_entries_;
+
+ // Number of Track elements added.
+ uint32_t track_entries_size_;
+
+ // Whether or not Tracks element has already been written via IMkvWriter.
+ mutable bool wrote_tracks_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tracks);
+};
+
+///////////////////////////////////////////////////////////////
+// Chapter element
+//
+class Chapter {
+ public:
+ // Set the identifier for this chapter. (This corresponds to the
+ // Cue Identifier line in WebVTT.)
+ // TODO(matthewjheaney): the actual serialization of this item in
+ // MKV is pending.
+ bool set_id(const char* id);
+
+ // Converts the nanosecond start and stop times of this chapter to
+ // their corresponding timecode values, and stores them that way.
+ void set_time(const Segment& segment, uint64_t start_time_ns,
+ uint64_t end_time_ns);
+
+ // Sets the uid for this chapter. Primarily used to enable
+ // deterministic output from the muxer.
+ void set_uid(const uint64_t uid) { uid_ = uid; }
+
+ // Add a title string to this chapter, per the semantics described
+ // here:
+ // http://www.matroska.org/technical/specs/index.html
+ //
+ // The title ("chapter string") is a UTF-8 string.
+ //
+ // The language has ISO 639-2 representation, described here:
+ // http://www.loc.gov/standards/iso639-2/englangn.html
+ // http://www.loc.gov/standards/iso639-2/php/English_list.php
+ // If you specify NULL as the language value, this implies
+ // English ("eng").
+ //
+ // The country value corresponds to the codes listed here:
+ // http://www.iana.org/domains/root/db/
+ //
+ // The function returns false if the string could not be allocated.
+ bool add_string(const char* title, const char* language, const char* country);
+
+ private:
+ friend class Chapters;
+
+ // For storage of chapter titles that differ by language.
+ class Display {
+ public:
+ // Establish representation invariant for new Display object.
+ void Init();
+
+ // Reclaim resources, in anticipation of destruction.
+ void Clear();
+
+ // Copies the title to the |title_| member. Returns false on
+ // error.
+ bool set_title(const char* title);
+
+ // Copies the language to the |language_| member. Returns false
+ // on error.
+ bool set_language(const char* language);
+
+ // Copies the country to the |country_| member. Returns false on
+ // error.
+ bool set_country(const char* country);
+
+ // If |writer| is non-NULL, serialize the Display sub-element of
+ // the Atom into the stream. Returns the Display element size on
+ // success, 0 if error.
+ uint64_t WriteDisplay(IMkvWriter* writer) const;
+
+ private:
+ char* title_;
+ char* language_;
+ char* country_;
+ };
+
+ Chapter();
+ ~Chapter();
+
+ // Establish the representation invariant for a newly-created
+ // Chapter object. The |seed| parameter is used to create the UID
+ // for this chapter atom.
+ void Init(unsigned int* seed);
+
+ // Copies this Chapter object to a different one. This is used when
+ // expanding a plain array of Chapter objects (see Chapters).
+ void ShallowCopy(Chapter* dst) const;
+
+ // Reclaim resources used by this Chapter object, pending its
+ // destruction.
+ void Clear();
+
+ // If there is no storage remaining on the |displays_| array for a
+ // new display object, creates a new, longer array and copies the
+ // existing Display objects to the new array. Returns false if the
+ // array cannot be expanded.
+ bool ExpandDisplaysArray();
+
+ // If |writer| is non-NULL, serialize the Atom sub-element into the
+ // stream. Returns the total size of the element on success, 0 if
+ // error.
+ uint64_t WriteAtom(IMkvWriter* writer) const;
+
+ // The string identifier for this chapter (corresponds to WebVTT cue
+ // identifier).
+ char* id_;
+
+ // Start timecode of the chapter.
+ uint64_t start_timecode_;
+
+ // Stop timecode of the chapter.
+ uint64_t end_timecode_;
+
+ // The binary identifier for this chapter.
+ uint64_t uid_;
+
+ // The Atom element can contain multiple Display sub-elements, as
+ // the same logical title can be rendered in different languages.
+ Display* displays_;
+
+ // The physical length (total size) of the |displays_| array.
+ int displays_size_;
+
+ // The logical length (number of active elements) on the |displays_|
+ // array.
+ int displays_count_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapter);
+};
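+
+// Example (illustrative sketch): adding a chapter. Chapter objects are
+// created through Segment::AddChapter(), declared later in this header;
+// |segment| stands for a hypothetical, initialized Segment.
+//
+//   Chapter* const chapter = segment.AddChapter();
+//   if (!chapter || !chapter->set_id("intro"))
+//     return false;
+//   chapter->set_time(segment, 0, 5000000000ULL);  // 0 s .. 5 s
+//   if (!chapter->add_string("Introduction", "eng", "us"))
+//     return false;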
+
+///////////////////////////////////////////////////////////////
+// Chapters element
+//
+class Chapters {
+ public:
+ Chapters();
+ ~Chapters();
+
+ Chapter* AddChapter(unsigned int* seed);
+
+ // Returns the number of chapters that have been added.
+ int Count() const;
+
+ // Output the Chapters element to the writer. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ private:
+ // Expands the chapters_ array if there is not enough space to contain
+ // another chapter object. Returns true on success.
+ bool ExpandChaptersArray();
+
+ // If |writer| is non-NULL, serialize the Edition sub-element of the
+ // Chapters element into the stream. Returns the Edition element
+ // size on success, 0 if error.
+ uint64_t WriteEdition(IMkvWriter* writer) const;
+
+ // Total length of the chapters_ array.
+ int chapters_size_;
+
+ // Number of active chapters on the chapters_ array.
+ int chapters_count_;
+
+ // Array for storage of chapter objects.
+ Chapter* chapters_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapters);
+};
+
+///////////////////////////////////////////////////////////////
+// Tag element
+//
+class Tag {
+ public:
+ bool add_simple_tag(const char* tag_name, const char* tag_string);
+
+ private:
+ // Tags calls Clear and the destructor of Tag
+ friend class Tags;
+
+ // For storage of simple tags
+ class SimpleTag {
+ public:
+ // Establish representation invariant for new SimpleTag object.
+ void Init();
+
+ // Reclaim resources, in anticipation of destruction.
+ void Clear();
+
+ // Copies the tag name to the |tag_name_| member. Returns false on
+ // error.
+ bool set_tag_name(const char* tag_name);
+
+ // Copies the tag string to the |tag_string_| member. Returns false
+ // on error.
+ bool set_tag_string(const char* tag_string);
+
+ // If |writer| is non-NULL, serialize the SimpleTag sub-element of
+ // the Tag into the stream. Returns the SimpleTag element size on
+ // success, 0 if error.
+ uint64_t Write(IMkvWriter* writer) const;
+
+ private:
+ char* tag_name_;
+ char* tag_string_;
+ };
+
+ Tag();
+ ~Tag();
+
+ // Copies this Tag object to a different one. This is used when
+ // expanding a plain array of Tag objects (see Tags).
+ void ShallowCopy(Tag* dst) const;
+
+ // Reclaim resources used by this Tag object, pending its
+ // destruction.
+ void Clear();
+
+ // If there is no storage remaining on the |simple_tags_| array for a
+ // new SimpleTag object, creates a new, longer array and copies the
+ // existing SimpleTag objects to the new array. Returns false if the
+ // array cannot be expanded.
+ bool ExpandSimpleTagsArray();
+
+ // If |writer| is non-NULL, serialize the Tag sub-element into the
+ // stream. Returns the total size of the element on success, 0 if
+ // error.
+ uint64_t Write(IMkvWriter* writer) const;
+
+ // The Tag element can contain multiple SimpleTag sub-elements
+ SimpleTag* simple_tags_;
+
+ // The physical length (total size) of the |simple_tags_| array.
+ int simple_tags_size_;
+
+ // The logical length (number of active elements) on the |simple_tags_|
+ // array.
+ int simple_tags_count_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tag);
+};
+
+///////////////////////////////////////////////////////////////
+// Tags element
+//
+class Tags {
+ public:
+ Tags();
+ ~Tags();
+
+ Tag* AddTag();
+
+ // Returns the number of tags that have been added.
+ int Count() const;
+
+ // Output the Tags element to the writer. Returns true on success.
+ bool Write(IMkvWriter* writer) const;
+
+ private:
+ // Expands the tags_ array if there is not enough space to contain
+ // another tag object. Returns true on success.
+ bool ExpandTagsArray();
+
+ // Total length of the tags_ array.
+ int tags_size_;
+
+ // Number of active tags on the tags_ array.
+ int tags_count_;
+
+ // Array for storage of tag objects.
+ Tag* tags_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tags);
+};
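+
+// Example (illustrative sketch): attaching a global tag. Tag objects are
+// created through Segment::AddTag(), declared later in this header;
+// |segment| stands for a hypothetical, initialized Segment.
+//
+//   Tag* const tag = segment.AddTag();
+//   if (!tag || !tag->add_simple_tag("TITLE", "My recording"))
+//     return false;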
+
+///////////////////////////////////////////////////////////////
+// Cluster element
+//
+// Notes:
+// |Init| must be called before any other method in this class.
+class Cluster {
+ public:
+ // |timecode| is the absolute timecode of the cluster. |cues_pos| is the
+ // position for the cluster within the segment that should be written in
+ // the cues element. |timecode_scale| is the timecode scale of the segment.
+ Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,
+ bool write_last_frame_with_duration = false,
+ bool fixed_size_timecode = false);
+ ~Cluster();
+
+ bool Init(IMkvWriter* ptr_writer);
+
+ // Adds a frame to be output in the file. The frame is written out through
+ // |writer_| if successful. Returns true on success.
+ bool AddFrame(const Frame* frame);
+
+ // Adds a frame to be output in the file. The frame is written out through
+ // |writer_| if successful. Returns true on success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions. The range of allowed values is [1, 126].
+ // timecode: Absolute (not relative to cluster) timestamp of the
+ // frame, expressed in timecode units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+ uint64_t timecode, // timecode units (absolute)
+ bool is_key);
+
+ // Adds a frame to be output in the file. The frame is written out through
+ // |writer_| if successful. Returns true on success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // additional: Pointer to the additional data
+ // additional_length: Length of the additional data
+ // add_id: Value of BlockAddID element
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions. The range of allowed values is [1, 126].
+ // abs_timecode: Absolute (not relative to cluster) timestamp of the
+ // frame, expressed in timecode units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+ const uint8_t* additional,
+ uint64_t additional_length, uint64_t add_id,
+ uint64_t track_number, uint64_t abs_timecode,
+ bool is_key);
+
+ // Adds a frame to be output in the file. The frame is written out through
+ // |writer_| if successful. Returns true on success.
+ // Inputs:
+ // data: Pointer to the data.
+ // length: Length of the data.
+ // discard_padding: DiscardPadding element value.
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions. The range of allowed values is [1, 126].
+ // abs_timecode: Absolute (not relative to cluster) timestamp of the
+ // frame, expressed in timecode units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+ int64_t discard_padding,
+ uint64_t track_number, uint64_t abs_timecode,
+ bool is_key);
+
+ // Writes a frame of metadata to the output medium; returns true on
+ // success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions. The range of allowed values is [1, 126].
+ // timecode: Absolute (not relative to cluster) timestamp of the
+ // metadata frame, expressed in timecode units.
+ // duration: Duration of metadata frame, in timecode units.
+ //
+ // The metadata frame is written as a block group, with a duration
+ // sub-element but no reference time sub-elements (indicating that
+ // it is considered a keyframe, per Matroska semantics).
+ bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+ uint64_t timecode, uint64_t duration);
+
+ // Increments the size of the cluster's data in bytes.
+ void AddPayloadSize(uint64_t size);
+
+ // Closes the cluster so no more data can be written to it. Will update the
+ // cluster's size if |writer_| is seekable. Returns true on success. This
+ // variant of Finalize() fails when |write_last_frame_with_duration_| is set
+ // to true.
+ bool Finalize();
+
+ // Closes the cluster so no more data can be written to it. Will update the
+ // cluster's size if |writer_| is seekable. Returns true on success.
+ // Inputs:
+ // set_last_frame_duration: Boolean indicating whether or not the duration
+ // of the last frame should be set. If set to
+ // false, the |duration| value is ignored and
+ // |write_last_frame_with_duration_| will not be
+ // honored.
+ // duration: Duration of the Cluster in timecode scale.
+ bool Finalize(bool set_last_frame_duration, uint64_t duration);
+
+ // Returns the size in bytes for the entire Cluster element.
+ uint64_t Size() const;
+
+ // Given |abs_timecode|, calculates timecode relative to most recent timecode.
+ // Returns -1 on failure, or a relative timecode.
+ int64_t GetRelativeTimecode(int64_t abs_timecode) const;
+
+ int64_t size_position() const { return size_position_; }
+ int32_t blocks_added() const { return blocks_added_; }
+ uint64_t payload_size() const { return payload_size_; }
+ int64_t position_for_cues() const { return position_for_cues_; }
+ uint64_t timecode() const { return timecode_; }
+ uint64_t timecode_scale() const { return timecode_scale_; }
+ void set_write_last_frame_with_duration(bool write_last_frame_with_duration) {
+ write_last_frame_with_duration_ = write_last_frame_with_duration;
+ }
+ bool write_last_frame_with_duration() const {
+ return write_last_frame_with_duration_;
+ }
+
+ private:
+ // Iterator type for the |stored_frames_| map.
+ typedef std::map<uint64_t, std::list<Frame*> >::iterator FrameMapIterator;
+
+ // Utility method that confirms that blocks can still be added, and that the
+ // cluster header has been written. Used by |DoWriteFrame*|. Returns true
+ // when successful.
+ bool PreWriteBlock();
+
+ // Utility method used by the |DoWriteFrame*| methods that handles the book
+ // keeping required after each block is written.
+ void PostWriteBlock(uint64_t element_size);
+
+ // Does some verification and calls WriteFrame.
+ bool DoWriteFrame(const Frame* const frame);
+
+ // Either holds back the given frame, or writes it out depending on whether or
+ // not |write_last_frame_with_duration_| is set.
+ bool QueueOrWriteFrame(const Frame* const frame);
+
+ // Outputs the Cluster header to |writer_|. Returns true on success.
+ bool WriteClusterHeader();
+
+ // Number of blocks added to the cluster.
+ int32_t blocks_added_;
+
+ // Flag telling if the cluster has been closed.
+ bool finalized_;
+
+ // Flag indicating whether the cluster's timecode will always be written out
+ // using 8 bytes.
+ bool fixed_size_timecode_;
+
+ // Flag telling if the cluster's header has been written.
+ bool header_written_;
+
+ // The size of the cluster elements in bytes.
+ uint64_t payload_size_;
+
+ // The file position used for cue points.
+ const int64_t position_for_cues_;
+
+ // The file position of the cluster's size element.
+ int64_t size_position_;
+
+ // The absolute timecode of the cluster.
+ const uint64_t timecode_;
+
+ // The timecode scale of the Segment containing the cluster.
+ const uint64_t timecode_scale_;
+
+ // Flag indicating whether the last frame of the cluster should be written as
+ // a Block with Duration. If set to true, then it will result in holding back
+ // of frames and the parameterized version of Finalize() must be called to
+ // finish writing the Cluster.
+ bool write_last_frame_with_duration_;
+
+ // Map used to hold back frames, if required. Track number is the key.
+ std::map<uint64_t, std::list<Frame*> > stored_frames_;
+
+ // Map from track number to the timestamp of the last block written for that
+ // track.
+ std::map<uint64_t, uint64_t> last_block_timestamp_;
+
+ // Pointer to the writer object. Not owned by this class.
+ IMkvWriter* writer_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cluster);
+};
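+
+// Example (illustrative sketch) of the Cluster interface in isolation; in a
+// complete WebM file, clusters are normally created and managed by the
+// Segment class declared below. |writer| stands for a hypothetical,
+// initialized IMkvWriter*, and |data|/|size| for an encoded key frame; note
+// that this AddFrame() overload takes timecode units, not nanoseconds.
+//
+//   const uint64_t timecode_scale = 1000000;  // 1 ms per timecode unit
+//   Cluster cluster(0 /*timecode*/, 0 /*cues_pos*/, timecode_scale);
+//   if (!cluster.Init(writer))
+//     return false;
+//   if (!cluster.AddFrame(data, size, 1 /*track_number*/, 0 /*timecode*/,
+//                         true /*is_key*/))
+//     return false;
+//   if (!cluster.Finalize())
+//     return false;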
+
+///////////////////////////////////////////////////////////////
+// SeekHead element
+class SeekHead {
+ public:
+ SeekHead();
+ ~SeekHead();
+
+ // TODO(fgalligan): Change this to reserve a certain size. Then check how
+ // big the seek entry to be added is as not every seek entry will be the
+ // maximum size it could be.
+ // Adds a seek entry to be written out when the element is finalized. |id|
+ // must be the coded mkv element id. |pos| is the file position of the
+ // element. Returns true on success.
+ bool AddSeekEntry(uint32_t id, uint64_t pos);
+
+ // Writes out SeekHead and SeekEntry elements. Returns true on success.
+ bool Finalize(IMkvWriter* writer) const;
+
+ // Returns the id of the Seek Entry at the given index. Returns -1 if index is
+ // out of range.
+ uint32_t GetId(int index) const;
+
+ // Returns the position of the Seek Entry at the given index. Returns -1 if
+ // index is out of range.
+ uint64_t GetPosition(int index) const;
+
+ // Sets the Seek Entry id and position at given index.
+ // Returns true on success.
+ bool SetSeekEntry(int index, uint32_t id, uint64_t position);
+
+ // Reserves space by writing out a Void element which will be updated with
+ // a SeekHead element later. Returns true on success.
+ bool Write(IMkvWriter* writer);
+
+ // We are going to put a cap on the number of Seek Entries.
+ const static int32_t kSeekEntryCount = 5;
+
+ private:
+ // Returns the maximum size in bytes of one seek entry.
+ uint64_t MaxEntrySize() const;
+
+ // Seek entry id element list.
+ uint32_t seek_entry_id_[kSeekEntryCount];
+
+ // Seek entry pos element list.
+ uint64_t seek_entry_pos_[kSeekEntryCount];
+
+ // The file position of SeekHead element.
+ int64_t start_pos_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SeekHead);
+};
+
+///////////////////////////////////////////////////////////////
+// Segment Information element
+class SegmentInfo {
+ public:
+ SegmentInfo();
+ ~SegmentInfo();
+
+ // Will update the duration if |duration_| is > 0.0. Returns true on success.
+ bool Finalize(IMkvWriter* writer) const;
+
+ // Sets |muxing_app_| and |writing_app_|.
+ bool Init();
+
+ // Output the Segment Information element to the writer. Returns true on
+ // success.
+ bool Write(IMkvWriter* writer);
+
+ void set_duration(double duration) { duration_ = duration; }
+ double duration() const { return duration_; }
+ void set_muxing_app(const char* app);
+ const char* muxing_app() const { return muxing_app_; }
+ void set_timecode_scale(uint64_t scale) { timecode_scale_ = scale; }
+ uint64_t timecode_scale() const { return timecode_scale_; }
+ void set_writing_app(const char* app);
+ const char* writing_app() const { return writing_app_; }
+ void set_date_utc(int64_t date_utc) { date_utc_ = date_utc; }
+ int64_t date_utc() const { return date_utc_; }
+
+ private:
+ // Segment Information element names.
+ // Initially set to -1 to signify that a duration has not been set and should
+ // not be written out.
+ double duration_;
+ // Set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+ char* muxing_app_;
+ uint64_t timecode_scale_;
+ // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+ char* writing_app_;
+ // LLONG_MIN when DateUTC is not set.
+ int64_t date_utc_;
+
+ // The file position of the duration element.
+ int64_t duration_pos_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SegmentInfo);
+};
+
+///////////////////////////////////////////////////////////////
+// This class represents the main segment in a WebM file. Currently only
+// supports one Segment element.
+//
+// Notes:
+// |Init| must be called before any other method in this class.
+class Segment {
+ public:
+ enum Mode { kLive = 0x1, kFile = 0x2 };
+
+ enum CuesPosition {
+ kAfterClusters = 0x0, // Position Cues after Clusters - Default
+ kBeforeClusters = 0x1 // Position Cues before Clusters
+ };
+
+ static const uint32_t kDefaultDocTypeVersion = 4;
+ static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
+
+ Segment();
+ ~Segment();
+
+ // Initializes |SegmentInfo| and returns result. Always returns false when
+ // |ptr_writer| is NULL.
+ bool Init(IMkvWriter* ptr_writer);
+
+ // Adds a generic track to the segment. Returns the newly-allocated
+ // track object (which is owned by the segment) on success, NULL on
+ // error. |number| is the number to use for the track. |number|
+ // must be >= 0. If |number| == 0 then the muxer will decide on the
+ // track number.
+ Track* AddTrack(int32_t number);
+
+ // Adds a Vorbis audio track to the segment. Returns the number of the track
+ // on success, 0 on error. |number| is the number to use for the audio track.
+ // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+ // the track number.
+ uint64_t AddAudioTrack(int32_t sample_rate, int32_t channels, int32_t number);
+
+ // Adds an empty chapter to the chapters of this segment. Returns
+ // non-NULL on success. After adding the chapter, the caller should
+ // populate its fields via the Chapter member functions.
+ Chapter* AddChapter();
+
+ // Adds an empty tag to the tags of this segment. Returns
+ // non-NULL on success. After adding the tag, the caller should
+ // populate its fields via the Tag member functions.
+ Tag* AddTag();
+
+ // Adds a cue point to the Cues element. |timestamp| is the time in
+ // nanoseconds of the cue's time. |track| is the Track of the Cue. This
+ // function must be called after AddFrame to calculate the correct
+ // BlockNumber for the CuePoint. Returns true on success.
+ bool AddCuePoint(uint64_t timestamp, uint64_t track);
+
+ // Adds a frame to be output in the file. Returns true on success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions.
+ // timestamp_ns: Timestamp of the frame in nanoseconds from 0.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+ uint64_t timestamp_ns, bool is_key);
+
+ // Writes a frame of metadata to the output medium; returns true on
+ // success.
+ // Inputs:
+ // data: Pointer to the data
+ // length: Length of the data
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions.
+ // timestamp_ns: Absolute timestamp of the metadata frame, expressed
+ // in nanosecond units.
+ // duration_ns: Duration of the metadata frame, in nanosecond units.
+ //
+ // The metadata frame is written as a block group, with a duration
+ // sub-element but no reference time sub-elements (indicating that
+ // it is considered a keyframe, per Matroska semantics).
+ bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+ uint64_t timestamp_ns, uint64_t duration_ns);
+
+ // Writes a frame with additional data to the output medium; returns true on
+ // success.
+ // Inputs:
+ // data: Pointer to the data.
+ // length: Length of the data.
+ // additional: Pointer to additional data.
+ // additional_length: Length of additional data.
+ // add_id: Additional ID which identifies the type of additional data.
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions.
+ // timestamp: Absolute timestamp of the frame, expressed in nanosecond
+ // units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+ const uint8_t* additional,
+ uint64_t additional_length, uint64_t add_id,
+ uint64_t track_number, uint64_t timestamp,
+ bool is_key);
+
+ // Writes a frame with DiscardPadding to the output medium; returns true on
+ // success.
+ // Inputs:
+ // data: Pointer to the data.
+ // length: Length of the data.
+ // discard_padding: DiscardPadding element value.
+ // track_number: Track to add the data to. Value returned by Add track
+ // functions.
+ // timestamp: Absolute timestamp of the frame, expressed in nanosecond
+ // units.
+ // is_key: Flag telling whether or not this frame is a key frame.
+ bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+ int64_t discard_padding,
+ uint64_t track_number, uint64_t timestamp,
+ bool is_key);
+
+ // Writes a Frame to the output medium. Chooses the correct way of writing
+ // the frame (Block vs SimpleBlock) based on the parameters passed.
+ // Inputs:
+ // frame: frame object
+ bool AddGenericFrame(const Frame* frame);
+
+ // Adds a VP8 video track to the segment. Returns the number of the track on
+ // success, 0 on error. |number| is the number to use for the video track.
+ // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+ // the track number.
+ uint64_t AddVideoTrack(int32_t width, int32_t height, int32_t number);
+
+ // This function must be called after Finalize() if you need a copy of the
+ // output with Cues written before the Clusters. It will return false if the
+ // writer is not seekable or if chunking is set to true.
+ // Input parameters:
+ // reader - an IMkvReader object created with the same underlying file of the
+ // current writer object. Make sure to close the existing writer
+ // object before creating this so that all the data is properly
+ // flushed and available for reading.
+ // writer - an IMkvWriter object pointing to a *different* file than the one
+ // pointed by the current writer object. This file will contain the
+ // Cues element before the Clusters.
+ bool CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
+ IMkvWriter* writer);
+
+ // Sets which track to use for the Cues element. Must have added the track
+ // before calling this function. Returns true on success. |track_number| is
+ // returned by the Add track functions.
+ bool CuesTrack(uint64_t track_number);
+
+ // This will force the muxer to create a new Cluster when the next frame is
+ // added.
+ void ForceNewClusterOnNextFrame();
+
+ // Writes out any frames that have not been written out. Finalizes the last
+ // cluster. May update the size and duration of the segment. May output the
+ // Cues element. May finalize the SeekHead element. Returns true on success.
+ bool Finalize();
+
+ // Returns the Cues object.
+ Cues* GetCues() { return &cues_; }
+
+ // Returns the Segment Information object.
+ const SegmentInfo* GetSegmentInfo() const { return &segment_info_; }
+ SegmentInfo* GetSegmentInfo() { return &segment_info_; }
+
+ // Search the Tracks and return the track that matches |track_number|.
+ // Returns NULL if there is no track match.
+ Track* GetTrackByNumber(uint64_t track_number) const;
+
+ // Toggles whether to output a cues element.
+ void OutputCues(bool output_cues);
+
+ // Toggles whether to write the last frame in each Cluster with Duration.
+ void AccurateClusterDuration(bool accurate_cluster_duration);
+
+ // Toggles whether to write the Cluster Timecode using exactly 8 bytes.
+ void UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode);
+
+ // Sets whether the muxer will output files in chunks. |chunking| is a
+ // flag telling whether or not to turn on chunking. |filename| is the base
+ // filename for the chunk files. The header chunk file will be named
+ // |filename|.hdr and the data chunks will be named
+ // |filename|_XXXXXX.chk. Chunking implies that the muxer will be writing
+ // to files so the muxer will use the default MkvWriter class to control
+ // what data is written to what files. Returns true on success.
+ // TODO: Should we change the IMkvWriter Interface to add Open and Close?
+ // That will force the interface to be dependent on files.
+ bool SetChunking(bool chunking, const char* filename);
+
+ bool chunking() const { return chunking_; }
+ uint64_t cues_track() const { return cues_track_; }
+ void set_max_cluster_duration(uint64_t max_cluster_duration) {
+ max_cluster_duration_ = max_cluster_duration;
+ }
+ uint64_t max_cluster_duration() const { return max_cluster_duration_; }
+ void set_max_cluster_size(uint64_t max_cluster_size) {
+ max_cluster_size_ = max_cluster_size;
+ }
+ uint64_t max_cluster_size() const { return max_cluster_size_; }
+ void set_mode(Mode mode) { mode_ = mode; }
+ Mode mode() const { return mode_; }
+ CuesPosition cues_position() const { return cues_position_; }
+ bool output_cues() const { return output_cues_; }
+ void set_estimate_file_duration(bool estimate_duration) {
+ estimate_file_duration_ = estimate_duration;
+ }
+ bool estimate_file_duration() const { return estimate_file_duration_; }
+ const SegmentInfo* segment_info() const { return &segment_info_; }
+ void set_duration(double duration) { duration_ = duration; }
+ double duration() const { return duration_; }
+
+ // Returns true when codec IDs are valid for WebM.
+ bool DocTypeIsWebm() const;
+
+ private:
+ // Checks if header information has been output and initialized. If not, it
+ // will output the Segment element and initialize the SeekHead and Cues
+ // elements.
+ bool CheckHeaderInfo();
+
+ // Sets |doc_type_version_| based on the current element requirements.
+ void UpdateDocTypeVersion();
+
+ // Sets |name| according to how many chunks have been written. |ext| is the
+ // file extension. |name| must be deleted by the calling app. Returns true
+ // on success.
+ bool UpdateChunkName(const char* ext, char** name) const;
+
+ // Returns the maximum offset within the segment's payload. When chunking
+ // this function is needed to determine offsets of elements within the
+ // chunked files. Returns -1 on error.
+ int64_t MaxOffset();
+
+ // Adds the frame to our frame array.
+ bool QueueFrame(Frame* frame);
+
+ // Output all frames that are queued. Returns -1 on error, otherwise
+ // it returns the number of frames written.
+ int WriteFramesAll();
+
+ // Outputs all queued frames that have an end time less than |timestamp|.
+ // Returns true on success, including when there are no frames queued.
+ bool WriteFramesLessThan(uint64_t timestamp);
+
+ // Outputs the segment header, Segment Information element, SeekHead element,
+ // and Tracks element to |writer_|.
+ bool WriteSegmentHeader();
+
+ // Given a frame with the specified timestamp (nanosecond units) and
+ // keyframe status, determine whether a new cluster should be
+ // created, before writing enqueued frames and the frame itself. The
+ // function returns one of the following values:
+ // -1 = error: an out-of-order frame was detected
+ // 0 = do not create a new cluster, and write frame to the existing cluster
+ // 1 = create a new cluster, and write frame to that new cluster
+ // 2 = create a new cluster, and re-run test
+ int TestFrame(uint64_t track_num, uint64_t timestamp_ns, bool key) const;
+
+ // Creates a new cluster, using the earlier of the first enqueued frame's
+ // timestamp and the indicated time. Returns true on success.
+ bool MakeNewCluster(uint64_t timestamp_ns);
+
+ // Checks whether a new cluster needs to be created, and if so
+ // creates a new cluster. Returns false if creation of a new cluster
+ // was necessary but creation was not successful.
+ bool DoNewClusterProcessing(uint64_t track_num, uint64_t timestamp_ns,
+ bool key);
+
+ // Adjusts Cue Point values (to place Cues before Clusters) so that they
+ // reflect the correct offsets.
+ void MoveCuesBeforeClusters();
+
+ // This function recursively computes the correct cluster offsets (this is
+ // done to move the Cues before Clusters). It recursively updates the change
+ // in size (which indicates a change in cluster offset) until no sizes change.
+ // Parameters:
+ // diff - indicates the difference in size of the Cues element that needs to
+ // be accounted for.
+ // index - index in the list of Cues which is currently being adjusted.
+ // cue_size - sum of size of all the CuePoint elements.
+ void MoveCuesBeforeClustersHelper(uint64_t diff, int index,
+ uint64_t* cue_size);
+
+ // Seeds the random number generator used to make UIDs.
+ unsigned int seed_;
+
+ // WebM elements
+ Cues cues_;
+ SeekHead seek_head_;
+ SegmentInfo segment_info_;
+ Tracks tracks_;
+ Chapters chapters_;
+ Tags tags_;
+
+ // Number of chunks written.
+ int chunk_count_;
+
+ // Current chunk filename.
+ char* chunk_name_;
+
+ // Default MkvWriter object created by this class used for writing clusters
+ // out in separate files.
+ MkvWriter* chunk_writer_cluster_;
+
+ // Default MkvWriter object created by this class used for writing Cues
+ // element out to a file.
+ MkvWriter* chunk_writer_cues_;
+
+ // Default MkvWriter object created by this class used for writing the
+ // Matroska header out to a file.
+ MkvWriter* chunk_writer_header_;
+
+ // Flag telling whether or not the muxer is chunking output to multiple
+ // files.
+ bool chunking_;
+
+ // Base filename for the chunked files.
+ char* chunking_base_name_;
+
+ // File position offset where the Clusters end.
+ int64_t cluster_end_offset_;
+
+ // List of clusters.
+ Cluster** cluster_list_;
+
+ // Number of cluster pointers allocated in the cluster list.
+ int32_t cluster_list_capacity_;
+
+ // Number of clusters in the cluster list.
+ int32_t cluster_list_size_;
+
+ // Indicates whether Cues should be written before or after Clusters
+ CuesPosition cues_position_;
+
+ // Track number that is associated with the cues element for this segment.
+ uint64_t cues_track_;
+
+ // Tells the muxer to force a new cluster on the next Block.
+ bool force_new_cluster_;
+
+ // List of stored audio frames. These variables are used to store frames so
+ // the muxer can follow the guideline "Audio blocks that contain the video
+ // key frame's timecode should be in the same cluster as the video key frame
+ // block."
+ Frame** frames_;
+
+ // Number of frame pointers allocated in the frame list.
+ int32_t frames_capacity_;
+
+ // Number of frames in the frame list.
+ int32_t frames_size_;
+
+ // Flag telling if a video track has been added to the segment.
+ bool has_video_;
+
+ // Flag telling if the segment's header has been written.
+ bool header_written_;
+
+ // Duration of the last block in nanoseconds.
+ uint64_t last_block_duration_;
+
+ // Last timestamp in nanoseconds added to a cluster.
+ uint64_t last_timestamp_;
+
+ // Last timestamp in nanoseconds by track number added to a cluster.
+ uint64_t last_track_timestamp_[kMaxTrackNumber];
+
+ // Number of frames written per track.
+ uint64_t track_frames_written_[kMaxTrackNumber];
+
+ // Maximum time in nanoseconds for a cluster duration. This variable is a
+ // guideline and some clusters may have a longer duration. Default is 30
+ // seconds.
+ uint64_t max_cluster_duration_;
+
+ // Maximum size in bytes for a cluster. This variable is a guideline and
+ // some clusters may have a larger size. Default is 0 which signifies that
+ // the muxer will decide the size.
+ uint64_t max_cluster_size_;
+
+ // The mode that the segment is in. If set to |kLive| the writer must not
+ // seek backwards.
+ Mode mode_;
+
+ // Flag telling the muxer that a new cue point should be added.
+ bool new_cuepoint_;
+
+ // TODO(fgalligan): Should we add support for more than one Cues element?
+ // Flag whether or not the muxer should output a Cues element.
+ bool output_cues_;
+
+ // Flag whether or not the last frame in each Cluster will have a Duration
+ // element in it.
+ bool accurate_cluster_duration_;
+
+ // Flag whether or not to write the Cluster Timecode using exactly 8 bytes.
+ bool fixed_size_cluster_timecode_;
+
+ // Flag whether or not to estimate the file duration.
+ bool estimate_file_duration_;
+
+ // The size of the EBML header, used to validate the header if
+ // WriteEbmlHeader() is called more than once.
+ int32_t ebml_header_size_;
+
+ // The file position of the segment's payload.
+ int64_t payload_pos_;
+
+ // The file position of the element's size.
+ int64_t size_position_;
+
+ // The current DocTypeVersion (|doc_type_version_|) and the version that was
+ // written in WriteSegmentHeader() (|doc_type_version_written_|).
+ // WriteEbmlHeader() will be called from Finalize() if |doc_type_version_|
+ // differs from |doc_type_version_written_|.
+ uint32_t doc_type_version_;
+ uint32_t doc_type_version_written_;
+
+ // If |duration_| is > 0, then explicitly set the duration of the segment.
+ double duration_;
+
+ // Pointers to the writer objects. Not owned by this class.
+ IMkvWriter* writer_cluster_;
+ IMkvWriter* writer_cues_;
+ IMkvWriter* writer_header_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Segment);
+};
+
+} // namespace mkvmuxer
+
+#endif // MKVMUXER_MKVMUXER_H_
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
new file mode 100644
index 000000000..e5db12160
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVMUXERTYPES_H_
+#define MKVMUXER_MKVMUXERTYPES_H_
+
+namespace mkvmuxer {
+typedef unsigned char uint8;
+typedef short int16;
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+} // namespace mkvmuxer
+
+// Copied from Chromium basictypes.h
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+ TypeName(const TypeName&); \
+ void operator=(const TypeName&)
+
+#endif // MKVMUXER_MKVMUXERTYPES_H_
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
new file mode 100644
index 000000000..355d4e22b
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -0,0 +1,744 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvmuxerutil.h"
+
+#ifdef __ANDROID__
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <new>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvwriter.h"
+
+namespace mkvmuxer {
+
+namespace {
+
+// Date elements are always 8 octets in size.
+const int kDateElementSize = 8;
+
+uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
+ uint64 timecode_scale) {
+ uint64 block_additional_elem_size = 0;
+ uint64 block_addid_elem_size = 0;
+ uint64 block_more_payload_size = 0;
+ uint64 block_more_elem_size = 0;
+ uint64 block_additions_payload_size = 0;
+ uint64 block_additions_elem_size = 0;
+ if (frame->additional()) {
+ block_additional_elem_size =
+ EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(),
+ frame->additional_length());
+ block_addid_elem_size = EbmlElementSize(
+ libwebm::kMkvBlockAddID, static_cast<uint64>(frame->add_id()));
+
+ block_more_payload_size =
+ block_addid_elem_size + block_additional_elem_size;
+ block_more_elem_size =
+ EbmlMasterElementSize(libwebm::kMkvBlockMore, block_more_payload_size) +
+ block_more_payload_size;
+ block_additions_payload_size = block_more_elem_size;
+ block_additions_elem_size =
+ EbmlMasterElementSize(libwebm::kMkvBlockAdditions,
+ block_additions_payload_size) +
+ block_additions_payload_size;
+ }
+
+ uint64 discard_padding_elem_size = 0;
+ if (frame->discard_padding() != 0) {
+ discard_padding_elem_size =
+ EbmlElementSize(libwebm::kMkvDiscardPadding,
+ static_cast<int64>(frame->discard_padding()));
+ }
+
+ const uint64 reference_block_timestamp =
+ frame->reference_block_timestamp() / timecode_scale;
+ uint64 reference_block_elem_size = 0;
+ if (!frame->is_key()) {
+ reference_block_elem_size =
+ EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp);
+ }
+
+ const uint64 duration = frame->duration() / timecode_scale;
+ uint64 block_duration_elem_size = 0;
+ if (duration > 0)
+ block_duration_elem_size =
+ EbmlElementSize(libwebm::kMkvBlockDuration, duration);
+
+ const uint64 block_payload_size = 4 + frame->length();
+ const uint64 block_elem_size =
+ EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) +
+ block_payload_size;
+
+ const uint64 block_group_payload_size =
+ block_elem_size + block_additions_elem_size + block_duration_elem_size +
+ discard_padding_elem_size + reference_block_elem_size;
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockGroup,
+ block_group_payload_size)) {
+ return 0;
+ }
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlock, block_payload_size))
+ return 0;
+
+ if (WriteUInt(writer, frame->track_number()))
+ return 0;
+
+ if (SerializeInt(writer, timecode, 2))
+ return 0;
+
+ // For a Block, flags is always 0.
+ if (SerializeInt(writer, 0, 1))
+ return 0;
+
+ if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+ return 0;
+
+ if (frame->additional()) {
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockAdditions,
+ block_additions_payload_size)) {
+ return 0;
+ }
+
+ if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockMore,
+ block_more_payload_size))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID,
+ static_cast<uint64>(frame->add_id())))
+ return 0;
+
+ if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional,
+ frame->additional(), frame->additional_length())) {
+ return 0;
+ }
+ }
+
+ if (frame->discard_padding() != 0 &&
+ !WriteEbmlElement(writer, libwebm::kMkvDiscardPadding,
+ static_cast<int64>(frame->discard_padding()))) {
+ return 0;
+ }
+
+ if (!frame->is_key() &&
+ !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
+ reference_block_timestamp)) {
+ return 0;
+ }
+
+ if (duration > 0 &&
+ !WriteEbmlElement(writer, libwebm::kMkvBlockDuration, duration)) {
+ return 0;
+ }
+ return EbmlMasterElementSize(libwebm::kMkvBlockGroup,
+ block_group_payload_size) +
+ block_group_payload_size;
+}
+
+uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
+ int64 timecode) {
+ if (WriteID(writer, libwebm::kMkvSimpleBlock))
+ return 0;
+
+ const int32 size = static_cast<int32>(frame->length()) + 4;
+ if (WriteUInt(writer, size))
+ return 0;
+
+ if (WriteUInt(writer, static_cast<uint64>(frame->track_number())))
+ return 0;
+
+ if (SerializeInt(writer, timecode, 2))
+ return 0;
+
+ uint64 flags = 0;
+ if (frame->is_key())
+ flags |= 0x80;
+
+ if (SerializeInt(writer, flags, 1))
+ return 0;
+
+ if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+ return 0;
+
+ return GetUIntSize(libwebm::kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 +
+ frame->length();
+}
+
+} // namespace
+
+int32 GetCodedUIntSize(uint64 value) {
+ if (value < 0x000000000000007FULL)
+ return 1;
+ else if (value < 0x0000000000003FFFULL)
+ return 2;
+ else if (value < 0x00000000001FFFFFULL)
+ return 3;
+ else if (value < 0x000000000FFFFFFFULL)
+ return 4;
+ else if (value < 0x00000007FFFFFFFFULL)
+ return 5;
+ else if (value < 0x000003FFFFFFFFFFULL)
+ return 6;
+ else if (value < 0x0001FFFFFFFFFFFFULL)
+ return 7;
+ return 8;
+}
+
+int32 GetUIntSize(uint64 value) {
+ if (value < 0x0000000000000100ULL)
+ return 1;
+ else if (value < 0x0000000000010000ULL)
+ return 2;
+ else if (value < 0x0000000001000000ULL)
+ return 3;
+ else if (value < 0x0000000100000000ULL)
+ return 4;
+ else if (value < 0x0000010000000000ULL)
+ return 5;
+ else if (value < 0x0001000000000000ULL)
+ return 6;
+ else if (value < 0x0100000000000000ULL)
+ return 7;
+ return 8;
+}
+
+int32 GetIntSize(int64 value) {
+ // Doubling the requested value ensures positive values with their high bit
+ // set are written with 0-padding to avoid flipping the signedness.
+ const uint64 v = (value < 0) ? value ^ -1LL : value;
+ return GetUIntSize(2 * v);
+}
+
+uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
+ // Size of EBML ID
+ int32 ebml_size = GetUIntSize(type);
+
+ // Datasize
+ ebml_size += GetCodedUIntSize(value);
+
+ return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, int64 value) {
+ // Size of EBML ID
+ int32 ebml_size = GetUIntSize(type);
+
+ // Datasize
+ ebml_size += GetIntSize(value);
+
+ // Size of Datasize
+ ebml_size++;
+
+ return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, uint64 value) {
+ return EbmlElementSize(type, value, 0);
+}
+
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size) {
+ // Size of EBML ID
+ uint64 ebml_size = GetUIntSize(type);
+
+ // Datasize
+ ebml_size += (fixed_size > 0) ? fixed_size : GetUIntSize(value);
+
+ // Size of Datasize
+ ebml_size++;
+
+ return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, float /* value */) {
+ // Size of EBML ID
+ uint64 ebml_size = GetUIntSize(type);
+
+ // Datasize
+ ebml_size += sizeof(float);
+
+ // Size of Datasize
+ ebml_size++;
+
+ return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, const char* value) {
+ if (!value)
+ return 0;
+
+ // Size of EBML ID
+ uint64 ebml_size = GetUIntSize(type);
+
+ // Datasize
+ ebml_size += strlen(value);
+
+ // Size of Datasize
+ ebml_size += GetCodedUIntSize(strlen(value));
+
+ return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
+ if (!value)
+ return 0;
+
+ // Size of EBML ID
+ uint64 ebml_size = GetUIntSize(type);
+
+ // Datasize
+ ebml_size += size;
+
+ // Size of Datasize
+ ebml_size += GetCodedUIntSize(size);
+
+ return ebml_size;
+}
+
+uint64 EbmlDateElementSize(uint64 type) {
+ // Size of EBML ID
+ uint64 ebml_size = GetUIntSize(type);
+
+ // Datasize
+ ebml_size += kDateElementSize;
+
+ // Size of Datasize
+ ebml_size++;
+
+ return ebml_size;
+}
+
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
+ if (!writer || size < 1 || size > 8)
+ return -1;
+
+ for (int32 i = 1; i <= size; ++i) {
+ const int32 byte_count = size - i;
+ const int32 bit_count = byte_count * 8;
+
+ const int64 bb = value >> bit_count;
+ const uint8 b = static_cast<uint8>(bb);
+
+ const int32 status = writer->Write(&b, 1);
+
+ if (status < 0)
+ return status;
+ }
+
+ return 0;
+}
+
+int32 SerializeFloat(IMkvWriter* writer, float f) {
+ if (!writer)
+ return -1;
+
+ assert(sizeof(uint32) == sizeof(float));
+ // This union is used only to avoid a reinterpret_cast from float& to
+ // uint32&, which would violate strict aliasing rules.
+ union U32 {
+ uint32 u32;
+ float f;
+ } value;
+ value.f = f;
+
+ for (int32 i = 1; i <= 4; ++i) {
+ const int32 byte_count = 4 - i;
+ const int32 bit_count = byte_count * 8;
+
+ const uint8 byte = static_cast<uint8>(value.u32 >> bit_count);
+
+ const int32 status = writer->Write(&byte, 1);
+
+ if (status < 0)
+ return status;
+ }
+
+ return 0;
+}
+
+int32 WriteUInt(IMkvWriter* writer, uint64 value) {
+ if (!writer)
+ return -1;
+
+ int32 size = GetCodedUIntSize(value);
+
+ return WriteUIntSize(writer, value, size);
+}
+
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
+ if (!writer || size < 0 || size > 8)
+ return -1;
+
+ if (size > 0) {
+ const uint64 bit = 1LL << (size * 7);
+
+ if (value > (bit - 2))
+ return -1;
+
+ value |= bit;
+ } else {
+ size = 1;
+ int64 bit;
+
+ for (;;) {
+ bit = 1LL << (size * 7);
+ const uint64 max = bit - 2;
+
+ if (value <= max)
+ break;
+
+ ++size;
+ }
+
+ if (size > 8)
+ return -1;
+
+ value |= bit;
+ }
+
+ return SerializeInt(writer, value, size);
+}
+
+int32 WriteID(IMkvWriter* writer, uint64 type) {
+ if (!writer)
+ return -1;
+
+ writer->ElementStartNotify(type, writer->Position());
+
+ const int32 size = GetUIntSize(type);
+
+ return SerializeInt(writer, type, size);
+}
+
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
+ if (!writer)
+ return false;
+
+ if (WriteID(writer, type))
+ return false;
+
+ if (WriteUInt(writer, size))
+ return false;
+
+ return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) {
+ return WriteEbmlElement(writer, type, value, 0);
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+ uint64 fixed_size) {
+ if (!writer)
+ return false;
+
+ if (WriteID(writer, type))
+ return false;
+
+ uint64 size = GetUIntSize(value);
+ if (fixed_size > 0) {
+ if (size > fixed_size)
+ return false;
+ size = fixed_size;
+ }
+ if (WriteUInt(writer, size))
+ return false;
+
+ if (SerializeInt(writer, value, static_cast<int32>(size)))
+ return false;
+
+ return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) {
+ if (!writer)
+ return false;
+
+ if (WriteID(writer, type))
+ return false;
+
+ const uint64 size = GetIntSize(value);
+ if (WriteUInt(writer, size))
+ return false;
+
+ if (SerializeInt(writer, value, static_cast<int32>(size)))
+ return false;
+
+ return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
+ if (!writer)
+ return false;
+
+ if (WriteID(writer, type))
+ return false;
+
+ if (WriteUInt(writer, 4))
+ return false;
+
+ if (SerializeFloat(writer, value))
+ return false;
+
+ return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
+ if (!writer || !value)
+ return false;
+
+ if (WriteID(writer, type))
+ return false;
+
+ const uint64 length = strlen(value);
+ if (WriteUInt(writer, length))
+ return false;
+
+ if (writer->Write(value, static_cast<uint32>(length)))
+ return false;
+
+ return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+ uint64 size) {
+ if (!writer || !value || size < 1)
+ return false;
+
+ if (WriteID(writer, type))
+ return false;
+
+ if (WriteUInt(writer, size))
+ return false;
+
+ if (writer->Write(value, static_cast<uint32>(size)))
+ return false;
+
+ return true;
+}
+
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
+ if (!writer)
+ return false;
+
+ if (WriteID(writer, type))
+ return false;
+
+ if (WriteUInt(writer, kDateElementSize))
+ return false;
+
+ if (SerializeInt(writer, value, kDateElementSize))
+ return false;
+
+ return true;
+}
+
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+ Cluster* cluster) {
+ if (!writer || !frame || !frame->IsValid() || !cluster ||
+ !cluster->timecode_scale())
+ return 0;
+
+ // Technically the timecode for a block can be less than the
+ // timecode for the cluster itself (remember that block timecode
+ // is a signed, 16-bit integer). However, as a simplification we
+ // only permit non-negative cluster-relative timecodes for blocks.
+ const int64 relative_timecode = cluster->GetRelativeTimecode(
+ frame->timestamp() / cluster->timecode_scale());
+ if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
+ return 0;
+
+ return frame->CanBeSimpleBlock() ?
+ WriteSimpleBlock(writer, frame, relative_timecode) :
+ WriteBlock(writer, frame, relative_timecode,
+ cluster->timecode_scale());
+}
+
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
+ if (!writer)
+ return 0;
+
+ // Subtract the one-byte Void ID and the width of the coded payload size.
+ uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
+ uint64 void_size = EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) +
+ void_entry_size;
+
+ if (void_size != size)
+ return 0;
+
+ const int64 payload_position = writer->Position();
+ if (payload_position < 0)
+ return 0;
+
+ if (WriteID(writer, libwebm::kMkvVoid))
+ return 0;
+
+ if (WriteUInt(writer, void_entry_size))
+ return 0;
+
+ const uint8 value = 0;
+ for (int32 i = 0; i < static_cast<int32>(void_entry_size); ++i) {
+ if (writer->Write(&value, 1))
+ return 0;
+ }
+
+ const int64 stop_position = writer->Position();
+ if (stop_position < 0 ||
+ stop_position - payload_position != static_cast<int64>(void_size))
+ return 0;
+
+ return void_size;
+}
+
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
+ *major = 0;
+ *minor = 2;
+ *build = 1;
+ *revision = 0;
+}
+
+uint64 MakeUID(unsigned int* seed) {
+ uint64 uid = 0;
+
+#ifdef __MINGW32__
+ srand(*seed);
+#endif
+
+ for (int i = 0; i < 7; ++i) { // avoid problems with 8-byte values
+ uid <<= 8;
+
+// TODO(fgalligan): Move random number generation to platform specific code.
+#ifdef _MSC_VER
+ (void)seed;
+ const int32 nn = rand();
+#elif __ANDROID__
+ (void)seed;
+ int32 temp_num = 1;
+ int fd = open("/dev/urandom", O_RDONLY);
+ if (fd != -1) {
+ read(fd, &temp_num, sizeof(temp_num));
+ close(fd);
+ }
+ const int32 nn = temp_num;
+#elif defined __MINGW32__
+ const int32 nn = rand();
+#else
+ const int32 nn = rand_r(seed);
+#endif
+ const int32 n = 0xFF & (nn >> 4); // throw away low-order bits
+
+ uid |= n;
+ }
+
+ return uid;
+}
+
+bool IsMatrixCoefficientsValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kGbr:
+ case mkvmuxer::Colour::kBt709:
+ case mkvmuxer::Colour::kUnspecifiedMc:
+ case mkvmuxer::Colour::kReserved:
+ case mkvmuxer::Colour::kFcc:
+ case mkvmuxer::Colour::kBt470bg:
+ case mkvmuxer::Colour::kSmpte170MMc:
+ case mkvmuxer::Colour::kSmpte240MMc:
+ case mkvmuxer::Colour::kYcocg:
+ case mkvmuxer::Colour::kBt2020NonConstantLuminance:
+ case mkvmuxer::Colour::kBt2020ConstantLuminance:
+ return true;
+ }
+ return false;
+}
+
+bool IsChromaSitingHorzValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kUnspecifiedCsh:
+ case mkvmuxer::Colour::kLeftCollocated:
+ case mkvmuxer::Colour::kHalfCsh:
+ return true;
+ }
+ return false;
+}
+
+bool IsChromaSitingVertValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kUnspecifiedCsv:
+ case mkvmuxer::Colour::kTopCollocated:
+ case mkvmuxer::Colour::kHalfCsv:
+ return true;
+ }
+ return false;
+}
+
+bool IsColourRangeValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kUnspecifiedCr:
+ case mkvmuxer::Colour::kBroadcastRange:
+ case mkvmuxer::Colour::kFullRange:
+ case mkvmuxer::Colour::kMcTcDefined:
+ return true;
+ }
+ return false;
+}
+
+bool IsTransferCharacteristicsValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kIturBt709Tc:
+ case mkvmuxer::Colour::kUnspecifiedTc:
+ case mkvmuxer::Colour::kReservedTc:
+ case mkvmuxer::Colour::kGamma22Curve:
+ case mkvmuxer::Colour::kGamma28Curve:
+ case mkvmuxer::Colour::kSmpte170MTc:
+ case mkvmuxer::Colour::kSmpte240MTc:
+ case mkvmuxer::Colour::kLinear:
+ case mkvmuxer::Colour::kLog:
+ case mkvmuxer::Colour::kLogSqrt:
+ case mkvmuxer::Colour::kIec6196624:
+ case mkvmuxer::Colour::kIturBt1361ExtendedColourGamut:
+ case mkvmuxer::Colour::kIec6196621:
+ case mkvmuxer::Colour::kIturBt202010bit:
+ case mkvmuxer::Colour::kIturBt202012bit:
+ case mkvmuxer::Colour::kSmpteSt2084:
+ case mkvmuxer::Colour::kSmpteSt4281Tc:
+ case mkvmuxer::Colour::kAribStdB67Hlg:
+ return true;
+ }
+ return false;
+}
+
+bool IsPrimariesValueValid(uint64_t value) {
+ switch (value) {
+ case mkvmuxer::Colour::kReservedP0:
+ case mkvmuxer::Colour::kIturBt709P:
+ case mkvmuxer::Colour::kUnspecifiedP:
+ case mkvmuxer::Colour::kReservedP3:
+ case mkvmuxer::Colour::kIturBt470M:
+ case mkvmuxer::Colour::kIturBt470Bg:
+ case mkvmuxer::Colour::kSmpte170MP:
+ case mkvmuxer::Colour::kSmpte240MP:
+ case mkvmuxer::Colour::kFilm:
+ case mkvmuxer::Colour::kIturBt2020:
+ case mkvmuxer::Colour::kSmpteSt4281P:
+ case mkvmuxer::Colour::kJedecP22Phosphors:
+ return true;
+ }
+ return false;
+}
+
+} // namespace mkvmuxer
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
new file mode 100644
index 000000000..132388da5
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVMUXER_MKVMUXERUTIL_H_
+#define MKVMUXER_MKVMUXERUTIL_H_
+
+#include "mkvmuxertypes.h"
+
+#include <stdint.h>
+
+namespace mkvmuxer {
+class Cluster;
+class Frame;
+class IMkvWriter;
+
+// TODO(tomfinegan): mkvmuxer:: integer types continue to be used here because
+// changing them causes pain for downstream projects. It would be nice to have
+// a solution that allows removal of the mkvmuxer:: integer types while
+// avoiding pain for downstream users of libwebm. Considering that mkvmuxerutil.{cc,h}
+// are really, for the great majority of cases, EBML size calculation and writer
+// functions, perhaps a more EBML focused utility would be the way to go as a
+// first step.
+
+const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
+const int64 kMaxBlockTimecode = 0x07FFFLL;
+
+// Writes out |value| in Big Endian order. Returns 0 on success.
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
+
+// Returns the size in bytes of the element.
+int32 GetUIntSize(uint64 value);
+int32 GetIntSize(int64 value);
+int32 GetCodedUIntSize(uint64 value);
+uint64 EbmlMasterElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, int64 value);
+uint64 EbmlElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, float value);
+uint64 EbmlElementSize(uint64 type, const char* value);
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
+uint64 EbmlDateElementSize(uint64 type);
+
+// Returns the size in bytes of the element assuming that the element was
+// written using |fixed_size| bytes. If |fixed_size| is set to zero, then it
+// computes the necessary number of bytes based on |value|.
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |value|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUInt(IMkvWriter* writer, uint64 value);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |size|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size);
+
+// Output an Mkv master element. Returns true if the element was written.
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size);
+
+// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
+// ID to |SerializeInt|. Returns 0 on success.
+int32 WriteID(IMkvWriter* writer, uint64 type);
+
+// Output an Mkv non-master element. Returns true if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+ uint64 size);
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
+
+// Output an Mkv non-master element using fixed size. The element will be
+// written out using exactly |fixed_size| bytes. If |fixed_size| is set to zero
+// then it computes the necessary number of bytes based on |value|. Returns true
+// if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+ uint64 fixed_size);
+
+// Output a Mkv Frame. It decides the correct element to write (Block vs
+// SimpleBlock) based on the parameters of the Frame.
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+ Cluster* cluster);
+
+// Output a void element. |size| must be the entire size in bytes that will be
+// void. The function will calculate the size of the void header and subtract
+// it from |size|.
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size);
+
+// Returns the version number of the muxer in |major|, |minor|, |build|,
+// and |revision|.
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision);
+
+// Returns a random number to be used for UID, using |seed| to seed
+// the random-number generator (see POSIX rand_r() for semantics).
+uint64 MakeUID(unsigned int* seed);
+
+// Colour field validation helpers. All return true when |value| is valid.
+bool IsMatrixCoefficientsValueValid(uint64_t value);
+bool IsChromaSitingHorzValueValid(uint64_t value);
+bool IsChromaSitingVertValueValid(uint64_t value);
+bool IsColourRangeValueValid(uint64_t value);
+bool IsTransferCharacteristicsValueValid(uint64_t value);
+bool IsPrimariesValueValid(uint64_t value);
+
+} // namespace mkvmuxer
+
+#endif // MKVMUXER_MKVMUXERUTIL_H_
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc
new file mode 100644
index 000000000..84655d802
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvwriter.h"
+
+#include <sys/types.h>
+
+#ifdef _MSC_VER
+#include <share.h> // for _SH_DENYWR
+#endif
+
+namespace mkvmuxer {
+
+MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {}
+
+MkvWriter::MkvWriter(FILE* fp) : file_(fp), writer_owns_file_(false) {}
+
+MkvWriter::~MkvWriter() { Close(); }
+
+int32 MkvWriter::Write(const void* buffer, uint32 length) {
+ if (!file_)
+ return -1;
+
+ if (length == 0)
+ return 0;
+
+ if (buffer == NULL)
+ return -1;
+
+ const size_t bytes_written = fwrite(buffer, 1, length, file_);
+
+ return (bytes_written == length) ? 0 : -1;
+}
+
+bool MkvWriter::Open(const char* filename) {
+ if (filename == NULL)
+ return false;
+
+ if (file_)
+ return false;
+
+#ifdef _MSC_VER
+ file_ = _fsopen(filename, "wb", _SH_DENYWR);
+#else
+ file_ = fopen(filename, "wb");
+#endif
+ if (file_ == NULL)
+ return false;
+ return true;
+}
+
+void MkvWriter::Close() {
+ if (file_ && writer_owns_file_) {
+ fclose(file_);
+ }
+ file_ = NULL;
+}
+
+int64 MkvWriter::Position() const {
+ if (!file_)
+ return 0;
+
+#ifdef _MSC_VER
+ return _ftelli64(file_);
+#else
+ return ftell(file_);
+#endif
+}
+
+int32 MkvWriter::Position(int64 position) {
+ if (!file_)
+ return -1;
+
+#ifdef _MSC_VER
+ return _fseeki64(file_, position, SEEK_SET);
+#else
+ return fseeko(file_, static_cast<off_t>(position), SEEK_SET);
+#endif
+}
+
+bool MkvWriter::Seekable() const { return true; }
+
+void MkvWriter::ElementStartNotify(uint64, int64) {}
+
+} // namespace mkvmuxer
diff --git a/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h
new file mode 100644
index 000000000..4227c6374
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvmuxer/mkvwriter.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVWRITER_H_
+#define MKVMUXER_MKVWRITER_H_
+
+#include <stdio.h>
+
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvmuxertypes.h"
+
+namespace mkvmuxer {
+
+// Default implementation of the IMkvWriter interface, backed by a stdio FILE*.
+class MkvWriter : public IMkvWriter {
+ public:
+ MkvWriter();
+ explicit MkvWriter(FILE* fp);
+ virtual ~MkvWriter();
+
+ // IMkvWriter interface
+ virtual int64 Position() const;
+ virtual int32 Position(int64 position);
+ virtual bool Seekable() const;
+ virtual int32 Write(const void* buffer, uint32 length);
+ virtual void ElementStartNotify(uint64 element_id, int64 position);
+
+ // Creates and opens a file for writing. |filename| is the name of the file
+ // to open. This function will overwrite the contents of |filename|. Returns
+ // true on success.
+ bool Open(const char* filename);
+
+ // Closes an opened file.
+ void Close();
+
+ private:
+ // File handle to output file.
+ FILE* file_;
+ bool writer_owns_file_;
+
+ LIBWEBM_DISALLOW_COPY_AND_ASSIGN(MkvWriter);
+};
+
+} // namespace mkvmuxer
+
+#endif // MKVMUXER_MKVWRITER_H_
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc
new file mode 100644
index 000000000..e7b76f7da
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.cc
@@ -0,0 +1,8049 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "mkvparser/mkvparser.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#include <float.h> // _isnan() / _finite()
+#define MSC_COMPAT
+#endif
+
+#include <cassert>
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <new>
+
+#include "common/webmids.h"
+
+namespace mkvparser {
+const long long kStringElementSizeLimit = 20 * 1000 * 1000;
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const long long Colour::kValueNotPresent = LLONG_MAX;
+const float Projection::kValueNotPresent = FLT_MAX;
+
+#ifdef MSC_COMPAT
+inline bool isnan(double val) { return !!_isnan(val); }
+inline bool isinf(double val) { return !_finite(val); }
+#else
+inline bool isnan(double val) { return std::isnan(val); }
+inline bool isinf(double val) { return std::isinf(val); }
+#endif // MSC_COMPAT
+
+IMkvReader::~IMkvReader() {}
+
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+ unsigned long long element_size) {
+ if (num_elements == 0 || element_size == 0)
+ return NULL;
+
+ const size_t kMaxAllocSize = 0x80000000; // 2GiB
+ const unsigned long long num_bytes = num_elements * element_size;
+ if (element_size > (kMaxAllocSize / num_elements))
+ return NULL;
+ if (num_bytes != static_cast<size_t>(num_bytes))
+ return NULL;
+
+ return new (std::nothrow) Type[static_cast<size_t>(num_bytes)];
+}
+
+void GetVersion(int& major, int& minor, int& build, int& revision) {
+ major = 1;
+ minor = 0;
+ build = 0;
+ revision = 30;
+}
+
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+ if (!pReader || pos < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ len = 1;
+ unsigned char b;
+ int status = pReader->Read(pos, 1, &b);
+
+ if (status < 0) // error or underflow
+ return status;
+
+ if (status > 0) // interpreted as "underflow"
+ return E_BUFFER_NOT_FULL;
+
+ if (b == 0) // we can't handle u-int values larger than 8 bytes
+ return E_FILE_FORMAT_INVALID;
+
+ unsigned char m = 0x80;
+
+ while (!(b & m)) {
+ m >>= 1;
+ ++len;
+ }
+
+ long long result = b & (~m);
+ ++pos;
+
+ for (int i = 1; i < len; ++i) {
+ status = pReader->Read(pos, 1, &b);
+
+ if (status < 0) {
+ len = 1;
+ return status;
+ }
+
+ if (status > 0) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result <<= 8;
+ result |= b;
+
+ ++pos;
+ }
+
+ return result;
+}
+
+// Reads an EBML ID and returns it.
+// An ID must be at least 1 byte long, cannot exceed 4 bytes, and its value
+// must be greater than 0.
+// See known EBML values and EBMLMaxIDLength:
+// http://www.matroska.org/technical/specs/index.html
+// Returns the ID, or a value less than 0 to report an error while reading the
+// ID.
+long long ReadID(IMkvReader* pReader, long long pos, long& len) {
+ if (pReader == NULL || pos < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // Read the first byte. The length in bytes of the ID is determined by
+ // finding the first set bit in the first byte of the ID.
+ unsigned char temp_byte = 0;
+ int read_status = pReader->Read(pos, 1, &temp_byte);
+
+ if (read_status < 0)
+ return E_FILE_FORMAT_INVALID;
+ else if (read_status > 0) // No data to read.
+ return E_BUFFER_NOT_FULL;
+
+ if (temp_byte == 0) // ID length > 8 bytes; invalid file.
+ return E_FILE_FORMAT_INVALID;
+
+ int bit_pos = 0;
+ const int kMaxIdLengthInBytes = 4;
+ const int kCheckByte = 0x80;
+
+ // Find the first bit that's set.
+ bool found_bit = false;
+ for (; bit_pos < kMaxIdLengthInBytes; ++bit_pos) {
+ if ((kCheckByte >> bit_pos) & temp_byte) {
+ found_bit = true;
+ break;
+ }
+ }
+
+ if (!found_bit) {
+ // The value is too large to be a valid ID.
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ // Read the remaining bytes of the ID (if any).
+ const int id_length = bit_pos + 1;
+ long long ebml_id = temp_byte;
+ for (int i = 1; i < id_length; ++i) {
+ ebml_id <<= 8;
+ read_status = pReader->Read(pos + i, 1, &temp_byte);
+
+ if (read_status < 0)
+ return E_FILE_FORMAT_INVALID;
+ else if (read_status > 0)
+ return E_BUFFER_NOT_FULL;
+
+ ebml_id |= temp_byte;
+ }
+
+ len = id_length;
+ return ebml_id;
+}
+
+long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) {
+ if (!pReader || pos < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ long long total, available;
+
+ int status = pReader->Length(&total, &available);
+ if (status < 0 || (total >= 0 && available > total))
+ return E_FILE_FORMAT_INVALID;
+
+ len = 1;
+
+ if (pos >= available)
+ return pos; // too few bytes available
+
+ unsigned char b;
+
+ status = pReader->Read(pos, 1, &b);
+
+ if (status != 0)
+ return status;
+
+ if (b == 0) // we can't handle u-int values larger than 8 bytes
+ return E_FILE_FORMAT_INVALID;
+
+ unsigned char m = 0x80;
+
+ while (!(b & m)) {
+ m >>= 1;
+ ++len;
+ }
+
+ return 0; // success
+}
+
+// TODO(vigneshv): This function assumes that unsigned values never have their
+// high bit set.
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+ if (!pReader || pos < 0 || (size <= 0) || (size > 8))
+ return E_FILE_FORMAT_INVALID;
+
+ long long result = 0;
+
+ for (long long i = 0; i < size; ++i) {
+ unsigned char b;
+
+ const long status = pReader->Read(pos, 1, &b);
+
+ if (status < 0)
+ return status;
+
+ result <<= 8;
+ result |= b;
+
+ ++pos;
+ }
+
+ return result;
+}
+
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+ double& result) {
+ if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
+ return E_FILE_FORMAT_INVALID;
+
+ const long size = static_cast<long>(size_);
+
+ unsigned char buf[8];
+
+ const int status = pReader->Read(pos, size, buf);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 4) {
+ union {
+ float f;
+ unsigned long ff;
+ };
+
+ ff = 0;
+
+ for (int i = 0;;) {
+ ff |= buf[i];
+
+ if (++i >= 4)
+ break;
+
+ ff <<= 8;
+ }
+
+ result = f;
+ } else {
+ union {
+ double d;
+ unsigned long long dd;
+ };
+
+ dd = 0;
+
+ for (int i = 0;;) {
+ dd |= buf[i];
+
+ if (++i >= 8)
+ break;
+
+ dd <<= 8;
+ }
+
+ result = d;
+ }
+
+ if (mkvparser::isinf(result) || mkvparser::isnan(result))
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+}
+
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+ long long& result_ref) {
+ if (!pReader || pos < 0 || size < 1 || size > 8)
+ return E_FILE_FORMAT_INVALID;
+
+ signed char first_byte = 0;
+ const long status = pReader->Read(pos, 1, (unsigned char*)&first_byte);
+
+ if (status < 0)
+ return status;
+
+ unsigned long long result = first_byte;
+ ++pos;
+
+ for (long i = 1; i < size; ++i) {
+ unsigned char b;
+
+ const long status = pReader->Read(pos, 1, &b);
+
+ if (status < 0)
+ return status;
+
+ result <<= 8;
+ result |= b;
+
+ ++pos;
+ }
+
+ result_ref = static_cast<long long>(result);
+ return 0;
+}
+
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+ char*& str) {
+ delete[] str;
+ str = NULL;
+
+ if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit)
+ return E_FILE_FORMAT_INVALID;
+
+ // +1 for '\0' terminator
+ const long required_size = static_cast<long>(size) + 1;
+
+ str = SafeArrayAlloc<char>(1, required_size);
+ if (str == NULL)
+ return E_FILE_FORMAT_INVALID;
+
+ unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
+
+ const long status = pReader->Read(pos, static_cast<long>(size), buf);
+
+ if (status) {
+ delete[] str;
+ str = NULL;
+
+ return status;
+ }
+
+ str[required_size - 1] = '\0';
+ return 0;
+}
+
+long ParseElementHeader(IMkvReader* pReader, long long& pos, long long stop,
+ long long& id, long long& size) {
+ if (stop >= 0 && pos >= stop)
+ return E_FILE_FORMAT_INVALID;
+
+ long len;
+
+ id = ReadID(pReader, pos, len);
+
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume id
+
+ if (stop >= 0 && pos >= stop)
+ return E_FILE_FORMAT_INVALID;
+
+ size = ReadUInt(pReader, pos, len);
+
+ if (size < 0 || len < 1 || len > 8) {
+ // Invalid: Negative payload size, negative or 0 length integer, or integer
+ // larger than 64 bits (libwebm cannot handle them).
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ // Avoid rolling over pos when very close to LLONG_MAX.
+ const unsigned long long rollover_check =
+ static_cast<unsigned long long>(pos) + len;
+ if (rollover_check > LLONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume length of size
+
+ // pos now designates payload
+
+ if (stop >= 0 && pos > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0; // success
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+ long long& val) {
+ if (!pReader || pos < 0)
+ return false;
+
+ long long total = 0;
+ long long available = 0;
+
+ const long status = pReader->Length(&total, &available);
+ if (status < 0 || (total >= 0 && available > total))
+ return false;
+
+ long len = 0;
+
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (available - pos) > len)
+ return false;
+
+ if (static_cast<unsigned long>(id) != expected_id)
+ return false;
+
+ pos += len; // consume id
+
+ const long long size = ReadUInt(pReader, pos, len);
+ if (size < 0 || size > 8 || len < 1 || len > 8 || (available - pos) > len)
+ return false;
+
+ pos += len; // consume length of size of payload
+
+ val = UnserializeUInt(pReader, pos, size);
+ if (val < 0)
+ return false;
+
+ pos += size; // consume size of payload
+
+ return true;
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+ unsigned char*& buf, size_t& buflen) {
+ if (!pReader || pos < 0)
+ return false;
+
+ long long total = 0;
+ long long available = 0;
+
+ long status = pReader->Length(&total, &available);
+ if (status < 0 || (total >= 0 && available > total))
+ return false;
+
+ long len = 0;
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (available - pos) > len)
+ return false;
+
+ if (static_cast<unsigned long>(id) != expected_id)
+ return false;
+
+ pos += len; // consume id
+
+ const long long size = ReadUInt(pReader, pos, len);
+ if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+ return false;
+
+ unsigned long long rollover_check =
+ static_cast<unsigned long long>(pos) + len;
+ if (rollover_check > LLONG_MAX)
+ return false;
+
+ pos += len; // consume length of size of payload
+
+ rollover_check = static_cast<unsigned long long>(pos) + size;
+ if (rollover_check > LLONG_MAX)
+ return false;
+
+ if ((pos + size) > available)
+ return false;
+
+ if (size >= LONG_MAX)
+ return false;
+
+ const long buflen_ = static_cast<long>(size);
+
+ buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+ if (!buf)
+ return false;
+
+ status = pReader->Read(pos, buflen_, buf);
+ if (status != 0)
+ return false;
+
+ buflen = buflen_;
+
+ pos += size; // consume size of payload
+ return true;
+}
+
+EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
+
+EBMLHeader::~EBMLHeader() { delete[] m_docType; }
+
+void EBMLHeader::Init() {
+ m_version = 1;
+ m_readVersion = 1;
+ m_maxIdLength = 4;
+ m_maxSizeLength = 8;
+
+ if (m_docType) {
+ delete[] m_docType;
+ m_docType = NULL;
+ }
+
+ m_docTypeVersion = 1;
+ m_docTypeReadVersion = 1;
+}
+
+long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) {
+ if (!pReader)
+ return E_FILE_FORMAT_INVALID;
+
+ long long total, available;
+
+ long status = pReader->Length(&total, &available);
+
+ if (status < 0) // error
+ return status;
+
+ pos = 0;
+
+ // Scan until we find what looks like the first byte of the EBML header.
+ const long long kMaxScanBytes = (available >= 1024) ? 1024 : available;
+ const unsigned char kEbmlByte0 = 0x1A;
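+ // Note: 0x1A is only the first byte of the full 4-byte EBML header ID
+ // (0x1A45DFA3, i.e. libwebm::kMkvEBML); the ReadID call below still
+ // verifies the complete ID once this scan finds a candidate byte.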
+ unsigned char scan_byte = 0;
+
+ while (pos < kMaxScanBytes) {
+ status = pReader->Read(pos, 1, &scan_byte);
+
+ if (status < 0) // error
+ return status;
+ else if (status > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if (scan_byte == kEbmlByte0)
+ break;
+
+ ++pos;
+ }
+
+ long len = 0;
+ const long long ebml_id = ReadID(pReader, pos, len);
+
+ if (ebml_id == E_BUFFER_NOT_FULL)
+ return E_BUFFER_NOT_FULL;
+
+ if (len != 4 || ebml_id != libwebm::kMkvEBML)
+ return E_FILE_FORMAT_INVALID;
+
+ // Move read pos forward to the EBML header size field.
+ pos += 4;
+
+ // Read length of size field.
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return E_FILE_FORMAT_INVALID;
+ else if (result > 0) // need more data
+ return E_BUFFER_NOT_FULL;
+
+ if (len < 1 || len > 8)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && ((total - pos) < len))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((available - pos) < len)
+ return pos + len; // try again later
+
+ // Read the EBML header size.
+ result = ReadUInt(pReader, pos, len);
+
+ if (result < 0) // error
+ return result;
+
+ pos += len; // consume size field
+
+ // pos now designates start of payload
+
+ if ((total >= 0) && ((total - pos) < result))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((available - pos) < result)
+ return pos + result;
+
+ const long long end = pos + result;
+
+ Init();
+
+ while (pos < end) {
+ long long id, size;
+
+ status = ParseElementHeader(pReader, pos, end, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (id == libwebm::kMkvEBMLVersion) {
+ m_version = UnserializeUInt(pReader, pos, size);
+
+ if (m_version <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvEBMLReadVersion) {
+ m_readVersion = UnserializeUInt(pReader, pos, size);
+
+ if (m_readVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvEBMLMaxIDLength) {
+ m_maxIdLength = UnserializeUInt(pReader, pos, size);
+
+ if (m_maxIdLength <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvEBMLMaxSizeLength) {
+ m_maxSizeLength = UnserializeUInt(pReader, pos, size);
+
+ if (m_maxSizeLength <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvDocType) {
+ if (m_docType)
+ return E_FILE_FORMAT_INVALID;
+
+ status = UnserializeString(pReader, pos, size, m_docType);
+
+ if (status) // error
+ return status;
+ } else if (id == libwebm::kMkvDocTypeVersion) {
+ m_docTypeVersion = UnserializeUInt(pReader, pos, size);
+
+ if (m_docTypeVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvDocTypeReadVersion) {
+ m_docTypeReadVersion = UnserializeUInt(pReader, pos, size);
+
+ if (m_docTypeReadVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ pos += size;
+ }
+
+ if (pos != end)
+ return E_FILE_FORMAT_INVALID;
+
+ // Make sure DocType, DocTypeReadVersion, and DocTypeVersion are valid.
+ if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
+ if (m_maxIdLength <= 0 || m_maxIdLength > 4 || m_maxSizeLength <= 0 ||
+ m_maxSizeLength > 8)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+}
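+
+ // A minimal usage sketch (not part of the parser itself): callers are
+ // expected to drive the functions in this file roughly as follows,
+ // assuming the file-backed MkvReader helper declared in mkvreader.h.
+ //
+ //   mkvparser::MkvReader reader;
+ //   if (reader.Open("input.webm") == 0) {
+ //     mkvparser::EBMLHeader ebml_header;
+ //     long long pos = 0;
+ //     if (ebml_header.Parse(&reader, pos) == 0) {
+ //       mkvparser::Segment* segment = NULL;
+ //       if (mkvparser::Segment::CreateInstance(&reader, pos, segment) == 0 &&
+ //           segment->Load() == 0) {
+ //         // ... walk clusters via segment->GetFirst() / GetNext() ...
+ //       }
+ //       delete segment;
+ //     }
+ //   }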
+
+Segment::Segment(IMkvReader* pReader, long long elem_start,
+ // long long elem_size,
+ long long start, long long size)
+ : m_pReader(pReader),
+ m_element_start(elem_start),
+ // m_element_size(elem_size),
+ m_start(start),
+ m_size(size),
+ m_pos(start),
+ m_pUnknownSize(0),
+ m_pSeekHead(NULL),
+ m_pInfo(NULL),
+ m_pTracks(NULL),
+ m_pCues(NULL),
+ m_pChapters(NULL),
+ m_pTags(NULL),
+ m_clusters(NULL),
+ m_clusterCount(0),
+ m_clusterPreloadCount(0),
+ m_clusterSize(0) {}
+
+Segment::~Segment() {
+ const long count = m_clusterCount + m_clusterPreloadCount;
+
+ Cluster** i = m_clusters;
+ Cluster** j = m_clusters + count;
+
+ while (i != j) {
+ Cluster* const p = *i++;
+ delete p;
+ }
+
+ delete[] m_clusters;
+
+ delete m_pTracks;
+ delete m_pInfo;
+ delete m_pCues;
+ delete m_pChapters;
+ delete m_pTags;
+ delete m_pSeekHead;
+}
+
+long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
+ Segment*& pSegment) {
+ if (pReader == NULL || pos < 0)
+ return E_PARSE_FAILED;
+
+ pSegment = NULL;
+
+ long long total, available;
+
+ const long status = pReader->Length(&total, &available);
+
+ if (status < 0) // error
+ return status;
+
+ if (available < 0)
+ return -1;
+
+ if ((total >= 0) && (available > total))
+ return -1;
+
+ // I would assume that in practice this loop would execute
+ // exactly once, but we allow for other elements (e.g. Void)
+ // to immediately follow the EBML header. This is fine for
+ // the source filter case (since the entire file is available),
+ // but in the splitter case over a network we should probably
+ // just give up early. We could for example decide only to
+ // execute this loop a maximum of, say, 10 times.
+ // TODO:
+ // There is an implied "give up early" by only parsing up
+ // to the available limit. We do that, but only if the
+ // total file size is unknown. We could decide to always
+ // use what's available as our limit (irrespective of whether
+ // we happen to know the total file length). This would have
+ // as its sense "parse this much of the file before giving up",
+ // which is a slightly different sense from "try to parse up to
+ // 10 EBML elements before giving up".
+
+ for (;;) {
+ if ((total >= 0) && (pos >= total))
+ return E_FILE_FORMAT_INVALID;
+
+ // Read ID
+ long len;
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result) // error, or too few available bytes
+ return result;
+
+ if ((total >= 0) && ((pos + len) > total))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > available)
+ return pos + len;
+
+ const long long idpos = pos;
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume ID
+
+ // Read Size
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result) // error, or too few available bytes
+ return result;
+
+ if ((total >= 0) && ((pos + len) > total))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > available)
+ return pos + len;
+
+ long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return size;
+
+ pos += len; // consume length of size of element
+
+ // Pos now points to start of payload
+
+ // Handle "unknown size" for live streaming of webm files.
+ const long long unknown_size = (1LL << (7 * len)) - 1;
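+ // An EBML size field of len bytes whose value bits are all ones encodes
+ // "unknown size"; e.g. for len == 1 the encoded byte 0xFF decodes to
+ // (1LL << 7) - 1 == 127, which is exactly what ReadUInt returns, so
+ // comparing against unknown_size detects that marker.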
+
+ if (id == libwebm::kMkvSegment) {
+ if (size == unknown_size)
+ size = -1;
+
+ else if (total < 0)
+ size = -1;
+
+ else if ((pos + size) > total)
+ size = -1;
+
+ pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size);
+ if (pSegment == NULL)
+ return E_PARSE_FAILED;
+
+ return 0; // success
+ }
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && ((pos + size) > total))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + size) > available)
+ return pos + size;
+
+ pos += size; // consume payload
+ }
+}
+
+long long Segment::ParseHeaders() {
+ // Outermost (level 0) segment object has been constructed,
+ // and pos designates start of payload. We need to find the
+ // inner (level 1) elements.
+ long long total, available;
+
+ const int status = m_pReader->Length(&total, &available);
+
+ if (status < 0) // error
+ return status;
+
+ if (total > 0 && available > total)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+ if ((segment_stop >= 0 && total >= 0 && segment_stop > total) ||
+ (segment_stop >= 0 && m_pos > segment_stop)) {
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ for (;;) {
+ if ((total >= 0) && (m_pos >= total))
+ break;
+
+ if ((segment_stop >= 0) && (m_pos >= segment_stop))
+ break;
+
+ long long pos = m_pos;
+ const long long element_start = pos;
+
+ // Avoid rolling over pos when very close to LLONG_MAX.
+ unsigned long long rollover_check = pos + 1ULL;
+ if (rollover_check > LLONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 1) > available)
+ return (pos + 1);
+
+ long len;
+ long long result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return result;
+
+ if (result > 0) {
+ // MkvReader doesn't have enough data to satisfy this read attempt.
+ return (pos + 1);
+ }
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > available)
+ return pos + len;
+
+ const long long idpos = pos;
+ const long long id = ReadID(m_pReader, idpos, len);
+
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (id == libwebm::kMkvCluster)
+ break;
+
+ pos += len; // consume ID
+
+ if ((pos + 1) > available)
+ return (pos + 1);
+
+ // Read Size
+ result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return result;
+
+ if (result > 0) {
+ // MkvReader doesn't have enough data to satisfy this read attempt.
+ return (pos + 1);
+ }
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > available)
+ return pos + len;
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+
+ if (size < 0 || len < 1 || len > 8) {
+ // TODO(tomfinegan): ReadUInt should return an error when len < 1 or
+ // len > 8, instead of checking this _everywhere_.
+ return size;
+ }
+
+ pos += len; // consume length of size of element
+
+ // Avoid rolling over pos when very close to LLONG_MAX.
+ rollover_check = static_cast<unsigned long long>(pos) + size;
+ if (rollover_check > LLONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long element_size = size + pos - element_start;
+
+ // Pos now points to start of payload
+
+ if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ // We read EBML elements either in full or not at all.
+
+ if ((pos + size) > available)
+ return pos + size;
+
+ if (id == libwebm::kMkvInfo) {
+ if (m_pInfo)
+ return E_FILE_FORMAT_INVALID;
+
+ m_pInfo = new (std::nothrow)
+ SegmentInfo(this, pos, size, element_start, element_size);
+
+ if (m_pInfo == NULL)
+ return -1;
+
+ const long status = m_pInfo->Parse();
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvTracks) {
+ if (m_pTracks)
+ return E_FILE_FORMAT_INVALID;
+
+ m_pTracks = new (std::nothrow)
+ Tracks(this, pos, size, element_start, element_size);
+
+ if (m_pTracks == NULL)
+ return -1;
+
+ const long status = m_pTracks->Parse();
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvCues) {
+ if (m_pCues == NULL) {
+ m_pCues = new (std::nothrow)
+ Cues(this, pos, size, element_start, element_size);
+
+ if (m_pCues == NULL)
+ return -1;
+ }
+ } else if (id == libwebm::kMkvSeekHead) {
+ if (m_pSeekHead == NULL) {
+ m_pSeekHead = new (std::nothrow)
+ SeekHead(this, pos, size, element_start, element_size);
+
+ if (m_pSeekHead == NULL)
+ return -1;
+
+ const long status = m_pSeekHead->Parse();
+
+ if (status)
+ return status;
+ }
+ } else if (id == libwebm::kMkvChapters) {
+ if (m_pChapters == NULL) {
+ m_pChapters = new (std::nothrow)
+ Chapters(this, pos, size, element_start, element_size);
+
+ if (m_pChapters == NULL)
+ return -1;
+
+ const long status = m_pChapters->Parse();
+
+ if (status)
+ return status;
+ }
+ } else if (id == libwebm::kMkvTags) {
+ if (m_pTags == NULL) {
+ m_pTags = new (std::nothrow)
+ Tags(this, pos, size, element_start, element_size);
+
+ if (m_pTags == NULL)
+ return -1;
+
+ const long status = m_pTags->Parse();
+
+ if (status)
+ return status;
+ }
+ }
+
+ m_pos = pos + size; // consume payload
+ }
+
+ if (segment_stop >= 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if (m_pInfo == NULL) // TODO: liberalize this behavior
+ return E_FILE_FORMAT_INVALID;
+
+ if (m_pTracks == NULL)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0; // success
+}
+
+long Segment::LoadCluster(long long& pos, long& len) {
+ for (;;) {
+ const long result = DoLoadCluster(pos, len);
+
+ if (result <= 1)
+ return result;
+ }
+}
+
+long Segment::DoLoadCluster(long long& pos, long& len) {
+ if (m_pos < 0)
+ return DoLoadClusterUnknownSize(pos, len);
+
+ long long total, avail;
+
+ long status = m_pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ if (total >= 0 && avail > total)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+ long long cluster_off = -1; // offset relative to start of segment
+ long long cluster_size = -1; // size of cluster payload
+
+ for (;;) {
+ if ((total >= 0) && (m_pos >= total))
+ return 1; // no more clusters
+
+ if ((segment_stop >= 0) && (m_pos >= segment_stop))
+ return 1; // no more clusters
+
+ pos = m_pos;
+
+ // Read ID
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long idpos = pos;
+ const long long id = ReadID(m_pReader, idpos, len);
+
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume ID
+
+ // Read Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume length of size of element
+
+ // pos now points to start of payload
+
+ if (size == 0) {
+ // Missing element payload: move on.
+ m_pos = pos;
+ continue;
+ }
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if ((segment_stop >= 0) && (size != unknown_size) &&
+ ((pos + size) > segment_stop)) {
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (id == libwebm::kMkvCues) {
+ if (size == unknown_size) {
+ // Cues element of unknown size: Not supported.
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (m_pCues == NULL) {
+ const long long element_size = (pos - idpos) + size;
+
+ m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+ if (m_pCues == NULL)
+ return -1;
+ }
+
+ m_pos = pos + size; // consume payload
+ continue;
+ }
+
+ if (id != libwebm::kMkvCluster) {
+ // Besides the Segment, Libwebm allows only cluster elements of unknown
+ // size. Fail the parse upon encountering a non-cluster element reporting
+ // unknown size.
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ m_pos = pos + size; // consume payload
+ continue;
+ }
+
+ // We have a cluster.
+
+ cluster_off = idpos - m_start; // relative pos
+
+ if (size != unknown_size)
+ cluster_size = size;
+
+ break;
+ }
+
+ if (cluster_off < 0) {
+ // No cluster, die.
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ long long pos_;
+ long len_;
+
+ status = Cluster::HasBlockEntries(this, cluster_off, pos_, len_);
+
+ if (status < 0) { // error, or underflow
+ pos = pos_;
+ len = len_;
+
+ return status;
+ }
+
+ // status == 0 means "no block entries found"
+ // status > 0 means "found at least one block entry"
+
+ // TODO:
+ // The issue here is that the segment increments its own
+ // pos ptr past the most recent cluster parsed, and then
+ // starts from there to parse the next cluster. If we
+ // don't know the size of the current cluster, then we
+ // must parse its payload (as we do below), looking
+ // for the cluster (or cues) ID to terminate the parse.
+ // This isn't really what we want: rather, we need
+ // a way to create the curr cluster object immediately.
+ // The pity is that cluster::parse can determine its own
+ // boundary, and we largely duplicate that same logic here.
+ //
+ // Maybe we need to get rid of our look-ahead preloading
+ // in source::parse???
+ //
+ // As we're parsing the blocks in the curr cluster
+ // (in cluster::parse), we should have some way to signal
+ // to the segment that we have determined the boundary,
+ // so it can adjust its own segment::m_pos member.
+ //
+ // The problem is that we're asserting in asyncreadinit,
+ // because we adjust the pos down to the curr seek pos,
+ // and the resulting adjusted len is > 2GB. I'm suspicious
+ // that this is even correct, but even if it is, we can't
+ // be loading that much data in the cache anyway.
+
+ const long idx = m_clusterCount;
+
+ if (m_clusterPreloadCount > 0) {
+ if (idx >= m_clusterSize)
+ return E_FILE_FORMAT_INVALID;
+
+ Cluster* const pCluster = m_clusters[idx];
+ if (pCluster == NULL || pCluster->m_index >= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long off = pCluster->GetPosition();
+ if (off < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (off == cluster_off) { // preloaded already
+ if (status == 0) // no entries found
+ return E_FILE_FORMAT_INVALID;
+
+ if (cluster_size >= 0)
+ pos += cluster_size;
+ else {
+ const long long element_size = pCluster->GetElementSize();
+
+ if (element_size <= 0)
+ return E_FILE_FORMAT_INVALID; // TODO: handle this case
+
+ pos = pCluster->m_element_start + element_size;
+ }
+
+ pCluster->m_index = idx; // move from preloaded to loaded
+ ++m_clusterCount;
+ --m_clusterPreloadCount;
+
+ m_pos = pos; // consume payload
+ if (segment_stop >= 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0; // success
+ }
+ }
+
+ if (status == 0) { // no entries found
+ if (cluster_size >= 0)
+ pos += cluster_size;
+
+ if ((total >= 0) && (pos >= total)) {
+ m_pos = total;
+ return 1; // no more clusters
+ }
+
+ if ((segment_stop >= 0) && (pos >= segment_stop)) {
+ m_pos = segment_stop;
+ return 1; // no more clusters
+ }
+
+ m_pos = pos;
+ return 2; // try again
+ }
+
+ // status > 0 means we have an entry
+
+ Cluster* const pCluster = Cluster::Create(this, idx, cluster_off);
+ if (pCluster == NULL)
+ return -1;
+
+ if (!AppendCluster(pCluster)) {
+ delete pCluster;
+ return -1;
+ }
+
+ if (cluster_size >= 0) {
+ pos += cluster_size;
+
+ m_pos = pos;
+
+ if (segment_stop > 0 && m_pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+ }
+
+ m_pUnknownSize = pCluster;
+ m_pos = -pos;
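+ // A negative m_pos records that the cluster just appended has unknown
+ // size: DoLoadCluster checks for this and routes to
+ // DoLoadClusterUnknownSize until the cluster's boundary is determined.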
+
+ return 0; // partial success, since we have a new cluster
+
+ // status == 0 means "no block entries found"
+ // pos designates start of payload
+ // m_pos has NOT been adjusted yet (in case we need to come back here)
+}
+
+long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) {
+ if (m_pos >= 0 || m_pUnknownSize == NULL)
+ return E_PARSE_FAILED;
+
+ const long status = m_pUnknownSize->Parse(pos, len);
+
+ if (status < 0) // error or underflow
+ return status;
+
+ if (status == 0) // parsed a block
+ return 2; // continue parsing
+
+ const long long start = m_pUnknownSize->m_element_start;
+ const long long size = m_pUnknownSize->GetElementSize();
+
+ if (size < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos = start + size;
+ m_pos = pos;
+
+ m_pUnknownSize = 0;
+
+ return 2; // continue parsing
+}
+
+bool Segment::AppendCluster(Cluster* pCluster) {
+ if (pCluster == NULL || pCluster->m_index < 0)
+ return false;
+
+ const long count = m_clusterCount + m_clusterPreloadCount;
+
+ long& size = m_clusterSize;
+ const long idx = pCluster->m_index;
+
+ if (size < count || idx != m_clusterCount)
+ return false;
+
+ if (count >= size) {
+ const long n = (size <= 0) ? 2048 : 2 * size;
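+ // Grow the cluster array geometrically (2048 entries to start, then
+ // doubling) so that repeated appends remain amortized O(1).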
+
+ Cluster** const qq = new (std::nothrow) Cluster*[n];
+ if (qq == NULL)
+ return false;
+
+ Cluster** q = qq;
+ Cluster** p = m_clusters;
+ Cluster** const pp = p + count;
+
+ while (p != pp)
+ *q++ = *p++;
+
+ delete[] m_clusters;
+
+ m_clusters = qq;
+ size = n;
+ }
+
+ if (m_clusterPreloadCount > 0) {
+ Cluster** const p = m_clusters + m_clusterCount;
+ if (*p == NULL || (*p)->m_index >= 0)
+ return false;
+
+ Cluster** q = p + m_clusterPreloadCount;
+ if (q >= (m_clusters + size))
+ return false;
+
+ for (;;) {
+ Cluster** const qq = q - 1;
+ if ((*qq)->m_index >= 0)
+ return false;
+
+ *q = *qq;
+ q = qq;
+
+ if (q == p)
+ break;
+ }
+ }
+
+ m_clusters[idx] = pCluster;
+ ++m_clusterCount;
+ return true;
+}
+
+bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) {
+ if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount)
+ return false;
+
+ const long count = m_clusterCount + m_clusterPreloadCount;
+
+ long& size = m_clusterSize;
+ if (size < count)
+ return false;
+
+ if (count >= size) {
+ const long n = (size <= 0) ? 2048 : 2 * size;
+
+ Cluster** const qq = new (std::nothrow) Cluster*[n];
+ if (qq == NULL)
+ return false;
+ Cluster** q = qq;
+
+ Cluster** p = m_clusters;
+ Cluster** const pp = p + count;
+
+ while (p != pp)
+ *q++ = *p++;
+
+ delete[] m_clusters;
+
+ m_clusters = qq;
+ size = n;
+ }
+
+ if (m_clusters == NULL)
+ return false;
+
+ Cluster** const p = m_clusters + idx;
+
+ Cluster** q = m_clusters + count;
+ if (q < p || q >= (m_clusters + size))
+ return false;
+
+ while (q > p) {
+ Cluster** const qq = q - 1;
+
+ if ((*qq)->m_index >= 0)
+ return false;
+
+ *q = *qq;
+ q = qq;
+ }
+
+ m_clusters[idx] = pCluster;
+ ++m_clusterPreloadCount;
+ return true;
+}
+
+long Segment::Load() {
+ if (m_clusters != NULL || m_clusterSize != 0 || m_clusterCount != 0)
+ return E_PARSE_FAILED;
+
+ // Outermost (level 0) segment object has been constructed,
+ // and pos designates start of payload. We need to find the
+ // inner (level 1) elements.
+
+ const long long header_status = ParseHeaders();
+
+ if (header_status < 0) // error
+ return static_cast<long>(header_status);
+
+ if (header_status > 0) // underflow
+ return E_BUFFER_NOT_FULL;
+
+ if (m_pInfo == NULL || m_pTracks == NULL)
+ return E_FILE_FORMAT_INVALID;
+
+ for (;;) {
+ const long status = LoadCluster();
+
+ if (status < 0) // error
+ return status;
+
+ if (status >= 1) // no more clusters
+ return 0;
+ }
+}
+
+SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {}
+
+SeekHead::SeekHead(Segment* pSegment, long long start, long long size_,
+ long long element_start, long long element_size)
+ : m_pSegment(pSegment),
+ m_start(start),
+ m_size(size_),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ m_entries(0),
+ m_entry_count(0),
+ m_void_elements(0),
+ m_void_element_count(0) {}
+
+SeekHead::~SeekHead() {
+ delete[] m_entries;
+ delete[] m_void_elements;
+}
+
+long SeekHead::Parse() {
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long pos = m_start;
+ const long long stop = m_start + m_size;
+
+ // first count the seek head entries
+
+ int entry_count = 0;
+ int void_element_count = 0;
+
+ while (pos < stop) {
+ long long id, size;
+
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvSeek)
+ ++entry_count;
+ else if (id == libwebm::kMkvVoid)
+ ++void_element_count;
+
+ pos += size; // consume payload
+
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if (entry_count > 0) {
+ m_entries = new (std::nothrow) Entry[entry_count];
+
+ if (m_entries == NULL)
+ return -1;
+ }
+
+ if (void_element_count > 0) {
+ m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+
+ if (m_void_elements == NULL)
+ return -1;
+ }
+
+ // now parse the entries and void elements
+
+ Entry* pEntry = m_entries;
+ VoidElement* pVoidElement = m_void_elements;
+
+ pos = m_start;
+
+ while (pos < stop) {
+ const long long idpos = pos;
+
+ long long id, size;
+
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvSeek && entry_count > 0) {
+ if (ParseEntry(pReader, pos, size, pEntry)) {
+ Entry& e = *pEntry++;
+
+ e.element_start = idpos;
+ e.element_size = (pos + size) - idpos;
+ }
+ } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
+ VoidElement& e = *pVoidElement++;
+
+ e.element_start = idpos;
+ e.element_size = (pos + size) - idpos;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
+ assert(count_ >= 0);
+ assert(count_ <= entry_count);
+
+ m_entry_count = static_cast<int>(count_);
+
+ count_ = ptrdiff_t(pVoidElement - m_void_elements);
+ assert(count_ >= 0);
+ assert(count_ <= void_element_count);
+
+ m_void_element_count = static_cast<int>(count_);
+
+ return 0;
+}
+
+int SeekHead::GetCount() const { return m_entry_count; }
+
+const SeekHead::Entry* SeekHead::GetEntry(int idx) const {
+ if (idx < 0)
+ return 0;
+
+ if (idx >= m_entry_count)
+ return 0;
+
+ return m_entries + idx;
+}
+
+int SeekHead::GetVoidElementCount() const { return m_void_element_count; }
+
+const SeekHead::VoidElement* SeekHead::GetVoidElement(int idx) const {
+ if (idx < 0)
+ return 0;
+
+ if (idx >= m_void_element_count)
+ return 0;
+
+ return m_void_elements + idx;
+}
+
+long Segment::ParseCues(long long off, long long& pos, long& len) {
+ if (m_pCues)
+ return 0; // success
+
+ if (off < 0)
+ return -1;
+
+ long long total, avail;
+
+ const int status = m_pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
+ pos = m_start + off;
+
+ if ((total < 0) || (pos >= total))
+ return 1; // don't bother parsing cues
+
+ const long long element_start = pos;
+ const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // underflow (weird)
+ {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long idpos = pos;
+
+ const long long id = ReadID(m_pReader, idpos, len);
+
+ if (id != libwebm::kMkvCues)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume ID
+ assert((segment_stop < 0) || (pos <= segment_stop));
+
+ // Read Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // underflow (weird)
+ {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ if (size == 0) // weird, although technically not illegal
+ return 1; // done
+
+ pos += len; // consume length of size of element
+ assert((segment_stop < 0) || (pos <= segment_stop));
+
+ // Pos now points to start of payload
+
+ const long long element_stop = pos + size;
+
+ if ((segment_stop >= 0) && (element_stop > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && (element_stop > total))
+ return 1; // don't bother parsing anymore
+
+ len = static_cast<long>(size);
+
+ if (element_stop > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long element_size = element_stop - element_start;
+
+ m_pCues =
+ new (std::nothrow) Cues(this, pos, size, element_start, element_size);
+ if (m_pCues == NULL)
+ return -1;
+
+ return 0; // success
+}
+
+bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
+ Entry* pEntry) {
+ if (size_ <= 0)
+ return false;
+
+ long long pos = start;
+ const long long stop = start + size_;
+
+ long len;
+
+ // parse the container for the level-1 element ID
+
+ const long long seekIdId = ReadID(pReader, pos, len);
+ if (seekIdId < 0)
+ return false;
+
+ if (seekIdId != libwebm::kMkvSeekID)
+ return false;
+
+ if ((pos + len) > stop)
+ return false;
+
+ pos += len; // consume SeekID id
+
+ const long long seekIdSize = ReadUInt(pReader, pos, len);
+
+ if (seekIdSize <= 0)
+ return false;
+
+ if ((pos + len) > stop)
+ return false;
+
+ pos += len; // consume size of field
+
+ if ((pos + seekIdSize) > stop)
+ return false;
+
+ pEntry->id = ReadID(pReader, pos, len); // payload
+
+ if (pEntry->id <= 0)
+ return false;
+
+ if (len != seekIdSize)
+ return false;
+
+ pos += seekIdSize; // consume SeekID payload
+
+ const long long seekPosId = ReadID(pReader, pos, len);
+
+ if (seekPosId != libwebm::kMkvSeekPosition)
+ return false;
+
+ if ((pos + len) > stop)
+ return false;
+
+ pos += len; // consume id
+
+ const long long seekPosSize = ReadUInt(pReader, pos, len);
+
+ if (seekPosSize <= 0)
+ return false;
+
+ if ((pos + len) > stop)
+ return false;
+
+ pos += len; // consume size
+
+ if ((pos + seekPosSize) > stop)
+ return false;
+
+ pEntry->pos = UnserializeUInt(pReader, pos, seekPosSize);
+
+ if (pEntry->pos < 0)
+ return false;
+
+ pos += seekPosSize; // consume payload
+
+ if (pos != stop)
+ return false;
+
+ return true;
+}
+
+Cues::Cues(Segment* pSegment, long long start_, long long size_,
+ long long element_start, long long element_size)
+ : m_pSegment(pSegment),
+ m_start(start_),
+ m_size(size_),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ m_cue_points(NULL),
+ m_count(0),
+ m_preload_count(0),
+ m_pos(start_) {}
+
+Cues::~Cues() {
+ const long n = m_count + m_preload_count;
+
+ CuePoint** p = m_cue_points;
+ CuePoint** const q = p + n;
+
+ while (p != q) {
+ CuePoint* const pCP = *p++;
+ assert(pCP);
+
+ delete pCP;
+ }
+
+ delete[] m_cue_points;
+}
+
+long Cues::GetCount() const {
+ if (m_cue_points == NULL)
+ return -1;
+
+ return m_count; // TODO: really ignore preload count?
+}
+
+bool Cues::DoneParsing() const {
+ const long long stop = m_start + m_size;
+ return (m_pos >= stop);
+}
+
+bool Cues::Init() const {
+ if (m_cue_points)
+ return true;
+
+ if (m_count != 0 || m_preload_count != 0)
+ return false;
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ const long long stop = m_start + m_size;
+ long long pos = m_start;
+
+ long cue_points_size = 0;
+
+ while (pos < stop) {
+ const long long idpos = pos;
+
+ long len;
+
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (pos + len) > stop) {
+ return false;
+ }
+
+ pos += len; // consume ID
+
+ const long long size = ReadUInt(pReader, pos, len);
+ if (size < 0 || (pos + len > stop)) {
+ return false;
+ }
+
+ pos += len; // consume Size field
+ if (pos + size > stop) {
+ return false;
+ }
+
+ if (id == libwebm::kMkvCuePoint) {
+ if (!PreloadCuePoint(cue_points_size, idpos))
+ return false;
+ }
+
+ pos += size; // skip payload
+ }
+ return true;
+}
+
+bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const {
+ if (m_count != 0)
+ return false;
+
+ if (m_preload_count >= cue_points_size) {
+ const long n = (cue_points_size <= 0) ? 2048 : 2 * cue_points_size;
+
+ CuePoint** const qq = new (std::nothrow) CuePoint*[n];
+ if (qq == NULL)
+ return false;
+
+ CuePoint** q = qq; // beginning of target
+
+ CuePoint** p = m_cue_points; // beginning of source
+ CuePoint** const pp = p + m_preload_count; // end of source
+
+ while (p != pp)
+ *q++ = *p++;
+
+ delete[] m_cue_points;
+
+ m_cue_points = qq;
+ cue_points_size = n;
+ }
+
+ CuePoint* const pCP = new (std::nothrow) CuePoint(m_preload_count, pos);
+ if (pCP == NULL)
+ return false;
+
+ m_cue_points[m_preload_count++] = pCP;
+ return true;
+}
+
+bool Cues::LoadCuePoint() const {
+ const long long stop = m_start + m_size;
+
+ if (m_pos >= stop)
+ return false; // nothing else to do
+
+ if (!Init()) {
+ m_pos = stop;
+ return false;
+ }
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ while (m_pos < stop) {
+ const long long idpos = m_pos;
+
+ long len;
+
+ const long long id = ReadID(pReader, m_pos, len);
+ if (id < 0 || (m_pos + len) > stop)
+ return false;
+
+ m_pos += len; // consume ID
+
+ const long long size = ReadUInt(pReader, m_pos, len);
+ if (size < 0 || (m_pos + len) > stop)
+ return false;
+
+ m_pos += len; // consume Size field
+ if ((m_pos + size) > stop)
+ return false;
+
+ if (id != libwebm::kMkvCuePoint) {
+ m_pos += size; // consume payload
+ if (m_pos > stop)
+ return false;
+
+ continue;
+ }
+
+ if (m_preload_count < 1)
+ return false;
+
+ CuePoint* const pCP = m_cue_points[m_count];
+ if (!pCP || (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos)))
+ return false;
+
+ if (!pCP->Load(pReader)) {
+ m_pos = stop;
+ return false;
+ }
+ ++m_count;
+ --m_preload_count;
+
+ m_pos += size; // consume payload
+ if (m_pos > stop)
+ return false;
+
+ return true; // yes, we loaded a cue point
+ }
+
+ return false; // no, we did not load a cue point
+}
+
+bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP,
+ const CuePoint::TrackPosition*& pTP) const {
+ if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0)
+ return false;
+
+ CuePoint** const ii = m_cue_points;
+ CuePoint** i = ii;
+
+ CuePoint** const jj = ii + m_count;
+ CuePoint** j = jj;
+
+ pCP = *i;
+ if (pCP == NULL)
+ return false;
+
+ if (time_ns <= pCP->GetTime(m_pSegment)) {
+ pTP = pCP->Find(pTrack);
+ return (pTP != NULL);
+ }
+
+ while (i < j) {
+ // INVARIANT:
+ //[ii, i) <= time_ns
+ //[i, j) ?
+ //[j, jj) > time_ns
+
+ CuePoint** const k = i + (j - i) / 2;
+ if (k >= jj)
+ return false;
+
+ CuePoint* const pCP = *k;
+ if (pCP == NULL)
+ return false;
+
+ const long long t = pCP->GetTime(m_pSegment);
+
+ if (t <= time_ns)
+ i = k + 1;
+ else
+ j = k;
+
+ if (i > j)
+ return false;
+ }
+
+ if (i != j || i > jj || i <= ii)
+ return false;
+
+ pCP = *--i;
+
+ if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns)
+ return false;
+
+ // TODO: here and elsewhere, it's probably not correct to search
+ // for the cue point with this time, and then search for a matching
+ // track. In principle, the matching track could be on some earlier
+ // cue point, and with our current algorithm, we'd miss it. To make
+ // this bullet-proof, we'd need to create a secondary structure,
+ // with a list of cue points that apply to a track, and then search
+ // that track-based structure for a matching cue point.
+
+ pTP = pCP->Find(pTrack);
+ return (pTP != NULL);
+}
+
+const CuePoint* Cues::GetFirst() const {
+ if (m_cue_points == NULL || m_count == 0)
+ return NULL;
+
+ CuePoint* const* const pp = m_cue_points;
+ if (pp == NULL)
+ return NULL;
+
+ CuePoint* const pCP = pp[0];
+ if (pCP == NULL || pCP->GetTimeCode() < 0)
+ return NULL;
+
+ return pCP;
+}
+
+const CuePoint* Cues::GetLast() const {
+ if (m_cue_points == NULL || m_count <= 0)
+ return NULL;
+
+ const long index = m_count - 1;
+
+ CuePoint* const* const pp = m_cue_points;
+ if (pp == NULL)
+ return NULL;
+
+ CuePoint* const pCP = pp[index];
+ if (pCP == NULL || pCP->GetTimeCode() < 0)
+ return NULL;
+
+ return pCP;
+}
+
+const CuePoint* Cues::GetNext(const CuePoint* pCurr) const {
+ if (pCurr == NULL || pCurr->GetTimeCode() < 0 || m_cue_points == NULL ||
+ m_count < 1) {
+ return NULL;
+ }
+
+ long index = pCurr->m_index;
+ if (index >= m_count)
+ return NULL;
+
+ CuePoint* const* const pp = m_cue_points;
+ if (pp == NULL || pp[index] != pCurr)
+ return NULL;
+
+ ++index;
+
+ if (index >= m_count)
+ return NULL;
+
+ CuePoint* const pNext = pp[index];
+
+ if (pNext == NULL || pNext->GetTimeCode() < 0)
+ return NULL;
+
+ return pNext;
+}
+
+const BlockEntry* Cues::GetBlock(const CuePoint* pCP,
+ const CuePoint::TrackPosition* pTP) const {
+ if (pCP == NULL || pTP == NULL)
+ return NULL;
+
+ return m_pSegment->GetBlock(*pCP, *pTP);
+}
+
+const BlockEntry* Segment::GetBlock(const CuePoint& cp,
+ const CuePoint::TrackPosition& tp) {
+ Cluster** const ii = m_clusters;
+ Cluster** i = ii;
+
+ const long count = m_clusterCount + m_clusterPreloadCount;
+
+ Cluster** const jj = ii + count;
+ Cluster** j = jj;
+
+ while (i < j) {
+ // INVARIANT:
+ //[ii, i) < pTP->m_pos
+ //[i, j) ?
+ //[j, jj) > pTP->m_pos
+
+ Cluster** const k = i + (j - i) / 2;
+ assert(k < jj);
+
+ Cluster* const pCluster = *k;
+ assert(pCluster);
+
+ // const long long pos_ = pCluster->m_pos;
+ // assert(pos_);
+ // const long long pos = pos_ * ((pos_ < 0) ? -1 : 1);
+
+ const long long pos = pCluster->GetPosition();
+ assert(pos >= 0);
+
+ if (pos < tp.m_pos)
+ i = k + 1;
+ else if (pos > tp.m_pos)
+ j = k;
+ else
+ return pCluster->GetEntry(cp, tp);
+ }
+
+ assert(i == j);
+ // assert(Cluster::HasBlockEntries(this, tp.m_pos));
+
+ Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos); //, -1);
+ if (pCluster == NULL)
+ return NULL;
+
+ const ptrdiff_t idx = i - m_clusters;
+
+ if (!PreloadCluster(pCluster, idx)) {
+ delete pCluster;
+ return NULL;
+ }
+ assert(m_clusters);
+ assert(m_clusterPreloadCount > 0);
+ assert(m_clusters[idx] == pCluster);
+
+ return pCluster->GetEntry(cp, tp);
+}
+
+const Cluster* Segment::FindOrPreloadCluster(long long requested_pos) {
+ if (requested_pos < 0)
+ return 0;
+
+ Cluster** const ii = m_clusters;
+ Cluster** i = ii;
+
+ const long count = m_clusterCount + m_clusterPreloadCount;
+
+ Cluster** const jj = ii + count;
+ Cluster** j = jj;
+
+ while (i < j) {
+ // INVARIANT:
+ //[ii, i) < pTP->m_pos
+ //[i, j) ?
+ //[j, jj) > pTP->m_pos
+
+ Cluster** const k = i + (j - i) / 2;
+ assert(k < jj);
+
+ Cluster* const pCluster = *k;
+ assert(pCluster);
+
+ // const long long pos_ = pCluster->m_pos;
+ // assert(pos_);
+ // const long long pos = pos_ * ((pos_ < 0) ? -1 : 1);
+
+ const long long pos = pCluster->GetPosition();
+ assert(pos >= 0);
+
+ if (pos < requested_pos)
+ i = k + 1;
+ else if (pos > requested_pos)
+ j = k;
+ else
+ return pCluster;
+ }
+
+ assert(i == j);
+ // assert(Cluster::HasBlockEntries(this, tp.m_pos));
+
+ Cluster* const pCluster = Cluster::Create(this, -1, requested_pos);
+ if (pCluster == NULL)
+ return NULL;
+
+ const ptrdiff_t idx = i - m_clusters;
+
+ if (!PreloadCluster(pCluster, idx)) {
+ delete pCluster;
+ return NULL;
+ }
+ assert(m_clusters);
+ assert(m_clusterPreloadCount > 0);
+ assert(m_clusters[idx] == pCluster);
+
+ return pCluster;
+}
+
+CuePoint::CuePoint(long idx, long long pos)
+ : m_element_start(0),
+ m_element_size(0),
+ m_index(idx),
+ m_timecode(-1 * pos),
+ m_track_positions(NULL),
+ m_track_positions_count(0) {
+ assert(pos > 0);
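+ // Until Load() runs, m_timecode holds the negated absolute position of
+ // this CuePoint element, so a negative timecode means "not yet loaded"
+ // (see CuePoint::Load and Cues::LoadCuePoint).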
+}
+
+CuePoint::~CuePoint() { delete[] m_track_positions; }
+
+bool CuePoint::Load(IMkvReader* pReader) {
+ // odbgstream os;
+ // os << "CuePoint::Load(begin): timecode=" << m_timecode << endl;
+
+ if (m_timecode >= 0) // already loaded
+ return true;
+
+ assert(m_track_positions == NULL);
+ assert(m_track_positions_count == 0);
+
+ long long pos_ = -m_timecode;
+ const long long element_start = pos_;
+
+ long long stop;
+
+ {
+ long len;
+
+ const long long id = ReadID(pReader, pos_, len);
+ if (id != libwebm::kMkvCuePoint)
+ return false;
+
+ pos_ += len; // consume ID
+
+ const long long size = ReadUInt(pReader, pos_, len);
+ assert(size >= 0);
+
+ pos_ += len; // consume Size field
+ // pos_ now points to start of payload
+
+ stop = pos_ + size;
+ }
+
+ const long long element_size = stop - element_start;
+
+ long long pos = pos_;
+
+ // First count number of track positions
+
+ while (pos < stop) {
+ long len;
+
+ const long long id = ReadID(pReader, pos, len);
+ if ((id < 0) || (pos + len > stop)) {
+ return false;
+ }
+
+ pos += len; // consume ID
+
+ const long long size = ReadUInt(pReader, pos, len);
+ if ((size < 0) || (pos + len > stop)) {
+ return false;
+ }
+
+ pos += len; // consume Size field
+ if ((pos + size) > stop) {
+ return false;
+ }
+
+ if (id == libwebm::kMkvCueTime)
+ m_timecode = UnserializeUInt(pReader, pos, size);
+
+ else if (id == libwebm::kMkvCueTrackPositions)
+ ++m_track_positions_count;
+
+ pos += size; // consume payload
+ }
+
+ if (m_timecode < 0 || m_track_positions_count <= 0) {
+ return false;
+ }
+
+ // os << "CuePoint::Load(cont'd): idpos=" << idpos
+ // << " timecode=" << m_timecode
+ // << endl;
+
+ m_track_positions = new (std::nothrow) TrackPosition[m_track_positions_count];
+ if (m_track_positions == NULL)
+ return false;
+
+ // Now parse track positions
+
+ TrackPosition* p = m_track_positions;
+ pos = pos_;
+
+ while (pos < stop) {
+ long len;
+
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (pos + len) > stop)
+ return false;
+
+ pos += len; // consume ID
+
+ const long long size = ReadUInt(pReader, pos, len);
+ assert(size >= 0);
+ assert((pos + len) <= stop);
+
+ pos += len; // consume Size field
+ assert((pos + size) <= stop);
+
+ if (id == libwebm::kMkvCueTrackPositions) {
+ TrackPosition& tp = *p++;
+ if (!tp.Parse(pReader, pos, size)) {
+ return false;
+ }
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return false;
+ }
+
+ assert(size_t(p - m_track_positions) == m_track_positions_count);
+
+ m_element_start = element_start;
+ m_element_size = element_size;
+
+ return true;
+}
+
+bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_,
+ long long size_) {
+ const long long stop = start_ + size_;
+ long long pos = start_;
+
+ m_track = -1;
+ m_pos = -1;
+ m_block = 1; // default
+
+ while (pos < stop) {
+ long len;
+
+ const long long id = ReadID(pReader, pos, len);
+ if ((id < 0) || ((pos + len) > stop)) {
+ return false;
+ }
+
+ pos += len; // consume ID
+
+ const long long size = ReadUInt(pReader, pos, len);
+ if ((size < 0) || ((pos + len) > stop)) {
+ return false;
+ }
+
+ pos += len; // consume Size field
+ if ((pos + size) > stop) {
+ return false;
+ }
+
+ if (id == libwebm::kMkvCueTrack)
+ m_track = UnserializeUInt(pReader, pos, size);
+ else if (id == libwebm::kMkvCueClusterPosition)
+ m_pos = UnserializeUInt(pReader, pos, size);
+ else if (id == libwebm::kMkvCueBlockNumber)
+ m_block = UnserializeUInt(pReader, pos, size);
+
+ pos += size; // consume payload
+ }
+
+ if ((m_pos < 0) || (m_track <= 0)) {
+ return false;
+ }
+
+ return true;
+}
+
+const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
+ if (pTrack == NULL) {
+ return NULL;
+ }
+
+ const long long n = pTrack->GetNumber();
+
+ const TrackPosition* i = m_track_positions;
+ const TrackPosition* const j = i + m_track_positions_count;
+
+ while (i != j) {
+ const TrackPosition& p = *i++;
+
+ if (p.m_track == n)
+ return &p;
+ }
+
+ return NULL; // no matching track number found
+}
+
+long long CuePoint::GetTimeCode() const { return m_timecode; }
+
+long long CuePoint::GetTime(const Segment* pSegment) const {
+ assert(pSegment);
+ assert(m_timecode >= 0);
+
+ const SegmentInfo* const pInfo = pSegment->GetInfo();
+ assert(pInfo);
+
+ const long long scale = pInfo->GetTimeCodeScale();
+ assert(scale >= 1);
+
+ const long long time = scale * m_timecode;
+
+ return time;
+}
+
+bool Segment::DoneParsing() const {
+ if (m_size < 0) {
+ long long total, avail;
+
+ const int status = m_pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return true; // must assume done
+
+ if (total < 0)
+ return false; // assume live stream
+
+ return (m_pos >= total);
+ }
+
+ const long long stop = m_start + m_size;
+
+ return (m_pos >= stop);
+}
+
+const Cluster* Segment::GetFirst() const {
+ if ((m_clusters == NULL) || (m_clusterCount <= 0))
+ return &m_eos;
+
+ Cluster* const pCluster = m_clusters[0];
+ assert(pCluster);
+
+ return pCluster;
+}
+
+const Cluster* Segment::GetLast() const {
+ if ((m_clusters == NULL) || (m_clusterCount <= 0))
+ return &m_eos;
+
+ const long idx = m_clusterCount - 1;
+
+ Cluster* const pCluster = m_clusters[idx];
+ assert(pCluster);
+
+ return pCluster;
+}
+
+unsigned long Segment::GetCount() const { return m_clusterCount; }
+
+const Cluster* Segment::GetNext(const Cluster* pCurr) {
+ assert(pCurr);
+ assert(pCurr != &m_eos);
+ assert(m_clusters);
+
+ long idx = pCurr->m_index;
+
+ if (idx >= 0) {
+ assert(m_clusterCount > 0);
+ assert(idx < m_clusterCount);
+ assert(pCurr == m_clusters[idx]);
+
+ ++idx;
+
+ if (idx >= m_clusterCount)
+ return &m_eos; // caller will LoadCluster as desired
+
+ Cluster* const pNext = m_clusters[idx];
+ assert(pNext);
+ assert(pNext->m_index >= 0);
+ assert(pNext->m_index == idx);
+
+ return pNext;
+ }
+
+ assert(m_clusterPreloadCount > 0);
+
+ long long pos = pCurr->m_element_start;
+
+ assert(m_size >= 0); // TODO
+ const long long stop = m_start + m_size; // end of segment
+
+ {
+ long len;
+
+ long long result = GetUIntLength(m_pReader, pos, len);
+ assert(result == 0);
+ assert((pos + len) <= stop); // TODO
+ if (result != 0)
+ return NULL;
+
+ const long long id = ReadID(m_pReader, pos, len);
+ if (id != libwebm::kMkvCluster)
+ return NULL;
+
+ pos += len; // consume ID
+
+ // Read Size
+ result = GetUIntLength(m_pReader, pos, len);
+ assert(result == 0); // TODO
+ assert((pos + len) <= stop); // TODO
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+ assert(size > 0); // TODO
+ // assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
+
+ pos += len; // consume length of size of element
+ assert((pos + size) <= stop); // TODO
+
+ // Pos now points to start of payload
+
+ pos += size; // consume payload
+ }
+
+ long long off_next = 0;
+
+ while (pos < stop) {
+ long len;
+
+ long long result = GetUIntLength(m_pReader, pos, len);
+ assert(result == 0);
+ assert((pos + len) <= stop); // TODO
+ if (result != 0)
+ return NULL;
+
+ const long long idpos = pos; // pos of next (potential) cluster
+
+ const long long id = ReadID(m_pReader, idpos, len);
+ if (id < 0)
+ return NULL;
+
+ pos += len; // consume ID
+
+ // Read Size
+ result = GetUIntLength(m_pReader, pos, len);
+ assert(result == 0); // TODO
+ assert((pos + len) <= stop); // TODO
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+ assert(size >= 0); // TODO
+
+ pos += len; // consume length of size of element
+ assert((pos + size) <= stop); // TODO
+
+ // Pos now points to start of payload
+
+ if (size == 0) // weird
+ continue;
+
+ if (id == libwebm::kMkvCluster) {
+ const long long off_next_ = idpos - m_start;
+
+ long long pos_;
+ long len_;
+
+ const long status = Cluster::HasBlockEntries(this, off_next_, pos_, len_);
+
+ assert(status >= 0);
+
+ if (status > 0) {
+ off_next = off_next_;
+ break;
+ }
+ }
+
+ pos += size; // consume payload
+ }
+
+ if (off_next <= 0)
+ return 0;
+
+ Cluster** const ii = m_clusters + m_clusterCount;
+ Cluster** i = ii;
+
+ Cluster** const jj = ii + m_clusterPreloadCount;
+ Cluster** j = jj;
+
+ while (i < j) {
+ // INVARIANT:
+ //[0, i) < pos_next
+ //[i, j) ?
+ //[j, jj) > pos_next
+
+ Cluster** const k = i + (j - i) / 2;
+ assert(k < jj);
+
+ Cluster* const pNext = *k;
+ assert(pNext);
+ assert(pNext->m_index < 0);
+
+ // const long long pos_ = pNext->m_pos;
+ // assert(pos_);
+ // pos = pos_ * ((pos_ < 0) ? -1 : 1);
+
+ pos = pNext->GetPosition();
+
+ if (pos < off_next)
+ i = k + 1;
+ else if (pos > off_next)
+ j = k;
+ else
+ return pNext;
+ }
+
+ assert(i == j);
+
+ Cluster* const pNext = Cluster::Create(this, -1, off_next);
+ if (pNext == NULL)
+ return NULL;
+
+ const ptrdiff_t idx_next = i - m_clusters; // insertion position
+
+ if (!PreloadCluster(pNext, idx_next)) {
+ delete pNext;
+ return NULL;
+ }
+ assert(m_clusters);
+ assert(idx_next < m_clusterSize);
+ assert(m_clusters[idx_next] == pNext);
+
+ return pNext;
+}
+
+long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult,
+ long long& pos, long& len) {
+ assert(pCurr);
+ assert(!pCurr->EOS());
+ assert(m_clusters);
+
+ pResult = 0;
+
+ if (pCurr->m_index >= 0) { // loaded (not merely preloaded)
+ assert(m_clusters[pCurr->m_index] == pCurr);
+
+ const long next_idx = pCurr->m_index + 1;
+
+ if (next_idx < m_clusterCount) {
+ pResult = m_clusters[next_idx];
+ return 0; // success
+ }
+
+ // curr cluster is last among loaded
+
+ const long result = LoadCluster(pos, len);
+
+ if (result < 0) // error or underflow
+ return result;
+
+ if (result > 0) // no more clusters
+ {
+ // pResult = &m_eos;
+ return 1;
+ }
+
+ pResult = GetLast();
+ return 0; // success
+ }
+
+ assert(m_pos > 0);
+
+ long long total, avail;
+
+ long status = m_pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
+ const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+ // interrogate curr cluster
+
+ pos = pCurr->m_element_start;
+
+ if (pCurr->m_element_size >= 0)
+ pos += pCurr->m_element_size;
+ else {
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadUInt(m_pReader, pos, len);
+
+ if (id != libwebm::kMkvCluster)
+ return -1;
+
+ pos += len; // consume ID
+
+ // Read Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume size field
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size) // TODO: should never happen
+ return E_FILE_FORMAT_INVALID; // TODO: resolve this
+
+ // assert((pCurr->m_size <= 0) || (pCurr->m_size == size));
+
+ if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ // Pos now points to start of payload
+
+ pos += size; // consume payload (that is, the current cluster)
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ // By consuming the payload, we are assuming that the curr
+ // cluster isn't interesting. That is, we don't bother checking
+ // whether the payload of the curr cluster is less than what
+ // happens to be available (obtained via IMkvReader::Length).
+ // Presumably the caller has already dispensed with the current
+ // cluster, and really does want the next cluster.
+ }
+
+ // pos now points to just beyond the last fully-loaded cluster
+
+ for (;;) {
+ const long status = DoParseNext(pResult, pos, len);
+
+ if (status <= 1)
+ return status;
+ }
+}
+
+long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) {
+ long long total, avail;
+
+ long status = m_pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
+ const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+ // Parse next cluster. This is strictly a parsing activity.
+ // Creation of a new cluster object happens later, after the
+ // parsing is done.
+
+ long long off_next = 0;
+ long long cluster_size = -1;
+
+ for (;;) {
+ if ((total >= 0) && (pos >= total))
+ return 1; // EOF
+
+ if ((segment_stop >= 0) && (pos >= segment_stop))
+ return 1; // EOF
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long idpos = pos; // absolute
+ const long long idoff = pos - m_start; // relative
+
+ const long long id = ReadID(m_pReader, idpos, len); // absolute
+
+ if (id < 0) // error
+ return static_cast<long>(id);
+
+ if (id == 0) // weird
+ return -1; // generic error
+
+ pos += len; // consume ID
+
+ // Read Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume length of size of element
+
+ // Pos now points to start of payload
+
+ if (size == 0) // weird
+ continue;
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if ((segment_stop >= 0) && (size != unknown_size) &&
+ ((pos + size) > segment_stop)) {
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (id == libwebm::kMkvCues) {
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long element_stop = pos + size;
+
+ if ((segment_stop >= 0) && (element_stop > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ const long long element_start = idpos;
+ const long long element_size = element_stop - element_start;
+
+ if (m_pCues == NULL) {
+ m_pCues = new (std::nothrow)
+ Cues(this, pos, size, element_start, element_size);
+ if (m_pCues == NULL)
+ return -1; // allocation failed
+ }
+
+ pos += size; // consume payload
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ continue;
+ }
+
+ if (id != libwebm::kMkvCluster) { // not a Cluster ID
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += size; // consume payload
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ continue;
+ }
+
+ // We have a cluster.
+ off_next = idoff;
+
+ if (size != unknown_size)
+ cluster_size = size;
+
+ break;
+ }
+
+ assert(off_next > 0); // have cluster
+
+ // We have parsed the next cluster.
+ // We have not created a cluster object yet. What we need
+ // to do now is determine whether it has already been preloaded
+ // (in which case, an object for this cluster has already been
+ // created), and if not, create a new cluster object.
+
+ Cluster** const ii = m_clusters + m_clusterCount;
+ Cluster** i = ii;
+
+ Cluster** const jj = ii + m_clusterPreloadCount;
+ Cluster** j = jj;
+
+ while (i < j) {
+ // INVARIANT:
+ //[0, i) < pos_next
+ //[i, j) ?
+ //[j, jj) > pos_next
+
+ Cluster** const k = i + (j - i) / 2;
+ assert(k < jj);
+
+ const Cluster* const pNext = *k;
+ assert(pNext);
+ assert(pNext->m_index < 0);
+
+ pos = pNext->GetPosition();
+ assert(pos >= 0);
+
+ if (pos < off_next)
+ i = k + 1;
+ else if (pos > off_next)
+ j = k;
+ else {
+ pResult = pNext;
+ return 0; // success
+ }
+ }
+
+ assert(i == j);
+
+ long long pos_;
+ long len_;
+
+ status = Cluster::HasBlockEntries(this, off_next, pos_, len_);
+
+ if (status < 0) { // error or underflow
+ pos = pos_;
+ len = len_;
+
+ return status;
+ }
+
+ if (status > 0) { // means "found at least one block entry"
+ Cluster* const pNext = Cluster::Create(this,
+ -1, // preloaded
+ off_next);
+ if (pNext == NULL)
+ return -1;
+
+ const ptrdiff_t idx_next = i - m_clusters; // insertion position
+
+ if (!PreloadCluster(pNext, idx_next)) {
+ delete pNext;
+ return -1;
+ }
+ assert(m_clusters);
+ assert(idx_next < m_clusterSize);
+ assert(m_clusters[idx_next] == pNext);
+
+ pResult = pNext;
+ return 0; // success
+ }
+
+ // status == 0 means "no block entries found"
+
+ if (cluster_size < 0) { // unknown size
+ const long long payload_pos = pos; // absolute pos of cluster payload
+
+ for (;;) { // determine cluster size
+ if ((total >= 0) && (pos >= total))
+ break;
+
+ if ((segment_stop >= 0) && (pos >= segment_stop))
+ break; // no more clusters
+
+ // Read ID
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long idpos = pos;
+ const long long id = ReadID(m_pReader, idpos, len);
+
+ if (id < 0) // error (or underflow)
+ return static_cast<long>(id);
+
+ // This is the distinguished set of IDs we use to determine
+ // that we have exhausted the sub-elements inside the cluster
+ // whose ID we parsed earlier.
+
+ if (id == libwebm::kMkvCluster || id == libwebm::kMkvCues)
+ break;
+
+ pos += len; // consume ID (of sub-element)
+
+ // Read Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(m_pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(m_pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume size field of element
+
+ // pos now points to start of sub-element's payload
+
+ if (size == 0) // weird
+ continue;
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID; // not allowed for sub-elements
+
+ if ((segment_stop >= 0) && ((pos + size) > segment_stop)) // weird
+ return E_FILE_FORMAT_INVALID;
+
+ pos += size; // consume payload of sub-element
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+ } // determine cluster size
+
+ cluster_size = pos - payload_pos;
+ assert(cluster_size >= 0); // TODO: handle cluster_size = 0
+
+ pos = payload_pos; // reset and re-parse original cluster
+ }
+
+ pos += cluster_size; // consume payload
+ if (segment_stop >= 0 && pos > segment_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 2; // try to find a cluster that follows next
+}
+
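+// Illustrative use (assuming a parsed Segment* segment and a time in ns):
+//
+//   const Cluster* cluster = segment->FindCluster(5000000000LL);  // ~5 s
+//   if (cluster != NULL && !cluster->EOS()) {
+//     // cluster is the last loaded cluster whose start time is <= the request.
+//   }
+//
+// Returns the first cluster when the request precedes it, or the
+// end-of-stream sentinel (&m_eos) when no clusters have been loaded yet.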
+const Cluster* Segment::FindCluster(long long time_ns) const {
+ if ((m_clusters == NULL) || (m_clusterCount <= 0))
+ return &m_eos;
+
+ {
+ Cluster* const pCluster = m_clusters[0];
+ assert(pCluster);
+ assert(pCluster->m_index == 0);
+
+ if (time_ns <= pCluster->GetTime())
+ return pCluster;
+ }
+
+ // Binary search of cluster array
+
+ long i = 0;
+ long j = m_clusterCount;
+
+ while (i < j) {
+ // INVARIANT:
+ //[0, i) <= time_ns
+ //[i, j) ?
+ //[j, m_clusterCount) > time_ns
+
+ const long k = i + (j - i) / 2;
+ assert(k < m_clusterCount);
+
+ Cluster* const pCluster = m_clusters[k];
+ assert(pCluster);
+ assert(pCluster->m_index == k);
+
+ const long long t = pCluster->GetTime();
+
+ if (t <= time_ns)
+ i = k + 1;
+ else
+ j = k;
+
+ assert(i <= j);
+ }
+
+ assert(i == j);
+ assert(i > 0);
+ assert(i <= m_clusterCount);
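+  // i is now the index of the first cluster whose time exceeds time_ns,
+  // so the cluster just before it is the seek target.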
+
+ const long k = i - 1;
+
+ Cluster* const pCluster = m_clusters[k];
+ assert(pCluster);
+ assert(pCluster->m_index == k);
+ assert(pCluster->GetTime() <= time_ns);
+
+ return pCluster;
+}
+
+const Tracks* Segment::GetTracks() const { return m_pTracks; }
+const SegmentInfo* Segment::GetInfo() const { return m_pInfo; }
+const Cues* Segment::GetCues() const { return m_pCues; }
+const Chapters* Segment::GetChapters() const { return m_pChapters; }
+const Tags* Segment::GetTags() const { return m_pTags; }
+const SeekHead* Segment::GetSeekHead() const { return m_pSeekHead; }
+
+long long Segment::GetDuration() const {
+ assert(m_pInfo);
+ return m_pInfo->GetDuration();
+}
+
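+// Illustrative traversal of parsed chapters (assuming a non-NULL
+// const Chapters* chapters obtained from Segment::GetChapters()):
+//
+//   for (int e = 0; e < chapters->GetEditionCount(); ++e) {
+//     const Chapters::Edition* edition = chapters->GetEdition(e);
+//     for (int a = 0; a < edition->GetAtomCount(); ++a) {
+//       const Chapters::Atom* atom = edition->GetAtom(a);
+//       const long long start_ns = atom->GetStartTime(chapters);
+//       for (int d = 0; d < atom->GetDisplayCount(); ++d) {
+//         const char* title = atom->GetDisplay(d)->GetString();  // may be NULL
+//       }
+//     }
+//   }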
+Chapters::Chapters(Segment* pSegment, long long payload_start,
+ long long payload_size, long long element_start,
+ long long element_size)
+ : m_pSegment(pSegment),
+ m_start(payload_start),
+ m_size(payload_size),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ m_editions(NULL),
+ m_editions_size(0),
+ m_editions_count(0) {}
+
+Chapters::~Chapters() {
+ while (m_editions_count > 0) {
+ Edition& e = m_editions[--m_editions_count];
+ e.Clear();
+ }
+ delete[] m_editions;
+}
+
+long Chapters::Parse() {
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long pos = m_start; // payload start
+ const long long stop = pos + m_size; // payload stop
+
+ while (pos < stop) {
+ long long id, size;
+
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 0) // weird
+ continue;
+
+ if (id == libwebm::kMkvEditionEntry) {
+ status = ParseEdition(pos, size);
+
+ if (status < 0) // error
+ return status;
+ }
+
+ pos += size;
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ return 0;
+}
+
+int Chapters::GetEditionCount() const { return m_editions_count; }
+
+const Chapters::Edition* Chapters::GetEdition(int idx) const {
+ if (idx < 0)
+ return NULL;
+
+ if (idx >= m_editions_count)
+ return NULL;
+
+ return m_editions + idx;
+}
+
+bool Chapters::ExpandEditionsArray() {
+ if (m_editions_size > m_editions_count)
+ return true; // nothing else to do
+
+ const int size = (m_editions_size == 0) ? 1 : 2 * m_editions_size;
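+  // Grow the array geometrically (1, 2, 4, ...) and shallow-copy the
+  // existing editions into the new storage; the same pattern is used by
+  // the other Expand*Array helpers below.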
+
+ Edition* const editions = new (std::nothrow) Edition[size];
+
+ if (editions == NULL)
+ return false;
+
+ for (int idx = 0; idx < m_editions_count; ++idx) {
+ m_editions[idx].ShallowCopy(editions[idx]);
+ }
+
+ delete[] m_editions;
+ m_editions = editions;
+
+ m_editions_size = size;
+ return true;
+}
+
+long Chapters::ParseEdition(long long pos, long long size) {
+ if (!ExpandEditionsArray())
+ return -1;
+
+ Edition& e = m_editions[m_editions_count++];
+ e.Init();
+
+ return e.Parse(m_pSegment->m_pReader, pos, size);
+}
+
+Chapters::Edition::Edition() {}
+
+Chapters::Edition::~Edition() {}
+
+int Chapters::Edition::GetAtomCount() const { return m_atoms_count; }
+
+const Chapters::Atom* Chapters::Edition::GetAtom(int index) const {
+ if (index < 0)
+ return NULL;
+
+ if (index >= m_atoms_count)
+ return NULL;
+
+ return m_atoms + index;
+}
+
+void Chapters::Edition::Init() {
+ m_atoms = NULL;
+ m_atoms_size = 0;
+ m_atoms_count = 0;
+}
+
+void Chapters::Edition::ShallowCopy(Edition& rhs) const {
+ rhs.m_atoms = m_atoms;
+ rhs.m_atoms_size = m_atoms_size;
+ rhs.m_atoms_count = m_atoms_count;
+}
+
+void Chapters::Edition::Clear() {
+ while (m_atoms_count > 0) {
+ Atom& a = m_atoms[--m_atoms_count];
+ a.Clear();
+ }
+
+ delete[] m_atoms;
+ m_atoms = NULL;
+
+ m_atoms_size = 0;
+}
+
+long Chapters::Edition::Parse(IMkvReader* pReader, long long pos,
+ long long size) {
+ const long long stop = pos + size;
+
+ while (pos < stop) {
+ long long id, size;
+
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 0)
+ continue;
+
+ if (id == libwebm::kMkvChapterAtom) {
+ status = ParseAtom(pReader, pos, size);
+
+ if (status < 0) // error
+ return status;
+ }
+
+ pos += size;
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ return 0;
+}
+
+long Chapters::Edition::ParseAtom(IMkvReader* pReader, long long pos,
+ long long size) {
+ if (!ExpandAtomsArray())
+ return -1;
+
+ Atom& a = m_atoms[m_atoms_count++];
+ a.Init();
+
+ return a.Parse(pReader, pos, size);
+}
+
+bool Chapters::Edition::ExpandAtomsArray() {
+ if (m_atoms_size > m_atoms_count)
+ return true; // nothing else to do
+
+ const int size = (m_atoms_size == 0) ? 1 : 2 * m_atoms_size;
+
+ Atom* const atoms = new (std::nothrow) Atom[size];
+
+ if (atoms == NULL)
+ return false;
+
+ for (int idx = 0; idx < m_atoms_count; ++idx) {
+ m_atoms[idx].ShallowCopy(atoms[idx]);
+ }
+
+ delete[] m_atoms;
+ m_atoms = atoms;
+
+ m_atoms_size = size;
+ return true;
+}
+
+Chapters::Atom::Atom() {}
+
+Chapters::Atom::~Atom() {}
+
+unsigned long long Chapters::Atom::GetUID() const { return m_uid; }
+
+const char* Chapters::Atom::GetStringUID() const { return m_string_uid; }
+
+long long Chapters::Atom::GetStartTimecode() const { return m_start_timecode; }
+
+long long Chapters::Atom::GetStopTimecode() const { return m_stop_timecode; }
+
+long long Chapters::Atom::GetStartTime(const Chapters* pChapters) const {
+ return GetTime(pChapters, m_start_timecode);
+}
+
+long long Chapters::Atom::GetStopTime(const Chapters* pChapters) const {
+ return GetTime(pChapters, m_stop_timecode);
+}
+
+int Chapters::Atom::GetDisplayCount() const { return m_displays_count; }
+
+const Chapters::Display* Chapters::Atom::GetDisplay(int index) const {
+ if (index < 0)
+ return NULL;
+
+ if (index >= m_displays_count)
+ return NULL;
+
+ return m_displays + index;
+}
+
+void Chapters::Atom::Init() {
+ m_string_uid = NULL;
+ m_uid = 0;
+ m_start_timecode = -1;
+ m_stop_timecode = -1;
+
+ m_displays = NULL;
+ m_displays_size = 0;
+ m_displays_count = 0;
+}
+
+void Chapters::Atom::ShallowCopy(Atom& rhs) const {
+ rhs.m_string_uid = m_string_uid;
+ rhs.m_uid = m_uid;
+ rhs.m_start_timecode = m_start_timecode;
+ rhs.m_stop_timecode = m_stop_timecode;
+
+ rhs.m_displays = m_displays;
+ rhs.m_displays_size = m_displays_size;
+ rhs.m_displays_count = m_displays_count;
+}
+
+void Chapters::Atom::Clear() {
+ delete[] m_string_uid;
+ m_string_uid = NULL;
+
+ while (m_displays_count > 0) {
+ Display& d = m_displays[--m_displays_count];
+ d.Clear();
+ }
+
+ delete[] m_displays;
+ m_displays = NULL;
+
+ m_displays_size = 0;
+}
+
+long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) {
+ const long long stop = pos + size;
+
+ while (pos < stop) {
+ long long id, size;
+
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 0) // 0 length payload, skip.
+ continue;
+
+ if (id == libwebm::kMkvChapterDisplay) {
+ status = ParseDisplay(pReader, pos, size);
+
+ if (status < 0) // error
+ return status;
+ } else if (id == libwebm::kMkvChapterStringUID) {
+ status = UnserializeString(pReader, pos, size, m_string_uid);
+
+ if (status < 0) // error
+ return status;
+ } else if (id == libwebm::kMkvChapterUID) {
+ long long val;
+ status = UnserializeInt(pReader, pos, size, val);
+
+ if (status < 0) // error
+ return status;
+
+ m_uid = static_cast<unsigned long long>(val);
+ } else if (id == libwebm::kMkvChapterTimeStart) {
+ const long long val = UnserializeUInt(pReader, pos, size);
+
+ if (val < 0) // error
+ return static_cast<long>(val);
+
+ m_start_timecode = val;
+ } else if (id == libwebm::kMkvChapterTimeEnd) {
+ const long long val = UnserializeUInt(pReader, pos, size);
+
+ if (val < 0) // error
+ return static_cast<long>(val);
+
+ m_stop_timecode = val;
+ }
+
+ pos += size;
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ return 0;
+}
+
+long long Chapters::Atom::GetTime(const Chapters* pChapters,
+ long long timecode) {
+ if (pChapters == NULL)
+ return -1;
+
+ Segment* const pSegment = pChapters->m_pSegment;
+
+ if (pSegment == NULL) // weird
+ return -1;
+
+ const SegmentInfo* const pInfo = pSegment->GetInfo();
+
+ if (pInfo == NULL)
+ return -1;
+
+ const long long timecode_scale = pInfo->GetTimeCodeScale();
+
+ if (timecode_scale < 1) // weird
+ return -1;
+
+ if (timecode < 0)
+ return -1;
+
+ const long long result = timecode_scale * timecode;
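+  // result is in nanoseconds: chapter timecodes are stored in
+  // timecode-scale ticks and the scale is nanoseconds per tick. With the
+  // default scale of 1,000,000, a timecode of 5000 is 5,000,000,000 ns (5 s).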
+
+ return result;
+}
+
+long Chapters::Atom::ParseDisplay(IMkvReader* pReader, long long pos,
+ long long size) {
+ if (!ExpandDisplaysArray())
+ return -1;
+
+ Display& d = m_displays[m_displays_count++];
+ d.Init();
+
+ return d.Parse(pReader, pos, size);
+}
+
+bool Chapters::Atom::ExpandDisplaysArray() {
+ if (m_displays_size > m_displays_count)
+ return true; // nothing else to do
+
+ const int size = (m_displays_size == 0) ? 1 : 2 * m_displays_size;
+
+ Display* const displays = new (std::nothrow) Display[size];
+
+ if (displays == NULL)
+ return false;
+
+ for (int idx = 0; idx < m_displays_count; ++idx) {
+ m_displays[idx].ShallowCopy(displays[idx]);
+ }
+
+ delete[] m_displays;
+ m_displays = displays;
+
+ m_displays_size = size;
+ return true;
+}
+
+Chapters::Display::Display() {}
+
+Chapters::Display::~Display() {}
+
+const char* Chapters::Display::GetString() const { return m_string; }
+
+const char* Chapters::Display::GetLanguage() const { return m_language; }
+
+const char* Chapters::Display::GetCountry() const { return m_country; }
+
+void Chapters::Display::Init() {
+ m_string = NULL;
+ m_language = NULL;
+ m_country = NULL;
+}
+
+void Chapters::Display::ShallowCopy(Display& rhs) const {
+ rhs.m_string = m_string;
+ rhs.m_language = m_language;
+ rhs.m_country = m_country;
+}
+
+void Chapters::Display::Clear() {
+ delete[] m_string;
+ m_string = NULL;
+
+ delete[] m_language;
+ m_language = NULL;
+
+ delete[] m_country;
+ m_country = NULL;
+}
+
+long Chapters::Display::Parse(IMkvReader* pReader, long long pos,
+ long long size) {
+ const long long stop = pos + size;
+
+ while (pos < stop) {
+ long long id, size;
+
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 0) // No payload.
+ continue;
+
+ if (id == libwebm::kMkvChapString) {
+ status = UnserializeString(pReader, pos, size, m_string);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvChapLanguage) {
+ status = UnserializeString(pReader, pos, size, m_language);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvChapCountry) {
+ status = UnserializeString(pReader, pos, size, m_country);
+
+ if (status)
+ return status;
+ }
+
+ pos += size;
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ return 0;
+}
+
+Tags::Tags(Segment* pSegment, long long payload_start, long long payload_size,
+ long long element_start, long long element_size)
+ : m_pSegment(pSegment),
+ m_start(payload_start),
+ m_size(payload_size),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ m_tags(NULL),
+ m_tags_size(0),
+ m_tags_count(0) {}
+
+Tags::~Tags() {
+ while (m_tags_count > 0) {
+ Tag& t = m_tags[--m_tags_count];
+ t.Clear();
+ }
+ delete[] m_tags;
+}
+
+long Tags::Parse() {
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long pos = m_start; // payload start
+ const long long stop = pos + m_size; // payload stop
+
+ while (pos < stop) {
+ long long id, size;
+
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0)
+ return status;
+
+ if (size == 0) // 0 length tag, read another
+ continue;
+
+ if (id == libwebm::kMkvTag) {
+ status = ParseTag(pos, size);
+
+ if (status < 0)
+ return status;
+ }
+
+ pos += size;
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+}
+
+int Tags::GetTagCount() const { return m_tags_count; }
+
+const Tags::Tag* Tags::GetTag(int idx) const {
+ if (idx < 0)
+ return NULL;
+
+ if (idx >= m_tags_count)
+ return NULL;
+
+ return m_tags + idx;
+}
+
+bool Tags::ExpandTagsArray() {
+ if (m_tags_size > m_tags_count)
+ return true; // nothing else to do
+
+ const int size = (m_tags_size == 0) ? 1 : 2 * m_tags_size;
+
+ Tag* const tags = new (std::nothrow) Tag[size];
+
+ if (tags == NULL)
+ return false;
+
+ for (int idx = 0; idx < m_tags_count; ++idx) {
+ m_tags[idx].ShallowCopy(tags[idx]);
+ }
+
+ delete[] m_tags;
+ m_tags = tags;
+
+ m_tags_size = size;
+ return true;
+}
+
+long Tags::ParseTag(long long pos, long long size) {
+ if (!ExpandTagsArray())
+ return -1;
+
+ Tag& t = m_tags[m_tags_count++];
+ t.Init();
+
+ return t.Parse(m_pSegment->m_pReader, pos, size);
+}
+
+Tags::Tag::Tag() {}
+
+Tags::Tag::~Tag() {}
+
+int Tags::Tag::GetSimpleTagCount() const { return m_simple_tags_count; }
+
+const Tags::SimpleTag* Tags::Tag::GetSimpleTag(int index) const {
+ if (index < 0)
+ return NULL;
+
+ if (index >= m_simple_tags_count)
+ return NULL;
+
+ return m_simple_tags + index;
+}
+
+void Tags::Tag::Init() {
+ m_simple_tags = NULL;
+ m_simple_tags_size = 0;
+ m_simple_tags_count = 0;
+}
+
+void Tags::Tag::ShallowCopy(Tag& rhs) const {
+ rhs.m_simple_tags = m_simple_tags;
+ rhs.m_simple_tags_size = m_simple_tags_size;
+ rhs.m_simple_tags_count = m_simple_tags_count;
+}
+
+void Tags::Tag::Clear() {
+ while (m_simple_tags_count > 0) {
+ SimpleTag& d = m_simple_tags[--m_simple_tags_count];
+ d.Clear();
+ }
+
+ delete[] m_simple_tags;
+ m_simple_tags = NULL;
+
+ m_simple_tags_size = 0;
+}
+
+long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) {
+ const long long stop = pos + size;
+
+ while (pos < stop) {
+ long long id, size;
+
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0)
+ return status;
+
+ if (size == 0) // 0 length tag, read another
+ continue;
+
+ if (id == libwebm::kMkvSimpleTag) {
+ status = ParseSimpleTag(pReader, pos, size);
+
+ if (status < 0)
+ return status;
+ }
+
+ pos += size;
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ return 0;
+}
+
+long Tags::Tag::ParseSimpleTag(IMkvReader* pReader, long long pos,
+ long long size) {
+ if (!ExpandSimpleTagsArray())
+ return -1;
+
+ SimpleTag& st = m_simple_tags[m_simple_tags_count++];
+ st.Init();
+
+ return st.Parse(pReader, pos, size);
+}
+
+bool Tags::Tag::ExpandSimpleTagsArray() {
+ if (m_simple_tags_size > m_simple_tags_count)
+ return true; // nothing else to do
+
+ const int size = (m_simple_tags_size == 0) ? 1 : 2 * m_simple_tags_size;
+
+ SimpleTag* const displays = new (std::nothrow) SimpleTag[size];
+
+ if (displays == NULL)
+ return false;
+
+ for (int idx = 0; idx < m_simple_tags_count; ++idx) {
+ m_simple_tags[idx].ShallowCopy(displays[idx]);
+ }
+
+ delete[] m_simple_tags;
+ m_simple_tags = displays;
+
+ m_simple_tags_size = size;
+ return true;
+}
+
+Tags::SimpleTag::SimpleTag() {}
+
+Tags::SimpleTag::~SimpleTag() {}
+
+const char* Tags::SimpleTag::GetTagName() const { return m_tag_name; }
+
+const char* Tags::SimpleTag::GetTagString() const { return m_tag_string; }
+
+void Tags::SimpleTag::Init() {
+ m_tag_name = NULL;
+ m_tag_string = NULL;
+}
+
+void Tags::SimpleTag::ShallowCopy(SimpleTag& rhs) const {
+ rhs.m_tag_name = m_tag_name;
+ rhs.m_tag_string = m_tag_string;
+}
+
+void Tags::SimpleTag::Clear() {
+ delete[] m_tag_name;
+ m_tag_name = NULL;
+
+ delete[] m_tag_string;
+ m_tag_string = NULL;
+}
+
+long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos,
+ long long size) {
+ const long long stop = pos + size;
+
+ while (pos < stop) {
+ long long id, size;
+
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 0) // weird
+ continue;
+
+ if (id == libwebm::kMkvTagName) {
+ status = UnserializeString(pReader, pos, size, m_tag_name);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvTagString) {
+ status = UnserializeString(pReader, pos, size, m_tag_string);
+
+ if (status)
+ return status;
+ }
+
+ pos += size;
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ return 0;
+}
+
+SegmentInfo::SegmentInfo(Segment* pSegment, long long start, long long size_,
+ long long element_start, long long element_size)
+ : m_pSegment(pSegment),
+ m_start(start),
+ m_size(size_),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ m_pMuxingAppAsUTF8(NULL),
+ m_pWritingAppAsUTF8(NULL),
+ m_pTitleAsUTF8(NULL) {}
+
+SegmentInfo::~SegmentInfo() {
+ delete[] m_pMuxingAppAsUTF8;
+ m_pMuxingAppAsUTF8 = NULL;
+
+ delete[] m_pWritingAppAsUTF8;
+ m_pWritingAppAsUTF8 = NULL;
+
+ delete[] m_pTitleAsUTF8;
+ m_pTitleAsUTF8 = NULL;
+}
+
+long SegmentInfo::Parse() {
+ assert(m_pMuxingAppAsUTF8 == NULL);
+ assert(m_pWritingAppAsUTF8 == NULL);
+ assert(m_pTitleAsUTF8 == NULL);
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long pos = m_start;
+ const long long stop = m_start + m_size;
+
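+  // Defaults when the elements are absent: the Matroska TimecodeScale
+  // default is 1,000,000 ns per tick (1 ms), and a negative duration
+  // means "not present".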
+ m_timecodeScale = 1000000;
+ m_duration = -1;
+
+ while (pos < stop) {
+ long long id, size;
+
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvTimecodeScale) {
+ m_timecodeScale = UnserializeUInt(pReader, pos, size);
+
+ if (m_timecodeScale <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvDuration) {
+ const long status = UnserializeFloat(pReader, pos, size, m_duration);
+
+ if (status < 0)
+ return status;
+
+ if (m_duration < 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvMuxingApp) {
+ const long status =
+ UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvWritingApp) {
+ const long status =
+ UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvTitle) {
+ const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8);
+
+ if (status)
+ return status;
+ }
+
+ pos += size;
+
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ const double rollover_check = m_duration * m_timecodeScale;
+ if (rollover_check > static_cast<double>(LLONG_MAX))
+ return E_FILE_FORMAT_INVALID;
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+}
+
+long long SegmentInfo::GetTimeCodeScale() const { return m_timecodeScale; }
+
+long long SegmentInfo::GetDuration() const {
+ if (m_duration < 0)
+ return -1;
+
+ assert(m_timecodeScale >= 1);
+
+ const double dd = double(m_duration) * double(m_timecodeScale);
+ const long long d = static_cast<long long>(dd);
+
+ return d;
+}
+
+const char* SegmentInfo::GetMuxingAppAsUTF8() const {
+ return m_pMuxingAppAsUTF8;
+}
+
+const char* SegmentInfo::GetWritingAppAsUTF8() const {
+ return m_pWritingAppAsUTF8;
+}
+
+const char* SegmentInfo::GetTitleAsUTF8() const { return m_pTitleAsUTF8; }
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+ContentEncoding::ContentCompression::ContentCompression()
+ : algo(0), settings(NULL), settings_len(0) {}
+
+ContentEncoding::ContentCompression::~ContentCompression() {
+ delete[] settings;
+}
+
+ContentEncoding::ContentEncryption::ContentEncryption()
+ : algo(0),
+ key_id(NULL),
+ key_id_len(0),
+ signature(NULL),
+ signature_len(0),
+ sig_key_id(NULL),
+ sig_key_id_len(0),
+ sig_algo(0),
+ sig_hash_algo(0) {}
+
+ContentEncoding::ContentEncryption::~ContentEncryption() {
+ delete[] key_id;
+ delete[] signature;
+ delete[] sig_key_id;
+}
+
+ContentEncoding::ContentEncoding()
+ : compression_entries_(NULL),
+ compression_entries_end_(NULL),
+ encryption_entries_(NULL),
+ encryption_entries_end_(NULL),
+ encoding_order_(0),
+ encoding_scope_(1),
+ encoding_type_(0) {}
+
+ContentEncoding::~ContentEncoding() {
+ ContentCompression** comp_i = compression_entries_;
+ ContentCompression** const comp_j = compression_entries_end_;
+
+ while (comp_i != comp_j) {
+ ContentCompression* const comp = *comp_i++;
+ delete comp;
+ }
+
+ delete[] compression_entries_;
+
+ ContentEncryption** enc_i = encryption_entries_;
+ ContentEncryption** const enc_j = encryption_entries_end_;
+
+ while (enc_i != enc_j) {
+ ContentEncryption* const enc = *enc_i++;
+ delete enc;
+ }
+
+ delete[] encryption_entries_;
+}
+
+const ContentEncoding::ContentCompression*
+ContentEncoding::GetCompressionByIndex(unsigned long idx) const {
+ const ptrdiff_t count = compression_entries_end_ - compression_entries_;
+ assert(count >= 0);
+
+ if (idx >= static_cast<unsigned long>(count))
+ return NULL;
+
+ return compression_entries_[idx];
+}
+
+unsigned long ContentEncoding::GetCompressionCount() const {
+ const ptrdiff_t count = compression_entries_end_ - compression_entries_;
+ assert(count >= 0);
+
+ return static_cast<unsigned long>(count);
+}
+
+const ContentEncoding::ContentEncryption* ContentEncoding::GetEncryptionByIndex(
+ unsigned long idx) const {
+ const ptrdiff_t count = encryption_entries_end_ - encryption_entries_;
+ assert(count >= 0);
+
+ if (idx >= static_cast<unsigned long>(count))
+ return NULL;
+
+ return encryption_entries_[idx];
+}
+
+unsigned long ContentEncoding::GetEncryptionCount() const {
+ const ptrdiff_t count = encryption_entries_end_ - encryption_entries_;
+ assert(count >= 0);
+
+ return static_cast<unsigned long>(count);
+}
+
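+// Per the WebM encryption spec, the only cipher mode accepted here is 1
+// (AES-CTR); likewise ParseEncryptionEntry below accepts only
+// ContentEncAlgo 5 (AES).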
+long ContentEncoding::ParseContentEncAESSettingsEntry(
+ long long start, long long size, IMkvReader* pReader,
+ ContentEncAESSettings* aes) {
+ assert(pReader);
+ assert(aes);
+
+ long long pos = start;
+ const long long stop = start + size;
+
+ while (pos < stop) {
+ long long id, size;
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvAESSettingsCipherMode) {
+ aes->cipher_mode = UnserializeUInt(pReader, pos, size);
+ if (aes->cipher_mode != 1)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ return 0;
+}
+
+long ContentEncoding::ParseContentEncodingEntry(long long start, long long size,
+ IMkvReader* pReader) {
+ assert(pReader);
+
+ long long pos = start;
+ const long long stop = start + size;
+
+ // Count ContentCompression and ContentEncryption elements.
+ int compression_count = 0;
+ int encryption_count = 0;
+
+ while (pos < stop) {
+ long long id, size;
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvContentCompression)
+ ++compression_count;
+
+ if (id == libwebm::kMkvContentEncryption)
+ ++encryption_count;
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (compression_count <= 0 && encryption_count <= 0)
+ return -1;
+
+ if (compression_count > 0) {
+ compression_entries_ =
+ new (std::nothrow) ContentCompression*[compression_count];
+ if (!compression_entries_)
+ return -1;
+ compression_entries_end_ = compression_entries_;
+ }
+
+ if (encryption_count > 0) {
+ encryption_entries_ =
+ new (std::nothrow) ContentEncryption*[encryption_count];
+    if (!encryption_entries_) {
+      delete[] compression_entries_;
+      compression_entries_ = NULL;  // avoid a double delete in the destructor
+      compression_entries_end_ = NULL;
+      return -1;
+    }
+ encryption_entries_end_ = encryption_entries_;
+ }
+
+ pos = start;
+ while (pos < stop) {
+ long long id, size;
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvContentEncodingOrder) {
+ encoding_order_ = UnserializeUInt(pReader, pos, size);
+ } else if (id == libwebm::kMkvContentEncodingScope) {
+ encoding_scope_ = UnserializeUInt(pReader, pos, size);
+ if (encoding_scope_ < 1)
+ return -1;
+ } else if (id == libwebm::kMkvContentEncodingType) {
+ encoding_type_ = UnserializeUInt(pReader, pos, size);
+ } else if (id == libwebm::kMkvContentCompression) {
+ ContentCompression* const compression =
+ new (std::nothrow) ContentCompression();
+ if (!compression)
+ return -1;
+
+ status = ParseCompressionEntry(pos, size, pReader, compression);
+ if (status) {
+ delete compression;
+ return status;
+ }
+ *compression_entries_end_++ = compression;
+ } else if (id == libwebm::kMkvContentEncryption) {
+ ContentEncryption* const encryption =
+ new (std::nothrow) ContentEncryption();
+ if (!encryption)
+ return -1;
+
+ status = ParseEncryptionEntry(pos, size, pReader, encryption);
+ if (status) {
+ delete encryption;
+ return status;
+ }
+ *encryption_entries_end_++ = encryption;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ return 0;
+}
+
+long ContentEncoding::ParseCompressionEntry(long long start, long long size,
+ IMkvReader* pReader,
+ ContentCompression* compression) {
+ assert(pReader);
+ assert(compression);
+
+ long long pos = start;
+ const long long stop = start + size;
+
+ bool valid = false;
+
+ while (pos < stop) {
+ long long id, size;
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvContentCompAlgo) {
+ long long algo = UnserializeUInt(pReader, pos, size);
+ if (algo < 0)
+ return E_FILE_FORMAT_INVALID;
+ compression->algo = algo;
+ valid = true;
+ } else if (id == libwebm::kMkvContentCompSettings) {
+ if (size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ const size_t buflen = static_cast<size_t>(size);
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+ if (buf == NULL)
+ return -1;
+
+ const int read_status =
+ pReader->Read(pos, static_cast<long>(buflen), buf);
+ if (read_status) {
+ delete[] buf;
+        return read_status;  // report the read failure
+ }
+
+ compression->settings = buf;
+ compression->settings_len = buflen;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ // ContentCompAlgo is mandatory
+ if (!valid)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+}
+
+long ContentEncoding::ParseEncryptionEntry(long long start, long long size,
+ IMkvReader* pReader,
+ ContentEncryption* encryption) {
+ assert(pReader);
+ assert(encryption);
+
+ long long pos = start;
+ const long long stop = start + size;
+
+ while (pos < stop) {
+ long long id, size;
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvContentEncAlgo) {
+ encryption->algo = UnserializeUInt(pReader, pos, size);
+ if (encryption->algo != 5)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvContentEncKeyID) {
+ delete[] encryption->key_id;
+ encryption->key_id = NULL;
+ encryption->key_id_len = 0;
+
+ if (size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ const size_t buflen = static_cast<size_t>(size);
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+ if (buf == NULL)
+ return -1;
+
+ const int read_status =
+ pReader->Read(pos, static_cast<long>(buflen), buf);
+ if (read_status) {
+ delete[] buf;
+        return read_status;  // report the read failure
+ }
+
+ encryption->key_id = buf;
+ encryption->key_id_len = buflen;
+ } else if (id == libwebm::kMkvContentSignature) {
+ delete[] encryption->signature;
+ encryption->signature = NULL;
+ encryption->signature_len = 0;
+
+ if (size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ const size_t buflen = static_cast<size_t>(size);
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+ if (buf == NULL)
+ return -1;
+
+ const int read_status =
+ pReader->Read(pos, static_cast<long>(buflen), buf);
+ if (read_status) {
+ delete[] buf;
+        return read_status;  // report the read failure
+ }
+
+ encryption->signature = buf;
+ encryption->signature_len = buflen;
+ } else if (id == libwebm::kMkvContentSigKeyID) {
+ delete[] encryption->sig_key_id;
+ encryption->sig_key_id = NULL;
+ encryption->sig_key_id_len = 0;
+
+ if (size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ const size_t buflen = static_cast<size_t>(size);
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+ if (buf == NULL)
+ return -1;
+
+ const int read_status =
+ pReader->Read(pos, static_cast<long>(buflen), buf);
+ if (read_status) {
+ delete[] buf;
+        return read_status;  // report the read failure
+ }
+
+ encryption->sig_key_id = buf;
+ encryption->sig_key_id_len = buflen;
+ } else if (id == libwebm::kMkvContentSigAlgo) {
+ encryption->sig_algo = UnserializeUInt(pReader, pos, size);
+ } else if (id == libwebm::kMkvContentSigHashAlgo) {
+ encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size);
+ } else if (id == libwebm::kMkvContentEncAESSettings) {
+ const long status = ParseContentEncAESSettingsEntry(
+ pos, size, pReader, &encryption->aes_settings);
+ if (status)
+ return status;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ return 0;
+}
+
+Track::Track(Segment* pSegment, long long element_start, long long element_size)
+ : m_pSegment(pSegment),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ content_encoding_entries_(NULL),
+ content_encoding_entries_end_(NULL) {}
+
+Track::~Track() {
+ Info& info = const_cast<Info&>(m_info);
+ info.Clear();
+
+ ContentEncoding** i = content_encoding_entries_;
+ ContentEncoding** const j = content_encoding_entries_end_;
+
+ while (i != j) {
+ ContentEncoding* const encoding = *i++;
+ delete encoding;
+ }
+
+ delete[] content_encoding_entries_;
+}
+
+long Track::Create(Segment* pSegment, const Info& info, long long element_start,
+ long long element_size, Track*& pResult) {
+ if (pResult)
+ return -1;
+
+ Track* const pTrack =
+ new (std::nothrow) Track(pSegment, element_start, element_size);
+
+ if (pTrack == NULL)
+ return -1; // generic error
+
+ const int status = info.Copy(pTrack->m_info);
+
+ if (status) { // error
+ delete pTrack;
+ return status;
+ }
+
+ pResult = pTrack;
+ return 0; // success
+}
+
+Track::Info::Info()
+ : uid(0),
+ defaultDuration(0),
+ codecDelay(0),
+ seekPreRoll(0),
+ nameAsUTF8(NULL),
+ language(NULL),
+ codecId(NULL),
+ codecNameAsUTF8(NULL),
+ codecPrivate(NULL),
+ codecPrivateSize(0),
+ lacing(false) {}
+
+Track::Info::~Info() { Clear(); }
+
+void Track::Info::Clear() {
+ delete[] nameAsUTF8;
+ nameAsUTF8 = NULL;
+
+ delete[] language;
+ language = NULL;
+
+ delete[] codecId;
+ codecId = NULL;
+
+ delete[] codecPrivate;
+ codecPrivate = NULL;
+ codecPrivateSize = 0;
+
+ delete[] codecNameAsUTF8;
+ codecNameAsUTF8 = NULL;
+}
+
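+// CopyStr takes a pointer-to-member so a single helper can duplicate any
+// of the string fields (nameAsUTF8, language, codecId, codecNameAsUTF8)
+// from this Info into dst_.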
+int Track::Info::CopyStr(char* Info::*str, Info& dst_) const {
+ if (str == static_cast<char * Info::*>(NULL))
+ return -1;
+
+ char*& dst = dst_.*str;
+
+ if (dst) // should be NULL already
+ return -1;
+
+ const char* const src = this->*str;
+
+ if (src == NULL)
+ return 0;
+
+ const size_t len = strlen(src);
+
+ dst = SafeArrayAlloc<char>(1, len + 1);
+
+ if (dst == NULL)
+ return -1;
+
+ strcpy(dst, src);
+
+ return 0;
+}
+
+int Track::Info::Copy(Info& dst) const {
+ if (&dst == this)
+ return 0;
+
+ dst.type = type;
+ dst.number = number;
+ dst.defaultDuration = defaultDuration;
+ dst.codecDelay = codecDelay;
+ dst.seekPreRoll = seekPreRoll;
+ dst.uid = uid;
+ dst.lacing = lacing;
+ dst.settings = settings;
+
+ // We now copy the string member variables from src to dst.
+ // This involves memory allocation so in principle the operation
+ // can fail (indeed, that's why we have Info::Copy), so we must
+ // report this to the caller. An error return from this function
+ // therefore implies that the copy was only partially successful.
+
+ if (int status = CopyStr(&Info::nameAsUTF8, dst))
+ return status;
+
+ if (int status = CopyStr(&Info::language, dst))
+ return status;
+
+ if (int status = CopyStr(&Info::codecId, dst))
+ return status;
+
+ if (int status = CopyStr(&Info::codecNameAsUTF8, dst))
+ return status;
+
+ if (codecPrivateSize > 0) {
+ if (codecPrivate == NULL)
+ return -1;
+
+ if (dst.codecPrivate)
+ return -1;
+
+ if (dst.codecPrivateSize != 0)
+ return -1;
+
+ dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize);
+
+ if (dst.codecPrivate == NULL)
+ return -1;
+
+ memcpy(dst.codecPrivate, codecPrivate, codecPrivateSize);
+ dst.codecPrivateSize = codecPrivateSize;
+ }
+
+ return 0;
+}
+
+const BlockEntry* Track::GetEOS() const { return &m_eos; }
+
+long Track::GetType() const { return m_info.type; }
+
+long Track::GetNumber() const { return m_info.number; }
+
+unsigned long long Track::GetUid() const { return m_info.uid; }
+
+const char* Track::GetNameAsUTF8() const { return m_info.nameAsUTF8; }
+
+const char* Track::GetLanguage() const { return m_info.language; }
+
+const char* Track::GetCodecNameAsUTF8() const { return m_info.codecNameAsUTF8; }
+
+const char* Track::GetCodecId() const { return m_info.codecId; }
+
+const unsigned char* Track::GetCodecPrivate(size_t& size) const {
+ size = m_info.codecPrivateSize;
+ return m_info.codecPrivate;
+}
+
+bool Track::GetLacing() const { return m_info.lacing; }
+
+unsigned long long Track::GetDefaultDuration() const {
+ return m_info.defaultDuration;
+}
+
+unsigned long long Track::GetCodecDelay() const { return m_info.codecDelay; }
+
+unsigned long long Track::GetSeekPreRoll() const { return m_info.seekPreRoll; }
+
+long Track::GetFirst(const BlockEntry*& pBlockEntry) const {
+ const Cluster* pCluster = m_pSegment->GetFirst();
+
+ for (int i = 0;;) {
+ if (pCluster == NULL) {
+ pBlockEntry = GetEOS();
+ return 1;
+ }
+
+ if (pCluster->EOS()) {
+ if (m_pSegment->DoneParsing()) {
+ pBlockEntry = GetEOS();
+ return 1;
+ }
+
+ pBlockEntry = 0;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long status = pCluster->GetFirst(pBlockEntry);
+
+ if (status < 0) // error
+ return status;
+
+ if (pBlockEntry == 0) { // empty cluster
+ pCluster = m_pSegment->GetNext(pCluster);
+ continue;
+ }
+
+ for (;;) {
+ const Block* const pBlock = pBlockEntry->GetBlock();
+ assert(pBlock);
+
+ const long long tn = pBlock->GetTrackNumber();
+
+ if ((tn == m_info.number) && VetEntry(pBlockEntry))
+ return 0;
+
+ const BlockEntry* pNextEntry;
+
+ status = pCluster->GetNext(pBlockEntry, pNextEntry);
+
+ if (status < 0) // error
+ return status;
+
+ if (pNextEntry == 0)
+ break;
+
+ pBlockEntry = pNextEntry;
+ }
+
+ ++i;
+
+ if (i >= 100)
+ break;
+
+ pCluster = m_pSegment->GetNext(pCluster);
+ }
+
+ // NOTE: if we get here, it means that we didn't find a block with
+ // a matching track number. We interpret that as an error (which
+ // might be too conservative).
+
+ pBlockEntry = GetEOS(); // so we can return a non-NULL value
+ return 1;
+}
+
+long Track::GetNext(const BlockEntry* pCurrEntry,
+ const BlockEntry*& pNextEntry) const {
+ assert(pCurrEntry);
+ assert(!pCurrEntry->EOS()); //?
+
+ const Block* const pCurrBlock = pCurrEntry->GetBlock();
+ assert(pCurrBlock && pCurrBlock->GetTrackNumber() == m_info.number);
+ if (!pCurrBlock || pCurrBlock->GetTrackNumber() != m_info.number)
+ return -1;
+
+ const Cluster* pCluster = pCurrEntry->GetCluster();
+ assert(pCluster);
+ assert(!pCluster->EOS());
+
+ long status = pCluster->GetNext(pCurrEntry, pNextEntry);
+
+ if (status < 0) // error
+ return status;
+
+ for (int i = 0;;) {
+ while (pNextEntry) {
+ const Block* const pNextBlock = pNextEntry->GetBlock();
+ assert(pNextBlock);
+
+ if (pNextBlock->GetTrackNumber() == m_info.number)
+ return 0;
+
+ pCurrEntry = pNextEntry;
+
+ status = pCluster->GetNext(pCurrEntry, pNextEntry);
+
+ if (status < 0) // error
+ return status;
+ }
+
+ pCluster = m_pSegment->GetNext(pCluster);
+
+ if (pCluster == NULL) {
+ pNextEntry = GetEOS();
+ return 1;
+ }
+
+ if (pCluster->EOS()) {
+ if (m_pSegment->DoneParsing()) {
+ pNextEntry = GetEOS();
+ return 1;
+ }
+
+      // TODO: there is a potential O(n^2) problem here: we tell the
+      // caller to (pre)load another cluster, which it does, but then it
+      // calls GetNext again, which repeats the same search. This is
+      // a pathological case, since the only way it can happen is if
+      // there exists a long sequence of clusters none of which contain a
+      // block from this track. One way around this problem is for the
+      // caller to be smarter when it loads another cluster: don't call
+      // us back until you have a cluster that contains a block from this
+      // track. (Of course, that's not cheap either, since our caller
+      // would have to scan each cluster as it's loaded, so that
+      // would just push back the problem.)
+
+ pNextEntry = NULL;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ status = pCluster->GetFirst(pNextEntry);
+
+ if (status < 0) // error
+ return status;
+
+ if (pNextEntry == NULL) // empty cluster
+ continue;
+
+ ++i;
+
+ if (i >= 100)
+ break;
+ }
+
+ // NOTE: if we get here, it means that we didn't find a block with
+ // a matching track number after lots of searching, so we give
+ // up trying.
+
+ pNextEntry = GetEOS(); // so we can return a non-NULL value
+ return 1;
+}
+
+bool Track::VetEntry(const BlockEntry* pBlockEntry) const {
+ assert(pBlockEntry);
+ const Block* const pBlock = pBlockEntry->GetBlock();
+ assert(pBlock);
+ assert(pBlock->GetTrackNumber() == m_info.number);
+ if (!pBlock || pBlock->GetTrackNumber() != m_info.number)
+ return false;
+
+ // This function is used during a seek to determine whether the
+ // frame is a valid seek target. This default function simply
+ // returns true, which means all frames are valid seek targets.
+ // It gets overridden by the VideoTrack class, because only video
+  // keyframes can be used as seek targets.
+
+ return true;
+}
+
+long Track::Seek(long long time_ns, const BlockEntry*& pResult) const {
+ const long status = GetFirst(pResult);
+
+ if (status < 0) // buffer underflow, etc
+ return status;
+
+ assert(pResult);
+
+ if (pResult->EOS())
+ return 0;
+
+ const Cluster* pCluster = pResult->GetCluster();
+ assert(pCluster);
+ assert(pCluster->GetIndex() >= 0);
+
+ if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
+ return 0;
+
+ Cluster** const clusters = m_pSegment->m_clusters;
+ assert(clusters);
+
+ const long count = m_pSegment->GetCount(); // loaded only, not preloaded
+ assert(count > 0);
+
+ Cluster** const i = clusters + pCluster->GetIndex();
+ assert(i);
+ assert(*i == pCluster);
+ assert(pCluster->GetTime() <= time_ns);
+
+ Cluster** const j = clusters + count;
+
+ Cluster** lo = i;
+ Cluster** hi = j;
+
+ while (lo < hi) {
+ // INVARIANT:
+ //[i, lo) <= time_ns
+ //[lo, hi) ?
+ //[hi, j) > time_ns
+
+ Cluster** const mid = lo + (hi - lo) / 2;
+ assert(mid < hi);
+
+ pCluster = *mid;
+ assert(pCluster);
+ assert(pCluster->GetIndex() >= 0);
+ assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
+
+ const long long t = pCluster->GetTime();
+
+ if (t <= time_ns)
+ lo = mid + 1;
+ else
+ hi = mid;
+
+ assert(lo <= hi);
+ }
+
+ assert(lo == hi);
+ assert(lo > i);
+ assert(lo <= j);
+
+ while (lo > i) {
+ pCluster = *--lo;
+ assert(pCluster);
+ assert(pCluster->GetTime() <= time_ns);
+
+ pResult = pCluster->GetEntry(this);
+
+ if ((pResult != 0) && !pResult->EOS())
+ return 0;
+
+ // landed on empty cluster (no entries)
+ }
+
+ pResult = GetEOS(); // weird
+ return 0;
+}
+
+const ContentEncoding* Track::GetContentEncodingByIndex(
+ unsigned long idx) const {
+ const ptrdiff_t count =
+ content_encoding_entries_end_ - content_encoding_entries_;
+ assert(count >= 0);
+
+ if (idx >= static_cast<unsigned long>(count))
+ return NULL;
+
+ return content_encoding_entries_[idx];
+}
+
+unsigned long Track::GetContentEncodingCount() const {
+ const ptrdiff_t count =
+ content_encoding_entries_end_ - content_encoding_entries_;
+ assert(count >= 0);
+
+ return static_cast<unsigned long>(count);
+}
+
+long Track::ParseContentEncodingsEntry(long long start, long long size) {
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+ assert(pReader);
+
+ long long pos = start;
+ const long long stop = start + size;
+
+ // Count ContentEncoding elements.
+ int count = 0;
+ while (pos < stop) {
+ long long id, size;
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+ if (status < 0) // error
+ return status;
+
+ // pos now designates start of element
+ if (id == libwebm::kMkvContentEncoding)
+ ++count;
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (count <= 0)
+ return -1;
+
+ content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count];
+ if (!content_encoding_entries_)
+ return -1;
+
+ content_encoding_entries_end_ = content_encoding_entries_;
+
+ pos = start;
+ while (pos < stop) {
+ long long id, size;
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+ if (status < 0) // error
+ return status;
+
+ // pos now designates start of element
+ if (id == libwebm::kMkvContentEncoding) {
+ ContentEncoding* const content_encoding =
+ new (std::nothrow) ContentEncoding();
+ if (!content_encoding)
+ return -1;
+
+ status = content_encoding->ParseContentEncodingEntry(pos, size, pReader);
+ if (status) {
+ delete content_encoding;
+ return status;
+ }
+
+ *content_encoding_entries_end_++ = content_encoding;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0;
+}
+
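+// EOSBlock is the sentinel entry returned by Track::GetEOS(): it belongs
+// to no cluster, reports kBlockEOS, and has no underlying Block.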
+Track::EOSBlock::EOSBlock() : BlockEntry(NULL, LONG_MIN) {}
+
+BlockEntry::Kind Track::EOSBlock::GetKind() const { return kBlockEOS; }
+
+const Block* Track::EOSBlock::GetBlock() const { return NULL; }
+
+bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos,
+ long long value_size, bool is_x,
+ PrimaryChromaticity** chromaticity) {
+ if (!reader)
+ return false;
+
+ if (!*chromaticity)
+ *chromaticity = new PrimaryChromaticity();
+
+ if (!*chromaticity)
+ return false;
+
+ PrimaryChromaticity* pc = *chromaticity;
+ float* value = is_x ? &pc->x : &pc->y;
+
+ double parser_value = 0;
+ const long long parse_status =
+ UnserializeFloat(reader, read_pos, value_size, parser_value);
+
+ // Valid range is [0, 1]. Make sure the double is representable as a float
+ // before casting.
+ if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 ||
+ (parser_value > 0.0 && parser_value < FLT_MIN))
+ return false;
+
+ *value = static_cast<float>(parser_value);
+
+ return true;
+}
+
+bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
+ long long mm_size, MasteringMetadata** mm) {
+ if (!reader || *mm)
+ return false;
+
+ std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+ if (!mm_ptr.get())
+ return false;
+
+ const long long mm_end = mm_start + mm_size;
+ long long read_pos = mm_start;
+
+ while (read_pos < mm_end) {
+ long long child_id = 0;
+ long long child_size = 0;
+
+ const long long status =
+ ParseElementHeader(reader, read_pos, mm_end, child_id, child_size);
+ if (status < 0)
+ return false;
+
+ if (child_id == libwebm::kMkvLuminanceMax) {
+ double value = 0;
+ const long long value_parse_status =
+ UnserializeFloat(reader, read_pos, child_size, value);
+ if (value < -FLT_MAX || value > FLT_MAX ||
+ (value > 0.0 && value < FLT_MIN)) {
+ return false;
+ }
+ mm_ptr->luminance_max = static_cast<float>(value);
+ if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 ||
+ mm_ptr->luminance_max > 9999.99) {
+ return false;
+ }
+ } else if (child_id == libwebm::kMkvLuminanceMin) {
+ double value = 0;
+ const long long value_parse_status =
+ UnserializeFloat(reader, read_pos, child_size, value);
+ if (value < -FLT_MAX || value > FLT_MAX ||
+ (value > 0.0 && value < FLT_MIN)) {
+ return false;
+ }
+ mm_ptr->luminance_min = static_cast<float>(value);
+ if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 ||
+ mm_ptr->luminance_min > 999.9999) {
+ return false;
+ }
+ } else {
+ bool is_x = false;
+ PrimaryChromaticity** chromaticity;
+ switch (child_id) {
+ case libwebm::kMkvPrimaryRChromaticityX:
+ case libwebm::kMkvPrimaryRChromaticityY:
+ is_x = child_id == libwebm::kMkvPrimaryRChromaticityX;
+ chromaticity = &mm_ptr->r;
+ break;
+ case libwebm::kMkvPrimaryGChromaticityX:
+ case libwebm::kMkvPrimaryGChromaticityY:
+ is_x = child_id == libwebm::kMkvPrimaryGChromaticityX;
+ chromaticity = &mm_ptr->g;
+ break;
+ case libwebm::kMkvPrimaryBChromaticityX:
+ case libwebm::kMkvPrimaryBChromaticityY:
+ is_x = child_id == libwebm::kMkvPrimaryBChromaticityX;
+ chromaticity = &mm_ptr->b;
+ break;
+ case libwebm::kMkvWhitePointChromaticityX:
+ case libwebm::kMkvWhitePointChromaticityY:
+ is_x = child_id == libwebm::kMkvWhitePointChromaticityX;
+ chromaticity = &mm_ptr->white_point;
+ break;
+ default:
+ return false;
+ }
+ const bool value_parse_status = PrimaryChromaticity::Parse(
+ reader, read_pos, child_size, is_x, chromaticity);
+ if (!value_parse_status)
+ return false;
+ }
+
+ read_pos += child_size;
+ if (read_pos > mm_end)
+ return false;
+ }
+
+ *mm = mm_ptr.release();
+ return true;
+}
+
+bool Colour::Parse(IMkvReader* reader, long long colour_start,
+ long long colour_size, Colour** colour) {
+ if (!reader || *colour)
+ return false;
+
+ std::unique_ptr<Colour> colour_ptr(new Colour());
+ if (!colour_ptr.get())
+ return false;
+
+ const long long colour_end = colour_start + colour_size;
+ long long read_pos = colour_start;
+
+ while (read_pos < colour_end) {
+ long long child_id = 0;
+ long long child_size = 0;
+
+ const long status =
+ ParseElementHeader(reader, read_pos, colour_end, child_id, child_size);
+ if (status < 0)
+ return false;
+
+ if (child_id == libwebm::kMkvMatrixCoefficients) {
+ colour_ptr->matrix_coefficients =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->matrix_coefficients < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvBitsPerChannel) {
+ colour_ptr->bits_per_channel =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->bits_per_channel < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvChromaSubsamplingHorz) {
+ colour_ptr->chroma_subsampling_horz =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->chroma_subsampling_horz < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvChromaSubsamplingVert) {
+ colour_ptr->chroma_subsampling_vert =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->chroma_subsampling_vert < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvCbSubsamplingHorz) {
+ colour_ptr->cb_subsampling_horz =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->cb_subsampling_horz < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvCbSubsamplingVert) {
+ colour_ptr->cb_subsampling_vert =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->cb_subsampling_vert < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvChromaSitingHorz) {
+ colour_ptr->chroma_siting_horz =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->chroma_siting_horz < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvChromaSitingVert) {
+ colour_ptr->chroma_siting_vert =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->chroma_siting_vert < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvRange) {
+ colour_ptr->range = UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->range < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvTransferCharacteristics) {
+ colour_ptr->transfer_characteristics =
+ UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->transfer_characteristics < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvPrimaries) {
+ colour_ptr->primaries = UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->primaries < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvMaxCLL) {
+ colour_ptr->max_cll = UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->max_cll < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvMaxFALL) {
+ colour_ptr->max_fall = UnserializeUInt(reader, read_pos, child_size);
+ if (colour_ptr->max_fall < 0)
+ return false;
+ } else if (child_id == libwebm::kMkvMasteringMetadata) {
+ if (!MasteringMetadata::Parse(reader, read_pos, child_size,
+ &colour_ptr->mastering_metadata))
+ return false;
+ } else {
+ return false;
+ }
+
+ read_pos += child_size;
+ if (read_pos > colour_end)
+ return false;
+ }
+ *colour = colour_ptr.release();
+ return true;
+}
+
+bool Projection::Parse(IMkvReader* reader, long long start, long long size,
+ Projection** projection) {
+ if (!reader || *projection)
+ return false;
+
+ std::unique_ptr<Projection> projection_ptr(new Projection());
+ if (!projection_ptr.get())
+ return false;
+
+ const long long end = start + size;
+ long long read_pos = start;
+
+ while (read_pos < end) {
+ long long child_id = 0;
+ long long child_size = 0;
+
+ const long long status =
+ ParseElementHeader(reader, read_pos, end, child_id, child_size);
+ if (status < 0)
+ return false;
+
+ if (child_id == libwebm::kMkvProjectionType) {
+ long long projection_type = kTypeNotPresent;
+ projection_type = UnserializeUInt(reader, read_pos, child_size);
+ if (projection_type < 0)
+ return false;
+
+ projection_ptr->type = static_cast<ProjectionType>(projection_type);
+ } else if (child_id == libwebm::kMkvProjectionPrivate) {
+ unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
+
+ if (data == NULL)
+ return false;
+
+ const int status =
+ reader->Read(read_pos, static_cast<long>(child_size), data);
+
+ if (status) {
+ delete[] data;
+ return false;
+ }
+
+ projection_ptr->private_data = data;
+ projection_ptr->private_data_length = static_cast<size_t>(child_size);
+ } else {
+ double value = 0;
+ const long long value_parse_status =
+ UnserializeFloat(reader, read_pos, child_size, value);
+ // Make sure value is representable as a float before casting.
+ if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+ (value > 0.0 && value < FLT_MIN)) {
+ return false;
+ }
+
+ switch (child_id) {
+ case libwebm::kMkvProjectionPoseYaw:
+ projection_ptr->pose_yaw = static_cast<float>(value);
+ break;
+ case libwebm::kMkvProjectionPosePitch:
+ projection_ptr->pose_pitch = static_cast<float>(value);
+ break;
+ case libwebm::kMkvProjectionPoseRoll:
+ projection_ptr->pose_roll = static_cast<float>(value);
+ break;
+ default:
+ return false;
+ }
+ }
+
+ read_pos += child_size;
+ if (read_pos > end)
+ return false;
+ }
+
+ *projection = projection_ptr.release();
+ return true;
+}
+
+VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
+ long long element_size)
+ : Track(pSegment, element_start, element_size),
+ m_colour(NULL),
+ m_projection(NULL) {}
+
+VideoTrack::~VideoTrack() {
+ delete m_colour;
+ delete m_projection;
+}
+
+long VideoTrack::Parse(Segment* pSegment, const Info& info,
+ long long element_start, long long element_size,
+ VideoTrack*& pResult) {
+ if (pResult)
+ return -1;
+
+ if (info.type != Track::kVideo)
+ return -1;
+
+ long long width = 0;
+ long long height = 0;
+ long long display_width = 0;
+ long long display_height = 0;
+ long long display_unit = 0;
+ long long stereo_mode = 0;
+
+ double rate = 0.0;
+
+ IMkvReader* const pReader = pSegment->m_pReader;
+
+ const Settings& s = info.settings;
+ assert(s.start >= 0);
+ assert(s.size >= 0);
+
+ long long pos = s.start;
+ assert(pos >= 0);
+
+ const long long stop = pos + s.size;
+
+ Colour* colour = NULL;
+ Projection* projection = NULL;
+
+ while (pos < stop) {
+ long long id, size;
+
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvPixelWidth) {
+ width = UnserializeUInt(pReader, pos, size);
+
+ if (width <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvPixelHeight) {
+ height = UnserializeUInt(pReader, pos, size);
+
+ if (height <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvDisplayWidth) {
+ display_width = UnserializeUInt(pReader, pos, size);
+
+ if (display_width <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvDisplayHeight) {
+ display_height = UnserializeUInt(pReader, pos, size);
+
+ if (display_height <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvDisplayUnit) {
+ display_unit = UnserializeUInt(pReader, pos, size);
+
+ if (display_unit < 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvStereoMode) {
+ stereo_mode = UnserializeUInt(pReader, pos, size);
+
+ if (stereo_mode < 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvFrameRate) {
+ const long status = UnserializeFloat(pReader, pos, size, rate);
+
+ if (status < 0)
+ return status;
+
+ if (rate <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvColour) {
+ if (!Colour::Parse(pReader, pos, size, &colour))
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvProjection) {
+ if (!Projection::Parse(pReader, pos, size, &projection))
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ VideoTrack* const pTrack =
+ new (std::nothrow) VideoTrack(pSegment, element_start, element_size);
+
+ if (pTrack == NULL)
+ return -1; // generic error
+
+ const int status = info.Copy(pTrack->m_info);
+
+ if (status) { // error
+ delete pTrack;
+ return status;
+ }
+
+ pTrack->m_width = width;
+ pTrack->m_height = height;
+ pTrack->m_display_width = display_width;
+ pTrack->m_display_height = display_height;
+ pTrack->m_display_unit = display_unit;
+ pTrack->m_stereo_mode = stereo_mode;
+ pTrack->m_rate = rate;
+ pTrack->m_colour = colour;
+ pTrack->m_projection = projection;
+
+ pResult = pTrack;
+ return 0; // success
+}
+
+bool VideoTrack::VetEntry(const BlockEntry* pBlockEntry) const {
+ return Track::VetEntry(pBlockEntry) && pBlockEntry->GetBlock()->IsKey();
+}
+
+long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const {
+ const long status = GetFirst(pResult);
+
+ if (status < 0) // buffer underflow, etc
+ return status;
+
+ assert(pResult);
+
+ if (pResult->EOS())
+ return 0;
+
+ const Cluster* pCluster = pResult->GetCluster();
+ assert(pCluster);
+ assert(pCluster->GetIndex() >= 0);
+
+ if (time_ns <= pResult->GetBlock()->GetTime(pCluster))
+ return 0;
+
+ Cluster** const clusters = m_pSegment->m_clusters;
+ assert(clusters);
+
+ const long count = m_pSegment->GetCount(); // loaded only, not pre-loaded
+ assert(count > 0);
+
+ Cluster** const i = clusters + pCluster->GetIndex();
+ assert(i);
+ assert(*i == pCluster);
+ assert(pCluster->GetTime() <= time_ns);
+
+ Cluster** const j = clusters + count;
+
+ Cluster** lo = i;
+ Cluster** hi = j;
+
+ while (lo < hi) {
+ // INVARIANT:
+ // [i, lo) <= time_ns
+ // [lo, hi) ?
+ // [hi, j) > time_ns
+
+ Cluster** const mid = lo + (hi - lo) / 2;
+ assert(mid < hi);
+
+ pCluster = *mid;
+ assert(pCluster);
+ assert(pCluster->GetIndex() >= 0);
+ assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters));
+
+ const long long t = pCluster->GetTime();
+
+ if (t <= time_ns)
+ lo = mid + 1;
+ else
+ hi = mid;
+
+ assert(lo <= hi);
+ }
+
+ assert(lo == hi);
+ assert(lo > i);
+ assert(lo <= j);
+
+ pCluster = *--lo;
+ assert(pCluster);
+ assert(pCluster->GetTime() <= time_ns);
+
+ pResult = pCluster->GetEntry(this, time_ns);
+
+ if ((pResult != 0) && !pResult->EOS()) // found a keyframe
+ return 0;
+
+ while (lo != i) {
+ pCluster = *--lo;
+ assert(pCluster);
+ assert(pCluster->GetTime() <= time_ns);
+
+ pResult = pCluster->GetEntry(this, time_ns);
+
+ if ((pResult != 0) && !pResult->EOS())
+ return 0;
+ }
+
+ // weird: we're on the first cluster, but no keyframe was found;
+ // this should never happen, but we must return something anyway
+
+ pResult = GetEOS();
+ return 0;
+}
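+
+// A minimal caller-side sketch (illustrative only; pVideoTrack and target_ns
+// are assumed to come from the application, not from this file): seeking a
+// video track to a timestamp in nanoseconds.
+//
+//   const BlockEntry* pEntry = NULL;
+//   if (pVideoTrack->Seek(target_ns, pEntry) == 0 && pEntry && !pEntry->EOS()) {
+//     const Block* const pBlock = pEntry->GetBlock();
+//     // pBlock is (roughly) the last keyframe at or before target_ns.
+//   }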
+
+Colour* VideoTrack::GetColour() const { return m_colour; }
+
+Projection* VideoTrack::GetProjection() const { return m_projection; }
+
+long long VideoTrack::GetWidth() const { return m_width; }
+
+long long VideoTrack::GetHeight() const { return m_height; }
+
+long long VideoTrack::GetDisplayWidth() const {
+ return m_display_width > 0 ? m_display_width : GetWidth();
+}
+
+long long VideoTrack::GetDisplayHeight() const {
+ return m_display_height > 0 ? m_display_height : GetHeight();
+}
+
+long long VideoTrack::GetDisplayUnit() const { return m_display_unit; }
+
+long long VideoTrack::GetStereoMode() const { return m_stereo_mode; }
+
+double VideoTrack::GetFrameRate() const { return m_rate; }
+
+AudioTrack::AudioTrack(Segment* pSegment, long long element_start,
+ long long element_size)
+ : Track(pSegment, element_start, element_size) {}
+
+long AudioTrack::Parse(Segment* pSegment, const Info& info,
+ long long element_start, long long element_size,
+ AudioTrack*& pResult) {
+ if (pResult)
+ return -1;
+
+ if (info.type != Track::kAudio)
+ return -1;
+
+ IMkvReader* const pReader = pSegment->m_pReader;
+
+ const Settings& s = info.settings;
+ assert(s.start >= 0);
+ assert(s.size >= 0);
+
+ long long pos = s.start;
+ assert(pos >= 0);
+
+ const long long stop = pos + s.size;
+
+ double rate = 8000.0; // MKV default
+ long long channels = 1;
+ long long bit_depth = 0;
+
+ while (pos < stop) {
+ long long id, size;
+
+ long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (id == libwebm::kMkvSamplingFrequency) {
+ status = UnserializeFloat(pReader, pos, size, rate);
+
+ if (status < 0)
+ return status;
+
+ if (rate <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvChannels) {
+ channels = UnserializeUInt(pReader, pos, size);
+
+ if (channels <= 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvBitDepth) {
+ bit_depth = UnserializeUInt(pReader, pos, size);
+
+ if (bit_depth <= 0)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ AudioTrack* const pTrack =
+ new (std::nothrow) AudioTrack(pSegment, element_start, element_size);
+
+ if (pTrack == NULL)
+ return -1; // generic error
+
+ const int status = info.Copy(pTrack->m_info);
+
+ if (status) {
+ delete pTrack;
+ return status;
+ }
+
+ pTrack->m_rate = rate;
+ pTrack->m_channels = channels;
+ pTrack->m_bitDepth = bit_depth;
+
+ pResult = pTrack;
+ return 0; // success
+}
+
+double AudioTrack::GetSamplingRate() const { return m_rate; }
+
+long long AudioTrack::GetChannels() const { return m_channels; }
+
+long long AudioTrack::GetBitDepth() const { return m_bitDepth; }
+
+Tracks::Tracks(Segment* pSegment, long long start, long long size_,
+ long long element_start, long long element_size)
+ : m_pSegment(pSegment),
+ m_start(start),
+ m_size(size_),
+ m_element_start(element_start),
+ m_element_size(element_size),
+ m_trackEntries(NULL),
+ m_trackEntriesEnd(NULL) {}
+
+long Tracks::Parse() {
+ assert(m_trackEntries == NULL);
+ assert(m_trackEntriesEnd == NULL);
+
+ const long long stop = m_start + m_size;
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ int count = 0;
+ long long pos = m_start;
+
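+ // First pass: only count the TrackEntry children here, so that
+ // m_trackEntries can be sized correctly before the full parse below.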
+ while (pos < stop) {
+ long long id, size;
+
+ const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size == 0) // weird
+ continue;
+
+ if (id == libwebm::kMkvTrackEntry)
+ ++count;
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if (count <= 0)
+ return 0; // success
+
+ m_trackEntries = new (std::nothrow) Track*[count];
+
+ if (m_trackEntries == NULL)
+ return -1;
+
+ m_trackEntriesEnd = m_trackEntries;
+
+ pos = m_start;
+
+ while (pos < stop) {
+ const long long element_start = pos;
+
+ long long id, payload_size;
+
+ const long status =
+ ParseElementHeader(pReader, pos, stop, id, payload_size);
+
+ if (status < 0) // error
+ return status;
+
+ if (payload_size == 0) // weird
+ continue;
+
+ const long long payload_stop = pos + payload_size;
+ assert(payload_stop <= stop); // checked in ParseElementHeader
+
+ const long long element_size = payload_stop - element_start;
+
+ if (id == libwebm::kMkvTrackEntry) {
+ Track*& pTrack = *m_trackEntriesEnd;
+ pTrack = NULL;
+
+ const long status = ParseTrackEntry(pos, payload_size, element_start,
+ element_size, pTrack);
+ if (status)
+ return status;
+
+ if (pTrack)
+ ++m_trackEntriesEnd;
+ }
+
+ pos = payload_stop;
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ return 0; // success
+}
+
+unsigned long Tracks::GetTracksCount() const {
+ const ptrdiff_t result = m_trackEntriesEnd - m_trackEntries;
+ assert(result >= 0);
+
+ return static_cast<unsigned long>(result);
+}
+
+long Tracks::ParseTrackEntry(long long track_start, long long track_size,
+ long long element_start, long long element_size,
+ Track*& pResult) const {
+ if (pResult)
+ return -1;
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long pos = track_start;
+ const long long track_stop = track_start + track_size;
+
+ Track::Info info;
+
+ info.type = 0;
+ info.number = 0;
+ info.uid = 0;
+ info.defaultDuration = 0;
+
+ Track::Settings v;
+ v.start = -1;
+ v.size = -1;
+
+ Track::Settings a;
+ a.start = -1;
+ a.size = -1;
+
+ Track::Settings e; // content_encodings_settings;
+ e.start = -1;
+ e.size = -1;
+
+ long long lacing = 1; // default is true
+
+ while (pos < track_stop) {
+ long long id, size;
+
+ const long status = ParseElementHeader(pReader, pos, track_stop, id, size);
+
+ if (status < 0) // error
+ return status;
+
+ if (size < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long start = pos;
+
+ if (id == libwebm::kMkvVideo) {
+ v.start = start;
+ v.size = size;
+ } else if (id == libwebm::kMkvAudio) {
+ a.start = start;
+ a.size = size;
+ } else if (id == libwebm::kMkvContentEncodings) {
+ e.start = start;
+ e.size = size;
+ } else if (id == libwebm::kMkvTrackUID) {
+ if (size > 8)
+ return E_FILE_FORMAT_INVALID;
+
+ info.uid = 0;
+
+ long long pos_ = start;
+ const long long pos_end = start + size;
+
+ while (pos_ != pos_end) {
+ unsigned char b;
+
+ const int status = pReader->Read(pos_, 1, &b);
+
+ if (status)
+ return status;
+
+ info.uid <<= 8;
+ info.uid |= b;
+
+ ++pos_;
+ }
+ } else if (id == libwebm::kMkvTrackNumber) {
+ const long long num = UnserializeUInt(pReader, pos, size);
+
+ if ((num <= 0) || (num > 127))
+ return E_FILE_FORMAT_INVALID;
+
+ info.number = static_cast<long>(num);
+ } else if (id == libwebm::kMkvTrackType) {
+ const long long type = UnserializeUInt(pReader, pos, size);
+
+ if ((type <= 0) || (type > 254))
+ return E_FILE_FORMAT_INVALID;
+
+ info.type = static_cast<long>(type);
+ } else if (id == libwebm::kMkvName) {
+ const long status =
+ UnserializeString(pReader, pos, size, info.nameAsUTF8);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvLanguage) {
+ const long status = UnserializeString(pReader, pos, size, info.language);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvDefaultDuration) {
+ const long long duration = UnserializeUInt(pReader, pos, size);
+
+ if (duration < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ info.defaultDuration = static_cast<unsigned long long>(duration);
+ } else if (id == libwebm::kMkvCodecID) {
+ const long status = UnserializeString(pReader, pos, size, info.codecId);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvFlagLacing) {
+ lacing = UnserializeUInt(pReader, pos, size);
+
+ if ((lacing < 0) || (lacing > 1))
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvCodecPrivate) {
+ delete[] info.codecPrivate;
+ info.codecPrivate = NULL;
+ info.codecPrivateSize = 0;
+
+ const size_t buflen = static_cast<size_t>(size);
+
+ if (buflen) {
+ unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen);
+
+ if (buf == NULL)
+ return -1;
+
+ const int status = pReader->Read(pos, static_cast<long>(buflen), buf);
+
+ if (status) {
+ delete[] buf;
+ return status;
+ }
+
+ info.codecPrivate = buf;
+ info.codecPrivateSize = buflen;
+ }
+ } else if (id == libwebm::kMkvCodecName) {
+ const long status =
+ UnserializeString(pReader, pos, size, info.codecNameAsUTF8);
+
+ if (status)
+ return status;
+ } else if (id == libwebm::kMkvCodecDelay) {
+ info.codecDelay = UnserializeUInt(pReader, pos, size);
+ } else if (id == libwebm::kMkvSeekPreRoll) {
+ info.seekPreRoll = UnserializeUInt(pReader, pos, size);
+ }
+
+ pos += size; // consume payload
+ if (pos > track_stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != track_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if (info.number <= 0) // not specified
+ return E_FILE_FORMAT_INVALID;
+
+ if (GetTrackByNumber(info.number))
+ return E_FILE_FORMAT_INVALID;
+
+ if (info.type <= 0) // not specified
+ return E_FILE_FORMAT_INVALID;
+
+ info.lacing = (lacing > 0) ? true : false;
+
+ if (info.type == Track::kVideo) {
+ if (v.start < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (a.start >= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ info.settings = v;
+
+ VideoTrack* pTrack = NULL;
+
+ const long status = VideoTrack::Parse(m_pSegment, info, element_start,
+ element_size, pTrack);
+
+ if (status)
+ return status;
+
+ pResult = pTrack;
+ assert(pResult);
+
+ if (e.start >= 0)
+ pResult->ParseContentEncodingsEntry(e.start, e.size);
+ } else if (info.type == Track::kAudio) {
+ if (a.start < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (v.start >= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ info.settings = a;
+
+ AudioTrack* pTrack = NULL;
+
+ const long status = AudioTrack::Parse(m_pSegment, info, element_start,
+ element_size, pTrack);
+
+ if (status)
+ return status;
+
+ pResult = pTrack;
+ assert(pResult);
+
+ if (e.start >= 0)
+ pResult->ParseContentEncodingsEntry(e.start, e.size);
+ } else {
+ // neither video nor audio - probably metadata or subtitles
+
+ if (a.start >= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (v.start >= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (info.type == Track::kMetadata && e.start >= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ info.settings.start = -1;
+ info.settings.size = 0;
+
+ Track* pTrack = NULL;
+
+ const long status =
+ Track::Create(m_pSegment, info, element_start, element_size, pTrack);
+
+ if (status)
+ return status;
+
+ pResult = pTrack;
+ assert(pResult);
+ }
+
+ return 0; // success
+}
+
+Tracks::~Tracks() {
+ Track** i = m_trackEntries;
+ Track** const j = m_trackEntriesEnd;
+
+ while (i != j) {
+ Track* const pTrack = *i++;
+ delete pTrack;
+ }
+
+ delete[] m_trackEntries;
+}
+
+const Track* Tracks::GetTrackByNumber(long tn) const {
+ if (tn < 0)
+ return NULL;
+
+ Track** i = m_trackEntries;
+ Track** const j = m_trackEntriesEnd;
+
+ while (i != j) {
+ Track* const pTrack = *i++;
+
+ if (pTrack == NULL)
+ continue;
+
+ if (tn == pTrack->GetNumber())
+ return pTrack;
+ }
+
+ return NULL; // not found
+}
+
+const Track* Tracks::GetTrackByIndex(unsigned long idx) const {
+ const ptrdiff_t count = m_trackEntriesEnd - m_trackEntries;
+
+ if (idx >= static_cast<unsigned long>(count))
+ return NULL;
+
+ return m_trackEntries[idx];
+}
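+
+// A minimal caller-side sketch (illustrative only; pTracks is assumed to be
+// the result of Segment::GetTracks()): enumerate the parsed tracks and pick
+// out the first video track.
+//
+//   const unsigned long n = pTracks->GetTracksCount();
+//   for (unsigned long i = 0; i < n; ++i) {
+//     const Track* const pTrack = pTracks->GetTrackByIndex(i);
+//     if (pTrack && pTrack->GetType() == Track::kVideo) {
+//       const VideoTrack* const pVideo =
+//           static_cast<const VideoTrack*>(pTrack);
+//       // e.g. pVideo->GetWidth(), pVideo->GetHeight(), ...
+//       break;
+//     }
+//   }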
+
+long Cluster::Load(long long& pos, long& len) const {
+ if (m_pSegment == NULL)
+ return E_PARSE_FAILED;
+
+ if (m_timecode >= 0) // at least partially loaded
+ return 0;
+
+ if (m_pos != m_element_start || m_element_size >= 0)
+ return E_PARSE_FAILED;
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+ long long total, avail;
+ const int status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ if (total >= 0 && (avail > total || m_pos > total))
+ return E_FILE_FORMAT_INVALID;
+
+ pos = m_pos;
+
+ long long cluster_size = -1;
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error or underflow
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id_ = ReadID(pReader, pos, len);
+
+ if (id_ < 0) // error
+ return static_cast<long>(id_);
+
+ if (id_ != libwebm::kMkvCluster)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume id
+
+ // read cluster size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ if (size == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume length of size of element
+
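+ // An EBML data-size field whose value bits are all ones means "size
+ // unknown"; for a len-byte field that value is 2^(7*len) - 1, which is
+ // what unknown_size computes.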
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size != unknown_size)
+ cluster_size = size;
+
+ // pos points to start of payload
+ long long timecode = -1;
+ long long new_pos = -1;
+ bool bBlock = false;
+
+ long long cluster_stop = (cluster_size < 0) ? -1 : pos + cluster_size;
+
+ for (;;) {
+ if ((cluster_stop >= 0) && (pos >= cluster_stop))
+ break;
+
+ // Parse ID
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0) // error
+ return static_cast<long>(id);
+
+ if (id == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // This is the distinguished set of IDs we use to determine
+ // that we have exhausted the sub-elements inside the cluster
+ // whose ID we parsed earlier.
+
+ if (id == libwebm::kMkvCluster)
+ break;
+
+ if (id == libwebm::kMkvCues)
+ break;
+
+ pos += len; // consume ID field
+
+ // Parse Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume size field
+
+ if ((cluster_stop >= 0) && (pos > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ // pos now points to start of payload
+
+ if (size == 0)
+ continue;
+
+ if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if (id == libwebm::kMkvTimecode) {
+ len = static_cast<long>(size);
+
+ if ((pos + size) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ timecode = UnserializeUInt(pReader, pos, size);
+
+ if (timecode < 0) // error (or underflow)
+ return static_cast<long>(timecode);
+
+ new_pos = pos + size;
+
+ if (bBlock)
+ break;
+ } else if (id == libwebm::kMkvBlockGroup) {
+ bBlock = true;
+ break;
+ } else if (id == libwebm::kMkvSimpleBlock) {
+ bBlock = true;
+ break;
+ }
+
+ pos += size; // consume payload
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if (timecode < 0) // no timecode found
+ return E_FILE_FORMAT_INVALID;
+
+ if (!bBlock)
+ return E_FILE_FORMAT_INVALID;
+
+ m_pos = new_pos; // designates position just beyond timecode payload
+ m_timecode = timecode; // m_timecode >= 0 means we're partially loaded
+
+ if (cluster_size >= 0)
+ m_element_size = cluster_stop - m_element_start;
+
+ return 0;
+}
+
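+// Load() above reads only the cluster header (ID, size and timecode);
+// Parse() below consumes the cluster's block entries incrementally, one
+// SimpleBlock or BlockGroup per call, returning 1 once no entries remain.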
+long Cluster::Parse(long long& pos, long& len) const {
+ long status = Load(pos, len);
+
+ if (status < 0)
+ return status;
+
+ if (m_pos < m_element_start || m_timecode < 0)
+ return E_PARSE_FAILED;
+
+ const long long cluster_stop =
+ (m_element_size < 0) ? -1 : m_element_start + m_element_size;
+
+ if ((cluster_stop >= 0) && (m_pos >= cluster_stop))
+ return 1; // nothing else to do
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long total, avail;
+
+ status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ if (total >= 0 && avail > total)
+ return E_FILE_FORMAT_INVALID;
+
+ pos = m_pos;
+
+ for (;;) {
+ if ((cluster_stop >= 0) && (pos >= cluster_stop))
+ break;
+
+ if ((total >= 0) && (pos >= total)) {
+ if (m_element_size < 0)
+ m_element_size = pos - m_element_start;
+
+ break;
+ }
+
+ // Parse ID
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ // This is the distinguished set of IDs we use to determine
+ // that we have exhausted the sub-elements inside the cluster
+ // whose ID we parsed earlier.
+
+ if ((id == libwebm::kMkvCluster) || (id == libwebm::kMkvCues)) {
+ if (m_element_size < 0)
+ m_element_size = pos - m_element_start;
+
+ break;
+ }
+
+ pos += len; // consume ID field
+
+ // Parse Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0)
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume size field
+
+ if ((cluster_stop >= 0) && (pos > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ // pos now points to start of payload
+
+ if (size == 0)
+ continue;
+
+ // const long long block_start = pos;
+ const long long block_stop = pos + size;
+
+ if (cluster_stop >= 0) {
+ if (block_stop > cluster_stop) {
+ if (id == libwebm::kMkvBlockGroup || id == libwebm::kMkvSimpleBlock) {
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ pos = cluster_stop;
+ break;
+ }
+ } else if ((total >= 0) && (block_stop > total)) {
+ m_element_size = total - m_element_start;
+ pos = total;
+ break;
+ } else if (block_stop > avail) {
+ len = static_cast<long>(size);
+ return E_BUFFER_NOT_FULL;
+ }
+
+ Cluster* const this_ = const_cast<Cluster*>(this);
+
+ if (id == libwebm::kMkvBlockGroup)
+ return this_->ParseBlockGroup(size, pos, len);
+
+ if (id == libwebm::kMkvSimpleBlock)
+ return this_->ParseSimpleBlock(size, pos, len);
+
+ pos += size; // consume payload
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (m_element_size < 1)
+ return E_FILE_FORMAT_INVALID;
+
+ m_pos = pos;
+ if (cluster_stop >= 0 && m_pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if (m_entries_count > 0) {
+ const long idx = m_entries_count - 1;
+
+ const BlockEntry* const pLast = m_entries[idx];
+ if (pLast == NULL)
+ return E_PARSE_FAILED;
+
+ const Block* const pBlock = pLast->GetBlock();
+ if (pBlock == NULL)
+ return E_PARSE_FAILED;
+
+ const long long start = pBlock->m_start;
+
+ if ((total >= 0) && (start > total))
+ return E_PARSE_FAILED; // defend against a truncated stream
+
+ const long long size = pBlock->m_size;
+
+ const long long stop = start + size;
+ if (cluster_stop >= 0 && stop > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && (stop > total))
+ return E_PARSE_FAILED; // defend against a truncated stream
+ }
+
+ return 1; // no more entries
+}
+
+long Cluster::ParseSimpleBlock(long long block_size, long long& pos,
+ long& len) {
+ const long long block_start = pos;
+ const long long block_stop = pos + block_size;
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long total, avail;
+
+ long status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
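+ // A SimpleBlock payload starts with the track number (an EBML varint),
+ // followed by a 2-byte relative timecode and a flags byte; the lacing
+ // mode lives in bits 1-2 of the flags.
+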
+ // parse track number
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long track = ReadUInt(pReader, pos, len);
+
+ if (track < 0) // error
+ return static_cast<long>(track);
+
+ if (track == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume track number
+
+ if ((pos + 2) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 2) > avail) {
+ len = 2;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ pos += 2; // consume timecode
+
+ if ((pos + 1) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ unsigned char flags;
+
+ status = pReader->Read(pos, 1, &flags);
+
+ if (status < 0) { // error or underflow
+ len = 1;
+ return status;
+ }
+
+ ++pos; // consume flags byte
+ assert(pos <= avail);
+
+ if (pos >= block_stop)
+ return E_FILE_FORMAT_INVALID;
+
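+ // If the block is laced, the whole payload must already be buffered,
+ // because the lace sizes are parsed later by Block::Parse (reached via
+ // CreateBlock below).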
+ const int lacing = int(flags & 0x06) >> 1;
+
+ if ((lacing != 0) && (block_stop > avail)) {
+ len = static_cast<long>(block_stop - pos);
+ return E_BUFFER_NOT_FULL;
+ }
+
+ status = CreateBlock(libwebm::kMkvSimpleBlock, block_start, block_size,
+ 0); // DiscardPadding
+
+ if (status != 0)
+ return status;
+
+ m_pos = block_stop;
+
+ return 0; // success
+}
+
+long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
+ long& len) {
+ const long long payload_start = pos;
+ const long long payload_stop = pos + payload_size;
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long total, avail;
+
+ long status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
+ if ((total >= 0) && (payload_stop > total))
+ return E_FILE_FORMAT_INVALID;
+
+ if (payload_stop > avail) {
+ len = static_cast<long>(payload_size);
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long discard_padding = 0;
+
+ while (pos < payload_stop) {
+ // parse sub-block element ID
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0) // error
+ return static_cast<long>(id);
+
+ if (id == 0) // not a valid ID
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume ID field
+
+ // Parse Size
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume size field
+
+ // pos now points to start of sub-block group payload
+
+ if (pos > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if (size == 0) // weird
+ continue;
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID;
+
+ if (id == libwebm::kMkvDiscardPadding) {
+ status = UnserializeInt(pReader, pos, size, discard_padding);
+
+ if (status < 0) // error
+ return status;
+ }
+
+ if (id != libwebm::kMkvBlock) {
+ pos += size; // consume sub-part of block group
+
+ if (pos > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ continue;
+ }
+
+ const long long block_stop = pos + size;
+
+ if (block_stop > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ // parse track number
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((pos + len) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long track = ReadUInt(pReader, pos, len);
+
+ if (track < 0) // error
+ return static_cast<long>(track);
+
+ if (track == 0)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume track number
+
+ if ((pos + 2) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 2) > avail) {
+ len = 2;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ pos += 2; // consume timecode
+
+ if ((pos + 1) > block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ unsigned char flags;
+
+ status = pReader->Read(pos, 1, &flags);
+
+ if (status < 0) { // error or underflow
+ len = 1;
+ return status;
+ }
+
+ ++pos; // consume flags byte
+ assert(pos <= avail);
+
+ if (pos >= block_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ const int lacing = int(flags & 0x06) >> 1;
+
+ if ((lacing != 0) && (block_stop > avail)) {
+ len = static_cast<long>(block_stop - pos);
+ return E_BUFFER_NOT_FULL;
+ }
+
+ pos = block_stop; // consume block-part of block group
+ if (pos > payload_stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ if (pos != payload_stop)
+ return E_FILE_FORMAT_INVALID;
+
+ status = CreateBlock(libwebm::kMkvBlockGroup, payload_start, payload_size,
+ discard_padding);
+ if (status != 0)
+ return status;
+
+ m_pos = payload_stop;
+
+ return 0; // success
+}
+
+long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const {
+ assert(m_pos >= m_element_start);
+
+ pEntry = NULL;
+
+ if (index < 0)
+ return -1; // generic error
+
+ if (m_entries_count < 0)
+ return E_BUFFER_NOT_FULL;
+
+ assert(m_entries);
+ assert(m_entries_size > 0);
+ assert(m_entries_count <= m_entries_size);
+
+ if (index < m_entries_count) {
+ pEntry = m_entries[index];
+ assert(pEntry);
+
+ return 1; // found entry
+ }
+
+ if (m_element_size < 0) // we don't know cluster end yet
+ return E_BUFFER_NOT_FULL; // underflow
+
+ const long long element_stop = m_element_start + m_element_size;
+
+ if (m_pos >= element_stop)
+ return 0; // nothing left to parse
+
+ return E_BUFFER_NOT_FULL; // underflow, since more remains to be parsed
+}
+
+Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) {
+ if (!pSegment || off < 0)
+ return NULL;
+
+ const long long element_start = pSegment->m_start + off;
+
+ Cluster* const pCluster =
+ new (std::nothrow) Cluster(pSegment, idx, element_start);
+
+ return pCluster;
+}
+
+Cluster::Cluster()
+ : m_pSegment(NULL),
+ m_element_start(0),
+ m_index(0),
+ m_pos(0),
+ m_element_size(0),
+ m_timecode(0),
+ m_entries(NULL),
+ m_entries_size(0),
+ m_entries_count(0) // means "no entries"
+{}
+
+Cluster::Cluster(Segment* pSegment, long idx, long long element_start
+ /* long long element_size */)
+ : m_pSegment(pSegment),
+ m_element_start(element_start),
+ m_index(idx),
+ m_pos(element_start),
+ m_element_size(-1 /* element_size */),
+ m_timecode(-1),
+ m_entries(NULL),
+ m_entries_size(0),
+ m_entries_count(-1) // means "has not been parsed yet"
+{}
+
+Cluster::~Cluster() {
+ if (m_entries_count <= 0) {
+ delete[] m_entries;
+ return;
+ }
+
+ BlockEntry** i = m_entries;
+ BlockEntry** const j = m_entries + m_entries_count;
+
+ while (i != j) {
+ BlockEntry* p = *i++;
+ assert(p);
+
+ delete p;
+ }
+
+ delete[] m_entries;
+}
+
+bool Cluster::EOS() const { return (m_pSegment == NULL); }
+
+long Cluster::GetIndex() const { return m_index; }
+
+long long Cluster::GetPosition() const {
+ const long long pos = m_element_start - m_pSegment->m_start;
+ assert(pos >= 0);
+
+ return pos;
+}
+
+long long Cluster::GetElementSize() const { return m_element_size; }
+
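+// Returns 1 if the cluster at offset |off| contains at least one SimpleBlock
+// or BlockGroup, 0 if it contains none (or lies beyond the end of the file),
+// and a negative status on error or when more data is needed.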
+long Cluster::HasBlockEntries(
+ const Segment* pSegment,
+ long long off, // relative to start of segment payload
+ long long& pos, long& len) {
+ assert(pSegment);
+ assert(off >= 0); // relative to segment
+
+ IMkvReader* const pReader = pSegment->m_pReader;
+
+ long long total, avail;
+
+ long status = pReader->Length(&total, &avail);
+
+ if (status < 0) // error
+ return status;
+
+ assert((total < 0) || (avail <= total));
+
+ pos = pSegment->m_start + off; // absolute
+
+ if ((total >= 0) && (pos >= total))
+ return 0; // we don't even have a complete cluster
+
+ const long long segment_stop =
+ (pSegment->m_size < 0) ? -1 : pSegment->m_start + pSegment->m_size;
+
+ long long cluster_stop = -1; // interpreted later to mean "unknown size"
+
+ {
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // need more data
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && ((pos + len) > total))
+ return 0;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0) // error
+ return static_cast<long>(id);
+
+ if (id != libwebm::kMkvCluster)
+ return E_PARSE_FAILED;
+
+ pos += len; // consume Cluster ID field
+
+ // read size field
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // weird
+ return E_BUFFER_NOT_FULL;
+
+ if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && ((pos + len) > total))
+ return 0;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ if (size == 0)
+ return 0; // cluster does not have entries
+
+ pos += len; // consume size field
+
+ // pos now points to start of payload
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size != unknown_size) {
+ cluster_stop = pos + size;
+ assert(cluster_stop >= 0);
+
+ if ((segment_stop >= 0) && (cluster_stop > segment_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((total >= 0) && (cluster_stop > total))
+ // return E_FILE_FORMAT_INVALID; // too conservative
+ return 0; // cluster does not have any entries
+ }
+ }
+
+ for (;;) {
+ if ((cluster_stop >= 0) && (pos >= cluster_stop))
+ return 0; // no entries detected
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ long long result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // need more data
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long id = ReadID(pReader, pos, len);
+
+ if (id < 0) // error
+ return static_cast<long>(id);
+
+ // This is the distinguished set of IDs we use to determine
+ // that we have exhausted the sub-elements inside the cluster
+ // whose ID we parsed earlier.
+
+ if (id == libwebm::kMkvCluster)
+ return 0; // no entries found
+
+ if (id == libwebm::kMkvCues)
+ return 0; // no entries found
+
+ pos += len; // consume id field
+
+ if ((cluster_stop >= 0) && (pos >= cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ // read size field
+
+ if ((pos + 1) > avail) {
+ len = 1;
+ return E_BUFFER_NOT_FULL;
+ }
+
+ result = GetUIntLength(pReader, pos, len);
+
+ if (result < 0) // error
+ return static_cast<long>(result);
+
+ if (result > 0) // underflow
+ return E_BUFFER_NOT_FULL;
+
+ if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > avail)
+ return E_BUFFER_NOT_FULL;
+
+ const long long size = ReadUInt(pReader, pos, len);
+
+ if (size < 0) // error
+ return static_cast<long>(size);
+
+ pos += len; // consume size field
+
+ // pos now points to start of payload
+
+ if ((cluster_stop >= 0) && (pos > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if (size == 0) // weird
+ continue;
+
+ const long long unknown_size = (1LL << (7 * len)) - 1;
+
+ if (size == unknown_size)
+ return E_FILE_FORMAT_INVALID; // not supported inside cluster
+
+ if ((cluster_stop >= 0) && ((pos + size) > cluster_stop))
+ return E_FILE_FORMAT_INVALID;
+
+ if (id == libwebm::kMkvBlockGroup)
+ return 1; // have at least one entry
+
+ if (id == libwebm::kMkvSimpleBlock)
+ return 1; // have at least one entry
+
+ pos += size; // consume payload
+ if (cluster_stop >= 0 && pos > cluster_stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+}
+
+long long Cluster::GetTimeCode() const {
+ long long pos;
+ long len;
+
+ const long status = Load(pos, len);
+
+ if (status < 0) // error
+ return status;
+
+ return m_timecode;
+}
+
+long long Cluster::GetTime() const {
+ const long long tc = GetTimeCode();
+
+ if (tc < 0)
+ return tc;
+
+ const SegmentInfo* const pInfo = m_pSegment->GetInfo();
+ assert(pInfo);
+
+ const long long scale = pInfo->GetTimeCodeScale();
+ assert(scale >= 1);
+
+ const long long t = m_timecode * scale;
+
+ return t;
+}
+
+long long Cluster::GetFirstTime() const {
+ const BlockEntry* pEntry;
+
+ const long status = GetFirst(pEntry);
+
+ if (status < 0) // error
+ return status;
+
+ if (pEntry == NULL) // empty cluster
+ return GetTime();
+
+ const Block* const pBlock = pEntry->GetBlock();
+ assert(pBlock);
+
+ return pBlock->GetTime(this);
+}
+
+long long Cluster::GetLastTime() const {
+ const BlockEntry* pEntry;
+
+ const long status = GetLast(pEntry);
+
+ if (status < 0) // error
+ return status;
+
+ if (pEntry == NULL) // empty cluster
+ return GetTime();
+
+ const Block* const pBlock = pEntry->GetBlock();
+ assert(pBlock);
+
+ return pBlock->GetTime(this);
+}
+
+long Cluster::CreateBlock(long long id,
+ long long pos, // absolute pos of payload
+ long long size, long long discard_padding) {
+ if (id != libwebm::kMkvBlockGroup && id != libwebm::kMkvSimpleBlock)
+ return E_PARSE_FAILED;
+
+ if (m_entries_count < 0) { // haven't parsed anything yet
+ assert(m_entries == NULL);
+ assert(m_entries_size == 0);
+
+ m_entries_size = 1024;
+ m_entries = new (std::nothrow) BlockEntry*[m_entries_size];
+ if (m_entries == NULL)
+ return -1;
+
+ m_entries_count = 0;
+ } else {
+ assert(m_entries);
+ assert(m_entries_size > 0);
+ assert(m_entries_count <= m_entries_size);
+
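+ // If the entry array is full, double its capacity and copy the existing
+ // entry pointers over before appending the new block.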
+ if (m_entries_count >= m_entries_size) {
+ const long entries_size = 2 * m_entries_size;
+
+ BlockEntry** const entries = new (std::nothrow) BlockEntry*[entries_size];
+ if (entries == NULL)
+ return -1;
+
+ BlockEntry** src = m_entries;
+ BlockEntry** const src_end = src + m_entries_count;
+
+ BlockEntry** dst = entries;
+
+ while (src != src_end)
+ *dst++ = *src++;
+
+ delete[] m_entries;
+
+ m_entries = entries;
+ m_entries_size = entries_size;
+ }
+ }
+
+ if (id == libwebm::kMkvBlockGroup)
+ return CreateBlockGroup(pos, size, discard_padding);
+ else
+ return CreateSimpleBlock(pos, size);
+}
+
+long Cluster::CreateBlockGroup(long long start_offset, long long size,
+ long long discard_padding) {
+ assert(m_entries);
+ assert(m_entries_size > 0);
+ assert(m_entries_count >= 0);
+ assert(m_entries_count < m_entries_size);
+
+ IMkvReader* const pReader = m_pSegment->m_pReader;
+
+ long long pos = start_offset;
+ const long long stop = start_offset + size;
+
+ // For WebM files, there is a bias towards previous reference times
+ // (in order to support alt-ref frames, which refer back to the previous
+ // keyframe). Normally a 0 value is not possible, but here we tentatively
+ // allow 0 as the value of a reference frame, with the interpretation
+ // that this is a "previous" reference time.
+
+ long long prev = 1; // nonce
+ long long next = 0; // nonce
+ long long duration = -1; // really, this is unsigned
+
+ long long bpos = -1;
+ long long bsize = -1;
+
+ while (pos < stop) {
+ long len;
+ const long long id = ReadID(pReader, pos, len);
+ if (id < 0 || (pos + len) > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume ID
+
+ const long long size = ReadUInt(pReader, pos, len);
+ assert(size >= 0); // TODO
+ assert((pos + len) <= stop);
+
+ pos += len; // consume size
+
+ if (id == libwebm::kMkvBlock) {
+ if (bpos < 0) { // Block ID
+ bpos = pos;
+ bsize = size;
+ }
+ } else if (id == libwebm::kMkvBlockDuration) {
+ if (size > 8)
+ return E_FILE_FORMAT_INVALID;
+
+ duration = UnserializeUInt(pReader, pos, size);
+
+ if (duration < 0)
+ return E_FILE_FORMAT_INVALID;
+ } else if (id == libwebm::kMkvReferenceBlock) {
+ if (size > 8 || size <= 0)
+ return E_FILE_FORMAT_INVALID;
+ const long size_ = static_cast<long>(size);
+
+ long long time;
+
+ long status = UnserializeInt(pReader, pos, size_, time);
+ assert(status == 0);
+ if (status != 0)
+ return -1;
+
+ if (time <= 0) // see note above
+ prev = time;
+ else
+ next = time;
+ }
+
+ pos += size; // consume payload
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+ if (bpos < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ assert(bsize >= 0);
+
+ const long idx = m_entries_count;
+
+ BlockEntry** const ppEntry = m_entries + idx;
+ BlockEntry*& pEntry = *ppEntry;
+
+ pEntry = new (std::nothrow)
+ BlockGroup(this, idx, bpos, bsize, prev, next, duration, discard_padding);
+
+ if (pEntry == NULL)
+ return -1; // generic error
+
+ BlockGroup* const p = static_cast<BlockGroup*>(pEntry);
+
+ const long status = p->Parse();
+
+ if (status == 0) { // success
+ ++m_entries_count;
+ return 0;
+ }
+
+ delete pEntry;
+ pEntry = 0;
+
+ return status;
+}
+
+long Cluster::CreateSimpleBlock(long long st, long long sz) {
+ assert(m_entries);
+ assert(m_entries_size > 0);
+ assert(m_entries_count >= 0);
+ assert(m_entries_count < m_entries_size);
+
+ const long idx = m_entries_count;
+
+ BlockEntry** const ppEntry = m_entries + idx;
+ BlockEntry*& pEntry = *ppEntry;
+
+ pEntry = new (std::nothrow) SimpleBlock(this, idx, st, sz);
+
+ if (pEntry == NULL)
+ return -1; // generic error
+
+ SimpleBlock* const p = static_cast<SimpleBlock*>(pEntry);
+
+ const long status = p->Parse();
+
+ if (status == 0) {
+ ++m_entries_count;
+ return 0;
+ }
+
+ delete pEntry;
+ pEntry = 0;
+
+ return status;
+}
+
+long Cluster::GetFirst(const BlockEntry*& pFirst) const {
+ if (m_entries_count <= 0) {
+ long long pos;
+ long len;
+
+ const long status = Parse(pos, len);
+
+ if (status < 0) { // error
+ pFirst = NULL;
+ return status;
+ }
+
+ if (m_entries_count <= 0) { // empty cluster
+ pFirst = NULL;
+ return 0;
+ }
+ }
+
+ assert(m_entries);
+
+ pFirst = m_entries[0];
+ assert(pFirst);
+
+ return 0; // success
+}
+
+long Cluster::GetLast(const BlockEntry*& pLast) const {
+ for (;;) {
+ long long pos;
+ long len;
+
+ const long status = Parse(pos, len);
+
+ if (status < 0) { // error
+ pLast = NULL;
+ return status;
+ }
+
+ if (status > 0) // no new block
+ break;
+ }
+
+ if (m_entries_count <= 0) {
+ pLast = NULL;
+ return 0;
+ }
+
+ assert(m_entries);
+
+ const long idx = m_entries_count - 1;
+
+ pLast = m_entries[idx];
+ assert(pLast);
+
+ return 0;
+}
+
+long Cluster::GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const {
+ assert(pCurr);
+ assert(m_entries);
+ assert(m_entries_count > 0);
+
+ size_t idx = pCurr->GetIndex();
+ assert(idx < size_t(m_entries_count));
+ assert(m_entries[idx] == pCurr);
+
+ ++idx;
+
+ if (idx >= size_t(m_entries_count)) {
+ long long pos;
+ long len;
+
+ const long status = Parse(pos, len);
+
+ if (status < 0) { // error
+ pNext = NULL;
+ return status;
+ }
+
+ if (status > 0) {
+ pNext = NULL;
+ return 0;
+ }
+
+ assert(m_entries);
+ assert(m_entries_count > 0);
+ assert(idx < size_t(m_entries_count));
+ }
+
+ pNext = m_entries[idx];
+ assert(pNext);
+
+ return 0;
+}
+
+long Cluster::GetEntryCount() const { return m_entries_count; }
+
+const BlockEntry* Cluster::GetEntry(const Track* pTrack,
+ long long time_ns) const {
+ assert(pTrack);
+
+ if (m_pSegment == NULL) // this is the special EOS cluster
+ return pTrack->GetEOS();
+
+ const BlockEntry* pResult = pTrack->GetEOS();
+
+ long index = 0;
+
+ for (;;) {
+ if (index >= m_entries_count) {
+ long long pos;
+ long len;
+
+ const long status = Parse(pos, len);
+ assert(status >= 0);
+
+ if (status > 0) // completely parsed, and no more entries
+ return pResult;
+
+ if (status < 0) // should never happen
+ return 0;
+
+ assert(m_entries);
+ assert(index < m_entries_count);
+ }
+
+ const BlockEntry* const pEntry = m_entries[index];
+ assert(pEntry);
+ assert(!pEntry->EOS());
+
+ const Block* const pBlock = pEntry->GetBlock();
+ assert(pBlock);
+
+ if (pBlock->GetTrackNumber() != pTrack->GetNumber()) {
+ ++index;
+ continue;
+ }
+
+ if (pTrack->VetEntry(pEntry)) {
+ if (time_ns < 0) // just want first candidate block
+ return pEntry;
+
+ const long long ns = pBlock->GetTime(this);
+
+ if (ns > time_ns)
+ return pResult;
+
+ pResult = pEntry; // have a candidate
+ } else if (time_ns >= 0) {
+ const long long ns = pBlock->GetTime(this);
+
+ if (ns > time_ns)
+ return pResult;
+ }
+
+ ++index;
+ }
+}
+
+const BlockEntry* Cluster::GetEntry(const CuePoint& cp,
+ const CuePoint::TrackPosition& tp) const {
+ assert(m_pSegment);
+ const long long tc = cp.GetTimeCode();
+
+ if (tp.m_block > 0) {
+ const long block = static_cast<long>(tp.m_block);
+ const long index = block - 1;
+
+ while (index >= m_entries_count) {
+ long long pos;
+ long len;
+
+ const long status = Parse(pos, len);
+
+ if (status < 0) // TODO: can this happen?
+ return NULL;
+
+ if (status > 0) // nothing remains to be parsed
+ return NULL;
+ }
+
+ const BlockEntry* const pEntry = m_entries[index];
+ assert(pEntry);
+ assert(!pEntry->EOS());
+
+ const Block* const pBlock = pEntry->GetBlock();
+ assert(pBlock);
+
+ if ((pBlock->GetTrackNumber() == tp.m_track) &&
+ (pBlock->GetTimeCode(this) == tc)) {
+ return pEntry;
+ }
+ }
+
+ long index = 0;
+
+ for (;;) {
+ if (index >= m_entries_count) {
+ long long pos;
+ long len;
+
+ const long status = Parse(pos, len);
+
+ if (status < 0) // TODO: can this happen?
+ return NULL;
+
+ if (status > 0) // nothing remains to be parsed
+ return NULL;
+
+ assert(m_entries);
+ assert(index < m_entries_count);
+ }
+
+ const BlockEntry* const pEntry = m_entries[index];
+ assert(pEntry);
+ assert(!pEntry->EOS());
+
+ const Block* const pBlock = pEntry->GetBlock();
+ assert(pBlock);
+
+ if (pBlock->GetTrackNumber() != tp.m_track) {
+ ++index;
+ continue;
+ }
+
+ const long long tc_ = pBlock->GetTimeCode(this);
+
+ if (tc_ < tc) {
+ ++index;
+ continue;
+ }
+
+ if (tc_ > tc)
+ return NULL;
+
+ const Tracks* const pTracks = m_pSegment->GetTracks();
+ assert(pTracks);
+
+ const long tn = static_cast<long>(tp.m_track);
+ const Track* const pTrack = pTracks->GetTrackByNumber(tn);
+
+ if (pTrack == NULL)
+ return NULL;
+
+ const long long type = pTrack->GetType();
+
+ if (type == 2) // audio
+ return pEntry;
+
+ if (type != 1) // not video
+ return NULL;
+
+ if (!pBlock->IsKey())
+ return NULL;
+
+ return pEntry;
+ }
+}
+
+BlockEntry::BlockEntry(Cluster* p, long idx) : m_pCluster(p), m_index(idx) {}
+BlockEntry::~BlockEntry() {}
+const Cluster* BlockEntry::GetCluster() const { return m_pCluster; }
+long BlockEntry::GetIndex() const { return m_index; }
+
+SimpleBlock::SimpleBlock(Cluster* pCluster, long idx, long long start,
+ long long size)
+ : BlockEntry(pCluster, idx), m_block(start, size, 0) {}
+
+long SimpleBlock::Parse() { return m_block.Parse(m_pCluster); }
+BlockEntry::Kind SimpleBlock::GetKind() const { return kBlockSimple; }
+const Block* SimpleBlock::GetBlock() const { return &m_block; }
+
+BlockGroup::BlockGroup(Cluster* pCluster, long idx, long long block_start,
+ long long block_size, long long prev, long long next,
+ long long duration, long long discard_padding)
+ : BlockEntry(pCluster, idx),
+ m_block(block_start, block_size, discard_padding),
+ m_prev(prev),
+ m_next(next),
+ m_duration(duration) {}
+
+long BlockGroup::Parse() {
+ const long status = m_block.Parse(m_pCluster);
+
+ if (status)
+ return status;
+
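+ // A block inside a BlockGroup is treated as a keyframe only when no
+ // ReferenceBlock was seen: m_prev still holds its positive nonce and
+ // m_next was never set to a (positive) forward reference.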
+ m_block.SetKey((m_prev > 0) && (m_next <= 0));
+
+ return 0;
+}
+
+BlockEntry::Kind BlockGroup::GetKind() const { return kBlockGroup; }
+const Block* BlockGroup::GetBlock() const { return &m_block; }
+long long BlockGroup::GetPrevTimeCode() const { return m_prev; }
+long long BlockGroup::GetNextTimeCode() const { return m_next; }
+long long BlockGroup::GetDurationTimeCode() const { return m_duration; }
+
+Block::Block(long long start, long long size_, long long discard_padding)
+ : m_start(start),
+ m_size(size_),
+ m_track(0),
+ m_timecode(-1),
+ m_flags(0),
+ m_frames(NULL),
+ m_frame_count(-1),
+ m_discard_padding(discard_padding) {}
+
+Block::~Block() { delete[] m_frames; }
+
+long Block::Parse(const Cluster* pCluster) {
+ if (pCluster == NULL)
+ return -1;
+
+ if (pCluster->m_pSegment == NULL)
+ return -1;
+
+ assert(m_start >= 0);
+ assert(m_size >= 0);
+ assert(m_track <= 0);
+ assert(m_frames == NULL);
+ assert(m_frame_count <= 0);
+
+ long long pos = m_start;
+ const long long stop = m_start + m_size;
+
+ long len;
+
+ IMkvReader* const pReader = pCluster->m_pSegment->m_pReader;
+
+ m_track = ReadUInt(pReader, pos, len);
+
+ if (m_track <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume track number
+
+ if ((stop - pos) < 2)
+ return E_FILE_FORMAT_INVALID;
+
+ long status;
+ long long value;
+
+ status = UnserializeInt(pReader, pos, 2, value);
+
+ if (status)
+ return E_FILE_FORMAT_INVALID;
+
+ if (value < SHRT_MIN)
+ return E_FILE_FORMAT_INVALID;
+
+ if (value > SHRT_MAX)
+ return E_FILE_FORMAT_INVALID;
+
+ m_timecode = static_cast<short>(value);
+
+ pos += 2;
+
+ if ((stop - pos) <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ status = pReader->Read(pos, 1, &m_flags);
+
+ if (status)
+ return E_FILE_FORMAT_INVALID;
+
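+ // Bits 1-2 of the flags byte select the lacing mode: 0 = none, 1 = Xiph,
+ // 2 = fixed-size, 3 = EBML (see Block::Lacing in mkvparser.h).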
+ const int lacing = int(m_flags & 0x06) >> 1;
+
+ ++pos; // consume flags byte
+
+ if (lacing == 0) { // no lacing
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ m_frame_count = 1;
+ m_frames = new (std::nothrow) Frame[m_frame_count];
+ if (m_frames == NULL)
+ return -1;
+
+ Frame& f = m_frames[0];
+ f.pos = pos;
+
+ const long long frame_size = stop - pos;
+
+ if (frame_size > LONG_MAX || frame_size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ f.len = static_cast<long>(frame_size);
+
+ return 0; // success
+ }
+
+ if (pos >= stop)
+ return E_FILE_FORMAT_INVALID;
+
+ unsigned char biased_count;
+
+ status = pReader->Read(pos, 1, &biased_count);
+
+ if (status)
+ return E_FILE_FORMAT_INVALID;
+
+ ++pos; // consume frame count
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ m_frame_count = int(biased_count) + 1;
+
+ m_frames = new (std::nothrow) Frame[m_frame_count];
+ if (m_frames == NULL)
+ return -1;
+
+ if (!m_frames)
+ return E_FILE_FORMAT_INVALID;
+
+ if (lacing == 1) { // Xiph
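+ // Xiph lacing codes each frame size (except the last) as a run of 0xFF
+ // bytes plus a final byte < 255 (e.g. 300 is coded as 255, 45); the last
+ // frame's size is whatever remains of the block payload.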
+ Frame* pf = m_frames;
+ Frame* const pf_end = pf + m_frame_count;
+
+ long long size = 0;
+ int frame_count = m_frame_count;
+
+ while (frame_count > 1) {
+ long frame_size = 0;
+
+ for (;;) {
+ unsigned char val;
+
+ if (pos >= stop)
+ return E_FILE_FORMAT_INVALID;
+
+ status = pReader->Read(pos, 1, &val);
+
+ if (status)
+ return E_FILE_FORMAT_INVALID;
+
+ ++pos; // consume xiph size byte
+
+ frame_size += val;
+
+ if (val < 255)
+ break;
+ }
+
+ Frame& f = *pf++;
+ assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
+
+ f.pos = 0; // patch later
+
+ if (frame_size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ f.len = frame_size;
+ size += frame_size; // contribution of this frame
+
+ --frame_count;
+ }
+
+ if (pf >= pf_end || pos > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ {
+ Frame& f = *pf++;
+
+ if (pf != pf_end)
+ return E_FILE_FORMAT_INVALID;
+
+ f.pos = 0; // patch later
+
+ const long long total_size = stop - pos;
+
+ if (total_size < size)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long frame_size = total_size - size;
+
+ if (frame_size > LONG_MAX || frame_size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ f.len = static_cast<long>(frame_size);
+ }
+
+ pf = m_frames;
+ while (pf != pf_end) {
+ Frame& f = *pf++;
+ assert((pos + f.len) <= stop);
+
+ if ((pos + f.len) > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ f.pos = pos;
+ pos += f.len;
+ }
+
+ assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ } else if (lacing == 2) { // fixed-size lacing
+ if (pos >= stop)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long total_size = stop - pos;
+
+ if ((total_size % m_frame_count) != 0)
+ return E_FILE_FORMAT_INVALID;
+
+ const long long frame_size = total_size / m_frame_count;
+
+ if (frame_size > LONG_MAX || frame_size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ Frame* pf = m_frames;
+ Frame* const pf_end = pf + m_frame_count;
+
+ while (pf != pf_end) {
+ assert((pos + frame_size) <= stop);
+ if ((pos + frame_size) > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ Frame& f = *pf++;
+
+ f.pos = pos;
+ f.len = static_cast<long>(frame_size);
+
+ pos += frame_size;
+ }
+
+ assert(pos == stop);
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+
+ } else {
+ assert(lacing == 3); // EBML lacing
+
+ if (pos >= stop)
+ return E_FILE_FORMAT_INVALID;
+
+ long long size = 0;
+ int frame_count = m_frame_count;
+
+ long long frame_size = ReadUInt(pReader, pos, len);
+
+ if (frame_size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (frame_size > LONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume length of size of first frame
+
+ if ((pos + frame_size) > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ Frame* pf = m_frames;
+ Frame* const pf_end = pf + m_frame_count;
+
+ {
+ Frame& curr = *pf;
+
+ curr.pos = 0; // patch later
+
+ curr.len = static_cast<long>(frame_size);
+ size += curr.len; // contribution of this frame
+ }
+
+ --frame_count;
+
+ while (frame_count > 1) {
+ if (pos >= stop)
+ return E_FILE_FORMAT_INVALID;
+
+ assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
+
+ const Frame& prev = *pf++;
+ assert(prev.len == frame_size);
+ if (prev.len != frame_size)
+ return E_FILE_FORMAT_INVALID;
+
+ assert(pf < pf_end);
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
+
+ Frame& curr = *pf;
+
+ curr.pos = 0; // patch later
+
+ const long long delta_size_ = ReadUInt(pReader, pos, len);
+
+ if (delta_size_ < 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if ((pos + len) > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ pos += len; // consume length of (delta) size
+ if (pos > stop)
+ return E_FILE_FORMAT_INVALID;
+
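+ // After the first frame, EBML lacing stores each size as a signed delta
+ // encoded as an unsigned varint with a bias of 2^(7*len - 1) - 1; e.g.
+ // for a 1-byte field the bias is 63, so a decoded value of 60 means -3.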
+ const long exp = 7 * len - 1;
+ const long long bias = (1LL << exp) - 1LL;
+ const long long delta_size = delta_size_ - bias;
+
+ frame_size += delta_size;
+
+ if (frame_size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ if (frame_size > LONG_MAX)
+ return E_FILE_FORMAT_INVALID;
+
+ curr.len = static_cast<long>(frame_size);
+ // Check if size + curr.len could overflow.
+ if (size > LLONG_MAX - curr.len) {
+ return E_FILE_FORMAT_INVALID;
+ }
+ size += curr.len; // contribution of this frame
+
+ --frame_count;
+ }
+
+ // parse last frame
+ if (frame_count > 0) {
+ if (pos > stop || pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
+
+ const Frame& prev = *pf++;
+ assert(prev.len == frame_size);
+ if (prev.len != frame_size)
+ return E_FILE_FORMAT_INVALID;
+
+ if (pf >= pf_end)
+ return E_FILE_FORMAT_INVALID;
+
+ Frame& curr = *pf++;
+ if (pf != pf_end)
+ return E_FILE_FORMAT_INVALID;
+
+ curr.pos = 0; // patch later
+
+ const long long total_size = stop - pos;
+
+ if (total_size < size)
+ return E_FILE_FORMAT_INVALID;
+
+ frame_size = total_size - size;
+
+ if (frame_size > LONG_MAX || frame_size <= 0)
+ return E_FILE_FORMAT_INVALID;
+
+ curr.len = static_cast<long>(frame_size);
+ }
+
+ pf = m_frames;
+ while (pf != pf_end) {
+ Frame& f = *pf++;
+ if ((pos + f.len) > stop)
+ return E_FILE_FORMAT_INVALID;
+
+ f.pos = pos;
+ pos += f.len;
+ }
+
+ if (pos != stop)
+ return E_FILE_FORMAT_INVALID;
+ }
+
+ return 0; // success
+}
+
+long long Block::GetTimeCode(const Cluster* pCluster) const {
+ if (pCluster == 0)
+ return m_timecode;
+
+ const long long tc0 = pCluster->GetTimeCode();
+ assert(tc0 >= 0);
+
+ // Check if tc0 + m_timecode would overflow.
+ if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) {
+ return -1;
+ }
+
+ const long long tc = tc0 + m_timecode;
+
+ return tc; // unscaled timecode units
+}
+
+long long Block::GetTime(const Cluster* pCluster) const {
+ assert(pCluster);
+
+ const long long tc = GetTimeCode(pCluster);
+
+ const Segment* const pSegment = pCluster->m_pSegment;
+ const SegmentInfo* const pInfo = pSegment->GetInfo();
+ assert(pInfo);
+
+ const long long scale = pInfo->GetTimeCodeScale();
+ assert(scale >= 1);
+
+ // Check if tc * scale could overflow.
+ if (tc != 0 && scale > LLONG_MAX / tc) {
+ return -1;
+ }
+ const long long ns = tc * scale;
+
+ return ns;
+}
+
+long long Block::GetTrackNumber() const { return m_track; }
+
+bool Block::IsKey() const {
+ return ((m_flags & static_cast<unsigned char>(1 << 7)) != 0);
+}
+
+void Block::SetKey(bool bKey) {
+ if (bKey)
+ m_flags |= static_cast<unsigned char>(1 << 7);
+ else
+ m_flags &= 0x7F;
+}
+
+bool Block::IsInvisible() const { return bool(int(m_flags & 0x08) != 0); }
+
+Block::Lacing Block::GetLacing() const {
+ const int value = int(m_flags & 0x06) >> 1;
+ return static_cast<Lacing>(value);
+}
+
+int Block::GetFrameCount() const { return m_frame_count; }
+
+const Block::Frame& Block::GetFrame(int idx) const {
+ assert(idx >= 0);
+ assert(idx < m_frame_count);
+
+ const Frame& f = m_frames[idx];
+ assert(f.pos > 0);
+ assert(f.len > 0);
+
+ return f;
+}
+
+long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const {
+ assert(pReader);
+ assert(buf);
+
+ const long status = pReader->Read(pos, len, buf);
+ return status;
+}
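+
+// A minimal caller-side sketch (illustrative only; pReader is assumed to be
+// the same IMkvReader the segment was parsed from): copy one frame's payload
+// out of the stream.
+//
+//   const Block::Frame& f = pBlock->GetFrame(0);
+//   unsigned char* const data = new (std::nothrow) unsigned char[f.len];
+//   if (data != NULL && f.Read(pReader, data) == 0) {
+//     // data now holds f.len bytes of compressed frame payload
+//   }
+//   delete[] data;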
+
+long long Block::GetDiscardPadding() const { return m_discard_padding; }
+
+} // namespace mkvparser
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvparser.h b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.h
new file mode 100644
index 000000000..26c2b7e5e
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvparser.h
@@ -0,0 +1,1145 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVPARSER_MKVPARSER_H_
+#define MKVPARSER_MKVPARSER_H_
+
+#include <cstddef>
+
+namespace mkvparser {
+
+const int E_PARSE_FAILED = -1;
+const int E_FILE_FORMAT_INVALID = -2;
+const int E_BUFFER_NOT_FULL = -3;
+
+class IMkvReader {
+ public:
+ virtual int Read(long long pos, long len, unsigned char* buf) = 0;
+ virtual int Length(long long* total, long long* available) = 0;
+
+ protected:
+ virtual ~IMkvReader();
+};
+
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+ unsigned long long element_size);
+long long GetUIntLength(IMkvReader*, long long, long&);
+long long ReadUInt(IMkvReader*, long long, long&);
+long long ReadID(IMkvReader* pReader, long long pos, long& len);
+long long UnserializeUInt(IMkvReader*, long long pos, long long size);
+
+long UnserializeFloat(IMkvReader*, long long pos, long long size, double&);
+long UnserializeInt(IMkvReader*, long long pos, long long size,
+ long long& result);
+
+long UnserializeString(IMkvReader*, long long pos, long long size, char*& str);
+
+long ParseElementHeader(IMkvReader* pReader,
+ long long& pos, // consume id and size fields
+ long long stop, // if you know size of element's parent
+ long long& id, long long& size);
+
+bool Match(IMkvReader*, long long&, unsigned long, long long&);
+bool Match(IMkvReader*, long long&, unsigned long, unsigned char*&, size_t&);
+
+void GetVersion(int& major, int& minor, int& build, int& revision);
+
+struct EBMLHeader {
+ EBMLHeader();
+ ~EBMLHeader();
+ long long m_version;
+ long long m_readVersion;
+ long long m_maxIdLength;
+ long long m_maxSizeLength;
+ char* m_docType;
+ long long m_docTypeVersion;
+ long long m_docTypeReadVersion;
+
+ long long Parse(IMkvReader*, long long&);
+ void Init();
+};
+
+class Segment;
+class Track;
+class Cluster;
+
+class Block {
+ Block(const Block&);
+ Block& operator=(const Block&);
+
+ public:
+ const long long m_start;
+ const long long m_size;
+
+ Block(long long start, long long size, long long discard_padding);
+ ~Block();
+
+ long Parse(const Cluster*);
+
+ long long GetTrackNumber() const;
+ long long GetTimeCode(const Cluster*) const; // absolute, but not scaled
+ long long GetTime(const Cluster*) const; // absolute, and scaled (ns)
+ bool IsKey() const;
+ void SetKey(bool);
+ bool IsInvisible() const;
+
+ enum Lacing { kLacingNone, kLacingXiph, kLacingFixed, kLacingEbml };
+ Lacing GetLacing() const;
+
+ int GetFrameCount() const; // to index frames: [0, count)
+
+ struct Frame {
+ long long pos; // absolute offset
+ long len;
+
+ long Read(IMkvReader*, unsigned char*) const;
+ };
+
+ const Frame& GetFrame(int frame_index) const;
+
+ long long GetDiscardPadding() const;
+
+ private:
+ long long m_track; // Track::Number()
+ short m_timecode; // relative to cluster
+ unsigned char m_flags;
+
+ Frame* m_frames;
+ int m_frame_count;
+
+ protected:
+ const long long m_discard_padding;
+};
+
+class BlockEntry {
+ BlockEntry(const BlockEntry&);
+ BlockEntry& operator=(const BlockEntry&);
+
+ protected:
+ BlockEntry(Cluster*, long index);
+
+ public:
+ virtual ~BlockEntry();
+
+ bool EOS() const { return (GetKind() == kBlockEOS); }
+ const Cluster* GetCluster() const;
+ long GetIndex() const;
+ virtual const Block* GetBlock() const = 0;
+
+ enum Kind { kBlockEOS, kBlockSimple, kBlockGroup };
+ virtual Kind GetKind() const = 0;
+
+ protected:
+ Cluster* const m_pCluster;
+ const long m_index;
+};
+
+class SimpleBlock : public BlockEntry {
+ SimpleBlock(const SimpleBlock&);
+ SimpleBlock& operator=(const SimpleBlock&);
+
+ public:
+ SimpleBlock(Cluster*, long index, long long start, long long size);
+ long Parse();
+
+ Kind GetKind() const;
+ const Block* GetBlock() const;
+
+ protected:
+ Block m_block;
+};
+
+class BlockGroup : public BlockEntry {
+ BlockGroup(const BlockGroup&);
+ BlockGroup& operator=(const BlockGroup&);
+
+ public:
+ BlockGroup(Cluster*, long index,
+ long long block_start, // absolute pos of block's payload
+ long long block_size, // size of block's payload
+ long long prev, long long next, long long duration,
+ long long discard_padding);
+
+ long Parse();
+
+ Kind GetKind() const;
+ const Block* GetBlock() const;
+
+ long long GetPrevTimeCode() const; // relative to block's time
+ long long GetNextTimeCode() const; // as above
+ long long GetDurationTimeCode() const;
+
+ private:
+ Block m_block;
+ const long long m_prev;
+ const long long m_next;
+ const long long m_duration;
+};
+
+///////////////////////////////////////////////////////////////
+// ContentEncoding element
+// Elements used to describe if the track data has been encrypted or
+// compressed with zlib or header stripping.
+class ContentEncoding {
+ public:
+ enum { kCTR = 1 };
+
+ ContentEncoding();
+ ~ContentEncoding();
+
+ // ContentCompression element names
+ struct ContentCompression {
+ ContentCompression();
+ ~ContentCompression();
+
+ unsigned long long algo;
+ unsigned char* settings;
+ long long settings_len;
+ };
+
+ // ContentEncAESSettings element names
+ struct ContentEncAESSettings {
+ ContentEncAESSettings() : cipher_mode(kCTR) {}
+ ~ContentEncAESSettings() {}
+
+ unsigned long long cipher_mode;
+ };
+
+ // ContentEncryption element names
+ struct ContentEncryption {
+ ContentEncryption();
+ ~ContentEncryption();
+
+ unsigned long long algo;
+ unsigned char* key_id;
+ long long key_id_len;
+ unsigned char* signature;
+ long long signature_len;
+ unsigned char* sig_key_id;
+ long long sig_key_id_len;
+ unsigned long long sig_algo;
+ unsigned long long sig_hash_algo;
+
+ ContentEncAESSettings aes_settings;
+ };
+
+ // Returns ContentCompression represented by |idx|. Returns NULL if |idx|
+ // is out of bounds.
+ const ContentCompression* GetCompressionByIndex(unsigned long idx) const;
+
+ // Returns number of ContentCompression elements in this ContentEncoding
+ // element.
+ unsigned long GetCompressionCount() const;
+
+ // Parses the ContentCompression element from |pReader|. |start| is the
+ // starting offset of the ContentCompression payload. |size| is the size in
+ // bytes of the ContentCompression payload. |compression| is where the parsed
+ // values will be stored.
+ long ParseCompressionEntry(long long start, long long size,
+ IMkvReader* pReader,
+ ContentCompression* compression);
+
+ // Returns ContentEncryption represented by |idx|. Returns NULL if |idx|
+ // is out of bounds.
+ const ContentEncryption* GetEncryptionByIndex(unsigned long idx) const;
+
+ // Returns number of ContentEncryption elements in this ContentEncoding
+ // element.
+ unsigned long GetEncryptionCount() const;
+
+ // Parses the ContentEncAESSettings element from |pReader|. |start| is the
+ // starting offset of the ContentEncAESSettings payload. |size| is the
+ // size in bytes of the ContentEncAESSettings payload. |encryption| is
+ // where the parsed values will be stored.
+ long ParseContentEncAESSettingsEntry(long long start, long long size,
+ IMkvReader* pReader,
+ ContentEncAESSettings* aes);
+
+ // Parses the ContentEncoding element from |pReader|. |start| is the
+ // starting offset of the ContentEncoding payload. |size| is the size in
+ // bytes of the ContentEncoding payload. Returns 0 on success.
+ long ParseContentEncodingEntry(long long start, long long size,
+ IMkvReader* pReader);
+
+ // Parses the ContentEncryption element from |pReader|. |start| is the
+ // starting offset of the ContentEncryption payload. |size| is the size in
+ // bytes of the ContentEncryption payload. |encryption| is where the parsed
+ // values will be stored.
+ long ParseEncryptionEntry(long long start, long long size,
+ IMkvReader* pReader, ContentEncryption* encryption);
+
+ unsigned long long encoding_order() const { return encoding_order_; }
+ unsigned long long encoding_scope() const { return encoding_scope_; }
+ unsigned long long encoding_type() const { return encoding_type_; }
+
+ private:
+ // Member variables for list of ContentCompression elements.
+ ContentCompression** compression_entries_;
+ ContentCompression** compression_entries_end_;
+
+ // Member variables for list of ContentEncryption elements.
+ ContentEncryption** encryption_entries_;
+ ContentEncryption** encryption_entries_end_;
+
+ // ContentEncoding element names
+ unsigned long long encoding_order_;
+ unsigned long long encoding_scope_;
+ unsigned long long encoding_type_;
+
+ // LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding);
+ ContentEncoding(const ContentEncoding&);
+ ContentEncoding& operator=(const ContentEncoding&);
+};
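
A hedged sketch of how these accessors fit together once a track has been parsed; track is assumed to come from Tracks::GetTrackByIndex(), and only the first ContentEncoding is inspected:

// Reports the AES cipher mode of a track's first ContentEncoding, if any.
void InspectEncryption(const mkvparser::Track* track) {
  if (track == NULL || track->GetContentEncodingCount() == 0)
    return;
  const mkvparser::ContentEncoding* const enc =
      track->GetContentEncodingByIndex(0);
  for (unsigned long i = 0; i < enc->GetEncryptionCount(); ++i) {
    const mkvparser::ContentEncoding::ContentEncryption* const e =
        enc->GetEncryptionByIndex(i);
    // key_id/key_id_len identify the key; kCTR (1) is the only cipher mode
    // defined by this header.
    const unsigned long long mode = e->aes_settings.cipher_mode;
    (void)mode;
  }
}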
+
+class Track {
+ Track(const Track&);
+ Track& operator=(const Track&);
+
+ public:
+ class Info;
+ static long Create(Segment*, const Info&, long long element_start,
+ long long element_size, Track*&);
+
+ enum Type { kVideo = 1, kAudio = 2, kSubtitle = 0x11, kMetadata = 0x21 };
+
+ Segment* const m_pSegment;
+ const long long m_element_start;
+ const long long m_element_size;
+ virtual ~Track();
+
+ long GetType() const;
+ long GetNumber() const;
+ unsigned long long GetUid() const;
+ const char* GetNameAsUTF8() const;
+ const char* GetLanguage() const;
+ const char* GetCodecNameAsUTF8() const;
+ const char* GetCodecId() const;
+ const unsigned char* GetCodecPrivate(size_t&) const;
+ bool GetLacing() const;
+ unsigned long long GetDefaultDuration() const;
+ unsigned long long GetCodecDelay() const;
+ unsigned long long GetSeekPreRoll() const;
+
+ const BlockEntry* GetEOS() const;
+
+ struct Settings {
+ long long start;
+ long long size;
+ };
+
+ class Info {
+ public:
+ Info();
+ ~Info();
+ int Copy(Info&) const;
+ void Clear();
+ long type;
+ long number;
+ unsigned long long uid;
+ unsigned long long defaultDuration;
+ unsigned long long codecDelay;
+ unsigned long long seekPreRoll;
+ char* nameAsUTF8;
+ char* language;
+ char* codecId;
+ char* codecNameAsUTF8;
+ unsigned char* codecPrivate;
+ size_t codecPrivateSize;
+ bool lacing;
+ Settings settings;
+
+ private:
+ Info(const Info&);
+ Info& operator=(const Info&);
+ int CopyStr(char* Info::*str, Info&) const;
+ };
+
+ long GetFirst(const BlockEntry*&) const;
+ long GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const;
+ virtual bool VetEntry(const BlockEntry*) const;
+ virtual long Seek(long long time_ns, const BlockEntry*&) const;
+
+ const ContentEncoding* GetContentEncodingByIndex(unsigned long idx) const;
+ unsigned long GetContentEncodingCount() const;
+
+ long ParseContentEncodingsEntry(long long start, long long size);
+
+ protected:
+ Track(Segment*, long long element_start, long long element_size);
+
+ Info m_info;
+
+ class EOSBlock : public BlockEntry {
+ public:
+ EOSBlock();
+
+ Kind GetKind() const;
+ const Block* GetBlock() const;
+ };
+
+ EOSBlock m_eos;
+
+ private:
+ ContentEncoding** content_encoding_entries_;
+ ContentEncoding** content_encoding_entries_end_;
+};
+
+struct PrimaryChromaticity {
+ PrimaryChromaticity() : x(0), y(0) {}
+ ~PrimaryChromaticity() {}
+ static bool Parse(IMkvReader* reader, long long read_pos,
+ long long value_size, bool is_x,
+ PrimaryChromaticity** chromaticity);
+ float x;
+ float y;
+};
+
+struct MasteringMetadata {
+ static const float kValueNotPresent;
+
+ MasteringMetadata()
+ : r(NULL),
+ g(NULL),
+ b(NULL),
+ white_point(NULL),
+ luminance_max(kValueNotPresent),
+ luminance_min(kValueNotPresent) {}
+ ~MasteringMetadata() {
+ delete r;
+ delete g;
+ delete b;
+ delete white_point;
+ }
+
+ static bool Parse(IMkvReader* reader, long long element_start,
+ long long element_size,
+ MasteringMetadata** mastering_metadata);
+
+ PrimaryChromaticity* r;
+ PrimaryChromaticity* g;
+ PrimaryChromaticity* b;
+ PrimaryChromaticity* white_point;
+ float luminance_max;
+ float luminance_min;
+};
+
+struct Colour {
+ static const long long kValueNotPresent;
+
+ // Unless otherwise noted all values assigned upon construction are the
+ // equivalent of unspecified/default.
+ Colour()
+ : matrix_coefficients(kValueNotPresent),
+ bits_per_channel(kValueNotPresent),
+ chroma_subsampling_horz(kValueNotPresent),
+ chroma_subsampling_vert(kValueNotPresent),
+ cb_subsampling_horz(kValueNotPresent),
+ cb_subsampling_vert(kValueNotPresent),
+ chroma_siting_horz(kValueNotPresent),
+ chroma_siting_vert(kValueNotPresent),
+ range(kValueNotPresent),
+ transfer_characteristics(kValueNotPresent),
+ primaries(kValueNotPresent),
+ max_cll(kValueNotPresent),
+ max_fall(kValueNotPresent),
+ mastering_metadata(NULL) {}
+ ~Colour() {
+ delete mastering_metadata;
+ mastering_metadata = NULL;
+ }
+
+ static bool Parse(IMkvReader* reader, long long element_start,
+ long long element_size, Colour** colour);
+
+ long long matrix_coefficients;
+ long long bits_per_channel;
+ long long chroma_subsampling_horz;
+ long long chroma_subsampling_vert;
+ long long cb_subsampling_horz;
+ long long cb_subsampling_vert;
+ long long chroma_siting_horz;
+ long long chroma_siting_vert;
+ long long range;
+ long long transfer_characteristics;
+ long long primaries;
+ long long max_cll;
+ long long max_fall;
+
+ MasteringMetadata* mastering_metadata;
+};
+
+struct Projection {
+ enum ProjectionType {
+ kTypeNotPresent = -1,
+ kRectangular = 0,
+ kEquirectangular = 1,
+ kCubeMap = 2,
+ kMesh = 3,
+ };
+ static const float kValueNotPresent;
+ Projection()
+ : type(kTypeNotPresent),
+ private_data(NULL),
+ private_data_length(0),
+ pose_yaw(kValueNotPresent),
+ pose_pitch(kValueNotPresent),
+ pose_roll(kValueNotPresent) {}
+ ~Projection() { delete[] private_data; }
+ static bool Parse(IMkvReader* reader, long long element_start,
+ long long element_size, Projection** projection);
+
+ ProjectionType type;
+ unsigned char* private_data;
+ size_t private_data_length;
+ float pose_yaw;
+ float pose_pitch;
+ float pose_roll;
+};
+
+class VideoTrack : public Track {
+ VideoTrack(const VideoTrack&);
+ VideoTrack& operator=(const VideoTrack&);
+
+ VideoTrack(Segment*, long long element_start, long long element_size);
+
+ public:
+ virtual ~VideoTrack();
+ static long Parse(Segment*, const Info&, long long element_start,
+ long long element_size, VideoTrack*&);
+
+ long long GetWidth() const;
+ long long GetHeight() const;
+ long long GetDisplayWidth() const;
+ long long GetDisplayHeight() const;
+ long long GetDisplayUnit() const;
+ long long GetStereoMode() const;
+ double GetFrameRate() const;
+
+ bool VetEntry(const BlockEntry*) const;
+ long Seek(long long time_ns, const BlockEntry*&) const;
+
+ Colour* GetColour() const;
+
+ Projection* GetProjection() const;
+
+ private:
+ long long m_width;
+ long long m_height;
+ long long m_display_width;
+ long long m_display_height;
+ long long m_display_unit;
+ long long m_stereo_mode;
+
+ double m_rate;
+
+ Colour* m_colour;
+ Projection* m_projection;
+};
+
+class AudioTrack : public Track {
+ AudioTrack(const AudioTrack&);
+ AudioTrack& operator=(const AudioTrack&);
+
+ AudioTrack(Segment*, long long element_start, long long element_size);
+
+ public:
+ static long Parse(Segment*, const Info&, long long element_start,
+ long long element_size, AudioTrack*&);
+
+ double GetSamplingRate() const;
+ long long GetChannels() const;
+ long long GetBitDepth() const;
+
+ private:
+ double m_rate;
+ long long m_channels;
+ long long m_bitDepth;
+};
+
+class Tracks {
+ Tracks(const Tracks&);
+ Tracks& operator=(const Tracks&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ Tracks(Segment*, long long start, long long size, long long element_start,
+ long long element_size);
+
+ ~Tracks();
+
+ long Parse();
+
+ unsigned long GetTracksCount() const;
+
+ const Track* GetTrackByNumber(long tn) const;
+ const Track* GetTrackByIndex(unsigned long idx) const;
+
+ private:
+ Track** m_trackEntries;
+ Track** m_trackEntriesEnd;
+
+ long ParseTrackEntry(long long payload_start, long long payload_size,
+ long long element_start, long long element_size,
+ Track*&) const;
+};
+
+class Chapters {
+ Chapters(const Chapters&);
+ Chapters& operator=(const Chapters&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ Chapters(Segment*, long long payload_start, long long payload_size,
+ long long element_start, long long element_size);
+
+ ~Chapters();
+
+ long Parse();
+
+ class Atom;
+ class Edition;
+
+ class Display {
+ friend class Atom;
+ Display();
+ Display(const Display&);
+ ~Display();
+ Display& operator=(const Display&);
+
+ public:
+ const char* GetString() const;
+ const char* GetLanguage() const;
+ const char* GetCountry() const;
+
+ private:
+ void Init();
+ void ShallowCopy(Display&) const;
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+
+ char* m_string;
+ char* m_language;
+ char* m_country;
+ };
+
+ class Atom {
+ friend class Edition;
+ Atom();
+ Atom(const Atom&);
+ ~Atom();
+ Atom& operator=(const Atom&);
+
+ public:
+ unsigned long long GetUID() const;
+ const char* GetStringUID() const;
+
+ long long GetStartTimecode() const;
+ long long GetStopTimecode() const;
+
+ long long GetStartTime(const Chapters*) const;
+ long long GetStopTime(const Chapters*) const;
+
+ int GetDisplayCount() const;
+ const Display* GetDisplay(int index) const;
+
+ private:
+ void Init();
+ void ShallowCopy(Atom&) const;
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+ static long long GetTime(const Chapters*, long long timecode);
+
+ long ParseDisplay(IMkvReader*, long long pos, long long size);
+ bool ExpandDisplaysArray();
+
+ char* m_string_uid;
+ unsigned long long m_uid;
+ long long m_start_timecode;
+ long long m_stop_timecode;
+
+ Display* m_displays;
+ int m_displays_size;
+ int m_displays_count;
+ };
+
+ class Edition {
+ friend class Chapters;
+ Edition();
+ Edition(const Edition&);
+ ~Edition();
+ Edition& operator=(const Edition&);
+
+ public:
+ int GetAtomCount() const;
+ const Atom* GetAtom(int index) const;
+
+ private:
+ void Init();
+ void ShallowCopy(Edition&) const;
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+
+ long ParseAtom(IMkvReader*, long long pos, long long size);
+ bool ExpandAtomsArray();
+
+ Atom* m_atoms;
+ int m_atoms_size;
+ int m_atoms_count;
+ };
+
+ int GetEditionCount() const;
+ const Edition* GetEdition(int index) const;
+
+ private:
+ long ParseEdition(long long pos, long long size);
+ bool ExpandEditionsArray();
+
+ Edition* m_editions;
+ int m_editions_size;
+ int m_editions_count;
+};
+
+class Tags {
+ Tags(const Tags&);
+ Tags& operator=(const Tags&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ Tags(Segment*, long long payload_start, long long payload_size,
+ long long element_start, long long element_size);
+
+ ~Tags();
+
+ long Parse();
+
+ class Tag;
+ class SimpleTag;
+
+ class SimpleTag {
+ friend class Tag;
+ SimpleTag();
+ SimpleTag(const SimpleTag&);
+ ~SimpleTag();
+ SimpleTag& operator=(const SimpleTag&);
+
+ public:
+ const char* GetTagName() const;
+ const char* GetTagString() const;
+
+ private:
+ void Init();
+ void ShallowCopy(SimpleTag&) const;
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+
+ char* m_tag_name;
+ char* m_tag_string;
+ };
+
+ class Tag {
+ friend class Tags;
+ Tag();
+ Tag(const Tag&);
+ ~Tag();
+ Tag& operator=(const Tag&);
+
+ public:
+ int GetSimpleTagCount() const;
+ const SimpleTag* GetSimpleTag(int index) const;
+
+ private:
+ void Init();
+ void ShallowCopy(Tag&) const;
+ void Clear();
+ long Parse(IMkvReader*, long long pos, long long size);
+
+ long ParseSimpleTag(IMkvReader*, long long pos, long long size);
+ bool ExpandSimpleTagsArray();
+
+ SimpleTag* m_simple_tags;
+ int m_simple_tags_size;
+ int m_simple_tags_count;
+ };
+
+ int GetTagCount() const;
+ const Tag* GetTag(int index) const;
+
+ private:
+ long ParseTag(long long pos, long long size);
+ bool ExpandTagsArray();
+
+ Tag* m_tags;
+ int m_tags_size;
+ int m_tags_count;
+};
+
+class SegmentInfo {
+ SegmentInfo(const SegmentInfo&);
+ SegmentInfo& operator=(const SegmentInfo&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ SegmentInfo(Segment*, long long start, long long size,
+ long long element_start, long long element_size);
+
+ ~SegmentInfo();
+
+ long Parse();
+
+ long long GetTimeCodeScale() const;
+ long long GetDuration() const; // scaled
+ const char* GetMuxingAppAsUTF8() const;
+ const char* GetWritingAppAsUTF8() const;
+ const char* GetTitleAsUTF8() const;
+
+ private:
+ long long m_timecodeScale;
+ double m_duration;
+ char* m_pMuxingAppAsUTF8;
+ char* m_pWritingAppAsUTF8;
+ char* m_pTitleAsUTF8;
+};
+
+class SeekHead {
+ SeekHead(const SeekHead&);
+ SeekHead& operator=(const SeekHead&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ SeekHead(Segment*, long long start, long long size, long long element_start,
+ long long element_size);
+
+ ~SeekHead();
+
+ long Parse();
+
+ struct Entry {
+ Entry();
+
+ // the SeekHead entry payload
+ long long id;
+ long long pos;
+
+ // absolute pos of SeekEntry ID
+ long long element_start;
+
+ // SeekEntry ID size + size size + payload
+ long long element_size;
+ };
+
+ int GetCount() const;
+ const Entry* GetEntry(int idx) const;
+
+ struct VoidElement {
+ // absolute pos of Void ID
+ long long element_start;
+
+ // ID size + size size + payload size
+ long long element_size;
+ };
+
+ int GetVoidElementCount() const;
+ const VoidElement* GetVoidElement(int idx) const;
+
+ private:
+ Entry* m_entries;
+ int m_entry_count;
+
+ VoidElement* m_void_elements;
+ int m_void_element_count;
+
+ static bool ParseEntry(IMkvReader*,
+ long long pos, // payload
+ long long size, Entry*);
+};
+
+class Cues;
+class CuePoint {
+ friend class Cues;
+
+ CuePoint(long, long long);
+ ~CuePoint();
+
+ CuePoint(const CuePoint&);
+ CuePoint& operator=(const CuePoint&);
+
+ public:
+ long long m_element_start;
+ long long m_element_size;
+
+ bool Load(IMkvReader*);
+
+ long long GetTimeCode() const; // absolute but unscaled
+ long long GetTime(const Segment*) const; // absolute and scaled (ns units)
+
+ struct TrackPosition {
+ long long m_track;
+ long long m_pos; // of cluster
+ long long m_block;
+ // codec_state //defaults to 0
+ // reference = clusters containing req'd referenced blocks
+ // reftime = timecode of the referenced block
+
+ bool Parse(IMkvReader*, long long, long long);
+ };
+
+ const TrackPosition* Find(const Track*) const;
+
+ private:
+ const long m_index;
+ long long m_timecode;
+ TrackPosition* m_track_positions;
+ size_t m_track_positions_count;
+};
+
+class Cues {
+ friend class Segment;
+
+ Cues(Segment*, long long start, long long size, long long element_start,
+ long long element_size);
+ ~Cues();
+
+ Cues(const Cues&);
+ Cues& operator=(const Cues&);
+
+ public:
+ Segment* const m_pSegment;
+ const long long m_start;
+ const long long m_size;
+ const long long m_element_start;
+ const long long m_element_size;
+
+ bool Find( // lower bound of time_ns
+ long long time_ns, const Track*, const CuePoint*&,
+ const CuePoint::TrackPosition*&) const;
+
+ const CuePoint* GetFirst() const;
+ const CuePoint* GetLast() const;
+ const CuePoint* GetNext(const CuePoint*) const;
+
+ const BlockEntry* GetBlock(const CuePoint*,
+ const CuePoint::TrackPosition*) const;
+
+ bool LoadCuePoint() const;
+ long GetCount() const; // loaded only
+ // long GetTotal() const; //loaded + preloaded
+ bool DoneParsing() const;
+
+ private:
+ bool Init() const;
+ bool PreloadCuePoint(long&, long long) const;
+
+ mutable CuePoint** m_cue_points;
+ mutable long m_count;
+ mutable long m_preload_count;
+ mutable long long m_pos;
+};
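
A minimal sketch of cue-based seeking with the API above, assuming segment and track are already loaded; error handling is abbreviated:

// Uses the Cues index to locate a block near time_ns for the given track.
const mkvparser::BlockEntry* SeekTo(const mkvparser::Segment* segment,
                                    const mkvparser::Track* track,
                                    long long time_ns) {
  const mkvparser::Cues* const cues = segment->GetCues();
  if (cues == NULL)
    return NULL;
  while (!cues->DoneParsing())  // cue points are loaded lazily
    cues->LoadCuePoint();
  const mkvparser::CuePoint* cp;
  const mkvparser::CuePoint::TrackPosition* tp;
  if (!cues->Find(time_ns, track, cp, tp))
    return NULL;
  return cues->GetBlock(cp, tp);
}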
+
+class Cluster {
+ friend class Segment;
+
+ Cluster(const Cluster&);
+ Cluster& operator=(const Cluster&);
+
+ public:
+ Segment* const m_pSegment;
+
+ public:
+ static Cluster* Create(Segment*,
+ long index, // index in segment
+ long long off); // offset relative to segment
+ // long long element_size);
+
+ Cluster(); // EndOfStream
+ ~Cluster();
+
+ bool EOS() const;
+
+ long long GetTimeCode() const; // absolute, but not scaled
+ long long GetTime() const; // absolute, and scaled (nanosecond units)
+ long long GetFirstTime() const; // time (ns) of first (earliest) block
+ long long GetLastTime() const; // time (ns) of last (latest) block
+
+ long GetFirst(const BlockEntry*&) const;
+ long GetLast(const BlockEntry*&) const;
+ long GetNext(const BlockEntry* curr, const BlockEntry*& next) const;
+
+ const BlockEntry* GetEntry(const Track*, long long ns = -1) const;
+ const BlockEntry* GetEntry(const CuePoint&,
+ const CuePoint::TrackPosition&) const;
+ // const BlockEntry* GetMaxKey(const VideoTrack*) const;
+
+ // static bool HasBlockEntries(const Segment*, long long);
+
+ static long HasBlockEntries(const Segment*, long long idoff, long long& pos,
+ long& size);
+
+ long GetEntryCount() const;
+
+ long Load(long long& pos, long& size) const;
+
+ long Parse(long long& pos, long& size) const;
+ long GetEntry(long index, const mkvparser::BlockEntry*&) const;
+
+ protected:
+ Cluster(Segment*, long index, long long element_start);
+ // long long element_size);
+
+ public:
+ const long long m_element_start;
+ long long GetPosition() const; // offset relative to segment
+
+ long GetIndex() const;
+ long long GetElementSize() const;
+ // long long GetPayloadSize() const;
+
+ // long long Unparsed() const;
+
+ private:
+ long m_index;
+ mutable long long m_pos;
+ // mutable long long m_size;
+ mutable long long m_element_size;
+ mutable long long m_timecode;
+ mutable BlockEntry** m_entries;
+ mutable long m_entries_size;
+ mutable long m_entries_count;
+
+ long ParseSimpleBlock(long long, long long&, long&);
+ long ParseBlockGroup(long long, long long&, long&);
+
+ long CreateBlock(long long id, long long pos, long long size,
+ long long discard_padding);
+ long CreateBlockGroup(long long start_offset, long long size,
+ long long discard_padding);
+ long CreateSimpleBlock(long long, long long);
+};
+
+class Segment {
+ friend class Cues;
+ friend class Track;
+ friend class VideoTrack;
+
+ Segment(const Segment&);
+ Segment& operator=(const Segment&);
+
+ private:
+ Segment(IMkvReader*, long long elem_start,
+ // long long elem_size,
+ long long pos, long long size);
+
+ public:
+ IMkvReader* const m_pReader;
+ const long long m_element_start;
+ // const long long m_element_size;
+ const long long m_start; // posn of segment payload
+ const long long m_size; // size of segment payload
+ Cluster m_eos; // TODO: make private?
+
+ static long long CreateInstance(IMkvReader*, long long, Segment*&);
+ ~Segment();
+
+ long Load(); // loads headers and all clusters
+
+ // for incremental loading
+ // long long Unparsed() const;
+ bool DoneParsing() const;
+ long long ParseHeaders(); // stops when first cluster is found
+ // long FindNextCluster(long long& pos, long& size) const;
+ long LoadCluster(long long& pos, long& size); // load one cluster
+ long LoadCluster();
+
+ long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos,
+ long& size);
+
+ const SeekHead* GetSeekHead() const;
+ const Tracks* GetTracks() const;
+ const SegmentInfo* GetInfo() const;
+ const Cues* GetCues() const;
+ const Chapters* GetChapters() const;
+ const Tags* GetTags() const;
+
+ long long GetDuration() const;
+
+ unsigned long GetCount() const;
+ const Cluster* GetFirst() const;
+ const Cluster* GetLast() const;
+ const Cluster* GetNext(const Cluster*);
+
+ const Cluster* FindCluster(long long time_nanoseconds) const;
+ // const BlockEntry* Seek(long long time_nanoseconds, const Track*) const;
+
+ const Cluster* FindOrPreloadCluster(long long pos);
+
+ long ParseCues(long long cues_off, // offset relative to start of segment
+ long long& parse_pos, long& parse_len);
+
+ private:
+ long long m_pos; // absolute file posn; what has been consumed so far
+ Cluster* m_pUnknownSize;
+
+ SeekHead* m_pSeekHead;
+ SegmentInfo* m_pInfo;
+ Tracks* m_pTracks;
+ Cues* m_pCues;
+ Chapters* m_pChapters;
+ Tags* m_pTags;
+ Cluster** m_clusters;
+ long m_clusterCount; // number of entries for which m_index >= 0
+ long m_clusterPreloadCount; // number of entries for which m_index < 0
+ long m_clusterSize; // array size
+
+ long DoLoadCluster(long long&, long&);
+ long DoLoadClusterUnknownSize(long long&, long&);
+ long DoParseNext(const Cluster*&, long long&, long&);
+
+ bool AppendCluster(Cluster*);
+ bool PreloadCluster(Cluster*, ptrdiff_t);
+
+ // void ParseSeekHead(long long pos, long long size);
+ // void ParseSeekEntry(long long pos, long long size);
+ // void ParseCues(long long);
+
+ const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&);
+};
+
+} // namespace mkvparser
+
+inline long mkvparser::Segment::LoadCluster() {
+ long long pos;
+ long size;
+
+ return LoadCluster(pos, size);
+}
+
+#endif // MKVPARSER_MKVPARSER_H_
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc b/third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc
new file mode 100644
index 000000000..23d68f508
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvreader.cc
@@ -0,0 +1,133 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "mkvparser/mkvreader.h"
+
+#include <sys/types.h>
+
+#include <cassert>
+
+namespace mkvparser {
+
+MkvReader::MkvReader() : m_file(NULL), reader_owns_file_(true) {}
+
+MkvReader::MkvReader(FILE* fp) : m_file(fp), reader_owns_file_(false) {
+ GetFileSize();
+}
+
+MkvReader::~MkvReader() {
+ if (reader_owns_file_)
+ Close();
+ m_file = NULL;
+}
+
+int MkvReader::Open(const char* fileName) {
+ if (fileName == NULL)
+ return -1;
+
+ if (m_file)
+ return -1;
+
+#ifdef _MSC_VER
+ const errno_t e = fopen_s(&m_file, fileName, "rb");
+
+ if (e)
+ return -1; // error
+#else
+ m_file = fopen(fileName, "rb");
+
+ if (m_file == NULL)
+ return -1;
+#endif
+ return !GetFileSize();
+}
+
+bool MkvReader::GetFileSize() {
+ if (m_file == NULL)
+ return false;
+#ifdef _MSC_VER
+ int status = _fseeki64(m_file, 0L, SEEK_END);
+
+ if (status)
+ return false; // error
+
+ m_length = _ftelli64(m_file);
+#else
+ fseek(m_file, 0L, SEEK_END);
+ m_length = ftell(m_file);
+#endif
+ assert(m_length >= 0);
+
+ if (m_length < 0)
+ return false;
+
+#ifdef _MSC_VER
+ status = _fseeki64(m_file, 0L, SEEK_SET);
+
+ if (status)
+ return false; // error
+#else
+ fseek(m_file, 0L, SEEK_SET);
+#endif
+
+ return true;
+}
+
+void MkvReader::Close() {
+ if (m_file != NULL) {
+ fclose(m_file);
+ m_file = NULL;
+ }
+}
+
+int MkvReader::Length(long long* total, long long* available) {
+ if (m_file == NULL)
+ return -1;
+
+ if (total)
+ *total = m_length;
+
+ if (available)
+ *available = m_length;
+
+ return 0;
+}
+
+int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
+ if (m_file == NULL)
+ return -1;
+
+ if (offset < 0)
+ return -1;
+
+ if (len < 0)
+ return -1;
+
+ if (len == 0)
+ return 0;
+
+ if (offset >= m_length)
+ return -1;
+
+#ifdef _MSC_VER
+ const int status = _fseeki64(m_file, offset, SEEK_SET);
+
+ if (status)
+ return -1; // error
+#else
+ fseeko(m_file, static_cast<off_t>(offset), SEEK_SET);
+#endif
+
+ const size_t size = fread(buffer, 1, len, m_file);
+
+ if (size < size_t(len))
+ return -1; // error
+
+ return 0; // success
+}
+
+} // namespace mkvparser
diff --git a/third_party/aom/third_party/libwebm/mkvparser/mkvreader.h b/third_party/aom/third_party/libwebm/mkvparser/mkvreader.h
new file mode 100644
index 000000000..9831ecf64
--- /dev/null
+++ b/third_party/aom/third_party/libwebm/mkvparser/mkvreader.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVPARSER_MKVREADER_H_
+#define MKVPARSER_MKVREADER_H_
+
+#include <cstdio>
+
+#include "mkvparser/mkvparser.h"
+
+namespace mkvparser {
+
+class MkvReader : public IMkvReader {
+ public:
+ MkvReader();
+ explicit MkvReader(FILE* fp);
+ virtual ~MkvReader();
+
+ int Open(const char*);
+ void Close();
+
+ virtual int Read(long long position, long length, unsigned char* buffer);
+ virtual int Length(long long* total, long long* available);
+
+ private:
+ MkvReader(const MkvReader&);
+ MkvReader& operator=(const MkvReader&);
+
+ // Determines the size of the file. This is called either by the constructor
+ // or by the Open function depending on file ownership. Returns true on
+ // success.
+ bool GetFileSize();
+
+ long long m_length;
+ FILE* m_file;
+ bool reader_owns_file_;
+};
+
+} // namespace mkvparser
+
+#endif // MKVPARSER_MKVREADER_H_
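
A minimal end-to-end sketch tying the reader and parser headers together, assuming a caller-supplied file path; return-code handling is simplified and the function name is illustrative only:

#include "mkvparser/mkvparser.h"
#include "mkvparser/mkvreader.h"

// Opens a WebM/Matroska file, loads it, and walks the cluster timestamps.
int DumpClusterTimes(const char* path) {
  mkvparser::MkvReader reader;
  if (reader.Open(path) != 0)
    return -1;

  long long pos = 0;
  mkvparser::EBMLHeader ebml;
  if (ebml.Parse(&reader, pos) < 0)  // leaves pos at the Segment element
    return -1;

  mkvparser::Segment* segment = NULL;
  if (mkvparser::Segment::CreateInstance(&reader, pos, segment) != 0)
    return -1;

  if (segment->Load() < 0) {  // parses headers and all clusters
    delete segment;
    return -1;
  }

  for (const mkvparser::Cluster* c = segment->GetFirst();
       c != NULL && !c->EOS(); c = segment->GetNext(c)) {
    const long long time_ns = c->GetTime();  // absolute, scaled to nanoseconds
    (void)time_ns;
  }

  delete segment;
  return 0;
}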
diff --git a/third_party/aom/third_party/libyuv/README.libaom b/third_party/aom/third_party/libyuv/README.libaom
new file mode 100644
index 000000000..09693c1f2
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/README.libaom
@@ -0,0 +1,15 @@
+Name: libyuv
+URL: http://code.google.com/p/libyuv/
+Version: 1456
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes YUV conversion and scaling
+functionality.
+
+The optimized scaler in libyuv is used in the multiple-resolution encoder
+example, which down-samples the original input video (e.g. 1280x720) a number
+of times in order to encode multiple-resolution bit streams.
+
+Local Modifications:
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/basic_types.h b/third_party/aom/third_party/libyuv/include/libyuv/basic_types.h
new file mode 100644
index 000000000..66e68536c
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/basic_types.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+
+#include <stddef.h> // for NULL, size_t
+
+#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <sys/types.h> // for uintptr_t on x86
+#else
+#include <stdint.h> // for uintptr_t
+#endif
+
+#ifndef GG_LONGLONG
+#ifndef INT_TYPES_DEFINED
+#define INT_TYPES_DEFINED
+#ifdef COMPILER_MSVC
+typedef unsigned __int64 uint64;
+typedef __int64 int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## I64
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UI64
+#endif
+#define INT64_F "I64"
+#else // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64; // NOLINT
+typedef long int64; // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64; // NOLINT
+typedef long long int64; // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## LL
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## ULL
+#endif
+#define INT64_F "ll"
+#endif // __LP64__
+#endif // COMPILER_MSVC
+typedef unsigned int uint32;
+typedef int int32;
+typedef unsigned short uint16; // NOLINT
+typedef short int16; // NOLINT
+typedef unsigned char uint8;
+typedef signed char int8;
+#endif // INT_TYPES_DEFINED
+#endif // GG_LONGLONG
+
+// Detect compiler is for x86 or x64.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86)
+#define CPU_X86 1
+#endif
+// Detect compiler is for ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
+
+#ifndef ALIGNP
+#ifdef __cplusplus
+#define ALIGNP(p, t) \
+ (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+ ((t) - 1)) & ~((t) - 1))))
+#else
+#define ALIGNP(p, t) \
+ ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */
+#endif
+#endif
+
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+ defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__ ((visibility ("default")))
+#else
+#define LIBYUV_API
+#endif // __GNUC__
+#endif // LIBYUV_API
+
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+
+// Visual C x86 or GCC little endian.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86) || \
+ defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/compare.h b/third_party/aom/third_party/libyuv/include/libyuv/compare.h
new file mode 100644
index 000000000..2a9f1560c
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/compare.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+ const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT
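
A short usage sketch for the comparison helpers above, assuming two caller-managed 8-bit planes of identical dimensions:

// PSNR in dB between two single planes of the same width/height.
double PlanePsnr(const uint8* a, int stride_a, const uint8* b, int stride_b,
                 int width, int height) {
  const uint64 sse = libyuv::ComputeSumSquareErrorPlane(a, stride_a,
                                                        b, stride_b,
                                                        width, height);
  return libyuv::SumSquareErrorToPsnr(
      sse, static_cast<uint64>(width) * static_cast<uint64>(height));
}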
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/convert.h b/third_party/aom/third_party/libyuv/include/libyuv/convert.h
new file mode 100644
index 000000000..d6f206c10
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/convert.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following header includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I411 to I420.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+#define J400ToJ420 I400ToI420
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample, size_t sample_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+ int* width, int* height);
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc, e.g. 'I420', 'YUY2'.
+// Returns 0 on success, -1 for an invalid parameter; other non-zero values
+// indicate failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame, size_t src_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 format);
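
One hedged example of driving ConvertToI420 for a packed YUY2 source with no cropping, rotation, or flip; buffer allocation is the caller's responsibility, and the FOURCC is built by hand here rather than via libyuv's FOURCC macro:

// Converts a full YUY2 frame to I420 (no crop, no rotation, no flip).
int Yuy2FrameToI420(const uint8* src, size_t src_size, int width, int height,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const uint32 fourcc_yuy2 =
      'Y' | ('U' << 8) | ('Y' << 16) | (static_cast<uint32>('2') << 24);
  return libyuv::ConvertToI420(src, src_size,
                               dst_y, width,            // Y stride
                               dst_u, (width + 1) / 2,  // U stride
                               dst_v, (width + 1) / 2,  // V stride
                               0, 0,                    // crop_x, crop_y
                               width, height,           // source size
                               width, height,           // crop size (full)
                               libyuv::kRotate0, fourcc_yuy2);
}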
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h
new file mode 100644
index 000000000..ea75c0b26
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/convert_argb.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following header includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert J400 (jpeg grey) to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+//   If rotation of 90 or 270 is used, stride is affected. The caller should
+//   allocate the destination ARGB buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc, e.g. 'I420', 'YUY2'.
+// Returns 0 on success, -1 for an invalid parameter; other non-zero values
+// indicate failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame, size_t src_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 format);
+
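+// Illustrative usage sketch (not part of the upstream API surface): center-crop
+// a 1280x720 YUY2 camera frame to 640x360 and convert it to ARGB. Here sample /
+// sample_size are the raw capture buffer and its size, and FOURCC_YUY2 is
+// assumed to come from libyuv/video_common.h; the buffer sizes are example
+// assumptions only.
+//
+//   uint8 dst_argb[640 * 360 * 4];
+//   int ret = ConvertToARGB(sample, sample_size,
+//                           dst_argb, 640 * 4,   // 4 bytes per ARGB pixel
+//                           (1280 - 640) / 2,    // crop_x (centered)
+//                           (720 - 360) / 2,     // crop_y (centered)
+//                           1280, 720,           // src_width, src_height
+//                           640, 360,            // crop_width, crop_height
+//                           kRotate0, FOURCC_YUY2);
+//   // ret == 0 on success.
+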
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/convert_from.h b/third_party/aom/third_party/libyuv/include/libyuv/convert_from.h
new file mode 100644
index 000000000..3591b4fd6
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/convert_from.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See also convert.h for conversions from other formats to I420.
+
+// I420Copy (in convert.h) is the equivalent of I420ToI420.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// TODO(fbarchard): I420ToM420
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in the dither matrix from 0 to 7 are recommended.
+// The matrix is stored in row-major order; the first byte is the upper left entry.
+
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ const uint8* dither4x4, int width, int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 format);
+
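+// Illustrative usage sketch (assumption: FOURCC_ARGB is taken from
+// libyuv/video_common.h and the I420 planes y/u/v are already populated).
+// Convert a 640x360 I420 frame to a packed ARGB buffer:
+//
+//   uint8 argb[640 * 360 * 4];
+//   int ret = ConvertFromI420(y, 640,          // Y plane and its stride
+//                             u, 320, v, 320,  // U/V planes and strides
+//                             argb, 640 * 4,   // dst_sample, dst_sample_stride
+//                             640, 360, FOURCC_ARGB);
+//   // ret == 0 on success; dst_sample_stride may be 0 for contiguous rows.
+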
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h
new file mode 100644
index 000000000..4a6226813
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height);
+
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb, int dst_stride_rgb,
+ int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
+// Values in the dither matrix from 0 to 7 are recommended.
+// The matrix is stored in row-major order; the first byte is the upper left entry.
+// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
+// const uint8(*dither)[4][4];
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither4x4, int width, int height);
+
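+// Illustrative sketch: one possible 4x4 ordered-dither matrix with values in
+// the recommended 0..7 range (row-major, first byte upper left). The exact
+// values are an example choice, not a matrix mandated by this API.
+//
+//   static const uint8 kDither4x4[16] = {
+//     0, 4, 1, 5,
+//     6, 2, 7, 3,
+//     1, 5, 0, 4,
+//     7, 3, 6, 2
+//   };
+//   ARGBToRGB565Dither(src_argb, src_stride_argb,
+//                      dst_rgb565, dst_stride_rgb565,
+//                      kDither4x4, width, height);
+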
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
+LIBYUV_API
+int ARGBToG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_g, int dst_stride_g,
+ int width, int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h
new file mode 100644
index 000000000..870e94e8c
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/cpu_id.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
+#define INCLUDE_LIBYUV_CPU_ID_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Consider overlapping bits for different architectures.
+// Internal flag to indicate cpuid requires initialization.
+#define kCpuInit 0x1
+
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
+
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
+
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x10000;
+static const int kCpuHasMIPS_DSP = 0x20000;
+static const int kCpuHasMIPS_DSPR2 = 0x40000;
+
+// Internal function used to auto-init.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+
+// Detect whether the CPU has SSE2 etc.
+// The test_flag parameter should be one of the kCpuHas constants above.
+// Returns non-zero if the instruction set is detected.
+static __inline int TestCpuFlag(int test_flag) {
+ LIBYUV_API extern int cpu_info_;
+ return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
+}
+
+// For testing, allow CPU flags to be disabled.
+// e.g. MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
+LIBYUV_API
+void MaskCpuFlags(int enable_flags);
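+
+// Illustrative sketch of how these helpers combine (the flag constants come
+// from this header; the control flow is an example only):
+//
+//   if (TestCpuFlag(kCpuHasSSSE3)) {
+//     // Safe to take an SSSE3-optimized code path.
+//   }
+//   MaskCpuFlags(~kCpuHasAVX2);  // e.g. disable AVX2 paths while testing.
+//   MaskCpuFlags(-1);            // re-enable everything that was detected.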
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 000000000..fa1e51f9a
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+ kJpegYuv420,
+ kJpegYuv422,
+ kJpegYuv411,
+ kJpegYuv444,
+ kJpegYuv400,
+ kJpegUnknown
+};
+
+struct Buffer {
+ const uint8* data;
+ int len;
+};
+
+struct BufferVector {
+ Buffer* buffers;
+ int len;
+ int pos;
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed Huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class LIBYUV_API MJpegDecoder {
+ public:
+ typedef void (*CallbackFunction)(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows);
+
+ static const int kColorSpaceUnknown;
+ static const int kColorSpaceGrayscale;
+ static const int kColorSpaceRgb;
+ static const int kColorSpaceYCbCr;
+ static const int kColorSpaceCMYK;
+ static const int kColorSpaceYCCK;
+
+ MJpegDecoder();
+ ~MJpegDecoder();
+
+ // Loads a new frame, reads its headers, and determines the uncompressed
+ // image format.
+ // Returns LIBYUV_TRUE if image looks valid and format is supported.
+ // If return value is LIBYUV_TRUE, then the values for all the following
+ // getters are populated.
+ // src_len is the size of the compressed mjpeg frame in bytes.
+ LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+
+ // Returns width of the last loaded frame in pixels.
+ int GetWidth();
+
+ // Returns height of the last loaded frame in pixels.
+ int GetHeight();
+
+ // Returns format of the last loaded frame. The return value is one of the
+ // kColorSpace* constants.
+ int GetColorSpace();
+
+ // Number of color components in the color space.
+ int GetNumComponents();
+
+ // Sample factors of the n-th component.
+ int GetHorizSampFactor(int component);
+
+ int GetVertSampFactor(int component);
+
+ int GetHorizSubSampFactor(int component);
+
+ int GetVertSubSampFactor(int component);
+
+ // Public for testability.
+ int GetImageScanlinesPerImcuRow();
+
+ // Public for testability.
+ int GetComponentScanlinesPerImcuRow(int component);
+
+ // Width of a component in bytes.
+ int GetComponentWidth(int component);
+
+ // Height of a component.
+ int GetComponentHeight(int component);
+
+ // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+ int GetComponentStride(int component);
+
+ // Size of a component in bytes.
+ int GetComponentSize(int component);
+
+ // Call this after LoadFrame() if you decide you don't want to decode it
+ // after all.
+ LIBYUV_BOOL UnloadFrame();
+
+ // Decodes the entire image into a one-buffer-per-color-component format.
+  // dst_width must match exactly. dst_height must be <= the image height; if
+ // less, the image is cropped. "planes" must have size equal to at least
+ // GetNumComponents() and they must point to non-overlapping buffers of size
+ // at least GetComponentSize(i). The pointers in planes are incremented
+ // to point to after the end of the written data.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+ // Decodes the entire image and passes the data via repeated calls to a
+ // callback function. Each call will get the data for a whole number of
+ // image scanlines.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height);
+
+ // The helper function which recognizes the jpeg sub-sampling type.
+ static JpegSubsamplingType JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components);
+
+ private:
+ void AllocOutputBuffers(int num_outbufs);
+ void DestroyOutputBuffers();
+
+ LIBYUV_BOOL StartDecode();
+ LIBYUV_BOOL FinishDecode();
+
+ void SetScanlinePointers(uint8** data);
+ LIBYUV_BOOL DecodeImcuRow();
+
+ int GetComponentScanlinePadding(int component);
+
+ // A buffer holding the input data for a frame.
+ Buffer buf_;
+ BufferVector buf_vec_;
+
+ jpeg_decompress_struct* decompress_struct_;
+ jpeg_source_mgr* source_mgr_;
+ SetJmpErrorMgr* error_mgr_;
+
+ // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
+ // GetComponentScanlinePadding() != 0.)
+ LIBYUV_BOOL has_scanline_padding_;
+
+ // Temporaries used to point to scanline outputs.
+ int num_outbufs_; // Outermost size of all arrays below.
+ uint8*** scanlines_;
+ int* scanlines_sizes_;
+ // Temporary buffer used for decoding when we can't decode directly to the
+ // output buffers. Large enough for just one iMCU row.
+ uint8** databuf_;
+ int* databuf_strides_;
+};
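+
+// Illustrative usage sketch (buffer handling is an assumption for the example,
+// and a YCbCr frame with GetNumComponents() == 3 is assumed; see also
+// MJPGToI420() in convert.h for a simpler one-call API):
+//
+//   MJpegDecoder decoder;
+//   if (decoder.LoadFrame(mjpg_data, mjpg_size)) {
+//     int w = decoder.GetWidth();
+//     int h = decoder.GetHeight();
+//     // Allocate one buffer per component (at least GetComponentSize(i) bytes
+//     // each), then decode; the plane pointers are advanced past the output.
+//     uint8* planes[3] = { y_buf, u_buf, v_buf };  // example buffers
+//     if (!decoder.DecodeToBuffers(planes, w, h)) {
+//       // Decode failed; nothing further to do with this frame.
+//     }
+//   }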
+
+} // namespace libyuv
+
+#endif // __cplusplus
+#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h
new file mode 100644
index 000000000..7fe4d8eed
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/planar_functions.h
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "libyuv/basic_types.h"
+
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+ uint16* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value);
+
+// Copy I400. Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+#define J400ToJ400 I400ToI400
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror. A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y, int width, int height,
+ int value_y, int value_u, int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height, uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Apply a 4x4 color matrix to each ARGB pixel.
+// matrix_argb is 16 signed coefficients (4 rows of 4); -128 to 127 represents -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int width, int height);
+
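+// Worked example (illustrative): with coefficients in the range -128..127
+// representing -2..2, a value of 64 corresponds to 1.0, so the identity
+// matrix that leaves pixels unchanged is:
+//
+//   static const int8 kIdentityMatrix[16] = {
+//     64,  0,  0,  0,   // B_out = 1.0 * B
+//      0, 64,  0,  0,   // G_out = 1.0 * G
+//      0,  0, 64,  0,   // R_out = 1.0 * R
+//      0,  0,  0, 64    // A_out = 1.0 * A
+//   };
+//   ARGBColorMatrix(src_argb, src_stride_argb,
+//                   dst_argb, dst_stride_argb,
+//                   kIdentityMatrix, width, height);
+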
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a 3x4 color matrix to each ARGB pixel.
+// matrix_rgb is 12 signed coefficients (3 rows of 4); -128 to 127 represents -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_rgb,
+ int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int x, int y, int width, int height);
+
+// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* luma_rgb_table,
+ int width, int height);
+
+// Apply a cubic polynomial (constant, linear, squared and cubed terms) to ARGB values.
+// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
+// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared. The 4th row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and the
+// result is clamped to 0 to 255.
+// A polynomial approximation can be derived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const float* poly,
+ int width, int height);
+
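+// Worked example (illustrative, assuming the row layout described above): the
+// identity polynomial value = 0 + 1*x + 0*x^2 + 0*x^3 leaves every channel
+// unchanged.
+//
+//   static const float kIdentityPoly[16] = {
+//     0.f, 0.f, 0.f, 0.f,   // constants for b, g, r, a
+//     1.f, 1.f, 1.f, 1.f,   // linear coefficients
+//     0.f, 0.f, 0.f, 0.f,   // squared coefficients
+//     0.f, 0.f, 0.f, 0.f    // cubed coefficients
+//   };
+//   ARGBPolynomial(src_argb, src_stride_argb,
+//                  dst_argb, dst_stride_argb,
+//                  kIdentityPoly, width, height);
+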
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int x, int y, int width, int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Copy the alpha channel of ARGB to the alpha channel of the destination ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Copy a Y plane into the alpha channel of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert MJPG to ARGB.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h, int dw, int dh);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height);
+
+// Blur ARGB image.
+// dst_cumsum is a table of width * (height + 1) * 16 bytes, aligned to a
+// 16 byte boundary.
+// dst_stride32_cumsum is the number of ints in a row (width * 4).
+// radius is the number of pixels around the center, e.g. 1 = 3x3, 2 = 5x5.
+// Blur is optimized for a radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius);
+
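+// Illustrative sketch: blur with radius 2 (a 5x5 window). The cumulative-sum
+// buffer sizing follows the comment above; align_buffer_64 from row.h is one
+// way to get the required alignment (an assumption for this example).
+//
+//   // width * (height + 1) entries of 4 ints (16 bytes) each.
+//   align_buffer_64(cumsum, width * (height + 1) * 16);
+//   ARGBBlur(src_argb, src_stride_argb,
+//            dst_argb, dst_stride_argb,
+//            (int32*)cumsum, width * 4,   // dst_stride32_cumsum = ints per row
+//            width, height, 2);
+//   free_aligned_buffer_64(cumsum);
+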
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, uint32 value);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as an 8 bit fraction where 0 means 100% src_argb0
+// and 255 means (nearly) 100% src_argb1.
+// Internally uses ARGBScale bilinear filtering.
+// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation);
+
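+// Illustrative sketch: a 50/50 cross-fade between two same-sized ARGB images
+// (interpolation = 128 out of 255). Note the caveat above about writing up to
+// 16 bytes past the end of dst_argb.
+//
+//   ARGBInterpolate(frame_a, stride_a,
+//                   frame_b, stride_b,
+//                   dst_argb, dst_stride_argb,
+//                   width, height, 128);
+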
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+
+// Row function for copying pixels from a source with a slope to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* shuffler, int width, int height);
+
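+// Illustrative sketch: reverse the byte order within each pixel, which maps
+// BGRA (A, R, G, B in memory) to ARGB (B, G, R, A in memory). The shuffler
+// gives, for each destination byte, the source byte index within a 16 byte
+// (4 pixel) group; remember that the real table must be 16 byte aligned as
+// noted above. The mask values are an example choice.
+//
+//   static const uint8 kShuffleBGRAToARGB[16] = {
+//     3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+//   };
+//   ARGBShuffle(src_bgra, src_stride_bgra,
+//               dst_argb, dst_stride_argb,
+//               kShuffleBGRAToARGB, width, height);
+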
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/rotate.h b/third_party/aom/third_party/libyuv/include/libyuv/rotate.h
new file mode 100644
index 000000000..8a9673f28
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/rotate.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation.
+typedef enum RotationMode {
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
+
+ // Deprecated.
+ kRotateNone = 0,
+ kRotateClockwise = 90,
+ kRotateCounterClockwise = 270,
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height, enum RotationMode mode);
+
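+// Illustrative sketch: rotate a 640x360 I420 frame 90 degrees clockwise. After
+// a 90 or 270 degree rotation the destination dimensions are swapped, so the
+// destination strides below follow the rotated 360x640 layout (buffer
+// allocation is an assumption for the example).
+//
+//   I420Rotate(src_y, 640, src_u, 320, src_v, 320,
+//              dst_y, 360, dst_u, 180, dst_v, 180,
+//              640, 360, kRotate90);
+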
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height, enum RotationMode mode);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int src_width, int src_height, enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+// The 90 and 270 degree functions are based on transposes.
+// Doing a transpose while reversing the read/write order
+// results in a rotation by +/- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h b/third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h
new file mode 100644
index 000000000..2bdc8ec6b
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h" // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height, enum RotationMode mode);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h b/third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h
new file mode 100644
index 000000000..d0bfbdd2b
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/rotate_row.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+ defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// TODO(fbarchard): switch to standard form of inline; fails on clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".private_extern _" #name " \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#else
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+#name ": \n"
+#endif
+#endif
+
+// The following are available for Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
+#define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSEWX8_SSSE3
+#endif
+
+// The following are available for 32 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+// The following are available for 64 bit GCC but not NaCL:
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+ defined(__x86_64__)
+#define HAS_TRANSPOSEWX8_FAST_SSSE3
+#define HAS_TRANSPOSEUVWX8_SSE2
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSEWX8_NEON
+#define HAS_TRANSPOSEUVWX8_NEON
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+ defined(__mips__) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSEWX8_MIPS_DSPR2
+#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2
+#endif // defined(__mips__)
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width, int height);
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/row.h b/third_party/aom/third_party/libyuv/include/libyuv/row.h
new file mode 100644
index 000000000..5c3187ef7
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/row.h
@@ -0,0 +1,1857 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include <stdlib.h> // For malloc.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+
+#ifdef __cplusplus
+#define align_buffer_64(var, size) \
+ uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63)); \
+ uint8* var = reinterpret_cast<uint8*> \
+ ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
+#else
+#define align_buffer_64(var, size) \
+ uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \
+ uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+#endif
+
+#define free_aligned_buffer_64(var) \
+ free(var##_mem); \
+ var = 0
+
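+// Illustrative sketch: allocate a 64 byte aligned scratch row, use it, then
+// release it. The macro pair declares both "row" and the backing "row_mem"
+// pointer, so the two calls must appear in the same scope (kMaxWidth is an
+// example constant, not part of this header).
+//
+//   align_buffer_64(row, kMaxWidth * 4);
+//   // ... use "row" as a 64 byte aligned uint8 buffer ...
+//   free_aligned_buffer_64(row);
+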
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+// True if compiling for SSSE3 as a requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif // clang >= 3.5
+#endif // __clang__
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions:
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSETROW_X86
+#define HAS_ARGBSHUFFLEROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV422ROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_SSE2
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORAWROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_J400TOARGBROW_SSE2
+#define HAS_J422TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORROW_UV_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB565ROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_ERMS
+#define HAS_SETROW_X86
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+#endif
+
+// The following are available on x64 Visual C and clangcl.
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
+ (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif // GNUC >= 4.7
+#endif // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+ defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following require VS2012; porting to GCC is still pending.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGB1555TOARGBROW_AVX2
+#define HAS_ARGB4444TOARGBROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_SSE2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_I411TOARGBROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I444TOARGBROW_AVX2
+#define HAS_J400TOARGBROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_RGB565TOARGBROW_AVX2
+#endif
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+ defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_COPYROW_AVX
+#define HAS_I400TOARGBROW_AVX2
+#define HAS_I422TOABGRROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
+#define HAS_I422TORAWROW_AVX2
+#define HAS_I422TORGB24ROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOARGBROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects:
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#endif
+
+// The following are disabled when SSSE3 is available:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+ !defined(LIBYUV_SSSE3_ONLY)
+#define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_ARGBBLENDROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUV422ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_J400TOARGBROW_NEON
+#define HAS_I411TOARGBROW_NEON
+#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_ARGBSETROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_ARGBTORGB565DITHERROW_NEON
+
+// Effects:
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELYROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+#define HAS_COPYROW_MIPS
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_I422TOABGRROW_MIPS_DSPR2
+#define HAS_I422TOARGBROW_MIPS_DSPR2
+#define HAS_I422TOBGRAROW_MIPS_DSPR2
+#define HAS_INTERPOLATEROW_MIPS_DSPR2
+#define HAS_MIRRORROW_MIPS_DSPR2
+#define HAS_MIRRORUVROW_MIPS_DSPR2
+#define HAS_SPLITUVROW_MIPS_DSPR2
+#endif
+#endif
+
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define SIMD_ALIGNED32(var) __declspec(align(64)) var
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) uint32 uvec32[4];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(32)) int16 lvec16[16];
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+#elif defined(__GNUC__)
+// Caveat: GCC 4.2 to 4.7 have a known issue using vectors with const.
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(32))) lvec16;
+typedef int32 __attribute__((vector_size(32))) lvec32;
+typedef int8 __attribute__((vector_size(32))) lvec8;
+typedef uint16 __attribute__((vector_size(32))) ulvec16;
+typedef uint32 __attribute__((vector_size(32))) ulvec32;
+typedef uint8 __attribute__((vector_size(32))) ulvec8;
+#else
+#define SIMD_ALIGNED(var) var
+#define SIMD_ALIGNED32(var) var
+typedef int16 vec16[8];
+typedef int32 vec32[4];
+typedef int8 vec8[16];
+typedef uint16 uvec16[8];
+typedef uint32 uvec32[4];
+typedef uint8 uvec8[16];
+typedef int16 lvec16[16];
+typedef int32 lvec32[8];
+typedef int8 lvec8[32];
+typedef uint16 ulvec16[16];
+typedef uint32 ulvec32[8];
+typedef uint8 ulvec8[32];
+#endif
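+
+// Illustrative usage sketch: SIMD_ALIGNED and the vector typedefs above are
+// for declaring scratch buffers and constants with the alignment the SSE/AVX
+// row functions expect (names below are examples only):
+//
+//   SIMD_ALIGNED(uint8 row[1024]);    // 16-byte aligned scratch row.
+//   static const uvec8 kZero = {0};   // 16 zero bytes, 16-byte aligned.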
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+// NaCl macros for GCC x86 and x64.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else
+#define LABELALIGN
+#endif
+#if defined(__native_client__) && defined(__x86_64__)
+// r14 is used for MEMOP macros.
+#define NACL_R14 "r14",
+#define BUNDLELOCK ".bundle_lock\n"
+#define BUNDLEUNLOCK ".bundle_unlock\n"
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) \
+ #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
+#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%%" #reg "\n" \
+ BUNDLEUNLOCK
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " %%" #reg ",(%%r15,%%r14)\n" \
+ BUNDLEUNLOCK
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%" #arg "\n" \
+ BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
+ BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
+ BUNDLEUNLOCK
+#else // defined(__native_client__) && defined(__x86_64__)
+#define NACL_R14
+#define BUNDLEALIGN
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) \
+ #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMMOVESTRING(s, d)
+#define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
+ #reg2 "\n"
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#endif // defined(__native_client__) && defined(__x86_64__)
+
+#if defined(__arm__) || defined(__aarch64__)
+#undef MEMACCESS
+#if defined(__native_client__)
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
+#else
+#define MEMACCESS(base)
+#endif
+#endif
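+
+// Illustrative sketch: the macros above are spliced into inline-assembly
+// strings so the same source assembles with and without the Native Client
+// sandbox; e.g. a load-and-advance pair written as
+//
+//   "movdqu  " MEMACCESS(0) ",%%xmm0           \n"
+//   "lea     " MEMLEA(0x10, 0) ",%0            \n"
+//
+// expands to "movdqu (%0),%%xmm0" / "lea 0x10(%0),%0" normally, and to the
+// r15-based %%nacl: forms when __native_client__ is defined on x86_64.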
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width);
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width);
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width);
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width);
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_rgb565,
+ int width);
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+
+void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV422Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV411Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+
+void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+
+void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_X86(uint8* dst, uint8 v8, int count);
+void SetRow_ERMS(uint8* dst, uint8 v8, int count);
+void SetRow_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
+void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
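+
+// Illustrative sketch: 'shuffler' is a 16-byte, pshufb-style byte-reorder
+// mask. For example, reversing the byte order of every 4-byte pixel could
+// use (mask values shown for illustration only):
+//
+//   static const uvec8 kShuffleReverse =
+//       {3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
+//   ARGBShuffleRow_C(src_argb, dst_argb, (const uint8*)(&kShuffleReverse), pix);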
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+ int pix);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix);
+
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
+void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB24Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToARGB4444Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width);
+void I422ToARGB1555Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+                         uint8* dst_argb1555,
+ int width);
+void I422ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I444ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB24Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToRAWRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I444ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRAWRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
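+
+// Illustrative note: with pre-attenuated (premultiplied) alpha the per-channel
+// blend is roughly dst = fg + (((256 - fg_alpha) * bg) >> 8), i.e. the usual
+// "over" operator in fixed point; exact rounding is up to each row function.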
+
+// ARGB multiply images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// ARGB add images.
+void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// ARGB subtract images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix);
+
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width);
+
+void I444ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB24Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRAWRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+
+// Effects related row functions.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+ int width);
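+
+// Illustrative note: attenuation premultiplies each color channel by the
+// pixel's alpha (roughly c' = c * a / 255, approximated in fixed point by the
+// SIMD versions); the alpha channel itself is passed through unchanged.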
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern const uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width);
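+
+// Illustrative note: unattenuation is the inverse step; each color channel is
+// scaled by a fixed-point reciprocal of alpha looked up in fixed_invtbl8,
+// which avoids a per-pixel divide.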
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+
+// Used for blur.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width, int source_y_fraction);
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width, int source_y_fraction);
+
+// Sobel images.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+ uint8* dst_sobelx, int width);
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ const uint8* luma, uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT
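The effects declarations above include the attenuate/unattenuate family, which premultiplies (or un-premultiplies) the color channels by alpha. As a hedged illustration only — the function name, the plain-C style, and the >>8 rounding below are assumptions, not the vendored ARGBAttenuateRow_C — the per-pixel work looks roughly like this:

// Illustrative sketch of an attenuate row: multiply B, G, R by alpha.
// Rounding here (>> 8) approximates division by 255 and may differ from
// the real implementation.
static void AttenuateRowSketch(const unsigned char* src_argb,
                               unsigned char* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const unsigned int a = src_argb[3];
    dst_argb[0] = (unsigned char)((src_argb[0] * a) >> 8);  // B
    dst_argb[1] = (unsigned char)((src_argb[1] * a) >> 8);  // G
    dst_argb[2] = (unsigned char)((src_argb[2] * a) >> 8);  // R
    dst_argb[3] = (unsigned char)a;                         // A unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}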
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/scale.h b/third_party/aom/third_party/libyuv/include/libyuv/scale.h
new file mode 100644
index 000000000..3974aba34
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/scale.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering.
+typedef enum FilterMode {
+ kFilterNone = 0, // Point sample; Fastest.
+ kFilterLinear = 1, // Filter horizontally only.
+ kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
+ kFilterBox = 3 // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+ int src_width, int src_height,
+ uint16* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at a further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+ const uint16* src_u, int src_stride_u,
+ const uint16* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint16* dst_y, int dst_stride_y,
+ uint16* dst_u, int dst_stride_u,
+ uint16* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+ int src_stride_y, int src_stride_u, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, uint8* dst_u, uint8* dst_v,
+ int dst_stride_y, int dst_stride_u, int dst_stride_v,
+ int dst_width, int dst_height,
+ LIBYUV_BOOL interpolate);
+
+// Legacy API. Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+ uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+ LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif // __cplusplus
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT
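A minimal usage sketch of the I420Scale entry point declared above, matching its signature; the wrapper name, the 1280x720 to 640x360 dimensions, and the tightly packed strides are illustrative assumptions, not part of the patch:

// Hedged sketch: scale a 1280x720 I420 frame down to 640x360 with bilinear
// filtering. Strides assume tightly packed planes; returns 0 on success.
#include "libyuv/scale.h"

int ScaleFrameSketch(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int src_w = 1280, src_h = 720;
  const int dst_w = 640, dst_h = 360;
  return I420Scale(src_y, src_w,
                   src_u, (src_w + 1) / 2,
                   src_v, (src_w + 1) / 2,
                   src_w, src_h,
                   dst_y, dst_w,
                   dst_u, (dst_w + 1) / 2,
                   dst_v, (dst_w + 1) / 2,
                   dst_w, dst_h,
                   kFilterBilinear);
}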
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h b/third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 000000000..22563837d
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering);
+
+// TODO(fbarchard): Implement this.
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint32 src_fourcc,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ uint32 dst_fourcc,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT
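A hedged sketch of the clipped ARGB path, assuming (per the comment above) that the clip rectangle is given in destination coordinates and selects which part of the scaled output is written; the wrapper name and the dimensions are illustrative:

// Sketch: scale a 1920x1080 ARGB frame to 640x360, but only produce the left
// half of the destination (clip rect 0,0,320x360). Strides are bytes per row
// (width * 4 for packed ARGB).
#include "libyuv/scale_argb.h"

int ScaleLeftHalfSketch(const uint8* src_argb, uint8* dst_argb) {
  return ARGBScaleClip(src_argb, 1920 * 4,
                       1920, 1080,
                       dst_argb, 640 * 4,
                       640, 360,
                       0, 0, 320, 360,
                       kFilterBilinear);
}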
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/scale_row.h b/third_party/aom/third_party/libyuv/include/libyuv/scale_row.h
new file mode 100644
index 000000000..a46b5ce69
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/scale_row.h
@@ -0,0 +1,479 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__i386__) && !defined(__SSE2__))
+#define LIBYUV_DISABLE_X86
+#endif
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && !defined(__clang__) && \
+ defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_FIXEDDIV1_X86
+#define HAS_FIXEDDIV_X86
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEROWDOWN4_SSE2
+#endif
+
+// The following are available on VS2012:
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
+#define HAS_SCALEROWDOWN4_AVX2
+#endif
+
+// The following are available on Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
+#define HAS_SCALEADDROW_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEFILTERCOLS_NEON
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+ defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_SCALEROWDOWN2_MIPS_DSPR2
+#define HAS_SCALEROWDOWN4_MIPS_DSPR2
+#define HAS_SCALEROWDOWN34_MIPS_DSPR2
+#define HAS_SCALEROWDOWN38_MIPS_DSPR2
+#endif
+
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int y, int dy,
+ int bpp, enum FilterMode filtering);
+
+void ScalePlaneVertical_16(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_argb, uint16* dst_argb,
+ int x, int y, int dy,
+ int wpp, enum FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif
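FixedDiv produces a 16.16 fixed-point quotient. A hedged reference for what the C fallback computes — the helper name below is made up, and the vendored FixedDiv_C may differ in overflow or sign handling:

// 16.16 fixed-point divide: (num / div) scaled by 65536.
// e.g. FixedPointDivSketch(640, 1280) == 0x8000, i.e. 0.5 in 16.16.
static int FixedPointDivSketch(int num, int div) {
  return (int)(((long long)(num) << 16) / div);
}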
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering,
+ int* x, int* y, int* dx, int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width);
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int, int);
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int, int);
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int, int);
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+
+// Specialized scalers for x86.
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+
+
+// ARGB Column functions
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+
+// ARGB Row functions
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+
+// ScaleRowDown2Box is also used by planar functions.
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+// Downscale from 4 to 3 pixels. Use the NEON multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32 -> 12
+void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT
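Among the planar downscalers declared above, the Box variants average a 2x2 block of source pixels per output pixel. A hedged sketch of that arithmetic (the function name and the +2 rounding are illustrative, not necessarily what the vendored ScaleRowDown2Box_C does):

#include <stddef.h>  // ptrdiff_t

// Sketch: 2x downscale of one row with a 2x2 box filter. src_stride is the
// offset to the next source row; dst_width is the number of output pixels.
static void RowDown2BoxSketch(const unsigned char* src_ptr,
                              ptrdiff_t src_stride,
                              unsigned char* dst, int dst_width) {
  const unsigned char* next = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (unsigned char)((src_ptr[0] + src_ptr[1] +
                              next[0] + next[1] + 2) >> 2);
    src_ptr += 2;
    next += 2;
  }
}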
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/version.h b/third_party/aom/third_party/libyuv/include/libyuv/version.h
new file mode 100644
index 000000000..287b98ebf
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/version.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1456
+
+#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/include/libyuv/video_common.h b/third_party/aom/third_party/libyuv/include/libyuv/video_common.h
new file mode 100644
index 000000000..7b0a19cc9
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+ (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+ (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+ ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+ ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
+#endif
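Worked value for the FOURCC macro above (pure arithmetic, little-endian packing of the four characters):

// FOURCC('I', '4', '2', '0')
//   = 'I' | '4' << 8 | '2' << 16 | '0' << 24
//   = 0x49 | 0x3400 | 0x320000 | 0x30000000
//   = 0x30323449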
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+ // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+ FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+ FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+ FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+ FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+ FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+ FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+ FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+ // 2 Secondary YUV formats: row biplanar.
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
+
+ // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+ FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+ FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+ FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
+ FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
+ FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
+
+ // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+ // 1 Primary Compressed YUV format.
+ FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+ // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+ FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+ FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+ FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
+ FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+ FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+ // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
+ FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422.
+ FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444.
+ FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac.
+ FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY.
+ FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac.
+ FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG.
+ FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac.
+ FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR.
+ FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW.
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG.
+ FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB
+ FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB
+ FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO.
+ FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
+ FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
+
+ // 1 Auxiliary compressed YUV format set aside for capturer.
+ FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+ // Match any fourcc.
+ FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+ // Canonical fourcc codes used in our code.
+ FOURCC_BPP_I420 = 12,
+ FOURCC_BPP_I422 = 16,
+ FOURCC_BPP_I444 = 24,
+ FOURCC_BPP_I411 = 12,
+ FOURCC_BPP_I400 = 8,
+ FOURCC_BPP_NV21 = 12,
+ FOURCC_BPP_NV12 = 12,
+ FOURCC_BPP_YUY2 = 16,
+ FOURCC_BPP_UYVY = 16,
+ FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_Q420 = 12,
+ FOURCC_BPP_ARGB = 32,
+ FOURCC_BPP_BGRA = 32,
+ FOURCC_BPP_ABGR = 32,
+ FOURCC_BPP_RGBA = 32,
+ FOURCC_BPP_24BG = 24,
+ FOURCC_BPP_RAW = 24,
+ FOURCC_BPP_RGBP = 16,
+ FOURCC_BPP_RGBO = 16,
+ FOURCC_BPP_R444 = 16,
+ FOURCC_BPP_RGGB = 8,
+ FOURCC_BPP_BGGR = 8,
+ FOURCC_BPP_GRBG = 8,
+ FOURCC_BPP_GBRG = 8,
+ FOURCC_BPP_YV12 = 12,
+ FOURCC_BPP_YV16 = 16,
+ FOURCC_BPP_YV24 = 24,
+ FOURCC_BPP_YU12 = 12,
+ FOURCC_BPP_J420 = 12,
+ FOURCC_BPP_J400 = 8,
+ FOURCC_BPP_MJPG = 0, // 0 means unknown.
+ FOURCC_BPP_H264 = 0,
+ FOURCC_BPP_IYUV = 12,
+ FOURCC_BPP_YU16 = 16,
+ FOURCC_BPP_YU24 = 24,
+ FOURCC_BPP_YUYV = 16,
+ FOURCC_BPP_YUVS = 16,
+ FOURCC_BPP_HDYC = 16,
+ FOURCC_BPP_2VUY = 16,
+ FOURCC_BPP_JPEG = 1,
+ FOURCC_BPP_DMB1 = 1,
+ FOURCC_BPP_BA81 = 8,
+ FOURCC_BPP_RGB3 = 24,
+ FOURCC_BPP_BGR3 = 24,
+ FOURCC_BPP_CM32 = 32,
+ FOURCC_BPP_CM24 = 24,
+
+ // Match any fourcc.
+ FOURCC_BPP_ANY = 0, // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT
diff --git a/third_party/aom/third_party/libyuv/source/compare.cc b/third_party/aom/third_party/libyuv/source/compare.cc
new file mode 100644
index 000000000..46aa8473d
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare.cc
@@ -0,0 +1,373 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// This module is for Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#ifdef VISUALC_HAS_AVX2
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
+#endif // HAS_HASHDJB2_SSE41
+
+// hash seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ const int kBlockSize = 1 << 15; // 32768;
+ int remainder;
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ HashDjb2_SSE = HashDjb2_SSE41;
+ }
+#endif
+#if defined(HAS_HASHDJB2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HashDjb2_SSE = HashDjb2_AVX2;
+ }
+#endif
+
+ while (count >= (uint64)(kBlockSize)) {
+ seed = HashDjb2_SSE(src, kBlockSize, seed);
+ src += kBlockSize;
+ count -= kBlockSize;
+ }
+ remainder = (int)(count) & ~15;
+ if (remainder) {
+ seed = HashDjb2_SSE(src, remainder, seed);
+ src += remainder;
+ count -= remainder;
+ }
+ remainder = (int)(count) & 15;
+ if (remainder) {
+ seed = HashDjb2_C(src, remainder, seed);
+ }
+ return seed;
+}
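A hedged usage sketch of HashDjb2 with the 5381 seed the comments recommend; it assumes the public declaration lives in libyuv/compare.h (included at the top of this file), and the wrapper name is illustrative:

// Hash a raw frame buffer with the recommended djb2 seed.
#include "libyuv/compare.h"

uint32 HashFrameSketch(const uint8* frame, uint64 frame_size) {
  return HashDjb2(frame, frame_size, 5381u);
}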
+
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
+ return FOURCC_BGRA;
+ }
+ if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ return FOURCC_ARGB;
+ }
+ argb += 8;
+ }
+ if (width & 1) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ }
+ return 0;
+}
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+ uint32 fourcc = 0;
+ int h;
+
+ // Coalesce rows.
+ if (stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ stride_argb = 0;
+ }
+ for (h = 0; h < height && fourcc == 0; ++h) {
+ fourcc = ARGBDetectRow_C(argb, width);
+ argb += stride_argb;
+ }
+ return fourcc;
+}
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SUMSQUAREERROR_NEON
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+
+#ifdef VISUALC_HAS_AVX2
+#define HAS_SUMSQUAREERROR_AVX2
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+
+// TODO(fbarchard): Refactor into row function.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+ int count) {
+ // SumSquareError returns values 0 to 65535 for each squared difference.
+ // Up to 65536 of those can be summed and remain within a uint32.
+ // After each block of 65536 pixels, accumulate into a uint64.
+ const int kBlockSize = 65536;
+ int remainder = count & (kBlockSize - 1) & ~31;
+ uint64 sse = 0;
+ int i;
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ // Note only used for multiples of 16 so count is not checked.
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ // Note only used for multiples of 32 so count is not checked.
+ SumSquareError = SumSquareError_AVX2;
+ }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+ for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ if (remainder) {
+ sse += SumSquareError(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 31;
+ if (remainder) {
+ sse += SumSquareError_C(src_a, src_b, remainder);
+ }
+ return sse;
+}
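The block size of 65536 is chosen so a single block can never overflow the uint32 accumulator inside the row function: each squared byte difference is at most 255^2 = 65025, and

  65536 * 65025 = 4,261,478,400 < 4,294,967,296 = 2^32

so the per-block sum always fits, and the per-block results are then widened into the uint64 sse total.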
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ uint64 sse = 0;
+ int h;
+ // Coalesce rows.
+ if (stride_a == width &&
+ stride_b == width) {
+ width *= height;
+ height = 1;
+ stride_a = stride_b = 0;
+ }
+ for (h = 0; h < height; ++h) {
+ sse += ComputeSumSquareError(src_a, src_b, width);
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+ return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+ double psnr;
+ if (sse > 0) {
+ double mse = (double)(count) / (double)(sse);
+ psnr = 10.0 * log10(255.0 * 255.0 * mse);
+ } else {
+ psnr = kMaxPsnr; // Limit to prevent divide by 0
+ }
+
+ if (psnr > kMaxPsnr)
+ psnr = kMaxPsnr;
+
+ return psnr;
+}
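SumSquareErrorToPsnr implements the usual PSNR definition; note that the local variable named mse actually holds count/sse, i.e. the reciprocal of the mean squared error, which is why it is multiplied inside the logarithm rather than divided:

\[
\mathrm{PSNR} = 10 \log_{10}\!\left(\frac{255^2}{\mathrm{MSE}}\right),
\qquad \mathrm{MSE} = \frac{\mathrm{sse}}{\mathrm{count}}
\]

with the result capped at kMaxPsnr (and kMaxPsnr returned directly when sse is zero).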
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ const uint64 samples = width * height;
+ const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
+ src_b, stride_b,
+ width, height);
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+ src_y_b, stride_y_b,
+ width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ const uint64 samples = width * height + 2 * (width_uv * height_uv);
+ const uint64 sse = sse_y + sse_u + sse_v;
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 = 26634; // 64^2*(.01*255)^2
+static const int64 cc2 = 239708; // 64^2*(.03*255)^2
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b) {
+ int64 sum_a = 0;
+ int64 sum_b = 0;
+ int64 sum_sq_a = 0;
+ int64 sum_sq_b = 0;
+ int64 sum_axb = 0;
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ int j;
+ for (j = 0; j < 8; ++j) {
+ sum_a += src_a[j];
+ sum_b += src_b[j];
+ sum_sq_a += src_a[j] * src_a[j];
+ sum_sq_b += src_b[j] * src_b[j];
+ sum_axb += src_a[j] * src_b[j];
+ }
+
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ {
+ const int64 count = 64;
+ // scale the constants by number of pixels
+ const int64 c1 = (cc1 * count * count) >> 12;
+ const int64 c2 = (cc2 * count * count) >> 12;
+
+ const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+ const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+ const int64 sum_a_sq = sum_a*sum_a;
+ const int64 sum_b_sq = sum_b*sum_b;
+
+ const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq +
+ count * sum_sq_b - sum_b_sq + c2);
+
+ if (ssim_d == 0.0) {
+ return DBL_MAX;
+ }
+ return ssim_n * 1.0 / ssim_d;
+ }
+}
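Ssim8x8_C evaluates the standard SSIM ratio over an 8x8 window using integer sums rather than means; in the usual notation (a restatement for orientation, with c1 and c2 corresponding to the scaled cc1 and cc2 constants above, i.e. (0.01*255)^2 and (0.03*255)^2):

\[
\mathrm{SSIM}(a,b) =
\frac{(2\mu_a\mu_b + c_1)\,(2\sigma_{ab} + c_2)}
     {(\mu_a^2 + \mu_b^2 + c_1)\,(\sigma_a^2 + \sigma_b^2 + c_2)}
\]

Working with raw sums scales the numerator and denominator by the same power of the pixel count, so the ratio is unchanged.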
+
+// We are using an 8x8 moving window, with the starting location of each 8x8
+// window on the 4x4 pixel grid. Such an arrangement allows the windows to
+// overlap block boundaries to penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ int samples = 0;
+ double ssim_total = 0;
+ double (*Ssim8x8)(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b) = Ssim8x8_C;
+
+ // sample point start with each 4x4 location
+ int i;
+ for (i = 0; i < height - 8; i += 4) {
+ int j;
+ for (j = 0; j < width - 8; j += 4) {
+ ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+ samples++;
+ }
+
+ src_a += stride_a * 4;
+ src_b += stride_b * 4;
+ }
+
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+ src_y_b, stride_y_b, width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_common.cc b/third_party/aom/third_party/libyuv/source/compare_common.cc
new file mode 100644
index 000000000..c546b5182
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_common.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse = 0u;
+ int i;
+ for (i = 0; i < count; ++i) {
+ int diff = src_a[i] - src_b[i];
+ sse += (uint32)(diff * diff);
+ }
+ return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+ uint32 hash = seed;
+ int i;
+ for (i = 0; i < count; ++i) {
+ hash += (hash << 5) + src[i];
+ }
+ return hash;
+}
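The update hash += (hash << 5) + src[i] is the classic djb2 recurrence, since hash + (hash << 5) equals 33 * hash:

  hash_new = 33 * hash_old + src[i]

For example, starting from the recommended seed 5381 and hashing the single byte 'A' (65) gives 33 * 5381 + 65 = 177,638.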
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_gcc.cc b/third_party/aom/third_party/libyuv/source/compare_gcc.cc
new file mode 100644
index 000000000..247cb33bb
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_gcc.cc
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile ( // NOLINT
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10, 1) ",%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ ); // NOLINT
+ return sse;
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ uint32 hash;
+ asm volatile ( // NOLINT
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ ); // NOLINT
+ return hash;
+}
+#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/third_party/aom/third_party/libyuv/source/compare_neon.cc b/third_party/aom/third_party/libyuv/source/compare_neon.cc
new file mode 100644
index 000000000..ef006ec41
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_neon.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
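+
+// Reference-only scalar sketch of what the NEON kernel above accumulates
+// (illustrative helper name; nothing else calls it): the sum of squared byte
+// differences. The NEON loop additionally assumes count is a multiple of 16.
+static uint32 SumSquareError_ScalarSketch(const uint8* src_a,
+                                          const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  int i;
+  for (i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];  // range -255..255
+    sse += (uint32)(diff * diff);    // accumulate the squared difference
+  }
+  return sse;
+}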
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_neon64.cc b/third_party/aom/third_party/libyuv/source/compare_neon64.cc
new file mode 100644
index 000000000..6d1e5e1bc
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_neon64.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/compare_win.cc b/third_party/aom/third_party/libyuv/source/compare_win.cc
new file mode 100644
index 000000000..19806f275
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/compare_win.cc
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
+
+__declspec(naked)
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+
+ wloop:
+ movdqu xmm1, [eax]
+ lea eax, [eax + 16]
+ movdqu xmm2, [edx]
+ lea edx, [edx + 16]
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ sub ecx, 16
+ jg wloop
+
+ pshufd xmm1, xmm0, 0xee
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 0x01
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
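+
+// The "abs trick" above exploits unsigned saturating subtraction: for bytes,
+// |a - b| == sat_sub(a, b) | sat_sub(b, a), since at most one of the two
+// saturating differences is non-zero. A scalar sketch of one lane (names are
+// illustrative):
+//   uint8 d0 = (a > b) ? (uint8)(a - b) : 0;  // psubusb xmm1, xmm2
+//   uint8 d1 = (b > a) ? (uint8)(b - a) : 0;  // psubusb xmm2, xmm3
+//   uint8 abs_diff = (uint8)(d0 | d1);        // por xmm1, xmm2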
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked)
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ vpxor ymm0, ymm0, ymm0 // sum
+ vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
+ sub edx, eax
+
+ wloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + edx]
+ lea eax, [eax + 32]
+ vpsubusb ymm3, ymm1, ymm2 // abs difference trick
+ vpsubusb ymm2, ymm2, ymm1
+ vpor ymm1, ymm2, ymm3
+ vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
+ vpunpckhbw ymm1, ymm1, ymm5
+ vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
+ vpmaddwd ymm1, ymm1, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm2
+ sub ecx, 32
+ jg wloop
+
+ vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpermq ymm1, ymm0, 0x02 // high + low lane.
+ vpaddd ymm0, ymm0, ymm1
+ vmovd eax, xmm0
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
+// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
+// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
+// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
+// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+ _asm _emit 0x40 _asm _emit reg
+
+__declspec(naked)
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, kHash16x33
+
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
+ movdqa xmm5, kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld(0xdd) // pmulld xmm3, xmm5
+ movdqa xmm5, kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld(0xe5) // pmulld xmm4, xmm5
+ movdqa xmm5, kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld(0xd5) // pmulld xmm2, xmm5
+ movdqa xmm5, kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld(0xcd) // pmulld xmm1, xmm5
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ sub ecx, 16
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked)
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+ movdqa xmm6, kHash16x33
+
+ wloop:
+ vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
+ pmulld xmm0, xmm6 // hash *= 33 ^ 16
+ vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
+ pmulld xmm3, kHashMul0
+ vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
+ pmulld xmm4, kHashMul1
+ vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
+ pmulld xmm2, kHashMul2
+ lea eax, [eax + 16]
+ pmulld xmm1, kHashMul3
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ sub ecx, 16
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert.cc b/third_party/aom/third_party/libyuv/source/convert.cc
new file mode 100644
index 000000000..3ad6bd7a4
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert.cc
@@ -0,0 +1,1389 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
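+
+// Worked examples of the SUBSAMPLE rounding (values are illustrative):
+//   SUBSAMPLE(5, 1, 1)  == (5 + 1) >> 1    == 3   half size, rounded up
+//   SUBSAMPLE(-5, 1, 1) == -((5 + 1) >> 1) == -3  sign preserved for flipped planes
+//   SUBSAMPLE(7, 3, 2)  == (7 + 3) >> 2    == 2   quarter size, rounded up (I411)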
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_y_width, int src_y_height,
+ int src_uv_width, int src_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ if (src_y_width == 0 || src_y_height == 0 ||
+ src_uv_width == 0 || src_uv_height == 0) {
+ return -1;
+ }
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+ dst_y, dst_stride_y, dst_y_width, dst_y_height,
+ kFilterBilinear);
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+ dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+ dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ return 0;
+}
+
+// Copy I420 with optional flipping
+// TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure
+// it does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
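+  // With the pointers moved to the last row and the strides negated, the
+  // copies below walk the source bottom-up, yielding a vertically flipped
+  // destination.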
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int src_uv_width = SUBSAMPLE(width, 1, 1);
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ src_uv_width, height);
+}
+
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ width, height);
+}
+
+// 411 chroma is 1/4 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int src_uv_width = SUBSAMPLE(width, 3, 2);
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ src_uv_width, height);
+}
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+ SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+ return 0;
+}
+
+static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int y;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src, dst, width);
+ CopyRow(src + src_stride_0, dst + dst_stride, width);
+ src += src_stride_0 + src_stride_1;
+ dst += dst_stride * 2;
+ }
+ if (height & 1) {
+ CopyRow(src, dst, width);
+ }
+}
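+
+// Note: for M420 (described below) this is called with src_stride_0 = stride
+// and src_stride_1 = 2 * stride, so each pair of Y rows is copied and the
+// interleaved UV row between groups of two Y rows is skipped.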
+
+// Support converting from FOURCC_M420.
+// Useful for bandwidth-constrained transports like USB 1.0 and 2.0, and for
+// easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 4:2:0: 2 rows of Y followed by 1 row of interleaved UV.
+// Chroma is half width / half height (420).
+// src_stride_m420 is the row stride; normally this is the width in pixels.
+// The UV rows are half width but hold 2 bytes per chroma sample (U and V), so
+// src_stride_m420 applies to the UV rows as well as to the two Y rows.
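+//
+// Illustrative memory layout for a frame of width W (rows grouped in threes):
+//   row 3n + 0:  Y Y Y Y ...   (W bytes)
+//   row 3n + 1:  Y Y Y Y ...   (W bytes)
+//   row 3n + 2:  U V U V ...   (W bytes of interleaved UV covering the two
+//                                Y rows above)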
+static int X420ToI420(const uint8* src_y,
+ int src_stride_y0, int src_stride_y1,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUVRow_C;
+ if (!src_y || !src_uv ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce rows.
+ if (src_stride_y0 == width &&
+ src_stride_y1 == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == halfwidth * 2 &&
+ dst_stride_u == halfwidth &&
+ dst_stride_v == halfwidth) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
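+  // When a plane is stored contiguously (stride == width), it can be treated
+  // as a single row of width * height pixels, so the row functions below run
+  // once per plane instead of once per row.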
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+ IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+ IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
+ SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ if (dst_y) {
+ if (src_stride_y0 == src_stride_y1) {
+ CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+ } else {
+ CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+ width, height);
+ }
+ }
+
+ for (y = 0; y < halfheight; ++y) {
+ // Copy a row of UV.
+ SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_uv, src_stride_uv,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
+
+// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_vu, src_stride_vu,
+ dst_y, dst_stride_y,
+ dst_v, dst_stride_v,
+ dst_u, dst_stride_u,
+ width, height);
+}
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+ src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix) = YUY2ToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUVRow = YUY2ToUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix) = UYVYToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUVRow = UYVYToUVRow_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ if (!src_argb ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert BGRA to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+ void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
+ BGRAToYRow_C;
+ if (!src_bgra ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ BGRAToYRow = BGRAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToYRow = BGRAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ BGRAToUVRow = BGRAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+ src_bgra += src_stride_bgra * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ABGR to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
+ ABGRToYRow_C;
+ if (!src_abgr ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGBA to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+ void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
+ RGBAToYRow_C;
+ if (!src_rgba ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToYRow = RGBAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+ src_rgba += src_stride_rgba * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGB24 to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_RGB24TOYROW_NEON)
+ void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+ void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
+ RGB24ToYRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ RGB24ToYRow = RGB24ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYRow = RGB24ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
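+    // kRowSize rounds each temporary ARGB row up to a multiple of 32 bytes,
+    // e.g. a width of 17 pixels gives (17 * 4 + 31) & ~31 == 96 bytes.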
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB24TOYROW_NEON)
+ free_aligned_buffer_64(row);
+ }
+#endif
+ return 0;
+}
+
+// Convert RAW to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_RAWTOYROW_NEON)
+ void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+ void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
+ RAWToYRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+// Neon version does direct RAW to YUV.
+#if defined(HAS_RAWTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ RAWToYRow = RAWToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYRow = RAWToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from RAW to ARGB.
+#else
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYROW_NEON)
+ free_aligned_buffer_64(row);
+ }
+#endif
+ return 0;
+}
+
+// Convert RGB565 to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_RGB565TOYROW_NEON)
+ void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+ void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
+ RGB565ToYRow_C;
+#else
+ void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB565ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#endif
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+
+// Neon version does direct RGB565 to YUV.
+#if defined(HAS_RGB565TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ RGB565ToYRow = RGB565ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from RGB565 to ARGB.
+#else
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB565TOYROW_NEON)
+ free_aligned_buffer_64(row);
+ }
+#endif
+ return 0;
+}
+
+// Convert ARGB1555 to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+ void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
+ ARGB1555ToYRow_C;
+#else
+ void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ ARGB1555ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+
+// Neon version does direct ARGB1555 to YUV.
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from ARGB1555 to ARGB.
+#else
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+ free_aligned_buffer_64(row);
+ }
+#endif
+ return 0;
+}
+
+// Convert ARGB4444 to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+ void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
+ ARGB4444ToYRow_C;
+#else
+ void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ ARGB4444ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#endif
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+
+// Neon version does direct ARGB4444 to YUV.
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from ARGB4444 to ARGB.
+#else
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+ free_aligned_buffer_64(row);
+ }
+#endif
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_argb.cc b/third_party/aom/third_party/libyuv/source/convert_argb.cc
new file mode 100644
index 000000000..44756bc41
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_argb.cc
@@ -0,0 +1,1155 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional flipping
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width * 4, height);
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I444ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u == width &&
+ src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I444ToARGBRow = I444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I411ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I411ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 4 == width &&
+ src_stride_v * 4 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I411TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I411TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I411ToARGBRow = I411ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I411ToARGBRow = I411ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I411ToARGBRow = I411ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I400ToARGBRow)(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) = I400ToARGBRow_C;
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I400ToARGBRow = I400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I400ToARGBRow = I400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ }
+ return 0;
+}
+
+// Convert J400 to ARGB.
+LIBYUV_API
+int J400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+ J400ToARGBRow_C;
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_J400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J400ToARGBRow = J400ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J400ToARGBRow = J400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J400ToARGBRow = J400ToARGBRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ J400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB.
+static uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
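+// Each shuffle table gives, for every destination byte in a 16-byte block
+// (four 32-bit pixels), the index of the source byte to copy. The BGRA table
+// starts 3,2,1,0, i.e. it reverses the byte order of each 4-byte pixel, which
+// converts between the BGRA and ARGB byte layouts; the same mask works in
+// both directions, which is why ARGBToBGRA below reuses it. The wrappers that
+// follow all reduce to a single ARGBShuffle call with the matching table.
+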
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB),
+ width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB),
+ width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB),
+ width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB),
+ width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_rgba, src_stride_rgba,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskRGBAToARGB),
+ width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB24ToARGBRow_C;
+ if (!src_rgb24 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RAWToARGBRow_C;
+ if (!src_raw || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_argb = 0;
+ }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGB565 to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
+ RGB565ToARGBRow_C;
+ if (!src_rgb565 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb565 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb565 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB565ToARGBRow(src_rgb565, dst_argb, width);
+ src_rgb565 += src_stride_rgb565;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) = ARGB1555ToARGBRow_C;
+ if (!src_argb1555 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ // Coalesce rows.
+ if (src_stride_argb1555 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb1555 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+ src_argb1555 += src_stride_argb1555;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) = ARGB4444ToARGBRow_C;
+ if (!src_argb4444 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ // Coalesce rows.
+ if (src_stride_argb4444 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb4444 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+ src_argb4444 += src_stride_argb4444;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*NV21ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV21ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert M420 to ARGB.
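+// M420 stores two rows of Y followed by one row of interleaved UV, so the
+// loop below consumes three source rows (src_stride_m420 * 3) per pair of
+// output rows and reads the shared UV row at src_m420 + src_stride_m420 * 2.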
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+ if (!src_m420 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
+ dst_argb + dst_stride_argb, width);
+ dst_argb += dst_stride_argb * 2;
+ src_m420 += src_stride_m420 * 3;
+ }
+ if (height & 1) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ }
+ return 0;
+}
+
+// Convert YUY2 to ARGB.
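+// YUY2 is a packed 4:2:2 layout (Y0 U0 Y1 V0 for each pair of pixels), so a
+// single row kernel reads both luma and chroma; UYVY below is the same
+// layout with the chroma bytes leading (U0 Y0 V0 Y1).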
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
+ YUY2ToARGBRow_C;
+ if (!src_yuy2 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_argb = 0;
+ }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ YUY2ToARGBRow(src_yuy2, dst_argb, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
+ UYVYToARGBRow_C;
+ if (!src_uyvy || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_argb = 0;
+ }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToARGBRow = UYVYToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ UYVYToARGBRow(src_uyvy, dst_argb, width);
+ src_uyvy += src_stride_uyvy;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert J420 to ARGB.
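+// J420 is the full-range (JPEG) variant of I420; the conversion is identical
+// to I420ToARGB except that it uses the J422ToARGBRow kernels, which apply
+// the full-range YUV-to-RGB coefficients.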
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = J422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J422ToARGBRow = J422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J422ToARGBRow = J422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = J422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J422ToARGBRow = J422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J422ToARGBRow = J422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_from.cc b/third_party/aom/third_party/libyuv/source/convert_from.cc
new file mode 100644
index 000000000..31f1ac992
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_from.cc
@@ -0,0 +1,1348 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
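+// SUBSAMPLE halves (or quarters) a dimension with round-up and preserves the
+// sign, so mirrored (negative) sizes subsample the same way as positive ones.
+// Worked example with a = 1, s = 1 (half size, rounded up):
+//   SUBSAMPLE(5, 1, 1)  ==  (5 + 1) >> 1  ==  3
+//   SUBSAMPLE(-5, 1, 1) == -((5 + 1) >> 1) == -3
+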
+// I420 to any I4xx YUV format with mirroring.
+static int I420ToI4xx(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_y_width, int src_y_height,
+ int dst_uv_width, int dst_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+ const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+ if (src_y_width == 0 || src_y_height == 0 ||
+ dst_uv_width <= 0 || dst_uv_height <= 0) {
+ return -1;
+ }
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+ dst_y, dst_stride_y, dst_y_width, dst_y_height,
+ kFilterBilinear);
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+ dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+ dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int dst_uv_width = (Abs(width) + 1) >> 1;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int dst_uv_width = Abs(width);
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 411 chroma is 1/4 width, 1x height
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int dst_uv_width = (Abs(width) + 3) >> 2;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
+}
+
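+// The three wrappers above differ only in the chroma plane size they request;
+// I420ToI4xx rescales the half-size I420 chroma to that size with bilinear
+// ScalePlane. Chroma width rounding, e.g. for width = 17:
+//   I422: (17 + 1) >> 1 = 9    I444: 17    I411: (17 + 3) >> 2 = 5
+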
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_yuy2 + dst_stride_yuy2, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2 * 2;
+ }
+ if (height & 1) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_uyvy + dst_stride_uyvy, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy * 2;
+ }
+ if (height & 1) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_uv = -dst_stride_uv;
+ }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+ // Coalesce rows.
+ if (src_stride_u == halfwidth &&
+ src_stride_v == halfwidth &&
+ dst_stride_uv == halfwidth * 2) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ for (y = 0; y < halfheight; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += dst_stride_uv;
+ }
+ return 0;
+}
+
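+// NV12 pairs the full-resolution Y plane with one half-resolution plane of
+// interleaved UV; MergeUVRow_ above produces that interleaving:
+//   U row: U0 U1 U2 ...   V row: V0 V1 V2 ...   ->   UV row: U0 V0 U1 V1 ...
+// I420ToNV21 below reuses the same path with the U and V arguments swapped,
+// which yields the VU byte order that NV21 expects.
+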
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height) {
+ return I420ToNV12(src_y, src_stride_y,
+ src_v, src_stride_v,
+ src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+ dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
+ int y;
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+ if (!src_y || !src_u || !src_v || !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+ I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ int y;
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+ if (!src_y || !src_u || !src_v || !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+#if defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGB24Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb24 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB24Row = I422ToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ int y;
+ void (*I422ToRAWRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRAWRow_C;
+ if (!src_y || !src_u || !src_v || !dst_raw ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+ dst_stride_raw = -dst_stride_raw;
+ }
+#if defined(HAS_I422TORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORAWROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRAWRow = I422ToRAWRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRAWRow = I422ToRAWRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRAWRow = I422ToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+ dst_raw += dst_stride_raw;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Ordered 8x8 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+ 0, 4, 1, 5,
+ 6, 2, 7, 3,
+ 1, 5, 0, 4,
+ 7, 3, 6, 2,
+};
+
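+// Each scanline uses one 4-entry row of this table, selected by
+// (y & 3) << 2 in the loop below and passed to ARGBToRGB565DitherRow as a
+// packed uint32, so the pattern repeats every 4 pixels horizontally and
+// every 4 rows vertically.
+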
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither4x4, int width, int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+ void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+ {
+ // Allocate a row of argb.
+ align_buffer_64(row_argb, width * 4);
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row_argb, width);
+ ARGBToRGB565DitherRow(row_argb, dst_rgb565,
+ *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row_argb);
+ }
+ return 0;
+}
+
+// Convert I420 to specified format
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int r = 0;
+  if (!y || !u || !v || !dst_sample ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_NV12: {
+ uint8* dst_uv = dst_sample + width * height;
+ r = I420ToNV12(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ dst_uv,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ }
+ case FOURCC_NV21: {
+ uint8* dst_vu = dst_sample + width * height;
+ r = I420ToNV21(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ dst_vu,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ }
+ // TODO(fbarchard): Add M420.
+ // Triplanar formats
+ // TODO(fbarchard): halfstride instead of halfwidth
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV12) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * halfheight;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * halfheight;
+ }
+ r = I420Copy(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ int halfwidth = (width + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV16) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * height;
+ }
+ r = I420ToI422(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV24) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + width * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + width * height;
+ }
+ r = I420ToI444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, width,
+ dst_v, width,
+ width, height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (width + 3) / 4;
+ uint8* dst_u = dst_sample + width * height;
+ uint8* dst_v = dst_u + quarterwidth * height;
+ r = I420ToI411(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, quarterwidth,
+ dst_v, quarterwidth,
+ width, height);
+ break;
+ }
+
+ // Formats not supported - MJPG, M420, and any other fourcc.
+ default:
+ return -1; // unknown fourcc - return failure code.
+ }
+ return r;
+}
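+
+// Illustrative usage sketch (not part of the upstream libyuv sources): packing
+// an I420 frame into a single NV12 buffer through the FourCC dispatcher above.
+// The dimensions and buffer names are hypothetical; passing 0 for
+// dst_sample_stride selects the width-derived default strides shown in the
+// switch.
+//
+//   int width = 640, height = 480;
+//   // NV12 needs width * height luma bytes plus width * height / 2 UV bytes.
+//   uint8* dst_nv12 = (uint8*)malloc(width * height * 3 / 2);
+//   int r = ConvertFromI420(src_y, width,
+//                           src_u, (width + 1) / 2,
+//                           src_v, (width + 1) / 2,
+//                           dst_nv12, 0,
+//                           width, height,
+//                           FOURCC_NV12);
+//   // r is 0 on success, -1 for bad arguments or an unsupported fourcc.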
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_from_argb.cc b/third_party/aom/third_party/libyuv/source/convert_from_argb.cc
new file mode 100644
index 000000000..8d1e97aec
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_from_argb.cc
@@ -0,0 +1,1301 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV444Row_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u == width &&
+ dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
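+
+// Illustrative usage sketch (not part of the upstream sources): converting ARGB
+// to I444, where all three destination planes are full resolution, unlike the
+// half-width chroma of I422 or the quarter-width chroma of I411 below. Buffer
+// names and dimensions are hypothetical.
+//
+//   int width = 320, height = 240;
+//   uint8* i444 = (uint8*)malloc(width * height * 3);   // Y, U and V planes.
+//   int r = ARGBToI444(src_argb, width * 4,
+//                      i444, width,                       // Y
+//                      i444 + width * height, width,      // U
+//                      i444 + 2 * width * height, width,  // V
+//                      width, height);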
+
+// ARGB little endian (bgra in memory) to I422
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV411Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 4 == width &&
+ dst_stride_v * 4 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV411ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV411Row = ARGBToUV411Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ if (!src_argb ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+ {
+ // Allocate rows of u and v.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
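+
+// Illustrative usage sketch (not part of the upstream sources): writing into a
+// single contiguous NV12 buffer, where the interleaved UV plane follows the
+// full-size Y plane. Buffer names and dimensions are hypothetical; for an even
+// width the UV row of halfwidth (U,V) pairs is exactly width bytes wide.
+//
+//   int width = 1280, height = 720;
+//   uint8* nv12 = (uint8*)malloc(width * height * 3 / 2);
+//   int r = ARGBToNV12(src_argb, width * 4,
+//                      nv12, width,                   // Y plane.
+//                      nv12 + width * height, width,  // Interleaved UV plane.
+//                      width, height);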
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ if (!src_argb ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+ {
+ // Allocate rows of u and v.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+
+ if (!src_argb || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ {
+ // Allocate rows of y, u and v.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+ src_argb += src_stride_argb;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+
+ free_aligned_buffer_64(row_y);
+ }
+ return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+ if (!src_argb || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ {
+ // Allocate rows of y, u and v.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+ src_argb += src_stride_argb;
+ dst_uyvy += dst_stride_uyvy;
+ }
+
+ free_aligned_buffer_64(row_y);
+ }
+ return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ int y;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA.
+static uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
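+
+// Each output byte i is copied from input byte kShuffleMaskARGBToRGBA[i], so a
+// little-endian ARGB pixel stored as B,G,R,A in memory becomes A,B,G,R, which
+// is the little-endian RGBA layout produced by ARGBToRGBA below.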
+
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ return ARGBShuffle(src_argb, src_stride_argb,
+ dst_rgba, dst_stride_rgba,
+ (const uint8*)(&kShuffleMaskARGBToRGBA),
+ width, height);
+}
+
+// Convert ARGB to RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ int y;
+ void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB24Row_C;
+ if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb24 = 0;
+ }
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB24Row(src_argb, dst_rgb24, width);
+ src_argb += src_stride_argb;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
+// Convert ARGB to RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ int y;
+ void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRAWRow_C;
+ if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_raw == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_raw = 0;
+ }
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRAWRow = ARGBToRAWRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRAWRow(src_argb, dst_raw, width);
+ src_argb += src_stride_argb;
+ dst_raw += dst_stride_raw;
+ }
+ return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+ 0, 4, 1, 5,
+ 6, 2, 7, 3,
+ 1, 5, 0, 4,
+ 7, 3, 6, 2,
+};
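+
+// ARGBToRGB565Dither below passes one 4-byte row of this table, selected with
+// (y & 3), to the row function as a packed uint32; each pixel then adds the
+// entry for its (x & 3) column before the channels are truncated to 5:6:5.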
+
+// Convert ARGB to RGB565 with 4x4 dither matrix (16 bytes).
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither4x4, int width, int height) {
+ int y;
+ void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (!dither4x4) {
+ dither4x4 = kDither565_4x4;
+ }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+ *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB to RGB565.
+// TODO(fbarchard): Consider using dither function low level with zeros.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_rgb565 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb565 = 0;
+ }
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565Row(src_argb, dst_rgb565, width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB to ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ int y;
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB1555Row_C;
+ if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb1555 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb1555 = 0;
+ }
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+ src_argb += src_stride_argb;
+ dst_argb1555 += dst_stride_argb1555;
+ }
+ return 0;
+}
+
+// Convert ARGB to ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ int y;
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB4444Row_C;
+ if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb4444 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb4444 = 0;
+ }
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+ src_argb += src_stride_argb;
+ dst_argb4444 += dst_stride_argb4444;
+ }
+ return 0;
+}
+
+// Convert ARGB to J420 (JPEG full-range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+ ARGBToYJRow_C;
+ if (!src_argb ||
+ !dst_yj || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+ src_argb += src_stride_argb * 2;
+ dst_yj += dst_stride_yj * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to J422
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUVJ422Row_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJ422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ int width, int height) {
+ int y;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = 0;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_jpeg.cc b/third_party/aom/third_party/libyuv/source/convert_jpeg.cc
new file mode 100644
index 000000000..bcb980f7f
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_jpeg.cc
@@ -0,0 +1,392 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+ uint8* y;
+ int y_stride;
+ uint8* u;
+ int u_stride;
+ uint8* v;
+ int v_stride;
+ int w;
+ int h;
+};
+
+static void JpegCopyI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I420Copy(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I422ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I444ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I411ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I400ToI420(data[0], strides[0],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
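+
+// Each callback above converts one batch of decoded rows into the destination
+// planes and advances the plane pointers, so MJpegDecoder::DecodeToCallback can
+// hand the frame to MJPGToI420 below in horizontal slices.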
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+ int* width, int* height) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret) {
+ *width = mjpeg_decoder.GetWidth();
+ *height = mjpeg_decoder.GetHeight();
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret ? 0 : -1; // -1 for runtime failure.
+}
+
+// MJPG (Motion JPEG) to I420
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
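+
+// Illustrative usage sketch (not part of the upstream sources): probing a
+// compressed MJPEG frame with MJPGSize and then decoding it into a freshly
+// allocated I420 buffer. jpeg_data and jpeg_size are hypothetical inputs.
+//
+//   int w = 0, h = 0;
+//   if (MJPGSize(jpeg_data, jpeg_size, &w, &h) == 0) {
+//     int halfw = (w + 1) / 2, halfh = (h + 1) / 2;
+//     uint8* y = (uint8*)malloc(w * h + 2 * halfw * halfh);
+//     uint8* u = y + w * h;
+//     uint8* v = u + halfw * halfh;
+//     // Source and destination sizes match, so w and h are passed twice.
+//     int r = MJPGToI420(jpeg_data, jpeg_size,
+//                        y, w, u, halfw, v, halfw,
+//                        w, h, w, h);
+//   }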
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+ uint8* argb;
+ int argb_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I420ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I422ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I444ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I411ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I400ToARGB(data[0], strides[0],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to ARGB
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/convert_to_argb.cc b/third_party/aom/third_party/libyuv/source/convert_to_argb.cc
new file mode 100644
index 000000000..af829fbd3
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_to_argb.cc
@@ -0,0 +1,306 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute the location of planes, and indicates inversion.
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+ uint8* crop_argb, int argb_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+ // and then rotate the ARGB to the final destination buffer.
+ // For in-place conversion, if the destination crop_argb is the same as the
+ // source sample, also use the temporary buffer.
+ LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
+ crop_argb == sample;
+ uint8* tmp_argb = crop_argb;
+ int tmp_argb_stride = argb_stride;
+ uint8* rotate_buffer = NULL;
+ int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+ if (crop_argb == NULL || sample == NULL ||
+ src_width <= 0 || crop_width <= 0 ||
+ src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+ if (src_height < 0) {
+ inv_crop_height = -inv_crop_height;
+ }
+
+ if (need_buf) {
+ int argb_size = crop_width * abs_crop_height * 4;
+ rotate_buffer = (uint8*)malloc(argb_size);
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ }
+ crop_argb = rotate_buffer;
+ argb_stride = crop_width * 4;
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToARGB(src, aligned_src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToARGB(src, aligned_src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToARGB(src, src_width * 3,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToARGB(src, src_width * 3,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV21ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_J420: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToARGB(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToARGB(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToARGB(sample, sample_size,
+ crop_argb, argb_stride,
+ src_width, abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = ARGBRotate(crop_argb, argb_stride,
+ tmp_argb, tmp_argb_stride,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
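Each packed-pixel case above locates the crop origin with the same arithmetic: the sample pointer plus crop_y rows of src_width pixels plus crop_x pixels, all scaled by the bytes per pixel of the format (2 for YUY2/RGB565/ARGB1555/ARGB4444, 3 for RGB24/RAW, 4 for the 32-bit ARGB orderings). The following is a minimal illustrative sketch of that offset computation only; the helper name and the bytes_per_pixel parameter are not part of libyuv.

    // Illustrative only: byte offset of the crop origin in a tightly packed frame,
    // mirroring "sample + (src_width * crop_y + crop_x) * bpp" in the cases above.
    #include <stddef.h>
    #include <stdint.h>

    static const uint8_t* CropOrigin(const uint8_t* sample, int src_width,
                                     int crop_x, int crop_y, int bytes_per_pixel) {
      // A tightly packed row is src_width * bytes_per_pixel bytes long.
      ptrdiff_t offset = (static_cast<ptrdiff_t>(src_width) * crop_y + crop_x) *
                         bytes_per_pixel;
      return sample + offset;
    }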
diff --git a/third_party/aom/third_party/libyuv/source/convert_to_i420.cc b/third_party/aom/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 000000000..5e75369b5
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,339 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+ LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+ format != FOURCC_NV12 && format != FOURCC_NV21 &&
+ format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+ uint8* tmp_y = y;
+ uint8* tmp_u = u;
+ uint8* tmp_v = v;
+ int tmp_y_stride = y_stride;
+ int tmp_u_stride = u_stride;
+ int tmp_v_stride = v_stride;
+ uint8* rotate_buffer = NULL;
+ int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+ if (!y || !u || !v || !sample ||
+ src_width <= 0 || crop_width <= 0 ||
+ src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+ if (src_height < 0) {
+ inv_crop_height = -inv_crop_height;
+ }
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+ // and then rotate the I420 to the final destination buffer.
+ // For in-place conversion, if destination y is same as source sample,
+ // also enable temporary buffer.
+ if (need_buf) {
+ int y_size = crop_width * abs_crop_height;
+ int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+ rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ }
+ y = rotate_buffer;
+ u = y + y_size;
+ v = u + uv_size;
+ y_stride = crop_width;
+ u_stride = v_stride = ((crop_width + 1) / 2);
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+ // Call NV12 but with u and v parameters swapped.
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ v, v_stride,
+ u, u_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420Rotate(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToI420(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToI420(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToI420(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToI420(sample, sample_size,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ src_width, abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
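For reference, a minimal usage sketch of the ConvertToI420() entry point declared above, converting a packed YUY2 frame without cropping or rotation. It relies on libyuv headers that are not part of this diff (libyuv/convert.h for the prototype, libyuv/rotate.h for kRotate0, libyuv/video_common.h for FOURCC_YUY2); the destination planes are caller-allocated and error handling is reduced to the return code.

    // Sketch only: convert one YUY2 frame (2 bytes/pixel) to caller-allocated
    // I420 planes. A zero return value indicates success, matching the cases above.
    #include <stddef.h>
    #include "libyuv/convert.h"       // ConvertToI420()
    #include "libyuv/rotate.h"        // kRotate0 (assumed header location)
    #include "libyuv/video_common.h"  // FOURCC_YUY2

    int Yuy2ToI420Frame(const uint8* yuy2, int width, int height,
                        uint8* dst_y, uint8* dst_u, uint8* dst_v) {
      const int half_width = (width + 1) / 2;
      const size_t sample_size = static_cast<size_t>(width) * height * 2;
      return libyuv::ConvertToI420(yuy2, sample_size,
                                   dst_y, width,       // Y plane, stride = width.
                                   dst_u, half_width,  // U plane, half-width stride.
                                   dst_v, half_width,  // V plane, half-width stride.
                                   0, 0,               // crop_x, crop_y: no crop.
                                   width, height,      // Source dimensions.
                                   width, height,      // Crop dimensions = full frame.
                                   libyuv::kRotate0,
                                   libyuv::FOURCC_YUY2);
    }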
diff --git a/third_party/aom/third_party/libyuv/source/cpu_id.cc b/third_party/aom/third_party/libyuv/source/cpu_id.cc
new file mode 100644
index 000000000..72f686e3b
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/cpu_id.cc
@@ -0,0 +1,307 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#include <intrin.h> // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+ defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include <stdlib.h> // For getenv()
+#endif
+
+// For ArmCpuCaps(), but unit tested on all platforms.
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h" // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+ defined(__i386__) || defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER)
+LIBYUV_API
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if defined(_MSC_VER) && !defined(__clang__)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if (_MSC_FULL_VER >= 160040219)
+ __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+ __asm {
+ mov eax, info_eax
+ mov ecx, info_ecx
+ mov edi, cpu_info
+ cpuid
+ mov [edi], eax
+ mov [edi + 4], ebx
+ mov [edi + 8], ecx
+ mov [edi + 12], edx
+ }
+#else
+ if (info_ecx == 0) {
+ __cpuid((int*)(cpu_info), info_eax);
+ } else {
+ cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+ }
+#endif
+// GCC version uses inline x86 assembly.
+#else  // defined(_MSC_VER) && !defined(__clang__)
+ uint32 info_ebx, info_edx;
+ asm volatile ( // NOLINT
+#if defined( __i386__) && defined(__PIC__)
+ // Preserve ebx for fpic 32 bit.
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=D" (info_ebx),
+#else
+ "cpuid \n"
+ : "=b" (info_ebx),
+#endif // defined( __i386__) && defined(__PIC__)
+ "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+ cpu_info[0] = info_eax;
+ cpu_info[1] = info_ebx;
+ cpu_info[2] = info_ecx;
+ cpu_info[3] = info_edx;
+#endif  // defined(_MSC_VER) && !defined(__clang__)
+}
+#else // (defined(_M_IX86) || defined(_M_X64) ...
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+ cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// TODO(fbarchard): Enable xgetbv when validator supports it.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+ defined(__i386__) || defined(__x86_64__)) && \
+ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
+#define HAS_XGETBV
+// X86 CPUs have xgetbv to detect whether the OS saves the high parts of the ymm registers.
+int TestOsSaveYmm() {
+ uint32 xcr0 = 0u;
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
+ xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
+#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__)
+ __asm {
+ xor ecx, ecx // xcr 0
+ _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
+ mov xcr0, eax
+ }
+#elif defined(__i386__) || defined(__x86_64__)
+ asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+#endif // defined(__i386__) || defined(__x86_64__)
+ return((xcr0 & 6) == 6); // Is ymm saved?
+}
+#endif // defined(_M_IX86) || defined(_M_X64) ..
+
+// Based on libaom's arm_cpudetect.c.
+// For ARM, but public to allow testing on any CPU.
+LIBYUV_API SAFEBUFFERS
+int ArmCpuCaps(const char* cpuinfo_name) {
+ char cpuinfo_line[512];
+ FILE* f = fopen(cpuinfo_name, "r");
+ if (!f) {
+ // Assume Neon if /proc/cpuinfo is unavailable.
+    // This will occur in the Chrome sandbox for the Pepper or Render process.
+ return kCpuHasNEON;
+ }
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+ char* p = strstr(cpuinfo_line, " neon");
+ if (p && (p[5] == ' ' || p[5] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
+ // aarch64 uses asimd for Neon.
+ p = strstr(cpuinfo_line, " asimd");
+ if (p && (p[6] == ' ' || p[6] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
+ }
+ }
+ fclose(f);
+ return 0;
+}
+
+#if defined(__mips__) && defined(__linux__)
+static int MipsCpuCaps(const char* search_string) {
+ char cpuinfo_line[512];
+ const char* file_name = "/proc/cpuinfo";
+ FILE* f = fopen(file_name, "r");
+ if (!f) {
+ // Assume DSP if /proc/cpuinfo is unavailable.
+    // This will occur in the Chrome sandbox for the Pepper or Render process.
+ return kCpuHasMIPS_DSP;
+ }
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
+ if (strstr(cpuinfo_line, search_string) != NULL) {
+ fclose(f);
+ return kCpuHasMIPS_DSP;
+ }
+ }
+ fclose(f);
+ return 0;
+}
+#endif
+
+// CPU detect function for SIMD instruction sets.
+LIBYUV_API
+int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
+
+// Test an environment variable used for disabling CPU features. Any non-zero
+// value disables the feature; zero is ignored so the variable is easy to toggle.
+#if !defined(__native_client__) && !defined(_M_ARM)
+
+static LIBYUV_BOOL TestEnv(const char* name) {
+ const char* var = getenv(name);
+ if (var) {
+ if (var[0] != '0') {
+ return LIBYUV_TRUE;
+ }
+ }
+ return LIBYUV_FALSE;
+}
+#else // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+ return LIBYUV_FALSE;
+}
+#endif
+
+LIBYUV_API SAFEBUFFERS
+int InitCpuFlags(void) {
+#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+
+ uint32 cpu_info0[4] = { 0, 0, 0, 0 };
+ uint32 cpu_info1[4] = { 0, 0, 0, 0 };
+ uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+ CpuId(0, 0, cpu_info0);
+ CpuId(1, 0, cpu_info1);
+ if (cpu_info0[0] >= 7) {
+ CpuId(7, 0, cpu_info7);
+ }
+ cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+ ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+ ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+ ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+ ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+ kCpuHasX86;
+
+#ifdef HAS_XGETBV
+ if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
+ TestOsSaveYmm()) { // Saves YMM.
+ cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+ kCpuHasAVX;
+ }
+#endif
+ // Environment variable overrides for testing.
+ if (TestEnv("LIBYUV_DISABLE_X86")) {
+ cpu_info_ &= ~kCpuHasX86;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+ cpu_info_ &= ~kCpuHasSSE2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+ cpu_info_ &= ~kCpuHasSSSE3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+ cpu_info_ &= ~kCpuHasSSE41;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+ cpu_info_ &= ~kCpuHasSSE42;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX")) {
+ cpu_info_ &= ~kCpuHasAVX;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+ cpu_info_ &= ~kCpuHasAVX2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+ cpu_info_ &= ~kCpuHasERMS;
+ }
+ if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+ cpu_info_ &= ~kCpuHasFMA3;
+ }
+#endif
+#if defined(__mips__) && defined(__linux__)
+  // On Linux, parse the /proc/cpuinfo text file to detect MIPS DSP support.
+ cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
+#if defined(__mips_dspr2)
+ cpu_info_ |= kCpuHasMIPS_DSPR2;
+#endif
+ cpu_info_ |= kCpuHasMIPS;
+
+ if (getenv("LIBYUV_DISABLE_MIPS")) {
+ cpu_info_ &= ~kCpuHasMIPS;
+ }
+ if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
+ cpu_info_ &= ~kCpuHasMIPS_DSP;
+ }
+ if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
+ cpu_info_ &= ~kCpuHasMIPS_DSPR2;
+ }
+#endif
+#if defined(__arm__) || defined(__aarch64__)
+// gcc -mfpu=neon defines __ARM_NEON__, which generates code that requires
+// Neon.  NaCl also requires Neon.
+// On Linux, /proc/cpuinfo can be tested, but without it, assume Neon is present.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+ cpu_info_ = kCpuHasNEON;
+// For aarch64 (arm64), the feature list in /proc/cpuinfo is incomplete, e.g.
+// it has no neon flag.
+// So for aarch64, Neon is hard-coded as enabled here.
+#endif
+#if defined(__aarch64__)
+ cpu_info_ = kCpuHasNEON;
+#else
+  // On Linux, parse the /proc/cpuinfo text file to detect Neon support.
+ cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
+#endif
+ cpu_info_ |= kCpuHasARM;
+ if (TestEnv("LIBYUV_DISABLE_NEON")) {
+ cpu_info_ &= ~kCpuHasNEON;
+ }
+#endif // __arm__
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info_ = 0;
+ }
+ return cpu_info_;
+}
+
+LIBYUV_API
+void MaskCpuFlags(int enable_flags) {
+ cpu_info_ = InitCpuFlags() & enable_flags;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
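A short sketch of how callers typically consume the flags computed by InitCpuFlags(): TestCpuFlag() (declared in libyuv/cpu_id.h, which is not part of this diff) masks a requested feature bit against the cached cpu_info_, and MaskCpuFlags() restricts the detected set, which has much the same effect as the LIBYUV_DISABLE_* environment variables handled above. The function below is illustrative only.

    // Sketch only: query detected SIMD features and optionally force the C paths.
    #include <stdio.h>
    #include "libyuv/cpu_id.h"

    void ReportAndRestrictSimd(bool force_c_paths) {
      // TestCpuFlag() masks the requested bit against the cached cpu_info_.
      const int has_sse2 = libyuv::TestCpuFlag(libyuv::kCpuHasSSE2);
      const int has_avx2 = libyuv::TestCpuFlag(libyuv::kCpuHasAVX2);
      const int has_neon = libyuv::TestCpuFlag(libyuv::kCpuHasNEON);
      printf("SSE2:%d AVX2:%d NEON:%d\n", has_sse2 != 0, has_avx2 != 0, has_neon != 0);

      if (force_c_paths) {
        // Equivalent in effect to exporting LIBYUV_DISABLE_ASM=1 before startup.
        libyuv::MaskCpuFlags(0);
      }
    }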
diff --git a/third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 000000000..75f8a610e
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,572 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
+#endif
+
+#endif
+struct FILE; // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h" // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+ jpeg_error_mgr base; // Must be at the top
+ jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo,
+ long num_bytes); // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
+MJpegDecoder::MJpegDecoder()
+ : has_scanline_padding_(LIBYUV_FALSE),
+ num_outbufs_(0),
+ scanlines_(NULL),
+ scanlines_sizes_(NULL),
+ databuf_(NULL),
+ databuf_strides_(NULL) {
+ decompress_struct_ = new jpeg_decompress_struct;
+ source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+ error_mgr_ = new SetJmpErrorMgr;
+ decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+ // Override standard exit()-based error handler.
+ error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+ decompress_struct_->client_data = NULL;
+ source_mgr_->init_source = &init_source;
+ source_mgr_->fill_input_buffer = &fill_input_buffer;
+ source_mgr_->skip_input_data = &skip_input_data;
+ source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+ source_mgr_->term_source = &term_source;
+ jpeg_create_decompress(decompress_struct_);
+ decompress_struct_->src = source_mgr_;
+ buf_vec_.buffers = &buf_;
+ buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+ jpeg_destroy_decompress(decompress_struct_);
+ delete decompress_struct_;
+ delete source_mgr_;
+#ifdef HAVE_SETJMP
+ delete error_mgr_;
+#endif
+ DestroyOutputBuffers();
+}
+
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+ if (!ValidateJpeg(src, src_len)) {
+ return LIBYUV_FALSE;
+ }
+
+ buf_.data = src;
+ buf_.len = static_cast<int>(src_len);
+ buf_vec_.pos = 0;
+ decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_read_header, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+ // ERROR: Bad MJPEG header
+ return LIBYUV_FALSE;
+ }
+ AllocOutputBuffers(GetNumComponents());
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+ if (scanlines_sizes_[i] != scanlines_size) {
+ if (scanlines_[i]) {
+        delete [] scanlines_[i];
+ }
+ scanlines_[i] = new uint8* [scanlines_size];
+ scanlines_sizes_[i] = scanlines_size;
+ }
+
+ // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+    // to avoid memory errors, since jpeglib only reads full MCU blocks. For
+ // the preceding scanlines, the padding is not needed/wanted because the
+ // following addresses will already be valid (they are the initial bytes of
+ // the next scanline) and will be overwritten when jpeglib writes out that
+ // next scanline.
+ int databuf_stride = GetComponentStride(i);
+ int databuf_size = scanlines_size * databuf_stride;
+ if (databuf_strides_[i] != databuf_stride) {
+ if (databuf_[i]) {
+        delete [] databuf_[i];
+ }
+ databuf_[i] = new uint8[databuf_size];
+ databuf_strides_[i] = databuf_stride;
+ }
+
+ if (GetComponentStride(i) != GetComponentWidth(i)) {
+ has_scanline_padding_ = LIBYUV_TRUE;
+ }
+ }
+ return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+ return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+ return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+ return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+ return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+ return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+ return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+ return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {
+ return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+ return decompress_struct_->max_h_samp_factor /
+ GetHorizSampFactor(component);
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+ return decompress_struct_->max_v_samp_factor /
+ GetVertSampFactor(component);
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+ return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+ int hs = GetHorizSubSampFactor(component);
+ return DivideAndRoundUp(GetWidth(), hs);
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetHeight(), vs);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+ return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+int MJpegDecoder::GetComponentSize(int component) {
+ return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_abort_decompress, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ jpeg_abort_decompress(decompress_struct_);
+ return LIBYUV_TRUE;
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
+ uint8** planes, int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return LIBYUV_FALSE;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (!StartDecode()) {
+ return LIBYUV_FALSE;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+  // Compute the number of lines to skip to implement the vertical crop.
+  // TODO(fbarchard): Ensure skip is a multiple of the maximum component
+  //   subsample, i.e. 2.
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ // There is no API to skip lines in the output data, so we read them
+ // into the temp buffer.
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip. Must read it and then
+ // copy the parts we want into the destination.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip =
+ DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
+ rows_to_skip;
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+ }
+ }
+
+ // Read full MCUs but cropped horizontally
+ for (; lines_left > GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+ CopyPlane(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy =
+ DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+ CopyPlane(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+ return FinishDecode();
+}
+
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return LIBYUV_FALSE;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (!StartDecode()) {
+ return LIBYUV_FALSE;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+  // TODO(fbarchard): Compute the number of lines to skip to implement vertical crop.
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ // Change our own data buffer pointers so we can pass them to the
+ // callback.
+ databuf_[i] += data_to_skip;
+ }
+ int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+ (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+ // Now change them back.
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ databuf_[i] -= data_to_skip;
+ }
+ lines_left -= scanlines_to_copy;
+ }
+ }
+ // Read full MCUs until we get to the crop point.
+ for (; lines_left >= GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+ }
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+ }
+ return FinishDecode();
+}
+
+void init_source(j_decompress_ptr cinfo) {
+ fill_input_buffer(cinfo);
+}
+
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+ BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
+ if (buf_vec->pos >= buf_vec->len) {
+ assert(0 && "No more data");
+ // ERROR: No more data
+ return FALSE;
+ }
+ cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+ cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+ ++buf_vec->pos;
+ return TRUE;
+}
+
+void skip_input_data(j_decompress_ptr cinfo,
+ long num_bytes) { // NOLINT
+ cinfo->src->next_input_byte += num_bytes;
+}
+
+void term_source(j_decompress_ptr cinfo) {
+ // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void ErrorHandler(j_common_ptr cinfo) {
+  // This is called when a jpeglib command experiences an error. Unfortunately,
+  // jpeglib's error handling model is not very flexible, because it expects the
+  // error handler not to return; i.e., it wants the program to terminate. To
+  // recover from errors we use setjmp() as shown in jpeglib's example. setjmp()
+  // is C's rough equivalent of the "call with current continuation" facility
+  // found in some functional programming languages.
+ // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+ char buf[JMSG_LENGTH_MAX];
+ (*cinfo->err->format_message)(cinfo, buf);
+ // ERROR: Error in jpeglib: buf
+#endif
+
+ SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+ // This rewinds the call stack to the point of the corresponding setjmp()
+ // and causes it to return (for a second time) with value 1.
+ longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
+
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+ if (num_outbufs != num_outbufs_) {
+ // We could perhaps optimize this case to resize the output buffers without
+ // necessarily having to delete and recreate each one, but it's not worth
+ // it.
+ DestroyOutputBuffers();
+
+ scanlines_ = new uint8** [num_outbufs];
+ scanlines_sizes_ = new int[num_outbufs];
+ databuf_ = new uint8* [num_outbufs];
+ databuf_strides_ = new int[num_outbufs];
+
+ for (int i = 0; i < num_outbufs; ++i) {
+ scanlines_[i] = NULL;
+ scanlines_sizes_[i] = 0;
+ databuf_[i] = NULL;
+ databuf_strides_[i] = 0;
+ }
+
+ num_outbufs_ = num_outbufs;
+ }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ delete [] scanlines_[i];
+ delete [] databuf_[i];
+ }
+ delete [] scanlines_;
+ delete [] databuf_;
+ delete [] scanlines_sizes_;
+ delete [] databuf_strides_;
+ scanlines_ = NULL;
+ databuf_ = NULL;
+ scanlines_sizes_ = NULL;
+ databuf_strides_ = NULL;
+ num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and disabling block smoothing improve performance substantially.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+ decompress_struct_->raw_data_out = TRUE;
+ decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
+ decompress_struct_->dither_mode = JDITHER_NONE;
+ // Not applicable to 'raw':
+ decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+ // Only for buffered mode:
+ decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+ // Blocky but fast:
+ decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+ if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor.
+ return LIBYUV_FALSE;
+ }
+ return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+ // jpeglib considers it an error if we finish without decoding the whole
+ // image, so we call "abort" rather than "finish".
+ jpeg_abort_decompress(decompress_struct_);
+ return LIBYUV_TRUE;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ uint8* data_i = data[i];
+ for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+ scanlines_[i][j] = data_i;
+ data_i += GetComponentStride(i);
+ }
+ }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+ return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
+ jpeg_read_raw_data(decompress_struct_,
+ scanlines_,
+ GetImageScanlinesPerImcuRow());
+}
+
+// Helper function that recognizes the jpeg sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components) {
+ if (number_of_components == 3) { // Color images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 2 &&
+ subsample_x[2] == 2 && subsample_y[2] == 2) {
+ return kJpegYuv420;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 1 &&
+ subsample_x[2] == 2 && subsample_y[2] == 1) {
+ return kJpegYuv422;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 1 && subsample_y[1] == 1 &&
+ subsample_x[2] == 1 && subsample_y[2] == 1) {
+ return kJpegYuv444;
+ }
+ } else if (number_of_components == 1) { // Grey-scale images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+ return kJpegYuv400;
+ }
+ }
+ return kJpegUnknown;
+}
+
+} // namespace libyuv
+#endif // HAVE_JPEG
+
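A minimal sketch of the MJpegDecoder calling sequence used by the MJPG conversion paths: load a frame, size one buffer per component from the decoder's reported dimensions, decode, and unload on failure. It assumes a build with HAVE_JPEG and uses only the methods defined above; names outside the class are illustrative.

    // Sketch only: decode one MJPEG frame into per-component buffers.
    #include <vector>
    #include "libyuv/mjpeg_decoder.h"

    bool DecodeMjpegFrame(const uint8* jpeg, size_t jpeg_size,
                          std::vector<std::vector<uint8> >* components) {
      libyuv::MJpegDecoder decoder;
      if (!decoder.LoadFrame(jpeg, jpeg_size)) {
        return false;  // Rejected by ValidateJpeg() or jpeg_read_header().
      }
      const int num = decoder.GetNumComponents();
      if (num <= 0) {
        decoder.UnloadFrame();
        return false;
      }
      components->resize(num);
      std::vector<uint8*> planes(num);
      for (int i = 0; i < num; ++i) {
        // GetComponentSize() is the subsampled width * height of component i.
        (*components)[i].resize(decoder.GetComponentSize(i));
        planes[i] = &(*components)[i][0];
      }
      if (!decoder.DecodeToBuffers(&planes[0], decoder.GetWidth(),
                                   decoder.GetHeight())) {
        decoder.UnloadFrame();
        return false;
      }
      return true;
    }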
diff --git a/third_party/aom/third_party/libyuv/source/mjpeg_validate.cc b/third_party/aom/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 000000000..8edfbe1e7
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h> // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Enable this to try scasb implementation.
+// #define ENABLE_SCASB 1
+
+#ifdef ENABLE_SCASB
+
+// Multiple of 1.
+__declspec(naked)
+const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // src
+ mov eax, [esp + 8] // val
+ mov ecx, [esp + 12] // count
+ repne scasb
+ jne sr99
+ mov eax, edi
+ sub eax, 1
+ mov edi, edx
+ ret
+
+ sr99:
+ mov eax, 0
+ mov edi, edx
+ ret
+ }
+}
+#endif
+
+// Helper function to scan for EOI marker.
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+ const uint8* end = sample + sample_size - 1;
+ const uint8* it = sample;
+ for (;;) {
+#ifdef ENABLE_SCASB
+ it = ScanRow_ERMS(it, 0xff, end - it);
+#else
+ it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+#endif
+ if (it == NULL) {
+ break;
+ }
+ if (it[1] == 0xd9) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ ++it; // Skip over current 0xff.
+ }
+  // ERROR: Invalid jpeg: EOI (end of image) marker not found in sample_size bytes.
+ return LIBYUV_FALSE;
+}
+
+// Helper function to validate that the jpeg appears intact.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+ const size_t kBackSearchSize = 1024;
+ if (sample_size < 64) {
+    // ERROR: Invalid jpeg: sample_size is smaller than the 64-byte minimum.
+ return LIBYUV_FALSE;
+ }
+ if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image
+ // ERROR: Invalid jpeg initial start code
+ return LIBYUV_FALSE;
+ }
+ // Step over SOI marker.
+ sample += 2;
+ sample_size -= 2;
+
+  // Look for the End Of Image (EOI) marker in the last kilobyte of the buffer.
+ if (sample_size > kBackSearchSize) {
+ if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ // Reduce search size for forward search.
+ sample_size = sample_size - kBackSearchSize + 1;
+ }
+ return ScanEOI(sample, sample_size);
+
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
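The validation above checks for the Start Of Image marker (0xFF 0xD8), then looks for the End Of Image marker (0xFF 0xD9), preferring a scan of the last kilobyte because a complete frame normally ends there, and only falling back to a forward scan of the payload. A standalone sketch of that strategy follows, with illustrative names and slightly simplified bounds; it is not library code.

    // Sketch only: SOI check followed by a back-first, then forward, EOI search.
    #include <stdint.h>
    #include <string.h>

    static bool HasEoi(const uint8_t* data, size_t size) {
      const uint8_t* it = data;
      const uint8_t* end = data + size - 1;  // Leave room to read it[1].
      while ((it = static_cast<const uint8_t*>(memchr(it, 0xff, end - it))) != NULL) {
        if (it[1] == 0xd9) {
          return true;  // Found the FF D9 End Of Image marker.
        }
        ++it;  // Skip this 0xFF and keep scanning.
      }
      return false;
    }

    static bool LooksLikeJpeg(const uint8_t* data, size_t size) {
      const size_t kBackSearch = 1024;
      if (size < 64 || data[0] != 0xff || data[1] != 0xd8) {
        return false;  // Too small, or no Start Of Image marker.
      }
      if (size > kBackSearch && HasEoi(data + size - kBackSearch, kBackSearch)) {
        return true;  // Common case: EOI near the end of the buffer.
      }
      return HasEoi(data + 2, size - 2);  // Forward scan past the SOI marker.
    }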
diff --git a/third_party/aom/third_party/libyuv/source/planar_functions.cc b/third_party/aom/third_party/libyuv/source/planar_functions.cc
new file mode 100644
index 000000000..b96bd5020
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/planar_functions.cc
@@ -0,0 +1,2555 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h> // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ int y;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+ // Nothing to do.
+ if (src_y == dst_y && src_stride_y == dst_stride_y) {
+ return;
+ }
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (y = 0; y < height; ++y) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
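The "coalesce rows" step above is a small but recurring optimization in this file: when the source and destination strides both equal the width, the plane occupies one contiguous block, so the whole copy collapses into a single row operation. A minimal standalone illustration of the idea (not library code) follows.

    // Sketch only: row coalescing for a plain byte-plane copy.
    #include <stdint.h>
    #include <string.h>

    static void CopyPlaneSimple(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride,
                                int width, int height) {
      if (src_stride == width && dst_stride == width) {
        // Contiguous planes: one memcpy covers width * height bytes.
        memcpy(dst, src, static_cast<size_t>(width) * height);
        return;
      }
      for (int y = 0; y < height; ++y) {
        memcpy(dst, src, width);
        src += src_stride;
        dst += dst_stride;
      }
    }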
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+ uint16* dst_y, int dst_stride_y,
+ int width, int height) {
+ int y;
+ void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+#if defined(HAS_COPYROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_16_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_16_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_16_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_16_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_16_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (y = 0; y < height; ++y) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Copy I422.
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+}
+
+// Copy I444.
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+}
+
+// Copy I400.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Convert I420 to I400.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Mirror a plane of data.
+void MirrorPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ int y;
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MirrorRow = MirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Make Mirror on MIPS handle unaligned memory.
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+ MirrorRow = MirrorRow_MIPS_DSPR2;
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) =
+ YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+ YUY2ToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*UYVYToUV422Row)(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) =
+ UYVYToUV422Row_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix) = UYVYToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUV422Row = UYVYToUV422Row_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width >= 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUV422Row = UYVYToUV422Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Mirror I400 with optional flipping
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Mirror I420 with optional flipping
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+ }
+#endif
+
+ // Mirror plane
+ for (y = 0; y < height; ++y) {
+ ARGBMirrorRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Get a blender that is optimized for the CPU and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
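+// Illustrative usage (buffer names and strides here are hypothetical): fetch
+// the row function once, then reuse it for every row of the image:
+//   ARGBBlendRow blend_row = GetARGBBlend();
+//   for (int y = 0; y < height; ++y) {
+//     blend_row(fg + y * fg_stride, bg + y * bg_stride,
+//               dst + y * dst_stride, width);
+//   }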
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBBlendRow = ARGBBlendRow_SSSE3;
+ return ARGBBlendRow;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBBlendRow = ARGBBlendRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBBlendRow = ARGBBlendRow_NEON;
+ }
+#endif
+ return ARGBBlendRow;
+}
+
+// Alpha Blend 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = GetARGBBlend();
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+
+ for (y = 0; y < height; ++y) {
+ ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Multiply 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBMultiplyRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_NEON;
+ }
+ }
+#endif
+
+ // Multiply plane
+ for (y = 0; y < height; ++y) {
+ ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Add 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBAddRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__))
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__))
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAddRow = ARGBAddRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAddRow = ARGBAddRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAddRow = ARGBAddRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_NEON;
+ }
+ }
+#endif
+
+ // Add plane
+ for (y = 0; y < height; ++y) {
+ ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Subtract 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBSubtractRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSubtractRow = ARGBSubtractRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_NEON;
+ }
+ }
+#endif
+
+ // Subtract plane
+ for (y = 0; y < height; ++y) {
+ ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
+ int y;
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_bgra == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
+ }
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+ I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ int y;
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_abgr == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
+ }
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_rgba == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
+ }
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ void (*NV12ToRGB565Row)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToRGB565Row_C;
+ if (!src_y || !src_uv || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ void (*NV21ToRGB565Row)(const uint8* y_buf,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ int width) = NV21ToRGB565Row_C;
+ if (!src_y || !src_vu || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value) {
+ int y;
+ void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ dst_stride_y = 0;
+ }
+#if defined(HAS_SETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SetRow = SetRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SETROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ SetRow = SetRow_Any_X86;
+ if (IS_ALIGNED(width, 4)) {
+ SetRow = SetRow_X86;
+ }
+ }
+#endif
+#if defined(HAS_SETROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ SetRow = SetRow_ERMS;
+ }
+#endif
+
+ // Set plane
+ for (y = 0; y < height; ++y) {
+ SetRow(dst_y, value, width);
+ dst_y += dst_stride_y;
+ }
+}
+
+// Draw a rectangle into I420
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y,
+ int width, int height,
+ int value_y, int value_u, int value_v) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ uint8* start_y = dst_y + y * dst_stride_y + x;
+ uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+ uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+ if (!dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0 ||
+ x < 0 || y < 0 ||
+ value_y < 0 || value_y > 255 ||
+ value_u < 0 || value_u > 255 ||
+ value_v < 0 || value_v > 255) {
+ return -1;
+ }
+
+ SetPlane(start_y, dst_stride_y, width, height, value_y);
+ SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+ SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+ return 0;
+}
+
+// Draw a rectangle into ARGB
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y,
+ int width, int height,
+ uint32 value) {
+ int y;
+ void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
+ if (!dst_argb ||
+ width <= 0 || height == 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+
+#if defined(HAS_ARGBSETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBSetRow = ARGBSetRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSETROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ ARGBSetRow = ARGBSetRow_X86;
+ }
+#endif
+
+ // Set plane
+ for (y = 0; y < height; ++y) {
+ ARGBSetRow(dst_argb, value, width);
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+// p is output pixel
+// f is foreground pixel
+// b is background pixel
+// a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+// f is foreground pixel premultiplied by alpha
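+// For example (an illustrative calculation, treating a as a 0..1 fraction):
+// with a = 0.5, f = 200 and b = 100,
+//   unattenuated:  p = 0.5 * 200 + (1 - 0.5) * 100 = 150
+//   preattenuated: f' = 0.5 * 200 = 100, so p = 100 + (1 - 0.5) * 100 = 150
+// i.e. attenuation folds the a * f multiply into the foreground pixel.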
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBAttenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBUnattenuateRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Neon version.
+
+ for (y = 0; y < height; ++y) {
+ ARGBUnattenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBGrayRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y,
+ int width, int height) {
+ int y;
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBGrayRow(dst, dst, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y, int width, int height) {
+ int y;
+ void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBSepiaRow(dst, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
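+// The matrix uses 6-bit fixed point (1.0 == 64); for example (illustrative),
+// the identity matrix would be
+//   {64, 0, 0, 0,  0, 64, 0, 0,  0, 0, 64, 0,  0, 0, 0, 64}.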
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+ if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_rgb,
+ int dst_x, int dst_y, int width, int height) {
+ SIMD_ALIGNED(int8 matrix_argb[16]);
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+
+ // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
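+  // (In the 7 bit convention 1.0 is 128; in the 6 bit convention it is 64,
+  // as written for alpha below, hence the divide by 2.)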
+ matrix_argb[0] = matrix_rgb[0] / 2;
+ matrix_argb[1] = matrix_rgb[1] / 2;
+ matrix_argb[2] = matrix_rgb[2] / 2;
+ matrix_argb[3] = matrix_rgb[3] / 2;
+ matrix_argb[4] = matrix_rgb[4] / 2;
+ matrix_argb[5] = matrix_rgb[5] / 2;
+ matrix_argb[6] = matrix_rgb[6] / 2;
+ matrix_argb[7] = matrix_rgb[7] / 2;
+ matrix_argb[8] = matrix_rgb[8] / 2;
+ matrix_argb[9] = matrix_rgb[9] / 2;
+ matrix_argb[10] = matrix_rgb[10] / 2;
+ matrix_argb[11] = matrix_rgb[11] / 2;
+ matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+ matrix_argb[15] = 64; // 1.0
+
+ return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+ dst, dst_stride_argb,
+ &matrix_argb[0], width, height);
+}
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x, int dst_y, int width, int height) {
+ int y;
+ void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ int width) = ARGBColorTableRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ ARGBColorTableRow = ARGBColorTableRow_X86;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x, int dst_y, int width, int height) {
+ int y;
+ void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ int width) = RGBColorTableRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ RGBColorTableRow = RGBColorTableRow_X86;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ RGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low-level rows implement it efficiently with 3 parameters, and
+// could be used for other high-level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced with a multiply by a reciprocal fixed-point value.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
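+// Example (illustrative): with interval_size = 64, interval_offset = 32 and
+// scale = 65536 / 64 = 1024, an input value of 200 gives
+//   (200 * 1024 >> 16) * 64 + 32 = 3 * 64 + 32 = 224.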
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int dst_x, int dst_y, int width, int height) {
+ int y;
+ void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) = ARGBQuantizeRow_C;
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+ interval_size < 1 || interval_size > 255) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Computes a table of cumulative sums for an image, where each entry is the
+// sum of all values above and to the left of it. Used by ARGBBlur.
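+// i.e. for each channel c, cumsum[y][x][c] is the (inclusive) sum of
+// src[j][i][c] over all j <= y and i <= x.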
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height) {
+ int y;
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+ int32* previous_cumsum = dst_cumsum;
+ if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
+ return -1;
+ }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ }
+#endif
+ memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
+ for (y = 0; y < height; ++y) {
+ ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
+ previous_cumsum = dst_cumsum;
+ dst_cumsum += dst_stride32_cumsum;
+ src_argb += src_stride_argb;
+ }
+ return 0;
+}
+
+// Blur ARGB image.
+// The caller should allocate a CumulativeSum table of width * height * 16
+// bytes, aligned to a 16-byte boundary. The table height can be reduced to
+// radius * 2 + 2 rows to save memory, as the buffer is treated as circular.
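+// Illustrative allocation sketch (names are hypothetical; sizes follow the
+// note above, using the reduced circular-buffer height):
+//   int radius = 5;
+//   int cumsum_stride32 = width * 4;       // 4 int32 per ARGB pixel
+//   int cumsum_rows = radius * 2 + 2;
+//   align_buffer_64(cumsum, cumsum_rows * cumsum_stride32 * 4);  // in bytes
+//   ARGBBlur(src, src_stride, dst, dst_stride, (int32*)cumsum,
+//            cumsum_stride32, width, height, radius);
+//   free_aligned_buffer_64(cumsum);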
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius) {
+ int y;
+ void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+ void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+ int32* cumsum_bot_row;
+ int32* max_cumsum_bot_row;
+ int32* cumsum_top_row;
+
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (radius > height) {
+ radius = height;
+ }
+ if (radius > (width / 2 - 1)) {
+ radius = width / 2 - 1;
+ }
+ if (radius <= 0) {
+ return -1;
+ }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
+ }
+#endif
+ // Compute enough CumulativeSum for first row to be blurred. After this
+ // one row of CumulativeSum is updated at a time.
+ ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+ dst_cumsum, dst_stride32_cumsum,
+ width, radius);
+
+ src_argb = src_argb + radius * src_stride_argb;
+ cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+ max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+ cumsum_top_row = &dst_cumsum[0];
+
+ for (y = 0; y < height; ++y) {
+ int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+ int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+ int area = radius * (bot_y - top_y);
+ int boxwidth = radius * 4;
+ int x;
+ int n;
+
+ // Increment cumsum_top_row pointer with circular buffer wrap around.
+ if (top_y) {
+ cumsum_top_row += dst_stride32_cumsum;
+ if (cumsum_top_row >= max_cumsum_bot_row) {
+ cumsum_top_row = dst_cumsum;
+ }
+ }
+ // Increment cumsum_bot_row pointer with circular buffer wrap around and
+ // then fill in a row of CumulativeSum.
+ if ((y + radius) < height) {
+ const int32* prev_cumsum_bot_row = cumsum_bot_row;
+ cumsum_bot_row += dst_stride32_cumsum;
+ if (cumsum_bot_row >= max_cumsum_bot_row) {
+ cumsum_bot_row = dst_cumsum;
+ }
+ ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+ width);
+ src_argb += src_stride_argb;
+ }
+
+ // Left clipped.
+ for (x = 0; x < radius + 1; ++x) {
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], 1);
+ area += (bot_y - top_y);
+ boxwidth += 4;
+ }
+
+ // Middle unclipped.
+ n = (width - 1) - radius - x + 1;
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], n);
+
+ // Right clipped.
+ for (x += n; x <= width - 1; ++x) {
+ area -= (bot_y - top_y);
+ boxwidth -= 4;
+ CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+ cumsum_bot_row + (x - radius - 1) * 4,
+ boxwidth, area, &dst_argb[x * 4], 1);
+ }
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Multiply ARGB image by a specified ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, uint32 value) {
+ int y;
+ void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
+ int width, uint32 value) = ARGBShadeRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+ ARGBShadeRow = ARGBShadeRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBShadeRow = ARGBShadeRow_NEON;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBShadeRow(src_argb, dst_argb, width, value);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Interpolate 2 ARGB images by a specified amount (0 to 255).
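+// Roughly dst = (src0 * (256 - interpolation) + src1 * interpolation) / 256,
+// so 0 yields src_argb0, 128 an even blend and 255 (nearly) src_argb1.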
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation) {
+ int y;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+ }
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
+ IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
+ width * 4, interpolation);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
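+// The shuffler is a table of source byte indices; for example (illustrative),
+// {3, 2, 1, 0} repeated across the 16-byte mask reverses the byte order of
+// every pixel.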
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* shuffler, int width, int height) {
+ int y;
+ void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+ const uint8* shuffler, int pix) = ARGBShuffleRow_C;
+ if (!src_bgra || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+ // Coalesce rows.
+ if (src_stride_bgra == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_bgra = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBShuffleRow = ARGBShuffleRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBShuffleRow = ARGBShuffleRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
+ src_bgra += src_stride_bgra;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ void (*SobelRow)(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst, int width)) {
+ int y;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
+ ARGBToYJRow_C;
+ void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) = SobelYRow_C;
+ void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobely, int width) =
+ SobelXRow_C;
+ const int kEdge = 16; // Extra pixels at start of row for extrude/align.
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_SOBELYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelYRow = SobelYRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelYRow = SobelYRow_NEON;
+ }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXRow = SobelXRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXRow = SobelXRow_NEON;
+ }
+#endif
+ {
+ // 3 rows with edges before/after.
+ const int kRowSize = (width + kEdge + 31) & ~31;
+ align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+ uint8* row_sobelx = rows;
+ uint8* row_sobely = rows + kRowSize;
+ uint8* row_y = rows + kRowSize * 2;
+
+ // Convert first row.
+ uint8* row_y0 = row_y + kEdge;
+ uint8* row_y1 = row_y0 + kRowSize;
+ uint8* row_y2 = row_y1 + kRowSize;
+ ARGBToYJRow(src_argb, row_y0, width);
+ row_y0[-1] = row_y0[0];
+ memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
+ ARGBToYJRow(src_argb, row_y1, width);
+ row_y1[-1] = row_y1[0];
+ memset(row_y1 + width, row_y1[width - 1], 16);
+ memset(row_y2 + width, 0, 16);
+
+ for (y = 0; y < height; ++y) {
+ // Convert next row of ARGB to G.
+ if (y < (height - 1)) {
+ src_argb += src_stride_argb;
+ }
+ ARGBToYJRow(src_argb, row_y2, width);
+ row_y2[-1] = row_y2[0];
+ row_y2[width] = row_y2[width - 1];
+
+ SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+ SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+ SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+ // Cycle thru circular queue of 3 row_y buffers.
+ {
+ uint8* row_yt = row_y0;
+ row_y0 = row_y1;
+ row_y1 = row_y2;
+ row_y2 = row_yt;
+ }
+
+ dst_argb += dst_stride_argb;
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelRow = SobelRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelRow = SobelRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelRow = SobelRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SobelRow = SobelRow_NEON;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height, SobelRow);
+}
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelToPlaneRow = SobelToPlaneRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SobelToPlaneRow = SobelToPlaneRow_NEON;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
+ width, height, SobelToPlaneRow);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXYRow = SobelXYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SobelXYRow = SobelXYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXYRow = SobelXYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ SobelXYRow = SobelXYRow_NEON;
+ }
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height, SobelXYRow);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const float* poly,
+ int width, int height) {
+ int y;
+ void (*ARGBPolynomialRow)(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) = ARGBPolynomialRow_C;
+ if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+ ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+ IS_ALIGNED(width, 2)) {
+ ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBPolynomialRow(src_argb, dst_argb, poly, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a luma color table to each ARGB pixel.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* luma,
+ int width, int height) {
+ int y;
+ void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
+ int width, const uint8* luma, const uint32 lumacoeff) =
+ ARGBLumaColorTableRow_C;
+ if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+ ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBCopyAlphaRow_C;
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBCopyAlphaRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
+ ARGBCopyYToAlphaRow_C;
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBCopyYToAlphaRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUVRow_C;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src_yuy2 ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+
+ {
+ int awidth = halfwidth * 2;
+ // 2 rows of uv
+ align_buffer_64(rows, awidth * 2);
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Split Y from UV.
+ SplitUVRow(src_yuy2, dst_y, rows, awidth);
+ SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y,
+ rows + awidth, awidth);
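+      // Average the two interleaved UV rows (50% blend) into one chroma row.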
+ InterpolateRow(dst_uv, rows, awidth, awidth, 128);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ // Split Y from UV.
+ SplitUVRow(src_yuy2, dst_y, dst_uv, width);
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUVRow_C;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ if (!src_uyvy ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+
+ {
+ int awidth = halfwidth * 2;
+ // 2 rows of uv
+ align_buffer_64(rows, awidth * 2);
+
+ for (y = 0; y < height - 1; y += 2) {
+ // Split Y from UV.
+ SplitUVRow(src_uyvy, rows, dst_y, awidth);
+ SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
+ dst_y + dst_stride_y, awidth);
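+      // Average the two interleaved UV rows (50% blend) into one chroma row.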
+ InterpolateRow(dst_uv, rows, awidth, awidth, 128);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ // Split Y from UV.
+ SplitUVRow(src_uyvy, dst_y, dst_uv, width);
+ }
+ free_aligned_buffer_64(rows);
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
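
The YUY2ToNV12/UYVYToNV12 paths above split each packed row into Y and interleaved UV, then collapse every two UV rows into one via InterpolateRow with source_y_fraction = 128, i.e. a rounded 50/50 blend. A minimal C sketch of that averaging step under that assumption; the helper name is illustrative and not part of the diff:

#include <stdint.h>

/* Sketch only: averaging two packed UV rows, matching what
 * InterpolateRow(dst_uv, rows, awidth, awidth, 128) computes when the
 * fraction is exactly 128 (rounded 50/50 blend of row0 and row1). */
static void AverageUVRows(uint8_t* dst_uv,
                          const uint8_t* row0, const uint8_t* row1,
                          int width_bytes) {
  int x;
  for (x = 0; x < width_bytes; ++x) {
    dst_uv[x] = (uint8_t)((row0[x] + row1[x] + 1) >> 1);  /* rounded average */
  }
}
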
diff --git a/third_party/aom/third_party/libyuv/source/rotate.cc b/third_party/aom/third_party/libyuv/source/rotate.cc
new file mode 100644
index 000000000..be3d58920
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,496 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int i = height;
+ void (*TransposeWx8)(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeWx8 = TransposeWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ TransposeWx8 = TransposeWx8_Fast_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ if (IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2;
+ } else {
+ TransposeWx8 = TransposeWx8_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ // Work across the source in 8x8 tiles
+ while (i >= 8) {
+ TransposeWx8(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+
+ if (i > 0) {
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+ }
+}
+
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MirrorRow = MirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Make MirrorRow on MIPS handle unaligned memory.
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+ MirrorRow = MirrorRow_MIPS_DSPR2;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ MirrorRow(src, row, width); // Mirror first row into a buffer
+ src += src_stride;
+ MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ dst += dst_stride;
+ CopyRow(row, dst_bot, width); // Copy first mirrored row into last
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ int i = height;
+ void (*TransposeUVWx8)(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
+#if defined(HAS_TRANSPOSEUVWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
+ }
+#endif
+#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
+ }
+#endif
+
+ // Work through the source in 8x8 tiles.
+ while (i >= 8) {
+ TransposeUVWx8(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
+ }
+
+ if (i > 0) {
+ TransposeUVWxH_C(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, i);
+ }
+}
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+
+ TransposeUV(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, height);
+}
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ dst_a += dst_stride_a * (width - 1);
+ dst_b += dst_stride_b * (width - 1);
+ dst_stride_a = -dst_stride_a;
+ dst_stride_b = -dst_stride_b;
+
+ TransposeUV(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, height);
+}
+
+// Rotate 180 is a horizontal and vertical flip.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ int i;
+ void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+ MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ MirrorRowUV = MirrorUVRow_NEON;
+ }
+#endif
+#if defined(HAS_MIRRORROW_UV_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+ MirrorRowUV = MirrorUVRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
+ }
+#endif
+
+ dst_a += dst_stride_a * (height - 1);
+ dst_b += dst_stride_b * (height - 1);
+
+ for (i = 0; i < height; ++i) {
+ MirrorRowUV(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
+ }
+}
+
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I420Copy(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane90(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane90(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane270(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane270(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane180(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane180(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return NV12ToI420(src_y, src_stride_y,
+ src_uv, src_stride_uv,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV90(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV270(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV180(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
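
RotatePlane90/270 above implement rotation as a transpose with one stride negated: negating the source stride flips the image vertically before the transpose (90 degrees clockwise), while negating the destination stride flips it afterwards (270 degrees). A minimal index-level sketch of the 90-degree case; the function name is illustrative and not part of the diff:

#include <stdint.h>

/* Sketch only: 90-degree clockwise rotation expressed directly with indices.
 * Equivalent to pointing src at its last row, negating src_stride and then
 * transposing, as RotatePlane90 does. dst holds height columns by width rows. */
static void Rotate90Reference(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride,
                              int width, int height) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      /* Source (row y, column x) lands at (row x, column height-1-y). */
      dst[x * dst_stride + (height - 1 - y)] = src[y * src_stride + x];
    }
  }
}
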
diff --git a/third_party/aom/third_party/libyuv/source/rotate_any.cc b/third_party/aom/third_party/libyuv/source/rotate_any.cc
new file mode 100644
index 000000000..4d6eb34e1
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_any.cc
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \
+ void NAMEANY(const uint8* src, int src_stride, \
+ uint8* dst, int dst_stride, int width) { \
+ int r = width & MASK; \
+ int n = width - r; \
+ if (n > 0) { \
+ TPOS_SIMD(src, src_stride, dst, dst_stride, n); \
+ } \
+ TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+ }
+
+#ifdef HAS_TRANSPOSEWX8_NEON
+TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_SSSE3
+TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7)
+#endif
+#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
+TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15)
+#endif
+#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2
+TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7)
+#endif
+
+#undef TANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+
+
+
+
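The TANY macro above generates the *_Any_* wrappers: the SIMD kernel processes the width rounded down to a multiple of (MASK + 1) and the plain C kernel finishes the remainder, so callers need no width or alignment checks. A minimal sketch of what the expansion looks like, with function-pointer stand-ins for the real row kernels; the names here are illustrative:

#include <stdint.h>

/* Sketch only: the shape of a TANY expansion. The SIMD kernel handles the
 * SIMD-sized portion; the C kernel finishes the leftover columns starting at
 * column n, whose transposed output begins n rows into dst. */
typedef void (*TransposeWx8Fn)(const uint8_t* src, int src_stride,
                               uint8_t* dst, int dst_stride, int width);

static void TransposeWx8Any(TransposeWx8Fn simd_fn, TransposeWx8Fn c_fn,
                            int mask,
                            const uint8_t* src, int src_stride,
                            uint8_t* dst, int dst_stride, int width) {
  int r = width & mask;  /* remainder columns for the C kernel */
  int n = width - r;     /* SIMD-sized portion */
  if (n > 0) {
    simd_fn(src, src_stride, dst, dst_stride, n);
  }
  c_fn(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
}
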
diff --git a/third_party/aom/third_party/libyuv/source/rotate_argb.cc b/third_party/aom/third_party/libyuv/source/rotate_argb.cc
new file mode 100644
index 000000000..787c0ad1b
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_argb.cc
@@ -0,0 +1,205 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+ int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+ int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+ int src_stepx, uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width, int height) {
+ int i;
+ int src_pixel_step = src_stride >> 2;
+ void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+ int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+ }
+#endif
+
+ for (i = 0; i < width; ++i) { // column of source to row of dest.
+ ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+ dst += dst_stride;
+ src += 4;
+ }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width, int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width, int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width, int height) {
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width * 4);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
+ ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb, int width, int height,
+ enum RotationMode mode) {
+ if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return ARGBCopy(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ case kRotate90:
+ ARGBRotate90(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate270:
+ ARGBRotate270(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate180:
+ ARGBRotate180(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
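
ARGBTranspose above turns each source column into a destination row by calling ScaleARGBRowDownEven with a pixel step of src_stride / 4, so successive output pixels come from successive source rows. A minimal sketch of that row kernel's contract, with the unused stride parameter omitted; the name and simplified signature are illustrative:

#include <stdint.h>
#include <string.h>

/* Sketch only: the per-row contract ARGBTranspose depends on. Copy every
 * src_stepx-th 4-byte ARGB pixel into a packed destination row; with
 * src_stepx = src_stride / 4 this reads one source column top to bottom. */
static void ScaleARGBRowDownEvenSketch(const uint8_t* src_argb, int src_stepx,
                                       uint8_t* dst_argb, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    memcpy(dst_argb + 4 * x, src_argb + 4 * x * src_stepx, 4);  /* one pixel */
  }
}
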
diff --git a/third_party/aom/third_party/libyuv/source/rotate_common.cc b/third_party/aom/third_party/libyuv/source/rotate_common.cc
new file mode 100644
index 000000000..b33a9a0c6
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_common.cc
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+void TransposeWx8_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+void TransposeUVWx8_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst_a[0] = src[0 * src_stride + 0];
+ dst_b[0] = src[0 * src_stride + 1];
+ dst_a[1] = src[1 * src_stride + 0];
+ dst_b[1] = src[1 * src_stride + 1];
+ dst_a[2] = src[2 * src_stride + 0];
+ dst_b[2] = src[2 * src_stride + 1];
+ dst_a[3] = src[3 * src_stride + 0];
+ dst_b[3] = src[3 * src_stride + 1];
+ dst_a[4] = src[4 * src_stride + 0];
+ dst_b[4] = src[4 * src_stride + 1];
+ dst_a[5] = src[5 * src_stride + 0];
+ dst_b[5] = src[5 * src_stride + 1];
+ dst_a[6] = src[6 * src_stride + 0];
+ dst_b[6] = src[6 * src_stride + 1];
+ dst_a[7] = src[7 * src_stride + 0];
+ dst_b[7] = src[7 * src_stride + 1];
+ src += 2;
+ dst_a += dst_stride_a;
+ dst_b += dst_stride_b;
+ }
+}
+
+void TransposeWxH_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+void TransposeUVWxH_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ int i;
+ for (i = 0; i < width * 2; i += 2) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+ dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+ }
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
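
TransposeUVWxH_C above combines a transpose with de-interleaving: each UV byte pair of the packed source feeds one row of dst_a (U) and one row of dst_b (V). The same computation written with explicit pair indices, as a reference sketch; the function name is illustrative:

#include <stdint.h>

/* Sketch only: TransposeUVWxH_C with explicit UV-pair indices.
 * Pair i of source row j supplies dst_a[i][j] (U) and dst_b[i][j] (V). */
static void TransposeUVReference(const uint8_t* src, int src_stride,
                                 uint8_t* dst_a, int dst_stride_a,
                                 uint8_t* dst_b, int dst_stride_b,
                                 int width, int height) {
  int i, j;
  for (i = 0; i < width; ++i) {     /* UV pair index (source column) */
    for (j = 0; j < height; ++j) {  /* source row */
      dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];  /* U */
      dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];  /* V */
    }
  }
}
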
diff --git a/third_party/aom/third_party/libyuv/source/rotate_gcc.cc b/third_party/aom/third_party/libyuv/source/rotate_gcc.cc
new file mode 100644
index 000000000..fd385bcd3
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_gcc.cc
@@ -0,0 +1,493 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 2 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width);
+ asm (
+ DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+ "push %ebx \n"
+ "push %esi \n"
+ "push %edi \n"
+ "push %ebp \n"
+ "mov 0x14(%esp),%eax \n"
+ "mov 0x18(%esp),%edi \n"
+ "mov 0x1c(%esp),%edx \n"
+ "mov 0x20(%esp),%esi \n"
+ "mov 0x24(%esp),%ebx \n"
+ "mov 0x28(%esp),%ebp \n"
+ "mov %esp,%ecx \n"
+ "sub $0x14,%esp \n"
+ "and $0xfffffff0,%esp \n"
+ "mov %ecx,0x10(%esp) \n"
+ "mov 0x2c(%ecx),%ecx \n"
+
+"1: \n"
+ "movdqu (%eax),%xmm0 \n"
+ "movdqu (%eax,%edi,1),%xmm1 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm0,%xmm7 \n"
+ "punpcklbw %xmm1,%xmm0 \n"
+ "punpckhbw %xmm1,%xmm7 \n"
+ "movdqa %xmm7,%xmm1 \n"
+ "movdqu (%eax),%xmm2 \n"
+ "movdqu (%eax,%edi,1),%xmm3 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm2,%xmm7 \n"
+ "punpcklbw %xmm3,%xmm2 \n"
+ "punpckhbw %xmm3,%xmm7 \n"
+ "movdqa %xmm7,%xmm3 \n"
+ "movdqu (%eax),%xmm4 \n"
+ "movdqu (%eax,%edi,1),%xmm5 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm4,%xmm7 \n"
+ "punpcklbw %xmm5,%xmm4 \n"
+ "punpckhbw %xmm5,%xmm7 \n"
+ "movdqa %xmm7,%xmm5 \n"
+ "movdqu (%eax),%xmm6 \n"
+ "movdqu (%eax,%edi,1),%xmm7 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqu %xmm5,(%esp) \n"
+ "neg %edi \n"
+ "movdqa %xmm6,%xmm5 \n"
+ "punpcklbw %xmm7,%xmm6 \n"
+ "punpckhbw %xmm7,%xmm5 \n"
+ "movdqa %xmm5,%xmm7 \n"
+ "lea 0x10(%eax,%edi,8),%eax \n"
+ "neg %edi \n"
+ "movdqa %xmm0,%xmm5 \n"
+ "punpcklwd %xmm2,%xmm0 \n"
+ "punpckhwd %xmm2,%xmm5 \n"
+ "movdqa %xmm5,%xmm2 \n"
+ "movdqa %xmm1,%xmm5 \n"
+ "punpcklwd %xmm3,%xmm1 \n"
+ "punpckhwd %xmm3,%xmm5 \n"
+ "movdqa %xmm5,%xmm3 \n"
+ "movdqa %xmm4,%xmm5 \n"
+ "punpcklwd %xmm6,%xmm4 \n"
+ "punpckhwd %xmm6,%xmm5 \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "movdqu (%esp),%xmm5 \n"
+ "movdqu %xmm6,(%esp) \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "punpcklwd %xmm7,%xmm5 \n"
+ "punpckhwd %xmm7,%xmm6 \n"
+ "movdqa %xmm6,%xmm7 \n"
+ "movdqa %xmm0,%xmm6 \n"
+ "punpckldq %xmm4,%xmm0 \n"
+ "punpckhdq %xmm4,%xmm6 \n"
+ "movdqa %xmm6,%xmm4 \n"
+ "movdqu (%esp),%xmm6 \n"
+ "movlpd %xmm0,(%edx) \n"
+ "movhpd %xmm0,(%ebx) \n"
+ "movlpd %xmm4,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm4,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm2,%xmm0 \n"
+ "punpckldq %xmm6,%xmm2 \n"
+ "movlpd %xmm2,(%edx) \n"
+ "movhpd %xmm2,(%ebx) \n"
+ "punpckhdq %xmm6,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm1,%xmm0 \n"
+ "punpckldq %xmm5,%xmm1 \n"
+ "movlpd %xmm1,(%edx) \n"
+ "movhpd %xmm1,(%ebx) \n"
+ "punpckhdq %xmm5,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm3,%xmm0 \n"
+ "punpckldq %xmm7,%xmm3 \n"
+ "movlpd %xmm3,(%edx) \n"
+ "movhpd %xmm3,(%ebx) \n"
+ "punpckhdq %xmm7,%xmm0 \n"
+ "sub $0x8,%ecx \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "jg 1b \n"
+ "mov 0x10(%esp),%esp \n"
+ "pop %ebp \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "pop %ebx \n"
+#if defined(__native_client__)
+ "pop %ecx \n"
+ "and $0xffffffe0,%ecx \n"
+ "jmp *%ecx \n"
+#else
+ "ret \n"
+#endif
+);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+ defined(__x86_64__)
+// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
+void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 2 \n"
+"1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+);
+}
+
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 2 \n"
+"1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9"
+);
+}
+#endif
+#endif
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_mips.cc b/third_party/aom/third_party/libyuv/source/rotate_mips.cc
new file mode 100644
index 000000000..efe6bd909
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_mips.cc
@@ -0,0 +1,484 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "sw $s0, 0(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "sw $s1, 4(%[dst]) \n"
+ "bnez %[width], 1b \n"
+ " addu %[dst], %[dst], %[dst_stride] \n"
+ "b 2f \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "swr $s0, 0(%[dst]) \n"
+ "swl $s0, 3(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "swr $s1, 4(%[dst]) \n"
+ "swl $s1, 7(%[dst]) \n"
+ "bnez %[width], 11b \n"
+ "addu %[dst], %[dst], %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1"
+ );
+}
+
+void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm__ __volatile__ (
+ ".set noat \n"
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+
+ "srl $AT, %[width], 0x2 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "sw $s4, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $s6, 0($s0) \n"
+ "sw $t8, 4($s0) \n"
+ "sw $s5, 0($s1) \n"
+ "sw $t1, 4($s1) \n"
+ "sw $s7, 0($s2) \n"
+ "sw $t9, 4($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 1b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "b 2f \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "swr $s4, 0(%[dst]) \n"
+ "swl $s4, 3(%[dst]) \n"
+ "swr $t0, 4(%[dst]) \n"
+ "swl $t0, 7(%[dst]) \n"
+ "swr $s6, 0($s0) \n"
+ "swl $s6, 3($s0) \n"
+ "swr $t8, 4($s0) \n"
+ "swl $t8, 7($s0) \n"
+ "swr $s5, 0($s1) \n"
+ "swl $s5, 3($s1) \n"
+ "swr $t1, 4($s1) \n"
+ "swl $t1, 7($s1) \n"
+ "swr $s7, 0($s2) \n"
+ "swl $s7, 3($s2) \n"
+ "swr $t9, 4($s2) \n"
+ "swl $t9, 7($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 11b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ ".set at \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
+ );
+}
+
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "subu $t7, $t9, %[src_stride] \n"
+ "srl $t1, %[width], 1 \n"
+
+// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
+ "andi $t0, %[dst_a], 0x3 \n"
+ "andi $t8, %[dst_b], 0x3 \n"
+ "or $t0, $t0, $t8 \n"
+ "andi $t8, %[dst_stride_a], 0x3 \n"
+ "andi $s5, %[dst_stride_b], 0x3 \n"
+ "or $t8, $t8, $s5 \n"
+ "or $t0, $t0, $t8 \n"
+ "bnez $t0, 11f \n"
+ " nop \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+ "1: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "sw $s3, 0($s5) \n"
+ "sw $s4, 0($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "sw $s3, 0(%[dst_a]) \n"
+ "sw $s4, 0(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+ "sw $s3, 4($s5) \n"
+ "sw $s4, 4($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "sw $s3, 4(%[dst_a]) \n"
+ "sw $s4, 4(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 1b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+ "b 2f \n"
+ " nop \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "swr $s3, 0($s5) \n"
+ "swl $s3, 3($s5) \n"
+ "swr $s4, 0($s6) \n"
+ "swl $s4, 3($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "swr $s3, 0(%[dst_a]) \n"
+ "swl $s3, 3(%[dst_a]) \n"
+ "swr $s4, 0(%[dst_b]) \n"
+ "swl $s4, 3(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+
+ "swr $s3, 4($s5) \n"
+ "swl $s3, 7($s5) \n"
+ "swr $s4, 4($s6) \n"
+ "swl $s4, 7($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "swr $s3, 4(%[dst_a]) \n"
+ "swl $s3, 7(%[dst_a]) \n"
+ "swr $s4, 4(%[dst_b]) \n"
+ "swl $s4, 7(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 11b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+
+ "2: \n"
+ ".set pop \n"
+ : [src] "+r" (src),
+ [dst_a] "+r" (dst_a),
+ [dst_b] "+r" (dst_b),
+ [width] "+r" (width),
+ [src_stride] "+r" (src_stride)
+ : [dst_stride_a] "r" (dst_stride_a),
+ [dst_stride_b] "r" (dst_stride_b)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_neon.cc b/third_party/aom/third_party/libyuv/source/rotate_neon.cc
new file mode 100644
index 000000000..76043b3b3
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_neon.cc
@@ -0,0 +1,535 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ const uint8* src_temp = NULL;
+ asm volatile (
+  // Loops are on blocks of 8. The loop will stop when the
+  // counter gets to or below 0. Starting the counter
+  // at w-8 allows for this.
+ "sub %5, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 2 \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d4}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d6}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.8 {d1}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d3}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d2}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d5}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d4}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d7}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %5, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %5, #2 \n"
+ "blt 3f \n"
+
+ "cmp %5, #4 \n"
+ "blt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d2[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d2[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d3[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d3[1]}, [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(6)
+ "vld1.8 {q3}, [%6] \n"
+
+ "vtbl.8 d4, {d0, d1}, d6 \n"
+ "vtbl.8 d5, {d0, d1}, d7 \n"
+ "vtbl.8 d0, {d2, d3}, d6 \n"
+ "vtbl.8 d1, {d2, d3}, d7 \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ MEMACCESS(0)
+ "vst1.32 {d4[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d4[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d5[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d5[1]}, [%0] \n"
+
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d0[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d0[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d1[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d1[1]}, [%0] \n"
+
+ "add %1, #4 \n" // src += 4
+ "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %5, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %5, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[3]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[3]}, [%0] \n"
+
+ "vtrn.8 d0, d1 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.64 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.64 {d1}, [%0] \n"
+
+ "add %1, #2 \n" // src += 2
+ "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %5, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[0]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[1]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[2]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[3]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[4]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[5]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[6]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[7]}, [%1] \n"
+
+ MEMACCESS(3)
+ "vst1.64 {d0}, [%3] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst), // %3
+ "+r"(dst_stride), // %4
+ "+r"(width) // %5
+ : "r"(&kVTbl4x4Transpose) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3"
+ );
+}
+
+static uvec8 kVTbl4x4TransposeDi =
+ { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ const uint8* src_temp = NULL;
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allows for this
+ "sub %7, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 2 \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.8 {d2}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d6}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d4}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d18}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d16}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d22}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ MEMACCESS(0)
+ "vst1.8 {d3}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d1}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d7}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d5}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d19}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d17}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d23}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %7, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %7, #2 \n"
+ "blt 3f \n"
+
+ "cmp %7, #4 \n"
+ "blt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.64 {d0}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d2}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d4}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d6}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d7}, [%0] \n"
+
+ MEMACCESS(8)
+ "vld1.8 {q15}, [%8] \n"
+
+ "vtrn.8 q0, q1 \n"
+ "vtrn.8 q2, q3 \n"
+
+ "vtbl.8 d16, {d0, d1}, d30 \n"
+ "vtbl.8 d17, {d0, d1}, d31 \n"
+ "vtbl.8 d18, {d2, d3}, d30 \n"
+ "vtbl.8 d19, {d2, d3}, d31 \n"
+ "vtbl.8 d20, {d4, d5}, d30 \n"
+ "vtbl.8 d21, {d4, d5}, d31 \n"
+ "vtbl.8 d22, {d6, d7}, d30 \n"
+ "vtbl.8 d23, {d6, d7}, d31 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.32 {d16[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d16[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d17[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d17[1]}, [%0], %4 \n"
+
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d20[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d20[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d21[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d21[1]}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ MEMACCESS(0)
+ "vst1.32 {d18[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d18[1]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d19[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d19[1]}, [%0], %6 \n"
+
+ "add %0, %5, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d22[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d22[1]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d23[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d23[1]}, [%0] \n"
+
+ "add %1, #4*2 \n" // src += 4 * 2
+ "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %7, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %7, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[3], d3[3]}, [%0] \n"
+
+ "vtrn.8 d0, d1 \n"
+ "vtrn.8 d2, d3 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.64 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.64 {d2}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ MEMACCESS(0)
+ "vst1.64 {d1}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.64 {d3}, [%0] \n"
+
+ "add %1, #2*2 \n" // src += 2 * 2
+ "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %7, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[7], d1[7]}, [%1] \n"
+
+ MEMACCESS(3)
+ "vst1.64 {d0}, [%3] \n"
+ MEMACCESS(5)
+ "vst1.64 {d1}, [%5] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_a), // %3
+ "+r"(dst_stride_a), // %4
+ "+r"(dst_b), // %5
+ "+r"(dst_stride_b), // %6
+ "+r"(width) // %7
+ : "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc",
+ "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_neon64.cc b/third_party/aom/third_party/libyuv/source/rotate_neon64.cc
new file mode 100644
index 000000000..f52c082b3
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_neon64.cc
@@ -0,0 +1,543 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ const uint8* src_temp = NULL;
+ int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allows for this
+ "sub %3, %3, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.8b}, [%0] \n"
+
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v17.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v21.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v20.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v23.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v22.8b}, [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 8
+ "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %3, %3, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %3, %3, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %3, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %3, #4 \n"
+ "b.lt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[3], [%0] \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(4)
+ "ld1 {v2.16b}, [%4] \n"
+
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ MEMACCESS(0)
+ "st1 {v3.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[3], [%0] \n"
+
+ "add %0, %2, #4 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[3], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 4
+ "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %3, %3, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %3, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[3], [%0] \n"
+
+ "trn2 v2.8b, v0.8b, v1.8b \n"
+ "trn1 v3.8b, v0.8b, v1.8b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v3.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v2.8b}, [%0] \n"
+
+ "add %1, %1, #2 \n" // src += 2
+ "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %3, %3, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[0], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[1], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[2], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[3], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[4], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[5], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[6], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[7], [%1] \n"
+
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width64) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+ { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+ 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ const uint8* src_temp = NULL;
+ int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allows for this
+ "sub %4, %4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.16b}, [%0] \n"
+
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v16.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v17.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v17.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v20.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v22.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v21.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v23.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v20.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v22.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v21.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %4, %4, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %4, %4, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %4, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %4, #4 \n"
+ "b.lt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.8b}, [%0] \n"
+
+ MEMACCESS(8)
+ "ld1 {v30.16b}, [%8], #16 \n"
+ "ld1 {v31.16b}, [%8] \n"
+
+ "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
+ "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
+ "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
+ "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v16.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[3], [%0], %6 \n"
+
+ "add %0, %2, #4 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[3], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v17.s}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[2], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[3], [%0], %7 \n"
+
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[2], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[3], [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 4 * 2
+ "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %4, %4, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %4, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[3], [%0] \n"
+
+ "trn1 v4.8b, v0.8b, v2.8b \n"
+ "trn2 v5.8b, v0.8b, v2.8b \n"
+ "trn1 v6.8b, v1.8b, v3.8b \n"
+ "trn2 v7.8b, v1.8b, v3.8b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v4.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v6.d}[0], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v5.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v7.d}[0], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 2 * 2
+ "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %4, %4, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[7], [%1] \n"
+
+ MEMACCESS(2)
+ "st1 {v0.d}[0], [%2] \n"
+ MEMACCESS(3)
+ "st1 {v1.d}[0], [%3] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width64) // %4
+ : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
+ "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v30", "v31"
+ );
+}
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/rotate_win.cc b/third_party/aom/third_party/libyuv/source/rotate_win.cc
new file mode 100644
index 000000000..2760066df
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/rotate_win.cc
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
+
+__declspec(naked)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm {
+ push edi
+ push esi
+ push ebp
+ mov eax, [esp + 12 + 4] // src
+ mov edi, [esp + 12 + 8] // src_stride
+ mov edx, [esp + 12 + 12] // dst
+ mov esi, [esp + 12 + 16] // dst_stride
+ mov ecx, [esp + 12 + 20] // width
+
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea ebp, [eax + 8]
+ movq xmm1, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm0, xmm1
+ movq xmm2, qword ptr [eax]
+ movdqa xmm1, xmm0
+ palignr xmm1, xmm1, 8
+ movq xmm3, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm2, xmm3
+ movdqa xmm3, xmm2
+ movq xmm4, qword ptr [eax]
+ palignr xmm3, xmm3, 8
+ movq xmm5, qword ptr [eax + edi]
+ punpcklbw xmm4, xmm5
+ lea eax, [eax + 2 * edi]
+ movdqa xmm5, xmm4
+ movq xmm6, qword ptr [eax]
+ palignr xmm5, xmm5, 8
+ movq xmm7, qword ptr [eax + edi]
+ punpcklbw xmm6, xmm7
+ mov eax, ebp
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm7, 8
+ // Second round of bit swap.
+ punpcklwd xmm0, xmm2
+ punpcklwd xmm1, xmm3
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ palignr xmm2, xmm2, 8
+ palignr xmm3, xmm3, 8
+ punpcklwd xmm4, xmm6
+ punpcklwd xmm5, xmm7
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ palignr xmm6, xmm6, 8
+ palignr xmm7, xmm7, 8
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ punpckldq xmm0, xmm4
+ movq qword ptr [edx], xmm0
+ movdqa xmm4, xmm0
+ palignr xmm4, xmm4, 8
+ movq qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ punpckldq xmm2, xmm6
+ movdqa xmm6, xmm2
+ palignr xmm6, xmm6, 8
+ movq qword ptr [edx], xmm2
+ punpckldq xmm1, xmm5
+ movq qword ptr [edx + esi], xmm6
+ lea edx, [edx + 2 * esi]
+ movdqa xmm5, xmm1
+ movq qword ptr [edx], xmm1
+ palignr xmm5, xmm5, 8
+ punpckldq xmm3, xmm7
+ movq qword ptr [edx + esi], xmm5
+ lea edx, [edx + 2 * esi]
+ movq qword ptr [edx], xmm3
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm7, 8
+ sub ecx, 8
+ movq qword ptr [edx + esi], xmm7
+ lea edx, [edx + 2 * esi]
+ jg convertloop
+
+ pop ebp
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked)
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int w) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] // src
+ mov edi, [esp + 16 + 8] // src_stride
+ mov edx, [esp + 16 + 12] // dst_a
+ mov esi, [esp + 16 + 16] // dst_stride_a
+ mov ebx, [esp + 16 + 20] // dst_b
+ mov ebp, [esp + 16 + 24] // dst_stride_b
+ mov ecx, esp
+ sub esp, 4 + 16
+ and esp, ~15
+ mov [esp + 16], ecx
+ mov ecx, [ecx + 16 + 28] // w
+
+ align 4
+ convertloop:
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm0 // use xmm7 as temp register.
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm7, xmm1
+ movdqa xmm1, xmm7
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm2
+ punpcklbw xmm2, xmm3
+ punpckhbw xmm7, xmm3
+ movdqa xmm3, xmm7
+ movdqu xmm4, [eax]
+ movdqu xmm5, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm4
+ punpcklbw xmm4, xmm5
+ punpckhbw xmm7, xmm5
+ movdqa xmm5, xmm7
+ movdqu xmm6, [eax]
+ movdqu xmm7, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqu [esp], xmm5 // backup xmm5
+ neg edi
+ movdqa xmm5, xmm6 // use xmm5 as temp register.
+ punpcklbw xmm6, xmm7
+ punpckhbw xmm5, xmm7
+ movdqa xmm7, xmm5
+ lea eax, [eax + 8 * edi + 16]
+ neg edi
+ // Second round of bit swap.
+ movdqa xmm5, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm5, xmm2
+ movdqa xmm2, xmm5
+ movdqa xmm5, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm5, xmm3
+ movdqa xmm3, xmm5
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm5, xmm6
+ movdqa xmm6, xmm5
+ movdqu xmm5, [esp] // restore xmm5
+ movdqu [esp], xmm6 // backup xmm6
+ movdqa xmm6, xmm5 // use xmm6 as temp register.
+ punpcklwd xmm5, xmm7
+ punpckhwd xmm6, xmm7
+ movdqa xmm7, xmm6
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ movdqa xmm6, xmm0
+ punpckldq xmm0, xmm4
+ punpckhdq xmm6, xmm4
+ movdqa xmm4, xmm6
+ movdqu xmm6, [esp] // restore xmm6
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [ebx], xmm0
+ movlpd qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm4
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm2 // use xmm0 as the temp register.
+ punpckldq xmm2, xmm6
+ movlpd qword ptr [edx], xmm2
+ movhpd qword ptr [ebx], xmm2
+ punpckhdq xmm0, xmm6
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm1 // use xmm0 as the temp register.
+ punpckldq xmm1, xmm5
+ movlpd qword ptr [edx], xmm1
+ movhpd qword ptr [ebx], xmm1
+ punpckhdq xmm0, xmm5
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm3 // use xmm0 as the temp register.
+ punpckldq xmm3, xmm7
+ movlpd qword ptr [edx], xmm3
+ movhpd qword ptr [ebx], xmm3
+ punpckhdq xmm0, xmm7
+ sub ecx, 8
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ jg convertloop
+
+ mov esp, [esp + 16]
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_any.cc b/third_party/aom/third_party/libyuv/source/row_any.cc
new file mode 100644
index 000000000..1cb1f6b93
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_any.cc
@@ -0,0 +1,680 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsampled source needs to be increased by 1 if not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
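+// (e.g. SS(13, 1) == 7: 13 pixels subsampled by 2 need 7 chroma samples.)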
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
+ }
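+// (e.g. an Any variant with MASK 7 and width 100 runs the SIMD row function
+// on the first 96 pixels, copies the remaining 4 into the aligned temp
+// buffers, processes one full 8-pixel block there, and copies 4 results back.)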
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_I422TORAWROW_AVX2
+ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef HAS_J422TOARGBROW_SSSE3
+ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_J422TOARGBROW_AVX2
+ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGBROW_AVX2
+ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOBGRAROW_AVX2
+ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOABGRROW_AVX2
+ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15)
+#endif
+#ifdef HAS_I444TOARGBROW_AVX2
+ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
+#endif
+#ifdef HAS_I411TOARGBROW_AVX2
+ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOARGBROW_NEON
+ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
+ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
+ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7)
+ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
+#endif
+#ifdef HAS_I422TOYUY2ROW_NEON
+ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOUYVYROW_NEON
+ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
+#endif
+#undef ANY31
+
+// Any 2 planes to 1.
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
+ uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+// Biplanar to RGB.
+#ifdef HAS_NV12TOARGBROW_SSSE3
+ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
+#endif
+#ifdef HAS_NV12TOARGBROW_NEON
+ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
+ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
+#endif
+#ifdef HAS_NV12TORGB565ROW_NEON
+ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7)
+#endif
+
+// Merge functions.
+#ifdef HAS_MERGEUVROW_SSE2
+ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
+#endif
+
+// Math functions.
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_SOBELROW_SSE2
+ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELROW_NEON
+ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELTOPLANEROW_NEON
+ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
+#endif
+#ifdef HAS_SOBELXYROW_SSE2
+ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
+#endif
+#ifdef HAS_SOBELXYROW_NEON
+ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
+#endif
+#undef ANY21
+
+// Any 1 to 1.
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ memset(temp, 0, 128); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_COPYROW_AVX
+ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_SSE2)
+ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
+ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7)
+#endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
+ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15)
+ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7)
+ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
+#endif
+#if defined(HAS_RGB565TOARGBROW_AVX2)
+ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_AVX2)
+ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31)
+ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7)
+ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
+ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
+ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBTOYROW_AVX2
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
+ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
+ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
+ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
+ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
+ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
+ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
+#endif
+#undef ANY11
+
+// Any 1 to 1 with parameter.
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
+ T shuffler, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+ }
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
+ const uint32, 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
+ const uint32, 4, 2, 7)
+#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
+ const uint32, 4, 2, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+#endif
+#undef ANY11P
+
+// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
+ ptrdiff_t src_stride_ptr, int width, \
+ int source_y_fraction) { \
+ SIMD_ALIGNED(uint8 temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+ }
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSE2
+ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
+ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3)
+#endif
+#undef ANY11T
+
+// Any 1 to 1 mirror.
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
+ } \
+ memcpy(temp, src_ptr, r * BPP); \
+ ANY_SIMD(temp, temp + 64, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+ }
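+// (note: for a mirror the SIMD call above writes the last n source pixels to
+// the start of the destination; the leading r pixels go through temp and land
+// at the end of the destination row.)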
+
+#ifdef HAS_MIRRORROW_AVX2
+ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_SSE2
+ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
+#endif
+#undef ANY11M
+
+// Any 1 plane. (memset)
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, T v32, int width) { \
+ SIMD_ALIGNED(uint8 temp[64]); \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, v32, n); \
+ } \
+ ANY_SIMD(temp, v32, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp, r * BPP); \
+ }
+
+#ifdef HAS_SETROW_X86
+ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+#endif
+#undef ANY1
+
+// Any 1 to 2. Outputs UV planes.
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
+ SIMD_ALIGNED(uint8 temp[128 * 3]); \
+ memset(temp, 0, 128); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, \
+ temp + SS(r, UVSHIFT) * BPP - BPP, 4); \
+ } \
+ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
+ memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
+ memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
+ }
+
+#ifdef HAS_SPLITUVROW_SSE2
+ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MIPS_DSPR2
+ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15)
+#endif
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15)
+ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
+ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
+ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
+#endif
+#undef ANY12
+
+// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
+// A 128-byte row allows for 32 AVX ARGB pixels.
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ SIMD_ALIGNED(uint8 temp[128 * 4]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, \
+ temp + SS(r, UVSHIFT) * BPP - BPP, 4); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \
+ memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \
+ memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \
+ }
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15)
+ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15)
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15)
+ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
+ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31)
+ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31)
+#endif
+#ifdef HAS_YUY2TOUVROW_SSE2
+ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15)
+ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
+ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
+ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
+ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
+ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
+ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
+ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
+ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
+ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
+ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
+ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#undef ANY12S
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_common.cc b/third_party/aom/third_party/libyuv/source/row_common.cc
new file mode 100644
index 000000000..49875894f
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_common.cc
@@ -0,0 +1,2576 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm x86 is poor at the ternary operator, so use branchless min/max.
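+// (e.g. clamp0(-5): -(-5) >> 31 is 0, so the AND yields 0; clamp0(5):
+// -5 >> 31 is all ones, so the AND yields 5. clamp255 applies the same idea
+// to 255 - v.)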
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+ return ((-(v) >> 31) & (v));
+}
+
+static __inline int32 clamp255(int32 v) {
+ return (((255 - (v)) >> 31) | (v)) & 255;
+}
+
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+ int m = v >> 31;
+ return (v + m) ^ m;
+}
+#else // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+ return (v < 0) ? 0 : v;
+}
+
+static __inline int32 clamp255(int32 v) {
+ return (v > 255) ? 255 : v;
+}
+
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+ return (v < 0) ? -v : v;
+}
+#endif // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *(uint32*)(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+ p[0] = (uint8)(v & 255);
+ p[1] = (uint8)((v >> 8) & 255);
+ p[2] = (uint8)((v >> 16) & 255);
+ p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_rgb24[0];
+ uint8 g = src_rgb24[1];
+ uint8 r = src_rgb24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb24 += 3;
+ }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_raw += 3;
+ }
+}
+
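+// The 565/555/4444 conversions below widen each channel to 8 bits by
+// replicating its high bits into the low bits, e.g. 5-bit 0x1f -> 0xff and
+// 0x10 -> 0x84.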
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_rgb565[0] & 0x1f;
+ uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r = src_rgb565[1] >> 3;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 2) | (g >> 4);
+ dst_argb[2] = (r << 3) | (r >> 2);
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb565 += 2;
+ }
+}
+
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb1555[0] & 0x1f;
+ uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 a = src_argb1555[1] >> 7;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 3) | (g >> 2);
+ dst_argb[2] = (r << 3) | (r >> 2);
+ dst_argb[3] = -a;
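+ // (a is 0 or 1, so -a stores 0x00 or 0xff into the 8-bit alpha channel.)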
+ dst_argb += 4;
+ src_argb1555 += 2;
+ }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb4444[0] & 0x0f;
+ uint8 g = src_argb4444[0] >> 4;
+ uint8 r = src_argb4444[1] & 0x0f;
+ uint8 a = src_argb4444[1] >> 4;
+ dst_argb[0] = (b << 4) | b;
+ dst_argb[1] = (g << 4) | g;
+ dst_argb[2] = (r << 4) | r;
+ dst_argb[3] = (a << 4) | a;
+ dst_argb += 4;
+ src_argb4444 += 2;
+ }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = b;
+ dst_rgb[1] = g;
+ dst_rgb[2] = r;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = r;
+ dst_rgb[1] = g;
+ dst_rgb[2] = b;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 2;
+ uint8 r1 = src_argb[6] >> 3;
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27));
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+// dither4 is a row of 4 values from a 4x4 dither matrix.
+// The 4x4 matrix contains values to add to RGB.  When converting to
+// fewer bits (565) this provides an ordered dither.
+// The first byte of the matrix maps to the upper-left pixel.
+// The 4 values are passed as an int and then referenced as an array, so
+// endianness does not affect the order within the original matrix.  But
+// dither4 will contain the first pixel in the lower byte on little-endian
+// machines and in the upper byte on big-endian machines.
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ int dither0 = ((const unsigned char*)(&dither4))[x & 3];
+ int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
+ uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+ uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+ uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+ uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
+ uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
+ uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27));
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
+ uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+ uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+ uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
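+
+// Illustrative packing (values are arbitrary): to apply dither values
+// {2, 6, 3, 7} to columns 0..3 on a little-endian machine, pack the bytes
+// low to high:
+//   const uint32 dither4 = 0x07030602;  // bytes 2, 6, 3, 7 in memory.
+//   ARGBToRGB565DitherRow_C(src_argb, dst_rgb565, dither4, width);
+// A big-endian machine needs the reverse packing for the same column order,
+// as noted above.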
+
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 3;
+ uint8 r1 = src_argb[6] >> 3;
+ uint8 a1 = src_argb[7] >> 7;
+ *(uint32*)(dst_rgb) =
+ b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ *(uint16*)(dst_rgb) =
+ b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ }
+}
+
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ uint8 b1 = src_argb[4] >> 4;
+ uint8 g1 = src_argb[5] >> 4;
+ uint8 r1 = src_argb[6] >> 4;
+ uint8 a1 = src_argb[7] >> 4;
+ *(uint32*)(dst_rgb) =
+ b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+ (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ *(uint16*)(dst_rgb) =
+ b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ }
+}
+
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+ return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
+}
+
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+ return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+}
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+ return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
+}
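+
+// Sanity check of the fixed point constants (illustrative arithmetic):
+//   RGBToY(255, 255, 255) = (66*255 + 129*255 + 25*255 + 0x1080) >> 8
+//                         = (56100 + 4224) >> 8 = 235  (studio range white)
+//   RGBToY(0, 0, 0)       = 0x1080 >> 8 = 16            (studio range black)
+//   RGBToU(0, 0, 255)     = (112*255 + 0x8080) >> 8 = 240  (max U, pure blue)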
+
+#define MAKEROWY(NAME, R, G, B, BPP) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \
+ src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \
+ src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \
+ src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+}
+
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
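+
+// For reference, MAKEROWY(ARGB, 2, 1, 0, 4) above expands to ARGBToYRow_C and
+// ARGBToUVRow_C, which read R, G and B from byte offsets 2, 1 and 0 of each
+// 4 byte pixel; the other invocations only change those offsets and the bytes
+// per pixel, so one macro covers ARGB, BGRA, ABGR, RGBA, RGB24 and RAW.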
+
+// JPeg uses a variation on BT.601-1 full range
+// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
+// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b 0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r 0.50000 * 255 = 127.5 = 127
+
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+ return (38 * r + 75 * g + 15 * b + 64) >> 7;
+}
+
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+ return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
+}
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+ return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
+}
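+
+// Sanity check (illustrative arithmetic): the JPeg variant is full range, so
+//   RGBToYJ(255, 255, 255) = (38*255 + 75*255 + 15*255 + 64) >> 7
+//                          = (128*255 + 64) >> 7 = 255
+//   RGBToYJ(0, 0, 0)       = 64 >> 7 = 0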
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
+}
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToUJ(ar, ag, ab);
+ dst_v[0] = RGBToVJ(ar, ag, ab);
+ src_argb += 8;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToUJ(ar, ag, ab);
+ dst_v[0] = RGBToVJ(ar, ag, ab);
+ }
+}
+
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_rgb565[0] & 0x1f;
+ uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r = src_rgb565[1] >> 3;
+ b = (b << 3) | (b >> 2);
+ g = (g << 2) | (g >> 4);
+ r = (r << 3) | (r >> 2);
+ dst_y[0] = RGBToY(r, g, b);
+ src_rgb565 += 2;
+ dst_y += 1;
+ }
+}
+
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb1555[0] & 0x1f;
+ uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ b = (b << 3) | (b >> 2);
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_y[0] = RGBToY(r, g, b);
+ src_argb1555 += 2;
+ dst_y += 1;
+ }
+}
+
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 b = src_argb4444[0] & 0x0f;
+ uint8 g = src_argb4444[0] >> 4;
+ uint8 r = src_argb4444[1] & 0x0f;
+ b = (b << 4) | b;
+ g = (g << 4) | g;
+ r = (r << 4) | r;
+ dst_y[0] = RGBToY(r, g, b);
+ src_argb4444 += 2;
+ dst_y += 1;
+ }
+}
+
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b1 = src_rgb565[2] & 0x1f;
+ uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+ uint8 r1 = src_rgb565[3] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b3 = next_rgb565[2] & 0x1f;
+ uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+ uint8 r3 = next_rgb565[3] >> 3;
+ uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 1) | (b >> 6); // 787 -> 888.
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_rgb565 += 4;
+ next_rgb565 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b = (b0 + b2); // 565 * 2 = 676.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 676 -> 888
+ g = (g << 1) | (g >> 6);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb1555[0] & 0x1f;
+ uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 b1 = src_argb1555[2] & 0x1f;
+ uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+ uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+ uint8 b2 = next_argb1555[0] & 0x1f;
+ uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+ uint8 b3 = next_argb1555[2] & 0x1f;
+ uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+ uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+ uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 1) | (b >> 6); // 777 -> 888.
+ g = (g << 1) | (g >> 6);
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb1555 += 4;
+ next_argb1555 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb1555[0] & 0x1f;
+ uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 b2 = next_argb1555[0] & 0x1f;
+ uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+ uint8 b = (b0 + b2); // 555 * 2 = 666.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b1 = src_argb4444[2] & 0x0f;
+ uint8 g1 = src_argb4444[2] >> 4;
+ uint8 r1 = src_argb4444[3] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b3 = next_argb4444[2] & 0x0f;
+ uint8 g3 = next_argb4444[2] >> 4;
+ uint8 r3 = next_argb4444[3] & 0x0f;
+ uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb4444 += 4;
+ next_argb4444 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b = (b0 + b2); // 444 * 2 = 555.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 3) | (b >> 2); // 555 -> 888.
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+void ARGBToUV422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 8;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ }
+}
+
+void ARGBToUV411Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 3; x += 4) {
+ uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+ uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+ uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 16;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if ((width & 3) == 3) {
+ uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
+ uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
+ uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ } else if ((width & 3) == 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ } else if ((width & 3) == 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ }
+}
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = src_argb[3];
+ dst_argb += 4;
+ src_argb += 4;
+ }
+}
+
+// Convert a row of the image to sepia tone.
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int sb = (b * 17 + g * 68 + r * 35) >> 7;
+ int sg = (b * 22 + g * 88 + r * 45) >> 7;
+ int sr = (b * 24 + g * 98 + r * 50) >> 7;
+ // sb does not overflow, so it needs no clamp. a is preserved from the original.
+ dst_argb[0] = sb;
+ dst_argb[1] = clamp255(sg);
+ dst_argb[2] = clamp255(sr);
+ dst_argb += 4;
+ }
+}
+
+// Apply color matrix to a row of image. Matrix is signed.
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = src_argb[0];
+ int g = src_argb[1];
+ int r = src_argb[2];
+ int a = src_argb[3];
+ int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
+ r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
+ int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
+ r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
+ int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
+ r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
+ int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
+ r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+ dst_argb[0] = Clamp(sb);
+ dst_argb[1] = Clamp(sg);
+ dst_argb[2] = Clamp(sr);
+ dst_argb[3] = Clamp(sa);
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+// Apply color table to a row of image.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int a = dst_argb[3];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb[3] = table_argb[a * 4 + 3];
+ dst_argb += 4;
+ }
+}
+
+// Apply color table to a row of image.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb += 4;
+ }
+}
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+ dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
+ dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
+ dst_argb += 4;
+ }
+}
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 24
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ const uint32 b_scale = REPEAT8(value & 0xff);
+ const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+ const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+ const uint32 a_scale = REPEAT8(value >> 24);
+
+ int i;
+ for (i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb[0]);
+ const uint32 g = REPEAT8(src_argb[1]);
+ const uint32 r = REPEAT8(src_argb[2]);
+ const uint32 a = REPEAT8(src_argb[3]);
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 16
+
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb0[0]);
+ const uint32 g = REPEAT8(src_argb0[1]);
+ const uint32 r = REPEAT8(src_argb0[2]);
+ const uint32 a = REPEAT8(src_argb0[3]);
+ const uint32 b_scale = src_argb1[0];
+ const uint32 g_scale = src_argb1[1];
+ const uint32 r_scale = src_argb1[2];
+ const uint32 a_scale = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define SHADE(f, v) clamp255(v + f)
+
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const int b = src_argb0[0];
+ const int g = src_argb0[1];
+ const int r = src_argb0[2];
+ const int a = src_argb0[3];
+ const int b_add = src_argb1[0];
+ const int g_add = src_argb1[1];
+ const int r_add = src_argb1[2];
+ const int a_add = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_add);
+ dst_argb[1] = SHADE(g, g_add);
+ dst_argb[2] = SHADE(r, r_add);
+ dst_argb[3] = SHADE(a, a_add);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef SHADE
+
+#define SHADE(f, v) clamp0(f - v)
+
+void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const int b = src_argb0[0];
+ const int g = src_argb0[1];
+ const int r = src_argb0[2];
+ const int a = src_argb0[3];
+ const int b_sub = src_argb1[0];
+ const int g_sub = src_argb1[1];
+ const int r_sub = src_argb1[2];
+ const int a_sub = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_sub);
+ dst_argb[1] = SHADE(g, g_sub);
+ dst_argb[2] = SHADE(r, r_sub);
+ dst_argb[3] = SHADE(a, a_sub);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef SHADE
+
+// Sobel functions that mimic the SSSE3 versions.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+ uint8* dst_sobelx, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int a = src_y0[i];
+ int b = src_y1[i];
+ int c = src_y2[i];
+ int a_sub = src_y0[i + 2];
+ int b_sub = src_y1[i + 2];
+ int c_sub = src_y2[i + 2];
+ int a_diff = a - a_sub;
+ int b_diff = b - b_sub;
+ int c_diff = c - c_sub;
+ int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+ dst_sobelx[i] = (uint8)(clamp255(sobel));
+ }
+}
+
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int a = src_y0[i + 0];
+ int b = src_y0[i + 1];
+ int c = src_y0[i + 2];
+ int a_sub = src_y1[i + 0];
+ int b_sub = src_y1[i + 1];
+ int c_sub = src_y1[i + 2];
+ int a_diff = a - a_sub;
+ int b_diff = b - b_sub;
+ int c_diff = c - c_sub;
+ int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+ dst_sobely[i] = (uint8)(clamp255(sobel));
+ }
+}
+
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int s = clamp255(r + b);
+ dst_argb[0] = (uint8)(s);
+ dst_argb[1] = (uint8)(s);
+ dst_argb[2] = (uint8)(s);
+ dst_argb[3] = (uint8)(255u);
+ dst_argb += 4;
+ }
+}
+
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int s = clamp255(r + b);
+ dst_y[i] = (uint8)(s);
+ }
+}
+
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int g = clamp255(r + b);
+ dst_argb[0] = (uint8)(b);
+ dst_argb[1] = (uint8)(g);
+ dst_argb[2] = (uint8)(r);
+ dst_argb[3] = (uint8)(255u);
+ dst_argb += 4;
+ }
+}
+
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+ // Copy a Y to RGB.
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8 y = src_y[0];
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ ++src_y;
+ }
+}
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+// C reference code that mimics the YUV assembly.
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
+ *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
+ *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);
+}
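+
+// Spot check of the fixed point math (illustrative, using the constants
+// above): for Y = 16, U = V = 128, y1 = (16 * 0x0101 * YG) >> 16 = 1191 and
+// every channel evaluates to 31 >> 6 = 0; for Y = 235, U = V = 128,
+// y1 = 17506 and every channel evaluates to 16346 >> 6 = 255.  Studio range
+// black and white therefore map to 0 and 255.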
+
+// C reference code that mimics the YUV assembly.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(y1 + YGB) >> 6);
+ *g = Clamp((int32)(y1 + YGB) >> 6);
+ *r = Clamp((int32)(y1 + YGB) >> 6);
+}
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and to subtract 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// C reference code that mimics the YUV assembly.
+static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
+ *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
+ *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
+ *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
+}
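+
+// Spot check (illustrative): with the JPeg constants and U = V = 128, every
+// channel evaluates to 32 >> 6 = 0 for Y = 0 and to 16351 >> 6 = 255 for
+// Y = 255, so full range black and white pass through unchanged.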
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
+// C code that mimics the assembly.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+ uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+ YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 2;
+ src_v += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ }
+}
+#else
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+#endif
+
+// Also used for 420
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void J422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvJPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvJPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvJPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422ToRGB24Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ }
+}
+
+void I422ToRAWRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ }
+}
+
+void I422ToARGB4444Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+ b0 = b0 >> 4;
+ g0 = g0 >> 4;
+ r0 = r0 >> 4;
+ b1 = b1 >> 4;
+ g1 = g1 >> 4;
+ r1 = r1 >> 4;
+ *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+ (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb4444 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ b0 = b0 >> 4;
+ g0 = g0 >> 4;
+ r0 = r0 >> 4;
+ *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+ 0xf000;
+ }
+}
+
+void I422ToARGB1555Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 3;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 3;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb1555 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 3;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+ 0x8000;
+ }
+}
+
+void I422ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void I411ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 3; x += 4) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ YuvPixel(src_y[2], src_u[0], src_v[0],
+ rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
+ rgb_buf[11] = 255;
+ YuvPixel(src_y[3], src_u[0], src_v[0],
+ rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
+ rgb_buf[15] = 255;
+ src_y += 4;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 16; // Advance 4 pixels.
+ }
+ if (width & 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void NV12ToARGBRow_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_uv[0], src_uv[1],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_uv += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void NV21ToARGBRow_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+
+ YuvPixel(src_y[1], src_vu[1], src_vu[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+
+ src_y += 2;
+ src_vu += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void NV12ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ src_uv += 2;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void NV21ToRGB565Row_C(const uint8* src_y,
+ const uint8* vsrc_u,
+ uint8* dst_rgb565,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ vsrc_u += 2;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_yuy2 += 4;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_uyvy += 4;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422ToBGRARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+ rgb_buf[0] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
+ rgb_buf[4] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+ rgb_buf[0] = 255;
+ }
+}
+
+void I422ToABGRRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422ToRGBARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
+ rgb_buf[0] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
+ rgb_buf[4] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
+ rgb_buf[0] = 255;
+ }
+}
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+ int x;
+ src += width - 1;
+ for (x = 0; x < width - 1; x += 2) {
+ dst[x] = src[0];
+ dst[x + 1] = src[-1];
+ src -= 2;
+ }
+ if (width & 1) {
+ dst[width - 1] = src[0];
+ }
+}
+
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ src_uv += (width - 1) << 1;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[-2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[-2 + 1];
+ src_uv -= 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+ int x;
+ const uint32* src32 = (const uint32*)(src);
+ uint32* dst32 = (uint32*)(dst);
+ src32 += width - 1;
+ for (x = 0; x < width - 1; x += 2) {
+ dst32[x] = src32[0];
+ dst32[x + 1] = src32[-1];
+ src32 -= 2;
+ }
+ if (width & 1) {
+ dst32[width - 1] = src32[0];
+ }
+}
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[3];
+ src_uv += 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = src_u[x];
+ dst_uv[1] = src_v[x];
+ dst_uv[2] = src_u[x + 1];
+ dst_uv[3] = src_v[x + 1];
+ dst_uv += 4;
+ }
+ if (width & 1) {
+ dst_uv[0] = src_u[width - 1];
+ dst_uv[1] = src_v[width - 1];
+ }
+}
+
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+ memcpy(dst, src, count);
+}
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+ memcpy(dst, src, count * 2);
+}
+
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+ memset(dst, v8, width);
+}
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+ uint32* d = (uint32*)(dst_argb);
+ int x;
+ for (x = 0; x < width; ++x) {
+ d[x] = v32;
+ }
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values, filtering 2 rows of YUY2.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+ dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_u[0] = src_yuy2[1];
+ dst_v[0] = src_yuy2[3];
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_yuy2[0];
+ dst_y[x + 1] = src_yuy2[2];
+ src_yuy2 += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_yuy2[0];
+ }
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values, filtering 2 rows of UYVY.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+ dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_u[0] = src_uyvy[0];
+ dst_v[0] = src_uyvy[2];
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_uyvy[1];
+ dst_y[x + 1] = src_uyvy[3];
+ src_uyvy += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_uyvy[1];
+ }
+}
+
+#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+
+ fb = src_argb0[4 + 0];
+ fg = src_argb0[4 + 1];
+ fr = src_argb0[4 + 2];
+ a = src_argb0[4 + 3];
+ bb = src_argb1[4 + 0];
+ bg = src_argb1[4 + 1];
+ br = src_argb1[4 + 2];
+ dst_argb[4 + 0] = BLEND(fb, bb, a);
+ dst_argb[4 + 1] = BLEND(fg, bg, a);
+ dst_argb[4 + 2] = BLEND(fr, br, a);
+ dst_argb[4 + 3] = 255u;
+ src_argb0 += 8;
+ src_argb1 += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+ }
+}
+#undef BLEND
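+
+// BLEND is the premultiplied-alpha "over" operator: the foreground pixel is
+// expected to be attenuated by its alpha first, matching the SSSE3 version it
+// mirrors.  Illustrative values: a = 255 gives ((1 * b) >> 8) + f = f
+// (foreground wins), and a = 0 with a premultiplied foreground of 0 gives b
+// (background wins).
+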
+#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ b = src_argb[4];
+ g = src_argb[5];
+ r = src_argb[6];
+ a = src_argb[7];
+ dst_argb[4] = ATTENUATE(b, a);
+ dst_argb[5] = ATTENUATE(g, a);
+ dst_argb[6] = ATTENUATE(r, a);
+ dst_argb[7] = a;
+ src_argb += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ const uint32 b = src_argb[0];
+ const uint32 g = src_argb[1];
+ const uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ }
+}
+#undef ATTENUATE
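+
+// Worked example of ATTENUATE (illustrative): f = 255, a = 128 gives
+// (0x8080 * 0xffff) >> 24 = 128 and f = 128, a = 128 gives
+// (0x8080 * 0x8080) >> 24 = 64.  Repeating each byte approximates
+// f * a / 255 (so 255 * 255 stays 255) rather than (f * a) >> 8, which
+// would round 255 down to 254.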
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// The reciprocal method is off by 1 on some values, e.g. 125.
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) 0x01000000 + (0x10000 / a)
+const uint32 fixed_invtbl8[256] = {
+ 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+ T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+ T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+ T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+ T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+ T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+ T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+ T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+ T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+ T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+ T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+ T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+ T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+ T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+ T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+ T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+ T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+ T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+ T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+ T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+ T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+ T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+ T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+ T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+ T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+ T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+ T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+ T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+ T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+ T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+ T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+ T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+#undef T
+
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
+ b = (b * ia) >> 8;
+ g = (g * ia) >> 8;
+ r = (r * ia) >> 8;
+ // Clamping should not be necessary but is free in assembly.
+ dst_argb[0] = clamp255(b);
+ dst_argb[1] = clamp255(g);
+ dst_argb[2] = clamp255(r);
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
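+
+// Worked example (illustrative): for a = 128, fixed_invtbl8[128] & 0xffff is
+// 0x10000 / 128 = 512, so a premultiplied b of 64 is restored as
+// (64 * 512) >> 8 = 128.  The table effectively multiplies by 256 / a in 8.8
+// fixed point, which is within 1 of the exact b * 255 / a.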
+
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ int32 row_sum[4] = {0, 0, 0, 0};
+ int x;
+ for (x = 0; x < width; ++x) {
+ row_sum[0] += row[x * 4 + 0];
+ row_sum[1] += row[x * 4 + 1];
+ row_sum[2] += row[x * 4 + 2];
+ row_sum[3] += row[x * 4 + 3];
+ cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+ cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+ cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+ cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
+ }
+}
+
+void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
+ int w, int area, uint8* dst, int count) {
+ float ooa = 1.0f / area;
+ int i;
+ for (i = 0; i < count; ++i) {
+ dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+ dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+ dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+ dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ dst += 4;
+ tl += 4;
+ bl += 4;
+ }
+}
+
+// Copy pixels from rotated source to destination row with a slope.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ int i;
+ // Render a row of pixels from source into a buffer.
+ float uv[2];
+ uv[0] = uv_dudv[0];
+ uv[1] = uv_dudv[1];
+ for (i = 0; i < width; ++i) {
+ int x = (int)(uv[0]);
+ int y = (int)(uv[1]);
+ *(uint32*)(dst_argb) =
+ *(const uint32*)(src_argb + y * src_argb_stride +
+ x * 4);
+ dst_argb += 4;
+ uv[0] += uv_dudv[2];
+ uv[1] += uv_dudv[3];
+ }
+}
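+
+// Usage sketch (illustrative): uv_dudv holds {u0, v0, du, dv} -- the source
+// coordinate of the first destination pixel and the per-pixel step.  For
+// example {0.f, 0.f, 0.f, 1.f} walks down a source column, writing it out as
+// a (transposed) destination row.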
+
+// Blend 2 rows into 1.
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ int x;
+ for (x = 0; x < pix; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
+
+static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+ uint16* dst_uv, int pix) {
+ int x;
+ for (x = 0; x < pix; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
+
+// C version: blend 2 source rows into 1 destination row (2x2 -> 2x1).
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ int x;
+ if (source_y_fraction == 0) {
+ memcpy(dst_ptr, src_ptr, width);
+ return;
+ }
+ if (source_y_fraction == 128) {
+ HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
+ return;
+ }
+ for (x = 0; x < width - 1; x += 2) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ src_ptr += 2;
+ src_ptr1 += 2;
+ dst_ptr += 2;
+ }
+ if (width & 1) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ }
+}
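+
+// Illustrative: source_y_fraction is the weight of the second row, so 0
+// copies the first row, 128 averages the two rows (via HalfRow_C), and 64
+// computes (src * 192 + src1 * 64) >> 8, a 75/25 blend.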
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ int width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint16* src_ptr1 = src_ptr + src_stride;
+ int x;
+ if (source_y_fraction == 0) {
+ memcpy(dst_ptr, src_ptr, width * 2);
+ return;
+ }
+ if (source_y_fraction == 128) {
+ HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
+ return;
+ }
+ for (x = 0; x < width - 1; x += 2) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ src_ptr += 2;
+ src_ptr1 += 2;
+ dst_ptr += 2;
+ }
+ if (width & 1) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ }
+}
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ int index0 = shuffler[0];
+ int index1 = shuffler[1];
+ int index2 = shuffler[2];
+ int index3 = shuffler[3];
+ // Shuffle a row of ARGB.
+ int x;
+ for (x = 0; x < pix; ++x) {
+ // To support in-place conversion.
+ uint8 b = src_argb[index0];
+ uint8 g = src_argb[index1];
+ uint8 r = src_argb[index2];
+ uint8 a = src_argb[index3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+void I422ToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[1];
+ dst_frame[3] = src_v[0];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = 0;
+ dst_frame[3] = src_v[0];
+ }
+}
+
+void I422ToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[1];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = 0;
+ }
+}
+
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
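+
+// With MAXTWIDTH at 2048, each wrapper's SIMD_ALIGNED row buffer below is
+// 2048 * 4 = 8192 bytes of stack; wider images are simply processed in
+// 2048-pixel chunks by the while loops.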
+
+#if !(defined(_MSC_VER) && !defined(__clang__)) && \
+ defined(HAS_I422TORGB565ROW_SSSE3)
+// row_win.cc has an asm version, but GCC builds use this 2-step wrapper.
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
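+
+// Illustrative note: the two-step wrappers in this block each process at most
+// MAXTWIDTH pixels per pass, converting into an on-stack intermediate buffer
+// (MAXTWIDTH * 4 = 8192 bytes in the ARGB case) and then running the second
+// row function on that buffer, so rows of any width are handled without heap
+// allocation.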
+
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
+ YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+ src_yuy2 += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
+ UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+ src_uyvy += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif // HAS_UYVYTOARGBROW_SSSE3
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TORGB24ROW_AVX2)
+void I422ToRGB24Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ // TODO(fbarchard): ARGBToRGB24Row_AVX2
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TORAWROW_AVX2)
+void I422ToRAWRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ // TODO(fbarchard): ARGBToRAWRow_AVX2
+ ARGBToRAWRow_SSSE3(row, dst_raw, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_raw += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
+ YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
+ I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+ src_yuy2 += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
+ UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
+ I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+ src_uyvy += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif // HAS_UYVYTOARGBROW_AVX2
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ float b = (float)(src_argb[0]);
+ float g = (float)(src_argb[1]);
+ float r = (float)(src_argb[2]);
+ float a = (float)(src_argb[3]);
+ float b2 = b * b;
+ float g2 = g * g;
+ float r2 = r * r;
+ float a2 = a * a;
+ float db = poly[0] + poly[4] * b;
+ float dg = poly[1] + poly[5] * g;
+ float dr = poly[2] + poly[6] * r;
+ float da = poly[3] + poly[7] * a;
+ float b3 = b2 * b;
+ float g3 = g2 * g;
+ float r3 = r2 * r;
+ float a3 = a2 * a;
+ db += poly[8] * b2;
+ dg += poly[9] * g2;
+ dr += poly[10] * r2;
+ da += poly[11] * a2;
+ db += poly[12] * b3;
+ dg += poly[13] * g3;
+ dr += poly[14] * r3;
+ da += poly[15] * a3;
+
+ dst_argb[0] = Clamp((int32)(db));
+ dst_argb[1] = Clamp((int32)(dg));
+ dst_argb[2] = Clamp((int32)(dr));
+ dst_argb[3] = Clamp((int32)(da));
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
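+
+// Illustrative note: poly[] above holds 16 floats, grouped as four sets of
+// per-channel coefficients (constant, linear, squared and cubed terms for
+// B, G, R, A), so channel c of each pixel evaluates to
+//   Clamp(poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
+//         poly[c + 12] * v * v * v).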
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uint32 bc = lumacoeff & 0xff;
+ uint32 gc = (lumacoeff >> 8) & 0xff;
+ uint32 rc = (lumacoeff >> 16) & 0xff;
+
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+ src_argb[2] * rc) & 0x7F00u) + luma;
+ const uint8* luma1;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
+ src_argb[6] * rc) & 0x7F00u) + luma;
+ dst_argb[4] = luma1[src_argb[4]];
+ dst_argb[5] = luma1[src_argb[5]];
+ dst_argb[6] = luma1[src_argb[6]];
+ dst_argb[7] = src_argb[7];
+ src_argb += 8;
+ dst_argb += 8;
+ }
+ if (width & 1) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+ src_argb[2] * rc) & 0x7F00u) + luma;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ }
+}
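+
+// Illustrative note: lumacoeff packs the B, G and R weights into its low three
+// bytes.  The weighted sum masked with 0x7F00 selects one of 128 rows of 256
+// bytes in the 32 KiB luma table, and each color channel is remapped through
+// that row while alpha is copied unchanged.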
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst[3] = src[3];
+ dst[7] = src[7];
+ dst += 8;
+ src += 8;
+ }
+ if (width & 1) {
+ dst[3] = src[3];
+ }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ int i;
+ for (i = 0; i < width - 1; i += 2) {
+ dst[3] = src[0];
+ dst[7] = src[1];
+ dst += 8;
+ src += 2;
+ }
+ if (width & 1) {
+ dst[3] = src[0];
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_gcc.cc b/third_party/aom/third_party/libyuv/source/row_gcc.cc
new file mode 100644
index 000000000..820de0a1c
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_gcc.cc
@@ -0,0 +1,5475 @@
+// VERSION 2
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPEG full range.
+static vec8 kARGBToYJ = {
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static vec8 kARGBToU = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static vec8 kARGBToUJ = {
+ 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static vec8 kARGBToVJ = {
+ -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// Constants for BGRA
+static vec8 kBGRAToY = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static vec8 kBGRAToU = {
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static vec8 kBGRAToV = {
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+static vec8 kABGRToY = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static vec8 kABGRToU = {
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static vec8 kABGRToV = {
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+static vec8 kRGBAToY = {
+ 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static vec8 kRGBAToU = {
+ 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static vec8 kRGBAToV = {
+ 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+// 7 bit fixed point 0.5.
+static vec16 kAddYJ64 = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static uvec8 kAddUV128 = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static uvec16 kAddUVJ128 = {
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB.
+static uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+static uvec8 kShuffleMaskRAWToARGB = {
+ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGBToRAW for I422ToRAW. First 8 + next 4
+static uvec8 kShuffleMaskARGBToRAW_0 = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
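+
+// Illustrative note: an index of 128u in the masks above has its high bit set,
+// which makes pshufb write a zero byte at that position instead of copying a
+// source byte, so the unused output lanes are cleared.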
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#if defined(TESTING) && defined(__x86_64__)
+void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 5 \n"
+ "mov %%eax,%%eax \n"
+ "mov %%ebx,%%ebx \n"
+ "mov %%ecx,%%ecx \n"
+ "mov %%edx,%%edx \n"
+ "mov %%esi,%%esi \n"
+ "mov %%edi,%%edi \n"
+ "mov %%ebp,%%ebp \n"
+ "mov %%esp,%%esp \n"
+ ".p2align 5 \n"
+ "mov %%r8d,%%r8d \n"
+ "mov %%r9d,%%r9d \n"
+ "mov %%r10d,%%r10d \n"
+ "mov %%r11d,%%r11d \n"
+ "mov %%r12d,%%r12d \n"
+ "mov %%r13d,%%r13d \n"
+ "mov %%r14d,%%r14d \n"
+ "mov %%r15d,%%r15d \n"
+ ".p2align 5 \n"
+ "lea (%%rax),%%eax \n"
+ "lea (%%rbx),%%ebx \n"
+ "lea (%%rcx),%%ecx \n"
+ "lea (%%rdx),%%edx \n"
+ "lea (%%rsi),%%esi \n"
+ "lea (%%rdi),%%edi \n"
+ "lea (%%rbp),%%ebp \n"
+ "lea (%%rsp),%%esp \n"
+ ".p2align 5 \n"
+ "lea (%%r8),%%r8d \n"
+ "lea (%%r9),%%r9d \n"
+ "lea (%%r10),%%r10d \n"
+ "lea (%%r11),%%r11d \n"
+ "lea (%%r12),%%r12d \n"
+ "lea (%%r13),%%r13d \n"
+ "lea (%%r14),%%r14d \n"
+ "lea (%%r15),%%r15d \n"
+
+ ".p2align 5 \n"
+ "lea 0x10(%%rax),%%eax \n"
+ "lea 0x10(%%rbx),%%ebx \n"
+ "lea 0x10(%%rcx),%%ecx \n"
+ "lea 0x10(%%rdx),%%edx \n"
+ "lea 0x10(%%rsi),%%esi \n"
+ "lea 0x10(%%rdi),%%edi \n"
+ "lea 0x10(%%rbp),%%ebp \n"
+ "lea 0x10(%%rsp),%%esp \n"
+ ".p2align 5 \n"
+ "lea 0x10(%%r8),%%r8d \n"
+ "lea 0x10(%%r9),%%r9d \n"
+ "lea 0x10(%%r10),%%r10d \n"
+ "lea 0x10(%%r11),%%r11d \n"
+ "lea 0x10(%%r12),%%r12d \n"
+ "lea 0x10(%%r13),%%r13d \n"
+ "lea 0x10(%%r14),%%r14d \n"
+ "lea 0x10(%%r15),%%r15d \n"
+
+ ".p2align 5 \n"
+ "add 0x10,%%eax \n"
+ "add 0x10,%%ebx \n"
+ "add 0x10,%%ecx \n"
+ "add 0x10,%%edx \n"
+ "add 0x10,%%esi \n"
+ "add 0x10,%%edi \n"
+ "add 0x10,%%ebp \n"
+ "add 0x10,%%esp \n"
+ ".p2align 5 \n"
+ "add 0x10,%%r8d \n"
+ "add 0x10,%%r9d \n"
+ "add 0x10,%%r10d \n"
+ "add 0x10,%%r11d \n"
+ "add 0x10,%%r12d \n"
+ "add 0x10,%%r13d \n"
+ "add 0x10,%%r14d \n"
+ "add 0x10,%%r15d \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // TESTING
+
+#ifdef HAS_J400TOARGBROW_SSE2
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_J400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :: "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but with different coefficients: no +16 bias is added
+// and the result is rounded.
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd table to restore pixel order after the vphaddw + vpackuswb lane
+// mutation.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+ VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kAddUVJ128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUVJROW_SSSE3
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6"
+ );
+}
+#endif // HAS_ARGBTOUV444ROW_SSSE3
+
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBTOUV422ROW_SSSE3
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+ );
+}
+
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+struct YuvConstants {
+ lvec8 kUVToB; // 0
+ lvec8 kUVToG; // 32
+ lvec8 kUVToR; // 64
+ lvec16 kUVBiasB; // 96
+ lvec16 kUVBiasG; // 128
+ lvec16 kUVBiasR; // 160
+ lvec16 kYToRgb; // 192
+};
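+
+// Illustrative note: the numbers in the comments above are each member's byte
+// offset (lvec8 and lvec16 are both 32 bytes), letting the assembly below
+// address the struct with fixed displacements 0, 32, 64, 96, 128, 160 and 192
+// rather than named fields.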
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
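+
+// Worked example (illustration only): the YUVTORGB code below widens Y to
+// Y * 0x0101 and takes the high 16 bits of the product with YG, so with
+// U = V = 128 the chroma terms cancel against the bias and only YGB = -1160
+// remains.  Y = 16 then gives (1191 - 1160) >> 6 = 0 (black) and Y = 235
+// gives (17506 - 1160) >> 6 = 255 (white), matching (Y - 16) * 1.164.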
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+ { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+ { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+ { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+ { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+ { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+ { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+ { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+// Read 8 UV from 444
+#define READYUV444 \
+ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n"
+
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411 \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpckldq %%xmm0,%%xmm0 \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+ "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n"
+
+// Convert 8 pixels: 8 UV and 8 Y
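+// In scalar terms, for each pixel this computes (with signed-saturating adds
+// and an unsigned-saturating pack providing the clamp to 0..255):
+//   B = (BB - U * UB + Y1) >> 6
+//   G = (BG - (U * UG + V * VG) + Y1) >> 6
+//   R = (BR - V * VR + Y1) >> 6
+// where Y1 = (Y * 0x0101 * YG) >> 16.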
+#define YUVTORGB(YuvConstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \
+ "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \
+ "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \
+ "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
+ "punpcklbw %%xmm3,%%xmm3 \n" \
+ "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+
+// Store 8 ARGB values. Assumes XMM5 holds 0xff bytes (opaque alpha).
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
+ "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
+
+// Store 8 BGRA values. Sets XMM5 to 0xff bytes itself for the alpha channel.
+#define STOREBGRA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm0,%%xmm1 \n" \
+ "punpcklbw %%xmm2,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
+ "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"
+
+// Store 8 ABGR values. Assumes XMM5 holds 0xff bytes (opaque alpha).
+#define STOREABGR \
+ "punpcklbw %%xmm1,%%xmm2 \n" \
+ "punpcklbw %%xmm5,%%xmm0 \n" \
+ "movdqa %%xmm2,%%xmm1 \n" \
+ "punpcklwd %%xmm0,%%xmm2 \n" \
+ "punpckhwd %%xmm0,%%xmm1 \n" \
+ "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
+ "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"
+
+// Store 8 RGBA values. Sets XMM5 to 0xff bytes itself for the alpha channel.
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
+ "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
+
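+// Note on naming: a libyuv format name such as ARGB describes the bytes of a
+// little-endian uint32, so STOREARGB above writes B,G,R,A to memory per
+// pixel, STOREABGR writes R,G,B,A, STOREBGRA writes A,R,G,B and STORERGBA
+// writes A,B,G,R. XMM5 supplies the 0xff alpha byte in each case.
+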
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV444
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
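+// Hypothetical caller sketch (illustration only; the name
+// I444ToARGBPlane_Example is not part of libyuv, whose planar wrappers live
+// elsewhere): drives the row kernel above across a whole I444 image.
+// Assumes width is a multiple of 8, since the kernel consumes 8 pixels per
+// loop iteration.
+static void I444ToARGBPlane_Example(const uint8* src_y, int src_stride_y,
+                                    const uint8* src_u, int src_stride_u,
+                                    const uint8* src_v, int src_stride_v,
+                                    uint8* dst_argb, int dst_stride_argb,
+                                    int width, int height) {
+  int y;
+  for (y = 0; y < height; ++y) {
+    I444ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, width);
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_argb += dst_stride_argb;
+  }
+}
+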
+// TODO(fbarchard): Consider putting masks into constants.
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ int width) {
+ asm volatile (
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
+ "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
+ "sub %[u_buf],%[v_buf] \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
+ "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
+ "subl $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_raw]"+r"(dst_raw), // %[dst_raw]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
+ [width]"+rm"(width) // %[width]
+#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+ [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+ [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV411
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READNV12
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ // Does not use r14.
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READNV12
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
+ // Does not use r14.
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREBGRA
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREABGR
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STORERGBA
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) \
+ "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
+ "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
+ "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+#if defined(HAS_I422TOBGRAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into BGRA
+ "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
+
+ "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
+ "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_I422TOBGRAROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ARGB
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
+
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#if defined(HAS_J422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ARGB
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
+
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_J422TOARGBROW_AVX2
+
+#if defined(HAS_I422TOABGRROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ABGR
+ "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_I422TOABGRROW_AVX2
+
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into RGBA
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
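+    // For example y = 235 (video white): (235 - 16) * 1.164 = 254.9, i.e.
+    // it maps to full-range white.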
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "psrlw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// Note: the 256-bit vpunpcklbw interleaves within each 128-bit lane
+// ("mutates" the pixel order) and the matching vpackuswb restores
+// ("unmutates") it.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+    "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+ "vmovd %%eax,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+    "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
+ "movdqa %%xmm0,%%xmm1 \n"
+ "psllw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufd $0x4e,%%xmm0,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1)",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "movdqa %4,%%xmm1 \n"
+ "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorUV) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing ARGB pixels (one dword each), used with vpermd.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "vmovdqu %3,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
+ "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
+ "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
+ "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_COPYROW_AVX
+
+#ifdef HAS_COPYROW_ERMS
+// Handles any width (multiple of 1 byte).
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep movsb " MEMMOVESTRING(0,1) " \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc"
+ );
+}
+#endif // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
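+// Copies the alpha channel of each pixel from src into dst, leaving dst's
+// color channels untouched.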
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
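+// Writes one Y byte from src into the alpha channel of each ARGB pixel in
+// dst, leaving dst's color channels untouched.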
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
+ "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width >> 2);
+ const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
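+  // For example v8 = 0x2a gives v32 = 0x2a2a2a2a, which "rep stosl" below
+  // stores width / 4 times.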
+ asm volatile (
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep stosb " MEMSTORESTRING(al,0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+#endif // HAS_SETROW_X86
+
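+// Packed 4:2:2 conversions below: YUY2 stores Y0,U0,Y1,V0 per pixel pair and
+// UYVY stores U0,Y0,V0,Y1. The *ToYRow kernels extract Y by masking or
+// shifting each 16-bit element; the *ToUVRow kernels average chroma with the
+// next row (the stride argument), while the *ToUV422Row kernels take chroma
+// from a single row.
+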
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time; a 1 pixel loop handles the remainder.
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "41: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 4 pixels at a time; a 1 pixel loop handles the remainder.
+// Same as the SSE2 version, but replaces
+//    psrlw      xmm3, 8          // alpha
+//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
+//    pshuflw    xmm3, xmm3,0F5h
+// with
+//    pshufb     xmm3, kShuffleAlpha // alpha
+
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
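+// Scalar model of the blend rows above (illustration only; the name
+// BlendChannel_Example is not part of libyuv). Per color channel the SIMD
+// computes, with unsigned saturation:
+//   dst = src0 + (((256 - src0_alpha) * src1) >> 8)
+// and forces the destination alpha to 255.
+static __inline uint8 BlendChannel_Example(uint8 fg, uint8 bg, uint8 fg_a) {
+  uint32 v = (uint32)fg + ((((uint32)256 - fg_a) * bg) >> 8);
+  return (uint8)(v > 255 ? 255 : v);
+}
+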
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x8,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pshufhw $0xff,%%xmm1,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha
+static uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
+};
+static uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
+};
+// Attenuate 4 pixels at a time.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha_AVX2) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
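+// Scalar model of the attenuate rows above (illustration only; the name
+// AttenuateChannel_Example is not part of libyuv): each color channel is
+// premultiplied by alpha. The SIMD paths compute (c * 257 * a * 257) >> 24,
+// which is approximately c * a / 255; the alpha byte itself passes through
+// unchanged. The unattenuate rows below invert this using fixed_invtbl8, a
+// per-alpha fixed-point reciprocal table defined elsewhere in this library.
+static __inline uint8 AttenuateChannel_Example(uint8 c, uint8 a) {
+  return (uint8)(((uint32)c * 257 * (uint32)a * 257) >> 24);
+}
+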
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ uintptr_t alpha = 0;
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ uintptr_t alpha = 0;
+ asm volatile (
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "movzb " MEMACCESS2(0x13,0) ",%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x17,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
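+
+// For reference: a scalar sketch of the unattenuate rows above. Each color
+// channel is scaled back up by 255/alpha with saturation; the assembly uses
+// the fixed_invtbl8 reciprocal table instead of a divide, so results are only
+// approximately equal. The _SketchC name is illustrative only.
+static void ARGBUnattenuateRow_SketchC(const uint8* src_argb, uint8* dst_argb,
+                                       int width) {
+  for (int i = 0; i < width; ++i) {
+    uint32 b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
+    if (a) {
+      b = b * 255 / a;
+      g = g * 255 / a;
+      r = r * 255 / a;
+      if (b > 255) b = 255;
+      if (g > 255) g = 255;
+      if (r > 255) r = 255;
+    }
+    dst_argb[0] = (uint8)b;
+    dst_argb[1] = (uint8)g;
+    dst_argb[2] = (uint8)r;
+    dst_argb[3] = (uint8)a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}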
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+static vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
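+
+// For reference: a scalar sketch of the in-place sepia row, applying the
+// fixed-point weights listed above and saturating like packuswb. Alpha is
+// left unchanged. The _SketchC name is illustrative only.
+static void ARGBSepiaRow_SketchC(uint8* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
+    int sb = (b * 17 + g * 68 + r * 35) >> 7;
+    int sg = (b * 22 + g * 88 + r * 45) >> 7;
+    int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    dst_argb[0] = (uint8)(sb > 255 ? 255 : sb);
+    dst_argb[1] = (uint8)(sg > 255 ? 255 : sg);
+    dst_argb[2] = (uint8)(sr > 255 ? 255 : sr);
+    dst_argb += 4;  // alpha byte untouched
+  }
+}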
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
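+
+// For reference: a scalar sketch of the color-matrix row. Each output channel
+// is the dot product of the source B,G,R,A bytes with one 4-entry row of the
+// signed matrix, shifted right by 6 and saturated to a byte, mirroring the
+// psraw $0x6 / packuswb sequence above. The _SketchC name is illustrative.
+static void ARGBColorMatrixRow_SketchC(const uint8* src_argb, uint8* dst_argb,
+                                       const int8* matrix_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    for (int c = 0; c < 4; ++c) {
+      const int8* m = matrix_argb + c * 4;  // coefficients for channel c
+      int v = (src_argb[0] * m[0] + src_argb[1] * m[1] +
+               src_argb[2] * m[2] + src_argb[3] * m[3]) >> 6;
+      dst_argb[c] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}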
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
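+
+// For reference: a scalar sketch of the in-place quantize row. Each color
+// channel is scaled with a 16.16 multiply (like pmulhuw), snapped to an
+// interval and offset, while alpha is preserved exactly as the xmm6 mask
+// does above. The _SketchC name is illustrative only.
+static void ARGBQuantizeRow_SketchC(uint8* dst_argb, int scale,
+                                    int interval_size, int interval_offset,
+                                    int width) {
+  for (int i = 0; i < width; ++i) {
+    for (int c = 0; c < 3; ++c) {  // B, G, R; alpha (byte 3) untouched
+      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
+      dst_argb[c] = (uint8)(v > 255 ? 255 : v);
+    }
+    dst_argb += 4;
+  }
+}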
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__AVX2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
+ MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELXROW_SSE2
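+
+// For reference: a scalar sketch of the SobelX row above. For each output
+// byte it takes the horizontal differences (two pixels apart) of three
+// source rows, weights the middle row by 2, and stores the saturated
+// absolute value. The _SketchC name is illustrative only.
+static void SobelXRow_SketchC(const uint8* src_y0, const uint8* src_y1,
+                              const uint8* src_y2, uint8* dst_sobelx,
+                              int width) {
+  for (int i = 0; i < width; ++i) {
+    int a = src_y0[i] - src_y0[i + 2];
+    int b = src_y1[i] - src_y1[i + 2];
+    int c = src_y2[i] - src_y2[i + 2];
+    int sobel = a + b * 2 + c;
+    if (sobel < 0) sobel = -sobel;
+    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
+  }
+}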
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
+ MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
+ MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELYROW_SSE2
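+
+// For reference: the vertical counterpart of the SobelX sketch above, a
+// scalar sketch of the SobelY row: differences between the two rows at
+// offsets 0, 1 and 2 with the middle tap weighted by 2, absolute value
+// saturated to a byte. The _SketchC name is illustrative only.
+static void SobelYRow_SketchC(const uint8* src_y0, const uint8* src_y1,
+                              uint8* dst_sobely, int width) {
+  for (int i = 0; i < width; ++i) {
+    int a = src_y0[i + 0] - src_y1[i + 0];
+    int b = src_y0[i + 1] - src_y1[i + 1];
+    int c = src_y0[i + 2] - src_y1[i + 2];
+    int sobel = a + b * 2 + c;
+    if (sobel < 0) sobel = -sobel;
+    dst_sobely[i] = (uint8)(sobel > 255 ? 255 : sobel);
+  }
+}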
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+  // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELROW_SSE2
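+
+// For reference: a scalar sketch of the Sobel row above. X and Y magnitudes
+// are added with saturation and the result is replicated into B, G and R
+// with alpha forced to 255, matching the por of the alpha mask in xmm5.
+// The _SketchC name is illustrative only.
+static void SobelRow_SketchC(const uint8* src_sobelx, const uint8* src_sobely,
+                             uint8* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    int s = src_sobelx[i] + src_sobely[i];
+    if (s > 255) s = 255;
+    dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8)s;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+  }
+}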
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+  // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+  "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+  // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6," MEMACCESS(2) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+  // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+  // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "movd " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
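+
+// For reference: a scalar sketch of the cumulative-sum row above. Each of the
+// four channels keeps a running sum across the row, and the row above
+// (previous_cumsum) is added so the output becomes a 2D integral image.
+// The _SketchC name is illustrative only (int32 is the libyuv typedef).
+static void ComputeCumulativeSumRow_SketchC(const uint8* row, int32* cumsum,
+                                            const int32* previous_cumsum,
+                                            int width) {
+  int32 sum[4] = {0, 0, 0, 0};
+  for (int x = 0; x < width; ++x) {
+    for (int c = 0; c < 4; ++c) {
+      sum[c] += row[x * 4 + c];
+      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
+    }
+  }
+}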
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
+ asm volatile (
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
+
+  // 4 pixel small loop
+ LABELALIGN
+ "4: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
+
+  // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+  // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"((intptr_t)(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
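+
+// For reference: a scalar sketch of the box-filter step above. The sum over a
+// rectangle is recovered from four corners of the integral image and divided
+// by the box area; "width" is an element offset into the int32 rows, exactly
+// as the (%reg,%4,4) addressing uses it. Rounding differs slightly from the
+// SSE2 paths. The _SketchC name is illustrative only.
+static void CumulativeSumToAverageRow_SketchC(const int32* topleft,
+                                              const int32* botleft, int width,
+                                              int area, uint8* dst, int count) {
+  float ooa = 1.0f / area;  // reciprocal of the box area, like rcpss above
+  for (int i = 0; i < count; ++i) {
+    for (int c = 0; c < 4; ++c) {
+      int32 sum = topleft[c] - topleft[width + c] -
+                  botleft[c] + botleft[width + c];
+      dst[c] = (uint8)(sum * ooa);
+    }
+    topleft += 4;
+    botleft += 4;
+    dst += 4;
+  }
+}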
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* src_dudv, int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp = 0;
+ asm volatile (
+ "movq " MEMACCESS(3) ",%%xmm2 \n"
+ "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+  // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1," MEMACCESS(2) " \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+  // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x04,2) ",%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "+r"(temp) // %5
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2)
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0)
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSSE3
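+
+// For reference: a scalar sketch of the row interpolators in this section
+// (SSSE3 above, AVX2 and SSE2 below): a weighted blend of the current row and
+// the row at src_stride, with source_y_fraction in 0..256 (0 keeps the top
+// row). The assembly special-cases 0%, 25%, 50%, 75% and 100% blends and uses
+// 7-bit weights, so this is not bit-exact. The _SketchC name is illustrative.
+static void InterpolateRow_SketchC(uint8* dst_ptr, const uint8* src_ptr,
+                                   ptrdiff_t src_stride, int dst_width,
+                                   int source_y_fraction) {
+  int f1 = source_y_fraction;  // weight of the lower row
+  int f0 = 256 - f1;           // weight of the upper row
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  for (int x = 0; x < dst_width; ++x) {
+    dst_ptr[x] = (uint8)((src_ptr[x] * f0 + src_ptr1[x] * f1 + 128) >> 8);
+  }
+}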
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermd %%ymm5,%%ymm0,%%ymm5 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "rep movsb " MEMMOVESTRING(1,0) " \n"
+ "jmp 999f \n"
+
+ "99: \n"
+ "vzeroupper \n"
+ "999: \n"
+ : "+D"(dst_ptr), // %0
+ "+S"(src_ptr), // %1
+ "+c"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"
+ "psubw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm2 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN
+ "25: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ LABELALIGN
+ "75: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSSE3
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+ "mov " MEMACCESS(4) ",%k2 \n"
+ "cmp $0x3000102,%k2 \n"
+ "je 3012f \n"
+ "cmp $0x10203,%k2 \n"
+ "je 123f \n"
+ "cmp $0x30201,%k2 \n"
+ "je 321f \n"
+ "cmp $0x2010003,%k2 \n"
+ "je 2103f \n"
+
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS(1) " \n"
+ "movzb " MEMACCESS2(0x1,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x1,1) " \n"
+ "movzb " MEMACCESS2(0x2,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x2,1) " \n"
+ "movzb " MEMACCESS2(0x3,4) ",%2 \n"
+ MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x3,1) " \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "123: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 123b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "321: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x39,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x39,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x39,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x39,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 321b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "2103: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x93,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x93,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x93,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x93,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 2103b \n"
+ "jmp 99f \n"
+
+ LABELALIGN
+ "3012: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
+ "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jg 3012b \n"
+
+ "99: \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+d"(pixel_temp), // %2
+ "+r"(pix) // %3
+ : "r"(shuffler) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(3) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+#endif // HAS_I422TOYUY2ROW_SSE2
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1," MEMACCESS(3) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+#endif // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "pxor %%xmm3,%%xmm3 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
+ "addps " MEMACCESS(3) ",%%xmm0 \n"
+ "addps " MEMACCESS(3) ",%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
+ "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
+ "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
+ "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "vmovq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
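+
+// For reference: a scalar sketch of the polynomial rows above. Each channel
+// value X is mapped to C0 + C1*X + C2*X^2 + C3*X^3, where poly stores four
+// floats per coefficient (one per channel), matching the vbroadcastf128
+// loads. Clamping only approximates the SIMD packing. The _SketchC name is
+// illustrative only.
+static void ARGBPolynomialRow_SketchC(const uint8* src_argb, uint8* dst_argb,
+                                      const float* poly, int width) {
+  for (int i = 0; i < width; ++i) {
+    for (int c = 0; c < 4; ++c) {
+      float x = (float)src_argb[c];
+      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
+                poly[c + 12] * x * x * x;
+      dst_argb[c] = (uint8)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}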
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
+ MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x1,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform ARGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uintptr_t pixel_temp = 0u;
+ uintptr_t table_temp = 0u;
+ asm volatile (
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(2) ",%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS(2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS(3) " \n"
+ "movzb " MEMACCESS2(0x1,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x1,3) " \n"
+ "movzb " MEMACCESS2(0x2,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x2,3) " \n"
+ "movzb " MEMACCESS2(0x3,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0x3,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x4,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x4,3) " \n"
+ "movzb " MEMACCESS2(0x5,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x5,3) " \n"
+ "movzb " MEMACCESS2(0x6,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x6,3) " \n"
+ "movzb " MEMACCESS2(0x7,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0x7,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x8,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x8,3) " \n"
+ "movzb " MEMACCESS2(0x9,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x9,3) " \n"
+ "movzb " MEMACCESS2(0xa,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xa,3) " \n"
+ "movzb " MEMACCESS2(0xb,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0xb,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb " MEMACCESS2(0xc,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xc,3) " \n"
+ "movzb " MEMACCESS2(0xd,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xd,3) " \n"
+ "movzb " MEMACCESS2(0xe,2) ",%0 \n"
+ MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xe,3) " \n"
+ "movzb " MEMACCESS2(0xf,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0xf,3) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "lea " MEMLEA(0x10,3) ",%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
+ : "+d"(pixel_temp), // %0
+ "+a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_mips.cc b/third_party/aom/third_party/libyuv/source/row_mips.cc
new file mode 100644
index 000000000..cfc9ffe03
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_mips.cc
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ ".set noat \n"
+ "slti $at, %[count], 8 \n"
+ "bne $at ,$zero, $last8 \n"
+ "xor $t8, %[src], %[dst] \n"
+ "andi $t8, $t8, 0x3 \n"
+
+ "bne $t8, $zero, unaligned \n"
+ "negu $a3, %[dst] \n"
+ // make dst/src aligned
+ "andi $a3, $a3, 0x3 \n"
+ "beq $a3, $zero, $chk16w \n"
+    // word-aligned now, count is the remaining bytes count
+ "subu %[count], %[count], $a3 \n"
+
+ "lwr $t8, 0(%[src]) \n"
+ "addu %[src], %[src], $a3 \n"
+ "swr $t8, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+
+ // Now the dst/src are mutually word-aligned with word-aligned addresses
+ "$chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, chk8w \n"
+ // There will be at most 1 32-byte chunk after it
+  "subu $a3, %[count], $t8 \n" // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n"
+ // t0 is the "past the end" address
+
+ // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
+ // the "t0-32" address
+ // This means: for x=128 the last "safe" a1 address is "t0-160"
+ // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+ // we will use "pref 30,128(a1)", so "t0-160" is the limit
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line of src
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $loop16w \n"
+ "nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lw $t0, 0(%[src]) \n"
+ "bgtz $v1, $skip_pref30_96 \n" // skip
+ "lw $t1, 4(%[src]) \n"
+ "pref 30, 96(%[dst]) \n" // continue
+ "$skip_pref30_96: \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lw $t0, 32(%[src]) \n"
+ "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+ "lw $t1, 36(%[src]) \n"
+ "pref 30, 128(%[dst]) \n" // set dest, addr 128
+ "$skip_pref30_128: \n"
+ "lw $t2, 40(%[src]) \n"
+ "lw $t3, 44(%[src]) \n"
+ "lw $t4, 48(%[src]) \n"
+ "lw $t5, 52(%[src]) \n"
+ "lw $t6, 56(%[src]) \n"
+ "lw $t7, 60(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+ "sgtu $v1, %[dst], $t9 \n"
+ "bne %[dst], $a3, $loop16w \n"
+ " addiu %[src], %[src], 64 \n" // adding 64 to src
+ "move %[count], $t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+    // t8 is the remainder count past the 32-byte chunk
+ "beq %[count], $t8, chk1w \n"
+    // count==t8, so there is no 32-byte chunk
+ " nop \n"
+
+ "lw $t0, 0(%[src]) \n"
+ "lw $t1, 4(%[src]) \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+    // now count is the remainder past the 1-word chunks
+ "beq %[count], $t8, $last8 \n"
+ " subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+ // copying in words (4-byte chunks)
+ "$wordCopy_loop: \n"
+ "lw $t3, 0(%[src]) \n"
+    // the first t3 may be equal to t0 ... optimize?
+ "addiu %[src], %[src],4 \n"
+ "addiu %[dst], %[dst],4 \n"
+ "bne %[dst], $a3,$wordCopy_loop \n"
+ " sw $t3, -4(%[dst]) \n"
+
+ // For the last (<8) bytes
+ "$last8: \n"
+ "blez %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 -last dst address
+ "$last8loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst], $a3, $last8loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "leave: \n"
+ " j $ra \n"
+ " nop \n"
+
+ //
+ // UNALIGNED case
+ //
+
+ "unaligned: \n"
+ // got here with a3="negu a1"
+ "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
+ "beqz $a3, $ua_chk16w \n"
+ " subu %[count], %[count], $a3 \n"
+ // bytes left after initial a3 bytes
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
+ "swr $v1, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+ // below the dst will be word aligned (NOTE1)
+ "$ua_chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, ua_chk8w \n"
+ // if a2==t8, no 64-byte chunks
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the reminder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line addr 32
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // safe, as we have at least 64 bytes ahead
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $ua_loop16w \n"
+ // skip "pref 30,64(a1)" for too short arrays
+ " nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$ua_loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "bgtz $v1, $ua_skip_pref30_96 \n"
+ " lwl $t1, 7(%[src]) \n"
+ "pref 30, 96(%[dst]) \n"
+ // continue setting up the dest, addr 96
+ "$ua_skip_pref30_96: \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lwr $t0, 32(%[src]) \n"
+ "lwl $t0, 35(%[src]) \n"
+ "lwr $t1, 36(%[src]) \n"
+ "bgtz $v1, ua_skip_pref30_128 \n"
+ " lwl $t1, 39(%[src]) \n"
+ "pref 30, 128(%[dst]) \n"
+ // continue setting up the dest, addr 128
+ "ua_skip_pref30_128: \n"
+
+ "lwr $t2, 40(%[src]) \n"
+ "lwl $t2, 43(%[src]) \n"
+ "lwr $t3, 44(%[src]) \n"
+ "lwl $t3, 47(%[src]) \n"
+ "lwr $t4, 48(%[src]) \n"
+ "lwl $t4, 51(%[src]) \n"
+ "lwr $t5, 52(%[src]) \n"
+ "lwl $t5, 55(%[src]) \n"
+ "lwr $t6, 56(%[src]) \n"
+ "lwl $t6, 59(%[src]) \n"
+ "lwr $t7, 60(%[src]) \n"
+ "lwl $t7, 63(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+ "sgtu $v1,%[dst],$t9 \n"
+ "bne %[dst],$a3,$ua_loop16w \n"
+ " addiu %[src],%[src],64 \n" // adding 64 to src
+ "move %[count],$t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "ua_chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+    // t8 is the remainder count
+ "beq %[count], $t8, $ua_chk1w \n"
+ // when count==t8, no 32-byte chunk
+
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "lwl $t1, 7(%[src]) \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "$ua_chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+    // now count is the remainder past the 1-word chunks
+ "beq %[count], $t8, ua_smallCopy \n"
+ "subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+
+ // copying in words (4-byte chunks)
+ "$ua_wordCopy_loop: \n"
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addiu %[src], %[src], 4 \n"
+ "addiu %[dst], %[dst], 4 \n"
+ // note: dst=a1 is word aligned here, see NOTE1
+ "bne %[dst], $a3, $ua_wordCopy_loop \n"
+ " sw $v1,-4(%[dst]) \n"
+
+ // Now less than 4 bytes (value in count) left to copy
+ "ua_smallCopy: \n"
+ "beqz %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+ "$ua_smallCopy_loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst],$a3,$ua_smallCopy_loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "j $ra \n"
+ " nop \n"
+ ".set at \n"
+ ".set reorder \n"
+ : [dst] "+r" (dst), [src] "+r" (src)
+ : [count] "r" (count)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+ "t8", "t9", "a3", "v1", "at"
+ );
+}
+#endif // HAS_COPYROW_MIPS
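For reference, a minimal scalar C sketch of the staged copy that CopyRow_MIPS above implements: align the destination, copy 64-byte chunks (with prefetch in the assembly), one optional 32-byte chunk, then words, then a byte tail. Illustrative only; the function name is ours, and it ignores the prefetch hints and the lwr/lwl unaligned path.

#include <stdint.h>

static void CopyRow_Staged_C(const uint8_t* src, uint8_t* dst, int count) {
  // Head: copy single bytes until dst is word-aligned (the assembly only
  // takes this path when src and dst share the same alignment).
  while (count > 0 && ((uintptr_t)dst & 3u)) {
    *dst++ = *src++;
    --count;
  }
  while (count >= 64) {  // 64-byte chunks ($loop16w).
    for (int i = 0; i < 16; ++i)
      ((uint32_t*)dst)[i] = ((const uint32_t*)src)[i];
    src += 64; dst += 64; count -= 64;
  }
  if (count >= 32) {  // at most one 32-byte chunk (chk8w).
    for (int i = 0; i < 8; ++i)
      ((uint32_t*)dst)[i] = ((const uint32_t*)src)[i];
    src += 32; dst += 32; count -= 32;
  }
  while (count >= 4) {  // word copy ($wordCopy_loop).
    *(uint32_t*)dst = *(const uint32_t*)src;
    src += 4; dst += 4; count -= 4;
  }
  while (count-- > 0)  // trailing bytes ($last8).
    *dst++ = *src++;
}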
+
+// MIPS DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+    "srl             $t4, %[width], 4                 \n"  // multiples of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ ".p2align 2 \n"
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
+ "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
+ "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+ "sw $t9, 0(%[dst_v]) \n"
+ "sw $t0, 0(%[dst_u]) \n"
+ "sw $t1, 4(%[dst_v]) \n"
+ "sw $t2, 4(%[dst_u]) \n"
+ "sw $t3, 8(%[dst_v]) \n"
+ "sw $t5, 8(%[dst_u]) \n"
+ "sw $t6, 12(%[dst_v]) \n"
+ "sw $t7, 12(%[dst_u]) \n"
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz $t4, 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+
+ "beqz %[width], 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, 0(%[src_uv]) \n"
+ "lbu $t1, 1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], 2 \n"
+ "addiu %[width], %[width], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[width], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [width] "+r" (width),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "t7", "t8", "t9"
+ );
+}
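The DSPR2 routine above de-interleaves 16 UV pairs per iteration and falls back to a byte loop for the residual. A scalar sketch of the same operation (illustrative; the name is ours, not libyuv's):

#include <stdint.h>

static void SplitUVRow_C_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                                uint8_t* dst_v, int width) {
  // src_uv holds interleaved U0 V0 U1 V1 ...; write the two planes separately.
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i + 0];
    dst_v[i] = src_uv[2 * i + 1];
  }
}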
+
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+    "srl       $t4, %[width], 4                \n"  // multiples of 16
+ "andi $t5, %[width], 0xf \n"
+ "blez $t4, 2f \n"
+ " addu %[src], %[src], %[width] \n" // src += width
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, -16(%[src]) \n" // |3|2|1|0|
+ "lw $t1, -12(%[src]) \n" // |7|6|5|4|
+ "lw $t2, -8(%[src]) \n" // |11|10|9|8|
+ "lw $t3, -4(%[src]) \n" // |15|14|13|12|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t1, $t1 \n" // |6|7|4|5|
+ "wsbh $t2, $t2 \n" // |10|11|8|9|
+ "wsbh $t3, $t3 \n" // |14|15|12|13|
+ "rotr $t0, $t0, 16 \n" // |0|1|2|3|
+ "rotr $t1, $t1, 16 \n" // |4|5|6|7|
+ "rotr $t2, $t2, 16 \n" // |8|9|10|11|
+ "rotr $t3, $t3, 16 \n" // |12|13|14|15|
+ "addiu %[src], %[src], -16 \n"
+ "addiu $t4, $t4, -1 \n"
+ "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
+ "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
+ "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
+ "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
+ "bgtz $t4, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+ "beqz $t5, 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -1(%[src]) \n"
+ "addiu $t5, $t5, -1 \n"
+ "addiu %[src], %[src], -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgez $t5, 2b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src] "+r" (src), [dst] "+r" (dst)
+ : [width] "r" (width)
+ : "t0", "t1", "t2", "t3", "t4", "t5"
+ );
+}
+
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ int x = 0;
+ int y = 0;
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "addu $t4, %[width], %[width] \n"
+ "srl %[x], %[width], 4 \n"
+ "andi %[y], %[width], 0xf \n"
+ "blez %[x], 2f \n"
+ " addu %[src_uv], %[src_uv], $t4 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
+ "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
+ "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
+ "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
+ "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
+ "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
+ "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
+ "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
+
+ "rotr $t0, $t0, 16 \n" // |1|0|3|2|
+ "rotr $t1, $t1, 16 \n" // |5|4|7|6|
+ "rotr $t2, $t2, 16 \n" // |9|8|11|10|
+ "rotr $t3, $t3, 16 \n" // |13|12|15|14|
+ "rotr $t4, $t4, 16 \n" // |17|16|19|18|
+ "rotr $t6, $t6, 16 \n" // |21|20|23|22|
+ "rotr $t7, $t7, 16 \n" // |25|24|27|26|
+ "rotr $t8, $t8, 16 \n" // |29|28|31|30|
+ "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
+ "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
+ "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
+ "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
+ "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
+ "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
+ "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
+ "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
+ "addiu %[src_uv], %[src_uv], -32 \n"
+ "addiu %[x], %[x], -1 \n"
+ "swr $t4, 0(%[dst_u]) \n"
+ "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
+ "swr $t6, 0(%[dst_v]) \n"
+ "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
+ "swr $t2, 4(%[dst_u]) \n"
+ "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
+ "swr $t3, 4(%[dst_v]) \n"
+ "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
+ "swr $t0, 8(%[dst_u]) \n"
+ "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
+ "swr $t1, 8(%[dst_v]) \n"
+ "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
+ "swr $t9, 12(%[dst_u]) \n"
+ "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
+ "swr $t5, 12(%[dst_v]) \n"
+ "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz %[x], 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+ "beqz %[y], 3f \n"
+ " nop \n"
+ "b 2f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -2(%[src_uv]) \n"
+ "lbu $t1, -1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], -2 \n"
+ "addiu %[y], %[y], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[y], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v),
+ [x] "=&r" (x),
+ [y] "+r" (y)
+ : [width] "r" (width)
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t7", "t8", "t9"
+ );
+}
+
+// Convert 4 Y and 2 U, 2 V (I422) and arrange the RGB values into:
+// t5 = | 0 | B0 | 0 | b0 |
+// t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |
+// t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |
+// t1 = | 0 | R1 | 0 | r1 |
+#define I422ToTransientMipsRGB \
+ "lw $t0, 0(%[y_buf]) \n" \
+ "lhu $t1, 0(%[u_buf]) \n" \
+ "lhu $t2, 0(%[v_buf]) \n" \
+ "preceu.ph.qbr $t1, $t1 \n" \
+ "preceu.ph.qbr $t2, $t2 \n" \
+ "preceu.ph.qbra $t3, $t0 \n" \
+ "preceu.ph.qbla $t0, $t0 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t3, $t3, $s4 \n" \
+ "subu.ph $t0, $t0, $s4 \n" \
+ "mul.ph $t3, $t3, $s0 \n" \
+ "mul.ph $t0, $t0, $s0 \n" \
+ "shll.ph $t4, $t1, 0x7 \n" \
+ "subu.ph $t4, $t4, $t1 \n" \
+ "mul.ph $t6, $t1, $s1 \n" \
+ "mul.ph $t1, $t2, $s2 \n" \
+ "addq_s.ph $t5, $t4, $t3 \n" \
+ "addq_s.ph $t4, $t4, $t0 \n" \
+ "shra.ph $t5, $t5, 6 \n" \
+ "shra.ph $t4, $t4, 6 \n" \
+ "addiu %[u_buf], 2 \n" \
+ "addiu %[v_buf], 2 \n" \
+ "addu.ph $t6, $t6, $t1 \n" \
+ "mul.ph $t1, $t2, $s3 \n" \
+ "addu.ph $t9, $t6, $t3 \n" \
+ "addu.ph $t8, $t6, $t0 \n" \
+ "shra.ph $t9, $t9, 6 \n" \
+ "shra.ph $t8, $t8, 6 \n" \
+ "addu.ph $t2, $t1, $t3 \n" \
+ "addu.ph $t1, $t1, $t0 \n" \
+ "shra.ph $t2, $t2, 6 \n" \
+ "shra.ph $t1, $t1, 6 \n" \
+ "subu.ph $t5, $t5, $s5 \n" \
+ "subu.ph $t4, $t4, $s5 \n" \
+ "subu.ph $t9, $t9, $s5 \n" \
+ "subu.ph $t8, $t8, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "shll_s.ph $t5, $t5, 8 \n" \
+ "shll_s.ph $t4, $t4, 8 \n" \
+ "shll_s.ph $t9, $t9, 8 \n" \
+ "shll_s.ph $t8, $t8, 8 \n" \
+ "shll_s.ph $t2, $t2, 8 \n" \
+ "shll_s.ph $t1, $t1, 8 \n" \
+ "shra.ph $t5, $t5, 8 \n" \
+ "shra.ph $t4, $t4, 8 \n" \
+ "shra.ph $t9, $t9, 8 \n" \
+ "shra.ph $t8, $t8, 8 \n" \
+ "shra.ph $t2, $t2, 8 \n" \
+ "shra.ph $t1, $t1, 8 \n" \
+ "addu.ph $t5, $t5, $s5 \n" \
+ "addu.ph $t4, $t4, $s5 \n" \
+ "addu.ph $t9, $t9, $s5 \n" \
+ "addu.ph $t8, $t8, $s5 \n" \
+ "addu.ph $t2, $t2, $s5 \n" \
+ "addu.ph $t1, $t1, $s5 \n"
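A per-pixel sketch of the fixed-point math the macro above performs, using the constants the DSPR2 callers below load into $s0..$s5 (74, -25, -52, 102, 16, 128). Illustrative only; the function and helper names are ours, and the clamp mirrors the saturating shift sequence at the end of the macro.

#include <stdint.h>

static uint8_t ClampU8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvToRgb_Dspr2Sketch(uint8_t y, uint8_t u, uint8_t v,
                                 uint8_t* r, uint8_t* g, uint8_t* b) {
  int y1 = (y - 16) * 74;  // Y contribution, 6-bit fixed point ($s0 = 74).
  int ud = u - 128;
  int vd = v - 128;
  *b = ClampU8((127 * ud + y1) >> 6);            // (u << 7) - u == u * 127
  *g = ClampU8((-25 * ud - 52 * vd + y1) >> 6);  // $s1 = -25, $s2 = -52
  *r = ClampU8((102 * vd + y1) >> 6);            // $s3 = 102
}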
+
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128| // clipping
+ "lui $s6, 0xff00 \n"
+    "ori              $s6, 0xff00                \n"  // |ff|00|ff|00|
+
+ ".p2align 2 \n"
+ "1: \n"
+ I422ToTransientMipsRGB
+// Arranging into argb format
+ "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
+ "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
+ "addiu %[width], -4 \n"
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
+ "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+ "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
+ "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
+ "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
+ "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
+ "sll $t9, $t9, 16 \n"
+ "sll $t8, $t8, 16 \n"
+ "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
+ "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128|
+ "lui $s6, 0xff00 \n"
+ "ori $s6, 0xff00 \n" // |ff|00|ff|00|
+
+ ".p2align 2 \n"
+ "1: \n"
+ I422ToTransientMipsRGB
+// Arranging into abgr format
+ "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
+ "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
+ "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
+ "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
+
+ "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
+ "addiu %[width], -4 \n"
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
+ "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
+ "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
+ "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
+ "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
+ "sll $t9, $t9, 16 \n"
+ "sll $t8, $t8, 16 \n"
+ "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
+ "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128|
+ "lui $s6, 0xff \n"
+ "ori $s6, 0xff \n" // |00|ff|00|ff|
+
+ ".p2align 2 \n"
+ "1: \n"
+ I422ToTransientMipsRGB
+ // Arranging into bgra format
+ "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
+ "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
+
+ "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+ "addiu %[width], -4 \n"
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+ "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
+ "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
+ "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
+ "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
+ "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
+ "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
+ "sll $t1, $t1, 16 \n"
+ "sll $t2, $t2, 16 \n"
+ "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
+ "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ int y0_fraction = 256 - source_y_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "replv.ph $t0, %[y0_fraction] \n"
+ "replv.ph $t1, %[source_y_fraction] \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t2, 0(%[src_ptr]) \n"
+ "lw $t3, 0(%[src_ptr1]) \n"
+ "lw $t4, 4(%[src_ptr]) \n"
+ "lw $t5, 4(%[src_ptr1]) \n"
+ "muleu_s.ph.qbl $t6, $t2, $t0 \n"
+ "muleu_s.ph.qbr $t7, $t2, $t0 \n"
+ "muleu_s.ph.qbl $t8, $t3, $t1 \n"
+ "muleu_s.ph.qbr $t9, $t3, $t1 \n"
+ "muleu_s.ph.qbl $t2, $t4, $t0 \n"
+ "muleu_s.ph.qbr $t3, $t4, $t0 \n"
+ "muleu_s.ph.qbl $t4, $t5, $t1 \n"
+ "muleu_s.ph.qbr $t5, $t5, $t1 \n"
+ "addq.ph $t6, $t6, $t8 \n"
+ "addq.ph $t7, $t7, $t9 \n"
+ "addq.ph $t2, $t2, $t4 \n"
+ "addq.ph $t3, $t3, $t5 \n"
+ "shra.ph $t6, $t6, 8 \n"
+ "shra.ph $t7, $t7, 8 \n"
+ "shra.ph $t2, $t2, 8 \n"
+ "shra.ph $t3, $t3, 8 \n"
+ "precr.qb.ph $t6, $t6, $t7 \n"
+ "precr.qb.ph $t2, $t2, $t3 \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[src_ptr1], %[src_ptr1], 8 \n"
+ "addiu %[dst_width], %[dst_width], -8 \n"
+ "sw $t6, 0(%[dst_ptr]) \n"
+ "sw $t2, 4(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[dst_ptr], %[dst_ptr], 8 \n"
+
+ ".set pop \n"
+ : [dst_ptr] "+r" (dst_ptr),
+ [src_ptr1] "+r" (src_ptr1),
+ [src_ptr] "+r" (src_ptr),
+ [dst_width] "+r" (dst_width)
+ : [source_y_fraction] "r" (source_y_fraction),
+ [y0_fraction] "r" (y0_fraction),
+ [src_stride] "r" (src_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
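A scalar sketch of the same bilinear blend (illustrative; the DSPR2 loop above processes 8 bytes per iteration and, like this sketch, truncates the 8-bit fraction rather than rounding):

#include <stdint.h>

static void InterpolateRow_C_Sketch(uint8_t* dst, const uint8_t* src,
                                    const uint8_t* src1, int width,
                                    int source_y_fraction) {
  int y0_fraction = 256 - source_y_fraction;
  for (int x = 0; x < width; ++x) {
    // Blend the two source rows: dst = (src*f0 + src1*f1) / 256.
    dst[x] = (uint8_t)((src[x] * y0_fraction +
                        src1[x] * source_y_fraction) >> 8);
  }
}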
+#endif // __mips_dsp_rev >= 2
+
+#endif // defined(__mips__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_neon.cc b/third_party/aom/third_party/libyuv/source/row_neon.cc
new file mode 100644
index 000000000..1a72eb903
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_neon.cc
@@ -0,0 +1,3084 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.32 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y, 2 U and 2 V from 411
+#define READYUV411 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.16 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.16 {d2[1]}, [%2]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d2, d3 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.8 {d3}, [%2]! \n" \
+ "vpaddl.u8 q1, q1 \n" \
+ "vrshrn.u16 d2, q1, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vmov.u8 d2, #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ MEMACCESS(0) \
+ "vld2.8 {d0, d2}, [%0]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ MEMACCESS(0) \
+ "vld2.8 {d2, d3}, [%0]! \n" \
+ "vmov.u8 d0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
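The READ* macros above assume the standard packed layouts: per two pixels, YUY2 is Y0 U0 Y1 V0, UYVY is U0 Y0 V0 Y1, NV12 interleaves the chroma plane as U0 V0 U1 V1 ..., and NV21 as V0 U0 V1 U1 .... A trivial sketch of decoding one YUY2 pixel pair (illustrative only; the name is ours):

#include <stdint.h>

static void Yuy2Pair(const uint8_t p[4],
                     uint8_t* y0, uint8_t* y1, uint8_t* u, uint8_t* v) {
  *y0 = p[0];  // first luma
  *u  = p[1];  // shared U
  *y1 = p[2];  // second luma
  *v  = p[3];  // shared V
}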
+
+#define YUV422TORGB_SETUP_REG \
+ MEMACCESS([kUVToRB]) \
+ "vld1.8 {d24}, [%[kUVToRB]] \n" \
+ MEMACCESS([kUVToG]) \
+ "vld1.8 {d25}, [%[kUVToG]] \n" \
+ MEMACCESS([kUVBiasBGR]) \
+ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+ MEMACCESS([kUVBiasBGR]) \
+ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
+ MEMACCESS([kUVBiasBGR]) \
+ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
+ MEMACCESS([kYToRgb]) \
+ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
+
+#define YUV422TORGB \
+ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\
+ "vmull.u8 q9, d2, d25 \n" /* u/v G component */\
+ "vmovl.u8 q0, d0 \n" /* Y */\
+ "vmovl.s16 q10, d1 \n" \
+ "vmovl.s16 q0, d0 \n" \
+ "vmul.s32 q10, q10, q15 \n" \
+ "vmul.s32 q0, q0, q15 \n" \
+ "vqshrun.s32 d0, q0, #16 \n" \
+ "vqshrun.s32 d1, q10, #16 \n" /* Y */\
+ "vadd.s16 d18, d19 \n" \
+ "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\
+ "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\
+ "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\
+ "vaddw.u16 q1, q1, d16 \n" \
+ "vaddw.u16 q10, q10, d17 \n" \
+ "vaddw.u16 q3, q3, d18 \n" \
+ "vqadd.s16 q8, q0, q13 \n" /* B */ \
+ "vqadd.s16 q9, q0, q14 \n" /* R */ \
+ "vqadd.s16 q0, q0, q4 \n" /* G */ \
+ "vqadd.s16 q8, q8, q1 \n" /* B */ \
+ "vqadd.s16 q9, q9, q10 \n" /* R */ \
+ "vqsub.s16 q0, q0, q3 \n" /* G */ \
+ "vqshrun.s16 d20, q8, #6 \n" /* B */ \
+ "vqshrun.s16 d22, q9, #6 \n" /* R */ \
+ "vqshrun.s16 d21, q0, #6 \n" /* G */
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
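Worked out, the constants above implement the BT.601-style studio-swing conversion in 6-bit fixed point: Y is scaled by roughly 1.164*64 via the 0x0101 replication (kYToRgb), and the BB/BG/BR biases fold in the -16 Y offset and the -128 chroma offsets. A scalar sketch of that arithmetic (illustrative; names are ours):

#include <stdint.h>

static uint8_t Clamp0To255(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvPixel_Sketch(uint8_t y, uint8_t u, uint8_t v,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
  enum { YG = 18997, YGB = 1160, UB = -128, UG = 25, VG = 52, VR = -102 };
  const int32_t BB = UB * 128 - YGB;             // blue bias
  const int32_t BG = UG * 128 + VG * 128 - YGB;  // green bias
  const int32_t BR = VR * 128 - YGB;             // red bias
  int32_t y1 = (int32_t)(((uint32_t)y * 0x0101 * YG) >> 16);  // ~74.5 * y
  *b = Clamp0To255((y1 - u * UB + BB) >> 6);
  *g = Clamp0To255((y1 - (u * UG + v * VG) + BG) >> 6);
  *r = Clamp0To255((y1 - v * VR + BR) >> 6);
}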
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV444
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV411
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_bgra), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_abgr), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ MEMACCESS(3)
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ MEMACCESS(3)
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTORGB565 \
+ "vshr.u8 d20, d20, #3 \n" /* B */ \
+ "vshr.u8 d21, d21, #2 \n" /* G */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #11 \n" /* R */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q0, q0, q10 \n" /* BGR */
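The macro above reduces the 8-bit channels to 5/6/5 bits and packs them low-to-high as B, G, R. A one-pixel scalar sketch (illustrative; the name is ours):

#include <stdint.h>

static uint16_t PackRgb565(uint8_t b, uint8_t g, uint8_t r) {
  // 5-bit blue in bits 0..4, 6-bit green in bits 5..10, 5-bit red in bits 11..15.
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}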
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTOARGB1555 \
+ "vshr.u8 q10, q10, #3 \n" /* B */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vshr.u8 d23, d23, #7 \n" /* A */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vmovl.u8 q11, d23 \n" /* A */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #10 \n" /* R */ \
+ "vshl.u16 q11, q11, #15 \n" /* A */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q1, q10, q11 \n" /* RA */ \
+ "vorr q0, q0, q1 \n" /* BGRA */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB1555
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTOARGB4444 \
+ "vshr.u8 d20, d20, #4 \n" /* B */ \
+ "vbic.32 d21, d21, d4 \n" /* G */ \
+ "vshr.u8 d22, d22, #4 \n" /* R */ \
+ "vbic.32 d23, d23, d4 \n" /* A */ \
+ "vorr d0, d20, d21 \n" /* BG */ \
+ "vorr d1, d22, d23 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB4444
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV400
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void J400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vmov.u8 d23, #255 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23"
+ );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READYUY2
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ ".p2align 2 \n"
+ "1: \n"
+ READUYVY
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store U
+ MEMACCESS(2)
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load U
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
+ "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+ :
+ "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Copies a multiple of 32 bytes. vld1.8 allows unaligned access and is fastest on A15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SetRow writes 'count' bytes using a repeated 8-bit value.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+ asm volatile (
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "q0"
+ );
+}
+
+// ARGBSetRow writes 'count' pixels using a repeated 32-bit value.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0"
+ );
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #16 \n" // 16 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0"
+ );
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "r12", "q0"
+ );
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #4 \n" // 4 pixels per loop.
+ "vrev64.32 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0"
+ );
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
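RGB565TOARGB expands each field back to 8 bits by replicating its top bits into the low bits, as the per-instruction comments note. A one-pixel scalar sketch (illustrative; the name is ours):

#include <stdint.h>

static void Rgb565ToRgb888(uint16_t p, uint8_t* b, uint8_t* g, uint8_t* r) {
  uint8_t b5 = (uint8_t)(p & 0x1f);
  uint8_t g6 = (uint8_t)((p >> 5) & 0x3f);
  uint8_t r5 = (uint8_t)((p >> 11) & 0x1f);
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));  // upper 5 bits, top 3 replicated below
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));  // upper 6 bits, top 2 replicated below
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}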
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */ \
+
+// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // stride + src_uyvy
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "vdup.32 d2, %2 \n" // dither4
+ "1: \n"
+ MEMACCESS(1)
+ "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n"
+ ARGBTORGB565
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
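+
+// Scalar sketch of the luma math the loop above performs with the 13/65/33
+// weights, the rounding shift by 7 and the +16 bias. The helper is
+// illustrative only (its name is made up) and is not used by this file.
+static __inline uint8 SketchARGBToY(uint8 b, uint8 g, uint8 r) {
+  int y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;  // vmull/vmlal, vqrshrun #7, vqadd #16
+  return (uint8)(y > 255 ? 255 : y);
+}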
+
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
+// 8x1 pixels.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(0)
+ "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
+ "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
+ "vpadd.u16 d1, d8, d9 \n" // B
+ "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
+ "vpadd.u16 d3, d10, d11 \n" // G
+ "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
+ "vpadd.u16 d5, d12, d13 \n" // R
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %3, %3, #32 \n" // 32 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
+ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
+ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
+ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
+ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q3, q2, q1)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ MEMACCESS(0)
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ MEMACCESS(1)
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ MEMACCESS(0)
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ MEMACCESS(1)
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+ );
+}
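+
+// Scalar sketch of the general blend path above; the 0/64/128/192 fraction
+// cases are shortcuts for the same math using copies and vrhadd averages.
+// Illustrative helper only.
+static __inline uint8 SketchInterpolate(uint8 row0, uint8 row1, int y_fraction) {
+  // vmull/vmlal with (256 - f) and f, then vrshrn #8 (rounding shift).
+  return (uint8)((row0 * (256 - y_fraction) + row1 * y_fraction + 128) >> 8);
+}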
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "subs %3, #8 \n"
+ "blt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
+
+ "89: \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
+
+ // Blend 1 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ MEMACCESS(1)
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+ );
+}
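+
+// Scalar sketch of one blended channel, following the identity in the comment
+// above ARGBBlendRow_NEON; s and sa come from src_argb0, d from src_argb1.
+// The saturating steps mirror vqsub/vqadd. Illustrative helper only.
+static __inline uint8 SketchBlendChannel(uint8 s, uint8 d, uint8 sa) {
+  int t = d - ((d * sa + 128) >> 8);  // dr - dr * sa / 256, rounded
+  if (t < 0) t = 0;                   // vqsub.u8 saturation
+  t += s;                             // + sr
+  return (uint8)(t > 255 ? 255 : t);  // vqadd.u8 saturation; A is then set to 255
+}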
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ // Attenuate 8 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
+ "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
+ "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+ );
+}
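+
+// Scalar sketch of the attenuate step: each color channel is scaled by its own
+// alpha and rounded back to 8 bits; the alpha channel passes through unchanged.
+// Illustrative helper only.
+static __inline uint8 SketchAttenuate(uint8 c, uint8 a) {
+  return (uint8)((c * a + 128) >> 8);  // vmull.u8 then vqrshrn.u16 #8
+}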
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ MEMACCESS(0)
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+ );
+}
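+
+// Scalar sketch of the quantize formula quoted above ARGBQuantizeRow_NEON; the
+// NEON code halves 'scale' so that vqdmulh (a doubling multiply returning the
+// high half) yields effectively dst * scale >> 16. Illustrative helper only.
+static __inline uint8 SketchQuantize(uint8 c, int scale, int interval_size,
+                                     int interval_offset) {
+  return (uint8)((c * scale >> 16) * interval_size + interval_offset);
+}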
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register in the range d0-d7.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
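+
+// Scalar sketch of the gray formula quoted above; the one luma value is
+// written back to B, G and R while A is kept. Illustrative helper only.
+static __inline uint8 SketchGrayValue(uint8 b, uint8 g, uint8 r) {
+  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
+}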
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
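+
+// Scalar sketch of the sepia weights listed above; vqshrn saturates, so each
+// result clamps to 255. Illustrative helper only.
+static __inline void SketchSepia(uint8 b, uint8 g, uint8 r,
+                                 uint8* sb, uint8* sg, uint8* sr) {
+  int nb = (b * 17 + g * 68 + r * 35) >> 7;
+  int ng = (b * 22 + g * 88 + r * 45) >> 7;
+  int nr = (b * 24 + g * 98 + r * 50) >> 7;
+  *sb = (uint8)(nb > 255 ? 255 : nb);
+  *sg = (uint8)(ng > 255 ? 255 : ng);
+  *sr = (uint8)(nr > 255 ? 255 : nr);
+}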
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
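+
+// Scalar sketch of one color-matrix output channel: a dot product of
+// (B, G, R, A) with four signed-byte coefficients, shifted down by 6 and
+// saturated to [0, 255]. Illustrative helper only; 'm' points at the four
+// coefficients for the channel being produced.
+static __inline uint8 SketchColorMatrixChannel(uint8 b, uint8 g, uint8 r,
+                                               uint8 a, const int8* m) {
+  int v = (b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6;
+  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}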
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ // 16 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ MEMACCESS(0)
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(1)
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(2)
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ MEMACCESS(2)
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ MEMACCESS(3)
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
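+
+// Scalar sketch of the two gradients above: a 1-2-1 weighted difference of
+// columns (SobelX) or of the top and bottom rows (SobelY), absolute value,
+// clamped to 255. Illustrative helpers only; the row pointer names are made up.
+static __inline uint8 SketchSobelX(const uint8* y0, const uint8* y1,
+                                   const uint8* y2, int i) {
+  int sx = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
+  if (sx < 0) sx = -sx;
+  return (uint8)(sx > 255 ? 255 : sx);
+}
+static __inline uint8 SketchSobelY(const uint8* top, const uint8* bottom, int i) {
+  int sy = (top[i] - bottom[i]) + 2 * (top[i + 1] - bottom[i + 1]) +
+           (top[i + 2] - bottom[i + 2]);
+  if (sy < 0) sy = -sy;
+  return (uint8)(sy > 255 ? 255 : sy);
+}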
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_neon64.cc b/third_party/aom/third_party/libyuv/source/row_neon64.cc
new file mode 100644
index 000000000..5d015454b
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_neon64.cc
@@ -0,0 +1,3087 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v1.s}[0], [%1], #4 \n" \
+ MEMACCESS(2) \
+ "ld1 {v1.s}[1], [%2], #4 \n"
+
+// Read 8 Y, 2 U and 2 V from 411
+#define READYUV411 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.h}[0], [%1], #2 \n" \
+ MEMACCESS(2) \
+ "ld1 {v2.h}[1], [%2], #2 \n" \
+ "zip1 v1.8b, v2.8b, v2.8b \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v1.d}[0], [%1], #8 \n" \
+ MEMACCESS(2) \
+ "ld1 {v1.d}[1], [%2], #8 \n" \
+ "uaddlp v1.8h, v1.16b \n" \
+ "rshrn v1.8b, v1.8h, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "movi v1.8b , #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ MEMACCESS(0) \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ MEMACCESS(1) \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v3.8b, v2.8b, v2.8b \n" \
+ "uzp2 v1.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ MEMACCESS(0) \
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
+ "uzp2 v3.8b, v1.8b, v1.8b \n" \
+ "uzp1 v1.8b, v1.8b, v1.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ MEMACCESS(0) \
+ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
+ "orr v0.8b, v3.8b, v3.8b \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
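The READ* macros above deinterleave the packed formats into v0 (8 Y samples) and v1 (U in the low half, V in the high half). A scalar sketch of the byte order they assume for one 4-byte group (helper names are illustrative, not part of the patch):

#include <stdint.h>

// YUY2 byte order: Y0 U Y1 V.  UYVY byte order: U Y0 V Y1.
// NV12 carries interleaved U,V pairs in a second plane; NV21 swaps U and V.
static void yuy2_group(const uint8_t* p, uint8_t* y0, uint8_t* u, uint8_t* y1, uint8_t* v) {
  *y0 = p[0]; *u = p[1]; *y1 = p[2]; *v = p[3];
}
static void uyvy_group(const uint8_t* p, uint8_t* y0, uint8_t* u, uint8_t* y1, uint8_t* v) {
  *u = p[0]; *y0 = p[1]; *v = p[2]; *y1 = p[3];
}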
+#define YUV422TORGB_SETUP_REG \
+ "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "movi v27.8h, #128 \n" \
+ "movi v28.8h, #102 \n" \
+ "movi v29.8h, #25 \n" \
+ "movi v30.8h, #52 \n"
+
+#define YUV422TORGB(vR, vG, vB) \
+ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
+ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
+ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
+ "ushll v0.4s, v0.4h, #0 \n" \
+ "mul v3.4s, v3.4s, v31.4s \n" \
+ "mul v0.4s, v0.4s, v31.4s \n" \
+ "sqshrun v0.4h, v0.4s, #16 \n" \
+ "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
+ "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
+ "uxtl v2.8h, v2.8b \n" \
+ "uxtl v1.8h, v1.8b \n" /* Extract U */ \
+ "mul v3.8h, v1.8h, v27.8h \n" \
+ "mul v5.8h, v1.8h, v29.8h \n" \
+ "mul v6.8h, v2.8h, v30.8h \n" \
+ "mul v7.8h, v2.8h, v28.8h \n" \
+ "sqadd v6.8h, v6.8h, v5.8h \n" \
+ "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
+ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
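The constants above drive fixed-point YUV-to-RGB math with a final arithmetic shift by 6. A scalar sketch of what YUV422TORGB computes per pixel under these constants (mirroring libyuv's C reference; names are illustrative, not part of the patch):

#include <stdint.h>

static uint8_t clamp255(int32_t v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

// One YUV -> BGR pixel with YG=18997, YGB=1160, UB=-128, UG=25, VG=52, VR=-102.
static void yuv_pixel_sketch(uint8_t y, uint8_t u, uint8_t v,
                             uint8_t* b, uint8_t* g, uint8_t* r) {
  const int YG = 18997, YGB = 1160;
  const int UB = -128, UG = 25, VG = 52, VR = -102;
  const int BB = UB * 128 - YGB;             // biases fold in -128 on U/V and -16 on Y
  const int BG = UG * 128 + VG * 128 - YGB;
  const int BR = VR * 128 - YGB;
  int32_t y1 = (int32_t)(((uint32_t)(y * 0x0101) * YG) >> 16);  // ~ 1.164 * 64 * Y
  *b = clamp255((-(u * UB) + y1 + BB) >> 6);
  *g = clamp255((-(u * UG + v * VG) + y1 + BG) >> 6);
  *r = clamp255((-(v * VR) + y1 + BR) >> 6);
}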
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+
+
+#ifdef HAS_I444TOARGBROW_NEON
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV444
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I444TOARGBROW_NEON
+
+#ifdef HAS_I422TOARGBROW_NEON
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TOARGBROW_NEON
+
+#ifdef HAS_I411TOARGBROW_NEON
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV411
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I411TOARGBROW_NEON
+
+#ifdef HAS_I422TOBGRAROW_NEON
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v21, v22, v23)
+ "subs %w4, %w4, #8 \n"
+ "movi v20.8b, #255 \n" /* A */
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_bgra), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TOBGRAROW_NEON
+
+#ifdef HAS_I422TOABGRROW_NEON
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v20, v21, v22)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_abgr), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TOABGRROW_NEON
+
+#ifdef HAS_I422TORGBAROW_NEON
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v23, v22, v21)
+ "subs %w4, %w4, #8 \n"
+ "movi v20.8b, #255 \n" /* A */
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TORGBAROW_NEON
+
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v20, v21, v22)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TORAWROW_NEON
+
+#define ARGBTORGB565 \
+ "shll v0.8h, v22.8b, #8 \n" /* R */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "sri v0.8h, v21.8h, #5 \n" /* RG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* RGB */
+
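ARGBTORGB565 packs each channel down to 5/6/5 bits with shift-right-and-insert. The scalar equivalent for one pixel (illustrative only):

#include <stdint.h>

static uint16_t pack_rgb565(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}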
+#ifdef HAS_I422TORGB565ROW_NEON
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(3)
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TORGB565ROW_NEON
+
+#define ARGBTOARGB1555 \
+ "shll v0.8h, v23.8b, #8 \n" /* A */ \
+ "shll v22.8h, v22.8b, #8 \n" /* R */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+
+#ifdef HAS_I422TOARGB1555ROW_NEON
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
+ ARGBTOARGB1555
+ MEMACCESS(3)
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TOARGB1555ROW_NEON
+
+#define ARGBTOARGB4444 \
+ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+ "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+ "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+ "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+ "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+ "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+ "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
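The two macros above reduce ARGB to 1-5-5-5 and 4-4-4-4 layouts. Scalar equivalents for one pixel (illustrative only):

#include <stdint.h>

static uint16_t pack_argb1555(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
  return (uint16_t)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3));
}
static uint16_t pack_argb4444(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
  return (uint16_t)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) | (b >> 4));
}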
+#ifdef HAS_I422TOARGB4444ROW_NEON
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
+ ARGBTOARGB4444
+ MEMACCESS(3)
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TOARGB4444ROW_NEON
+
+#ifdef HAS_I400TOARGBROW_NEON
+void I400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ int64 width64 = (int64)(width);
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV400
+ YUV422TORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
+ MEMACCESS(1)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I400TOARGBROW_NEON
+
+#ifdef HAS_J400TOARGBROW_NEON
+void J400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ MEMACCESS(1)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v20", "v21", "v22", "v23"
+ );
+}
+#endif // HAS_J400TOARGBROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READNV12
+ YUV422TORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ "movi v23.8b, #255 \n"
+ MEMACCESS(2)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV21TOARGBROW_NEON
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READNV21
+ YUV422TORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ "movi v23.8b, #255 \n"
+ MEMACCESS(2)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_NV21TOARGBROW_NEON
+
+#ifdef HAS_NV12TORGB565ROW_NEON
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READNV12
+ YUV422TORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_NV12TORGB565ROW_NEON
+
+#ifdef HAS_NV21TORGB565ROW_NEON
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READNV21
+ YUV422TORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_NV21TORGB565ROW_NEON
+
+#ifdef HAS_YUY2TOARGBROW_NEON
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ int64 width64 = (int64)(width);
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUY2
+ YUV422TORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
+ MEMACCESS(1)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_YUY2TOARGBROW_NEON
+
+#ifdef HAS_UYVYTOARGBROW_NEON
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ int64 width64 = (int64)(width);
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READUYVY
+ YUV422TORGB(v22, v21, v20)
+ "subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
+ MEMACCESS(1)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_UYVYTOARGBROW_NEON
+
+// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
+#ifdef HAS_SPLITUVROW_NEON
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ MEMACCESS(2)
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+#endif // HAS_SPLITUVROW_NEON
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+#ifdef HAS_MERGEUVROW_NEON
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ :
+ "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+#endif // HAS_MERGEUVROW_NEON
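
SplitUVRow and MergeUVRow are straight byte (de)interleaves; in scalar form they amount to the following (illustrative sketch, not part of the patch):

#include <stdint.h>

static void split_uv(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i + 0];
    dst_v[i] = src_uv[2 * i + 1];
  }
}
static void merge_uv(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_u[i];
    dst_uv[2 * i + 1] = src_v[i];
  }
}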
+
+// Copy multiples of 32 bytes. Unaligned ld1/st1 access is allowed and is fastest on A15-class cores.
+#ifdef HAS_COPYROW_NEON
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
+ "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // HAS_COPYROW_NEON
+
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+ asm volatile (
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0"
+ );
+}
+
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0"
+ );
+}
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #16 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
+ MEMACCESS(1)
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ MEMACCESS(1)
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width64) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
+ );
+}
+#endif // HAS_MIRRORROW_NEON
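
MirrorRow_NEON walks the source backwards 16 bytes at a time and byte-reverses each block (rev64 plus swapped D-register stores); ARGBMirrorRow does the same per 4-byte pixel. A scalar sketch (illustrative only):

#include <stdint.h>

static void mirror_row_sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}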
+
+#ifdef HAS_MIRRORUVROW_NEON
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, %0, #16 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %3, %3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width64) // %3
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1"
+ );
+}
+#endif // HAS_MIRRORUVROW_NEON
+
+#ifdef HAS_ARGBMIRRORROW_NEON
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, %0, #16 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "rev64 v0.4s, v0.4s \n"
+ MEMACCESS(1)
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ MEMACCESS(1)
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width64) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_NEON
+
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movi v4.8b, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+#endif // HAS_RGB24TOARGBROW_NEON
+
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movi v5.8b, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ MEMACCESS(1)
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+#endif // HAS_RAWTOARGBROW_NEON
+
+#define RGB565TOARGB \
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
+
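RGB565TOARGB widens each field back to 8 bits by shifting it up and replicating its top bits into the vacated low bits, which maps the 5- or 6-bit range onto the full 0..255 range. A scalar sketch for one pixel (illustrative only):

#include <stdint.h>

static void unpack_rgb565(uint16_t p, uint8_t* b, uint8_t* g, uint8_t* r) {
  uint8_t b5 = (uint8_t)(p & 0x1f);
  uint8_t g6 = (uint8_t)((p >> 5) & 0x3f);
  uint8_t r5 = (uint8_t)(p >> 11);
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));  // replicate top bits into the low bits
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}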
+#ifdef HAS_RGB565TOARGBROW_NEON
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+ asm volatile (
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
+ );
+}
+#endif // HAS_RGB565TOARGBROW_NEON
+
+#define ARGB1555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
+
+// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */ \
+
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // HAS_ARGB1555TOARGBROW_NEON
+
+#define ARGB4444TOARGB \
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" \
+ "dup v1.2D, v3.D[1] \n"
+
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+#endif // HAS_ARGB4444TOARGBROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORGB24ROW_NEON
+
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ MEMACCESS(1)
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORAWROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOUV422ROW_NEON
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ MEMACCESS(2)
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOUV422ROW_NEON
+
+#ifdef HAS_UYVYTOUV422ROW_NEON
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ MEMACCESS(2)
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOUV422ROW_NEON
+
+#ifdef HAS_YUY2TOUVROW_NEON
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ MEMACCESS(3)
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v5", "v6", "v7" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOUVROW_NEON
+
+#ifdef HAS_UYVYTOUVROW_NEON
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_uyvyb = src_uyvy + stride_uyvy;
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ MEMACCESS(3)
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(src_uyvyb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v5", "v6", "v7" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOUVROW_NEON
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ MEMACCESS(3)
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_NEON
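
ARGBShuffleRow_NEON uses a tbl lookup: each output byte is the source byte selected by the 16-entry shuffler table, processed 16 bytes (4 pixels) at a time. A scalar sketch (illustrative only; shuffler indices are assumed to be 0..15):

#include <stdint.h>

// e.g. shuffler = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12} swaps the channel order.
static void argb_shuffle_sketch(const uint8_t* src, uint8_t* dst,
                                const uint8_t* shuffler, int pix) {
  for (int i = 0; i < pix * 4; i += 16) {
    for (int j = 0; j < 16; ++j) {
      dst[i + j] = src[i + shuffler[j]];
    }
  }
}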
+
+#ifdef HAS_I422TOYUY2ROW_NEON
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v2.8b, v1.8b, v1.8b \n"
+ MEMACCESS(1)
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ MEMACCESS(2)
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+}
+#endif // HAS_I422TOYUY2ROW_NEON
+
+#ifdef HAS_I422TOUYVYROW_NEON
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
+ MEMACCESS(1)
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ MEMACCESS(2)
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+}
+#endif // HAS_I422TOUYVYROW_NEON
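
The two interleavers above pack planar I422 into 4-byte groups: YUY2 as Y0,U,Y1,V and UYVY as U,Y0,V,Y1. A scalar sketch of the YUY2 case (illustrative only; width is assumed even):

#include <stdint.h>

static void i422_to_yuy2_sketch(const uint8_t* src_y, const uint8_t* src_u,
                                const uint8_t* src_v, uint8_t* dst_yuy2, int width) {
  for (int i = 0; i < width; i += 2) {
    dst_yuy2[0] = src_y[i];
    dst_yuy2[1] = src_u[i / 2];
    dst_yuy2[2] = src_y[i + 1];
    dst_yuy2[3] = src_v[i / 2];
    dst_yuy2 += 4;
  }
}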
+
+#ifdef HAS_ARGBTORGB565ROW_NEON
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+ );
+}
+#endif // HAS_ARGBTORGB565ROW_NEON
+
+#ifdef HAS_ARGBTORGB565DITHERROW_NEON
+void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int width) {
+ asm volatile (
+ "dup v1.4s, %w2 \n" // dither4
+ "1: \n"
+ MEMACCESS(1)
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n"
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n"
+ ARGBTORGB565
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
+ );
+}
+#endif // HAS_ARGBTORGB565DITHERROW_NEON
+
+#ifdef HAS_ARGBTOARGB1555ROW_NEON
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+ int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
+ );
+}
+#endif // HAS_ARGBTOARGB1555ROW_NEON
+
+#ifdef HAS_ARGBTOARGB4444ROW_NEON
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+ int pix) {
+ asm volatile (
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
+ );
+}
+#endif // HAS_ARGBTOARGB4444ROW_NEON
+
+#ifdef HAS_ARGBTOYROW_NEON
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+#endif // HAS_ARGBTOYROW_NEON
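
ARGBToYRow uses 7-bit fixed-point BT.601 weights (13, 65, 33 out of 128) with a rounding shift, then adds the +16 offset with saturation. A scalar sketch for one pixel (illustrative only):

#include <stdint.h>

static uint8_t argb_to_y_sketch(uint8_t b, uint8_t g, uint8_t r) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // rounded, like sqrshrun #7
  y += 16;                                       // the NEON uqadd saturates here
  return (uint8_t)(y > 255 ? 255 : y);
}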
+
+#ifdef HAS_ARGBTOYJROW_NEON
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v4.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v5.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v6.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+ );
+}
+#endif // HAS_ARGBTOYJROW_NEON
+
+// 8x1 pixels.
+#ifdef HAS_ARGBTOUV444ROW_NEON
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v24", "v25", "v26", "v27", "v28", "v29"
+ );
+}
+#endif // HAS_ARGBTOUV444ROW_NEON
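
The coefficients above are 8.8 fixed point with a 0x8080 bias (128.5), so the per-pixel math reduces to the following scalar sketch (illustrative only; the result always lands in 0..255 for 8-bit inputs):

#include <stdint.h>

static void argb_to_uv444_sketch(uint8_t b, uint8_t g, uint8_t r,
                                 uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}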
+
+// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGBTOUV422ROW_NEON
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_ARGBTOUV422ROW_NEON
+
+// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+#ifdef HAS_ARGBTOUV411ROW_NEON
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(0)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
+ "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
+ "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
+ "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w3, %w3, #32 \n" // 32 processed per loop.
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_ARGBTOUV411ROW_NEON
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
+ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
+ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
+#ifdef HAS_ARGBTOUVROW_NEON
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_ARGBTOUVROW_NEON
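
ARGBToUVRow first box-averages each 2x2 block (uaddlp/uadalp plus the rounding urshr #1) and then applies the halved coefficients from RGBTOUV_SETUP_REG, which is equivalent, up to rounding, to running the 444 formula on the averaged RGB. A scalar sketch for one output U/V pair (illustrative only; ld4 implies a B,G,R,A byte order in memory):

#include <stdint.h>

static void argb_to_uv_2x2_sketch(const uint8_t* row0, const uint8_t* row1,
                                  uint8_t* u, uint8_t* v) {
  // row0/row1 each point at two adjacent ARGB pixels (8 bytes).
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}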
+
+// TODO(fbarchard): Subsample match C code.
+#ifdef HAS_ARGBTOUVJROW_NEON
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
+ asm volatile (
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_ARGBTOUVJROW_NEON
+
+#ifdef HAS_BGRATOUVROW_NEON
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_bgra_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_BGRATOUVROW_NEON
+
+#ifdef HAS_ABGRTOUVROW_NEON
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v2.8h, v1.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_abgr_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_ABGRTOUVROW_NEON
+
+#ifdef HAS_RGBATOUVROW_NEON
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_rgba_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_RGBATOUVROW_NEON
+
+#ifdef HAS_RGB24TOUVROW_NEON
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_rgb24_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_RGB24TOUVROW_NEON
+
+#ifdef HAS_RAWTOUVROW_NEON
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_raw_1 = src_raw + src_stride_raw;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
+
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 32 processed per loop.
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_raw_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_RAWTOUVROW_NEON
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_RGB565TOUVROW_NEON
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+ asm volatile (
+ "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
+ "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
+ "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
+ "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
+ "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
+ "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v17.D[0] \n"
+ "ins v18.D[1], v19.D[0] \n"
+ "ins v20.D[1], v21.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v18.8h, #1 \n"
+ "urshr v6.8h, v20.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v16.8h, v4.8h, v22.8h \n" // B
+ "mls v16.8h, v5.8h, v23.8h \n" // G
+ "mls v16.8h, v6.8h, v24.8h \n" // R
+ "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
+ "mul v17.8h, v6.8h, v22.8h \n" // R
+ "mls v17.8h, v5.8h, v26.8h \n" // G
+ "mls v17.8h, v4.8h, v25.8h \n" // B
+ "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_rgb565_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+ "v25", "v26", "v27"
+ );
+}
+#endif // HAS_RGB565TOUVROW_NEON
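+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of what
+// RGB565ToUVRow_NEON above computes once the pixels are expanded to 8-bit
+// B,G,R.  The input here is assumed to be two rows of plain 8-bit B,G,R
+// triplets (a hypothetical layout): each 2x2 block is reduced to a 2x-scaled
+// average (uaddlp/uadalp + urshr #1), the halved coefficients (56, 37, 19)
+// and (56, 47, 9) are applied with a 0x8080 bias, then >> 8, as in the
+// assembly.  Rounding may differ from the SIMD path by +/-1.
+static void RGBToUV2x2_ReferenceSketch(const uint8* row0, const uint8* row1,
+                                       uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width; x += 2) {
+    int b2 = (row0[x * 3 + 0] + row0[x * 3 + 3] +
+              row1[x * 3 + 0] + row1[x * 3 + 3] + 1) >> 1;  // 2x average B
+    int g2 = (row0[x * 3 + 1] + row0[x * 3 + 4] +
+              row1[x * 3 + 1] + row1[x * 3 + 4] + 1) >> 1;  // 2x average G
+    int r2 = (row0[x * 3 + 2] + row0[x * 3 + 5] +
+              row1[x * 3 + 2] + row1[x * 3 + 5] + 1) >> 1;  // 2x average R
+    int u = (56 * b2 - 37 * g2 - 19 * r2 + 0x8080) >> 8;
+    int v = (56 * r2 - 47 * g2 - 9 * b2 + 0x8080) >> 8;
+    dst_u[x >> 1] = (uint8)(u < 0 ? 0 : (u > 255 ? 255 : u));
+    dst_v[x >> 1] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+  }
+}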
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGB1555TOUVROW_NEON
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_argb1555_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+ "v26", "v27", "v28"
+ );
+}
+#endif // HAS_ARGB1555TOUVROW_NEON
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGB4444TOUVROW_NEON
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
+
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_argb4444_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+ "v26", "v27", "v28"
+
+ );
+}
+#endif // HAS_ARGB4444TOUVROW_NEON
+
+#ifdef HAS_RGB565TOYROW_NEON
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
+ "v24", "v25", "v26", "v27"
+ );
+}
+#endif // HAS_RGB565TOYROW_NEON
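+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of the Y
+// computation used by the *ToYRow kernels above, for one pixel already
+// expanded to 8-bit B,G,R (RGB565TOARGB does that expansion):
+//   Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16, saturated to 255.
+static uint8 RGBToY_ReferenceSketch(uint8 b, uint8 g, uint8 r) {
+  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // rounding >> 7 (sqrshrun)
+  y += 16;                                       // add 16 offset (uqadd)
+  return (uint8)(y > 255 ? 255 : y);
+}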
+
+#ifdef HAS_ARGB1555TOYROW_NEON
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+#endif // HAS_ARGB1555TOYROW_NEON
+
+#ifdef HAS_ARGB4444TOYROW_NEON
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
+ );
+}
+#endif // HAS_ARGB4444TOYROW_NEON
+
+#ifdef HAS_BGRATOYROW_NEON
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+#endif // HAS_BGRATOYROW_NEON
+
+#ifdef HAS_ABGRTOYROW_NEON
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+#endif // HAS_ABGRTOYROW_NEON
+
+#ifdef HAS_RGBATOYROW_NEON
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+#endif // HAS_RGBATOYROW_NEON
+
+#ifdef HAS_RGB24TOYROW_NEON
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+#endif // HAS_RGB24TOYROW_NEON
+
+#ifdef HAS_RAWTOYROW_NEON
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+ asm volatile (
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ MEMACCESS(0)
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+#endif // HAS_RAWTOYROW_NEON
+
+// Bilinear filter 16x2 -> 16x1
+#ifdef HAS_INTERPOLATEROW_NEON
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ asm volatile (
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
+ :
+ : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
+ );
+}
+#endif // HAS_INTERPOLATEROW_NEON
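+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of the
+// general blend path in InterpolateRow_NEON above:
+//   dst = (src0 * (256 - f) + src1 * f + 128) >> 8, f = source_y_fraction.
+// The 0/64/128/192 fast paths are special cases of this formula (the
+// repeated urhadd rounds slightly differently).
+static void InterpolateRow_ReferenceSketch(uint8* dst, const uint8* src0,
+                                           const uint8* src1, int width,
+                                           int source_y_fraction) {
+  int f1 = source_y_fraction;
+  int f0 = 256 - f1;
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst[x] = (uint8)((src0[x] * f0 + src1[x] * f1 + 128) >> 8);
+  }
+}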
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+#ifdef HAS_ARGBBLENDROW_NEON
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.ge 8b \n"
+
+ "89: \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
+
+    // Blend 1 pixel at a time.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
+ MEMACCESS(1)
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ MEMACCESS(2)
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18"
+ );
+}
+#endif // HAS_ARGBBLENDROW_NEON
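+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of the
+// per-channel blend in ARGBBlendRow_NEON, following the identity above:
+//   dst = sat(src + sat(dst - ((dst * src_alpha + 128) >> 8))), alpha = 255.
+static void ARGBBlendPixel_ReferenceSketch(const uint8 src[4],
+                                           const uint8 dst_in[4],
+                                           uint8 dst_out[4]) {
+  int a = src[3];
+  int i;
+  for (i = 0; i < 3; ++i) {                            // B, G, R
+    int d = dst_in[i] - ((dst_in[i] * a + 128) >> 8);  // dst * (256 - a) / 256
+    int o = src[i] + (d < 0 ? 0 : d);                  // uqsub then uqadd
+    dst_out[i] = (uint8)(o > 255 ? 255 : o);
+  }
+  dst_out[3] = 255;                                    // alpha forced opaque
+}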
+
+// Attenuate 8 pixels at a time.
+#ifdef HAS_ARGBATTENUATEROW_NEON
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ // Attenuate 8 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_NEON
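+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of
+// ARGBAttenuateRow_NEON: each color channel is multiplied by alpha with
+// rounding (uqrshrn #8) while alpha itself is preserved.
+static void ARGBAttenuatePixel_ReferenceSketch(const uint8 argb_in[4],
+                                               uint8 argb_out[4]) {
+  int a = argb_in[3];
+  argb_out[0] = (uint8)((argb_in[0] * a + 128) >> 8);  // B
+  argb_out[1] = (uint8)((argb_in[1] * a + 128) >> 8);  // G
+  argb_out[2] = (uint8)((argb_in[2] * a + 128) >> 8);  // R
+  argb_out[3] = (uint8)a;                              // A unchanged
+}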
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+#ifdef HAS_ARGBQUANTIZEROW_NEON
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
+
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_NEON
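+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of the
+// quantize formula stated above ARGBQuantizeRow_NEON:
+//   c' = (c * scale >> 16) * interval_size + interval_offset, clamped.
+// The NEON path approximates the first multiply with sqdmulh on
+// (scale >> 1), so results can differ by a small rounding amount.
+static uint8 QuantizeChannel_ReferenceSketch(uint8 c, int scale,
+                                             int interval_size,
+                                             int interval_offset) {
+  int q = ((c * scale) >> 16) * interval_size + interval_offset;
+  return (uint8)(q < 0 ? 0 : (q > 255 ? 255 : q));
+}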
+
+// Shade 8 pixels at a time by specified value.
+// NOTE: vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+#ifdef HAS_ARGBSHADEROW_NEON
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ MEMACCESS(1)
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
+ );
+}
+#endif // HAS_ARGBSHADEROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v26.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
+ );
+}
+#endif // HAS_ARGBGRAYROW_NEON
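+
+// A minimal scalar sketch (not part of libyuv, illustrative only) matching
+// the C formula quoted above ARGBGrayRow_NEON:
+//   y = (15*b + 75*g + 38*r + 64) >> 7, written to B, G and R; alpha kept.
+static void ARGBGrayPixel_ReferenceSketch(const uint8 argb_in[4],
+                                          uint8 argb_out[4]) {
+  int y = (15 * argb_in[0] + 75 * argb_in[1] + 38 * argb_in[2] + 64) >> 7;
+  argb_out[0] = argb_out[1] = argb_out[2] = (uint8)(y > 255 ? 255 : y);
+  argb_out[3] = argb_in[3];  // alpha unchanged
+}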
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // BB coefficient
+ "movi v29.8b, #98 \n" // BG coefficient
+ "movi v30.8b, #50 \n" // BR coefficient
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_NEON
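+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of the
+// sepia weights listed above ARGBSepiaRow_NEON; uqshrn gives an unsigned
+// saturating >> 7, so each result is clamped to 255.
+static void ARGBSepiaPixel_ReferenceSketch(uint8 argb[4]) {
+  int b = argb[0], g = argb[1], r = argb[2];
+  int sb = (17 * b + 68 * g + 35 * r) >> 7;
+  int sg = (22 * b + 88 * g + 45 * r) >> 7;
+  int sr = (24 * b + 98 * g + 50 * r) >> 7;
+  argb[0] = (uint8)(sb > 255 ? 255 : sb);
+  argb[1] = (uint8)(sg > 255 ? 255 : sg);
+  argb[2] = (uint8)(sr > 255 ? 255 : sr);  // alpha (argb[3]) unchanged
+}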
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_ARGBCOLORMATRIXROW_NEON
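+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of
+// ARGBColorMatrixRow_NEON: the 16 signed-byte matrix is applied row-major,
+// four coefficients per output channel (inputs in B,G,R,A order), followed
+// by a saturating >> 6 (sqshrun #6).
+static void ARGBColorMatrixPixel_ReferenceSketch(const uint8 argb_in[4],
+                                                 const int8* matrix_argb,
+                                                 uint8 argb_out[4]) {
+  int i, j;
+  for (j = 0; j < 4; ++j) {      // output B, G, R, A
+    int acc = 0;
+    for (i = 0; i < 4; ++i) {    // input  B, G, R, A
+      acc += argb_in[i] * matrix_argb[j * 4 + i];
+    }
+    acc >>= 6;
+    argb_out[j] = (uint8)(acc < 0 ? 0 : (acc > 255 ? 255 : acc));
+  }
+}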
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBADDROW_NEON
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+#endif // HAS_ARGBADDROW_NEON
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+#ifdef HAS_SOBELROW_NEON
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+}
+#endif // HAS_SOBELROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+#ifdef HAS_SOBELTOPLANEROW_NEON
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ // 16 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ MEMACCESS(2)
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1"
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_NEON
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+#ifdef HAS_SOBELXYROW_NEON
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3"
+ );
+}
+#endif // HAS_SOBELXYROW_NEON
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+#ifdef HAS_SOBELXROW_NEON
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ MEMACCESS(1)
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ MEMACCESS(1)
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ MEMACCESS(2)
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ MEMACCESS(2)
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ MEMACCESS(3)
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2LL), // %5
+ "r"(6LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // HAS_SOBELXROW_NEON
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+#ifdef HAS_SOBELYROW_NEON
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ MEMACCESS(1)
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ MEMACCESS(1)
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ MEMACCESS(1)
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1LL), // %4
+ "r"(6LL) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // HAS_SOBELYROW_NEON
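+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of the two
+// Sobel kernels above.  SobelX differences columns x and x+2 across the
+// top/center/bottom rows; SobelY differences the two rows at offsets x,
+// x+1 and x+2.  Both take the absolute value and saturate to 255.
+static uint8 SobelXPixel_ReferenceSketch(const uint8* top, const uint8* mid,
+                                         const uint8* bot, int x) {
+  int s = (top[x] - top[x + 2]) + 2 * (mid[x] - mid[x + 2]) +
+          (bot[x] - bot[x + 2]);
+  if (s < 0) s = -s;
+  return (uint8)(s > 255 ? 255 : s);
+}
+
+static uint8 SobelYPixel_ReferenceSketch(const uint8* row0, const uint8* row1,
+                                         int x) {
+  int s = (row0[x] - row1[x]) + 2 * (row0[x + 1] - row1[x + 1]) +
+          (row0[x + 2] - row1[x + 2]);
+  if (s < 0) s = -s;
+  return (uint8)(s > 255 ? 255 : s);
+}
+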
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_win.cc b/third_party/aom/third_party/libyuv/source/row_win.cc
new file mode 100644
index 000000000..71be268b4
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_win.cc
@@ -0,0 +1,6331 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
+ defined(_MSC_VER) && !defined(__clang__)
+#include <emmintrin.h>
+#include <tmmintrin.h> // For _mm_maddubs_epi16
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
+ defined(_MSC_VER) && !defined(__clang__)
+
+struct YuvConstants {
+ lvec8 kUVToB; // 0
+ lvec8 kUVToG; // 32
+ lvec8 kUVToR; // 64
+ lvec16 kUVBiasB; // 96
+ lvec16 kUVBiasG; // 128
+ lvec16 kUVBiasR; // 160
+ lvec16 kYToRgb; // 192
+};
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+ { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+ UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+ { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+ UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+ { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+ 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+ { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+ BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+ { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+ BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+ { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+ BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+ { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+ YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+// 64 bit
+#if defined(_M_X64)
+#if defined(HAS_I422TOARGBROW_SSSE3)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+
+ while (width > 0) {
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+ xmm1 = _mm_loadu_si128(&xmm0);
+ xmm2 = _mm_loadu_si128(&xmm0);
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
+ xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
+ xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
+ xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
+ xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
+ xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
+ xmm0 = _mm_adds_epi16(xmm0, xmm3);
+ xmm1 = _mm_adds_epi16(xmm1, xmm3);
+ xmm2 = _mm_adds_epi16(xmm2, xmm3);
+ xmm0 = _mm_srai_epi16(xmm0, 6);
+ xmm1 = _mm_srai_epi16(xmm1, 6);
+ xmm2 = _mm_srai_epi16(xmm2, 6);
+ xmm0 = _mm_packus_epi16(xmm0, xmm0);
+ xmm1 = _mm_packus_epi16(xmm1, xmm1);
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+ xmm1 = _mm_loadu_si128(&xmm0);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+ _mm_storeu_si128((__m128i *)dst_argb, xmm0);
+ _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
+
+ y_buf += 8;
+ u_buf += 4;
+ dst_argb += 32;
+ width -= 8;
+ }
+}
+#endif
+// 32 bit
+#else // defined(_M_X64)
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB.
+static const vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPEG full range.
+static const vec8 kARGBToYJ = {
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+static const vec8 kARGBToU = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToUJ = {
+ 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static const vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = {
+ -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+
+// Constants for BGRA.
+static const vec8 kBGRAToY = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static const vec8 kBGRAToU = {
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static const vec8 kBGRAToV = {
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR.
+static const vec8 kABGRToY = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static const vec8 kABGRToU = {
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static const vec8 kABGRToV = {
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+static const vec8 kRGBAToY = {
+ 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+ 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+ 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+// 7 bit fixed point 0.5.
+static const vec16 kAddYJ64 = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uvec8 kAddUV128 = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static const uvec16 kAddUVJ128 = {
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+static const uvec8 kShuffleMaskRAWToARGB = {
+ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGBToRAW for I422ToRAW. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRAW_0 = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked)
+void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0
+ punpckhwd xmm1, xmm1
+ por xmm0, xmm5
+ por xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_J400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked)
+void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ convertloop:
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8
+ vpunpcklbw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ vpunpckhwd ymm1, ymm0, ymm0
+ vpunpcklwd ymm0, ymm0, ymm0
+ vpor ymm0, ymm0, ymm5
+ vpor ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_J400TOARGBROW_AVX2
+
+__declspec(naked)
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_rgb24
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRGB24ToARGB
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] }
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] }
+ pshufb xmm0, xmm4
+ movdqu [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqu [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqu [edx + 16], xmm1
+ por xmm3, xmm5
+ movdqu [edx + 48], xmm3
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRAWToARGB
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] }
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] }
+ pshufb xmm0, xmm4
+ movdqu [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqu [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqu [edx + 16], xmm1
+ por xmm3, xmm5
+ movdqu [edx + 48], xmm3
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked)
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ psllw xmm4, 10
+ psrlw xmm4, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
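+
+// A minimal scalar sketch (not part of libyuv, illustrative only) of the bit
+// replication the pmul trick above performs: a 5-bit field becomes
+// (v << 3) | (v >> 2) and the 6-bit green field becomes (v << 2) | (v >> 4),
+// which is exactly what multiplying the high-aligned field by (256 + 8)
+// (or by (256 + 4) << 5 for green) and keeping the high 16 bits computes.
+static void RGB565ToARGBPixel_ReferenceSketch(uint16 rgb565, uint8 argb[4]) {
+  int b5 = rgb565 & 0x1f;
+  int g6 = (rgb565 >> 5) & 0x3f;
+  int r5 = (rgb565 >> 11) & 0x1f;
+  argb[0] = (uint8)((b5 << 3) | (b5 >> 2));  // B
+  argb[1] = (uint8)((g6 << 2) | (g6 >> 4));  // G
+  argb[2] = (uint8)((r5 << 3) | (r5 >> 2));  // R
+  argb[3] = 255;                             // A
+}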
+
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+__declspec(naked)
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
+ vpsllw ymm4, ymm4, 10
+ vpsrlw ymm4, ymm4, 5
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
+ vpand ymm1, ymm0, ymm3 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpand ymm0, ymm0, ymm4 // G in middle 6 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
+ vpor ymm0, ymm0, ymm7 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu   [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu   [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_RGB565TOARGBROW_AVX2
+
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+__declspec(naked)
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ vmovd xmm5, eax
+ vbroadcastss ymm5, xmm5
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ vbroadcastss ymm6, xmm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+ vpsllw ymm3, ymm3, 11
+ vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+ vpsllw ymm7, ymm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
+ vpsllw ymm1, ymm0, 1 // R in upper 5 bits
+ vpsllw ymm2, ymm0, 11 // B in upper 5 bits
+ vpand ymm1, ymm1, ymm3
+ vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
+ vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
+ vpsllw ymm1, ymm1, 8
+ vpor ymm1, ymm1, ymm2 // RB
+ vpsraw ymm2, ymm0, 8 // A
+ vpand ymm0, ymm0, ymm4 // G in middle 5 bits
+ vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
+ vpand ymm2, ymm2, ymm7
+ vpor ymm0, ymm0, ymm2 // AG
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm1, ymm1, 0xd8
+ vpunpckhbw ymm2, ymm1, ymm0
+ vpunpcklbw ymm1, ymm1, ymm0
+ vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+__declspec(naked)
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ vmovd xmm4, eax
+ vbroadcastss ymm4, xmm4
+ vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
+ vpand ymm2, ymm0, ymm5 // mask high nibbles
+ vpand ymm0, ymm0, ymm4 // mask low nibbles
+ vpsrlw ymm3, ymm2, 4
+ vpsllw ymm1, ymm0, 4
+ vpor ymm2, ymm2, ymm3
+ vpor ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // mutate for unpack
+ vpermq ymm2, ymm2, 0xd8
+ vpunpckhbw ymm1, ymm0, ymm2
+ vpunpcklbw ymm0, ymm0, ymm2
+ vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
+ vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
+ lea eax, [eax + 32]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGB4444TOARGBROW_AVX2
+
+// 24 instructions
+__declspec(naked)
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
+ psrlw xmm4, 6
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of 1555
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psllw xmm1, 1 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pand xmm1, xmm3
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // G in middle 5 bits
+ psraw xmm2, 8 // A
+ pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
+ pand xmm2, xmm7
+ por xmm0, xmm2 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// 18 instructions.
+__declspec(naked)
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0
+ movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
+ pslld xmm5, 4
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // mask low nibbles
+ pand xmm2, xmm5 // mask high nibbles
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ psllw xmm1, 4
+ psrlw xmm3, 4
+ por xmm0, xmm1
+ por xmm2, xmm3
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRGB24
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqu [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRAW
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqu [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+// 4 pixels
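+// Each ARGB dword is reduced to RGB565 in place: B >> 3 keeps 5 bits at
+// 4..0, G >> 5 (masked) lands at 10..5, and R reaches 15..11 via a left
+// shift, arithmetic right shift and mask; the sign extension from psrad
+// keeps packssdw's signed saturation from clipping the packed words.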
+__declspec(naked)
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+// 4 pixels
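+// The 4-byte dither pattern is expanded so each of the 4 pixels in a block
+// gets its own dither byte added (with unsigned saturation) to every
+// channel before the values are truncated to 5/6/5 bits.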
+__declspec(naked)
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix) {
+ __asm {
+
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ movd xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // pix
+ punpcklbw xmm6, xmm6 // make dither 16 bytes
+ movdqa xmm7, xmm6
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ paddusb xmm0, xmm6 // add dither
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
+__declspec(naked)
+void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
+ const uint32 dither4, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ vbroadcastss xmm6, [esp + 12] // dither4
+ mov ecx, [esp + 16] // pix
+ vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
+ vpermq ymm6, ymm6, 0xd8
+ vpunpcklwd ymm6, ymm6, ymm6
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpaddusb ymm0, ymm0, ymm6 // add dither
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565DITHERROW_AVX2
+
+// TODO(fbarchard): Improve sign extension/packing.
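+// The alpha bit comes from an arithmetic shift: psrad by 16 fills the top
+// of each dword with alpha's sign bit, so after masking with 0xffff8000
+// bit 15 holds the 1-bit A and packssdw passes the low word through
+// unchanged.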
+__declspec(naked)
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
+ psrld xmm4, 27
+ movdqa xmm5, xmm4 // generate mask 0x000003e0
+ pslld xmm5, 5
+ movdqa xmm6, xmm4 // generate mask 0x00007c00
+ pslld xmm6, 10
+ pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
+ pslld xmm7, 15
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ movdqa xmm3, xmm0 // R
+ psrad xmm0, 16 // A
+ psrld xmm1, 3 // B
+ psrld xmm2, 6 // G
+ psrld xmm3, 9 // R
+ pand xmm0, xmm7 // A
+ pand xmm1, xmm4 // B
+ pand xmm2, xmm5 // G
+ pand xmm3, xmm6 // R
+ por xmm0, xmm1 // BA
+ por xmm2, xmm3 // GR
+ por xmm0, xmm2 // BGRA
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
+ psllw xmm4, 12
+ movdqa xmm3, xmm4 // generate mask 0x00f000f0
+ psrlw xmm3, 8
+
+ convertloop:
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0
+ pand xmm0, xmm3 // low nibble
+ pand xmm1, xmm4 // high nibble
+ psrld xmm0, 4
+ psrld xmm1, 8
+ por xmm0, xmm1
+ packuswb xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked)
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackusdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked)
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm4, ymm4, ymm4
+ vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
+ vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
+ vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
+ vpslld ymm7, ymm7, 15
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm3, ymm0, 9 // R
+ vpsrld ymm2, ymm0, 6 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrad ymm0, ymm0, 16 // A
+ vpand ymm3, ymm3, ymm6 // R
+ vpand ymm2, ymm2, ymm5 // G
+ vpand ymm1, ymm1, ymm4 // B
+ vpand ymm0, ymm0, ymm7 // A
+ vpor ymm0, ymm0, ymm1 // BA
+ vpor ymm2, ymm2, ymm3 // GR
+ vpor ymm0, ymm0, ymm2 // BGRA
+ vpackssdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked)
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
+ vpsllw ymm4, ymm4, 12
+ vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpand ymm1, ymm0, ymm4 // high nibble
+ vpand ymm0, ymm0, ymm3 // low nibble
+ vpsrld ymm1, ymm1, 8
+ vpsrld ymm0, ymm0, 4
+ vpor ymm0, ymm0, ymm1
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB4444ROW_AVX2
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
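+// The B,G,R bytes of each pixel are weighted with pmaddubsw against the
+// kARGBToY coefficients, pairs are summed with phaddw, scaled down by
+// 7 bits and offset by kAddY16 for limited-range Y.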
+__declspec(naked)
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kARGBToY
+ movdqa xmm5, kAddY16
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but with different coefficients; does not add 16, and rounds the result.
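+// kAddYJ64 adds 64 (0.5 in the 7-bit fixed-point scale) before the shift,
+// so the JPeg-range Y values are rounded rather than truncated.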
+__declspec(naked)
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ paddw xmm2, xmm5
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd table to undo the vphaddw + vpackuswb mutation.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
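+// On AVX2, vphaddw and vpackuswb operate within each 128-bit lane, so the
+// 32 Y bytes come out lane-interleaved; the vpermd with the table above
+// restores pixel order.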
+
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked)
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ vbroadcastf128 ymm4, kARGBToY
+ vbroadcastf128 ymm5, kAddY16
+ vmovdqu ymm6, kPermdARGBToY_AVX
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vpaddb ymm0, ymm0, ymm5 // add 16 for Y
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked)
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ vbroadcastf128 ymm4, kARGBToYJ
+ vbroadcastf128 ymm5, kAddYJ64
+ vmovdqu ymm6, kPermdARGBToY_AVX
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
+ vpaddw ymm2, ymm2, ymm5
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
+__declspec(naked)
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kBGRAToY
+ movdqa xmm5, kAddY16
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kABGRToY
+ movdqa xmm5, kAddY16
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kRGBAToY
+ movdqa xmm5, kAddY16
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
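+// U and V are computed on 2x2 subsampled ARGB: two rows are averaged with
+// pavgb, then the shufps 0x88/0xdd pair separates even and odd pixels so a
+// second pavgb finishes the horizontal average before the pmaddubsw U and V
+// coefficients are applied.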
+__declspec(naked)
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUVJ128
+ movdqa xmm6, kARGBToVJ
+ movdqa xmm7, kARGBToUJ
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm1, xmm5
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked)
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vbroadcastf128 ymm5, kAddUV128
+ vbroadcastf128 ymm6, kARGBToV
+ vbroadcastf128 ymm7, kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88
+ vshufps ymm0, ymm0, ymm1, 0xdd
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpsraw ymm1, ymm1, 8
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
+ vpaddb ymm0, ymm0, ymm5 // -> unsigned
+
+ // step 3 - store 16 U and 16 V values
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+__declspec(naked)
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* convert to U and V */
+ movdqu xmm0, [eax] // U
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ movdqu [edx], xmm0
+
+ movdqu xmm0, [eax] // V
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm6
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm2, xmm6
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ lea eax, [eax + 64]
+ movdqu [edx + edi], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked)
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked)
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kBGRAToV
+ movdqa xmm7, kBGRAToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kABGRToV
+ movdqa xmm7, kABGRToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm5, kAddUV128
+ movdqa xmm6, kRGBAToV
+ movdqa xmm7, kRGBAToU
+ sub edi, edx // stride from u to v
+
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+// Read 16 UV from 444
+#define READYUV444_AVX2 __asm { \
+ __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \
+ __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpermq ymm1, ymm1, 0xd8 \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ }
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ }
+
+// Read 4 UV from 411, upsample to 16 UV.
+#define READYUV411_AVX2 __asm { \
+ __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \
+ __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 4] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
+ }
+
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ }
+
+// Convert 16 pixels: 16 UV and 16 Y.
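+// The conversion is 6-bit fixed point: pmaddubsw scales the interleaved UV
+// by the kUVTo{B,G,R} coefficients, the results are subtracted from the
+// per-channel biases, the Y term (scaled by kYToRgb with vpmulhuw) is
+// added, and the sums are shifted right by 6 and saturated to bytes.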
+#define YUVTORGB_AVX2(YuvConstants) __asm { \
+ /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
+ __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \
+ __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \
+ __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasR \
+ __asm vpsubw ymm2, ymm3, ymm2 \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasG \
+ __asm vpsubw ymm1, ymm3, ymm1 \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasB \
+ __asm vpsubw ymm0, ymm3, ymm0 \
+ /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm vmovdqu xmm3, [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklbw ymm3, ymm3, ymm3 \
+ __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \
+ __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
+ __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
+ __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
+ __asm vpsraw ymm0, ymm0, 6 \
+ __asm vpsraw ymm1, ymm1, 6 \
+ __asm vpsraw ymm2, ymm2, 6 \
+ __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
+ __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
+ __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+ }
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 __asm { \
+ /* Step 3: Weave into ARGB */ \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vmovdqu 0[edx], ymm1 \
+ __asm vmovdqu 32[edx], ymm0 \
+ __asm lea edx, [edx + 64] \
+ }
+
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_J422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void J422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvJConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_J422TOARGBROW_AVX2
+
+#ifdef HAS_I444TOARGBROW_AVX2
+// 16 pixels
+// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I444ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV444_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I444TOARGBROW_AVX2
+
+#ifdef HAS_I411TOARGBROW_AVX2
+// 16 pixels
+// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void I411ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV411_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I411TOARGBROW_AVX2
+
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV12ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV12_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked)
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV12_AVX2
+ YUVTORGB_AVX2(kYvuConstants)
+ STOREARGB_AVX2
+
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_NV21TOARGBROW_AVX2
+
+#ifdef HAS_I422TOBGRAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
+__declspec(naked)
+void I422ToBGRARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into BGRA
+ vpunpcklbw ymm1, ymm1, ymm0 // GB
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm5, ymm2 // AR
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels
+ vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TOBGRAROW_AVX2
+
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
+__declspec(naked)
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into RGBA
+ vpunpcklbw ymm1, ymm1, ymm2 // GR
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm5, ymm0 // AB
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels
+ vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TORGBAROW_AVX2
+
+#ifdef HAS_I422TOABGRROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
+__declspec(naked)
+void I422ToABGRRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ABGR
+ vpunpcklbw ymm1, ymm2, ymm1 // RG
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm0, ymm5 // BA
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I422TOABGRROW_AVX2
+
+#if defined(HAS_I422TOARGBROW_SSSE3)
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+
+// Read 8 UV from 444.
+#define READYUV444 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ }
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 2 UV from 411, upsample to 8 UV.
+#define READYUV411 __asm { \
+ __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
+ __asm movd xmm0, ebx \
+ __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm movd xmm1, ebx \
+ __asm lea esi, [esi + 2] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
+ }
+
+// Read 4 UV from NV12, upsample to 8 UV.
+#define READNV12 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB(YuvConstants) __asm { \
+ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm movdqa xmm3, xmm0 \
+ __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \
+ __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \
+ __asm psubw xmm0, xmm1 \
+ __asm movdqa xmm1, YuvConstants.kUVBiasG \
+ __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \
+ __asm psubw xmm1, xmm2 \
+ __asm movdqa xmm2, YuvConstants.kUVBiasR \
+ __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \
+ __asm psubw xmm2, xmm3 \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm3, xmm3 \
+ __asm pmulhuw xmm3, YuvConstants.kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// Store 8 ARGB values.
+#define STOREARGB __asm { \
+ /* Step 3: Weave into ARGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm0 \
+ __asm movdqu 16[edx], xmm1 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 BGRA values.
+#define STOREBGRA __asm { \
+ /* Step 3: Weave into BGRA */ \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 ABGR values.
+#define STOREABGR __asm { \
+ /* Step 3: Weave into ABGR */ \
+ __asm punpcklbw xmm2, xmm1 /* RG */ \
+ __asm punpcklbw xmm0, xmm5 /* BA */ \
+ __asm movdqa xmm1, xmm2 \
+ __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm2 \
+ __asm movdqu 16[edx], xmm1 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 RGBA values.
+#define STORERGBA __asm { \
+ /* Step 3: Weave into RGBA */ \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm movdqu 0[edx], xmm5 \
+ __asm movdqu 16[edx], xmm0 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 RGB24 values.
+#define STORERGB24 __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24] \
+ }
+
+// Store 8 RAW values.
+#define STORERAW __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RAW */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24] \
+ }
+
+// Store 8 RGB565 values.
+#define STORERGB565 __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
+ __asm packssdw xmm0, xmm1 \
+ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm lea edx, [edx + 16] \
+ }
+
+// 8 pixels.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV444
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
+__declspec(naked)
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgb24
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ movdqa xmm5, kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, kShuffleMaskARGBToRGB24
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STORERGB24
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
+__declspec(naked)
+void I422ToRAWRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_raw,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // raw
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ movdqa xmm5, kShuffleMaskARGBToRAW_0
+ movdqa xmm6, kShuffleMaskARGBToRAW
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STORERAW
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
+__declspec(naked)
+void I422ToRGB565Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb565_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgb565
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
+ psrld xmm5, 27
+ pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
+ psrld xmm6, 26
+ pslld xmm6, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
+ pslld xmm7, 11
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STORERGB565
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// JPeg color space version of I422ToARGB
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void J422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvJConstants)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked)
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ecx, [esp + 12 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV411 // modifies EBX
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READNV12
+ YUVTORGB(kYuvConstants)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels.
+// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked)
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READNV12
+ YUVTORGB(kYvuConstants)
+ STOREARGB
+
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREBGRA
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STOREABGR
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+
+ convertloop:
+ READYUV422
+ YUVTORGB(kYuvConstants)
+ STORERGBA
+
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_I400TOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
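+// Grey expansion: each Y is mapped to G = (Y - 16) * 1.164 in fixed point
+// using the two constants below, then the single grey value is woven into
+// B, G and R with alpha forced to 0xff000000.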
+__declspec(naked)
+void I400ToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ movd xmm2, eax
+ pshufd xmm2, xmm2,0
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0 // Y.Y
+ pmulhuw xmm0, xmm2
+ psubusw xmm0, xmm3
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0 // G
+
+ // Step 2: Weave into ARGB
+ punpcklbw xmm0, xmm0 // GG
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm1 // BGRA next 4 pixels
+ por xmm0, xmm4
+ por xmm1, xmm4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ vmovd xmm2, eax
+ vbroadcastss ymm2, xmm2
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ vmovd xmm3, eax
+ vbroadcastss ymm3, xmm3
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+ vpslld ymm4, ymm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+ vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+ vpmulhuw ymm0, ymm0, ymm2
+ vpsubusw ymm0, ymm0, ymm3
+ vpsrlw ymm0, ymm0, 6
+ vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+
+ // TODO(fbarchard): Weave alpha with unpack.
+ // Step 2: Weave into ARGB
+ vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm4
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_I400TOARGBROW_AVX2
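+
+// Reference sketch (scalar C++, for illustration only; not used by the SIMD
+// paths above): I400ToARGB expands each grey Y value with the BT.601 range
+// expansion G = (Y - 16) * 1.164, clamps it to [0, 255], and writes it to
+// B, G and R with A = 255.  The function name is illustrative only.
+static void I400ToARGBRow_Sketch(const uint8* y_buf, uint8* rgb_buf,
+                                 int width) {
+  for (int x = 0; x < width; ++x) {
+    int g = ((y_buf[x] - 16) * 1164 + 500) / 1000;  // 1.164 in integer math.
+    if (g < 0) g = 0;
+    if (g > 255) g = 255;
+    rgb_buf[0] = (uint8)g;  // B
+    rgb_buf[1] = (uint8)g;  // G
+    rgb_buf[2] = (uint8)g;  // R
+    rgb_buf[3] = 255;       // A
+    rgb_buf += 4;
+  }
+}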
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked)
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, kShuffleMirror
+
+ convertloop:
+ movdqu xmm0, [eax - 16 + ecx]
+ pshufb xmm0, xmm5
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+__declspec(naked)
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vbroadcastf128 ymm5, kShuffleMirror
+
+ convertloop:
+ vmovdqu ymm0, [eax - 32 + ecx]
+ vpshufb ymm0, ymm0, ymm5
+ vpermq ymm0, ymm0, 0x4e // swap high and low halves
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+__declspec(naked)
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ movdqu xmm0, [eax - 16 + ecx]
+ movdqa xmm1, xmm0 // swap bytes
+ psllw xmm0, 8
+ psrlw xmm1, 8
+ por xmm0, xmm1
+ pshuflw xmm0, xmm0, 0x1b // swap words
+ pshufhw xmm0, xmm0, 0x1b
+ pshufd xmm0, xmm0, 0x4e // swap qwords
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked)
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm1, kShuffleMirrorUV
+ lea eax, [eax + ecx * 2 - 16]
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax - 16]
+ pshufb xmm0, xmm1
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [edx + edi], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSE2
+__declspec(naked)
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
+
+ convertloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax - 16]
+ pshufd xmm0, xmm0, 0x1b
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_SSE2
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked)
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vmovdqu ymm5, kARGBShuffleMirror_AVX2
+
+ convertloop:
+ vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked)
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqu [edx], xmm0
+ movdqu [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked)
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm2, ymm0, 8 // odd bytes
+ vpsrlw ymm3, ymm1, 8
+ vpand ymm0, ymm0, ymm5 // even bytes
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpackuswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 0xd8
+ vpermq ymm2, ymm2, 0xd8
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + edi], ymm2
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked)
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 U's
+ movdqu xmm1, [eax + edx] // and 16 V's
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked)
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 32 U's
+ vmovdqu ymm1, [eax + edx] // and 32 V's
+ lea eax, [eax + 32]
+ vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+ vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+ vextractf128 [edi], ymm2, 0 // bytes 0..15
+ vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
+ vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
+ vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
+ lea edi, [edi + 64]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_AVX2
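+
+// Reference sketch (scalar C++, for illustration only; not used by the SIMD
+// paths above): SplitUVRow de-interleaves a packed UVUVUV... row into
+// separate U and V planes; MergeUVRow is the exact inverse.  The function
+// name is illustrative only.
+static void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u,
+                              uint8* dst_v, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    dst_u[x] = src_uv[2 * x + 0];  // even bytes are U
+    dst_v[x] = src_uv[2 * x + 1];  // odd bytes are V
+  }
+}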
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
+__declspec(naked)
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
+__declspec(naked)
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 64
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_COPYROW_AVX
+
+// Multiple of 1.
+__declspec(naked)
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, esi
+ mov edx, edi
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ rep movsb
+ mov edi, edx
+ mov esi, eax
+ ret
+ }
+}
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+__declspec(naked)
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ convertloop:
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
+ pand xmm2, xmm0
+ pand xmm3, xmm0
+ pand xmm4, xmm1
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked)
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ convertloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + 32]
+ lea eax, [eax + 64]
+ vpblendvb ymm1, ymm1, [edx], ymm0
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+__declspec(naked)
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ convertloop:
+ movq xmm2, qword ptr [eax] // 8 Y's
+ lea eax, [eax + 8]
+ punpcklbw xmm2, xmm2
+ punpckhwd xmm3, xmm2
+ punpcklwd xmm2, xmm2
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
+ pand xmm2, xmm0
+ pand xmm3, xmm0
+ pand xmm4, xmm1
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+__declspec(naked)
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ convertloop:
+ vpmovzxbd ymm1, qword ptr [eax]
+ vpmovzxbd ymm2, qword ptr [eax + 8]
+ lea eax, [eax + 16]
+ vpslld ymm1, ymm1, 24
+ vpslld ymm2, ymm2, 24
+ vpblendvb ymm1, ymm1, [edx], ymm0
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
+// Write 'count' bytes using an 8 bit value repeated.
+// Count should be a multiple of 4.
+__declspec(naked)
+void SetRow_X86(uint8* dst, uint8 v8, int count) {
+ __asm {
+ movzx eax, byte ptr [esp + 8] // v8
+ mov edx, 0x01010101 // Duplicate byte to all bytes.
+ mul edx // overwrites edx with upper part of result.
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep stosd
+ mov edi, edx
+ ret
+ }
+}
+
+// Write 'count' bytes using an 8 bit value repeated.
+__declspec(naked)
+void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v8
+ mov ecx, [esp + 12] // count
+ rep stosb
+ mov edi, edx
+ ret
+ }
+}
+
+// Write 'count' 32 bit values.
+__declspec(naked)
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ rep stosd
+ mov edi, edx
+ ret
+ }
+}
+#endif // HAS_SETROW_X86
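+
+// Reference sketch (scalar C++, for illustration only): SetRow_X86 splats the
+// byte into a dword with a single multiply by 0x01010101 and then stores
+// whole dwords with rep stosd, which is why count should be a multiple of 4.
+// The sketch assumes dst is 4-byte aligned; the name is illustrative only.
+static void SetRow_Sketch(uint8* dst, uint8 v8, int count) {
+  uint32 v32 = (uint32)v8 * 0x01010101u;  // v8 replicated into all 4 bytes.
+  uint32* d = (uint32*)(void*)dst;
+  for (int x = 0; x < count / 4; ++x) {
+    d[x] = v32;  // one dword store per iteration, like rep stosd.
+  }
+}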
+
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked)
+void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // even bytes are Y
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToYRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // odd bytes are Y
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked)
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_SSE2
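+
+// Reference sketch (scalar C++, for illustration only; not used by the SIMD
+// paths above): YUY2 packs pixels as Y0 U Y1 V and UYVY packs them as
+// U Y0 V Y1, so extracting Y keeps every other byte and extracting UV keeps
+// the remaining bytes.  The two-row UV variants above additionally average
+// two rows with pavgb/vpavgb.  Function names are illustrative only.
+static void YUY2ToYRow_Sketch(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    dst_y[x] = src_yuy2[2 * x];  // even bytes are Y (odd bytes in UYVY).
+  }
+}
+static void YUY2ToUV422Row_Sketch(const uint8* src_yuy2, uint8* dst_u,
+                                  uint8* dst_v, int pix) {
+  for (int x = 0; x < pix; x += 2) {  // one U and one V per 2 pixels.
+    dst_u[x / 2] = src_yuy2[2 * x + 1];
+    dst_v[x / 2] = src_yuy2[2 * x + 3];
+  }
+}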
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time.
+__declspec(naked)
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ sub ecx, 4
+ jl convertloop4b // less than 4 pixels?
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge convertloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+// Same as SSE2, but replaces:
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+// pshuflw xmm3, xmm3, 0F5h
+// with..
+// pshufb xmm3, kShuffleAlpha // alpha
+// Blend 4 pixels at a time.
+
+__declspec(naked)
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 0x0001
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ sub ecx, 4
+ jl convertloop4b // less than 4 pixels?
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge convertloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
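+
+// Reference sketch (scalar C++, for illustration only; not used by the SIMD
+// paths above): both blend rows compute "src over dst" per channel, scaling
+// the destination by (256 - src alpha) / 256 rather than an exact divide by
+// 255, and force the output alpha to 255.  The name is illustrative only.
+static uint32 BlendPixel_Sketch(uint32 src, uint32 dst) {
+  uint32 fb = 256 - (src >> 24);  // background weight from source alpha.
+  uint32 b = (src & 0xff) + (((dst & 0xff) * fb) >> 8);
+  uint32 g = ((src >> 8) & 0xff) + ((((dst >> 8) & 0xff) * fb) >> 8);
+  uint32 r = ((src >> 16) & 0xff) + ((((dst >> 16) & 0xff) * fb) >> 8);
+  if (b > 255) b = 255;  // paddusb saturates; clamp the same way here.
+  if (g > 255) g = 255;
+  if (r > 255) r = 255;
+  return 0xff000000u | (r << 16) | (g << 8) | b;
+}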
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+__declspec(naked)
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
+ psrld xmm5, 8
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm0 // first 2
+ pshufhw xmm2, xmm0, 0FFh // 8 alpha words
+ pshuflw xmm2, xmm2, 0FFh
+ pmulhuw xmm0, xmm2 // rgb * a
+ movdqu xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm1 // next 2 pixels
+ pshufhw xmm2, xmm1, 0FFh // 8 alpha words
+ pshuflw xmm2, xmm2, 0FFh
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqu xmm2, [eax] // alphas
+ lea eax, [eax + 16]
+ psrlw xmm0, 8
+ pand xmm2, xmm4
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ pand xmm0, xmm5 // keep original alphas
+ por xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked)
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ pcmpeqb xmm3, xmm3 // generate mask 0xff000000
+ pslld xmm3, 24
+ movdqa xmm4, kShuffleAlpha0
+ movdqa xmm5, kShuffleAlpha1
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ pshufb xmm0, xmm4 // isolate first 2 alphas
+ movdqu xmm1, [eax] // read 4 pixels
+ punpcklbw xmm1, xmm1 // first 2 pixel rgbs
+ pmulhuw xmm0, xmm1 // rgb * a
+ movdqu xmm1, [eax] // read 4 pixels
+ pshufb xmm1, xmm5 // isolate next 2 alphas
+ movdqu xmm2, [eax] // read 4 pixels
+ punpckhbw xmm2, xmm2 // next 2 pixel rgbs
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqu xmm2, [eax] // mask original alpha
+ lea eax, [eax + 16]
+ pand xmm2, xmm3
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ por xmm0, xmm2 // copy original alpha
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+__declspec(naked)
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm4, kShuffleAlpha_AVX2
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpshufb ymm2, ymm0, ymm4 // low 4 alphas
+ vpshufb ymm3, ymm1, ymm4 // high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * a
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * a
+ vpand ymm6, ymm6, ymm5 // isolate alpha
+ vpsrlw ymm0, ymm0, 8
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vpor ymm0, ymm0, ymm6 // copy original alpha
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
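+
+// Reference sketch (scalar C++, for illustration only): attenuation
+// premultiplies B, G and R by alpha and leaves alpha itself unchanged.  The
+// SSE2/SSSE3/AVX2 rows above use a 16-bit fixed-point approximation of the
+// divide rather than an exact b * a / 255.  The name is illustrative only.
+static uint32 AttenuatePixel_Sketch(uint32 bgra) {
+  uint32 a = bgra >> 24;
+  uint32 b = ((bgra & 0xff) * a) / 255;
+  uint32 g = (((bgra >> 8) & 0xff) * a) / 255;
+  uint32 r = (((bgra >> 16) & 0xff) * a) / 255;
+  return (a << 24) | (r << 16) | (g << 8) | b;
+}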
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+__declspec(naked)
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb0
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov ecx, [esp + 8 + 12] // width
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm0, xmm2 // rgb * a
+
+ movdqu xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // fourth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm1, xmm2 // rgb * a
+ lea eax, [eax + 16]
+
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#else // USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
+
+ push esi
+ push edi
+
+ convertloop:
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ // end of VPGATHER
+
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // USE_GATHER
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked)
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+
+ convertloop:
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm2, 24
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked)
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, kARGBToSepiaB
+ movdqa xmm3, kARGBToSepiaG
+ movdqa xmm4, kARGBToSepiaR
+
+ convertloop:
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ movdqu [eax], xmm0
+ movdqu [eax + 16], xmm1
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
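+
+// Reference sketch (scalar C++, for illustration only; not used by the SIMD
+// path above): per pixel, the sepia row applies the fixed matrix documented
+// above to the original B/G/R, saturates, and keeps the original alpha.
+// The function name is illustrative only.
+static void SepiaPixel_Sketch(uint8* bgra) {
+  int b = bgra[0], g = bgra[1], r = bgra[2];
+  int sb = (r * 35 + g * 68 + b * 17) >> 7;
+  int sg = (r * 45 + g * 88 + b * 22) >> 7;
+  int sr = (r * 50 + g * 98 + b * 24) >> 7;
+  bgra[0] = (uint8)(sb > 255 ? 255 : sb);
+  bgra[1] = (uint8)(sg > 255 ? 255 : sg);
+  bgra[2] = (uint8)(sr > 255 ? 255 : sr);  // bgra[3] (alpha) is unchanged.
+}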
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked)
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* matrix_argb */
+ movdqu xmm5, [ecx]
+ pshufd xmm2, xmm5, 0x00
+ pshufd xmm3, xmm5, 0x55
+ pshufd xmm4, xmm5, 0xaa
+ pshufd xmm5, xmm5, 0xff
+ mov ecx, [esp + 16] /* width */
+
+ convertloop:
+ movdqu xmm0, [eax] // B
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm7, xmm2
+ movdqu xmm6, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm6, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddsw xmm0, xmm7 // B
+ phaddsw xmm6, xmm1 // G
+ psraw xmm0, 6 // B
+ psraw xmm6, 6 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm6, xmm6 // 8 G values
+ punpcklbw xmm0, xmm6 // 8 BG values
+ movdqu xmm1, [eax] // R
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm7, xmm4
+ phaddsw xmm1, xmm7 // R
+ movdqu xmm6, [eax] // A
+ movdqu xmm7, [eax + 16]
+ pmaddubsw xmm6, xmm5
+ pmaddubsw xmm7, xmm5
+ phaddsw xmm6, xmm7 // A
+ psraw xmm1, 6 // R
+ psraw xmm6, 6 // A
+ packuswb xmm1, xmm1 // 8 R values
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm1, xmm6 // 8 RA values
+ movdqa xmm6, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm1 // BGRA first 4
+ punpckhwd xmm6, xmm1 // BGRA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm6
+ lea eax, [eax + 32]
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+__declspec(naked)
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ movd xmm2, [esp + 8] /* scale */
+ movd xmm3, [esp + 12] /* interval_size */
+ movd xmm4, [esp + 16] /* interval_offset */
+ mov ecx, [esp + 20] /* width */
+ pshuflw xmm2, xmm2, 040h
+ pshufd xmm2, xmm2, 044h
+ pshuflw xmm3, xmm3, 040h
+ pshufd xmm3, xmm3, 044h
+ pshuflw xmm4, xmm4, 040h
+ pshufd xmm4, xmm4, 044h
+ pxor xmm5, xmm5 // constant 0
+ pcmpeqb xmm6, xmm6 // generate mask 0xff000000
+ pslld xmm6, 24
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ movdqu xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
+ pmulhuw xmm1, xmm2
+ pmullw xmm0, xmm3 // * interval_size
+ movdqu xmm7, [eax] // read 4 pixels
+ pmullw xmm1, xmm3
+ pand xmm7, xmm6 // mask alpha
+ paddw xmm0, xmm4 // + interval_size / 2
+ paddw xmm1, xmm4
+ packuswb xmm0, xmm1
+ por xmm0, xmm7
+ movdqu [eax], xmm0
+ lea eax, [eax + 16]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+__declspec(naked)
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ movd xmm2, [esp + 16] // value
+ punpcklbw xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked)
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm2, [esi] // read 4 pixels from src_argb1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ lea eax, [eax + 16]
+ lea esi, [esi + 16]
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked)
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ sub ecx, 4
+ jl convertloop49
+
+ convertloop4:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge convertloop4
+
+ convertloop49:
+ add ecx, 4 - 1
+ jl convertloop19
+
+ convertloop1:
+ movd xmm0, [eax] // read 1 pixel from src_argb0
+ lea eax, [eax + 4]
+ movd xmm1, [esi] // read 1 pixel from src_argb1
+ lea esi, [esi + 4]
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge convertloop1
+
+ convertloop19:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
+ psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked)
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ convertloop:
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vpunpcklbw ymm0, ymm1, ymm1 // low 4
+ vpunpckhbw ymm1, ymm1, ymm1 // high 4
+ vpunpcklbw ymm2, ymm3, ymm5 // low 4
+ vpunpckhbw ymm3, ymm3, ymm5 // high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpackuswb ymm0, ymm0, ymm1
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked)
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+__declspec(naked)
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ lea eax, [eax + 32]
+ vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+__declspec(naked)
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y0
+ mov esi, [esp + 8 + 8] // src_y1
+ mov edi, [esp + 8 + 12] // src_y2
+ mov edx, [esp + 8 + 16] // dst_sobelx
+ mov ecx, [esp + 8 + 20] // width
+ sub esi, eax
+ sub edi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
+ movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+__declspec(naked)
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_y0
+ mov esi, [esp + 4 + 8] // src_y1
+ mov edx, [esp + 4 + 12] // dst_sobely
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
+ movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm1
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
+ packuswb xmm0, xmm0
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELYROW_SSE2
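+
+// Reference sketch (scalar C++, for illustration only; not used by the SIMD
+// paths above): for one output pixel, SobelX and SobelY are the absolute,
+// saturated responses of the 3x3 kernels documented above (the middle tap is
+// applied twice via the repeated paddw xmm1).  Names are illustrative only.
+static uint8 SobelX_Sketch(const uint8* y0, const uint8* y1, const uint8* y2) {
+  int s = (y0[0] - y0[2]) + 2 * (y1[0] - y1[2]) + (y2[0] - y2[2]);
+  if (s < 0) s = -s;
+  return (uint8)(s > 255 ? 255 : s);
+}
+static uint8 SobelY_Sketch(const uint8* y0, const uint8* y1) {
+  int s = (y0[0] - y1[0]) + 2 * (y0[1] - y1[1]) + (y0[2] - y1[2]);
+  if (s < 0) s = -s;
+  return (uint8)(s > 255 ? 255 : s);
+}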
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked)
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ pcmpeqb xmm5, xmm5 // alpha 255
+ pslld xmm5, 24 // 0xff000000
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqa xmm2, xmm0 // GG
+ punpcklbw xmm2, xmm0 // First 8
+ punpckhbw xmm0, xmm0 // Next 8
+ movdqa xmm1, xmm2 // GGGG
+ punpcklwd xmm1, xmm2 // First 4
+ punpckhwd xmm2, xmm2 // Next 4
+ por xmm1, xmm5 // GGGA
+ por xmm2, xmm5
+ movdqa xmm3, xmm0 // GGGG
+ punpcklwd xmm3, xmm0 // Next 4
+ punpckhwd xmm0, xmm0 // Last 4
+ por xmm3, xmm5 // GGGA
+ por xmm0, xmm5
+ movdqu [edx], xmm1
+ movdqu [edx + 16], xmm2
+ movdqu [edx + 32], xmm3
+ movdqu [edx + 48], xmm0
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+__declspec(naked)
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked)
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ pcmpeqb xmm5, xmm5 // alpha 255
+
+ convertloop:
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ paddusb xmm2, xmm1 // sobel = sobelx + sobely
+ movdqa xmm3, xmm0 // XA
+ punpcklbw xmm3, xmm5
+ punpckhbw xmm0, xmm5
+ movdqa xmm4, xmm1 // YS
+ punpcklbw xmm4, xmm2
+ punpckhbw xmm1, xmm2
+ movdqa xmm6, xmm4 // YSXA
+ punpcklwd xmm6, xmm3 // First 4
+ punpckhwd xmm4, xmm3 // Next 4
+ movdqa xmm7, xmm1 // YSXA
+ punpcklwd xmm7, xmm0 // Next 4
+ punpckhwd xmm1, xmm0 // Last 4
+ movdqu [edx], xmm6
+ movdqu [edx + 16], xmm4
+ movdqu [edx + 32], xmm7
+ movdqu [edx + 48], xmm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at a time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+// in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
+ __asm {
+ mov eax, topleft // eax topleft
+ mov esi, botleft // esi botleft
+ mov edx, width
+ movd xmm5, area
+ mov edi, dst
+ mov ecx, count
+ cvtdq2ps xmm5, xmm5
+ rcpss xmm4, xmm5 // 1.0f / area
+ pshufd xmm4, xmm4, 0
+ sub ecx, 4
+ jl l4b
+
+ cmp area, 128 // 128 pixels will not overflow 15 bits.
+ ja l4
+
+ pshufd xmm5, xmm5, 0 // area
+ pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
+ psrld xmm6, 16
+ cvtdq2ps xmm6, xmm6
+ addps xmm5, xmm6 // (65536.0 + area - 1)
+ mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
+ cvtps2dq xmm5, xmm5 // 0.16 fixed point
+ packssdw xmm5, xmm5 // 16 bit shorts
+
+ // 4 pixel loop small blocks.
+ s4:
+ // top left
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
+ packssdw xmm2, xmm3
+
+ pmulhuw xmm0, xmm5
+ pmulhuw xmm2, xmm5
+
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge s4
+
+ jmp l4b
+
+ // 4 pixel loop
+ l4:
+ // top left
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
+ cvtdq2ps xmm1, xmm1
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ l1:
+ movdqu xmm0, [eax]
+ psubd xmm0, [eax + edx * 4]
+ lea eax, [eax + 16]
+ psubd xmm0, [esi]
+ paddd xmm0, [esi + edx * 4]
+ lea esi, [esi + 16]
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm4
+ cvtps2dq xmm0, xmm0
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword ptr [edi], xmm0
+ lea edi, [edi + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ }
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value.
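+// Reference (illustrative): keeping a running sum of the current row,
+//   cumsum[x] = previous_cumsum[x] + (row[0] + ... + row[x])
+// computed per channel (4 ints per pixel).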
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ __asm {
+ mov eax, row
+ mov edx, cumsum
+ mov esi, previous_cumsum
+ mov ecx, width
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ sub ecx, 4
+ jl l4b
+ test edx, 15
+ jne l4b
+
+ // 4 pixel loop
+ l4:
+ movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
+ lea eax, [eax + 16]
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm2, xmm1
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm1
+ punpckhwd xmm3, xmm1
+
+ punpckhbw xmm4, xmm1
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+
+ paddd xmm0, xmm2
+ movdqu xmm2, [esi] // previous row above.
+ paddd xmm2, xmm0
+
+ paddd xmm0, xmm3
+ movdqu xmm3, [esi + 16]
+ paddd xmm3, xmm0
+
+ paddd xmm0, xmm4
+ movdqu xmm4, [esi + 32]
+ paddd xmm4, xmm0
+
+ paddd xmm0, xmm5
+ movdqu xmm5, [esi + 48]
+ lea esi, [esi + 64]
+ paddd xmm5, xmm0
+
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ movdqu [edx + 32], xmm4
+ movdqu [edx + 48], xmm5
+
+ lea edx, [edx + 64]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ l1:
+ movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ lea eax, [eax + 4]
+ punpcklbw xmm2, xmm1
+ punpcklwd xmm2, xmm1
+ paddd xmm0, xmm2
+ movdqu xmm2, [esi]
+ lea esi, [esi + 16]
+ paddd xmm2, xmm0
+ movdqu [edx], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 1
+ jge l1
+
+ l1b:
+ }
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from a source image along an affine slope to a row of the
+// destination.
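+// Scalar reference (illustrative): with uv_dudv = {u, v, du, dv},
+//   for each output pixel: ((uint32*)dst_argb)[i] =
+//       *(const uint32*)(src_argb + (int)v * src_argb_stride + (int)u * 4);
+//   then u += du; v += dv;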
+__declspec(naked)
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 12] // src_argb
+ mov esi, [esp + 16] // stride
+ mov edx, [esp + 20] // dst_argb
+ mov ecx, [esp + 24] // pointer to uv_dudv
+ movq xmm2, qword ptr [ecx] // uv
+ movq xmm7, qword ptr [ecx + 8] // dudv
+ mov ecx, [esp + 28] // width
+ shl esi, 16 // 4, stride
+ add esi, 4
+ movd xmm5, esi
+ sub ecx, 4
+ jl l4b
+
+ // setup for 4 pixel loop
+ pshufd xmm7, xmm7, 0x44 // dup dudv
+ pshufd xmm5, xmm5, 0 // dup 4, stride
+ movdqa xmm0, xmm2 // x0, y0, x1, y1
+ addps xmm0, xmm7
+ movlhps xmm2, xmm0
+ movdqa xmm4, xmm7
+ addps xmm4, xmm4 // dudv *= 2
+ movdqa xmm3, xmm2 // x2, y2, x3, y3
+ addps xmm3, xmm4
+ addps xmm4, xmm4 // dudv *= 4
+
+ // 4 pixel loop
+ l4:
+ cvttps2dq xmm0, xmm2 // x, y float to int first 2
+ cvttps2dq xmm1, xmm3 // x, y float to int next 2
+ packssdw xmm0, xmm1 // x, y as 8 shorts
+ pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd xmm1, [eax + esi] // read pixel 0
+ movd xmm6, [eax + edi] // read pixel 1
+ punpckldq xmm1, xmm6 // combine pixel 0 and 1
+ addps xmm2, xmm4 // x, y += dx, dy first 2
+ movq qword ptr [edx], xmm1
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ movd xmm6, [eax + esi] // read pixel 2
+ movd xmm0, [eax + edi] // read pixel 3
+ punpckldq xmm6, xmm0 // combine pixel 2 and 3
+ addps xmm3, xmm4 // x, y += dx, dy next 2
+ movq qword ptr 8[edx], xmm6
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ l1:
+ cvttps2dq xmm0, xmm2 // x, y float to int
+ packssdw xmm0, xmm0 // x, y as shorts
+ pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
+ addps xmm2, xmm7 // x, y += dx, dy
+ movd esi, xmm0
+ movd xmm0, [eax + esi] // copy a pixel
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
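+// General path reference (illustrative): with f = source_y_fraction / 2
+// (0..127), each output byte is (row0 * (128 - f) + row1 * f) >> 7.
+// Fractions 0, 64, 128 and 192 (of 256) take the specialized copy/avg paths.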
+__declspec(naked)
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ shr eax, 1
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ sub edi, esi
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
+
+ vmovd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ vmovd xmm5, eax // low fraction 128..1
+ vpunpcklbw xmm5, xmm5, xmm0
+ vpunpcklwd xmm5, xmm5, xmm5
+ vpxor ymm0, ymm0, ymm0
+ vpermd ymm5, ymm0, ymm5
+
+ xloop:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm2, [esi + edx]
+ vpunpckhbw ymm1, ymm0, ymm2 // mutates
+ vpunpcklbw ymm0, ymm0, ymm2 // mutates
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddubsw ymm1, ymm1, ymm5
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm1, ymm1, 7
+ vpackuswb ymm0, ymm0, ymm1 // unmutates
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ xloop25:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm1, [esi + edx]
+ vpavgb ymm0, ymm0, ymm1
+ vpavgb ymm0, ymm0, ymm1
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ xloop75:
+ vmovdqu ymm1, [esi]
+ vmovdqu ymm0, [esi + edx]
+ vpavgb ymm0, ymm0, ymm1
+ vpavgb ymm0, ymm0, ymm1
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ rep movsb
+
+ xloop99:
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_AVX2
+
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked)
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
+
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+
+ xloop:
+ movdqu xmm0, [esi]
+ movdqu xmm2, [esi + edx]
+ movdqu xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ xloop25:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ movdqu xmm0, [esi]
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
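+// General path reference (approximate, illustrative):
+//   dst = row0 + (((row1 - row0) * source_y_fraction) >> 8)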
+__declspec(naked)
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ cmp eax, 64
+ je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ cmp eax, 192
+ je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
+
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5
+ psrlw xmm5, 1
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ punpcklqdq xmm5, xmm5
+ pxor xmm4, xmm4
+
+ xloop:
+ movdqu xmm0, [esi] // row0
+ movdqu xmm2, [esi + edx] // row1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
+ paddw xmm3, xmm3
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
+ packuswb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ xloop25:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ movdqu xmm0, [esi]
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ sub ecx, 16
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
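+// Reference (illustrative): for each 16-byte group,
+//   dst_argb[16*k + n] = src_argb[16*k + shuffler[n]], n = 0..15.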
+__declspec(naked)
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ movdqu xmm5, [ecx]
+ mov ecx, [esp + 16] // pix
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg wloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked)
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
+ mov ecx, [esp + 16] // pix
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpshufb ymm0, ymm0, ymm5
+ vpshufb ymm1, ymm1, ymm5
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked)
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ push ebx
+ push esi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov esi, [esp + 8 + 12] // shuffler
+ mov ecx, [esp + 8 + 16] // pix
+ pxor xmm5, xmm5
+
+ mov ebx, [esi] // shuffler
+ cmp ebx, 0x03000102
+ je shuf_3012
+ cmp ebx, 0x00010203
+ je shuf_0123
+ cmp ebx, 0x00030201
+ je shuf_0321
+ cmp ebx, 0x02010003
+ je shuf_2103
+
+ // TODO(fbarchard): Use one source pointer and 3 offsets.
+ shuf_any1:
+ movzx ebx, byte ptr [esi]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx], bl
+ movzx ebx, byte ptr [esi + 1]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 1], bl
+ movzx ebx, byte ptr [esi + 2]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 2], bl
+ movzx ebx, byte ptr [esi + 3]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 3], bl
+ lea eax, [eax + 4]
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jg shuf_any1
+ jmp shuf99
+
+ shuf_0123:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
+ pshuflw xmm0, xmm0, 01Bh
+ pshufhw xmm1, xmm1, 01Bh
+ pshuflw xmm1, xmm1, 01Bh
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_0123
+ jmp shuf99
+
+ shuf_0321:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
+ pshuflw xmm0, xmm0, 039h
+ pshufhw xmm1, xmm1, 039h
+ pshuflw xmm1, xmm1, 039h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_0321
+ jmp shuf99
+
+ shuf_2103:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
+ pshuflw xmm0, xmm0, 093h
+ pshufhw xmm1, xmm1, 093h
+ pshuflw xmm1, xmm1, 093h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_2103
+ jmp shuf99
+
+ shuf_3012:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
+ pshuflw xmm0, xmm0, 0C6h
+ pshufhw xmm1, xmm1, 0C6h
+ pshuflw xmm1, xmm1, 0C6h
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg shuf_3012
+
+ shuf99:
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
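+// Packing reference for I422ToYUY2Row (illustrative):
+//   dst[4i + 0] = Y[2i];      dst[4i + 1] = U[i];
+//   dst[4i + 2] = Y[2i + 1];  dst[4i + 3] = V[i];
+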
+__declspec(naked)
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // YUYV
+ punpckhbw xmm1, xmm2
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked)
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ movdqa xmm1, xmm2
+ lea eax, [eax + 16]
+ punpcklbw xmm1, xmm0 // UYVY
+ punpckhbw xmm2, xmm0
+ movdqu [edi], xmm1
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
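+// Reference (illustrative): for each channel value x (as a float),
+//   dst = clamp255((int)(C0 + C1*x + C2*x*x + C3*x*x*x)),
+// where C0..C3 are the four 4-float vectors pointed to by poly.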
+__declspec(naked)
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* src_argb */
+ mov edx, [esp + 4 + 8] /* dst_argb */
+ mov esi, [esp + 4 + 12] /* poly */
+ mov ecx, [esp + 4 + 16] /* width */
+ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
+
+ // 2 pixel loop.
+ convertloop:
+// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ movq xmm0, qword ptr [eax] // BGRABGRA
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm3
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 // pixel 0
+ punpckhwd xmm4, xmm3 // pixel 1
+ cvtdq2ps xmm0, xmm0 // 4 floats
+ cvtdq2ps xmm4, xmm4
+ movdqa xmm1, xmm0 // X
+ movdqa xmm5, xmm4
+ mulps xmm0, [esi + 16] // C1 * X
+ mulps xmm4, [esi + 16]
+ addps xmm0, [esi] // result = C0 + C1 * X
+ addps xmm4, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm5
+ mulps xmm2, xmm1 // X * X
+ mulps xmm6, xmm5
+ mulps xmm1, xmm2 // X * X * X
+ mulps xmm5, xmm6
+ mulps xmm2, [esi + 32] // C2 * X * X
+ mulps xmm6, [esi + 32]
+ mulps xmm1, [esi + 48] // C3 * X * X * X
+ mulps xmm5, [esi + 48]
+ addps xmm0, xmm2 // result += C2 * X * X
+ addps xmm4, xmm6
+ addps xmm0, xmm1 // result += C3 * X * X * X
+ addps xmm4, xmm5
+ cvttps2dq xmm0, xmm0
+ cvttps2dq xmm4, xmm4
+ packuswb xmm0, xmm4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked)
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
+ vbroadcastf128 ymm5, [ecx + 16] // C1
+ vbroadcastf128 ymm6, [ecx + 32] // C2
+ vbroadcastf128 ymm7, [ecx + 48] // C3
+ mov ecx, [esp + 16] /* width */
+
+ // 2 pixel loop.
+ convertloop:
+ vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
+ lea eax, [eax + 8]
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vmulps ymm2, ymm0, ymm0 // X * X
+ vmulps ymm3, ymm0, ymm7 // C3 * X
+ vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
+ vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
+ vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
+ vcvttps2dq ymm0, ymm0
+ vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
+ vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
+ vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
+ vmovq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked)
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ movzx edx, byte ptr [eax - 4 + 3]
+ movzx edx, byte ptr [esi + edx * 4 + 3]
+ mov byte ptr [eax - 4 + 3], dl
+ dec ecx
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+__declspec(naked)
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ dec ecx
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
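+// Rough reference (illustrative): a 256-byte table row is selected per pixel:
+//   luma_row = luma + ((b*c0 + g*c1 + r*c2 + a*c3) & 0xff00);
+// then B, G and R are each looked up in that row and A is copied, where
+// c0..c3 are the 4 bytes of lumacoeff.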
+__declspec(naked)
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
+ movd xmm2, dword ptr [esp + 8 + 16] // luma table
+ movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ psllw xmm4, 8
+ pxor xmm5, xmm5
+
+ // 4 pixel loop.
+ convertloop:
+ movdqu xmm0, qword ptr [eax] // generate luma ptr
+ pmaddubsw xmm0, xmm3
+ phaddw xmm0, xmm0
+ pand xmm0, xmm4 // mask out low bits
+ punpcklwd xmm0, xmm5
+ paddd xmm0, xmm2 // add table base
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi], dl
+ movzx edx, byte ptr [eax + 1]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 1], dl
+ movzx edx, byte ptr [eax + 2]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 2], dl
+ movzx edx, byte ptr [eax + 3] // copy alpha.
+ mov byte ptr [edi + 3], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 4]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 4], dl
+ movzx edx, byte ptr [eax + 5]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 5], dl
+ movzx edx, byte ptr [eax + 6]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 6], dl
+ movzx edx, byte ptr [eax + 7] // copy alpha.
+ mov byte ptr [edi + 7], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 8]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 8], dl
+ movzx edx, byte ptr [eax + 9]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 9], dl
+ movzx edx, byte ptr [eax + 10]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 10], dl
+ movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl
+
+ movd esi, xmm0
+
+ movzx edx, byte ptr [eax + 12]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 12], dl
+ movzx edx, byte ptr [eax + 13]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 13], dl
+ movzx edx, byte ptr [eax + 14]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 14], dl
+ movzx edx, byte ptr [eax + 15] // copy alpha.
+ mov byte ptr [edi + 15], dl
+
+ lea eax, [eax + 16]
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif // defined(_M_X64)
+#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/row_x86.asm b/third_party/aom/third_party/libyuv/source/row_x86.asm
new file mode 100644
index 000000000..0cb326f8e
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/row_x86.asm
@@ -0,0 +1,146 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%ifdef __YASM_VERSION_ID__
+%if __YASM_VERSION_ID__ < 01020000h
+%error AVX2 is supported only by yasm 1.2.0 or later.
+%endif
+%endif
+%include "x86inc.asm"
+
+SECTION .text
+
+; cglobal numeric constants are parameters, gpr regs, mm regs
+
+; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
+
+%macro YUY2TOYROW 2-3
+cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
+%ifidn %1,YUY2
+ pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff
+ psrlw m2, m2, 8
+%endif
+
+ ALIGN 4
+.convertloop:
+ mov%2 m0, [src_yuy2q]
+ mov%2 m1, [src_yuy2q + mmsize]
+ lea src_yuy2q, [src_yuy2q + mmsize * 2]
+%ifidn %1,YUY2
+ pand m0, m0, m2 ; YUY2 even bytes are Y
+ pand m1, m1, m2
+%else
+ psrlw m0, m0, 8 ; UYVY odd bytes are Y
+ psrlw m1, m1, 8
+%endif
+ packuswb m0, m0, m1
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+%endif
+ sub pixd, mmsize
+ mov%2 [dst_yq], m0
+ lea dst_yq, [dst_yq + mmsize]
+ jg .convertloop
+ REP_RET
+%endmacro
+
+; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 ; first 8 UV pairs
+ punpckhbw m0, m0, m1 ; next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 ; low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 ; high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/third_party/aom/third_party/libyuv/source/scale.cc b/third_party/aom/third_party/libyuv/source/scale.cc
new file mode 100644
index 000000000..0a01304c4
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale.cc
@@ -0,0 +1,1689 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
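+// Per 2x2 input block (illustrative): kFilterNone keeps one source pixel,
+// kFilterLinear averages the two pixels of one row, and kFilterBox averages
+// all four pixels.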
+
+static void ScalePlaneDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering == kFilterNone ? ScaleRowDown2_C :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+ int row_stride = src_stride << 1;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
+ }
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
+ ScaleRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
+ ScaleRowDown2Box_NEON);
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
+ ScaleRowDown2Box_Any_SSE2);
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+ ScaleRowDown2Box_SSE2);
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
+ ScaleRowDown2Box_Any_AVX2);
+ if (IS_ALIGNED(dst_width, 32)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
+ ScaleRowDown2Box_AVX2);
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 = filtering ?
+ ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void ScalePlaneDown2_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) =
+ filtering == kFilterNone ? ScaleRowDown2_16_C :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
+ ScaleRowDown2Box_16_C);
+ int row_stride = src_stride << 1;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
+ }
+
+#if defined(HAS_SCALEROWDOWN2_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
+ ScaleRowDown2_16_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+ ScaleRowDown2Box_16_SSE2);
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 = filtering ?
+ ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size.
+
+static void ScalePlaneDown4(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+ int row_stride = src_stride << 2;
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0;
+ }
+#if defined(HAS_SCALEROWDOWN4_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+static void ScalePlaneDown4_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
+ int row_stride = src_stride << 2;
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0;
+ }
+#if defined(HAS_SCALEROWDOWN4_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
+ ScaleRowDown4_16_NEON;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
+ ScaleRowDown4_16_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane down, 3/4
+
+static void ScalePlaneDown34(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_C;
+ ScaleRowDown34_1 = ScaleRowDown34_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
+ }
+#if defined(HAS_SCALEROWDOWN34_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON;
+ }
+ if (dst_width % 24 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3;
+ }
+ if (dst_width % 24 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+ dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+static void ScalePlaneDown34_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_C;
+ ScaleRowDown34_1 = ScaleRowDown34_16_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
+ }
+#if defined(HAS_SCALEROWDOWN34_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+ dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses a box filter arranged like this:
+// aaabbbcc -> abc
+// aaabbbcc def
+// aaabbbcc ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+static void ScalePlaneDown38(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_C;
+ ScaleRowDown38_2 = ScaleRowDown38_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
+ }
+
+#if defined(HAS_SCALEROWDOWN38_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON;
+ }
+ if (dst_width % 12 == 0) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3;
+ }
+ if (dst_width % 12 == 0 && !filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+ }
+ if (dst_width % 6 == 0 && filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+static void ScalePlaneDown38_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int y;
+ void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width);
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+ assert(dst_width % 3 == 0);
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_C;
+ ScaleRowDown38_2 = ScaleRowDown38_16_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
+ }
+#if defined(HAS_SCALEROWDOWN38_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ for (y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remainder 1 or 2 rows with last row vertically unfiltered
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+#define MIN1(x) ((x) < 1 ? 1 : (x))
+
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+ uint32 sum = 0u;
+ int x;
+ assert(iboxwidth > 0);
+ for (x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
+ }
+ return sum;
+}
+
+static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
+ uint32 sum = 0u;
+ int x;
+ assert(iboxwidth > 0);
+ for (x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
+ }
+ return sum;
+}
+
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
+ const uint16* src_ptr, uint8* dst_ptr) {
+ int i;
+ int scaletbl[2];
+ int minboxwidth = dx >> 16;
+ int* scaleptr = scaletbl - minboxwidth;
+ int boxwidth;
+ scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+ scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+ for (i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ x += dx;
+ boxwidth = MIN1((x >> 16) - ix);
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+ }
+}
+
+static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
+ const uint32* src_ptr, uint16* dst_ptr) {
+ int i;
+ int scaletbl[2];
+ int minboxwidth = dx >> 16;
+ int* scaleptr = scaletbl - minboxwidth;
+ int boxwidth;
+ scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
+ scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
+ for (i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ x += dx;
+ boxwidth = MIN1((x >> 16) - ix);
+ *dst_ptr++ =
+ SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+ }
+}
+
+static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
+ const uint16* src_ptr, uint8* dst_ptr) {
+ int scaleval = 65536 / boxheight;
+ int i;
+ src_ptr += (x >> 16);
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = src_ptr[i] * scaleval >> 16;
+ }
+}
+
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
+ const uint16* src_ptr, uint8* dst_ptr) {
+ int boxwidth = MIN1(dx >> 16);
+ int scaleval = 65536 / (boxwidth * boxheight);
+ int i;
+ x >>= 16;
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+ x += boxwidth;
+ }
+}
+
+static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
+ const uint32* src_ptr, uint16* dst_ptr) {
+ int boxwidth = MIN1(dx >> 16);
+ int scaleval = 65536 / (boxwidth * boxheight);
+ int i;
+ for (i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
+ x += boxwidth;
+ }
+}
+
+// Scale plane down to any dimensions, with interpolation.
+// (boxfilter).
+//
+// Same method as SimpleScale, which is fixed point, outputting
+// one pixel of destination using fixed point (16.16) to step
+// through source, sampling a box of pixels with simple
+// averaging.
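+// Per output pixel (illustrative, ignoring the fixed-point reciprocal used
+// below): ix = x >> 16; x += dx; boxwidth = (x >> 16) - ix;
+//   dst = sum(row16[ix .. ix + boxwidth - 1]) / (boxwidth * boxheight);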
+static void ScalePlaneBox(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int j, k;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height << 16);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+ {
+ // Allocate a row buffer of uint16.
+ align_buffer_64(row16, src_width * 2);
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+ const uint16* src_ptr, uint8* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_C:
+ ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+ void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
+ ScaleAddRow_C;
+#if defined(HAS_SCALEADDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleAddRow = ScaleAddRow_Any_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ScaleAddRow = ScaleAddRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ ScaleAddRow = ScaleAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleAddRow = ScaleAddRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ int boxheight;
+ int iy = y >> 16;
+ const uint8* src = src_ptr + iy * src_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ boxheight = MIN1((y >> 16) - iy);
+ memset(row16, 0, src_width * 2);
+ for (k = 0; k < boxheight; ++k) {
+ ScaleAddRow(src, (uint16 *)(row16), src_width);
+ src += src_stride;
+ }
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ free_aligned_buffer_64(row16);
+ }
+}
+
+static void ScalePlaneBox_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr) {
+ int j, k;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height << 16);
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+ {
+ // Allocate a row buffer of uint32.
+ align_buffer_64(row32, src_width * 4);
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+ const uint32* src_ptr, uint16* dst_ptr) =
+ (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
+ void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
+ ScaleAddRow_16_C;
+
+#if defined(HAS_SCALEADDROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
+ ScaleAddRow = ScaleAddRow_16_SSE2;
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ int boxheight;
+ int iy = y >> 16;
+ const uint16* src = src_ptr + iy * src_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ boxheight = MIN1((y >> 16) - iy);
+ memset(row32, 0, src_width * 4);
+ for (k = 0; k < boxheight; ++k) {
+ ScaleAddRow(src, (uint32 *)(row32), src_width);
+ src += src_stride;
+ }
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ free_aligned_buffer_64(row32);
+ }
+}
+
+// Scale plane down with bilinear interpolation.
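+// Per output row (illustrative): yi = y >> 16 selects the source row and
+// yf = (y >> 8) & 255 is the vertical fraction; the two source rows are
+// blended into 'row' and then sampled horizontally by ScaleFilterCols
+// (kFilterLinear skips the vertical blend).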
+void ScalePlaneBilinearDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row buffer.
+ align_buffer_64(row, src_width);
+
+ const int max_y = (src_height - 1) << 16;
+ int j;
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleFilterCols_NEON;
+ }
+ }
+#endif
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+}
+
+void ScalePlaneBilinearDown_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row buffer.
+ align_buffer_64(row, src_width * 2);
+
+ const int max_y = (src_height - 1) << 16;
+ int j;
+ void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+ void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+}
+
+// Scale plane up with bilinear interpolation.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_C : ScaleCols_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleFilterCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ const int kRowSize = (dst_width + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
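+ // The two halves of 'row' hold the column-scaled versions of the two
+ // source rows that bracket the current output row. When y advances into a
+ // new source row, only one new row is column-scaled; advancing rowptr and
+ // negating rowstride swap the roles of the two halves.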
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+void ScalePlaneBilinearUp_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ if (filtering && src_width >= 32768) {
+ ScaleFilterCols = ScaleFilterCols64_16_C;
+ }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_16_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+ {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+
+ // Allocate 2 row buffers.
+ const int kRowSize = (dst_width + 31) & ~31;
+ align_buffer_64(row, kRowSize * 4);
+
+ uint16* rowptr = (uint16*)row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale Plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
+
+static void ScalePlaneSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int i;
+ void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) = ScaleCols_C;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_SSE2;
+ }
+#endif
+ }
+
+ for (i = 0; i < dst_height; ++i) {
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
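+
+// Illustration only (not part of the upstream libyuv sources): a minimal
+// sketch of the kind of column loop the 16.16 fixed-point form above
+// enables. For a hypothetical 640-to-480 pixel downscale, dx is
+// 65536 * 640 / 480, so three destination pixels consume four source
+// pixels and the integer part (x >> 16) selects each source pixel without
+// any per-pixel division.
+static void ScaleColsNearestSketch(uint8* dst, const uint8* src,
+                                   int dst_width, int x, int dx) {
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    dst[i] = src[x >> 16];  // Integer part picks the source column.
+    x += dx;                // Fractional part accumulates the step.
+  }
+}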
+
+static void ScalePlaneSimple_16(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_ptr, uint16* dst_ptr) {
+ int i;
+ void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) = ScaleCols_16_C;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_16_SSE2;
+ }
+#endif
+ }
+
+ for (i = 0; i < dst_height; ++i) {
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+ dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
+// Scale a plane.
+// This function dispatches to a specialized scaler based on scale factor.
+
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height, filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scalers to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2().
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width && filtering != kFilterBox) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
+ // optimized, 3/8
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ (filtering == kFilterBox || filtering == kFilterNone)) {
+ // optimized, 1/4
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+}
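+
+// Usage illustration only (not part of the upstream libyuv sources): scale
+// one tightly packed 8-bit plane from a hypothetical 640x480 buffer down to
+// 320x240 with bilinear filtering. Strides equal the widths only because
+// these example buffers have no row padding.
+static void ScalePlaneUsageSketch(const uint8* src_640x480,
+                                  uint8* dst_320x240) {
+  ScalePlane(src_640x480, 640, 640, 480,
+             dst_320x240, 320, 320, 240,
+             kFilterBilinear);
+}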
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+ int src_width, int src_height,
+ uint16* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height, filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scalers to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2_16().
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical_16(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
+ // optimized, 3/8
+ ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
+ // optimized, 1/4
+ ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+}
+
+// Scale an I420 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
+ return 0;
+}
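+
+// Usage illustration only (not part of the upstream libyuv sources): shrink
+// a tightly packed 1280x720 I420 frame to 640x360. The Y-then-U-then-V
+// layout and the buffer sizes are example choices; chroma planes are half
+// width and half height, so their strides are the half-widths.
+static int I420ScaleUsageSketch(const uint8* src_frame, uint8* dst_frame) {
+  const uint8* src_y = src_frame;
+  const uint8* src_u = src_y + 1280 * 720;
+  const uint8* src_v = src_u + 640 * 360;
+  uint8* dst_y = dst_frame;
+  uint8* dst_u = dst_y + 640 * 360;
+  uint8* dst_v = dst_u + 320 * 180;
+  return I420Scale(src_y, 1280, src_u, 640, src_v, 640, 1280, 720,
+                   dst_y, 640, dst_u, 320, dst_v, 320, 640, 360,
+                   kFilterBox);
+}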
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+ const uint16* src_u, int src_stride_u,
+ const uint16* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint16* dst_y, int dst_stride_y,
+ uint16* dst_u, int dst_stride_u,
+ uint16* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering);
+ ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
+ return 0;
+}
+
+// Deprecated api
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+ int src_stride_y, int src_stride_u, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, uint8* dst_u, uint8* dst_v,
+ int dst_stride_y, int dst_stride_u, int dst_stride_v,
+ int dst_width, int dst_height,
+ LIBYUV_BOOL interpolate) {
+ return I420Scale(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ src_width, src_height,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ dst_width, dst_height,
+ interpolate ? kFilterBox : kFilterNone);
+}
+
+// Deprecated api
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ LIBYUV_BOOL interpolate) {
+ // Chroma requires the offset to be a multiple of 2.
+ int dst_yoffset_even = dst_yoffset & ~1;
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+ int aheight = dst_height - dst_yoffset_even * 2; // actual output height
+ const uint8* src_y = src;
+ const uint8* src_u = src + src_width * src_height;
+ const uint8* src_v = src + src_width * src_height +
+ src_halfwidth * src_halfheight;
+ uint8* dst_y = dst + dst_yoffset_even * dst_width;
+ uint8* dst_u = dst + dst_width * dst_height +
+ (dst_yoffset_even >> 1) * dst_halfwidth;
+ uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+ (dst_yoffset_even >> 1) * dst_halfwidth;
+ if (!src || src_width <= 0 || src_height <= 0 ||
+ !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+ dst_yoffset_even >= dst_height) {
+ return -1;
+ }
+ return I420Scale(src_y, src_width,
+ src_u, src_halfwidth,
+ src_v, src_halfwidth,
+ src_width, src_height,
+ dst_y, dst_width,
+ dst_u, dst_halfwidth,
+ dst_v, dst_halfwidth,
+ dst_width, aheight,
+ interpolate ? kFilterBox : kFilterNone);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_any.cc b/third_party/aom/third_party/libyuv/source/scale_any.cc
new file mode 100644
index 000000000..2f6a2c8ba
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_any.cc
@@ -0,0 +1,200 @@
+/*
+ * Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
+ int dst_width, int x, int dx) { \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, \
+ dst_width & MASK, x + n * dx, dx); \
+ }
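+
+// For example, CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON,
+// ScaleFilterCols_C, 1, 7) below expands to a wrapper that hands the first
+// dst_width & ~7 columns (a multiple of 8) to the NEON kernel and finishes
+// the remaining dst_width & 7 columns with the C kernel, restarted at
+// x + n * dx so the fixed-point position stays continuous; e.g. a
+// 100-column row becomes 96 NEON columns plus a 4-column C tail.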
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
+ ScaleARGBFilterCols_C, 4, 3)
+#endif
+#undef CANY
+
+// Fixed scale down.
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
+ uint8* dst_ptr, int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
+ }
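+
+// For example, SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2,
+// ScaleRowDown2_C, 2, 1, 15) below handles a 50-pixel destination row as
+// 48 pixels in SSE2 plus a 2-pixel C tail; the tail reads from
+// src_ptr + 96 (n * FACTOR * BPP) and writes to dst_ptr + 48.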
+
+#ifdef HAS_SCALEROWDOWN2_SSE2
+SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
+ ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
+ 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN2_AVX2
+SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
+ ScaleRowDown2Linear_C, 2, 1, 31)
+SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
+ 2, 1, 31)
+#endif
+#ifdef HAS_SCALEROWDOWN2_NEON
+SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
+ ScaleRowDown2Linear_C, 2, 1, 15)
+SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
+ ScaleRowDown2Box_C, 2, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_SSE2
+SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
+ 4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN4_AVX2
+SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
+ 4, 1, 15)
+#endif
+#ifdef HAS_SCALEROWDOWN4_NEON
+SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
+ 4, 1, 7)
+#endif
+#ifdef HAS_SCALEROWDOWN34_SSSE3
+SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
+ ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
+ ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
+ ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN34_NEON
+SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
+ ScaleRowDown34_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
+ ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
+ ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+#endif
+#ifdef HAS_SCALEROWDOWN38_SSSE3
+SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
+ ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
+ ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
+ ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+#endif
+#ifdef HAS_SCALEROWDOWN38_NEON
+SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
+ ScaleRowDown38_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
+ ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
+ ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+#endif
+
+#ifdef HAS_SCALEARGBROWDOWN2_SSE2
+SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
+ ScaleARGBRowDown2_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
+ ScaleARGBRowDown2Linear_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
+ ScaleARGBRowDown2Box_C, 2, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWN2_NEON
+SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
+ ScaleARGBRowDown2_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
+ ScaleARGBRowDown2Linear_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
+ ScaleARGBRowDown2Box_C, 2, 4, 7)
+#endif
+#undef SDANY
+
+// Scale down by even scale factor.
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+ uint8* dst_ptr, int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \
+ src_stepx, dst_ptr + n * BPP, r); \
+ }
+
+#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
+SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
+ ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
+ ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
+SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
+ ScaleARGBRowDownEven_C, 4, 3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
+ ScaleARGBRowDownEvenBox_C, 4, 3)
+#endif
+
+// Add rows box filter scale down.
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+ void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
+ int n = src_width & ~MASK; \
+ if (n > 0) { \
+ SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
+#endif
+#undef SAANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+
+
+
+
diff --git a/third_party/aom/third_party/libyuv/source/scale_argb.cc b/third_party/aom/third_party/libyuv/source/scale_argb.cc
new file mode 100644
index 000000000..40a2d1ab2
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_argb.cc
@@ -0,0 +1,853 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ filtering == kFilterNone ? ScaleARGBRowDown2_C :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+ ScaleARGBRowDown2Box_C);
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ } else {
+ src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+ ScaleARGBRowDown2Box_Any_SSE2);
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+ ScaleARGBRowDown2Box_SSE2);
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+ ScaleARGBRowDown2Box_Any_NEON);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+ ScaleARGBRowDown2Box_NEON);
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down an ARGB image to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ int j;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+ }
+ }
+#endif
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+ row + kRowSize, dst_width * 2);
+ ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down an ARGB image to an even
+// multiple of its original size.
+static void ScaleARGBDownEven(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ int row_stride = (dy >> 16) * src_stride;
+ void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_step, uint8* dst_argb, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+ ScaleARGBRowDownEven_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+ ScaleARGBRowDownEven_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+ ScaleARGBRowDownEven_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+ ScaleARGBRowDownEven_NEON;
+ }
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+ int64 xlast = x + (int64)(dst_width - 1) * dx;
+ int64 xl = (dx >= 0) ? x : xlast;
+ int64 xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Rightmost pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // One past the rightmost pixel, rounded up to a multiple of 4.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
+ src_argb += xl * 4;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(clip_src_width, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of ARGB.
+ {
+ align_buffer_64(row, clip_src_width * 4);
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale ARGB up with bilinear interpolation.
+static void ScaleARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols = filtering ?
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_argb + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation.
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+#endif
+
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols = filtering ?
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+ if (filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
+ int yi = y >> 16;
+ int uv_yi = yi >> kYShift;
+ const uint8* src_row_y = src_y + yi * src_stride_y;
+ const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+ const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+
+ // Allocate 1 row of ARGB for source conversion.
+ align_buffer_64(argb_row, src_width * 4);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+ ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+ if (src_height > 1) {
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+ if (src_height > 2) {
+ src_row_y += src_stride_y;
+ if (!(yi & 1)) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ uv_yi = yi >> kYShift;
+ src_row_y = src_y + yi * src_stride_y;
+ src_row_u = src_u + uv_yi * src_stride_u;
+ src_row_v = src_v + uv_yi * src_stride_v;
+ }
+ if (yi != lasty) {
+ // TODO(fbarchard): Convert the clipped region of row.
+ I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+ ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride_argb;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ free_aligned_buffer_64(argb_row);
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
+
+static void ScaleARGBSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ int j;
+ void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBCols = ScaleARGBCols_SSE2;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBCols = ScaleARGBCols_Any_NEON;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBCols = ScaleARGBCols_NEON;
+ }
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
+ dst_width, x, dx);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// Scale an ARGB image.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleARGB(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // ARGB does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64 clipf = (int64)(clip_x) * dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 4;
+ dst += clip_x * 4;
+ }
+ if (clip_y) {
+ int64 clipf = (int64)(clip_y) * dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+ // Optimized even scale down. i.e. 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleARGBDown2(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleARGBDown4Box(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+ return;
+ }
+ ScaleARGBDownEven(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ // Optimized odd scale down. i.e. 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+ dst, dst_stride, clip_width, clip_height);
+ return;
+ }
+ }
+ }
+ }
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, y, dy, 4, filtering);
+ return;
+ }
+ if (filtering && dy < 65536) {
+ ScaleARGBBilinearUp(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (filtering) {
+ ScaleARGBBilinearDown(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+ clip_x < 0 || clip_y < 0 ||
+ clip_width > 32768 || clip_height > 32768 ||
+ (clip_x + clip_width) > dst_width ||
+ (clip_y + clip_height) > dst_height) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ clip_x, clip_y, clip_width, clip_height, filtering);
+ return 0;
+}
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_common.cc b/third_party/aom/third_party/libyuv/source/scale_common.cc
new file mode 100644
index 000000000..1711f3d54
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_common.cc
@@ -0,0 +1,1137 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ const uint16* s = src_ptr;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ }
+}
+
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* d, int dst_width) {
+ const uint16* s = src_ptr;
+ const uint16* t = src_ptr + src_stride;
+ int x;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (x = 0; x < dst_width; x += 3) {
+ uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
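+// For illustration: with a = 10, b = 20 and a halfway fraction f = 0x8000
+// (0.5 in 16.16 fixed point), the macro below yields
+// 10 + ((0x8000 * 10) >> 16) = 15, the midpoint of a and b.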
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+ ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x32, int dx) {
+ int64 x = (int64)(x32);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
+#define BLENDER(a, b, f) (uint16)((int)(a) + \
+ ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x, int dx) {
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+ int dst_width, int x32, int dx) {
+ int64 x = (int64)(x32);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ int x;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst, int dst_width) {
+ int x;
+ assert(dst_width % 3 == 0);
+ for (x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+// 8x3 -> 3x1
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+ ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+// 8x2 -> 3x1
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ int i;
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
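+// Adds one row of 8-bit samples into a row of uint16 accumulators (the _16_
+// variant widens uint16 samples into uint32 sums). Note that a uint16
+// accumulator has room for at most 257 full-scale (255) rows before it would
+// overflow, which bounds how many rows a caller may sum per output row.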
+void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ int x;
+ assert(src_width > 0);
+ for (x = 0; x < src_width - 1; x += 2) {
+ dst_ptr[0] += src_ptr[0];
+ dst_ptr[1] += src_ptr[1];
+ src_ptr += 2;
+ dst_ptr += 2;
+ }
+ if (src_width & 1) {
+ dst_ptr[0] += src_ptr[0];
+ }
+}
+
+void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+ int x;
+ assert(src_width > 0);
+ for (x = 0; x < src_width - 1; x += 2) {
+ dst_ptr[0] += src_ptr[0];
+ dst_ptr[1] += src_ptr[1];
+ src_ptr += 2;
+ dst_ptr += 2;
+ }
+ if (src_width & 1) {
+ dst_ptr[0] += src_ptr[0];
+ }
+}
+
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[1];
+ dst[1] = src[3];
+ src += 4;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[1];
+ }
+}
+
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
+ dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ int x;
+ for (x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += src_stepx * 4;
+ dst_argb += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x32, int dx) {
+ int64 x = (int64)(x32);
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// Mimics SSSE3 blender
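+// Here the fraction is only 7 bits (xf = (x >> 9) & 0x7f below): the two
+// weights (0x7f ^ f) and f sum to 0x7f and the product is shifted right by 7,
+// matching the pmaddubsw / psrlw $0x7 sequence of the SSSE3 column filters.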
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDERC(a, b, f, s) (uint32)( \
+ BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+ BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x32, int dx) {
+ int64 x = (int64)(x32);
+ const uint32* src = (const uint32*)(src_argb);
+ uint32* dst = (uint32*)(dst_argb);
+ int j;
+ for (j = 0; j < dst_width - 1; j += 2) {
+ int64 xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int64 xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation.
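+// (y and dy are 16.16 fixed point; each output row blends source rows yi and
+// yi + 1 using the top 8 fraction bits, yf in 0..255, with the row function
+// selected per detected CPU features below.)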
+void ScalePlaneVertical(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int y, int dy,
+ int bpp, enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher bpp.
+ int dst_width_bytes = dst_width * bpp;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(bpp >= 1 && bpp <= 4);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(dst_width_bytes, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow(dst_argb, src_argb + yi * src_stride,
+ src_stride, dst_width_bytes, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+void ScalePlaneVertical_16(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint16* src_argb, uint16* dst_argb,
+ int x, int y, int dy,
+ int wpp, enum FilterMode filtering) {
+ // TODO(fbarchard): Allow higher wpp.
+ int dst_width_words = dst_width * wpp;
+ void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ int j;
+ assert(wpp >= 1 && wpp <= 2);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_words, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width_words, 4)) {
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+ }
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ int yi;
+ int yf;
+ if (y > max_y) {
+ y = max_y;
+ }
+ yi = y >> 16;
+ yf = filtering ? ((y >> 8) & 255) : 0;
+ InterpolateRow(dst_argb, src_argb + yi * src_stride,
+ src_stride, dst_width_words, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ if (src_width < 0) {
+ src_width = -src_width;
+ }
+ if (src_height < 0) {
+ src_height = -src_height;
+ }
+ if (filtering == kFilterBox) {
+    // If scaling both axes to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ filtering = kFilterBilinear;
+ }
+ }
+ if (filtering == kFilterBilinear) {
+ if (src_height == 1) {
+ filtering = kFilterLinear;
+ }
+ // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+ if (dst_height == src_height || dst_height * 3 == src_height) {
+ filtering = kFilterLinear;
+ }
+    // TODO(fbarchard): Remove the 1 pixel wide filter restriction, which
+    // exists to avoid reading 2 pixels horizontally and hitting a memory
+    // exception.
+ if (src_width == 1) {
+ filtering = kFilterNone;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ if (src_width == 1) {
+ filtering = kFilterNone;
+ }
+ // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+ if (dst_width == src_width || dst_width * 3 == src_width) {
+ filtering = kFilterNone;
+ }
+ }
+ return filtering;
+}
+
+// Divide num by div and return as 16.16 fixed point result.
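+// Example: FixedDiv(3, 4) = (3 << 16) / 4 = 0xC000, i.e. 0.75 in 16.16
+// fixed point.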
+int FixedDiv_C(int num, int div) {
+ return (int)(((int64)(num) << 16) / div);
+}
+
+// Divide num by div and return as 16.16 fixed point result.
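+// The "minus one" variant is used for upsampling (see ScaleSlope below): the
+// step is sized so that after (div - 1) steps the position stays just below
+// ((num - 1) << 16), keeping the bilinear reads of src[xi + 1] inside the
+// last source pixel.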
+int FixedDiv1_C(int num, int div) {
+ return (int)((((int64)(num) << 16) - 0x00010001) /
+ (div - 1));
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
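+// For a non-negative step this is dx / 2 + s; the negative branch negates the
+// same computation on -dx for mirrored steps. With s = -32768 (-0.5) it
+// centers the filter: e.g. a 2:1 downscale has dx = 0x20000, giving a start
+// of 0x8000, halfway between the first two source pixels.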
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum FilterMode filtering,
+ int* x, int* y, int* dx, int* dy) {
+ assert(x != NULL);
+ assert(y != NULL);
+ assert(dx != NULL);
+ assert(dy != NULL);
+ assert(src_width != 0);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ // Check for 1 pixel and avoid FixedDiv overflow.
+ if (dst_width == 1 && src_width >= 32768) {
+ dst_width = src_width;
+ }
+ if (dst_height == 1 && src_height >= 32768) {
+ dst_height = src_height;
+ }
+ if (filtering == kFilterBox) {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = 0;
+ *y = 0;
+ } else if (filtering == kFilterBilinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ if (dst_height <= src_height) {
+ *dy = FixedDiv(src_height, dst_height);
+ *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_height > 1) {
+ *dy = FixedDiv1(src_height, dst_height);
+ *y = 0;
+ }
+ } else if (filtering == kFilterLinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FixedDiv1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ *dy = FixedDiv(src_height, dst_height);
+ *y = *dy >> 1;
+ } else {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = CENTERSTART(*dx, 0);
+ *y = CENTERSTART(*dy, 0);
+ }
+  // Negative src_width means mirror horizontally.
+ if (src_width < 0) {
+ *x += (dst_width - 1) * *dx;
+ *dx = -*dx;
+ // src_width = -src_width; // Caller must do this.
+ }
+}
+#undef CENTERSTART
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_gcc.cc b/third_party/aom/third_party/libyuv/source/scale_gcc.cc
new file mode 100644
index 000000000..8a6ac5459
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_gcc.cc
@@ -0,0 +1,1089 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Offsets for source bytes 0 to 9
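+// (A value of 128, i.e. high bit set, makes pshufb zero the corresponding
+// output byte instead of selecting a source byte; this applies to all of the
+// shuffle masks below.)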
+static uvec8 kShuf0 =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
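+// These 16.16 reciprocals pair with pmulhuw, which keeps the high 16 bits of
+// the product, so multiplying a box sum by 65536 / N approximates dividing by
+// N, the same trick as the "* (65536 / 9) >> 16" expressions in the C code of
+// scale_common.cc above.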
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+ );
+}
+
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stridex3 = 0;
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0x8,%%xmm7 \n"
+ "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
+ MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
+ MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pand %%xmm7,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(stridex3) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+ );
+}
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "lea " MEMLEA(0xc,1) ",%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+ );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS(1) " \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
+ "lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6," MEMACCESS(1) " \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
+ "lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
+// Reads 16xN bytes and produces 16 shorts at a time.
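+// (One label serves two nested loops: the inner pass sums src_height rows of
+// a 16 byte column into word accumulators, the outer pass stores 16 sums and
+// steps to the next 16 source bytes until src_width is consumed.)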
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ int tmp_height = 0;
+ intptr_t tmp_src = 0;
+ asm volatile (
+ "mov %0,%3 \n" // row pointer
+ "mov %5,%2 \n" // height
+ "pxor %%xmm0,%%xmm0 \n" // clear accumulators
+ "pxor %%xmm1,%%xmm1 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(3) ",%%xmm2 \n"
+ "add %6,%3 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "sub $0x1,%2 \n"
+ "jg 1b \n"
+
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
+ "mov %0,%3 \n" // row pointer
+ "mov %5,%2 \n" // height
+ "pxor %%xmm0,%%xmm0 \n" // clear accumulators
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_height), // %2
+ "+r"(tmp_src), // %3
+ "+r"(src_width), // %4
+ "+rm"(src_height) // %5
+ : "rm"((intptr_t)(src_stride)) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+
+// Bilinear column filtering. SSSE3 version.
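+// (Two destination pixels per loop: the 16.16 x positions live in xmm2, the
+// 7 bit fractions are extracted with psrlw $0x9, applied with pmaddubsw and
+// the result is normalized with psrlw $0x7.)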
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
+ asm volatile (
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k2 \n"
+ "mov %w2," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x2,0) ",%0 \n"
+ "sub $0x2,%5 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k2 \n"
+ "mov %b2," MEMACCESS(0) " \n"
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+a"(temp_pixel), // %2
+ "+r"(x0), // %3
+ "+r"(x1), // %4
+ "+rm"(dst_width) // %5
+ : "rm"(x), // %6
+ "rm"(dx) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels per iteration
+// using unaligned (movdqu) loads and stores.
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", "xmm0", "xmm1"
+ );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ asm volatile (
+ "lea " MEMLEA3(0x00,1,4) ",%1 \n"
+ "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
+ LABELALIGN
+ "1: \n"
+ "movd " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ "punpckldq %%xmm1,%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
+ MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
+ "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "+r"(src_stepx_x12) // %4
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride, int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ intptr_t row1 = (intptr_t)(src_stride);
+ asm volatile (
+ "lea " MEMLEA3(0x00,1,4) ",%1 \n"
+ "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
+ "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
+ MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
+ MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
+ "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
+ "movq " MEMACCESS(5) ",%%xmm2 \n"
+ MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
+ MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
+ MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
+ "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "+r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
+ );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ intptr_t x0 = 0, x1 = 0;
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ LABELALIGN
+ "40: \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
+ MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x8,2) ",%2 \n"
+ "29: \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "99: \n"
+ : "+a"(x0), // %0
+ "+d"(x1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_argb), // %3
+ "+r"(dst_width) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ asm volatile (
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
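+
+// These two tables let ScaleARGBFilterCols_SSSE3 below interleave each pair
+// of source pixels channel by channel and broadcast the 7-bit x fraction.
+// Per channel the filter is roughly this scalar sketch (p0/p1 are the two
+// source samples, f the 7-bit fraction; names are illustrative only):
+//   dst = (uint8)((p0 * (127 - f) + p1 * f) >> 7);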
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ intptr_t x0 = 0, x1 = 0;
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
+ :
+ : "m"(kShuffleColARGB), // %0
+ "m"(kShuffleFractions) // %1
+ );
+
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
+ "psrlw $0x9,%%xmm1 \n"
+ MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(0) " \n"
+
+ LABELALIGN
+ "99: \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "+r"(x0), // %3
+ "+r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+ asm volatile (
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx"
+ );
+ return num;
+}
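+
+// Roughly equivalent plain-C sketch of the division above, assuming a
+// 64-bit intermediate (int64) is acceptable; illustrative only:
+//   int FixedDiv_C_sketch(int num, int div) {
+//     return (int)((((int64)num) << 16) / div);
+//   }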
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+ asm volatile (
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "sub $0x10001,%%eax \n"
+ "sbb $0x0,%%edx \n"
+ "sub $0x1,%1 \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx"
+ );
+ return num;
+}
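+
+// Corresponding plain-C sketch, again with a 64-bit intermediate
+// (illustrative only):
+//   int FixedDiv1_C_sketch(int num, int div) {
+//     return (int)(((((int64)num) << 16) - 0x00010001) / (div - 1));
+//   }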
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_mips.cc b/third_party/aom/third_party/libyuv/source/scale_mips.cc
new file mode 100644
index 000000000..3eb4f27c4
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_mips.cc
@@ -0,0 +1,654 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ // TODO(fbarchard): Use odd pixels instead of even.
+ "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
+ "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
+ "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
+ "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t8, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t1, 8(%[dst]) \n"
+ "sw $t2, 12(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0xf \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t0, 0(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 2 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* t = src_ptr + src_stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
+ "bltz $t9, 2f \n"
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 0(%[t]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[t]) \n" // |23|22|21|20|
+ "lw $t6, 8(%[t]) \n" // |27|26|25|24|
+ "lw $t7, 12(%[t]) \n" // |31|30|29|28|
+ "addiu $t9, $t9, -1 \n"
+ "srl $t8, $t0, 16 \n" // |X|X|3|2|
+ "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
+ "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
+ "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
+ "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
+ "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
+ "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
+ "srl $t8, $t1, 16 \n" // |X|X|7|6|
+ "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
+ "ins $t5, $t8, 0, 16 \n" // |22|23|7|6|
+ "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
+ "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
+ "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
+ "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
+ "srl $t8, $t2, 16 \n" // |X|X|11|10|
+ "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
+ "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
+ "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
+ "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
+ "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
+ "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2
+ "srl $t8, $t3, 16 \n" // |X|X|15|14|
+ "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
+ "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
+ "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
+ "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
+ "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
+ "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
+ "addiu %[src_ptr], %[src_ptr], 16 \n"
+ "addiu %[t], %[t], 16 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "sb $t1, 2(%[dst]) \n"
+ "sb $t5, 3(%[dst]) \n"
+ "sb $t2, 4(%[dst]) \n"
+ "sb $t6, 5(%[dst]) \n"
+ "sb $t3, 6(%[dst]) \n"
+ "sb $t7, 7(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0x7 \n" // x = residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lwr $t1, 0(%[src_ptr]) \n"
+ "lwl $t1, 3(%[src_ptr]) \n"
+ "lwr $t2, 0(%[t]) \n"
+ "lwl $t2, 3(%[t]) \n"
+ "srl $t8, $t1, 16 \n"
+ "ins $t1, $t2, 16, 16 \n"
+ "ins $t2, $t8, 0, 16 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "shra_r.w $t1, $t1, 2 \n"
+ "shra_r.w $t2, $t2, 2 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "sb $t2, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -2 \n"
+ "addiu %[t], %[t], 4 \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 2 \n"
+
+ "3: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst), [t] "+r" (t)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n"
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
+ "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
+ "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t5, 4(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 7 \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t1, 0(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ const uint8* s2 = s1 + stride;
+ const uint8* s3 = s2 + stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 1 \n"
+ "andi $t8, %[dst_width], 1 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
+ "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
+ "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
+ "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
+ "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
+ "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "add $t4, $t4, $t5 \n"
+ "add $t6, $t6, $t7 \n"
+ "add $t4, $t4, $t6 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "shra_r.w $t4, $t4, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[s3], %[s3], 8 \n"
+ "addiu $t9, $t9, -1 \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 2 \n"
+ "beqz $t8, 2f \n"
+ " nop \n"
+
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+
+ "2: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [s3] "+r" (s3)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
+ "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
+ "addiu %[dst_width], %[dst_width], -24 \n"
+ "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
+ "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
+ "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
+ "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
+ "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
+ "prepend $t1, $t2, 8 \n" // |4|3|1|0|
+ "prepend $t3, $t4, 24 \n" // |15|13|12|11|
+ "prepend $t5, $t6, 8 \n" // |20|19|17|16|
+ "prepend $t7, $t8, 24 \n" // |31|29|28|27|
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t3, 8(%[dst]) \n"
+ "sw $t5, 12(%[dst]) \n"
+ "sw $t9, 16(%[dst]) \n"
+ "sw $t7, 20(%[dst]) \n"
+ "bnez %[dst_width], 1b \n"
+ " addiu %[dst], %[dst], 24 \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t3, 3 \n" // 0x00030003
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
+ "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "shra_r.w $t1, $t1, 1 \n"
+ "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t2, $t2, $t4 \n"
+ "addu.ph $t6, $t6, $t5 \n"
+ "sll $t5, $t0, 1 \n"
+ "add $t0, $t5, $t0 \n"
+ "shra_r.ph $t2, $t2, 2 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shll.ph $t4, $t2, 1 \n"
+ "addq.ph $t4, $t4, $t2 \n"
+ "addu $t0, $t0, $t1 \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.w $t0, $t0, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "srl $t1, $t6, 16 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n"
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t2, 3 \n" // 0x00030003
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
+ "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "shra_r.w $t1, $t1, 1 \n"
+ "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t4, $t4, $t3 \n"
+ "addu.ph $t6, $t6, $t5 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shra_r.ph $t4, $t4, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.ph $t6, $t6, 1 \n"
+ "addu $t0, $t0, $t1 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "srl $t1, $t6, 16 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n"
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t6, $t6 \n" // |26|27|24|25|
+ "srl $t0, $t0, 8 \n" // |X|2|3|0|
+ "srl $t3, $t3, 16 \n" // |X|X|15|14|
+ "srl $t5, $t5, 16 \n" // |X|X|23|22|
+ "srl $t7, $t7, 16 \n" // |X|X|31|30|
+ "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
+ "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
+ "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
+ "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
+ "prepend $t2, $t3, 24 \n" // |X|15|14|11|
+ "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
+ "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu %[dst_width], %[dst_width], -12 \n"
+ "addiu $t8,%[dst_width], -12 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t4, 4(%[dst]) \n"
+ "sw $t6, 8(%[dst]) \n"
+ "bgez $t8, 1b \n"
+ " addiu %[dst], %[dst], 12 \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8"
+ );
+}
+
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* t = src_ptr + stride;
+ const int c = 0x2AAA; // ~= 65536 / 6, reciprocal used to average 6 pixels
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
+ "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
+ "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
+ "srl $t4, $t4, 2 \n" // t4 / 4
+ "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
+ "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
+ "addu $t6, $t5, $t6 \n"
+ "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
+ "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
+ "addu $t0, $t0, $t2 \n"
+ "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[t], %[t], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t4, -1(%[dst_ptr]) \n"
+ "sb $t6, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [t] "+r" (t),
+ [dst_width] "+r" (dst_width)
+ : [c] "r" (c)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ stride += stride;
+ const uint8* s2 = src_ptr + stride;
+ const int c1 = 0x1C71; // ~= 65536 / 9, reciprocal used to average 9 pixels
+ const int c2 = 0x2AAA; // ~= 65536 / 6, reciprocal used to average 6 pixels
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
+ "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
+ "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
+ "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
+ "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
+ "raddu.w.qb $t8, $t8 \n" // R5+R4
+ "addu $t7, $t7, $t8 \n"
+ "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
+ "raddu.w.qb $t8, $t8 \n" // R7 + R6
+ "addu $t6, $t6, $t8 \n"
+ "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
+ "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
+ "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
+ "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
+ "addu $t7, $t7, $t8 \n"
+ "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "raddu.w.qb $t4, $t4 \n"
+ "addu $t0, $t0, $t2 \n"
+ "addu $t0, $t0, $t4 \n"
+ "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t7, $t7, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t6, -1(%[dst_ptr]) \n"
+ "sb $t7, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [dst_width] "+r" (dst_width)
+ : [c1] "r" (c1), [c2] "r" (c2)
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/third_party/aom/third_party/libyuv/source/scale_neon.cc b/third_party/aom/third_party/libyuv/source/scale_neon.cc
new file mode 100644
index 000000000..7825878e9
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_neon.cc
@@ -0,0 +1,1037 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Read 32x1, throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
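+
+// The loop above is roughly this scalar sketch (illustrative only):
+//   for (int i = 0; i < dst_width; ++i) dst[i] = src_ptr[2 * i + 1];
+// i.e. it keeps the odd pixel of each horizontal pair.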
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #1 \n"
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ MEMACCESS(1)
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ MEMACCESS(3)
+ "vld1.8 {q1}, [%3]! \n"
+ MEMACCESS(4)
+ "vld1.8 {q2}, [%4]! \n"
+ MEMACCESS(5)
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ MEMACCESS(1)
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc"
+ );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ MEMACCESS(1)
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ );
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+ { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
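+
+// vqrdmulh.s16 is a doubling multiply returning the high half, so scaling a
+// 16-bit sum by 65536/12 (or 65536/18) approximates division by 6 (or 9).
+// Roughly, as a scalar sketch (sum is one 16-bit accumulator; illustrative):
+//   dst = (uint8)((sum * 2 * (65536 / 12)) >> 16); // ~= sum / 6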
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q3}, [%3] \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d4}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ );
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.16 {q13}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {q14}, [%6] \n"
+ MEMACCESS(7)
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ MEMACCESS(4)
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ MEMACCESS(1)
+ "vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+ );
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ MEMACCESS(4)
+ "vld1.16 {q13}, [%4] \n"
+ MEMACCESS(5)
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
+
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ MEMACCESS(1)
+ "vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ const uint8* src_tmp = NULL;
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "mov %0, %1 \n"
+ "mov r12, %5 \n"
+ "veor q2, q2, q2 \n"
+ "veor q3, q3, q3 \n"
+ "2: \n"
+ // load 16 pixels into q0
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0], %3 \n"
+ "vaddw.u8 q3, q3, d1 \n"
+ "vaddw.u8 q2, q2, d0 \n"
+ "subs r12, r12, #1 \n"
+ "bgt 2b \n"
+ MEMACCESS(2)
+ "vst1.16 {q2, q3}, [%2]! \n" // store pixels
+ "add %1, %1, #16 \n"
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_tmp), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_width), // %4
+ "+r"(src_height) // %5
+ :
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_ptr;
+ asm volatile (
+ ".p2align 2 \n"
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q3, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q1, q1, q0 \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "vadd.s32 q2, q1, q3 \n"
+ "vshl.i32 q0, q3, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "vmov q10, q1 \n"
+ "vmov q11, q2 \n"
+ "vuzp.16 q10, q11 \n"
+ "vmovl.u8 q8, d6 \n"
+ "vmovl.u8 q9, d7 \n"
+ "vsubl.s16 q11, d18, d16 \n"
+ "vsubl.s16 q12, d19, d17 \n"
+ "vmovl.u16 q13, d20 \n"
+ "vmovl.u16 q10, d21 \n"
+ "vmul.s32 q11, q11, q13 \n"
+ "vmul.s32 q12, q12, q10 \n"
+ "vshrn.s32 d18, q11, #16 \n"
+ "vshrn.s32 d19, q12, #16 \n"
+ "vadd.s16 q8, q8, q9 \n"
+ "vmovn.s16 d6, q8 \n"
+
+ MEMACCESS(0)
+ "vst1.8 {d6}, [%0]! \n" // store pixels
+ "vadd.s32 q1, q1, q0 \n"
+ "vadd.s32 q2, q2, q0 \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ MEMACCESS(0)
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ );
+}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS(0)
+ "vld2.32 {q0, q1}, [%0]! \n"
+ MEMACCESS(0)
+ "vld2.32 {q2, q3}, [%0]! \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ MEMACCESS(1)
+ "vst1.8 {q3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #1 \n"
+ "vrshrn.u16 d2, q2, #1 \n"
+ "vrshrn.u16 d3, q3, #1 \n"
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %3, lsl #2 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d4}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d5}, [%1], r12 \n"
+ MEMACCESS(0)
+ "vld1.8 {d6}, [%0], r12 \n"
+ MEMACCESS(1)
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(dn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "vld1.32 {"#dn"["#n"]}, [%6] \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ int tmp = 0;
+ const uint8* src_tmp = src_argb;
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ LOAD1_DATA32_LANE(d0, 0)
+ LOAD1_DATA32_LANE(d0, 1)
+ LOAD1_DATA32_LANE(d1, 0)
+ LOAD1_DATA32_LANE(d1, 1)
+ LOAD1_DATA32_LANE(d2, 0)
+ LOAD1_DATA32_LANE(d2, 1)
+ LOAD1_DATA32_LANE(d3, 0)
+ LOAD1_DATA32_LANE(d3, 1)
+
+ MEMACCESS(0)
+ "vst1.32 {q0, q1}, [%0]! \n" // store pixels
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1"
+ );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(dn1, dn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_argb;
+ asm volatile (
+ ".p2align 2 \n"
+ "vdup.32 q0, %3 \n" // x
+ "vdup.32 q1, %4 \n" // dx
+ "vld1.32 {q2}, [%5] \n" // 0 1 2 3
+ "vshl.i32 q9, q1, #2 \n" // 4 * dx
+ "vmul.s32 q1, q1, q2 \n"
+ "vmov.i8 q3, #0x7f \n" // 0x7F
+ "vmov.i16 q15, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "vadd.s32 q8, q1, q0 \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(d0, d2, 0)
+ LOAD2_DATA32_LANE(d0, d2, 1)
+ LOAD2_DATA32_LANE(d1, d3, 0)
+ LOAD2_DATA32_LANE(d1, d3, 1)
+ "vshrn.i32 d22, q8, #9 \n"
+ "vand.16 d22, d22, d30 \n"
+ "vdup.8 d24, d22[0] \n"
+ "vdup.8 d25, d22[2] \n"
+ "vdup.8 d26, d22[4] \n"
+ "vdup.8 d27, d22[6] \n"
+ "vext.8 d4, d24, d25, #4 \n"
+ "vext.8 d5, d26, d27, #4 \n" // f
+ "veor.8 q10, q2, q3 \n" // 0x7f ^ f
+ "vmull.u8 q11, d0, d20 \n"
+ "vmull.u8 q12, d1, d21 \n"
+ "vmull.u8 q13, d2, d4 \n"
+ "vmull.u8 q14, d3, d5 \n"
+ "vadd.i16 q11, q11, q13 \n"
+ "vadd.i16 q12, q12, q14 \n"
+ "vshrn.i16 d0, q11, #7 \n"
+ "vshrn.i16 d1, q12, #7 \n"
+
+ MEMACCESS(0)
+ "vst1.32 {d0, d1}, [%0]! \n" // store pixels
+ "vadd.s32 q8, q8, q9 \n"
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
+
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_neon64.cc b/third_party/aom/third_party/libyuv/source/scale_neon64.cc
new file mode 100644
index 000000000..1d5519357
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_neon64.cc
@@ -0,0 +1,1042 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon armv8 64 bit.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 32x1, throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ MEMACCESS(0)
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+ "rshrn2 v0.16b, v1.8h, #1 \n"
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
+ );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ MEMACCESS(1)
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ MEMACCESS(2)
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ MEMACCESS(3)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ MEMACCESS(4)
+ "ld1 {v2.16b}, [%3], #16 \n"
+ MEMACCESS(5)
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "uadalp v0.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n"
+ "uadalp v0.8h, v3.16b \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ MEMACCESS(1)
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+//  to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
+ MEMACCESS(1)
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ MEMACCESS(3)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
+
+ // 3 * line_0 + line_1
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
+
+ MEMACCESS(1)
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+ "v20", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ MEMACCESS(3)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+ // average src line 0 with src line 1
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
+
+ MEMACCESS(1)
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
+ );
+}
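
Both 3/4 box filters above reduce every 4 source pixels to 3 outputs with the weights noted in the comments (3:1, 1:1, 1:3, all rounded). A scalar sketch of that horizontal pass, applied after the two source rows have been blended; the helper name and standard C types are illustrative, not libyuv API:

#include <stdint.h>

// 4 source pixels -> 3 output pixels, rounded (matches uqrshrn #2 / urhadd).
static void ScaleRowDown34_horizontal_sketch(const uint8_t* s, uint8_t* d,
                                             int dst_width) {
  for (int x = 0; x < dst_width; x += 3, s += 4, d += 3) {
    d[0] = (uint8_t)((s[0] * 3 + s[1] + 2) >> 2);
    d[1] = (uint8_t)((s[1] + s[2] + 1) >> 1);
    d[2] = (uint8_t)((s[2] + s[3] * 3 + 2) >> 2);
  }
}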
+
+static uvec8 kShuf38 =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+ { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ MEMACCESS(3)
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ MEMACCESS(1)
+ "st1 {v2.8b}, [%1], #8 \n"
+ MEMACCESS(1)
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "v0", "v1", "v2", "v3", "memory", "cc"
+ );
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ ptrdiff_t tmp_src_stride = src_stride;
+
+ asm volatile (
+ MEMACCESS(5)
+ "ld1 {v29.8h}, [%5] \n"
+ MEMACCESS(6)
+ "ld1 {v30.16b}, [%6] \n"
+ MEMACCESS(7)
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ MEMACCESS(3)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ MEMACCESS(4)
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
+
+    // Shuffle the input data around to align it
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+
+ // combine source lines
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // combine source lines
+ "add v0.8h, v0.8h, v16.8h \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+
+ // 0+1+2, 3+4+5
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
+
+    // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+
+ MEMACCESS(1)
+ "st1 {v3.8b}, [%1], #8 \n"
+ MEMACCESS(1)
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+ "v30", "v31", "memory", "cc"
+ );
+}
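
As the comments above note, the per-column sums are divided by 6 or 9 by multiplying with 65536/12 or 65536/18 and taking the upper 16 bits: sqrdmulh computes roughly (2*a*b + 32768) >> 16, so a constant of 65536/(2n) approximates a division by n. A small scalar sketch of the trick (illustrative only, not libyuv API):

#include <stdint.h>

// Approximate a / n via a 16-bit reciprocal, mirroring what sqrdmulh does
// with the kMult38_Div6 / kMult38_Div9 constants (n = 6 or 9).
static int16_t divide_by_reciprocal_sketch(int16_t a, int n) {
  int16_t recip = (int16_t)(65536 / (2 * n));      // e.g. 65536/12 for n = 6
  int32_t prod = 2 * (int32_t)a * recip + 32768;   // doubling multiply, rounded
  return (int16_t)(prod >> 16);
}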
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ // TODO(fbarchard): use src_stride directly for clang 3.5+.
+ ptrdiff_t tmp_src_stride = src_stride;
+ asm volatile (
+ MEMACCESS(4)
+ "ld1 {v30.8h}, [%4] \n"
+ MEMACCESS(5)
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ MEMACCESS(3)
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
+
+    // Shuffle the input data around to align it
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+
+ // combine source lines
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "uqrshrn v2.8b, v2.8h, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+
+ // combine source lines
+ "uaddl v0.8h, v0.8b, v4.8b \n"
+
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
+
+ // 0+1+2, 3+4+5
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
+
+    // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+
+ MEMACCESS(1)
+ "st1 {v3.8b}, [%1], #8 \n"
+ MEMACCESS(1)
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v30", "v31", "memory", "cc"
+ );
+}
+
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ const uint8* src_tmp = NULL;
+ asm volatile (
+ "1: \n"
+ "mov %0, %1 \n"
+ "mov w12, %w5 \n"
+ "eor v2.16b, v2.16b, v2.16b \n"
+ "eor v3.16b, v3.16b, v3.16b \n"
+ "2: \n"
+ // load 16 pixels into q0
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %3 \n"
+ "uaddw2 v3.8h, v3.8h, v0.16b \n"
+ "uaddw v2.8h, v2.8h, v0.8b \n"
+ "subs w12, w12, #1 \n"
+ "b.gt 2b \n"
+ MEMACCESS(2)
+ "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
+ "add %1, %1, #16 \n"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "+r"(src_tmp), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_ptr), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_width), // %4
+ "+r"(src_height) // %5
+ :
+ : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
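
ScaleAddRows above sums src_height rows of bytes column-wise into 16-bit totals. A scalar sketch with standard C types (names are illustrative):

#include <stddef.h>
#include <stdint.h>

// Column-wise sum of src_height rows of bytes into 16-bit accumulators.
static void ScaleAddRows_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                uint16_t* dst_ptr, int src_width,
                                int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16_t sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum = (uint16_t)(sum + src_ptr[y * src_stride + x]);
    }
    dst_ptr[x] = sum;
  }
}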
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld2 {v4.b, v5.b}["#n"], [%6] \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_ptr;
+ int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64) x;
+ int64 dx64 = (int64) dx;
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v1.4s, v1.4s, v0.4s \n"
+ // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+ "add v2.4s, v1.4s, v3.4s \n"
+ "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+ "1: \n"
+ LOAD2_DATA8_LANE(0)
+ LOAD2_DATA8_LANE(1)
+ LOAD2_DATA8_LANE(2)
+ LOAD2_DATA8_LANE(3)
+ LOAD2_DATA8_LANE(4)
+ LOAD2_DATA8_LANE(5)
+ LOAD2_DATA8_LANE(6)
+ LOAD2_DATA8_LANE(7)
+ "mov v6.16b, v1.16b \n"
+ "mov v7.16b, v2.16b \n"
+ "uzp1 v6.8h, v6.8h, v7.8h \n"
+ "ushll v4.8h, v4.8b, #0 \n"
+ "ushll v5.8h, v5.8b, #0 \n"
+ "ssubl v16.4s, v5.4h, v4.4h \n"
+ "ssubl2 v17.4s, v5.8h, v4.8h \n"
+ "ushll v7.4s, v6.4h, #0 \n"
+ "ushll2 v6.4s, v6.8h, #0 \n"
+ "mul v16.4s, v16.4s, v7.4s \n"
+ "mul v17.4s, v17.4s, v6.4s \n"
+ "shrn v6.4h, v16.4s, #16 \n"
+ "shrn2 v6.8h, v17.4s, #16 \n"
+ "add v4.8h, v4.8h, v6.8h \n"
+ "xtn v4.8b, v4.8h \n"
+
+ MEMACCESS(0)
+ "st1 {v4.8b}, [%0], #8 \n" // store pixels
+ "add v1.4s, v1.4s, v0.4s \n"
+ "add v2.4s, v2.4s, v0.4s \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width64), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3",
+ "v4", "v5", "v6", "v7", "v16", "v17"
+ );
+}
+
+#undef LOAD2_DATA8_LANE
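
ScaleFilterCols above walks the source in 16.16 fixed point: for each output pixel it loads the pair of source bytes at x >> 16 and blends them by the low 16 bits of x. A scalar sketch of that arithmetic, assuming standard C types:

#include <stdint.h>

// 16.16 fixed-point horizontal linear filter (matches the NEON math:
// dst = a + (((b - a) * frac) >> 16)).
static void ScaleFilterCols_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                   int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    int xi = x >> 16;          // integer source position
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    int frac = x & 0xffff;     // fractional part of x
    dst_ptr[i] = (uint8_t)(a + (((b - a) * frac) >> 16));
    x += dx;
  }
}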
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ int y_fraction = 256 - source_y_fraction;
+ asm volatile (
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
+
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
+
+ "99: \n"
+ MEMACCESS(0)
+ "st1 {v0.b}[15], [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction),// %4
+ "+r"(y_fraction) // %5
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
+ );
+}
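
Aside from the special-cased fractions (0, 64, 128, 192), the general path above blends the two rows with 8-bit weights: dst = (row0 * (256 - f) + row1 * f + 128) >> 8. A scalar sketch of the general case, using standard C types:

#include <stddef.h>
#include <stdint.h>

// General vertical blend of two rows by source_y_fraction (0..255).
static void ScaleFilterRows_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  const uint8_t* row1 = src_ptr + src_stride;
  int f1 = source_y_fraction;        // weight of the second row
  int f0 = 256 - source_y_fraction;  // weight of the first row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * f0 + row1[x] * f1 + 128) >> 8);
  }
}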
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ MEMACCESS (0)
+ "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
+ MEMACCESS (0)
+ "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ MEMACCESS (1)
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ MEMACCESS (1)
+ "st1 {v3.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r" (src_ptr), // %0
+ "+r" (dst), // %1
+ "+r" (dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS (0)
+ // load 8 ARGB pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #1 \n"
+ "rshrn v2.8b, v2.8h, #1 \n"
+ "rshrn v3.8b, v3.8h, #1 \n"
+ MEMACCESS (1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ MEMACCESS (0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS (1)
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ MEMACCESS (2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r" (src_ptr), // %0
+ "+r" (src_stride), // %1
+ "+r" (dst), // %2
+ "+r" (dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx, uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ MEMACCESS(1)
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((int64)(src_stepx * 4)) // %3
+ : "memory", "cc", "v0"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
+// It could be upgraded to 8 pixels at a time to start with.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "add %1, %1, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
+ MEMACCESS(1)
+ "ld1 {v1.8b}, [%1], %4 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %4 \n"
+ MEMACCESS(1)
+ "ld1 {v3.8b}, [%1], %4 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %4 \n"
+ MEMACCESS(1)
+ "ld1 {v5.8b}, [%1], %4 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %4 \n"
+ MEMACCESS(1)
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ MEMACCESS(2)
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"((int64)(src_stepx * 4)) // %4
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
+ );
+}
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(vn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld1 {"#vn".s}["#n"], [%6] \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint8* src_tmp = src_argb;
+ int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64) x;
+ int64 dx64 = (int64) dx;
+ int64 tmp64 = 0;
+ asm volatile (
+ "1: \n"
+ LOAD1_DATA32_LANE(v0, 0)
+ LOAD1_DATA32_LANE(v0, 1)
+ LOAD1_DATA32_LANE(v0, 2)
+ LOAD1_DATA32_LANE(v0, 3)
+ LOAD1_DATA32_LANE(v1, 0)
+ LOAD1_DATA32_LANE(v1, 1)
+ LOAD1_DATA32_LANE(v1, 2)
+ LOAD1_DATA32_LANE(v1, 3)
+
+ MEMACCESS(0)
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width64), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp64), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1"
+ );
+}
+
+#undef LOAD1_DATA32_LANE
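
ScaleARGBCols above is nearest-pixel column sampling: each output ARGB pixel is copied from the source pixel at x >> 16, and x advances by dx. A scalar sketch treating each ARGB pixel as one 32-bit word (illustrative types, not the libyuv prototype):

#include <stdint.h>

// Point-sampled ARGB column scaling with a 16.16 fixed-point step.
static void ScaleARGBCols_sketch(uint32_t* dst_argb, const uint32_t* src_argb,
                                 int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    dst_argb[i] = src_argb[x >> 16];  // pick the pixel x currently points at
    x += dx;
  }
}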
+
+// TODO(Yang Zhang): Investigate fewer load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ MEMACCESS(6) \
+ "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ int dx_offset[4] = {0, 1, 2, 3};
+ int* tmp = dx_offset;
+ const uint8* src_tmp = src_argb;
+ int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+ int64 x64 = (int64) x;
+ int64 dx64 = (int64) dx;
+ asm volatile (
+ "dup v0.4s, %w3 \n" // x
+ "dup v1.4s, %w4 \n" // dx
+ "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+ "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+ "mul v1.4s, v1.4s, v2.4s \n"
+ "movi v3.16b, #0x7f \n" // 0x7F
+ "movi v4.8h, #0x7f \n" // 0x7F
+ // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+ "add v5.4s, v1.4s, v0.4s \n"
+ "1: \n"
+ // d0, d1: a
+ // d2, d3: b
+ LOAD2_DATA32_LANE(v0, v1, 0)
+ LOAD2_DATA32_LANE(v0, v1, 1)
+ LOAD2_DATA32_LANE(v0, v1, 2)
+ LOAD2_DATA32_LANE(v0, v1, 3)
+ "shrn v2.4h, v5.4s, #9 \n"
+ "and v2.8b, v2.8b, v4.8b \n"
+ "dup v16.8b, v2.b[0] \n"
+ "dup v17.8b, v2.b[2] \n"
+ "dup v18.8b, v2.b[4] \n"
+ "dup v19.8b, v2.b[6] \n"
+ "ext v2.8b, v16.8b, v17.8b, #4 \n"
+ "ext v17.8b, v18.8b, v19.8b, #4 \n"
+ "ins v2.d[1], v17.d[0] \n" // f
+ "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
+ "umull v16.8h, v0.8b, v7.8b \n"
+ "umull2 v17.8h, v0.16b, v7.16b \n"
+ "umull v18.8h, v1.8b, v2.8b \n"
+ "umull2 v19.8h, v1.16b, v2.16b \n"
+ "add v16.8h, v16.8h, v18.8h \n"
+ "add v17.8h, v17.8h, v19.8h \n"
+ "shrn v0.8b, v16.8h, #7 \n"
+ "shrn2 v0.16b, v17.8h, #7 \n"
+
+ MEMACCESS(0)
+ "st1 {v0.4s}, [%0], #16 \n" // store pixels
+ "add v5.4s, v5.4s, v6.4s \n"
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width64), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "+r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+ "v6", "v7", "v16", "v17", "v18", "v19"
+ );
+}
+
+#undef LOAD2_DATA32_LANE
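
The ARGB filter above keeps a 7-bit fraction, f = (x >> 9) & 0x7f, and blends the two neighbouring pixels per channel as (a * (127 - f) + b * f) >> 7 (the 0x7f ^ f step is the 127 - f inversion). A per-channel scalar sketch under those assumptions:

#include <stdint.h>

// ARGB bilinear column filter with 7-bit fractions, one byte channel at a time.
static void ScaleARGBFilterCols_sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                       int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;               // 7-bit fraction of x
    const uint8_t* a = src_argb + xi * 4;  // pixel at floor(x)
    const uint8_t* b = a + 4;              // next pixel
    for (int c = 0; c < 4; ++c) {          // B, G, R, A
      dst_argb[i * 4 + c] = (uint8_t)((a[c] * (127 - f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}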
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/scale_win.cc b/third_party/aom/third_party/libyuv/source/scale_win.cc
new file mode 100644
index 000000000..c3896ebad
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/scale_win.cc
@@ -0,0 +1,1354 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+ defined(_MSC_VER) && !defined(__clang__)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant for the 3/4 box filters (added before the >> 2)
+static vec16 kRound34 =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+__declspec(naked)
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x1 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_SCALEROWDOWN2_AVX2
+// Reads 64 pixels, throws half away and writes 32 pixels.
+__declspec(naked)
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// Blends 64x1 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+
+ vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
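
The AVX2 linear path above uses vpmaddubsw with a vector of 1s to sum adjacent byte pairs into words, then vpavgw against zero to halve with rounding, i.e. each output is (a + b + 1) >> 1. A scalar sketch of that result (illustrative helper, not libyuv API):

#include <stdint.h>

// Scalar equivalent of the 2->1 linear filter: rounded average of byte pairs.
static void ScaleRowDown2Linear_sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                       int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}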
+
+// Blends 64x2 rectangle to 32x1.
+__declspec(naked)
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b
+ vpsrlw ymm4, ymm4, 15
+ vpackuswb ymm4, ymm4, ymm4
+ vpxor ymm5, ymm5, ymm5 // constant 0
+
+ wloop:
+ vmovdqu ymm0, [eax] // average rows
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+
+ vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
+ vpavgw ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN2_AVX2
+
+// Point samples 32 pixels to 8 pixels.
+__declspec(naked)
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
+ psrld xmm5, 24
+ pslld xmm5, 16
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x4 rectangle to 8x1.
+__declspec(naked)
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
+ psrlw xmm7, 8
+
+ wloop:
+ movdqu xmm0, [eax] // average rows
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ movdqu xmm2, [eax + esi * 2]
+ movdqu xmm3, [eax + esi * 2 + 16]
+ movdqu xmm4, [eax + edi]
+ movdqu xmm5, [eax + edi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm2, xmm4
+ pavgb xmm3, xmm5
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm7
+ pand xmm3, xmm7
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
+ psrlw xmm0, 8
+ pand xmm2, xmm7
+ pavgw xmm0, xmm2
+ packuswb xmm0, xmm0
+
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 8
+ jg wloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_SCALEROWDOWN4_AVX2
+// Point samples 64 pixels to 16 pixels.
+__declspec(naked)
+void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000
+ vpsrld ymm5, ymm5, 24
+ vpslld ymm5, ymm5, 16
+
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vpsrlw ymm0, ymm0, 8
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+ vmovdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+
+// Blends 64x4 rectangle to 16x1.
+__declspec(naked)
+void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
+ vpsrlw ymm7, ymm7, 8
+
+ wloop:
+ vmovdqu ymm0, [eax] // average rows
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vmovdqu ymm2, [eax + esi * 2]
+ vmovdqu ymm3, [eax + esi * 2 + 32]
+ vpavgb ymm2, ymm2, [eax + edi]
+ vpavgb ymm3, ymm3, [eax + edi + 32]
+ lea eax, [eax + 64]
+ vpavgb ymm0, ymm0, ymm2
+ vpavgb ymm1, ymm1, ymm3
+
+ vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
+ vpand ymm3, ymm1, ymm7
+ vpsrlw ymm0, ymm0, 8
+ vpsrlw ymm1, ymm1, 8
+ vpavgw ymm0, ymm0, ymm2
+ vpavgw ymm1, ymm1, ymm3
+ vpackuswb ymm0, ymm0, ymm1
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
+ vpsrlw ymm0, ymm0, 8
+ vpavgw ymm0, ymm0, ymm2
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+
+ vmovdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg wloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEROWDOWN4_AVX2
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+__declspec(naked)
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm3, kShuf0
+ movdqa xmm4, kShuf1
+ movdqa xmm5, kShuf2
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm1
+ palignr xmm1, xmm0, 8
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 24
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ wloop:
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ sub ecx, 24
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+__declspec(naked)
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ wloop:
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ sub ecx, 24
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked)
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, kShuf38a
+ movdqa xmm5, kShuf38b
+
+ xloop:
+ movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
+ paddusb xmm0, xmm1
+
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0
+ movd [edx + 8], xmm1
+ lea edx, [edx + 12]
+ sub ecx, 12
+ jg xloop
+
+ ret
+ }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAc
+ movdqa xmm3, kShufAc3
+ movdqa xmm4, kScaleAc33
+ pxor xmm5, xmm5
+
+ xloop:
+ movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqu xmm6, [eax + esi]
+ movhlps xmm1, xmm0
+ movhlps xmm7, xmm6
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqu xmm6, [eax + esi * 2]
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
+
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
+
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
+
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6
+ lea edx, [edx + 6]
+ sub ecx, 6
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked)
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAb0
+ movdqa xmm3, kShufAb1
+ movdqa xmm4, kShufAb2
+ movdqa xmm5, kScaleAb2
+
+ xloop:
+ movdqu xmm0, [eax] // average 2 rows into xmm0
+ movdqu xmm1, [eax + esi]
+ lea eax, [eax + 16]
+ pavgb xmm0, xmm1
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3
+ paddusw xmm1, xmm6
+ pshufb xmm0, xmm4
+ paddusw xmm1, xmm0
+
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
+
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1
+ lea edx, [edx + 6]
+ sub ecx, 6
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ pxor xmm5, xmm5
+
+ // sum rows
+ xloop:
+ movdqu xmm3, [eax] // read 16 bytes
+ lea eax, [eax + 16]
+ movdqu xmm0, [edx] // read 16 words from destination
+ movdqu xmm1, [edx + 16]
+ movdqa xmm2, xmm3
+ punpcklbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3
+ movdqu [edx], xmm0 // write 16 words to destination
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 16
+ jg xloop
+ ret
+ }
+}
+
+#ifdef HAS_SCALEADDROW_AVX2
+// Reads 32 bytes and accumulates to 32 shorts at a time.
+__declspec(naked)
+void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ mov edx, [esp + 8] // dst_ptr
+ mov ecx, [esp + 12] // src_width
+ vpxor ymm5, ymm5, ymm5
+
+ // sum rows
+ xloop:
+ vmovdqu ymm3, [eax] // read 32 bytes
+ lea eax, [eax + 32]
+ vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck
+ vpunpcklbw ymm2, ymm3, ymm5
+ vpunpckhbw ymm3, ymm3, ymm5
+ vpaddusw ymm0, ymm2, [edx] // sum 16 words
+ vpaddusw ymm1, ymm3, [edx + 32]
+ vmovdqu [edx], ymm0 // write 32 words to destination
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 32
+ jg xloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SCALEADDROW_AVX2
+
+// Bilinear column filtering. SSSE3 version.
+__declspec(naked)
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov edi, [esp + 12 + 4] // dst_ptr
+ mov esi, [esp + 12 + 8] // src_ptr
+ mov ecx, [esp + 12 + 12] // dst_width
+ movd xmm2, [esp + 12 + 16] // x
+ movd xmm3, [esp + 12 + 20] // dx
+ mov eax, 0x04040000 // shuffle to line up fractions with pixel.
+ movd xmm5, eax
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm1, 9 // 7 bit fractions.
+ movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
+ movd xmm4, ebx
+ pshufb xmm1, xmm5 // 0011
+ punpcklwd xmm0, xmm4
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // 8 bits, 2 pixels.
+ movd ebx, xmm0
+ mov [edi], bx
+ lea edi, [edi + 2]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ xloop29:
+
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm2, 9 // 7 bit fractions.
+ pshufb xmm2, xmm5 // 0011
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // 16 bit
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // 8 bits
+ movd ebx, xmm0
+ mov [edi], bl
+
+ xloop99:
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+__declspec(naked)
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_ptr
+ mov eax, [esp + 8] // src_ptr
+ mov ecx, [esp + 12] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg wloop
+
+ ret
+ }
+}
+
+// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
+__declspec(naked)
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0xdd
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x1 rectangle to 4x1.
+__declspec(naked)
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1.
+__declspec(naked)
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels at a time.
+__declspec(naked)
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_argb
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ wloop:
+ movd xmm0, [eax]
+ movd xmm1, [eax + ebx]
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2]
+ movd xmm3, [eax + edi]
+ lea eax, [eax + ebx * 4]
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
+
+// Blends four 2x2 to 4x1.
+__declspec(naked)
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_argb
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4]
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Column scaling unfiltered. SSE2 version.
+__declspec(naked)
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push edi
+ push esi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+
+ pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
+ pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
+ paddd xmm2, xmm0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 2
+ pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
+ paddd xmm2, xmm0 // x3 x2 x1 x0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 4
+ pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
+
+ pextrw eax, xmm2, 1 // get x0 integer.
+ pextrw edx, xmm2, 3 // get x1 integer.
+
+ cmp ecx, 0
+ jle xloop99
+ sub ecx, 4
+ jl xloop49
+
+ // 4 Pixel loop.
+ xloop4:
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ pextrw edx, xmm2, 7 // get x3 integer.
+ paddd xmm2, xmm3 // x += dx
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movd xmm1, [esi + eax * 4] // 1 source x2 pixels
+ movd xmm4, [esi + edx * 4] // 1 source x3 pixels
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ punpckldq xmm1, xmm4 // x2 x3
+ punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4 // 4 pixels
+ jge xloop4
+
+ xloop49:
+ test ecx, 2
+ je xloop29
+
+ // 2 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+
+ xloop29:
+ test ecx, 1
+ je xloop99
+
+    // 1 Pixel.
+ movd xmm0, [esi + eax * 4] // 1 source x2 pixels
+ movd dword ptr [edi], xmm0
+ xloop99:
+
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked)
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+ movdqa xmm4, kShuffleColARGB
+ movdqa xmm5, kShuffleFractions
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ psrlw xmm1, 9 // 7 bit fractions.
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
+ pshufb xmm1, xmm5 // 0000000011111111
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ xloop29:
+
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ psrlw xmm2, 9 // 7 bit fractions.
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ pshufb xmm2, xmm5 // 00000000
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
+ movd [edi], xmm0
+
+ xloop99:
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+__declspec(naked)
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ mov edx, [esp + 4] // dst_argb
+ mov eax, [esp + 8] // src_argb
+ mov ecx, [esp + 12] // dst_width
+
+ wloop:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm0
+ punpckhdq xmm1, xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg wloop
+
+ ret
+ }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked)
+int FixedDiv_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ idiv dword ptr [esp + 8]
+ ret
+ }
+}
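+
+// C equivalent sketch: FixedDiv_X86(num, div) == (int)(((int64_t)num << 16) / div).
+// e.g. FixedDiv_X86(1, 3) -> 21845 == 0x5555, i.e. 1/3 in 16.16 fixed point.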
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+__declspec(naked)
+int FixedDiv1_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ mov ecx, [esp + 8] // denom
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ sub eax, 0x00010001
+ sbb edx, 0
+ sub ecx, 1
+ idiv ecx
+ ret
+ }
+}
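+
+// C equivalent sketch: FixedDiv1_X86(num, div) ==
+//   (int)((((int64_t)num << 16) - 0x00010001) / (div - 1)),
+// roughly (num - 1) / (div - 1) in 16.16 fixed point, biased down by one unit
+// so accumulated positions stay a hair below the last source pixel.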
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/aom/third_party/libyuv/source/video_common.cc b/third_party/aom/third_party/libyuv/source/video_common.cc
new file mode 100644
index 000000000..379a0669a
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/video_common.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
+
+struct FourCCAliasEntry {
+ uint32 alias;
+ uint32 canonical;
+};
+
+static const struct FourCCAliasEntry kFourCCAliases[] = {
+ {FOURCC_IYUV, FOURCC_I420},
+ {FOURCC_YU16, FOURCC_I422},
+ {FOURCC_YU24, FOURCC_I444},
+ {FOURCC_YUYV, FOURCC_YUY2},
+ {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
+ {FOURCC_HDYC, FOURCC_UYVY},
+ {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
+ {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
+ {FOURCC_DMB1, FOURCC_MJPG},
+ {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
+ {FOURCC_RGB3, FOURCC_RAW },
+ {FOURCC_BGR3, FOURCC_24BG},
+ {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
+ {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB
+ {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
+ {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
+ {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
+};
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
+
+LIBYUV_API
+uint32 CanonicalFourCC(uint32 fourcc) {
+ int i;
+ for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+ if (kFourCCAliases[i].alias == fourcc) {
+ return kFourCCAliases[i].canonical;
+ }
+ }
+ // Not an alias, so return it as-is.
+ return fourcc;
+}
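+
+// Usage sketch: CanonicalFourCC(FOURCC_YUYV) returns FOURCC_YUY2 via the alias
+// table above, while a fourcc with no alias entry (e.g. FOURCC_I420) is
+// returned unchanged.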
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/third_party/aom/third_party/libyuv/source/x86inc.asm b/third_party/aom/third_party/libyuv/source/x86inc.asm
new file mode 100644
index 000000000..cb5c32df3
--- /dev/null
+++ b/third_party/aom/third_party/libyuv/source/x86inc.asm
@@ -0,0 +1,1136 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2012 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Henrik Gramner <hengar-6@student.ltu.se>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+; Local changes for libyuv:
+; remove %define program_name and references in labels
+; rename cpus to uppercase
+
+%define WIN64 0
+%define UNIX64 0
+%if ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %else
+ %define UNIX64 1
+ %endif
+%endif
+
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+%macro SECTION_RODATA 0-1 16
+ %ifidn __OUTPUT_FORMAT__,macho64
+ SECTION .text align=%1
+ %elifidn __OUTPUT_FORMAT__,macho
+ SECTION .text align=%1
+ fakegot:
+ %elifidn __OUTPUT_FORMAT__,aout
+ section .text
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+; aout does not support align=
+%macro SECTION_TEXT 0-1 16
+ %ifidn __OUTPUT_FORMAT__,aout
+ SECTION .text
+ %else
+ SECTION .text align=%1
+ %endif
+%endmacro
+
+%if WIN64
+ %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+ %undef PIC
+%endif
+%ifdef PIC
+ default rel
+%endif
+
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
+; which are slow when a normal ret follows a branch.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
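+
+; Illustrative sketch (hypothetical function, not defined in this file):
+;   cglobal add_one, 2,3,0, dst, src, tmp
+;       mov  tmpd, [srcq]
+;       add  tmpd, 1
+;       mov  [dstq], tmpd
+;       RET
+; After PROLOGUE the named args work on every ABI: on x86_64 they live in the
+; argument registers, on x86_32 they are loaded from the stack as needed.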
+
+%macro DECLARE_REG 2-3
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %if %0 == 2
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif ARCH_X86_64 ; memory
+ %define r%1m [rsp + stack_offset + %3]
+ %define r%1mp qword r %+ %1m
+ %else
+ %define r%1m [esp + stack_offset + %3]
+ %define r%1mp dword r %+ %1m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+%if ARCH_X86_64 == 0
+ %define r%1 e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro PUSH 1
+ push %1
+ %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+ pop %1
+ %assign stack_offset stack_offset-gprsize
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assert failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ %if mmsize == 8
+ %assign xmm_regs_used 0
+ %else
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
+ %if xmm_regs_used > 6
+ SUB rsp, (xmm_regs_used-6)*16+16
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+ %if xmm_regs_used > 6
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+ %endrep
+ add %1, (xmm_regs_used-6)*16+16
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1
+ WIN64_RESTORE_XMM_INTERNAL %1
+ %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+
+%macro RET 0
+ WIN64_RESTORE_XMM_INTERNAL rsp
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32
+
+%macro RET 0
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [esp + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ ASSERT regs_used >= num_args
+ PUSH_IF_USED 3, 4, 5, 6
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32
+
+%macro RET 0
+ POP_IF_USED 6, 5, 4, 3
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
+%macro REP_RET 0
+ %if has_epilogue
+ RET
+ %else
+ rep ret
+ %endif
+%endmacro
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+ %if has_epilogue
+ call %1
+ RET
+ %elif %2
+ jmp %1
+ %endif
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+ cglobal_internal %1 %+ SUFFIX
+%else
+ cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+%macro cglobal_internal 1-2+
+ %ifndef cglobaled_%1
+ %xdefine %1 mangle(%1)
+ %xdefine %1.skip_prologue %1 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %1, 1
+ %endif
+ %xdefine current_function %1
+ %ifidn __OUTPUT_FORMAT__,elf
+ global %1:function hidden
+ %else
+ global %1
+ %endif
+ align function_align
+ %1:
+ RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %assign stack_offset 0
+ %if %0 > 1
+ PROLOGUE %2
+ %endif
+%endmacro
+
+%macro cextern 1
+ %xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+ %xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 2+
+ %xdefine %1 mangle(%1)
+ global %1
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX (1<<0)
+%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4 (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3 (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32 (1<<16)
+%assign cpuflags_cache64 (1<<17)
+%assign cpuflags_slowctz (1<<18)
+%assign cpuflags_lzcnt (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<22)
+%assign cpuflags_bmi1 (1<<23)
+%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm (1<<25)|cpuflags_bmi1
+
+%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-2
+ %if %0 >= 1
+ %xdefine cpuname %1
+ %assign cpuflags cpuflags_%1
+ %if %0 >= 2
+ %xdefine cpuname %1_%2
+ %assign cpuflags cpuflags | cpuflags_%2
+ %endif
+ %xdefine SUFFIX _ %+ cpuname
+ %if cpuflag(AVX)
+ %assign AVX_enabled 1
+ %endif
+ %if mmsize == 16 && notcpuflag(SSE2)
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elifidn %1, SSE3
+ %define movu lddqu
+ %endif
+ %else
+ %xdefine SUFFIX
+ %undef cpuname
+ %undef cpuflags
+ %endif
+%endmacro
+
+; merge MMX and SSE*
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+
+ %assign AVX_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define num_mmregs 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ %assign %%i 0
+ %rep 8
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %rep 8
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nmm, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+
+ %assign AVX_enabled 0
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nxmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+
+ %assign AVX_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova vmovaps
+ %define movu vmovups
+ %undef movh
+ %define movnta vmovntps
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, ymm %+ %%i
+ CAT_XDEFINE nymm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
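+
+; e.g. after "SWAP 0, 1" the names m0 and m1 simply refer to each other's
+; previous registers, so a macro that leaves its result in m1 can end with
+; SWAP 0, 1 and callers keep reading the result from m0, at zero runtime cost.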
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+ %xdefine tmp%2 m%2
+ %xdefine ntmp%2 nm%2
+ %rotate 2
+%endrep
+%rep %0/2
+ %xdefine m%1 tmp%2
+ %xdefine nm%1 ntmp%2
+ %undef tmp%2
+ %undef ntmp%2
+ %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+ %xdefine tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 tmp
+ CAT_XDEFINE n, m%1, %1
+ CAT_XDEFINE n, m%2, %2
+%else
+ ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
+ ; Be careful using this mode in nested macros though, as in some cases there may be
+ ; other copies of m# that have already been dereferenced and don't get updated correctly.
+ %xdefine %%n1 n %+ %1
+ %xdefine %%n2 n %+ %2
+ %xdefine tmp m %+ %%n1
+ CAT_XDEFINE m, %%n1, m %+ %%n2
+ CAT_XDEFINE m, %%n2, tmp
+ CAT_XDEFINE n, m %+ %%n1, %%n1
+ CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+ %undef tmp
+ %rotate 1
+%endrep
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE %%f, %%i, m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+ %ifdef %1_m0
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE n, m %+ %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+ call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %1
+ %ifndef cglobaled_%1
+ %ifdef cglobaled_%2
+ %xdefine %%i %2
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
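+
+; Rationale sketch: immediates in [-128, 127] fit the sign-extended imm8 form,
+; so "sub reg, -128" assembles shorter than "add reg, 128" (which needs a full
+; imm32) while computing the same value.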
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+ %if i < 8
+ CAT_XDEFINE sizeofmm, i, 8
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-AVX emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == number of operands given
+;%5+: operands
+%macro RUN_AVX_INSTR 6-7+
+ %ifid %6
+ %define %%sizeofreg sizeof%6
+ %elifid %5
+ %define %%sizeofreg sizeof%5
+ %else
+ %define %%sizeofreg mmsize
+ %endif
+ %if %%sizeofreg==32
+ %if %4>=3
+ v%1 %5, %6, %7
+ %else
+ v%1 %5, %6
+ %endif
+ %else
+ %if %%sizeofreg==8
+ %define %%regmov movq
+ %elif %2
+ %define %%regmov movaps
+ %else
+ %define %%regmov movdqa
+ %endif
+
+ %if %4>=3+%3
+ %ifnidn %5, %6
+ %if AVX_enabled && %%sizeofreg==16
+ v%1 %5, %6, %7
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+ %%regmov %5, %6
+ %1 %5, %7
+ %endif
+ %else
+ %1 %5, %7
+ %endif
+ %elif %4>=3
+ %1 %5, %6, %7
+ %else
+ %1 %5, %6
+ %endif
+ %endif
+%endmacro
+
+; 3arg AVX ops with a memory arg can only have it in src2,
+; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
+; So, if the op is symmetric and the wrong one is memory, swap them.
+%macro RUN_AVX_INSTR1 8
+ %assign %%swap 0
+ %if AVX_enabled
+ %ifnid %6
+ %assign %%swap 1
+ %endif
+ %elifnidn %5, %6
+ %ifnid %7
+ %assign %%swap 1
+ %endif
+ %endif
+ %if %%swap && %3 == 0 && %8 == 1
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+ %else
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+ %endif
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 4
+ %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
+ %ifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
+
+AVX_INSTR addpd, 1, 0, 1
+AVX_INSTR addps, 1, 0, 1
+AVX_INSTR addsd, 1, 0, 1
+AVX_INSTR addss, 1, 0, 1
+AVX_INSTR addsubpd, 1, 0, 0
+AVX_INSTR addsubps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
+AVX_INSTR andnpd, 1, 0, 0
+AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR blendpd, 1, 0, 0
+AVX_INSTR blendps, 1, 0, 0
+AVX_INSTR blendvpd, 1, 0, 0
+AVX_INSTR blendvps, 1, 0, 0
+AVX_INSTR cmppd, 1, 0, 0
+AVX_INSTR cmpps, 1, 0, 0
+AVX_INSTR cmpsd, 1, 0, 0
+AVX_INSTR cmpss, 1, 0, 0
+AVX_INSTR cvtdq2ps, 1, 0, 0
+AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR divpd, 1, 0, 0
+AVX_INSTR divps, 1, 0, 0
+AVX_INSTR divsd, 1, 0, 0
+AVX_INSTR divss, 1, 0, 0
+AVX_INSTR dppd, 1, 1, 0
+AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR haddpd, 1, 0, 0
+AVX_INSTR haddps, 1, 0, 0
+AVX_INSTR hsubpd, 1, 0, 0
+AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR maxpd, 1, 0, 1
+AVX_INSTR maxps, 1, 0, 1
+AVX_INSTR maxsd, 1, 0, 1
+AVX_INSTR maxss, 1, 0, 1
+AVX_INSTR minpd, 1, 0, 1
+AVX_INSTR minps, 1, 0, 1
+AVX_INSTR minsd, 1, 0, 1
+AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movss, 1, 0, 0
+AVX_INSTR mpsadbw, 0, 1, 0
+AVX_INSTR mulpd, 1, 0, 1
+AVX_INSTR mulps, 1, 0, 1
+AVX_INSTR mulsd, 1, 0, 1
+AVX_INSTR mulss, 1, 0, 1
+AVX_INSTR orpd, 1, 0, 1
+AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR packsswb, 0, 0, 0
+AVX_INSTR packssdw, 0, 0, 0
+AVX_INSTR packuswb, 0, 0, 0
+AVX_INSTR packusdw, 0, 0, 0
+AVX_INSTR paddb, 0, 0, 1
+AVX_INSTR paddw, 0, 0, 1
+AVX_INSTR paddd, 0, 0, 1
+AVX_INSTR paddq, 0, 0, 1
+AVX_INSTR paddsb, 0, 0, 1
+AVX_INSTR paddsw, 0, 0, 1
+AVX_INSTR paddusb, 0, 0, 1
+AVX_INSTR paddusw, 0, 0, 1
+AVX_INSTR palignr, 0, 1, 0
+AVX_INSTR pand, 0, 0, 1
+AVX_INSTR pandn, 0, 0, 0
+AVX_INSTR pavgb, 0, 0, 1
+AVX_INSTR pavgw, 0, 0, 1
+AVX_INSTR pblendvb, 0, 0, 0
+AVX_INSTR pblendw, 0, 1, 0
+AVX_INSTR pcmpestri, 0, 0, 0
+AVX_INSTR pcmpestrm, 0, 0, 0
+AVX_INSTR pcmpistri, 0, 0, 0
+AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pcmpeqb, 0, 0, 1
+AVX_INSTR pcmpeqw, 0, 0, 1
+AVX_INSTR pcmpeqd, 0, 0, 1
+AVX_INSTR pcmpeqq, 0, 0, 1
+AVX_INSTR pcmpgtb, 0, 0, 0
+AVX_INSTR pcmpgtw, 0, 0, 0
+AVX_INSTR pcmpgtd, 0, 0, 0
+AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR phaddw, 0, 0, 0
+AVX_INSTR phaddd, 0, 0, 0
+AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phsubw, 0, 0, 0
+AVX_INSTR phsubd, 0, 0, 0
+AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pmaddwd, 0, 0, 1
+AVX_INSTR pmaddubsw, 0, 0, 0
+AVX_INSTR pmaxsb, 0, 0, 1
+AVX_INSTR pmaxsw, 0, 0, 1
+AVX_INSTR pmaxsd, 0, 0, 1
+AVX_INSTR pmaxub, 0, 0, 1
+AVX_INSTR pmaxuw, 0, 0, 1
+AVX_INSTR pmaxud, 0, 0, 1
+AVX_INSTR pminsb, 0, 0, 1
+AVX_INSTR pminsw, 0, 0, 1
+AVX_INSTR pminsd, 0, 0, 1
+AVX_INSTR pminub, 0, 0, 1
+AVX_INSTR pminuw, 0, 0, 1
+AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
+AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhw, 0, 0, 1
+AVX_INSTR pmullw, 0, 0, 1
+AVX_INSTR pmulld, 0, 0, 1
+AVX_INSTR pmuludq, 0, 0, 1
+AVX_INSTR pmuldq, 0, 0, 1
+AVX_INSTR por, 0, 0, 1
+AVX_INSTR psadbw, 0, 0, 1
+AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR psignb, 0, 0, 0
+AVX_INSTR psignw, 0, 0, 0
+AVX_INSTR psignd, 0, 0, 0
+AVX_INSTR psllw, 0, 0, 0
+AVX_INSTR pslld, 0, 0, 0
+AVX_INSTR psllq, 0, 0, 0
+AVX_INSTR pslldq, 0, 0, 0
+AVX_INSTR psraw, 0, 0, 0
+AVX_INSTR psrad, 0, 0, 0
+AVX_INSTR psrlw, 0, 0, 0
+AVX_INSTR psrld, 0, 0, 0
+AVX_INSTR psrlq, 0, 0, 0
+AVX_INSTR psrldq, 0, 0, 0
+AVX_INSTR psubb, 0, 0, 0
+AVX_INSTR psubw, 0, 0, 0
+AVX_INSTR psubd, 0, 0, 0
+AVX_INSTR psubq, 0, 0, 0
+AVX_INSTR psubsb, 0, 0, 0
+AVX_INSTR psubsw, 0, 0, 0
+AVX_INSTR psubusb, 0, 0, 0
+AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR punpckhbw, 0, 0, 0
+AVX_INSTR punpckhwd, 0, 0, 0
+AVX_INSTR punpckhdq, 0, 0, 0
+AVX_INSTR punpckhqdq, 0, 0, 0
+AVX_INSTR punpcklbw, 0, 0, 0
+AVX_INSTR punpcklwd, 0, 0, 0
+AVX_INSTR punpckldq, 0, 0, 0
+AVX_INSTR punpcklqdq, 0, 0, 0
+AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR subpd, 1, 0, 0
+AVX_INSTR subps, 1, 0, 0
+AVX_INSTR subsd, 1, 0, 0
+AVX_INSTR subss, 1, 0, 0
+AVX_INSTR unpckhpd, 1, 0, 0
+AVX_INSTR unpckhps, 1, 0, 0
+AVX_INSTR unpcklpd, 1, 0, 0
+AVX_INSTR unpcklps, 1, 0, 0
+AVX_INSTR xorpd, 1, 0, 1
+AVX_INSTR xorps, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 1, 0, 1
+AVX_INSTR pfsub, 1, 0, 0
+AVX_INSTR pfmul, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %else
+ %6 %1, %2, %3
+ %7 %1, %4
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
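+
+; e.g. "pmacsdd m0, m1, m2, m3" emits the single XOP instruction vpmacsdd when
+; cpuflag(xop) is set, and otherwise expands to "pmulld m0, m1, m2" followed by
+; "paddd m0, m3" (m0 = m1 * m2 + m3).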
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf
diff --git a/third_party/aom/third_party/vector/LICENSE b/third_party/aom/third_party/vector/LICENSE
new file mode 100644
index 000000000..afcb9f00a
--- /dev/null
+++ b/third_party/aom/third_party/vector/LICENSE
@@ -0,0 +1,19 @@
+The MIT License (MIT)
+Copyright (c) 2016 Peter Goldsborough
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/third_party/aom/third_party/vector/README.libaom b/third_party/aom/third_party/vector/README.libaom
new file mode 100644
index 000000000..2bb8b2d5d
--- /dev/null
+++ b/third_party/aom/third_party/vector/README.libaom
@@ -0,0 +1,14 @@
+Name: vector
+URL: https://github.com/goldsborough/vector
+Version: commit-id: 40efe82
+License: MIT
+License File: LICENSE
+
+Description:
+A feature-complete, generic and customizable resizable
+array implementation in pure C that supports almost
+the entire C++ std::vector API, including iterators.
+
+Local Modifications:
+Renamed some functions to fit in with the AOMedia
+naming convention.
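+
+Usage sketch (illustrative only; the element type and values are arbitrary):
+  Vector v = VECTOR_INITIALIZER;
+  int x = 42;
+  aom_vector_setup(&v, 8, sizeof(int));
+  aom_vector_push_back(&v, &x);
+  int y = VECTOR_GET_AS(int, &v, 0);  /* y == 42 */
+  aom_vector_destroy(&v);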
diff --git a/third_party/aom/third_party/vector/vector.c b/third_party/aom/third_party/vector/vector.c
new file mode 100644
index 000000000..fe46246a1
--- /dev/null
+++ b/third_party/aom/third_party/vector/vector.c
@@ -0,0 +1,543 @@
+/*
+The MIT License(MIT)
+Copyright(c) 2016 Peter Goldsborough
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files(the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions :
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#define __STDC_WANT_LIB_EXT1__ 1
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/vector/vector.h"
+
+int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size) {
+ assert(vector != NULL);
+
+ if (vector == NULL) return VECTOR_ERROR;
+
+ vector->size = 0;
+ vector->capacity = MAX(VECTOR_MINIMUM_CAPACITY, capacity);
+ vector->element_size = element_size;
+ vector->data = malloc(vector->capacity * element_size);
+
+ return vector->data == NULL ? VECTOR_ERROR : VECTOR_SUCCESS;
+}
+
+int aom_vector_copy(Vector *destination, Vector *source) {
+ assert(destination != NULL);
+ assert(source != NULL);
+ assert(aom_vector_is_initialized(source));
+ assert(!aom_vector_is_initialized(destination));
+
+ if (destination == NULL) return VECTOR_ERROR;
+ if (source == NULL) return VECTOR_ERROR;
+ if (aom_vector_is_initialized(destination)) return VECTOR_ERROR;
+ if (!aom_vector_is_initialized(source)) return VECTOR_ERROR;
+
+ /* Copy ALL the data */
+ destination->size = source->size;
+ destination->capacity = source->size * 2;
+ destination->element_size = source->element_size;
+
+ /* Note that we are not necessarily allocating the same capacity */
+ destination->data = malloc(destination->capacity * source->element_size);
+ if (destination->data == NULL) return VECTOR_ERROR;
+
+ memcpy(destination->data, source->data, aom_vector_byte_size(source));
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_copy_assign(Vector *destination, Vector *source) {
+ assert(destination != NULL);
+ assert(source != NULL);
+ assert(aom_vector_is_initialized(source));
+ assert(aom_vector_is_initialized(destination));
+
+ if (destination == NULL) return VECTOR_ERROR;
+ if (source == NULL) return VECTOR_ERROR;
+ if (!aom_vector_is_initialized(destination)) return VECTOR_ERROR;
+ if (!aom_vector_is_initialized(source)) return VECTOR_ERROR;
+
+ aom_vector_destroy(destination);
+
+ return aom_vector_copy(destination, source);
+}
+
+int aom_vector_move(Vector *destination, Vector *source) {
+ assert(destination != NULL);
+ assert(source != NULL);
+
+ if (destination == NULL) return VECTOR_ERROR;
+ if (source == NULL) return VECTOR_ERROR;
+
+ *destination = *source;
+ source->data = NULL;
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_move_assign(Vector *destination, Vector *source) {
+ aom_vector_swap(destination, source);
+ return aom_vector_destroy(source);
+}
+
+int aom_vector_swap(Vector *destination, Vector *source) {
+ void *temp;
+
+ assert(destination != NULL);
+ assert(source != NULL);
+ assert(aom_vector_is_initialized(source));
+ assert(aom_vector_is_initialized(destination));
+
+ if (destination == NULL) return VECTOR_ERROR;
+ if (source == NULL) return VECTOR_ERROR;
+ if (!aom_vector_is_initialized(destination)) return VECTOR_ERROR;
+ if (!aom_vector_is_initialized(source)) return VECTOR_ERROR;
+
+ _vector_swap(&destination->size, &source->size);
+ _vector_swap(&destination->capacity, &source->capacity);
+ _vector_swap(&destination->element_size, &source->element_size);
+
+ temp = destination->data;
+ destination->data = source->data;
+ source->data = temp;
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_destroy(Vector *vector) {
+ assert(vector != NULL);
+
+ if (vector == NULL) return VECTOR_ERROR;
+
+ free(vector->data);
+ vector->data = NULL;
+
+ return VECTOR_SUCCESS;
+}
+
+/* Insertion */
+int aom_vector_push_back(Vector *vector, void *element) {
+ assert(vector != NULL);
+ assert(element != NULL);
+
+ if (_vector_should_grow(vector)) {
+ if (_vector_adjust_capacity(vector) == VECTOR_ERROR) {
+ return VECTOR_ERROR;
+ }
+ }
+
+ _vector_assign(vector, vector->size, element);
+
+ ++vector->size;
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_push_front(Vector *vector, void *element) {
+ return aom_vector_insert(vector, 0, element);
+}
+
+int aom_vector_insert(Vector *vector, size_t index, void *element) {
+ void *offset;
+
+ assert(vector != NULL);
+ assert(element != NULL);
+ assert(index <= vector->size);
+
+ if (vector == NULL) return VECTOR_ERROR;
+ if (element == NULL) return VECTOR_ERROR;
+ if (vector->element_size == 0) return VECTOR_ERROR;
+ if (index > vector->size) return VECTOR_ERROR;
+
+ if (_vector_should_grow(vector)) {
+ if (_vector_adjust_capacity(vector) == VECTOR_ERROR) {
+ return VECTOR_ERROR;
+ }
+ }
+
+ /* Move other elements to the right */
+ if (_vector_move_right(vector, index) == VECTOR_ERROR) {
+ return VECTOR_ERROR;
+ }
+
+ /* Insert the element */
+ offset = _vector_offset(vector, index);
+ memcpy(offset, element, vector->element_size);
+ ++vector->size;
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_assign(Vector *vector, size_t index, void *element) {
+ assert(vector != NULL);
+ assert(element != NULL);
+ assert(index < vector->size);
+
+ if (vector == NULL) return VECTOR_ERROR;
+ if (element == NULL) return VECTOR_ERROR;
+ if (vector->element_size == 0) return VECTOR_ERROR;
+ if (index >= vector->size) return VECTOR_ERROR;
+
+ _vector_assign(vector, index, element);
+
+ return VECTOR_SUCCESS;
+}
+
+/* Deletion */
+int aom_vector_pop_back(Vector *vector) {
+ assert(vector != NULL);
+ assert(vector->size > 0);
+
+ if (vector == NULL) return VECTOR_ERROR;
+ if (vector->element_size == 0) return VECTOR_ERROR;
+
+ --vector->size;
+
+#ifndef VECTOR_NO_SHRINK
+ if (_vector_should_shrink(vector)) {
+ _vector_adjust_capacity(vector);
+ }
+#endif
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_pop_front(Vector *vector) { return aom_vector_erase(vector, 0); }
+
+int aom_vector_erase(Vector *vector, size_t index) {
+ assert(vector != NULL);
+ assert(index < vector->size);
+
+ if (vector == NULL) return VECTOR_ERROR;
+ if (vector->element_size == 0) return VECTOR_ERROR;
+ if (index >= vector->size) return VECTOR_ERROR;
+
+ /* Just overwrite */
+ _vector_move_left(vector, index);
+
+#ifndef VECTOR_NO_SHRINK
+ if (--vector->size == vector->capacity / 4) {
+ _vector_adjust_capacity(vector);
+ }
+#endif
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_clear(Vector *vector) { return aom_vector_resize(vector, 0); }
+
+/* Lookup */
+void *aom_vector_get(Vector *vector, size_t index) {
+ assert(vector != NULL);
+ assert(index < vector->size);
+
+ if (vector == NULL) return NULL;
+ if (vector->element_size == 0) return NULL;
+ if (index >= vector->size) return NULL;
+
+ return _vector_offset(vector, index);
+}
+
+const void *aom_vector_const_get(const Vector *vector, size_t index) {
+ assert(vector != NULL);
+ assert(index < vector->size);
+
+ if (vector == NULL) return NULL;
+ if (vector->element_size == 0) return NULL;
+ if (index >= vector->size) return NULL;
+
+ return _vector_const_offset(vector, index);
+}
+
+void *aom_vector_front(Vector *vector) { return aom_vector_get(vector, 0); }
+
+void *aom_vector_back(Vector *vector) {
+ return aom_vector_get(vector, vector->size - 1);
+}
+
+/* Information */
+
+bool aom_vector_is_initialized(const Vector *vector) {
+ return vector->data != NULL;
+}
+
+size_t aom_vector_byte_size(const Vector *vector) {
+ return vector->size * vector->element_size;
+}
+
+size_t aom_vector_free_space(const Vector *vector) {
+ return vector->capacity - vector->size;
+}
+
+bool aom_vector_is_empty(const Vector *vector) { return vector->size == 0; }
+
+/* Memory management */
+int aom_vector_resize(Vector *vector, size_t new_size) {
+ if (new_size <= vector->capacity * VECTOR_SHRINK_THRESHOLD) {
+ vector->size = new_size;
+ if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) == -1) {
+ return VECTOR_ERROR;
+ }
+ } else if (new_size > vector->capacity) {
+ if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) == -1) {
+ return VECTOR_ERROR;
+ }
+ }
+
+ vector->size = new_size;
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_reserve(Vector *vector, size_t minimum_capacity) {
+ if (minimum_capacity > vector->capacity) {
+ if (_vector_reallocate(vector, minimum_capacity) == VECTOR_ERROR) {
+ return VECTOR_ERROR;
+ }
+ }
+
+ return VECTOR_SUCCESS;
+}
+
+int aom_vector_shrink_to_fit(Vector *vector) {
+ return _vector_reallocate(vector, vector->size);
+}
+
+/* Iterators */
+Iterator aom_vector_begin(Vector *vector) { return aom_vector_iterator(vector, 0); }
+
+Iterator aom_vector_end(Vector *vector) {
+ return aom_vector_iterator(vector, vector->size);
+}
+
+Iterator aom_vector_iterator(Vector *vector, size_t index) {
+ Iterator iterator = { NULL, 0 };
+
+ assert(vector != NULL);
+ assert(index <= vector->size);
+
+ if (vector == NULL) return iterator;
+ if (index > vector->size) return iterator;
+ if (vector->element_size == 0) return iterator;
+
+ iterator.pointer = _vector_offset(vector, index);
+ iterator.element_size = vector->element_size;
+
+ return iterator;
+}
+
+void *iterator_get(Iterator *iterator) { return iterator->pointer; }
+
+int iterator_erase(Vector *vector, Iterator *iterator) {
+ size_t index = iterator_index(vector, iterator);
+
+ if (aom_vector_erase(vector, index) == VECTOR_ERROR) {
+ return VECTOR_ERROR;
+ }
+
+ *iterator = aom_vector_iterator(vector, index);
+
+ return VECTOR_SUCCESS;
+}
+
+void iterator_increment(Iterator *iterator) {
+ assert(iterator != NULL);
+ // iterator->pointer += iterator->element_size;
+ iterator->pointer =
+ (unsigned char *)iterator->pointer + iterator->element_size;
+}
+
+void iterator_decrement(Iterator *iterator) {
+ assert(iterator != NULL);
+ // iterator->pointer -= iterator->element_size;
+ iterator->pointer =
+ (unsigned char *)iterator->pointer - iterator->element_size;
+}
+
+void *iterator_next(Iterator *iterator) {
+ void *current = iterator->pointer;
+ iterator_increment(iterator);
+
+ return current;
+}
+
+void *iterator_previous(Iterator *iterator) {
+ void *current = iterator->pointer;
+ iterator_decrement(iterator);
+
+ return current;
+}
+
+bool iterator_equals(Iterator *first, Iterator *second) {
+ assert(first->element_size == second->element_size);
+ return first->pointer == second->pointer;
+}
+
+bool iterator_is_before(Iterator *first, Iterator *second) {
+ assert(first->element_size == second->element_size);
+ return first->pointer < second->pointer;
+}
+
+bool iterator_is_after(Iterator *first, Iterator *second) {
+ assert(first->element_size == second->element_size);
+ return first->pointer > second->pointer;
+}
+
+size_t iterator_index(Vector *vector, Iterator *iterator) {
+ assert(vector != NULL);
+ assert(iterator != NULL);
+ // return (iterator->pointer - vector->data) / vector->element_size;
+ return ((unsigned char *)iterator->pointer - (unsigned char *)vector->data) /
+ vector->element_size;
+}
+
+/***** PRIVATE *****/
+
+bool _vector_should_grow(Vector *vector) {
+ assert(vector->size <= vector->capacity);
+ return vector->size == vector->capacity;
+}
+
+bool _vector_should_shrink(Vector *vector) {
+ assert(vector->size <= vector->capacity);
+ return vector->size == vector->capacity * VECTOR_SHRINK_THRESHOLD;
+}
+
+size_t _vector_free_bytes(const Vector *vector) {
+ return aom_vector_free_space(vector) * vector->element_size;
+}
+
+void *_vector_offset(Vector *vector, size_t index) {
+ // return vector->data + (index * vector->element_size);
+ return (unsigned char *)vector->data + (index * vector->element_size);
+}
+
+const void *_vector_const_offset(const Vector *vector, size_t index) {
+ // return vector->data + (index * vector->element_size);
+ return (unsigned char *)vector->data + (index * vector->element_size);
+}
+
+void _vector_assign(Vector *vector, size_t index, void *element) {
+ /* Insert the element */
+ void *offset = _vector_offset(vector, index);
+ memcpy(offset, element, vector->element_size);
+}
+
+int _vector_move_right(Vector *vector, size_t index) {
+ assert(vector->size < vector->capacity);
+
+ /* The location where to start to move from. */
+ void *offset = _vector_offset(vector, index);
+
+ /* How many to move to the right. */
+ size_t elements_in_bytes = (vector->size - index) * vector->element_size;
+
+#ifdef __STDC_LIB_EXT1__
+ size_t right_capacity_in_bytes =
+ (vector->capacity - (index + 1)) * vector->element_size;
+
+ /* clang-format off */
+ int return_code = memmove_s(
+ offset + vector->element_size,
+ right_capacity_in_bytes,
+ offset,
+ elements_in_bytes);
+
+ /* clang-format on */
+
+ return return_code == 0 ? VECTOR_SUCCESS : VECTOR_ERROR;
+
+#else
+ // memmove(offset + vector->element_size, offset, elements_in_bytes);
+ memmove((unsigned char *)offset + vector->element_size, offset,
+ elements_in_bytes);
+ return VECTOR_SUCCESS;
+#endif
+}
+
+void _vector_move_left(Vector *vector, size_t index) {
+ size_t right_elements_in_bytes;
+ void *offset;
+
+ /* The offset into the memory */
+ offset = _vector_offset(vector, index);
+
+ /* How many to move to the left */
+ right_elements_in_bytes = (vector->size - index - 1) * vector->element_size;
+
+ // memmove(offset, offset + vector->element_size, right_elements_in_bytes);
+ memmove(offset, (unsigned char *)offset + vector->element_size,
+ right_elements_in_bytes);
+}
+
+int _vector_adjust_capacity(Vector *vector) {
+ return _vector_reallocate(vector,
+ MAX(1, vector->size * VECTOR_GROWTH_FACTOR));
+}
+
+int _vector_reallocate(Vector *vector, size_t new_capacity) {
+ size_t new_capacity_in_bytes;
+ void *old;
+ assert(vector != NULL);
+
+ if (new_capacity < VECTOR_MINIMUM_CAPACITY) {
+ if (vector->capacity > VECTOR_MINIMUM_CAPACITY) {
+ new_capacity = VECTOR_MINIMUM_CAPACITY;
+ } else {
+ /* NO-OP */
+ return VECTOR_SUCCESS;
+ }
+ }
+
+ new_capacity_in_bytes = new_capacity * vector->element_size;
+ old = vector->data;
+
+ if ((vector->data = malloc(new_capacity_in_bytes)) == NULL) {
+ return VECTOR_ERROR;
+ }
+
+#ifdef __STDC_LIB_EXT1__
+ /* clang-format off */
+ if (memcpy_s(vector->data,
+ new_capacity_in_bytes,
+ old,
+ aom_vector_byte_size(vector)) != 0) {
+ return VECTOR_ERROR;
+ }
+/* clang-format on */
+#else
+ memcpy(vector->data, old, aom_vector_byte_size(vector));
+#endif
+
+ vector->capacity = new_capacity;
+
+ free(old);
+
+ return VECTOR_SUCCESS;
+}
+
+void _vector_swap(size_t *first, size_t *second) {
+ size_t temp = *first;
+ *first = *second;
+ *second = temp;
+}
diff --git a/third_party/aom/third_party/vector/vector.h b/third_party/aom/third_party/vector/vector.h
new file mode 100644
index 000000000..02743f5f1
--- /dev/null
+++ b/third_party/aom/third_party/vector/vector.h
@@ -0,0 +1,159 @@
+/*
+The MIT License(MIT)
+Copyright(c) 2016 Peter Goldsborough
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files(the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions :
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef VECTOR_H
+#define VECTOR_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+/***** DEFINITIONS *****/
+
+#define VECTOR_MINIMUM_CAPACITY 2
+#define VECTOR_GROWTH_FACTOR 2
+#define VECTOR_SHRINK_THRESHOLD (1 / 4)
+
+#define VECTOR_ERROR -1
+#define VECTOR_SUCCESS 0
+
+#define VECTOR_UNINITIALIZED NULL
+#define VECTOR_INITIALIZER \
+ { 0, 0, 0, VECTOR_UNINITIALIZED }
+
+/***** STRUCTURES *****/
+
+typedef struct Vector {
+ size_t size;
+ size_t capacity;
+ size_t element_size;
+
+ void *data;
+} Vector;
+
+typedef struct Iterator {
+ void *pointer;
+ size_t element_size;
+} Iterator;
+
+/***** METHODS *****/
+
+/* Constructor */
+int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size);
+
+/* Copy Constructor */
+int aom_vector_copy(Vector *destination, Vector *source);
+
+/* Copy Assignment */
+int aom_vector_copy_assign(Vector *destination, Vector *source);
+
+/* Move Constructor */
+int aom_vector_move(Vector *destination, Vector *source);
+
+/* Move Assignment */
+int aom_vector_move_assign(Vector *destination, Vector *source);
+
+int aom_vector_swap(Vector *destination, Vector *source);
+
+/* Destructor */
+int aom_vector_destroy(Vector *vector);
+
+/* Insertion */
+int aom_vector_push_back(Vector *vector, void *element);
+int aom_vector_push_front(Vector *vector, void *element);
+int aom_vector_insert(Vector *vector, size_t index, void *element);
+int aom_vector_assign(Vector *vector, size_t index, void *element);
+
+/* Deletion */
+int aom_vector_pop_back(Vector *vector);
+int aom_vector_pop_front(Vector *vector);
+int aom_vector_erase(Vector *vector, size_t index);
+int aom_vector_clear(Vector *vector);
+
+/* Lookup */
+void *aom_vector_get(Vector *vector, size_t index);
+const void *aom_vector_const_get(const Vector *vector, size_t index);
+void *aom_vector_front(Vector *vector);
+void *aom_vector_back(Vector *vector);
+#define VECTOR_GET_AS(type, aom_vector_pointer, index) \
+ *((type *)aom_vector_get((aom_vector_pointer), (index)))
+
+/* Information */
+bool aom_vector_is_initialized(const Vector *vector);
+size_t aom_vector_byte_size(const Vector *vector);
+size_t aom_vector_free_space(const Vector *vector);
+bool aom_vector_is_empty(const Vector *vector);
+
+/* Memory management */
+int aom_vector_resize(Vector *vector, size_t new_size);
+int aom_vector_reserve(Vector *vector, size_t minimum_capacity);
+int aom_vector_shrink_to_fit(Vector *vector);
+
+/* Iterators */
+Iterator aom_vector_begin(Vector *vector);
+Iterator aom_vector_end(Vector *vector);
+Iterator aom_vector_iterator(Vector *vector, size_t index);
+
+void *iterator_get(Iterator *iterator);
+#define ITERATOR_GET_AS(type, iterator) *((type *)iterator_get((iterator)))
+
+int iterator_erase(Vector *vector, Iterator *iterator);
+
+void iterator_increment(Iterator *iterator);
+void iterator_decrement(Iterator *iterator);
+
+void *iterator_next(Iterator *iterator);
+void *iterator_previous(Iterator *iterator);
+
+bool iterator_equals(Iterator *first, Iterator *second);
+bool iterator_is_before(Iterator *first, Iterator *second);
+bool iterator_is_after(Iterator *first, Iterator *second);
+
+size_t iterator_index(Vector *vector, Iterator *iterator);
+
+#define VECTOR_FOR_EACH(aom_vector_pointer, iterator_name) \
+ for (Iterator(iterator_name) = aom_vector_begin((aom_vector_pointer)), \
+ end = aom_vector_end((aom_vector_pointer)); \
+ !iterator_equals(&(iterator_name), &end); \
+ iterator_increment(&(iterator_name)))
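+
+/* Usage sketch (hypothetical variable names):
+ *
+ *   VECTOR_FOR_EACH(&v, it) {
+ *     int value = ITERATOR_GET_AS(int, &it);
+ *     ...
+ *   }
+ *
+ * iterates every element of `v` in order; both `it` and `end` are declared by
+ * the macro itself.
+ */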
+
+/***** PRIVATE *****/
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+bool _vector_should_grow(Vector *vector);
+bool _vector_should_shrink(Vector *vector);
+
+size_t _vector_free_bytes(const Vector *vector);
+void *_vector_offset(Vector *vector, size_t index);
+const void *_vector_const_offset(const Vector *vector, size_t index);
+
+void _vector_assign(Vector *vector, size_t index, void *element);
+
+int _vector_move_right(Vector *vector, size_t index);
+void _vector_move_left(Vector *vector, size_t index);
+
+int _vector_adjust_capacity(Vector *vector);
+int _vector_reallocate(Vector *vector, size_t new_capacity);
+
+void _vector_swap(size_t *first, size_t *second);
+
+#endif /* VECTOR_H */
diff --git a/third_party/aom/third_party/x86inc/LICENSE b/third_party/aom/third_party/x86inc/LICENSE
new file mode 100644
index 000000000..7d07645a1
--- /dev/null
+++ b/third_party/aom/third_party/x86inc/LICENSE
@@ -0,0 +1,18 @@
+Copyright (C) 2005-2012 x264 project
+
+Authors: Loren Merritt <lorenm@u.washington.edu>
+ Anton Mitrofanov <BugMaster@narod.ru>
+ Jason Garrett-Glaser <darkshikari@gmail.com>
+ Henrik Gramner <hengar-6@student.ltu.se>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/third_party/aom/third_party/x86inc/README.libaom b/third_party/aom/third_party/x86inc/README.libaom
new file mode 100644
index 000000000..07c4dad20
--- /dev/null
+++ b/third_party/aom/third_party/x86inc/README.libaom
@@ -0,0 +1,20 @@
+URL: https://git.videolan.org/git/x264.git
+Version: d23d18655249944c1ca894b451e2c82c7a584c62
+License: ISC
+License File: LICENSE
+
+Description:
+x264/libav's framework for x86 assembly. Contains a variety of macros and
+defines that help automatically allow assembly to work cross-platform.
+
+Local Modifications:
+Get configuration from aom_config.asm.
+Prefix functions with aom by default.
+Manage name mangling (prefixing with '_') manually because 'PREFIX' does not
+ exist in libaom.
+Expand PIC default to macho64 and respect CONFIG_PIC from libaom
+Set 'private_extern' visibility for macho targets.
+Copy PIC 'GLOBAL' macros from x86_abi_support.asm
+Use .text instead of .rodata on macho to avoid broken tables in PIC mode.
+Use .text with no alignment for aout
+Only use 'hidden' visibility with Chromium
diff --git a/third_party/aom/third_party/x86inc/x86inc.asm b/third_party/aom/third_party/x86inc/x86inc.asm
new file mode 100644
index 000000000..adaf2d99e
--- /dev/null
+++ b/third_party/aom/third_party/x86inc/x86inc.asm
@@ -0,0 +1,1649 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2016 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Fiona Glaser <fiona@x264.com>
+;* Henrik Gramner <henrik@gramner.com>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+%include "config/aom_config.asm"
+
+%ifndef private_prefix
+ %define private_prefix aom
+%endif
+
+%ifndef public_prefix
+ %define public_prefix private_prefix
+%endif
+
+%ifndef STACK_ALIGNMENT
+ %if ARCH_X86_64
+ %define STACK_ALIGNMENT 16
+ %else
+ %define STACK_ALIGNMENT 4
+ %endif
+%endif
+
+%define WIN64 0
+%define UNIX64 0
+%if ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,x64
+ %define WIN64 1
+ %else
+ %define UNIX64 1
+ %endif
+%endif
+
+%define FORMAT_ELF 0
+%ifidn __OUTPUT_FORMAT__,elf
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf32
+ %define FORMAT_ELF 1
+%elifidn __OUTPUT_FORMAT__,elf64
+ %define FORMAT_ELF 1
+%endif
+
+%define FORMAT_MACHO 0
+%ifidn __OUTPUT_FORMAT__,macho32
+ %define FORMAT_MACHO 1
+%elifidn __OUTPUT_FORMAT__,macho64
+ %define FORMAT_MACHO 1
+%endif
+
+; Set PREFIX for libaom builds.
+%if FORMAT_ELF
+ %undef PREFIX
+%elif WIN64
+ %undef PREFIX
+%else
+ %define PREFIX
+%endif
+
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; In some instances macho32 tables get misaligned when using .rodata.
+; When looking at the disassembly it appears that the offset is either
+; correct or consistently off by 90. Placing them in the .text section
+; works around the issue. It appears to be specific to the way libaom
+; handles the tables.
+%macro SECTION_RODATA 0-1 16
+ %ifidn __OUTPUT_FORMAT__,macho32
+ SECTION .text align=%1
+ fakegot:
+ %elifidn __OUTPUT_FORMAT__,aout
+ SECTION .text
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+; PIC macros are copied from aom_ports/x86_abi_support.asm. The "define PIC"
+; from the original code is added in for 64-bit.
+%ifidn __OUTPUT_FORMAT__,elf32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,macho32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,win32
+%define ABI_IS_32BIT 1
+%elifidn __OUTPUT_FORMAT__,aout
+%define ABI_IS_32BIT 1
+%else
+%define ABI_IS_32BIT 0
+%endif
+
+%if ABI_IS_32BIT
+ %if CONFIG_PIC=1
+ %ifidn __OUTPUT_FORMAT__,elf32
+ %define GET_GOT_DEFINED 1
+ %define WRT_PLT wrt ..plt
+ %macro GET_GOT 1
+ extern _GLOBAL_OFFSET_TABLE_
+ push %1
+ call %%get_got
+ %%sub_offset:
+ jmp %%exitGG
+ %%get_got:
+ mov %1, [esp]
+ add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc
+ ret
+ %%exitGG:
+ %undef GLOBAL
+ %define GLOBAL(x) x + %1 wrt ..gotoff
+ %undef RESTORE_GOT
+ %define RESTORE_GOT pop %1
+ %endmacro
+ %elifidn __OUTPUT_FORMAT__,macho32
+ %define GET_GOT_DEFINED 1
+ %macro GET_GOT 1
+ push %1
+ call %%get_got
+ %%get_got:
+ pop %1
+ %undef GLOBAL
+ %define GLOBAL(x) x + %1 - %%get_got
+ %undef RESTORE_GOT
+ %define RESTORE_GOT pop %1
+ %endmacro
+ %else
+ %define GET_GOT_DEFINED 0
+ %endif
+ %endif
+
+ %if ARCH_X86_64 == 0
+ %undef PIC
+ %endif
+
+%else
+ %macro GET_GOT 1
+ %endmacro
+ %define GLOBAL(x) rel x
+ %define WRT_PLT wrt ..plt
+
+ %if WIN64
+ %define PIC
+ %elifidn __OUTPUT_FORMAT__,macho64
+ %define PIC
+ %elif CONFIG_PIC
+ %define PIC
+ %endif
+%endif
+
+%ifnmacro GET_GOT
+ %macro GET_GOT 1
+ %endmacro
+ %define GLOBAL(x) x
+%endif
+%ifndef RESTORE_GOT
+ %define RESTORE_GOT
+%endif
+%ifndef WRT_PLT
+ %define WRT_PLT
+%endif
+
+%ifdef PIC
+ default rel
+%endif
+
+%ifndef GET_GOT_DEFINED
+ %define GET_GOT_DEFINED 0
+%endif
+; Done with PIC macros
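As an editorial aside (not part of the patch), here is a minimal sketch of how a routine might use the PIC helpers above; the file, symbol and function names are invented, and it assumes x86inc.asm and the generated config/aom_config.asm are on the assembler's include path:

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
ones_dw: times 4 dd 1

SECTION .text
INIT_XMM sse2
; Write one 16-byte vector of dword 1s to dst.
cglobal store_ones, 1, 2, 1, dst
    GET_GOT     r1                    ; 32-bit PIC builds: load the GOT/PC base into r1
    movu        m0, [GLOBAL(ones_dw)] ; GLOBAL() applies the PIC fixup where needed
    RESTORE_GOT                       ; restore r1 (expands to nothing elsewhere)
    movu        [dstq], m0
    RET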
+
+%ifdef __NASM_VER__
+ %use smartalign
+%endif
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+; allocating the specified stack size. If the required stack alignment is
+; larger than the known stack alignment the stack will be manually aligned
+; and an extra register will be allocated to hold the original stack
+; pointer (to not invalidate r0m etc.). To prevent the use of an extra
+; register as stack pointer, request a negative stack size.
+; %4+/%5+ = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we would need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Use this instead of RET if it's a branch target.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
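To make the conventions above concrete, here is an editorial sketch (not part of the patch) of a complete leaf function written against this header; the names are invented, and it assumes x86inc.asm and config/aom_config.asm are on the include path and that n is a positive multiple of 4:

%include "third_party/x86inc/x86inc.asm"

SECTION .text

INIT_XMM sse2
; Sum n dwords starting at src; the result is returned in eax.
cglobal sum_u32, 2, 2, 2, src, n
    pxor    m0, m0
.loop:
    movu    m1, [srcq]
    paddd   m0, m1
    add     srcq, mmsize
    sub     nd, 4
    jg      .loop
    pshufd  m1, m0, q1032            ; fold the four lanes down to one
    paddd   m0, m1
    pshufd  m1, m0, q2301
    paddd   m0, m1
    movd    eax, m0
    RET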
+%macro DECLARE_REG 2-3
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %define %2q %2
+ %if %0 == 2
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif ARCH_X86_64 ; memory
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp qword r %+ %1 %+ m
+ %else
+ %define r%1m [rstk + stack_offset + %3]
+ %define r%1mp dword r %+ %1 %+ m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+ %if ARCH_X86_64 == 0
+ %define r%1 e%1
+ %endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
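Editorial note (not part of the patch): for example, "DECLARE_REG_TMP 4, 1" aliases t0 to r4 and t1 to r1, so a routine can pick its per-architecture scratch registers once and keep the rest of its body identical.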
+
+%if ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro PUSH 1
+ push %1
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset+gprsize
+ %endif
+%endmacro
+
+%macro POP 1
+ pop %1
+ %ifidn rstk, rsp
+ %assign stack_offset stack_offset-gprsize
+ %endif
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rstk
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assertion ``%1'' failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0
+%endmacro
+
+%define required_stack_alignment ((mmsize + 15) & ~15)
+
+%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+ %ifnum %1
+ %if %1 != 0
+ %assign %%pad 0
+ %assign stack_size %1
+ %if stack_size < 0
+ %assign stack_size -stack_size
+ %endif
+ %if WIN64
+ %assign %%pad %%pad + 32 ; shadow space
+ %if mmsize != 8
+ %assign xmm_regs_used %2
+ %if xmm_regs_used > 8
+ %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
+ %endif
+ %endif
+ %endif
+ %if required_stack_alignment <= STACK_ALIGNMENT
+ ; maintain the current stack alignment
+ %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %else
+ %assign %%reg_num (regs_used - 1)
+ %xdefine rstk r %+ %%reg_num
+ ; align stack, and save original stack location directly above
+ ; it, i.e. in [rsp+stack_size_padded], so we can restore the
+ ; stack in a single instruction (i.e. mov rsp, rstk or mov
+ ; rsp, [rsp+stack_size_padded])
+ %if %1 < 0 ; need to store rsp on stack
+ %xdefine rstkm [rsp + stack_size + %%pad]
+ %assign %%pad %%pad + gprsize
+ %else ; can keep rsp in rstk during whole function
+ %xdefine rstkm rstk
+ %endif
+ %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+ mov rstk, rsp
+ and rsp, ~(required_stack_alignment-1)
+ sub rsp, stack_size_padded
+ movifnidn rstkm, rstk
+ %endif
+ WIN64_PUSH_XMM
+ %endif
+ %endif
+%endmacro
+
+%macro SETUP_STACK_POINTER 1
+ %ifnum %1
+ %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
+ %if %1 > 0
+ %assign regs_used (regs_used + 1)
+ %endif
+ %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
+ ; Ensure that we don't clobber any registers containing arguments
+ %assign regs_used 5 + UNIX64 * 3
+ %endif
+ %endif
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS_INTERNAL 3+
+ %ifnum %2
+ DEFINE_ARGS %3
+ %elif %1 == 4
+ DEFINE_ARGS %2
+ %elif %1 > 4
+ DEFINE_ARGS %2, %3
+ %endif
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4, %3
+ %if mmsize != 8 && stack_size == 0
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%macro WIN64_PUSH_XMM 0
+ ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
+ %if xmm_regs_used > 6
+ movaps [rstk + stack_offset + 8], xmm6
+ %endif
+ %if xmm_regs_used > 7
+ movaps [rstk + stack_offset + 24], xmm7
+ %endif
+ %if xmm_regs_used > 8
+ %assign %%i 8
+ %rep xmm_regs_used-8
+ movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
+ %if xmm_regs_used > 8
+ ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+ %assign %%pad (xmm_regs_used-8)*16 + 32
+ %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
+ SUB rsp, stack_size_padded
+ %endif
+ WIN64_PUSH_XMM
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+ %assign %%pad_size 0
+ %if xmm_regs_used > 8
+ %assign %%i xmm_regs_used
+ %rep xmm_regs_used-8
+ %assign %%i %%i-1
+ movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
+ %endrep
+ %endif
+ %if stack_size_padded > 0
+ %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add %1, stack_size_padded
+ %assign %%pad_size stack_size_padded
+ %endif
+ %endif
+ %if xmm_regs_used > 7
+ movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
+ %endif
+ %if xmm_regs_used > 6
+ movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1
+ WIN64_RESTORE_XMM_INTERNAL %1
+ %assign stack_offset (stack_offset-stack_size_padded)
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
+
+%macro RET 0
+ WIN64_RESTORE_XMM_INTERNAL rsp
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+ %if mmsize == 32
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ ALLOC_STACK %4
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
+
+%macro RET 0
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+ %if mmsize == 32
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [rstk + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ %if num_args > 7
+ %assign num_args 7
+ %endif
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ SETUP_STACK_POINTER %4
+ ASSERT regs_used <= 7
+ PUSH_IF_USED 3, 4, 5, 6
+ ALLOC_STACK %4
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ DEFINE_ARGS_INTERNAL %0, %4, %5
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
+
+%macro RET 0
+ %if stack_size_padded > 0
+ %if required_stack_alignment > STACK_ALIGNMENT
+ mov rsp, rstkm
+ %else
+ add rsp, stack_size_padded
+ %endif
+ %endif
+ POP_IF_USED 6, 5, 4, 3
+ %if mmsize == 32
+ vzeroupper
+ %endif
+ AUTO_REP_RET
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+ %macro WIN64_SPILL_XMM 1
+ %endmacro
+ %macro WIN64_RESTORE_XMM 1
+ %endmacro
+ %macro WIN64_PUSH_XMM 0
+ %endmacro
+%endif
+
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
+%macro REP_RET 0
+ %if has_epilogue
+ RET
+ %else
+ rep ret
+ %endif
+ annotate_function_size
+%endmacro
+
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+ %if notcpuflag(ssse3)
+ times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
+ %endif
+ ret
+ annotate_function_size
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+ %rep %0
+ %macro %1 1-2 %1
+ %2 %1
+ %if notcpuflag(ssse3)
+ %%branch_instr equ $
+ %xdefine last_branch_adr %%branch_instr
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+ %if has_epilogue
+ call %1
+ RET
+ %elif %2
+ jmp %1
+ %endif
+ annotate_function_size
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
+; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
+%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
+ cglobal_internal 1, %1 %+ SUFFIX, %2
+%endmacro
+%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
+ cglobal_internal 0, %1 %+ SUFFIX, %2
+%endmacro
+%macro cglobal_internal 2-3+
+ annotate_function_size
+ %if %1
+ %xdefine %%FUNCTION_PREFIX private_prefix
+ ; libaom explicitly sets visibility in shared object builds. Avoid
+ ; setting visibility to hidden as it may break builds that split
+ ; sources on e.g., directory boundaries.
+ %ifdef CHROMIUM
+ %xdefine %%VISIBILITY hidden
+ %else
+ %xdefine %%VISIBILITY
+ %endif
+ %else
+ %xdefine %%FUNCTION_PREFIX public_prefix
+ %xdefine %%VISIBILITY
+ %endif
+ %ifndef cglobaled_%2
+ %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
+ %xdefine %2.skip_prologue %2 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %2, 1
+ %endif
+ %xdefine current_function %2
+ %xdefine current_function_section __SECT__
+ %if FORMAT_ELF
+ global %2:function %%VISIBILITY
+ %elif FORMAT_MACHO
+ %ifdef __NASM_VER__
+ global %2
+ %else
+ global %2:private_extern
+ %endif
+ %else
+ global %2
+ %endif
+ align function_align
+ %2:
+ RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
+ %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
+ %assign stack_offset 0 ; stack pointer offset relative to the return address
+ %assign stack_size 0 ; amount of stack space that can be freely used inside a function
+ %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
+ %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
+ %ifnidn %3, ""
+ PROLOGUE %3
+ %endif
+%endmacro
+
+%macro cextern 1
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+ %ifdef PREFIX
+ %xdefine %1 mangle(%1)
+ %endif
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 1-2+
+ %xdefine %1 mangle(private_prefix %+ _ %+ %1)
+ %if FORMAT_ELF
+ global %1:data hidden
+ %else
+ global %1
+ %endif
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
+%if FORMAT_ELF
+ [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
+%endif
+
+; Tell debuggers how large the function was.
+; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
+; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
+; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
+; then its size might be unspecified.
+%macro annotate_function_size 0
+ %ifdef __YASM_VER__
+ %ifdef current_function
+ %if FORMAT_ELF
+ current_function_section
+ %%ecf equ $
+ size current_function %%ecf - current_function
+ __SECT__
+ %endif
+ %endif
+ %endif
+%endmacro
+
+; cpuflags
+
+%assign cpuflags_mmx (1<<0)
+%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
+%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
+%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
+%assign cpuflags_sse (1<<4) | cpuflags_mmx2
+%assign cpuflags_sse2 (1<<5) | cpuflags_sse
+%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
+%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
+%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
+%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
+%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
+%assign cpuflags_avx (1<<11)| cpuflags_sse42
+%assign cpuflags_xop (1<<12)| cpuflags_avx
+%assign cpuflags_fma4 (1<<13)| cpuflags_avx
+%assign cpuflags_fma3 (1<<14)| cpuflags_avx
+%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
+
+%assign cpuflags_cache32 (1<<16)
+%assign cpuflags_cache64 (1<<17)
+%assign cpuflags_slowctz (1<<18)
+%assign cpuflags_lzcnt (1<<19)
+%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<21)
+%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
+%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
+
+; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
+%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
+%define notcpuflag(x) (cpuflag(x) ^ 1)
+
+; Takes an arbitrary number of cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-*
+ %xdefine SUFFIX
+ %undef cpuname
+ %assign cpuflags 0
+
+ %if %0 >= 1
+ %rep %0
+ %ifdef cpuname
+ %xdefine cpuname cpuname %+ _%1
+ %else
+ %xdefine cpuname %1
+ %endif
+ %assign cpuflags cpuflags | cpuflags_%1
+ %rotate 1
+ %endrep
+ %xdefine SUFFIX _ %+ cpuname
+
+ %if cpuflag(avx)
+ %assign avx_enabled 1
+ %endif
+ %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elif cpuflag(sse3) && notcpuflag(ssse3)
+ %define movu lddqu
+ %endif
+ %endif
+
+ %if ARCH_X86_64 || cpuflag(sse2)
+ %ifdef __NASM_VER__
+ ALIGNMODE k8
+ %else
+ CPU amdnop
+ %endif
+ %else
+ %ifdef __NASM_VER__
+ ALIGNMODE nop
+ %else
+ CPU basicnop
+ %endif
+ %endif
+%endmacro
+
+; Merge mmx and sse*
+; m# is a simd register of the currently selected size
+; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
+; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
+; (All 3 remain in sync through SWAP.)
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+
+ %assign avx_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define num_mmregs 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ %assign %%i 0
+ %rep 8
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nnmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %rep 8
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nnmm, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+
+ %assign avx_enabled 0
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nnxmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %undef movh
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, ymm %+ %%i
+ CAT_XDEFINE nnymm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
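As an editorial illustration (not part of the patch) of how the INIT_* and cpuflag() machinery is typically used, the invented example below wraps a function body in a macro and instantiates it once per target; it assumes the usual preamble (an %include of x86inc.asm and SECTION .text), a 16-byte-aligned dst, and n a positive multiple of 8:

%macro ABS_I16 0
cglobal abs_i16, 2, 2, 2, dst, n     ; in-place absolute value of n int16s
.loop:
    mova    m0, [dstq]
%if cpuflag(ssse3)
    pabsw   m0, m0
%else                                ; plain SSE2 fallback
    pxor    m1, m1
    psubw   m1, m0
    pmaxsw  m0, m1
%endif
    mova    [dstq], m0
    add     dstq, mmsize
    sub     nd, 8                    ; 8 int16 values per xmm register
    jg      .loop
    RET
%endmacro

INIT_XMM sse2
ABS_I16                              ; emits (mangled) aom_abs_i16_sse2
INIT_XMM ssse3
ABS_I16                              ; emits (mangled) aom_abs_i16_ssse3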
+
+%macro DECLARE_MMCAST 1
+ %define mmmm%1 mm%1
+ %define mmxmm%1 mm%1
+ %define mmymm%1 mm%1
+ %define xmmmm%1 mm%1
+ %define xmmxmm%1 xmm%1
+ %define xmmymm%1 xmm%1
+ %define ymmmm%1 mm%1
+ %define ymmxmm%1 xmm%1
+ %define ymmymm%1 ymm%1
+ %define xm%1 xmm %+ m%1
+ %define ym%1 ymm %+ m%1
+%endmacro
+
+%assign i 0
+%rep 16
+ DECLARE_MMCAST i
+ %assign i i+1
+%endrep
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+ %rep %0/2
+ %xdefine %%tmp%2 m%2
+ %rotate 2
+ %endrep
+ %rep %0/2
+ %xdefine m%1 %%tmp%2
+ CAT_XDEFINE nn, m%1, %1
+ %rotate 2
+ %endrep
+%endmacro
+
+%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
+ %ifnum %1 ; SWAP 0, 1, ...
+ SWAP_INTERNAL_NUM %1, %2
+ %else ; SWAP m0, m1, ...
+ SWAP_INTERNAL_NAME %1, %2
+ %endif
+%endmacro
+
+%macro SWAP_INTERNAL_NUM 2-*
+ %rep %0-1
+ %xdefine %%tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 %%tmp
+ CAT_XDEFINE nn, m%1, %1
+ CAT_XDEFINE nn, m%2, %2
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SWAP_INTERNAL_NAME 2-*
+ %xdefine %%args nn %+ %1
+ %rep %0-1
+ %xdefine %%args %%args, nn %+ %2
+ %rotate 1
+ %endrep
+ SWAP_INTERNAL_NUM %%args
+%endmacro
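An editorial sketch (not part of the patch) of the idea: SWAP renames registers instead of moving data, so a permutation at the end of a macro costs nothing at run time. The macro below is invented for illustration:

%macro INTERLEAVE_WORDS 0 ; in: words in m0/m1; out: low interleave in m0, high in m1
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    SWAP      1, 2        ; the high half physically sits in the old m2, but all
                          ; following code can keep referring to it as m1
%endmacro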
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE %%f, %%i, m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+ %ifdef %1_m0
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE nn, m %+ %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+ call_internal %1 %+ SUFFIX, %1
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %2
+ %ifndef cglobaled_%2
+ %ifdef cglobaled_%1
+ %xdefine %%i %1
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
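An editorial aside (not part of the patch): x86 sign-extends 8-bit immediates, so -128 fits in an imm8 encoding while +128 needs a full imm32; trading "add x, 128" for "sub x, -128" saves 3 bytes per instruction.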
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+ %if i < 8
+ CAT_XDEFINE sizeofmm, i, 8
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+ %assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-avx emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+;%6+: operands
+%macro RUN_AVX_INSTR 6-9+
+ %ifnum sizeof%7
+ %assign __sizeofreg sizeof%7
+ %elifnum sizeof%6
+ %assign __sizeofreg sizeof%6
+ %else
+ %assign __sizeofreg mmsize
+ %endif
+ %assign __emulate_avx 0
+ %if avx_enabled && __sizeofreg >= 16
+ %xdefine __instr v%1
+ %else
+ %xdefine __instr %1
+ %if %0 >= 8+%4
+ %assign __emulate_avx 1
+ %endif
+ %endif
+ %ifnidn %2, fnord
+ %ifdef cpuname
+ %if notcpuflag(%2)
+ %error use of ``%1'' %2 instruction in cpuname function: current_function
+ %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8
+ %error use of ``%1'' sse2 instruction in cpuname function: current_function
+ %endif
+ %endif
+ %endif
+
+ %if __emulate_avx
+ %xdefine __src1 %7
+ %xdefine __src2 %8
+ %ifnidn %6, %7
+ %if %0 >= 9
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8
+ %endif
+ %if %5 && %4 == 0
+ %ifnid %8
+ ; 3-operand AVX instructions with a memory arg can only have it in src2,
+ ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+ ; So, if the instruction is commutative with a memory arg, swap them.
+ %xdefine __src1 %8
+ %xdefine __src2 %7
+ %endif
+ %endif
+ %if __sizeofreg == 8
+ MOVQ %6, __src1
+ %elif %3
+ MOVAPS %6, __src1
+ %else
+ MOVDQA %6, __src1
+ %endif
+ %endif
+ %if %0 >= 9
+ %1 %6, __src2, %9
+ %else
+ %1 %6, __src2
+ %endif
+ %elif %0 >= 9
+ __instr %6, %7, %8, %9
+ %elif %0 == 8
+ __instr %6, %7, %8
+ %elif %0 == 7
+ __instr %6, %7
+ %else
+ __instr %6
+ %endif
+%endmacro
+
+;%1 == instruction
+;%2 == minimal instruction set
+;%3 == 1 if float, 0 if int
+;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-5 fnord, 0, 1, 0
+ %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5
+ %ifidn %2, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1
+ %elifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
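An editorial note (not part of the patch): with the wrappers defined above, code can be written in three-operand form everywhere, e.g. "paddd m0, m1, m2"; under INIT_XMM sse2 this is emulated as "movdqa m0, m1" followed by "paddd m0, m2", while under INIT_YMM avx2 it expands directly to "vpaddd m0, m1, m2".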
+
+; Instructions with both VEX and non-VEX encodings
+; Non-destructive instructions are written without parameters
+AVX_INSTR addpd, sse2, 1, 0, 1
+AVX_INSTR addps, sse, 1, 0, 1
+AVX_INSTR addsd, sse2, 1, 0, 1
+AVX_INSTR addss, sse, 1, 0, 1
+AVX_INSTR addsubpd, sse3, 1, 0, 0
+AVX_INSTR addsubps, sse3, 1, 0, 0
+AVX_INSTR aesdec, fnord, 0, 0, 0
+AVX_INSTR aesdeclast, fnord, 0, 0, 0
+AVX_INSTR aesenc, fnord, 0, 0, 0
+AVX_INSTR aesenclast, fnord, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
+AVX_INSTR andnpd, sse2, 1, 0, 0
+AVX_INSTR andnps, sse, 1, 0, 0
+AVX_INSTR andpd, sse2, 1, 0, 1
+AVX_INSTR andps, sse, 1, 0, 1
+AVX_INSTR blendpd, sse4, 1, 0, 0
+AVX_INSTR blendps, sse4, 1, 0, 0
+AVX_INSTR blendvpd, sse4, 1, 0, 0
+AVX_INSTR blendvps, sse4, 1, 0, 0
+AVX_INSTR cmppd, sse2, 1, 1, 0
+AVX_INSTR cmpps, sse, 1, 1, 0
+AVX_INSTR cmpsd, sse2, 1, 1, 0
+AVX_INSTR cmpss, sse, 1, 1, 0
+AVX_INSTR comisd, sse2
+AVX_INSTR comiss, sse
+AVX_INSTR cvtdq2pd, sse2
+AVX_INSTR cvtdq2ps, sse2
+AVX_INSTR cvtpd2dq, sse2
+AVX_INSTR cvtpd2ps, sse2
+AVX_INSTR cvtps2dq, sse2
+AVX_INSTR cvtps2pd, sse2
+AVX_INSTR cvtsd2si, sse2
+AVX_INSTR cvtsd2ss, sse2
+AVX_INSTR cvtsi2sd, sse2
+AVX_INSTR cvtsi2ss, sse
+AVX_INSTR cvtss2sd, sse2
+AVX_INSTR cvtss2si, sse
+AVX_INSTR cvttpd2dq, sse2
+AVX_INSTR cvttps2dq, sse2
+AVX_INSTR cvttsd2si, sse2
+AVX_INSTR cvttss2si, sse
+AVX_INSTR divpd, sse2, 1, 0, 0
+AVX_INSTR divps, sse, 1, 0, 0
+AVX_INSTR divsd, sse2, 1, 0, 0
+AVX_INSTR divss, sse, 1, 0, 0
+AVX_INSTR dppd, sse4, 1, 1, 0
+AVX_INSTR dpps, sse4, 1, 1, 0
+AVX_INSTR extractps, sse4
+AVX_INSTR haddpd, sse3, 1, 0, 0
+AVX_INSTR haddps, sse3, 1, 0, 0
+AVX_INSTR hsubpd, sse3, 1, 0, 0
+AVX_INSTR hsubps, sse3, 1, 0, 0
+AVX_INSTR insertps, sse4, 1, 1, 0
+AVX_INSTR lddqu, sse3
+AVX_INSTR ldmxcsr, sse
+AVX_INSTR maskmovdqu, sse2
+AVX_INSTR maxpd, sse2, 1, 0, 1
+AVX_INSTR maxps, sse, 1, 0, 1
+AVX_INSTR maxsd, sse2, 1, 0, 1
+AVX_INSTR maxss, sse, 1, 0, 1
+AVX_INSTR minpd, sse2, 1, 0, 1
+AVX_INSTR minps, sse, 1, 0, 1
+AVX_INSTR minsd, sse2, 1, 0, 1
+AVX_INSTR minss, sse, 1, 0, 1
+AVX_INSTR movapd, sse2
+AVX_INSTR movaps, sse
+AVX_INSTR movd, mmx
+AVX_INSTR movddup, sse3
+AVX_INSTR movdqa, sse2
+AVX_INSTR movdqu, sse2
+AVX_INSTR movhlps, sse, 1, 0, 0
+AVX_INSTR movhpd, sse2, 1, 0, 0
+AVX_INSTR movhps, sse, 1, 0, 0
+AVX_INSTR movlhps, sse, 1, 0, 0
+AVX_INSTR movlpd, sse2, 1, 0, 0
+AVX_INSTR movlps, sse, 1, 0, 0
+AVX_INSTR movmskpd, sse2
+AVX_INSTR movmskps, sse
+AVX_INSTR movntdq, sse2
+AVX_INSTR movntdqa, sse4
+AVX_INSTR movntpd, sse2
+AVX_INSTR movntps, sse
+AVX_INSTR movq, mmx
+AVX_INSTR movsd, sse2, 1, 0, 0
+AVX_INSTR movshdup, sse3
+AVX_INSTR movsldup, sse3
+AVX_INSTR movss, sse, 1, 0, 0
+AVX_INSTR movupd, sse2
+AVX_INSTR movups, sse
+AVX_INSTR mpsadbw, sse4
+AVX_INSTR mulpd, sse2, 1, 0, 1
+AVX_INSTR mulps, sse, 1, 0, 1
+AVX_INSTR mulsd, sse2, 1, 0, 1
+AVX_INSTR mulss, sse, 1, 0, 1
+AVX_INSTR orpd, sse2, 1, 0, 1
+AVX_INSTR orps, sse, 1, 0, 1
+AVX_INSTR pabsb, ssse3
+AVX_INSTR pabsd, ssse3
+AVX_INSTR pabsw, ssse3
+AVX_INSTR packsswb, mmx, 0, 0, 0
+AVX_INSTR packssdw, mmx, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR paddb, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
+AVX_INSTR paddd, mmx, 0, 0, 1
+AVX_INSTR paddq, sse2, 0, 0, 1
+AVX_INSTR paddsb, mmx, 0, 0, 1
+AVX_INSTR paddsw, mmx, 0, 0, 1
+AVX_INSTR paddusb, mmx, 0, 0, 1
+AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR palignr, ssse3
+AVX_INSTR pand, mmx, 0, 0, 1
+AVX_INSTR pandn, mmx, 0, 0, 0
+AVX_INSTR pavgb, mmx2, 0, 0, 1
+AVX_INSTR pavgw, mmx2, 0, 0, 1
+AVX_INSTR pblendvb, sse4, 0, 0, 0
+AVX_INSTR pblendw, sse4
+AVX_INSTR pclmulqdq
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pcmpeqb, mmx, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpeqd, mmx, 0, 0, 1
+AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpgtb, mmx, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpgtd, mmx, 0, 0, 0
+AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pextrb, sse4
+AVX_INSTR pextrd, sse4
+AVX_INSTR pextrq, sse4
+AVX_INSTR pextrw, mmx2
+AVX_INSTR phaddw, ssse3, 0, 0, 0
+AVX_INSTR phaddd, ssse3, 0, 0, 0
+AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phminposuw, sse4
+AVX_INSTR phsubw, ssse3, 0, 0, 0
+AVX_INSTR phsubd, ssse3, 0, 0, 0
+AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR pinsrb, sse4
+AVX_INSTR pinsrd, sse4
+AVX_INSTR pinsrq, sse4
+AVX_INSTR pinsrw, mmx2
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
+AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaxsb, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
+AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxub, mmx2, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
+AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pminsb, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
+AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminub, mmx2, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
+AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pmovmskb, mmx2
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxbd, sse4
+AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxwd, sse4
+AVX_INSTR pmovsxwq, sse4
+AVX_INSTR pmovsxdq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxbd, sse4
+AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxwd, sse4
+AVX_INSTR pmovzxwq, sse4
+AVX_INSTR pmovzxdq, sse4
+AVX_INSTR pmuldq, sse4, 0, 0, 1
+AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
+AVX_INSTR pmulhuw, mmx2, 0, 0, 1
+AVX_INSTR pmulhw, mmx, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
+AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmuludq, sse2, 0, 0, 1
+AVX_INSTR por, mmx, 0, 0, 1
+AVX_INSTR psadbw, mmx2, 0, 0, 1
+AVX_INSTR pshufb, ssse3, 0, 0, 0
+AVX_INSTR pshufd, sse2
+AVX_INSTR pshufhw, sse2
+AVX_INSTR pshuflw, sse2
+AVX_INSTR psignb, ssse3, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
+AVX_INSTR psignd, ssse3, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR pslld, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR pslldq, sse2, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psrad, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psrld, mmx, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psubb, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
+AVX_INSTR psubd, mmx, 0, 0, 0
+AVX_INSTR psubq, sse2, 0, 0, 0
+AVX_INSTR psubsb, mmx, 0, 0, 0
+AVX_INSTR psubsw, mmx, 0, 0, 0
+AVX_INSTR psubusb, mmx, 0, 0, 0
+AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR ptest, sse4
+AVX_INSTR punpckhbw, mmx, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
+AVX_INSTR punpckhdq, mmx, 0, 0, 0
+AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklbw, mmx, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
+AVX_INSTR punpckldq, mmx, 0, 0, 0
+AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR pxor, mmx, 0, 0, 1
+AVX_INSTR rcpps, sse, 1, 0, 0
+AVX_INSTR rcpss, sse, 1, 0, 0
+AVX_INSTR roundpd, sse4
+AVX_INSTR roundps, sse4
+AVX_INSTR roundsd, sse4
+AVX_INSTR roundss, sse4
+AVX_INSTR rsqrtps, sse, 1, 0, 0
+AVX_INSTR rsqrtss, sse, 1, 0, 0
+AVX_INSTR shufpd, sse2, 1, 1, 0
+AVX_INSTR shufps, sse, 1, 1, 0
+AVX_INSTR sqrtpd, sse2, 1, 0, 0
+AVX_INSTR sqrtps, sse, 1, 0, 0
+AVX_INSTR sqrtsd, sse2, 1, 0, 0
+AVX_INSTR sqrtss, sse, 1, 0, 0
+AVX_INSTR stmxcsr, sse
+AVX_INSTR subpd, sse2, 1, 0, 0
+AVX_INSTR subps, sse, 1, 0, 0
+AVX_INSTR subsd, sse2, 1, 0, 0
+AVX_INSTR subss, sse, 1, 0, 0
+AVX_INSTR ucomisd, sse2
+AVX_INSTR ucomiss, sse
+AVX_INSTR unpckhpd, sse2, 1, 0, 0
+AVX_INSTR unpckhps, sse, 1, 0, 0
+AVX_INSTR unpcklpd, sse2, 1, 0, 0
+AVX_INSTR unpcklps, sse, 1, 0, 0
+AVX_INSTR xorpd, sse2, 1, 0, 1
+AVX_INSTR xorps, sse, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
+AVX_INSTR pfmul, 3dnow, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+ %assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %elifnidn %1, %4
+ %6 %1, %2, %3
+ %7 %1, %4
+ %else
+ %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
+; FMA3 is only possible if dst is the same as one of the src registers.
+; Either src2 or src3 can be a memory operand.
+%macro FMA4_INSTR 2-*
+ %push fma4_instr
+ %xdefine %$prefix %1
+ %rep %0 - 1
+ %macro %$prefix%2 4-6 %$prefix, %2
+ %if notcpuflag(fma3) && notcpuflag(fma4)
+ %error use of ``%5%6'' fma instruction in cpuname function: current_function
+ %elif cpuflag(fma4)
+ v%5%6 %1, %2, %3, %4
+ %elifidn %1, %2
+ ; If %3 or %4 is a memory operand it needs to be encoded as the last operand.
+ %ifid %3
+ v%{5}213%6 %2, %3, %4
+ %else
+ v%{5}132%6 %2, %4, %3
+ %endif
+ %elifidn %1, %3
+ v%{5}213%6 %3, %2, %4
+ %elifidn %1, %4
+ v%{5}231%6 %4, %2, %3
+ %else
+ %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+ %rotate 1
+ %endrep
+ %pop
+%endmacro
+
+FMA4_INSTR fmadd, pd, ps, sd, ss
+FMA4_INSTR fmaddsub, pd, ps
+FMA4_INSTR fmsub, pd, ps, sd, ss
+FMA4_INSTR fmsubadd, pd, ps
+FMA4_INSTR fnmadd, pd, ps, sd, ss
+FMA4_INSTR fnmsub, pd, ps, sd, ss
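An editorial note (not part of the patch): with the definitions above, "fmaddps m0, m0, m1, m2" assembles to vfmaddps on FMA4 targets and to "vfmadd213ps m0, m1, m2" (computing m0 = m0*m1 + m2) on FMA3 targets.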
+
+; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
+%ifdef __YASM_VER__
+ %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
+ %macro vpbroadcastq 2
+ %if sizeof%1 == 16
+ movddup %1, %2
+ %else
+ vbroadcastsd %1, %2
+ %endif
+ %endmacro
+ %endif
+%endif
diff --git a/third_party/aom/tools/aggregate_entropy_stats.py b/third_party/aom/tools/aggregate_entropy_stats.py
new file mode 100644
index 000000000..7cb4d18e1
--- /dev/null
+++ b/third_party/aom/tools/aggregate_entropy_stats.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Aggregate multiple entropy stats output which is written in 32-bit int.
+
+python ./aggregate_entropy_stats.py [dir of stats files] [keyword of filenames]
+ [filename of final stats]
+"""
+
+__author__ = "yuec@google.com"
+
+import os
+import sys
+import numpy as np
+
+def main():
+ dir = sys.argv[1]
+ sum = []
+ for fn in os.listdir(dir):
+ if sys.argv[2] in fn:
+ stats = np.fromfile(dir + fn, dtype=np.int32)
+ if len(sum) == 0:
+ sum = stats
+ else:
+ sum = np.add(sum, stats)
+ if len(sum) == 0:
+ print("No stats file is found. Double-check directory and keyword?")
+ else:
+ sum.tofile(dir+sys.argv[3])
+
+if __name__ == '__main__':
+ main()
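Editorial usage note (not part of the patch; paths are hypothetical): running "python ./aggregate_entropy_stats.py ./stats/ entropy_stats aggregate.bin" element-wise sums the 32-bit counts from every file under ./stats/ whose name contains "entropy_stats" and writes the aggregate to ./stats/aggregate.bin. Note that the directory argument must end with a path separator, since the script concatenates it directly with the file names.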
diff --git a/third_party/aom/tools/aom_entropy_optimizer.c b/third_party/aom/tools/aom_entropy_optimizer.c
new file mode 100644
index 000000000..551adf4f2
--- /dev/null
+++ b/third_party/aom/tools/aom_entropy_optimizer.c
@@ -0,0 +1,758 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// This tool is a gadget for offline probability training.
+// A binary executable aom_entropy_optimizer will be generated in tools/. It
+// parses a binary file consisting of counts written in the format of
+// FRAME_COUNTS in entropymode.h, and computes optimized probability tables
+// and CDF tables, which will be written to a new C file, optimized_probs.c,
+// following the format used in the codebase.
+//
+// Command line: ./aom_entropy_optimizer [path of the counts file]
+//
+// The input file can either be generated by encoding a single clip with the
+// entropy_stats experiment turned on, or be collected at a larger scale, in
+// which case the Python script tools/aggregate_entropy_stats.py can be used
+// to aggregate multiple stats outputs.
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "av1/encoder/encoder.h"
+
+#define SPACES_PER_TAB 2
+#define CDF_MAX_SIZE 16
+
+typedef unsigned int aom_count_type;
+// A log file recording parsed counts
+static FILE *logfile; // TODO(yuec): make it a command line option
+
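Editorial note (not part of the patch): counts_to_cdf below adds 1 to every count (simple additive smoothing) so that no symbol ends up with a zero-width interval, scales the cumulative sums to CDF_PROB_TOP with rounding, and then clamps the result so that successive CDF entries stay at least 4 apart while leaving headroom below CDF_PROB_TOP.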
+static void counts_to_cdf(const aom_count_type *counts, aom_cdf_prob *cdf,
+ int modes) {
+ int64_t csum[CDF_MAX_SIZE];
+ assert(modes <= CDF_MAX_SIZE);
+
+ csum[0] = counts[0] + 1;
+ for (int i = 1; i < modes; ++i) csum[i] = counts[i] + 1 + csum[i - 1];
+
+ for (int i = 0; i < modes; ++i) fprintf(logfile, "%d ", counts[i]);
+ fprintf(logfile, "\n");
+
+ int64_t sum = csum[modes - 1];
+ const int64_t round_shift = sum >> 1;
+ for (int i = 0; i < modes; ++i) {
+ cdf[i] = (csum[i] * CDF_PROB_TOP + round_shift) / sum;
+ cdf[i] = AOMMIN(cdf[i], CDF_PROB_TOP - (modes - 1 + i) * 4);
+ cdf[i] = (i == 0) ? AOMMAX(cdf[i], 4) : AOMMAX(cdf[i], cdf[i - 1] + 4);
+ }
+}
+
+static int parse_counts_for_cdf_opt(aom_count_type **ct_ptr,
+ FILE *const probsfile, int tabs,
+ int dim_of_cts, int *cts_each_dim) {
+ if (dim_of_cts < 1) {
+ fprintf(stderr, "The dimension of a counts vector should be at least 1!\n");
+ return 1;
+ }
+ const int total_modes = cts_each_dim[0];
+ if (dim_of_cts == 1) {
+ assert(total_modes <= CDF_MAX_SIZE);
+ aom_cdf_prob cdfs[CDF_MAX_SIZE];
+ aom_count_type *counts1d = *ct_ptr;
+
+ counts_to_cdf(counts1d, cdfs, total_modes);
+ (*ct_ptr) += total_modes;
+
+ if (tabs > 0) fprintf(probsfile, "%*c", tabs * SPACES_PER_TAB, ' ');
+ fprintf(probsfile, "AOM_CDF%d(", total_modes);
+ for (int k = 0; k < total_modes - 1; ++k) {
+ fprintf(probsfile, "%d", cdfs[k]);
+ if (k < total_modes - 2) fprintf(probsfile, ", ");
+ }
+ fprintf(probsfile, ")");
+ } else {
+ for (int k = 0; k < total_modes; ++k) {
+ int tabs_next_level;
+
+ if (dim_of_cts == 2)
+ fprintf(probsfile, "%*c{ ", tabs * SPACES_PER_TAB, ' ');
+ else
+ fprintf(probsfile, "%*c{\n", tabs * SPACES_PER_TAB, ' ');
+ tabs_next_level = dim_of_cts == 2 ? 0 : tabs + 1;
+
+ if (parse_counts_for_cdf_opt(ct_ptr, probsfile, tabs_next_level,
+ dim_of_cts - 1, cts_each_dim + 1)) {
+ return 1;
+ }
+
+ if (dim_of_cts == 2) {
+ if (k == total_modes - 1)
+ fprintf(probsfile, " }\n");
+ else
+ fprintf(probsfile, " },\n");
+ } else {
+ if (k == total_modes - 1)
+ fprintf(probsfile, "%*c}\n", tabs * SPACES_PER_TAB, ' ');
+ else
+ fprintf(probsfile, "%*c},\n", tabs * SPACES_PER_TAB, ' ');
+ }
+ }
+ }
+ return 0;
+}
+
+static void optimize_cdf_table(aom_count_type *counts, FILE *const probsfile,
+ int dim_of_cts, int *cts_each_dim,
+ char *prefix) {
+ aom_count_type *ct_ptr = counts;
+
+ fprintf(probsfile, "%s = {\n", prefix);
+ fprintf(logfile, "%s\n", prefix);
+ if (parse_counts_for_cdf_opt(&ct_ptr, probsfile, 1, dim_of_cts,
+ cts_each_dim)) {
+ fprintf(probsfile, "Optimizer failed!\n");
+ }
+ fprintf(probsfile, "};\n\n");
+ fprintf(logfile, "============================\n");
+}
+
+static void optimize_uv_mode(aom_count_type *counts, FILE *const probsfile,
+ int dim_of_cts, int *cts_each_dim, char *prefix) {
+ aom_count_type *ct_ptr = counts;
+
+ fprintf(probsfile, "%s = {\n", prefix);
+ fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+ fprintf(logfile, "%s\n", prefix);
+ cts_each_dim[2] = UV_INTRA_MODES - 1;
+ for (int k = 0; k < cts_each_dim[1]; ++k) {
+ fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' ');
+ parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, dim_of_cts - 2,
+ cts_each_dim + 2);
+ if (k + 1 == cts_each_dim[1]) {
+ fprintf(probsfile, " }\n");
+ } else {
+ fprintf(probsfile, " },\n");
+ }
+ ++ct_ptr;
+ }
+ fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+ fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+ cts_each_dim[2] = UV_INTRA_MODES;
+ parse_counts_for_cdf_opt(&ct_ptr, probsfile, 2, dim_of_cts - 1,
+ cts_each_dim + 1);
+ fprintf(probsfile, "%*c}\n", SPACES_PER_TAB, ' ');
+ fprintf(probsfile, "};\n\n");
+ fprintf(logfile, "============================\n");
+}
+
+static void optimize_cdf_table_var_modes_2d(aom_count_type *counts,
+ FILE *const probsfile,
+ int dim_of_cts, int *cts_each_dim,
+ int *modes_each_ctx, char *prefix) {
+ aom_count_type *ct_ptr = counts;
+
+ assert(dim_of_cts == 2);
+ (void)dim_of_cts;
+
+ fprintf(probsfile, "%s = {\n", prefix);
+ fprintf(logfile, "%s\n", prefix);
+
+ for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) {
+ int num_of_modes = modes_each_ctx[d0_idx];
+
+ if (num_of_modes > 0) {
+ fprintf(probsfile, "%*c{ ", SPACES_PER_TAB, ' ');
+ parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes);
+ ct_ptr += cts_each_dim[1] - num_of_modes;
+ fprintf(probsfile, " },\n");
+ } else {
+ fprintf(probsfile, "%*c{ 0 },\n", SPACES_PER_TAB, ' ');
+ fprintf(logfile, "dummy cdf, no need to optimize\n");
+ ct_ptr += cts_each_dim[1];
+ }
+ }
+ fprintf(probsfile, "};\n\n");
+ fprintf(logfile, "============================\n");
+}
+
+static void optimize_cdf_table_var_modes_3d(aom_count_type *counts,
+ FILE *const probsfile,
+ int dim_of_cts, int *cts_each_dim,
+ int *modes_each_ctx, char *prefix) {
+ aom_count_type *ct_ptr = counts;
+
+ assert(dim_of_cts == 3);
+ (void)dim_of_cts;
+
+ fprintf(probsfile, "%s = {\n", prefix);
+ fprintf(logfile, "%s\n", prefix);
+
+ for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) {
+ fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+ for (int d1_idx = 0; d1_idx < cts_each_dim[1]; ++d1_idx) {
+ int num_of_modes = modes_each_ctx[d0_idx];
+
+ if (num_of_modes > 0) {
+ fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' ');
+ parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes);
+ ct_ptr += cts_each_dim[2] - num_of_modes;
+ fprintf(probsfile, " },\n");
+ } else {
+ fprintf(probsfile, "%*c{ 0 },\n", 2 * SPACES_PER_TAB, ' ');
+ fprintf(logfile, "dummy cdf, no need to optimize\n");
+ ct_ptr += cts_each_dim[2];
+ }
+ }
+ fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+ }
+ fprintf(probsfile, "};\n\n");
+ fprintf(logfile, "============================\n");
+}
+
+static void optimize_cdf_table_var_modes_4d(aom_count_type *counts,
+ FILE *const probsfile,
+ int dim_of_cts, int *cts_each_dim,
+ int *modes_each_ctx, char *prefix) {
+ aom_count_type *ct_ptr = counts;
+
+ assert(dim_of_cts == 4);
+ (void)dim_of_cts;
+
+ fprintf(probsfile, "%s = {\n", prefix);
+ fprintf(logfile, "%s\n", prefix);
+
+ for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) {
+ fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' ');
+ for (int d1_idx = 0; d1_idx < cts_each_dim[1]; ++d1_idx) {
+ fprintf(probsfile, "%*c{\n", 2 * SPACES_PER_TAB, ' ');
+ for (int d2_idx = 0; d2_idx < cts_each_dim[2]; ++d2_idx) {
+ int num_of_modes = modes_each_ctx[d0_idx];
+
+ if (num_of_modes > 0) {
+ fprintf(probsfile, "%*c{ ", 3 * SPACES_PER_TAB, ' ');
+ parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes);
+ ct_ptr += cts_each_dim[3] - num_of_modes;
+ fprintf(probsfile, " },\n");
+ } else {
+ fprintf(probsfile, "%*c{ 0 },\n", 3 * SPACES_PER_TAB, ' ');
+ fprintf(logfile, "dummy cdf, no need to optimize\n");
+ ct_ptr += cts_each_dim[3];
+ }
+ }
+ fprintf(probsfile, "%*c},\n", 2 * SPACES_PER_TAB, ' ');
+ }
+ fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' ');
+ }
+ fprintf(probsfile, "};\n\n");
+ fprintf(logfile, "============================\n");
+}
+
+int main(int argc, const char **argv) {
+ if (argc < 2) {
+ fprintf(stderr, "Please specify the input stats file!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ FILE *const statsfile = fopen(argv[1], "rb");
+ if (statsfile == NULL) {
+ fprintf(stderr, "Failed to open input file!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ FRAME_COUNTS fc;
+ const size_t bytes = fread(&fc, sizeof(FRAME_COUNTS), 1, statsfile);
+ if (!bytes) return 1;
+
+ FILE *const probsfile = fopen("optimized_probs.c", "w");
+ if (probsfile == NULL) {
+ fprintf(stderr,
+ "Failed to create output file for optimized entropy tables!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ logfile = fopen("aom_entropy_optimizer_parsed_counts.log", "w");
+ if (logfile == NULL) {
+ fprintf(stderr, "Failed to create log file for parsed counts!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ int cts_each_dim[10];
+
+ /* Intra mode (keyframe luma) */
+ cts_each_dim[0] = KF_MODE_CONTEXTS;
+ cts_each_dim[1] = KF_MODE_CONTEXTS;
+ cts_each_dim[2] = INTRA_MODES;
+ optimize_cdf_table(&fc.kf_y_mode[0][0][0], probsfile, 3, cts_each_dim,
+ "const aom_cdf_prob\n"
+ "default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]"
+ "[CDF_SIZE(INTRA_MODES)]");
+
+ cts_each_dim[0] = DIRECTIONAL_MODES;
+ cts_each_dim[1] = 2 * MAX_ANGLE_DELTA + 1;
+ optimize_cdf_table(&fc.angle_delta[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob default_angle_delta_cdf"
+ "[DIRECTIONAL_MODES][CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]");
+
+ /* Intra mode (non-keyframe luma) */
+ cts_each_dim[0] = BLOCK_SIZE_GROUPS;
+ cts_each_dim[1] = INTRA_MODES;
+ optimize_cdf_table(
+ &fc.y_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]");
+
+ /* Intra mode (chroma) */
+ cts_each_dim[0] = CFL_ALLOWED_TYPES;
+ cts_each_dim[1] = INTRA_MODES;
+ cts_each_dim[2] = UV_INTRA_MODES;
+ optimize_uv_mode(&fc.uv_mode[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]"
+ "[CDF_SIZE(UV_INTRA_MODES)]");
+
+ /* block partition */
+ cts_each_dim[0] = PARTITION_CONTEXTS;
+ cts_each_dim[1] = EXT_PARTITION_TYPES;
+ int part_types_each_ctx[PARTITION_CONTEXTS] = {
+ 4, 4, 4, 4, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8
+ };
+ optimize_cdf_table_var_modes_2d(
+ &fc.partition[0][0], probsfile, 2, cts_each_dim, part_types_each_ctx,
+ "static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS]"
+ "[CDF_SIZE(EXT_PARTITION_TYPES)]");
+
+ /* tx type */
+ cts_each_dim[0] = EXT_TX_SETS_INTRA;
+ cts_each_dim[1] = EXT_TX_SIZES;
+ cts_each_dim[2] = INTRA_MODES;
+ cts_each_dim[3] = TX_TYPES;
+ int intra_ext_tx_types_each_ctx[EXT_TX_SETS_INTRA] = { 0, 7, 5 };
+ optimize_cdf_table_var_modes_4d(
+ &fc.intra_ext_tx[0][0][0][0], probsfile, 4, cts_each_dim,
+ intra_ext_tx_types_each_ctx,
+ "static const aom_cdf_prob default_intra_ext_tx_cdf[EXT_TX_SETS_INTRA]"
+ "[EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)]");
+
+ cts_each_dim[0] = EXT_TX_SETS_INTER;
+ cts_each_dim[1] = EXT_TX_SIZES;
+ cts_each_dim[2] = TX_TYPES;
+ int inter_ext_tx_types_each_ctx[EXT_TX_SETS_INTER] = { 0, 16, 12, 2 };
+ optimize_cdf_table_var_modes_3d(
+ &fc.inter_ext_tx[0][0][0], probsfile, 3, cts_each_dim,
+ inter_ext_tx_types_each_ctx,
+ "static const aom_cdf_prob default_inter_ext_tx_cdf[EXT_TX_SETS_INTER]"
+ "[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)]");
+
+ /* Chroma from Luma */
+ cts_each_dim[0] = CFL_JOINT_SIGNS;
+ optimize_cdf_table(&fc.cfl_sign[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]");
+ cts_each_dim[0] = CFL_ALPHA_CONTEXTS;
+ cts_each_dim[1] = CFL_ALPHABET_SIZE;
+ optimize_cdf_table(&fc.cfl_alpha[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS]"
+ "[CDF_SIZE(CFL_ALPHABET_SIZE)]");
+
+ /* Interpolation filter */
+ cts_each_dim[0] = SWITCHABLE_FILTER_CONTEXTS;
+ cts_each_dim[1] = SWITCHABLE_FILTERS;
+ optimize_cdf_table(&fc.switchable_interp[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]"
+ "[CDF_SIZE(SWITCHABLE_FILTERS)]");
+
+ /* Motion vector referencing */
+ cts_each_dim[0] = NEWMV_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.newmv_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = GLOBALMV_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.zeromv_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = REFMV_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.refmv_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = DRL_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.drl_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ /* ext_inter experiment */
+ /* New compound mode */
+ cts_each_dim[0] = INTER_MODE_CONTEXTS;
+ cts_each_dim[1] = INTER_COMPOUND_MODES;
+ optimize_cdf_table(&fc.inter_compound_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_"
+ "SIZE(INTER_COMPOUND_MODES)]");
+
+ /* Interintra */
+ cts_each_dim[0] = BLOCK_SIZE_GROUPS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.interintra[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = BLOCK_SIZE_GROUPS;
+ cts_each_dim[1] = INTERINTRA_MODES;
+ optimize_cdf_table(&fc.interintra_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE("
+ "INTERINTRA_MODES)]");
+
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.wedge_interintra[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
+
+ /* Compound type */
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = COMPOUND_TYPES - 1;
+ optimize_cdf_table(&fc.compound_type[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob default_compound_type_cdf"
+ "[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]");
+
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = 16;
+ optimize_cdf_table(&fc.wedge_idx[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]");
+
+ /* motion_var and warped_motion experiments */
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = MOTION_MODES;
+ optimize_cdf_table(
+ &fc.motion_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]");
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.obmc[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
+
+ /* Intra/inter flag */
+ cts_each_dim[0] = INTRA_INTER_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.intra_inter[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]");
+
+ /* Single/comp ref flag */
+ cts_each_dim[0] = COMP_INTER_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.comp_inter[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]");
+
+ /* ext_comp_refs experiment */
+ cts_each_dim[0] = COMP_REF_TYPE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.comp_ref_type[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = UNI_COMP_REF_CONTEXTS;
+ cts_each_dim[1] = UNIDIR_COMP_REFS - 1;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(&fc.uni_comp_ref[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_"
+ "COMP_REFS - 1][CDF_SIZE(2)]");
+
+ /* Reference frame (single ref) */
+ cts_each_dim[0] = REF_CONTEXTS;
+ cts_each_dim[1] = SINGLE_REFS - 1;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(
+ &fc.single_ref[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]");
+
+ /* ext_refs experiment */
+ cts_each_dim[0] = REF_CONTEXTS;
+ cts_each_dim[1] = FWD_REFS - 1;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(
+ &fc.comp_ref[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = REF_CONTEXTS;
+ cts_each_dim[1] = BWD_REFS - 1;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(
+ &fc.comp_bwdref[0][0][0], probsfile, 3, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]");
+
+ /* palette */
+ cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+ cts_each_dim[1] = PALETTE_SIZES;
+ optimize_cdf_table(&fc.palette_y_size[0][0], probsfile, 2, cts_each_dim,
+ "const aom_cdf_prob default_palette_y_size_cdf"
+ "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]");
+
+ cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+ cts_each_dim[1] = PALETTE_SIZES;
+ optimize_cdf_table(&fc.palette_uv_size[0][0], probsfile, 2, cts_each_dim,
+ "const aom_cdf_prob default_palette_uv_size_cdf"
+ "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]");
+
+ cts_each_dim[0] = PALATTE_BSIZE_CTXS;
+ cts_each_dim[1] = PALETTE_Y_MODE_CONTEXTS;
+ cts_each_dim[2] = 2;
+ optimize_cdf_table(&fc.palette_y_mode[0][0][0], probsfile, 3, cts_each_dim,
+ "const aom_cdf_prob default_palette_y_mode_cdf"
+ "[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]"
+ "[CDF_SIZE(2)]");
+
+ cts_each_dim[0] = PALETTE_UV_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.palette_uv_mode[0][0], probsfile, 2, cts_each_dim,
+ "const aom_cdf_prob default_palette_uv_mode_cdf"
+ "[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = PALETTE_SIZES;
+ cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS;
+ cts_each_dim[2] = PALETTE_COLORS;
+ int palette_color_indexes_each_ctx[PALETTE_SIZES] = { 2, 3, 4, 5, 6, 7, 8 };
+ optimize_cdf_table_var_modes_3d(
+ &fc.palette_y_color_index[0][0][0], probsfile, 3, cts_each_dim,
+ palette_color_indexes_each_ctx,
+ "const aom_cdf_prob default_palette_y_color_index_cdf[PALETTE_SIZES]"
+ "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]");
+
+ cts_each_dim[0] = PALETTE_SIZES;
+ cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS;
+ cts_each_dim[2] = PALETTE_COLORS;
+ optimize_cdf_table_var_modes_3d(
+ &fc.palette_uv_color_index[0][0][0], probsfile, 3, cts_each_dim,
+ palette_color_indexes_each_ctx,
+ "const aom_cdf_prob default_palette_uv_color_index_cdf[PALETTE_SIZES]"
+ "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]");
+
+ /* Transform size */
+ cts_each_dim[0] = TXFM_PARTITION_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(
+ &fc.txfm_partition[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob\n"
+ "default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]");
+
+ /* Skip flag */
+ cts_each_dim[0] = SKIP_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.skip[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]");
+
+ /* Skip mode flag */
+ cts_each_dim[0] = SKIP_MODE_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.skip_mode[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)]");
+
+ /* joint compound flag */
+ cts_each_dim[0] = COMP_INDEX_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.compound_index[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob default_compound_idx_cdfs"
+ "[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = COMP_GROUP_IDX_CONTEXTS;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.comp_group_idx[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob default_comp_group_idx_cdfs"
+ "[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]");
+
+ /* intrabc */
+ cts_each_dim[0] = 2;
+ optimize_cdf_table(
+ &fc.intrabc[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)]");
+
+ /* filter_intra experiment */
+ cts_each_dim[0] = FILTER_INTRA_MODES;
+ optimize_cdf_table(
+ &fc.filter_intra_mode[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]");
+
+ cts_each_dim[0] = BLOCK_SIZES_ALL;
+ cts_each_dim[1] = 2;
+ optimize_cdf_table(&fc.filter_intra[0][0], probsfile, 2, cts_each_dim,
+ "static const aom_cdf_prob "
+ "default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]");
+
+ /* restoration type */
+ cts_each_dim[0] = RESTORE_SWITCHABLE_TYPES;
+ optimize_cdf_table(&fc.switchable_restore[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob default_switchable_restore_cdf"
+ "[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]");
+
+ cts_each_dim[0] = 2;
+ optimize_cdf_table(&fc.wiener_restore[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob default_wiener_restore_cdf"
+ "[CDF_SIZE(2)]");
+
+ cts_each_dim[0] = 2;
+ optimize_cdf_table(&fc.sgrproj_restore[0], probsfile, 1, cts_each_dim,
+ "static const aom_cdf_prob default_sgrproj_restore_cdf"
+ "[CDF_SIZE(2)]");
+
+ /* intra tx size */
+ cts_each_dim[0] = MAX_TX_CATS;
+ cts_each_dim[1] = TX_SIZE_CONTEXTS;
+ cts_each_dim[2] = MAX_TX_DEPTH + 1;
+ int intra_tx_sizes_each_ctx[MAX_TX_CATS] = { 2, 3, 3, 3 };
+ optimize_cdf_table_var_modes_3d(
+ &fc.intra_tx_size[0][0][0], probsfile, 3, cts_each_dim,
+ intra_tx_sizes_each_ctx,
+ "static const aom_cdf_prob default_tx_size_cdf"
+ "[MAX_TX_CATS][TX_SIZE_CONTEXTS][CDF_SIZE(MAX_TX_DEPTH + 1)]");
+
+ /* transform coding */
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = TXB_SKIP_CONTEXTS;
+ cts_each_dim[3] = 2;
+ optimize_cdf_table(&fc.txb_skip[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob "
+ "av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES]"
+ "[TXB_SKIP_CONTEXTS][CDF_SIZE(2)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = PLANE_TYPES;
+ cts_each_dim[3] = EOB_COEF_CONTEXTS;
+ cts_each_dim[4] = 2;
+ optimize_cdf_table(
+ &fc.eob_extra[0][0][0][0][0], probsfile, 5, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_extra_cdfs "
+ "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]"
+ "[CDF_SIZE(2)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 5;
+ optimize_cdf_table(&fc.eob_multi16[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi16_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(5)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 6;
+ optimize_cdf_table(&fc.eob_multi32[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi32_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(6)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 7;
+ optimize_cdf_table(&fc.eob_multi64[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi64_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(7)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 8;
+ optimize_cdf_table(&fc.eob_multi128[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi128_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(8)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 9;
+ optimize_cdf_table(&fc.eob_multi256[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi256_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(9)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 10;
+ optimize_cdf_table(&fc.eob_multi512[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi512_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(10)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = PLANE_TYPES;
+ cts_each_dim[2] = 2;
+ cts_each_dim[3] = 11;
+ optimize_cdf_table(&fc.eob_multi1024[0][0][0][0], probsfile, 4, cts_each_dim,
+ "static const aom_cdf_prob av1_default_eob_multi1024_cdfs"
+ "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(11)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = PLANE_TYPES;
+ cts_each_dim[3] = LEVEL_CONTEXTS;
+ cts_each_dim[4] = BR_CDF_SIZE;
+ optimize_cdf_table(&fc.coeff_lps_multi[0][0][0][0][0], probsfile, 5,
+ cts_each_dim,
+ "static const aom_cdf_prob "
+ "av1_default_coeff_lps_multi_cdfs[TOKEN_CDF_Q_CTXS]"
+ "[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]"
+ "[CDF_SIZE(BR_CDF_SIZE)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = PLANE_TYPES;
+ cts_each_dim[3] = SIG_COEF_CONTEXTS;
+ cts_each_dim[4] = NUM_BASE_LEVELS + 2;
+ optimize_cdf_table(
+ &fc.coeff_base_multi[0][0][0][0][0], probsfile, 5, cts_each_dim,
+ "static const aom_cdf_prob av1_default_coeff_base_multi_cdfs"
+ "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]"
+ "[CDF_SIZE(NUM_BASE_LEVELS + 2)]");
+
+ cts_each_dim[0] = TOKEN_CDF_Q_CTXS;
+ cts_each_dim[1] = TX_SIZES;
+ cts_each_dim[2] = PLANE_TYPES;
+ cts_each_dim[3] = SIG_COEF_CONTEXTS_EOB;
+ cts_each_dim[4] = NUM_BASE_LEVELS + 1;
+ optimize_cdf_table(
+ &fc.coeff_base_eob_multi[0][0][0][0][0], probsfile, 5, cts_each_dim,
+ "static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs"
+ "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]"
+ "[CDF_SIZE(NUM_BASE_LEVELS + 1)]");
+
+ fclose(statsfile);
+ fclose(logfile);
+ fclose(probsfile);
+
+ return 0;
+}
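
Every call above follows the same pattern: a counts array collected in FRAME_COUNTS is reduced to a cumulative distribution function (CDF) table, and the result is written to optimized_probs.c as a new compile-time default. As a rough illustration of the counts-to-CDF step only -- a minimal sketch, not the tool's actual optimization or rounding logic; counts_to_cdf, CDF_TOP and the example counts below are hypothetical names and made-up numbers -- the core idea looks like this:

    #include <stdint.h>
    #include <stdio.h>

    /* 15-bit probability scale, matching the range of aom_cdf_prob values. */
    #define CDF_TOP (1 << 15)

    /* Turn raw symbol counts into a cumulative distribution on [0, CDF_TOP].
     * Each symbol is given at least one count so no bin ends up empty. */
    static void counts_to_cdf(const unsigned int *counts, int n, uint16_t *cdf) {
      unsigned long long total = 0, acc = 0;
      for (int i = 0; i < n; ++i) total += counts[i] + 1ULL;
      for (int i = 0; i < n; ++i) {
        acc += counts[i] + 1ULL;
        cdf[i] = (uint16_t)((acc * CDF_TOP) / total); /* last entry == CDF_TOP */
      }
    }

    int main(void) {
      const unsigned int counts[4] = { 120, 30, 8, 2 }; /* made-up counts */
      uint16_t cdf[4];
      counts_to_cdf(counts, 4, cdf);
      for (int i = 0; i < 4; ++i) printf("cdf[%d] = %u\n", i, (unsigned)cdf[i]);
      return 0;
    }

The real optimize_cdf_table* helpers additionally handle tables where the number of valid symbols varies per context (the *_var_modes_* variants above) and emit the formatted C declarations whose names appear in the string literals passed alongside each counts array.
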
diff --git a/third_party/aom/tools/cpplint.py b/third_party/aom/tools/cpplint.py
new file mode 100755
index 000000000..25fbef73d
--- /dev/null
+++ b/third_party/aom/tools/cpplint.py
@@ -0,0 +1,4756 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2009 Google Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Does google-lint on c++ files.
+
+The goal of this script is to identify places in the code that *may*
+be in non-compliance with google style. It does not attempt to fix
+up these problems -- the point is to educate. Nor does it attempt to
+find all problems, or to ensure that everything it does find is
+legitimately a problem.
+
+In particular, we can get very confused by /* and // inside strings!
+We do a small hack, which is to ignore //'s with "'s after them on the
+same line, but it is far from perfect (in either direction).
+"""
+
+import codecs
+import copy
+import getopt
+import math # for log
+import os
+import re
+import sre_compile
+import string
+import sys
+import unicodedata
+
+
+_USAGE = """
+Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
+ [--counting=total|toplevel|detailed] [--root=subdir]
+ [--linelength=digits]
+ <file> [file] ...
+
+ The style guidelines this tries to follow are those in
+ http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+
+ Every problem is given a confidence score from 1-5, with 5 meaning we are
+ certain of the problem, and 1 meaning it could be a legitimate construct.
+ This will miss some errors, and is not a substitute for a code review.
+
+ To suppress false-positive errors of a certain category, add a
+ 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*)
+ suppresses errors of all categories on that line.
+
+ The files passed in will be linted; at least one file must be provided.
+ Default linted extensions are .cc, .cpp, .cu, .cuh and .h. Change the
+ extensions with the --extensions flag.
+
+ Flags:
+
+ output=vs7
+ By default, the output is formatted to ease emacs parsing. Visual Studio
+ compatible output (vs7) may also be used. Other formats are unsupported.
+
+ verbose=#
+ Specify a number 0-5 to restrict errors to certain verbosity levels.
+
+ filter=-x,+y,...
+ Specify a comma-separated list of category-filters to apply: only
+ error messages whose category names pass the filters will be printed.
+ (Category names are printed with the message and look like
+ "[whitespace/indent]".) Filters are evaluated left to right.
+ "-FOO" and "FOO" means "do not print categories that start with FOO".
+ "+FOO" means "do print categories that start with FOO".
+
+ Examples: --filter=-whitespace,+whitespace/braces
+ --filter=whitespace,runtime/printf,+runtime/printf_format
+ --filter=-,+build/include_what_you_use
+
+ To see a list of all the categories used in cpplint, pass no arg:
+ --filter=
+
+ counting=total|toplevel|detailed
+ The total number of errors found is always printed. If
+ 'toplevel' is provided, then the count of errors in each of
+ the top-level categories like 'build' and 'whitespace' will
+ also be printed. If 'detailed' is provided, then a count
+ is provided for each category like 'build/class'.
+
+ root=subdir
+ The root directory used for deriving header guard CPP variable.
+ By default, the header guard CPP variable is calculated as the relative
+ path to the directory that contains .git, .hg, or .svn. When this flag
+ is specified, the relative path is calculated from the specified
+ directory. If the specified directory does not exist, this flag is
+ ignored.
+
+ Examples:
+    Assuming that src/.git exists, the header guard CPP variables for
+ src/chrome/browser/ui/browser.h are:
+
+ No flag => CHROME_BROWSER_UI_BROWSER_H_
+ --root=chrome => BROWSER_UI_BROWSER_H_
+ --root=chrome/browser => UI_BROWSER_H_
+
+ linelength=digits
+ This is the allowed line length for the project. The default value is
+ 80 characters.
+
+ Examples:
+ --linelength=120
+
+ extensions=extension,extension,...
+ The allowed file extensions that cpplint will check
+
+ Examples:
+ --extensions=hpp,cpp
+"""
+
+# We categorize each error message we print. Here are the categories.
+# We want an explicit list so we can list them all in cpplint --filter=.
+# If you add a new error message with a new category, add it to the list
+# here! cpplint_unittest.py should tell you if you forget to do this.
+_ERROR_CATEGORIES = [
+ 'build/class',
+ 'build/deprecated',
+ 'build/endif_comment',
+ 'build/explicit_make_pair',
+ 'build/forward_decl',
+ 'build/header_guard',
+ 'build/include',
+ 'build/include_alpha',
+ 'build/include_order',
+ 'build/include_what_you_use',
+ 'build/namespaces',
+ 'build/printf_format',
+ 'build/storage_class',
+ 'legal/copyright',
+ 'readability/alt_tokens',
+ 'readability/braces',
+ 'readability/casting',
+ 'readability/check',
+ 'readability/constructors',
+ 'readability/fn_size',
+ 'readability/function',
+ 'readability/multiline_comment',
+ 'readability/multiline_string',
+ 'readability/namespace',
+ 'readability/nolint',
+ 'readability/nul',
+ 'readability/streams',
+ 'readability/todo',
+ 'readability/utf8',
+ 'runtime/arrays',
+ 'runtime/casting',
+ 'runtime/explicit',
+ 'runtime/int',
+ 'runtime/init',
+ 'runtime/invalid_increment',
+ 'runtime/member_string_references',
+ 'runtime/memset',
+ 'runtime/operator',
+ 'runtime/printf',
+ 'runtime/printf_format',
+ 'runtime/references',
+ 'runtime/sizeof',
+ 'runtime/string',
+ 'runtime/threadsafe_fn',
+ 'runtime/vlog',
+ 'whitespace/blank_line',
+ 'whitespace/braces',
+ 'whitespace/comma',
+ 'whitespace/comments',
+ 'whitespace/empty_conditional_body',
+ 'whitespace/empty_loop_body',
+ 'whitespace/end_of_line',
+ 'whitespace/ending_newline',
+ 'whitespace/forcolon',
+ 'whitespace/indent',
+ 'whitespace/line_length',
+ 'whitespace/newline',
+ 'whitespace/operators',
+ 'whitespace/parens',
+ 'whitespace/semicolon',
+ 'whitespace/tab',
+ 'whitespace/todo'
+ ]
+
+# The default state of the category filter. This is overridden by the --filter=
+# flag. By default all errors are on, so only add here categories that should be
+# off by default (i.e., categories that must be enabled by the --filter= flag).
+# All entries here should start with a '-' or '+', as in the --filter= flag.
+_DEFAULT_FILTERS = ['-build/include_alpha']
+
+# We used to check for high-bit characters, but after much discussion we
+# decided those were OK, as long as they were in UTF-8 and didn't represent
+# hard-coded international strings, which belong in a separate i18n file.
+
+
+# C++ headers
+_CPP_HEADERS = frozenset([
+ # Legacy
+ 'algobase.h',
+ 'algo.h',
+ 'alloc.h',
+ 'builtinbuf.h',
+ 'bvector.h',
+ 'complex.h',
+ 'defalloc.h',
+ 'deque.h',
+ 'editbuf.h',
+ 'fstream.h',
+ 'function.h',
+ 'hash_map',
+ 'hash_map.h',
+ 'hash_set',
+ 'hash_set.h',
+ 'hashtable.h',
+ 'heap.h',
+ 'indstream.h',
+ 'iomanip.h',
+ 'iostream.h',
+ 'istream.h',
+ 'iterator.h',
+ 'list.h',
+ 'map.h',
+ 'multimap.h',
+ 'multiset.h',
+ 'ostream.h',
+ 'pair.h',
+ 'parsestream.h',
+ 'pfstream.h',
+ 'procbuf.h',
+ 'pthread_alloc',
+ 'pthread_alloc.h',
+ 'rope',
+ 'rope.h',
+ 'ropeimpl.h',
+ 'set.h',
+ 'slist',
+ 'slist.h',
+ 'stack.h',
+ 'stdiostream.h',
+ 'stl_alloc.h',
+ 'stl_relops.h',
+ 'streambuf.h',
+ 'stream.h',
+ 'strfile.h',
+ 'strstream.h',
+ 'tempbuf.h',
+ 'tree.h',
+ 'type_traits.h',
+ 'vector.h',
+ # 17.6.1.2 C++ library headers
+ 'algorithm',
+ 'array',
+ 'atomic',
+ 'bitset',
+ 'chrono',
+ 'codecvt',
+ 'complex',
+ 'condition_variable',
+ 'deque',
+ 'exception',
+ 'forward_list',
+ 'fstream',
+ 'functional',
+ 'future',
+ 'initializer_list',
+ 'iomanip',
+ 'ios',
+ 'iosfwd',
+ 'iostream',
+ 'istream',
+ 'iterator',
+ 'limits',
+ 'list',
+ 'locale',
+ 'map',
+ 'memory',
+ 'mutex',
+ 'new',
+ 'numeric',
+ 'ostream',
+ 'queue',
+ 'random',
+ 'ratio',
+ 'regex',
+ 'set',
+ 'sstream',
+ 'stack',
+ 'stdexcept',
+ 'streambuf',
+ 'string',
+ 'strstream',
+ 'system_error',
+ 'thread',
+ 'tuple',
+ 'typeindex',
+ 'typeinfo',
+ 'type_traits',
+ 'unordered_map',
+ 'unordered_set',
+ 'utility',
+ 'valarray',
+ 'vector',
+ # 17.6.1.2 C++ headers for C library facilities
+ 'cassert',
+ 'ccomplex',
+ 'cctype',
+ 'cerrno',
+ 'cfenv',
+ 'cfloat',
+ 'cinttypes',
+ 'ciso646',
+ 'climits',
+ 'clocale',
+ 'cmath',
+ 'csetjmp',
+ 'csignal',
+ 'cstdalign',
+ 'cstdarg',
+ 'cstdbool',
+ 'cstddef',
+ 'cstdint',
+ 'cstdio',
+ 'cstdlib',
+ 'cstring',
+ 'ctgmath',
+ 'ctime',
+ 'cuchar',
+ 'cwchar',
+ 'cwctype',
+ ])
+
+# Assertion macros. These are defined in base/logging.h and
+# testing/base/gunit.h. Note that the _M versions need to come first
+# for substring matching to work.
+_CHECK_MACROS = [
+ 'DCHECK', 'CHECK',
+ 'EXPECT_TRUE_M', 'EXPECT_TRUE',
+ 'ASSERT_TRUE_M', 'ASSERT_TRUE',
+ 'EXPECT_FALSE_M', 'EXPECT_FALSE',
+ 'ASSERT_FALSE_M', 'ASSERT_FALSE',
+ ]
+
+# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
+_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS])
+
+for op, replacement in [('==', 'EQ'), ('!=', 'NE'),
+ ('>=', 'GE'), ('>', 'GT'),
+ ('<=', 'LE'), ('<', 'LT')]:
+ _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement
+ _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
+ _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
+ _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
+ _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
+ _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
+
+for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
+ ('>=', 'LT'), ('>', 'LE'),
+ ('<=', 'GT'), ('<', 'GE')]:
+ _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
+ _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
+ _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
+ _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
+
+# Alternative tokens and their replacements. For full list, see section 2.5
+# Alternative tokens [lex.digraph] in the C++ standard.
+#
+# Digraphs (such as '%:') are not included here since it's a mess to
+# match those on a word boundary.
+_ALT_TOKEN_REPLACEMENT = {
+ 'and': '&&',
+ 'bitor': '|',
+ 'or': '||',
+ 'xor': '^',
+ 'compl': '~',
+ 'bitand': '&',
+ 'and_eq': '&=',
+ 'or_eq': '|=',
+ 'xor_eq': '^=',
+ 'not': '!',
+ 'not_eq': '!='
+ }
+
+# Compile regular expression that matches all the above keywords. The "[ =()]"
+# bit is meant to avoid matching these keywords outside of boolean expressions.
+#
+# False positives include C-style multi-line comments and multi-line strings
+# but those have always been troublesome for cpplint.
+_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
+ r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
+
+
+# These constants define types of headers for use with
+# _IncludeState.CheckNextIncludeOrder().
+_C_SYS_HEADER = 1
+_CPP_SYS_HEADER = 2
+_LIKELY_MY_HEADER = 3
+_POSSIBLE_MY_HEADER = 4
+_OTHER_HEADER = 5
+
+# These constants define the current inline assembly state
+_NO_ASM = 0 # Outside of inline assembly block
+_INSIDE_ASM = 1 # Inside inline assembly block
+_END_ASM = 2 # Last line of inline assembly block
+_BLOCK_ASM = 3 # The whole block is an inline assembly block
+
+# Match start of assembly blocks
+_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
+ r'(?:\s+(volatile|__volatile__))?'
+ r'\s*[{(]')
+
+
+_regexp_compile_cache = {}
+
+# Finds occurrences of NOLINT or NOLINT(...).
+_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')
+
+# {str, set(int)}: a map from error categories to sets of linenumbers
+# on which those errors are expected and should be suppressed.
+_error_suppressions = {}
+
+# The root directory used for deriving header guard CPP variable.
+# This is set by --root flag.
+_root = None
+
+# The allowed line length of files.
+# This is set by --linelength flag.
+_line_length = 80
+
+# The allowed extensions for file names
+# This is set by --extensions flag.
+_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
+
+def ParseNolintSuppressions(filename, raw_line, linenum, error):
+ """Updates the global list of error-suppressions.
+
+ Parses any NOLINT comments on the current line, updating the global
+ error_suppressions store. Reports an error if the NOLINT comment
+ was malformed.
+
+ Args:
+ filename: str, the name of the input file.
+ raw_line: str, the line of input text, with comments.
+ linenum: int, the number of the current line.
+ error: function, an error handler.
+ """
+ # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
+ matched = _RE_SUPPRESSION.search(raw_line)
+ if matched:
+ category = matched.group(1)
+ if category in (None, '(*)'): # => "suppress all"
+ _error_suppressions.setdefault(None, set()).add(linenum)
+ else:
+ if category.startswith('(') and category.endswith(')'):
+ category = category[1:-1]
+ if category in _ERROR_CATEGORIES:
+ _error_suppressions.setdefault(category, set()).add(linenum)
+ else:
+ error(filename, linenum, 'readability/nolint', 5,
+ 'Unknown NOLINT error category: %s' % category)
+
+
+def ResetNolintSuppressions():
+ "Resets the set of NOLINT suppressions to empty."
+ _error_suppressions.clear()
+
+
+def IsErrorSuppressedByNolint(category, linenum):
+ """Returns true if the specified error category is suppressed on this line.
+
+ Consults the global error_suppressions map populated by
+ ParseNolintSuppressions/ResetNolintSuppressions.
+
+ Args:
+ category: str, the category of the error.
+ linenum: int, the current line number.
+ Returns:
+ bool, True iff the error should be suppressed due to a NOLINT comment.
+ """
+ return (linenum in _error_suppressions.get(category, set()) or
+ linenum in _error_suppressions.get(None, set()))
+
+def Match(pattern, s):
+ """Matches the string with the pattern, caching the compiled regexp."""
+ # The regexp compilation caching is inlined in both Match and Search for
+ # performance reasons; factoring it out into a separate function turns out
+ # to be noticeably expensive.
+ if pattern not in _regexp_compile_cache:
+ _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+ return _regexp_compile_cache[pattern].match(s)
+
+
+def ReplaceAll(pattern, rep, s):
+ """Replaces instances of pattern in a string with a replacement.
+
+ The compiled regex is kept in a cache shared by Match and Search.
+
+ Args:
+ pattern: regex pattern
+ rep: replacement text
+ s: search string
+
+ Returns:
+ string with replacements made (or original string if no replacements)
+ """
+ if pattern not in _regexp_compile_cache:
+ _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+ return _regexp_compile_cache[pattern].sub(rep, s)
+
+
+def Search(pattern, s):
+ """Searches the string for the pattern, caching the compiled regexp."""
+ if pattern not in _regexp_compile_cache:
+ _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+ return _regexp_compile_cache[pattern].search(s)
+
+
+class _IncludeState(dict):
+ """Tracks line numbers for includes, and the order in which includes appear.
+
+ As a dict, an _IncludeState object serves as a mapping between include
+ filename and line number on which that file was included.
+
+ Call CheckNextIncludeOrder() once for each header in the file, passing
+ in the type constants defined above. Calls in an illegal order will
+ raise an _IncludeError with an appropriate error message.
+
+ """
+ # self._section will move monotonically through this set. If it ever
+ # needs to move backwards, CheckNextIncludeOrder will raise an error.
+ _INITIAL_SECTION = 0
+ _MY_H_SECTION = 1
+ _C_SECTION = 2
+ _CPP_SECTION = 3
+ _OTHER_H_SECTION = 4
+
+ _TYPE_NAMES = {
+ _C_SYS_HEADER: 'C system header',
+ _CPP_SYS_HEADER: 'C++ system header',
+ _LIKELY_MY_HEADER: 'header this file implements',
+ _POSSIBLE_MY_HEADER: 'header this file may implement',
+ _OTHER_HEADER: 'other header',
+ }
+ _SECTION_NAMES = {
+ _INITIAL_SECTION: "... nothing. (This can't be an error.)",
+ _MY_H_SECTION: 'a header this file implements',
+ _C_SECTION: 'C system header',
+ _CPP_SECTION: 'C++ system header',
+ _OTHER_H_SECTION: 'other header',
+ }
+
+ def __init__(self):
+ dict.__init__(self)
+ self.ResetSection()
+
+ def ResetSection(self):
+ # The name of the current section.
+ self._section = self._INITIAL_SECTION
+ # The path of last found header.
+ self._last_header = ''
+
+ def SetLastHeader(self, header_path):
+ self._last_header = header_path
+
+ def CanonicalizeAlphabeticalOrder(self, header_path):
+ """Returns a path canonicalized for alphabetical comparison.
+
+ - replaces "-" with "_" so they both cmp the same.
+ - removes '-inl' since we don't require them to be after the main header.
+ - lowercase everything, just in case.
+
+ Args:
+ header_path: Path to be canonicalized.
+
+ Returns:
+ Canonicalized path.
+ """
+ return header_path.replace('-inl.h', '.h').replace('-', '_').lower()
+
+ def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path):
+ """Check if a header is in alphabetical order with the previous header.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ header_path: Canonicalized header to be checked.
+
+ Returns:
+ Returns true if the header is in alphabetical order.
+ """
+ # If previous section is different from current section, _last_header will
+ # be reset to empty string, so it's always less than current header.
+ #
+ # If previous line was a blank line, assume that the headers are
+ # intentionally sorted the way they are.
+ if (self._last_header > header_path and
+ not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
+ return False
+ return True
+
+ def CheckNextIncludeOrder(self, header_type):
+ """Returns a non-empty error message if the next header is out of order.
+
+ This function also updates the internal state to be ready to check
+ the next include.
+
+ Args:
+ header_type: One of the _XXX_HEADER constants defined above.
+
+ Returns:
+ The empty string if the header is in the right order, or an
+ error message describing what's wrong.
+
+ """
+ error_message = ('Found %s after %s' %
+ (self._TYPE_NAMES[header_type],
+ self._SECTION_NAMES[self._section]))
+
+ last_section = self._section
+
+ if header_type == _C_SYS_HEADER:
+ if self._section <= self._C_SECTION:
+ self._section = self._C_SECTION
+ else:
+ self._last_header = ''
+ return error_message
+ elif header_type == _CPP_SYS_HEADER:
+ if self._section <= self._CPP_SECTION:
+ self._section = self._CPP_SECTION
+ else:
+ self._last_header = ''
+ return error_message
+ elif header_type == _LIKELY_MY_HEADER:
+ if self._section <= self._MY_H_SECTION:
+ self._section = self._MY_H_SECTION
+ else:
+ self._section = self._OTHER_H_SECTION
+ elif header_type == _POSSIBLE_MY_HEADER:
+ if self._section <= self._MY_H_SECTION:
+ self._section = self._MY_H_SECTION
+ else:
+ # This will always be the fallback because we're not sure
+ # enough that the header is associated with this file.
+ self._section = self._OTHER_H_SECTION
+ else:
+ assert header_type == _OTHER_HEADER
+ self._section = self._OTHER_H_SECTION
+
+ if last_section != self._section:
+ self._last_header = ''
+
+ return ''
+
+
+class _CppLintState(object):
+ """Maintains module-wide state.."""
+
+ def __init__(self):
+ self.verbose_level = 1 # global setting.
+ self.error_count = 0 # global count of reported errors
+ # filters to apply when emitting error messages
+ self.filters = _DEFAULT_FILTERS[:]
+ self.counting = 'total' # In what way are we counting errors?
+ self.errors_by_category = {} # string to int dict storing error counts
+
+ # output format:
+ # "emacs" - format that emacs can parse (default)
+ # "vs7" - format that Microsoft Visual Studio 7 can parse
+ self.output_format = 'emacs'
+
+ def SetOutputFormat(self, output_format):
+ """Sets the output format for errors."""
+ self.output_format = output_format
+
+ def SetVerboseLevel(self, level):
+ """Sets the module's verbosity, and returns the previous setting."""
+ last_verbose_level = self.verbose_level
+ self.verbose_level = level
+ return last_verbose_level
+
+ def SetCountingStyle(self, counting_style):
+ """Sets the module's counting options."""
+ self.counting = counting_style
+
+ def SetFilters(self, filters):
+ """Sets the error-message filters.
+
+ These filters are applied when deciding whether to emit a given
+ error message.
+
+ Args:
+ filters: A string of comma-separated filters (eg "+whitespace/indent").
+ Each filter should start with + or -; else we die.
+
+ Raises:
+ ValueError: The comma-separated filters did not all start with '+' or '-'.
+ E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter"
+ """
+ # Default filters always have less priority than the flag ones.
+ self.filters = _DEFAULT_FILTERS[:]
+ for filt in filters.split(','):
+ clean_filt = filt.strip()
+ if clean_filt:
+ self.filters.append(clean_filt)
+ for filt in self.filters:
+ if not (filt.startswith('+') or filt.startswith('-')):
+ raise ValueError('Every filter in --filters must start with + or -'
+ ' (%s does not)' % filt)
+
+ def ResetErrorCounts(self):
+ """Sets the module's error statistic back to zero."""
+ self.error_count = 0
+ self.errors_by_category = {}
+
+ def IncrementErrorCount(self, category):
+ """Bumps the module's error statistic."""
+ self.error_count += 1
+ if self.counting in ('toplevel', 'detailed'):
+ if self.counting != 'detailed':
+ category = category.split('/')[0]
+ if category not in self.errors_by_category:
+ self.errors_by_category[category] = 0
+ self.errors_by_category[category] += 1
+
+ def PrintErrorCounts(self):
+ """Print a summary of errors by category, and the total."""
+ for category, count in self.errors_by_category.iteritems():
+ sys.stderr.write('Category \'%s\' errors found: %d\n' %
+ (category, count))
+ sys.stderr.write('Total errors found: %d\n' % self.error_count)
+
+_cpplint_state = _CppLintState()
+
+
+def _OutputFormat():
+ """Gets the module's output format."""
+ return _cpplint_state.output_format
+
+
+def _SetOutputFormat(output_format):
+ """Sets the module's output format."""
+ _cpplint_state.SetOutputFormat(output_format)
+
+
+def _VerboseLevel():
+ """Returns the module's verbosity setting."""
+ return _cpplint_state.verbose_level
+
+
+def _SetVerboseLevel(level):
+ """Sets the module's verbosity, and returns the previous setting."""
+ return _cpplint_state.SetVerboseLevel(level)
+
+
+def _SetCountingStyle(level):
+ """Sets the module's counting options."""
+ _cpplint_state.SetCountingStyle(level)
+
+
+def _Filters():
+ """Returns the module's list of output filters, as a list."""
+ return _cpplint_state.filters
+
+
+def _SetFilters(filters):
+ """Sets the module's error-message filters.
+
+ These filters are applied when deciding whether to emit a given
+ error message.
+
+ Args:
+ filters: A string of comma-separated filters (eg "whitespace/indent").
+ Each filter should start with + or -; else we die.
+ """
+ _cpplint_state.SetFilters(filters)
+
+
+class _FunctionState(object):
+ """Tracks current function name and the number of lines in its body."""
+
+ _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc.
+ _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER.
+
+ def __init__(self):
+ self.in_a_function = False
+ self.lines_in_function = 0
+ self.current_function = ''
+
+ def Begin(self, function_name):
+ """Start analyzing function body.
+
+ Args:
+ function_name: The name of the function being tracked.
+ """
+ self.in_a_function = True
+ self.lines_in_function = 0
+ self.current_function = function_name
+
+ def Count(self):
+ """Count line in current function body."""
+ if self.in_a_function:
+ self.lines_in_function += 1
+
+ def Check(self, error, filename, linenum):
+ """Report if too many lines in function body.
+
+ Args:
+ error: The function to call with any errors found.
+ filename: The name of the current file.
+ linenum: The number of the line to check.
+ """
+ if Match(r'T(EST|est)', self.current_function):
+ base_trigger = self._TEST_TRIGGER
+ else:
+ base_trigger = self._NORMAL_TRIGGER
+ trigger = base_trigger * 2**_VerboseLevel()
+
+ if self.lines_in_function > trigger:
+ error_level = int(math.log(self.lines_in_function / base_trigger, 2))
+ # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ...
+ if error_level > 5:
+ error_level = 5
+ error(filename, linenum, 'readability/fn_size', error_level,
+ 'Small and focused functions are preferred:'
+ ' %s has %d non-comment lines'
+ ' (error triggered by exceeding %d lines).' % (
+ self.current_function, self.lines_in_function, trigger))
+
+ def End(self):
+ """Stop analyzing function body."""
+ self.in_a_function = False
+
+
+class _IncludeError(Exception):
+ """Indicates a problem with the include order in a file."""
+ pass
+
+
+class FileInfo:
+ """Provides utility functions for filenames.
+
+ FileInfo provides easy access to the components of a file's path
+ relative to the project root.
+ """
+
+ def __init__(self, filename):
+ self._filename = filename
+
+ def FullName(self):
+ """Make Windows paths like Unix."""
+ return os.path.abspath(self._filename).replace('\\', '/')
+
+ def RepositoryName(self):
+ """FullName after removing the local path to the repository.
+
+ If we have a real absolute path name here we can try to do something smart:
+ detecting the root of the checkout and truncating /path/to/checkout from
+ the name so that we get header guards that don't include things like
+ "C:\Documents and Settings\..." or "/home/username/..." in them and thus
+ people on different computers who have checked the source out to different
+ locations won't see bogus errors.
+ """
+ fullname = self.FullName()
+
+ if os.path.exists(fullname):
+ project_dir = os.path.dirname(fullname)
+
+ if os.path.exists(os.path.join(project_dir, ".svn")):
+ # If there's a .svn file in the current directory, we recursively look
+ # up the directory tree for the top of the SVN checkout
+ root_dir = project_dir
+ one_up_dir = os.path.dirname(root_dir)
+ while os.path.exists(os.path.join(one_up_dir, ".svn")):
+ root_dir = os.path.dirname(root_dir)
+ one_up_dir = os.path.dirname(one_up_dir)
+
+ prefix = os.path.commonprefix([root_dir, project_dir])
+ return fullname[len(prefix) + 1:]
+
+ # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
+ # searching up from the current path.
+ root_dir = os.path.dirname(fullname)
+ while (root_dir != os.path.dirname(root_dir) and
+ not os.path.exists(os.path.join(root_dir, ".git")) and
+ not os.path.exists(os.path.join(root_dir, ".hg")) and
+ not os.path.exists(os.path.join(root_dir, ".svn"))):
+ root_dir = os.path.dirname(root_dir)
+
+ if (os.path.exists(os.path.join(root_dir, ".git")) or
+ os.path.exists(os.path.join(root_dir, ".hg")) or
+ os.path.exists(os.path.join(root_dir, ".svn"))):
+ prefix = os.path.commonprefix([root_dir, project_dir])
+ return fullname[len(prefix) + 1:]
+
+ # Don't know what to do; header guard warnings may be wrong...
+ return fullname
+
+ def Split(self):
+ """Splits the file into the directory, basename, and extension.
+
+ For 'chrome/browser/browser.cc', Split() would
+ return ('chrome/browser', 'browser', '.cc')
+
+ Returns:
+ A tuple of (directory, basename, extension).
+ """
+
+ googlename = self.RepositoryName()
+ project, rest = os.path.split(googlename)
+ return (project,) + os.path.splitext(rest)
+
+ def BaseName(self):
+ """File base name - text after the final slash, before the final period."""
+ return self.Split()[1]
+
+ def Extension(self):
+ """File extension - text following the final period."""
+ return self.Split()[2]
+
+ def NoExtension(self):
+ """File has no source file extension."""
+ return '/'.join(self.Split()[0:2])
+
+ def IsSource(self):
+ """File has a source file extension."""
+ return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
+
+
+def _ShouldPrintError(category, confidence, linenum):
+ """If confidence >= verbose, category passes filter and is not suppressed."""
+
+ # There are three ways we might decide not to print an error message:
+ # a "NOLINT(category)" comment appears in the source,
+ # the verbosity level isn't high enough, or the filters filter it out.
+ if IsErrorSuppressedByNolint(category, linenum):
+ return False
+ if confidence < _cpplint_state.verbose_level:
+ return False
+
+ is_filtered = False
+ for one_filter in _Filters():
+ if one_filter.startswith('-'):
+ if category.startswith(one_filter[1:]):
+ is_filtered = True
+ elif one_filter.startswith('+'):
+ if category.startswith(one_filter[1:]):
+ is_filtered = False
+ else:
+ assert False # should have been checked for in SetFilter.
+ if is_filtered:
+ return False
+
+ return True
+
+
+def Error(filename, linenum, category, confidence, message):
+ """Logs the fact we've found a lint error.
+
+ We log where the error was found, and also our confidence in the error,
+ that is, how certain we are this is a legitimate style regression, and
+ not a misidentification or a use that's sometimes justified.
+
+ False positives can be suppressed by the use of
+ "cpplint(category)" comments on the offending line. These are
+ parsed into _error_suppressions.
+
+ Args:
+ filename: The name of the file containing the error.
+ linenum: The number of the line containing the error.
+ category: A string used to describe the "category" this bug
+ falls under: "whitespace", say, or "runtime". Categories
+ may have a hierarchy separated by slashes: "whitespace/indent".
+ confidence: A number from 1-5 representing a confidence score for
+ the error, with 5 meaning that we are certain of the problem,
+ and 1 meaning that it could be a legitimate construct.
+ message: The error message.
+ """
+ if _ShouldPrintError(category, confidence, linenum):
+ _cpplint_state.IncrementErrorCount(category)
+ if _cpplint_state.output_format == 'vs7':
+ sys.stderr.write('%s(%s): %s [%s] [%d]\n' % (
+ filename, linenum, message, category, confidence))
+ elif _cpplint_state.output_format == 'eclipse':
+ sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % (
+ filename, linenum, message, category, confidence))
+ else:
+ sys.stderr.write('%s:%s: %s [%s] [%d]\n' % (
+ filename, linenum, message, category, confidence))
+
+
+# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
+_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
+ r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
+# Matches strings. Escape codes should already be removed by ESCAPES.
+_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
+# Matches characters. Escape codes should already be removed by ESCAPES.
+_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
+# Matches multi-line C++ comments.
+# This RE is a little bit more complicated than one might expect, because it
+# has to cope with the whitespace stripping done elsewhere, so that comments
+# inside statements are handled better.
+# The current rule is: we only strip spaces from both sides when the comment
+# sits at the end of the line. Otherwise, we try to strip spaces from the
+# right side; if that doesn't work we strip from the left side, but only if
+# there is a non-word character on the right.
+_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
+ r"""(\s*/\*.*\*/\s*$|
+ /\*.*\*/\s+|
+ \s+/\*.*\*/(?=\W)|
+ /\*.*\*/)""", re.VERBOSE)
+
+
+def IsCppString(line):
+ """Does line terminate so, that the next symbol is in string constant.
+
+ This function does not consider single-line nor multi-line comments.
+
+ Args:
+ line: is a partial line of code starting from the 0..n.
+
+ Returns:
+ True, if next character appended to 'line' is inside a
+ string constant.
+ """
+
+ line = line.replace(r'\\', 'XX') # after this, \\" does not match to \"
+ return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
+
+
+def CleanseRawStrings(raw_lines):
+ """Removes C++11 raw strings from lines.
+
+ Before:
+ static const char kData[] = R"(
+ multi-line string
+ )";
+
+ After:
+ static const char kData[] = ""
+ (replaced by blank line)
+ "";
+
+ Args:
+ raw_lines: list of raw lines.
+
+ Returns:
+ list of lines with C++11 raw strings replaced by empty strings.
+ """
+
+ delimiter = None
+ lines_without_raw_strings = []
+ for line in raw_lines:
+ if delimiter:
+ # Inside a raw string, look for the end
+ end = line.find(delimiter)
+ if end >= 0:
+ # Found the end of the string, match leading space for this
+ # line and resume copying the original lines, and also insert
+ # a "" on the last line.
+ leading_space = Match(r'^(\s*)\S', line)
+ line = leading_space.group(1) + '""' + line[end + len(delimiter):]
+ delimiter = None
+ else:
+ # Haven't found the end yet, append a blank line.
+ line = ''
+
+ else:
+ # Look for beginning of a raw string.
+ # See 2.14.15 [lex.string] for syntax.
+ matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+ if matched:
+ delimiter = ')' + matched.group(2) + '"'
+
+ end = matched.group(3).find(delimiter)
+ if end >= 0:
+ # Raw string ended on same line
+ line = (matched.group(1) + '""' +
+ matched.group(3)[end + len(delimiter):])
+ delimiter = None
+ else:
+ # Start of a multi-line raw string
+ line = matched.group(1) + '""'
+
+ lines_without_raw_strings.append(line)
+
+ # TODO(unknown): if delimiter is not None here, we might want to
+ # emit a warning for unterminated string.
+ return lines_without_raw_strings
+
+
+def FindNextMultiLineCommentStart(lines, lineix):
+ """Find the beginning marker for a multiline comment."""
+ while lineix < len(lines):
+ if lines[lineix].strip().startswith('/*'):
+ # Only return this marker if the comment goes beyond this line
+ if lines[lineix].strip().find('*/', 2) < 0:
+ return lineix
+ lineix += 1
+ return len(lines)
+
+
+def FindNextMultiLineCommentEnd(lines, lineix):
+ """We are inside a comment, find the end marker."""
+ while lineix < len(lines):
+ if lines[lineix].strip().endswith('*/'):
+ return lineix
+ lineix += 1
+ return len(lines)
+
+
+def RemoveMultiLineCommentsFromRange(lines, begin, end):
+ """Clears a range of lines for multi-line comments."""
+ # Having // dummy comments makes the lines non-empty, so we will not get
+ # unnecessary blank line warnings later in the code.
+ for i in range(begin, end):
+ lines[i] = '// dummy'
+
+
+def RemoveMultiLineComments(filename, lines, error):
+ """Removes multiline (c-style) comments from lines."""
+ lineix = 0
+ while lineix < len(lines):
+ lineix_begin = FindNextMultiLineCommentStart(lines, lineix)
+ if lineix_begin >= len(lines):
+ return
+ lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin)
+ if lineix_end >= len(lines):
+ error(filename, lineix_begin + 1, 'readability/multiline_comment', 5,
+ 'Could not find end of multi-line comment')
+ return
+ RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1)
+ lineix = lineix_end + 1
+
+
+def CleanseComments(line):
+ """Removes //-comments and single-line C-style /* */ comments.
+
+ Args:
+ line: A line of C++ source.
+
+ Returns:
+ The line with single-line comments removed.
+ """
+ commentpos = line.find('//')
+ if commentpos != -1 and not IsCppString(line[:commentpos]):
+ line = line[:commentpos].rstrip()
+ # get rid of /* ... */
+ return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
+class CleansedLines(object):
+ """Holds 3 copies of all lines with different preprocessing applied to them.
+
+ 1) elided member contains lines without strings and comments,
+ 2) lines member contains lines without comments, and
+ 3) raw_lines member contains all the lines without processing.
+  All three of these members are of <type 'list'>, and of the same length.
+ """
+
+ def __init__(self, lines):
+ self.elided = []
+ self.lines = []
+ self.raw_lines = lines
+ self.num_lines = len(lines)
+ self.lines_without_raw_strings = CleanseRawStrings(lines)
+ for linenum in range(len(self.lines_without_raw_strings)):
+ self.lines.append(CleanseComments(
+ self.lines_without_raw_strings[linenum]))
+ elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
+ self.elided.append(CleanseComments(elided))
+
+ def NumLines(self):
+ """Returns the number of lines represented."""
+ return self.num_lines
+
+ @staticmethod
+ def _CollapseStrings(elided):
+ """Collapses strings and chars on a line to simple "" or '' blocks.
+
+ We nix strings first so we're not fooled by text like '"http://"'
+
+ Args:
+ elided: The line being processed.
+
+ Returns:
+ The line with collapsed strings.
+ """
+ if not _RE_PATTERN_INCLUDE.match(elided):
+ # Remove escaped characters first to make quote/single quote collapsing
+ # basic. Things that look like escaped characters shouldn't occur
+ # outside of strings and chars.
+ elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+ elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
+ elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
+ return elided
+
+
+def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+ """Find the position just after the matching endchar.
+
+ Args:
+ line: a CleansedLines line.
+ startpos: start searching at this position.
+ depth: nesting level at startpos.
+ startchar: expression opening character.
+ endchar: expression closing character.
+
+ Returns:
+ On finding matching endchar: (index just after matching endchar, 0)
+ Otherwise: (-1, new depth at end of this line)
+ """
+ for i in xrange(startpos, len(line)):
+ if line[i] == startchar:
+ depth += 1
+ elif line[i] == endchar:
+ depth -= 1
+ if depth == 0:
+ return (i + 1, 0)
+ return (-1, depth)
+
+
+def CloseExpression(clean_lines, linenum, pos):
+ """If input points to ( or { or [ or <, finds the position that closes it.
+
+ If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
+ linenum/pos that correspond to the closing of the expression.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ pos: A position on the line.
+
+ Returns:
+ A tuple (line, linenum, pos) pointer *past* the closing brace, or
+ (line, len(lines), -1) if we never find a close. Note we ignore
+ strings and comments when matching; and the line we return is the
+ 'cleansed' line at linenum.
+ """
+
+ line = clean_lines.elided[linenum]
+ startchar = line[pos]
+ if startchar not in '({[<':
+ return (line, clean_lines.NumLines(), -1)
+ if startchar == '(': endchar = ')'
+ if startchar == '[': endchar = ']'
+ if startchar == '{': endchar = '}'
+ if startchar == '<': endchar = '>'
+
+ # Check first line
+ (end_pos, num_open) = FindEndOfExpressionInLine(
+ line, pos, 0, startchar, endchar)
+ if end_pos > -1:
+ return (line, linenum, end_pos)
+
+ # Continue scanning forward
+ while linenum < clean_lines.NumLines() - 1:
+ linenum += 1
+ line = clean_lines.elided[linenum]
+ (end_pos, num_open) = FindEndOfExpressionInLine(
+ line, 0, num_open, startchar, endchar)
+ if end_pos > -1:
+ return (line, linenum, end_pos)
+
+ # Did not find endchar before end of file, give up
+ return (line, clean_lines.NumLines(), -1)
+
+
+def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
+ """Find position at the matching startchar.
+
+ This is almost the reverse of FindEndOfExpressionInLine, but note
+ that the input position and returned position differs by 1.
+
+ Args:
+ line: a CleansedLines line.
+ endpos: start searching at this position.
+ depth: nesting level at endpos.
+ startchar: expression opening character.
+ endchar: expression closing character.
+
+ Returns:
+ On finding matching startchar: (index at matching startchar, 0)
+ Otherwise: (-1, new depth at beginning of this line)
+ """
+ for i in xrange(endpos, -1, -1):
+ if line[i] == endchar:
+ depth += 1
+ elif line[i] == startchar:
+ depth -= 1
+ if depth == 0:
+ return (i, 0)
+ return (-1, depth)
+
+
+def ReverseCloseExpression(clean_lines, linenum, pos):
+ """If input points to ) or } or ] or >, finds the position that opens it.
+
+ If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the
+ linenum/pos that correspond to the opening of the expression.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ pos: A position on the line.
+
+ Returns:
+ A tuple (line, linenum, pos) pointer *at* the opening brace, or
+ (line, 0, -1) if we never find the matching opening brace. Note
+ we ignore strings and comments when matching; and the line we
+ return is the 'cleansed' line at linenum.
+ """
+ line = clean_lines.elided[linenum]
+ endchar = line[pos]
+ if endchar not in ')}]>':
+ return (line, 0, -1)
+ if endchar == ')': startchar = '('
+ if endchar == ']': startchar = '['
+ if endchar == '}': startchar = '{'
+ if endchar == '>': startchar = '<'
+
+ # Check last line
+ (start_pos, num_open) = FindStartOfExpressionInLine(
+ line, pos, 0, startchar, endchar)
+ if start_pos > -1:
+ return (line, linenum, start_pos)
+
+ # Continue scanning backward
+ while linenum > 0:
+ linenum -= 1
+ line = clean_lines.elided[linenum]
+ (start_pos, num_open) = FindStartOfExpressionInLine(
+ line, len(line) - 1, num_open, startchar, endchar)
+ if start_pos > -1:
+ return (line, linenum, start_pos)
+
+ # Did not find startchar before beginning of file, give up
+ return (line, 0, -1)
+
+
+def CheckForCopyright(filename, lines, error):
+ """Logs an error if no Copyright message appears at the top of the file."""
+
+ # We'll say it should occur by line 10. Don't forget there's a
+ # dummy line at the front.
+ for line in xrange(1, min(len(lines), 11)):
+ if re.search(r'Copyright', lines[line], re.I): break
+ else: # means no copyright line was found
+ error(filename, 0, 'legal/copyright', 5,
+ 'No copyright message found. '
+ 'You should have a line: "Copyright [year] <Copyright Owner>"')
+
+
+def GetHeaderGuardCPPVariable(filename):
+ """Returns the CPP variable that should be used as a header guard.
+
+ Args:
+ filename: The name of a C++ header file.
+
+ Returns:
+ The CPP variable that should be used as a header guard in the
+ named file.
+
+ """
+
+ # Restores original filename in case that cpplint is invoked from Emacs's
+ # flymake.
+ filename = re.sub(r'_flymake\.h$', '.h', filename)
+ filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+
+ fileinfo = FileInfo(filename)
+ file_path_from_root = fileinfo.RepositoryName()
+ if _root:
+ file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
+ return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
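+
+# Illustrative sketch (editorial note, not part of upstream cpplint): assuming
+# a header whose RepositoryName() resolves to "foo/bar/baz.h" and no --root
+# override, the guard variable would be "FOO_BAR_BAZ_H_": every '-', '.', '/'
+# and whitespace character is mapped to '_' before upper-casing and appending
+# the trailing '_'.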
+
+
+def CheckForHeaderGuard(filename, lines, error):
+ """Checks that the file contains a header guard.
+
+  Logs an error if no #ifndef header guard is present, and checks that
+  the guard is based on the full pathname of the file.
+
+ Args:
+ filename: The name of the C++ header file.
+ lines: An array of strings, each representing a line of the file.
+ error: The function to call with any errors found.
+ """
+
+ cppvar = GetHeaderGuardCPPVariable(filename)
+
+ ifndef = None
+ ifndef_linenum = 0
+ define = None
+ endif = None
+ endif_linenum = 0
+ for linenum, line in enumerate(lines):
+ linesplit = line.split()
+ if len(linesplit) >= 2:
+ # find the first occurrence of #ifndef and #define, save arg
+ if not ifndef and linesplit[0] == '#ifndef':
+ # set ifndef to the header guard presented on the #ifndef line.
+ ifndef = linesplit[1]
+ ifndef_linenum = linenum
+ if not define and linesplit[0] == '#define':
+ define = linesplit[1]
+ # find the last occurrence of #endif, save entire line
+ if line.startswith('#endif'):
+ endif = line
+ endif_linenum = linenum
+
+ if not ifndef:
+ error(filename, 0, 'build/header_guard', 5,
+ 'No #ifndef header guard found, suggested CPP variable is: %s' %
+ cppvar)
+ return
+
+ if not define:
+ error(filename, 0, 'build/header_guard', 5,
+ 'No #define header guard found, suggested CPP variable is: %s' %
+ cppvar)
+ return
+
+ # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
+ # for backward compatibility.
+ if ifndef != cppvar:
+ error_level = 0
+ if ifndef != cppvar + '_':
+ error_level = 5
+
+ ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum,
+ error)
+ error(filename, ifndef_linenum, 'build/header_guard', error_level,
+ '#ifndef header guard has wrong style, please use: %s' % cppvar)
+
+ if define != ifndef:
+ error(filename, 0, 'build/header_guard', 5,
+ '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
+ cppvar)
+ return
+
+ if endif != ('#endif // %s' % cppvar):
+ error_level = 0
+ if endif != ('#endif // %s' % (cppvar + '_')):
+ error_level = 5
+
+ ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum,
+ error)
+ error(filename, endif_linenum, 'build/header_guard', error_level,
+ '#endif line should be "#endif // %s"' % cppvar)
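+
+# Illustrative sketch (editorial note, not part of upstream cpplint): for a
+# guard variable FOO_BAR_BAZ_H_, the expected shape is roughly
+#   #ifndef FOO_BAR_BAZ_H_
+#   #define FOO_BAR_BAZ_H_
+#   ...
+#   #endif  // FOO_BAR_BAZ_H_
+# A missing guard, a #define that does not match the #ifndef, or an #endif
+# without the matching "// FOO_BAR_BAZ_H_" comment is flagged as a
+# build/header_guard error by the checks above.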
+
+
+def CheckForBadCharacters(filename, lines, error):
+ """Logs an error for each line containing bad characters.
+
+ Two kinds of bad characters:
+
+ 1. Unicode replacement characters: These indicate that either the file
+ contained invalid UTF-8 (likely) or Unicode replacement characters (which
+ it shouldn't). Note that it's possible for this to throw off line
+ numbering if the invalid UTF-8 occurred adjacent to a newline.
+
+ 2. NUL bytes. These are problematic for some tools.
+
+ Args:
+ filename: The name of the current file.
+ lines: An array of strings, each representing a line of the file.
+ error: The function to call with any errors found.
+ """
+ for linenum, line in enumerate(lines):
+ if u'\ufffd' in line:
+ error(filename, linenum, 'readability/utf8', 5,
+ 'Line contains invalid UTF-8 (or Unicode replacement character).')
+ if '\0' in line:
+ error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.')
+
+
+def CheckForNewlineAtEOF(filename, lines, error):
+ """Logs an error if there is no newline char at the end of the file.
+
+ Args:
+ filename: The name of the current file.
+ lines: An array of strings, each representing a line of the file.
+ error: The function to call with any errors found.
+ """
+
+ # The array lines() was created by adding two newlines to the
+ # original file (go figure), then splitting on \n.
+ # To verify that the file ends in \n, we just have to make sure the
+ # last-but-two element of lines() exists and is empty.
+ if len(lines) < 3 or lines[-2]:
+ error(filename, len(lines) - 2, 'whitespace/ending_newline', 5,
+ 'Could not find a newline character at the end of the file.')
+
+
+def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
+ """Logs an error if we see /* ... */ or "..." that extend past one line.
+
+ /* ... */ comments are legit inside macros, for one line.
+ Otherwise, we prefer // comments, so it's ok to warn about the
+ other. Likewise, it's ok for strings to extend across multiple
+ lines, as long as a line continuation character (backslash)
+ terminates each line. Although not currently prohibited by the C++
+ style guide, it's ugly and unnecessary. We don't do well with either
+ in this lint program, so we warn about both.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+
+ # Remove all \\ (escaped backslashes) from the line. They are OK, and the
+ # second (escaped) slash may trigger later \" detection erroneously.
+ line = line.replace('\\\\', '')
+
+ if line.count('/*') > line.count('*/'):
+ error(filename, linenum, 'readability/multiline_comment', 5,
+ 'Complex multi-line /*...*/-style comment found. '
+ 'Lint may give bogus warnings. '
+ 'Consider replacing these with //-style comments, '
+ 'with #if 0...#endif, '
+ 'or with more clearly structured multi-line comments.')
+
+ if (line.count('"') - line.count('\\"')) % 2:
+ error(filename, linenum, 'readability/multiline_string', 5,
+ 'Multi-line string ("...") found. This lint script doesn\'t '
+ 'do well with such strings, and may give bogus warnings. '
+ 'Use C++11 raw strings or concatenation instead.')
+
+
+threading_list = (
+ ('asctime(', 'asctime_r('),
+ ('ctime(', 'ctime_r('),
+ ('getgrgid(', 'getgrgid_r('),
+ ('getgrnam(', 'getgrnam_r('),
+ ('getlogin(', 'getlogin_r('),
+ ('getpwnam(', 'getpwnam_r('),
+ ('getpwuid(', 'getpwuid_r('),
+ ('gmtime(', 'gmtime_r('),
+ ('localtime(', 'localtime_r('),
+ ('rand(', 'rand_r('),
+ ('strtok(', 'strtok_r('),
+ ('ttyname(', 'ttyname_r('),
+ )
+
+
+def CheckPosixThreading(filename, clean_lines, linenum, error):
+ """Checks for calls to thread-unsafe functions.
+
+  Much code was originally written without multi-threading in mind, and
+  many engineers rely on experience gained before the POSIX threading
+  extensions were added. These checks steer them toward the thread-safe
+  variants (when using POSIX directly).
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+ for single_thread_function, multithread_safe_function in threading_list:
+ ix = line.find(single_thread_function)
+ # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
+ if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and
+ line[ix - 1] not in ('_', '.', '>'))):
+ error(filename, linenum, 'runtime/threadsafe_fn', 2,
+ 'Consider using ' + multithread_safe_function +
+ '...) instead of ' + single_thread_function +
+ '...) for improved thread safety.')
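+
+# Illustrative sketch (editorial note, not part of upstream cpplint): a line
+# such as
+#   struct tm* t = localtime(&now);
+# would be expected to draw a runtime/threadsafe_fn warning suggesting
+# localtime_r(...), while an occurrence like my_localtime(...) is skipped
+# because the preceding character is part of an identifier.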
+
+
+def CheckVlogArguments(filename, clean_lines, linenum, error):
+ """Checks that VLOG() is only used for defining a logging level.
+
+ For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and
+ VLOG(FATAL) are not.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+ if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line):
+ error(filename, linenum, 'runtime/vlog', 5,
+ 'VLOG() should be used with numeric verbosity level. '
+ 'Use LOG() if you want symbolic severity levels.')
+
+
+# Matches invalid increment: *count++, which moves the pointer instead of
+# incrementing a value.
+_RE_PATTERN_INVALID_INCREMENT = re.compile(
+ r'^\s*\*\w+(\+\+|--);')
+
+
+def CheckInvalidIncrement(filename, clean_lines, linenum, error):
+ """Checks for invalid increment *count++.
+
+ For example following function:
+ void increment_counter(int* count) {
+ *count++;
+ }
+  is invalid, because it effectively does count++, moving the pointer, and
+  should be replaced with ++*count, (*count)++ or *count += 1.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+ if _RE_PATTERN_INVALID_INCREMENT.match(line):
+ error(filename, linenum, 'runtime/invalid_increment', 5,
+ 'Changing pointer instead of value (or unused value of operator*).')
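+
+# Illustrative sketch (editorial note, not part of upstream cpplint): the
+# pattern above matches a statement like
+#   *count++;
+# (flagged as runtime/invalid_increment), but not the intended forms
+#   (*count)++;  or  ++*count;  or  *count += 1;
+# none of which start with '*' immediately followed by an identifier and ++/--.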
+
+
+class _BlockInfo(object):
+ """Stores information about a generic block of code."""
+
+ def __init__(self, seen_open_brace):
+ self.seen_open_brace = seen_open_brace
+ self.open_parentheses = 0
+ self.inline_asm = _NO_ASM
+
+ def CheckBegin(self, filename, clean_lines, linenum, error):
+ """Run checks that applies to text up to the opening brace.
+
+ This is mostly for checking the text after the class identifier
+ and the "{", usually where the base class is specified. For other
+ blocks, there isn't much to check, so we always pass.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ pass
+
+ def CheckEnd(self, filename, clean_lines, linenum, error):
+ """Run checks that applies to text after the closing brace.
+
+ This is mostly used for checking end of namespace comments.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ pass
+
+
+class _ClassInfo(_BlockInfo):
+ """Stores information about a class."""
+
+ def __init__(self, name, class_or_struct, clean_lines, linenum):
+ _BlockInfo.__init__(self, False)
+ self.name = name
+ self.starting_linenum = linenum
+ self.is_derived = False
+ if class_or_struct == 'struct':
+ self.access = 'public'
+ self.is_struct = True
+ else:
+ self.access = 'private'
+ self.is_struct = False
+
+ # Remember initial indentation level for this class. Using raw_lines here
+ # instead of elided to account for leading comments.
+ initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
+ if initial_indent:
+ self.class_indent = len(initial_indent.group(1))
+ else:
+ self.class_indent = 0
+
+ # Try to find the end of the class. This will be confused by things like:
+ # class A {
+ # } *x = { ...
+ #
+ # But it's still good enough for CheckSectionSpacing.
+ self.last_line = 0
+ depth = 0
+ for i in range(linenum, clean_lines.NumLines()):
+ line = clean_lines.elided[i]
+ depth += line.count('{') - line.count('}')
+ if not depth:
+ self.last_line = i
+ break
+
+ def CheckBegin(self, filename, clean_lines, linenum, error):
+ # Look for a bare ':'
+ if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
+ self.is_derived = True
+
+ def CheckEnd(self, filename, clean_lines, linenum, error):
+ # Check that closing brace is aligned with beginning of the class.
+ # Only do this if the closing brace is indented by only whitespaces.
+ # This means we will not check single-line class definitions.
+ indent = Match(r'^( *)\}', clean_lines.elided[linenum])
+ if indent and len(indent.group(1)) != self.class_indent:
+ if self.is_struct:
+ parent = 'struct ' + self.name
+ else:
+ parent = 'class ' + self.name
+ error(filename, linenum, 'whitespace/indent', 3,
+ 'Closing brace should be aligned with beginning of %s' % parent)
+
+
+class _NamespaceInfo(_BlockInfo):
+ """Stores information about a namespace."""
+
+ def __init__(self, name, linenum):
+ _BlockInfo.__init__(self, False)
+ self.name = name or ''
+ self.starting_linenum = linenum
+
+ def CheckEnd(self, filename, clean_lines, linenum, error):
+ """Check end of namespace comments."""
+ line = clean_lines.raw_lines[linenum]
+
+    # Check how many lines are enclosed in this namespace. Don't issue
+    # a warning for missing namespace comments if there aren't enough
+ # lines. However, do apply checks if there is already an end of
+ # namespace comment and it's incorrect.
+ #
+ # TODO(unknown): We always want to check end of namespace comments
+ # if a namespace is large, but sometimes we also want to apply the
+ # check if a short namespace contained nontrivial things (something
+ # other than forward declarations). There is currently no logic on
+ # deciding what these nontrivial things are, so this check is
+ # triggered by namespace size only, which works most of the time.
+ if (linenum - self.starting_linenum < 10
+ and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+ return
+
+ # Look for matching comment at end of namespace.
+ #
+ # Note that we accept C style "/* */" comments for terminating
+    # namespaces, so that code that terminates namespaces inside
+ # preprocessor macros can be cpplint clean.
+ #
+ # We also accept stuff like "// end of namespace <name>." with the
+ # period at the end.
+ #
+ # Besides these, we don't accept anything else, otherwise we might
+    # get false negatives when an existing comment is a substring of the
+ # expected namespace.
+ if self.name:
+ # Named namespace
+ if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
+ r'[\*/\.\\\s]*$'),
+ line):
+ error(filename, linenum, 'readability/namespace', 5,
+ 'Namespace should be terminated with "// namespace %s"' %
+ self.name)
+ else:
+ # Anonymous namespace
+ if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+ error(filename, linenum, 'readability/namespace', 5,
+ 'Namespace should be terminated with "// namespace"')
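+
+  # Illustrative sketch (editorial note, not part of upstream cpplint): for a
+  # named namespace spanning 10+ lines, a terminator such as
+  #   }  // namespace mynamespace
+  # is accepted, while a bare "}" or a comment that does not mention the
+  # namespace name triggers the readability/namespace warning above.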
+
+
+class _PreprocessorInfo(object):
+ """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+ def __init__(self, stack_before_if):
+ # The entire nesting stack before #if
+ self.stack_before_if = stack_before_if
+
+ # The entire nesting stack up to #else
+ self.stack_before_else = []
+
+ # Whether we have already seen #else or #elif
+ self.seen_else = False
+
+
+class _NestingState(object):
+ """Holds states related to parsing braces."""
+
+ def __init__(self):
+ # Stack for tracking all braces. An object is pushed whenever we
+ # see a "{", and popped when we see a "}". Only 3 types of
+ # objects are possible:
+ # - _ClassInfo: a class or struct.
+ # - _NamespaceInfo: a namespace.
+ # - _BlockInfo: some other type of block.
+ self.stack = []
+
+ # Stack of _PreprocessorInfo objects.
+ self.pp_stack = []
+
+ def SeenOpenBrace(self):
+ """Check if we have seen the opening brace for the innermost block.
+
+ Returns:
+ True if we have seen the opening brace, False if the innermost
+ block is still expecting an opening brace.
+ """
+ return (not self.stack) or self.stack[-1].seen_open_brace
+
+ def InNamespaceBody(self):
+ """Check if we are currently one level inside a namespace body.
+
+ Returns:
+ True if top of the stack is a namespace block, False otherwise.
+ """
+ return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
+
+ def UpdatePreprocessor(self, line):
+ """Update preprocessor stack.
+
+    We need to handle preprocessor directives due to classes like this:
+ #ifdef SWIG
+ struct ResultDetailsPageElementExtensionPoint {
+ #else
+ struct ResultDetailsPageElementExtensionPoint : public Extension {
+ #endif
+
+ We make the following assumptions (good enough for most files):
+ - Preprocessor condition evaluates to true from #if up to first
+ #else/#elif/#endif.
+
+ - Preprocessor condition evaluates to false from #else/#elif up
+ to #endif. We still perform lint checks on these lines, but
+ these do not affect nesting stack.
+
+ Args:
+ line: current line to check.
+ """
+ if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
+ # Beginning of #if block, save the nesting stack here. The saved
+ # stack will allow us to restore the parsing state in the #else case.
+ self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
+ elif Match(r'^\s*#\s*(else|elif)\b', line):
+ # Beginning of #else block
+ if self.pp_stack:
+ if not self.pp_stack[-1].seen_else:
+ # This is the first #else or #elif block. Remember the
+ # whole nesting stack up to this point. This is what we
+ # keep after the #endif.
+ self.pp_stack[-1].seen_else = True
+ self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack)
+
+ # Restore the stack to how it was before the #if
+ self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
+ else:
+ # TODO(unknown): unexpected #else, issue warning?
+ pass
+ elif Match(r'^\s*#\s*endif\b', line):
+ # End of #if or #else blocks.
+ if self.pp_stack:
+ # If we saw an #else, we will need to restore the nesting
+ # stack to its former state before the #else, otherwise we
+ # will just continue from where we left off.
+ if self.pp_stack[-1].seen_else:
+ # Here we can just use a shallow copy since we are the last
+ # reference to it.
+ self.stack = self.pp_stack[-1].stack_before_else
+ # Drop the corresponding #if
+ self.pp_stack.pop()
+ else:
+ # TODO(unknown): unexpected #endif, issue warning?
+ pass
+
+ def Update(self, filename, clean_lines, linenum, error):
+ """Update nesting state with current line.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+
+ # Update pp_stack first
+ self.UpdatePreprocessor(line)
+
+ # Count parentheses. This is to avoid adding struct arguments to
+ # the nesting stack.
+ if self.stack:
+ inner_block = self.stack[-1]
+ depth_change = line.count('(') - line.count(')')
+ inner_block.open_parentheses += depth_change
+
+ # Also check if we are starting or ending an inline assembly block.
+ if inner_block.inline_asm in (_NO_ASM, _END_ASM):
+ if (depth_change != 0 and
+ inner_block.open_parentheses == 1 and
+ _MATCH_ASM.match(line)):
+ # Enter assembly block
+ inner_block.inline_asm = _INSIDE_ASM
+ else:
+ # Not entering assembly block. If previous line was _END_ASM,
+ # we will now shift to _NO_ASM state.
+ inner_block.inline_asm = _NO_ASM
+ elif (inner_block.inline_asm == _INSIDE_ASM and
+ inner_block.open_parentheses == 0):
+ # Exit assembly block
+ inner_block.inline_asm = _END_ASM
+
+ # Consume namespace declaration at the beginning of the line. Do
+ # this in a loop so that we catch same line declarations like this:
+ # namespace proto2 { namespace bridge { class MessageSet; } }
+ while True:
+ # Match start of namespace. The "\b\s*" below catches namespace
+      # declarations even if they aren't followed by whitespace, so that
+      # we don't confuse our namespace checker. The
+ # missing spaces will be flagged by CheckSpacing.
+ namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+ if not namespace_decl_match:
+ break
+
+ new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+ self.stack.append(new_namespace)
+
+ line = namespace_decl_match.group(2)
+ if line.find('{') != -1:
+ new_namespace.seen_open_brace = True
+ line = line[line.find('{') + 1:]
+
+ # Look for a class declaration in whatever is left of the line
+ # after parsing namespaces. The regexp accounts for decorated classes
+ # such as in:
+ # class LOCKABLE API Object {
+ # };
+ #
+ # Templates with class arguments may confuse the parser, for example:
+ # template <class T
+ # class Comparator = less<T>,
+ # class Vector = vector<T> >
+ # class HeapQueue {
+ #
+ # Because this parser has no nesting state about templates, by the
+ # time it saw "class Comparator", it may think that it's a new class.
+ # Nested templates have a similar problem:
+ # template <
+ # typename ExportedType,
+ # typename TupleType,
+ # template <typename, typename> class ImplTemplate>
+ #
+ # To avoid these cases, we ignore classes that are followed by '=' or '>'
+ class_decl_match = Match(
+ r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+ r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+ r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+ if (class_decl_match and
+ (not self.stack or self.stack[-1].open_parentheses == 0)):
+ self.stack.append(_ClassInfo(
+ class_decl_match.group(4), class_decl_match.group(2),
+ clean_lines, linenum))
+ line = class_decl_match.group(5)
+
+ # If we have not yet seen the opening brace for the innermost block,
+ # run checks here.
+ if not self.SeenOpenBrace():
+ self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+ # Update access control if we are inside a class/struct
+ if self.stack and isinstance(self.stack[-1], _ClassInfo):
+ classinfo = self.stack[-1]
+ access_match = Match(
+ r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
+ r':(?:[^:]|$)',
+ line)
+ if access_match:
+ classinfo.access = access_match.group(2)
+
+ # Check that access keywords are indented +1 space. Skip this
+ # check if the keywords are not preceded by whitespaces.
+ indent = access_match.group(1)
+ if (len(indent) != classinfo.class_indent + 1 and
+ Match(r'^\s*$', indent)):
+ if classinfo.is_struct:
+ parent = 'struct ' + classinfo.name
+ else:
+ parent = 'class ' + classinfo.name
+ slots = ''
+ if access_match.group(3):
+ slots = access_match.group(3)
+ error(filename, linenum, 'whitespace/indent', 3,
+ '%s%s: should be indented +1 space inside %s' % (
+ access_match.group(2), slots, parent))
+
+ # Consume braces or semicolons from what's left of the line
+ while True:
+ # Match first brace, semicolon, or closed parenthesis.
+ matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+ if not matched:
+ break
+
+ token = matched.group(1)
+ if token == '{':
+        # If namespace or class hasn't seen an opening brace yet, mark
+ # namespace/class head as complete. Push a new block onto the
+ # stack otherwise.
+ if not self.SeenOpenBrace():
+ self.stack[-1].seen_open_brace = True
+ else:
+ self.stack.append(_BlockInfo(True))
+ if _MATCH_ASM.match(line):
+ self.stack[-1].inline_asm = _BLOCK_ASM
+ elif token == ';' or token == ')':
+ # If we haven't seen an opening brace yet, but we already saw
+ # a semicolon, this is probably a forward declaration. Pop
+ # the stack for these.
+ #
+ # Similarly, if we haven't seen an opening brace yet, but we
+ # already saw a closing parenthesis, then these are probably
+ # function arguments with extra "class" or "struct" keywords.
+        # Also pop the stack for these.
+ if not self.SeenOpenBrace():
+ self.stack.pop()
+ else: # token == '}'
+ # Perform end of block checks and pop the stack.
+ if self.stack:
+ self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+ self.stack.pop()
+ line = matched.group(2)
+
+ def InnermostClass(self):
+ """Get class info on the top of the stack.
+
+ Returns:
+ A _ClassInfo object if we are inside a class, or None otherwise.
+ """
+ for i in range(len(self.stack), 0, -1):
+ classinfo = self.stack[i - 1]
+ if isinstance(classinfo, _ClassInfo):
+ return classinfo
+ return None
+
+ def CheckCompletedBlocks(self, filename, error):
+ """Checks that all classes and namespaces have been completely parsed.
+
+ Call this when all lines in a file have been processed.
+ Args:
+ filename: The name of the current file.
+ error: The function to call with any errors found.
+ """
+ # Note: This test can result in false positives if #ifdef constructs
+ # get in the way of brace matching. See the testBuildClass test in
+ # cpplint_unittest.py for an example of this.
+ for obj in self.stack:
+ if isinstance(obj, _ClassInfo):
+ error(filename, obj.starting_linenum, 'build/class', 5,
+ 'Failed to find complete declaration of class %s' %
+ obj.name)
+ elif isinstance(obj, _NamespaceInfo):
+ error(filename, obj.starting_linenum, 'build/namespaces', 5,
+ 'Failed to find complete declaration of namespace %s' %
+ obj.name)
+
+
+def CheckForNonStandardConstructs(filename, clean_lines, linenum,
+ nesting_state, error):
+ r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
+
+ Complain about several constructs which gcc-2 accepts, but which are
+ not standard C++. Warning about these in lint is one way to ease the
+ transition to new compilers.
+ - put storage class first (e.g. "static const" instead of "const static").
+ - "%lld" instead of %qd" in printf-type functions.
+ - "%1$d" is non-standard in printf-type functions.
+ - "\%" is an undefined character escape sequence.
+ - text after #endif is not allowed.
+ - invalid inner-style forward declaration.
+ - >? and <? operators, and their >?= and <?= cousins.
+
+ Additionally, check for constructor/destructor style violations and reference
+ members, as it is very convenient to do so while checking for
+ gcc-2 compliance.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ nesting_state: A _NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: A callable to which errors are reported, which takes 4 arguments:
+ filename, line number, error level, and message
+ """
+
+ # Remove comments from the line, but leave in strings for now.
+ line = clean_lines.lines[linenum]
+
+ if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
+ error(filename, linenum, 'runtime/printf_format', 3,
+ '%q in format strings is deprecated. Use %ll instead.')
+
+ if Search(r'printf\s*\(.*".*%\d+\$', line):
+ error(filename, linenum, 'runtime/printf_format', 2,
+ '%N$ formats are unconventional. Try rewriting to avoid them.')
+
+ # Remove escaped backslashes before looking for undefined escapes.
+ line = line.replace('\\\\', '')
+
+ if Search(r'("|\').*\\(%|\[|\(|{)', line):
+ error(filename, linenum, 'build/printf_format', 3,
+ '%, [, (, and { are undefined character escapes. Unescape them.')
+
+ # For the rest, work with both comments and strings removed.
+ line = clean_lines.elided[linenum]
+
+ if Search(r'\b(const|volatile|void|char|short|int|long'
+ r'|float|double|signed|unsigned'
+ r'|schar|u?int8|u?int16|u?int32|u?int64)'
+ r'\s+(register|static|extern|typedef)\b',
+ line):
+ error(filename, linenum, 'build/storage_class', 5,
+ 'Storage class (static, extern, typedef, etc) should be first.')
+
+ if Match(r'\s*#\s*endif\s*[^/\s]+', line):
+ error(filename, linenum, 'build/endif_comment', 5,
+ 'Uncommented text after #endif is non-standard. Use a comment.')
+
+ if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
+ error(filename, linenum, 'build/forward_decl', 5,
+ 'Inner-style forward declarations are invalid. Remove this line.')
+
+ if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
+ line):
+ error(filename, linenum, 'build/deprecated', 3,
+ '>? and <? (max and min) operators are non-standard and deprecated.')
+
+ if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
+ # TODO(unknown): Could it be expanded safely to arbitrary references,
+ # without triggering too many false positives? The first
+ # attempt triggered 5 warnings for mostly benign code in the regtest, hence
+ # the restriction.
+ # Here's the original regexp, for the reference:
+ # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
+ # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
+ error(filename, linenum, 'runtime/member_string_references', 2,
+ 'const string& members are dangerous. It is much better to use '
+ 'alternatives, such as pointers or simple constants.')
+
+ # Everything else in this function operates on class declarations.
+ # Return early if the top of the nesting stack is not a class, or if
+ # the class head is not completed yet.
+ classinfo = nesting_state.InnermostClass()
+ if not classinfo or not classinfo.seen_open_brace:
+ return
+
+ # The class may have been declared with namespace or classname qualifiers.
+ # The constructor and destructor will not have those qualifiers.
+ base_classname = classinfo.name.split('::')[-1]
+
+ # Look for single-argument constructors that aren't marked explicit.
+ # Technically a valid construct, but against style.
+ args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
+ % re.escape(base_classname),
+ line)
+ if (args and
+ args.group(1) != 'void' and
+ not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
+ % re.escape(base_classname), args.group(1).strip())):
+ error(filename, linenum, 'runtime/explicit', 5,
+ 'Single-argument constructors should be marked explicit.')
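+
+# Illustrative sketch (editorial note, not part of upstream cpplint): inside a
+# class Foo whose opening brace has been seen, an indented constructor line
+# like
+#   Foo(int x) { ... }
+# would be expected to trigger runtime/explicit, while
+#   explicit Foo(int x);
+# and the copy constructor Foo(const Foo& other) are left alone.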
+
+
+def CheckSpacingForFunctionCall(filename, line, linenum, error):
+ """Checks for the correctness of various spacing around function calls.
+
+ Args:
+ filename: The name of the current file.
+ line: The text of the line to check.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+
+ # Since function calls often occur inside if/for/while/switch
+ # expressions - which have their own, more liberal conventions - we
+ # first see if we should be looking inside such an expression for a
+ # function call, to which we can apply more strict standards.
+ fncall = line # if there's no control flow construct, look at whole line
+ for pattern in (r'\bif\s*\((.*)\)\s*{',
+ r'\bfor\s*\((.*)\)\s*{',
+ r'\bwhile\s*\((.*)\)\s*[{;]',
+ r'\bswitch\s*\((.*)\)\s*{'):
+ match = Search(pattern, line)
+ if match:
+ fncall = match.group(1) # look inside the parens for function calls
+ break
+
+ # Except in if/for/while/switch, there should never be space
+ # immediately inside parens (eg "f( 3, 4 )"). We make an exception
+ # for nested parens ( (a+b) + c ). Likewise, there should never be
+ # a space before a ( when it's a function argument. I assume it's a
+ # function argument when the char before the whitespace is legal in
+ # a function name (alnum + _) and we're not starting a macro. Also ignore
+  # pointers and references to arrays and functions because they're too tricky:
+ # we use a very simple way to recognize these:
+ # " (something)(maybe-something)" or
+ # " (something)(maybe-something," or
+ # " (something)[something]"
+ # Note that we assume the contents of [] to be short enough that
+ # they'll never need to wrap.
+ if ( # Ignore control structures.
+ not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b',
+ fncall) and
+ # Ignore pointers/references to functions.
+ not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and
+ # Ignore pointers/references to arrays.
+ not Search(r' \([^)]+\)\[[^\]]+\]', fncall)):
+ if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call
+ error(filename, linenum, 'whitespace/parens', 4,
+ 'Extra space after ( in function call')
+ elif Search(r'\(\s+(?!(\s*\\)|\()', fncall):
+ error(filename, linenum, 'whitespace/parens', 2,
+ 'Extra space after (')
+ if (Search(r'\w\s+\(', fncall) and
+ not Search(r'#\s*define|typedef', fncall) and
+ not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
+ error(filename, linenum, 'whitespace/parens', 4,
+ 'Extra space before ( in function call')
+ # If the ) is followed only by a newline or a { + newline, assume it's
+ # part of a control statement (if/while/etc), and don't complain
+ if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
+ # If the closing parenthesis is preceded by only whitespaces,
+ # try to give a more descriptive error message.
+ if Search(r'^\s+\)', fncall):
+ error(filename, linenum, 'whitespace/parens', 2,
+ 'Closing ) should be moved to the previous line')
+ else:
+ error(filename, linenum, 'whitespace/parens', 2,
+ 'Extra space before )')
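+
+# Illustrative sketch (editorial note, not part of upstream cpplint): on an
+# ordinary statement line, a call written as
+#   f( 3, 4 );
+# would be expected to draw "Extra space after ( in function call" and
+# "Extra space before )", while
+#   f (3, 4);
+# draws "Extra space before ( in function call". Lines containing control-flow
+# keywords (if/for/while/switch, return, sizeof, ...) are exempted from these
+# particular checks.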
+
+
+def IsBlankLine(line):
+ """Returns true if the given line is blank.
+
+ We consider a line to be blank if the line is empty or consists of
+ only white spaces.
+
+ Args:
+ line: A line of a string.
+
+ Returns:
+ True, if the given line is blank.
+ """
+ return not line or line.isspace()
+
+
+def CheckForFunctionLengths(filename, clean_lines, linenum,
+ function_state, error):
+ """Reports for long function bodies.
+
+ For an overview why this is done, see:
+ http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
+
+ Uses a simplistic algorithm assuming other style guidelines
+ (especially spacing) are followed.
+ Only checks unindented functions, so class members are unchecked.
+ Trivial bodies are unchecked, so constructors with huge initializer lists
+ may be missed.
+ Blank/comment lines are not counted so as to avoid encouraging the removal
+ of vertical space and comments just to get through a lint check.
+ NOLINT *on the last line of a function* disables this check.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ function_state: Current function name and lines in body so far.
+ error: The function to call with any errors found.
+ """
+ lines = clean_lines.lines
+ line = lines[linenum]
+ raw = clean_lines.raw_lines
+ raw_line = raw[linenum]
+ joined_line = ''
+
+ starting_func = False
+ regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ...
+ match_result = Match(regexp, line)
+ if match_result:
+ # If the name is all caps and underscores, figure it's a macro and
+ # ignore it, unless it's TEST or TEST_F.
+ function_name = match_result.group(1).split()[-1]
+ if function_name == 'TEST' or function_name == 'TEST_F' or (
+ not Match(r'[A-Z_]+$', function_name)):
+ starting_func = True
+
+ if starting_func:
+ body_found = False
+ for start_linenum in xrange(linenum, clean_lines.NumLines()):
+ start_line = lines[start_linenum]
+ joined_line += ' ' + start_line.lstrip()
+ if Search(r'(;|})', start_line): # Declarations and trivial functions
+ body_found = True
+ break # ... ignore
+ elif Search(r'{', start_line):
+ body_found = True
+ function = Search(r'((\w|:)*)\(', line).group(1)
+ if Match(r'TEST', function): # Handle TEST... macros
+ parameter_regexp = Search(r'(\(.*\))', joined_line)
+ if parameter_regexp: # Ignore bad syntax
+ function += parameter_regexp.group(1)
+ else:
+ function += '()'
+ function_state.Begin(function)
+ break
+ if not body_found:
+ # No body for the function (or evidence of a non-function) was found.
+ error(filename, linenum, 'readability/fn_size', 5,
+ 'Lint failed to find start of function body.')
+ elif Match(r'^\}\s*$', line): # function end
+ function_state.Check(error, filename, linenum)
+ function_state.End()
+ elif not Match(r'^\s*$', line):
+ function_state.Count() # Count non-blank/non-comment lines.
+
+
+_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
+
+
+def CheckComment(comment, filename, linenum, error):
+ """Checks for common mistakes in TODO comments.
+
+ Args:
+ comment: The text of the comment from the line in question.
+ filename: The name of the current file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ match = _RE_PATTERN_TODO.match(comment)
+ if match:
+ # One whitespace is correct; zero whitespace is handled elsewhere.
+ leading_whitespace = match.group(1)
+ if len(leading_whitespace) > 1:
+ error(filename, linenum, 'whitespace/todo', 2,
+ 'Too many spaces before TODO')
+
+ username = match.group(2)
+ if not username:
+ error(filename, linenum, 'readability/todo', 2,
+ 'Missing username in TODO; it should look like '
+ '"// TODO(my_username): Stuff."')
+
+ middle_whitespace = match.group(3)
+ # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
+ if middle_whitespace != ' ' and middle_whitespace != '':
+ error(filename, linenum, 'whitespace/todo', 2,
+ 'TODO(my_username) should be followed by a space')
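+
+# Illustrative sketch (editorial note, not part of upstream cpplint): with the
+# pattern above,
+#   // TODO(alice): Fix this.
+# passes, while
+#   // TODO: Fix this.
+# is missing the username and
+#   //   TODO(alice): Fix this.
+# has too many spaces before TODO.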
+
+def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
+ """Checks for improper use of DISALLOW* macros.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ nesting_state: A _NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum] # get rid of comments and strings
+
+ matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
+ r'DISALLOW_EVIL_CONSTRUCTORS|'
+ r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
+ if not matched:
+ return
+ if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
+ if nesting_state.stack[-1].access != 'private':
+ error(filename, linenum, 'readability/constructors', 3,
+ '%s must be in the private: section' % matched.group(1))
+
+ else:
+ # Found DISALLOW* macro outside a class declaration, or perhaps it
+ # was used inside a function when it should have been part of the
+ # class declaration. We could issue a warning here, but it
+ # probably resulted in a compiler error already.
+ pass
+
+
+def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
+ """Find the corresponding > to close a template.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: Current line number.
+ init_suffix: Remainder of the current line after the initial <.
+
+ Returns:
+ True if a matching bracket exists.
+ """
+ line = init_suffix
+ nesting_stack = ['<']
+ while True:
+ # Find the next operator that can tell us whether < is used as an
+ # opening bracket or as a less-than operator. We only want to
+ # warn on the latter case.
+ #
+ # We could also check all other operators and terminate the search
+ # early, e.g. if we got something like this "a<b+c", the "<" is
+ # most likely a less-than operator, but then we will get false
+ # positives for default arguments and other template expressions.
+ match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+ if match:
+ # Found an operator, update nesting stack
+ operator = match.group(1)
+ line = match.group(2)
+
+ if nesting_stack[-1] == '<':
+ # Expecting closing angle bracket
+ if operator in ('<', '(', '['):
+ nesting_stack.append(operator)
+ elif operator == '>':
+ nesting_stack.pop()
+ if not nesting_stack:
+ # Found matching angle bracket
+ return True
+ elif operator == ',':
+          # Got a comma after a bracket; this is most likely a template
+ # argument. We have not seen a closing angle bracket yet, but
+ # it's probably a few lines later if we look for it, so just
+ # return early here.
+ return True
+ else:
+ # Got some other operator.
+ return False
+
+ else:
+ # Expecting closing parenthesis or closing bracket
+ if operator in ('<', '(', '['):
+ nesting_stack.append(operator)
+ elif operator in (')', ']'):
+ # We don't bother checking for matching () or []. If we got
+ # something like (] or [), it would have been a syntax error.
+ nesting_stack.pop()
+
+ else:
+ # Scan the next line
+ linenum += 1
+ if linenum >= len(clean_lines.elided):
+ break
+ line = clean_lines.elided[linenum]
+
+ # Exhausted all remaining lines and still no matching angle bracket.
+ # Most likely the input was incomplete, otherwise we should have
+ # seen a semicolon and returned early.
+ return True
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+ """Find the corresponding < that started a template.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: Current line number.
+ init_prefix: Part of the current line before the initial >.
+
+ Returns:
+ True if a matching bracket exists.
+ """
+ line = init_prefix
+ nesting_stack = ['>']
+ while True:
+ # Find the previous operator
+ match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+ if match:
+ # Found an operator, update nesting stack
+ operator = match.group(2)
+ line = match.group(1)
+
+ if nesting_stack[-1] == '>':
+ # Expecting opening angle bracket
+ if operator in ('>', ')', ']'):
+ nesting_stack.append(operator)
+ elif operator == '<':
+ nesting_stack.pop()
+ if not nesting_stack:
+ # Found matching angle bracket
+ return True
+ elif operator == ',':
+ # Got a comma before a bracket, this is most likely a
+ # template argument. The opening angle bracket is probably
+ # there if we look for it, so just return early here.
+ return True
+ else:
+ # Got some other operator.
+ return False
+
+ else:
+ # Expecting opening parenthesis or opening bracket
+ if operator in ('>', ')', ']'):
+ nesting_stack.append(operator)
+ elif operator in ('(', '['):
+ nesting_stack.pop()
+
+ else:
+ # Scan the previous line
+ linenum -= 1
+ if linenum < 0:
+ break
+ line = clean_lines.elided[linenum]
+
+ # Exhausted all earlier lines and still no matching angle bracket.
+ return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
+ """Checks for the correctness of various spacing issues in the code.
+
+ Things we check for: spaces around operators, spaces after
+ if/for/while/switch, no spaces around parens in function calls, two
+ spaces between code and comment, don't start a block with a blank
+ line, don't end a function with a blank line, don't add a blank line
+ after public/protected/private, don't have too many blank lines in a row.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ nesting_state: A _NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: The function to call with any errors found.
+ """
+
+ # Don't use "elided" lines here, otherwise we can't check commented lines.
+ # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings.
+ raw = clean_lines.lines_without_raw_strings
+ line = raw[linenum]
+
+ # Before nixing comments, check if the line is blank for no good
+ # reason. This includes the first line after a block is opened, and
+  # blank lines at the end of a function (i.e., right before a line like '}').
+ #
+ # Skip all the blank line checks if we are immediately inside a
+ # namespace body. In other words, don't issue blank line warnings
+ # for this block:
+ # namespace {
+ #
+ # }
+ #
+ # A warning about missing end of namespace comments will be issued instead.
+ if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+ elided = clean_lines.elided
+ prev_line = elided[linenum - 1]
+ prevbrace = prev_line.rfind('{')
+ # TODO(unknown): Don't complain if line before blank line, and line after,
+ # both start with alnums and are indented the same amount.
+ # This ignores whitespace at the start of a namespace block
+ # because those are not usually indented.
+ if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
+ # OK, we have a blank line at the start of a code block. Before we
+ # complain, we check if it is an exception to the rule: The previous
+ # non-empty line has the parameters of a function header that are indented
+      # 4 spaces (because they did not fit in an 80 column line when placed on
+ # the same line as the function name). We also check for the case where
+ # the previous line is indented 6 spaces, which may happen when the
+      # initializers of a constructor do not fit into an 80 column line.
+ exception = False
+ if Match(r' {6}\w', prev_line): # Initializer list?
+ # We are looking for the opening column of initializer list, which
+ # should be indented 4 spaces to cause 6 space indentation afterwards.
+ search_position = linenum-2
+ while (search_position >= 0
+ and Match(r' {6}\w', elided[search_position])):
+ search_position -= 1
+ exception = (search_position >= 0
+ and elided[search_position][:5] == ' :')
+ else:
+ # Search for the function arguments or an initializer list. We use a
+ # simple heuristic here: If the line is indented 4 spaces; and we have a
+ # closing paren, without the opening paren, followed by an opening brace
+ # or colon (for initializer lists) we assume that it is the last line of
+ # a function header. If we have a colon indented 4 spaces, it is an
+ # initializer list.
+ exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)',
+ prev_line)
+ or Match(r' {4}:', prev_line))
+
+ if not exception:
+ error(filename, linenum, 'whitespace/blank_line', 2,
+ 'Redundant blank line at the start of a code block '
+ 'should be deleted.')
+ # Ignore blank lines at the end of a block in a long if-else
+ # chain, like this:
+ # if (condition1) {
+ # // Something followed by a blank line
+ #
+ # } else if (condition2) {
+ # // Something else
+ # }
+ if linenum + 1 < clean_lines.NumLines():
+ next_line = raw[linenum + 1]
+ if (next_line
+ and Match(r'\s*}', next_line)
+ and next_line.find('} else ') == -1):
+ error(filename, linenum, 'whitespace/blank_line', 3,
+ 'Redundant blank line at the end of a code block '
+ 'should be deleted.')
+
+ matched = Match(r'\s*(public|protected|private):', prev_line)
+ if matched:
+ error(filename, linenum, 'whitespace/blank_line', 3,
+ 'Do not leave a blank line after "%s:"' % matched.group(1))
+
+ # Next, we complain if there's a comment too near the text
+ commentpos = line.find('//')
+ if commentpos != -1:
+ # Check if the // may be in quotes. If so, ignore it
+ # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
+ if (line.count('"', 0, commentpos) -
+ line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes
+ # Allow one space for new scopes, two spaces otherwise:
+ if (not Match(r'^\s*{ //', line) and
+ ((commentpos >= 1 and
+ line[commentpos-1] not in string.whitespace) or
+ (commentpos >= 2 and
+ line[commentpos-2] not in string.whitespace))):
+ error(filename, linenum, 'whitespace/comments', 2,
+ 'At least two spaces is best between code and comments')
+ # There should always be a space between the // and the comment
+ commentend = commentpos + 2
+ if commentend < len(line) and not line[commentend] == ' ':
+ # but some lines are exceptions -- e.g. if they're big
+ # comment delimiters like:
+ # //----------------------------------------------------------
+ # or are an empty C++ style Doxygen comment, like:
+ # ///
+ # or C++ style Doxygen comments placed after the variable:
+ # ///< Header comment
+ # //!< Header comment
+ # or they begin with multiple slashes followed by a space:
+ # //////// Header comment
+ match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
+ Search(r'^/$', line[commentend:]) or
+ Search(r'^!< ', line[commentend:]) or
+ Search(r'^/< ', line[commentend:]) or
+ Search(r'^/+ ', line[commentend:]))
+ if not match:
+ error(filename, linenum, 'whitespace/comments', 4,
+ 'Should have a space between // and comment')
+ CheckComment(line[commentpos:], filename, linenum, error)
+
+ line = clean_lines.elided[linenum] # get rid of comments and strings
+
+ # Don't try to do spacing checks for operator methods
+ line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line)
+
+ # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
+ # Otherwise not. Note we only check for non-spaces on *both* sides;
+ # sometimes people put non-spaces on one side when aligning ='s among
+ # many lines (not that this is behavior that I approve of...)
+ if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+ error(filename, linenum, 'whitespace/operators', 4,
+ 'Missing spaces around =')
+
+ # It's ok not to have spaces around binary operators like + - * /, but if
+ # there's too little whitespace, we get concerned. It's hard to tell,
+ # though, so we punt on this one for now. TODO.
+
+ # You should always have whitespace around binary operators.
+ #
+ # Check <= and >= first to avoid false positives with < and >, then
+ # check non-include lines for spacing around < and >.
+ match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+ if match:
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around %s' % match.group(1))
+ # We allow no-spaces around << when used like this: 10<<20, but
+ # not otherwise (particularly, not when used as streams)
+ # Also ignore using ns::operator<<;
+ match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+ if (match and
+ not (match.group(1).isdigit() and match.group(2).isdigit()) and
+ not (match.group(1) == 'operator' and match.group(2) == ';')):
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around <<')
+ elif not Match(r'#.*include', line):
+ # Avoid false positives on ->
+ reduced_line = line.replace('->', '')
+
+ # Look for < that is not surrounded by spaces. This is only
+ # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+ # space. This is done to avoid some false positives with shifts.
+ match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+ if (match and
+ not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around <')
+
+ # Look for > that is not surrounded by spaces. Similar to the
+ # above, we only trigger if both sides are missing spaces to avoid
+ # false positives with shifts.
+ match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+ if (match and
+ not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+ match.group(1))):
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around >')
+
+ # We allow no-spaces around >> for almost anything. This is because
+ # C++11 allows ">>" to close nested templates, which accounts for
+ # most cases when ">>" is not followed by a space.
+ #
+ # We still warn on ">>" followed by alpha character, because that is
+ # likely due to ">>" being used for right shifts, e.g.:
+ # value >> alpha
+ #
+ # When ">>" is used to close templates, the alphanumeric letter that
+ # follows would be part of an identifier, and there should still be
+ # a space separating the template type and the identifier.
+ # type<type<type>> alpha
+ match = Search(r'>>[a-zA-Z_]', line)
+ if match:
+ error(filename, linenum, 'whitespace/operators', 3,
+ 'Missing spaces around >>')
+
+ # There shouldn't be space around unary operators
+ match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
+ if match:
+ error(filename, linenum, 'whitespace/operators', 4,
+ 'Extra space for operator %s' % match.group(1))
+
+ # A pet peeve of mine: no spaces after an if, while, switch, or for
+ match = Search(r' (if\(|for\(|while\(|switch\()', line)
+ if match:
+ error(filename, linenum, 'whitespace/parens', 5,
+ 'Missing space before ( in %s' % match.group(1))
+
+ # For if/for/while/switch, the left and right parens should be
+ # consistent about how many spaces are inside the parens, and
+ # there should either be zero or one spaces inside the parens.
+ # We don't want: "if ( foo)" or "if ( foo )".
+ # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed.
+ match = Search(r'\b(if|for|while|switch)\s*'
+ r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$',
+ line)
+ if match:
+ if len(match.group(2)) != len(match.group(4)):
+ if not (match.group(3) == ';' and
+ len(match.group(2)) == 1 + len(match.group(4)) or
+ not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)):
+ error(filename, linenum, 'whitespace/parens', 5,
+ 'Mismatching spaces inside () in %s' % match.group(1))
+ if len(match.group(2)) not in [0, 1]:
+ error(filename, linenum, 'whitespace/parens', 5,
+ 'Should have zero or one spaces inside ( and ) in %s' %
+ match.group(1))
+
+ # You should always have a space after a comma (either as fn arg or operator)
+ #
+ # This does not apply when the non-space character following the
+ # comma is another comma, since the only time when that happens is
+ # for empty macro arguments.
+ #
+ # We run this check in two passes: first pass on elided lines to
+ # verify that lines contain missing whitespaces, second pass on raw
+ # lines to confirm that those missing whitespaces are not due to
+ # elided comments.
+ if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
+ error(filename, linenum, 'whitespace/comma', 3,
+ 'Missing space after ,')
+
+ # You should always have a space after a semicolon
+ # except for few corner cases
+  # TODO(unknown): clarify if 'if (1) { return 1;}' requires one more
+ # space after ;
+ if Search(r';[^\s};\\)/]', line):
+ error(filename, linenum, 'whitespace/semicolon', 3,
+ 'Missing space after ;')
+
+ # Next we will look for issues with function calls.
+ CheckSpacingForFunctionCall(filename, line, linenum, error)
+
+ # Except after an opening paren, or after another opening brace (in case of
+ # an initializer list, for instance), you should have spaces before your
+ # braces. And since you should never have braces at the beginning of a line,
+ # this is an easy test.
+ match = Match(r'^(.*[^ ({]){', line)
+ if match:
+ # Try a bit harder to check for brace initialization. This
+ # happens in one of the following forms:
+ # Constructor() : initializer_list_{} { ... }
+ # Constructor{}.MemberFunction()
+ # Type variable{};
+ # FunctionCall(type{}, ...);
+ # LastArgument(..., type{});
+ # LOG(INFO) << type{} << " ...";
+ # map_of_type[{...}] = ...;
+ #
+ # We check for the character following the closing brace, and
+ # silence the warning if it's one of those listed above, i.e.
+ # "{.;,)<]".
+ #
+ # To account for nested initializer list, we allow any number of
+ # closing braces up to "{;,)<". We can't simply silence the
+ # warning on first sight of closing brace, because that would
+ # cause false negatives for things that are not initializer lists.
+ # Silence this: But not this:
+ # Outer{ if (...) {
+ # Inner{...} if (...){ // Missing space before {
+ # }; }
+ #
+ # There is a false negative with this approach if people inserted
+ # spurious semicolons, e.g. "if (cond){};", but we will catch the
+ # spurious semicolon with a separate check.
+ (endline, endlinenum, endpos) = CloseExpression(
+ clean_lines, linenum, len(match.group(1)))
+ trailing_text = ''
+ if endpos > -1:
+ trailing_text = endline[endpos:]
+ for offset in xrange(endlinenum + 1,
+ min(endlinenum + 3, clean_lines.NumLines() - 1)):
+ trailing_text += clean_lines.elided[offset]
+ if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+ error(filename, linenum, 'whitespace/braces', 5,
+ 'Missing space before {')
+
+ # Make sure '} else {' has spaces.
+ if Search(r'}else', line):
+ error(filename, linenum, 'whitespace/braces', 5,
+ 'Missing space before else')
+
+ # You shouldn't have spaces before your brackets, except maybe after
+ # 'delete []' or 'new char * []'.
+ if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
+ error(filename, linenum, 'whitespace/braces', 5,
+ 'Extra space before [')
+
+ # You shouldn't have a space before a semicolon at the end of the line.
+ # There's a special case for "for" since the style guide allows space before
+ # the semicolon there.
+ if Search(r':\s*;\s*$', line):
+ error(filename, linenum, 'whitespace/semicolon', 5,
+ 'Semicolon defining empty statement. Use {} instead.')
+ elif Search(r'^\s*;\s*$', line):
+ error(filename, linenum, 'whitespace/semicolon', 5,
+ 'Line contains only semicolon. If this should be an empty statement, '
+ 'use {} instead.')
+ elif (Search(r'\s+;\s*$', line) and
+ not Search(r'\bfor\b', line)):
+ error(filename, linenum, 'whitespace/semicolon', 5,
+ 'Extra space before last semicolon. If this should be an empty '
+ 'statement, use {} instead.')
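+ # For example (illustrative): "return 0 ;" gets the extra-space warning, a
+ # line containing only ";" gets the empty-statement warning, and
+ # "for (;;) ;" is left alone because of the for-loop exception.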
+
+ # In range-based for, we want spaces before and after the colon, but
+ # not around "::" tokens that might appear.
+ if (Search(r'for *\(.*[^:]:[^: ]', line) or
+ Search(r'for *\(.*[^: ]:[^:]', line)):
+ error(filename, linenum, 'whitespace/forcolon', 2,
+ 'Missing space around colon in range-based for loop')
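+ # For example (illustrative): "for (auto x: v)" and "for (auto x :v)" are
+ # both reported, while "for (auto x : v)" and "for (Foo::Bar b : v)" are not.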
+
+
+def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
+ """Checks for additional blank line issues related to sections.
+
+ Currently the only thing checked here is blank line before protected/private.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ class_info: A _ClassInfo object.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ # Skip checks if the class is small, where small means 25 lines or less.
+ # 25 lines seems like a good cutoff since that's the usual height of
+ # terminals, and any class that can't fit in one screen can't really
+ # be considered "small".
+ #
+ # Also skip checks if we are on the first line. This accounts for
+ # classes that look like
+ # class Foo { public: ... };
+ #
+ # If we didn't find the end of the class, last_line would be zero,
+ # and the check will be skipped by the first condition.
+ if (class_info.last_line - class_info.starting_linenum <= 24 or
+ linenum <= class_info.starting_linenum):
+ return
+
+ matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
+ if matched:
+ # Issue warning if the line before public/protected/private was
+ # not a blank line, but don't do this if the previous line contains
+ # "class" or "struct". This can happen two ways:
+ # - We are at the beginning of the class.
+ # - We are forward-declaring an inner class that is semantically
+ # private, but needed to be public for implementation reasons.
+ # Also ignore cases where the previous line ends with a backslash, as is
+ # common when defining classes inside C macros.
+ prev_line = clean_lines.lines[linenum - 1]
+ if (not IsBlankLine(prev_line) and
+ not Search(r'\b(class|struct)\b', prev_line) and
+ not Search(r'\\$', prev_line)):
+ # Try a bit harder to find the beginning of the class. This is to
+ # account for multi-line base-specifier lists, e.g.:
+ # class Derived
+ # : public Base {
+ end_class_head = class_info.starting_linenum
+ for i in range(class_info.starting_linenum, linenum):
+ if Search(r'\{\s*$', clean_lines.lines[i]):
+ end_class_head = i
+ break
+ if end_class_head < linenum - 1:
+ error(filename, linenum, 'whitespace/blank_line', 3,
+ '"%s:" should be preceded by a blank line' % matched.group(1))
+
+
+def GetPreviousNonBlankLine(clean_lines, linenum):
+ """Return the most recent non-blank line and its line number.
+
+ Args:
+ clean_lines: A CleansedLines instance containing the file contents.
+ linenum: The number of the line to check.
+
+ Returns:
+ A tuple with two elements. The first element is the contents of the last
+ non-blank line before the current line, or the empty string if this is the
+ first non-blank line. The second is the line number of that line, or -1
+ if this is the first non-blank line.
+ """
+
+ prevlinenum = linenum - 1
+ while prevlinenum >= 0:
+ prevline = clean_lines.elided[prevlinenum]
+ if not IsBlankLine(prevline): # if not a blank line...
+ return (prevline, prevlinenum)
+ prevlinenum -= 1
+ return ('', -1)
+
+
+def CheckBraces(filename, clean_lines, linenum, error):
+ """Looks for misplaced braces (e.g. at the end of line).
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+
+ line = clean_lines.elided[linenum] # get rid of comments and strings
+
+ if Match(r'\s*{\s*$', line):
+ # We allow an open brace to start a line in the case where someone is using
+ # braces in a block to explicitly create a new scope, which is commonly used
+ # to control the lifetime of stack-allocated variables. Braces are also
+ # used for brace initializers inside function calls. We don't detect this
+ # perfectly: we just don't complain if the last non-whitespace character on
+ # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
+ # previous line starts a preprocessor block.
+ prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+ if (not Search(r'[,;:}{(]\s*$', prevline) and
+ not Match(r'\s*#', prevline)):
+ error(filename, linenum, 'whitespace/braces', 4,
+ '{ should almost always be at the end of the previous line')
+
+ # An else clause should be on the same line as the preceding closing brace.
+ if Match(r'\s*else\s*', line):
+ prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+ if Match(r'\s*}\s*$', prevline):
+ error(filename, linenum, 'whitespace/newline', 4,
+ 'An else should appear on the same line as the preceding }')
+
+ # If braces come on one side of an else, they should be on both.
+ # However, we have to worry about "else if" that spans multiple lines!
+ if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
+ if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if
+ # find the ( after the if
+ pos = line.find('else if')
+ pos = line.find('(', pos)
+ if pos > 0:
+ (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
+ if endline[endpos:].find('{') == -1: # must be brace after if
+ error(filename, linenum, 'readability/braces', 5,
+ 'If an else has a brace on one side, it should have it on both')
+ else: # common case: else not followed by a multi-line if
+ error(filename, linenum, 'readability/braces', 5,
+ 'If an else has a brace on one side, it should have it on both')
+
+ # Likewise, an else should never have the else clause on the same line
+ if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
+ error(filename, linenum, 'whitespace/newline', 4,
+ 'Else clause should never be on same line as else (use 2 lines)')
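+ # For example (illustrative): "else return false;" is reported here, while
+ # "else if (...)" lines are exempt from this particular check.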
+
+ # In the same way, a do/while should never be on one line
+ if Match(r'\s*do [^\s{]', line):
+ error(filename, linenum, 'whitespace/newline', 4,
+ 'do/while clauses should not be on a single line')
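+ # For example (illustrative): "do ++i; while (i < n);" is reported, while a
+ # "do {" that opens a normal block is not.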
+
+ # Block bodies should not be followed by a semicolon. Due to C++11
+ # brace initialization, there are more places where semicolons are
+ # required than not, so we use a whitelist approach to check these
+ # rather than a blacklist. These are the places where "};" should
+ # be replaced by just "}":
+ # 1. Some flavor of block following closing parenthesis:
+ # for (;;) {};
+ # while (...) {};
+ # switch (...) {};
+ # Function(...) {};
+ # if (...) {};
+ # if (...) else if (...) {};
+ #
+ # 2. else block:
+ # if (...) else {};
+ #
+ # 3. const member function:
+ # Function(...) const {};
+ #
+ # 4. Block following some statement:
+ # x = 42;
+ # {};
+ #
+ # 5. Block at the beginning of a function:
+ # Function(...) {
+ # {};
+ # }
+ #
+ # Note that naively checking for the preceding "{" will also match
+ # braces inside multi-dimensional arrays, but this is fine since
+ # that expression will not contain semicolons.
+ #
+ # 6. Block following another block:
+ # while (true) {}
+ # {};
+ #
+ # 7. End of namespaces:
+ # namespace {};
+ #
+ # These semicolons seem far more common than other kinds of
+ # redundant semicolons, possibly due to people converting classes
+ # to namespaces. For now we do not warn for this case.
+ #
+ # Try matching case 1 first.
+ match = Match(r'^(.*\)\s*)\{', line)
+ if match:
+ # Matched closing parenthesis (case 1). Check the token before the
+ # matching opening parenthesis, and don't warn if it looks like a
+ # macro. This avoids these false positives:
+ # - macro that defines a base class
+ # - multi-line macro that defines a base class
+ # - macro that defines the whole class-head
+ #
+ # But we still issue warnings for macros that we know are safe to
+ # warn, specifically:
+ # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P
+ # - TYPED_TEST
+ # - INTERFACE_DEF
+ # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
+ #
+ # We implement a whitelist of safe macros instead of a blacklist of
+ # unsafe macros, even though the latter appears less frequently in
+ # Google code and would have been easier to implement. This is because
+ # the downside of getting the whitelist wrong is merely some extra
+ # semicolons left in place, while the downside of getting the blacklist
+ # wrong would be compile errors.
+ #
+ # In addition to macros, we also don't want to warn on compound
+ # literals.
+ closing_brace_pos = match.group(1).rfind(')')
+ opening_parenthesis = ReverseCloseExpression(
+ clean_lines, linenum, closing_brace_pos)
+ if opening_parenthesis[2] > -1:
+ line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
+ macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
+ if ((macro and
+ macro.group(1) not in (
+ 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
+ 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
+ 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
+ Search(r'\s+=\s*$', line_prefix)):
+ match = None
+
+ else:
+ # Try matching cases 2-3.
+ match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line)
+ if not match:
+ # Try matching cases 4-6. These are always matched on separate lines.
+ #
+ # Note that we can't simply concatenate the previous line to the
+ # current line and do a single match, otherwise we may output
+ # duplicate warnings for the blank line case:
+ # if (cond) {
+ # // blank line
+ # }
+ prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+ if prevline and Search(r'[;{}]\s*$', prevline):
+ match = Match(r'^(\s*)\{', line)
+
+ # Check matching closing brace
+ if match:
+ (endline, endlinenum, endpos) = CloseExpression(
+ clean_lines, linenum, len(match.group(1)))
+ if endpos > -1 and Match(r'^\s*;', endline[endpos:]):
+ # Current {} pair is eligible for semicolon check, and we have found
+ # the redundant semicolon, output warning here.
+ #
+ # Note: because we are scanning forward for opening braces, and
+ # outputting warnings for the matching closing brace, if there are
+ # nested blocks with trailing semicolons, we will get the error
+ # messages in reversed order.
+ error(filename, endlinenum, 'readability/braces', 4,
+ "You don't need a ; after a }")
+
+
+def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
+ """Look for empty loop/conditional body with only a single semicolon.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+
+ # Search for loop keywords at the beginning of the line. Because only
+ # whitespace is allowed before the keywords, this will also ignore most
+ # do-while loops, since those lines should start with a closing brace.
+ #
+ # We also check "if" blocks here, since an empty conditional block
+ # is likely an error.
+ line = clean_lines.elided[linenum]
+ matched = Match(r'\s*(for|while|if)\s*\(', line)
+ if matched:
+ # Find the end of the conditional expression
+ (end_line, end_linenum, end_pos) = CloseExpression(
+ clean_lines, linenum, line.find('('))
+
+ # Output warning if what follows the condition expression is a semicolon.
+ # No warning for all other cases, including whitespace or newline, since we
+ # have a separate check for semicolons preceded by whitespace.
+ if end_pos >= 0 and Match(r';', end_line[end_pos:]):
+ if matched.group(1) == 'if':
+ error(filename, end_linenum, 'whitespace/empty_conditional_body', 5,
+ 'Empty conditional bodies should use {}')
+ else:
+ error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
+ 'Empty loop bodies should use {} or continue')
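+ # For example (illustrative): "if (ok);" and "while (Advance());" are both
+ # reported, while "while (Advance()) {}" is not, since "{" rather than ";"
+ # follows the condition.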
+
+
+def CheckCheck(filename, clean_lines, linenum, error):
+ """Checks the use of CHECK and EXPECT macros.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+
+ # Decide the set of replacement macros that should be suggested
+ lines = clean_lines.elided
+ check_macro = None
+ start_pos = -1
+ for macro in _CHECK_MACROS:
+ i = lines[linenum].find(macro)
+ if i >= 0:
+ check_macro = macro
+
+ # Find opening parenthesis. Do a regular expression match here
+ # to make sure that we are matching the expected CHECK macro, as
+ # opposed to some other macro that happens to contain the CHECK
+ # substring.
+ matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
+ if not matched:
+ continue
+ start_pos = len(matched.group(1))
+ break
+ if not check_macro or start_pos < 0:
+ # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
+ return
+
+ # Find end of the boolean expression by matching parentheses
+ (last_line, end_line, end_pos) = CloseExpression(
+ clean_lines, linenum, start_pos)
+ if end_pos < 0:
+ return
+ if linenum == end_line:
+ expression = lines[linenum][start_pos + 1:end_pos - 1]
+ else:
+ expression = lines[linenum][start_pos + 1:]
+ for i in xrange(linenum + 1, end_line):
+ expression += lines[i]
+ expression += last_line[0:end_pos - 1]
+
+ # Parse expression so that we can take parentheses into account.
+ # This avoids false positives for inputs like "CHECK((a < 4) == b)",
+ # which is not replaceable by CHECK_LE.
+ lhs = ''
+ rhs = ''
+ operator = None
+ while expression:
+ matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||'
+ r'==|!=|>=|>|<=|<|\()(.*)$', expression)
+ if matched:
+ token = matched.group(1)
+ if token == '(':
+ # Parenthesized operand
+ expression = matched.group(2)
+ (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
+ if end < 0:
+ return # Unmatched parenthesis
+ lhs += '(' + expression[0:end]
+ expression = expression[end:]
+ elif token in ('&&', '||'):
+ # Logical and/or operators. This means the expression
+ # contains more than one term, for example:
+ # CHECK(42 < a && a < b);
+ #
+ # These are not replaceable with CHECK_LE, so bail out early.
+ return
+ elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'):
+ # Non-relational operator
+ lhs += token
+ expression = matched.group(2)
+ else:
+ # Relational operator
+ operator = token
+ rhs = matched.group(2)
+ break
+ else:
+ # Unparenthesized operand. Instead of appending to lhs one character
+ # at a time, we do another regular expression match to consume several
+ # characters at once if possible. Trivial benchmark shows that this
+ # is more efficient when the operands are longer than a single
+ # character, which is generally the case.
+ matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression)
+ if not matched:
+ matched = Match(r'^(\s*\S)(.*)$', expression)
+ if not matched:
+ break
+ lhs += matched.group(1)
+ expression = matched.group(2)
+
+ # Only apply checks if we got all parts of the boolean expression
+ if not (lhs and operator and rhs):
+ return
+
+ # Check that rhs does not contain logical operators. We already know
+ # that lhs is fine since the loop above parses out && and ||.
+ if rhs.find('&&') > -1 or rhs.find('||') > -1:
+ return
+
+ # At least one of the operands must be a constant literal. This is
+ # to avoid suggesting replacements for unprintable things like
+ # CHECK(variable != iterator)
+ #
+ # The following pattern matches decimal, hex integers, strings, and
+ # characters (in that order).
+ lhs = lhs.strip()
+ rhs = rhs.strip()
+ match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$'
+ if Match(match_constant, lhs) or Match(match_constant, rhs):
+ # Note: since we know both lhs and rhs, we can provide a more
+ # descriptive error message like:
+ # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42)
+ # Instead of:
+ # Consider using CHECK_EQ instead of CHECK(a == b)
+ #
+ # We are still keeping the less descriptive message because if lhs
+ # or rhs gets long, the error message might become unreadable.
+ error(filename, linenum, 'readability/check', 2,
+ 'Consider using %s instead of %s(a %s b)' % (
+ _CHECK_REPLACEMENT[check_macro][operator],
+ check_macro, operator))
+
+
+def CheckAltTokens(filename, clean_lines, linenum, error):
+ """Check alternative keywords being used in boolean expressions.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+
+ # Avoid preprocessor lines
+ if Match(r'^\s*#', line):
+ return
+
+ # Last ditch effort to avoid multi-line comments. This will not help
+ # if the comment started before the current line or ended after the
+ # current line, but it catches most of the false positives. At least,
+ # it provides a way to work around this warning for people who use
+ # multi-line comments in preprocessor macros.
+ #
+ # TODO(unknown): remove this once cpplint has better support for
+ # multi-line comments.
+ if line.find('/*') >= 0 or line.find('*/') >= 0:
+ return
+
+ for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
+ error(filename, linenum, 'readability/alt_tokens', 2,
+ 'Use operator %s instead of %s' % (
+ _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1)))
+
+
+def GetLineWidth(line):
+ """Determines the width of the line in column positions.
+
+ Args:
+ line: A string, which may be a Unicode string.
+
+ Returns:
+ The width of the line in column positions, accounting for Unicode
+ combining characters and wide characters.
+ """
+ if isinstance(line, unicode):
+ width = 0
+ for uc in unicodedata.normalize('NFC', line):
+ if unicodedata.east_asian_width(uc) in ('W', 'F'):
+ width += 2
+ elif not unicodedata.combining(uc):
+ width += 1
+ return width
+ else:
+ return len(line)
+
+
+def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
+ error):
+ """Checks rules from the 'C++ style rules' section of cppguide.html.
+
+ Most of these rules are hard to test (naming, comment style), but we
+ do what we can. In particular we check for 2-space indents, line lengths,
+ tab usage, spaces inside code, etc.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ file_extension: The extension (without the dot) of the filename.
+ nesting_state: A _NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: The function to call with any errors found.
+ """
+
+ # Don't use "elided" lines here, otherwise we can't check commented lines.
+ # Don't want to use "raw" either, because we don't want to check inside C++11
+ # raw strings.
+ raw_lines = clean_lines.lines_without_raw_strings
+ line = raw_lines[linenum]
+
+ if line.find('\t') != -1:
+ error(filename, linenum, 'whitespace/tab', 1,
+ 'Tab found; better to use spaces')
+
+ # One or three blank spaces at the beginning of the line is weird; it's
+ # hard to reconcile that with 2-space indents.
+ # NOTE: here are the conditions Rob Pike used for his tests. Mine aren't
+ # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces
+ # if(RLENGTH > 20) complain = 0;
+ # if(match($0, " +(error|private|public|protected):")) complain = 0;
+ # if(match(prev, "&& *$")) complain = 0;
+ # if(match(prev, "\\|\\| *$")) complain = 0;
+ # if(match(prev, "[\",=><] *$")) complain = 0;
+ # if(match($0, " <<")) complain = 0;
+ # if(match(prev, " +for \\(")) complain = 0;
+ # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
+ initial_spaces = 0
+ cleansed_line = clean_lines.elided[linenum]
+ while initial_spaces < len(line) and line[initial_spaces] == ' ':
+ initial_spaces += 1
+ if line and line[-1].isspace():
+ error(filename, linenum, 'whitespace/end_of_line', 4,
+ 'Line ends in whitespace. Consider deleting these extra spaces.')
+ # There are certain situations where we allow one space, notably for section labels
+ elif ((initial_spaces == 1 or initial_spaces == 3) and
+ not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
+ error(filename, linenum, 'whitespace/indent', 3,
+ 'Weird number of spaces at line-start. '
+ 'Are you using a 2-space indent?')
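+ # For example (illustrative): a line indented with a single space such as
+ # " int x;" is flagged, while a label such as " public:" is allowed by the
+ # exception above.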
+
+ # Check if the line is a header guard.
+ is_header_guard = False
+ if file_extension == 'h':
+ cppvar = GetHeaderGuardCPPVariable(filename)
+ if (line.startswith('#ifndef %s' % cppvar) or
+ line.startswith('#define %s' % cppvar) or
+ line.startswith('#endif // %s' % cppvar)):
+ is_header_guard = True
+ # #include lines and header guards can be long, since there's no clean way to
+ # split them.
+ #
+ # URLs can be long too. It's possible to split these, but it makes them
+ # harder to cut&paste.
+ #
+ # The "$Id:...$" comment may also get very long without it being the
+ # developer's fault.
+ if (not line.startswith('#include') and not is_header_guard and
+ not Match(r'^\s*//.*http(s?)://\S*$', line) and
+ not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
+ line_width = GetLineWidth(line)
+ extended_length = int(_line_length * 1.25)
+ if line_width > extended_length:
+ error(filename, linenum, 'whitespace/line_length', 4,
+ 'Lines should very rarely be longer than %i characters' %
+ extended_length)
+ elif line_width > _line_length:
+ error(filename, linenum, 'whitespace/line_length', 2,
+ 'Lines should be <= %i characters long' % _line_length)
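+ # For example (illustrative, assuming _line_length is left at its default of
+ # 80): a 90-column line is reported at confidence 2 and a 105-column line at
+ # confidence 4, since the extended limit is int(80 * 1.25) == 100.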
+
+ if (cleansed_line.count(';') > 1 and
+ # for loops are allowed two ;'s (and may run over two lines).
+ cleansed_line.find('for') == -1 and
+ (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or
+ GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and
+ # It's ok to have many commands in a switch case that fits in 1 line
+ not ((cleansed_line.find('case ') != -1 or
+ cleansed_line.find('default:') != -1) and
+ cleansed_line.find('break;') != -1)):
+ error(filename, linenum, 'whitespace/newline', 0,
+ 'More than one command on the same line')
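+ # For example (illustrative): "a = 1; b = 2;" is normally reported, while
+ # "for (i = 0; i < n; ++i) sum += i;" and "case 1: x(); break;" fall under
+ # the for-loop and switch-case exceptions above.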
+
+ # Some more style checks
+ CheckBraces(filename, clean_lines, linenum, error)
+ CheckEmptyBlockBody(filename, clean_lines, linenum, error)
+ CheckAccess(filename, clean_lines, linenum, nesting_state, error)
+ CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
+ CheckCheck(filename, clean_lines, linenum, error)
+ CheckAltTokens(filename, clean_lines, linenum, error)
+ classinfo = nesting_state.InnermostClass()
+ if classinfo:
+ CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
+
+
+_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
+_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
+# Matches the first component of a filename delimited by -s and _s. That is:
+# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
+# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo'
+# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo'
+# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo'
+_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+')
+
+
+def _DropCommonSuffixes(filename):
+ """Drops common suffixes like _test.cc or -inl.h from filename.
+
+ For example:
+ >>> _DropCommonSuffixes('foo/foo-inl.h')
+ 'foo/foo'
+ >>> _DropCommonSuffixes('foo/bar/foo.cc')
+ 'foo/bar/foo'
+ >>> _DropCommonSuffixes('foo/foo_internal.h')
+ 'foo/foo'
+ >>> _DropCommonSuffixes('foo/foo_unusualinternal.h')
+ 'foo/foo_unusualinternal'
+
+ Args:
+ filename: The input filename.
+
+ Returns:
+ The filename with the common suffix removed.
+ """
+ for suffix in ('test.cc', 'regtest.cc', 'unittest.cc',
+ 'inl.h', 'impl.h', 'internal.h'):
+ if (filename.endswith(suffix) and len(filename) > len(suffix) and
+ filename[-len(suffix) - 1] in ('-', '_')):
+ return filename[:-len(suffix) - 1]
+ return os.path.splitext(filename)[0]
+
+
+def _IsTestFilename(filename):
+ """Determines if the given filename has a suffix that identifies it as a test.
+
+ Args:
+ filename: The input filename.
+
+ Returns:
+ True if 'filename' looks like a test, False otherwise.
+ """
+ if (filename.endswith('_test.cc') or
+ filename.endswith('_unittest.cc') or
+ filename.endswith('_regtest.cc')):
+ return True
+ else:
+ return False
+
+
+def _ClassifyInclude(fileinfo, include, is_system):
+ """Figures out what kind of header 'include' is.
+
+ Args:
+ fileinfo: The current file cpplint is running over. A FileInfo instance.
+ include: The path to a #included file.
+ is_system: True if the #include used <> rather than "".
+
+ Returns:
+ One of the _XXX_HEADER constants.
+
+ For example:
+ >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True)
+ _C_SYS_HEADER
+ >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True)
+ _CPP_SYS_HEADER
+ >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False)
+ _LIKELY_MY_HEADER
+ >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'),
+ ... 'bar/foo_other_ext.h', False)
+ _POSSIBLE_MY_HEADER
+ >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False)
+ _OTHER_HEADER
+ """
+ # This is a list of all standard c++ header files, except
+ # those already checked for above.
+ is_cpp_h = include in _CPP_HEADERS
+
+ if is_system:
+ if is_cpp_h:
+ return _CPP_SYS_HEADER
+ else:
+ return _C_SYS_HEADER
+
+ # If the target file and the include we're checking share a
+ # basename when we drop common extensions, and the include lives in the
+ # same directory (or a sibling 'public' directory), then it's likely to
+ # be owned by the target file.
+ target_dir, target_base = (
+ os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName())))
+ include_dir, include_base = os.path.split(_DropCommonSuffixes(include))
+ if target_base == include_base and (
+ include_dir == target_dir or
+ include_dir == os.path.normpath(target_dir + '/../public')):
+ return _LIKELY_MY_HEADER
+
+ # If the target and include share some initial basename
+ # component, it's possible the target is implementing the
+ # include, so it's allowed to be first, but we'll never
+ # complain if it's not there.
+ target_first_component = _RE_FIRST_COMPONENT.match(target_base)
+ include_first_component = _RE_FIRST_COMPONENT.match(include_base)
+ if (target_first_component and include_first_component and
+ target_first_component.group(0) ==
+ include_first_component.group(0)):
+ return _POSSIBLE_MY_HEADER
+
+ return _OTHER_HEADER
+
+
+def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
+ """Check rules that are applicable to #include lines.
+
+ Strings on #include lines are NOT removed from the elided line, to make
+ certain tasks easier. However, to prevent false positives, checks
+ applicable to #include lines in CheckLanguage must be put here.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ include_state: An _IncludeState instance in which the headers are inserted.
+ error: The function to call with any errors found.
+ """
+ fileinfo = FileInfo(filename)
+
+ line = clean_lines.lines[linenum]
+
+ # "include" should use the new style "foo/bar.h" instead of just "bar.h"
+ if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
+ error(filename, linenum, 'build/include', 4,
+ 'Include the directory when naming .h files')
+
+ # We shouldn't include a file more than once. Actually, there are a
+ # handful of instances where doing so is okay, but in general it's
+ # not.
+ match = _RE_PATTERN_INCLUDE.search(line)
+ if match:
+ include = match.group(2)
+ is_system = (match.group(1) == '<')
+ if include in include_state:
+ error(filename, linenum, 'build/include', 4,
+ '"%s" already included at %s:%s' %
+ (include, filename, include_state[include]))
+ else:
+ include_state[include] = linenum
+
+ # We want to ensure that headers appear in the right order:
+ # 1) for foo.cc, foo.h (preferred location)
+ # 2) c system files
+ # 3) cpp system files
+ # 4) for foo.cc, foo.h (deprecated location)
+ # 5) other google headers
+ #
+ # We classify each include statement as one of those 5 types
+ # using a number of techniques. The include_state object keeps
+ # track of the highest type seen, and complains if we see a
+ # lower type after that.
+ error_message = include_state.CheckNextIncludeOrder(
+ _ClassifyInclude(fileinfo, include, is_system))
+ if error_message:
+ error(filename, linenum, 'build/include_order', 4,
+ '%s. Should be: %s.h, c system, c++ system, other.' %
+ (error_message, fileinfo.BaseName()))
+ canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
+ if not include_state.IsInAlphabeticalOrder(
+ clean_lines, linenum, canonical_include):
+ error(filename, linenum, 'build/include_alpha', 4,
+ 'Include "%s" not in alphabetical order' % include)
+ include_state.SetLastHeader(canonical_include)
+
+ # Look for any of the stream classes that are part of standard C++.
+ match = _RE_PATTERN_INCLUDE.match(line)
+ if match:
+ include = match.group(2)
+ if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
+ # Many unit tests use cout, so we exempt them.
+ if not _IsTestFilename(filename):
+ error(filename, linenum, 'readability/streams', 3,
+ 'Streams are highly discouraged.')
+
+
+def _GetTextInside(text, start_pattern):
+ r"""Retrieves all the text between matching open and close parentheses.
+
+ Given a string of lines and a regular expression string, retrieve all the text
+ following the expression and between opening punctuation symbols like
+ (, [, or {, and the matching close-punctuation symbol. This properly handles
+ nested occurrences of the punctuation, so for text like
+ printf(a(), b(c()));
+ a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
+ start_pattern must match a string that has an opening punctuation symbol at the end.
+
+ Args:
+ text: The text to extract from; its comments and strings must be elided.
+ It can be a single line or span multiple lines.
+ start_pattern: The regexp string indicating where to start extracting
+ the text.
+ Returns:
+ The extracted text.
+ None if either the opening string or ending punctuation could not be found.
+ """
+ # TODO(sugawarayu): Audit cpplint.py to see which places could profitably be
+ # rewritten to use _GetTextInside (they currently use inferior regexp matching).
+
+ # Map each opening punctuation symbol to its matching close-punctuation.
+ matching_punctuation = {'(': ')', '{': '}', '[': ']'}
+ closing_punctuation = set(matching_punctuation.itervalues())
+
+ # Find the position to start extracting text.
+ match = re.search(start_pattern, text, re.M)
+ if not match: # start_pattern not found in text.
+ return None
+ start_position = match.end(0)
+
+ assert start_position > 0, (
+ 'start_pattern must end with an opening punctuation symbol.')
+ assert text[start_position - 1] in matching_punctuation, (
+ 'start_pattern must end with an opening punctuation symbol.')
+ # Stack of closing punctuations we expect to have in text after position.
+ punctuation_stack = [matching_punctuation[text[start_position - 1]]]
+ position = start_position
+ while punctuation_stack and position < len(text):
+ if text[position] == punctuation_stack[-1]:
+ punctuation_stack.pop()
+ elif text[position] in closing_punctuation:
+ # A closing punctuation symbol without a matching opening symbol.
+ return None
+ elif text[position] in matching_punctuation:
+ punctuation_stack.append(matching_punctuation[text[position]])
+ position += 1
+ if punctuation_stack:
+ # Opening punctuation symbols left without matching closing symbols.
+ return None
+ # All punctuation matched.
+ return text[start_position:position - 1]
+
+
+# Patterns for matching call-by-reference parameters.
+#
+# Supports nested templates up to 2 levels deep using this messy pattern:
+# < (?: < (?: < [^<>]*
+# >
+# | [^<>] )*
+# >
+# | [^<>] )*
+# >
+_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*' # =~ [[:alpha:]][[:alnum:]]*
+_RE_PATTERN_TYPE = (
+ r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
+ r'(?:\w|'
+ r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|'
+ r'::)+')
+# A call-by-reference parameter ends with '& identifier'.
+_RE_PATTERN_REF_PARAM = re.compile(
+ r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*'
+ r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]')
+# A call-by-const-reference parameter either ends with 'const& identifier'
+# or looks like 'const type& identifier' when 'type' is atomic.
+_RE_PATTERN_CONST_REF_PARAM = (
+ r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
+ r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
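+ # For example (illustrative): in a declaration like
+ # void Update(const string& name, int& count);
+ # _RE_PATTERN_REF_PARAM matches both reference parameters, but only
+ # "const string& name" also matches _RE_PATTERN_CONST_REF_PARAM, so
+ # "int& count" is the one CheckForNonConstReference below would report
+ # (when parameter checking applies).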
+
+
+def CheckLanguage(filename, clean_lines, linenum, file_extension,
+ include_state, nesting_state, error):
+ """Checks rules from the 'C++ language rules' section of cppguide.html.
+
+ Some of these rules are hard to test (function overloading, using
+ uint32 inappropriately), but we do the best we can.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ file_extension: The extension (without the dot) of the filename.
+ include_state: An _IncludeState instance in which the headers are inserted.
+ nesting_state: A _NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: The function to call with any errors found.
+ """
+ # If the line is empty or consists of entirely a comment, no need to
+ # check it.
+ line = clean_lines.elided[linenum]
+ if not line:
+ return
+
+ match = _RE_PATTERN_INCLUDE.search(line)
+ if match:
+ CheckIncludeLine(filename, clean_lines, linenum, include_state, error)
+ return
+
+ # Reset include state across preprocessor directives. This is meant
+ # to silence warnings for conditional includes.
+ if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
+ include_state.ResetSection()
+
+ # Make Windows paths like Unix.
+ fullname = os.path.abspath(filename).replace('\\', '/')
+
+ # TODO(unknown): figure out if they're using default arguments in fn proto.
+
+ # Check to see if they're using a conversion function cast.
+ # I just try to capture the most common basic types, though there are more.
+ # Parameterless conversion functions, such as bool(), are allowed as they are
+ # probably a member operator declaration or default constructor.
+ match = Search(
+ r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there
+ r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+ r'(\([^)].*)', line)
+ if match:
+ matched_new = match.group(1)
+ matched_type = match.group(2)
+ matched_funcptr = match.group(3)
+
+ # gMock methods are defined using some variant of MOCK_METHODx(name, type)
+ # where type may be float(), int(string), etc. Without context they are
+ # virtually indistinguishable from int(x) casts. Likewise, gMock's
+ # MockCallback takes a template parameter of the form return_type(arg_type),
+ # which looks much like the cast we're trying to detect.
+ #
+ # std::function<> wrapper has a similar problem.
+ #
+ # Return types for function pointers also look like casts if they
+ # don't have an extra space.
+ if (matched_new is None and # If new operator, then this isn't a cast
+ not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+ Search(r'\bMockCallback<.*>', line) or
+ Search(r'\bstd::function<.*>', line)) and
+ not (matched_funcptr and
+ Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+ matched_funcptr))):
+ # Try a bit harder to catch gmock lines: the only place where
+ # something looks like an old-style cast is where we declare the
+ # return type of the mocked method, and the only time when we
+ # are missing context is if MOCK_METHOD was split across
+ # multiple lines. The missing MOCK_METHOD is usually one or two
+ # lines back, so scan back one or two lines.
+ #
+ # It's not possible for gmock macros to appear in the first 2
+ # lines, since the class head + section name takes up 2 lines.
+ if (linenum < 2 or
+ not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+ clean_lines.elided[linenum - 1]) or
+ Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+ clean_lines.elided[linenum - 2]))):
+ error(filename, linenum, 'readability/casting', 4,
+ 'Using deprecated casting style. '
+ 'Use static_cast<%s>(...) instead' %
+ matched_type)
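+ # For example (illustrative): int x = int(2.5); is reported with a
+ # suggestion to use static_cast<int>(...), while
+ # MOCK_METHOD1(GetValue, int(string)); is skipped by the gMock guard above.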
+
+ CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+ 'static_cast',
+ r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+
+ # This doesn't catch all cases. Consider (const char * const)"hello".
+ #
+ # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+ # compile).
+ if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+ 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
+ pass
+ else:
+ # Check pointer casts for other than string constants
+ CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+ 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
+
+ # In addition, we look for people taking the address of a cast. This
+ # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+ # point where you think.
+ match = Search(
+ r'(?:&\(([^)]+)\)[\w(])|'
+ r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
+ if match and match.group(1) != '*':
+ error(filename, linenum, 'runtime/casting', 4,
+ ('Are you taking an address of a cast? '
+ 'This is dangerous: could be a temp var. '
+ 'Take the address before doing the cast, rather than after'))
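+ # For example (illustrative): Process(&(int)value) and
+ # Frob(&static_cast<Foo*>(p)) both trigger this warning.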
+
+ # Create an extended_line, which is the concatenation of the current and
+ # next lines, for more effective checking of code that may span more than one
+ # line.
+ if linenum + 1 < clean_lines.NumLines():
+ extended_line = line + clean_lines.elided[linenum + 1]
+ else:
+ extended_line = line
+
+ # Check for people declaring static/global STL strings at the top level.
+ # This is dangerous because the C++ language does not guarantee that
+ # globals with constructors are initialized before the first access.
+ match = Match(
+ r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
+ line)
+ # Make sure it's not a function.
+ # Function template specialization looks like: "string foo<Type>(...".
+ # Class template definitions look like: "string Foo<Type>::Method(...".
+ #
+ # Also ignore things that look like operators. These are matched separately
+ # because operator names cross non-word boundaries. If we change the pattern
+ # above, we would decrease the accuracy of matching identifiers.
+ if (match and
+ not Search(r'\boperator\W', line) and
+ not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
+ error(filename, linenum, 'runtime/string', 4,
+ 'For a static/global string constant, use a C style string instead: '
+ '"%schar %s[]".' %
+ (match.group(1), match.group(2)))
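+ # For example (illustrative): a file-scope declaration like
+ # static string kPrefix = "foo";
+ # is reported with the suggestion "static char kPrefix[]", while
+ # string FormatName(const string& name);
+ # is skipped because it looks like a function declaration.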
+
+ if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
+ error(filename, linenum, 'runtime/init', 4,
+ 'You seem to be initializing a member variable with itself.')
+
+ if file_extension == 'h':
+ # TODO(unknown): check that 1-arg constructors are explicit.
+ # How to tell it's a constructor?
+ # (handled in CheckForNonStandardConstructs for now)
+ # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
+ # (level 1 error)
+ pass
+
+ # Check if people are using the verboten C basic types. The only exception
+ # we regularly allow is "unsigned short port" for port.
+ if Search(r'\bshort port\b', line):
+ if not Search(r'\bunsigned short port\b', line):
+ error(filename, linenum, 'runtime/int', 4,
+ 'Use "unsigned short" for ports, not "short"')
+ else:
+ match = Search(r'\b(short|long(?! +double)|long long)\b', line)
+ if match:
+ error(filename, linenum, 'runtime/int', 4,
+ 'Use int16/int64/etc, rather than the C type %s' % match.group(1))
+
+ # When snprintf is used, the second argument shouldn't be a literal.
+ match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
+ if match and match.group(2) != '0':
+ # If 2nd arg is zero, snprintf is used to calculate size.
+ error(filename, linenum, 'runtime/printf', 3,
+ 'If you can, use sizeof(%s) instead of %s as the 2nd arg '
+ 'to snprintf.' % (match.group(1), match.group(2)))
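+ # For example (illustrative): snprintf(buf, 10, "%d", x) is reported with a
+ # suggestion to use sizeof(buf) as the second argument, while
+ # snprintf(buf, sizeof(buf), "%d", x) and snprintf(buf, 0, "%d", x) are not.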
+
+ # Check if some verboten C functions are being used.
+ if Search(r'\bsprintf\b', line):
+ error(filename, linenum, 'runtime/printf', 5,
+ 'Never use sprintf. Use snprintf instead.')
+ match = Search(r'\b(strcpy|strcat)\b', line)
+ if match:
+ error(filename, linenum, 'runtime/printf', 4,
+ 'Almost always, snprintf is better than %s' % match.group(1))
+
+ # Check if some verboten operator overloading is going on
+ # TODO(unknown): catch out-of-line unary operator&:
+ # class X {};
+ # int operator&(const X& x) { return 42; } // unary operator&
+ # The trick is it's hard to tell apart from binary operator&:
+ # class Y { int operator&(const Y& x) { return 23; } }; // binary operator&
+ if Search(r'\boperator\s*&\s*\(\s*\)', line):
+ error(filename, linenum, 'runtime/operator', 4,
+ 'Unary operator& is dangerous. Do not use it.')
+
+ # Check for suspicious usage of "if" like
+ # } if (a == b) {
+ if Search(r'\}\s*if\s*\(', line):
+ error(filename, linenum, 'readability/braces', 4,
+ 'Did you mean "else if"? If not, start a new line for "if".')
+
+ # Check for potential format string bugs like printf(foo).
+ # We constrain the pattern not to pick things like DocidForPrintf(foo).
+ # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
+ # TODO(sugawarayu): Catch the following case. Need to change the calling
+ # convention of the whole function to process multiple line to handle it.
+ # printf(
+ # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
+ printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
+ if printf_args:
+ match = Match(r'([\w.\->()]+)$', printf_args)
+ if match and match.group(1) != '__VA_ARGS__':
+ function_name = re.search(r'\b((?:string)?printf)\s*\(',
+ line, re.I).group(1)
+ error(filename, linenum, 'runtime/printf', 4,
+ 'Potential format string bug. Do %s("%%s", %s) instead.'
+ % (function_name, match.group(1)))
+
+ # Check for potential memset bugs like memset(buf, sizeof(buf), 0).
+ match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line)
+ if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)):
+ error(filename, linenum, 'runtime/memset', 4,
+ 'Did you mean "memset(%s, 0, %s)"?'
+ % (match.group(1), match.group(2)))
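+ # For example (illustrative): memset(buf, sizeof(buf), 0) is reported with
+ # the suggestion memset(buf, 0, sizeof(buf)).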
+
+ if Search(r'\busing namespace\b', line):
+ error(filename, linenum, 'build/namespaces', 5,
+ 'Do not use namespace using-directives. '
+ 'Use using-declarations instead.')
+
+ # Detect variable-length arrays.
+ match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line)
+ if (match and match.group(2) != 'return' and match.group(2) != 'delete' and
+ match.group(3).find(']') == -1):
+ # Split the size using space and arithmetic operators as delimiters.
+ # If any of the resulting tokens are not compile time constants then
+ # report the error.
+ tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>', match.group(3))
+ is_const = True
+ skip_next = False
+ for tok in tokens:
+ if skip_next:
+ skip_next = False
+ continue
+
+ if Search(r'sizeof\(.+\)', tok): continue
+ if Search(r'arraysize\(\w+\)', tok): continue
+
+ tok = tok.lstrip('(')
+ tok = tok.rstrip(')')
+ if not tok: continue
+ if Match(r'\d+', tok): continue
+ if Match(r'0[xX][0-9a-fA-F]+', tok): continue
+ if Match(r'k[A-Z0-9]\w*', tok): continue
+ if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue
+ if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue
+ # A catch all for tricky sizeof cases, including 'sizeof expression',
+ # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)'
+ # requires skipping the next token because we split on ' ' and '*'.
+ if tok.startswith('sizeof'):
+ skip_next = True
+ continue
+ is_const = False
+ break
+ if not is_const:
+ error(filename, linenum, 'runtime/arrays', 1,
+ 'Do not use variable-length arrays. Use an appropriately named '
+ "('k' followed by CamelCase) compile-time constant for the size.")
+
+ # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
+ # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
+ # in the class declaration.
+ match = Match(
+ (r'\s*'
+ r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
+ r'\(.*\);$'),
+ line)
+ if match and linenum + 1 < clean_lines.NumLines():
+ next_line = clean_lines.elided[linenum + 1]
+ # We allow some, but not all, declarations of variables to be present
+ # in the statement that defines the class. The [\w\*,\s]* fragment of
+ # the regular expression below allows users to declare instances of
+ # the class or pointers to instances, but not less common types such
+ # as function pointers or arrays. It's a tradeoff between allowing
+ # reasonable code and avoiding trying to parse more C++ using regexps.
+ if not Search(r'^\s*}[\w\*,\s]*;', next_line):
+ error(filename, linenum, 'readability/constructors', 3,
+ match.group(1) + ' should be the last thing in the class')
+
+ # Check for use of unnamed namespaces in header files. Registration
+ # macros are typically OK, so we allow use of "namespace {" on lines
+ # that end with backslashes.
+ if (file_extension == 'h'
+ and Search(r'\bnamespace\s*{', line)
+ and line[-1] != '\\'):
+ error(filename, linenum, 'build/namespaces', 4,
+ 'Do not use unnamed namespaces in header files. See '
+ 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+ ' for more information.')
+
+def CheckForNonConstReference(filename, clean_lines, linenum,
+ nesting_state, error):
+ """Check for non-const references.
+
+ Separate from CheckLanguage since it scans backwards from current
+ line, instead of scanning forward.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ nesting_state: A _NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: The function to call with any errors found.
+ """
+ # Do nothing if there is no '&' on current line.
+ line = clean_lines.elided[linenum]
+ if '&' not in line:
+ return
+
+ # Long type names may be broken across multiple lines, usually in one
+ # of these forms:
+ # LongType
+ # ::LongTypeContinued &identifier
+ # LongType::
+ # LongTypeContinued &identifier
+ # LongType<
+ # ...>::LongTypeContinued &identifier
+ #
+ # If we detected a type split across two lines, join the previous
+ # line to current line so that we can match const references
+ # accordingly.
+ #
+ # Note that this only scans back one line, since scanning back
+ # arbitrary number of lines would be expensive. If you have a type
+ # that spans more than 2 lines, please use a typedef.
+ if linenum > 1:
+ previous = None
+ if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
+ # previous_line\n + ::current_line
+ previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
+ clean_lines.elided[linenum - 1])
+ elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
+ # previous_line::\n + current_line
+ previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
+ clean_lines.elided[linenum - 1])
+ if previous:
+ line = previous.group(1) + line.lstrip()
+ else:
+ # Check for templated parameter that is split across multiple lines
+ endpos = line.rfind('>')
+ if endpos > -1:
+ (_, startline, startpos) = ReverseCloseExpression(
+ clean_lines, linenum, endpos)
+ if startpos > -1 and startline < linenum:
+ # Found the matching < on an earlier line, collect all
+ # pieces up to current line.
+ line = ''
+ for i in xrange(startline, linenum + 1):
+ line += clean_lines.elided[i].strip()
+
+ # Check for non-const references in function parameters. A single '&' may
+ # be found in the following places:
+ # inside expression: binary & for bitwise AND
+ # inside expression: unary & for taking the address of something
+ # inside declarators: reference parameter
+ # We will exclude the first two cases by checking that we are not inside a
+ # function body, including one that was just introduced by a trailing '{'.
+ # TODO(unknown): Doesn't account for preprocessor directives.
+ # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
+ check_params = False
+ if not nesting_state.stack:
+ check_params = True # top level
+ elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
+ isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+ check_params = True # within class or namespace
+ elif Match(r'.*{\s*$', line):
+ if (len(nesting_state.stack) == 1 or
+ isinstance(nesting_state.stack[-2], _ClassInfo) or
+ isinstance(nesting_state.stack[-2], _NamespaceInfo)):
+ check_params = True # just opened global/class/namespace block
+ # We allow non-const references in a few standard places, like functions
+ # called "swap()" or iostream operators like "<<" or ">>". Do not check
+ # those function parameters.
+ #
+ # We also accept & in static_assert, which looks like a function but
+ # it's actually a declaration expression.
+ whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
+ r'operator\s*[<>][<>]|'
+ r'static_assert|COMPILE_ASSERT'
+ r')\s*\(')
+ if Search(whitelisted_functions, line):
+ check_params = False
+ elif not Search(r'\S+\([^)]*$', line):
+ # Don't see a whitelisted function on this line. Actually we
+ # didn't see any function name on this line, so this is likely a
+ # multi-line parameter list. Try a bit harder to catch this case.
+ for i in xrange(2):
+ if (linenum > i and
+ Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
+ check_params = False
+ break
+
+ if check_params:
+ decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body
+ for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+ if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
+ error(filename, linenum, 'runtime/references', 2,
+ 'Is this a non-const reference? '
+ 'If so, make const or use a pointer: ' +
+ ReplaceAll(' *<', '<', parameter))
+
+
+def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
+ error):
+ """Checks for a C-style cast by looking for the pattern.
+
+ Args:
+ filename: The name of the current file.
+ linenum: The number of the line to check.
+ line: The line of code to check.
+ raw_line: The raw line of code to check, with comments.
+ cast_type: The string for the C++ cast to recommend. This is either
+ reinterpret_cast, static_cast, or const_cast, depending.
+ pattern: The regular expression used to find C-style casts.
+ error: The function to call with any errors found.
+
+ Returns:
+ True if an error was emitted.
+ False otherwise.
+ """
+ match = Search(pattern, line)
+ if not match:
+ return False
+
+ # e.g., sizeof(int)
+ sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
+ if sizeof_match:
+ error(filename, linenum, 'runtime/sizeof', 1,
+ 'Using sizeof(type). Use sizeof(varname) instead if possible')
+ return True
+
+ # operator++(int) and operator--(int)
+ if (line[0:match.start(1) - 1].endswith(' operator++') or
+ line[0:match.start(1) - 1].endswith(' operator--')):
+ return False
+
+ # A single unnamed argument for a function tends to look like an
+ # old-style cast. If we see one, don't issue warnings for deprecated
+ # casts; instead, issue warnings for unnamed arguments where
+ # appropriate.
+ #
+ # These are things that we want warnings for, since the style guide
+ # explicitly requires all parameters to be named:
+ # Function(int);
+ # Function(int) {
+ # ConstMember(int) const;
+ # ConstMember(int) const {
+ # ExceptionMember(int) throw (...);
+ # ExceptionMember(int) throw (...) {
+ # PureVirtual(int) = 0;
+ #
+ # These are functions of some sort, where the compiler would be fine
+ # if they had named parameters, but people often omit those
+ # identifiers to reduce clutter:
+ # (FunctionPointer)(int);
+ # (FunctionPointer)(int) = value;
+ # Function((function_pointer_arg)(int))
+ # <TemplateArgument(int)>;
+ # <(FunctionPointerTemplateArgument)(int)>;
+ remainder = line[match.end(0):]
+ if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
+ # Looks like an unnamed parameter.
+
+ # Don't warn on any kind of template arguments.
+ if Match(r'^\s*>', remainder):
+ return False
+
+ # Don't warn on assignments to function pointers, but keep warnings for
+ # unnamed parameters to pure virtual functions. Note that this pattern
+ # will also pass on assignments of "0" to function pointers, but the
+ # preferred values for those would be "nullptr" or "NULL".
+ matched_zero = Match(r'^\s*=\s*(\S+)\s*;', remainder)
+ if matched_zero and matched_zero.group(1) != '0':
+ return False
+
+ # Don't warn on function pointer declarations. For this we need
+ # to check what came before the "(type)" string.
+ if Match(r'.*\)\s*$', line[0:match.start(0)]):
+ return False
+
+ # Don't warn if the parameter is named with block comments, e.g.:
+ # Function(int /*unused_param*/);
+ if '/*' in raw_line:
+ return False
+
+ # Passed all filters, issue warning here.
+ error(filename, linenum, 'readability/function', 3,
+ 'All parameters should be named in a function')
+ return True
+
+ # At this point, all that should be left is actual casts.
+ error(filename, linenum, 'readability/casting', 4,
+ 'Using C-style cast. Use %s<%s>(...) instead' %
+ (cast_type, match.group(1)))
+
+ return True
+
+
+_HEADERS_CONTAINING_TEMPLATES = (
+ ('<deque>', ('deque',)),
+ ('<functional>', ('unary_function', 'binary_function',
+ 'plus', 'minus', 'multiplies', 'divides', 'modulus',
+ 'negate',
+ 'equal_to', 'not_equal_to', 'greater', 'less',
+ 'greater_equal', 'less_equal',
+ 'logical_and', 'logical_or', 'logical_not',
+ 'unary_negate', 'not1', 'binary_negate', 'not2',
+ 'bind1st', 'bind2nd',
+ 'pointer_to_unary_function',
+ 'pointer_to_binary_function',
+ 'ptr_fun',
+ 'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
+ 'mem_fun_ref_t',
+ 'const_mem_fun_t', 'const_mem_fun1_t',
+ 'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
+ 'mem_fun_ref',
+ )),
+ ('<limits>', ('numeric_limits',)),
+ ('<list>', ('list',)),
+ ('<map>', ('map', 'multimap',)),
+ ('<memory>', ('allocator',)),
+ ('<queue>', ('queue', 'priority_queue',)),
+ ('<set>', ('set', 'multiset',)),
+ ('<stack>', ('stack',)),
+ ('<string>', ('char_traits', 'basic_string',)),
+ ('<utility>', ('pair',)),
+ ('<vector>', ('vector',)),
+
+ # gcc extensions.
+ # Note: std::hash is their hash, ::hash is our hash
+ ('<hash_map>', ('hash_map', 'hash_multimap',)),
+ ('<hash_set>', ('hash_set', 'hash_multiset',)),
+ ('<slist>', ('slist',)),
+ )
+
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+
+_re_pattern_algorithm_header = []
+for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
+ 'transform'):
+ # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+ # type::max().
+ _re_pattern_algorithm_header.append(
+ (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+ _template,
+ '<algorithm>'))
+
+_re_pattern_templates = []
+for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
+ for _template in _templates:
+ _re_pattern_templates.append(
+ (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
+ _template + '<>',
+ _header))
+
+
+def FilesBelongToSameModule(filename_cc, filename_h):
+ """Check if these two filenames belong to the same module.
+
+ The concept of a 'module' here is as follows:
+ foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
+ same 'module' if they are in the same directory.
+ some/path/public/xyzzy and some/path/internal/xyzzy are also considered
+ to belong to the same module here.
+
+ If the filename_cc contains a longer path than the filename_h, for example,
+ '/absolute/path/to/base/sysinfo.cc', and this file would include
+ 'base/sysinfo.h', this function also produces the prefix needed to open the
+ header. This is used by the caller of this function to more robustly open the
+ header file. We don't have access to the real include paths in this context,
+ so we need this guesswork here.
+
+ Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
+ according to this implementation. Because of this, this function gives
+ some false positives. This should be sufficiently rare in practice.
+
+ Args:
+ filename_cc: The path of the .cc file.
+ filename_h: The path of the header file.
+
+ Returns:
+ Tuple with a bool and a string:
+ bool: True if filename_cc and filename_h belong to the same module.
+ string: the additional prefix needed to open the header file.
+ """
+
+ if not filename_cc.endswith('.cc'):
+ return (False, '')
+ filename_cc = filename_cc[:-len('.cc')]
+ if filename_cc.endswith('_unittest'):
+ filename_cc = filename_cc[:-len('_unittest')]
+ elif filename_cc.endswith('_test'):
+ filename_cc = filename_cc[:-len('_test')]
+ filename_cc = filename_cc.replace('/public/', '/')
+ filename_cc = filename_cc.replace('/internal/', '/')
+
+ if not filename_h.endswith('.h'):
+ return (False, '')
+ filename_h = filename_h[:-len('.h')]
+ if filename_h.endswith('-inl'):
+ filename_h = filename_h[:-len('-inl')]
+ filename_h = filename_h.replace('/public/', '/')
+ filename_h = filename_h.replace('/internal/', '/')
+
+ files_belong_to_same_module = filename_cc.endswith(filename_h)
+ common_path = ''
+ if files_belong_to_same_module:
+ common_path = filename_cc[:-len(filename_h)]
+ return files_belong_to_same_module, common_path
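+# Illustrative sketch of the mapping described in the docstring above (these
+# example paths are made up, not taken from cpplint itself):
+#   FilesBelongToSameModule('a/b/foo_test.cc', 'a/b/foo.h')
+#       -> (True, '')
+#   FilesBelongToSameModule('/abs/path/base/sysinfo.cc', 'base/sysinfo.h')
+#       -> (True, '/abs/path/')
+#   FilesBelongToSameModule('a/b/foo.cc', 'a/c/foo.h')
+#       -> (False, '')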
+
+
+def UpdateIncludeState(filename, include_state, io=codecs):
+ """Fill up the include_state with new includes found from the file.
+
+ Args:
+ filename: the name of the header to read.
+ include_state: an _IncludeState instance in which the headers are inserted.
+ io: The io factory to use to read the file. Provided for testability.
+
+ Returns:
+    True if a header was successfully added. False otherwise.
+ """
+ headerfile = None
+ try:
+ headerfile = io.open(filename, 'r', 'utf8', 'replace')
+ except IOError:
+ return False
+ linenum = 0
+ for line in headerfile:
+ linenum += 1
+ clean_line = CleanseComments(line)
+ match = _RE_PATTERN_INCLUDE.search(clean_line)
+ if match:
+ include = match.group(2)
+ # The value formatting is cute, but not really used right now.
+ # What matters here is that the key is in include_state.
+ include_state.setdefault(include, '%s:%d' % (filename, linenum))
+ return True
+
+
+def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
+ io=codecs):
+ """Reports for missing stl includes.
+
+ This function will output warnings to make sure you are including the headers
+ necessary for the stl containers and functions that you use. We only give one
+ reason to include a header. For example, if you use both equal_to<> and
+ less<> in a .h file, only one (the latter in the file) of these will be
+ reported as a reason to include the <functional>.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ include_state: An _IncludeState instance.
+ error: The function to call with any errors found.
+ io: The IO factory to use to read the header file. Provided for unittest
+ injection.
+ """
+  required = {} # A map of header name to line number and the template entity.
+ # Example of required: { '<functional>': (1219, 'less<>') }
+
+ for linenum in xrange(clean_lines.NumLines()):
+ line = clean_lines.elided[linenum]
+ if not line or line[0] == '#':
+ continue
+
+ # String is special -- it is a non-templatized type in STL.
+ matched = _RE_PATTERN_STRING.search(line)
+ if matched:
+ # Don't warn about strings in non-STL namespaces:
+ # (We check only the first match per line; good enough.)
+ prefix = line[:matched.start()]
+ if prefix.endswith('std::') or not prefix.endswith('::'):
+ required['<string>'] = (linenum, 'string')
+
+ for pattern, template, header in _re_pattern_algorithm_header:
+ if pattern.search(line):
+ required[header] = (linenum, template)
+
+    # The following check is just a speed-up; no semantics are changed.
+    if '<' not in line: # Skip lines without '<' to reduce CPU time.
+ continue
+
+ for pattern, template, header in _re_pattern_templates:
+ if pattern.search(line):
+ required[header] = (linenum, template)
+
+ # The policy is that if you #include something in foo.h you don't need to
+ # include it again in foo.cc. Here, we will look at possible includes.
+ # Let's copy the include_state so it is only messed up within this function.
+ include_state = include_state.copy()
+
+  # Did we find the header for this file (if any) and successfully load it?
+ header_found = False
+
+ # Use the absolute path so that matching works properly.
+ abs_filename = FileInfo(filename).FullName()
+
+ # For Emacs's flymake.
+ # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+ # by flymake and that file name might end with '_flymake.cc'. In that case,
+  # restore the original file name here so that the corresponding header file
+  # can be found.
+  # e.g. if the file name is 'foo_flymake.cc', we should search for 'foo.h'
+  # instead of 'foo_flymake.h'.
+ abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+ # include_state is modified during iteration, so we iterate over a copy of
+ # the keys.
+ header_keys = include_state.keys()
+ for header in header_keys:
+ (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
+ fullpath = common_path + header
+ if same_module and UpdateIncludeState(fullpath, include_state, io):
+ header_found = True
+
+ # If we can't find the header file for a .cc, assume it's because we don't
+ # know where to look. In that case we'll give up as we're not sure they
+ # didn't include it in the .h file.
+ # TODO(unknown): Do a better job of finding .h files so we are confident that
+ # not having the .h file means there isn't one.
+ if filename.endswith('.cc') and not header_found:
+ return
+
+ # All the lines have been processed, report the errors found.
+ for required_header_unstripped in required:
+ template = required[required_header_unstripped][1]
+ if required_header_unstripped.strip('<>"') not in include_state:
+ error(filename, required[required_header_unstripped][0],
+ 'build/include_what_you_use', 4,
+ 'Add #include ' + required_header_unstripped + ' for ' + template)
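+# Illustrative note (not part of upstream cpplint): for a foo.cc that uses
+# std::string and std::sort but never includes <string> or <algorithm>
+# (directly or via foo.h), the check above would emit
+#   'Add #include <string> for string' and
+#   'Add #include <algorithm> for sort'.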
+
+
+_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
+
+
+def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
+ """Check that make_pair's template arguments are deduced.
+
+ G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+ specified explicitly, and such use isn't intended in any case.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+ match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line)
+ if match:
+ error(filename, linenum, 'build/explicit_make_pair',
+ 4, # 4 = high confidence
+ 'For C++11-compatibility, omit template arguments from make_pair'
+ ' OR use pair directly OR if appropriate, construct a pair directly')
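+# Illustrative note (not part of upstream cpplint): the regex above flags
+# explicit instantiations such as 'make_pair<int, int>(1, 2)' but leaves
+# 'make_pair(1, 2)' and 'std::pair<int, int>(1, 2)' alone.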
+
+
+def ProcessLine(filename, file_extension, clean_lines, line,
+ include_state, function_state, nesting_state, error,
+ extra_check_functions=[]):
+ """Processes a single line in the file.
+
+ Args:
+ filename: Filename of the file that is being processed.
+ file_extension: The extension (dot not included) of the file.
+ clean_lines: An array of strings, each representing a line of the file,
+ with comments stripped.
+ line: Number of line being processed.
+ include_state: An _IncludeState instance in which the headers are inserted.
+ function_state: A _FunctionState instance which counts function lines, etc.
+ nesting_state: A _NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: A callable to which errors are reported, which takes 4 arguments:
+ filename, line number, error level, and message
+ extra_check_functions: An array of additional check functions that will be
+ run on each source line. Each function takes 4
+ arguments: filename, clean_lines, line, error
+ """
+ raw_lines = clean_lines.raw_lines
+ ParseNolintSuppressions(filename, raw_lines[line], line, error)
+ nesting_state.Update(filename, clean_lines, line, error)
+ if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
+ return
+ CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
+ CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
+ CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
+ CheckLanguage(filename, clean_lines, line, file_extension, include_state,
+ nesting_state, error)
+ CheckForNonConstReference(filename, clean_lines, line, nesting_state, error)
+ CheckForNonStandardConstructs(filename, clean_lines, line,
+ nesting_state, error)
+ CheckVlogArguments(filename, clean_lines, line, error)
+ CheckPosixThreading(filename, clean_lines, line, error)
+ CheckInvalidIncrement(filename, clean_lines, line, error)
+ CheckMakePairUsesDeduction(filename, clean_lines, line, error)
+ for check_fn in extra_check_functions:
+ check_fn(filename, clean_lines, line, error)
+
+def ProcessFileData(filename, file_extension, lines, error,
+ extra_check_functions=[]):
+ """Performs lint checks and reports any errors to the given error function.
+
+ Args:
+ filename: Filename of the file that is being processed.
+ file_extension: The extension (dot not included) of the file.
+ lines: An array of strings, each representing a line of the file, with the
+ last element being empty if the file is terminated with a newline.
+ error: A callable to which errors are reported, which takes 4 arguments:
+ filename, line number, error level, and message
+ extra_check_functions: An array of additional check functions that will be
+ run on each source line. Each function takes 4
+ arguments: filename, clean_lines, line, error
+ """
+ lines = (['// marker so line numbers and indices both start at 1'] + lines +
+ ['// marker so line numbers end in a known way'])
+
+ include_state = _IncludeState()
+ function_state = _FunctionState()
+ nesting_state = _NestingState()
+
+ ResetNolintSuppressions()
+
+ CheckForCopyright(filename, lines, error)
+
+ if file_extension == 'h':
+ CheckForHeaderGuard(filename, lines, error)
+
+ RemoveMultiLineComments(filename, lines, error)
+ clean_lines = CleansedLines(lines)
+ for line in xrange(clean_lines.NumLines()):
+ ProcessLine(filename, file_extension, clean_lines, line,
+ include_state, function_state, nesting_state, error,
+ extra_check_functions)
+ nesting_state.CheckCompletedBlocks(filename, error)
+
+ CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
+
+ # We check here rather than inside ProcessLine so that we see raw
+ # lines rather than "cleaned" lines.
+ CheckForBadCharacters(filename, lines, error)
+
+ CheckForNewlineAtEOF(filename, lines, error)
+
+def ProcessFile(filename, vlevel, extra_check_functions=[]):
+ """Does google-lint on a single file.
+
+ Args:
+ filename: The name of the file to parse.
+
+ vlevel: The level of errors to report. Every error of confidence
+ >= verbose_level will be reported. 0 is a good default.
+
+ extra_check_functions: An array of additional check functions that will be
+ run on each source line. Each function takes 4
+ arguments: filename, clean_lines, line, error
+ """
+
+ _SetVerboseLevel(vlevel)
+
+ try:
+ # Support the UNIX convention of using "-" for stdin. Note that
+ # we are not opening the file with universal newline support
+ # (which codecs doesn't support anyway), so the resulting lines do
+ # contain trailing '\r' characters if we are reading a file that
+ # has CRLF endings.
+ # If after the split a trailing '\r' is present, it is removed
+ # below. If it is not expected to be present (i.e. os.linesep !=
+ # '\r\n' as in Windows), a warning is issued below if this file
+ # is processed.
+
+ if filename == '-':
+ lines = codecs.StreamReaderWriter(sys.stdin,
+ codecs.getreader('utf8'),
+ codecs.getwriter('utf8'),
+ 'replace').read().split('\n')
+ else:
+ lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
+
+ carriage_return_found = False
+ # Remove trailing '\r'.
+ for linenum in range(len(lines)):
+ if lines[linenum].endswith('\r'):
+ lines[linenum] = lines[linenum].rstrip('\r')
+ carriage_return_found = True
+
+ except IOError:
+ sys.stderr.write(
+ "Skipping input '%s': Can't open for reading\n" % filename)
+ return
+
+ # Note, if no dot is found, this will give the entire filename as the ext.
+ file_extension = filename[filename.rfind('.') + 1:]
+
+ # When reading from stdin, the extension is unknown, so no cpplint tests
+ # should rely on the extension.
+ if filename != '-' and file_extension not in _valid_extensions:
+ sys.stderr.write('Ignoring %s; not a valid file name '
+ '(%s)\n' % (filename, ', '.join(_valid_extensions)))
+ else:
+ ProcessFileData(filename, file_extension, lines, Error,
+ extra_check_functions)
+ if carriage_return_found and os.linesep != '\r\n':
+ # Use 0 for linenum since outputting only one error for potentially
+ # several lines.
+ Error(filename, 0, 'whitespace/newline', 1,
+          'One or more unexpected \\r (^M) found; '
+          'better to use only a \\n')
+
+ sys.stderr.write('Done processing %s\n' % filename)
+
+
+def PrintUsage(message):
+ """Prints a brief usage string and exits, optionally with an error message.
+
+ Args:
+ message: The optional error message.
+ """
+ sys.stderr.write(_USAGE)
+ if message:
+ sys.exit('\nFATAL ERROR: ' + message)
+ else:
+ sys.exit(1)
+
+
+def PrintCategories():
+ """Prints a list of all the error-categories used by error messages.
+
+ These are the categories used to filter messages via --filter.
+ """
+ sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES))
+ sys.exit(0)
+
+
+def ParseArguments(args):
+ """Parses the command line arguments.
+
+ This may set the output format and verbosity level as side-effects.
+
+ Args:
+    args: The command line arguments.
+
+ Returns:
+ The list of filenames to lint.
+ """
+ try:
+ (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
+ 'counting=',
+ 'filter=',
+ 'root=',
+ 'linelength=',
+ 'extensions='])
+ except getopt.GetoptError:
+ PrintUsage('Invalid arguments.')
+
+ verbosity = _VerboseLevel()
+ output_format = _OutputFormat()
+ filters = ''
+ counting_style = ''
+
+ for (opt, val) in opts:
+ if opt == '--help':
+ PrintUsage(None)
+ elif opt == '--output':
+ if val not in ('emacs', 'vs7', 'eclipse'):
+ PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
+ output_format = val
+ elif opt == '--verbose':
+ verbosity = int(val)
+ elif opt == '--filter':
+ filters = val
+ if not filters:
+ PrintCategories()
+ elif opt == '--counting':
+ if val not in ('total', 'toplevel', 'detailed'):
+ PrintUsage('Valid counting options are total, toplevel, and detailed')
+ counting_style = val
+ elif opt == '--root':
+ global _root
+ _root = val
+ elif opt == '--linelength':
+ global _line_length
+ try:
+ _line_length = int(val)
+ except ValueError:
+ PrintUsage('Line length must be digits.')
+ elif opt == '--extensions':
+ global _valid_extensions
+ try:
+ _valid_extensions = set(val.split(','))
+ except ValueError:
+      PrintUsage('Extensions must be a comma-separated list.')
+
+ if not filenames:
+ PrintUsage('No files were specified.')
+
+ _SetOutputFormat(output_format)
+ _SetVerboseLevel(verbosity)
+ _SetFilters(filters)
+ _SetCountingStyle(counting_style)
+
+ return filenames
+
+
+def main():
+ filenames = ParseArguments(sys.argv[1:])
+
+ # Change stderr to write with replacement characters so we don't die
+ # if we try to print something containing non-ASCII characters.
+ sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+ codecs.getreader('utf8'),
+ codecs.getwriter('utf8'),
+ 'replace')
+
+ _cpplint_state.ResetErrorCounts()
+ for filename in filenames:
+ ProcessFile(filename, _cpplint_state.verbose_level)
+ _cpplint_state.PrintErrorCounts()
+
+ sys.exit(_cpplint_state.error_count > 0)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/third_party/aom/tools/diff.py b/third_party/aom/tools/diff.py
new file mode 100644
index 000000000..bac6aabdc
--- /dev/null
+++ b/third_party/aom/tools/diff.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Classes for representing diff pieces."""
+
+__author__ = "jkoleszar@google.com"
+
+import re
+
+
+class DiffLines(object):
+ """A container for one half of a diff."""
+
+ def __init__(self, filename, offset, length):
+ self.filename = filename
+ self.offset = offset
+ self.length = length
+ self.lines = []
+ self.delta_line_nums = []
+
+ def Append(self, line):
+ l = len(self.lines)
+ if line[0] != " ":
+ self.delta_line_nums.append(self.offset + l)
+ self.lines.append(line[1:])
+ assert l+1 <= self.length
+
+ def Complete(self):
+ return len(self.lines) == self.length
+
+ def __contains__(self, item):
+ return item >= self.offset and item <= self.offset + self.length - 1
+
+
+class DiffHunk(object):
+ """A container for one diff hunk, consisting of two DiffLines."""
+
+ def __init__(self, header, file_a, file_b, start_a, len_a, start_b, len_b):
+ self.header = header
+ self.left = DiffLines(file_a, start_a, len_a)
+ self.right = DiffLines(file_b, start_b, len_b)
+ self.lines = []
+
+ def Append(self, line):
+ """Adds a line to the DiffHunk and its DiffLines children."""
+ if line[0] == "-":
+ self.left.Append(line)
+ elif line[0] == "+":
+ self.right.Append(line)
+ elif line[0] == " ":
+ self.left.Append(line)
+ self.right.Append(line)
+ elif line[0] == "\\":
+ # Ignore newline messages from git diff.
+ pass
+ else:
+ assert False, ("Unrecognized character at start of diff line "
+ "%r" % line[0])
+ self.lines.append(line)
+
+ def Complete(self):
+ return self.left.Complete() and self.right.Complete()
+
+ def __repr__(self):
+ return "DiffHunk(%s, %s, len %d)" % (
+ self.left.filename, self.right.filename,
+ max(self.left.length, self.right.length))
+
+
+def ParseDiffHunks(stream):
+ """Walk a file-like object, yielding DiffHunks as they're parsed."""
+
+ file_regex = re.compile(r"(\+\+\+|---) (\S+)")
+ range_regex = re.compile(r"@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))?")
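+  # Illustrative note (not part of the original tool): for a hunk header like
+  # '@@ -10,4 +12,6 @@' the regex groups above yield start_a=10, len_a=4,
+  # start_b=12, len_b=6.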
+ hunk = None
+ while True:
+ line = stream.readline()
+ if not line:
+ break
+
+ if hunk is None:
+ # Parse file names
+ diff_file = file_regex.match(line)
+ if diff_file:
+ if line.startswith("---"):
+ a_line = line
+ a = diff_file.group(2)
+ continue
+ if line.startswith("+++"):
+ b_line = line
+ b = diff_file.group(2)
+ continue
+
+ # Parse offset/lengths
+ diffrange = range_regex.match(line)
+ if diffrange:
+ if diffrange.group(2):
+ start_a = int(diffrange.group(1))
+ len_a = int(diffrange.group(3))
+ else:
+ start_a = 1
+ len_a = int(diffrange.group(1))
+
+ if diffrange.group(5):
+ start_b = int(diffrange.group(4))
+ len_b = int(diffrange.group(6))
+ else:
+ start_b = 1
+ len_b = int(diffrange.group(4))
+
+ header = [a_line, b_line, line]
+ hunk = DiffHunk(header, a, b, start_a, len_a, start_b, len_b)
+ else:
+ # Add the current line to the hunk
+ hunk.Append(line)
+
+ # See if the whole hunk has been parsed. If so, yield it and prepare
+ # for the next hunk.
+ if hunk.Complete():
+ yield hunk
+ hunk = None
+
+ # Partial hunks are a parse error
+ assert hunk is None
diff --git a/third_party/aom/tools/dump_obu.cc b/third_party/aom/tools/dump_obu.cc
new file mode 100644
index 000000000..30ee5e7a1
--- /dev/null
+++ b/third_party/aom/tools/dump_obu.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/ivfdec.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "tools/obu_parser.h"
+
+namespace {
+
+const size_t kInitialBufferSize = 100 * 1024;
+
+struct InputContext {
+ InputContext() = default;
+ ~InputContext() { free(unit_buffer); }
+
+ void Init() {
+ memset(avx_ctx, 0, sizeof(*avx_ctx));
+ memset(obu_ctx, 0, sizeof(*obu_ctx));
+ obu_ctx->avx_ctx = avx_ctx;
+#if CONFIG_WEBM_IO
+ memset(webm_ctx, 0, sizeof(*webm_ctx));
+#endif
+ }
+
+ AvxInputContext *avx_ctx = nullptr;
+ ObuDecInputContext *obu_ctx = nullptr;
+#if CONFIG_WEBM_IO
+ WebmInputContext *webm_ctx = nullptr;
+#endif
+ uint8_t *unit_buffer = nullptr;
+ size_t unit_buffer_size = 0;
+};
+
+void PrintUsage() {
+ printf("Libaom OBU dump.\nUsage: dump_obu <input_file>\n");
+}
+
+VideoFileType GetFileType(InputContext *ctx) {
+ if (file_is_ivf(ctx->avx_ctx)) return FILE_TYPE_IVF;
+ if (file_is_obu(ctx->obu_ctx)) return FILE_TYPE_OBU;
+#if CONFIG_WEBM_IO
+ if (file_is_webm(ctx->webm_ctx, ctx->avx_ctx)) return FILE_TYPE_WEBM;
+#endif
+ return FILE_TYPE_RAW;
+}
+
+bool ReadTemporalUnit(InputContext *ctx, size_t *unit_size) {
+ const VideoFileType file_type = ctx->avx_ctx->file_type;
+ switch (file_type) {
+ case FILE_TYPE_IVF: {
+ if (ivf_read_frame(ctx->avx_ctx->file, &ctx->unit_buffer, unit_size,
+ &ctx->unit_buffer_size, NULL)) {
+ return false;
+ }
+ break;
+ }
+ case FILE_TYPE_OBU: {
+ if (obudec_read_temporal_unit(ctx->obu_ctx, &ctx->unit_buffer, unit_size,
+ &ctx->unit_buffer_size)) {
+ return false;
+ }
+ break;
+ }
+#if CONFIG_WEBM_IO
+ case FILE_TYPE_WEBM: {
+ if (webm_read_frame(ctx->webm_ctx, &ctx->unit_buffer, unit_size,
+ &ctx->unit_buffer_size)) {
+ return false;
+ }
+ break;
+ }
+#endif
+ default:
+ // TODO(tomfinegan): Abuse FILE_TYPE_RAW for AV1/OBU elementary streams?
+ fprintf(stderr, "Error: Unsupported file type.\n");
+ return false;
+ }
+
+ return true;
+}
+
+} // namespace
+
+int main(int argc, const char *argv[]) {
+ // TODO(tomfinegan): Could do with some params for verbosity.
+ if (argc < 2) {
+ PrintUsage();
+ return EXIT_SUCCESS;
+ }
+
+ const std::string filename = argv[1];
+
+ using FilePtr = std::unique_ptr<FILE, decltype(&fclose)>;
+ FilePtr input_file(fopen(filename.c_str(), "rb"), &fclose);
+ if (input_file.get() == nullptr) {
+ input_file.release();
+ fprintf(stderr, "Error: Cannot open input file.\n");
+ return EXIT_FAILURE;
+ }
+
+ AvxInputContext avx_ctx;
+ InputContext input_ctx;
+ input_ctx.avx_ctx = &avx_ctx;
+ ObuDecInputContext obu_ctx;
+ input_ctx.obu_ctx = &obu_ctx;
+#if CONFIG_WEBM_IO
+ WebmInputContext webm_ctx;
+ input_ctx.webm_ctx = &webm_ctx;
+#endif
+
+ input_ctx.Init();
+ avx_ctx.file = input_file.get();
+ avx_ctx.file_type = GetFileType(&input_ctx);
+
+ // Note: the reader utilities will realloc the buffer using realloc() etc.
+ // Can't have nice things like unique_ptr wrappers with that type of
+ // behavior underneath the function calls.
+ input_ctx.unit_buffer =
+ reinterpret_cast<uint8_t *>(calloc(kInitialBufferSize, 1));
+ if (!input_ctx.unit_buffer) {
+ fprintf(stderr, "Error: No memory, can't alloc input buffer.\n");
+ return EXIT_FAILURE;
+ }
+ input_ctx.unit_buffer_size = kInitialBufferSize;
+
+ size_t unit_size = 0;
+ int unit_number = 0;
+ int64_t obu_overhead_bytes_total = 0;
+ while (ReadTemporalUnit(&input_ctx, &unit_size)) {
+ printf("Temporal unit %d\n", unit_number);
+
+ int obu_overhead_current_unit = 0;
+ if (!aom_tools::DumpObu(input_ctx.unit_buffer, static_cast<int>(unit_size),
+ &obu_overhead_current_unit)) {
+ fprintf(stderr, "Error: Temporal Unit parse failed on unit number %d.\n",
+ unit_number);
+ return EXIT_FAILURE;
+ }
+ printf(" OBU overhead: %d\n", obu_overhead_current_unit);
+ ++unit_number;
+ obu_overhead_bytes_total += obu_overhead_current_unit;
+ }
+
+ printf("File total OBU overhead: %" PRId64 "\n", obu_overhead_bytes_total);
+ return EXIT_SUCCESS;
+}
diff --git a/third_party/aom/tools/gen_authors.sh b/third_party/aom/tools/gen_authors.sh
new file mode 100755
index 000000000..5def8bc89
--- /dev/null
+++ b/third_party/aom/tools/gen_authors.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Add organization names manually.
+
+cat <<EOF
+# This file is automatically generated from the git commit history
+# by tools/gen_authors.sh.
+
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v "corp.google\|clang-format")
+EOF
diff --git a/third_party/aom/tools/gen_constrained_tokenset.py b/third_party/aom/tools/gen_constrained_tokenset.py
new file mode 100755
index 000000000..5d12ee1ef
--- /dev/null
+++ b/third_party/aom/tools/gen_constrained_tokenset.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Generate the probability model for the constrained token set.
+
+Model obtained from a 2-sided zero-centered distribution derived
+from a Pareto distribution. The cdf of the distribution is:
+cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+
+For a given beta and a given probability of the 1-node, the alpha
+is first solved, and then the {alpha, beta} pair is used to generate
+the probabilities for the rest of the nodes.
+"""
+
+import heapq
+import sys
+import numpy as np
+import scipy.optimize
+import scipy.stats
+
+
+def cdf_spareto(x, xm, beta):
+ p = 1 - (xm / (np.abs(x) + xm))**beta
+ p = 0.5 + 0.5 * np.sign(x) * p
+ return p
+
+
+def get_spareto(p, beta):
+ cdf = cdf_spareto
+
+ def func(x):
+ return ((cdf(1.5, x, beta) - cdf(0.5, x, beta)) /
+ (1 - cdf(0.5, x, beta)) - p)**2
+
+ alpha = scipy.optimize.fminbound(func, 1e-12, 10000, xtol=1e-12)
+ parray = np.zeros(11)
+ parray[0] = 2 * (cdf(0.5, alpha, beta) - 0.5)
+ parray[1] = (2 * (cdf(1.5, alpha, beta) - cdf(0.5, alpha, beta)))
+ parray[2] = (2 * (cdf(2.5, alpha, beta) - cdf(1.5, alpha, beta)))
+ parray[3] = (2 * (cdf(3.5, alpha, beta) - cdf(2.5, alpha, beta)))
+ parray[4] = (2 * (cdf(4.5, alpha, beta) - cdf(3.5, alpha, beta)))
+ parray[5] = (2 * (cdf(6.5, alpha, beta) - cdf(4.5, alpha, beta)))
+ parray[6] = (2 * (cdf(10.5, alpha, beta) - cdf(6.5, alpha, beta)))
+ parray[7] = (2 * (cdf(18.5, alpha, beta) - cdf(10.5, alpha, beta)))
+ parray[8] = (2 * (cdf(34.5, alpha, beta) - cdf(18.5, alpha, beta)))
+ parray[9] = (2 * (cdf(66.5, alpha, beta) - cdf(34.5, alpha, beta)))
+ parray[10] = 2 * (1. - cdf(66.5, alpha, beta))
+ return parray
+
+
+def quantize_probs(p, save_first_bin, bits):
+ """Quantize probability precisely.
+
+ Quantize probabilities minimizing dH (Kullback-Leibler divergence)
+ approximated by: sum (p_i-q_i)^2/p_i.
+ References:
+ https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+ https://github.com/JarekDuda/AsymmetricNumeralSystemsToolkit
+ """
+ num_sym = p.size
+ p = np.clip(p, 1e-16, 1)
+ L = 2**bits
+ pL = p * L
+ ip = 1. / p # inverse probability
+ q = np.clip(np.round(pL), 1, L + 1 - num_sym)
+ quant_err = (pL - q)**2 * ip
+ sgn = np.sign(L - q.sum()) # direction of correction
+ if sgn != 0: # correction is needed
+ v = [] # heap of adjustment results (adjustment err, index) of each symbol
+ for i in range(1 if save_first_bin else 0, num_sym):
+ q_adj = q[i] + sgn
+ if q_adj > 0 and q_adj < L:
+ adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i]
+ heapq.heappush(v, (adj_err, i))
+ while q.sum() != L:
+ # apply lowest error adjustment
+ (adj_err, i) = heapq.heappop(v)
+ quant_err[i] += adj_err
+ q[i] += sgn
+ # calculate the cost of adjusting this symbol again
+ q_adj = q[i] + sgn
+ if q_adj > 0 and q_adj < L:
+ adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i]
+ heapq.heappush(v, (adj_err, i))
+ return q
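+# Illustrative usage (not part of the original script): quantizing a toy
+# 3-symbol distribution to integer frequencies that sum to 2**4:
+#   quantize_probs(np.array([0.5, 0.3, 0.2]), False, 4)   # -> [8., 5., 3.]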
+
+
+def get_quantized_spareto(p, beta, bits, first_token):
+ parray = get_spareto(p, beta)
+ parray = parray[1:] / (1 - parray[0])
+ # CONFIG_NEW_TOKENSET
+ if first_token > 1:
+ parray = parray[1:] / (1 - parray[0])
+ qarray = quantize_probs(parray, first_token == 1, bits)
+ return qarray.astype(np.int)
+
+
+def main(bits=15, first_token=1):
+ beta = 8
+ for q in range(1, 256):
+ parray = get_quantized_spareto(q / 256., beta, bits, first_token)
+ assert parray.sum() == 2**bits
+ print '{', ', '.join('%d' % i for i in parray), '},'
+
+
+if __name__ == '__main__':
+ if len(sys.argv) > 2:
+ main(int(sys.argv[1]), int(sys.argv[2]))
+ elif len(sys.argv) > 1:
+ main(int(sys.argv[1]))
+ else:
+ main()
diff --git a/third_party/aom/tools/inspect-cli.js b/third_party/aom/tools/inspect-cli.js
new file mode 100644
index 000000000..a14c08111
--- /dev/null
+++ b/third_party/aom/tools/inspect-cli.js
@@ -0,0 +1,39 @@
+/**
+ * This tool lets you test whether the compiled JavaScript decoder is functioning properly. You'll
+ * need to download a SpiderMonkey js-shell to run this script.
+ * https://archive.mozilla.org/pub/firefox/nightly/latest-mozilla-central/
+ *
+ * Example:
+ * js-shell inspect-cli.js video.ivf
+ */
+load("inspect.js");
+var buffer = read(scriptArgs[0], "binary");
+var Module = {
+ noExitRuntime: true,
+ noInitialRun: true,
+ preInit: [],
+ preRun: [],
+ postRun: [function () {
+ printErr(`Loaded Javascript Decoder OK`);
+ }],
+ memoryInitializerPrefixURL: "bin/",
+ arguments: ['input.ivf', 'output.raw'],
+ on_frame_decoded_json: function (jsonString) {
+ let json = JSON.parse("[" + Module.UTF8ToString(jsonString) + "null]");
+ json.forEach(frame => {
+ if (frame) {
+ print(frame.frame);
+ }
+ });
+ }
+};
+DecoderModule(Module);
+Module.FS.writeFile("/tmp/input.ivf", buffer, { encoding: "binary" });
+Module._open_file();
+Module._set_layers(0xFFFFFFFF); // Set this to zero if you want to benchmark decoding.
+while(true) {
+ printErr("Decoding Frame ...");
+ if (Module._read_frame()) {
+ break;
+ }
+}
diff --git a/third_party/aom/tools/inspect-post.js b/third_party/aom/tools/inspect-post.js
new file mode 100644
index 000000000..31c40bb82
--- /dev/null
+++ b/third_party/aom/tools/inspect-post.js
@@ -0,0 +1 @@
+Module["FS"] = FS;
diff --git a/third_party/aom/tools/intersect-diffs.py b/third_party/aom/tools/intersect-diffs.py
new file mode 100755
index 000000000..df13c4ef7
--- /dev/null
+++ b/third_party/aom/tools/intersect-diffs.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Calculates the "intersection" of two unified diffs.
+
+Given two diffs, A and B, it finds all hunks in B that had non-context lines
+in A and prints them to stdout. This is useful to determine the hunks in B that
+are relevant to A. The resulting file can be applied with patch(1) on top of A.
+"""
+
+__author__ = "jkoleszar@google.com"
+
+import sys
+
+import diff
+
+
+def FormatDiffHunks(hunks):
+ """Re-serialize a list of DiffHunks."""
+ r = []
+ last_header = None
+ for hunk in hunks:
+ this_header = hunk.header[0:2]
+ if last_header != this_header:
+ r.extend(hunk.header)
+ last_header = this_header
+ else:
+ r.extend(hunk.header[2])
+ r.extend(hunk.lines)
+ r.append("\n")
+ return "".join(r)
+
+
+def ZipHunks(rhs_hunks, lhs_hunks):
+ """Join two hunk lists on filename."""
+ for rhs_hunk in rhs_hunks:
+ rhs_file = rhs_hunk.right.filename.split("/")[1:]
+
+ for lhs_hunk in lhs_hunks:
+ lhs_file = lhs_hunk.left.filename.split("/")[1:]
+ if lhs_file != rhs_file:
+ continue
+ yield (rhs_hunk, lhs_hunk)
+
+
+def main():
+ old_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[1], "r"))]
+ new_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[2], "r"))]
+ out_hunks = []
+
+ # Join the right hand side of the older diff with the left hand side of the
+ # newer diff.
+ for old_hunk, new_hunk in ZipHunks(old_hunks, new_hunks):
+ if new_hunk in out_hunks:
+ continue
+ old_lines = old_hunk.right
+ new_lines = new_hunk.left
+
+ # Determine if this hunk overlaps any non-context line from the other
+ for i in old_lines.delta_line_nums:
+ if i in new_lines:
+ out_hunks.append(new_hunk)
+ break
+
+ if out_hunks:
+ print FormatDiffHunks(out_hunks)
+ sys.exit(1)
+
+if __name__ == "__main__":
+ main()
diff --git a/third_party/aom/tools/lint-hunks.py b/third_party/aom/tools/lint-hunks.py
new file mode 100755
index 000000000..d02bee16c
--- /dev/null
+++ b/third_party/aom/tools/lint-hunks.py
@@ -0,0 +1,146 @@
+#!/usr/bin/python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Performs style checking on each diff hunk."""
+import getopt
+import os
+import StringIO
+import subprocess
+import sys
+
+import diff
+
+
+SHORT_OPTIONS = "h"
+LONG_OPTIONS = ["help"]
+
+TOPLEVEL_CMD = ["git", "rev-parse", "--show-toplevel"]
+DIFF_CMD = ["git", "diff"]
+DIFF_INDEX_CMD = ["git", "diff-index", "-u", "HEAD", "--"]
+SHOW_CMD = ["git", "show"]
+CPPLINT_FILTERS = ["-readability/casting"]
+
+
+class Usage(Exception):
+ pass
+
+
+class SubprocessException(Exception):
+ def __init__(self, args):
+ msg = "Failed to execute '%s'"%(" ".join(args))
+ super(SubprocessException, self).__init__(msg)
+
+
+class Subprocess(subprocess.Popen):
+ """Adds the notion of an expected returncode to Popen."""
+
+ def __init__(self, args, expected_returncode=0, **kwargs):
+ self._args = args
+ self._expected_returncode = expected_returncode
+ super(Subprocess, self).__init__(args, **kwargs)
+
+ def communicate(self, *args, **kwargs):
+ result = super(Subprocess, self).communicate(*args, **kwargs)
+ if self._expected_returncode is not None:
+ try:
+ ok = self.returncode in self._expected_returncode
+ except TypeError:
+ ok = self.returncode == self._expected_returncode
+ if not ok:
+ raise SubprocessException(self._args)
+ return result
+
+
+def main(argv=None):
+ if argv is None:
+ argv = sys.argv
+ try:
+ try:
+ opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS)
+ except getopt.error, msg:
+ raise Usage(msg)
+
+ # process options
+ for o, _ in opts:
+ if o in ("-h", "--help"):
+ print __doc__
+ sys.exit(0)
+
+ if args and len(args) > 1:
+ print __doc__
+ sys.exit(0)
+
+ # Find the fully qualified path to the root of the tree
+ tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE)
+ tl = tl.communicate()[0].strip()
+
+ # See if we're working on the index or not.
+ if args:
+ diff_cmd = DIFF_CMD + [args[0] + "^!"]
+ else:
+ diff_cmd = DIFF_INDEX_CMD
+
+ # Build the command line to execute cpplint
+ cpplint_cmd = [os.path.join(tl, "tools", "cpplint.py"),
+ "--filter=" + ",".join(CPPLINT_FILTERS),
+ "-"]
+
+ # Get a list of all affected lines
+ file_affected_line_map = {}
+ p = Subprocess(diff_cmd, stdout=subprocess.PIPE)
+ stdout = p.communicate()[0]
+ for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)):
+ filename = hunk.right.filename[2:]
+ if filename not in file_affected_line_map:
+ file_affected_line_map[filename] = set()
+ file_affected_line_map[filename].update(hunk.right.delta_line_nums)
+
+ # Run each affected file through cpplint
+ lint_failed = False
+ for filename, affected_lines in file_affected_line_map.iteritems():
+ if filename.split(".")[-1] not in ("c", "h", "cc"):
+ continue
+
+ if args:
+ # File contents come from git
+ show_cmd = SHOW_CMD + [args[0] + ":" + filename]
+ show = Subprocess(show_cmd, stdout=subprocess.PIPE)
+ lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
+ stdin=show.stdout, stderr=subprocess.PIPE)
+ lint_out = lint.communicate()[1]
+ else:
+ # File contents come from the working tree
+ lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
+ stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdin = open(os.path.join(tl, filename)).read()
+ lint_out = lint.communicate(stdin)[1]
+
+ for line in lint_out.split("\n"):
+ fields = line.split(":")
+ if fields[0] != "-":
+ continue
+ warning_line_num = int(fields[1])
+ if warning_line_num in affected_lines:
+ print "%s:%d:%s"%(filename, warning_line_num,
+ ":".join(fields[2:]))
+ lint_failed = True
+
+ # Set exit code if any relevant lint errors seen
+ if lint_failed:
+ return 1
+
+ except Usage, err:
+ print >>sys.stderr, err
+ print >>sys.stderr, "for help use --help"
+ return 2
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/third_party/aom/tools/obu_parser.cc b/third_party/aom/tools/obu_parser.cc
new file mode 100644
index 000000000..7d71386ce
--- /dev/null
+++ b/third_party/aom/tools/obu_parser.cc
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <string.h>
+
+#include <cstdio>
+#include <string>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/obu_util.h"
+#include "tools/obu_parser.h"
+
+namespace aom_tools {
+
+// Basic OBU syntax
+// 8 bits: Header
+// 7
+// forbidden bit
+// 6,5,4,3
+// type bits
+// 2
+// extension flag bit
+// 1
+// has size field bit
+// 0
+// reserved bit
+const uint32_t kObuForbiddenBitMask = 0x1;
+const uint32_t kObuForbiddenBitShift = 7;
+const uint32_t kObuTypeBitsMask = 0xF;
+const uint32_t kObuTypeBitsShift = 3;
+const uint32_t kObuExtensionFlagBitMask = 0x1;
+const uint32_t kObuExtensionFlagBitShift = 2;
+const uint32_t kObuHasSizeFieldBitMask = 0x1;
+const uint32_t kObuHasSizeFieldBitShift = 1;
+
+// When extension flag bit is set:
+// 8 bits: extension header
+// 7,6,5
+// temporal ID
+// 4,3
+// spatial ID
+// 2,1,0
+// reserved bits
+const uint32_t kObuExtTemporalIdBitsMask = 0x7;
+const uint32_t kObuExtTemporalIdBitsShift = 5;
+const uint32_t kObuExtSpatialIdBitsMask = 0x3;
+const uint32_t kObuExtSpatialIdBitsShift = 3;
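+// Illustrative example (not part of the parser itself): a header byte of 0x0A
+// is 0b00001010, which the masks/shifts above decode as forbidden=0,
+// type=1 (OBU_SEQUENCE_HEADER), extension flag=0, has_size_field=1.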
+
+bool ValidObuType(int obu_type) {
+ switch (obu_type) {
+ case OBU_SEQUENCE_HEADER:
+ case OBU_TEMPORAL_DELIMITER:
+ case OBU_FRAME_HEADER:
+ case OBU_TILE_GROUP:
+ case OBU_METADATA:
+ case OBU_FRAME:
+ case OBU_REDUNDANT_FRAME_HEADER:
+ case OBU_TILE_LIST:
+ case OBU_PADDING: return true;
+ }
+ return false;
+}
+
+bool ParseObuHeader(uint8_t obu_header_byte, ObuHeader *obu_header) {
+ const int forbidden_bit =
+ (obu_header_byte >> kObuForbiddenBitShift) & kObuForbiddenBitMask;
+ if (forbidden_bit) {
+ fprintf(stderr, "Invalid OBU, forbidden bit set.\n");
+ return false;
+ }
+
+ obu_header->type = static_cast<OBU_TYPE>(
+ (obu_header_byte >> kObuTypeBitsShift) & kObuTypeBitsMask);
+ if (!ValidObuType(obu_header->type)) {
+ fprintf(stderr, "Invalid OBU type: %d.\n", obu_header->type);
+ return false;
+ }
+
+ obu_header->has_extension =
+ (obu_header_byte >> kObuExtensionFlagBitShift) & kObuExtensionFlagBitMask;
+ obu_header->has_size_field =
+ (obu_header_byte >> kObuHasSizeFieldBitShift) & kObuHasSizeFieldBitMask;
+ return true;
+}
+
+bool ParseObuExtensionHeader(uint8_t ext_header_byte, ObuHeader *obu_header) {
+ obu_header->temporal_layer_id =
+ (ext_header_byte >> kObuExtTemporalIdBitsShift) &
+ kObuExtTemporalIdBitsMask;
+ obu_header->spatial_layer_id =
+ (ext_header_byte >> kObuExtSpatialIdBitsShift) & kObuExtSpatialIdBitsMask;
+
+ return true;
+}
+
+void PrintObuHeader(const ObuHeader *header) {
+ printf(
+ " OBU type: %s\n"
+ " extension: %s\n",
+ aom_obu_type_to_string(static_cast<OBU_TYPE>(header->type)),
+ header->has_extension ? "yes" : "no");
+ if (header->has_extension) {
+ printf(
+ " temporal_id: %d\n"
+ " spatial_id: %d\n",
+        header->temporal_layer_id, header->spatial_layer_id);
+ }
+}
+
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes) {
+ const int kObuHeaderSizeBytes = 1;
+ const int kMinimumBytesRequired = 1 + kObuHeaderSizeBytes;
+ int consumed = 0;
+ int obu_overhead = 0;
+ ObuHeader obu_header;
+ while (consumed < length) {
+ const int remaining = length - consumed;
+ if (remaining < kMinimumBytesRequired) {
+ fprintf(stderr,
+ "OBU parse error. Did not consume all data, %d bytes remain.\n",
+ remaining);
+ return false;
+ }
+
+ int obu_header_size = 0;
+
+ memset(&obu_header, 0, sizeof(obu_header));
+ const uint8_t obu_header_byte = *(data + consumed);
+ if (!ParseObuHeader(obu_header_byte, &obu_header)) {
+ fprintf(stderr, "OBU parsing failed at offset %d.\n", consumed);
+ return false;
+ }
+
+ ++obu_overhead;
+ ++obu_header_size;
+
+ if (obu_header.has_extension) {
+ const uint8_t obu_ext_header_byte =
+ *(data + consumed + kObuHeaderSizeBytes);
+ if (!ParseObuExtensionHeader(obu_ext_header_byte, &obu_header)) {
+ fprintf(stderr, "OBU extension parsing failed at offset %d.\n",
+ consumed + kObuHeaderSizeBytes);
+ return false;
+ }
+
+ ++obu_overhead;
+ ++obu_header_size;
+ }
+
+ PrintObuHeader(&obu_header);
+
+ uint64_t obu_size = 0;
+ size_t length_field_size = 0;
+ if (aom_uleb_decode(data + consumed + obu_header_size,
+ remaining - obu_header_size, &obu_size,
+ &length_field_size) != 0) {
+ fprintf(stderr, "OBU size parsing failed at offset %d.\n",
+ consumed + obu_header_size);
+ return false;
+ }
+ int current_obu_length = static_cast<int>(obu_size);
+ if (obu_header_size + static_cast<int>(length_field_size) +
+ current_obu_length >
+ remaining) {
+ fprintf(stderr, "OBU parsing failed: not enough OBU data.\n");
+ return false;
+ }
+ consumed += obu_header_size + static_cast<int>(length_field_size) +
+ current_obu_length;
+ printf(" length: %d\n",
+ static_cast<int>(obu_header_size + length_field_size +
+ current_obu_length));
+ }
+
+ if (obu_overhead_bytes != nullptr) *obu_overhead_bytes = obu_overhead;
+ printf(" TU size: %d\n", consumed);
+
+ return true;
+}
+
+} // namespace aom_tools
diff --git a/third_party/aom/tools/obu_parser.h b/third_party/aom/tools/obu_parser.h
new file mode 100644
index 000000000..1d7d2d794
--- /dev/null
+++ b/third_party/aom/tools/obu_parser.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TOOLS_OBU_PARSER_H_
+#define AOM_TOOLS_OBU_PARSER_H_
+
+#include <cstdint>
+
+namespace aom_tools {
+
+// Print information obtained from OBU(s) in data until data is exhausted or an
+// error occurs. Returns true when all data is consumed successfully, and
+// optionally reports OBU storage overhead via obu_overhead_bytes when the
+// pointer is non-null.
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes);
+
+} // namespace aom_tools
+
+#endif // AOM_TOOLS_OBU_PARSER_H_
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc b/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc
new file mode 100644
index 000000000..7c5400b91
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_gen_code.cc
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <string.h>
+
+#include "tools/txfm_analyzer/txfm_graph.h"
+
+typedef enum CODE_TYPE {
+ CODE_TYPE_C,
+ CODE_TYPE_SSE2,
+ CODE_TYPE_SSE4_1
+} CODE_TYPE;
+
+int get_cos_idx(double value, int mod) {
+ return round(acos(fabs(value)) / PI * mod);
+}
+
+char *cos_text_arr(double value, int mod, char *text, int size) {
+ int num = get_cos_idx(value, mod);
+ if (value < 0) {
+ snprintf(text, size, "-cospi[%2d]", num);
+ } else {
+ snprintf(text, size, " cospi[%2d]", num);
+ }
+
+ if (num == 0)
+ printf("v: %f -> %d/%d v==-1 is %d\n", value, num, mod, value == -1);
+
+ return text;
+}
+
+char *cos_text_sse2(double w0, double w1, int mod, char *text, int size) {
+ int idx0 = get_cos_idx(w0, mod);
+ int idx1 = get_cos_idx(w1, mod);
+ char p[] = "p";
+ char n[] = "m";
+ char *sgn0 = w0 < 0 ? n : p;
+ char *sgn1 = w1 < 0 ? n : p;
+ snprintf(text, size, "cospi_%s%02d_%s%02d", sgn0, idx0, sgn1, idx1);
+ return text;
+}
+
+char *cos_text_sse4_1(double w, int mod, char *text, int size) {
+ int idx = get_cos_idx(w, mod);
+ char p[] = "p";
+ char n[] = "m";
+ char *sgn = w < 0 ? n : p;
+ snprintf(text, size, "cospi_%s%02d", sgn, idx);
+ return text;
+}
+
+void node_to_code_c(Node *node, const char *buf0, const char *buf1) {
+ int cnt = 0;
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++;
+ }
+ if (cnt == 2) {
+ int cnt2 = 0;
+ printf(" %s[%d] =", buf1, node->nodeIdx);
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node->inWeight[i]) == 1) {
+ cnt2++;
+ }
+ }
+ if (cnt2 == 2) {
+ printf(" apply_value(");
+ }
+ int cnt1 = 0;
+ for (int i = 0; i < 2; i++) {
+ if (node->inWeight[i] == 1) {
+ if (cnt1 > 0)
+ printf(" + %s[%d]", buf0, node->inNodeIdx[i]);
+ else
+ printf(" %s[%d]", buf0, node->inNodeIdx[i]);
+ cnt1++;
+ } else if (node->inWeight[i] == -1) {
+ if (cnt1 > 0)
+ printf(" - %s[%d]", buf0, node->inNodeIdx[i]);
+ else
+ printf("-%s[%d]", buf0, node->inNodeIdx[i]);
+ cnt1++;
+ }
+ }
+ if (cnt2 == 2) {
+ printf(", stage_range[stage])");
+ }
+ printf(";\n");
+ } else {
+ char w0[100];
+ char w1[100];
+ printf(
+ " %s[%d] = half_btf(%s, %s[%d], %s, %s[%d], "
+ "cos_bit);\n",
+ buf1, node->nodeIdx, cos_text_arr(node->inWeight[0], COS_MOD, w0, 100),
+ buf0, node->inNodeIdx[0],
+ cos_text_arr(node->inWeight[1], COS_MOD, w1, 100), buf0,
+ node->inNodeIdx[1]);
+ }
+}
+
+void gen_code_c(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+ char *fun_name = new char[100];
+ get_fun_name(fun_name, 100, type, node_num);
+
+ printf("\n");
+ printf(
+ "void av1_%s(const int32_t *input, int32_t *output, int8_t cos_bit, "
+ "const int8_t* stage_range) "
+ "{\n",
+ fun_name);
+ printf(" assert(output != input);\n");
+ printf(" const int32_t size = %d;\n", node_num);
+ printf(" const int32_t *cospi = cospi_arr(cos_bit);\n");
+ printf("\n");
+
+ printf(" int32_t stage = 0;\n");
+ printf(" int32_t *bf0, *bf1;\n");
+ printf(" int32_t step[%d];\n", node_num);
+
+ const char *buf0 = "bf0";
+ const char *buf1 = "bf1";
+ const char *input = "input";
+
+ int si = 0;
+ printf("\n");
+ printf(" // stage %d;\n", si);
+ printf(" apply_range(stage, input, %s, size, stage_range[stage]);\n", input);
+
+ si = 1;
+ printf("\n");
+ printf(" // stage %d;\n", si);
+ printf(" stage++;\n");
+ if (si % 2 == (stage_num - 1) % 2) {
+ printf(" %s = output;\n", buf1);
+ } else {
+ printf(" %s = step;\n", buf1);
+ }
+
+ for (int ni = 0; ni < node_num; ni++) {
+ int idx = get_idx(si, ni, node_num);
+ node_to_code_c(node + idx, input, buf1);
+ }
+
+ printf(" range_check_buf(stage, input, bf1, size, stage_range[stage]);\n");
+
+ for (int si = 2; si < stage_num; si++) {
+ printf("\n");
+ printf(" // stage %d\n", si);
+ printf(" stage++;\n");
+ if (si % 2 == (stage_num - 1) % 2) {
+ printf(" %s = step;\n", buf0);
+ printf(" %s = output;\n", buf1);
+ } else {
+ printf(" %s = output;\n", buf0);
+ printf(" %s = step;\n", buf1);
+ }
+
+ // computation code
+ for (int ni = 0; ni < node_num; ni++) {
+ int idx = get_idx(si, ni, node_num);
+ node_to_code_c(node + idx, buf0, buf1);
+ }
+
+ if (si != stage_num - 1) {
+ printf(
+ " range_check_buf(stage, input, bf1, size, stage_range[stage]);\n");
+ }
+ }
+ printf(" apply_range(stage, input, output, size, stage_range[stage]);\n");
+ printf("}\n");
+}
+
+void single_node_to_code_sse2(Node *node, const char *buf0, const char *buf1) {
+ printf(" %s[%2d] =", buf1, node->nodeIdx);
+ if (node->inWeight[0] == 1 && node->inWeight[1] == 1) {
+ printf(" _mm_adds_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+ node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) {
+ printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+ node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) {
+ printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0,
+ node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) {
+ printf(" %s[%d]", buf0, node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) {
+ printf(" %s[%d]", buf0, node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) {
+ printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) {
+ printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[1]);
+ }
+ printf(";\n");
+}
+
+void pair_node_to_code_sse2(Node *node, Node *partnerNode, const char *buf0,
+ const char *buf1) {
+ char temp0[100];
+ char temp1[100];
+ // btf_16_sse2_type0(w0, w1, in0, in1, out0, out1)
+ if (node->inNodeIdx[0] != partnerNode->inNodeIdx[0])
+ printf(" btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n",
+ cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0,
+ 100),
+ cos_text_sse2(partnerNode->inWeight[1], partnerNode->inWeight[0],
+ COS_MOD, temp1, 100),
+ buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1,
+ node->nodeIdx, buf1, partnerNode->nodeIdx);
+ else
+ printf(" btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n",
+ cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0,
+ 100),
+ cos_text_sse2(partnerNode->inWeight[0], partnerNode->inWeight[1],
+ COS_MOD, temp1, 100),
+ buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1,
+ node->nodeIdx, buf1, partnerNode->nodeIdx);
+}
+
+Node *get_partner_node(Node *node) {
+ int diff = node->inNode[1]->nodeIdx - node->nodeIdx;
+ return node + diff;
+}
+
+void node_to_code_sse2(Node *node, const char *buf0, const char *buf1) {
+ int cnt = 0;
+ int cnt1 = 0;
+ if (node->visited == 0) {
+ node->visited = 1;
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++;
+ if (fabs(node->inWeight[i]) == 1) cnt1++;
+ }
+ if (cnt == 2) {
+ if (cnt1 == 2) {
+ // has a partner
+ Node *partnerNode = get_partner_node(node);
+ partnerNode->visited = 1;
+ single_node_to_code_sse2(node, buf0, buf1);
+ single_node_to_code_sse2(partnerNode, buf0, buf1);
+ } else {
+ single_node_to_code_sse2(node, buf0, buf1);
+ }
+ } else {
+ Node *partnerNode = get_partner_node(node);
+ partnerNode->visited = 1;
+ pair_node_to_code_sse2(node, partnerNode, buf0, buf1);
+ }
+ }
+}
+
+void gen_cospi_list_sse2(Node *node, int stage_num, int node_num) {
+ int visited[65][65][2][2];
+ memset(visited, 0, sizeof(visited));
+ char text[100];
+ char text1[100];
+ char text2[100];
+ int size = 100;
+ printf("\n");
+ for (int si = 1; si < stage_num; si++) {
+ for (int ni = 0; ni < node_num; ni++) {
+ int idx = get_idx(si, ni, node_num);
+ int cnt = 0;
+ Node *node0 = node + idx;
+ if (node0->visited == 0) {
+ node0->visited = 1;
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node0->inWeight[i]) == 1 || fabs(node0->inWeight[i]) == 0)
+ cnt++;
+ }
+ if (cnt != 2) {
+ {
+ double w0 = node0->inWeight[0];
+ double w1 = node0->inWeight[1];
+ int idx0 = get_cos_idx(w0, COS_MOD);
+ int idx1 = get_cos_idx(w1, COS_MOD);
+ int sgn0 = w0 < 0 ? 1 : 0;
+ int sgn1 = w1 < 0 ? 1 : 0;
+
+ if (!visited[idx0][idx1][sgn0][sgn1]) {
+ visited[idx0][idx1][sgn0][sgn1] = 1;
+ printf(" __m128i %s = pair_set_epi16(%s, %s);\n",
+ cos_text_sse2(w0, w1, COS_MOD, text, size),
+ cos_text_arr(w0, COS_MOD, text1, size),
+ cos_text_arr(w1, COS_MOD, text2, size));
+ }
+ }
+ Node *node1 = get_partner_node(node0);
+ node1->visited = 1;
+ if (node1->inNode[0]->nodeIdx != node0->inNode[0]->nodeIdx) {
+ double w0 = node1->inWeight[0];
+ double w1 = node1->inWeight[1];
+ int idx0 = get_cos_idx(w0, COS_MOD);
+ int idx1 = get_cos_idx(w1, COS_MOD);
+ int sgn0 = w0 < 0 ? 1 : 0;
+ int sgn1 = w1 < 0 ? 1 : 0;
+
+ if (!visited[idx1][idx0][sgn1][sgn0]) {
+ visited[idx1][idx0][sgn1][sgn0] = 1;
+ printf(" __m128i %s = pair_set_epi16(%s, %s);\n",
+ cos_text_sse2(w1, w0, COS_MOD, text, size),
+ cos_text_arr(w1, COS_MOD, text1, size),
+ cos_text_arr(w0, COS_MOD, text2, size));
+ }
+ } else {
+ double w0 = node1->inWeight[0];
+ double w1 = node1->inWeight[1];
+ int idx0 = get_cos_idx(w0, COS_MOD);
+ int idx1 = get_cos_idx(w1, COS_MOD);
+ int sgn0 = w0 < 0 ? 1 : 0;
+ int sgn1 = w1 < 0 ? 1 : 0;
+
+ if (!visited[idx0][idx1][sgn0][sgn1]) {
+ visited[idx0][idx1][sgn0][sgn1] = 1;
+ printf(" __m128i %s = pair_set_epi16(%s, %s);\n",
+ cos_text_sse2(w0, w1, COS_MOD, text, size),
+ cos_text_arr(w0, COS_MOD, text1, size),
+ cos_text_arr(w1, COS_MOD, text2, size));
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void gen_code_sse2(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+ char *fun_name = new char[100];
+ get_fun_name(fun_name, 100, type, node_num);
+
+ printf("\n");
+ printf(
+ "void %s_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) "
+ "{\n",
+ fun_name);
+
+ printf(" const int32_t* cospi = cospi_arr(cos_bit);\n");
+ printf(" const __m128i __zero = _mm_setzero_si128();\n");
+ printf(" const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n");
+
+ graph_reset_visited(node, stage_num, node_num);
+ gen_cospi_list_sse2(node, stage_num, node_num);
+ graph_reset_visited(node, stage_num, node_num);
+ for (int si = 1; si < stage_num; si++) {
+ char in[100];
+ char out[100];
+ printf("\n");
+ printf(" // stage %d\n", si);
+ if (si == 1)
+ snprintf(in, 100, "%s", "input");
+ else
+ snprintf(in, 100, "x%d", si - 1);
+ if (si == stage_num - 1) {
+ snprintf(out, 100, "%s", "output");
+ } else {
+ snprintf(out, 100, "x%d", si);
+ printf(" __m128i %s[%d];\n", out, node_num);
+ }
+ // computation code
+ for (int ni = 0; ni < node_num; ni++) {
+ int idx = get_idx(si, ni, node_num);
+ node_to_code_sse2(node + idx, in, out);
+ }
+ }
+
+ printf("}\n");
+}
+void gen_cospi_list_sse4_1(Node *node, int stage_num, int node_num) {
+ int visited[65][2];
+ memset(visited, 0, sizeof(visited));
+ char text[100];
+ char text1[100];
+ int size = 100;
+ printf("\n");
+ for (int si = 1; si < stage_num; si++) {
+ for (int ni = 0; ni < node_num; ni++) {
+ int idx = get_idx(si, ni, node_num);
+ Node *node0 = node + idx;
+ if (node0->visited == 0) {
+ int cnt = 0;
+ node0->visited = 1;
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node0->inWeight[i]) == 1 || fabs(node0->inWeight[i]) == 0)
+ cnt++;
+ }
+ if (cnt != 2) {
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node0->inWeight[i]) != 1 &&
+ fabs(node0->inWeight[i]) != 0) {
+ double w = node0->inWeight[i];
+ int idx = get_cos_idx(w, COS_MOD);
+ int sgn = w < 0 ? 1 : 0;
+
+ if (!visited[idx][sgn]) {
+ visited[idx][sgn] = 1;
+ printf(" __m128i %s = _mm_set1_epi32(%s);\n",
+ cos_text_sse4_1(w, COS_MOD, text, size),
+ cos_text_arr(w, COS_MOD, text1, size));
+ }
+ }
+ }
+ Node *node1 = get_partner_node(node0);
+ node1->visited = 1;
+ }
+ }
+ }
+ }
+}
+
+void single_node_to_code_sse4_1(Node *node, const char *buf0,
+ const char *buf1) {
+ printf(" %s[%2d] =", buf1, node->nodeIdx);
+ if (node->inWeight[0] == 1 && node->inWeight[1] == 1) {
+ printf(" _mm_add_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+ node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) {
+ printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0,
+ node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) {
+ printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0,
+ node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) {
+ printf(" %s[%d]", buf0, node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) {
+ printf(" %s[%d]", buf0, node->inNodeIdx[1]);
+ } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) {
+ printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, node->inNodeIdx[0]);
+ } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) {
+ printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, node->inNodeIdx[1]);
+ }
+ printf(";\n");
+}
+
+void pair_node_to_code_sse4_1(Node *node, Node *partnerNode, const char *buf0,
+ const char *buf1) {
+ char temp0[100];
+ char temp1[100];
+ if (node->inWeight[0] * partnerNode->inWeight[0] < 0) {
+ /* type0
+ * cos sin
+ * sin -cos
+ */
+ // btf_32_sse2_type0(w0, w1, in0, in1, out0, out1)
+ // out0 = w0*in0 + w1*in1
+ // out1 = -w0*in1 + w1*in0
+ printf(
+ " btf_32_type0_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], "
+ "__rounding, cos_bit);\n",
+ cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100),
+ cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100), buf0,
+ node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, node->nodeIdx, buf1,
+ partnerNode->nodeIdx);
+ } else {
+ /* type1
+ * cos sin
+ * -sin cos
+ */
+ // btf_32_sse2_type1(w0, w1, in0, in1, out0, out1)
+ // out0 = w0*in0 + w1*in1
+ // out1 = w0*in1 - w1*in0
+ printf(
+ " btf_32_type1_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], "
+ "__rounding, cos_bit);\n",
+ cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100),
+ cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100), buf0,
+ node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, node->nodeIdx, buf1,
+ partnerNode->nodeIdx);
+ }
+}
+
+void node_to_code_sse4_1(Node *node, const char *buf0, const char *buf1) {
+ int cnt = 0;
+ int cnt1 = 0;
+ if (node->visited == 0) {
+ node->visited = 1;
+ for (int i = 0; i < 2; i++) {
+ if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++;
+ if (fabs(node->inWeight[i]) == 1) cnt1++;
+ }
+ if (cnt == 2) {
+ if (cnt1 == 2) {
+ // has a partner
+ Node *partnerNode = get_partner_node(node);
+ partnerNode->visited = 1;
+ single_node_to_code_sse4_1(node, buf0, buf1);
+ single_node_to_code_sse4_1(partnerNode, buf0, buf1);
+ } else {
+ single_node_to_code_sse2(node, buf0, buf1);
+ }
+ } else {
+ Node *partnerNode = get_partner_node(node);
+ partnerNode->visited = 1;
+ pair_node_to_code_sse4_1(node, partnerNode, buf0, buf1);
+ }
+ }
+}
+
+void gen_code_sse4_1(Node *node, int stage_num, int node_num, TYPE_TXFM type) {
+ char *fun_name = new char[100];
+ get_fun_name(fun_name, 100, type, node_num);
+
+ printf("\n");
+ printf(
+ "void %s_sse4_1(const __m128i *input, __m128i *output, int8_t cos_bit) "
+ "{\n",
+ fun_name);
+
+ printf(" const int32_t* cospi = cospi_arr(cos_bit);\n");
+ printf(" const __m128i __zero = _mm_setzero_si128();\n");
+ printf(" const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n");
+
+ graph_reset_visited(node, stage_num, node_num);
+ gen_cospi_list_sse4_1(node, stage_num, node_num);
+ graph_reset_visited(node, stage_num, node_num);
+ for (int si = 1; si < stage_num; si++) {
+ char in[100];
+ char out[100];
+ printf("\n");
+ printf(" // stage %d\n", si);
+ if (si == 1)
+ snprintf(in, 100, "%s", "input");
+ else
+ snprintf(in, 100, "x%d", si - 1);
+ if (si == stage_num - 1) {
+ snprintf(out, 100, "%s", "output");
+ } else {
+ snprintf(out, 100, "x%d", si);
+ printf(" __m128i %s[%d];\n", out, node_num);
+ }
+ // computation code
+ for (int ni = 0; ni < node_num; ni++) {
+ int idx = get_idx(si, ni, node_num);
+ node_to_code_sse4_1(node + idx, in, out);
+ }
+ }
+
+  printf("}\n");
+  delete[] fun_name;
+}
+
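+// Build the stage_num x node_num dataflow graph for a single 1-D transform of
+// size node_num, then print a C, SSE2, or SSE4.1 implementation of it to
+// stdout depending on code_type.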
+void gen_hybrid_code(CODE_TYPE code_type, TYPE_TXFM txfm_type, int node_num) {
+ int stage_num = get_hybrid_stage_num(txfm_type, node_num);
+
+ Node *node = new Node[node_num * stage_num];
+ init_graph(node, stage_num, node_num);
+
+ gen_hybrid_graph_1d(node, stage_num, node_num, 0, 0, node_num, txfm_type);
+
+ switch (code_type) {
+ case CODE_TYPE_C: gen_code_c(node, stage_num, node_num, txfm_type); break;
+ case CODE_TYPE_SSE2:
+ gen_code_sse2(node, stage_num, node_num, txfm_type);
+ break;
+ case CODE_TYPE_SSE4_1:
+ gen_code_sse4_1(node, stage_num, node_num, txfm_type);
+ break;
+ }
+
+ delete[] node;
+}
+
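+// Emit SSE4.1 implementations for every transform type (DCT, ADST and their
+// inverses) at sizes 4, 8, 16, 32 and 64.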
+int main(int argc, char **argv) {
+ CODE_TYPE code_type = CODE_TYPE_SSE4_1;
+ for (int txfm_type = TYPE_DCT; txfm_type < TYPE_LAST; txfm_type++) {
+ for (int node_num = 4; node_num <= 64; node_num *= 2) {
+ gen_hybrid_code(code_type, (TYPE_TXFM)txfm_type, node_num);
+ }
+ }
+ return 0;
+}
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_graph.cc b/third_party/aom/tools/txfm_analyzer/txfm_graph.cc
new file mode 100644
index 000000000..a24906100
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_graph.cc
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "tools/txfm_analyzer/txfm_graph.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+typedef struct Node Node;
+
+void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type,
+ const int txfm_size) {
+ if (type == TYPE_DCT)
+ snprintf(str_fun_name, str_buf_size, "fdct%d_new", txfm_size);
+ else if (type == TYPE_ADST)
+ snprintf(str_fun_name, str_buf_size, "fadst%d_new", txfm_size);
+ else if (type == TYPE_IDCT)
+ snprintf(str_fun_name, str_buf_size, "idct%d_new", txfm_size);
+ else if (type == TYPE_IADST)
+ snprintf(str_fun_name, str_buf_size, "iadst%d_new", txfm_size);
+}
+
+void get_txfm_type_name(char *str_fun_name, int str_buf_size,
+ const TYPE_TXFM type, const int txfm_size) {
+ if (type == TYPE_DCT)
+ snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size);
+ else if (type == TYPE_ADST)
+ snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size);
+ else if (type == TYPE_IDCT)
+ snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size);
+ else if (type == TYPE_IADST)
+ snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size);
+}
+
+void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0,
+ const TYPE_TXFM type1, const int txfm_size0,
+ const int txfm_size1) {
+ if (type0 == TYPE_DCT && type1 == TYPE_DCT)
+ snprintf(buf, buf_size, "_dct_dct_%dx%d", txfm_size1, txfm_size0);
+ else if (type0 == TYPE_DCT && type1 == TYPE_ADST)
+ snprintf(buf, buf_size, "_dct_adst_%dx%d", txfm_size1, txfm_size0);
+ else if (type0 == TYPE_ADST && type1 == TYPE_ADST)
+ snprintf(buf, buf_size, "_adst_adst_%dx%d", txfm_size1, txfm_size0);
+ else if (type0 == TYPE_ADST && type1 == TYPE_DCT)
+ snprintf(buf, buf_size, "_adst_dct_%dx%d", txfm_size1, txfm_size0);
+}
+
+TYPE_TXFM get_inv_type(TYPE_TXFM type) {
+ if (type == TYPE_DCT)
+ return TYPE_IDCT;
+ else if (type == TYPE_ADST)
+ return TYPE_IADST;
+ else if (type == TYPE_IDCT)
+ return TYPE_DCT;
+ else if (type == TYPE_IADST)
+ return TYPE_ADST;
+ else
+ return TYPE_LAST;
+}
+
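+// Reference (floating point) DCT-II of length `size`:
+//   out[k] = sum_n in[n] * cos(PI * (2n + 1) * k / (2 * size)),
+// with the DC term additionally scaled by 1/sqrt(2).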
+void reference_dct_1d(double *in, double *out, int size) {
+ const double kInvSqrt2 = 0.707106781186547524400844362104;
+ for (int k = 0; k < size; k++) {
+ out[k] = 0; // initialize out[k]
+ for (int n = 0; n < size; n++) {
+ out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size));
+ }
+ if (k == 0) out[k] = out[k] * kInvSqrt2;
+ }
+}
+
+void reference_dct_2d(double *in, double *out, int size) {
+ double *tempOut = new double[size * size];
+ // dct each row: in -> out
+ for (int r = 0; r < size; r++) {
+ reference_dct_1d(in + r * size, out + r * size, size);
+ }
+
+ for (int r = 0; r < size; r++) {
+    // transpose out into tempOut
+ for (int c = 0; c < size; c++) {
+ tempOut[r * size + c] = out[c * size + r];
+ }
+ }
+ for (int r = 0; r < size; r++) {
+ reference_dct_1d(tempOut + r * size, out + r * size, size);
+ }
+ delete[] tempOut;
+}
+
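+// Reference (floating point) ADST of length `size`:
+//   out[k] = sum_n in[n] * sin(PI * (2n + 1) * (2k + 1) / (4 * size)).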
+void reference_adst_1d(double *in, double *out, int size) {
+ for (int k = 0; k < size; k++) {
+ out[k] = 0; // initialize out[k]
+ for (int n = 0; n < size; n++) {
+ out[k] += in[n] * sin(PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
+ }
+ }
+}
+
+void reference_hybrid_2d(double *in, double *out, int size, int type0,
+ int type1) {
+ double *tempOut = new double[size * size];
+ // dct each row: in -> out
+ for (int r = 0; r < size; r++) {
+ if (type0 == TYPE_DCT)
+ reference_dct_1d(in + r * size, out + r * size, size);
+ else
+ reference_adst_1d(in + r * size, out + r * size, size);
+ }
+
+ for (int r = 0; r < size; r++) {
+    // transpose out into tempOut
+ for (int c = 0; c < size; c++) {
+ tempOut[r * size + c] = out[c * size + r];
+ }
+ }
+ for (int r = 0; r < size; r++) {
+ if (type1 == TYPE_DCT)
+ reference_dct_1d(tempOut + r * size, out + r * size, size);
+ else
+ reference_adst_1d(tempOut + r * size, out + r * size, size);
+ }
+ delete[] tempOut;
+}
+
+void reference_hybrid_2d_new(double *in, double *out, int size0, int size1,
+ int type0, int type1) {
+ double *tempOut = new double[size0 * size1];
+ // dct each row: in -> out
+ for (int r = 0; r < size1; r++) {
+ if (type0 == TYPE_DCT)
+ reference_dct_1d(in + r * size0, out + r * size0, size0);
+ else
+ reference_adst_1d(in + r * size0, out + r * size0, size0);
+ }
+
+ for (int r = 0; r < size1; r++) {
+    // transpose out into tempOut
+ for (int c = 0; c < size0; c++) {
+ tempOut[c * size1 + r] = out[r * size0 + c];
+ }
+ }
+ for (int r = 0; r < size0; r++) {
+ if (type1 == TYPE_DCT)
+ reference_dct_1d(tempOut + r * size1, out + r * size1, size1);
+ else
+ reference_adst_1d(tempOut + r * size1, out + r * size1, size1);
+ }
+ delete[] tempOut;
+}
+
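+// Index of the highest set bit, i.e. floor(log2(x)) for x > 0;
+// e.g. get_max_bit(8) == 3, get_max_bit(1) == 0.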
+unsigned int get_max_bit(unsigned int x) {
+ int max_bit = -1;
+ while (x) {
+ x = x >> 1;
+ max_bit++;
+ }
+ return max_bit;
+}
+
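+// Reverse the lowest (max_bit + 1) bits of x;
+// e.g. bitwise_reverse(1, 2) == 4 (001 -> 100).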
+unsigned int bitwise_reverse(unsigned int x, int max_bit) {
+ x = ((x >> 16) & 0x0000ffff) | ((x & 0x0000ffff) << 16);
+ x = ((x >> 8) & 0x00ff00ff) | ((x & 0x00ff00ff) << 8);
+ x = ((x >> 4) & 0x0f0f0f0f) | ((x & 0x0f0f0f0f) << 4);
+ x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2);
+ x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1);
+ x = x >> (31 - max_bit);
+ return x;
+}
+
+int get_idx(int ri, int ci, int cSize) { return ri * cSize + ci; }
+
+void add_node(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int in, double w) {
+ int outIdx = get_idx(stage_idx, node_idx, node_num);
+ int inIdx = get_idx(stage_idx - 1, in, node_num);
+ int idx = node[outIdx].inNodeNum;
+ if (idx < 2) {
+ node[outIdx].inNode[idx] = &node[inIdx];
+ node[outIdx].inNodeIdx[idx] = in;
+ node[outIdx].inWeight[idx] = w;
+ idx++;
+ node[outIdx].inNodeNum = idx;
+ } else {
+    printf("Error: inNode is full\n");
+ }
+}
+
+void connect_node(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int in0, double w0, int in1, double w1) {
+ int outIdx = get_idx(stage_idx, node_idx, node_num);
+ int inIdx0 = get_idx(stage_idx - 1, in0, node_num);
+ int inIdx1 = get_idx(stage_idx - 1, in1, node_num);
+
+ int idx = 0;
+ // if(w0 != 0) {
+ node[outIdx].inNode[idx] = &node[inIdx0];
+ node[outIdx].inNodeIdx[idx] = in0;
+ node[outIdx].inWeight[idx] = w0;
+ idx++;
+ //}
+
+ // if(w1 != 0) {
+ node[outIdx].inNode[idx] = &node[inIdx1];
+ node[outIdx].inNodeIdx[idx] = in1;
+ node[outIdx].inWeight[idx] = w1;
+ idx++;
+ //}
+
+ node[outIdx].inNodeNum = idx;
+}
+
+void propagate(Node *node, int stage_num, int node_num, int stage_idx) {
+ for (int ni = 0; ni < node_num; ni++) {
+ int outIdx = get_idx(stage_idx, ni, node_num);
+ node[outIdx].value = 0;
+ for (int k = 0; k < node[outIdx].inNodeNum; k++) {
+ node[outIdx].value +=
+ node[outIdx].inNode[k]->value * node[outIdx].inWeight[k];
+ }
+ }
+}
+
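+// Shift right by `bit` with rounding to nearest (symmetric for negative
+// values), or shift left by -bit when `bit` is zero or negative;
+// e.g. round_shift(5, 1) == 3 and round_shift(-5, 1) == -3.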
+int64_t round_shift(int64_t value, int bit) {
+ if (bit > 0) {
+ if (value < 0) {
+ return -round_shift(-value, bit);
+ } else {
+ return (value + (1 << (bit - 1))) >> bit;
+ }
+ } else {
+ return value << (-bit);
+ }
+}
+
+void round_shift_array(int32_t *arr, int size, int bit) {
+ if (bit == 0) {
+ return;
+ } else {
+ for (int i = 0; i < size; i++) {
+ arr[i] = round_shift(arr[i], bit);
+ }
+ }
+}
+
+void graph_reset_visited(Node *node, int stage_num, int node_num) {
+ for (int si = 0; si < stage_num; si++) {
+ for (int ni = 0; ni < node_num; ni++) {
+ int idx = get_idx(si, ni, node_num);
+ node[idx].visited = 0;
+ }
+ }
+}
+
+void estimate_value(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int estimate_bit) {
+ if (stage_idx > 0) {
+ int outIdx = get_idx(stage_idx, node_idx, node_num);
+ int64_t out = 0;
+ node[outIdx].value = 0;
+ for (int k = 0; k < node[outIdx].inNodeNum; k++) {
+ int64_t w = round(node[outIdx].inWeight[k] * (1 << estimate_bit));
+ int64_t v = round(node[outIdx].inNode[k]->value);
+ out += v * w;
+ }
+ node[outIdx].value = round_shift(out, estimate_bit);
+ }
+}
+
+void amplify_value(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int amplify_bit) {
+ int outIdx = get_idx(stage_idx, node_idx, node_num);
+ node[outIdx].value = round_shift(round(node[outIdx].value), -amplify_bit);
+}
+
+void propagate_estimate_amlify(Node *node, int stage_num, int node_num,
+ int stage_idx, int amplify_bit,
+ int estimate_bit) {
+ for (int ni = 0; ni < node_num; ni++) {
+ estimate_value(node, stage_num, node_num, stage_idx, ni, estimate_bit);
+ amplify_value(node, stage_num, node_num, stage_idx, ni, amplify_bit);
+ }
+}
+
+void init_graph(Node *node, int stage_num, int node_num) {
+ for (int si = 0; si < stage_num; si++) {
+ for (int ni = 0; ni < node_num; ni++) {
+ int outIdx = get_idx(si, ni, node_num);
+ node[outIdx].stageIdx = si;
+ node[outIdx].nodeIdx = ni;
+ node[outIdx].value = 0;
+ node[outIdx].inNodeNum = 0;
+ if (si >= 1) {
+ connect_node(node, stage_num, node_num, si, ni, ni, 1, ni, 0);
+ }
+ }
+ }
+}
+
+void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N, int star) {
+ for (int i = 0; i < N / 2; i++) {
+ int out = node_idx + i;
+ int in1 = node_idx + N - 1 - i;
+ if (star == 1) {
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out, -1, in1,
+ 1);
+ } else {
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, in1,
+ 1);
+ }
+ }
+ for (int i = N / 2; i < N; i++) {
+ int out = node_idx + i;
+ int in1 = node_idx + N - 1 - i;
+ if (star == 1) {
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, in1,
+ 1);
+ } else {
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out, -1, in1,
+ 1);
+ }
+ }
+}
+
+void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N) {
+ int max_bit = get_max_bit(N - 1);
+ for (int i = 0; i < N; i++) {
+ int out = node_idx + bitwise_reverse(i, max_bit);
+ int in = node_idx + i;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+}
+
+void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N) {
+ int max_bit = get_max_bit(N);
+ for (int ni = 0; ni < N / 2; ni++) {
+ int ai = bitwise_reverse(N + ni, max_bit);
+ int out = node_idx + ni;
+ int in1 = node_idx + N - ni - 1;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+ sin(PI * ai / (2 * 2 * N)), in1, cos(PI * ai / (2 * 2 * N)));
+ }
+ for (int ni = N / 2; ni < N; ni++) {
+ int ai = bitwise_reverse(N + ni, max_bit);
+ int out = node_idx + ni;
+ int in1 = node_idx + N - ni - 1;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+ cos(PI * ai / (2 * 2 * N)), in1, -sin(PI * ai / (2 * 2 * N)));
+ }
+}
+
+void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N) {
+ for (int ni = 0; ni < N / 4; ni++) {
+ int out = node_idx + ni;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, 0);
+ }
+
+ for (int ni = N / 4; ni < N / 2; ni++) {
+ int out = node_idx + ni;
+ int in1 = node_idx + N - ni - 1;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+ -cos(PI / 4), in1, cos(-PI / 4));
+ }
+
+ for (int ni = N / 2; ni < N * 3 / 4; ni++) {
+ int out = node_idx + ni;
+ int in1 = node_idx + N - ni - 1;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out,
+ cos(-PI / 4), in1, cos(PI / 4));
+ }
+
+ for (int ni = N * 3 / 4; ni < N; ni++) {
+ int out = node_idx + ni;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, 0);
+ }
+}
+
+void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int idx, int N) {
+ // TODO(angiebird): Simplify and clarify this function
+
+ int i = 2 * N / (1 << (idx / 2));
+ int max_bit =
+      get_max_bit(i / 2) - 1;  // max_bit is computed from i/2 rather than N here
+ int N_over_i = 2 << (idx / 2);
+
+ for (int nj = 0; nj < N / 2; nj += N_over_i) {
+ int j = nj / (N_over_i);
+ int kj = bitwise_reverse(i / 4 + j, max_bit);
+ // printf("kj = %d\n", kj);
+
+ // I_N/2i --- 0
+ int offset = nj;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in = out;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+
+ // -C_Kj/i --- S_Kj/i
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in0 = out;
+ double w0 = -cos(kj * PI / i);
+ int in1 = N - (offset + ni) - 1 + node_idx;
+ double w1 = sin(kj * PI / i);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+ w1);
+ }
+
+ // S_kj/i --- -C_Kj/i
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in0 = out;
+ double w0 = -sin(kj * PI / i);
+ int in1 = N - (offset + ni) - 1 + node_idx;
+ double w1 = -cos(kj * PI / i);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+ w1);
+ }
+
+ // I_N/2i --- 0
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in = out;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+ }
+
+ for (int nj = N / 2; nj < N; nj += N_over_i) {
+ int j = nj / N_over_i;
+ int kj = bitwise_reverse(i / 4 + j, max_bit);
+
+ // I_N/2i --- 0
+ int offset = nj;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in = out;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+
+ // C_kj/i --- -S_Kj/i
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in0 = out;
+ double w0 = cos(kj * PI / i);
+ int in1 = N - (offset + ni) - 1 + node_idx;
+ double w1 = -sin(kj * PI / i);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+ w1);
+ }
+
+ // S_kj/i --- C_Kj/i
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in0 = out;
+ double w0 = sin(kj * PI / i);
+ int in1 = N - (offset + ni) - 1 + node_idx;
+ double w1 = cos(kj * PI / i);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1,
+ w1);
+ }
+
+ // I_N/2i --- 0
+ offset += N_over_i / 4;
+ for (int ni = 0; ni < N_over_i / 4; ni++) {
+ int out = node_idx + offset + ni;
+ int in = out;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+ }
+}
+
+void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int idx, int N) {
+ int B_size = 1 << ((idx + 1) / 2);
+ for (int ni = 0; ni < N; ni += B_size) {
+ gen_B_graph(node, stage_num, node_num, stage_idx, node_idx + ni, B_size,
+ (ni / B_size) % 2);
+ }
+}
+
+void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N) {
+ int max_idx = 2 * (get_max_bit(N) + 1) - 3;
+ for (int idx = 0; idx < max_idx; idx++) {
+ int s = stage_idx + max_idx - idx - 1;
+ if (idx == 0) {
+ // type 1
+ gen_type1_graph(node, stage_num, node_num, s, node_idx, N);
+ } else if (idx == max_idx - 1) {
+ // type 2
+ gen_type2_graph(node, stage_num, node_num, s, node_idx, N);
+ } else if ((idx + 1) % 2 == 0) {
+ // type 4
+ gen_type4_graph(node, stage_num, node_num, s, node_idx, idx, N);
+ } else if ((idx + 1) % 2 == 1) {
+ // type 3
+ gen_type3_graph(node, stage_num, node_num, s, node_idx, idx, N);
+ } else {
+ printf("check gen_R_graph()\n");
+ }
+ }
+}
+
+void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N) {
+ if (N > 2) {
+ gen_B_graph(node, stage_num, node_num, stage_idx, node_idx, N, 0);
+ gen_DCT_graph(node, stage_num, node_num, stage_idx + 1, node_idx, N / 2);
+ gen_R_graph(node, stage_num, node_num, stage_idx + 1, node_idx + N / 2,
+ N / 2);
+ } else {
+ // generate dct_2
+ connect_node(node, stage_num, node_num, stage_idx + 1, node_idx, node_idx,
+ cos(PI / 4), node_idx + 1, cos(PI / 4));
+ connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + 1,
+ node_idx + 1, -cos(PI / 4), node_idx, cos(PI / 4));
+ }
+}
+
+int get_dct_stage_num(int size) { return 2 * get_max_bit(size); }
+
+void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num) {
+ gen_DCT_graph(node, stage_num, node_num, stage_idx, node_idx, dct_node_num);
+ int dct_stage_num = get_dct_stage_num(dct_node_num);
+ gen_P_graph(node, stage_num, node_num, stage_idx + dct_stage_num - 2,
+ node_idx, dct_node_num);
+}
+
+void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx) {
+ int size = 1 << (adst_idx + 1);
+ for (int ni = 0; ni < size / 2; ni++) {
+ int nOut = node_idx + ni;
+ int nIn = nOut + size / 2;
+ connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, 1, nIn,
+ 1);
+ // printf("nOut: %d nIn: %d\n", nOut, nIn);
+ }
+ for (int ni = size / 2; ni < size; ni++) {
+ int nOut = node_idx + ni;
+ int nIn = nOut - size / 2;
+ connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, -1, nIn,
+ 1);
+ // printf("ndctOut: %d nIn: %d\n", nOut, nIn);
+ }
+}
+
+void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx, int adst_node_num) {
+ int size = 1 << (adst_idx + 1);
+ for (int ni = 0; ni < adst_node_num; ni += size) {
+ gen_adst_B_graph(node, stage_num, node_num, stage_idx, node_idx + ni,
+ adst_idx);
+ }
+}
+
+void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, double freq) {
+ connect_node(node, stage_num, node_num, stage_idx + 1, node_idx, node_idx,
+ cos(freq * PI), node_idx + 1, sin(freq * PI));
+ connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + 1,
+ node_idx + 1, -cos(freq * PI), node_idx, sin(freq * PI));
+}
+
+void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx) {
+ int size = 1 << (adst_idx);
+ for (int i = 0; i < size / 2; i++) {
+ int ni = i * 2;
+ double fi = (1 + 4 * i) * 1.0 / (1 << (adst_idx + 1));
+ gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + ni, fi);
+ }
+}
+
+void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx, int adst_node_num) {
+ int size = 1 << (adst_idx);
+ for (int i = 0; i < adst_node_num / size; i++) {
+ if (i % 2 == 1) {
+ int ni = i * size;
+ gen_adst_E_graph(node, stage_num, node_num, stage_idx, node_idx + ni,
+ adst_idx);
+ }
+ }
+}
+void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num) {
+ for (int i = 0; i < adst_node_num / 2; i++) {
+ int ni = i * 2;
+ double fi = (1 + 4 * i) * 1.0 / (4 * adst_node_num);
+ gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + ni, fi);
+ }
+}
+void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num) {
+ // reverse order when idx is 1, 3, 5, 7 ...
+ // example of adst_node_num = 8:
+ // 0 1 2 3 4 5 6 7
+ // --> 0 7 2 5 4 3 6 1
+ for (int ni = 0; ni < adst_node_num; ni++) {
+ if (ni % 2 == 0) {
+ int out = node_idx + ni;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out,
+ 0);
+ } else {
+ int out = node_idx + ni;
+ int in = node_idx + adst_node_num - ni;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+ }
+}
+void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num) {
+ // reverse order
+ // 0 1 2 3 --> 3 2 1 0
+ for (int ni = 0; ni < adst_node_num; ni++) {
+ int out = node_idx + ni;
+ int in = node_idx + adst_node_num - ni - 1;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+}
+
+int get_Q_out2in(int adst_node_num, int out) {
+ int in;
+ if (out % 2 == 0) {
+ in = out;
+ } else {
+ in = adst_node_num - out;
+ }
+ return in;
+}
+
+int get_Ibar_out2in(int adst_node_num, int out) {
+ return adst_node_num - out - 1;
+}
+
+void gen_adst_IbarQ_graph(Node *node, int stage_num, int node_num,
+ int stage_idx, int node_idx, int adst_node_num) {
+ // in -> Ibar -> Q -> out
+ for (int ni = 0; ni < adst_node_num; ni++) {
+ int out = node_idx + ni;
+ int in = node_idx +
+ get_Ibar_out2in(adst_node_num, get_Q_out2in(adst_node_num, ni));
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+}
+
+void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num) {
+ // reverse order
+ for (int ni = 0; ni < adst_node_num; ni++) {
+ int out = node_idx + ni;
+ int in = out;
+ if (ni % 2 == 0) {
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ } else {
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, -1, in,
+ 0);
+ }
+ }
+}
+
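+// Map index x to its position in Hadamard order: bit-reverse x within
+// adst_node_num, then XOR each bit with the next lower bit of the reversed
+// value (a Gray-code style transform, as noted below).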
+int get_hadamard_idx(int x, int adst_node_num) {
+ int max_bit = get_max_bit(adst_node_num - 1);
+ x = bitwise_reverse(x, max_bit);
+
+ // gray code
+ int c = x & 1;
+ int p = x & 1;
+ int y = c;
+
+ for (int i = 1; i <= max_bit; i++) {
+ p = c;
+ c = (x >> i) & 1;
+ y += (c ^ p) << i;
+ }
+ return y;
+}
+
+void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num) {
+ for (int ni = 0; ni < adst_node_num; ni++) {
+ int out = node_idx + ni;
+ int in = node_idx + get_hadamard_idx(ni, adst_node_num);
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0);
+ }
+}
+
+void gen_adst_HtD_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num) {
+ for (int ni = 0; ni < adst_node_num; ni++) {
+ int out = node_idx + ni;
+ int in = node_idx + get_hadamard_idx(ni, adst_node_num);
+ double inW;
+ if (ni % 2 == 0)
+ inW = 1;
+ else
+ inW = -1;
+ connect_node(node, stage_num, node_num, stage_idx + 1, out, in, inW, in, 0);
+ }
+}
+
+int get_adst_stage_num(int adst_node_num) {
+ return 2 * get_max_bit(adst_node_num) + 2;
+}
+
+int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num) {
+ int max_bit = get_max_bit(adst_node_num);
+ int si = 0;
+ gen_adst_IbarQ_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+ adst_node_num);
+ si++;
+ gen_adst_VJ_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+ adst_node_num);
+ si++;
+ for (int adst_idx = max_bit - 1; adst_idx >= 1; adst_idx--) {
+ gen_adst_U_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+ adst_idx, adst_node_num);
+ si++;
+ gen_adst_V_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+ adst_idx, adst_node_num);
+ si++;
+ }
+ gen_adst_HtD_graph(node, stage_num, node_num, stage_idx + si, node_idx,
+ adst_node_num);
+ si++;
+ return si + 1;
+}
+
+int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num) {
+ int hybrid_stage_num = get_hybrid_stage_num(TYPE_ADST, adst_node_num);
+  // generate an ADST tempNode
+ Node *tempNode = new Node[hybrid_stage_num * adst_node_num];
+ init_graph(tempNode, hybrid_stage_num, adst_node_num);
+ int si = gen_iadst_graph(tempNode, hybrid_stage_num, adst_node_num, 0, 0,
+ adst_node_num);
+
+ // tempNode's inverse graph to node[stage_idx][node_idx]
+ gen_inv_graph(tempNode, hybrid_stage_num, adst_node_num, node, stage_num,
+ node_num, stage_idx, node_idx);
+ delete[] tempNode;
+ return si;
+}
+
+void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num) {
+ for (int first = 0; first < dct_node_num; first++) {
+ for (int second = 0; second < dct_node_num; second++) {
+ // int sIn = stage_idx;
+ int sOut = stage_idx + 1;
+ int nIn = node_idx + first * dct_node_num + second;
+ int nOut = node_idx + second * dct_node_num + first;
+
+ // printf("sIn: %d nIn: %d sOut: %d nOut: %d\n", sIn, nIn, sOut, nOut);
+
+ connect_node(node, stage_num, node_num, sOut, nOut, nIn, 1, nIn, 0);
+ }
+ }
+}
+
+void connect_layer_2d_new(Node *node, int stage_num, int node_num,
+ int stage_idx, int node_idx, int dct_node_num0,
+ int dct_node_num1) {
+ for (int i = 0; i < dct_node_num1; i++) {
+ for (int j = 0; j < dct_node_num0; j++) {
+ // int sIn = stage_idx;
+ int sOut = stage_idx + 1;
+ int nIn = node_idx + i * dct_node_num0 + j;
+ int nOut = node_idx + j * dct_node_num1 + i;
+
+ // printf("sIn: %d nIn: %d sOut: %d nOut: %d\n", sIn, nIn, sOut, nOut);
+
+ connect_node(node, stage_num, node_num, sOut, nOut, nIn, 1, nIn, 0);
+ }
+ }
+}
+
+void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num) {
+ int dct_stage_num = get_dct_stage_num(dct_node_num);
+ // put 2 layers of dct_node_num DCTs on the graph
+ for (int ni = 0; ni < dct_node_num; ni++) {
+ gen_DCT_graph_1d(node, stage_num, node_num, stage_idx,
+ node_idx + ni * dct_node_num, dct_node_num);
+ gen_DCT_graph_1d(node, stage_num, node_num, stage_idx + dct_stage_num,
+ node_idx + ni * dct_node_num, dct_node_num);
+ }
+ // connect first layer and second layer
+ connect_layer_2d(node, stage_num, node_num, stage_idx + dct_stage_num - 1,
+ node_idx, dct_node_num);
+}
+
+int get_hybrid_stage_num(int type, int hybrid_node_num) {
+ if (type == TYPE_DCT || type == TYPE_IDCT) {
+ return get_dct_stage_num(hybrid_node_num);
+ } else if (type == TYPE_ADST || type == TYPE_IADST) {
+ return get_adst_stage_num(hybrid_node_num);
+ }
+ return 0;
+}
+
+int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num) {
+ int stage_num = 0;
+ stage_num += get_hybrid_stage_num(type0, hybrid_node_num);
+ stage_num += get_hybrid_stage_num(type1, hybrid_node_num);
+ return stage_num;
+}
+
+int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0,
+ int hybrid_node_num1) {
+ int stage_num = 0;
+ stage_num += get_hybrid_stage_num(type0, hybrid_node_num0);
+ stage_num += get_hybrid_stage_num(type1, hybrid_node_num1);
+ return stage_num;
+}
+
+int get_hybrid_amplify_factor(int type, int hybrid_node_num) {
+ return get_max_bit(hybrid_node_num) - 1;
+}
+
+void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int hybrid_node_num, int type) {
+ if (type == TYPE_DCT) {
+ gen_DCT_graph_1d(node, stage_num, node_num, stage_idx, node_idx,
+ hybrid_node_num);
+ } else if (type == TYPE_ADST) {
+ gen_adst_graph(node, stage_num, node_num, stage_idx, node_idx,
+ hybrid_node_num);
+ } else if (type == TYPE_IDCT) {
+ int hybrid_stage_num = get_hybrid_stage_num(type, hybrid_node_num);
+    // generate a DCT tempNode
+ Node *tempNode = new Node[hybrid_stage_num * hybrid_node_num];
+ init_graph(tempNode, hybrid_stage_num, hybrid_node_num);
+ gen_DCT_graph_1d(tempNode, hybrid_stage_num, hybrid_node_num, 0, 0,
+ hybrid_node_num);
+
+ // tempNode's inverse graph to node[stage_idx][node_idx]
+ gen_inv_graph(tempNode, hybrid_stage_num, hybrid_node_num, node, stage_num,
+ node_num, stage_idx, node_idx);
+ delete[] tempNode;
+ } else if (type == TYPE_IADST) {
+ int hybrid_stage_num = get_hybrid_stage_num(type, hybrid_node_num);
+    // generate an ADST tempNode
+ Node *tempNode = new Node[hybrid_stage_num * hybrid_node_num];
+ init_graph(tempNode, hybrid_stage_num, hybrid_node_num);
+ gen_adst_graph(tempNode, hybrid_stage_num, hybrid_node_num, 0, 0,
+ hybrid_node_num);
+
+ // tempNode's inverse graph to node[stage_idx][node_idx]
+ gen_inv_graph(tempNode, hybrid_stage_num, hybrid_node_num, node, stage_num,
+ node_num, stage_idx, node_idx);
+ delete[] tempNode;
+ }
+}
+
+void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int hybrid_node_num, int type0,
+ int type1) {
+ int hybrid_stage_num = get_hybrid_stage_num(type0, hybrid_node_num);
+
+ for (int ni = 0; ni < hybrid_node_num; ni++) {
+ gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx,
+ node_idx + ni * hybrid_node_num, hybrid_node_num,
+ type0);
+ gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx + hybrid_stage_num,
+ node_idx + ni * hybrid_node_num, hybrid_node_num,
+ type1);
+ }
+
+ // connect first layer and second layer
+ connect_layer_2d(node, stage_num, node_num, stage_idx + hybrid_stage_num - 1,
+ node_idx, hybrid_node_num);
+}
+
+void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num,
+ int stage_idx, int node_idx, int hybrid_node_num0,
+ int hybrid_node_num1, int type0, int type1) {
+ int hybrid_stage_num0 = get_hybrid_stage_num(type0, hybrid_node_num0);
+
+ for (int ni = 0; ni < hybrid_node_num1; ni++) {
+ gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx,
+ node_idx + ni * hybrid_node_num0, hybrid_node_num0,
+ type0);
+ }
+ for (int ni = 0; ni < hybrid_node_num0; ni++) {
+ gen_hybrid_graph_1d(
+ node, stage_num, node_num, stage_idx + hybrid_stage_num0,
+ node_idx + ni * hybrid_node_num1, hybrid_node_num1, type1);
+ }
+
+ // connect first layer and second layer
+ connect_layer_2d_new(node, stage_num, node_num,
+ stage_idx + hybrid_stage_num0 - 1, node_idx,
+ hybrid_node_num0, hybrid_node_num1);
+}
+
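+// Build the inverse flow graph of `node` inside `invNode`: every edge from
+// stage si-1 to stage si is mirrored into stage (stage_num - si), with its
+// source and destination swapped and the same weight reused.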
+void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode,
+ int inv_stage_num, int inv_node_num, int inv_stage_idx,
+ int inv_node_idx) {
+ // clean up inNodeNum in invNode because of add_node
+ for (int si = 1 + inv_stage_idx; si < inv_stage_idx + stage_num; si++) {
+ for (int ni = inv_node_idx; ni < inv_node_idx + node_num; ni++) {
+ int idx = get_idx(si, ni, inv_node_num);
+ invNode[idx].inNodeNum = 0;
+ }
+ }
+ // generate inverse graph of node on invNode
+ for (int si = 1; si < stage_num; si++) {
+ for (int ni = 0; ni < node_num; ni++) {
+ int invSi = stage_num - si;
+ int idx = get_idx(si, ni, node_num);
+ for (int k = 0; k < node[idx].inNodeNum; k++) {
+ int invNi = node[idx].inNodeIdx[k];
+ add_node(invNode, inv_stage_num, inv_node_num, invSi + inv_stage_idx,
+ invNi + inv_node_idx, ni + inv_node_idx,
+ node[idx].inWeight[k]);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/tools/txfm_analyzer/txfm_graph.h b/third_party/aom/tools/txfm_analyzer/txfm_graph.h
new file mode 100644
index 000000000..2e3c9551e
--- /dev/null
+++ b/third_party/aom/tools/txfm_analyzer/txfm_graph.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_
+#define AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_
+
+struct Node {
+ Node *inNode[2];
+ int inNodeNum;
+ int inNodeIdx[2];
+ double inWeight[2];
+ double value;
+ int nodeIdx;
+ int stageIdx;
+ int visited;
+};
+
+#define PI (3.141592653589793238462643383279502884)
+#define STAGENUM (10)
+#define NODENUM (32)
+#define COS_MOD (128)
+
+typedef enum {
+ TYPE_DCT = 0,
+ TYPE_ADST,
+ TYPE_IDCT,
+ TYPE_IADST,
+ TYPE_LAST
+} TYPE_TXFM;
+
+TYPE_TXFM get_inv_type(TYPE_TXFM type);
+void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type,
+ const int txfm_size);
+
+void get_txfm_type_name(char *str_fun_name, int str_buf_size,
+ const TYPE_TXFM type, const int txfm_size);
+void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0,
+ const TYPE_TXFM type1, const int txfm_size0,
+ const int txfm_size1);
+unsigned int get_max_bit(unsigned int x);
+unsigned int bitwise_reverse(unsigned int x, int max_bit);
+int get_idx(int ri, int ci, int cSize);
+
+int get_dct_stage_num(int size);
+void reference_dct_1d(double *in, double *out, int size);
+void reference_dct_2d(double *in, double *out, int size);
+void connect_node(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int in0, double w0, int in1, double w1);
+void propagate(Node *node, int stage_num, int node_num, int stage);
+void init_graph(Node *node, int stage_num, int node_num);
+void graph_reset_visited(Node *node, int stage_num, int node_num);
+void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N, int star);
+void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+
+void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int idx, int N);
+void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int idx, int N);
+
+void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+
+void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int N);
+
+void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num);
+void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num);
+
+void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int dct_node_num);
+
+void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx);
+
+void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx, int adst_node_num);
+void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, double freq);
+
+void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx);
+
+void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_idx, int adst_node_num);
+
+void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+
+void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+
+int get_hadamard_idx(int x, int adst_node_num);
+void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+
+int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int adst_node_num);
+void reference_adst_1d(double *in, double *out, int size);
+
+int get_adst_stage_num(int adst_node_num);
+int get_hybrid_stage_num(int type, int hybrid_node_num);
+int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num);
+int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0,
+ int hybrid_node_num1);
+int get_hybrid_amplify_factor(int type, int hybrid_node_num);
+void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int hybrid_node_num, int type);
+void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int hybrid_node_num, int type0,
+ int type1);
+void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num,
+ int stage_idx, int node_idx, int hybrid_node_num0,
+ int hybrid_node_num1, int type0, int type1);
+
+void reference_hybrid_2d(double *in, double *out, int size, int type0,
+ int type1);
+
+void reference_hybrid_2d_new(double *in, double *out, int size0, int size1,
+ int type0, int type1);
+void reference_adst_dct_2d(double *in, double *out, int size);
+
+void gen_code(Node *node, int stage_num, int node_num, TYPE_TXFM type);
+
+void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode,
+ int inv_stage_num, int inv_node_num, int inv_stage_idx,
+ int inv_node_idx);
+
+TYPE_TXFM hybrid_char_to_int(char ctype);
+
+int64_t round_shift(int64_t value, int bit);
+void round_shift_array(int32_t *arr, int size, int bit);
+void estimate_value(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int estimate_bit);
+void amplify_value(Node *node, int stage_num, int node_num, int stage_idx,
+ int node_idx, int estimate_bit);
+void propagate_estimate_amlify(Node *node, int stage_num, int node_num,
+ int stage_idx, int amplify_bit,
+ int estimate_bit);
+#endif // AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_
diff --git a/third_party/aom/tools/wrap-commit-msg.py b/third_party/aom/tools/wrap-commit-msg.py
new file mode 100755
index 000000000..1c7882443
--- /dev/null
+++ b/third_party/aom/tools/wrap-commit-msg.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Wraps paragraphs of text, preserving manual formatting
+
+This is like fold(1), but has the special convention of not modifying lines
+that start with whitespace. This allows you to intersperse blocks with
+special formatting, like code blocks, with written prose. The prose will
+be word-wrapped, and the manual formatting will be preserved.
+
+ * This won't handle the case of a bulleted (or ordered) list specially, so
+ manual wrapping must be done.
+
+Occasionally it's useful to put something with explicit formatting that
+doesn't look at all like a block of text inline.
+
+ indicator = has_leading_whitespace(line);
+ if (indicator)
+ preserve_formatting(line);
+
+The intent is that this docstring would make it through the transform
+and still be legible and presented as it is in the source. If additional
+cases are handled, update this doc to describe the effect.
+"""
+
+__author__ = "jkoleszar@google.com"
+import textwrap
+import sys
+
+def wrap(text):
+ if text:
+ return textwrap.fill(text, break_long_words=False) + '\n'
+ return ""
+
+
+def main(fileobj):
+ text = ""
+ output = ""
+ while True:
+ line = fileobj.readline()
+ if not line:
+ break
+
+ if line.lstrip() == line:
+ text += line
+ else:
+ output += wrap(text)
+      text = ""
+ output += line
+ output += wrap(text)
+
+ # Replace the file or write to stdout.
+ if fileobj == sys.stdin:
+ fileobj = sys.stdout
+ else:
+ fileobj.seek(0)
+ fileobj.truncate(0)
+ fileobj.write(output)
+
+if __name__ == "__main__":
+ if len(sys.argv) > 1:
+ main(open(sys.argv[1], "r+"))
+ else:
+ main(sys.stdin)
diff --git a/third_party/aom/usage.dox b/third_party/aom/usage.dox
new file mode 100644
index 000000000..062d35a83
--- /dev/null
+++ b/third_party/aom/usage.dox
@@ -0,0 +1,111 @@
+/*!\page usage Usage
+
+ The aom multi-format codec SDK provides a unified interface amongst its
+ supported codecs. This abstraction allows applications using this SDK to
+ easily support multiple video formats with minimal code duplication or
+ "special casing." This section describes the interface common to all codecs.
+ For codec-specific details, see the \ref codecs page.
+
+ The following sections are common to all codecs:
+ - \ref usage_types
+ - \ref usage_features
+ - \ref usage_init
+ - \ref usage_errors
+
+ For more information on decoder and encoder specific usage, see the
+ following pages:
+ \if decoder
+ \li \subpage usage_decode
+ \endif
+ \if encoder
+ \li \subpage usage_encode
+ \endif
+
+ \section usage_types Important Data Types
+ There are two important data structures to consider in this interface.
+
+ \subsection usage_ctxs Contexts
+ A context is a storage area allocated by the calling application that the
+ codec may write into to store details about a single instance of that codec.
+ Most of the context is implementation specific, and thus opaque to the
+ application. The context structure as seen by the application is of fixed
+ size, and thus can be allocated with automatic storage or dynamically
+ on the heap.
+
+ Most operations require an initialized codec context. Codec context
+ instances are codec specific. That is, the codec to be used for the encoded
+ video must be known at initialization time. See #aom_codec_ctx_t for further
+ information.
+
+ \subsection usage_ifaces Interfaces
+ A codec interface is an opaque structure that controls how function calls
+ into the generic interface are dispatched to their codec-specific
+ implementations. Applications \ref MUSTNOT attempt to examine or override
+ this storage, as it contains internal implementation details likely to
+ change from release to release.
+
+ Each supported codec will expose an interface structure to the application
+ as an <code>extern</code> reference to a structure of the incomplete type
+ #aom_codec_iface_t.
+
+ \section usage_features Features
+ Several "features" are defined that are optionally implemented by codec
+ algorithms. Indeed, the same algorithm may support different features on
+ different platforms. The purpose of defining these features is that when
+ they are implemented, they conform to a common interface. The features, or
+ capabilities, of an algorithm can be queried from its interface by using
+ the aom_codec_get_caps() method. Attempts to invoke features not supported
+ by an algorithm will generally result in #AOM_CODEC_INCAPABLE.
+
+ \if decoder
+ Currently defined decoder features include:
+ - \ref usage_cb
+ \endif
+
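 A minimal sketch of such a capability query; the AV1 decoder interface from
 aom/aomdx.h and the AOM_CODEC_CAP_HIGHBITDEPTH bit are assumptions about the
 build in use, not part of the text above:

   #include <stdio.h>
   #include "aom/aom_decoder.h" /* aom_codec_get_caps(), aom_codec_iface_name() */
   #include "aom/aomdx.h"       /* aom_codec_av1_dx() */

   static void report_decoder_caps(void) {
     aom_codec_iface_t *iface = aom_codec_av1_dx();
     aom_codec_caps_t caps = aom_codec_get_caps(iface);
     printf("%s capabilities: 0x%lx\n", aom_codec_iface_name(iface),
            (unsigned long)caps);
     if (caps & AOM_CODEC_CAP_HIGHBITDEPTH)
       printf("  high bit depth decoding is available\n");
   }
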
+ \section usage_init Initialization
+ To initialize a codec instance, the addresses of the codec context
+ and interface structures are passed to an initialization function. Depending
+ on the \ref usage_features that the codec supports, the codec could be
+ initialized in different modes.
+
+ To prevent cases of confusion where the ABI of the library changes,
+ the ABI is versioned. The ABI version number must be passed at
+ initialization time to ensure the application is using a header file that
+ matches the library. The current ABI version number is stored in the
+ preprocessor macros #AOM_CODEC_ABI_VERSION, #AOM_ENCODER_ABI_VERSION, and
+ #AOM_DECODER_ABI_VERSION. For convenience, each initialization function has
+ a wrapper macro that inserts the correct version number. These macros are
+ named like the initialization methods, but without the _ver suffix.
+
+
+ The available initialization methods are:
+ \if encoder
+ \li #aom_codec_enc_init (calls aom_codec_enc_init_ver())
+ \li #aom_codec_enc_init_multi (calls aom_codec_enc_init_multi_ver())
+ \endif
+ \if decoder
+ \li #aom_codec_dec_init (calls aom_codec_dec_init_ver())
+ \endif
+
+
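 A minimal sketch of that initialization pattern, assuming the AV1 decoder
 interface and a default (NULL) decoder configuration; the aom_codec_dec_init()
 wrapper supplies the ABI version, and aom_codec_destroy() is assumed for
 teardown:

   #include "aom/aom_decoder.h"
   #include "aom/aomdx.h"

   static int init_av1_decoder(aom_codec_ctx_t *ctx) {
     /* ctx points at caller-allocated (e.g. automatic) storage. */
     if (aom_codec_dec_init(ctx, aom_codec_av1_dx(), NULL, 0) != AOM_CODEC_OK)
       return -1;
     /* ... decode with ctx ... */
     aom_codec_destroy(ctx);
     return 0;
   }
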
+ \section usage_errors Error Handling
+ Almost all codec functions return an error status of type #aom_codec_err_t.
+ The semantics of how each error condition should be processed is clearly
+ defined in the definitions of each enumerated value. Error values can be
+ converted into ASCII strings with the aom_codec_error() and
+ aom_codec_err_to_string() methods. The difference between these two methods is
+ that aom_codec_error() returns the error state from an initialized context,
+ whereas aom_codec_err_to_string() can be used in cases where an error occurs
+ outside any context. The enumerated value returned from the last call can be
+ retrieved from the <code>err</code> member of the decoder context as well.
+ Finally, more detailed error information can sometimes be obtained by using
+ the aom_codec_error_detail() method. Not all errors produce detailed error
+ information.
+
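 A sketch of those two reporting paths; the decode call and its arguments are
 placeholders, and the argument list follows current libaom headers (older
 snapshots may differ):

   #include <stdio.h>
   #include "aom/aom_decoder.h"

   static void decode_and_report(aom_codec_ctx_t *ctx, const uint8_t *buf,
                                 size_t len) {
     aom_codec_err_t res = aom_codec_decode(ctx, buf, len, NULL);
     if (res != AOM_CODEC_OK) {
       /* Context-free translation of the enumerated value... */
       fprintf(stderr, "decode failed: %s\n", aom_codec_err_to_string(res));
       /* ...plus the context-bound detail, when the codec provides one. */
       const char *detail = aom_codec_error_detail(ctx);
       if (detail) fprintf(stderr, "detail: %s\n", detail);
     }
   }
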
+ In addition to error information, the codec library's build configuration
+ is available at runtime on some platforms. This information can be returned
+ by calling aom_codec_build_config(), and is formatted as a base64 coded string
+ (composed of characters in the set [a-zA-Z0-9+/]). This information is not
+ useful to an application at runtime, but may be of use to aom for support.
+
+*/
diff --git a/third_party/aom/usage_cx.dox b/third_party/aom/usage_cx.dox
new file mode 100644
index 000000000..51b4e8e3e
--- /dev/null
+++ b/third_party/aom/usage_cx.dox
@@ -0,0 +1,9 @@
+/*! \page usage_encode Encoding
+
+ The aom_codec_encode() function is at the core of the encode loop. It
+ processes raw images passed by the application, producing packets of
+ compressed data.
+
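 A sketch of one iteration of that loop; the timestamp, duration and flag
 values are placeholders, the aom_codec_encode() argument list follows current
 libaom headers (older snapshots also take a deadline argument), and the
 packet-draining call aom_codec_get_cx_data() is assumed from the encoder
 header rather than stated above:

   #include <stdio.h>
   #include "aom/aom_encoder.h"

   /* Feed one raw frame and drain whatever compressed packets it produced. */
   static void encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
                            aom_codec_pts_t pts, FILE *out) {
     if (aom_codec_encode(ctx, img, pts, /*duration=*/1, /*flags=*/0) !=
         AOM_CODEC_OK)
       return;
     aom_codec_iter_t iter = NULL;
     const aom_codec_cx_pkt_t *pkt;
     while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
       if (pkt->kind == AOM_CODEC_CX_FRAME_PKT)
         fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out);
     }
   }
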
+ \ref samples
+
+*/
diff --git a/third_party/aom/usage_dx.dox b/third_party/aom/usage_dx.dox
new file mode 100644
index 000000000..eef78376f
--- /dev/null
+++ b/third_party/aom/usage_dx.dox
@@ -0,0 +1,57 @@
+/*! \page usage_decode Decoding
+
+ The aom_codec_decode() function is at the core of the decode loop. It
+ processes packets of compressed data passed by the application, producing
+ decoded images. The decoder expects packets to comprise exactly one image
+ frame of data. Packets \ref MUST be passed in decode order. If the
+ application wishes to associate some data with the frame, the
+ <code>user_priv</code> member may be set.
+
+ \ref samples
+
+
+ \section usage_cb Callback Based Decoding
+ There are two methods for the application to access decoded frame data. Some
+ codecs support asynchronous (callback-based) decoding \ref usage_features
+ that allow the application to register a callback to be invoked by the
+ decoder when decoded data becomes available. Decoders are not required to
+ support this feature, however. Like all \ref usage_features, support can be
+ determined by calling aom_codec_get_caps(). Callbacks are available in both
+ frame-based and slice-based variants. Frame based callbacks conform to the
+ signature of #aom_codec_put_frame_cb_fn_t and are invoked once the entire
+ frame has been decoded. Slice based callbacks conform to the signature of
+ #aom_codec_put_slice_cb_fn_t and are invoked after a subsection of the frame
+ is decoded. For example, a slice callback could be issued for each
+ macroblock row. However, the number and size of slices to return is
+ implementation specific. Also, the image data passed in a slice callback is
+ not necessarily in the same memory segment as the data will be when it is
+ assembled into a full frame. For this reason, the application \ref MUST
+ examine the rectangles that describe what data is valid to access and what
+ data has been updated in this call. For all their additional complexity,
+ slice based decoding callbacks provide substantial speed gains to the
+ overall application in some cases, due to improved cache behavior.
+
+
+ \section usage_frame_iter Frame Iterator Based Decoding
+ If the codec does not support callback based decoding, or the application
+ chooses not to make use of that feature, decoded frames are made available
+ through the aom_codec_get_frame() iterator. The application initializes the
+ iterator storage (of type #aom_codec_iter_t) to NULL, then calls
+ aom_codec_get_frame repeatedly until it returns NULL, indicating that all
+ images have been returned. This process may result in zero, one, or many
+ frames that are ready for display, depending on the codec.
+
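 A minimal sketch of that iterator pattern, assuming one temporally complete
 frame per packet as required above; argument types follow current libaom
 headers:

   #include <stdio.h>
   #include "aom/aom_decoder.h"

   /* Decode one packet, then pull every image that became available. */
   static void decode_packet(aom_codec_ctx_t *ctx, const uint8_t *data,
                             size_t size) {
     if (aom_codec_decode(ctx, data, size, /*user_priv=*/NULL) != AOM_CODEC_OK) {
       fprintf(stderr, "decode error: %s\n", aom_codec_error(ctx));
       return;
     }
     aom_codec_iter_t iter = NULL; /* the iterator MUST start out as NULL */
     const aom_image_t *img;
     while ((img = aom_codec_get_frame(ctx, &iter)) != NULL) {
       /* img->d_w x img->d_h pixels are ready for display here. */
     }
   }
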
+
+ \section usage_postproc Postprocessing
+ Postprocessing is a process that is applied after a frame is decoded to
+ enhance the image's appearance by removing artifacts introduced in the
+ compression process. It is not required to properly decode the frame, and
+ is generally done only when there is enough spare CPU time to execute
+ the required filters. Codecs may support a number of different
+ postprocessing filters, and the available filters may differ from platform
+ to platform. Embedded devices often do not have enough CPU to implement
+ postprocessing in software. The filter selection is generally handled
+ automatically by the codec.
+
+
+*/
diff --git a/toolkit/components/build/nsToolkitCompsModule.cpp b/toolkit/components/build/nsToolkitCompsModule.cpp
index 190c4da06..33c604c4e 100644
--- a/toolkit/components/build/nsToolkitCompsModule.cpp
+++ b/toolkit/components/build/nsToolkitCompsModule.cpp
@@ -5,7 +5,9 @@
#include "mozilla/ModuleUtils.h"
#include "nsAppStartup.h"
#include "nsNetCID.h"
+#ifdef MOZ_USERINFO
#include "nsUserInfo.h"
+#endif
#include "nsToolkitCompsCID.h"
#include "nsFindService.h"
#if defined(MOZ_UPDATER) && !defined(MOZ_WIDGET_ANDROID)
@@ -76,7 +78,9 @@ NS_GENERIC_FACTORY_CONSTRUCTOR_INIT(nsPerformanceStatsService, Init)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsTerminator)
#endif
+#if defined(MOZ_USERINFO)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUserInfo)
+#endif // defined (MOZ_USERINFO)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsFindService)
#if !defined(MOZ_DISABLE_PARENTAL_CONTROLS)
@@ -141,7 +145,9 @@ NS_DEFINE_NAMED_CID(NS_TOOLKIT_PERFORMANCESTATSSERVICE_CID);
#if defined(MOZ_HAS_TERMINATOR)
NS_DEFINE_NAMED_CID(NS_TOOLKIT_TERMINATOR_CID);
#endif
+#if defined(MOZ_USERINFO)
NS_DEFINE_NAMED_CID(NS_USERINFO_CID);
+#endif // defined (MOZ_USERINFO)
NS_DEFINE_NAMED_CID(ALERT_NOTIFICATION_CID);
NS_DEFINE_NAMED_CID(NS_ALERTSSERVICE_CID);
#if !defined(MOZ_DISABLE_PARENTAL_CONTROLS)
@@ -179,7 +185,9 @@ static const Module::CIDEntry kToolkitCIDs[] = {
#if defined(MOZ_HAS_PERFSTATS)
{ &kNS_TOOLKIT_PERFORMANCESTATSSERVICE_CID, false, nullptr, nsPerformanceStatsServiceConstructor },
#endif // defined (MOZ_HAS_PERFSTATS)
+#if defined(MOZ_USERINFO)
{ &kNS_USERINFO_CID, false, nullptr, nsUserInfoConstructor },
+#endif // defined (MOZ_USERINFO)
{ &kALERT_NOTIFICATION_CID, false, nullptr, AlertNotificationConstructor },
{ &kNS_ALERTSSERVICE_CID, false, nullptr, nsAlertsServiceConstructor },
#if !defined(MOZ_DISABLE_PARENTAL_CONTROLS)
@@ -219,7 +227,9 @@ static const Module::ContractIDEntry kToolkitContracts[] = {
#if defined(MOZ_HAS_PERFSTATS)
{ NS_TOOLKIT_PERFORMANCESTATSSERVICE_CONTRACTID, &kNS_TOOLKIT_PERFORMANCESTATSSERVICE_CID },
#endif // defined (MOZ_HAS_PERFSTATS)
+#if defined(MOZ_USERINFO)
{ NS_USERINFO_CONTRACTID, &kNS_USERINFO_CID },
+#endif // defined(MOZ_USERINFO)
{ ALERT_NOTIFICATION_CONTRACTID, &kALERT_NOTIFICATION_CID },
{ NS_ALERTSERVICE_CONTRACTID, &kNS_ALERTSSERVICE_CID },
#if !defined(MOZ_DISABLE_PARENTAL_CONTROLS)
diff --git a/toolkit/components/reader/AboutReader.jsm b/toolkit/components/reader/AboutReader.jsm
index c5d04476d..6ec959eba 100644
--- a/toolkit/components/reader/AboutReader.jsm
+++ b/toolkit/components/reader/AboutReader.jsm
@@ -14,7 +14,6 @@ Cu.import("resource://gre/modules/XPCOMUtils.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "AsyncPrefs", "resource://gre/modules/AsyncPrefs.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "NarrateControls", "resource://gre/modules/narrate/NarrateControls.jsm");
-XPCOMUtils.defineLazyModuleGetter(this, "Rect", "resource://gre/modules/Geometry.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "PluralForm", "resource://gre/modules/PluralForm.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "PlacesUtils", "resource://gre/modules/PlacesUtils.jsm");
diff --git a/toolkit/components/reader/JSDOMParser.js b/toolkit/components/reader/JSDOMParser.js
index dd9d37230..debdb08eb 100644
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -618,6 +618,13 @@
};
var Element = function (tag) {
+ // We use this to find the closing tag.
+ this._matchingTag = tag;
+ // We're explicitly a non-namespace-aware parser; we just pretend it's all HTML.
+ var lastColonIndex = tag.lastIndexOf(":");
+ if (lastColonIndex != -1) {
+ tag = tag.substring(lastColonIndex + 1);
+ }
this.attributes = [];
this.childNodes = [];
this.children = [];
@@ -785,7 +792,13 @@
break;
}
}
- }
+ },
+
+ hasAttribute: function (name) {
+ return this.attributes.some(function (attr) {
+ return attr.name == name;
+ });
+ },
};
var Style = function (node) {
@@ -1062,9 +1075,10 @@
return null;
// Read any text as Text node
+ var textNode;
if (c !== "<") {
--this.currentChar;
- var textNode = new Text();
+ textNode = new Text();
var n = this.html.indexOf("<", this.currentChar);
if (n === -1) {
textNode.innerHTML = this.html.substring(this.currentChar, this.html.length);
@@ -1076,6 +1090,18 @@
return textNode;
}
+ if (this.match("![CDATA[")) {
+ var endChar = this.html.indexOf("]]>", this.currentChar);
+ if (endChar === -1) {
+ this.error("unclosed CDATA section");
+ return null;
+ }
+ textNode = new Text();
+ textNode.textContent = this.html.substring(this.currentChar, endChar);
+ this.currentChar = endChar + ("]]>").length;
+ return textNode;
+ }
+
c = this.peekNext();
// Read Comment node. Normally, Comment nodes know their inner
@@ -1107,7 +1133,7 @@
// If this isn't a void Element, read its child nodes
if (!closed) {
this.readChildren(node);
- var closingTag = "</" + localName + ">";
+ var closingTag = "</" + node._matchingTag + ">";
if (!this.match(closingTag)) {
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
return null;
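
The JSDOMParser hunks above do three things: remember the original, possibly namespace-prefixed tag so the closing tag can still be matched, turn CDATA sections into Text nodes, and add an Element.hasAttribute helper. A minimal sketch of what that enables, assuming a hand-written input string (the fragment below is invented for illustration; parse() is the same entry point ReaderWorker.js uses further down):

    // Hypothetical serialized fragment with a namespace-prefixed tag and a CDATA section.
    var serialized =
      '<entry id="1"><media:title><![CDATA[Hello <world>]]></media:title></entry>';
    var doc = new JSDOMParser().parse(serialized);
    var entry = doc.firstChild;
    entry.hasAttribute("id");      // true - uses the helper added above
    var title = entry.firstChild;  // closed via the remembered "media:title" tag
    title.textContent;             // "Hello <world>", copied verbatim from the CDATA section
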
diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js
index 064d2ae88..c2bba0cd3 100644
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -29,14 +29,19 @@
/**
* Public constructor.
- * @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
-function Readability(uri, doc, options) {
+function Readability(doc, options) {
+ // In some older versions, people passed a URI as the first argument. Cope:
+ if (options && options.documentElement) {
+ doc = options;
+ options = arguments[2];
+ } else if (!doc || !doc.documentElement) {
+ throw new Error("First argument to Readability constructor should be a document object.");
+ }
options = options || {};
- this._uri = uri;
this._doc = doc;
this._articleTitle = null;
this._articleByline = null;
@@ -47,7 +52,7 @@ function Readability(uri, doc, options) {
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
- this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
+ this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
// Start with all flags set
@@ -93,6 +98,10 @@ Readability.prototype = {
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
+ // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
+ ELEMENT_NODE: 1,
+ TEXT_NODE: 3,
+
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
@@ -103,13 +112,13 @@ Readability.prototype = {
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
- // The default number of words an article must have in order to return a result
- DEFAULT_WORD_THRESHOLD: 500,
+ // The default number of chars an article must have in order to return a result
+ DEFAULT_CHAR_THRESHOLD: 500,
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+ unlikelyCandidates: /-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
@@ -132,8 +141,19 @@ Readability.prototype = {
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+ // The commented out elements qualify as phrasing content but tend to be
+ // removed by readability when put into paragraphs, so we ignore them here.
+ PHRASING_ELEMS: [
+ // "CANVAS", "IFRAME", "SVG", "VIDEO",
+ "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
+ "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
+ "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
+ "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
+ "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
+ ],
+
// These are the classes that readability sets itself.
- CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
+ CLASSES_TO_PRESERVE: [ "page" ],
/**
* Run any post-process modifications to article content as necessary.
@@ -216,6 +236,21 @@ Readability.prototype = {
},
/**
+ * Iterate over a NodeList; return true if the provided iterate function
+ * returns true for every node, false otherwise.
+ *
+ * For convenience, the current object context is applied to the
+ * provided iterate function.
+ *
+ * @param NodeList nodeList The NodeList.
+ * @param Function fn The iterate function.
+ * @return Boolean
+ */
+ _everyNode: function(nodeList, fn) {
+ return Array.prototype.every.call(nodeList, fn, this);
+ },
+
+ /**
* Concat all nodelists passed as arguments.
*
* @return ...NodeList
@@ -327,7 +362,7 @@ Readability.prototype = {
var origTitle = "";
try {
- curTitle = origTitle = doc.title;
+ curTitle = origTitle = doc.title.trim();
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
@@ -355,8 +390,9 @@ Readability.prototype = {
doc.getElementsByTagName('h1'),
doc.getElementsByTagName('h2')
);
+ var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
- return heading.textContent === curTitle;
+ return heading.textContent.trim() === trimmedTitle;
});
// If we don't, let's extract the title out of the original title string.
@@ -421,7 +457,7 @@ Readability.prototype = {
_nextElement: function (node) {
var next = node;
while (next
- && (next.nodeType != Node.ELEMENT_NODE)
+ && (next.nodeType != this.ELEMENT_NODE)
&& this.REGEXPS.whitespace.test(next.textContent)) {
next = next.nextSibling;
}
@@ -464,16 +500,22 @@ Readability.prototype = {
while (next) {
// If we've hit another <br><br>, we're done adding children to this <p>.
if (next.tagName == "BR") {
- var nextElem = this._nextElement(next);
+ var nextElem = this._nextElement(next.nextSibling);
if (nextElem && nextElem.tagName == "BR")
break;
}
+ if (!this._isPhrasingContent(next)) break;
+
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
p.appendChild(next);
next = sibling;
}
+
+ while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+
+ if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
}
});
},
@@ -523,6 +565,7 @@ Readability.prototype = {
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
+ this._clean(articleContent, "aside");
// Clean out elements that have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even if they have "share".
@@ -579,6 +622,19 @@ Readability.prototype = {
if (next && next.tagName == "P")
br.parentNode.removeChild(br);
});
+
+ // Remove single-cell tables
+ this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
+ var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
+ if (this._hasSingleTagInsideElement(tbody, "TR")) {
+ var row = tbody.firstElementChild;
+ if (this._hasSingleTagInsideElement(row, "TD")) {
+ var cell = row.firstElementChild;
+ cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
+ table.parentNode.replaceChild(cell, table);
+ }
+ }
+ });
},
/**
@@ -658,37 +714,6 @@ Readability.prototype = {
return node && node.nextElementSibling;
},
- /**
- * Like _getNextNode, but for DOM implementations with no
- * firstElementChild/nextElementSibling functionality...
- */
- _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
- function nextSiblingEl(n) {
- do {
- n = n.nextSibling;
- } while (n && n.nodeType !== n.ELEMENT_NODE);
- return n;
- }
- // First check for kids if those aren't being ignored
- if (!ignoreSelfAndKids && node.children[0]) {
- return node.children[0];
- }
- // Then for siblings...
- var next = nextSiblingEl(node);
- if (next) {
- return next;
- }
- // And finally, move up the parent chain *and* find a sibling
- // (because this is depth-first traversal, we will have already
- // seen the parent nodes themselves).
- do {
- node = node.parentNode;
- if (node)
- next = nextSiblingEl(node);
- } while (node && !next);
- return node && next;
- },
-
_checkByline: function(node, matchString) {
if (this._articleByline) {
return false;
@@ -751,6 +776,12 @@ Readability.prototype = {
while (node) {
var matchString = node.className + " " + node.id;
+ if (!this._isProbablyVisible(node)) {
+ this.log("Removing hidden node - " + matchString);
+ node = this._removeAndGetNext(node);
+ continue;
+ }
+
// Check to see if this node is a byline, and remove it if it is.
if (this._checkByline(node, matchString)) {
node = this._removeAndGetNext(node);
@@ -784,11 +815,31 @@ Readability.prototype = {
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
+ // Put phrasing content into paragraphs.
+ var p = null;
+ var childNode = node.firstChild;
+ while (childNode) {
+ var nextSibling = childNode.nextSibling;
+ if (this._isPhrasingContent(childNode)) {
+ if (p !== null) {
+ p.appendChild(childNode);
+ } else if (!this._isWhitespace(childNode)) {
+ p = doc.createElement('p');
+ node.replaceChild(p, childNode);
+ p.appendChild(childNode);
+ }
+ } else if (p !== null) {
+ while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+ p = null;
+ }
+ childNode = nextSibling;
+ }
+
// Sites like http://mobile.slate.com enclose each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs which are, in practice, paragraphs.
- if (this._hasSinglePInsideElement(node)) {
+ if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
@@ -796,17 +847,6 @@ Readability.prototype = {
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
- } else {
- // EXPERIMENTAL
- this._forEachNode(node.childNodes, function(childNode) {
- if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
- var p = doc.createElement('p');
- p.textContent = childNode.textContent;
- p.style.display = 'inline';
- p.className = 'readability-styled';
- node.replaceChild(p, childNode);
- }
- });
}
}
node = this._getNextNode(node);
@@ -846,7 +886,7 @@ Readability.prototype = {
// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
- if (!ancestor.tagName)
+ if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined')
return;
if (typeof(ancestor.readability) === 'undefined') {
@@ -1085,7 +1125,7 @@ Readability.prototype = {
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
var textLength = this._getInnerText(articleContent, true).length;
- if (textLength < this._wordThreshold) {
+ if (textLength < this._charThreshold) {
parseSuccessful = false;
page.innerHTML = pageCacheHtml;
@@ -1233,27 +1273,28 @@ Readability.prototype = {
},
/**
- * Check if this node has only whitespace and a single P element
+ * Check if this node has only whitespace and a single element with given tag
* Returns false if the DIV node contains non-empty text nodes
- * or if it contains no P or more than 1 element.
+ * or if it contains no element with given tag or more than 1 element.
*
* @param Element
+ * @param string tag of child element
**/
- _hasSinglePInsideElement: function(element) {
- // There should be exactly 1 element child which is a P:
- if (element.children.length != 1 || element.children[0].tagName !== "P") {
+ _hasSingleTagInsideElement: function(element, tag) {
+ // There should be exactly 1 element child with given tag
+ if (element.children.length != 1 || element.children[0].tagName !== tag) {
return false;
}
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
- return node.nodeType === Node.TEXT_NODE &&
+ return node.nodeType === this.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
});
},
_isElementWithoutContent: function(node) {
- return node.nodeType === Node.ELEMENT_NODE &&
+ return node.nodeType === this.ELEMENT_NODE &&
node.textContent.trim().length == 0 &&
(node.children.length == 0 ||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
@@ -1271,6 +1312,21 @@ Readability.prototype = {
});
},
+ /***
+ * Determine if a node qualifies as phrasing content.
+ * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
+ **/
+ _isPhrasingContent: function(node) {
+ return node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1 ||
+ ((node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS") &&
+ this._everyNode(node.childNodes, this._isPhrasingContent));
+ },
+
+ _isWhitespace: function(node) {
+ return (node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0) ||
+ (node.nodeType === this.ELEMENT_NODE && node.tagName === "BR");
+ },
+
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
@@ -1312,16 +1368,14 @@ Readability.prototype = {
if (!e || e.tagName.toLowerCase() === 'svg')
return;
- if (e.className !== 'readability-styled') {
- // Remove `style` and deprecated presentational attributes
- for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
- e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
- }
+ // Remove `style` and deprecated presentational attributes
+ for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+ e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
+ }
- if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
- e.removeAttribute('width');
- e.removeAttribute('height');
- }
+ if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+ e.removeAttribute('width');
+ e.removeAttribute('height');
}
var cur = e.firstElementChild;
@@ -1639,6 +1693,10 @@ Readability.prototype = {
this._flags = this._flags & ~flag;
},
+ _isProbablyVisible: function(node) {
+ return node.style.display != "none" && !node.hasAttribute("hidden");
+ },
+
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
@@ -1663,9 +1721,9 @@ Readability.prototype = {
nodes = [].concat.apply(Array.from(set), nodes);
}
- // FIXME we should have a fallback for helperIsVisible, but this is
- // problematic because of jsdom's elem.style handling - see
- // https://github.com/mozilla/readability/pull/186 for context.
+ if (!helperIsVisible) {
+ helperIsVisible = this._isProbablyVisible;
+ }
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
@@ -1719,9 +1777,6 @@ Readability.prototype = {
}
}
- if (typeof this._doc.documentElement.firstElementChild === "undefined") {
- this._getNextNode = this._getNextNodeNoElementProperties;
- }
// Remove script tags from the document.
this._removeScripts(this._doc);
@@ -1750,7 +1805,6 @@ Readability.prototype = {
var textContent = articleContent.textContent;
return {
- uri: this._uri,
title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
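
Taken together, the Readability changes above remove the uri constructor argument (and the uri field from the parse() result), rename wordThreshold to charThreshold, and keep a small shim for callers that still pass a URI first. A minimal usage sketch under those assumptions; the option values and the docClone name are illustrative only:

    // Parse a clone so Readability's DOM rewrites don't touch the live document.
    var docClone = document.cloneNode(true);
    var reader = new Readability(docClone, {
      charThreshold: 500,             // replaces the old wordThreshold option
      classesToPreserve: ["caption"]  // merged with the built-in ["page"] list
    });
    var article = reader.parse();     // null when too little article text is found
    if (article) {
      console.log(article.title, article.byline, article.dir);
      // article.uri is gone; callers now have to track the page URL themselves.
    }
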
diff --git a/toolkit/components/reader/ReaderMode.jsm b/toolkit/components/reader/ReaderMode.jsm
index e9eb83154..218e12d60 100644
--- a/toolkit/components/reader/ReaderMode.jsm
+++ b/toolkit/components/reader/ReaderMode.jsm
@@ -195,7 +195,7 @@ this.ReaderMode = {
// We pass in a helper function to determine if a node is visible, because
// it uses gecko APIs that the engine-agnostic readability code can't rely
// upon.
- return new Readability(uri, doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils));
+ return new Readability(doc).isProbablyReaderable(this.isNodeVisible.bind(this, utils));
},
isNodeVisible(utils, node) {
diff --git a/toolkit/components/reader/ReaderWorker.js b/toolkit/components/reader/ReaderWorker.js
index 69426788b..9cc684e9b 100644
--- a/toolkit/components/reader/ReaderWorker.js
+++ b/toolkit/components/reader/ReaderWorker.js
@@ -48,6 +48,6 @@ var Agent = {
*/
parseDocument(uri, serializedDoc, options) {
let doc = new JSDOMParser().parse(serializedDoc, uri.spec);
- return new Readability(uri, doc, options).parse();
+ return new Readability(doc, options).parse();
},
};
diff --git a/toolkit/components/search/moz.build b/toolkit/components/search/moz.build
index 4cc86ff9e..2accffd00 100644
--- a/toolkit/components/search/moz.build
+++ b/toolkit/components/search/moz.build
@@ -6,10 +6,10 @@
XPCSHELL_TESTS_MANIFESTS += ['tests/xpcshell/xpcshell.ini']
-if CONFIG['MC_PALEMOON']:
- DIRS += ['orginal']
-else:
+if CONFIG['MC_BASILISK'] or CONFIG['HYPE_ICEWEASEL'] or CONFIG['HYPE_ICEDOVE']:
DIRS += ['current']
+else:
+ DIRS += ['orginal']
with Files('**'):
BUG_COMPONENT = ('Firefox', 'Search')
diff --git a/toolkit/components/startup/moz.build b/toolkit/components/startup/moz.build
index dbd580384..5f290b783 100644
--- a/toolkit/components/startup/moz.build
+++ b/toolkit/components/startup/moz.build
@@ -18,19 +18,14 @@ UNIFIED_SOURCES += [
'StartupTimeline.cpp',
]
-if CONFIG['OS_ARCH'] == 'WINNT':
- # This file cannot be built in unified mode because of name clashes with Windows headers.
- SOURCES += [
- 'nsUserInfoWin.cpp',
- ]
-elif CONFIG['MOZ_WIDGET_TOOLKIT'] == 'cocoa':
- UNIFIED_SOURCES += [
- 'nsUserInfoMac.mm',
- ]
-else:
- UNIFIED_SOURCES += [
- 'nsUserInfoUnix.cpp',
- ]
+if CONFIG['MOZ_USERINFO']:
+ if CONFIG['OS_ARCH'] == 'WINNT':
+ # This file cannot be built in unified mode because of name clashes with Windows headers.
+ SOURCES += ['nsUserInfoWin.cpp']
+ elif CONFIG['MOZ_WIDGET_TOOLKIT'] == 'cocoa':
+ UNIFIED_SOURCES += ['nsUserInfoMac.mm']
+ else:
+ UNIFIED_SOURCES += ['nsUserInfoUnix.cpp']
FINAL_LIBRARY = 'xul'
diff --git a/toolkit/components/startup/public/moz.build b/toolkit/components/startup/public/moz.build
index 5894b6c51..948a7d7ee 100644
--- a/toolkit/components/startup/public/moz.build
+++ b/toolkit/components/startup/public/moz.build
@@ -4,10 +4,10 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-XPIDL_SOURCES += [
- 'nsIAppStartup.idl',
- 'nsIUserInfo.idl',
-]
+XPIDL_SOURCES += ['nsIAppStartup.idl']
+
+if CONFIG['MOZ_USERINFO']:
+ XPIDL_SOURCES += ['nsIUserInfo.idl']
XPIDL_MODULE = 'appstartup'
diff --git a/toolkit/components/telemetry/Histograms.json b/toolkit/components/telemetry/Histograms.json
index 7132c07b0..f61a67efc 100644
--- a/toolkit/components/telemetry/Histograms.json
+++ b/toolkit/components/telemetry/Histograms.json
@@ -2560,78 +2560,6 @@
"kind": "boolean",
"description": "Rate of page load from offline cache"
},
- "HTTP_CACHE_IO_QUEUE_2_OPEN_PRIORITY": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
- "HTTP_CACHE_IO_QUEUE_2_READ_PRIORITY": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
- "HTTP_CACHE_IO_QUEUE_2_MANAGEMENT": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
- "HTTP_CACHE_IO_QUEUE_2_OPEN": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
- "HTTP_CACHE_IO_QUEUE_2_READ": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
- "HTTP_CACHE_IO_QUEUE_2_WRITE": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
- "HTTP_CACHE_IO_QUEUE_2_WRITE_PRIORITY": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
- "HTTP_CACHE_IO_QUEUE_2_INDEX": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
- "HTTP_CACHE_IO_QUEUE_2_EVICT": {
- "alert_emails": ["hbambas@mozilla.com"],
- "bug_numbers": [1294183],
- "expires_in_version": "55",
- "kind": "enumerated",
- "n_values": 10,
- "description": "HTTP Cache IO queue length"
- },
"CACHE_DEVICE_SEARCH_2": {
"expires_in_version": "never",
"kind": "exponential",
@@ -8426,31 +8354,6 @@
"n_values": 7,
"description": "Final status of the CacheFileInputStream (0=ok, 1=other error, 2=out of memory, 3=disk full, 4=file corrupted, 5=file not found, 6=binding aborted)"
},
- "NETWORK_CACHE_FS_TYPE": {
- "expires_in_version": "42",
- "kind": "enumerated",
- "n_values": 5,
- "description": "Type of FS that the cache is stored on (0=NTFS (Win), 1=FAT32 (Win), 2=FAT (Win), 3=other FS (Win), 4=other OS)"
- },
- "NETWORK_CACHE_SIZE_FULL_FAT": {
- "expires_in_version": "42",
- "kind": "linear",
- "high": 500,
- "n_buckets": 50,
- "description": "Size (in MB) of a cache that reached a file count limit"
- },
- "NETWORK_CACHE_HIT_MISS_STAT_PER_CACHE_SIZE": {
- "expires_in_version": "never",
- "kind": "enumerated",
- "n_values": 40,
- "description": "Hit/Miss count split by cache size in file count (0=Hit 0-5000, 1=Miss 0-5000, 2=Hit 5001-10000, ...)"
- },
- "NETWORK_CACHE_HIT_RATE_PER_CACHE_SIZE": {
- "expires_in_version": "never",
- "kind": "enumerated",
- "n_values": 400,
- "description": "Hit rate for a specific cache size in file count. The hit rate is split into 20 buckets, the lower limit of the range in percents is 5*n/20. The cache size is divided into 20 ranges of length 5000, the lower limit of the range is 5000*(n%20)"
- },
"NETWORK_CACHE_METADATA_FIRST_READ_TIME_MS": {
"expires_in_version": "never",
"kind": "exponential",
@@ -8479,12 +8382,6 @@
"n_buckets": 256,
"description": "Actual size of the metadata parsed from the disk."
},
- "NETWORK_CACHE_HASH_STATS": {
- "expires_in_version": "46",
- "kind": "enumerated",
- "n_values": 160,
- "description": "The longest hash match between a newly added entry and all the existing entries."
- },
"DATABASE_LOCKED_EXCEPTION": {
"expires_in_version": "42",
"kind": "enumerated",
diff --git a/toolkit/components/telemetry/histogram-whitelists.json b/toolkit/components/telemetry/histogram-whitelists.json
index deb1bd5b3..529d9f4d5 100644
--- a/toolkit/components/telemetry/histogram-whitelists.json
+++ b/toolkit/components/telemetry/histogram-whitelists.json
@@ -429,15 +429,10 @@
"MOZ_SQLITE_WEBAPPS_WRITE_B",
"MOZ_SQLITE_WEBAPPS_WRITE_MAIN_THREAD_MS",
"MOZ_SQLITE_WEBAPPS_WRITE_MS",
- "NETWORK_CACHE_FS_TYPE",
- "NETWORK_CACHE_HASH_STATS",
- "NETWORK_CACHE_HIT_MISS_STAT_PER_CACHE_SIZE",
- "NETWORK_CACHE_HIT_RATE_PER_CACHE_SIZE",
"NETWORK_CACHE_METADATA_FIRST_READ_SIZE",
"NETWORK_CACHE_METADATA_FIRST_READ_TIME_MS",
"NETWORK_CACHE_METADATA_SECOND_READ_TIME_MS",
"NETWORK_CACHE_METADATA_SIZE",
- "NETWORK_CACHE_SIZE_FULL_FAT",
"NETWORK_CACHE_V1_HIT_TIME_MS",
"NETWORK_CACHE_V1_MISS_TIME_MS",
"NETWORK_CACHE_V1_TRUNCATE_TIME_MS",
@@ -1276,15 +1271,10 @@
"MOZ_SQLITE_WEBAPPS_WRITE_MS",
"MOZ_STORAGE_ASYNC_REQUESTS_MS",
"MOZ_STORAGE_ASYNC_REQUESTS_SUCCESS",
- "NETWORK_CACHE_FS_TYPE",
- "NETWORK_CACHE_HASH_STATS",
- "NETWORK_CACHE_HIT_MISS_STAT_PER_CACHE_SIZE",
- "NETWORK_CACHE_HIT_RATE_PER_CACHE_SIZE",
"NETWORK_CACHE_METADATA_FIRST_READ_SIZE",
"NETWORK_CACHE_METADATA_FIRST_READ_TIME_MS",
"NETWORK_CACHE_METADATA_SECOND_READ_TIME_MS",
"NETWORK_CACHE_METADATA_SIZE",
- "NETWORK_CACHE_SIZE_FULL_FAT",
"NETWORK_CACHE_V1_HIT_TIME_MS",
"NETWORK_CACHE_V1_MISS_TIME_MS",
"NETWORK_CACHE_V1_TRUNCATE_TIME_MS",
@@ -1862,10 +1852,8 @@
"DEVTOOLS_READ_HEAP_SNAPSHOT_MS",
"DEVTOOLS_HEAP_SNAPSHOT_NODE_COUNT",
"DEVTOOLS_HEAP_SNAPSHOT_EDGE_COUNT",
- "NETWORK_CACHE_HIT_RATE_PER_CACHE_SIZE",
"NETWORK_CACHE_METADATA_FIRST_READ_SIZE",
"NETWORK_CACHE_METADATA_SIZE",
- "NETWORK_CACHE_HASH_STATS",
"SSL_CIPHER_SUITE_FULL",
"SSL_CIPHER_SUITE_RESUMED",
"SSL_HANDSHAKE_RESULT",
diff --git a/toolkit/components/thumbnails/BackgroundPageThumbs.jsm b/toolkit/components/thumbnails/BackgroundPageThumbs.jsm
index 7b86fa07c..3eec9827d 100644
--- a/toolkit/components/thumbnails/BackgroundPageThumbs.jsm
+++ b/toolkit/components/thumbnails/BackgroundPageThumbs.jsm
@@ -206,9 +206,13 @@ const BackgroundPageThumbs = {
let browser = this._parentWin.document.createElementNS(XUL_NS, "browser");
browser.setAttribute("type", "content");
- browser.setAttribute("remote", "true");
browser.setAttribute("disableglobalhistory", "true");
+ if (Services.prefs.getPrefType("browser.tabs.remote") == Services.prefs.PREF_BOOL &&
+ Services.prefs.getBoolPref("browser.tabs.remote")) {
+ browser.setAttribute("remote", "true");
+ }
+
if (Services.prefs.getBoolPref(ABOUT_NEWTAB_SEGREGATION_PREF)) {
// Use the private container for thumbnails.
let privateIdentity =
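
The guard above exists because getBoolPref() throws when a pref has never been set, so the type check has to come first. The same pattern, reduced to a reusable sketch (the helper name is invented; the pref name is the one used in the patch):

    // Read an optional boolean pref without throwing when it is unset.
    function getOptionalBoolPref(name, fallback) {
      if (Services.prefs.getPrefType(name) == Services.prefs.PREF_BOOL) {
        return Services.prefs.getBoolPref(name);
      }
      return fallback; // pref missing or of a different type
    }

    if (getOptionalBoolPref("browser.tabs.remote", false)) {
      browser.setAttribute("remote", "true");
    }
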
diff --git a/toolkit/components/thumbnails/test/test_thumbnails_interfaces.js b/toolkit/components/thumbnails/test/test_thumbnails_interfaces.js
index 8272b2e06..a8fe51418 100644
--- a/toolkit/components/thumbnails/test/test_thumbnails_interfaces.js
+++ b/toolkit/components/thumbnails/test/test_thumbnails_interfaces.js
@@ -22,10 +22,4 @@ function run_test() {
null, null);
ok(uri instanceof Ci.nsIFileURL, "moz-page-thumb:// is a FileURL");
ok(uri.file, "This moz-page-thumb:// object is backed by a file");
-
- // and check that the error case works as specified
- let bad = Services.io.newURI("moz-page-thumb://wronghost/?url=http%3A%2F%2Fwww.mozilla.org%2F",
- null, null);
- Assert.throws(() => handler.resolveURI(bad), /NS_ERROR_NOT_AVAILABLE/i,
- "moz-page-thumb object with wrong host must not resolve to a file path");
}
diff --git a/toolkit/content/license.html b/toolkit/content/license.html
index a348fdfa6..7221a8ae2 100644
--- a/toolkit/content/license.html
+++ b/toolkit/content/license.html
@@ -97,6 +97,7 @@
<li><a href="about:license#gears">Google Gears License</a></li>
<li><a href="about:license#gears-istumbler">Google Gears/iStumbler License</a></li>
<li><a href="about:license#vp8">Google VP8 License</a></li>
+ <li><a href="about:license#gsl">GSL License</a></li>
<li><a href="about:license#gyp">gyp License</a></li>
<li><a href="about:license#halloc">halloc License</a></li>
<li><a href="about:license#harfbuzz">HarfBuzz License</a></li>
@@ -3285,6 +3286,38 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
<hr>
+ <h1><a id="gsl"></a>GSL License</h1>
+
+ <p>This license applies to <span class="path">mfbt/Span.h</span> and
+ <span class="path">mfbt/tests/gtest/TestSpan.cpp</span>.</p>
+ <!-- https://github.com/Microsoft/GSL/blob/3819df6e378ffccf0e29465afe99c3b324c2aa70/LICENSE -->
+<pre>
+Copyright (c) 2015 Microsoft Corporation. All rights reserved.
+
+This code is licensed under the MIT License (MIT).
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+</pre>
+
+
+ <hr>
+
<h1><a id="gyp"></a>gyp License</h1>
<p>This license applies to certain files in the directory
diff --git a/toolkit/content/widgets/browser.xml b/toolkit/content/widgets/browser.xml
index e595c847d..a30ff1c43 100644
--- a/toolkit/content/widgets/browser.xml
+++ b/toolkit/content/widgets/browser.xml
@@ -389,10 +389,11 @@
<method name="getTabBrowser">
<body>
<![CDATA[
- var tabBrowser = this.parentNode;
- while (tabBrowser && tabBrowser.localName != "tabbrowser")
- tabBrowser = tabBrowser.parentNode;
- return tabBrowser;
+ for (let node = this.parentNode; node instanceof Element; node = node.parentNode) {
+ if (node.localName == "tabbrowser")
+ return node;
+ }
+ return null;
]]>
</body>
</method>
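
The rewritten getTabBrowser() only walks Element ancestors and returns an explicit null when the <browser> is not hosted inside a tabbrowser, so callers should null-check the result. A hypothetical caller, for illustration (browserEl and the getTabForBrowser lookup are not part of this patch):

    // browserEl is some <browser> element; it may live outside any tabbrowser.
    let tabbrowser = browserEl.getTabBrowser();
    if (tabbrowser) {
      // Normal tabbed case: map the browser back to its tab.
      let tab = tabbrowser.getTabForBrowser(browserEl);
    } else {
      // Standalone <browser> (e.g. in a dialog): no tabbrowser ancestor.
    }
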
diff --git a/toolkit/moz.configure b/toolkit/moz.configure
index 7d3065f53..792fb113d 100644
--- a/toolkit/moz.configure
+++ b/toolkit/moz.configure
@@ -358,6 +358,20 @@ set_config('MOZ_FMP4', fmp4)
set_define('MOZ_FMP4', fmp4)
add_old_configure_assignment('MOZ_FMP4', fmp4)
+# Libaom AV1 Video Codec Support
+# ==============================================================
+option('--enable-av1',
+ help='Enable libaom for av1 video support')
+
+@depends('--enable-av1')
+def av1(value):
+ enabled = bool(value)
+ if enabled:
+ return True
+
+set_config('MOZ_AV1', av1)
+set_define('MOZ_AV1', av1)
+
# Miscellaneous
# ==============================================================
option(name='--enable-chrome-format',
diff --git a/toolkit/mozapps/extensions/AddonManager.jsm b/toolkit/mozapps/extensions/AddonManager.jsm
index d453a8981..4cd2c3d0a 100644
--- a/toolkit/mozapps/extensions/AddonManager.jsm
+++ b/toolkit/mozapps/extensions/AddonManager.jsm
@@ -123,13 +123,13 @@ function providerName(aProvider) {
* parent 'addons' level logger accordingly.
*/
var PrefObserver = {
- init: function PrefObserver_init() {
+ init: function() {
Services.prefs.addObserver(PREF_LOGGING_ENABLED, this, false);
Services.obs.addObserver(this, "xpcom-shutdown", false);
this.observe(null, NS_PREFBRANCH_PREFCHANGE_TOPIC_ID, PREF_LOGGING_ENABLED);
},
- observe: function PrefObserver_observe(aSubject, aTopic, aData) {
+ observe: function(aSubject, aTopic, aData) {
if (aTopic == "xpcom-shutdown") {
Services.prefs.removeObserver(PREF_LOGGING_ENABLED, this);
Services.obs.removeObserver(this, "xpcom-shutdown");
@@ -323,7 +323,7 @@ AsyncObjectCaller.prototype = {
* Passes the next object to the listener or calls noMoreObjects if there
* are none left.
*/
- callNext: function AOC_callNext() {
+ callNext: function() {
if (this.objects.length == 0) {
this.listener.noMoreObjects(this);
return;
@@ -449,7 +449,7 @@ AddonAuthor.prototype = {
url: null,
// Returns the author's name, defaulting to the empty string
- toString: function AddonAuthor_toString() {
+ toString: function() {
return this.name || "";
}
}
@@ -493,7 +493,7 @@ AddonScreenshot.prototype = {
caption: null,
// Returns the screenshot URL, defaulting to the empty string
- toString: function AddonScreenshot_toString() {
+ toString: function() {
return this.url || "";
}
}
@@ -640,11 +640,11 @@ var AddonManagerInternal = {
// Store telemetry details per addon provider
telemetryDetails: {},
- recordTimestamp: function AMI_recordTimestamp(name, value) {
+ recordTimestamp: function(name, value) {
this.TelemetryTimestamps.add(name, value);
},
- validateBlocklist: function AMI_validateBlocklist() {
+ validateBlocklist: function() {
let appBlocklist = FileUtils.getFile(KEY_APPDIR, [FILE_BLOCKLIST]);
// If there is no application shipped blocklist then there is nothing to do
@@ -771,7 +771,7 @@ var AddonManagerInternal = {
* Initializes the AddonManager, loading any known providers and initializing
* them.
*/
- startup: function AMI_startup() {
+ startup: function() {
try {
if (gStarted)
return;
@@ -925,7 +925,7 @@ var AddonManagerInternal = {
* @param aTypes
* An optional array of add-on types
*/
- registerProvider: function AMI_registerProvider(aProvider, aTypes) {
+ registerProvider: function(aProvider, aTypes) {
if (!aProvider || typeof aProvider != "object")
throw Components.Exception("aProvider must be specified",
Cr.NS_ERROR_INVALID_ARG);
@@ -977,7 +977,7 @@ var AddonManagerInternal = {
* For providers that have async shutdown methods returning Promises,
* the caller should wait for that Promise to resolve.
*/
- unregisterProvider: function AMI_unregisterProvider(aProvider) {
+ unregisterProvider: function(aProvider) {
if (!aProvider || typeof aProvider != "object")
throw Components.Exception("aProvider must be specified",
Cr.NS_ERROR_INVALID_ARG);
@@ -1032,7 +1032,7 @@ var AddonManagerInternal = {
*
* @param aProvider Provider object to mark safe
*/
- markProviderSafe: function AMI_markProviderSafe(aProvider) {
+ markProviderSafe: function(aProvider) {
if (!gStarted) {
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1062,7 +1062,7 @@ var AddonManagerInternal = {
* The method name to call
* @see callProvider
*/
- callProviders: function AMI_callProviders(aMethod, ...aArgs) {
+ callProviders: function(aMethod, ...aArgs) {
if (!aMethod || typeof aMethod != "string")
throw Components.Exception("aMethod must be a non-empty string",
Cr.NS_ERROR_INVALID_ARG);
@@ -1163,7 +1163,7 @@ var AddonManagerInternal = {
*
* @see nsIObserver
*/
- observe: function AMI_observe(aSubject, aTopic, aData) {
+ observe: function(aSubject, aTopic, aData) {
switch (aData) {
case PREF_EM_CHECK_COMPATIBILITY: {
let oldValue = gCheckCompatibility;
@@ -1247,7 +1247,7 @@ var AddonManagerInternal = {
* The optional application version to use for %APP_VERSION%
* @return The appropriately escaped URI.
*/
- escapeAddonURI: function AMI_escapeAddonURI(aAddon, aUri, aAppVersion)
+ escapeAddonURI: function(aAddon, aUri, aAppVersion)
{
if (!aAddon || typeof aAddon != "object")
throw Components.Exception("aAddon must be an Addon object",
@@ -1318,7 +1318,7 @@ var AddonManagerInternal = {
* @return Promise{null} Resolves when the background update check is complete
* (the resulting addon installations may still be in progress).
*/
- backgroundUpdateCheck: function AMI_backgroundUpdateCheck() {
+ backgroundUpdateCheck: function() {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1347,7 +1347,7 @@ var AddonManagerInternal = {
// be applied
updates.push(new Promise((resolve, reject) => {
addon.findUpdates({
- onUpdateAvailable: function BUC_onUpdateAvailable(aAddon, aInstall) {
+ onUpdateAvailable: function(aAddon, aInstall) {
// Start installing updates when the add-on can be updated and
// background updates should be applied.
logger.debug("Found update for add-on ${id}", aAddon);
@@ -1389,7 +1389,7 @@ var AddonManagerInternal = {
* @param aID
* The ID of the add-on
*/
- addStartupChange: function AMI_addStartupChange(aType, aID) {
+ addStartupChange: function(aType, aID) {
if (!aType || typeof aType != "string")
throw Components.Exception("aType must be a non-empty string",
Cr.NS_ERROR_INVALID_ARG);
@@ -1418,7 +1418,7 @@ var AddonManagerInternal = {
* @param aID
* The ID of the add-on
*/
- removeStartupChange: function AMI_removeStartupChange(aType, aID) {
+ removeStartupChange: function(aType, aID) {
if (!aType || typeof aType != "string")
throw Components.Exception("aType must be a non-empty string",
Cr.NS_ERROR_INVALID_ARG);
@@ -1444,7 +1444,7 @@ var AddonManagerInternal = {
* @param aMethod
* The method on the listeners to call
*/
- callManagerListeners: function AMI_callManagerListeners(aMethod, ...aArgs) {
+ callManagerListeners: function(aMethod, ...aArgs) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1475,7 +1475,7 @@ var AddonManagerInternal = {
* An optional array of extra InstallListeners to also call
* @return false if any of the listeners returned false, true otherwise
*/
- callInstallListeners: function AMI_callInstallListeners(aMethod,
+ callInstallListeners: function(aMethod,
aExtraListeners, ...aArgs) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
@@ -1517,7 +1517,7 @@ var AddonManagerInternal = {
* @param aMethod
* The method on the listeners to call
*/
- callAddonListeners: function AMI_callAddonListeners(aMethod, ...aArgs) {
+ callAddonListeners: function(aMethod, ...aArgs) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1551,7 +1551,7 @@ var AddonManagerInternal = {
* A boolean indicating if the change will only take place the next
* time the application is restarted
*/
- notifyAddonChanged: function AMI_notifyAddonChanged(aID, aType, aPendingRestart) {
+ notifyAddonChanged: function(aID, aType, aPendingRestart) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1588,7 +1588,7 @@ var AddonManagerInternal = {
* their add-ons in response to an application change such as a blocklist
* update.
*/
- updateAddonAppDisabledStates: function AMI_updateAddonAppDisabledStates() {
+ updateAddonAppDisabledStates: function() {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1603,7 +1603,7 @@ var AddonManagerInternal = {
* @param aCallback
* Function to call when operation is complete.
*/
- updateAddonRepositoryData: function AMI_updateAddonRepositoryData(aCallback) {
+ updateAddonRepositoryData: function(aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1613,11 +1613,11 @@ var AddonManagerInternal = {
Cr.NS_ERROR_INVALID_ARG);
new AsyncObjectCaller(this.providers, "updateAddonRepositoryData", {
- nextObject: function updateAddonRepositoryData_nextObject(aCaller, aProvider) {
+ nextObject: function(aCaller, aProvider) {
callProviderAsync(aProvider, "updateAddonRepositoryData",
aCaller.callNext.bind(aCaller));
},
- noMoreObjects: function updateAddonRepositoryData_noMoreObjects(aCaller) {
+ noMoreObjects: function(aCaller) {
safeCall(aCallback);
// only tests should care about this
Services.obs.notifyObservers(null, "TEST:addon-repository-data-updated", null);
@@ -1646,7 +1646,7 @@ var AddonManagerInternal = {
* An optional nsILoadGroup to associate any network requests with
* @throws if the aUrl, aCallback or aMimetype arguments are not specified
*/
- getInstallForURL: function AMI_getInstallForURL(aUrl, aCallback, aMimetype,
+ getInstallForURL: function(aUrl, aCallback, aMimetype,
aHash, aName, aIcons,
aVersion, aBrowser) {
if (!gStarted)
@@ -1716,7 +1716,7 @@ var AddonManagerInternal = {
* An optional mimetype hint for the add-on
* @throws if the aFile or aCallback arguments are not specified
*/
- getInstallForFile: function AMI_getInstallForFile(aFile, aCallback, aMimetype) {
+ getInstallForFile: function(aFile, aCallback, aMimetype) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1734,7 +1734,7 @@ var AddonManagerInternal = {
Cr.NS_ERROR_INVALID_ARG);
new AsyncObjectCaller(this.providers, "getInstallForFile", {
- nextObject: function getInstallForFile_nextObject(aCaller, aProvider) {
+ nextObject: function(aCaller, aProvider) {
callProviderAsync(aProvider, "getInstallForFile", aFile,
function getInstallForFile_safeCall(aInstall) {
if (aInstall)
@@ -1744,7 +1744,7 @@ var AddonManagerInternal = {
});
},
- noMoreObjects: function getInstallForFile_noMoreObjects(aCaller) {
+ noMoreObjects: function(aCaller) {
safeCall(aCallback, null);
}
});
@@ -1760,7 +1760,7 @@ var AddonManagerInternal = {
* A callback which will be passed an array of AddonInstalls
* @throws If the aCallback argument is not specified
*/
- getInstallsByTypes: function AMI_getInstallsByTypes(aTypes, aCallback) {
+ getInstallsByTypes: function(aTypes, aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1776,7 +1776,7 @@ var AddonManagerInternal = {
let installs = [];
new AsyncObjectCaller(this.providers, "getInstallsByTypes", {
- nextObject: function getInstallsByTypes_nextObject(aCaller, aProvider) {
+ nextObject: function(aCaller, aProvider) {
callProviderAsync(aProvider, "getInstallsByTypes", aTypes,
function getInstallsByTypes_safeCall(aProviderInstalls) {
if (aProviderInstalls) {
@@ -1786,7 +1786,7 @@ var AddonManagerInternal = {
});
},
- noMoreObjects: function getInstallsByTypes_noMoreObjects(aCaller) {
+ noMoreObjects: function(aCaller) {
safeCall(aCallback, installs);
}
});
@@ -1798,7 +1798,7 @@ var AddonManagerInternal = {
* @param aCallback
* A callback which will be passed an array of AddonInstalls
*/
- getAllInstalls: function AMI_getAllInstalls(aCallback) {
+ getAllInstalls: function(aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1818,7 +1818,7 @@ var AddonManagerInternal = {
* @return string containing the Addon ID or null
* @see amIAddonManager.mapURIToAddonID
*/
- mapURIToAddonID: function AMI_mapURIToAddonID(aURI) {
+ mapURIToAddonID: function(aURI) {
if (!(aURI instanceof Ci.nsIURI)) {
throw Components.Exception("aURI is not a nsIURI",
Cr.NS_ERROR_INVALID_ARG);
@@ -1843,7 +1843,7 @@ var AddonManagerInternal = {
* The mimetype to check
* @return true if installation is enabled for the mimetype
*/
- isInstallEnabled: function AMI_isInstallEnabled(aMimetype) {
+ isInstallEnabled: function(aMimetype) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1871,7 +1871,7 @@ var AddonManagerInternal = {
* The nsIPrincipal that initiated the install
* @return true if the source is allowed to install this mimetype
*/
- isInstallAllowed: function AMI_isInstallAllowed(aMimetype, aInstallingPrincipal) {
+ isInstallAllowed: function(aMimetype, aInstallingPrincipal) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -1906,10 +1906,7 @@ var AddonManagerInternal = {
* @param aInstalls
* The array of AddonInstalls to be installed
*/
- installAddonsFromWebpage: function AMI_installAddonsFromWebpage(aMimetype,
- aBrowser,
- aInstallingPrincipal,
- aInstalls) {
+ installAddonsFromWebpage: function(aMimetype, aBrowser, aInstallingPrincipal, aInstalls) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -2011,7 +2008,7 @@ var AddonManagerInternal = {
* @param aListener
* The InstallListener to add
*/
- addInstallListener: function AMI_addInstallListener(aListener) {
+ addInstallListener: function(aListener) {
if (!aListener || typeof aListener != "object")
throw Components.Exception("aListener must be a InstallListener object",
Cr.NS_ERROR_INVALID_ARG);
@@ -2027,7 +2024,7 @@ var AddonManagerInternal = {
* @param aListener
* The InstallListener to remove
*/
- removeInstallListener: function AMI_removeInstallListener(aListener) {
+ removeInstallListener: function(aListener) {
if (!aListener || typeof aListener != "object")
throw Components.Exception("aListener must be a InstallListener object",
Cr.NS_ERROR_INVALID_ARG);
@@ -2050,7 +2047,7 @@ var AddonManagerInternal = {
* The callback to pass the retrieved add-on to
* @throws if the aID or aCallback arguments are not specified
*/
- getAddonByID: function AMI_getAddonByID(aID, aCallback) {
+ getAddonByID: function(aID, aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -2064,7 +2061,7 @@ var AddonManagerInternal = {
Cr.NS_ERROR_INVALID_ARG);
new AsyncObjectCaller(this.providers, "getAddonByID", {
- nextObject: function getAddonByID_nextObject(aCaller, aProvider) {
+ nextObject: function(aCaller, aProvider) {
callProviderAsync(aProvider, "getAddonByID", aID,
function getAddonByID_safeCall(aAddon) {
if (aAddon)
@@ -2074,7 +2071,7 @@ var AddonManagerInternal = {
});
},
- noMoreObjects: function getAddonByID_noMoreObjects(aCaller) {
+ noMoreObjects: function(aCaller) {
safeCall(aCallback, null);
}
});
@@ -2089,7 +2086,7 @@ var AddonManagerInternal = {
* The callback to pass the retrieved add-on to.
* @throws if the aGUID or aCallback arguments are not specified
*/
- getAddonBySyncGUID: function AMI_getAddonBySyncGUID(aGUID, aCallback) {
+ getAddonBySyncGUID: function(aGUID, aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -2103,7 +2100,7 @@ var AddonManagerInternal = {
Cr.NS_ERROR_INVALID_ARG);
new AsyncObjectCaller(this.providers, "getAddonBySyncGUID", {
- nextObject: function getAddonBySyncGUID_nextObject(aCaller, aProvider) {
+ nextObject: function(aCaller, aProvider) {
callProviderAsync(aProvider, "getAddonBySyncGUID", aGUID,
function getAddonBySyncGUID_safeCall(aAddon) {
if (aAddon) {
@@ -2114,7 +2111,7 @@ var AddonManagerInternal = {
});
},
- noMoreObjects: function getAddonBySyncGUID_noMoreObjects(aCaller) {
+ noMoreObjects: function(aCaller) {
safeCall(aCallback, null);
}
});
@@ -2129,7 +2126,7 @@ var AddonManagerInternal = {
* The callback to pass an array of Addons to
* @throws if the aID or aCallback arguments are not specified
*/
- getAddonsByIDs: function AMI_getAddonsByIDs(aIDs, aCallback) {
+ getAddonsByIDs: function(aIDs, aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -2145,7 +2142,7 @@ var AddonManagerInternal = {
let addons = [];
new AsyncObjectCaller(aIDs, null, {
- nextObject: function getAddonsByIDs_nextObject(aCaller, aID) {
+ nextObject: function(aCaller, aID) {
AddonManagerInternal.getAddonByID(aID,
function getAddonsByIDs_getAddonByID(aAddon) {
addons.push(aAddon);
@@ -2153,7 +2150,7 @@ var AddonManagerInternal = {
});
},
- noMoreObjects: function getAddonsByIDs_noMoreObjects(aCaller) {
+ noMoreObjects: function(aCaller) {
safeCall(aCallback, addons);
}
});
@@ -2168,7 +2165,7 @@ var AddonManagerInternal = {
* The callback to pass an array of Addons to.
* @throws if the aCallback argument is not specified
*/
- getAddonsByTypes: function AMI_getAddonsByTypes(aTypes, aCallback) {
+ getAddonsByTypes: function(aTypes, aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -2184,7 +2181,7 @@ var AddonManagerInternal = {
let addons = [];
new AsyncObjectCaller(this.providers, "getAddonsByTypes", {
- nextObject: function getAddonsByTypes_nextObject(aCaller, aProvider) {
+ nextObject: function(aCaller, aProvider) {
callProviderAsync(aProvider, "getAddonsByTypes", aTypes,
function getAddonsByTypes_concatAddons(aProviderAddons) {
if (aProviderAddons) {
@@ -2194,7 +2191,7 @@ var AddonManagerInternal = {
});
},
- noMoreObjects: function getAddonsByTypes_noMoreObjects(aCaller) {
+ noMoreObjects: function(aCaller) {
safeCall(aCallback, addons);
}
});
@@ -2206,7 +2203,7 @@ var AddonManagerInternal = {
* @param aCallback
* A callback which will be passed an array of Addons
*/
- getAllAddons: function AMI_getAllAddons(aCallback) {
+ getAllAddons: function(aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -2228,8 +2225,7 @@ var AddonManagerInternal = {
* The callback to pass the array of Addons to
* @throws if the aCallback argument is not specified
*/
- getAddonsWithOperationsByTypes:
- function AMI_getAddonsWithOperationsByTypes(aTypes, aCallback) {
+ getAddonsWithOperationsByTypes: function(aTypes, aCallback) {
if (!gStarted)
throw Components.Exception("AddonManager is not initialized",
Cr.NS_ERROR_NOT_INITIALIZED);
@@ -2245,8 +2241,7 @@ var AddonManagerInternal = {
let addons = [];
new AsyncObjectCaller(this.providers, "getAddonsWithOperationsByTypes", {
- nextObject: function getAddonsWithOperationsByTypes_nextObject
- (aCaller, aProvider) {
+ nextObject: function(aCaller, aProvider) {
callProviderAsync(aProvider, "getAddonsWithOperationsByTypes", aTypes,
function getAddonsWithOperationsByTypes_concatAddons
(aProviderAddons) {
@@ -2257,7 +2252,7 @@ var AddonManagerInternal = {
});
},
- noMoreObjects: function getAddonsWithOperationsByTypes_noMoreObjects(caller) {
+ noMoreObjects: function(caller) {
safeCall(aCallback, addons);
}
});
@@ -2269,7 +2264,7 @@ var AddonManagerInternal = {
* @param aListener
* The listener to add
*/
- addManagerListener: function AMI_addManagerListener(aListener) {
+ addManagerListener: function(aListener) {
if (!aListener || typeof aListener != "object")
throw Components.Exception("aListener must be an AddonManagerListener object",
Cr.NS_ERROR_INVALID_ARG);
@@ -2285,7 +2280,7 @@ var AddonManagerInternal = {
* @param aListener
* The listener to remove
*/
- removeManagerListener: function AMI_removeManagerListener(aListener) {
+ removeManagerListener: function(aListener) {
if (!aListener || typeof aListener != "object")
throw Components.Exception("aListener must be an AddonManagerListener object",
Cr.NS_ERROR_INVALID_ARG);
@@ -2305,7 +2300,7 @@ var AddonManagerInternal = {
* @param aListener
* The AddonListener to add
*/
- addAddonListener: function AMI_addAddonListener(aListener) {
+ addAddonListener: function(aListener) {
if (!aListener || typeof aListener != "object")
throw Components.Exception("aListener must be an AddonListener object",
Cr.NS_ERROR_INVALID_ARG);
@@ -2321,7 +2316,7 @@ var AddonManagerInternal = {
* @param aListener
* The AddonListener to remove
*/
- removeAddonListener: function AMI_removeAddonListener(aListener) {
+ removeAddonListener: function(aListener) {
if (!aListener || typeof aListener != "object")
throw Components.Exception("aListener must be an AddonListener object",
Cr.NS_ERROR_INVALID_ARG);
@@ -2341,7 +2336,7 @@ var AddonManagerInternal = {
* @param aListener
* The TypeListener to add
*/
- addTypeListener: function AMI_addTypeListener(aListener) {
+ addTypeListener: function(aListener) {
if (!aListener || typeof aListener != "object")
throw Components.Exception("aListener must be a TypeListener object",
Cr.NS_ERROR_INVALID_ARG);
@@ -2357,7 +2352,7 @@ var AddonManagerInternal = {
* @param aListener
* The TypeListener to remove
*/
- removeTypeListener: function AMI_removeTypeListener(aListener) {
+ removeTypeListener: function(aListener) {
if (!aListener || typeof aListener != "object")
throw Components.Exception("aListener must be a TypeListener object",
Cr.NS_ERROR_INVALID_ARG);
@@ -2496,23 +2491,23 @@ var AddonManagerInternal = {
* subject to change at any time.
*/
this.AddonManagerPrivate = {
- startup: function AMP_startup() {
+ startup: function() {
AddonManagerInternal.startup();
},
- registerProvider: function AMP_registerProvider(aProvider, aTypes) {
+ registerProvider: function(aProvider, aTypes) {
AddonManagerInternal.registerProvider(aProvider, aTypes);
},
- unregisterProvider: function AMP_unregisterProvider(aProvider) {
+ unregisterProvider: function(aProvider) {
AddonManagerInternal.unregisterProvider(aProvider);
},
- markProviderSafe: function AMP_markProviderSafe(aProvider) {
+ markProviderSafe: function(aProvider) {
AddonManagerInternal.markProviderSafe(aProvider);
},
- backgroundUpdateCheck: function AMP_backgroundUpdateCheck() {
+ backgroundUpdateCheck: function() {
return AddonManagerInternal.backgroundUpdateCheck();
},
@@ -2526,32 +2521,32 @@ this.AddonManagerPrivate = {
AddonManagerInternal.backgroundUpdateCheck();
},
- addStartupChange: function AMP_addStartupChange(aType, aID) {
+ addStartupChange: function(aType, aID) {
AddonManagerInternal.addStartupChange(aType, aID);
},
- removeStartupChange: function AMP_removeStartupChange(aType, aID) {
+ removeStartupChange: function(aType, aID) {
AddonManagerInternal.removeStartupChange(aType, aID);
},
- notifyAddonChanged: function AMP_notifyAddonChanged(aID, aType, aPendingRestart) {
+ notifyAddonChanged: function(aID, aType, aPendingRestart) {
AddonManagerInternal.notifyAddonChanged(aID, aType, aPendingRestart);
},
- updateAddonAppDisabledStates: function AMP_updateAddonAppDisabledStates() {
+ updateAddonAppDisabledStates: function() {
AddonManagerInternal.updateAddonAppDisabledStates();
},
- updateAddonRepositoryData: function AMP_updateAddonRepositoryData(aCallback) {
+ updateAddonRepositoryData: function(aCallback) {
AddonManagerInternal.updateAddonRepositoryData(aCallback);
},
- callInstallListeners: function AMP_callInstallListeners(...aArgs) {
+ callInstallListeners: function(...aArgs) {
return AddonManagerInternal.callInstallListeners.apply(AddonManagerInternal,
aArgs);
},
- callAddonListeners: function AMP_callAddonListeners(...aArgs) {
+ callAddonListeners: function(...aArgs) {
AddonManagerInternal.callAddonListeners.apply(AddonManagerInternal, aArgs);
},
@@ -2563,16 +2558,16 @@ this.AddonManagerPrivate = {
AddonType: AddonType,
- recordTimestamp: function AMP_recordTimestamp(name, value) {
+ recordTimestamp: function(name, value) {
AddonManagerInternal.recordTimestamp(name, value);
},
_simpleMeasures: {},
- recordSimpleMeasure: function AMP_recordSimpleMeasure(name, value) {
+ recordSimpleMeasure: function(name, value) {
this._simpleMeasures[name] = value;
},
- recordException: function AMP_recordException(aModule, aContext, aException) {
+ recordException: function(aModule, aContext, aException) {
let report = {
module: aModule,
context: aContext
@@ -2592,15 +2587,15 @@ this.AddonManagerPrivate = {
this._simpleMeasures.exception = report;
},
- getSimpleMeasures: function AMP_getSimpleMeasures() {
+ getSimpleMeasures: function() {
return this._simpleMeasures;
},
- getTelemetryDetails: function AMP_getTelemetryDetails() {
+ getTelemetryDetails: function() {
return AddonManagerInternal.telemetryDetails;
},
- setTelemetryDetails: function AMP_setTelemetryDetails(aProvider, aDetails) {
+ setTelemetryDetails: function(aProvider, aDetails) {
AddonManagerInternal.telemetryDetails[aProvider] = aDetails;
},
@@ -2619,7 +2614,7 @@ this.AddonManagerPrivate = {
* This can be used as an implementation for Addon.findUpdates() when
* no update mechanism is available.
*/
- callNoUpdateListeners: function (addon, listener, reason, appVersion, platformVersion) {
+ callNoUpdateListeners: function(addon, listener, reason, appVersion, platformVersion) {
if ("onNoCompatibilityUpdateAvailable" in listener) {
safeCall(listener.onNoCompatibilityUpdateAvailable.bind(listener), addon);
}
@@ -2830,14 +2825,13 @@ this.AddonManager = {
return gStartupComplete && !gShutdownInProgress;
},
- getInstallForURL: function AM_getInstallForURL(aUrl, aCallback, aMimetype,
- aHash, aName, aIcons,
- aVersion, aBrowser) {
+ getInstallForURL: function(aUrl, aCallback, aMimetype, aHash, aName, aIcons,
+ aVersion, aBrowser) {
AddonManagerInternal.getInstallForURL(aUrl, aCallback, aMimetype, aHash,
aName, aIcons, aVersion, aBrowser);
},
- getInstallForFile: function AM_getInstallForFile(aFile, aCallback, aMimetype) {
+ getInstallForFile: function(aFile, aCallback, aMimetype) {
AddonManagerInternal.getInstallForFile(aFile, aCallback, aMimetype);
},
@@ -2848,94 +2842,91 @@ this.AddonManager = {
* The type of startup change to get
* @return An array of add-on IDs
*/
- getStartupChanges: function AM_getStartupChanges(aType) {
+ getStartupChanges: function(aType) {
if (!(aType in AddonManagerInternal.startupChanges))
return [];
return AddonManagerInternal.startupChanges[aType].slice(0);
},
- getAddonByID: function AM_getAddonByID(aID, aCallback) {
+ getAddonByID: function(aID, aCallback) {
AddonManagerInternal.getAddonByID(aID, aCallback);
},
- getAddonBySyncGUID: function AM_getAddonBySyncGUID(aGUID, aCallback) {
+ getAddonBySyncGUID: function(aGUID, aCallback) {
AddonManagerInternal.getAddonBySyncGUID(aGUID, aCallback);
},
- getAddonsByIDs: function AM_getAddonsByIDs(aIDs, aCallback) {
+ getAddonsByIDs: function(aIDs, aCallback) {
AddonManagerInternal.getAddonsByIDs(aIDs, aCallback);
},
- getAddonsWithOperationsByTypes:
- function AM_getAddonsWithOperationsByTypes(aTypes, aCallback) {
+ getAddonsWithOperationsByTypes: function(aTypes, aCallback) {
AddonManagerInternal.getAddonsWithOperationsByTypes(aTypes, aCallback);
},
- getAddonsByTypes: function AM_getAddonsByTypes(aTypes, aCallback) {
+ getAddonsByTypes: function(aTypes, aCallback) {
AddonManagerInternal.getAddonsByTypes(aTypes, aCallback);
},
- getAllAddons: function AM_getAllAddons(aCallback) {
+ getAllAddons: function(aCallback) {
AddonManagerInternal.getAllAddons(aCallback);
},
- getInstallsByTypes: function AM_getInstallsByTypes(aTypes, aCallback) {
+ getInstallsByTypes: function(aTypes, aCallback) {
AddonManagerInternal.getInstallsByTypes(aTypes, aCallback);
},
- getAllInstalls: function AM_getAllInstalls(aCallback) {
+ getAllInstalls: function(aCallback) {
AddonManagerInternal.getAllInstalls(aCallback);
},
- mapURIToAddonID: function AM_mapURIToAddonID(aURI) {
+ mapURIToAddonID: function(aURI) {
return AddonManagerInternal.mapURIToAddonID(aURI);
},
- isInstallEnabled: function AM_isInstallEnabled(aType) {
+ isInstallEnabled: function(aType) {
return AddonManagerInternal.isInstallEnabled(aType);
},
- isInstallAllowed: function AM_isInstallAllowed(aType, aInstallingPrincipal) {
+ isInstallAllowed: function(aType, aInstallingPrincipal) {
return AddonManagerInternal.isInstallAllowed(aType, ensurePrincipal(aInstallingPrincipal));
},
- installAddonsFromWebpage: function AM_installAddonsFromWebpage(aType, aBrowser,
- aInstallingPrincipal,
- aInstalls) {
+ installAddonsFromWebpage: function(aType, aBrowser, aInstallingPrincipal, aInstalls) {
AddonManagerInternal.installAddonsFromWebpage(aType, aBrowser,
ensurePrincipal(aInstallingPrincipal),
aInstalls);
},
- addManagerListener: function AM_addManagerListener(aListener) {
+ addManagerListener: function(aListener) {
AddonManagerInternal.addManagerListener(aListener);
},
- removeManagerListener: function AM_removeManagerListener(aListener) {
+ removeManagerListener: function(aListener) {
AddonManagerInternal.removeManagerListener(aListener);
},
- addInstallListener: function AM_addInstallListener(aListener) {
+ addInstallListener: function(aListener) {
AddonManagerInternal.addInstallListener(aListener);
},
- removeInstallListener: function AM_removeInstallListener(aListener) {
+ removeInstallListener: function(aListener) {
AddonManagerInternal.removeInstallListener(aListener);
},
- addAddonListener: function AM_addAddonListener(aListener) {
+ addAddonListener: function(aListener) {
AddonManagerInternal.addAddonListener(aListener);
},
- removeAddonListener: function AM_removeAddonListener(aListener) {
+ removeAddonListener: function(aListener) {
AddonManagerInternal.removeAddonListener(aListener);
},
- addTypeListener: function AM_addTypeListener(aListener) {
+ addTypeListener: function(aListener) {
AddonManagerInternal.addTypeListener(aListener);
},
- removeTypeListener: function AM_removeTypeListener(aListener) {
+ removeTypeListener: function(aListener) {
AddonManagerInternal.removeTypeListener(aListener);
},
@@ -2950,7 +2941,7 @@ this.AddonManager = {
* The Addon representing the add-on
* @return true if the addon should auto-update, false otherwise.
*/
- shouldAutoUpdate: function AM_shouldAutoUpdate(aAddon) {
+ shouldAutoUpdate: function(aAddon) {
if (!aAddon || typeof aAddon != "object")
throw Components.Exception("aAddon must be specified",
Cr.NS_ERROR_INVALID_ARG);
@@ -3008,7 +2999,7 @@ this.AddonManager = {
AddonManagerInternal.autoUpdateDefault = aValue;
},
- escapeAddonURI: function AM_escapeAddonURI(aAddon, aUri, aAppVersion) {
+ escapeAddonURI: function(aAddon, aUri, aAppVersion) {
return AddonManagerInternal.escapeAddonURI(aAddon, aUri, aAppVersion);
},
diff --git a/toolkit/mozapps/extensions/LightweightThemeManager.jsm b/toolkit/mozapps/extensions/LightweightThemeManager.jsm
index 5856bfa91..372a9f3b8 100644
--- a/toolkit/mozapps/extensions/LightweightThemeManager.jsm
+++ b/toolkit/mozapps/extensions/LightweightThemeManager.jsm
@@ -110,11 +110,11 @@ this.LightweightThemeManager = {
return _setCurrentTheme(aData, false);
},
- setLocalTheme: function LightweightThemeManager_setLocalTheme(aData) {
+ setLocalTheme: function(aData) {
_setCurrentTheme(aData, true);
},
- getUsedTheme: function LightweightThemeManager_getUsedTheme(aId) {
+ getUsedTheme: function(aId) {
var usedThemes = this.usedThemes;
for (let usedTheme of usedThemes) {
if (usedTheme.id == aId)
@@ -123,7 +123,7 @@ this.LightweightThemeManager = {
return null;
},
- forgetUsedTheme: function LightweightThemeManager_forgetUsedTheme(aId) {
+ forgetUsedTheme: function(aId) {
let theme = this.getUsedTheme(aId);
if (!theme)
return;
@@ -141,7 +141,7 @@ this.LightweightThemeManager = {
AddonManagerPrivate.callAddonListeners("onUninstalled", wrapper);
},
- previewTheme: function LightweightThemeManager_previewTheme(aData) {
+ previewTheme: function(aData) {
let cancel = Cc["@mozilla.org/supports-PRBool;1"].createInstance(Ci.nsISupportsPRBool);
cancel.data = false;
Services.obs.notifyObservers(cancel, "lightweight-theme-preview-requested",
@@ -160,7 +160,7 @@ this.LightweightThemeManager = {
_notifyWindows(aData);
},
- resetPreview: function LightweightThemeManager_resetPreview() {
+ resetPreview: function() {
if (_previewTimer) {
_previewTimer.cancel();
_previewTimer = null;
@@ -168,7 +168,7 @@ this.LightweightThemeManager = {
}
},
- parseTheme: function LightweightThemeManager_parseTheme(aString, aBaseURI) {
+ parseTheme: function(aString, aBaseURI) {
try {
return _sanitizeTheme(JSON.parse(aString), aBaseURI, false);
} catch (e) {
@@ -176,7 +176,7 @@ this.LightweightThemeManager = {
}
},
- updateCurrentTheme: function LightweightThemeManager_updateCurrentTheme() {
+ updateCurrentTheme: function() {
try {
if (!_prefs.getBoolPref("update.enabled"))
return;
@@ -224,7 +224,7 @@ this.LightweightThemeManager = {
* @param aData
* The lightweight theme to switch to
*/
- themeChanged: function LightweightThemeManager_themeChanged(aData) {
+ themeChanged: function(aData) {
if (_previewTimer) {
_previewTimer.cancel();
_previewTimer = null;
@@ -251,7 +251,7 @@ this.LightweightThemeManager = {
* Starts the Addons provider and enables the new lightweight theme if
* necessary.
*/
- startup: function LightweightThemeManager_startup() {
+ startup: function() {
if (Services.prefs.prefHasUserValue(PREF_LWTHEME_TO_SELECT)) {
let id = Services.prefs.getCharPref(PREF_LWTHEME_TO_SELECT);
if (id)
@@ -267,7 +267,7 @@ this.LightweightThemeManager = {
/**
* Shuts down the provider.
*/
- shutdown: function LightweightThemeManager_shutdown() {
+ shutdown: function() {
_prefs.removeObserver("", _prefObserver);
},
@@ -283,7 +283,7 @@ this.LightweightThemeManager = {
* true if the newly enabled add-on will only become enabled after a
* restart
*/
- addonChanged: function LightweightThemeManager_addonChanged(aId, aType, aPendingRestart) {
+ addonChanged: function(aId, aType, aPendingRestart) {
if (aType != ADDON_TYPE)
return;
@@ -356,7 +356,7 @@ this.LightweightThemeManager = {
* @param aCallback
* A callback to pass the Addon to
*/
- getAddonByID: function LightweightThemeManager_getAddonByID(aId, aCallback) {
+ getAddonByID: function(aId, aCallback) {
let id = _getInternalID(aId);
if (!id) {
aCallback(null);
@@ -380,7 +380,7 @@ this.LightweightThemeManager = {
* @param aCallback
* A callback to pass an array of Addons to
*/
- getAddonsByTypes: function LightweightThemeManager_getAddonsByTypes(aTypes, aCallback) {
+ getAddonsByTypes: function(aTypes, aCallback) {
if (aTypes && aTypes.indexOf(ADDON_TYPE) == -1) {
aCallback([]);
return;
@@ -541,7 +541,7 @@ AddonWrapper.prototype = {
},
// Lightweight themes are always compatible
- isCompatibleWith: function AddonWrapper_isCompatibleWith(appVersion, platformVersion) {
+ isCompatibleWith: function(appVersion, platformVersion) {
return true;
},
@@ -708,7 +708,7 @@ function _notifyWindows(aThemeData) {
var _previewTimer;
var _previewTimerCallback = {
- notify: function _previewTimerCallback_notify() {
+ notify: function() {
LightweightThemeManager.resetPreview();
}
};
diff --git a/toolkit/mozapps/extensions/addonManager.js b/toolkit/mozapps/extensions/addonManager.js
index 731e70c6c..2628ea87b 100644
--- a/toolkit/mozapps/extensions/addonManager.js
+++ b/toolkit/mozapps/extensions/addonManager.js
@@ -53,7 +53,7 @@ function amManager() {
}
amManager.prototype = {
- observe: function AMC_observe(aSubject, aTopic, aData) {
+ observe: function(aSubject, aTopic, aData) {
if (aTopic == "addons-startup")
AddonManagerPrivate.startup();
},
@@ -61,7 +61,7 @@ amManager.prototype = {
/**
* @see amIAddonManager.idl
*/
- mapURIToAddonID: function AMC_mapURIToAddonID(uri, id) {
+ mapURIToAddonID: function(uri, id) {
id.value = AddonManager.mapURIToAddonID(uri);
return !!id.value;
},
@@ -69,19 +69,15 @@ amManager.prototype = {
/**
* @see amIWebInstaller.idl
*/
- isInstallEnabled: function AMC_isInstallEnabled(aMimetype, aReferer) {
+ isInstallEnabled: function(aMimetype, aReferer) {
return AddonManager.isInstallEnabled(aMimetype);
},
/**
* @see amIWebInstaller.idl
*/
- installAddonsFromWebpage: function AMC_installAddonsFromWebpage(aMimetype,
- aBrowser,
- aInstallingPrincipal,
- aUris, aHashes,
- aNames, aIcons,
- aCallback) {
+ installAddonsFromWebpage: function(aMimetype, aBrowser, aInstallingPrincipal,
+ aUris, aHashes, aNames, aIcons, aCallback) {
if (aUris.length == 0)
return false;
@@ -112,22 +108,22 @@ amManager.prototype = {
installs.push(aInstall);
if (aCallback) {
aInstall.addListener({
- onDownloadCancelled: function buildNextInstall_onDownloadCancelled(aInstall) {
+ onDownloadCancelled: function(aInstall) {
callCallback(uri, USER_CANCELLED);
},
- onDownloadFailed: function buildNextInstall_onDownloadFailed(aInstall) {
+ onDownloadFailed: function(aInstall) {
if (aInstall.error == AddonManager.ERROR_CORRUPT_FILE)
callCallback(uri, CANT_READ_ARCHIVE);
else
callCallback(uri, DOWNLOAD_ERROR);
},
- onInstallFailed: function buildNextInstall_onInstallFailed(aInstall) {
+ onInstallFailed: function(aInstall) {
callCallback(uri, EXECUTION_ERROR);
},
- onInstallEnded: function buildNextInstall_onInstallEnded(aInstall, aStatus) {
+ onInstallEnded: function(aInstall, aStatus) {
callCallback(uri, SUCCESS);
}
});
@@ -144,7 +140,7 @@ amManager.prototype = {
return retval;
},
- notify: function AMC_notify(aTimer) {
+ notify: function(aTimer) {
AddonManagerPrivate.backgroundUpdateTimerHandler();
},
@@ -154,7 +150,7 @@ amManager.prototype = {
* Listens to requests from child processes for InstallTrigger
* activity, and sends back callbacks.
*/
- receiveMessage: function AMC_receiveMessage(aMessage) {
+ receiveMessage: function(aMessage) {
let payload = aMessage.data;
switch (aMessage.name) {
@@ -165,7 +161,7 @@ amManager.prototype = {
let callback = null;
if (payload.callbackID != -1) {
callback = {
- onInstallEnded: function ITP_callback(url, status) {
+ onInstallEnded: function(url, status) {
gParentMM.broadcastAsyncMessage(MSG_INSTALL_CALLBACK, {
callbackID: payload.callbackID,
url: url,
@@ -184,7 +180,7 @@ amManager.prototype = {
classID: Components.ID("{4399533d-08d1-458c-a87a-235f74451cfa}"),
_xpcom_factory: {
- createInstance: function AMC_createInstance(aOuter, aIid) {
+ createInstance: function(aOuter, aIid) {
if (aOuter != null)
throw Components.Exception("Component does not support aggregation",
Cr.NS_ERROR_NO_AGGREGATION);
diff --git a/toolkit/mozapps/extensions/amInstallTrigger.js b/toolkit/mozapps/extensions/amInstallTrigger.js
index a18fe84c4..5fc0e1717 100644
--- a/toolkit/mozapps/extensions/amInstallTrigger.js
+++ b/toolkit/mozapps/extensions/amInstallTrigger.js
@@ -203,7 +203,7 @@ InstallTrigger.prototype = {
return this.startSoftwareUpdate(url);
},
- _resolveURL: function (url) {
+ _resolveURL: function(url) {
return Services.io.newURI(url, null, this._url);
},
diff --git a/toolkit/mozapps/extensions/amWebInstallListener.js b/toolkit/mozapps/extensions/amWebInstallListener.js
index ac6e2495d..088f56640 100644
--- a/toolkit/mozapps/extensions/amWebInstallListener.js
+++ b/toolkit/mozapps/extensions/amWebInstallListener.js
@@ -89,7 +89,7 @@ Installer.prototype = {
/**
* Checks if all downloads are now complete and if so prompts to install.
*/
- checkAllDownloaded: function Installer_checkAllDownloaded() {
+ checkAllDownloaded: function() {
// Prevent re-entrancy caused by the confirmation dialog cancelling unwanted
// installs.
if (!this.isDownloading)
@@ -199,7 +199,7 @@ Installer.prototype = {
/**
* Checks if all installs are now complete and if so notifies observers.
*/
- checkAllInstalled: function Installer_checkAllInstalled() {
+ checkAllInstalled: function() {
var failed = [];
for (let install of this.downloads) {
@@ -225,32 +225,32 @@ Installer.prototype = {
this.installed = null;
},
- onDownloadCancelled: function Installer_onDownloadCancelled(aInstall) {
+ onDownloadCancelled: function(aInstall) {
aInstall.removeListener(this);
this.checkAllDownloaded();
},
- onDownloadFailed: function Installer_onDownloadFailed(aInstall) {
+ onDownloadFailed: function(aInstall) {
aInstall.removeListener(this);
this.checkAllDownloaded();
},
- onDownloadEnded: function Installer_onDownloadEnded(aInstall) {
+ onDownloadEnded: function(aInstall) {
this.checkAllDownloaded();
return false;
},
- onInstallCancelled: function Installer_onInstallCancelled(aInstall) {
+ onInstallCancelled: function(aInstall) {
aInstall.removeListener(this);
this.checkAllInstalled();
},
- onInstallFailed: function Installer_onInstallFailed(aInstall) {
+ onInstallFailed: function(aInstall) {
aInstall.removeListener(this);
this.checkAllInstalled();
},
- onInstallEnded: function Installer_onInstallEnded(aInstall) {
+ onInstallEnded: function(aInstall) {
aInstall.removeListener(this);
this.installed.push(aInstall);
@@ -272,7 +272,7 @@ extWebInstallListener.prototype = {
/**
* @see amIWebInstallListener.idl
*/
- onWebInstallDisabled: function extWebInstallListener_onWebInstallDisabled(aBrowser, aUri, aInstalls) {
+ onWebInstallDisabled: function(aBrowser, aUri, aInstalls) {
let info = {
browser: aBrowser,
originatingURI: aUri,
@@ -286,13 +286,13 @@ extWebInstallListener.prototype = {
/**
* @see amIWebInstallListener.idl
*/
- onWebInstallOriginBlocked: function extWebInstallListener_onWebInstallOriginBlocked(aBrowser, aUri, aInstalls) {
+ onWebInstallOriginBlocked: function(aBrowser, aUri, aInstalls) {
let info = {
browser: aBrowser,
originatingURI: aUri,
installs: aInstalls,
- install: function onWebInstallBlocked_install() {
+ install: function() {
},
QueryInterface: XPCOMUtils.generateQI([Ci.amIWebInstallInfo])
@@ -305,13 +305,13 @@ extWebInstallListener.prototype = {
/**
* @see amIWebInstallListener.idl
*/
- onWebInstallBlocked: function extWebInstallListener_onWebInstallBlocked(aBrowser, aUri, aInstalls) {
+ onWebInstallBlocked: function(aBrowser, aUri, aInstalls) {
let info = {
browser: aBrowser,
originatingURI: aUri,
installs: aInstalls,
- install: function onWebInstallBlocked_install() {
+ install: function() {
new Installer(this.browser, this.originatingURI, this.installs);
},
@@ -325,7 +325,7 @@ extWebInstallListener.prototype = {
/**
* @see amIWebInstallListener.idl
*/
- onWebInstallRequested: function extWebInstallListener_onWebInstallRequested(aBrowser, aUri, aInstalls) {
+ onWebInstallRequested: function(aBrowser, aUri, aInstalls) {
new Installer(aBrowser, aUri, aInstalls);
// We start the installs ourself
diff --git a/toolkit/themes/osx/global/notification.css b/toolkit/themes/osx/global/notification.css
index 24b3d3920..6d22cf9c8 100644
--- a/toolkit/themes/osx/global/notification.css
+++ b/toolkit/themes/osx/global/notification.css
@@ -81,12 +81,12 @@ notificationbox[notificationside="bottom"] > notification {
Invert the close icon for @type=info since both are normally dark. It's unclear
why !important is necessary here so remove it if it's no longer needed.
*/
-notification[type="info"]:not([value="translation"]) .close-icon:not(:hover) {
+notification[type="info"] .close-icon:not(:hover) {
-moz-image-region: rect(0, 64px, 16px, 48px) !important;
}
@media (min-resolution: 2dppx) {
- notification[type="info"]:not([value="translation"]) .close-icon:not(:hover) {
+ notification[type="info"] .close-icon:not(:hover) {
-moz-image-region: rect(0, 128px, 32px, 96px) !important;
}
}
diff --git a/tools/lint/eslint/modules.json b/tools/lint/eslint/modules.json
index 5fb645762..767b43db0 100644
--- a/tools/lint/eslint/modules.json
+++ b/tools/lint/eslint/modules.json
@@ -224,7 +224,6 @@
"tokenserverclient.js": ["TokenServerClient", "TokenServerClientError", "TokenServerClientNetworkError", "TokenServerClientServerError"],
"ToolboxProcess.jsm": ["BrowserToolboxProcess"],
"tps.jsm": ["ACTIONS", "TPS"],
- "Translation.jsm": ["Translation", "TranslationTelemetry"],
"Traversal.jsm": ["TraversalRules", "TraversalHelper"],
"UpdateTelemetry.jsm": ["AUSTLMY"],
"userapi.js": ["UserAPI10Client"],
diff --git a/tools/rewriting/ThirdPartyPaths.txt b/tools/rewriting/ThirdPartyPaths.txt
index a5822c981..e784566de 100644
--- a/tools/rewriting/ThirdPartyPaths.txt
+++ b/tools/rewriting/ThirdPartyPaths.txt
@@ -56,6 +56,7 @@ netwerk/srtp/src/
nsprpub/
other-licenses/
security/sandbox/chromium/
+third_party/aom/
testing/gtest/gmock/
testing/gtest/gtest/
toolkit/components/protobuf/
diff --git a/widget/cocoa/nsChildView.mm b/widget/cocoa/nsChildView.mm
index 8f72a81be..1a257d899 100644
--- a/widget/cocoa/nsChildView.mm
+++ b/widget/cocoa/nsChildView.mm
@@ -425,13 +425,6 @@ nsChildView::Create(nsIWidget* aParent,
if (!gChildViewMethodsSwizzled) {
nsToolkit::SwizzleMethods([NSView class], @selector(mouseDownCanMoveWindow),
@selector(nsChildView_NSView_mouseDownCanMoveWindow));
-#ifdef __LP64__
- nsToolkit::SwizzleMethods([NSEvent class], @selector(addLocalMonitorForEventsMatchingMask:handler:),
- @selector(nsChildView_NSEvent_addLocalMonitorForEventsMatchingMask:handler:),
- true);
- nsToolkit::SwizzleMethods([NSEvent class], @selector(removeMonitor:),
- @selector(nsChildView_NSEvent_removeMonitor:), true);
-#endif
gChildViewMethodsSwizzled = true;
}
@@ -6513,48 +6506,3 @@ static const CGEventField kCGWindowNumberField = (const CGEventField) 51;
}
@end
-
-#ifdef __LP64__
-// When using blocks, at least on OS X 10.7, the OS sometimes calls
-// +[NSEvent removeMonitor:] more than once on a single event monitor, which
-// causes crashes. See bug 678607. We hook these methods to work around
-// the problem.
-@interface NSEvent (MethodSwizzling)
-+ (id)nsChildView_NSEvent_addLocalMonitorForEventsMatchingMask:(unsigned long long)mask handler:(id)block;
-+ (void)nsChildView_NSEvent_removeMonitor:(id)eventMonitor;
-@end
-
-// This is a local copy of the AppKit frameworks sEventObservers hashtable.
-// It only stores "local monitors". We use it to ensure that +[NSEvent
-// removeMonitor:] is never called more than once on the same local monitor.
-static NSHashTable *sLocalEventObservers = nil;
-
-@implementation NSEvent (MethodSwizzling)
-
-+ (id)nsChildView_NSEvent_addLocalMonitorForEventsMatchingMask:(unsigned long long)mask handler:(id)block
-{
- if (!sLocalEventObservers) {
- sLocalEventObservers = [[NSHashTable hashTableWithOptions:
- NSHashTableStrongMemory | NSHashTableObjectPointerPersonality] retain];
- }
- id retval =
- [self nsChildView_NSEvent_addLocalMonitorForEventsMatchingMask:mask handler:block];
- if (sLocalEventObservers && retval && ![sLocalEventObservers containsObject:retval]) {
- [sLocalEventObservers addObject:retval];
- }
- return retval;
-}
-
-+ (void)nsChildView_NSEvent_removeMonitor:(id)eventMonitor
-{
- if (sLocalEventObservers && [eventMonitor isKindOfClass: ::NSClassFromString(@"_NSLocalEventObserver")]) {
- if (![sLocalEventObservers containsObject:eventMonitor]) {
- return;
- }
- [sLocalEventObservers removeObject:eventMonitor];
- }
- [self nsChildView_NSEvent_removeMonitor:eventMonitor];
-}
-
-@end
-#endif // #ifdef __LP64__
diff --git a/widget/windows/InProcessWinCompositorWidget.cpp b/widget/windows/InProcessWinCompositorWidget.cpp
index 685eaf5ca..a11790b32 100644
--- a/widget/windows/InProcessWinCompositorWidget.cpp
+++ b/widget/windows/InProcessWinCompositorWidget.cpp
@@ -23,6 +23,31 @@ InProcessWinCompositorWidget::InProcessWinCompositorWidget(const CompositorWidge
MOZ_ASSERT(mWindow);
}
+void
+InProcessWinCompositorWidget::OnDestroyWindow()
+{
+ EnterPresentLock();
+ WinCompositorWidget::OnDestroyWindow();
+ LeavePresentLock();
+}
+
+void
+InProcessWinCompositorWidget::UpdateTransparency(nsTransparencyMode aMode)
+{
+ EnterPresentLock();
+ WinCompositorWidget::UpdateTransparency(aMode);
+ LeavePresentLock();
+}
+
+void
+InProcessWinCompositorWidget::ClearTransparentWindow()
+{
+ EnterPresentLock();
+ WinCompositorWidget::ClearTransparentWindow();
+ LeavePresentLock();
+}
+
+
nsIWidget*
InProcessWinCompositorWidget::RealWidget()
{
diff --git a/widget/windows/InProcessWinCompositorWidget.h b/widget/windows/InProcessWinCompositorWidget.h
index 2ce6ba0be..afe5a7f53 100644
--- a/widget/windows/InProcessWinCompositorWidget.h
+++ b/widget/windows/InProcessWinCompositorWidget.h
@@ -22,6 +22,10 @@ class InProcessWinCompositorWidget final : public WinCompositorWidget
public:
InProcessWinCompositorWidget(const CompositorWidgetInitData& aInitData, nsWindow* aWindow);
+ void OnDestroyWindow() override;
+ void UpdateTransparency(nsTransparencyMode aMode) override;
+ void ClearTransparentWindow() override;
+
void ObserveVsync(VsyncObserver* aObserver) override;
nsIWidget* RealWidget() override;
diff --git a/widget/windows/TSFTextStore.cpp b/widget/windows/TSFTextStore.cpp
index 7224126b8..c80de831c 100644
--- a/widget/windows/TSFTextStore.cpp
+++ b/widget/windows/TSFTextStore.cpp
@@ -4247,14 +4247,14 @@ TSFTextStore::InsertTextAtSelectionInternal(const nsAString& aInsertStr,
TS_SELECTION_ACP oldSelection = contentForTSF.Selection().ACP();
if (!mComposition.IsComposing()) {
// Use a temporary composition to contain the text
- PendingAction* compositionStart = mPendingActions.AppendElement();
+ PendingAction* compositionStart = mPendingActions.AppendElements(2);
+ PendingAction* compositionEnd = compositionStart + 1;
compositionStart->mType = PendingAction::COMPOSITION_START;
compositionStart->mSelectionStart = oldSelection.acpStart;
compositionStart->mSelectionLength =
oldSelection.acpEnd - oldSelection.acpStart;
compositionStart->mAdjustSelection = false;
- PendingAction* compositionEnd = mPendingActions.AppendElement();
compositionEnd->mType = PendingAction::COMPOSITION_END;
compositionEnd->mData = aInsertStr;
@@ -4455,10 +4455,12 @@ TSFTextStore::RecordCompositionEndAction()
}
// When only setting selection is necessary, we should append it.
if (pendingAction.mAdjustSelection) {
+ LONG selectionStart = pendingAction.mSelectionStart;
+ LONG selectionLength = pendingAction.mSelectionLength;
PendingAction* setSelection = mPendingActions.AppendElement();
setSelection->mType = PendingAction::SET_SELECTION;
- setSelection->mSelectionStart = pendingAction.mSelectionStart;
- setSelection->mSelectionLength = pendingAction.mSelectionLength;
+ setSelection->mSelectionStart = selectionStart;
+ setSelection->mSelectionLength = selectionLength;
setSelection->mSelectionReversed = false;
}
// Remove the redundant pending composition.
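
Both TSFTextStore changes above defend against the same hazard: appending to a growable array can reallocate its buffer, so a pointer or reference obtained before the append may dangle afterwards. Allocating both pending actions with one AppendElements(2) call, and copying mSelectionStart/mSelectionLength into locals before the later AppendElement, keeps every access on storage that is still live. A minimal std::vector analogue of the bug and the fix follows; PendingAction here is a reduced stand-in, not the real TSF type.

#include <vector>

struct PendingAction {
  int mType = 0;
  int mSelectionStart = 0;
  int mSelectionLength = 0;
};

void Broken(std::vector<PendingAction>& aActions) {
  aActions.push_back(PendingAction{});
  PendingAction* start = &aActions.back();  // pointer into the current buffer
  aActions.push_back(PendingAction{});      // may reallocate and move the buffer
  start->mType = 1;                         // `start` may now dangle: undefined behavior
}

void Fixed(std::vector<PendingAction>& aActions) {
  // Grow once, then take pointers; the buffer cannot move again until the
  // next append, so both pointers stay valid while we fill the two slots.
  size_t index = aActions.size();
  aActions.resize(index + 2);
  PendingAction* compositionStart = &aActions[index];
  PendingAction* compositionEnd = compositionStart + 1;
  compositionStart->mType = 1;
  compositionEnd->mType = 2;
}

The RecordCompositionEndAction hunk is the reference flavour of the same problem: pendingAction refers into mPendingActions, so its fields are copied into locals before AppendElement can invalidate it.
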
diff --git a/xpcom/base/CycleCollectedJSContext.cpp b/xpcom/base/CycleCollectedJSContext.cpp
index 02fc1aa4c..206984656 100644
--- a/xpcom/base/CycleCollectedJSContext.cpp
+++ b/xpcom/base/CycleCollectedJSContext.cpp
@@ -1537,7 +1537,6 @@ IncrementalFinalizeRunnable::Run()
return NS_OK;
}
- TimeStamp start = TimeStamp::Now();
ReleaseNow(true);
if (mDeferredFinalizeFunctions.Length()) {
diff --git a/xpcom/base/nsCycleCollector.cpp b/xpcom/base/nsCycleCollector.cpp
index a349e086d..7109d85bd 100644
--- a/xpcom/base/nsCycleCollector.cpp
+++ b/xpcom/base/nsCycleCollector.cpp
@@ -3524,9 +3524,9 @@ nsCycleCollector::CleanupAfterCollection()
mGraph.Clear();
timeLog.Checkpoint("CleanupAfterCollection::mGraph.Clear()");
+#ifdef COLLECT_TIME_DEBUG
uint32_t interval =
(uint32_t)((TimeStamp::Now() - mCollectionStart).ToMilliseconds());
-#ifdef COLLECT_TIME_DEBUG
printf("cc: total cycle collector time was %ums in %u slices\n", interval,
mResults.mNumSlices);
printf("cc: visited %u ref counted and %u GCed objects, freed %d ref counted and %d GCed objects",
diff --git a/xpcom/glue/nsTArray-inl.h b/xpcom/glue/nsTArray-inl.h
index af57c9866..7e667a327 100644
--- a/xpcom/glue/nsTArray-inl.h
+++ b/xpcom/glue/nsTArray-inl.h
@@ -111,6 +111,23 @@ bool IsTwiceTheRequiredBytesRepresentableAsUint32(size_t aCapacity,
template<class Alloc, class Copy>
template<typename ActualAlloc>
typename ActualAlloc::ResultTypeProxy
+nsTArray_base<Alloc, Copy>::ExtendCapacity(size_type aLength,
+ size_type aCount,
+ size_type aElemSize)
+{
+ mozilla::CheckedInt<size_type> newLength = aLength;
+ newLength += aCount;
+
+ if (!newLength.isValid()) {
+ return ActualAlloc::FailureResult();
+ }
+
+ return this->EnsureCapacity<ActualAlloc>(newLength.value(), aElemSize);
+}
+
+template<class Alloc, class Copy>
+template<typename ActualAlloc>
+typename ActualAlloc::ResultTypeProxy
nsTArray_base<Alloc, Copy>::EnsureCapacity(size_type aCapacity,
size_type aElemSize)
{
@@ -275,26 +292,21 @@ nsTArray_base<Alloc, Copy>::ShiftData(index_type aStart,
template<class Alloc, class Copy>
template<typename ActualAlloc>
-bool
+typename ActualAlloc::ResultTypeProxy
nsTArray_base<Alloc, Copy>::InsertSlotsAt(index_type aIndex, size_type aCount,
size_type aElemSize,
size_t aElemAlign)
{
MOZ_ASSERT(aIndex <= Length(), "Bogus insertion index");
- size_type newLen = Length() + aCount;
-
- EnsureCapacity<ActualAlloc>(newLen, aElemSize);
-
- // Check for out of memory conditions
- if (Capacity() < newLen) {
- return false;
+ if (!ActualAlloc::Successful(this->ExtendCapacity<ActualAlloc>(Length(), aCount, aElemSize))) {
+ return ActualAlloc::FailureResult();
}
-
+
// Move the existing elements as needed. Note that this will
// change our mLength, so no need to call IncrementLength.
ShiftData<ActualAlloc>(aIndex, 0, aCount, aElemSize, aElemAlign);
- return true;
+ return ActualAlloc::SuccessResult();
}
// nsTArray_base::IsAutoArrayRestorer is an RAII class which takes
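
The new ExtendCapacity helper centralizes the overflow check the old InsertSlotsAt code lacked: it computes the new length with mozilla::CheckedInt before handing it to EnsureCapacity, so a huge aCount cannot wrap around and request a too-small buffer. The sketch below shows the same checked growth in standalone form, using a manual limit test instead of CheckedInt so it compiles without Mozilla headers; the names are illustrative.

#include <cstdint>
#include <limits>

using size_type = uint32_t;  // nsTArray lengths and capacities are 32-bit

// Returns false instead of wrapping when aLength + aCount does not fit.
bool CheckedNewLength(size_type aLength, size_type aCount, size_type* aOut) {
  if (aCount > std::numeric_limits<size_type>::max() - aLength) {
    return false;  // the sum would overflow size_type
  }
  *aOut = aLength + aCount;
  return true;
}

// Shape of ExtendCapacity: validate the new length first, then grow.
bool ExtendCapacitySketch(size_type aLength, size_type aCount) {
  size_type newLength;
  if (!CheckedNewLength(aLength, aCount, &newLength)) {
    return false;  // corresponds to ActualAlloc::FailureResult()
  }
  // EnsureCapacity(newLength, aElemSize) would run here.
  return true;     // corresponds to forwarding EnsureCapacity's result
}

The byte-count math inside EnsureCapacity has its own guard, the IsTwiceTheRequiredBytesRepresentableAsUint32 helper visible in the hunk context above.
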
diff --git a/xpcom/glue/nsTArray.h b/xpcom/glue/nsTArray.h
index ca74a41f7..82586a79a 100644
--- a/xpcom/glue/nsTArray.h
+++ b/xpcom/glue/nsTArray.h
@@ -12,6 +12,7 @@
#include "mozilla/Assertions.h"
#include "mozilla/Attributes.h"
#include "mozilla/BinarySearch.h"
+#include "mozilla/CheckedInt.h"
#include "mozilla/fallible.h"
#include "mozilla/Function.h"
#include "mozilla/MathAlgorithms.h"
@@ -19,6 +20,7 @@
#include "mozilla/Move.h"
#include "mozilla/ReverseIterator.h"
#include "mozilla/TypeTraits.h"
+#include "mozilla/Span.h"
#include <string.h>
@@ -420,6 +422,17 @@ protected:
typename ActualAlloc::ResultTypeProxy EnsureCapacity(size_type aCapacity,
size_type aElemSize);
+ // Extend the storage to accommodate aCount extra elements.
+ // @param aLength The current size of the array.
+ // @param aCount The number of elements to add.
+ // @param aElemSize The size of an array element.
+ // @return FailureResult() if insufficient memory is available or the new
+ // length would overflow; SuccessResult() otherwise.

+ template<typename ActualAlloc>
+ typename ActualAlloc::ResultTypeProxy ExtendCapacity(size_type aLength,
+ size_type aCount,
+ size_type aElemSize);
+
// Tries to resize the storage to the minimum required amount. If this fails,
// the array is left as-is.
// @param aElemSize The size of an array element.
@@ -461,8 +474,9 @@ protected:
// @param aElementSize the size of an array element.
// @param aElemAlign the alignment in bytes of an array element.
template<typename ActualAlloc>
- bool InsertSlotsAt(index_type aIndex, size_type aCount,
- size_type aElementSize, size_t aElemAlign);
+ typename ActualAlloc::ResultTypeProxy
+ InsertSlotsAt(index_type aIndex, size_type aCount,
+ size_type aElementSize, size_t aElemAlign);
template<typename ActualAlloc, class Allocator>
typename ActualAlloc::ResultTypeProxy
@@ -1112,6 +1126,18 @@ public:
const_reverse_iterator rend() const { return const_reverse_iterator(begin()); }
const_reverse_iterator crend() const { return rend(); }
+ // Span integration
+
+ operator mozilla::Span<elem_type>()
+ {
+ return mozilla::Span<elem_type>(Elements(), Length());
+ }
+
+ operator mozilla::Span<const elem_type>() const
+ {
+ return mozilla::Span<const elem_type>(Elements(), Length());
+ }
+
//
// Search methods
//
@@ -1336,6 +1362,16 @@ protected:
return ReplaceElementsAt<Item, ActualAlloc>(
aStart, aCount, aArray.Elements(), aArray.Length());
}
+
+ template<class Item, typename ActualAlloc = Alloc>
+ elem_type* ReplaceElementsAt(index_type aStart,
+ size_type aCount,
+ mozilla::Span<const Item> aSpan)
+ {
+ return ReplaceElementsAt<Item, ActualAlloc>(
+ aStart, aCount, aSpan.Elements(), aSpan.Length());
+ }
+
public:
template<class Item>
@@ -1347,6 +1383,15 @@ public:
return ReplaceElementsAt<Item, FallibleAlloc>(aStart, aCount, aArray);
}
+ template<class Item>
+ MOZ_MUST_USE elem_type* ReplaceElementsAt(index_type aStart,
+ size_type aCount,
+ mozilla::Span<const Item> aSpan,
+ const mozilla::fallible_t&)
+ {
+ return ReplaceElementsAt<Item, FallibleAlloc>(aStart, aCount, aSpan);
+ }
+
// A variation on the ReplaceElementsAt method defined above.
protected:
template<class Item, typename ActualAlloc = Alloc>
@@ -1399,6 +1444,15 @@ protected:
return ReplaceElementsAt<Item, ActualAlloc>(
aIndex, 0, aArray.Elements(), aArray.Length());
}
+
+ template<class Item, typename ActualAlloc = Alloc>
+ elem_type* InsertElementsAt(index_type aIndex,
+ mozilla::Span<const Item> aSpan)
+ {
+ return ReplaceElementsAt<Item, ActualAlloc>(
+ aIndex, 0, aSpan.Elements(), aSpan.Length());
+ }
+
public:
template<class Item, class Allocator>
@@ -1425,6 +1479,14 @@ public:
return InsertElementAt<FallibleAlloc>(aIndex);
}
+ template<class Item>
+ MOZ_MUST_USE elem_type* InsertElementsAt(index_type aIndex,
+ mozilla::Span<const Item> aSpan,
+ const mozilla::fallible_t&)
+ {
+ return InsertElementsAt<Item, FallibleAlloc>(aIndex, aSpan);
+ }
+
// Insert a new element, move constructing if possible.
protected:
template<class Item, typename ActualAlloc = Alloc>
@@ -1526,6 +1588,13 @@ protected:
template<class Item, typename ActualAlloc = Alloc>
elem_type* AppendElements(const Item* aArray, size_type aArrayLen);
+ template<class Item, typename ActualAlloc = Alloc>
+ elem_type* AppendElements(mozilla::Span<const Item> aSpan)
+ {
+ return AppendElements<Item, FallibleAlloc>(aSpan.Elements(),
+ aSpan.Length());
+ }
+
public:
template<class Item>
@@ -1536,6 +1605,15 @@ public:
return AppendElements<Item, FallibleAlloc>(aArray, aArrayLen);
}
+ template<class Item>
+ /* MOZ_MUST_USE */
+ elem_type* AppendElements(mozilla::Span<const Item> aSpan,
+ const mozilla::fallible_t&)
+ {
+ return AppendElements<Item, FallibleAlloc>(aSpan.Elements(),
+ aSpan.Length());
+ }
+
// A variation on the AppendElements method defined above.
protected:
template<class Item, class Allocator, typename ActualAlloc = Alloc>
@@ -1590,8 +1668,8 @@ public:
protected:
template<typename ActualAlloc = Alloc>
elem_type* AppendElements(size_type aCount) {
- if (!ActualAlloc::Successful(this->template EnsureCapacity<ActualAlloc>(
- Length() + aCount, sizeof(elem_type)))) {
+ if (!ActualAlloc::Successful(this->template ExtendCapacity<ActualAlloc>(
+ Length(), aCount, sizeof(elem_type)))) {
return nullptr;
}
elem_type* elems = Elements() + Length();
@@ -1807,9 +1885,8 @@ protected:
template<typename ActualAlloc = Alloc>
elem_type* InsertElementsAt(index_type aIndex, size_type aCount)
{
- if (!base_type::template InsertSlotsAt<ActualAlloc>(aIndex, aCount,
- sizeof(elem_type),
- MOZ_ALIGNOF(elem_type))) {
+ if (!ActualAlloc::Successful(this->template InsertSlotsAt<ActualAlloc>(
+ aIndex, aCount, sizeof(elem_type), MOZ_ALIGNOF(elem_type)))) {
return nullptr;
}
@@ -1982,9 +2059,8 @@ auto
nsTArray_Impl<E, Alloc>::InsertElementsAt(index_type aIndex, size_type aCount,
const Item& aItem) -> elem_type*
{
- if (!base_type::template InsertSlotsAt<ActualAlloc>(aIndex, aCount,
- sizeof(elem_type),
- MOZ_ALIGNOF(elem_type))) {
+ if (!ActualAlloc::Successful(this->template InsertSlotsAt<ActualAlloc>(
+ aIndex, aCount, sizeof(elem_type), MOZ_ALIGNOF(elem_type)))) {
return nullptr;
}
@@ -2003,6 +2079,7 @@ template<typename ActualAlloc>
auto
nsTArray_Impl<E, Alloc>::InsertElementAt(index_type aIndex) -> elem_type*
{
+ // Length() + 1 is guaranteed to not overflow, so EnsureCapacity is OK.
if (!ActualAlloc::Successful(this->template EnsureCapacity<ActualAlloc>(
Length() + 1, sizeof(elem_type)))) {
return nullptr;
@@ -2019,6 +2096,7 @@ template<class Item, typename ActualAlloc>
auto
nsTArray_Impl<E, Alloc>::InsertElementAt(index_type aIndex, Item&& aItem) -> elem_type*
{
+ // Length() + 1 is guaranteed to not overflow, so EnsureCapacity is OK.
if (!ActualAlloc::Successful(this->template EnsureCapacity<ActualAlloc>(
Length() + 1, sizeof(elem_type)))) {
return nullptr;
@@ -2035,8 +2113,8 @@ template<class Item, typename ActualAlloc>
auto
nsTArray_Impl<E, Alloc>::AppendElements(const Item* aArray, size_type aArrayLen) -> elem_type*
{
- if (!ActualAlloc::Successful(this->template EnsureCapacity<ActualAlloc>(
- Length() + aArrayLen, sizeof(elem_type)))) {
+ if (!ActualAlloc::Successful(this->template ExtendCapacity<ActualAlloc>(
+ Length(), aArrayLen, sizeof(elem_type)))) {
return nullptr;
}
index_type len = Length();
@@ -2058,8 +2136,8 @@ nsTArray_Impl<E, Alloc>::AppendElements(nsTArray_Impl<Item, Allocator>&& aArray)
index_type len = Length();
index_type otherLen = aArray.Length();
- if (!Alloc::Successful(this->template EnsureCapacity<Alloc>(
- len + otherLen, sizeof(elem_type)))) {
+ if (!Alloc::Successful(this->template ExtendCapacity<Alloc>(
+ len, otherLen, sizeof(elem_type)))) {
return nullptr;
}
copy_type::MoveNonOverlappingRegion(Elements() + len, aArray.Elements(), otherLen,
@@ -2075,6 +2153,7 @@ template<class Item, typename ActualAlloc>
auto
nsTArray_Impl<E, Alloc>::AppendElement(Item&& aItem) -> elem_type*
{
+ // Length() + 1 is guaranteed to not overflow, so EnsureCapacity is OK.
if (!ActualAlloc::Successful(this->template EnsureCapacity<ActualAlloc>(
Length() + 1, sizeof(elem_type)))) {
return nullptr;
@@ -2347,6 +2426,25 @@ struct nsTArray_CopyChooser<AutoTArray<E, N>>
typedef nsTArray_CopyWithConstructors<AutoTArray<E, N>> Type;
};
+// Span integration
+namespace mozilla {
+
+template<class ElementType, class TArrayAlloc>
+Span<ElementType>
+MakeSpan(nsTArray_Impl<ElementType, TArrayAlloc>& aTArray)
+{
+ return aTArray;
+}
+
+template<class ElementType, class TArrayAlloc>
+Span<const ElementType>
+MakeSpan(const nsTArray_Impl<ElementType, TArrayAlloc>& aTArray)
+{
+ return aTArray;
+}
+
+} // namespace mozilla
+
// Assert that AutoTArray doesn't have any extra padding inside.
//
// It's important that the data stored in this auto array takes up a multiple of
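
The nsTArray.h additions follow one pattern: the array gains implicit conversion operators to mozilla::Span<elem_type> and mozilla::Span<const elem_type>, the element-taking methods gain Span overloads that unpack Elements()/Length(), and free MakeSpan overloads simply return the array through those conversions. A reduced sketch of that shape with a hand-rolled view type follows, since mozilla::Span itself does not appear in this diff; SpanLike, ArrayLike and MakeSpanLike are stand-ins.

#include <cstddef>
#include <vector>

// Reduced stand-in for mozilla::Span: a non-owning (pointer, length) view.
template<class T>
class SpanLike {
public:
  SpanLike(T* aPtr, size_t aLen) : mPtr(aPtr), mLen(aLen) {}
  T* Elements() const { return mPtr; }
  size_t Length() const { return mLen; }
private:
  T* mPtr;
  size_t mLen;
};

// Stand-in container exposing the conversions the diff adds to nsTArray_Impl.
template<class T>
class ArrayLike {
public:
  T* Elements() { return mStorage.data(); }
  const T* Elements() const { return mStorage.data(); }
  size_t Length() const { return mStorage.size(); }
  void AppendElement(const T& aItem) { mStorage.push_back(aItem); }

  operator SpanLike<T>() { return SpanLike<T>(Elements(), Length()); }
  operator SpanLike<const T>() const {
    return SpanLike<const T>(Elements(), Length());
  }

private:
  std::vector<T> mStorage;
};

// MakeSpan overloads just lean on the conversion operators, as in the diff.
template<class T>
SpanLike<T> MakeSpanLike(ArrayLike<T>& aArray) { return aArray; }
template<class T>
SpanLike<const T> MakeSpanLike(const ArrayLike<T>& aArray) { return aArray; }

// A consumer that only needs a read-only view, not the concrete array type.
size_t ViewLength(SpanLike<const int> aSpan) { return aSpan.Length(); }

int main() {
  ArrayLike<int> arr;
  arr.AppendElement(1);
  arr.AppendElement(2);
  return static_cast<int>(ViewLength(arr));  // implicit ArrayLike -> SpanLike
}

Consumers like ViewLength can then accept any contiguous buffer, which is the motivation for wiring both nsTArray here and nsTSubstring below into Span.
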
diff --git a/xpcom/string/nsTSubstring.h b/xpcom/string/nsTSubstring.h
index a08036b1f..53b4fb9a8 100644
--- a/xpcom/string/nsTSubstring.h
+++ b/xpcom/string/nsTSubstring.h
@@ -7,6 +7,8 @@
#include "mozilla/Casting.h"
#include "mozilla/MemoryReporting.h"
+#include "mozilla/IntegerTypeTraits.h"
+#include "mozilla/Span.h"
#ifndef MOZILLA_INTERNAL_API
#error Cannot use internal string classes without MOZILLA_INTERNAL_API defined. Use the frozen header nsStringAPI.h instead.
@@ -798,6 +800,68 @@ public:
}
#endif
+ /**
+ * Span integration
+ */
+
+ operator mozilla::Span<char_type>()
+ {
+ return mozilla::MakeSpan(BeginWriting(), Length());
+ }
+
+ operator mozilla::Span<const char_type>() const
+ {
+ return mozilla::MakeSpan(BeginReading(), Length());
+ }
+
+ void Append(mozilla::Span<const char_type> aSpan)
+ {
+ auto len = aSpan.Length();
+ MOZ_RELEASE_ASSERT(len <= mozilla::MaxValue<size_type>::value);
+ Append(aSpan.Elements(), len);
+ }
+
+ MOZ_MUST_USE bool Append(mozilla::Span<const char_type> aSpan,
+ const fallible_t& aFallible)
+ {
+ auto len = aSpan.Length();
+ if (len > mozilla::MaxValue<size_type>::value) {
+ return false;
+ }
+ return Append(aSpan.Elements(), len, aFallible);
+ }
+
+#if !defined(CharT_is_PRUnichar)
+ operator mozilla::Span<uint8_t>()
+ {
+ return mozilla::MakeSpan(reinterpret_cast<uint8_t*>(BeginWriting()),
+ Length());
+ }
+
+ operator mozilla::Span<const uint8_t>() const
+ {
+ return mozilla::MakeSpan(reinterpret_cast<const uint8_t*>(BeginReading()),
+ Length());
+ }
+
+ void Append(mozilla::Span<const uint8_t> aSpan)
+ {
+ auto len = aSpan.Length();
+ MOZ_RELEASE_ASSERT(len <= mozilla::MaxValue<size_type>::value);
+ Append(reinterpret_cast<const char*>(aSpan.Elements()), len);
+ }
+
+ MOZ_MUST_USE bool Append(mozilla::Span<const uint8_t> aSpan,
+ const fallible_t& aFallible)
+ {
+ auto len = aSpan.Length();
+ if (len > mozilla::MaxValue<size_type>::value) {
+ return false;
+ }
+ return Append(
+ reinterpret_cast<const char*>(aSpan.Elements()), len, aFallible);
+ }
+#endif
/**
* string data is never null, but can be marked void. if true, the
@@ -1184,3 +1248,22 @@ operator>(const nsTSubstring_CharT::base_string_type& aLhs,
{
return Compare(aLhs, aRhs) > 0;
}
+
+/**
+ * Span integration
+ */
+namespace mozilla {
+
+inline Span<CharT>
+MakeSpan(nsTSubstring_CharT& aString)
+{
+ return aString;
+}
+
+inline Span<const CharT>
+MakeSpan(const nsTSubstring_CharT& aString)
+{
+ return aString;
+}
+
+} // namespace mozilla
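
The nsTSubstring.h hunks close the loop on the string side: the same implicit Span conversion operators, plus Append overloads that take a Span, check that its size_t length fits the string's narrower size_type (MOZ_RELEASE_ASSERT in the infallible form, return false in the fallible form), and then delegate to the existing (pointer, length) Append. A small standalone sketch of that narrowing guard follows, with std::string and plain assert standing in for the real string class and MOZ_RELEASE_ASSERT.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <string>

using size_type = uint32_t;  // the string class's internal length type

// Infallible flavour: never silently truncate; assert on oversized input
// (the diff uses MOZ_RELEASE_ASSERT, which also fires in release builds).
void AppendSpan(std::string& aDest, const char* aPtr, size_t aLen) {
  assert(aLen <= std::numeric_limits<size_type>::max());
  aDest.append(aPtr, aLen);
}

// Fallible flavour: report oversized input to the caller instead.
bool AppendSpanFallible(std::string& aDest, const char* aPtr, size_t aLen) {
  if (aLen > std::numeric_limits<size_type>::max()) {
    return false;
  }
  aDest.append(aPtr, aLen);
  return true;
}

The narrow-string-only uint8_t overloads in the diff are the same guard plus a reinterpret_cast between uint8_t and char.
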